From 342bbc3871d1b43f548e9d1ae9d380a1d4989cb3 Mon Sep 17 00:00:00 2001
From: Xianyi Zhang
Date: Mon, 24 Jan 2011 14:54:24 +0000
Subject: [PATCH] Import GotoBLAS2 1.13 BSD version codes.

---
 00License.txt | 32 + 01Readme.txt | 93 + 02QuickInstall.txt | 118 + 03FAQ.txt | 119 + 04Windows64bit.txt | 13 + 05LargePage | 53 + 06WeirdPerformance | 22 + Makefile | 230 + Makefile.alpha | 57 + Makefile.generic | 6 + Makefile.getarch | 39 + Makefile.ia64 | 22 + Makefile.mips64 | 3 + Makefile.power | 93 + Makefile.rule | 95 + Makefile.sparc | 41 + Makefile.system | 753 + Makefile.tail | 617 + Makefile.x86 | 59 + Makefile.x86_64 | 102 + benchmark/Makefile | 195 + benchmark/cholesky.c | 272 + benchmark/cula_wrapper.c | 28 + benchmark/linpack.c | 273 + c_check | 254 + cblas.h | 273 + common.h | 610 + common_alpha.h | 179 + common_c.h | 611 + common_d.h | 432 + common_ia64.h | 408 + common_interface.h | 736 + common_lapack.h | 296 + common_level1.h | 212 + common_level2.h | 1359 ++ common_level3.h | 1739 ++ common_linux.h | 83 + common_macro.h | 2734 +++ common_mips64.h | 197 + common_param.h | 1098 ++ common_power.h | 795 + common_q.h | 431 + common_reference.h | 0 common_s.h | 436 + common_sparc.h | 224 + common_thread.h | 192 + common_x.h | 611 + common_x86.h | 359 + common_x86_64.h | 451 + common_z.h | 611 + cpuid.S | 67 + cpuid.h | 191 + cpuid_alpha.c | 101 + cpuid_ia64.c | 138 + cpuid_mips.c | 68 + cpuid_power.c | 190 + cpuid_sparc.c | 58 + cpuid_x86.c | 1453 ++ ctest.c | 107 + ctest/LICENSE | 23 + ctest/Makefile | 93 + ctest/auxiliary.c | 38 + ctest/c_c2chke.c | 826 + ctest/c_c3chke.c | 1706 ++ ctest/c_cblas1.c | 75 + ctest/c_cblas2.c | 807 + ctest/c_cblas3.c | 565 + ctest/c_cblat1.f | 682 + ctest/c_cblat2.f | 2932 +++ ctest/c_cblat3.f | 2786 +++ ctest/c_d2chke.c | 789 + ctest/c_d3chke.c | 1271 ++ ctest/c_dblas1.c | 84 + ctest/c_dblas2.c | 583 + ctest/c_dblas3.c | 334 + ctest/c_dblat1.f | 728 + ctest/c_dblat2.f | 2907 +++ ctest/c_dblat3.f | 2475 +++ ctest/c_s2chke.c | 789 + ctest/c_s3chke.c | 1273 ++ ctest/c_sblas1.c | 83 + ctest/c_sblas2.c | 579 + ctest/c_sblas3.c | 330 + ctest/c_sblat1.f | 728 + ctest/c_sblat2.f | 2907 +++ ctest/c_sblat3.f | 2479 +++ ctest/c_xerbla.c | 137 + ctest/c_z2chke.c | 826 + ctest/c_z3chke.c | 1706 ++ ctest/c_zblas1.c | 75 + ctest/c_zblas2.c | 807 + ctest/c_zblas3.c | 564 + ctest/c_zblat1.f | 682 + ctest/c_zblat2.f | 2939 +++ ctest/c_zblat3.f | 2791 +++ ctest/cblas_test.h | 514 + ctest/cin2 | 34 + ctest/cin3 | 22 + ctest/constant.c | 3 + ctest/din2 | 33 + ctest/din3 | 19 + ctest/sin2 | 33 + ctest/sin3 | 19 + ctest/zin2 | 34 + ctest/zin3 | 22 + ctest1.c | 1 + ctest2.c | 1 + driver/level2/Makefile | 3618 ++++ driver/level2/gbmv_k.c | 105 + driver/level2/gbmv_thread.c | 294 + driver/level2/gemv_thread.c | 210 + driver/level2/ger_thread.c | 197 + driver/level2/sbmv_k.c | 97 + driver/level2/sbmv_thread.c | 359 + driver/level2/spmv_k.c | 86 + driver/level2/spmv_thread.c | 345 + driver/level2/spr2_k.c | 75 + driver/level2/spr2_thread.c | 356 + driver/level2/spr_k.c | 69 + driver/level2/spr_thread.c | 291 + driver/level2/symv_thread.c | 295 + driver/level2/syr2_k.c | 75 + driver/level2/syr2_thread.c | 345 + driver/level2/syr_k.c | 69 + driver/level2/syr_thread.c | 283 + driver/level2/tbmv_L.c | 99 + driver/level2/tbmv_U.c | 97 + driver/level2/tbmv_thread.c | 396 + driver/level2/tbsv_L.c | 97 + driver/level2/tbsv_U.c | 99 + driver/level2/tpmv_L.c | 83 + driver/level2/tpmv_U.c | 86 + driver/level2/tpmv_thread.c | 401 + driver/level2/tpsv_L.c | 87 + driver/level2/tpsv_U.c | 83 + 
driver/level2/trmv_L.c | 103 + driver/level2/trmv_U.c | 104 + driver/level2/trmv_thread.c | 440 + driver/level2/trsv_L.c | 109 + driver/level2/trsv_U.c | 104 + driver/level2/zgbmv_k.c | 145 + driver/level2/zhbmv_k.c | 189 + driver/level2/zher2_k.c | 120 + driver/level2/zher_k.c | 80 + driver/level2/zhpmv_k.c | 177 + driver/level2/zhpr2_k.c | 117 + driver/level2/zhpr_k.c | 79 + driver/level2/zsbmv_k.c | 119 + driver/level2/zspmv_k.c | 108 + driver/level2/zspr2_k.c | 87 + driver/level2/zspr_k.c | 75 + driver/level2/zsyr2_k.c | 89 + driver/level2/zsyr_k.c | 76 + driver/level2/ztbmv_L.c | 131 + driver/level2/ztbmv_U.c | 130 + driver/level2/ztbsv_L.c | 145 + driver/level2/ztbsv_U.c | 148 + driver/level2/ztpmv_L.c | 121 + driver/level2/ztpmv_U.c | 124 + driver/level2/ztpsv_L.c | 142 + driver/level2/ztpsv_U.c | 135 + driver/level2/ztrmv_L.c | 149 + driver/level2/ztrmv_U.c | 155 + driver/level2/ztrsv_L.c | 171 + driver/level2/ztrsv_U.c | 168 + driver/level3/Makefile | 5022 +++++ driver/level3/gemm.c | 66 + driver/level3/gemm3m.c | 58 + driver/level3/gemm3m_level3.c | 531 + driver/level3/gemm_thread_m.c | 90 + driver/level3/gemm_thread_mn.c | 148 + driver/level3/gemm_thread_n.c | 91 + driver/level3/gemm_thread_variable.c | 127 + driver/level3/hemm3m_k.c | 99 + driver/level3/level3.c | 401 + driver/level3/level3_gemm3m_thread.c | 1015 + driver/level3/level3_syr2k.c | 418 + driver/level3/level3_syrk.c | 495 + driver/level3/level3_syrk_threaded.c | 673 + driver/level3/level3_thread.c | 743 + driver/level3/symm3m_k.c | 100 + driver/level3/symm_k.c | 80 + driver/level3/syr2k_k.c | 103 + driver/level3/syr2k_kernel.c | 217 + driver/level3/syrk_k.c | 105 + driver/level3/syrk_kernel.c | 230 + driver/level3/syrk_thread.c | 186 + driver/level3/trmm_L.c | 444 + driver/level3/trmm_R.c | 350 + driver/level3/trsm_L.c | 249 + driver/level3/trsm_R.c | 348 + driver/level3/zhemm_k.c | 80 + driver/level3/zher2k_k.c | 160 + driver/level3/zher2k_kernel.c | 221 + driver/level3/zherk_beta.c | 75 + driver/level3/zherk_k.c | 158 + driver/level3/zherk_kernel.c | 194 + driver/level3/zsyrk_beta.c | 60 + driver/mapper/Makefile | 25 + driver/mapper/device_setup | 11 + driver/mapper/mapper.c | 252 + driver/others/Makefile | 218 + driver/others/abs.c | 71 + driver/others/blas_l1_thread.c | 112 + driver/others/blas_server.c | 848 + driver/others/blas_server_omp.c | 249 + driver/others/blas_server_win32.c | 450 + driver/others/divtable.c | 83 + driver/others/dynamic.c | 219 + driver/others/init.c | 697 + driver/others/lamc3.c | 50 + driver/others/lamch.c | 200 + driver/others/lsame.c | 50 + driver/others/memory.c | 1257 ++ driver/others/memory_qalloc.c | 77 + driver/others/parameter.c | 668 + driver/others/profile.c | 139 + driver/others/xerbla.c | 70 + exports/Makefile | 188 + exports/dllinit.c | 55 + exports/gensymbol | 462 + f_check | 302 + ftest.f | 6 + ftest2.f | 3 + getarch.c | 732 + getarch_2nd.c | 36 + interface/Makefile | 1942 ++ interface/asum.c | 93 + interface/axpy.c | 112 + interface/copy.c | 80 + interface/create | 22 + interface/dot.c | 101 + interface/dsdot.c | 99 + interface/gbmv.c | 252 + interface/gemm.c | 452 + interface/gemv.c | 237 + interface/ger.c | 193 + interface/gesv.c | 154 + interface/getf2.c | 109 + interface/getrf.c | 121 + interface/getrs.c | 152 + interface/imax.c | 171 + interface/larf.c | 109 + interface/laswp.c | 110 + interface/lauu2.c | 128 + interface/lauum.c | 139 + interface/max.c | 169 + interface/nrm2.c | 93 + interface/potf2.c | 128 + interface/potrf.c | 139 + interface/potri.c | 160 + 
interface/rot.c | 82 + interface/rotg.c | 109 + interface/rotm.c | 155 + interface/rotmg.c | 199 + interface/sbmv.c | 215 + interface/scal.c | 112 + interface/sdsdot.c | 101 + interface/spmv.c | 207 + interface/spr.c | 197 + interface/spr2.c | 203 + interface/swap.c | 110 + interface/symm.c | 422 + interface/symv.c | 205 + interface/syr.c | 200 + interface/syr2.c | 204 + interface/syr2k.c | 366 + interface/syrk.c | 355 + interface/tbmv.c | 248 + interface/tbsv.c | 213 + interface/tpmv.c | 239 + interface/tpsv.c | 204 + interface/trmv.c | 243 + interface/trsm.c | 391 + interface/trsv.c | 208 + interface/trti2.c | 134 + interface/trtri.c | 153 + interface/zaxpy.c | 122 + interface/zdot.c | 202 + interface/zgbmv.c | 271 + interface/zgemv.c | 259 + interface/zger.c | 249 + interface/zgetf2.c | 109 + interface/zgetrf.c | 122 + interface/zgetrs.c | 153 + interface/zhbmv.c | 223 + interface/zhemv.c | 215 + interface/zher.c | 200 + interface/zher2.c | 207 + interface/zhpmv.c | 213 + interface/zhpr.c | 198 + interface/zhpr2.c | 207 + interface/zlaswp.c | 108 + interface/zlauu2.c | 129 + interface/zlauum.c | 141 + interface/zpotf2.c | 129 + interface/zpotrf.c | 141 + interface/zpotri.c | 157 + interface/zrot.c | 72 + interface/zrotg.c | 115 + interface/zsbmv.c | 157 + interface/zscal.c | 117 + interface/zspmv.c | 154 + interface/zspr.c | 146 + interface/zspr2.c | 149 + interface/zswap.c | 111 + interface/zsymv.c | 143 + interface/zsyr.c | 203 + interface/zsyr2.c | 151 + interface/ztbmv.c | 260 + interface/ztbsv.c | 219 + interface/ztpmv.c | 252 + interface/ztpsv.c | 210 + interface/ztrmv.c | 255 + interface/ztrsv.c | 216 + interface/ztrti2.c | 134 + interface/ztrtri.c | 154 + kernel/Makefile | 121 + kernel/Makefile.L1 | 767 + kernel/Makefile.L2 | 428 + kernel/Makefile.L3 | 3135 ++++ kernel/Makefile.LA | 48 + kernel/alpha/KERNEL | 124 + kernel/alpha/Makefile | 2 + kernel/alpha/amax.S | 283 + kernel/alpha/asum.S | 206 + kernel/alpha/axpy.S | 428 + kernel/alpha/cabs.S | 71 + kernel/alpha/cnrm2.S | 426 + kernel/alpha/copy.S | 379 + kernel/alpha/cscal.S | 217 + kernel/alpha/dnrm2.S | 431 + kernel/alpha/dot.S | 530 + kernel/alpha/gemm_beta.S | 179 + kernel/alpha/gemm_kernel_4x4.S | 2852 +++ kernel/alpha/gemv_n.S | 1307 ++ kernel/alpha/gemv_t.S | 1061 ++ kernel/alpha/iamax.S | 440 + kernel/alpha/imax.S | 351 + kernel/alpha/izamax.S | 427 + kernel/alpha/lsame.S | 76 + kernel/alpha/max.S | 227 + kernel/alpha/rot.S | 624 + kernel/alpha/scal.S | 480 + kernel/alpha/snrm2.S | 431 + kernel/alpha/staticbuffer.S | 45 + kernel/alpha/swap.S | 249 + kernel/alpha/trsm_kernel_4x4_LN.S | 4068 ++++ kernel/alpha/trsm_kernel_4x4_LT.S | 4066 ++++ kernel/alpha/trsm_kernel_4x4_RT.S | 4066 ++++ kernel/alpha/zamax.S | 301 + kernel/alpha/zasum.S | 208 + kernel/alpha/zaxpy.S | 611 + kernel/alpha/zdot.S | 500 + kernel/alpha/zgemm_beta.S | 192 + kernel/alpha/zgemm_kernel_2x2.S | 1712 ++ kernel/alpha/zgemv_n.S | 1027 + kernel/alpha/zgemv_t.S | 922 + kernel/alpha/znrm2.S | 426 + kernel/alpha/zrot.S | 631 + kernel/alpha/zscal.S | 255 + kernel/alpha/zswap.S | 244 + kernel/alpha/ztrsm_kernel_2x2_LN.S | 2237 +++ kernel/alpha/ztrsm_kernel_2x2_LT.S | 2230 +++ kernel/alpha/ztrsm_kernel_2x2_RT.S | 2230 +++ kernel/generic/cabs.c | 44 + kernel/generic/gemm_beta.c | 142 + kernel/generic/gemm_ncopy_1.c | 90 + kernel/generic/gemm_ncopy_16.c | 437 + kernel/generic/gemm_ncopy_2.c | 126 + kernel/generic/gemm_ncopy_4.c | 230 + kernel/generic/gemm_ncopy_8.c | 422 + kernel/generic/gemm_tcopy_1.c | 75 + kernel/generic/gemm_tcopy_16.c | 387 + 
kernel/generic/gemm_tcopy_2.c | 104 + kernel/generic/gemm_tcopy_4.c | 281 + kernel/generic/gemm_tcopy_8.c | 787 + kernel/generic/ger.c | 63 + kernel/generic/laswp_ncopy_1.c | 154 + kernel/generic/laswp_ncopy_2.c | 293 + kernel/generic/laswp_ncopy_4.c | 503 + kernel/generic/laswp_ncopy_8.c | 296 + kernel/generic/lsame.c | 50 + kernel/generic/neg_tcopy_1.c | 75 + kernel/generic/neg_tcopy_16.c | 387 + kernel/generic/neg_tcopy_2.c | 105 + kernel/generic/neg_tcopy_4.c | 281 + kernel/generic/neg_tcopy_8.c | 787 + kernel/generic/symm_lcopy_1.c | 76 + kernel/generic/symm_lcopy_16.c | 273 + kernel/generic/symm_lcopy_2.c | 102 + kernel/generic/symm_lcopy_4.c | 138 + kernel/generic/symm_lcopy_8.c | 188 + kernel/generic/symm_ucopy_1.c | 76 + kernel/generic/symm_ucopy_16.c | 274 + kernel/generic/symm_ucopy_2.c | 101 + kernel/generic/symm_ucopy_4.c | 136 + kernel/generic/symm_ucopy_8.c | 188 + kernel/generic/symv_k.c | 123 + kernel/generic/trmm_lncopy_1.c | 92 + kernel/generic/trmm_lncopy_16.c | 1543 ++ kernel/generic/trmm_lncopy_2.c | 198 + kernel/generic/trmm_lncopy_4.c | 484 + kernel/generic/trmm_lncopy_8.c | 1227 ++ kernel/generic/trmm_ltcopy_1.c | 92 + kernel/generic/trmm_ltcopy_16.c | 1547 ++ kernel/generic/trmm_ltcopy_2.c | 197 + kernel/generic/trmm_ltcopy_4.c | 488 + kernel/generic/trmm_ltcopy_8.c | 1219 ++ kernel/generic/trmm_uncopy_1.c | 91 + kernel/generic/trmm_uncopy_16.c | 1543 ++ kernel/generic/trmm_uncopy_2.c | 195 + kernel/generic/trmm_uncopy_4.c | 489 + kernel/generic/trmm_uncopy_8.c | 1226 ++ kernel/generic/trmm_utcopy_1.c | 90 + kernel/generic/trmm_utcopy_16.c | 1550 ++ kernel/generic/trmm_utcopy_2.c | 191 + kernel/generic/trmm_utcopy_4.c | 472 + kernel/generic/trmm_utcopy_8.c | 1276 ++ kernel/generic/trsm_kernel_LN.c | 333 + kernel/generic/trsm_kernel_LT.c | 317 + kernel/generic/trsm_kernel_RN.c | 315 + kernel/generic/trsm_kernel_RT.c | 341 + kernel/generic/trsm_lncopy_1.c | 90 + kernel/generic/trsm_lncopy_16.c | 271 + kernel/generic/trsm_lncopy_2.c | 154 + kernel/generic/trsm_lncopy_4.c | 326 + kernel/generic/trsm_lncopy_8.c | 841 + kernel/generic/trsm_ltcopy_1.c | 90 + kernel/generic/trsm_ltcopy_16.c | 228 + kernel/generic/trsm_ltcopy_2.c | 160 + kernel/generic/trsm_ltcopy_4.c | 346 + kernel/generic/trsm_ltcopy_8.c | 921 + kernel/generic/trsm_uncopy_1.c | 90 + kernel/generic/trsm_uncopy_16.c | 271 + kernel/generic/trsm_uncopy_2.c | 160 + kernel/generic/trsm_uncopy_4.c | 350 + kernel/generic/trsm_uncopy_8.c | 946 + kernel/generic/trsm_utcopy_1.c | 89 + kernel/generic/trsm_utcopy_16.c | 225 + kernel/generic/trsm_utcopy_2.c | 155 + kernel/generic/trsm_utcopy_4.c | 322 + kernel/generic/trsm_utcopy_8.c | 803 + kernel/generic/zgemm3m_ncopy_1.c | 89 + kernel/generic/zgemm3m_ncopy_2.c | 120 + kernel/generic/zgemm3m_ncopy_4.c | 153 + kernel/generic/zgemm3m_ncopy_8.c | 216 + kernel/generic/zgemm3m_tcopy_1.c | 89 + kernel/generic/zgemm3m_tcopy_2.c | 162 + kernel/generic/zgemm3m_tcopy_4.c | 352 + kernel/generic/zgemm3m_tcopy_8.c | 1072 ++ kernel/generic/zgemm_beta.c | 158 + kernel/generic/zgemm_ncopy_1.c | 107 + kernel/generic/zgemm_ncopy_2.c | 183 + kernel/generic/zgemm_ncopy_4.c | 387 + kernel/generic/zgemm_ncopy_8.c | 213 + kernel/generic/zgemm_tcopy_1.c | 121 + kernel/generic/zgemm_tcopy_2.c | 220 + kernel/generic/zgemm_tcopy_4.c | 403 + kernel/generic/zgemm_tcopy_8.c | 361 + kernel/generic/zger.c | 84 + kernel/generic/zhemm3m_lcopy_1.c | 105 + kernel/generic/zhemm3m_lcopy_2.c | 146 + kernel/generic/zhemm3m_lcopy_4.c | 217 + kernel/generic/zhemm3m_lcopy_8.c | 364 + 
kernel/generic/zhemm3m_ucopy_1.c | 106 + kernel/generic/zhemm3m_ucopy_2.c | 146 + kernel/generic/zhemm3m_ucopy_4.c | 217 + kernel/generic/zhemm3m_ucopy_8.c | 364 + kernel/generic/zhemm_ltcopy_1.c | 90 + kernel/generic/zhemm_ltcopy_2.c | 144 + kernel/generic/zhemm_ltcopy_4.c | 244 + kernel/generic/zhemm_ltcopy_8.c | 480 + kernel/generic/zhemm_utcopy_1.c | 88 + kernel/generic/zhemm_utcopy_2.c | 142 + kernel/generic/zhemm_utcopy_4.c | 242 + kernel/generic/zhemm_utcopy_8.c | 477 + kernel/generic/zhemv_k.c | 157 + kernel/generic/zlaswp_ncopy_1.c | 186 + kernel/generic/zlaswp_ncopy_2.c | 381 + kernel/generic/zlaswp_ncopy_4.c | 711 + kernel/generic/zneg_tcopy_1.c | 121 + kernel/generic/zneg_tcopy_2.c | 220 + kernel/generic/zneg_tcopy_4.c | 403 + kernel/generic/zneg_tcopy_8.c | 361 + kernel/generic/zsymm3m_lcopy_1.c | 99 + kernel/generic/zsymm3m_lcopy_2.c | 124 + kernel/generic/zsymm3m_lcopy_4.c | 157 + kernel/generic/zsymm3m_lcopy_8.c | 209 + kernel/generic/zsymm3m_ucopy_1.c | 98 + kernel/generic/zsymm3m_ucopy_2.c | 123 + kernel/generic/zsymm3m_ucopy_4.c | 158 + kernel/generic/zsymm3m_ucopy_8.c | 210 + kernel/generic/zsymm_lcopy_1.c | 81 + kernel/generic/zsymm_lcopy_2.c | 112 + kernel/generic/zsymm_lcopy_4.c | 157 + kernel/generic/zsymm_lcopy_8.c | 224 + kernel/generic/zsymm_ucopy_1.c | 80 + kernel/generic/zsymm_ucopy_2.c | 111 + kernel/generic/zsymm_ucopy_4.c | 155 + kernel/generic/zsymm_ucopy_8.c | 224 + kernel/generic/zsymv_k.c | 123 + kernel/generic/ztrmm_lncopy_1.c | 107 + kernel/generic/ztrmm_lncopy_2.c | 230 + kernel/generic/ztrmm_lncopy_4.c | 664 + kernel/generic/ztrmm_lncopy_8.c | 871 + kernel/generic/ztrmm_ltcopy_1.c | 104 + kernel/generic/ztrmm_ltcopy_2.c | 240 + kernel/generic/ztrmm_ltcopy_4.c | 685 + kernel/generic/ztrmm_ltcopy_8.c | 876 + kernel/generic/ztrmm_uncopy_1.c | 109 + kernel/generic/ztrmm_uncopy_2.c | 239 + kernel/generic/ztrmm_uncopy_4.c | 679 + kernel/generic/ztrmm_uncopy_8.c | 876 + kernel/generic/ztrmm_utcopy_1.c | 103 + kernel/generic/ztrmm_utcopy_2.c | 239 + kernel/generic/ztrmm_utcopy_4.c | 663 + kernel/generic/ztrmm_utcopy_8.c | 880 + kernel/generic/ztrsm_lncopy_1.c | 91 + kernel/generic/ztrsm_lncopy_2.c | 171 + kernel/generic/ztrsm_lncopy_4.c | 459 + kernel/generic/ztrsm_lncopy_8.c | 225 + kernel/generic/ztrsm_ltcopy_1.c | 91 + kernel/generic/ztrsm_ltcopy_2.c | 177 + kernel/generic/ztrsm_ltcopy_4.c | 479 + kernel/generic/ztrsm_ltcopy_8.c | 210 + kernel/generic/ztrsm_uncopy_1.c | 90 + kernel/generic/ztrsm_uncopy_2.c | 176 + kernel/generic/ztrsm_uncopy_4.c | 496 + kernel/generic/ztrsm_uncopy_8.c | 228 + kernel/generic/ztrsm_utcopy_1.c | 90 + kernel/generic/ztrsm_utcopy_2.c | 171 + kernel/generic/ztrsm_utcopy_4.c | 444 + kernel/generic/ztrsm_utcopy_8.c | 209 + kernel/ia64/KERNEL | 140 + kernel/ia64/Makefile | 1 + kernel/ia64/amax.S | 396 + kernel/ia64/asum.S | 388 + kernel/ia64/cabs.S | 58 + kernel/ia64/caxpy.S | 519 + kernel/ia64/copy.S | 873 + kernel/ia64/daxpy.S | 1504 ++ kernel/ia64/ddot.S | 1184 ++ kernel/ia64/gemm_beta.S | 512 + kernel/ia64/gemm_kernel.S | 8958 +++++++++ kernel/ia64/gemm_ncopy.S | 493 + kernel/ia64/gemm_tcopy.S | 1695 ++ kernel/ia64/gemv_n.S | 3317 ++++ kernel/ia64/gemv_t.S | 3557 ++++ kernel/ia64/iamax.S | 639 + kernel/ia64/izamax.S | 579 + kernel/ia64/lsame.S | 66 + kernel/ia64/nrm2.S | 310 + kernel/ia64/qaxpy.S | 509 + kernel/ia64/qcopy.S | 581 + kernel/ia64/qdot.S | 421 + kernel/ia64/qgemm_kernel.S | 8993 +++++++++ kernel/ia64/qgemv_n.S | 1676 ++ kernel/ia64/qgemv_t.S | 1287 ++ kernel/ia64/qscal.S | 693 + kernel/ia64/rot.S | 891 + 
kernel/ia64/saxpy.S | 1667 ++ kernel/ia64/scal.S | 950 + kernel/ia64/sdot.S | 1177 ++ kernel/ia64/sgemv_n.S | 3241 ++++ kernel/ia64/staticbuffer.S | 45 + kernel/ia64/swap.S | 577 + kernel/ia64/symv_U.S | 463 + kernel/ia64/trsm_kernel_LN.S | 14028 ++++++++++++++ kernel/ia64/trsm_kernel_LT.S | 11027 +++++++++++ kernel/ia64/trsm_kernel_RT.S | 16688 +++++++++++++++++ kernel/ia64/xcopy.S | 565 + kernel/ia64/xdot.S | 518 + kernel/ia64/zaxpy.S | 822 + kernel/ia64/zcopy.S | 1378 ++ kernel/ia64/zdot.S | 487 + kernel/ia64/zgemm3m_kernel.S | 6803 +++++++ kernel/ia64/zgemm_beta.S | 517 + kernel/ia64/zgemm_kernel.S | 6849 +++++++ kernel/ia64/zgemm_ncopy.S | 854 + kernel/ia64/zgemm_tcopy.S | 898 + kernel/ia64/zgemv_n.S | 2293 +++ kernel/ia64/zgemv_t.S | 2017 ++ kernel/ia64/zrot.S | 879 + kernel/ia64/zscal.S | 540 + kernel/ia64/zswap.S | 476 + kernel/ia64/ztrsm_kernel_LN.S | 10839 +++++++++++ kernel/ia64/ztrsm_kernel_LT.S | 10835 +++++++++++ kernel/ia64/ztrsm_kernel_RT.S | 10837 +++++++++++ kernel/mips64/KERNEL | 96 + kernel/mips64/Makefile | 2 + kernel/mips64/amax.S | 241 + kernel/mips64/amin.S | 241 + kernel/mips64/asum.S | 332 + kernel/mips64/axpy.S | 409 + kernel/mips64/cnrm2.S | 214 + kernel/mips64/copy.S | 277 + kernel/mips64/dnrm2.S | 397 + kernel/mips64/dot.S | 306 + kernel/mips64/gemm_beta.S | 205 + kernel/mips64/gemm_kernel.S | 2250 +++ kernel/mips64/gemv_n.S | 665 + kernel/mips64/gemv_t.S | 531 + kernel/mips64/iamax.S | 288 + kernel/mips64/iamin.S | 288 + kernel/mips64/imax.S | 262 + kernel/mips64/imin.S | 262 + kernel/mips64/izamax.S | 268 + kernel/mips64/izamin.S | 268 + kernel/mips64/max.S | 213 + kernel/mips64/min.S | 213 + kernel/mips64/rot.S | 367 + kernel/mips64/scal.S | 412 + kernel/mips64/snrm2.S | 337 + kernel/mips64/swap.S | 392 + kernel/mips64/symv_L.S | 658 + kernel/mips64/symv_U.S | 782 + kernel/mips64/trsm_kernel_LN.S | 3544 ++++ kernel/mips64/trsm_kernel_LT.S | 3527 ++++ kernel/mips64/trsm_kernel_RT.S | 3529 ++++ kernel/mips64/zamax.S | 245 + kernel/mips64/zamin.S | 245 + kernel/mips64/zasum.S | 204 + kernel/mips64/zaxpy.S | 438 + kernel/mips64/zcopy.S | 265 + kernel/mips64/zdot.S | 402 + kernel/mips64/zgemm3m_kernel.S | 1666 ++ kernel/mips64/zgemm_kernel.S | 1286 ++ kernel/mips64/zgemv_n.S | 777 + kernel/mips64/zgemv_t.S | 669 + kernel/mips64/znrm2.S | 378 + kernel/mips64/zrot.S | 350 + kernel/mips64/zscal.S | 441 + kernel/mips64/zswap.S | 361 + kernel/mips64/zsymv_L.S | 698 + kernel/mips64/zsymv_U.S | 717 + kernel/mips64/ztrsm_kernel_LT.S | 1685 ++ kernel/mips64/ztrsm_kernel_RT.S | 1684 ++ kernel/power/KERNEL | 86 + kernel/power/KERNEL.CELL | 76 + kernel/power/KERNEL.POWER3 | 2 + kernel/power/KERNEL.POWER4 | 1 + kernel/power/KERNEL.POWER5 | 56 + kernel/power/KERNEL.POWER6 | 56 + kernel/power/KERNEL.PPC440 | 118 + kernel/power/KERNEL.PPC440FP2 | 128 + kernel/power/KERNEL.PPC970 | 56 + kernel/power/KERNEL.PPCG4 | 118 + kernel/power/Makefile | 1 + kernel/power/amax.S | 523 + kernel/power/amax_cell.S | 691 + kernel/power/amax_hummer.S | 540 + kernel/power/amax_ppc440.S | 332 + kernel/power/amin.S | 523 + kernel/power/amin_cell.S | 691 + kernel/power/amin_hummer.S | 539 + kernel/power/amin_ppc440.S | 333 + kernel/power/asum.S | 448 + kernel/power/asum_cell.S | 599 + kernel/power/asum_hummer.S | 455 + kernel/power/asum_ppc440.S | 313 + kernel/power/axpy.S | 550 + kernel/power/axpy_hummer.S | 656 + kernel/power/axpy_ppc440.S | 337 + kernel/power/cabs.S | 54 + kernel/power/cnrm2.S | 418 + kernel/power/cnrm2_hummer.S | 812 + kernel/power/cnrm2_ppc440.S | 301 + kernel/power/copy.S | 
226 + kernel/power/copy_hummer.S | 958 + kernel/power/dnrm2_hummer.S | 1066 ++ kernel/power/dnrm2_ppc440.S | 556 + kernel/power/dot.S | 468 + kernel/power/dot_cell.S | 458 + kernel/power/dot_hummer.S | 879 + kernel/power/dot_ppc440.S | 301 + kernel/power/exfunc.S | 66 + kernel/power/gemm_beta.S | 253 + kernel/power/gemm_kernel.S | 2705 +++ kernel/power/gemm_kernel_altivec.S | 2708 +++ kernel/power/gemm_kernel_altivec_cell.S | 2711 +++ kernel/power/gemm_kernel_altivec_g4.S | 2647 +++ kernel/power/gemm_kernel_cell.S | 2642 +++ kernel/power/gemm_kernel_g4.S | 2412 +++ kernel/power/gemm_kernel_hummer.S | 7006 +++++++ kernel/power/gemm_kernel_power3.S | 1664 ++ kernel/power/gemm_kernel_power6.S | 2667 +++ kernel/power/gemm_kernel_ppc440.S | 2470 +++ kernel/power/gemm_ncopy_4.S | 366 + kernel/power/gemm_ncopy_hummer_4.S | 798 + kernel/power/gemm_ncopy_hummer_8.S | 1217 ++ kernel/power/gemm_tcopy_4.S | 452 + kernel/power/gemm_tcopy_hummer_4.S | 521 + kernel/power/gemm_tcopy_hummer_8.S | 1285 ++ kernel/power/gemv_hummer_n.S | 1780 ++ kernel/power/gemv_n.S | 3090 +++ kernel/power/gemv_n_ppc440.S | 1185 ++ kernel/power/gemv_t.S | 2964 +++ kernel/power/gemv_t_ppc440.S | 1089 ++ kernel/power/ger.S | 1209 ++ kernel/power/iamax.S | 802 + kernel/power/iamax_hummer.S | 1015 + kernel/power/iamax_ppc440.S | 482 + kernel/power/iamin.S | 803 + kernel/power/iamin_hummer.S | 1016 + kernel/power/iamin_ppc440.S | 482 + kernel/power/imax.S | 684 + kernel/power/imax_hummer.S | 867 + kernel/power/imax_ppc440.S | 429 + kernel/power/imin.S | 684 + kernel/power/imin_hummer.S | 867 + kernel/power/imin_ppc440.S | 414 + kernel/power/izamax.S | 919 + kernel/power/izamax_hummer.S | 566 + kernel/power/izamax_ppc440.S | 538 + kernel/power/izamin.S | 920 + kernel/power/izamin_hummer.S | 566 + kernel/power/izamin_ppc440.S | 538 + kernel/power/lock.c | 61 + kernel/power/lsame.S | 72 + kernel/power/max.S | 445 + kernel/power/max_hummer.S | 477 + kernel/power/max_ppc440.S | 284 + kernel/power/min.S | 445 + kernel/power/min_hummer.S | 477 + kernel/power/min_ppc440.S | 284 + kernel/power/nrm2.S | 908 + kernel/power/rot.S | 571 + kernel/power/rot_ppc440.S | 286 + kernel/power/scal.S | 401 + kernel/power/scal_hummer.S | 477 + kernel/power/scal_ppc440.S | 239 + kernel/power/snrm2.S | 412 + kernel/power/snrm2_hummer.S | 614 + kernel/power/snrm2_ppc440.S | 301 + kernel/power/staticbuffer.S | 45 + kernel/power/swap.S | 387 + kernel/power/swap_hummer.S | 703 + kernel/power/symv_L.S | 1521 ++ kernel/power/symv_U.S | 1506 ++ kernel/power/trsm_kernel_LN.S | 3652 ++++ kernel/power/trsm_kernel_LT.S | 3665 ++++ kernel/power/trsm_kernel_RT.S | 3679 ++++ kernel/power/trsm_kernel_cell_LN.S | 3666 ++++ kernel/power/trsm_kernel_cell_LT.S | 3680 ++++ kernel/power/trsm_kernel_cell_RT.S | 3675 ++++ kernel/power/trsm_kernel_hummer_LN.S | 5695 ++++++ kernel/power/trsm_kernel_hummer_LT.S | 5697 ++++++ kernel/power/trsm_kernel_hummer_RT.S | 5696 ++++++ kernel/power/trsm_kernel_power6_LN.S | 3688 ++++ kernel/power/trsm_kernel_power6_LT.S | 3676 ++++ kernel/power/trsm_kernel_power6_RT.S | 3696 ++++ kernel/power/trsm_kernel_ppc440_LN.S | 3487 ++++ kernel/power/trsm_kernel_ppc440_LT.S | 3477 ++++ kernel/power/trsm_kernel_ppc440_RT.S | 3496 ++++ kernel/power/zamax.S | 505 + kernel/power/zamax_cell.S | 495 + kernel/power/zamax_hummer.S | 347 + kernel/power/zamax_ppc440.S | 319 + kernel/power/zamin.S | 505 + kernel/power/zamin_cell.S | 495 + kernel/power/zamin_hummer.S | 347 + kernel/power/zamin_ppc440.S | 317 + kernel/power/zasum.S | 456 + 
kernel/power/zasum_cell.S | 581 + kernel/power/zasum_hummer.S | 583 + kernel/power/zasum_ppc440.S | 321 + kernel/power/zaxpy.S | 683 + kernel/power/zaxpy_hummer.S | 503 + kernel/power/zaxpy_ppc440.S | 413 + kernel/power/zcopy.S | 237 + kernel/power/zcopy_hummer.S | 652 + kernel/power/zdot.S | 654 + kernel/power/zdot_cell.S | 617 + kernel/power/zdot_hummer.S | 529 + kernel/power/zdot_ppc440.S | 441 + kernel/power/zgemm_beta.S | 249 + kernel/power/zgemm_kernel.S | 1837 ++ kernel/power/zgemm_kernel_altivec.S | 1703 ++ kernel/power/zgemm_kernel_altivec_cell.S | 1858 ++ kernel/power/zgemm_kernel_altivec_g4.S | 1757 ++ kernel/power/zgemm_kernel_cell.S | 1784 ++ kernel/power/zgemm_kernel_g4.S | 1637 ++ kernel/power/zgemm_kernel_hummer.S | 4428 +++++ kernel/power/zgemm_kernel_power3.S | 1260 ++ kernel/power/zgemm_kernel_power6.S | 2937 +++ kernel/power/zgemm_kernel_ppc440.S | 1700 ++ kernel/power/zgemm_ncopy_hummer_2.S | 451 + kernel/power/zgemm_ncopy_hummer_4.S | 666 + kernel/power/zgemm_tcopy_hummer_2.S | 308 + kernel/power/zgemm_tcopy_hummer_4.S | 705 + kernel/power/zgemv_n.S | 4290 +++++ kernel/power/zgemv_n_ppc440.S | 1386 ++ kernel/power/zgemv_t.S | 1522 ++ kernel/power/zgemv_t_ppc440.S | 1294 ++ kernel/power/zger.S | 1357 ++ kernel/power/znrm2.S | 924 + kernel/power/znrm2_hummer.S | 1018 + kernel/power/znrm2_ppc440.S | 564 + kernel/power/zrot.S | 595 + kernel/power/zrot_ppc440.S | 301 + kernel/power/zscal.S | 385 + kernel/power/zscal_hummer.S | 871 + kernel/power/zscal_ppc440.S | 276 + kernel/power/zswap.S | 414 + kernel/power/zswap_hummer.S | 665 + kernel/power/zsymv_L.S | 1673 ++ kernel/power/zsymv_U.S | 1653 ++ kernel/power/ztrsm_kernel_LN.S | 2288 +++ kernel/power/ztrsm_kernel_LT.S | 2288 +++ kernel/power/ztrsm_kernel_RT.S | 2289 +++ kernel/power/ztrsm_kernel_cell_LN.S | 2252 +++ kernel/power/ztrsm_kernel_cell_LT.S | 2277 +++ kernel/power/ztrsm_kernel_cell_RT.S | 2249 +++ kernel/power/ztrsm_kernel_hummer_LN.S | 2963 +++ kernel/power/ztrsm_kernel_hummer_LT.S | 2962 +++ kernel/power/ztrsm_kernel_hummer_RT.S | 2962 +++ kernel/power/ztrsm_kernel_power6_LN.S | 4720 +++++ kernel/power/ztrsm_kernel_power6_LT.S | 4697 +++++ kernel/power/ztrsm_kernel_power6_RT.S | 4696 +++++ kernel/power/ztrsm_kernel_ppc440_LN.S | 2256 +++ kernel/power/ztrsm_kernel_ppc440_LT.S | 2208 +++ kernel/power/ztrsm_kernel_ppc440_RT.S | 2209 +++ kernel/setparam-ref.c | 819 + kernel/sparc/KERNEL | 69 + kernel/sparc/KERNEL.sparc | 56 + kernel/sparc/KERNEL.sparcv7 | 59 + kernel/sparc/Makefile | 2 + kernel/sparc/amax.S | 380 + kernel/sparc/asum.S | 325 + kernel/sparc/axpy.S | 503 + kernel/sparc/cabs.S | 58 + kernel/sparc/cnrm2.S | 329 + kernel/sparc/copy.S | 218 + kernel/sparc/dnrm2.S | 675 + kernel/sparc/dot.S | 423 + kernel/sparc/gemm_kernel.S | 3054 +++ kernel/sparc/gemm_kernel_2x8.S | 2561 +++ kernel/sparc/gemm_ncopy.S | 309 + kernel/sparc/gemm_ncopy_2.S | 235 + kernel/sparc/gemm_ncopy_8.S | 921 + kernel/sparc/gemm_tcopy.S | 376 + kernel/sparc/gemm_tcopy_2.S | 298 + kernel/sparc/gemv_n.S | 1400 ++ kernel/sparc/gemv_t.S | 705 + kernel/sparc/ger.S | 464 + kernel/sparc/iamax.S | 456 + kernel/sparc/imax.S | 419 + kernel/sparc/izamax.S | 425 + kernel/sparc/lsame.S | 66 + kernel/sparc/max.S | 339 + kernel/sparc/rot.S | 668 + kernel/sparc/scal.S | 398 + kernel/sparc/snrm2.S | 334 + kernel/sparc/staticbuffer.S | 45 + kernel/sparc/swap.S | 346 + kernel/sparc/trsm_kernel_LN.S | 4254 +++++ kernel/sparc/trsm_kernel_LN_2x8.S | 3897 ++++ kernel/sparc/trsm_kernel_LT.S | 4221 +++++ kernel/sparc/trsm_kernel_LT_2x8.S | 3896 ++++ 
kernel/sparc/trsm_kernel_RT.S | 4227 +++++ kernel/sparc/trsm_kernel_RT_2x8.S | 3896 ++++ kernel/sparc/zamax.S | 374 + kernel/sparc/zasum.S | 327 + kernel/sparc/zaxpy.S | 594 + kernel/sparc/zcopy.S | 196 + kernel/sparc/zdot.S | 545 + kernel/sparc/zgemm_kernel.S | 1917 ++ kernel/sparc/zgemm_kernel_1x4.S | 1599 ++ kernel/sparc/zgemm_ncopy.S | 250 + kernel/sparc/zgemm_tcopy.S | 305 + kernel/sparc/zgemv_n.S | 1176 ++ kernel/sparc/zgemv_t.S | 1737 ++ kernel/sparc/znrm2.S | 665 + kernel/sparc/zrot.S | 673 + kernel/sparc/zscal.S | 518 + kernel/sparc/zswap.S | 342 + kernel/sparc/ztrsm_kernel_LN.S | 2395 +++ kernel/sparc/ztrsm_kernel_LT.S | 2389 +++ kernel/sparc/ztrsm_kernel_LT_1x4.S | 2131 +++ kernel/sparc/ztrsm_kernel_RT.S | 2389 +++ kernel/sparc/ztrsm_kernel_RT_1x4.S | 2132 +++ kernel/x86/KERNEL | 398 + kernel/x86/KERNEL.ATHLON | 63 + kernel/x86/KERNEL.ATOM | 59 + kernel/x86/KERNEL.BANIAS | 59 + kernel/x86/KERNEL.BARCELONA | 59 + kernel/x86/KERNEL.COPPERMINE | 59 + kernel/x86/KERNEL.CORE2 | 59 + kernel/x86/KERNEL.DUNNINGTON | 59 + kernel/x86/KERNEL.KATMAI | 1 + kernel/x86/KERNEL.NANO | 1 + kernel/x86/KERNEL.NEHALEM | 1 + kernel/x86/KERNEL.NORTHWOOD | 60 + kernel/x86/KERNEL.OPTERON | 59 + kernel/x86/KERNEL.OPTERON_SSE3 | 1 + kernel/x86/KERNEL.P5 | 2 + kernel/x86/KERNEL.P6 | 60 + kernel/x86/KERNEL.PENRYN | 59 + kernel/x86/KERNEL.PRESCOTT | 59 + kernel/x86/KERNEL.VIAC3 | 1 + kernel/x86/KERNEL.YONAH | 59 + kernel/x86/Makefile | 2 + kernel/x86/amax.S | 315 + kernel/x86/amax_sse.S | 510 + kernel/x86/amax_sse2.S | 518 + kernel/x86/asum.S | 225 + kernel/x86/asum_sse.S | 366 + kernel/x86/asum_sse2.S | 318 + kernel/x86/axpy.S | 247 + kernel/x86/axpy_sse.S | 1551 ++ kernel/x86/axpy_sse2.S | 799 + kernel/x86/axpy_sse2_opteron.S | 496 + kernel/x86/cabs.S | 57 + kernel/x86/copy.S | 213 + kernel/x86/copy_sse.S | 962 + kernel/x86/copy_sse2.S | 655 + kernel/x86/cpuid.S | 64 + kernel/x86/dot.S | 219 + kernel/x86/dot_amd.S | 236 + kernel/x86/dot_sse.S | 1320 ++ kernel/x86/dot_sse2.S | 728 + kernel/x86/dot_sse2_opteron.S | 368 + kernel/x86/dot_sse_opteron.S | 411 + kernel/x86/gemm_beta.S | 224 + kernel/x86/gemm_kernel_1x4.S | 907 + kernel/x86/gemm_kernel_2x2.S | 697 + kernel/x86/gemm_kernel_2x2_atom.S | 736 + kernel/x86/gemm_kernel_2x4_3dnow.S | 1917 ++ kernel/x86/gemm_kernel_2x4_barcelona.S | 1268 ++ kernel/x86/gemm_kernel_2x4_core2.S | 1318 ++ kernel/x86/gemm_kernel_2x4_penryn.S | 1367 ++ kernel/x86/gemm_kernel_2x4_sse2.S | 1790 ++ kernel/x86/gemm_kernel_2x4_sse3.S | 1635 ++ kernel/x86/gemm_kernel_4x2_core2.S | 1304 ++ kernel/x86/gemm_kernel_4x2_sse2.S | 1539 ++ kernel/x86/gemm_kernel_4x4_barcelona.S | 2151 +++ kernel/x86/gemm_kernel_4x4_penryn.S | 1831 ++ kernel/x86/gemm_kernel_4x4_sse.S | 2589 +++ kernel/x86/gemm_kernel_4x4_sse3.S | 2090 +++ kernel/x86/gemm_kernel_8x1_sse2.S | 878 + kernel/x86/gemm_kernel_8x2_core2.S | 1622 ++ kernel/x86/gemm_kernel_8x2_sse.S | 2746 +++ kernel/x86/gemm_ncopy_2.S | 274 + kernel/x86/gemm_ncopy_2_sse.S | 215 + kernel/x86/gemm_ncopy_4_sse.S | 315 + kernel/x86/gemm_tcopy_2.S | 305 + kernel/x86/gemm_tcopy_2_sse.S | 236 + kernel/x86/gemm_tcopy_4_sse.S | 305 + kernel/x86/gemv_n.S | 477 + kernel/x86/gemv_n_atom.S | 774 + kernel/x86/gemv_n_sse.S | 662 + kernel/x86/gemv_n_sse2.S | 686 + kernel/x86/gemv_t.S | 583 + kernel/x86/gemv_t_atom.S | 616 + kernel/x86/gemv_t_sse.S | 637 + kernel/x86/gemv_t_sse2.S | 569 + kernel/x86/iamax.S | 364 + kernel/x86/iamax_sse.S | 968 + kernel/x86/iamax_sse2.S | 1152 ++ kernel/x86/izamax.S | 289 + kernel/x86/izamax_sse.S | 596 + kernel/x86/izamax_sse2.S | 619 
+ kernel/x86/lsame.S | 90 + kernel/x86/nrm2.S | 226 + kernel/x86/nrm2_sse.S | 418 + kernel/x86/qaxpy.S | 254 + kernel/x86/qconjg.S | 60 + kernel/x86/qdot.S | 229 + kernel/x86/qgemm_kernel_2x2.S | 810 + kernel/x86/qgemv_n.S | 477 + kernel/x86/qgemv_t.S | 585 + kernel/x86/qtrsm_kernel_LN_2x2.S | 1231 ++ kernel/x86/qtrsm_kernel_LT_2x2.S | 1229 ++ kernel/x86/qtrsm_kernel_RT_2x2.S | 1231 ++ kernel/x86/rot.S | 388 + kernel/x86/rot_sse.S | 1119 ++ kernel/x86/rot_sse2.S | 960 + kernel/x86/scal.S | 352 + kernel/x86/scal_sse.S | 637 + kernel/x86/scal_sse2.S | 556 + kernel/x86/staticbuffer.S | 49 + kernel/x86/swap.S | 210 + kernel/x86/swap_sse.S | 1139 ++ kernel/x86/swap_sse2.S | 572 + kernel/x86/trsm_kernel_LN_2x2.S | 1127 ++ kernel/x86/trsm_kernel_LN_2x2_atom.S | 1145 ++ kernel/x86/trsm_kernel_LN_2x4_penryn.S | 2076 ++ kernel/x86/trsm_kernel_LN_2x4_sse2.S | 2584 +++ kernel/x86/trsm_kernel_LN_2x4_sse3.S | 2031 ++ kernel/x86/trsm_kernel_LN_4x2_core2.S | 2100 +++ kernel/x86/trsm_kernel_LN_4x2_sse2.S | 2293 +++ kernel/x86/trsm_kernel_LN_4x4_penryn.S | 3129 ++++ kernel/x86/trsm_kernel_LN_4x4_sse.S | 3691 ++++ kernel/x86/trsm_kernel_LN_8x2_sse.S | 3605 ++++ kernel/x86/trsm_kernel_LT_1x4.S | 1251 ++ kernel/x86/trsm_kernel_LT_2x2.S | 1104 ++ kernel/x86/trsm_kernel_LT_2x2_atom.S | 1145 ++ kernel/x86/trsm_kernel_LT_2x4_penryn.S | 2071 ++ kernel/x86/trsm_kernel_LT_2x4_sse2.S | 2583 +++ kernel/x86/trsm_kernel_LT_2x4_sse3.S | 2030 ++ kernel/x86/trsm_kernel_LT_4x2_core2.S | 2100 +++ kernel/x86/trsm_kernel_LT_4x2_sse2.S | 2280 +++ kernel/x86/trsm_kernel_LT_4x4_penryn.S | 3129 ++++ kernel/x86/trsm_kernel_LT_4x4_sse.S | 3690 ++++ kernel/x86/trsm_kernel_LT_8x2_sse.S | 3604 ++++ kernel/x86/trsm_kernel_RT_1x4.S | 1251 ++ kernel/x86/trsm_kernel_RT_2x2.S | 1102 ++ kernel/x86/trsm_kernel_RT_2x2_atom.S | 1145 ++ kernel/x86/trsm_kernel_RT_2x4_penryn.S | 2075 ++ kernel/x86/trsm_kernel_RT_2x4_sse2.S | 2586 +++ kernel/x86/trsm_kernel_RT_2x4_sse3.S | 2030 ++ kernel/x86/trsm_kernel_RT_4x2_core2.S | 2100 +++ kernel/x86/trsm_kernel_RT_4x2_sse2.S | 2282 +++ kernel/x86/trsm_kernel_RT_4x4_penryn.S | 3128 +++ kernel/x86/trsm_kernel_RT_4x4_sse.S | 3683 ++++ kernel/x86/trsm_kernel_RT_8x2_sse.S | 3607 ++++ kernel/x86/xaxpy.S | 356 + kernel/x86/xdot.S | 331 + kernel/x86/xgemm3m_kernel_2x2.S | 796 + kernel/x86/xgemm_kernel_1x1.S | 374 + kernel/x86/xgemv_n.S | 350 + kernel/x86/xgemv_t.S | 369 + kernel/x86/xtrsm_kernel_LT_1x1.S | 493 + kernel/x86/zamax.S | 261 + kernel/x86/zamax_sse.S | 387 + kernel/x86/zamax_sse2.S | 373 + kernel/x86/zasum.S | 228 + kernel/x86/zasum_sse.S | 341 + kernel/x86/zasum_sse2.S | 320 + kernel/x86/zaxpy.S | 348 + kernel/x86/zaxpy_sse.S | 3103 +++ kernel/x86/zaxpy_sse2.S | 1522 ++ kernel/x86/zcopy.S | 250 + kernel/x86/zcopy_sse.S | 994 + kernel/x86/zcopy_sse2.S | 668 + kernel/x86/zdot.S | 310 + kernel/x86/zdot_amd.S | 377 + kernel/x86/zdot_sse.S | 3457 ++++ kernel/x86/zdot_sse2.S | 1543 ++ kernel/x86/zgemm3m_kernel_1x4_athlon.S | 979 + kernel/x86/zgemm3m_kernel_2x2_atom.S | 734 + kernel/x86/zgemm3m_kernel_2x2_coppermine.S | 722 + kernel/x86/zgemm3m_kernel_2x4_barcelona.S | 1291 ++ kernel/x86/zgemm3m_kernel_2x4_opteron.S | 1803 ++ kernel/x86/zgemm3m_kernel_2x4_penryn.S | 1344 ++ kernel/x86/zgemm3m_kernel_2x4_prescott.S | 1590 ++ kernel/x86/zgemm3m_kernel_4x2_core2.S | 1328 ++ kernel/x86/zgemm3m_kernel_4x2_northwood.S | 1522 ++ kernel/x86/zgemm3m_kernel_4x4_barcelona.S | 2153 +++ kernel/x86/zgemm3m_kernel_4x4_opteron.S | 2532 +++ kernel/x86/zgemm3m_kernel_4x4_penryn.S | 1780 ++ kernel/x86/zgemm3m_kernel_4x4_prescott.S 
| 2060 ++ kernel/x86/zgemm3m_kernel_8x2_core2.S | 1628 ++ kernel/x86/zgemm3m_kernel_8x2_sse.S | 2803 +++ kernel/x86/zgemm_beta.S | 242 + kernel/x86/zgemm_kernel_1x1.S | 450 + kernel/x86/zgemm_kernel_1x1_atom.S | 351 + kernel/x86/zgemm_kernel_1x2.S | 813 + kernel/x86/zgemm_kernel_1x2_3dnow.S | 958 + kernel/x86/zgemm_kernel_1x2_barcelona.S | 728 + kernel/x86/zgemm_kernel_1x2_penryn.S | 701 + kernel/x86/zgemm_kernel_1x2_sse2.S | 909 + kernel/x86/zgemm_kernel_1x2_sse3.S | 857 + kernel/x86/zgemm_kernel_2x1_core2.S | 695 + kernel/x86/zgemm_kernel_2x1_sse2.S | 824 + kernel/x86/zgemm_kernel_2x2_barcelona.S | 1363 ++ kernel/x86/zgemm_kernel_2x2_penryn.S | 1210 ++ kernel/x86/zgemm_kernel_2x2_sse.S | 1562 ++ kernel/x86/zgemm_kernel_2x2_sse3.S | 1365 ++ kernel/x86/zgemm_kernel_4x1_core2.S | 872 + kernel/x86/zgemm_kernel_4x1_sse.S | 1508 ++ kernel/x86/zgemm_ncopy_2.S | 268 + kernel/x86/zgemm_tcopy_2.S | 174 + kernel/x86/zgemv_n.S | 367 + kernel/x86/zgemv_n_atom.S | 545 + kernel/x86/zgemv_n_sse.S | 604 + kernel/x86/zgemv_n_sse2.S | 467 + kernel/x86/zgemv_t.S | 386 + kernel/x86/zgemv_t_atom.S | 445 + kernel/x86/zgemv_t_sse.S | 522 + kernel/x86/zgemv_t_sse2.S | 404 + kernel/x86/znrm2.S | 228 + kernel/x86/znrm2_sse.S | 465 + kernel/x86/zrot.S | 407 + kernel/x86/zrot_sse.S | 1391 ++ kernel/x86/zrot_sse2.S | 1665 ++ kernel/x86/zscal.S | 318 + kernel/x86/zscal_sse.S | 1389 ++ kernel/x86/zscal_sse2.S | 1745 ++ kernel/x86/zswap.S | 248 + kernel/x86/zswap_sse.S | 1112 ++ kernel/x86/zswap_sse2.S | 978 + kernel/x86/ztrsm_kernel_LN_2x1_core2.S | 1057 ++ kernel/x86/ztrsm_kernel_LN_2x1_sse2.S | 1163 ++ kernel/x86/ztrsm_kernel_LN_2x2_penryn.S | 1966 ++ kernel/x86/ztrsm_kernel_LN_2x2_sse.S | 2201 +++ kernel/x86/ztrsm_kernel_LN_4x1_sse.S | 1893 ++ kernel/x86/ztrsm_kernel_LT_1x1.S | 493 + kernel/x86/ztrsm_kernel_LT_1x1_atom.S | 453 + kernel/x86/ztrsm_kernel_LT_1x2_penryn.S | 969 + kernel/x86/ztrsm_kernel_LT_1x2_sse2.S | 1328 ++ kernel/x86/ztrsm_kernel_LT_1x2_sse3.S | 965 + kernel/x86/ztrsm_kernel_LT_2x1_core2.S | 1056 ++ kernel/x86/ztrsm_kernel_LT_2x1_sse2.S | 1164 ++ kernel/x86/ztrsm_kernel_LT_2x2_penryn.S | 1966 ++ kernel/x86/ztrsm_kernel_LT_2x2_sse.S | 2201 +++ kernel/x86/ztrsm_kernel_LT_4x1_sse.S | 1898 ++ kernel/x86/ztrsm_kernel_RT_1x2_penryn.S | 969 + kernel/x86/ztrsm_kernel_RT_1x2_sse2.S | 1325 ++ kernel/x86/ztrsm_kernel_RT_1x2_sse3.S | 965 + kernel/x86/ztrsm_kernel_RT_2x2_penryn.S | 1966 ++ kernel/x86/ztrsm_kernel_RT_2x2_sse.S | 2202 +++ kernel/x86_64/KERNEL | 456 + kernel/x86_64/KERNEL.ATOM | 85 + kernel/x86_64/KERNEL.BARCELONA | 62 + kernel/x86_64/KERNEL.CORE2 | 60 + kernel/x86_64/KERNEL.DUNNINGTON | 59 + kernel/x86_64/KERNEL.NANO | 59 + kernel/x86_64/KERNEL.NEHALEM | 59 + kernel/x86_64/KERNEL.OPTERON | 59 + kernel/x86_64/KERNEL.OPTERON_SSE3 | 62 + kernel/x86_64/KERNEL.PENRYN | 59 + kernel/x86_64/KERNEL.PRESCOTT | 63 + kernel/x86_64/Makefile | 2 + kernel/x86_64/amax.S | 307 + kernel/x86_64/amax_atom.S | 460 + kernel/x86_64/amax_sse.S | 475 + kernel/x86_64/amax_sse2.S | 498 + kernel/x86_64/asum.S | 197 + kernel/x86_64/asum_atom.S | 433 + kernel/x86_64/asum_sse.S | 345 + kernel/x86_64/asum_sse2.S | 311 + kernel/x86_64/axpy.S | 224 + kernel/x86_64/axpy_atom.S | 555 + kernel/x86_64/axpy_sse.S | 1576 ++ kernel/x86_64/axpy_sse2.S | 906 + kernel/x86_64/builtin_stinit.S | 61 + kernel/x86_64/cabs.S | 70 + kernel/x86_64/cgemv_n.S | 4302 +++++ kernel/x86_64/cgemv_t.S | 4378 +++++ kernel/x86_64/copy.S | 366 + kernel/x86_64/copy_sse.S | 959 + kernel/x86_64/copy_sse2.S | 650 + kernel/x86_64/dgemm_ncopy_2.S | 597 + 
kernel/x86_64/dgemm_ncopy_4.S | 1237 ++ kernel/x86_64/dgemm_ncopy_8.S | 2002 ++ kernel/x86_64/dgemm_tcopy_2.S | 334 + kernel/x86_64/dgemm_tcopy_4.S | 516 + kernel/x86_64/dgemm_tcopy_8.S | 780 + kernel/x86_64/dgemv_n.S | 2843 +++ kernel/x86_64/dgemv_n_atom.S | 788 + kernel/x86_64/dgemv_t.S | 2490 +++ kernel/x86_64/dgemv_t_atom.S | 686 + kernel/x86_64/dot.S | 184 + kernel/x86_64/dot_atom.S | 299 + kernel/x86_64/dot_sse.S | 1293 ++ kernel/x86_64/dot_sse2.S | 714 + kernel/x86_64/gemm_beta.S | 239 + kernel/x86_64/gemm_kernel_2x8_nehalem.S | 1849 ++ kernel/x86_64/gemm_kernel_4x2_atom.S | 1385 ++ kernel/x86_64/gemm_kernel_4x4_barcelona.S | 2093 +++ kernel/x86_64/gemm_kernel_4x4_core2.S | 2221 +++ kernel/x86_64/gemm_kernel_4x4_penryn.S | 2072 ++ kernel/x86_64/gemm_kernel_4x4_sse2.S | 2707 +++ kernel/x86_64/gemm_kernel_4x4_sse3.S | 2561 +++ kernel/x86_64/gemm_kernel_4x8_nano.S | 2479 +++ kernel/x86_64/gemm_kernel_4x8_nehalem.S | 2397 +++ kernel/x86_64/gemm_kernel_8x4_barcelona.S | 3253 ++++ kernel/x86_64/gemm_kernel_8x4_core2.S | 2615 +++ kernel/x86_64/gemm_kernel_8x4_penryn.S | 2515 +++ kernel/x86_64/gemm_kernel_8x4_sse.S | 3446 ++++ kernel/x86_64/gemm_kernel_8x4_sse3.S | 3022 +++ kernel/x86_64/gemm_ncopy_2.S | 290 + kernel/x86_64/gemm_ncopy_4.S | 470 + kernel/x86_64/gemm_ncopy_4_opteron.S | 388 + kernel/x86_64/gemm_tcopy_2.S | 276 + kernel/x86_64/gemm_tcopy_4.S | 544 + kernel/x86_64/gemm_tcopy_4_opteron.S | 476 + kernel/x86_64/iamax.S | 352 + kernel/x86_64/iamax_sse.S | 1020 + kernel/x86_64/iamax_sse2.S | 1136 ++ kernel/x86_64/izamax.S | 270 + kernel/x86_64/izamax_sse.S | 554 + kernel/x86_64/izamax_sse2.S | 597 + kernel/x86_64/lsame.S | 72 + kernel/x86_64/mcount.S | 46 + kernel/x86_64/nrm2.S | 206 + kernel/x86_64/nrm2_sse.S | 316 + kernel/x86_64/qconjg.S | 54 + kernel/x86_64/qdot.S | 208 + kernel/x86_64/qgemm_kernel_2x2.S | 810 + kernel/x86_64/qgemv_n.S | 410 + kernel/x86_64/qgemv_t.S | 466 + kernel/x86_64/qtrsm_kernel_LN_2x2.S | 1234 ++ kernel/x86_64/qtrsm_kernel_LT_2x2.S | 1234 ++ kernel/x86_64/qtrsm_kernel_RT_2x2.S | 1234 ++ kernel/x86_64/rot.S | 348 + kernel/x86_64/rot_sse.S | 1090 ++ kernel/x86_64/rot_sse2.S | 986 + kernel/x86_64/scal.S | 302 + kernel/x86_64/scal_atom.S | 446 + kernel/x86_64/scal_sse.S | 612 + kernel/x86_64/scal_sse2.S | 588 + kernel/x86_64/sgemv_n.S | 6018 ++++++ kernel/x86_64/sgemv_t.S | 6370 +++++++ kernel/x86_64/staticbuffer.S | 45 + kernel/x86_64/swap.S | 439 + kernel/x86_64/swap_sse.S | 1160 ++ kernel/x86_64/swap_sse2.S | 585 + kernel/x86_64/symv_L_sse.S | 1029 + kernel/x86_64/symv_L_sse2.S | 978 + kernel/x86_64/symv_U_sse.S | 1059 ++ kernel/x86_64/symv_U_sse2.S | 976 + kernel/x86_64/trsm_kernel_LN_2x8_nehalem.S | 3075 +++ kernel/x86_64/trsm_kernel_LN_4x2_atom.S | 2116 +++ kernel/x86_64/trsm_kernel_LN_4x4_barcelona.S | 3390 ++++ kernel/x86_64/trsm_kernel_LN_4x4_core2.S | 3739 ++++ kernel/x86_64/trsm_kernel_LN_4x4_penryn.S | 3425 ++++ kernel/x86_64/trsm_kernel_LN_4x4_sse2.S | 4150 ++++ kernel/x86_64/trsm_kernel_LN_4x4_sse3.S | 3873 ++++ kernel/x86_64/trsm_kernel_LN_4x8_nehalem.S | 4847 +++++ kernel/x86_64/trsm_kernel_LN_8x4_sse.S | 5950 ++++++ kernel/x86_64/trsm_kernel_LT_2x8_nehalem.S | 3077 +++ kernel/x86_64/trsm_kernel_LT_4x2_atom.S | 2116 +++ kernel/x86_64/trsm_kernel_LT_4x4_barcelona.S | 3396 ++++ kernel/x86_64/trsm_kernel_LT_4x4_core2.S | 3730 ++++ kernel/x86_64/trsm_kernel_LT_4x4_penryn.S | 3424 ++++ kernel/x86_64/trsm_kernel_LT_4x4_sse2.S | 4169 ++++ kernel/x86_64/trsm_kernel_LT_4x4_sse3.S | 3856 ++++ kernel/x86_64/trsm_kernel_LT_4x8_nehalem.S | 4847 +++++ 
kernel/x86_64/trsm_kernel_LT_8x4_sse.S | 5949 ++++++ kernel/x86_64/trsm_kernel_RT_2x8_nehalem.S | 3077 +++ kernel/x86_64/trsm_kernel_RT_4x2_atom.S | 2116 +++ kernel/x86_64/trsm_kernel_RT_4x4_barcelona.S | 3393 ++++ kernel/x86_64/trsm_kernel_RT_4x4_core2.S | 3737 ++++ kernel/x86_64/trsm_kernel_RT_4x4_penryn.S | 3426 ++++ kernel/x86_64/trsm_kernel_RT_4x4_sse2.S | 4134 ++++ kernel/x86_64/trsm_kernel_RT_4x4_sse3.S | 3844 ++++ kernel/x86_64/trsm_kernel_RT_4x8_nehalem.S | 4847 +++++ kernel/x86_64/trsm_kernel_RT_8x4_sse.S | 5975 ++++++ kernel/x86_64/xdot.S | 290 + kernel/x86_64/xgemm3m_kernel_2x2.S | 877 + kernel/x86_64/xgemm_kernel_1x1.S | 374 + kernel/x86_64/xgemv_n.S | 334 + kernel/x86_64/xgemv_t.S | 338 + kernel/x86_64/xtrsm_kernel_LT_1x1.S | 486 + kernel/x86_64/zamax.S | 241 + kernel/x86_64/zamax_atom.S | 336 + kernel/x86_64/zamax_sse.S | 309 + kernel/x86_64/zamax_sse2.S | 341 + kernel/x86_64/zasum.S | 200 + kernel/x86_64/zasum_atom.S | 411 + kernel/x86_64/zasum_sse.S | 332 + kernel/x86_64/zasum_sse2.S | 318 + kernel/x86_64/zaxpy.S | 336 + kernel/x86_64/zaxpy_atom.S | 675 + kernel/x86_64/zaxpy_sse.S | 3118 +++ kernel/x86_64/zaxpy_sse2.S | 1793 ++ kernel/x86_64/zcopy.S | 389 + kernel/x86_64/zcopy_sse.S | 992 + kernel/x86_64/zcopy_sse2.S | 655 + kernel/x86_64/zdot.S | 259 + kernel/x86_64/zdot_atom.S | 461 + kernel/x86_64/zdot_sse.S | 3492 ++++ kernel/x86_64/zdot_sse2.S | 1550 ++ kernel/x86_64/zgemm3m_kernel_2x8_nehalem.S | 1933 ++ kernel/x86_64/zgemm3m_kernel_4x2_atom.S | 1215 ++ kernel/x86_64/zgemm3m_kernel_4x4_barcelona.S | 2467 +++ kernel/x86_64/zgemm3m_kernel_4x4_core2.S | 2282 +++ kernel/x86_64/zgemm3m_kernel_4x4_penryn.S | 2131 +++ kernel/x86_64/zgemm3m_kernel_4x4_sse2.S | 2820 +++ kernel/x86_64/zgemm3m_kernel_4x4_sse3.S | 2622 +++ kernel/x86_64/zgemm3m_kernel_4x8_nehalem.S | 2472 +++ kernel/x86_64/zgemm3m_kernel_8x4_barcelona.S | 3253 ++++ kernel/x86_64/zgemm3m_kernel_8x4_core2.S | 2675 +++ kernel/x86_64/zgemm3m_kernel_8x4_penryn.S | 2593 +++ kernel/x86_64/zgemm3m_kernel_8x4_sse.S | 3498 ++++ kernel/x86_64/zgemm3m_kernel_8x4_sse3.S | 3075 +++ kernel/x86_64/zgemm_beta.S | 260 + kernel/x86_64/zgemm_kernel_1x4_nehalem.S | 1093 ++ kernel/x86_64/zgemm_kernel_2x1_atom.S | 769 + kernel/x86_64/zgemm_kernel_2x2_barcelona.S | 1423 ++ kernel/x86_64/zgemm_kernel_2x2_core2.S | 1353 ++ kernel/x86_64/zgemm_kernel_2x2_penryn.S | 1297 ++ kernel/x86_64/zgemm_kernel_2x2_sse2.S | 1829 ++ kernel/x86_64/zgemm_kernel_2x2_sse3.S | 1539 ++ kernel/x86_64/zgemm_kernel_2x4_nehalem.S | 1628 ++ kernel/x86_64/zgemm_kernel_4x2_barcelona.S | 2226 +++ kernel/x86_64/zgemm_kernel_4x2_core2.S | 1744 ++ kernel/x86_64/zgemm_kernel_4x2_penryn.S | 1794 ++ kernel/x86_64/zgemm_kernel_4x2_sse.S | 2293 +++ kernel/x86_64/zgemm_kernel_4x2_sse3.S | 2101 +++ kernel/x86_64/zgemm_ncopy_1.S | 203 + kernel/x86_64/zgemm_ncopy_2.S | 359 + kernel/x86_64/zgemm_tcopy_1.S | 190 + kernel/x86_64/zgemm_tcopy_2.S | 432 + kernel/x86_64/zgemv_n.S | 2701 +++ kernel/x86_64/zgemv_n_atom.S | 1142 ++ kernel/x86_64/zgemv_n_dup.S | 1500 ++ kernel/x86_64/zgemv_t.S | 2433 +++ kernel/x86_64/zgemv_t_atom.S | 968 + kernel/x86_64/zgemv_t_dup.S | 1223 ++ kernel/x86_64/znrm2.S | 208 + kernel/x86_64/znrm2_sse.S | 387 + kernel/x86_64/zrot.S | 367 + kernel/x86_64/zrot_sse.S | 1622 ++ kernel/x86_64/zrot_sse2.S | 1727 ++ kernel/x86_64/zscal.S | 223 + kernel/x86_64/zscal_atom.S | 394 + kernel/x86_64/zscal_sse.S | 1359 ++ kernel/x86_64/zscal_sse2.S | 1724 ++ kernel/x86_64/zswap.S | 452 + kernel/x86_64/zswap_sse.S | 1134 ++ kernel/x86_64/zswap_sse2.S | 999 + 
kernel/x86_64/zsymv_L_sse.S | 814 + kernel/x86_64/zsymv_L_sse2.S | 886 + kernel/x86_64/zsymv_U_sse.S | 594 + kernel/x86_64/zsymv_U_sse2.S | 916 + kernel/x86_64/ztrsm_kernel_LN_2x1_atom.S | 995 + kernel/x86_64/ztrsm_kernel_LN_2x2_core2.S | 2162 +++ kernel/x86_64/ztrsm_kernel_LN_2x2_penryn.S | 2016 ++ kernel/x86_64/ztrsm_kernel_LN_2x2_sse2.S | 2278 +++ kernel/x86_64/ztrsm_kernel_LN_2x2_sse3.S | 2203 +++ kernel/x86_64/ztrsm_kernel_LN_2x4_nehalem.S | 3116 +++ kernel/x86_64/ztrsm_kernel_LN_4x2_sse.S | 4004 ++++ kernel/x86_64/ztrsm_kernel_LT_1x4_nehalem.S | 1586 ++ kernel/x86_64/ztrsm_kernel_LT_2x1_atom.S | 995 + kernel/x86_64/ztrsm_kernel_LT_2x2_core2.S | 2162 +++ kernel/x86_64/ztrsm_kernel_LT_2x2_penryn.S | 2016 ++ kernel/x86_64/ztrsm_kernel_LT_2x2_sse2.S | 2266 +++ kernel/x86_64/ztrsm_kernel_LT_2x2_sse3.S | 2194 +++ kernel/x86_64/ztrsm_kernel_LT_2x4_nehalem.S | 3116 +++ kernel/x86_64/ztrsm_kernel_LT_4x2_sse.S | 4004 ++++ kernel/x86_64/ztrsm_kernel_RT_1x4_nehalem.S | 1586 ++ kernel/x86_64/ztrsm_kernel_RT_2x2_core2.S | 2162 +++ kernel/x86_64/ztrsm_kernel_RT_2x2_penryn.S | 2010 ++ kernel/x86_64/ztrsm_kernel_RT_2x2_sse2.S | 2266 +++ kernel/x86_64/ztrsm_kernel_RT_2x2_sse3.S | 2196 +++ kernel/x86_64/ztrsm_kernel_RT_2x4_nehalem.S | 3116 +++ kernel/x86_64/ztrsm_kernel_RT_4x2_sse.S | 4005 ++++ l1param.h | 84 + l2param.h | 165 + lapack/Makefile | 40 + lapack/getf2/Makefile | 49 + lapack/getf2/getf2_k.c | 117 + lapack/getf2/zgetf2_k.c | 139 + lapack/getrf/Makefile | 98 + lapack/getrf/getrf_parallel.c | 857 + lapack/getrf/getrf_parallel_omp.c | 222 + lapack/getrf/getrf_single.c | 173 + lapack/getri/cgetri.f | 194 + lapack/getri/dgetri.f | 193 + lapack/getri/sgetri.f | 193 + lapack/getri/zgetri.f | 194 + lapack/getrs/Makefile | 236 + lapack/getrs/getrs_parallel.c | 107 + lapack/getrs/getrs_single.c | 68 + lapack/getrs/zgetrs_parallel.c | 113 + lapack/getrs/zgetrs_single.c | 66 + lapack/laswp/Makefile | 22 + lapack/laswp/alpha/Makefile | 8 + lapack/laswp/generic/Makefile | 95 + lapack/laswp/generic/laswp_k.c | 49 + lapack/laswp/generic/laswp_k_1.c | 195 + lapack/laswp/generic/laswp_k_2.c | 324 + lapack/laswp/generic/laswp_k_4.c | 529 + lapack/laswp/generic/laswp_k_8.c | 909 + lapack/laswp/generic/zlaswp_k.c | 47 + lapack/laswp/generic/zlaswp_k_1.c | 225 + lapack/laswp/generic/zlaswp_k_2.c | 406 + lapack/laswp/generic/zlaswp_k_4.c | 742 + lapack/laswp/ia64/Makefile | 5 + lapack/laswp/mips64/Makefile | 8 + lapack/laswp/power/Makefile | 8 + lapack/laswp/sparc/Makefile | 8 + lapack/laswp/x86/Makefile | 28 + lapack/laswp/x86_64/Makefile | 33 + lapack/lauu2/Makefile | 83 + lapack/lauu2/lauu2_L.c | 78 + lapack/lauu2/lauu2_U.c | 78 + lapack/lauu2/zlauu2_L.c | 83 + lapack/lauu2/zlauu2_U.c | 81 + lapack/lauum/Makefile | 164 + lapack/lauum/lauum_L_parallel.c | 123 + lapack/lauum/lauum_L_single.c | 234 + lapack/lauum/lauum_U_parallel.c | 123 + lapack/lauum/lauum_U_single.c | 268 + lapack/potf2/Makefile | 83 + lapack/potf2/potf2_L.c | 97 + lapack/potf2/potf2_U.c | 94 + lapack/potf2/zpotf2_L.c | 101 + lapack/potf2/zpotf2_U.c | 99 + lapack/potrf/Makefile | 164 + lapack/potrf/potrf_L_parallel.c | 130 + lapack/potrf/potrf_L_single.c | 234 + lapack/potrf/potrf_U_parallel.c | 130 + lapack/potrf/potrf_U_single.c | 193 + lapack/potrf/potrf_parallel.c | 634 + lapack/trti2/Makefile | 155 + lapack/trti2/trti2_L.c | 86 + lapack/trti2/trti2_U.c | 87 + lapack/trti2/ztrti2_L.c | 105 + lapack/trti2/ztrti2_U.c | 107 + lapack/trtri/Makefile | 313 + lapack/trtri/trtri_L_parallel.c | 151 + lapack/trtri/trtri_L_single.c | 190 + 
lapack/trtri/trtri_U_parallel.c | 147 + lapack/trtri/trtri_U_single.c | 188 + make.inc | 11 + param.h | 1543 ++ patch.for_lapack-3.1.1 | 684 + quickbuild.32bit | 3 + quickbuild.64bit | 3 + quickbuild.win32 | 3 + quickbuild.win64 | 3 + reference/LICENSE | 23 + reference/Makefile | 176 + reference/caxpycf.f | 35 + reference/caxpyf.f | 34 + reference/ccopyf.f | 33 + reference/cdotcf.f | 38 + reference/cdotuf.f | 37 + reference/cgbmvf.f | 450 + reference/cgemm3mf.f | 414 + reference/cgemmf.f | 414 + reference/cgemvf.f | 332 + reference/cgercf.f | 157 + reference/cgeruf.f | 157 + reference/cgesvf.f | 107 + reference/cgetf2f.f | 136 + reference/cgetrff.f | 156 + reference/cgetrsf.f | 150 + reference/chbmvf.f | 309 + reference/chemm3mf.f | 304 + reference/chemmf.f | 304 + reference/chemvf.f | 349 + reference/cher2f.f | 249 + reference/cher2kf.f | 371 + reference/cherf.f | 212 + reference/cherkf.f | 328 + reference/chpmvf.f | 270 + reference/chpr2f.f | 251 + reference/chprf.f | 217 + reference/claswpf.f | 120 + reference/clauu2f.f | 143 + reference/clauumf.f | 161 + reference/cpotf2f.f | 175 + reference/cpotrff.f | 187 + reference/cpotrif.f | 96 + reference/crotgf.f | 20 + reference/csbmvf.f | 306 + reference/cscalf.f | 28 + reference/cspmvf.f | 264 + reference/cspr2f.f | 229 + reference/csprf.f | 213 + reference/csrotf.f | 38 + reference/csscalf.f | 29 + reference/cswapf.f | 36 + reference/csymm3mf.f | 296 + reference/csymmf.f | 296 + reference/csymvf.f | 264 + reference/csyr2f.f | 230 + reference/csyr2kf.f | 324 + reference/csyrf.f | 198 + reference/csyrkf.f | 293 + reference/ctbmvf.f | 377 + reference/ctbsvf.f | 367 + reference/ctpmvf.f | 376 + reference/ctpsvf.f | 379 + reference/ctrmmf.f | 428 + reference/ctrmvf.f | 358 + reference/ctrsmf.f | 459 + reference/ctrsvf.f | 361 + reference/ctrti2f.f | 146 + reference/ctrtrif.f | 177 + reference/damaxf.f | 36 + reference/daminf.f | 36 + reference/dasumf.f | 43 + reference/daxpyf.f | 48 + reference/dcopyf.f | 50 + reference/ddotf.f | 49 + reference/dgbmvf.f | 300 + reference/dgemmf.f | 313 + reference/dgemvf.f | 256 + reference/dgerf.f | 158 + reference/dgesvf.f | 107 + reference/dgetf2f.f | 135 + reference/dgetrff.f | 156 + reference/dgetrsf.f | 150 + reference/dlaswpf.f | 120 + reference/dlauu2f.f | 135 + reference/dlauumf.f | 155 + reference/dmaxf.f | 36 + reference/dminf.f | 36 + reference/dnrm2f.f | 61 + reference/dpotf2f.f | 168 + reference/dpotrff.f | 184 + reference/dpotrif.f | 96 + reference/drotf.f | 37 + reference/drotgf.f | 27 + reference/drotmf.f | 108 + reference/drotmgf.f | 169 + reference/dsbmvf.f | 303 + reference/dscalf.f | 43 + reference/dsdotf.f | 74 + reference/dspmvf.f | 262 + reference/dspr2f.f | 229 + reference/dsprf.f | 198 + reference/dswapf.f | 56 + reference/dsymmf.f | 294 + reference/dsymvf.f | 262 + reference/dsyr2f.f | 230 + reference/dsyr2kf.f | 327 + reference/dsyrf.f | 197 + reference/dsyrkf.f | 294 + reference/dtbmvf.f | 342 + reference/dtbsvf.f | 336 + reference/dtpmvf.f | 299 + reference/dtpsvf.f | 302 + reference/dtrmmf.f | 355 + reference/dtrmvf.f | 286 + reference/dtrsmf.f | 378 + reference/dtrsvf.f | 289 + reference/dtrti2f.f | 146 + reference/dtrtrif.f | 176 + reference/dzamaxf.f | 40 + reference/dzaminf.f | 38 + reference/dzasumf.f | 34 + reference/dznrm2f.f | 67 + reference/icamaxf.f | 41 + reference/icaminf.f | 41 + reference/idamaxf.f | 39 + reference/idaminf.f | 39 + reference/idmaxf.f | 39 + reference/idminf.f | 39 + reference/iqamaxf.f | 48 + reference/iqaminf.f | 49 + reference/iqmaxf.f | 39 + 
reference/iqminf.f | 39 + reference/isamaxf.f | 39 + reference/isaminf.f | 39 + reference/ismaxf.f | 39 + reference/isminf.f | 39 + reference/ixamaxf.f | 41 + reference/ixaminf.f | 41 + reference/izamaxf.f | 41 + reference/izaminf.f | 41 + reference/lsamef.f | 87 + reference/samaxf.f | 36 + reference/saminf.f | 36 + reference/sasumf.f | 44 + reference/saxpyf.f | 48 + reference/scamaxf.f | 40 + reference/scaminf.f | 38 + reference/scasumf.f | 34 + reference/scnrm2f.f | 67 + reference/scopyf.f | 50 + reference/sdotf.f | 49 + reference/sdsdotf.f | 78 + reference/sgbmvf.f | 300 + reference/sgemmf.f | 313 + reference/sgemvf.f | 257 + reference/sgerf.f | 157 + reference/sgesvf.f | 107 + reference/sgetf2f.f | 135 + reference/sgetrff.f | 156 + reference/sgetrsf.f | 150 + reference/slaswpf.f | 120 + reference/slauu2f.f | 135 + reference/slauumf.f | 156 + reference/smaxf.f | 36 + reference/sminf.f | 36 + reference/snrm2f.f | 60 + reference/spotf2f.f | 168 + reference/spotrff.f | 184 + reference/spotrif.f | 96 + reference/srotf.f | 37 + reference/srotgf.f | 27 + reference/srotmf.f | 106 + reference/srotmgf.f | 166 + reference/ssbmvf.f | 303 + reference/sscalf.f | 43 + reference/sspmvf.f | 262 + reference/sspr2f.f | 229 + reference/ssprf.f | 198 + reference/sswapf.f | 56 + reference/ssymmf.f | 294 + reference/ssymvf.f | 262 + reference/ssyr2f.f | 230 + reference/ssyr2kf.f | 327 + reference/ssyrf.f | 197 + reference/ssyrkf.f | 294 + reference/stbmvf.f | 342 + reference/stbsvf.f | 336 + reference/stpmvf.f | 299 + reference/stpsvf.f | 302 + reference/strmmf.f | 355 + reference/strmvf.f | 286 + reference/strsmf.f | 378 + reference/strsvf.f | 289 + reference/strti2f.f | 146 + reference/strtrif.f | 176 + reference/zaxpycf.f | 36 + reference/zaxpyf.f | 34 + reference/zcopyf.f | 33 + reference/zdotcf.f | 36 + reference/zdotuf.f | 36 + reference/zdrotf.f | 38 + reference/zdscalf.f | 30 + reference/zgbmvf.f | 450 + reference/zgemm3mf.f | 414 + reference/zgemmf.f | 414 + reference/zgemvf.f | 332 + reference/zgercf.f | 157 + reference/zgeruf.f | 157 + reference/zgesvf.f | 107 + reference/zgetf2f.f | 136 + reference/zgetrff.f | 156 + reference/zgetrsf.f | 150 + reference/zhbmvf.f | 406 + reference/zhemm3mf.f | 304 + reference/zhemmf.f | 304 + reference/zhemvf.f | 351 + reference/zher2f.f | 249 + reference/zher2kf.f | 372 + reference/zherf.f | 212 + reference/zherkf.f | 330 + reference/zhpmvf.f | 270 + reference/zhpr2f.f | 251 + reference/zhprf.f | 217 + reference/zlaswpf.f | 120 + reference/zlauu2f.f | 143 + reference/zlauumf.f | 160 + reference/zpotf2f.f | 175 + reference/zpotrff.f | 187 + reference/zpotrif.f | 96 + reference/zrotgf.f | 23 + reference/zsbmvf.f | 306 + reference/zscalf.f | 29 + reference/zspmvf.f | 264 + reference/zspr2f.f | 229 + reference/zsprf.f | 213 + reference/zswapf.f | 36 + reference/zsymm3mf.f | 296 + reference/zsymmf.f | 296 + reference/zsymvf.f | 264 + reference/zsyr2f.f | 230 + reference/zsyr2kf.f | 324 + reference/zsyrf.f | 198 + reference/zsyrkf.f | 293 + reference/ztbmvf.f | 378 + reference/ztbsvf.f | 367 + reference/ztpmvf.f | 377 + reference/ztpsvf.f | 379 + reference/ztrmmf.f | 428 + reference/ztrmvf.f | 358 + reference/ztrsmf.f | 457 + reference/ztrsvf.f | 361 + reference/ztrti2f.f | 146 + reference/ztrtrif.f | 177 + symcopy.h | 1873 ++ test/LICENSE | 23 + test/Makefile | 122 + test/cblat1.f | 681 + test/cblat2.dat | 35 + test/cblat2.f | 3241 ++++ test/cblat3.dat | 23 + test/cblat3.f | 3439 ++++ test/dblat1.f | 769 + test/dblat2.dat | 34 + test/dblat2.f | 3138 ++++ 
test/dblat3.dat | 20 + test/dblat3.f | 2823 +++ test/sblat1.f | 769 + test/sblat2.dat | 34 + test/sblat2.f | 3138 ++++ test/sblat3.dat | 20 + test/sblat3.f | 2823 +++ test/zblat1.f | 681 + test/zblat2.dat | 35 + test/zblat2.f | 3249 ++++ test/zblat3.dat | 23 + test/zblat3.f | 3445 ++++ version.h | 43 + 1685 files changed, 1382682 insertions(+) create mode 100644 00License.txt create mode 100644 01Readme.txt create mode 100644 02QuickInstall.txt create mode 100644 03FAQ.txt create mode 100644 04Windows64bit.txt create mode 100644 05LargePage create mode 100644 06WeirdPerformance create mode 100644 Makefile create mode 100644 Makefile.alpha create mode 100644 Makefile.generic create mode 100644 Makefile.getarch create mode 100644 Makefile.ia64 create mode 100644 Makefile.mips64 create mode 100644 Makefile.power create mode 100644 Makefile.rule create mode 100644 Makefile.sparc create mode 100644 Makefile.system create mode 100644 Makefile.tail create mode 100644 Makefile.x86 create mode 100644 Makefile.x86_64 create mode 100644 benchmark/Makefile create mode 100644 benchmark/cholesky.c create mode 100644 benchmark/cula_wrapper.c create mode 100644 benchmark/linpack.c create mode 100644 c_check create mode 100644 cblas.h create mode 100644 common.h create mode 100644 common_alpha.h create mode 100644 common_c.h create mode 100644 common_d.h create mode 100644 common_ia64.h create mode 100644 common_interface.h create mode 100644 common_lapack.h create mode 100644 common_level1.h create mode 100644 common_level2.h create mode 100644 common_level3.h create mode 100644 common_linux.h create mode 100644 common_macro.h create mode 100644 common_mips64.h create mode 100644 common_param.h create mode 100644 common_power.h create mode 100644 common_q.h create mode 100644 common_reference.h create mode 100644 common_s.h create mode 100644 common_sparc.h create mode 100644 common_thread.h create mode 100644 common_x.h create mode 100644 common_x86.h create mode 100644 common_x86_64.h create mode 100644 common_z.h create mode 100644 cpuid.S create mode 100644 cpuid.h create mode 100644 cpuid_alpha.c create mode 100644 cpuid_ia64.c create mode 100644 cpuid_mips.c create mode 100644 cpuid_power.c create mode 100644 cpuid_sparc.c create mode 100644 cpuid_x86.c create mode 100644 ctest.c create mode 100644 ctest/LICENSE create mode 100644 ctest/Makefile create mode 100644 ctest/auxiliary.c create mode 100644 ctest/c_c2chke.c create mode 100644 ctest/c_c3chke.c create mode 100644 ctest/c_cblas1.c create mode 100644 ctest/c_cblas2.c create mode 100644 ctest/c_cblas3.c create mode 100644 ctest/c_cblat1.f create mode 100644 ctest/c_cblat2.f create mode 100644 ctest/c_cblat3.f create mode 100644 ctest/c_d2chke.c create mode 100644 ctest/c_d3chke.c create mode 100644 ctest/c_dblas1.c create mode 100644 ctest/c_dblas2.c create mode 100644 ctest/c_dblas3.c create mode 100644 ctest/c_dblat1.f create mode 100644 ctest/c_dblat2.f create mode 100644 ctest/c_dblat3.f create mode 100644 ctest/c_s2chke.c create mode 100644 ctest/c_s3chke.c create mode 100644 ctest/c_sblas1.c create mode 100644 ctest/c_sblas2.c create mode 100644 ctest/c_sblas3.c create mode 100644 ctest/c_sblat1.f create mode 100644 ctest/c_sblat2.f create mode 100644 ctest/c_sblat3.f create mode 100644 ctest/c_xerbla.c create mode 100644 ctest/c_z2chke.c create mode 100644 ctest/c_z3chke.c create mode 100644 ctest/c_zblas1.c create mode 100644 ctest/c_zblas2.c create mode 100644 ctest/c_zblas3.c create mode 100644 ctest/c_zblat1.f create mode 100644 
ctest/c_zblat2.f create mode 100644 ctest/c_zblat3.f create mode 100644 ctest/cblas_test.h create mode 100644 ctest/cin2 create mode 100644 ctest/cin3 create mode 100644 ctest/constant.c create mode 100644 ctest/din2 create mode 100644 ctest/din3 create mode 100644 ctest/sin2 create mode 100644 ctest/sin3 create mode 100644 ctest/zin2 create mode 100644 ctest/zin3 create mode 100644 ctest1.c create mode 100644 ctest2.c create mode 100644 driver/level2/Makefile create mode 100644 driver/level2/gbmv_k.c create mode 100644 driver/level2/gbmv_thread.c create mode 100644 driver/level2/gemv_thread.c create mode 100644 driver/level2/ger_thread.c create mode 100644 driver/level2/sbmv_k.c create mode 100644 driver/level2/sbmv_thread.c create mode 100644 driver/level2/spmv_k.c create mode 100644 driver/level2/spmv_thread.c create mode 100644 driver/level2/spr2_k.c create mode 100644 driver/level2/spr2_thread.c create mode 100644 driver/level2/spr_k.c create mode 100644 driver/level2/spr_thread.c create mode 100644 driver/level2/symv_thread.c create mode 100644 driver/level2/syr2_k.c create mode 100644 driver/level2/syr2_thread.c create mode 100644 driver/level2/syr_k.c create mode 100644 driver/level2/syr_thread.c create mode 100644 driver/level2/tbmv_L.c create mode 100644 driver/level2/tbmv_U.c create mode 100644 driver/level2/tbmv_thread.c create mode 100644 driver/level2/tbsv_L.c create mode 100644 driver/level2/tbsv_U.c create mode 100644 driver/level2/tpmv_L.c create mode 100644 driver/level2/tpmv_U.c create mode 100644 driver/level2/tpmv_thread.c create mode 100644 driver/level2/tpsv_L.c create mode 100644 driver/level2/tpsv_U.c create mode 100644 driver/level2/trmv_L.c create mode 100644 driver/level2/trmv_U.c create mode 100644 driver/level2/trmv_thread.c create mode 100644 driver/level2/trsv_L.c create mode 100644 driver/level2/trsv_U.c create mode 100644 driver/level2/zgbmv_k.c create mode 100644 driver/level2/zhbmv_k.c create mode 100644 driver/level2/zher2_k.c create mode 100644 driver/level2/zher_k.c create mode 100644 driver/level2/zhpmv_k.c create mode 100644 driver/level2/zhpr2_k.c create mode 100644 driver/level2/zhpr_k.c create mode 100644 driver/level2/zsbmv_k.c create mode 100644 driver/level2/zspmv_k.c create mode 100644 driver/level2/zspr2_k.c create mode 100644 driver/level2/zspr_k.c create mode 100644 driver/level2/zsyr2_k.c create mode 100644 driver/level2/zsyr_k.c create mode 100644 driver/level2/ztbmv_L.c create mode 100644 driver/level2/ztbmv_U.c create mode 100644 driver/level2/ztbsv_L.c create mode 100644 driver/level2/ztbsv_U.c create mode 100644 driver/level2/ztpmv_L.c create mode 100644 driver/level2/ztpmv_U.c create mode 100644 driver/level2/ztpsv_L.c create mode 100644 driver/level2/ztpsv_U.c create mode 100644 driver/level2/ztrmv_L.c create mode 100644 driver/level2/ztrmv_U.c create mode 100644 driver/level2/ztrsv_L.c create mode 100644 driver/level2/ztrsv_U.c create mode 100644 driver/level3/Makefile create mode 100644 driver/level3/gemm.c create mode 100644 driver/level3/gemm3m.c create mode 100644 driver/level3/gemm3m_level3.c create mode 100644 driver/level3/gemm_thread_m.c create mode 100644 driver/level3/gemm_thread_mn.c create mode 100644 driver/level3/gemm_thread_n.c create mode 100644 driver/level3/gemm_thread_variable.c create mode 100644 driver/level3/hemm3m_k.c create mode 100644 driver/level3/level3.c create mode 100644 driver/level3/level3_gemm3m_thread.c create mode 100644 driver/level3/level3_syr2k.c create mode 100644 driver/level3/level3_syrk.c 
create mode 100644 driver/level3/level3_syrk_threaded.c create mode 100644 driver/level3/level3_thread.c create mode 100644 driver/level3/symm3m_k.c create mode 100644 driver/level3/symm_k.c create mode 100644 driver/level3/syr2k_k.c create mode 100644 driver/level3/syr2k_kernel.c create mode 100644 driver/level3/syrk_k.c create mode 100644 driver/level3/syrk_kernel.c create mode 100644 driver/level3/syrk_thread.c create mode 100644 driver/level3/trmm_L.c create mode 100644 driver/level3/trmm_R.c create mode 100644 driver/level3/trsm_L.c create mode 100644 driver/level3/trsm_R.c create mode 100644 driver/level3/zhemm_k.c create mode 100644 driver/level3/zher2k_k.c create mode 100644 driver/level3/zher2k_kernel.c create mode 100644 driver/level3/zherk_beta.c create mode 100644 driver/level3/zherk_k.c create mode 100644 driver/level3/zherk_kernel.c create mode 100644 driver/level3/zsyrk_beta.c create mode 100644 driver/mapper/Makefile create mode 100644 driver/mapper/device_setup create mode 100644 driver/mapper/mapper.c create mode 100644 driver/others/Makefile create mode 100644 driver/others/abs.c create mode 100644 driver/others/blas_l1_thread.c create mode 100644 driver/others/blas_server.c create mode 100644 driver/others/blas_server_omp.c create mode 100644 driver/others/blas_server_win32.c create mode 100644 driver/others/divtable.c create mode 100644 driver/others/dynamic.c create mode 100644 driver/others/init.c create mode 100644 driver/others/lamc3.c create mode 100644 driver/others/lamch.c create mode 100644 driver/others/lsame.c create mode 100644 driver/others/memory.c create mode 100644 driver/others/memory_qalloc.c create mode 100644 driver/others/parameter.c create mode 100644 driver/others/profile.c create mode 100644 driver/others/xerbla.c create mode 100644 exports/Makefile create mode 100644 exports/dllinit.c create mode 100644 exports/gensymbol create mode 100644 f_check create mode 100644 ftest.f create mode 100644 ftest2.f create mode 100644 getarch.c create mode 100644 getarch_2nd.c create mode 100644 interface/Makefile create mode 100644 interface/asum.c create mode 100644 interface/axpy.c create mode 100644 interface/copy.c create mode 100644 interface/create create mode 100644 interface/dot.c create mode 100644 interface/dsdot.c create mode 100644 interface/gbmv.c create mode 100644 interface/gemm.c create mode 100644 interface/gemv.c create mode 100644 interface/ger.c create mode 100644 interface/gesv.c create mode 100644 interface/getf2.c create mode 100644 interface/getrf.c create mode 100644 interface/getrs.c create mode 100644 interface/imax.c create mode 100644 interface/larf.c create mode 100644 interface/laswp.c create mode 100644 interface/lauu2.c create mode 100644 interface/lauum.c create mode 100644 interface/max.c create mode 100644 interface/nrm2.c create mode 100644 interface/potf2.c create mode 100644 interface/potrf.c create mode 100644 interface/potri.c create mode 100644 interface/rot.c create mode 100644 interface/rotg.c create mode 100644 interface/rotm.c create mode 100644 interface/rotmg.c create mode 100644 interface/sbmv.c create mode 100644 interface/scal.c create mode 100644 interface/sdsdot.c create mode 100644 interface/spmv.c create mode 100644 interface/spr.c create mode 100644 interface/spr2.c create mode 100644 interface/swap.c create mode 100644 interface/symm.c create mode 100644 interface/symv.c create mode 100644 interface/syr.c create mode 100644 interface/syr2.c create mode 100644 interface/syr2k.c create mode 100644 
interface/syrk.c create mode 100644 interface/tbmv.c create mode 100644 interface/tbsv.c create mode 100644 interface/tpmv.c create mode 100644 interface/tpsv.c create mode 100644 interface/trmv.c create mode 100644 interface/trsm.c create mode 100644 interface/trsv.c create mode 100644 interface/trti2.c create mode 100644 interface/trtri.c create mode 100644 interface/zaxpy.c create mode 100644 interface/zdot.c create mode 100644 interface/zgbmv.c create mode 100644 interface/zgemv.c create mode 100644 interface/zger.c create mode 100644 interface/zgetf2.c create mode 100644 interface/zgetrf.c create mode 100644 interface/zgetrs.c create mode 100644 interface/zhbmv.c create mode 100644 interface/zhemv.c create mode 100644 interface/zher.c create mode 100644 interface/zher2.c create mode 100644 interface/zhpmv.c create mode 100644 interface/zhpr.c create mode 100644 interface/zhpr2.c create mode 100644 interface/zlaswp.c create mode 100644 interface/zlauu2.c create mode 100644 interface/zlauum.c create mode 100644 interface/zpotf2.c create mode 100644 interface/zpotrf.c create mode 100644 interface/zpotri.c create mode 100644 interface/zrot.c create mode 100644 interface/zrotg.c create mode 100644 interface/zsbmv.c create mode 100644 interface/zscal.c create mode 100644 interface/zspmv.c create mode 100644 interface/zspr.c create mode 100644 interface/zspr2.c create mode 100644 interface/zswap.c create mode 100644 interface/zsymv.c create mode 100644 interface/zsyr.c create mode 100644 interface/zsyr2.c create mode 100644 interface/ztbmv.c create mode 100644 interface/ztbsv.c create mode 100644 interface/ztpmv.c create mode 100644 interface/ztpsv.c create mode 100644 interface/ztrmv.c create mode 100644 interface/ztrsv.c create mode 100644 interface/ztrti2.c create mode 100644 interface/ztrtri.c create mode 100644 kernel/Makefile create mode 100644 kernel/Makefile.L1 create mode 100644 kernel/Makefile.L2 create mode 100644 kernel/Makefile.L3 create mode 100644 kernel/Makefile.LA create mode 100644 kernel/alpha/KERNEL create mode 100644 kernel/alpha/Makefile create mode 100644 kernel/alpha/amax.S create mode 100644 kernel/alpha/asum.S create mode 100644 kernel/alpha/axpy.S create mode 100644 kernel/alpha/cabs.S create mode 100644 kernel/alpha/cnrm2.S create mode 100644 kernel/alpha/copy.S create mode 100644 kernel/alpha/cscal.S create mode 100644 kernel/alpha/dnrm2.S create mode 100644 kernel/alpha/dot.S create mode 100644 kernel/alpha/gemm_beta.S create mode 100644 kernel/alpha/gemm_kernel_4x4.S create mode 100644 kernel/alpha/gemv_n.S create mode 100644 kernel/alpha/gemv_t.S create mode 100644 kernel/alpha/iamax.S create mode 100644 kernel/alpha/imax.S create mode 100644 kernel/alpha/izamax.S create mode 100644 kernel/alpha/lsame.S create mode 100644 kernel/alpha/max.S create mode 100644 kernel/alpha/rot.S create mode 100644 kernel/alpha/scal.S create mode 100644 kernel/alpha/snrm2.S create mode 100644 kernel/alpha/staticbuffer.S create mode 100644 kernel/alpha/swap.S create mode 100644 kernel/alpha/trsm_kernel_4x4_LN.S create mode 100644 kernel/alpha/trsm_kernel_4x4_LT.S create mode 100644 kernel/alpha/trsm_kernel_4x4_RT.S create mode 100644 kernel/alpha/zamax.S create mode 100644 kernel/alpha/zasum.S create mode 100644 kernel/alpha/zaxpy.S create mode 100644 kernel/alpha/zdot.S create mode 100644 kernel/alpha/zgemm_beta.S create mode 100644 kernel/alpha/zgemm_kernel_2x2.S create mode 100644 kernel/alpha/zgemv_n.S create mode 100644 kernel/alpha/zgemv_t.S create mode 100644 
kernel/alpha/znrm2.S create mode 100644 kernel/alpha/zrot.S create mode 100644 kernel/alpha/zscal.S create mode 100644 kernel/alpha/zswap.S create mode 100644 kernel/alpha/ztrsm_kernel_2x2_LN.S create mode 100644 kernel/alpha/ztrsm_kernel_2x2_LT.S create mode 100644 kernel/alpha/ztrsm_kernel_2x2_RT.S create mode 100644 kernel/generic/cabs.c create mode 100644 kernel/generic/gemm_beta.c create mode 100644 kernel/generic/gemm_ncopy_1.c create mode 100644 kernel/generic/gemm_ncopy_16.c create mode 100644 kernel/generic/gemm_ncopy_2.c create mode 100644 kernel/generic/gemm_ncopy_4.c create mode 100644 kernel/generic/gemm_ncopy_8.c create mode 100644 kernel/generic/gemm_tcopy_1.c create mode 100644 kernel/generic/gemm_tcopy_16.c create mode 100644 kernel/generic/gemm_tcopy_2.c create mode 100644 kernel/generic/gemm_tcopy_4.c create mode 100644 kernel/generic/gemm_tcopy_8.c create mode 100644 kernel/generic/ger.c create mode 100644 kernel/generic/laswp_ncopy_1.c create mode 100644 kernel/generic/laswp_ncopy_2.c create mode 100644 kernel/generic/laswp_ncopy_4.c create mode 100644 kernel/generic/laswp_ncopy_8.c create mode 100644 kernel/generic/lsame.c create mode 100644 kernel/generic/neg_tcopy_1.c create mode 100644 kernel/generic/neg_tcopy_16.c create mode 100644 kernel/generic/neg_tcopy_2.c create mode 100644 kernel/generic/neg_tcopy_4.c create mode 100644 kernel/generic/neg_tcopy_8.c create mode 100644 kernel/generic/symm_lcopy_1.c create mode 100644 kernel/generic/symm_lcopy_16.c create mode 100644 kernel/generic/symm_lcopy_2.c create mode 100644 kernel/generic/symm_lcopy_4.c create mode 100644 kernel/generic/symm_lcopy_8.c create mode 100644 kernel/generic/symm_ucopy_1.c create mode 100644 kernel/generic/symm_ucopy_16.c create mode 100644 kernel/generic/symm_ucopy_2.c create mode 100644 kernel/generic/symm_ucopy_4.c create mode 100644 kernel/generic/symm_ucopy_8.c create mode 100644 kernel/generic/symv_k.c create mode 100644 kernel/generic/trmm_lncopy_1.c create mode 100644 kernel/generic/trmm_lncopy_16.c create mode 100644 kernel/generic/trmm_lncopy_2.c create mode 100644 kernel/generic/trmm_lncopy_4.c create mode 100644 kernel/generic/trmm_lncopy_8.c create mode 100644 kernel/generic/trmm_ltcopy_1.c create mode 100644 kernel/generic/trmm_ltcopy_16.c create mode 100644 kernel/generic/trmm_ltcopy_2.c create mode 100644 kernel/generic/trmm_ltcopy_4.c create mode 100644 kernel/generic/trmm_ltcopy_8.c create mode 100644 kernel/generic/trmm_uncopy_1.c create mode 100644 kernel/generic/trmm_uncopy_16.c create mode 100644 kernel/generic/trmm_uncopy_2.c create mode 100644 kernel/generic/trmm_uncopy_4.c create mode 100644 kernel/generic/trmm_uncopy_8.c create mode 100644 kernel/generic/trmm_utcopy_1.c create mode 100644 kernel/generic/trmm_utcopy_16.c create mode 100644 kernel/generic/trmm_utcopy_2.c create mode 100644 kernel/generic/trmm_utcopy_4.c create mode 100644 kernel/generic/trmm_utcopy_8.c create mode 100644 kernel/generic/trsm_kernel_LN.c create mode 100644 kernel/generic/trsm_kernel_LT.c create mode 100644 kernel/generic/trsm_kernel_RN.c create mode 100644 kernel/generic/trsm_kernel_RT.c create mode 100644 kernel/generic/trsm_lncopy_1.c create mode 100644 kernel/generic/trsm_lncopy_16.c create mode 100644 kernel/generic/trsm_lncopy_2.c create mode 100644 kernel/generic/trsm_lncopy_4.c create mode 100644 kernel/generic/trsm_lncopy_8.c create mode 100644 kernel/generic/trsm_ltcopy_1.c create mode 100644 kernel/generic/trsm_ltcopy_16.c create mode 100644 kernel/generic/trsm_ltcopy_2.c 
create mode 100644 kernel/generic/trsm_ltcopy_4.c create mode 100644 kernel/generic/trsm_ltcopy_8.c create mode 100644 kernel/generic/trsm_uncopy_1.c create mode 100644 kernel/generic/trsm_uncopy_16.c create mode 100644 kernel/generic/trsm_uncopy_2.c create mode 100644 kernel/generic/trsm_uncopy_4.c create mode 100644 kernel/generic/trsm_uncopy_8.c create mode 100644 kernel/generic/trsm_utcopy_1.c create mode 100644 kernel/generic/trsm_utcopy_16.c create mode 100644 kernel/generic/trsm_utcopy_2.c create mode 100644 kernel/generic/trsm_utcopy_4.c create mode 100644 kernel/generic/trsm_utcopy_8.c create mode 100644 kernel/generic/zgemm3m_ncopy_1.c create mode 100644 kernel/generic/zgemm3m_ncopy_2.c create mode 100644 kernel/generic/zgemm3m_ncopy_4.c create mode 100644 kernel/generic/zgemm3m_ncopy_8.c create mode 100644 kernel/generic/zgemm3m_tcopy_1.c create mode 100644 kernel/generic/zgemm3m_tcopy_2.c create mode 100644 kernel/generic/zgemm3m_tcopy_4.c create mode 100644 kernel/generic/zgemm3m_tcopy_8.c create mode 100644 kernel/generic/zgemm_beta.c create mode 100644 kernel/generic/zgemm_ncopy_1.c create mode 100644 kernel/generic/zgemm_ncopy_2.c create mode 100644 kernel/generic/zgemm_ncopy_4.c create mode 100644 kernel/generic/zgemm_ncopy_8.c create mode 100644 kernel/generic/zgemm_tcopy_1.c create mode 100644 kernel/generic/zgemm_tcopy_2.c create mode 100644 kernel/generic/zgemm_tcopy_4.c create mode 100644 kernel/generic/zgemm_tcopy_8.c create mode 100644 kernel/generic/zger.c create mode 100644 kernel/generic/zhemm3m_lcopy_1.c create mode 100644 kernel/generic/zhemm3m_lcopy_2.c create mode 100644 kernel/generic/zhemm3m_lcopy_4.c create mode 100644 kernel/generic/zhemm3m_lcopy_8.c create mode 100644 kernel/generic/zhemm3m_ucopy_1.c create mode 100644 kernel/generic/zhemm3m_ucopy_2.c create mode 100644 kernel/generic/zhemm3m_ucopy_4.c create mode 100644 kernel/generic/zhemm3m_ucopy_8.c create mode 100644 kernel/generic/zhemm_ltcopy_1.c create mode 100644 kernel/generic/zhemm_ltcopy_2.c create mode 100644 kernel/generic/zhemm_ltcopy_4.c create mode 100644 kernel/generic/zhemm_ltcopy_8.c create mode 100644 kernel/generic/zhemm_utcopy_1.c create mode 100644 kernel/generic/zhemm_utcopy_2.c create mode 100644 kernel/generic/zhemm_utcopy_4.c create mode 100644 kernel/generic/zhemm_utcopy_8.c create mode 100644 kernel/generic/zhemv_k.c create mode 100644 kernel/generic/zlaswp_ncopy_1.c create mode 100644 kernel/generic/zlaswp_ncopy_2.c create mode 100644 kernel/generic/zlaswp_ncopy_4.c create mode 100644 kernel/generic/zneg_tcopy_1.c create mode 100644 kernel/generic/zneg_tcopy_2.c create mode 100644 kernel/generic/zneg_tcopy_4.c create mode 100644 kernel/generic/zneg_tcopy_8.c create mode 100644 kernel/generic/zsymm3m_lcopy_1.c create mode 100644 kernel/generic/zsymm3m_lcopy_2.c create mode 100644 kernel/generic/zsymm3m_lcopy_4.c create mode 100644 kernel/generic/zsymm3m_lcopy_8.c create mode 100644 kernel/generic/zsymm3m_ucopy_1.c create mode 100644 kernel/generic/zsymm3m_ucopy_2.c create mode 100644 kernel/generic/zsymm3m_ucopy_4.c create mode 100644 kernel/generic/zsymm3m_ucopy_8.c create mode 100644 kernel/generic/zsymm_lcopy_1.c create mode 100644 kernel/generic/zsymm_lcopy_2.c create mode 100644 kernel/generic/zsymm_lcopy_4.c create mode 100644 kernel/generic/zsymm_lcopy_8.c create mode 100644 kernel/generic/zsymm_ucopy_1.c create mode 100644 kernel/generic/zsymm_ucopy_2.c create mode 100644 kernel/generic/zsymm_ucopy_4.c create mode 100644 kernel/generic/zsymm_ucopy_8.c create mode 
100644 kernel/generic/zsymv_k.c create mode 100644 kernel/generic/ztrmm_lncopy_1.c create mode 100644 kernel/generic/ztrmm_lncopy_2.c create mode 100644 kernel/generic/ztrmm_lncopy_4.c create mode 100644 kernel/generic/ztrmm_lncopy_8.c create mode 100644 kernel/generic/ztrmm_ltcopy_1.c create mode 100644 kernel/generic/ztrmm_ltcopy_2.c create mode 100644 kernel/generic/ztrmm_ltcopy_4.c create mode 100644 kernel/generic/ztrmm_ltcopy_8.c create mode 100644 kernel/generic/ztrmm_uncopy_1.c create mode 100644 kernel/generic/ztrmm_uncopy_2.c create mode 100644 kernel/generic/ztrmm_uncopy_4.c create mode 100644 kernel/generic/ztrmm_uncopy_8.c create mode 100644 kernel/generic/ztrmm_utcopy_1.c create mode 100644 kernel/generic/ztrmm_utcopy_2.c create mode 100644 kernel/generic/ztrmm_utcopy_4.c create mode 100644 kernel/generic/ztrmm_utcopy_8.c create mode 100644 kernel/generic/ztrsm_lncopy_1.c create mode 100644 kernel/generic/ztrsm_lncopy_2.c create mode 100644 kernel/generic/ztrsm_lncopy_4.c create mode 100644 kernel/generic/ztrsm_lncopy_8.c create mode 100644 kernel/generic/ztrsm_ltcopy_1.c create mode 100644 kernel/generic/ztrsm_ltcopy_2.c create mode 100644 kernel/generic/ztrsm_ltcopy_4.c create mode 100644 kernel/generic/ztrsm_ltcopy_8.c create mode 100644 kernel/generic/ztrsm_uncopy_1.c create mode 100644 kernel/generic/ztrsm_uncopy_2.c create mode 100644 kernel/generic/ztrsm_uncopy_4.c create mode 100644 kernel/generic/ztrsm_uncopy_8.c create mode 100644 kernel/generic/ztrsm_utcopy_1.c create mode 100644 kernel/generic/ztrsm_utcopy_2.c create mode 100644 kernel/generic/ztrsm_utcopy_4.c create mode 100644 kernel/generic/ztrsm_utcopy_8.c create mode 100644 kernel/ia64/KERNEL create mode 100644 kernel/ia64/Makefile create mode 100644 kernel/ia64/amax.S create mode 100644 kernel/ia64/asum.S create mode 100644 kernel/ia64/cabs.S create mode 100644 kernel/ia64/caxpy.S create mode 100644 kernel/ia64/copy.S create mode 100644 kernel/ia64/daxpy.S create mode 100644 kernel/ia64/ddot.S create mode 100644 kernel/ia64/gemm_beta.S create mode 100644 kernel/ia64/gemm_kernel.S create mode 100644 kernel/ia64/gemm_ncopy.S create mode 100644 kernel/ia64/gemm_tcopy.S create mode 100644 kernel/ia64/gemv_n.S create mode 100644 kernel/ia64/gemv_t.S create mode 100644 kernel/ia64/iamax.S create mode 100644 kernel/ia64/izamax.S create mode 100644 kernel/ia64/lsame.S create mode 100644 kernel/ia64/nrm2.S create mode 100644 kernel/ia64/qaxpy.S create mode 100644 kernel/ia64/qcopy.S create mode 100644 kernel/ia64/qdot.S create mode 100644 kernel/ia64/qgemm_kernel.S create mode 100644 kernel/ia64/qgemv_n.S create mode 100644 kernel/ia64/qgemv_t.S create mode 100644 kernel/ia64/qscal.S create mode 100644 kernel/ia64/rot.S create mode 100644 kernel/ia64/saxpy.S create mode 100644 kernel/ia64/scal.S create mode 100644 kernel/ia64/sdot.S create mode 100644 kernel/ia64/sgemv_n.S create mode 100644 kernel/ia64/staticbuffer.S create mode 100644 kernel/ia64/swap.S create mode 100644 kernel/ia64/symv_U.S create mode 100644 kernel/ia64/trsm_kernel_LN.S create mode 100644 kernel/ia64/trsm_kernel_LT.S create mode 100644 kernel/ia64/trsm_kernel_RT.S create mode 100644 kernel/ia64/xcopy.S create mode 100644 kernel/ia64/xdot.S create mode 100644 kernel/ia64/zaxpy.S create mode 100644 kernel/ia64/zcopy.S create mode 100644 kernel/ia64/zdot.S create mode 100644 kernel/ia64/zgemm3m_kernel.S create mode 100644 kernel/ia64/zgemm_beta.S create mode 100644 kernel/ia64/zgemm_kernel.S create mode 100644 kernel/ia64/zgemm_ncopy.S create mode 
100644 kernel/ia64/zgemm_tcopy.S create mode 100644 kernel/ia64/zgemv_n.S create mode 100644 kernel/ia64/zgemv_t.S create mode 100644 kernel/ia64/zrot.S create mode 100644 kernel/ia64/zscal.S create mode 100644 kernel/ia64/zswap.S create mode 100644 kernel/ia64/ztrsm_kernel_LN.S create mode 100644 kernel/ia64/ztrsm_kernel_LT.S create mode 100644 kernel/ia64/ztrsm_kernel_RT.S create mode 100644 kernel/mips64/KERNEL create mode 100644 kernel/mips64/Makefile create mode 100644 kernel/mips64/amax.S create mode 100644 kernel/mips64/amin.S create mode 100644 kernel/mips64/asum.S create mode 100644 kernel/mips64/axpy.S create mode 100644 kernel/mips64/cnrm2.S create mode 100644 kernel/mips64/copy.S create mode 100644 kernel/mips64/dnrm2.S create mode 100644 kernel/mips64/dot.S create mode 100644 kernel/mips64/gemm_beta.S create mode 100644 kernel/mips64/gemm_kernel.S create mode 100644 kernel/mips64/gemv_n.S create mode 100644 kernel/mips64/gemv_t.S create mode 100644 kernel/mips64/iamax.S create mode 100644 kernel/mips64/iamin.S create mode 100644 kernel/mips64/imax.S create mode 100644 kernel/mips64/imin.S create mode 100644 kernel/mips64/izamax.S create mode 100644 kernel/mips64/izamin.S create mode 100644 kernel/mips64/max.S create mode 100644 kernel/mips64/min.S create mode 100644 kernel/mips64/rot.S create mode 100644 kernel/mips64/scal.S create mode 100644 kernel/mips64/snrm2.S create mode 100644 kernel/mips64/swap.S create mode 100644 kernel/mips64/symv_L.S create mode 100644 kernel/mips64/symv_U.S create mode 100644 kernel/mips64/trsm_kernel_LN.S create mode 100644 kernel/mips64/trsm_kernel_LT.S create mode 100644 kernel/mips64/trsm_kernel_RT.S create mode 100644 kernel/mips64/zamax.S create mode 100644 kernel/mips64/zamin.S create mode 100644 kernel/mips64/zasum.S create mode 100644 kernel/mips64/zaxpy.S create mode 100644 kernel/mips64/zcopy.S create mode 100644 kernel/mips64/zdot.S create mode 100644 kernel/mips64/zgemm3m_kernel.S create mode 100644 kernel/mips64/zgemm_kernel.S create mode 100644 kernel/mips64/zgemv_n.S create mode 100644 kernel/mips64/zgemv_t.S create mode 100644 kernel/mips64/znrm2.S create mode 100644 kernel/mips64/zrot.S create mode 100644 kernel/mips64/zscal.S create mode 100644 kernel/mips64/zswap.S create mode 100644 kernel/mips64/zsymv_L.S create mode 100644 kernel/mips64/zsymv_U.S create mode 100644 kernel/mips64/ztrsm_kernel_LT.S create mode 100644 kernel/mips64/ztrsm_kernel_RT.S create mode 100644 kernel/power/KERNEL create mode 100644 kernel/power/KERNEL.CELL create mode 100644 kernel/power/KERNEL.POWER3 create mode 100644 kernel/power/KERNEL.POWER4 create mode 100644 kernel/power/KERNEL.POWER5 create mode 100644 kernel/power/KERNEL.POWER6 create mode 100644 kernel/power/KERNEL.PPC440 create mode 100644 kernel/power/KERNEL.PPC440FP2 create mode 100644 kernel/power/KERNEL.PPC970 create mode 100644 kernel/power/KERNEL.PPCG4 create mode 100644 kernel/power/Makefile create mode 100644 kernel/power/amax.S create mode 100644 kernel/power/amax_cell.S create mode 100644 kernel/power/amax_hummer.S create mode 100644 kernel/power/amax_ppc440.S create mode 100644 kernel/power/amin.S create mode 100644 kernel/power/amin_cell.S create mode 100644 kernel/power/amin_hummer.S create mode 100644 kernel/power/amin_ppc440.S create mode 100644 kernel/power/asum.S create mode 100644 kernel/power/asum_cell.S create mode 100644 kernel/power/asum_hummer.S create mode 100644 kernel/power/asum_ppc440.S create mode 100644 kernel/power/axpy.S create mode 100644 
kernel/power/axpy_hummer.S create mode 100644 kernel/power/axpy_ppc440.S create mode 100644 kernel/power/cabs.S create mode 100644 kernel/power/cnrm2.S create mode 100644 kernel/power/cnrm2_hummer.S create mode 100644 kernel/power/cnrm2_ppc440.S create mode 100644 kernel/power/copy.S create mode 100644 kernel/power/copy_hummer.S create mode 100644 kernel/power/dnrm2_hummer.S create mode 100644 kernel/power/dnrm2_ppc440.S create mode 100644 kernel/power/dot.S create mode 100644 kernel/power/dot_cell.S create mode 100644 kernel/power/dot_hummer.S create mode 100644 kernel/power/dot_ppc440.S create mode 100644 kernel/power/exfunc.S create mode 100644 kernel/power/gemm_beta.S create mode 100644 kernel/power/gemm_kernel.S create mode 100644 kernel/power/gemm_kernel_altivec.S create mode 100644 kernel/power/gemm_kernel_altivec_cell.S create mode 100644 kernel/power/gemm_kernel_altivec_g4.S create mode 100644 kernel/power/gemm_kernel_cell.S create mode 100644 kernel/power/gemm_kernel_g4.S create mode 100644 kernel/power/gemm_kernel_hummer.S create mode 100644 kernel/power/gemm_kernel_power3.S create mode 100644 kernel/power/gemm_kernel_power6.S create mode 100644 kernel/power/gemm_kernel_ppc440.S create mode 100644 kernel/power/gemm_ncopy_4.S create mode 100644 kernel/power/gemm_ncopy_hummer_4.S create mode 100644 kernel/power/gemm_ncopy_hummer_8.S create mode 100644 kernel/power/gemm_tcopy_4.S create mode 100644 kernel/power/gemm_tcopy_hummer_4.S create mode 100644 kernel/power/gemm_tcopy_hummer_8.S create mode 100644 kernel/power/gemv_hummer_n.S create mode 100644 kernel/power/gemv_n.S create mode 100644 kernel/power/gemv_n_ppc440.S create mode 100644 kernel/power/gemv_t.S create mode 100644 kernel/power/gemv_t_ppc440.S create mode 100644 kernel/power/ger.S create mode 100644 kernel/power/iamax.S create mode 100644 kernel/power/iamax_hummer.S create mode 100644 kernel/power/iamax_ppc440.S create mode 100644 kernel/power/iamin.S create mode 100644 kernel/power/iamin_hummer.S create mode 100644 kernel/power/iamin_ppc440.S create mode 100644 kernel/power/imax.S create mode 100644 kernel/power/imax_hummer.S create mode 100644 kernel/power/imax_ppc440.S create mode 100644 kernel/power/imin.S create mode 100644 kernel/power/imin_hummer.S create mode 100644 kernel/power/imin_ppc440.S create mode 100644 kernel/power/izamax.S create mode 100644 kernel/power/izamax_hummer.S create mode 100644 kernel/power/izamax_ppc440.S create mode 100644 kernel/power/izamin.S create mode 100644 kernel/power/izamin_hummer.S create mode 100644 kernel/power/izamin_ppc440.S create mode 100644 kernel/power/lock.c create mode 100644 kernel/power/lsame.S create mode 100644 kernel/power/max.S create mode 100644 kernel/power/max_hummer.S create mode 100644 kernel/power/max_ppc440.S create mode 100644 kernel/power/min.S create mode 100644 kernel/power/min_hummer.S create mode 100644 kernel/power/min_ppc440.S create mode 100644 kernel/power/nrm2.S create mode 100644 kernel/power/rot.S create mode 100644 kernel/power/rot_ppc440.S create mode 100644 kernel/power/scal.S create mode 100644 kernel/power/scal_hummer.S create mode 100644 kernel/power/scal_ppc440.S create mode 100644 kernel/power/snrm2.S create mode 100644 kernel/power/snrm2_hummer.S create mode 100644 kernel/power/snrm2_ppc440.S create mode 100644 kernel/power/staticbuffer.S create mode 100644 kernel/power/swap.S create mode 100644 kernel/power/swap_hummer.S create mode 100644 kernel/power/symv_L.S create mode 100644 kernel/power/symv_U.S create mode 100644 
kernel/power/trsm_kernel_LN.S create mode 100644 kernel/power/trsm_kernel_LT.S create mode 100644 kernel/power/trsm_kernel_RT.S create mode 100644 kernel/power/trsm_kernel_cell_LN.S create mode 100644 kernel/power/trsm_kernel_cell_LT.S create mode 100644 kernel/power/trsm_kernel_cell_RT.S create mode 100644 kernel/power/trsm_kernel_hummer_LN.S create mode 100644 kernel/power/trsm_kernel_hummer_LT.S create mode 100644 kernel/power/trsm_kernel_hummer_RT.S create mode 100644 kernel/power/trsm_kernel_power6_LN.S create mode 100644 kernel/power/trsm_kernel_power6_LT.S create mode 100644 kernel/power/trsm_kernel_power6_RT.S create mode 100644 kernel/power/trsm_kernel_ppc440_LN.S create mode 100644 kernel/power/trsm_kernel_ppc440_LT.S create mode 100644 kernel/power/trsm_kernel_ppc440_RT.S create mode 100644 kernel/power/zamax.S create mode 100644 kernel/power/zamax_cell.S create mode 100644 kernel/power/zamax_hummer.S create mode 100644 kernel/power/zamax_ppc440.S create mode 100644 kernel/power/zamin.S create mode 100644 kernel/power/zamin_cell.S create mode 100644 kernel/power/zamin_hummer.S create mode 100644 kernel/power/zamin_ppc440.S create mode 100644 kernel/power/zasum.S create mode 100644 kernel/power/zasum_cell.S create mode 100644 kernel/power/zasum_hummer.S create mode 100644 kernel/power/zasum_ppc440.S create mode 100644 kernel/power/zaxpy.S create mode 100644 kernel/power/zaxpy_hummer.S create mode 100644 kernel/power/zaxpy_ppc440.S create mode 100644 kernel/power/zcopy.S create mode 100644 kernel/power/zcopy_hummer.S create mode 100644 kernel/power/zdot.S create mode 100644 kernel/power/zdot_cell.S create mode 100644 kernel/power/zdot_hummer.S create mode 100644 kernel/power/zdot_ppc440.S create mode 100644 kernel/power/zgemm_beta.S create mode 100644 kernel/power/zgemm_kernel.S create mode 100644 kernel/power/zgemm_kernel_altivec.S create mode 100644 kernel/power/zgemm_kernel_altivec_cell.S create mode 100644 kernel/power/zgemm_kernel_altivec_g4.S create mode 100644 kernel/power/zgemm_kernel_cell.S create mode 100644 kernel/power/zgemm_kernel_g4.S create mode 100644 kernel/power/zgemm_kernel_hummer.S create mode 100644 kernel/power/zgemm_kernel_power3.S create mode 100644 kernel/power/zgemm_kernel_power6.S create mode 100644 kernel/power/zgemm_kernel_ppc440.S create mode 100644 kernel/power/zgemm_ncopy_hummer_2.S create mode 100644 kernel/power/zgemm_ncopy_hummer_4.S create mode 100644 kernel/power/zgemm_tcopy_hummer_2.S create mode 100644 kernel/power/zgemm_tcopy_hummer_4.S create mode 100644 kernel/power/zgemv_n.S create mode 100644 kernel/power/zgemv_n_ppc440.S create mode 100644 kernel/power/zgemv_t.S create mode 100644 kernel/power/zgemv_t_ppc440.S create mode 100644 kernel/power/zger.S create mode 100644 kernel/power/znrm2.S create mode 100644 kernel/power/znrm2_hummer.S create mode 100644 kernel/power/znrm2_ppc440.S create mode 100644 kernel/power/zrot.S create mode 100644 kernel/power/zrot_ppc440.S create mode 100644 kernel/power/zscal.S create mode 100644 kernel/power/zscal_hummer.S create mode 100644 kernel/power/zscal_ppc440.S create mode 100644 kernel/power/zswap.S create mode 100644 kernel/power/zswap_hummer.S create mode 100644 kernel/power/zsymv_L.S create mode 100644 kernel/power/zsymv_U.S create mode 100644 kernel/power/ztrsm_kernel_LN.S create mode 100644 kernel/power/ztrsm_kernel_LT.S create mode 100644 kernel/power/ztrsm_kernel_RT.S create mode 100644 kernel/power/ztrsm_kernel_cell_LN.S create mode 100644 kernel/power/ztrsm_kernel_cell_LT.S create mode 100644 
kernel/power/ztrsm_kernel_cell_RT.S create mode 100644 kernel/power/ztrsm_kernel_hummer_LN.S create mode 100644 kernel/power/ztrsm_kernel_hummer_LT.S create mode 100644 kernel/power/ztrsm_kernel_hummer_RT.S create mode 100644 kernel/power/ztrsm_kernel_power6_LN.S create mode 100644 kernel/power/ztrsm_kernel_power6_LT.S create mode 100644 kernel/power/ztrsm_kernel_power6_RT.S create mode 100644 kernel/power/ztrsm_kernel_ppc440_LN.S create mode 100644 kernel/power/ztrsm_kernel_ppc440_LT.S create mode 100644 kernel/power/ztrsm_kernel_ppc440_RT.S create mode 100644 kernel/setparam-ref.c create mode 100644 kernel/sparc/KERNEL create mode 100644 kernel/sparc/KERNEL.sparc create mode 100644 kernel/sparc/KERNEL.sparcv7 create mode 100644 kernel/sparc/Makefile create mode 100644 kernel/sparc/amax.S create mode 100644 kernel/sparc/asum.S create mode 100644 kernel/sparc/axpy.S create mode 100644 kernel/sparc/cabs.S create mode 100644 kernel/sparc/cnrm2.S create mode 100644 kernel/sparc/copy.S create mode 100644 kernel/sparc/dnrm2.S create mode 100644 kernel/sparc/dot.S create mode 100644 kernel/sparc/gemm_kernel.S create mode 100644 kernel/sparc/gemm_kernel_2x8.S create mode 100644 kernel/sparc/gemm_ncopy.S create mode 100644 kernel/sparc/gemm_ncopy_2.S create mode 100644 kernel/sparc/gemm_ncopy_8.S create mode 100644 kernel/sparc/gemm_tcopy.S create mode 100644 kernel/sparc/gemm_tcopy_2.S create mode 100644 kernel/sparc/gemv_n.S create mode 100644 kernel/sparc/gemv_t.S create mode 100644 kernel/sparc/ger.S create mode 100644 kernel/sparc/iamax.S create mode 100644 kernel/sparc/imax.S create mode 100644 kernel/sparc/izamax.S create mode 100644 kernel/sparc/lsame.S create mode 100644 kernel/sparc/max.S create mode 100644 kernel/sparc/rot.S create mode 100644 kernel/sparc/scal.S create mode 100644 kernel/sparc/snrm2.S create mode 100644 kernel/sparc/staticbuffer.S create mode 100644 kernel/sparc/swap.S create mode 100644 kernel/sparc/trsm_kernel_LN.S create mode 100644 kernel/sparc/trsm_kernel_LN_2x8.S create mode 100644 kernel/sparc/trsm_kernel_LT.S create mode 100644 kernel/sparc/trsm_kernel_LT_2x8.S create mode 100644 kernel/sparc/trsm_kernel_RT.S create mode 100644 kernel/sparc/trsm_kernel_RT_2x8.S create mode 100644 kernel/sparc/zamax.S create mode 100644 kernel/sparc/zasum.S create mode 100644 kernel/sparc/zaxpy.S create mode 100644 kernel/sparc/zcopy.S create mode 100644 kernel/sparc/zdot.S create mode 100644 kernel/sparc/zgemm_kernel.S create mode 100644 kernel/sparc/zgemm_kernel_1x4.S create mode 100644 kernel/sparc/zgemm_ncopy.S create mode 100644 kernel/sparc/zgemm_tcopy.S create mode 100644 kernel/sparc/zgemv_n.S create mode 100644 kernel/sparc/zgemv_t.S create mode 100644 kernel/sparc/znrm2.S create mode 100644 kernel/sparc/zrot.S create mode 100644 kernel/sparc/zscal.S create mode 100644 kernel/sparc/zswap.S create mode 100644 kernel/sparc/ztrsm_kernel_LN.S create mode 100644 kernel/sparc/ztrsm_kernel_LT.S create mode 100644 kernel/sparc/ztrsm_kernel_LT_1x4.S create mode 100644 kernel/sparc/ztrsm_kernel_RT.S create mode 100644 kernel/sparc/ztrsm_kernel_RT_1x4.S create mode 100644 kernel/x86/KERNEL create mode 100644 kernel/x86/KERNEL.ATHLON create mode 100644 kernel/x86/KERNEL.ATOM create mode 100644 kernel/x86/KERNEL.BANIAS create mode 100644 kernel/x86/KERNEL.BARCELONA create mode 100644 kernel/x86/KERNEL.COPPERMINE create mode 100644 kernel/x86/KERNEL.CORE2 create mode 100644 kernel/x86/KERNEL.DUNNINGTON create mode 100644 kernel/x86/KERNEL.KATMAI create mode 100644 
kernel/x86/KERNEL.NANO create mode 100644 kernel/x86/KERNEL.NEHALEM create mode 100644 kernel/x86/KERNEL.NORTHWOOD create mode 100644 kernel/x86/KERNEL.OPTERON create mode 100644 kernel/x86/KERNEL.OPTERON_SSE3 create mode 100644 kernel/x86/KERNEL.P5 create mode 100644 kernel/x86/KERNEL.P6 create mode 100644 kernel/x86/KERNEL.PENRYN create mode 100644 kernel/x86/KERNEL.PRESCOTT create mode 100644 kernel/x86/KERNEL.VIAC3 create mode 100644 kernel/x86/KERNEL.YONAH create mode 100644 kernel/x86/Makefile create mode 100644 kernel/x86/amax.S create mode 100644 kernel/x86/amax_sse.S create mode 100644 kernel/x86/amax_sse2.S create mode 100644 kernel/x86/asum.S create mode 100644 kernel/x86/asum_sse.S create mode 100644 kernel/x86/asum_sse2.S create mode 100644 kernel/x86/axpy.S create mode 100644 kernel/x86/axpy_sse.S create mode 100644 kernel/x86/axpy_sse2.S create mode 100644 kernel/x86/axpy_sse2_opteron.S create mode 100644 kernel/x86/cabs.S create mode 100644 kernel/x86/copy.S create mode 100644 kernel/x86/copy_sse.S create mode 100644 kernel/x86/copy_sse2.S create mode 100644 kernel/x86/cpuid.S create mode 100644 kernel/x86/dot.S create mode 100644 kernel/x86/dot_amd.S create mode 100644 kernel/x86/dot_sse.S create mode 100644 kernel/x86/dot_sse2.S create mode 100644 kernel/x86/dot_sse2_opteron.S create mode 100644 kernel/x86/dot_sse_opteron.S create mode 100644 kernel/x86/gemm_beta.S create mode 100644 kernel/x86/gemm_kernel_1x4.S create mode 100644 kernel/x86/gemm_kernel_2x2.S create mode 100644 kernel/x86/gemm_kernel_2x2_atom.S create mode 100644 kernel/x86/gemm_kernel_2x4_3dnow.S create mode 100644 kernel/x86/gemm_kernel_2x4_barcelona.S create mode 100644 kernel/x86/gemm_kernel_2x4_core2.S create mode 100644 kernel/x86/gemm_kernel_2x4_penryn.S create mode 100644 kernel/x86/gemm_kernel_2x4_sse2.S create mode 100644 kernel/x86/gemm_kernel_2x4_sse3.S create mode 100644 kernel/x86/gemm_kernel_4x2_core2.S create mode 100644 kernel/x86/gemm_kernel_4x2_sse2.S create mode 100644 kernel/x86/gemm_kernel_4x4_barcelona.S create mode 100644 kernel/x86/gemm_kernel_4x4_penryn.S create mode 100644 kernel/x86/gemm_kernel_4x4_sse.S create mode 100644 kernel/x86/gemm_kernel_4x4_sse3.S create mode 100644 kernel/x86/gemm_kernel_8x1_sse2.S create mode 100644 kernel/x86/gemm_kernel_8x2_core2.S create mode 100644 kernel/x86/gemm_kernel_8x2_sse.S create mode 100644 kernel/x86/gemm_ncopy_2.S create mode 100644 kernel/x86/gemm_ncopy_2_sse.S create mode 100644 kernel/x86/gemm_ncopy_4_sse.S create mode 100644 kernel/x86/gemm_tcopy_2.S create mode 100644 kernel/x86/gemm_tcopy_2_sse.S create mode 100644 kernel/x86/gemm_tcopy_4_sse.S create mode 100644 kernel/x86/gemv_n.S create mode 100644 kernel/x86/gemv_n_atom.S create mode 100644 kernel/x86/gemv_n_sse.S create mode 100644 kernel/x86/gemv_n_sse2.S create mode 100644 kernel/x86/gemv_t.S create mode 100644 kernel/x86/gemv_t_atom.S create mode 100644 kernel/x86/gemv_t_sse.S create mode 100644 kernel/x86/gemv_t_sse2.S create mode 100644 kernel/x86/iamax.S create mode 100644 kernel/x86/iamax_sse.S create mode 100644 kernel/x86/iamax_sse2.S create mode 100644 kernel/x86/izamax.S create mode 100644 kernel/x86/izamax_sse.S create mode 100644 kernel/x86/izamax_sse2.S create mode 100644 kernel/x86/lsame.S create mode 100644 kernel/x86/nrm2.S create mode 100644 kernel/x86/nrm2_sse.S create mode 100644 kernel/x86/qaxpy.S create mode 100644 kernel/x86/qconjg.S create mode 100644 kernel/x86/qdot.S create mode 100644 kernel/x86/qgemm_kernel_2x2.S create mode 100644 
kernel/x86/qgemv_n.S create mode 100644 kernel/x86/qgemv_t.S create mode 100644 kernel/x86/qtrsm_kernel_LN_2x2.S create mode 100644 kernel/x86/qtrsm_kernel_LT_2x2.S create mode 100644 kernel/x86/qtrsm_kernel_RT_2x2.S create mode 100644 kernel/x86/rot.S create mode 100644 kernel/x86/rot_sse.S create mode 100644 kernel/x86/rot_sse2.S create mode 100644 kernel/x86/scal.S create mode 100644 kernel/x86/scal_sse.S create mode 100644 kernel/x86/scal_sse2.S create mode 100644 kernel/x86/staticbuffer.S create mode 100644 kernel/x86/swap.S create mode 100644 kernel/x86/swap_sse.S create mode 100644 kernel/x86/swap_sse2.S create mode 100644 kernel/x86/trsm_kernel_LN_2x2.S create mode 100644 kernel/x86/trsm_kernel_LN_2x2_atom.S create mode 100644 kernel/x86/trsm_kernel_LN_2x4_penryn.S create mode 100644 kernel/x86/trsm_kernel_LN_2x4_sse2.S create mode 100644 kernel/x86/trsm_kernel_LN_2x4_sse3.S create mode 100644 kernel/x86/trsm_kernel_LN_4x2_core2.S create mode 100644 kernel/x86/trsm_kernel_LN_4x2_sse2.S create mode 100644 kernel/x86/trsm_kernel_LN_4x4_penryn.S create mode 100644 kernel/x86/trsm_kernel_LN_4x4_sse.S create mode 100644 kernel/x86/trsm_kernel_LN_8x2_sse.S create mode 100644 kernel/x86/trsm_kernel_LT_1x4.S create mode 100644 kernel/x86/trsm_kernel_LT_2x2.S create mode 100644 kernel/x86/trsm_kernel_LT_2x2_atom.S create mode 100644 kernel/x86/trsm_kernel_LT_2x4_penryn.S create mode 100644 kernel/x86/trsm_kernel_LT_2x4_sse2.S create mode 100644 kernel/x86/trsm_kernel_LT_2x4_sse3.S create mode 100644 kernel/x86/trsm_kernel_LT_4x2_core2.S create mode 100644 kernel/x86/trsm_kernel_LT_4x2_sse2.S create mode 100644 kernel/x86/trsm_kernel_LT_4x4_penryn.S create mode 100644 kernel/x86/trsm_kernel_LT_4x4_sse.S create mode 100644 kernel/x86/trsm_kernel_LT_8x2_sse.S create mode 100644 kernel/x86/trsm_kernel_RT_1x4.S create mode 100644 kernel/x86/trsm_kernel_RT_2x2.S create mode 100644 kernel/x86/trsm_kernel_RT_2x2_atom.S create mode 100644 kernel/x86/trsm_kernel_RT_2x4_penryn.S create mode 100644 kernel/x86/trsm_kernel_RT_2x4_sse2.S create mode 100644 kernel/x86/trsm_kernel_RT_2x4_sse3.S create mode 100644 kernel/x86/trsm_kernel_RT_4x2_core2.S create mode 100644 kernel/x86/trsm_kernel_RT_4x2_sse2.S create mode 100644 kernel/x86/trsm_kernel_RT_4x4_penryn.S create mode 100644 kernel/x86/trsm_kernel_RT_4x4_sse.S create mode 100644 kernel/x86/trsm_kernel_RT_8x2_sse.S create mode 100644 kernel/x86/xaxpy.S create mode 100644 kernel/x86/xdot.S create mode 100644 kernel/x86/xgemm3m_kernel_2x2.S create mode 100644 kernel/x86/xgemm_kernel_1x1.S create mode 100644 kernel/x86/xgemv_n.S create mode 100644 kernel/x86/xgemv_t.S create mode 100644 kernel/x86/xtrsm_kernel_LT_1x1.S create mode 100644 kernel/x86/zamax.S create mode 100644 kernel/x86/zamax_sse.S create mode 100644 kernel/x86/zamax_sse2.S create mode 100644 kernel/x86/zasum.S create mode 100644 kernel/x86/zasum_sse.S create mode 100644 kernel/x86/zasum_sse2.S create mode 100644 kernel/x86/zaxpy.S create mode 100644 kernel/x86/zaxpy_sse.S create mode 100644 kernel/x86/zaxpy_sse2.S create mode 100644 kernel/x86/zcopy.S create mode 100644 kernel/x86/zcopy_sse.S create mode 100644 kernel/x86/zcopy_sse2.S create mode 100644 kernel/x86/zdot.S create mode 100644 kernel/x86/zdot_amd.S create mode 100644 kernel/x86/zdot_sse.S create mode 100644 kernel/x86/zdot_sse2.S create mode 100644 kernel/x86/zgemm3m_kernel_1x4_athlon.S create mode 100644 kernel/x86/zgemm3m_kernel_2x2_atom.S create mode 100644 kernel/x86/zgemm3m_kernel_2x2_coppermine.S create mode 100644 
kernel/x86/zgemm3m_kernel_2x4_barcelona.S create mode 100644 kernel/x86/zgemm3m_kernel_2x4_opteron.S create mode 100644 kernel/x86/zgemm3m_kernel_2x4_penryn.S create mode 100644 kernel/x86/zgemm3m_kernel_2x4_prescott.S create mode 100644 kernel/x86/zgemm3m_kernel_4x2_core2.S create mode 100644 kernel/x86/zgemm3m_kernel_4x2_northwood.S create mode 100644 kernel/x86/zgemm3m_kernel_4x4_barcelona.S create mode 100644 kernel/x86/zgemm3m_kernel_4x4_opteron.S create mode 100644 kernel/x86/zgemm3m_kernel_4x4_penryn.S create mode 100644 kernel/x86/zgemm3m_kernel_4x4_prescott.S create mode 100644 kernel/x86/zgemm3m_kernel_8x2_core2.S create mode 100644 kernel/x86/zgemm3m_kernel_8x2_sse.S create mode 100644 kernel/x86/zgemm_beta.S create mode 100644 kernel/x86/zgemm_kernel_1x1.S create mode 100644 kernel/x86/zgemm_kernel_1x1_atom.S create mode 100644 kernel/x86/zgemm_kernel_1x2.S create mode 100644 kernel/x86/zgemm_kernel_1x2_3dnow.S create mode 100644 kernel/x86/zgemm_kernel_1x2_barcelona.S create mode 100644 kernel/x86/zgemm_kernel_1x2_penryn.S create mode 100644 kernel/x86/zgemm_kernel_1x2_sse2.S create mode 100644 kernel/x86/zgemm_kernel_1x2_sse3.S create mode 100644 kernel/x86/zgemm_kernel_2x1_core2.S create mode 100644 kernel/x86/zgemm_kernel_2x1_sse2.S create mode 100644 kernel/x86/zgemm_kernel_2x2_barcelona.S create mode 100644 kernel/x86/zgemm_kernel_2x2_penryn.S create mode 100644 kernel/x86/zgemm_kernel_2x2_sse.S create mode 100644 kernel/x86/zgemm_kernel_2x2_sse3.S create mode 100644 kernel/x86/zgemm_kernel_4x1_core2.S create mode 100644 kernel/x86/zgemm_kernel_4x1_sse.S create mode 100644 kernel/x86/zgemm_ncopy_2.S create mode 100644 kernel/x86/zgemm_tcopy_2.S create mode 100644 kernel/x86/zgemv_n.S create mode 100644 kernel/x86/zgemv_n_atom.S create mode 100644 kernel/x86/zgemv_n_sse.S create mode 100644 kernel/x86/zgemv_n_sse2.S create mode 100644 kernel/x86/zgemv_t.S create mode 100644 kernel/x86/zgemv_t_atom.S create mode 100644 kernel/x86/zgemv_t_sse.S create mode 100644 kernel/x86/zgemv_t_sse2.S create mode 100644 kernel/x86/znrm2.S create mode 100644 kernel/x86/znrm2_sse.S create mode 100644 kernel/x86/zrot.S create mode 100644 kernel/x86/zrot_sse.S create mode 100644 kernel/x86/zrot_sse2.S create mode 100644 kernel/x86/zscal.S create mode 100644 kernel/x86/zscal_sse.S create mode 100644 kernel/x86/zscal_sse2.S create mode 100644 kernel/x86/zswap.S create mode 100644 kernel/x86/zswap_sse.S create mode 100644 kernel/x86/zswap_sse2.S create mode 100644 kernel/x86/ztrsm_kernel_LN_2x1_core2.S create mode 100644 kernel/x86/ztrsm_kernel_LN_2x1_sse2.S create mode 100644 kernel/x86/ztrsm_kernel_LN_2x2_penryn.S create mode 100644 kernel/x86/ztrsm_kernel_LN_2x2_sse.S create mode 100644 kernel/x86/ztrsm_kernel_LN_4x1_sse.S create mode 100644 kernel/x86/ztrsm_kernel_LT_1x1.S create mode 100644 kernel/x86/ztrsm_kernel_LT_1x1_atom.S create mode 100644 kernel/x86/ztrsm_kernel_LT_1x2_penryn.S create mode 100644 kernel/x86/ztrsm_kernel_LT_1x2_sse2.S create mode 100644 kernel/x86/ztrsm_kernel_LT_1x2_sse3.S create mode 100644 kernel/x86/ztrsm_kernel_LT_2x1_core2.S create mode 100644 kernel/x86/ztrsm_kernel_LT_2x1_sse2.S create mode 100644 kernel/x86/ztrsm_kernel_LT_2x2_penryn.S create mode 100644 kernel/x86/ztrsm_kernel_LT_2x2_sse.S create mode 100644 kernel/x86/ztrsm_kernel_LT_4x1_sse.S create mode 100644 kernel/x86/ztrsm_kernel_RT_1x2_penryn.S create mode 100644 kernel/x86/ztrsm_kernel_RT_1x2_sse2.S create mode 100644 kernel/x86/ztrsm_kernel_RT_1x2_sse3.S create mode 100644 
kernel/x86/ztrsm_kernel_RT_2x2_penryn.S create mode 100644 kernel/x86/ztrsm_kernel_RT_2x2_sse.S create mode 100644 kernel/x86_64/KERNEL create mode 100644 kernel/x86_64/KERNEL.ATOM create mode 100644 kernel/x86_64/KERNEL.BARCELONA create mode 100644 kernel/x86_64/KERNEL.CORE2 create mode 100644 kernel/x86_64/KERNEL.DUNNINGTON create mode 100644 kernel/x86_64/KERNEL.NANO create mode 100644 kernel/x86_64/KERNEL.NEHALEM create mode 100644 kernel/x86_64/KERNEL.OPTERON create mode 100644 kernel/x86_64/KERNEL.OPTERON_SSE3 create mode 100644 kernel/x86_64/KERNEL.PENRYN create mode 100644 kernel/x86_64/KERNEL.PRESCOTT create mode 100644 kernel/x86_64/Makefile create mode 100644 kernel/x86_64/amax.S create mode 100644 kernel/x86_64/amax_atom.S create mode 100644 kernel/x86_64/amax_sse.S create mode 100644 kernel/x86_64/amax_sse2.S create mode 100644 kernel/x86_64/asum.S create mode 100644 kernel/x86_64/asum_atom.S create mode 100644 kernel/x86_64/asum_sse.S create mode 100644 kernel/x86_64/asum_sse2.S create mode 100644 kernel/x86_64/axpy.S create mode 100644 kernel/x86_64/axpy_atom.S create mode 100644 kernel/x86_64/axpy_sse.S create mode 100644 kernel/x86_64/axpy_sse2.S create mode 100644 kernel/x86_64/builtin_stinit.S create mode 100644 kernel/x86_64/cabs.S create mode 100644 kernel/x86_64/cgemv_n.S create mode 100644 kernel/x86_64/cgemv_t.S create mode 100644 kernel/x86_64/copy.S create mode 100644 kernel/x86_64/copy_sse.S create mode 100644 kernel/x86_64/copy_sse2.S create mode 100644 kernel/x86_64/dgemm_ncopy_2.S create mode 100644 kernel/x86_64/dgemm_ncopy_4.S create mode 100644 kernel/x86_64/dgemm_ncopy_8.S create mode 100644 kernel/x86_64/dgemm_tcopy_2.S create mode 100644 kernel/x86_64/dgemm_tcopy_4.S create mode 100644 kernel/x86_64/dgemm_tcopy_8.S create mode 100644 kernel/x86_64/dgemv_n.S create mode 100644 kernel/x86_64/dgemv_n_atom.S create mode 100644 kernel/x86_64/dgemv_t.S create mode 100644 kernel/x86_64/dgemv_t_atom.S create mode 100644 kernel/x86_64/dot.S create mode 100644 kernel/x86_64/dot_atom.S create mode 100644 kernel/x86_64/dot_sse.S create mode 100644 kernel/x86_64/dot_sse2.S create mode 100644 kernel/x86_64/gemm_beta.S create mode 100644 kernel/x86_64/gemm_kernel_2x8_nehalem.S create mode 100644 kernel/x86_64/gemm_kernel_4x2_atom.S create mode 100644 kernel/x86_64/gemm_kernel_4x4_barcelona.S create mode 100644 kernel/x86_64/gemm_kernel_4x4_core2.S create mode 100644 kernel/x86_64/gemm_kernel_4x4_penryn.S create mode 100644 kernel/x86_64/gemm_kernel_4x4_sse2.S create mode 100644 kernel/x86_64/gemm_kernel_4x4_sse3.S create mode 100644 kernel/x86_64/gemm_kernel_4x8_nano.S create mode 100644 kernel/x86_64/gemm_kernel_4x8_nehalem.S create mode 100644 kernel/x86_64/gemm_kernel_8x4_barcelona.S create mode 100644 kernel/x86_64/gemm_kernel_8x4_core2.S create mode 100644 kernel/x86_64/gemm_kernel_8x4_penryn.S create mode 100644 kernel/x86_64/gemm_kernel_8x4_sse.S create mode 100644 kernel/x86_64/gemm_kernel_8x4_sse3.S create mode 100644 kernel/x86_64/gemm_ncopy_2.S create mode 100644 kernel/x86_64/gemm_ncopy_4.S create mode 100644 kernel/x86_64/gemm_ncopy_4_opteron.S create mode 100644 kernel/x86_64/gemm_tcopy_2.S create mode 100644 kernel/x86_64/gemm_tcopy_4.S create mode 100644 kernel/x86_64/gemm_tcopy_4_opteron.S create mode 100644 kernel/x86_64/iamax.S create mode 100644 kernel/x86_64/iamax_sse.S create mode 100644 kernel/x86_64/iamax_sse2.S create mode 100644 kernel/x86_64/izamax.S create mode 100644 kernel/x86_64/izamax_sse.S create mode 100644 kernel/x86_64/izamax_sse2.S 
create mode 100644 kernel/x86_64/lsame.S create mode 100644 kernel/x86_64/mcount.S create mode 100644 kernel/x86_64/nrm2.S create mode 100644 kernel/x86_64/nrm2_sse.S create mode 100644 kernel/x86_64/qconjg.S create mode 100644 kernel/x86_64/qdot.S create mode 100644 kernel/x86_64/qgemm_kernel_2x2.S create mode 100644 kernel/x86_64/qgemv_n.S create mode 100644 kernel/x86_64/qgemv_t.S create mode 100644 kernel/x86_64/qtrsm_kernel_LN_2x2.S create mode 100644 kernel/x86_64/qtrsm_kernel_LT_2x2.S create mode 100644 kernel/x86_64/qtrsm_kernel_RT_2x2.S create mode 100644 kernel/x86_64/rot.S create mode 100644 kernel/x86_64/rot_sse.S create mode 100644 kernel/x86_64/rot_sse2.S create mode 100644 kernel/x86_64/scal.S create mode 100644 kernel/x86_64/scal_atom.S create mode 100644 kernel/x86_64/scal_sse.S create mode 100644 kernel/x86_64/scal_sse2.S create mode 100644 kernel/x86_64/sgemv_n.S create mode 100644 kernel/x86_64/sgemv_t.S create mode 100644 kernel/x86_64/staticbuffer.S create mode 100644 kernel/x86_64/swap.S create mode 100644 kernel/x86_64/swap_sse.S create mode 100644 kernel/x86_64/swap_sse2.S create mode 100644 kernel/x86_64/symv_L_sse.S create mode 100644 kernel/x86_64/symv_L_sse2.S create mode 100644 kernel/x86_64/symv_U_sse.S create mode 100644 kernel/x86_64/symv_U_sse2.S create mode 100644 kernel/x86_64/trsm_kernel_LN_2x8_nehalem.S create mode 100644 kernel/x86_64/trsm_kernel_LN_4x2_atom.S create mode 100644 kernel/x86_64/trsm_kernel_LN_4x4_barcelona.S create mode 100644 kernel/x86_64/trsm_kernel_LN_4x4_core2.S create mode 100644 kernel/x86_64/trsm_kernel_LN_4x4_penryn.S create mode 100644 kernel/x86_64/trsm_kernel_LN_4x4_sse2.S create mode 100644 kernel/x86_64/trsm_kernel_LN_4x4_sse3.S create mode 100644 kernel/x86_64/trsm_kernel_LN_4x8_nehalem.S create mode 100644 kernel/x86_64/trsm_kernel_LN_8x4_sse.S create mode 100644 kernel/x86_64/trsm_kernel_LT_2x8_nehalem.S create mode 100644 kernel/x86_64/trsm_kernel_LT_4x2_atom.S create mode 100644 kernel/x86_64/trsm_kernel_LT_4x4_barcelona.S create mode 100644 kernel/x86_64/trsm_kernel_LT_4x4_core2.S create mode 100644 kernel/x86_64/trsm_kernel_LT_4x4_penryn.S create mode 100644 kernel/x86_64/trsm_kernel_LT_4x4_sse2.S create mode 100644 kernel/x86_64/trsm_kernel_LT_4x4_sse3.S create mode 100644 kernel/x86_64/trsm_kernel_LT_4x8_nehalem.S create mode 100644 kernel/x86_64/trsm_kernel_LT_8x4_sse.S create mode 100644 kernel/x86_64/trsm_kernel_RT_2x8_nehalem.S create mode 100644 kernel/x86_64/trsm_kernel_RT_4x2_atom.S create mode 100644 kernel/x86_64/trsm_kernel_RT_4x4_barcelona.S create mode 100644 kernel/x86_64/trsm_kernel_RT_4x4_core2.S create mode 100644 kernel/x86_64/trsm_kernel_RT_4x4_penryn.S create mode 100644 kernel/x86_64/trsm_kernel_RT_4x4_sse2.S create mode 100644 kernel/x86_64/trsm_kernel_RT_4x4_sse3.S create mode 100644 kernel/x86_64/trsm_kernel_RT_4x8_nehalem.S create mode 100644 kernel/x86_64/trsm_kernel_RT_8x4_sse.S create mode 100644 kernel/x86_64/xdot.S create mode 100644 kernel/x86_64/xgemm3m_kernel_2x2.S create mode 100644 kernel/x86_64/xgemm_kernel_1x1.S create mode 100644 kernel/x86_64/xgemv_n.S create mode 100644 kernel/x86_64/xgemv_t.S create mode 100644 kernel/x86_64/xtrsm_kernel_LT_1x1.S create mode 100644 kernel/x86_64/zamax.S create mode 100644 kernel/x86_64/zamax_atom.S create mode 100644 kernel/x86_64/zamax_sse.S create mode 100644 kernel/x86_64/zamax_sse2.S create mode 100644 kernel/x86_64/zasum.S create mode 100644 kernel/x86_64/zasum_atom.S create mode 100644 kernel/x86_64/zasum_sse.S create mode 100644 
kernel/x86_64/zasum_sse2.S create mode 100644 kernel/x86_64/zaxpy.S create mode 100644 kernel/x86_64/zaxpy_atom.S create mode 100644 kernel/x86_64/zaxpy_sse.S create mode 100644 kernel/x86_64/zaxpy_sse2.S create mode 100644 kernel/x86_64/zcopy.S create mode 100644 kernel/x86_64/zcopy_sse.S create mode 100644 kernel/x86_64/zcopy_sse2.S create mode 100644 kernel/x86_64/zdot.S create mode 100644 kernel/x86_64/zdot_atom.S create mode 100644 kernel/x86_64/zdot_sse.S create mode 100644 kernel/x86_64/zdot_sse2.S create mode 100644 kernel/x86_64/zgemm3m_kernel_2x8_nehalem.S create mode 100644 kernel/x86_64/zgemm3m_kernel_4x2_atom.S create mode 100644 kernel/x86_64/zgemm3m_kernel_4x4_barcelona.S create mode 100644 kernel/x86_64/zgemm3m_kernel_4x4_core2.S create mode 100644 kernel/x86_64/zgemm3m_kernel_4x4_penryn.S create mode 100644 kernel/x86_64/zgemm3m_kernel_4x4_sse2.S create mode 100644 kernel/x86_64/zgemm3m_kernel_4x4_sse3.S create mode 100644 kernel/x86_64/zgemm3m_kernel_4x8_nehalem.S create mode 100644 kernel/x86_64/zgemm3m_kernel_8x4_barcelona.S create mode 100644 kernel/x86_64/zgemm3m_kernel_8x4_core2.S create mode 100644 kernel/x86_64/zgemm3m_kernel_8x4_penryn.S create mode 100644 kernel/x86_64/zgemm3m_kernel_8x4_sse.S create mode 100644 kernel/x86_64/zgemm3m_kernel_8x4_sse3.S create mode 100644 kernel/x86_64/zgemm_beta.S create mode 100644 kernel/x86_64/zgemm_kernel_1x4_nehalem.S create mode 100644 kernel/x86_64/zgemm_kernel_2x1_atom.S create mode 100644 kernel/x86_64/zgemm_kernel_2x2_barcelona.S create mode 100644 kernel/x86_64/zgemm_kernel_2x2_core2.S create mode 100644 kernel/x86_64/zgemm_kernel_2x2_penryn.S create mode 100644 kernel/x86_64/zgemm_kernel_2x2_sse2.S create mode 100644 kernel/x86_64/zgemm_kernel_2x2_sse3.S create mode 100644 kernel/x86_64/zgemm_kernel_2x4_nehalem.S create mode 100644 kernel/x86_64/zgemm_kernel_4x2_barcelona.S create mode 100644 kernel/x86_64/zgemm_kernel_4x2_core2.S create mode 100644 kernel/x86_64/zgemm_kernel_4x2_penryn.S create mode 100644 kernel/x86_64/zgemm_kernel_4x2_sse.S create mode 100644 kernel/x86_64/zgemm_kernel_4x2_sse3.S create mode 100644 kernel/x86_64/zgemm_ncopy_1.S create mode 100644 kernel/x86_64/zgemm_ncopy_2.S create mode 100644 kernel/x86_64/zgemm_tcopy_1.S create mode 100644 kernel/x86_64/zgemm_tcopy_2.S create mode 100644 kernel/x86_64/zgemv_n.S create mode 100644 kernel/x86_64/zgemv_n_atom.S create mode 100644 kernel/x86_64/zgemv_n_dup.S create mode 100644 kernel/x86_64/zgemv_t.S create mode 100644 kernel/x86_64/zgemv_t_atom.S create mode 100644 kernel/x86_64/zgemv_t_dup.S create mode 100644 kernel/x86_64/znrm2.S create mode 100644 kernel/x86_64/znrm2_sse.S create mode 100644 kernel/x86_64/zrot.S create mode 100644 kernel/x86_64/zrot_sse.S create mode 100644 kernel/x86_64/zrot_sse2.S create mode 100644 kernel/x86_64/zscal.S create mode 100644 kernel/x86_64/zscal_atom.S create mode 100644 kernel/x86_64/zscal_sse.S create mode 100644 kernel/x86_64/zscal_sse2.S create mode 100644 kernel/x86_64/zswap.S create mode 100644 kernel/x86_64/zswap_sse.S create mode 100644 kernel/x86_64/zswap_sse2.S create mode 100644 kernel/x86_64/zsymv_L_sse.S create mode 100644 kernel/x86_64/zsymv_L_sse2.S create mode 100644 kernel/x86_64/zsymv_U_sse.S create mode 100644 kernel/x86_64/zsymv_U_sse2.S create mode 100644 kernel/x86_64/ztrsm_kernel_LN_2x1_atom.S create mode 100644 kernel/x86_64/ztrsm_kernel_LN_2x2_core2.S create mode 100644 kernel/x86_64/ztrsm_kernel_LN_2x2_penryn.S create mode 100644 kernel/x86_64/ztrsm_kernel_LN_2x2_sse2.S create mode 
100644 kernel/x86_64/ztrsm_kernel_LN_2x2_sse3.S create mode 100644 kernel/x86_64/ztrsm_kernel_LN_2x4_nehalem.S create mode 100644 kernel/x86_64/ztrsm_kernel_LN_4x2_sse.S create mode 100644 kernel/x86_64/ztrsm_kernel_LT_1x4_nehalem.S create mode 100644 kernel/x86_64/ztrsm_kernel_LT_2x1_atom.S create mode 100644 kernel/x86_64/ztrsm_kernel_LT_2x2_core2.S create mode 100644 kernel/x86_64/ztrsm_kernel_LT_2x2_penryn.S create mode 100644 kernel/x86_64/ztrsm_kernel_LT_2x2_sse2.S create mode 100644 kernel/x86_64/ztrsm_kernel_LT_2x2_sse3.S create mode 100644 kernel/x86_64/ztrsm_kernel_LT_2x4_nehalem.S create mode 100644 kernel/x86_64/ztrsm_kernel_LT_4x2_sse.S create mode 100644 kernel/x86_64/ztrsm_kernel_RT_1x4_nehalem.S create mode 100644 kernel/x86_64/ztrsm_kernel_RT_2x2_core2.S create mode 100644 kernel/x86_64/ztrsm_kernel_RT_2x2_penryn.S create mode 100644 kernel/x86_64/ztrsm_kernel_RT_2x2_sse2.S create mode 100644 kernel/x86_64/ztrsm_kernel_RT_2x2_sse3.S create mode 100644 kernel/x86_64/ztrsm_kernel_RT_2x4_nehalem.S create mode 100644 kernel/x86_64/ztrsm_kernel_RT_4x2_sse.S create mode 100644 l1param.h create mode 100644 l2param.h create mode 100644 lapack/Makefile create mode 100644 lapack/getf2/Makefile create mode 100644 lapack/getf2/getf2_k.c create mode 100644 lapack/getf2/zgetf2_k.c create mode 100644 lapack/getrf/Makefile create mode 100644 lapack/getrf/getrf_parallel.c create mode 100644 lapack/getrf/getrf_parallel_omp.c create mode 100644 lapack/getrf/getrf_single.c create mode 100644 lapack/getri/cgetri.f create mode 100644 lapack/getri/dgetri.f create mode 100644 lapack/getri/sgetri.f create mode 100644 lapack/getri/zgetri.f create mode 100644 lapack/getrs/Makefile create mode 100644 lapack/getrs/getrs_parallel.c create mode 100644 lapack/getrs/getrs_single.c create mode 100644 lapack/getrs/zgetrs_parallel.c create mode 100644 lapack/getrs/zgetrs_single.c create mode 100644 lapack/laswp/Makefile create mode 100644 lapack/laswp/alpha/Makefile create mode 100644 lapack/laswp/generic/Makefile create mode 100644 lapack/laswp/generic/laswp_k.c create mode 100644 lapack/laswp/generic/laswp_k_1.c create mode 100644 lapack/laswp/generic/laswp_k_2.c create mode 100644 lapack/laswp/generic/laswp_k_4.c create mode 100644 lapack/laswp/generic/laswp_k_8.c create mode 100644 lapack/laswp/generic/zlaswp_k.c create mode 100644 lapack/laswp/generic/zlaswp_k_1.c create mode 100644 lapack/laswp/generic/zlaswp_k_2.c create mode 100644 lapack/laswp/generic/zlaswp_k_4.c create mode 100644 lapack/laswp/ia64/Makefile create mode 100644 lapack/laswp/mips64/Makefile create mode 100644 lapack/laswp/power/Makefile create mode 100644 lapack/laswp/sparc/Makefile create mode 100644 lapack/laswp/x86/Makefile create mode 100644 lapack/laswp/x86_64/Makefile create mode 100644 lapack/lauu2/Makefile create mode 100644 lapack/lauu2/lauu2_L.c create mode 100644 lapack/lauu2/lauu2_U.c create mode 100644 lapack/lauu2/zlauu2_L.c create mode 100644 lapack/lauu2/zlauu2_U.c create mode 100644 lapack/lauum/Makefile create mode 100644 lapack/lauum/lauum_L_parallel.c create mode 100644 lapack/lauum/lauum_L_single.c create mode 100644 lapack/lauum/lauum_U_parallel.c create mode 100644 lapack/lauum/lauum_U_single.c create mode 100644 lapack/potf2/Makefile create mode 100644 lapack/potf2/potf2_L.c create mode 100644 lapack/potf2/potf2_U.c create mode 100644 lapack/potf2/zpotf2_L.c create mode 100644 lapack/potf2/zpotf2_U.c create mode 100644 lapack/potrf/Makefile create mode 100644 lapack/potrf/potrf_L_parallel.c create mode 100644 
lapack/potrf/potrf_L_single.c create mode 100644 lapack/potrf/potrf_U_parallel.c create mode 100644 lapack/potrf/potrf_U_single.c create mode 100644 lapack/potrf/potrf_parallel.c create mode 100644 lapack/trti2/Makefile create mode 100644 lapack/trti2/trti2_L.c create mode 100644 lapack/trti2/trti2_U.c create mode 100644 lapack/trti2/ztrti2_L.c create mode 100644 lapack/trti2/ztrti2_U.c create mode 100644 lapack/trtri/Makefile create mode 100644 lapack/trtri/trtri_L_parallel.c create mode 100644 lapack/trtri/trtri_L_single.c create mode 100644 lapack/trtri/trtri_U_parallel.c create mode 100644 lapack/trtri/trtri_U_single.c create mode 100644 make.inc create mode 100644 param.h create mode 100644 patch.for_lapack-3.1.1 create mode 100644 quickbuild.32bit create mode 100644 quickbuild.64bit create mode 100644 quickbuild.win32 create mode 100644 quickbuild.win64 create mode 100644 reference/LICENSE create mode 100644 reference/Makefile create mode 100644 reference/caxpycf.f create mode 100644 reference/caxpyf.f create mode 100644 reference/ccopyf.f create mode 100644 reference/cdotcf.f create mode 100644 reference/cdotuf.f create mode 100644 reference/cgbmvf.f create mode 100644 reference/cgemm3mf.f create mode 100644 reference/cgemmf.f create mode 100644 reference/cgemvf.f create mode 100644 reference/cgercf.f create mode 100644 reference/cgeruf.f create mode 100644 reference/cgesvf.f create mode 100644 reference/cgetf2f.f create mode 100644 reference/cgetrff.f create mode 100644 reference/cgetrsf.f create mode 100644 reference/chbmvf.f create mode 100644 reference/chemm3mf.f create mode 100644 reference/chemmf.f create mode 100644 reference/chemvf.f create mode 100644 reference/cher2f.f create mode 100644 reference/cher2kf.f create mode 100644 reference/cherf.f create mode 100644 reference/cherkf.f create mode 100644 reference/chpmvf.f create mode 100644 reference/chpr2f.f create mode 100644 reference/chprf.f create mode 100644 reference/claswpf.f create mode 100644 reference/clauu2f.f create mode 100644 reference/clauumf.f create mode 100644 reference/cpotf2f.f create mode 100644 reference/cpotrff.f create mode 100644 reference/cpotrif.f create mode 100644 reference/crotgf.f create mode 100644 reference/csbmvf.f create mode 100644 reference/cscalf.f create mode 100644 reference/cspmvf.f create mode 100644 reference/cspr2f.f create mode 100644 reference/csprf.f create mode 100644 reference/csrotf.f create mode 100644 reference/csscalf.f create mode 100644 reference/cswapf.f create mode 100644 reference/csymm3mf.f create mode 100644 reference/csymmf.f create mode 100644 reference/csymvf.f create mode 100644 reference/csyr2f.f create mode 100644 reference/csyr2kf.f create mode 100644 reference/csyrf.f create mode 100644 reference/csyrkf.f create mode 100644 reference/ctbmvf.f create mode 100644 reference/ctbsvf.f create mode 100644 reference/ctpmvf.f create mode 100644 reference/ctpsvf.f create mode 100644 reference/ctrmmf.f create mode 100644 reference/ctrmvf.f create mode 100644 reference/ctrsmf.f create mode 100644 reference/ctrsvf.f create mode 100644 reference/ctrti2f.f create mode 100644 reference/ctrtrif.f create mode 100644 reference/damaxf.f create mode 100644 reference/daminf.f create mode 100644 reference/dasumf.f create mode 100644 reference/daxpyf.f create mode 100644 reference/dcopyf.f create mode 100644 reference/ddotf.f create mode 100644 reference/dgbmvf.f create mode 100644 reference/dgemmf.f create mode 100644 reference/dgemvf.f create mode 100644 reference/dgerf.f create 
mode 100644 reference/dgesvf.f create mode 100644 reference/dgetf2f.f create mode 100644 reference/dgetrff.f create mode 100644 reference/dgetrsf.f create mode 100644 reference/dlaswpf.f create mode 100644 reference/dlauu2f.f create mode 100644 reference/dlauumf.f create mode 100644 reference/dmaxf.f create mode 100644 reference/dminf.f create mode 100644 reference/dnrm2f.f create mode 100644 reference/dpotf2f.f create mode 100644 reference/dpotrff.f create mode 100644 reference/dpotrif.f create mode 100644 reference/drotf.f create mode 100644 reference/drotgf.f create mode 100644 reference/drotmf.f create mode 100644 reference/drotmgf.f create mode 100644 reference/dsbmvf.f create mode 100644 reference/dscalf.f create mode 100644 reference/dsdotf.f create mode 100644 reference/dspmvf.f create mode 100644 reference/dspr2f.f create mode 100644 reference/dsprf.f create mode 100644 reference/dswapf.f create mode 100644 reference/dsymmf.f create mode 100644 reference/dsymvf.f create mode 100644 reference/dsyr2f.f create mode 100644 reference/dsyr2kf.f create mode 100644 reference/dsyrf.f create mode 100644 reference/dsyrkf.f create mode 100644 reference/dtbmvf.f create mode 100644 reference/dtbsvf.f create mode 100644 reference/dtpmvf.f create mode 100644 reference/dtpsvf.f create mode 100644 reference/dtrmmf.f create mode 100644 reference/dtrmvf.f create mode 100644 reference/dtrsmf.f create mode 100644 reference/dtrsvf.f create mode 100644 reference/dtrti2f.f create mode 100644 reference/dtrtrif.f create mode 100644 reference/dzamaxf.f create mode 100644 reference/dzaminf.f create mode 100644 reference/dzasumf.f create mode 100644 reference/dznrm2f.f create mode 100644 reference/icamaxf.f create mode 100644 reference/icaminf.f create mode 100644 reference/idamaxf.f create mode 100644 reference/idaminf.f create mode 100644 reference/idmaxf.f create mode 100644 reference/idminf.f create mode 100644 reference/iqamaxf.f create mode 100644 reference/iqaminf.f create mode 100644 reference/iqmaxf.f create mode 100644 reference/iqminf.f create mode 100644 reference/isamaxf.f create mode 100644 reference/isaminf.f create mode 100644 reference/ismaxf.f create mode 100644 reference/isminf.f create mode 100644 reference/ixamaxf.f create mode 100644 reference/ixaminf.f create mode 100644 reference/izamaxf.f create mode 100644 reference/izaminf.f create mode 100644 reference/lsamef.f create mode 100644 reference/samaxf.f create mode 100644 reference/saminf.f create mode 100644 reference/sasumf.f create mode 100644 reference/saxpyf.f create mode 100644 reference/scamaxf.f create mode 100644 reference/scaminf.f create mode 100644 reference/scasumf.f create mode 100644 reference/scnrm2f.f create mode 100644 reference/scopyf.f create mode 100644 reference/sdotf.f create mode 100644 reference/sdsdotf.f create mode 100644 reference/sgbmvf.f create mode 100644 reference/sgemmf.f create mode 100644 reference/sgemvf.f create mode 100644 reference/sgerf.f create mode 100644 reference/sgesvf.f create mode 100644 reference/sgetf2f.f create mode 100644 reference/sgetrff.f create mode 100644 reference/sgetrsf.f create mode 100644 reference/slaswpf.f create mode 100644 reference/slauu2f.f create mode 100644 reference/slauumf.f create mode 100644 reference/smaxf.f create mode 100644 reference/sminf.f create mode 100644 reference/snrm2f.f create mode 100644 reference/spotf2f.f create mode 100644 reference/spotrff.f create mode 100644 reference/spotrif.f create mode 100644 reference/srotf.f create mode 100644 
reference/srotgf.f create mode 100644 reference/srotmf.f create mode 100644 reference/srotmgf.f create mode 100644 reference/ssbmvf.f create mode 100644 reference/sscalf.f create mode 100644 reference/sspmvf.f create mode 100644 reference/sspr2f.f create mode 100644 reference/ssprf.f create mode 100644 reference/sswapf.f create mode 100644 reference/ssymmf.f create mode 100644 reference/ssymvf.f create mode 100644 reference/ssyr2f.f create mode 100644 reference/ssyr2kf.f create mode 100644 reference/ssyrf.f create mode 100644 reference/ssyrkf.f create mode 100644 reference/stbmvf.f create mode 100644 reference/stbsvf.f create mode 100644 reference/stpmvf.f create mode 100644 reference/stpsvf.f create mode 100644 reference/strmmf.f create mode 100644 reference/strmvf.f create mode 100644 reference/strsmf.f create mode 100644 reference/strsvf.f create mode 100644 reference/strti2f.f create mode 100644 reference/strtrif.f create mode 100644 reference/zaxpycf.f create mode 100644 reference/zaxpyf.f create mode 100644 reference/zcopyf.f create mode 100644 reference/zdotcf.f create mode 100644 reference/zdotuf.f create mode 100644 reference/zdrotf.f create mode 100644 reference/zdscalf.f create mode 100644 reference/zgbmvf.f create mode 100644 reference/zgemm3mf.f create mode 100644 reference/zgemmf.f create mode 100644 reference/zgemvf.f create mode 100644 reference/zgercf.f create mode 100644 reference/zgeruf.f create mode 100644 reference/zgesvf.f create mode 100644 reference/zgetf2f.f create mode 100644 reference/zgetrff.f create mode 100644 reference/zgetrsf.f create mode 100644 reference/zhbmvf.f create mode 100644 reference/zhemm3mf.f create mode 100644 reference/zhemmf.f create mode 100644 reference/zhemvf.f create mode 100644 reference/zher2f.f create mode 100644 reference/zher2kf.f create mode 100644 reference/zherf.f create mode 100644 reference/zherkf.f create mode 100644 reference/zhpmvf.f create mode 100644 reference/zhpr2f.f create mode 100644 reference/zhprf.f create mode 100644 reference/zlaswpf.f create mode 100644 reference/zlauu2f.f create mode 100644 reference/zlauumf.f create mode 100644 reference/zpotf2f.f create mode 100644 reference/zpotrff.f create mode 100644 reference/zpotrif.f create mode 100644 reference/zrotgf.f create mode 100644 reference/zsbmvf.f create mode 100644 reference/zscalf.f create mode 100644 reference/zspmvf.f create mode 100644 reference/zspr2f.f create mode 100644 reference/zsprf.f create mode 100644 reference/zswapf.f create mode 100644 reference/zsymm3mf.f create mode 100644 reference/zsymmf.f create mode 100644 reference/zsymvf.f create mode 100644 reference/zsyr2f.f create mode 100644 reference/zsyr2kf.f create mode 100644 reference/zsyrf.f create mode 100644 reference/zsyrkf.f create mode 100644 reference/ztbmvf.f create mode 100644 reference/ztbsvf.f create mode 100644 reference/ztpmvf.f create mode 100644 reference/ztpsvf.f create mode 100644 reference/ztrmmf.f create mode 100644 reference/ztrmvf.f create mode 100644 reference/ztrsmf.f create mode 100644 reference/ztrsvf.f create mode 100644 reference/ztrti2f.f create mode 100644 reference/ztrtrif.f create mode 100644 symcopy.h create mode 100644 test/LICENSE create mode 100644 test/Makefile create mode 100644 test/cblat1.f create mode 100644 test/cblat2.dat create mode 100644 test/cblat2.f create mode 100644 test/cblat3.dat create mode 100644 test/cblat3.f create mode 100644 test/dblat1.f create mode 100644 test/dblat2.dat create mode 100644 test/dblat2.f create mode 100644 test/dblat3.dat 
create mode 100644 test/dblat3.f create mode 100644 test/sblat1.f create mode 100644 test/sblat2.dat create mode 100644 test/sblat2.f create mode 100644 test/sblat3.dat create mode 100644 test/sblat3.f create mode 100644 test/zblat1.f create mode 100644 test/zblat2.dat create mode 100644 test/zblat2.f create mode 100644 test/zblat3.dat create mode 100644 test/zblat3.f create mode 100644 version.h
diff --git a/00License.txt b/00License.txt new file mode 100644 index 0000000000..56a0f740da --- /dev/null +++ b/00License.txt @@ -0,0 +1,32 @@ + +Copyright 2009, 2010 The University of Texas at Austin. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + +THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT AUSTIN ``AS IS'' +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT +AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +The views and conclusions contained in the software and documentation +are those of the authors and should not be interpreted as representing +official policies, either expressed or implied, of The University of +Texas at Austin.
diff --git a/01Readme.txt b/01Readme.txt new file mode 100644 index 0000000000..fdde1e3c7f --- /dev/null +++ b/01Readme.txt @@ -0,0 +1,93 @@ + Optimized GotoBLAS2 libraries version 1.13 + + By Kazushige Goto + +# This is the last update, made on 5th Feb. 2010. + +0. License + + See 00License.txt. + +1. Supported OS + + Linux + FreeBSD (it may also work on NetBSD) + OSX + Solaris + Windows 2000, XP, Server 2003 and 2008 (both 32-bit and 64-bit) + AIX + Tru64 UNIX + +2. Supported Architectures + + X86 : Pentium3 Katmai + Coppermine + Athlon (not well optimized, though) + PentiumM Banias, Yonah + Pentium4 Northwood + Nocona (Prescott) + Core 2 Woodcrest + Core 2 Penryn + Nehalem-EP Core i{3,5,7} + Atom + AMD Opteron + AMD Barcelona, Shanghai, Istanbul + VIA NANO + + X86_64: Pentium4 Nocona + Core 2 Woodcrest + Core 2 Penryn + Nehalem + Atom + AMD Opteron + AMD Barcelona, Shanghai, Istanbul + VIA NANO + + IA64 : Itanium2 + + Alpha : EV4, EV5, EV6 + + POWER : POWER4 + PPC970/PPC970FX + PPC970MP + CELL (PPU only) + POWER5 + PPC440 (QCDOC) + PPC440FP2 (BG/L) + POWERPC G4 (PPC7450) + POWER6 + + SPARC : SPARC IV + SPARC VI, VII (Fujitsu chip) + + MIPS64/32: SiCortex +
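+ (If you are not sure which entry applies to your machine, you can check after a build: the core detected by getarch is recorded in the generated Makefile.conf in the top directory. The grep pattern and the sample values below are only illustrative. + + $shell> make + $shell> grep -E "CORE|NUM_CORES" Makefile.conf # e.g. CORE=NEHALEM, NUM_CORES=8 +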
+3. Supported Compilers + + C compiler : GNU CC + Cygwin, MinGW + Other commercial compilers (especially for x86/x86_64) + + Fortran compiler : GNU G77, GFORTRAN + G95 + Open64 + Compaq + F2C + IBM + Intel + PathScale + PGI + SUN + Fujitsu + +4. Supported precision + + The x86/x86_64 version now supports 80-bit FP precision in addition to +the normal double precision and single precision. Currently only +gfortran supports 80-bit FP through "REAL*10". + + +5. How to build the library + + Please see 02QuickInstall.txt or just type "make". +
diff --git a/02QuickInstall.txt b/02QuickInstall.txt new file mode 100644 index 0000000000..abf3807415 --- /dev/null +++ b/02QuickInstall.txt @@ -0,0 +1,118 @@ + Quick installation for GotoBLAS2 + +*************************************************************************** +*************************************************************************** +** ** +** ** +** Just type "make". ** +** ** +** If you're not satisfied with this library, ** +** please read the following instructions and customize it. ** +** ** +** ** +*************************************************************************** +*************************************************************************** + + +1. The REALLY REALLY quick way to build the library + + Type "make" or "gmake". + + $shell> make + + The script will detect the Fortran compiler, the number of cores and + the architecture you're using. If the default gcc binary type is + 64-bit, a 64-bit library will be created; otherwise a 32-bit library + will be created. + + After the build finishes, you'll see various information about the + generated library. + + =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= + + GotoBLAS2 build complete. + + OS ... Linux + Architecture ... x86_64 + BINARY ... 64bit + C compiler ... GCC (command line : gcc) + Fortran compiler ... PATHSCALE (command line : pathf90) + Library Name ... libgoto_barcelonap-r1.27.a (Multi threaded; Max + num-threads is 16) + + =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= + + +2. Specifying a 32-bit or 64-bit library + + If you need a 32-bit binary, + + $shell> make BINARY=32 + + If you need a 64-bit binary, + + $shell> make BINARY=64 + + +3. Specifying the target architecture + + If you need a library for a different architecture, you can use the TARGET + option. You can find the currently available options at the top of getarch.c. + For example, if you need a library for the Intel Core 2 architecture, + you'll find the FORCE_CORE2 option in getarch.c; therefore you can + specify TARGET=CORE2 (drop the FORCE_ prefix) with make. + + $shell> make TARGET=CORE2 + + Also, if you want GotoBLAS2 to support multiple architectures in one library, + + $shell> make DYNAMIC_ARCH=1 + + All kernels will be included in the library, and the best one for the + running architecture will be selected dynamically at run time. + + +4. Enabling multi-threading + + The script will detect the number of cores and will enable the multi-threaded + library if the number of cores is more than two. If you still want to + create a single-threaded library, + + $shell> make USE_THREAD=0 + + Or, if you want to force a threaded library, + + $shell> make USE_THREAD=1 +
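+ (The options above can be combined on a single command line. The following is only an + illustrative sketch, assuming a hypothetical 8-core x86_64 machine, not a recommendation + for any particular system: + + $shell> make BINARY=64 DYNAMIC_ARCH=1 NUM_THREADS=8 # 64-bit, all kernels, at most 8 threads + + NUM_THREADS is the same limit that can be set in Makefile.rule.) +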
+ +5. Specifying the target OS + + The target architecture will be determined by the CC you use. If you + specify a cross compiler for MIPS, you can create a library for the + MIPS architecture. + + $shell> make CC=mips64el-linux-gcc TARGET=SICORTEX + + Or you can specify your favorite C compiler with an absolute path. + + $shell> make CC=/opt/intel/cc/32/10.0.026/bin/icc TARGET=BARCELONA + + The binary type (32-bit/64-bit) is determined by checking CC, so you + can also control the binary type with this option. + + $shell> make CC="pathcc -m32" + + In this case, a 32-bit library will be created. + + +6. Specifying the Fortran compiler + + If you need to support another Fortran compiler, you can specify it with the + FC option. + + $shell> make FC=gfortran + + +7. Other useful options + + You'll find other useful options in Makefile.rule.
diff --git a/03FAQ.txt b/03FAQ.txt new file mode 100644 index 0000000000..b6033fe530 --- /dev/null +++ b/03FAQ.txt @@ -0,0 +1,119 @@ + GotoBLAS2 FAQ + +1. General + +1.1 Q Where can I find useful papers about GotoBLAS2? + + A Check the following URL: + + http://www.cs.utexas.edu/users/flame/Publications/index.htm + + 11. Kazushige Goto and Robert A. van de Geijn, "Anatomy of + High-Performance Matrix Multiplication," ACM Transactions on + Mathematical Software, accepted. + + 15. Kazushige Goto and Robert van de Geijn, "High-Performance + Implementation of the Level-3 BLAS," ACM Transactions on + Mathematical Software, submitted. + + +1.2 Q Does GotoBLAS2 work with Hyper-Threading (SMT)? + + A Yes, it works. GotoBLAS2 detects Hyper-Threading and + avoids scheduling two BLAS threads on the same physical core. + + +1.3 Q When I type "make", the following error occurs. What's wrong? + + $shell> make + "./Makefile.rule", line 58: Missing dependency operator + "./Makefile.rule", line 61: Need an operator + ... + + A This error occurs because you didn't use GNU make. Some + systems install GNU make as "gmake", so that is worth trying. + + +1.4 Q Function "xxx" is slow. Why? + + A Generally GotoBLAS2 has many well-optimized functions, but it is + far from perfect. Level 1/2 performance in particular depends + on how you call BLAS. You should understand what happens between + your function and GotoBLAS2 by using the profile-enabled version + or hardware performance counters. Again, please don't regard + GotoBLAS2 as a black box. + + +1.5 Q I have a commercial C compiler and want to compile GotoBLAS2 with + it. Is it possible? + + A All functions that affect performance are written in assembly; + C code is used only as wrappers for the assembly functions or for + complicated functions. Many inline-assembly functions are also used, + and unfortunately most commercial compilers can't handle inline + assembly. Therefore you should use gcc. + + +1.6 Q I use an OpenMP compiler. How can I use GotoBLAS2 with it? + + A Please understand that OpenMP is a compromise approach to using + threads. If you want to use OpenMP-based code with GotoBLAS2, you + should enable "USE_OPENMP=1" in Makefile.rule. + + +1.7 Q Could you tell me how to use the profiled library? + + A You need to build and link your application with the -pg + option. After executing your application, "gmon.out" is + generated in your current directory. + + $shell> gprof ./your_application gmon.out + + Each sample counts as 0.01 seconds. + % cumulative self self total + time seconds seconds calls Ks/call Ks/call name + 89.86 975.02 975.02 79317 0.00 0.00 .dgemm_kernel + 4.19 1020.47 45.45 40 0.00 0.00 .dlaswp00N + 2.28 1045.16 24.69 2539 0.00 0.00 .dtrsm_kernel_LT + 1.19 1058.03 12.87 79317 0.00 0.00 .dgemm_otcopy + 1.05 1069.40 11.37 4999 0.00 0.00 .dgemm_oncopy + .... + + A profiled BLAS library is really useful for your research; + use it to find the bottleneck of your application and improve it. +
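+ (A sketch of the complete profiling workflow, assuming a C application in myapp.c and a + multi-threaded Nehalem build; the library file name and the extra link libraries are only + examples and will differ on your system: + + $shell> make prof # builds the profiled library, e.g. libgoto2_nehalemp-r1.13_p.a + $shell> gcc -pg -o myapp myapp.c ./libgoto2_nehalemp-r1.13_p.a -lpthread -lm + $shell> ./myapp # writes gmon.out + $shell> gprof ./myapp gmon.out + + "prof" is the target defined in the top-level Makefile; the "_p" suffix marks the + profiled build.) +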
1.8 Q Is the number of threads limited? + + A Basically there is no limit on the number of threads. You + can specify as many threads as you want, but a larger + number of threads consumes extra resources. I recommend + specifying the minimum number of threads you actually need. + + +2. Architecture-specific issues and implementation + +2.1 Q GotoBLAS2 seems to support any combination of OS and + architecture. Is that possible? + + A The combinations are limited to those that actually exist. For + example, the combination of OSX with SPARC is impossible. But it + would become possible with slight modifications if such a + combination ever appeared in front of us. + + +2.2 Q I have POWER architecture systems. Do I need extra work? + + A Although the POWER architecture defines a special instruction + (like CPUID) to detect the exact processor, it's privileged + and can't be executed by a user process. So you have to set + the architecture that you have manually in getarch.c. + + +2.3 Q I can't create a DLL on Cygwin (Error 53). What's wrong? + + A You have to make sure that lib.exe and mspdb80.dll are in the + Microsoft Visual Studio PATH. The easiest way is to use the + 'which' command. + + $shell> which lib.exe + /cygdrive/c/Program Files/Microsoft Visual Studio/VC98/bin/lib.exe
diff --git a/04Windows64bit.txt b/04Windows64bit.txt new file mode 100644 index 0000000000..c9b8fc341d --- /dev/null +++ b/04Windows64bit.txt @@ -0,0 +1,13 @@ + +Quick guide to building the library for 64-bit Windows. + +1. What you need + + a. Windows Server 2003 or later + b. Cygwin environment (make, gcc, g77, perl, sed, wget) + c. MinGW64 compiler + d. Microsoft Visual Studio (lib.exe and mspdb80.dll are required to create the DLL) + +2. Run ./quickbuild.win64 + +Good luck.
diff --git a/05LargePage b/05LargePage new file mode 100644 index 0000000000..fb7de6bba0 --- /dev/null +++ b/05LargePage @@ -0,0 +1,53 @@ + To enhance performance, I recommend enabling large pages on + your OS (root access is required). + + A) Linux + + x86 32-bit ... (number of cores) * 4 pages + x86 64-bit ... (number of cores) * 8 pages + POWER 32/64-bit ... (number of cores) * 1 page + + If you want to allocate 64 large pages, + + $shell> echo 0 > /proc/sys/vm/nr_hugepages # needs to be reset + $shell> echo 65 > /proc/sys/vm/nr_hugepages # add 1 extra page + $shell> echo 3355443200 > /proc/sys/kernel/shmmax # just a large number + $shell> echo 3355443200 > /proc/sys/kernel/shmall + + You may also need to add a few lines to the /etc/security/limits.conf file: + + * hard memlock unlimited + * soft memlock unlimited + + Then restart sshd (/etc/init.d/sshd restart). + + B) Solaris + + No setup is required. + + C) Windows (Windows Server 2003 or later, XP 64-bit) + + You have to grant the memory-lock right to your account: + + Control Panel -> Administrative Tools -> Local Security Policy -> + Local Policies -> User Rights Assignment -> Lock pages in memory + + D) AIX + + Ask your administrator. + + E) Tru64 UNIX + + Assign shared memory at boot time. + + F) Other architectures without a large-TLB enhancement + + If you have root permission, please install the device driver + located in driver/mapper. + + $shell> cd driver/mapper + $shell> make + $shell> insmod mapper.ko + $shell> ./device_setup + + Then enable DEVICEDRIVER_ALLOCATION = 1 in Makefile.rule. +
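+ (It is easy to verify that the setup took effect; these checks are only illustrative and + assume a Linux system: + + $shell> grep -i huge /proc/meminfo # for case A: HugePages_Total should match the value you wrote + $shell> lsmod | grep mapper # for case F: the mapper module should be listed +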
diff --git a/06WeirdPerformance b/06WeirdPerformance new file mode 100644 index 0000000000..8046267639 --- /dev/null +++ b/06WeirdPerformance @@ -0,0 +1,22 @@ + Weird Performance + +1. If you see a serious performance loss (extremely low performance), + you have probably created too many threads or processes. Basically GotoBLAS + assumes that the cores you make available are dedicated to the BLAS + computation. If even one small thread or process conflicts with the BLAS + threads, performance will suffer. + + The best solution is to reduce your number of threads, or to insert + some synchronization mechanism and suspend your threads until the BLAS + operation is finished. + + +2. A similar problem may happen under a virtual machine. If the hypervisor + allocates different cores on each scheduling decision, BLAS performance + will be poor. Because BLAS also makes heavy use of the caches, an + unexpected reschedule onto a different core may result in a heavy + performance loss. + + +Anyway, if you see any weird performance loss, it means your code or +algorithm is not optimal.
diff --git a/Makefile b/Makefile new file mode 100644 index 0000000000..c0cfc6b696 --- /dev/null +++ b/Makefile @@ -0,0 +1,230 @@ +TOPDIR = . +include ./Makefile.system + +BLASDIRS = interface driver/level2 driver/level3 driver/others + +ifndef DYNAMIC_ARCH +BLASDIRS += kernel +endif + +ifdef SANITY_CHECK +BLASDIRS += reference +endif + +SUBDIRS = $(BLASDIRS) lapack + +SUBDIRS_ALL = $(SUBDIRS) test ctest exports benchmark ../laswp ../bench + +.PHONY : all libs netlib test ctest shared +.NOTPARALLEL : all libs prof lapack-test + +all :: libs netlib tests shared + @echo + @echo " GotoBLAS build complete." + @echo + @echo " OS ... $(OSNAME) " + @echo " Architecture ... $(ARCH) " +ifndef BINARY64 + @echo " BINARY ... 32bit " +else + @echo " BINARY ... 64bit " +endif + @echo " C compiler ... $(C_COMPILER) (command line : $(CC))" + @echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))" +ifneq ($(OSNAME), AIX) + @echo -n " Library Name ... $(LIBNAME)" +else + @echo " Library Name ... $(LIBNAME)" +endif + +ifndef SMP + @echo " (Single threaded) " +else + @echo " (Multi threaded; Max num-threads is $(NUM_THREADS))" +endif + @echo + +shared : +ifeq ($(OSNAME), Linux) + $(MAKE) -C exports so + -ln -fs $(LIBSONAME) libgoto2.so +endif +ifeq ($(OSNAME), FreeBSD) + $(MAKE) -C exports so + -ln -fs $(LIBSONAME) libgoto2.so +endif +ifeq ($(OSNAME), NetBSD) + $(MAKE) -C exports so + -ln -fs $(LIBSONAME) libgoto2.so +endif +ifeq ($(OSNAME), Darwin) + $(MAKE) -C exports dyn + -ln -fs $(LIBDYNNAME) libgoto2.dylib +endif +ifeq ($(OSNAME), WINNT) + $(MAKE) -C exports dll +# -ln -fs $(LIBDLLNAME) libgoto2.dll +endif +ifeq ($(OSNAME), CYGWIN_NT) + $(MAKE) -C exports dll + -ln -fs $(LIBDLLNAME) libgoto2.dll +endif + +tests : +ifndef NOFORTRAN +ifndef TARGET +ifndef CROSS + touch $(LIBNAME) +ifndef NO_FBLAS + $(MAKE) -C test all +endif +ifndef NO_CBLAS + $(MAKE) -C ctest all +endif +endif +endif +endif + +libs : + -ln -fs $(LIBNAME) libgoto2.$(LIBSUFFIX) + for d in $(SUBDIRS) ; \ + do if test -d $$d; then \ + $(MAKE) -C $$d $(@F) || exit 1 ; \ + fi; \ + done +ifdef DYNAMIC_ARCH + $(MAKE) -C kernel commonlibs || exit 1 + for d in $(DYNAMIC_CORE) ; \ + do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\ + done +endif + +prof : prof_blas prof_lapack + +prof_blas : + ln -fs $(LIBNAME_P) libgoto2_p.$(LIBSUFFIX) + for d in $(SUBDIRS) ; \ + do if test -d $$d; then \ + $(MAKE) -C $$d prof || exit 1 ; \ + fi; \ + done +ifdef DYNAMIC_ARCH + $(MAKE) -C kernel commonprof || exit 1 +endif + +blas : + ln -fs $(LIBNAME) libgoto2.$(LIBSUFFIX) + for d in $(BLASDIRS) ; \ + do if test -d $$d; then \ + $(MAKE) -C $$d libs || exit 1 ; \ + fi; \ + done + +hpl : + ln -fs $(LIBNAME) libgoto2.$(LIBSUFFIX) + for d in $(BLASDIRS) ../laswp exports ; \ + do if test -d $$d; then \
+ $(MAKE) -C $$d $(@F) || exit 1 ; \ + fi; \ + done +ifdef DYNAMIC_ARCH + $(MAKE) -C kernel commonlibs || exit 1 + for d in $(DYNAMIC_CORE) ; \ + do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\ + done +endif + +hpl_p : + ln -fs $(LIBNAME_P) libgoto2_p.$(LIBSUFFIX) + for d in $(SUBDIRS) ../laswp exports ; \ + do if test -d $$d; then \ + $(MAKE) -C $$d $(@F) || exit 1 ; \ + fi; \ + done + +netlib : lapack-3.1.1 patch.for_lapack-3.1.1 lapack-3.1.1/make.inc +ifndef NOFORTRAN + -@$(MAKE) -C lapack-3.1.1 lapacklib +endif + +prof_lapack : lapack-3.1.1 lapack-3.1.1/make.inc + -@$(MAKE) -C lapack-3.1.1 lapack_prof + +lapack-3.1.1/make.inc : +ifndef NOFORTRAN + -@echo "FORTRAN = $(FC)" > lapack-3.1.1/make.inc + -@echo "OPTS = $(FFLAGS)" >> lapack-3.1.1/make.inc + -@echo "POPTS = $(FPFLAGS)" >> lapack-3.1.1/make.inc + -@echo "NOOPT = $(FFLAGS) -O0" >> lapack-3.1.1/make.inc + -@echo "PNOOPT = $(FPFLAGS) -O0" >> lapack-3.1.1/make.inc + -@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> lapack-3.1.1/make.inc + -@echo "ARCH = $(AR)" >> lapack-3.1.1/make.inc + -@echo "RANLIB = $(RANLIB)" >> lapack-3.1.1/make.inc + -@echo "LAPACKLIB = ../$(LIBNAME)" >> lapack-3.1.1/make.inc + -@echo "LAPACKLIB_P = ../$(LIBNAME_P)" >> lapack-3.1.1/make.inc + -@echo "SUFFIX = $(SUFFIX)" >> lapack-3.1.1/make.inc + -@echo "PSUFFIX = $(PSUFFIX)" >> lapack-3.1.1/make.inc +# -@echo "CEXTRALIB = $(CEXTRALIB)" >> lapack-3.1.1/make.inc + -@cat make.inc >> lapack-3.1.1/make.inc +endif + +lapack-3.1.1 : lapack-3.1.1.tgz +ifndef NOFORTRAN + @if test `$(MD5SUM) lapack-3.1.1.tgz | $(AWK) '{print $$1}'` = 00b21551a899bcfbaa7b8443e1faeef9; then \ + echo $(TAR) zxf $< ;\ + $(TAR) zxf $< && (cd lapack-3.1.1; $(PATCH) -p1 < ../patch.for_lapack-3.1.1) ;\ + else \ + echo " lapack-3.1.1.tgz check sum is wrong (Please use orignal)." ;\ + rm -rf lapack-3.1.1 ;\ + fi +endif + +lapack-3.1.1.tgz : +ifndef NOFORTRAN + -wget http://www.netlib.org/lapack/lapack-3.1.1.tgz +endif + +large.tgz : +ifndef NOFORTRAN + -wget http://www.netlib.org/lapack/timing/large.tgz +endif + +timing.tgz : +ifndef NOFORTRAN + -wget http://www.netlib.org/lapack/timing/timing.tgz +endif + +lapack-timing : lapack-3.1.1 large.tgz timing.tgz +ifndef NOFORTRAN + (cd lapack-3.1.1; $(TAR) zxf ../timing.tgz TIMING) + (cd lapack-3.1.1/TIMING; $(TAR) zxf ../../large.tgz ) + make -C lapack-3.1.1 tmglib + make -C lapack-3.1.1/TIMING +endif + + +lapack-test : + $(MAKE) -C lapack-3.1.1 tmglib + $(MAKE) -C lapack-3.1.1/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintsts xlintstz xlintstzc + @rm -f lapack-3.1.1/TESTING/*.out + $(MAKE) -j 1 -C lapack-3.1.1/TESTING + $(GREP) failed lapack-3.1.1/TESTING/*.out + +dummy : + +clean :: + @for d in $(SUBDIRS_ALL) ; \ + do if test -d $$d; then \ + $(MAKE) -C $$d $(@F) || exit 1 ; \ + fi; \ + done +ifdef DYNAMIC_ARCH + @$(MAKE) -C kernel clean +endif + @rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf libgoto2.$(LIBSUFFIX) libgoto2_p.$(LIBSUFFIX) *.lnk myconfig.h + @rm -f Makefile.conf config.h Makefile_kernel.conf config_kernel.h st* *.dylib + @if test -d lapack-3.1.1; then \ + echo deleting lapack-3.1.1; \ + rm -rf lapack-3.1.1 ;\ + fi + @echo Done. 
\ No newline at end of file diff --git a/Makefile.alpha b/Makefile.alpha new file mode 100644 index 0000000000..2305483d79 --- /dev/null +++ b/Makefile.alpha @@ -0,0 +1,57 @@ +CPP = $(CC) -E +RANLIB = ranlib + +ifeq ($(LIBSUBARCH), EV4) +LIBNAME = $(LIBPREFIX)_ev4.a +LIBNAME_P = $(LIBPREFIX)_ev4_p.a +endif + +ifeq ($(LIBSUBARCH), EV5) +LIBNAME = $(LIBPREFIX)_ev5.a +LIBNAME_P = $(LIBPREFIX)_ev5_p.a +endif + +ifeq ($(LIBSUBARCH), EV6) +LIBNAME = $(LIBPREFIX)_ev6.a +LIBNAME_P = $(LIBPREFIX)_ev6_p.a +endif + +ifneq ($(COMPILER), NATIVE) +# GCC User +ifeq ($(LIBSUBARCH), EV4) +OPTION += -DEV4 -mcpu=ev4 +endif +ifeq ($(LIBSUBARCH), EV5) +OPTION += -DEV5 -mcpu=ev5 +endif +ifeq ($(LIBSUBARCH), EV6) +OPTION += -DEV6 -mcpu=ev6 +endif +else +# Compaq Compiler User +ifeq ($(LIBSUBARCH), EV4) +OPTION += -DEV4 -tune ev4 -arch ev4 +endif +ifeq ($(LIBSUBARCH), EV5) +OPTION += -DEV5 -tune ev5 -arch ev5 +endif +ifeq ($(LIBSUBARCH), EV6) +OPTION += -DEV6 -tune ev6 -arch ev6 +endif +endif + +ifeq ($(F_COMPILER), GFORTRAN) +FCOMMON_OPT += -mieee +endif + +ifeq ($(F_COMPILER), G77) +FCOMMON_OPT += -mieee +endif + +ifndef SMP +LIBCXML = -lcxml -lots -lm +LIBATLAS = -L/usr/lib/atlas3.7.8 -lf77blas -latlas -lm +else +LIBCXML = -lcxmlp -lots -lm +LIBATLAS = -L/usr/lib/atlas3.7.8p -llapack -lptcblas -lptf77blas -latlas -lpthread -lm +endif diff --git a/Makefile.generic b/Makefile.generic new file mode 100644 index 0000000000..770aaf850d --- /dev/null +++ b/Makefile.generic @@ -0,0 +1,6 @@ +COPT = -Wall -O2 # -DGEMMTEST +ifdef BINARY64 +else +# LDFLAGS = -m elf32ppc +LDFLAGS = -m elf_i386 +endif diff --git a/Makefile.getarch b/Makefile.getarch new file mode 100644 index 0000000000..dadfb5b1bc --- /dev/null +++ b/Makefile.getarch @@ -0,0 +1,39 @@ +export BINARY +export USE_OPENMP + +ifdef TARGET_CORE +TARGET_MAKE = Makefile_kernel.conf +TARGET_CONF = config_kernel.h +else +TARGET_MAKE = Makefile.conf +TARGET_CONF = config.h +endif + +# CPUIDEMU = ../../cpuid/table.o + +ifdef CPUIDEMU +EXFLAGS = -DCPUIDEMU -DVENDOR=99 +endif + +all: getarch_2nd + ./getarch_2nd 0 >> $(TARGET_MAKE) + ./getarch_2nd 1 >> $(TARGET_CONF) + +config.h : c_check f_check getarch + perl ./c_check $(TARGET_MAKE) $(TARGET_CONF) $(CC) + perl ./f_check $(TARGET_MAKE) $(TARGET_CONF) $(FC) + ./getarch 0 >> $(TARGET_MAKE) + ./getarch 1 >> $(TARGET_CONF) + + +getarch : getarch.c cpuid.S dummy $(CPUIDEMU) + $(HOSTCC) $(CFLAGS) $(EXFLAGS) -o $(@F) getarch.c cpuid.S $(CPUIDEMU) + +getarch_2nd : getarch_2nd.c config.h dummy +ifndef TARGET_CORE + $(HOSTCC) -I. $(CFLAGS) -o $(@F) getarch_2nd.c +else + $(HOSTCC) -I. 
$(CFLAGS) -DBUILD_KERNEL -o $(@F) getarch_2nd.c +endif + +dummy: diff --git a/Makefile.ia64 b/Makefile.ia64 new file mode 100644 index 0000000000..7ffcd1dbf0 --- /dev/null +++ b/Makefile.ia64 @@ -0,0 +1,22 @@ +CCOMMON_COPT += # -DUSE64BITINT # -DGEMMTEST + +# CCOMMON_OPT += -DPARAMTEST +FLAMEPATH = $(HOME)/flame/lib/ia64 + +ifndef SMP +LIBMKL = -L$(MKLPATH)/64 -Wl,-rpath,$(MKLPATH)/64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lguide -lpthread -lm +else +LIBMKL = -L$(MKLPATH)/64 -Wl,-rpath,$(MKLPATH)/64 -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -lguide -lpthread -lm +endif + +LIBFLAME = -L$(FLAMEPATH) -llapack2flame -lflame $(TOPDIR)/$(LIBNAME) -lgfortran -lpthread -lm + +LIBMLIB = ../../level1/others/libmisc.a -L/opt/intel/fc/ia64/9.1.040/lib -L/opt/mlib/lib \ + -llapack -lguide -lifcore -lm -lpthread +LIBSCSL = -L/opt/scsl/1.4.1.0/lib -Wl,-rpath,/opt/scsl/1.4.1.0/lib -lscs + +ifndef SMP +LIBATLAS = -L/usr/lib/atlas3.6.0 -lf77blas -latlas -lm +else +LIBATLAS = -L$(HOME)/misc/lib -L/usr/lib/atlas3.6.0p -llapack -lptcblas -lptf77blas -latlas -lpthread -lm +endif diff --git a/Makefile.mips64 b/Makefile.mips64 new file mode 100644 index 0000000000..05ea9c679d --- /dev/null +++ b/Makefile.mips64 @@ -0,0 +1,3 @@ +ifdef BINARY64 +else +endif diff --git a/Makefile.power b/Makefile.power new file mode 100644 index 0000000000..35eb2cb7b1 --- /dev/null +++ b/Makefile.power @@ -0,0 +1,93 @@ +# CCOMMON_OPT += -DALLOC_SHM + +FLAMEPATH = $(HOME)/flame/lib + +#ifeq ($(CORE), CELL) +#CELL_SDK_ROOT = /opt/IBM/cell-sdk-1.1/sysroot/usr +#SPU_CC = spu-gcc +#EXTRALIB += -lspe +#endif + +ifeq ($(OSNAME), Linux) +ifdef BINARY64 +# COMPILER_PREFIX = powerpc64-linux- +else +# COMPILER_PREFIX = powerpc-linux- +endif +endif + +ifdef BINARY64 +ifeq ($(OSNAME), Linux) +LDFLAGS = -m elf64ppc +endif + +ifeq ($(OSNAME), Darwin) +LDFLAGS = -arch ppc64 +endif + +ifeq ($(OSNAME), AIX) +CCOMMON_OPT += -mpowerpc64 -maix64 +ifeq ($(COMPILER_F77), g77) +FCOMMON_OPT += -mpowerpc64 -maix64 +endif +ifeq ($(COMPILER_F77), xlf) +FCOMMON_OPT += -q64 +endif +ARFLAGS = -X 64 +LDFLAGS = -b64 +ASFLAGS = -a64 +endif +else +ifeq ($(OSNAME), Linux) +LDFLAGS = -m elf32ppc +endif +ifeq ($(OSNAME), AIX) +CCOMMON_OPT += -Wa,-a32 +ARFLAGS = -X 32 +LDFLAGS = -b32 +ASFLAGS = -a32 +endif +endif + +# CCOMMON_OPT += -maltivec -mabi=altivec + +LIBFLAME = -L$(FLAMEPATH) -llapack2flame -lflame-lapack -lflame-base $(LIBS) + +ifeq ($(OSNAME), Darwin) +CCOMMON_OPT += -force_cpusubtype_ALL +endif + + +ifndef BINARY64 +ifeq ($(OSNAME), Linux) +ESSLPATH = -L/opt/ibmcmp/lib -L/opt/ibmcmp/xlf/11.1/lib -Wl,-rpath,/opt/ibmcmp/lib -Wl,-rpath,/opt/ibmcmp/xlf/11.1/lib -lxlf90_r -lxlomp_ser -lxlfmath -lxl -lpthread +else +ESSLPATH = -lxlf90_r +endif + + +LIBVECLIB = -framework VecLib +ifndef SMP +LIBATLAS = -L/usr/lib/atlas3.7.11 -lf77blas -latlas -lg2c -lm +LIBESSL = -lessl $(ESSLPATH) ../../level1/others/libmisc.a -lm +else +LIBATLAS = -L/usr/lib/atlas3.7.11p -lptf77blas -latlas -lm -lpthread +LIBESSL = -lesslsmp $(ESSLPATH) ../../level1/others/libmisc.a -lm +endif +else +ifeq ($(OSNAME), Linux) +ESSLPATH = -L/opt/ibmcmp/lib64 -Wl,-rpath,/opt/ibmcmp/lib64 -L/opt/ibmcmp/xlf/11.1/lib64 -Wl,-rpath,/opt/ibmcmp/xlf/11.1/lib64 -lxlf90_r -lxlomp_ser +else +ESSLPATH = -lxlf90_r +endif + +LIBVECLIB = /System/Library/Frameworks/vecLib.framework/Versions/Current/vecLib + +ifndef SMP +LIBATLAS = -L/usr/lib64/atlas3.7.11 -lf77blas -latlas -lg2c -lm +LIBESSL = -lessl $(ESSLPATH) -lm +else +LIBATLAS = -L/usr/lib64/atlas3.7.11p -lptf77blas -latlas -lm -lpthread +LIBESSL = 
-lesslsmp $(ESSLPATH) -lxlsmp -lm +endif +endif
diff --git a/Makefile.rule b/Makefile.rule new file mode 100644 index 0000000000..8be5515588 --- /dev/null +++ b/Makefile.rule @@ -0,0 +1,95 @@ +# +# Beginning of user configuration +# + +# This library's version +VERSION = 1.13 + +# You can specify the target architecture; otherwise it's +# automatically detected. +# TARGET = PENRYN + +# If you want to support multiple architectures in one binary +# DYNAMIC_ARCH = 1 + +# C compiler, including the binary type (32-bit / 64-bit). Default is gcc. +# Don't use the Intel compiler or PGI; they won't generate the code I expect. +# CC = gcc + +# Fortran compiler. Default is g77. +# FC = gfortran + +# You can even specify a cross compiler +# CC = x86_64-w64-mingw32-gcc +# FC = x86_64-w64-mingw32-gfortran + +# If you need a 32-bit binary, define BINARY=32; otherwise define BINARY=64 +# BINARY=64 + +# About threaded BLAS. It will be detected automatically if you don't +# specify it. +# To force a single-threaded build, specify USE_THREAD = 0 +# To force a multi-threaded build, specify USE_THREAD = 1 +# USE_THREAD = 0 + +# If you're going to use this library with OpenMP, please uncomment the line below. +# USE_OPENMP = 1 + +# You can define the maximum number of threads. Basically it should be +# less than the actual number of cores. If you don't specify one, it's +# automatically detected by the script. +# NUM_THREADS = 24 + +# If you don't need the CBLAS interface, please uncomment the line below. +# NO_CBLAS = 1 + +# If you want to use the legacy threaded Level 3 implementation. +# USE_SIMPLE_THREADED_LEVEL3 = 1 + +# If you want BLAS to use the whole 64-bit integer range for indexing. Not every +# Fortran compiler supports this. It's safe to keep it commented out if you +# are not sure (equivalent to the "-i8" option). +# INTERFACE64 = 1 + +# Unfortunately most kernels won't give us a high-quality buffer. +# BLAS tries to find the best memory region before entering the main function, +# but this takes time. If you don't like it, you can disable the warm-up. +# NO_WARMUP = 1 + +# If you want to disable CPU/memory affinity on Linux. +# NO_AFFINITY = 1 + +# If you would like a detailed performance report from GotoBLAS. +# FUNCTION_PROFILE = 1 + +# Support for IEEE quad precision (it's *real* REAL*16) (under testing) +# QUAD_PRECISION = 1 + +# Threads keep running for a while after a BLAS operation finishes, +# to reduce thread activation/deactivation overhead. You can tune the +# timeout to improve performance. This number should be from 4 to 30 +# and corresponds to (1 << n) cycles. For example, if you set it to 26, +# a thread will keep running for (1 << 26) cycles (about 25ms on a 3.0GHz +# system). You can also control this number with GOTO_THREAD_TIMEOUT +# CCOMMON_OPT += -DTHREAD_TIMEOUT=26 + +# Use a special device driver for mapping physically contiguous memory +# into user space. If bigphysarea is enabled, it will be used. +# DEVICEDRIVER_ALLOCATION = 1 + +# If you need to synchronize the FP CSR between threads (x86/x86_64 only). +# CONSISTENT_FPCSR = 1 + +# If you need a sanity check that compares against the reference BLAS. It'll be +# very slow (not implemented yet). +# SANITY_CHECK = 1 + +# Common optimization flags; -O2 is enough.
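+# (As one illustrative, hypothetical customization for a quad-core Nehalem box, the +# options documented above might be uncommented as follows; the values are placeholders +# and anything left commented out is auto-detected: +# TARGET = NEHALEM +# BINARY = 64 +# NUM_THREADS = 4 +# USE_OPENMP = 1 +# )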
+COMMON_OPT += -O2 + +# Profiling flags +COMMON_PROF = -pg + +# +# End of user configuration +# diff --git a/Makefile.sparc b/Makefile.sparc new file mode 100644 index 0000000000..c2b878e73d --- /dev/null +++ b/Makefile.sparc @@ -0,0 +1,41 @@ +CPP = $(CC) -E +RANLIB = ranlib + +ifdef BINARY64 + +CCOMMON_OPT += -mcpu=v9 -m64 +ifeq ($(COMPILER_F77), g77) +FCOMMON_OPT += -mcpu=v9 -m64 +endif +ifeq ($(COMPILER_F77), f90) +FCOMMON_OPT += -xarch=v9 +endif +LDFLAGS = -64 +else + +CCOMMON_OPT += -mcpu=v9 + +ifeq ($(COMPILER_F77), g77) +FCOMMON_OPT += -mcpu=v9 +endif +ifeq ($(COMPILER_F77), f90) +FCOMMON_OPT += -xarch=v8plusb +endif + +endif + +LIBNAME = $(LIBPREFIX).a + +ifndef SMP +LIBCXML = -L/opt/SUNWspro/lib/v9 +LIBATLAS = -L$(HOME)/misc/lib -lf77blas -latlas -lm +else +LIBCXML = -lcxmlp -lots -lm +endif +ifdef BINARY64 +LIBSUNPERF = -L/opt/SUNWspro/lib/v9 -L/opt/SUNWspro/prod/lib/v9 \ + -Wl,-R,/opt/SUNWspro/lib/v9 -lsunperf -lompstubs -lfui -lfsu -lsunmath +else +LIBSUNPERF = -L/opt/SUNWspro/lib -L/opt/SUNWspro/prod/lib \ + -Wl,-R,/opt/SUNWspro/lib -lsunperf -lompstubs -lfui -lfsu -lsunmath +endif \ No newline at end of file diff --git a/Makefile.system b/Makefile.system new file mode 100644 index 0000000000..cece53541e --- /dev/null +++ b/Makefile.system @@ -0,0 +1,753 @@ +# +# Include user definition +# + +# TO suppress recursive includes +INCLUDED = 1 + +ifndef TOPDIR +TOPDIR = . +endif + +# Default C compiler +CC = gcc + +ifndef MAKEFILE_RULE +include $(TOPDIR)/Makefile.rule +else +include $(TOPDIR)/$(MAKEFILE_RULE) +endif + +# +# Beginning of system configuration +# + +ifndef HOSTCC +HOSTCC = $(CC) +endif + +ifdef TARGET +GETARCH_FLAGS += -DFORCE_$(TARGET) +endif + +# This operation is expensive, so execution should be once. +ifndef GOTOBLAS_MAKEFILE +export GOTOBLAS_MAKEFILE = 1 + +# Generating Makefile.conf and config.h +DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.getarch CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS=$(GETARCH_FLAGS) BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) all) + +ifndef TARGET_CORE +include $(TOPDIR)/Makefile.conf +else +include $(TOPDIR)/Makefile_kernel.conf +endif + +endif + +ifndef NUM_THREADS +NUM_THREADS = $(NUM_CORES) +endif + +ifeq ($(NUM_THREADS), 1) +override USE_THREAD = 0 +endif + +ifdef USE_THREAD +ifeq ($(USE_THREAD), 0) +SMP = +else +SMP = 1 +endif +else +ifeq ($(NUM_THREAD), 1) +SMP = +else +SMP = 1 +endif +endif + +ifndef NEED_PIC +NEED_PIC = 1 +endif + +ARFLAGS = +CPP = $(COMPILER) -E +AR = $(CROSS_SUFFIX)ar +AS = $(CROSS_SUFFIX)as +LD = $(CROSS_SUFFIX)ld +RANLIB = $(CROSS_SUFFIX)ranlib +NM = $(CROSS_SUFFIX)nm +DLLWRAP = $(CROSS_SUFFIX)dllwrap + +# +# OS dependent settings +# + +ifeq ($(OSNAME), Darwin) +EXTRALIB += -lSystemStubs +export MACOSX_DEPLOYMENT_TARGET=10.2 +endif + +ifeq ($(OSNAME), Linux) +EXTRALIB += -lm +endif + +ifeq ($(OSNAME), AIX) +EXTRALIB += -lm +endif + +ifeq ($(OSNAME), WINNT) +NEED_PIC = 0 +NO_EXPRECISION = 1 + +EXTRALIB += -defaultlib:advapi32 + +SUFFIX = obj +PSUFFIX = pobj +LIBSUFFIX = lib +endif + +ifeq ($(OSNAME), Interix) +NEED_PIC = 0 +NO_EXPRECISION = 1 + +INTERIX_TOOL_DIR = /opt/gcc.3.3/i586-pc-interix3/bin +endif + +ifeq ($(OSNAME), CYGWIN_NT) +NEED_PIC = 0 +NO_EXPRECISION = 1 +endif + +ifneq ($(OSNAME), WINNT) +ifneq ($(OSNAME), CYGWIN_NT) +ifneq ($(OSNAME), Interix) +ifdef SMP +EXTRALIB += -lpthread +endif +endif +endif +endif + +ifdef QUAD_PRECISION +CCOMMON_OPT += -DQUAD_PRECISION +NO_EXPRECISION = 1 +endif + +ifneq ($(ARCH), x86) +ifneq ($(ARCH), x86_64) +NO_EXPRECISION = 1 
+endif +endif + +ifdef SANITY_CHECK +CCOMMON_OPT += -DSANITY_CHECK -DREFNAME=$(*F)f$(BU) +endif + +# +# Architecture dependent settings +# + +ifeq ($(ARCH), x86) +ifndef BINARY +NO_BINARY_MODE = 1 +endif +ifndef NO_EXPRECISION +ifeq ($(F_COMPILER), GFORTRAN) +ifeq ($(C_COMPILER), GCC) +EXPRECISION = 1 +CCOMMON_OPT += -DEXPRECISION -m128bit-long-double +FCOMMON_OPT += -m128bit-long-double +endif +endif +endif +endif + +ifeq ($(ARCH), x86_64) +ifndef NO_EXPRECISION +ifeq ($(F_COMPILER), GFORTRAN) +ifeq ($(C_COMPILER), GCC) +EXPRECISION = 1 +CCOMMON_OPT += -DEXPRECISION -m128bit-long-double +FCOMMON_OPT += -m128bit-long-double +endif +endif +endif +endif + +ifeq ($(C_COMPILER), INTEL) +CCOMMON_OPT += -wd981 +endif + +ifdef USE_OPENMP +ifeq ($(C_COMPILER), GCC) +CCOMMON_OPT += -fopenmp +endif + +ifeq ($(C_COMPILER), INTEL) +CCOMMON_OPT += -openmp +endif + +ifeq ($(C_COMPILER), PGI) +CCOMMON_OPT += -mp +endif + +ifeq ($(C_COMPILER), OPEN64) +CCOMMON_OPT += -mp +CEXTRALIB += -lstdc++ +endif + +ifeq ($(C_COMPILER), PATHSCALE) +CCOMMON_OPT += -mp +endif +endif + + +ifdef DYNAMIC_ARCH +ifeq ($(ARCH), x86) +DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ + CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA ATOM NANO +endif + +ifeq ($(ARCH), x86_64) +DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA ATOM NANO +endif + +ifndef DYNAMIC_CORE +DYNAMIC_ARCH = +endif +endif + +ifeq ($(ARCH), ia64) +NO_BINARY_MODE = 1 +BINARY_DEFINED = 1 + +ifeq ($(F_COMPILER), GFORTRAN) +ifeq ($(C_COMPILER), GCC) +# EXPRECISION = 1 +# CCOMMON_OPT += -DEXPRECISION +endif +endif +endif + +ifeq ($(ARCH), mips64) +NO_BINARY_MODE = 1 +endif + +ifeq ($(ARCH), alpha) +NO_BINARY_MODE = 1 +BINARY_DEFINED = 1 +endif + +# +# C Compiler dependent settings +# + +ifeq ($(C_COMPILER), GCC) +CCOMMON_OPT += -Wall +COMMON_PROF += -fno-inline +NO_UNINITIALIZED_WARN = -Wno-uninitialized + +ifdef NO_BINARY_MODE + +ifeq ($(ARCH), mips64) +ifdef BINARY64 +CCOMMON_OPT += -mabi=64 +else +CCOMMON_OPT += -mabi=n32 +endif +BINARY_DEFINED = 1 +endif + +ifeq ($(OSNAME), AIX) +BINARY_DEFINED = 1 +endif + +endif + +ifndef BINARY_DEFINED +ifdef BINARY64 +CCOMMON_OPT += -m64 +else +CCOMMON_OPT += -m32 +endif +endif + +endif + +ifeq ($(C_COMPILER), PGI) +ifdef BINARY64 +CCOMMON_OPT += -tp p7-64 +else +CCOMMON_OPT += -tp p7 +endif +endif + +ifeq ($(C_COMPILER), PATHSCALE) +ifdef BINARY64 +CCOMMON_OPT += -m64 +else +CCOMMON_OPT += -m32 +endif +endif + +# +# Fortran Compiler dependent settings +# + +ifeq ($(F_COMPILER), G77) +CCOMMON_OPT += -DF_INTERFACE_G77 +FCOMMON_OPT += -Wall +ifndef NO_BINARY_MODE +ifdef BINARY64 +FCOMMON_OPT += -m64 +else +FCOMMON_OPT += -m32 +endif +endif +endif + +ifeq ($(F_COMPILER), G95) +CCOMMON_OPT += -DF_INTERFACE_G95 +FCOMMON_OPT += -Wall +ifndef NO_BINARY_MODE +ifdef BINARY64 +FCOMMON_OPT += -m64 +else +FCOMMON_OPT += -m32 +endif +endif +endif + +ifeq ($(F_COMPILER), GFORTRAN) +CCOMMON_OPT += -DF_INTERFACE_GFORT +FCOMMON_OPT += -Wall +ifdef NO_BINARY_MODE +ifeq ($(ARCH), mips64) +ifdef BINARY64 +FCOMMON_OPT += -mabi=64 +else +FCOMMON_OPT += -mabi=n32 +endif +endif +else +ifdef BINARY64 +FCOMMON_OPT += -m64 +ifdef INTERFACE64 +FCOMMON_OPT += -fdefault-integer-8 +endif +else +FCOMMON_OPT += -m32 +endif +endif +ifdef USE_OPENMP +FCOMMON_OPT += -fopenmp +endif +endif + +ifeq ($(F_COMPILER), INTEL) +CCOMMON_OPT += -DF_INTERFACE_INTEL +ifdef INTERFACE64 +FCOMMON_OPT += -i8 +endif +ifdef USE_OPENMP +FCOMMON_OPT += -openmp +endif +endif + +ifeq ($(F_COMPILER), FUJITSU) 
+CCOMMON_OPT += -DF_INTERFACE_FUJITSU +ifdef USE_OPENMP +FCOMMON_OPT += -openmp +endif +endif + +ifeq ($(F_COMPILER), IBM) +CCOMMON_OPT += -DF_INTERFACE_IBM +# FCOMMON_OPT += -qarch=440 +ifdef BINARY64 +FCOMMON_OPT += -q64 +ifdef INTERFACE64 +FCOMMON_OPT += -qintsize=8 +endif +else +FCOMMON_OPT += -q32 +endif +ifdef USE_OPENMP +FCOMMON_OPT += -openmp +endif +endif + +ifeq ($(F_COMPILER), PGI) +CCOMMON_OPT += -DF_INTERFACE_PGI +COMMON_PROF += -DPGICOMPILER +ifdef BINARY64 +ifdef INTERFACE64 +FCOMMON_OPT += -i8 +endif +FCOMMON_OPT += -tp p7-64 +else +FCOMMON_OPT += -tp p7 +endif +ifdef USE_OPENMP +FCOMMON_OPT += -mp +endif +endif + +ifeq ($(F_COMPILER), PATHSCALE) +CCOMMON_OPT += -DF_INTERFACE_PATHSCALE +ifdef BINARY64 +ifdef INTERFACE64 +FCOMMON_OPT += -i8 +endif +endif + +ifneq ($(ARCH), mips64) +ifndef BINARY64 +FCOMMON_OPT += -m32 +else +FCOMMON_OPT += -m64 +endif +else +ifdef BINARY64 +FCOMMON_OPT += -mabi=64 +else +FCOMMON_OPT += -mabi=n32 +endif +endif + +ifdef USE_OPENMP +FCOMMON_OPT += -mp +endif +endif + +ifeq ($(F_COMPILER), OPEN64) +CCOMMON_OPT += -DF_INTERFACE_OPEN64 +ifdef BINARY64 +ifdef INTERFACE64 +FCOMMON_OPT += -i8 +endif +endif +ifndef BINARY64 +FCOMMON_OPT += -m32 +else +FCOMMON_OPT += -m64 +endif + +ifdef USE_OPENMP +FEXTRALIB += -lstdc++ +FCOMMON_OPT += -mp +endif +endif + +ifeq ($(C_COMPILER), OPEN64) +ifndef BINARY64 +CCOMMON_OPT += -m32 +else +CCOMMON_OPT += -m64 +endif +endif + +ifeq ($(C_COMPILER), SUN) +CCOMMON_OPT += -w +ifeq ($(ARCH), x86) +CCOMMON_OPT += -m32 +else +FCOMMON_OPT += -m64 +endif +endif + +ifeq ($(F_COMPILER), SUN) +CCOMMON_OPT += -DF_INTERFACE_SUN +ifeq ($(ARCH), x86) +FCOMMON_OPT += -m32 +else +FCOMMON_OPT += -m64 +endif +ifdef USE_OPENMP +FCOMMON_OPT += -xopenmp=parallel +endif +endif + +ifeq ($(F_COMPILER), COMPAQ) +CCOMMON_OPT += -DF_INTERFACE_COMPAQ +ifdef USE_OPENMP +FCOMMON_OPT += -openmp +endif +endif + +ifdef BINARY64 +ifdef INTERFACE64 +CCOMMON_OPT += -DUSE64BITINT +endif +endif + +ifeq ($(NEED_PIC), 1) +ifeq ($(C_COMPILER), IBM) +CCOMMON_OPT += -qpic=large +else +CCOMMON_OPT += -fPIC +endif +ifeq ($(F_COMPILER), SUN) +FCOMMON_OPT += -pic +else +FCOMMON_OPT += -fPIC +endif +endif + +ifeq ($(DYNAMIC_ARCH), 1) +CCOMMON_OPT += -DDYNAMIC_ARCH +endif + +ifdef SMP +CCOMMON_OPT += -DSMP_SERVER + +ifeq ($(ARCH), mips64) +USE_SIMPLE_THREADED_LEVEL3 = 1 +endif + +ifeq ($(USE_OPENMP), 1) +# USE_SIMPLE_THREADED_LEVEL3 = 1 +# NO_AFFINITY = 1 +CCOMMON_OPT += -DUSE_OPENMP +endif + +endif + +ifeq ($(NO_WARMUP), 1) +CCOMMON_OPT += -DNO_WARMUP +endif + +ifeq ($(CONSISTENT_FPCSR), 1) +CCOMMON_OPT += -DCONSISTENT_FPCSR +endif + +# Only for development +# CCOMMON_OPT += -DPARAMTEST +# CCOMMON_OPT += -DPREFETCHTEST +# CCOMMON_OPT += -DNO_SWITCHING +# USE_PAPI = 1 + +ifdef USE_PAPI +CCOMMON_OPT += -DUSE_PAPI +EXTRALIB += -lpapi -lperfctr +endif + +ifdef DYNAMIC_THREADS +CCOMMON_OPT += -DDYNAMIC_THREADS +endif + +CCOMMON_OPT += -DMAX_CPU_NUMBER=$(NUM_THREADS) + +ifdef USE_SIMPLE_THREADED_LEVEL3 +CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3 +endif + +LIBPREFIX = libgoto2 + +KERNELDIR = $(TOPDIR)/kernel/$(ARCH) + +include $(TOPDIR)/Makefile.$(ARCH) + +CCOMMON_OPT += -DASMNAME=$(FU)$(*F) -DASMFNAME=$(FU)$(*F)$(BU) -DNAME=$(*F)$(BU) -DCNAME=$(*F) -DCHAR_NAME=\"$(*F)$(BU)\" -DCHAR_CNAME=\"$(*F)\" + +ifeq ($(CORE), PPC440) +CCOMMON_OPT += -DALLOC_QALLOC +endif + +ifeq ($(CORE), PPC440FP2) +STATIC_ALLOCATION = 1 +endif + +ifneq ($(OSNAME), Linux) +NO_AFFINITY = 1 +endif + +ifneq ($(ARCH), x86_64) +ifneq ($(ARCH), x86) +NO_AFFINITY = 1 +endif +endif + +ifdef 
NO_AFFINITY +CCOMMON_OPT += -DNO_AFFINITY +endif + +ifdef FUNCTION_PROFILE +CCOMMON_OPT += -DFUNCTION_PROFILE +endif + +ifdef HUGETLB_ALLOCATION +CCOMMON_OPT += -DALLOC_HUGETLB +endif + +ifdef HUGETLBFILE_ALLOCATION +CCOMMON_OPT += -DALLOC_HUGETLBFILE -DHUGETLB_FILE_NAME=$(HUGETLBFILE_ALLOCATION) +endif + +ifdef STATIC_ALLOCATION +CCOMMON_OPT += -DALLOC_STATIC +endif + +ifdef DEVICEDRIVER_ALLOCATION +CCOMMON_OPT += -DALLOC_DEVICEDRIVER -DDEVICEDRIVER_NAME=\"/dev/mapper\" +endif + +ifdef MIXED_MEMORY_ALLOCATION +CCOMMON_OPT += -DMIXED_MEMORY_ALLOCATION +endif + +ifeq ($(OSNAME), SunOS) +TAR = gtar +PATCH = gpatch +GREP = ggrep +else +TAR = tar +PATCH = patch +GREP = grep +endif + +MD5SUM = md5sum +AWK = awk + +REVISION = -r$(VERSION) + +CFLAGS = $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) +PFLAGS = $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF) + +FFLAGS = $(COMMON_OPT) $(FCOMMON_OPT) +FPFLAGS = $(COMMON_OPT) $(FCOMMON_OPT) $(COMMON_PROF) + +ifndef SUFFIX +SUFFIX = o +endif + +ifndef PSUFFIX +PSUFFIX = po +endif + +ifndef LIBSUFFIX +LIBSUFFIX = a +endif + +ifndef DYNAMIC_ARCH +ifndef SMP +LIBNAME = $(LIBPREFIX)_$(LIBCORE)$(REVISION).$(LIBSUFFIX) +LIBNAME_P = $(LIBPREFIX)_$(LIBCORE)$(REVISION)_p.$(LIBSUFFIX) +else +LIBNAME = $(LIBPREFIX)_$(LIBCORE)p$(REVISION).$(LIBSUFFIX) +LIBNAME_P = $(LIBPREFIX)_$(LIBCORE)p$(REVISION)_p.$(LIBSUFFIX) +endif +else +ifndef SMP +LIBNAME = $(LIBPREFIX)$(REVISION).$(LIBSUFFIX) +LIBNAME_P = $(LIBPREFIX)$(REVISION)_p.$(LIBSUFFIX) +else +LIBNAME = $(LIBPREFIX)p$(REVISION).$(LIBSUFFIX) +LIBNAME_P = $(LIBPREFIX)p$(REVISION)_p.$(LIBSUFFIX) +endif +endif + + +LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.so) +LIBDLLNAME = $(LIBNAME:.$(LIBSUFFIX)=.dll) +LIBDYNNAME = $(LIBNAME:.$(LIBSUFFIX)=.dylib) +LIBDEFNAME = $(LIBNAME:.$(LIBSUFFIX)=.def) +LIBEXPNAME = $(LIBNAME:.$(LIBSUFFIX)=.exp) +LIBZIPNAME = $(LIBNAME:.$(LIBSUFFIX)=.zip) + +LIBS = $(TOPDIR)/$(LIBNAME) +LIBS_P = $(TOPDIR)/$(LIBNAME_P) + +export OSNAME +export ARCH +export CORE +export LIBCORE +export PGCPATH +export CONFIG +export CC +export FC +export BU +export FU +export USE_THREAD +export NUM_THREADS +export NUM_CORES +export SMP +export MAKEFILE_RULE +export NEED_PIC +export BINARY +export BINARY32 +export BINARY64 +export F_COMPILER +export C_COMPILER +export USE_OPENMP +export CROSS +export CROSS_SUFFIX +export NOFORTRAN +export EXTRALIB +export CEXTRALIB +export FEXTRALIB +export HAVE_SSE +export HAVE_SSE2 +export HAVE_SSE3 +export HAVE_SSSE3 +export HAVE_SSE4_1 +export HAVE_SSE4_2 +export HAVE_SSE4A +export HAVE_SSE5 +export KERNELDIR +export FUNCTION_PROFILE +export TARGET_CORE + +export SGEMM_UNROLL_M +export SGEMM_UNROLL_N +export DGEMM_UNROLL_M +export DGEMM_UNROLL_N +export QGEMM_UNROLL_M +export QGEMM_UNROLL_N +export CGEMM_UNROLL_M +export CGEMM_UNROLL_N +export ZGEMM_UNROLL_M +export ZGEMM_UNROLL_N +export XGEMM_UNROLL_M +export XGEMM_UNROLL_N + +ifdef USE_CUDA +export CUDADIR +export CUCC +export CUFLAGS +export CULIB +endif + +.SUFFIXES: .$(PSUFFIX) .$(SUFFIX) .f + +.f.$(SUFFIX): + $(FC) $(FFLAGS) -c $< -o $(@F) + +.f.$(PSUFFIX): + $(FC) $(FPFLAGS) -pg -c $< -o $(@F) + + +ifdef BINARY64 +PATHSCALEPATH = /opt/pathscale/lib/3.1 +PGIPATH = /opt/pgi/linux86-64/7.1-5/lib +else +PATHSCALEPATH = /opt/pathscale/lib/3.1/32 +PGIPATH = /opt/pgi/linux86/7.1-5/lib +endif + +ACMLPATH = /opt/acml/4.3.0 +ifneq ($(OSNAME), Darwin) +MKLPATH = /opt/intel/mkl/10.2.2.025/lib +else +MKLPATH = /Library/Frameworks/Intel_MKL.framework/Versions/10.0.1.014/lib +endif +ATLASPATH = /opt/atlas/3.9.17/opteron 
+FLAMEPATH = $(HOME)/flame/lib +ifneq ($(OSNAME), SunOS) +SUNPATH = /opt/sunstudio12.1 +else +SUNPATH = /opt/SUNWspro +endif + diff --git a/Makefile.tail b/Makefile.tail new file mode 100644 index 0000000000..64f98ab0cb --- /dev/null +++ b/Makefile.tail @@ -0,0 +1,617 @@ +SBLASOBJS_P = $(SBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) +DBLASOBJS_P = $(DBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) +QBLASOBJS_P = $(QBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) +CBLASOBJS_P = $(CBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) +ZBLASOBJS_P = $(ZBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) +XBLASOBJS_P = $(XBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) + +COMMONOBJS_P = $(COMMONOBJS:.$(SUFFIX)=.$(PSUFFIX)) + +HPLOBJS_P = $(HPLOBJS:.$(SUFFIX)=.$(PSUFFIX)) + +BLASOBJS = $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) +BLASOBJS_P = $(SBLASOBJS_P) $(DBLASOBJS_P) $(CBLASOBJS_P) $(ZBLASOBJS_P) + +ifdef EXPRECISION +BLASOBJS += $(QBLASOBJS) $(XBLASOBJS) +BLASOBJS_P += $(QBLASOBJS_P) $(XBLASOBJS_P) +endif + +ifdef QUAD_PRECISION +BLASOBJS += $(QBLASOBJS) $(XBLASOBJS) +BLASOBJS_P += $(QBLASOBJS_P) $(XBLASOBJS_P) +endif + +$(SBLASOBJS) $(SBLASOBJS_P) : CFLAGS += -UDOUBLE -UCOMPLEX +$(DBLASOBJS) $(DBLASOBJS_P) : CFLAGS += -DDOUBLE -UCOMPLEX +$(QBLASOBJS) $(QBLASOBJS_P) : CFLAGS += -DXDOUBLE -UCOMPLEX +$(CBLASOBJS) $(CBLASOBJS_P) : CFLAGS += -UDOUBLE -DCOMPLEX +$(ZBLASOBJS) $(ZBLASOBJS_P) : CFLAGS += -DDOUBLE -DCOMPLEX +$(XBLASOBJS) $(XBLASOBJS_P) : CFLAGS += -DXDOUBLE -DCOMPLEX + +$(SBLASOBJS_P) : CFLAGS += -DPROFILE $(COMMON_PROF) +$(DBLASOBJS_P) : CFLAGS += -DPROFILE $(COMMON_PROF) +$(QBLASOBJS_P) : CFLAGS += -DPROFILE $(COMMON_PROF) +$(CBLASOBJS_P) : CFLAGS += -DPROFILE $(COMMON_PROF) +$(ZBLASOBJS_P) : CFLAGS += -DPROFILE $(COMMON_PROF) +$(XBLASOBJS_P) : CFLAGS += -DPROFILE $(COMMON_PROF) + +libs :: $(BLASOBJS) $(COMMONOBJS) + $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ + +prof :: $(BLASOBJS_P) $(COMMONOBJS_P) + $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME_P) $^ + +hpl :: $(HPLOBJS) $(COMMONOBJS) + $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ + +hpl_p :: $(HPLOBJS_P) $(COMMONOBJS_P) + $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME_P) $^ + +kernel :: $(BLASOBJS) + $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ + +commonlibs :: $(COMMONOBJS) + $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ + +commonprof :: $(COMMONOBJS_P) + $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME_P) $^ + +quick : + $(MAKE) -C $(TOPDIR) libs + +bms.$(SUFFIX):bm.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -c $< -o $(@F) + +bmd.$(SUFFIX):bm.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(CFLAGS) -DDOUBLE -UCOMPLEX -c $< -o $(@F) + +bmd-k.$(SUFFIX):bm-k.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(CFLAGS) -DDOUBLE -UCOMPLEX -c $< -o $(@F) + +ifdef QUAD_PRECISION +bmq.$(SUFFIX):bmq.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(CFLAGS) -DXDOUBLE -UCOMPLEX -c $< -o $(@F) + +bmx.$(SUFFIX):bmx.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(CFLAGS) -DXDOUBLE -DCOMPLEX -c $< -o $(@F) +else +bmq.$(SUFFIX):bm.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(CFLAGS) -DXDOUBLE -UCOMPLEX -c $< -o $(@F) + +bmx.$(SUFFIX):bmz.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(CFLAGS) -DXDOUBLE -DCOMPLEX -c $< -o $(@F) +endif + +bmc.$(SUFFIX):bmz.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(CFLAGS) -UDOUBLE -DCOMPLEX -c $< -o $(@F) + +bmz.$(SUFFIX):bmz.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(CFLAGS) -DDOUBLE -DCOMPLEX -c $< -o $(@F) + +bmd_nn.$(SUFFIX):bm_special.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(CFLAGS) -DDOUBLE -UCOMPLEX -DNN -c $< -o $(@F) + +bmd_nt.$(SUFFIX):bm_special.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(CFLAGS) -DDOUBLE -UCOMPLEX 
-DNT -c $< -o $(@F) + +bmd_tn.$(SUFFIX):bm_special.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(CFLAGS) -DDOUBLE -UCOMPLEX -DTN -c $< -o $(@F) + +bmd_tt.$(SUFFIX):bm_special.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(CFLAGS) -DDOUBLE -UCOMPLEX -DTT -c $< -o $(@F) + +bm-phy.$(SUFFIX):bm-phy.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(CFLAGS) -DDOUBLE -UCOMPLEX -c $< -o $(@F) + +bms.$(PSUFFIX):bm.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(PFLAGS) -UDOUBLE -UCOMPLEX -c $< -o $(@F) + +bmd.$(PSUFFIX):bm.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(PFLAGS) -DDOUBLE -UCOMPLEX -c $< -o $(@F) + +ifdef QUAD_PRECISION +bmq.$(PSUFFIX):bmq.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(PFLAGS) -DXDOUBLE -UCOMPLEX -c $< -o $(@F) + +bmx.$(PSUFFIX):bmx.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(PFLAGS) -DXDOUBLE -DCOMPLEX -c $< -o $(@F) +else +bmq.$(PSUFFIX):bm.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(PFLAGS) -DXDOUBLE -UCOMPLEX -c $< -o $(@F) + +bmx.$(PSUFFIX):bmz.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(PFLAGS) -DXDOUBLE -DCOMPLEX -c $< -o $(@F) +endif + +bmc.$(PSUFFIX):bmz.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(PFLAGS) -UDOUBLE -DCOMPLEX -c $< -o $(@F) + +bmz.$(PSUFFIX):bmz.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(PFLAGS) -DDOUBLE -DCOMPLEX -c $< -o $(@F) + +bms : bms.$(SUFFIX) $(SBLASOBJS) $(COMMONOBJS) $(SOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +bmd : bmd.$(SUFFIX) $(DBLASOBJS) $(COMMONOBJS) $(DOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) -lm + +bmd-k : bmd-k.$(SUFFIX) $(DBLASOBJS) $(COMMONOBJS) $(DOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) -lm + +bmq : bmq.$(SUFFIX) $(QBLASOBJS) $(COMMONOBJS) $(QOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +bmc : bmc.$(SUFFIX) $(CBLASOBJS) $(COMMONOBJS) $(COBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) $(FEXTRALIB) + +bmz : bmz.$(SUFFIX) $(ZBLASOBJS) $(COMMONOBJS) $(ZOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +bmx : bmx.$(SUFFIX) $(XBLASOBJS) $(COMMONOBJS) $(XOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +bmd_nn : bmd_nn.$(SUFFIX) $(DBLASOBJS) $(COMMONOBJS) $(DOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +bmd_nt : bmd_nt.$(SUFFIX) $(DBLASOBJS) $(COMMONOBJS) $(DOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +bmd_tn : bmd_tn.$(SUFFIX) $(DBLASOBJS) $(COMMONOBJS) $(DOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +bmd_tt : bmd_tt.$(SUFFIX) $(DBLASOBJS) $(COMMONOBJS) $(DOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +bm-phy:bm-phy.$(SUFFIX) $(DBLASOBJS) $(COMMONOBJS) $(DOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +bmcc : bmcc.$(SUFFIX) $(CBLASOBJS) $(COMMONOBJS) $(COBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +bmzc : bmzc.$(SUFFIX) $(ZBLASOBJS) $(COMMONOBJS) $(ZOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +bms.prof : bms.$(PSUFFIX) $(SBLASOBJS_P) $(COMMONOBJS_P) $(SOBJS) $(OBJS) $(LIBS_P) + $(CC) -o $(@F) $(PFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +bmd.prof : bmd.$(PSUFFIX) $(DBLASOBJS_P) $(COMMONOBJS_P) $(DOBJS) $(OBJS) $(LIBS_P) + $(CC) -o $(@F) $(PFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +bmq.prof : bmq.$(PSUFFIX) $(QBLASOBJS_P) $(COMMONOBJS_P) $(QOBJS) $(OBJS) $(LIBS_P) + $(CC) -o $(@F) $(PFLAGS) $^ $(EXTRALIB) 
$(CEXTRALIB) + +bmc.prof : bmc.$(PSUFFIX) $(CBLASOBJS_P) $(COMMONOBJS) $(COBJS) $(OBJS) $(LIBS_P) + $(CC) -o $(@F) $(PFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +bmz.prof : bmz.$(PSUFFIX) $(ZBLASOBJS_P) $(COMMONOBJS) $(ZOBJS) $(OBJS) $(LIBS_P) + $(CC) -o $(@F) $(PFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +bmx.prof : bmz.$(PSUFFIX) $(XBLASOBJS_P) $(COMMONOBJS) $(XOBJS) $(OBJS) $(LIBS_P) + $(CC) -o $(@F) $(PFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +bms.cxml : bms.$(SUFFIX) $(SOBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBCXML) + +bmd.cxml : bmd.$(SUFFIX) $(DOBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBCXML) + +bmc.cxml : bmc.$(SUFFIX) $(COBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBCXML) + +bmz.cxml : bmz.$(SUFFIX) $(ZOBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBCXML) + +bms.scsl : bms.$(SUFFIX) $(SOBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBSCSL) + +bmd.scsl : bmd.$(SUFFIX) $(DOBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBSCSL) + +bmc.scsl : bmc.$(SUFFIX) $(COBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBSCSL) + +bmz.scsl : bmz.$(SUFFIX) $(ZOBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBSCSL) + +bms.acml : bms.$(SUFFIX) $(SOBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBACML) + +bmd.acml : bmd.$(SUFFIX) $(DOBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBACML) + +bmc.acml : bmc.$(SUFFIX) $(COBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBACML) + +bmz.acml : bmz.$(SUFFIX) $(ZOBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBACML) + +bms.sun : bms.$(SUFFIX) $(SOBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBSUNPERF) $(EXTRALIB) $(CEXTRALIB) + +bmd.sun : bmd.$(SUFFIX) $(DOBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBSUNPERF) $(EXTRALIB) $(CEXTRALIB) + +bmc.sun : bmc.$(SUFFIX) $(COBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBSUNPERF) $(EXTRALIB) $(CEXTRALIB) + +bmz.sun : bmz.$(SUFFIX) $(ZOBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBSUNPERF) $(EXTRALIB) $(CEXTRALIB) + +bms.atlas : bms.$(SUFFIX) $(SOBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBATLAS) + +bmd.atlas : bmd.$(SUFFIX) $(DOBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBATLAS) + +bmc.atlas : bmc.$(SUFFIX) $(COBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBATLAS) + +bmz.atlas : bmz.$(SUFFIX) $(ZOBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBATLAS) + +bms.essl : bms.$(SUFFIX) $(SOBJS) $(OBJS) + $(CC) $(FCOMMON_OPT) -o $(@F) $^ $(LIBESSL) + +bmd.essl : bmd.$(SUFFIX) $(DOBJS) $(OBJS) + $(CC) $(CCOMMON_OPT) -o $(@F) $^ $(LIBESSL) + +bmc.essl : bmc.$(SUFFIX) $(COBJS) $(OBJS) + $(F77) $(CCOMMON_OPT) -o $(@F) $^ $(LIBESSL) + +bmz.essl : bmz.$(SUFFIX) $(ZOBJS) $(OBJS) + $(CC) $(CCOMMON_OPT) -o $(@F) $^ $(LIBESSL) + +bms.flame : bms.$(SUFFIX) $(SOBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBFLAME) + +bmd.flame : bmd.$(SUFFIX) $(DOBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBFLAME) + +bmc.flame : bmc.$(SUFFIX) $(COBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBFLAME) + +bmz.flame : bmz.$(SUFFIX) $(ZOBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBFLAME) + +bms.flame.prof : bms.$(SUFFIX) $(SOBJS) $(OBJS_P) + $(F77) -o $(@F) $(PFLAGS) $^ $(LIBFLAME) + +bmd.flame.prof : bmd.$(SUFFIX) $(DOBJS) $(OBJS_P) + $(F77) -o $(@F) $(PFLAGS) $^ $(LIBFLAME) + +bmc.flame.prof : bmc.$(SUFFIX) $(COBJS) $(OBJS_P) + $(F77) -o $(@F) $(PFLAGS) $^ $(LIBFLAME) + +bmz.flame.prof : bmz.$(SUFFIX) $(ZOBJS) $(OBJS_P) + $(F77) -o $(@F) $(PFLAGS) $^ $(LIBFLAME) + +bms.mkl : bms.$(SUFFIX) $(SOBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBMKL) $(EXTRALIB) $(CEXTRALIB) + +bmd.mkl : bmd.$(SUFFIX) $(DOBJS) $(OBJS) + $(CC) -static -o $(@F) 
$(CFLAGS) $^ $(LIBMKL) $(EXTRALIB) $(CEXTRALIB) + +bmc.mkl : bmc.$(SUFFIX) $(COBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBMKL) $(EXTRALIB) $(CEXTRALIB) + +bmz.mkl : bmz.$(SUFFIX) $(ZOBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBMKL) $(EXTRALIB) $(CEXTRALIB) + +bmq.mkl : bmq.$(SUFFIX) $(QOBJS) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBMKL) $(EXTRALIB) $(CEXTRALIB) + +bms.mkl.prof : bms.$(PSUFFIX) $(SOBJS) $(OBJS) + $(CC) -o $(@F) $(PFLAGS) $^ $(LIBMKL) + +bmd.mkl.prof : bmd.$(PSUFFIX) $(DOBJS) $(OBJS) + $(CC) -o $(@F) $(PFLAGS) $^ $(LIBMKL) + +bmc.mkl.prof : bmc.$(PSUFFIX) $(COBJS) $(OBJS) + $(CC) -o $(@F) $(PFLAGS) $^ $(LIBMKL) + +bmz.mkl.prof : bmz.$(PSUFFIX) $(ZOBJS) $(OBJS) + $(CC) -o $(@F) $(PFLAGS) $^ $(LIBMKL) + +bms.mlib : bms.$(SUFFIX) $(SOBJS) $(OBJS) + $(F77) -o $(@F) $(CFLAGS) $^ $(LIBMLIB) + +bmd.mlib : bmd.$(SUFFIX) $(DOBJS) $(OBJS) + $(F77) -o $(@F) $(CFLAGS) $^ $(LIBMLIB) + +bmc.mlib : bmc.$(SUFFIX) $(COBJS) $(OBJS) + $(F77) -o $(@F) $(CFLAGS) $^ $(LIBMLIB) + +bmz.mlib : bmz.$(SUFFIX) $(ZOBJS) $(OBJS) + $(F77) -o $(@F) $(CFLAGS) $^ $(LIBMLIB) + +bms.veclib : bms.$(SUFFIX) $(SOBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBVECLIB) + +bmd.veclib : bmd.$(SUFFIX) $(DOBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBVECLIB) + +bmc.veclib : bmc.$(SUFFIX) $(COBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBVECLIB) + +bmz.veclib : bmz.$(SUFFIX) $(ZOBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBVECLIB) + +bms.fuji : bms.$(SUFFIX) $(SOBJS) +ifndef SMP + fcc -KV9FMADD -SSL2 -o $(@F) $^ +else + fcc -KV9FMADD -SSL2BLAMP -o $(@F) $^ +endif + +bmd.fuji : bmd.$(SUFFIX) $(DOBJS) +ifndef SMP + fcc -KV9FMADD -SSL2 -o $(@F) $^ +else + fcc -KV9FMADD -SSL2BLAMP -o $(@F) $^ +endif + +bmc.fuji : bmc.$(SUFFIX) $(COBJS) +ifndef SMP + fcc -KV9FMADD -SSL2 -o $(@F) $^ +else + fcc -KV9FMADD -SSL2BLAMP -o $(@F) $^ +endif + +bmz.fuji : bmz.$(SUFFIX) $(ZOBJS) +ifndef SMP + fcc -KV9FMADD -SSL2 -o $(@F) $^ +else + fcc -KV9FMADD -SSL2BLAMP -o $(@F) $^ +endif + +bench: bench.$(SUFFIX) $(BLASOBJS) $(COMMONOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +bench.$(SUFFIX): bench.c + $(CC) -c -o $(@F) $(CFLAGS) $^ + +bench_old: bench_old.$(SUFFIX) $(DBLASOBJS) $(COMMONOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +kbench: kbench.$(SUFFIX) $(DBLASOBJS) $(COMMONOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +prebench: prebench.$(SUFFIX) $(DBLASOBJS) $(COMMONOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +kbench_rank_k: kbench_rank_k.$(SUFFIX) $(DBLASOBJS) $(COMMONOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +smallbench: smallbench.$(SUFFIX) $(BLASOBJS) $(COMMONOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +smallbench.mkl: smallbench.$(SUFFIX) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBMKL) $(EXTRALIB) $(CEXTRALIB) + +bench.sun: bench.$(SUFFIX) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBSUNPERF) $(EXTRALIB) $(CEXTRALIB) + +bench.cxml: bench.$(SUFFIX) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBCXML) + +bench.atlas: bench.$(SUFFIX) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBATLAS) + +bench.essl: bench.$(SUFFIX) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBESSL) ../../level1/others/libmisc.$(LIBSUFFIX) + +bench.scsl: bench.$(SUFFIX) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBSCSL) $(EXTRALIB) $(CEXTRALIB) + +bench.acml: bench.$(SUFFIX) $(OBJS) + $(CC) -static -o $(@F) $(CFLAGS) $^ $(LIBACML) $(EXTRALIB) $(CEXTRALIB) + +bench.flame: bench.$(SUFFIX) $(OBJS) + 
$(CC) -o $(@F) $(CFLAGS) $^ $(LIBFLAME) $(EXTRALIB) $(CEXTRALIB) + +kbench.mkl: kbench.$(SUFFIX) $(OBJS) + $(CC) -static -o $(@F) $(CFLAGS) $^ $(LIBMKL) $(EXTRALIB) $(CEXTRALIB) + +bench.mkl: bench.$(SUFFIX) $(OBJS) + $(CC) -static -o $(@F) $(CFLAGS) $^ $(LIBMKL) $(EXTRALIB) $(CEXTRALIB) + +bench_old.mkl: bench_old.$(SUFFIX) $(OBJS) + $(CC) -static -o $(@F) $(CFLAGS) $^ $(LIBMKL) $(EXTRALIB) $(CEXTRALIB) + +bench.mlib: bench.$(SUFFIX) $(OBJS) + $(F77) -o $(@F) $(CFLAGS) $^ $(LIBMLIB) + +bench.veclib: bench.$(SUFFIX) $(OBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBVECLIB) + +params : params.$(SUFFIX) $(SBLASOBJS) $(COMMONOBJS) $(SOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +paramd : paramd.$(SUFFIX) $(DBLASOBJS) $(COMMONOBJS) $(DOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +paramq : paramq.$(SUFFIX) $(QBLASOBJS) $(COMMONOBJS) $(QOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +paramc : paramc.$(SUFFIX) $(CBLASOBJS) $(COMMONOBJS) $(COBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +paramz : paramz.$(SUFFIX) $(ZBLASOBJS) $(COMMONOBJS) $(ZOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +paramx : paramx.$(SUFFIX) $(XBLASOBJS) $(COMMONOBJS) $(XOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +params-ex : params-ex.$(SUFFIX) $(SBLASOBJS) $(COMMONOBJS) $(SOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +paramd-ex : paramd-ex.$(SUFFIX) $(DBLASOBJS) $(COMMONOBJS) $(DOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +paramq-ex : paramq-ex.$(SUFFIX) $(QBLASOBJS) $(COMMONOBJS) $(QOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +paramc-ex : paramc-ex.$(SUFFIX) $(CBLASOBJS) $(COMMONOBJS) $(COBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +paramz-ex : paramz-ex.$(SUFFIX) $(ZBLASOBJS) $(COMMONOBJS) $(ZOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +paramx-ex : paramx-ex.$(SUFFIX) $(XBLASOBJS) $(COMMONOBJS) $(XOBJS) $(OBJS) $(LIBS) + $(CC) -o $(@F) $(CFLAGS) $^ $(EXTRALIB) $(CEXTRALIB) + +params.atlas : params.$(SUFFIX) $(OBJS) $(SOBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBATLAS) + +paramd.atlas : paramd.$(SUFFIX) $(OBJS) $(DOBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBATLAS) + +paramc.atlas : paramc.$(SUFFIX) $(OBJS) $(COBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBATLAS) + +paramz.atlas : paramz.$(SUFFIX) $(OBJS) $(ZOBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBATLAS) + +params.sun : params.$(SUFFIX) $(OBJS) $(SOBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBSUNPERF) + +paramd.sun : paramd.$(SUFFIX) $(OBJS) $(DOBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBSUNPERF) + +paramc.sun : paramc.$(SUFFIX) $(OBJS) $(COBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBSUNPERF) + +paramz.sun : paramz.$(SUFFIX) $(OBJS) $(ZOBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBSUNPERF) + +params.essl : params.$(SUFFIX) $(OBJS) $(SOBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBESSL) + +paramd.essl : paramd.$(SUFFIX) $(OBJS) $(DOBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBESSL) + +paramc.essl : paramc.$(SUFFIX) $(OBJS) $(COBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBESSL) + +paramz.essl : paramz.$(SUFFIX) $(OBJS) $(ZOBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBESSL) + +params.mkl : params.$(SUFFIX) $(OBJS) $(SOBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBMKL) + +paramd.mkl : paramd.$(SUFFIX) $(OBJS) $(DOBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBMKL) + +paramc.mkl 
: paramc.$(SUFFIX) $(OBJS) $(COBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBMKL) + +paramz.mkl : paramz.$(SUFFIX) $(OBJS) $(ZOBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBMKL) + +params.acml : params.$(SUFFIX) $(OBJS) $(SOBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBACML) + +paramd.acml : paramd.$(SUFFIX) $(OBJS) $(DOBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBACML) + +paramc.acml : paramc.$(SUFFIX) $(OBJS) $(COBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBACML) + +paramz.acml : paramz.$(SUFFIX) $(OBJS) $(ZOBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBACML) + +params.flame : params.$(SUFFIX) $(OBJS) $(SOBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBFLAME) $(EXTRALIB) $(CEXTRALIB) + +paramd.flame : paramd.$(SUFFIX) $(OBJS) $(DOBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBFLAME) $(EXTRALIB) $(CEXTRALIB) + +paramc.flame : paramc.$(SUFFIX) $(OBJS) $(COBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBFLAME) $(EXTRALIB) $(CEXTRALIB) + +paramz.flame : paramz.$(SUFFIX) $(OBJS) $(ZOBJS) + $(CC) -o $(@F) $(CFLAGS) $^ $(LIBFLAME) $(EXTRALIB) $(CEXTRALIB) + +params.$(SUFFIX):param.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -c $< -o $(@F) + +paramd.$(SUFFIX):param.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(CFLAGS) -DDOUBLE -UCOMPLEX -c $< -o $(@F) + +paramq.$(SUFFIX):param.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(CFLAGS) -DXDOUBLE -UCOMPLEX -c $< -o $(@F) + +paramc.$(SUFFIX):paramz.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(CFLAGS) -UDOUBLE -DCOMPLEX -c $< -o $(@F) + +paramz.$(SUFFIX):paramz.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(CFLAGS) -DDOUBLE -DCOMPLEX -c $< -o $(@F) + +paramx.$(SUFFIX):paramz.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(CFLAGS) -DXDOUBLE -DCOMPLEX -c $< -o $(@F) + +params-ex.$(SUFFIX):param-ex.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -c $< -o $(@F) + +paramd-ex.$(SUFFIX):param-ex.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(CFLAGS) -DDOUBLE -UCOMPLEX -c $< -o $(@F) + +paramq-ex.$(SUFFIX):param-ex.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(CFLAGS) -DXDOUBLE -UCOMPLEX -c $< -o $(@F) + +paramc-ex.$(SUFFIX):paramz-ex.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(CFLAGS) -UDOUBLE -DCOMPLEX -c $< -o $(@F) + +paramz-ex.$(SUFFIX):paramz-ex.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(CFLAGS) -DDOUBLE -DCOMPLEX -c $< -o $(@F) + +paramx-ex.$(SUFFIX):paramz-ex.c $(TOPDIR)/../bench/bmcommon.h + $(CC) $(CFLAGS) -DXDOUBLE -DCOMPLEX -c $< -o $(@F) + +gen_insn_flash.c : + echo '#include ' > gen_insn_flash.c + echo '#include ' >> gen_insn_flash.c + echo '#define ICACHE_SIZE ( 256 << 10)' >> gen_insn_flash.c + echo 'int main(void){' >> gen_insn_flash.c + echo 'int i;' >> gen_insn_flash.c + echo '#ifdef __alpha' >> gen_insn_flash.c + echo 'printf(".set noat;.set noreorder;\n");' >> gen_insn_flash.c + echo 'printf(".arch ev6;.text;.align 5\n");' >> gen_insn_flash.c + echo 'printf(".globl insn_flash\n");' >> gen_insn_flash.c + echo 'printf(".ent insn_flash\n");' >> gen_insn_flash.c + echo 'printf("insn_flash:\n");' >> gen_insn_flash.c + echo 'for (i = 0; i < ICACHE_SIZE / 4; i++)' >> gen_insn_flash.c + echo 'printf("br 1f\n 1:\n");' >> gen_insn_flash.c + echo 'printf(".align 5;ret;.end insn_flash\n");'>> gen_insn_flash.c + echo '#else' >> gen_insn_flash.c + echo 'printf(".text;.align 32\n");' >> gen_insn_flash.c + echo 'printf(".globl insn_flash\n");' >> gen_insn_flash.c + echo 'printf("insn_flash:\n");' >> gen_insn_flash.c + echo 'for (i = 0; i < ICACHE_SIZE / 2; i++)' >> gen_insn_flash.c + echo 'printf("jmp 1f\n 1:\n");' >> gen_insn_flash.c + echo 'printf(".align 32;ret\n");' >> gen_insn_flash.c + echo 
'#endif' >> gen_insn_flash.c + echo 'return 0;' >> gen_insn_flash.c + echo '}' >> gen_insn_flash.c + +insn_flash.$(SUFFIX) : gen_insn_flash + ./gen_insn_flash > temp.s + $(AS) -o $(@F) temp.s + rm -f temp.s + +dummy : + +clean :: + @if test -d $(ARCH); then \ + (cd $(ARCH) && $(MAKE) clean) \ + fi + @rm -rf *.a *.s *.o *.po *.obj *.i *.so core core.* gmon.out *.cso \ + *.csx *.is *~ *.exe *.flame *.pdb *.dwf \ + gen_insn_flash.c gen_insn_flash *.stackdump *.dll *.exp *.lib \ + *.pc *.pcl *.def *.i *.prof linktest.c \ + bms bmd bmc bmz bmq bmx \ + params paramd paramc paramz paramq paramx \ + params-ex paramd-ex paramc-ex paramz-ex paramq-ex paramx-ex \ + bench tpp kbench kbench2 \ + *.mkl *.sun *.acml *.cxml *.essl *.atlas *.scsl *.mlib *.veclib *.fuji diff --git a/Makefile.x86 b/Makefile.x86 new file mode 100644 index 0000000000..94ca7c4a7f --- /dev/null +++ b/Makefile.x86 @@ -0,0 +1,59 @@ +# COMPILER_PREFIX = mingw32- + +ifeq ($(OSNAME), Linux) +LDFLAGS = -melf_i386 +endif + +ifeq ($(OSNAME), Interix) +ARFLAGS = -m x86 +endif + +ifndef SMP +LIBMKL = -L$(MKLPATH)/32 -Wl,-rpath,$(MKLPATH)/32 -lmkl_intel -lmkl_sequential -lmkl_core -lguide -lpthread -lm +else +LIBMKL = -L$(MKLPATH)/32 -Wl,-rpath,$(MKLPATH)/32 -lmkl_intel -lmkl_intel_thread -lmkl_core -lguide -lpthread -lm +endif + +# LIBMKL = -L$(MKLPATH)/32 -lmkl_lapack -lmkl_ia32 -lguide -lpthread -lm + +ifndef SMP +LIBATLAS = -L$(ATLAS) -lf77blas -latlas -lg2c -lm +else +LIBATLAS = -L$(ATLAS) -lptf77blas -latlas -lpthread -lg2c -lm +endif + +ifeq ($(COMPILER_F77), g77) +LIBACML = -L$(ACMLPATH)/gnu32/lib -Wl,-rpath,$(ACMLPATH)/gnu32/lib -lacml -lg2c +endif + +LIBFLAME = -L$(FLAMEPATH) -llapack2flame -lflame-lapack -lflame-base $(LIBS) + +ifeq ($(F_COMPILER), GFORTRAN) +ifndef SMP +LIBACML = -L$(ACMLPATH)/gfortran32/lib -Wl,-rpath,$(ACMLPATH)/gfortran32/lib -lacml -lgfortran -lm +else +LIBACML = -L$(ACMLPATH)/gfortran32_mp/lib -Wl,-rpath,$(ACMLPATH)/gfortran32_mp/lib -lacml_mp -lgfortran -lgomp -lm +endif +endif + +ifeq ($(COMPILER_F77), pgf77) +LIBACML = -L$(ACMLPATH)/pgi32/lib -lacml -L/opt/pgi/linux86-64/5.2/lib -lpgftnrtl -lnspgc -lpgc +endif + +ifeq ($(F_COMPILER), PATHSCALE) +ifndef SMP +LIBACML = -L$(ACMLPATH)/pathscale32/lib -Wl,-rpath,$(ACMLPATH)/pathscale32/lib -lacml -Wl,-rpath,$(PATHSCALEPATH) -L$(PATHSCALEPATH) -lpathfortran -lm +else +LIBACML = -L$(ACMLPATH)/pathscale32_mp/lib -Wl,-rpath,$(ACMLPATH)/pathscale32_mp/lib -lacml_mp -Wl,-rpath,$(PATHSCALEPATH) -L$(PATHSCALEPATH) -lopenmp -lpathfortran -lm +endif +endif + +LIBSUNPERF = -L/opt/SUNWspro/lib/sse2 -Wl,-R,/opt/SUNWspro/lib/sse2 -lsunperf + +LIBVECLIB = /System/Library/Frameworks/vecLib.framework/Versions/Current/vecLib + +ifndef SMP +LIBATLAS = -L$(ATLASPATH)/32 -lcblas -lf77blas -latlas -lm +else +LIBATLAS = -L$(ATLASPATH)/32 -lptf77blas -lptatlas -lpthread -lm +endif diff --git a/Makefile.x86_64 b/Makefile.x86_64 new file mode 100644 index 0000000000..b939e5459b --- /dev/null +++ b/Makefile.x86_64 @@ -0,0 +1,102 @@ +# CCOMMON_OPT += -DFASTCPU + +ifeq ($(OSNAME), SunOS) +ifdef BINARY64 +LDFLAGS = -64 +ifeq ($(F_COMPILER), SUN) +FCOMMON_OPT += -m64 +endif +endif +endif + +ifeq ($(OSNAME), FreeBSD) +LDFLAGS = -m elf_x86_64_fbsd +endif + +ifeq ($(OSNAME), Linux) +LDFLAGS = -m elf_x86_64 +endif + +ifeq ($(OSNAME), Darwin) +LDFLAGS = +endif + +ifeq ($(OSNAME), Interix) +ARFLAGS = -m x64 +endif + +ifeq ($(OSNAME), Darwin) +ifndef SMP +LIBMKL = -L$(MKLPATH)/em64t -Wl,-rpath,$(MKLPATH)/em64t -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lguide -lpthread -lm +else +LIBMKL = 
-L$(MKLPATH)/em64t -Wl,-rpath,$(MKLPATH)/em64t -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -lguide -lpthread -lm +endif +else +ifndef SMP +LIBMKL = -L$(MKLPATH)/em64t -Wl,-rpath,$(MKLPATH)/em64t -Wl,--start-group -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -Wl,--end-group -lguide -lpthread -lm +else +LIBMKL = -L$(MKLPATH)/em64t -Wl,-rpath,$(MKLPATH)/em64t -Wl,--start-group -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -Wl,--end-group -lguide -lpthread -lm +endif +endif + + +ifndef SMP +LIBATLAS = -L$(ATLASPATH)64 -llapack -lcblas -lf77blas -latlas -lm +else +LIBATLAS = -L$(ATLASPATH)64 -llapack -lptcblas -lptf77blas -latlas -lpthread -lm +endif + +LIBFLAME = -L$(FLAMEPATH) -llapack2flame -lflame $(TOPDIR)/$(LIBNAME) -lgfortran -lpthread -lm + + +ifeq ($(F_COMPILER), g77) +LIBACML = -L$(ACMLPATH)/gnu64/lib -Wl,-rpath,$(ACMLPATH)/gnu64/lib -lacml -lacml_mv -lg2c -lm +endif + +ifeq ($(F_COMPILER), GFORTRAN) +ifndef SMP +LIBACML = -L$(ACMLPATH)/gfortran64/lib -Wl,-rpath,$(ACMLPATH)/gfortran64/lib -lacml -lacml_mv -lgfortran -lm +else +LIBACML = -L$(ACMLPATH)/gfortran64_mp/lib -Wl,-rpath,$(ACMLPATH)/gfortran64_mp/lib -lacml_mp -lacml_mv -lgfortran -lgomp -lm +endif +endif + +ifeq ($(F_COMPILER), INTEL) +ifndef SMP +LIBACML = -L$(ACMLPATH)/ifort64/lib -Wl,-rpath,$(ACMLPATH)/ifort64/lib -lacml -lacml_mv -lifcoremt_pic -lirc -lm -lpthread -ldl +else +LIBACML = -L$(ACMLPATH)/ifort64_mp/lib -Wl,-rpath,$(ACMLPATH)/ifort64_mp/lib -lacml_mp -lacml_mv -lifcoremt_pic -liomp5 -lirc -lm -lpthread -ldl +endif +endif + +ifeq ($(F_COMPILER), OPEN64) +ifndef SMP +LIBACML = -L$(ACMLPATH)/open64/lib -Wl,-rpath,$(ACMLPATH)/open64/lib -lacml -lacml_mv -lm +else +LIBACML = -L$(ACMLPATH)/open64_mp/lib -Wl,-rpath,$(ACMLPATH)/open64_mp/lib -lacml_mp -lacml_mv -lm -lpthread +endif +endif + +ifeq ($(F_COMPILER), pgf77) +ifndef SMP +LIBACML = -L$(ACMLPATH)/pgi64/lib -Wl,-rpath,$(ACMLPATH)/pgi64/lib -lacml -lacml_mv -L$(PGIPATH) -Wl,-rpath,$(PGIPATH) -lpgftnrtl -lnspgc -lpgmp -lpgc +else +LIBACML = -L$(ACMLPATH)/pgi64_mp/lib -Wl,-rpath,$(ACMLPATH)/pgi64_mp/lib -lacml -lacml_mv -L$(PGIPATH) -Wl,-rpath,$(PGIPATH) -lpgftnrtl -lnspgc -lpgmp -lpgc +endif +endif + +ifeq ($(F_COMPILER), PATHSCALE) +ifndef SMP +LIBACML = -L$(ACMLPATH)/pathscale64/lib -Wl,-rpath,$(ACMLPATH)/pathscale64/lib -lacml -lacml_mv -Wl,-rpath,$(PATHSCALEPATH) -L$(PATHSCALEPATH) -lpathfortran -lm +else +LIBACML = -L$(ACMLPATH)/pathscale64_mp/lib -Wl,-rpath,$(ACMLPATH)/pathscale64_mp/lib -lacml_mp -lacml_mv -Wl,-rpath,$(PATHSCALEPATH) -L$(PATHSCALEPATH) -lopenmp -lpathfortran -lm +endif +endif + +ifeq ($(F_COMPILER), f90) +LIBACML = -L$(ACMLPATH)/sun64/lib -Wl,-R,$(ACMLPATH)/sun64/lib -L$(SUNPATH)/lib/amd64 -Wl,-R,$(SUNPATH)/lib/amd64 -lacml -lacml_mv -lfsu +endif + +LIBSUNPERF = -L$(SUNPATH)/lib/amd64 -L$(SUNPATH)/rtlibs/amd64 -Wl,-R,$(SUNPATH)/lib/amd64 -Wl,-R,$(SUNPATH)/rtlibs/amd64 -lsunperf -lfui -lfsu -lmtsk + +LIBVECLIB = /System/Library/Frameworks/vecLib.framework/Versions/Current/vecLib diff --git a/benchmark/Makefile b/benchmark/Makefile new file mode 100644 index 0000000000..0c37570cd0 --- /dev/null +++ b/benchmark/Makefile @@ -0,0 +1,195 @@ +TOPDIR = .. 
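+# Each <prec><name>.<lib> target below (e.g. dlinpack.goto, dcholesky.mkl) links
+# the same driver source against a different BLAS/LAPACK implementation; the
+# vendor-library link commands are prefixed with '-' so that a missing MKL/ACML/FLAME
+# installation does not abort the default build and run.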
+include $(TOPDIR)/Makefile.system + +CULA_INC = -I/usr/local/cula/include +CULA_LIB = -L/usr/local/cula/lib64 -Wl,-rpath,/usr/local/cula/lib64 -lcula_fortran -lcula -lcublas + +all :: dlinpack.goto dlinpack.mkl dlinpack.acml dcholesky.goto dcholesky.mkl dcholesky.acml + ./dlinpack.goto 4000 4000 1 + -./dlinpack.mkl 4000 4000 1 + -./dlinpack.acml 4000 4000 1 + ./dcholesky.goto 4000 4000 1 + -./dcholesky.mkl 4000 4000 1 + -./dcholesky.acml 4000 4000 1 + +slinpack.goto : slinpack.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +dlinpack.goto : dlinpack.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +qlinpack.goto : qlinpack.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +clinpack.goto : clinpack.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +zlinpack.goto : zlinpack.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +xlinpack.goto : xlinpack.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +scholesky.goto : scholesky.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +dcholesky.goto : dcholesky.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +qcholesky.goto : qcholesky.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +ccholesky.goto : ccholesky.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +zcholesky.goto : zcholesky.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +xcholesky.goto : xcholesky.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) -lm + +slinpack.mkl : slinpack.$(SUFFIX) + -$(CC) -static $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dlinpack.mkl : dlinpack.$(SUFFIX) + -$(CC) -static $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +clinpack.mkl : clinpack.$(SUFFIX) + -$(CC) -static $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zlinpack.mkl : zlinpack.$(SUFFIX) + -$(CC) -static $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +scholesky.mkl : scholesky.$(SUFFIX) + -$(CC) -static $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dcholesky.mkl : dcholesky.$(SUFFIX) + -$(CC) -static $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ccholesky.mkl : ccholesky.$(SUFFIX) + -$(CC) -static $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zcholesky.mkl : zcholesky.$(SUFFIX) + -$(CC) -static $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +slinpack.acml : slinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dlinpack.acml : dlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +clinpack.acml : clinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zlinpack.acml : zlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +scholesky.acml : scholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dcholesky.acml : dcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ccholesky.acml : ccholesky.$(SUFFIX) + 
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zcholesky.acml : zcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +slinpack.flame : slinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBFLAME) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dlinpack.flame : dlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBFLAME) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +clinpack.flame : clinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBFLAME) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zlinpack.flame : zlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBFLAME) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +scholesky.flame : scholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBFLAME) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dcholesky.flame : dcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBFLAME) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ccholesky.flame : ccholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBFLAME) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zcholesky.flame : zcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBFLAME) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +slinpack.sun : slinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBSUNPERF) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dlinpack.sun : dlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBSUNPERF) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +clinpack.sun : clinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBSUNPERF) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zlinpack.sun : zlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBSUNPERF) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +scholesky.sun : scholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBSUNPERF) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dcholesky.sun : dcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBSUNPERF) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ccholesky.sun : ccholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBSUNPERF) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zcholesky.sun : zcholesky.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBSUNPERF) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +slinpack.cula : slinpack.$(SUFFIX) cula_wrapper.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(CULA_LIB) ../$(LIBNAME) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +clinpack.cula : clinpack.$(SUFFIX) cula_wrapper.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(CULA_LIB) ../$(LIBNAME) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cula_wrapper.$(SUFFIX) : cula_wrapper.c + $(CC) $(CFLAGS) -c $(CULA_INC) -o $(@F) $^ + +slinpack.$(SUFFIX) : linpack.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dlinpack.$(SUFFIX) : linpack.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +qlinpack.$(SUFFIX) : linpack.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DXDOUBLE -o $(@F) $^ + +clinpack.$(SUFFIX) : linpack.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zlinpack.$(SUFFIX) : linpack.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +xlinpack.$(SUFFIX) : linpack.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DXDOUBLE -o $(@F) $^ + +scholesky.$(SUFFIX) : cholesky.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dcholesky.$(SUFFIX) : cholesky.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +qcholesky.$(SUFFIX) : cholesky.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DXDOUBLE -o $(@F) $^ + +ccholesky.$(SUFFIX) : cholesky.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zcholesky.$(SUFFIX) : cholesky.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +xcholesky.$(SUFFIX) : cholesky.c + $(CC) $(CFLAGS) -c 
-DCOMPLEX -DXDOUBLE -o $(@F) $^ + +clean :: + @rm -f *.goto *.mkl *.acml *.sun *.cula + +include $(TOPDIR)/Makefile.tail diff --git a/benchmark/cholesky.c b/benchmark/cholesky.c new file mode 100644 index 0000000000..a40cdd211c --- /dev/null +++ b/benchmark/cholesky.c @@ -0,0 +1,272 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +double fabs(double); + +#undef POTRF + +#ifndef COMPLEX +#ifdef XDOUBLE +#define POTRF BLASFUNC(qpotrf) +#define SYRK BLASFUNC(qsyrk) +#elif defined(DOUBLE) +#define POTRF BLASFUNC(dpotrf) +#define SYRK BLASFUNC(dsyrk) +#else +#define POTRF BLASFUNC(spotrf) +#define SYRK BLASFUNC(ssyrk) +#endif +#else +#ifdef XDOUBLE +#define POTRF BLASFUNC(xpotrf) +#define SYRK BLASFUNC(xherk) +#elif defined(DOUBLE) +#define POTRF BLASFUNC(zpotrf) +#define SYRK BLASFUNC(zherk) +#else +#define POTRF BLASFUNC(cpotrf) +#define SYRK BLASFUNC(cherk) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +static __inline double getmflops(int ratio, int m, double secs){ + + double mm = (double)m; + double mulflops, addflops; + + if (secs==0.) 
return 0.; + + mulflops = mm * (1./3. + mm * (1./2. + mm * 1./6.)); + addflops = 1./6. * mm * (mm * mm - 1); + + if (ratio == 1) { + return (mulflops + addflops) / secs * 1.e-6; + } else { + return (2. * mulflops + 6. * addflops) / secs * 1.e-6; + } +} + + +int MAIN__(int argc, char *argv[]){ + + char *trans[] = {"T", "N"}; + char *uplo[] = {"U", "L"}; + FLOAT alpha[] = {1.0, 0.0}; + FLOAT beta [] = {0.0, 0.0}; + + FLOAT *a, *b; + + blasint m, i, j, info, uplos; + + int from = 1; + int to = 200; + int step = 1; + + FLOAT maxerr; + + struct timeval start, stop; + double time1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + fprintf(stderr, "From : %3d To : %3d Step = %3d\n", from, to, step); + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + for(m = from; m <= to; m += step){ + + fprintf(stderr, "M = %6d : ", (int)m); + + for (uplos = 0; uplos < 2; uplos ++) { + +#ifndef COMPLEX + if (uplos & 1) { + for (j = 0; j < m; j++) { + for(i = 0; i < j; i++) a[i + j * m] = 0.; + a[j + j * m] = ((double) rand() / (double) RAND_MAX) + 8.; + for(i = j + 1; i < m; i++) a[i + j * m] = ((double) rand() / (double) RAND_MAX) - 0.5; + } + } else { + for (j = 0; j < m; j++) { + for(i = 0; i < j; i++) a[i + j * m] = ((double) rand() / (double) RAND_MAX) - 0.5; + a[j + j * m] = ((double) rand() / (double) RAND_MAX) + 8.; + for(i = j + 1; i < m; i++) a[i + j * m] = 0.; + } + } +#else + if (uplos & 1) { + for (j = 0; j < m; j++) { + for(i = 0; i < j; i++) { + a[(i + j * m) * 2 + 0] = 0.; + a[(i + j * m) * 2 + 1] = 0.; + } + + a[(j + j * m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) + 8.; + a[(j + j * m) * 2 + 1] = 0.; + + for(i = j + 1; i < m; i++) { + a[(i + j * m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) - 0.5; + a[(i + j * m) * 2 + 1] = ((double) rand() / (double) RAND_MAX) - 0.5; + } + } + } else { + for (j = 0; j < m; j++) { + for(i = 0; i < j; i++) { + a[(i + j * m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) - 0.5; + a[(i + j * m) * 2 + 1] = ((double) rand() / (double) RAND_MAX) - 0.5; + } + + a[(j + j * m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) + 8.; + a[(j + j * m) * 2 + 1] = 0.; + + for(i = j + 1; i < m; i++) { + a[(i + j * m) * 2 + 0] = 0.; + a[(i + j * m) * 2 + 1] = 0.; + } + } + } +#endif + + SYRK(uplo[uplos], trans[uplos], &m, &m, alpha, a, &m, beta, b, &m); + + gettimeofday( &start, (struct timezone *)0); + + POTRF(uplo[uplos], &m, b, &m, &info); + + gettimeofday( &stop, (struct timezone *)0); + + if (info != 0) { + fprintf(stderr, "Info = %d\n", info); + exit(1); + } + + time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + + maxerr = 0.; + + if (!(uplos & 1)) { + for (j = 0; j < m; j++) { + for(i = 0; i <= j; i++) { +#ifndef COMPLEX + if (maxerr < fabs(a[i + j * m] - b[i + j * m])) maxerr = fabs(a[i + j * m] - b[i + j * m]); +#else + if (maxerr < fabs(a[(i + j * m) * 2 + 0] - b[(i + j * m) * 2 + 0])) maxerr = fabs(a[(i + j * m) * 2 + 0] - b[(i + j * m) * 2 + 0]); + if (maxerr < fabs(a[(i + j * m) * 2 + 1] - b[(i + j * m) * 2 + 1])) maxerr = fabs(a[(i + j * m) * 2 + 1] - b[(i + j * m) * 2 + 1]); +#endif + } + } + } else { + for (j = 0; j < m; j++) { + for(i = j; i < m; 
i++) { +#ifndef COMPLEX + if (maxerr < fabs(a[i + j * m] - b[i + j * m])) maxerr = fabs(a[i + j * m] - b[i + j * m]); +#else + if (maxerr < fabs(a[(i + j * m) * 2 + 0] - b[(i + j * m) * 2 + 0])) maxerr = fabs(a[(i + j * m) * 2 + 0] - b[(i + j * m) * 2 + 0]); + if (maxerr < fabs(a[(i + j * m) * 2 + 1] - b[(i + j * m) * 2 + 1])) maxerr = fabs(a[(i + j * m) * 2 + 1] - b[(i + j * m) * 2 + 1]); +#endif + } + } + } + + fprintf(stderr, +#ifdef XDOUBLE + " %Le %10.3f MFlops", maxerr, +#else + " %e %10.3f MFlops", maxerr, +#endif + getmflops(COMPSIZE * COMPSIZE, m, time1)); + + if (maxerr > 1.e-3) { + fprintf(stderr, "Hmm, probably it has bug.\n"); + exit(1); + } + + } + fprintf(stderr, "\n"); + + } + + return 0; +} + +void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); diff --git a/benchmark/cula_wrapper.c b/benchmark/cula_wrapper.c new file mode 100644 index 0000000000..05dbcc231a --- /dev/null +++ b/benchmark/cula_wrapper.c @@ -0,0 +1,28 @@ +#include +#include "culapack.h" + +static int initialized = 0; + +int sgetrf_(int *m, int *n, float *a, int *lda, int *ipiv, int *info) { + + if (!initialized) { + culaInitialize(); + initialized = 1; + } + + *info = culaSgetrf(*m, *m, a, *lda, ipiv); + + return 0; +} + +int cgetrf_(int *m, int *n, float *a, int *lda, int *ipiv, int *info) { + + if (!initialized) { + culaInitialize(); + initialized = 1; + } + + *info = culaCgetrf(*m, *m, (culaFloatComplex *)a, *lda, ipiv); + + return 0; +} diff --git a/benchmark/linpack.c b/benchmark/linpack.c new file mode 100644 index 0000000000..02618599d1 --- /dev/null +++ b/benchmark/linpack.c @@ -0,0 +1,273 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +double fabs(double); + +#undef GETRF +#undef GETRS + +#ifndef COMPLEX +#ifdef XDOUBLE +#define GETRF BLASFUNC(qgetrf) +#define GETRS BLASFUNC(qgetrs) +#elif defined(DOUBLE) +#define GETRF BLASFUNC(dgetrf) +#define GETRS BLASFUNC(dgetrs) +#else +#define GETRF BLASFUNC(sgetrf) +#define GETRS BLASFUNC(sgetrs) +#endif +#else +#ifdef XDOUBLE +#define GETRF BLASFUNC(xgetrf) +#define GETRS BLASFUNC(xgetrs) +#elif defined(DOUBLE) +#define GETRF BLASFUNC(zgetrf) +#define GETRS BLASFUNC(zgetrs) +#else +#define GETRF BLASFUNC(cgetrf) +#define GETRS BLASFUNC(cgetrs) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int MAIN__(int argc, char *argv[]){ + + FLOAT *a, *b; + blasint *ipiv; + + blasint m, i, j, info; + blasint unit = 1; + + int from = 1; + int to = 200; + int step = 1; + + FLOAT maxerr; + + struct timeval start, stop; + double time1, time2; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + fprintf(stderr, "From : %3d To : %3d Step = %3d\n", from, to, step); + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( ipiv = (blasint *)malloc(sizeof(blasint) * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Residual Decompose Solve Total\n"); + + for(m = from; m <= to; m += step){ + + fprintf(stderr, " %6d : ", (int)m); + + for(j = 0; j < m; j++){ + for(i = 0; i < m * COMPSIZE; i++){ + a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + } + + for (i = 0; i < m * COMPSIZE; ++i) b[i] = 0.; + + for (j = 0; j < m; ++j) { + for (i = 0; i < m * COMPSIZE; ++i) { + b[i] += a[i + j * m * COMPSIZE]; + } + } + + gettimeofday( &start, (struct timezone *)0); + + GETRF (&m, &m, a, &m, ipiv, &info); + + gettimeofday( &stop, (struct timezone *)0); + + 
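+  /* GETRF reports failure through info: info < 0 flags an illegal argument,
+     info > 0 an exactly zero pivot (singular U), so any nonzero value aborts the run. */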
if (info) { + fprintf(stderr, "Matrix is not singular .. %d\n", info); + exit(1); + } + + time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + + gettimeofday( &start, (struct timezone *)0); + + GETRS("N", &m, &unit, a, &m, ipiv, b, &m, &info); + + gettimeofday( &stop, (struct timezone *)0); + + if (info) { + fprintf(stderr, "Matrix is not singular .. %d\n", info); + exit(1); + } + + time2 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + + maxerr = 0.; + + for(i = 0; i < m; i++){ +#ifndef XDOUBLE + if (maxerr < fabs(b[i * COMPSIZE] - 1.0)) maxerr = fabs(b[i * COMPSIZE] - 1.0); +#ifdef COMPLEX + if (maxerr < fabs(b[i * COMPSIZE] + 1)) maxerr = fabs(b[i * COMPSIZE + 1]); +#endif +#else + if (maxerr < fabsl(b[i * COMPSIZE] - 1.0L)) maxerr = fabsl(b[i * COMPSIZE] - 1.0L); +#ifdef COMPLEX + if (maxerr < fabsl(b[i * COMPSIZE] + 1)) maxerr = fabsl(b[i * COMPSIZE + 1]); +#endif +#endif + } + +#ifdef XDOUBLE + fprintf(stderr," %Le ", maxerr); +#else + fprintf(stderr," %e ", maxerr); +#endif + + fprintf(stderr, + " %10.2f MFlops %10.2f MFlops %10.2f MFlops\n", + COMPSIZE * COMPSIZE * 2. / 3. * (double)m * (double)m * (double)m / time1 * 1.e-6, + COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / time2 * 1.e-6, + COMPSIZE * COMPSIZE * (2. / 3. * (double)m * (double)m * (double)m + 2. * (double)m * (double)m) / (time1 + time2) * 1.e-6); + +#if 0 + if ( +#ifdef DOUBLE + maxerr > 1.e-8 +#else + maxerr > 1.e-1 +#endif + ) { + fprintf(stderr, "Error is too large.\n"); + exit(1); + } +#endif + + } + + return 0; +} + +void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); diff --git a/c_check b/c_check new file mode 100644 index 0000000000..d8025f9f33 --- /dev/null +++ b/c_check @@ -0,0 +1,254 @@ +#!/usr/bin/perl + +# Checking cross compile +$hostos = `uname -s | sed -e s/\-.*//`; chop($hostos); +$hostarch = `uname -m | sed -e s/i.86/x86/`;chop($hostarch); + +$binary = $ENV{"BINARY"}; + +$makefile = shift(@ARGV); +$config = shift(@ARGV); + +$compiler_name = join(" ", @ARGV); + +# First, we need to know the target OS and compiler name + +$data = `$compiler_name -E ctest.c`; + +if ($?) 
{ + printf STDERR "C Compiler ($compiler_name) is something wrong.\n"; + die 1; +} + +$cross_suffix = ""; + +if ($ARGV[0] =~ /(.*)(-[.\d]+)/) { + if ($1 =~ /(.*-)(.*)/) { + $cross_suffix = $1; + } +} else { + if ($ARGV[0] =~ /(.*-)(.*)/) { + $cross_suffix = $1; + } +} + +$compiler = ""; +$compiler = PGI if ($data =~ /COMPILER_PGI/); +$compiler = PATHSCALE if ($data =~ /COMPILER_PATHSCALE/); +$compiler = INTEL if ($data =~ /COMPILER_INTEL/); +$compiler = OPEN64 if ($data =~ /COMPILER_OPEN64/); +$compiler = SUN if ($data =~ /COMPILER_SUN/); +$compiler = IBM if ($data =~ /COMPILER_IBM/); +$compiler = DEC if ($data =~ /COMPILER_DEC/); +$compiler = GCC if ($compiler eq ""); + +$os = Linux if ($data =~ /OS_LINUX/); +$os = FreeBSD if ($data =~ /OS_FreeBSD/); +$os = NetBSD if ($data =~ /OS_NetBSD/); +$os = Darwin if ($data =~ /OS_Darwin/); +$os = SunOS if ($data =~ /OS_SunOS/); +$os = AIX if ($data =~ /OS_AIX/); +$os = osf if ($data =~ /OS_OSF/); +$os = WINNT if ($data =~ /OS_WINNT/); +$os = CYGWIN_NT if ($data =~ /OS_CYGWIN/); +$os = Interix if ($data =~ /OS_INTERIX/); + +$architecture = x86 if ($data =~ /ARCH_X86/); +$architecture = x86_64 if ($data =~ /ARCH_X86_64/); +$architecture = power if ($data =~ /ARCH_POWER/); +$architecture = mips32 if ($data =~ /ARCH_MIPS32/); +$architecture = mips64 if ($data =~ /ARCH_MIPS64/); +$architecture = alpha if ($data =~ /ARCH_ALPHA/); +$architecture = sparc if ($data =~ /ARCH_SPARC/); +$architecture = ia64 if ($data =~ /ARCH_IA64/); + +$defined = 0; + +if ($os eq "AIX") { + $compiler_name .= " -maix32" if ($binary eq "32"); + $compiler_name .= " -maix64" if ($binary eq "64"); + $defined = 1; +} + +if (($architecture eq "mips32") || ($architecture eq "mips64")) { + $compiler_name .= " -mabi=n32" if ($binary eq "32"); + $compiler_name .= " -mabi=64" if ($binary eq "64"); + $defined = 1; +} + +if ($architecture eq "alpha") { + $defined = 1; + $binary = 64; +} + +if ($architecture eq "ia64") { + $defined = 1; + $binary = 64; +} + +if (($architecture eq "x86") && ($os ne Darwin) && ($os ne SunOS)) { + $defined = 1; + $binary =32; +} + +if ($compiler eq "PGI") { + $compiler_name .= " -tp p7" if ($binary eq "32"); + $compiler_name .= " -tp p7-64" if ($binary eq "64"); + $openmp = "-mp"; + $defined = 1; +} + +if ($compiler eq "IBM") { + $compiler_name .= " -q32" if ($binary eq "32"); + $compiler_name .= " -q64" if ($binary eq "64"); + $openmp = "-qsmp=omp"; + $defined = 1; +} + +if ($compiler eq "INTEL") { + $openmp = "-openmp"; +} + +if ($compiler eq "PATHSCALE") { + $openmp = "-mp"; +} + +if ($compiler eq "OPEN64") { + $openmp = "-mp"; +} + +if ($compiler eq "GCC") { + $openmp = "-fopenmp"; +} + +if ($defined == 0) { + $compiler_name .= " -m32" if ($binary eq "32"); + $compiler_name .= " -m64" if ($binary eq "64"); +} + +# Do again + +$data = `$compiler_name -E ctest.c`; + +if ($?) 
{ + printf STDERR "C Compiler ($compiler_name) is something wrong.\n"; + die 1; +} + +$architecture = x86 if ($data =~ /ARCH_X86/); +$architecture = x86_64 if ($data =~ /ARCH_X86_64/); +$architecture = power if ($data =~ /ARCH_POWER/); +$architecture = mips32 if ($data =~ /ARCH_MIPS32/); +$architecture = mips64 if ($data =~ /ARCH_MIPS64/); +$architecture = alpha if ($data =~ /ARCH_ALPHA/); +$architecture = sparc if ($data =~ /ARCH_SPARC/); +$architecture = ia64 if ($data =~ /ARCH_IA64/); + +$binformat = bin32; +$binformat = bin64 if ($data =~ /BINARY_64/); + +$data = `$compiler_name -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`; + +$data =~ /globl\ ([_\.]*)(.*)/; + +$need_fu = $1; + +$cross = 0; +$cross = 1 if ($os ne $hostos); + +if ($architecture ne $hostarch) { + $cross = 1; + $cross = 0 if (($hostarch eq "x86_64") && ($architecture eq "x86")); + $cross = 0 if (($hostarch eq "mips64") && ($architecture eq "mips")); +} + +$openmp = "" if $ENV{USE_OPENMP} != 1; + +$linker_L = ""; +$linker_l = ""; +$linker_a = ""; + +{ + $link = `$compiler_name -c ctest2.c -o ctest2.o 2>&1 && $compiler_name $openmp -v ctest2.o -o ctest2 2>&1 && rm -f ctest2.o ctest2 ctest2.exe`; + + $link =~ s/\-Y\sP\,/\-Y/g; + + @flags = split(/[\s\,\n]/, $link); + + foreach $flags (@flags) { + if ( + ($flags =~ /^\-L/) + && ($flags !~ /^-LIST:/) + && ($flags !~ /^-LANG:/) + ) { + $linker_L .= $flags . " " + } + + if ($flags =~ /^\-Y/) { + $linker_L .= "-Wl,". $flags . " " + } + + if ( + ($flags =~ /^\-l/) + && ($flags !~ /gfortranbegin/) + && ($flags !~ /frtbegin/) + && ($flags !~ /pathfstart/) + && ($flags !~ /numa/) + && ($flags !~ /crt[0-9]/) + && ($flags !~ /gcc/) + && ($flags !~ /user32/) + && ($flags !~ /kernel32/) + && ($flags !~ /advapi32/) + && ($flags !~ /shell32/) + ) { + $linker_l .= $flags . " " + } + + $linker_a .= $flags . 
" " if $flags =~ /\.a$/; + } + +} + +open(MAKEFILE, "> $makefile") || die "Can't create $makefile"; +open(CONFFILE, "> $config" ) || die "Can't create $config"; + +# print $data, "\n"; + +print MAKEFILE "OSNAME=$os\n"; +print MAKEFILE "ARCH=$architecture\n"; +print MAKEFILE "C_COMPILER=$compiler\n"; +print MAKEFILE "BINARY32=\n" if $binformat ne bin32; +print MAKEFILE "BINARY64=\n" if $binformat ne bin64; +print MAKEFILE "BINARY32=1\n" if $binformat eq bin32; +print MAKEFILE "BINARY64=1\n" if $binformat eq bin64; +print MAKEFILE "FU=$need_fu\n" if $need_fu ne ""; +print MAKEFILE "CROSS_SUFFIX=$cross_suffix\n" if $cross_suffix ne ""; +print MAKEFILE "CROSS=1\n" if $cross != 0; +print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n"; + +$os =~ tr/[a-z]/[A-Z]/; +$architecture =~ tr/[a-z]/[A-Z]/; +$compiler =~ tr/[a-z]/[A-Z]/; + +print CONFFILE "#define OS_$os\t1\n"; +print CONFFILE "#define ARCH_$architecture\t1\n"; +print CONFFILE "#define C_$compiler\t1\n"; +print CONFFILE "#define __32BIT__\t1\n" if $binformat eq bin32; +print CONFFILE "#define __64BIT__\t1\n" if $binformat eq bin64; +print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne ""; + +if ($os eq "LINUX") { + + @pthread = split(/\s+/, `nm /lib/libpthread.so* | grep _pthread_create`); + + if ($pthread[2] ne "") { + print CONFFILE "#define PTHREAD_CREATE_FUNC $pthread[2]\n"; + } else { + print CONFFILE "#define PTHREAD_CREATE_FUNC pthread_create\n"; + } +} else { + print CONFFILE "#define PTHREAD_CREATE_FUNC pthread_create\n"; +} + +close(MAKEFILE); +close(CONFFILE); diff --git a/cblas.h b/cblas.h new file mode 100644 index 0000000000..ea0fbb629d --- /dev/null +++ b/cblas.h @@ -0,0 +1,273 @@ +#ifndef CBLAS_H +#define CBLAS_H + +#define CBLAS_INDEX size_t + +enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102}; +enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114}; +enum CBLAS_UPLO {CblasUpper=121, CblasLower=122}; +enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132}; +enum CBLAS_SIDE {CblasLeft=141, CblasRight=142}; + +float cblas_sdsdot(blasint n, float, float *x, blasint incx, float *y, blasint incy); +double cblas_dsdot (blasint n, float *x, blasint incx, float *y, blasint incy); +float cblas_sdot(blasint n, float *x, blasint incx, float *y, blasint incy); +double cblas_ddot(blasint n, double *x, blasint incx, double *y, blasint incy); + +float _Complex cblas_cdotu(blasint n, float *x, blasint incx, float *y, blasint incy); +float _Complex cblas_cdotc(blasint n, float *x, blasint incx, float *y, blasint incy); +double _Complex cblas_zdotu(blasint n, double *x, blasint incx, double *y, blasint incy); +double _Complex cblas_zdotc(blasint n, double *x, blasint incx, double *y, blasint incy); + +void cblas_cdotu_sub(blasint n, float *x, blasint incx, float *y, blasint incy, float _Complex *ret); +void cblas_cdotc_sub(blasint n, float *x, blasint incx, float *y, blasint incy, float _Complex *ret); +void cblas_zdotu_sub(blasint n, double *x, blasint incx, double *y, blasint incy, double _Complex *ret); +void cblas_zdotc_sub(blasint n, double *x, blasint incx, double *y, blasint incy, double _Complex *ret); + +float cblas_sasum (blasint n, float *x, blasint incx); +double cblas_dasum (blasint n, double *x, blasint incx); +float cblas_scasum(blasint n, float *x, blasint incx); +double cblas_dzasum(blasint n, double *x, blasint incx); + +float cblas_snrm2 (blasint N, float *X, blasint incX); +double cblas_dnrm2 (blasint N, double *X, blasint incX); +float cblas_scnrm2(blasint N, 
float *X, blasint incX); +double cblas_dznrm2(blasint N, double *X, blasint incX); + +CBLAS_INDEX cblas_isamax(blasint n, float *x, blasint incx); +CBLAS_INDEX cblas_idamax(blasint n, double *x, blasint incx); +CBLAS_INDEX cblas_icamax(blasint n, float *x, blasint incx); +CBLAS_INDEX cblas_izamax(blasint n, double *x, blasint incx); + +void cblas_saxpy(blasint n, float, float *x, blasint incx, float *y, blasint incy); +void cblas_daxpy(blasint n, double, double *x, blasint incx, double *y, blasint incy); +void cblas_caxpy(blasint n, float *, float *x, blasint incx, float *y, blasint incy); +void cblas_zaxpy(blasint n, double *, double *x, blasint incx, double *y, blasint incy); + +void cblas_scopy(blasint n, float *x, blasint incx, float *y, blasint incy); +void cblas_dcopy(blasint n, double *x, blasint incx, double *y, blasint incy); +void cblas_ccopy(blasint n, float *x, blasint incx, float *y, blasint incy); +void cblas_zcopy(blasint n, double *x, blasint incx, double *y, blasint incy); + +void cblas_sswap(blasint n, float *x, blasint incx, float *y, blasint incy); +void cblas_dswap(blasint n, double *x, blasint incx, double *y, blasint incy); +void cblas_cswap(blasint n, float *x, blasint incx, float *y, blasint incy); +void cblas_zswap(blasint n, double *x, blasint incx, double *y, blasint incy); + +void cblas_srot(blasint N, float *X, blasint incX, float *Y, blasint incY, float c, float s); +void cblas_drot(blasint N, double *X, blasint incX, double *Y, blasint incY, double c, double s); + +void cblas_srotg(float *a, float *b, float *c, float *s); +void cblas_drotg(double *a, double *b, double *c, double *s); + +void cblas_srotm(blasint N, float *X, blasint incX, float *Y, blasint incY, float *P); +void cblas_drotm(blasint N, double *X, blasint incX, double *Y, blasint incY, double *P); + +void cblas_srotmg(float *d1, float *d2, float *b1, float b2, float *P); +void cblas_drotmg(double *d1, double *d2, double *b1, double b2, double *P); + +void cblas_sscal(blasint N, float alpha, float *X, blasint incX); +void cblas_dscal(blasint N, double alpha, double *X, blasint incX); +void cblas_cscal(blasint N, float *alpha, float *X, blasint incX); +void cblas_zscal(blasint N, double *alpha, double *X, blasint incX); +void cblas_csscal(blasint N, float alpha, float *X, blasint incX); +void cblas_zdscal(blasint N, double alpha, double *X, blasint incX); + +void cblas_sgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, + float alpha, float *a, blasint lda, float *x, blasint incx, float beta, float *y, blasint incy); +void cblas_dgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, + double alpha, double *a, blasint lda, double *x, blasint incx, double beta, double *y, blasint incy); +void cblas_cgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, + float *alpha, float *a, blasint lda, float *x, blasint incx, float *beta, float *y, blasint incy); +void cblas_zgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, + double *alpha, double *a, blasint lda, double *x, blasint incx, double *beta, double *y, blasint incy); + +void cblas_sger (enum CBLAS_ORDER order, blasint M, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); +void cblas_dger (enum CBLAS_ORDER order, blasint M, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); +void cblas_cgeru(enum CBLAS_ORDER order, blasint M, blasint N, float *alpha, 
float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); +void cblas_cgerc(enum CBLAS_ORDER order, blasint M, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); +void cblas_zgeru(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); +void cblas_zgerc(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); + +void cblas_strsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); +void cblas_dtrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); +void cblas_ctrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); +void cblas_ztrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); + +void cblas_strmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); +void cblas_dtrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); +void cblas_ctrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); +void cblas_ztrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); + +void cblas_ssyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda); +void cblas_dsyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda); +void cblas_cher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda); +void cblas_zher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda); + +void cblas_ssyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,blasint N, float alpha, float *X, + blasint incX, float *Y, blasint incY, float *A, blasint lda); +void cblas_dsyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, + blasint incX, double *Y, blasint incY, double *A, blasint lda); +void cblas_cher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX, + float *Y, blasint incY, float *A, blasint lda); +void cblas_zher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX, + double *Y, blasint incY, double *A, blasint lda); + +void cblas_sgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, + blasint KL, blasint KU, float alpha, float *A, blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); +void cblas_dgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, + blasint KL, blasint KU, double alpha, double *A, blasint lda, double *X, blasint incX, double beta, 
double *Y, blasint incY); +void cblas_cgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, + blasint KL, blasint KU, float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); +void cblas_zgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, + blasint KL, blasint KU, double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); + +void cblas_ssbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, float alpha, float *A, + blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); +void cblas_dsbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, double alpha, double *A, + blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); + + +void cblas_stbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); +void cblas_dtbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); +void cblas_ctbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); +void cblas_ztbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); + +void cblas_stbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); +void cblas_dtbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); +void cblas_ctbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); +void cblas_ztbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); + +void cblas_stpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, float *Ap, float *X, blasint incX); +void cblas_dtpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, double *Ap, double *X, blasint incX); +void cblas_ctpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, float *Ap, float *X, blasint incX); +void cblas_ztpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, double *Ap, double *X, blasint incX); + +void cblas_stpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, float *Ap, float *X, blasint incX); +void cblas_dtpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, double *Ap, double *X, blasint incX); +void cblas_ctpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, float *Ap, float *X, blasint incX); +void cblas_ztpsv(enum CBLAS_ORDER order, 
enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint N, double *Ap, double *X, blasint incX); + +void cblas_ssymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *A, + blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); +void cblas_dsymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *A, + blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); +void cblas_chemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *A, + blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); +void cblas_zhemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *A, + blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); + + +void cblas_sspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *Ap, + float *X, blasint incX, float beta, float *Y, blasint incY); +void cblas_dspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *Ap, + double *X, blasint incX, double beta, double *Y, blasint incY); + +void cblas_sspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Ap); +void cblas_dspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Ap); + +void cblas_chpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A); +void cblas_zhpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X,blasint incX, double *A); + +void cblas_sspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A); +void cblas_dspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A); +void cblas_chpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *Ap); +void cblas_zhpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *Ap); + +void cblas_chbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, + float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); +void cblas_zhbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, + double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); + +void cblas_chpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, + float *alpha, float *Ap, float *X, blasint incX, float *beta, float *Y, blasint incY); +void cblas_zhpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, + double *alpha, double *Ap, double *X, blasint incX, double *beta, double *Y, blasint incY); + +void cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, + float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); +void cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, + double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); +void cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, 
blasint M, blasint N, blasint K, + float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); +void cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, + double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); + +void cblas_ssymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, + float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); +void cblas_dsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, + double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); +void cblas_csymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, + float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); +void cblas_zsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, + double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); + +void cblas_ssyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, + blasint N, blasint K, float alpha, float *A, blasint lda, float beta, float *C, blasint ldc); +void cblas_dsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, + blasint N, blasint K, double alpha, double *A, blasint lda, double beta, double *C, blasint ldc); +void cblas_csyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, + blasint N, blasint K, float *alpha, float *A, blasint lda, float *beta, float *C, blasint ldc); +void cblas_zsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, + blasint N, blasint K, double *alpha, double *A, blasint lda, double *beta, double *C, blasint ldc); + +void cblas_ssyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, + blasint N, blasint K, float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); +void cblas_dsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, + blasint N, blasint K, double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); +void cblas_csyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, + blasint N, blasint K, float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); +void cblas_zsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, + blasint N, blasint K, double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); + +void cblas_strmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, + enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb); +void cblas_dtrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, + enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb); +void cblas_ctrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, + enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb); +void cblas_ztrmm(enum CBLAS_ORDER Order, enum 
CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, + enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb); + +void cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, + enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb); +void cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, + enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb); +void cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, + enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb); +void cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, + enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb); + +void cblas_chemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, + float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); +void cblas_zhemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, + double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); + +void cblas_cherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, + float alpha, float *A, blasint lda, float beta, float *C, blasint ldc); +void cblas_zherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, + double alpha, double *A, blasint lda, double beta, double *C, blasint ldc); + +void cblas_cher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, + float *alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); +void cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, + double *alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); + +void cblas_xerbla(blasint p, char *rout, char *form, ...); +#endif diff --git a/common.h b/common.h new file mode 100644 index 0000000000..a481b2acb6 --- /dev/null +++ b/common.h @@ -0,0 +1,610 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef COMMON_H +#define COMMON_H + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#ifndef __USE_XOPEN +#define __USE_XOPEN +#endif + +#ifndef __USE_SVID +#define __USE_SVID +#endif + +#ifdef BUILD_KERNEL +#include "config_kernel.h" +#else +#include "config.h" +#endif + +#undef ENABLE_SSE_EXCEPTION + +#if defined(SMP_SERVER) || defined(SMP_ONDEMAND) +#define SMP +#endif + +#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_Interix) +#define WINDOWS_ABI +#define OS_WINDOWS + +#ifdef DOUBLE +#define DOUBLE_DEFINED DOUBLE +#undef DOUBLE +#endif +#endif + +#if !defined(NOINCLUDE) && !defined(ASSEMBLER) +#include +#include +#include +#include + +#ifdef OS_LINUX +#include +#include +#endif + +#ifdef OS_WINDOWS +#ifdef ATOM +#define GOTO_ATOM ATOM +#undef ATOM +#endif +#include +#include +#ifdef GOTO_ATOM +#define ATOM GOTO_ATOM +#undef GOTO_ATOM +#endif +#else +#include +#include +#include +#include +#include +#ifdef SMP +#include +#endif +#endif + +#if defined(OS_SUNOS) +#include +#endif + +#ifdef __DECC +#include +#include +#endif + +#if defined(ARCH_IA64) && defined(ENABLE_SSE_EXCEPTION) +#include +#endif + +#endif + +#if defined(OS_WINDOWS) && defined(DOUBLE_DEFINED) +#define DOUBLE DOUBLE_DEFINED +#undef DOUBLE_DEFINED +#endif + +#undef DEBUG_INFO +#define SMP_DEBUG +#undef MALLOC_DEBUG +#undef SMP_ALLOC_DEBUG + +#ifndef ZERO +#ifdef XDOUBLE +#define ZERO 0.e0L +#elif defined DOUBLE +#define ZERO 0.e0 +#else +#define ZERO 0.e0f +#endif +#endif + +#ifndef ONE +#ifdef XDOUBLE +#define ONE 1.e0L +#elif defined DOUBLE +#define ONE 1.e0 +#else +#define ONE 1.e0f +#endif +#endif + +#define BITMASK(a, b, c) ((((a) >> (b)) & (c))) + +#define ALLOCA_ALIGN 63UL + +#define NUM_BUFFERS (MAX_CPU_NUMBER * 2) + +#ifdef NEEDBUNDERSCORE +#define BLASFUNC(FUNC) FUNC##_ +#else +#define BLASFUNC(FUNC) FUNC +#endif + +#undef USE_PTHREAD_LOCK +#undef USE_PTHREAD_SPINLOCK + +#if defined(USE_PTHREAD_LOCK) && defined(USE_PTHREAD_SPINLOCK) +#error "You can't specify both LOCK operation!" 
+#endif + +#ifdef SMP +#define USE_PTHREAD_LOCK +#undef USE_PTHREAD_SPINLOCK +#endif + +#ifdef OS_WINDOWS +#undef USE_PTHREAD_LOCK +#undef USE_PTHREAD_SPINLOCK +#endif + +#if defined(USE_PTHREAD_LOCK) +#define LOCK_COMMAND(x) pthread_mutex_lock(x) +#define UNLOCK_COMMAND(x) pthread_mutex_unlock(x) +#elif defined(USE_PTHREAD_SPINLOCK) +#ifndef ASSEMBLER +typedef volatile int pthread_spinlock_t; +int pthread_spin_lock (pthread_spinlock_t *__lock); +int pthread_spin_unlock (pthread_spinlock_t *__lock); +#endif +#define LOCK_COMMAND(x) pthread_spin_lock(x) +#define UNLOCK_COMMAND(x) pthread_spin_unlock(x) +#else +#define LOCK_COMMAND(x) blas_lock(x) +#define UNLOCK_COMMAND(x) blas_unlock(x) +#endif + +#define GOTO_SHMID 0x510510 + +#if 0 +#ifndef __CUDACC__ +#define __global__ +#define __device__ +#define __host__ +#define __shared__ +#endif +#endif + +#ifndef ASSEMBLER + +#ifdef QUAD_PRECISION +typedef struct { + unsigned long x[2]; +} xdouble; +#elif defined EXPRECISION +#define xdouble long double +#else +#define xdouble double +#endif + +#if defined(OS_WINDOWS) && defined(__64BIT__) +typedef long long BLASLONG; +typedef unsigned long long BLASULONG; +#else +typedef long BLASLONG; +typedef unsigned long BLASULONG; +#endif + +#ifdef USE64BITINT +typedef BLASLONG blasint; +#else +typedef int blasint; +#endif +#else +#ifdef USE64BITINT +#define INTSHIFT 3 +#define INTSIZE 8 +#else +#define INTSHIFT 2 +#define INTSIZE 4 +#endif +#endif + +#ifdef XDOUBLE +#define FLOAT xdouble +#ifdef QUAD_PRECISION +#define XFLOAT xidouble +#endif +#ifdef QUAD_PRECISION +#define SIZE 32 +#define BASE_SHIFT 5 +#define ZBASE_SHIFT 6 +#else +#define SIZE 16 +#define BASE_SHIFT 4 +#define ZBASE_SHIFT 5 +#endif +#elif defined(DOUBLE) +#define FLOAT double +#define SIZE 8 +#define BASE_SHIFT 3 +#define ZBASE_SHIFT 4 +#else +#define FLOAT float +#define SIZE 4 +#define BASE_SHIFT 2 +#define ZBASE_SHIFT 3 +#endif + +#ifndef XFLOAT +#define XFLOAT FLOAT +#endif + +#ifndef COMPLEX +#define COMPSIZE 1 +#else +#define COMPSIZE 2 +#endif + +#if defined(C_PGI) || defined(C_SUN) +#define CREAL(X) (*((FLOAT *)&X + 0)) +#define CIMAG(X) (*((FLOAT *)&X + 1)) +#else +#define CREAL __real__ +#define CIMAG __imag__ +#endif + +#define Address_H(x) (((x)+(1<<15))>>16) +#define Address_L(x) ((x)-((Address_H(x))<<16)) + +#ifndef MAX_CPU_NUMBER +#define MAX_CPU_NUMBER 2 +#endif + +#if defined(OS_SUNOS) +#define YIELDING thr_yield() +#endif + +#if defined(OS_WINDOWS) +#define YIELDING SwitchToThread() +#endif + +#ifndef YIELDING +#define YIELDING sched_yield() +#endif + +#ifdef QUAD_PRECISION +#include "common_quad.h" +#endif + +#ifdef ARCH_ALPHA +#include "common_alpha.h" +#endif + +#ifdef ARCH_X86 +#include "common_x86.h" +#endif + +#ifdef ARCH_X86_64 +#include "common_x86_64.h" +#endif + +#ifdef ARCH_IA64 +#include "common_ia64.h" +#endif + +#ifdef ARCH_POWER +#include "common_power.h" +#endif + +#ifdef sparc +#include "common_sparc.h" +#endif + +#ifdef ARCH_MIPS64 +#include "common_mips64.h" +#endif + +#ifdef OS_LINUX +#include "common_linux.h" +#endif + +#define MMAP_ACCESS (PROT_READ | PROT_WRITE) +#define MMAP_POLICY (MAP_PRIVATE | MAP_ANONYMOUS) + +#include "param.h" +#include "common_param.h" + +#ifndef STDERR +#define STDERR stderr +#endif + +#ifndef MASK +#define MASK(a, b) (((a) + ((b) - 1)) & ~((b) - 1)) +#endif + +#if defined(XDOUBLE) || defined(DOUBLE) +#define FLOATRET FLOAT +#else +#ifdef NEED_F2CCONV +#define FLOATRET double +#else +#define FLOATRET float +#endif +#endif + +#ifndef IFLUSH +#define IFLUSH +#endif + 
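Two of the helper macros defined in the hunk above, BITMASK and MASK, are easy to misread in this flattened form, so the following standalone C sketch (not part of the imported sources) simply restates them and prints what they compute; the register value fed to BITMASK is an arbitrary example chosen for illustration.

/* Minimal sketch: BITMASK(a, b, c) shifts `a` right by `b` bits and keeps
   the bits selected by `c`; MASK(a, b) rounds `a` up to the next multiple
   of the power-of-two `b` (e.g. for buffer alignment). */
#include <stdio.h>

#define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
#define MASK(a, b) (((a) + ((b) - 1)) & ~((b) - 1))

int main(void) {
  unsigned int reg = 0x000206c2u;                          /* arbitrary example value */
  printf("bits 8..11    : 0x%x\n", BITMASK(reg, 8, 0x0f)); /* prints 0x6  */
  printf("MASK(1000, 64): %d\n", MASK(1000, 64));          /* prints 1024 */
  return 0;
}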
+#ifndef IFLUSH_HALF +#define IFLUSH_HALF +#endif + +#if defined(C_GCC) && (( __GNUC__ <= 3) || ((__GNUC__ == 4) && (__GNUC_MINOR__ < 2))) +#ifdef USE_OPENMP +#undef USE_OPENMP +#endif +#endif + +#ifndef ASSEMBLER + +#ifndef MIN +#define MIN(a,b) (a>b? b:a) +#endif + +#ifndef MAX +#define MAX(a,b) (a<b? b:a) +#endif + +#define TOUPPER(a) {if ((a) > 0x60) (a) -= 0x20;} + +#if defined(__FreeBSD__) || defined(__APPLE__) +#define MAP_ANONYMOUS MAP_ANON +#endif + +/* Common Memory Management Routine */ +void blas_set_parameter(void); +int blas_get_cpu_number(void); +void *blas_memory_alloc (int); +void blas_memory_free (void *); + +int get_num_procs (void); + +#if defined(OS_LINUX) && defined(SMP) && !defined(NO_AFFINITY) +int get_num_nodes (void); +int get_num_proc (int); +int get_node_equal (void); +#endif + +void goto_set_num_threads(int); + +void gotoblas_affinity_init(void); +void gotoblas_affinity_quit(void); +void gotoblas_dynamic_init(void); +void gotoblas_dynamic_quit(void); +void gotoblas_profile_init(void); +void gotoblas_profile_quit(void); + +#ifdef USE_OPENMP +int omp_in_parallel(void); +int omp_get_num_procs(void); +#else +#ifdef __ELF__ +int omp_in_parallel (void) __attribute__ ((weak)); +int omp_get_num_procs(void) __attribute__ ((weak)); +#endif +#endif + +static __inline void blas_unlock(volatile BLASULONG *address){ + MB; + *address = 0; +} + +static __inline int readenv(char *env) { + + char *p; + + p = getenv(env); + + if (p == NULL) return 0; else return atoi(p); +} + + +#if !defined(XDOUBLE) || !defined(QUAD_PRECISION) + +static __inline void compinv(FLOAT *b, FLOAT ar, FLOAT ai){ + +#ifndef UNIT + FLOAT ratio, den; + + if ( +#ifdef XDOUBLE + (fabsl(ar)) >= (fabsl(ai)) +#elif defined DOUBLE + (fabs (ar)) >= (fabs (ai)) +#else + (fabsf(ar)) >= (fabsf(ai)) +#endif + ) { + ratio = ai / ar; + den = (FLOAT)(ONE / (ar * (ONE + ratio * ratio))); + ar = den; + ai = -ratio * den; + } else { + ratio = ar / ai; + den = (FLOAT)(ONE /(ai * (ONE + ratio * ratio))); + ar = ratio * den; + ai = -den; + } + b[0] = ar; + b[1] = ai; +#else + b[0] = ONE; + b[1] = ZERO; +#endif + +} +#endif + +#ifdef MALLOC_DEBUG +void *blas_debug_alloc(int); +void *blas_debug_free(void *); +#undef malloc +#undef free +#define malloc(a) blas_debug_alloc(a) +#define free(a) blas_debug_free (a) +#endif + +#ifndef COPYOVERHEAD +#define GEMMRETTYPE int +#else + +typedef struct { + double outercopy; + double innercopy; + double kernel; + double mflops; +} copyoverhead_t; + +#define GEMMRETTYPE copyoverhead_t +#endif +#endif + +#ifndef BUILD_KERNEL +#define KNAME(A, B) A +#else +#define KNAME(A, B) A##B +#endif + +#include "common_interface.h" +#ifdef SANITY_CHECK +#include "common_reference.h" +#endif +#include "common_macro.h" +#include "common_level1.h" +#include "common_level2.h" +#include "common_level3.h" +#include "common_lapack.h" +#ifdef CBLAS +#include "cblas.h" +#endif + +#ifndef ASSEMBLER +#if 0 +#include "symcopy.h" +#endif + +#if defined(SMP_SERVER) && defined(SMP_ONDEMAND) +#error Both SMP_SERVER and SMP_ONDEMAND are specified. 
+#endif + +#if defined(SMP_SERVER) || defined(SMP_ONDEMAND) +#include "common_thread.h" +#endif + +#endif + +#define INFO_NUM 99 + +#ifndef DEFAULT_CPU_NUMBER +#define DEFAULT_CPU_NUMBER 4 +#endif + +#ifndef IDEBUG_START +#define IDEBUG_START +#endif + +#ifndef IDEBUG_END +#define IDEBUG_END +#endif + +#if !defined(ASSEMBLER) && defined(FUNCTION_PROFILE) + +typedef struct { + int func; + unsigned long long calls, fops, area, cycles, tcycles; +} func_profile_t; + +extern func_profile_t function_profile_table[]; +extern int gotoblas_profile; + +#ifdef XDOUBLE +#define NUMOPT QNUMOPT +#elif defined DOUBLE +#define NUMOPT DNUMOPT +#else +#define NUMOPT SNUMOPT +#endif + +#define FUNCTION_PROFILE_START() { unsigned long long profile_start = rpcc(), profile_end; +#ifdef SMP +#define FUNCTION_PROFILE_END(COMP, AREA, OPS) \ + if (gotoblas_profile) { \ + profile_end = rpcc(); \ + function_profile_table[PROFILE_FUNC_NAME].calls ++; \ + function_profile_table[PROFILE_FUNC_NAME].area += SIZE * COMPSIZE * (AREA); \ + function_profile_table[PROFILE_FUNC_NAME].fops += (COMP) * (OPS) / NUMOPT; \ + function_profile_table[PROFILE_FUNC_NAME].cycles += (profile_end - profile_start); \ + function_profile_table[PROFILE_FUNC_NAME].tcycles += blas_cpu_number * (profile_end - profile_start); \ + } \ + } +#else +#define FUNCTION_PROFILE_END(COMP, AREA, OPS) \ + if (gotoblas_profile) { \ + profile_end = rpcc(); \ + function_profile_table[PROFILE_FUNC_NAME].calls ++; \ + function_profile_table[PROFILE_FUNC_NAME].area += SIZE * COMPSIZE * (AREA); \ + function_profile_table[PROFILE_FUNC_NAME].fops += (COMP) * (OPS) / NUMOPT; \ + function_profile_table[PROFILE_FUNC_NAME].cycles += (profile_end - profile_start); \ + function_profile_table[PROFILE_FUNC_NAME].tcycles += (profile_end - profile_start); \ + } \ + } +#endif + +#else +#define FUNCTION_PROFILE_START() +#define FUNCTION_PROFILE_END(COMP, AREA, OPS) +#endif + +#if 1 +#define PRINT_DEBUG_CNAME +#define PRINT_DEBUG_NAME +#else +#define PRINT_DEBUG_CNAME if (readenv("GOTO_DEBUG")) fprintf(stderr, "GotoBLAS : %s\n", CHAR_CNAME) +#define PRINT_DEBUG_NAME if (readenv("GOTO_DEBUG")) fprintf(stderr, "GotoBLAS : %s\n", CHAR_NAME) +#endif + +#endif diff --git a/common_alpha.h b/common_alpha.h new file mode 100644 index 0000000000..cf794739ca --- /dev/null +++ b/common_alpha.h @@ -0,0 +1,179 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef COMMON_ALPHA +#define COMMON_ALPHA + +#ifndef ASSEMBLER + +#define MB asm("mb") +#define WMB asm("wmb") + +static void __inline blas_lock(unsigned long *address){ +#ifndef __DECC + unsigned long tmp1, tmp2; + asm volatile( + "1: ldq %1, %0\n" + " bne %1, 2f\n" + " ldq_l %1, %0\n" + " bne %1, 2f\n" + " or %1, 1, %2\n" + " stq_c %2, %0\n" + " beq %2, 2f\n" + " mb\n " + " br $31, 3f\n" + "2: br $31, 1b\n" + "3:\n" : "=m"(*address), "=&r"(tmp1), "=&r"(tmp2) : : "memory"); +#else + asm ( + "10:" + " ldq %t0, 0(%a0); " + " bne %t0, 20f; " + " ldq_l %t0, 0(%a0); " + " bne %t0, 20f; " + " or %t0, 1, %t1;" + " stq_c %t1, 0(%a0); " + " beq %t1, 20f; " + " mb; " + " br %r31,30f; " + "20: " + " br %r31,10b; " + "30:", address); +#endif +} + +static __inline unsigned int rpcc(void){ + + unsigned int r0; + +#ifndef __DECC + asm __volatile__("rpcc %0" : "=r"(r0) : : "memory"); +#else + r0 = asm("rpcc %v0"); +#endif + + return r0; +} + + +#define HALT ldq $0, 0($0) + +#ifndef __DECC +#define GET_IMAGE(res) asm __volatile__("fmov $f1, %0" : "=f"(res) : : "memory") +#else +#define GET_IMAGE(res) res = dasm("fmov $f1, %f0") +#endif + +#ifdef SMP +#ifdef USE64BITINT +static __inline long blas_quickdivide(long x, long y){ + return x/y; +} +#else +extern unsigned int blas_quick_divide_table[]; + +static __inline int blas_quickdivide(unsigned int x, unsigned int y){ + if (y <= 1) return x; + return (int)((x * (unsigned long)blas_quick_divide_table[y]) >> 32); +} +#endif +#endif + +#define BASE_ADDRESS ((0x1b0UL << 33) | (0x1c0UL << 23) | (0x000UL << 13)) + +#ifndef PAGESIZE +#define PAGESIZE ( 8UL << 10) +#define HUGE_PAGESIZE ( 4 << 20) +#endif +#define BUFFER_SIZE (32UL << 20) + +#else + +#ifndef F_INTERFACE +#define REALNAME ASMNAME +#else +#define REALNAME ASMFNAME +#endif + +#define PROLOGUE \ + .arch ev6; \ + .set noat; \ + .set noreorder; \ +.text; \ + .align 5; \ + .globl REALNAME; \ + .ent REALNAME; \ +REALNAME: + +#ifdef PROFILE +#define PROFCODE \ + ldgp $gp, 0($27); \ + lda $28, _mcount; \ + jsr $28, ($28), _mcount; \ + .prologue 1 +#else +#define PROFCODE .prologue 0 +#endif + +#define EPILOGUE \ + .end REALNAME; \ + .ident VERSION +#endif + +#ifdef DOUBLE +#define SXADDQ s8addq +#define SXSUBL s8subl +#define LD ldt +#define ST stt +#define STQ stq +#define ADD addt/su +#define SUB subt/su +#define MUL mult/su +#define DIV divt/su +#else +#define SXADDQ s4addq +#define SXSUBL s4subl +#define LD lds +#define ST sts +#define STQ stl +#define ADD adds/su +#define SUB subs/su +#define MUL muls/su +#define DIV divs/su +#endif +#endif diff --git a/common_c.h b/common_c.h new file 
mode 100644 index 0000000000..f78f172134 --- /dev/null +++ b/common_c.h @@ -0,0 +1,611 @@ +#ifndef COMMON_C_H +#define COMMON_C_H + +#ifndef DYNAMIC_ARCH + +#define CAMAX_K camax_k +#define CAMIN_K camin_k +#define CMAX_K cmax_k +#define CMIN_K cmin_k +#define ICAMAX_K icamax_k +#define ICAMIN_K icamin_k +#define ICMAX_K icmax_k +#define ICMIN_K icmin_k +#define CASUM_K casum_k +#define CAXPYU_K caxpy_k +#define CAXPYC_K caxpyc_k +#define CCOPY_K ccopy_k +#define CDOTU_K cdotu_k +#define CDOTC_K cdotc_k +#define CNRM2_K cnrm2_k +#define CSCAL_K cscal_k +#define CSWAP_K cswap_k +#define CROT_K csrot_k + +#define CGEMV_N cgemv_n +#define CGEMV_T cgemv_t +#define CGEMV_R cgemv_r +#define CGEMV_C cgemv_c +#define CGEMV_O cgemv_o +#define CGEMV_U cgemv_u +#define CGEMV_S cgemv_s +#define CGEMV_D cgemv_d + +#define CGERU_K cgeru_k +#define CGERC_K cgerc_k +#define CGERV_K cgerv_k +#define CGERD_K cgerd_k + +#define CSYMV_U csymv_U +#define CSYMV_L csymv_L +#define CHEMV_U chemv_U +#define CHEMV_L chemv_L +#define CHEMV_V chemv_V +#define CHEMV_M chemv_M + +#define CSYMV_THREAD_U csymv_thread_U +#define CSYMV_THREAD_L csymv_thread_L +#define CHEMV_THREAD_U chemv_thread_U +#define CHEMV_THREAD_L chemv_thread_L +#define CHEMV_THREAD_V chemv_thread_V +#define CHEMV_THREAD_M chemv_thread_M + +#define CGEMM_ONCOPY cgemm_oncopy +#define CGEMM_OTCOPY cgemm_otcopy + +#if CGEMM_DEFAULT_UNROLL_M == CGEMM_DEFAULT_UNROLL_N +#define CGEMM_INCOPY cgemm_oncopy +#define CGEMM_ITCOPY cgemm_otcopy +#else +#define CGEMM_INCOPY cgemm_incopy +#define CGEMM_ITCOPY cgemm_itcopy +#endif + +#define CTRMM_OUNUCOPY ctrmm_ounucopy +#define CTRMM_OUNNCOPY ctrmm_ounncopy +#define CTRMM_OUTUCOPY ctrmm_outucopy +#define CTRMM_OUTNCOPY ctrmm_outncopy +#define CTRMM_OLNUCOPY ctrmm_olnucopy +#define CTRMM_OLNNCOPY ctrmm_olnncopy +#define CTRMM_OLTUCOPY ctrmm_oltucopy +#define CTRMM_OLTNCOPY ctrmm_oltncopy + +#define CTRSM_OUNUCOPY ctrsm_ounucopy +#define CTRSM_OUNNCOPY ctrsm_ounncopy +#define CTRSM_OUTUCOPY ctrsm_outucopy +#define CTRSM_OUTNCOPY ctrsm_outncopy +#define CTRSM_OLNUCOPY ctrsm_olnucopy +#define CTRSM_OLNNCOPY ctrsm_olnncopy +#define CTRSM_OLTUCOPY ctrsm_oltucopy +#define CTRSM_OLTNCOPY ctrsm_oltncopy + +#if CGEMM_DEFAULT_UNROLL_M == CGEMM_DEFAULT_UNROLL_N +#define CTRMM_IUNUCOPY ctrmm_ounucopy +#define CTRMM_IUNNCOPY ctrmm_ounncopy +#define CTRMM_IUTUCOPY ctrmm_outucopy +#define CTRMM_IUTNCOPY ctrmm_outncopy +#define CTRMM_ILNUCOPY ctrmm_olnucopy +#define CTRMM_ILNNCOPY ctrmm_olnncopy +#define CTRMM_ILTUCOPY ctrmm_oltucopy +#define CTRMM_ILTNCOPY ctrmm_oltncopy + +#define CTRSM_IUNUCOPY ctrsm_ounucopy +#define CTRSM_IUNNCOPY ctrsm_ounncopy +#define CTRSM_IUTUCOPY ctrsm_outucopy +#define CTRSM_IUTNCOPY ctrsm_outncopy +#define CTRSM_ILNUCOPY ctrsm_olnucopy +#define CTRSM_ILNNCOPY ctrsm_olnncopy +#define CTRSM_ILTUCOPY ctrsm_oltucopy +#define CTRSM_ILTNCOPY ctrsm_oltncopy +#else +#define CTRMM_IUNUCOPY ctrmm_iunucopy +#define CTRMM_IUNNCOPY ctrmm_iunncopy +#define CTRMM_IUTUCOPY ctrmm_iutucopy +#define CTRMM_IUTNCOPY ctrmm_iutncopy +#define CTRMM_ILNUCOPY ctrmm_ilnucopy +#define CTRMM_ILNNCOPY ctrmm_ilnncopy +#define CTRMM_ILTUCOPY ctrmm_iltucopy +#define CTRMM_ILTNCOPY ctrmm_iltncopy + +#define CTRSM_IUNUCOPY ctrsm_iunucopy +#define CTRSM_IUNNCOPY ctrsm_iunncopy +#define CTRSM_IUTUCOPY ctrsm_iutucopy +#define CTRSM_IUTNCOPY ctrsm_iutncopy +#define CTRSM_ILNUCOPY ctrsm_ilnucopy +#define CTRSM_ILNNCOPY ctrsm_ilnncopy +#define CTRSM_ILTUCOPY ctrsm_iltucopy +#define CTRSM_ILTNCOPY ctrsm_iltncopy +#endif + +#define 
CGEMM_BETA cgemm_beta + +#define CGEMM_KERNEL_N cgemm_kernel_n +#define CGEMM_KERNEL_L cgemm_kernel_l +#define CGEMM_KERNEL_R cgemm_kernel_r +#define CGEMM_KERNEL_B cgemm_kernel_b + +#define CTRMM_KERNEL_LN ctrmm_kernel_LN +#define CTRMM_KERNEL_LT ctrmm_kernel_LT +#define CTRMM_KERNEL_LR ctrmm_kernel_LR +#define CTRMM_KERNEL_LC ctrmm_kernel_LC +#define CTRMM_KERNEL_RN ctrmm_kernel_RN +#define CTRMM_KERNEL_RT ctrmm_kernel_RT +#define CTRMM_KERNEL_RR ctrmm_kernel_RR +#define CTRMM_KERNEL_RC ctrmm_kernel_RC + +#define CTRSM_KERNEL_LN ctrsm_kernel_LN +#define CTRSM_KERNEL_LT ctrsm_kernel_LT +#define CTRSM_KERNEL_LR ctrsm_kernel_LR +#define CTRSM_KERNEL_LC ctrsm_kernel_LC +#define CTRSM_KERNEL_RN ctrsm_kernel_RN +#define CTRSM_KERNEL_RT ctrsm_kernel_RT +#define CTRSM_KERNEL_RR ctrsm_kernel_RR +#define CTRSM_KERNEL_RC ctrsm_kernel_RC + +#define CSYMM_OUTCOPY csymm_outcopy +#define CSYMM_OLTCOPY csymm_oltcopy +#if CGEMM_DEFAULT_UNROLL_M == CGEMM_DEFAULT_UNROLL_N +#define CSYMM_IUTCOPY csymm_outcopy +#define CSYMM_ILTCOPY csymm_oltcopy +#else +#define CSYMM_IUTCOPY csymm_iutcopy +#define CSYMM_ILTCOPY csymm_iltcopy +#endif + +#define CHEMM_OUTCOPY chemm_outcopy +#define CHEMM_OLTCOPY chemm_oltcopy +#if CGEMM_DEFAULT_UNROLL_M == CGEMM_DEFAULT_UNROLL_N +#define CHEMM_IUTCOPY chemm_outcopy +#define CHEMM_ILTCOPY chemm_oltcopy +#else +#define CHEMM_IUTCOPY chemm_iutcopy +#define CHEMM_ILTCOPY chemm_iltcopy +#endif + +#define CGEMM3M_ONCOPYB cgemm3m_oncopyb +#define CGEMM3M_ONCOPYR cgemm3m_oncopyr +#define CGEMM3M_ONCOPYI cgemm3m_oncopyi +#define CGEMM3M_OTCOPYB cgemm3m_otcopyb +#define CGEMM3M_OTCOPYR cgemm3m_otcopyr +#define CGEMM3M_OTCOPYI cgemm3m_otcopyi + +#define CGEMM3M_INCOPYB cgemm3m_incopyb +#define CGEMM3M_INCOPYR cgemm3m_incopyr +#define CGEMM3M_INCOPYI cgemm3m_incopyi +#define CGEMM3M_ITCOPYB cgemm3m_itcopyb +#define CGEMM3M_ITCOPYR cgemm3m_itcopyr +#define CGEMM3M_ITCOPYI cgemm3m_itcopyi + +#define CSYMM3M_ILCOPYB csymm3m_ilcopyb +#define CSYMM3M_IUCOPYB csymm3m_iucopyb +#define CSYMM3M_ILCOPYR csymm3m_ilcopyr +#define CSYMM3M_IUCOPYR csymm3m_iucopyr +#define CSYMM3M_ILCOPYI csymm3m_ilcopyi +#define CSYMM3M_IUCOPYI csymm3m_iucopyi + +#define CSYMM3M_OLCOPYB csymm3m_olcopyb +#define CSYMM3M_OUCOPYB csymm3m_oucopyb +#define CSYMM3M_OLCOPYR csymm3m_olcopyr +#define CSYMM3M_OUCOPYR csymm3m_oucopyr +#define CSYMM3M_OLCOPYI csymm3m_olcopyi +#define CSYMM3M_OUCOPYI csymm3m_oucopyi + +#define CHEMM3M_ILCOPYB chemm3m_ilcopyb +#define CHEMM3M_IUCOPYB chemm3m_iucopyb +#define CHEMM3M_ILCOPYR chemm3m_ilcopyr +#define CHEMM3M_IUCOPYR chemm3m_iucopyr +#define CHEMM3M_ILCOPYI chemm3m_ilcopyi +#define CHEMM3M_IUCOPYI chemm3m_iucopyi + +#define CHEMM3M_OLCOPYB chemm3m_olcopyb +#define CHEMM3M_OUCOPYB chemm3m_oucopyb +#define CHEMM3M_OLCOPYR chemm3m_olcopyr +#define CHEMM3M_OUCOPYR chemm3m_oucopyr +#define CHEMM3M_OLCOPYI chemm3m_olcopyi +#define CHEMM3M_OUCOPYI chemm3m_oucopyi + +#define CGEMM3M_KERNEL cgemm3m_kernel + +#define CNEG_TCOPY cneg_tcopy +#define CLASWP_NCOPY claswp_ncopy + +#else + +#define CAMAX_K gotoblas -> camax_k +#define CAMIN_K gotoblas -> camin_k +#define CMAX_K gotoblas -> cmax_k +#define CMIN_K gotoblas -> cmin_k +#define ICAMAX_K gotoblas -> icamax_k +#define ICAMIN_K gotoblas -> icamin_k +#define ICMAX_K gotoblas -> icmax_k +#define ICMIN_K gotoblas -> icmin_k +#define CASUM_K gotoblas -> casum_k +#define CAXPYU_K gotoblas -> caxpy_k +#define CAXPYC_K gotoblas -> caxpyc_k +#define CCOPY_K gotoblas -> ccopy_k +#define CDOTU_K gotoblas -> cdotu_k +#define CDOTC_K gotoblas -> cdotc_k 
+#define CNRM2_K gotoblas -> cnrm2_k +#define CSCAL_K gotoblas -> cscal_k +#define CSWAP_K gotoblas -> cswap_k +#define CROT_K gotoblas -> csrot_k + +#define CGEMV_N gotoblas -> cgemv_n +#define CGEMV_T gotoblas -> cgemv_t +#define CGEMV_R gotoblas -> cgemv_r +#define CGEMV_C gotoblas -> cgemv_c +#define CGEMV_O gotoblas -> cgemv_o +#define CGEMV_U gotoblas -> cgemv_u +#define CGEMV_S gotoblas -> cgemv_s +#define CGEMV_D gotoblas -> cgemv_d + +#define CGERU_K gotoblas -> cgeru_k +#define CGERC_K gotoblas -> cgerc_k +#define CGERV_K gotoblas -> cgerv_k +#define CGERD_K gotoblas -> cgerd_k + +#define CSYMV_U gotoblas -> csymv_U +#define CSYMV_L gotoblas -> csymv_L +#define CHEMV_U gotoblas -> chemv_U +#define CHEMV_L gotoblas -> chemv_L +#define CHEMV_V gotoblas -> chemv_V +#define CHEMV_M gotoblas -> chemv_M + +#define CSYMV_THREAD_U csymv_thread_U +#define CSYMV_THREAD_L csymv_thread_L +#define CHEMV_THREAD_U chemv_thread_U +#define CHEMV_THREAD_L chemv_thread_L +#define CHEMV_THREAD_V chemv_thread_V +#define CHEMV_THREAD_M chemv_thread_M + +#define CGEMM_ONCOPY gotoblas -> cgemm_oncopy +#define CGEMM_OTCOPY gotoblas -> cgemm_otcopy +#define CGEMM_INCOPY gotoblas -> cgemm_incopy +#define CGEMM_ITCOPY gotoblas -> cgemm_itcopy + +#define CTRMM_OUNUCOPY gotoblas -> ctrmm_ounucopy +#define CTRMM_OUTUCOPY gotoblas -> ctrmm_outucopy +#define CTRMM_OLNUCOPY gotoblas -> ctrmm_olnucopy +#define CTRMM_OLTUCOPY gotoblas -> ctrmm_oltucopy +#define CTRSM_OUNUCOPY gotoblas -> ctrsm_ounucopy +#define CTRSM_OUTUCOPY gotoblas -> ctrsm_outucopy +#define CTRSM_OLNUCOPY gotoblas -> ctrsm_olnucopy +#define CTRSM_OLTUCOPY gotoblas -> ctrsm_oltucopy + +#define CTRMM_IUNUCOPY gotoblas -> ctrmm_iunucopy +#define CTRMM_IUTUCOPY gotoblas -> ctrmm_iutucopy +#define CTRMM_ILNUCOPY gotoblas -> ctrmm_ilnucopy +#define CTRMM_ILTUCOPY gotoblas -> ctrmm_iltucopy +#define CTRSM_IUNUCOPY gotoblas -> ctrsm_iunucopy +#define CTRSM_IUTUCOPY gotoblas -> ctrsm_iutucopy +#define CTRSM_ILNUCOPY gotoblas -> ctrsm_ilnucopy +#define CTRSM_ILTUCOPY gotoblas -> ctrsm_iltucopy + +#define CTRMM_OUNNCOPY gotoblas -> ctrmm_ounncopy +#define CTRMM_OUTNCOPY gotoblas -> ctrmm_outncopy +#define CTRMM_OLNNCOPY gotoblas -> ctrmm_olnncopy +#define CTRMM_OLTNCOPY gotoblas -> ctrmm_oltncopy +#define CTRSM_OUNNCOPY gotoblas -> ctrsm_ounncopy +#define CTRSM_OUTNCOPY gotoblas -> ctrsm_outncopy +#define CTRSM_OLNNCOPY gotoblas -> ctrsm_olnncopy +#define CTRSM_OLTNCOPY gotoblas -> ctrsm_oltncopy + +#define CTRMM_IUNNCOPY gotoblas -> ctrmm_iunncopy +#define CTRMM_IUTNCOPY gotoblas -> ctrmm_iutncopy +#define CTRMM_ILNNCOPY gotoblas -> ctrmm_ilnncopy +#define CTRMM_ILTNCOPY gotoblas -> ctrmm_iltncopy +#define CTRSM_IUNNCOPY gotoblas -> ctrsm_iunncopy +#define CTRSM_IUTNCOPY gotoblas -> ctrsm_iutncopy +#define CTRSM_ILNNCOPY gotoblas -> ctrsm_ilnncopy +#define CTRSM_ILTNCOPY gotoblas -> ctrsm_iltncopy + +#define CGEMM_BETA gotoblas -> cgemm_beta +#define CGEMM_KERNEL_N gotoblas -> cgemm_kernel_n +#define CGEMM_KERNEL_L gotoblas -> cgemm_kernel_l +#define CGEMM_KERNEL_R gotoblas -> cgemm_kernel_r +#define CGEMM_KERNEL_B gotoblas -> cgemm_kernel_b + +#define CTRMM_KERNEL_LN gotoblas -> ctrmm_kernel_LN +#define CTRMM_KERNEL_LT gotoblas -> ctrmm_kernel_LT +#define CTRMM_KERNEL_LR gotoblas -> ctrmm_kernel_LR +#define CTRMM_KERNEL_LC gotoblas -> ctrmm_kernel_LC +#define CTRMM_KERNEL_RN gotoblas -> ctrmm_kernel_RN +#define CTRMM_KERNEL_RT gotoblas -> ctrmm_kernel_RT +#define CTRMM_KERNEL_RR gotoblas -> ctrmm_kernel_RR +#define CTRMM_KERNEL_RC gotoblas -> 
ctrmm_kernel_RC + +#define CTRSM_KERNEL_LN gotoblas -> ctrsm_kernel_LN +#define CTRSM_KERNEL_LT gotoblas -> ctrsm_kernel_LT +#define CTRSM_KERNEL_LR gotoblas -> ctrsm_kernel_LR +#define CTRSM_KERNEL_LC gotoblas -> ctrsm_kernel_LC +#define CTRSM_KERNEL_RN gotoblas -> ctrsm_kernel_RN +#define CTRSM_KERNEL_RT gotoblas -> ctrsm_kernel_RT +#define CTRSM_KERNEL_RR gotoblas -> ctrsm_kernel_RR +#define CTRSM_KERNEL_RC gotoblas -> ctrsm_kernel_RC + +#define CSYMM_IUTCOPY gotoblas -> csymm_iutcopy +#define CSYMM_ILTCOPY gotoblas -> csymm_iltcopy +#define CSYMM_OUTCOPY gotoblas -> csymm_outcopy +#define CSYMM_OLTCOPY gotoblas -> csymm_oltcopy + +#define CHEMM_OUTCOPY gotoblas -> chemm_outcopy +#define CHEMM_OLTCOPY gotoblas -> chemm_oltcopy +#define CHEMM_IUTCOPY gotoblas -> chemm_iutcopy +#define CHEMM_ILTCOPY gotoblas -> chemm_iltcopy + +#define CGEMM3M_ONCOPYB gotoblas -> cgemm3m_oncopyb +#define CGEMM3M_ONCOPYR gotoblas -> cgemm3m_oncopyr +#define CGEMM3M_ONCOPYI gotoblas -> cgemm3m_oncopyi +#define CGEMM3M_OTCOPYB gotoblas -> cgemm3m_otcopyb +#define CGEMM3M_OTCOPYR gotoblas -> cgemm3m_otcopyr +#define CGEMM3M_OTCOPYI gotoblas -> cgemm3m_otcopyi + +#define CGEMM3M_INCOPYB gotoblas -> cgemm3m_incopyb +#define CGEMM3M_INCOPYR gotoblas -> cgemm3m_incopyr +#define CGEMM3M_INCOPYI gotoblas -> cgemm3m_incopyi +#define CGEMM3M_ITCOPYB gotoblas -> cgemm3m_itcopyb +#define CGEMM3M_ITCOPYR gotoblas -> cgemm3m_itcopyr +#define CGEMM3M_ITCOPYI gotoblas -> cgemm3m_itcopyi + +#define CSYMM3M_ILCOPYB gotoblas -> csymm3m_ilcopyb +#define CSYMM3M_IUCOPYB gotoblas -> csymm3m_iucopyb +#define CSYMM3M_ILCOPYR gotoblas -> csymm3m_ilcopyr +#define CSYMM3M_IUCOPYR gotoblas -> csymm3m_iucopyr +#define CSYMM3M_ILCOPYI gotoblas -> csymm3m_ilcopyi +#define CSYMM3M_IUCOPYI gotoblas -> csymm3m_iucopyi + +#define CSYMM3M_OLCOPYB gotoblas -> csymm3m_olcopyb +#define CSYMM3M_OUCOPYB gotoblas -> csymm3m_oucopyb +#define CSYMM3M_OLCOPYR gotoblas -> csymm3m_olcopyr +#define CSYMM3M_OUCOPYR gotoblas -> csymm3m_oucopyr +#define CSYMM3M_OLCOPYI gotoblas -> csymm3m_olcopyi +#define CSYMM3M_OUCOPYI gotoblas -> csymm3m_oucopyi + +#define CHEMM3M_ILCOPYB gotoblas -> chemm3m_ilcopyb +#define CHEMM3M_IUCOPYB gotoblas -> chemm3m_iucopyb +#define CHEMM3M_ILCOPYR gotoblas -> chemm3m_ilcopyr +#define CHEMM3M_IUCOPYR gotoblas -> chemm3m_iucopyr +#define CHEMM3M_ILCOPYI gotoblas -> chemm3m_ilcopyi +#define CHEMM3M_IUCOPYI gotoblas -> chemm3m_iucopyi + +#define CHEMM3M_OLCOPYB gotoblas -> chemm3m_olcopyb +#define CHEMM3M_OUCOPYB gotoblas -> chemm3m_oucopyb +#define CHEMM3M_OLCOPYR gotoblas -> chemm3m_olcopyr +#define CHEMM3M_OUCOPYR gotoblas -> chemm3m_oucopyr +#define CHEMM3M_OLCOPYI gotoblas -> chemm3m_olcopyi +#define CHEMM3M_OUCOPYI gotoblas -> chemm3m_oucopyi + +#define CGEMM3M_KERNEL gotoblas -> cgemm3m_kernel + +#define CNEG_TCOPY gotoblas -> cneg_tcopy +#define CLASWP_NCOPY gotoblas -> claswp_ncopy + +#endif + +#define CGEMM_NN cgemm_nn +#define CGEMM_CN cgemm_cn +#define CGEMM_TN cgemm_tn +#define CGEMM_NC cgemm_nc +#define CGEMM_NT cgemm_nt +#define CGEMM_CC cgemm_cc +#define CGEMM_CT cgemm_ct +#define CGEMM_TC cgemm_tc +#define CGEMM_TT cgemm_tt +#define CGEMM_NR cgemm_nr +#define CGEMM_TR cgemm_tr +#define CGEMM_CR cgemm_cr +#define CGEMM_RN cgemm_rn +#define CGEMM_RT cgemm_rt +#define CGEMM_RC cgemm_rc +#define CGEMM_RR cgemm_rr + +#define CSYMM_LU csymm_LU +#define CSYMM_LL csymm_LL +#define CSYMM_RU csymm_RU +#define CSYMM_RL csymm_RL + +#define CHEMM_LU chemm_LU +#define CHEMM_LL chemm_LL +#define CHEMM_RU chemm_RU +#define 
CHEMM_RL chemm_RL + +#define CSYRK_UN csyrk_UN +#define CSYRK_UT csyrk_UT +#define CSYRK_LN csyrk_LN +#define CSYRK_LT csyrk_LT +#define CSYRK_UR csyrk_UN +#define CSYRK_UC csyrk_UT +#define CSYRK_LR csyrk_LN +#define CSYRK_LC csyrk_LT + +#define CSYRK_KERNEL_U csyrk_kernel_U +#define CSYRK_KERNEL_L csyrk_kernel_L + +#define CHERK_UN cherk_UN +#define CHERK_LN cherk_LN +#define CHERK_UC cherk_UC +#define CHERK_LC cherk_LC + +#define CHER2K_UN cher2k_UN +#define CHER2K_LN cher2k_LN +#define CHER2K_UC cher2k_UC +#define CHER2K_LC cher2k_LC + +#define CSYR2K_UN csyr2k_UN +#define CSYR2K_UT csyr2k_UT +#define CSYR2K_LN csyr2k_LN +#define CSYR2K_LT csyr2k_LT +#define CSYR2K_UR csyr2k_UN +#define CSYR2K_UC csyr2k_UT +#define CSYR2K_LR csyr2k_LN +#define CSYR2K_LC csyr2k_LT + +#define CSYR2K_KERNEL_U csyr2k_kernel_U +#define CSYR2K_KERNEL_L csyr2k_kernel_L + +#define CTRMM_LNUU ctrmm_LNUU +#define CTRMM_LNUN ctrmm_LNUN +#define CTRMM_LNLU ctrmm_LNLU +#define CTRMM_LNLN ctrmm_LNLN +#define CTRMM_LTUU ctrmm_LTUU +#define CTRMM_LTUN ctrmm_LTUN +#define CTRMM_LTLU ctrmm_LTLU +#define CTRMM_LTLN ctrmm_LTLN +#define CTRMM_LRUU ctrmm_LRUU +#define CTRMM_LRUN ctrmm_LRUN +#define CTRMM_LRLU ctrmm_LRLU +#define CTRMM_LRLN ctrmm_LRLN +#define CTRMM_LCUU ctrmm_LCUU +#define CTRMM_LCUN ctrmm_LCUN +#define CTRMM_LCLU ctrmm_LCLU +#define CTRMM_LCLN ctrmm_LCLN +#define CTRMM_RNUU ctrmm_RNUU +#define CTRMM_RNUN ctrmm_RNUN +#define CTRMM_RNLU ctrmm_RNLU +#define CTRMM_RNLN ctrmm_RNLN +#define CTRMM_RTUU ctrmm_RTUU +#define CTRMM_RTUN ctrmm_RTUN +#define CTRMM_RTLU ctrmm_RTLU +#define CTRMM_RTLN ctrmm_RTLN +#define CTRMM_RRUU ctrmm_RRUU +#define CTRMM_RRUN ctrmm_RRUN +#define CTRMM_RRLU ctrmm_RRLU +#define CTRMM_RRLN ctrmm_RRLN +#define CTRMM_RCUU ctrmm_RCUU +#define CTRMM_RCUN ctrmm_RCUN +#define CTRMM_RCLU ctrmm_RCLU +#define CTRMM_RCLN ctrmm_RCLN + +#define CTRSM_LNUU ctrsm_LNUU +#define CTRSM_LNUN ctrsm_LNUN +#define CTRSM_LNLU ctrsm_LNLU +#define CTRSM_LNLN ctrsm_LNLN +#define CTRSM_LTUU ctrsm_LTUU +#define CTRSM_LTUN ctrsm_LTUN +#define CTRSM_LTLU ctrsm_LTLU +#define CTRSM_LTLN ctrsm_LTLN +#define CTRSM_LRUU ctrsm_LRUU +#define CTRSM_LRUN ctrsm_LRUN +#define CTRSM_LRLU ctrsm_LRLU +#define CTRSM_LRLN ctrsm_LRLN +#define CTRSM_LCUU ctrsm_LCUU +#define CTRSM_LCUN ctrsm_LCUN +#define CTRSM_LCLU ctrsm_LCLU +#define CTRSM_LCLN ctrsm_LCLN +#define CTRSM_RNUU ctrsm_RNUU +#define CTRSM_RNUN ctrsm_RNUN +#define CTRSM_RNLU ctrsm_RNLU +#define CTRSM_RNLN ctrsm_RNLN +#define CTRSM_RTUU ctrsm_RTUU +#define CTRSM_RTUN ctrsm_RTUN +#define CTRSM_RTLU ctrsm_RTLU +#define CTRSM_RTLN ctrsm_RTLN +#define CTRSM_RRUU ctrsm_RRUU +#define CTRSM_RRUN ctrsm_RRUN +#define CTRSM_RRLU ctrsm_RRLU +#define CTRSM_RRLN ctrsm_RRLN +#define CTRSM_RCUU ctrsm_RCUU +#define CTRSM_RCUN ctrsm_RCUN +#define CTRSM_RCLU ctrsm_RCLU +#define CTRSM_RCLN ctrsm_RCLN + +#define CGEMM_THREAD_NN cgemm_thread_nn +#define CGEMM_THREAD_CN cgemm_thread_cn +#define CGEMM_THREAD_TN cgemm_thread_tn +#define CGEMM_THREAD_NC cgemm_thread_nc +#define CGEMM_THREAD_NT cgemm_thread_nt +#define CGEMM_THREAD_CC cgemm_thread_cc +#define CGEMM_THREAD_CT cgemm_thread_ct +#define CGEMM_THREAD_TC cgemm_thread_tc +#define CGEMM_THREAD_TT cgemm_thread_tt +#define CGEMM_THREAD_NR cgemm_thread_nr +#define CGEMM_THREAD_TR cgemm_thread_tr +#define CGEMM_THREAD_CR cgemm_thread_cr +#define CGEMM_THREAD_RN cgemm_thread_rn +#define CGEMM_THREAD_RT cgemm_thread_rt +#define CGEMM_THREAD_RC cgemm_thread_rc +#define CGEMM_THREAD_RR cgemm_thread_rr + +#define CSYMM_THREAD_LU csymm_thread_LU 
+#define CSYMM_THREAD_LL csymm_thread_LL +#define CSYMM_THREAD_RU csymm_thread_RU +#define CSYMM_THREAD_RL csymm_thread_RL + +#define CHEMM_THREAD_LU chemm_thread_LU +#define CHEMM_THREAD_LL chemm_thread_LL +#define CHEMM_THREAD_RU chemm_thread_RU +#define CHEMM_THREAD_RL chemm_thread_RL + +#define CSYRK_THREAD_UN csyrk_thread_UN +#define CSYRK_THREAD_UT csyrk_thread_UT +#define CSYRK_THREAD_LN csyrk_thread_LN +#define CSYRK_THREAD_LT csyrk_thread_LT +#define CSYRK_THREAD_UR csyrk_thread_UN +#define CSYRK_THREAD_UC csyrk_thread_UT +#define CSYRK_THREAD_LR csyrk_thread_LN +#define CSYRK_THREAD_LC csyrk_thread_LT + +#define CHERK_THREAD_UN cherk_thread_UN +#define CHERK_THREAD_UT cherk_thread_UT +#define CHERK_THREAD_LN cherk_thread_LN +#define CHERK_THREAD_LT cherk_thread_LT +#define CHERK_THREAD_UR cherk_thread_UR +#define CHERK_THREAD_UC cherk_thread_UC +#define CHERK_THREAD_LR cherk_thread_LR +#define CHERK_THREAD_LC cherk_thread_LC + +#define CGEMM3M_NN cgemm3m_nn +#define CGEMM3M_CN cgemm3m_cn +#define CGEMM3M_TN cgemm3m_tn +#define CGEMM3M_NC cgemm3m_nc +#define CGEMM3M_NT cgemm3m_nt +#define CGEMM3M_CC cgemm3m_cc +#define CGEMM3M_CT cgemm3m_ct +#define CGEMM3M_TC cgemm3m_tc +#define CGEMM3M_TT cgemm3m_tt +#define CGEMM3M_NR cgemm3m_nr +#define CGEMM3M_TR cgemm3m_tr +#define CGEMM3M_CR cgemm3m_cr +#define CGEMM3M_RN cgemm3m_rn +#define CGEMM3M_RT cgemm3m_rt +#define CGEMM3M_RC cgemm3m_rc +#define CGEMM3M_RR cgemm3m_rr + +#define CGEMM3M_THREAD_NN cgemm3m_thread_nn +#define CGEMM3M_THREAD_CN cgemm3m_thread_cn +#define CGEMM3M_THREAD_TN cgemm3m_thread_tn +#define CGEMM3M_THREAD_NC cgemm3m_thread_nc +#define CGEMM3M_THREAD_NT cgemm3m_thread_nt +#define CGEMM3M_THREAD_CC cgemm3m_thread_cc +#define CGEMM3M_THREAD_CT cgemm3m_thread_ct +#define CGEMM3M_THREAD_TC cgemm3m_thread_tc +#define CGEMM3M_THREAD_TT cgemm3m_thread_tt +#define CGEMM3M_THREAD_NR cgemm3m_thread_nr +#define CGEMM3M_THREAD_TR cgemm3m_thread_tr +#define CGEMM3M_THREAD_CR cgemm3m_thread_cr +#define CGEMM3M_THREAD_RN cgemm3m_thread_rn +#define CGEMM3M_THREAD_RT cgemm3m_thread_rt +#define CGEMM3M_THREAD_RC cgemm3m_thread_rc +#define CGEMM3M_THREAD_RR cgemm3m_thread_rr + +#define CSYMM3M_LU csymm3m_LU +#define CSYMM3M_LL csymm3m_LL +#define CSYMM3M_RU csymm3m_RU +#define CSYMM3M_RL csymm3m_RL + +#define CSYMM3M_THREAD_LU csymm3m_thread_LU +#define CSYMM3M_THREAD_LL csymm3m_thread_LL +#define CSYMM3M_THREAD_RU csymm3m_thread_RU +#define CSYMM3M_THREAD_RL csymm3m_thread_RL + +#define CHEMM3M_LU chemm3m_LU +#define CHEMM3M_LL chemm3m_LL +#define CHEMM3M_RU chemm3m_RU +#define CHEMM3M_RL chemm3m_RL + +#define CHEMM3M_THREAD_LU chemm3m_thread_LU +#define CHEMM3M_THREAD_LL chemm3m_thread_LL +#define CHEMM3M_THREAD_RU chemm3m_thread_RU +#define CHEMM3M_THREAD_RL chemm3m_thread_RL + +#endif diff --git a/common_d.h b/common_d.h new file mode 100644 index 0000000000..4c9a53f6c5 --- /dev/null +++ b/common_d.h @@ -0,0 +1,432 @@ +#ifndef COMMON_D_H +#define COMMON_D_H + +#ifndef DYNAMIC_ARCH + +#define DAMAX_K damax_k +#define DAMIN_K damin_k +#define DMAX_K dmax_k +#define DMIN_K dmin_k +#define IDAMAX_K idamax_k +#define IDAMIN_K idamin_k +#define IDMAX_K idmax_k +#define IDMIN_K idmin_k +#define DASUM_K dasum_k +#define DAXPYU_K daxpy_k +#define DAXPYC_K daxpy_k +#define DCOPY_K dcopy_k +#define DDOTU_K ddot_k +#define DDOTC_K ddot_k +#define DNRM2_K dnrm2_k +#define DSCAL_K dscal_k +#define DSWAP_K dswap_k +#define DROT_K drot_k + +#define DGEMV_N dgemv_n +#define DGEMV_T dgemv_t +#define DGEMV_R dgemv_n +#define DGEMV_C dgemv_t 
+#define DGEMV_O dgemv_n +#define DGEMV_U dgemv_t +#define DGEMV_S dgemv_n +#define DGEMV_D dgemv_t + +#define DGERU_K dger_k +#define DGERC_K dger_k +#define DGERV_K dger_k +#define DGERD_K dger_k + +#define DSYMV_U dsymv_U +#define DSYMV_L dsymv_L + +#define DSYMV_THREAD_U dsymv_thread_U +#define DSYMV_THREAD_L dsymv_thread_L + +#define DGEMM_ONCOPY dgemm_oncopy +#define DGEMM_OTCOPY dgemm_otcopy + +#if DGEMM_DEFAULT_UNROLL_M == DGEMM_DEFAULT_UNROLL_N +#define DGEMM_INCOPY dgemm_oncopy +#define DGEMM_ITCOPY dgemm_otcopy +#else +#define DGEMM_INCOPY dgemm_incopy +#define DGEMM_ITCOPY dgemm_itcopy +#endif + +#define DTRMM_OUNUCOPY dtrmm_ounucopy +#define DTRMM_OUNNCOPY dtrmm_ounncopy +#define DTRMM_OUTUCOPY dtrmm_outucopy +#define DTRMM_OUTNCOPY dtrmm_outncopy +#define DTRMM_OLNUCOPY dtrmm_olnucopy +#define DTRMM_OLNNCOPY dtrmm_olnncopy +#define DTRMM_OLTUCOPY dtrmm_oltucopy +#define DTRMM_OLTNCOPY dtrmm_oltncopy + +#define DTRSM_OUNUCOPY dtrsm_ounucopy +#define DTRSM_OUNNCOPY dtrsm_ounncopy +#define DTRSM_OUTUCOPY dtrsm_outucopy +#define DTRSM_OUTNCOPY dtrsm_outncopy +#define DTRSM_OLNUCOPY dtrsm_olnucopy +#define DTRSM_OLNNCOPY dtrsm_olnncopy +#define DTRSM_OLTUCOPY dtrsm_oltucopy +#define DTRSM_OLTNCOPY dtrsm_oltncopy + +#if DGEMM_DEFAULT_UNROLL_M == DGEMM_DEFAULT_UNROLL_N +#define DTRMM_IUNUCOPY dtrmm_ounucopy +#define DTRMM_IUNNCOPY dtrmm_ounncopy +#define DTRMM_IUTUCOPY dtrmm_outucopy +#define DTRMM_IUTNCOPY dtrmm_outncopy +#define DTRMM_ILNUCOPY dtrmm_olnucopy +#define DTRMM_ILNNCOPY dtrmm_olnncopy +#define DTRMM_ILTUCOPY dtrmm_oltucopy +#define DTRMM_ILTNCOPY dtrmm_oltncopy + +#define DTRSM_IUNUCOPY dtrsm_ounucopy +#define DTRSM_IUNNCOPY dtrsm_ounncopy +#define DTRSM_IUTUCOPY dtrsm_outucopy +#define DTRSM_IUTNCOPY dtrsm_outncopy +#define DTRSM_ILNUCOPY dtrsm_olnucopy +#define DTRSM_ILNNCOPY dtrsm_olnncopy +#define DTRSM_ILTUCOPY dtrsm_oltucopy +#define DTRSM_ILTNCOPY dtrsm_oltncopy +#else +#define DTRMM_IUNUCOPY dtrmm_iunucopy +#define DTRMM_IUNNCOPY dtrmm_iunncopy +#define DTRMM_IUTUCOPY dtrmm_iutucopy +#define DTRMM_IUTNCOPY dtrmm_iutncopy +#define DTRMM_ILNUCOPY dtrmm_ilnucopy +#define DTRMM_ILNNCOPY dtrmm_ilnncopy +#define DTRMM_ILTUCOPY dtrmm_iltucopy +#define DTRMM_ILTNCOPY dtrmm_iltncopy + +#define DTRSM_IUNUCOPY dtrsm_iunucopy +#define DTRSM_IUNNCOPY dtrsm_iunncopy +#define DTRSM_IUTUCOPY dtrsm_iutucopy +#define DTRSM_IUTNCOPY dtrsm_iutncopy +#define DTRSM_ILNUCOPY dtrsm_ilnucopy +#define DTRSM_ILNNCOPY dtrsm_ilnncopy +#define DTRSM_ILTUCOPY dtrsm_iltucopy +#define DTRSM_ILTNCOPY dtrsm_iltncopy +#endif + +#define DGEMM_BETA dgemm_beta + +#define DGEMM_KERNEL dgemm_kernel + +#define DTRMM_KERNEL_LN dtrmm_kernel_LN +#define DTRMM_KERNEL_LT dtrmm_kernel_LT +#define DTRMM_KERNEL_LR dtrmm_kernel_LN +#define DTRMM_KERNEL_LC dtrmm_kernel_LT +#define DTRMM_KERNEL_RN dtrmm_kernel_RN +#define DTRMM_KERNEL_RT dtrmm_kernel_RT +#define DTRMM_KERNEL_RR dtrmm_kernel_RN +#define DTRMM_KERNEL_RC dtrmm_kernel_RT + +#define DTRSM_KERNEL_LN dtrsm_kernel_LN +#define DTRSM_KERNEL_LT dtrsm_kernel_LT +#define DTRSM_KERNEL_LR dtrsm_kernel_LN +#define DTRSM_KERNEL_LC dtrsm_kernel_LT +#define DTRSM_KERNEL_RN dtrsm_kernel_RN +#define DTRSM_KERNEL_RT dtrsm_kernel_RT +#define DTRSM_KERNEL_RR dtrsm_kernel_RN +#define DTRSM_KERNEL_RC dtrsm_kernel_RT + +#define DSYMM_OUTCOPY dsymm_outcopy +#define DSYMM_OLTCOPY dsymm_oltcopy +#if DGEMM_DEFAULT_UNROLL_M == DGEMM_DEFAULT_UNROLL_N +#define DSYMM_IUTCOPY dsymm_outcopy +#define DSYMM_ILTCOPY dsymm_oltcopy +#else +#define DSYMM_IUTCOPY dsymm_iutcopy +#define 
DSYMM_ILTCOPY dsymm_iltcopy +#endif + +#define DNEG_TCOPY dneg_tcopy +#define DLASWP_NCOPY dlaswp_ncopy + +#else + +#define DAMAX_K gotoblas -> damax_k +#define DAMIN_K gotoblas -> damin_k +#define DMAX_K gotoblas -> dmax_k +#define DMIN_K gotoblas -> dmin_k +#define IDAMAX_K gotoblas -> idamax_k +#define IDAMIN_K gotoblas -> idamin_k +#define IDMAX_K gotoblas -> idmax_k +#define IDMIN_K gotoblas -> idmin_k +#define DASUM_K gotoblas -> dasum_k +#define DAXPYU_K gotoblas -> daxpy_k +#define DAXPYC_K gotoblas -> daxpy_k +#define DCOPY_K gotoblas -> dcopy_k +#define DDOTU_K gotoblas -> ddot_k +#define DDOTC_K gotoblas -> ddot_k +#define DNRM2_K gotoblas -> dnrm2_k +#define DSCAL_K gotoblas -> dscal_k +#define DSWAP_K gotoblas -> dswap_k +#define DROT_K gotoblas -> drot_k + +#define DGEMV_N gotoblas -> dgemv_n +#define DGEMV_T gotoblas -> dgemv_t +#define DGEMV_R gotoblas -> dgemv_n +#define DGEMV_C gotoblas -> dgemv_t +#define DGEMV_O gotoblas -> dgemv_n +#define DGEMV_U gotoblas -> dgemv_t +#define DGEMV_S gotoblas -> dgemv_n +#define DGEMV_D gotoblas -> dgemv_t + +#define DGERU_K gotoblas -> dger_k +#define DGERC_K gotoblas -> dger_k +#define DGERV_K gotoblas -> dger_k +#define DGERD_K gotoblas -> dger_k + +#define DSYMV_U gotoblas -> dsymv_U +#define DSYMV_L gotoblas -> dsymv_L + +#define DSYMV_THREAD_U dsymv_thread_U +#define DSYMV_THREAD_L dsymv_thread_L + +#define DGEMM_ONCOPY gotoblas -> dgemm_oncopy +#define DGEMM_OTCOPY gotoblas -> dgemm_otcopy +#define DGEMM_INCOPY gotoblas -> dgemm_incopy +#define DGEMM_ITCOPY gotoblas -> dgemm_itcopy + +#define DTRMM_OUNUCOPY gotoblas -> dtrmm_ounucopy +#define DTRMM_OUTUCOPY gotoblas -> dtrmm_outucopy +#define DTRMM_OLNUCOPY gotoblas -> dtrmm_olnucopy +#define DTRMM_OLTUCOPY gotoblas -> dtrmm_oltucopy +#define DTRSM_OUNUCOPY gotoblas -> dtrsm_ounucopy +#define DTRSM_OUTUCOPY gotoblas -> dtrsm_outucopy +#define DTRSM_OLNUCOPY gotoblas -> dtrsm_olnucopy +#define DTRSM_OLTUCOPY gotoblas -> dtrsm_oltucopy + +#define DTRMM_IUNUCOPY gotoblas -> dtrmm_iunucopy +#define DTRMM_IUTUCOPY gotoblas -> dtrmm_iutucopy +#define DTRMM_ILNUCOPY gotoblas -> dtrmm_ilnucopy +#define DTRMM_ILTUCOPY gotoblas -> dtrmm_iltucopy +#define DTRSM_IUNUCOPY gotoblas -> dtrsm_iunucopy +#define DTRSM_IUTUCOPY gotoblas -> dtrsm_iutucopy +#define DTRSM_ILNUCOPY gotoblas -> dtrsm_ilnucopy +#define DTRSM_ILTUCOPY gotoblas -> dtrsm_iltucopy + +#define DTRMM_OUNNCOPY gotoblas -> dtrmm_ounncopy +#define DTRMM_OUTNCOPY gotoblas -> dtrmm_outncopy +#define DTRMM_OLNNCOPY gotoblas -> dtrmm_olnncopy +#define DTRMM_OLTNCOPY gotoblas -> dtrmm_oltncopy +#define DTRSM_OUNNCOPY gotoblas -> dtrsm_ounncopy +#define DTRSM_OUTNCOPY gotoblas -> dtrsm_outncopy +#define DTRSM_OLNNCOPY gotoblas -> dtrsm_olnncopy +#define DTRSM_OLTNCOPY gotoblas -> dtrsm_oltncopy + +#define DTRMM_IUNNCOPY gotoblas -> dtrmm_iunncopy +#define DTRMM_IUTNCOPY gotoblas -> dtrmm_iutncopy +#define DTRMM_ILNNCOPY gotoblas -> dtrmm_ilnncopy +#define DTRMM_ILTNCOPY gotoblas -> dtrmm_iltncopy +#define DTRSM_IUNNCOPY gotoblas -> dtrsm_iunncopy +#define DTRSM_IUTNCOPY gotoblas -> dtrsm_iutncopy +#define DTRSM_ILNNCOPY gotoblas -> dtrsm_ilnncopy +#define DTRSM_ILTNCOPY gotoblas -> dtrsm_iltncopy + +#define DGEMM_BETA gotoblas -> dgemm_beta +#define DGEMM_KERNEL gotoblas -> dgemm_kernel + +#define DTRMM_KERNEL_LN gotoblas -> dtrmm_kernel_LN +#define DTRMM_KERNEL_LT gotoblas -> dtrmm_kernel_LT +#define DTRMM_KERNEL_LR gotoblas -> dtrmm_kernel_LN +#define DTRMM_KERNEL_LC gotoblas -> dtrmm_kernel_LT +#define DTRMM_KERNEL_RN gotoblas -> 
dtrmm_kernel_RN +#define DTRMM_KERNEL_RT gotoblas -> dtrmm_kernel_RT +#define DTRMM_KERNEL_RR gotoblas -> dtrmm_kernel_RN +#define DTRMM_KERNEL_RC gotoblas -> dtrmm_kernel_RT + +#define DTRSM_KERNEL_LN gotoblas -> dtrsm_kernel_LN +#define DTRSM_KERNEL_LT gotoblas -> dtrsm_kernel_LT +#define DTRSM_KERNEL_LR gotoblas -> dtrsm_kernel_LN +#define DTRSM_KERNEL_LC gotoblas -> dtrsm_kernel_LT +#define DTRSM_KERNEL_RN gotoblas -> dtrsm_kernel_RN +#define DTRSM_KERNEL_RT gotoblas -> dtrsm_kernel_RT +#define DTRSM_KERNEL_RR gotoblas -> dtrsm_kernel_RN +#define DTRSM_KERNEL_RC gotoblas -> dtrsm_kernel_RT + +#define DSYMM_IUTCOPY gotoblas -> dsymm_iutcopy +#define DSYMM_ILTCOPY gotoblas -> dsymm_iltcopy +#define DSYMM_OUTCOPY gotoblas -> dsymm_outcopy +#define DSYMM_OLTCOPY gotoblas -> dsymm_oltcopy + +#define DNEG_TCOPY gotoblas -> dneg_tcopy +#define DLASWP_NCOPY gotoblas -> dlaswp_ncopy + +#endif + +#define DGEMM_NN dgemm_nn +#define DGEMM_CN dgemm_tn +#define DGEMM_TN dgemm_tn +#define DGEMM_NC dgemm_nt +#define DGEMM_NT dgemm_nt +#define DGEMM_CC dgemm_tt +#define DGEMM_CT dgemm_tt +#define DGEMM_TC dgemm_tt +#define DGEMM_TT dgemm_tt +#define DGEMM_NR dgemm_nn +#define DGEMM_TR dgemm_tn +#define DGEMM_CR dgemm_tn +#define DGEMM_RN dgemm_nn +#define DGEMM_RT dgemm_nt +#define DGEMM_RC dgemm_nt +#define DGEMM_RR dgemm_nn + +#define DSYMM_LU dsymm_LU +#define DSYMM_LL dsymm_LL +#define DSYMM_RU dsymm_RU +#define DSYMM_RL dsymm_RL + +#define DHEMM_LU dhemm_LU +#define DHEMM_LL dhemm_LL +#define DHEMM_RU dhemm_RU +#define DHEMM_RL dhemm_RL + +#define DSYRK_UN dsyrk_UN +#define DSYRK_UT dsyrk_UT +#define DSYRK_LN dsyrk_LN +#define DSYRK_LT dsyrk_LT +#define DSYRK_UR dsyrk_UN +#define DSYRK_UC dsyrk_UT +#define DSYRK_LR dsyrk_LN +#define DSYRK_LC dsyrk_LT + +#define DSYRK_KERNEL_U dsyrk_kernel_U +#define DSYRK_KERNEL_L dsyrk_kernel_L + +#define DHERK_UN dsyrk_UN +#define DHERK_LN dsyrk_LN +#define DHERK_UC dsyrk_UT +#define DHERK_LC dsyrk_LT + +#define DHER2K_UN dsyr2k_UN +#define DHER2K_LN dsyr2k_LN +#define DHER2K_UC dsyr2k_UT +#define DHER2K_LC dsyr2k_LT + +#define DSYR2K_UN dsyr2k_UN +#define DSYR2K_UT dsyr2k_UT +#define DSYR2K_LN dsyr2k_LN +#define DSYR2K_LT dsyr2k_LT +#define DSYR2K_UR dsyr2k_UN +#define DSYR2K_UC dsyr2k_UT +#define DSYR2K_LR dsyr2k_LN +#define DSYR2K_LC dsyr2k_LT + +#define DSYR2K_KERNEL_U dsyr2k_kernel_U +#define DSYR2K_KERNEL_L dsyr2k_kernel_L + +#define DTRMM_LNUU dtrmm_LNUU +#define DTRMM_LNUN dtrmm_LNUN +#define DTRMM_LNLU dtrmm_LNLU +#define DTRMM_LNLN dtrmm_LNLN +#define DTRMM_LTUU dtrmm_LTUU +#define DTRMM_LTUN dtrmm_LTUN +#define DTRMM_LTLU dtrmm_LTLU +#define DTRMM_LTLN dtrmm_LTLN +#define DTRMM_LRUU dtrmm_LNUU +#define DTRMM_LRUN dtrmm_LNUN +#define DTRMM_LRLU dtrmm_LNLU +#define DTRMM_LRLN dtrmm_LNLN +#define DTRMM_LCUU dtrmm_LTUU +#define DTRMM_LCUN dtrmm_LTUN +#define DTRMM_LCLU dtrmm_LTLU +#define DTRMM_LCLN dtrmm_LTLN +#define DTRMM_RNUU dtrmm_RNUU +#define DTRMM_RNUN dtrmm_RNUN +#define DTRMM_RNLU dtrmm_RNLU +#define DTRMM_RNLN dtrmm_RNLN +#define DTRMM_RTUU dtrmm_RTUU +#define DTRMM_RTUN dtrmm_RTUN +#define DTRMM_RTLU dtrmm_RTLU +#define DTRMM_RTLN dtrmm_RTLN +#define DTRMM_RRUU dtrmm_RNUU +#define DTRMM_RRUN dtrmm_RNUN +#define DTRMM_RRLU dtrmm_RNLU +#define DTRMM_RRLN dtrmm_RNLN +#define DTRMM_RCUU dtrmm_RTUU +#define DTRMM_RCUN dtrmm_RTUN +#define DTRMM_RCLU dtrmm_RTLU +#define DTRMM_RCLN dtrmm_RTLN + +#define DTRSM_LNUU dtrsm_LNUU +#define DTRSM_LNUN dtrsm_LNUN +#define DTRSM_LNLU dtrsm_LNLU +#define DTRSM_LNLN dtrsm_LNLN +#define DTRSM_LTUU dtrsm_LTUU 
+#define DTRSM_LTUN dtrsm_LTUN +#define DTRSM_LTLU dtrsm_LTLU +#define DTRSM_LTLN dtrsm_LTLN +#define DTRSM_LRUU dtrsm_LNUU +#define DTRSM_LRUN dtrsm_LNUN +#define DTRSM_LRLU dtrsm_LNLU +#define DTRSM_LRLN dtrsm_LNLN +#define DTRSM_LCUU dtrsm_LTUU +#define DTRSM_LCUN dtrsm_LTUN +#define DTRSM_LCLU dtrsm_LTLU +#define DTRSM_LCLN dtrsm_LTLN +#define DTRSM_RNUU dtrsm_RNUU +#define DTRSM_RNUN dtrsm_RNUN +#define DTRSM_RNLU dtrsm_RNLU +#define DTRSM_RNLN dtrsm_RNLN +#define DTRSM_RTUU dtrsm_RTUU +#define DTRSM_RTUN dtrsm_RTUN +#define DTRSM_RTLU dtrsm_RTLU +#define DTRSM_RTLN dtrsm_RTLN +#define DTRSM_RRUU dtrsm_RNUU +#define DTRSM_RRUN dtrsm_RNUN +#define DTRSM_RRLU dtrsm_RNLU +#define DTRSM_RRLN dtrsm_RNLN +#define DTRSM_RCUU dtrsm_RTUU +#define DTRSM_RCUN dtrsm_RTUN +#define DTRSM_RCLU dtrsm_RTLU +#define DTRSM_RCLN dtrsm_RTLN + +#define DGEMM_THREAD_NN dgemm_thread_nn +#define DGEMM_THREAD_CN dgemm_thread_tn +#define DGEMM_THREAD_TN dgemm_thread_tn +#define DGEMM_THREAD_NC dgemm_thread_nt +#define DGEMM_THREAD_NT dgemm_thread_nt +#define DGEMM_THREAD_CC dgemm_thread_tt +#define DGEMM_THREAD_CT dgemm_thread_tt +#define DGEMM_THREAD_TC dgemm_thread_tt +#define DGEMM_THREAD_TT dgemm_thread_tt +#define DGEMM_THREAD_NR dgemm_thread_nn +#define DGEMM_THREAD_TR dgemm_thread_tn +#define DGEMM_THREAD_CR dgemm_thread_tn +#define DGEMM_THREAD_RN dgemm_thread_nn +#define DGEMM_THREAD_RT dgemm_thread_nt +#define DGEMM_THREAD_RC dgemm_thread_nt +#define DGEMM_THREAD_RR dgemm_thread_nn + +#define DSYMM_THREAD_LU dsymm_thread_LU +#define DSYMM_THREAD_LL dsymm_thread_LL +#define DSYMM_THREAD_RU dsymm_thread_RU +#define DSYMM_THREAD_RL dsymm_thread_RL + +#define DHEMM_THREAD_LU dhemm_thread_LU +#define DHEMM_THREAD_LL dhemm_thread_LL +#define DHEMM_THREAD_RU dhemm_thread_RU +#define DHEMM_THREAD_RL dhemm_thread_RL + +#define DSYRK_THREAD_UN dsyrk_thread_UN +#define DSYRK_THREAD_UT dsyrk_thread_UT +#define DSYRK_THREAD_LN dsyrk_thread_LN +#define DSYRK_THREAD_LT dsyrk_thread_LT +#define DSYRK_THREAD_UR dsyrk_thread_UN +#define DSYRK_THREAD_UC dsyrk_thread_UT +#define DSYRK_THREAD_LR dsyrk_thread_LN +#define DSYRK_THREAD_LC dsyrk_thread_LT + +#define DHERK_THREAD_UN dsyrk_thread_UN +#define DHERK_THREAD_UT dsyrk_thread_UT +#define DHERK_THREAD_LN dsyrk_thread_LN +#define DHERK_THREAD_LT dsyrk_thread_LT +#define DHERK_THREAD_UR dsyrk_thread_UN +#define DHERK_THREAD_UC dsyrk_thread_UT +#define DHERK_THREAD_LR dsyrk_thread_LN +#define DHERK_THREAD_LC dsyrk_thread_LT + +#endif diff --git a/common_ia64.h b/common_ia64.h new file mode 100644 index 0000000000..81939cc1bd --- /dev/null +++ b/common_ia64.h @@ -0,0 +1,408 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef COMMON_IA64 +#define COMMON_IA64 + +#ifndef ASSEMBLER + +#ifndef MAP_WRITECOMBINED +#define MAP_WRITECOMBINED 0x10000 +#endif + +#define MB +#define WMB + +#ifdef __ECC +#include +#endif + +#define RPCC64BIT + +#ifndef __ECC +static __inline void blas_lock(volatile unsigned long *address){ + + unsigned long ret; + + do { + while (*address) {YIELDING;}; + + __asm__ __volatile__ ("mov ar.ccv=r0\n;;\n" + "cmpxchg4.acq %0=[%2],%1,ar.ccv\n" + : "=r"(ret) : "r"(1), "r"(address) + : "ar.ccv", "memory"); + } while (ret); +} + +static __inline unsigned long rpcc(void) { + unsigned long clocks; + + __asm__ __volatile__ ("mov %0=ar.itc" : "=r"(clocks)); + return clocks; +} + + +static __inline unsigned long stmxcsr(void){ + unsigned long fp; + + __asm__ __volatile__ ("mov.m %0=ar.fpsr" : "=r" (fp)); + + return fp; +} + +static __inline void ldmxcsr(unsigned long fp) { + + __asm__ __volatile__ ("mov.m ar.fpsr=%0" :: "r" (fp)); + +} + +#define GET_IMAGE(res) asm __volatile__("mov %0 = f9" : "=f"(res) : : "memory") + +#else + +static __inline void blas_lock(volatile unsigned long *address){ + while (*address || _InterlockedCompareExchange((volatile int *) address,1,0)) + ; +} + +static __inline unsigned int rpcc(void) { + return __getReg(_IA64_REG_AR_ITC); +} + +static __inline unsigned int stmxcsr(void) { + return __getReg(_IA64_REG_AR_FPSR); +} + +static __inline void ldmxcsr(unsigned long fp) { + + return __setReg(_IA64_REG_AR_FPSR, fp); + +} + +#ifdef DOUBLE +#define GET_IMAGE(res) __stfd(&res, 9) +#else +#define GET_IMAGE(res) __stfs(&res, 9) +#endif + +#endif + +#define GET_IMAGE_CANCEL + +#ifdef ENABLE_SSE_EXCEPTION + +#define IDEBUG_START \ + { \ + unsigned long fp_sse_mode, new_fp_mode; \ + fp_sse_mode = stmxcsr();\ + new_fp_mode = (fp_sse_mode & ~(FE_UNDERFLOW | FE_OVERFLOW | FE_UNNORMAL | FE_INVALID));\ + ldmxcsr(new_fp_mode); + +#define IDEBUG_END \ + ldmxcsr(fp_sse_mode); \ + } + +#endif + +#ifdef SMP + +#ifdef USE64BITINT + +/* 64bit version */ + +extern unsigned long blas_quick_divide_table[]; + +#ifndef __ECC +static __inline long blas_quickdivide(unsigned long int x, unsigned long int y){ + unsigned long ret; + + if (y <= 1) return x; + + __asm__ __volatile__("setf.sig f6 = %1\n\t" + "ldf8 f7 = [%2];;\n\t" + "xmpy.hu f6= f6, f7;;\n\t" + "getf.sig %0 = f6;;\n" + : "=r"(ret) + : "r"(x), 
"r"(&blas_quick_divide_table[y]) : "f6", "f7" + ); + + return ret; +} +#else +/* Using Intel Compiler */ +static __inline long blas_quickdivide(unsigned long int x, unsigned long int y){ + if (y <= 1) return x; + return _m64_xmahu(x, blas_quick_divide_table[y], 0); +} +#endif + +#else + /* 32bit version */ +extern unsigned int blas_quick_divide_table[]; + +static __inline int blas_quickdivide(unsigned int x, unsigned int y){ + if (y <= 1) return x; + return (int)((x * (unsigned long)blas_quick_divide_table[y]) >> 32); +} +#endif +#endif + +#endif + +#if 0 +#ifdef DOUBLE +#define GEMM_NCOPY dgemm_ncopy +#define GEMM_TCOPY dgemm_tcopy +#define ZGEMM_NCOPY zgemm_ncopy +#define ZGEMM_TCOPY zgemm_tcopy +#define GEMM_KERNEL dgemm_kernel + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define ZGEMM_KERNEL zgemm_kernel_n +#endif +#if defined(CN) || defined(CT) || defined(RN) || defined(RT) +#define ZGEMM_KERNEL zgemm_kernel_l +#endif +#if defined(NC) || defined(TC) || defined(NR) || defined(TR) +#define ZGEMM_KERNEL zgemm_kernel_r +#endif +#if defined(CC) || defined(CR) || defined(RC) || defined(RR) +#define ZGEMM_KERNEL zgemm_kernel_b +#endif + +#else +#define GEMM_NCOPY sgemm_ncopy +#define GEMM_TCOPY sgemm_tcopy +#define ZGEMM_NCOPY cgemm_ncopy +#define ZGEMM_TCOPY cgemm_tcopy +#define GEMM_KERNEL sgemm_kernel + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define ZGEMM_KERNEL cgemm_kernel_n +#endif +#if defined(CN) || defined(CT) || defined(RN) || defined(RT) +#define ZGEMM_KERNEL cgemm_kernel_l +#endif +#if defined(NC) || defined(TC) || defined(NR) || defined(TR) +#define ZGEMM_KERNEL cgemm_kernel_r +#endif +#if defined(CC) || defined(CR) || defined(RC) || defined(RR) +#define ZGEMM_KERNEL cgemm_kernel_b +#endif + +#endif +#endif + +#ifdef USE64BITINT +#define LDINT ld8 +#define INTSIZE 8 +#define CMP4GE cmp.ge +#define CMP4NE cmp.ge +#define CMP4EQ cmp.eq +#else +#define LDINT ld4 +#define INTSIZE 4 +#define CMP4GE cmp4.ge +#define CMP4NE cmp4.ne +#define CMP4EQ cmp4.eq +#endif + +#define HALT mov r0 = 0 + +#ifdef XDOUBLE +#define LD8 ld8 +#define ST8 st8 +#define LDFD ldfe +#define LDFPD ldfpe +#define LDFD_T1 ldfe.t1 +#define LDFD_NT1 ldfe.nt1 +#define LDFD_NT2 ldfe.nt2 +#define LDFD_NTA ldfe.nta +#define LDFPD_NT1 ldfpe.nt1 +#define LDFPD_NT2 ldfpe.nt2 +#define LDFPD_NTA ldfpe.nta +#define STFD stfe +#define STFD_NTA stfe.nta +#define FADD fadd +#define FSUB fsub +#define FMPY fmpy +#define FMA fma +#define FMS fms +#define FNMA fnma +#define FPMA fpma +#define SETF setf.d +#elif defined(DOUBLE) +#define LD8 ld8 +#define ST8 st8 +#define LDF8 ldf8 +#define LDF8_NT1 ldf8.nt1 +#define LDF8_NTA ldf8.nta +#define STF8 stf8 +#define STF8_NTA stf8.nta +#define LDFD ldfd +#define LDFPD ldfpd +#define LDFD_T1 ldfd.t1 +#define LDFD_NT1 ldfd.nt1 +#define LDFD_NT2 ldfd.nt2 +#define LDFD_NTA ldfd.nta +#define LDFPD_NT1 ldfpd.nt1 +#define LDFPD_NT2 ldfpd.nt2 +#define LDFPD_NTA ldfpd.nta +#define STFD stfd +#define STFD_NTA stfd.nta +#define FADD fadd.d +#define FSUB fsub.d +#define FMPY fmpy.d +#define FMA fma.d +#define FMS fms.d +#define FNMA fnma.d +#define FPMA fpma.d +#define SETF setf.d +#else +#define LD8 ld4 +#define ST8 st4 +#define LDF8 ldfs +#define LDF8_NT1 ldfs.nt1 +#define LDF8_NTA ldfs.nta +#define STF8 stfs +#define STF8_NTA stfs.nta +#define LDFD ldfs +#define LDFPD ldfps +#define LDFD_T1 ldfs.t1 +#define LDFD_NT1 ldfs.nt1 +#define LDFD_NT2 ldfs.nt2 +#define LDFD_NTA ldfs.nta +#define LDFPD_NT1 ldfps.nt1 +#define LDFPD_NT2 ldfps.nt2 +#define 
LDFPD_NTA ldfps.nta +#define STFD stfs +#define STFD_NTA stfs.nta +#if 0 +#define FADD fadd.s +#define FSUB fsub.s +#define FMPY fmpy.s +#define FMA fma.s +#define FMS fms.s +#define FNMA fnma.s +#define FPMA fpma.s +#else +#define FADD fadd +#define FSUB fsub +#define FMPY fmpy +#define FMA fma +#define FMS fms +#define FNMA fnma +#define FPMA fpma +#endif +#define SETF setf.s +#endif + +#ifndef F_INTERFACE +#define REALNAME ASMNAME +#else +#define REALNAME ASMFNAME +#endif + +#ifdef F_INTERFACE_G77 +#define RETURN_BY_STACK +#endif + +#ifdef F_INTERFACE_G95 +#define RETURN_BY_STACK +#endif + +#ifdef F_INTERFACE_GFORT +#define RETURN_BY_REGS +#endif + +#ifdef F_INTERFACE_INTEL +#define RETURN_BY_STACK +#endif + +#define PROLOGUE \ + .explicit; \ + .text; \ + .align 128; \ + .global REALNAME; \ + .proc REALNAME; \ +REALNAME: + + +#ifdef PROFILE +#define PROFCODE \ + .data; \ + .align 8; \ +.LP0:; \ + data8 0; \ + .text; \ + alloc out0 = ar.pfs, 8, 0, 4, 0; \ + mov out1 = r1; \ + mov out2 = b0; \ + addl out3 = @ltoff(.LP0), r1;;; \ + br.call.sptk.many b0 = _mcount;; +#else +#define PROFCODE +#endif + +#define EPILOGUE \ + .endp REALNAME + +#define START_ADDRESS 0x20000fc800000000UL + +#undef SEEK_ADDRESS + +#if 0 +#ifdef CONFIG_IA64_PAGE_SIZE_4KB +#define SEEK_ADDRESS +#endif + +#ifdef CONFIG_IA64_PAGE_SIZE_8KB +#define SEEK_ADDRESS +#endif +#endif + +#define BUFFER_SIZE (128 << 20) + +#ifndef PAGESIZE +#define PAGESIZE (16UL << 10) +#endif +#define HUGE_PAGESIZE ( 4 << 20) + +#define BASE_ADDRESS (START_ADDRESS - (BLASULONG)BUFFER_SIZE * MAX_CPU_NUMBER) + +#endif diff --git a/common_interface.h b/common_interface.h new file mode 100644 index 0000000000..36bf5aa480 --- /dev/null +++ b/common_interface.h @@ -0,0 +1,736 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef ASSEMBLER + +int BLASFUNC(xerbla)(char *, blasint *info, blasint); + +FLOATRET BLASFUNC(sdot) (blasint *, float *, blasint *, float *, blasint *); +FLOATRET BLASFUNC(sdsdot)(blasint *, float *, float *, blasint *, float *, blasint *); + +double BLASFUNC(dsdot) (blasint *, float *, blasint *, float *, blasint *); +double BLASFUNC(ddot) (blasint *, double *, blasint *, double *, blasint *); +xdouble BLASFUNC(qdot) (blasint *, xdouble *, blasint *, xdouble *, blasint *); + + +#ifdef RETURN_BY_STRUCT +typedef struct { + float r, i; +} myccomplex_t; + +typedef struct { + double r, i; +} myzcomplex_t; + +typedef struct { + xdouble r, i; +} myxcomplex_t; + +myccomplex_t BLASFUNC(cdotu) (blasint *, float *, blasint *, float *, blasint *); +myccomplex_t BLASFUNC(cdotc) (blasint *, float *, blasint *, float *, blasint *); +myzcomplex_t BLASFUNC(zdotu) (blasint *, double *, blasint *, double *, blasint *); +myzcomplex_t BLASFUNC(zdotc) (blasint *, double *, blasint *, double *, blasint *); +myxcomplex_t BLASFUNC(xdotu) (blasint *, xdouble *, blasint *, xdouble *, blasint *); +myxcomplex_t BLASFUNC(xdotc) (blasint *, xdouble *, blasint *, xdouble *, blasint *); + +#elif defined RETURN_BY_STACK +void BLASFUNC(cdotu) (float _Complex *, blasint *, float * , blasint *, float *, blasint *); +void BLASFUNC(cdotc) (float _Complex *, blasint *, float *, blasint *, float *, blasint *); +void BLASFUNC(zdotu) (double _Complex *, blasint *, double *, blasint *, double *, blasint *); +void BLASFUNC(zdotc) (double _Complex *, blasint *, double *, blasint *, double *, blasint *); +void BLASFUNC(xdotu) (xdouble _Complex *, blasint *, xdouble *, blasint *, xdouble *, blasint *); +void BLASFUNC(xdotc) (xdouble _Complex *, blasint *, xdouble *, blasint *, xdouble *, blasint *); +#else +float _Complex BLASFUNC(cdotu) (blasint *, float *, blasint *, float *, blasint *); +float _Complex BLASFUNC(cdotc) (blasint *, float *, blasint *, float *, blasint *); +double _Complex BLASFUNC(zdotu) (blasint *, double *, blasint *, double *, blasint *); +double _Complex BLASFUNC(zdotc) (blasint *, double *, blasint *, double *, blasint *); +xdouble _Complex BLASFUNC(xdotu) (blasint *, xdouble *, blasint *, xdouble *, blasint *); +xdouble _Complex BLASFUNC(xdotc) (blasint *, xdouble *, blasint *, xdouble *, blasint *); +#endif + +void BLASFUNC(saxpy) (blasint *, float *, float *, blasint *, float *, blasint *); +void BLASFUNC(daxpy) (blasint *, double *, double *, blasint *, double *, blasint *); +void BLASFUNC(qaxpy) (blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *); +void BLASFUNC(caxpy) (blasint *, float *, float *, blasint *, float *, blasint *); +void BLASFUNC(zaxpy) (blasint *, double *, double *, blasint *, double *, blasint *); +void BLASFUNC(xaxpy) (blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *); +void BLASFUNC(caxpyc)(blasint *, float *, float *, blasint *, float *, blasint *); +void BLASFUNC(zaxpyc)(blasint *, double *, double *, blasint *, double *, blasint *); +void BLASFUNC(xaxpyc)(blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *); + +void BLASFUNC(scopy) (blasint *, float *, blasint *, float *, blasint *); +void BLASFUNC(dcopy) 
(blasint *, double *, blasint *, double *, blasint *); +void BLASFUNC(qcopy) (blasint *, xdouble *, blasint *, xdouble *, blasint *); +void BLASFUNC(ccopy) (blasint *, float *, blasint *, float *, blasint *); +void BLASFUNC(zcopy) (blasint *, double *, blasint *, double *, blasint *); +void BLASFUNC(xcopy) (blasint *, xdouble *, blasint *, xdouble *, blasint *); + +void BLASFUNC(sswap) (blasint *, float *, blasint *, float *, blasint *); +void BLASFUNC(dswap) (blasint *, double *, blasint *, double *, blasint *); +void BLASFUNC(qswap) (blasint *, xdouble *, blasint *, xdouble *, blasint *); +void BLASFUNC(cswap) (blasint *, float *, blasint *, float *, blasint *); +void BLASFUNC(zswap) (blasint *, double *, blasint *, double *, blasint *); +void BLASFUNC(xswap) (blasint *, xdouble *, blasint *, xdouble *, blasint *); + +FLOATRET BLASFUNC(sasum) (blasint *, float *, blasint *); +FLOATRET BLASFUNC(scasum)(blasint *, float *, blasint *); +double BLASFUNC(dasum) (blasint *, double *, blasint *); +xdouble BLASFUNC(qasum) (blasint *, xdouble *, blasint *); +double BLASFUNC(dzasum)(blasint *, double *, blasint *); +xdouble BLASFUNC(qxasum)(blasint *, xdouble *, blasint *); + +blasint BLASFUNC(isamax)(blasint *, float *, blasint *); +blasint BLASFUNC(idamax)(blasint *, double *, blasint *); +blasint BLASFUNC(iqamax)(blasint *, xdouble *, blasint *); +blasint BLASFUNC(icamax)(blasint *, float *, blasint *); +blasint BLASFUNC(izamax)(blasint *, double *, blasint *); +blasint BLASFUNC(ixamax)(blasint *, xdouble *, blasint *); + +blasint BLASFUNC(ismax) (blasint *, float *, blasint *); +blasint BLASFUNC(idmax) (blasint *, double *, blasint *); +blasint BLASFUNC(iqmax) (blasint *, xdouble *, blasint *); +blasint BLASFUNC(icmax) (blasint *, float *, blasint *); +blasint BLASFUNC(izmax) (blasint *, double *, blasint *); +blasint BLASFUNC(ixmax) (blasint *, xdouble *, blasint *); + +blasint BLASFUNC(isamin)(blasint *, float *, blasint *); +blasint BLASFUNC(idamin)(blasint *, double *, blasint *); +blasint BLASFUNC(iqamin)(blasint *, xdouble *, blasint *); +blasint BLASFUNC(icamin)(blasint *, float *, blasint *); +blasint BLASFUNC(izamin)(blasint *, double *, blasint *); +blasint BLASFUNC(ixamin)(blasint *, xdouble *, blasint *); + +blasint BLASFUNC(ismin)(blasint *, float *, blasint *); +blasint BLASFUNC(idmin)(blasint *, double *, blasint *); +blasint BLASFUNC(iqmin)(blasint *, xdouble *, blasint *); +blasint BLASFUNC(icmin)(blasint *, float *, blasint *); +blasint BLASFUNC(izmin)(blasint *, double *, blasint *); +blasint BLASFUNC(ixmin)(blasint *, xdouble *, blasint *); + +FLOATRET BLASFUNC(samax) (blasint *, float *, blasint *); +double BLASFUNC(damax) (blasint *, double *, blasint *); +xdouble BLASFUNC(qamax) (blasint *, xdouble *, blasint *); +FLOATRET BLASFUNC(scamax)(blasint *, float *, blasint *); +double BLASFUNC(dzamax)(blasint *, double *, blasint *); +xdouble BLASFUNC(qxamax)(blasint *, xdouble *, blasint *); + +FLOATRET BLASFUNC(samin) (blasint *, float *, blasint *); +double BLASFUNC(damin) (blasint *, double *, blasint *); +xdouble BLASFUNC(qamin) (blasint *, xdouble *, blasint *); +FLOATRET BLASFUNC(scamin)(blasint *, float *, blasint *); +double BLASFUNC(dzamin)(blasint *, double *, blasint *); +xdouble BLASFUNC(qxamin)(blasint *, xdouble *, blasint *); + +FLOATRET BLASFUNC(smax) (blasint *, float *, blasint *); +double BLASFUNC(dmax) (blasint *, double *, blasint *); +xdouble BLASFUNC(qmax) (blasint *, xdouble *, blasint *); +FLOATRET BLASFUNC(scmax) (blasint *, float *, blasint *); 
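/* FLOATRET (declared in common.h) abstracts the return type of the
   single-precision functions above: some Fortran calling conventions
   (the f2c/g77 style) return REAL results widened to double, so it may
   expand to either float or double depending on the selected interface. */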
+double BLASFUNC(dzmax) (blasint *, double *, blasint *); +xdouble BLASFUNC(qxmax) (blasint *, xdouble *, blasint *); + +FLOATRET BLASFUNC(smin) (blasint *, float *, blasint *); +double BLASFUNC(dmin) (blasint *, double *, blasint *); +xdouble BLASFUNC(qmin) (blasint *, xdouble *, blasint *); +FLOATRET BLASFUNC(scmin) (blasint *, float *, blasint *); +double BLASFUNC(dzmin) (blasint *, double *, blasint *); +xdouble BLASFUNC(qxmin) (blasint *, xdouble *, blasint *); + +void BLASFUNC(sscal) (blasint *, float *, float *, blasint *); +void BLASFUNC(dscal) (blasint *, double *, double *, blasint *); +void BLASFUNC(qscal) (blasint *, xdouble *, xdouble *, blasint *); +void BLASFUNC(cscal) (blasint *, float *, float *, blasint *); +void BLASFUNC(zscal) (blasint *, double *, double *, blasint *); +void BLASFUNC(xscal) (blasint *, xdouble *, xdouble *, blasint *); +void BLASFUNC(csscal)(blasint *, float *, float *, blasint *); +void BLASFUNC(zdscal)(blasint *, double *, double *, blasint *); +void BLASFUNC(xqscal)(blasint *, xdouble *, xdouble *, blasint *); + +FLOATRET BLASFUNC(snrm2) (blasint *, float *, blasint *); +FLOATRET BLASFUNC(scnrm2)(blasint *, float *, blasint *); + +double BLASFUNC(dnrm2) (blasint *, double *, blasint *); +xdouble BLASFUNC(qnrm2) (blasint *, xdouble *, blasint *); +double BLASFUNC(dznrm2)(blasint *, double *, blasint *); +xdouble BLASFUNC(qxnrm2)(blasint *, xdouble *, blasint *); + +void BLASFUNC(srot) (blasint *, float *, blasint *, float *, blasint *, float *, float *); +void BLASFUNC(drot) (blasint *, double *, blasint *, double *, blasint *, double *, double *); +void BLASFUNC(qrot) (blasint *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *); +void BLASFUNC(csrot) (blasint *, float *, blasint *, float *, blasint *, float *, float *); +void BLASFUNC(zdrot) (blasint *, double *, blasint *, double *, blasint *, double *, double *); +void BLASFUNC(xqrot) (blasint *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *); + +void BLASFUNC(srotg) (float *, float *, float *, float *); +void BLASFUNC(drotg) (double *, double *, double *, double *); +void BLASFUNC(qrotg) (xdouble *, xdouble *, xdouble *, xdouble *); +void BLASFUNC(crotg) (float *, float *, float *, float *); +void BLASFUNC(zrotg) (double *, double *, double *, double *); +void BLASFUNC(xrotg) (xdouble *, xdouble *, xdouble *, xdouble *); + +void BLASFUNC(srotmg)(float *, float *, float *, float *, float *); +void BLASFUNC(drotmg)(double *, double *, double *, double *, double *); + +void BLASFUNC(srotm) (blasint *, float *, blasint *, float *, blasint *, float *); +void BLASFUNC(drotm) (blasint *, double *, blasint *, double *, blasint *, double *); +void BLASFUNC(qrotm) (blasint *, xdouble *, blasint *, xdouble *, blasint *, xdouble *); + +/* Level 2 routines */ + +void BLASFUNC(sger)(blasint *, blasint *, float *, float *, blasint *, + float *, blasint *, float *, blasint *); +void BLASFUNC(dger)(blasint *, blasint *, double *, double *, blasint *, + double *, blasint *, double *, blasint *); +void BLASFUNC(qger)(blasint *, blasint *, xdouble *, xdouble *, blasint *, + xdouble *, blasint *, xdouble *, blasint *); +void BLASFUNC(cgeru)(blasint *, blasint *, float *, float *, blasint *, + float *, blasint *, float *, blasint *); +void BLASFUNC(cgerc)(blasint *, blasint *, float *, float *, blasint *, + float *, blasint *, float *, blasint *); +void BLASFUNC(zgeru)(blasint *, blasint *, double *, double *, blasint *, + double *, blasint *, double *, blasint *); +void 
BLASFUNC(zgerc)(blasint *, blasint *, double *, double *, blasint *, + double *, blasint *, double *, blasint *); +void BLASFUNC(xgeru)(blasint *, blasint *, xdouble *, xdouble *, blasint *, + xdouble *, blasint *, xdouble *, blasint *); +void BLASFUNC(xgerc)(blasint *, blasint *, xdouble *, xdouble *, blasint *, + xdouble *, blasint *, xdouble *, blasint *); + +void BLASFUNC(sgemv)(char *, blasint *, blasint *, float *, float *, blasint *, + float *, blasint *, float *, float *, blasint *); +void BLASFUNC(dgemv)(char *, blasint *, blasint *, double *, double *, blasint *, + double *, blasint *, double *, double *, blasint *); +void BLASFUNC(qgemv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, + xdouble *, blasint *, xdouble *, xdouble *, blasint *); +void BLASFUNC(cgemv)(char *, blasint *, blasint *, float *, float *, blasint *, + float *, blasint *, float *, float *, blasint *); +void BLASFUNC(zgemv)(char *, blasint *, blasint *, double *, double *, blasint *, + double *, blasint *, double *, double *, blasint *); +void BLASFUNC(xgemv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, + xdouble *, blasint *, xdouble *, xdouble *, blasint *); + +void BLASFUNC(strsv) (char *, char *, char *, blasint *, float *, blasint *, + float *, blasint *); +void BLASFUNC(dtrsv) (char *, char *, char *, blasint *, double *, blasint *, + double *, blasint *); +void BLASFUNC(qtrsv) (char *, char *, char *, blasint *, xdouble *, blasint *, + xdouble *, blasint *); +void BLASFUNC(ctrsv) (char *, char *, char *, blasint *, float *, blasint *, + float *, blasint *); +void BLASFUNC(ztrsv) (char *, char *, char *, blasint *, double *, blasint *, + double *, blasint *); +void BLASFUNC(xtrsv) (char *, char *, char *, blasint *, xdouble *, blasint *, + xdouble *, blasint *); + +void BLASFUNC(strmv) (char *, char *, char *, blasint *, float *, blasint *, + float *, blasint *); +void BLASFUNC(dtrmv) (char *, char *, char *, blasint *, double *, blasint *, + double *, blasint *); +void BLASFUNC(qtrmv) (char *, char *, char *, blasint *, xdouble *, blasint *, + xdouble *, blasint *); +void BLASFUNC(ctrmv) (char *, char *, char *, blasint *, float *, blasint *, + float *, blasint *); +void BLASFUNC(ztrmv) (char *, char *, char *, blasint *, double *, blasint *, + double *, blasint *); +void BLASFUNC(xtrmv) (char *, char *, char *, blasint *, xdouble *, blasint *, + xdouble *, blasint *); + +void BLASFUNC(stpsv) (char *, char *, char *, blasint *, float *, float *, blasint *); +void BLASFUNC(dtpsv) (char *, char *, char *, blasint *, double *, double *, blasint *); +void BLASFUNC(qtpsv) (char *, char *, char *, blasint *, xdouble *, xdouble *, blasint *); +void BLASFUNC(ctpsv) (char *, char *, char *, blasint *, float *, float *, blasint *); +void BLASFUNC(ztpsv) (char *, char *, char *, blasint *, double *, double *, blasint *); +void BLASFUNC(xtpsv) (char *, char *, char *, blasint *, xdouble *, xdouble *, blasint *); + +void BLASFUNC(stpmv) (char *, char *, char *, blasint *, float *, float *, blasint *); +void BLASFUNC(dtpmv) (char *, char *, char *, blasint *, double *, double *, blasint *); +void BLASFUNC(qtpmv) (char *, char *, char *, blasint *, xdouble *, xdouble *, blasint *); +void BLASFUNC(ctpmv) (char *, char *, char *, blasint *, float *, float *, blasint *); +void BLASFUNC(ztpmv) (char *, char *, char *, blasint *, double *, double *, blasint *); +void BLASFUNC(xtpmv) (char *, char *, char *, blasint *, xdouble *, xdouble *, blasint *); + +void BLASFUNC(stbmv) (char *, char 
*, char *, blasint *, blasint *, float *, blasint *, float *, blasint *); +void BLASFUNC(dtbmv) (char *, char *, char *, blasint *, blasint *, double *, blasint *, double *, blasint *); +void BLASFUNC(qtbmv) (char *, char *, char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *); +void BLASFUNC(ctbmv) (char *, char *, char *, blasint *, blasint *, float *, blasint *, float *, blasint *); +void BLASFUNC(ztbmv) (char *, char *, char *, blasint *, blasint *, double *, blasint *, double *, blasint *); +void BLASFUNC(xtbmv) (char *, char *, char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *); + +void BLASFUNC(stbsv) (char *, char *, char *, blasint *, blasint *, float *, blasint *, float *, blasint *); +void BLASFUNC(dtbsv) (char *, char *, char *, blasint *, blasint *, double *, blasint *, double *, blasint *); +void BLASFUNC(qtbsv) (char *, char *, char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *); +void BLASFUNC(ctbsv) (char *, char *, char *, blasint *, blasint *, float *, blasint *, float *, blasint *); +void BLASFUNC(ztbsv) (char *, char *, char *, blasint *, blasint *, double *, blasint *, double *, blasint *); +void BLASFUNC(xtbsv) (char *, char *, char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *); + +void BLASFUNC(ssymv) (char *, blasint *, float *, float *, blasint *, + float *, blasint *, float *, float *, blasint *); +void BLASFUNC(dsymv) (char *, blasint *, double *, double *, blasint *, + double *, blasint *, double *, double *, blasint *); +void BLASFUNC(qsymv) (char *, blasint *, xdouble *, xdouble *, blasint *, + xdouble *, blasint *, xdouble *, xdouble *, blasint *); +void BLASFUNC(csymv) (char *, blasint *, float *, float *, blasint *, + float *, blasint *, float *, float *, blasint *); +void BLASFUNC(zsymv) (char *, blasint *, double *, double *, blasint *, + double *, blasint *, double *, double *, blasint *); +void BLASFUNC(xsymv) (char *, blasint *, xdouble *, xdouble *, blasint *, + xdouble *, blasint *, xdouble *, xdouble *, blasint *); + +void BLASFUNC(sspmv) (char *, blasint *, float *, float *, + float *, blasint *, float *, float *, blasint *); +void BLASFUNC(dspmv) (char *, blasint *, double *, double *, + double *, blasint *, double *, double *, blasint *); +void BLASFUNC(qspmv) (char *, blasint *, xdouble *, xdouble *, + xdouble *, blasint *, xdouble *, xdouble *, blasint *); +void BLASFUNC(cspmv) (char *, blasint *, float *, float *, + float *, blasint *, float *, float *, blasint *); +void BLASFUNC(zspmv) (char *, blasint *, double *, double *, + double *, blasint *, double *, double *, blasint *); +void BLASFUNC(xspmv) (char *, blasint *, xdouble *, xdouble *, + xdouble *, blasint *, xdouble *, xdouble *, blasint *); + +void BLASFUNC(ssyr) (char *, blasint *, float *, float *, blasint *, + float *, blasint *); +void BLASFUNC(dsyr) (char *, blasint *, double *, double *, blasint *, + double *, blasint *); +void BLASFUNC(qsyr) (char *, blasint *, xdouble *, xdouble *, blasint *, + xdouble *, blasint *); +void BLASFUNC(csyr) (char *, blasint *, float *, float *, blasint *, + float *, blasint *); +void BLASFUNC(zsyr) (char *, blasint *, double *, double *, blasint *, + double *, blasint *); +void BLASFUNC(xsyr) (char *, blasint *, xdouble *, xdouble *, blasint *, + xdouble *, blasint *); + +void BLASFUNC(ssyr2) (char *, blasint *, float *, + float *, blasint *, float *, blasint *, float *, blasint *); +void BLASFUNC(dsyr2) (char *, blasint *, double *, + double *, blasint *, 
double *, blasint *, double *, blasint *); +void BLASFUNC(qsyr2) (char *, blasint *, xdouble *, + xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *); +void BLASFUNC(csyr2) (char *, blasint *, float *, + float *, blasint *, float *, blasint *, float *, blasint *); +void BLASFUNC(zsyr2) (char *, blasint *, double *, + double *, blasint *, double *, blasint *, double *, blasint *); +void BLASFUNC(xsyr2) (char *, blasint *, xdouble *, + xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *); + +void BLASFUNC(sspr) (char *, blasint *, float *, float *, blasint *, + float *); +void BLASFUNC(dspr) (char *, blasint *, double *, double *, blasint *, + double *); +void BLASFUNC(qspr) (char *, blasint *, xdouble *, xdouble *, blasint *, + xdouble *); +void BLASFUNC(cspr) (char *, blasint *, float *, float *, blasint *, + float *); +void BLASFUNC(zspr) (char *, blasint *, double *, double *, blasint *, + double *); +void BLASFUNC(xspr) (char *, blasint *, xdouble *, xdouble *, blasint *, + xdouble *); + +void BLASFUNC(sspr2) (char *, blasint *, float *, + float *, blasint *, float *, blasint *, float *); +void BLASFUNC(dspr2) (char *, blasint *, double *, + double *, blasint *, double *, blasint *, double *); +void BLASFUNC(qspr2) (char *, blasint *, xdouble *, + xdouble *, blasint *, xdouble *, blasint *, xdouble *); +void BLASFUNC(cspr2) (char *, blasint *, float *, + float *, blasint *, float *, blasint *, float *); +void BLASFUNC(zspr2) (char *, blasint *, double *, + double *, blasint *, double *, blasint *, double *); +void BLASFUNC(xspr2) (char *, blasint *, xdouble *, + xdouble *, blasint *, xdouble *, blasint *, xdouble *); + +void BLASFUNC(cher) (char *, blasint *, float *, float *, blasint *, + float *, blasint *); +void BLASFUNC(zher) (char *, blasint *, double *, double *, blasint *, + double *, blasint *); +void BLASFUNC(xher) (char *, blasint *, xdouble *, xdouble *, blasint *, + xdouble *, blasint *); + +void BLASFUNC(chpr) (char *, blasint *, float *, float *, blasint *, float *); +void BLASFUNC(zhpr) (char *, blasint *, double *, double *, blasint *, double *); +void BLASFUNC(xhpr) (char *, blasint *, xdouble *, xdouble *, blasint *, xdouble *); + +void BLASFUNC(cher2) (char *, blasint *, float *, + float *, blasint *, float *, blasint *, float *, blasint *); +void BLASFUNC(zher2) (char *, blasint *, double *, + double *, blasint *, double *, blasint *, double *, blasint *); +void BLASFUNC(xher2) (char *, blasint *, xdouble *, + xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *); + +void BLASFUNC(chpr2) (char *, blasint *, float *, + float *, blasint *, float *, blasint *, float *); +void BLASFUNC(zhpr2) (char *, blasint *, double *, + double *, blasint *, double *, blasint *, double *); +void BLASFUNC(xhpr2) (char *, blasint *, xdouble *, + xdouble *, blasint *, xdouble *, blasint *, xdouble *); + +void BLASFUNC(chemv) (char *, blasint *, float *, float *, blasint *, + float *, blasint *, float *, float *, blasint *); +void BLASFUNC(zhemv) (char *, blasint *, double *, double *, blasint *, + double *, blasint *, double *, double *, blasint *); +void BLASFUNC(xhemv) (char *, blasint *, xdouble *, xdouble *, blasint *, + xdouble *, blasint *, xdouble *, xdouble *, blasint *); + +void BLASFUNC(chpmv) (char *, blasint *, float *, float *, + float *, blasint *, float *, float *, blasint *); +void BLASFUNC(zhpmv) (char *, blasint *, double *, double *, + double *, blasint *, double *, double *, blasint *); +void BLASFUNC(xhpmv) (char *, blasint *, 
xdouble *, xdouble *, + xdouble *, blasint *, xdouble *, xdouble *, blasint *); + +int BLASFUNC(snorm)(char *, blasint *, blasint *, float *, blasint *); +int BLASFUNC(dnorm)(char *, blasint *, blasint *, double *, blasint *); +int BLASFUNC(cnorm)(char *, blasint *, blasint *, float *, blasint *); +int BLASFUNC(znorm)(char *, blasint *, blasint *, double *, blasint *); + +void BLASFUNC(sgbmv)(char *, blasint *, blasint *, blasint *, blasint *, float *, float *, blasint *, + float *, blasint *, float *, float *, blasint *); +void BLASFUNC(dgbmv)(char *, blasint *, blasint *, blasint *, blasint *, double *, double *, blasint *, + double *, blasint *, double *, double *, blasint *); +void BLASFUNC(qgbmv)(char *, blasint *, blasint *, blasint *, blasint *, xdouble *, xdouble *, blasint *, + xdouble *, blasint *, xdouble *, xdouble *, blasint *); +void BLASFUNC(cgbmv)(char *, blasint *, blasint *, blasint *, blasint *, float *, float *, blasint *, + float *, blasint *, float *, float *, blasint *); +void BLASFUNC(zgbmv)(char *, blasint *, blasint *, blasint *, blasint *, double *, double *, blasint *, + double *, blasint *, double *, double *, blasint *); +void BLASFUNC(xgbmv)(char *, blasint *, blasint *, blasint *, blasint *, xdouble *, xdouble *, blasint *, + xdouble *, blasint *, xdouble *, xdouble *, blasint *); + +void BLASFUNC(ssbmv)(char *, blasint *, blasint *, float *, float *, blasint *, + float *, blasint *, float *, float *, blasint *); +void BLASFUNC(dsbmv)(char *, blasint *, blasint *, double *, double *, blasint *, + double *, blasint *, double *, double *, blasint *); +void BLASFUNC(qsbmv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, + xdouble *, blasint *, xdouble *, xdouble *, blasint *); +void BLASFUNC(csbmv)(char *, blasint *, blasint *, float *, float *, blasint *, + float *, blasint *, float *, float *, blasint *); +void BLASFUNC(zsbmv)(char *, blasint *, blasint *, double *, double *, blasint *, + double *, blasint *, double *, double *, blasint *); +void BLASFUNC(xsbmv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, + xdouble *, blasint *, xdouble *, xdouble *, blasint *); + +void BLASFUNC(chbmv)(char *, blasint *, blasint *, float *, float *, blasint *, + float *, blasint *, float *, float *, blasint *); +void BLASFUNC(zhbmv)(char *, blasint *, blasint *, double *, double *, blasint *, + double *, blasint *, double *, double *, blasint *); +void BLASFUNC(xhbmv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, + xdouble *, blasint *, xdouble *, xdouble *, blasint *); + +/* Level 3 routines */ + +void BLASFUNC(sgemm)(char *, char *, blasint *, blasint *, blasint *, float *, + float *, blasint *, float *, blasint *, float *, float *, blasint *); +void BLASFUNC(dgemm)(char *, char *, blasint *, blasint *, blasint *, double *, + double *, blasint *, double *, blasint *, double *, double *, blasint *); +void BLASFUNC(qgemm)(char *, char *, blasint *, blasint *, blasint *, xdouble *, + xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); +void BLASFUNC(cgemm)(char *, char *, blasint *, blasint *, blasint *, float *, + float *, blasint *, float *, blasint *, float *, float *, blasint *); +void BLASFUNC(zgemm)(char *, char *, blasint *, blasint *, blasint *, double *, + double *, blasint *, double *, blasint *, double *, double *, blasint *); +void BLASFUNC(xgemm)(char *, char *, blasint *, blasint *, blasint *, xdouble *, + xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); + 
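These Level 3 prototypes follow the Fortran calling convention: every argument, including the scalars and the single-character option flags, is passed by address. A minimal sketch of driving dgemm through this interface (the wrapper name is made up; blasint and BLASFUNC are taken from common.h):

#include "common.h"   /* blasint, BLASFUNC(), FLOATRET, ... */

/* C = alpha*A*B + beta*C for square n x n column-major matrices. */
static void dgemm_square(blasint n, double *a, double *b, double *c) {
  char   trans = 'N';          /* no transposition of either operand */
  double alpha = 1.0, beta = 0.0;
  BLASFUNC(dgemm)(&trans, &trans, &n, &n, &n,
                  &alpha, a, &n, b, &n, &beta, c, &n);
}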
+void BLASFUNC(cgemm3m)(char *, char *, blasint *, blasint *, blasint *, float *, + float *, blasint *, float *, blasint *, float *, float *, blasint *); +void BLASFUNC(zgemm3m)(char *, char *, blasint *, blasint *, blasint *, double *, + double *, blasint *, double *, blasint *, double *, double *, blasint *); +void BLASFUNC(xgemm3m)(char *, char *, blasint *, blasint *, blasint *, xdouble *, + xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); + +int BLASFUNC(sge2mm)(char *, char *, char *, blasint *, blasint *, + float *, float *, blasint *, float *, blasint *, + float *, float *, blasint *); +int BLASFUNC(dge2mm)(char *, char *, char *, blasint *, blasint *, + double *, double *, blasint *, double *, blasint *, + double *, double *, blasint *); +int BLASFUNC(cge2mm)(char *, char *, char *, blasint *, blasint *, + float *, float *, blasint *, float *, blasint *, + float *, float *, blasint *); +int BLASFUNC(zge2mm)(char *, char *, char *, blasint *, blasint *, + double *, double *, blasint *, double *, blasint *, + double *, double *, blasint *); + +void BLASFUNC(strsm)(char *, char *, char *, char *, blasint *, blasint *, + float *, float *, blasint *, float *, blasint *); +void BLASFUNC(dtrsm)(char *, char *, char *, char *, blasint *, blasint *, + double *, double *, blasint *, double *, blasint *); +void BLASFUNC(qtrsm)(char *, char *, char *, char *, blasint *, blasint *, + xdouble *, xdouble *, blasint *, xdouble *, blasint *); +void BLASFUNC(ctrsm)(char *, char *, char *, char *, blasint *, blasint *, + float *, float *, blasint *, float *, blasint *); +void BLASFUNC(ztrsm)(char *, char *, char *, char *, blasint *, blasint *, + double *, double *, blasint *, double *, blasint *); +void BLASFUNC(xtrsm)(char *, char *, char *, char *, blasint *, blasint *, + xdouble *, xdouble *, blasint *, xdouble *, blasint *); + +void BLASFUNC(strmm)(char *, char *, char *, char *, blasint *, blasint *, + float *, float *, blasint *, float *, blasint *); +void BLASFUNC(dtrmm)(char *, char *, char *, char *, blasint *, blasint *, + double *, double *, blasint *, double *, blasint *); +void BLASFUNC(qtrmm)(char *, char *, char *, char *, blasint *, blasint *, + xdouble *, xdouble *, blasint *, xdouble *, blasint *); +void BLASFUNC(ctrmm)(char *, char *, char *, char *, blasint *, blasint *, + float *, float *, blasint *, float *, blasint *); +void BLASFUNC(ztrmm)(char *, char *, char *, char *, blasint *, blasint *, + double *, double *, blasint *, double *, blasint *); +void BLASFUNC(xtrmm)(char *, char *, char *, char *, blasint *, blasint *, + xdouble *, xdouble *, blasint *, xdouble *, blasint *); + +void BLASFUNC(ssymm)(char *, char *, blasint *, blasint *, float *, float *, blasint *, + float *, blasint *, float *, float *, blasint *); +void BLASFUNC(dsymm)(char *, char *, blasint *, blasint *, double *, double *, blasint *, + double *, blasint *, double *, double *, blasint *); +void BLASFUNC(qsymm)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, + xdouble *, blasint *, xdouble *, xdouble *, blasint *); +void BLASFUNC(csymm)(char *, char *, blasint *, blasint *, float *, float *, blasint *, + float *, blasint *, float *, float *, blasint *); +void BLASFUNC(zsymm)(char *, char *, blasint *, blasint *, double *, double *, blasint *, + double *, blasint *, double *, double *, blasint *); +void BLASFUNC(xsymm)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, + xdouble *, blasint *, xdouble *, xdouble *, blasint *); + +void 
BLASFUNC(csymm3m)(char *, char *, blasint *, blasint *, float *, float *, blasint *, + float *, blasint *, float *, float *, blasint *); +void BLASFUNC(zsymm3m)(char *, char *, blasint *, blasint *, double *, double *, blasint *, + double *, blasint *, double *, double *, blasint *); +void BLASFUNC(xsymm3m)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, + xdouble *, blasint *, xdouble *, xdouble *, blasint *); + +void BLASFUNC(ssyrk)(char *, char *, blasint *, blasint *, float *, float *, blasint *, + float *, float *, blasint *); +void BLASFUNC(dsyrk)(char *, char *, blasint *, blasint *, double *, double *, blasint *, + double *, double *, blasint *); +void BLASFUNC(qsyrk)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, + xdouble *, xdouble *, blasint *); +void BLASFUNC(csyrk)(char *, char *, blasint *, blasint *, float *, float *, blasint *, + float *, float *, blasint *); +void BLASFUNC(zsyrk)(char *, char *, blasint *, blasint *, double *, double *, blasint *, + double *, double *, blasint *); +void BLASFUNC(xsyrk)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, + xdouble *, xdouble *, blasint *); + +void BLASFUNC(ssyr2k)(char *, char *, blasint *, blasint *, float *, float *, blasint *, + float *, blasint *, float *, float *, blasint *); +void BLASFUNC(dsyr2k)(char *, char *, blasint *, blasint *, double *, double *, blasint *, + double*, blasint *, double *, double *, blasint *); +void BLASFUNC(qsyr2k)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, + xdouble*, blasint *, xdouble *, xdouble *, blasint *); +void BLASFUNC(csyr2k)(char *, char *, blasint *, blasint *, float *, float *, blasint *, + float *, blasint *, float *, float *, blasint *); +void BLASFUNC(zsyr2k)(char *, char *, blasint *, blasint *, double *, double *, blasint *, + double*, blasint *, double *, double *, blasint *); +void BLASFUNC(xsyr2k)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, + xdouble*, blasint *, xdouble *, xdouble *, blasint *); + +void BLASFUNC(chemm)(char *, char *, blasint *, blasint *, float *, float *, blasint *, + float *, blasint *, float *, float *, blasint *); +void BLASFUNC(zhemm)(char *, char *, blasint *, blasint *, double *, double *, blasint *, + double *, blasint *, double *, double *, blasint *); +void BLASFUNC(xhemm)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, + xdouble *, blasint *, xdouble *, xdouble *, blasint *); + +void BLASFUNC(chemm3m)(char *, char *, blasint *, blasint *, float *, float *, blasint *, + float *, blasint *, float *, float *, blasint *); +void BLASFUNC(zhemm3m)(char *, char *, blasint *, blasint *, double *, double *, blasint *, + double *, blasint *, double *, double *, blasint *); +void BLASFUNC(xhemm3m)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, + xdouble *, blasint *, xdouble *, xdouble *, blasint *); + +void BLASFUNC(cherk)(char *, char *, blasint *, blasint *, float *, float *, blasint *, + float *, float *, blasint *); +void BLASFUNC(zherk)(char *, char *, blasint *, blasint *, double *, double *, blasint *, + double *, double *, blasint *); +void BLASFUNC(xherk)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, + xdouble *, xdouble *, blasint *); + +void BLASFUNC(cher2k)(char *, char *, blasint *, blasint *, float *, float *, blasint *, + float *, blasint *, float *, float *, blasint *); +void BLASFUNC(zher2k)(char *, char *, blasint *, blasint *, double *, double 
*, blasint *, + double*, blasint *, double *, double *, blasint *); +void BLASFUNC(xher2k)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, + xdouble*, blasint *, xdouble *, xdouble *, blasint *); + +int BLASFUNC(cher2m)(char *, char *, char *, blasint *, blasint *, float *, float *, blasint *, + float *, blasint *, float *, float *, blasint *); +int BLASFUNC(zher2m)(char *, char *, char *, blasint *, blasint *, double *, double *, blasint *, + double*, blasint *, double *, double *, blasint *); +int BLASFUNC(xher2m)(char *, char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *, + xdouble*, blasint *, xdouble *, xdouble *, blasint *); + +int BLASFUNC(sgemt)(char *, blasint *, blasint *, float *, float *, blasint *, + float *, blasint *); +int BLASFUNC(dgemt)(char *, blasint *, blasint *, double *, double *, blasint *, + double *, blasint *); +int BLASFUNC(cgemt)(char *, blasint *, blasint *, float *, float *, blasint *, + float *, blasint *); +int BLASFUNC(zgemt)(char *, blasint *, blasint *, double *, double *, blasint *, + double *, blasint *); + +int BLASFUNC(sgema)(char *, char *, blasint *, blasint *, float *, + float *, blasint *, float *, float *, blasint *, float *, blasint *); +int BLASFUNC(dgema)(char *, char *, blasint *, blasint *, double *, + double *, blasint *, double*, double *, blasint *, double*, blasint *); +int BLASFUNC(cgema)(char *, char *, blasint *, blasint *, float *, + float *, blasint *, float *, float *, blasint *, float *, blasint *); +int BLASFUNC(zgema)(char *, char *, blasint *, blasint *, double *, + double *, blasint *, double*, double *, blasint *, double*, blasint *); + +int BLASFUNC(sgems)(char *, char *, blasint *, blasint *, float *, + float *, blasint *, float *, float *, blasint *, float *, blasint *); +int BLASFUNC(dgems)(char *, char *, blasint *, blasint *, double *, + double *, blasint *, double*, double *, blasint *, double*, blasint *); +int BLASFUNC(cgems)(char *, char *, blasint *, blasint *, float *, + float *, blasint *, float *, float *, blasint *, float *, blasint *); +int BLASFUNC(zgems)(char *, char *, blasint *, blasint *, double *, + double *, blasint *, double*, double *, blasint *, double*, blasint *); + +int BLASFUNC(sgemc)(char *, char *, blasint *, blasint *, blasint *, float *, + float *, blasint *, float *, blasint *, float *, blasint *, float *, float *, blasint *); +int BLASFUNC(dgemc)(char *, char *, blasint *, blasint *, blasint *, double *, + double *, blasint *, double *, blasint *, double *, blasint *, double *, double *, blasint *); +int BLASFUNC(qgemc)(char *, char *, blasint *, blasint *, blasint *, xdouble *, + xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); +int BLASFUNC(cgemc)(char *, char *, blasint *, blasint *, blasint *, float *, + float *, blasint *, float *, blasint *, float *, blasint *, float *, float *, blasint *); +int BLASFUNC(zgemc)(char *, char *, blasint *, blasint *, blasint *, double *, + double *, blasint *, double *, blasint *, double *, blasint *, double *, double *, blasint *); +int BLASFUNC(xgemc)(char *, char *, blasint *, blasint *, blasint *, xdouble *, + xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); + +int BLASFUNC(sgetf2)(blasint *, blasint *, float *, blasint *, blasint *, blasint *); +int BLASFUNC(dgetf2)(blasint *, blasint *, double *, blasint *, blasint *, blasint *); +int BLASFUNC(qgetf2)(blasint *, blasint *, xdouble *, blasint *, blasint *, 
blasint *); +int BLASFUNC(cgetf2)(blasint *, blasint *, float *, blasint *, blasint *, blasint *); +int BLASFUNC(zgetf2)(blasint *, blasint *, double *, blasint *, blasint *, blasint *); +int BLASFUNC(xgetf2)(blasint *, blasint *, xdouble *, blasint *, blasint *, blasint *); + +int BLASFUNC(sgetrf)(blasint *, blasint *, float *, blasint *, blasint *, blasint *); +int BLASFUNC(dgetrf)(blasint *, blasint *, double *, blasint *, blasint *, blasint *); +int BLASFUNC(qgetrf)(blasint *, blasint *, xdouble *, blasint *, blasint *, blasint *); +int BLASFUNC(cgetrf)(blasint *, blasint *, float *, blasint *, blasint *, blasint *); +int BLASFUNC(zgetrf)(blasint *, blasint *, double *, blasint *, blasint *, blasint *); +int BLASFUNC(xgetrf)(blasint *, blasint *, xdouble *, blasint *, blasint *, blasint *); + +int BLASFUNC(slaswp)(blasint *, float *, blasint *, blasint *, blasint *, blasint *, blasint *); +int BLASFUNC(dlaswp)(blasint *, double *, blasint *, blasint *, blasint *, blasint *, blasint *); +int BLASFUNC(qlaswp)(blasint *, xdouble *, blasint *, blasint *, blasint *, blasint *, blasint *); +int BLASFUNC(claswp)(blasint *, float *, blasint *, blasint *, blasint *, blasint *, blasint *); +int BLASFUNC(zlaswp)(blasint *, double *, blasint *, blasint *, blasint *, blasint *, blasint *); +int BLASFUNC(xlaswp)(blasint *, xdouble *, blasint *, blasint *, blasint *, blasint *, blasint *); + +int BLASFUNC(sgetrs)(char *, blasint *, blasint *, float *, blasint *, blasint *, float *, blasint *, blasint *); +int BLASFUNC(dgetrs)(char *, blasint *, blasint *, double *, blasint *, blasint *, double *, blasint *, blasint *); +int BLASFUNC(qgetrs)(char *, blasint *, blasint *, xdouble *, blasint *, blasint *, xdouble *, blasint *, blasint *); +int BLASFUNC(cgetrs)(char *, blasint *, blasint *, float *, blasint *, blasint *, float *, blasint *, blasint *); +int BLASFUNC(zgetrs)(char *, blasint *, blasint *, double *, blasint *, blasint *, double *, blasint *, blasint *); +int BLASFUNC(xgetrs)(char *, blasint *, blasint *, xdouble *, blasint *, blasint *, xdouble *, blasint *, blasint *); + +int BLASFUNC(sgesv)(blasint *, blasint *, float *, blasint *, blasint *, float *, blasint *, blasint *); +int BLASFUNC(dgesv)(blasint *, blasint *, double *, blasint *, blasint *, double*, blasint *, blasint *); +int BLASFUNC(qgesv)(blasint *, blasint *, xdouble *, blasint *, blasint *, xdouble*, blasint *, blasint *); +int BLASFUNC(cgesv)(blasint *, blasint *, float *, blasint *, blasint *, float *, blasint *, blasint *); +int BLASFUNC(zgesv)(blasint *, blasint *, double *, blasint *, blasint *, double*, blasint *, blasint *); +int BLASFUNC(xgesv)(blasint *, blasint *, xdouble *, blasint *, blasint *, xdouble*, blasint *, blasint *); + +int BLASFUNC(spotf2)(char *, blasint *, float *, blasint *, blasint *); +int BLASFUNC(dpotf2)(char *, blasint *, double *, blasint *, blasint *); +int BLASFUNC(qpotf2)(char *, blasint *, xdouble *, blasint *, blasint *); +int BLASFUNC(cpotf2)(char *, blasint *, float *, blasint *, blasint *); +int BLASFUNC(zpotf2)(char *, blasint *, double *, blasint *, blasint *); +int BLASFUNC(xpotf2)(char *, blasint *, xdouble *, blasint *, blasint *); + +int BLASFUNC(spotrf)(char *, blasint *, float *, blasint *, blasint *); +int BLASFUNC(dpotrf)(char *, blasint *, double *, blasint *, blasint *); +int BLASFUNC(qpotrf)(char *, blasint *, xdouble *, blasint *, blasint *); +int BLASFUNC(cpotrf)(char *, blasint *, float *, blasint *, blasint *); +int BLASFUNC(zpotrf)(char *, blasint *, double *, blasint 
*, blasint *); +int BLASFUNC(xpotrf)(char *, blasint *, xdouble *, blasint *, blasint *); + +int BLASFUNC(slauu2)(char *, blasint *, float *, blasint *, blasint *); +int BLASFUNC(dlauu2)(char *, blasint *, double *, blasint *, blasint *); +int BLASFUNC(qlauu2)(char *, blasint *, xdouble *, blasint *, blasint *); +int BLASFUNC(clauu2)(char *, blasint *, float *, blasint *, blasint *); +int BLASFUNC(zlauu2)(char *, blasint *, double *, blasint *, blasint *); +int BLASFUNC(xlauu2)(char *, blasint *, xdouble *, blasint *, blasint *); + +int BLASFUNC(slauum)(char *, blasint *, float *, blasint *, blasint *); +int BLASFUNC(dlauum)(char *, blasint *, double *, blasint *, blasint *); +int BLASFUNC(qlauum)(char *, blasint *, xdouble *, blasint *, blasint *); +int BLASFUNC(clauum)(char *, blasint *, float *, blasint *, blasint *); +int BLASFUNC(zlauum)(char *, blasint *, double *, blasint *, blasint *); +int BLASFUNC(xlauum)(char *, blasint *, xdouble *, blasint *, blasint *); + +int BLASFUNC(strti2)(char *, char *, blasint *, float *, blasint *, blasint *); +int BLASFUNC(dtrti2)(char *, char *, blasint *, double *, blasint *, blasint *); +int BLASFUNC(qtrti2)(char *, char *, blasint *, xdouble *, blasint *, blasint *); +int BLASFUNC(ctrti2)(char *, char *, blasint *, float *, blasint *, blasint *); +int BLASFUNC(ztrti2)(char *, char *, blasint *, double *, blasint *, blasint *); +int BLASFUNC(xtrti2)(char *, char *, blasint *, xdouble *, blasint *, blasint *); + +int BLASFUNC(strtri)(char *, char *, blasint *, float *, blasint *, blasint *); +int BLASFUNC(dtrtri)(char *, char *, blasint *, double *, blasint *, blasint *); +int BLASFUNC(qtrtri)(char *, char *, blasint *, xdouble *, blasint *, blasint *); +int BLASFUNC(ctrtri)(char *, char *, blasint *, float *, blasint *, blasint *); +int BLASFUNC(ztrtri)(char *, char *, blasint *, double *, blasint *, blasint *); +int BLASFUNC(xtrtri)(char *, char *, blasint *, xdouble *, blasint *, blasint *); + +int BLASFUNC(spotri)(char *, blasint *, float *, blasint *, blasint *); +int BLASFUNC(dpotri)(char *, blasint *, double *, blasint *, blasint *); +int BLASFUNC(qpotri)(char *, blasint *, xdouble *, blasint *, blasint *); +int BLASFUNC(cpotri)(char *, blasint *, float *, blasint *, blasint *); +int BLASFUNC(zpotri)(char *, blasint *, double *, blasint *, blasint *); +int BLASFUNC(xpotri)(char *, blasint *, xdouble *, blasint *, blasint *); + +int BLASFUNC(slarf)(char *, blasint *, blasint *, float *, blasint *, float *, float *, blasint *, float *); +int BLASFUNC(dlarf)(char *, blasint *, blasint *, double *, blasint *, double *, double *, blasint *, double *); +int BLASFUNC(qlarf)(char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *, xdouble *); +int BLASFUNC(clarf)(char *, blasint *, blasint *, float *, blasint *, float *, float *, blasint *, float *); +int BLASFUNC(zlarf)(char *, blasint *, blasint *, double *, blasint *, double *, double *, blasint *, double *); +int BLASFUNC(xlarf)(char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *, xdouble *); + +FLOATRET BLASFUNC(slamch)(char *); +double BLASFUNC(dlamch)(char *); +xdouble BLASFUNC(qlamch)(char *); + +FLOATRET BLASFUNC(slamc3)(float *, float *); +double BLASFUNC(dlamc3)(double *, double *); +xdouble BLASFUNC(qlamc3)(xdouble *, xdouble *); +#endif diff --git a/common_lapack.h b/common_lapack.h new file mode 100644 index 0000000000..f6d1956fc9 --- /dev/null +++ b/common_lapack.h @@ -0,0 +1,296 @@ 
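/*
 * Minimal usage sketch for the Fortran-convention prototypes declared above
 * in common_interface.h: every argument, including scalar dimensions, is
 * passed by address, and the final blasint pointer receives INFO.  This
 * assumes BLASFUNC(dgesv) resolves to the usual underscored symbol dgesv_
 * and that blasint has the width of a plain int (the default 32-bit
 * interface); both are build-dependent, so treat this as illustrative only.
 */
#include <stdio.h>

extern int dgesv_(int *n, int *nrhs, double *a, int *lda,
                  int *ipiv, double *b, int *ldb, int *info);

int main(void)
{
    /* Solve A * x = b for a column-major 3x3 system; A and b are
     * overwritten by the LU factors and the solution, respectively. */
    double a[9] = { 4, 1, 2,   1, 3, 0,   2, 0, 5 };
    double b[3] = { 1, 2, 3 };
    int n = 3, nrhs = 1, lda = 3, ldb = 3, ipiv[3], info;

    dgesv_(&n, &nrhs, a, &lda, ipiv, b, &ldb, &info);
    if (info != 0) {
        fprintf(stderr, "dgesv: INFO = %d\n", info);
        return 1;
    }
    printf("x = (%g, %g, %g)\n", b[0], b[1], b[2]);
    return 0;
}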
+/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#ifndef ASSEMBLER + +/* Lapack Library */ + +blasint sgetf2_k(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint dgetf2_k(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint qgetf2_k(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint cgetf2_k(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint zgetf2_k(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint xgetf2_k(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +blasint sgetrf_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint dgetrf_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint qgetrf_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint cgetrf_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint zgetrf_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint xgetrf_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +blasint sgetrf_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint dgetrf_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint qgetrf_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint cgetrf_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint zgetrf_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint xgetrf_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int slaswp_plus (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, blasint *, BLASLONG); +int slaswp_minus(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, blasint *, BLASLONG); +int dlaswp_plus (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, blasint *, BLASLONG); +int dlaswp_minus(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, blasint *, BLASLONG); +int qlaswp_plus (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, blasint *, BLASLONG); +int qlaswp_minus(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, blasint *, BLASLONG); + +int claswp_plus (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, blasint *, BLASLONG); +int claswp_minus(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, blasint *, BLASLONG); +int zlaswp_plus (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, blasint *, BLASLONG); +int zlaswp_minus(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, blasint *, BLASLONG); +int xlaswp_plus (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, blasint *, BLASLONG); +int xlaswp_minus(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, blasint *, BLASLONG); + +int slaswp_ncopy(BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); +int dlaswp_ncopy(BLASLONG, BLASLONG, BLASLONG, double *, BLASLONG, blasint *, double *); +int qlaswp_ncopy(BLASLONG, BLASLONG, BLASLONG, xdouble *, BLASLONG, blasint *, xdouble *); +int claswp_ncopy(BLASLONG, BLASLONG, BLASLONG, 
float *, BLASLONG, blasint *, float *); +int zlaswp_ncopy(BLASLONG, BLASLONG, BLASLONG, double *, BLASLONG, blasint *, double *); +int xlaswp_ncopy(BLASLONG, BLASLONG, BLASLONG, xdouble *, BLASLONG, blasint *, xdouble *); + +blasint sgetrs_N_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint sgetrs_T_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint dgetrs_N_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint dgetrs_T_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint qgetrs_N_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint qgetrs_T_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint cgetrs_N_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint cgetrs_T_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint cgetrs_R_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint cgetrs_C_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint zgetrs_N_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint zgetrs_T_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint zgetrs_R_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint zgetrs_C_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint xgetrs_N_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xgetrs_T_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xgetrs_R_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xgetrs_C_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +blasint sgetrs_N_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint sgetrs_T_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint dgetrs_N_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint dgetrs_T_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint qgetrs_N_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint qgetrs_T_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint cgetrs_N_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint cgetrs_T_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint cgetrs_R_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint cgetrs_C_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint zgetrs_N_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint zgetrs_T_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint zgetrs_R_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint zgetrs_C_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint xgetrs_N_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xgetrs_T_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xgetrs_R_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); 
+blasint xgetrs_C_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +blasint spotf2_U(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint spotf2_L(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint dpotf2_U(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint dpotf2_L(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint qpotf2_U(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint qpotf2_L(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint cpotf2_U(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint cpotf2_L(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint zpotf2_U(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint zpotf2_L(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint xpotf2_U(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xpotf2_L(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +blasint spotrf_U_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint spotrf_L_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint dpotrf_U_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint dpotrf_L_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint qpotrf_U_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint qpotrf_L_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint cpotrf_U_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint cpotrf_L_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint zpotrf_U_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint zpotrf_L_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint xpotrf_U_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xpotrf_L_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +blasint spotrf_U_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint spotrf_L_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint dpotrf_U_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint dpotrf_L_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint qpotrf_U_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint qpotrf_L_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint cpotrf_U_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint cpotrf_L_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint zpotrf_U_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint zpotrf_L_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint xpotrf_U_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xpotrf_L_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +blasint slauu2_U(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint slauu2_L(blas_arg_t *, 
BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint dlauu2_U(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint dlauu2_L(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint qlauu2_U(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint qlauu2_L(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint clauu2_U(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint clauu2_L(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint zlauu2_U(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint zlauu2_L(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint xlauu2_U(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xlauu2_L(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +blasint slauum_U_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint slauum_L_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint dlauum_U_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint dlauum_L_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint qlauum_U_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint qlauum_L_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint clauum_U_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint clauum_L_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint zlauum_U_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint zlauum_L_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint xlauum_U_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xlauum_L_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +blasint slauum_U_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint slauum_L_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint dlauum_U_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint dlauum_L_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint qlauum_U_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint qlauum_L_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint clauum_U_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint clauum_L_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint zlauum_U_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint zlauum_L_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint xlauum_U_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xlauum_L_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +blasint strti2_UU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint strti2_UN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint strti2_LU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint strti2_LN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, 
BLASLONG); +blasint dtrti2_UU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint dtrti2_UN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint dtrti2_LU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint dtrti2_LN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint qtrti2_UU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint qtrti2_UN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint qtrti2_LU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint qtrti2_LN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint ctrti2_UU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ctrti2_UN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ctrti2_LU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ctrti2_LN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ztrti2_UU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint ztrti2_UN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint ztrti2_LU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint ztrti2_LN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint xtrti2_UU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xtrti2_UN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xtrti2_LU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xtrti2_LN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +blasint strtri_UU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint strtri_UN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint strtri_LU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint strtri_LN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint dtrtri_UU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint dtrtri_UN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint dtrtri_LU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint dtrtri_LN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint qtrtri_UU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint qtrtri_UN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint qtrtri_LU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint qtrtri_LN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint ctrtri_UU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ctrtri_UN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ctrtri_LU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ctrtri_LN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ztrtri_UU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint ztrtri_UN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint ztrtri_LU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double 
*, double *, BLASLONG); +blasint ztrtri_LN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint xtrtri_UU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xtrtri_UN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xtrtri_LU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xtrtri_LN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +blasint strtri_UU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint strtri_UN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint strtri_LU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint strtri_LN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint dtrtri_UU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint dtrtri_UN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint dtrtri_LU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint dtrtri_LN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint qtrtri_UU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint qtrtri_UN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint qtrtri_LU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint qtrtri_LN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint ctrtri_UU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ctrtri_UN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ctrtri_LU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ctrtri_LN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint ztrtri_UU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint ztrtri_UN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint ztrtri_LU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint ztrtri_LN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint xtrtri_UU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xtrtri_UN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xtrtri_LU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xtrtri_LN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int sneg_tcopy(BLASLONG, BLASLONG, float *, BLASLONG, float *); +int dneg_tcopy(BLASLONG, BLASLONG, double *, BLASLONG, double *); +int qneg_tcopy(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); +int cneg_tcopy(BLASLONG, BLASLONG, float *, BLASLONG, float *); +int zneg_tcopy(BLASLONG, BLASLONG, double *, BLASLONG, double *); +int xneg_tcopy(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); + +blasint slarf_L(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint slarf_R(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint dlarf_L(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint dlarf_R(blas_arg_t *, BLASLONG *, BLASLONG *, 
double *, double *, BLASLONG); +blasint qlarf_L(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint qlarf_R(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint clarf_L(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint clarf_R(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +blasint zlarf_L(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint zlarf_R(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +blasint xlarf_L(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint xlarf_R(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +#endif diff --git a/common_level1.h b/common_level1.h new file mode 100644 index 0000000000..f51ced6683 --- /dev/null +++ b/common_level1.h @@ -0,0 +1,212 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#ifndef ASSEMBLER + +#ifdef __CUDACC__ +extern "C" { +#endif + +float sdot_k(BLASLONG, float *, BLASLONG, float *, BLASLONG); +double dsdot_k(BLASLONG, float *, BLASLONG, float *, BLASLONG); +double ddot_k(BLASLONG, double *, BLASLONG, double *, BLASLONG); +xdouble qdot_k(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); + +float _Complex cdotc_k (BLASLONG, float *, BLASLONG, float *, BLASLONG); +float _Complex cdotu_k (BLASLONG, float *, BLASLONG, float *, BLASLONG); +double _Complex zdotc_k (BLASLONG, double *, BLASLONG, double *, BLASLONG); +double _Complex zdotu_k (BLASLONG, double *, BLASLONG, double *, BLASLONG); +xdouble _Complex xdotc_k (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); +xdouble _Complex xdotu_k (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); + +int saxpy_k (BLASLONG, BLASLONG, BLASLONG, float, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); +int daxpy_k (BLASLONG, BLASLONG, BLASLONG, double, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); +int qaxpy_k (BLASLONG, BLASLONG, BLASLONG, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); +int caxpy_k (BLASLONG, BLASLONG, BLASLONG, float, float, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); +int zaxpy_k (BLASLONG, BLASLONG, BLASLONG, double, double, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); +int xaxpy_k (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); +int caxpyc_k (BLASLONG, BLASLONG, BLASLONG, float, float, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); +int zaxpyc_k (BLASLONG, BLASLONG, BLASLONG, double, double, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); +int xaxpyc_k (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); + +int scopy_k(BLASLONG, float *, BLASLONG, float *, BLASLONG); +int dcopy_k(BLASLONG, double *, BLASLONG, double *, BLASLONG); +int qcopy_k(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); +int ccopy_k(BLASLONG, float *, BLASLONG, float *, BLASLONG); +int zcopy_k(BLASLONG, double *, BLASLONG, double *, BLASLONG); +int xcopy_k(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); + +int sswap_k (BLASLONG, BLASLONG, BLASLONG, float, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); +int dswap_k (BLASLONG, BLASLONG, BLASLONG, double, + double *, BLASLONG, double *, BLASLONG, double*, BLASLONG); +int qswap_k (BLASLONG, BLASLONG, BLASLONG, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble*, BLASLONG); +int cswap_k (BLASLONG, BLASLONG, BLASLONG, float, float, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); +int zswap_k (BLASLONG, BLASLONG, BLASLONG, double, double, + double *, BLASLONG, double *, BLASLONG, double*, BLASLONG); +int xswap_k (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble*, BLASLONG); + +float sasum_k (BLASLONG, float *, BLASLONG); +double dasum_k (BLASLONG, double *, BLASLONG); +xdouble qasum_k (BLASLONG, xdouble *, BLASLONG); +float casum_k (BLASLONG, float *, BLASLONG); +double zasum_k (BLASLONG, double *, BLASLONG); +xdouble xasum_k (BLASLONG, xdouble *, BLASLONG); + +float samax_k (BLASLONG, float *, BLASLONG); +double damax_k (BLASLONG, double *, BLASLONG); +xdouble qamax_k (BLASLONG, xdouble *, BLASLONG); +float camax_k (BLASLONG, float *, BLASLONG); +double zamax_k 
(BLASLONG, double *, BLASLONG); +xdouble xamax_k (BLASLONG, xdouble *, BLASLONG); + +float samin_k (BLASLONG, float *, BLASLONG); +double damin_k (BLASLONG, double *, BLASLONG); +xdouble qamin_k (BLASLONG, xdouble *, BLASLONG); +float camin_k (BLASLONG, float *, BLASLONG); +double zamin_k (BLASLONG, double *, BLASLONG); +xdouble xamin_k (BLASLONG, xdouble *, BLASLONG); + +BLASLONG isamax_k(BLASLONG, float *, BLASLONG); +BLASLONG idamax_k(BLASLONG, double *, BLASLONG); +BLASLONG iqamax_k(BLASLONG, xdouble *, BLASLONG); +BLASLONG icamax_k(BLASLONG, float *, BLASLONG); +BLASLONG izamax_k(BLASLONG, double *, BLASLONG); +BLASLONG ixamax_k(BLASLONG, xdouble *, BLASLONG); + +BLASLONG isamin_k(BLASLONG, float *, BLASLONG); +BLASLONG idamin_k(BLASLONG, double *, BLASLONG); +BLASLONG iqamin_k(BLASLONG, xdouble *, BLASLONG); +BLASLONG icamin_k(BLASLONG, float *, BLASLONG); +BLASLONG izamin_k(BLASLONG, double *, BLASLONG); +BLASLONG ixamin_k(BLASLONG, xdouble *, BLASLONG); + +float smax_k (BLASLONG, float *, BLASLONG); +double dmax_k (BLASLONG, double *, BLASLONG); +xdouble qmax_k (BLASLONG, xdouble *, BLASLONG); +float cmax_k (BLASLONG, float *, BLASLONG); +double zmax_k (BLASLONG, double *, BLASLONG); +xdouble xmax_k (BLASLONG, xdouble *, BLASLONG); + +float smin_k (BLASLONG, float *, BLASLONG); +double dmin_k (BLASLONG, double *, BLASLONG); +xdouble qmin_k (BLASLONG, xdouble *, BLASLONG); +float cmin_k (BLASLONG, float *, BLASLONG); +double zmin_k (BLASLONG, double *, BLASLONG); +xdouble xmin_k (BLASLONG, xdouble *, BLASLONG); + +BLASLONG ismax_k(BLASLONG, float *, BLASLONG); +BLASLONG idmax_k(BLASLONG, double *, BLASLONG); +BLASLONG iqmax_k(BLASLONG, xdouble *, BLASLONG); +BLASLONG icmax_k(BLASLONG, float *, BLASLONG); +BLASLONG izmax_k(BLASLONG, double *, BLASLONG); +BLASLONG ixmax_k(BLASLONG, xdouble *, BLASLONG); + +BLASLONG ismin_k(BLASLONG, float *, BLASLONG); +BLASLONG idmin_k(BLASLONG, double *, BLASLONG); +BLASLONG iqmin_k(BLASLONG, xdouble *, BLASLONG); +BLASLONG icmin_k(BLASLONG, float *, BLASLONG); +BLASLONG izmin_k(BLASLONG, double *, BLASLONG); +BLASLONG ixmin_k(BLASLONG, xdouble *, BLASLONG); + +int sscal_k(BLASLONG, BLASLONG, BLASLONG, float, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); +int dscal_k(BLASLONG, BLASLONG, BLASLONG, double, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); +int qscal_k(BLASLONG, BLASLONG, BLASLONG, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); +int cscal_k(BLASLONG, BLASLONG, BLASLONG, float, float, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); +int zscal_k(BLASLONG, BLASLONG, BLASLONG, double, double, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); +int xscal_k(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); +int csscal_k(BLASLONG, BLASLONG, BLASLONG, float, float, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); +int zdscal_k(BLASLONG, BLASLONG, BLASLONG, double, double, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); +int xqscal_k(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); + +float snrm2_k(BLASLONG, float *, BLASLONG); +double dnrm2_k(BLASLONG, double *, BLASLONG); +xdouble qnrm2_k(BLASLONG, xdouble *, BLASLONG); +float cnrm2_k(BLASLONG, float *, BLASLONG); +double znrm2_k(BLASLONG, double *, BLASLONG); +xdouble xnrm2_k(BLASLONG, xdouble *, BLASLONG); + +int srot_k (BLASLONG, float *, BLASLONG, float *, 
BLASLONG, float , float ); +int drot_k (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); +int qrot_k (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble); +int csrot_k(BLASLONG, float *, BLASLONG, float *, BLASLONG, float , float ); +int zdrot_k(BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); +int xqrot_k(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble); + +int srotg_k(float *, float *, float *, float *); +int drotg_k(double *, double *, double *, double *); +int qrotg_k(xdouble *, xdouble *, xdouble *, xdouble *); +int csrotg_k(float *, float *, float *, float *); +int zdrotg_k(double *, double *, double *, double *); +int xqrotg_k(xdouble *, xdouble *, xdouble *, xdouble *); + +int srotmg_k(float *, float *, float *, float *, float *); +int drotmg_k(double *, double *, double *, double *, double *); +int qrotmg_k(xdouble *, xdouble *, xdouble *, xdouble *, xdouble *); + +int srotm_k (BLASLONG, float, BLASLONG, float, BLASLONG, float); +int drotm_k (BLASLONG, double, BLASLONG, double, BLASLONG, double); +int qrotm_k (BLASLONG, xdouble, BLASLONG, xdouble, BLASLONG, xdouble); + +#ifdef __CUDACC__ +} +#endif + +#endif + diff --git a/common_level2.h b/common_level2.h new file mode 100644 index 0000000000..2ab682a022 --- /dev/null +++ b/common_level2.h @@ -0,0 +1,1359 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#ifndef ASSEMBLER + +/* Level 2 Blas routines */ + +#ifdef __CUDACC__ +extern "C" { +#endif + +int sger_k (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int dger_k (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int qger_k (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int cgeru_k(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int cgerc_k(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int cgerv_k(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int cgerd_k(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int zgeru_k(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int zgerc_k(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int zgerv_k(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int zgerd_k(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int xgeru_k(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xgerc_k(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xgerv_k(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xgerd_k(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + +int sger_thread (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int dger_thread (BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int qger_thread (BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int cger_thread_U(BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int cger_thread_C(BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int cger_thread_V(BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int cger_thread_D(BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int zger_thread_U(BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int zger_thread_C(BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int zger_thread_V(BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int zger_thread_D(BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int xger_thread_U(BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, 
BLASLONG, xdouble *, int); +int xger_thread_C(BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xger_thread_V(BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xger_thread_D(BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); + +int sgemv_n(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer); +int sgemv_t(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer); +int dgemv_n(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer); +int dgemv_t(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer); +int qgemv_n(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer); +int qgemv_t(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer); + +int cgemv_n(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer); +int cgemv_t(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer); +int cgemv_r(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer); +int cgemv_c(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer); +int cgemv_o(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer); +int cgemv_u(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer); +int cgemv_s(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer); +int cgemv_d(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer); + +int zgemv_n(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer); +int zgemv_t(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer); +int zgemv_r(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer); +int zgemv_c(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer); +int zgemv_o(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer); +int zgemv_u(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer); +int zgemv_s(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer); +int zgemv_d(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer); + +int xgemv_n(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer); +int xgemv_t(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble 
*, BLASLONG, xdouble *, BLASLONG, xdouble *buffer); +int xgemv_r(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer); +int xgemv_c(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer); +int xgemv_o(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer); +int xgemv_u(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer); +int xgemv_s(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer); +int xgemv_d(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer); + +int sgemv_thread_n(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); +int sgemv_thread_t(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); +int dgemv_thread_n(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); +int dgemv_thread_t(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); +int qgemv_thread_n(BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); +int qgemv_thread_t(BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); + +int cgemv_thread_n(BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); +int cgemv_thread_t(BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); +int cgemv_thread_r(BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); +int cgemv_thread_c(BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); +int cgemv_thread_o(BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); +int cgemv_thread_u(BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); +int cgemv_thread_s(BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); +int cgemv_thread_d(BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); + +int zgemv_thread_n(BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); +int zgemv_thread_t(BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); +int zgemv_thread_r(BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); +int zgemv_thread_c(BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); +int zgemv_thread_o(BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); +int zgemv_thread_u(BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); +int zgemv_thread_s(BLASLONG, BLASLONG, double *, double 
*, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); +int zgemv_thread_d(BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); + +int xgemv_thread_n(BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); +int xgemv_thread_t(BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); +int xgemv_thread_r(BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); +int xgemv_thread_c(BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); +int xgemv_thread_o(BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); +int xgemv_thread_u(BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); +int xgemv_thread_s(BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); +int xgemv_thread_d(BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); + +int strsv_NUU(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int strsv_NUN(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int strsv_NLU(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int strsv_NLN(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int strsv_TUU(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int strsv_TUN(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int strsv_TLU(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int strsv_TLN(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); + +int dtrsv_NUU(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int dtrsv_NUN(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int dtrsv_NLU(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int dtrsv_NLN(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int dtrsv_TUU(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int dtrsv_TUN(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int dtrsv_TLU(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int dtrsv_TLN(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); + +int qtrsv_NUU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int qtrsv_NUN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int qtrsv_NLU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int qtrsv_NLN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int qtrsv_TUU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int qtrsv_TUN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int qtrsv_TLU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int qtrsv_TLN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); + +int ctrsv_NUU(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctrsv_NUN(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctrsv_NLU(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctrsv_NLN(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctrsv_TUU(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctrsv_TUN(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctrsv_TLU(BLASLONG, float *, 
BLASLONG, float *, BLASLONG, void *); +int ctrsv_TLN(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctrsv_RUU(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctrsv_RUN(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctrsv_RLU(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctrsv_RLN(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctrsv_CUU(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctrsv_CUN(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctrsv_CLU(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctrsv_CLN(BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); + +int ztrsv_NUU(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztrsv_NUN(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztrsv_NLU(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztrsv_NLN(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztrsv_TUU(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztrsv_TUN(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztrsv_TLU(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztrsv_TLN(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztrsv_RUU(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztrsv_RUN(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztrsv_RLU(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztrsv_RLN(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztrsv_CUU(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztrsv_CUN(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztrsv_CLU(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztrsv_CLN(BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); + +int xtrsv_NUU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtrsv_NUN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtrsv_NLU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtrsv_NLN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtrsv_TUU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtrsv_TUN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtrsv_TLU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtrsv_TLN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtrsv_RUU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtrsv_RUN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtrsv_RLU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtrsv_RLN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtrsv_CUU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtrsv_CUN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtrsv_CLU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtrsv_CLN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); + +int strmv_NUU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int strmv_NUN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int strmv_NLU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int strmv_NLN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int strmv_TUU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int strmv_TUN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int strmv_TLU(BLASLONG, 
float *, BLASLONG, float *, BLASLONG, float *); +int strmv_TLN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + +int dtrmv_NUU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int dtrmv_NUN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int dtrmv_NLU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int dtrmv_NLN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int dtrmv_TUU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int dtrmv_TUN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int dtrmv_TLU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int dtrmv_TLN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + +int qtrmv_NUU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int qtrmv_NUN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int qtrmv_NLU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int qtrmv_NLN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int qtrmv_TUU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int qtrmv_TUN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int qtrmv_TLU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int qtrmv_TLN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + +int ctrmv_NUU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int ctrmv_NUN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int ctrmv_NLU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int ctrmv_NLN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int ctrmv_TUU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int ctrmv_TUN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int ctrmv_TLU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int ctrmv_TLN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int ctrmv_RUU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int ctrmv_RUN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int ctrmv_RLU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int ctrmv_RLN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int ctrmv_CUU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int ctrmv_CUN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int ctrmv_CLU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int ctrmv_CLN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + +int ztrmv_NUU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int ztrmv_NUN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int ztrmv_NLU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int ztrmv_NLN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int ztrmv_TUU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int ztrmv_TUN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int ztrmv_TLU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int ztrmv_TLN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int ztrmv_RUU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int ztrmv_RUN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int ztrmv_RLU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int ztrmv_RLN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int ztrmv_CUU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int ztrmv_CUN(BLASLONG, 
double *, BLASLONG, double *, BLASLONG, double *); +int ztrmv_CLU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int ztrmv_CLN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + +int xtrmv_NUU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xtrmv_NUN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xtrmv_NLU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xtrmv_NLN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xtrmv_TUU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xtrmv_TUN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xtrmv_TLU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xtrmv_TLN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xtrmv_RUU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xtrmv_RUN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xtrmv_RLU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xtrmv_RLN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xtrmv_CUU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xtrmv_CUN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xtrmv_CLU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xtrmv_CLN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + +int strmv_thread_NUU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int strmv_thread_NUN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int strmv_thread_NLU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int strmv_thread_NLN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int strmv_thread_TUU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int strmv_thread_TUN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int strmv_thread_TLU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int strmv_thread_TLN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); + +int dtrmv_thread_NUU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int dtrmv_thread_NUN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int dtrmv_thread_NLU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int dtrmv_thread_NLN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int dtrmv_thread_TUU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int dtrmv_thread_TUN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int dtrmv_thread_TLU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int dtrmv_thread_TLN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); + +int qtrmv_thread_NUU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int qtrmv_thread_NUN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int qtrmv_thread_NLU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int qtrmv_thread_NLN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int qtrmv_thread_TUU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int qtrmv_thread_TUN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int qtrmv_thread_TLU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int qtrmv_thread_TLN(BLASLONG, xdouble *, BLASLONG, 
xdouble *, BLASLONG, xdouble *, int);
+
+int ctrmv_thread_NUU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int);
+int ctrmv_thread_NUN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int);
+int ctrmv_thread_NLU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int);
+int ctrmv_thread_NLN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int);
+int ctrmv_thread_TUU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int);
+int ctrmv_thread_TUN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int);
+int ctrmv_thread_TLU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int);
+int ctrmv_thread_TLN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int);
+int ctrmv_thread_RUU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int);
+int ctrmv_thread_RUN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int);
+int ctrmv_thread_RLU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int);
+int ctrmv_thread_RLN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int);
+int ctrmv_thread_CUU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int);
+int ctrmv_thread_CUN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int);
+int ctrmv_thread_CLU(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int);
+int ctrmv_thread_CLN(BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int);
+
+int ztrmv_thread_NUU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int);
+int ztrmv_thread_NUN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int);
+int ztrmv_thread_NLU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int);
+int ztrmv_thread_NLN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int);
+int ztrmv_thread_TUU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int);
+int ztrmv_thread_TUN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int);
+int ztrmv_thread_TLU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int);
+int ztrmv_thread_TLN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int);
+int ztrmv_thread_RUU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int);
+int ztrmv_thread_RUN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int);
+int ztrmv_thread_RLU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int);
+int ztrmv_thread_RLN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int);
+int ztrmv_thread_CUU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int);
+int ztrmv_thread_CUN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int);
+int ztrmv_thread_CLU(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int);
+int ztrmv_thread_CLN(BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int);
+
+int xtrmv_thread_NUU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int);
+int xtrmv_thread_NUN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int);
+int xtrmv_thread_NLU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int);
+int xtrmv_thread_NLN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int);
+int xtrmv_thread_TUU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int);
+int xtrmv_thread_TUN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int);
+int xtrmv_thread_TLU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int);
+int xtrmv_thread_TLN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int);
+int xtrmv_thread_RUU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int);
+int xtrmv_thread_RUN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int);
+int xtrmv_thread_RLU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int);
+int xtrmv_thread_RLN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int);
+int xtrmv_thread_CUU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int);
+int xtrmv_thread_CUN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int);
+int xtrmv_thread_CLU(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int);
+int xtrmv_thread_CLN(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int);
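+
+/* Editorial note (not part of the original GotoBLAS2 1.13 sources): the
+ * three-letter kernel suffixes above encode the BLAS options.  The first
+ * letter is the transpose mode (N = no transpose, T = transpose; R and C
+ * appear only for the complex c/z/x types and are the conjugated variants),
+ * the second is the triangle (U = upper, L = lower), and the third is the
+ * diagonal kind (U = unit, N = non-unit).  The *_thread variants take one
+ * extra trailing int, presumably the number of threads.  The packed (tp*),
+ * symmetric, Hermitian, and banded kernels that follow use the same scheme.
+ * A minimal, hypothetical sketch of how a wrapper could index such a kernel
+ * family (illustrative only, not how the library actually dispatches; the
+ * index encoding and all local variable names here are assumptions):
+ *
+ *   static int (*ztrsv_table[])(BLASLONG, double *, BLASLONG, double *,
+ *                               BLASLONG, void *) = {
+ *     ztrsv_NUU, ztrsv_NUN, ztrsv_NLU, ztrsv_NLN,
+ *     ztrsv_TUU, ztrsv_TUN, ztrsv_TLU, ztrsv_TLN,
+ *     ztrsv_RUU, ztrsv_RUN, ztrsv_RLU, ztrsv_RLN,
+ *     ztrsv_CUU, ztrsv_CUN, ztrsv_CLU, ztrsv_CLN,
+ *   };
+ *   // trans in 0..3, uplo in 0..1, unit_diag in 0..1 (assumed encoding)
+ *   ztrsv_table[(trans << 2) | (uplo << 1) | unit_diag](n, a, lda, b, incb, buffer);
+ */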
+
+int stpsv_NUU(BLASLONG, float *, float *, BLASLONG, void *);
+int stpsv_NUN(BLASLONG, float *, float *, BLASLONG, void *);
+int stpsv_NLU(BLASLONG, float *, float *, BLASLONG, void *);
+int stpsv_NLN(BLASLONG, float *, float *, BLASLONG, void *);
+int stpsv_TUU(BLASLONG, float *, float *, BLASLONG, void *);
+int stpsv_TUN(BLASLONG, float *, float *, BLASLONG, void *);
+int stpsv_TLU(BLASLONG, float *, float *, BLASLONG, void *);
+int stpsv_TLN(BLASLONG, float *, float *, BLASLONG, void *);
+
+int dtpsv_NUU(BLASLONG, double *, double *, BLASLONG, void *);
+int dtpsv_NUN(BLASLONG, double *, double *, BLASLONG, void *);
+int dtpsv_NLU(BLASLONG, double *, double *, BLASLONG, void *);
+int dtpsv_NLN(BLASLONG, double *, double *, BLASLONG, void *);
+int dtpsv_TUU(BLASLONG, double *, double *, BLASLONG, void *);
+int dtpsv_TUN(BLASLONG, double *, double *, BLASLONG, void *);
+int dtpsv_TLU(BLASLONG, double *, double *, BLASLONG, void *);
+int dtpsv_TLN(BLASLONG, double *, double *, BLASLONG, void *);
+
+int qtpsv_NUU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *);
+int qtpsv_NUN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *);
+int qtpsv_NLU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *);
+int qtpsv_NLN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *);
+int qtpsv_TUU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *);
+int qtpsv_TUN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *);
+int qtpsv_TLU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *);
+int qtpsv_TLN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *);
+
+int ctpsv_NUU(BLASLONG, float *, float *, BLASLONG, void *);
+int ctpsv_NUN(BLASLONG, float *, float *, BLASLONG, void *);
+int ctpsv_NLU(BLASLONG, float *, float *, BLASLONG, void *);
+int ctpsv_NLN(BLASLONG, float *, float *, BLASLONG, void *);
+int ctpsv_TUU(BLASLONG, float *, float *, BLASLONG, void *);
+int ctpsv_TUN(BLASLONG, float *, float *, BLASLONG, void *);
+int ctpsv_TLU(BLASLONG, float *, float *, BLASLONG, void *);
+int ctpsv_TLN(BLASLONG, float *, float *, BLASLONG, void *);
+int ctpsv_RUU(BLASLONG, float *, float *, BLASLONG, void *);
+int ctpsv_RUN(BLASLONG, float *, float *, BLASLONG, void *);
+int ctpsv_RLU(BLASLONG, float *, float *, BLASLONG, void *);
+int ctpsv_RLN(BLASLONG, float *, float *, BLASLONG, void *);
+int ctpsv_CUU(BLASLONG, float *, float *, BLASLONG, void *);
+int ctpsv_CUN(BLASLONG, float *, float *, BLASLONG, void *);
+int ctpsv_CLU(BLASLONG, float *, float *, BLASLONG, void *);
+int ctpsv_CLN(BLASLONG, float *, float *, BLASLONG, void *);
+
+int ztpsv_NUU(BLASLONG, double *, double *, BLASLONG, void *);
+int ztpsv_NUN(BLASLONG, double *, double *, BLASLONG, void *);
+int ztpsv_NLU(BLASLONG, double *, double *, BLASLONG, void *);
+int ztpsv_NLN(BLASLONG, double *, double *, BLASLONG, void *);
+int ztpsv_TUU(BLASLONG, double *,
double *, BLASLONG, void *); +int ztpsv_TUN(BLASLONG, double *, double *, BLASLONG, void *); +int ztpsv_TLU(BLASLONG, double *, double *, BLASLONG, void *); +int ztpsv_TLN(BLASLONG, double *, double *, BLASLONG, void *); +int ztpsv_RUU(BLASLONG, double *, double *, BLASLONG, void *); +int ztpsv_RUN(BLASLONG, double *, double *, BLASLONG, void *); +int ztpsv_RLU(BLASLONG, double *, double *, BLASLONG, void *); +int ztpsv_RLN(BLASLONG, double *, double *, BLASLONG, void *); +int ztpsv_CUU(BLASLONG, double *, double *, BLASLONG, void *); +int ztpsv_CUN(BLASLONG, double *, double *, BLASLONG, void *); +int ztpsv_CLU(BLASLONG, double *, double *, BLASLONG, void *); +int ztpsv_CLN(BLASLONG, double *, double *, BLASLONG, void *); + +int xtpsv_NUU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpsv_NUN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpsv_NLU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpsv_NLN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpsv_TUU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpsv_TUN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpsv_TLU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpsv_TLN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpsv_RUU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpsv_RUN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpsv_RLU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpsv_RLN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpsv_CUU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpsv_CUN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpsv_CLU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpsv_CLN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); + +int stpmv_NUU(BLASLONG, float *, float *, BLASLONG, void *); +int stpmv_NUN(BLASLONG, float *, float *, BLASLONG, void *); +int stpmv_NLU(BLASLONG, float *, float *, BLASLONG, void *); +int stpmv_NLN(BLASLONG, float *, float *, BLASLONG, void *); +int stpmv_TUU(BLASLONG, float *, float *, BLASLONG, void *); +int stpmv_TUN(BLASLONG, float *, float *, BLASLONG, void *); +int stpmv_TLU(BLASLONG, float *, float *, BLASLONG, void *); +int stpmv_TLN(BLASLONG, float *, float *, BLASLONG, void *); + +int dtpmv_NUU(BLASLONG, double *, double *, BLASLONG, void *); +int dtpmv_NUN(BLASLONG, double *, double *, BLASLONG, void *); +int dtpmv_NLU(BLASLONG, double *, double *, BLASLONG, void *); +int dtpmv_NLN(BLASLONG, double *, double *, BLASLONG, void *); +int dtpmv_TUU(BLASLONG, double *, double *, BLASLONG, void *); +int dtpmv_TUN(BLASLONG, double *, double *, BLASLONG, void *); +int dtpmv_TLU(BLASLONG, double *, double *, BLASLONG, void *); +int dtpmv_TLN(BLASLONG, double *, double *, BLASLONG, void *); + +int qtpmv_NUU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int qtpmv_NUN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int qtpmv_NLU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int qtpmv_NLN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int qtpmv_TUU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int qtpmv_TUN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int qtpmv_TLU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int qtpmv_TLN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); + +int ctpmv_NUU(BLASLONG, float *, float *, BLASLONG, void *); +int ctpmv_NUN(BLASLONG, float *, float *, BLASLONG, void *); +int ctpmv_NLU(BLASLONG, float *, float *, BLASLONG, void *); +int ctpmv_NLN(BLASLONG, float *, 
float *, BLASLONG, void *); +int ctpmv_TUU(BLASLONG, float *, float *, BLASLONG, void *); +int ctpmv_TUN(BLASLONG, float *, float *, BLASLONG, void *); +int ctpmv_TLU(BLASLONG, float *, float *, BLASLONG, void *); +int ctpmv_TLN(BLASLONG, float *, float *, BLASLONG, void *); +int ctpmv_RUU(BLASLONG, float *, float *, BLASLONG, void *); +int ctpmv_RUN(BLASLONG, float *, float *, BLASLONG, void *); +int ctpmv_RLU(BLASLONG, float *, float *, BLASLONG, void *); +int ctpmv_RLN(BLASLONG, float *, float *, BLASLONG, void *); +int ctpmv_CUU(BLASLONG, float *, float *, BLASLONG, void *); +int ctpmv_CUN(BLASLONG, float *, float *, BLASLONG, void *); +int ctpmv_CLU(BLASLONG, float *, float *, BLASLONG, void *); +int ctpmv_CLN(BLASLONG, float *, float *, BLASLONG, void *); + +int ztpmv_NUU(BLASLONG, double *, double *, BLASLONG, void *); +int ztpmv_NUN(BLASLONG, double *, double *, BLASLONG, void *); +int ztpmv_NLU(BLASLONG, double *, double *, BLASLONG, void *); +int ztpmv_NLN(BLASLONG, double *, double *, BLASLONG, void *); +int ztpmv_TUU(BLASLONG, double *, double *, BLASLONG, void *); +int ztpmv_TUN(BLASLONG, double *, double *, BLASLONG, void *); +int ztpmv_TLU(BLASLONG, double *, double *, BLASLONG, void *); +int ztpmv_TLN(BLASLONG, double *, double *, BLASLONG, void *); +int ztpmv_RUU(BLASLONG, double *, double *, BLASLONG, void *); +int ztpmv_RUN(BLASLONG, double *, double *, BLASLONG, void *); +int ztpmv_RLU(BLASLONG, double *, double *, BLASLONG, void *); +int ztpmv_RLN(BLASLONG, double *, double *, BLASLONG, void *); +int ztpmv_CUU(BLASLONG, double *, double *, BLASLONG, void *); +int ztpmv_CUN(BLASLONG, double *, double *, BLASLONG, void *); +int ztpmv_CLU(BLASLONG, double *, double *, BLASLONG, void *); +int ztpmv_CLN(BLASLONG, double *, double *, BLASLONG, void *); + +int xtpmv_NUU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpmv_NUN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpmv_NLU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpmv_NLN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpmv_TUU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpmv_TUN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpmv_TLU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpmv_TLN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpmv_RUU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpmv_RUN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpmv_RLU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpmv_RLN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpmv_CUU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpmv_CUN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpmv_CLU(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); +int xtpmv_CLN(BLASLONG, xdouble *, xdouble *, BLASLONG, void *); + +int stpmv_thread_NUU(BLASLONG, float *, float *, BLASLONG, float *, int); +int stpmv_thread_NUN(BLASLONG, float *, float *, BLASLONG, float *, int); +int stpmv_thread_NLU(BLASLONG, float *, float *, BLASLONG, float *, int); +int stpmv_thread_NLN(BLASLONG, float *, float *, BLASLONG, float *, int); +int stpmv_thread_TUU(BLASLONG, float *, float *, BLASLONG, float *, int); +int stpmv_thread_TUN(BLASLONG, float *, float *, BLASLONG, float *, int); +int stpmv_thread_TLU(BLASLONG, float *, float *, BLASLONG, float *, int); +int stpmv_thread_TLN(BLASLONG, float *, float *, BLASLONG, float *, int); + +int dtpmv_thread_NUU(BLASLONG, double *, double *, BLASLONG, double *, int); +int 
dtpmv_thread_NUN(BLASLONG, double *, double *, BLASLONG, double *, int); +int dtpmv_thread_NLU(BLASLONG, double *, double *, BLASLONG, double *, int); +int dtpmv_thread_NLN(BLASLONG, double *, double *, BLASLONG, double *, int); +int dtpmv_thread_TUU(BLASLONG, double *, double *, BLASLONG, double *, int); +int dtpmv_thread_TUN(BLASLONG, double *, double *, BLASLONG, double *, int); +int dtpmv_thread_TLU(BLASLONG, double *, double *, BLASLONG, double *, int); +int dtpmv_thread_TLN(BLASLONG, double *, double *, BLASLONG, double *, int); + +int qtpmv_thread_NUU(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); +int qtpmv_thread_NUN(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); +int qtpmv_thread_NLU(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); +int qtpmv_thread_NLN(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); +int qtpmv_thread_TUU(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); +int qtpmv_thread_TUN(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); +int qtpmv_thread_TLU(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); +int qtpmv_thread_TLN(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); + +int ctpmv_thread_NUU(BLASLONG, float *, float *, BLASLONG, float *, int); +int ctpmv_thread_NUN(BLASLONG, float *, float *, BLASLONG, float *, int); +int ctpmv_thread_NLU(BLASLONG, float *, float *, BLASLONG, float *, int); +int ctpmv_thread_NLN(BLASLONG, float *, float *, BLASLONG, float *, int); +int ctpmv_thread_TUU(BLASLONG, float *, float *, BLASLONG, float *, int); +int ctpmv_thread_TUN(BLASLONG, float *, float *, BLASLONG, float *, int); +int ctpmv_thread_TLU(BLASLONG, float *, float *, BLASLONG, float *, int); +int ctpmv_thread_TLN(BLASLONG, float *, float *, BLASLONG, float *, int); +int ctpmv_thread_RUU(BLASLONG, float *, float *, BLASLONG, float *, int); +int ctpmv_thread_RUN(BLASLONG, float *, float *, BLASLONG, float *, int); +int ctpmv_thread_RLU(BLASLONG, float *, float *, BLASLONG, float *, int); +int ctpmv_thread_RLN(BLASLONG, float *, float *, BLASLONG, float *, int); +int ctpmv_thread_CUU(BLASLONG, float *, float *, BLASLONG, float *, int); +int ctpmv_thread_CUN(BLASLONG, float *, float *, BLASLONG, float *, int); +int ctpmv_thread_CLU(BLASLONG, float *, float *, BLASLONG, float *, int); +int ctpmv_thread_CLN(BLASLONG, float *, float *, BLASLONG, float *, int); + +int ztpmv_thread_NUU(BLASLONG, double *, double *, BLASLONG, double *, int); +int ztpmv_thread_NUN(BLASLONG, double *, double *, BLASLONG, double *, int); +int ztpmv_thread_NLU(BLASLONG, double *, double *, BLASLONG, double *, int); +int ztpmv_thread_NLN(BLASLONG, double *, double *, BLASLONG, double *, int); +int ztpmv_thread_TUU(BLASLONG, double *, double *, BLASLONG, double *, int); +int ztpmv_thread_TUN(BLASLONG, double *, double *, BLASLONG, double *, int); +int ztpmv_thread_TLU(BLASLONG, double *, double *, BLASLONG, double *, int); +int ztpmv_thread_TLN(BLASLONG, double *, double *, BLASLONG, double *, int); +int ztpmv_thread_RUU(BLASLONG, double *, double *, BLASLONG, double *, int); +int ztpmv_thread_RUN(BLASLONG, double *, double *, BLASLONG, double *, int); +int ztpmv_thread_RLU(BLASLONG, double *, double *, BLASLONG, double *, int); +int ztpmv_thread_RLN(BLASLONG, double *, double *, BLASLONG, double *, int); +int ztpmv_thread_CUU(BLASLONG, double *, double *, BLASLONG, double *, int); +int ztpmv_thread_CUN(BLASLONG, double *, double *, BLASLONG, double *, int); +int ztpmv_thread_CLU(BLASLONG, double *, double *, BLASLONG, 
double *, int); +int ztpmv_thread_CLN(BLASLONG, double *, double *, BLASLONG, double *, int); + +int xtpmv_thread_NUU(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); +int xtpmv_thread_NUN(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); +int xtpmv_thread_NLU(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); +int xtpmv_thread_NLN(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); +int xtpmv_thread_TUU(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); +int xtpmv_thread_TUN(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); +int xtpmv_thread_TLU(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); +int xtpmv_thread_TLN(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); +int xtpmv_thread_RUU(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); +int xtpmv_thread_RUN(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); +int xtpmv_thread_RLU(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); +int xtpmv_thread_RLN(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); +int xtpmv_thread_CUU(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); +int xtpmv_thread_CUN(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); +int xtpmv_thread_CLU(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); +int xtpmv_thread_CLN(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, int); + +int ssymv_L(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int ssymv_U(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int dsymv_L(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int dsymv_U(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int qsymv_L(BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int qsymv_U(BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int csymv_L(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int csymv_U(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int zsymv_L(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int zsymv_U(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int xsymv_L(BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xsymv_U(BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + +int ssymv_thread_L(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int ssymv_thread_U(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int dsymv_thread_L(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int dsymv_thread_U(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int qsymv_thread_L(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int qsymv_thread_U(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int csymv_thread_L(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float 
*, BLASLONG, float *, int); +int csymv_thread_U(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int zsymv_thread_L(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int zsymv_thread_U(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int xsymv_thread_L(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xsymv_thread_U(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); + +int chemv_thread_L(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int chemv_thread_U(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int chemv_thread_M(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int chemv_thread_V(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int zhemv_thread_L(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int zhemv_thread_U(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int zhemv_thread_M(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int zhemv_thread_V(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int xhemv_thread_L(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xhemv_thread_U(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xhemv_thread_M(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xhemv_thread_V(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); + +int sspmv_L(BLASLONG, float, float *, float *, BLASLONG, float *, BLASLONG, void *); +int sspmv_U(BLASLONG, float, float *, float *, BLASLONG, float *, BLASLONG, void *); +int dspmv_L(BLASLONG, double, double *, double *, BLASLONG, double *, BLASLONG, void *); +int dspmv_U(BLASLONG, double, double *, double *, BLASLONG, double *, BLASLONG, void *); +int qspmv_L(BLASLONG, xdouble, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int qspmv_U(BLASLONG, xdouble, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int cspmv_L(BLASLONG, float, float, float *, float *, BLASLONG, float *, BLASLONG, void *); +int cspmv_U(BLASLONG, float, float, float *, float *, BLASLONG, float *, BLASLONG, void *); +int zspmv_L(BLASLONG, double, double, double *, double *, BLASLONG, double *, BLASLONG, void *); +int zspmv_U(BLASLONG, double, double, double *, double *, BLASLONG, double *, BLASLONG, void *); +int xspmv_L(BLASLONG, xdouble, xdouble, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xspmv_U(BLASLONG, xdouble, xdouble, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); + +int sspmv_thread_L(BLASLONG, float, float *, float *, BLASLONG, float *, BLASLONG, float *, int); +int sspmv_thread_U(BLASLONG, float, float *, float *, BLASLONG, float *, BLASLONG, float *, int); +int dspmv_thread_L(BLASLONG, double, double *, double *, BLASLONG, double *, BLASLONG, double *, int); +int dspmv_thread_U(BLASLONG, double, double *, double *, BLASLONG, double *, BLASLONG, double *, int); +int 
qspmv_thread_L(BLASLONG, xdouble, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int qspmv_thread_U(BLASLONG, xdouble, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int cspmv_thread_L(BLASLONG, float *, float *, float *, BLASLONG, float *, BLASLONG, float *, int); +int cspmv_thread_U(BLASLONG, float *, float *, float *, BLASLONG, float *, BLASLONG, float *, int); +int zspmv_thread_L(BLASLONG, double *, double *, double *, BLASLONG, double *, BLASLONG, double *, int); +int zspmv_thread_U(BLASLONG, double *, double *, double *, BLASLONG, double *, BLASLONG, double *, int); +int xspmv_thread_L(BLASLONG, xdouble *, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xspmv_thread_U(BLASLONG, xdouble *, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); + +int ssyr_L(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *); +int ssyr_U(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *); +int dsyr_L(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *); +int dsyr_U(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *); +int qsyr_L(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int qsyr_U(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int csyr_L(BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *); +int csyr_U(BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *); +int zsyr_L(BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *); +int zsyr_U(BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *); +int xsyr_L(BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xsyr_U(BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + +int ssyr_thread_L(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, int); +int ssyr_thread_U(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, int); +int dsyr_thread_L(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, int); +int dsyr_thread_U(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, int); +int qsyr_thread_L(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int qsyr_thread_U(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int csyr_thread_L(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, int); +int csyr_thread_U(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, int); +int zsyr_thread_L(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, int); +int zsyr_thread_U(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, int); +int xsyr_thread_L(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xsyr_thread_U(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); + +int ssyr2_L(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int ssyr2_U(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int dsyr2_L(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int dsyr2_U(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int qsyr2_L(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int qsyr2_U(BLASLONG, xdouble, xdouble *, 
BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int csyr2_L(BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int csyr2_U(BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int zsyr2_L(BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int zsyr2_U(BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int xsyr2_L(BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xsyr2_U(BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + +int ssyr2_thread_L(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int ssyr2_thread_U(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int dsyr2_thread_L(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int dsyr2_thread_U(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int qsyr2_thread_L(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int qsyr2_thread_U(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int csyr2_thread_L(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int csyr2_thread_U(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int zsyr2_thread_L(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int zsyr2_thread_U(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int xsyr2_thread_L(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xsyr2_thread_U(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); + +int sspr_L(BLASLONG, float, float *, BLASLONG, float *, float *); +int sspr_U(BLASLONG, float, float *, BLASLONG, float *, float *); +int dspr_L(BLASLONG, double, double *, BLASLONG, double *, double *); +int dspr_U(BLASLONG, double, double *, BLASLONG, double *, double *); +int qspr_L(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, xdouble *); +int qspr_U(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, xdouble *); +int cspr_L(BLASLONG, float, float, float *, BLASLONG, float *, float *); +int cspr_U(BLASLONG, float, float, float *, BLASLONG, float *, float *); +int zspr_L(BLASLONG, double, double, double *, BLASLONG, double *, double *); +int zspr_U(BLASLONG, double, double, double *, BLASLONG, double *, double *); +int xspr_L(BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, xdouble *); +int xspr_U(BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, xdouble *); + +int sspr_thread_L(BLASLONG, float, float *, BLASLONG, float *, float *, int); +int sspr_thread_U(BLASLONG, float, float *, BLASLONG, float *, float *, int); +int dspr_thread_L(BLASLONG, double, double *, BLASLONG, double *, double *, int); +int dspr_thread_U(BLASLONG, double, double *, BLASLONG, double *, double *, int); +int qspr_thread_L(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, xdouble *, int); +int qspr_thread_U(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, xdouble *, int); +int cspr_thread_L(BLASLONG, float *, float *, 
BLASLONG, float *, float *, int); +int cspr_thread_U(BLASLONG, float *, float *, BLASLONG, float *, float *, int); +int zspr_thread_L(BLASLONG, double *, double *, BLASLONG, double *, double *, int); +int zspr_thread_U(BLASLONG, double *, double *, BLASLONG, double *, double *, int); +int xspr_thread_L(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, xdouble *, int); +int xspr_thread_U(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, xdouble *, int); + +int sspr2_L(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, float *); +int sspr2_U(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, float *); +int dspr2_L(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, double *); +int dspr2_U(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, double *); +int qspr2_L(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, xdouble *); +int qspr2_U(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, xdouble *); +int cspr2_L(BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, float *); +int cspr2_U(BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, float *); +int zspr2_L(BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, double *); +int zspr2_U(BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, double *); +int xspr2_L(BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, xdouble *); +int xspr2_U(BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, xdouble *); + +int sspr2_thread_L(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, float *, int); +int sspr2_thread_U(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, float *, int); +int dspr2_thread_L(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, double *, int); +int dspr2_thread_U(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, double *, int); +int qspr2_thread_L(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, xdouble *, int); +int qspr2_thread_U(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, xdouble *, int); +int cspr2_thread_L(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, float *, int); +int cspr2_thread_U(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, float *, int); +int zspr2_thread_L(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, double *, int); +int zspr2_thread_U(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, double *, int); +int xspr2_thread_L(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, xdouble *, int); +int xspr2_thread_U(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, xdouble *, int); + +int cher_L(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *); +int cher_U(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *); +int cher_V(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *); +int cher_M(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *); +int zher_L(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *); +int zher_U(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *); +int zher_V(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *); +int zher_M(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *); +int xher_L(BLASLONG, xdouble, xdouble *, 
BLASLONG, xdouble *, BLASLONG, xdouble *); +int xher_U(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xher_V(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xher_M(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + +int cher_thread_L(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, int); +int cher_thread_U(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, int); +int cher_thread_V(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, int); +int cher_thread_M(BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, int); +int zher_thread_L(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, int); +int zher_thread_U(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, int); +int zher_thread_V(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, int); +int zher_thread_M(BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, int); +int xher_thread_L(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xher_thread_U(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xher_thread_V(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xher_thread_M(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); + +int cher2_L(BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int cher2_U(BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int cher2_M(BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int cher2_V(BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int zher2_L(BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int zher2_U(BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int zher2_M(BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int zher2_V(BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int xher2_L(BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xher2_U(BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xher2_M(BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xher2_V(BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + +int cher2_thread_L(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int cher2_thread_U(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int cher2_thread_M(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int cher2_thread_V(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int zher2_thread_L(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int zher2_thread_U(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int zher2_thread_M(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int 
zher2_thread_V(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int xher2_thread_L(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xher2_thread_U(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xher2_thread_M(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xher2_thread_V(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); + +int chpr_L(BLASLONG, float, float *, BLASLONG, float *, float *); +int chpr_U(BLASLONG, float, float *, BLASLONG, float *, float *); +int chpr_M(BLASLONG, float, float *, BLASLONG, float *, float *); +int chpr_V(BLASLONG, float, float *, BLASLONG, float *, float *); +int zhpr_L(BLASLONG, double, double *, BLASLONG, double *, double *); +int zhpr_U(BLASLONG, double, double *, BLASLONG, double *, double *); +int zhpr_M(BLASLONG, double, double *, BLASLONG, double *, double *); +int zhpr_V(BLASLONG, double, double *, BLASLONG, double *, double *); +int xhpr_L(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, xdouble *); +int xhpr_U(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, xdouble *); +int xhpr_M(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, xdouble *); +int xhpr_V(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, xdouble *); + +int chpr_thread_L(BLASLONG, float, float *, BLASLONG, float *, float *, int); +int chpr_thread_U(BLASLONG, float, float *, BLASLONG, float *, float *, int); +int chpr_thread_M(BLASLONG, float, float *, BLASLONG, float *, float *, int); +int chpr_thread_V(BLASLONG, float, float *, BLASLONG, float *, float *, int); +int zhpr_thread_L(BLASLONG, double, double *, BLASLONG, double *, double *, int); +int zhpr_thread_U(BLASLONG, double, double *, BLASLONG, double *, double *, int); +int zhpr_thread_M(BLASLONG, double, double *, BLASLONG, double *, double *, int); +int zhpr_thread_V(BLASLONG, double, double *, BLASLONG, double *, double *, int); +int xhpr_thread_L(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, xdouble *, int); +int xhpr_thread_U(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, xdouble *, int); +int xhpr_thread_M(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, xdouble *, int); +int xhpr_thread_V(BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, xdouble *, int); + +int chpr2_L(BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, float *); +int chpr2_U(BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, float *); +int chpr2_M(BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, float *); +int chpr2_V(BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, float *); +int zhpr2_L(BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, double *); +int zhpr2_U(BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, double *); +int zhpr2_M(BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, double *); +int zhpr2_V(BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, double *); +int xhpr2_L(BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, xdouble *); +int xhpr2_U(BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, xdouble *); +int xhpr2_M(BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, xdouble *); +int 
xhpr2_V(BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, xdouble *); + +int chpr2_thread_L(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, float *, int); +int chpr2_thread_U(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, float *, int); +int chpr2_thread_M(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, float *, int); +int chpr2_thread_V(BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, float *, int); +int zhpr2_thread_L(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, double *, int); +int zhpr2_thread_U(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, double *, int); +int zhpr2_thread_M(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, double *, int); +int zhpr2_thread_V(BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, double *, int); +int xhpr2_thread_L(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, xdouble *, int); +int xhpr2_thread_U(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, xdouble *, int); +int xhpr2_thread_M(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, xdouble *, int); +int xhpr2_thread_V(BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, xdouble *, int); + +int chemv_L(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int chemv_U(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int chemv_M(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int chemv_V(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +int zhemv_L(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int zhemv_U(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int zhemv_M(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int zhemv_V(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +int xhemv_L(BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xhemv_U(BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xhemv_M(BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); +int xhemv_V(BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + +int chpmv_L(BLASLONG, float, float, float *, float *, BLASLONG, float *, BLASLONG, void *); +int chpmv_U(BLASLONG, float, float, float *, float *, BLASLONG, float *, BLASLONG, void *); +int chpmv_M(BLASLONG, float, float, float *, float *, BLASLONG, float *, BLASLONG, void *); +int chpmv_V(BLASLONG, float, float, float *, float *, BLASLONG, float *, BLASLONG, void *); +int zhpmv_L(BLASLONG, double, double, double *, double *, BLASLONG, double *, BLASLONG, void *); +int zhpmv_U(BLASLONG, double, double, double *, double *, BLASLONG, double *, BLASLONG, void *); +int zhpmv_M(BLASLONG, double, double, double *, double *, BLASLONG, double *, BLASLONG, void *); +int zhpmv_V(BLASLONG, double, double, double *, double *, BLASLONG, 
double *, BLASLONG, void *); +int xhpmv_L(BLASLONG, xdouble, xdouble, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xhpmv_U(BLASLONG, xdouble, xdouble, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xhpmv_M(BLASLONG, xdouble, xdouble, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xhpmv_V(BLASLONG, xdouble, xdouble, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); + +int chpmv_thread_L(BLASLONG, float *, float *, float *, BLASLONG, float *, BLASLONG, float *, int); +int chpmv_thread_U(BLASLONG, float *, float *, float *, BLASLONG, float *, BLASLONG, float *, int); +int chpmv_thread_M(BLASLONG, float *, float *, float *, BLASLONG, float *, BLASLONG, float *, int); +int chpmv_thread_V(BLASLONG, float *, float *, float *, BLASLONG, float *, BLASLONG, float *, int); +int zhpmv_thread_L(BLASLONG, double *, double *, double *, BLASLONG, double *, BLASLONG, double *, int); +int zhpmv_thread_U(BLASLONG, double *, double *, double *, BLASLONG, double *, BLASLONG, double *, int); +int zhpmv_thread_M(BLASLONG, double *, double *, double *, BLASLONG, double *, BLASLONG, double *, int); +int zhpmv_thread_V(BLASLONG, double *, double *, double *, BLASLONG, double *, BLASLONG, double *, int); +int xhpmv_thread_L(BLASLONG, xdouble *, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xhpmv_thread_U(BLASLONG, xdouble *, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xhpmv_thread_M(BLASLONG, xdouble *, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xhpmv_thread_V(BLASLONG, xdouble *, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); + +int ssbmv_L(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ssbmv_U(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int dsbmv_L(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int dsbmv_U(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int qsbmv_L(BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int qsbmv_U(BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int csbmv_L(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int csbmv_U(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int zsbmv_L(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int zsbmv_U(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int xsbmv_L(BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xsbmv_U(BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); + +int chbmv_L(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int chbmv_U(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int chbmv_M(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int chbmv_V(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int 
zhbmv_L(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int zhbmv_U(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int zhbmv_M(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int zhbmv_V(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int xhbmv_L(BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xhbmv_U(BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xhbmv_M(BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xhbmv_V(BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); + + +int ssbmv_thread_L(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int ssbmv_thread_U(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int dsbmv_thread_L(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int dsbmv_thread_U(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int qsbmv_thread_L(BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int qsbmv_thread_U(BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int csbmv_thread_L(BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int csbmv_thread_U(BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int zsbmv_thread_L(BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int zsbmv_thread_U(BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int xsbmv_thread_L(BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xsbmv_thread_U(BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); + +int chbmv_thread_L(BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int chbmv_thread_U(BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int chbmv_thread_M(BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int chbmv_thread_V(BLASLONG, BLASLONG, float *, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int zhbmv_thread_L(BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int zhbmv_thread_U(BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int zhbmv_thread_M(BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int zhbmv_thread_V(BLASLONG, BLASLONG, double *, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int xhbmv_thread_L(BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble 
*, BLASLONG, xdouble *, int); +int xhbmv_thread_U(BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xhbmv_thread_M(BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xhbmv_thread_V(BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); + +int snorm_n(BLASLONG, BLASLONG, float *a, BLASLONG); +int snorm_t(BLASLONG, BLASLONG, float *a, BLASLONG); +int dnorm_n(BLASLONG, BLASLONG, double *a, BLASLONG); +int dnorm_t(BLASLONG, BLASLONG, double *a, BLASLONG); +int cnorm_n(BLASLONG, BLASLONG, float *a, BLASLONG); +int cnorm_t(BLASLONG, BLASLONG, float *a, BLASLONG); +int znorm_n(BLASLONG, BLASLONG, double *a, BLASLONG); +int znorm_t(BLASLONG, BLASLONG, double *a, BLASLONG); + +void sgbmv_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *buffer); +void sgbmv_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *buffer); + +void dgbmv_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *buffer); +void dgbmv_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *buffer); + +void qgbmv_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *buffer); +void qgbmv_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *buffer); + +void cgbmv_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float, float, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *buffer); +void cgbmv_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float, float, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *buffer); +void cgbmv_r(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float, float, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *buffer); +void cgbmv_c(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float, float, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *buffer); +void cgbmv_o(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float, float, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *buffer); +void cgbmv_u(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float, float, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *buffer); +void cgbmv_s(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float, float, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *buffer); +void cgbmv_d(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float, float, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *buffer); + +void zgbmv_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double, double, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *buffer); +void zgbmv_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double, double, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *buffer); +void zgbmv_r(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double, double, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *buffer); +void zgbmv_c(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double, double, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *buffer); +void zgbmv_o(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double, double, + double *, BLASLONG, double *, BLASLONG, double *, 
BLASLONG, void *buffer); +void zgbmv_u(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double, double, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *buffer); +void zgbmv_s(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double, double, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *buffer); +void zgbmv_d(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double, double, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *buffer); + +void xgbmv_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *buffer); +void xgbmv_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *buffer); +void xgbmv_r(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *buffer); +void xgbmv_c(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *buffer); +void xgbmv_o(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *buffer); +void xgbmv_u(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *buffer); +void xgbmv_s(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *buffer); +void xgbmv_d(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *buffer); + +int sgbmv_thread_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); +int sgbmv_thread_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); + +int dgbmv_thread_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); +int dgbmv_thread_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); + +int qgbmv_thread_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); +int qgbmv_thread_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); + +int cgbmv_thread_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float *, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); +int cgbmv_thread_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float *, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); +int cgbmv_thread_r(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float *, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); +int cgbmv_thread_c(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float *, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); +int cgbmv_thread_o(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float *, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); +int cgbmv_thread_u(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float *, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); +int cgbmv_thread_s(BLASLONG, BLASLONG, BLASLONG, 
BLASLONG, float *, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); +int cgbmv_thread_d(BLASLONG, BLASLONG, BLASLONG, BLASLONG, float *, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *buffer, int); + +int zgbmv_thread_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double *, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); +int zgbmv_thread_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double *, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); +int zgbmv_thread_r(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double *, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); +int zgbmv_thread_c(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double *, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); +int zgbmv_thread_o(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double *, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); +int zgbmv_thread_u(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double *, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); +int zgbmv_thread_s(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double *, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); +int zgbmv_thread_d(BLASLONG, BLASLONG, BLASLONG, BLASLONG, double *, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *buffer, int); + +int xgbmv_thread_n(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble *, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); +int xgbmv_thread_t(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble *, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); +int xgbmv_thread_r(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble *, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); +int xgbmv_thread_c(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble *, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); +int xgbmv_thread_o(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble *, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); +int xgbmv_thread_u(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble *, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); +int xgbmv_thread_s(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble *, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); +int xgbmv_thread_d(BLASLONG, BLASLONG, BLASLONG, BLASLONG, xdouble *, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *buffer, int); + +int stbmv_NUU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int stbmv_NUN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int stbmv_NLU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int stbmv_NLN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int stbmv_TUU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int stbmv_TUN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int stbmv_TLU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int stbmv_TLN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); + +int dtbmv_NUU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int dtbmv_NUN(BLASLONG, BLASLONG, double *, BLASLONG, 
double *, BLASLONG, void *); +int dtbmv_NLU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int dtbmv_NLN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int dtbmv_TUU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int dtbmv_TUN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int dtbmv_TLU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int dtbmv_TLN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); + +int qtbmv_NUU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int qtbmv_NUN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int qtbmv_NLU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int qtbmv_NLN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int qtbmv_TUU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int qtbmv_TUN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int qtbmv_TLU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int qtbmv_TLN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); + +int ctbmv_NUU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbmv_NUN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbmv_NLU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbmv_NLN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbmv_TUU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbmv_TUN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbmv_TLU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbmv_TLN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbmv_RUU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbmv_RUN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbmv_RLU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbmv_RLN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbmv_CUU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbmv_CUN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbmv_CLU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbmv_CLN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); + +int ztbmv_NUU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbmv_NUN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbmv_NLU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbmv_NLN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbmv_TUU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbmv_TUN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbmv_TLU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbmv_TLN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbmv_RUU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbmv_RUN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbmv_RLU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbmv_RLN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int 
ztbmv_CUU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbmv_CUN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbmv_CLU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbmv_CLN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); + +int xtbmv_NUU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbmv_NUN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbmv_NLU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbmv_NLN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbmv_TUU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbmv_TUN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbmv_TLU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbmv_TLN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbmv_RUU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbmv_RUN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbmv_RLU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbmv_RLN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbmv_CUU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbmv_CUN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbmv_CLU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbmv_CLN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); + +int stbmv_thread_NUU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int stbmv_thread_NUN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int stbmv_thread_NLU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int stbmv_thread_NLN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int stbmv_thread_TUU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int stbmv_thread_TUN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int stbmv_thread_TLU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int stbmv_thread_TLN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); + +int dtbmv_thread_NUU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int dtbmv_thread_NUN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int dtbmv_thread_NLU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int dtbmv_thread_NLN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int dtbmv_thread_TUU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int dtbmv_thread_TUN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int dtbmv_thread_TLU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int dtbmv_thread_TLN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); + +int qtbmv_thread_NUU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int qtbmv_thread_NUN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int qtbmv_thread_NLU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble 
*, int); +int qtbmv_thread_NLN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int qtbmv_thread_TUU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int qtbmv_thread_TUN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int qtbmv_thread_TLU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int qtbmv_thread_TLN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); + +int ctbmv_thread_NUU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int ctbmv_thread_NUN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int ctbmv_thread_NLU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int ctbmv_thread_NLN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int ctbmv_thread_TUU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int ctbmv_thread_TUN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int ctbmv_thread_TLU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int ctbmv_thread_TLN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int ctbmv_thread_RUU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int ctbmv_thread_RUN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int ctbmv_thread_RLU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int ctbmv_thread_RLN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int ctbmv_thread_CUU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int ctbmv_thread_CUN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int ctbmv_thread_CLU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); +int ctbmv_thread_CLN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, int); + +int ztbmv_thread_NUU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int ztbmv_thread_NUN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int ztbmv_thread_NLU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int ztbmv_thread_NLN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int ztbmv_thread_TUU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int ztbmv_thread_TUN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int ztbmv_thread_TLU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int ztbmv_thread_TLN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int ztbmv_thread_RUU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int ztbmv_thread_RUN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int ztbmv_thread_RLU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int ztbmv_thread_RLN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int ztbmv_thread_CUU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int ztbmv_thread_CUN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int ztbmv_thread_CLU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, int); +int ztbmv_thread_CLN(BLASLONG, BLASLONG, 
double *, BLASLONG, double *, BLASLONG, double *, int); + +int xtbmv_thread_NUU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xtbmv_thread_NUN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xtbmv_thread_NLU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xtbmv_thread_NLN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xtbmv_thread_TUU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xtbmv_thread_TUN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xtbmv_thread_TLU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xtbmv_thread_TLN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xtbmv_thread_RUU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xtbmv_thread_RUN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xtbmv_thread_RLU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xtbmv_thread_RLN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xtbmv_thread_CUU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xtbmv_thread_CUN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xtbmv_thread_CLU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); +int xtbmv_thread_CLN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, int); + +int stbsv_NUU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int stbsv_NUN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int stbsv_NLU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int stbsv_NLN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int stbsv_TUU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int stbsv_TUN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int stbsv_TLU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int stbsv_TLN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); + +int dtbsv_NUU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int dtbsv_NUN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int dtbsv_NLU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int dtbsv_NLN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int dtbsv_TUU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int dtbsv_TUN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int dtbsv_TLU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int dtbsv_TLN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); + +int qtbsv_NUU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int qtbsv_NUN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int qtbsv_NLU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int qtbsv_NLN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int qtbsv_TUU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int qtbsv_TUN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int qtbsv_TLU(BLASLONG, 
BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int qtbsv_TLN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); + +int ctbsv_NUU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbsv_NUN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbsv_NLU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbsv_NLN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbsv_TUU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbsv_TUN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbsv_TLU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbsv_TLN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbsv_RUU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbsv_RUN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbsv_RLU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbsv_RLN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbsv_CUU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbsv_CUN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbsv_CLU(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); +int ctbsv_CLN(BLASLONG, BLASLONG, float *, BLASLONG, float *, BLASLONG, void *); + +int ztbsv_NUU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbsv_NUN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbsv_NLU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbsv_NLN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbsv_TUU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbsv_TUN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbsv_TLU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbsv_TLN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbsv_RUU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbsv_RUN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbsv_RLU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbsv_RLN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbsv_CUU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbsv_CUN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbsv_CLU(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); +int ztbsv_CLN(BLASLONG, BLASLONG, double *, BLASLONG, double *, BLASLONG, void *); + +int xtbsv_NUU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbsv_NUN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbsv_NLU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbsv_NLN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbsv_TUU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbsv_TUN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbsv_TLU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbsv_TLN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbsv_RUU(BLASLONG, BLASLONG, xdouble *, BLASLONG, 
xdouble *, BLASLONG, void *); +int xtbsv_RUN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbsv_RLU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbsv_RLN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbsv_CUU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbsv_CUN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbsv_CLU(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); +int xtbsv_CLN(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, void *); + +#ifdef __CUDACC__ +} +#endif + +#endif diff --git a/common_level3.h b/common_level3.h new file mode 100644 index 0000000000..cbc67a6c33 --- /dev/null +++ b/common_level3.h @@ -0,0 +1,1739 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#ifndef ASSEMBLER + +#ifdef __CUDACC__ +__global__ void cuda_sgemm_kernel(int, int, int, float *, float *, float *); +__global__ void cuda_dgemm_kernel(int, int, int, double *, double *, double *); +#endif + +#ifdef __CUDACC__ +extern "C" { +#endif + +int sgemm_beta(BLASLONG, BLASLONG, BLASLONG, float, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); +int dgemm_beta(BLASLONG, BLASLONG, BLASLONG, double, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); +int cgemm_beta(BLASLONG, BLASLONG, BLASLONG, float, float, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); +int zgemm_beta(BLASLONG, BLASLONG, BLASLONG, double, double, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); + +#ifdef EXPRECISION +int qgemm_beta(BLASLONG, BLASLONG, BLASLONG, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); +int xgemm_beta(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); +#else +int qgemm_beta(BLASLONG, BLASLONG, BLASLONG, xdouble *, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); +int xgemm_beta(BLASLONG, BLASLONG, BLASLONG, xdouble *, + xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); +#endif + +int sgemm_incopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); +int sgemm_itcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); +int sgemm_oncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); +int sgemm_otcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); +int dgemm_incopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b); +int dgemm_itcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b); +int dgemm_oncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b); +int dgemm_otcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b); +int cgemm_incopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); +int cgemm_itcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); +int cgemm_oncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); +int cgemm_otcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); +int zgemm_incopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b); +int zgemm_itcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b); +int zgemm_oncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b); +int zgemm_otcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b); + +#ifdef QUAD_PRECISION +int qgemm_incopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xidouble *b); +int qgemm_itcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xidouble *b); +int qgemm_oncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xidouble *b); +int qgemm_otcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xidouble *b); +int xgemm_incopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xidouble *b); +int xgemm_itcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xidouble *b); +int xgemm_oncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xidouble *b); +int xgemm_otcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xidouble *b); +#else +int qgemm_incopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b); +int qgemm_itcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b); +int qgemm_oncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b); +int qgemm_otcopy(BLASLONG m, 
BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b); +int xgemm_incopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b); +int xgemm_itcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b); +int xgemm_oncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b); +int xgemm_otcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b); +#endif + + +int strsm_kernel_LN(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); +int strsm_kernel_LT(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); +int strsm_kernel_RN(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); +int strsm_kernel_RT(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); +int dtrsm_kernel_LN(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); +int dtrsm_kernel_LT(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); +int dtrsm_kernel_RN(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); +int dtrsm_kernel_RT(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); + +int qtrsm_kernel_LN(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); +int qtrsm_kernel_LT(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); +int qtrsm_kernel_RN(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); +int qtrsm_kernel_RT(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + +int ctrsm_kernel_LN(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); +int ctrsm_kernel_LT(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); +int ctrsm_kernel_LR(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); +int ctrsm_kernel_LC(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); +int ctrsm_kernel_RN(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); +int ctrsm_kernel_RT(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); +int ctrsm_kernel_RR(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); +int ctrsm_kernel_RC(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); + +int ztrsm_kernel_LN(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); +int ztrsm_kernel_LT(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); +int ztrsm_kernel_LR(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); +int ztrsm_kernel_LC(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); +int ztrsm_kernel_RN(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); +int ztrsm_kernel_RT(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); +int ztrsm_kernel_RR(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); +int ztrsm_kernel_RC(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, 
BLASLONG); + +int xtrsm_kernel_LN(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); +int xtrsm_kernel_LT(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); +int xtrsm_kernel_LR(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); +int xtrsm_kernel_LC(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); +int xtrsm_kernel_RN(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); +int xtrsm_kernel_RT(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); +int xtrsm_kernel_RR(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); +int xtrsm_kernel_RC(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + +int strmm_kernel_RN(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); +int strmm_kernel_RT(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); +int strmm_kernel_LN(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); +int strmm_kernel_LT(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + +int dtrmm_kernel_RN(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); +int dtrmm_kernel_RT(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); +int dtrmm_kernel_LN(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); +int dtrmm_kernel_LT(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); + +int qtrmm_kernel_RN(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); +int qtrmm_kernel_RT(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); +int qtrmm_kernel_LN(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); +int qtrmm_kernel_LT(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + +int ctrmm_kernel_RN(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); +int ctrmm_kernel_RT(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); +int ctrmm_kernel_RR(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); +int ctrmm_kernel_RC(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); +int ctrmm_kernel_LN(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); +int ctrmm_kernel_LT(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); +int ctrmm_kernel_LR(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); +int ctrmm_kernel_LC(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); + +int ztrmm_kernel_RN(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); +int ztrmm_kernel_RT(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); +int ztrmm_kernel_RR(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, 
double *, BLASLONG, BLASLONG); +int ztrmm_kernel_RC(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); +int ztrmm_kernel_LN(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); +int ztrmm_kernel_LT(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); +int ztrmm_kernel_LR(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); +int ztrmm_kernel_LC(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); + +int xtrmm_kernel_RN(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); +int xtrmm_kernel_RT(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); +int xtrmm_kernel_RR(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); +int xtrmm_kernel_RC(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); +int xtrmm_kernel_LN(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); +int xtrmm_kernel_LT(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); +int xtrmm_kernel_LR(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); +int xtrmm_kernel_LC(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + +int strmm_iunucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int strmm_iunncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int strmm_iutucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int strmm_iutncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int strmm_ounucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int strmm_ounncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int strmm_outucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int strmm_outncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int strmm_ilnucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int strmm_ilnncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int strmm_iltucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int strmm_iltncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int strmm_olnucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int strmm_olnncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int strmm_oltucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int strmm_oltncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); + +int dtrmm_iunucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int dtrmm_iunncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, 
BLASLONG posY, double *b); +int dtrmm_iutucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int dtrmm_iutncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int dtrmm_ounucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int dtrmm_ounncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int dtrmm_outucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int dtrmm_outncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int dtrmm_ilnucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int dtrmm_ilnncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int dtrmm_iltucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int dtrmm_iltncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int dtrmm_olnucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int dtrmm_olnncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int dtrmm_oltucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int dtrmm_oltncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); + +int qtrmm_iunucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int qtrmm_iunncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int qtrmm_iutucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int qtrmm_iutncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int qtrmm_ounucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int qtrmm_ounncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int qtrmm_outucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int qtrmm_outncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int qtrmm_ilnucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int qtrmm_ilnncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int qtrmm_iltucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int qtrmm_iltncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int qtrmm_olnucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int qtrmm_olnncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int qtrmm_oltucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int qtrmm_oltncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); + +int ctrmm_iunucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int ctrmm_iunncopy(BLASLONG m, BLASLONG n, float 
*a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int ctrmm_iutucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int ctrmm_iutncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int ctrmm_ounucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int ctrmm_ounncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int ctrmm_outucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int ctrmm_outncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int ctrmm_ilnucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int ctrmm_ilnncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int ctrmm_iltucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int ctrmm_iltncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int ctrmm_olnucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int ctrmm_olnncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int ctrmm_oltucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int ctrmm_oltncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); + +int ztrmm_iunucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int ztrmm_iunncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int ztrmm_iutucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int ztrmm_iutncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int ztrmm_ounucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int ztrmm_ounncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int ztrmm_outucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int ztrmm_outncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int ztrmm_ilnucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int ztrmm_ilnncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int ztrmm_iltucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int ztrmm_iltncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int ztrmm_olnucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int ztrmm_olnncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int ztrmm_oltucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int ztrmm_oltncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); + +int xtrmm_iunucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int xtrmm_iunncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, 
BLASLONG posX, BLASLONG posY, xdouble *b); +int xtrmm_iutucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int xtrmm_iutncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int xtrmm_ounucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int xtrmm_ounncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int xtrmm_outucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int xtrmm_outncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int xtrmm_ilnucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int xtrmm_ilnncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int xtrmm_iltucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int xtrmm_iltncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int xtrmm_olnucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int xtrmm_olnncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int xtrmm_oltucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int xtrmm_oltncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); + +int strsm_iunucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int strsm_iunncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int strsm_iutucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int strsm_iutncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int strsm_ounucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int strsm_ounncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int strsm_outucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int strsm_outncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int strsm_ilnucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int strsm_ilnncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int strsm_iltucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int strsm_iltncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int strsm_olnucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int strsm_olnncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int strsm_oltucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int strsm_oltncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); + +int dtrsm_iunucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int dtrsm_iunncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int dtrsm_iutucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int dtrsm_iutncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double 
*b); +int dtrsm_ounucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int dtrsm_ounncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int dtrsm_outucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int dtrsm_outncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int dtrsm_ilnucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int dtrsm_ilnncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int dtrsm_iltucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int dtrsm_iltncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int dtrsm_olnucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int dtrsm_olnncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int dtrsm_oltucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int dtrsm_oltncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); + +int qtrsm_iunucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int qtrsm_iunncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int qtrsm_iutucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int qtrsm_iutncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int qtrsm_ounucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int qtrsm_ounncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int qtrsm_outucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int qtrsm_outncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int qtrsm_ilnucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int qtrsm_ilnncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int qtrsm_iltucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int qtrsm_iltncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int qtrsm_olnucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int qtrsm_olnncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int qtrsm_oltucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int qtrsm_oltncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); + +int ctrsm_iunucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int ctrsm_iunncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int ctrsm_iutucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int ctrsm_iutncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int ctrsm_ounucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int ctrsm_ounncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int ctrsm_outucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int ctrsm_outncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); 
+int ctrsm_ilnucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int ctrsm_ilnncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int ctrsm_iltucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int ctrsm_iltncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int ctrsm_olnucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int ctrsm_olnncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int ctrsm_oltucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); +int ctrsm_oltncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG offset, float *b); + +int ztrsm_iunucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int ztrsm_iunncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int ztrsm_iutucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int ztrsm_iutncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int ztrsm_ounucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int ztrsm_ounncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int ztrsm_outucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int ztrsm_outncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int ztrsm_ilnucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int ztrsm_ilnncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int ztrsm_iltucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int ztrsm_iltncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int ztrsm_olnucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int ztrsm_olnncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int ztrsm_oltucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); +int ztrsm_oltncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG offset, double *b); + +int xtrsm_iunucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int xtrsm_iunncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int xtrsm_iutucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int xtrsm_iutncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int xtrsm_ounucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int xtrsm_ounncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int xtrsm_outucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int xtrsm_outncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int xtrsm_ilnucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int xtrsm_ilnncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int xtrsm_iltucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int xtrsm_iltncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int 
xtrsm_olnucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int xtrsm_olnncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int xtrsm_oltucopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); +int xtrsm_oltncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG offset, xdouble *b); + +int ssymm_iutcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int ssymm_outcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int ssymm_iltcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int ssymm_oltcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int dsymm_iutcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int dsymm_outcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int dsymm_iltcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int dsymm_oltcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int qsymm_iutcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int qsymm_outcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int qsymm_iltcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int qsymm_oltcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int csymm_iutcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int csymm_outcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int csymm_iltcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int csymm_oltcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int zsymm_iutcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int zsymm_outcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int zsymm_iltcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int zsymm_oltcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int xsymm_iutcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int xsymm_outcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int xsymm_iltcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int xsymm_oltcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); + +int chemm_iutcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int chemm_outcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int chemm_iltcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int chemm_oltcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int zhemm_iutcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, 
BLASLONG posY, double *b); +int zhemm_outcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int zhemm_iltcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int zhemm_oltcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int xhemm_iutcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int xhemm_outcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int xhemm_iltcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); +int xhemm_oltcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, xdouble *b); + +int ssyrk_kernel_U(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float *a, float *b, float *c, BLASLONG ldc, BLASLONG offset); +int ssyrk_kernel_L(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float *a, float *b, float *c, BLASLONG ldc, BLASLONG offset); + +int dsyrk_kernel_U(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double *a, double *b, double *c, BLASLONG ldc, BLASLONG offset); +int dsyrk_kernel_L(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double *a, double *b, double *c, BLASLONG ldc, BLASLONG offset); + +int qsyrk_kernel_U(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset); +int qsyrk_kernel_L(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset); + +int csyrk_kernel_U(BLASLONG m, BLASLONG n, BLASLONG k, float alpha_r, float alpha_i, float *a, float *b, float *c, BLASLONG ldc, BLASLONG offset); +int csyrk_kernel_L(BLASLONG m, BLASLONG n, BLASLONG k, float alpha_r, float alpha_i, float *a, float *b, float *c, BLASLONG ldc, BLASLONG offset); +int zsyrk_kernel_U(BLASLONG m, BLASLONG n, BLASLONG k, double alpha_r, double alpha_i, double *a, double *b, double *c, BLASLONG ldc, BLASLONG offset); +int zsyrk_kernel_L(BLASLONG m, BLASLONG n, BLASLONG k, double alpha_r, double alpha_i, double *a, double *b, double *c, BLASLONG ldc, BLASLONG offset); +int xsyrk_kernel_U(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset); +int xsyrk_kernel_L(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset); + +int ssyr2k_kernel_U(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float *a, float *b, float *c, BLASLONG ldc, BLASLONG offset, int flag); +int ssyr2k_kernel_L(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float *a, float *b, float *c, BLASLONG ldc, BLASLONG offset, int flag); +int dsyr2k_kernel_U(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double *a, double *b, double *c, BLASLONG ldc, BLASLONG offset, int flag); +int dsyr2k_kernel_L(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double *a, double *b, double *c, BLASLONG ldc, BLASLONG offset, int flag); +int qsyr2k_kernel_U(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag); +int qsyr2k_kernel_L(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag); + +int csyr2k_kernel_U(BLASLONG m, BLASLONG n, BLASLONG k, float alpha_r, float alpha_i, float *a, float *b, float *c, BLASLONG ldc, 
BLASLONG offset, int flag); +int csyr2k_kernel_L(BLASLONG m, BLASLONG n, BLASLONG k, float alpha_r, float alpha_i, float *a, float *b, float *c, BLASLONG ldc, BLASLONG offset, int flag); +int zsyr2k_kernel_U(BLASLONG m, BLASLONG n, BLASLONG k, double alpha_r, double alpha_i, double *a, double *b, double *c, BLASLONG ldc, BLASLONG offset, int flag); +int zsyr2k_kernel_L(BLASLONG m, BLASLONG n, BLASLONG k, double alpha_r, double alpha_i, double *a, double *b, double *c, BLASLONG ldc, BLASLONG offset, int flag); +int xsyr2k_kernel_U(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag); +int xsyr2k_kernel_L(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag); + +int cherk_kernel_UN(BLASLONG m, BLASLONG n, BLASLONG k, float alpha_r, float *a, float *b, float *c, BLASLONG ldc, BLASLONG offset); +int cherk_kernel_UC(BLASLONG m, BLASLONG n, BLASLONG k, float alpha_r, float *a, float *b, float *c, BLASLONG ldc, BLASLONG offset); +int cherk_kernel_LN(BLASLONG m, BLASLONG n, BLASLONG k, float alpha_r, float *a, float *b, float *c, BLASLONG ldc, BLASLONG offset); +int cherk_kernel_LC(BLASLONG m, BLASLONG n, BLASLONG k, float alpha_r, float *a, float *b, float *c, BLASLONG ldc, BLASLONG offset); + +int zherk_kernel_UN(BLASLONG m, BLASLONG n, BLASLONG k, double alpha_r, double *a, double *b, double *c, BLASLONG ldc, BLASLONG offset); +int zherk_kernel_UC(BLASLONG m, BLASLONG n, BLASLONG k, double alpha_r, double *a, double *b, double *c, BLASLONG ldc, BLASLONG offset); +int zherk_kernel_LN(BLASLONG m, BLASLONG n, BLASLONG k, double alpha_r, double *a, double *b, double *c, BLASLONG ldc, BLASLONG offset); +int zherk_kernel_LC(BLASLONG m, BLASLONG n, BLASLONG k, double alpha_r, double *a, double *b, double *c, BLASLONG ldc, BLASLONG offset); + +int xherk_kernel_UN(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset); +int xherk_kernel_UC(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset); +int xherk_kernel_LN(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset); +int xherk_kernel_LC(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset); + +int cher2k_kernel_UN(BLASLONG m, BLASLONG n, BLASLONG k, float alpha_r, float alpha_i, float *a, float *b, float *c, BLASLONG ldc, BLASLONG offset, int flag); +int cher2k_kernel_UC(BLASLONG m, BLASLONG n, BLASLONG k, float alpha_r, float alpha_i, float *a, float *b, float *c, BLASLONG ldc, BLASLONG offset, int flag); +int cher2k_kernel_LN(BLASLONG m, BLASLONG n, BLASLONG k, float alpha_r, float alpha_i, float *a, float *b, float *c, BLASLONG ldc, BLASLONG offset, int flag); +int cher2k_kernel_LC(BLASLONG m, BLASLONG n, BLASLONG k, float alpha_r, float alpha_i, float *a, float *b, float *c, BLASLONG ldc, BLASLONG offset, int flag); + +int zher2k_kernel_UN(BLASLONG m, BLASLONG n, BLASLONG k, double alpha_r, double alpha_i, double *a, double *b, double *c, BLASLONG ldc, BLASLONG offset, int flag); +int zher2k_kernel_UC(BLASLONG m, BLASLONG n, BLASLONG k, double alpha_r, double alpha_i, double *a, double *b, double *c, BLASLONG ldc, BLASLONG offset, int flag); +int zher2k_kernel_LN(BLASLONG m, BLASLONG n, BLASLONG k, double 
alpha_r, double alpha_i, double *a, double *b, double *c, BLASLONG ldc, BLASLONG offset, int flag); +int zher2k_kernel_LC(BLASLONG m, BLASLONG n, BLASLONG k, double alpha_r, double alpha_i, double *a, double *b, double *c, BLASLONG ldc, BLASLONG offset, int flag); + +int xher2k_kernel_UN(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag); +int xher2k_kernel_UC(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag); +int xher2k_kernel_LN(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag); +int xher2k_kernel_LC(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag); + +int sgemm_kernel(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG); +int dgemm_kernel(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG); + +#ifdef QUAD_PRECISION +int qgemm_kernel(BLASLONG, BLASLONG, BLASLONG, xidouble *, xidouble *, xidouble *, xdouble *, BLASLONG); +#else +int qgemm_kernel(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); +#endif + +int cgemm_kernel_n(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); +int cgemm_kernel_l(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); +int cgemm_kernel_r(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); +int cgemm_kernel_b(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); + +int zgemm_kernel_n(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); +int zgemm_kernel_l(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); +int zgemm_kernel_r(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); +int zgemm_kernel_b(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); + +int xgemm_kernel_n(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); +int xgemm_kernel_l(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); +int xgemm_kernel_r(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); +int xgemm_kernel_b(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); + +int cgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); +int zgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); +int xgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); + +int sgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int sgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int sgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int sgemm_tt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int dgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, 
double *, BLASLONG); +int dgemm_tt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +#ifdef QUAD_PRECISION +int qgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); +int qgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); +int qgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); +int qgemm_tt(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); +#else +int qgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qgemm_tt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +#endif + +int cgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_nr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_nc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_tt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_tr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_tc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_rn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_rt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_rr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_rc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_cn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_ct(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_cr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_cc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int zgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_nr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_nc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_tt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_tr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_tc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_rn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_rt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_rr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_rc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_cn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_ct(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_cr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_cc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +#ifdef QUAD_PRECISION +int xgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, 
xidouble *, BLASLONG); +int xgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); +int xgemm_nr(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); +int xgemm_nc(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); +int xgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); +int xgemm_tt(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); +int xgemm_tr(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); +int xgemm_tc(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); +int xgemm_rn(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); +int xgemm_rt(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); +int xgemm_rr(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); +int xgemm_rc(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); +int xgemm_cn(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); +int xgemm_ct(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); +int xgemm_cr(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); +int xgemm_cc(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); +#else +int xgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_nr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_nc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_tt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_tr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_tc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_rn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_rt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_rr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_rc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_cn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_ct(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_cr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_cc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +#endif + +int sgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int sgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int sgemm_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int sgemm_thread_tt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int dgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dgemm_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dgemm_thread_tt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +#ifdef QUAD_PRECISION +int qgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); +int 
qgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); +int qgemm_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); +int qgemm_thread_tt(blas_arg_t *, BLASLONG *, BLASLONG *, xidouble *, xidouble *, BLASLONG); +#else +int qgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qgemm_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qgemm_thread_tt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +#endif + +int cgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_thread_nr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_thread_nc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_thread_tt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_thread_tr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_thread_tc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_thread_rn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_thread_rt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_thread_rr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_thread_rc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_thread_cn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_thread_ct(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_thread_cr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm_thread_cc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int zgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_thread_nr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_thread_nc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_thread_tt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_thread_tr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_thread_tc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_thread_rn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_thread_rt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_thread_rr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_thread_rc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_thread_cn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_thread_ct(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_thread_cr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm_thread_cc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +int xgemm_thread_nn(blas_arg_t *, 
BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_thread_nr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_thread_nc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_thread_tt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_thread_tr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_thread_tc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_thread_rn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_thread_rt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_thread_rr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_thread_rc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_thread_cn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_thread_ct(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_thread_cr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm_thread_cc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int cgemm3m_nn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_nt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_nr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_nc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_tn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_tt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_tr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_tc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_rn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_rt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_rr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_rc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_cn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_ct(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_cr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_cc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int zgemm3m_nn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_nt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_nr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_nc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_tn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_tt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_tr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_tc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_rn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int 
zgemm3m_rt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_rr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_rc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_cn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_ct(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_cr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_cc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int xgemm3m_nn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_nt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_nr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_nc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_tn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_tt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_tr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_tc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_rn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_rt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_rr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_rc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_cn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_ct(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_cr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_cc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int cgemm3m_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_thread_nr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_thread_nc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_thread_tt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_thread_tr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_thread_tc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_thread_rn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_thread_rt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_thread_rr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_thread_rc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_thread_cn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_thread_ct(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_thread_cr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemm3m_thread_cc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int zgemm3m_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_thread_nt(blas_arg_t *, 
BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_thread_nr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_thread_nc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_thread_tt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_thread_tr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_thread_tc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_thread_rn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_thread_rt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_thread_rr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_thread_rc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_thread_cn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_thread_ct(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_thread_cr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemm3m_thread_cc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +int xgemm3m_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_thread_nr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_thread_nc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_thread_tt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_thread_tr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_thread_tc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_thread_rn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_thread_rt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_thread_rr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_thread_rc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_thread_cn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_thread_ct(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_thread_cr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemm3m_thread_cc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int cher2m_LNN(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_LNT(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_LNR(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_LNC(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_LTN(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float 
*, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_LTT(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_LTR(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_LTC(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_LRN(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_LRT(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_LRR(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_LRC(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_LCN(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_LCT(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_LCR(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_LCC(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_UNN(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_UNT(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_UNR(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_UNC(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_UTN(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_UTT(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_UTR(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_UTC(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_URN(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_URT(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_URR(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_URC(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_UCN(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + 
float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_UCT(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_UCR(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); +int cher2m_UCC(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, + float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, BLASLONG); + +int zher2m_LNN(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_LNT(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_LNR(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_LNC(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_LTN(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_LTT(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_LTR(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_LTC(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_LRN(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_LRT(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_LRR(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_LRC(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_LCN(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_LCT(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_LCR(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_LCC(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_UNN(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_UNT(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_UNR(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_UNC(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double 
*, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_UTN(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_UTT(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_UTR(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_UTC(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_URN(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_URT(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_URR(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_URC(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_UCN(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_UCT(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_UCR(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); +int zher2m_UCC(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, + double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, BLASLONG); + +int strsm_LNUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strsm_LNUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strsm_LNLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strsm_LNLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strsm_LTUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strsm_LTUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strsm_LTLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strsm_LTLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strsm_RNUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strsm_RNUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strsm_RNLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strsm_RNLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strsm_RTUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strsm_RTUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strsm_RTLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strsm_RTLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int dtrsm_LNUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrsm_LNUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrsm_LNLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrsm_LNLN(blas_arg_t *, BLASLONG *, 
BLASLONG *, double *, double *, BLASLONG); +int dtrsm_LTUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrsm_LTUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrsm_LTLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrsm_LTLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrsm_RNUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrsm_RNUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrsm_RNLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrsm_RNLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrsm_RTUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrsm_RTUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrsm_RTLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrsm_RTLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +int qtrsm_LNUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrsm_LNUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrsm_LNLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrsm_LNLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrsm_LTUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrsm_LTUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrsm_LTLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrsm_LTLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrsm_RNUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrsm_RNUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrsm_RNLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrsm_RNLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrsm_RTUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrsm_RTUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrsm_RTLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrsm_RTLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int ctrsm_LNUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_LNUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_LNLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_LNLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_LTUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_LTUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_LTLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_LTLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_LRUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_LRUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_LRLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_LRLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_LCUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int 
ctrsm_LCUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_LCLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_LCLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_RNUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_RNUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_RNLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_RNLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_RTUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_RTUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_RTLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_RTLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_RRUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_RRUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_RRLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_RRLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_RCUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_RCUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_RCLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrsm_RCLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int ztrsm_LNUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_LNUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_LNLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_LNLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_LTUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_LTUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_LTLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_LTLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_LRUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_LRUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_LRLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_LRLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_LCUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_LCUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_LCLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_LCLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_RNUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_RNUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_RNLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_RNLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_RTUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_RTUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_RTLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int 
ztrsm_RTLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_RRUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_RRUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_RRLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_RRLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_RCUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_RCUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_RCLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrsm_RCLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +int xtrsm_LNUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_LNUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_LNLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_LNLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_LTUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_LTUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_LTLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_LTLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_LRUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_LRUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_LRLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_LRLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_LCUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_LCUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_LCLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_LCLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_RNUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_RNUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_RNLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_RNLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_RTUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_RTUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_RTLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_RTLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_RRUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_RRUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_RRLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_RRLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_RCUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_RCUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_RCLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrsm_RCLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int 
strmm_LNUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strmm_LNUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strmm_LNLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strmm_LNLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strmm_LTUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strmm_LTUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strmm_LTLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strmm_LTLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strmm_RNUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strmm_RNUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strmm_RNLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strmm_RNLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strmm_RTUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strmm_RTUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strmm_RTLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int strmm_RTLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int dtrmm_LNUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrmm_LNUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrmm_LNLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrmm_LNLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrmm_LTUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrmm_LTUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrmm_LTLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrmm_LTLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrmm_RNUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrmm_RNUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrmm_RNLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrmm_RNLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrmm_RTUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrmm_RTUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrmm_RTLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dtrmm_RTLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +int qtrmm_LNUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrmm_LNUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrmm_LNLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrmm_LNLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrmm_LTUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrmm_LTUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrmm_LTLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrmm_LTLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrmm_RNUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrmm_RNUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, 
BLASLONG); +int qtrmm_RNLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrmm_RNLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrmm_RTUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrmm_RTUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrmm_RTLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qtrmm_RTLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int ctrmm_LNUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_LNUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_LNLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_LNLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_LTUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_LTUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_LTLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_LTLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_LRUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_LRUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_LRLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_LRLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_LCUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_LCUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_LCLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_LCLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_RNUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_RNUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_RNLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_RNLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_RTUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_RTUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_RTLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_RTLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_RRUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_RRUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_RRLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_RRLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_RCUU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_RCUN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_RCLU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ctrmm_RCLN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int ztrmm_LNUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_LNUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_LNLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_LNLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int 
ztrmm_LTUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_LTUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_LTLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_LTLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_LRUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_LRUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_LRLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_LRLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_LCUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_LCUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_LCLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_LCLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_RNUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_RNUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_RNLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_RNLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_RTUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_RTUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_RTLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_RTLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_RRUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_RRUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_RRLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_RRLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_RCUU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_RCUN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_RCLU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int ztrmm_RCLN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +int xtrmm_LNUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_LNUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_LNLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_LNLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_LTUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_LTUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_LTLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_LTLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_LRUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_LRUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_LRLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_LRLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_LCUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_LCUN(blas_arg_t *, BLASLONG *, 
BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_LCLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_LCLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_RNUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_RNUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_RNLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_RNLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_RTUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_RTUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_RTLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_RTLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_RRUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_RRUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_RRLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_RRLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_RCUU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_RCUN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_RCLU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xtrmm_RCLN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int ssymm_LU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ssymm_LL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ssymm_RU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ssymm_RL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int dsymm_LU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dsymm_LL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dsymm_RU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dsymm_RL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +int qsymm_LU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qsymm_LL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qsymm_RU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qsymm_RL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int csymm_LU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int csymm_LL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int csymm_RU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int csymm_RL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int zsymm_LU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zsymm_LL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zsymm_RU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zsymm_RL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +int xsymm_LU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xsymm_LL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xsymm_RU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xsymm_RL(blas_arg_t 
*, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int csymm3m_LU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int csymm3m_LL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int csymm3m_RU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int csymm3m_RL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int zsymm3m_LU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zsymm3m_LL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zsymm3m_RU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zsymm3m_RL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +int xsymm3m_LU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xsymm3m_LL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xsymm3m_RU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xsymm3m_RL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int csymm3m_thread_LU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int csymm3m_thread_LL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int csymm3m_thread_RU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int csymm3m_thread_RL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int zsymm3m_thread_LU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zsymm3m_thread_LL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zsymm3m_thread_RU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zsymm3m_thread_RL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +int xsymm3m_thread_LU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xsymm3m_thread_LL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xsymm3m_thread_RU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xsymm3m_thread_RL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int chemm_LU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int chemm_LL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int chemm_RU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int chemm_RL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int zhemm_LU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zhemm_LL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zhemm_RU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zhemm_RL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +int xhemm_LU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xhemm_LL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xhemm_RU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xhemm_RL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int chemm3m_LU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int chemm3m_LL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int chemm3m_RU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int chemm3m_RL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int 
zhemm3m_LU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zhemm3m_LL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zhemm3m_RU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zhemm3m_RL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +int xhemm3m_LU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xhemm3m_LL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xhemm3m_RU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xhemm3m_RL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int chemm3m_thread_LU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int chemm3m_thread_LL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int chemm3m_thread_RU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int chemm3m_thread_RL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int zhemm3m_thread_LU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zhemm3m_thread_LL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zhemm3m_thread_RU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zhemm3m_thread_RL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +int xhemm3m_thread_LU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xhemm3m_thread_LL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xhemm3m_thread_RU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xhemm3m_thread_RL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int ssymm_thread_LU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ssymm_thread_LL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ssymm_thread_RU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ssymm_thread_RL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int dsymm_thread_LU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dsymm_thread_LL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dsymm_thread_RU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dsymm_thread_RL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +int qsymm_thread_LU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qsymm_thread_LL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qsymm_thread_RU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qsymm_thread_RL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int csymm_thread_LU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int csymm_thread_LL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int csymm_thread_RU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int csymm_thread_RL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int zsymm_thread_LU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zsymm_thread_LL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zsymm_thread_RU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zsymm_thread_RL(blas_arg_t *, BLASLONG *, 
BLASLONG *, double *, double *, BLASLONG); + +int xsymm_thread_LU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xsymm_thread_LL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xsymm_thread_RU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xsymm_thread_RL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int chemm_thread_LU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int chemm_thread_LL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int chemm_thread_RU(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int chemm_thread_RL(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int zhemm_thread_LU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zhemm_thread_LL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zhemm_thread_RU(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zhemm_thread_RL(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +int xhemm_thread_LU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xhemm_thread_LL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xhemm_thread_RU(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xhemm_thread_RL(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int ssyrk_UN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ssyrk_UT(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ssyrk_LN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ssyrk_LT(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int dsyrk_UN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dsyrk_UT(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dsyrk_LN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dsyrk_LT(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +int qsyrk_UN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qsyrk_UT(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qsyrk_LN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qsyrk_LT(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int csyrk_UN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int csyrk_UT(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int csyrk_LN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int csyrk_LT(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int zsyrk_UN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zsyrk_UT(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zsyrk_LN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zsyrk_LT(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +int xsyrk_UN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xsyrk_UT(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xsyrk_LN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xsyrk_LT(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int ssyrk_thread_UN(blas_arg_t *, BLASLONG *, 
BLASLONG *, float *, float *, BLASLONG); +int ssyrk_thread_UT(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ssyrk_thread_LN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ssyrk_thread_LT(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int dsyrk_thread_UN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dsyrk_thread_UT(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dsyrk_thread_LN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dsyrk_thread_LT(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +int qsyrk_thread_UN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qsyrk_thread_UT(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qsyrk_thread_LN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qsyrk_thread_LT(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int csyrk_thread_UN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int csyrk_thread_UT(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int csyrk_thread_LN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int csyrk_thread_LT(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int zsyrk_thread_UN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zsyrk_thread_UT(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zsyrk_thread_LN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zsyrk_thread_LT(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +int xsyrk_thread_UN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xsyrk_thread_UT(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xsyrk_thread_LN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xsyrk_thread_LT(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int ssyr2k_UN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ssyr2k_UT(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ssyr2k_LN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int ssyr2k_LT(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int dsyr2k_UN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dsyr2k_UT(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dsyr2k_LN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dsyr2k_LT(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +int qsyr2k_UN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qsyr2k_UT(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qsyr2k_LN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qsyr2k_LT(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int csyr2k_UN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int csyr2k_UT(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int csyr2k_LN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int csyr2k_LT(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int zsyr2k_UN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); 
+int zsyr2k_UT(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zsyr2k_LN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zsyr2k_LT(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +int xsyr2k_UN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xsyr2k_UT(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xsyr2k_LN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xsyr2k_LT(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int cherk_UN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cherk_UC(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cherk_LN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cherk_LC(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int zherk_UN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zherk_UC(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zherk_LN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zherk_LC(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +int xherk_UN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xherk_UC(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xherk_LN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xherk_LC(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int cherk_thread_UN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cherk_thread_UC(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cherk_thread_LN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cherk_thread_LC(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int zherk_thread_UN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zherk_thread_UC(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zherk_thread_LN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zherk_thread_LC(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +int xherk_thread_UN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xherk_thread_UC(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xherk_thread_LN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xherk_thread_LC(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int cher2k_UN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cher2k_UC(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cher2k_LN(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cher2k_LC(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int zher2k_UN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zher2k_UC(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zher2k_LN(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zher2k_LC(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +int xher2k_UN(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xher2k_UC(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xher2k_LN(blas_arg_t 
*, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xher2k_LC(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int sgemt_n(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, int); +int sgemt_t(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, int); +int dgemt_n(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, int); +int dgemt_t(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, int); + +int cgemt_n(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, int); +int cgemt_t(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, int); +int cgemt_r(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, int); +int cgemt_c(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, int); +int zgemt_n(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, int); +int zgemt_t(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, int); +int zgemt_r(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, int); +int zgemt_c(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, int); + +int sgema_n(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG); +int sgema_t(BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG); +int dgema_n(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG); +int dgema_t(BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG); + +int cgema_n(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG); +int cgema_t(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG); +int cgema_r(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG); +int cgema_c(BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG); +int zgema_n(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); +int zgema_t(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); +int zgema_r(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); +int zgema_c(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); + +int cgemm3m_incopyb(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); +int cgemm3m_incopyr(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); +int cgemm3m_incopyi(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); +int cgemm3m_itcopyb(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); +int cgemm3m_itcopyr(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); +int cgemm3m_itcopyi(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); + +int cgemm3m_oncopyb(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float alpha_r, float alpha_i, float *b); +int cgemm3m_oncopyr(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float alpha_r, float alpha_i, float *b); +int cgemm3m_oncopyi(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float alpha_r, float alpha_i, float *b); +int cgemm3m_otcopyb(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float alpha_r, float alpha_i, float *b); +int cgemm3m_otcopyr(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float alpha_r, float alpha_i, float *b); +int cgemm3m_otcopyi(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float alpha_r, float alpha_i, float *b); + +int zgemm3m_incopyb(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b); +int zgemm3m_incopyr(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b); +int zgemm3m_incopyi(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b); +int 
zgemm3m_itcopyb(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b); +int zgemm3m_itcopyr(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b); +int zgemm3m_itcopyi(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b); + +int zgemm3m_oncopyb(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double alpha_r, double alpha_i, double *b); +int zgemm3m_oncopyr(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double alpha_r, double alpha_i, double *b); +int zgemm3m_oncopyi(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double alpha_r, double alpha_i, double *b); +int zgemm3m_otcopyb(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double alpha_r, double alpha_i, double *b); +int zgemm3m_otcopyr(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double alpha_r, double alpha_i, double *b); +int zgemm3m_otcopyi(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double alpha_r, double alpha_i, double *b); + +int xgemm3m_incopyb(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b); +int xgemm3m_incopyr(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b); +int xgemm3m_incopyi(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b); +int xgemm3m_itcopyb(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b); +int xgemm3m_itcopyr(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b); +int xgemm3m_itcopyi(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b); + +int xgemm3m_oncopyb(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble alpha_r, xdouble alpha_i, xdouble *b); +int xgemm3m_oncopyr(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble alpha_r, xdouble alpha_i, xdouble *b); +int xgemm3m_oncopyi(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble alpha_r, xdouble alpha_i, xdouble *b); +int xgemm3m_otcopyb(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble alpha_r, xdouble alpha_i, xdouble *b); +int xgemm3m_otcopyr(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble alpha_r, xdouble alpha_i, xdouble *b); +int xgemm3m_otcopyi(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble alpha_r, xdouble alpha_i, xdouble *b); + +int csymm3m_iucopyb(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float *b); +int csymm3m_ilcopyb(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float *b); +int csymm3m_iucopyr(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float *b); +int csymm3m_ilcopyr(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float *b); +int csymm3m_iucopyi(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float *b); +int csymm3m_ilcopyi(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float *b); + +int csymm3m_oucopyb(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float alpha_r, float alpha_i, float *b); +int csymm3m_olcopyb(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float alpha_r, float alpha_i, float *b); +int csymm3m_oucopyr(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float alpha_r, float alpha_i, float *b); +int csymm3m_olcopyr(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float alpha_r, float alpha_i, float *b); +int csymm3m_oucopyi(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float alpha_r, float alpha_i, float *b); +int csymm3m_olcopyi(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float alpha_r, float 
alpha_i, float *b); + +int zsymm3m_iucopyb(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double *b); +int zsymm3m_ilcopyb(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double *b); +int zsymm3m_iucopyr(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double *b); +int zsymm3m_ilcopyr(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double *b); +int zsymm3m_iucopyi(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double *b); +int zsymm3m_ilcopyi(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double *b); + +int zsymm3m_oucopyb(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double alpha_r, double alpha_i, double *b); +int zsymm3m_olcopyb(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double alpha_r, double alpha_i, double *b); +int zsymm3m_oucopyr(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double alpha_r, double alpha_i, double *b); +int zsymm3m_olcopyr(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double alpha_r, double alpha_i, double *b); +int zsymm3m_oucopyi(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double alpha_r, double alpha_i, double *b); +int zsymm3m_olcopyi(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double alpha_r, double alpha_i, double *b); + +int xsymm3m_iucopyb(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble *b); +int xsymm3m_ilcopyb(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble *b); +int xsymm3m_iucopyr(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble *b); +int xsymm3m_ilcopyr(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble *b); +int xsymm3m_iucopyi(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble *b); +int xsymm3m_ilcopyi(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble *b); + +int xsymm3m_oucopyb(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble alpha_r, xdouble alpha_i, xdouble *b); +int xsymm3m_olcopyb(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble alpha_r, xdouble alpha_i, xdouble *b); +int xsymm3m_oucopyr(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble alpha_r, xdouble alpha_i, xdouble *b); +int xsymm3m_olcopyr(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble alpha_r, xdouble alpha_i, xdouble *b); +int xsymm3m_oucopyi(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble alpha_r, xdouble alpha_i, xdouble *b); +int xsymm3m_olcopyi(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble alpha_r, xdouble alpha_i, xdouble *b); + +int chemm3m_iucopyb(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float *b); +int chemm3m_ilcopyb(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float *b); +int chemm3m_iucopyr(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float *b); +int chemm3m_ilcopyr(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float *b); +int chemm3m_iucopyi(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float *b); +int chemm3m_ilcopyi(BLASLONG m, 
BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float *b); + +int chemm3m_oucopyb(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float alpha_r, float alpha_i, float *b); +int chemm3m_olcopyb(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float alpha_r, float alpha_i, float *b); +int chemm3m_oucopyr(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float alpha_r, float alpha_i, float *b); +int chemm3m_olcopyr(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float alpha_r, float alpha_i, float *b); +int chemm3m_oucopyi(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float alpha_r, float alpha_i, float *b); +int chemm3m_olcopyi(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG x, BLASLONG y, float alpha_r, float alpha_i, float *b); + +int zhemm3m_iucopyb(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double *b); +int zhemm3m_ilcopyb(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double *b); +int zhemm3m_iucopyr(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double *b); +int zhemm3m_ilcopyr(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double *b); +int zhemm3m_iucopyi(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double *b); +int zhemm3m_ilcopyi(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double *b); + +int zhemm3m_oucopyb(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double alpha_r, double alpha_i, double *b); +int zhemm3m_olcopyb(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double alpha_r, double alpha_i, double *b); +int zhemm3m_oucopyr(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double alpha_r, double alpha_i, double *b); +int zhemm3m_olcopyr(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double alpha_r, double alpha_i, double *b); +int zhemm3m_oucopyi(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double alpha_r, double alpha_i, double *b); +int zhemm3m_olcopyi(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG x, BLASLONG y, double alpha_r, double alpha_i, double *b); + +int xhemm3m_iucopyb(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble *b); +int xhemm3m_ilcopyb(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble *b); +int xhemm3m_iucopyr(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble *b); +int xhemm3m_ilcopyr(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble *b); +int xhemm3m_iucopyi(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble *b); +int xhemm3m_ilcopyi(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble *b); + +int xhemm3m_oucopyb(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble alpha_r, xdouble alpha_i, xdouble *b); +int xhemm3m_olcopyb(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble alpha_r, xdouble alpha_i, xdouble *b); +int xhemm3m_oucopyr(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble alpha_r, xdouble alpha_i, xdouble *b); +int xhemm3m_olcopyr(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble alpha_r, xdouble alpha_i, 
xdouble *b); +int xhemm3m_oucopyi(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble alpha_r, xdouble alpha_i, xdouble *b); +int xhemm3m_olcopyi(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, BLASLONG x, BLASLONG y, xdouble alpha_r, xdouble alpha_i, xdouble *b); + +int sgemc_nn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int sgemc_nt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int sgemc_tn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int sgemc_tt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int dgemc_nn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dgemc_nt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dgemc_tn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int dgemc_tt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +int qgemc_nn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qgemc_nt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qgemc_tn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int qgemc_tt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int cgemc_nn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemc_nt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemc_nr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemc_nc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemc_tn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemc_tt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemc_tr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemc_tc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemc_rn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemc_rt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemc_rr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemc_rc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemc_cn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemc_ct(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemc_cr(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); +int cgemc_cc(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); + +int zgemc_nn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemc_nt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemc_nr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemc_nc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemc_tn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemc_tt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemc_tr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemc_tc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemc_rn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemc_rt(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemc_rr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemc_rc(blas_arg_t *, BLASLONG *, 
BLASLONG *, double *, double *, BLASLONG); +int zgemc_cn(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemc_ct(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemc_cr(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); +int zgemc_cc(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG); + +int xgemc_nn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemc_nt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemc_nr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemc_nc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemc_tn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemc_tt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemc_tr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemc_tc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemc_rn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemc_rt(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemc_rr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemc_rc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemc_cn(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemc_ct(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemc_cr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +int xgemc_cc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); + +int sgemc_oncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b, BLASLONG ldb, float *c); +int sgemc_otcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b, BLASLONG ldb, float *c); +int dgemc_oncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b, BLASLONG ldb, double *c); +int dgemc_otcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b, BLASLONG ldb, double *c); +int qgemc_oncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b, BLASLONG ldb, xdouble *c); +int qgemc_otcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b, BLASLONG ldb, xdouble *c); +int cgemc_oncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b, BLASLONG ldb, float *c); +int cgemc_otcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b, BLASLONG ldb, float *c); +int zgemc_oncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b, BLASLONG ldb, double *c); +int zgemc_otcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b, BLASLONG ldb, double *c); +int xgemc_oncopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b, BLASLONG ldb, xdouble *c); +int xgemc_otcopy(BLASLONG m, BLASLONG n, xdouble *a, BLASLONG lda, xdouble *b, BLASLONG ldb, xdouble *c); + +#ifdef __CUDACC__ +} +#endif + +#endif diff --git a/common_linux.h b/common_linux.h new file mode 100644 index 0000000000..d18cd2b721 --- /dev/null +++ b/common_linux.h @@ -0,0 +1,83 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef COMMON_LINUX_H +#define COMMON_LINUX_H + +#ifndef ASSEMBLER + +#include <syscall.h> + +extern long int syscall (long int __sysno, ...); + +#ifndef MPOL_PREFERRED +#define MPOL_PREFERRED 1 +#endif + +#ifndef MPOL_INTERLEAVE +#define MPOL_INTERLEAVE 3 +#endif + +#if defined(ARCH_IA64) && defined(__ECC) +#ifndef __NR_mbind +#define __NR_mbind 1259 +#endif +#ifndef __NR_get_mempolicy +#define __NR_get_mempolicy 1260 +#endif +#ifndef __NR_set_mempolicy +#define __NR_set_mempolicy 1261 +#endif +#endif + +static inline int my_mbind(void *addr, unsigned long len, int mode, + unsigned long *nodemask, unsigned long maxnode, + unsigned flags) { + + return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); +} + +static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) { + + return syscall(SYS_set_mempolicy, mode, addr, flag); +} + +static inline int my_gettid(void) { return syscall(SYS_gettid); } + +#endif +#endif diff --git a/common_macro.h b/common_macro.h new file mode 100644 index 0000000000..bcaa9f38b9 --- /dev/null +++ b/common_macro.h @@ -0,0 +1,2734 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution.
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef COMMON_MACRO +#define COMMON_MACRO + +#include "common_s.h" +#include "common_d.h" +#include "common_q.h" + +#include "common_c.h" +#include "common_z.h" +#include "common_x.h" + +#ifndef COMPLEX +#ifdef XDOUBLE + +#define AMAX_K QAMAX_K +#define AMIN_K QAMIN_K +#define MAX_K QMAX_K +#define MIN_K QMIN_K +#define IAMAX_K IQAMAX_K +#define IAMIN_K IQAMIN_K +#define IMAX_K IQMAX_K +#define IMIN_K IQMIN_K +#define ASUM_K QASUM_K +#define AXPYU_K QAXPYU_K +#define AXPYC_K QAXPYC_K +#define COPY_K QCOPY_K +#define DOTU_K QDOTU_K +#define DOTC_K QDOTC_K +#define NRM2_K QNRM2_K +#define SCAL_K QSCAL_K +#define SWAP_K QSWAP_K +#define ROT_K QROT_K + +#define GEMV_N QGEMV_N +#define GEMV_T QGEMV_T +#define GEMV_R QGEMV_R +#define GEMV_C QGEMV_C +#define GEMV_O QGEMV_O +#define GEMV_U QGEMV_U +#define GEMV_S QGEMV_S +#define GEMV_D QGEMV_D + +#define GERU_K QGERU_K +#define GERC_K QGERC_K +#define GERV_K QGERV_K +#define GERD_K QGERD_K + +#define SYMV_U QSYMV_U +#define SYMV_L QSYMV_L + +#define SYMV_THREAD_U QSYMV_THREAD_U +#define SYMV_THREAD_L QSYMV_THREAD_L + +#define GEMM_ONCOPY QGEMM_ONCOPY +#define GEMM_OTCOPY QGEMM_OTCOPY +#define GEMM_INCOPY QGEMM_INCOPY +#define GEMM_ITCOPY QGEMM_ITCOPY + +#ifdef UNIT + +#define TRMM_OUNCOPY QTRMM_OUNUCOPY +#define TRMM_OUTCOPY QTRMM_OUTUCOPY +#define TRMM_OLNCOPY QTRMM_OLNUCOPY +#define TRMM_OLTCOPY QTRMM_OLTUCOPY +#define TRSM_OUNCOPY QTRSM_OUNUCOPY +#define TRSM_OUTCOPY QTRSM_OUTUCOPY +#define TRSM_OLNCOPY QTRSM_OLNUCOPY +#define TRSM_OLTCOPY QTRSM_OLTUCOPY + +#define TRMM_IUNCOPY QTRMM_IUNUCOPY +#define TRMM_IUTCOPY QTRMM_IUTUCOPY +#define TRMM_ILNCOPY QTRMM_ILNUCOPY +#define TRMM_ILTCOPY QTRMM_ILTUCOPY +#define TRSM_IUNCOPY QTRSM_IUNUCOPY +#define TRSM_IUTCOPY QTRSM_IUTUCOPY +#define TRSM_ILNCOPY QTRSM_ILNUCOPY +#define TRSM_ILTCOPY QTRSM_ILTUCOPY + +#else + +#define TRMM_OUNCOPY QTRMM_OUNNCOPY +#define TRMM_OUTCOPY QTRMM_OUTNCOPY +#define TRMM_OLNCOPY QTRMM_OLNNCOPY +#define TRMM_OLTCOPY QTRMM_OLTNCOPY +#define TRSM_OUNCOPY QTRSM_OUNNCOPY +#define TRSM_OUTCOPY QTRSM_OUTNCOPY +#define TRSM_OLNCOPY QTRSM_OLNNCOPY +#define TRSM_OLTCOPY QTRSM_OLTNCOPY + +#define TRMM_IUNCOPY QTRMM_IUNNCOPY +#define TRMM_IUTCOPY QTRMM_IUTNCOPY +#define TRMM_ILNCOPY QTRMM_ILNNCOPY +#define TRMM_ILTCOPY QTRMM_ILTNCOPY +#define TRSM_IUNCOPY QTRSM_IUNNCOPY +#define TRSM_IUTCOPY 
QTRSM_IUTNCOPY +#define TRSM_ILNCOPY QTRSM_ILNNCOPY +#define TRSM_ILTCOPY QTRSM_ILTNCOPY + +#endif + +#define GEMM_BETA QGEMM_BETA + +#define GEMM_KERNEL_N QGEMM_KERNEL +#define GEMM_KERNEL_L QGEMM_KERNEL +#define GEMM_KERNEL_R QGEMM_KERNEL +#define GEMM_KERNEL_B QGEMM_KERNEL + +#define TRMM_KERNEL_LN QTRMM_KERNEL_LN +#define TRMM_KERNEL_LT QTRMM_KERNEL_LT +#define TRMM_KERNEL_LR QTRMM_KERNEL_LN +#define TRMM_KERNEL_LC QTRMM_KERNEL_LT +#define TRMM_KERNEL_RN QTRMM_KERNEL_RN +#define TRMM_KERNEL_RT QTRMM_KERNEL_RT +#define TRMM_KERNEL_RR QTRMM_KERNEL_RN +#define TRMM_KERNEL_RC QTRMM_KERNEL_RT + +#define TRSM_KERNEL_LN QTRSM_KERNEL_LN +#define TRSM_KERNEL_LT QTRSM_KERNEL_LT +#define TRSM_KERNEL_LR QTRSM_KERNEL_LN +#define TRSM_KERNEL_LC QTRSM_KERNEL_LT +#define TRSM_KERNEL_RN QTRSM_KERNEL_RN +#define TRSM_KERNEL_RT QTRSM_KERNEL_RT +#define TRSM_KERNEL_RR QTRSM_KERNEL_RN +#define TRSM_KERNEL_RC QTRSM_KERNEL_RT + +#define SYMM_IUTCOPY QSYMM_IUTCOPY +#define SYMM_ILTCOPY QSYMM_ILTCOPY +#define SYMM_OUTCOPY QSYMM_OUTCOPY +#define SYMM_OLTCOPY QSYMM_OLTCOPY + +#define GEMM_NN QGEMM_NN +#define GEMM_CN QGEMM_TN +#define GEMM_TN QGEMM_TN +#define GEMM_NC QGEMM_NT +#define GEMM_NT QGEMM_NT +#define GEMM_CC QGEMM_TT +#define GEMM_CT QGEMM_TT +#define GEMM_TC QGEMM_TT +#define GEMM_TT QGEMM_TT +#define GEMM_NR QGEMM_NN +#define GEMM_TR QGEMM_TN +#define GEMM_CR QGEMM_TN +#define GEMM_RN QGEMM_NN +#define GEMM_RT QGEMM_NT +#define GEMM_RC QGEMM_NT +#define GEMM_RR QGEMM_NN + +#define SYMM_LU QSYMM_LU +#define SYMM_LL QSYMM_LL +#define SYMM_RU QSYMM_RU +#define SYMM_RL QSYMM_RL + +#define HEMM_LU QHEMM_LU +#define HEMM_LL QHEMM_LL +#define HEMM_RU QHEMM_RU +#define HEMM_RL QHEMM_RL + +#define SYRK_UN QSYRK_UN +#define SYRK_UT QSYRK_UT +#define SYRK_LN QSYRK_LN +#define SYRK_LT QSYRK_LT +#define SYRK_UR QSYRK_UN +#define SYRK_UC QSYRK_UT +#define SYRK_LR QSYRK_LN +#define SYRK_LC QSYRK_LT + +#define SYRK_KERNEL_U QSYRK_KERNEL_U +#define SYRK_KERNEL_L QSYRK_KERNEL_L + +#define HERK_UN QSYRK_UN +#define HERK_LN QSYRK_LN +#define HERK_UC QSYRK_UT +#define HERK_LC QSYRK_LT + +#define HER2K_UN QSYR2K_UN +#define HER2K_LN QSYR2K_LN +#define HER2K_UC QSYR2K_UT +#define HER2K_LC QSYR2K_LT + +#define SYR2K_UN QSYR2K_UN +#define SYR2K_UT QSYR2K_UT +#define SYR2K_LN QSYR2K_LN +#define SYR2K_LT QSYR2K_LT +#define SYR2K_UR QSYR2K_UN +#define SYR2K_UC QSYR2K_UT +#define SYR2K_LR QSYR2K_LN +#define SYR2K_LC QSYR2K_LT + +#define SYR2K_KERNEL_U QSYR2K_KERNEL_U +#define SYR2K_KERNEL_L QSYR2K_KERNEL_L + +#define TRMM_LNUU QTRMM_LNUU +#define TRMM_LNUN QTRMM_LNUN +#define TRMM_LNLU QTRMM_LNLU +#define TRMM_LNLN QTRMM_LNLN +#define TRMM_LTUU QTRMM_LTUU +#define TRMM_LTUN QTRMM_LTUN +#define TRMM_LTLU QTRMM_LTLU +#define TRMM_LTLN QTRMM_LTLN +#define TRMM_LRUU QTRMM_LNUU +#define TRMM_LRUN QTRMM_LNUN +#define TRMM_LRLU QTRMM_LNLU +#define TRMM_LRLN QTRMM_LNLN +#define TRMM_LCUU QTRMM_LTUU +#define TRMM_LCUN QTRMM_LTUN +#define TRMM_LCLU QTRMM_LTLU +#define TRMM_LCLN QTRMM_LTLN +#define TRMM_RNUU QTRMM_RNUU +#define TRMM_RNUN QTRMM_RNUN +#define TRMM_RNLU QTRMM_RNLU +#define TRMM_RNLN QTRMM_RNLN +#define TRMM_RTUU QTRMM_RTUU +#define TRMM_RTUN QTRMM_RTUN +#define TRMM_RTLU QTRMM_RTLU +#define TRMM_RTLN QTRMM_RTLN +#define TRMM_RRUU QTRMM_RNUU +#define TRMM_RRUN QTRMM_RNUN +#define TRMM_RRLU QTRMM_RNLU +#define TRMM_RRLN QTRMM_RNLN +#define TRMM_RCUU QTRMM_RTUU +#define TRMM_RCUN QTRMM_RTUN +#define TRMM_RCLU QTRMM_RTLU +#define TRMM_RCLN QTRMM_RTLN + +#define TRSM_LNUU QTRSM_LNUU +#define TRSM_LNUN QTRSM_LNUN +#define 
TRSM_LNLU QTRSM_LNLU +#define TRSM_LNLN QTRSM_LNLN +#define TRSM_LTUU QTRSM_LTUU +#define TRSM_LTUN QTRSM_LTUN +#define TRSM_LTLU QTRSM_LTLU +#define TRSM_LTLN QTRSM_LTLN +#define TRSM_LRUU QTRSM_LNUU +#define TRSM_LRUN QTRSM_LNUN +#define TRSM_LRLU QTRSM_LNLU +#define TRSM_LRLN QTRSM_LNLN +#define TRSM_LCUU QTRSM_LTUU +#define TRSM_LCUN QTRSM_LTUN +#define TRSM_LCLU QTRSM_LTLU +#define TRSM_LCLN QTRSM_LTLN +#define TRSM_RNUU QTRSM_RNUU +#define TRSM_RNUN QTRSM_RNUN +#define TRSM_RNLU QTRSM_RNLU +#define TRSM_RNLN QTRSM_RNLN +#define TRSM_RTUU QTRSM_RTUU +#define TRSM_RTUN QTRSM_RTUN +#define TRSM_RTLU QTRSM_RTLU +#define TRSM_RTLN QTRSM_RTLN +#define TRSM_RRUU QTRSM_RNUU +#define TRSM_RRUN QTRSM_RNUN +#define TRSM_RRLU QTRSM_RNLU +#define TRSM_RRLN QTRSM_RNLN +#define TRSM_RCUU QTRSM_RTUU +#define TRSM_RCUN QTRSM_RTUN +#define TRSM_RCLU QTRSM_RTLU +#define TRSM_RCLN QTRSM_RTLN + +#define GEMM_THREAD_NN QGEMM_THREAD_NN +#define GEMM_THREAD_CN QGEMM_THREAD_TN +#define GEMM_THREAD_TN QGEMM_THREAD_TN +#define GEMM_THREAD_NC QGEMM_THREAD_NT +#define GEMM_THREAD_NT QGEMM_THREAD_NT +#define GEMM_THREAD_CC QGEMM_THREAD_TT +#define GEMM_THREAD_CT QGEMM_THREAD_TT +#define GEMM_THREAD_TC QGEMM_THREAD_TT +#define GEMM_THREAD_TT QGEMM_THREAD_TT +#define GEMM_THREAD_NR QGEMM_THREAD_NN +#define GEMM_THREAD_TR QGEMM_THREAD_TN +#define GEMM_THREAD_CR QGEMM_THREAD_TN +#define GEMM_THREAD_RN QGEMM_THREAD_NN +#define GEMM_THREAD_RT QGEMM_THREAD_NT +#define GEMM_THREAD_RC QGEMM_THREAD_NT +#define GEMM_THREAD_RR QGEMM_THREAD_NN + +#define SYMM_THREAD_LU QSYMM_THREAD_LU +#define SYMM_THREAD_LL QSYMM_THREAD_LL +#define SYMM_THREAD_RU QSYMM_THREAD_RU +#define SYMM_THREAD_RL QSYMM_THREAD_RL + +#define HEMM_THREAD_LU QHEMM_THREAD_LU +#define HEMM_THREAD_LL QHEMM_THREAD_LL +#define HEMM_THREAD_RU QHEMM_THREAD_RU +#define HEMM_THREAD_RL QHEMM_THREAD_RL + +#define SYRK_THREAD_UN QSYRK_THREAD_UN +#define SYRK_THREAD_UT QSYRK_THREAD_UT +#define SYRK_THREAD_LN QSYRK_THREAD_LN +#define SYRK_THREAD_LT QSYRK_THREAD_LT +#define SYRK_THREAD_UR QSYRK_THREAD_UR +#define SYRK_THREAD_UC QSYRK_THREAD_UC +#define SYRK_THREAD_LR QSYRK_THREAD_LN +#define SYRK_THREAD_LC QSYRK_THREAD_LT + +#define HERK_THREAD_UN QSYRK_THREAD_UN +#define HERK_THREAD_UT QSYRK_THREAD_UT +#define HERK_THREAD_LN QSYRK_THREAD_LN +#define HERK_THREAD_LT QSYRK_THREAD_LT +#define HERK_THREAD_UR QSYRK_THREAD_UR +#define HERK_THREAD_UC QSYRK_THREAD_UC +#define HERK_THREAD_LR QSYRK_THREAD_LN +#define HERK_THREAD_LC QSYRK_THREAD_LT + +#elif defined(DOUBLE) + +#define AMAX_K DAMAX_K +#define AMIN_K DAMIN_K +#define MAX_K DMAX_K +#define MIN_K DMIN_K +#define IAMAX_K IDAMAX_K +#define IAMIN_K IDAMIN_K +#define IMAX_K IDMAX_K +#define IMIN_K IDMIN_K +#define ASUM_K DASUM_K +#define AXPYU_K DAXPYU_K +#define AXPYC_K DAXPYC_K +#define COPY_K DCOPY_K +#define DOTU_K DDOTU_K +#define DOTC_K DDOTC_K +#define NRM2_K DNRM2_K +#define SCAL_K DSCAL_K +#define SWAP_K DSWAP_K +#define ROT_K DROT_K + +#define GEMV_N DGEMV_N +#define GEMV_T DGEMV_T +#define GEMV_R DGEMV_R +#define GEMV_C DGEMV_C +#define GEMV_O DGEMV_O +#define GEMV_U DGEMV_U +#define GEMV_S DGEMV_S +#define GEMV_D DGEMV_D + +#define GERU_K DGERU_K +#define GERC_K DGERC_K +#define GERV_K DGERV_K +#define GERD_K DGERD_K + +#define SYMV_U DSYMV_U +#define SYMV_L DSYMV_L + +#define SYMV_THREAD_U DSYMV_THREAD_U +#define SYMV_THREAD_L DSYMV_THREAD_L + +#define GEMM_ONCOPY DGEMM_ONCOPY +#define GEMM_OTCOPY DGEMM_OTCOPY +#define GEMM_INCOPY DGEMM_INCOPY +#define GEMM_ITCOPY DGEMM_ITCOPY + +#ifdef UNIT + +#define 
TRMM_OUNCOPY DTRMM_OUNUCOPY +#define TRMM_OUTCOPY DTRMM_OUTUCOPY +#define TRMM_OLNCOPY DTRMM_OLNUCOPY +#define TRMM_OLTCOPY DTRMM_OLTUCOPY +#define TRSM_OUNCOPY DTRSM_OUNUCOPY +#define TRSM_OUTCOPY DTRSM_OUTUCOPY +#define TRSM_OLNCOPY DTRSM_OLNUCOPY +#define TRSM_OLTCOPY DTRSM_OLTUCOPY + +#define TRMM_IUNCOPY DTRMM_IUNUCOPY +#define TRMM_IUTCOPY DTRMM_IUTUCOPY +#define TRMM_ILNCOPY DTRMM_ILNUCOPY +#define TRMM_ILTCOPY DTRMM_ILTUCOPY +#define TRSM_IUNCOPY DTRSM_IUNUCOPY +#define TRSM_IUTCOPY DTRSM_IUTUCOPY +#define TRSM_ILNCOPY DTRSM_ILNUCOPY +#define TRSM_ILTCOPY DTRSM_ILTUCOPY + +#else + +#define TRMM_OUNCOPY DTRMM_OUNNCOPY +#define TRMM_OUTCOPY DTRMM_OUTNCOPY +#define TRMM_OLNCOPY DTRMM_OLNNCOPY +#define TRMM_OLTCOPY DTRMM_OLTNCOPY +#define TRSM_OUNCOPY DTRSM_OUNNCOPY +#define TRSM_OUTCOPY DTRSM_OUTNCOPY +#define TRSM_OLNCOPY DTRSM_OLNNCOPY +#define TRSM_OLTCOPY DTRSM_OLTNCOPY + +#define TRMM_IUNCOPY DTRMM_IUNNCOPY +#define TRMM_IUTCOPY DTRMM_IUTNCOPY +#define TRMM_ILNCOPY DTRMM_ILNNCOPY +#define TRMM_ILTCOPY DTRMM_ILTNCOPY +#define TRSM_IUNCOPY DTRSM_IUNNCOPY +#define TRSM_IUTCOPY DTRSM_IUTNCOPY +#define TRSM_ILNCOPY DTRSM_ILNNCOPY +#define TRSM_ILTCOPY DTRSM_ILTNCOPY + +#endif + +#define GEMM_BETA DGEMM_BETA + +#define GEMM_KERNEL_N DGEMM_KERNEL +#define GEMM_KERNEL_L DGEMM_KERNEL +#define GEMM_KERNEL_R DGEMM_KERNEL +#define GEMM_KERNEL_B DGEMM_KERNEL + +#define TRMM_KERNEL_LN DTRMM_KERNEL_LN +#define TRMM_KERNEL_LT DTRMM_KERNEL_LT +#define TRMM_KERNEL_LR DTRMM_KERNEL_LN +#define TRMM_KERNEL_LC DTRMM_KERNEL_LT +#define TRMM_KERNEL_RN DTRMM_KERNEL_RN +#define TRMM_KERNEL_RT DTRMM_KERNEL_RT +#define TRMM_KERNEL_RR DTRMM_KERNEL_RN +#define TRMM_KERNEL_RC DTRMM_KERNEL_RT + +#define TRSM_KERNEL_LN DTRSM_KERNEL_LN +#define TRSM_KERNEL_LT DTRSM_KERNEL_LT +#define TRSM_KERNEL_LR DTRSM_KERNEL_LN +#define TRSM_KERNEL_LC DTRSM_KERNEL_LT +#define TRSM_KERNEL_RN DTRSM_KERNEL_RN +#define TRSM_KERNEL_RT DTRSM_KERNEL_RT +#define TRSM_KERNEL_RR DTRSM_KERNEL_RN +#define TRSM_KERNEL_RC DTRSM_KERNEL_RT + +#define SYMM_IUTCOPY DSYMM_IUTCOPY +#define SYMM_ILTCOPY DSYMM_ILTCOPY +#define SYMM_OUTCOPY DSYMM_OUTCOPY +#define SYMM_OLTCOPY DSYMM_OLTCOPY + +#define GEMM_NN DGEMM_NN +#define GEMM_CN DGEMM_TN +#define GEMM_TN DGEMM_TN +#define GEMM_NC DGEMM_NT +#define GEMM_NT DGEMM_NT +#define GEMM_CC DGEMM_TT +#define GEMM_CT DGEMM_TT +#define GEMM_TC DGEMM_TT +#define GEMM_TT DGEMM_TT +#define GEMM_NR DGEMM_NN +#define GEMM_TR DGEMM_TN +#define GEMM_CR DGEMM_TN +#define GEMM_RN DGEMM_NN +#define GEMM_RT DGEMM_NT +#define GEMM_RC DGEMM_NT +#define GEMM_RR DGEMM_NN + +#define SYMM_LU DSYMM_LU +#define SYMM_LL DSYMM_LL +#define SYMM_RU DSYMM_RU +#define SYMM_RL DSYMM_RL + +#define HEMM_LU DHEMM_LU +#define HEMM_LL DHEMM_LL +#define HEMM_RU DHEMM_RU +#define HEMM_RL DHEMM_RL + +#define SYRK_UN DSYRK_UN +#define SYRK_UT DSYRK_UT +#define SYRK_LN DSYRK_LN +#define SYRK_LT DSYRK_LT +#define SYRK_UR DSYRK_UN +#define SYRK_UC DSYRK_UT +#define SYRK_LR DSYRK_LN +#define SYRK_LC DSYRK_LT + +#define SYRK_KERNEL_U DSYRK_KERNEL_U +#define SYRK_KERNEL_L DSYRK_KERNEL_L + +#define HERK_UN DSYRK_UN +#define HERK_LN DSYRK_LN +#define HERK_UC DSYRK_UT +#define HERK_LC DSYRK_LT + +#define HER2K_UN DSYR2K_UN +#define HER2K_LN DSYR2K_LN +#define HER2K_UC DSYR2K_UT +#define HER2K_LC DSYR2K_LT + +#define SYR2K_UN DSYR2K_UN +#define SYR2K_UT DSYR2K_UT +#define SYR2K_LN DSYR2K_LN +#define SYR2K_LT DSYR2K_LT +#define SYR2K_UR DSYR2K_UN +#define SYR2K_UC DSYR2K_UT +#define SYR2K_LR DSYR2K_LN +#define SYR2K_LC DSYR2K_LT + +#define 
SYR2K_KERNEL_U DSYR2K_KERNEL_U +#define SYR2K_KERNEL_L DSYR2K_KERNEL_L + +#define TRMM_LNUU DTRMM_LNUU +#define TRMM_LNUN DTRMM_LNUN +#define TRMM_LNLU DTRMM_LNLU +#define TRMM_LNLN DTRMM_LNLN +#define TRMM_LTUU DTRMM_LTUU +#define TRMM_LTUN DTRMM_LTUN +#define TRMM_LTLU DTRMM_LTLU +#define TRMM_LTLN DTRMM_LTLN +#define TRMM_LRUU DTRMM_LNUU +#define TRMM_LRUN DTRMM_LNUN +#define TRMM_LRLU DTRMM_LNLU +#define TRMM_LRLN DTRMM_LNLN +#define TRMM_LCUU DTRMM_LTUU +#define TRMM_LCUN DTRMM_LTUN +#define TRMM_LCLU DTRMM_LTLU +#define TRMM_LCLN DTRMM_LTLN +#define TRMM_RNUU DTRMM_RNUU +#define TRMM_RNUN DTRMM_RNUN +#define TRMM_RNLU DTRMM_RNLU +#define TRMM_RNLN DTRMM_RNLN +#define TRMM_RTUU DTRMM_RTUU +#define TRMM_RTUN DTRMM_RTUN +#define TRMM_RTLU DTRMM_RTLU +#define TRMM_RTLN DTRMM_RTLN +#define TRMM_RRUU DTRMM_RNUU +#define TRMM_RRUN DTRMM_RNUN +#define TRMM_RRLU DTRMM_RNLU +#define TRMM_RRLN DTRMM_RNLN +#define TRMM_RCUU DTRMM_RTUU +#define TRMM_RCUN DTRMM_RTUN +#define TRMM_RCLU DTRMM_RTLU +#define TRMM_RCLN DTRMM_RTLN + +#define TRSM_LNUU DTRSM_LNUU +#define TRSM_LNUN DTRSM_LNUN +#define TRSM_LNLU DTRSM_LNLU +#define TRSM_LNLN DTRSM_LNLN +#define TRSM_LTUU DTRSM_LTUU +#define TRSM_LTUN DTRSM_LTUN +#define TRSM_LTLU DTRSM_LTLU +#define TRSM_LTLN DTRSM_LTLN +#define TRSM_LRUU DTRSM_LNUU +#define TRSM_LRUN DTRSM_LNUN +#define TRSM_LRLU DTRSM_LNLU +#define TRSM_LRLN DTRSM_LNLN +#define TRSM_LCUU DTRSM_LTUU +#define TRSM_LCUN DTRSM_LTUN +#define TRSM_LCLU DTRSM_LTLU +#define TRSM_LCLN DTRSM_LTLN +#define TRSM_RNUU DTRSM_RNUU +#define TRSM_RNUN DTRSM_RNUN +#define TRSM_RNLU DTRSM_RNLU +#define TRSM_RNLN DTRSM_RNLN +#define TRSM_RTUU DTRSM_RTUU +#define TRSM_RTUN DTRSM_RTUN +#define TRSM_RTLU DTRSM_RTLU +#define TRSM_RTLN DTRSM_RTLN +#define TRSM_RRUU DTRSM_RNUU +#define TRSM_RRUN DTRSM_RNUN +#define TRSM_RRLU DTRSM_RNLU +#define TRSM_RRLN DTRSM_RNLN +#define TRSM_RCUU DTRSM_RTUU +#define TRSM_RCUN DTRSM_RTUN +#define TRSM_RCLU DTRSM_RTLU +#define TRSM_RCLN DTRSM_RTLN + +#define GEMM_THREAD_NN DGEMM_THREAD_NN +#define GEMM_THREAD_CN DGEMM_THREAD_TN +#define GEMM_THREAD_TN DGEMM_THREAD_TN +#define GEMM_THREAD_NC DGEMM_THREAD_NT +#define GEMM_THREAD_NT DGEMM_THREAD_NT +#define GEMM_THREAD_CC DGEMM_THREAD_TT +#define GEMM_THREAD_CT DGEMM_THREAD_TT +#define GEMM_THREAD_TC DGEMM_THREAD_TT +#define GEMM_THREAD_TT DGEMM_THREAD_TT +#define GEMM_THREAD_NR DGEMM_THREAD_NN +#define GEMM_THREAD_TR DGEMM_THREAD_TN +#define GEMM_THREAD_CR DGEMM_THREAD_TN +#define GEMM_THREAD_RN DGEMM_THREAD_NN +#define GEMM_THREAD_RT DGEMM_THREAD_NT +#define GEMM_THREAD_RC DGEMM_THREAD_NT +#define GEMM_THREAD_RR DGEMM_THREAD_NN + +#define SYMM_THREAD_LU DSYMM_THREAD_LU +#define SYMM_THREAD_LL DSYMM_THREAD_LL +#define SYMM_THREAD_RU DSYMM_THREAD_RU +#define SYMM_THREAD_RL DSYMM_THREAD_RL + +#define HEMM_THREAD_LU DHEMM_THREAD_LU +#define HEMM_THREAD_LL DHEMM_THREAD_LL +#define HEMM_THREAD_RU DHEMM_THREAD_RU +#define HEMM_THREAD_RL DHEMM_THREAD_RL + +#define SYRK_THREAD_UN DSYRK_THREAD_UN +#define SYRK_THREAD_UT DSYRK_THREAD_UT +#define SYRK_THREAD_LN DSYRK_THREAD_LN +#define SYRK_THREAD_LT DSYRK_THREAD_LT +#define SYRK_THREAD_UR DSYRK_THREAD_UR +#define SYRK_THREAD_UC DSYRK_THREAD_UC +#define SYRK_THREAD_LR DSYRK_THREAD_LN +#define SYRK_THREAD_LC DSYRK_THREAD_LT + +#define HERK_THREAD_UN DSYRK_THREAD_UN +#define HERK_THREAD_UT DSYRK_THREAD_UT +#define HERK_THREAD_LN DSYRK_THREAD_LN +#define HERK_THREAD_LT DSYRK_THREAD_LT +#define HERK_THREAD_UR DSYRK_THREAD_UR +#define HERK_THREAD_UC DSYRK_THREAD_UC +#define HERK_THREAD_LR 
DSYRK_THREAD_LN +#define HERK_THREAD_LC DSYRK_THREAD_LT + +#else + +#define AMAX_K SAMAX_K +#define AMIN_K SAMIN_K +#define MAX_K SMAX_K +#define MIN_K SMIN_K +#define IAMAX_K ISAMAX_K +#define IAMIN_K ISAMIN_K +#define IMAX_K ISMAX_K +#define IMIN_K ISMIN_K +#define ASUM_K SASUM_K +#define AXPYU_K SAXPYU_K +#define AXPYC_K SAXPYU_K +#define COPY_K SCOPY_K +#define DOTU_K SDOTU_K +#define DOTC_K SDOTC_K +#define NRM2_K SNRM2_K +#define SCAL_K SSCAL_K +#define SWAP_K SSWAP_K +#define ROT_K SROT_K + +#define GEMV_N SGEMV_N +#define GEMV_T SGEMV_T +#define GEMV_R SGEMV_R +#define GEMV_C SGEMV_C +#define GEMV_O SGEMV_O +#define GEMV_U SGEMV_U +#define GEMV_S SGEMV_S +#define GEMV_D SGEMV_D + +#define GERU_K SGERU_K +#define GERC_K SGERC_K +#define GERV_K SGERV_K +#define GERD_K SGERD_K + +#define SYMV_U SSYMV_U +#define SYMV_L SSYMV_L + +#define SYMV_THREAD_U SSYMV_THREAD_U +#define SYMV_THREAD_L SSYMV_THREAD_L + +#define GEMM_ONCOPY SGEMM_ONCOPY +#define GEMM_OTCOPY SGEMM_OTCOPY +#define GEMM_INCOPY SGEMM_INCOPY +#define GEMM_ITCOPY SGEMM_ITCOPY + +#ifdef UNIT + +#define TRMM_OUNCOPY STRMM_OUNUCOPY +#define TRMM_OUTCOPY STRMM_OUTUCOPY +#define TRMM_OLNCOPY STRMM_OLNUCOPY +#define TRMM_OLTCOPY STRMM_OLTUCOPY +#define TRSM_OUNCOPY STRSM_OUNUCOPY +#define TRSM_OUTCOPY STRSM_OUTUCOPY +#define TRSM_OLNCOPY STRSM_OLNUCOPY +#define TRSM_OLTCOPY STRSM_OLTUCOPY + +#define TRMM_IUNCOPY STRMM_IUNUCOPY +#define TRMM_IUTCOPY STRMM_IUTUCOPY +#define TRMM_ILNCOPY STRMM_ILNUCOPY +#define TRMM_ILTCOPY STRMM_ILTUCOPY +#define TRSM_IUNCOPY STRSM_IUNUCOPY +#define TRSM_IUTCOPY STRSM_IUTUCOPY +#define TRSM_ILNCOPY STRSM_ILNUCOPY +#define TRSM_ILTCOPY STRSM_ILTUCOPY + +#else + +#define TRMM_OUNCOPY STRMM_OUNNCOPY +#define TRMM_OUTCOPY STRMM_OUTNCOPY +#define TRMM_OLNCOPY STRMM_OLNNCOPY +#define TRMM_OLTCOPY STRMM_OLTNCOPY +#define TRSM_OUNCOPY STRSM_OUNNCOPY +#define TRSM_OUTCOPY STRSM_OUTNCOPY +#define TRSM_OLNCOPY STRSM_OLNNCOPY +#define TRSM_OLTCOPY STRSM_OLTNCOPY + +#define TRMM_IUNCOPY STRMM_IUNNCOPY +#define TRMM_IUTCOPY STRMM_IUTNCOPY +#define TRMM_ILNCOPY STRMM_ILNNCOPY +#define TRMM_ILTCOPY STRMM_ILTNCOPY +#define TRSM_IUNCOPY STRSM_IUNNCOPY +#define TRSM_IUTCOPY STRSM_IUTNCOPY +#define TRSM_ILNCOPY STRSM_ILNNCOPY +#define TRSM_ILTCOPY STRSM_ILTNCOPY + +#endif + +#define GEMM_BETA SGEMM_BETA + +#define GEMM_KERNEL_N SGEMM_KERNEL +#define GEMM_KERNEL_L SGEMM_KERNEL +#define GEMM_KERNEL_R SGEMM_KERNEL +#define GEMM_KERNEL_B SGEMM_KERNEL + +#define TRMM_KERNEL_LN STRMM_KERNEL_LN +#define TRMM_KERNEL_LT STRMM_KERNEL_LT +#define TRMM_KERNEL_LR STRMM_KERNEL_LN +#define TRMM_KERNEL_LC STRMM_KERNEL_LT +#define TRMM_KERNEL_RN STRMM_KERNEL_RN +#define TRMM_KERNEL_RT STRMM_KERNEL_RT +#define TRMM_KERNEL_RR STRMM_KERNEL_RN +#define TRMM_KERNEL_RC STRMM_KERNEL_RT + +#define TRSM_KERNEL_LN STRSM_KERNEL_LN +#define TRSM_KERNEL_LT STRSM_KERNEL_LT +#define TRSM_KERNEL_LR STRSM_KERNEL_LN +#define TRSM_KERNEL_LC STRSM_KERNEL_LT +#define TRSM_KERNEL_RN STRSM_KERNEL_RN +#define TRSM_KERNEL_RT STRSM_KERNEL_RT +#define TRSM_KERNEL_RR STRSM_KERNEL_RN +#define TRSM_KERNEL_RC STRSM_KERNEL_RT + +#define SYMM_IUTCOPY SSYMM_IUTCOPY +#define SYMM_ILTCOPY SSYMM_ILTCOPY +#define SYMM_OUTCOPY SSYMM_OUTCOPY +#define SYMM_OLTCOPY SSYMM_OLTCOPY + +#define GEMM_NN SGEMM_NN +#define GEMM_CN SGEMM_TN +#define GEMM_TN SGEMM_TN +#define GEMM_NC SGEMM_NT +#define GEMM_NT SGEMM_NT +#define GEMM_CC SGEMM_TT +#define GEMM_CT SGEMM_TT +#define GEMM_TC SGEMM_TT +#define GEMM_TT SGEMM_TT +#define GEMM_NR SGEMM_NN +#define GEMM_TR SGEMM_TN +#define 
GEMM_CR SGEMM_TN +#define GEMM_RN SGEMM_NN +#define GEMM_RT SGEMM_NT +#define GEMM_RC SGEMM_NT +#define GEMM_RR SGEMM_NN + +#define SYMM_LU SSYMM_LU +#define SYMM_LL SSYMM_LL +#define SYMM_RU SSYMM_RU +#define SYMM_RL SSYMM_RL + +#define HEMM_LU SHEMM_LU +#define HEMM_LL SHEMM_LL +#define HEMM_RU SHEMM_RU +#define HEMM_RL SHEMM_RL + +#define SYRK_UN SSYRK_UN +#define SYRK_UT SSYRK_UT +#define SYRK_LN SSYRK_LN +#define SYRK_LT SSYRK_LT +#define SYRK_UR SSYRK_UN +#define SYRK_UC SSYRK_UT +#define SYRK_LR SSYRK_LN +#define SYRK_LC SSYRK_LT + +#define SYRK_KERNEL_U SSYRK_KERNEL_U +#define SYRK_KERNEL_L SSYRK_KERNEL_L + +#define HERK_UN SSYRK_UN +#define HERK_LN SSYRK_LN +#define HERK_UC SSYRK_UT +#define HERK_LC SSYRK_LT + +#define HER2K_UN SSYR2K_UN +#define HER2K_LN SSYR2K_LN +#define HER2K_UC SSYR2K_UT +#define HER2K_LC SSYR2K_LT + +#define SYR2K_UN SSYR2K_UN +#define SYR2K_UT SSYR2K_UT +#define SYR2K_LN SSYR2K_LN +#define SYR2K_LT SSYR2K_LT +#define SYR2K_UR SSYR2K_UN +#define SYR2K_UC SSYR2K_UT +#define SYR2K_LR SSYR2K_LN +#define SYR2K_LC SSYR2K_LT + +#define SYR2K_KERNEL_U SSYR2K_KERNEL_U +#define SYR2K_KERNEL_L SSYR2K_KERNEL_L + +#define TRMM_LNUU STRMM_LNUU +#define TRMM_LNUN STRMM_LNUN +#define TRMM_LNLU STRMM_LNLU +#define TRMM_LNLN STRMM_LNLN +#define TRMM_LTUU STRMM_LTUU +#define TRMM_LTUN STRMM_LTUN +#define TRMM_LTLU STRMM_LTLU +#define TRMM_LTLN STRMM_LTLN +#define TRMM_LRUU STRMM_LNUU +#define TRMM_LRUN STRMM_LNUN +#define TRMM_LRLU STRMM_LNLU +#define TRMM_LRLN STRMM_LNLN +#define TRMM_LCUU STRMM_LTUU +#define TRMM_LCUN STRMM_LTUN +#define TRMM_LCLU STRMM_LTLU +#define TRMM_LCLN STRMM_LTLN +#define TRMM_RNUU STRMM_RNUU +#define TRMM_RNUN STRMM_RNUN +#define TRMM_RNLU STRMM_RNLU +#define TRMM_RNLN STRMM_RNLN +#define TRMM_RTUU STRMM_RTUU +#define TRMM_RTUN STRMM_RTUN +#define TRMM_RTLU STRMM_RTLU +#define TRMM_RTLN STRMM_RTLN +#define TRMM_RRUU STRMM_RNUU +#define TRMM_RRUN STRMM_RNUN +#define TRMM_RRLU STRMM_RNLU +#define TRMM_RRLN STRMM_RNLN +#define TRMM_RCUU STRMM_RTUU +#define TRMM_RCUN STRMM_RTUN +#define TRMM_RCLU STRMM_RTLU +#define TRMM_RCLN STRMM_RTLN + +#define TRSM_LNUU STRSM_LNUU +#define TRSM_LNUN STRSM_LNUN +#define TRSM_LNLU STRSM_LNLU +#define TRSM_LNLN STRSM_LNLN +#define TRSM_LTUU STRSM_LTUU +#define TRSM_LTUN STRSM_LTUN +#define TRSM_LTLU STRSM_LTLU +#define TRSM_LTLN STRSM_LTLN +#define TRSM_LRUU STRSM_LNUU +#define TRSM_LRUN STRSM_LNUN +#define TRSM_LRLU STRSM_LNLU +#define TRSM_LRLN STRSM_LNLN +#define TRSM_LCUU STRSM_LTUU +#define TRSM_LCUN STRSM_LTUN +#define TRSM_LCLU STRSM_LTLU +#define TRSM_LCLN STRSM_LTLN +#define TRSM_RNUU STRSM_RNUU +#define TRSM_RNUN STRSM_RNUN +#define TRSM_RNLU STRSM_RNLU +#define TRSM_RNLN STRSM_RNLN +#define TRSM_RTUU STRSM_RTUU +#define TRSM_RTUN STRSM_RTUN +#define TRSM_RTLU STRSM_RTLU +#define TRSM_RTLN STRSM_RTLN +#define TRSM_RRUU STRSM_RNUU +#define TRSM_RRUN STRSM_RNUN +#define TRSM_RRLU STRSM_RNLU +#define TRSM_RRLN STRSM_RNLN +#define TRSM_RCUU STRSM_RTUU +#define TRSM_RCUN STRSM_RTUN +#define TRSM_RCLU STRSM_RTLU +#define TRSM_RCLN STRSM_RTLN + +#define GEMM_THREAD_NN SGEMM_THREAD_NN +#define GEMM_THREAD_CN SGEMM_THREAD_TN +#define GEMM_THREAD_TN SGEMM_THREAD_TN +#define GEMM_THREAD_NC SGEMM_THREAD_NT +#define GEMM_THREAD_NT SGEMM_THREAD_NT +#define GEMM_THREAD_CC SGEMM_THREAD_TT +#define GEMM_THREAD_CT SGEMM_THREAD_TT +#define GEMM_THREAD_TC SGEMM_THREAD_TT +#define GEMM_THREAD_TT SGEMM_THREAD_TT +#define GEMM_THREAD_NR SGEMM_THREAD_NN +#define GEMM_THREAD_TR SGEMM_THREAD_TN +#define GEMM_THREAD_CR SGEMM_THREAD_TN 
+#define GEMM_THREAD_RN SGEMM_THREAD_NN +#define GEMM_THREAD_RT SGEMM_THREAD_NT +#define GEMM_THREAD_RC SGEMM_THREAD_NT +#define GEMM_THREAD_RR SGEMM_THREAD_NN + +#define SYMM_THREAD_LU SSYMM_THREAD_LU +#define SYMM_THREAD_LL SSYMM_THREAD_LL +#define SYMM_THREAD_RU SSYMM_THREAD_RU +#define SYMM_THREAD_RL SSYMM_THREAD_RL + +#define HEMM_THREAD_LU SHEMM_THREAD_LU +#define HEMM_THREAD_LL SHEMM_THREAD_LL +#define HEMM_THREAD_RU SHEMM_THREAD_RU +#define HEMM_THREAD_RL SHEMM_THREAD_RL + +#define SYRK_THREAD_UN SSYRK_THREAD_UN +#define SYRK_THREAD_UT SSYRK_THREAD_UT +#define SYRK_THREAD_LN SSYRK_THREAD_LN +#define SYRK_THREAD_LT SSYRK_THREAD_LT +#define SYRK_THREAD_UR SSYRK_THREAD_UR +#define SYRK_THREAD_UC SSYRK_THREAD_UC +#define SYRK_THREAD_LR SSYRK_THREAD_LN +#define SYRK_THREAD_LC SSYRK_THREAD_LT + +#define HERK_THREAD_UN SSYRK_THREAD_UN +#define HERK_THREAD_UT SSYRK_THREAD_UT +#define HERK_THREAD_LN SSYRK_THREAD_LN +#define HERK_THREAD_LT SSYRK_THREAD_LT +#define HERK_THREAD_UR SSYRK_THREAD_UR +#define HERK_THREAD_UC SSYRK_THREAD_UC +#define HERK_THREAD_LR SSYRK_THREAD_LN +#define HERK_THREAD_LC SSYRK_THREAD_LT + +#endif +#else +#ifdef XDOUBLE + +#define AMAX_K XAMAX_K +#define AMIN_K XAMIN_K +#define MAX_K XMAX_K +#define MIN_K XMIN_K +#define IAMAX_K IXAMAX_K +#define IAMIN_K IXAMIN_K +#define IMAX_K IXMAX_K +#define IMIN_K IXMIN_K +#define ASUM_K XASUM_K +#define AXPYU_K XAXPYU_K +#define AXPYC_K XAXPYC_K +#define COPY_K XCOPY_K +#define DOTU_K XDOTU_K +#define DOTC_K XDOTC_K +#define NRM2_K XNRM2_K +#define SCAL_K XSCAL_K +#define SWAP_K XSWAP_K +#define ROT_K XROT_K + +#define GEMV_N XGEMV_N +#define GEMV_T XGEMV_T +#define GEMV_R XGEMV_R +#define GEMV_C XGEMV_C +#define GEMV_O XGEMV_O +#define GEMV_U XGEMV_U +#define GEMV_S XGEMV_S +#define GEMV_D XGEMV_D + +#define GERU_K XGERU_K +#define GERC_K XGERC_K +#define GERV_K XGERV_K +#define GERD_K XGERD_K + +#define SYMV_U XSYMV_U +#define SYMV_L XSYMV_L +#define HEMV_U XHEMV_U +#define HEMV_L XHEMV_L +#define HEMV_V XHEMV_V +#define HEMV_M XHEMV_M + +#define SYMV_THREAD_U XSYMV_THREAD_U +#define SYMV_THREAD_L XSYMV_THREAD_L +#define HEMV_THREAD_U XHEMV_THREAD_U +#define HEMV_THREAD_L XHEMV_THREAD_L +#define HEMV_THREAD_V XHEMV_THREAD_V +#define HEMV_THREAD_M XHEMV_THREAD_M + +#define GEMM_ONCOPY XGEMM_ONCOPY +#define GEMM_OTCOPY XGEMM_OTCOPY +#define GEMM_INCOPY XGEMM_INCOPY +#define GEMM_ITCOPY XGEMM_ITCOPY + +#define GEMM3M_ONCOPYB XGEMM3M_ONCOPYB +#define GEMM3M_ONCOPYR XGEMM3M_ONCOPYR +#define GEMM3M_ONCOPYI XGEMM3M_ONCOPYI +#define GEMM3M_OTCOPYB XGEMM3M_OTCOPYB +#define GEMM3M_OTCOPYR XGEMM3M_OTCOPYR +#define GEMM3M_OTCOPYI XGEMM3M_OTCOPYI +#define GEMM3M_INCOPYB XGEMM3M_INCOPYB +#define GEMM3M_INCOPYR XGEMM3M_INCOPYR +#define GEMM3M_INCOPYI XGEMM3M_INCOPYI +#define GEMM3M_ITCOPYB XGEMM3M_ITCOPYB +#define GEMM3M_ITCOPYR XGEMM3M_ITCOPYR +#define GEMM3M_ITCOPYI XGEMM3M_ITCOPYI + +#ifdef UNIT + +#define TRMM_OUNCOPY XTRMM_OUNUCOPY +#define TRMM_OUTCOPY XTRMM_OUTUCOPY +#define TRMM_OLNCOPY XTRMM_OLNUCOPY +#define TRMM_OLTCOPY XTRMM_OLTUCOPY +#define TRSM_OUNCOPY XTRSM_OUNUCOPY +#define TRSM_OUTCOPY XTRSM_OUTUCOPY +#define TRSM_OLNCOPY XTRSM_OLNUCOPY +#define TRSM_OLTCOPY XTRSM_OLTUCOPY + +#define TRMM_IUNCOPY XTRMM_IUNUCOPY +#define TRMM_IUTCOPY XTRMM_IUTUCOPY +#define TRMM_ILNCOPY XTRMM_ILNUCOPY +#define TRMM_ILTCOPY XTRMM_ILTUCOPY +#define TRSM_IUNCOPY XTRSM_IUNUCOPY +#define TRSM_IUTCOPY XTRSM_IUTUCOPY +#define TRSM_ILNCOPY XTRSM_ILNUCOPY +#define TRSM_ILTCOPY XTRSM_ILTUCOPY + +#else + +#define TRMM_OUNCOPY XTRMM_OUNNCOPY +#define 
TRMM_OUTCOPY XTRMM_OUTNCOPY +#define TRMM_OLNCOPY XTRMM_OLNNCOPY +#define TRMM_OLTCOPY XTRMM_OLTNCOPY +#define TRSM_OUNCOPY XTRSM_OUNNCOPY +#define TRSM_OUTCOPY XTRSM_OUTNCOPY +#define TRSM_OLNCOPY XTRSM_OLNNCOPY +#define TRSM_OLTCOPY XTRSM_OLTNCOPY + +#define TRMM_IUNCOPY XTRMM_IUNNCOPY +#define TRMM_IUTCOPY XTRMM_IUTNCOPY +#define TRMM_ILNCOPY XTRMM_ILNNCOPY +#define TRMM_ILTCOPY XTRMM_ILTNCOPY +#define TRSM_IUNCOPY XTRSM_IUNNCOPY +#define TRSM_IUTCOPY XTRSM_IUTNCOPY +#define TRSM_ILNCOPY XTRSM_ILNNCOPY +#define TRSM_ILTCOPY XTRSM_ILTNCOPY + +#endif + +#define SYMM3M_ILCOPYB XSYMM3M_ILCOPYB +#define SYMM3M_IUCOPYB XSYMM3M_IUCOPYB +#define SYMM3M_ILCOPYR XSYMM3M_ILCOPYR +#define SYMM3M_IUCOPYR XSYMM3M_IUCOPYR +#define SYMM3M_ILCOPYI XSYMM3M_ILCOPYI +#define SYMM3M_IUCOPYI XSYMM3M_IUCOPYI + +#define SYMM3M_OLCOPYB XSYMM3M_OLCOPYB +#define SYMM3M_OUCOPYB XSYMM3M_OUCOPYB +#define SYMM3M_OLCOPYR XSYMM3M_OLCOPYR +#define SYMM3M_OUCOPYR XSYMM3M_OUCOPYR +#define SYMM3M_OLCOPYI XSYMM3M_OLCOPYI +#define SYMM3M_OUCOPYI XSYMM3M_OUCOPYI + +#define HEMM3M_ILCOPYB XHEMM3M_ILCOPYB +#define HEMM3M_IUCOPYB XHEMM3M_IUCOPYB +#define HEMM3M_ILCOPYR XHEMM3M_ILCOPYR +#define HEMM3M_IUCOPYR XHEMM3M_IUCOPYR +#define HEMM3M_ILCOPYI XHEMM3M_ILCOPYI +#define HEMM3M_IUCOPYI XHEMM3M_IUCOPYI + +#define HEMM3M_OLCOPYB XHEMM3M_OLCOPYB +#define HEMM3M_OUCOPYB XHEMM3M_OUCOPYB +#define HEMM3M_OLCOPYR XHEMM3M_OLCOPYR +#define HEMM3M_OUCOPYR XHEMM3M_OUCOPYR +#define HEMM3M_OLCOPYI XHEMM3M_OLCOPYI +#define HEMM3M_OUCOPYI XHEMM3M_OUCOPYI + +#define GEMM_BETA XGEMM_BETA + +#define GEMM_KERNEL_N XGEMM_KERNEL_N +#define GEMM_KERNEL_L XGEMM_KERNEL_L +#define GEMM_KERNEL_R XGEMM_KERNEL_R +#define GEMM_KERNEL_B XGEMM_KERNEL_B + +#define GEMM3M_KERNEL XGEMM3M_KERNEL + +#define TRMM_KERNEL_LN XTRMM_KERNEL_LN +#define TRMM_KERNEL_LT XTRMM_KERNEL_LT +#define TRMM_KERNEL_LR XTRMM_KERNEL_LR +#define TRMM_KERNEL_LC XTRMM_KERNEL_LC +#define TRMM_KERNEL_RN XTRMM_KERNEL_RN +#define TRMM_KERNEL_RT XTRMM_KERNEL_RT +#define TRMM_KERNEL_RR XTRMM_KERNEL_RR +#define TRMM_KERNEL_RC XTRMM_KERNEL_RC + +#define TRSM_KERNEL_LN XTRSM_KERNEL_LN +#define TRSM_KERNEL_LT XTRSM_KERNEL_LT +#define TRSM_KERNEL_LR XTRSM_KERNEL_LR +#define TRSM_KERNEL_LC XTRSM_KERNEL_LC +#define TRSM_KERNEL_RN XTRSM_KERNEL_RN +#define TRSM_KERNEL_RT XTRSM_KERNEL_RT +#define TRSM_KERNEL_RR XTRSM_KERNEL_RR +#define TRSM_KERNEL_RC XTRSM_KERNEL_RC + +#define GEMM_NN XGEMM_NN +#define GEMM_CN XGEMM_CN +#define GEMM_TN XGEMM_TN +#define GEMM_NC XGEMM_NC +#define GEMM_NT XGEMM_NT +#define GEMM_CC XGEMM_CC +#define GEMM_CT XGEMM_CT +#define GEMM_TC XGEMM_TC +#define GEMM_TT XGEMM_TT +#define GEMM_NR XGEMM_NR +#define GEMM_TR XGEMM_TR +#define GEMM_CR XGEMM_CR +#define GEMM_RN XGEMM_RN +#define GEMM_RT XGEMM_RT +#define GEMM_RC XGEMM_RC +#define GEMM_RR XGEMM_RR + +#define SYMM_LU XSYMM_LU +#define SYMM_LL XSYMM_LL +#define SYMM_RU XSYMM_RU +#define SYMM_RL XSYMM_RL + +#define HEMM_LU XHEMM_LU +#define HEMM_LL XHEMM_LL +#define HEMM_RU XHEMM_RU +#define HEMM_RL XHEMM_RL + +#define HEMM_IUTCOPY XHEMM_IUTCOPY +#define HEMM_ILTCOPY XHEMM_ILTCOPY +#define HEMM_OUTCOPY XHEMM_OUTCOPY +#define HEMM_OLTCOPY XHEMM_OLTCOPY + +#define SYRK_UN XSYRK_UN +#define SYRK_UT XSYRK_UT +#define SYRK_LN XSYRK_LN +#define SYRK_LT XSYRK_LT +#define SYRK_UR XSYRK_UN +#define SYRK_UC XSYRK_UT +#define SYRK_LR XSYRK_LN +#define SYRK_LC XSYRK_LT + +#define SYRK_KERNEL_U XSYRK_KERNEL_U +#define SYRK_KERNEL_L XSYRK_KERNEL_L + +#define HERK_UN XHERK_UN +#define HERK_LN XHERK_LN +#define HERK_UC XHERK_UC +#define 
HERK_LC XHERK_LC + +#define HER2K_UN XHER2K_UN +#define HER2K_LN XHER2K_LN +#define HER2K_UC XHER2K_UC +#define HER2K_LC XHER2K_LC + +#define SYR2K_UN XSYR2K_UN +#define SYR2K_UT XSYR2K_UT +#define SYR2K_LN XSYR2K_LN +#define SYR2K_LT XSYR2K_LT +#define SYR2K_UR XSYR2K_UN +#define SYR2K_UC XSYR2K_UT +#define SYR2K_LR XSYR2K_LN +#define SYR2K_LC XSYR2K_LT + +#define SYR2K_KERNEL_U XSYR2K_KERNEL_U +#define SYR2K_KERNEL_L XSYR2K_KERNEL_L + +#define TRMM_LNUU XTRMM_LNUU +#define TRMM_LNUN XTRMM_LNUN +#define TRMM_LNLU XTRMM_LNLU +#define TRMM_LNLN XTRMM_LNLN +#define TRMM_LTUU XTRMM_LTUU +#define TRMM_LTUN XTRMM_LTUN +#define TRMM_LTLU XTRMM_LTLU +#define TRMM_LTLN XTRMM_LTLN +#define TRMM_LRUU XTRMM_LRUU +#define TRMM_LRUN XTRMM_LRUN +#define TRMM_LRLU XTRMM_LRLU +#define TRMM_LRLN XTRMM_LRLN +#define TRMM_LCUU XTRMM_LCUU +#define TRMM_LCUN XTRMM_LCUN +#define TRMM_LCLU XTRMM_LCLU +#define TRMM_LCLN XTRMM_LCLN +#define TRMM_RNUU XTRMM_RNUU +#define TRMM_RNUN XTRMM_RNUN +#define TRMM_RNLU XTRMM_RNLU +#define TRMM_RNLN XTRMM_RNLN +#define TRMM_RTUU XTRMM_RTUU +#define TRMM_RTUN XTRMM_RTUN +#define TRMM_RTLU XTRMM_RTLU +#define TRMM_RTLN XTRMM_RTLN +#define TRMM_RRUU XTRMM_RRUU +#define TRMM_RRUN XTRMM_RRUN +#define TRMM_RRLU XTRMM_RRLU +#define TRMM_RRLN XTRMM_RRLN +#define TRMM_RCUU XTRMM_RCUU +#define TRMM_RCUN XTRMM_RCUN +#define TRMM_RCLU XTRMM_RCLU +#define TRMM_RCLN XTRMM_RCLN + +#define TRSM_LNUU XTRSM_LNUU +#define TRSM_LNUN XTRSM_LNUN +#define TRSM_LNLU XTRSM_LNLU +#define TRSM_LNLN XTRSM_LNLN +#define TRSM_LTUU XTRSM_LTUU +#define TRSM_LTUN XTRSM_LTUN +#define TRSM_LTLU XTRSM_LTLU +#define TRSM_LTLN XTRSM_LTLN +#define TRSM_LRUU XTRSM_LRUU +#define TRSM_LRUN XTRSM_LRUN +#define TRSM_LRLU XTRSM_LRLU +#define TRSM_LRLN XTRSM_LRLN +#define TRSM_LCUU XTRSM_LCUU +#define TRSM_LCUN XTRSM_LCUN +#define TRSM_LCLU XTRSM_LCLU +#define TRSM_LCLN XTRSM_LCLN +#define TRSM_RNUU XTRSM_RNUU +#define TRSM_RNUN XTRSM_RNUN +#define TRSM_RNLU XTRSM_RNLU +#define TRSM_RNLN XTRSM_RNLN +#define TRSM_RTUU XTRSM_RTUU +#define TRSM_RTUN XTRSM_RTUN +#define TRSM_RTLU XTRSM_RTLU +#define TRSM_RTLN XTRSM_RTLN +#define TRSM_RRUU XTRSM_RRUU +#define TRSM_RRUN XTRSM_RRUN +#define TRSM_RRLU XTRSM_RRLU +#define TRSM_RRLN XTRSM_RRLN +#define TRSM_RCUU XTRSM_RCUU +#define TRSM_RCUN XTRSM_RCUN +#define TRSM_RCLU XTRSM_RCLU +#define TRSM_RCLN XTRSM_RCLN + + +#define GEMM_THREAD_NN XGEMM_THREAD_NN +#define GEMM_THREAD_CN XGEMM_THREAD_CN +#define GEMM_THREAD_TN XGEMM_THREAD_TN +#define GEMM_THREAD_NC XGEMM_THREAD_NC +#define GEMM_THREAD_NT XGEMM_THREAD_NT +#define GEMM_THREAD_CC XGEMM_THREAD_CC +#define GEMM_THREAD_CT XGEMM_THREAD_CT +#define GEMM_THREAD_TC XGEMM_THREAD_TC +#define GEMM_THREAD_TT XGEMM_THREAD_TT +#define GEMM_THREAD_NR XGEMM_THREAD_NR +#define GEMM_THREAD_TR XGEMM_THREAD_TR +#define GEMM_THREAD_CR XGEMM_THREAD_CR +#define GEMM_THREAD_RN XGEMM_THREAD_RN +#define GEMM_THREAD_RT XGEMM_THREAD_RT +#define GEMM_THREAD_RC XGEMM_THREAD_RC +#define GEMM_THREAD_RR XGEMM_THREAD_RR + +#define SYMM_THREAD_LU XSYMM_THREAD_LU +#define SYMM_THREAD_LL XSYMM_THREAD_LL +#define SYMM_THREAD_RU XSYMM_THREAD_RU +#define SYMM_THREAD_RL XSYMM_THREAD_RL + +#define HEMM_THREAD_LU XHEMM_THREAD_LU +#define HEMM_THREAD_LL XHEMM_THREAD_LL +#define HEMM_THREAD_RU XHEMM_THREAD_RU +#define HEMM_THREAD_RL XHEMM_THREAD_RL + +#define SYRK_THREAD_UN XSYRK_THREAD_UN +#define SYRK_THREAD_UT XSYRK_THREAD_UT +#define SYRK_THREAD_LN XSYRK_THREAD_LN +#define SYRK_THREAD_LT XSYRK_THREAD_LT +#define SYRK_THREAD_UR XSYRK_THREAD_UR +#define 
SYRK_THREAD_UC XSYRK_THREAD_UC +#define SYRK_THREAD_LR XSYRK_THREAD_LR +#define SYRK_THREAD_LC XSYRK_THREAD_LC + +#define HERK_THREAD_UN XHERK_THREAD_UN +#define HERK_THREAD_UT XHERK_THREAD_UT +#define HERK_THREAD_LN XHERK_THREAD_LN +#define HERK_THREAD_LT XHERK_THREAD_LT +#define HERK_THREAD_UR XHERK_THREAD_UR +#define HERK_THREAD_UC XHERK_THREAD_UC +#define HERK_THREAD_LR XHERK_THREAD_LR +#define HERK_THREAD_LC XHERK_THREAD_LC + +#define GEMM3M_NN XGEMM3M_NN +#define GEMM3M_CN XGEMM3M_CN +#define GEMM3M_TN XGEMM3M_TN +#define GEMM3M_NC XGEMM3M_NC +#define GEMM3M_NT XGEMM3M_NT +#define GEMM3M_CC XGEMM3M_CC +#define GEMM3M_CT XGEMM3M_CT +#define GEMM3M_TC XGEMM3M_TC +#define GEMM3M_TT XGEMM3M_TT +#define GEMM3M_NR XGEMM3M_NR +#define GEMM3M_TR XGEMM3M_TR +#define GEMM3M_CR XGEMM3M_CR +#define GEMM3M_RN XGEMM3M_RN +#define GEMM3M_RT XGEMM3M_RT +#define GEMM3M_RC XGEMM3M_RC +#define GEMM3M_RR XGEMM3M_RR + +#define GEMM3M_THREAD_NN XGEMM3M_THREAD_NN +#define GEMM3M_THREAD_CN XGEMM3M_THREAD_CN +#define GEMM3M_THREAD_TN XGEMM3M_THREAD_TN +#define GEMM3M_THREAD_NC XGEMM3M_THREAD_NC +#define GEMM3M_THREAD_NT XGEMM3M_THREAD_NT +#define GEMM3M_THREAD_CC XGEMM3M_THREAD_CC +#define GEMM3M_THREAD_CT XGEMM3M_THREAD_CT +#define GEMM3M_THREAD_TC XGEMM3M_THREAD_TC +#define GEMM3M_THREAD_TT XGEMM3M_THREAD_TT +#define GEMM3M_THREAD_NR XGEMM3M_THREAD_NR +#define GEMM3M_THREAD_TR XGEMM3M_THREAD_TR +#define GEMM3M_THREAD_CR XGEMM3M_THREAD_CR +#define GEMM3M_THREAD_RN XGEMM3M_THREAD_RN +#define GEMM3M_THREAD_RT XGEMM3M_THREAD_RT +#define GEMM3M_THREAD_RC XGEMM3M_THREAD_RC +#define GEMM3M_THREAD_RR XGEMM3M_THREAD_RR + +#define SYMM3M_LU XSYMM3M_LU +#define SYMM3M_LL XSYMM3M_LL +#define SYMM3M_RU XSYMM3M_RU +#define SYMM3M_RL XSYMM3M_RL + +#define SYMM3M_THREAD_LU XSYMM3M_THREAD_LU +#define SYMM3M_THREAD_LL XSYMM3M_THREAD_LL +#define SYMM3M_THREAD_RU XSYMM3M_THREAD_RU +#define SYMM3M_THREAD_RL XSYMM3M_THREAD_RL + +#define HEMM3M_LU XHEMM3M_LU +#define HEMM3M_LL XHEMM3M_LL +#define HEMM3M_RU XHEMM3M_RU +#define HEMM3M_RL XHEMM3M_RL + +#define HEMM3M_THREAD_LU XHEMM3M_THREAD_LU +#define HEMM3M_THREAD_LL XHEMM3M_THREAD_LL +#define HEMM3M_THREAD_RU XHEMM3M_THREAD_RU +#define HEMM3M_THREAD_RL XHEMM3M_THREAD_RL + +#define SYMM_IUTCOPY XSYMM_IUTCOPY +#define SYMM_ILTCOPY XSYMM_ILTCOPY +#define SYMM_OUTCOPY XSYMM_OUTCOPY +#define SYMM_OLTCOPY XSYMM_OLTCOPY + +#elif defined(DOUBLE) + +#define AMAX_K ZAMAX_K +#define AMIN_K ZAMIN_K +#define MAX_K ZMAX_K +#define MIN_K ZMIN_K +#define IAMAX_K IZAMAX_K +#define IAMIN_K IZAMIN_K +#define IMAX_K IZMAX_K +#define IMIN_K IZMIN_K +#define ASUM_K ZASUM_K +#define AXPYU_K ZAXPYU_K +#define AXPYC_K ZAXPYC_K +#define COPY_K ZCOPY_K +#define DOTU_K ZDOTU_K +#define DOTC_K ZDOTC_K +#define NRM2_K ZNRM2_K +#define SCAL_K ZSCAL_K +#define SWAP_K ZSWAP_K +#define ROT_K ZROT_K + +#define GEMV_N ZGEMV_N +#define GEMV_T ZGEMV_T +#define GEMV_R ZGEMV_R +#define GEMV_C ZGEMV_C +#define GEMV_O ZGEMV_O +#define GEMV_U ZGEMV_U +#define GEMV_S ZGEMV_S +#define GEMV_D ZGEMV_D + +#define GERU_K ZGERU_K +#define GERC_K ZGERC_K +#define GERV_K ZGERV_K +#define GERD_K ZGERD_K + +#define SYMV_U ZSYMV_U +#define SYMV_L ZSYMV_L +#define HEMV_U ZHEMV_U +#define HEMV_L ZHEMV_L +#define HEMV_V ZHEMV_V +#define HEMV_M ZHEMV_M + +#define SYMV_THREAD_U ZSYMV_THREAD_U +#define SYMV_THREAD_L ZSYMV_THREAD_L +#define HEMV_THREAD_U ZHEMV_THREAD_U +#define HEMV_THREAD_L ZHEMV_THREAD_L +#define HEMV_THREAD_V ZHEMV_THREAD_V +#define HEMV_THREAD_M ZHEMV_THREAD_M + +#define GEMM_ONCOPY ZGEMM_ONCOPY +#define GEMM_OTCOPY 
ZGEMM_OTCOPY +#define GEMM_INCOPY ZGEMM_INCOPY +#define GEMM_ITCOPY ZGEMM_ITCOPY + +#define GEMM3M_ONCOPYB ZGEMM3M_ONCOPYB +#define GEMM3M_ONCOPYR ZGEMM3M_ONCOPYR +#define GEMM3M_ONCOPYI ZGEMM3M_ONCOPYI +#define GEMM3M_OTCOPYB ZGEMM3M_OTCOPYB +#define GEMM3M_OTCOPYR ZGEMM3M_OTCOPYR +#define GEMM3M_OTCOPYI ZGEMM3M_OTCOPYI +#define GEMM3M_INCOPYB ZGEMM3M_INCOPYB +#define GEMM3M_INCOPYR ZGEMM3M_INCOPYR +#define GEMM3M_INCOPYI ZGEMM3M_INCOPYI +#define GEMM3M_ITCOPYB ZGEMM3M_ITCOPYB +#define GEMM3M_ITCOPYR ZGEMM3M_ITCOPYR +#define GEMM3M_ITCOPYI ZGEMM3M_ITCOPYI + +#ifdef UNIT + +#define TRMM_OUNCOPY ZTRMM_OUNUCOPY +#define TRMM_OUTCOPY ZTRMM_OUTUCOPY +#define TRMM_OLNCOPY ZTRMM_OLNUCOPY +#define TRMM_OLTCOPY ZTRMM_OLTUCOPY +#define TRSM_OUNCOPY ZTRSM_OUNUCOPY +#define TRSM_OUTCOPY ZTRSM_OUTUCOPY +#define TRSM_OLNCOPY ZTRSM_OLNUCOPY +#define TRSM_OLTCOPY ZTRSM_OLTUCOPY + +#define TRMM_IUNCOPY ZTRMM_IUNUCOPY +#define TRMM_IUTCOPY ZTRMM_IUTUCOPY +#define TRMM_ILNCOPY ZTRMM_ILNUCOPY +#define TRMM_ILTCOPY ZTRMM_ILTUCOPY +#define TRSM_IUNCOPY ZTRSM_IUNUCOPY +#define TRSM_IUTCOPY ZTRSM_IUTUCOPY +#define TRSM_ILNCOPY ZTRSM_ILNUCOPY +#define TRSM_ILTCOPY ZTRSM_ILTUCOPY + +#else + +#define TRMM_OUNCOPY ZTRMM_OUNNCOPY +#define TRMM_OUTCOPY ZTRMM_OUTNCOPY +#define TRMM_OLNCOPY ZTRMM_OLNNCOPY +#define TRMM_OLTCOPY ZTRMM_OLTNCOPY +#define TRSM_OUNCOPY ZTRSM_OUNNCOPY +#define TRSM_OUTCOPY ZTRSM_OUTNCOPY +#define TRSM_OLNCOPY ZTRSM_OLNNCOPY +#define TRSM_OLTCOPY ZTRSM_OLTNCOPY + +#define TRMM_IUNCOPY ZTRMM_IUNNCOPY +#define TRMM_IUTCOPY ZTRMM_IUTNCOPY +#define TRMM_ILNCOPY ZTRMM_ILNNCOPY +#define TRMM_ILTCOPY ZTRMM_ILTNCOPY +#define TRSM_IUNCOPY ZTRSM_IUNNCOPY +#define TRSM_IUTCOPY ZTRSM_IUTNCOPY +#define TRSM_ILNCOPY ZTRSM_ILNNCOPY +#define TRSM_ILTCOPY ZTRSM_ILTNCOPY + +#endif + +#define SYMM3M_ILCOPYB ZSYMM3M_ILCOPYB +#define SYMM3M_IUCOPYB ZSYMM3M_IUCOPYB +#define SYMM3M_ILCOPYR ZSYMM3M_ILCOPYR +#define SYMM3M_IUCOPYR ZSYMM3M_IUCOPYR +#define SYMM3M_ILCOPYI ZSYMM3M_ILCOPYI +#define SYMM3M_IUCOPYI ZSYMM3M_IUCOPYI + +#define SYMM3M_OLCOPYB ZSYMM3M_OLCOPYB +#define SYMM3M_OUCOPYB ZSYMM3M_OUCOPYB +#define SYMM3M_OLCOPYR ZSYMM3M_OLCOPYR +#define SYMM3M_OUCOPYR ZSYMM3M_OUCOPYR +#define SYMM3M_OLCOPYI ZSYMM3M_OLCOPYI +#define SYMM3M_OUCOPYI ZSYMM3M_OUCOPYI + +#define HEMM3M_ILCOPYB ZHEMM3M_ILCOPYB +#define HEMM3M_IUCOPYB ZHEMM3M_IUCOPYB +#define HEMM3M_ILCOPYR ZHEMM3M_ILCOPYR +#define HEMM3M_IUCOPYR ZHEMM3M_IUCOPYR +#define HEMM3M_ILCOPYI ZHEMM3M_ILCOPYI +#define HEMM3M_IUCOPYI ZHEMM3M_IUCOPYI + +#define HEMM3M_OLCOPYB ZHEMM3M_OLCOPYB +#define HEMM3M_OUCOPYB ZHEMM3M_OUCOPYB +#define HEMM3M_OLCOPYR ZHEMM3M_OLCOPYR +#define HEMM3M_OUCOPYR ZHEMM3M_OUCOPYR +#define HEMM3M_OLCOPYI ZHEMM3M_OLCOPYI +#define HEMM3M_OUCOPYI ZHEMM3M_OUCOPYI + +#define GEMM_BETA ZGEMM_BETA + +#define GEMM_KERNEL_N ZGEMM_KERNEL_N +#define GEMM_KERNEL_L ZGEMM_KERNEL_L +#define GEMM_KERNEL_R ZGEMM_KERNEL_R +#define GEMM_KERNEL_B ZGEMM_KERNEL_B + +#define GEMM3M_KERNEL ZGEMM3M_KERNEL + +#define TRMM_KERNEL_LN ZTRMM_KERNEL_LN +#define TRMM_KERNEL_LT ZTRMM_KERNEL_LT +#define TRMM_KERNEL_LR ZTRMM_KERNEL_LR +#define TRMM_KERNEL_LC ZTRMM_KERNEL_LC +#define TRMM_KERNEL_RN ZTRMM_KERNEL_RN +#define TRMM_KERNEL_RT ZTRMM_KERNEL_RT +#define TRMM_KERNEL_RR ZTRMM_KERNEL_RR +#define TRMM_KERNEL_RC ZTRMM_KERNEL_RC + +#define TRSM_KERNEL_LN ZTRSM_KERNEL_LN +#define TRSM_KERNEL_LT ZTRSM_KERNEL_LT +#define TRSM_KERNEL_LR ZTRSM_KERNEL_LR +#define TRSM_KERNEL_LC ZTRSM_KERNEL_LC +#define TRSM_KERNEL_RN ZTRSM_KERNEL_RN +#define TRSM_KERNEL_RT ZTRSM_KERNEL_RT 
+#define TRSM_KERNEL_RR ZTRSM_KERNEL_RR +#define TRSM_KERNEL_RC ZTRSM_KERNEL_RC + +#define GEMM_NN ZGEMM_NN +#define GEMM_CN ZGEMM_CN +#define GEMM_TN ZGEMM_TN +#define GEMM_NC ZGEMM_NC +#define GEMM_NT ZGEMM_NT +#define GEMM_CC ZGEMM_CC +#define GEMM_CT ZGEMM_CT +#define GEMM_TC ZGEMM_TC +#define GEMM_TT ZGEMM_TT +#define GEMM_NR ZGEMM_NR +#define GEMM_TR ZGEMM_TR +#define GEMM_CR ZGEMM_CR +#define GEMM_RN ZGEMM_RN +#define GEMM_RT ZGEMM_RT +#define GEMM_RC ZGEMM_RC +#define GEMM_RR ZGEMM_RR + +#define SYMM_LU ZSYMM_LU +#define SYMM_LL ZSYMM_LL +#define SYMM_RU ZSYMM_RU +#define SYMM_RL ZSYMM_RL + +#define HEMM_LU ZHEMM_LU +#define HEMM_LL ZHEMM_LL +#define HEMM_RU ZHEMM_RU +#define HEMM_RL ZHEMM_RL + +#define HEMM_IUTCOPY ZHEMM_IUTCOPY +#define HEMM_ILTCOPY ZHEMM_ILTCOPY +#define HEMM_OUTCOPY ZHEMM_OUTCOPY +#define HEMM_OLTCOPY ZHEMM_OLTCOPY + +#define SYRK_UN ZSYRK_UN +#define SYRK_UT ZSYRK_UT +#define SYRK_LN ZSYRK_LN +#define SYRK_LT ZSYRK_LT +#define SYRK_UR ZSYRK_UN +#define SYRK_UC ZSYRK_UT +#define SYRK_LR ZSYRK_LN +#define SYRK_LC ZSYRK_LT + +#define SYRK_KERNEL_U ZSYRK_KERNEL_U +#define SYRK_KERNEL_L ZSYRK_KERNEL_L + +#define HERK_UN ZHERK_UN +#define HERK_LN ZHERK_LN +#define HERK_UC ZHERK_UC +#define HERK_LC ZHERK_LC + +#define HER2K_UN ZHER2K_UN +#define HER2K_LN ZHER2K_LN +#define HER2K_UC ZHER2K_UC +#define HER2K_LC ZHER2K_LC + +#define SYR2K_UN ZSYR2K_UN +#define SYR2K_UT ZSYR2K_UT +#define SYR2K_LN ZSYR2K_LN +#define SYR2K_LT ZSYR2K_LT +#define SYR2K_UR ZSYR2K_UN +#define SYR2K_UC ZSYR2K_UT +#define SYR2K_LR ZSYR2K_LN +#define SYR2K_LC ZSYR2K_LT + +#define SYR2K_KERNEL_U ZSYR2K_KERNEL_U +#define SYR2K_KERNEL_L ZSYR2K_KERNEL_L + +#define TRMM_LNUU ZTRMM_LNUU +#define TRMM_LNUN ZTRMM_LNUN +#define TRMM_LNLU ZTRMM_LNLU +#define TRMM_LNLN ZTRMM_LNLN +#define TRMM_LTUU ZTRMM_LTUU +#define TRMM_LTUN ZTRMM_LTUN +#define TRMM_LTLU ZTRMM_LTLU +#define TRMM_LTLN ZTRMM_LTLN +#define TRMM_LRUU ZTRMM_LRUU +#define TRMM_LRUN ZTRMM_LRUN +#define TRMM_LRLU ZTRMM_LRLU +#define TRMM_LRLN ZTRMM_LRLN +#define TRMM_LCUU ZTRMM_LCUU +#define TRMM_LCUN ZTRMM_LCUN +#define TRMM_LCLU ZTRMM_LCLU +#define TRMM_LCLN ZTRMM_LCLN +#define TRMM_RNUU ZTRMM_RNUU +#define TRMM_RNUN ZTRMM_RNUN +#define TRMM_RNLU ZTRMM_RNLU +#define TRMM_RNLN ZTRMM_RNLN +#define TRMM_RTUU ZTRMM_RTUU +#define TRMM_RTUN ZTRMM_RTUN +#define TRMM_RTLU ZTRMM_RTLU +#define TRMM_RTLN ZTRMM_RTLN +#define TRMM_RRUU ZTRMM_RRUU +#define TRMM_RRUN ZTRMM_RRUN +#define TRMM_RRLU ZTRMM_RRLU +#define TRMM_RRLN ZTRMM_RRLN +#define TRMM_RCUU ZTRMM_RCUU +#define TRMM_RCUN ZTRMM_RCUN +#define TRMM_RCLU ZTRMM_RCLU +#define TRMM_RCLN ZTRMM_RCLN + +#define TRSM_LNUU ZTRSM_LNUU +#define TRSM_LNUN ZTRSM_LNUN +#define TRSM_LNLU ZTRSM_LNLU +#define TRSM_LNLN ZTRSM_LNLN +#define TRSM_LTUU ZTRSM_LTUU +#define TRSM_LTUN ZTRSM_LTUN +#define TRSM_LTLU ZTRSM_LTLU +#define TRSM_LTLN ZTRSM_LTLN +#define TRSM_LRUU ZTRSM_LRUU +#define TRSM_LRUN ZTRSM_LRUN +#define TRSM_LRLU ZTRSM_LRLU +#define TRSM_LRLN ZTRSM_LRLN +#define TRSM_LCUU ZTRSM_LCUU +#define TRSM_LCUN ZTRSM_LCUN +#define TRSM_LCLU ZTRSM_LCLU +#define TRSM_LCLN ZTRSM_LCLN +#define TRSM_RNUU ZTRSM_RNUU +#define TRSM_RNUN ZTRSM_RNUN +#define TRSM_RNLU ZTRSM_RNLU +#define TRSM_RNLN ZTRSM_RNLN +#define TRSM_RTUU ZTRSM_RTUU +#define TRSM_RTUN ZTRSM_RTUN +#define TRSM_RTLU ZTRSM_RTLU +#define TRSM_RTLN ZTRSM_RTLN +#define TRSM_RRUU ZTRSM_RRUU +#define TRSM_RRUN ZTRSM_RRUN +#define TRSM_RRLU ZTRSM_RRLU +#define TRSM_RRLN ZTRSM_RRLN +#define TRSM_RCUU ZTRSM_RCUU +#define TRSM_RCUN ZTRSM_RCUN +#define TRSM_RCLU 
ZTRSM_RCLU +#define TRSM_RCLN ZTRSM_RCLN + + +#define GEMM_THREAD_NN ZGEMM_THREAD_NN +#define GEMM_THREAD_CN ZGEMM_THREAD_CN +#define GEMM_THREAD_TN ZGEMM_THREAD_TN +#define GEMM_THREAD_NC ZGEMM_THREAD_NC +#define GEMM_THREAD_NT ZGEMM_THREAD_NT +#define GEMM_THREAD_CC ZGEMM_THREAD_CC +#define GEMM_THREAD_CT ZGEMM_THREAD_CT +#define GEMM_THREAD_TC ZGEMM_THREAD_TC +#define GEMM_THREAD_TT ZGEMM_THREAD_TT +#define GEMM_THREAD_NR ZGEMM_THREAD_NR +#define GEMM_THREAD_TR ZGEMM_THREAD_TR +#define GEMM_THREAD_CR ZGEMM_THREAD_CR +#define GEMM_THREAD_RN ZGEMM_THREAD_RN +#define GEMM_THREAD_RT ZGEMM_THREAD_RT +#define GEMM_THREAD_RC ZGEMM_THREAD_RC +#define GEMM_THREAD_RR ZGEMM_THREAD_RR + +#define SYMM_THREAD_LU ZSYMM_THREAD_LU +#define SYMM_THREAD_LL ZSYMM_THREAD_LL +#define SYMM_THREAD_RU ZSYMM_THREAD_RU +#define SYMM_THREAD_RL ZSYMM_THREAD_RL + +#define HEMM_THREAD_LU ZHEMM_THREAD_LU +#define HEMM_THREAD_LL ZHEMM_THREAD_LL +#define HEMM_THREAD_RU ZHEMM_THREAD_RU +#define HEMM_THREAD_RL ZHEMM_THREAD_RL + +#define SYRK_THREAD_UN ZSYRK_THREAD_UN +#define SYRK_THREAD_UT ZSYRK_THREAD_UT +#define SYRK_THREAD_LN ZSYRK_THREAD_LN +#define SYRK_THREAD_LT ZSYRK_THREAD_LT +#define SYRK_THREAD_UR ZSYRK_THREAD_UR +#define SYRK_THREAD_UC ZSYRK_THREAD_UC +#define SYRK_THREAD_LR ZSYRK_THREAD_LR +#define SYRK_THREAD_LC ZSYRK_THREAD_LC + +#define HERK_THREAD_UN ZHERK_THREAD_UN +#define HERK_THREAD_UT ZHERK_THREAD_UT +#define HERK_THREAD_LN ZHERK_THREAD_LN +#define HERK_THREAD_LT ZHERK_THREAD_LT +#define HERK_THREAD_UR ZHERK_THREAD_UR +#define HERK_THREAD_UC ZHERK_THREAD_UC +#define HERK_THREAD_LR ZHERK_THREAD_LR +#define HERK_THREAD_LC ZHERK_THREAD_LC + +#define GEMM3M_NN ZGEMM3M_NN +#define GEMM3M_CN ZGEMM3M_CN +#define GEMM3M_TN ZGEMM3M_TN +#define GEMM3M_NC ZGEMM3M_NC +#define GEMM3M_NT ZGEMM3M_NT +#define GEMM3M_CC ZGEMM3M_CC +#define GEMM3M_CT ZGEMM3M_CT +#define GEMM3M_TC ZGEMM3M_TC +#define GEMM3M_TT ZGEMM3M_TT +#define GEMM3M_NR ZGEMM3M_NR +#define GEMM3M_TR ZGEMM3M_TR +#define GEMM3M_CR ZGEMM3M_CR +#define GEMM3M_RN ZGEMM3M_RN +#define GEMM3M_RT ZGEMM3M_RT +#define GEMM3M_RC ZGEMM3M_RC +#define GEMM3M_RR ZGEMM3M_RR + +#define GEMM3M_THREAD_NN ZGEMM3M_THREAD_NN +#define GEMM3M_THREAD_CN ZGEMM3M_THREAD_CN +#define GEMM3M_THREAD_TN ZGEMM3M_THREAD_TN +#define GEMM3M_THREAD_NC ZGEMM3M_THREAD_NC +#define GEMM3M_THREAD_NT ZGEMM3M_THREAD_NT +#define GEMM3M_THREAD_CC ZGEMM3M_THREAD_CC +#define GEMM3M_THREAD_CT ZGEMM3M_THREAD_CT +#define GEMM3M_THREAD_TC ZGEMM3M_THREAD_TC +#define GEMM3M_THREAD_TT ZGEMM3M_THREAD_TT +#define GEMM3M_THREAD_NR ZGEMM3M_THREAD_NR +#define GEMM3M_THREAD_TR ZGEMM3M_THREAD_TR +#define GEMM3M_THREAD_CR ZGEMM3M_THREAD_CR +#define GEMM3M_THREAD_RN ZGEMM3M_THREAD_RN +#define GEMM3M_THREAD_RT ZGEMM3M_THREAD_RT +#define GEMM3M_THREAD_RC ZGEMM3M_THREAD_RC +#define GEMM3M_THREAD_RR ZGEMM3M_THREAD_RR + +#define SYMM3M_LU ZSYMM3M_LU +#define SYMM3M_LL ZSYMM3M_LL +#define SYMM3M_RU ZSYMM3M_RU +#define SYMM3M_RL ZSYMM3M_RL + +#define SYMM3M_THREAD_LU ZSYMM3M_THREAD_LU +#define SYMM3M_THREAD_LL ZSYMM3M_THREAD_LL +#define SYMM3M_THREAD_RU ZSYMM3M_THREAD_RU +#define SYMM3M_THREAD_RL ZSYMM3M_THREAD_RL + +#define HEMM3M_LU ZHEMM3M_LU +#define HEMM3M_LL ZHEMM3M_LL +#define HEMM3M_RU ZHEMM3M_RU +#define HEMM3M_RL ZHEMM3M_RL + +#define HEMM3M_THREAD_LU ZHEMM3M_THREAD_LU +#define HEMM3M_THREAD_LL ZHEMM3M_THREAD_LL +#define HEMM3M_THREAD_RU ZHEMM3M_THREAD_RU +#define HEMM3M_THREAD_RL ZHEMM3M_THREAD_RL + +#define SYMM_IUTCOPY ZSYMM_IUTCOPY +#define SYMM_ILTCOPY ZSYMM_ILTCOPY +#define SYMM_OUTCOPY ZSYMM_OUTCOPY 
+#define SYMM_OLTCOPY ZSYMM_OLTCOPY + +#else + +#define AMAX_K CAMAX_K +#define AMIN_K CAMIN_K +#define MAX_K CMAX_K +#define MIN_K CMIN_K +#define IAMAX_K ICAMAX_K +#define IAMIN_K ICAMIN_K +#define IMAX_K ICMAX_K +#define IMIN_K ICMIN_K +#define ASUM_K CASUM_K +#define AXPYU_K CAXPYU_K +#define AXPYC_K CAXPYC_K +#define COPY_K CCOPY_K +#define DOTU_K CDOTU_K +#define DOTC_K CDOTC_K +#define NRM2_K CNRM2_K +#define SCAL_K CSCAL_K +#define SWAP_K CSWAP_K +#define ROT_K CROT_K + +#define GEMV_N CGEMV_N +#define GEMV_T CGEMV_T +#define GEMV_R CGEMV_R +#define GEMV_C CGEMV_C +#define GEMV_O CGEMV_O +#define GEMV_U CGEMV_U +#define GEMV_S CGEMV_S +#define GEMV_D CGEMV_D + +#define GERU_K CGERU_K +#define GERC_K CGERC_K +#define GERV_K CGERV_K +#define GERD_K CGERD_K + +#define SYMV_U CSYMV_U +#define SYMV_L CSYMV_L +#define HEMV_U CHEMV_U +#define HEMV_L CHEMV_L +#define HEMV_V CHEMV_V +#define HEMV_M CHEMV_M + +#define SYMV_THREAD_U CSYMV_THREAD_U +#define SYMV_THREAD_L CSYMV_THREAD_L +#define HEMV_THREAD_U CHEMV_THREAD_U +#define HEMV_THREAD_L CHEMV_THREAD_L +#define HEMV_THREAD_V CHEMV_THREAD_V +#define HEMV_THREAD_M CHEMV_THREAD_M + +#define GEMM_ONCOPY CGEMM_ONCOPY +#define GEMM_OTCOPY CGEMM_OTCOPY +#define GEMM_INCOPY CGEMM_INCOPY +#define GEMM_ITCOPY CGEMM_ITCOPY + +#define GEMM3M_ONCOPYB CGEMM3M_ONCOPYB +#define GEMM3M_ONCOPYR CGEMM3M_ONCOPYR +#define GEMM3M_ONCOPYI CGEMM3M_ONCOPYI +#define GEMM3M_OTCOPYB CGEMM3M_OTCOPYB +#define GEMM3M_OTCOPYR CGEMM3M_OTCOPYR +#define GEMM3M_OTCOPYI CGEMM3M_OTCOPYI +#define GEMM3M_INCOPYB CGEMM3M_INCOPYB +#define GEMM3M_INCOPYR CGEMM3M_INCOPYR +#define GEMM3M_INCOPYI CGEMM3M_INCOPYI +#define GEMM3M_ITCOPYB CGEMM3M_ITCOPYB +#define GEMM3M_ITCOPYR CGEMM3M_ITCOPYR +#define GEMM3M_ITCOPYI CGEMM3M_ITCOPYI + +#ifdef UNIT + +#define TRMM_OUNCOPY CTRMM_OUNUCOPY +#define TRMM_OUTCOPY CTRMM_OUTUCOPY +#define TRMM_OLNCOPY CTRMM_OLNUCOPY +#define TRMM_OLTCOPY CTRMM_OLTUCOPY +#define TRSM_OUNCOPY CTRSM_OUNUCOPY +#define TRSM_OUTCOPY CTRSM_OUTUCOPY +#define TRSM_OLNCOPY CTRSM_OLNUCOPY +#define TRSM_OLTCOPY CTRSM_OLTUCOPY + +#define TRMM_IUNCOPY CTRMM_IUNUCOPY +#define TRMM_IUTCOPY CTRMM_IUTUCOPY +#define TRMM_ILNCOPY CTRMM_ILNUCOPY +#define TRMM_ILTCOPY CTRMM_ILTUCOPY +#define TRSM_IUNCOPY CTRSM_IUNUCOPY +#define TRSM_IUTCOPY CTRSM_IUTUCOPY +#define TRSM_ILNCOPY CTRSM_ILNUCOPY +#define TRSM_ILTCOPY CTRSM_ILTUCOPY + +#else + +#define TRMM_OUNCOPY CTRMM_OUNNCOPY +#define TRMM_OUTCOPY CTRMM_OUTNCOPY +#define TRMM_OLNCOPY CTRMM_OLNNCOPY +#define TRMM_OLTCOPY CTRMM_OLTNCOPY +#define TRSM_OUNCOPY CTRSM_OUNNCOPY +#define TRSM_OUTCOPY CTRSM_OUTNCOPY +#define TRSM_OLNCOPY CTRSM_OLNNCOPY +#define TRSM_OLTCOPY CTRSM_OLTNCOPY + +#define TRMM_IUNCOPY CTRMM_IUNNCOPY +#define TRMM_IUTCOPY CTRMM_IUTNCOPY +#define TRMM_ILNCOPY CTRMM_ILNNCOPY +#define TRMM_ILTCOPY CTRMM_ILTNCOPY +#define TRSM_IUNCOPY CTRSM_IUNNCOPY +#define TRSM_IUTCOPY CTRSM_IUTNCOPY +#define TRSM_ILNCOPY CTRSM_ILNNCOPY +#define TRSM_ILTCOPY CTRSM_ILTNCOPY + +#endif + +#define SYMM3M_ILCOPYB CSYMM3M_ILCOPYB +#define SYMM3M_IUCOPYB CSYMM3M_IUCOPYB +#define SYMM3M_ILCOPYR CSYMM3M_ILCOPYR +#define SYMM3M_IUCOPYR CSYMM3M_IUCOPYR +#define SYMM3M_ILCOPYI CSYMM3M_ILCOPYI +#define SYMM3M_IUCOPYI CSYMM3M_IUCOPYI + +#define SYMM3M_OLCOPYB CSYMM3M_OLCOPYB +#define SYMM3M_OUCOPYB CSYMM3M_OUCOPYB +#define SYMM3M_OLCOPYR CSYMM3M_OLCOPYR +#define SYMM3M_OUCOPYR CSYMM3M_OUCOPYR +#define SYMM3M_OLCOPYI CSYMM3M_OLCOPYI +#define SYMM3M_OUCOPYI CSYMM3M_OUCOPYI + +#define HEMM3M_ILCOPYB CHEMM3M_ILCOPYB +#define HEMM3M_IUCOPYB 
CHEMM3M_IUCOPYB +#define HEMM3M_ILCOPYR CHEMM3M_ILCOPYR +#define HEMM3M_IUCOPYR CHEMM3M_IUCOPYR +#define HEMM3M_ILCOPYI CHEMM3M_ILCOPYI +#define HEMM3M_IUCOPYI CHEMM3M_IUCOPYI + +#define HEMM3M_OLCOPYB CHEMM3M_OLCOPYB +#define HEMM3M_OUCOPYB CHEMM3M_OUCOPYB +#define HEMM3M_OLCOPYR CHEMM3M_OLCOPYR +#define HEMM3M_OUCOPYR CHEMM3M_OUCOPYR +#define HEMM3M_OLCOPYI CHEMM3M_OLCOPYI +#define HEMM3M_OUCOPYI CHEMM3M_OUCOPYI + +#define GEMM_BETA CGEMM_BETA + +#define GEMM_KERNEL_N CGEMM_KERNEL_N +#define GEMM_KERNEL_L CGEMM_KERNEL_L +#define GEMM_KERNEL_R CGEMM_KERNEL_R +#define GEMM_KERNEL_B CGEMM_KERNEL_B + +#define GEMM3M_KERNEL CGEMM3M_KERNEL + +#define TRMM_KERNEL_LN CTRMM_KERNEL_LN +#define TRMM_KERNEL_LT CTRMM_KERNEL_LT +#define TRMM_KERNEL_LR CTRMM_KERNEL_LR +#define TRMM_KERNEL_LC CTRMM_KERNEL_LC +#define TRMM_KERNEL_RN CTRMM_KERNEL_RN +#define TRMM_KERNEL_RT CTRMM_KERNEL_RT +#define TRMM_KERNEL_RR CTRMM_KERNEL_RR +#define TRMM_KERNEL_RC CTRMM_KERNEL_RC + +#define TRSM_KERNEL_LN CTRSM_KERNEL_LN +#define TRSM_KERNEL_LT CTRSM_KERNEL_LT +#define TRSM_KERNEL_LR CTRSM_KERNEL_LR +#define TRSM_KERNEL_LC CTRSM_KERNEL_LC +#define TRSM_KERNEL_RN CTRSM_KERNEL_RN +#define TRSM_KERNEL_RT CTRSM_KERNEL_RT +#define TRSM_KERNEL_RR CTRSM_KERNEL_RR +#define TRSM_KERNEL_RC CTRSM_KERNEL_RC + +#define GEMM_NN CGEMM_NN +#define GEMM_CN CGEMM_CN +#define GEMM_TN CGEMM_TN +#define GEMM_NC CGEMM_NC +#define GEMM_NT CGEMM_NT +#define GEMM_CC CGEMM_CC +#define GEMM_CT CGEMM_CT +#define GEMM_TC CGEMM_TC +#define GEMM_TT CGEMM_TT +#define GEMM_NR CGEMM_NR +#define GEMM_TR CGEMM_TR +#define GEMM_CR CGEMM_CR +#define GEMM_RN CGEMM_RN +#define GEMM_RT CGEMM_RT +#define GEMM_RC CGEMM_RC +#define GEMM_RR CGEMM_RR + +#define SYMM_LU CSYMM_LU +#define SYMM_LL CSYMM_LL +#define SYMM_RU CSYMM_RU +#define SYMM_RL CSYMM_RL + +#define HEMM_LU CHEMM_LU +#define HEMM_LL CHEMM_LL +#define HEMM_RU CHEMM_RU +#define HEMM_RL CHEMM_RL + +#define HEMM_IUTCOPY CHEMM_IUTCOPY +#define HEMM_ILTCOPY CHEMM_ILTCOPY +#define HEMM_OUTCOPY CHEMM_OUTCOPY +#define HEMM_OLTCOPY CHEMM_OLTCOPY + +#define SYRK_UN CSYRK_UN +#define SYRK_UT CSYRK_UT +#define SYRK_LN CSYRK_LN +#define SYRK_LT CSYRK_LT +#define SYRK_UR CSYRK_UN +#define SYRK_UC CSYRK_UT +#define SYRK_LR CSYRK_LN +#define SYRK_LC CSYRK_LT + +#define SYRK_KERNEL_U CSYRK_KERNEL_U +#define SYRK_KERNEL_L CSYRK_KERNEL_L + +#define HERK_UN CHERK_UN +#define HERK_LN CHERK_LN +#define HERK_UC CHERK_UC +#define HERK_LC CHERK_LC + +#define HER2K_UN CHER2K_UN +#define HER2K_LN CHER2K_LN +#define HER2K_UC CHER2K_UC +#define HER2K_LC CHER2K_LC + +#define SYR2K_UN CSYR2K_UN +#define SYR2K_UT CSYR2K_UT +#define SYR2K_LN CSYR2K_LN +#define SYR2K_LT CSYR2K_LT +#define SYR2K_UR CSYR2K_UN +#define SYR2K_UC CSYR2K_UT +#define SYR2K_LR CSYR2K_LN +#define SYR2K_LC CSYR2K_LT + +#define SYR2K_KERNEL_U CSYR2K_KERNEL_U +#define SYR2K_KERNEL_L CSYR2K_KERNEL_L + +#define TRMM_LNUU CTRMM_LNUU +#define TRMM_LNUN CTRMM_LNUN +#define TRMM_LNLU CTRMM_LNLU +#define TRMM_LNLN CTRMM_LNLN +#define TRMM_LTUU CTRMM_LTUU +#define TRMM_LTUN CTRMM_LTUN +#define TRMM_LTLU CTRMM_LTLU +#define TRMM_LTLN CTRMM_LTLN +#define TRMM_LRUU CTRMM_LRUU +#define TRMM_LRUN CTRMM_LRUN +#define TRMM_LRLU CTRMM_LRLU +#define TRMM_LRLN CTRMM_LRLN +#define TRMM_LCUU CTRMM_LCUU +#define TRMM_LCUN CTRMM_LCUN +#define TRMM_LCLU CTRMM_LCLU +#define TRMM_LCLN CTRMM_LCLN +#define TRMM_RNUU CTRMM_RNUU +#define TRMM_RNUN CTRMM_RNUN +#define TRMM_RNLU CTRMM_RNLU +#define TRMM_RNLN CTRMM_RNLN +#define TRMM_RTUU CTRMM_RTUU +#define TRMM_RTUN CTRMM_RTUN +#define 
TRMM_RTLU CTRMM_RTLU +#define TRMM_RTLN CTRMM_RTLN +#define TRMM_RRUU CTRMM_RRUU +#define TRMM_RRUN CTRMM_RRUN +#define TRMM_RRLU CTRMM_RRLU +#define TRMM_RRLN CTRMM_RRLN +#define TRMM_RCUU CTRMM_RCUU +#define TRMM_RCUN CTRMM_RCUN +#define TRMM_RCLU CTRMM_RCLU +#define TRMM_RCLN CTRMM_RCLN + +#define TRSM_LNUU CTRSM_LNUU +#define TRSM_LNUN CTRSM_LNUN +#define TRSM_LNLU CTRSM_LNLU +#define TRSM_LNLN CTRSM_LNLN +#define TRSM_LTUU CTRSM_LTUU +#define TRSM_LTUN CTRSM_LTUN +#define TRSM_LTLU CTRSM_LTLU +#define TRSM_LTLN CTRSM_LTLN +#define TRSM_LRUU CTRSM_LRUU +#define TRSM_LRUN CTRSM_LRUN +#define TRSM_LRLU CTRSM_LRLU +#define TRSM_LRLN CTRSM_LRLN +#define TRSM_LCUU CTRSM_LCUU +#define TRSM_LCUN CTRSM_LCUN +#define TRSM_LCLU CTRSM_LCLU +#define TRSM_LCLN CTRSM_LCLN +#define TRSM_RNUU CTRSM_RNUU +#define TRSM_RNUN CTRSM_RNUN +#define TRSM_RNLU CTRSM_RNLU +#define TRSM_RNLN CTRSM_RNLN +#define TRSM_RTUU CTRSM_RTUU +#define TRSM_RTUN CTRSM_RTUN +#define TRSM_RTLU CTRSM_RTLU +#define TRSM_RTLN CTRSM_RTLN +#define TRSM_RRUU CTRSM_RRUU +#define TRSM_RRUN CTRSM_RRUN +#define TRSM_RRLU CTRSM_RRLU +#define TRSM_RRLN CTRSM_RRLN +#define TRSM_RCUU CTRSM_RCUU +#define TRSM_RCUN CTRSM_RCUN +#define TRSM_RCLU CTRSM_RCLU +#define TRSM_RCLN CTRSM_RCLN + + +#define GEMM_THREAD_NN CGEMM_THREAD_NN +#define GEMM_THREAD_CN CGEMM_THREAD_CN +#define GEMM_THREAD_TN CGEMM_THREAD_TN +#define GEMM_THREAD_NC CGEMM_THREAD_NC +#define GEMM_THREAD_NT CGEMM_THREAD_NT +#define GEMM_THREAD_CC CGEMM_THREAD_CC +#define GEMM_THREAD_CT CGEMM_THREAD_CT +#define GEMM_THREAD_TC CGEMM_THREAD_TC +#define GEMM_THREAD_TT CGEMM_THREAD_TT +#define GEMM_THREAD_NR CGEMM_THREAD_NR +#define GEMM_THREAD_TR CGEMM_THREAD_TR +#define GEMM_THREAD_CR CGEMM_THREAD_CR +#define GEMM_THREAD_RN CGEMM_THREAD_RN +#define GEMM_THREAD_RT CGEMM_THREAD_RT +#define GEMM_THREAD_RC CGEMM_THREAD_RC +#define GEMM_THREAD_RR CGEMM_THREAD_RR + +#define SYMM_THREAD_LU CSYMM_THREAD_LU +#define SYMM_THREAD_LL CSYMM_THREAD_LL +#define SYMM_THREAD_RU CSYMM_THREAD_RU +#define SYMM_THREAD_RL CSYMM_THREAD_RL + +#define HEMM_THREAD_LU CHEMM_THREAD_LU +#define HEMM_THREAD_LL CHEMM_THREAD_LL +#define HEMM_THREAD_RU CHEMM_THREAD_RU +#define HEMM_THREAD_RL CHEMM_THREAD_RL + +#define SYRK_THREAD_UN CSYRK_THREAD_UN +#define SYRK_THREAD_UT CSYRK_THREAD_UT +#define SYRK_THREAD_LN CSYRK_THREAD_LN +#define SYRK_THREAD_LT CSYRK_THREAD_LT +#define SYRK_THREAD_UR CSYRK_THREAD_UR +#define SYRK_THREAD_UC CSYRK_THREAD_UC +#define SYRK_THREAD_LR CSYRK_THREAD_LR +#define SYRK_THREAD_LC CSYRK_THREAD_LC + +#define HERK_THREAD_UN CHERK_THREAD_UN +#define HERK_THREAD_UT CHERK_THREAD_UT +#define HERK_THREAD_LN CHERK_THREAD_LN +#define HERK_THREAD_LT CHERK_THREAD_LT +#define HERK_THREAD_UR CHERK_THREAD_UR +#define HERK_THREAD_UC CHERK_THREAD_UC +#define HERK_THREAD_LR CHERK_THREAD_LR +#define HERK_THREAD_LC CHERK_THREAD_LC + +#define GEMM3M_NN CGEMM3M_NN +#define GEMM3M_CN CGEMM3M_CN +#define GEMM3M_TN CGEMM3M_TN +#define GEMM3M_NC CGEMM3M_NC +#define GEMM3M_NT CGEMM3M_NT +#define GEMM3M_CC CGEMM3M_CC +#define GEMM3M_CT CGEMM3M_CT +#define GEMM3M_TC CGEMM3M_TC +#define GEMM3M_TT CGEMM3M_TT +#define GEMM3M_NR CGEMM3M_NR +#define GEMM3M_TR CGEMM3M_TR +#define GEMM3M_CR CGEMM3M_CR +#define GEMM3M_RN CGEMM3M_RN +#define GEMM3M_RT CGEMM3M_RT +#define GEMM3M_RC CGEMM3M_RC +#define GEMM3M_RR CGEMM3M_RR + +#define GEMM3M_THREAD_NN CGEMM3M_THREAD_NN +#define GEMM3M_THREAD_CN CGEMM3M_THREAD_CN +#define GEMM3M_THREAD_TN CGEMM3M_THREAD_TN +#define GEMM3M_THREAD_NC CGEMM3M_THREAD_NC +#define GEMM3M_THREAD_NT 
CGEMM3M_THREAD_NT +#define GEMM3M_THREAD_CC CGEMM3M_THREAD_CC +#define GEMM3M_THREAD_CT CGEMM3M_THREAD_CT +#define GEMM3M_THREAD_TC CGEMM3M_THREAD_TC +#define GEMM3M_THREAD_TT CGEMM3M_THREAD_TT +#define GEMM3M_THREAD_NR CGEMM3M_THREAD_NR +#define GEMM3M_THREAD_TR CGEMM3M_THREAD_TR +#define GEMM3M_THREAD_CR CGEMM3M_THREAD_CR +#define GEMM3M_THREAD_RN CGEMM3M_THREAD_RN +#define GEMM3M_THREAD_RT CGEMM3M_THREAD_RT +#define GEMM3M_THREAD_RC CGEMM3M_THREAD_RC +#define GEMM3M_THREAD_RR CGEMM3M_THREAD_RR + +#define SYMM3M_LU CSYMM3M_LU +#define SYMM3M_LL CSYMM3M_LL +#define SYMM3M_RU CSYMM3M_RU +#define SYMM3M_RL CSYMM3M_RL + +#define SYMM3M_THREAD_LU CSYMM3M_THREAD_LU +#define SYMM3M_THREAD_LL CSYMM3M_THREAD_LL +#define SYMM3M_THREAD_RU CSYMM3M_THREAD_RU +#define SYMM3M_THREAD_RL CSYMM3M_THREAD_RL + +#define HEMM3M_LU CHEMM3M_LU +#define HEMM3M_LL CHEMM3M_LL +#define HEMM3M_RU CHEMM3M_RU +#define HEMM3M_RL CHEMM3M_RL + +#define HEMM3M_THREAD_LU CHEMM3M_THREAD_LU +#define HEMM3M_THREAD_LL CHEMM3M_THREAD_LL +#define HEMM3M_THREAD_RU CHEMM3M_THREAD_RU +#define HEMM3M_THREAD_RL CHEMM3M_THREAD_RL + +#define SYMM_IUTCOPY CSYMM_IUTCOPY +#define SYMM_ILTCOPY CSYMM_ILTCOPY +#define SYMM_OUTCOPY CSYMM_OUTCOPY +#define SYMM_OLTCOPY CSYMM_OLTCOPY + +#endif +#endif + +#ifndef ASSEMBLER +#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) +extern BLASLONG sgemm_p; +extern BLASLONG sgemm_q; +extern BLASLONG sgemm_r; +extern BLASLONG dgemm_p; +extern BLASLONG dgemm_q; +extern BLASLONG dgemm_r; +extern BLASLONG qgemm_p; +extern BLASLONG qgemm_q; +extern BLASLONG qgemm_r; +extern BLASLONG cgemm_p; +extern BLASLONG cgemm_q; +extern BLASLONG cgemm_r; +extern BLASLONG zgemm_p; +extern BLASLONG zgemm_q; +extern BLASLONG zgemm_r; +extern BLASLONG xgemm_p; +extern BLASLONG xgemm_q; +extern BLASLONG xgemm_r; +#endif + +typedef struct { + void *a, *b, *c, *d, *alpha, *beta; + BLASLONG m, n, k, lda, ldb, ldc, ldd; + +#ifdef SMP + void *common; + BLASLONG nthreads; +#endif + +#ifdef PARAMTEST + BLASLONG gemm_p, gemm_q, gemm_r; +#endif + +#ifdef PREFETCHTEST + BLASLONG prea, preb, prec, pred; +#endif + +} blas_arg_t; +#endif + +#ifdef XDOUBLE + +#define TRSV_NUU qtrsv_NUU +#define TRSV_NUN qtrsv_NUN +#define TRSV_NLU qtrsv_NLU +#define TRSV_NLN qtrsv_NLN +#define TRSV_TUU qtrsv_TUU +#define TRSV_TUN qtrsv_TUN +#define TRSV_TLU qtrsv_TLU +#define TRSV_TLN qtrsv_TLN + +#define ZTRSV_NUU xtrsv_NUU +#define ZTRSV_NUN xtrsv_NUN +#define ZTRSV_NLU xtrsv_NLU +#define ZTRSV_NLN xtrsv_NLN +#define ZTRSV_TUU xtrsv_TUU +#define ZTRSV_TUN xtrsv_TUN +#define ZTRSV_TLU xtrsv_TLU +#define ZTRSV_TLN xtrsv_TLN +#define ZTRSV_RUU xtrsv_RUU +#define ZTRSV_RUN xtrsv_RUN +#define ZTRSV_RLU xtrsv_RLU +#define ZTRSV_RLN xtrsv_RLN +#define ZTRSV_CUU xtrsv_CUU +#define ZTRSV_CUN xtrsv_CUN +#define ZTRSV_CLU xtrsv_CLU +#define ZTRSV_CLN xtrsv_CLN + +#define TRMV_NUU qtrmv_NUU +#define TRMV_NUN qtrmv_NUN +#define TRMV_NLU qtrmv_NLU +#define TRMV_NLN qtrmv_NLN +#define TRMV_TUU qtrmv_TUU +#define TRMV_TUN qtrmv_TUN +#define TRMV_TLU qtrmv_TLU +#define TRMV_TLN qtrmv_TLN + +#define TRMV_THREAD_NUU qtrmv_thread_NUU +#define TRMV_THREAD_NUN qtrmv_thread_NUN +#define TRMV_THREAD_NLU qtrmv_thread_NLU +#define TRMV_THREAD_NLN qtrmv_thread_NLN +#define TRMV_THREAD_TUU qtrmv_thread_TUU +#define TRMV_THREAD_TUN qtrmv_thread_TUN +#define TRMV_THREAD_TLU qtrmv_thread_TLU +#define TRMV_THREAD_TLN qtrmv_thread_TLN + +#define ZTRMV_NUU xtrmv_NUU +#define ZTRMV_NUN xtrmv_NUN +#define ZTRMV_NLU xtrmv_NLU +#define ZTRMV_NLN xtrmv_NLN +#define ZTRMV_TUU 
xtrmv_TUU +#define ZTRMV_TUN xtrmv_TUN +#define ZTRMV_TLU xtrmv_TLU +#define ZTRMV_TLN xtrmv_TLN +#define ZTRMV_RUU xtrmv_RUU +#define ZTRMV_RUN xtrmv_RUN +#define ZTRMV_RLU xtrmv_RLU +#define ZTRMV_RLN xtrmv_RLN +#define ZTRMV_CUU xtrmv_CUU +#define ZTRMV_CUN xtrmv_CUN +#define ZTRMV_CLU xtrmv_CLU +#define ZTRMV_CLN xtrmv_CLN + +#define ZTRMV_THREAD_NUU xtrmv_thread_NUU +#define ZTRMV_THREAD_NUN xtrmv_thread_NUN +#define ZTRMV_THREAD_NLU xtrmv_thread_NLU +#define ZTRMV_THREAD_NLN xtrmv_thread_NLN +#define ZTRMV_THREAD_TUU xtrmv_thread_TUU +#define ZTRMV_THREAD_TUN xtrmv_thread_TUN +#define ZTRMV_THREAD_TLU xtrmv_thread_TLU +#define ZTRMV_THREAD_TLN xtrmv_thread_TLN +#define ZTRMV_THREAD_RUU xtrmv_thread_RUU +#define ZTRMV_THREAD_RUN xtrmv_thread_RUN +#define ZTRMV_THREAD_RLU xtrmv_thread_RLU +#define ZTRMV_THREAD_RLN xtrmv_thread_RLN +#define ZTRMV_THREAD_CUU xtrmv_thread_CUU +#define ZTRMV_THREAD_CUN xtrmv_thread_CUN +#define ZTRMV_THREAD_CLU xtrmv_thread_CLU +#define ZTRMV_THREAD_CLN xtrmv_thread_CLN + +#elif defined(DOUBLE) + +#define TRSV_NUU dtrsv_NUU +#define TRSV_NUN dtrsv_NUN +#define TRSV_NLU dtrsv_NLU +#define TRSV_NLN dtrsv_NLN +#define TRSV_TUU dtrsv_TUU +#define TRSV_TUN dtrsv_TUN +#define TRSV_TLU dtrsv_TLU +#define TRSV_TLN dtrsv_TLN + +#define ZTRSV_NUU ztrsv_NUU +#define ZTRSV_NUN ztrsv_NUN +#define ZTRSV_NLU ztrsv_NLU +#define ZTRSV_NLN ztrsv_NLN +#define ZTRSV_TUU ztrsv_TUU +#define ZTRSV_TUN ztrsv_TUN +#define ZTRSV_TLU ztrsv_TLU +#define ZTRSV_TLN ztrsv_TLN +#define ZTRSV_RUU ztrsv_RUU +#define ZTRSV_RUN ztrsv_RUN +#define ZTRSV_RLU ztrsv_RLU +#define ZTRSV_RLN ztrsv_RLN +#define ZTRSV_CUU ztrsv_CUU +#define ZTRSV_CUN ztrsv_CUN +#define ZTRSV_CLU ztrsv_CLU +#define ZTRSV_CLN ztrsv_CLN + +#define TRMV_NUU dtrmv_NUU +#define TRMV_NUN dtrmv_NUN +#define TRMV_NLU dtrmv_NLU +#define TRMV_NLN dtrmv_NLN +#define TRMV_TUU dtrmv_TUU +#define TRMV_TUN dtrmv_TUN +#define TRMV_TLU dtrmv_TLU +#define TRMV_TLN dtrmv_TLN + +#define TRMV_THREAD_NUU dtrmv_thread_NUU +#define TRMV_THREAD_NUN dtrmv_thread_NUN +#define TRMV_THREAD_NLU dtrmv_thread_NLU +#define TRMV_THREAD_NLN dtrmv_thread_NLN +#define TRMV_THREAD_TUU dtrmv_thread_TUU +#define TRMV_THREAD_TUN dtrmv_thread_TUN +#define TRMV_THREAD_TLU dtrmv_thread_TLU +#define TRMV_THREAD_TLN dtrmv_thread_TLN + +#define ZTRMV_NUU ztrmv_NUU +#define ZTRMV_NUN ztrmv_NUN +#define ZTRMV_NLU ztrmv_NLU +#define ZTRMV_NLN ztrmv_NLN +#define ZTRMV_TUU ztrmv_TUU +#define ZTRMV_TUN ztrmv_TUN +#define ZTRMV_TLU ztrmv_TLU +#define ZTRMV_TLN ztrmv_TLN +#define ZTRMV_RUU ztrmv_RUU +#define ZTRMV_RUN ztrmv_RUN +#define ZTRMV_RLU ztrmv_RLU +#define ZTRMV_RLN ztrmv_RLN +#define ZTRMV_CUU ztrmv_CUU +#define ZTRMV_CUN ztrmv_CUN +#define ZTRMV_CLU ztrmv_CLU +#define ZTRMV_CLN ztrmv_CLN + +#define ZTRMV_THREAD_NUU ztrmv_thread_NUU +#define ZTRMV_THREAD_NUN ztrmv_thread_NUN +#define ZTRMV_THREAD_NLU ztrmv_thread_NLU +#define ZTRMV_THREAD_NLN ztrmv_thread_NLN +#define ZTRMV_THREAD_TUU ztrmv_thread_TUU +#define ZTRMV_THREAD_TUN ztrmv_thread_TUN +#define ZTRMV_THREAD_TLU ztrmv_thread_TLU +#define ZTRMV_THREAD_TLN ztrmv_thread_TLN +#define ZTRMV_THREAD_RUU ztrmv_thread_RUU +#define ZTRMV_THREAD_RUN ztrmv_thread_RUN +#define ZTRMV_THREAD_RLU ztrmv_thread_RLU +#define ZTRMV_THREAD_RLN ztrmv_thread_RLN +#define ZTRMV_THREAD_CUU ztrmv_thread_CUU +#define ZTRMV_THREAD_CUN ztrmv_thread_CUN +#define ZTRMV_THREAD_CLU ztrmv_thread_CLU +#define ZTRMV_THREAD_CLN ztrmv_thread_CLN + +#else + +#define TRSV_NUU strsv_NUU +#define TRSV_NUN strsv_NUN +#define TRSV_NLU strsv_NLU 
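The TRSV_* and TRMV_* aliases in these three branches let one generic driver source be compiled once per precision: the same generic name expands to the s-, d-, or q-prefixed kernel depending on whether neither, DOUBLE, or XDOUBLE is defined. A minimal sketch of the idea; the helper below is hypothetical (not part of the GotoBLAS2 sources) and assumes the usual level-2 kernel signature (n, a, lda, b, incb, buffer) and the FLOAT type that common.h selects from the same precision macros:

#include "common.h"   /* assumed to provide BLASLONG, FLOAT and these aliases */

/* Hypothetical driver helper: solve an upper-triangular, non-transposed,
   non-unit-diagonal system in whatever precision this unit was built for. */
static int solve_upper_nonunit(BLASLONG n, FLOAT *a, BLASLONG lda,
                               FLOAT *b, void *buffer) {
  /* TRSV_NUN expands to strsv_NUN by default, dtrsv_NUN with -DDOUBLE,
     and qtrsv_NUN with -DXDOUBLE, so no per-precision source is needed. */
  return TRSV_NUN(n, a, lda, b, 1, buffer);
}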
+#define TRSV_NLN strsv_NLN +#define TRSV_TUU strsv_TUU +#define TRSV_TUN strsv_TUN +#define TRSV_TLU strsv_TLU +#define TRSV_TLN strsv_TLN + +#define ZTRSV_NUU ctrsv_NUU +#define ZTRSV_NUN ctrsv_NUN +#define ZTRSV_NLU ctrsv_NLU +#define ZTRSV_NLN ctrsv_NLN +#define ZTRSV_TUU ctrsv_TUU +#define ZTRSV_TUN ctrsv_TUN +#define ZTRSV_TLU ctrsv_TLU +#define ZTRSV_TLN ctrsv_TLN +#define ZTRSV_RUU ctrsv_RUU +#define ZTRSV_RUN ctrsv_RUN +#define ZTRSV_RLU ctrsv_RLU +#define ZTRSV_RLN ctrsv_RLN +#define ZTRSV_CUU ctrsv_CUU +#define ZTRSV_CUN ctrsv_CUN +#define ZTRSV_CLU ctrsv_CLU +#define ZTRSV_CLN ctrsv_CLN + +#define TRMV_NUU strmv_NUU +#define TRMV_NUN strmv_NUN +#define TRMV_NLU strmv_NLU +#define TRMV_NLN strmv_NLN +#define TRMV_TUU strmv_TUU +#define TRMV_TUN strmv_TUN +#define TRMV_TLU strmv_TLU +#define TRMV_TLN strmv_TLN + +#define TRMV_THREAD_NUU strmv_thread_NUU +#define TRMV_THREAD_NUN strmv_thread_NUN +#define TRMV_THREAD_NLU strmv_thread_NLU +#define TRMV_THREAD_NLN strmv_thread_NLN +#define TRMV_THREAD_TUU strmv_thread_TUU +#define TRMV_THREAD_TUN strmv_thread_TUN +#define TRMV_THREAD_TLU strmv_thread_TLU +#define TRMV_THREAD_TLN strmv_thread_TLN + +#define ZTRMV_NUU ctrmv_NUU +#define ZTRMV_NUN ctrmv_NUN +#define ZTRMV_NLU ctrmv_NLU +#define ZTRMV_NLN ctrmv_NLN +#define ZTRMV_TUU ctrmv_TUU +#define ZTRMV_TUN ctrmv_TUN +#define ZTRMV_TLU ctrmv_TLU +#define ZTRMV_TLN ctrmv_TLN +#define ZTRMV_RUU ctrmv_RUU +#define ZTRMV_RUN ctrmv_RUN +#define ZTRMV_RLU ctrmv_RLU +#define ZTRMV_RLN ctrmv_RLN +#define ZTRMV_CUU ctrmv_CUU +#define ZTRMV_CUN ctrmv_CUN +#define ZTRMV_CLU ctrmv_CLU +#define ZTRMV_CLN ctrmv_CLN + +#define ZTRMV_THREAD_NUU ctrmv_thread_NUU +#define ZTRMV_THREAD_NUN ctrmv_thread_NUN +#define ZTRMV_THREAD_NLU ctrmv_thread_NLU +#define ZTRMV_THREAD_NLN ctrmv_thread_NLN +#define ZTRMV_THREAD_TUU ctrmv_thread_TUU +#define ZTRMV_THREAD_TUN ctrmv_thread_TUN +#define ZTRMV_THREAD_TLU ctrmv_thread_TLU +#define ZTRMV_THREAD_TLN ctrmv_thread_TLN +#define ZTRMV_THREAD_RUU ctrmv_thread_RUU +#define ZTRMV_THREAD_RUN ctrmv_thread_RUN +#define ZTRMV_THREAD_RLU ctrmv_thread_RLU +#define ZTRMV_THREAD_RLN ctrmv_thread_RLN +#define ZTRMV_THREAD_CUU ctrmv_thread_CUU +#define ZTRMV_THREAD_CUN ctrmv_thread_CUN +#define ZTRMV_THREAD_CLU ctrmv_thread_CLU +#define ZTRMV_THREAD_CLN ctrmv_thread_CLN + +#endif + +#define SGETF2 sgetf2_k +#define DGETF2 dgetf2_k +#define QGETF2 qgetf2_k +#define CGETF2 cgetf2_k +#define ZGETF2 zgetf2_k +#define XGETF2 xgetf2_k + +#define SLASWP_PLUS slaswp_plus +#define SLASWP_MINUS slaswp_minus +#define DLASWP_PLUS dlaswp_plus +#define DLASWP_MINUS dlaswp_minus +#define QLASWP_PLUS qlaswp_plus +#define QLASWP_MINUS qlaswp_minus +#define CLASWP_PLUS claswp_plus +#define CLASWP_MINUS claswp_minus +#define ZLASWP_PLUS zlaswp_plus +#define ZLASWP_MINUS zlaswp_minus +#define XLASWP_PLUS xlaswp_plus +#define XLASWP_MINUS xlaswp_minus + +#define SLARF_L slarf_L +#define SLARF_R slarf_R +#define DLARF_L dlarf_L +#define DLARF_R dlarf_R +#define QLARF_L qlarf_L +#define QLARF_R qlarf_R +#define CLARF_L clarf_L +#define CLARF_R clarf_R +#define ZLARF_L zlarf_L +#define ZLARF_R zlarf_R +#define XLARF_L xlarf_L +#define XLARF_R xlarf_R + +#ifndef COMPLEX +#ifdef XDOUBLE +#define GETF2 QGETF2 +#define GETRF QGETRF +#define GETRS_N_SINGLE qgetrs_N_single +#define GETRS_T_SINGLE qgetrs_T_single +#define GETRS_R_SINGLE qgetrs_N_single +#define GETRS_C_SINGLE qgetrs_T_single +#define GETRS_N_PARALLEL qgetrs_N_parallel +#define GETRS_T_PARALLEL qgetrs_T_parallel +#define GETRS_R_PARALLEL 
qgetrs_N_parallel +#define GETRS_C_PARALLEL qgetrs_T_parallel +#define LASWP_PLUS QLASWP_PLUS +#define LASWP_MINUS QLASWP_MINUS +#define LASWP_NCOPY QLASWP_NCOPY +#define GETRS_N QGETRS_N +#define GETRS_T QGETRS_T +#define GETRF_SINGLE qgetrf_single +#define GETRF_PARALLEL qgetrf_parallel +#define NEG_TCOPY QNEG_TCOPY +#define LARF_L QLARF_L +#define LARF_R QLARF_R +#elif defined(DOUBLE) +#define GETF2 DGETF2 +#define GETRF DGETRF +#define GETRS_N_SINGLE dgetrs_N_single +#define GETRS_T_SINGLE dgetrs_T_single +#define GETRS_R_SINGLE dgetrs_N_single +#define GETRS_C_SINGLE dgetrs_T_single +#define GETRS_N_PARALLEL dgetrs_N_parallel +#define GETRS_T_PARALLEL dgetrs_T_parallel +#define GETRS_R_PARALLEL dgetrs_N_parallel +#define GETRS_C_PARALLEL dgetrs_T_parallel +#define LASWP_PLUS DLASWP_PLUS +#define LASWP_MINUS DLASWP_MINUS +#define LASWP_NCOPY DLASWP_NCOPY +#define GETRS_N DGETRS_N +#define GETRS_T DGETRS_T +#define GETRF_SINGLE dgetrf_single +#define GETRF_PARALLEL dgetrf_parallel +#define NEG_TCOPY DNEG_TCOPY +#define LARF_L DLARF_L +#define LARF_R DLARF_R +#else +#define GETF2 SGETF2 +#define GETRF SGETRF +#define GETRS_N_SINGLE sgetrs_N_single +#define GETRS_T_SINGLE sgetrs_T_single +#define GETRS_R_SINGLE sgetrs_N_single +#define GETRS_C_SINGLE sgetrs_T_single +#define GETRS_N_PARALLEL sgetrs_N_parallel +#define GETRS_T_PARALLEL sgetrs_T_parallel +#define GETRS_R_PARALLEL sgetrs_N_parallel +#define GETRS_C_PARALLEL sgetrs_T_parallel +#define LASWP_PLUS SLASWP_PLUS +#define LASWP_MINUS SLASWP_MINUS +#define LASWP_NCOPY SLASWP_NCOPY +#define GETRS_N SGETRS_N +#define GETRS_T SGETRS_T +#define GETRF_SINGLE sgetrf_single +#define GETRF_PARALLEL sgetrf_parallel +#define NEG_TCOPY SNEG_TCOPY +#define LARF_L SLARF_L +#define LARF_R SLARF_R +#endif +#else +#ifdef XDOUBLE +#define GETF2 XGETF2 +#define GETRF XGETRF +#define GETRS_N_SINGLE xgetrs_N_single +#define GETRS_T_SINGLE xgetrs_T_single +#define GETRS_R_SINGLE xgetrs_R_single +#define GETRS_C_SINGLE xgetrs_C_single +#define GETRS_N_PARALLEL xgetrs_N_parallel +#define GETRS_T_PARALLEL xgetrs_T_parallel +#define GETRS_R_PARALLEL xgetrs_R_parallel +#define GETRS_C_PARALLEL xgetrs_C_parallel +#define LASWP_PLUS XLASWP_PLUS +#define LASWP_MINUS XLASWP_MINUS +#define LASWP_NCOPY XLASWP_NCOPY +#define GETRS_N XGETRS_N +#define GETRS_T XGETRS_T +#define GETRF_SINGLE xgetrf_single +#define GETRF_PARALLEL xgetrf_parallel +#define NEG_TCOPY XNEG_TCOPY +#define LARF_L XLARF_L +#define LARF_R XLARF_R +#elif defined(DOUBLE) +#define GETF2 ZGETF2 +#define GETRF ZGETRF +#define GETRS_N_SINGLE zgetrs_N_single +#define GETRS_T_SINGLE zgetrs_T_single +#define GETRS_R_SINGLE zgetrs_R_single +#define GETRS_C_SINGLE zgetrs_C_single +#define GETRS_N_PARALLEL zgetrs_N_parallel +#define GETRS_T_PARALLEL zgetrs_T_parallel +#define GETRS_R_PARALLEL zgetrs_R_parallel +#define GETRS_C_PARALLEL zgetrs_C_parallel +#define LASWP_PLUS ZLASWP_PLUS +#define LASWP_MINUS ZLASWP_MINUS +#define LASWP_NCOPY ZLASWP_NCOPY +#define GETRS_N ZGETRS_N +#define GETRS_T ZGETRS_T +#define GETRF_SINGLE zgetrf_single +#define GETRF_PARALLEL zgetrf_parallel +#define NEG_TCOPY ZNEG_TCOPY +#define LARF_L ZLARF_L +#define LARF_R ZLARF_R +#else +#define GETF2 CGETF2 +#define GETRF CGETRF +#define GETRS_N_SINGLE cgetrs_N_single +#define GETRS_T_SINGLE cgetrs_T_single +#define GETRS_R_SINGLE cgetrs_R_single +#define GETRS_C_SINGLE cgetrs_C_single +#define GETRS_N_PARALLEL cgetrs_N_parallel +#define GETRS_T_PARALLEL cgetrs_T_parallel +#define GETRS_R_PARALLEL cgetrs_R_parallel +#define 
GETRS_C_PARALLEL cgetrs_C_parallel +#define LASWP_PLUS CLASWP_PLUS +#define LASWP_MINUS CLASWP_MINUS +#define LASWP_NCOPY CLASWP_NCOPY +#define GETRS_N CGETRS_N +#define GETRS_T CGETRS_T +#define GETRF_SINGLE cgetrf_single +#define GETRF_PARALLEL cgetrf_parallel +#define NEG_TCOPY CNEG_TCOPY +#define LARF_L CLARF_L +#define LARF_R CLARF_R +#endif +#endif + +#ifndef COMPLEX +#ifdef XDOUBLE +#define POTF2_U qpotf2_U +#define POTF2_L qpotf2_L +#define LAUU2_U qlauu2_U +#define LAUU2_L qlauu2_L +#define POTRF_U_SINGLE qpotrf_U_single +#define POTRF_L_SINGLE qpotrf_L_single +#define POTRF_U_PARALLEL qpotrf_U_parallel +#define POTRF_L_PARALLEL qpotrf_L_parallel +#define LAUUM_U_SINGLE qlauum_U_single +#define LAUUM_L_SINGLE qlauum_L_single +#define LAUUM_U_PARALLEL qlauum_U_parallel +#define LAUUM_L_PARALLEL qlauum_L_parallel +#define TRTI2_UU qtrti2_UU +#define TRTI2_UN qtrti2_UN +#define TRTI2_LU qtrti2_LU +#define TRTI2_LN qtrti2_LN +#define TRTRI_UU_SINGLE qtrtri_UU_single +#define TRTRI_UN_SINGLE qtrtri_UN_single +#define TRTRI_LU_SINGLE qtrtri_LU_single +#define TRTRI_LN_SINGLE qtrtri_LN_single +#define TRTRI_UU_PARALLEL qtrtri_UU_parallel +#define TRTRI_UN_PARALLEL qtrtri_UN_parallel +#define TRTRI_LU_PARALLEL qtrtri_LU_parallel +#define TRTRI_LN_PARALLEL qtrtri_LN_parallel +#elif defined(DOUBLE) +#define POTF2_U dpotf2_U +#define POTF2_L dpotf2_L +#define LAUU2_U dlauu2_U +#define LAUU2_L dlauu2_L +#define POTRF_U_SINGLE dpotrf_U_single +#define POTRF_L_SINGLE dpotrf_L_single +#define POTRF_U_PARALLEL dpotrf_U_parallel +#define POTRF_L_PARALLEL dpotrf_L_parallel +#define LAUUM_U_SINGLE dlauum_U_single +#define LAUUM_L_SINGLE dlauum_L_single +#define LAUUM_U_PARALLEL dlauum_U_parallel +#define LAUUM_L_PARALLEL dlauum_L_parallel +#define TRTI2_UU dtrti2_UU +#define TRTI2_UN dtrti2_UN +#define TRTI2_LU dtrti2_LU +#define TRTI2_LN dtrti2_LN +#define TRTRI_UU_SINGLE dtrtri_UU_single +#define TRTRI_UN_SINGLE dtrtri_UN_single +#define TRTRI_LU_SINGLE dtrtri_LU_single +#define TRTRI_LN_SINGLE dtrtri_LN_single +#define TRTRI_UU_PARALLEL dtrtri_UU_parallel +#define TRTRI_UN_PARALLEL dtrtri_UN_parallel +#define TRTRI_LU_PARALLEL dtrtri_LU_parallel +#define TRTRI_LN_PARALLEL dtrtri_LN_parallel +#else +#define POTF2_U spotf2_U +#define POTF2_L spotf2_L +#define LAUU2_U slauu2_U +#define LAUU2_L slauu2_L +#define POTRF_U_SINGLE spotrf_U_single +#define POTRF_L_SINGLE spotrf_L_single +#define POTRF_U_PARALLEL spotrf_U_parallel +#define POTRF_L_PARALLEL spotrf_L_parallel +#define LAUUM_U_SINGLE slauum_U_single +#define LAUUM_L_SINGLE slauum_L_single +#define LAUUM_U_PARALLEL slauum_U_parallel +#define LAUUM_L_PARALLEL slauum_L_parallel +#define TRTI2_UU strti2_UU +#define TRTI2_UN strti2_UN +#define TRTI2_LU strti2_LU +#define TRTI2_LN strti2_LN +#define TRTRI_UU_SINGLE strtri_UU_single +#define TRTRI_UN_SINGLE strtri_UN_single +#define TRTRI_LU_SINGLE strtri_LU_single +#define TRTRI_LN_SINGLE strtri_LN_single +#define TRTRI_UU_PARALLEL strtri_UU_parallel +#define TRTRI_UN_PARALLEL strtri_UN_parallel +#define TRTRI_LU_PARALLEL strtri_LU_parallel +#define TRTRI_LN_PARALLEL strtri_LN_parallel +#endif +#else +#ifdef XDOUBLE +#define POTF2_U xpotf2_U +#define POTF2_L xpotf2_L +#define LAUU2_U xlauu2_U +#define LAUU2_L xlauu2_L +#define POTRF_U_SINGLE xpotrf_U_single +#define POTRF_L_SINGLE xpotrf_L_single +#define POTRF_U_PARALLEL xpotrf_U_parallel +#define POTRF_L_PARALLEL xpotrf_L_parallel +#define LAUUM_U_SINGLE xlauum_U_single +#define LAUUM_L_SINGLE xlauum_L_single +#define LAUUM_U_PARALLEL 
xlauum_U_parallel +#define LAUUM_L_PARALLEL xlauum_L_parallel +#define TRTI2_UU xtrti2_UU +#define TRTI2_UN xtrti2_UN +#define TRTI2_LU xtrti2_LU +#define TRTI2_LN xtrti2_LN +#define TRTRI_UU_SINGLE xtrtri_UU_single +#define TRTRI_UN_SINGLE xtrtri_UN_single +#define TRTRI_LU_SINGLE xtrtri_LU_single +#define TRTRI_LN_SINGLE xtrtri_LN_single +#define TRTRI_UU_PARALLEL xtrtri_UU_parallel +#define TRTRI_UN_PARALLEL xtrtri_UN_parallel +#define TRTRI_LU_PARALLEL xtrtri_LU_parallel +#define TRTRI_LN_PARALLEL xtrtri_LN_parallel +#elif defined(DOUBLE) +#define POTF2_U zpotf2_U +#define POTF2_L zpotf2_L +#define LAUU2_U zlauu2_U +#define LAUU2_L zlauu2_L +#define POTRF_U_SINGLE zpotrf_U_single +#define POTRF_L_SINGLE zpotrf_L_single +#define POTRF_U_PARALLEL zpotrf_U_parallel +#define POTRF_L_PARALLEL zpotrf_L_parallel +#define LAUUM_U_SINGLE zlauum_U_single +#define LAUUM_L_SINGLE zlauum_L_single +#define LAUUM_U_PARALLEL zlauum_U_parallel +#define LAUUM_L_PARALLEL zlauum_L_parallel +#define TRTI2_UU ztrti2_UU +#define TRTI2_UN ztrti2_UN +#define TRTI2_LU ztrti2_LU +#define TRTI2_LN ztrti2_LN +#define TRTRI_UU_SINGLE ztrtri_UU_single +#define TRTRI_UN_SINGLE ztrtri_UN_single +#define TRTRI_LU_SINGLE ztrtri_LU_single +#define TRTRI_LN_SINGLE ztrtri_LN_single +#define TRTRI_UU_PARALLEL ztrtri_UU_parallel +#define TRTRI_UN_PARALLEL ztrtri_UN_parallel +#define TRTRI_LU_PARALLEL ztrtri_LU_parallel +#define TRTRI_LN_PARALLEL ztrtri_LN_parallel +#else +#define POTF2_U cpotf2_U +#define POTF2_L cpotf2_L +#define LAUU2_U clauu2_U +#define LAUU2_L clauu2_L +#define POTRF_U_SINGLE cpotrf_U_single +#define POTRF_L_SINGLE cpotrf_L_single +#define POTRF_U_PARALLEL cpotrf_U_parallel +#define POTRF_L_PARALLEL cpotrf_L_parallel +#define LAUUM_U_SINGLE clauum_U_single +#define LAUUM_L_SINGLE clauum_L_single +#define LAUUM_U_PARALLEL clauum_U_parallel +#define LAUUM_L_PARALLEL clauum_L_parallel +#define TRTI2_UU ctrti2_UU +#define TRTI2_UN ctrti2_UN +#define TRTI2_LU ctrti2_LU +#define TRTI2_LN ctrti2_LN +#define TRTRI_UU_SINGLE ctrtri_UU_single +#define TRTRI_UN_SINGLE ctrtri_UN_single +#define TRTRI_LU_SINGLE ctrtri_LU_single +#define TRTRI_LN_SINGLE ctrtri_LN_single +#define TRTRI_UU_PARALLEL ctrtri_UU_parallel +#define TRTRI_UN_PARALLEL ctrtri_UN_parallel +#define TRTRI_LU_PARALLEL ctrtri_LU_parallel +#define TRTRI_LN_PARALLEL ctrtri_LN_parallel +#endif +#endif + +#endif diff --git a/common_mips64.h b/common_mips64.h new file mode 100644 index 0000000000..332af3ef56 --- /dev/null +++ b/common_mips64.h @@ -0,0 +1,197 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef COMMON_MIPS64 +#define COMMON_MIPS64 + +#define MB +#define WMB + +#define INLINE inline + +#ifndef ASSEMBLER + +static void INLINE blas_lock(volatile unsigned long *address){ + + long int ret, val = 1; + + do { + while (*address) {YIELDING;}; + + __asm__ __volatile__( + "1: ll %0, %3\n" + " ori %2, %0, 1\n" + " sc %2, %1\n" + " beqz %2, 1b\n" + " andi %2, %0, 1\n" + " sync\n" + : "=&r" (val), "=m" (address), "=&r" (ret) + : "m" (address) + : "memory"); + + } while (ret); +} + +static inline unsigned int rpcc(void){ + unsigned long ret; + + __asm__ __volatile__(".set push \n" + ".set mips32r2\n" + "rdhwr %0, $30 \n" + ".set pop" : "=r"(ret) : : "memory"); + + return ret; +} + +static inline int blas_quickdivide(blasint x, blasint y){ + return x / y; +} + +#ifdef DOUBLE +#define GET_IMAGE(res) __asm__ __volatile__("mov.d %0, $f2" : "=f"(res) : : "memory") +#else +#define GET_IMAGE(res) __asm__ __volatile__("mov.s %0, $f2" : "=f"(res) : : "memory") +#endif + +#define GET_IMAGE_CANCEL + +#endif + + +#ifdef ASSEMBLER + +#define HALT teq $0, $0 +#define NOP move $0, $0 + +#ifdef DOUBLE +#define LD ldc1 +#define ST sdc1 +#define MADD madd.d +#define NMADD nmadd.d +#define MSUB msub.d +#define NMSUB nmsub.d +#define ADD add.d +#define SUB sub.d +#define MUL mul.d +#define MOV mov.d +#define CMOVF movf.d +#define CMOVT movt.d +#define MTC dmtc1 +#define FABS abs.d +#define CMPEQ c.eq.d +#define CMPLE c.le.d +#define CMPLT c.lt.d +#else +#define LD lwc1 +#define ST swc1 +#define MADD madd.s +#define NMADD nmadd.s +#define MSUB msub.s +#define NMSUB nmsub.s +#define ADD add.s +#define SUB sub.s +#define MUL mul.s +#define MOV mov.s +#define CMOVF movf.s +#define CMOVT movt.s +#define MTC mtc1 +#define FABS abs.s +#define CMPEQ c.eq.s +#define CMPLE c.le.s +#define CMPLT c.lt.s +#endif + +#if defined(__64BIT__) && defined(USE64BITINT) +#define LDINT ld +#define LDARG ld +#define SDARG sd +#elif defined(__64BIT__) && !defined(USE64BITINT) +#define LDINT lw +#define LDARG ld +#define SDARG sd +#else +#define LDINT lw +#define LDARG lw +#define SDARG sw +#endif + + +#ifndef F_INTERFACE +#define REALNAME ASMNAME +#else +#define REALNAME ASMFNAME +#endif + +#if defined(ASSEMBLER) && !defined(NEEDPARAM) + +#define PROLOGUE \ + .text ;\ + .set mips64 ;\ + .align 5 ;\ + .globl REALNAME ;\ + .ent REALNAME ;\ + .type REALNAME, @function ;\ +REALNAME: ;\ + .set noreorder ;\ + .set nomacro + +#define EPILOGUE \ + .set macro ;\ + .set reorder ;\ + .end REALNAME + +#define PROFCODE +#endif + +#endif + +#define SEEK_ADDRESS + +#define BUFFER_SIZE ( 8 << 20) + +#ifndef PAGESIZE +#define 
PAGESIZE (64UL << 10) +#endif +#define HUGE_PAGESIZE ( 2 << 20) + +#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) + +#ifndef MAP_ANONYMOUS +#define MAP_ANONYMOUS MAP_ANON +#endif +#endif diff --git a/common_param.h b/common_param.h new file mode 100644 index 0000000000..c4580cc22e --- /dev/null +++ b/common_param.h @@ -0,0 +1,1098 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#ifndef COMMON_PARAM_H +#define COMMON_PARAM_H + +#ifndef ASSEMBLER + +#ifdef DYNAMIC_ARCH + +typedef struct { + int offsetA, offsetB, align; + + int sgemm_p, sgemm_q, sgemm_r; + int sgemm_unroll_m, sgemm_unroll_n, sgemm_unroll_mn; + + int exclusive_cache; + + float (*samax_k) (BLASLONG, float *, BLASLONG); + float (*samin_k) (BLASLONG, float *, BLASLONG); + float (*smax_k) (BLASLONG, float *, BLASLONG); + float (*smin_k) (BLASLONG, float *, BLASLONG); +BLASLONG (*isamax_k)(BLASLONG, float *, BLASLONG); +BLASLONG (*isamin_k)(BLASLONG, float *, BLASLONG); +BLASLONG (*ismax_k) (BLASLONG, float *, BLASLONG); +BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); + + float (*snrm2_k) (BLASLONG, float *, BLASLONG); + float (*sasum_k) (BLASLONG, float *, BLASLONG); + int (*scopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); + float (*sdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); + double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); + + int (*srot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); + + int (*saxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); + int (*sscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); + int (*sswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); + + int (*sgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*sgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*sger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + + int (*ssymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*ssymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + + int (*sgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG); + int (*sgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); + + int (*sgemm_incopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); + int (*sgemm_itcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); + int (*sgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); + int (*sgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); + + int (*strsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*strsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*strsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*strsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + + int (*strsm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*strsm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*strsm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*strsm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*strsm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*strsm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*strsm_iltucopy)(BLASLONG, 
BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*strsm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*strsm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*strsm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*strsm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*strsm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*strsm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*strsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*strsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*strsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + + int (*strmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*strmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*strmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*strmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + + int (*strmm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*strmm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*strmm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*strmm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*strmm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*strmm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*strmm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*strmm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*strmm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*strmm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*strmm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*strmm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*strmm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*strmm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*strmm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*strmm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + + int (*ssymm_iutcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*ssymm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*ssymm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*ssymm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + + int (*sneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); + int (*slaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); + + int dgemm_p, dgemm_q, dgemm_r; + int dgemm_unroll_m, dgemm_unroll_n, dgemm_unroll_mn; + + double (*damax_k) (BLASLONG, double *, BLASLONG); + double (*damin_k) (BLASLONG, double *, BLASLONG); + double (*dmax_k) (BLASLONG, double *, BLASLONG); + double (*dmin_k) (BLASLONG, double *, 
BLASLONG); +BLASLONG (*idamax_k)(BLASLONG, double *, BLASLONG); +BLASLONG (*idamin_k)(BLASLONG, double *, BLASLONG); +BLASLONG (*idmax_k) (BLASLONG, double *, BLASLONG); +BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); + + double (*dnrm2_k) (BLASLONG, double *, BLASLONG); + double (*dasum_k) (BLASLONG, double *, BLASLONG); + int (*dcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); + double (*ddot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); + int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); + + int (*daxpy_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); + int (*dscal_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); + int (*dswap_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); + + int (*dgemv_n) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*dgemv_t) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*dger_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + + int (*dsymv_L) (BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*dsymv_U) (BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + + int (*dgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG); + int (*dgemm_beta )(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); + + int (*dgemm_incopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); + int (*dgemm_itcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); + int (*dgemm_oncopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); + int (*dgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); + + int (*dtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*dtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*dtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*dtrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); + + int (*dtrsm_iunucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*dtrsm_iunncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*dtrsm_iutucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*dtrsm_iutncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*dtrsm_ilnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*dtrsm_ilnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*dtrsm_iltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*dtrsm_iltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*dtrsm_ounucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*dtrsm_ounncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*dtrsm_outucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*dtrsm_outncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int 
(*dtrsm_olnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*dtrsm_olnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*dtrsm_oltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*dtrsm_oltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + + int (*dtrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*dtrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*dtrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*dtrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); + + int (*dtrmm_iunucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*dtrmm_iunncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*dtrmm_iutucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*dtrmm_iutncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*dtrmm_ilnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*dtrmm_ilnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*dtrmm_iltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*dtrmm_iltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*dtrmm_ounucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*dtrmm_ounncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*dtrmm_outucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*dtrmm_outncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*dtrmm_olnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*dtrmm_olnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*dtrmm_oltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*dtrmm_oltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + + int (*dsymm_iutcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*dsymm_iltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*dsymm_outcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*dsymm_oltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + + int (*dneg_tcopy) (BLASLONG, BLASLONG, double *, BLASLONG, double *); + int (*dlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, double *, BLASLONG, blasint *, double *); + +#ifdef EXPRECISION + + int qgemm_p, qgemm_q, qgemm_r; + int qgemm_unroll_m, qgemm_unroll_n, qgemm_unroll_mn; + + xdouble (*qamax_k) (BLASLONG, xdouble *, BLASLONG); + xdouble (*qamin_k) (BLASLONG, xdouble *, BLASLONG); + xdouble (*qmax_k) (BLASLONG, xdouble *, BLASLONG); + xdouble (*qmin_k) (BLASLONG, xdouble *, BLASLONG); +BLASLONG (*iqamax_k)(BLASLONG, xdouble *, BLASLONG); +BLASLONG (*iqamin_k)(BLASLONG, xdouble *, BLASLONG); +BLASLONG (*iqmax_k) (BLASLONG, xdouble *, BLASLONG); +BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG); + + xdouble (*qnrm2_k) (BLASLONG, xdouble *, BLASLONG); + xdouble (*qasum_k) (BLASLONG, xdouble *, BLASLONG); + int (*qcopy_k) (BLASLONG, xdouble *, 
BLASLONG, xdouble *, BLASLONG); + xdouble (*qdot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); + int (*qrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble); + + int (*qaxpy_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); + int (*qscal_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); + int (*qswap_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); + + int (*qgemv_n) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*qgemv_t) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*qger_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + + int (*qsymv_L) (BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*qsymv_U) (BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + + int (*qgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); + int (*qgemm_beta )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); + + int (*qgemm_incopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*qgemm_itcopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*qgemm_oncopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*qgemm_otcopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); + + int (*qtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*qtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*qtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*qtrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + + int (*qtrsm_iunucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*qtrsm_iunncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*qtrsm_iutucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*qtrsm_iutncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*qtrsm_ilnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*qtrsm_ilnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*qtrsm_iltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*qtrsm_iltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*qtrsm_ounucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*qtrsm_ounncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*qtrsm_outucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*qtrsm_outncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*qtrsm_olnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*qtrsm_olnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*qtrsm_oltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*qtrsm_oltncopy)(BLASLONG, 
BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + + int (*qtrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*qtrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*qtrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*qtrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + + int (*qtrmm_iunucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*qtrmm_iunncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*qtrmm_iutucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*qtrmm_iutncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*qtrmm_ilnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*qtrmm_ilnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*qtrmm_iltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*qtrmm_iltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*qtrmm_ounucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*qtrmm_ounncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*qtrmm_outucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*qtrmm_outncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*qtrmm_olnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*qtrmm_olnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*qtrmm_oltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*qtrmm_oltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + + int (*qsymm_iutcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*qsymm_iltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*qsymm_outcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*qsymm_oltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + + int (*qneg_tcopy) (BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*qlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, xdouble *, BLASLONG, blasint *, xdouble *); + +#endif + + int cgemm_p, cgemm_q, cgemm_r; + int cgemm_unroll_m, cgemm_unroll_n, cgemm_unroll_mn; + + float (*camax_k) (BLASLONG, float *, BLASLONG); + float (*camin_k) (BLASLONG, float *, BLASLONG); +BLASLONG (*icamax_k)(BLASLONG, float *, BLASLONG); +BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); + + float (*cnrm2_k) (BLASLONG, float *, BLASLONG); + float (*casum_k) (BLASLONG, float *, BLASLONG); + int (*ccopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); + float _Complex (*cdotu_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); + float _Complex (*cdotc_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); + int (*csrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); + + int (*caxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); + int (*caxpyc_k)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, 
float *, BLASLONG, float *, BLASLONG); + int (*cscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); + int (*cswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); + + int (*cgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*cgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*cgemv_r) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*cgemv_c) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*cgemv_o) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*cgemv_u) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*cgemv_s) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*cgemv_d) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*cgeru_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*cgerc_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*cgerv_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*cgerd_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + + int (*csymv_L) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*csymv_U) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*chemv_L) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*chemv_U) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*chemv_M) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*chemv_V) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + + int (*cgemm_kernel_n )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); + int (*cgemm_kernel_l )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); + int (*cgemm_kernel_r )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); + int (*cgemm_kernel_b )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); + int (*cgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); + + int (*cgemm_incopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); + int (*cgemm_itcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); + int (*cgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); + int (*cgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); + + int (*ctrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*ctrsm_kernel_LT)(BLASLONG, BLASLONG, 
BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*ctrsm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*ctrsm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*ctrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*ctrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*ctrsm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*ctrsm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); + + int (*ctrsm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*ctrsm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*ctrsm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*ctrsm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*ctrsm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*ctrsm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*ctrsm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*ctrsm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*ctrsm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*ctrsm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*ctrsm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*ctrsm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*ctrsm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*ctrsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*ctrsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*ctrsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + + int (*ctrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*ctrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*ctrmm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*ctrmm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*ctrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*ctrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*ctrmm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*ctrmm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); + + int (*ctrmm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*ctrmm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*ctrmm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*ctrmm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*ctrmm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*ctrmm_ilnncopy)(BLASLONG, BLASLONG, 
float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*ctrmm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*ctrmm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*ctrmm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*ctrmm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*ctrmm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*ctrmm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*ctrmm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*ctrmm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*ctrmm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*ctrmm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + + int (*csymm_iutcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*csymm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*csymm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*csymm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + + int (*chemm_iutcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*chemm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*chemm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*chemm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + + int (*cgemm3m_kernel)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); + + int (*cgemm3m_incopyb)(BLASLONG, BLASLONG, float *, BLASLONG, float *); + int (*cgemm3m_incopyr)(BLASLONG, BLASLONG, float *, BLASLONG, float *); + int (*cgemm3m_incopyi)(BLASLONG, BLASLONG, float *, BLASLONG, float *); + int (*cgemm3m_itcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, float *); + int (*cgemm3m_itcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, float *); + int (*cgemm3m_itcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, float *); + + int (*cgemm3m_oncopyb)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); + int (*cgemm3m_oncopyr)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); + int (*cgemm3m_oncopyi)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); + int (*cgemm3m_otcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); + int (*cgemm3m_otcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); + int (*cgemm3m_otcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, float, float, float *); + + int (*csymm3m_iucopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*csymm3m_ilcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*csymm3m_iucopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*csymm3m_ilcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*csymm3m_iucopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*csymm3m_ilcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + + int (*csymm3m_oucopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); + int (*csymm3m_olcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, 
BLASLONG, float, float, float *); + int (*csymm3m_oucopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); + int (*csymm3m_olcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); + int (*csymm3m_oucopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); + int (*csymm3m_olcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); + + int (*chemm3m_iucopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*chemm3m_ilcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*chemm3m_iucopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*chemm3m_ilcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*chemm3m_iucopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*chemm3m_ilcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + + int (*chemm3m_oucopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); + int (*chemm3m_olcopyb)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); + int (*chemm3m_oucopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); + int (*chemm3m_olcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); + int (*chemm3m_oucopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); + int (*chemm3m_olcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); + + int (*cneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); + int (*claswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); + + int zgemm_p, zgemm_q, zgemm_r; + int zgemm_unroll_m, zgemm_unroll_n, zgemm_unroll_mn; + + double (*zamax_k) (BLASLONG, double *, BLASLONG); + double (*zamin_k) (BLASLONG, double *, BLASLONG); +BLASLONG (*izamax_k)(BLASLONG, double *, BLASLONG); +BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG); + + double (*znrm2_k) (BLASLONG, double *, BLASLONG); + double (*zasum_k) (BLASLONG, double *, BLASLONG); + int (*zcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); + double _Complex (*zdotu_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); + double _Complex (*zdotc_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); + int (*zdrot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); + + int (*zaxpy_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); + int (*zaxpyc_k)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); + int (*zscal_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); + int (*zswap_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); + + int (*zgemv_n) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*zgemv_t) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*zgemv_r) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*zgemv_c) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, 
BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*zgemv_o) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*zgemv_u) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*zgemv_s) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*zgemv_d) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*zgeru_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*zgerc_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*zgerv_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*zgerd_k) (BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + + int (*zsymv_L) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*zsymv_U) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*zhemv_L) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*zhemv_U) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*zhemv_M) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + int (*zhemv_V) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); + + int (*zgemm_kernel_n )(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); + int (*zgemm_kernel_l )(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); + int (*zgemm_kernel_r )(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); + int (*zgemm_kernel_b )(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); + int (*zgemm_beta )(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); + + int (*zgemm_incopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); + int (*zgemm_itcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); + int (*zgemm_oncopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); + int (*zgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); + + int (*ztrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*ztrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*ztrsm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*ztrsm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*ztrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*ztrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*ztrsm_kernel_RR)(BLASLONG, BLASLONG, 
BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*ztrsm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); + + int (*ztrsm_iunucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*ztrsm_iunncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*ztrsm_iutucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*ztrsm_iutncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*ztrsm_ilnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*ztrsm_ilnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*ztrsm_iltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*ztrsm_iltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*ztrsm_ounucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*ztrsm_ounncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*ztrsm_outucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*ztrsm_outncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*ztrsm_olnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*ztrsm_olnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*ztrsm_oltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + int (*ztrsm_oltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, double *); + + int (*ztrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*ztrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*ztrmm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*ztrmm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*ztrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*ztrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*ztrmm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); + int (*ztrmm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); + + int (*ztrmm_iunucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*ztrmm_iunncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*ztrmm_iutucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*ztrmm_iutncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*ztrmm_ilnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*ztrmm_ilnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*ztrmm_iltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*ztrmm_iltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*ztrmm_ounucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*ztrmm_ounncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*ztrmm_outucopy)(BLASLONG, BLASLONG, double *, BLASLONG, 
BLASLONG, BLASLONG, double *); + int (*ztrmm_outncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*ztrmm_olnucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*ztrmm_olnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*ztrmm_oltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*ztrmm_oltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + + int (*zsymm_iutcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*zsymm_iltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*zsymm_outcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*zsymm_oltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + + int (*zhemm_iutcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*zhemm_iltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*zhemm_outcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*zhemm_oltcopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + + int (*zgemm3m_kernel)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); + + int (*zgemm3m_incopyb)(BLASLONG, BLASLONG, double *, BLASLONG, double *); + int (*zgemm3m_incopyr)(BLASLONG, BLASLONG, double *, BLASLONG, double *); + int (*zgemm3m_incopyi)(BLASLONG, BLASLONG, double *, BLASLONG, double *); + int (*zgemm3m_itcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, double *); + int (*zgemm3m_itcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, double *); + int (*zgemm3m_itcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, double *); + + int (*zgemm3m_oncopyb)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); + int (*zgemm3m_oncopyr)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); + int (*zgemm3m_oncopyi)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); + int (*zgemm3m_otcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); + int (*zgemm3m_otcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); + int (*zgemm3m_otcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, double, double, double *); + + int (*zsymm3m_iucopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*zsymm3m_ilcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*zsymm3m_iucopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*zsymm3m_ilcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*zsymm3m_iucopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*zsymm3m_ilcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + + int (*zsymm3m_oucopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); + int (*zsymm3m_olcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); + int (*zsymm3m_oucopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); + int (*zsymm3m_olcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); + int (*zsymm3m_oucopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); + int 
(*zsymm3m_olcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); + + int (*zhemm3m_iucopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*zhemm3m_ilcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*zhemm3m_iucopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*zhemm3m_ilcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*zhemm3m_iucopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*zhemm3m_ilcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + + int (*zhemm3m_oucopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); + int (*zhemm3m_olcopyb)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); + int (*zhemm3m_oucopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); + int (*zhemm3m_olcopyr)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); + int (*zhemm3m_oucopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); + int (*zhemm3m_olcopyi)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double, double, double *); + + int (*zneg_tcopy) (BLASLONG, BLASLONG, double *, BLASLONG, double *); + int (*zlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, double *, BLASLONG, blasint *, double *); + +#ifdef EXPRECISION + + int xgemm_p, xgemm_q, xgemm_r; + int xgemm_unroll_m, xgemm_unroll_n, xgemm_unroll_mn; + + xdouble (*xamax_k) (BLASLONG, xdouble *, BLASLONG); + xdouble (*xamin_k) (BLASLONG, xdouble *, BLASLONG); +BLASLONG (*ixamax_k)(BLASLONG, xdouble *, BLASLONG); +BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); + + xdouble (*xnrm2_k) (BLASLONG, xdouble *, BLASLONG); + xdouble (*xasum_k) (BLASLONG, xdouble *, BLASLONG); + int (*xcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); + xdouble _Complex (*xdotu_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); + xdouble _Complex (*xdotc_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); + int (*xqrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble); + + int (*xaxpy_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); + int (*xaxpyc_k)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); + int (*xscal_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); + int (*xswap_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); + + int (*xgemv_n) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgemv_t) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgemv_r) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgemv_c) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgemv_o) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgemv_u) (BLASLONG, 
BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgemv_s) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgemv_d) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgeru_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgerc_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgerv_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgerd_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + + int (*xsymv_L) (BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xsymv_U) (BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xhemv_L) (BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xhemv_U) (BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xhemv_M) (BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xhemv_V) (BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); + + int (*xgemm_kernel_n )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); + int (*xgemm_kernel_l )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); + int (*xgemm_kernel_r )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); + int (*xgemm_kernel_b )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); + int (*xgemm_beta )(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); + + int (*xgemm_incopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgemm_itcopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgemm_oncopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgemm_otcopy )(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); + + int (*xtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*xtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*xtrsm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*xtrsm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*xtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*xtrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*xtrsm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int 
(*xtrsm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + + int (*xtrsm_iunucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*xtrsm_iunncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*xtrsm_iutucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*xtrsm_iutncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*xtrsm_ilnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*xtrsm_ilnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*xtrsm_iltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*xtrsm_iltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*xtrsm_ounucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*xtrsm_ounncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*xtrsm_outucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*xtrsm_outncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*xtrsm_olnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*xtrsm_olnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*xtrsm_oltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + int (*xtrsm_oltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, xdouble *); + + int (*xtrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*xtrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*xtrmm_kernel_RR)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*xtrmm_kernel_RC)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*xtrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*xtrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*xtrmm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + int (*xtrmm_kernel_LC)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG, BLASLONG); + + int (*xtrmm_iunucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xtrmm_iunncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xtrmm_iutucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xtrmm_iutncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xtrmm_ilnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xtrmm_ilnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xtrmm_iltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xtrmm_iltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xtrmm_ounucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xtrmm_ounncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xtrmm_outucopy)(BLASLONG, BLASLONG, 
xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xtrmm_outncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xtrmm_olnucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xtrmm_olnncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xtrmm_oltucopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xtrmm_oltncopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + + int (*xsymm_iutcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xsymm_iltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xsymm_outcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xsymm_oltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + + int (*xhemm_iutcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xhemm_iltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xhemm_outcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xhemm_oltcopy)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + + int (*xgemm3m_kernel)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); + + int (*xgemm3m_incopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgemm3m_incopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgemm3m_incopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgemm3m_itcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgemm3m_itcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xgemm3m_itcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); + + int (*xgemm3m_oncopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); + int (*xgemm3m_oncopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); + int (*xgemm3m_oncopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); + int (*xgemm3m_otcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); + int (*xgemm3m_otcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); + int (*xgemm3m_otcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble, xdouble *); + + int (*xsymm3m_iucopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xsymm3m_ilcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xsymm3m_iucopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xsymm3m_ilcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xsymm3m_iucopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xsymm3m_ilcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + + int (*xsymm3m_oucopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); + int (*xsymm3m_olcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); + int (*xsymm3m_oucopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); + int (*xsymm3m_olcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); + int 
(*xsymm3m_oucopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); + int (*xsymm3m_olcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); + + int (*xhemm3m_iucopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xhemm3m_ilcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xhemm3m_iucopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xhemm3m_ilcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xhemm3m_iucopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + int (*xhemm3m_ilcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble *); + + int (*xhemm3m_oucopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); + int (*xhemm3m_olcopyb)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); + int (*xhemm3m_oucopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); + int (*xhemm3m_olcopyr)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); + int (*xhemm3m_oucopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); + int (*xhemm3m_olcopyi)(BLASLONG, BLASLONG, xdouble *, BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *); + + int (*xneg_tcopy) (BLASLONG, BLASLONG, xdouble *, BLASLONG, xdouble *); + int (*xlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, xdouble *, BLASLONG, blasint *, xdouble *); + +#endif + + void (*init)(void); + + int snum_opt, dnum_opt, qnum_opt; + +} gotoblas_t; + +extern gotoblas_t *gotoblas; + +#define GEMM_OFFSET_A gotoblas -> offsetA +#define GEMM_OFFSET_B gotoblas -> offsetB +#define GEMM_ALIGN gotoblas -> align + +#define HAVE_EX_L2 gotoblas -> exclusive_cache + +#define SGEMM_P gotoblas -> sgemm_p +#define SGEMM_Q gotoblas -> sgemm_q +#define SGEMM_R gotoblas -> sgemm_r +#define SGEMM_UNROLL_M gotoblas -> sgemm_unroll_m +#define SGEMM_UNROLL_N gotoblas -> sgemm_unroll_n +#define SGEMM_UNROLL_MN gotoblas -> sgemm_unroll_mn + +#define DGEMM_P gotoblas -> dgemm_p +#define DGEMM_Q gotoblas -> dgemm_q +#define DGEMM_R gotoblas -> dgemm_r +#define DGEMM_UNROLL_M gotoblas -> dgemm_unroll_m +#define DGEMM_UNROLL_N gotoblas -> dgemm_unroll_n +#define DGEMM_UNROLL_MN gotoblas -> dgemm_unroll_mn + +#define QGEMM_P gotoblas -> qgemm_p +#define QGEMM_Q gotoblas -> qgemm_q +#define QGEMM_R gotoblas -> qgemm_r +#define QGEMM_UNROLL_M gotoblas -> qgemm_unroll_m +#define QGEMM_UNROLL_N gotoblas -> qgemm_unroll_n +#define QGEMM_UNROLL_MN gotoblas -> qgemm_unroll_mn + +#define CGEMM_P gotoblas -> cgemm_p +#define CGEMM_Q gotoblas -> cgemm_q +#define CGEMM_R gotoblas -> cgemm_r +#define CGEMM_UNROLL_M gotoblas -> cgemm_unroll_m +#define CGEMM_UNROLL_N gotoblas -> cgemm_unroll_n +#define CGEMM_UNROLL_MN gotoblas -> cgemm_unroll_mn + +#define ZGEMM_P gotoblas -> zgemm_p +#define ZGEMM_Q gotoblas -> zgemm_q +#define ZGEMM_R gotoblas -> zgemm_r +#define ZGEMM_UNROLL_M gotoblas -> zgemm_unroll_m +#define ZGEMM_UNROLL_N gotoblas -> zgemm_unroll_n +#define ZGEMM_UNROLL_MN gotoblas -> zgemm_unroll_mn + +#define XGEMM_P gotoblas -> xgemm_p +#define XGEMM_Q gotoblas -> xgemm_q +#define XGEMM_R gotoblas -> xgemm_r +#define XGEMM_UNROLL_M gotoblas -> xgemm_unroll_m +#define XGEMM_UNROLL_N gotoblas -> xgemm_unroll_n +#define 
XGEMM_UNROLL_MN gotoblas -> xgemm_unroll_mn + +#else + +#define GEMM_OFFSET_A GEMM_DEFAULT_OFFSET_A +#define GEMM_OFFSET_B GEMM_DEFAULT_OFFSET_B +#define GEMM_ALIGN GEMM_DEFAULT_ALIGN + +#ifdef HAVE_EXCLUSIVE_CACHE +#define HAVE_EX_L2 1 +#else +#define HAVE_EX_L2 0 +#endif + +#define SGEMM_P SGEMM_DEFAULT_P +#define SGEMM_Q SGEMM_DEFAULT_Q +#define SGEMM_R SGEMM_DEFAULT_R +#define SGEMM_UNROLL_M SGEMM_DEFAULT_UNROLL_M +#define SGEMM_UNROLL_N SGEMM_DEFAULT_UNROLL_N +#define SGEMM_UNROLL_MN MAX((SGEMM_UNROLL_M), (SGEMM_UNROLL_N)) + +#define DGEMM_P DGEMM_DEFAULT_P +#define DGEMM_Q DGEMM_DEFAULT_Q +#define DGEMM_R DGEMM_DEFAULT_R +#define DGEMM_UNROLL_M DGEMM_DEFAULT_UNROLL_M +#define DGEMM_UNROLL_N DGEMM_DEFAULT_UNROLL_N +#define DGEMM_UNROLL_MN MAX((DGEMM_UNROLL_M), (DGEMM_UNROLL_N)) + +#define QGEMM_P QGEMM_DEFAULT_P +#define QGEMM_Q QGEMM_DEFAULT_Q +#define QGEMM_R QGEMM_DEFAULT_R +#define QGEMM_UNROLL_M QGEMM_DEFAULT_UNROLL_M +#define QGEMM_UNROLL_N QGEMM_DEFAULT_UNROLL_N +#define QGEMM_UNROLL_MN MAX((QGEMM_UNROLL_M), (QGEMM_UNROLL_N)) + +#define CGEMM_P CGEMM_DEFAULT_P +#define CGEMM_Q CGEMM_DEFAULT_Q +#define CGEMM_R CGEMM_DEFAULT_R +#define CGEMM_UNROLL_M CGEMM_DEFAULT_UNROLL_M +#define CGEMM_UNROLL_N CGEMM_DEFAULT_UNROLL_N +#define CGEMM_UNROLL_MN MAX((CGEMM_UNROLL_M), (CGEMM_UNROLL_N)) + +#define ZGEMM_P ZGEMM_DEFAULT_P +#define ZGEMM_Q ZGEMM_DEFAULT_Q +#define ZGEMM_R ZGEMM_DEFAULT_R +#define ZGEMM_UNROLL_M ZGEMM_DEFAULT_UNROLL_M +#define ZGEMM_UNROLL_N ZGEMM_DEFAULT_UNROLL_N +#define ZGEMM_UNROLL_MN MAX((ZGEMM_UNROLL_M), (ZGEMM_UNROLL_N)) + +#define XGEMM_P XGEMM_DEFAULT_P +#define XGEMM_Q XGEMM_DEFAULT_Q +#define XGEMM_R XGEMM_DEFAULT_R +#define XGEMM_UNROLL_M XGEMM_DEFAULT_UNROLL_M +#define XGEMM_UNROLL_N XGEMM_DEFAULT_UNROLL_N +#define XGEMM_UNROLL_MN MAX((XGEMM_UNROLL_M), (XGEMM_UNROLL_N)) + +#endif +#endif + +#ifndef COMPLEX +#if defined(XDOUBLE) +#define GEMM_P QGEMM_P +#define GEMM_Q QGEMM_Q +#define GEMM_R QGEMM_R +#define GEMM_UNROLL_M QGEMM_UNROLL_M +#define GEMM_UNROLL_N QGEMM_UNROLL_N +#define GEMM_UNROLL_MN QGEMM_UNROLL_MN +#define GEMM_DEFAULT_P QGEMM_DEFAULT_P +#define GEMM_DEFAULT_Q QGEMM_DEFAULT_Q +#define GEMM_DEFAULT_R QGEMM_DEFAULT_R +#define GEMM_DEFAULT_UNROLL_M QGEMM_DEFAULT_UNROLL_M +#define GEMM_DEFAULT_UNROLL_N QGEMM_DEFAULT_UNROLL_N +#elif defined(DOUBLE) +#define GEMM_P DGEMM_P +#define GEMM_Q DGEMM_Q +#define GEMM_R DGEMM_R +#define GEMM_UNROLL_M DGEMM_UNROLL_M +#define GEMM_UNROLL_N DGEMM_UNROLL_N +#define GEMM_UNROLL_MN DGEMM_UNROLL_MN +#define GEMM_DEFAULT_P DGEMM_DEFAULT_P +#define GEMM_DEFAULT_Q DGEMM_DEFAULT_Q +#define GEMM_DEFAULT_R DGEMM_DEFAULT_R +#define GEMM_DEFAULT_UNROLL_M DGEMM_DEFAULT_UNROLL_M +#define GEMM_DEFAULT_UNROLL_N DGEMM_DEFAULT_UNROLL_N +#else +#define GEMM_P SGEMM_P +#define GEMM_Q SGEMM_Q +#define GEMM_R SGEMM_R +#define GEMM_UNROLL_M SGEMM_UNROLL_M +#define GEMM_UNROLL_N SGEMM_UNROLL_N +#define GEMM_UNROLL_MN SGEMM_UNROLL_MN +#define GEMM_DEFAULT_P SGEMM_DEFAULT_P +#define GEMM_DEFAULT_Q SGEMM_DEFAULT_Q +#define GEMM_DEFAULT_R SGEMM_DEFAULT_R +#define GEMM_DEFAULT_UNROLL_M SGEMM_DEFAULT_UNROLL_M +#define GEMM_DEFAULT_UNROLL_N SGEMM_DEFAULT_UNROLL_N +#endif +#else +#if defined(XDOUBLE) +#define GEMM_P XGEMM_P +#define GEMM_Q XGEMM_Q +#define GEMM_R XGEMM_R +#define GEMM_UNROLL_M XGEMM_UNROLL_M +#define GEMM_UNROLL_N XGEMM_UNROLL_N +#define GEMM_UNROLL_MN XGEMM_UNROLL_MN +#define GEMM_DEFAULT_P XGEMM_DEFAULT_P +#define GEMM_DEFAULT_Q XGEMM_DEFAULT_Q +#define GEMM_DEFAULT_R XGEMM_DEFAULT_R +#define GEMM_DEFAULT_UNROLL_M 
XGEMM_DEFAULT_UNROLL_M +#define GEMM_DEFAULT_UNROLL_N XGEMM_DEFAULT_UNROLL_N +#elif defined(DOUBLE) +#define GEMM_P ZGEMM_P +#define GEMM_Q ZGEMM_Q +#define GEMM_R ZGEMM_R +#define GEMM_UNROLL_M ZGEMM_UNROLL_M +#define GEMM_UNROLL_N ZGEMM_UNROLL_N +#define GEMM_UNROLL_MN ZGEMM_UNROLL_MN +#define GEMM_DEFAULT_P ZGEMM_DEFAULT_P +#define GEMM_DEFAULT_Q ZGEMM_DEFAULT_Q +#define GEMM_DEFAULT_R ZGEMM_DEFAULT_R +#define GEMM_DEFAULT_UNROLL_M ZGEMM_DEFAULT_UNROLL_M +#define GEMM_DEFAULT_UNROLL_N ZGEMM_DEFAULT_UNROLL_N +#else +#define GEMM_P CGEMM_P +#define GEMM_Q CGEMM_Q +#define GEMM_R CGEMM_R +#define GEMM_UNROLL_M CGEMM_UNROLL_M +#define GEMM_UNROLL_N CGEMM_UNROLL_N +#define GEMM_UNROLL_MN CGEMM_UNROLL_MN +#define GEMM_DEFAULT_P CGEMM_DEFAULT_P +#define GEMM_DEFAULT_Q CGEMM_DEFAULT_Q +#define GEMM_DEFAULT_R CGEMM_DEFAULT_R +#define GEMM_DEFAULT_UNROLL_M CGEMM_DEFAULT_UNROLL_M +#define GEMM_DEFAULT_UNROLL_N CGEMM_DEFAULT_UNROLL_N +#endif +#endif + +#ifdef XDOUBLE +#define GEMM3M_UNROLL_M QGEMM_DEFAULT_UNROLL_M +#define GEMM3M_UNROLL_N QGEMM_DEFAULT_UNROLL_N +#elif defined(DOUBLE) +#define GEMM3M_UNROLL_M DGEMM_DEFAULT_UNROLL_M +#define GEMM3M_UNROLL_N DGEMM_DEFAULT_UNROLL_N +#else +#define GEMM3M_UNROLL_M SGEMM_DEFAULT_UNROLL_M +#define GEMM3M_UNROLL_N SGEMM_DEFAULT_UNROLL_N +#endif + + +#ifndef QGEMM_DEFAULT_UNROLL_M +#define QGEMM_DEFAULT_UNROLL_M 2 +#endif + +#ifndef QGEMM_DEFAULT_UNROLL_N +#define QGEMM_DEFAULT_UNROLL_N 2 +#endif + +#ifndef XGEMM_DEFAULT_UNROLL_M +#define XGEMM_DEFAULT_UNROLL_M 2 +#endif + +#ifndef XGEMM_DEFAULT_UNROLL_N +#define XGEMM_DEFAULT_UNROLL_N 2 +#endif + +#ifndef GEMM_THREAD +#define GEMM_THREAD gemm_thread_n +#endif + +#ifndef SGEMM_DEFAULT_R +#define SGEMM_DEFAULT_R (((BUFFER_SIZE - ((SGEMM_DEFAULT_P * SGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SGEMM_DEFAULT_Q * 4) - 15) & ~15) +#endif + +#ifndef DGEMM_DEFAULT_R +#define DGEMM_DEFAULT_R (((BUFFER_SIZE - ((DGEMM_DEFAULT_P * DGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (DGEMM_DEFAULT_Q * 8) - 15) & ~15) +#endif + +#ifndef QGEMM_DEFAULT_R +#define QGEMM_DEFAULT_R (((BUFFER_SIZE - ((QGEMM_DEFAULT_P * QGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (QGEMM_DEFAULT_Q * 16) - 15) & ~15) +#endif + +#ifndef CGEMM_DEFAULT_R +#define CGEMM_DEFAULT_R (((BUFFER_SIZE - ((CGEMM_DEFAULT_P * CGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (CGEMM_DEFAULT_Q * 8) - 15) & ~15) +#endif + +#ifndef ZGEMM_DEFAULT_R +#define ZGEMM_DEFAULT_R (((BUFFER_SIZE - ((ZGEMM_DEFAULT_P * ZGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (ZGEMM_DEFAULT_Q * 16) - 15) & ~15) +#endif + +#ifndef XGEMM_DEFAULT_R +#define XGEMM_DEFAULT_R (((BUFFER_SIZE - ((XGEMM_DEFAULT_P * XGEMM_DEFAULT_Q * 32 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (XGEMM_DEFAULT_Q * 32) - 15) & ~15) +#endif + +#ifndef SNUMOPT +#define SNUMOPT 2 +#endif + +#ifndef DNUMOPT +#define DNUMOPT 2 +#endif + +#ifndef QNUMOPT +#define QNUMOPT 1 +#endif + +#ifndef GEMM3M_P +#ifdef XDOUBLE +#define GEMM3M_P QGEMM_P +#elif defined(DOUBLE) +#define GEMM3M_P DGEMM_P +#else +#define GEMM3M_P SGEMM_P +#endif +#endif + +#ifndef GEMM3M_Q +#ifdef XDOUBLE +#define GEMM3M_Q QGEMM_Q +#elif defined(DOUBLE) +#define GEMM3M_Q DGEMM_Q +#else +#define GEMM3M_Q SGEMM_Q +#endif +#endif + +#ifndef GEMM3M_R +#ifdef XDOUBLE +#define GEMM3M_R QGEMM_R +#elif 
defined(DOUBLE) +#define GEMM3M_R DGEMM_R +#else +#define GEMM3M_R SGEMM_R +#endif +#endif + + +#endif diff --git a/common_power.h b/common_power.h new file mode 100644 index 0000000000..34a61539dd --- /dev/null +++ b/common_power.h @@ -0,0 +1,795 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef COMMON_POWER +#define COMMON_POWER + +#define MB __asm__ __volatile__ ("sync") +#define WMB __asm__ __volatile__ ("sync") + +#define INLINE inline + +#ifdef PPC440 +#define STDERR stdout +#define QNONCACHE 0x1 +#define QCOMMS 0x2 +#define QFAST 0x4 +#endif + +#ifndef ASSEMBLER + +void *qalloc(int flags, size_t bytes); + +static void INLINE blas_lock(volatile unsigned long *address){ + + long int ret, val = 1; + + do { + while (*address) {YIELDING;}; + +#if defined(OS_LINUX) || defined(OS_DARWIN) + __asm__ __volatile__ ( + "0: lwarx %0, 0, %1\n" + " cmpwi %0, 0\n" + " bne- 1f\n" + " stwcx. %2,0, %1\n" + " bne- 0b\n" + "1: " + : "=&r"(ret) + : "r"(address), "r" (val) + : "cr0", "memory"); +#else + __asm__ __volatile__ ( + ".machine \"any\"\n" + " lwarx %0, 0, %1\n" + " cmpwi %0, 0\n" + " bne- $+12\n" + " stwcx. 
%2,0, %1\n" + " bne- $-16\n" + : "=&r"(ret) + : "r"(address), "r" (val) + : "cr0", "memory"); +#endif + } while (ret); +} + +static inline unsigned long rpcc(void){ + unsigned long ret; + +#ifdef OS_AIX + __asm__ __volatile__(".machine \"any\" ;"); +#endif + __asm__ __volatile__ ("mftb %0" : "=r" (ret) : ); + +#if defined(POWER5) || defined(PPC970) + return (ret << 6); +#else + return (ret << 3); +#endif + +} + +#ifdef __64BIT__ +#define RPCC64BIT +#endif + +static inline unsigned long getstackaddr(void){ + unsigned long addr; + + __asm__ __volatile__ ("mr %0, 1" + : "=r"(addr) : : "memory"); + + return addr; +}; + +#if defined(OS_LINUX) || defined(OS_AIX) +#define GET_IMAGE(res) __asm__ __volatile__("fmr %0, 2" : "=f"(res) : : "memory") +#else +#define GET_IMAGE(res) __asm__ __volatile__("fmr %0, f2" : "=f"(res) : : "memory") + +#define GET_IMAGE_CANCEL + +#endif + +#ifdef SMP +static inline int blas_quickdivide(blasint x, blasint y){ + return x / y; +} +#endif + +#endif + + +#ifdef ASSEMBLER + +#ifdef DOUBLE +#define LFD lfd +#define LFDX lfdx +#define LFPDX lfpdx +#define LFSDX lfsdx +#define LFXDX lfxdx +#define LFDU lfdu +#define LFDUX lfdux +#define LFPDUX lfpdux +#define LFSDUX lfsdux +#define LFXDUX lfxdux +#define STFD stfd +#define STFDX stfdx +#define STFPDX stfpdx +#define STFSDX stfsdx +#define STFXDX stfxdx +#define STFDU stfdu +#define STFDUX stfdux +#define STFPDUX stfpdux +#define STFSDUX stfsdux +#define STFXDUX stfxdux +#define FMADD fmadd +#define FMSUB fmsub +#define FNMADD fnmadd +#define FNMSUB fnmsub +#define FMUL fmul +#define FADD fadd +#define FSUB fsub +#else +#define LFD lfs +#define LFDX lfsx +#define LFPDX lfpsx +#define LFSDX lfssx +#define LFXDX lfxsx +#define LFDU lfsu +#define LFDUX lfsux +#define LFPDUX lfpsux +#define LFSDUX lfssux +#define LFXDUX lfxsux +#define STFD stfs +#define STFDX stfsx +#define STFPDX stfpsx +#define STFSDX stfssx +#define STFXDX stfxsx +#define STFDU stfsu +#define STFDUX stfsux +#define STFPDUX stfpsux +#define STFSDUX stfssux +#define STFXDUX stfxsux +#define FMADD fmadds +#define FMSUB fmsubs +#define FNMADD fnmadds +#define FNMSUB fnmsubs +#define FMUL fmuls +#define FADD fadds +#define FSUB fsubs +#endif + +#ifdef __64BIT__ +#define LDLONG ld +#else +#define LDLONG lwz +#endif + +#ifdef OS_DARWIN +#define LL(x) L##x +#endif + +#ifdef OS_LINUX +#define LL(x) .L##x +#endif + +#ifndef LL +#define LL(x) __L##x +#endif + + +#if defined(__64BIT__) && defined(USE64BITINT) +#define LDINT ld +#elif defined(__64BIT__) && !defined(USE64BITINT) +#define LDINT lwa +#else +#define LDINT lwz +#endif + +/* +#define DCBT(REGA, REGB, NUM) .long (0x7c00022c | (REGA << 16) | (REGB << 11) | ((NUM) << 21)) +#define DCBTST(REGA, REGB, NUM) .long (0x7c0001ec | (REGA << 16) | (REGB << 11) | ((NUM) << 21)) +*/ + +#define DSTATTR_H(SIZE, COUNT, STRIDE) ((SIZE << 8) | (COUNT)) +#define DSTATTR_L(SIZE, COUNT, STRIDE) (STRIDE) + +#if defined(PPC970) || defined(POWER3) || defined(POWER4) || defined(POWER5) || defined(PPCG4) +#define HAVE_PREFETCH +#endif + +#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) +#define DCBT_ARG 0 +#else +#define DCBT_ARG 8 +#endif + +#ifdef CELL +#define L1_DUALFETCH +#define L1_PREFETCHSIZE (64 + 128 * 13) +#endif + +#if defined(POWER3) || defined(POWER4) || defined(POWER5) +#define L1_DUALFETCH +#define L1_PREFETCHSIZE (96 + 128 * 12) +#endif + +#if defined(POWER6) +#define L1_DUALFETCH +#define L1_PREFETCHSIZE (16 + 128 * 100) +#define L1_PREFETCH dcbtst +#endif + +#ifndef L1_PREFETCH +#define 
L1_PREFETCH dcbt +#endif + +#ifndef L1_PREFETCHW +#define L1_PREFETCHW dcbtst +#endif + +#if DCBT_ARG == 0 +#define DCBT(REGA, REGB) L1_PREFETCH REGB, REGA +#define DCBTST(REGA, REGB) L1_PREFETCHW REGB, REGA +#else +#define DCBT(REGA, REGB) L1_PREFETCH DCBT_ARG, REGB, REGA +#define DCBTST(REGA, REGB) L1_PREFETCHW DCBT_ARG, REGB, REGA +#endif + + +#ifndef L1_PREFETCHSIZE +#define L1_PREFETCHSIZE (96 + 128 * 12) +#endif + +#if !defined(OS_DARWIN) || defined(NEEDPARAM) +#define f0 0 +#define f1 1 +#define f2 2 +#define f3 3 +#define f4 4 +#define f5 5 +#define f6 6 +#define f7 7 +#define f8 8 +#define f9 9 +#define f10 10 +#define f11 11 +#define f12 12 +#define f13 13 +#define f14 14 +#define f15 15 +#define f16 16 +#define f17 17 +#define f18 18 +#define f19 19 +#define f20 20 +#define f21 21 +#define f22 22 +#define f23 23 +#define f24 24 +#define f25 25 +#define f26 26 +#define f27 27 +#define f28 28 +#define f29 29 +#define f30 30 +#define f31 31 + +#define r0 0 +#define r1 1 +#define r2 2 +#define r3 3 +#define r4 4 +#define r5 5 +#define r6 6 +#define r7 7 +#define r8 8 +#define r9 9 +#define r10 10 +#define r11 11 +#define r12 12 +#define r13 13 +#define r14 14 +#define r15 15 +#define r16 16 +#define r17 17 +#define r18 18 +#define r19 19 +#define r20 20 +#define r21 21 +#define r22 22 +#define r23 23 +#define r24 24 +#define r25 25 +#define r26 26 +#define r27 27 +#define r28 28 +#define r29 29 +#define r30 30 +#define r31 31 + +#define v0 0 +#define v1 1 +#define v2 2 +#define v3 3 +#define v4 4 +#define v5 5 +#define v6 6 +#define v7 7 +#define v8 8 +#define v9 9 +#define v10 10 +#define v11 11 +#define v12 12 +#define v13 13 +#define v14 14 +#define v15 15 +#define v16 16 +#define v17 17 +#define v18 18 +#define v19 19 +#define v20 20 +#define v21 21 +#define v22 22 +#define v23 23 +#define v24 24 +#define v25 25 +#define v26 26 +#define v27 27 +#define v28 28 +#define v29 29 +#define v30 30 +#define v31 31 + +#define BO_dCTR_NZERO_AND_NOT 0 +#define BO_dCTR_NZERO_AND_NOT_1 1 +#define BO_dCTR_ZERO_AND_NOT 2 +#define BO_dCTR_ZERO_AND_NOT_1 3 +#define BO_IF_NOT 4 +#define BO_IF_NOT_1 5 +#define BO_IF_NOT_2 6 +#define BO_IF_NOT_3 7 +#define BO_dCTR_NZERO_AND 8 +#define BO_dCTR_NZERO_AND_1 9 +#define BO_dCTR_ZERO_AND 10 +#define BO_dCTR_ZERO_AND_1 11 +#define BO_IF 12 +#define BO_IF_1 13 +#define BO_IF_2 14 +#define BO_IF_3 15 +#define BO_dCTR_NZERO 16 +#define BO_dCTR_NZERO_1 17 +#define BO_dCTR_ZERO 18 +#define BO_dCTR_ZERO_1 19 +#define BO_ALWAYS 20 +#define BO_ALWAYS_1 21 +#define BO_ALWAYS_2 22 +#define BO_ALWAYS_3 23 +#define BO_dCTR_NZERO_8 24 +#define BO_dCTR_NZERO_9 25 +#define BO_dCTR_ZERO_8 26 +#define BO_dCTR_ZERO_9 27 +#define BO_ALWAYS_8 28 +#define BO_ALWAYS_9 29 +#define BO_ALWAYS_10 30 +#define BO_ALWAYS_11 31 + +#define CR0_LT 0 +#define CR0_GT 1 +#define CR0_EQ 2 +#define CR0_SO 3 +#define CR1_FX 4 +#define CR1_FEX 5 +#define CR1_VX 6 +#define CR1_OX 7 +#define CR2_LT 8 +#define CR2_GT 9 +#define CR2_EQ 10 +#define CR2_SO 11 +#define CR3_LT 12 +#define CR3_GT 13 +#define CR3_EQ 14 +#define CR3_SO 15 +#define CR4_LT 16 +#define CR4_GT 17 +#define CR4_EQ 18 +#define CR4_SO 19 +#define CR5_LT 20 +#define CR5_GT 21 +#define CR5_EQ 22 +#define CR5_SO 23 +#define CR6_LT 24 +#define CR6_GT 25 +#define CR6_EQ 26 +#define CR6_SO 27 +#define CR7_LT 28 +#define CR7_GT 29 +#define CR7_EQ 30 +#define CR7_SO 31 +#define TO_LT 16 +#define TO_GT 8 +#define TO_EQ 4 +#define TO_LLT 2 +#define TO_LGT 1 +#define CR0 0 +#define CR1 1 +#define CR2 2 +#define CR3 3 +#define CR4 4 
+#define CR5 5 +#define CR6 6 +#define CR7 7 +#define cr0 0 +#define cr1 1 +#define cr2 2 +#define cr3 3 +#define cr4 4 +#define cr5 5 +#define cr6 6 +#define cr7 7 +#define VRsave 256 + +#endif + +#define CTR 9 +#define SP r1 + +#ifdef __64BIT__ +#define slwi sldi +#define cmpwi cmpdi +#define srawi sradi +#define mullw mulld +#endif + +#ifndef F_INTERFACE +#define REALNAME ASMNAME +#else +#define REALNAME ASMFNAME +#endif + +#if defined(ASSEMBLER) && !defined(NEEDPARAM) + +#ifdef OS_LINUX +#ifndef __64BIT__ +#define PROLOGUE \ + .section .text;\ + .align 6;\ + .globl REALNAME;\ + .type REALNAME, @function;\ +REALNAME: +#define EPILOGUE .size REALNAME, .-REALNAME +#else +#define PROLOGUE \ + .section .text;\ + .align 5;\ + .globl REALNAME;\ + .section ".opd","aw";\ + .align 3;\ +REALNAME:;\ + .quad .REALNAME, .TOC.@tocbase, 0;\ + .previous;\ + .size REALNAME, 24;\ + .type .REALNAME, @function;\ + .globl .REALNAME;\ +.REALNAME: +#define EPILOGUE \ + .long 0 ; \ + .byte 0,0,0,1,128,0,0,0 ; \ + .size .REALNAME, .-.REALNAME; \ + .section .note.GNU-stack,"",@progbits +#endif + +#ifdef PROFILE +#ifndef __64BIT__ +#define PROFCODE ;\ + .section ".data";\ + .align 2;\ +.LP3:;\ + .long 0;\ + .section ".text";\ + mflr r0;\ + stw r0, 4(SP);\ + lis r12, .LP3@ha;\ + la r0, .LP3@l(r12);\ + bl _mcount;\ + lwz r0, 4(SP);\ + mtlr r0 +#else +#define PROFCODE \ + .globl _mcount; \ + mflr r0; \ + std r0, 16(SP); \ + mr r11, SP; \ + addi SP, SP, -256; \ + std r11, 0(SP); \ + std r3, 128(SP); \ + std r4, 136(SP); \ + std r5, 144(SP); \ + std r6, 152(SP); \ + std r7, 160(SP); \ + std r8, 168(SP); \ + std r9, 176(SP); \ + std r10, 184(SP); \ + stfd f3, 192(SP); \ + stfd f4, 200(SP); \ + bl ._mcount; \ + nop; \ + ld r3, 128(SP);\ + ld r4, 136(SP);\ + ld r5, 144(SP);\ + ld r6, 152(SP);\ + ld r7, 160(SP);\ + ld r8, 168(SP);\ + ld r9, 176(SP);\ + ld r10, 184(SP);\ + lfd f3, 192(SP);\ + lfd f4, 200(SP);\ + addi SP, SP, 256;\ + ld r0, 16(SP);\ + mtlr r0 +#endif +#else +#define PROFCODE +#endif + +#endif + +#if OS_AIX +#ifndef __64BIT__ +#define PROLOGUE \ + .machine "any";\ + .globl .REALNAME;\ + .csect .text[PR],5;\ +.REALNAME:; + +#define EPILOGUE \ +_section_.text:;\ + .csect .data[RW],4;\ + .long _section_.text; + +#else + +#define PROLOGUE \ + .machine "any";\ + .globl .REALNAME;\ + .csect .text[PR], 5;\ +.REALNAME:; + +#define EPILOGUE \ +_section_.text:;\ + .csect .data[RW],4;\ + .llong _section_.text; +#endif + +#define PROFCODE + +#endif + +#ifdef OS_DARWIN +#ifndef __64BIT__ + .macro PROLOGUE + .section __TEXT,__text,regular,pure_instructions + .section __TEXT,__picsymbolstub1,symbol_stubs,pure_instructions,32 + .machine ppc + .text + .align 4 + .globl REALNAME +REALNAME: + .endmacro +#else + .macro PROLOGUE + .section __TEXT,__text,regular,pure_instructions + .section __TEXT,__picsymbolstub1,symbol_stubs,pure_instructions,32 + .machine ppc64 + .text + .align 4 + .globl REALNAME +REALNAME: + .endmacro +#endif + +#ifndef PROFILE +#define PROFCODE +#define EPILOGUE .subsections_via_symbols +#else +#ifndef __64BIT__ + + .macro PROFCODE + mflr r0 + stw r0, 8(SP) + addi SP, SP, -64 + stw SP, 0(SP) + stw r3, 12(SP) + stw r4, 16(SP) + stw r5, 20(SP) + stw r6, 24(SP) + stw r7, 28(SP) + stw r8, 32(SP) + stw r9, 36(SP) + stw r10, 40(SP) + stfd f1, 48(SP) + stfd f2, 56(SP) + mr r3, r0 + bl Lmcount$stub + nop + lwz r3, 12(SP) + lwz r4, 16(SP) + lwz r5, 20(SP) + lwz r6, 24(SP) + lwz r7, 28(SP) + lwz r8, 32(SP) + lwz r9, 36(SP) + lwz r10, 40(SP) + lfd f1, 48(SP) + lfd f2, 56(SP) + addi SP, SP, 64 + lwz r0, 8(SP) + mtlr r0 
+ .endmacro + + .macro EPILOGUE + .section __TEXT,__picsymbolstub1,symbol_stubs,pure_instructions,32 + .align 5 +Lmcount$stub: + .indirect_symbol mcount + mflr r0 + bcl 20,31,L00000000001$spb +L00000000001$spb: + mflr r11 + addis r11,r11,ha16(Lmcount$lazy_ptr-L00000000001$spb) + mtlr r0 + lwzu r12,lo16(Lmcount$lazy_ptr-L00000000001$spb)(r11) + mtctr r12 + bctr + .lazy_symbol_pointer +Lmcount$lazy_ptr: + .indirect_symbol mcount + .long dyld_stub_binding_helper + .subsections_via_symbols + .endmacro + +#else + .macro PROFCODE + mflr r0 + std r0, 16(SP) + addi SP, SP, -128 + std SP, 0(SP) + std r3, 24(SP) + std r4, 32(SP) + std r5, 40(SP) + std r6, 48(SP) + std r7, 56(SP) + std r8, 64(SP) + std r9, 72(SP) + std r10, 80(SP) + stfd f1, 88(SP) + stfd f2, 96(SP) + mr r3, r0 + bl Lmcount$stub + nop + ld r3, 24(SP) + ld r4, 32(SP) + ld r5, 40(SP) + ld r6, 48(SP) + ld r7, 56(SP) + ld r8, 64(SP) + ld r9, 72(SP) + ld r10, 80(SP) + lfd f1, 88(SP) + lfd f2, 96(SP) + addi SP, SP, 128 + ld r0, 16(SP) + mtlr r0 + .endmacro + + .macro EPILOGUE + .data + .section __TEXT,__picsymbolstub1,symbol_stubs,pure_instructions,32 + .align 5 +Lmcount$stub: + .indirect_symbol mcount + mflr r0 + bcl 20,31,L00000000001$spb +L00000000001$spb: + mflr r11 + addis r11,r11,ha16(Lmcount$lazy_ptr-L00000000001$spb) + mtlr r0 + ld r12,lo16(Lmcount$lazy_ptr-L00000000001$spb)(r11) + mtctr r12 + bctr + .lazy_symbol_pointer +Lmcount$lazy_ptr: + .indirect_symbol mcount + .quad dyld_stub_binding_helper + .subsections_via_symbols + .endmacro +#endif + +#endif + +#endif +#endif + +#endif + +#define HALT mfspr r0, 1023 + +#ifdef OS_LINUX +#if defined(PPC440) || defined(PPC440FP2) +#undef MAX_CPU_NUMBER +#define MAX_CPU_NUMBER 1 +#endif +#if !defined(__64BIT__) && !defined(PROFILE) && !defined(PPC440) && !defined(PPC440FP2) +#define START_ADDRESS (0x0b000000UL) +#else +#define SEEK_ADDRESS +#endif +#endif + +#ifdef OS_AIX +#ifndef __64BIT__ +#define START_ADDRESS (0xf0000000UL) +#else +#define SEEK_ADDRESS +#endif +#endif + +#ifdef OS_DARWIN +#define SEEK_ADDRESS +#endif + +#if defined(PPC440) +#define BUFFER_SIZE ( 2 << 20) +#elif defined(PPC440FP2) +#define BUFFER_SIZE ( 16 << 20) +#else +#define BUFFER_SIZE ( 16 << 20) +#endif + +#ifndef PAGESIZE +#define PAGESIZE ( 4 << 10) +#endif +#define HUGE_PAGESIZE (16 << 20) + +#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) + +#ifndef MAP_ANONYMOUS +#define MAP_ANONYMOUS MAP_ANON +#endif +#endif diff --git a/common_q.h b/common_q.h new file mode 100644 index 0000000000..30ad3727ad --- /dev/null +++ b/common_q.h @@ -0,0 +1,431 @@ +#ifndef COMMON_Q_H +#define COMMON_Q_H + +#ifndef DYNAMIC_ARCH + +#define QAMAX_K qamax_k +#define QAMIN_K qamin_k +#define QMAX_K qmax_k +#define QMIN_K qmin_k +#define IQAMAX_K iqamax_k +#define IQAMIN_K iqamin_k +#define IQMAX_K iqmax_k +#define IQMIN_K iqmin_k +#define QASUM_K qasum_k +#define QAXPYU_K qaxpy_k +#define QAXPYC_K qaxpy_k +#define QCOPY_K qcopy_k +#define QDOTU_K qdot_k +#define QDOTC_K qdot_k +#define QNRM2_K qnrm2_k +#define QSCAL_K qscal_k +#define QSWAP_K qswap_k +#define QROT_K qrot_k + +#define QGEMV_N qgemv_n +#define QGEMV_T qgemv_t +#define QGEMV_R qgemv_n +#define QGEMV_C qgemv_t +#define QGEMV_O qgemv_n +#define QGEMV_U qgemv_t +#define QGEMV_S qgemv_n +#define QGEMV_D qgemv_t + +#define QGERU_K qger_k +#define QGERC_K qger_k +#define QGERV_K qger_k +#define QGERD_K qger_k + +#define QSYMV_U qsymv_U +#define QSYMV_L qsymv_L +#define QSYMV_THREAD_U qsymv_thread_U +#define QSYMV_THREAD_L qsymv_thread_L + +#define
QGEMM_ONCOPY qgemm_oncopy +#define QGEMM_OTCOPY qgemm_otcopy + +#if QGEMM_DEFAULT_UNROLL_M == QGEMM_DEFAULT_UNROLL_N +#define QGEMM_INCOPY qgemm_oncopy +#define QGEMM_ITCOPY qgemm_otcopy +#else +#define QGEMM_INCOPY qgemm_incopy +#define QGEMM_ITCOPY qgemm_itcopy +#endif + +#define QTRMM_OUNUCOPY qtrmm_ounucopy +#define QTRMM_OUNNCOPY qtrmm_ounncopy +#define QTRMM_OUTUCOPY qtrmm_outucopy +#define QTRMM_OUTNCOPY qtrmm_outncopy +#define QTRMM_OLNUCOPY qtrmm_olnucopy +#define QTRMM_OLNNCOPY qtrmm_olnncopy +#define QTRMM_OLTUCOPY qtrmm_oltucopy +#define QTRMM_OLTNCOPY qtrmm_oltncopy + +#define QTRSM_OUNUCOPY qtrsm_ounucopy +#define QTRSM_OUNNCOPY qtrsm_ounncopy +#define QTRSM_OUTUCOPY qtrsm_outucopy +#define QTRSM_OUTNCOPY qtrsm_outncopy +#define QTRSM_OLNUCOPY qtrsm_olnucopy +#define QTRSM_OLNNCOPY qtrsm_olnncopy +#define QTRSM_OLTUCOPY qtrsm_oltucopy +#define QTRSM_OLTNCOPY qtrsm_oltncopy + +#if QGEMM_DEFAULT_UNROLL_M == QGEMM_DEFAULT_UNROLL_N +#define QTRMM_IUNUCOPY qtrmm_ounucopy +#define QTRMM_IUNNCOPY qtrmm_ounncopy +#define QTRMM_IUTUCOPY qtrmm_outucopy +#define QTRMM_IUTNCOPY qtrmm_outncopy +#define QTRMM_ILNUCOPY qtrmm_olnucopy +#define QTRMM_ILNNCOPY qtrmm_olnncopy +#define QTRMM_ILTUCOPY qtrmm_oltucopy +#define QTRMM_ILTNCOPY qtrmm_oltncopy + +#define QTRSM_IUNUCOPY qtrsm_ounucopy +#define QTRSM_IUNNCOPY qtrsm_ounncopy +#define QTRSM_IUTUCOPY qtrsm_outucopy +#define QTRSM_IUTNCOPY qtrsm_outncopy +#define QTRSM_ILNUCOPY qtrsm_olnucopy +#define QTRSM_ILNNCOPY qtrsm_olnncopy +#define QTRSM_ILTUCOPY qtrsm_oltucopy +#define QTRSM_ILTNCOPY qtrsm_oltncopy +#else +#define QTRMM_IUNUCOPY qtrmm_iunucopy +#define QTRMM_IUNNCOPY qtrmm_iunncopy +#define QTRMM_IUTUCOPY qtrmm_iutucopy +#define QTRMM_IUTNCOPY qtrmm_iutncopy +#define QTRMM_ILNUCOPY qtrmm_ilnucopy +#define QTRMM_ILNNCOPY qtrmm_ilnncopy +#define QTRMM_ILTUCOPY qtrmm_iltucopy +#define QTRMM_ILTNCOPY qtrmm_iltncopy + +#define QTRSM_IUNUCOPY qtrsm_iunucopy +#define QTRSM_IUNNCOPY qtrsm_iunncopy +#define QTRSM_IUTUCOPY qtrsm_iutucopy +#define QTRSM_IUTNCOPY qtrsm_iutncopy +#define QTRSM_ILNUCOPY qtrsm_ilnucopy +#define QTRSM_ILNNCOPY qtrsm_ilnncopy +#define QTRSM_ILTUCOPY qtrsm_iltucopy +#define QTRSM_ILTNCOPY qtrsm_iltncopy +#endif + +#define QGEMM_BETA qgemm_beta + +#define QGEMM_KERNEL qgemm_kernel + +#define QTRMM_KERNEL_LN qtrmm_kernel_LN +#define QTRMM_KERNEL_LT qtrmm_kernel_LT +#define QTRMM_KERNEL_LR qtrmm_kernel_LN +#define QTRMM_KERNEL_LC qtrmm_kernel_LT +#define QTRMM_KERNEL_RN qtrmm_kernel_RN +#define QTRMM_KERNEL_RT qtrmm_kernel_RT +#define QTRMM_KERNEL_RR qtrmm_kernel_RN +#define QTRMM_KERNEL_RC qtrmm_kernel_RT + +#define QTRSM_KERNEL_LN qtrsm_kernel_LN +#define QTRSM_KERNEL_LT qtrsm_kernel_LT +#define QTRSM_KERNEL_LR qtrsm_kernel_LN +#define QTRSM_KERNEL_LC qtrsm_kernel_LT +#define QTRSM_KERNEL_RN qtrsm_kernel_RN +#define QTRSM_KERNEL_RT qtrsm_kernel_RT +#define QTRSM_KERNEL_RR qtrsm_kernel_RN +#define QTRSM_KERNEL_RC qtrsm_kernel_RT + +#define QSYMM_OUTCOPY qsymm_outcopy +#define QSYMM_OLTCOPY qsymm_oltcopy +#if QGEMM_DEFAULT_UNROLL_M == QGEMM_DEFAULT_UNROLL_N +#define QSYMM_IUTCOPY qsymm_outcopy +#define QSYMM_ILTCOPY qsymm_oltcopy +#else +#define QSYMM_IUTCOPY qsymm_iutcopy +#define QSYMM_ILTCOPY qsymm_iltcopy +#endif + +#define QNEG_TCOPY qneg_tcopy +#define QLASWP_NCOPY qlaswp_ncopy + +#else + +#define QAMAX_K gotoblas -> qamax_k +#define QAMIN_K gotoblas -> qamin_k +#define QMAX_K gotoblas -> qmax_k +#define QMIN_K gotoblas -> qmin_k +#define IQAMAX_K gotoblas -> iqamax_k +#define IQAMIN_K gotoblas -> iqamin_k 
+#define IQMAX_K gotoblas -> iqmax_k +#define IQMIN_K gotoblas -> iqmin_k +#define QASUM_K gotoblas -> qasum_k +#define QAXPYU_K gotoblas -> qaxpy_k +#define QAXPYC_K gotoblas -> qaxpy_k +#define QCOPY_K gotoblas -> qcopy_k +#define QDOTU_K gotoblas -> qdot_k +#define QDOTC_K gotoblas -> qdot_k +#define QNRM2_K gotoblas -> qnrm2_k +#define QSCAL_K gotoblas -> qscal_k +#define QSWAP_K gotoblas -> qswap_k +#define QROT_K gotoblas -> qrot_k + +#define QGEMV_N gotoblas -> qgemv_n +#define QGEMV_T gotoblas -> qgemv_t +#define QGEMV_R gotoblas -> qgemv_n +#define QGEMV_C gotoblas -> qgemv_t +#define QGEMV_O gotoblas -> qgemv_n +#define QGEMV_U gotoblas -> qgemv_t +#define QGEMV_S gotoblas -> qgemv_n +#define QGEMV_D gotoblas -> qgemv_t + +#define QGERU_K gotoblas -> qger_k +#define QGERC_K gotoblas -> qger_k +#define QGERV_K gotoblas -> qger_k +#define QGERD_K gotoblas -> qger_k + +#define QSYMV_U gotoblas -> qsymv_U +#define QSYMV_L gotoblas -> qsymv_L + +#define QSYMV_THREAD_U qsymv_thread_U +#define QSYMV_THREAD_L qsymv_thread_L + +#define QGEMM_ONCOPY gotoblas -> qgemm_oncopy +#define QGEMM_OTCOPY gotoblas -> qgemm_otcopy +#define QGEMM_INCOPY gotoblas -> qgemm_incopy +#define QGEMM_ITCOPY gotoblas -> qgemm_itcopy + +#define QTRMM_OUNUCOPY gotoblas -> qtrmm_ounucopy +#define QTRMM_OUTUCOPY gotoblas -> qtrmm_outucopy +#define QTRMM_OLNUCOPY gotoblas -> qtrmm_olnucopy +#define QTRMM_OLTUCOPY gotoblas -> qtrmm_oltucopy +#define QTRSM_OUNUCOPY gotoblas -> qtrsm_ounucopy +#define QTRSM_OUTUCOPY gotoblas -> qtrsm_outucopy +#define QTRSM_OLNUCOPY gotoblas -> qtrsm_olnucopy +#define QTRSM_OLTUCOPY gotoblas -> qtrsm_oltucopy + +#define QTRMM_IUNUCOPY gotoblas -> qtrmm_iunucopy +#define QTRMM_IUTUCOPY gotoblas -> qtrmm_iutucopy +#define QTRMM_ILNUCOPY gotoblas -> qtrmm_ilnucopy +#define QTRMM_ILTUCOPY gotoblas -> qtrmm_iltucopy +#define QTRSM_IUNUCOPY gotoblas -> qtrsm_iunucopy +#define QTRSM_IUTUCOPY gotoblas -> qtrsm_iutucopy +#define QTRSM_ILNUCOPY gotoblas -> qtrsm_ilnucopy +#define QTRSM_ILTUCOPY gotoblas -> qtrsm_iltucopy + +#define QTRMM_OUNNCOPY gotoblas -> qtrmm_ounncopy +#define QTRMM_OUTNCOPY gotoblas -> qtrmm_outncopy +#define QTRMM_OLNNCOPY gotoblas -> qtrmm_olnncopy +#define QTRMM_OLTNCOPY gotoblas -> qtrmm_oltncopy +#define QTRSM_OUNNCOPY gotoblas -> qtrsm_ounncopy +#define QTRSM_OUTNCOPY gotoblas -> qtrsm_outncopy +#define QTRSM_OLNNCOPY gotoblas -> qtrsm_olnncopy +#define QTRSM_OLTNCOPY gotoblas -> qtrsm_oltncopy + +#define QTRMM_IUNNCOPY gotoblas -> qtrmm_iunncopy +#define QTRMM_IUTNCOPY gotoblas -> qtrmm_iutncopy +#define QTRMM_ILNNCOPY gotoblas -> qtrmm_ilnncopy +#define QTRMM_ILTNCOPY gotoblas -> qtrmm_iltncopy +#define QTRSM_IUNNCOPY gotoblas -> qtrsm_iunncopy +#define QTRSM_IUTNCOPY gotoblas -> qtrsm_iutncopy +#define QTRSM_ILNNCOPY gotoblas -> qtrsm_ilnncopy +#define QTRSM_ILTNCOPY gotoblas -> qtrsm_iltncopy + +#define QGEMM_BETA gotoblas -> qgemm_beta +#define QGEMM_KERNEL gotoblas -> qgemm_kernel + +#define QTRMM_KERNEL_LN gotoblas -> qtrmm_kernel_LN +#define QTRMM_KERNEL_LT gotoblas -> qtrmm_kernel_LT +#define QTRMM_KERNEL_LR gotoblas -> qtrmm_kernel_LN +#define QTRMM_KERNEL_LC gotoblas -> qtrmm_kernel_LT +#define QTRMM_KERNEL_RN gotoblas -> qtrmm_kernel_RN +#define QTRMM_KERNEL_RT gotoblas -> qtrmm_kernel_RT +#define QTRMM_KERNEL_RR gotoblas -> qtrmm_kernel_RN +#define QTRMM_KERNEL_RC gotoblas -> qtrmm_kernel_RT + +#define QTRSM_KERNEL_LN gotoblas -> qtrsm_kernel_LN +#define QTRSM_KERNEL_LT gotoblas -> qtrsm_kernel_LT +#define QTRSM_KERNEL_LR gotoblas -> qtrsm_kernel_LN 
+#define QTRSM_KERNEL_LC gotoblas -> qtrsm_kernel_LT +#define QTRSM_KERNEL_RN gotoblas -> qtrsm_kernel_RN +#define QTRSM_KERNEL_RT gotoblas -> qtrsm_kernel_RT +#define QTRSM_KERNEL_RR gotoblas -> qtrsm_kernel_RN +#define QTRSM_KERNEL_RC gotoblas -> qtrsm_kernel_RT + +#define QSYMM_IUTCOPY gotoblas -> qsymm_iutcopy +#define QSYMM_ILTCOPY gotoblas -> qsymm_iltcopy +#define QSYMM_OUTCOPY gotoblas -> qsymm_outcopy +#define QSYMM_OLTCOPY gotoblas -> qsymm_oltcopy + +#define QNEG_TCOPY gotoblas -> qneg_tcopy +#define QLASWP_NCOPY gotoblas -> qlaswp_ncopy + +#endif + +#define QGEMM_NN qgemm_nn +#define QGEMM_CN qgemm_tn +#define QGEMM_TN qgemm_tn +#define QGEMM_NC qgemm_nt +#define QGEMM_NT qgemm_nt +#define QGEMM_CC qgemm_tt +#define QGEMM_CT qgemm_tt +#define QGEMM_TC qgemm_tt +#define QGEMM_TT qgemm_tt +#define QGEMM_NR qgemm_nn +#define QGEMM_TR qgemm_tn +#define QGEMM_CR qgemm_tn +#define QGEMM_RN qgemm_nn +#define QGEMM_RT qgemm_nt +#define QGEMM_RC qgemm_nt +#define QGEMM_RR qgemm_nn + +#define QSYMM_LU qsymm_LU +#define QSYMM_LL qsymm_LL +#define QSYMM_RU qsymm_RU +#define QSYMM_RL qsymm_RL + +#define QHEMM_LU qhemm_LU +#define QHEMM_LL qhemm_LL +#define QHEMM_RU qhemm_RU +#define QHEMM_RL qhemm_RL + +#define QSYRK_UN qsyrk_UN +#define QSYRK_UT qsyrk_UT +#define QSYRK_LN qsyrk_LN +#define QSYRK_LT qsyrk_LT +#define QSYRK_UR qsyrk_UN +#define QSYRK_UC qsyrk_UT +#define QSYRK_LR qsyrk_LN +#define QSYRK_LC qsyrk_LT + +#define QSYRK_KERNEL_U qsyrk_kernel_U +#define QSYRK_KERNEL_L qsyrk_kernel_L + +#define QHERK_UN qsyrk_UN +#define QHERK_LN qsyrk_LN +#define QHERK_UC qsyrk_UT +#define QHERK_LC qsyrk_LT + +#define QHER2K_UN qsyr2k_UN +#define QHER2K_LN qsyr2k_LN +#define QHER2K_UC qsyr2k_UT +#define QHER2K_LC qsyr2k_LT + +#define QSYR2K_UN qsyr2k_UN +#define QSYR2K_UT qsyr2k_UT +#define QSYR2K_LN qsyr2k_LN +#define QSYR2K_LT qsyr2k_LT +#define QSYR2K_UR qsyr2k_UN +#define QSYR2K_UC qsyr2k_UT +#define QSYR2K_LR qsyr2k_LN +#define QSYR2K_LC qsyr2k_LT + +#define QSYR2K_KERNEL_U qsyr2k_kernel_U +#define QSYR2K_KERNEL_L qsyr2k_kernel_L + +#define QTRMM_LNUU qtrmm_LNUU +#define QTRMM_LNUN qtrmm_LNUN +#define QTRMM_LNLU qtrmm_LNLU +#define QTRMM_LNLN qtrmm_LNLN +#define QTRMM_LTUU qtrmm_LTUU +#define QTRMM_LTUN qtrmm_LTUN +#define QTRMM_LTLU qtrmm_LTLU +#define QTRMM_LTLN qtrmm_LTLN +#define QTRMM_LRUU qtrmm_LNUU +#define QTRMM_LRUN qtrmm_LNUN +#define QTRMM_LRLU qtrmm_LNLU +#define QTRMM_LRLN qtrmm_LNLN +#define QTRMM_LCUU qtrmm_LTUU +#define QTRMM_LCUN qtrmm_LTUN +#define QTRMM_LCLU qtrmm_LTLU +#define QTRMM_LCLN qtrmm_LTLN +#define QTRMM_RNUU qtrmm_RNUU +#define QTRMM_RNUN qtrmm_RNUN +#define QTRMM_RNLU qtrmm_RNLU +#define QTRMM_RNLN qtrmm_RNLN +#define QTRMM_RTUU qtrmm_RTUU +#define QTRMM_RTUN qtrmm_RTUN +#define QTRMM_RTLU qtrmm_RTLU +#define QTRMM_RTLN qtrmm_RTLN +#define QTRMM_RRUU qtrmm_RNUU +#define QTRMM_RRUN qtrmm_RNUN +#define QTRMM_RRLU qtrmm_RNLU +#define QTRMM_RRLN qtrmm_RNLN +#define QTRMM_RCUU qtrmm_RTUU +#define QTRMM_RCUN qtrmm_RTUN +#define QTRMM_RCLU qtrmm_RTLU +#define QTRMM_RCLN qtrmm_RTLN + +#define QTRSM_LNUU qtrsm_LNUU +#define QTRSM_LNUN qtrsm_LNUN +#define QTRSM_LNLU qtrsm_LNLU +#define QTRSM_LNLN qtrsm_LNLN +#define QTRSM_LTUU qtrsm_LTUU +#define QTRSM_LTUN qtrsm_LTUN +#define QTRSM_LTLU qtrsm_LTLU +#define QTRSM_LTLN qtrsm_LTLN +#define QTRSM_LRUU qtrsm_LNUU +#define QTRSM_LRUN qtrsm_LNUN +#define QTRSM_LRLU qtrsm_LNLU +#define QTRSM_LRLN qtrsm_LNLN +#define QTRSM_LCUU qtrsm_LTUU +#define QTRSM_LCUN qtrsm_LTUN +#define QTRSM_LCLU qtrsm_LTLU +#define QTRSM_LCLN qtrsm_LTLN 
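The QTRMM/QTRSM driver aliases around this point use a four-letter suffix: side (L/R), transpose (N/T, with R/C again folded onto them for this real type), uplo (U/L) and diagonal (U/N); in the same spirit, QHERK and QHER2K above reuse the symmetric qsyrk/qsyr2k drivers, since a Hermitian update of real data is just a symmetric one. A toy version of how calling code might normalise its option characters before picking one of these drivers; it is only meant to decode the naming, not to mirror the actual interface sources:

#include <stdio.h>

int main(void) {
  char side = 'L', trans = 'C', uplo = 'U', diag = 'N';

  /* For a real element type the conjugate options collapse onto the plain
     ones, mirroring the macro aliases above. */
  if (trans == 'R') trans = 'N';
  if (trans == 'C') trans = 'T';

  /* QTRMM_LCUN above maps to qtrmm_LTUN, which is exactly what this prints. */
  printf("dispatch to qtrmm_%c%c%c%c\n", side, trans, uplo, diag);
  return 0;
}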
+#define QTRSM_RNUU qtrsm_RNUU +#define QTRSM_RNUN qtrsm_RNUN +#define QTRSM_RNLU qtrsm_RNLU +#define QTRSM_RNLN qtrsm_RNLN +#define QTRSM_RTUU qtrsm_RTUU +#define QTRSM_RTUN qtrsm_RTUN +#define QTRSM_RTLU qtrsm_RTLU +#define QTRSM_RTLN qtrsm_RTLN +#define QTRSM_RRUU qtrsm_RNUU +#define QTRSM_RRUN qtrsm_RNUN +#define QTRSM_RRLU qtrsm_RNLU +#define QTRSM_RRLN qtrsm_RNLN +#define QTRSM_RCUU qtrsm_RTUU +#define QTRSM_RCUN qtrsm_RTUN +#define QTRSM_RCLU qtrsm_RTLU +#define QTRSM_RCLN qtrsm_RTLN + +#define QGEMM_THREAD_NN qgemm_thread_nn +#define QGEMM_THREAD_CN qgemm_thread_tn +#define QGEMM_THREAD_TN qgemm_thread_tn +#define QGEMM_THREAD_NC qgemm_thread_nt +#define QGEMM_THREAD_NT qgemm_thread_nt +#define QGEMM_THREAD_CC qgemm_thread_tt +#define QGEMM_THREAD_CT qgemm_thread_tt +#define QGEMM_THREAD_TC qgemm_thread_tt +#define QGEMM_THREAD_TT qgemm_thread_tt +#define QGEMM_THREAD_NR qgemm_thread_nn +#define QGEMM_THREAD_TR qgemm_thread_tn +#define QGEMM_THREAD_CR qgemm_thread_tn +#define QGEMM_THREAD_RN qgemm_thread_nn +#define QGEMM_THREAD_RT qgemm_thread_nt +#define QGEMM_THREAD_RC qgemm_thread_nt +#define QGEMM_THREAD_RR qgemm_thread_nn + +#define QSYMM_THREAD_LU qsymm_thread_LU +#define QSYMM_THREAD_LL qsymm_thread_LL +#define QSYMM_THREAD_RU qsymm_thread_RU +#define QSYMM_THREAD_RL qsymm_thread_RL + +#define QHEMM_THREAD_LU qhemm_thread_LU +#define QHEMM_THREAD_LL qhemm_thread_LL +#define QHEMM_THREAD_RU qhemm_thread_RU +#define QHEMM_THREAD_RL qhemm_thread_RL + +#define QSYRK_THREAD_UN qsyrk_thread_UN +#define QSYRK_THREAD_UT qsyrk_thread_UT +#define QSYRK_THREAD_LN qsyrk_thread_LN +#define QSYRK_THREAD_LT qsyrk_thread_LT +#define QSYRK_THREAD_UR qsyrk_thread_UN +#define QSYRK_THREAD_UC qsyrk_thread_UT +#define QSYRK_THREAD_LR qsyrk_thread_LN +#define QSYRK_THREAD_LC qsyrk_thread_LT + +#define QHERK_THREAD_UN qsyrk_thread_UN +#define QHERK_THREAD_UT qsyrk_thread_UT +#define QHERK_THREAD_LN qsyrk_thread_LN +#define QHERK_THREAD_LT qsyrk_thread_LT +#define QHERK_THREAD_UR qsyrk_thread_UN +#define QHERK_THREAD_UC qsyrk_thread_UT +#define QHERK_THREAD_LR qsyrk_thread_LN +#define QHERK_THREAD_LC qsyrk_thread_LT + +#endif diff --git a/common_reference.h b/common_reference.h new file mode 100644 index 0000000000..e69de29bb2 diff --git a/common_s.h b/common_s.h new file mode 100644 index 0000000000..db8d69a0fe --- /dev/null +++ b/common_s.h @@ -0,0 +1,436 @@ +#ifndef COMMON_S_H +#define COMMON_S_H + +#ifndef DYNAMIC_ARCH + +#define SAMAX_K samax_k +#define SAMIN_K samin_k +#define SMAX_K smax_k +#define SMIN_K smin_k +#define ISAMAX_K isamax_k +#define ISAMIN_K isamin_k +#define ISMAX_K ismax_k +#define ISMIN_K ismin_k +#define SASUM_K sasum_k +#define SAXPYU_K saxpy_k +#define SAXPYC_K saxpy_k +#define SCOPY_K scopy_k +#define SDOTU_K sdot_k +#define SDOTC_K sdot_k +#define SDSDOT_K sdot_k +#define DSDOT_K dsdot_k +#define SNRM2_K snrm2_k +#define SSCAL_K sscal_k +#define SSWAP_K sswap_k +#define SROT_K srot_k + +#define SGEMV_N sgemv_n +#define SGEMV_T sgemv_t +#define SGEMV_R sgemv_n +#define SGEMV_C sgemv_t +#define SGEMV_O sgemv_n +#define SGEMV_U sgemv_t +#define SGEMV_S sgemv_n +#define SGEMV_D sgemv_t + +#define SGERU_K sger_k +#define SGERC_K sger_k +#define SGERV_K sger_k +#define SGERD_K sger_k + +#define SSYMV_U ssymv_U +#define SSYMV_L ssymv_L + +#define SSYMV_THREAD_U ssymv_thread_U +#define SSYMV_THREAD_L ssymv_thread_L + +#define SGEMM_ONCOPY sgemm_oncopy +#define SGEMM_OTCOPY sgemm_otcopy + +#if SGEMM_DEFAULT_UNROLL_M == SGEMM_DEFAULT_UNROLL_N +#define SGEMM_INCOPY sgemm_oncopy 
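The #if just above repeats a trick used in every precision header: when the M and N unroll factors of the blocked GEMM are equal, the two packed operands have the same shape, so the i-side copy macros can simply reuse the o-side kernels instead of referencing separate routines. A stand-alone sketch of the idea; the DEMO_* names and unroll values are hypothetical:

#include <stdio.h>

/* Hypothetical unroll factors, for illustration only. */
#define DEMO_UNROLL_M 4
#define DEMO_UNROLL_N 4

static void demo_oncopy(void) { puts("shared copy kernel"); }
static void demo_incopy(void) { puts("separate inner copy kernel"); }

#if DEMO_UNROLL_M == DEMO_UNROLL_N
#define DEMO_INCOPY demo_oncopy   /* same blocking on both sides: reuse the o-copy */
#else
#define DEMO_INCOPY demo_incopy
#endif

int main(void) {
  (void) demo_incopy;  /* keep the unused branch referenced */
  DEMO_INCOPY();
  return 0;
}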
+#define SGEMM_ITCOPY sgemm_otcopy +#else +#define SGEMM_INCOPY sgemm_incopy +#define SGEMM_ITCOPY sgemm_itcopy +#endif + +#define STRMM_OUNUCOPY strmm_ounucopy +#define STRMM_OUNNCOPY strmm_ounncopy +#define STRMM_OUTUCOPY strmm_outucopy +#define STRMM_OUTNCOPY strmm_outncopy +#define STRMM_OLNUCOPY strmm_olnucopy +#define STRMM_OLNNCOPY strmm_olnncopy +#define STRMM_OLTUCOPY strmm_oltucopy +#define STRMM_OLTNCOPY strmm_oltncopy + +#define STRSM_OUNUCOPY strsm_ounucopy +#define STRSM_OUNNCOPY strsm_ounncopy +#define STRSM_OUTUCOPY strsm_outucopy +#define STRSM_OUTNCOPY strsm_outncopy +#define STRSM_OLNUCOPY strsm_olnucopy +#define STRSM_OLNNCOPY strsm_olnncopy +#define STRSM_OLTUCOPY strsm_oltucopy +#define STRSM_OLTNCOPY strsm_oltncopy + +#if SGEMM_DEFAULT_UNROLL_M == SGEMM_DEFAULT_UNROLL_N +#define STRMM_IUNUCOPY strmm_ounucopy +#define STRMM_IUNNCOPY strmm_ounncopy +#define STRMM_IUTUCOPY strmm_outucopy +#define STRMM_IUTNCOPY strmm_outncopy +#define STRMM_ILNUCOPY strmm_olnucopy +#define STRMM_ILNNCOPY strmm_olnncopy +#define STRMM_ILTUCOPY strmm_oltucopy +#define STRMM_ILTNCOPY strmm_oltncopy + +#define STRSM_IUNUCOPY strsm_ounucopy +#define STRSM_IUNNCOPY strsm_ounncopy +#define STRSM_IUTUCOPY strsm_outucopy +#define STRSM_IUTNCOPY strsm_outncopy +#define STRSM_ILNUCOPY strsm_olnucopy +#define STRSM_ILNNCOPY strsm_olnncopy +#define STRSM_ILTUCOPY strsm_oltucopy +#define STRSM_ILTNCOPY strsm_oltncopy +#else +#define STRMM_IUNUCOPY strmm_iunucopy +#define STRMM_IUNNCOPY strmm_iunncopy +#define STRMM_IUTUCOPY strmm_iutucopy +#define STRMM_IUTNCOPY strmm_iutncopy +#define STRMM_ILNUCOPY strmm_ilnucopy +#define STRMM_ILNNCOPY strmm_ilnncopy +#define STRMM_ILTUCOPY strmm_iltucopy +#define STRMM_ILTNCOPY strmm_iltncopy + +#define STRSM_IUNUCOPY strsm_iunucopy +#define STRSM_IUNNCOPY strsm_iunncopy +#define STRSM_IUTUCOPY strsm_iutucopy +#define STRSM_IUTNCOPY strsm_iutncopy +#define STRSM_ILNUCOPY strsm_ilnucopy +#define STRSM_ILNNCOPY strsm_ilnncopy +#define STRSM_ILTUCOPY strsm_iltucopy +#define STRSM_ILTNCOPY strsm_iltncopy +#endif + +#define SGEMM_BETA sgemm_beta + +#define SGEMM_KERNEL sgemm_kernel + +#define STRMM_KERNEL_LN strmm_kernel_LN +#define STRMM_KERNEL_LT strmm_kernel_LT +#define STRMM_KERNEL_LR strmm_kernel_LN +#define STRMM_KERNEL_LC strmm_kernel_LT +#define STRMM_KERNEL_RN strmm_kernel_RN +#define STRMM_KERNEL_RT strmm_kernel_RT +#define STRMM_KERNEL_RR strmm_kernel_RN +#define STRMM_KERNEL_RC strmm_kernel_RT + +#define STRSM_KERNEL_LN strsm_kernel_LN +#define STRSM_KERNEL_LT strsm_kernel_LT +#define STRSM_KERNEL_LR strsm_kernel_LN +#define STRSM_KERNEL_LC strsm_kernel_LT +#define STRSM_KERNEL_RN strsm_kernel_RN +#define STRSM_KERNEL_RT strsm_kernel_RT +#define STRSM_KERNEL_RR strsm_kernel_RN +#define STRSM_KERNEL_RC strsm_kernel_RT + +#define SSYMM_OUTCOPY ssymm_outcopy +#define SSYMM_OLTCOPY ssymm_oltcopy +#if SGEMM_DEFAULT_UNROLL_M == SGEMM_DEFAULT_UNROLL_N +#define SSYMM_IUTCOPY ssymm_outcopy +#define SSYMM_ILTCOPY ssymm_oltcopy +#else +#define SSYMM_IUTCOPY ssymm_iutcopy +#define SSYMM_ILTCOPY ssymm_iltcopy +#endif + +#define SNEG_TCOPY sneg_tcopy +#define SLASWP_NCOPY slaswp_ncopy + +#else + +#define SAMAX_K gotoblas -> samax_k +#define SAMIN_K gotoblas -> samin_k +#define SMAX_K gotoblas -> smax_k +#define SMIN_K gotoblas -> smin_k +#define ISAMAX_K gotoblas -> isamax_k +#define ISAMIN_K gotoblas -> isamin_k +#define ISMAX_K gotoblas -> ismax_k +#define ISMIN_K gotoblas -> ismin_k +#define SASUM_K gotoblas -> sasum_k +#define SAXPYU_K gotoblas -> saxpy_k +#define 
SAXPYC_K gotoblas -> saxpy_k +#define SCOPY_K gotoblas -> scopy_k +#define SDOTU_K gotoblas -> sdot_k +#define SDOTC_K gotoblas -> sdot_k +#define SDSDOT_K gotoblas -> sdot_k +#define DSDOT_K gotoblas -> dsdot_k +#define SNRM2_K gotoblas -> snrm2_k +#define SSCAL_K gotoblas -> sscal_k +#define SSWAP_K gotoblas -> sswap_k +#define SROT_K gotoblas -> srot_k + +#define SGEMV_N gotoblas -> sgemv_n +#define SGEMV_T gotoblas -> sgemv_t +#define SGEMV_R gotoblas -> sgemv_n +#define SGEMV_C gotoblas -> sgemv_t +#define SGEMV_O gotoblas -> sgemv_n +#define SGEMV_U gotoblas -> sgemv_t +#define SGEMV_S gotoblas -> sgemv_n +#define SGEMV_D gotoblas -> sgemv_t + +#define SGERU_K gotoblas -> sger_k +#define SGERC_K gotoblas -> sger_k +#define SGERV_K gotoblas -> sger_k +#define SGERD_K gotoblas -> sger_k + +#define SSYMV_U gotoblas -> ssymv_U +#define SSYMV_L gotoblas -> ssymv_L + +#define SSYMV_THREAD_U ssymv_thread_U +#define SSYMV_THREAD_L ssymv_thread_L + +#define SGEMM_ONCOPY gotoblas -> sgemm_oncopy +#define SGEMM_OTCOPY gotoblas -> sgemm_otcopy +#define SGEMM_INCOPY gotoblas -> sgemm_incopy +#define SGEMM_ITCOPY gotoblas -> sgemm_itcopy + +#define STRMM_OUNUCOPY gotoblas -> strmm_ounucopy +#define STRMM_OUTUCOPY gotoblas -> strmm_outucopy +#define STRMM_OLNUCOPY gotoblas -> strmm_olnucopy +#define STRMM_OLTUCOPY gotoblas -> strmm_oltucopy +#define STRSM_OUNUCOPY gotoblas -> strsm_ounucopy +#define STRSM_OUTUCOPY gotoblas -> strsm_outucopy +#define STRSM_OLNUCOPY gotoblas -> strsm_olnucopy +#define STRSM_OLTUCOPY gotoblas -> strsm_oltucopy + +#define STRMM_IUNUCOPY gotoblas -> strmm_iunucopy +#define STRMM_IUTUCOPY gotoblas -> strmm_iutucopy +#define STRMM_ILNUCOPY gotoblas -> strmm_ilnucopy +#define STRMM_ILTUCOPY gotoblas -> strmm_iltucopy +#define STRSM_IUNUCOPY gotoblas -> strsm_iunucopy +#define STRSM_IUTUCOPY gotoblas -> strsm_iutucopy +#define STRSM_ILNUCOPY gotoblas -> strsm_ilnucopy +#define STRSM_ILTUCOPY gotoblas -> strsm_iltucopy + +#define STRMM_OUNNCOPY gotoblas -> strmm_ounncopy +#define STRMM_OUTNCOPY gotoblas -> strmm_outncopy +#define STRMM_OLNNCOPY gotoblas -> strmm_olnncopy +#define STRMM_OLTNCOPY gotoblas -> strmm_oltncopy +#define STRSM_OUNNCOPY gotoblas -> strsm_ounncopy +#define STRSM_OUTNCOPY gotoblas -> strsm_outncopy +#define STRSM_OLNNCOPY gotoblas -> strsm_olnncopy +#define STRSM_OLTNCOPY gotoblas -> strsm_oltncopy + +#define STRMM_IUNNCOPY gotoblas -> strmm_iunncopy +#define STRMM_IUTNCOPY gotoblas -> strmm_iutncopy +#define STRMM_ILNNCOPY gotoblas -> strmm_ilnncopy +#define STRMM_ILTNCOPY gotoblas -> strmm_iltncopy +#define STRSM_IUNNCOPY gotoblas -> strsm_iunncopy +#define STRSM_IUTNCOPY gotoblas -> strsm_iutncopy +#define STRSM_ILNNCOPY gotoblas -> strsm_ilnncopy +#define STRSM_ILTNCOPY gotoblas -> strsm_iltncopy + +#define SGEMM_BETA gotoblas -> sgemm_beta +#define SGEMM_KERNEL gotoblas -> sgemm_kernel + +#define STRMM_KERNEL_LN gotoblas -> strmm_kernel_LN +#define STRMM_KERNEL_LT gotoblas -> strmm_kernel_LT +#define STRMM_KERNEL_LR gotoblas -> strmm_kernel_LN +#define STRMM_KERNEL_LC gotoblas -> strmm_kernel_LT +#define STRMM_KERNEL_RN gotoblas -> strmm_kernel_RN +#define STRMM_KERNEL_RT gotoblas -> strmm_kernel_RT +#define STRMM_KERNEL_RR gotoblas -> strmm_kernel_RN +#define STRMM_KERNEL_RC gotoblas -> strmm_kernel_RT + +#define STRSM_KERNEL_LN gotoblas -> strsm_kernel_LN +#define STRSM_KERNEL_LT gotoblas -> strsm_kernel_LT +#define STRSM_KERNEL_LR gotoblas -> strsm_kernel_LN +#define STRSM_KERNEL_LC gotoblas -> strsm_kernel_LT +#define STRSM_KERNEL_RN gotoblas 
-> strsm_kernel_RN +#define STRSM_KERNEL_RT gotoblas -> strsm_kernel_RT +#define STRSM_KERNEL_RR gotoblas -> strsm_kernel_RN +#define STRSM_KERNEL_RC gotoblas -> strsm_kernel_RT + +#define SSYMM_IUTCOPY gotoblas -> ssymm_iutcopy +#define SSYMM_ILTCOPY gotoblas -> ssymm_iltcopy +#define SSYMM_OUTCOPY gotoblas -> ssymm_outcopy +#define SSYMM_OLTCOPY gotoblas -> ssymm_oltcopy + +#define SNEG_TCOPY gotoblas -> sneg_tcopy +#define SLASWP_NCOPY gotoblas -> slaswp_ncopy + +#endif + +#define SGEMM_NN sgemm_nn +#define SGEMM_CN sgemm_tn +#define SGEMM_TN sgemm_tn +#define SGEMM_NC sgemm_nt +#define SGEMM_NT sgemm_nt +#define SGEMM_CC sgemm_tt +#define SGEMM_CT sgemm_tt +#define SGEMM_TC sgemm_tt +#define SGEMM_TT sgemm_tt +#define SGEMM_NR sgemm_nn +#define SGEMM_TR sgemm_tn +#define SGEMM_CR sgemm_tn +#define SGEMM_RN sgemm_nn +#define SGEMM_RT sgemm_nt +#define SGEMM_RC sgemm_nt +#define SGEMM_RR sgemm_nn + +#define SSYMM_LU ssymm_LU +#define SSYMM_LL ssymm_LL +#define SSYMM_RU ssymm_RU +#define SSYMM_RL ssymm_RL + +#define SHEMM_LU shemm_LU +#define SHEMM_LL shemm_LL +#define SHEMM_RU shemm_RU +#define SHEMM_RL shemm_RL + +#define SSYRK_UN ssyrk_UN +#define SSYRK_UT ssyrk_UT +#define SSYRK_LN ssyrk_LN +#define SSYRK_LT ssyrk_LT +#define SSYRK_UR ssyrk_UN +#define SSYRK_UC ssyrk_UT +#define SSYRK_LR ssyrk_LN +#define SSYRK_LC ssyrk_LT + +#define SSYRK_KERNEL_U ssyrk_kernel_U +#define SSYRK_KERNEL_L ssyrk_kernel_L + +#define SHERK_UN ssyrk_UN +#define SHERK_LN ssyrk_LN +#define SHERK_UC ssyrk_UT +#define SHERK_LC ssyrk_LT + +#define SHER2K_UN ssyr2k_UN +#define SHER2K_LN ssyr2k_LN +#define SHER2K_UC ssyr2k_UT +#define SHER2K_LC ssyr2k_LT + +#define SSYR2K_UN ssyr2k_UN +#define SSYR2K_UT ssyr2k_UT +#define SSYR2K_LN ssyr2k_LN +#define SSYR2K_LT ssyr2k_LT +#define SSYR2K_UR ssyr2k_UN +#define SSYR2K_UC ssyr2k_UT +#define SSYR2K_LR ssyr2k_LN +#define SSYR2K_LC ssyr2k_LT + +#define SSYR2K_KERNEL_U ssyr2k_kernel_U +#define SSYR2K_KERNEL_L ssyr2k_kernel_L + +#define STRMM_LNUU strmm_LNUU +#define STRMM_LNUN strmm_LNUN +#define STRMM_LNLU strmm_LNLU +#define STRMM_LNLN strmm_LNLN +#define STRMM_LTUU strmm_LTUU +#define STRMM_LTUN strmm_LTUN +#define STRMM_LTLU strmm_LTLU +#define STRMM_LTLN strmm_LTLN +#define STRMM_LRUU strmm_LNUU +#define STRMM_LRUN strmm_LNUN +#define STRMM_LRLU strmm_LNLU +#define STRMM_LRLN strmm_LNLN +#define STRMM_LCUU strmm_LTUU +#define STRMM_LCUN strmm_LTUN +#define STRMM_LCLU strmm_LTLU +#define STRMM_LCLN strmm_LTLN +#define STRMM_RNUU strmm_RNUU +#define STRMM_RNUN strmm_RNUN +#define STRMM_RNLU strmm_RNLU +#define STRMM_RNLN strmm_RNLN +#define STRMM_RTUU strmm_RTUU +#define STRMM_RTUN strmm_RTUN +#define STRMM_RTLU strmm_RTLU +#define STRMM_RTLN strmm_RTLN +#define STRMM_RRUU strmm_RNUU +#define STRMM_RRUN strmm_RNUN +#define STRMM_RRLU strmm_RNLU +#define STRMM_RRLN strmm_RNLN +#define STRMM_RCUU strmm_RTUU +#define STRMM_RCUN strmm_RTUN +#define STRMM_RCLU strmm_RTLU +#define STRMM_RCLN strmm_RTLN + +#define STRSM_LNUU strsm_LNUU +#define STRSM_LNUN strsm_LNUN +#define STRSM_LNLU strsm_LNLU +#define STRSM_LNLN strsm_LNLN +#define STRSM_LTUU strsm_LTUU +#define STRSM_LTUN strsm_LTUN +#define STRSM_LTLU strsm_LTLU +#define STRSM_LTLN strsm_LTLN +#define STRSM_LRUU strsm_LNUU +#define STRSM_LRUN strsm_LNUN +#define STRSM_LRLU strsm_LNLU +#define STRSM_LRLN strsm_LNLN +#define STRSM_LCUU strsm_LTUU +#define STRSM_LCUN strsm_LTUN +#define STRSM_LCLU strsm_LTLU +#define STRSM_LCLN strsm_LTLN +#define STRSM_RNUU strsm_RNUU +#define STRSM_RNUN strsm_RNUN +#define STRSM_RNLU 
strsm_RNLU +#define STRSM_RNLN strsm_RNLN +#define STRSM_RTUU strsm_RTUU +#define STRSM_RTUN strsm_RTUN +#define STRSM_RTLU strsm_RTLU +#define STRSM_RTLN strsm_RTLN +#define STRSM_RRUU strsm_RNUU +#define STRSM_RRUN strsm_RNUN +#define STRSM_RRLU strsm_RNLU +#define STRSM_RRLN strsm_RNLN +#define STRSM_RCUU strsm_RTUU +#define STRSM_RCUN strsm_RTUN +#define STRSM_RCLU strsm_RTLU +#define STRSM_RCLN strsm_RTLN + +#define SGEMM_THREAD_NN sgemm_thread_nn +#define SGEMM_THREAD_CN sgemm_thread_tn +#define SGEMM_THREAD_TN sgemm_thread_tn +#define SGEMM_THREAD_NC sgemm_thread_nt +#define SGEMM_THREAD_NT sgemm_thread_nt +#define SGEMM_THREAD_CC sgemm_thread_tt +#define SGEMM_THREAD_CT sgemm_thread_tt +#define SGEMM_THREAD_TC sgemm_thread_tt +#define SGEMM_THREAD_TT sgemm_thread_tt +#define SGEMM_THREAD_NR sgemm_thread_nn +#define SGEMM_THREAD_TR sgemm_thread_tn +#define SGEMM_THREAD_CR sgemm_thread_tn +#define SGEMM_THREAD_RN sgemm_thread_nn +#define SGEMM_THREAD_RT sgemm_thread_nt +#define SGEMM_THREAD_RC sgemm_thread_nt +#define SGEMM_THREAD_RR sgemm_thread_nn + +#define SSYMM_THREAD_LU ssymm_thread_LU +#define SSYMM_THREAD_LL ssymm_thread_LL +#define SSYMM_THREAD_RU ssymm_thread_RU +#define SSYMM_THREAD_RL ssymm_thread_RL + +#define SHEMM_THREAD_LU shemm_thread_LU +#define SHEMM_THREAD_LL shemm_thread_LL +#define SHEMM_THREAD_RU shemm_thread_RU +#define SHEMM_THREAD_RL shemm_thread_RL + +#define SSYRK_THREAD_UN ssyrk_thread_UN +#define SSYRK_THREAD_UT ssyrk_thread_UT +#define SSYRK_THREAD_LN ssyrk_thread_LN +#define SSYRK_THREAD_LT ssyrk_thread_LT +#define SSYRK_THREAD_UR ssyrk_thread_UN +#define SSYRK_THREAD_UC ssyrk_thread_UT +#define SSYRK_THREAD_LR ssyrk_thread_LN +#define SSYRK_THREAD_LC ssyrk_thread_LT + +#define SHERK_THREAD_UN ssyrk_thread_UN +#define SHERK_THREAD_UT ssyrk_thread_UT +#define SHERK_THREAD_LN ssyrk_thread_LN +#define SHERK_THREAD_LT ssyrk_thread_LT +#define SHERK_THREAD_UR ssyrk_thread_UN +#define SHERK_THREAD_UC ssyrk_thread_UT +#define SHERK_THREAD_LR ssyrk_thread_LN +#define SHERK_THREAD_LC ssyrk_thread_LT + +#endif diff --git a/common_sparc.h b/common_sparc.h new file mode 100644 index 0000000000..35d8bdb5f1 --- /dev/null +++ b/common_sparc.h @@ -0,0 +1,224 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef COMMON_POWER +#define COMMON_POWER + +#define MB __asm__ __volatile__ ("nop") +#define WMB __asm__ __volatile__ ("nop") + +#ifndef ASSEMBLER + +static void __inline blas_lock(volatile unsigned long *address){ + + long int ret = 1; + + do { + while (*address) {YIELDING;}; + + __asm__ __volatile__( + "ldstub [%1], %0" + : "=&r"(ret) + : "r" (address) + : "memory"); + } while (ret); +} + +static __inline unsigned long rpcc(void){ + unsigned long clocks; + + __asm__ __volatile__ ("rd %%tick, %0" : "=r" (clocks)); + + return clocks; +}; + +#ifdef __64BIT__ +#define RPCC64BIT +#endif + +#ifndef __BIG_ENDIAN__ +#define __BIG_ENDIAN__ +#endif + +#ifdef DOUBLE +#define GET_IMAGE(res) __asm__ __volatile__("fmovd %%f2, %0" : "=f"(res) : : "memory") +#else +#define GET_IMAGE(res) __asm__ __volatile__("fmovs %%f1, %0" : "=f"(res) : : "memory") +#endif + +#define GET_IMAGE_CANCEL + +#ifdef SMP +static __inline int blas_quickdivide(blasint x, blasint y){ + return x / y; +} +#endif +#endif + + +#ifdef ASSEMBLER + +#ifndef __64BIT__ +#define STACK_START 128 +#define SAVESP save %sp, -64, %sp +#else +#define STACK_START 2423 +#define SAVESP save %sp, -256, %sp +#endif + +#define NOP or %g1, %g1, %g1 + +#ifdef DOUBLE +#define LDF ldd +#define STF std +#define FADD faddd +#define FMUL fmuld +#define FMOV fmovd +#define FABS fabsd +#define FSUB fsubd +#define FCMP fcmpd +#define FMOVG fmovdg +#define FMOVL fmovdl +#define FSQRT fsqrtd +#define FDIV fdivd +#else +#define LDF ld +#define STF st +#define FADD fadds +#define FMUL fmuls +#define FMOV fmovs +#define FABS fabss +#define FSUB fsubs +#define FCMP fcmps +#define FMOVG fmovsg +#define FMOVL fmovsl +#define FSQRT fsqrts +#define FDIV fdivs +#endif + +#define HALT prefetch [%g0], 5 + +#define FMADDS(rs1, rs2, rs3, rd) \ + .word ((2 << 30) | ((rd) << 25) | ( 0x37 << 19) | ((rs1) << 14) | ((rs3) << 9) | ( 1 << 5) | (rs2)) + +#define FMADDD(rs1, rs2, rs3, rd) \ + .word ((2 << 30) | ((rd) << 25) | ( 0x37 << 19) | ((rs1) << 14) | ((rs3) << 9) | ( 2 << 5) | (rs2)) + +#define FMSUBS(rs1, rs2, rs3, rd) \ + .word ((2 << 30) | ((rd) << 25) | ( 0x37 << 19) | ((rs1) << 14) | ((rs3) << 9) | ( 5 << 5) | (rs2)) + +#define FMSUBD(rs1, rs2, rs3, rd) \ + .word ((2 << 30) | ((rd) << 25) | ( 0x37 << 19) | ((rs1) << 14) | ((rs3) << 9) | ( 6 << 5) | (rs2)) + +#define FNMSUBS(rs1, rs2, rs3, rd) \ + .word ((2 << 30) | ((rd) << 25) | ( 0x37 << 19) | ((rs1) << 14) | ((rs3) << 9) | ( 9 << 5) | (rs2)) + +#define FNMSUBD(rs1, rs2, rs3, rd) \ + .word ((2 << 30) | ((rd) << 25) | ( 0x37 << 19) | ((rs1) << 14) | ((rs3) << 9) | (10 << 5) | (rs2)) + +#define 
FNMADDS(rs1, rs2, rs3, rd) \ + .word ((2 << 30) | ((rd) << 25) | ( 0x37 << 19) | ((rs1) << 14) | ((rs3) << 9) | (13 << 5) | (rs2)) + +#define FNMADDD(rs1, rs2, rs3, rd) \ + .word ((2 << 30) | ((rd) << 25) | ( 0x37 << 19) | ((rs1) << 14) | ((rs3) << 9) | (14 << 5) | (rs2)) + +#define FCLRS(rd) \ + .word ((2 << 30) | ((rd) << 25) | ( 0x36 << 19) | ( 0x61 << 5)) + +#define FCLRD(rd) \ + .word ((2 << 30) | ((rd) << 25) | ( 0x36 << 19) | ( 0x60 << 5)) + +#define FONES(rd) \ + .word ((2 << 30) | ((rd) << 25) | ( 0x36 << 19) | ( 0x7f << 5)) + +#define FONED(rd) \ + .word ((2 << 30) | ((rd) << 25) | ( 0x36 << 19) | ( 0x7e << 5)) + +#ifndef DOUBLE +#define FCLR(a) FCLRS(a) +#define FONE(a) FONES(a) +#define FMADD(a, b, c, d) FMADDS(a, b, c, d) +#define FMSUB(a, b, c, d) FMSUBS(a, b, c, d) +#define FNMADD(a, b, c, d) FNMADDS(a, b, c, d) +#define FNMSUB(a, b, c, d) FNMSUBS(a, b, c, d) +#else +#define FCLR(a) FCLRD(a) +#define FONE(a) FONED(a) +#define FMADD(a, b, c, d) FMADDD(a, b, c, d) +#define FMSUB(a, b, c, d) FMSUBD(a, b, c, d) +#define FNMADD(a, b, c, d) FNMADDD(a, b, c, d) +#define FNMSUB(a, b, c, d) FNMSUBD(a, b, c, d) +#endif + +#ifndef F_INTERFACE +#define REALNAME ASMNAME +#else +#define REALNAME ASMFNAME +#endif + +#ifdef sparc +#define PROLOGUE \ + .section ".text"; \ + .align 32; \ + .global REALNAME;\ + .type REALNAME, #function; \ + .proc 07; \ +REALNAME:; +#define EPILOGUE \ + .size REALNAME, .-REALNAME +#endif + +#endif + +#ifdef sparc +#define SEEK_ADDRESS +#endif + +#define BUFFER_SIZE (32 << 20) + +#ifndef PAGESIZE +#define PAGESIZE ( 8 << 10) +#endif +#define HUGE_PAGESIZE ( 4 << 20) + +#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) + +#ifndef MAP_ANONYMOUS +#define MAP_ANONYMOUS MAP_ANON +#endif +#endif diff --git a/common_thread.h b/common_thread.h new file mode 100644 index 0000000000..d74af3287e --- /dev/null +++ b/common_thread.h @@ -0,0 +1,192 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef COMMON_THREAD +#define COMMON_THREAD + +/* Basic Thread Debugging */ +#undef SMP_DEBUG + +/* Thread Timing Debugging */ +#undef TIMING_DEBUG + +/* Global Parameter */ +extern int blas_cpu_number; +extern int blas_num_threads; +extern int blas_omp_linked; + +#define BLAS_LEGACY 0x8000U +#define BLAS_PTHREAD 0x4000U +#define BLAS_NODE 0x2000U + +#define BLAS_PREC 0x0003U +#define BLAS_SINGLE 0x0000U +#define BLAS_DOUBLE 0x0001U +#define BLAS_XDOUBLE 0x0002U +#define BLAS_REAL 0x0000U +#define BLAS_COMPLEX 0x0004U + +#define BLAS_TRANSA 0x0030U /* 2bit */ +#define BLAS_TRANSA_N 0x0000U +#define BLAS_TRANSA_T 0x0010U +#define BLAS_TRANSA_R 0x0020U +#define BLAS_TRANSA_C 0x0030U +#define BLAS_TRANSA_SHIFT 4 + +#define BLAS_TRANSB 0x0300U /* 2bit */ +#define BLAS_TRANSB_N 0x0000U +#define BLAS_TRANSB_T 0x0100U +#define BLAS_TRANSB_R 0x0200U +#define BLAS_TRANSB_C 0x0300U +#define BLAS_TRANSB_SHIFT 8 + +#define BLAS_RSIDE 0x0400U +#define BLAS_RSIDE_SHIFT 10 +#define BLAS_UPLO 0x0800U +#define BLAS_UPLO_SHIFT 11 + +#define BLAS_STATUS_NOTYET 0 +#define BLAS_STATUS_QUEUED 1 +#define BLAS_STATUS_RUNNING 2 +#define BLAS_STATUS_FINISHED 4 + +typedef struct blas_queue { + + void *routine; + BLASLONG position; + BLASLONG assigned; + + blas_arg_t *args; + void *range_m; + void *range_n; + void *sa, *sb; + + struct blas_queue *next; + +#if defined( __WIN32__) || defined(__CYGWIN32__) + CRITICAL_SECTION lock; + HANDLE finish; +#else + pthread_mutex_t lock; + pthread_cond_t finished; +#endif + + int mode, status; + +#ifdef CONSISTENT_FPCSR + unsigned int sse_mode, x87_mode; +#endif + +#ifdef SMP_DEBUG + int num; +#endif +#ifdef TIMING_DEBUG + unsigned int clocks; +#endif +} blas_queue_t; + +#ifdef SMP_SERVER + +extern int blas_server_avail; + +static __inline int num_cpu_avail(int level) { + + if ((blas_cpu_number == 1) + +#ifdef USE_OPENMP + || omp_in_parallel() +#endif + ) return 1; + + return blas_cpu_number; + +} + +static __inline void blas_queue_init(blas_queue_t *queue){ + + queue -> sa = NULL; + queue -> sb = NULL; + queue-> next = NULL; +} + +int blas_thread_init(void); +int BLASFUNC(blas_thread_shutdown)(void); +int exec_blas(BLASLONG, blas_queue_t *); +int exec_blas_async(BLASLONG, blas_queue_t *); +int exec_blas_async_wait(BLASLONG, blas_queue_t *); + +#else +int exec_blas_async(BLASLONG num_cpu, blas_param_t *param, pthread_t *); +int exec_blas_async_wait(BLASLONG num_cpu, pthread_t *blas_threads); +int exec_blas(BLASLONG num_cpu, blas_param_t *param, void *buffer); +#endif + +#ifndef ASSEMBLER + +int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, + void *a, BLASLONG lda, + void *b, BLASLONG ldb, + void *c, BLASLONG ldc, int (*function)(), int threads); + +int gemm_thread_m (int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG); + +int gemm_thread_n (int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG); + +int gemm_thread_mn(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG); + +int gemm_thread_variable(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG, 
BLASLONG); + +int trsm_thread(int mode, BLASLONG m, BLASLONG n, + double alpha_r, double alpha_i, + void *a, BLASLONG lda, + void *c, BLASLONG ldc, int (*function)(), void *buffer); + +int syrk_thread(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG); + +int beta_thread(int mode, BLASLONG m, BLASLONG n, + double alpha_r, double alpha_i, + void *c, BLASLONG ldc, int (*fuction)()); + +int getrf_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, + void *offsetA, BLASLONG lda, + void *offsetB, BLASLONG jb, + void *ipiv, BLASLONG offset, int (*function)(), void *buffer); + +#endif /* ENDIF ASSEMBLER */ + +#endif diff --git a/common_x.h b/common_x.h new file mode 100644 index 0000000000..03b98db4f4 --- /dev/null +++ b/common_x.h @@ -0,0 +1,611 @@ +#ifndef COMMON_X_H +#define COMMON_X_H + +#ifndef DYNAMIC_ARCH + +#define XAMAX_K xamax_k +#define XAMIN_K xamin_k +#define XMAX_K xmax_k +#define XMIN_K xmin_k +#define IXAMAX_K ixamax_k +#define IXAMIN_K ixamin_k +#define IXMAX_K ixmax_k +#define IXMIN_K ixmin_k +#define XASUM_K xasum_k +#define XAXPYU_K xaxpy_k +#define XAXPYC_K xaxpyc_k +#define XCOPY_K xcopy_k +#define XDOTU_K xdotu_k +#define XDOTC_K xdotc_k +#define XNRM2_K xnrm2_k +#define XSCAL_K xscal_k +#define XSWAP_K xswap_k +#define XROT_K xqrot_k + +#define XGEMV_N xgemv_n +#define XGEMV_T xgemv_t +#define XGEMV_R xgemv_r +#define XGEMV_C xgemv_c +#define XGEMV_O xgemv_o +#define XGEMV_U xgemv_u +#define XGEMV_S xgemv_s +#define XGEMV_D xgemv_d + +#define XGERU_K xgeru_k +#define XGERC_K xgerc_k +#define XGERV_K xgerv_k +#define XGERD_K xgerd_k + +#define XSYMV_U xsymv_U +#define XSYMV_L xsymv_L +#define XHEMV_U xhemv_U +#define XHEMV_L xhemv_L +#define XHEMV_V xhemv_V +#define XHEMV_M xhemv_M + +#define XSYMV_THREAD_U xsymv_thread_U +#define XSYMV_THREAD_L xsymv_thread_L +#define XHEMV_THREAD_U xhemv_thread_U +#define XHEMV_THREAD_L xhemv_thread_L +#define XHEMV_THREAD_V xhemv_thread_V +#define XHEMV_THREAD_M xhemv_thread_M + +#define XGEMM_ONCOPY xgemm_oncopy +#define XGEMM_OTCOPY xgemm_otcopy + +#if XGEMM_DEFAULT_UNROLL_M == XGEMM_DEFAULT_UNROLL_N +#define XGEMM_INCOPY xgemm_oncopy +#define XGEMM_ITCOPY xgemm_otcopy +#else +#define XGEMM_INCOPY xgemm_incopy +#define XGEMM_ITCOPY xgemm_itcopy +#endif + +#define XTRMM_OUNUCOPY xtrmm_ounucopy +#define XTRMM_OUNNCOPY xtrmm_ounncopy +#define XTRMM_OUTUCOPY xtrmm_outucopy +#define XTRMM_OUTNCOPY xtrmm_outncopy +#define XTRMM_OLNUCOPY xtrmm_olnucopy +#define XTRMM_OLNNCOPY xtrmm_olnncopy +#define XTRMM_OLTUCOPY xtrmm_oltucopy +#define XTRMM_OLTNCOPY xtrmm_oltncopy + +#define XTRSM_OUNUCOPY xtrsm_ounucopy +#define XTRSM_OUNNCOPY xtrsm_ounncopy +#define XTRSM_OUTUCOPY xtrsm_outucopy +#define XTRSM_OUTNCOPY xtrsm_outncopy +#define XTRSM_OLNUCOPY xtrsm_olnucopy +#define XTRSM_OLNNCOPY xtrsm_olnncopy +#define XTRSM_OLTUCOPY xtrsm_oltucopy +#define XTRSM_OLTNCOPY xtrsm_oltncopy + +#if XGEMM_DEFAULT_UNROLL_M == XGEMM_DEFAULT_UNROLL_N +#define XTRMM_IUNUCOPY xtrmm_ounucopy +#define XTRMM_IUNNCOPY xtrmm_ounncopy +#define XTRMM_IUTUCOPY xtrmm_outucopy +#define XTRMM_IUTNCOPY xtrmm_outncopy +#define XTRMM_ILNUCOPY xtrmm_olnucopy +#define XTRMM_ILNNCOPY xtrmm_olnncopy +#define XTRMM_ILTUCOPY xtrmm_oltucopy +#define XTRMM_ILTNCOPY xtrmm_oltncopy + +#define XTRSM_IUNUCOPY xtrsm_ounucopy +#define XTRSM_IUNNCOPY xtrsm_ounncopy +#define XTRSM_IUTUCOPY xtrsm_outucopy +#define XTRSM_IUTNCOPY xtrsm_outncopy +#define XTRSM_ILNUCOPY xtrsm_olnucopy +#define XTRSM_ILNNCOPY xtrsm_olnncopy +#define XTRSM_ILTUCOPY 
xtrsm_oltucopy +#define XTRSM_ILTNCOPY xtrsm_oltncopy +#else +#define XTRMM_IUNUCOPY xtrmm_iunucopy +#define XTRMM_IUNNCOPY xtrmm_iunncopy +#define XTRMM_IUTUCOPY xtrmm_iutucopy +#define XTRMM_IUTNCOPY xtrmm_iutncopy +#define XTRMM_ILNUCOPY xtrmm_ilnucopy +#define XTRMM_ILNNCOPY xtrmm_ilnncopy +#define XTRMM_ILTUCOPY xtrmm_iltucopy +#define XTRMM_ILTNCOPY xtrmm_iltncopy + +#define XTRSM_IUNUCOPY xtrsm_iunucopy +#define XTRSM_IUNNCOPY xtrsm_iunncopy +#define XTRSM_IUTUCOPY xtrsm_iutucopy +#define XTRSM_IUTNCOPY xtrsm_iutncopy +#define XTRSM_ILNUCOPY xtrsm_ilnucopy +#define XTRSM_ILNNCOPY xtrsm_ilnncopy +#define XTRSM_ILTUCOPY xtrsm_iltucopy +#define XTRSM_ILTNCOPY xtrsm_iltncopy +#endif + +#define XGEMM_BETA xgemm_beta + +#define XGEMM_KERNEL_N xgemm_kernel_n +#define XGEMM_KERNEL_L xgemm_kernel_l +#define XGEMM_KERNEL_R xgemm_kernel_r +#define XGEMM_KERNEL_B xgemm_kernel_b + +#define XTRMM_KERNEL_LN xtrmm_kernel_LN +#define XTRMM_KERNEL_LT xtrmm_kernel_LT +#define XTRMM_KERNEL_LR xtrmm_kernel_LR +#define XTRMM_KERNEL_LC xtrmm_kernel_LC +#define XTRMM_KERNEL_RN xtrmm_kernel_RN +#define XTRMM_KERNEL_RT xtrmm_kernel_RT +#define XTRMM_KERNEL_RR xtrmm_kernel_RR +#define XTRMM_KERNEL_RC xtrmm_kernel_RC + +#define XTRSM_KERNEL_LN xtrsm_kernel_LN +#define XTRSM_KERNEL_LT xtrsm_kernel_LT +#define XTRSM_KERNEL_LR xtrsm_kernel_LR +#define XTRSM_KERNEL_LC xtrsm_kernel_LC +#define XTRSM_KERNEL_RN xtrsm_kernel_RN +#define XTRSM_KERNEL_RT xtrsm_kernel_RT +#define XTRSM_KERNEL_RR xtrsm_kernel_RR +#define XTRSM_KERNEL_RC xtrsm_kernel_RC + +#define XSYMM_OUTCOPY xsymm_outcopy +#define XSYMM_OLTCOPY xsymm_oltcopy +#if XGEMM_DEFAULT_UNROLL_M == XGEMM_DEFAULT_UNROLL_N +#define XSYMM_IUTCOPY xsymm_outcopy +#define XSYMM_ILTCOPY xsymm_oltcopy +#else +#define XSYMM_IUTCOPY xsymm_iutcopy +#define XSYMM_ILTCOPY xsymm_iltcopy +#endif + +#define XHEMM_OUTCOPY xhemm_outcopy +#define XHEMM_OLTCOPY xhemm_oltcopy +#if XGEMM_DEFAULT_UNROLL_M == XGEMM_DEFAULT_UNROLL_N +#define XHEMM_IUTCOPY xhemm_outcopy +#define XHEMM_ILTCOPY xhemm_oltcopy +#else +#define XHEMM_IUTCOPY xhemm_iutcopy +#define XHEMM_ILTCOPY xhemm_iltcopy +#endif + +#define XGEMM3M_ONCOPYB xgemm3m_oncopyb +#define XGEMM3M_ONCOPYR xgemm3m_oncopyr +#define XGEMM3M_ONCOPYI xgemm3m_oncopyi +#define XGEMM3M_OTCOPYB xgemm3m_otcopyb +#define XGEMM3M_OTCOPYR xgemm3m_otcopyr +#define XGEMM3M_OTCOPYI xgemm3m_otcopyi + +#define XGEMM3M_INCOPYB xgemm3m_incopyb +#define XGEMM3M_INCOPYR xgemm3m_incopyr +#define XGEMM3M_INCOPYI xgemm3m_incopyi +#define XGEMM3M_ITCOPYB xgemm3m_itcopyb +#define XGEMM3M_ITCOPYR xgemm3m_itcopyr +#define XGEMM3M_ITCOPYI xgemm3m_itcopyi + +#define XSYMM3M_ILCOPYB xsymm3m_ilcopyb +#define XSYMM3M_IUCOPYB xsymm3m_iucopyb +#define XSYMM3M_ILCOPYR xsymm3m_ilcopyr +#define XSYMM3M_IUCOPYR xsymm3m_iucopyr +#define XSYMM3M_ILCOPYI xsymm3m_ilcopyi +#define XSYMM3M_IUCOPYI xsymm3m_iucopyi + +#define XSYMM3M_OLCOPYB xsymm3m_olcopyb +#define XSYMM3M_OUCOPYB xsymm3m_oucopyb +#define XSYMM3M_OLCOPYR xsymm3m_olcopyr +#define XSYMM3M_OUCOPYR xsymm3m_oucopyr +#define XSYMM3M_OLCOPYI xsymm3m_olcopyi +#define XSYMM3M_OUCOPYI xsymm3m_oucopyi + +#define XHEMM3M_ILCOPYB xhemm3m_ilcopyb +#define XHEMM3M_IUCOPYB xhemm3m_iucopyb +#define XHEMM3M_ILCOPYR xhemm3m_ilcopyr +#define XHEMM3M_IUCOPYR xhemm3m_iucopyr +#define XHEMM3M_ILCOPYI xhemm3m_ilcopyi +#define XHEMM3M_IUCOPYI xhemm3m_iucopyi + +#define XHEMM3M_OLCOPYB xhemm3m_olcopyb +#define XHEMM3M_OUCOPYB xhemm3m_oucopyb +#define XHEMM3M_OLCOPYR xhemm3m_olcopyr +#define XHEMM3M_OUCOPYR xhemm3m_oucopyr +#define 
XHEMM3M_OLCOPYI xhemm3m_olcopyi +#define XHEMM3M_OUCOPYI xhemm3m_oucopyi + +#define XGEMM3M_KERNEL xgemm3m_kernel + +#define XNEG_TCOPY xneg_tcopy +#define XLASWP_NCOPY xlaswp_ncopy + +#else + +#define XAMAX_K gotoblas -> xamax_k +#define XAMIN_K gotoblas -> xamin_k +#define XMAX_K gotoblas -> xmax_k +#define XMIN_K gotoblas -> xmin_k +#define IXAMAX_K gotoblas -> ixamax_k +#define IXAMIN_K gotoblas -> ixamin_k +#define IXMAX_K gotoblas -> ixmax_k +#define IXMIN_K gotoblas -> ixmin_k +#define XASUM_K gotoblas -> xasum_k +#define XAXPYU_K gotoblas -> xaxpy_k +#define XAXPYC_K gotoblas -> xaxpyc_k +#define XCOPY_K gotoblas -> xcopy_k +#define XDOTU_K gotoblas -> xdotu_k +#define XDOTC_K gotoblas -> xdotc_k +#define XNRM2_K gotoblas -> xnrm2_k +#define XSCAL_K gotoblas -> xscal_k +#define XSWAP_K gotoblas -> xswap_k +#define XROT_K gotoblas -> xqrot_k + +#define XGEMV_N gotoblas -> xgemv_n +#define XGEMV_T gotoblas -> xgemv_t +#define XGEMV_R gotoblas -> xgemv_r +#define XGEMV_C gotoblas -> xgemv_c +#define XGEMV_O gotoblas -> xgemv_o +#define XGEMV_U gotoblas -> xgemv_u +#define XGEMV_S gotoblas -> xgemv_s +#define XGEMV_D gotoblas -> xgemv_d + +#define XGERU_K gotoblas -> xgeru_k +#define XGERC_K gotoblas -> xgerc_k +#define XGERV_K gotoblas -> xgerv_k +#define XGERD_K gotoblas -> xgerd_k + +#define XSYMV_U gotoblas -> xsymv_U +#define XSYMV_L gotoblas -> xsymv_L +#define XHEMV_U gotoblas -> xhemv_U +#define XHEMV_L gotoblas -> xhemv_L +#define XHEMV_V gotoblas -> xhemv_V +#define XHEMV_M gotoblas -> xhemv_M + +#define XSYMV_THREAD_U xsymv_thread_U +#define XSYMV_THREAD_L xsymv_thread_L +#define XHEMV_THREAD_U xhemv_thread_U +#define XHEMV_THREAD_L xhemv_thread_L +#define XHEMV_THREAD_V xhemv_thread_V +#define XHEMV_THREAD_M xhemv_thread_M + +#define XGEMM_ONCOPY gotoblas -> xgemm_oncopy +#define XGEMM_OTCOPY gotoblas -> xgemm_otcopy +#define XGEMM_INCOPY gotoblas -> xgemm_incopy +#define XGEMM_ITCOPY gotoblas -> xgemm_itcopy + +#define XTRMM_OUNUCOPY gotoblas -> xtrmm_ounucopy +#define XTRMM_OUTUCOPY gotoblas -> xtrmm_outucopy +#define XTRMM_OLNUCOPY gotoblas -> xtrmm_olnucopy +#define XTRMM_OLTUCOPY gotoblas -> xtrmm_oltucopy +#define XTRSM_OUNUCOPY gotoblas -> xtrsm_ounucopy +#define XTRSM_OUTUCOPY gotoblas -> xtrsm_outucopy +#define XTRSM_OLNUCOPY gotoblas -> xtrsm_olnucopy +#define XTRSM_OLTUCOPY gotoblas -> xtrsm_oltucopy + +#define XTRMM_IUNUCOPY gotoblas -> xtrmm_iunucopy +#define XTRMM_IUTUCOPY gotoblas -> xtrmm_iutucopy +#define XTRMM_ILNUCOPY gotoblas -> xtrmm_ilnucopy +#define XTRMM_ILTUCOPY gotoblas -> xtrmm_iltucopy +#define XTRSM_IUNUCOPY gotoblas -> xtrsm_iunucopy +#define XTRSM_IUTUCOPY gotoblas -> xtrsm_iutucopy +#define XTRSM_ILNUCOPY gotoblas -> xtrsm_ilnucopy +#define XTRSM_ILTUCOPY gotoblas -> xtrsm_iltucopy + +#define XTRMM_OUNNCOPY gotoblas -> xtrmm_ounncopy +#define XTRMM_OUTNCOPY gotoblas -> xtrmm_outncopy +#define XTRMM_OLNNCOPY gotoblas -> xtrmm_olnncopy +#define XTRMM_OLTNCOPY gotoblas -> xtrmm_oltncopy +#define XTRSM_OUNNCOPY gotoblas -> xtrsm_ounncopy +#define XTRSM_OUTNCOPY gotoblas -> xtrsm_outncopy +#define XTRSM_OLNNCOPY gotoblas -> xtrsm_olnncopy +#define XTRSM_OLTNCOPY gotoblas -> xtrsm_oltncopy + +#define XTRMM_IUNNCOPY gotoblas -> xtrmm_iunncopy +#define XTRMM_IUTNCOPY gotoblas -> xtrmm_iutncopy +#define XTRMM_ILNNCOPY gotoblas -> xtrmm_ilnncopy +#define XTRMM_ILTNCOPY gotoblas -> xtrmm_iltncopy +#define XTRSM_IUNNCOPY gotoblas -> xtrsm_iunncopy +#define XTRSM_IUTNCOPY gotoblas -> xtrsm_iutncopy +#define XTRSM_ILNNCOPY gotoblas -> xtrsm_ilnncopy 
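The packing ("copy") routine names above follow a fixed four-letter code in front of "copy": o/i for which of the two packed operands the routine serves (which is why the earlier unroll #if can substitute one family for the other), u/l for an upper or lower triangular source, n/t for whether the block is transposed while packing, and u/n for a unit or non-unit diagonal. That reading is inferred from the macro tables themselves rather than stated in the headers; a trivial check that the composition matches the names used here:

#include <stdio.h>

int main(void) {
  char operand = 'o';  /* o/i: which packed operand (inferred, see note above) */
  char uplo    = 'u';  /* u/l: upper or lower triangular source */
  char trans   = 'n';  /* n/t: packed as stored or transposed */
  char diag    = 'u';  /* u/n: unit or non-unit diagonal */

  /* Prints "xtrmm_ounucopy", one of the names defined in this header. */
  printf("xtrmm_%c%c%c%ccopy\n", operand, uplo, trans, diag);
  return 0;
}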
+#define XTRSM_ILTNCOPY gotoblas -> xtrsm_iltncopy + +#define XGEMM_BETA gotoblas -> xgemm_beta +#define XGEMM_KERNEL_N gotoblas -> xgemm_kernel_n +#define XGEMM_KERNEL_L gotoblas -> xgemm_kernel_l +#define XGEMM_KERNEL_R gotoblas -> xgemm_kernel_r +#define XGEMM_KERNEL_B gotoblas -> xgemm_kernel_b + +#define XTRMM_KERNEL_LN gotoblas -> xtrmm_kernel_LN +#define XTRMM_KERNEL_LT gotoblas -> xtrmm_kernel_LT +#define XTRMM_KERNEL_LR gotoblas -> xtrmm_kernel_LR +#define XTRMM_KERNEL_LC gotoblas -> xtrmm_kernel_LC +#define XTRMM_KERNEL_RN gotoblas -> xtrmm_kernel_RN +#define XTRMM_KERNEL_RT gotoblas -> xtrmm_kernel_RT +#define XTRMM_KERNEL_RR gotoblas -> xtrmm_kernel_RR +#define XTRMM_KERNEL_RC gotoblas -> xtrmm_kernel_RC + +#define XTRSM_KERNEL_LN gotoblas -> xtrsm_kernel_LN +#define XTRSM_KERNEL_LT gotoblas -> xtrsm_kernel_LT +#define XTRSM_KERNEL_LR gotoblas -> xtrsm_kernel_LR +#define XTRSM_KERNEL_LC gotoblas -> xtrsm_kernel_LC +#define XTRSM_KERNEL_RN gotoblas -> xtrsm_kernel_RN +#define XTRSM_KERNEL_RT gotoblas -> xtrsm_kernel_RT +#define XTRSM_KERNEL_RR gotoblas -> xtrsm_kernel_RR +#define XTRSM_KERNEL_RC gotoblas -> xtrsm_kernel_RC + +#define XSYMM_IUTCOPY gotoblas -> xsymm_iutcopy +#define XSYMM_ILTCOPY gotoblas -> xsymm_iltcopy +#define XSYMM_OUTCOPY gotoblas -> xsymm_outcopy +#define XSYMM_OLTCOPY gotoblas -> xsymm_oltcopy + +#define XHEMM_OUTCOPY gotoblas -> xhemm_outcopy +#define XHEMM_OLTCOPY gotoblas -> xhemm_oltcopy +#define XHEMM_IUTCOPY gotoblas -> xhemm_iutcopy +#define XHEMM_ILTCOPY gotoblas -> xhemm_iltcopy + +#define XGEMM3M_ONCOPYB gotoblas -> xgemm3m_oncopyb +#define XGEMM3M_ONCOPYR gotoblas -> xgemm3m_oncopyr +#define XGEMM3M_ONCOPYI gotoblas -> xgemm3m_oncopyi +#define XGEMM3M_OTCOPYB gotoblas -> xgemm3m_otcopyb +#define XGEMM3M_OTCOPYR gotoblas -> xgemm3m_otcopyr +#define XGEMM3M_OTCOPYI gotoblas -> xgemm3m_otcopyi + +#define XGEMM3M_INCOPYB gotoblas -> xgemm3m_incopyb +#define XGEMM3M_INCOPYR gotoblas -> xgemm3m_incopyr +#define XGEMM3M_INCOPYI gotoblas -> xgemm3m_incopyi +#define XGEMM3M_ITCOPYB gotoblas -> xgemm3m_itcopyb +#define XGEMM3M_ITCOPYR gotoblas -> xgemm3m_itcopyr +#define XGEMM3M_ITCOPYI gotoblas -> xgemm3m_itcopyi + +#define XSYMM3M_ILCOPYB gotoblas -> xsymm3m_ilcopyb +#define XSYMM3M_IUCOPYB gotoblas -> xsymm3m_iucopyb +#define XSYMM3M_ILCOPYR gotoblas -> xsymm3m_ilcopyr +#define XSYMM3M_IUCOPYR gotoblas -> xsymm3m_iucopyr +#define XSYMM3M_ILCOPYI gotoblas -> xsymm3m_ilcopyi +#define XSYMM3M_IUCOPYI gotoblas -> xsymm3m_iucopyi + +#define XSYMM3M_OLCOPYB gotoblas -> xsymm3m_olcopyb +#define XSYMM3M_OUCOPYB gotoblas -> xsymm3m_oucopyb +#define XSYMM3M_OLCOPYR gotoblas -> xsymm3m_olcopyr +#define XSYMM3M_OUCOPYR gotoblas -> xsymm3m_oucopyr +#define XSYMM3M_OLCOPYI gotoblas -> xsymm3m_olcopyi +#define XSYMM3M_OUCOPYI gotoblas -> xsymm3m_oucopyi + +#define XHEMM3M_ILCOPYB gotoblas -> xhemm3m_ilcopyb +#define XHEMM3M_IUCOPYB gotoblas -> xhemm3m_iucopyb +#define XHEMM3M_ILCOPYR gotoblas -> xhemm3m_ilcopyr +#define XHEMM3M_IUCOPYR gotoblas -> xhemm3m_iucopyr +#define XHEMM3M_ILCOPYI gotoblas -> xhemm3m_ilcopyi +#define XHEMM3M_IUCOPYI gotoblas -> xhemm3m_iucopyi + +#define XHEMM3M_OLCOPYB gotoblas -> xhemm3m_olcopyb +#define XHEMM3M_OUCOPYB gotoblas -> xhemm3m_oucopyb +#define XHEMM3M_OLCOPYR gotoblas -> xhemm3m_olcopyr +#define XHEMM3M_OUCOPYR gotoblas -> xhemm3m_oucopyr +#define XHEMM3M_OLCOPYI gotoblas -> xhemm3m_olcopyi +#define XHEMM3M_OUCOPYI gotoblas -> xhemm3m_oucopyi + +#define XGEMM3M_KERNEL gotoblas -> xgemm3m_kernel + +#define XNEG_TCOPY gotoblas -> 
xneg_tcopy +#define XLASWP_NCOPY gotoblas -> xlaswp_ncopy + +#endif + +#define XGEMM_NN xgemm_nn +#define XGEMM_CN xgemm_cn +#define XGEMM_TN xgemm_tn +#define XGEMM_NC xgemm_nc +#define XGEMM_NT xgemm_nt +#define XGEMM_CC xgemm_cc +#define XGEMM_CT xgemm_ct +#define XGEMM_TC xgemm_tc +#define XGEMM_TT xgemm_tt +#define XGEMM_NR xgemm_nr +#define XGEMM_TR xgemm_tr +#define XGEMM_CR xgemm_cr +#define XGEMM_RN xgemm_rn +#define XGEMM_RT xgemm_rt +#define XGEMM_RC xgemm_rc +#define XGEMM_RR xgemm_rr + +#define XSYMM_LU xsymm_LU +#define XSYMM_LL xsymm_LL +#define XSYMM_RU xsymm_RU +#define XSYMM_RL xsymm_RL + +#define XHEMM_LU xhemm_LU +#define XHEMM_LL xhemm_LL +#define XHEMM_RU xhemm_RU +#define XHEMM_RL xhemm_RL + +#define XSYRK_UN xsyrk_UN +#define XSYRK_UT xsyrk_UT +#define XSYRK_LN xsyrk_LN +#define XSYRK_LT xsyrk_LT +#define XSYRK_UR xsyrk_UN +#define XSYRK_UC xsyrk_UT +#define XSYRK_LR xsyrk_LN +#define XSYRK_LC xsyrk_LT + +#define XSYRK_KERNEL_U xsyrk_kernel_U +#define XSYRK_KERNEL_L xsyrk_kernel_L + +#define XHERK_UN xherk_UN +#define XHERK_LN xherk_LN +#define XHERK_UC xherk_UC +#define XHERK_LC xherk_LC + +#define XHER2K_UN xher2k_UN +#define XHER2K_LN xher2k_LN +#define XHER2K_UC xher2k_UC +#define XHER2K_LC xher2k_LC + +#define XSYR2K_UN xsyr2k_UN +#define XSYR2K_UT xsyr2k_UT +#define XSYR2K_LN xsyr2k_LN +#define XSYR2K_LT xsyr2k_LT +#define XSYR2K_UR xsyr2k_UN +#define XSYR2K_UC xsyr2k_UT +#define XSYR2K_LR xsyr2k_LN +#define XSYR2K_LC xsyr2k_LT + +#define XSYR2K_KERNEL_U xsyr2k_kernel_U +#define XSYR2K_KERNEL_L xsyr2k_kernel_L + +#define XTRMM_LNUU xtrmm_LNUU +#define XTRMM_LNUN xtrmm_LNUN +#define XTRMM_LNLU xtrmm_LNLU +#define XTRMM_LNLN xtrmm_LNLN +#define XTRMM_LTUU xtrmm_LTUU +#define XTRMM_LTUN xtrmm_LTUN +#define XTRMM_LTLU xtrmm_LTLU +#define XTRMM_LTLN xtrmm_LTLN +#define XTRMM_LRUU xtrmm_LRUU +#define XTRMM_LRUN xtrmm_LRUN +#define XTRMM_LRLU xtrmm_LRLU +#define XTRMM_LRLN xtrmm_LRLN +#define XTRMM_LCUU xtrmm_LCUU +#define XTRMM_LCUN xtrmm_LCUN +#define XTRMM_LCLU xtrmm_LCLU +#define XTRMM_LCLN xtrmm_LCLN +#define XTRMM_RNUU xtrmm_RNUU +#define XTRMM_RNUN xtrmm_RNUN +#define XTRMM_RNLU xtrmm_RNLU +#define XTRMM_RNLN xtrmm_RNLN +#define XTRMM_RTUU xtrmm_RTUU +#define XTRMM_RTUN xtrmm_RTUN +#define XTRMM_RTLU xtrmm_RTLU +#define XTRMM_RTLN xtrmm_RTLN +#define XTRMM_RRUU xtrmm_RRUU +#define XTRMM_RRUN xtrmm_RRUN +#define XTRMM_RRLU xtrmm_RRLU +#define XTRMM_RRLN xtrmm_RRLN +#define XTRMM_RCUU xtrmm_RCUU +#define XTRMM_RCUN xtrmm_RCUN +#define XTRMM_RCLU xtrmm_RCLU +#define XTRMM_RCLN xtrmm_RCLN + +#define XTRSM_LNUU xtrsm_LNUU +#define XTRSM_LNUN xtrsm_LNUN +#define XTRSM_LNLU xtrsm_LNLU +#define XTRSM_LNLN xtrsm_LNLN +#define XTRSM_LTUU xtrsm_LTUU +#define XTRSM_LTUN xtrsm_LTUN +#define XTRSM_LTLU xtrsm_LTLU +#define XTRSM_LTLN xtrsm_LTLN +#define XTRSM_LRUU xtrsm_LRUU +#define XTRSM_LRUN xtrsm_LRUN +#define XTRSM_LRLU xtrsm_LRLU +#define XTRSM_LRLN xtrsm_LRLN +#define XTRSM_LCUU xtrsm_LCUU +#define XTRSM_LCUN xtrsm_LCUN +#define XTRSM_LCLU xtrsm_LCLU +#define XTRSM_LCLN xtrsm_LCLN +#define XTRSM_RNUU xtrsm_RNUU +#define XTRSM_RNUN xtrsm_RNUN +#define XTRSM_RNLU xtrsm_RNLU +#define XTRSM_RNLN xtrsm_RNLN +#define XTRSM_RTUU xtrsm_RTUU +#define XTRSM_RTUN xtrsm_RTUN +#define XTRSM_RTLU xtrsm_RTLU +#define XTRSM_RTLN xtrsm_RTLN +#define XTRSM_RRUU xtrsm_RRUU +#define XTRSM_RRUN xtrsm_RRUN +#define XTRSM_RRLU xtrsm_RRLU +#define XTRSM_RRLN xtrsm_RRLN +#define XTRSM_RCUU xtrsm_RCUU +#define XTRSM_RCUN xtrsm_RCUN +#define XTRSM_RCLU xtrsm_RCLU +#define XTRSM_RCLN xtrsm_RCLN 
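Unlike the real-precision headers, the complex extended-precision type keeps genuine conjugate kernels (xgemm_cn, xherk_UC, xtrmm_LRUU and the rest above are distinct symbols, not aliases), and it adds the XGEMM3M_* family: the "3m" scheme forms a complex product from three real multiplications, which is presumably why its pack routines come in B/R/I variants. A scalar sketch of the identity the method rests on, with example values rather than library code:

#include <stdio.h>

int main(void) {
  double ar = 1.5, ai = -2.0, br = 0.5, bi = 3.0;  /* a = ar + ai*i, b = br + bi*i */

  double m1 = ar * br;
  double m2 = ai * bi;
  double m3 = (ar + ai) * (br + bi);

  double re = m1 - m2;        /* real part of a*b */
  double im = m3 - m1 - m2;   /* imaginary part, recovered without a fourth multiply */

  printf("%g + %gi\n", re, im);  /* 6.75 + 3.5i */
  return 0;
}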
+ +#define XGEMM_THREAD_NN xgemm_thread_nn +#define XGEMM_THREAD_CN xgemm_thread_cn +#define XGEMM_THREAD_TN xgemm_thread_tn +#define XGEMM_THREAD_NC xgemm_thread_nc +#define XGEMM_THREAD_NT xgemm_thread_nt +#define XGEMM_THREAD_CC xgemm_thread_cc +#define XGEMM_THREAD_CT xgemm_thread_ct +#define XGEMM_THREAD_TC xgemm_thread_tc +#define XGEMM_THREAD_TT xgemm_thread_tt +#define XGEMM_THREAD_NR xgemm_thread_nr +#define XGEMM_THREAD_TR xgemm_thread_tr +#define XGEMM_THREAD_CR xgemm_thread_cr +#define XGEMM_THREAD_RN xgemm_thread_rn +#define XGEMM_THREAD_RT xgemm_thread_rt +#define XGEMM_THREAD_RC xgemm_thread_rc +#define XGEMM_THREAD_RR xgemm_thread_rr + +#define XSYMM_THREAD_LU xsymm_thread_LU +#define XSYMM_THREAD_LL xsymm_thread_LL +#define XSYMM_THREAD_RU xsymm_thread_RU +#define XSYMM_THREAD_RL xsymm_thread_RL + +#define XHEMM_THREAD_LU xhemm_thread_LU +#define XHEMM_THREAD_LL xhemm_thread_LL +#define XHEMM_THREAD_RU xhemm_thread_RU +#define XHEMM_THREAD_RL xhemm_thread_RL + +#define XSYRK_THREAD_UN xsyrk_thread_UN +#define XSYRK_THREAD_UT xsyrk_thread_UT +#define XSYRK_THREAD_LN xsyrk_thread_LN +#define XSYRK_THREAD_LT xsyrk_thread_LT +#define XSYRK_THREAD_UR xsyrk_thread_UN +#define XSYRK_THREAD_UC xsyrk_thread_UT +#define XSYRK_THREAD_LR xsyrk_thread_LN +#define XSYRK_THREAD_LC xsyrk_thread_LT + +#define XHERK_THREAD_UN xherk_thread_UN +#define XHERK_THREAD_UT xherk_thread_UT +#define XHERK_THREAD_LN xherk_thread_LN +#define XHERK_THREAD_LT xherk_thread_LT +#define XHERK_THREAD_UR xherk_thread_UR +#define XHERK_THREAD_UC xherk_thread_UC +#define XHERK_THREAD_LR xherk_thread_LR +#define XHERK_THREAD_LC xherk_thread_LC + +#define XGEMM3M_NN xgemm3m_nn +#define XGEMM3M_CN xgemm3m_cn +#define XGEMM3M_TN xgemm3m_tn +#define XGEMM3M_NC xgemm3m_nc +#define XGEMM3M_NT xgemm3m_nt +#define XGEMM3M_CC xgemm3m_cc +#define XGEMM3M_CT xgemm3m_ct +#define XGEMM3M_TC xgemm3m_tc +#define XGEMM3M_TT xgemm3m_tt +#define XGEMM3M_NR xgemm3m_nr +#define XGEMM3M_TR xgemm3m_tr +#define XGEMM3M_CR xgemm3m_cr +#define XGEMM3M_RN xgemm3m_rn +#define XGEMM3M_RT xgemm3m_rt +#define XGEMM3M_RC xgemm3m_rc +#define XGEMM3M_RR xgemm3m_rr + +#define XGEMM3M_THREAD_NN xgemm3m_thread_nn +#define XGEMM3M_THREAD_CN xgemm3m_thread_cn +#define XGEMM3M_THREAD_TN xgemm3m_thread_tn +#define XGEMM3M_THREAD_NC xgemm3m_thread_nc +#define XGEMM3M_THREAD_NT xgemm3m_thread_nt +#define XGEMM3M_THREAD_CC xgemm3m_thread_cc +#define XGEMM3M_THREAD_CT xgemm3m_thread_ct +#define XGEMM3M_THREAD_TC xgemm3m_thread_tc +#define XGEMM3M_THREAD_TT xgemm3m_thread_tt +#define XGEMM3M_THREAD_NR xgemm3m_thread_nr +#define XGEMM3M_THREAD_TR xgemm3m_thread_tr +#define XGEMM3M_THREAD_CR xgemm3m_thread_cr +#define XGEMM3M_THREAD_RN xgemm3m_thread_rn +#define XGEMM3M_THREAD_RT xgemm3m_thread_rt +#define XGEMM3M_THREAD_RC xgemm3m_thread_rc +#define XGEMM3M_THREAD_RR xgemm3m_thread_rr + +#define XSYMM3M_LU xsymm3m_LU +#define XSYMM3M_LL xsymm3m_LL +#define XSYMM3M_RU xsymm3m_RU +#define XSYMM3M_RL xsymm3m_RL + +#define XSYMM3M_THREAD_LU xsymm3m_thread_LU +#define XSYMM3M_THREAD_LL xsymm3m_thread_LL +#define XSYMM3M_THREAD_RU xsymm3m_thread_RU +#define XSYMM3M_THREAD_RL xsymm3m_thread_RL + +#define XHEMM3M_LU xhemm3m_LU +#define XHEMM3M_LL xhemm3m_LL +#define XHEMM3M_RU xhemm3m_RU +#define XHEMM3M_RL xhemm3m_RL + +#define XHEMM3M_THREAD_LU xhemm3m_thread_LU +#define XHEMM3M_THREAD_LL xhemm3m_thread_LL +#define XHEMM3M_THREAD_RU xhemm3m_thread_RU +#define XHEMM3M_THREAD_RL xhemm3m_thread_RL + +#endif diff --git a/common_x86.h b/common_x86.h new file mode 
100644 index 0000000000..fbb91f8884 --- /dev/null +++ b/common_x86.h @@ -0,0 +1,359 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#ifndef COMMON_X86 +#define COMMON_X86 + +#ifndef ASSEMBLER + +#define MB +#define WMB + +#ifdef C_SUN +#define __asm__ __asm +#define __volatile__ +#endif + +static void __inline blas_lock(volatile BLASULONG *address){ + + int ret; + + do { + while (*address) {YIELDING;}; + + __asm__ __volatile__( + "xchgl %0, %1\n" + : "=r"(ret), "=m"(*address) + : "0"(1), "m"(*address) + : "memory"); + + } while (ret); + +} + +static __inline unsigned long long rpcc(void){ + unsigned int a, d; + + __asm__ __volatile__ ("rdtsc" : "=a" (a), "=d" (d)); + + return ((unsigned long long)a + ((unsigned long long)d << 32)); +}; + +static __inline unsigned long getstackaddr(void){ + unsigned long addr; + + __asm__ __volatile__ ("mov %%esp, %0" + : "=r"(addr) : : "memory"); + + return addr; +}; + + +static __inline long double sqrt_long(long double val) { + long double result; + + __asm__ __volatile__ ("fldt %1\n" + "fsqrt\n" + "fstpt %0\n" : "=m" (result) : "m"(val)); + return result; +} + +#define SQRT(a) sqrt_long(a) + +/* This is due to gcc's bug */ +void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx); + +#define WHEREAMI + +static inline int WhereAmI(void){ + int eax, ebx, ecx, edx; + int apicid; + + cpuid(1, &eax, &ebx, &ecx, &edx); + apicid = BITMASK(ebx, 24, 0xff); + + return apicid; +} + +#ifdef ENABLE_SSE_EXCEPTION + +#define IDEBUG_START \ +{ \ + unsigned int fp_sse_mode, new_fp_mode; \ + __asm__ __volatile__ ("stmxcsr %0" : "=m" (fp_sse_mode) : ); \ + new_fp_mode = fp_sse_mode & ~0xd00; \ + __asm__ __volatile__ ("ldmxcsr %0" : : "m" (new_fp_mode) ); + +#define IDEBUG_END \ + __asm__ __volatile__ ("ldmxcsr %0" : : "m" (fp_sse_mode) ); \ +} + +#endif + +#ifdef XDOUBLE +#define GET_IMAGE(res) __asm__ __volatile__("fstpt %0" : "=m"(res) : : "memory") +#elif defined(DOUBLE) +#define GET_IMAGE(res) __asm__ __volatile__("fstpl %0" : "=m"(res) : : "memory") +#else +#define GET_IMAGE(res) __asm__ __volatile__("fstps %0" : "=m"(res) : : "memory"); +#endif + +#define GET_IMAGE_CANCEL __asm__ __volatile__ ("ffree %st") + +#ifdef SMP +extern unsigned int blas_quick_divide_table[]; + +static __inline int blas_quickdivide(unsigned int x, unsigned int y){ + + unsigned int result; + + if (y <= 1) return x; + + y = blas_quick_divide_table[y]; + + __asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y)); + + return result; +} +#endif + +#endif + +#ifndef PAGESIZE +#define PAGESIZE ( 4 << 10) +#endif +#define HUGE_PAGESIZE ( 4 << 20) + +#define BUFFER_SIZE (16 << 20) + +#define SEEK_ADDRESS + +#if defined(DOUBLE) || defined(XDOUBLE) +#define MMXLOAD movq +#define MMXSTORE movq +#else +#define MMXLOAD movd +#define MMXSTORE movd +#endif + +#if defined(HAVE_3DNOW) +#define EMMS femms +#elif defined(HAVE_MMX) +#define EMMS emms +#endif + +#ifndef EMMS +#define EMMS +#endif + +#if defined(CORE2) || defined(PENTIUM4) +#define movapd movaps +#endif + +#define BRANCH .byte 0x3e +#define NOBRANCH .byte 0x2e +#define PADDING .byte 0x66; +#define HALT hlt + +#ifndef COMPLEX +#ifdef XDOUBLE +#define LOCAL_BUFFER_SIZE QLOCAL_BUFFER_SIZE +#elif defined DOUBLE +#define LOCAL_BUFFER_SIZE DLOCAL_BUFFER_SIZE +#else +#define LOCAL_BUFFER_SIZE SLOCAL_BUFFER_SIZE +#endif +#else +#ifdef XDOUBLE +#define LOCAL_BUFFER_SIZE XLOCAL_BUFFER_SIZE +#elif defined DOUBLE +#define LOCAL_BUFFER_SIZE ZLOCAL_BUFFER_SIZE +#else +#define LOCAL_BUFFER_SIZE CLOCAL_BUFFER_SIZE +#endif +#endif + +#if defined(OS_WINDOWS) +#if LOCAL_BUFFER_SIZE > 16384 +#define 
STACK_TOUCHING \ + movl $0, 4096 * 4(%esp);\ + movl $0, 4096 * 3(%esp);\ + movl $0, 4096 * 2(%esp);\ + movl $0, 4096 * 1(%esp); +#elif LOCAL_BUFFER_SIZE > 12288 +#define STACK_TOUCHING \ + movl $0, 4096 * 3(%esp);\ + movl $0, 4096 * 2(%esp);\ + movl $0, 4096 * 1(%esp); +#elif LOCAL_BUFFER_SIZE > 8192 +#define STACK_TOUCHING \ + movl $0, 4096 * 2(%esp);\ + movl $0, 4096 * 1(%esp); +#elif LOCAL_BUFFER_SIZE > 4096 +#define STACK_TOUCHING \ + movl $0, 4096 * 1(%esp); +#else +#define STACK_TOUCHING +#endif +#else +#define STACK_TOUCHING +#endif + +#ifndef F_INTERFACE +#define REALNAME ASMNAME +#else +#define REALNAME ASMFNAME +#endif + +#if defined(F_INTERFACE_PATHSCALE) || defined(F_INTERFACE_OPEN64) +#define RETURN_BY_STRUCT +#elif defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) +#define RETURN_BY_COMPLEX +#else +#define RETURN_BY_STACK +#endif + +#ifdef OS_DARWIN +#define PROLOGUE .text;.align 5; .globl REALNAME; REALNAME: +#define EPILOGUE .subsections_via_symbols +#define PROFCODE +#endif + +#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INERIX) +#define SAVEREGISTERS \ + subl $32, %esp;\ + movups %xmm6, 0(%esp);\ + movups %xmm7, 16(%esp) + +#define RESTOREREGISTERS \ + movups 0(%esp), %xmm6;\ + movups 16(%esp), %xmm7;\ + addl $32, %esp +#else +#define SAVEREGISTERS +#define RESTOREREGISTERS +#endif + +#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INERIX) +#define PROLOGUE \ + .text; \ + .align 16; \ + .globl REALNAME ;\ + .def REALNAME;.scl 2;.type 32;.endef; \ +REALNAME: + +#define PROFCODE + +#define EPILOGUE .end REALNAME +#endif + +#if defined(OS_LINUX) || defined(OS_FreeBSD) || defined(OS_NetBSD) || defined(__ELF__) +#define PROLOGUE \ + .text; \ + .align 16; \ + .globl REALNAME ;\ + .type REALNAME, @function; \ +REALNAME: + +#ifdef PROFILE +#define PROFCODE call mcount +#else +#define PROFCODE +#endif + +#define EPILOGUE .size REALNAME, .-REALNAME + +#endif + +#ifdef XDOUBLE +#define FLD fldt +#define FST fstpt +#define FSTU fstt +#define FMUL fmult +#define FADD faddt +#define FSUB fsubt +#define FSUBR fsubrt +#elif defined(DOUBLE) +#define FLD fldl +#define FST fstpl +#define FSTU fstl +#define FMUL fmull +#define FADD faddl +#define FSUB fsubl +#define FSUBR fsubrl +#else +#define FLD flds +#define FST fstps +#define FSTU fsts +#define FMUL fmuls +#define FADD fadds +#define FSUB fsubs +#define FSUBR fsubrs +#endif +#endif + +#ifdef C_SUN +#define ffreep fstp +#endif + +#ifdef __APPLE__ +#define ALIGN_2 .align 2 +#define ALIGN_3 .align 3 +#define ALIGN_4 .align 4 +#define ffreep fstp +#endif + +#ifndef ALIGN_2 +#define ALIGN_2 .align 4 +#endif + +#ifndef ALIGN_3 +#define ALIGN_3 .align 8 +#endif + +#ifndef ALIGN_4 +#define ALIGN_4 .align 16 +#endif + +#ifndef ALIGN_5 +#define ALIGN_5 .align 32 +#endif + +#ifndef ALIGN_6 +#define ALIGN_6 .align 64 +#endif diff --git a/common_x86_64.h b/common_x86_64.h new file mode 100644 index 0000000000..53b702185b --- /dev/null +++ b/common_x86_64.h @@ -0,0 +1,451 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef COMMON_X86 +#define COMMON_X86 + +#ifndef ASSEMBLER + +#ifdef C_SUN +#define __asm__ __asm +#define __volatile__ +#endif + +#ifdef HAVE_SSE2 +#define MB __asm__ __volatile__ ("mfence"); +#define WMB __asm__ __volatile__ ("sfence"); +#else +#define MB +#define WMB +#endif + +static void __inline blas_lock(volatile BLASULONG *address){ + + int ret; + + do { + while (*address) {YIELDING;}; + + __asm__ __volatile__( + "xchgl %0, %1\n" + : "=r"(ret), "=m"(*address) + : "0"(1), "m"(*address) + : "memory"); + + } while (ret); +} + +static __inline BLASULONG rpcc(void){ + BLASULONG a, d; + + __asm__ __volatile__ ("rdtsc" : "=a" (a), "=d" (d)); + + return ((BLASULONG)a + ((BLASULONG)d << 32)); +} + +#define RPCC64BIT + +static __inline BLASULONG getstackaddr(void){ + BLASULONG addr; + + __asm__ __volatile__ ("movq %%rsp, %0" + : "=r"(addr) : : "memory"); + + return addr; +} + +static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){ + + __asm__ __volatile__("cpuid" + : "=a" (*eax), + "=b" (*ebx), + "=c" (*ecx), + "=d" (*edx) + : "0" (op)); +} + +#define WHEREAMI + +static inline int WhereAmI(void){ + int eax, ebx, ecx, edx; + int apicid; + + cpuid(1, &eax, &ebx, &ecx, &edx); + apicid = BITMASK(ebx, 24, 0xff); + + return apicid; +} + +#ifdef CORE_BARCELONA +#define IFLUSH gotoblas_iflush() +#define IFLUSH_HALF gotoblas_iflush_half() +#endif + +#ifdef ENABLE_SSE_EXCEPTION + +#define IDEBUG_START \ +{ \ + unsigned int fp_sse_mode, new_fp_mode; \ + __asm__ __volatile__ ("stmxcsr %0" : "=m" (fp_sse_mode) : ); \ + new_fp_mode = fp_sse_mode & ~0xd00; \ + __asm__ __volatile__ ("ldmxcsr %0" : : "m" (new_fp_mode) ); + +#define IDEBUG_END \ + __asm__ __volatile__ ("ldmxcsr %0" : : "m" (fp_sse_mode) ); \ +} + +#endif + +#ifdef XDOUBLE +#define GET_IMAGE(res) __asm__ __volatile__("fstpt %0" : "=m"(res) : : "memory") +#elif defined(DOUBLE) +#define GET_IMAGE(res) __asm__ __volatile__("movsd %%xmm1, %0" : "=m"(res) : : "memory") +#else +#define GET_IMAGE(res) __asm__ __volatile__("movss %%xmm1, %0" : "=m"(res) : : "memory") +#endif + +#define GET_IMAGE_CANCEL + +#ifdef SMP 
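/*
 * [Editorial note -- illustrative sketch, not part of the imported GotoBLAS source.]
 * The blas_quickdivide() defined just below replaces an integer division x / y
 * with a multiplication by a per-denominator constant looked up in
 * blas_quick_divide_table[], keeping only the upper 32 bits of the 64-bit
 * product; that is exactly what the inline "mull" with its "=d" (EDX) output
 * constraint computes.  A portable C sketch of the same idea follows, under the
 * assumption that each table entry behaves like a rounded 2^32 / y reciprocal
 * (how the table is actually filled is defined elsewhere in this import).
 */
static __inline unsigned int quickdivide_sketch(unsigned int x, unsigned int recip_y) {
  /* high half of the 32x32 -> 64-bit product, i.e. floor(x * recip_y / 2^32) */
  return (unsigned int)(((unsigned long long)x * recip_y) >> 32);
}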
+#ifdef USE64BITINT +static __inline blasint blas_quickdivide(blasint x, blasint y){ + return x / y; +} +#else +extern unsigned int blas_quick_divide_table[]; + +static __inline int blas_quickdivide(unsigned int x, unsigned int y){ + + unsigned int result; + + if (y <= 1) return x; + + y = blas_quick_divide_table[y]; + + __asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y)); + + return result; +} +#endif +#endif + +#endif + +#ifndef PAGESIZE +#define PAGESIZE ( 4 << 10) +#endif +#define HUGE_PAGESIZE ( 2 << 20) + +#define BUFFER_SIZE (32 << 20) + +#define SEEK_ADDRESS + +#ifdef F_INTERFACE_G77 +#define RETURN_BY_STACK +#define NEED_F2CCONV +#endif + +#ifdef F_INTERFACE_G95 +#define RETURN_BY_PACKED +#endif + +#ifdef F_INTERFACE_GFORT +#ifdef OS_WINDOWS +#ifndef DOUBLE +#define RETURN_BY_REGS +#else +#define RETURN_BY_STACK +#endif +#else +#define RETURN_BY_PACKED +#endif +#endif + +#ifdef F_INTERFACE_INTEL +#define RETURN_BY_STACK +#endif + +#ifdef F_INTERFACE_FUJITSU +#define RETURN_BY_STACK +#endif + +#ifdef F_INTERFACE_PGI +#define RETURN_BY_STACK +#endif + +#ifdef F_INTERFACE_PATHSCALE +#define RETURN_BY_PACKED +#endif + +#ifdef F_INTERFACE_SUN +#define RETURN_BY_PACKED +#endif + +#ifdef ASSEMBLER + +#if defined(HAVE_3DNOW) +#define EMMS femms +#elif defined(HAVE_MMX) +#define EMMS emms +#endif + +#ifndef EMMS +#define EMMS +#endif + +#define BRANCH .byte 0x3e +#define NOBRANCH .byte 0x2e +#define PADDING .byte 0x66 + +#ifdef OS_WINDOWS +#define ARG1 %rcx +#define ARG2 %rdx +#define ARG3 %r8 +#define ARG4 %r9 +#else +#define ARG1 %rdi +#define ARG2 %rsi +#define ARG3 %rdx +#define ARG4 %rcx +#define ARG5 %r8 +#define ARG6 %r9 +#endif + +#ifndef COMPLEX +#ifdef XDOUBLE +#define LOCAL_BUFFER_SIZE QLOCAL_BUFFER_SIZE +#elif defined DOUBLE +#define LOCAL_BUFFER_SIZE DLOCAL_BUFFER_SIZE +#else +#define LOCAL_BUFFER_SIZE SLOCAL_BUFFER_SIZE +#endif +#else +#ifdef XDOUBLE +#define LOCAL_BUFFER_SIZE XLOCAL_BUFFER_SIZE +#elif defined DOUBLE +#define LOCAL_BUFFER_SIZE ZLOCAL_BUFFER_SIZE +#else +#define LOCAL_BUFFER_SIZE CLOCAL_BUFFER_SIZE +#endif +#endif + +#if defined(OS_WINDOWS) +#if LOCAL_BUFFER_SIZE > 16384 +#define STACK_TOUCHING \ + movl $0, 4096 * 4(%rsp);\ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif LOCAL_BUFFER_SIZE > 12288 +#define STACK_TOUCHING \ + movl $0, 4096 * 3(%rsp);\ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif LOCAL_BUFFER_SIZE > 8192 +#define STACK_TOUCHING \ + movl $0, 4096 * 2(%rsp);\ + movl $0, 4096 * 1(%rsp); +#elif LOCAL_BUFFER_SIZE > 4096 +#define STACK_TOUCHING \ + movl $0, 4096 * 1(%rsp); +#else +#define STACK_TOUCHING +#endif +#else +#define STACK_TOUCHING +#endif + +#if defined(CORE2) +#define movapd movaps +#define andpd andps +#define movlpd movlps +#define movhpd movhps +#endif + +#ifndef F_INTERFACE +#define REALNAME ASMNAME +#else +#define REALNAME ASMFNAME +#endif + +#ifdef OS_DARWIN +#define PROLOGUE .text;.align 5; .globl REALNAME; REALNAME: +#define EPILOGUE .subsections_via_symbols +#define PROFCODE +#endif + +#ifdef OS_WINDOWS +#define SAVEREGISTERS \ + subq $256, %rsp;\ + movups %xmm6, 0(%rsp);\ + movups %xmm7, 16(%rsp);\ + movups %xmm8, 32(%rsp);\ + movups %xmm9, 48(%rsp);\ + movups %xmm10, 64(%rsp);\ + movups %xmm11, 80(%rsp);\ + movups %xmm12, 96(%rsp);\ + movups %xmm13, 112(%rsp);\ + movups %xmm14, 128(%rsp);\ + movups %xmm15, 144(%rsp) + +#define RESTOREREGISTERS \ + movups 0(%rsp), %xmm6;\ + movups 16(%rsp), %xmm7;\ + movups 32(%rsp), %xmm8;\ + movups 48(%rsp), %xmm9;\ + movups 
64(%rsp), %xmm10;\ + movups 80(%rsp), %xmm11;\ + movups 96(%rsp), %xmm12;\ + movups 112(%rsp), %xmm13;\ + movups 128(%rsp), %xmm14;\ + movups 144(%rsp), %xmm15;\ + addq $256, %rsp +#else +#define SAVEREGISTERS +#define RESTOREREGISTERS +#endif + +#if defined(OS_WINDOWS) && !defined(C_PGI) +#define PROLOGUE \ + .text; \ + .align 16; \ + .globl REALNAME ;\ + .def REALNAME;.scl 2;.type 32;.endef; \ +REALNAME: + +#define PROFCODE + +#define EPILOGUE .end REALNAME +#endif + +#if defined(OS_LINUX) || defined(OS_FreeBSD) || defined(OS_NetBSD) || defined(__ELF__) || defined(C_PGI) +#define PROLOGUE \ + .text; \ + .align 512; \ + .globl REALNAME ;\ + .type REALNAME, @function; \ +REALNAME: + +#ifdef PROFILE +#define PROFCODE call *mcount@GOTPCREL(%rip) +#else +#define PROFCODE +#endif + +#define EPILOGUE .size REALNAME, .-REALNAME + +#endif + +#endif + +#ifdef XDOUBLE +#define FLD fldt +#define FST fstpt +#define MOVQ movq +#elif defined(DOUBLE) +#define FLD fldl +#define FST fstpl +#define FSTU fstl +#define FMUL fmull +#define FADD faddl +#define MOVSD movsd +#define MULSD mulsd +#define MULPD mulpd +#define CMPEQPD cmpeqpd +#define COMISD comisd +#define PSRLQ psrlq +#define ANDPD andpd +#define ADDPD addpd +#define ADDSD addsd +#define SUBPD subpd +#define SUBSD subsd +#define MOVQ movq +#define MOVUPD movupd +#define XORPD xorpd +#else +#define FLD flds +#define FST fstps +#define FSTU fsts +#define FMUL fmuls +#define FADD fadds +#define MOVSD movss +#define MULSD mulss +#define MULPD mulps +#define CMPEQPD cmpeqps +#define COMISD comiss +#define PSRLQ psrld +#define ANDPD andps +#define ADDPD addps +#define ADDSD addss +#define SUBPD subps +#define SUBSD subss +#define MOVQ movd +#define MOVUPD movups +#define XORPD xorps +#endif + +#define HALT hlt + +#ifdef OS_DARWIN +#define ALIGN_2 .align 2 +#define ALIGN_3 .align 3 +#define ALIGN_4 .align 4 +#define ffreep fstp +#endif + +#ifndef ALIGN_2 +#define ALIGN_2 .align 4 +#endif + +#ifndef ALIGN_3 +#define ALIGN_3 .align 8 +#endif + +#ifndef ALIGN_4 +#define ALIGN_4 .align 16 +#endif + +#ifndef ALIGN_5 +#define ALIGN_5 .align 32 +#endif + +#ifndef ALIGN_6 +#define ALIGN_6 .align 64 +#endif + +#endif diff --git a/common_z.h b/common_z.h new file mode 100644 index 0000000000..8832caccb1 --- /dev/null +++ b/common_z.h @@ -0,0 +1,611 @@ +#ifndef COMMON_Z_H +#define COMMON_Z_H + +#ifndef DYNAMIC_ARCH + +#define ZAMAX_K zamax_k +#define ZAMIN_K zamin_k +#define ZMAX_K zmax_k +#define ZMIN_K zmin_k +#define IZAMAX_K izamax_k +#define IZAMIN_K izamin_k +#define IZMAX_K izmax_k +#define IZMIN_K izmin_k +#define ZASUM_K zasum_k +#define ZAXPYU_K zaxpy_k +#define ZAXPYC_K zaxpyc_k +#define ZCOPY_K zcopy_k +#define ZDOTU_K zdotu_k +#define ZDOTC_K zdotc_k +#define ZNRM2_K znrm2_k +#define ZSCAL_K zscal_k +#define ZSWAP_K zswap_k +#define ZROT_K zdrot_k + +#define ZGEMV_N zgemv_n +#define ZGEMV_T zgemv_t +#define ZGEMV_R zgemv_r +#define ZGEMV_C zgemv_c +#define ZGEMV_O zgemv_o +#define ZGEMV_U zgemv_u +#define ZGEMV_S zgemv_s +#define ZGEMV_D zgemv_d + +#define ZGERU_K zgeru_k +#define ZGERC_K zgerc_k +#define ZGERV_K zgerv_k +#define ZGERD_K zgerd_k + +#define ZSYMV_U zsymv_U +#define ZSYMV_L zsymv_L +#define ZHEMV_U zhemv_U +#define ZHEMV_L zhemv_L +#define ZHEMV_V zhemv_V +#define ZHEMV_M zhemv_M + +#define ZSYMV_THREAD_U zsymv_thread_U +#define ZSYMV_THREAD_L zsymv_thread_L +#define ZHEMV_THREAD_U zhemv_thread_U +#define ZHEMV_THREAD_L zhemv_thread_L +#define ZHEMV_THREAD_V zhemv_thread_V +#define ZHEMV_THREAD_M zhemv_thread_M + +#define ZGEMM_ONCOPY 
zgemm_oncopy +#define ZGEMM_OTCOPY zgemm_otcopy + +#if ZGEMM_DEFAULT_UNROLL_M == ZGEMM_DEFAULT_UNROLL_N +#define ZGEMM_INCOPY zgemm_oncopy +#define ZGEMM_ITCOPY zgemm_otcopy +#else +#define ZGEMM_INCOPY zgemm_incopy +#define ZGEMM_ITCOPY zgemm_itcopy +#endif + +#define ZTRMM_OUNUCOPY ztrmm_ounucopy +#define ZTRMM_OUNNCOPY ztrmm_ounncopy +#define ZTRMM_OUTUCOPY ztrmm_outucopy +#define ZTRMM_OUTNCOPY ztrmm_outncopy +#define ZTRMM_OLNUCOPY ztrmm_olnucopy +#define ZTRMM_OLNNCOPY ztrmm_olnncopy +#define ZTRMM_OLTUCOPY ztrmm_oltucopy +#define ZTRMM_OLTNCOPY ztrmm_oltncopy + +#define ZTRSM_OUNUCOPY ztrsm_ounucopy +#define ZTRSM_OUNNCOPY ztrsm_ounncopy +#define ZTRSM_OUTUCOPY ztrsm_outucopy +#define ZTRSM_OUTNCOPY ztrsm_outncopy +#define ZTRSM_OLNUCOPY ztrsm_olnucopy +#define ZTRSM_OLNNCOPY ztrsm_olnncopy +#define ZTRSM_OLTUCOPY ztrsm_oltucopy +#define ZTRSM_OLTNCOPY ztrsm_oltncopy + +#if ZGEMM_DEFAULT_UNROLL_M == ZGEMM_DEFAULT_UNROLL_N +#define ZTRMM_IUNUCOPY ztrmm_ounucopy +#define ZTRMM_IUNNCOPY ztrmm_ounncopy +#define ZTRMM_IUTUCOPY ztrmm_outucopy +#define ZTRMM_IUTNCOPY ztrmm_outncopy +#define ZTRMM_ILNUCOPY ztrmm_olnucopy +#define ZTRMM_ILNNCOPY ztrmm_olnncopy +#define ZTRMM_ILTUCOPY ztrmm_oltucopy +#define ZTRMM_ILTNCOPY ztrmm_oltncopy + +#define ZTRSM_IUNUCOPY ztrsm_ounucopy +#define ZTRSM_IUNNCOPY ztrsm_ounncopy +#define ZTRSM_IUTUCOPY ztrsm_outucopy +#define ZTRSM_IUTNCOPY ztrsm_outncopy +#define ZTRSM_ILNUCOPY ztrsm_olnucopy +#define ZTRSM_ILNNCOPY ztrsm_olnncopy +#define ZTRSM_ILTUCOPY ztrsm_oltucopy +#define ZTRSM_ILTNCOPY ztrsm_oltncopy +#else +#define ZTRMM_IUNUCOPY ztrmm_iunucopy +#define ZTRMM_IUNNCOPY ztrmm_iunncopy +#define ZTRMM_IUTUCOPY ztrmm_iutucopy +#define ZTRMM_IUTNCOPY ztrmm_iutncopy +#define ZTRMM_ILNUCOPY ztrmm_ilnucopy +#define ZTRMM_ILNNCOPY ztrmm_ilnncopy +#define ZTRMM_ILTUCOPY ztrmm_iltucopy +#define ZTRMM_ILTNCOPY ztrmm_iltncopy + +#define ZTRSM_IUNUCOPY ztrsm_iunucopy +#define ZTRSM_IUNNCOPY ztrsm_iunncopy +#define ZTRSM_IUTUCOPY ztrsm_iutucopy +#define ZTRSM_IUTNCOPY ztrsm_iutncopy +#define ZTRSM_ILNUCOPY ztrsm_ilnucopy +#define ZTRSM_ILNNCOPY ztrsm_ilnncopy +#define ZTRSM_ILTUCOPY ztrsm_iltucopy +#define ZTRSM_ILTNCOPY ztrsm_iltncopy +#endif + +#define ZGEMM_BETA zgemm_beta + +#define ZGEMM_KERNEL_N zgemm_kernel_n +#define ZGEMM_KERNEL_L zgemm_kernel_l +#define ZGEMM_KERNEL_R zgemm_kernel_r +#define ZGEMM_KERNEL_B zgemm_kernel_b + +#define ZTRMM_KERNEL_LN ztrmm_kernel_LN +#define ZTRMM_KERNEL_LT ztrmm_kernel_LT +#define ZTRMM_KERNEL_LR ztrmm_kernel_LR +#define ZTRMM_KERNEL_LC ztrmm_kernel_LC +#define ZTRMM_KERNEL_RN ztrmm_kernel_RN +#define ZTRMM_KERNEL_RT ztrmm_kernel_RT +#define ZTRMM_KERNEL_RR ztrmm_kernel_RR +#define ZTRMM_KERNEL_RC ztrmm_kernel_RC + +#define ZTRSM_KERNEL_LN ztrsm_kernel_LN +#define ZTRSM_KERNEL_LT ztrsm_kernel_LT +#define ZTRSM_KERNEL_LR ztrsm_kernel_LR +#define ZTRSM_KERNEL_LC ztrsm_kernel_LC +#define ZTRSM_KERNEL_RN ztrsm_kernel_RN +#define ZTRSM_KERNEL_RT ztrsm_kernel_RT +#define ZTRSM_KERNEL_RR ztrsm_kernel_RR +#define ZTRSM_KERNEL_RC ztrsm_kernel_RC + +#define ZSYMM_OUTCOPY zsymm_outcopy +#define ZSYMM_OLTCOPY zsymm_oltcopy +#if ZGEMM_DEFAULT_UNROLL_M == ZGEMM_DEFAULT_UNROLL_N +#define ZSYMM_IUTCOPY zsymm_outcopy +#define ZSYMM_ILTCOPY zsymm_oltcopy +#else +#define ZSYMM_IUTCOPY zsymm_iutcopy +#define ZSYMM_ILTCOPY zsymm_iltcopy +#endif + +#define ZHEMM_OUTCOPY zhemm_outcopy +#define ZHEMM_OLTCOPY zhemm_oltcopy +#if ZGEMM_DEFAULT_UNROLL_M == ZGEMM_DEFAULT_UNROLL_N +#define ZHEMM_IUTCOPY zhemm_outcopy +#define ZHEMM_ILTCOPY 
zhemm_oltcopy +#else +#define ZHEMM_IUTCOPY zhemm_iutcopy +#define ZHEMM_ILTCOPY zhemm_iltcopy +#endif + +#define ZGEMM3M_ONCOPYB zgemm3m_oncopyb +#define ZGEMM3M_ONCOPYR zgemm3m_oncopyr +#define ZGEMM3M_ONCOPYI zgemm3m_oncopyi +#define ZGEMM3M_OTCOPYB zgemm3m_otcopyb +#define ZGEMM3M_OTCOPYR zgemm3m_otcopyr +#define ZGEMM3M_OTCOPYI zgemm3m_otcopyi + +#define ZGEMM3M_INCOPYB zgemm3m_incopyb +#define ZGEMM3M_INCOPYR zgemm3m_incopyr +#define ZGEMM3M_INCOPYI zgemm3m_incopyi +#define ZGEMM3M_ITCOPYB zgemm3m_itcopyb +#define ZGEMM3M_ITCOPYR zgemm3m_itcopyr +#define ZGEMM3M_ITCOPYI zgemm3m_itcopyi + +#define ZSYMM3M_ILCOPYB zsymm3m_ilcopyb +#define ZSYMM3M_IUCOPYB zsymm3m_iucopyb +#define ZSYMM3M_ILCOPYR zsymm3m_ilcopyr +#define ZSYMM3M_IUCOPYR zsymm3m_iucopyr +#define ZSYMM3M_ILCOPYI zsymm3m_ilcopyi +#define ZSYMM3M_IUCOPYI zsymm3m_iucopyi + +#define ZSYMM3M_OLCOPYB zsymm3m_olcopyb +#define ZSYMM3M_OUCOPYB zsymm3m_oucopyb +#define ZSYMM3M_OLCOPYR zsymm3m_olcopyr +#define ZSYMM3M_OUCOPYR zsymm3m_oucopyr +#define ZSYMM3M_OLCOPYI zsymm3m_olcopyi +#define ZSYMM3M_OUCOPYI zsymm3m_oucopyi + +#define ZHEMM3M_ILCOPYB zhemm3m_ilcopyb +#define ZHEMM3M_IUCOPYB zhemm3m_iucopyb +#define ZHEMM3M_ILCOPYR zhemm3m_ilcopyr +#define ZHEMM3M_IUCOPYR zhemm3m_iucopyr +#define ZHEMM3M_ILCOPYI zhemm3m_ilcopyi +#define ZHEMM3M_IUCOPYI zhemm3m_iucopyi + +#define ZHEMM3M_OLCOPYB zhemm3m_olcopyb +#define ZHEMM3M_OUCOPYB zhemm3m_oucopyb +#define ZHEMM3M_OLCOPYR zhemm3m_olcopyr +#define ZHEMM3M_OUCOPYR zhemm3m_oucopyr +#define ZHEMM3M_OLCOPYI zhemm3m_olcopyi +#define ZHEMM3M_OUCOPYI zhemm3m_oucopyi + +#define ZGEMM3M_KERNEL zgemm3m_kernel + +#define ZNEG_TCOPY zneg_tcopy +#define ZLASWP_NCOPY zlaswp_ncopy + +#else + +#define ZAMAX_K gotoblas -> zamax_k +#define ZAMIN_K gotoblas -> zamin_k +#define ZMAX_K gotoblas -> zmax_k +#define ZMIN_K gotoblas -> zmin_k +#define IZAMAX_K gotoblas -> izamax_k +#define IZAMIN_K gotoblas -> izamin_k +#define IZMAX_K gotoblas -> izmax_k +#define IZMIN_K gotoblas -> izmin_k +#define ZASUM_K gotoblas -> zasum_k +#define ZAXPYU_K gotoblas -> zaxpy_k +#define ZAXPYC_K gotoblas -> zaxpyc_k +#define ZCOPY_K gotoblas -> zcopy_k +#define ZDOTU_K gotoblas -> zdotu_k +#define ZDOTC_K gotoblas -> zdotc_k +#define ZNRM2_K gotoblas -> znrm2_k +#define ZSCAL_K gotoblas -> zscal_k +#define ZSWAP_K gotoblas -> zswap_k +#define ZROT_K gotoblas -> zdrot_k + +#define ZGEMV_N gotoblas -> zgemv_n +#define ZGEMV_T gotoblas -> zgemv_t +#define ZGEMV_R gotoblas -> zgemv_r +#define ZGEMV_C gotoblas -> zgemv_c +#define ZGEMV_O gotoblas -> zgemv_o +#define ZGEMV_U gotoblas -> zgemv_u +#define ZGEMV_S gotoblas -> zgemv_s +#define ZGEMV_D gotoblas -> zgemv_d + +#define ZGERU_K gotoblas -> zgeru_k +#define ZGERC_K gotoblas -> zgerc_k +#define ZGERV_K gotoblas -> zgerv_k +#define ZGERD_K gotoblas -> zgerd_k + +#define ZSYMV_U gotoblas -> zsymv_U +#define ZSYMV_L gotoblas -> zsymv_L +#define ZHEMV_U gotoblas -> zhemv_U +#define ZHEMV_L gotoblas -> zhemv_L +#define ZHEMV_V gotoblas -> zhemv_V +#define ZHEMV_M gotoblas -> zhemv_M + +#define ZSYMV_THREAD_U zsymv_thread_U +#define ZSYMV_THREAD_L zsymv_thread_L +#define ZHEMV_THREAD_U zhemv_thread_U +#define ZHEMV_THREAD_L zhemv_thread_L +#define ZHEMV_THREAD_V zhemv_thread_V +#define ZHEMV_THREAD_M zhemv_thread_M + +#define ZGEMM_ONCOPY gotoblas -> zgemm_oncopy +#define ZGEMM_OTCOPY gotoblas -> zgemm_otcopy +#define ZGEMM_INCOPY gotoblas -> zgemm_incopy +#define ZGEMM_ITCOPY gotoblas -> zgemm_itcopy + +#define ZTRMM_OUNUCOPY gotoblas -> ztrmm_ounucopy +#define ZTRMM_OUTUCOPY 
gotoblas -> ztrmm_outucopy +#define ZTRMM_OLNUCOPY gotoblas -> ztrmm_olnucopy +#define ZTRMM_OLTUCOPY gotoblas -> ztrmm_oltucopy +#define ZTRSM_OUNUCOPY gotoblas -> ztrsm_ounucopy +#define ZTRSM_OUTUCOPY gotoblas -> ztrsm_outucopy +#define ZTRSM_OLNUCOPY gotoblas -> ztrsm_olnucopy +#define ZTRSM_OLTUCOPY gotoblas -> ztrsm_oltucopy + +#define ZTRMM_IUNUCOPY gotoblas -> ztrmm_iunucopy +#define ZTRMM_IUTUCOPY gotoblas -> ztrmm_iutucopy +#define ZTRMM_ILNUCOPY gotoblas -> ztrmm_ilnucopy +#define ZTRMM_ILTUCOPY gotoblas -> ztrmm_iltucopy +#define ZTRSM_IUNUCOPY gotoblas -> ztrsm_iunucopy +#define ZTRSM_IUTUCOPY gotoblas -> ztrsm_iutucopy +#define ZTRSM_ILNUCOPY gotoblas -> ztrsm_ilnucopy +#define ZTRSM_ILTUCOPY gotoblas -> ztrsm_iltucopy + +#define ZTRMM_OUNNCOPY gotoblas -> ztrmm_ounncopy +#define ZTRMM_OUTNCOPY gotoblas -> ztrmm_outncopy +#define ZTRMM_OLNNCOPY gotoblas -> ztrmm_olnncopy +#define ZTRMM_OLTNCOPY gotoblas -> ztrmm_oltncopy +#define ZTRSM_OUNNCOPY gotoblas -> ztrsm_ounncopy +#define ZTRSM_OUTNCOPY gotoblas -> ztrsm_outncopy +#define ZTRSM_OLNNCOPY gotoblas -> ztrsm_olnncopy +#define ZTRSM_OLTNCOPY gotoblas -> ztrsm_oltncopy + +#define ZTRMM_IUNNCOPY gotoblas -> ztrmm_iunncopy +#define ZTRMM_IUTNCOPY gotoblas -> ztrmm_iutncopy +#define ZTRMM_ILNNCOPY gotoblas -> ztrmm_ilnncopy +#define ZTRMM_ILTNCOPY gotoblas -> ztrmm_iltncopy +#define ZTRSM_IUNNCOPY gotoblas -> ztrsm_iunncopy +#define ZTRSM_IUTNCOPY gotoblas -> ztrsm_iutncopy +#define ZTRSM_ILNNCOPY gotoblas -> ztrsm_ilnncopy +#define ZTRSM_ILTNCOPY gotoblas -> ztrsm_iltncopy + +#define ZGEMM_BETA gotoblas -> zgemm_beta +#define ZGEMM_KERNEL_N gotoblas -> zgemm_kernel_n +#define ZGEMM_KERNEL_L gotoblas -> zgemm_kernel_l +#define ZGEMM_KERNEL_R gotoblas -> zgemm_kernel_r +#define ZGEMM_KERNEL_B gotoblas -> zgemm_kernel_b + +#define ZTRMM_KERNEL_LN gotoblas -> ztrmm_kernel_LN +#define ZTRMM_KERNEL_LT gotoblas -> ztrmm_kernel_LT +#define ZTRMM_KERNEL_LR gotoblas -> ztrmm_kernel_LR +#define ZTRMM_KERNEL_LC gotoblas -> ztrmm_kernel_LC +#define ZTRMM_KERNEL_RN gotoblas -> ztrmm_kernel_RN +#define ZTRMM_KERNEL_RT gotoblas -> ztrmm_kernel_RT +#define ZTRMM_KERNEL_RR gotoblas -> ztrmm_kernel_RR +#define ZTRMM_KERNEL_RC gotoblas -> ztrmm_kernel_RC + +#define ZTRSM_KERNEL_LN gotoblas -> ztrsm_kernel_LN +#define ZTRSM_KERNEL_LT gotoblas -> ztrsm_kernel_LT +#define ZTRSM_KERNEL_LR gotoblas -> ztrsm_kernel_LR +#define ZTRSM_KERNEL_LC gotoblas -> ztrsm_kernel_LC +#define ZTRSM_KERNEL_RN gotoblas -> ztrsm_kernel_RN +#define ZTRSM_KERNEL_RT gotoblas -> ztrsm_kernel_RT +#define ZTRSM_KERNEL_RR gotoblas -> ztrsm_kernel_RR +#define ZTRSM_KERNEL_RC gotoblas -> ztrsm_kernel_RC + +#define ZSYMM_IUTCOPY gotoblas -> zsymm_iutcopy +#define ZSYMM_ILTCOPY gotoblas -> zsymm_iltcopy +#define ZSYMM_OUTCOPY gotoblas -> zsymm_outcopy +#define ZSYMM_OLTCOPY gotoblas -> zsymm_oltcopy + +#define ZHEMM_OUTCOPY gotoblas -> zhemm_outcopy +#define ZHEMM_OLTCOPY gotoblas -> zhemm_oltcopy +#define ZHEMM_IUTCOPY gotoblas -> zhemm_iutcopy +#define ZHEMM_ILTCOPY gotoblas -> zhemm_iltcopy + +#define ZGEMM3M_ONCOPYB gotoblas -> zgemm3m_oncopyb +#define ZGEMM3M_ONCOPYR gotoblas -> zgemm3m_oncopyr +#define ZGEMM3M_ONCOPYI gotoblas -> zgemm3m_oncopyi +#define ZGEMM3M_OTCOPYB gotoblas -> zgemm3m_otcopyb +#define ZGEMM3M_OTCOPYR gotoblas -> zgemm3m_otcopyr +#define ZGEMM3M_OTCOPYI gotoblas -> zgemm3m_otcopyi + +#define ZGEMM3M_INCOPYB gotoblas -> zgemm3m_incopyb +#define ZGEMM3M_INCOPYR gotoblas -> zgemm3m_incopyr +#define ZGEMM3M_INCOPYI gotoblas -> zgemm3m_incopyi +#define 
ZGEMM3M_ITCOPYB gotoblas -> zgemm3m_itcopyb +#define ZGEMM3M_ITCOPYR gotoblas -> zgemm3m_itcopyr +#define ZGEMM3M_ITCOPYI gotoblas -> zgemm3m_itcopyi + +#define ZSYMM3M_ILCOPYB gotoblas -> zsymm3m_ilcopyb +#define ZSYMM3M_IUCOPYB gotoblas -> zsymm3m_iucopyb +#define ZSYMM3M_ILCOPYR gotoblas -> zsymm3m_ilcopyr +#define ZSYMM3M_IUCOPYR gotoblas -> zsymm3m_iucopyr +#define ZSYMM3M_ILCOPYI gotoblas -> zsymm3m_ilcopyi +#define ZSYMM3M_IUCOPYI gotoblas -> zsymm3m_iucopyi + +#define ZSYMM3M_OLCOPYB gotoblas -> zsymm3m_olcopyb +#define ZSYMM3M_OUCOPYB gotoblas -> zsymm3m_oucopyb +#define ZSYMM3M_OLCOPYR gotoblas -> zsymm3m_olcopyr +#define ZSYMM3M_OUCOPYR gotoblas -> zsymm3m_oucopyr +#define ZSYMM3M_OLCOPYI gotoblas -> zsymm3m_olcopyi +#define ZSYMM3M_OUCOPYI gotoblas -> zsymm3m_oucopyi + +#define ZHEMM3M_ILCOPYB gotoblas -> zhemm3m_ilcopyb +#define ZHEMM3M_IUCOPYB gotoblas -> zhemm3m_iucopyb +#define ZHEMM3M_ILCOPYR gotoblas -> zhemm3m_ilcopyr +#define ZHEMM3M_IUCOPYR gotoblas -> zhemm3m_iucopyr +#define ZHEMM3M_ILCOPYI gotoblas -> zhemm3m_ilcopyi +#define ZHEMM3M_IUCOPYI gotoblas -> zhemm3m_iucopyi + +#define ZHEMM3M_OLCOPYB gotoblas -> zhemm3m_olcopyb +#define ZHEMM3M_OUCOPYB gotoblas -> zhemm3m_oucopyb +#define ZHEMM3M_OLCOPYR gotoblas -> zhemm3m_olcopyr +#define ZHEMM3M_OUCOPYR gotoblas -> zhemm3m_oucopyr +#define ZHEMM3M_OLCOPYI gotoblas -> zhemm3m_olcopyi +#define ZHEMM3M_OUCOPYI gotoblas -> zhemm3m_oucopyi + +#define ZGEMM3M_KERNEL gotoblas -> zgemm3m_kernel + +#define ZNEG_TCOPY gotoblas -> zneg_tcopy +#define ZLASWP_NCOPY gotoblas -> zlaswp_ncopy + +#endif + +#define ZGEMM_NN zgemm_nn +#define ZGEMM_CN zgemm_cn +#define ZGEMM_TN zgemm_tn +#define ZGEMM_NC zgemm_nc +#define ZGEMM_NT zgemm_nt +#define ZGEMM_CC zgemm_cc +#define ZGEMM_CT zgemm_ct +#define ZGEMM_TC zgemm_tc +#define ZGEMM_TT zgemm_tt +#define ZGEMM_NR zgemm_nr +#define ZGEMM_TR zgemm_tr +#define ZGEMM_CR zgemm_cr +#define ZGEMM_RN zgemm_rn +#define ZGEMM_RT zgemm_rt +#define ZGEMM_RC zgemm_rc +#define ZGEMM_RR zgemm_rr + +#define ZSYMM_LU zsymm_LU +#define ZSYMM_LL zsymm_LL +#define ZSYMM_RU zsymm_RU +#define ZSYMM_RL zsymm_RL + +#define ZHEMM_LU zhemm_LU +#define ZHEMM_LL zhemm_LL +#define ZHEMM_RU zhemm_RU +#define ZHEMM_RL zhemm_RL + +#define ZSYRK_UN zsyrk_UN +#define ZSYRK_UT zsyrk_UT +#define ZSYRK_LN zsyrk_LN +#define ZSYRK_LT zsyrk_LT +#define ZSYRK_UR zsyrk_UN +#define ZSYRK_UC zsyrk_UT +#define ZSYRK_LR zsyrk_LN +#define ZSYRK_LC zsyrk_LT + +#define ZSYRK_KERNEL_U zsyrk_kernel_U +#define ZSYRK_KERNEL_L zsyrk_kernel_L + +#define ZHERK_UN zherk_UN +#define ZHERK_LN zherk_LN +#define ZHERK_UC zherk_UC +#define ZHERK_LC zherk_LC + +#define ZHER2K_UN zher2k_UN +#define ZHER2K_LN zher2k_LN +#define ZHER2K_UC zher2k_UC +#define ZHER2K_LC zher2k_LC + +#define ZSYR2K_UN zsyr2k_UN +#define ZSYR2K_UT zsyr2k_UT +#define ZSYR2K_LN zsyr2k_LN +#define ZSYR2K_LT zsyr2k_LT +#define ZSYR2K_UR zsyr2k_UN +#define ZSYR2K_UC zsyr2k_UT +#define ZSYR2K_LR zsyr2k_LN +#define ZSYR2K_LC zsyr2k_LT + +#define ZSYR2K_KERNEL_U zsyr2k_kernel_U +#define ZSYR2K_KERNEL_L zsyr2k_kernel_L + +#define ZTRMM_LNUU ztrmm_LNUU +#define ZTRMM_LNUN ztrmm_LNUN +#define ZTRMM_LNLU ztrmm_LNLU +#define ZTRMM_LNLN ztrmm_LNLN +#define ZTRMM_LTUU ztrmm_LTUU +#define ZTRMM_LTUN ztrmm_LTUN +#define ZTRMM_LTLU ztrmm_LTLU +#define ZTRMM_LTLN ztrmm_LTLN +#define ZTRMM_LRUU ztrmm_LRUU +#define ZTRMM_LRUN ztrmm_LRUN +#define ZTRMM_LRLU ztrmm_LRLU +#define ZTRMM_LRLN ztrmm_LRLN +#define ZTRMM_LCUU ztrmm_LCUU +#define ZTRMM_LCUN ztrmm_LCUN +#define ZTRMM_LCLU ztrmm_LCLU 
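/*
 * [Editorial note -- not part of the imported GotoBLAS source.]
 * The ZTRMM_/ZTRSM_ entry points above and below appear to encode their variant
 * in a four-letter suffix: side (L = left, R = right), operation on A
 * (N = no transpose, T = transpose, R = conjugate only, C = conjugate
 * transpose), triangle (U = upper, L = lower), and diagonal (U = unit,
 * N = non-unit).  For example, ZTRMM_LCLN maps to ztrmm_LCLN: multiply from
 * the Left by the Conjugate transpose of A, with A Lower triangular and a
 * Non-unit diagonal.  This reading is inferred from the symbol names
 * themselves; it is not stated explicitly in the source.
 */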
+#define ZTRMM_LCLN ztrmm_LCLN +#define ZTRMM_RNUU ztrmm_RNUU +#define ZTRMM_RNUN ztrmm_RNUN +#define ZTRMM_RNLU ztrmm_RNLU +#define ZTRMM_RNLN ztrmm_RNLN +#define ZTRMM_RTUU ztrmm_RTUU +#define ZTRMM_RTUN ztrmm_RTUN +#define ZTRMM_RTLU ztrmm_RTLU +#define ZTRMM_RTLN ztrmm_RTLN +#define ZTRMM_RRUU ztrmm_RRUU +#define ZTRMM_RRUN ztrmm_RRUN +#define ZTRMM_RRLU ztrmm_RRLU +#define ZTRMM_RRLN ztrmm_RRLN +#define ZTRMM_RCUU ztrmm_RCUU +#define ZTRMM_RCUN ztrmm_RCUN +#define ZTRMM_RCLU ztrmm_RCLU +#define ZTRMM_RCLN ztrmm_RCLN + +#define ZTRSM_LNUU ztrsm_LNUU +#define ZTRSM_LNUN ztrsm_LNUN +#define ZTRSM_LNLU ztrsm_LNLU +#define ZTRSM_LNLN ztrsm_LNLN +#define ZTRSM_LTUU ztrsm_LTUU +#define ZTRSM_LTUN ztrsm_LTUN +#define ZTRSM_LTLU ztrsm_LTLU +#define ZTRSM_LTLN ztrsm_LTLN +#define ZTRSM_LRUU ztrsm_LRUU +#define ZTRSM_LRUN ztrsm_LRUN +#define ZTRSM_LRLU ztrsm_LRLU +#define ZTRSM_LRLN ztrsm_LRLN +#define ZTRSM_LCUU ztrsm_LCUU +#define ZTRSM_LCUN ztrsm_LCUN +#define ZTRSM_LCLU ztrsm_LCLU +#define ZTRSM_LCLN ztrsm_LCLN +#define ZTRSM_RNUU ztrsm_RNUU +#define ZTRSM_RNUN ztrsm_RNUN +#define ZTRSM_RNLU ztrsm_RNLU +#define ZTRSM_RNLN ztrsm_RNLN +#define ZTRSM_RTUU ztrsm_RTUU +#define ZTRSM_RTUN ztrsm_RTUN +#define ZTRSM_RTLU ztrsm_RTLU +#define ZTRSM_RTLN ztrsm_RTLN +#define ZTRSM_RRUU ztrsm_RRUU +#define ZTRSM_RRUN ztrsm_RRUN +#define ZTRSM_RRLU ztrsm_RRLU +#define ZTRSM_RRLN ztrsm_RRLN +#define ZTRSM_RCUU ztrsm_RCUU +#define ZTRSM_RCUN ztrsm_RCUN +#define ZTRSM_RCLU ztrsm_RCLU +#define ZTRSM_RCLN ztrsm_RCLN + +#define ZGEMM_THREAD_NN zgemm_thread_nn +#define ZGEMM_THREAD_CN zgemm_thread_cn +#define ZGEMM_THREAD_TN zgemm_thread_tn +#define ZGEMM_THREAD_NC zgemm_thread_nc +#define ZGEMM_THREAD_NT zgemm_thread_nt +#define ZGEMM_THREAD_CC zgemm_thread_cc +#define ZGEMM_THREAD_CT zgemm_thread_ct +#define ZGEMM_THREAD_TC zgemm_thread_tc +#define ZGEMM_THREAD_TT zgemm_thread_tt +#define ZGEMM_THREAD_NR zgemm_thread_nr +#define ZGEMM_THREAD_TR zgemm_thread_tr +#define ZGEMM_THREAD_CR zgemm_thread_cr +#define ZGEMM_THREAD_RN zgemm_thread_rn +#define ZGEMM_THREAD_RT zgemm_thread_rt +#define ZGEMM_THREAD_RC zgemm_thread_rc +#define ZGEMM_THREAD_RR zgemm_thread_rr + +#define ZSYMM_THREAD_LU zsymm_thread_LU +#define ZSYMM_THREAD_LL zsymm_thread_LL +#define ZSYMM_THREAD_RU zsymm_thread_RU +#define ZSYMM_THREAD_RL zsymm_thread_RL + +#define ZHEMM_THREAD_LU zhemm_thread_LU +#define ZHEMM_THREAD_LL zhemm_thread_LL +#define ZHEMM_THREAD_RU zhemm_thread_RU +#define ZHEMM_THREAD_RL zhemm_thread_RL + +#define ZSYRK_THREAD_UN zsyrk_thread_UN +#define ZSYRK_THREAD_UT zsyrk_thread_UT +#define ZSYRK_THREAD_LN zsyrk_thread_LN +#define ZSYRK_THREAD_LT zsyrk_thread_LT +#define ZSYRK_THREAD_UR zsyrk_thread_UN +#define ZSYRK_THREAD_UC zsyrk_thread_UT +#define ZSYRK_THREAD_LR zsyrk_thread_LN +#define ZSYRK_THREAD_LC zsyrk_thread_LT + +#define ZHERK_THREAD_UN zherk_thread_UN +#define ZHERK_THREAD_UT zherk_thread_UT +#define ZHERK_THREAD_LN zherk_thread_LN +#define ZHERK_THREAD_LT zherk_thread_LT +#define ZHERK_THREAD_UR zherk_thread_UR +#define ZHERK_THREAD_UC zherk_thread_UC +#define ZHERK_THREAD_LR zherk_thread_LR +#define ZHERK_THREAD_LC zherk_thread_LC + +#define ZGEMM3M_NN zgemm3m_nn +#define ZGEMM3M_CN zgemm3m_cn +#define ZGEMM3M_TN zgemm3m_tn +#define ZGEMM3M_NC zgemm3m_nc +#define ZGEMM3M_NT zgemm3m_nt +#define ZGEMM3M_CC zgemm3m_cc +#define ZGEMM3M_CT zgemm3m_ct +#define ZGEMM3M_TC zgemm3m_tc +#define ZGEMM3M_TT zgemm3m_tt +#define ZGEMM3M_NR zgemm3m_nr +#define ZGEMM3M_TR zgemm3m_tr +#define ZGEMM3M_CR zgemm3m_cr +#define 
ZGEMM3M_RN zgemm3m_rn +#define ZGEMM3M_RT zgemm3m_rt +#define ZGEMM3M_RC zgemm3m_rc +#define ZGEMM3M_RR zgemm3m_rr + +#define ZGEMM3M_THREAD_NN zgemm3m_thread_nn +#define ZGEMM3M_THREAD_CN zgemm3m_thread_cn +#define ZGEMM3M_THREAD_TN zgemm3m_thread_tn +#define ZGEMM3M_THREAD_NC zgemm3m_thread_nc +#define ZGEMM3M_THREAD_NT zgemm3m_thread_nt +#define ZGEMM3M_THREAD_CC zgemm3m_thread_cc +#define ZGEMM3M_THREAD_CT zgemm3m_thread_ct +#define ZGEMM3M_THREAD_TC zgemm3m_thread_tc +#define ZGEMM3M_THREAD_TT zgemm3m_thread_tt +#define ZGEMM3M_THREAD_NR zgemm3m_thread_nr +#define ZGEMM3M_THREAD_TR zgemm3m_thread_tr +#define ZGEMM3M_THREAD_CR zgemm3m_thread_cr +#define ZGEMM3M_THREAD_RN zgemm3m_thread_rn +#define ZGEMM3M_THREAD_RT zgemm3m_thread_rt +#define ZGEMM3M_THREAD_RC zgemm3m_thread_rc +#define ZGEMM3M_THREAD_RR zgemm3m_thread_rr + +#define ZSYMM3M_LU zsymm3m_LU +#define ZSYMM3M_LL zsymm3m_LL +#define ZSYMM3M_RU zsymm3m_RU +#define ZSYMM3M_RL zsymm3m_RL + +#define ZSYMM3M_THREAD_LU zsymm3m_thread_LU +#define ZSYMM3M_THREAD_LL zsymm3m_thread_LL +#define ZSYMM3M_THREAD_RU zsymm3m_thread_RU +#define ZSYMM3M_THREAD_RL zsymm3m_thread_RL + +#define ZHEMM3M_LU zhemm3m_LU +#define ZHEMM3M_LL zhemm3m_LL +#define ZHEMM3M_RU zhemm3m_RU +#define ZHEMM3M_RL zhemm3m_RL + +#define ZHEMM3M_THREAD_LU zhemm3m_thread_LU +#define ZHEMM3M_THREAD_LL zhemm3m_thread_LL +#define ZHEMM3M_THREAD_RU zhemm3m_thread_RU +#define ZHEMM3M_THREAD_RL zhemm3m_thread_RL + +#endif diff --git a/cpuid.S b/cpuid.S new file mode 100644 index 0000000000..3f7bf5f900 --- /dev/null +++ b/cpuid.S @@ -0,0 +1,67 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#if defined(__APPLE__) && defined(__i386__) + +/* Quick hack for Darwin/x86 */ + + .text + .globl _cpuid +_cpuid: + pushl %esi + pushl %ebx + + movl 12(%esp), %eax + cpuid + + movl 16(%esp), %esi + movl %eax, (%esi) + movl 20(%esp), %esi + movl %ebx, (%esi) + movl 24(%esp), %esi + movl %ecx, (%esi) + movl 28(%esp), %esi + movl %edx, (%esi) + + popl %ebx + popl %esi + ret + + .subsections_via_symbols + +#endif diff --git a/cpuid.h b/cpuid.h new file mode 100644 index 0000000000..665ede0770 --- /dev/null +++ b/cpuid.h @@ -0,0 +1,191 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#ifndef CPUID_H +#define CPUID_H + +#define VENDOR_INTEL 1 +#define VENDOR_UMC 2 +#define VENDOR_AMD 3 +#define VENDOR_CYRIX 4 +#define VENDOR_NEXGEN 5 +#define VENDOR_CENTAUR 6 +#define VENDOR_RISE 7 +#define VENDOR_SIS 8 +#define VENDOR_TRANSMETA 9 +#define VENDOR_NSC 10 +#define VENDOR_UNKNOWN 99 + +#define BITMASK(a, b, c) ((((a) >> (b)) & (c))) + +#define FAMILY_80486 4 +#define FAMILY_P5 5 +#define FAMILY_P6 6 +#define FAMILY_PM 7 +#define FAMILY_IA64 8 + +#if defined(__i386__) || defined(__x86_64__) +#define GET_EXFAMILY 1 +#define GET_EXMODEL 2 +#define GET_TYPE 3 +#define GET_FAMILY 4 +#define GET_MODEL 5 +#define GET_APICID 6 +#define GET_LCOUNT 7 +#define GET_CHUNKS 8 +#define GET_STEPPING 9 +#define GET_BLANDID 10 +#define GET_FEATURE 11 +#define GET_NUMSHARE 12 +#define GET_NUMCORES 13 +#endif + +#ifdef __ia64__ +#define GET_ARCHREV 1 +#define GET_FAMILY 2 +#define GET_MODEL 3 +#define GET_REVISION 4 +#define GET_NUMBER 5 +#endif + +#define CORE_UNKNOWN 0 +#define CORE_80486 1 +#define CORE_P5 2 +#define CORE_P6 3 +#define CORE_KATMAI 4 +#define CORE_COPPERMINE 5 +#define CORE_NORTHWOOD 6 +#define CORE_PRESCOTT 7 +#define CORE_BANIAS 8 +#define CORE_ATHLON 9 +#define CORE_OPTERON 10 +#define CORE_BARCELONA 11 +#define CORE_VIAC3 12 +#define CORE_YONAH 13 +#define CORE_CORE2 14 +#define CORE_PENRYN 15 +#define CORE_DUNNINGTON 16 +#define CORE_NEHALEM 17 +#define CORE_ATOM 18 +#define CORE_NANO 19 + +#define HAVE_SSE (1 << 0) +#define HAVE_SSE2 (1 << 1) +#define HAVE_SSE3 (1 << 2) +#define HAVE_SSSE3 (1 << 3) +#define HAVE_SSE4_1 (1 << 4) +#define HAVE_SSE4_2 (1 << 5) +#define HAVE_SSE4A (1 << 6) +#define HAVE_SSE5 (1 << 7) +#define HAVE_MMX (1 << 8) +#define HAVE_3DNOW (1 << 9) +#define HAVE_3DNOWEX (1 << 10) +#define HAVE_CMOV (1 << 11) +#define HAVE_PSE (1 << 12) +#define HAVE_CFLUSH (1 << 13) +#define HAVE_HIT (1 << 14) +#define HAVE_MISALIGNSSE (1 << 15) +#define HAVE_128BITFPU (1 << 16) +#define HAVE_FASTMOVU (1 << 17) + +#define CACHE_INFO_L1_I 1 +#define CACHE_INFO_L1_D 2 +#define CACHE_INFO_L2 3 +#define CACHE_INFO_L3 4 +#define CACHE_INFO_L1_ITB 5 +#define CACHE_INFO_L1_DTB 6 +#define CACHE_INFO_L1_LITB 7 +#define CACHE_INFO_L1_LDTB 8 +#define CACHE_INFO_L2_ITB 9 +#define CACHE_INFO_L2_DTB 10 +#define CACHE_INFO_L2_LITB 11 +#define CACHE_INFO_L2_LDTB 12 + +typedef struct { + int size; + int associative; + int linesize; + int shared; +} cache_info_t; + +#define CPUTYPE_UNKNOWN 0 +#define CPUTYPE_INTEL_UNKNOWN 1 +#define CPUTYPE_UMC_UNKNOWN 2 +#define CPUTYPE_AMD_UNKNOWN 3 +#define CPUTYPE_CYRIX_UNKNOWN 4 +#define CPUTYPE_NEXGEN_UNKNOWN 5 +#define CPUTYPE_CENTAUR_UNKNOWN 6 +#define CPUTYPE_RISE_UNKNOWN 7 +#define CPUTYPE_SIS_UNKNOWN 8 +#define CPUTYPE_TRANSMETA_UNKNOWN 9 +#define CPUTYPE_NSC_UNKNOWN 10 + +#define CPUTYPE_80386 11 +#define CPUTYPE_80486 12 +#define CPUTYPE_PENTIUM 13 +#define CPUTYPE_PENTIUM2 14 +#define CPUTYPE_PENTIUM3 15 +#define CPUTYPE_PENTIUMM 16 +#define CPUTYPE_PENTIUM4 17 +#define CPUTYPE_CORE2 18 +#define CPUTYPE_PENRYN 19 +#define CPUTYPE_DUNNINGTON 20 +#define CPUTYPE_NEHALEM 21 +#define CPUTYPE_ATOM 22 +#define CPUTYPE_ITANIUM 23 +#define CPUTYPE_ITANIUM2 24 +#define CPUTYPE_AMD5X86 25 +#define CPUTYPE_AMDK6 26 +#define CPUTYPE_ATHLON 27 +#define CPUTYPE_DURON 28 +#define CPUTYPE_OPTERON 29 +#define CPUTYPE_BARCELONA 30 +#define CPUTYPE_SHANGHAI 31 +#define CPUTYPE_ISTANBUL 32 +#define CPUTYPE_CYRIX5X86 33 +#define CPUTYPE_CYRIXM1 34 +#define CPUTYPE_CYRIXM2 35 
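/*
 * [Editorial note -- illustrative usage sketch, not part of the imported
 * GotoBLAS source.]  The BITMASK(a, b, c) helper defined earlier in this
 * header simply shifts right by b bits and masks with c.  The x86 detection
 * code later in this import uses it together with the GET_* selectors to pull
 * fields out of CPUID results; the positions below follow the uses visible in
 * cpuid_x86.c and in the WhereAmI() helpers (for CPUID leaf 1, the extended
 * family sits in bits 27:20 of EAX and the initial APIC id in bits 31:24 of
 * EBX):
 *
 *   int eax, ebx, ecx, edx;
 *   cpuid(1, &eax, &ebx, &ecx, &edx);
 *   int ext_family = BITMASK(eax, 20, 0xff);   // cf. GET_EXFAMILY
 *   int apicid     = BITMASK(ebx, 24, 0xff);   // cf. GET_APICID / WhereAmI()
 */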
+#define CPUTYPE_NEXGENNX586 36 +#define CPUTYPE_CENTAURC6 37 +#define CPUTYPE_RISEMP6 38 +#define CPUTYPE_SYS55X 39 +#define CPUTYPE_CRUSOETM3X 40 +#define CPUTYPE_NSGEODE 41 +#define CPUTYPE_VIAC3 42 +#define CPUTYPE_NANO 43 +#endif diff --git a/cpuid_alpha.c b/cpuid_alpha.c new file mode 100644 index 0000000000..ca786d550d --- /dev/null +++ b/cpuid_alpha.c @@ -0,0 +1,101 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#if defined(__alpha) && defined(__DECC) +#include +#endif + +int implver(void){ + int arch; + +#ifndef __DECC + asm __volatile__("implver %0" : "=r"(arch) : : "memory"); +#else + arch = asm("implver %v0"); +#endif + return arch; +} + +void get_architecture(void){ + printf("ALPHA"); +} + +void get_subarchitecture(void){ + printf("ev%d", implver() + 4); +} + +void get_subdirname(void){ + printf("alpha"); +} + +void get_cpuconfig(void){ + printf("#define EV%d\n", implver() + 4); + + switch (implver()){ + case 0: + printf("#define L1_DATA_SIZE 16384\n"); + printf("#define L1_DATA_LINESIZE 32\n"); + printf("#define L2_SIZE 2097152\n"); + printf("#define L2_LINESIZE 32\n"); + printf("#define DTB_ENTRIES 32\n"); + printf("#define DTB_SIZE 8192\n"); + break; + + case 1: + printf("#define L1_DATA_SIZE 16384\n"); + printf("#define L1_DATA_LINESIZE 32\n"); + printf("#define L2_SIZE 2097152\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define DTB_ENTRIES 64\n"); + printf("#define DTB_SIZE 8192\n"); + break; + + case 2: + printf("#define L1_DATA_SIZE 32768\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L2_SIZE 4194304\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define DTB_ENTRIES 64\n"); + printf("#define DTB_SIZE 8192\n"); + break; + } +} + +void get_libname(void){ + printf("ev%d\n", implver() + 4); +} diff --git a/cpuid_ia64.c b/cpuid_ia64.c new file mode 100644 index 0000000000..7f0fa6d2f9 --- /dev/null +++ b/cpuid_ia64.c @@ -0,0 +1,138 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include +#include +#include "cpuid.h" + +#ifdef __ECC +#include +#endif + +static inline unsigned long cpuid(unsigned long regnum){ + unsigned long value; + +#ifdef __ECC + value = __getIndReg(_IA64_REG_INDR_CPUID, regnum); +#else + asm ("mov %0=cpuid[%r1]" : "=r"(value) : "rO"(regnum)); +#endif + + return value; +} + +int have_cpuid(void){ return 1;} + +int get_vendor(void){ + unsigned long cpuid0, cpuid1; + char vendor[18]; + + cpuid0 = cpuid(0); + cpuid1 = cpuid(1); + + *(unsigned long *)(&vendor[0]) = cpuid0; + *(unsigned long *)(&vendor[8]) = cpuid1; + vendor[17] = (char)0; + + if (!strcmp(vendor, "GenuineIntel")) return VENDOR_INTEL; + + return VENDOR_UNKNOWN; +} + +int get_cputype(int gettype){ + unsigned long cpuid3; + + cpuid3 = cpuid(3); + + switch (gettype) { + case GET_ARCHREV : + return BITMASK(cpuid3, 32, 0xff); + case GET_FAMILY : + return BITMASK(cpuid3, 24, 0xff); + case GET_MODEL : + return BITMASK(cpuid3, 16, 0xff); + case GET_REVISION : + return BITMASK(cpuid3, 8, 0xff); + case GET_NUMBER : + return BITMASK(cpuid3, 0, 0xff); + } + + return 0; +} + +char *get_cpunamechar(void){ + if (get_cputype(GET_FAMILY) == 0x07) return "ITANIUM"; + if (get_cputype(GET_FAMILY) == 0x1f) return "ITANIUM2"; + if (get_cputype(GET_FAMILY) == 0x20) return "ITANIUM2"; + + return "UNKNOWN"; +} + +char *get_libname(void){ + if (get_cputype(GET_FAMILY) == 0x07) { printf("itanium"); return NULL;} + if (get_cputype(GET_FAMILY) == 0x1f) { printf("itanium2"); return NULL;} + if (get_cputype(GET_FAMILY) == 0x20) { printf("itanium2"); return NULL;} + + printf("UNKNOWN"); + + return NULL; +} + +void get_architecture(void){ + printf("IA64"); +} + +void get_subarchitecture(void){ + printf("%s", get_cpunamechar()); +} + +void get_subdirname(void){ + printf("ia64"); +} + +void get_cpuconfig(void){ + printf("#define %s\n", get_cpunamechar()); + printf("#define L1_DATA_SIZE 262144\n"); + printf("#define L1_DATA_LINESIZE 128\n"); + printf("#define L2_SIZE 1572864\n"); + printf("#define L2_LINESIZE 128\n"); + printf("#define DTB_SIZE 16384\n"); + printf("#define DTB_ENTRIES 128\n"); +} + diff --git a/cpuid_mips.c b/cpuid_mips.c new file mode 100644 index 0000000000..99e4bcc258 --- /dev/null +++ b/cpuid_mips.c @@ -0,0 +1,68 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +void get_architecture(void){ + printf("MIPS64"); +} + +void get_subarchitecture(void){ + printf("SICORTEX"); +} + +void get_subdirname(void){ + printf("mips64"); +} + +void get_cpuconfig(void){ + printf("#define SICORTEX\n"); + printf("#define L1_DATA_SIZE 32768\n"); + printf("#define L1_DATA_LINESIZE 32\n"); + printf("#define L2_SIZE 512488\n"); + printf("#define L2_LINESIZE 32\n"); + printf("#define DTB_ENTRIES 32\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 8\n"); +} + +void get_libname(void){ +#ifdef __mips64 + printf("mips64\n"); +#else + printf("mips32\n"); +#endif +} diff --git a/cpuid_power.c b/cpuid_power.c new file mode 100644 index 0000000000..46ff30a3a4 --- /dev/null +++ b/cpuid_power.c @@ -0,0 +1,190 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#ifdef _AIX +#include +#endif +#ifdef __APPLE__ +#include +#include +#include +#include +#endif + +#define CPUTYPE_UNKNOWN 0 +#define CPUTYPE_POWER3 1 +#define CPUTYPE_POWER4 2 +#define CPUTYPE_PPC970 3 +#define CPUTYPE_POWER5 4 +#define CPUTYPE_POWER6 5 +#define CPUTYPE_CELL 6 +#define CPUTYPE_PPCG4 7 + +char *cpuname[] = { + "UNKNOWN", + "POWER3", + "POWER4", + "PPC970", + "POWER5", + "POWER6", + "CELL", + "PPCG4", +}; + +char *lowercpuname[] = { + "unknown", + "power3", + "power4", + "ppc970", + "power5", + "power6", + "cell", + "ppcg4", +}; + +char *corename[] = { + "UNKNOWN", + "POWER3", + "POWER4", + "POWER4", + "POWER4", + "POWER6", + "CELL", + "PPCG4", +}; + +int detect(void){ + +#ifdef linux + FILE *infile; + char buffer[512], *p; + + p = (char *)NULL; + infile = fopen("/proc/cpuinfo", "r"); + while (fgets(buffer, sizeof(buffer), infile)){ + if (!strncmp("cpu", buffer, 3)){ + p = strchr(buffer, ':') + 2; +#if 0 + fprintf(stderr, "%s\n", p); +#endif + break; + } + } + + fclose(infile); + + if (!strncasecmp(p, "POWER3", 6)) return CPUTYPE_POWER3; + if (!strncasecmp(p, "POWER4", 6)) return CPUTYPE_POWER4; + if (!strncasecmp(p, "PPC970", 6)) return CPUTYPE_PPC970; + if (!strncasecmp(p, "POWER5", 6)) return CPUTYPE_POWER5; + if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6; + if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; + if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; + + return CPUTYPE_UNKNOWN; +#endif + +#ifdef _AIX + return CPUTYPE_POWER5; +#endif + +#ifdef __APPLE__ + host_basic_info_data_t hostInfo; + mach_msg_type_number_t infoCount; + + infoCount = HOST_BASIC_INFO_COUNT; + host_info(mach_host_self(), HOST_BASIC_INFO, (host_info_t)&hostInfo, &infoCount); + + if (hostInfo.cpu_subtype == CPU_SUBTYPE_POWERPC_7450) return CPUTYPE_PPCG4; + if (hostInfo.cpu_subtype == CPU_SUBTYPE_POWERPC_970) return CPUTYPE_PPC970; + + return CPUTYPE_PPC970; +#endif +} + +void get_architecture(void){ + printf("POWER"); +} + +void get_subdirname(void){ + printf("power"); +} + + +void get_subarchitecture(void){ + printf("%s", cpuname[detect()]); +} + +void get_cpuconfig(void){ +#if 0 +#ifdef _AIX + struct vminfo info; +#endif +#endif + + printf("#define %s\n", cpuname[detect()]); + printf("#define CORE_%s\n", corename[detect()]); + + printf("#define L1_DATA_SIZE 32768\n"); + printf("#define L1_DATA_LINESIZE 128\n"); + printf("#define L2_SIZE 524288\n"); + printf("#define L2_LINESIZE 128 \n"); + printf("#define DTB_ENTRIES 128\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 8\n"); + +#if 0 +#ifdef _AIX + if (vmgetinfo(&info, VMINFO, 0) == 0) { + if ((info.lgpg_size >> 20) >= 1024) { + printf("#define ALLOC_HUGETLB\n"); + } + } +#endif +#endif + +} + +void get_libname(void){ + printf("%s", lowercpuname[detect()]); +} + +char *get_corename(void){ + return cpuname[detect()]; +} diff --git a/cpuid_sparc.c b/cpuid_sparc.c new file mode 100644 index 0000000000..b65c69de43 --- /dev/null +++ b/cpuid_sparc.c @@ -0,0 +1,58 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +void get_architecture(void){ + printf("SPARC"); +} + +void get_subarchitecture(void){ + printf("v9"); +} + +void get_subdirname(void){ + printf("sparc"); +} + +void get_cpuconfig(void){ + printf("#define V9\n"); + printf("#define DTB_ENTRIES 32\n"); +} + +void get_libname(void){ + printf("v9\n"); +} diff --git a/cpuid_x86.c b/cpuid_x86.c new file mode 100644 index 0000000000..2887544976 --- /dev/null +++ b/cpuid_x86.c @@ -0,0 +1,1453 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "cpuid.h" + +#ifndef CPUIDEMU + +#if defined(__APPLE__) && defined(__i386__) +void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx); +#else +static inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){ + __asm__ __volatile__ + ("cpuid": "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) : "cc"); + +} +#endif + +#else + +typedef struct { + unsigned int id, a, b, c, d; +} idlist_t; + +typedef struct { + char *vendor; + char *name; + int start, stop; +} vendor_t; + +extern idlist_t idlist[]; +extern vendor_t vendor[]; + +static int cv = VENDOR; + +void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx){ + + static int current = 0; + + int start = vendor[cv].start; + int stop = vendor[cv].stop; + int count = stop - start; + + if ((current < start) || (current > stop)) current = start; + + while ((count > 0) && (idlist[current].id != op)) { + + current ++; + if (current > stop) current = start; + count --; + + } + + *eax = idlist[current].a; + *ebx = idlist[current].b; + *ecx = idlist[current].c; + *edx = idlist[current].d; +} + +#endif + +static inline int have_cpuid(void){ + int eax, ebx, ecx, edx; + + cpuid(0, &eax, &ebx, &ecx, &edx); + return eax; +} + +static inline int have_excpuid(void){ + int eax, ebx, ecx, edx; + + cpuid(0x80000000, &eax, &ebx, &ecx, &edx); + return eax & 0xffff; +} + +int get_vendor(void){ + int eax, ebx, ecx, edx; + char vendor[13]; + + cpuid(0, &eax, &ebx, &ecx, &edx); + + *(int *)(&vendor[0]) = ebx; + *(int *)(&vendor[4]) = edx; + *(int *)(&vendor[8]) = ecx; + vendor[12] = (char)0; + + if (!strcmp(vendor, "GenuineIntel")) return VENDOR_INTEL; + if (!strcmp(vendor, " UMC UMC UMC")) return VENDOR_UMC; + if (!strcmp(vendor, "AuthenticAMD")) return VENDOR_AMD; + if (!strcmp(vendor, "CyrixInstead")) return VENDOR_CYRIX; + if (!strcmp(vendor, "NexGenDriven")) return VENDOR_NEXGEN; + if (!strcmp(vendor, "CentaurHauls")) return VENDOR_CENTAUR; + if (!strcmp(vendor, "RiseRiseRise")) return VENDOR_RISE; + if (!strcmp(vendor, " SiS SiS SiS")) return VENDOR_SIS; + if (!strcmp(vendor, "GenuineTMx86")) return VENDOR_TRANSMETA; + if (!strcmp(vendor, "Geode by NSC")) return VENDOR_NSC; + + if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL; + + return VENDOR_UNKNOWN; +} + +int get_cputype(int gettype){ + int eax, ebx, ecx, edx; + int extend_family, family; + int extend_model, model; + int type, stepping; + int feature = 0; + + cpuid(1, &eax, &ebx, &ecx, &edx); + + switch (gettype) { + case GET_EXFAMILY : + return BITMASK(eax, 20, 0xff); + case 
GET_EXMODEL : + return BITMASK(eax, 16, 0x0f); + case GET_TYPE : + return BITMASK(eax, 12, 0x03); + case GET_FAMILY : + return BITMASK(eax, 8, 0x0f); + case GET_MODEL : + return BITMASK(eax, 4, 0x0f); + case GET_APICID : + return BITMASK(ebx, 24, 0x0f); + case GET_LCOUNT : + return BITMASK(ebx, 16, 0x0f); + case GET_CHUNKS : + return BITMASK(ebx, 8, 0x0f); + case GET_STEPPING : + return BITMASK(eax, 0, 0x0f); + case GET_BLANDID : + return BITMASK(ebx, 0, 0xff); + case GET_NUMSHARE : + if (have_cpuid() < 4) return 0; + cpuid(4, &eax, &ebx, &ecx, &edx); + return BITMASK(eax, 14, 0xfff); + case GET_NUMCORES : + if (have_cpuid() < 4) return 0; + cpuid(4, &eax, &ebx, &ecx, &edx); + return BITMASK(eax, 26, 0x3f); + + case GET_FEATURE : + if ((edx & (1 << 3)) != 0) feature |= HAVE_PSE; + if ((edx & (1 << 15)) != 0) feature |= HAVE_CMOV; + if ((edx & (1 << 19)) != 0) feature |= HAVE_CFLUSH; + if ((edx & (1 << 23)) != 0) feature |= HAVE_MMX; + if ((edx & (1 << 25)) != 0) feature |= HAVE_SSE; + if ((edx & (1 << 26)) != 0) feature |= HAVE_SSE2; + if ((edx & (1 << 27)) != 0) { + if (BITMASK(ebx, 16, 0x0f) > 0) feature |= HAVE_HIT; + } + if ((ecx & (1 << 0)) != 0) feature |= HAVE_SSE3; + if ((ecx & (1 << 9)) != 0) feature |= HAVE_SSSE3; + if ((ecx & (1 << 19)) != 0) feature |= HAVE_SSE4_1; + if ((ecx & (1 << 20)) != 0) feature |= HAVE_SSE4_2; + + if (have_excpuid() >= 0x01) { + cpuid(0x80000001, &eax, &ebx, &ecx, &edx); + if ((ecx & (1 << 6)) != 0) feature |= HAVE_SSE4A; + if ((ecx & (1 << 7)) != 0) feature |= HAVE_MISALIGNSSE; + if ((edx & (1 << 30)) != 0) feature |= HAVE_3DNOWEX; + if ((edx & (1 << 31)) != 0) feature |= HAVE_3DNOW; + } + + if (have_excpuid() >= 0x1a) { + cpuid(0x8000001a, &eax, &ebx, &ecx, &edx); + if ((eax & (1 << 0)) != 0) feature |= HAVE_128BITFPU; + if ((eax & (1 << 1)) != 0) feature |= HAVE_FASTMOVU; + } + + } + return feature; +} + +int get_cacheinfo(int type, cache_info_t *cacheinfo){ + int eax, ebx, ecx, edx, cpuid_level; + int info[15]; + int i; + cache_info_t LC1, LD1, L2, L3, + ITB, DTB, LITB, LDTB, + L2ITB, L2DTB, L2LITB, L2LDTB; + + LC1.size = 0; LC1.associative = 0; LC1.linesize = 0; LC1.shared = 0; + LD1.size = 0; LD1.associative = 0; LD1.linesize = 0; LD1.shared = 0; + L2.size = 0; L2.associative = 0; L2.linesize = 0; L2.shared = 0; + L3.size = 0; L3.associative = 0; L3.linesize = 0; L3.shared = 0; + ITB.size = 0; ITB.associative = 0; ITB.linesize = 0; ITB.shared = 0; + DTB.size = 0; DTB.associative = 0; DTB.linesize = 0; DTB.shared = 0; + LITB.size = 0; LITB.associative = 0; LITB.linesize = 0; LITB.shared = 0; + LDTB.size = 0; LDTB.associative = 0; LDTB.linesize = 0; LDTB.shared = 0; + L2ITB.size = 0; L2ITB.associative = 0; L2ITB.linesize = 0; L2ITB.shared = 0; + L2DTB.size = 0; L2DTB.associative = 0; L2DTB.linesize = 0; L2DTB.shared = 0; + L2LITB.size = 0; L2LITB.associative = 0; L2LITB.linesize = 0; L2LITB.shared = 0; + L2LDTB.size = 0; L2LDTB.associative = 0; L2LDTB.linesize = 0; L2LDTB.shared = 0; + + cpuid(0, &cpuid_level, &ebx, &ecx, &edx); + + if (cpuid_level > 1) { + + cpuid(2, &eax, &ebx, &ecx, &edx); + + info[ 0] = BITMASK(eax, 8, 0xff); + info[ 1] = BITMASK(eax, 16, 0xff); + info[ 2] = BITMASK(eax, 24, 0xff); + + info[ 3] = BITMASK(ebx, 0, 0xff); + info[ 4] = BITMASK(ebx, 8, 0xff); + info[ 5] = BITMASK(ebx, 16, 0xff); + info[ 6] = BITMASK(ebx, 24, 0xff); + + info[ 7] = BITMASK(ecx, 0, 0xff); + info[ 8] = BITMASK(ecx, 8, 0xff); + info[ 9] = BITMASK(ecx, 16, 0xff); + info[10] = BITMASK(ecx, 24, 0xff); + + info[11] = BITMASK(edx, 0, 0xff); + info[12] = 
BITMASK(edx, 8, 0xff); + info[13] = BITMASK(edx, 16, 0xff); + info[14] = BITMASK(edx, 24, 0xff); + + for (i = 0; i < 15; i++){ + + switch (info[i]){ + + /* This table is from http://www.sandpile.org/ia32/cpuid.htm */ + + case 0x01 : + ITB.size = 4; + ITB.associative = 4; + ITB.linesize = 32; + break; + case 0x02 : + LITB.size = 4096; + LITB.associative = 0; + LITB.linesize = 2; + break; + case 0x03 : + DTB.size = 4; + DTB.associative = 4; + DTB.linesize = 64; + break; + case 0x04 : + LDTB.size = 4096; + LDTB.associative = 4; + LDTB.linesize = 8; + break; + case 0x05 : + LDTB.size = 4096; + LDTB.associative = 4; + LDTB.linesize = 32; + break; + case 0x06 : + LC1.size = 8; + LC1.associative = 4; + LC1.linesize = 32; + break; + case 0x08 : + LC1.size = 16; + LC1.associative = 4; + LC1.linesize = 32; + break; + case 0x09 : + LC1.size = 32; + LC1.associative = 4; + LC1.linesize = 64; + break; + case 0x0a : + LD1.size = 8; + LD1.associative = 2; + LD1.linesize = 32; + break; + case 0x0c : + LD1.size = 16; + LD1.associative = 4; + LD1.linesize = 32; + break; + case 0x0d : + LD1.size = 16; + LD1.associative = 4; + LD1.linesize = 64; + break; + case 0x0e : + LD1.size = 24; + LD1.associative = 6; + LD1.linesize = 64; + break; + case 0x10 : + LD1.size = 16; + LD1.associative = 4; + LD1.linesize = 32; + break; + case 0x15 : + LC1.size = 16; + LC1.associative = 4; + LC1.linesize = 32; + break; + case 0x1a : + L2.size = 96; + L2.associative = 6; + L2.linesize = 64; + break; + case 0x21 : + L2.size = 256; + L2.associative = 8; + L2.linesize = 64; + break; + case 0x22 : + L3.size = 512; + L3.associative = 4; + L3.linesize = 64; + break; + case 0x23 : + L3.size = 1024; + L3.associative = 8; + L3.linesize = 64; + break; + case 0x25 : + L3.size = 2048; + L3.associative = 8; + L3.linesize = 64; + break; + case 0x29 : + L3.size = 4096; + L3.associative = 8; + L3.linesize = 64; + break; + case 0x2c : + LD1.size = 32; + LD1.associative = 8; + LD1.linesize = 64; + break; + case 0x30 : + LC1.size = 32; + LC1.associative = 8; + LC1.linesize = 64; + break; + case 0x39 : + L2.size = 128; + L2.associative = 4; + L2.linesize = 64; + break; + case 0x3a : + L2.size = 192; + L2.associative = 6; + L2.linesize = 64; + break; + case 0x3b : + L2.size = 128; + L2.associative = 2; + L2.linesize = 64; + break; + case 0x3c : + L2.size = 256; + L2.associative = 4; + L2.linesize = 64; + break; + case 0x3d : + L2.size = 384; + L2.associative = 6; + L2.linesize = 64; + break; + case 0x3e : + L2.size = 512; + L2.associative = 4; + L2.linesize = 64; + break; + case 0x41 : + L2.size = 128; + L2.associative = 4; + L2.linesize = 32; + break; + case 0x42 : + L2.size = 256; + L2.associative = 4; + L2.linesize = 32; + break; + case 0x43 : + L2.size = 512; + L2.associative = 4; + L2.linesize = 32; + break; + case 0x44 : + L2.size = 1024; + L2.associative = 4; + L2.linesize = 32; + break; + case 0x45 : + L2.size = 2048; + L2.associative = 4; + L2.linesize = 32; + break; + case 0x46 : + L3.size = 4096; + L3.associative = 4; + L3.linesize = 64; + break; + case 0x47 : + L3.size = 8192; + L3.associative = 8; + L3.linesize = 64; + break; + case 0x48 : + L2.size = 3184; + L2.associative = 12; + L2.linesize = 64; + break; + case 0x49 : + if ((get_cputype(GET_FAMILY) == 0x0f) && (get_cputype(GET_MODEL) == 0x06)) { + L3.size = 4096; + L3.associative = 16; + L3.linesize = 64; + } else { + L2.size = 4096; + L2.associative = 16; + L2.linesize = 64; + } + break; + case 0x4a : + L3.size = 6144; + L3.associative = 12; + L3.linesize = 64; + break; + case 0x4b 
: + L3.size = 8192; + L3.associative = 16; + L3.linesize = 64; + break; + case 0x4c : + L3.size = 12280; + L3.associative = 12; + L3.linesize = 64; + break; + case 0x4d : + L3.size = 16384; + L3.associative = 16; + L3.linesize = 64; + break; + case 0x4e : + L2.size = 6144; + L2.associative = 24; + L2.linesize = 64; + break; + case 0x4f : + ITB.size = 4; + ITB.associative = 0; + ITB.linesize = 32; + break; + case 0x50 : + ITB.size = 4; + ITB.associative = 0; + ITB.linesize = 64; + LITB.size = 4096; + LITB.associative = 0; + LITB.linesize = 64; + LITB.shared = 1; + break; + case 0x51 : + ITB.size = 4; + ITB.associative = 0; + ITB.linesize = 128; + LITB.size = 4096; + LITB.associative = 0; + LITB.linesize = 128; + LITB.shared = 1; + break; + case 0x52 : + ITB.size = 4; + ITB.associative = 0; + ITB.linesize = 256; + LITB.size = 4096; + LITB.associative = 0; + LITB.linesize = 256; + LITB.shared = 1; + break; + case 0x55 : + LITB.size = 4096; + LITB.associative = 0; + LITB.linesize = 7; + LITB.shared = 1; + break; + case 0x56 : + LDTB.size = 4096; + LDTB.associative = 4; + LDTB.linesize = 16; + break; + case 0x57 : + LDTB.size = 4096; + LDTB.associative = 4; + LDTB.linesize = 16; + break; + case 0x5b : + DTB.size = 4; + DTB.associative = 0; + DTB.linesize = 64; + LDTB.size = 4096; + LDTB.associative = 0; + LDTB.linesize = 64; + LDTB.shared = 1; + break; + case 0x5c : + DTB.size = 4; + DTB.associative = 0; + DTB.linesize = 128; + LDTB.size = 4096; + LDTB.associative = 0; + LDTB.linesize = 128; + LDTB.shared = 1; + break; + case 0x5d : + DTB.size = 4; + DTB.associative = 0; + DTB.linesize = 256; + LDTB.size = 4096; + LDTB.associative = 0; + LDTB.linesize = 256; + LDTB.shared = 1; + break; + case 0x60 : + LD1.size = 16; + LD1.associative = 8; + LD1.linesize = 64; + break; + case 0x66 : + LD1.size = 8; + LD1.associative = 4; + LD1.linesize = 64; + break; + case 0x67 : + LD1.size = 16; + LD1.associative = 4; + LD1.linesize = 64; + break; + case 0x68 : + LD1.size = 32; + LD1.associative = 4; + LD1.linesize = 64; + break; + case 0x70 : + LC1.size = 12; + LC1.associative = 8; + break; + case 0x71 : + LC1.size = 16; + LC1.associative = 8; + break; + case 0x72 : + LC1.size = 32; + LC1.associative = 8; + break; + case 0x73 : + LC1.size = 64; + LC1.associative = 8; + break; + case 0x77 : + LC1.size = 16; + LC1.associative = 4; + LC1.linesize = 64; + break; + case 0x78 : + L2.size = 1024; + L2.associative = 4; + L2.linesize = 64; + break; + case 0x79 : + L2.size = 128; + L2.associative = 8; + L2.linesize = 64; + break; + case 0x7a : + L2.size = 256; + L2.associative = 8; + L2.linesize = 64; + break; + case 0x7b : + L2.size = 512; + L2.associative = 8; + L2.linesize = 64; + break; + case 0x7c : + L2.size = 1024; + L2.associative = 8; + L2.linesize = 64; + break; + case 0x7d : + L2.size = 2048; + L2.associative = 8; + L2.linesize = 64; + break; + case 0x7e : + L2.size = 256; + L2.associative = 8; + L2.linesize = 128; + break; + case 0x7f : + L2.size = 512; + L2.associative = 2; + L2.linesize = 64; + break; + case 0x81 : + L2.size = 128; + L2.associative = 8; + L2.linesize = 32; + break; + case 0x82 : + L2.size = 256; + L2.associative = 8; + L2.linesize = 32; + break; + case 0x83 : + L2.size = 512; + L2.associative = 8; + L2.linesize = 32; + break; + case 0x84 : + L2.size = 1024; + L2.associative = 8; + L2.linesize = 32; + break; + case 0x85 : + L2.size = 2048; + L2.associative = 8; + L2.linesize = 32; + break; + case 0x86 : + L2.size = 512; + L2.associative = 4; + L2.linesize = 64; + break; + case 0x87 : + 
L2.size = 1024; + L2.associative = 8; + L2.linesize = 64; + break; + case 0x88 : + L3.size = 2048; + L3.associative = 4; + L3.linesize = 64; + break; + case 0x89 : + L3.size = 4096; + L3.associative = 4; + L3.linesize = 64; + break; + case 0x8a : + L3.size = 8192; + L3.associative = 4; + L3.linesize = 64; + break; + case 0x8d : + L3.size = 3096; + L3.associative = 12; + L3.linesize = 128; + break; + case 0x90 : + ITB.size = 4; + ITB.associative = 0; + ITB.linesize = 64; + break; + case 0x96 : + DTB.size = 4; + DTB.associative = 0; + DTB.linesize = 32; + break; + case 0x9b : + L2DTB.size = 4; + L2DTB.associative = 0; + L2DTB.linesize = 96; + break; + case 0xb0 : + ITB.size = 4; + ITB.associative = 4; + ITB.linesize = 128; + break; + case 0xb1 : + LITB.size = 4096; + LITB.associative = 4; + LITB.linesize = 4; + break; + case 0xb2 : + ITB.size = 4; + ITB.associative = 4; + ITB.linesize = 64; + break; + case 0xb3 : + DTB.size = 4; + DTB.associative = 4; + DTB.linesize = 128; + break; + case 0xb4 : + DTB.size = 4; + DTB.associative = 4; + DTB.linesize = 256; + break; + case 0xba : + DTB.size = 4; + DTB.associative = 4; + DTB.linesize = 64; + break; + case 0xd0 : + L3.size = 512; + L3.associative = 4; + L3.linesize = 64; + break; + case 0xd1 : + L3.size = 1024; + L3.associative = 4; + L3.linesize = 64; + break; + case 0xd2 : + L3.size = 2048; + L3.associative = 4; + L3.linesize = 64; + break; + case 0xd6 : + L3.size = 1024; + L3.associative = 8; + L3.linesize = 64; + break; + case 0xd7 : + L3.size = 2048; + L3.associative = 8; + L3.linesize = 64; + break; + case 0xd8 : + L3.size = 4096; + L3.associative = 8; + L3.linesize = 64; + break; + case 0xdc : + L3.size = 2048; + L3.associative = 12; + L3.linesize = 64; + break; + case 0xdd : + L3.size = 4096; + L3.associative = 12; + L3.linesize = 64; + break; + case 0xde : + L3.size = 8192; + L3.associative = 12; + L3.linesize = 64; + break; + case 0xe2 : + L3.size = 2048; + L3.associative = 16; + L3.linesize = 64; + break; + case 0xe3 : + L3.size = 4096; + L3.associative = 16; + L3.linesize = 64; + break; + case 0xe4 : + L3.size = 8192; + L3.associative = 16; + L3.linesize = 64; + break; + } + } + } + + if (get_vendor() == VENDOR_INTEL) { + cpuid(0x80000000, &cpuid_level, &ebx, &ecx, &edx); + if (cpuid_level >= 0x80000006) { + cpuid(0x80000006, &eax, &ebx, &ecx, &edx); + + L2.size = BITMASK(ecx, 16, 0xffff); + L2.associative = BITMASK(ecx, 12, 0x0f); + L2.linesize = BITMASK(ecx, 0, 0xff); + } + } + + if ((get_vendor() == VENDOR_AMD) || (get_vendor() == VENDOR_CENTAUR)) { + cpuid(0x80000005, &eax, &ebx, &ecx, &edx); + + LDTB.size = 4096; + LDTB.associative = BITMASK(eax, 24, 0xff); + if (LDTB.associative == 0xff) LDTB.associative = 0; + LDTB.linesize = BITMASK(eax, 16, 0xff); + + LITB.size = 4096; + LITB.associative = BITMASK(eax, 8, 0xff); + if (LITB.associative == 0xff) LITB.associative = 0; + LITB.linesize = BITMASK(eax, 0, 0xff); + + DTB.size = 4; + DTB.associative = BITMASK(ebx, 24, 0xff); + if (DTB.associative == 0xff) DTB.associative = 0; + DTB.linesize = BITMASK(ebx, 16, 0xff); + + ITB.size = 4; + ITB.associative = BITMASK(ebx, 8, 0xff); + if (ITB.associative == 0xff) ITB.associative = 0; + ITB.linesize = BITMASK(ebx, 0, 0xff); + + LD1.size = BITMASK(ecx, 24, 0xff); + LD1.associative = BITMASK(ecx, 16, 0xff); + if (LD1.associative == 0xff) LD1.associative = 0; + LD1.linesize = BITMASK(ecx, 0, 0xff); + + LC1.size = BITMASK(ecx, 24, 0xff); + LC1.associative = BITMASK(ecx, 16, 0xff); + if (LC1.associative == 0xff) LC1.associative = 0; + 
LC1.linesize = BITMASK(ecx, 0, 0xff); + + cpuid(0x80000006, &eax, &ebx, &ecx, &edx); + + L2LDTB.size = 4096; + L2LDTB.associative = BITMASK(eax, 24, 0xff); + if (L2LDTB.associative == 0xff) L2LDTB.associative = 0; + L2LDTB.linesize = BITMASK(eax, 16, 0xff); + + L2LITB.size = 4096; + L2LITB.associative = BITMASK(eax, 8, 0xff); + if (L2LITB.associative == 0xff) L2LITB.associative = 0; + L2LITB.linesize = BITMASK(eax, 0, 0xff); + + L2DTB.size = 4; + L2DTB.associative = BITMASK(ebx, 24, 0xff); + if (L2DTB.associative == 0xff) L2DTB.associative = 0; + L2DTB.linesize = BITMASK(ebx, 16, 0xff); + + L2ITB.size = 4; + L2ITB.associative = BITMASK(ebx, 8, 0xff); + if (L2ITB.associative == 0xff) L2ITB.associative = 0; + L2ITB.linesize = BITMASK(ebx, 0, 0xff); + + L2.size = BITMASK(ecx, 16, 0xffff); + L2.associative = BITMASK(ecx, 12, 0xf); + if (L2.associative == 0xff) L2.associative = 0; + L2.linesize = BITMASK(ecx, 0, 0xff); + + L3.size = BITMASK(edx, 18, 0x3fff) * 512; + L3.associative = BITMASK(edx, 12, 0xf); + if (L3.associative == 0xff) L2.associative = 0; + L3.linesize = BITMASK(edx, 0, 0xff); + + } + + switch (type) { + + case CACHE_INFO_L1_I : + *cacheinfo = LC1; + break; + case CACHE_INFO_L1_D : + *cacheinfo = LD1; + break; + case CACHE_INFO_L2 : + *cacheinfo = L2; + break; + case CACHE_INFO_L3 : + *cacheinfo = L3; + break; + case CACHE_INFO_L1_DTB : + *cacheinfo = DTB; + break; + case CACHE_INFO_L1_ITB : + *cacheinfo = ITB; + break; + case CACHE_INFO_L1_LDTB : + *cacheinfo = LDTB; + break; + case CACHE_INFO_L1_LITB : + *cacheinfo = LITB; + break; + case CACHE_INFO_L2_DTB : + *cacheinfo = L2DTB; + break; + case CACHE_INFO_L2_ITB : + *cacheinfo = L2ITB; + break; + case CACHE_INFO_L2_LDTB : + *cacheinfo = L2LDTB; + break; + case CACHE_INFO_L2_LITB : + *cacheinfo = L2LITB; + break; + } + return 0; +} + +int get_cpuname(void){ + + int family, exfamily, model, vendor, exmodel; + + if (!have_cpuid()) return CPUTYPE_80386; + + family = get_cputype(GET_FAMILY); + exfamily = get_cputype(GET_EXFAMILY); + model = get_cputype(GET_MODEL); + exmodel = get_cputype(GET_EXMODEL); + + vendor = get_vendor(); + + if (vendor == VENDOR_INTEL){ + switch (family) { + case 0x4: + return CPUTYPE_80486; + case 0x5: + return CPUTYPE_PENTIUM; + case 0x6: + switch (exmodel) { + case 0: + switch (model) { + case 1: + case 3: + case 5: + case 6: + return CPUTYPE_PENTIUM2; + case 7: + case 8: + case 10: + case 11: + return CPUTYPE_PENTIUM3; + case 9: + case 13: + case 14: + return CPUTYPE_PENTIUMM; + case 15: + return CPUTYPE_CORE2; + } + break; + case 1: + switch (model) { + case 6: + return CPUTYPE_CORE2; + case 7: + return CPUTYPE_PENRYN; + case 10: + case 11: + case 14: + case 15: + return CPUTYPE_NEHALEM; + case 12: + return CPUTYPE_ATOM; + case 13: + return CPUTYPE_DUNNINGTON; + break; + } + } + break; + case 0x7: + return CPUTYPE_ITANIUM; + case 0xf: + switch (exfamily) { + case 0 : + return CPUTYPE_PENTIUM4; + case 1 : + return CPUTYPE_ITANIUM; + } + break; + } + return CPUTYPE_INTEL_UNKNOWN; + } + + if (vendor == VENDOR_AMD){ + switch (family) { + case 0x4: + return CPUTYPE_AMD5X86; + case 0x5: + return CPUTYPE_AMDK6; + case 0x6: + return CPUTYPE_ATHLON; + case 0xf: + switch (exfamily) { + case 0: + case 2: + return CPUTYPE_OPTERON; + case 1: + case 10: + return CPUTYPE_BARCELONA; + } + break; + } + return CPUTYPE_AMD_UNKNOWN; + } + + if (vendor == VENDOR_CYRIX){ + switch (family) { + case 0x4: + return CPUTYPE_CYRIX5X86; + case 0x5: + return CPUTYPE_CYRIXM1; + case 0x6: + return CPUTYPE_CYRIXM2; + } + return 
CPUTYPE_CYRIX_UNKNOWN; + } + + if (vendor == VENDOR_NEXGEN){ + switch (family) { + case 0x5: + return CPUTYPE_NEXGENNX586; + } + return CPUTYPE_NEXGEN_UNKNOWN; + } + + if (vendor == VENDOR_CENTAUR){ + switch (family) { + case 0x5: + return CPUTYPE_CENTAURC6; + break; + case 0x6: + return CPUTYPE_NANO; + break; + + } + return CPUTYPE_VIAC3; + } + + if (vendor == VENDOR_RISE){ + switch (family) { + case 0x5: + return CPUTYPE_RISEMP6; + } + return CPUTYPE_RISE_UNKNOWN; + } + + if (vendor == VENDOR_SIS){ + switch (family) { + case 0x5: + return CPUTYPE_SYS55X; + } + return CPUTYPE_SIS_UNKNOWN; + } + + if (vendor == VENDOR_TRANSMETA){ + switch (family) { + case 0x5: + return CPUTYPE_CRUSOETM3X; + } + return CPUTYPE_TRANSMETA_UNKNOWN; + } + + if (vendor == VENDOR_NSC){ + switch (family) { + case 0x5: + return CPUTYPE_NSGEODE; + } + return CPUTYPE_NSC_UNKNOWN; + } + + return CPUTYPE_UNKNOWN; +} + +static char *cpuname[] = { + "UNKNOWN", + "INTEL_UNKNOWN", + "UMC_UNKNOWN", + "AMD_UNKNOWN", + "CYRIX_UNKNOWN", + "NEXGEN_UNKNOWN", + "CENTAUR_UNKNOWN", + "RISE_UNKNOWN", + "SIS_UNKNOWN", + "TRANSMETA_UNKNOWN", + "NSC_UNKNOWN", + "80386", + "80486", + "PENTIUM", + "PENTIUM2", + "PENTIUM3", + "PENTIUMM", + "PENTIUM4", + "CORE2", + "PENRYN", + "DUNNINGTON", + "NEHALEM", + "ATOM", + "ITANIUM", + "ITANIUM2", + "5X86", + "K6", + "ATHLON", + "DURON", + "OPTERON", + "BARCELONA", + "SHANGHAI", + "ISTANBUL", + "CYRIX5X86", + "CYRIXM1", + "CYRIXM2", + "NEXGENNX586", + "CENTAURC6", + "RISEMP6", + "SYS55X", + "TM3X00", + "NSGEODE", + "VIAC3", + "NANO", +}; + +static char *lowercpuname[] = { + "unknown", + "intel_unknown", + "umc_unknown", + "amd_unknown", + "cyrix_unknown", + "nexgen_unknown", + "centaur_unknown", + "rise_unknown", + "sis_unknown", + "transmeta_unknown", + "nsc_unknown", + "80386", + "80486", + "pentium", + "pentium2", + "pentium3", + "pentiumm", + "pentium4", + "core2", + "penryn", + "dunnington", + "nehalem", + "atom", + "itanium", + "itanium2", + "5x86", + "k6", + "athlon", + "duron", + "opteron", + "barcelona", + "shanghai", + "istanbul", + "cyrix5x86", + "cyrixm1", + "cyrixm2", + "nexgennx586", + "centaurc6", + "risemp6", + "sys55x", + "tms3x00", + "nsgeode", + "nano", +}; + +static char *corename[] = { + "UNKOWN", + "80486", + "P5", + "P6", + "KATMAI", + "COPPERMINE", + "NORTHWOOD", + "PRESCOTT", + "BANIAS", + "ATHLON", + "OPTERON", + "BARCELONA", + "VIAC3", + "YONAH", + "CORE2", + "PENRYN", + "DUNNINGTON", + "NEHALEM", + "ATOM", + "NANO", +}; + +static char *corename_lower[] = { + "unknown", + "80486", + "p5", + "p6", + "katmai", + "coppermine", + "northwood", + "prescott", + "banias", + "athlon", + "opteron", + "barcelona", + "viac3", + "yonah", + "core2", + "penryn", + "dunnington", + "nehalem", + "atom", + "nano", +}; + + +char *get_cpunamechar(void){ + return cpuname[get_cpuname()]; +} + +char *get_lower_cpunamechar(void){ + return lowercpuname[get_cpuname()]; +} + + +int get_coretype(void){ + + int family, exfamily, model, exmodel, vendor; + + if (!have_cpuid()) return CORE_80486; + + family = get_cputype(GET_FAMILY); + exfamily = get_cputype(GET_EXFAMILY); + model = get_cputype(GET_MODEL); + exmodel = get_cputype(GET_EXMODEL); + + vendor = get_vendor(); + + if (vendor == VENDOR_INTEL){ + switch (family) { + case 4: + return CORE_80486; + case 5: + return CORE_P5; + case 6: + switch (exmodel) { + case 0: + switch (model) { + case 0: + case 1: + case 2: + case 3: + case 4: + case 5: + case 6: + return CORE_P6; + case 7: + return CORE_KATMAI; + case 8: + case 10: + case 11: + return 
CORE_COPPERMINE; + case 9: + case 13: + case 14: + return CORE_BANIAS; + case 15: + return CORE_CORE2; + } + break; + case 1: + switch (model) { + case 6: + return CORE_CORE2; + case 7: + return CORE_PENRYN; + case 10: + case 11: + case 14: + case 15: + return CORE_NEHALEM; + case 12: + return CORE_ATOM; + case 13: + return CORE_DUNNINGTON; + break; + } + } + case 15: + if (model <= 0x2) return CORE_NORTHWOOD; + return CORE_PRESCOTT; + } + } + + if (vendor == VENDOR_AMD){ + if (family <= 0x5) return CORE_80486; + if (family <= 0xe) return CORE_ATHLON; + if (family == 0xf){ + if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; else return CORE_BARCELONA; + } + } + + if (vendor == VENDOR_CENTAUR) { + switch (family) { + case 0x6: + return CORE_NANO; + break; + } + return CORE_VIAC3; + } + + return CORE_UNKNOWN; +} + +void get_cpuconfig(void){ + + cache_info_t info; + int features; + + printf("#define %s\n", cpuname[get_cpuname()]); + + + if (get_coretype() != CORE_P5) { + + get_cacheinfo(CACHE_INFO_L1_I, &info); + if (info.size > 0) { + printf("#define L1_CODE_SIZE %d\n", info.size * 1024); + printf("#define L1_CODE_ASSOCIATIVE %d\n", info.associative); + printf("#define L1_CODE_LINESIZE %d\n", info.linesize); + } + + get_cacheinfo(CACHE_INFO_L1_D, &info); + if (info.size > 0) { + printf("#define L1_DATA_SIZE %d\n", info.size * 1024); + printf("#define L1_DATA_ASSOCIATIVE %d\n", info.associative); + printf("#define L1_DATA_LINESIZE %d\n", info.linesize); + } + + get_cacheinfo(CACHE_INFO_L2, &info); + if (info.size > 0) { + printf("#define L2_SIZE %d\n", info.size * 1024); + printf("#define L2_ASSOCIATIVE %d\n", info.associative); + printf("#define L2_LINESIZE %d\n", info.linesize); + } + + get_cacheinfo(CACHE_INFO_L3, &info); + if (info.size > 0) { + printf("#define L3_SIZE %d\n", info.size * 1024); + printf("#define L3_ASSOCIATIVE %d\n", info.associative); + printf("#define L3_LINESIZE %d\n", info.linesize); + } + + get_cacheinfo(CACHE_INFO_L1_ITB, &info); + if (info.size > 0) { + printf("#define ITB_SIZE %d\n", info.size * 1024); + printf("#define ITB_ASSOCIATIVE %d\n", info.associative); + printf("#define ITB_ENTRIES %d\n", info.linesize); + } + + get_cacheinfo(CACHE_INFO_L1_DTB, &info); + if (info.size > 0) { + printf("#define DTB_SIZE %d\n", info.size * 1024); + printf("#define DTB_ASSOCIATIVE %d\n", info.associative); + printf("#define DTB_ENTRIES %d\n", info.linesize); + } + + features = get_cputype(GET_FEATURE); + + if (features & HAVE_CMOV ) printf("#define HAVE_CMOV\n"); + if (features & HAVE_MMX ) printf("#define HAVE_MMX\n"); + if (features & HAVE_SSE ) printf("#define HAVE_SSE\n"); + if (features & HAVE_SSE2 ) printf("#define HAVE_SSE2\n"); + if (features & HAVE_SSE3 ) printf("#define HAVE_SSE3\n"); + if (features & HAVE_SSSE3) printf("#define HAVE_SSSE3\n"); + if (features & HAVE_SSE4_1) printf("#define HAVE_SSE4_1\n"); + if (features & HAVE_SSE4_2) printf("#define HAVE_SSE4_2\n"); + if (features & HAVE_SSE4A) printf("#define HAVE_SSE4A\n"); + if (features & HAVE_SSE5 ) printf("#define HAVE_SSSE5\n"); + if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); + if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n"); + if (features & HAVE_CFLUSH) printf("#define HAVE_CFLUSH\n"); + if (features & HAVE_HIT) printf("#define HAVE_HIT 1\n"); + if (features & HAVE_MISALIGNSSE) printf("#define HAVE_MISALIGNSSE\n"); + if (features & HAVE_128BITFPU) printf("#define HAVE_128BITFPU\n"); + if (features & HAVE_FASTMOVU) printf("#define HAVE_FASTMOVU\n"); + + printf("#define 
NUM_SHAREDCACHE %d\n", get_cputype(GET_NUMSHARE) + 1); + printf("#define NUM_CORES %d\n", get_cputype(GET_NUMCORES) + 1); + + features = get_coretype(); + if (features > 0) printf("#define CORE_%s\n", corename[features]); + } else { + printf("#define DTB_ENTRIES 16\n"); + printf("#define L1_CODE_SIZE 8192\n"); + printf("#define L1_DATA_SIZE 8192\n"); + printf("#define L2_SIZE 0\n"); + } +} + +void get_architecture(void){ +#ifndef __64BIT__ + printf("X86"); +#else + printf("X86_64"); +#endif +} + +void get_subarchitecture(void){ + printf("%s", get_cpunamechar()); +} + +void get_subdirname(void){ +#ifndef __64BIT__ + printf("x86"); +#else + printf("x86_64"); +#endif +} + +char *get_corename(void){ + return corename[get_coretype()]; +} + +void get_libname(void){ + printf("%s", corename_lower[get_coretype()]); +} + +/* This if for Makefile */ +void get_sse(void){ + + int features; + + features = get_cputype(GET_FEATURE); + + if (features & HAVE_MMX ) printf("HAVE_MMX=1\n"); + if (features & HAVE_SSE ) printf("HAVE_SSE=1\n"); + if (features & HAVE_SSE2 ) printf("HAVE_SSE2=1\n"); + if (features & HAVE_SSE3 ) printf("HAVE_SSE3=1\n"); + if (features & HAVE_SSSE3) printf("HAVE_SSSE3=1\n"); + if (features & HAVE_SSE4_1) printf("HAVE_SSE4_1=1\n"); + if (features & HAVE_SSE4_2) printf("HAVE_SSE4_2=1\n"); + if (features & HAVE_SSE4A) printf("HAVE_SSE4A=1\n"); + if (features & HAVE_SSE5 ) printf("HAVE_SSSE5=1\n"); + if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); + if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n"); + +} diff --git a/ctest.c b/ctest.c new file mode 100644 index 0000000000..0c373bf2b9 --- /dev/null +++ b/ctest.c @@ -0,0 +1,107 @@ +#if defined(__PGI) || defined(__PGIC__) +COMPILER_PGI +#endif + +#if defined(__PATHSCALE__) || defined(__PATHCC__) +COMPILER_PATHSCALE +#endif + +#if defined(__INTEL_COMPILER) || defined(__ICC) || defined(__ECC) +COMPILER_INTEL +#endif + +#if defined(__OPENCC__) +COMPILER_OPEN64 +#endif + +#if defined(__SUNPRO_C) +COMPILER_SUN +#endif + +#if defined(__IBMC__) || defined(__xlc__) +COMPILER_IBM +#endif + +#if defined(__DECCC__) +COMPILER_DEC +#endif + +#if defined(__GNUC__) +COMPILER_GNU +#endif + +#if defined(__linux__) +OS_LINUX +#endif + +#if defined(__FreeBSD__) +OS_FreeBSD +#endif + +#if defined(__NetBSD__) +OS_NetBSD +#endif + +#if defined(__sun) +OS_SunOS +#endif + +#if defined(__APPLE__) +OS_Darwin +#endif + +#if defined(_AIX) +OS_AIX +#endif + +#if defined(__OSF) +OS_OSF +#endif + +#if defined(__WIN32) || defined(__WIN64) || defined(__WINNT) +OS_WINNT +#endif + +#if defined(__CYGWIN__) +OS_CYGWIN +#endif + +#if defined(__INTERIX) +OS_INTERIX +#endif + +#if defined(__i386) || defined(_X86) +ARCH_X86 +#endif + +#if defined(__x86_64__) || defined(__amd64__) +ARCH_X86_64 +#endif + +#if defined(__powerpc___) || defined(__PPC__) || defined(_POWER) +ARCH_POWER +#endif + +#ifdef __mips64 +ARCH_MIPS64 +#endif + +#if defined(__mips32) || defined(__mips) +ARCH_MIPS32 +#endif + +#ifdef __alpha +ARCH_ALPHA +#endif + +#if defined(__sparc) || defined(__sparc__) +ARCH_SPARC +#endif + +#if defined(__ia64__) || defined(__ia64) +ARCH_IA64 +#endif + +#if defined(__LP64) || defined(__LP64__) || defined(__ptr64) || defined(__x86_64__) || defined(__amd64__) || defined(__64BIT__) +BINARY_64 +#endif diff --git a/ctest/LICENSE b/ctest/LICENSE new file mode 100644 index 0000000000..85061f29fe --- /dev/null +++ b/ctest/LICENSE @@ -0,0 +1,23 @@ +This directory contains the reference implementation of BLAS +which is obtainable at: http://netlib.org/blas/ + +The license, 
obtained from http://netlib.org/blas/faq.html#2 on November 3, +2010, is as follows: + +2) Are there legal restrictions on the use of BLAS reference implementation +software? + +The reference BLAS is a freely-available software package. It is available from +netlib via anonymous ftp and the World Wide Web. Thus, it can be included in +commercial software packages (and has been). We only ask that proper credit be +given to the authors. + +Like all software, it is copyrighted. It is not trademarked, but we do ask the +following: + +If you modify the source for these routines we ask that you change the name of +the routine and comment the changes made to the original. + +We will gladly answer any questions regarding the software. If a modification +is done, however, it is the responsibility of the person who modified the +routine to provide support. diff --git a/ctest/Makefile b/ctest/Makefile new file mode 100644 index 0000000000..3cd6cc82c3 --- /dev/null +++ b/ctest/Makefile @@ -0,0 +1,93 @@ +# +# The Makefile compiles c wrappers and testers for CBLAS. +# + +TOPDIR = .. +include $(TOPDIR)/Makefile.system + +CFLAGS += -DADD$(BU) -DCBLAS + +LIB = $(TOPDIR)/$(LIBNAME) + +stestl1o = c_sblas1.o + +stestl2o = c_sblas2.o c_s2chke.o auxiliary.o c_xerbla.o constant.o + +stestl3o = c_sblas3.o c_s3chke.o auxiliary.o c_xerbla.o constant.o + +dtestl1o = c_dblas1.o + +dtestl2o = c_dblas2.o c_d2chke.o auxiliary.o c_xerbla.o constant.o + +dtestl3o = c_dblas3.o c_d3chke.o auxiliary.o c_xerbla.o constant.o + +ctestl1o = c_cblas1.o + +ctestl2o = c_cblas2.o c_c2chke.o auxiliary.o c_xerbla.o constant.o + +ctestl3o = c_cblas3.o c_c3chke.o auxiliary.o c_xerbla.o constant.o + +ztestl1o = c_zblas1.o + +ztestl2o = c_zblas2.o c_z2chke.o auxiliary.o c_xerbla.o constant.o + +ztestl3o = c_zblas3.o c_z3chke.o auxiliary.o c_xerbla.o constant.o + +all :: all1 all2 all3 + +all1: xscblat1 xdcblat1 xccblat1 xzcblat1 + GOTO_NUM_THREADS=2 ./xscblat1 + GOTO_NUM_THREADS=2 ./xdcblat1 + GOTO_NUM_THREADS=2 ./xccblat1 + GOTO_NUM_THREADS=2 ./xzcblat1 + +all2: xscblat2 xdcblat2 xccblat2 xzcblat2 + GOTO_NUM_THREADS=2 ./xscblat2 < sin2 + GOTO_NUM_THREADS=2 ./xdcblat2 < din2 + GOTO_NUM_THREADS=2 ./xccblat2 < cin2 + GOTO_NUM_THREADS=2 ./xzcblat2 < zin2 + +all3: xscblat3 xdcblat3 xccblat3 xzcblat3 + GOTO_NUM_THREADS=2 ./xscblat3 < sin3 + GOTO_NUM_THREADS=2 ./xdcblat3 < din3 + GOTO_NUM_THREADS=2 ./xccblat3 < cin3 + GOTO_NUM_THREADS=2 ./xzcblat3 < zin3 + +clean :: + rm -f x* + +FLDFLAGS = $(FFLAGS:-fPIC=) +CEXTRALIB = + +# Single real +xscblat1: $(stestl1o) c_sblat1.o $(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xscblat1 c_sblat1.o $(stestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +xscblat2: $(stestl2o) c_sblat2.o $(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xscblat2 c_sblat2.o $(stestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +xscblat3: $(stestl3o) c_sblat3.o $(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xscblat3 c_sblat3.o $(stestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +# Double real +xdcblat1: $(dtestl1o) c_dblat1.o $(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xdcblat1 c_dblat1.o $(dtestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +xdcblat2: $(dtestl2o) c_dblat2.o $(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xdcblat2 c_dblat2.o $(dtestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +xdcblat3: $(dtestl3o) c_dblat3.o $(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xdcblat3 c_dblat3.o $(dtestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) + +# Single complex +xccblat1: $(ctestl1o) c_cblat1.o $(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xccblat1 c_cblat1.o $(ctestl1o) $(LIB) $(EXTRALIB) 
$(CEXTRALIB) +xccblat2: $(ctestl2o) c_cblat2.o $(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xccblat2 c_cblat2.o $(ctestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +xccblat3: $(ctestl3o) c_cblat3.o $(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xccblat3 c_cblat3.o $(ctestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) + +# Double complex +xzcblat1: $(ztestl1o) c_zblat1.o $(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xzcblat1 c_zblat1.o $(ztestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +xzcblat2: $(ztestl2o) c_zblat2.o $(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xzcblat2 c_zblat2.o $(ztestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +xzcblat3: $(ztestl3o) c_zblat3.o $(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xzcblat3 c_zblat3.o $(ztestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) + +include $(TOPDIR)/Makefile.tail diff --git a/ctest/auxiliary.c b/ctest/auxiliary.c new file mode 100644 index 0000000000..1f47acfd69 --- /dev/null +++ b/ctest/auxiliary.c @@ -0,0 +1,38 @@ +/* + * Written by T. H. Do, 1/23/98, SGI/CRAY Research. + */ +#include +#include "common.h" +#include "cblas_test.h" + +void get_transpose_type(char *type, enum CBLAS_TRANSPOSE *trans) { + if( (strncmp( type,"n",1 )==0)||(strncmp( type,"N",1 )==0) ) + *trans = CblasNoTrans; + else if( (strncmp( type,"t",1 )==0)||(strncmp( type,"T",1 )==0) ) + *trans = CblasTrans; + else if( (strncmp( type,"c",1 )==0)||(strncmp( type,"C",1 )==0) ) + *trans = CblasConjTrans; + else *trans = UNDEFINED; +} + +void get_uplo_type(char *type, enum CBLAS_UPLO *uplo) { + if( (strncmp( type,"u",1 )==0)||(strncmp( type,"U",1 )==0) ) + *uplo = CblasUpper; + else if( (strncmp( type,"l",1 )==0)||(strncmp( type,"L",1 )==0) ) + *uplo = CblasLower; + else *uplo = UNDEFINED; +} +void get_diag_type(char *type, enum CBLAS_DIAG *diag) { + if( (strncmp( type,"u",1 )==0)||(strncmp( type,"U",1 )==0) ) + *diag = CblasUnit; + else if( (strncmp( type,"n",1 )==0)||(strncmp( type,"N",1 )==0) ) + *diag = CblasNonUnit; + else *diag = UNDEFINED; +} +void get_side_type(char *type, enum CBLAS_SIDE *side) { + if( (strncmp( type,"l",1 )==0)||(strncmp( type,"L",1 )==0) ) + *side = CblasLeft; + else if( (strncmp( type,"r",1 )==0)||(strncmp( type,"R",1 )==0) ) + *side = CblasRight; + else *side = UNDEFINED; +} diff --git a/ctest/c_c2chke.c b/ctest/c_c2chke.c new file mode 100644 index 0000000000..611cc215d1 --- /dev/null +++ b/ctest/c_c2chke.c @@ -0,0 +1,826 @@ +#include +#include +#include "common.h" +#include "cblas_test.h" + +int cblas_ok, cblas_lerr, cblas_info; +int link_xerbla=TRUE; +char *cblas_rout; + +#ifdef F77_Char +void F77_xerbla(F77_Char F77_srname, void *vinfo); +#else +void F77_xerbla(char *srname, void *vinfo); +#endif + +void chkxer(void) { + extern int cblas_ok, cblas_lerr, cblas_info; + extern int link_xerbla; + extern char *cblas_rout; + if (cblas_lerr == 1 ) { + printf("***** ILLEGAL VALUE OF PARAMETER NUMBER %d NOT DETECTED BY %s *****\n", cblas_info, cblas_rout); + cblas_ok = 0 ; + } + cblas_lerr = 1 ; +} + +void F77_c2chke(char *rout) { + char *sf = ( rout ) ; + float A[2] = {0.0,0.0}, + X[2] = {0.0,0.0}, + Y[2] = {0.0,0.0}, + ALPHA[2] = {0.0,0.0}, + BETA[2] = {0.0,0.0}, + RALPHA = 0.0; + extern int cblas_info, cblas_lerr, cblas_ok; + extern int RowMajorStrg; + extern char *cblas_rout; + + if (link_xerbla) /* call these first to link */ + { + cblas_xerbla(cblas_info,cblas_rout,""); + F77_xerbla(cblas_rout,&cblas_info); + } + + cblas_ok = TRUE ; + cblas_lerr = PASSED ; + + if (strncmp( sf,"cblas_cgemv",11)==0) { + cblas_rout = "cblas_cgemv"; + cblas_info = 1; + cblas_cgemv(INVALID, CblasNoTrans, 0, 0, + 
ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_cgemv(CblasColMajor, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_cgemv(CblasColMajor, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cgemv(CblasColMajor, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_cgemv(CblasColMajor, CblasNoTrans, 2, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_cgemv(CblasColMajor, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_cgemv(CblasColMajor, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + + cblas_info = 2; RowMajorStrg = TRUE; RowMajorStrg = TRUE; + cblas_cgemv(CblasRowMajor, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_cgemv(CblasRowMajor, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_cgemv(CblasRowMajor, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_cgemv(CblasRowMajor, CblasNoTrans, 0, 2, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_cgemv(CblasRowMajor, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_cgemv(CblasRowMajor, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_cgbmv",11)==0) { + cblas_rout = "cblas_cgbmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_cgbmv(INVALID, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_cgbmv(CblasColMajor, INVALID, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_cgbmv(CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cgbmv(CblasColMajor, CblasNoTrans, 0, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cgbmv(CblasColMajor, CblasNoTrans, 0, 0, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_cgbmv(CblasColMajor, CblasNoTrans, 2, 0, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_cgbmv(CblasColMajor, CblasNoTrans, 0, 0, 1, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_cgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_cgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_cgbmv(CblasRowMajor, INVALID, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_cgbmv(CblasRowMajor, CblasNoTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_cgbmv(CblasRowMajor, CblasNoTrans, 0, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + 
cblas_cgbmv(CblasRowMajor, CblasNoTrans, 0, 0, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_cgbmv(CblasRowMajor, CblasNoTrans, 2, 0, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_cgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 1, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_cgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_chemv",11)==0) { + cblas_rout = "cblas_chemv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_chemv(INVALID, CblasUpper, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_chemv(CblasColMajor, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_chemv(CblasColMajor, CblasUpper, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_chemv(CblasColMajor, CblasUpper, 2, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_chemv(CblasColMajor, CblasUpper, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_chemv(CblasColMajor, CblasUpper, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_chemv(CblasRowMajor, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_chemv(CblasRowMajor, CblasUpper, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_chemv(CblasRowMajor, CblasUpper, 2, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_chemv(CblasRowMajor, CblasUpper, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_chemv(CblasRowMajor, CblasUpper, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_chbmv",11)==0) { + cblas_rout = "cblas_chbmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_chbmv(INVALID, CblasUpper, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_chbmv(CblasColMajor, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_chbmv(CblasColMajor, CblasUpper, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_chbmv(CblasColMajor, CblasUpper, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_chbmv(CblasColMajor, CblasUpper, 0, 1, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_chbmv(CblasColMajor, CblasUpper, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_chbmv(CblasColMajor, CblasUpper, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_chbmv(CblasRowMajor, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_chbmv(CblasRowMajor, CblasUpper, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_chbmv(CblasRowMajor, 
CblasUpper, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_chbmv(CblasRowMajor, CblasUpper, 0, 1, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_chbmv(CblasRowMajor, CblasUpper, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_chbmv(CblasRowMajor, CblasUpper, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_chpmv",11)==0) { + cblas_rout = "cblas_chpmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_chpmv(INVALID, CblasUpper, 0, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_chpmv(CblasColMajor, INVALID, 0, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_chpmv(CblasColMajor, CblasUpper, INVALID, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_chpmv(CblasColMajor, CblasUpper, 0, + ALPHA, A, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_chpmv(CblasColMajor, CblasUpper, 0, + ALPHA, A, X, 1, BETA, Y, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_chpmv(CblasRowMajor, INVALID, 0, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_chpmv(CblasRowMajor, CblasUpper, INVALID, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_chpmv(CblasRowMajor, CblasUpper, 0, + ALPHA, A, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_chpmv(CblasRowMajor, CblasUpper, 0, + ALPHA, A, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_ctrmv",11)==0) { + cblas_rout = "cblas_ctrmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_ctrmv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ctrmv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ctrmv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ctrmv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ctrmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, A, 1, X, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_ctrmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_ctrmv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_ctrmv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_ctrmv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_ctrmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, A, 1, X, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_ctrmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, 
X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_ctbmv",11)==0) { + cblas_rout = "cblas_ctbmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_ctbmv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ctbmv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ctbmv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ctbmv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ctbmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctbmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ctbmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 1, A, 1, X, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctbmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_ctbmv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_ctbmv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_ctbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_ctbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ctbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 1, A, 1, X, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_ctpmv",11)==0) { + cblas_rout = "cblas_ctpmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_ctpmv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ctpmv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ctpmv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ctpmv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ctpmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ctpmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_ctpmv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_ctpmv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_ctpmv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + 
cblas_ctpmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ctpmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_ctrsv",11)==0) { + cblas_rout = "cblas_ctrsv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_ctrsv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ctrsv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ctrsv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ctrsv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ctrsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, A, 1, X, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_ctrsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_ctrsv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_ctrsv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_ctrsv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_ctrsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, A, 1, X, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_ctrsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_ctbsv",11)==0) { + cblas_rout = "cblas_ctbsv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_ctbsv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ctbsv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ctbsv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ctbsv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ctbsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctbsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ctbsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 1, A, 1, X, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctbsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_ctbsv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_ctbsv(CblasRowMajor, CblasUpper, INVALID, 
+ CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_ctbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_ctbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ctbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 1, A, 1, X, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_ctpsv",11)==0) { + cblas_rout = "cblas_ctpsv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_ctpsv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ctpsv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ctpsv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ctpsv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ctpsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ctpsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_ctpsv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_ctpsv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_ctpsv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_ctpsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ctpsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_cgeru",10)==0) { + cblas_rout = "cblas_cgeru"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_cgeru(INVALID, 0, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_cgeru(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_cgeru(CblasColMajor, 0, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_cgeru(CblasColMajor, 0, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_cgeru(CblasColMajor, 0, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_cgeru(CblasColMajor, 2, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_cgeru(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_cgeru(CblasRowMajor, 0, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_cgeru(CblasRowMajor, 0, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_cgeru(CblasRowMajor, 0, 0, ALPHA, X, 1, Y, 0, A, 1 ); + 
chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_cgeru(CblasRowMajor, 0, 2, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + } else if (strncmp( sf,"cblas_cgerc",10)==0) { + cblas_rout = "cblas_cgerc"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_cgerc(INVALID, 0, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_cgerc(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_cgerc(CblasColMajor, 0, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_cgerc(CblasColMajor, 0, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_cgerc(CblasColMajor, 0, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_cgerc(CblasColMajor, 2, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_cgerc(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_cgerc(CblasRowMajor, 0, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_cgerc(CblasRowMajor, 0, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_cgerc(CblasRowMajor, 0, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_cgerc(CblasRowMajor, 0, 2, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + } else if (strncmp( sf,"cblas_cher2",11)==0) { + cblas_rout = "cblas_cher2"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_cher2(INVALID, CblasUpper, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_cher2(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_cher2(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_cher2(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_cher2(CblasColMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_cher2(CblasColMajor, CblasUpper, 2, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_cher2(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_cher2(CblasRowMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_cher2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_cher2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_cher2(CblasRowMajor, CblasUpper, 2, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + } else if (strncmp( sf,"cblas_chpr2",11)==0) { + cblas_rout = "cblas_chpr2"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_chpr2(INVALID, CblasUpper, 0, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_chpr2(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_chpr2(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_chpr2(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_chpr2(CblasColMajor, CblasUpper, 0, ALPHA, X, 1, 
Y, 0, A ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_chpr2(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_chpr2(CblasRowMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_chpr2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_chpr2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A ); + chkxer(); + } else if (strncmp( sf,"cblas_cher",10)==0) { + cblas_rout = "cblas_cher"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_cher(INVALID, CblasUpper, 0, RALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_cher(CblasColMajor, INVALID, 0, RALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_cher(CblasColMajor, CblasUpper, INVALID, RALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_cher(CblasColMajor, CblasUpper, 0, RALPHA, X, 0, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_cher(CblasColMajor, CblasUpper, 2, RALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_cher(CblasRowMajor, INVALID, 0, RALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_cher(CblasRowMajor, CblasUpper, INVALID, RALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_cher(CblasRowMajor, CblasUpper, 0, RALPHA, X, 0, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_cher(CblasRowMajor, CblasUpper, 2, RALPHA, X, 1, A, 1 ); + chkxer(); + } else if (strncmp( sf,"cblas_chpr",10)==0) { + cblas_rout = "cblas_chpr"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_chpr(INVALID, CblasUpper, 0, RALPHA, X, 1, A ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_chpr(CblasColMajor, INVALID, 0, RALPHA, X, 1, A ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_chpr(CblasColMajor, CblasUpper, INVALID, RALPHA, X, 1, A ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_chpr(CblasColMajor, CblasUpper, 0, RALPHA, X, 0, A ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_chpr(CblasColMajor, INVALID, 0, RALPHA, X, 1, A ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_chpr(CblasColMajor, CblasUpper, INVALID, RALPHA, X, 1, A ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_chpr(CblasColMajor, CblasUpper, 0, RALPHA, X, 0, A ); + chkxer(); + } + if (cblas_ok == TRUE) + printf(" %-12s PASSED THE TESTS OF ERROR-EXITS\n", cblas_rout); + else + printf("******* %s FAILED THE TESTS OF ERROR-EXITS *******\n",cblas_rout); +} diff --git a/ctest/c_c3chke.c b/ctest/c_c3chke.c new file mode 100644 index 0000000000..29515527b9 --- /dev/null +++ b/ctest/c_c3chke.c @@ -0,0 +1,1706 @@ +#include <stdio.h> +#include <string.h> +#include "common.h" +#include "cblas_test.h" + +int cblas_ok, cblas_lerr, cblas_info; +int link_xerbla=TRUE; +char *cblas_rout; + +#ifdef F77_Char +void F77_xerbla(F77_Char F77_srname, void *vinfo); +#else +void F77_xerbla(char *srname, void *vinfo); +#endif + +void chkxer(void) { + extern int cblas_ok, cblas_lerr, cblas_info; + extern int link_xerbla; + extern char *cblas_rout; + if (cblas_lerr == 1 ) { + printf("***** ILLEGAL VALUE OF PARAMETER NUMBER %d NOT DETECTED BY %s *****\n", cblas_info, cblas_rout); + cblas_ok = 0 ; + } + cblas_lerr = 1 ; + } + +void F77_c3chke(char * rout) { + char *sf = ( rout ) ; + float A[4] = {0.0,0.0,0.0,0.0}, + B[4] = {0.0,0.0,0.0,0.0}, + C[4] =
{0.0,0.0,0.0,0.0}, + ALPHA[2] = {0.0,0.0}, + BETA[2] = {0.0,0.0}, + RALPHA = 0.0, RBETA = 0.0; + extern int cblas_info, cblas_lerr, cblas_ok; + extern int RowMajorStrg; + extern char *cblas_rout; + + cblas_ok = TRUE ; + cblas_lerr = PASSED ; + + if (link_xerbla) /* call these first to link */ + { + cblas_xerbla(cblas_info,cblas_rout,""); + F77_xerbla(cblas_rout,&cblas_info); + } + + if (strncmp( sf,"cblas_cgemm" ,11)==0) { + cblas_rout = "cblas_cgemm" ; + + cblas_info = 1; + cblas_cgemm( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_cgemm( INVALID, CblasNoTrans, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_cgemm( INVALID, CblasTrans, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_cgemm( INVALID, CblasTrans, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, INVALID, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, INVALID, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, + 
ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_cgemm( CblasColMajor, CblasTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, 
INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_cgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_chemm" ,11)==0) { + cblas_rout = "cblas_chemm" ; + + cblas_info = 1; + cblas_chemm( INVALID, CblasRight, CblasLower, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, INVALID, CblasUpper, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasLeft, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasLeft, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasRight, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasLeft, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasRight, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasLeft, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasRight, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasLeft, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasRight, CblasLower, 0, INVALID, + ALPHA, A, 1, 
B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasRight, CblasUpper, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasRight, CblasLower, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasRight, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_chemm( CblasColMajor, CblasRight, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasLeft, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasRight, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasLeft, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasRight, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasLeft, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasRight, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasLeft, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasRight, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, 
A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_chemm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_csymm" ,11)==0) { + cblas_rout = "cblas_csymm" ; + + cblas_info = 1; + cblas_csymm( INVALID, CblasRight, CblasLower, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, INVALID, CblasUpper, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasLeft, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasLeft, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasRight, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasLeft, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasRight, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasLeft, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasRight, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasLeft, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasRight, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + 
cblas_csymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasRight, CblasUpper, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasRight, CblasLower, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasRight, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_csymm( CblasColMajor, CblasRight, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasLeft, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasRight, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasLeft, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasRight, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasLeft, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasRight, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasLeft, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasRight, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = 
TRUE; + cblas_csymm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_csymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_ctrmm" ,11)==0) { + cblas_rout = "cblas_ctrmm" ; + + cblas_info = 1; + cblas_ctrmm( INVALID, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, INVALID, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + INVALID, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, 
ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, 
CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + 
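[Editor's note] Every check in this file follows the same handshake: the test stores the parameter number it expects the routine to reject in cblas_info, calls the CBLAS routine with an INVALID argument or an inconsistent dimension / leading dimension, and chkxer() (defined near the top of this file) fails the run if the overriding error handler never cleared cblas_lerr. The sketch below illustrates that handshake only; it is not the ctest/c_xerbla.c shipped in this patch, and it assumes the cblas_ok / cblas_lerr / cblas_info / cblas_rout globals declared above together with the standard cblas_xerbla entry point.

   #include <stdio.h>

   /* Globals shared with chkxer() in the test file above. */
   extern int  cblas_ok, cblas_lerr, cblas_info;
   extern char *cblas_rout;

   /* Sketch of an overriding error handler: record whether the routine
      reported exactly the parameter number the current check expects.  */
   void cblas_xerbla(int info, const char *rout, const char *form, ...)
   {
      (void)rout; (void)form;              /* unused in this sketch */
      if (info == cblas_info) {
         cblas_lerr = 0;                   /* expected error was reported,
                                              so chkxer() stays quiet     */
      } else {
         printf("***** XERBLA WAS CALLED WITH INFO = %d INSTEAD OF %d *****\n",
                info, cblas_info);
         cblas_lerr = 0;                   /* error was reported, but ...  */
         cblas_ok = 0;                     /* ... with the wrong parameter */
      }
   }
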
cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_ctrsm" ,11)==0) { + cblas_rout = "cblas_ctrsm" ; + + cblas_info = 1; + cblas_ctrsm( INVALID, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, INVALID, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + INVALID, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, 
CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + 
cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ctrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, 
CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ctrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_cherk" ,11)==0) { + cblas_rout = "cblas_cherk" ; + + cblas_info = 1; + cblas_cherk(INVALID, CblasUpper, CblasNoTrans, 0, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, INVALID, CblasNoTrans, 0, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasUpper, CblasTrans, 0, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasUpper, CblasConjTrans, INVALID, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasLower, CblasConjTrans, INVALID, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasUpper, CblasConjTrans, 0, INVALID, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, + RALPHA, A, 1, RBETA, C, 1 ); + 
chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasLower, CblasConjTrans, 0, INVALID, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_cherk(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + RALPHA, A, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_cherk(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_cherk(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + RALPHA, A, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_cherk(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasUpper, CblasConjTrans, 0, 2, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasLower, CblasConjTrans, 0, 2, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cherk(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cherk(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, + RALPHA, A, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cherk(CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_cherk(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, + RALPHA, A, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + RALPHA, A, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasUpper, CblasConjTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + RALPHA, A, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_cherk(CblasColMajor, CblasLower, CblasConjTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_csyrk" ,11)==0) { + cblas_rout = "cblas_csyrk" ; + + cblas_info = 1; + cblas_csyrk(INVALID, CblasUpper, CblasNoTrans, 0, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, INVALID, CblasNoTrans, 0, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasUpper, CblasConjTrans, 0, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasUpper, CblasTrans, INVALID, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasLower, CblasTrans, INVALID, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + 
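[Editor's note] The lda values probed by the cherk checks above and the csyrk checks that follow all exercise one rule: with CblasColMajor the A operand is n-by-k when trans is CblasNoTrans and k-by-n otherwise, so lda must be at least max(1, rows stored); with CblasRowMajor the stored shape is transposed and the roles of n and k swap. The hypothetical helper below restates that bound; it assumes only the CBLAS_ORDER / CBLAS_TRANSPOSE enums from the cblas.h added by this patch and is an editorial sketch, not code from the patch.

   #include "cblas.h"   /* enum CBLAS_ORDER, enum CBLAS_TRANSPOSE */

   /* Smallest legal lda for the A argument of cblas_cherk/cblas_csyrk,
      i.e. the bound the surrounding checks violate on purpose by
      passing lda = 1 together with n = 2 or k = 2.                    */
   static int min_lda_herk(enum CBLAS_ORDER order,
                           enum CBLAS_TRANSPOSE trans, int n, int k)
   {
      /* Column-major: lda >= rows of the stored A, n x k (NoTrans) or k x n.
         Row-major:    lda >= columns of the stored A, so n and k swap.      */
      int bound;
      if (order == CblasColMajor)
         bound = (trans == CblasNoTrans) ? n : k;
      else
         bound = (trans == CblasNoTrans) ? k : n;
      return bound > 1 ? bound : 1;
   }
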
cblas_info = 5; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasUpper, CblasTrans, 0, INVALID, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasLower, CblasTrans, 0, INVALID, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csyrk(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csyrk(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csyrk(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csyrk(CblasRowMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasUpper, CblasTrans, 0, 2, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasLower, CblasTrans, 0, 2, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_csyrk(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_csyrk(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_csyrk(CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_csyrk(CblasRowMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_csyrk(CblasColMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_cher2k" ,12)==0) { + cblas_rout = "cblas_cher2k" ; + + cblas_info = 1; + cblas_cher2k(INVALID, CblasUpper, CblasNoTrans, 0, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, INVALID, CblasNoTrans, 0, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasUpper, CblasTrans, 0, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + 
cblas_cher2k(CblasColMajor, CblasUpper, CblasConjTrans, INVALID, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasLower, CblasConjTrans, INVALID, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, INVALID, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasLower, CblasConjTrans, 0, INVALID, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + ALPHA, A, 1, B, 2, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, + ALPHA, A, 1, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + ALPHA, A, 1, B, 2, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, + ALPHA, A, 1, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, 2, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasLower, CblasConjTrans, 0, 2, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + ALPHA, A, 2, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, + ALPHA, A, 2, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + ALPHA, A, 2, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, + ALPHA, A, 2, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, 2, + ALPHA, A, 2, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasLower, CblasConjTrans, 0, 2, + ALPHA, A, 2, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, 
CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, + ALPHA, A, 2, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_cher2k(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, + ALPHA, A, 2, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasUpper, CblasConjTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_cher2k(CblasColMajor, CblasLower, CblasConjTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_csyr2k" ,12)==0) { + cblas_rout = "cblas_csyr2k" ; + + cblas_info = 1; + cblas_csyr2k(INVALID, CblasUpper, CblasNoTrans, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, INVALID, CblasNoTrans, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, CblasTrans, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasLower, CblasTrans, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, CblasTrans, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasLower, CblasTrans, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, 
CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, CblasTrans, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasLower, CblasTrans, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, CblasTrans, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasLower, CblasTrans, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_csyr2k(CblasRowMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_csyr2k(CblasColMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + + } + + if (cblas_ok == 1 ) + printf(" %-12s PASSED THE TESTS OF ERROR-EXITS\n", cblas_rout); + else + printf("***** %s FAILED THE TESTS OF ERROR-EXITS *******\n",cblas_rout); +} diff --git a/ctest/c_cblas1.c b/ctest/c_cblas1.c new file mode 100644 index 0000000000..f5ffc14bf3 --- /dev/null +++ b/ctest/c_cblas1.c @@ -0,0 +1,75 @@ +/* + * c_cblas1.c + * + * The program is a C wrapper for ccblat1. + * + * Written by Keita Teranishi. 
2/11/1998 + * + */ +#include "common.h" +#include "cblas_test.h" + +void F77_caxpy(const int *N, const void *alpha, void *X, + const int *incX, void *Y, const int *incY) +{ + cblas_caxpy(*N, alpha, X, *incX, Y, *incY); + return; +} + +void F77_ccopy(const int *N, void *X, const int *incX, + void *Y, const int *incY) +{ + cblas_ccopy(*N, X, *incX, Y, *incY); + return; +} + +void F77_cdotc(const int *N, void *X, const int *incX, + void *Y, const int *incY, void *dotc) +{ + cblas_cdotc_sub(*N, X, *incX, Y, *incY, dotc); + return; +} + +void F77_cdotu(const int *N, void *X, const int *incX, + void *Y, const int *incY,void *dotu) +{ + cblas_cdotu_sub(*N, X, *incX, Y, *incY, dotu); + return; +} + +void F77_cscal(const int *N, const void * *alpha, void *X, + const int *incX) +{ + cblas_cscal(*N, alpha, X, *incX); + return; +} + +void F77_csscal(const int *N, const float *alpha, void *X, + const int *incX) +{ + cblas_csscal(*N, *alpha, X, *incX); + return; +} + +void F77_cswap( const int *N, void *X, const int *incX, + void *Y, const int *incY) +{ + cblas_cswap(*N,X,*incX,Y,*incY); + return; +} + +int F77_icamax(const int *N, const void *X, const int *incX) +{ + if (*N < 1 || *incX < 1) return(0); + return (cblas_icamax(*N, X, *incX)+1); +} + +float F77_scnrm2(const int *N, const void *X, const int *incX) +{ + return cblas_scnrm2(*N, X, *incX); +} + +float F77_scasum(const int *N, void *X, const int *incX) +{ + return cblas_scasum(*N, X, *incX); +} diff --git a/ctest/c_cblas2.c b/ctest/c_cblas2.c new file mode 100644 index 0000000000..7a886ac010 --- /dev/null +++ b/ctest/c_cblas2.c @@ -0,0 +1,807 @@ +/* + * Written by D.P. Manley, Digital Equipment Corporation. + * Prefixed "C_" to BLAS routines and their declarations. + * + * Modified by T. H. Do, 4/08/98, SGI/CRAY Research. 
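+ * + * Note: for row-major tests the wrappers below copy the column-major + * Fortran array into a temporary row-major buffer before calling the + * CBLAS routine and copy any updated matrix back on return; column-major + * tests call the CBLAS routine directly, and an out-of-range order value + * is passed through unchanged so that the error exit can be exercised.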
+ */ +#include <stdlib.h> +#include "common.h" +#include "cblas_test.h" + +void F77_cgemv(int *order, char *transp, int *m, int *n, + const void *alpha, + CBLAS_TEST_COMPLEX *a, int *lda, const void *x, int *incx, + const void *beta, void *y, int *incy) { + + CBLAS_TEST_COMPLEX *A; + int i,j,LDA; + enum CBLAS_TRANSPOSE trans; + + get_transpose_type(transp, &trans); + if (*order == TEST_ROW_MJR) { + LDA = *n+1; + A = (CBLAS_TEST_COMPLEX *)malloc( (*m)*LDA*sizeof( CBLAS_TEST_COMPLEX) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ){ + A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; + A[ LDA*i+j ].imag=a[ (*lda)*j+i ].imag; + } + cblas_cgemv( CblasRowMajor, trans, *m, *n, alpha, A, LDA, x, *incx, + beta, y, *incy ); + free(A); + } + else if (*order == TEST_COL_MJR) + cblas_cgemv( CblasColMajor, trans, + *m, *n, alpha, a, *lda, x, *incx, beta, y, *incy ); + else + cblas_cgemv( UNDEFINED, trans, + *m, *n, alpha, a, *lda, x, *incx, beta, y, *incy ); +} + +void F77_cgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku, + CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, + CBLAS_TEST_COMPLEX *x, int *incx, + CBLAS_TEST_COMPLEX *beta, CBLAS_TEST_COMPLEX *y, int *incy) { + + CBLAS_TEST_COMPLEX *A; + int i,j,irow,jcol,LDA; + enum CBLAS_TRANSPOSE trans; + + get_transpose_type(transp, &trans); + if (*order == TEST_ROW_MJR) { + LDA = *ku+*kl+2; + A=( CBLAS_TEST_COMPLEX* )malloc((*n+*kl)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*ku; i++ ){ + irow=*ku+*kl-i; + jcol=(*ku)-i; + for( j=jcol; j<*n; j++ ){ + A[ LDA*(j-jcol)+irow ].real=a[ (*lda)*j+i ].real; + A[ LDA*(j-jcol)+irow ].imag=a[ (*lda)*j+i ].imag; + } + } + i=*ku; + irow=*ku+*kl-i; + for( j=0; j<*n; j++ ){ + A[ LDA*j+irow ].real=a[ (*lda)*j+i ].real; + A[ LDA*j+irow ].imag=a[ (*lda)*j+i ].imag; + } + for( i=*ku+1; i<*ku+*kl+1; i++ ){ + irow=*ku+*kl-i; + jcol=i-(*ku); + for( j=jcol; j<(*n+*kl); j++ ){ + A[ LDA*j+irow ].real=a[ (*lda)*(j-jcol)+i ].real; + A[ LDA*j+irow ].imag=a[ (*lda)*(j-jcol)+i ].imag; + } + } + cblas_cgbmv( CblasRowMajor, trans, *m, *n, *kl, *ku, alpha, A, LDA, x, + *incx, beta, y, *incy ); + free(A); + } + else if (*order == TEST_COL_MJR) + cblas_cgbmv( CblasColMajor, trans, *m, *n, *kl, *ku, alpha, a, *lda, x, + *incx, beta, y, *incy ); + else + cblas_cgbmv( UNDEFINED, trans, *m, *n, *kl, *ku, alpha, a, *lda, x, + *incx, beta, y, *incy ); +} + +void F77_cgeru(int *order, int *m, int *n, CBLAS_TEST_COMPLEX *alpha, + CBLAS_TEST_COMPLEX *x, int *incx, CBLAS_TEST_COMPLEX *y, int *incy, + CBLAS_TEST_COMPLEX *a, int *lda){ + + CBLAS_TEST_COMPLEX *A; + int i,j,LDA; + + if (*order == TEST_ROW_MJR) { + LDA = *n+1; + A=(CBLAS_TEST_COMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ){ + A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; + A[ LDA*i+j ].imag=a[ (*lda)*j+i ].imag; + } + cblas_cgeru( CblasRowMajor, *m, *n, alpha, x, *incx, y, *incy, A, LDA ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ){ + a[ (*lda)*j+i ].real=A[ LDA*i+j ].real; + a[ (*lda)*j+i ].imag=A[ LDA*i+j ].imag; + } + free(A); + } + else if (*order == TEST_COL_MJR) + cblas_cgeru( CblasColMajor, *m, *n, alpha, x, *incx, y, *incy, a, *lda ); + else + cblas_cgeru( UNDEFINED, *m, *n, alpha, x, *incx, y, *incy, a, *lda ); +} + +void F77_cgerc(int *order, int *m, int *n, CBLAS_TEST_COMPLEX *alpha, + CBLAS_TEST_COMPLEX *x, int *incx, CBLAS_TEST_COMPLEX *y, int *incy, + CBLAS_TEST_COMPLEX *a, int *lda) { + CBLAS_TEST_COMPLEX *A; + int i,j,LDA; + + if (*order == TEST_ROW_MJR) { + LDA = *n+1; + A=(CBLAS_TEST_COMPLEX*
)malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ){ + A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; + A[ LDA*i+j ].imag=a[ (*lda)*j+i ].imag; + } + cblas_cgerc( CblasRowMajor, *m, *n, alpha, x, *incx, y, *incy, A, LDA ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ){ + a[ (*lda)*j+i ].real=A[ LDA*i+j ].real; + a[ (*lda)*j+i ].imag=A[ LDA*i+j ].imag; + } + free(A); + } + else if (*order == TEST_COL_MJR) + cblas_cgerc( CblasColMajor, *m, *n, alpha, x, *incx, y, *incy, a, *lda ); + else + cblas_cgerc( UNDEFINED, *m, *n, alpha, x, *incx, y, *incy, a, *lda ); +} + +void F77_chemv(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha, + CBLAS_TEST_COMPLEX *a, int *lda, CBLAS_TEST_COMPLEX *x, + int *incx, CBLAS_TEST_COMPLEX *beta, CBLAS_TEST_COMPLEX *y, int *incy){ + + CBLAS_TEST_COMPLEX *A; + int i,j,LDA; + enum CBLAS_UPLO uplo; + + get_uplo_type(uplow,&uplo); + + if (*order == TEST_ROW_MJR) { + LDA = *n+1; + A = (CBLAS_TEST_COMPLEX *)malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ){ + A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; + A[ LDA*i+j ].imag=a[ (*lda)*j+i ].imag; + } + cblas_chemv( CblasRowMajor, uplo, *n, alpha, A, LDA, x, *incx, + beta, y, *incy ); + free(A); + } + else if (*order == TEST_COL_MJR) + cblas_chemv( CblasColMajor, uplo, *n, alpha, a, *lda, x, *incx, + beta, y, *incy ); + else + cblas_chemv( UNDEFINED, uplo, *n, alpha, a, *lda, x, *incx, + beta, y, *incy ); +} + +void F77_chbmv(int *order, char *uplow, int *n, int *k, + CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, + CBLAS_TEST_COMPLEX *x, int *incx, CBLAS_TEST_COMPLEX *beta, + CBLAS_TEST_COMPLEX *y, int *incy){ + +CBLAS_TEST_COMPLEX *A; +int i,irow,j,jcol,LDA; + + enum CBLAS_UPLO uplo; + + get_uplo_type(uplow,&uplo); + + if (*order == TEST_ROW_MJR) { + if (uplo != CblasUpper && uplo != CblasLower ) + cblas_chbmv(CblasRowMajor, UNDEFINED, *n, *k, alpha, a, *lda, x, + *incx, beta, y, *incy ); + else { + LDA = *k+2; + A =(CBLAS_TEST_COMPLEX*)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + if (uplo == CblasUpper) { + for( i=0; i<*k; i++ ){ + irow=*k-i; + jcol=(*k)-i; + for( j=jcol; j<*n; j++ ) { + A[ LDA*(j-jcol)+irow ].real=a[ (*lda)*j+i ].real; + A[ LDA*(j-jcol)+irow ].imag=a[ (*lda)*j+i ].imag; + } + } + i=*k; + irow=*k-i; + for( j=0; j<*n; j++ ) { + A[ LDA*j+irow ].real=a[ (*lda)*j+i ].real; + A[ LDA*j+irow ].imag=a[ (*lda)*j+i ].imag; + } + } + else { + i=0; + irow=*k-i; + for( j=0; j<*n; j++ ) { + A[ LDA*j+irow ].real=a[ (*lda)*j+i ].real; + A[ LDA*j+irow ].imag=a[ (*lda)*j+i ].imag; + } + for( i=1; i<*k+1; i++ ){ + irow=*k-i; + jcol=i; + for( j=jcol; j<(*n+*k); j++ ) { + A[ LDA*j+irow ].real=a[ (*lda)*(j-jcol)+i ].real; + A[ LDA*j+irow ].imag=a[ (*lda)*(j-jcol)+i ].imag; + } + } + } + cblas_chbmv( CblasRowMajor, uplo, *n, *k, alpha, A, LDA, x, *incx, + beta, y, *incy ); + free(A); + } + } + else if (*order == TEST_COL_MJR) + cblas_chbmv(CblasColMajor, uplo, *n, *k, alpha, a, *lda, x, *incx, + beta, y, *incy ); + else + cblas_chbmv(UNDEFINED, uplo, *n, *k, alpha, a, *lda, x, *incx, + beta, y, *incy ); +} + +void F77_chpmv(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha, + CBLAS_TEST_COMPLEX *ap, CBLAS_TEST_COMPLEX *x, int *incx, + CBLAS_TEST_COMPLEX *beta, CBLAS_TEST_COMPLEX *y, int *incy){ + + CBLAS_TEST_COMPLEX *A, *AP; + int i,j,k,LDA; + enum CBLAS_UPLO uplo; + + get_uplo_type(uplow,&uplo); + if (*order == TEST_ROW_MJR) { + if (uplo != CblasUpper && uplo != CblasLower ) + cblas_chpmv(CblasRowMajor, 
UNDEFINED, *n, alpha, ap, x, *incx, + beta, y, *incy); + else { + LDA = *n; + A = (CBLAS_TEST_COMPLEX* )malloc(LDA*LDA*sizeof(CBLAS_TEST_COMPLEX )); + AP = (CBLAS_TEST_COMPLEX* )malloc( (((LDA+1)*LDA)/2)* + sizeof( CBLAS_TEST_COMPLEX )); + if (uplo == CblasUpper) { + for( j=0, k=0; j<*n; j++ ) + for( i=0; i +#include "common.h" +#include "cblas_test.h" + +#define TEST_COL_MJR 0 +#define TEST_ROW_MJR 1 +#define UNDEFINED -1 + +void F77_cgemm(int *order, char *transpa, char *transpb, int *m, int *n, + int *k, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, + CBLAS_TEST_COMPLEX *b, int *ldb, CBLAS_TEST_COMPLEX *beta, + CBLAS_TEST_COMPLEX *c, int *ldc ) { + + CBLAS_TEST_COMPLEX *A, *B, *C; + int i,j,LDA, LDB, LDC; + enum CBLAS_TRANSPOSE transa, transb; + + get_transpose_type(transpa, &transa); + get_transpose_type(transpb, &transb); + + if (*order == TEST_ROW_MJR) { + if (transa == CblasNoTrans) { + LDA = *k+1; + A=(CBLAS_TEST_COMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else { + LDA = *m+1; + A=(CBLAS_TEST_COMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*k; i++ ) + for( j=0; j<*m; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + + if (transb == CblasNoTrans) { + LDB = *n+1; + B=(CBLAS_TEST_COMPLEX* )malloc((*k)*LDB*sizeof(CBLAS_TEST_COMPLEX) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + else { + LDB = *k+1; + B=(CBLAS_TEST_COMPLEX* )malloc(LDB*(*n)*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + + LDC = *n+1; + C=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_COMPLEX)); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_cgemm( CblasRowMajor, transa, transb, *m, *n, *k, alpha, A, LDA, + B, LDB, beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_cgemm( CblasColMajor, transa, transb, *m, *n, *k, alpha, a, *lda, + b, *ldb, beta, c, *ldc ); + else + cblas_cgemm( UNDEFINED, transa, transb, *m, *n, *k, alpha, a, *lda, + b, *ldb, beta, c, *ldc ); +} +void F77_chemm(int *order, char *rtlf, char *uplow, int *m, int *n, + CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, + CBLAS_TEST_COMPLEX *b, int *ldb, CBLAS_TEST_COMPLEX *beta, + CBLAS_TEST_COMPLEX *c, int *ldc ) { + + CBLAS_TEST_COMPLEX *A, *B, *C; + int i,j,LDA, LDB, LDC; + enum CBLAS_UPLO uplo; + enum CBLAS_SIDE side; + + get_uplo_type(uplow,&uplo); + get_side_type(rtlf,&side); + + if (*order == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A= (CBLAS_TEST_COMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + LDB = *n+1; + B=(CBLAS_TEST_COMPLEX* )malloc( 
(*m)*LDB*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + LDC = *n+1; + C=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_COMPLEX ) ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_chemm( CblasRowMajor, side, uplo, *m, *n, alpha, A, LDA, B, LDB, + beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_chemm( CblasColMajor, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, + beta, c, *ldc ); + else + cblas_chemm( UNDEFINED, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, + beta, c, *ldc ); +} +void F77_csymm(int *order, char *rtlf, char *uplow, int *m, int *n, + CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, + CBLAS_TEST_COMPLEX *b, int *ldb, CBLAS_TEST_COMPLEX *beta, + CBLAS_TEST_COMPLEX *c, int *ldc ) { + + CBLAS_TEST_COMPLEX *A, *B, *C; + int i,j,LDA, LDB, LDC; + enum CBLAS_UPLO uplo; + enum CBLAS_SIDE side; + + get_uplo_type(uplow,&uplo); + get_side_type(rtlf,&side); + + if (*order == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + LDB = *n+1; + B=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_COMPLEX )); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) + B[i*LDB+j]=b[j*(*ldb)+i]; + LDC = *n+1; + C=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_COMPLEX)); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + C[i*LDC+j]=c[j*(*ldc)+i]; + cblas_csymm( CblasRowMajor, side, uplo, *m, *n, alpha, A, LDA, B, LDB, + beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + c[j*(*ldc)+i]=C[i*LDC+j]; + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_csymm( CblasColMajor, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, + beta, c, *ldc ); + else + cblas_csymm( UNDEFINED, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, + beta, c, *ldc ); +} + +void F77_cherk(int *order, char *uplow, char *transp, int *n, int *k, + float *alpha, CBLAS_TEST_COMPLEX *a, int *lda, + float *beta, CBLAS_TEST_COMPLEX *c, int *ldc ) { + + int i,j,LDA,LDC; + CBLAS_TEST_COMPLEX *A, *C; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + + if (*order == TEST_ROW_MJR) { + if (trans == CblasNoTrans) { + LDA = *k+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*k)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + LDC = *n+1; + C=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDC*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_cherk(CblasRowMajor, uplo, trans, *n, 
*k, *alpha, A, LDA, *beta, + C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*n; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_cherk(CblasColMajor, uplo, trans, *n, *k, *alpha, a, *lda, *beta, + c, *ldc ); + else + cblas_cherk(UNDEFINED, uplo, trans, *n, *k, *alpha, a, *lda, *beta, + c, *ldc ); +} + +void F77_csyrk(int *order, char *uplow, char *transp, int *n, int *k, + CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, + CBLAS_TEST_COMPLEX *beta, CBLAS_TEST_COMPLEX *c, int *ldc ) { + + int i,j,LDA,LDC; + CBLAS_TEST_COMPLEX *A, *C; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + + if (*order == TEST_ROW_MJR) { + if (trans == CblasNoTrans) { + LDA = *k+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*k)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + LDC = *n+1; + C=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDC*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_csyrk(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, beta, + C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*n; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_csyrk(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, beta, + c, *ldc ); + else + cblas_csyrk(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda, beta, + c, *ldc ); +} +void F77_cher2k(int *order, char *uplow, char *transp, int *n, int *k, + CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, + CBLAS_TEST_COMPLEX *b, int *ldb, float *beta, + CBLAS_TEST_COMPLEX *c, int *ldc ) { + int i,j,LDA,LDB,LDC; + CBLAS_TEST_COMPLEX *A, *B, *C; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + + if (*order == TEST_ROW_MJR) { + if (trans == CblasNoTrans) { + LDA = *k+1; + LDB = *k+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX )); + B=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDB*sizeof(CBLAS_TEST_COMPLEX )); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + else { + LDA = *n+1; + LDB = *n+1; + A=(CBLAS_TEST_COMPLEX* )malloc( LDA*(*k)*sizeof(CBLAS_TEST_COMPLEX ) ); + B=(CBLAS_TEST_COMPLEX* )malloc( LDB*(*k)*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ){ + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + LDC = *n+1; + C=(CBLAS_TEST_COMPLEX* )malloc( (*n)*LDC*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_cher2k(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, + B, LDB, *beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*n; 
i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_cher2k(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, + b, *ldb, *beta, c, *ldc ); + else + cblas_cher2k(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda, + b, *ldb, *beta, c, *ldc ); +} +void F77_csyr2k(int *order, char *uplow, char *transp, int *n, int *k, + CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, int *lda, + CBLAS_TEST_COMPLEX *b, int *ldb, CBLAS_TEST_COMPLEX *beta, + CBLAS_TEST_COMPLEX *c, int *ldc ) { + int i,j,LDA,LDB,LDC; + CBLAS_TEST_COMPLEX *A, *B, *C; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + + if (*order == TEST_ROW_MJR) { + if (trans == CblasNoTrans) { + LDA = *k+1; + LDB = *k+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + B=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDB*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + else { + LDA = *n+1; + LDB = *n+1; + A=(CBLAS_TEST_COMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_COMPLEX)); + B=(CBLAS_TEST_COMPLEX* )malloc(LDB*(*k)*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ){ + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + LDC = *n+1; + C=(CBLAS_TEST_COMPLEX* )malloc( (*n)*LDC*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_csyr2k(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, + B, LDB, beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*n; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_csyr2k(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, + b, *ldb, beta, c, *ldc ); + else + cblas_csyr2k(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda, + b, *ldb, beta, c, *ldc ); +} +void F77_ctrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, + int *m, int *n, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, + int *lda, CBLAS_TEST_COMPLEX *b, int *ldb) { + int i,j,LDA,LDB; + CBLAS_TEST_COMPLEX *A, *B; + enum CBLAS_SIDE side; + enum CBLAS_DIAG diag; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + get_diag_type(diagn,&diag); + get_side_type(rtlf,&side); + + if (*order == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + LDB = *n+1; + B=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + cblas_ctrmm(CblasRowMajor, side, uplo, trans, 
diag, *m, *n, alpha, + A, LDA, B, LDB ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + b[j*(*ldb)+i].real=B[i*LDB+j].real; + b[j*(*ldb)+i].imag=B[i*LDB+j].imag; + } + free(A); + free(B); + } + else if (*order == TEST_COL_MJR) + cblas_ctrmm(CblasColMajor, side, uplo, trans, diag, *m, *n, alpha, + a, *lda, b, *ldb); + else + cblas_ctrmm(UNDEFINED, side, uplo, trans, diag, *m, *n, alpha, + a, *lda, b, *ldb); +} + +void F77_ctrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, + int *m, int *n, CBLAS_TEST_COMPLEX *alpha, CBLAS_TEST_COMPLEX *a, + int *lda, CBLAS_TEST_COMPLEX *b, int *ldb) { + int i,j,LDA,LDB; + CBLAS_TEST_COMPLEX *A, *B; + enum CBLAS_SIDE side; + enum CBLAS_DIAG diag; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + get_diag_type(diagn,&diag); + get_side_type(rtlf,&side); + + if (*order == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A=(CBLAS_TEST_COMPLEX* )malloc( (*m)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + LDB = *n+1; + B=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_COMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + cblas_ctrsm(CblasRowMajor, side, uplo, trans, diag, *m, *n, alpha, + A, LDA, B, LDB ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + b[j*(*ldb)+i].real=B[i*LDB+j].real; + b[j*(*ldb)+i].imag=B[i*LDB+j].imag; + } + free(A); + free(B); + } + else if (*order == TEST_COL_MJR) + cblas_ctrsm(CblasColMajor, side, uplo, trans, diag, *m, *n, alpha, + a, *lda, b, *ldb); + else + cblas_ctrsm(UNDEFINED, side, uplo, trans, diag, *m, *n, alpha, + a, *lda, b, *ldb); +} diff --git a/ctest/c_cblat1.f b/ctest/c_cblat1.f new file mode 100644 index 0000000000..c741ce5064 --- /dev/null +++ b/ctest/c_cblat1.f @@ -0,0 +1,682 @@ + PROGRAM CCBLAT1 +* Test program for the COMPLEX Level 1 CBLAS. +* Based upon the original CBLAS test routine together with: +* F06GAF Example Program Text +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + REAL SFAC + INTEGER IC +* .. External Subroutines .. + EXTERNAL CHECK1, CHECK2, HEADER +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA SFAC/9.765625E-4/ +* .. Executable Statements .. + WRITE (NOUT,99999) + DO 20 IC = 1, 10 + ICASE = IC + CALL HEADER +* +* Initialize PASS, INCX, INCY, and MODE for a new case. +* The value 9999 for INCX, INCY or MODE will appear in the +* detailed output, if any, for cases that do not involve +* these parameters. +* + PASS = .TRUE. + INCX = 9999 + INCY = 9999 + MODE = 9999 + IF (ICASE.LE.5) THEN + CALL CHECK2(SFAC) + ELSE IF (ICASE.GE.6) THEN + CALL CHECK1(SFAC) + END IF +* -- Print + IF (PASS) WRITE (NOUT,99998) + 20 CONTINUE + STOP +* +99999 FORMAT (' Complex CBLAS Test Program Results',/1X) +99998 FORMAT (' ----- PASS -----') + END + SUBROUTINE HEADER +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. 
Local Arrays .. + CHARACTER*15 L(10) +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA L(1)/'CBLAS_CDOTC'/ + DATA L(2)/'CBLAS_CDOTU'/ + DATA L(3)/'CBLAS_CAXPY'/ + DATA L(4)/'CBLAS_CCOPY'/ + DATA L(5)/'CBLAS_CSWAP'/ + DATA L(6)/'CBLAS_SCNRM2'/ + DATA L(7)/'CBLAS_SCASUM'/ + DATA L(8)/'CBLAS_CSCAL'/ + DATA L(9)/'CBLAS_CSSCAL'/ + DATA L(10)/'CBLAS_ICAMAX'/ +* .. Executable Statements .. + WRITE (NOUT,99999) ICASE, L(ICASE) + RETURN +* +99999 FORMAT (/' Test of subprogram number',I3,9X,A15) + END + SUBROUTINE CHECK1(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + REAL SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + COMPLEX CA + REAL SA + INTEGER I, J, LEN, NP1 +* .. Local Arrays .. + COMPLEX CTRUE5(8,5,2), CTRUE6(8,5,2), CV(8,5,2), CX(8), + + MWPCS(5), MWPCT(5) + REAL STRUE2(5), STRUE4(5) + INTEGER ITRUE3(5) +* .. External Functions .. + REAL SCASUMTEST, SCNRM2TEST + INTEGER ICAMAXTEST + EXTERNAL SCASUMTEST, SCNRM2TEST, ICAMAXTEST +* .. External Subroutines .. + EXTERNAL CSCAL, CSSCALTEST, CTEST, ITEST1, STEST1 +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA SA, CA/0.3E0, (0.4E0,-0.7E0)/ + DATA ((CV(I,J,1),I=1,8),J=1,5)/(0.1E0,0.1E0), + + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), + + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), + + (1.0E0,2.0E0), (0.3E0,-0.4E0), (3.0E0,4.0E0), + + (3.0E0,4.0E0), (3.0E0,4.0E0), (3.0E0,4.0E0), + + (3.0E0,4.0E0), (3.0E0,4.0E0), (3.0E0,4.0E0), + + (0.1E0,-0.3E0), (0.5E0,-0.1E0), (5.0E0,6.0E0), + + (5.0E0,6.0E0), (5.0E0,6.0E0), (5.0E0,6.0E0), + + (5.0E0,6.0E0), (5.0E0,6.0E0), (0.1E0,0.1E0), + + (-0.6E0,0.1E0), (0.1E0,-0.3E0), (7.0E0,8.0E0), + + (7.0E0,8.0E0), (7.0E0,8.0E0), (7.0E0,8.0E0), + + (7.0E0,8.0E0), (0.3E0,0.1E0), (0.1E0,0.4E0), + + (0.4E0,0.1E0), (0.1E0,0.2E0), (2.0E0,3.0E0), + + (2.0E0,3.0E0), (2.0E0,3.0E0), (2.0E0,3.0E0)/ + DATA ((CV(I,J,2),I=1,8),J=1,5)/(0.1E0,0.1E0), + + (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0), + + (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0), + + (4.0E0,5.0E0), (0.3E0,-0.4E0), (6.0E0,7.0E0), + + (6.0E0,7.0E0), (6.0E0,7.0E0), (6.0E0,7.0E0), + + (6.0E0,7.0E0), (6.0E0,7.0E0), (6.0E0,7.0E0), + + (0.1E0,-0.3E0), (8.0E0,9.0E0), (0.5E0,-0.1E0), + + (2.0E0,5.0E0), (2.0E0,5.0E0), (2.0E0,5.0E0), + + (2.0E0,5.0E0), (2.0E0,5.0E0), (0.1E0,0.1E0), + + (3.0E0,6.0E0), (-0.6E0,0.1E0), (4.0E0,7.0E0), + + (0.1E0,-0.3E0), (7.0E0,2.0E0), (7.0E0,2.0E0), + + (7.0E0,2.0E0), (0.3E0,0.1E0), (5.0E0,8.0E0), + + (0.1E0,0.4E0), (6.0E0,9.0E0), (0.4E0,0.1E0), + + (8.0E0,3.0E0), (0.1E0,0.2E0), (9.0E0,4.0E0)/ + DATA STRUE2/0.0E0, 0.5E0, 0.6E0, 0.7E0, 0.7E0/ + DATA STRUE4/0.0E0, 0.7E0, 1.0E0, 1.3E0, 1.7E0/ + DATA ((CTRUE5(I,J,1),I=1,8),J=1,5)/(0.1E0,0.1E0), + + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), + + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), + + (1.0E0,2.0E0), (-0.16E0,-0.37E0), (3.0E0,4.0E0), + + (3.0E0,4.0E0), (3.0E0,4.0E0), (3.0E0,4.0E0), + + (3.0E0,4.0E0), (3.0E0,4.0E0), (3.0E0,4.0E0), + + (-0.17E0,-0.19E0), (0.13E0,-0.39E0), + + (5.0E0,6.0E0), (5.0E0,6.0E0), (5.0E0,6.0E0), + + (5.0E0,6.0E0), (5.0E0,6.0E0), (5.0E0,6.0E0), + + (0.11E0,-0.03E0), (-0.17E0,0.46E0), + + (-0.17E0,-0.19E0), (7.0E0,8.0E0), (7.0E0,8.0E0), + + (7.0E0,8.0E0), (7.0E0,8.0E0), (7.0E0,8.0E0), + + (0.19E0,-0.17E0), (0.32E0,0.09E0), + + (0.23E0,-0.24E0), (0.18E0,0.01E0), + + (2.0E0,3.0E0), (2.0E0,3.0E0), (2.0E0,3.0E0), + + 
(2.0E0,3.0E0)/ + DATA ((CTRUE5(I,J,2),I=1,8),J=1,5)/(0.1E0,0.1E0), + + (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0), + + (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0), + + (4.0E0,5.0E0), (-0.16E0,-0.37E0), (6.0E0,7.0E0), + + (6.0E0,7.0E0), (6.0E0,7.0E0), (6.0E0,7.0E0), + + (6.0E0,7.0E0), (6.0E0,7.0E0), (6.0E0,7.0E0), + + (-0.17E0,-0.19E0), (8.0E0,9.0E0), + + (0.13E0,-0.39E0), (2.0E0,5.0E0), (2.0E0,5.0E0), + + (2.0E0,5.0E0), (2.0E0,5.0E0), (2.0E0,5.0E0), + + (0.11E0,-0.03E0), (3.0E0,6.0E0), + + (-0.17E0,0.46E0), (4.0E0,7.0E0), + + (-0.17E0,-0.19E0), (7.0E0,2.0E0), (7.0E0,2.0E0), + + (7.0E0,2.0E0), (0.19E0,-0.17E0), (5.0E0,8.0E0), + + (0.32E0,0.09E0), (6.0E0,9.0E0), + + (0.23E0,-0.24E0), (8.0E0,3.0E0), + + (0.18E0,0.01E0), (9.0E0,4.0E0)/ + DATA ((CTRUE6(I,J,1),I=1,8),J=1,5)/(0.1E0,0.1E0), + + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), + + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), + + (1.0E0,2.0E0), (0.09E0,-0.12E0), (3.0E0,4.0E0), + + (3.0E0,4.0E0), (3.0E0,4.0E0), (3.0E0,4.0E0), + + (3.0E0,4.0E0), (3.0E0,4.0E0), (3.0E0,4.0E0), + + (0.03E0,-0.09E0), (0.15E0,-0.03E0), + + (5.0E0,6.0E0), (5.0E0,6.0E0), (5.0E0,6.0E0), + + (5.0E0,6.0E0), (5.0E0,6.0E0), (5.0E0,6.0E0), + + (0.03E0,0.03E0), (-0.18E0,0.03E0), + + (0.03E0,-0.09E0), (7.0E0,8.0E0), (7.0E0,8.0E0), + + (7.0E0,8.0E0), (7.0E0,8.0E0), (7.0E0,8.0E0), + + (0.09E0,0.03E0), (0.03E0,0.12E0), + + (0.12E0,0.03E0), (0.03E0,0.06E0), (2.0E0,3.0E0), + + (2.0E0,3.0E0), (2.0E0,3.0E0), (2.0E0,3.0E0)/ + DATA ((CTRUE6(I,J,2),I=1,8),J=1,5)/(0.1E0,0.1E0), + + (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0), + + (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0), + + (4.0E0,5.0E0), (0.09E0,-0.12E0), (6.0E0,7.0E0), + + (6.0E0,7.0E0), (6.0E0,7.0E0), (6.0E0,7.0E0), + + (6.0E0,7.0E0), (6.0E0,7.0E0), (6.0E0,7.0E0), + + (0.03E0,-0.09E0), (8.0E0,9.0E0), + + (0.15E0,-0.03E0), (2.0E0,5.0E0), (2.0E0,5.0E0), + + (2.0E0,5.0E0), (2.0E0,5.0E0), (2.0E0,5.0E0), + + (0.03E0,0.03E0), (3.0E0,6.0E0), + + (-0.18E0,0.03E0), (4.0E0,7.0E0), + + (0.03E0,-0.09E0), (7.0E0,2.0E0), (7.0E0,2.0E0), + + (7.0E0,2.0E0), (0.09E0,0.03E0), (5.0E0,8.0E0), + + (0.03E0,0.12E0), (6.0E0,9.0E0), (0.12E0,0.03E0), + + (8.0E0,3.0E0), (0.03E0,0.06E0), (9.0E0,4.0E0)/ + DATA ITRUE3/0, 1, 2, 2, 2/ +* .. Executable Statements .. + DO 60 INCX = 1, 2 + DO 40 NP1 = 1, 5 + N = NP1 - 1 + LEN = 2*MAX(N,1) +* .. Set vector arguments .. + DO 20 I = 1, LEN + CX(I) = CV(I,NP1,INCX) + 20 CONTINUE + IF (ICASE.EQ.6) THEN +* .. SCNRM2TEST .. + CALL STEST1(SCNRM2TEST(N,CX,INCX),STRUE2(NP1), + + STRUE2(NP1), SFAC) + ELSE IF (ICASE.EQ.7) THEN +* .. SCASUMTEST .. + CALL STEST1(SCASUMTEST(N,CX,INCX),STRUE4(NP1), + + STRUE4(NP1),SFAC) + ELSE IF (ICASE.EQ.8) THEN +* .. CSCAL .. + CALL CSCAL(N,CA,CX,INCX) + CALL CTEST(LEN,CX,CTRUE5(1,NP1,INCX),CTRUE5(1,NP1,INCX), + + SFAC) + ELSE IF (ICASE.EQ.9) THEN +* .. CSSCALTEST .. + CALL CSSCALTEST(N,SA,CX,INCX) + CALL CTEST(LEN,CX,CTRUE6(1,NP1,INCX),CTRUE6(1,NP1,INCX), + + SFAC) + ELSE IF (ICASE.EQ.10) THEN +* .. ICAMAXTEST .. + CALL ITEST1(ICAMAXTEST(N,CX,INCX),ITRUE3(NP1)) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' + STOP + END IF +* + 40 CONTINUE + 60 CONTINUE +* + INCX = 1 + IF (ICASE.EQ.8) THEN +* CSCAL +* Add a test for alpha equal to zero. + CA = (0.0E0,0.0E0) + DO 80 I = 1, 5 + MWPCT(I) = (0.0E0,0.0E0) + MWPCS(I) = (1.0E0,1.0E0) + 80 CONTINUE + CALL CSCAL(5,CA,CX,INCX) + CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) + ELSE IF (ICASE.EQ.9) THEN +* CSSCALTEST +* Add a test for alpha equal to zero. 
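+* (MWPCT holds the expected zero result and MWPCS the size vector for CTEST)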
+ SA = 0.0E0 + DO 100 I = 1, 5 + MWPCT(I) = (0.0E0,0.0E0) + MWPCS(I) = (1.0E0,1.0E0) + 100 CONTINUE + CALL CSSCALTEST(5,SA,CX,INCX) + CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) +* Add a test for alpha equal to one. + SA = 1.0E0 + DO 120 I = 1, 5 + MWPCT(I) = CX(I) + MWPCS(I) = CX(I) + 120 CONTINUE + CALL CSSCALTEST(5,SA,CX,INCX) + CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) +* Add a test for alpha equal to minus one. + SA = -1.0E0 + DO 140 I = 1, 5 + MWPCT(I) = -CX(I) + MWPCS(I) = -CX(I) + 140 CONTINUE + CALL CSSCALTEST(5,SA,CX,INCX) + CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) + END IF + RETURN + END + SUBROUTINE CHECK2(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + REAL SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + COMPLEX CA,CTEMP + INTEGER I, J, KI, KN, KSIZE, LENX, LENY, MX, MY +* .. Local Arrays .. + COMPLEX CDOT(1), CSIZE1(4), CSIZE2(7,2), CSIZE3(14), + + CT10X(7,4,4), CT10Y(7,4,4), CT6(4,4), CT7(4,4), + + CT8(7,4,4), CX(7), CX1(7), CY(7), CY1(7) + INTEGER INCXS(4), INCYS(4), LENS(4,2), NS(4) +* .. External Functions .. + EXTERNAL CDOTCTEST, CDOTUTEST +* .. External Subroutines .. + EXTERNAL CAXPYTEST, CCOPYTEST, CSWAPTEST, CTEST +* .. Intrinsic Functions .. + INTRINSIC ABS, MIN +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA CA/(0.4E0,-0.7E0)/ + DATA INCXS/1, 2, -2, -1/ + DATA INCYS/1, -2, 1, -2/ + DATA LENS/1, 1, 2, 4, 1, 1, 3, 7/ + DATA NS/0, 1, 2, 4/ + DATA CX1/(0.7E0,-0.8E0), (-0.4E0,-0.7E0), + + (-0.1E0,-0.9E0), (0.2E0,-0.8E0), + + (-0.9E0,-0.4E0), (0.1E0,0.4E0), (-0.6E0,0.6E0)/ + DATA CY1/(0.6E0,-0.6E0), (-0.9E0,0.5E0), + + (0.7E0,-0.6E0), (0.1E0,-0.5E0), (-0.1E0,-0.2E0), + + (-0.5E0,-0.3E0), (0.8E0,-0.7E0)/ + DATA ((CT8(I,J,1),I=1,7),J=1,4)/(0.6E0,-0.6E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.32E0,-1.41E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.32E0,-1.41E0), + + (-1.55E0,0.5E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.32E0,-1.41E0), (-1.55E0,0.5E0), + + (0.03E0,-0.89E0), (-0.38E0,-0.96E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0)/ + DATA ((CT8(I,J,2),I=1,7),J=1,4)/(0.6E0,-0.6E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.32E0,-1.41E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (-0.07E0,-0.89E0), + + (-0.9E0,0.5E0), (0.42E0,-1.41E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.78E0,0.06E0), (-0.9E0,0.5E0), + + (0.06E0,-0.13E0), (0.1E0,-0.5E0), + + (-0.77E0,-0.49E0), (-0.5E0,-0.3E0), + + (0.52E0,-1.51E0)/ + DATA ((CT8(I,J,3),I=1,7),J=1,4)/(0.6E0,-0.6E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.32E0,-1.41E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (-0.07E0,-0.89E0), + + (-1.18E0,-0.31E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.78E0,0.06E0), (-1.54E0,0.97E0), + + (0.03E0,-0.89E0), (-0.18E0,-1.31E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0)/ + DATA ((CT8(I,J,4),I=1,7),J=1,4)/(0.6E0,-0.6E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.32E0,-1.41E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + 
(0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.32E0,-1.41E0), (-0.9E0,0.5E0), + + (0.05E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.32E0,-1.41E0), + + (-0.9E0,0.5E0), (0.05E0,-0.6E0), (0.1E0,-0.5E0), + + (-0.77E0,-0.49E0), (-0.5E0,-0.3E0), + + (0.32E0,-1.16E0)/ + DATA CT7/(0.0E0,0.0E0), (-0.06E0,-0.90E0), + + (0.65E0,-0.47E0), (-0.34E0,-1.22E0), + + (0.0E0,0.0E0), (-0.06E0,-0.90E0), + + (-0.59E0,-1.46E0), (-1.04E0,-0.04E0), + + (0.0E0,0.0E0), (-0.06E0,-0.90E0), + + (-0.83E0,0.59E0), (0.07E0,-0.37E0), + + (0.0E0,0.0E0), (-0.06E0,-0.90E0), + + (-0.76E0,-1.15E0), (-1.33E0,-1.82E0)/ + DATA CT6/(0.0E0,0.0E0), (0.90E0,0.06E0), + + (0.91E0,-0.77E0), (1.80E0,-0.10E0), + + (0.0E0,0.0E0), (0.90E0,0.06E0), (1.45E0,0.74E0), + + (0.20E0,0.90E0), (0.0E0,0.0E0), (0.90E0,0.06E0), + + (-0.55E0,0.23E0), (0.83E0,-0.39E0), + + (0.0E0,0.0E0), (0.90E0,0.06E0), (1.04E0,0.79E0), + + (1.95E0,1.22E0)/ + DATA ((CT10X(I,J,1),I=1,7),J=1,4)/(0.7E0,-0.8E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.6E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.6E0,-0.6E0), (-0.9E0,0.5E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.6E0,-0.6E0), + + (-0.9E0,0.5E0), (0.7E0,-0.6E0), (0.1E0,-0.5E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0)/ + DATA ((CT10X(I,J,2),I=1,7),J=1,4)/(0.7E0,-0.8E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.6E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.7E0,-0.6E0), (-0.4E0,-0.7E0), + + (0.6E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.8E0,-0.7E0), + + (-0.4E0,-0.7E0), (-0.1E0,-0.2E0), + + (0.2E0,-0.8E0), (0.7E0,-0.6E0), (0.1E0,0.4E0), + + (0.6E0,-0.6E0)/ + DATA ((CT10X(I,J,3),I=1,7),J=1,4)/(0.7E0,-0.8E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.6E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (-0.9E0,0.5E0), (-0.4E0,-0.7E0), + + (0.6E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.1E0,-0.5E0), + + (-0.4E0,-0.7E0), (0.7E0,-0.6E0), (0.2E0,-0.8E0), + + (-0.9E0,0.5E0), (0.1E0,0.4E0), (0.6E0,-0.6E0)/ + DATA ((CT10X(I,J,4),I=1,7),J=1,4)/(0.7E0,-0.8E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.6E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.6E0,-0.6E0), (0.7E0,-0.6E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.6E0,-0.6E0), + + (0.7E0,-0.6E0), (-0.1E0,-0.2E0), (0.8E0,-0.7E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0)/ + DATA ((CT10Y(I,J,1),I=1,7),J=1,4)/(0.6E0,-0.6E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.7E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.7E0,-0.8E0), (-0.4E0,-0.7E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.7E0,-0.8E0), + + (-0.4E0,-0.7E0), (-0.1E0,-0.9E0), + + (0.2E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0)/ + DATA ((CT10Y(I,J,2),I=1,7),J=1,4)/(0.6E0,-0.6E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), 
(0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.7E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (-0.1E0,-0.9E0), (-0.9E0,0.5E0), + + (0.7E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (-0.6E0,0.6E0), + + (-0.9E0,0.5E0), (-0.9E0,-0.4E0), (0.1E0,-0.5E0), + + (-0.1E0,-0.9E0), (-0.5E0,-0.3E0), + + (0.7E0,-0.8E0)/ + DATA ((CT10Y(I,J,3),I=1,7),J=1,4)/(0.6E0,-0.6E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.7E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (-0.1E0,-0.9E0), (0.7E0,-0.8E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (-0.6E0,0.6E0), + + (-0.9E0,-0.4E0), (-0.1E0,-0.9E0), + + (0.7E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0)/ + DATA ((CT10Y(I,J,4),I=1,7),J=1,4)/(0.6E0,-0.6E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.7E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.7E0,-0.8E0), (-0.9E0,0.5E0), + + (-0.4E0,-0.7E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.7E0,-0.8E0), + + (-0.9E0,0.5E0), (-0.4E0,-0.7E0), (0.1E0,-0.5E0), + + (-0.1E0,-0.9E0), (-0.5E0,-0.3E0), + + (0.2E0,-0.8E0)/ + DATA CSIZE1/(0.0E0,0.0E0), (0.9E0,0.9E0), + + (1.63E0,1.73E0), (2.90E0,2.78E0)/ + DATA CSIZE3/(0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (1.17E0,1.17E0), + + (1.17E0,1.17E0), (1.17E0,1.17E0), + + (1.17E0,1.17E0), (1.17E0,1.17E0), + + (1.17E0,1.17E0), (1.17E0,1.17E0)/ + DATA CSIZE2/(0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (1.54E0,1.54E0), + + (1.54E0,1.54E0), (1.54E0,1.54E0), + + (1.54E0,1.54E0), (1.54E0,1.54E0), + + (1.54E0,1.54E0), (1.54E0,1.54E0)/ +* .. Executable Statements .. + DO 60 KI = 1, 4 + INCX = INCXS(KI) + INCY = INCYS(KI) + MX = ABS(INCX) + MY = ABS(INCY) +* + DO 40 KN = 1, 4 + N = NS(KN) + KSIZE = MIN(2,KN) + LENX = LENS(KN,MX) + LENY = LENS(KN,MY) +* .. initialize all argument arrays .. + DO 20 I = 1, 7 + CX(I) = CX1(I) + CY(I) = CY1(I) + 20 CONTINUE + IF (ICASE.EQ.1) THEN +* .. CDOTCTEST .. + CALL CDOTCTEST(N,CX,INCX,CY,INCY,CTEMP) + CDOT(1) = CTEMP + CALL CTEST(1,CDOT,CT6(KN,KI),CSIZE1(KN),SFAC) + ELSE IF (ICASE.EQ.2) THEN +* .. CDOTUTEST .. + CALL CDOTUTEST(N,CX,INCX,CY,INCY,CTEMP) + CDOT(1) = CTEMP + CALL CTEST(1,CDOT,CT7(KN,KI),CSIZE1(KN),SFAC) + ELSE IF (ICASE.EQ.3) THEN +* .. CAXPYTEST .. + CALL CAXPYTEST(N,CA,CX,INCX,CY,INCY) + CALL CTEST(LENY,CY,CT8(1,KN,KI),CSIZE2(1,KSIZE),SFAC) + ELSE IF (ICASE.EQ.4) THEN +* .. CCOPYTEST .. + CALL CCOPYTEST(N,CX,INCX,CY,INCY) + CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0E0) + ELSE IF (ICASE.EQ.5) THEN +* .. CSWAPTEST .. + CALL CSWAPTEST(N,CX,INCX,CY,INCY) + CALL CTEST(LENX,CX,CT10X(1,KN,KI),CSIZE3,1.0E0) + CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0E0) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' + STOP + END IF +* + 40 CONTINUE + 60 CONTINUE + RETURN + END + SUBROUTINE STEST(LEN,SCOMP,STRUE,SSIZE,SFAC) +* ********************************* STEST ************************** +* +* THIS SUBR COMPARES ARRAYS SCOMP() AND STRUE() OF LENGTH LEN TO +* SEE IF THE TERM BY TERM DIFFERENCES, MULTIPLIED BY SFAC, ARE +* NEGLIGIBLE. +* +* C. L. 
LAWSON, JPL, 1974 DEC 10 +* +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + REAL SFAC + INTEGER LEN +* .. Array Arguments .. + REAL SCOMP(LEN), SSIZE(LEN), STRUE(LEN) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + REAL SD + INTEGER I +* .. External Functions .. + REAL SDIFF + EXTERNAL SDIFF +* .. Intrinsic Functions .. + INTRINSIC ABS +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Executable Statements .. +* + DO 40 I = 1, LEN + SD = SCOMP(I) - STRUE(I) + IF (SDIFF(ABS(SSIZE(I))+ABS(SFAC*SD),ABS(SSIZE(I))).EQ.0.0E0) + + GO TO 40 +* +* HERE SCOMP(I) IS NOT CLOSE TO STRUE(I). +* + IF ( .NOT. PASS) GO TO 20 +* PRINT FAIL MESSAGE AND HEADER. + PASS = .FALSE. + WRITE (NOUT,99999) + WRITE (NOUT,99998) + 20 WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, I, SCOMP(I), + + STRUE(I), SD, SSIZE(I) + 40 CONTINUE + RETURN +* +99999 FORMAT (' FAIL') +99998 FORMAT (/' CASE N INCX INCY MODE I ', + + ' COMP(I) TRUE(I) DIFFERENCE', + + ' SIZE(I)',/1X) +99997 FORMAT (1X,I4,I3,3I5,I3,2E36.8,2E12.4) + END + SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) +* ************************* STEST1 ***************************** +* +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE +* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. +* +* C.L. LAWSON, JPL, 1978 DEC 6 +* +* .. Scalar Arguments .. + REAL SCOMP1, SFAC, STRUE1 +* .. Array Arguments .. + REAL SSIZE(*) +* .. Local Arrays .. + REAL SCOMP(1), STRUE(1) +* .. External Subroutines .. + EXTERNAL STEST +* .. Executable Statements .. +* + SCOMP(1) = SCOMP1 + STRUE(1) = STRUE1 + CALL STEST(1,SCOMP,STRUE,SSIZE,SFAC) +* + RETURN + END + REAL FUNCTION SDIFF(SA,SB) +* ********************************* SDIFF ************************** +* COMPUTES DIFFERENCE OF TWO NUMBERS. C. L. LAWSON, JPL 1974 FEB 15 +* +* .. Scalar Arguments .. + REAL SA, SB +* .. Executable Statements .. + SDIFF = SA - SB + RETURN + END + SUBROUTINE CTEST(LEN,CCOMP,CTRUE,CSIZE,SFAC) +* **************************** CTEST ***************************** +* +* C.L. LAWSON, JPL, 1978 DEC 6 +* +* .. Scalar Arguments .. + REAL SFAC + INTEGER LEN +* .. Array Arguments .. + COMPLEX CCOMP(LEN), CSIZE(LEN), CTRUE(LEN) +* .. Local Scalars .. + INTEGER I +* .. Local Arrays .. + REAL SCOMP(20), SSIZE(20), STRUE(20) +* .. External Subroutines .. + EXTERNAL STEST +* .. Intrinsic Functions .. + INTRINSIC AIMAG, REAL +* .. Executable Statements .. + DO 20 I = 1, LEN + SCOMP(2*I-1) = REAL(CCOMP(I)) + SCOMP(2*I) = AIMAG(CCOMP(I)) + STRUE(2*I-1) = REAL(CTRUE(I)) + STRUE(2*I) = AIMAG(CTRUE(I)) + SSIZE(2*I-1) = REAL(CSIZE(I)) + SSIZE(2*I) = AIMAG(CSIZE(I)) + 20 CONTINUE +* + CALL STEST(2*LEN,SCOMP,STRUE,SSIZE,SFAC) + RETURN + END + SUBROUTINE ITEST1(ICOMP,ITRUE) +* ********************************* ITEST1 ************************* +* +* THIS SUBROUTINE COMPARES THE VARIABLES ICOMP AND ITRUE FOR +* EQUALITY. +* C. L. LAWSON, JPL, 1974 DEC 10 +* +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + INTEGER ICOMP, ITRUE +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + INTEGER ID +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Executable Statements .. + IF (ICOMP.EQ.ITRUE) GO TO 40 +* +* HERE ICOMP IS NOT EQUAL TO ITRUE. +* + IF ( .NOT. PASS) GO TO 20 +* PRINT FAIL MESSAGE AND HEADER. + PASS = .FALSE. 
+ WRITE (NOUT,99999) + WRITE (NOUT,99998) + 20 ID = ICOMP - ITRUE + WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, ICOMP, ITRUE, ID + 40 CONTINUE + RETURN +* +99999 FORMAT (' FAIL') +99998 FORMAT (/' CASE N INCX INCY MODE ', + + ' COMP TRUE DIFFERENCE', + + /1X) +99997 FORMAT (1X,I4,I3,3I5,2I36,I12) + END diff --git a/ctest/c_cblat2.f b/ctest/c_cblat2.f new file mode 100644 index 0000000000..545ba4b9fc --- /dev/null +++ b/ctest/c_cblat2.f @@ -0,0 +1,2932 @@ + PROGRAM CBLAT2 +* +* Test program for the COMPLEX Level 2 Blas. +* +* The program must be driven by a short data file. The first 17 records +* of the file are read using list-directed input, the last 17 records +* are read using the format ( A12, L2 ). An annotated example of a data +* file can be obtained by deleting the first 3 characters from the +* following 34 lines: +* 'CBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE +* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +* F LOGICAL FLAG, T TO STOP ON FAILURES. +* T LOGICAL FLAG, T TO TEST ERROR EXITS. +* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH +* 16.0 THRESHOLD VALUE OF TEST RATIO +* 6 NUMBER OF VALUES OF N +* 0 1 2 3 5 9 VALUES OF N +* 4 NUMBER OF VALUES OF K +* 0 1 2 4 VALUES OF K +* 4 NUMBER OF VALUES OF INCX AND INCY +* 1 2 -1 -2 VALUES OF INCX AND INCY +* 3 NUMBER OF VALUES OF ALPHA +* (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +* 3 NUMBER OF VALUES OF BETA +* (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +* cblas_cgemv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_cgbmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_chemv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_chbmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_chpmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ctrmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ctbmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ctpmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ctrsv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ctbsv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ctpsv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_cgerc T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_cgeru T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_cher T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_chpr T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_cher2 T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_chpr2 T PUT F FOR NO TEST. SAME COLUMNS. +* +* See: +* +* Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. +* An extended set of Fortran Basic Linear Algebra Subprograms. +* +* Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics +* and Computer Science Division, Argonne National Laboratory, +* 9700 South Cass Avenue, Argonne, Illinois 60439, US. +* +* Or +* +* NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms +* Group Ltd., NAG Central Office, 256 Banbury Road, Oxford +* OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st +* Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. +* +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. 
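The Level 1 comparison helpers imported just above (STEST, STEST1, SDIFF, CTEST, ITEST1) accept a computed value when the scaled difference vanishes at working precision: SD = SCOMP - STRUE is treated as negligible if |SSIZE| + |SFAC*SD| compares equal to |SSIZE|, with the subtraction routed through the separately compiled SDIFF. A minimal sketch of that criterion in C, for illustration only; the names sdiff and negligible are invented here and are not part of the imported files:

    #include <math.h>

    /* Mirrors the Fortran EXTERNAL SDIFF: route the subtraction through a
     * float-valued call, presumably so the test happens at storage precision.
     * An optimising C compiler may still inline this, unlike the Fortran. */
    static float sdiff(float sa, float sb) { return sa - sb; }

    /* STEST's criterion: comp is accepted if |size| + |sfac*(comp-true)|
     * rounds back to |size|, i.e. the scaled error is below one unit in the
     * last place of the reference magnitude.  CTEST applies the same test to
     * the real and imaginary parts of complex results separately. */
    static int negligible(float comp, float truth, float size, float sfac)
    {
        float sd = comp - truth;
        return sdiff(fabsf(size) + fabsf(sfac * sd), fabsf(size)) == 0.0f;
    }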
+ INTEGER NIN, NOUT + PARAMETER ( NIN = 5, NOUT = 6 ) + INTEGER NSUBS + PARAMETER ( NSUBS = 17 ) + COMPLEX ZERO, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) + REAL RZERO, RHALF, RONE + PARAMETER ( RZERO = 0.0, RHALF = 0.5, RONE = 1.0 ) + INTEGER NMAX, INCMAX + PARAMETER ( NMAX = 65, INCMAX = 2 ) + INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX + PARAMETER ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7, + $ NALMAX = 7, NBEMAX = 7 ) +* .. Local Scalars .. + REAL EPS, ERR, THRESH + INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NINC, NKB, + $ NTRA, LAYOUT + LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, + $ TSTERR, CORDER, RORDER + CHARACTER*1 TRANS + CHARACTER*12 SNAMET + CHARACTER*32 SNAPS +* .. Local Arrays .. + COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), + $ ALF( NALMAX ), AS( NMAX*NMAX ), BET( NBEMAX ), + $ X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( 2*NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDMAX ), INC( NINMAX ), KB( NKBMAX ) + LOGICAL LTEST( NSUBS ) + CHARACTER*12 SNAMES( NSUBS ) +* .. External Functions .. + REAL SDIFF + LOGICAL LCE + EXTERNAL SDIFF, LCE +* .. External Subroutines .. + EXTERNAL CCHK1, CCHK2, CCHK3, CCHK4, CCHK5, CCHK6, + $ CC2CHKE, CMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK + CHARACTER*12 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK + COMMON /SRNAMC/SRNAMT +* .. Data statements .. + DATA SNAMES/'cblas_cgemv ', 'cblas_cgbmv ', + $ 'cblas_chemv ','cblas_chbmv ','cblas_chpmv ', + $ 'cblas_ctrmv ','cblas_ctbmv ','cblas_ctpmv ', + $ 'cblas_ctrsv ','cblas_ctbsv ','cblas_ctpsv ', + $ 'cblas_cgerc ','cblas_cgeru ','cblas_cher ', + $ 'cblas_chpr ','cblas_cher2 ','cblas_chpr2 '/ +* .. Executable Statements .. +* + NOUTC = NOUT +* +* Read name and unit number for summary output file and open file. +* + READ( NIN, FMT = * )SNAPS + READ( NIN, FMT = * )NTRA + TRACE = NTRA.GE.0 + IF( TRACE )THEN + OPEN( NTRA, FILE = SNAPS ) + END IF +* Read the flag that directs rewinding of the snapshot file. + READ( NIN, FMT = * )REWI + REWI = REWI.AND.TRACE +* Read the flag that directs stopping on any failure. + READ( NIN, FMT = * )SFATAL +* Read the flag that indicates whether error exits are to be tested. + READ( NIN, FMT = * )TSTERR +* Read the flag that indicates whether row-major data layout to be tested. + READ( NIN, FMT = * )LAYOUT +* Read the threshold value of the test ratio + READ( NIN, FMT = * )THRESH +* +* Read and check the parameter values for the tests. 
+* +* Values of N + READ( NIN, FMT = * )NIDIM + IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN + WRITE( NOUT, FMT = 9997 )'N', NIDMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) + DO 10 I = 1, NIDIM + IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN + WRITE( NOUT, FMT = 9996 )NMAX + GO TO 230 + END IF + 10 CONTINUE +* Values of K + READ( NIN, FMT = * )NKB + IF( NKB.LT.1.OR.NKB.GT.NKBMAX )THEN + WRITE( NOUT, FMT = 9997 )'K', NKBMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( KB( I ), I = 1, NKB ) + DO 20 I = 1, NKB + IF( KB( I ).LT.0 )THEN + WRITE( NOUT, FMT = 9995 ) + GO TO 230 + END IF + 20 CONTINUE +* Values of INCX and INCY + READ( NIN, FMT = * )NINC + IF( NINC.LT.1.OR.NINC.GT.NINMAX )THEN + WRITE( NOUT, FMT = 9997 )'INCX AND INCY', NINMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( INC( I ), I = 1, NINC ) + DO 30 I = 1, NINC + IF( INC( I ).EQ.0.OR.ABS( INC( I ) ).GT.INCMAX )THEN + WRITE( NOUT, FMT = 9994 )INCMAX + GO TO 230 + END IF + 30 CONTINUE +* Values of ALPHA + READ( NIN, FMT = * )NALF + IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN + WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) +* Values of BETA + READ( NIN, FMT = * )NBET + IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN + WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) +* +* Report values of parameters. +* + WRITE( NOUT, FMT = 9993 ) + WRITE( NOUT, FMT = 9992 )( IDIM( I ), I = 1, NIDIM ) + WRITE( NOUT, FMT = 9991 )( KB( I ), I = 1, NKB ) + WRITE( NOUT, FMT = 9990 )( INC( I ), I = 1, NINC ) + WRITE( NOUT, FMT = 9989 )( ALF( I ), I = 1, NALF ) + WRITE( NOUT, FMT = 9988 )( BET( I ), I = 1, NBET ) + IF( .NOT.TSTERR )THEN + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9980 ) + END IF + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9999 )THRESH + WRITE( NOUT, FMT = * ) + + RORDER = .FALSE. + CORDER = .FALSE. + IF (LAYOUT.EQ.2) THEN + RORDER = .TRUE. + CORDER = .TRUE. + WRITE( *, FMT = 10002 ) + ELSE IF (LAYOUT.EQ.1) THEN + RORDER = .TRUE. + WRITE( *, FMT = 10001 ) + ELSE IF (LAYOUT.EQ.0) THEN + CORDER = .TRUE. + WRITE( *, FMT = 10000 ) + END IF + WRITE( *, FMT = * ) +* +* Read names of subroutines and flags which indicate +* whether they are to be tested. +* + DO 40 I = 1, NSUBS + LTEST( I ) = .FALSE. + 40 CONTINUE + 50 READ( NIN, FMT = 9984, END = 80 )SNAMET, LTESTT + DO 60 I = 1, NSUBS + IF( SNAMET.EQ.SNAMES( I ) ) + $ GO TO 70 + 60 CONTINUE + WRITE( NOUT, FMT = 9986 )SNAMET + STOP + 70 LTEST( I ) = LTESTT + GO TO 50 +* + 80 CONTINUE + CLOSE ( NIN ) +* +* Compute EPS (the machine precision). +* + EPS = RONE + 90 CONTINUE + IF( SDIFF( RONE + EPS, RONE ).EQ.RZERO ) + $ GO TO 100 + EPS = RHALF*EPS + GO TO 90 + 100 CONTINUE + EPS = EPS + EPS + WRITE( NOUT, FMT = 9998 )EPS +* +* Check the reliability of CMVCH using exact data. +* + N = MIN( 32, NMAX ) + DO 120 J = 1, N + DO 110 I = 1, N + A( I, J ) = MAX( I - J + 1, 0 ) + 110 CONTINUE + X( J ) = J + Y( J ) = ZERO + 120 CONTINUE + DO 130 J = 1, N + YY( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 + 130 CONTINUE +* YY holds the exact result. On exit from CMVCH YT holds +* the result computed by CMVCH. + TRANS = 'N' + CALL CMVCH( TRANS, N, N, ONE, A, NMAX, X, 1, ZERO, Y, 1, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, .TRUE. 
) + SAME = LCE( YY, YT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR + STOP + END IF + TRANS = 'T' + CALL CMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LCE( YY, YT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR + STOP + END IF +* +* Test each subroutine in turn. +* + DO 210 ISNUM = 1, NSUBS + WRITE( NOUT, FMT = * ) + IF( .NOT.LTEST( ISNUM ) )THEN +* Subprogram is not to be tested. + WRITE( NOUT, FMT = 9983 )SNAMES( ISNUM ) + ELSE + SRNAMT = SNAMES( ISNUM ) +* Test error exits. + IF( TSTERR )THEN + CALL CC2CHKE( SNAMES( ISNUM ) ) + WRITE( NOUT, FMT = * ) + END IF +* Test computations. + INFOT = 0 + OK = .TRUE. + FATAL = .FALSE. + GO TO ( 140, 140, 150, 150, 150, 160, 160, + $ 160, 160, 160, 160, 170, 170, 180, + $ 180, 190, 190 )ISNUM +* Test CGEMV, 01, and CGBMV, 02. + 140 IF (CORDER) THEN + CALL CCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G, 0 ) + END IF + IF (RORDER) THEN + CALL CCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G, 1 ) + END IF + GO TO 200 +* Test CHEMV, 03, CHBMV, 04, and CHPMV, 05. + 150 IF (CORDER) THEN + CALL CCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G, 0 ) + END IF + IF (RORDER) THEN + CALL CCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G, 1 ) + END IF + GO TO 200 +* Test CTRMV, 06, CTBMV, 07, CTPMV, 08, +* CTRSV, 09, CTBSV, 10, and CTPSV, 11. + 160 IF (CORDER) THEN + CALL CCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z, + $ 0 ) + END IF + IF (RORDER) THEN + CALL CCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z, + $ 1 ) + END IF + GO TO 200 +* Test CGERC, 12, CGERU, 13. + 170 IF (CORDER) THEN + CALL CCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 0 ) + END IF + IF (RORDER) THEN + CALL CCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 1 ) + END IF + GO TO 200 +* Test CHER, 14, and CHPR, 15. + 180 IF (CORDER) THEN + CALL CCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 0 ) + END IF + IF (RORDER) THEN + CALL CCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 1 ) + END IF + GO TO 200 +* Test CHER2, 16, and CHPR2, 17. 
+ 190 IF (CORDER) THEN + CALL CCHK6( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 0 ) + END IF + IF (RORDER) THEN + CALL CCHK6( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 1 ) + END IF +* + 200 IF( FATAL.AND.SFATAL ) + $ GO TO 220 + END IF + 210 CONTINUE + WRITE( NOUT, FMT = 9982 ) + GO TO 240 +* + 220 CONTINUE + WRITE( NOUT, FMT = 9981 ) + GO TO 240 +* + 230 CONTINUE + WRITE( NOUT, FMT = 9987 ) +* + 240 CONTINUE + IF( TRACE ) + $ CLOSE ( NTRA ) + CLOSE ( NOUT ) + STOP +* +10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) +10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' ) +10000 FORMAT( ' COLUMN-MAJOR DATA LAYOUT IS TESTED' ) + 9999 FORMAT(' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', + $ 'S THAN', F8.2 ) + 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, E9.1 ) + 9997 FORMAT(' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', + $ 'THAN ', I2 ) + 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) + 9995 FORMAT( ' VALUE OF K IS LESS THAN 0' ) + 9994 FORMAT( ' ABSOLUTE VALUE OF INCX OR INCY IS 0 OR GREATER THAN ', + $ I2 ) + 9993 FORMAT(' TESTS OF THE COMPLEX LEVEL 2 BLAS', //' THE F', + $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) + 9992 FORMAT( ' FOR N ', 9I6 ) + 9991 FORMAT( ' FOR K ', 7I6 ) + 9990 FORMAT( ' FOR INCX AND INCY ', 7I6 ) + 9989 FORMAT( ' FOR ALPHA ', + $ 7('(', F4.1, ',', F4.1, ') ', : ) ) + 9988 FORMAT( ' FOR BETA ', + $ 7('(', F4.1, ',', F4.1, ') ', : ) ) + 9987 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', + $ /' ******* TESTS ABANDONED *******' ) + 9986 FORMAT(' SUBPROGRAM NAME ',A12, ' NOT RECOGNIZED', /' ******* T', + $ 'ESTS ABANDONED *******' ) + 9985 FORMAT(' ERROR IN CMVCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', + $ 'ATED WRONGLY.', /' CMVCH WAS CALLED WITH TRANS = ', A1, + $ ' AND RETURNED SAME = ', L1, ' AND ERR = ', F12.3, '.', / + $ ' THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.' + $ , /' ******* TESTS ABANDONED *******' ) + 9984 FORMAT(A12, L2 ) + 9983 FORMAT( 1X,A12, ' WAS NOT TESTED' ) + 9982 FORMAT( /' END OF TESTS' ) + 9981 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) + 9980 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) +* +* End of CBLAT2. +* + END + SUBROUTINE CCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, + $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, + $ XS, Y, YY, YS, YT, G, IORDER ) +* +* Tests CGEMV and CGBMV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX ZERO, HALF + PARAMETER ( ZERO = ( 0.0, 0.0 ), HALF = ( 0.5, 0.0 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, + $ NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), BET( NBET ), X( NMAX ), + $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), + $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. 
+ COMPLEX ALPHA, ALS, BETA, BLS, TRANSL + REAL ERR, ERRMAX + INTEGER I, IA, IB, IC, IKU, IM, IN, INCX, INCXS, INCY, + $ INCYS, IX, IY, KL, KLS, KU, KUS, LAA, LDA, + $ LDAS, LX, LY, M, ML, MS, N, NARGS, NC, ND, NK, + $ NL, NS + LOGICAL BANDED, FULL, NULL, RESET, SAME, TRAN + CHARACTER*1 TRANS, TRANSS + CHARACTER*14 CTRANS + CHARACTER*3 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CCGBMV, CCGEMV, CMAKE, CMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICH/'NTC'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'e' + BANDED = SNAME( 9: 9 ).EQ.'b' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 11 + ELSE IF( BANDED )THEN + NARGS = 13 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 120 IN = 1, NIDIM + N = IDIM( IN ) + ND = N/2 + 1 +* + DO 110 IM = 1, 2 + IF( IM.EQ.1 ) + $ M = MAX( N - ND, 0 ) + IF( IM.EQ.2 ) + $ M = MIN( N + ND, NMAX ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IKU = 1, NK + IF( BANDED )THEN + KU = KB( IKU ) + KL = MAX( KU - 1, 0 ) + ELSE + KU = N - 1 + KL = M - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = KL + KU + 1 + ELSE + LDA = M + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + LAA = LDA*N + NULL = N.LE.0.OR.M.LE.0 +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL CMAKE( SNAME( 8: 9 ), ' ', ' ', M, N, A, NMAX, AA, + $ LDA, KL, KU, RESET, TRANSL ) +* + DO 90 IC = 1, 3 + TRANS = ICH( IC: IC ) + IF (TRANS.EQ.'N')THEN + CTRANS = ' CblasNoTrans' + ELSE IF (TRANS.EQ.'T')THEN + CTRANS = ' CblasTrans' + ELSE + CTRANS = 'CblasConjTrans' + END IF + TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' +* + IF( TRAN )THEN + ML = N + NL = M + ELSE + ML = M + NL = N + END IF +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*NL +* +* Generate the vector X. +* + TRANSL = HALF + CALL CMAKE( 'ge', ' ', ' ', 1, NL, X, 1, XX, + $ ABS( INCX ), 0, NL - 1, RESET, TRANSL ) + IF( NL.GT.1 )THEN + X( NL/2 ) = ZERO + XX( 1 + ABS( INCX )*( NL/2 - 1 ) ) = ZERO + END IF +* + DO 70 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*ML +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL CMAKE( 'ge', ' ', ' ', 1, ML, Y, 1, + $ YY, ABS( INCY ), 0, ML - 1, + $ RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + TRANSS = TRANS + MS = M + NS = N + KLS = KL + KUS = KU + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + BLS = BETA + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. 
+* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ CTRANS, M, N, ALPHA, LDA, INCX, BETA, + $ INCY + IF( REWI ) + $ REWIND NTRA + CALL CCGEMV( IORDER, TRANS, M, N, + $ ALPHA, AA, LDA, XX, INCX, + $ BETA, YY, INCY ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ CTRANS, M, N, KL, KU, ALPHA, LDA, + $ INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL CCGBMV( IORDER, TRANS, M, N, KL, + $ KU, ALPHA, AA, LDA, XX, + $ INCX, BETA, YY, INCY ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 130 + END IF +* +* See what data changed inside subroutines. +* +* IF(TRANS .NE. 'C' .OR. (INCX .GT. 0 .AND. INCY .GT. 0)) THEN + ISAME( 1 ) = TRANS.EQ.TRANSS + ISAME( 2 ) = MS.EQ.M + ISAME( 3 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 4 ) = ALS.EQ.ALPHA + ISAME( 5 ) = LCE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + ISAME( 7 ) = LCE( XS, XX, LX ) + ISAME( 8 ) = INCXS.EQ.INCX + ISAME( 9 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 10 ) = LCE( YS, YY, LY ) + ELSE + ISAME( 10 ) = LCERES( 'ge', ' ', 1, + $ ML, YS, YY, + $ ABS( INCY ) ) + END IF + ISAME( 11 ) = INCYS.EQ.INCY + ELSE IF( BANDED )THEN + ISAME( 4 ) = KLS.EQ.KL + ISAME( 5 ) = KUS.EQ.KU + ISAME( 6 ) = ALS.EQ.ALPHA + ISAME( 7 ) = LCE( AS, AA, LAA ) + ISAME( 8 ) = LDAS.EQ.LDA + ISAME( 9 ) = LCE( XS, XX, LX ) + ISAME( 10 ) = INCXS.EQ.INCX + ISAME( 11 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 12 ) = LCE( YS, YY, LY ) + ELSE + ISAME( 12 ) = LCERES( 'ge', ' ', 1, + $ ML, YS, YY, + $ ABS( INCY ) ) + END IF + ISAME( 13 ) = INCYS.EQ.INCY + END IF +* +* If data was incorrectly changed, report +* and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 130 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL CMVCH( TRANS, M, N, ALPHA, A, + $ NMAX, X, INCX, BETA, Y, + $ INCY, YT, G, YY, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 130 + ELSE +* Avoid repeating tests with M.le.0 or +* N.le.0. + GO TO 110 + END IF +* END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 140 +* + 130 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CTRANS, M, N, ALPHA, LDA, + $ INCX, BETA, INCY + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, CTRANS, M, N, KL, KU, + $ ALPHA, LDA, INCX, BETA, INCY + END IF +* + 140 CONTINUE + RETURN +* + 9999 FORMAT(' ',A12, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT(' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', 4( I3, ',' ), '(', + $ F4.1, ',', F4.1, '), A,',/ 10x, I3, ', X,', I2, ',(', + $ F4.1, ',', F4.1, '), Y,', I2, ') .' 
) + 9994 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', 2( I3, ',' ), '(', + $ F4.1, ',', F4.1, '), A,',/ 10x, I3, ', X,', I2, ',(', + $ F4.1, ',', F4.1, '), Y,', I2, ') .' ) + 9993 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK1. +* + END + SUBROUTINE CCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, + $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, + $ XS, Y, YY, YS, YT, G, IORDER ) +* +* Tests CHEMV, CHBMV and CHPMV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX ZERO, HALF + PARAMETER ( ZERO = ( 0.0, 0.0 ), HALF = ( 0.5, 0.0 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, + $ NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), BET( NBET ), X( NMAX ), + $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), + $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + COMPLEX ALPHA, ALS, BETA, BLS, TRANSL + REAL ERR, ERRMAX + INTEGER I, IA, IB, IC, IK, IN, INCX, INCXS, INCY, + $ INCYS, IX, IY, K, KS, LAA, LDA, LDAS, LX, LY, + $ N, NARGS, NC, NK, NS + LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME + CHARACTER*1 UPLO, UPLOS + CHARACTER*14 CUPLO + CHARACTER*2 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CCHBMV, CCHEMV, CCHPMV, CMAKE, CMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'e' + BANDED = SNAME( 9: 9 ).EQ.'b' + PACKED = SNAME( 9: 9 ).EQ.'p' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 10 + ELSE IF( BANDED )THEN + NARGS = 11 + ELSE IF( PACKED )THEN + NARGS = 9 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 110 IN = 1, NIDIM + N = IDIM( IN ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IK = 1, NK + IF( BANDED )THEN + K = KB( IK ) + ELSE + K = N - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = K + 1 + ELSE + LDA = N + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF + NULL = N.LE.0 +* + DO 90 IC = 1, 2 + UPLO = ICH( IC: IC ) + IF (UPLO.EQ.'U')THEN + CUPLO = ' CblasUpper' + ELSE + CUPLO = ' CblasLower' + END IF +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL CMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, A, NMAX, AA, + $ LDA, K, K, RESET, TRANSL ) +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. 
+* + TRANSL = HALF + CALL CMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, + $ ABS( INCX ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 70 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL CMAKE( 'ge', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, + $ TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + UPLOS = UPLO + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + BLS = BETA + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ CUPLO, N, ALPHA, LDA, INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL CCHEMV( IORDER, UPLO, N, ALPHA, AA, + $ LDA, XX, INCX, BETA, YY, + $ INCY ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ CUPLO, N, K, ALPHA, LDA, INCX, BETA, + $ INCY + IF( REWI ) + $ REWIND NTRA + CALL CCHBMV( IORDER, UPLO, N, K, ALPHA, + $ AA, LDA, XX, INCX, BETA, + $ YY, INCY ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ CUPLO, N, ALPHA, INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL CCHPMV( IORDER, UPLO, N, ALPHA, AA, + $ XX, INCX, BETA, YY, INCY ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LCE( AS, AA, LAA ) + ISAME( 5 ) = LDAS.EQ.LDA + ISAME( 6 ) = LCE( XS, XX, LX ) + ISAME( 7 ) = INCXS.EQ.INCX + ISAME( 8 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 9 ) = LCE( YS, YY, LY ) + ELSE + ISAME( 9 ) = LCERES( 'ge', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 10 ) = INCYS.EQ.INCY + ELSE IF( BANDED )THEN + ISAME( 3 ) = KS.EQ.K + ISAME( 4 ) = ALS.EQ.ALPHA + ISAME( 5 ) = LCE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + ISAME( 7 ) = LCE( XS, XX, LX ) + ISAME( 8 ) = INCXS.EQ.INCX + ISAME( 9 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 10 ) = LCE( YS, YY, LY ) + ELSE + ISAME( 10 ) = LCERES( 'ge', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 11 ) = INCYS.EQ.INCY + ELSE IF( PACKED )THEN + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LCE( AS, AA, LAA ) + ISAME( 5 ) = LCE( XS, XX, LX ) + ISAME( 6 ) = INCXS.EQ.INCX + ISAME( 7 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 8 ) = LCE( YS, YY, LY ) + ELSE + ISAME( 8 ) = LCERES( 'ge', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 9 ) = INCYS.EQ.INCY + END IF +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL CMVCH( 'N', N, N, ALPHA, A, NMAX, X, + $ INCX, BETA, Y, INCY, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. 
+ IF( FATAL ) + $ GO TO 120 + ELSE +* Avoid repeating tests with N.le.0 + GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, N, ALPHA, LDA, INCX, + $ BETA, INCY + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, N, K, ALPHA, LDA, + $ INCX, BETA, INCY + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, CUPLO, N, ALPHA, INCX, + $ BETA, INCY + END IF +* + 130 CONTINUE + RETURN +* + 9999 FORMAT(' ',A12, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT(' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',(', F4.1, ',', + $ F4.1, '), AP, X,',/ 10x, I2, ',(', F4.1, ',', F4.1, + $ '), Y,', I2, ') .' ) + 9994 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', 2( I3, ',' ), '(', + $ F4.1, ',', F4.1, '), A,', I3, ', X,',/ 10x, I2, ',(', + $ F4.1, ',', F4.1, '), Y,', I2, ') .' ) + 9993 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',(', F4.1, ',', + $ F4.1, '), A,', I3, ', X,',/ 10x, I2, ',(', F4.1, ',', + $ F4.1, '), ', 'Y,', I2, ') .' ) + 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK2. +* + END + SUBROUTINE CCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, XT, G, Z, IORDER ) +* +* Tests CTRMV, CTBMV, CTPMV, CTRSV, CTBSV and CTPSV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX ZERO, HALF, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), HALF = ( 0.5, 0.0 ), + $ ONE = ( 1.0, 0.0 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NIDIM, NINC, NKB, NMAX, NOUT, NTRA, + $ IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), + $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), + $ XT( NMAX ), XX( NMAX*INCMAX ), Z( NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + COMPLEX TRANSL + REAL ERR, ERRMAX + INTEGER I, ICD, ICT, ICU, IK, IN, INCX, INCXS, IX, K, + $ KS, LAA, LDA, LDAS, LX, N, NARGS, NC, NK, NS + LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME + CHARACTER*1 DIAG, DIAGS, TRANS, TRANSS, UPLO, UPLOS + CHARACTER*14 CUPLO,CTRANS,CDIAG + CHARACTER*2 ICHD, ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CMAKE, CMVCH, CCTBMV, CCTBSV, CCTPMV, + $ CCTPSV, CCTRMV, CCTRSV +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/ +* .. 
Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'r' + BANDED = SNAME( 9: 9 ).EQ.'b' + PACKED = SNAME( 9: 9 ).EQ.'p' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 8 + ELSE IF( BANDED )THEN + NARGS = 9 + ELSE IF( PACKED )THEN + NARGS = 7 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* Set up zero vector for CMVCH. + DO 10 I = 1, NMAX + Z( I ) = ZERO + 10 CONTINUE +* + DO 110 IN = 1, NIDIM + N = IDIM( IN ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IK = 1, NK + IF( BANDED )THEN + K = KB( IK ) + ELSE + K = N - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = K + 1 + ELSE + LDA = N + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF + NULL = N.LE.0 +* + DO 90 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + IF (UPLO.EQ.'U')THEN + CUPLO = ' CblasUpper' + ELSE + CUPLO = ' CblasLower' + END IF +* + DO 80 ICT = 1, 3 + TRANS = ICHT( ICT: ICT ) + IF (TRANS.EQ.'N')THEN + CTRANS = ' CblasNoTrans' + ELSE IF (TRANS.EQ.'T')THEN + CTRANS = ' CblasTrans' + ELSE + CTRANS = 'CblasConjTrans' + END IF +* + DO 70 ICD = 1, 2 + DIAG = ICHD( ICD: ICD ) + IF (DIAG.EQ.'N')THEN + CDIAG = ' CblasNonUnit' + ELSE + CDIAG = ' CblasUnit' + END IF +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL CMAKE( SNAME( 8: 9 ), UPLO, DIAG, N, N, A, + $ NMAX, AA, LDA, K, K, RESET, TRANSL ) +* + DO 60 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL CMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, + $ ABS( INCX ), 0, N - 1, RESET, + $ TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + DIAGS = DIAG + NS = N + KS = K + DO 20 I = 1, LAA + AS( I ) = AA( I ) + 20 CONTINUE + LDAS = LDA + DO 30 I = 1, LX + XS( I ) = XX( I ) + 30 CONTINUE + INCXS = INCX +* +* Call the subroutine. +* + IF( SNAME( 10: 11 ).EQ.'mv' )THEN + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CCTRMV( IORDER, UPLO, TRANS, DIAG, + $ N, AA, LDA, XX, INCX ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, K, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CCTBMV( IORDER, UPLO, TRANS, DIAG, + $ N, K, AA, LDA, XX, INCX ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, INCX + IF( REWI ) + $ REWIND NTRA + CALL CCTPMV( IORDER, UPLO, TRANS, DIAG, + $ N, AA, XX, INCX ) + END IF + ELSE IF( SNAME( 10: 11 ).EQ.'sv' )THEN + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CCTRSV( IORDER, UPLO, TRANS, DIAG, + $ N, AA, LDA, XX, INCX ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, K, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CCTBSV( IORDER, UPLO, TRANS, DIAG, + $ N, K, AA, LDA, XX, INCX ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, INCX + IF( REWI ) + $ REWIND NTRA + CALL CCTPSV( IORDER, UPLO, TRANS, DIAG, + $ N, AA, XX, INCX ) + END IF + END IF +* +* Check if error-exit was taken incorrectly. 
+* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = TRANS.EQ.TRANSS + ISAME( 3 ) = DIAG.EQ.DIAGS + ISAME( 4 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 5 ) = LCE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 7 ) = LCE( XS, XX, LX ) + ELSE + ISAME( 7 ) = LCERES( 'ge', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 8 ) = INCXS.EQ.INCX + ELSE IF( BANDED )THEN + ISAME( 5 ) = KS.EQ.K + ISAME( 6 ) = LCE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 8 ) = LCE( XS, XX, LX ) + ELSE + ISAME( 8 ) = LCERES( 'ge', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 9 ) = INCXS.EQ.INCX + ELSE IF( PACKED )THEN + ISAME( 5 ) = LCE( AS, AA, LAA ) + IF( NULL )THEN + ISAME( 6 ) = LCE( XS, XX, LX ) + ELSE + ISAME( 6 ) = LCERES( 'ge', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 7 ) = INCXS.EQ.INCX + END IF +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN + IF( SNAME( 10: 11 ).EQ.'mv' )THEN +* +* Check the result. +* + CALL CMVCH( TRANS, N, N, ONE, A, NMAX, X, + $ INCX, ZERO, Z, INCX, XT, G, + $ XX, EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + ELSE IF( SNAME( 10: 11 ).EQ.'sv' )THEN +* +* Compute approximation to original vector. +* + DO 50 I = 1, N + Z( I ) = XX( 1 + ( I - 1 )* + $ ABS( INCX ) ) + XX( 1 + ( I - 1 )*ABS( INCX ) ) + $ = X( I ) + 50 CONTINUE + CALL CMVCH( TRANS, N, N, ONE, A, NMAX, Z, + $ INCX, ZERO, X, INCX, XT, G, + $ XX, EPS, ERR, FATAL, NOUT, + $ .FALSE. ) + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 120 + ELSE +* Avoid repeating tests with N.le.0. + GO TO 110 + END IF +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, CTRANS, CDIAG, N, + $ LDA, INCX + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, CTRANS, CDIAG, N, K, + $ LDA, INCX + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, CUPLO, CTRANS, CDIAG, N, + $ INCX + END IF +* + 130 CONTINUE + RETURN +* + 9999 FORMAT(' ',A12, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT(' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT(1X, I6, ': ',A12, '(', 3( A14, ',' ),/ 10x, I3, ', AP, ', + $ 'X,', I2, ') .' ) + 9994 FORMAT(1X, I6, ': ',A12, '(', 3( A14, ',' ),/ 10x, 2( I3, ',' ), + $ ' A,', I3, ', X,', I2, ') .' ) + 9993 FORMAT( 1X, I6, ': ',A12, '(', 3( A14, ',' ),/ 10x, I3, ', A,', + $ I3, ', X,', I2, ') .' ) + 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK3. 
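Each of the CCHKn drivers above follows the same per-call pattern: save every argument (the AS, XS, YS copies and the scalar shadows), call the routine through its CBLAS wrapper, verify with LCE/LCERES that nothing it should not touch has changed, then recompute the result in straightforward arithmetic with CMVCH and keep the worst error ratio for comparison against THRESH. A reduced sketch of that skeleton in C, assuming a routine whose only output is y; run_one_case and the stubbed reference check are illustrative names, not part of the imported tester:

    #include <stdlib.h>
    #include <string.h>

    /* One test case, stripped to its skeleton: snapshot the read-only
     * arguments, call the routine under test, confirm the snapshots still
     * match, then judge the output against a reference computation.
     * Returns 1 if the case passed, 0 if not, -1 on allocation failure. */
    static int run_one_case(float *a, int la, float *x, int lx,
                            float *y, int ly, double thresh)
    {
        float *as = malloc((size_t)la * sizeof *as);   /* AS snapshot */
        float *xs = malloc((size_t)lx * sizeof *xs);   /* XS snapshot */
        if (as == NULL || xs == NULL) { free(as); free(xs); return -1; }
        memcpy(as, a, (size_t)la * sizeof *as);
        memcpy(xs, x, (size_t)lx * sizeof *xs);

        /* ... call the CBLAS routine under test here (e.g. the cgemv
         *     wrapper), writing its result into y ... */

        /* LCE-style check: read-only inputs must be bit-identical afterwards
         * (the Fortran additionally uses LCERES on the parts of the output
         * array that should not have been written). */
        int intact = memcmp(as, a, (size_t)la * sizeof *as) == 0 &&
                     memcmp(xs, x, (size_t)lx * sizeof *xs) == 0;

        /* CMVCH-style check: recompute y in plain arithmetic and form the
         * gauge-normalised error ratio; stubbed out in this sketch. */
        double err = 0.0;
        (void)y; (void)ly;

        free(as); free(xs);
        return intact && err < thresh;
    }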
+* + END + SUBROUTINE CCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z, IORDER ) +* +* Tests CGERC and CGERU. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX ZERO, HALF, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), HALF = ( 0.5, 0.0 ), + $ ONE = ( 1.0, 0.0 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA, + $ IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + COMPLEX ALPHA, ALS, TRANSL + REAL ERR, ERRMAX + INTEGER I, IA, IM, IN, INCX, INCXS, INCY, INCYS, IX, + $ IY, J, LAA, LDA, LDAS, LX, LY, M, MS, N, NARGS, + $ NC, ND, NS + LOGICAL CONJ, NULL, RESET, SAME +* .. Local Arrays .. + COMPLEX W( 1 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CCGERC, CCGERU, CMAKE, CMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, CONJG, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Executable Statements .. + CONJ = SNAME( 11: 11 ).EQ.'c' +* Define the number of arguments. + NARGS = 9 +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 120 IN = 1, NIDIM + N = IDIM( IN ) + ND = N/2 + 1 +* + DO 110 IM = 1, 2 + IF( IM.EQ.1 ) + $ M = MAX( N - ND, 0 ) + IF( IM.EQ.2 ) + $ M = MIN( N + ND, NMAX ) +* +* Set LDA to 1 more than minimum value if room. + LDA = M + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 110 + LAA = LDA*N + NULL = N.LE.0.OR.M.LE.0 +* + DO 100 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*M +* +* Generate the vector X. +* + TRANSL = HALF + CALL CMAKE( 'ge', ' ', ' ', 1, M, X, 1, XX, ABS( INCX ), + $ 0, M - 1, RESET, TRANSL ) + IF( M.GT.1 )THEN + X( M/2 ) = ZERO + XX( 1 + ABS( INCX )*( M/2 - 1 ) ) = ZERO + END IF +* + DO 90 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL CMAKE( 'ge', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + Y( N/2 ) = ZERO + YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 80 IA = 1, NALF + ALPHA = ALF( IA ) +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL CMAKE(SNAME( 8: 9 ), ' ', ' ', M, N, A, NMAX, + $ AA, LDA, M - 1, N - 1, RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + MS = M + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. 
+* + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, M, N, + $ ALPHA, INCX, INCY, LDA + IF( CONJ )THEN + IF( REWI ) + $ REWIND NTRA + CALL CCGERC( IORDER, M, N, ALPHA, XX, INCX, + $ YY, INCY, AA, LDA ) + ELSE + IF( REWI ) + $ REWIND NTRA + CALL CCGERU( IORDER, M, N, ALPHA, XX, INCX, + $ YY, INCY, AA, LDA ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 140 + END IF +* +* See what data changed inside subroutine. +* + ISAME( 1 ) = MS.EQ.M + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LCE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + ISAME( 6 ) = LCE( YS, YY, LY ) + ISAME( 7 ) = INCYS.EQ.INCY + IF( NULL )THEN + ISAME( 8 ) = LCE( AS, AA, LAA ) + ELSE + ISAME( 8 ) = LCERES( 'ge', ' ', M, N, AS, AA, + $ LDA ) + END IF + ISAME( 9 ) = LDAS.EQ.LDA +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 140 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( INCX.GT.0 )THEN + DO 50 I = 1, M + Z( I ) = X( I ) + 50 CONTINUE + ELSE + DO 60 I = 1, M + Z( I ) = X( M - I + 1 ) + 60 CONTINUE + END IF + DO 70 J = 1, N + IF( INCY.GT.0 )THEN + W( 1 ) = Y( J ) + ELSE + W( 1 ) = Y( N - J + 1 ) + END IF + IF( CONJ ) + $ W( 1 ) = CONJG( W( 1 ) ) + CALL CMVCH( 'N', M, 1, ALPHA, Z, NMAX, W, 1, + $ ONE, A( 1, J ), 1, YT, G, + $ AA( 1 + ( J - 1 )*LDA ), EPS, + $ ERR, FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 130 + 70 CONTINUE + ELSE +* Avoid repeating tests with M.le.0 or N.le.0. + GO TO 110 + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 150 +* + 130 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 140 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9994 )NC, SNAME, M, N, ALPHA, INCX, INCY, LDA +* + 150 CONTINUE + RETURN +* + 9999 FORMAT(' ',A12, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT(' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT(1X, I6, ': ',A12, '(', 2( I3, ',' ), '(', F4.1, ',', F4.1, + $ '), X,', I2, ', Y,', I2, ', A,', I3, ') .' ) + 9993 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK4. +* + END + SUBROUTINE CCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z, IORDER ) +* +* Tests CHER and CHPR. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX ZERO, HALF, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), HALF = ( 0.5, 0.0 ), + $ ONE = ( 1.0, 0.0 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) +* .. Scalar Arguments .. 
+ REAL EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA, + $ IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + COMPLEX ALPHA, TRANSL + REAL ERR, ERRMAX, RALPHA, RALS + INTEGER I, IA, IC, IN, INCX, INCXS, IX, J, JA, JJ, LAA, + $ LDA, LDAS, LJ, LX, N, NARGS, NC, NS + LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER + CHARACTER*1 UPLO, UPLOS + CHARACTER*14 CUPLO + CHARACTER*2 ICH +* .. Local Arrays .. + COMPLEX W( 1 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CCHER, CCHPR, CMAKE, CMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, CMPLX, CONJG, MAX, REAL +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'e' + PACKED = SNAME( 9: 9 ).EQ.'p' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 7 + ELSE IF( PACKED )THEN + NARGS = 6 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDA to 1 more than minimum value if room. + LDA = N + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF +* + DO 90 IC = 1, 2 + UPLO = ICH( IC: IC ) + IF (UPLO.EQ.'U')THEN + CUPLO = ' CblasUpper' + ELSE + CUPLO = ' CblasLower' + END IF + UPPER = UPLO.EQ.'U' +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL CMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), + $ 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 70 IA = 1, NALF + RALPHA = REAL( ALF( IA ) ) + ALPHA = CMPLX( RALPHA, RZERO ) + NULL = N.LE.0.OR.RALPHA.EQ.RZERO +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL CMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, A, NMAX, + $ AA, LDA, N - 1, N - 1, RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + NS = N + RALS = RALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, CUPLO, N, + $ RALPHA, INCX, LDA + IF( REWI ) + $ REWIND NTRA + CALL CCHER( IORDER, UPLO, N, RALPHA, XX, + $ INCX, AA, LDA ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, CUPLO, N, + $ RALPHA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CCHPR( IORDER, UPLO, N, RALPHA, + $ XX, INCX, AA ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. 
+* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = RALS.EQ.RALPHA + ISAME( 4 ) = LCE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + IF( NULL )THEN + ISAME( 6 ) = LCE( AS, AA, LAA ) + ELSE + ISAME( 6 ) = LCERES( SNAME( 8: 9 ), UPLO, N, N, AS, + $ AA, LDA ) + END IF + IF( .NOT.PACKED )THEN + ISAME( 7 ) = LDAS.EQ.LDA + END IF +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 30 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 30 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( INCX.GT.0 )THEN + DO 40 I = 1, N + Z( I ) = X( I ) + 40 CONTINUE + ELSE + DO 50 I = 1, N + Z( I ) = X( N - I + 1 ) + 50 CONTINUE + END IF + JA = 1 + DO 60 J = 1, N + W( 1 ) = CONJG( Z( J ) ) + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + CALL CMVCH( 'N', LJ, 1, ALPHA, Z( JJ ), LJ, W, + $ 1, ONE, A( JJ, J ), 1, YT, G, + $ AA( JA ), EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + IF( FULL )THEN + IF( UPPER )THEN + JA = JA + LDA + ELSE + JA = JA + LDA + 1 + END IF + ELSE + JA = JA + LJ + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 110 + 60 CONTINUE + ELSE +* Avoid repeating tests if N.le.0. + IF( N.LE.0 ) + $ GO TO 100 + END IF +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 110 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, N, RALPHA, INCX, LDA + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, N, RALPHA, INCX + END IF +* + 130 CONTINUE + RETURN +* + 9999 FORMAT(' ',A12, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT(' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT(1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', X,', + $ I2, ', AP) .' ) + 9993 FORMAT(1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', X,', + $ I2, ', A,', I3, ') .' ) + 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK5. +* + END + SUBROUTINE CCHK6( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z, IORDER ) +* +* Tests CHER2 and CHPR2. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX ZERO, HALF, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), HALF = ( 0.5, 0.0 ), + $ ONE = ( 1.0, 0.0 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA, + $ IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. 
+ COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX, 2 ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + COMPLEX ALPHA, ALS, TRANSL + REAL ERR, ERRMAX + INTEGER I, IA, IC, IN, INCX, INCXS, INCY, INCYS, IX, + $ IY, J, JA, JJ, LAA, LDA, LDAS, LJ, LX, LY, N, + $ NARGS, NC, NS + LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER + CHARACTER*1 UPLO, UPLOS + CHARACTER*14 CUPLO + CHARACTER*2 ICH +* .. Local Arrays .. + COMPLEX W( 2 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CCHER2, CCHPR2, CMAKE, CMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, CONJG, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'e' + PACKED = SNAME( 9: 9 ).EQ.'p' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 9 + ELSE IF( PACKED )THEN + NARGS = 8 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 140 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDA to 1 more than minimum value if room. + LDA = N + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 140 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF +* + DO 130 IC = 1, 2 + UPLO = ICH( IC: IC ) + IF (UPLO.EQ.'U')THEN + CUPLO = ' CblasUpper' + ELSE + CUPLO = ' CblasLower' + END IF + UPPER = UPLO.EQ.'U' +* + DO 120 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL CMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), + $ 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 110 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL CMAKE( 'ge', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + Y( N/2 ) = ZERO + YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 100 IA = 1, NALF + ALPHA = ALF( IA ) + NULL = N.LE.0.OR.ALPHA.EQ.ZERO +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL CMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, A, + $ NMAX, AA, LDA, N - 1, N - 1, RESET, + $ TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, CUPLO, N, + $ ALPHA, INCX, INCY, LDA + IF( REWI ) + $ REWIND NTRA + CALL CCHER2( IORDER, UPLO, N, ALPHA, XX, INCX, + $ YY, INCY, AA, LDA ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, CUPLO, N, + $ ALPHA, INCX, INCY + IF( REWI ) + $ REWIND NTRA + CALL CCHPR2( IORDER, UPLO, N, ALPHA, XX, INCX, + $ YY, INCY, AA ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 160 + END IF +* +* See what data changed inside subroutines. 
+* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LCE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + ISAME( 6 ) = LCE( YS, YY, LY ) + ISAME( 7 ) = INCYS.EQ.INCY + IF( NULL )THEN + ISAME( 8 ) = LCE( AS, AA, LAA ) + ELSE + ISAME( 8 ) = LCERES( SNAME( 8: 9 ), UPLO, N, N, + $ AS, AA, LDA ) + END IF + IF( .NOT.PACKED )THEN + ISAME( 9 ) = LDAS.EQ.LDA + END IF +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 160 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( INCX.GT.0 )THEN + DO 50 I = 1, N + Z( I, 1 ) = X( I ) + 50 CONTINUE + ELSE + DO 60 I = 1, N + Z( I, 1 ) = X( N - I + 1 ) + 60 CONTINUE + END IF + IF( INCY.GT.0 )THEN + DO 70 I = 1, N + Z( I, 2 ) = Y( I ) + 70 CONTINUE + ELSE + DO 80 I = 1, N + Z( I, 2 ) = Y( N - I + 1 ) + 80 CONTINUE + END IF + JA = 1 + DO 90 J = 1, N + W( 1 ) = ALPHA*CONJG( Z( J, 2 ) ) + W( 2 ) = CONJG( ALPHA )*CONJG( Z( J, 1 ) ) + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + CALL CMVCH( 'N', LJ, 2, ONE, Z( JJ, 1 ), + $ NMAX, W, 1, ONE, A( JJ, J ), 1, + $ YT, G, AA( JA ), EPS, ERR, FATAL, + $ NOUT, .TRUE. ) + IF( FULL )THEN + IF( UPPER )THEN + JA = JA + LDA + ELSE + JA = JA + LDA + 1 + END IF + ELSE + JA = JA + LJ + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 150 + 90 CONTINUE + ELSE +* Avoid repeating tests with N.le.0. + IF( N.LE.0 ) + $ GO TO 140 + END IF +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* + 140 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 170 +* + 150 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 160 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, N, ALPHA, INCX, + $ INCY, LDA + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, N, ALPHA, INCX, INCY + END IF +* + 170 CONTINUE + RETURN +* + 9999 FORMAT(' ',A12, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT(' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT(1X, I6, ': ',A12, '(', A14, ',', I3, ',(', F4.1, ',', + $ F4.1, '), X,', I2, ', Y,', I2, ', AP) .' ) + 9993 FORMAT(1X, I6, ': ',A12, '(', A14, ',', I3, ',(', F4.1, ',', + $ F4.1, '), X,', I2, ', Y,', I2, ', A,', I3, ') .' ) + 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK6. +* + END + SUBROUTINE CMVCH( TRANS, M, N, ALPHA, A, NMAX, X, INCX, BETA, Y, + $ INCY, YT, G, YY, EPS, ERR, FATAL, NOUT, MV ) +* +* Checks the results of the computational tests. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0, 0.0 ) ) + REAL RZERO, RONE + PARAMETER ( RZERO = 0.0, RONE = 1.0 ) +* .. 
Scalar Arguments .. + COMPLEX ALPHA, BETA + REAL EPS, ERR + INTEGER INCX, INCY, M, N, NMAX, NOUT + LOGICAL FATAL, MV + CHARACTER*1 TRANS +* .. Array Arguments .. + COMPLEX A( NMAX, * ), X( * ), Y( * ), YT( * ), YY( * ) + REAL G( * ) +* .. Local Scalars .. + COMPLEX C + REAL ERRI + INTEGER I, INCXL, INCYL, IY, J, JX, KX, KY, ML, NL + LOGICAL CTRAN, TRAN +* .. Intrinsic Functions .. + INTRINSIC ABS, AIMAG, CONJG, MAX, REAL, SQRT +* .. Statement Functions .. + REAL ABS1 +* .. Statement Function definitions .. + ABS1( C ) = ABS( REAL( C ) ) + ABS( AIMAG( C ) ) +* .. Executable Statements .. + TRAN = TRANS.EQ.'T' + CTRAN = TRANS.EQ.'C' + IF( TRAN.OR.CTRAN )THEN + ML = N + NL = M + ELSE + ML = M + NL = N + END IF + IF( INCX.LT.0 )THEN + KX = NL + INCXL = -1 + ELSE + KX = 1 + INCXL = 1 + END IF + IF( INCY.LT.0 )THEN + KY = ML + INCYL = -1 + ELSE + KY = 1 + INCYL = 1 + END IF +* +* Compute expected result in YT using data in A, X and Y. +* Compute gauges in G. +* + IY = KY + DO 40 I = 1, ML + YT( IY ) = ZERO + G( IY ) = RZERO + JX = KX + IF( TRAN )THEN + DO 10 J = 1, NL + YT( IY ) = YT( IY ) + A( J, I )*X( JX ) + G( IY ) = G( IY ) + ABS1( A( J, I ) )*ABS1( X( JX ) ) + JX = JX + INCXL + 10 CONTINUE + ELSE IF( CTRAN )THEN + DO 20 J = 1, NL + YT( IY ) = YT( IY ) + CONJG( A( J, I ) )*X( JX ) + G( IY ) = G( IY ) + ABS1( A( J, I ) )*ABS1( X( JX ) ) + JX = JX + INCXL + 20 CONTINUE + ELSE + DO 30 J = 1, NL + YT( IY ) = YT( IY ) + A( I, J )*X( JX ) + G( IY ) = G( IY ) + ABS1( A( I, J ) )*ABS1( X( JX ) ) + JX = JX + INCXL + 30 CONTINUE + END IF + YT( IY ) = ALPHA*YT( IY ) + BETA*Y( IY ) + G( IY ) = ABS1( ALPHA )*G( IY ) + ABS1( BETA )*ABS1( Y( IY ) ) + IY = IY + INCYL + 40 CONTINUE +* +* Compute the error ratio for this result. +* + ERR = ZERO + DO 50 I = 1, ML + ERRI = ABS( YT( I ) - YY( 1 + ( I - 1 )*ABS( INCY ) ) )/EPS + IF( G( I ).NE.RZERO ) + $ ERRI = ERRI/G( I ) + ERR = MAX( ERR, ERRI ) + IF( ERR*SQRT( EPS ).GE.RONE ) + $ GO TO 60 + 50 CONTINUE +* If the loop completes, all results are at least half accurate. + GO TO 80 +* +* Report fatal error. +* + 60 FATAL = .TRUE. + WRITE( NOUT, FMT = 9999 ) + DO 70 I = 1, ML + IF( MV )THEN + WRITE( NOUT, FMT = 9998 )I, YT( I ), + $ YY( 1 + ( I - 1 )*ABS( INCY ) ) + ELSE + WRITE( NOUT, FMT = 9998 )I, + $ YY( 1 + ( I - 1 )*ABS( INCY ) ), YT( I ) + END IF + 70 CONTINUE +* + 80 CONTINUE + RETURN +* + 9999 FORMAT(' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', + $ 'F ACCURATE *******', /' EXPECTED RE', + $ 'SULT COMPUTED RESULT' ) + 9998 FORMAT( 1X, I7, 2( ' (', G15.6, ',', G15.6, ')' ) ) +* +* End of CMVCH. +* + END + LOGICAL FUNCTION LCE( RI, RJ, LR ) +* +* Tests if two arrays are identical. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER LR +* .. Array Arguments .. + COMPLEX RI( * ), RJ( * ) +* .. Local Scalars .. + INTEGER I +* .. Executable Statements .. + DO 10 I = 1, LR + IF( RI( I ).NE.RJ( I ) ) + $ GO TO 20 + 10 CONTINUE + LCE = .TRUE. + GO TO 30 + 20 CONTINUE + LCE = .FALSE. + 30 RETURN +* +* End of LCE. +* + END + LOGICAL FUNCTION LCERES( TYPE, UPLO, M, N, AA, AS, LDA ) +* +* Tests if selected elements in two arrays are equal. +* +* TYPE is 'ge', 'he' or 'hp'. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. 
+ INTEGER LDA, M, N + CHARACTER*1 UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + COMPLEX AA( LDA, * ), AS( LDA, * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J + LOGICAL UPPER +* .. Executable Statements .. + UPPER = UPLO.EQ.'U' + IF( TYPE.EQ.'ge' )THEN + DO 20 J = 1, N + DO 10 I = M + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 10 CONTINUE + 20 CONTINUE + ELSE IF( TYPE.EQ.'he' )THEN + DO 50 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 30 I = 1, IBEG - 1 + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 30 CONTINUE + DO 40 I = IEND + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 40 CONTINUE + 50 CONTINUE + END IF +* + 60 CONTINUE + LCERES = .TRUE. + GO TO 80 + 70 CONTINUE + LCERES = .FALSE. + 80 RETURN +* +* End of LCERES. +* + END + COMPLEX FUNCTION CBEG( RESET ) +* +* Generates complex numbers as pairs of random numbers uniformly +* distributed between -0.5 and 0.5. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + LOGICAL RESET +* .. Local Scalars .. + INTEGER I, IC, J, MI, MJ +* .. Save statement .. + SAVE I, IC, J, MI, MJ +* .. Intrinsic Functions .. + INTRINSIC CMPLX +* .. Executable Statements .. + IF( RESET )THEN +* Initialize local variables. + MI = 891 + MJ = 457 + I = 7 + J = 7 + IC = 0 + RESET = .FALSE. + END IF +* +* The sequence of values of I or J is bounded between 1 and 999. +* If initial I or J = 1,2,3,6,7 or 9, the period will be 50. +* If initial I or J = 4 or 8, the period will be 25. +* If initial I or J = 5, the period will be 10. +* IC is used to break up the period by skipping 1 value of I or J +* in 6. +* + IC = IC + 1 + 10 I = I*MI + J = J*MJ + I = I - 1000*( I/1000 ) + J = J - 1000*( J/1000 ) + IF( IC.GE.5 )THEN + IC = 0 + GO TO 10 + END IF + CBEG = CMPLX( ( I - 500 )/1001.0, ( J - 500 )/1001.0 ) + RETURN +* +* End of CBEG. +* + END + REAL FUNCTION SDIFF( X, Y ) +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* +* .. Scalar Arguments .. + REAL X, Y +* .. Executable Statements .. + SDIFF = X - Y + RETURN +* +* End of SDIFF. +* + END + SUBROUTINE CMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, KL, + $ KU, RESET, TRANSL ) +* +* Generates values for an M by N matrix A within the bandwidth +* defined by KL and KU. +* Stores the values in the array AA in the data structure required +* by the routine, with unwanted elements set to rogue value. +* +* TYPE is 'ge', 'gb', 'he', 'hb', 'hp', 'tr', 'tb' OR 'tp'. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX ZERO, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) + COMPLEX ROGUE + PARAMETER ( ROGUE = ( -1.0E10, 1.0E10 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) + REAL RROGUE + PARAMETER ( RROGUE = -1.0E10 ) +* .. Scalar Arguments .. + COMPLEX TRANSL + INTEGER KL, KU, LDA, M, N, NMAX + LOGICAL RESET + CHARACTER*1 DIAG, UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + COMPLEX A( NMAX, * ), AA( * ) +* .. Local Scalars .. + INTEGER I, I1, I2, I3, IBEG, IEND, IOFF, J, JJ, KK + LOGICAL GEN, LOWER, SYM, TRI, UNIT, UPPER +* .. External Functions .. + COMPLEX CBEG + EXTERNAL CBEG +* .. Intrinsic Functions .. 
+ INTRINSIC CMPLX, CONJG, MAX, MIN, REAL +* .. Executable Statements .. + GEN = TYPE( 1: 1 ).EQ.'g' + SYM = TYPE( 1: 1 ).EQ.'h' + TRI = TYPE( 1: 1 ).EQ.'t' + UPPER = ( SYM.OR.TRI ).AND.UPLO.EQ.'U' + LOWER = ( SYM.OR.TRI ).AND.UPLO.EQ.'L' + UNIT = TRI.AND.DIAG.EQ.'U' +* +* Generate data in array A. +* + DO 20 J = 1, N + DO 10 I = 1, M + IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) + $ THEN + IF( ( I.LE.J.AND.J - I.LE.KU ).OR. + $ ( I.GE.J.AND.I - J.LE.KL ) )THEN + A( I, J ) = CBEG( RESET ) + TRANSL + ELSE + A( I, J ) = ZERO + END IF + IF( I.NE.J )THEN + IF( SYM )THEN + A( J, I ) = CONJG( A( I, J ) ) + ELSE IF( TRI )THEN + A( J, I ) = ZERO + END IF + END IF + END IF + 10 CONTINUE + IF( SYM ) + $ A( J, J ) = CMPLX( REAL( A( J, J ) ), RZERO ) + IF( TRI ) + $ A( J, J ) = A( J, J ) + ONE + IF( UNIT ) + $ A( J, J ) = ONE + 20 CONTINUE +* +* Store elements in array AS in data structure required by routine. +* + IF( TYPE.EQ.'ge' )THEN + DO 50 J = 1, N + DO 30 I = 1, M + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 30 CONTINUE + DO 40 I = M + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 40 CONTINUE + 50 CONTINUE + ELSE IF( TYPE.EQ.'gb' )THEN + DO 90 J = 1, N + DO 60 I1 = 1, KU + 1 - J + AA( I1 + ( J - 1 )*LDA ) = ROGUE + 60 CONTINUE + DO 70 I2 = I1, MIN( KL + KU + 1, KU + 1 + M - J ) + AA( I2 + ( J - 1 )*LDA ) = A( I2 + J - KU - 1, J ) + 70 CONTINUE + DO 80 I3 = I2, LDA + AA( I3 + ( J - 1 )*LDA ) = ROGUE + 80 CONTINUE + 90 CONTINUE + ELSE IF( TYPE.EQ.'he'.OR.TYPE.EQ.'tr' )THEN + DO 130 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IF( UNIT )THEN + IEND = J - 1 + ELSE + IEND = J + END IF + ELSE + IF( UNIT )THEN + IBEG = J + 1 + ELSE + IBEG = J + END IF + IEND = N + END IF + DO 100 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 100 CONTINUE + DO 110 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 110 CONTINUE + DO 120 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 120 CONTINUE + IF( SYM )THEN + JJ = J + ( J - 1 )*LDA + AA( JJ ) = CMPLX( REAL( AA( JJ ) ), RROGUE ) + END IF + 130 CONTINUE + ELSE IF( TYPE.EQ.'hb'.OR.TYPE.EQ.'tb' )THEN + DO 170 J = 1, N + IF( UPPER )THEN + KK = KL + 1 + IBEG = MAX( 1, KL + 2 - J ) + IF( UNIT )THEN + IEND = KL + ELSE + IEND = KL + 1 + END IF + ELSE + KK = 1 + IF( UNIT )THEN + IBEG = 2 + ELSE + IBEG = 1 + END IF + IEND = MIN( KL + 1, 1 + M - J ) + END IF + DO 140 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 140 CONTINUE + DO 150 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I + J - KK, J ) + 150 CONTINUE + DO 160 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 160 CONTINUE + IF( SYM )THEN + JJ = KK + ( J - 1 )*LDA + AA( JJ ) = CMPLX( REAL( AA( JJ ) ), RROGUE ) + END IF + 170 CONTINUE + ELSE IF( TYPE.EQ.'hp'.OR.TYPE.EQ.'tp' )THEN + IOFF = 0 + DO 190 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 180 I = IBEG, IEND + IOFF = IOFF + 1 + AA( IOFF ) = A( I, J ) + IF( I.EQ.J )THEN + IF( UNIT ) + $ AA( IOFF ) = ROGUE + IF( SYM ) + $ AA( IOFF ) = CMPLX( REAL( AA( IOFF ) ), RROGUE ) + END IF + 180 CONTINUE + 190 CONTINUE + END IF + RETURN +* +* End of CMAKE. +* + END diff --git a/ctest/c_cblat3.f b/ctest/c_cblat3.f new file mode 100644 index 0000000000..b03d47916c --- /dev/null +++ b/ctest/c_cblat3.f @@ -0,0 +1,2786 @@ + PROGRAM CBLAT3 +* +* Test program for the COMPLEX Level 3 Blas. +* +* The program must be driven by a short data file. The first 13 records +* of the file are read using list-directed input, the last 9 records +* are read using the format ( A12, L2 ). 
An annotated example of a data +* file can be obtained by deleting the first 3 characters from the +* following 22 lines: +* 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +* F LOGICAL FLAG, T TO STOP ON FAILURES. +* T LOGICAL FLAG, T TO TEST ERROR EXITS. +* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH +* 16.0 THRESHOLD VALUE OF TEST RATIO +* 6 NUMBER OF VALUES OF N +* 0 1 2 3 5 9 VALUES OF N +* 3 NUMBER OF VALUES OF ALPHA +* (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +* 3 NUMBER OF VALUES OF BETA +* (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +* cblas_cgemm T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_chemm T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_csymm T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ctrmm T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ctrsm T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_cherk T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_csyrk T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_cher2k T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_csyr2k T PUT F FOR NO TEST. SAME COLUMNS. +* +* See: +* +* Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. +* A Set of Level 3 Basic Linear Algebra Subprograms. +* +* Technical Memorandum No.88 (Revision 1), Mathematics and +* Computer Science Division, Argonne National Laboratory, 9700 +* South Cass Avenue, Argonne, Illinois 60439, US. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + INTEGER NIN, NOUT + PARAMETER ( NIN = 5, NOUT = 6 ) + INTEGER NSUBS + PARAMETER ( NSUBS = 9 ) + COMPLEX ZERO, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) + REAL RZERO, RHALF, RONE + PARAMETER ( RZERO = 0.0, RHALF = 0.5, RONE = 1.0 ) + INTEGER NMAX + PARAMETER ( NMAX = 65 ) + INTEGER NIDMAX, NALMAX, NBEMAX + PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) +* .. Local Scalars .. + REAL EPS, ERR, THRESH + INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NTRA, + $ LAYOUT + LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, + $ TSTERR, CORDER, RORDER + CHARACTER*1 TRANSA, TRANSB + CHARACTER*12 SNAMET + CHARACTER*32 SNAPS +* .. Local Arrays .. + COMPLEX AA( NMAX*NMAX ), AB( NMAX, 2*NMAX ), + $ ALF( NALMAX ), AS( NMAX*NMAX ), + $ BB( NMAX*NMAX ), BET( NBEMAX ), + $ BS( NMAX*NMAX ), C( NMAX, NMAX ), + $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), + $ W( 2*NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDMAX ) + LOGICAL LTEST( NSUBS ) + CHARACTER*12 SNAMES( NSUBS ) +* .. External Functions .. + REAL SDIFF + LOGICAL LCE + EXTERNAL SDIFF, LCE +* .. External Subroutines .. + EXTERNAL CCHK1, CCHK2, CCHK3, CCHK4, CCHK5, CMMCH +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK + CHARACTER*12 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR + COMMON /SRNAMC/SRNAMT +* .. Data statements .. + DATA SNAMES/'cblas_cgemm ', 'cblas_chemm ', + $ 'cblas_csymm ', 'cblas_ctrmm ', 'cblas_ctrsm ', + $ 'cblas_cherk ', 'cblas_csyrk ', 'cblas_cher2k', + $ 'cblas_csyr2k'/ +* .. Executable Statements .. +* + NOUTC = NOUT +* +* Read name and unit number for snapshot output file and open file. 
+* + READ( NIN, FMT = * )SNAPS + READ( NIN, FMT = * )NTRA + TRACE = NTRA.GE.0 + IF( TRACE )THEN + OPEN( NTRA, FILE = SNAPS ) + END IF +* Read the flag that directs rewinding of the snapshot file. + READ( NIN, FMT = * )REWI + REWI = REWI.AND.TRACE +* Read the flag that directs stopping on any failure. + READ( NIN, FMT = * )SFATAL +* Read the flag that indicates whether error exits are to be tested. + READ( NIN, FMT = * )TSTERR +* Read the flag that indicates whether row-major data layout to be tested. + READ( NIN, FMT = * )LAYOUT +* Read the threshold value of the test ratio + READ( NIN, FMT = * )THRESH +* +* Read and check the parameter values for the tests. +* +* Values of N + READ( NIN, FMT = * )NIDIM + IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN + WRITE( NOUT, FMT = 9997 )'N', NIDMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) + DO 10 I = 1, NIDIM + IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN + WRITE( NOUT, FMT = 9996 )NMAX + GO TO 220 + END IF + 10 CONTINUE +* Values of ALPHA + READ( NIN, FMT = * )NALF + IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN + WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) +* Values of BETA + READ( NIN, FMT = * )NBET + IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN + WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) +* +* Report values of parameters. +* + WRITE( NOUT, FMT = 9995 ) + WRITE( NOUT, FMT = 9994 )( IDIM( I ), I = 1, NIDIM ) + WRITE( NOUT, FMT = 9993 )( ALF( I ), I = 1, NALF ) + WRITE( NOUT, FMT = 9992 )( BET( I ), I = 1, NBET ) + IF( .NOT.TSTERR )THEN + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9984 ) + END IF + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9999 )THRESH + WRITE( NOUT, FMT = * ) + + RORDER = .FALSE. + CORDER = .FALSE. + IF (LAYOUT.EQ.2) THEN + RORDER = .TRUE. + CORDER = .TRUE. + WRITE( *, FMT = 10002 ) + ELSE IF (LAYOUT.EQ.1) THEN + RORDER = .TRUE. + WRITE( *, FMT = 10001 ) + ELSE IF (LAYOUT.EQ.0) THEN + CORDER = .TRUE. + WRITE( *, FMT = 10000 ) + END IF + WRITE( *, FMT = * ) + +* +* Read names of subroutines and flags which indicate +* whether they are to be tested. +* + DO 20 I = 1, NSUBS + LTEST( I ) = .FALSE. + 20 CONTINUE + 30 READ( NIN, FMT = 9988, END = 60 )SNAMET, LTESTT + DO 40 I = 1, NSUBS + IF( SNAMET.EQ.SNAMES( I ) ) + $ GO TO 50 + 40 CONTINUE + WRITE( NOUT, FMT = 9990 )SNAMET + STOP + 50 LTEST( I ) = LTESTT + GO TO 30 +* + 60 CONTINUE + CLOSE ( NIN ) +* +* Compute EPS (the machine precision). +* + EPS = RONE + 70 CONTINUE + IF( SDIFF( RONE + EPS, RONE ).EQ.RZERO ) + $ GO TO 80 + EPS = RHALF*EPS + GO TO 70 + 80 CONTINUE + EPS = EPS + EPS + WRITE( NOUT, FMT = 9998 )EPS +* +* Check the reliability of CMMCH using exact data. +* + N = MIN( 32, NMAX ) + DO 100 J = 1, N + DO 90 I = 1, N + AB( I, J ) = MAX( I - J + 1, 0 ) + 90 CONTINUE + AB( J, NMAX + 1 ) = J + AB( 1, NMAX + J ) = J + C( J, 1 ) = ZERO + 100 CONTINUE + DO 110 J = 1, N + CC( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 + 110 CONTINUE +* CC holds the exact result. On exit from CMMCH CT holds +* the result computed by CMMCH. + TRANSA = 'N' + TRANSB = 'N' + CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. 
) + SAME = LCE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + TRANSB = 'C' + CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LCE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + DO 120 J = 1, N + AB( J, NMAX + 1 ) = N - J + 1 + AB( 1, NMAX + J ) = N - J + 1 + 120 CONTINUE + DO 130 J = 1, N + CC( N - J + 1 ) = J*( ( J + 1 )*J )/2 - + $ ( ( J + 1 )*J*( J - 1 ) )/3 + 130 CONTINUE + TRANSA = 'C' + TRANSB = 'N' + CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LCE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + TRANSB = 'C' + CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LCE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF +* +* Test each subroutine in turn. +* + DO 200 ISNUM = 1, NSUBS + WRITE( NOUT, FMT = * ) + IF( .NOT.LTEST( ISNUM ) )THEN +* Subprogram is not to be tested. + WRITE( NOUT, FMT = 9987 )SNAMES( ISNUM ) + ELSE + SRNAMT = SNAMES( ISNUM ) +* Test error exits. + IF( TSTERR )THEN + CALL CC3CHKE( SNAMES( ISNUM ) ) + WRITE( NOUT, FMT = * ) + END IF +* Test computations. + INFOT = 0 + OK = .TRUE. + FATAL = .FALSE. + GO TO ( 140, 150, 150, 160, 160, 170, 170, + $ 180, 180 )ISNUM +* Test CGEMM, 01. + 140 IF (CORDER) THEN + CALL CCHK1(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 0 ) + END IF + IF (RORDER) THEN + CALL CCHK1(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 1 ) + END IF + GO TO 190 +* Test CHEMM, 02, CSYMM, 03. + 150 IF (CORDER) THEN + CALL CCHK2(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 0 ) + END IF + IF (RORDER) THEN + CALL CCHK2(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 1 ) + END IF + GO TO 190 +* Test CTRMM, 04, CTRSM, 05. + 160 IF (CORDER) THEN + CALL CCHK3(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, + $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C, + $ 0 ) + END IF + IF (RORDER) THEN + CALL CCHK3(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, + $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C, + $ 1 ) + END IF + GO TO 190 +* Test CHERK, 06, CSYRK, 07. 
+ 170 IF (CORDER) THEN + CALL CCHK4(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 0 ) + END IF + IF (RORDER) THEN + CALL CCHK4(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 1 ) + END IF + GO TO 190 +* Test CHER2K, 08, CSYR2K, 09. + 180 IF (CORDER) THEN + CALL CCHK5(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, + $ 0 ) + END IF + IF (RORDER) THEN + CALL CCHK5(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, + $ 1 ) + END IF + GO TO 190 +* + 190 IF( FATAL.AND.SFATAL ) + $ GO TO 210 + END IF + 200 CONTINUE + WRITE( NOUT, FMT = 9986 ) + GO TO 230 +* + 210 CONTINUE + WRITE( NOUT, FMT = 9985 ) + GO TO 230 +* + 220 CONTINUE + WRITE( NOUT, FMT = 9991 ) +* + 230 CONTINUE + IF( TRACE ) + $ CLOSE ( NTRA ) + CLOSE ( NOUT ) + STOP +* +10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) +10001 FORMAT(' ROW-MAJOR DATA LAYOUT IS TESTED' ) +10000 FORMAT(' COLUMN-MAJOR DATA LAYOUT IS TESTED' ) + 9999 FORMAT(' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', + $ 'S THAN', F8.2 ) + 9998 FORMAT(' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, E9.1 ) + 9997 FORMAT(' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', + $ 'THAN ', I2 ) + 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) + 9995 FORMAT(' TESTS OF THE COMPLEX LEVEL 3 BLAS', //' THE F', + $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) + 9994 FORMAT( ' FOR N ', 9I6 ) + 9993 FORMAT( ' FOR ALPHA ', + $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) + 9992 FORMAT( ' FOR BETA ', + $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) + 9991 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', + $ /' ******* TESTS ABANDONED *******' ) + 9990 FORMAT(' SUBPROGRAM NAME ', A12,' NOT RECOGNIZED', /' ******* T', + $ 'ESTS ABANDONED *******' ) + 9989 FORMAT(' ERROR IN CMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', + $ 'ATED WRONGLY.', /' CMMCH WAS CALLED WITH TRANSA = ', A1, + $ 'AND TRANSB = ', A1, /' AND RETURNED SAME = ', L1, ' AND ', + $ ' ERR = ', F12.3, '.', /' THIS MAY BE DUE TO FAULTS IN THE ', + $ 'ARITHMETIC OR THE COMPILER.', /' ******* TESTS ABANDONED ', + $ '*******' ) + 9988 FORMAT( A12,L2 ) + 9987 FORMAT( 1X, A12,' WAS NOT TESTED' ) + 9986 FORMAT( /' END OF TESTS' ) + 9985 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) + 9984 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) +* +* End of CBLAT3. +* + END + SUBROUTINE CCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, + $ IORDER ) +* +* Tests CGEMM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0, 0.0 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. 
+ COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX ALPHA, ALS, BETA, BLS + REAL ERR, ERRMAX + INTEGER I, IA, IB, ICA, ICB, IK, IM, IN, K, KS, LAA, + $ LBB, LCC, LDA, LDAS, LDB, LDBS, LDC, LDCS, M, + $ MA, MB, MS, N, NA, NARGS, NB, NC, NS + LOGICAL NULL, RESET, SAME, TRANA, TRANB + CHARACTER*1 TRANAS, TRANBS, TRANSA, TRANSB + CHARACTER*3 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CCGEMM, CMAKE, CMMCH +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICH/'NTC'/ +* .. Executable Statements .. +* + NARGS = 13 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 110 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = M + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 100 + LCC = LDC*N + NULL = N.LE.0.OR.M.LE.0 +* + DO 90 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 80 ICA = 1, 3 + TRANSA = ICH( ICA: ICA ) + TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' +* + IF( TRANA )THEN + MA = K + NA = M + ELSE + MA = M + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* +* Generate the matrix A. +* + CALL CMAKE( 'ge', ' ', ' ', MA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 70 ICB = 1, 3 + TRANSB = ICH( ICB: ICB ) + TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' +* + IF( TRANB )THEN + MB = N + NB = K + ELSE + MB = K + NB = N + END IF +* Set LDB to 1 more than minimum value if room. + LDB = MB + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 70 + LBB = LDB*NB +* +* Generate the matrix B. +* + CALL CMAKE( 'ge', ' ', ' ', MB, NB, B, NMAX, BB, + $ LDB, RESET, ZERO ) +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL CMAKE( 'ge', ' ', ' ', M, N, C, NMAX, + $ CC, LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + TRANAS = TRANSA + TRANBS = TRANSB + MS = M + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BLS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ CALL CPRCN1(NTRA, NC, SNAME, IORDER, + $ TRANSA, TRANSB, M, N, K, ALPHA, LDA, + $ LDB, BETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CCGEMM( IORDER, TRANSA, TRANSB, M, N, + $ K, ALPHA, AA, LDA, BB, LDB, + $ BETA, CC, LDC ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. 
+* + ISAME( 1 ) = TRANSA.EQ.TRANAS + ISAME( 2 ) = TRANSB.EQ.TRANBS + ISAME( 3 ) = MS.EQ.M + ISAME( 4 ) = NS.EQ.N + ISAME( 5 ) = KS.EQ.K + ISAME( 6 ) = ALS.EQ.ALPHA + ISAME( 7 ) = LCE( AS, AA, LAA ) + ISAME( 8 ) = LDAS.EQ.LDA + ISAME( 9 ) = LCE( BS, BB, LBB ) + ISAME( 10 ) = LDBS.EQ.LDB + ISAME( 11 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 12 ) = LCE( CS, CC, LCC ) + ELSE + ISAME( 12 ) = LCERES( 'ge', ' ', M, N, CS, + $ CC, LDC ) + END IF + ISAME( 13 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report +* and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL CMMCH( TRANSA, TRANSB, M, N, K, + $ ALPHA, A, NMAX, B, NMAX, BETA, + $ C, NMAX, CT, G, CC, LDC, EPS, + $ ERR, FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 120 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + CALL CPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB, + $ M, N, K, ALPHA, LDA, LDB, BETA, LDC) +* + 130 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A12,'(''', A1, ''',''', A1, ''',', + $ 3( I3, ',' ), '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, + $ ',(', F4.1, ',', F4.1, '), C,', I3, ').' ) + 9994 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK1. 
+* + END +* + SUBROUTINE CPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB, M, N, + $ K, ALPHA, LDA, LDB, BETA, LDC) + INTEGER NOUT, NC, IORDER, M, N, K, LDA, LDB, LDC + COMPLEX ALPHA, BETA + CHARACTER*1 TRANSA, TRANSB + CHARACTER*12 SNAME + CHARACTER*14 CRC, CTA,CTB + + IF (TRANSA.EQ.'N')THEN + CTA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CTA = ' CblasTrans' + ELSE + CTA = 'CblasConjTrans' + END IF + IF (TRANSB.EQ.'N')THEN + CTB = ' CblasNoTrans' + ELSE IF (TRANSB.EQ.'T')THEN + CTB = ' CblasTrans' + ELSE + CTB = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CTA,CTB + WRITE(NOUT, FMT = 9994)M, N, K, ALPHA, LDA, LDB, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', A14, ',', A14, ',', A14, ',') + 9994 FORMAT( 10X, 3( I3, ',' ) ,' (', F4.1,',',F4.1,') , A,', + $ I3, ', B,', I3, ', (', F4.1,',',F4.1,') , C,', I3, ').' ) + END +* + SUBROUTINE CCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, + $ IORDER ) +* +* Tests CHEMM and CSYMM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0, 0.0 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX ALPHA, ALS, BETA, BLS + REAL ERR, ERRMAX + INTEGER I, IA, IB, ICS, ICU, IM, IN, LAA, LBB, LCC, + $ LDA, LDAS, LDB, LDBS, LDC, LDCS, M, MS, N, NA, + $ NARGS, NC, NS + LOGICAL CONJ, LEFT, NULL, RESET, SAME + CHARACTER*1 SIDE, SIDES, UPLO, UPLOS + CHARACTER*2 ICHS, ICHU +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CCHEMM, CMAKE, CMMCH, CCSYMM +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHS/'LR'/, ICHU/'UL'/ +* .. Executable Statements .. + CONJ = SNAME( 8: 9 ).EQ.'he' +* + NARGS = 12 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 100 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 90 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = M + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 90 + LCC = LDC*N + NULL = N.LE.0.OR.M.LE.0 +* Set LDB to 1 more than minimum value if room. + LDB = M + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 90 + LBB = LDB*N +* +* Generate the matrix B. 
+* + CALL CMAKE( 'ge', ' ', ' ', M, N, B, NMAX, BB, LDB, RESET, + $ ZERO ) +* + DO 80 ICS = 1, 2 + SIDE = ICHS( ICS: ICS ) + LEFT = SIDE.EQ.'L' +* + IF( LEFT )THEN + NA = M + ELSE + NA = N + END IF +* Set LDA to 1 more than minimum value if room. + LDA = NA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* + DO 70 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* +* Generate the hermitian or symmetric matrix A. +* + CALL CMAKE(SNAME( 8: 9 ), UPLO, ' ', NA, NA, A, NMAX, + $ AA, LDA, RESET, ZERO ) +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL CMAKE( 'ge', ' ', ' ', M, N, C, NMAX, CC, + $ LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + SIDES = SIDE + UPLOS = UPLO + MS = M + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BLS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ CALL CPRCN2(NTRA, NC, SNAME, IORDER, + $ SIDE, UPLO, M, N, ALPHA, LDA, LDB, + $ BETA, LDC) + IF( REWI ) + $ REWIND NTRA + IF( CONJ )THEN + CALL CCHEMM( IORDER, SIDE, UPLO, M, N, + $ ALPHA, AA, LDA, BB, LDB, BETA, + $ CC, LDC ) + ELSE + CALL CCSYMM( IORDER, SIDE, UPLO, M, N, + $ ALPHA, AA, LDA, BB, LDB, BETA, + $ CC, LDC ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 110 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = SIDES.EQ.SIDE + ISAME( 2 ) = UPLOS.EQ.UPLO + ISAME( 3 ) = MS.EQ.M + ISAME( 4 ) = NS.EQ.N + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LCE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = LCE( BS, BB, LBB ) + ISAME( 9 ) = LDBS.EQ.LDB + ISAME( 10 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 11 ) = LCE( CS, CC, LCC ) + ELSE + ISAME( 11 ) = LCERES( 'ge', ' ', M, N, CS, + $ CC, LDC ) + END IF + ISAME( 12 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 110 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + IF( LEFT )THEN + CALL CMMCH( 'N', 'N', M, N, M, ALPHA, A, + $ NMAX, B, NMAX, BETA, C, NMAX, + $ CT, G, CC, LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL CMMCH( 'N', 'N', M, N, N, ALPHA, B, + $ NMAX, A, NMAX, BETA, C, NMAX, + $ CT, G, CC, LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 120 +* + 110 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + CALL CPRCN2(NOUT, NC, SNAME, IORDER, SIDE, UPLO, M, N, ALPHA, LDA, + $ LDB, BETA, LDC) +* + 120 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, + $ ',', F4.1, '), C,', I3, ') .' ) + 9994 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK2. +* + END +* + SUBROUTINE CPRCN2(NOUT, NC, SNAME, IORDER, SIDE, UPLO, M, N, + $ ALPHA, LDA, LDB, BETA, LDC) + INTEGER NOUT, NC, IORDER, M, N, LDA, LDB, LDC + COMPLEX ALPHA, BETA + CHARACTER*1 SIDE, UPLO + CHARACTER*12 SNAME + CHARACTER*14 CRC, CS,CU + + IF (SIDE.EQ.'L')THEN + CS = ' CblasLeft' + ELSE + CS = ' CblasRight' + END IF + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU + WRITE(NOUT, FMT = 9994)M, N, ALPHA, LDA, LDB, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', A14, ',', A14, ',', A14, ',') + 9994 FORMAT( 10X, 2( I3, ',' ),' (',F4.1,',',F4.1, '), A,', I3, + $ ', B,', I3, ', (',F4.1,',',F4.1, '), ', 'C,', I3, ').' ) + END +* + SUBROUTINE CCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NMAX, A, AA, AS, + $ B, BB, BS, CT, G, C, IORDER ) +* +* Tests CTRMM and CTRSM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX ZERO, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CT( NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. 
+ COMPLEX ALPHA, ALS + REAL ERR, ERRMAX + INTEGER I, IA, ICD, ICS, ICT, ICU, IM, IN, J, LAA, LBB, + $ LDA, LDAS, LDB, LDBS, M, MS, N, NA, NARGS, NC, + $ NS + LOGICAL LEFT, NULL, RESET, SAME + CHARACTER*1 DIAG, DIAGS, SIDE, SIDES, TRANAS, TRANSA, UPLO, + $ UPLOS + CHARACTER*2 ICHD, ICHS, ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CMAKE, CMMCH, CCTRMM, CCTRSM +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/, ICHS/'LR'/ +* .. Executable Statements .. +* + NARGS = 11 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* Set up zero matrix for CMMCH. + DO 20 J = 1, NMAX + DO 10 I = 1, NMAX + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE +* + DO 140 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 130 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDB to 1 more than minimum value if room. + LDB = M + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 130 + LBB = LDB*N + NULL = M.LE.0.OR.N.LE.0 +* + DO 120 ICS = 1, 2 + SIDE = ICHS( ICS: ICS ) + LEFT = SIDE.EQ.'L' + IF( LEFT )THEN + NA = M + ELSE + NA = N + END IF +* Set LDA to 1 more than minimum value if room. + LDA = NA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 130 + LAA = LDA*NA +* + DO 110 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* + DO 100 ICT = 1, 3 + TRANSA = ICHT( ICT: ICT ) +* + DO 90 ICD = 1, 2 + DIAG = ICHD( ICD: ICD ) +* + DO 80 IA = 1, NALF + ALPHA = ALF( IA ) +* +* Generate the matrix A. +* + CALL CMAKE( 'tr', UPLO, DIAG, NA, NA, A, + $ NMAX, AA, LDA, RESET, ZERO ) +* +* Generate the matrix B. +* + CALL CMAKE( 'ge', ' ', ' ', M, N, B, NMAX, + $ BB, LDB, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + SIDES = SIDE + UPLOS = UPLO + TRANAS = TRANSA + DIAGS = DIAG + MS = M + NS = N + ALS = ALPHA + DO 30 I = 1, LAA + AS( I ) = AA( I ) + 30 CONTINUE + LDAS = LDA + DO 40 I = 1, LBB + BS( I ) = BB( I ) + 40 CONTINUE + LDBS = LDB +* +* Call the subroutine. +* + IF( SNAME( 10: 11 ).EQ.'mm' )THEN + IF( TRACE ) + $ CALL CPRCN3( NTRA, NC, SNAME, IORDER, + $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, + $ LDA, LDB) + IF( REWI ) + $ REWIND NTRA + CALL CCTRMM(IORDER, SIDE, UPLO, TRANSA, + $ DIAG, M, N, ALPHA, AA, LDA, + $ BB, LDB ) + ELSE IF( SNAME( 10: 11 ).EQ.'sm' )THEN + IF( TRACE ) + $ CALL CPRCN3( NTRA, NC, SNAME, IORDER, + $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, + $ LDA, LDB) + IF( REWI ) + $ REWIND NTRA + CALL CCTRSM(IORDER, SIDE, UPLO, TRANSA, + $ DIAG, M, N, ALPHA, AA, LDA, + $ BB, LDB ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 150 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = SIDES.EQ.SIDE + ISAME( 2 ) = UPLOS.EQ.UPLO + ISAME( 3 ) = TRANAS.EQ.TRANSA + ISAME( 4 ) = DIAGS.EQ.DIAG + ISAME( 5 ) = MS.EQ.M + ISAME( 6 ) = NS.EQ.N + ISAME( 7 ) = ALS.EQ.ALPHA + ISAME( 8 ) = LCE( AS, AA, LAA ) + ISAME( 9 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 10 ) = LCE( BS, BB, LBB ) + ELSE + ISAME( 10 ) = LCERES( 'ge', ' ', M, N, BS, + $ BB, LDB ) + END IF + ISAME( 11 ) = LDBS.EQ.LDB +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. 
+ DO 50 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 50 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 150 + END IF +* + IF( .NOT.NULL )THEN + IF( SNAME( 10: 11 ).EQ.'mm' )THEN +* +* Check the result. +* + IF( LEFT )THEN + CALL CMMCH( TRANSA, 'N', M, N, M, + $ ALPHA, A, NMAX, B, NMAX, + $ ZERO, C, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL CMMCH( 'N', TRANSA, M, N, N, + $ ALPHA, B, NMAX, A, NMAX, + $ ZERO, C, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + ELSE IF( SNAME( 10: 11 ).EQ.'sm' )THEN +* +* Compute approximation to original +* matrix. +* + DO 70 J = 1, N + DO 60 I = 1, M + C( I, J ) = BB( I + ( J - 1 )* + $ LDB ) + BB( I + ( J - 1 )*LDB ) = ALPHA* + $ B( I, J ) + 60 CONTINUE + 70 CONTINUE +* + IF( LEFT )THEN + CALL CMMCH( TRANSA, 'N', M, N, M, + $ ONE, A, NMAX, C, NMAX, + $ ZERO, B, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .FALSE. ) + ELSE + CALL CMMCH( 'N', TRANSA, M, N, N, + $ ONE, C, NMAX, A, NMAX, + $ ZERO, B, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .FALSE. ) + END IF + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 150 + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* + 140 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 160 +* + 150 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + CALL CPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG, + $ M, N, ALPHA, LDA, LDB) +* + 160 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT(' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT(1X, I6, ': ', A12,'(', 4( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ') ', + $ ' .' ) + 9994 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK3. 
+* + END +* + SUBROUTINE CPRCN3(NOUT, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, + $ DIAG, M, N, ALPHA, LDA, LDB) + INTEGER NOUT, NC, IORDER, M, N, LDA, LDB + COMPLEX ALPHA + CHARACTER*1 SIDE, UPLO, TRANSA, DIAG + CHARACTER*12 SNAME + CHARACTER*14 CRC, CS, CU, CA, CD + + IF (SIDE.EQ.'L')THEN + CS = ' CblasLeft' + ELSE + CS = ' CblasRight' + END IF + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (DIAG.EQ.'N')THEN + CD = ' CblasNonUnit' + ELSE + CD = ' CblasUnit' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU + WRITE(NOUT, FMT = 9994)CA, CD, M, N, ALPHA, LDA, LDB + + 9995 FORMAT( 1X, I6, ': ', A12,'(', A14, ',', A14, ',', A14, ',') + 9994 FORMAT( 10X, 2( A14, ',') , 2( I3, ',' ), ' (', F4.1, ',', + $ F4.1, '), A,', I3, ', B,', I3, ').' ) + END +* + SUBROUTINE CCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, + $ IORDER ) +* +* Tests CHERK and CSYRK. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0, 0.0 ) ) + REAL RONE, RZERO + PARAMETER ( RONE = 1.0, RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX ALPHA, ALS, BETA, BETS + REAL ERR, ERRMAX, RALPHA, RALS, RBETA, RBETS + INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, K, KS, + $ LAA, LCC, LDA, LDAS, LDC, LDCS, LJ, MA, N, NA, + $ NARGS, NC, NS + LOGICAL CONJ, NULL, RESET, SAME, TRAN, UPPER + CHARACTER*1 TRANS, TRANSS, TRANST, UPLO, UPLOS + CHARACTER*2 ICHT, ICHU +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CCHERK, CMAKE, CMMCH, CCSYRK +* .. Intrinsic Functions .. + INTRINSIC CMPLX, MAX, REAL +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHT/'NC'/, ICHU/'UL'/ +* .. Executable Statements .. + CONJ = SNAME( 8: 9 ).EQ.'he' +* + NARGS = 10 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = N + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 100 + LCC = LDC*N +* + DO 90 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 80 ICT = 1, 2 + TRANS = ICHT( ICT: ICT ) + TRAN = TRANS.EQ.'C' + IF( TRAN.AND..NOT.CONJ ) + $ TRANS = 'T' + IF( TRAN )THEN + MA = K + NA = N + ELSE + MA = N + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. 
+ IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* +* Generate the matrix A. +* + CALL CMAKE( 'ge', ' ', ' ', MA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 70 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + UPPER = UPLO.EQ.'U' +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) + IF( CONJ )THEN + RALPHA = REAL( ALPHA ) + ALPHA = CMPLX( RALPHA, RZERO ) + END IF +* + DO 50 IB = 1, NBET + BETA = BET( IB ) + IF( CONJ )THEN + RBETA = REAL( BETA ) + BETA = CMPLX( RBETA, RZERO ) + END IF + NULL = N.LE.0 + IF( CONJ ) + $ NULL = NULL.OR.( ( K.LE.0.OR.RALPHA.EQ. + $ RZERO ).AND.RBETA.EQ.RONE ) +* +* Generate the matrix C. +* + CALL CMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, C, + $ NMAX, CC, LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + NS = N + KS = K + IF( CONJ )THEN + RALS = RALPHA + ELSE + ALS = ALPHA + END IF + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + IF( CONJ )THEN + RBETS = RBETA + ELSE + BETS = BETA + END IF + DO 20 I = 1, LCC + CS( I ) = CC( I ) + 20 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( CONJ )THEN + IF( TRACE ) + $ CALL CPRCN6( NTRA, NC, SNAME, IORDER, + $ UPLO, TRANS, N, K, RALPHA, LDA, RBETA, + $ LDC) + IF( REWI ) + $ REWIND NTRA + CALL CCHERK( IORDER, UPLO, TRANS, N, K, + $ RALPHA, AA, LDA, RBETA, CC, + $ LDC ) + ELSE + IF( TRACE ) + $ CALL CPRCN4( NTRA, NC, SNAME, IORDER, + $ UPLO, TRANS, N, K, ALPHA, LDA, BETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CCSYRK( IORDER, UPLO, TRANS, N, K, + $ ALPHA, AA, LDA, BETA, CC, LDC ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLOS.EQ.UPLO + ISAME( 2 ) = TRANSS.EQ.TRANS + ISAME( 3 ) = NS.EQ.N + ISAME( 4 ) = KS.EQ.K + IF( CONJ )THEN + ISAME( 5 ) = RALS.EQ.RALPHA + ELSE + ISAME( 5 ) = ALS.EQ.ALPHA + END IF + ISAME( 6 ) = LCE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + IF( CONJ )THEN + ISAME( 8 ) = RBETS.EQ.RBETA + ELSE + ISAME( 8 ) = BETS.EQ.BETA + END IF + IF( NULL )THEN + ISAME( 9 ) = LCE( CS, CC, LCC ) + ELSE + ISAME( 9 ) = LCERES( SNAME( 8: 9 ), UPLO, N, + $ N, CS, CC, LDC ) + END IF + ISAME( 10 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 30 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 30 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( CONJ )THEN + TRANST = 'C' + ELSE + TRANST = 'T' + END IF + JC = 1 + DO 40 J = 1, N + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + IF( TRAN )THEN + CALL CMMCH( TRANST, 'N', LJ, 1, K, + $ ALPHA, A( 1, JJ ), NMAX, + $ A( 1, J ), NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL CMMCH( 'N', TRANST, LJ, 1, K, + $ ALPHA, A( JJ, 1 ), NMAX, + $ A( J, 1 ), NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + IF( UPPER )THEN + JC = JC + LDC + ELSE + JC = JC + LDC + 1 + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 110 + 40 CONTINUE + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 110 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9995 )J +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( CONJ )THEN + CALL CPRCN6( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, RALPHA, + $ LDA, rBETA, LDC) + ELSE + CALL CPRCN4( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, ALPHA, + $ LDA, BETA, LDC) + END IF +* + 130 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ') ', + $ ' .' ) + 9993 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, ') , A,', I3, ',(', F4.1, ',', F4.1, + $ '), C,', I3, ') .' ) + 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK4. +* + END +* + SUBROUTINE CPRCN4(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, + $ N, K, ALPHA, LDA, BETA, LDC) + INTEGER NOUT, NC, IORDER, N, K, LDA, LDC + COMPLEX ALPHA, BETA + CHARACTER*1 UPLO, TRANSA + CHARACTER*12 SNAME + CHARACTER*14 CRC, CU, CA + + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA + WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') ) + 9994 FORMAT( 10X, 2( I3, ',' ), ' (', F4.1, ',', F4.1 ,'), A,', + $ I3, ', (', F4.1,',', F4.1, '), C,', I3, ').' ) + END +* +* + SUBROUTINE CPRCN6(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, + $ N, K, ALPHA, LDA, BETA, LDC) + INTEGER NOUT, NC, IORDER, N, K, LDA, LDC + REAL ALPHA, BETA + CHARACTER*1 UPLO, TRANSA + CHARACTER*12 SNAME + CHARACTER*14 CRC, CU, CA + + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA + WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') ) + 9994 FORMAT( 10X, 2( I3, ',' ), + $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ').' 
) + END +* + SUBROUTINE CCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, + $ IORDER ) +* +* Tests CHER2K and CSYR2K. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX ZERO, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) + REAL RONE, RZERO + PARAMETER ( RONE = 1.0, RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX AA( NMAX*NMAX ), AB( 2*NMAX*NMAX ), + $ ALF( NALF ), AS( NMAX*NMAX ), BB( NMAX*NMAX ), + $ BET( NBET ), BS( NMAX*NMAX ), C( NMAX, NMAX ), + $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), + $ W( 2*NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX ALPHA, ALS, BETA, BETS + REAL ERR, ERRMAX, RBETA, RBETS + INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, JJAB, + $ K, KS, LAA, LBB, LCC, LDA, LDAS, LDB, LDBS, + $ LDC, LDCS, LJ, MA, N, NA, NARGS, NC, NS + LOGICAL CONJ, NULL, RESET, SAME, TRAN, UPPER + CHARACTER*1 TRANS, TRANSS, TRANST, UPLO, UPLOS + CHARACTER*2 ICHT, ICHU +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CCHER2K, CMAKE, CMMCH, CCSYR2K +* .. Intrinsic Functions .. + INTRINSIC CMPLX, CONJG, MAX, REAL +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHT/'NC'/, ICHU/'UL'/ +* .. Executable Statements .. + CONJ = SNAME( 8: 9 ).EQ.'he' +* + NARGS = 12 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 130 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = N + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 130 + LCC = LDC*N +* + DO 120 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 110 ICT = 1, 2 + TRANS = ICHT( ICT: ICT ) + TRAN = TRANS.EQ.'C' + IF( TRAN.AND..NOT.CONJ ) + $ TRANS = 'T' + IF( TRAN )THEN + MA = K + NA = N + ELSE + MA = N + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 110 + LAA = LDA*NA +* +* Generate the matrix A. +* + IF( TRAN )THEN + CALL CMAKE( 'ge', ' ', ' ', MA, NA, AB, 2*NMAX, AA, + $ LDA, RESET, ZERO ) + ELSE + CALL CMAKE( 'ge', ' ', ' ', MA, NA, AB, NMAX, AA, LDA, + $ RESET, ZERO ) + END IF +* +* Generate the matrix B. +* + LDB = LDA + LBB = LAA + IF( TRAN )THEN + CALL CMAKE( 'ge', ' ', ' ', MA, NA, AB( K + 1 ), + $ 2*NMAX, BB, LDB, RESET, ZERO ) + ELSE + CALL CMAKE( 'ge', ' ', ' ', MA, NA, AB( K*NMAX + 1 ), + $ NMAX, BB, LDB, RESET, ZERO ) + END IF +* + DO 100 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + UPPER = UPLO.EQ.'U' +* + DO 90 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 80 IB = 1, NBET + BETA = BET( IB ) + IF( CONJ )THEN + RBETA = REAL( BETA ) + BETA = CMPLX( RBETA, RZERO ) + END IF + NULL = N.LE.0 + IF( CONJ ) + $ NULL = NULL.OR.( ( K.LE.0.OR.ALPHA.EQ. + $ ZERO ).AND.RBETA.EQ.RONE ) +* +* Generate the matrix C. 
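+* The storage type passed to CMAKE below is taken from characters
+* 8:9 of SNAME, so C is filled as a Hermitian ('he') or symmetric
+* ('sy') matrix to match the routine under test.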
+* + CALL CMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, C, + $ NMAX, CC, LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + IF( CONJ )THEN + RBETS = RBETA + ELSE + BETS = BETA + END IF + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( CONJ )THEN + IF( TRACE ) + $ CALL CPRCN7( NTRA, NC, SNAME, IORDER, + $ UPLO, TRANS, N, K, ALPHA, LDA, LDB, + $ RBETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CCHER2K( IORDER, UPLO, TRANS, N, K, + $ ALPHA, AA, LDA, BB, LDB, RBETA, + $ CC, LDC ) + ELSE + IF( TRACE ) + $ CALL CPRCN5( NTRA, NC, SNAME, IORDER, + $ UPLO, TRANS, N, K, ALPHA, LDA, LDB, + $ BETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CCSYR2K( IORDER, UPLO, TRANS, N, K, + $ ALPHA, AA, LDA, BB, LDB, BETA, + $ CC, LDC ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 150 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLOS.EQ.UPLO + ISAME( 2 ) = TRANSS.EQ.TRANS + ISAME( 3 ) = NS.EQ.N + ISAME( 4 ) = KS.EQ.K + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LCE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = LCE( BS, BB, LBB ) + ISAME( 9 ) = LDBS.EQ.LDB + IF( CONJ )THEN + ISAME( 10 ) = RBETS.EQ.RBETA + ELSE + ISAME( 10 ) = BETS.EQ.BETA + END IF + IF( NULL )THEN + ISAME( 11 ) = LCE( CS, CC, LCC ) + ELSE + ISAME( 11 ) = LCERES( 'he', UPLO, N, N, CS, + $ CC, LDC ) + END IF + ISAME( 12 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 150 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( CONJ )THEN + TRANST = 'C' + ELSE + TRANST = 'T' + END IF + JJAB = 1 + JC = 1 + DO 70 J = 1, N + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + IF( TRAN )THEN + DO 50 I = 1, K + W( I ) = ALPHA*AB( ( J - 1 )*2* + $ NMAX + K + I ) + IF( CONJ )THEN + W( K + I ) = CONJG( ALPHA )* + $ AB( ( J - 1 )*2* + $ NMAX + I ) + ELSE + W( K + I ) = ALPHA* + $ AB( ( J - 1 )*2* + $ NMAX + I ) + END IF + 50 CONTINUE + CALL CMMCH( TRANST, 'N', LJ, 1, 2*K, + $ ONE, AB( JJAB ), 2*NMAX, W, + $ 2*NMAX, BETA, C( JJ, J ), + $ NMAX, CT, G, CC( JC ), LDC, + $ EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + ELSE + DO 60 I = 1, K + IF( CONJ )THEN + W( I ) = ALPHA*CONJG( AB( ( K + + $ I - 1 )*NMAX + J ) ) + W( K + I ) = CONJG( ALPHA* + $ AB( ( I - 1 )*NMAX + + $ J ) ) + ELSE + W( I ) = ALPHA*AB( ( K + I - 1 )* + $ NMAX + J ) + W( K + I ) = ALPHA* + $ AB( ( I - 1 )*NMAX + + $ J ) + END IF + 60 CONTINUE + CALL CMMCH( 'N', 'N', LJ, 1, 2*K, ONE, + $ AB( JJ ), NMAX, W, 2*NMAX, + $ BETA, C( JJ, J ), NMAX, CT, + $ G, CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + IF( UPPER )THEN + JC = JC + LDC + ELSE + JC = JC + LDC + 1 + IF( TRAN ) + $ JJAB = JJAB + 2*NMAX + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 140 + 70 CONTINUE + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 160 +* + 140 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9995 )J +* + 150 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( CONJ )THEN + CALL CPRCN7( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, + $ ALPHA, LDA, LDB, RBETA, LDC) + ELSE + CALL CPRCN5( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, + $ ALPHA, LDA, LDB, BETA, LDC) + END IF +* + 160 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',', F4.1, + $ ', C,', I3, ') .' ) + 9993 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, + $ ',', F4.1, '), C,', I3, ') .' ) + 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK5. +* + END +* + SUBROUTINE CPRCN5(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, + $ N, K, ALPHA, LDA, LDB, BETA, LDC) + INTEGER NOUT, NC, IORDER, N, K, LDA, LDB, LDC + COMPLEX ALPHA, BETA + CHARACTER*1 UPLO, TRANSA + CHARACTER*12 SNAME + CHARACTER*14 CRC, CU, CA + + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA + WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, LDB, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') ) + 9994 FORMAT( 10X, 2( I3, ',' ), ' (', F4.1, ',', F4.1, '), A,', + $ I3, ', B', I3, ', (', F4.1, ',', F4.1, '), C,', I3, ').' 
) + END +* +* + SUBROUTINE CPRCN7(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, + $ N, K, ALPHA, LDA, LDB, BETA, LDC) + INTEGER NOUT, NC, IORDER, N, K, LDA, LDB, LDC + COMPLEX ALPHA + REAL BETA + CHARACTER*1 UPLO, TRANSA + CHARACTER*12 SNAME + CHARACTER*14 CRC, CU, CA + + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA + WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, LDB, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') ) + 9994 FORMAT( 10X, 2( I3, ',' ), ' (', F4.1, ',', F4.1, '), A,', + $ I3, ', B', I3, ',', F4.1, ', C,', I3, ').' ) + END +* + SUBROUTINE CMAKE(TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, RESET, + $ TRANSL ) +* +* Generates values for an M by N matrix A. +* Stores the values in the array AA in the data structure required +* by the routine, with unwanted elements set to rogue value. +* +* TYPE is 'ge', 'he', 'sy' or 'tr'. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX ZERO, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) + COMPLEX ROGUE + PARAMETER ( ROGUE = ( -1.0E10, 1.0E10 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) + REAL RROGUE + PARAMETER ( RROGUE = -1.0E10 ) +* .. Scalar Arguments .. + COMPLEX TRANSL + INTEGER LDA, M, N, NMAX + LOGICAL RESET + CHARACTER*1 DIAG, UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + COMPLEX A( NMAX, * ), AA( * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J, JJ + LOGICAL GEN, HER, LOWER, SYM, TRI, UNIT, UPPER +* .. External Functions .. + COMPLEX CBEG + EXTERNAL CBEG +* .. Intrinsic Functions .. + INTRINSIC CMPLX, CONJG, REAL +* .. Executable Statements .. + GEN = TYPE.EQ.'ge' + HER = TYPE.EQ.'he' + SYM = TYPE.EQ.'sy' + TRI = TYPE.EQ.'tr' + UPPER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'U' + LOWER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'L' + UNIT = TRI.AND.DIAG.EQ.'U' +* +* Generate data in array A. +* + DO 20 J = 1, N + DO 10 I = 1, M + IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) + $ THEN + A( I, J ) = CBEG( RESET ) + TRANSL + IF( I.NE.J )THEN +* Set some elements to zero + IF( N.GT.3.AND.J.EQ.N/2 ) + $ A( I, J ) = ZERO + IF( HER )THEN + A( J, I ) = CONJG( A( I, J ) ) + ELSE IF( SYM )THEN + A( J, I ) = A( I, J ) + ELSE IF( TRI )THEN + A( J, I ) = ZERO + END IF + END IF + END IF + 10 CONTINUE + IF( HER ) + $ A( J, J ) = CMPLX( REAL( A( J, J ) ), RZERO ) + IF( TRI ) + $ A( J, J ) = A( J, J ) + ONE + IF( UNIT ) + $ A( J, J ) = ONE + 20 CONTINUE +* +* Store elements in array AS in data structure required by routine. 
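+* For type 'ge' every column is copied in full and rows M+1 to LDA
+* are set to ROGUE; for 'he', 'sy' and 'tr' only the referenced
+* triangle (excluding the diagonal of unit-triangular matrices) is
+* copied, all other entries are set to ROGUE, and for 'he' the
+* imaginary part of each diagonal element is overwritten with
+* RROGUE so that any accidental use of it is detected.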
+* + IF( TYPE.EQ.'ge' )THEN + DO 50 J = 1, N + DO 30 I = 1, M + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 30 CONTINUE + DO 40 I = M + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 40 CONTINUE + 50 CONTINUE + ELSE IF( TYPE.EQ.'he'.OR.TYPE.EQ.'sy'.OR.TYPE.EQ.'tr' )THEN + DO 90 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IF( UNIT )THEN + IEND = J - 1 + ELSE + IEND = J + END IF + ELSE + IF( UNIT )THEN + IBEG = J + 1 + ELSE + IBEG = J + END IF + IEND = N + END IF + DO 60 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 60 CONTINUE + DO 70 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 70 CONTINUE + DO 80 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 80 CONTINUE + IF( HER )THEN + JJ = J + ( J - 1 )*LDA + AA( JJ ) = CMPLX( REAL( AA( JJ ) ), RROGUE ) + END IF + 90 CONTINUE + END IF + RETURN +* +* End of CMAKE. +* + END + SUBROUTINE CMMCH(TRANSA, TRANSB, M, N, KK, ALPHA, A, LDA, B, LDB, + $ BETA, C, LDC, CT, G, CC, LDCC, EPS, ERR, FATAL, + $ NOUT, MV ) +* +* Checks the results of the computational tests. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0, 0.0 ) ) + REAL RZERO, RONE + PARAMETER ( RZERO = 0.0, RONE = 1.0 ) +* .. Scalar Arguments .. + COMPLEX ALPHA, BETA + REAL EPS, ERR + INTEGER KK, LDA, LDB, LDC, LDCC, M, N, NOUT + LOGICAL FATAL, MV + CHARACTER*1 TRANSA, TRANSB +* .. Array Arguments .. + COMPLEX A( LDA, * ), B( LDB, * ), C( LDC, * ), + $ CC( LDCC, * ), CT( * ) + REAL G( * ) +* .. Local Scalars .. + COMPLEX CL + REAL ERRI + INTEGER I, J, K + LOGICAL CTRANA, CTRANB, TRANA, TRANB +* .. Intrinsic Functions .. + INTRINSIC ABS, AIMAG, CONJG, MAX, REAL, SQRT +* .. Statement Functions .. + REAL ABS1 +* .. Statement Function definitions .. + ABS1( CL ) = ABS( REAL( CL ) ) + ABS( AIMAG( CL ) ) +* .. Executable Statements .. + TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' + TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' + CTRANA = TRANSA.EQ.'C' + CTRANB = TRANSB.EQ.'C' +* +* Compute expected result, one column at a time, in CT using data +* in A, B and C. +* Compute gauges in G. 
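+* Each computed element CC(i,j) is compared with the expected value
+* CT(i): the difference is scaled by EPS and by the gauge G(i)
+* whenever G(i) is nonzero, and the test is declared fatal (the
+* result is less than half accurate) once this ratio times
+* SQRT(EPS) reaches one.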
+* + DO 220 J = 1, N +* + DO 10 I = 1, M + CT( I ) = ZERO + G( I ) = RZERO + 10 CONTINUE + IF( .NOT.TRANA.AND..NOT.TRANB )THEN + DO 30 K = 1, KK + DO 20 I = 1, M + CT( I ) = CT( I ) + A( I, K )*B( K, J ) + G( I ) = G( I ) + ABS1( A( I, K ) )*ABS1( B( K, J ) ) + 20 CONTINUE + 30 CONTINUE + ELSE IF( TRANA.AND..NOT.TRANB )THEN + IF( CTRANA )THEN + DO 50 K = 1, KK + DO 40 I = 1, M + CT( I ) = CT( I ) + CONJG( A( K, I ) )*B( K, J ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( K, J ) ) + 40 CONTINUE + 50 CONTINUE + ELSE + DO 70 K = 1, KK + DO 60 I = 1, M + CT( I ) = CT( I ) + A( K, I )*B( K, J ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( K, J ) ) + 60 CONTINUE + 70 CONTINUE + END IF + ELSE IF( .NOT.TRANA.AND.TRANB )THEN + IF( CTRANB )THEN + DO 90 K = 1, KK + DO 80 I = 1, M + CT( I ) = CT( I ) + A( I, K )*CONJG( B( J, K ) ) + G( I ) = G( I ) + ABS1( A( I, K ) )* + $ ABS1( B( J, K ) ) + 80 CONTINUE + 90 CONTINUE + ELSE + DO 110 K = 1, KK + DO 100 I = 1, M + CT( I ) = CT( I ) + A( I, K )*B( J, K ) + G( I ) = G( I ) + ABS1( A( I, K ) )* + $ ABS1( B( J, K ) ) + 100 CONTINUE + 110 CONTINUE + END IF + ELSE IF( TRANA.AND.TRANB )THEN + IF( CTRANA )THEN + IF( CTRANB )THEN + DO 130 K = 1, KK + DO 120 I = 1, M + CT( I ) = CT( I ) + CONJG( A( K, I ) )* + $ CONJG( B( J, K ) ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 120 CONTINUE + 130 CONTINUE + ELSE + DO 150 K = 1, KK + DO 140 I = 1, M + CT( I ) = CT( I ) + CONJG( A( K, I ) )*B( J, K ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 140 CONTINUE + 150 CONTINUE + END IF + ELSE + IF( CTRANB )THEN + DO 170 K = 1, KK + DO 160 I = 1, M + CT( I ) = CT( I ) + A( K, I )*CONJG( B( J, K ) ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 160 CONTINUE + 170 CONTINUE + ELSE + DO 190 K = 1, KK + DO 180 I = 1, M + CT( I ) = CT( I ) + A( K, I )*B( J, K ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 180 CONTINUE + 190 CONTINUE + END IF + END IF + END IF + DO 200 I = 1, M + CT( I ) = ALPHA*CT( I ) + BETA*C( I, J ) + G( I ) = ABS1( ALPHA )*G( I ) + + $ ABS1( BETA )*ABS1( C( I, J ) ) + 200 CONTINUE +* +* Compute the error ratio for this result. +* + ERR = ZERO + DO 210 I = 1, M + ERRI = ABS1( CT( I ) - CC( I, J ) )/EPS + IF( G( I ).NE.RZERO ) + $ ERRI = ERRI/G( I ) + ERR = MAX( ERR, ERRI ) + IF( ERR*SQRT( EPS ).GE.RONE ) + $ GO TO 230 + 210 CONTINUE +* + 220 CONTINUE +* +* If the loop completes, all results are at least half accurate. + GO TO 250 +* +* Report fatal error. +* + 230 FATAL = .TRUE. + WRITE( NOUT, FMT = 9999 ) + DO 240 I = 1, M + IF( MV )THEN + WRITE( NOUT, FMT = 9998 )I, CT( I ), CC( I, J ) + ELSE + WRITE( NOUT, FMT = 9998 )I, CC( I, J ), CT( I ) + END IF + 240 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9997 )J +* + 250 CONTINUE + RETURN +* + 9999 FORMAT(' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', + $ 'F ACCURATE *******', /' EXPECTED RE', + $ 'SULT COMPUTED RESULT' ) + 9998 FORMAT( 1X, I7, 2( ' (', G15.6, ',', G15.6, ')' ) ) + 9997 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) +* +* End of CMMCH. +* + END + LOGICAL FUNCTION LCE( RI, RJ, LR ) +* +* Tests if two arrays are identical. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER LR +* .. Array Arguments .. + COMPLEX RI( * ), RJ( * ) +* .. 
Local Scalars .. + INTEGER I +* .. Executable Statements .. + DO 10 I = 1, LR + IF( RI( I ).NE.RJ( I ) ) + $ GO TO 20 + 10 CONTINUE + LCE = .TRUE. + GO TO 30 + 20 CONTINUE + LCE = .FALSE. + 30 RETURN +* +* End of LCE. +* + END + LOGICAL FUNCTION LCERES( TYPE, UPLO, M, N, AA, AS, LDA ) +* +* Tests if selected elements in two arrays are equal. +* +* TYPE is 'ge' or 'he' or 'sy'. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER LDA, M, N + CHARACTER*1 UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + COMPLEX AA( LDA, * ), AS( LDA, * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J + LOGICAL UPPER +* .. Executable Statements .. + UPPER = UPLO.EQ.'U' + IF( TYPE.EQ.'ge' )THEN + DO 20 J = 1, N + DO 10 I = M + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 10 CONTINUE + 20 CONTINUE + ELSE IF( TYPE.EQ.'he'.OR.TYPE.EQ.'sy' )THEN + DO 50 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 30 I = 1, IBEG - 1 + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 30 CONTINUE + DO 40 I = IEND + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 40 CONTINUE + 50 CONTINUE + END IF +* + 60 CONTINUE + LCERES = .TRUE. + GO TO 80 + 70 CONTINUE + LCERES = .FALSE. + 80 RETURN +* +* End of LCERES. +* + END + COMPLEX FUNCTION CBEG( RESET ) +* +* Generates complex numbers as pairs of random numbers uniformly +* distributed between -0.5 and 0.5. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + LOGICAL RESET +* .. Local Scalars .. + INTEGER I, IC, J, MI, MJ +* .. Save statement .. + SAVE I, IC, J, MI, MJ +* .. Intrinsic Functions .. + INTRINSIC CMPLX +* .. Executable Statements .. + IF( RESET )THEN +* Initialize local variables. + MI = 891 + MJ = 457 + I = 7 + J = 7 + IC = 0 + RESET = .FALSE. + END IF +* +* The sequence of values of I or J is bounded between 1 and 999. +* If initial I or J = 1,2,3,6,7 or 9, the period will be 50. +* If initial I or J = 4 or 8, the period will be 25. +* If initial I or J = 5, the period will be 10. +* IC is used to break up the period by skipping 1 value of I or J +* in 6. +* + IC = IC + 1 + 10 I = I*MI + J = J*MJ + I = I - 1000*( I/1000 ) + J = J - 1000*( J/1000 ) + IF( IC.GE.5 )THEN + IC = 0 + GO TO 10 + END IF + CBEG = CMPLX( ( I - 500 )/1001.0, ( J - 500 )/1001.0 ) + RETURN +* +* End of CBEG. +* + END + REAL FUNCTION SDIFF( X, Y ) +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + REAL X, Y +* .. Executable Statements .. + SDIFF = X - Y + RETURN +* +* End of SDIFF. 
+* + END diff --git a/ctest/c_d2chke.c b/ctest/c_d2chke.c new file mode 100644 index 0000000000..23de9a4e71 --- /dev/null +++ b/ctest/c_d2chke.c @@ -0,0 +1,789 @@ +#include +#include +#include "common.h" +#include "cblas_test.h" + +int cblas_ok, cblas_lerr, cblas_info; +int link_xerbla=TRUE; +char *cblas_rout; + +#ifdef F77_Char +void F77_xerbla(F77_Char F77_srname, void *vinfo); +#else +void F77_xerbla(char *srname, void *vinfo); +#endif + +void chkxer(void) { + extern int cblas_ok, cblas_lerr, cblas_info; + extern int link_xerbla; + extern char *cblas_rout; + if (cblas_lerr == 1 ) { + printf("***** ILLEGAL VALUE OF PARAMETER NUMBER %d NOT DETECTED BY %s *****\n", cblas_info, cblas_rout); + cblas_ok = 0 ; + } + cblas_lerr = 1 ; +} + +void F77_d2chke(char *rout) { + char *sf = ( rout ) ; + double A[2] = {0.0,0.0}, + X[2] = {0.0,0.0}, + Y[2] = {0.0,0.0}, + ALPHA=0.0, BETA=0.0; + extern int cblas_info, cblas_lerr, cblas_ok; + extern int RowMajorStrg; + extern char *cblas_rout; + + if (link_xerbla) /* call these first to link */ + { + cblas_xerbla(cblas_info,cblas_rout,""); + F77_xerbla(cblas_rout,&cblas_info); + } + + cblas_ok = TRUE ; + cblas_lerr = PASSED ; + + if (strncmp( sf,"cblas_dgemv",11)==0) { + cblas_rout = "cblas_dgemv"; + cblas_info = 1; + cblas_dgemv(INVALID, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dgemv(CblasColMajor, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dgemv(CblasColMajor, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dgemv(CblasColMajor, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dgemv(CblasColMajor, CblasNoTrans, 2, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_dgemv(CblasColMajor, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_dgemv(CblasColMajor, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + + cblas_info = 2; RowMajorStrg = TRUE; RowMajorStrg = TRUE; + cblas_dgemv(CblasRowMajor, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_dgemv(CblasRowMajor, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_dgemv(CblasRowMajor, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dgemv(CblasRowMajor, CblasNoTrans, 0, 2, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_dgemv(CblasRowMajor, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_dgemv(CblasRowMajor, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_dgbmv",11)==0) { + cblas_rout = "cblas_dgbmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_dgbmv(INVALID, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dgbmv(CblasColMajor, INVALID, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dgbmv(CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg 
= FALSE; + cblas_dgbmv(CblasColMajor, CblasNoTrans, 0, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dgbmv(CblasColMajor, CblasNoTrans, 0, 0, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dgbmv(CblasColMajor, CblasNoTrans, 2, 0, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_dgbmv(CblasColMajor, CblasNoTrans, 0, 0, 1, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_dgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_dgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_dgbmv(CblasRowMajor, INVALID, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_dgbmv(CblasRowMajor, CblasNoTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_dgbmv(CblasRowMajor, CblasNoTrans, 0, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_dgbmv(CblasRowMajor, CblasNoTrans, 0, 0, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dgbmv(CblasRowMajor, CblasNoTrans, 2, 0, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_dgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 1, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_dgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_dgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_dsymv",11)==0) { + cblas_rout = "cblas_dsymv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_dsymv(INVALID, CblasUpper, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dsymv(CblasColMajor, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dsymv(CblasColMajor, CblasUpper, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dsymv(CblasColMajor, CblasUpper, 2, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dsymv(CblasColMajor, CblasUpper, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_dsymv(CblasColMajor, CblasUpper, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_dsymv(CblasRowMajor, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_dsymv(CblasRowMajor, CblasUpper, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dsymv(CblasRowMajor, CblasUpper, 2, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dsymv(CblasRowMajor, CblasUpper, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_dsymv(CblasRowMajor, CblasUpper, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_dsbmv",11)==0) 
{ + cblas_rout = "cblas_dsbmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_dsbmv(INVALID, CblasUpper, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dsbmv(CblasColMajor, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dsbmv(CblasColMajor, CblasUpper, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dsbmv(CblasColMajor, CblasUpper, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dsbmv(CblasColMajor, CblasUpper, 0, 1, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_dsbmv(CblasColMajor, CblasUpper, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_dsbmv(CblasColMajor, CblasUpper, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_dsbmv(CblasRowMajor, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_dsbmv(CblasRowMajor, CblasUpper, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_dsbmv(CblasRowMajor, CblasUpper, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dsbmv(CblasRowMajor, CblasUpper, 0, 1, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_dsbmv(CblasRowMajor, CblasUpper, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_dsbmv(CblasRowMajor, CblasUpper, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_dspmv",11)==0) { + cblas_rout = "cblas_dspmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_dspmv(INVALID, CblasUpper, 0, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dspmv(CblasColMajor, INVALID, 0, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dspmv(CblasColMajor, CblasUpper, INVALID, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dspmv(CblasColMajor, CblasUpper, 0, + ALPHA, A, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dspmv(CblasColMajor, CblasUpper, 0, + ALPHA, A, X, 1, BETA, Y, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_dspmv(CblasRowMajor, INVALID, 0, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_dspmv(CblasRowMajor, CblasUpper, INVALID, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dspmv(CblasRowMajor, CblasUpper, 0, + ALPHA, A, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dspmv(CblasRowMajor, CblasUpper, 0, + ALPHA, A, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_dtrmv",11)==0) { + cblas_rout = "cblas_dtrmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_dtrmv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dtrmv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dtrmv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + 
cblas_dtrmv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dtrmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dtrmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, A, 1, X, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_dtrmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_dtrmv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_dtrmv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_dtrmv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_dtrmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dtrmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, A, 1, X, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_dtrmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_dtbmv",11)==0) { + cblas_rout = "cblas_dtbmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_dtbmv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dtbmv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dtbmv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dtbmv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dtbmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dtbmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dtbmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 1, A, 1, X, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dtbmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_dtbmv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_dtbmv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_dtbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_dtbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dtbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dtbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 1, A, 1, X, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dtbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 0 ); + 
chkxer(); + } else if (strncmp( sf,"cblas_dtpmv",11)==0) { + cblas_rout = "cblas_dtpmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_dtpmv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dtpmv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dtpmv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dtpmv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dtpmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dtpmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_dtpmv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_dtpmv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_dtpmv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_dtpmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dtpmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_dtrsv",11)==0) { + cblas_rout = "cblas_dtrsv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_dtrsv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dtrsv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dtrsv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dtrsv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dtrsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dtrsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, A, 1, X, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_dtrsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_dtrsv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_dtrsv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_dtrsv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_dtrsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dtrsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, A, 1, X, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_dtrsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_dtbsv",11)==0) { + cblas_rout = "cblas_dtbsv"; + cblas_info = 1; RowMajorStrg 
= FALSE; + cblas_dtbsv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dtbsv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dtbsv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dtbsv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dtbsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dtbsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dtbsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 1, A, 1, X, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dtbsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_dtbsv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_dtbsv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_dtbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_dtbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dtbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dtbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 1, A, 1, X, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dtbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_dtpsv",11)==0) { + cblas_rout = "cblas_dtpsv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_dtpsv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dtpsv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dtpsv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dtpsv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dtpsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dtpsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_dtpsv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_dtpsv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_dtpsv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_dtpsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, X, 1 ); + chkxer(); + cblas_info = 8; 
RowMajorStrg = TRUE; + cblas_dtpsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_dger",10)==0) { + cblas_rout = "cblas_dger"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_dger(INVALID, 0, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dger(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dger(CblasColMajor, 0, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dger(CblasColMajor, 0, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dger(CblasColMajor, 0, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dger(CblasColMajor, 2, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_dger(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_dger(CblasRowMajor, 0, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dger(CblasRowMajor, 0, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dger(CblasRowMajor, 0, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dger(CblasRowMajor, 0, 2, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + } else if (strncmp( sf,"cblas_dsyr2",11)==0) { + cblas_rout = "cblas_dsyr2"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_dsyr2(INVALID, CblasUpper, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dsyr2(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dsyr2(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dsyr2(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dsyr2(CblasColMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dsyr2(CblasColMajor, CblasUpper, 2, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_dsyr2(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_dsyr2(CblasRowMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dsyr2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dsyr2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dsyr2(CblasRowMajor, CblasUpper, 2, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + } else if (strncmp( sf,"cblas_dspr2",11)==0) { + cblas_rout = "cblas_dspr2"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_dspr2(INVALID, CblasUpper, 0, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dspr2(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dspr2(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dspr2(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dspr2(CblasColMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A ); + 
chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_dspr2(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_dspr2(CblasRowMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dspr2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dspr2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A ); + chkxer(); + } else if (strncmp( sf,"cblas_dsyr",10)==0) { + cblas_rout = "cblas_dsyr"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_dsyr(INVALID, CblasUpper, 0, ALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dsyr(CblasColMajor, INVALID, 0, ALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dsyr(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dsyr(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dsyr(CblasColMajor, CblasUpper, 2, ALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_dsyr(CblasRowMajor, INVALID, 0, ALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_dsyr(CblasRowMajor, CblasUpper, INVALID, ALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dsyr(CblasRowMajor, CblasUpper, 0, ALPHA, X, 0, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dsyr(CblasRowMajor, CblasUpper, 2, ALPHA, X, 1, A, 1 ); + chkxer(); + } else if (strncmp( sf,"cblas_dspr",10)==0) { + cblas_rout = "cblas_dspr"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_dspr(INVALID, CblasUpper, 0, ALPHA, X, 1, A ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dspr(CblasColMajor, INVALID, 0, ALPHA, X, 1, A ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dspr(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, A ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dspr(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, A ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dspr(CblasColMajor, INVALID, 0, ALPHA, X, 1, A ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dspr(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, A ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dspr(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, A ); + chkxer(); + } + if (cblas_ok == TRUE) + printf(" %-12s PASSED THE TESTS OF ERROR-EXITS\n", cblas_rout); + else + printf("******* %s FAILED THE TESTS OF ERROR-EXITS *******\n",cblas_rout); +} diff --git a/ctest/c_d3chke.c b/ctest/c_d3chke.c new file mode 100644 index 0000000000..1149475ab3 --- /dev/null +++ b/ctest/c_d3chke.c @@ -0,0 +1,1271 @@ +#include +#include +#include "common.h" +#include "cblas_test.h" + +int cblas_ok, cblas_lerr, cblas_info; +int link_xerbla=TRUE; +char *cblas_rout; + +#ifdef F77_Char +void F77_xerbla(F77_Char F77_srname, void *vinfo); +#else +void F77_xerbla(char *srname, void *vinfo); +#endif + +void chkxer(void) { + extern int cblas_ok, cblas_lerr, cblas_info; + extern int link_xerbla; + extern char *cblas_rout; + if (cblas_lerr == 1 ) { + printf("***** ILLEGAL VALUE OF PARAMETER NUMBER %d NOT DETECTED BY %s *****\n", cblas_info, cblas_rout); + cblas_ok = 0 ; + } + cblas_lerr = 1 ; +} + +void F77_d3chke(char *rout) { + char *sf = ( rout ) ; + double A[2] = {0.0,0.0}, + B[2] = {0.0,0.0}, + C[2] = {0.0,0.0}, + ALPHA=0.0, BETA=0.0; + extern int 
cblas_info, cblas_lerr, cblas_ok; + extern int RowMajorStrg; + extern char *cblas_rout; + + if (link_xerbla) /* call these first to link */ + { + cblas_xerbla(cblas_info,cblas_rout,""); + F77_xerbla(cblas_rout,&cblas_info); + } + + cblas_ok = TRUE ; + cblas_lerr = PASSED ; + + if (strncmp( sf,"cblas_dgemm" ,11)==0) { + cblas_rout = "cblas_dgemm" ; + + cblas_info = 1; + cblas_dgemm( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_dgemm( INVALID, CblasNoTrans, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_dgemm( INVALID, CblasTrans, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_dgemm( INVALID, CblasTrans, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, INVALID, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, INVALID, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_dgemm( 
CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_dgemm( CblasColMajor, CblasTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + 
cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_dgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_dsymm" ,11)==0) { + cblas_rout = "cblas_dsymm" ; + + cblas_info = 1; + cblas_dsymm( INVALID, CblasRight, CblasLower, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, INVALID, CblasUpper, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasLeft, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasLeft, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasRight, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasLeft, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasRight, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasLeft, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasRight, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasLeft, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasRight, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dsymm( 
CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasRight, CblasUpper, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasRight, CblasLower, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasRight, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_dsymm( CblasColMajor, CblasRight, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasLeft, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasRight, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasLeft, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasRight, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasLeft, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasRight, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasLeft, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasRight, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + 
cblas_dsymm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_dsymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_dtrmm" ,11)==0) { + cblas_rout = "cblas_dtrmm" ; + + cblas_info = 1; + cblas_dtrmm( INVALID, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, CblasUpper, INVALID, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + INVALID, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dtrmm( 
CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); 
+ cblas_info = 12; RowMajorStrg = FALSE; + cblas_dtrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasLeft, 
CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_dtrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_dtrsm" ,11)==0) { + cblas_rout = "cblas_dtrsm" ; + + cblas_info = 1; + cblas_dtrsm( INVALID, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, CblasUpper, INVALID, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + INVALID, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + 
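
Each of the error-exit probes in this checker follows the same handshake: the expected position of the offending argument is stored in cblas_info, RowMajorStrg records whether the row-major path is allowed to remap that position, the routine is called with exactly one invalid argument, and chkxer() verifies that the substituted error handler actually fired and flagged the expected argument. A minimal self-contained sketch of that handshake follows; the names mirror the tester's globals, but the bodies are simplified assumptions, not the imported implementation.

    /* Sketch only: the cblas_info / chkxer handshake used by the probes above.
       Names mirror the tester's globals; the logic here is assumed and simplified. */
    #include <stdio.h>
    #include <string.h>

    static int         cblas_info;   /* expected position of the bad argument            */
    static int         cblas_lerr;   /* 1 = no error exit seen yet, 0 = handler fired    */
    static int         cblas_ok;     /* sticky pass/fail flag for the routine under test */
    static const char *cblas_rout;   /* name of the routine under test                   */

    /* stand-in for the overriding error handler: record what was reported */
    static void xerbla_stub(int info, const char *rout)
    {
        cblas_lerr = 0;                                  /* an error exit was taken */
        if (info != cblas_info || strcmp(rout, cblas_rout) != 0)
            cblas_ok = 0;                                /* wrong argument/routine  */
    }

    /* chkxer-style check: complain if no error exit was reported, then re-arm */
    static void chkxer_sketch(void)
    {
        if (cblas_lerr != 0) {
            printf(" *** %s did not take the expected error exit\n", cblas_rout);
            cblas_ok = 0;
        }
        cblas_lerr = 1;
    }

    int main(void)
    {
        cblas_ok = 1; cblas_lerr = 1;
        cblas_rout = "cblas_dgemm";
        cblas_info = 4;                     /* pretend argument 4 (M) is the invalid one */
        xerbla_stub(4, "cblas_dgemm");      /* simulate the library flagging argument 4  */
        chkxer_sketch();
        printf("%s\n", cblas_ok ? "sketch: PASS" : "sketch: FAIL");
        return 0;
    }
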
cblas_info = 6; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasRight, 
CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_dtrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + 
chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_dtrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_dsyrk" ,11)==0) { + cblas_rout = "cblas_dsyrk" ; + + cblas_info = 1; + cblas_dsyrk( INVALID, CblasUpper, CblasNoTrans, + 0, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dsyrk( CblasColMajor, INVALID, CblasNoTrans, + 0, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dsyrk( CblasColMajor, CblasUpper, INVALID, + 0, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dsyrk( CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dsyrk( CblasColMajor, CblasUpper, CblasTrans, + INVALID, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dsyrk( CblasColMajor, CblasLower, CblasNoTrans, + INVALID, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = 
FALSE; + cblas_dsyrk( CblasColMajor, CblasLower, CblasTrans, + INVALID, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dsyrk( CblasColMajor, CblasUpper, CblasNoTrans, + 0, INVALID, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dsyrk( CblasColMajor, CblasUpper, CblasTrans, + 0, INVALID, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dsyrk( CblasColMajor, CblasLower, CblasNoTrans, + 0, INVALID, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dsyrk( CblasColMajor, CblasLower, CblasTrans, + 0, INVALID, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dsyrk( CblasRowMajor, CblasUpper, CblasNoTrans, + 0, 2, ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dsyrk( CblasRowMajor, CblasUpper, CblasTrans, + 2, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dsyrk( CblasRowMajor, CblasLower, CblasNoTrans, + 0, 2, ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dsyrk( CblasRowMajor, CblasLower, CblasTrans, + 2, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dsyrk( CblasColMajor, CblasUpper, CblasNoTrans, + 2, 0, ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dsyrk( CblasColMajor, CblasUpper, CblasTrans, + 0, 2, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dsyrk( CblasColMajor, CblasLower, CblasNoTrans, + 2, 0, ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dsyrk( CblasColMajor, CblasLower, CblasTrans, + 0, 2, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_dsyrk( CblasRowMajor, CblasUpper, CblasNoTrans, + 2, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_dsyrk( CblasRowMajor, CblasUpper, CblasTrans, + 2, 0, ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_dsyrk( CblasRowMajor, CblasLower, CblasNoTrans, + 2, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_dsyrk( CblasRowMajor, CblasLower, CblasTrans, + 2, 0, ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_dsyrk( CblasColMajor, CblasUpper, CblasNoTrans, + 2, 0, ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_dsyrk( CblasColMajor, CblasUpper, CblasTrans, + 2, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_dsyrk( CblasColMajor, CblasLower, CblasNoTrans, + 2, 0, ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_dsyrk( CblasColMajor, CblasLower, CblasTrans, + 2, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_dsyr2k" ,12)==0) { + cblas_rout = "cblas_dsyr2k" ; + + cblas_info = 1; + cblas_dsyr2k( INVALID, CblasUpper, CblasNoTrans, + 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, INVALID, CblasNoTrans, + 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasUpper, INVALID, + 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, 
CblasUpper, CblasNoTrans, + INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasUpper, CblasTrans, + INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasLower, CblasNoTrans, + INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasLower, CblasTrans, + INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasUpper, CblasNoTrans, + 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasUpper, CblasTrans, + 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasLower, CblasNoTrans, + 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasLower, CblasTrans, + 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dsyr2k( CblasRowMajor, CblasUpper, CblasNoTrans, + 0, 2, ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dsyr2k( CblasRowMajor, CblasUpper, CblasTrans, + 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dsyr2k( CblasRowMajor, CblasLower, CblasNoTrans, + 0, 2, ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_dsyr2k( CblasRowMajor, CblasLower, CblasTrans, + 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasUpper, CblasNoTrans, + 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasUpper, CblasTrans, + 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasLower, CblasNoTrans, + 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasLower, CblasTrans, + 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dsyr2k( CblasRowMajor, CblasUpper, CblasNoTrans, + 0, 2, ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dsyr2k( CblasRowMajor, CblasUpper, CblasTrans, + 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dsyr2k( CblasRowMajor, CblasLower, CblasNoTrans, + 0, 2, ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_dsyr2k( CblasRowMajor, CblasLower, CblasTrans, + 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasUpper, CblasNoTrans, + 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasUpper, CblasTrans, + 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasLower, CblasNoTrans, + 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasLower, CblasTrans, + 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + 
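
The link_xerbla branch at the top of this checker calls cblas_xerbla and F77_xerbla once purely so that the linker pulls the tester's own error handler into the image in place of the library's aborting default; every chkxer() probe then relies on that substituted handler. A rough illustration of the override idea is sketched below; the trailing-underscore symbol name and the extra string-length argument are assumptions about a common Fortran calling convention, not declarations taken from the imported files.

    /* Sketch: supply a same-named error handler so the linker resolves the BLAS
       error exit to this recording stub rather than an aborting default.
       The symbol name and hidden length argument are assumed conventions. */
    #include <stdio.h>

    static int last_bad_arg = 0;            /* argument position last reported */

    void xerbla_(char *srname, int *info, int srname_len)
    {
        (void)srname; (void)srname_len;
        last_bad_arg = *info;               /* record instead of aborting      */
    }

    int main(void)
    {
        char name[] = "DGEMM ";
        int  info   = 9;                    /* what a failing call might report */
        xerbla_(name, &info, 6);
        printf("error handler recorded argument %d\n", last_bad_arg);
        return 0;
    }
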
cblas_info = 13; RowMajorStrg = TRUE; + cblas_dsyr2k( CblasRowMajor, CblasUpper, CblasNoTrans, + 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_dsyr2k( CblasRowMajor, CblasUpper, CblasTrans, + 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_dsyr2k( CblasRowMajor, CblasLower, CblasNoTrans, + 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_dsyr2k( CblasRowMajor, CblasLower, CblasTrans, + 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasUpper, CblasNoTrans, + 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasUpper, CblasTrans, + 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasLower, CblasNoTrans, + 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_dsyr2k( CblasColMajor, CblasLower, CblasTrans, + 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + } + if (cblas_ok == TRUE ) + printf(" %-12s PASSED THE TESTS OF ERROR-EXITS\n", cblas_rout); + else + printf("***** %s FAILED THE TESTS OF ERROR-EXITS *******\n",cblas_rout); +} diff --git a/ctest/c_dblas1.c b/ctest/c_dblas1.c new file mode 100644 index 0000000000..2371d338b4 --- /dev/null +++ b/ctest/c_dblas1.c @@ -0,0 +1,84 @@ +/* + * c_dblas1.c + * + * The program is a C wrapper for dcblat1. + * + * Written by Keita Teranishi. 2/11/1998 + * + */ +#include "common.h" +#include "cblas_test.h" + +double F77_dasum(const int *N, double *X, const int *incX) +{ + return cblas_dasum(*N, X, *incX); +} + +void F77_daxpy(const int *N, const double *alpha, const double *X, + const int *incX, double *Y, const int *incY) +{ + cblas_daxpy(*N, *alpha, X, *incX, Y, *incY); + return; +} + +void F77_dcopy(const int *N, double *X, const int *incX, + double *Y, const int *incY) +{ + cblas_dcopy(*N, X, *incX, Y, *incY); + return; +} + +double F77_ddot(const int *N, const double *X, const int *incX, + const double *Y, const int *incY) +{ + return cblas_ddot(*N, X, *incX, Y, *incY); +} + +double F77_dnrm2(const int *N, const double *X, const int *incX) +{ + return cblas_dnrm2(*N, X, *incX); +} + +void F77_drotg( double *a, double *b, double *c, double *s) +{ + cblas_drotg(a,b,c,s); + return; +} + +void F77_drot( const int *N, double *X, const int *incX, double *Y, + const int *incY, const double *c, const double *s) +{ + + cblas_drot(*N,X,*incX,Y,*incY,*c,*s); + return; +} + +void F77_dscal(const int *N, const double *alpha, double *X, + const int *incX) +{ + cblas_dscal(*N, *alpha, X, *incX); + return; +} + +void F77_dswap( const int *N, double *X, const int *incX, + double *Y, const int *incY) +{ + cblas_dswap(*N,X,*incX,Y,*incY); + return; +} + +double F77_dzasum(const int *N, void *X, const int *incX) +{ + return cblas_dzasum(*N, X, *incX); +} + +double F77_dznrm2(const int *N, const void *X, const int *incX) +{ + return cblas_dznrm2(*N, X, *incX); +} + +int F77_idamax(const int *N, const double *X, const int *incX) +{ + if (*N < 1 || *incX < 1) return(0); + return (cblas_idamax(*N, X, *incX)+1); +} diff --git a/ctest/c_dblas2.c b/ctest/c_dblas2.c new file mode 100644 index 0000000000..ed68402d17 --- /dev/null +++ b/ctest/c_dblas2.c @@ -0,0 +1,583 @@ +/* + * Written by D.P. Manley, Digital Equipment Corporation. 
+ * Prefixed "C_" to BLAS routines and their declarations. + * + * Modified by T. H. Do, 1/23/98, SGI/CRAY Research. + */ +#include +#include "common.h" +#include "cblas_test.h" + +void F77_dgemv(int *order, char *transp, int *m, int *n, double *alpha, + double *a, int *lda, double *x, int *incx, double *beta, + double *y, int *incy ) { + + double *A; + int i,j,LDA; + enum CBLAS_TRANSPOSE trans; + + get_transpose_type(transp, &trans); + if (*order == TEST_ROW_MJR) { + LDA = *n+1; + A = ( double* )malloc( (*m)*LDA*sizeof( double ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) + A[ LDA*i+j ]=a[ (*lda)*j+i ]; + cblas_dgemv( CblasRowMajor, trans, + *m, *n, *alpha, A, LDA, x, *incx, *beta, y, *incy ); + free(A); + } + else if (*order == TEST_COL_MJR) + cblas_dgemv( CblasColMajor, trans, + *m, *n, *alpha, a, *lda, x, *incx, *beta, y, *incy ); + else + cblas_dgemv( UNDEFINED, trans, + *m, *n, *alpha, a, *lda, x, *incx, *beta, y, *incy ); +} + +void F77_dger(int *order, int *m, int *n, double *alpha, double *x, int *incx, + double *y, int *incy, double *a, int *lda ) { + + double *A; + int i,j,LDA; + + if (*order == TEST_ROW_MJR) { + LDA = *n+1; + A = ( double* )malloc( (*m)*LDA*sizeof( double ) ); + + for( i=0; i<*m; i++ ) { + for( j=0; j<*n; j++ ) + A[ LDA*i+j ]=a[ (*lda)*j+i ]; + } + + cblas_dger(CblasRowMajor, *m, *n, *alpha, x, *incx, y, *incy, A, LDA ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) + a[ (*lda)*j+i ]=A[ LDA*i+j ]; + free(A); + } + else + cblas_dger( CblasColMajor, *m, *n, *alpha, x, *incx, y, *incy, a, *lda ); +} + +void F77_dtrmv(int *order, char *uplow, char *transp, char *diagn, + int *n, double *a, int *lda, double *x, int *incx) { + double *A; + int i,j,LDA; + enum CBLAS_TRANSPOSE trans; + enum CBLAS_UPLO uplo; + enum CBLAS_DIAG diag; + + get_transpose_type(transp,&trans); + get_uplo_type(uplow,&uplo); + get_diag_type(diagn,&diag); + + if (*order == TEST_ROW_MJR) { + LDA = *n+1; + A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[ LDA*i+j ]=a[ (*lda)*j+i ]; + cblas_dtrmv(CblasRowMajor, uplo, trans, diag, *n, A, LDA, x, *incx); + free(A); + } + else if (*order == TEST_COL_MJR) + cblas_dtrmv(CblasColMajor, uplo, trans, diag, *n, a, *lda, x, *incx); + else { + cblas_dtrmv(UNDEFINED, uplo, trans, diag, *n, a, *lda, x, *incx); + } +} + +void F77_dtrsv(int *order, char *uplow, char *transp, char *diagn, + int *n, double *a, int *lda, double *x, int *incx ) { + double *A; + int i,j,LDA; + enum CBLAS_TRANSPOSE trans; + enum CBLAS_UPLO uplo; + enum CBLAS_DIAG diag; + + get_transpose_type(transp,&trans); + get_uplo_type(uplow,&uplo); + get_diag_type(diagn,&diag); + + if (*order == TEST_ROW_MJR) { + LDA = *n+1; + A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[ LDA*i+j ]=a[ (*lda)*j+i ]; + cblas_dtrsv(CblasRowMajor, uplo, trans, diag, *n, A, LDA, x, *incx ); + free(A); + } + else + cblas_dtrsv(CblasColMajor, uplo, trans, diag, *n, a, *lda, x, *incx ); +} +void F77_dsymv(int *order, char *uplow, int *n, double *alpha, double *a, + int *lda, double *x, int *incx, double *beta, double *y, + int *incy) { + double *A; + int i,j,LDA; + enum CBLAS_UPLO uplo; + + get_uplo_type(uplow,&uplo); + + if (*order == TEST_ROW_MJR) { + LDA = *n+1; + A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[ LDA*i+j ]=a[ (*lda)*j+i ]; + cblas_dsymv(CblasRowMajor, uplo, *n, *alpha, A, LDA, x, *incx, + *beta, y, *incy ); + free(A); + } + else 
+ cblas_dsymv(CblasColMajor, uplo, *n, *alpha, a, *lda, x, *incx, + *beta, y, *incy ); +} + +void F77_dsyr(int *order, char *uplow, int *n, double *alpha, double *x, + int *incx, double *a, int *lda) { + double *A; + int i,j,LDA; + enum CBLAS_UPLO uplo; + + get_uplo_type(uplow,&uplo); + + if (*order == TEST_ROW_MJR) { + LDA = *n+1; + A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[ LDA*i+j ]=a[ (*lda)*j+i ]; + cblas_dsyr(CblasRowMajor, uplo, *n, *alpha, x, *incx, A, LDA); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + a[ (*lda)*j+i ]=A[ LDA*i+j ]; + free(A); + } + else + cblas_dsyr(CblasColMajor, uplo, *n, *alpha, x, *incx, a, *lda); +} + +void F77_dsyr2(int *order, char *uplow, int *n, double *alpha, double *x, + int *incx, double *y, int *incy, double *a, int *lda) { + double *A; + int i,j,LDA; + enum CBLAS_UPLO uplo; + + get_uplo_type(uplow,&uplo); + + if (*order == TEST_ROW_MJR) { + LDA = *n+1; + A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[ LDA*i+j ]=a[ (*lda)*j+i ]; + cblas_dsyr2(CblasRowMajor, uplo, *n, *alpha, x, *incx, y, *incy, A, LDA); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + a[ (*lda)*j+i ]=A[ LDA*i+j ]; + free(A); + } + else + cblas_dsyr2(CblasColMajor, uplo, *n, *alpha, x, *incx, y, *incy, a, *lda); +} + +void F77_dgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku, + double *alpha, double *a, int *lda, double *x, int *incx, + double *beta, double *y, int *incy ) { + + double *A; + int i,irow,j,jcol,LDA; + enum CBLAS_TRANSPOSE trans; + + get_transpose_type(transp, &trans); + + if (*order == TEST_ROW_MJR) { + LDA = *ku+*kl+2; + A = ( double* )malloc( (*n+*kl)*LDA*sizeof( double ) ); + for( i=0; i<*ku; i++ ){ + irow=*ku+*kl-i; + jcol=(*ku)-i; + for( j=jcol; j<*n; j++ ) + A[ LDA*(j-jcol)+irow ]=a[ (*lda)*j+i ]; + } + i=*ku; + irow=*ku+*kl-i; + for( j=0; j<*n; j++ ) + A[ LDA*j+irow ]=a[ (*lda)*j+i ]; + for( i=*ku+1; i<*ku+*kl+1; i++ ){ + irow=*ku+*kl-i; + jcol=i-(*ku); + for( j=jcol; j<(*n+*kl); j++ ) + A[ LDA*j+irow ]=a[ (*lda)*(j-jcol)+i ]; + } + cblas_dgbmv( CblasRowMajor, trans, *m, *n, *kl, *ku, *alpha, + A, LDA, x, *incx, *beta, y, *incy ); + free(A); + } + else + cblas_dgbmv( CblasColMajor, trans, *m, *n, *kl, *ku, *alpha, + a, *lda, x, *incx, *beta, y, *incy ); +} + +void F77_dtbmv(int *order, char *uplow, char *transp, char *diagn, + int *n, int *k, double *a, int *lda, double *x, int *incx) { + double *A; + int irow, jcol, i, j, LDA; + enum CBLAS_TRANSPOSE trans; + enum CBLAS_UPLO uplo; + enum CBLAS_DIAG diag; + + get_transpose_type(transp,&trans); + get_uplo_type(uplow,&uplo); + get_diag_type(diagn,&diag); + + if (*order == TEST_ROW_MJR) { + LDA = *k+1; + A = ( double* )malloc( (*n+*k)*LDA*sizeof( double ) ); + if (uplo == CblasUpper) { + for( i=0; i<*k; i++ ){ + irow=*k-i; + jcol=(*k)-i; + for( j=jcol; j<*n; j++ ) + A[ LDA*(j-jcol)+irow ]=a[ (*lda)*j+i ]; + } + i=*k; + irow=*k-i; + for( j=0; j<*n; j++ ) + A[ LDA*j+irow ]=a[ (*lda)*j+i ]; + } + else { + i=0; + irow=*k-i; + for( j=0; j<*n; j++ ) + A[ LDA*j+irow ]=a[ (*lda)*j+i ]; + for( i=1; i<*k+1; i++ ){ + irow=*k-i; + jcol=i; + for( j=jcol; j<(*n+*k); j++ ) + A[ LDA*j+irow ]=a[ (*lda)*(j-jcol)+i ]; + } + } + cblas_dtbmv(CblasRowMajor, uplo, trans, diag, *n, *k, A, LDA, x, *incx); + free(A); + } + else + cblas_dtbmv(CblasColMajor, uplo, trans, diag, *n, *k, a, *lda, x, *incx); +} + +void F77_dtbsv(int *order, char *uplow, char *transp, char *diagn, + int *n, int *k, double *a, int 
*lda, double *x, int *incx) { + double *A; + int irow, jcol, i, j, LDA; + enum CBLAS_TRANSPOSE trans; + enum CBLAS_UPLO uplo; + enum CBLAS_DIAG diag; + + get_transpose_type(transp,&trans); + get_uplo_type(uplow,&uplo); + get_diag_type(diagn,&diag); + + if (*order == TEST_ROW_MJR) { + LDA = *k+1; + A = ( double* )malloc( (*n+*k)*LDA*sizeof( double ) ); + if (uplo == CblasUpper) { + for( i=0; i<*k; i++ ){ + irow=*k-i; + jcol=(*k)-i; + for( j=jcol; j<*n; j++ ) + A[ LDA*(j-jcol)+irow ]=a[ (*lda)*j+i ]; + } + i=*k; + irow=*k-i; + for( j=0; j<*n; j++ ) + A[ LDA*j+irow ]=a[ (*lda)*j+i ]; + } + else { + i=0; + irow=*k-i; + for( j=0; j<*n; j++ ) + A[ LDA*j+irow ]=a[ (*lda)*j+i ]; + for( i=1; i<*k+1; i++ ){ + irow=*k-i; + jcol=i; + for( j=jcol; j<(*n+*k); j++ ) + A[ LDA*j+irow ]=a[ (*lda)*(j-jcol)+i ]; + } + } + cblas_dtbsv(CblasRowMajor, uplo, trans, diag, *n, *k, A, LDA, x, *incx); + free(A); + } + else + cblas_dtbsv(CblasColMajor, uplo, trans, diag, *n, *k, a, *lda, x, *incx); +} + +void F77_dsbmv(int *order, char *uplow, int *n, int *k, double *alpha, + double *a, int *lda, double *x, int *incx, double *beta, + double *y, int *incy) { + double *A; + int i,j,irow,jcol,LDA; + enum CBLAS_UPLO uplo; + + get_uplo_type(uplow,&uplo); + + if (*order == TEST_ROW_MJR) { + LDA = *k+1; + A = ( double* )malloc( (*n+*k)*LDA*sizeof( double ) ); + if (uplo == CblasUpper) { + for( i=0; i<*k; i++ ){ + irow=*k-i; + jcol=(*k)-i; + for( j=jcol; j<*n; j++ ) + A[ LDA*(j-jcol)+irow ]=a[ (*lda)*j+i ]; + } + i=*k; + irow=*k-i; + for( j=0; j<*n; j++ ) + A[ LDA*j+irow ]=a[ (*lda)*j+i ]; + } + else { + i=0; + irow=*k-i; + for( j=0; j<*n; j++ ) + A[ LDA*j+irow ]=a[ (*lda)*j+i ]; + for( i=1; i<*k+1; i++ ){ + irow=*k-i; + jcol=i; + for( j=jcol; j<(*n+*k); j++ ) + A[ LDA*j+irow ]=a[ (*lda)*(j-jcol)+i ]; + } + } + cblas_dsbmv(CblasRowMajor, uplo, *n, *k, *alpha, A, LDA, x, *incx, + *beta, y, *incy ); + free(A); + } + else + cblas_dsbmv(CblasColMajor, uplo, *n, *k, *alpha, a, *lda, x, *incx, + *beta, y, *incy ); +} + +void F77_dspmv(int *order, char *uplow, int *n, double *alpha, double *ap, + double *x, int *incx, double *beta, double *y, int *incy) { + double *A,*AP; + int i,j,k,LDA; + enum CBLAS_UPLO uplo; + + get_uplo_type(uplow,&uplo); + + if (*order == TEST_ROW_MJR) { + LDA = *n; + A = ( double* )malloc( LDA*LDA*sizeof( double ) ); + AP = ( double* )malloc( (((LDA+1)*LDA)/2)*sizeof( double ) ); + if (uplo == CblasUpper) { + for( j=0, k=0; j<*n; j++ ) + for( i=0; i +#include "common.h" +#include "cblas_test.h" + +#define TEST_COL_MJR 0 +#define TEST_ROW_MJR 1 +#define UNDEFINED -1 + +void F77_dgemm(int *order, char *transpa, char *transpb, int *m, int *n, + int *k, double *alpha, double *a, int *lda, double *b, int *ldb, + double *beta, double *c, int *ldc ) { + + double *A, *B, *C; + int i,j,LDA, LDB, LDC; + enum CBLAS_TRANSPOSE transa, transb; + + get_transpose_type(transpa, &transa); + get_transpose_type(transpb, &transb); + + if (*order == TEST_ROW_MJR) { + if (transa == CblasNoTrans) { + LDA = *k+1; + A = (double *)malloc( (*m)*LDA*sizeof( double ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*k; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + else { + LDA = *m+1; + A = ( double* )malloc( LDA*(*k)*sizeof( double ) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*m; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + if (transb == CblasNoTrans) { + LDB = *n+1; + B = ( double* )malloc( (*k)*LDB*sizeof( double ) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ) + B[i*LDB+j]=b[j*(*ldb)+i]; + } + else { + LDB = *k+1; + B = ( double* )malloc( 
LDB*(*n)*sizeof( double ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) + B[i*LDB+j]=b[j*(*ldb)+i]; + } + LDC = *n+1; + C = ( double* )malloc( (*m)*LDC*sizeof( double ) ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + C[i*LDC+j]=c[j*(*ldc)+i]; + + cblas_dgemm( CblasRowMajor, transa, transb, *m, *n, *k, *alpha, A, LDA, + B, LDB, *beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + c[j*(*ldc)+i]=C[i*LDC+j]; + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_dgemm( CblasColMajor, transa, transb, *m, *n, *k, *alpha, a, *lda, + b, *ldb, *beta, c, *ldc ); + else + cblas_dgemm( UNDEFINED, transa, transb, *m, *n, *k, *alpha, a, *lda, + b, *ldb, *beta, c, *ldc ); +} +void F77_dsymm(int *order, char *rtlf, char *uplow, int *m, int *n, + double *alpha, double *a, int *lda, double *b, int *ldb, + double *beta, double *c, int *ldc ) { + + double *A, *B, *C; + int i,j,LDA, LDB, LDC; + enum CBLAS_UPLO uplo; + enum CBLAS_SIDE side; + + get_uplo_type(uplow,&uplo); + get_side_type(rtlf,&side); + + if (*order == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A = ( double* )malloc( (*m)*LDA*sizeof( double ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + else{ + LDA = *n+1; + A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + LDB = *n+1; + B = ( double* )malloc( (*m)*LDB*sizeof( double ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) + B[i*LDB+j]=b[j*(*ldb)+i]; + LDC = *n+1; + C = ( double* )malloc( (*m)*LDC*sizeof( double ) ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + C[i*LDC+j]=c[j*(*ldc)+i]; + cblas_dsymm( CblasRowMajor, side, uplo, *m, *n, *alpha, A, LDA, B, LDB, + *beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + c[j*(*ldc)+i]=C[i*LDC+j]; + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_dsymm( CblasColMajor, side, uplo, *m, *n, *alpha, a, *lda, b, *ldb, + *beta, c, *ldc ); + else + cblas_dsymm( UNDEFINED, side, uplo, *m, *n, *alpha, a, *lda, b, *ldb, + *beta, c, *ldc ); +} + +void F77_dsyrk(int *order, char *uplow, char *transp, int *n, int *k, + double *alpha, double *a, int *lda, + double *beta, double *c, int *ldc ) { + + int i,j,LDA,LDC; + double *A, *C; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + + if (*order == TEST_ROW_MJR) { + if (trans == CblasNoTrans) { + LDA = *k+1; + A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + else{ + LDA = *n+1; + A = ( double* )malloc( (*k)*LDA*sizeof( double ) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + LDC = *n+1; + C = ( double* )malloc( (*n)*LDC*sizeof( double ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + C[i*LDC+j]=c[j*(*ldc)+i]; + cblas_dsyrk(CblasRowMajor, uplo, trans, *n, *k, *alpha, A, LDA, *beta, + C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*n; i++ ) + c[j*(*ldc)+i]=C[i*LDC+j]; + free(A); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_dsyrk(CblasColMajor, uplo, trans, *n, *k, *alpha, a, *lda, *beta, + c, *ldc ); + else + cblas_dsyrk(UNDEFINED, uplo, trans, *n, *k, *alpha, a, *lda, *beta, + c, *ldc ); +} + +void F77_dsyr2k(int *order, char *uplow, char *transp, int *n, int *k, + double *alpha, double *a, int *lda, double *b, int *ldb, + double *beta, double *c, int *ldc ) { + int 
i,j,LDA,LDB,LDC; + double *A, *B, *C; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + + if (*order == TEST_ROW_MJR) { + if (trans == CblasNoTrans) { + LDA = *k+1; + LDB = *k+1; + A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + B = ( double* )malloc( (*n)*LDB*sizeof( double ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j]=a[j*(*lda)+i]; + B[i*LDB+j]=b[j*(*ldb)+i]; + } + } + else { + LDA = *n+1; + LDB = *n+1; + A = ( double* )malloc( LDA*(*k)*sizeof( double ) ); + B = ( double* )malloc( LDB*(*k)*sizeof( double ) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ){ + A[i*LDA+j]=a[j*(*lda)+i]; + B[i*LDB+j]=b[j*(*ldb)+i]; + } + } + LDC = *n+1; + C = ( double* )malloc( (*n)*LDC*sizeof( double ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + C[i*LDC+j]=c[j*(*ldc)+i]; + cblas_dsyr2k(CblasRowMajor, uplo, trans, *n, *k, *alpha, A, LDA, + B, LDB, *beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*n; i++ ) + c[j*(*ldc)+i]=C[i*LDC+j]; + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_dsyr2k(CblasColMajor, uplo, trans, *n, *k, *alpha, a, *lda, + b, *ldb, *beta, c, *ldc ); + else + cblas_dsyr2k(UNDEFINED, uplo, trans, *n, *k, *alpha, a, *lda, + b, *ldb, *beta, c, *ldc ); +} +void F77_dtrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, + int *m, int *n, double *alpha, double *a, int *lda, double *b, + int *ldb) { + int i,j,LDA,LDB; + double *A, *B; + enum CBLAS_SIDE side; + enum CBLAS_DIAG diag; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + get_diag_type(diagn,&diag); + get_side_type(rtlf,&side); + + if (*order == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A = ( double* )malloc( (*m)*LDA*sizeof( double ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + else{ + LDA = *n+1; + A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + LDB = *n+1; + B = ( double* )malloc( (*m)*LDB*sizeof( double ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) + B[i*LDB+j]=b[j*(*ldb)+i]; + cblas_dtrmm(CblasRowMajor, side, uplo, trans, diag, *m, *n, *alpha, + A, LDA, B, LDB ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + b[j*(*ldb)+i]=B[i*LDB+j]; + free(A); + free(B); + } + else if (*order == TEST_COL_MJR) + cblas_dtrmm(CblasColMajor, side, uplo, trans, diag, *m, *n, *alpha, + a, *lda, b, *ldb); + else + cblas_dtrmm(UNDEFINED, side, uplo, trans, diag, *m, *n, *alpha, + a, *lda, b, *ldb); +} + +void F77_dtrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, + int *m, int *n, double *alpha, double *a, int *lda, double *b, + int *ldb) { + int i,j,LDA,LDB; + double *A, *B; + enum CBLAS_SIDE side; + enum CBLAS_DIAG diag; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + get_diag_type(diagn,&diag); + get_side_type(rtlf,&side); + + if (*order == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A = ( double* )malloc( (*m)*LDA*sizeof( double ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + else{ + LDA = *n+1; + A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + LDB = *n+1; + B = ( double* )malloc( (*m)*LDB*sizeof( double ) ); + for( i=0; i<*m; i++ ) + 
for( j=0; j<*n; j++ ) + B[i*LDB+j]=b[j*(*ldb)+i]; + cblas_dtrsm(CblasRowMajor, side, uplo, trans, diag, *m, *n, *alpha, + A, LDA, B, LDB ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + b[j*(*ldb)+i]=B[i*LDB+j]; + free(A); + free(B); + } + else if (*order == TEST_COL_MJR) + cblas_dtrsm(CblasColMajor, side, uplo, trans, diag, *m, *n, *alpha, + a, *lda, b, *ldb); + else + cblas_dtrsm(UNDEFINED, side, uplo, trans, diag, *m, *n, *alpha, + a, *lda, b, *ldb); +} diff --git a/ctest/c_dblat1.f b/ctest/c_dblat1.f new file mode 100644 index 0000000000..63e1ed805e --- /dev/null +++ b/ctest/c_dblat1.f @@ -0,0 +1,728 @@ + PROGRAM DCBLAT1 +* Test program for the DOUBLE PRECISION Level 1 CBLAS. +* Based upon the original CBLAS test routine together with: +* F06EAF Example Program Text +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + DOUBLE PRECISION SFAC + INTEGER IC +* .. External Subroutines .. + EXTERNAL CHECK0, CHECK1, CHECK2, CHECK3, HEADER +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA SFAC/9.765625D-4/ +* .. Executable Statements .. + WRITE (NOUT,99999) + DO 20 IC = 1, 10 + ICASE = IC + CALL HEADER +* +* .. Initialize PASS, INCX, INCY, and MODE for a new case. .. +* .. the value 9999 for INCX, INCY or MODE will appear in the .. +* .. detailed output, if any, for cases that do not involve .. +* .. these parameters .. +* + PASS = .TRUE. + INCX = 9999 + INCY = 9999 + MODE = 9999 + IF (ICASE.EQ.3) THEN + CALL CHECK0(SFAC) + ELSE IF (ICASE.EQ.7 .OR. ICASE.EQ.8 .OR. ICASE.EQ.9 .OR. + + ICASE.EQ.10) THEN + CALL CHECK1(SFAC) + ELSE IF (ICASE.EQ.1 .OR. ICASE.EQ.2 .OR. ICASE.EQ.5 .OR. + + ICASE.EQ.6) THEN + CALL CHECK2(SFAC) + ELSE IF (ICASE.EQ.4) THEN + CALL CHECK3(SFAC) + END IF +* -- Print + IF (PASS) WRITE (NOUT,99998) + 20 CONTINUE + STOP +* +99999 FORMAT (' Real CBLAS Test Program Results',/1X) +99998 FORMAT (' ----- PASS -----') + END + SUBROUTINE HEADER +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Arrays .. + CHARACTER*15 L(10) +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA L(1)/'CBLAS_DDOT'/ + DATA L(2)/'CBLAS_DAXPY '/ + DATA L(3)/'CBLAS_DROTG '/ + DATA L(4)/'CBLAS_DROT '/ + DATA L(5)/'CBLAS_DCOPY '/ + DATA L(6)/'CBLAS_DSWAP '/ + DATA L(7)/'CBLAS_DNRM2 '/ + DATA L(8)/'CBLAS_DASUM '/ + DATA L(9)/'CBLAS_DSCAL '/ + DATA L(10)/'CBLAS_IDAMAX'/ +* .. Executable Statements .. + WRITE (NOUT,99999) ICASE, L(ICASE) + RETURN +* +99999 FORMAT (/' Test of subprogram number',I3,9X,A15) + END + SUBROUTINE CHECK0(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + DOUBLE PRECISION SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + DOUBLE PRECISION SA, SB, SC, SS + INTEGER K +* .. Local Arrays .. + DOUBLE PRECISION DA1(8), DATRUE(8), DB1(8), DBTRUE(8), DC1(8), + + DS1(8) +* .. External Subroutines .. + EXTERNAL DROTGTEST, STEST1 +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. 
+ DATA DA1/0.3D0, 0.4D0, -0.3D0, -0.4D0, -0.3D0, 0.0D0, + + 0.0D0, 1.0D0/ + DATA DB1/0.4D0, 0.3D0, 0.4D0, 0.3D0, -0.4D0, 0.0D0, + + 1.0D0, 0.0D0/ + DATA DC1/0.6D0, 0.8D0, -0.6D0, 0.8D0, 0.6D0, 1.0D0, + + 0.0D0, 1.0D0/ + DATA DS1/0.8D0, 0.6D0, 0.8D0, -0.6D0, 0.8D0, 0.0D0, + + 1.0D0, 0.0D0/ + DATA DATRUE/0.5D0, 0.5D0, 0.5D0, -0.5D0, -0.5D0, + + 0.0D0, 1.0D0, 1.0D0/ + DATA DBTRUE/0.0D0, 0.6D0, 0.0D0, -0.6D0, 0.0D0, + + 0.0D0, 1.0D0, 0.0D0/ +* .. Executable Statements .. +* +* Compute true values which cannot be prestored +* in decimal notation +* + DBTRUE(1) = 1.0D0/0.6D0 + DBTRUE(3) = -1.0D0/0.6D0 + DBTRUE(5) = 1.0D0/0.6D0 +* + DO 20 K = 1, 8 +* .. Set N=K for identification in output if any .. + N = K + IF (ICASE.EQ.3) THEN +* .. DROTGTEST .. + IF (K.GT.8) GO TO 40 + SA = DA1(K) + SB = DB1(K) + CALL DROTGTEST(SA,SB,SC,SS) + CALL STEST1(SA,DATRUE(K),DATRUE(K),SFAC) + CALL STEST1(SB,DBTRUE(K),DBTRUE(K),SFAC) + CALL STEST1(SC,DC1(K),DC1(K),SFAC) + CALL STEST1(SS,DS1(K),DS1(K),SFAC) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK0' + STOP + END IF + 20 CONTINUE + 40 RETURN + END + SUBROUTINE CHECK1(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + DOUBLE PRECISION SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + INTEGER I, LEN, NP1 +* .. Local Arrays .. + DOUBLE PRECISION DTRUE1(5), DTRUE3(5), DTRUE5(8,5,2), DV(8,5,2), + + SA(10), STEMP(1), STRUE(8), SX(8) + INTEGER ITRUE2(5) +* .. External Functions .. + DOUBLE PRECISION DASUMTEST, DNRM2TEST + INTEGER IDAMAXTEST + EXTERNAL DASUMTEST, DNRM2TEST, IDAMAXTEST +* .. External Subroutines .. + EXTERNAL ITEST1, DSCALTEST, STEST, STEST1 +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA SA/0.3D0, -1.0D0, 0.0D0, 1.0D0, 0.3D0, 0.3D0, + + 0.3D0, 0.3D0, 0.3D0, 0.3D0/ + DATA DV/0.1D0, 2.0D0, 2.0D0, 2.0D0, 2.0D0, 2.0D0, + + 2.0D0, 2.0D0, 0.3D0, 3.0D0, 3.0D0, 3.0D0, 3.0D0, + + 3.0D0, 3.0D0, 3.0D0, 0.3D0, -0.4D0, 4.0D0, + + 4.0D0, 4.0D0, 4.0D0, 4.0D0, 4.0D0, 0.2D0, + + -0.6D0, 0.3D0, 5.0D0, 5.0D0, 5.0D0, 5.0D0, + + 5.0D0, 0.1D0, -0.3D0, 0.5D0, -0.1D0, 6.0D0, + + 6.0D0, 6.0D0, 6.0D0, 0.1D0, 8.0D0, 8.0D0, 8.0D0, + + 8.0D0, 8.0D0, 8.0D0, 8.0D0, 0.3D0, 9.0D0, 9.0D0, + + 9.0D0, 9.0D0, 9.0D0, 9.0D0, 9.0D0, 0.3D0, 2.0D0, + + -0.4D0, 2.0D0, 2.0D0, 2.0D0, 2.0D0, 2.0D0, + + 0.2D0, 3.0D0, -0.6D0, 5.0D0, 0.3D0, 2.0D0, + + 2.0D0, 2.0D0, 0.1D0, 4.0D0, -0.3D0, 6.0D0, + + -0.5D0, 7.0D0, -0.1D0, 3.0D0/ + DATA DTRUE1/0.0D0, 0.3D0, 0.5D0, 0.7D0, 0.6D0/ + DATA DTRUE3/0.0D0, 0.3D0, 0.7D0, 1.1D0, 1.0D0/ + DATA DTRUE5/0.10D0, 2.0D0, 2.0D0, 2.0D0, 2.0D0, + + 2.0D0, 2.0D0, 2.0D0, -0.3D0, 3.0D0, 3.0D0, + + 3.0D0, 3.0D0, 3.0D0, 3.0D0, 3.0D0, 0.0D0, 0.0D0, + + 4.0D0, 4.0D0, 4.0D0, 4.0D0, 4.0D0, 4.0D0, + + 0.20D0, -0.60D0, 0.30D0, 5.0D0, 5.0D0, 5.0D0, + + 5.0D0, 5.0D0, 0.03D0, -0.09D0, 0.15D0, -0.03D0, + + 6.0D0, 6.0D0, 6.0D0, 6.0D0, 0.10D0, 8.0D0, + + 8.0D0, 8.0D0, 8.0D0, 8.0D0, 8.0D0, 8.0D0, + + 0.09D0, 9.0D0, 9.0D0, 9.0D0, 9.0D0, 9.0D0, + + 9.0D0, 9.0D0, 0.09D0, 2.0D0, -0.12D0, 2.0D0, + + 2.0D0, 2.0D0, 2.0D0, 2.0D0, 0.06D0, 3.0D0, + + -0.18D0, 5.0D0, 0.09D0, 2.0D0, 2.0D0, 2.0D0, + + 0.03D0, 4.0D0, -0.09D0, 6.0D0, -0.15D0, 7.0D0, + + -0.03D0, 3.0D0/ + DATA ITRUE2/0, 1, 2, 2, 3/ +* .. Executable Statements .. + DO 80 INCX = 1, 2 + DO 60 NP1 = 1, 5 + N = NP1 - 1 + LEN = 2*MAX(N,1) +* .. Set vector arguments .. 
+ DO 20 I = 1, LEN + SX(I) = DV(I,NP1,INCX) + 20 CONTINUE +* + IF (ICASE.EQ.7) THEN +* .. DNRM2TEST .. + STEMP(1) = DTRUE1(NP1) + CALL STEST1(DNRM2TEST(N,SX,INCX),STEMP,STEMP,SFAC) + ELSE IF (ICASE.EQ.8) THEN +* .. DASUMTEST .. + STEMP(1) = DTRUE3(NP1) + CALL STEST1(DASUMTEST(N,SX,INCX),STEMP,STEMP,SFAC) + ELSE IF (ICASE.EQ.9) THEN +* .. DSCALTEST .. + CALL DSCALTEST(N,SA((INCX-1)*5+NP1),SX,INCX) + DO 40 I = 1, LEN + STRUE(I) = DTRUE5(I,NP1,INCX) + 40 CONTINUE + CALL STEST(LEN,SX,STRUE,STRUE,SFAC) + ELSE IF (ICASE.EQ.10) THEN +* .. IDAMAXTEST .. + CALL ITEST1(IDAMAXTEST(N,SX,INCX),ITRUE2(NP1)) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' + STOP + END IF + 60 CONTINUE + 80 CONTINUE + RETURN + END + SUBROUTINE CHECK2(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + DOUBLE PRECISION SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + DOUBLE PRECISION SA + INTEGER I, J, KI, KN, KSIZE, LENX, LENY, MX, MY +* .. Local Arrays .. + DOUBLE PRECISION DT10X(7,4,4), DT10Y(7,4,4), DT7(4,4), + + DT8(7,4,4), DX1(7), + + DY1(7), SSIZE1(4), SSIZE2(14,2), STX(7), STY(7), + + SX(7), SY(7) + INTEGER INCXS(4), INCYS(4), LENS(4,2), NS(4) +* .. External Functions .. + EXTERNAL DDOTTEST + DOUBLE PRECISION DDOTTEST +* .. External Subroutines .. + EXTERNAL DAXPYTEST, DCOPYTEST, DSWAPTEST, STEST, STEST1 +* .. Intrinsic Functions .. + INTRINSIC ABS, MIN +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA SA/0.3D0/ + DATA INCXS/1, 2, -2, -1/ + DATA INCYS/1, -2, 1, -2/ + DATA LENS/1, 1, 2, 4, 1, 1, 3, 7/ + DATA NS/0, 1, 2, 4/ + DATA DX1/0.6D0, 0.1D0, -0.5D0, 0.8D0, 0.9D0, -0.3D0, + + -0.4D0/ + DATA DY1/0.5D0, -0.9D0, 0.3D0, 0.7D0, -0.6D0, 0.2D0, + + 0.8D0/ + DATA DT7/0.0D0, 0.30D0, 0.21D0, 0.62D0, 0.0D0, + + 0.30D0, -0.07D0, 0.85D0, 0.0D0, 0.30D0, -0.79D0, + + -0.74D0, 0.0D0, 0.30D0, 0.33D0, 1.27D0/ + DATA DT8/0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.68D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.68D0, -0.87D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.68D0, -0.87D0, 0.15D0, + + 0.94D0, 0.0D0, 0.0D0, 0.0D0, 0.5D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.68D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.35D0, -0.9D0, 0.48D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.38D0, -0.9D0, 0.57D0, 0.7D0, -0.75D0, + + 0.2D0, 0.98D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.68D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.35D0, -0.72D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.38D0, + + -0.63D0, 0.15D0, 0.88D0, 0.0D0, 0.0D0, 0.0D0, + + 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.68D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.68D0, -0.9D0, 0.33D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.68D0, -0.9D0, 0.33D0, 0.7D0, + + -0.75D0, 0.2D0, 1.04D0/ + DATA DT10X/0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.5D0, -0.9D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.5D0, -0.9D0, 0.3D0, 0.7D0, + + 0.0D0, 0.0D0, 0.0D0, 0.6D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.3D0, 0.1D0, 0.5D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.8D0, 0.1D0, -0.6D0, + + 0.8D0, 0.3D0, -0.3D0, 0.5D0, 0.6D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.5D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, -0.9D0, + + 0.1D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.7D0, + + 0.1D0, 0.3D0, 0.8D0, -0.9D0, -0.3D0, 0.5D0, + + 0.6D0, 0.0D0, 0.0D0, 0.0D0, 
0.0D0, 0.0D0, 0.0D0, + + 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.5D0, 0.3D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.5D0, 0.3D0, -0.6D0, 0.8D0, 0.0D0, 0.0D0, + + 0.0D0/ + DATA DT10Y/0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.6D0, 0.1D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.6D0, 0.1D0, -0.5D0, 0.8D0, 0.0D0, + + 0.0D0, 0.0D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, -0.5D0, -0.9D0, 0.6D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, -0.4D0, -0.9D0, 0.9D0, + + 0.7D0, -0.5D0, 0.2D0, 0.6D0, 0.5D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.6D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, -0.5D0, + + 0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + -0.4D0, 0.9D0, -0.5D0, 0.6D0, 0.0D0, 0.0D0, + + 0.0D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.6D0, -0.9D0, 0.1D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.6D0, -0.9D0, 0.1D0, 0.7D0, + + -0.5D0, 0.2D0, 0.8D0/ + DATA SSIZE1/0.0D0, 0.3D0, 1.6D0, 3.2D0/ + DATA SSIZE2/0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, + + 1.17D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, + + 1.17D0, 1.17D0, 1.17D0/ +* .. Executable Statements .. +* + DO 120 KI = 1, 4 + INCX = INCXS(KI) + INCY = INCYS(KI) + MX = ABS(INCX) + MY = ABS(INCY) +* + DO 100 KN = 1, 4 + N = NS(KN) + KSIZE = MIN(2,KN) + LENX = LENS(KN,MX) + LENY = LENS(KN,MY) +* .. Initialize all argument arrays .. + DO 20 I = 1, 7 + SX(I) = DX1(I) + SY(I) = DY1(I) + 20 CONTINUE +* + IF (ICASE.EQ.1) THEN +* .. DDOTTEST .. + CALL STEST1(DDOTTEST(N,SX,INCX,SY,INCY),DT7(KN,KI), + + SSIZE1(KN),SFAC) + ELSE IF (ICASE.EQ.2) THEN +* .. DAXPYTEST .. + CALL DAXPYTEST(N,SA,SX,INCX,SY,INCY) + DO 40 J = 1, LENY + STY(J) = DT8(J,KN,KI) + 40 CONTINUE + CALL STEST(LENY,SY,STY,SSIZE2(1,KSIZE),SFAC) + ELSE IF (ICASE.EQ.5) THEN +* .. DCOPYTEST .. + DO 60 I = 1, 7 + STY(I) = DT10Y(I,KN,KI) + 60 CONTINUE + CALL DCOPYTEST(N,SX,INCX,SY,INCY) + CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0D0) + ELSE IF (ICASE.EQ.6) THEN +* .. DSWAPTEST .. + CALL DSWAPTEST(N,SX,INCX,SY,INCY) + DO 80 I = 1, 7 + STX(I) = DT10X(I,KN,KI) + STY(I) = DT10Y(I,KN,KI) + 80 CONTINUE + CALL STEST(LENX,SX,STX,SSIZE2(1,1),1.0D0) + CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0D0) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' + STOP + END IF + 100 CONTINUE + 120 CONTINUE + RETURN + END + SUBROUTINE CHECK3(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + DOUBLE PRECISION SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + DOUBLE PRECISION SC, SS + INTEGER I, K, KI, KN, KSIZE, LENX, LENY, MX, MY +* .. Local Arrays .. + DOUBLE PRECISION COPYX(5), COPYY(5), DT9X(7,4,4), DT9Y(7,4,4), + + DX1(7), DY1(7), MWPC(11), MWPS(11), MWPSTX(5), + + MWPSTY(5), MWPTX(11,5), MWPTY(11,5), MWPX(5), + + MWPY(5), SSIZE2(14,2), STX(7), STY(7), SX(7), + + SY(7) + INTEGER INCXS(4), INCYS(4), LENS(4,2), MWPINX(11), + + MWPINY(11), MWPN(11), NS(4) +* .. External Subroutines .. + EXTERNAL STEST,DROTTEST +* .. Intrinsic Functions .. + INTRINSIC ABS, MIN +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. 
+ DATA INCXS/1, 2, -2, -1/ + DATA INCYS/1, -2, 1, -2/ + DATA LENS/1, 1, 2, 4, 1, 1, 3, 7/ + DATA NS/0, 1, 2, 4/ + DATA DX1/0.6D0, 0.1D0, -0.5D0, 0.8D0, 0.9D0, -0.3D0, + + -0.4D0/ + DATA DY1/0.5D0, -0.9D0, 0.3D0, 0.7D0, -0.6D0, 0.2D0, + + 0.8D0/ + DATA SC, SS/0.8D0, 0.6D0/ + DATA DT9X/0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.78D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.78D0, -0.46D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.78D0, -0.46D0, -0.22D0, + + 1.06D0, 0.0D0, 0.0D0, 0.0D0, 0.6D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.78D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.66D0, 0.1D0, -0.1D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.96D0, 0.1D0, -0.76D0, 0.8D0, 0.90D0, + + -0.3D0, -0.02D0, 0.6D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.78D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, -0.06D0, 0.1D0, + + -0.1D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.90D0, + + 0.1D0, -0.22D0, 0.8D0, 0.18D0, -0.3D0, -0.02D0, + + 0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.78D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.78D0, 0.26D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.78D0, 0.26D0, -0.76D0, 1.12D0, + + 0.0D0, 0.0D0, 0.0D0/ + DATA DT9Y/0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.04D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.04D0, -0.78D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.04D0, -0.78D0, 0.54D0, + + 0.08D0, 0.0D0, 0.0D0, 0.0D0, 0.5D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.04D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.7D0, + + -0.9D0, -0.12D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.64D0, -0.9D0, -0.30D0, 0.7D0, -0.18D0, 0.2D0, + + 0.28D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.04D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.7D0, -1.08D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.64D0, -1.26D0, + + 0.54D0, 0.20D0, 0.0D0, 0.0D0, 0.0D0, 0.5D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.04D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.04D0, -0.9D0, 0.18D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.04D0, -0.9D0, 0.18D0, 0.7D0, + + -0.18D0, 0.2D0, 0.16D0/ + DATA SSIZE2/0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, + + 1.17D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, + + 1.17D0, 1.17D0, 1.17D0/ +* .. Executable Statements .. +* + DO 60 KI = 1, 4 + INCX = INCXS(KI) + INCY = INCYS(KI) + MX = ABS(INCX) + MY = ABS(INCY) +* + DO 40 KN = 1, 4 + N = NS(KN) + KSIZE = MIN(2,KN) + LENX = LENS(KN,MX) + LENY = LENS(KN,MY) +* + IF (ICASE.EQ.4) THEN +* .. DROTTEST .. 
+ DO 20 I = 1, 7 + SX(I) = DX1(I) + SY(I) = DY1(I) + STX(I) = DT9X(I,KN,KI) + STY(I) = DT9Y(I,KN,KI) + 20 CONTINUE + CALL DROTTEST(N,SX,INCX,SY,INCY,SC,SS) + CALL STEST(LENX,SX,STX,SSIZE2(1,KSIZE),SFAC) + CALL STEST(LENY,SY,STY,SSIZE2(1,KSIZE),SFAC) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK3' + STOP + END IF + 40 CONTINUE + 60 CONTINUE +* + MWPC(1) = 1 + DO 80 I = 2, 11 + MWPC(I) = 0 + 80 CONTINUE + MWPS(1) = 0.0 + DO 100 I = 2, 6 + MWPS(I) = 1.0 + 100 CONTINUE + DO 120 I = 7, 11 + MWPS(I) = -1.0 + 120 CONTINUE + MWPINX(1) = 1 + MWPINX(2) = 1 + MWPINX(3) = 1 + MWPINX(4) = -1 + MWPINX(5) = 1 + MWPINX(6) = -1 + MWPINX(7) = 1 + MWPINX(8) = 1 + MWPINX(9) = -1 + MWPINX(10) = 1 + MWPINX(11) = -1 + MWPINY(1) = 1 + MWPINY(2) = 1 + MWPINY(3) = -1 + MWPINY(4) = -1 + MWPINY(5) = 2 + MWPINY(6) = 1 + MWPINY(7) = 1 + MWPINY(8) = -1 + MWPINY(9) = -1 + MWPINY(10) = 2 + MWPINY(11) = 1 + DO 140 I = 1, 11 + MWPN(I) = 5 + 140 CONTINUE + MWPN(5) = 3 + MWPN(10) = 3 + DO 160 I = 1, 5 + MWPX(I) = I + MWPY(I) = I + MWPTX(1,I) = I + MWPTY(1,I) = I + MWPTX(2,I) = I + MWPTY(2,I) = -I + MWPTX(3,I) = 6 - I + MWPTY(3,I) = I - 6 + MWPTX(4,I) = I + MWPTY(4,I) = -I + MWPTX(6,I) = 6 - I + MWPTY(6,I) = I - 6 + MWPTX(7,I) = -I + MWPTY(7,I) = I + MWPTX(8,I) = I - 6 + MWPTY(8,I) = 6 - I + MWPTX(9,I) = -I + MWPTY(9,I) = I + MWPTX(11,I) = I - 6 + MWPTY(11,I) = 6 - I + 160 CONTINUE + MWPTX(5,1) = 1 + MWPTX(5,2) = 3 + MWPTX(5,3) = 5 + MWPTX(5,4) = 4 + MWPTX(5,5) = 5 + MWPTY(5,1) = -1 + MWPTY(5,2) = 2 + MWPTY(5,3) = -2 + MWPTY(5,4) = 4 + MWPTY(5,5) = -3 + MWPTX(10,1) = -1 + MWPTX(10,2) = -3 + MWPTX(10,3) = -5 + MWPTX(10,4) = 4 + MWPTX(10,5) = 5 + MWPTY(10,1) = 1 + MWPTY(10,2) = 2 + MWPTY(10,3) = 2 + MWPTY(10,4) = 4 + MWPTY(10,5) = 3 + DO 200 I = 1, 11 + INCX = MWPINX(I) + INCY = MWPINY(I) + DO 180 K = 1, 5 + COPYX(K) = MWPX(K) + COPYY(K) = MWPY(K) + MWPSTX(K) = MWPTX(I,K) + MWPSTY(K) = MWPTY(I,K) + 180 CONTINUE + CALL DROTTEST(MWPN(I),COPYX,INCX,COPYY,INCY,MWPC(I),MWPS(I)) + CALL STEST(5,COPYX,MWPSTX,MWPSTX,SFAC) + CALL STEST(5,COPYY,MWPSTY,MWPSTY,SFAC) + 200 CONTINUE + RETURN + END + SUBROUTINE STEST(LEN,SCOMP,STRUE,SSIZE,SFAC) +* ********************************* STEST ************************** +* +* THIS SUBR COMPARES ARRAYS SCOMP() AND STRUE() OF LENGTH LEN TO +* SEE IF THE TERM BY TERM DIFFERENCES, MULTIPLIED BY SFAC, ARE +* NEGLIGIBLE. +* +* C. L. LAWSON, JPL, 1974 DEC 10 +* +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + DOUBLE PRECISION SFAC + INTEGER LEN +* .. Array Arguments .. + DOUBLE PRECISION SCOMP(LEN), SSIZE(LEN), STRUE(LEN) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + DOUBLE PRECISION SD + INTEGER I +* .. External Functions .. + DOUBLE PRECISION SDIFF + EXTERNAL SDIFF +* .. Intrinsic Functions .. + INTRINSIC ABS +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Executable Statements .. +* + DO 40 I = 1, LEN + SD = SCOMP(I) - STRUE(I) + IF (SDIFF(ABS(SSIZE(I))+ABS(SFAC*SD),ABS(SSIZE(I))).EQ.0.0D0) + + GO TO 40 +* +* HERE SCOMP(I) IS NOT CLOSE TO STRUE(I). +* + IF ( .NOT. PASS) GO TO 20 +* PRINT FAIL MESSAGE AND HEADER. + PASS = .FALSE. 
+ WRITE (NOUT,99999) + WRITE (NOUT,99998) + 20 WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, I, SCOMP(I), + + STRUE(I), SD, SSIZE(I) + 40 CONTINUE + RETURN +* +99999 FORMAT (' FAIL') +99998 FORMAT (/' CASE N INCX INCY MODE I ', + + ' COMP(I) TRUE(I) DIFFERENCE', + + ' SIZE(I)',/1X) +99997 FORMAT (1X,I4,I3,3I5,I3,2D36.8,2D12.4) + END + SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) +* ************************* STEST1 ***************************** +* +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE +* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. +* +* C.L. LAWSON, JPL, 1978 DEC 6 +* +* .. Scalar Arguments .. + DOUBLE PRECISION SCOMP1, SFAC, STRUE1 +* .. Array Arguments .. + DOUBLE PRECISION SSIZE(*) +* .. Local Arrays .. + DOUBLE PRECISION SCOMP(1), STRUE(1) +* .. External Subroutines .. + EXTERNAL STEST +* .. Executable Statements .. +* + SCOMP(1) = SCOMP1 + STRUE(1) = STRUE1 + CALL STEST(1,SCOMP,STRUE,SSIZE,SFAC) +* + RETURN + END + DOUBLE PRECISION FUNCTION SDIFF(SA,SB) +* ********************************* SDIFF ************************** +* COMPUTES DIFFERENCE OF TWO NUMBERS. C. L. LAWSON, JPL 1974 FEB 15 +* +* .. Scalar Arguments .. + DOUBLE PRECISION SA, SB +* .. Executable Statements .. + SDIFF = SA - SB + RETURN + END + SUBROUTINE ITEST1(ICOMP,ITRUE) +* ********************************* ITEST1 ************************* +* +* THIS SUBROUTINE COMPARES THE VARIABLES ICOMP AND ITRUE FOR +* EQUALITY. +* C. L. LAWSON, JPL, 1974 DEC 10 +* +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + INTEGER ICOMP, ITRUE +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + INTEGER ID +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Executable Statements .. +* + IF (ICOMP.EQ.ITRUE) GO TO 40 +* +* HERE ICOMP IS NOT EQUAL TO ITRUE. +* + IF ( .NOT. PASS) GO TO 20 +* PRINT FAIL MESSAGE AND HEADER. + PASS = .FALSE. + WRITE (NOUT,99999) + WRITE (NOUT,99998) + 20 ID = ICOMP - ITRUE + WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, ICOMP, ITRUE, ID + 40 CONTINUE + RETURN +* +99999 FORMAT (' FAIL') +99998 FORMAT (/' CASE N INCX INCY MODE ', + + ' COMP TRUE DIFFERENCE', + + /1X) +99997 FORMAT (1X,I4,I3,3I5,2I36,I12) + END diff --git a/ctest/c_dblat2.f b/ctest/c_dblat2.f new file mode 100644 index 0000000000..357816bd3d --- /dev/null +++ b/ctest/c_dblat2.f @@ -0,0 +1,2907 @@ + PROGRAM DBLAT2 +* +* Test program for the DOUBLE PRECISION Level 2 Blas. +* +* The program must be driven by a short data file. The first 17 records +* of the file are read using list-directed input, the last 16 records +* are read using the format ( A12, L2 ). An annotated example of a data +* file can be obtained by deleting the first 3 characters from the +* following 33 lines: +* 'DBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE +* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +* F LOGICAL FLAG, T TO STOP ON FAILURES. +* T LOGICAL FLAG, T TO TEST ERROR EXITS. 
+* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH +* 16.0 THRESHOLD VALUE OF TEST RATIO +* 6 NUMBER OF VALUES OF N +* 0 1 2 3 5 9 VALUES OF N +* 4 NUMBER OF VALUES OF K +* 0 1 2 4 VALUES OF K +* 4 NUMBER OF VALUES OF INCX AND INCY +* 1 2 -1 -2 VALUES OF INCX AND INCY +* 3 NUMBER OF VALUES OF ALPHA +* 0.0 1.0 0.7 VALUES OF ALPHA +* 3 NUMBER OF VALUES OF BETA +* 0.0 1.0 0.9 VALUES OF BETA +* cblas_dgemv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dgbmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dsymv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dsbmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dspmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dtrmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dtbmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dtpmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dtrsv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dtbsv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dtpsv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dger T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dsyr T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dspr T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dsyr2 T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dspr2 T PUT F FOR NO TEST. SAME COLUMNS. +* +* See: +* +* Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. +* An extended set of Fortran Basic Linear Algebra Subprograms. +* +* Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics +* and Computer Science Division, Argonne National Laboratory, +* 9700 South Cass Avenue, Argonne, Illinois 60439, US. +* +* Or +* +* NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms +* Group Ltd., NAG Central Office, 256 Banbury Road, Oxford +* OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st +* Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. +* +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + INTEGER NIN, NOUT + PARAMETER ( NIN = 5, NOUT = 6 ) + INTEGER NSUBS + PARAMETER ( NSUBS = 16 ) + DOUBLE PRECISION ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 ) + INTEGER NMAX, INCMAX + PARAMETER ( NMAX = 65, INCMAX = 2 ) + INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX + PARAMETER ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7, + $ NALMAX = 7, NBEMAX = 7 ) +* .. Local Scalars .. + DOUBLE PRECISION EPS, ERR, THRESH + INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NINC, NKB, + $ NTRA, LAYOUT + LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, + $ TSTERR, CORDER, RORDER + CHARACTER*1 TRANS + CHARACTER*12 SNAMET + CHARACTER*32 SNAPS +* .. Local Arrays .. + DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), + $ ALF( NALMAX ), AS( NMAX*NMAX ), BET( NBEMAX ), + $ G( NMAX ), X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( 2*NMAX ) + INTEGER IDIM( NIDMAX ), INC( NINMAX ), KB( NKBMAX ) + LOGICAL LTEST( NSUBS ) + CHARACTER*12 SNAMES( NSUBS ) +* .. External Functions .. + DOUBLE PRECISION DDIFF + LOGICAL LDE + EXTERNAL DDIFF, LDE +* .. External Subroutines .. + EXTERNAL DCHK1, DCHK2, DCHK3, DCHK4, DCHK5, DCHK6, + $ CD2CHKE, DMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK + CHARACTER*12 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK + COMMON /SRNAMC/SRNAMT +* .. Data statements .. 
+ DATA SNAMES/'cblas_dgemv ', 'cblas_dgbmv ', + $ 'cblas_dsymv ','cblas_dsbmv ','cblas_dspmv ', + $ 'cblas_dtrmv ','cblas_dtbmv ','cblas_dtpmv ', + $ 'cblas_dtrsv ','cblas_dtbsv ','cblas_dtpsv ', + $ 'cblas_dger ','cblas_dsyr ','cblas_dspr ', + $ 'cblas_dsyr2 ','cblas_dspr2 '/ +* .. Executable Statements .. +* + NOUTC = NOUT +* +* Read name and unit number for snapshot output file and open file. +* + READ( NIN, FMT = * )SNAPS + READ( NIN, FMT = * )NTRA + TRACE = NTRA.GE.0 + IF( TRACE )THEN + OPEN( NTRA, FILE = SNAPS ) + END IF +* Read the flag that directs rewinding of the snapshot file. + READ( NIN, FMT = * )REWI + REWI = REWI.AND.TRACE +* Read the flag that directs stopping on any failure. + READ( NIN, FMT = * )SFATAL +* Read the flag that indicates whether error exits are to be tested. + READ( NIN, FMT = * )TSTERR +* Read the flag that indicates whether row-major data layout to be tested. + READ( NIN, FMT = * )LAYOUT +* Read the threshold value of the test ratio + READ( NIN, FMT = * )THRESH +* +* Read and check the parameter values for the tests. +* +* Values of N + READ( NIN, FMT = * )NIDIM + IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN + WRITE( NOUT, FMT = 9997 )'N', NIDMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) + DO 10 I = 1, NIDIM + IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN + WRITE( NOUT, FMT = 9996 )NMAX + GO TO 230 + END IF + 10 CONTINUE +* Values of K + READ( NIN, FMT = * )NKB + IF( NKB.LT.1.OR.NKB.GT.NKBMAX )THEN + WRITE( NOUT, FMT = 9997 )'K', NKBMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( KB( I ), I = 1, NKB ) + DO 20 I = 1, NKB + IF( KB( I ).LT.0 )THEN + WRITE( NOUT, FMT = 9995 ) + GO TO 230 + END IF + 20 CONTINUE +* Values of INCX and INCY + READ( NIN, FMT = * )NINC + IF( NINC.LT.1.OR.NINC.GT.NINMAX )THEN + WRITE( NOUT, FMT = 9997 )'INCX AND INCY', NINMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( INC( I ), I = 1, NINC ) + DO 30 I = 1, NINC + IF( INC( I ).EQ.0.OR.ABS( INC( I ) ).GT.INCMAX )THEN + WRITE( NOUT, FMT = 9994 )INCMAX + GO TO 230 + END IF + 30 CONTINUE +* Values of ALPHA + READ( NIN, FMT = * )NALF + IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN + WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) +* Values of BETA + READ( NIN, FMT = * )NBET + IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN + WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) +* +* Report values of parameters. +* + WRITE( NOUT, FMT = 9993 ) + WRITE( NOUT, FMT = 9992 )( IDIM( I ), I = 1, NIDIM ) + WRITE( NOUT, FMT = 9991 )( KB( I ), I = 1, NKB ) + WRITE( NOUT, FMT = 9990 )( INC( I ), I = 1, NINC ) + WRITE( NOUT, FMT = 9989 )( ALF( I ), I = 1, NALF ) + WRITE( NOUT, FMT = 9988 )( BET( I ), I = 1, NBET ) + IF( .NOT.TSTERR )THEN + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9980 ) + END IF + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9999 )THRESH + WRITE( NOUT, FMT = * ) + + RORDER = .FALSE. + CORDER = .FALSE. + IF (LAYOUT.EQ.2) THEN + RORDER = .TRUE. + CORDER = .TRUE. + WRITE( *, FMT = 10002 ) + ELSE IF (LAYOUT.EQ.1) THEN + RORDER = .TRUE. + WRITE( *, FMT = 10001 ) + ELSE IF (LAYOUT.EQ.0) THEN + CORDER = .TRUE. + WRITE( *, FMT = 10000 ) + END IF + WRITE( *, FMT = * ) +* +* Read names of subroutines and flags which indicate +* whether they are to be tested. +* + DO 40 I = 1, NSUBS + LTEST( I ) = .FALSE. 
+ 40 CONTINUE + 50 READ( NIN, FMT = 9984, END = 80 )SNAMET, LTESTT + DO 60 I = 1, NSUBS + IF( SNAMET.EQ.SNAMES( I ) ) + $ GO TO 70 + 60 CONTINUE + WRITE( NOUT, FMT = 9986 )SNAMET + STOP + 70 LTEST( I ) = LTESTT + GO TO 50 +* + 80 CONTINUE + CLOSE ( NIN ) +* +* Compute EPS (the machine precision). +* + EPS = ONE + 90 CONTINUE + IF( DDIFF( ONE + EPS, ONE ).EQ.ZERO ) + $ GO TO 100 + EPS = HALF*EPS + GO TO 90 + 100 CONTINUE + EPS = EPS + EPS + WRITE( NOUT, FMT = 9998 )EPS +* +* Check the reliability of DMVCH using exact data. +* + N = MIN( 32, NMAX ) + DO 120 J = 1, N + DO 110 I = 1, N + A( I, J ) = MAX( I - J + 1, 0 ) + 110 CONTINUE + X( J ) = J + Y( J ) = ZERO + 120 CONTINUE + DO 130 J = 1, N + YY( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 + 130 CONTINUE +* YY holds the exact result. On exit from DMVCH YT holds +* the result computed by DMVCH. + TRANS = 'N' + CALL DMVCH( TRANS, N, N, ONE, A, NMAX, X, 1, ZERO, Y, 1, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LDE( YY, YT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR + STOP + END IF + TRANS = 'T' + CALL DMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LDE( YY, YT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR + STOP + END IF +* +* Test each subroutine in turn. +* + DO 210 ISNUM = 1, NSUBS + WRITE( NOUT, FMT = * ) + IF( .NOT.LTEST( ISNUM ) )THEN +* Subprogram is not to be tested. + WRITE( NOUT, FMT = 9983 )SNAMES( ISNUM ) + ELSE + SRNAMT = SNAMES( ISNUM ) +* Test error exits. + IF( TSTERR )THEN + CALL CD2CHKE( SNAMES( ISNUM ) ) + WRITE( NOUT, FMT = * ) + END IF +* Test computations. + INFOT = 0 + OK = .TRUE. + FATAL = .FALSE. + GO TO ( 140, 140, 150, 150, 150, 160, 160, + $ 160, 160, 160, 160, 170, 180, 180, + $ 190, 190 )ISNUM +* Test DGEMV, 01, and DGBMV, 02. + 140 IF (CORDER) THEN + CALL DCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G, 0 ) + END IF + IF (RORDER) THEN + CALL DCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G, 1 ) + END IF + GO TO 200 +* Test DSYMV, 03, DSBMV, 04, and DSPMV, 05. + 150 IF (CORDER) THEN + CALL DCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G, 0 ) + END IF + IF (RORDER) THEN + CALL DCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G, 1 ) + END IF + GO TO 200 +* Test DTRMV, 06, DTBMV, 07, DTPMV, 08, +* DTRSV, 09, DTBSV, 10, and DTPSV, 11. + 160 IF (CORDER) THEN + CALL DCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z, + $ 0 ) + END IF + IF (RORDER) THEN + CALL DCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z, + $ 1 ) + END IF + GO TO 200 +* Test DGER, 12. 
+ 170 IF (CORDER) THEN + CALL DCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 0 ) + END IF + IF (RORDER) THEN + CALL DCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 1 ) + END IF + GO TO 200 +* Test DSYR, 13, and DSPR, 14. + 180 IF (CORDER) THEN + CALL DCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 0 ) + END IF + IF (RORDER) THEN + CALL DCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 1 ) + END IF + GO TO 200 +* Test DSYR2, 15, and DSPR2, 16. + 190 IF (CORDER) THEN + CALL DCHK6( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 0 ) + END IF + IF (RORDER) THEN + CALL DCHK6( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 1 ) + END IF +* + 200 IF( FATAL.AND.SFATAL ) + $ GO TO 220 + END IF + 210 CONTINUE + WRITE( NOUT, FMT = 9982 ) + GO TO 240 +* + 220 CONTINUE + WRITE( NOUT, FMT = 9981 ) + GO TO 240 +* + 230 CONTINUE + WRITE( NOUT, FMT = 9987 ) +* + 240 CONTINUE + IF( TRACE ) + $ CLOSE ( NTRA ) + CLOSE ( NOUT ) + STOP +* +10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) +10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' ) +10000 FORMAT( ' COLUMN-MAJOR DATA LAYOUT IS TESTED' ) + 9999 FORMAT( ' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', + $ 'S THAN', F8.2 ) + 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, D9.1 ) + 9997 FORMAT( ' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', + $ 'THAN ', I2 ) + 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) + 9995 FORMAT( ' VALUE OF K IS LESS THAN 0' ) + 9994 FORMAT( ' ABSOLUTE VALUE OF INCX OR INCY IS 0 OR GREATER THAN ', + $ I2 ) + 9993 FORMAT( ' TESTS OF THE DOUBLE PRECISION LEVEL 2 BLAS', //' THE F', + $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) + 9992 FORMAT( ' FOR N ', 9I6 ) + 9991 FORMAT( ' FOR K ', 7I6 ) + 9990 FORMAT( ' FOR INCX AND INCY ', 7I6 ) + 9989 FORMAT( ' FOR ALPHA ', 7F6.1 ) + 9988 FORMAT( ' FOR BETA ', 7F6.1 ) + 9987 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', + $ /' ******* TESTS ABANDONED *******' ) + 9986 FORMAT( ' SUBPROGRAM NAME ',A12, ' NOT RECOGNIZED', /' ******* T', + $ 'ESTS ABANDONED *******' ) + 9985 FORMAT( ' ERROR IN DMVCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', + $ 'ATED WRONGLY.', /' DMVCH WAS CALLED WITH TRANS = ', A1, + $ ' AND RETURNED SAME = ', L1, ' AND ERR = ', F12.3, '.', / + $ ' THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.' + $ , /' ******* TESTS ABANDONED *******' ) + 9984 FORMAT(A12, L2 ) + 9983 FORMAT( 1X,A12, ' WAS NOT TESTED' ) + 9982 FORMAT( /' END OF TESTS' ) + 9981 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) + 9980 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) +* +* End of DBLAT2. 
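For illustration (this sketch is not part of the imported sources): the executable part of DBLAT2 above estimates the relative machine precision by halving EPS until DDIFF(ONE+EPS, ONE) compares equal to zero and then doubling back once, before sanity-checking DMVCH against a hand-computed exact matrix-vector product. A minimal C sketch of the same epsilon loop follows; the volatile qualifier is an added assumption, used only to keep an optimizing compiler from folding the comparison away (the Fortran source gets a similar effect by routing the subtraction through the external function DDIFF).

    #include <stdio.h>

    /* Mirror of the DBLAT2 loop: halve eps while (1+eps)-1 is nonzero,
       then undo the last halving (EPS = EPS + EPS in the Fortran).     */
    int main(void) {
        volatile double eps = 1.0;
        while ((1.0 + eps) - 1.0 != 0.0)
            eps *= 0.5;
        eps += eps;
        printf("relative machine precision is taken to be %9.1e\n", eps);
        return 0;
    }

On a typical IEEE-754 double precision platform this prints about 2.2E-16, the value the driver reports through format 9998.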
+* + END + SUBROUTINE DCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, + $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, + $ XS, Y, YY, YS, YT, G, IORDER ) +* +* Tests DGEMV and DGBMV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + DOUBLE PRECISION ZERO, HALF + PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, + $ NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), BET( NBET ), G( NMAX ), + $ X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + DOUBLE PRECISION ALPHA, ALS, BETA, BLS, ERR, ERRMAX, TRANSL + INTEGER I, IA, IB, IC, IKU, IM, IN, INCX, INCXS, INCY, + $ INCYS, IX, IY, KL, KLS, KU, KUS, LAA, LDA, + $ LDAS, LX, LY, M, ML, MS, N, NARGS, NC, ND, NK, + $ NL, NS + LOGICAL BANDED, FULL, NULL, RESET, SAME, TRAN + CHARACTER*1 TRANS, TRANSS + CHARACTER*14 CTRANS + CHARACTER*3 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. + EXTERNAL CDGBMV, CDGEMV, DMAKE, DMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICH/'NTC'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'e' + BANDED = SNAME( 9: 9 ).EQ.'b' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 11 + ELSE IF( BANDED )THEN + NARGS = 13 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 120 IN = 1, NIDIM + N = IDIM( IN ) + ND = N/2 + 1 +* + DO 110 IM = 1, 2 + IF( IM.EQ.1 ) + $ M = MAX( N - ND, 0 ) + IF( IM.EQ.2 ) + $ M = MIN( N + ND, NMAX ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IKU = 1, NK + IF( BANDED )THEN + KU = KB( IKU ) + KL = MAX( KU - 1, 0 ) + ELSE + KU = N - 1 + KL = M - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = KL + KU + 1 + ELSE + LDA = M + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + LAA = LDA*N + NULL = N.LE.0.OR.M.LE.0 +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL DMAKE( SNAME( 8: 9 ), ' ', ' ', M, N, A, NMAX, AA, + $ LDA, KL, KU, RESET, TRANSL ) +* + DO 90 IC = 1, 3 + TRANS = ICH( IC: IC ) + IF (TRANS.EQ.'N')THEN + CTRANS = ' CblasNoTrans' + ELSE IF (TRANS.EQ.'T')THEN + CTRANS = ' CblasTrans' + ELSE + CTRANS = 'CblasConjTrans' + END IF + TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' +* + IF( TRAN )THEN + ML = N + NL = M + ELSE + ML = M + NL = N + END IF +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*NL +* +* Generate the vector X. +* + TRANSL = HALF + CALL DMAKE( 'ge', ' ', ' ', 1, NL, X, 1, XX, + $ ABS( INCX ), 0, NL - 1, RESET, TRANSL ) + IF( NL.GT.1 )THEN + X( NL/2 ) = ZERO + XX( 1 + ABS( INCX )*( NL/2 - 1 ) ) = ZERO + END IF +* + DO 70 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*ML +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the vector Y. 
+* + TRANSL = ZERO + CALL DMAKE( 'ge', ' ', ' ', 1, ML, Y, 1, + $ YY, ABS( INCY ), 0, ML - 1, + $ RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + TRANSS = TRANS + MS = M + NS = N + KLS = KL + KUS = KU + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + BLS = BETA + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ CTRANS, M, N, ALPHA, LDA, INCX, + $ BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL CDGEMV( IORDER, TRANS, M, N, + $ ALPHA, AA, LDA, XX, INCX, + $ BETA, YY, INCY ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ CTRANS, M, N, KL, KU, ALPHA, LDA, + $ INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL CDGBMV( IORDER, TRANS, M, N, KL, + $ KU, ALPHA, AA, LDA, XX, + $ INCX, BETA, YY, INCY ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 130 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = TRANS.EQ.TRANSS + ISAME( 2 ) = MS.EQ.M + ISAME( 3 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 4 ) = ALS.EQ.ALPHA + ISAME( 5 ) = LDE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + ISAME( 7 ) = LDE( XS, XX, LX ) + ISAME( 8 ) = INCXS.EQ.INCX + ISAME( 9 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 10 ) = LDE( YS, YY, LY ) + ELSE + ISAME( 10 ) = LDERES( 'ge', ' ', 1, + $ ML, YS, YY, + $ ABS( INCY ) ) + END IF + ISAME( 11 ) = INCYS.EQ.INCY + ELSE IF( BANDED )THEN + ISAME( 4 ) = KLS.EQ.KL + ISAME( 5 ) = KUS.EQ.KU + ISAME( 6 ) = ALS.EQ.ALPHA + ISAME( 7 ) = LDE( AS, AA, LAA ) + ISAME( 8 ) = LDAS.EQ.LDA + ISAME( 9 ) = LDE( XS, XX, LX ) + ISAME( 10 ) = INCXS.EQ.INCX + ISAME( 11 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 12 ) = LDE( YS, YY, LY ) + ELSE + ISAME( 12 ) = LDERES( 'ge', ' ', 1, + $ ML, YS, YY, + $ ABS( INCY ) ) + END IF + ISAME( 13 ) = INCYS.EQ.INCY + END IF +* +* If data was incorrectly changed, report +* and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 130 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL DMVCH( TRANS, M, N, ALPHA, A, + $ NMAX, X, INCX, BETA, Y, + $ INCY, YT, G, YY, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 130 + ELSE +* Avoid repeating tests with M.le.0 or +* N.le.0. + GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* +* Report result. 
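For illustration (not part of the imported sources): besides checking the numerical result through DMVCH, DCHK1 above verifies that the CBLAS call left its read-only arguments untouched, by saving every operand into AS, XS and YS before the call and comparing afterwards with the LDE/LDERES helpers. A minimal C sketch of that snapshot-and-compare pattern follows; it assumes the library's cblas.h is on the include path and a CBLAS library can be linked, and the helper name "unchanged" is purely illustrative.

    #include <stdio.h>
    #include <string.h>
    #include <cblas.h>

    /* Snapshot the read-only operands, call the routine, then verify the
       inputs are bit-for-bit unchanged (a simplified stand-in for the
       element-by-element comparison the testers perform).               */
    static int unchanged(const double *before, const double *after, int n) {
        return memcmp(before, after, (size_t)n * sizeof(double)) == 0;
    }

    int main(void) {
        double a[4] = {1.0, 2.0, 3.0, 4.0};   /* 2x2, column-major, lda=2 */
        double x[2] = {1.0, 1.0};
        double y[2] = {0.0, 0.0};
        double as[4], xs[2];

        memcpy(as, a, sizeof(a));             /* save every input datum   */
        memcpy(xs, x, sizeof(x));

        cblas_dgemv(CblasColMajor, CblasNoTrans, 2, 2,
                    1.0, a, 2, x, 1, 0.0, y, 1);      /* y := A*x         */

        if (!unchanged(as, a, 4) || !unchanged(xs, x, 2))
            printf("parameter was changed incorrectly\n");
        else
            printf("inputs preserved, y = (%g, %g)\n", y[0], y[1]);
        return 0;
    }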
+* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 140 +* + 130 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CTRANS, M, N, ALPHA, LDA, + $ INCX, BETA, INCY + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, CTRANS, M, N, KL, KU, + $ ALPHA, LDA, INCX, BETA, INCY + END IF +* + 140 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', 4( I3, ',' ), F4.1, + $ ', A,', I3, ',',/ 10x,'X,', I2, ',', F4.1, ', Y,', + $ I2, ') .' ) + 9994 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', 2( I3, ',' ), F4.1, + $ ', A,', I3, ', X,', I2, ',', F4.1, ', Y,', I2, + $ ') .' ) + 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK1. +* + END + SUBROUTINE DCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, + $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, + $ XS, Y, YY, YS, YT, G, IORDER ) +* +* Tests DSYMV, DSBMV and DSPMV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + DOUBLE PRECISION ZERO, HALF + PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, + $ NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), BET( NBET ), G( NMAX ), + $ X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + DOUBLE PRECISION ALPHA, ALS, BETA, BLS, ERR, ERRMAX, TRANSL + INTEGER I, IA, IB, IC, IK, IN, INCX, INCXS, INCY, + $ INCYS, IX, IY, K, KS, LAA, LDA, LDAS, LX, LY, + $ N, NARGS, NC, NK, NS + LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME + CHARACTER*1 UPLO, UPLOS + CHARACTER*14 CUPLO + CHARACTER*2 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. + EXTERNAL DMAKE, DMVCH, CDSBMV, CDSPMV, CDSYMV +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. 
+ COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'y' + BANDED = SNAME( 9: 9 ).EQ.'b' + PACKED = SNAME( 9: 9 ).EQ.'p' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 10 + ELSE IF( BANDED )THEN + NARGS = 11 + ELSE IF( PACKED )THEN + NARGS = 9 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 110 IN = 1, NIDIM + N = IDIM( IN ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IK = 1, NK + IF( BANDED )THEN + K = KB( IK ) + ELSE + K = N - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = K + 1 + ELSE + LDA = N + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF + NULL = N.LE.0 +* + DO 90 IC = 1, 2 + UPLO = ICH( IC: IC ) + IF (UPLO.EQ.'U')THEN + CUPLO = ' CblasUpper' + ELSE + CUPLO = ' CblasLower' + END IF +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL DMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, A, NMAX, AA, + $ LDA, K, K, RESET, TRANSL ) +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL DMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, + $ ABS( INCX ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 70 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL DMAKE( 'ge', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, + $ TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + UPLOS = UPLO + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + BLS = BETA + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ CUPLO, N, ALPHA, LDA, INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL CDSYMV( IORDER, UPLO, N, ALPHA, AA, + $ LDA, XX, INCX, BETA, YY, INCY ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ CUPLO, N, K, ALPHA, LDA, INCX, BETA, + $ INCY + IF( REWI ) + $ REWIND NTRA + CALL CDSBMV( IORDER, UPLO, N, K, ALPHA, + $ AA, LDA, XX, INCX, BETA, YY, + $ INCY ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ CUPLO, N, ALPHA, INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL CDSPMV( IORDER, UPLO, N, ALPHA, AA, + $ XX, INCX, BETA, YY, INCY ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. 
+* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LDE( AS, AA, LAA ) + ISAME( 5 ) = LDAS.EQ.LDA + ISAME( 6 ) = LDE( XS, XX, LX ) + ISAME( 7 ) = INCXS.EQ.INCX + ISAME( 8 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 9 ) = LDE( YS, YY, LY ) + ELSE + ISAME( 9 ) = LDERES( 'ge', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 10 ) = INCYS.EQ.INCY + ELSE IF( BANDED )THEN + ISAME( 3 ) = KS.EQ.K + ISAME( 4 ) = ALS.EQ.ALPHA + ISAME( 5 ) = LDE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + ISAME( 7 ) = LDE( XS, XX, LX ) + ISAME( 8 ) = INCXS.EQ.INCX + ISAME( 9 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 10 ) = LDE( YS, YY, LY ) + ELSE + ISAME( 10 ) = LDERES( 'ge', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 11 ) = INCYS.EQ.INCY + ELSE IF( PACKED )THEN + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LDE( AS, AA, LAA ) + ISAME( 5 ) = LDE( XS, XX, LX ) + ISAME( 6 ) = INCXS.EQ.INCX + ISAME( 7 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 8 ) = LDE( YS, YY, LY ) + ELSE + ISAME( 8 ) = LDERES( 'ge', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 9 ) = INCYS.EQ.INCY + END IF +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL DMVCH( 'N', N, N, ALPHA, A, NMAX, X, + $ INCX, BETA, Y, INCY, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 120 + ELSE +* Avoid repeating tests with N.le.0 + GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, N, ALPHA, LDA, INCX, + $ BETA, INCY + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, N, K, ALPHA, LDA, + $ INCX, BETA, INCY + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, CUPLO, N, ALPHA, INCX, + $ BETA, INCY + END IF +* + 130 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', AP', + $ ', X,', I2, ',', F4.1, ', Y,', I2, ') .' ) + 9994 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', 2( I3, ',' ), F4.1, + $ ', A,', I3, ', X,', I2, ',', F4.1, ', Y,', I2, + $ ') .' ) + 9993 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', A,', + $ I3, ', X,', I2, ',', F4.1, ', Y,', I2, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK2. +* + END + SUBROUTINE DCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, XT, G, Z, IORDER ) +* +* Tests DTRMV, DTBMV, DTPMV, DTRSV, DTBSV and DTPSV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + DOUBLE PRECISION ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NIDIM, NINC, NKB, NMAX, NOUT, NTRA, + $ IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), + $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), + $ XS( NMAX*INCMAX ), XT( NMAX ), + $ XX( NMAX*INCMAX ), Z( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + DOUBLE PRECISION ERR, ERRMAX, TRANSL + INTEGER I, ICD, ICT, ICU, IK, IN, INCX, INCXS, IX, K, + $ KS, LAA, LDA, LDAS, LX, N, NARGS, NC, NK, NS + LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME + CHARACTER*1 DIAG, DIAGS, TRANS, TRANSS, UPLO, UPLOS + CHARACTER*14 CUPLO,CTRANS,CDIAG + CHARACTER*2 ICHD, ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. 
+ EXTERNAL DMAKE, DMVCH, CDTBMV, CDTBSV, CDTPMV, + $ CDTPSV, CDTRMV, CDTRSV +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'r' + BANDED = SNAME( 9: 9 ).EQ.'b' + PACKED = SNAME( 9: 9 ).EQ.'p' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 8 + ELSE IF( BANDED )THEN + NARGS = 9 + ELSE IF( PACKED )THEN + NARGS = 7 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* Set up zero vector for DMVCH. + DO 10 I = 1, NMAX + Z( I ) = ZERO + 10 CONTINUE +* + DO 110 IN = 1, NIDIM + N = IDIM( IN ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IK = 1, NK + IF( BANDED )THEN + K = KB( IK ) + ELSE + K = N - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = K + 1 + ELSE + LDA = N + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF + NULL = N.LE.0 +* + DO 90 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + IF (UPLO.EQ.'U')THEN + CUPLO = ' CblasUpper' + ELSE + CUPLO = ' CblasLower' + END IF +* + DO 80 ICT = 1, 3 + TRANS = ICHT( ICT: ICT ) + IF (TRANS.EQ.'N')THEN + CTRANS = ' CblasNoTrans' + ELSE IF (TRANS.EQ.'T')THEN + CTRANS = ' CblasTrans' + ELSE + CTRANS = 'CblasConjTrans' + END IF +* + DO 70 ICD = 1, 2 + DIAG = ICHD( ICD: ICD ) + IF (DIAG.EQ.'N')THEN + CDIAG = ' CblasNonUnit' + ELSE + CDIAG = ' CblasUnit' + END IF +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL DMAKE( SNAME( 8: 9 ), UPLO, DIAG, N, N, A, + $ NMAX, AA, LDA, K, K, RESET, TRANSL ) +* + DO 60 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL DMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, + $ ABS( INCX ), 0, N - 1, RESET, + $ TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + DIAGS = DIAG + NS = N + KS = K + DO 20 I = 1, LAA + AS( I ) = AA( I ) + 20 CONTINUE + LDAS = LDA + DO 30 I = 1, LX + XS( I ) = XX( I ) + 30 CONTINUE + INCXS = INCX +* +* Call the subroutine. 
+* + IF( SNAME( 10: 11 ).EQ.'mv' )THEN + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CDTRMV( IORDER, UPLO, TRANS, DIAG, + $ N, AA, LDA, XX, INCX ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, K, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CDTBMV( IORDER, UPLO, TRANS, DIAG, + $ N, K, AA, LDA, XX, INCX ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, INCX + IF( REWI ) + $ REWIND NTRA + CALL CDTPMV( IORDER, UPLO, TRANS, DIAG, + $ N, AA, XX, INCX ) + END IF + ELSE IF( SNAME( 10: 11 ).EQ.'sv' )THEN + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CDTRSV( IORDER, UPLO, TRANS, DIAG, + $ N, AA, LDA, XX, INCX ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, K, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CDTBSV( IORDER, UPLO, TRANS, DIAG, + $ N, K, AA, LDA, XX, INCX ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, INCX + IF( REWI ) + $ REWIND NTRA + CALL CDTPSV( IORDER, UPLO, TRANS, DIAG, + $ N, AA, XX, INCX ) + END IF + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = TRANS.EQ.TRANSS + ISAME( 3 ) = DIAG.EQ.DIAGS + ISAME( 4 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 5 ) = LDE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 7 ) = LDE( XS, XX, LX ) + ELSE + ISAME( 7 ) = LDERES( 'ge', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 8 ) = INCXS.EQ.INCX + ELSE IF( BANDED )THEN + ISAME( 5 ) = KS.EQ.K + ISAME( 6 ) = LDE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 8 ) = LDE( XS, XX, LX ) + ELSE + ISAME( 8 ) = LDERES( 'ge', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 9 ) = INCXS.EQ.INCX + ELSE IF( PACKED )THEN + ISAME( 5 ) = LDE( AS, AA, LAA ) + IF( NULL )THEN + ISAME( 6 ) = LDE( XS, XX, LX ) + ELSE + ISAME( 6 ) = LDERES( 'ge', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 7 ) = INCXS.EQ.INCX + END IF +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN + IF( SNAME( 10: 11 ).EQ.'mv' )THEN +* +* Check the result. +* + CALL DMVCH( TRANS, N, N, ONE, A, NMAX, X, + $ INCX, ZERO, Z, INCX, XT, G, + $ XX, EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + ELSE IF( SNAME( 10: 11 ).EQ.'sv' )THEN +* +* Compute approximation to original vector. +* + DO 50 I = 1, N + Z( I ) = XX( 1 + ( I - 1 )* + $ ABS( INCX ) ) + XX( 1 + ( I - 1 )*ABS( INCX ) ) + $ = X( I ) + 50 CONTINUE + CALL DMVCH( TRANS, N, N, ONE, A, NMAX, Z, + $ INCX, ZERO, X, INCX, XT, G, + $ XX, EPS, ERR, FATAL, NOUT, + $ .FALSE. ) + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 120 + ELSE +* Avoid repeating tests with N.le.0. + GO TO 110 + END IF +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, CTRANS, CDIAG, N, + $ LDA, INCX + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, CTRANS, CDIAG, N, K, + $ LDA, INCX + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, CUPLO, CTRANS, CDIAG, N, + $ INCX + END IF +* + 130 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ',A12, '(', 3( A14,',' ),/ 10x, I3, ', AP, ', + $ 'X,', I2, ') .' ) + 9994 FORMAT( 1X, I6, ': ',A12, '(', 3( A14,',' ),/ 10x, 2( I3, ',' ), + $ ' A,', I3, ', X,', I2, ') .' ) + 9993 FORMAT( 1X, I6, ': ',A12, '(', 3( A14,',' ),/ 10x, I3, ', A,', + $ I3, ', X,', I2, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK3. +* + END + SUBROUTINE DCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z, IORDER ) +* +* Tests DGER. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + DOUBLE PRECISION ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA, + $ IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), + $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), + $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + DOUBLE PRECISION ALPHA, ALS, ERR, ERRMAX, TRANSL + INTEGER I, IA, IM, IN, INCX, INCXS, INCY, INCYS, IX, + $ IY, J, LAA, LDA, LDAS, LX, LY, M, MS, N, NARGS, + $ NC, ND, NS + LOGICAL NULL, RESET, SAME +* .. Local Arrays .. + DOUBLE PRECISION W( 1 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. + EXTERNAL DGER, DMAKE, DMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. 
Executable Statements .. +* Define the number of arguments. + NARGS = 9 +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 120 IN = 1, NIDIM + N = IDIM( IN ) + ND = N/2 + 1 +* + DO 110 IM = 1, 2 + IF( IM.EQ.1 ) + $ M = MAX( N - ND, 0 ) + IF( IM.EQ.2 ) + $ M = MIN( N + ND, NMAX ) +* +* Set LDA to 1 more than minimum value if room. + LDA = M + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 110 + LAA = LDA*N + NULL = N.LE.0.OR.M.LE.0 +* + DO 100 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*M +* +* Generate the vector X. +* + TRANSL = HALF + CALL DMAKE( 'ge', ' ', ' ', 1, M, X, 1, XX, ABS( INCX ), + $ 0, M - 1, RESET, TRANSL ) + IF( M.GT.1 )THEN + X( M/2 ) = ZERO + XX( 1 + ABS( INCX )*( M/2 - 1 ) ) = ZERO + END IF +* + DO 90 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL DMAKE( 'ge', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + Y( N/2 ) = ZERO + YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 80 IA = 1, NALF + ALPHA = ALF( IA ) +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL DMAKE( SNAME( 8: 9 ), ' ', ' ', M, N, A, NMAX, + $ AA, LDA, M - 1, N - 1, RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + MS = M + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, M, N, + $ ALPHA, INCX, INCY, LDA + IF( REWI ) + $ REWIND NTRA + CALL CDGER( IORDER, M, N, ALPHA, XX, INCX, YY, + $ INCY, AA, LDA ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 140 + END IF +* +* See what data changed inside subroutine. +* + ISAME( 1 ) = MS.EQ.M + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LDE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + ISAME( 6 ) = LDE( YS, YY, LY ) + ISAME( 7 ) = INCYS.EQ.INCY + IF( NULL )THEN + ISAME( 8 ) = LDE( AS, AA, LAA ) + ELSE + ISAME( 8 ) = LDERES( 'ge', ' ', M, N, AS, AA, + $ LDA ) + END IF + ISAME( 9 ) = LDAS.EQ.LDA +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 140 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( INCX.GT.0 )THEN + DO 50 I = 1, M + Z( I ) = X( I ) + 50 CONTINUE + ELSE + DO 60 I = 1, M + Z( I ) = X( M - I + 1 ) + 60 CONTINUE + END IF + DO 70 J = 1, N + IF( INCY.GT.0 )THEN + W( 1 ) = Y( J ) + ELSE + W( 1 ) = Y( N - J + 1 ) + END IF + CALL DMVCH( 'N', M, 1, ALPHA, Z, NMAX, W, 1, + $ ONE, A( 1, J ), 1, YT, G, + $ AA( 1 + ( J - 1 )*LDA ), EPS, + $ ERR, FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 130 + 70 CONTINUE + ELSE +* Avoid repeating tests with M.le.0 or N.le.0. + GO TO 110 + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 150 +* + 130 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 140 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9994 )NC, SNAME, M, N, ALPHA, INCX, INCY, LDA +* + 150 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ',A12, '(', 2( I3, ',' ), F4.1, ', X,', I2, + $ ', Y,', I2, ', A,', I3, ') .' ) + 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK4. +* + END + SUBROUTINE DCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z, IORDER ) +* +* Tests DSYR and DSPR. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + DOUBLE PRECISION ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA, + $ IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), + $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), + $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + DOUBLE PRECISION ALPHA, ALS, ERR, ERRMAX, TRANSL + INTEGER I, IA, IC, IN, INCX, INCXS, IX, J, JA, JJ, LAA, + $ LDA, LDAS, LJ, LX, N, NARGS, NC, NS + LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER + CHARACTER*1 UPLO, UPLOS + CHARACTER*14 CUPLO + CHARACTER*2 ICH +* .. Local Arrays .. + DOUBLE PRECISION W( 1 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. + EXTERNAL DMAKE, DMVCH, CDSPR, CDSYR +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'y' + PACKED = SNAME( 9: 9 ).EQ.'p' +* Define the number of arguments. 
+ IF( FULL )THEN + NARGS = 7 + ELSE IF( PACKED )THEN + NARGS = 6 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDA to 1 more than minimum value if room. + LDA = N + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF +* + DO 90 IC = 1, 2 + UPLO = ICH( IC: IC ) + IF (UPLO.EQ.'U')THEN + CUPLO = ' CblasUpper' + ELSE + CUPLO = ' CblasLower' + END IF + UPPER = UPLO.EQ.'U' +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL DMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), + $ 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 70 IA = 1, NALF + ALPHA = ALF( IA ) + NULL = N.LE.0.OR.ALPHA.EQ.ZERO +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL DMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, A, NMAX, + $ AA, LDA, N - 1, N - 1, RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, CUPLO, N, + $ ALPHA, INCX, LDA + IF( REWI ) + $ REWIND NTRA + CALL CDSYR( IORDER, UPLO, N, ALPHA, XX, INCX, + $ AA, LDA ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, CUPLO, N, + $ ALPHA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CDSPR( IORDER, UPLO, N, ALPHA, XX, INCX, AA ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LDE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + IF( NULL )THEN + ISAME( 6 ) = LDE( AS, AA, LAA ) + ELSE + ISAME( 6 ) = LDERES( SNAME( 8: 9 ), UPLO, N, N, AS, + $ AA, LDA ) + END IF + IF( .NOT.PACKED )THEN + ISAME( 7 ) = LDAS.EQ.LDA + END IF +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 30 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 30 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( INCX.GT.0 )THEN + DO 40 I = 1, N + Z( I ) = X( I ) + 40 CONTINUE + ELSE + DO 50 I = 1, N + Z( I ) = X( N - I + 1 ) + 50 CONTINUE + END IF + JA = 1 + DO 60 J = 1, N + W( 1 ) = Z( J ) + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + CALL DMVCH( 'N', LJ, 1, ALPHA, Z( JJ ), LJ, W, + $ 1, ONE, A( JJ, J ), 1, YT, G, + $ AA( JA ), EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + IF( FULL )THEN + IF( UPPER )THEN + JA = JA + LDA + ELSE + JA = JA + LDA + 1 + END IF + ELSE + JA = JA + LJ + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 110 + 60 CONTINUE + ELSE +* Avoid repeating tests if N.le.0. + IF( N.LE.0 ) + $ GO TO 100 + END IF +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 110 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, N, ALPHA, INCX, LDA + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, N, ALPHA, INCX + END IF +* + 130 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', X,', + $ I2, ', AP) .' ) + 9993 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', X,', + $ I2, ', A,', I3, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK5. +* + END + SUBROUTINE DCHK6( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z, IORDER ) +* +* Tests DSYR2 and DSPR2. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + DOUBLE PRECISION ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA, + $ IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), + $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), + $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX, 2 ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + DOUBLE PRECISION ALPHA, ALS, ERR, ERRMAX, TRANSL + INTEGER I, IA, IC, IN, INCX, INCXS, INCY, INCYS, IX, + $ IY, J, JA, JJ, LAA, LDA, LDAS, LJ, LX, LY, N, + $ NARGS, NC, NS + LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER + CHARACTER*1 UPLO, UPLOS + CHARACTER*14 CUPLO + CHARACTER*2 ICH +* .. Local Arrays .. + DOUBLE PRECISION W( 2 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. + EXTERNAL DMAKE, DMVCH, CDSPR2, CDSYR2 +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. 
+ DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'y' + PACKED = SNAME( 9: 9 ).EQ.'p' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 9 + ELSE IF( PACKED )THEN + NARGS = 8 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 140 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDA to 1 more than minimum value if room. + LDA = N + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 140 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF +* + DO 130 IC = 1, 2 + UPLO = ICH( IC: IC ) + IF (UPLO.EQ.'U')THEN + CUPLO = ' CblasUpper' + ELSE + CUPLO = ' CblasLower' + END IF + UPPER = UPLO.EQ.'U' +* + DO 120 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL DMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), + $ 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 110 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL DMAKE( 'ge', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + Y( N/2 ) = ZERO + YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 100 IA = 1, NALF + ALPHA = ALF( IA ) + NULL = N.LE.0.OR.ALPHA.EQ.ZERO +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL DMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, A, + $ NMAX, AA, LDA, N - 1, N - 1, RESET, + $ TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, CUPLO, N, + $ ALPHA, INCX, INCY, LDA + IF( REWI ) + $ REWIND NTRA + CALL CDSYR2( IORDER, UPLO, N, ALPHA, XX, INCX, + $ YY, INCY, AA, LDA ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, CUPLO, N, + $ ALPHA, INCX, INCY + IF( REWI ) + $ REWIND NTRA + CALL CDSPR2( IORDER, UPLO, N, ALPHA, XX, INCX, + $ YY, INCY, AA ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 160 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LDE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + ISAME( 6 ) = LDE( YS, YY, LY ) + ISAME( 7 ) = INCYS.EQ.INCY + IF( NULL )THEN + ISAME( 8 ) = LDE( AS, AA, LAA ) + ELSE + ISAME( 8 ) = LDERES( SNAME( 8: 9 ), UPLO, N, N, + $ AS, AA, LDA ) + END IF + IF( .NOT.PACKED )THEN + ISAME( 9 ) = LDAS.EQ.LDA + END IF +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 160 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. 
+* + IF( INCX.GT.0 )THEN + DO 50 I = 1, N + Z( I, 1 ) = X( I ) + 50 CONTINUE + ELSE + DO 60 I = 1, N + Z( I, 1 ) = X( N - I + 1 ) + 60 CONTINUE + END IF + IF( INCY.GT.0 )THEN + DO 70 I = 1, N + Z( I, 2 ) = Y( I ) + 70 CONTINUE + ELSE + DO 80 I = 1, N + Z( I, 2 ) = Y( N - I + 1 ) + 80 CONTINUE + END IF + JA = 1 + DO 90 J = 1, N + W( 1 ) = Z( J, 2 ) + W( 2 ) = Z( J, 1 ) + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + CALL DMVCH( 'N', LJ, 2, ALPHA, Z( JJ, 1 ), + $ NMAX, W, 1, ONE, A( JJ, J ), 1, + $ YT, G, AA( JA ), EPS, ERR, FATAL, + $ NOUT, .TRUE. ) + IF( FULL )THEN + IF( UPPER )THEN + JA = JA + LDA + ELSE + JA = JA + LDA + 1 + END IF + ELSE + JA = JA + LJ + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 150 + 90 CONTINUE + ELSE +* Avoid repeating tests with N.le.0. + IF( N.LE.0 ) + $ GO TO 140 + END IF +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* + 140 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 170 +* + 150 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 160 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, N, ALPHA, INCX, + $ INCY, LDA + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, N, ALPHA, INCX, INCY + END IF +* + 170 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', X,', + $ I2, ', Y,', I2, ', AP) .' ) + 9993 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', X,', + $ I2, ', Y,', I2, ', A,', I3, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK6. +* + END + SUBROUTINE DMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, KL, + $ KU, RESET, TRANSL ) +* +* Generates values for an M by N matrix A within the bandwidth +* defined by KL and KU. +* Stores the values in the array AA in the data structure required +* by the routine, with unwanted elements set to rogue value. +* +* TYPE is 'ge', 'gb', 'sy', 'sb', 'sp', 'tr', 'tb' OR 'tp'. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. 
+ DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) + DOUBLE PRECISION ROGUE + PARAMETER ( ROGUE = -1.0D10 ) +* .. Scalar Arguments .. + DOUBLE PRECISION TRANSL + INTEGER KL, KU, LDA, M, N, NMAX + LOGICAL RESET + CHARACTER*1 DIAG, UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, * ), AA( * ) +* .. Local Scalars .. + INTEGER I, I1, I2, I3, IBEG, IEND, IOFF, J, KK + LOGICAL GEN, LOWER, SYM, TRI, UNIT, UPPER +* .. External Functions .. + DOUBLE PRECISION DBEG + EXTERNAL DBEG +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. Executable Statements .. + GEN = TYPE( 1: 1 ).EQ.'g' + SYM = TYPE( 1: 1 ).EQ.'s' + TRI = TYPE( 1: 1 ).EQ.'t' + UPPER = ( SYM.OR.TRI ).AND.UPLO.EQ.'U' + LOWER = ( SYM.OR.TRI ).AND.UPLO.EQ.'L' + UNIT = TRI.AND.DIAG.EQ.'U' +* +* Generate data in array A. +* + DO 20 J = 1, N + DO 10 I = 1, M + IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) + $ THEN + IF( ( I.LE.J.AND.J - I.LE.KU ).OR. + $ ( I.GE.J.AND.I - J.LE.KL ) )THEN + A( I, J ) = DBEG( RESET ) + TRANSL + ELSE + A( I, J ) = ZERO + END IF + IF( I.NE.J )THEN + IF( SYM )THEN + A( J, I ) = A( I, J ) + ELSE IF( TRI )THEN + A( J, I ) = ZERO + END IF + END IF + END IF + 10 CONTINUE + IF( TRI ) + $ A( J, J ) = A( J, J ) + ONE + IF( UNIT ) + $ A( J, J ) = ONE + 20 CONTINUE +* +* Store elements in array AS in data structure required by routine. +* + IF( TYPE.EQ.'ge' )THEN + DO 50 J = 1, N + DO 30 I = 1, M + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 30 CONTINUE + DO 40 I = M + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 40 CONTINUE + 50 CONTINUE + ELSE IF( TYPE.EQ.'gb' )THEN + DO 90 J = 1, N + DO 60 I1 = 1, KU + 1 - J + AA( I1 + ( J - 1 )*LDA ) = ROGUE + 60 CONTINUE + DO 70 I2 = I1, MIN( KL + KU + 1, KU + 1 + M - J ) + AA( I2 + ( J - 1 )*LDA ) = A( I2 + J - KU - 1, J ) + 70 CONTINUE + DO 80 I3 = I2, LDA + AA( I3 + ( J - 1 )*LDA ) = ROGUE + 80 CONTINUE + 90 CONTINUE + ELSE IF( TYPE.EQ.'sy'.OR.TYPE.EQ.'tr' )THEN + DO 130 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IF( UNIT )THEN + IEND = J - 1 + ELSE + IEND = J + END IF + ELSE + IF( UNIT )THEN + IBEG = J + 1 + ELSE + IBEG = J + END IF + IEND = N + END IF + DO 100 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 100 CONTINUE + DO 110 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 110 CONTINUE + DO 120 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 120 CONTINUE + 130 CONTINUE + ELSE IF( TYPE.EQ.'sb'.OR.TYPE.EQ.'tb' )THEN + DO 170 J = 1, N + IF( UPPER )THEN + KK = KL + 1 + IBEG = MAX( 1, KL + 2 - J ) + IF( UNIT )THEN + IEND = KL + ELSE + IEND = KL + 1 + END IF + ELSE + KK = 1 + IF( UNIT )THEN + IBEG = 2 + ELSE + IBEG = 1 + END IF + IEND = MIN( KL + 1, 1 + M - J ) + END IF + DO 140 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 140 CONTINUE + DO 150 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I + J - KK, J ) + 150 CONTINUE + DO 160 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 160 CONTINUE + 170 CONTINUE + ELSE IF( TYPE.EQ.'sp'.OR.TYPE.EQ.'tp' )THEN + IOFF = 0 + DO 190 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 180 I = IBEG, IEND + IOFF = IOFF + 1 + AA( IOFF ) = A( I, J ) + IF( I.EQ.J )THEN + IF( UNIT ) + $ AA( IOFF ) = ROGUE + END IF + 180 CONTINUE + 190 CONTINUE + END IF + RETURN +* +* End of DMAKE. +* + END + SUBROUTINE DMVCH( TRANS, M, N, ALPHA, A, NMAX, X, INCX, BETA, Y, + $ INCY, YT, G, YY, EPS, ERR, FATAL, NOUT, MV ) +* +* Checks the results of the computational tests. +* +* Auxiliary routine for test program for Level 2 Blas. 
+* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION ALPHA, BETA, EPS, ERR + INTEGER INCX, INCY, M, N, NMAX, NOUT + LOGICAL FATAL, MV + CHARACTER*1 TRANS +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, * ), G( * ), X( * ), Y( * ), YT( * ), + $ YY( * ) +* .. Local Scalars .. + DOUBLE PRECISION ERRI + INTEGER I, INCXL, INCYL, IY, J, JX, KX, KY, ML, NL + LOGICAL TRAN +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, SQRT +* .. Executable Statements .. + TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' + IF( TRAN )THEN + ML = N + NL = M + ELSE + ML = M + NL = N + END IF + IF( INCX.LT.0 )THEN + KX = NL + INCXL = -1 + ELSE + KX = 1 + INCXL = 1 + END IF + IF( INCY.LT.0 )THEN + KY = ML + INCYL = -1 + ELSE + KY = 1 + INCYL = 1 + END IF +* +* Compute expected result in YT using data in A, X and Y. +* Compute gauges in G. +* + IY = KY + DO 30 I = 1, ML + YT( IY ) = ZERO + G( IY ) = ZERO + JX = KX + IF( TRAN )THEN + DO 10 J = 1, NL + YT( IY ) = YT( IY ) + A( J, I )*X( JX ) + G( IY ) = G( IY ) + ABS( A( J, I )*X( JX ) ) + JX = JX + INCXL + 10 CONTINUE + ELSE + DO 20 J = 1, NL + YT( IY ) = YT( IY ) + A( I, J )*X( JX ) + G( IY ) = G( IY ) + ABS( A( I, J )*X( JX ) ) + JX = JX + INCXL + 20 CONTINUE + END IF + YT( IY ) = ALPHA*YT( IY ) + BETA*Y( IY ) + G( IY ) = ABS( ALPHA )*G( IY ) + ABS( BETA*Y( IY ) ) + IY = IY + INCYL + 30 CONTINUE +* +* Compute the error ratio for this result. +* + ERR = ZERO + DO 40 I = 1, ML + ERRI = ABS( YT( I ) - YY( 1 + ( I - 1 )*ABS( INCY ) ) )/EPS + IF( G( I ).NE.ZERO ) + $ ERRI = ERRI/G( I ) + ERR = MAX( ERR, ERRI ) + IF( ERR*SQRT( EPS ).GE.ONE ) + $ GO TO 50 + 40 CONTINUE +* If the loop completes, all results are at least half accurate. + GO TO 70 +* +* Report fatal error. +* + 50 FATAL = .TRUE. + WRITE( NOUT, FMT = 9999 ) + DO 60 I = 1, ML + IF( MV )THEN + WRITE( NOUT, FMT = 9998 )I, YT( I ), + $ YY( 1 + ( I - 1 )*ABS( INCY ) ) + ELSE + WRITE( NOUT, FMT = 9998 )I, + $ YY( 1 + ( I - 1 )*ABS( INCY ) ), YT(I) + END IF + 60 CONTINUE +* + 70 CONTINUE + RETURN +* + 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', + $ 'F ACCURATE *******', /' EXPECTED RESULT COMPU', + $ 'TED RESULT' ) + 9998 FORMAT( 1X, I7, 2G18.6 ) +* +* End of DMVCH. +* + END + LOGICAL FUNCTION LDE( RI, RJ, LR ) +* +* Tests if two arrays are identical. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER LR +* .. Array Arguments .. + DOUBLE PRECISION RI( * ), RJ( * ) +* .. Local Scalars .. + INTEGER I +* .. Executable Statements .. + DO 10 I = 1, LR + IF( RI( I ).NE.RJ( I ) ) + $ GO TO 20 + 10 CONTINUE + LDE = .TRUE. + GO TO 30 + 20 CONTINUE + LDE = .FALSE. + 30 RETURN +* +* End of LDE. +* + END + LOGICAL FUNCTION LDERES( TYPE, UPLO, M, N, AA, AS, LDA ) +* +* Tests if selected elements in two arrays are equal. +* +* TYPE is 'ge', 'sy' or 'sp'. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER LDA, M, N + CHARACTER*1 UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + DOUBLE PRECISION AA( LDA, * ), AS( LDA, * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J + LOGICAL UPPER +* .. 
Executable Statements .. + UPPER = UPLO.EQ.'U' + IF( TYPE.EQ.'ge' )THEN + DO 20 J = 1, N + DO 10 I = M + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 10 CONTINUE + 20 CONTINUE + ELSE IF( TYPE.EQ.'sy' )THEN + DO 50 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 30 I = 1, IBEG - 1 + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 30 CONTINUE + DO 40 I = IEND + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 40 CONTINUE + 50 CONTINUE + END IF +* + 60 CONTINUE + LDERES = .TRUE. + GO TO 80 + 70 CONTINUE + LDERES = .FALSE. + 80 RETURN +* +* End of LDERES. +* + END + DOUBLE PRECISION FUNCTION DBEG( RESET ) +* +* Generates random numbers uniformly distributed between -0.5 and 0.5. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + LOGICAL RESET +* .. Local Scalars .. + INTEGER I, IC, MI +* .. Save statement .. + SAVE I, IC, MI +* .. Intrinsic Functions .. + INTRINSIC DBLE +* .. Executable Statements .. + IF( RESET )THEN +* Initialize local variables. + MI = 891 + I = 7 + IC = 0 + RESET = .FALSE. + END IF +* +* The sequence of values of I is bounded between 1 and 999. +* If initial I = 1,2,3,6,7 or 9, the period will be 50. +* If initial I = 4 or 8, the period will be 25. +* If initial I = 5, the period will be 10. +* IC is used to break up the period by skipping 1 value of I in 6. +* + IC = IC + 1 + 10 I = I*MI + I = I - 1000*( I/1000 ) + IF( IC.GE.5 )THEN + IC = 0 + GO TO 10 + END IF + DBEG = DBLE( I - 500 )/1001.0D0 + RETURN +* +* End of DBEG. +* + END + DOUBLE PRECISION FUNCTION DDIFF( X, Y ) +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* +* .. Scalar Arguments .. + DOUBLE PRECISION X, Y +* .. Executable Statements .. + DDIFF = X - Y + RETURN +* +* End of DDIFF. +* + END diff --git a/ctest/c_dblat3.f b/ctest/c_dblat3.f new file mode 100644 index 0000000000..fb9acbb914 --- /dev/null +++ b/ctest/c_dblat3.f @@ -0,0 +1,2475 @@ + PROGRAM DBLAT3 +* +* Test program for the DOUBLE PRECISION Level 3 Blas. +* +* The program must be driven by a short data file. The first 13 records +* of the file are read using list-directed input, the last 6 records +* are read using the format ( A12, L2 ). An annotated example of a data +* file can be obtained by deleting the first 3 characters from the +* following 19 lines: +* 'DBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +* F LOGICAL FLAG, T TO STOP ON FAILURES. +* T LOGICAL FLAG, T TO TEST ERROR EXITS. +* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH +* 16.0 THRESHOLD VALUE OF TEST RATIO +* 6 NUMBER OF VALUES OF N +* 0 1 2 3 5 9 VALUES OF N +* 3 NUMBER OF VALUES OF ALPHA +* 0.0 1.0 0.7 VALUES OF ALPHA +* 3 NUMBER OF VALUES OF BETA +* 0.0 1.0 1.3 VALUES OF BETA +* cblas_dgemm T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dsymm T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dtrmm T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dtrsm T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dsyrk T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_dsyr2k T PUT F FOR NO TEST. SAME COLUMNS. +* +* See: +* +* Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. +* A Set of Level 3 Basic Linear Algebra Subprograms. 
+* +* Technical Memorandum No.88 (Revision 1), Mathematics and +* Computer Science Division, Argonne National Laboratory, 9700 +* South Cass Avenue, Argonne, Illinois 60439, US. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + INTEGER NIN, NOUT + PARAMETER ( NIN = 5, NOUT = 6 ) + INTEGER NSUBS + PARAMETER ( NSUBS = 6 ) + DOUBLE PRECISION ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 ) + INTEGER NMAX + PARAMETER ( NMAX = 65 ) + INTEGER NIDMAX, NALMAX, NBEMAX + PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) +* .. Local Scalars .. + DOUBLE PRECISION EPS, ERR, THRESH + INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NTRA, + $ LAYOUT + LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, + $ TSTERR, CORDER, RORDER + CHARACTER*1 TRANSA, TRANSB + CHARACTER*12 SNAMET + CHARACTER*32 SNAPS +* .. Local Arrays .. + DOUBLE PRECISION AA( NMAX*NMAX ), AB( NMAX, 2*NMAX ), + $ ALF( NALMAX ), AS( NMAX*NMAX ), + $ BB( NMAX*NMAX ), BET( NBEMAX ), + $ BS( NMAX*NMAX ), C( NMAX, NMAX ), + $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), + $ G( NMAX ), W( 2*NMAX ) + INTEGER IDIM( NIDMAX ) + LOGICAL LTEST( NSUBS ) + CHARACTER*12 SNAMES( NSUBS ) +* .. External Functions .. + DOUBLE PRECISION DDIFF + LOGICAL LDE + EXTERNAL DDIFF, LDE +* .. External Subroutines .. + EXTERNAL DCHK1, DCHK2, DCHK3, DCHK4, DCHK5, CD3CHKE, + $ DMMCH +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK + CHARACTER*12 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK + COMMON /SRNAMC/SRNAMT +* .. Data statements .. + DATA SNAMES/'cblas_dgemm ', 'cblas_dsymm ', + $ 'cblas_dtrmm ', 'cblas_dtrsm ','cblas_dsyrk ', + $ 'cblas_dsyr2k'/ +* .. Executable Statements .. +* +* Read name and unit number for summary output file and open file. +* + NOUTC = NOUT +* Read name and unit number for snapshot output file and open file. +* + READ( NIN, FMT = * )SNAPS + READ( NIN, FMT = * )NTRA + TRACE = NTRA.GE.0 + IF( TRACE )THEN + OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + END IF +* Read the flag that directs rewinding of the snapshot file. + READ( NIN, FMT = * )REWI + REWI = REWI.AND.TRACE +* Read the flag that directs stopping on any failure. + READ( NIN, FMT = * )SFATAL +* Read the flag that indicates whether error exits are to be tested. + READ( NIN, FMT = * )TSTERR +* Read the flag that indicates whether row-major data layout to be tested. + READ( NIN, FMT = * )LAYOUT +* Read the threshold value of the test ratio + READ( NIN, FMT = * )THRESH +* +* Read and check the parameter values for the tests. +* +* Values of N + READ( NIN, FMT = * )NIDIM + IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN + WRITE( NOUT, FMT = 9997 )'N', NIDMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) + DO 10 I = 1, NIDIM + IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN + WRITE( NOUT, FMT = 9996 )NMAX + GO TO 220 + END IF + 10 CONTINUE +* Values of ALPHA + READ( NIN, FMT = * )NALF + IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN + WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) +* Values of BETA + READ( NIN, FMT = * )NBET + IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN + WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) +* +* Report values of parameters. 
+* + WRITE( NOUT, FMT = 9995 ) + WRITE( NOUT, FMT = 9994 )( IDIM( I ), I = 1, NIDIM ) + WRITE( NOUT, FMT = 9993 )( ALF( I ), I = 1, NALF ) + WRITE( NOUT, FMT = 9992 )( BET( I ), I = 1, NBET ) + IF( .NOT.TSTERR )THEN + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9984 ) + END IF + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9999 )THRESH + WRITE( NOUT, FMT = * ) + + RORDER = .FALSE. + CORDER = .FALSE. + IF (LAYOUT.EQ.2) THEN + RORDER = .TRUE. + CORDER = .TRUE. + WRITE( *, FMT = 10002 ) + ELSE IF (LAYOUT.EQ.1) THEN + RORDER = .TRUE. + WRITE( *, FMT = 10001 ) + ELSE IF (LAYOUT.EQ.0) THEN + CORDER = .TRUE. + WRITE( *, FMT = 10000 ) + END IF + WRITE( *, FMT = * ) + +* +* Read names of subroutines and flags which indicate +* whether they are to be tested. +* + DO 20 I = 1, NSUBS + LTEST( I ) = .FALSE. + 20 CONTINUE + 30 READ( NIN, FMT = 9988, END = 60 )SNAMET, LTESTT + DO 40 I = 1, NSUBS + IF( SNAMET.EQ.SNAMES( I ) ) + $ GO TO 50 + 40 CONTINUE + WRITE( NOUT, FMT = 9990 )SNAMET + STOP + 50 LTEST( I ) = LTESTT + GO TO 30 +* + 60 CONTINUE + CLOSE ( NIN ) +* +* Compute EPS (the machine precision). +* + EPS = ONE + 70 CONTINUE + IF( DDIFF( ONE + EPS, ONE ).EQ.ZERO ) + $ GO TO 80 + EPS = HALF*EPS + GO TO 70 + 80 CONTINUE + EPS = EPS + EPS + WRITE( NOUT, FMT = 9998 )EPS +* +* Check the reliability of DMMCH using exact data. +* + N = MIN( 32, NMAX ) + DO 100 J = 1, N + DO 90 I = 1, N + AB( I, J ) = MAX( I - J + 1, 0 ) + 90 CONTINUE + AB( J, NMAX + 1 ) = J + AB( 1, NMAX + J ) = J + C( J, 1 ) = ZERO + 100 CONTINUE + DO 110 J = 1, N + CC( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 + 110 CONTINUE +* CC holds the exact result. On exit from DMMCH CT holds +* the result computed by DMMCH. + TRANSA = 'N' + TRANSB = 'N' + CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LDE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + TRANSB = 'T' + CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LDE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + DO 120 J = 1, N + AB( J, NMAX + 1 ) = N - J + 1 + AB( 1, NMAX + J ) = N - J + 1 + 120 CONTINUE + DO 130 J = 1, N + CC( N - J + 1 ) = J*( ( J + 1 )*J )/2 - + $ ( ( J + 1 )*J*( J - 1 ) )/3 + 130 CONTINUE + TRANSA = 'T' + TRANSB = 'N' + CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LDE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + TRANSB = 'T' + CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LDE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF +* +* Test each subroutine in turn. +* + DO 200 ISNUM = 1, NSUBS + WRITE( NOUT, FMT = * ) + IF( .NOT.LTEST( ISNUM ) )THEN +* Subprogram is not to be tested. + WRITE( NOUT, FMT = 9987 )SNAMES( ISNUM ) + ELSE + SRNAMT = SNAMES( ISNUM ) +* Test error exits. + IF( TSTERR )THEN + CALL CD3CHKE( SNAMES( ISNUM ) ) + WRITE( NOUT, FMT = * ) + END IF +* Test computations. + INFOT = 0 + OK = .TRUE. + FATAL = .FALSE. 
+ GO TO ( 140, 150, 160, 160, 170, 180 )ISNUM +* Test DGEMM, 01. + 140 IF (CORDER) THEN + CALL DCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 0 ) + END IF + IF (RORDER) THEN + CALL DCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 1 ) + END IF + GO TO 190 +* Test DSYMM, 02. + 150 IF (CORDER) THEN + CALL DCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 0 ) + END IF + IF (RORDER) THEN + CALL DCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 1 ) + END IF + GO TO 190 +* Test DTRMM, 03, DTRSM, 04. + 160 IF (CORDER) THEN + CALL DCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, + $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C, + $ 0 ) + END IF + IF (RORDER) THEN + CALL DCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, + $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C, + $ 1 ) + END IF + GO TO 190 +* Test DSYRK, 05. + 170 IF (CORDER) THEN + CALL DCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 0 ) + END IF + IF (RORDER) THEN + CALL DCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 1 ) + END IF + GO TO 190 +* Test DSYR2K, 06. 
+ 180 IF (CORDER) THEN + CALL DCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, + $ 0 ) + END IF + IF (RORDER) THEN + CALL DCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, + $ 1 ) + END IF + GO TO 190 +* + 190 IF( FATAL.AND.SFATAL ) + $ GO TO 210 + END IF + 200 CONTINUE + WRITE( NOUT, FMT = 9986 ) + GO TO 230 +* + 210 CONTINUE + WRITE( NOUT, FMT = 9985 ) + GO TO 230 +* + 220 CONTINUE + WRITE( NOUT, FMT = 9991 ) +* + 230 CONTINUE + IF( TRACE ) + $ CLOSE ( NTRA ) + CLOSE ( NOUT ) + STOP +* +10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) +10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' ) +10000 FORMAT( ' COLUMN-MAJOR DATA LAYOUT IS TESTED' ) + 9999 FORMAT( ' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', + $ 'S THAN', F8.2 ) + 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, D9.1 ) + 9997 FORMAT( ' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', + $ 'THAN ', I2 ) + 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) + 9995 FORMAT( ' TESTS OF THE DOUBLE PRECISION LEVEL 3 BLAS', //' THE F', + $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) + 9994 FORMAT( ' FOR N ', 9I6 ) + 9993 FORMAT( ' FOR ALPHA ', 7F6.1 ) + 9992 FORMAT( ' FOR BETA ', 7F6.1 ) + 9991 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', + $ /' ******* TESTS ABANDONED *******' ) + 9990 FORMAT( ' SUBPROGRAM NAME ', A12,' NOT RECOGNIZED', /' ******* T', + $ 'ESTS ABANDONED *******' ) + 9989 FORMAT( ' ERROR IN DMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', + $ 'ATED WRONGLY.', /' DMMCH WAS CALLED WITH TRANSA = ', A1, + $ ' AND TRANSB = ', A1, /' AND RETURNED SAME = ', L1, ' AND ', + $ 'ERR = ', F12.3, '.', /' THIS MAY BE DUE TO FAULTS IN THE ', + $ 'ARITHMETIC OR THE COMPILER.', /' ******* TESTS ABANDONED ', + $ '*******' ) + 9988 FORMAT( A12,L2 ) + 9987 FORMAT( 1X, A12,' WAS NOT TESTED' ) + 9986 FORMAT( /' END OF TESTS' ) + 9985 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) + 9984 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) +* +* End of DBLAT3. +* + END + SUBROUTINE DCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, IORDER) +* +* Tests DGEMM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + DOUBLE PRECISION ZERO + PARAMETER ( ZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ), G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. 
+ DOUBLE PRECISION ALPHA, ALS, BETA, BLS, ERR, ERRMAX + INTEGER I, IA, IB, ICA, ICB, IK, IM, IN, K, KS, LAA, + $ LBB, LCC, LDA, LDAS, LDB, LDBS, LDC, LDCS, M, + $ MA, MB, MS, N, NA, NARGS, NB, NC, NS + LOGICAL NULL, RESET, SAME, TRANA, TRANB + CHARACTER*1 TRANAS, TRANBS, TRANSA, TRANSB + CHARACTER*3 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. + EXTERNAL CDGEMM, DMAKE, DMMCH +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICH/'NTC'/ +* .. Executable Statements .. +* + NARGS = 13 + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 110 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = M + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 100 + LCC = LDC*N + NULL = N.LE.0.OR.M.LE.0 +* + DO 90 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 80 ICA = 1, 3 + TRANSA = ICH( ICA: ICA ) + TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' +* + IF( TRANA )THEN + MA = K + NA = M + ELSE + MA = M + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* +* Generate the matrix A. +* + CALL DMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 70 ICB = 1, 3 + TRANSB = ICH( ICB: ICB ) + TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' +* + IF( TRANB )THEN + MB = N + NB = K + ELSE + MB = K + NB = N + END IF +* Set LDB to 1 more than minimum value if room. + LDB = MB + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 70 + LBB = LDB*NB +* +* Generate the matrix B. +* + CALL DMAKE( 'GE', ' ', ' ', MB, NB, B, NMAX, BB, + $ LDB, RESET, ZERO ) +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL DMAKE( 'GE', ' ', ' ', M, N, C, NMAX, + $ CC, LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + TRANAS = TRANSA + TRANBS = TRANSB + MS = M + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BLS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ CALL DPRCN1(NTRA, NC, SNAME, IORDER, + $ TRANSA, TRANSB, M, N, K, ALPHA, LDA, + $ LDB, BETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CDGEMM( IORDER, TRANSA, TRANSB, M, N, + $ K, ALPHA, AA, LDA, BB, LDB, + $ BETA, CC, LDC ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. 
+* + ISAME( 1 ) = TRANSA.EQ.TRANAS + ISAME( 2 ) = TRANSB.EQ.TRANBS + ISAME( 3 ) = MS.EQ.M + ISAME( 4 ) = NS.EQ.N + ISAME( 5 ) = KS.EQ.K + ISAME( 6 ) = ALS.EQ.ALPHA + ISAME( 7 ) = LDE( AS, AA, LAA ) + ISAME( 8 ) = LDAS.EQ.LDA + ISAME( 9 ) = LDE( BS, BB, LBB ) + ISAME( 10 ) = LDBS.EQ.LDB + ISAME( 11 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 12 ) = LDE( CS, CC, LCC ) + ELSE + ISAME( 12 ) = LDERES( 'GE', ' ', M, N, CS, + $ CC, LDC ) + END IF + ISAME( 13 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report +* and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL DMMCH( TRANSA, TRANSB, M, N, K, + $ ALPHA, A, NMAX, B, NMAX, BETA, + $ C, NMAX, CT, G, CC, LDC, EPS, + $ ERR, FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 120 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + CALL DPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB, + $ M, N, K, ALPHA, LDA, LDB, BETA, LDC) +* + 130 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A12,'(''', A1, ''',''', A1, ''',', + $ 3( I3, ',' ), F4.1, ', A,', I3, ', B,', I3, ',', F4.1, ', ', + $ 'C,', I3, ').' ) + 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK1. 
+* + END + SUBROUTINE DPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB, M, N, + $ K, ALPHA, LDA, LDB, BETA, LDC) + INTEGER NOUT, NC, IORDER, M, N, K, LDA, LDB, LDC + DOUBLE PRECISION ALPHA, BETA + CHARACTER*1 TRANSA, TRANSB + CHARACTER*12 SNAME + CHARACTER*14 CRC, CTA,CTB + + IF (TRANSA.EQ.'N')THEN + CTA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CTA = ' CblasTrans' + ELSE + CTA = 'CblasConjTrans' + END IF + IF (TRANSB.EQ.'N')THEN + CTB = ' CblasNoTrans' + ELSE IF (TRANSB.EQ.'T')THEN + CTB = ' CblasTrans' + ELSE + CTB = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CTA,CTB + WRITE(NOUT, FMT = 9994)M, N, K, ALPHA, LDA, LDB, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', A14, ',', A14, ',', A14, ',') + 9994 FORMAT( 20X, 3( I3, ',' ), F4.1, ', A,', I3, ', B,', I3, ',', + $ F4.1, ', ', 'C,', I3, ').' ) + END +* + SUBROUTINE DCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, IORDER) +* +* Tests DSYMM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + DOUBLE PRECISION ZERO + PARAMETER ( ZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ), G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + DOUBLE PRECISION ALPHA, ALS, BETA, BLS, ERR, ERRMAX + INTEGER I, IA, IB, ICS, ICU, IM, IN, LAA, LBB, LCC, + $ LDA, LDAS, LDB, LDBS, LDC, LDCS, M, MS, N, NA, + $ NARGS, NC, NS + LOGICAL LEFT, NULL, RESET, SAME + CHARACTER*1 SIDE, SIDES, UPLO, UPLOS + CHARACTER*2 ICHS, ICHU +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. + EXTERNAL DMAKE, DMMCH, CDSYMM +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICHS/'LR'/, ICHU/'UL'/ +* .. Executable Statements .. +* + NARGS = 12 + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 100 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 90 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = M + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 90 + LCC = LDC*N + NULL = N.LE.0.OR.M.LE.0 +* +* Set LDB to 1 more than minimum value if room. + LDB = M + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 90 + LBB = LDB*N +* +* Generate the matrix B. +* + CALL DMAKE( 'GE', ' ', ' ', M, N, B, NMAX, BB, LDB, RESET, + $ ZERO ) +* + DO 80 ICS = 1, 2 + SIDE = ICHS( ICS: ICS ) + LEFT = SIDE.EQ.'L' +* + IF( LEFT )THEN + NA = M + ELSE + NA = N + END IF +* Set LDA to 1 more than minimum value if room. + LDA = NA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. 
+ IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* + DO 70 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* +* Generate the symmetric matrix A. +* + CALL DMAKE( 'SY', UPLO, ' ', NA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL DMAKE( 'GE', ' ', ' ', M, N, C, NMAX, CC, + $ LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + SIDES = SIDE + UPLOS = UPLO + MS = M + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BLS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ CALL DPRCN2(NTRA, NC, SNAME, IORDER, + $ SIDE, UPLO, M, N, ALPHA, LDA, LDB, + $ BETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CDSYMM( IORDER, SIDE, UPLO, M, N, ALPHA, + $ AA, LDA, BB, LDB, BETA, CC, LDC ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 110 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = SIDES.EQ.SIDE + ISAME( 2 ) = UPLOS.EQ.UPLO + ISAME( 3 ) = MS.EQ.M + ISAME( 4 ) = NS.EQ.N + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LDE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = LDE( BS, BB, LBB ) + ISAME( 9 ) = LDBS.EQ.LDB + ISAME( 10 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 11 ) = LDE( CS, CC, LCC ) + ELSE + ISAME( 11 ) = LDERES( 'GE', ' ', M, N, CS, + $ CC, LDC ) + END IF + ISAME( 12 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 110 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + IF( LEFT )THEN + CALL DMMCH( 'N', 'N', M, N, M, ALPHA, A, + $ NMAX, B, NMAX, BETA, C, NMAX, + $ CT, G, CC, LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL DMMCH( 'N', 'N', M, N, N, ALPHA, B, + $ NMAX, A, NMAX, BETA, C, NMAX, + $ CT, G, CC, LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 120 +* + 110 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + CALL DPRCN2(NOUT, NC, SNAME, IORDER, SIDE, UPLO, M, N, ALPHA, LDA, + $ LDB, BETA, LDC) +* + 120 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ F4.1, ', A,', I3, ', B,', I3, ',', F4.1, ', C,', I3, ') ', + $ ' .' ) + 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK2. +* + END +* + SUBROUTINE DPRCN2(NOUT, NC, SNAME, IORDER, SIDE, UPLO, M, N, + $ ALPHA, LDA, LDB, BETA, LDC) + INTEGER NOUT, NC, IORDER, M, N, LDA, LDB, LDC + DOUBLE PRECISION ALPHA, BETA + CHARACTER*1 SIDE, UPLO + CHARACTER*12 SNAME + CHARACTER*14 CRC, CS,CU + + IF (SIDE.EQ.'L')THEN + CS = ' CblasLeft' + ELSE + CS = ' CblasRight' + END IF + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU + WRITE(NOUT, FMT = 9994)M, N, ALPHA, LDA, LDB, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', A14, ',', A14, ',', A14, ',') + 9994 FORMAT( 20X, 2( I3, ',' ), F4.1, ', A,', I3, ', B,', I3, ',', + $ F4.1, ', ', 'C,', I3, ').' ) + END +* + SUBROUTINE DCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NMAX, A, AA, AS, + $ B, BB, BS, CT, G, C, IORDER ) +* +* Tests DTRMM and DTRSM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CT( NMAX ), G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + DOUBLE PRECISION ALPHA, ALS, ERR, ERRMAX + INTEGER I, IA, ICD, ICS, ICT, ICU, IM, IN, J, LAA, LBB, + $ LDA, LDAS, LDB, LDBS, M, MS, N, NA, NARGS, NC, + $ NS + LOGICAL LEFT, NULL, RESET, SAME + CHARACTER*1 DIAG, DIAGS, SIDE, SIDES, TRANAS, TRANSA, UPLO, + $ UPLOS + CHARACTER*2 ICHD, ICHS, ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. 
+ LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. + EXTERNAL DMAKE, DMMCH, CDTRMM, CDTRSM +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/, ICHS/'LR'/ +* .. Executable Statements .. +* + NARGS = 11 + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* Set up zero matrix for DMMCH. + DO 20 J = 1, NMAX + DO 10 I = 1, NMAX + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE +* + DO 140 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 130 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDB to 1 more than minimum value if room. + LDB = M + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 130 + LBB = LDB*N + NULL = M.LE.0.OR.N.LE.0 +* + DO 120 ICS = 1, 2 + SIDE = ICHS( ICS: ICS ) + LEFT = SIDE.EQ.'L' + IF( LEFT )THEN + NA = M + ELSE + NA = N + END IF +* Set LDA to 1 more than minimum value if room. + LDA = NA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 130 + LAA = LDA*NA +* + DO 110 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* + DO 100 ICT = 1, 3 + TRANSA = ICHT( ICT: ICT ) +* + DO 90 ICD = 1, 2 + DIAG = ICHD( ICD: ICD ) +* + DO 80 IA = 1, NALF + ALPHA = ALF( IA ) +* +* Generate the matrix A. +* + CALL DMAKE( 'TR', UPLO, DIAG, NA, NA, A, + $ NMAX, AA, LDA, RESET, ZERO ) +* +* Generate the matrix B. +* + CALL DMAKE( 'GE', ' ', ' ', M, N, B, NMAX, + $ BB, LDB, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + SIDES = SIDE + UPLOS = UPLO + TRANAS = TRANSA + DIAGS = DIAG + MS = M + NS = N + ALS = ALPHA + DO 30 I = 1, LAA + AS( I ) = AA( I ) + 30 CONTINUE + LDAS = LDA + DO 40 I = 1, LBB + BS( I ) = BB( I ) + 40 CONTINUE + LDBS = LDB +* +* Call the subroutine. +* + IF( SNAME( 10: 11 ).EQ.'mm' )THEN + IF( TRACE ) + $ CALL DPRCN3( NTRA, NC, SNAME, IORDER, + $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, + $ LDA, LDB) + IF( REWI ) + $ REWIND NTRA + CALL CDTRMM( IORDER, SIDE, UPLO, TRANSA, + $ DIAG, M, N, ALPHA, AA, LDA, + $ BB, LDB ) + ELSE IF( SNAME( 10: 11 ).EQ.'sm' )THEN + IF( TRACE ) + $ CALL DPRCN3( NTRA, NC, SNAME, IORDER, + $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, + $ LDA, LDB) + IF( REWI ) + $ REWIND NTRA + CALL CDTRSM( IORDER, SIDE, UPLO, TRANSA, + $ DIAG, M, N, ALPHA, AA, LDA, + $ BB, LDB ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 150 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = SIDES.EQ.SIDE + ISAME( 2 ) = UPLOS.EQ.UPLO + ISAME( 3 ) = TRANAS.EQ.TRANSA + ISAME( 4 ) = DIAGS.EQ.DIAG + ISAME( 5 ) = MS.EQ.M + ISAME( 6 ) = NS.EQ.N + ISAME( 7 ) = ALS.EQ.ALPHA + ISAME( 8 ) = LDE( AS, AA, LAA ) + ISAME( 9 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 10 ) = LDE( BS, BB, LBB ) + ELSE + ISAME( 10 ) = LDERES( 'GE', ' ', M, N, BS, + $ BB, LDB ) + END IF + ISAME( 11 ) = LDBS.EQ.LDB +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 50 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 50 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 150 + END IF +* + IF( .NOT.NULL )THEN + IF( SNAME( 10: 11 ).EQ.'mm' )THEN +* +* Check the result. 
+* + IF( LEFT )THEN + CALL DMMCH( TRANSA, 'N', M, N, M, + $ ALPHA, A, NMAX, B, NMAX, + $ ZERO, C, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL DMMCH( 'N', TRANSA, M, N, N, + $ ALPHA, B, NMAX, A, NMAX, + $ ZERO, C, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + ELSE IF( SNAME( 10: 11 ).EQ.'sm' )THEN +* +* Compute approximation to original +* matrix. +* + DO 70 J = 1, N + DO 60 I = 1, M + C( I, J ) = BB( I + ( J - 1 )* + $ LDB ) + BB( I + ( J - 1 )*LDB ) = ALPHA* + $ B( I, J ) + 60 CONTINUE + 70 CONTINUE +* + IF( LEFT )THEN + CALL DMMCH( TRANSA, 'N', M, N, M, + $ ONE, A, NMAX, C, NMAX, + $ ZERO, B, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .FALSE. ) + ELSE + CALL DMMCH( 'N', TRANSA, M, N, N, + $ ONE, C, NMAX, A, NMAX, + $ ZERO, B, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .FALSE. ) + END IF + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 150 + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* + 140 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 160 +* + 150 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + CALL DPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG, + $ M, N, ALPHA, LDA, LDB) +* + 160 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A12,'(', 4( '''', A1, ''',' ), 2( I3, ',' ), + $ F4.1, ', A,', I3, ', B,', I3, ') .' ) + 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK3. 
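For the trsm branch, DCHK3 above verifies the solve indirectly: the output X of op(A)*X = alpha*B is multiplied back through DMMCH with ALPHA = ONE and compared against alpha*B, so no explicit inverse is ever formed. Below is a minimal C sketch of the same round-trip, assuming a CBLAS header; the 4x3 sizes, the sample triangular matrix and the tolerance are illustrative only.

    /* Solve op(A)*X = alpha*B with dtrsm, multiply back with dtrmm,
     * and compare against alpha*B.  Assumes cblas.h. */
    #include <math.h>
    #include <stdio.h>
    #include <cblas.h>

    int main(void)
    {
        enum { M = 4, NRHS = 3 };
        double a[M*M], b[M*NRHS], x[M*NRHS], alpha = 2.0, maxdiff = 0.0;

        /* Well-conditioned upper triangular A (column major), arbitrary B. */
        for (int j = 0; j < M; j++)
            for (int i = 0; i < M; i++)
                a[i + j*M] = (i == j) ? 2.0 : (i < j ? 0.5 : 0.0);
        for (int i = 0; i < M*NRHS; i++) x[i] = b[i] = (i % 7) - 3.0;

        /* X := alpha * inv(A) * B   (X starts as a copy of B). */
        cblas_dtrsm(CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans,
                    CblasNonUnit, M, NRHS, alpha, a, M, x, M);
        /* X := A * X, which should reproduce alpha*B. */
        cblas_dtrmm(CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans,
                    CblasNonUnit, M, NRHS, 1.0, a, M, x, M);

        for (int i = 0; i < M*NRHS; i++)
            maxdiff = fmax(maxdiff, fabs(x[i] - alpha*b[i]));
        printf("max |A*X - alpha*B| = %g\n", maxdiff);
        return maxdiff > 1e-10;
    }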
+* + END +* + SUBROUTINE DPRCN3(NOUT, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, + $ DIAG, M, N, ALPHA, LDA, LDB) + INTEGER NOUT, NC, IORDER, M, N, LDA, LDB + DOUBLE PRECISION ALPHA + CHARACTER*1 SIDE, UPLO, TRANSA, DIAG + CHARACTER*12 SNAME + CHARACTER*14 CRC, CS, CU, CA, CD + + IF (SIDE.EQ.'L')THEN + CS = ' CblasLeft' + ELSE + CS = ' CblasRight' + END IF + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (DIAG.EQ.'N')THEN + CD = ' CblasNonUnit' + ELSE + CD = ' CblasUnit' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU + WRITE(NOUT, FMT = 9994)CA, CD, M, N, ALPHA, LDA, LDB + + 9995 FORMAT( 1X, I6, ': ', A12,'(', A14, ',', A14, ',', A14, ',') + 9994 FORMAT( 22X, 2( A14, ',') , 2( I3, ',' ), + $ F4.1, ', A,', I3, ', B,', I3, ').' ) + END +* + SUBROUTINE DCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, IORDER) +* +* Tests DSYRK. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + DOUBLE PRECISION ZERO + PARAMETER ( ZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ), G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + DOUBLE PRECISION ALPHA, ALS, BETA, BETS, ERR, ERRMAX + INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, K, KS, + $ LAA, LCC, LDA, LDAS, LDC, LDCS, LJ, MA, N, NA, + $ NARGS, NC, NS + LOGICAL NULL, RESET, SAME, TRAN, UPPER + CHARACTER*1 TRANS, TRANSS, UPLO, UPLOS + CHARACTER*2 ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. + EXTERNAL DMAKE, DMMCH, CDSYRK +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICHT/'NTC'/, ICHU/'UL'/ +* .. Executable Statements .. +* + NARGS = 10 + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = N + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 100 + LCC = LDC*N + NULL = N.LE.0 +* + DO 90 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 80 ICT = 1, 3 + TRANS = ICHT( ICT: ICT ) + TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' + IF( TRAN )THEN + MA = K + NA = N + ELSE + MA = N + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* +* Generate the matrix A. 
+* + CALL DMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 70 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + UPPER = UPLO.EQ.'U' +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL DMAKE( 'SY', UPLO, ' ', N, N, C, NMAX, CC, + $ LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + BETS = BETA + DO 20 I = 1, LCC + CS( I ) = CC( I ) + 20 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ CALL DPRCN4( NTRA, NC, SNAME, IORDER, UPLO, + $ TRANS, N, K, ALPHA, LDA, BETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CDSYRK( IORDER, UPLO, TRANS, N, K, ALPHA, + $ AA, LDA, BETA, CC, LDC ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLOS.EQ.UPLO + ISAME( 2 ) = TRANSS.EQ.TRANS + ISAME( 3 ) = NS.EQ.N + ISAME( 4 ) = KS.EQ.K + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LDE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = BETS.EQ.BETA + IF( NULL )THEN + ISAME( 9 ) = LDE( CS, CC, LCC ) + ELSE + ISAME( 9 ) = LDERES( 'SY', UPLO, N, N, CS, + $ CC, LDC ) + END IF + ISAME( 10 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 30 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 30 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + JC = 1 + DO 40 J = 1, N + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + IF( TRAN )THEN + CALL DMMCH( 'T', 'N', LJ, 1, K, ALPHA, + $ A( 1, JJ ), NMAX, + $ A( 1, J ), NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL DMMCH( 'N', 'T', LJ, 1, K, ALPHA, + $ A( JJ, 1 ), NMAX, + $ A( J, 1 ), NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + IF( UPPER )THEN + JC = JC + LDC + ELSE + JC = JC + LDC + 1 + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 110 + 40 CONTINUE + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 110 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9995 )J +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + CALL DPRCN4( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, ALPHA, + $ LDA, BETA, LDC) +* + 130 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ') .' ) + 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK4. +* + END +* + SUBROUTINE DPRCN4(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, + $ N, K, ALPHA, LDA, BETA, LDC) + INTEGER NOUT, NC, IORDER, N, K, LDA, LDC + DOUBLE PRECISION ALPHA, BETA + CHARACTER*1 UPLO, TRANSA + CHARACTER*12 SNAME + CHARACTER*14 CRC, CU, CA + + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA + WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') ) + 9994 FORMAT( 20X, 2( I3, ',' ), + $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ').' ) + END +* + SUBROUTINE DCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, + $ IORDER ) +* +* Tests DSYR2K. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + DOUBLE PRECISION ZERO + PARAMETER ( ZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + DOUBLE PRECISION AA( NMAX*NMAX ), AB( 2*NMAX*NMAX ), + $ ALF( NALF ), AS( NMAX*NMAX ), BB( NMAX*NMAX ), + $ BET( NBET ), BS( NMAX*NMAX ), C( NMAX, NMAX ), + $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), + $ G( NMAX ), W( 2*NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. 
+ DOUBLE PRECISION ALPHA, ALS, BETA, BETS, ERR, ERRMAX + INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, JJAB, + $ K, KS, LAA, LBB, LCC, LDA, LDAS, LDB, LDBS, + $ LDC, LDCS, LJ, MA, N, NA, NARGS, NC, NS + LOGICAL NULL, RESET, SAME, TRAN, UPPER + CHARACTER*1 TRANS, TRANSS, UPLO, UPLOS + CHARACTER*2 ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. + EXTERNAL DMAKE, DMMCH, CDSYR2K +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICHT/'NTC'/, ICHU/'UL'/ +* .. Executable Statements .. +* + NARGS = 12 + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 130 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = N + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 130 + LCC = LDC*N + NULL = N.LE.0 +* + DO 120 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 110 ICT = 1, 3 + TRANS = ICHT( ICT: ICT ) + TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' + IF( TRAN )THEN + MA = K + NA = N + ELSE + MA = N + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 110 + LAA = LDA*NA +* +* Generate the matrix A. +* + IF( TRAN )THEN + CALL DMAKE( 'GE', ' ', ' ', MA, NA, AB, 2*NMAX, AA, + $ LDA, RESET, ZERO ) + ELSE + CALL DMAKE( 'GE', ' ', ' ', MA, NA, AB, NMAX, AA, LDA, + $ RESET, ZERO ) + END IF +* +* Generate the matrix B. +* + LDB = LDA + LBB = LAA + IF( TRAN )THEN + CALL DMAKE( 'GE', ' ', ' ', MA, NA, AB( K + 1 ), + $ 2*NMAX, BB, LDB, RESET, ZERO ) + ELSE + CALL DMAKE( 'GE', ' ', ' ', MA, NA, AB( K*NMAX + 1 ), + $ NMAX, BB, LDB, RESET, ZERO ) + END IF +* + DO 100 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + UPPER = UPLO.EQ.'U' +* + DO 90 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 80 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL DMAKE( 'SY', UPLO, ' ', N, N, C, NMAX, CC, + $ LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BETS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ CALL DPRCN5( NTRA, NC, SNAME, IORDER, UPLO, + $ TRANS, N, K, ALPHA, LDA, LDB, BETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CDSYR2K( IORDER, UPLO, TRANS, N, K, + $ ALPHA, AA, LDA, BB, LDB, BETA, + $ CC, LDC ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 150 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLOS.EQ.UPLO + ISAME( 2 ) = TRANSS.EQ.TRANS + ISAME( 3 ) = NS.EQ.N + ISAME( 4 ) = KS.EQ.K + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LDE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = LDE( BS, BB, LBB ) + ISAME( 9 ) = LDBS.EQ.LDB + ISAME( 10 ) = BETS.EQ.BETA + IF( NULL )THEN + ISAME( 11 ) = LDE( CS, CC, LCC ) + ELSE + ISAME( 11 ) = LDERES( 'SY', UPLO, N, N, CS, + $ CC, LDC ) + END IF + ISAME( 12 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. 
+ DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 150 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + JJAB = 1 + JC = 1 + DO 70 J = 1, N + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + IF( TRAN )THEN + DO 50 I = 1, K + W( I ) = AB( ( J - 1 )*2*NMAX + K + + $ I ) + W( K + I ) = AB( ( J - 1 )*2*NMAX + + $ I ) + 50 CONTINUE + CALL DMMCH( 'T', 'N', LJ, 1, 2*K, + $ ALPHA, AB( JJAB ), 2*NMAX, + $ W, 2*NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + DO 60 I = 1, K + W( I ) = AB( ( K + I - 1 )*NMAX + + $ J ) + W( K + I ) = AB( ( I - 1 )*NMAX + + $ J ) + 60 CONTINUE + CALL DMMCH( 'N', 'N', LJ, 1, 2*K, + $ ALPHA, AB( JJ ), NMAX, W, + $ 2*NMAX, BETA, C( JJ, J ), + $ NMAX, CT, G, CC( JC ), LDC, + $ EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + END IF + IF( UPPER )THEN + JC = JC + LDC + ELSE + JC = JC + LDC + 1 + IF( TRAN ) + $ JJAB = JJAB + 2*NMAX + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 140 + 70 CONTINUE + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 160 +* + 140 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9995 )J +* + 150 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + CALL DPRCN5( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, ALPHA, + $ LDA, LDB, BETA, LDC) +* + 160 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ F4.1, ', A,', I3, ', B,', I3, ',', F4.1, ', C,', I3, ') ', + $ ' .' ) + 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK5. 
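DCHK5 above checks the rank-2k relation C := alpha*(A*B**T + B*A**T) + beta*C column by column, referencing only the selected triangle of C. The short stand-alone C comparison below pits cblas_dsyr2k against a direct evaluation of that formula; it assumes a CBLAS header, and the sizes and tolerance are illustrative only.

    /* Compare cblas_dsyr2k against the defining formula on the upper
     * triangle.  Assumes cblas.h. */
    #include <math.h>
    #include <stdio.h>
    #include <cblas.h>

    int main(void)
    {
        enum { N = 3, K = 2 };
        double a[N*K], b[N*K], c[N*N], ref[N*N];
        double alpha = 1.0, beta = 0.5, maxdiff = 0.0;

        for (int i = 0; i < N*K; i++) { a[i] = i + 1.0; b[i] = 2.0*i - 3.0; }
        for (int i = 0; i < N*N; i++) c[i] = ref[i] = 1.0;

        cblas_dsyr2k(CblasColMajor, CblasUpper, CblasNoTrans, N, K,
                     alpha, a, N, b, N, beta, c, N);

        /* Reference update of the upper triangle only (column major). */
        for (int j = 0; j < N; j++)
            for (int i = 0; i <= j; i++) {
                double s = 0.0;
                for (int l = 0; l < K; l++)
                    s += a[i + l*N]*b[j + l*N] + b[i + l*N]*a[j + l*N];
                ref[i + j*N] = alpha*s + beta*ref[i + j*N];
            }

        for (int j = 0; j < N; j++)
            for (int i = 0; i <= j; i++)
                maxdiff = fmax(maxdiff, fabs(c[i + j*N] - ref[i + j*N]));
        printf("max difference in upper triangle = %g\n", maxdiff);
        return maxdiff > 1e-12;
    }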
+* + END +* + SUBROUTINE DPRCN5(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, + $ N, K, ALPHA, LDA, LDB, BETA, LDC) + INTEGER NOUT, NC, IORDER, N, K, LDA, LDB, LDC + DOUBLE PRECISION ALPHA, BETA + CHARACTER*1 UPLO, TRANSA + CHARACTER*12 SNAME + CHARACTER*14 CRC, CU, CA + + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA + WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, LDB, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') ) + 9994 FORMAT( 20X, 2( I3, ',' ), + $ F4.1, ', A,', I3, ', B', I3, ',', F4.1, ', C,', I3, ').' ) + END +* + SUBROUTINE DMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, RESET, + $ TRANSL ) +* +* Generates values for an M by N matrix A. +* Stores the values in the array AA in the data structure required +* by the routine, with unwanted elements set to rogue value. +* +* TYPE is 'GE', 'SY' or 'TR'. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) + DOUBLE PRECISION ROGUE + PARAMETER ( ROGUE = -1.0D10 ) +* .. Scalar Arguments .. + DOUBLE PRECISION TRANSL + INTEGER LDA, M, N, NMAX + LOGICAL RESET + CHARACTER*1 DIAG, UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, * ), AA( * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J + LOGICAL GEN, LOWER, SYM, TRI, UNIT, UPPER +* .. External Functions .. + DOUBLE PRECISION DBEG + EXTERNAL DBEG +* .. Executable Statements .. + GEN = TYPE.EQ.'GE' + SYM = TYPE.EQ.'SY' + TRI = TYPE.EQ.'TR' + UPPER = ( SYM.OR.TRI ).AND.UPLO.EQ.'U' + LOWER = ( SYM.OR.TRI ).AND.UPLO.EQ.'L' + UNIT = TRI.AND.DIAG.EQ.'U' +* +* Generate data in array A. +* + DO 20 J = 1, N + DO 10 I = 1, M + IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) + $ THEN + A( I, J ) = DBEG( RESET ) + TRANSL + IF( I.NE.J )THEN +* Set some elements to zero + IF( N.GT.3.AND.J.EQ.N/2 ) + $ A( I, J ) = ZERO + IF( SYM )THEN + A( J, I ) = A( I, J ) + ELSE IF( TRI )THEN + A( J, I ) = ZERO + END IF + END IF + END IF + 10 CONTINUE + IF( TRI ) + $ A( J, J ) = A( J, J ) + ONE + IF( UNIT ) + $ A( J, J ) = ONE + 20 CONTINUE +* +* Store elements in array AS in data structure required by routine. +* + IF( TYPE.EQ.'GE' )THEN + DO 50 J = 1, N + DO 30 I = 1, M + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 30 CONTINUE + DO 40 I = M + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 40 CONTINUE + 50 CONTINUE + ELSE IF( TYPE.EQ.'SY'.OR.TYPE.EQ.'TR' )THEN + DO 90 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IF( UNIT )THEN + IEND = J - 1 + ELSE + IEND = J + END IF + ELSE + IF( UNIT )THEN + IBEG = J + 1 + ELSE + IBEG = J + END IF + IEND = N + END IF + DO 60 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 60 CONTINUE + DO 70 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 70 CONTINUE + DO 80 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 80 CONTINUE + 90 CONTINUE + END IF + RETURN +* +* End of DMAKE. 
+* + END + SUBROUTINE DMMCH( TRANSA, TRANSB, M, N, KK, ALPHA, A, LDA, B, LDB, + $ BETA, C, LDC, CT, G, CC, LDCC, EPS, ERR, FATAL, + $ NOUT, MV ) +* +* Checks the results of the computational tests. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION ALPHA, BETA, EPS, ERR + INTEGER KK, LDA, LDB, LDC, LDCC, M, N, NOUT + LOGICAL FATAL, MV + CHARACTER*1 TRANSA, TRANSB +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), B( LDB, * ), C( LDC, * ), + $ CC( LDCC, * ), CT( * ), G( * ) +* .. Local Scalars .. + DOUBLE PRECISION ERRI + INTEGER I, J, K + LOGICAL TRANA, TRANB +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, SQRT +* .. Executable Statements .. + TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' + TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' +* +* Compute expected result, one column at a time, in CT using data +* in A, B and C. +* Compute gauges in G. +* + DO 120 J = 1, N +* + DO 10 I = 1, M + CT( I ) = ZERO + G( I ) = ZERO + 10 CONTINUE + IF( .NOT.TRANA.AND..NOT.TRANB )THEN + DO 30 K = 1, KK + DO 20 I = 1, M + CT( I ) = CT( I ) + A( I, K )*B( K, J ) + G( I ) = G( I ) + ABS( A( I, K ) )*ABS( B( K, J ) ) + 20 CONTINUE + 30 CONTINUE + ELSE IF( TRANA.AND..NOT.TRANB )THEN + DO 50 K = 1, KK + DO 40 I = 1, M + CT( I ) = CT( I ) + A( K, I )*B( K, J ) + G( I ) = G( I ) + ABS( A( K, I ) )*ABS( B( K, J ) ) + 40 CONTINUE + 50 CONTINUE + ELSE IF( .NOT.TRANA.AND.TRANB )THEN + DO 70 K = 1, KK + DO 60 I = 1, M + CT( I ) = CT( I ) + A( I, K )*B( J, K ) + G( I ) = G( I ) + ABS( A( I, K ) )*ABS( B( J, K ) ) + 60 CONTINUE + 70 CONTINUE + ELSE IF( TRANA.AND.TRANB )THEN + DO 90 K = 1, KK + DO 80 I = 1, M + CT( I ) = CT( I ) + A( K, I )*B( J, K ) + G( I ) = G( I ) + ABS( A( K, I ) )*ABS( B( J, K ) ) + 80 CONTINUE + 90 CONTINUE + END IF + DO 100 I = 1, M + CT( I ) = ALPHA*CT( I ) + BETA*C( I, J ) + G( I ) = ABS( ALPHA )*G( I ) + ABS( BETA )*ABS( C( I, J ) ) + 100 CONTINUE +* +* Compute the error ratio for this result. +* + ERR = ZERO + DO 110 I = 1, M + ERRI = ABS( CT( I ) - CC( I, J ) )/EPS + IF( G( I ).NE.ZERO ) + $ ERRI = ERRI/G( I ) + ERR = MAX( ERR, ERRI ) + IF( ERR*SQRT( EPS ).GE.ONE ) + $ GO TO 130 + 110 CONTINUE +* + 120 CONTINUE +* +* If the loop completes, all results are at least half accurate. + GO TO 150 +* +* Report fatal error. +* + 130 FATAL = .TRUE. + WRITE( NOUT, FMT = 9999 ) + DO 140 I = 1, M + IF( MV )THEN + WRITE( NOUT, FMT = 9998 )I, CT( I ), CC( I, J ) + ELSE + WRITE( NOUT, FMT = 9998 )I, CC( I, J ), CT( I ) + END IF + 140 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9997 )J +* + 150 CONTINUE + RETURN +* + 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', + $ 'F ACCURATE *******', /' EXPECTED RESULT COMPU', + $ 'TED RESULT' ) + 9998 FORMAT( 1X, I7, 2G18.6 ) + 9997 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) +* +* End of DMMCH. +* + END + LOGICAL FUNCTION LDE( RI, RJ, LR ) +* +* Tests if two arrays are identical. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. 
+ INTEGER LR +* .. Array Arguments .. + DOUBLE PRECISION RI( * ), RJ( * ) +* .. Local Scalars .. + INTEGER I +* .. Executable Statements .. + DO 10 I = 1, LR + IF( RI( I ).NE.RJ( I ) ) + $ GO TO 20 + 10 CONTINUE + LDE = .TRUE. + GO TO 30 + 20 CONTINUE + LDE = .FALSE. + 30 RETURN +* +* End of LDE. +* + END + LOGICAL FUNCTION LDERES( TYPE, UPLO, M, N, AA, AS, LDA ) +* +* Tests if selected elements in two arrays are equal. +* +* TYPE is 'GE' or 'SY'. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER LDA, M, N + CHARACTER*1 UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + DOUBLE PRECISION AA( LDA, * ), AS( LDA, * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J + LOGICAL UPPER +* .. Executable Statements .. + UPPER = UPLO.EQ.'U' + IF( TYPE.EQ.'GE' )THEN + DO 20 J = 1, N + DO 10 I = M + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 10 CONTINUE + 20 CONTINUE + ELSE IF( TYPE.EQ.'SY' )THEN + DO 50 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 30 I = 1, IBEG - 1 + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 30 CONTINUE + DO 40 I = IEND + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 40 CONTINUE + 50 CONTINUE + END IF +* + 60 CONTINUE + LDERES = .TRUE. + GO TO 80 + 70 CONTINUE + LDERES = .FALSE. + 80 RETURN +* +* End of LDERES. +* + END + DOUBLE PRECISION FUNCTION DBEG( RESET ) +* +* Generates random numbers uniformly distributed between -0.5 and 0.5. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + LOGICAL RESET +* .. Local Scalars .. + INTEGER I, IC, MI +* .. Save statement .. + SAVE I, IC, MI +* .. Executable Statements .. + IF( RESET )THEN +* Initialize local variables. + MI = 891 + I = 7 + IC = 0 + RESET = .FALSE. + END IF +* +* The sequence of values of I is bounded between 1 and 999. +* If initial I = 1,2,3,6,7 or 9, the period will be 50. +* If initial I = 4 or 8, the period will be 25. +* If initial I = 5, the period will be 10. +* IC is used to break up the period by skipping 1 value of I in 6. +* + IC = IC + 1 + 10 I = I*MI + I = I - 1000*( I/1000 ) + IF( IC.GE.5 )THEN + IC = 0 + GO TO 10 + END IF + DBEG = ( I - 500 )/1001.0D0 + RETURN +* +* End of DBEG. +* + END + DOUBLE PRECISION FUNCTION DDIFF( X, Y ) +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + DOUBLE PRECISION X, Y +* .. Executable Statements .. + DDIFF = X - Y + RETURN +* +* End of DDIFF. 
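DBEG above is a small multiplicative congruential generator: I is advanced as I := mod(I*891, 1000), one step is occasionally doubled via the IC counter to break up the short period, and the result is mapped to (I - 500)/1001 in roughly (-0.5, 0.5). The C transcription below follows the same recurrence so the sequence can be inspected; the RESET argument is replaced here by one-time static initialization.

    /* C transcription of the DBEG recurrence (illustration only). */
    #include <stdio.h>

    static double dbeg(void)
    {
        static int i = 7, ic = 0, mi = 891;   /* initial I = 7, as in DBEG */
        ic++;
        i = (i * mi) % 1000;
        if (ic >= 5) {          /* take one extra step and reset IC,   */
            ic = 0;             /* which breaks up the short period    */
            i = (i * mi) % 1000;
        }
        return (i - 500) / 1001.0;
    }

    int main(void)
    {
        for (int k = 0; k < 10; k++)
            printf("%8.4f\n", dbeg());
        return 0;
    }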
+* + END diff --git a/ctest/c_s2chke.c b/ctest/c_s2chke.c new file mode 100644 index 0000000000..b0a48a6f3e --- /dev/null +++ b/ctest/c_s2chke.c @@ -0,0 +1,789 @@ +#include +#include +#include "common.h" +#include "cblas_test.h" + +int cblas_ok, cblas_lerr, cblas_info; +int link_xerbla=TRUE; +char *cblas_rout; + +#ifdef F77_Char +void F77_xerbla(F77_Char F77_srname, void *vinfo); +#else +void F77_xerbla(char *srname, void *vinfo); +#endif + +void chkxer(void) { + extern int cblas_ok, cblas_lerr, cblas_info; + extern int link_xerbla; + extern char *cblas_rout; + if (cblas_lerr == 1 ) { + printf("***** ILLEGAL VALUE OF PARAMETER NUMBER %d NOT DETECTED BY %s *****\n", cblas_info, cblas_rout); + cblas_ok = 0 ; + } + cblas_lerr = 1 ; +} + +void F77_s2chke(char *rout) { + char *sf = ( rout ) ; + float A[2] = {0.0,0.0}, + X[2] = {0.0,0.0}, + Y[2] = {0.0,0.0}, + ALPHA=0.0, BETA=0.0; + extern int cblas_info, cblas_lerr, cblas_ok; + extern int RowMajorStrg; + extern char *cblas_rout; + + if (link_xerbla) /* call these first to link */ + { + cblas_xerbla(cblas_info,cblas_rout,""); + F77_xerbla(cblas_rout,&cblas_info); + } + + cblas_ok = TRUE ; + cblas_lerr = PASSED ; + + if (strncmp( sf,"cblas_sgemv",11)==0) { + cblas_rout = "cblas_sgemv"; + cblas_info = 1; + cblas_sgemv(INVALID, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_sgemv(CblasColMajor, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_sgemv(CblasColMajor, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_sgemv(CblasColMajor, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_sgemv(CblasColMajor, CblasNoTrans, 2, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_sgemv(CblasColMajor, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_sgemv(CblasColMajor, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + + cblas_info = 2; RowMajorStrg = TRUE; RowMajorStrg = TRUE; + cblas_sgemv(CblasRowMajor, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_sgemv(CblasRowMajor, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_sgemv(CblasRowMajor, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_sgemv(CblasRowMajor, CblasNoTrans, 0, 2, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_sgemv(CblasRowMajor, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_sgemv(CblasRowMajor, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_sgbmv",11)==0) { + cblas_rout = "cblas_sgbmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_sgbmv(INVALID, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_sgbmv(CblasColMajor, INVALID, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_sgbmv(CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = 
FALSE; + cblas_sgbmv(CblasColMajor, CblasNoTrans, 0, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_sgbmv(CblasColMajor, CblasNoTrans, 0, 0, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_sgbmv(CblasColMajor, CblasNoTrans, 2, 0, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_sgbmv(CblasColMajor, CblasNoTrans, 0, 0, 1, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_sgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_sgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_sgbmv(CblasRowMajor, INVALID, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_sgbmv(CblasRowMajor, CblasNoTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_sgbmv(CblasRowMajor, CblasNoTrans, 0, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_sgbmv(CblasRowMajor, CblasNoTrans, 0, 0, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_sgbmv(CblasRowMajor, CblasNoTrans, 2, 0, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_sgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 1, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_sgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_sgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_ssymv",11)==0) { + cblas_rout = "cblas_ssymv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_ssymv(INVALID, CblasUpper, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ssymv(CblasColMajor, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ssymv(CblasColMajor, CblasUpper, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ssymv(CblasColMajor, CblasUpper, 2, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ssymv(CblasColMajor, CblasUpper, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_ssymv(CblasColMajor, CblasUpper, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_ssymv(CblasRowMajor, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_ssymv(CblasRowMajor, CblasUpper, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ssymv(CblasRowMajor, CblasUpper, 2, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ssymv(CblasRowMajor, CblasUpper, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_ssymv(CblasRowMajor, CblasUpper, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_ssbmv",11)==0) { 
+ cblas_rout = "cblas_ssbmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_ssbmv(INVALID, CblasUpper, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ssbmv(CblasColMajor, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ssbmv(CblasColMajor, CblasUpper, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ssbmv(CblasColMajor, CblasUpper, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ssbmv(CblasColMajor, CblasUpper, 0, 1, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_ssbmv(CblasColMajor, CblasUpper, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ssbmv(CblasColMajor, CblasUpper, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_ssbmv(CblasRowMajor, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_ssbmv(CblasRowMajor, CblasUpper, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_ssbmv(CblasRowMajor, CblasUpper, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ssbmv(CblasRowMajor, CblasUpper, 0, 1, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_ssbmv(CblasRowMajor, CblasUpper, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ssbmv(CblasRowMajor, CblasUpper, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_sspmv",11)==0) { + cblas_rout = "cblas_sspmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_sspmv(INVALID, CblasUpper, 0, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_sspmv(CblasColMajor, INVALID, 0, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_sspmv(CblasColMajor, CblasUpper, INVALID, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_sspmv(CblasColMajor, CblasUpper, 0, + ALPHA, A, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_sspmv(CblasColMajor, CblasUpper, 0, + ALPHA, A, X, 1, BETA, Y, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_sspmv(CblasRowMajor, INVALID, 0, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_sspmv(CblasRowMajor, CblasUpper, INVALID, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_sspmv(CblasRowMajor, CblasUpper, 0, + ALPHA, A, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_sspmv(CblasRowMajor, CblasUpper, 0, + ALPHA, A, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_strmv",11)==0) { + cblas_rout = "cblas_strmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_strmv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_strmv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_strmv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + 
cblas_strmv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_strmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_strmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, A, 1, X, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_strmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_strmv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_strmv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_strmv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_strmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_strmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, A, 1, X, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_strmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_stbmv",11)==0) { + cblas_rout = "cblas_stbmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_stbmv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_stbmv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_stbmv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_stbmv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_stbmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_stbmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_stbmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 1, A, 1, X, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_stbmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_stbmv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_stbmv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_stbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_stbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_stbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_stbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 1, A, 1, X, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_stbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 0 ); + 
chkxer(); + } else if (strncmp( sf,"cblas_stpmv",11)==0) { + cblas_rout = "cblas_stpmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_stpmv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_stpmv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_stpmv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_stpmv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_stpmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_stpmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_stpmv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_stpmv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_stpmv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_stpmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_stpmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_strsv",11)==0) { + cblas_rout = "cblas_strsv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_strsv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_strsv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_strsv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_strsv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_strsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_strsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, A, 1, X, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_strsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_strsv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_strsv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_strsv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_strsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_strsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, A, 1, X, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_strsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_stbsv",11)==0) { + cblas_rout = "cblas_stbsv"; + cblas_info = 1; RowMajorStrg 
= FALSE; + cblas_stbsv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_stbsv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_stbsv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_stbsv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_stbsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_stbsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_stbsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 1, A, 1, X, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_stbsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_stbsv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_stbsv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_stbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_stbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_stbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_stbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 1, A, 1, X, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_stbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_stpsv",11)==0) { + cblas_rout = "cblas_stpsv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_stpsv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_stpsv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_stpsv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_stpsv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_stpsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_stpsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_stpsv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_stpsv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_stpsv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_stpsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, X, 1 ); + chkxer(); + cblas_info = 8; 
RowMajorStrg = TRUE; + cblas_stpsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_sger",10)==0) { + cblas_rout = "cblas_sger"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_sger(INVALID, 0, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_sger(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_sger(CblasColMajor, 0, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_sger(CblasColMajor, 0, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_sger(CblasColMajor, 0, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_sger(CblasColMajor, 2, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_sger(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_sger(CblasRowMajor, 0, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_sger(CblasRowMajor, 0, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_sger(CblasRowMajor, 0, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_sger(CblasRowMajor, 0, 2, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + } else if (strncmp( sf,"cblas_ssyr2",11)==0) { + cblas_rout = "cblas_ssyr2"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_ssyr2(INVALID, CblasUpper, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ssyr2(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ssyr2(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ssyr2(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ssyr2(CblasColMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ssyr2(CblasColMajor, CblasUpper, 2, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_ssyr2(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_ssyr2(CblasRowMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ssyr2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ssyr2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ssyr2(CblasRowMajor, CblasUpper, 2, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + } else if (strncmp( sf,"cblas_sspr2",11)==0) { + cblas_rout = "cblas_sspr2"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_sspr2(INVALID, CblasUpper, 0, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_sspr2(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_sspr2(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_sspr2(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_sspr2(CblasColMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A ); + 
chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_sspr2(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_sspr2(CblasRowMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_sspr2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_sspr2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A ); + chkxer(); + } else if (strncmp( sf,"cblas_ssyr",10)==0) { + cblas_rout = "cblas_ssyr"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_ssyr(INVALID, CblasUpper, 0, ALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ssyr(CblasColMajor, INVALID, 0, ALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ssyr(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ssyr(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ssyr(CblasColMajor, CblasUpper, 2, ALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_ssyr(CblasRowMajor, INVALID, 0, ALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_ssyr(CblasRowMajor, CblasUpper, INVALID, ALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ssyr(CblasRowMajor, CblasUpper, 0, ALPHA, X, 0, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ssyr(CblasRowMajor, CblasUpper, 2, ALPHA, X, 1, A, 1 ); + chkxer(); + } else if (strncmp( sf,"cblas_sspr",10)==0) { + cblas_rout = "cblas_sspr"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_sspr(INVALID, CblasUpper, 0, ALPHA, X, 1, A ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_sspr(CblasColMajor, INVALID, 0, ALPHA, X, 1, A ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_sspr(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, A ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_sspr(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, A ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_sspr(CblasColMajor, INVALID, 0, ALPHA, X, 1, A ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_sspr(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, A ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_sspr(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, A ); + chkxer(); + } + if (cblas_ok == TRUE) + printf(" %-12s PASSED THE TESTS OF ERROR-EXITS\n", cblas_rout); + else + printf("******* %s FAILED THE TESTS OF ERROR-EXITS *******\n",cblas_rout); +} diff --git a/ctest/c_s3chke.c b/ctest/c_s3chke.c new file mode 100644 index 0000000000..7c832c1dd9 --- /dev/null +++ b/ctest/c_s3chke.c @@ -0,0 +1,1273 @@ +#include +#include +#include "common.h" +#include "cblas_test.h" + +int cblas_ok, cblas_lerr, cblas_info; +int link_xerbla=TRUE; +char *cblas_rout; + +#ifdef F77_Char +void F77_xerbla(F77_Char F77_srname, void *vinfo); +#else +void F77_xerbla(char *srname, void *vinfo); +#endif + +void chkxer(void) { + extern int cblas_ok, cblas_lerr, cblas_info; + extern int link_xerbla; + extern char *cblas_rout; + if (cblas_lerr == 1 ) { + printf("***** ILLEGAL VALUE OF PARAMETER NUMBER %d NOT DETECTED BY %s *****\n", cblas_info, cblas_rout); + cblas_ok = 0 ; + } + cblas_lerr = 1 ; +} + +void F77_s3chke(char *rout) { + char *sf = ( rout ) ; + float A[2] = {0.0,0.0}, + B[2] = {0.0,0.0}, + C[2] = {0.0,0.0}, + ALPHA=0.0, BETA=0.0; + extern int 
cblas_info, cblas_lerr, cblas_ok; + extern int RowMajorStrg; + extern char *cblas_rout; + + if (link_xerbla) /* call these first to link */ + { + cblas_xerbla(cblas_info,cblas_rout,""); + F77_xerbla(cblas_rout,&cblas_info); + } + + cblas_ok = TRUE ; + cblas_lerr = PASSED ; + + if (strncmp( sf,"cblas_sgemm" ,11)==0) { + cblas_rout = "cblas_sgemm" ; + cblas_info = 1; + cblas_sgemm( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_sgemm( INVALID, CblasNoTrans, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_sgemm( INVALID, CblasTrans, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_sgemm( INVALID, CblasTrans, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, INVALID, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, INVALID, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_sgemm( 
CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_sgemm( CblasColMajor, CblasTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + + cblas_info = 4; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + 
cblas_sgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_sgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_ssymm" ,11)==0) { + cblas_rout = "cblas_ssymm" ; + + cblas_info = 1; + cblas_ssymm( INVALID, CblasRight, CblasLower, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, INVALID, CblasUpper, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasLeft, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasLeft, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasRight, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasLeft, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasRight, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasLeft, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasRight, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasLeft, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasRight, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ssymm( 
CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasRight, CblasUpper, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasRight, CblasLower, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasRight, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_ssymm( CblasColMajor, CblasRight, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + + cblas_info = 4; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasLeft, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasRight, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasLeft, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasRight, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasLeft, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasRight, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasLeft, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasRight, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + 
cblas_ssymm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_ssymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_strmm" ,11)==0) { + cblas_rout = "cblas_strmm" ; + + cblas_info = 1; + cblas_strmm( INVALID, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasLeft, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasLeft, CblasUpper, INVALID, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + INVALID, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_strmm( 
CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); 
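+ /* Note on the pattern used throughout these error-exit tests: cblas_info
+  * holds the 1-based position of the deliberately invalid argument in the
+  * call that follows, chkxer() records a failure if the CBLAS error exit
+  * was not taken, and RowMajorStrg records whether the call used
+  * CblasRowMajor so that the expected parameter position can be reconciled
+  * with the argument reordering the row-major interface performs. */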
+ cblas_info = 12; RowMajorStrg = FALSE; + cblas_strmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + + cblas_info = 6; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasLeft, 
CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_strmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_strsm" ,11)==0) { + cblas_rout = "cblas_strsm" ; + + cblas_info = 1; + cblas_strsm( INVALID, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, CblasUpper, INVALID, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + INVALID, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + 
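+ /* Parameter 6 is M and parameter 7 is N in cblas_strsm; each
+  * side/uplo/transpose combination is exercised with an out-of-range
+  * dimension (INVALID) to confirm that the error exit fires. */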
cblas_info = 6; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasRight, 
CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_strsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + + cblas_info = 6; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + 
chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_strsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_ssyrk" ,11)==0) { + cblas_rout = "cblas_ssyrk" ; + + cblas_info = 1; + cblas_ssyrk( INVALID, CblasUpper, CblasNoTrans, + 0, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ssyrk( CblasColMajor, INVALID, CblasNoTrans, + 0, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ssyrk( CblasColMajor, CblasUpper, INVALID, + 0, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ssyrk( CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ssyrk( CblasColMajor, CblasUpper, CblasTrans, + INVALID, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ssyrk( CblasColMajor, CblasLower, CblasNoTrans, + INVALID, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = 
FALSE; + cblas_ssyrk( CblasColMajor, CblasLower, CblasTrans, + INVALID, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ssyrk( CblasColMajor, CblasUpper, CblasNoTrans, + 0, INVALID, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ssyrk( CblasColMajor, CblasUpper, CblasTrans, + 0, INVALID, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ssyrk( CblasColMajor, CblasLower, CblasNoTrans, + 0, INVALID, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ssyrk( CblasColMajor, CblasLower, CblasTrans, + 0, INVALID, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ssyrk( CblasRowMajor, CblasUpper, CblasNoTrans, + 0, 2, ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ssyrk( CblasRowMajor, CblasUpper, CblasTrans, + 2, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ssyrk( CblasRowMajor, CblasLower, CblasNoTrans, + 0, 2, ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ssyrk( CblasRowMajor, CblasLower, CblasTrans, + 2, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ssyrk( CblasColMajor, CblasUpper, CblasNoTrans, + 2, 0, ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ssyrk( CblasColMajor, CblasUpper, CblasTrans, + 0, 2, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ssyrk( CblasColMajor, CblasLower, CblasNoTrans, + 2, 0, ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ssyrk( CblasColMajor, CblasLower, CblasTrans, + 0, 2, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_ssyrk( CblasRowMajor, CblasUpper, CblasNoTrans, + 2, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_ssyrk( CblasRowMajor, CblasUpper, CblasTrans, + 2, 0, ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_ssyrk( CblasRowMajor, CblasLower, CblasNoTrans, + 2, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_ssyrk( CblasRowMajor, CblasLower, CblasTrans, + 2, 0, ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_ssyrk( CblasColMajor, CblasUpper, CblasNoTrans, + 2, 0, ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_ssyrk( CblasColMajor, CblasUpper, CblasTrans, + 2, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_ssyrk( CblasColMajor, CblasLower, CblasNoTrans, + 2, 0, ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_ssyrk( CblasColMajor, CblasLower, CblasTrans, + 2, 0, ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_ssyr2k" ,12)==0) { + cblas_rout = "cblas_ssyr2k" ; + + cblas_info = 1; + cblas_ssyr2k( INVALID, CblasUpper, CblasNoTrans, + 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, INVALID, CblasNoTrans, + 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasUpper, INVALID, + 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, 
CblasUpper, CblasNoTrans, + INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasUpper, CblasTrans, + INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasLower, CblasNoTrans, + INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasLower, CblasTrans, + INVALID, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasUpper, CblasNoTrans, + 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasUpper, CblasTrans, + 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasLower, CblasNoTrans, + 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasLower, CblasTrans, + 0, INVALID, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ssyr2k( CblasRowMajor, CblasUpper, CblasNoTrans, + 0, 2, ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ssyr2k( CblasRowMajor, CblasUpper, CblasTrans, + 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ssyr2k( CblasRowMajor, CblasLower, CblasNoTrans, + 0, 2, ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ssyr2k( CblasRowMajor, CblasLower, CblasTrans, + 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasUpper, CblasNoTrans, + 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasUpper, CblasTrans, + 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasLower, CblasNoTrans, + 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasLower, CblasTrans, + 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ssyr2k( CblasRowMajor, CblasUpper, CblasNoTrans, + 0, 2, ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ssyr2k( CblasRowMajor, CblasUpper, CblasTrans, + 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ssyr2k( CblasRowMajor, CblasLower, CblasNoTrans, + 0, 2, ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ssyr2k( CblasRowMajor, CblasLower, CblasTrans, + 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasUpper, CblasNoTrans, + 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasUpper, CblasTrans, + 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasLower, CblasNoTrans, + 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasLower, CblasTrans, + 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + 
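+ /* Parameter 13 is ldc in cblas_ssyr2k: the checks below pass N = 2 with
+  * ldc = 1, which the error exit must reject for both storage orders. */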
cblas_info = 13; RowMajorStrg = TRUE; + cblas_ssyr2k( CblasRowMajor, CblasUpper, CblasNoTrans, + 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_ssyr2k( CblasRowMajor, CblasUpper, CblasTrans, + 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_ssyr2k( CblasRowMajor, CblasLower, CblasNoTrans, + 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_ssyr2k( CblasRowMajor, CblasLower, CblasTrans, + 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasUpper, CblasNoTrans, + 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasUpper, CblasTrans, + 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasLower, CblasNoTrans, + 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_ssyr2k( CblasColMajor, CblasLower, CblasTrans, + 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + } + if (cblas_ok == TRUE ) + printf(" %-12s PASSED THE TESTS OF ERROR-EXITS\n", cblas_rout); + else + printf("***** %s FAILED THE TESTS OF ERROR-EXITS *******\n",cblas_rout); +} diff --git a/ctest/c_sblas1.c b/ctest/c_sblas1.c new file mode 100644 index 0000000000..5ccb2d3e18 --- /dev/null +++ b/ctest/c_sblas1.c @@ -0,0 +1,83 @@ +/* + * c_sblas1.c + * + * The program is a C wrapper for scblat1. + * + * Written by Keita Teranishi. 2/11/1998 + * + */ +#include "common.h" +#include "cblas_test.h" + +float F77_sasum(blasint *N, float *X, blasint *incX) +{ + return cblas_sasum(*N, X, *incX); +} + +void F77_saxpy(blasint *N, const float *alpha, const float *X, + blasint *incX, float *Y, blasint *incY) +{ + cblas_saxpy(*N, *alpha, X, *incX, Y, *incY); + return; +} + +float F77_scasum(blasint *N, float *X, blasint *incX) +{ + return cblas_scasum(*N, X, *incX); +} + +float F77_scnrm2(blasint *N, const float *X, blasint *incX) +{ + return cblas_scnrm2(*N, X, *incX); +} + +void F77_scopy(blasint *N, const float *X, blasint *incX, + float *Y, blasint *incY) +{ + cblas_scopy(*N, X, *incX, Y, *incY); + return; +} + +float F77_sdot(blasint *N, const float *X, blasint *incX, + const float *Y, blasint *incY) +{ + return cblas_sdot(*N, X, *incX, Y, *incY); +} + +float F77_snrm2(blasint *N, const float *X, blasint *incX) +{ + return cblas_snrm2(*N, X, *incX); +} + +void F77_srotg( float *a, float *b, float *c, float *s) +{ + cblas_srotg(a,b,c,s); + return; +} + +void F77_srot( blasint *N, float *X, blasint *incX, float *Y, + blasint *incY, const float *c, const float *s) +{ + cblas_srot(*N,X,*incX,Y,*incY,*c,*s); + return; +} + +void F77_sscal(blasint *N, const float *alpha, float *X, + blasint *incX) +{ + cblas_sscal(*N, *alpha, X, *incX); + return; +} + +void F77_sswap( blasint *N, float *X, blasint *incX, + float *Y, blasint *incY) +{ + cblas_sswap(*N,X,*incX,Y,*incY); + return; +} + +int F77_isamax(blasint *N, const float *X, blasint *incX) +{ + if (*N < 1 || *incX < 1) return(0); + return (cblas_isamax(*N, X, *incX)+1); +} diff --git a/ctest/c_sblas2.c b/ctest/c_sblas2.c new file mode 100644 index 0000000000..3059525e40 --- /dev/null +++ b/ctest/c_sblas2.c @@ -0,0 +1,579 @@ +/* + * Written by D.P. Manley, Digital Equipment Corporation. + * Prefixed "C_" to BLAS routines and their declarations. + * + * Modified by T. H. 
Do, 1/23/98, SGI/CRAY Research. + */ +#include +#include "common.h" +#include "cblas_test.h" + +void F77_sgemv(int *order, char *transp, int *m, int *n, float *alpha, + float *a, int *lda, float *x, int *incx, float *beta, + float *y, int *incy ) { + + float *A; + int i,j,LDA; + enum CBLAS_TRANSPOSE trans; + + get_transpose_type(transp, &trans); + if (*order == TEST_ROW_MJR) { + LDA = *n+1; + A = ( float* )malloc( (*m)*LDA*sizeof( float ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) + A[ LDA*i+j ]=a[ (*lda)*j+i ]; + cblas_sgemv( CblasRowMajor, trans, + *m, *n, *alpha, A, LDA, x, *incx, *beta, y, *incy ); + free(A); + } + else if (*order == TEST_COL_MJR) + cblas_sgemv( CblasColMajor, trans, + *m, *n, *alpha, a, *lda, x, *incx, *beta, y, *incy ); + else + cblas_sgemv( UNDEFINED, trans, + *m, *n, *alpha, a, *lda, x, *incx, *beta, y, *incy ); +} + +void F77_sger(int *order, int *m, int *n, float *alpha, float *x, int *incx, + float *y, int *incy, float *a, int *lda ) { + + float *A; + int i,j,LDA; + + if (*order == TEST_ROW_MJR) { + LDA = *n+1; + A = ( float* )malloc( (*m)*LDA*sizeof( float ) ); + + for( i=0; i<*m; i++ ) { + for( j=0; j<*n; j++ ) + A[ LDA*i+j ]=a[ (*lda)*j+i ]; + } + + cblas_sger(CblasRowMajor, *m, *n, *alpha, x, *incx, y, *incy, A, LDA ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) + a[ (*lda)*j+i ]=A[ LDA*i+j ]; + free(A); + } + else + cblas_sger( CblasColMajor, *m, *n, *alpha, x, *incx, y, *incy, a, *lda ); +} + +void F77_strmv(int *order, char *uplow, char *transp, char *diagn, + int *n, float *a, int *lda, float *x, int *incx) { + float *A; + int i,j,LDA; + enum CBLAS_TRANSPOSE trans; + enum CBLAS_UPLO uplo; + enum CBLAS_DIAG diag; + + get_transpose_type(transp,&trans); + get_uplo_type(uplow,&uplo); + get_diag_type(diagn,&diag); + + if (*order == TEST_ROW_MJR) { + LDA = *n+1; + A = ( float* )malloc( (*n)*LDA*sizeof( float ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[ LDA*i+j ]=a[ (*lda)*j+i ]; + cblas_strmv(CblasRowMajor, uplo, trans, diag, *n, A, LDA, x, *incx); + free(A); + } + else if (*order == TEST_COL_MJR) + cblas_strmv(CblasColMajor, uplo, trans, diag, *n, a, *lda, x, *incx); + else { + cblas_strmv(UNDEFINED, uplo, trans, diag, *n, a, *lda, x, *incx); + } +} + +void F77_strsv(int *order, char *uplow, char *transp, char *diagn, + int *n, float *a, int *lda, float *x, int *incx ) { + float *A; + int i,j,LDA; + enum CBLAS_TRANSPOSE trans; + enum CBLAS_UPLO uplo; + enum CBLAS_DIAG diag; + + get_transpose_type(transp,&trans); + get_uplo_type(uplow,&uplo); + get_diag_type(diagn,&diag); + + if (*order == TEST_ROW_MJR) { + LDA = *n+1; + A = ( float* )malloc( (*n)*LDA*sizeof( float ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[ LDA*i+j ]=a[ (*lda)*j+i ]; + cblas_strsv(CblasRowMajor, uplo, trans, diag, *n, A, LDA, x, *incx ); + free(A); + } + else + cblas_strsv(CblasColMajor, uplo, trans, diag, *n, a, *lda, x, *incx ); +} +void F77_ssymv(int *order, char *uplow, int *n, float *alpha, float *a, + int *lda, float *x, int *incx, float *beta, float *y, + int *incy) { + float *A; + int i,j,LDA; + enum CBLAS_UPLO uplo; + + get_uplo_type(uplow,&uplo); + + if (*order == TEST_ROW_MJR) { + LDA = *n+1; + A = ( float* )malloc( (*n)*LDA*sizeof( float ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[ LDA*i+j ]=a[ (*lda)*j+i ]; + cblas_ssymv(CblasRowMajor, uplo, *n, *alpha, A, LDA, x, *incx, + *beta, y, *incy ); + free(A); + } + else + cblas_ssymv(CblasColMajor, uplo, *n, *alpha, a, *lda, x, *incx, + *beta, y, *incy ); +} + +void F77_ssyr(int 
*order, char *uplow, int *n, float *alpha, float *x, + int *incx, float *a, int *lda) { + float *A; + int i,j,LDA; + enum CBLAS_UPLO uplo; + + get_uplo_type(uplow,&uplo); + + if (*order == TEST_ROW_MJR) { + LDA = *n+1; + A = ( float* )malloc( (*n)*LDA*sizeof( float ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[ LDA*i+j ]=a[ (*lda)*j+i ]; + cblas_ssyr(CblasRowMajor, uplo, *n, *alpha, x, *incx, A, LDA); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + a[ (*lda)*j+i ]=A[ LDA*i+j ]; + free(A); + } + else + cblas_ssyr(CblasColMajor, uplo, *n, *alpha, x, *incx, a, *lda); +} + +void F77_ssyr2(int *order, char *uplow, int *n, float *alpha, float *x, + int *incx, float *y, int *incy, float *a, int *lda) { + float *A; + int i,j,LDA; + enum CBLAS_UPLO uplo; + + get_uplo_type(uplow,&uplo); + + if (*order == TEST_ROW_MJR) { + LDA = *n+1; + A = ( float* )malloc( (*n)*LDA*sizeof( float ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[ LDA*i+j ]=a[ (*lda)*j+i ]; + cblas_ssyr2(CblasRowMajor, uplo, *n, *alpha, x, *incx, y, *incy, A, LDA); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + a[ (*lda)*j+i ]=A[ LDA*i+j ]; + free(A); + } + else + cblas_ssyr2(CblasColMajor, uplo, *n, *alpha, x, *incx, y, *incy, a, *lda); +} + +void F77_sgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku, + float *alpha, float *a, int *lda, float *x, int *incx, + float *beta, float *y, int *incy ) { + + float *A; + int i,irow,j,jcol,LDA; + enum CBLAS_TRANSPOSE trans; + + get_transpose_type(transp, &trans); + + if (*order == TEST_ROW_MJR) { + LDA = *ku+*kl+2; + A = ( float* )malloc( (*n+*kl)*LDA*sizeof( float ) ); + for( i=0; i<*ku; i++ ){ + irow=*ku+*kl-i; + jcol=(*ku)-i; + for( j=jcol; j<*n; j++ ) + A[ LDA*(j-jcol)+irow ]=a[ (*lda)*j+i ]; + } + i=*ku; + irow=*ku+*kl-i; + for( j=0; j<*n; j++ ) + A[ LDA*j+irow ]=a[ (*lda)*j+i ]; + for( i=*ku+1; i<*ku+*kl+1; i++ ){ + irow=*ku+*kl-i; + jcol=i-(*ku); + for( j=jcol; j<(*n+*kl); j++ ) + A[ LDA*j+irow ]=a[ (*lda)*(j-jcol)+i ]; + } + cblas_sgbmv( CblasRowMajor, trans, *m, *n, *kl, *ku, *alpha, + A, LDA, x, *incx, *beta, y, *incy ); + free(A); + } + else + cblas_sgbmv( CblasColMajor, trans, *m, *n, *kl, *ku, *alpha, + a, *lda, x, *incx, *beta, y, *incy ); +} + +void F77_stbmv(int *order, char *uplow, char *transp, char *diagn, + int *n, int *k, float *a, int *lda, float *x, int *incx) { + float *A; + int irow, jcol, i, j, LDA; + enum CBLAS_TRANSPOSE trans; + enum CBLAS_UPLO uplo; + enum CBLAS_DIAG diag; + + get_transpose_type(transp,&trans); + get_uplo_type(uplow,&uplo); + get_diag_type(diagn,&diag); + + if (*order == TEST_ROW_MJR) { + LDA = *k+1; + A = ( float* )malloc( (*n+*k)*LDA*sizeof( float ) ); + if (uplo == CblasUpper) { + for( i=0; i<*k; i++ ){ + irow=*k-i; + jcol=(*k)-i; + for( j=jcol; j<*n; j++ ) + A[ LDA*(j-jcol)+irow ]=a[ (*lda)*j+i ]; + } + i=*k; + irow=*k-i; + for( j=0; j<*n; j++ ) + A[ LDA*j+irow ]=a[ (*lda)*j+i ]; + } + else { + i=0; + irow=*k-i; + for( j=0; j<*n; j++ ) + A[ LDA*j+irow ]=a[ (*lda)*j+i ]; + for( i=1; i<*k+1; i++ ){ + irow=*k-i; + jcol=i; + for( j=jcol; j<(*n+*k); j++ ) + A[ LDA*j+irow ]=a[ (*lda)*(j-jcol)+i ]; + } + } + cblas_stbmv(CblasRowMajor, uplo, trans, diag, *n, *k, A, LDA, x, *incx); + free(A); + } + else + cblas_stbmv(CblasColMajor, uplo, trans, diag, *n, *k, a, *lda, x, *incx); +} + +void F77_stbsv(int *order, char *uplow, char *transp, char *diagn, + int *n, int *k, float *a, int *lda, float *x, int *incx) { + float *A; + int irow, jcol, i, j, LDA; + enum CBLAS_TRANSPOSE trans; + enum CBLAS_UPLO uplo; + enum 
CBLAS_DIAG diag; + + get_transpose_type(transp,&trans); + get_uplo_type(uplow,&uplo); + get_diag_type(diagn,&diag); + + if (*order == TEST_ROW_MJR) { + LDA = *k+1; + A = ( float* )malloc( (*n+*k)*LDA*sizeof( float ) ); + if (uplo == CblasUpper) { + for( i=0; i<*k; i++ ){ + irow=*k-i; + jcol=(*k)-i; + for( j=jcol; j<*n; j++ ) + A[ LDA*(j-jcol)+irow ]=a[ (*lda)*j+i ]; + } + i=*k; + irow=*k-i; + for( j=0; j<*n; j++ ) + A[ LDA*j+irow ]=a[ (*lda)*j+i ]; + } + else { + i=0; + irow=*k-i; + for( j=0; j<*n; j++ ) + A[ LDA*j+irow ]=a[ (*lda)*j+i ]; + for( i=1; i<*k+1; i++ ){ + irow=*k-i; + jcol=i; + for( j=jcol; j<(*n+*k); j++ ) + A[ LDA*j+irow ]=a[ (*lda)*(j-jcol)+i ]; + } + } + cblas_stbsv(CblasRowMajor, uplo, trans, diag, *n, *k, A, LDA, x, *incx); + free(A); + } + else + cblas_stbsv(CblasColMajor, uplo, trans, diag, *n, *k, a, *lda, x, *incx); +} + +void F77_ssbmv(int *order, char *uplow, int *n, int *k, float *alpha, + float *a, int *lda, float *x, int *incx, float *beta, + float *y, int *incy) { + float *A; + int i,j,irow,jcol,LDA; + enum CBLAS_UPLO uplo; + + get_uplo_type(uplow,&uplo); + + if (*order == TEST_ROW_MJR) { + LDA = *k+1; + A = ( float* )malloc( (*n+*k)*LDA*sizeof( float ) ); + if (uplo == CblasUpper) { + for( i=0; i<*k; i++ ){ + irow=*k-i; + jcol=(*k)-i; + for( j=jcol; j<*n; j++ ) + A[ LDA*(j-jcol)+irow ]=a[ (*lda)*j+i ]; + } + i=*k; + irow=*k-i; + for( j=0; j<*n; j++ ) + A[ LDA*j+irow ]=a[ (*lda)*j+i ]; + } + else { + i=0; + irow=*k-i; + for( j=0; j<*n; j++ ) + A[ LDA*j+irow ]=a[ (*lda)*j+i ]; + for( i=1; i<*k+1; i++ ){ + irow=*k-i; + jcol=i; + for( j=jcol; j<(*n+*k); j++ ) + A[ LDA*j+irow ]=a[ (*lda)*(j-jcol)+i ]; + } + } + cblas_ssbmv(CblasRowMajor, uplo, *n, *k, *alpha, A, LDA, x, *incx, + *beta, y, *incy ); + free(A); + } + else + cblas_ssbmv(CblasColMajor, uplo, *n, *k, *alpha, a, *lda, x, *incx, + *beta, y, *incy ); +} + +void F77_sspmv(int *order, char *uplow, int *n, float *alpha, float *ap, + float *x, int *incx, float *beta, float *y, int *incy) { + float *A,*AP; + int i,j,k,LDA; + enum CBLAS_UPLO uplo; + + get_uplo_type(uplow,&uplo); + + if (*order == TEST_ROW_MJR) { + LDA = *n; + A = ( float* )malloc( LDA*LDA*sizeof( float ) ); + AP = ( float* )malloc( (((LDA+1)*LDA)/2)*sizeof( float ) ); + if (uplo == CblasUpper) { + for( j=0, k=0; j<*n; j++ ) + for( i=0; i +#include +#include "common.h" +#include "cblas_test.h" + +void F77_sgemm(int *order, char *transpa, char *transpb, int *m, int *n, + int *k, float *alpha, float *a, int *lda, float *b, int *ldb, + float *beta, float *c, int *ldc ) { + + float *A, *B, *C; + int i,j,LDA, LDB, LDC; + enum CBLAS_TRANSPOSE transa, transb; + + get_transpose_type(transpa, &transa); + get_transpose_type(transpb, &transb); + + if (*order == TEST_ROW_MJR) { + if (transa == CblasNoTrans) { + LDA = *k+1; + A = (float *)malloc( (*m)*LDA*sizeof( float ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*k; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + else { + LDA = *m+1; + A = ( float* )malloc( LDA*(*k)*sizeof( float ) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*m; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + if (transb == CblasNoTrans) { + LDB = *n+1; + B = ( float* )malloc( (*k)*LDB*sizeof( float ) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ) + B[i*LDB+j]=b[j*(*ldb)+i]; + } + else { + LDB = *k+1; + B = ( float* )malloc( LDB*(*n)*sizeof( float ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) + B[i*LDB+j]=b[j*(*ldb)+i]; + } + LDC = *n+1; + C = ( float* )malloc( (*m)*LDC*sizeof( float ) ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + 
C[i*LDC+j]=c[j*(*ldc)+i]; + cblas_sgemm( CblasRowMajor, transa, transb, *m, *n, *k, *alpha, A, LDA, + B, LDB, *beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + c[j*(*ldc)+i]=C[i*LDC+j]; + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_sgemm( CblasColMajor, transa, transb, *m, *n, *k, *alpha, a, *lda, + b, *ldb, *beta, c, *ldc ); + else + cblas_sgemm( UNDEFINED, transa, transb, *m, *n, *k, *alpha, a, *lda, + b, *ldb, *beta, c, *ldc ); +} +void F77_ssymm(int *order, char *rtlf, char *uplow, int *m, int *n, + float *alpha, float *a, int *lda, float *b, int *ldb, + float *beta, float *c, int *ldc ) { + + float *A, *B, *C; + int i,j,LDA, LDB, LDC; + enum CBLAS_UPLO uplo; + enum CBLAS_SIDE side; + + get_uplo_type(uplow,&uplo); + get_side_type(rtlf,&side); + + if (*order == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A = ( float* )malloc( (*m)*LDA*sizeof( float ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + else{ + LDA = *n+1; + A = ( float* )malloc( (*n)*LDA*sizeof( float ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + LDB = *n+1; + B = ( float* )malloc( (*m)*LDB*sizeof( float ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) + B[i*LDB+j]=b[j*(*ldb)+i]; + LDC = *n+1; + C = ( float* )malloc( (*m)*LDC*sizeof( float ) ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + C[i*LDC+j]=c[j*(*ldc)+i]; + cblas_ssymm( CblasRowMajor, side, uplo, *m, *n, *alpha, A, LDA, B, LDB, + *beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + c[j*(*ldc)+i]=C[i*LDC+j]; + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_ssymm( CblasColMajor, side, uplo, *m, *n, *alpha, a, *lda, b, *ldb, + *beta, c, *ldc ); + else + cblas_ssymm( UNDEFINED, side, uplo, *m, *n, *alpha, a, *lda, b, *ldb, + *beta, c, *ldc ); +} + +void F77_ssyrk(int *order, char *uplow, char *transp, int *n, int *k, + float *alpha, float *a, int *lda, + float *beta, float *c, int *ldc ) { + + int i,j,LDA,LDC; + float *A, *C; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + + if (*order == TEST_ROW_MJR) { + if (trans == CblasNoTrans) { + LDA = *k+1; + A = ( float* )malloc( (*n)*LDA*sizeof( float ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + else{ + LDA = *n+1; + A = ( float* )malloc( (*k)*LDA*sizeof( float ) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + LDC = *n+1; + C = ( float* )malloc( (*n)*LDC*sizeof( float ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + C[i*LDC+j]=c[j*(*ldc)+i]; + cblas_ssyrk(CblasRowMajor, uplo, trans, *n, *k, *alpha, A, LDA, *beta, + C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*n; i++ ) + c[j*(*ldc)+i]=C[i*LDC+j]; + free(A); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_ssyrk(CblasColMajor, uplo, trans, *n, *k, *alpha, a, *lda, *beta, + c, *ldc ); + else + cblas_ssyrk(UNDEFINED, uplo, trans, *n, *k, *alpha, a, *lda, *beta, + c, *ldc ); +} + +void F77_ssyr2k(int *order, char *uplow, char *transp, int *n, int *k, + float *alpha, float *a, int *lda, float *b, int *ldb, + float *beta, float *c, int *ldc ) { + int i,j,LDA,LDB,LDC; + float *A, *B, *C; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + + if (*order == TEST_ROW_MJR) { + if (trans == CblasNoTrans) { + LDA = *k+1; + LDB = *k+1; + A = ( float* 
)malloc( (*n)*LDA*sizeof( float ) ); + B = ( float* )malloc( (*n)*LDB*sizeof( float ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j]=a[j*(*lda)+i]; + B[i*LDB+j]=b[j*(*ldb)+i]; + } + } + else { + LDA = *n+1; + LDB = *n+1; + A = ( float* )malloc( LDA*(*k)*sizeof( float ) ); + B = ( float* )malloc( LDB*(*k)*sizeof( float ) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ){ + A[i*LDA+j]=a[j*(*lda)+i]; + B[i*LDB+j]=b[j*(*ldb)+i]; + } + } + LDC = *n+1; + C = ( float* )malloc( (*n)*LDC*sizeof( float ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + C[i*LDC+j]=c[j*(*ldc)+i]; + cblas_ssyr2k(CblasRowMajor, uplo, trans, *n, *k, *alpha, A, LDA, + B, LDB, *beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*n; i++ ) + c[j*(*ldc)+i]=C[i*LDC+j]; + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_ssyr2k(CblasColMajor, uplo, trans, *n, *k, *alpha, a, *lda, + b, *ldb, *beta, c, *ldc ); + else + cblas_ssyr2k(UNDEFINED, uplo, trans, *n, *k, *alpha, a, *lda, + b, *ldb, *beta, c, *ldc ); +} +void F77_strmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, + int *m, int *n, float *alpha, float *a, int *lda, float *b, + int *ldb) { + int i,j,LDA,LDB; + float *A, *B; + enum CBLAS_SIDE side; + enum CBLAS_DIAG diag; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + get_diag_type(diagn,&diag); + get_side_type(rtlf,&side); + + if (*order == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A = ( float* )malloc( (*m)*LDA*sizeof( float ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + else{ + LDA = *n+1; + A = ( float* )malloc( (*n)*LDA*sizeof( float ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + LDB = *n+1; + B = ( float* )malloc( (*m)*LDB*sizeof( float ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) + B[i*LDB+j]=b[j*(*ldb)+i]; + cblas_strmm(CblasRowMajor, side, uplo, trans, diag, *m, *n, *alpha, + A, LDA, B, LDB ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + b[j*(*ldb)+i]=B[i*LDB+j]; + free(A); + free(B); + } + else if (*order == TEST_COL_MJR) + cblas_strmm(CblasColMajor, side, uplo, trans, diag, *m, *n, *alpha, + a, *lda, b, *ldb); + else + cblas_strmm(UNDEFINED, side, uplo, trans, diag, *m, *n, *alpha, + a, *lda, b, *ldb); +} + +void F77_strsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, + int *m, int *n, float *alpha, float *a, int *lda, float *b, + int *ldb) { + int i,j,LDA,LDB; + float *A, *B; + enum CBLAS_SIDE side; + enum CBLAS_DIAG diag; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + get_diag_type(diagn,&diag); + get_side_type(rtlf,&side); + + if (*order == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A = ( float* )malloc( (*m)*LDA*sizeof( float ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + else{ + LDA = *n+1; + A = ( float* )malloc( (*n)*LDA*sizeof( float ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + LDB = *n+1; + B = ( float* )malloc( (*m)*LDB*sizeof( float ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) + B[i*LDB+j]=b[j*(*ldb)+i]; + cblas_strsm(CblasRowMajor, side, uplo, trans, diag, *m, *n, *alpha, + A, LDA, B, LDB ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + b[j*(*ldb)+i]=B[i*LDB+j]; + free(A); + free(B); + } + else if (*order == TEST_COL_MJR) + 
cblas_strsm(CblasColMajor, side, uplo, trans, diag, *m, *n, *alpha, + a, *lda, b, *ldb); + else + cblas_strsm(UNDEFINED, side, uplo, trans, diag, *m, *n, *alpha, + a, *lda, b, *ldb); +} diff --git a/ctest/c_sblat1.f b/ctest/c_sblat1.f new file mode 100644 index 0000000000..de2b0380b8 --- /dev/null +++ b/ctest/c_sblat1.f @@ -0,0 +1,728 @@ + PROGRAM SCBLAT1 +* Test program for the REAL Level 1 CBLAS. +* Based upon the original CBLAS test routine together with: +* F06EAF Example Program Text +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + REAL SFAC + INTEGER IC +* .. External Subroutines .. + EXTERNAL CHECK0, CHECK1, CHECK2, CHECK3, HEADER +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA SFAC/9.765625E-4/ +* .. Executable Statements .. + WRITE (NOUT,99999) + DO 20 IC = 1, 10 + ICASE = IC + CALL HEADER +* +* .. Initialize PASS, INCX, INCY, and MODE for a new case. .. +* .. the value 9999 for INCX, INCY or MODE will appear in the .. +* .. detailed output, if any, for cases that do not involve .. +* .. these parameters .. +* + PASS = .TRUE. + INCX = 9999 + INCY = 9999 + MODE = 9999 + IF (ICASE.EQ.3) THEN + CALL CHECK0(SFAC) + ELSE IF (ICASE.EQ.7 .OR. ICASE.EQ.8 .OR. ICASE.EQ.9 .OR. + + ICASE.EQ.10) THEN + CALL CHECK1(SFAC) + ELSE IF (ICASE.EQ.1 .OR. ICASE.EQ.2 .OR. ICASE.EQ.5 .OR. + + ICASE.EQ.6) THEN + CALL CHECK2(SFAC) + ELSE IF (ICASE.EQ.4) THEN + CALL CHECK3(SFAC) + END IF +* -- Print + IF (PASS) WRITE (NOUT,99998) + 20 CONTINUE + STOP +* +99999 FORMAT (' Real CBLAS Test Program Results',/1X) +99998 FORMAT (' ----- PASS -----') + END + SUBROUTINE HEADER +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Arrays .. + CHARACTER*15 L(10) +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA L(1)/'CBLAS_SDOT '/ + DATA L(2)/'CBLAS_SAXPY '/ + DATA L(3)/'CBLAS_SROTG '/ + DATA L(4)/'CBLAS_SROT '/ + DATA L(5)/'CBLAS_SCOPY '/ + DATA L(6)/'CBLAS_SSWAP '/ + DATA L(7)/'CBLAS_SNRM2 '/ + DATA L(8)/'CBLAS_SASUM '/ + DATA L(9)/'CBLAS_SSCAL '/ + DATA L(10)/'CBLAS_ISAMAX'/ +* .. Executable Statements .. + WRITE (NOUT,99999) ICASE, L(ICASE) + RETURN +* +99999 FORMAT (/' Test of subprogram number',I3,9X,A15) + END + SUBROUTINE CHECK0(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + REAL SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + REAL SA, SB, SC, SS + INTEGER K +* .. Local Arrays .. + REAL DA1(8), DATRUE(8), DB1(8), DBTRUE(8), DC1(8), + + DS1(8) +* .. External Subroutines .. + EXTERNAL SROTGTEST, STEST1 +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA DA1/0.3E0, 0.4E0, -0.3E0, -0.4E0, -0.3E0, 0.0E0, + + 0.0E0, 1.0E0/ + DATA DB1/0.4E0, 0.3E0, 0.4E0, 0.3E0, -0.4E0, 0.0E0, + + 1.0E0, 0.0E0/ + DATA DC1/0.6E0, 0.8E0, -0.6E0, 0.8E0, 0.6E0, 1.0E0, + + 0.0E0, 1.0E0/ + DATA DS1/0.8E0, 0.6E0, 0.8E0, -0.6E0, 0.8E0, 0.0E0, + + 1.0E0, 0.0E0/ + DATA DATRUE/0.5E0, 0.5E0, 0.5E0, -0.5E0, -0.5E0, + + 0.0E0, 1.0E0, 1.0E0/ + DATA DBTRUE/0.0E0, 0.6E0, 0.0E0, -0.6E0, 0.0E0, + + 0.0E0, 1.0E0, 0.0E0/ +* .. Executable Statements .. 
+* +* Compute true values which cannot be prestored +* in decimal notation +* + DBTRUE(1) = 1.0E0/0.6E0 + DBTRUE(3) = -1.0E0/0.6E0 + DBTRUE(5) = 1.0E0/0.6E0 +* + DO 20 K = 1, 8 +* .. Set N=K for identification in output if any .. + N = K + IF (ICASE.EQ.3) THEN +* .. SROTGTEST .. + IF (K.GT.8) GO TO 40 + SA = DA1(K) + SB = DB1(K) + CALL SROTGTEST(SA,SB,SC,SS) + CALL STEST1(SA,DATRUE(K),DATRUE(K),SFAC) + CALL STEST1(SB,DBTRUE(K),DBTRUE(K),SFAC) + CALL STEST1(SC,DC1(K),DC1(K),SFAC) + CALL STEST1(SS,DS1(K),DS1(K),SFAC) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK0' + STOP + END IF + 20 CONTINUE + 40 RETURN + END + SUBROUTINE CHECK1(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + REAL SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + INTEGER I, LEN, NP1 +* .. Local Arrays .. + REAL DTRUE1(5), DTRUE3(5), DTRUE5(8,5,2), DV(8,5,2), + + SA(10), STEMP(1), STRUE(8), SX(8) + INTEGER ITRUE2(5) +* .. External Functions .. + REAL SASUMTEST, SNRM2TEST + INTEGER ISAMAXTEST + EXTERNAL SASUMTEST, SNRM2TEST, ISAMAXTEST +* .. External Subroutines .. + EXTERNAL ITEST1, SSCALTEST, STEST, STEST1 +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA SA/0.3E0, -1.0E0, 0.0E0, 1.0E0, 0.3E0, 0.3E0, + + 0.3E0, 0.3E0, 0.3E0, 0.3E0/ + DATA DV/0.1E0, 2.0E0, 2.0E0, 2.0E0, 2.0E0, 2.0E0, + + 2.0E0, 2.0E0, 0.3E0, 3.0E0, 3.0E0, 3.0E0, 3.0E0, + + 3.0E0, 3.0E0, 3.0E0, 0.3E0, -0.4E0, 4.0E0, + + 4.0E0, 4.0E0, 4.0E0, 4.0E0, 4.0E0, 0.2E0, + + -0.6E0, 0.3E0, 5.0E0, 5.0E0, 5.0E0, 5.0E0, + + 5.0E0, 0.1E0, -0.3E0, 0.5E0, -0.1E0, 6.0E0, + + 6.0E0, 6.0E0, 6.0E0, 0.1E0, 8.0E0, 8.0E0, 8.0E0, + + 8.0E0, 8.0E0, 8.0E0, 8.0E0, 0.3E0, 9.0E0, 9.0E0, + + 9.0E0, 9.0E0, 9.0E0, 9.0E0, 9.0E0, 0.3E0, 2.0E0, + + -0.4E0, 2.0E0, 2.0E0, 2.0E0, 2.0E0, 2.0E0, + + 0.2E0, 3.0E0, -0.6E0, 5.0E0, 0.3E0, 2.0E0, + + 2.0E0, 2.0E0, 0.1E0, 4.0E0, -0.3E0, 6.0E0, + + -0.5E0, 7.0E0, -0.1E0, 3.0E0/ + DATA DTRUE1/0.0E0, 0.3E0, 0.5E0, 0.7E0, 0.6E0/ + DATA DTRUE3/0.0E0, 0.3E0, 0.7E0, 1.1E0, 1.0E0/ + DATA DTRUE5/0.10E0, 2.0E0, 2.0E0, 2.0E0, 2.0E0, + + 2.0E0, 2.0E0, 2.0E0, -0.3E0, 3.0E0, 3.0E0, + + 3.0E0, 3.0E0, 3.0E0, 3.0E0, 3.0E0, 0.0E0, 0.0E0, + + 4.0E0, 4.0E0, 4.0E0, 4.0E0, 4.0E0, 4.0E0, + + 0.20E0, -0.60E0, 0.30E0, 5.0E0, 5.0E0, 5.0E0, + + 5.0E0, 5.0E0, 0.03E0, -0.09E0, 0.15E0, -0.03E0, + + 6.0E0, 6.0E0, 6.0E0, 6.0E0, 0.10E0, 8.0E0, + + 8.0E0, 8.0E0, 8.0E0, 8.0E0, 8.0E0, 8.0E0, + + 0.09E0, 9.0E0, 9.0E0, 9.0E0, 9.0E0, 9.0E0, + + 9.0E0, 9.0E0, 0.09E0, 2.0E0, -0.12E0, 2.0E0, + + 2.0E0, 2.0E0, 2.0E0, 2.0E0, 0.06E0, 3.0E0, + + -0.18E0, 5.0E0, 0.09E0, 2.0E0, 2.0E0, 2.0E0, + + 0.03E0, 4.0E0, -0.09E0, 6.0E0, -0.15E0, 7.0E0, + + -0.03E0, 3.0E0/ + DATA ITRUE2/0, 1, 2, 2, 3/ +* .. Executable Statements .. + DO 80 INCX = 1, 2 + DO 60 NP1 = 1, 5 + N = NP1 - 1 + LEN = 2*MAX(N,1) +* .. Set vector arguments .. + DO 20 I = 1, LEN + SX(I) = DV(I,NP1,INCX) + 20 CONTINUE +* + IF (ICASE.EQ.7) THEN +* .. SNRM2TEST .. + STEMP(1) = DTRUE1(NP1) + CALL STEST1(SNRM2TEST(N,SX,INCX),STEMP,STEMP,SFAC) + ELSE IF (ICASE.EQ.8) THEN +* .. SASUMTEST .. + STEMP(1) = DTRUE3(NP1) + CALL STEST1(SASUMTEST(N,SX,INCX),STEMP,STEMP,SFAC) + ELSE IF (ICASE.EQ.9) THEN +* .. SSCALTEST .. + CALL SSCALTEST(N,SA((INCX-1)*5+NP1),SX,INCX) + DO 40 I = 1, LEN + STRUE(I) = DTRUE5(I,NP1,INCX) + 40 CONTINUE + CALL STEST(LEN,SX,STRUE,STRUE,SFAC) + ELSE IF (ICASE.EQ.10) THEN +* .. ISAMAXTEST .. 
+ CALL ITEST1(ISAMAXTEST(N,SX,INCX),ITRUE2(NP1)) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' + STOP + END IF + 60 CONTINUE + 80 CONTINUE + RETURN + END + SUBROUTINE CHECK2(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + REAL SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + REAL SA + INTEGER I, J, KI, KN, KSIZE, LENX, LENY, MX, MY +* .. Local Arrays .. + REAL DT10X(7,4,4), DT10Y(7,4,4), DT7(4,4), + + DT8(7,4,4), DX1(7), + + DY1(7), SSIZE1(4), SSIZE2(14,2), STX(7), STY(7), + + SX(7), SY(7) + INTEGER INCXS(4), INCYS(4), LENS(4,2), NS(4) +* .. External Functions .. + REAL SDOTTEST + EXTERNAL SDOTTEST +* .. External Subroutines .. + EXTERNAL SAXPYTEST, SCOPYTEST, SSWAPTEST, STEST, STEST1 +* .. Intrinsic Functions .. + INTRINSIC ABS, MIN +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA SA/0.3E0/ + DATA INCXS/1, 2, -2, -1/ + DATA INCYS/1, -2, 1, -2/ + DATA LENS/1, 1, 2, 4, 1, 1, 3, 7/ + DATA NS/0, 1, 2, 4/ + DATA DX1/0.6E0, 0.1E0, -0.5E0, 0.8E0, 0.9E0, -0.3E0, + + -0.4E0/ + DATA DY1/0.5E0, -0.9E0, 0.3E0, 0.7E0, -0.6E0, 0.2E0, + + 0.8E0/ + DATA DT7/0.0E0, 0.30E0, 0.21E0, 0.62E0, 0.0E0, + + 0.30E0, -0.07E0, 0.85E0, 0.0E0, 0.30E0, -0.79E0, + + -0.74E0, 0.0E0, 0.30E0, 0.33E0, 1.27E0/ + DATA DT8/0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.68E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.68E0, -0.87E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.68E0, -0.87E0, 0.15E0, + + 0.94E0, 0.0E0, 0.0E0, 0.0E0, 0.5E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.68E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.35E0, -0.9E0, 0.48E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.38E0, -0.9E0, 0.57E0, 0.7E0, -0.75E0, + + 0.2E0, 0.98E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.68E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.35E0, -0.72E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.38E0, + + -0.63E0, 0.15E0, 0.88E0, 0.0E0, 0.0E0, 0.0E0, + + 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.68E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.68E0, -0.9E0, 0.33E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.68E0, -0.9E0, 0.33E0, 0.7E0, + + -0.75E0, 0.2E0, 1.04E0/ + DATA DT10X/0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.5E0, -0.9E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.5E0, -0.9E0, 0.3E0, 0.7E0, + + 0.0E0, 0.0E0, 0.0E0, 0.6E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.3E0, 0.1E0, 0.5E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.8E0, 0.1E0, -0.6E0, + + 0.8E0, 0.3E0, -0.3E0, 0.5E0, 0.6E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.5E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, -0.9E0, + + 0.1E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.7E0, + + 0.1E0, 0.3E0, 0.8E0, -0.9E0, -0.3E0, 0.5E0, + + 0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.5E0, 0.3E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.5E0, 0.3E0, -0.6E0, 0.8E0, 0.0E0, 0.0E0, + + 0.0E0/ + DATA DT10Y/0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.6E0, 0.1E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.6E0, 0.1E0, -0.5E0, 0.8E0, 0.0E0, + + 0.0E0, 0.0E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, -0.5E0, -0.9E0, 0.6E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, -0.4E0, -0.9E0, 0.9E0, + + 
0.7E0, -0.5E0, 0.2E0, 0.6E0, 0.5E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.6E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, -0.5E0, + + 0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + -0.4E0, 0.9E0, -0.5E0, 0.6E0, 0.0E0, 0.0E0, + + 0.0E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.6E0, -0.9E0, 0.1E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.6E0, -0.9E0, 0.1E0, 0.7E0, + + -0.5E0, 0.2E0, 0.8E0/ + DATA SSIZE1/0.0E0, 0.3E0, 1.6E0, 3.2E0/ + DATA SSIZE2/0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, + + 1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, + + 1.17E0, 1.17E0, 1.17E0/ +* .. Executable Statements .. +* + DO 120 KI = 1, 4 + INCX = INCXS(KI) + INCY = INCYS(KI) + MX = ABS(INCX) + MY = ABS(INCY) +* + DO 100 KN = 1, 4 + N = NS(KN) + KSIZE = MIN(2,KN) + LENX = LENS(KN,MX) + LENY = LENS(KN,MY) +* .. Initialize all argument arrays .. + DO 20 I = 1, 7 + SX(I) = DX1(I) + SY(I) = DY1(I) + 20 CONTINUE +* + IF (ICASE.EQ.1) THEN +* .. SDOTTEST .. + CALL STEST1(SDOTTEST(N,SX,INCX,SY,INCY),DT7(KN,KI), + + SSIZE1(KN),SFAC) + ELSE IF (ICASE.EQ.2) THEN +* .. SAXPYTEST .. + CALL SAXPYTEST(N,SA,SX,INCX,SY,INCY) + DO 40 J = 1, LENY + STY(J) = DT8(J,KN,KI) + 40 CONTINUE + CALL STEST(LENY,SY,STY,SSIZE2(1,KSIZE),SFAC) + ELSE IF (ICASE.EQ.5) THEN +* .. SCOPYTEST .. + DO 60 I = 1, 7 + STY(I) = DT10Y(I,KN,KI) + 60 CONTINUE + CALL SCOPYTEST(N,SX,INCX,SY,INCY) + CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0E0) + ELSE IF (ICASE.EQ.6) THEN +* .. SSWAPTEST .. + CALL SSWAPTEST(N,SX,INCX,SY,INCY) + DO 80 I = 1, 7 + STX(I) = DT10X(I,KN,KI) + STY(I) = DT10Y(I,KN,KI) + 80 CONTINUE + CALL STEST(LENX,SX,STX,SSIZE2(1,1),1.0E0) + CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0E0) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' + STOP + END IF + 100 CONTINUE + 120 CONTINUE + RETURN + END + SUBROUTINE CHECK3(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + REAL SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + REAL SC, SS + INTEGER I, K, KI, KN, KSIZE, LENX, LENY, MX, MY +* .. Local Arrays .. + REAL COPYX(5), COPYY(5), DT9X(7,4,4), DT9Y(7,4,4), + + DX1(7), DY1(7), MWPC(11), MWPS(11), MWPSTX(5), + + MWPSTY(5), MWPTX(11,5), MWPTY(11,5), MWPX(5), + + MWPY(5), SSIZE2(14,2), STX(7), STY(7), SX(7), + + SY(7) + INTEGER INCXS(4), INCYS(4), LENS(4,2), MWPINX(11), + + MWPINY(11), MWPN(11), NS(4) +* .. External Subroutines .. + EXTERNAL SROTTEST, STEST +* .. Intrinsic Functions .. + INTRINSIC ABS, MIN +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. 
+ DATA INCXS/1, 2, -2, -1/ + DATA INCYS/1, -2, 1, -2/ + DATA LENS/1, 1, 2, 4, 1, 1, 3, 7/ + DATA NS/0, 1, 2, 4/ + DATA DX1/0.6E0, 0.1E0, -0.5E0, 0.8E0, 0.9E0, -0.3E0, + + -0.4E0/ + DATA DY1/0.5E0, -0.9E0, 0.3E0, 0.7E0, -0.6E0, 0.2E0, + + 0.8E0/ + DATA SC, SS/0.8E0, 0.6E0/ + DATA DT9X/0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.78E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.78E0, -0.46E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.78E0, -0.46E0, -0.22E0, + + 1.06E0, 0.0E0, 0.0E0, 0.0E0, 0.6E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.78E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.66E0, 0.1E0, -0.1E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.96E0, 0.1E0, -0.76E0, 0.8E0, 0.90E0, + + -0.3E0, -0.02E0, 0.6E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.78E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, -0.06E0, 0.1E0, + + -0.1E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.90E0, + + 0.1E0, -0.22E0, 0.8E0, 0.18E0, -0.3E0, -0.02E0, + + 0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.78E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.78E0, 0.26E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.78E0, 0.26E0, -0.76E0, 1.12E0, + + 0.0E0, 0.0E0, 0.0E0/ + DATA DT9Y/0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.04E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.04E0, -0.78E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.04E0, -0.78E0, 0.54E0, + + 0.08E0, 0.0E0, 0.0E0, 0.0E0, 0.5E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.04E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.7E0, + + -0.9E0, -0.12E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.64E0, -0.9E0, -0.30E0, 0.7E0, -0.18E0, 0.2E0, + + 0.28E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.04E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.7E0, -1.08E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.64E0, -1.26E0, + + 0.54E0, 0.20E0, 0.0E0, 0.0E0, 0.0E0, 0.5E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.04E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.04E0, -0.9E0, 0.18E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.04E0, -0.9E0, 0.18E0, 0.7E0, + + -0.18E0, 0.2E0, 0.16E0/ + DATA SSIZE2/0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, + + 1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, + + 1.17E0, 1.17E0, 1.17E0/ +* .. Executable Statements .. +* + DO 60 KI = 1, 4 + INCX = INCXS(KI) + INCY = INCYS(KI) + MX = ABS(INCX) + MY = ABS(INCY) +* + DO 40 KN = 1, 4 + N = NS(KN) + KSIZE = MIN(2,KN) + LENX = LENS(KN,MX) + LENY = LENS(KN,MY) +* + IF (ICASE.EQ.4) THEN +* .. SROTTEST .. 
+ DO 20 I = 1, 7 + SX(I) = DX1(I) + SY(I) = DY1(I) + STX(I) = DT9X(I,KN,KI) + STY(I) = DT9Y(I,KN,KI) + 20 CONTINUE + CALL SROTTEST(N,SX,INCX,SY,INCY,SC,SS) + CALL STEST(LENX,SX,STX,SSIZE2(1,KSIZE),SFAC) + CALL STEST(LENY,SY,STY,SSIZE2(1,KSIZE),SFAC) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK3' + STOP + END IF + 40 CONTINUE + 60 CONTINUE +* + MWPC(1) = 1 + DO 80 I = 2, 11 + MWPC(I) = 0 + 80 CONTINUE + MWPS(1) = 0 + DO 100 I = 2, 6 + MWPS(I) = 1 + 100 CONTINUE + DO 120 I = 7, 11 + MWPS(I) = -1 + 120 CONTINUE + MWPINX(1) = 1 + MWPINX(2) = 1 + MWPINX(3) = 1 + MWPINX(4) = -1 + MWPINX(5) = 1 + MWPINX(6) = -1 + MWPINX(7) = 1 + MWPINX(8) = 1 + MWPINX(9) = -1 + MWPINX(10) = 1 + MWPINX(11) = -1 + MWPINY(1) = 1 + MWPINY(2) = 1 + MWPINY(3) = -1 + MWPINY(4) = -1 + MWPINY(5) = 2 + MWPINY(6) = 1 + MWPINY(7) = 1 + MWPINY(8) = -1 + MWPINY(9) = -1 + MWPINY(10) = 2 + MWPINY(11) = 1 + DO 140 I = 1, 11 + MWPN(I) = 5 + 140 CONTINUE + MWPN(5) = 3 + MWPN(10) = 3 + DO 160 I = 1, 5 + MWPX(I) = I + MWPY(I) = I + MWPTX(1,I) = I + MWPTY(1,I) = I + MWPTX(2,I) = I + MWPTY(2,I) = -I + MWPTX(3,I) = 6 - I + MWPTY(3,I) = I - 6 + MWPTX(4,I) = I + MWPTY(4,I) = -I + MWPTX(6,I) = 6 - I + MWPTY(6,I) = I - 6 + MWPTX(7,I) = -I + MWPTY(7,I) = I + MWPTX(8,I) = I - 6 + MWPTY(8,I) = 6 - I + MWPTX(9,I) = -I + MWPTY(9,I) = I + MWPTX(11,I) = I - 6 + MWPTY(11,I) = 6 - I + 160 CONTINUE + MWPTX(5,1) = 1 + MWPTX(5,2) = 3 + MWPTX(5,3) = 5 + MWPTX(5,4) = 4 + MWPTX(5,5) = 5 + MWPTY(5,1) = -1 + MWPTY(5,2) = 2 + MWPTY(5,3) = -2 + MWPTY(5,4) = 4 + MWPTY(5,5) = -3 + MWPTX(10,1) = -1 + MWPTX(10,2) = -3 + MWPTX(10,3) = -5 + MWPTX(10,4) = 4 + MWPTX(10,5) = 5 + MWPTY(10,1) = 1 + MWPTY(10,2) = 2 + MWPTY(10,3) = 2 + MWPTY(10,4) = 4 + MWPTY(10,5) = 3 + DO 200 I = 1, 11 + INCX = MWPINX(I) + INCY = MWPINY(I) + DO 180 K = 1, 5 + COPYX(K) = MWPX(K) + COPYY(K) = MWPY(K) + MWPSTX(K) = MWPTX(I,K) + MWPSTY(K) = MWPTY(I,K) + 180 CONTINUE + CALL SROTTEST(MWPN(I),COPYX,INCX,COPYY,INCY,MWPC(I),MWPS(I)) + CALL STEST(5,COPYX,MWPSTX,MWPSTX,SFAC) + CALL STEST(5,COPYY,MWPSTY,MWPSTY,SFAC) + 200 CONTINUE + RETURN + END + SUBROUTINE STEST(LEN,SCOMP,STRUE,SSIZE,SFAC) +* ********************************* STEST ************************** +* +* THIS SUBR COMPARES ARRAYS SCOMP() AND STRUE() OF LENGTH LEN TO +* SEE IF THE TERM BY TERM DIFFERENCES, MULTIPLIED BY SFAC, ARE +* NEGLIGIBLE. +* +* C. L. LAWSON, JPL, 1974 DEC 10 +* +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + REAL SFAC + INTEGER LEN +* .. Array Arguments .. + REAL SCOMP(LEN), SSIZE(LEN), STRUE(LEN) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + REAL SD + INTEGER I +* .. External Functions .. + REAL SDIFF + EXTERNAL SDIFF +* .. Intrinsic Functions .. + INTRINSIC ABS +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Executable Statements .. +* + DO 40 I = 1, LEN + SD = SCOMP(I) - STRUE(I) + IF (SDIFF(ABS(SSIZE(I))+ABS(SFAC*SD),ABS(SSIZE(I))).EQ.0.0E0) + + GO TO 40 +* +* HERE SCOMP(I) IS NOT CLOSE TO STRUE(I). +* + IF ( .NOT. PASS) GO TO 20 +* PRINT FAIL MESSAGE AND HEADER. + PASS = .FALSE. 
+ WRITE (NOUT,99999) + WRITE (NOUT,99998) + 20 WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, I, SCOMP(I), + + STRUE(I), SD, SSIZE(I) + 40 CONTINUE + RETURN +* +99999 FORMAT (' FAIL') +99998 FORMAT (/' CASE N INCX INCY MODE I ', + + ' COMP(I) TRUE(I) DIFFERENCE', + + ' SIZE(I)',/1X) +99997 FORMAT (1X,I4,I3,3I5,I3,2E36.8,2E12.4) + END + SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) +* ************************* STEST1 ***************************** +* +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE +* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. +* +* C.L. LAWSON, JPL, 1978 DEC 6 +* +* .. Scalar Arguments .. + REAL SCOMP1, SFAC, STRUE1 +* .. Array Arguments .. + REAL SSIZE(*) +* .. Local Arrays .. + REAL SCOMP(1), STRUE(1) +* .. External Subroutines .. + EXTERNAL STEST +* .. Executable Statements .. +* + SCOMP(1) = SCOMP1 + STRUE(1) = STRUE1 + CALL STEST(1,SCOMP,STRUE,SSIZE,SFAC) +* + RETURN + END + REAL FUNCTION SDIFF(SA,SB) +* ********************************* SDIFF ************************** +* COMPUTES DIFFERENCE OF TWO NUMBERS. C. L. LAWSON, JPL 1974 FEB 15 +* +* .. Scalar Arguments .. + REAL SA, SB +* .. Executable Statements .. + SDIFF = SA - SB + RETURN + END + SUBROUTINE ITEST1(ICOMP,ITRUE) +* ********************************* ITEST1 ************************* +* +* THIS SUBROUTINE COMPARES THE VARIABLES ICOMP AND ITRUE FOR +* EQUALITY. +* C. L. LAWSON, JPL, 1974 DEC 10 +* +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + INTEGER ICOMP, ITRUE +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + INTEGER ID +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Executable Statements .. +* + IF (ICOMP.EQ.ITRUE) GO TO 40 +* +* HERE ICOMP IS NOT EQUAL TO ITRUE. +* + IF ( .NOT. PASS) GO TO 20 +* PRINT FAIL MESSAGE AND HEADER. + PASS = .FALSE. + WRITE (NOUT,99999) + WRITE (NOUT,99998) + 20 ID = ICOMP - ITRUE + WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, ICOMP, ITRUE, ID + 40 CONTINUE + RETURN +* +99999 FORMAT (' FAIL') +99998 FORMAT (/' CASE N INCX INCY MODE ', + + ' COMP TRUE DIFFERENCE', + + /1X) +99997 FORMAT (1X,I4,I3,3I5,2I36,I12) + END diff --git a/ctest/c_sblat2.f b/ctest/c_sblat2.f new file mode 100644 index 0000000000..bf6f3e454f --- /dev/null +++ b/ctest/c_sblat2.f @@ -0,0 +1,2907 @@ + PROGRAM SBLAT2 +* +* Test program for the REAL Level 2 Blas. +* +* The program must be driven by a short data file. The first 17 records +* of the file are read using list-directed input, the last 16 records +* are read using the format ( A12, L2 ). An annotated example of a data +* file can be obtained by deleting the first 3 characters from the +* following 33 lines: +* 'SBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE +* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +* F LOGICAL FLAG, T TO STOP ON FAILURES. +* T LOGICAL FLAG, T TO TEST ERROR EXITS. +* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH +* 16.0 THRESHOLD VALUE OF TEST RATIO +* 6 NUMBER OF VALUES OF N +* 0 1 2 3 5 9 VALUES OF N +* 4 NUMBER OF VALUES OF K +* 0 1 2 4 VALUES OF K +* 4 NUMBER OF VALUES OF INCX AND INCY +* 1 2 -1 -2 VALUES OF INCX AND INCY +* 3 NUMBER OF VALUES OF ALPHA +* 0.0 1.0 0.7 VALUES OF ALPHA +* 3 NUMBER OF VALUES OF BETA +* 0.0 1.0 0.9 VALUES OF BETA +* cblas_sgemv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_sgbmv T PUT F FOR NO TEST. 
SAME COLUMNS. +* cblas_ssymv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ssbmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_sspmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_strmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_stbmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_stpmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_strsv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_stbsv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_stpsv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_sger T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ssyr T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_sspr T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ssyr2 T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_sspr2 T PUT F FOR NO TEST. SAME COLUMNS. +* +* See: +* +* Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. +* An extended set of Fortran Basic Linear Algebra Subprograms. +* +* Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics +* and Computer Science Division, Argonne National Laboratory, +* 9700 South Cass Avenue, Argonne, Illinois 60439, US. +* +* Or +* +* NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms +* Group Ltd., NAG Central Office, 256 Banbury Road, Oxford +* OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st +* Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. +* +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + INTEGER NIN, NOUT + PARAMETER ( NIN = 5, NOUT = 6 ) + INTEGER NSUBS + PARAMETER ( NSUBS = 16 ) + REAL ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 ) + INTEGER NMAX, INCMAX + PARAMETER ( NMAX = 65, INCMAX = 2 ) + INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX + PARAMETER ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7, + $ NALMAX = 7, NBEMAX = 7 ) +* .. Local Scalars .. + REAL EPS, ERR, THRESH + INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NINC, NKB, + $ NTRA, LAYOUT + LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, + $ TSTERR, CORDER, RORDER + CHARACTER*1 TRANS + CHARACTER*12 SNAMET + CHARACTER*32 SNAPS +* .. Local Arrays .. + REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), + $ ALF( NALMAX ), AS( NMAX*NMAX ), BET( NBEMAX ), + $ G( NMAX ), X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( 2*NMAX ) + INTEGER IDIM( NIDMAX ), INC( NINMAX ), KB( NKBMAX ) + LOGICAL LTEST( NSUBS ) + CHARACTER*12 SNAMES( NSUBS ) +* .. External Functions .. + REAL SDIFF + LOGICAL LSE + EXTERNAL SDIFF, LSE +* .. External Subroutines .. + EXTERNAL SCHK1, SCHK2, SCHK3, SCHK4, SCHK5, SCHK6, + $ CS2CHKE, SMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK + CHARACTER*12 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK + COMMON /SRNAMC/SRNAMT +* .. Data statements .. + DATA SNAMES/'cblas_sgemv ', 'cblas_sgbmv ', + $ 'cblas_ssymv ','cblas_ssbmv ','cblas_sspmv ', + $ 'cblas_strmv ','cblas_stbmv ','cblas_stpmv ', + $ 'cblas_strsv ','cblas_stbsv ','cblas_stpsv ', + $ 'cblas_sger ','cblas_ssyr ','cblas_sspr ', + $ 'cblas_ssyr2 ','cblas_sspr2 '/ +* .. Executable Statements .. +* + NOUTC = NOUT +* +* Read name and unit number for snapshot output file and open file. +* + READ( NIN, FMT = * )SNAPS + READ( NIN, FMT = * )NTRA + TRACE = NTRA.GE.0 + IF( TRACE )THEN + OPEN( NTRA, FILE = SNAPS ) + END IF +* Read the flag that directs rewinding of the snapshot file. 
+ READ( NIN, FMT = * )REWI + REWI = REWI.AND.TRACE +* Read the flag that directs stopping on any failure. + READ( NIN, FMT = * )SFATAL +* Read the flag that indicates whether error exits are to be tested. + READ( NIN, FMT = * )TSTERR +* Read the flag that indicates whether row-major data layout to be tested. + READ( NIN, FMT = * )LAYOUT +* Read the threshold value of the test ratio + READ( NIN, FMT = * )THRESH +* +* Read and check the parameter values for the tests. +* +* Values of N + READ( NIN, FMT = * )NIDIM + IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN + WRITE( NOUT, FMT = 9997 )'N', NIDMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) + DO 10 I = 1, NIDIM + IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN + WRITE( NOUT, FMT = 9996 )NMAX + GO TO 230 + END IF + 10 CONTINUE +* Values of K + READ( NIN, FMT = * )NKB + IF( NKB.LT.1.OR.NKB.GT.NKBMAX )THEN + WRITE( NOUT, FMT = 9997 )'K', NKBMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( KB( I ), I = 1, NKB ) + DO 20 I = 1, NKB + IF( KB( I ).LT.0 )THEN + WRITE( NOUT, FMT = 9995 ) + GO TO 230 + END IF + 20 CONTINUE +* Values of INCX and INCY + READ( NIN, FMT = * )NINC + IF( NINC.LT.1.OR.NINC.GT.NINMAX )THEN + WRITE( NOUT, FMT = 9997 )'INCX AND INCY', NINMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( INC( I ), I = 1, NINC ) + DO 30 I = 1, NINC + IF( INC( I ).EQ.0.OR.ABS( INC( I ) ).GT.INCMAX )THEN + WRITE( NOUT, FMT = 9994 )INCMAX + GO TO 230 + END IF + 30 CONTINUE +* Values of ALPHA + READ( NIN, FMT = * )NALF + IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN + WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) +* Values of BETA + READ( NIN, FMT = * )NBET + IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN + WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) +* +* Report values of parameters. +* + WRITE( NOUT, FMT = 9993 ) + WRITE( NOUT, FMT = 9992 )( IDIM( I ), I = 1, NIDIM ) + WRITE( NOUT, FMT = 9991 )( KB( I ), I = 1, NKB ) + WRITE( NOUT, FMT = 9990 )( INC( I ), I = 1, NINC ) + WRITE( NOUT, FMT = 9989 )( ALF( I ), I = 1, NALF ) + WRITE( NOUT, FMT = 9988 )( BET( I ), I = 1, NBET ) + IF( .NOT.TSTERR )THEN + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9980 ) + END IF + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9999 )THRESH + WRITE( NOUT, FMT = * ) + + RORDER = .FALSE. + CORDER = .FALSE. + IF (LAYOUT.EQ.2) THEN + RORDER = .TRUE. + CORDER = .TRUE. + WRITE( *, FMT = 10002 ) + ELSE IF (LAYOUT.EQ.1) THEN + RORDER = .TRUE. + WRITE( *, FMT = 10001 ) + ELSE IF (LAYOUT.EQ.0) THEN + CORDER = .TRUE. + WRITE( *, FMT = 10000 ) + END IF + WRITE( *, FMT = * ) +* +* Read names of subroutines and flags which indicate +* whether they are to be tested. +* + DO 40 I = 1, NSUBS + LTEST( I ) = .FALSE. + 40 CONTINUE + 50 READ( NIN, FMT = 9984, END = 80 )SNAMET, LTESTT + DO 60 I = 1, NSUBS + IF( SNAMET.EQ.SNAMES( I ) ) + $ GO TO 70 + 60 CONTINUE + WRITE( NOUT, FMT = 9986 )SNAMET + STOP + 70 LTEST( I ) = LTESTT + GO TO 50 +* + 80 CONTINUE + CLOSE ( NIN ) +* +* Compute EPS (the machine precision). +* + EPS = ONE + 90 CONTINUE + IF( SDIFF( ONE + EPS, ONE ).EQ.ZERO ) + $ GO TO 100 + EPS = HALF*EPS + GO TO 90 + 100 CONTINUE + EPS = EPS + EPS + WRITE( NOUT, FMT = 9998 )EPS +* +* Check the reliability of SMVCH using exact data. 
+* + N = MIN( 32, NMAX ) + DO 120 J = 1, N + DO 110 I = 1, N + A( I, J ) = MAX( I - J + 1, 0 ) + 110 CONTINUE + X( J ) = J + Y( J ) = ZERO + 120 CONTINUE + DO 130 J = 1, N + YY( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 + 130 CONTINUE +* YY holds the exact result. On exit from SMVCH YT holds +* the result computed by SMVCH. + TRANS = 'N' + CALL SMVCH( TRANS, N, N, ONE, A, NMAX, X, 1, ZERO, Y, 1, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LSE( YY, YT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR + STOP + END IF + TRANS = 'T' + CALL SMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LSE( YY, YT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR + STOP + END IF +* +* Test each subroutine in turn. +* + DO 210 ISNUM = 1, NSUBS + WRITE( NOUT, FMT = * ) + IF( .NOT.LTEST( ISNUM ) )THEN +* Subprogram is not to be tested. + WRITE( NOUT, FMT = 9983 )SNAMES( ISNUM ) + ELSE + SRNAMT = SNAMES( ISNUM ) +* Test error exits. + IF( TSTERR )THEN + CALL CS2CHKE( SNAMES( ISNUM ) ) + WRITE( NOUT, FMT = * ) + END IF +* Test computations. + INFOT = 0 + OK = .TRUE. + FATAL = .FALSE. + GO TO ( 140, 140, 150, 150, 150, 160, 160, + $ 160, 160, 160, 160, 170, 180, 180, + $ 190, 190 )ISNUM +* Test SGEMV, 01, and SGBMV, 02. + 140 IF (CORDER) THEN + CALL SCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G, 0 ) + END IF + IF (RORDER) THEN + CALL SCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G, 1 ) + END IF + GO TO 200 +* Test SSYMV, 03, SSBMV, 04, and SSPMV, 05. + 150 IF (CORDER) THEN + CALL SCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G, 0 ) + END IF + IF (RORDER) THEN + CALL SCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G, 1 ) + END IF + GO TO 200 +* Test STRMV, 06, STBMV, 07, STPMV, 08, +* STRSV, 09, STBSV, 10, and STPSV, 11. + 160 IF (CORDER) THEN + CALL SCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z, + $ 0 ) + END IF + IF (RORDER) THEN + CALL SCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z, + $ 1 ) + END IF + GO TO 200 +* Test SGER, 12. + 170 IF (CORDER) THEN + CALL SCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 0 ) + END IF + IF (RORDER) THEN + CALL SCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 1 ) + END IF + GO TO 200 +* Test SSYR, 13, and SSPR, 14. 
+ 180 IF (CORDER) THEN + CALL SCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 0 ) + END IF + IF (RORDER) THEN + CALL SCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 1 ) + END IF + GO TO 200 +* Test SSYR2, 15, and SSPR2, 16. + 190 IF (CORDER) THEN + CALL SCHK6( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 0 ) + END IF + IF (RORDER) THEN + CALL SCHK6( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 1 ) + END IF +* + 200 IF( FATAL.AND.SFATAL ) + $ GO TO 220 + END IF + 210 CONTINUE + WRITE( NOUT, FMT = 9982 ) + GO TO 240 +* + 220 CONTINUE + WRITE( NOUT, FMT = 9981 ) + GO TO 240 +* + 230 CONTINUE + WRITE( NOUT, FMT = 9987 ) +* + 240 CONTINUE + IF( TRACE ) + $ CLOSE ( NTRA ) + CLOSE ( NOUT ) + STOP +* +10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) +10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' ) +10000 FORMAT( ' COLUMN-MAJOR DATA LAYOUT IS TESTED' ) + 9999 FORMAT( ' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', + $ 'S THAN', F8.2 ) + 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, E9.1 ) + 9997 FORMAT( ' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', + $ 'THAN ', I2 ) + 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) + 9995 FORMAT( ' VALUE OF K IS LESS THAN 0' ) + 9994 FORMAT( ' ABSOLUTE VALUE OF INCX OR INCY IS 0 OR GREATER THAN ', + $ I2 ) + 9993 FORMAT( ' TESTS OF THE REAL LEVEL 2 BLAS', //' THE F', + $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) + 9992 FORMAT( ' FOR N ', 9I6 ) + 9991 FORMAT( ' FOR K ', 7I6 ) + 9990 FORMAT( ' FOR INCX AND INCY ', 7I6 ) + 9989 FORMAT( ' FOR ALPHA ', 7F6.1 ) + 9988 FORMAT( ' FOR BETA ', 7F6.1 ) + 9987 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', + $ /' ******* TESTS ABANDONED *******' ) + 9986 FORMAT( ' SUBPROGRAM NAME ',A12, ' NOT RECOGNIZED', /' ******* T', + $ 'ESTS ABANDONED *******' ) + 9985 FORMAT( ' ERROR IN SMVCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', + $ 'ATED WRONGLY.', /' SMVCH WAS CALLED WITH TRANS = ', A1, + $ ' AND RETURNED SAME = ', L1, ' AND ERR = ', F12.3, '.', / + $ ' THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.' + $ , /' ******* TESTS ABANDONED *******' ) + 9984 FORMAT(A12, L2 ) + 9983 FORMAT( 1X,A12, ' WAS NOT TESTED' ) + 9982 FORMAT( /' END OF TESTS' ) + 9981 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) + 9980 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) +* +* End of SBLAT2. +* + END + SUBROUTINE SCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, + $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, + $ XS, Y, YY, YS, YT, G, IORDER ) +* +* Tests SGEMV and SGBMV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + REAL ZERO, HALF + PARAMETER ( ZERO = 0.0, HALF = 0.5 ) +* .. Scalar Arguments .. 
+ REAL EPS, THRESH + INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, + $ NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), BET( NBET ), G( NMAX ), + $ X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + REAL ALPHA, ALS, BETA, BLS, ERR, ERRMAX, TRANSL + INTEGER I, IA, IB, IC, IKU, IM, IN, INCX, INCXS, INCY, + $ INCYS, IX, IY, KL, KLS, KU, KUS, LAA, LDA, + $ LDAS, LX, LY, M, ML, MS, N, NARGS, NC, ND, NK, + $ NL, NS + LOGICAL BANDED, FULL, NULL, RESET, SAME, TRAN + CHARACTER*1 TRANS, TRANSS + CHARACTER*14 CTRANS + CHARACTER*3 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. + EXTERNAL CSGBMV, CSGEMV, SMAKE, SMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICH/'NTC'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'e' + BANDED = SNAME( 9: 9 ).EQ.'b' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 11 + ELSE IF( BANDED )THEN + NARGS = 13 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 120 IN = 1, NIDIM + N = IDIM( IN ) + ND = N/2 + 1 +* + DO 110 IM = 1, 2 + IF( IM.EQ.1 ) + $ M = MAX( N - ND, 0 ) + IF( IM.EQ.2 ) + $ M = MIN( N + ND, NMAX ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IKU = 1, NK + IF( BANDED )THEN + KU = KB( IKU ) + KL = MAX( KU - 1, 0 ) + ELSE + KU = N - 1 + KL = M - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = KL + KU + 1 + ELSE + LDA = M + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + LAA = LDA*N + NULL = N.LE.0.OR.M.LE.0 +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL SMAKE( SNAME( 8: 9 ), ' ', ' ', M, N, A, NMAX, AA, + $ LDA, KL, KU, RESET, TRANSL ) +* + DO 90 IC = 1, 3 + TRANS = ICH( IC: IC ) + IF (TRANS.EQ.'N')THEN + CTRANS = ' CblasNoTrans' + ELSE IF (TRANS.EQ.'T')THEN + CTRANS = ' CblasTrans' + ELSE + CTRANS = 'CblasConjTrans' + END IF + TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' +* + IF( TRAN )THEN + ML = N + NL = M + ELSE + ML = M + NL = N + END IF +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*NL +* +* Generate the vector X. +* + TRANSL = HALF + CALL SMAKE( 'ge', ' ', ' ', 1, NL, X, 1, XX, + $ ABS( INCX ), 0, NL - 1, RESET, TRANSL ) + IF( NL.GT.1 )THEN + X( NL/2 ) = ZERO + XX( 1 + ABS( INCX )*( NL/2 - 1 ) ) = ZERO + END IF +* + DO 70 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*ML +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL SMAKE( 'ge', ' ', ' ', 1, ML, Y, 1, + $ YY, ABS( INCY ), 0, ML - 1, + $ RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + TRANSS = TRANS + MS = M + NS = N + KLS = KL + KUS = KU + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + BLS = BETA + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. 
+* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ CTRANS, M, N, ALPHA, LDA, INCX, + $ BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL CSGEMV( IORDER, TRANS, M, N, + $ ALPHA, AA, LDA, XX, INCX, + $ BETA, YY, INCY ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ CTRANS, M, N, KL, KU, ALPHA, LDA, + $ INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL CSGBMV( IORDER, TRANS, M, N, KL, + $ KU, ALPHA, AA, LDA, XX, + $ INCX, BETA, YY, INCY ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 130 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = TRANS.EQ.TRANSS + ISAME( 2 ) = MS.EQ.M + ISAME( 3 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 4 ) = ALS.EQ.ALPHA + ISAME( 5 ) = LSE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + ISAME( 7 ) = LSE( XS, XX, LX ) + ISAME( 8 ) = INCXS.EQ.INCX + ISAME( 9 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 10 ) = LSE( YS, YY, LY ) + ELSE + ISAME( 10 ) = LSERES( 'ge', ' ', 1, + $ ML, YS, YY, + $ ABS( INCY ) ) + END IF + ISAME( 11 ) = INCYS.EQ.INCY + ELSE IF( BANDED )THEN + ISAME( 4 ) = KLS.EQ.KL + ISAME( 5 ) = KUS.EQ.KU + ISAME( 6 ) = ALS.EQ.ALPHA + ISAME( 7 ) = LSE( AS, AA, LAA ) + ISAME( 8 ) = LDAS.EQ.LDA + ISAME( 9 ) = LSE( XS, XX, LX ) + ISAME( 10 ) = INCXS.EQ.INCX + ISAME( 11 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 12 ) = LSE( YS, YY, LY ) + ELSE + ISAME( 12 ) = LSERES( 'ge', ' ', 1, + $ ML, YS, YY, + $ ABS( INCY ) ) + END IF + ISAME( 13 ) = INCYS.EQ.INCY + END IF +* +* If data was incorrectly changed, report +* and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 130 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL SMVCH( TRANS, M, N, ALPHA, A, + $ NMAX, X, INCX, BETA, Y, + $ INCY, YT, G, YY, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 130 + ELSE +* Avoid repeating tests with M.le.0 or +* N.le.0. + GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 140 +* + 130 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CTRANS, M, N, ALPHA, LDA, + $ INCX, BETA, INCY + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, CTRANS, M, N, KL, KU, + $ ALPHA, LDA, INCX, BETA, INCY + END IF +* + 140 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', 4( I3, ',' ), F4.1, + $ ', A,', I3, ',',/ 10x, 'X,', I2, ',', F4.1, ', Y,', + $ I2, ') .' ) + 9994 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', 2( I3, ',' ), F4.1, + $ ', A,', I3, ', X,', I2, ',', F4.1, ', Y,', I2, + $ ') .' ) + 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK1. +* + END + SUBROUTINE SCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, + $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, + $ XS, Y, YY, YS, YT, G, IORDER ) +* +* Tests SSYMV, SSBMV and SSPMV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + REAL ZERO, HALF + PARAMETER ( ZERO = 0.0, HALF = 0.5 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, + $ NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), BET( NBET ), G( NMAX ), + $ X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + REAL ALPHA, ALS, BETA, BLS, ERR, ERRMAX, TRANSL + INTEGER I, IA, IB, IC, IK, IN, INCX, INCXS, INCY, + $ INCYS, IX, IY, K, KS, LAA, LDA, LDAS, LX, LY, + $ N, NARGS, NC, NK, NS + LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME + CHARACTER*1 UPLO, UPLOS + CHARACTER*14 CUPLO + CHARACTER*2 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. + EXTERNAL SMAKE, SMVCH, CSSBMV, CSSPMV, CSSYMV +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. 
+ DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'y' + BANDED = SNAME( 9: 9 ).EQ.'b' + PACKED = SNAME( 9: 9 ).EQ.'p' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 10 + ELSE IF( BANDED )THEN + NARGS = 11 + ELSE IF( PACKED )THEN + NARGS = 9 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 110 IN = 1, NIDIM + N = IDIM( IN ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IK = 1, NK + IF( BANDED )THEN + K = KB( IK ) + ELSE + K = N - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = K + 1 + ELSE + LDA = N + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF + NULL = N.LE.0 +* + DO 90 IC = 1, 2 + UPLO = ICH( IC: IC ) + IF (UPLO.EQ.'U')THEN + CUPLO = ' CblasUpper' + ELSE + CUPLO = ' CblasLower' + END IF +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL SMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, A, NMAX, AA, + $ LDA, K, K, RESET, TRANSL ) +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL SMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, + $ ABS( INCX ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 70 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL SMAKE( 'ge', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, + $ TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + UPLOS = UPLO + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + BLS = BETA + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ CUPLO, N, ALPHA, LDA, INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL CSSYMV( IORDER, UPLO, N, ALPHA, AA, + $ LDA, XX, INCX, BETA, YY, INCY ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ CUPLO, N, K, ALPHA, LDA, INCX, BETA, + $ INCY + IF( REWI ) + $ REWIND NTRA + CALL CSSBMV( IORDER, UPLO, N, K, ALPHA, + $ AA, LDA, XX, INCX, BETA, YY, + $ INCY ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ CUPLO, N, ALPHA, INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL CSSPMV( IORDER, UPLO, N, ALPHA, AA, + $ XX, INCX, BETA, YY, INCY ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. 
+* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LSE( AS, AA, LAA ) + ISAME( 5 ) = LDAS.EQ.LDA + ISAME( 6 ) = LSE( XS, XX, LX ) + ISAME( 7 ) = INCXS.EQ.INCX + ISAME( 8 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 9 ) = LSE( YS, YY, LY ) + ELSE + ISAME( 9 ) = LSERES( 'ge', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 10 ) = INCYS.EQ.INCY + ELSE IF( BANDED )THEN + ISAME( 3 ) = KS.EQ.K + ISAME( 4 ) = ALS.EQ.ALPHA + ISAME( 5 ) = LSE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + ISAME( 7 ) = LSE( XS, XX, LX ) + ISAME( 8 ) = INCXS.EQ.INCX + ISAME( 9 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 10 ) = LSE( YS, YY, LY ) + ELSE + ISAME( 10 ) = LSERES( 'ge', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 11 ) = INCYS.EQ.INCY + ELSE IF( PACKED )THEN + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LSE( AS, AA, LAA ) + ISAME( 5 ) = LSE( XS, XX, LX ) + ISAME( 6 ) = INCXS.EQ.INCX + ISAME( 7 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 8 ) = LSE( YS, YY, LY ) + ELSE + ISAME( 8 ) = LSERES( 'ge', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 9 ) = INCYS.EQ.INCY + END IF +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL SMVCH( 'N', N, N, ALPHA, A, NMAX, X, + $ INCX, BETA, Y, INCY, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 120 + ELSE +* Avoid repeating tests with N.le.0 + GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, N, ALPHA, LDA, + $ INCX, BETA, INCY + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, N, K, ALPHA, LDA, + $ INCX, BETA, INCY + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, CUPLO, N, ALPHA, INCX, + $ BETA, INCY + END IF +* + 130 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', AP', + $ ', X,', I2, ',', F4.1, ', Y,', I2, ') .' ) + 9994 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', 2( I3, ',' ), F4.1, + $ ', A,', I3, ', X,', I2, ',', F4.1, ', Y,', I2, + $ ') .' ) + 9993 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', A,', + $ I3, ', X,', I2, ',', F4.1, ', Y,', I2, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK2. +* + END + SUBROUTINE SCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, XT, G, Z, IORDER ) +* +* Tests STRMV, STBMV, STPMV, STRSV, STBSV and STPSV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + REAL ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NIDIM, NINC, NKB, NMAX, NOUT, NTRA, + $ IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), + $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), + $ XS( NMAX*INCMAX ), XT( NMAX ), + $ XX( NMAX*INCMAX ), Z( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + REAL ERR, ERRMAX, TRANSL + INTEGER I, ICD, ICT, ICU, IK, IN, INCX, INCXS, IX, K, + $ KS, LAA, LDA, LDAS, LX, N, NARGS, NC, NK, NS + LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME + CHARACTER*1 DIAG, DIAGS, TRANS, TRANSS, UPLO, UPLOS + CHARACTER*14 CUPLO,CTRANS,CDIAG + CHARACTER*2 ICHD, ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. + EXTERNAL SMAKE, SMVCH, CSTBMV, CSTBSV, CSTPMV, + $ CSTPSV, CSTRMV, CSTRSV +* .. Intrinsic Functions .. 
+ INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'r' + BANDED = SNAME( 9: 9 ).EQ.'b' + PACKED = SNAME( 9: 9 ).EQ.'p' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 8 + ELSE IF( BANDED )THEN + NARGS = 9 + ELSE IF( PACKED )THEN + NARGS = 7 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* Set up zero vector for SMVCH. + DO 10 I = 1, NMAX + Z( I ) = ZERO + 10 CONTINUE +* + DO 110 IN = 1, NIDIM + N = IDIM( IN ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IK = 1, NK + IF( BANDED )THEN + K = KB( IK ) + ELSE + K = N - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = K + 1 + ELSE + LDA = N + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF + NULL = N.LE.0 +* + DO 90 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + IF (UPLO.EQ.'U')THEN + CUPLO = ' CblasUpper' + ELSE + CUPLO = ' CblasLower' + END IF +* + DO 80 ICT = 1, 3 + TRANS = ICHT( ICT: ICT ) + IF (TRANS.EQ.'N')THEN + CTRANS = ' CblasNoTrans' + ELSE IF (TRANS.EQ.'T')THEN + CTRANS = ' CblasTrans' + ELSE + CTRANS = 'CblasConjTrans' + END IF +* + DO 70 ICD = 1, 2 + DIAG = ICHD( ICD: ICD ) + IF (DIAG.EQ.'N')THEN + CDIAG = ' CblasNonUnit' + ELSE + CDIAG = ' CblasUnit' + END IF +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL SMAKE( SNAME( 8: 9 ), UPLO, DIAG, N, N, A, + $ NMAX, AA, LDA, K, K, RESET, TRANSL ) +* + DO 60 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL SMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, + $ ABS( INCX ), 0, N - 1, RESET, + $ TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + DIAGS = DIAG + NS = N + KS = K + DO 20 I = 1, LAA + AS( I ) = AA( I ) + 20 CONTINUE + LDAS = LDA + DO 30 I = 1, LX + XS( I ) = XX( I ) + 30 CONTINUE + INCXS = INCX +* +* Call the subroutine. 
+* + IF( SNAME( 10: 11 ).EQ.'mv' )THEN + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CSTRMV( IORDER, UPLO, TRANS, DIAG, + $ N, AA, LDA, XX, INCX ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, K, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CSTBMV( IORDER, UPLO, TRANS, DIAG, + $ N, K, AA, LDA, XX, INCX ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, INCX + IF( REWI ) + $ REWIND NTRA + CALL CSTPMV( IORDER, UPLO, TRANS, DIAG, + $ N, AA, XX, INCX ) + END IF + ELSE IF( SNAME( 10: 11 ).EQ.'sv' )THEN + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CSTRSV( IORDER, UPLO, TRANS, DIAG, + $ N, AA, LDA, XX, INCX ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, K, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CSTBSV( IORDER, UPLO, TRANS, DIAG, + $ N, K, AA, LDA, XX, INCX ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, INCX + IF( REWI ) + $ REWIND NTRA + CALL CSTPSV( IORDER, UPLO, TRANS, DIAG, + $ N, AA, XX, INCX ) + END IF + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = TRANS.EQ.TRANSS + ISAME( 3 ) = DIAG.EQ.DIAGS + ISAME( 4 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 5 ) = LSE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 7 ) = LSE( XS, XX, LX ) + ELSE + ISAME( 7 ) = LSERES( 'ge', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 8 ) = INCXS.EQ.INCX + ELSE IF( BANDED )THEN + ISAME( 5 ) = KS.EQ.K + ISAME( 6 ) = LSE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 8 ) = LSE( XS, XX, LX ) + ELSE + ISAME( 8 ) = LSERES( 'ge', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 9 ) = INCXS.EQ.INCX + ELSE IF( PACKED )THEN + ISAME( 5 ) = LSE( AS, AA, LAA ) + IF( NULL )THEN + ISAME( 6 ) = LSE( XS, XX, LX ) + ELSE + ISAME( 6 ) = LSERES( 'ge', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 7 ) = INCXS.EQ.INCX + END IF +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN + IF( SNAME( 10: 11 ).EQ.'mv' )THEN +* +* Check the result. +* + CALL SMVCH( TRANS, N, N, ONE, A, NMAX, X, + $ INCX, ZERO, Z, INCX, XT, G, + $ XX, EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + ELSE IF( SNAME( 10: 11 ).EQ.'sv' )THEN +* +* Compute approximation to original vector. +* + DO 50 I = 1, N + Z( I ) = XX( 1 + ( I - 1 )* + $ ABS( INCX ) ) + XX( 1 + ( I - 1 )*ABS( INCX ) ) + $ = X( I ) + 50 CONTINUE + CALL SMVCH( TRANS, N, N, ONE, A, NMAX, Z, + $ INCX, ZERO, X, INCX, XT, G, + $ XX, EPS, ERR, FATAL, NOUT, + $ .FALSE. ) + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 120 + ELSE +* Avoid repeating tests with N.le.0. + GO TO 110 + END IF +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, CTRANS, CDIAG, N, + $ LDA, INCX + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, CTRANS, CDIAG, N, + $ K, LDA, INCX + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, CUPLO, CTRANS, CDIAG, N, + $ INCX + END IF +* + 130 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ',A12, '(', 3( A14,',' ),/ 10x, I3, ', AP, ', + $ 'X,', I2, ') .' ) + 9994 FORMAT( 1X, I6, ': ',A12, '(', 3( A14,',' ),/ 10x, 2( I3, ',' ), + $ ' A,', I3, ', X,', I2, ') .' ) + 9993 FORMAT( 1X, I6, ': ',A12, '(', 3( A14,',' ),/ 10x, I3, ', A,', + $ I3, ', X,', I2, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK3. +* + END + SUBROUTINE SCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z, IORDER ) +* +* Tests SGER. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + REAL ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA, + $ IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), + $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), + $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + REAL ALPHA, ALS, ERR, ERRMAX, TRANSL + INTEGER I, IA, IM, IN, INCX, INCXS, INCY, INCYS, IX, + $ IY, J, LAA, LDA, LDAS, LX, LY, M, MS, N, NARGS, + $ NC, ND, NS + LOGICAL NULL, RESET, SAME +* .. Local Arrays .. + REAL W( 1 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. + EXTERNAL CSGER, SMAKE, SMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Executable Statements .. +* Define the number of arguments. 
+ NARGS = 9 +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 120 IN = 1, NIDIM + N = IDIM( IN ) + ND = N/2 + 1 +* + DO 110 IM = 1, 2 + IF( IM.EQ.1 ) + $ M = MAX( N - ND, 0 ) + IF( IM.EQ.2 ) + $ M = MIN( N + ND, NMAX ) +* +* Set LDA to 1 more than minimum value if room. + LDA = M + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 110 + LAA = LDA*N + NULL = N.LE.0.OR.M.LE.0 +* + DO 100 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*M +* +* Generate the vector X. +* + TRANSL = HALF + CALL SMAKE( 'ge', ' ', ' ', 1, M, X, 1, XX, ABS( INCX ), + $ 0, M - 1, RESET, TRANSL ) + IF( M.GT.1 )THEN + X( M/2 ) = ZERO + XX( 1 + ABS( INCX )*( M/2 - 1 ) ) = ZERO + END IF +* + DO 90 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL SMAKE( 'ge', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + Y( N/2 ) = ZERO + YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 80 IA = 1, NALF + ALPHA = ALF( IA ) +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL SMAKE( SNAME( 8: 9 ), ' ', ' ', M, N, A, NMAX, + $ AA, LDA, M - 1, N - 1, RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + MS = M + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, M, N, + $ ALPHA, INCX, INCY, LDA + IF( REWI ) + $ REWIND NTRA + CALL CSGER( IORDER, M, N, ALPHA, XX, INCX, YY, + $ INCY, AA, LDA ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 140 + END IF +* +* See what data changed inside subroutine. +* + ISAME( 1 ) = MS.EQ.M + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LSE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + ISAME( 6 ) = LSE( YS, YY, LY ) + ISAME( 7 ) = INCYS.EQ.INCY + IF( NULL )THEN + ISAME( 8 ) = LSE( AS, AA, LAA ) + ELSE + ISAME( 8 ) = LSERES( 'ge', ' ', M, N, AS, AA, + $ LDA ) + END IF + ISAME( 9 ) = LDAS.EQ.LDA +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 140 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( INCX.GT.0 )THEN + DO 50 I = 1, M + Z( I ) = X( I ) + 50 CONTINUE + ELSE + DO 60 I = 1, M + Z( I ) = X( M - I + 1 ) + 60 CONTINUE + END IF + DO 70 J = 1, N + IF( INCY.GT.0 )THEN + W( 1 ) = Y( J ) + ELSE + W( 1 ) = Y( N - J + 1 ) + END IF + CALL SMVCH( 'N', M, 1, ALPHA, Z, NMAX, W, 1, + $ ONE, A( 1, J ), 1, YT, G, + $ AA( 1 + ( J - 1 )*LDA ), EPS, + $ ERR, FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 130 + 70 CONTINUE + ELSE +* Avoid repeating tests with M.le.0 or N.le.0. + GO TO 110 + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 150 +* + 130 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 140 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9994 )NC, SNAME, M, N, ALPHA, INCX, INCY, LDA +* + 150 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ',A12, '(', 2( I3, ',' ), F4.1, ', X,', I2, + $ ', Y,', I2, ', A,', I3, ') .' ) + 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK4. +* + END + SUBROUTINE SCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z, IORDER ) +* +* Tests SSYR and SSPR. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + REAL ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA, + $ IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), + $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), + $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + REAL ALPHA, ALS, ERR, ERRMAX, TRANSL + INTEGER I, IA, IC, IN, INCX, INCXS, IX, J, JA, JJ, LAA, + $ LDA, LDAS, LJ, LX, N, NARGS, NC, NS + LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER + CHARACTER*1 UPLO, UPLOS + CHARACTER*14 CUPLO + CHARACTER*2 ICH +* .. Local Arrays .. + REAL W( 1 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. + EXTERNAL SMAKE, SMVCH, CSSPR, CSSYR +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'y' + PACKED = SNAME( 9: 9 ).EQ.'p' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 7 + ELSE IF( PACKED )THEN + NARGS = 6 + END IF +* + NC = 0 + RESET = .TRUE. 
+ ERRMAX = ZERO +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDA to 1 more than minimum value if room. + LDA = N + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF +* + DO 90 IC = 1, 2 + UPLO = ICH( IC: IC ) + IF (UPLO.EQ.'U')THEN + CUPLO = ' CblasUpper' + ELSE + CUPLO = ' CblasLower' + END IF + UPPER = UPLO.EQ.'U' +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL SMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), + $ 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 70 IA = 1, NALF + ALPHA = ALF( IA ) + NULL = N.LE.0.OR.ALPHA.EQ.ZERO +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL SMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, A, NMAX, + $ AA, LDA, N - 1, N - 1, RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, CUPLO, N, + $ ALPHA, INCX, LDA + IF( REWI ) + $ REWIND NTRA + CALL CSSYR( IORDER, UPLO, N, ALPHA, XX, INCX, + $ AA, LDA ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, CUPLO, N, + $ ALPHA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CSSPR( IORDER, UPLO, N, ALPHA, XX, INCX, AA ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LSE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + IF( NULL )THEN + ISAME( 6 ) = LSE( AS, AA, LAA ) + ELSE + ISAME( 6 ) = LSERES( SNAME( 8: 9 ), UPLO, N, N, AS, + $ AA, LDA ) + END IF + IF( .NOT.PACKED )THEN + ISAME( 7 ) = LDAS.EQ.LDA + END IF +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 30 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 30 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( INCX.GT.0 )THEN + DO 40 I = 1, N + Z( I ) = X( I ) + 40 CONTINUE + ELSE + DO 50 I = 1, N + Z( I ) = X( N - I + 1 ) + 50 CONTINUE + END IF + JA = 1 + DO 60 J = 1, N + W( 1 ) = Z( J ) + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + CALL SMVCH( 'N', LJ, 1, ALPHA, Z( JJ ), LJ, W, + $ 1, ONE, A( JJ, J ), 1, YT, G, + $ AA( JA ), EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + IF( FULL )THEN + IF( UPPER )THEN + JA = JA + LDA + ELSE + JA = JA + LDA + 1 + END IF + ELSE + JA = JA + LJ + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 110 + 60 CONTINUE + ELSE +* Avoid repeating tests if N.le.0. + IF( N.LE.0 ) + $ GO TO 100 + END IF +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 110 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, N, ALPHA, INCX, LDA + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, N, ALPHA, INCX + END IF +* + 130 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', X,', + $ I2, ', AP) .' ) + 9993 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', X,', + $ I2, ', A,', I3, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK5. +* + END + SUBROUTINE SCHK6( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z, IORDER ) +* +* Tests SSYR2 and SSPR2. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + REAL ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA, + $ IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), + $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), + $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX, 2 ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + REAL ALPHA, ALS, ERR, ERRMAX, TRANSL + INTEGER I, IA, IC, IN, INCX, INCXS, INCY, INCYS, IX, + $ IY, J, JA, JJ, LAA, LDA, LDAS, LJ, LX, LY, N, + $ NARGS, NC, NS + LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER + CHARACTER*1 UPLO, UPLOS + CHARACTER*14 CUPLO + CHARACTER*2 ICH +* .. Local Arrays .. + REAL W( 2 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. + EXTERNAL SMAKE, SMVCH, CSSPR2, CSSYR2 +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. 
+ FULL = SNAME( 9: 9 ).EQ.'y' + PACKED = SNAME( 9: 9 ).EQ.'p' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 9 + ELSE IF( PACKED )THEN + NARGS = 8 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 140 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDA to 1 more than minimum value if room. + LDA = N + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 140 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF +* + DO 130 IC = 1, 2 + UPLO = ICH( IC: IC ) + IF (UPLO.EQ.'U')THEN + CUPLO = ' CblasUpper' + ELSE + CUPLO = ' CblasLower' + END IF + UPPER = UPLO.EQ.'U' +* + DO 120 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL SMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), + $ 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 110 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL SMAKE( 'ge', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + Y( N/2 ) = ZERO + YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 100 IA = 1, NALF + ALPHA = ALF( IA ) + NULL = N.LE.0.OR.ALPHA.EQ.ZERO +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL SMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, A, + $ NMAX, AA, LDA, N - 1, N - 1, RESET, + $ TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, CUPLO, N, + $ ALPHA, INCX, INCY, LDA + IF( REWI ) + $ REWIND NTRA + CALL CSSYR2( IORDER, UPLO, N, ALPHA, XX, INCX, + $ YY, INCY, AA, LDA ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, CUPLO, N, + $ ALPHA, INCX, INCY + IF( REWI ) + $ REWIND NTRA + CALL CSSPR2( IORDER, UPLO, N, ALPHA, XX, INCX, + $ YY, INCY, AA ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 160 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LSE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + ISAME( 6 ) = LSE( YS, YY, LY ) + ISAME( 7 ) = INCYS.EQ.INCY + IF( NULL )THEN + ISAME( 8 ) = LSE( AS, AA, LAA ) + ELSE + ISAME( 8 ) = LSERES( SNAME( 8: 9 ), UPLO, N, N, + $ AS, AA, LDA ) + END IF + IF( .NOT.PACKED )THEN + ISAME( 9 ) = LDAS.EQ.LDA + END IF +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 160 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. 
+* + IF( INCX.GT.0 )THEN + DO 50 I = 1, N + Z( I, 1 ) = X( I ) + 50 CONTINUE + ELSE + DO 60 I = 1, N + Z( I, 1 ) = X( N - I + 1 ) + 60 CONTINUE + END IF + IF( INCY.GT.0 )THEN + DO 70 I = 1, N + Z( I, 2 ) = Y( I ) + 70 CONTINUE + ELSE + DO 80 I = 1, N + Z( I, 2 ) = Y( N - I + 1 ) + 80 CONTINUE + END IF + JA = 1 + DO 90 J = 1, N + W( 1 ) = Z( J, 2 ) + W( 2 ) = Z( J, 1 ) + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + CALL SMVCH( 'N', LJ, 2, ALPHA, Z( JJ, 1 ), + $ NMAX, W, 1, ONE, A( JJ, J ), 1, + $ YT, G, AA( JA ), EPS, ERR, FATAL, + $ NOUT, .TRUE. ) + IF( FULL )THEN + IF( UPPER )THEN + JA = JA + LDA + ELSE + JA = JA + LDA + 1 + END IF + ELSE + JA = JA + LJ + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 150 + 90 CONTINUE + ELSE +* Avoid repeating tests with N.le.0. + IF( N.LE.0 ) + $ GO TO 140 + END IF +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* + 140 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 170 +* + 150 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 160 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, N, ALPHA, INCX, + $ INCY, LDA + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, N, ALPHA, INCX, INCY + END IF +* + 170 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', X,', + $ I2, ', Y,', I2, ', AP) .' ) + 9993 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', X,', + $ I2, ', Y,', I2, ', A,', I3, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK6. +* + END + SUBROUTINE SMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, KL, + $ KU, RESET, TRANSL ) +* +* Generates values for an M by N matrix A within the bandwidth +* defined by KL and KU. +* Stores the values in the array AA in the data structure required +* by the routine, with unwanted elements set to rogue value. +* +* TYPE is 'ge', 'gb', 'sy', 'sb', 'sp', 'tr', 'tb' OR 'tp'. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. 
+ REAL ZERO, ONE + PARAMETER ( ZERO = 0.0, ONE = 1.0 ) + REAL ROGUE + PARAMETER ( ROGUE = -1.0E10 ) +* .. Scalar Arguments .. + REAL TRANSL + INTEGER KL, KU, LDA, M, N, NMAX + LOGICAL RESET + CHARACTER*1 DIAG, UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + REAL A( NMAX, * ), AA( * ) +* .. Local Scalars .. + INTEGER I, I1, I2, I3, IBEG, IEND, IOFF, J, KK + LOGICAL GEN, LOWER, SYM, TRI, UNIT, UPPER +* .. External Functions .. + REAL SBEG + EXTERNAL SBEG +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. Executable Statements .. + GEN = TYPE( 1: 1 ).EQ.'g' + SYM = TYPE( 1: 1 ).EQ.'s' + TRI = TYPE( 1: 1 ).EQ.'t' + UPPER = ( SYM.OR.TRI ).AND.UPLO.EQ.'U' + LOWER = ( SYM.OR.TRI ).AND.UPLO.EQ.'L' + UNIT = TRI.AND.DIAG.EQ.'U' +* +* Generate data in array A. +* + DO 20 J = 1, N + DO 10 I = 1, M + IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) + $ THEN + IF( ( I.LE.J.AND.J - I.LE.KU ).OR. + $ ( I.GE.J.AND.I - J.LE.KL ) )THEN + A( I, J ) = SBEG( RESET ) + TRANSL + ELSE + A( I, J ) = ZERO + END IF + IF( I.NE.J )THEN + IF( SYM )THEN + A( J, I ) = A( I, J ) + ELSE IF( TRI )THEN + A( J, I ) = ZERO + END IF + END IF + END IF + 10 CONTINUE + IF( TRI ) + $ A( J, J ) = A( J, J ) + ONE + IF( UNIT ) + $ A( J, J ) = ONE + 20 CONTINUE +* +* Store elements in array AS in data structure required by routine. +* + IF( TYPE.EQ.'ge' )THEN + DO 50 J = 1, N + DO 30 I = 1, M + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 30 CONTINUE + DO 40 I = M + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 40 CONTINUE + 50 CONTINUE + ELSE IF( TYPE.EQ.'gb' )THEN + DO 90 J = 1, N + DO 60 I1 = 1, KU + 1 - J + AA( I1 + ( J - 1 )*LDA ) = ROGUE + 60 CONTINUE + DO 70 I2 = I1, MIN( KL + KU + 1, KU + 1 + M - J ) + AA( I2 + ( J - 1 )*LDA ) = A( I2 + J - KU - 1, J ) + 70 CONTINUE + DO 80 I3 = I2, LDA + AA( I3 + ( J - 1 )*LDA ) = ROGUE + 80 CONTINUE + 90 CONTINUE + ELSE IF( TYPE.EQ.'sy'.OR.TYPE.EQ.'tr' )THEN + DO 130 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IF( UNIT )THEN + IEND = J - 1 + ELSE + IEND = J + END IF + ELSE + IF( UNIT )THEN + IBEG = J + 1 + ELSE + IBEG = J + END IF + IEND = N + END IF + DO 100 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 100 CONTINUE + DO 110 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 110 CONTINUE + DO 120 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 120 CONTINUE + 130 CONTINUE + ELSE IF( TYPE.EQ.'sb'.OR.TYPE.EQ.'tb' )THEN + DO 170 J = 1, N + IF( UPPER )THEN + KK = KL + 1 + IBEG = MAX( 1, KL + 2 - J ) + IF( UNIT )THEN + IEND = KL + ELSE + IEND = KL + 1 + END IF + ELSE + KK = 1 + IF( UNIT )THEN + IBEG = 2 + ELSE + IBEG = 1 + END IF + IEND = MIN( KL + 1, 1 + M - J ) + END IF + DO 140 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 140 CONTINUE + DO 150 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I + J - KK, J ) + 150 CONTINUE + DO 160 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 160 CONTINUE + 170 CONTINUE + ELSE IF( TYPE.EQ.'sp'.OR.TYPE.EQ.'tp' )THEN + IOFF = 0 + DO 190 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 180 I = IBEG, IEND + IOFF = IOFF + 1 + AA( IOFF ) = A( I, J ) + IF( I.EQ.J )THEN + IF( UNIT ) + $ AA( IOFF ) = ROGUE + END IF + 180 CONTINUE + 190 CONTINUE + END IF + RETURN +* +* End of SMAKE. +* + END + SUBROUTINE SMVCH( TRANS, M, N, ALPHA, A, NMAX, X, INCX, BETA, Y, + $ INCY, YT, G, YY, EPS, ERR, FATAL, NOUT, MV ) +* +* Checks the results of the computational tests. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. 
+* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0, ONE = 1.0 ) +* .. Scalar Arguments .. + REAL ALPHA, BETA, EPS, ERR + INTEGER INCX, INCY, M, N, NMAX, NOUT + LOGICAL FATAL, MV + CHARACTER*1 TRANS +* .. Array Arguments .. + REAL A( NMAX, * ), G( * ), X( * ), Y( * ), YT( * ), + $ YY( * ) +* .. Local Scalars .. + REAL ERRI + INTEGER I, INCXL, INCYL, IY, J, JX, KX, KY, ML, NL + LOGICAL TRAN +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, SQRT +* .. Executable Statements .. + TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' + IF( TRAN )THEN + ML = N + NL = M + ELSE + ML = M + NL = N + END IF + IF( INCX.LT.0 )THEN + KX = NL + INCXL = -1 + ELSE + KX = 1 + INCXL = 1 + END IF + IF( INCY.LT.0 )THEN + KY = ML + INCYL = -1 + ELSE + KY = 1 + INCYL = 1 + END IF +* +* Compute expected result in YT using data in A, X and Y. +* Compute gauges in G. +* + IY = KY + DO 30 I = 1, ML + YT( IY ) = ZERO + G( IY ) = ZERO + JX = KX + IF( TRAN )THEN + DO 10 J = 1, NL + YT( IY ) = YT( IY ) + A( J, I )*X( JX ) + G( IY ) = G( IY ) + ABS( A( J, I )*X( JX ) ) + JX = JX + INCXL + 10 CONTINUE + ELSE + DO 20 J = 1, NL + YT( IY ) = YT( IY ) + A( I, J )*X( JX ) + G( IY ) = G( IY ) + ABS( A( I, J )*X( JX ) ) + JX = JX + INCXL + 20 CONTINUE + END IF + YT( IY ) = ALPHA*YT( IY ) + BETA*Y( IY ) + G( IY ) = ABS( ALPHA )*G( IY ) + ABS( BETA*Y( IY ) ) + IY = IY + INCYL + 30 CONTINUE +* +* Compute the error ratio for this result. +* + ERR = ZERO + DO 40 I = 1, ML + ERRI = ABS( YT( I ) - YY( 1 + ( I - 1 )*ABS( INCY ) ) )/EPS + IF( G( I ).NE.ZERO ) + $ ERRI = ERRI/G( I ) + ERR = MAX( ERR, ERRI ) + IF( ERR*SQRT( EPS ).GE.ONE ) + $ GO TO 50 + 40 CONTINUE +* If the loop completes, all results are at least half accurate. + GO TO 70 +* +* Report fatal error. +* + 50 FATAL = .TRUE. + WRITE( NOUT, FMT = 9999 ) + DO 60 I = 1, ML + IF( MV )THEN + WRITE( NOUT, FMT = 9998 )I, YT( I ), + $ YY( 1 + ( I - 1 )*ABS( INCY ) ) + ELSE + WRITE( NOUT, FMT = 9998 )I, + $ YY( 1 + ( I - 1 )*ABS( INCY ) ), YT(I) + END IF + 60 CONTINUE +* + 70 CONTINUE + RETURN +* + 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', + $ 'F ACCURATE *******', /' EXPECTED RESULT COMPU', + $ 'TED RESULT' ) + 9998 FORMAT( 1X, I7, 2G18.6 ) +* +* End of SMVCH. +* + END + LOGICAL FUNCTION LSE( RI, RJ, LR ) +* +* Tests if two arrays are identical. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER LR +* .. Array Arguments .. + REAL RI( * ), RJ( * ) +* .. Local Scalars .. + INTEGER I +* .. Executable Statements .. + DO 10 I = 1, LR + IF( RI( I ).NE.RJ( I ) ) + $ GO TO 20 + 10 CONTINUE + LSE = .TRUE. + GO TO 30 + 20 CONTINUE + LSE = .FALSE. + 30 RETURN +* +* End of LSE. +* + END + LOGICAL FUNCTION LSERES( TYPE, UPLO, M, N, AA, AS, LDA ) +* +* Tests if selected elements in two arrays are equal. +* +* TYPE is 'ge', 'sy' or 'sp'. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER LDA, M, N + CHARACTER*1 UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + REAL AA( LDA, * ), AS( LDA, * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J + LOGICAL UPPER +* .. Executable Statements .. 
+ UPPER = UPLO.EQ.'U' + IF( TYPE.EQ.'ge' )THEN + DO 20 J = 1, N + DO 10 I = M + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 10 CONTINUE + 20 CONTINUE + ELSE IF( TYPE.EQ.'sy' )THEN + DO 50 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 30 I = 1, IBEG - 1 + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 30 CONTINUE + DO 40 I = IEND + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 40 CONTINUE + 50 CONTINUE + END IF +* + 60 CONTINUE + LSERES = .TRUE. + GO TO 80 + 70 CONTINUE + LSERES = .FALSE. + 80 RETURN +* +* End of LSERES. +* + END + REAL FUNCTION SBEG( RESET ) +* +* Generates random numbers uniformly distributed between -0.5 and 0.5. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + LOGICAL RESET +* .. Local Scalars .. + INTEGER I, IC, MI +* .. Save statement .. + SAVE I, IC, MI +* .. Intrinsic Functions .. + INTRINSIC REAL +* .. Executable Statements .. + IF( RESET )THEN +* Initialize local variables. + MI = 891 + I = 7 + IC = 0 + RESET = .FALSE. + END IF +* +* The sequence of values of I is bounded between 1 and 999. +* If initial I = 1,2,3,6,7 or 9, the period will be 50. +* If initial I = 4 or 8, the period will be 25. +* If initial I = 5, the period will be 10. +* IC is used to break up the period by skipping 1 value of I in 6. +* + IC = IC + 1 + 10 I = I*MI + I = I - 1000*( I/1000 ) + IF( IC.GE.5 )THEN + IC = 0 + GO TO 10 + END IF + SBEG = REAL( I - 500 )/1001.0 + RETURN +* +* End of SBEG. +* + END + REAL FUNCTION SDIFF( X, Y ) +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* +* .. Scalar Arguments .. + REAL X, Y +* .. Executable Statements .. + SDIFF = X - Y + RETURN +* +* End of SDIFF. +* + END diff --git a/ctest/c_sblat3.f b/ctest/c_sblat3.f new file mode 100644 index 0000000000..948fd6ed17 --- /dev/null +++ b/ctest/c_sblat3.f @@ -0,0 +1,2479 @@ + PROGRAM SBLAT3 +* +* Test program for the REAL Level 3 Blas. +* +* The program must be driven by a short data file. The first 13 records +* of the file are read using list-directed input, the last 6 records +* are read using the format ( A12, L2 ). An annotated example of a data +* file can be obtained by deleting the first 3 characters from the +* following 19 lines: +* 'SBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +* F LOGICAL FLAG, T TO STOP ON FAILURES. +* T LOGICAL FLAG, T TO TEST ERROR EXITS. +* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH +* 16.0 THRESHOLD VALUE OF TEST RATIO +* 6 NUMBER OF VALUES OF N +* 0 1 2 3 5 9 VALUES OF N +* 3 NUMBER OF VALUES OF ALPHA +* 0.0 1.0 0.7 VALUES OF ALPHA +* 3 NUMBER OF VALUES OF BETA +* 0.0 1.0 1.3 VALUES OF BETA +* cblas_sgemm T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ssymm T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_strmm T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_strsm T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ssyrk T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ssyr2k T PUT F FOR NO TEST. SAME COLUMNS. +* +* See: +* +* Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. +* A Set of Level 3 Basic Linear Algebra Subprograms. 
+* +* Technical Memorandum No.88 (Revision 1), Mathematics and +* Computer Science Division, Argonne National Laboratory, 9700 +* South Cass Avenue, Argonne, Illinois 60439, US. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + INTEGER NIN, NOUT + PARAMETER ( NIN = 5, NOUT = 6 ) + INTEGER NSUBS + PARAMETER ( NSUBS = 6 ) + REAL ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 ) + INTEGER NMAX + PARAMETER ( NMAX = 65 ) + INTEGER NIDMAX, NALMAX, NBEMAX + PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) +* .. Local Scalars .. + REAL EPS, ERR, THRESH + INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NTRA, + $ LAYOUT + LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, + $ TSTERR, CORDER, RORDER + CHARACTER*1 TRANSA, TRANSB + CHARACTER*12 SNAMET + CHARACTER*32 SNAPS +* .. Local Arrays .. + REAL AA( NMAX*NMAX ), AB( NMAX, 2*NMAX ), + $ ALF( NALMAX ), AS( NMAX*NMAX ), + $ BB( NMAX*NMAX ), BET( NBEMAX ), + $ BS( NMAX*NMAX ), C( NMAX, NMAX ), + $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), + $ G( NMAX ), W( 2*NMAX ) + INTEGER IDIM( NIDMAX ) + LOGICAL LTEST( NSUBS ) + CHARACTER*12 SNAMES( NSUBS ) +* .. External Functions .. + REAL SDIFF + LOGICAL LSE + EXTERNAL SDIFF, LSE +* .. External Subroutines .. + EXTERNAL SCHK1, SCHK2, SCHK3, SCHK4, SCHK5, CS3CHKE, + $ SMMCH +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK + CHARACTER*12 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK + COMMON /SRNAMC/SRNAMT +* .. Data statements .. + DATA SNAMES/'cblas_sgemm ', 'cblas_ssymm ', + $ 'cblas_strmm ', 'cblas_strsm ','cblas_ssyrk ', + $ 'cblas_ssyr2k'/ +* .. Executable Statements .. +* + NOUTC = NOUT +* Read name and unit number for summary output file and open file. +* + READ( NIN, FMT = * )SNAPS + READ( NIN, FMT = * )NTRA + TRACE = NTRA.GE.0 + IF( TRACE )THEN +* OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + OPEN( NTRA, FILE = SNAPS ) + END IF +* Read the flag that directs rewinding of the snapshot file. + READ( NIN, FMT = * )REWI + REWI = REWI.AND.TRACE +* Read the flag that directs stopping on any failure. + READ( NIN, FMT = * )SFATAL +* Read the flag that indicates whether error exits are to be tested. + READ( NIN, FMT = * )TSTERR +* Read the flag that indicates whether row-major data layout to be tested. + READ( NIN, FMT = * )LAYOUT +* Read the threshold value of the test ratio + READ( NIN, FMT = * )THRESH +* +* Read and check the parameter values for the tests. +* +* Values of N + READ( NIN, FMT = * )NIDIM + IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN + WRITE( NOUT, FMT = 9997 )'N', NIDMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) + DO 10 I = 1, NIDIM + IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN + WRITE( NOUT, FMT = 9996 )NMAX + GO TO 220 + END IF + 10 CONTINUE +* Values of ALPHA + READ( NIN, FMT = * )NALF + IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN + WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) +* Values of BETA + READ( NIN, FMT = * )NBET + IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN + WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) +* +* Report values of parameters. 
+* + WRITE( NOUT, FMT = 9995 ) + WRITE( NOUT, FMT = 9994 )( IDIM( I ), I = 1, NIDIM ) + WRITE( NOUT, FMT = 9993 )( ALF( I ), I = 1, NALF ) + WRITE( NOUT, FMT = 9992 )( BET( I ), I = 1, NBET ) + IF( .NOT.TSTERR )THEN + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9984 ) + END IF + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9999 )THRESH + WRITE( NOUT, FMT = * ) + + RORDER = .FALSE. + CORDER = .FALSE. + IF (LAYOUT.EQ.2) THEN + RORDER = .TRUE. + CORDER = .TRUE. + WRITE( *, FMT = 10002 ) + ELSE IF (LAYOUT.EQ.1) THEN + RORDER = .TRUE. + WRITE( *, FMT = 10001 ) + ELSE IF (LAYOUT.EQ.0) THEN + CORDER = .TRUE. + WRITE( *, FMT = 10000 ) + END IF + WRITE( *, FMT = * ) + +* +* Read names of subroutines and flags which indicate +* whether they are to be tested. +* + DO 20 I = 1, NSUBS + LTEST( I ) = .FALSE. + 20 CONTINUE + 30 READ( NIN, FMT = 9988, END = 60 )SNAMET, LTESTT + DO 40 I = 1, NSUBS + IF( SNAMET.EQ.SNAMES( I ) ) + $ GO TO 50 + 40 CONTINUE + WRITE( NOUT, FMT = 9990 )SNAMET + STOP + 50 LTEST( I ) = LTESTT + GO TO 30 +* + 60 CONTINUE + CLOSE ( NIN ) +* +* Compute EPS (the machine precision). +* + EPS = ONE + 70 CONTINUE + IF( SDIFF( ONE + EPS, ONE ).EQ.ZERO ) + $ GO TO 80 + EPS = HALF*EPS + GO TO 70 + 80 CONTINUE + EPS = EPS + EPS + WRITE( NOUT, FMT = 9998 )EPS +* +* Check the reliability of SMMCH using exact data. +* + N = MIN( 32, NMAX ) + DO 100 J = 1, N + DO 90 I = 1, N + AB( I, J ) = MAX( I - J + 1, 0 ) + 90 CONTINUE + AB( J, NMAX + 1 ) = J + AB( 1, NMAX + J ) = J + C( J, 1 ) = ZERO + 100 CONTINUE + DO 110 J = 1, N + CC( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 + 110 CONTINUE +* CC holds the exact result. On exit from SMMCH CT holds +* the result computed by SMMCH. + TRANSA = 'N' + TRANSB = 'N' + CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LSE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + TRANSB = 'T' + CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LSE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + DO 120 J = 1, N + AB( J, NMAX + 1 ) = N - J + 1 + AB( 1, NMAX + J ) = N - J + 1 + 120 CONTINUE + DO 130 J = 1, N + CC( N - J + 1 ) = J*( ( J + 1 )*J )/2 - + $ ( ( J + 1 )*J*( J - 1 ) )/3 + 130 CONTINUE + TRANSA = 'T' + TRANSB = 'N' + CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LSE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + TRANSB = 'T' + CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LSE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF +* +* Test each subroutine in turn. +* + DO 200 ISNUM = 1, NSUBS + WRITE( NOUT, FMT = * ) + IF( .NOT.LTEST( ISNUM ) )THEN +* Subprogram is not to be tested. + WRITE( NOUT, FMT = 9987 )SNAMES( ISNUM ) + ELSE + SRNAMT = SNAMES( ISNUM ) +* Test error exits. + IF( TSTERR )THEN + CALL CS3CHKE( SNAMES( ISNUM ) ) + WRITE( NOUT, FMT = * ) + END IF +* Test computations. + INFOT = 0 + OK = .TRUE. + FATAL = .FALSE. 
+ GO TO ( 140, 150, 160, 160, 170, 180 )ISNUM +* Test SGEMM, 01. + 140 IF (CORDER) THEN + CALL SCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 0 ) + END IF + IF (RORDER) THEN + CALL SCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 1 ) + END IF + GO TO 190 +* Test SSYMM, 02. + 150 IF (CORDER) THEN + CALL SCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 0 ) + END IF + IF (RORDER) THEN + CALL SCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 1 ) + END IF + GO TO 190 +* Test STRMM, 03, STRSM, 04. + 160 IF (CORDER) THEN + CALL SCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, + $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C, + $ 0 ) + END IF + IF (RORDER) THEN + CALL SCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, + $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C, + $ 1 ) + END IF + GO TO 190 +* Test SSYRK, 05. + 170 IF (CORDER) THEN + CALL SCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 0 ) + END IF + IF (RORDER) THEN + CALL SCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 1 ) + END IF + GO TO 190 +* Test SSYR2K, 06. 
+ 180 IF (CORDER) THEN + CALL SCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, + $ 0 ) + END IF + IF (RORDER) THEN + CALL SCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, + $ 1 ) + END IF + GO TO 190 +* + 190 IF( FATAL.AND.SFATAL ) + $ GO TO 210 + END IF + 200 CONTINUE + WRITE( NOUT, FMT = 9986 ) + GO TO 230 +* + 210 CONTINUE + WRITE( NOUT, FMT = 9985 ) + GO TO 230 +* + 220 CONTINUE + WRITE( NOUT, FMT = 9991 ) +* + 230 CONTINUE + IF( TRACE ) + $ CLOSE ( NTRA ) + CLOSE ( NOUT ) + STOP +* +10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) +10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' ) +10000 FORMAT( ' COLUMN-MAJOR DATA LAYOUT IS TESTED' ) + 9999 FORMAT( ' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', + $ 'S THAN', F8.2 ) + 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, E9.1 ) + 9997 FORMAT( ' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', + $ 'THAN ', I2 ) + 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) + 9995 FORMAT( ' TESTS OF THE REAL LEVEL 3 BLAS', //' THE F', + $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) + 9994 FORMAT( ' FOR N ', 9I6 ) + 9993 FORMAT( ' FOR ALPHA ', 7F6.1 ) + 9992 FORMAT( ' FOR BETA ', 7F6.1 ) + 9991 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', + $ /' ******* TESTS ABANDONED *******' ) + 9990 FORMAT( ' SUBPROGRAM NAME ', A12,' NOT RECOGNIZED', /' ******* ', + $ 'TESTS ABANDONED *******' ) + 9989 FORMAT( ' ERROR IN SMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', + $ 'ATED WRONGLY.', /' SMMCH WAS CALLED WITH TRANSA = ', A1, + $ ' AND TRANSB = ', A1, /' AND RETURNED SAME = ', L1, ' AND ', + $ 'ERR = ', F12.3, '.', /' THIS MAY BE DUE TO FAULTS IN THE ', + $ 'ARITHMETIC OR THE COMPILER.', /' ******* TESTS ABANDONED ', + $ '*******' ) + 9988 FORMAT( A12,L2 ) + 9987 FORMAT( 1X, A12,' WAS NOT TESTED' ) + 9986 FORMAT( /' END OF TESTS' ) + 9985 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) + 9984 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) +* +* End of SBLAT3. +* + END + SUBROUTINE SCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, + $ IORDER ) +* +* Tests SGEMM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + REAL ZERO + PARAMETER ( ZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ), G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + REAL ALPHA, ALS, BETA, BLS, ERR, ERRMAX + INTEGER I, IA, IB, ICA, ICB, IK, IM, IN, K, KS, LAA, + $ LBB, LCC, LDA, LDAS, LDB, LDBS, LDC, LDCS, M, + $ MA, MB, MS, N, NA, NARGS, NB, NC, NS + LOGICAL NULL, RESET, SAME, TRANA, TRANB + CHARACTER*1 TRANAS, TRANBS, TRANSA, TRANSB + CHARACTER*3 ICH +* .. Local Arrays .. 
+ LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. + EXTERNAL CSGEMM, SMAKE, SMMCH +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICH/'NTC'/ +* .. Executable Statements .. +* + NARGS = 13 + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 110 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = M + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 100 + LCC = LDC*N + NULL = N.LE.0.OR.M.LE.0 +* + DO 90 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 80 ICA = 1, 3 + TRANSA = ICH( ICA: ICA ) + TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' +* + IF( TRANA )THEN + MA = K + NA = M + ELSE + MA = M + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* +* Generate the matrix A. +* + CALL SMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 70 ICB = 1, 3 + TRANSB = ICH( ICB: ICB ) + TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' +* + IF( TRANB )THEN + MB = N + NB = K + ELSE + MB = K + NB = N + END IF +* Set LDB to 1 more than minimum value if room. + LDB = MB + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 70 + LBB = LDB*NB +* +* Generate the matrix B. +* + CALL SMAKE( 'GE', ' ', ' ', MB, NB, B, NMAX, BB, + $ LDB, RESET, ZERO ) +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL SMAKE( 'GE', ' ', ' ', M, N, C, NMAX, + $ CC, LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + TRANAS = TRANSA + TRANBS = TRANSB + MS = M + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BLS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ CALL SPRCN1(NTRA, NC, SNAME, IORDER, + $ TRANSA, TRANSB, M, N, K, ALPHA, LDA, + $ LDB, BETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CSGEMM( IORDER, TRANSA, TRANSB, M, N, + $ K, ALPHA, AA, LDA, BB, LDB, + $ BETA, CC, LDC ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = TRANSA.EQ.TRANAS + ISAME( 2 ) = TRANSB.EQ.TRANBS + ISAME( 3 ) = MS.EQ.M + ISAME( 4 ) = NS.EQ.N + ISAME( 5 ) = KS.EQ.K + ISAME( 6 ) = ALS.EQ.ALPHA + ISAME( 7 ) = LSE( AS, AA, LAA ) + ISAME( 8 ) = LDAS.EQ.LDA + ISAME( 9 ) = LSE( BS, BB, LBB ) + ISAME( 10 ) = LDBS.EQ.LDB + ISAME( 11 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 12 ) = LSE( CS, CC, LCC ) + ELSE + ISAME( 12 ) = LSERES( 'GE', ' ', M, N, CS, + $ CC, LDC ) + END IF + ISAME( 13 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report +* and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I+1 + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. 
+* + CALL SMMCH( TRANSA, TRANSB, M, N, K, + $ ALPHA, A, NMAX, B, NMAX, BETA, + $ C, NMAX, CT, G, CC, LDC, EPS, + $ ERR, FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 120 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + CALL SPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB, + $ M, N, K, ALPHA, LDA, LDB, BETA, LDC) +* + 130 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A12,'(''', A1, ''',''', A1, ''',', + $ 3( I3, ',' ), F4.1, ', A,', I3, ', B,', I3, ',', F4.1, ', ', + $ 'C,', I3, ').' ) + 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK1. +* + END +* +* +* + SUBROUTINE SPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB, M, N, + $ K, ALPHA, LDA, LDB, BETA, LDC) + INTEGER NOUT, NC, IORDER, M, N, K, LDA, LDB, LDC + REAL ALPHA, BETA + CHARACTER*1 TRANSA, TRANSB + CHARACTER*12 SNAME + CHARACTER*14 CRC, CTA,CTB + + IF (TRANSA.EQ.'N')THEN + CTA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CTA = ' CblasTrans' + ELSE + CTA = 'CblasConjTrans' + END IF + IF (TRANSB.EQ.'N')THEN + CTB = ' CblasNoTrans' + ELSE IF (TRANSB.EQ.'T')THEN + CTB = ' CblasTrans' + ELSE + CTB = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CTA,CTB + WRITE(NOUT, FMT = 9994)M, N, K, ALPHA, LDA, LDB, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', A14, ',', A14, ',', A14, ',') + 9994 FORMAT( 20X, 3( I3, ',' ), F4.1, ', A,', I3, ', B,', I3, ',', + $ F4.1, ', ', 'C,', I3, ').' ) + END +* + SUBROUTINE SCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, + $ IORDER ) +* +* Tests SSYMM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + REAL ZERO + PARAMETER ( ZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. 
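SCHK1 above exercises SGEMM through the tester's CSGEMM wrapper once per storage order and then recomputes the product inside SMMCH. A hedged standalone sketch of the corresponding direct CBLAS call, assuming the standard cblas_sgemm prototype from the bundled cblas.h; the operands are small enough to check by hand:

    #include <stdio.h>
    #include "cblas.h"

    /* C = alpha*A*B + beta*C for 2x2 operands, issued in both storage
     * orders.  Row-major A = [[1,2],[3,4]], B = [[5,6],[7,8]], so the
     * first call should leave C = 19 22 43 50.  Reinterpreted as
     * column-major the same arrays hold the transposed matrices, so
     * the second call computes (B*A)^T instead.                       */
    int main(void)
    {
        float A[] = {1, 2, 3, 4};
        float B[] = {5, 6, 7, 8};
        float C[] = {0, 0, 0, 0};

        cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                    2, 2, 2, 1.0f, A, 2, B, 2, 0.0f, C, 2);
        printf("row-major    C = %g %g %g %g\n", C[0], C[1], C[2], C[3]);

        cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
                    2, 2, 2, 1.0f, A, 2, B, 2, 0.0f, C, 2);
        printf("column-major C = %g %g %g %g\n", C[0], C[1], C[2], C[3]);
        return 0;
    }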
+ REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ), G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + REAL ALPHA, ALS, BETA, BLS, ERR, ERRMAX + INTEGER I, IA, IB, ICS, ICU, IM, IN, LAA, LBB, LCC, + $ LDA, LDAS, LDB, LDBS, LDC, LDCS, M, MS, N, NA, + $ NARGS, NC, NS + LOGICAL LEFT, NULL, RESET, SAME + CHARACTER*1 SIDE, SIDES, UPLO, UPLOS + CHARACTER*2 ICHS, ICHU +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. + EXTERNAL SMAKE, SMMCH, CSSYMM +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICHS/'LR'/, ICHU/'UL'/ +* .. Executable Statements .. +* + NARGS = 12 + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 100 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 90 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = M + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 90 + LCC = LDC*N + NULL = N.LE.0.OR.M.LE.0 +* +* Set LDB to 1 more than minimum value if room. + LDB = M + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 90 + LBB = LDB*N +* +* Generate the matrix B. +* + CALL SMAKE( 'GE', ' ', ' ', M, N, B, NMAX, BB, LDB, RESET, + $ ZERO ) +* + DO 80 ICS = 1, 2 + SIDE = ICHS( ICS: ICS ) + LEFT = SIDE.EQ.'L' +* + IF( LEFT )THEN + NA = M + ELSE + NA = N + END IF +* Set LDA to 1 more than minimum value if room. + LDA = NA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* + DO 70 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* +* Generate the symmetric matrix A. +* + CALL SMAKE( 'SY', UPLO, ' ', NA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL SMAKE( 'GE', ' ', ' ', M, N, C, NMAX, CC, + $ LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + SIDES = SIDE + UPLOS = UPLO + MS = M + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BLS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ CALL SPRCN2(NTRA, NC, SNAME, IORDER, + $ SIDE, UPLO, M, N, ALPHA, LDA, LDB, + $ BETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CSSYMM( IORDER, SIDE, UPLO, M, N, ALPHA, + $ AA, LDA, BB, LDB, BETA, CC, LDC ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 110 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = SIDES.EQ.SIDE + ISAME( 2 ) = UPLOS.EQ.UPLO + ISAME( 3 ) = MS.EQ.M + ISAME( 4 ) = NS.EQ.N + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LSE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = LSE( BS, BB, LBB ) + ISAME( 9 ) = LDBS.EQ.LDB + ISAME( 10 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 11 ) = LSE( CS, CC, LCC ) + ELSE + ISAME( 11 ) = LSERES( 'GE', ' ', M, N, CS, + $ CC, LDC ) + END IF + ISAME( 12 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. 
+* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I+1 + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 110 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + IF( LEFT )THEN + CALL SMMCH( 'N', 'N', M, N, M, ALPHA, A, + $ NMAX, B, NMAX, BETA, C, NMAX, + $ CT, G, CC, LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL SMMCH( 'N', 'N', M, N, N, ALPHA, B, + $ NMAX, A, NMAX, BETA, C, NMAX, + $ CT, G, CC, LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 120 +* + 110 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + CALL SPRCN2(NOUT, NC, SNAME, IORDER, SIDE, UPLO, M, N, ALPHA, LDA, + $ LDB, BETA, LDC) +* + 120 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ F4.1, ', A,', I3, ', B,', I3, ',', F4.1, ', C,', I3, ') ', + $ ' .' ) + 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK2. +* + END +* + SUBROUTINE SPRCN2(NOUT, NC, SNAME, IORDER, SIDE, UPLO, M, N, + $ ALPHA, LDA, LDB, BETA, LDC) + INTEGER NOUT, NC, IORDER, M, N, LDA, LDB, LDC + REAL ALPHA, BETA + CHARACTER*1 SIDE, UPLO + CHARACTER*12 SNAME + CHARACTER*14 CRC, CS,CU + + IF (SIDE.EQ.'L')THEN + CS = ' CblasLeft' + ELSE + CS = ' CblasRight' + END IF + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU + WRITE(NOUT, FMT = 9994)M, N, ALPHA, LDA, LDB, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', A14, ',', A14, ',', A14, ',') + 9994 FORMAT( 20X, 2( I3, ',' ), F4.1, ', A,', I3, ', B,', I3, ',', + $ F4.1, ', ', 'C,', I3, ').' ) + END +* + SUBROUTINE SCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NMAX, A, AA, AS, + $ B, BB, BS, CT, G, C, IORDER ) +* +* Tests STRMM and STRSM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. 
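SCHK2 above depends on SMAKE filling the triangle of AA that SSYMM must not reference with the rogue value, so a routine that touches the wrong half of the symmetric matrix produces a wildly wrong product. A hedged standalone sketch of that property, assuming the standard cblas_ssymm prototype from the bundled cblas.h:

    #include <stdio.h>
    #include "cblas.h"

    /* Only the triangle named by Uplo is read.  The strictly lower
     * entry of A is deliberate garbage, yet multiplying the symmetric
     * matrix [[1,2],[2,3]] by the identity must return 1 2 2 3.      */
    int main(void)
    {
        float A[] = {1.0f, 2.0f, -1.0e10f, 3.0f};   /* row-major, upper valid */
        float B[] = {1.0f, 0.0f, 0.0f, 1.0f};       /* identity */
        float C[] = {0.0f, 0.0f, 0.0f, 0.0f};

        cblas_ssymm(CblasRowMajor, CblasLeft, CblasUpper, 2, 2,
                    1.0f, A, 2, B, 2, 0.0f, C, 2);
        printf("C = %g %g %g %g\n", C[0], C[1], C[2], C[3]);
        return 0;
    }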
+ REAL ZERO, ONE + PARAMETER ( ZERO = 0.0, ONE = 1.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CT( NMAX ), G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + REAL ALPHA, ALS, ERR, ERRMAX + INTEGER I, IA, ICD, ICS, ICT, ICU, IM, IN, J, LAA, LBB, + $ LDA, LDAS, LDB, LDBS, M, MS, N, NA, NARGS, NC, + $ NS + LOGICAL LEFT, NULL, RESET, SAME + CHARACTER*1 DIAG, DIAGS, SIDE, SIDES, TRANAS, TRANSA, UPLO, + $ UPLOS + CHARACTER*2 ICHD, ICHS, ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. + EXTERNAL SMAKE, SMMCH, CSTRMM, CSTRSM +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/, ICHS/'LR'/ +* .. Executable Statements .. +* + NARGS = 11 + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* Set up zero matrix for SMMCH. + DO 20 J = 1, NMAX + DO 10 I = 1, NMAX + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE +* + DO 140 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 130 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDB to 1 more than minimum value if room. + LDB = M + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 130 + LBB = LDB*N + NULL = M.LE.0.OR.N.LE.0 +* + DO 120 ICS = 1, 2 + SIDE = ICHS( ICS: ICS ) + LEFT = SIDE.EQ.'L' + IF( LEFT )THEN + NA = M + ELSE + NA = N + END IF +* Set LDA to 1 more than minimum value if room. + LDA = NA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 130 + LAA = LDA*NA +* + DO 110 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* + DO 100 ICT = 1, 3 + TRANSA = ICHT( ICT: ICT ) +* + DO 90 ICD = 1, 2 + DIAG = ICHD( ICD: ICD ) +* + DO 80 IA = 1, NALF + ALPHA = ALF( IA ) +* +* Generate the matrix A. +* + CALL SMAKE( 'TR', UPLO, DIAG, NA, NA, A, + $ NMAX, AA, LDA, RESET, ZERO ) +* +* Generate the matrix B. +* + CALL SMAKE( 'GE', ' ', ' ', M, N, B, NMAX, + $ BB, LDB, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + SIDES = SIDE + UPLOS = UPLO + TRANAS = TRANSA + DIAGS = DIAG + MS = M + NS = N + ALS = ALPHA + DO 30 I = 1, LAA + AS( I ) = AA( I ) + 30 CONTINUE + LDAS = LDA + DO 40 I = 1, LBB + BS( I ) = BB( I ) + 40 CONTINUE + LDBS = LDB +* +* Call the subroutine. +* + IF( SNAME( 10: 11 ).EQ.'mm' )THEN + IF( TRACE ) + $ CALL SPRCN3( NTRA, NC, SNAME, IORDER, + $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, + $ LDA, LDB) + IF( REWI ) + $ REWIND NTRA + CALL CSTRMM( IORDER, SIDE, UPLO, TRANSA, + $ DIAG, M, N, ALPHA, AA, LDA, + $ BB, LDB ) + ELSE IF( SNAME( 10: 11 ).EQ.'sm' )THEN + IF( TRACE ) + $ CALL SPRCN3( NTRA, NC, SNAME, IORDER, + $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, + $ LDA, LDB) + IF( REWI ) + $ REWIND NTRA + CALL CSTRSM( IORDER, SIDE, UPLO, TRANSA, + $ DIAG, M, N, ALPHA, AA, LDA, + $ BB, LDB ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 150 + END IF +* +* See what data changed inside subroutines. 
+* + ISAME( 1 ) = SIDES.EQ.SIDE + ISAME( 2 ) = UPLOS.EQ.UPLO + ISAME( 3 ) = TRANAS.EQ.TRANSA + ISAME( 4 ) = DIAGS.EQ.DIAG + ISAME( 5 ) = MS.EQ.M + ISAME( 6 ) = NS.EQ.N + ISAME( 7 ) = ALS.EQ.ALPHA + ISAME( 8 ) = LSE( AS, AA, LAA ) + ISAME( 9 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 10 ) = LSE( BS, BB, LBB ) + ELSE + ISAME( 10 ) = LSERES( 'GE', ' ', M, N, BS, + $ BB, LDB ) + END IF + ISAME( 11 ) = LDBS.EQ.LDB +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 50 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I+1 + 50 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 150 + END IF +* + IF( .NOT.NULL )THEN + IF( SNAME( 10: 11 ).EQ.'mm' )THEN +* +* Check the result. +* + IF( LEFT )THEN + CALL SMMCH( TRANSA, 'N', M, N, M, + $ ALPHA, A, NMAX, B, NMAX, + $ ZERO, C, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL SMMCH( 'N', TRANSA, M, N, N, + $ ALPHA, B, NMAX, A, NMAX, + $ ZERO, C, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + ELSE IF( SNAME( 10: 11 ).EQ.'sm' )THEN +* +* Compute approximation to original +* matrix. +* + DO 70 J = 1, N + DO 60 I = 1, M + C( I, J ) = BB( I + ( J - 1 )* + $ LDB ) + BB( I + ( J - 1 )*LDB ) = ALPHA* + $ B( I, J ) + 60 CONTINUE + 70 CONTINUE +* + IF( LEFT )THEN + CALL SMMCH( TRANSA, 'N', M, N, M, + $ ONE, A, NMAX, C, NMAX, + $ ZERO, B, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .FALSE. ) + ELSE + CALL SMMCH( 'N', TRANSA, M, N, N, + $ ONE, C, NMAX, A, NMAX, + $ ZERO, B, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .FALSE. ) + END IF + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 150 + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* + 140 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 160 +* + 150 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + CALL SPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG, + $ M, N, ALPHA, LDA, LDB) +* + 160 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A12,'(', 4( '''', A1, ''',' ), 2( I3, ',' ), + $ F4.1, ', A,', I3, ', B,', I3, ') .' ) + 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK3. 
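For the 'sm' routines SCHK3 above has no directly computable reference product, so it saves alpha*B, lets the solver overwrite B, and asks SMMCH to multiply the computed solution back by A; the comparison is against the saved right-hand side. A hedged sketch of the same round trip on a tiny lower-triangular system, assuming the standard cblas_strsm prototype from the bundled cblas.h:

    #include <stdio.h>
    #include "cblas.h"

    /* Solve A*x = b for A = [[2,0],[1,4]] (column-major, lower) and
     * b = (2,9)^T, then multiply back by A and subtract b; this mirrors
     * the residual check SCHK3 performs through SMMCH.                */
    int main(void)
    {
        float A[] = {2.0f, 1.0f, 0.0f, 4.0f};
        float B[] = {2.0f, 9.0f};                 /* overwritten with x */
        float r0, r1;

        cblas_strsm(CblasColMajor, CblasLeft, CblasLower, CblasNoTrans,
                    CblasNonUnit, 2, 1, 1.0f, A, 2, B, 2);
        printf("x = (%g, %g)\n", B[0], B[1]);     /* expect (1, 2) */

        r0 = 2.0f*B[0]              - 2.0f;
        r1 = 1.0f*B[0] + 4.0f*B[1]  - 9.0f;
        printf("residual = (%g, %g)\n", r0, r1);
        return 0;
    }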
+* + END +* + SUBROUTINE SPRCN3(NOUT, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, + $ DIAG, M, N, ALPHA, LDA, LDB) + INTEGER NOUT, NC, IORDER, M, N, LDA, LDB + REAL ALPHA + CHARACTER*1 SIDE, UPLO, TRANSA, DIAG + CHARACTER*12 SNAME + CHARACTER*14 CRC, CS, CU, CA, CD + + IF (SIDE.EQ.'L')THEN + CS = ' CblasLeft' + ELSE + CS = ' CblasRight' + END IF + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (DIAG.EQ.'N')THEN + CD = ' CblasNonUnit' + ELSE + CD = ' CblasUnit' + END IF + IF (IORDER.EQ.1)THEN + CRC = 'CblasRowMajor' + ELSE + CRC = 'CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU + WRITE(NOUT, FMT = 9994)CA, CD, M, N, ALPHA, LDA, LDB + + 9995 FORMAT( 1X, I6, ': ', A12,'(', A14, ',', A14, ',', A14, ',') + 9994 FORMAT( 22X, 2( A14, ',') , 2( I3, ',' ), + $ F4.1, ', A,', I3, ', B,', I3, ').' ) + END +* + SUBROUTINE SCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, + $ IORDER ) +* +* Tests SSYRK. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + REAL ZERO + PARAMETER ( ZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ), G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + REAL ALPHA, ALS, BETA, BETS, ERR, ERRMAX + INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, K, KS, + $ LAA, LCC, LDA, LDAS, LDC, LDCS, LJ, MA, N, NA, + $ NARGS, NC, NS + LOGICAL NULL, RESET, SAME, TRAN, UPPER + CHARACTER*1 TRANS, TRANSS, UPLO, UPLOS + CHARACTER*2 ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. + EXTERNAL SMAKE, SMMCH, CSSYRK +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICHT/'NTC'/, ICHU/'UL'/ +* .. Executable Statements .. +* + NARGS = 10 + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = N + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 100 + LCC = LDC*N + NULL = N.LE.0 +* + DO 90 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 80 ICT = 1, 3 + TRANS = ICHT( ICT: ICT ) + TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' + IF( TRAN )THEN + MA = K + NA = N + ELSE + MA = N + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* +* Generate the matrix A. 
+* + CALL SMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 70 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + UPPER = UPLO.EQ.'U' +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL SMAKE( 'SY', UPLO, ' ', N, N, C, NMAX, CC, + $ LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + BETS = BETA + DO 20 I = 1, LCC + CS( I ) = CC( I ) + 20 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ CALL SPRCN4( NTRA, NC, SNAME, IORDER, UPLO, + $ TRANS, N, K, ALPHA, LDA, BETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CSSYRK( IORDER, UPLO, TRANS, N, K, ALPHA, + $ AA, LDA, BETA, CC, LDC ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLOS.EQ.UPLO + ISAME( 2 ) = TRANSS.EQ.TRANS + ISAME( 3 ) = NS.EQ.N + ISAME( 4 ) = KS.EQ.K + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LSE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = BETS.EQ.BETA + IF( NULL )THEN + ISAME( 9 ) = LSE( CS, CC, LCC ) + ELSE + ISAME( 9 ) = LSERES( 'SY', UPLO, N, N, CS, + $ CC, LDC ) + END IF + ISAME( 10 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 30 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I+1 + 30 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + JC = 1 + DO 40 J = 1, N + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + IF( TRAN )THEN + CALL SMMCH( 'T', 'N', LJ, 1, K, ALPHA, + $ A( 1, JJ ), NMAX, + $ A( 1, J ), NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL SMMCH( 'N', 'T', LJ, 1, K, ALPHA, + $ A( JJ, 1 ), NMAX, + $ A( J, 1 ), NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + IF( UPPER )THEN + JC = JC + LDC + ELSE + JC = JC + LDC + 1 + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 110 + 40 CONTINUE + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 110 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9995 )J +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + CALL SPRCN4( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, ALPHA, + $ LDA, BETA, LDC) +* + 130 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ') .' ) + 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK4. +* + END +* + SUBROUTINE SPRCN4(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, + $ N, K, ALPHA, LDA, BETA, LDC) + INTEGER NOUT, NC, IORDER, N, K, LDA, LDC + REAL ALPHA, BETA + CHARACTER*1 UPLO, TRANSA + CHARACTER*12 SNAME + CHARACTER*14 CRC, CU, CA + + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA + WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') ) + 9994 FORMAT( 20X, 2( I3, ',' ), + $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ').' ) + END +* + SUBROUTINE SCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, + $ IORDER ) +* +* Tests SSYR2K. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + REAL ZERO + PARAMETER ( ZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + REAL AA( NMAX*NMAX ), AB( 2*NMAX*NMAX ), + $ ALF( NALF ), AS( NMAX*NMAX ), BB( NMAX*NMAX ), + $ BET( NBET ), BS( NMAX*NMAX ), C( NMAX, NMAX ), + $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), + $ G( NMAX ), W( 2*NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. 
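SCHK4 above checks the SSYRK result column by column because only the triangle selected by UPLO is updated; LSERES then confirms that the opposite triangle still carries SMAKE's rogue fill. A hedged standalone sketch, assuming the standard cblas_ssyrk prototype from the bundled cblas.h:

    #include <stdio.h>
    #include "cblas.h"

    /* Upper-triangle update C = A*A^T with row-major A = [[1,2],[3,4]]:
     * c11 = 5, c12 = 11, c22 = 25, while the strictly lower element of
     * C must keep its initial fill value of -1.                        */
    int main(void)
    {
        float A[] = {1.0f, 2.0f, 3.0f, 4.0f};
        float C[] = {-1.0f, -1.0f, -1.0f, -1.0f};

        cblas_ssyrk(CblasRowMajor, CblasUpper, CblasNoTrans,
                    2, 2, 1.0f, A, 2, 0.0f, C, 2);
        printf("C = %g %g\n    %g %g\n", C[0], C[1], C[2], C[3]);
        return 0;
    }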
+ REAL ALPHA, ALS, BETA, BETS, ERR, ERRMAX + INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, JJAB, + $ K, KS, LAA, LBB, LCC, LDA, LDAS, LDB, LDBS, + $ LDC, LDCS, LJ, MA, N, NA, NARGS, NC, NS + LOGICAL NULL, RESET, SAME, TRAN, UPPER + CHARACTER*1 TRANS, TRANSS, UPLO, UPLOS + CHARACTER*2 ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. + EXTERNAL SMAKE, SMMCH, CSSYR2K +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICHT/'NTC'/, ICHU/'UL'/ +* .. Executable Statements .. +* + NARGS = 12 + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 130 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = N + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 130 + LCC = LDC*N + NULL = N.LE.0 +* + DO 120 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 110 ICT = 1, 3 + TRANS = ICHT( ICT: ICT ) + TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' + IF( TRAN )THEN + MA = K + NA = N + ELSE + MA = N + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 110 + LAA = LDA*NA +* +* Generate the matrix A. +* + IF( TRAN )THEN + CALL SMAKE( 'GE', ' ', ' ', MA, NA, AB, 2*NMAX, AA, + $ LDA, RESET, ZERO ) + ELSE + CALL SMAKE( 'GE', ' ', ' ', MA, NA, AB, NMAX, AA, LDA, + $ RESET, ZERO ) + END IF +* +* Generate the matrix B. +* + LDB = LDA + LBB = LAA + IF( TRAN )THEN + CALL SMAKE( 'GE', ' ', ' ', MA, NA, AB( K + 1 ), + $ 2*NMAX, BB, LDB, RESET, ZERO ) + ELSE + CALL SMAKE( 'GE', ' ', ' ', MA, NA, AB( K*NMAX + 1 ), + $ NMAX, BB, LDB, RESET, ZERO ) + END IF +* + DO 100 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + UPPER = UPLO.EQ.'U' +* + DO 90 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 80 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL SMAKE( 'SY', UPLO, ' ', N, N, C, NMAX, CC, + $ LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BETS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ CALL SPRCN5( NTRA, NC, SNAME, IORDER, UPLO, + $ TRANS, N, K, ALPHA, LDA, LDB, BETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CSSYR2K( IORDER, UPLO, TRANS, N, K, ALPHA, + $ AA, LDA, BB, LDB, BETA, CC, LDC ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 150 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLOS.EQ.UPLO + ISAME( 2 ) = TRANSS.EQ.TRANS + ISAME( 3 ) = NS.EQ.N + ISAME( 4 ) = KS.EQ.K + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LSE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = LSE( BS, BB, LBB ) + ISAME( 9 ) = LDBS.EQ.LDB + ISAME( 10 ) = BETS.EQ.BETA + IF( NULL )THEN + ISAME( 11 ) = LSE( CS, CC, LCC ) + ELSE + ISAME( 11 ) = LSERES( 'SY', UPLO, N, N, CS, + $ CC, LDC ) + END IF + ISAME( 12 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. 
+ DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I+1 + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 150 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + JJAB = 1 + JC = 1 + DO 70 J = 1, N + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + IF( TRAN )THEN + DO 50 I = 1, K + W( I ) = AB( ( J - 1 )*2*NMAX + K + + $ I ) + W( K + I ) = AB( ( J - 1 )*2*NMAX + + $ I ) + 50 CONTINUE + CALL SMMCH( 'T', 'N', LJ, 1, 2*K, + $ ALPHA, AB( JJAB ), 2*NMAX, + $ W, 2*NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + DO 60 I = 1, K + W( I ) = AB( ( K + I - 1 )*NMAX + + $ J ) + W( K + I ) = AB( ( I - 1 )*NMAX + + $ J ) + 60 CONTINUE + CALL SMMCH( 'N', 'N', LJ, 1, 2*K, + $ ALPHA, AB( JJ ), NMAX, W, + $ 2*NMAX, BETA, C( JJ, J ), + $ NMAX, CT, G, CC( JC ), LDC, + $ EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + END IF + IF( UPPER )THEN + JC = JC + LDC + ELSE + JC = JC + LDC + 1 + IF( TRAN ) + $ JJAB = JJAB + 2*NMAX + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 140 + 70 CONTINUE + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 160 +* + 140 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9995 )J +* + 150 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + CALL SPRCN5( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, ALPHA, + $ LDA, LDB, BETA, LDC) +* + 160 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ F4.1, ', A,', I3, ', B,', I3, ',', F4.1, ', C,', I3, ') ', + $ ' .' ) + 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK5. 
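The scratch vector W assembled above is what lets SCHK5 reuse the GEMM checker SMMCH for SSYR2K. Writing M = [ A  B ] for the N by 2K matrix held in AB and, for column j, w = (B(j,1:K), A(j,1:K))^T, every entry of the update satisfies

\[
\bigl(\alpha\,(A B^{T} + B A^{T}) + \beta C\bigr)_{ij}
   \;=\; \alpha \sum_{k=1}^{K} \bigl( A_{ik} B_{jk} + B_{ik} A_{jk} \bigr) + \beta C_{ij}
   \;=\; \alpha \sum_{l=1}^{2K} M_{il}\, w_{l} + \beta C_{ij},
\]

so a single SMMCH call on M and W reproduces column j of the reference result; the transposed case builds w from column j of the stacked AB array in the same way.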
+* + END +* + SUBROUTINE SPRCN5(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, + $ N, K, ALPHA, LDA, LDB, BETA, LDC) + INTEGER NOUT, NC, IORDER, N, K, LDA, LDB, LDC + REAL ALPHA, BETA + CHARACTER*1 UPLO, TRANSA + CHARACTER*12 SNAME + CHARACTER*14 CRC, CU, CA + + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA + WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, LDB, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') ) + 9994 FORMAT( 20X, 2( I3, ',' ), + $ F4.1, ', A,', I3, ', B', I3, ',', F4.1, ', C,', I3, ').' ) + END +* + SUBROUTINE SMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, RESET, + $ TRANSL ) +* +* Generates values for an M by N matrix A. +* Stores the values in the array AA in the data structure required +* by the routine, with unwanted elements set to rogue value. +* +* TYPE is 'GE', 'SY' or 'TR'. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0, ONE = 1.0 ) + REAL ROGUE + PARAMETER ( ROGUE = -1.0E10 ) +* .. Scalar Arguments .. + REAL TRANSL + INTEGER LDA, M, N, NMAX + LOGICAL RESET + CHARACTER*1 DIAG, UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + REAL A( NMAX, * ), AA( * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J + LOGICAL GEN, LOWER, SYM, TRI, UNIT, UPPER +* .. External Functions .. + REAL SBEG + EXTERNAL SBEG +* .. Executable Statements .. + GEN = TYPE.EQ.'GE' + SYM = TYPE.EQ.'SY' + TRI = TYPE.EQ.'TR' + UPPER = ( SYM.OR.TRI ).AND.UPLO.EQ.'U' + LOWER = ( SYM.OR.TRI ).AND.UPLO.EQ.'L' + UNIT = TRI.AND.DIAG.EQ.'U' +* +* Generate data in array A. +* + DO 20 J = 1, N + DO 10 I = 1, M + IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) + $ THEN + A( I, J ) = SBEG( RESET ) + TRANSL + IF( I.NE.J )THEN +* Set some elements to zero + IF( N.GT.3.AND.J.EQ.N/2 ) + $ A( I, J ) = ZERO + IF( SYM )THEN + A( J, I ) = A( I, J ) + ELSE IF( TRI )THEN + A( J, I ) = ZERO + END IF + END IF + END IF + 10 CONTINUE + IF( TRI ) + $ A( J, J ) = A( J, J ) + ONE + IF( UNIT ) + $ A( J, J ) = ONE + 20 CONTINUE +* +* Store elements in array AS in data structure required by routine. +* + IF( TYPE.EQ.'GE' )THEN + DO 50 J = 1, N + DO 30 I = 1, M + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 30 CONTINUE + DO 40 I = M + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 40 CONTINUE + 50 CONTINUE + ELSE IF( TYPE.EQ.'SY'.OR.TYPE.EQ.'TR' )THEN + DO 90 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IF( UNIT )THEN + IEND = J - 1 + ELSE + IEND = J + END IF + ELSE + IF( UNIT )THEN + IBEG = J + 1 + ELSE + IBEG = J + END IF + IEND = N + END IF + DO 60 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 60 CONTINUE + DO 70 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 70 CONTINUE + DO 80 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 80 CONTINUE + 90 CONTINUE + END IF + RETURN +* +* End of SMAKE. +* + END + SUBROUTINE SMMCH( TRANSA, TRANSB, M, N, KK, ALPHA, A, LDA, B, LDB, + $ BETA, C, LDC, CT, G, CC, LDCC, EPS, ERR, FATAL, + $ NOUT, MV ) +* +* Checks the results of the computational tests. 
+* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0, ONE = 1.0 ) +* .. Scalar Arguments .. + REAL ALPHA, BETA, EPS, ERR + INTEGER KK, LDA, LDB, LDC, LDCC, M, N, NOUT + LOGICAL FATAL, MV + CHARACTER*1 TRANSA, TRANSB +* .. Array Arguments .. + REAL A( LDA, * ), B( LDB, * ), C( LDC, * ), + $ CC( LDCC, * ), CT( * ), G( * ) +* .. Local Scalars .. + REAL ERRI + INTEGER I, J, K + LOGICAL TRANA, TRANB +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, SQRT +* .. Executable Statements .. + TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' + TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' +* +* Compute expected result, one column at a time, in CT using data +* in A, B and C. +* Compute gauges in G. +* + DO 120 J = 1, N +* + DO 10 I = 1, M + CT( I ) = ZERO + G( I ) = ZERO + 10 CONTINUE + IF( .NOT.TRANA.AND..NOT.TRANB )THEN + DO 30 K = 1, KK + DO 20 I = 1, M + CT( I ) = CT( I ) + A( I, K )*B( K, J ) + G( I ) = G( I ) + ABS( A( I, K ) )*ABS( B( K, J ) ) + 20 CONTINUE + 30 CONTINUE + ELSE IF( TRANA.AND..NOT.TRANB )THEN + DO 50 K = 1, KK + DO 40 I = 1, M + CT( I ) = CT( I ) + A( K, I )*B( K, J ) + G( I ) = G( I ) + ABS( A( K, I ) )*ABS( B( K, J ) ) + 40 CONTINUE + 50 CONTINUE + ELSE IF( .NOT.TRANA.AND.TRANB )THEN + DO 70 K = 1, KK + DO 60 I = 1, M + CT( I ) = CT( I ) + A( I, K )*B( J, K ) + G( I ) = G( I ) + ABS( A( I, K ) )*ABS( B( J, K ) ) + 60 CONTINUE + 70 CONTINUE + ELSE IF( TRANA.AND.TRANB )THEN + DO 90 K = 1, KK + DO 80 I = 1, M + CT( I ) = CT( I ) + A( K, I )*B( J, K ) + G( I ) = G( I ) + ABS( A( K, I ) )*ABS( B( J, K ) ) + 80 CONTINUE + 90 CONTINUE + END IF + DO 100 I = 1, M + CT( I ) = ALPHA*CT( I ) + BETA*C( I, J ) + G( I ) = ABS( ALPHA )*G( I ) + ABS( BETA )*ABS( C( I, J ) ) + 100 CONTINUE +* +* Compute the error ratio for this result. +* + ERR = ZERO + DO 110 I = 1, M + ERRI = ABS( CT( I ) - CC( I, J ) )/EPS + IF( G( I ).NE.ZERO ) + $ ERRI = ERRI/G( I ) + ERR = MAX( ERR, ERRI ) + IF( ERR*SQRT( EPS ).GE.ONE ) + $ GO TO 130 + 110 CONTINUE +* + 120 CONTINUE +* +* If the loop completes, all results are at least half accurate. + GO TO 150 +* +* Report fatal error. +* + 130 FATAL = .TRUE. + WRITE( NOUT, FMT = 9999 ) + DO 140 I = 1, M + IF( MV )THEN + WRITE( NOUT, FMT = 9998 )I, CT( I ), CC( I, J ) + ELSE + WRITE( NOUT, FMT = 9998 )I, CC( I, J ), CT( I ) + END IF + 140 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9997 )J +* + 150 CONTINUE + RETURN +* + 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', + $ 'F ACCURATE *******', /' EXPECTED RESULT COMPU', + $ 'TED RESULT' ) + 9998 FORMAT( 1X, I7, 2G18.6 ) + 9997 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) +* +* End of SMMCH. +* + END + LOGICAL FUNCTION LSE( RI, RJ, LR ) +* +* Tests if two arrays are identical. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER LR +* .. Array Arguments .. + REAL RI( * ), RJ( * ) +* .. Local Scalars .. + INTEGER I +* .. Executable Statements .. + DO 10 I = 1, LR + IF( RI( I ).NE.RJ( I ) ) + $ GO TO 20 + 10 CONTINUE + LSE = .TRUE. + GO TO 30 + 20 CONTINUE + LSE = .FALSE. 
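SMMCH above accumulates the expected entry CT(I) together with a gauge G(I) of the magnitudes that fed into it, divides the difference from the computed entry by EPS and by the gauge, and declares the call fatal once ERR*SQRT(EPS) reaches one, i.e. when fewer than half of the significant digits can be trusted; the caller separately compares ERRMAX against THRESH. A hedged C restatement of that per-column measure (the names are illustrative, not part of the tester):

    #include <math.h>
    #include <stdio.h>

    /* ct: expected values, cc: computed values, g: gauges of the form
     * |alpha|*sum|a||b| + |beta||c|.  Returns the largest ratio
     * |ct - cc| / (eps*g); the caller treats ratio*sqrt(eps) >= 1 as a
     * fatal loss of accuracy and compares the ratio against THRESH.   */
    static double column_error_ratio(const float *ct, const float *cc,
                                     const float *g, int m, float eps)
    {
        double err = 0.0;
        for (int i = 0; i < m; i++) {
            double erri = fabs((double)ct[i] - (double)cc[i]) / eps;
            if (g[i] != 0.0f)
                erri /= g[i];
            if (erri > err)
                err = erri;
        }
        return err;
    }

    int main(void)
    {
        float ct[] = {1.0f, 2.0f}, cc[] = {1.0f, 2.0000002f};
        float g[]  = {1.0f, 2.0f}, eps  = 1.1920929e-7f;   /* 2^-23 */
        printf("error ratio = %g\n", column_error_ratio(ct, cc, g, 2, eps));
        return 0;
    }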
+ 30 RETURN +* +* End of LSE. +* + END + LOGICAL FUNCTION LSERES( TYPE, UPLO, M, N, AA, AS, LDA ) +* +* Tests if selected elements in two arrays are equal. +* +* TYPE is 'GE' or 'SY'. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER LDA, M, N + CHARACTER*1 UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + REAL AA( LDA, * ), AS( LDA, * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J + LOGICAL UPPER +* .. Executable Statements .. + UPPER = UPLO.EQ.'U' + IF( TYPE.EQ.'GE' )THEN + DO 20 J = 1, N + DO 10 I = M + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 10 CONTINUE + 20 CONTINUE + ELSE IF( TYPE.EQ.'SY' )THEN + DO 50 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 30 I = 1, IBEG - 1 + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 30 CONTINUE + DO 40 I = IEND + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 40 CONTINUE + 50 CONTINUE + END IF +* + 60 CONTINUE + LSERES = .TRUE. + GO TO 80 + 70 CONTINUE + LSERES = .FALSE. + 80 RETURN +* +* End of LSERES. +* + END + REAL FUNCTION SBEG( RESET ) +* +* Generates random numbers uniformly distributed between -0.5 and 0.5. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + LOGICAL RESET +* .. Local Scalars .. + INTEGER I, IC, MI +* .. Save statement .. + SAVE I, IC, MI +* .. Executable Statements .. + IF( RESET )THEN +* Initialize local variables. + MI = 891 + I = 7 + IC = 0 + RESET = .FALSE. + END IF +* +* The sequence of values of I is bounded between 1 and 999. +* If initial I = 1,2,3,6,7 or 9, the period will be 50. +* If initial I = 4 or 8, the period will be 25. +* If initial I = 5, the period will be 10. +* IC is used to break up the period by skipping 1 value of I in 6. +* + IC = IC + 1 + 10 I = I*MI + I = I - 1000*( I/1000 ) + IF( IC.GE.5 )THEN + IC = 0 + GO TO 10 + END IF + SBEG = ( I - 500 )/1001.0 + RETURN +* +* End of SBEG. +* + END + REAL FUNCTION SDIFF( X, Y ) +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + REAL X, Y +* .. Executable Statements .. + SDIFF = X - Y + RETURN +* +* End of SDIFF. +* + END diff --git a/ctest/c_xerbla.c b/ctest/c_xerbla.c new file mode 100644 index 0000000000..3402460cd4 --- /dev/null +++ b/ctest/c_xerbla.c @@ -0,0 +1,137 @@ +#include +#include +#include +#include +#include "common.h" +#include "cblas_test.h" + +void cblas_xerbla(blasint info, char *rout, char *form, ...) +{ + extern int cblas_lerr, cblas_info, cblas_ok; + extern int link_xerbla; + extern int RowMajorStrg; + extern char *cblas_rout; + + /* Initially, c__3chke will call this routine with + * global variable link_xerbla=1, and F77_xerbla will set link_xerbla=0. + * This is done to fool the linker into loading these subroutines first + * instead of ones in the CBLAS or the legacy BLAS library. 
+ */ + if (link_xerbla) return; + + if (cblas_rout != NULL && strcmp(cblas_rout, rout) != 0){ + printf("***** XERBLA WAS CALLED WITH SRNAME = <%s> INSTEAD OF <%s> *******\n", rout, cblas_rout); + cblas_ok = FALSE; + } + + if (RowMajorStrg) + { + /* To properly check leading dimension problems in cblas__gemm, we + * need to do the following trick. When cblas__gemm is called with + * CblasRowMajor, the arguments A and B switch places in the call to + * f77__gemm. Thus when we test for bad leading dimension problems + * for A and B, lda is in position 11 instead of 9, and ldb is in + * position 9 instead of 11. + */ + if (strstr(rout,"gemm") != 0) + { + if (info == 5 ) info = 4; + else if (info == 4 ) info = 5; + else if (info == 11) info = 9; + else if (info == 9 ) info = 11; + } + else if (strstr(rout,"symm") != 0 || strstr(rout,"hemm") != 0) + { + if (info == 5 ) info = 4; + else if (info == 4 ) info = 5; + } + else if (strstr(rout,"trmm") != 0 || strstr(rout,"trsm") != 0) + { + if (info == 7 ) info = 6; + else if (info == 6 ) info = 7; + } + else if (strstr(rout,"gemv") != 0) + { + if (info == 4) info = 3; + else if (info == 3) info = 4; + } + else if (strstr(rout,"gbmv") != 0) + { + if (info == 4) info = 3; + else if (info == 3) info = 4; + else if (info == 6) info = 5; + else if (info == 5) info = 6; + } + else if (strstr(rout,"ger") != 0) + { + if (info == 3) info = 2; + else if (info == 2) info = 3; + else if (info == 8) info = 6; + else if (info == 6) info = 8; + } + else if ( ( strstr(rout,"her2") != 0 || strstr(rout,"hpr2") != 0 ) + && strstr(rout,"her2k") == 0 ) + { + if (info == 8) info = 6; + else if (info == 6) info = 8; + } + } + + if (info != cblas_info){ + printf("***** XERBLA WAS CALLED WITH INFO = %d INSTEAD OF %d in %s *******\n",info, cblas_info, rout); + cblas_lerr = PASSED; + cblas_ok = FALSE; + } else cblas_lerr = FAILED; +} + +#ifdef F77_Char +void F77_xerbla(F77_Char F77_srname, void *vinfo) +#else +void F77_xerbla(char *srname, void *vinfo) +#endif +{ +#ifdef F77_Char + char *srname; +#endif + + char rout[] = {'c','b','l','a','s','_','\0','\0','\0','\0','\0','\0','\0'}; + +#ifdef F77_Integer + F77_Integer *info=vinfo; + F77_Integer i; + extern F77_Integer link_xerbla; +#else + int *info=vinfo; + int i; + extern int link_xerbla; +#endif +#ifdef F77_Char + srname = F2C_STR(F77_srname, XerblaStrLen); +#endif + + /* See the comment in cblas_xerbla() above */ + if (link_xerbla) + { + link_xerbla = 0; + return; + } + for(i=0; i < 6; i++) rout[i+6] = tolower(srname[i]); + for(i=11; i >= 9; i--) if (rout[i] == ' ') rout[i] = '\0'; + + /* We increment *info by 1 since the CBLAS interface adds one more + * argument to all level 2 and 3 routines. 
+ */ + cblas_xerbla(*info+1,rout,""); +} + +#ifdef USE64BITINT +#undef int +#endif + +int BLASFUNC(xerbla)(char *name, blasint *info, blasint length) { + + F77_xerbla(name, info); + +}; + + diff --git a/ctest/c_z2chke.c b/ctest/c_z2chke.c new file mode 100644 index 0000000000..ac6097153d --- /dev/null +++ b/ctest/c_z2chke.c @@ -0,0 +1,826 @@ +#include +#include +#include "common.h" +#include "cblas_test.h" + +int cblas_ok, cblas_lerr, cblas_info; +int link_xerbla=TRUE; +char *cblas_rout; + +#ifdef F77_Char +void F77_xerbla(F77_Char F77_srname, void *vinfo); +#else +void F77_xerbla(char *srname, void *vinfo); +#endif + +void chkxer(void) { + extern int cblas_ok, cblas_lerr, cblas_info; + extern int link_xerbla; + extern char *cblas_rout; + if (cblas_lerr == 1 ) { + printf("***** ILLEGAL VALUE OF PARAMETER NUMBER %d NOT DETECTED BY %s *****\n", cblas_info, cblas_rout); + cblas_ok = 0 ; + } + cblas_lerr = 1 ; +} + +void F77_z2chke(char *rout) { + char *sf = ( rout ) ; + double A[2] = {0.0,0.0}, + X[2] = {0.0,0.0}, + Y[2] = {0.0,0.0}, + ALPHA[2] = {0.0,0.0}, + BETA[2] = {0.0,0.0}, + RALPHA = 0.0; + extern int cblas_info, cblas_lerr, cblas_ok; + extern int RowMajorStrg; + extern char *cblas_rout; + + if (link_xerbla) /* call these first to link */ + { + cblas_xerbla(cblas_info,cblas_rout,""); + F77_xerbla(cblas_rout,&cblas_info); + } + + cblas_ok = TRUE ; + cblas_lerr = PASSED ; + + if (strncmp( sf,"cblas_zgemv",11)==0) { + cblas_rout = "cblas_zgemv"; + cblas_info = 1; + cblas_zgemv(INVALID, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zgemv(CblasColMajor, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zgemv(CblasColMajor, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zgemv(CblasColMajor, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_zgemv(CblasColMajor, CblasNoTrans, 2, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_zgemv(CblasColMajor, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_zgemv(CblasColMajor, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + + cblas_info = 2; RowMajorStrg = TRUE; RowMajorStrg = TRUE; + cblas_zgemv(CblasRowMajor, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_zgemv(CblasRowMajor, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zgemv(CblasRowMajor, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_zgemv(CblasRowMajor, CblasNoTrans, 0, 2, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_zgemv(CblasRowMajor, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_zgemv(CblasRowMajor, CblasNoTrans, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_zgbmv",11)==0) { + cblas_rout = "cblas_zgbmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_zgbmv(INVALID, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zgbmv(CblasColMajor, INVALID, 0, 0, 0, 0, + 
ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zgbmv(CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zgbmv(CblasColMajor, CblasNoTrans, 0, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zgbmv(CblasColMajor, CblasNoTrans, 0, 0, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zgbmv(CblasColMajor, CblasNoTrans, 2, 0, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_zgbmv(CblasColMajor, CblasNoTrans, 0, 0, 1, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_zgbmv(CblasColMajor, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_zgbmv(CblasRowMajor, INVALID, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_zgbmv(CblasRowMajor, CblasNoTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zgbmv(CblasRowMajor, CblasNoTrans, 0, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zgbmv(CblasRowMajor, CblasNoTrans, 0, 0, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_zgbmv(CblasRowMajor, CblasNoTrans, 2, 0, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_zgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 1, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_zgbmv(CblasRowMajor, CblasNoTrans, 0, 0, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_zhemv",11)==0) { + cblas_rout = "cblas_zhemv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_zhemv(INVALID, CblasUpper, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zhemv(CblasColMajor, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zhemv(CblasColMajor, CblasUpper, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zhemv(CblasColMajor, CblasUpper, 2, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zhemv(CblasColMajor, CblasUpper, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zhemv(CblasColMajor, CblasUpper, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_zhemv(CblasRowMajor, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_zhemv(CblasRowMajor, CblasUpper, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_zhemv(CblasRowMajor, CblasUpper, 2, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zhemv(CblasRowMajor, CblasUpper, 0, + 
ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zhemv(CblasRowMajor, CblasUpper, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_zhbmv",11)==0) { + cblas_rout = "cblas_zhbmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_zhbmv(INVALID, CblasUpper, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zhbmv(CblasColMajor, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zhbmv(CblasColMajor, CblasUpper, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zhbmv(CblasColMajor, CblasUpper, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_zhbmv(CblasColMajor, CblasUpper, 0, 1, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_zhbmv(CblasColMajor, CblasUpper, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_zhbmv(CblasColMajor, CblasUpper, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_zhbmv(CblasRowMajor, INVALID, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_zhbmv(CblasRowMajor, CblasUpper, INVALID, 0, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zhbmv(CblasRowMajor, CblasUpper, 0, INVALID, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_zhbmv(CblasRowMajor, CblasUpper, 0, 1, + ALPHA, A, 1, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_zhbmv(CblasRowMajor, CblasUpper, 0, 0, + ALPHA, A, 1, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_zhbmv(CblasRowMajor, CblasUpper, 0, 0, + ALPHA, A, 1, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_zhpmv",11)==0) { + cblas_rout = "cblas_zhpmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_zhpmv(INVALID, CblasUpper, 0, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zhpmv(CblasColMajor, INVALID, 0, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zhpmv(CblasColMajor, CblasUpper, INVALID, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_zhpmv(CblasColMajor, CblasUpper, 0, + ALPHA, A, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zhpmv(CblasColMajor, CblasUpper, 0, + ALPHA, A, X, 1, BETA, Y, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_zhpmv(CblasRowMajor, INVALID, 0, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_zhpmv(CblasRowMajor, CblasUpper, INVALID, + ALPHA, A, X, 1, BETA, Y, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_zhpmv(CblasRowMajor, CblasUpper, 0, + ALPHA, A, X, 0, BETA, Y, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zhpmv(CblasRowMajor, CblasUpper, 0, + ALPHA, A, X, 1, BETA, Y, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_ztrmv",11)==0) { + cblas_rout = "cblas_ztrmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_ztrmv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ztrmv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 
0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ztrmv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ztrmv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ztrmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, A, 1, X, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_ztrmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_ztrmv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_ztrmv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_ztrmv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_ztrmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, A, 1, X, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_ztrmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_ztbmv",11)==0) { + cblas_rout = "cblas_ztbmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_ztbmv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ztbmv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ztbmv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ztbmv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ztbmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztbmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ztbmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 1, A, 1, X, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztbmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_ztbmv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_ztbmv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_ztbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_ztbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ztbmv(CblasRowMajor, 
CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 1, A, 1, X, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztbmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_ztpmv",11)==0) { + cblas_rout = "cblas_ztpmv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_ztpmv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ztpmv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ztpmv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ztpmv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ztpmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ztpmv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_ztpmv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_ztpmv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_ztpmv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_ztpmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ztpmv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_ztrsv",11)==0) { + cblas_rout = "cblas_ztrsv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_ztrsv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ztrsv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ztrsv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ztrsv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ztrsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, A, 1, X, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_ztrsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_ztrsv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_ztrsv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_ztrsv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_ztrsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, A, 1, X, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + 
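+      /* Every check in these error-exit testers follows the same pattern:
+       * cblas_info is set to the CBLAS argument number that must be
+       * flagged, RowMajorStrg tells cblas_xerbla() whether to undo the
+       * row-major position swap, the routine is called with exactly that
+       * argument invalid, and chkxer() fails the test if the expected
+       * parameter was not reported. */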
cblas_ztrsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, 1, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_ztbsv",11)==0) { + cblas_rout = "cblas_ztbsv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_ztbsv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ztbsv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ztbsv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ztbsv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ztbsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztbsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ztbsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 1, A, 1, X, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztbsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_ztbsv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_ztbsv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_ztbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + INVALID, 0, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_ztbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, A, 1, X, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, A, 1, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ztbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 1, A, 1, X, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztbsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, A, 1, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_ztpsv",11)==0) { + cblas_rout = "cblas_ztpsv"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_ztpsv(INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ztpsv(CblasColMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ztpsv(CblasColMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ztpsv(CblasColMajor, CblasUpper, CblasNoTrans, + INVALID, 0, A, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ztpsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_ztpsv(CblasColMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 0 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_ztpsv(CblasRowMajor, INVALID, CblasNoTrans, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_ztpsv(CblasRowMajor, CblasUpper, INVALID, + CblasNonUnit, 0, A, X, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_ztpsv(CblasRowMajor, CblasUpper, CblasNoTrans, 
+ INVALID, 0, A, X, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_ztpsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, A, X, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_ztpsv(CblasRowMajor, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, A, X, 0 ); + chkxer(); + } else if (strncmp( sf,"cblas_zgeru",10)==0) { + cblas_rout = "cblas_zgeru"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_zgeru(INVALID, 0, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zgeru(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zgeru(CblasColMajor, 0, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zgeru(CblasColMajor, 0, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zgeru(CblasColMajor, 0, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zgeru(CblasColMajor, 2, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_zgeru(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_zgeru(CblasRowMajor, 0, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_zgeru(CblasRowMajor, 0, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zgeru(CblasRowMajor, 0, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zgeru(CblasRowMajor, 0, 2, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + } else if (strncmp( sf,"cblas_zgerc",10)==0) { + cblas_rout = "cblas_zgerc"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_zgerc(INVALID, 0, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zgerc(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zgerc(CblasColMajor, 0, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zgerc(CblasColMajor, 0, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zgerc(CblasColMajor, 0, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zgerc(CblasColMajor, 2, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_zgerc(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_zgerc(CblasRowMajor, 0, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_zgerc(CblasRowMajor, 0, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zgerc(CblasRowMajor, 0, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zgerc(CblasRowMajor, 0, 2, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + } else if (strncmp( sf,"cblas_zher2",11)==0) { + cblas_rout = "cblas_zher2"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_zher2(INVALID, CblasUpper, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zher2(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zher2(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zher2(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, 
Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zher2(CblasColMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zher2(CblasColMajor, CblasUpper, 2, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_zher2(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_zher2(CblasRowMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_zher2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zher2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zher2(CblasRowMajor, CblasUpper, 2, ALPHA, X, 1, Y, 1, A, 1 ); + chkxer(); + } else if (strncmp( sf,"cblas_zhpr2",11)==0) { + cblas_rout = "cblas_zhpr2"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_zhpr2(INVALID, CblasUpper, 0, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zhpr2(CblasColMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zhpr2(CblasColMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zhpr2(CblasColMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zhpr2(CblasColMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_zhpr2(CblasRowMajor, INVALID, 0, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_zhpr2(CblasRowMajor, CblasUpper, INVALID, ALPHA, X, 1, Y, 1, A ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_zhpr2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 0, Y, 1, A ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zhpr2(CblasRowMajor, CblasUpper, 0, ALPHA, X, 1, Y, 0, A ); + chkxer(); + } else if (strncmp( sf,"cblas_zher",10)==0) { + cblas_rout = "cblas_zher"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_zher(INVALID, CblasUpper, 0, RALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zher(CblasColMajor, INVALID, 0, RALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zher(CblasColMajor, CblasUpper, INVALID, RALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zher(CblasColMajor, CblasUpper, 0, RALPHA, X, 0, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zher(CblasColMajor, CblasUpper, 2, RALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = TRUE; + cblas_zher(CblasRowMajor, INVALID, 0, RALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = TRUE; + cblas_zher(CblasRowMajor, CblasUpper, INVALID, RALPHA, X, 1, A, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_zher(CblasRowMajor, CblasUpper, 0, RALPHA, X, 0, A, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zher(CblasRowMajor, CblasUpper, 2, RALPHA, X, 1, A, 1 ); + chkxer(); + } else if (strncmp( sf,"cblas_zhpr",10)==0) { + cblas_rout = "cblas_zhpr"; + cblas_info = 1; RowMajorStrg = FALSE; + cblas_zhpr(INVALID, CblasUpper, 0, RALPHA, X, 1, A ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zhpr(CblasColMajor, INVALID, 0, RALPHA, X, 1, A ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zhpr(CblasColMajor, CblasUpper, INVALID, 
RALPHA, X, 1, A ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zhpr(CblasColMajor, CblasUpper, 0, RALPHA, X, 0, A ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zhpr(CblasColMajor, INVALID, 0, RALPHA, X, 1, A ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zhpr(CblasColMajor, CblasUpper, INVALID, RALPHA, X, 1, A ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zhpr(CblasColMajor, CblasUpper, 0, RALPHA, X, 0, A ); + chkxer(); + } + if (cblas_ok == TRUE) + printf(" %-12s PASSED THE TESTS OF ERROR-EXITS\n", cblas_rout); + else + printf("******* %s FAILED THE TESTS OF ERROR-EXITS *******\n",cblas_rout); +} diff --git a/ctest/c_z3chke.c b/ctest/c_z3chke.c new file mode 100644 index 0000000000..b58cb62170 --- /dev/null +++ b/ctest/c_z3chke.c @@ -0,0 +1,1706 @@ +#include +#include +#include "common.h" +#include "cblas_test.h" + +int cblas_ok, cblas_lerr, cblas_info; +int link_xerbla=TRUE; +char *cblas_rout; + +#ifdef F77_Char +void F77_xerbla(F77_Char F77_srname, void *vinfo); +#else +void F77_xerbla(char *srname, void *vinfo); +#endif + +void chkxer(void) { + extern int cblas_ok, cblas_lerr, cblas_info; + extern int link_xerbla; + extern char *cblas_rout; + if (cblas_lerr == 1 ) { + printf("***** ILLEGAL VALUE OF PARAMETER NUMBER %d NOT DETECTED BY %s *****\n", cblas_info, cblas_rout); + cblas_ok = 0 ; + } + cblas_lerr = 1 ; +} + +void F77_z3chke(char * rout) { + char *sf = ( rout ) ; + double A[4] = {0.0,0.0,0.0,0.0}, + B[4] = {0.0,0.0,0.0,0.0}, + C[4] = {0.0,0.0,0.0,0.0}, + ALPHA[2] = {0.0,0.0}, + BETA[2] = {0.0,0.0}, + RALPHA = 0.0, RBETA = 0.0; + extern int cblas_info, cblas_lerr, cblas_ok; + extern int RowMajorStrg; + extern char *cblas_rout; + + cblas_ok = TRUE ; + cblas_lerr = PASSED ; + + if (link_xerbla) /* call these first to link */ + { + cblas_xerbla(cblas_info,cblas_rout,""); + F77_xerbla(cblas_rout,&cblas_info); + } + + if (strncmp( sf,"cblas_zgemm" ,11)==0) { + cblas_rout = "cblas_zgemm" ; + + cblas_info = 1; + cblas_zgemm( INVALID, CblasNoTrans, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_zgemm( INVALID, CblasNoTrans, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_zgemm( INVALID, CblasTrans, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 1; + cblas_zgemm( INVALID, CblasTrans, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, INVALID, CblasNoTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, INVALID, CblasTrans, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, INVALID, 0, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + 
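+      /* The expected numbers count the CBLAS arguments themselves: for
+       * cblas_zgemm, Order=1, TransA=2, TransB=3, M=4, N=5, K=6, alpha=7,
+       * A=8, lda=9, B=10, ldb=11, beta=12, C=13, ldc=14, so cblas_info
+       * values of 9, 11 and 14 below exercise the three leading-dimension
+       * checks. */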
chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasNoTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = FALSE; + cblas_zgemm( CblasColMajor, CblasTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, 
BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasNoTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasTrans, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 2, 0, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 9; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasTrans, 2, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasNoTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasNoTrans, 0, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 14; RowMajorStrg = TRUE; + cblas_zgemm( CblasRowMajor, CblasTrans, CblasTrans, 0, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_zhemm" ,11)==0) { + cblas_rout = "cblas_zhemm" ; + + cblas_info = 1; + cblas_zhemm( INVALID, CblasRight, 
CblasLower, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, INVALID, CblasUpper, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasRight, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasRight, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasRight, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasRight, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasRight, CblasUpper, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasRight, CblasLower, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasRight, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zhemm( CblasColMajor, CblasRight, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasLeft, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zhemm( 
CblasRowMajor, CblasRight, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasLeft, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasRight, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasLeft, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasRight, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasLeft, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasRight, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zhemm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_zsymm" ,11)==0) { + cblas_rout = "cblas_zsymm" ; + + cblas_info = 1; + cblas_zsymm( INVALID, CblasRight, CblasLower, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, INVALID, CblasUpper, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, INVALID, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasRight, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + 
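+      /* cblas_zhemm/cblas_zsymm take no K argument, so here lda, ldb and
+       * ldc are CBLAS arguments 8, 10 and 13 (Order=1, Side=2, Uplo=3,
+       * M=4, N=5, alpha=6, A=7, lda=8, B=9, ldb=10, beta=11, C=12,
+       * ldc=13). */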
chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasRight, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasRight, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasRight, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasRight, CblasUpper, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasRight, CblasLower, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasRight, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zsymm( CblasColMajor, CblasRight, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasLeft, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasRight, CblasUpper, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasLeft, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasRight, CblasLower, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasLeft, CblasUpper, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasRight, CblasUpper, 0, INVALID, + 
ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasLeft, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasRight, CblasLower, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasLeft, CblasUpper, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasLeft, CblasLower, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasLeft, CblasUpper, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasRight, CblasUpper, 0, 2, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasLeft, CblasLower, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zsymm( CblasRowMajor, CblasRight, CblasLower, 0, 2, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_ztrmm" ,11)==0) { + cblas_rout = "cblas_ztrmm" ; + + cblas_info = 1; + cblas_ztrmm( INVALID, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, INVALID, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + INVALID, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, 
CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + 
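As a minimal illustration of how one of these checkers is exercised, the sketch below (an editor's addition, not part of the imported suite) calls F77_z3chke() directly and inspects cblas_ok. It assumes F77_z3chke links under that plain C name; in the suite itself the checker is invoked from the Fortran test drivers under ctest/, with xerbla overridden as shown earlier.

    /* Hypothetical stand-alone driver: run every bad-argument call for
     * cblas_ztrmm and report whether each error exit was taken.
     * cblas_ok and F77_z3chke() are defined in this file (c_z3chke.c). */
    extern int  cblas_ok;                 /* TRUE while no check has failed */
    extern void F77_z3chke(char *rout);   /* error-exit checker above       */

    int check_ztrmm_error_exits(void)
    {
        F77_z3chke("cblas_ztrmm");        /* runs the cblas_ztrmm block     */
        return cblas_ok;
    }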
cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrmm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, 
CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrmm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_ztrsm" ,11)==0) { + cblas_rout = "cblas_ztrsm" ; + + cblas_info = 1; + cblas_ztrsm( INVALID, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, INVALID, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, INVALID, CblasNoTrans, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, INVALID, + CblasNonUnit, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 5; 
RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + INVALID, 0, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, 
CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = FALSE; + cblas_ztrsm( CblasColMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 6; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, INVALID, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; 
RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 7; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, INVALID, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 2, 0, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 2 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasUpper, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasLeft, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 1, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasNoTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); + chkxer(); + cblas_info = 12; RowMajorStrg = TRUE; + cblas_ztrsm( CblasRowMajor, CblasRight, CblasLower, CblasTrans, + CblasNonUnit, 0, 2, ALPHA, A, 2, B, 1 ); 
+ chkxer(); + + } else if (strncmp( sf,"cblas_zherk" ,11)==0) { + cblas_rout = "cblas_zherk" ; + + cblas_info = 1; + cblas_zherk(INVALID, CblasUpper, CblasNoTrans, 0, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, INVALID, CblasNoTrans, 0, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasUpper, CblasTrans, 0, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasUpper, CblasConjTrans, INVALID, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasLower, CblasConjTrans, INVALID, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasUpper, CblasConjTrans, 0, INVALID, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasLower, CblasConjTrans, 0, INVALID, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zherk(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + RALPHA, A, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zherk(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zherk(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + RALPHA, A, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zherk(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasUpper, CblasConjTrans, 0, 2, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasLower, CblasConjTrans, 0, 2, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zherk(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zherk(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, + RALPHA, A, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zherk(CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zherk(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, + RALPHA, A, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + RALPHA, A, 
2, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasUpper, CblasConjTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + RALPHA, A, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zherk(CblasColMajor, CblasLower, CblasConjTrans, 2, 0, + RALPHA, A, 1, RBETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_zsyrk" ,11)==0) { + cblas_rout = "cblas_zsyrk" ; + + cblas_info = 1; + cblas_zsyrk(INVALID, CblasUpper, CblasNoTrans, 0, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, INVALID, CblasNoTrans, 0, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasUpper, CblasConjTrans, 0, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasUpper, CblasTrans, INVALID, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasLower, CblasTrans, INVALID, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasUpper, CblasTrans, 0, INVALID, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasLower, CblasTrans, 0, INVALID, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsyrk(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsyrk(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsyrk(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsyrk(CblasRowMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasUpper, CblasTrans, 0, 2, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasLower, CblasTrans, 0, 2, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zsyrk(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zsyrk(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; 
RowMajorStrg = TRUE; + cblas_zsyrk(CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = TRUE; + cblas_zsyrk(CblasRowMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 11; RowMajorStrg = FALSE; + cblas_zsyrk(CblasColMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 1, BETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_zher2k" ,12)==0) { + cblas_rout = "cblas_zher2k" ; + + cblas_info = 1; + cblas_zher2k(INVALID, CblasUpper, CblasNoTrans, 0, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, INVALID, CblasNoTrans, 0, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasTrans, 0, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasConjTrans, INVALID, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasLower, CblasConjTrans, INVALID, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, INVALID, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasLower, CblasConjTrans, 0, INVALID, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + ALPHA, A, 1, B, 2, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, + ALPHA, A, 1, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + ALPHA, A, 1, B, 2, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, + ALPHA, A, 1, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, 2, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + 
cblas_zher2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasLower, CblasConjTrans, 0, 2, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + ALPHA, A, 2, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, + ALPHA, A, 2, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + ALPHA, A, 2, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, + ALPHA, A, 2, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, 2, + ALPHA, A, 2, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 1, RBETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasLower, CblasConjTrans, 0, 2, + ALPHA, A, 2, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasUpper, CblasConjTrans, 2, 0, + ALPHA, A, 2, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zher2k(CblasRowMajor, CblasLower, CblasConjTrans, 2, 0, + ALPHA, A, 2, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasUpper, CblasConjTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 2, RBETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zher2k(CblasColMajor, CblasLower, CblasConjTrans, 2, 0, + ALPHA, A, 1, B, 1, RBETA, C, 1 ); + chkxer(); + + } else if (strncmp( sf,"cblas_zsyr2k" ,12)==0) { + cblas_rout = "cblas_zsyr2k" ; + + cblas_info = 1; + cblas_zsyr2k(INVALID, CblasUpper, CblasNoTrans, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 2; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, INVALID, CblasNoTrans, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 3; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasConjTrans, 0, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasTrans, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + 
cblas_zsyr2k(CblasColMajor, CblasLower, CblasNoTrans, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 4; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasLower, CblasTrans, INVALID, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasTrans, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasLower, CblasNoTrans, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 5; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasLower, CblasTrans, 0, INVALID, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + ALPHA, A, 1, B, 2, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 1, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasTrans, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 8; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasLower, CblasTrans, 0, 2, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasUpper, CblasNoTrans, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasLower, CblasNoTrans, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasTrans, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 1, BETA, C, 2 ); + chkxer(); + cblas_info = 10; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasLower, CblasTrans, 0, 2, + ALPHA, A, 2, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + 
cblas_info = 13; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = TRUE; + cblas_zsyr2k(CblasRowMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasUpper, CblasTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasLower, CblasNoTrans, 2, 0, + ALPHA, A, 2, B, 2, BETA, C, 1 ); + chkxer(); + cblas_info = 13; RowMajorStrg = FALSE; + cblas_zsyr2k(CblasColMajor, CblasLower, CblasTrans, 2, 0, + ALPHA, A, 1, B, 1, BETA, C, 1 ); + chkxer(); + + } + + if (cblas_ok == 1 ) + printf(" %-12s PASSED THE TESTS OF ERROR-EXITS\n", cblas_rout); + else + printf("***** %s FAILED THE TESTS OF ERROR-EXITS *******\n",cblas_rout); +} diff --git a/ctest/c_zblas1.c b/ctest/c_zblas1.c new file mode 100644 index 0000000000..0a36f33bde --- /dev/null +++ b/ctest/c_zblas1.c @@ -0,0 +1,75 @@ +/* + * c_zblas1.c + * + * The program is a C wrapper for zcblat1. + * + * Written by Keita Teranishi. 2/11/1998 + * + */ +#include "common.h" +#include "cblas_test.h" + +void F77_zaxpy(const int *N, const void *alpha, void *X, + const int *incX, void *Y, const int *incY) +{ + cblas_zaxpy(*N, alpha, X, *incX, Y, *incY); + return; +} + +void F77_zcopy(const int *N, void *X, const int *incX, + void *Y, const int *incY) +{ + cblas_zcopy(*N, X, *incX, Y, *incY); + return; +} + +void F77_zdotc(const int *N, const void *X, const int *incX, + const void *Y, const int *incY,void *dotc) +{ + cblas_zdotc_sub(*N, X, *incX, Y, *incY, dotc); + return; +} + +void F77_zdotu(const int *N, void *X, const int *incX, + void *Y, const int *incY,void *dotu) +{ + cblas_zdotu_sub(*N, X, *incX, Y, *incY, dotu); + return; +} + +void F77_zdscal(const int *N, const double *alpha, void *X, + const int *incX) +{ + cblas_zdscal(*N, *alpha, X, *incX); + return; +} + +void F77_zscal(const int *N, const void * *alpha, void *X, + const int *incX) +{ + cblas_zscal(*N, alpha, X, *incX); + return; +} + +void F77_zswap( const int *N, void *X, const int *incX, + void *Y, const int *incY) +{ + cblas_zswap(*N,X,*incX,Y,*incY); + return; +} + +int F77_izamax(const int *N, const void *X, const int *incX) +{ + if (*N < 1 || *incX < 1) return(0); + return(cblas_izamax(*N, X, *incX)+1); +} + +double F77_dznrm2(const int *N, const void *X, const int *incX) +{ + return cblas_dznrm2(*N, X, *incX); +} + +double F77_dzasum(const int *N, void *X, const int *incX) +{ + return cblas_dzasum(*N, X, *incX); +} diff --git a/ctest/c_zblas2.c b/ctest/c_zblas2.c new file mode 100644 index 0000000000..6291abe116 --- /dev/null +++ b/ctest/c_zblas2.c @@ -0,0 +1,807 @@ +/* + * Written by D.P. Manley, Digital Equipment Corporation. + * Prefixed "C_" to BLAS routines and their declarations. + * + * Modified by T. H. Do, 4/08/98, SGI/CRAY Research. 
+ */ +#include <stdlib.h> +#include "common.h" +#include "cblas_test.h" + +void F77_zgemv(int *order, char *transp, int *m, int *n, + const void *alpha, + CBLAS_TEST_ZOMPLEX *a, int *lda, const void *x, int *incx, + const void *beta, void *y, int *incy) { + + CBLAS_TEST_ZOMPLEX *A; + int i,j,LDA; + enum CBLAS_TRANSPOSE trans; + + get_transpose_type(transp, &trans); + if (*order == TEST_ROW_MJR) { + LDA = *n+1; + A = (CBLAS_TEST_ZOMPLEX *)malloc( (*m)*LDA*sizeof( CBLAS_TEST_ZOMPLEX) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ){ + A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; + A[ LDA*i+j ].imag=a[ (*lda)*j+i ].imag; + } + cblas_zgemv( CblasRowMajor, trans, *m, *n, alpha, A, LDA, x, *incx, + beta, y, *incy ); + free(A); + } + else if (*order == TEST_COL_MJR) + cblas_zgemv( CblasColMajor, trans, + *m, *n, alpha, a, *lda, x, *incx, beta, y, *incy ); + else + cblas_zgemv( UNDEFINED, trans, + *m, *n, alpha, a, *lda, x, *incx, beta, y, *incy ); +} + +void F77_zgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku, + CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, + CBLAS_TEST_ZOMPLEX *x, int *incx, + CBLAS_TEST_ZOMPLEX *beta, CBLAS_TEST_ZOMPLEX *y, int *incy) { + + CBLAS_TEST_ZOMPLEX *A; + int i,j,irow,jcol,LDA; + enum CBLAS_TRANSPOSE trans; + + get_transpose_type(transp, &trans); + if (*order == TEST_ROW_MJR) { + LDA = *ku+*kl+2; + A=( CBLAS_TEST_ZOMPLEX* )malloc((*n+*kl)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*ku; i++ ){ + irow=*ku+*kl-i; + jcol=(*ku)-i; + for( j=jcol; j<*n; j++ ){ + A[ LDA*(j-jcol)+irow ].real=a[ (*lda)*j+i ].real; + A[ LDA*(j-jcol)+irow ].imag=a[ (*lda)*j+i ].imag; + } + } + i=*ku; + irow=*ku+*kl-i; + for( j=0; j<*n; j++ ){ + A[ LDA*j+irow ].real=a[ (*lda)*j+i ].real; + A[ LDA*j+irow ].imag=a[ (*lda)*j+i ].imag; + } + for( i=*ku+1; i<*ku+*kl+1; i++ ){ + irow=*ku+*kl-i; + jcol=i-(*ku); + for( j=jcol; j<(*n+*kl); j++ ){ + A[ LDA*j+irow ].real=a[ (*lda)*(j-jcol)+i ].real; + A[ LDA*j+irow ].imag=a[ (*lda)*(j-jcol)+i ].imag; + } + } + cblas_zgbmv( CblasRowMajor, trans, *m, *n, *kl, *ku, alpha, A, LDA, x, + *incx, beta, y, *incy ); + free(A); + } + else if (*order == TEST_COL_MJR) + cblas_zgbmv( CblasColMajor, trans, *m, *n, *kl, *ku, alpha, a, *lda, x, + *incx, beta, y, *incy ); + else + cblas_zgbmv( UNDEFINED, trans, *m, *n, *kl, *ku, alpha, a, *lda, x, + *incx, beta, y, *incy ); +} + +void F77_zgeru(int *order, int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha, + CBLAS_TEST_ZOMPLEX *x, int *incx, CBLAS_TEST_ZOMPLEX *y, int *incy, + CBLAS_TEST_ZOMPLEX *a, int *lda){ + + CBLAS_TEST_ZOMPLEX *A; + int i,j,LDA; + + if (*order == TEST_ROW_MJR) { + LDA = *n+1; + A=(CBLAS_TEST_ZOMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ){ + A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; + A[ LDA*i+j ].imag=a[ (*lda)*j+i ].imag; + } + cblas_zgeru( CblasRowMajor, *m, *n, alpha, x, *incx, y, *incy, A, LDA ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ){ + a[ (*lda)*j+i ].real=A[ LDA*i+j ].real; + a[ (*lda)*j+i ].imag=A[ LDA*i+j ].imag; + } + free(A); + } + else if (*order == TEST_COL_MJR) + cblas_zgeru( CblasColMajor, *m, *n, alpha, x, *incx, y, *incy, a, *lda ); + else + cblas_zgeru( UNDEFINED, *m, *n, alpha, x, *incx, y, *incy, a, *lda ); +} + +void F77_zgerc(int *order, int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha, + CBLAS_TEST_ZOMPLEX *x, int *incx, CBLAS_TEST_ZOMPLEX *y, int *incy, + CBLAS_TEST_ZOMPLEX *a, int *lda) { + CBLAS_TEST_ZOMPLEX *A; + int i,j,LDA; + + if (*order == TEST_ROW_MJR) { + LDA = *n+1; + A=(CBLAS_TEST_ZOMPLEX*
)malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ){ + A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; + A[ LDA*i+j ].imag=a[ (*lda)*j+i ].imag; + } + cblas_zgerc( CblasRowMajor, *m, *n, alpha, x, *incx, y, *incy, A, LDA ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ){ + a[ (*lda)*j+i ].real=A[ LDA*i+j ].real; + a[ (*lda)*j+i ].imag=A[ LDA*i+j ].imag; + } + free(A); + } + else if (*order == TEST_COL_MJR) + cblas_zgerc( CblasColMajor, *m, *n, alpha, x, *incx, y, *incy, a, *lda ); + else + cblas_zgerc( UNDEFINED, *m, *n, alpha, x, *incx, y, *incy, a, *lda ); +} + +void F77_zhemv(int *order, char *uplow, int *n, CBLAS_TEST_ZOMPLEX *alpha, + CBLAS_TEST_ZOMPLEX *a, int *lda, CBLAS_TEST_ZOMPLEX *x, + int *incx, CBLAS_TEST_ZOMPLEX *beta, CBLAS_TEST_ZOMPLEX *y, int *incy){ + + CBLAS_TEST_ZOMPLEX *A; + int i,j,LDA; + enum CBLAS_UPLO uplo; + + get_uplo_type(uplow,&uplo); + + if (*order == TEST_ROW_MJR) { + LDA = *n+1; + A = (CBLAS_TEST_ZOMPLEX *)malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ){ + A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; + A[ LDA*i+j ].imag=a[ (*lda)*j+i ].imag; + } + cblas_zhemv( CblasRowMajor, uplo, *n, alpha, A, LDA, x, *incx, + beta, y, *incy ); + free(A); + } + else if (*order == TEST_COL_MJR) + cblas_zhemv( CblasColMajor, uplo, *n, alpha, a, *lda, x, *incx, + beta, y, *incy ); + else + cblas_zhemv( UNDEFINED, uplo, *n, alpha, a, *lda, x, *incx, + beta, y, *incy ); +} + +void F77_zhbmv(int *order, char *uplow, int *n, int *k, + CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, + CBLAS_TEST_ZOMPLEX *x, int *incx, CBLAS_TEST_ZOMPLEX *beta, + CBLAS_TEST_ZOMPLEX *y, int *incy){ + +CBLAS_TEST_ZOMPLEX *A; +int i,irow,j,jcol,LDA; + + enum CBLAS_UPLO uplo; + + get_uplo_type(uplow,&uplo); + + if (*order == TEST_ROW_MJR) { + if (uplo != CblasUpper && uplo != CblasLower ) + cblas_zhbmv(CblasRowMajor, UNDEFINED, *n, *k, alpha, a, *lda, x, + *incx, beta, y, *incy ); + else { + LDA = *k+2; + A =(CBLAS_TEST_ZOMPLEX*)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + if (uplo == CblasUpper) { + for( i=0; i<*k; i++ ){ + irow=*k-i; + jcol=(*k)-i; + for( j=jcol; j<*n; j++ ) { + A[ LDA*(j-jcol)+irow ].real=a[ (*lda)*j+i ].real; + A[ LDA*(j-jcol)+irow ].imag=a[ (*lda)*j+i ].imag; + } + } + i=*k; + irow=*k-i; + for( j=0; j<*n; j++ ) { + A[ LDA*j+irow ].real=a[ (*lda)*j+i ].real; + A[ LDA*j+irow ].imag=a[ (*lda)*j+i ].imag; + } + } + else { + i=0; + irow=*k-i; + for( j=0; j<*n; j++ ) { + A[ LDA*j+irow ].real=a[ (*lda)*j+i ].real; + A[ LDA*j+irow ].imag=a[ (*lda)*j+i ].imag; + } + for( i=1; i<*k+1; i++ ){ + irow=*k-i; + jcol=i; + for( j=jcol; j<(*n+*k); j++ ) { + A[ LDA*j+irow ].real=a[ (*lda)*(j-jcol)+i ].real; + A[ LDA*j+irow ].imag=a[ (*lda)*(j-jcol)+i ].imag; + } + } + } + cblas_zhbmv( CblasRowMajor, uplo, *n, *k, alpha, A, LDA, x, *incx, + beta, y, *incy ); + free(A); + } + } + else if (*order == TEST_COL_MJR) + cblas_zhbmv(CblasColMajor, uplo, *n, *k, alpha, a, *lda, x, *incx, + beta, y, *incy ); + else + cblas_zhbmv(UNDEFINED, uplo, *n, *k, alpha, a, *lda, x, *incx, + beta, y, *incy ); +} + +void F77_zhpmv(int *order, char *uplow, int *n, CBLAS_TEST_ZOMPLEX *alpha, + CBLAS_TEST_ZOMPLEX *ap, CBLAS_TEST_ZOMPLEX *x, int *incx, + CBLAS_TEST_ZOMPLEX *beta, CBLAS_TEST_ZOMPLEX *y, int *incy){ + + CBLAS_TEST_ZOMPLEX *A, *AP; + int i,j,k,LDA; + enum CBLAS_UPLO uplo; + + get_uplo_type(uplow,&uplo); + if (*order == TEST_ROW_MJR) { + if (uplo != CblasUpper && uplo != CblasLower ) + cblas_zhpmv(CblasRowMajor, 
UNDEFINED, *n, alpha, ap, x, *incx, + beta, y, *incy); + else { + LDA = *n; + A = (CBLAS_TEST_ZOMPLEX* )malloc(LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX )); + AP = (CBLAS_TEST_ZOMPLEX* )malloc( (((LDA+1)*LDA)/2)* + sizeof( CBLAS_TEST_ZOMPLEX )); + if (uplo == CblasUpper) { + for( j=0, k=0; j<*n; j++ ) + for( i=0; i +#include "common.h" +#include "cblas_test.h" +#define TEST_COL_MJR 0 +#define TEST_ROW_MJR 1 +#define UNDEFINED -1 + +void F77_zgemm(int *order, char *transpa, char *transpb, int *m, int *n, + int *k, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, + CBLAS_TEST_ZOMPLEX *b, int *ldb, CBLAS_TEST_ZOMPLEX *beta, + CBLAS_TEST_ZOMPLEX *c, int *ldc ) { + + CBLAS_TEST_ZOMPLEX *A, *B, *C; + int i,j,LDA, LDB, LDC; + enum CBLAS_TRANSPOSE transa, transb; + + get_transpose_type(transpa, &transa); + get_transpose_type(transpb, &transb); + + if (*order == TEST_ROW_MJR) { + if (transa == CblasNoTrans) { + LDA = *k+1; + A=(CBLAS_TEST_ZOMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else { + LDA = *m+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*k; i++ ) + for( j=0; j<*m; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + + if (transb == CblasNoTrans) { + LDB = *n+1; + B=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*LDB*sizeof(CBLAS_TEST_ZOMPLEX) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + else { + LDB = *k+1; + B=(CBLAS_TEST_ZOMPLEX* )malloc(LDB*(*n)*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + + LDC = *n+1; + C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_ZOMPLEX)); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_zgemm( CblasRowMajor, transa, transb, *m, *n, *k, alpha, A, LDA, + B, LDB, beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_zgemm( CblasColMajor, transa, transb, *m, *n, *k, alpha, a, *lda, + b, *ldb, beta, c, *ldc ); + else + cblas_zgemm( UNDEFINED, transa, transb, *m, *n, *k, alpha, a, *lda, + b, *ldb, beta, c, *ldc ); +} +void F77_zhemm(int *order, char *rtlf, char *uplow, int *m, int *n, + CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, + CBLAS_TEST_ZOMPLEX *b, int *ldb, CBLAS_TEST_ZOMPLEX *beta, + CBLAS_TEST_ZOMPLEX *c, int *ldc ) { + + CBLAS_TEST_ZOMPLEX *A, *B, *C; + int i,j,LDA, LDB, LDC; + enum CBLAS_UPLO uplo; + enum CBLAS_SIDE side; + + get_uplo_type(uplow,&uplo); + get_side_type(rtlf,&side); + + if (*order == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A= (CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + LDB = *n+1; + B=(CBLAS_TEST_ZOMPLEX* )malloc( 
(*m)*LDB*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + LDC = *n+1; + C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_zhemm( CblasRowMajor, side, uplo, *m, *n, alpha, A, LDA, B, LDB, + beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_zhemm( CblasColMajor, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, + beta, c, *ldc ); + else + cblas_zhemm( UNDEFINED, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, + beta, c, *ldc ); +} +void F77_zsymm(int *order, char *rtlf, char *uplow, int *m, int *n, + CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, + CBLAS_TEST_ZOMPLEX *b, int *ldb, CBLAS_TEST_ZOMPLEX *beta, + CBLAS_TEST_ZOMPLEX *c, int *ldc ) { + + CBLAS_TEST_ZOMPLEX *A, *B, *C; + int i,j,LDA, LDB, LDC; + enum CBLAS_UPLO uplo; + enum CBLAS_SIDE side; + + get_uplo_type(uplow,&uplo); + get_side_type(rtlf,&side); + + if (*order == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) + A[i*LDA+j]=a[j*(*lda)+i]; + } + LDB = *n+1; + B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_ZOMPLEX )); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) + B[i*LDB+j]=b[j*(*ldb)+i]; + LDC = *n+1; + C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_ZOMPLEX)); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + C[i*LDC+j]=c[j*(*ldc)+i]; + cblas_zsymm( CblasRowMajor, side, uplo, *m, *n, alpha, A, LDA, B, LDB, + beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) + c[j*(*ldc)+i]=C[i*LDC+j]; + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_zsymm( CblasColMajor, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, + beta, c, *ldc ); + else + cblas_zsymm( UNDEFINED, side, uplo, *m, *n, alpha, a, *lda, b, *ldb, + beta, c, *ldc ); +} + +void F77_zherk(int *order, char *uplow, char *transp, int *n, int *k, + double *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, + double *beta, CBLAS_TEST_ZOMPLEX *c, int *ldc ) { + + int i,j,LDA,LDC; + CBLAS_TEST_ZOMPLEX *A, *C; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + + if (*order == TEST_ROW_MJR) { + if (trans == CblasNoTrans) { + LDA = *k+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + LDC = *n+1; + C=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDC*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_zherk(CblasRowMajor, uplo, trans, 
*n, *k, *alpha, A, LDA, *beta, + C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*n; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_zherk(CblasColMajor, uplo, trans, *n, *k, *alpha, a, *lda, *beta, + c, *ldc ); + else + cblas_zherk(UNDEFINED, uplo, trans, *n, *k, *alpha, a, *lda, *beta, + c, *ldc ); +} + +void F77_zsyrk(int *order, char *uplow, char *transp, int *n, int *k, + CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, + CBLAS_TEST_ZOMPLEX *beta, CBLAS_TEST_ZOMPLEX *c, int *ldc ) { + + int i,j,LDA,LDC; + CBLAS_TEST_ZOMPLEX *A, *C; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + + if (*order == TEST_ROW_MJR) { + if (trans == CblasNoTrans) { + LDA = *k+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + LDC = *n+1; + C=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDC*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_zsyrk(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, beta, + C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*n; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_zsyrk(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, beta, + c, *ldc ); + else + cblas_zsyrk(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda, beta, + c, *ldc ); +} +void F77_zher2k(int *order, char *uplow, char *transp, int *n, int *k, + CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, + CBLAS_TEST_ZOMPLEX *b, int *ldb, double *beta, + CBLAS_TEST_ZOMPLEX *c, int *ldc ) { + int i,j,LDA,LDB,LDC; + CBLAS_TEST_ZOMPLEX *A, *B, *C; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + + if (*order == TEST_ROW_MJR) { + if (trans == CblasNoTrans) { + LDA = *k+1; + LDB = *k+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX )); + B=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDB*sizeof(CBLAS_TEST_ZOMPLEX )); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + else { + LDA = *n+1; + LDB = *n+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc( LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX ) ); + B=(CBLAS_TEST_ZOMPLEX* )malloc( LDB*(*k)*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ){ + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + LDC = *n+1; + C=(CBLAS_TEST_ZOMPLEX* )malloc( (*n)*LDC*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_zher2k(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, + B, LDB, *beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; 
i<*n; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_zher2k(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, + b, *ldb, *beta, c, *ldc ); + else + cblas_zher2k(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda, + b, *ldb, *beta, c, *ldc ); +} +void F77_zsyr2k(int *order, char *uplow, char *transp, int *n, int *k, + CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, int *lda, + CBLAS_TEST_ZOMPLEX *b, int *ldb, CBLAS_TEST_ZOMPLEX *beta, + CBLAS_TEST_ZOMPLEX *c, int *ldc ) { + int i,j,LDA,LDB,LDC; + CBLAS_TEST_ZOMPLEX *A, *B, *C; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + + if (*order == TEST_ROW_MJR) { + if (trans == CblasNoTrans) { + LDA = *k+1; + LDB = *k+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + B=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDB*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*k; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + else { + LDA = *n+1; + LDB = *n+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX)); + B=(CBLAS_TEST_ZOMPLEX* )malloc(LDB*(*k)*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*k; i++ ) + for( j=0; j<*n; j++ ){ + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + } + LDC = *n+1; + C=(CBLAS_TEST_ZOMPLEX* )malloc( (*n)*LDC*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + C[i*LDC+j].real=c[j*(*ldc)+i].real; + C[i*LDC+j].imag=c[j*(*ldc)+i].imag; + } + cblas_zsyr2k(CblasRowMajor, uplo, trans, *n, *k, alpha, A, LDA, + B, LDB, beta, C, LDC ); + for( j=0; j<*n; j++ ) + for( i=0; i<*n; i++ ) { + c[j*(*ldc)+i].real=C[i*LDC+j].real; + c[j*(*ldc)+i].imag=C[i*LDC+j].imag; + } + free(A); + free(B); + free(C); + } + else if (*order == TEST_COL_MJR) + cblas_zsyr2k(CblasColMajor, uplo, trans, *n, *k, alpha, a, *lda, + b, *ldb, beta, c, *ldc ); + else + cblas_zsyr2k(UNDEFINED, uplo, trans, *n, *k, alpha, a, *lda, + b, *ldb, beta, c, *ldc ); +} +void F77_ztrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, + int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, + int *lda, CBLAS_TEST_ZOMPLEX *b, int *ldb) { + int i,j,LDA,LDB; + CBLAS_TEST_ZOMPLEX *A, *B; + enum CBLAS_SIDE side; + enum CBLAS_DIAG diag; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + get_diag_type(diagn,&diag); + get_side_type(rtlf,&side); + + if (*order == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + LDB = *n+1; + B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + cblas_ztrmm(CblasRowMajor, side, uplo, 
trans, diag, *m, *n, alpha, + A, LDA, B, LDB ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + b[j*(*ldb)+i].real=B[i*LDB+j].real; + b[j*(*ldb)+i].imag=B[i*LDB+j].imag; + } + free(A); + free(B); + } + else if (*order == TEST_COL_MJR) + cblas_ztrmm(CblasColMajor, side, uplo, trans, diag, *m, *n, alpha, + a, *lda, b, *ldb); + else + cblas_ztrmm(UNDEFINED, side, uplo, trans, diag, *m, *n, alpha, + a, *lda, b, *ldb); +} + +void F77_ztrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, + int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha, CBLAS_TEST_ZOMPLEX *a, + int *lda, CBLAS_TEST_ZOMPLEX *b, int *ldb) { + int i,j,LDA,LDB; + CBLAS_TEST_ZOMPLEX *A, *B; + enum CBLAS_SIDE side; + enum CBLAS_DIAG diag; + enum CBLAS_UPLO uplo; + enum CBLAS_TRANSPOSE trans; + + get_uplo_type(uplow,&uplo); + get_transpose_type(transp,&trans); + get_diag_type(diagn,&diag); + get_side_type(rtlf,&side); + + if (*order == TEST_ROW_MJR) { + if (side == CblasLeft) { + LDA = *m+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc( (*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); + for( i=0; i<*m; i++ ) + for( j=0; j<*m; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + else{ + LDA = *n+1; + A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*n; i++ ) + for( j=0; j<*n; j++ ) { + A[i*LDA+j].real=a[j*(*lda)+i].real; + A[i*LDA+j].imag=a[j*(*lda)+i].imag; + } + } + LDB = *n+1; + B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_ZOMPLEX)); + for( i=0; i<*m; i++ ) + for( j=0; j<*n; j++ ) { + B[i*LDB+j].real=b[j*(*ldb)+i].real; + B[i*LDB+j].imag=b[j*(*ldb)+i].imag; + } + cblas_ztrsm(CblasRowMajor, side, uplo, trans, diag, *m, *n, alpha, + A, LDA, B, LDB ); + for( j=0; j<*n; j++ ) + for( i=0; i<*m; i++ ) { + b[j*(*ldb)+i].real=B[i*LDB+j].real; + b[j*(*ldb)+i].imag=B[i*LDB+j].imag; + } + free(A); + free(B); + } + else if (*order == TEST_COL_MJR) + cblas_ztrsm(CblasColMajor, side, uplo, trans, diag, *m, *n, alpha, + a, *lda, b, *ldb); + else + cblas_ztrsm(UNDEFINED, side, uplo, trans, diag, *m, *n, alpha, + a, *lda, b, *ldb); +} diff --git a/ctest/c_zblat1.f b/ctest/c_zblat1.f new file mode 100644 index 0000000000..03753e782c --- /dev/null +++ b/ctest/c_zblat1.f @@ -0,0 +1,682 @@ + PROGRAM ZCBLAT1 +* Test program for the COMPLEX*16 Level 1 CBLAS. +* Based upon the original CBLAS test routine together with: +* F06GAF Example Program Text +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + DOUBLE PRECISION SFAC + INTEGER IC +* .. External Subroutines .. + EXTERNAL CHECK1, CHECK2, HEADER +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA SFAC/9.765625D-4/ +* .. Executable Statements .. + WRITE (NOUT,99999) + DO 20 IC = 1, 10 + ICASE = IC + CALL HEADER +* +* Initialize PASS, INCX, INCY, and MODE for a new case. +* The value 9999 for INCX, INCY or MODE will appear in the +* detailed output, if any, for cases that do not involve +* these parameters. +* + PASS = .TRUE. + INCX = 9999 + INCY = 9999 + MODE = 9999 + IF (ICASE.LE.5) THEN + CALL CHECK2(SFAC) + ELSE IF (ICASE.GE.6) THEN + CALL CHECK1(SFAC) + END IF +* -- Print + IF (PASS) WRITE (NOUT,99998) + 20 CONTINUE + STOP +* +99999 FORMAT (' Complex CBLAS Test Program Results',/1X) +99998 FORMAT (' ----- PASS -----') + END + SUBROUTINE HEADER +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalars in Common .. 
+ INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Arrays .. + CHARACTER*15 L(10) +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA L(1)/'CBLAS_ZDOTC'/ + DATA L(2)/'CBLAS_ZDOTU'/ + DATA L(3)/'CBLAS_ZAXPY'/ + DATA L(4)/'CBLAS_ZCOPY'/ + DATA L(5)/'CBLAS_ZSWAP'/ + DATA L(6)/'CBLAS_DZNRM2'/ + DATA L(7)/'CBLAS_DZASUM'/ + DATA L(8)/'CBLAS_ZSCAL'/ + DATA L(9)/'CBLAS_ZDSCAL'/ + DATA L(10)/'CBLAS_IZAMAX'/ +* .. Executable Statements .. + WRITE (NOUT,99999) ICASE, L(ICASE) + RETURN +* +99999 FORMAT (/' Test of subprogram number',I3,9X,A15) + END + SUBROUTINE CHECK1(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + DOUBLE PRECISION SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + COMPLEX*16 CA + DOUBLE PRECISION SA + INTEGER I, J, LEN, NP1 +* .. Local Arrays .. + COMPLEX*16 CTRUE5(8,5,2), CTRUE6(8,5,2), CV(8,5,2), CX(8), + + MWPCS(5), MWPCT(5) + DOUBLE PRECISION STRUE2(5), STRUE4(5) + INTEGER ITRUE3(5) +* .. External Functions .. + DOUBLE PRECISION DZASUMTEST, DZNRM2TEST + INTEGER IZAMAXTEST + EXTERNAL DZASUMTEST, DZNRM2TEST, IZAMAXTEST +* .. External Subroutines .. + EXTERNAL ZSCALTEST, ZDSCALTEST, CTEST, ITEST1, STEST1 +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA SA, CA/0.3D0, (0.4D0,-0.7D0)/ + DATA ((CV(I,J,1),I=1,8),J=1,5)/(0.1D0,0.1D0), + + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), + + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), + + (1.0D0,2.0D0), (0.3D0,-0.4D0), (3.0D0,4.0D0), + + (3.0D0,4.0D0), (3.0D0,4.0D0), (3.0D0,4.0D0), + + (3.0D0,4.0D0), (3.0D0,4.0D0), (3.0D0,4.0D0), + + (0.1D0,-0.3D0), (0.5D0,-0.1D0), (5.0D0,6.0D0), + + (5.0D0,6.0D0), (5.0D0,6.0D0), (5.0D0,6.0D0), + + (5.0D0,6.0D0), (5.0D0,6.0D0), (0.1D0,0.1D0), + + (-0.6D0,0.1D0), (0.1D0,-0.3D0), (7.0D0,8.0D0), + + (7.0D0,8.0D0), (7.0D0,8.0D0), (7.0D0,8.0D0), + + (7.0D0,8.0D0), (0.3D0,0.1D0), (0.1D0,0.4D0), + + (0.4D0,0.1D0), (0.1D0,0.2D0), (2.0D0,3.0D0), + + (2.0D0,3.0D0), (2.0D0,3.0D0), (2.0D0,3.0D0)/ + DATA ((CV(I,J,2),I=1,8),J=1,5)/(0.1D0,0.1D0), + + (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0), + + (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0), + + (4.0D0,5.0D0), (0.3D0,-0.4D0), (6.0D0,7.0D0), + + (6.0D0,7.0D0), (6.0D0,7.0D0), (6.0D0,7.0D0), + + (6.0D0,7.0D0), (6.0D0,7.0D0), (6.0D0,7.0D0), + + (0.1D0,-0.3D0), (8.0D0,9.0D0), (0.5D0,-0.1D0), + + (2.0D0,5.0D0), (2.0D0,5.0D0), (2.0D0,5.0D0), + + (2.0D0,5.0D0), (2.0D0,5.0D0), (0.1D0,0.1D0), + + (3.0D0,6.0D0), (-0.6D0,0.1D0), (4.0D0,7.0D0), + + (0.1D0,-0.3D0), (7.0D0,2.0D0), (7.0D0,2.0D0), + + (7.0D0,2.0D0), (0.3D0,0.1D0), (5.0D0,8.0D0), + + (0.1D0,0.4D0), (6.0D0,9.0D0), (0.4D0,0.1D0), + + (8.0D0,3.0D0), (0.1D0,0.2D0), (9.0D0,4.0D0)/ + DATA STRUE2/0.0D0, 0.5D0, 0.6D0, 0.7D0, 0.7D0/ + DATA STRUE4/0.0D0, 0.7D0, 1.0D0, 1.3D0, 1.7D0/ + DATA ((CTRUE5(I,J,1),I=1,8),J=1,5)/(0.1D0,0.1D0), + + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), + + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), + + (1.0D0,2.0D0), (-0.16D0,-0.37D0), (3.0D0,4.0D0), + + (3.0D0,4.0D0), (3.0D0,4.0D0), (3.0D0,4.0D0), + + (3.0D0,4.0D0), (3.0D0,4.0D0), (3.0D0,4.0D0), + + (-0.17D0,-0.19D0), (0.13D0,-0.39D0), + + (5.0D0,6.0D0), (5.0D0,6.0D0), (5.0D0,6.0D0), + + (5.0D0,6.0D0), (5.0D0,6.0D0), (5.0D0,6.0D0), + + (0.11D0,-0.03D0), (-0.17D0,0.46D0), + + (-0.17D0,-0.19D0), (7.0D0,8.0D0), (7.0D0,8.0D0), + + (7.0D0,8.0D0), (7.0D0,8.0D0), (7.0D0,8.0D0), + + 
(0.19D0,-0.17D0), (0.32D0,0.09D0), + + (0.23D0,-0.24D0), (0.18D0,0.01D0), + + (2.0D0,3.0D0), (2.0D0,3.0D0), (2.0D0,3.0D0), + + (2.0D0,3.0D0)/ + DATA ((CTRUE5(I,J,2),I=1,8),J=1,5)/(0.1D0,0.1D0), + + (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0), + + (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0), + + (4.0D0,5.0D0), (-0.16D0,-0.37D0), (6.0D0,7.0D0), + + (6.0D0,7.0D0), (6.0D0,7.0D0), (6.0D0,7.0D0), + + (6.0D0,7.0D0), (6.0D0,7.0D0), (6.0D0,7.0D0), + + (-0.17D0,-0.19D0), (8.0D0,9.0D0), + + (0.13D0,-0.39D0), (2.0D0,5.0D0), (2.0D0,5.0D0), + + (2.0D0,5.0D0), (2.0D0,5.0D0), (2.0D0,5.0D0), + + (0.11D0,-0.03D0), (3.0D0,6.0D0), + + (-0.17D0,0.46D0), (4.0D0,7.0D0), + + (-0.17D0,-0.19D0), (7.0D0,2.0D0), (7.0D0,2.0D0), + + (7.0D0,2.0D0), (0.19D0,-0.17D0), (5.0D0,8.0D0), + + (0.32D0,0.09D0), (6.0D0,9.0D0), + + (0.23D0,-0.24D0), (8.0D0,3.0D0), + + (0.18D0,0.01D0), (9.0D0,4.0D0)/ + DATA ((CTRUE6(I,J,1),I=1,8),J=1,5)/(0.1D0,0.1D0), + + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), + + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), + + (1.0D0,2.0D0), (0.09D0,-0.12D0), (3.0D0,4.0D0), + + (3.0D0,4.0D0), (3.0D0,4.0D0), (3.0D0,4.0D0), + + (3.0D0,4.0D0), (3.0D0,4.0D0), (3.0D0,4.0D0), + + (0.03D0,-0.09D0), (0.15D0,-0.03D0), + + (5.0D0,6.0D0), (5.0D0,6.0D0), (5.0D0,6.0D0), + + (5.0D0,6.0D0), (5.0D0,6.0D0), (5.0D0,6.0D0), + + (0.03D0,0.03D0), (-0.18D0,0.03D0), + + (0.03D0,-0.09D0), (7.0D0,8.0D0), (7.0D0,8.0D0), + + (7.0D0,8.0D0), (7.0D0,8.0D0), (7.0D0,8.0D0), + + (0.09D0,0.03D0), (0.03D0,0.12D0), + + (0.12D0,0.03D0), (0.03D0,0.06D0), (2.0D0,3.0D0), + + (2.0D0,3.0D0), (2.0D0,3.0D0), (2.0D0,3.0D0)/ + DATA ((CTRUE6(I,J,2),I=1,8),J=1,5)/(0.1D0,0.1D0), + + (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0), + + (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0), + + (4.0D0,5.0D0), (0.09D0,-0.12D0), (6.0D0,7.0D0), + + (6.0D0,7.0D0), (6.0D0,7.0D0), (6.0D0,7.0D0), + + (6.0D0,7.0D0), (6.0D0,7.0D0), (6.0D0,7.0D0), + + (0.03D0,-0.09D0), (8.0D0,9.0D0), + + (0.15D0,-0.03D0), (2.0D0,5.0D0), (2.0D0,5.0D0), + + (2.0D0,5.0D0), (2.0D0,5.0D0), (2.0D0,5.0D0), + + (0.03D0,0.03D0), (3.0D0,6.0D0), + + (-0.18D0,0.03D0), (4.0D0,7.0D0), + + (0.03D0,-0.09D0), (7.0D0,2.0D0), (7.0D0,2.0D0), + + (7.0D0,2.0D0), (0.09D0,0.03D0), (5.0D0,8.0D0), + + (0.03D0,0.12D0), (6.0D0,9.0D0), (0.12D0,0.03D0), + + (8.0D0,3.0D0), (0.03D0,0.06D0), (9.0D0,4.0D0)/ + DATA ITRUE3/0, 1, 2, 2, 2/ +* .. Executable Statements .. + DO 60 INCX = 1, 2 + DO 40 NP1 = 1, 5 + N = NP1 - 1 + LEN = 2*MAX(N,1) +* .. Set vector arguments .. + DO 20 I = 1, LEN + CX(I) = CV(I,NP1,INCX) + 20 CONTINUE + IF (ICASE.EQ.6) THEN +* .. DZNRM2TEST .. + CALL STEST1(DZNRM2TEST(N,CX,INCX),STRUE2(NP1), + + STRUE2(NP1),SFAC) + ELSE IF (ICASE.EQ.7) THEN +* .. DZASUMTEST .. + CALL STEST1(DZASUMTEST(N,CX,INCX),STRUE4(NP1), + + STRUE4(NP1),SFAC) + ELSE IF (ICASE.EQ.8) THEN +* .. ZSCALTEST .. + CALL ZSCALTEST(N,CA,CX,INCX) + CALL CTEST(LEN,CX,CTRUE5(1,NP1,INCX),CTRUE5(1,NP1,INCX), + + SFAC) + ELSE IF (ICASE.EQ.9) THEN +* .. ZDSCALTEST .. + CALL ZDSCALTEST(N,SA,CX,INCX) + CALL CTEST(LEN,CX,CTRUE6(1,NP1,INCX),CTRUE6(1,NP1,INCX), + + SFAC) + ELSE IF (ICASE.EQ.10) THEN +* .. IZAMAXTEST .. + CALL ITEST1(IZAMAXTEST(N,CX,INCX),ITRUE3(NP1)) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' + STOP + END IF +* + 40 CONTINUE + 60 CONTINUE +* + INCX = 1 + IF (ICASE.EQ.8) THEN +* ZSCALTEST +* Add a test for alpha equal to zero. 
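+*        Scaling by a zero ALPHA must give an exactly zero vector:
+*        MWPCT holds the expected zeros and MWPCS the comparison sizes.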
+ CA = (0.0D0,0.0D0) + DO 80 I = 1, 5 + MWPCT(I) = (0.0D0,0.0D0) + MWPCS(I) = (1.0D0,1.0D0) + 80 CONTINUE + CALL ZSCALTEST(5,CA,CX,INCX) + CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) + ELSE IF (ICASE.EQ.9) THEN +* ZDSCALTEST +* Add a test for alpha equal to zero. + SA = 0.0D0 + DO 100 I = 1, 5 + MWPCT(I) = (0.0D0,0.0D0) + MWPCS(I) = (1.0D0,1.0D0) + 100 CONTINUE + CALL ZDSCALTEST(5,SA,CX,INCX) + CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) +* Add a test for alpha equal to one. + SA = 1.0D0 + DO 120 I = 1, 5 + MWPCT(I) = CX(I) + MWPCS(I) = CX(I) + 120 CONTINUE + CALL ZDSCALTEST(5,SA,CX,INCX) + CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) +* Add a test for alpha equal to minus one. + SA = -1.0D0 + DO 140 I = 1, 5 + MWPCT(I) = -CX(I) + MWPCS(I) = -CX(I) + 140 CONTINUE + CALL ZDSCALTEST(5,SA,CX,INCX) + CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) + END IF + RETURN + END + SUBROUTINE CHECK2(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + DOUBLE PRECISION SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + COMPLEX*16 CA,ZTEMP + INTEGER I, J, KI, KN, KSIZE, LENX, LENY, MX, MY +* .. Local Arrays .. + COMPLEX*16 CDOT(1), CSIZE1(4), CSIZE2(7,2), CSIZE3(14), + + CT10X(7,4,4), CT10Y(7,4,4), CT6(4,4), CT7(4,4), + + CT8(7,4,4), CX(7), CX1(7), CY(7), CY1(7) + INTEGER INCXS(4), INCYS(4), LENS(4,2), NS(4) +* .. External Functions .. + EXTERNAL ZDOTCTEST, ZDOTUTEST +* .. External Subroutines .. + EXTERNAL ZAXPYTEST, ZCOPYTEST, ZSWAPTEST, CTEST +* .. Intrinsic Functions .. + INTRINSIC ABS, MIN +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA CA/(0.4D0,-0.7D0)/ + DATA INCXS/1, 2, -2, -1/ + DATA INCYS/1, -2, 1, -2/ + DATA LENS/1, 1, 2, 4, 1, 1, 3, 7/ + DATA NS/0, 1, 2, 4/ + DATA CX1/(0.7D0,-0.8D0), (-0.4D0,-0.7D0), + + (-0.1D0,-0.9D0), (0.2D0,-0.8D0), + + (-0.9D0,-0.4D0), (0.1D0,0.4D0), (-0.6D0,0.6D0)/ + DATA CY1/(0.6D0,-0.6D0), (-0.9D0,0.5D0), + + (0.7D0,-0.6D0), (0.1D0,-0.5D0), (-0.1D0,-0.2D0), + + (-0.5D0,-0.3D0), (0.8D0,-0.7D0)/ + DATA ((CT8(I,J,1),I=1,7),J=1,4)/(0.6D0,-0.6D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.32D0,-1.41D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.32D0,-1.41D0), + + (-1.55D0,0.5D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.32D0,-1.41D0), (-1.55D0,0.5D0), + + (0.03D0,-0.89D0), (-0.38D0,-0.96D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0)/ + DATA ((CT8(I,J,2),I=1,7),J=1,4)/(0.6D0,-0.6D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.32D0,-1.41D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (-0.07D0,-0.89D0), + + (-0.9D0,0.5D0), (0.42D0,-1.41D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.78D0,0.06D0), (-0.9D0,0.5D0), + + (0.06D0,-0.13D0), (0.1D0,-0.5D0), + + (-0.77D0,-0.49D0), (-0.5D0,-0.3D0), + + (0.52D0,-1.51D0)/ + DATA ((CT8(I,J,3),I=1,7),J=1,4)/(0.6D0,-0.6D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.32D0,-1.41D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (-0.07D0,-0.89D0), + + (-1.18D0,-0.31D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.78D0,0.06D0), (-1.54D0,0.97D0), + + (0.03D0,-0.89D0), 
(-0.18D0,-1.31D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0)/ + DATA ((CT8(I,J,4),I=1,7),J=1,4)/(0.6D0,-0.6D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.32D0,-1.41D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.32D0,-1.41D0), (-0.9D0,0.5D0), + + (0.05D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.32D0,-1.41D0), + + (-0.9D0,0.5D0), (0.05D0,-0.6D0), (0.1D0,-0.5D0), + + (-0.77D0,-0.49D0), (-0.5D0,-0.3D0), + + (0.32D0,-1.16D0)/ + DATA CT7/(0.0D0,0.0D0), (-0.06D0,-0.90D0), + + (0.65D0,-0.47D0), (-0.34D0,-1.22D0), + + (0.0D0,0.0D0), (-0.06D0,-0.90D0), + + (-0.59D0,-1.46D0), (-1.04D0,-0.04D0), + + (0.0D0,0.0D0), (-0.06D0,-0.90D0), + + (-0.83D0,0.59D0), (0.07D0,-0.37D0), + + (0.0D0,0.0D0), (-0.06D0,-0.90D0), + + (-0.76D0,-1.15D0), (-1.33D0,-1.82D0)/ + DATA CT6/(0.0D0,0.0D0), (0.90D0,0.06D0), + + (0.91D0,-0.77D0), (1.80D0,-0.10D0), + + (0.0D0,0.0D0), (0.90D0,0.06D0), (1.45D0,0.74D0), + + (0.20D0,0.90D0), (0.0D0,0.0D0), (0.90D0,0.06D0), + + (-0.55D0,0.23D0), (0.83D0,-0.39D0), + + (0.0D0,0.0D0), (0.90D0,0.06D0), (1.04D0,0.79D0), + + (1.95D0,1.22D0)/ + DATA ((CT10X(I,J,1),I=1,7),J=1,4)/(0.7D0,-0.8D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.6D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.6D0,-0.6D0), (-0.9D0,0.5D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.6D0,-0.6D0), + + (-0.9D0,0.5D0), (0.7D0,-0.6D0), (0.1D0,-0.5D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0)/ + DATA ((CT10X(I,J,2),I=1,7),J=1,4)/(0.7D0,-0.8D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.6D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.7D0,-0.6D0), (-0.4D0,-0.7D0), + + (0.6D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.8D0,-0.7D0), + + (-0.4D0,-0.7D0), (-0.1D0,-0.2D0), + + (0.2D0,-0.8D0), (0.7D0,-0.6D0), (0.1D0,0.4D0), + + (0.6D0,-0.6D0)/ + DATA ((CT10X(I,J,3),I=1,7),J=1,4)/(0.7D0,-0.8D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.6D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (-0.9D0,0.5D0), (-0.4D0,-0.7D0), + + (0.6D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.1D0,-0.5D0), + + (-0.4D0,-0.7D0), (0.7D0,-0.6D0), (0.2D0,-0.8D0), + + (-0.9D0,0.5D0), (0.1D0,0.4D0), (0.6D0,-0.6D0)/ + DATA ((CT10X(I,J,4),I=1,7),J=1,4)/(0.7D0,-0.8D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.6D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.6D0,-0.6D0), (0.7D0,-0.6D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.6D0,-0.6D0), + + (0.7D0,-0.6D0), (-0.1D0,-0.2D0), (0.8D0,-0.7D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0)/ + DATA ((CT10Y(I,J,1),I=1,7),J=1,4)/(0.6D0,-0.6D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.7D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.7D0,-0.8D0), (-0.4D0,-0.7D0), + + (0.0D0,0.0D0), 
(0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.7D0,-0.8D0), + + (-0.4D0,-0.7D0), (-0.1D0,-0.9D0), + + (0.2D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0)/ + DATA ((CT10Y(I,J,2),I=1,7),J=1,4)/(0.6D0,-0.6D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.7D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (-0.1D0,-0.9D0), (-0.9D0,0.5D0), + + (0.7D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (-0.6D0,0.6D0), + + (-0.9D0,0.5D0), (-0.9D0,-0.4D0), (0.1D0,-0.5D0), + + (-0.1D0,-0.9D0), (-0.5D0,-0.3D0), + + (0.7D0,-0.8D0)/ + DATA ((CT10Y(I,J,3),I=1,7),J=1,4)/(0.6D0,-0.6D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.7D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (-0.1D0,-0.9D0), (0.7D0,-0.8D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (-0.6D0,0.6D0), + + (-0.9D0,-0.4D0), (-0.1D0,-0.9D0), + + (0.7D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0)/ + DATA ((CT10Y(I,J,4),I=1,7),J=1,4)/(0.6D0,-0.6D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.7D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.7D0,-0.8D0), (-0.9D0,0.5D0), + + (-0.4D0,-0.7D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.7D0,-0.8D0), + + (-0.9D0,0.5D0), (-0.4D0,-0.7D0), (0.1D0,-0.5D0), + + (-0.1D0,-0.9D0), (-0.5D0,-0.3D0), + + (0.2D0,-0.8D0)/ + DATA CSIZE1/(0.0D0,0.0D0), (0.9D0,0.9D0), + + (1.63D0,1.73D0), (2.90D0,2.78D0)/ + DATA CSIZE3/(0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (1.17D0,1.17D0), + + (1.17D0,1.17D0), (1.17D0,1.17D0), + + (1.17D0,1.17D0), (1.17D0,1.17D0), + + (1.17D0,1.17D0), (1.17D0,1.17D0)/ + DATA CSIZE2/(0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (1.54D0,1.54D0), + + (1.54D0,1.54D0), (1.54D0,1.54D0), + + (1.54D0,1.54D0), (1.54D0,1.54D0), + + (1.54D0,1.54D0), (1.54D0,1.54D0)/ +* .. Executable Statements .. + DO 60 KI = 1, 4 + INCX = INCXS(KI) + INCY = INCYS(KI) + MX = ABS(INCX) + MY = ABS(INCY) +* + DO 40 KN = 1, 4 + N = NS(KN) + KSIZE = MIN(2,KN) + LENX = LENS(KN,MX) + LENY = LENS(KN,MY) +* .. initialize all argument arrays .. + DO 20 I = 1, 7 + CX(I) = CX1(I) + CY(I) = CY1(I) + 20 CONTINUE + IF (ICASE.EQ.1) THEN +* .. ZDOTCTEST .. + CALL ZDOTCTEST(N,CX,INCX,CY,INCY,ZTEMP) + CDOT(1) = ZTEMP + CALL CTEST(1,CDOT,CT6(KN,KI),CSIZE1(KN),SFAC) + ELSE IF (ICASE.EQ.2) THEN +* .. ZDOTUTEST .. + CALL ZDOTUTEST(N,CX,INCX,CY,INCY,ZTEMP) + CDOT(1) = ZTEMP + CALL CTEST(1,CDOT,CT7(KN,KI),CSIZE1(KN),SFAC) + ELSE IF (ICASE.EQ.3) THEN +* .. ZAXPYTEST .. + CALL ZAXPYTEST(N,CA,CX,INCX,CY,INCY) + CALL CTEST(LENY,CY,CT8(1,KN,KI),CSIZE2(1,KSIZE),SFAC) + ELSE IF (ICASE.EQ.4) THEN +* .. ZCOPYTEST .. + CALL ZCOPYTEST(N,CX,INCX,CY,INCY) + CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0D0) + ELSE IF (ICASE.EQ.5) THEN +* .. ZSWAPTEST .. 
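+*           After the swap, CX is checked against CT10X and CY against
+*           CT10Y, both with a unit scale factor.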
+ CALL ZSWAPTEST(N,CX,INCX,CY,INCY) + CALL CTEST(LENX,CX,CT10X(1,KN,KI),CSIZE3,1.0D0) + CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0D0) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' + STOP + END IF +* + 40 CONTINUE + 60 CONTINUE + RETURN + END + SUBROUTINE STEST(LEN,SCOMP,STRUE,SSIZE,SFAC) +* ********************************* STEST ************************** +* +* THIS SUBR COMPARES ARRAYS SCOMP() AND STRUE() OF LENGTH LEN TO +* SEE IF THE TERM BY TERM DIFFERENCES, MULTIPLIED BY SFAC, ARE +* NEGLIGIBLE. +* +* C. L. LAWSON, JPL, 1974 DEC 10 +* +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + DOUBLE PRECISION SFAC + INTEGER LEN +* .. Array Arguments .. + DOUBLE PRECISION SCOMP(LEN), SSIZE(LEN), STRUE(LEN) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + DOUBLE PRECISION SD + INTEGER I +* .. External Functions .. + DOUBLE PRECISION SDIFF + EXTERNAL SDIFF +* .. Intrinsic Functions .. + INTRINSIC ABS +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Executable Statements .. +* + DO 40 I = 1, LEN + SD = SCOMP(I) - STRUE(I) + IF (SDIFF(ABS(SSIZE(I))+ABS(SFAC*SD),ABS(SSIZE(I))).EQ.0.0D0) + + GO TO 40 +* +* HERE SCOMP(I) IS NOT CLOSE TO STRUE(I). +* + IF ( .NOT. PASS) GO TO 20 +* PRINT FAIL MESSAGE AND HEADER. + PASS = .FALSE. + WRITE (NOUT,99999) + WRITE (NOUT,99998) + 20 WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, I, SCOMP(I), + + STRUE(I), SD, SSIZE(I) + 40 CONTINUE + RETURN +* +99999 FORMAT (' FAIL') +99998 FORMAT (/' CASE N INCX INCY MODE I ', + + ' COMP(I) TRUE(I) DIFFERENCE', + + ' SIZE(I)',/1X) +99997 FORMAT (1X,I4,I3,3I5,I3,2D36.8,2D12.4) + END + SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) +* ************************* STEST1 ***************************** +* +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE +* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. +* +* C.L. LAWSON, JPL, 1978 DEC 6 +* +* .. Scalar Arguments .. + DOUBLE PRECISION SCOMP1, SFAC, STRUE1 +* .. Array Arguments .. + DOUBLE PRECISION SSIZE(*) +* .. Local Arrays .. + DOUBLE PRECISION SCOMP(1), STRUE(1) +* .. External Subroutines .. + EXTERNAL STEST +* .. Executable Statements .. +* + SCOMP(1) = SCOMP1 + STRUE(1) = STRUE1 + CALL STEST(1,SCOMP,STRUE,SSIZE,SFAC) +* + RETURN + END + DOUBLE PRECISION FUNCTION SDIFF(SA,SB) +* ********************************* SDIFF ************************** +* COMPUTES DIFFERENCE OF TWO NUMBERS. C. L. LAWSON, JPL 1974 FEB 15 +* +* .. Scalar Arguments .. + DOUBLE PRECISION SA, SB +* .. Executable Statements .. + SDIFF = SA - SB + RETURN + END + SUBROUTINE CTEST(LEN,CCOMP,CTRUE,CSIZE,SFAC) +* **************************** CTEST ***************************** +* +* C.L. LAWSON, JPL, 1978 DEC 6 +* +* .. Scalar Arguments .. + DOUBLE PRECISION SFAC + INTEGER LEN +* .. Array Arguments .. + COMPLEX*16 CCOMP(LEN), CSIZE(LEN), CTRUE(LEN) +* .. Local Scalars .. + INTEGER I +* .. Local Arrays .. + DOUBLE PRECISION SCOMP(20), SSIZE(20), STRUE(20) +* .. External Subroutines .. + EXTERNAL STEST +* .. Intrinsic Functions .. + INTRINSIC DIMAG, DBLE +* .. Executable Statements .. 
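+*     Split each complex entry into its real and imaginary parts and
+*     compare the resulting 2*LEN double precision values with STEST.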
+ DO 20 I = 1, LEN + SCOMP(2*I-1) = DBLE(CCOMP(I)) + SCOMP(2*I) = DIMAG(CCOMP(I)) + STRUE(2*I-1) = DBLE(CTRUE(I)) + STRUE(2*I) = DIMAG(CTRUE(I)) + SSIZE(2*I-1) = DBLE(CSIZE(I)) + SSIZE(2*I) = DIMAG(CSIZE(I)) + 20 CONTINUE +* + CALL STEST(2*LEN,SCOMP,STRUE,SSIZE,SFAC) + RETURN + END + SUBROUTINE ITEST1(ICOMP,ITRUE) +* ********************************* ITEST1 ************************* +* +* THIS SUBROUTINE COMPARES THE VARIABLES ICOMP AND ITRUE FOR +* EQUALITY. +* C. L. LAWSON, JPL, 1974 DEC 10 +* +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + INTEGER ICOMP, ITRUE +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + INTEGER ID +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Executable Statements .. + IF (ICOMP.EQ.ITRUE) GO TO 40 +* +* HERE ICOMP IS NOT EQUAL TO ITRUE. +* + IF ( .NOT. PASS) GO TO 20 +* PRINT FAIL MESSAGE AND HEADER. + PASS = .FALSE. + WRITE (NOUT,99999) + WRITE (NOUT,99998) + 20 ID = ICOMP - ITRUE + WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, ICOMP, ITRUE, ID + 40 CONTINUE + RETURN +* +99999 FORMAT (' FAIL') +99998 FORMAT (/' CASE N INCX INCY MODE ', + + ' COMP TRUE DIFFERENCE', + + /1X) +99997 FORMAT (1X,I4,I3,3I5,2I36,I12) + END diff --git a/ctest/c_zblat2.f b/ctest/c_zblat2.f new file mode 100644 index 0000000000..236088ff31 --- /dev/null +++ b/ctest/c_zblat2.f @@ -0,0 +1,2939 @@ + PROGRAM ZBLAT2 +* +* Test program for the COMPLEX*16 Level 2 Blas. +* +* The program must be driven by a short data file. The first 17 records +* of the file are read using list-directed input, the last 17 records +* are read using the format ( A12, L2 ). An annotated example of a data +* file can be obtained by deleting the first 3 characters from the +* following 34 lines: +* 'CBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE +* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +* F LOGICAL FLAG, T TO STOP ON FAILURES. +* T LOGICAL FLAG, T TO TEST ERROR EXITS. +* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH +* 16.0 THRESHOLD VALUE OF TEST RATIO +* 6 NUMBER OF VALUES OF N +* 0 1 2 3 5 9 VALUES OF N +* 4 NUMBER OF VALUES OF K +* 0 1 2 4 VALUES OF K +* 4 NUMBER OF VALUES OF INCX AND INCY +* 1 2 -1 -2 VALUES OF INCX AND INCY +* 3 NUMBER OF VALUES OF ALPHA +* (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +* 3 NUMBER OF VALUES OF BETA +* (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +* cblas_zgemv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_zgbmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_zhemv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_zhbmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_zhpmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ztrmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ztbmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ztpmv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ztrsv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ztbsv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_ztpsv T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_zgerc T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_zgeru T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_zher T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_zhpr T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_zher2 T PUT F FOR NO TEST. SAME COLUMNS. +* cblas_zhpr2 T PUT F FOR NO TEST. SAME COLUMNS. +* +* See: +* +* Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. +* An extended set of Fortran Basic Linear Algebra Subprograms. +* +* Technical Memoranda Nos. 
41 (revision 3) and 81, Mathematics +* and Computer Science Division, Argonne National Laboratory, +* 9700 South Cass Avenue, Argonne, Illinois 60439, US. +* +* Or +* +* NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms +* Group Ltd., NAG Central Office, 256 Banbury Road, Oxford +* OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st +* Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. +* +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + INTEGER NIN, NOUT + PARAMETER ( NIN = 5, NOUT = 6 ) + INTEGER NSUBS + PARAMETER ( NSUBS = 17 ) + COMPLEX*16 ZERO, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ ONE = ( 1.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO, RHALF, RONE + PARAMETER ( RZERO = 0.0D0, RHALF = 0.5D0, RONE = 1.0D0 ) + INTEGER NMAX, INCMAX + PARAMETER ( NMAX = 65, INCMAX = 2 ) + INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX + PARAMETER ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7, + $ NALMAX = 7, NBEMAX = 7 ) +* .. Local Scalars .. + DOUBLE PRECISION EPS, ERR, THRESH + INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NINC, NKB, + $ NTRA, LAYOUT + LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, + $ TSTERR, CORDER, RORDER + CHARACTER*1 TRANS + CHARACTER*12 SNAMET + CHARACTER*32 SNAPS +* .. Local Arrays .. + COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), + $ ALF( NALMAX ), AS( NMAX*NMAX ), BET( NBEMAX ), + $ X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( 2*NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDMAX ), INC( NINMAX ), KB( NKBMAX ) + LOGICAL LTEST( NSUBS ) + CHARACTER*12 SNAMES( NSUBS ) +* .. External Functions .. + DOUBLE PRECISION DDIFF + LOGICAL LZE + EXTERNAL DDIFF, LZE +* .. External Subroutines .. + EXTERNAL ZCHK1, ZCHK2, ZCHK3, ZCHK4, ZCHK5, ZCHK6, + $ CZ2CHKE, ZMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK + CHARACTER*12 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK + COMMON /SRNAMC/SRNAMT +* .. Data statements .. + DATA SNAMES/'cblas_zgemv ', 'cblas_zgbmv ', + $ 'cblas_zhemv ','cblas_zhbmv ','cblas_zhpmv ', + $ 'cblas_ztrmv ','cblas_ztbmv ','cblas_ztpmv ', + $ 'cblas_ztrsv ','cblas_ztbsv ','cblas_ztpsv ', + $ 'cblas_zgerc ','cblas_zgeru ','cblas_zher ', + $ 'cblas_zhpr ','cblas_zher2 ','cblas_zhpr2 '/ +* .. Executable Statements .. +* + NOUTC = NOUT +* +* Read name and unit number for summary output file and open file. +* + READ( NIN, FMT = * )SNAPS + READ( NIN, FMT = * )NTRA + TRACE = NTRA.GE.0 + IF( TRACE )THEN + OPEN( NTRA, FILE = SNAPS ) + END IF +* Read the flag that directs rewinding of the snapshot file. + READ( NIN, FMT = * )REWI + REWI = REWI.AND.TRACE +* Read the flag that directs stopping on any failure. + READ( NIN, FMT = * )SFATAL +* Read the flag that indicates whether error exits are to be tested. + READ( NIN, FMT = * )TSTERR +* Read the flag that indicates whether row-major data layout to be tested. + READ( NIN, FMT = * )LAYOUT +* Read the threshold value of the test ratio + READ( NIN, FMT = * )THRESH +* +* Read and check the parameter values for the tests. 
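+* Counts must lie between 1 and their array limits, values of N in
+* [0,NMAX], values of K must be non-negative, and increments non-zero
+* with magnitude at most INCMAX; otherwise the tests are abandoned.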
+* +* Values of N + READ( NIN, FMT = * )NIDIM + IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN + WRITE( NOUT, FMT = 9997 )'N', NIDMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) + DO 10 I = 1, NIDIM + IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN + WRITE( NOUT, FMT = 9996 )NMAX + GO TO 230 + END IF + 10 CONTINUE +* Values of K + READ( NIN, FMT = * )NKB + IF( NKB.LT.1.OR.NKB.GT.NKBMAX )THEN + WRITE( NOUT, FMT = 9997 )'K', NKBMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( KB( I ), I = 1, NKB ) + DO 20 I = 1, NKB + IF( KB( I ).LT.0 )THEN + WRITE( NOUT, FMT = 9995 ) + GO TO 230 + END IF + 20 CONTINUE +* Values of INCX and INCY + READ( NIN, FMT = * )NINC + IF( NINC.LT.1.OR.NINC.GT.NINMAX )THEN + WRITE( NOUT, FMT = 9997 )'INCX AND INCY', NINMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( INC( I ), I = 1, NINC ) + DO 30 I = 1, NINC + IF( INC( I ).EQ.0.OR.ABS( INC( I ) ).GT.INCMAX )THEN + WRITE( NOUT, FMT = 9994 )INCMAX + GO TO 230 + END IF + 30 CONTINUE +* Values of ALPHA + READ( NIN, FMT = * )NALF + IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN + WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) +* Values of BETA + READ( NIN, FMT = * )NBET + IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN + WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) +* +* Report values of parameters. +* + WRITE( NOUT, FMT = 9993 ) + WRITE( NOUT, FMT = 9992 )( IDIM( I ), I = 1, NIDIM ) + WRITE( NOUT, FMT = 9991 )( KB( I ), I = 1, NKB ) + WRITE( NOUT, FMT = 9990 )( INC( I ), I = 1, NINC ) + WRITE( NOUT, FMT = 9989 )( ALF( I ), I = 1, NALF ) + WRITE( NOUT, FMT = 9988 )( BET( I ), I = 1, NBET ) + IF( .NOT.TSTERR )THEN + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9980 ) + END IF + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9999 )THRESH + WRITE( NOUT, FMT = * ) + RORDER = .FALSE. + CORDER = .FALSE. + IF (LAYOUT.EQ.2) THEN + RORDER = .TRUE. + CORDER = .TRUE. + WRITE( *, FMT = 10002 ) + ELSE IF (LAYOUT.EQ.1) THEN + RORDER = .TRUE. + WRITE( *, FMT = 10001 ) + ELSE IF (LAYOUT.EQ.0) THEN + CORDER = .TRUE. + WRITE( *, FMT = 10000 ) + END IF + WRITE( *, FMT = * ) +* +* Read names of subroutines and flags which indicate +* whether they are to be tested. +* + DO 40 I = 1, NSUBS + LTEST( I ) = .FALSE. + 40 CONTINUE + 50 READ( NIN, FMT = 9984, END = 80 )SNAMET, LTESTT + DO 60 I = 1, NSUBS + IF( SNAMET.EQ.SNAMES( I ) ) + $ GO TO 70 + 60 CONTINUE + WRITE( NOUT, FMT = 9986 )SNAMET + STOP + 70 LTEST( I ) = LTESTT + GO TO 50 +* + 80 CONTINUE + CLOSE ( NIN ) +* +* Compute EPS (the machine precision). +* + EPS = RONE + 90 CONTINUE + IF( DDIFF( RONE + EPS, RONE ).EQ.RZERO ) + $ GO TO 100 + EPS = RHALF*EPS + GO TO 90 + 100 CONTINUE + EPS = EPS + EPS + WRITE( NOUT, FMT = 9998 )EPS +* +* Check the reliability of ZMVCH using exact data. +* + N = MIN( 32, NMAX ) + DO 120 J = 1, N + DO 110 I = 1, N + A( I, J ) = MAX( I - J + 1, 0 ) + 110 CONTINUE + X( J ) = J + Y( J ) = ZERO + 120 CONTINUE + DO 130 J = 1, N + YY( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 + 130 CONTINUE +* YY holds the exact result. On exit from CMVCH YT holds +* the result computed by CMVCH. + TRANS = 'N' + CALL ZMVCH( TRANS, N, N, ONE, A, NMAX, X, 1, ZERO, Y, 1, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LZE( YY, YT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR + STOP + END IF + TRANS = 'T' + CALL ZMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, .TRUE. 
) + SAME = LZE( YY, YT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR + STOP + END IF +* +* Test each subroutine in turn. +* + DO 210 ISNUM = 1, NSUBS + WRITE( NOUT, FMT = * ) + IF( .NOT.LTEST( ISNUM ) )THEN +* Subprogram is not to be tested. + WRITE( NOUT, FMT = 9983 )SNAMES( ISNUM ) + ELSE + SRNAMT = SNAMES( ISNUM ) +* Test error exits. + IF( TSTERR )THEN + CALL CZ2CHKE( SNAMES( ISNUM ) ) + WRITE( NOUT, FMT = * ) + END IF +* Test computations. + INFOT = 0 + OK = .TRUE. + FATAL = .FALSE. + GO TO ( 140, 140, 150, 150, 150, 160, 160, + $ 160, 160, 160, 160, 170, 170, 180, + $ 180, 190, 190 )ISNUM +* Test ZGEMV, 01, and ZGBMV, 02. + 140 IF (CORDER) THEN + CALL ZCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G, 0 ) + END IF + IF (RORDER) THEN + CALL ZCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G, 1 ) + END IF + GO TO 200 +* Test ZHEMV, 03, ZHBMV, 04, and ZHPMV, 05. + 150 IF (CORDER) THEN + CALL ZCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G, 0 ) + END IF + IF (RORDER) THEN + CALL ZCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G, 1 ) + END IF + GO TO 200 +* Test ZTRMV, 06, ZTBMV, 07, ZTPMV, 08, +* ZTRSV, 09, ZTBSV, 10, and ZTPSV, 11. + 160 IF (CORDER) THEN + CALL ZCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z, + $ 0 ) + END IF + IF (RORDER) THEN + CALL ZCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z, + $ 1 ) + END IF + GO TO 200 +* Test ZGERC, 12, ZGERU, 13. + 170 IF (CORDER) THEN + CALL ZCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 0 ) + END IF + IF (RORDER) THEN + CALL ZCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 1 ) + END IF + GO TO 200 +* Test ZHER, 14, and ZHPR, 15. + 180 IF (CORDER) THEN + CALL ZCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 0 ) + END IF + IF (RORDER) THEN + CALL ZCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 1 ) + END IF + GO TO 200 +* Test ZHER2, 16, and ZHPR2, 17. 
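+* Each routine is checked with column-major and/or row-major storage
+* according to the CORDER and RORDER flags read from the data file.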
+ 190 IF (CORDER) THEN + CALL ZCHK6( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 0 ) + END IF + IF (RORDER) THEN + CALL ZCHK6( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z, 1 ) + END IF +* + 200 IF( FATAL.AND.SFATAL ) + $ GO TO 220 + END IF + 210 CONTINUE + WRITE( NOUT, FMT = 9982 ) + GO TO 240 +* + 220 CONTINUE + WRITE( NOUT, FMT = 9981 ) + GO TO 240 +* + 230 CONTINUE + WRITE( NOUT, FMT = 9987 ) +* + 240 CONTINUE + IF( TRACE ) + $ CLOSE ( NTRA ) + CLOSE ( NOUT ) + STOP +* +10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) +10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' ) +10000 FORMAT( ' COLUMN-MAJOR DATA LAYOUT IS TESTED' ) + 9999 FORMAT(' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', + $ 'S THAN', F8.2 ) + 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, E9.1 ) + 9997 FORMAT(' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', + $ 'THAN ', I2 ) + 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) + 9995 FORMAT( ' VALUE OF K IS LESS THAN 0' ) + 9994 FORMAT( ' ABSOLUTE VALUE OF INCX OR INCY IS 0 OR GREATER THAN ', + $ I2 ) + 9993 FORMAT(' TESTS OF THE COMPLEX*16 LEVEL 2 BLAS', //' THE F', + $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) + 9992 FORMAT( ' FOR N ', 9I6 ) + 9991 FORMAT( ' FOR K ', 7I6 ) + 9990 FORMAT( ' FOR INCX AND INCY ', 7I6 ) + 9989 FORMAT( ' FOR ALPHA ', + $ 7('(', F4.1, ',', F4.1, ') ', : ) ) + 9988 FORMAT( ' FOR BETA ', + $ 7('(', F4.1, ',', F4.1, ') ', : ) ) + 9987 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', + $ /' ******* TESTS ABANDONED *******' ) + 9986 FORMAT(' SUBPROGRAM NAME ',A12, ' NOT RECOGNIZED', /' ******* T', + $ 'ESTS ABANDONED *******' ) + 9985 FORMAT(' ERROR IN CMVCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', + $ 'ATED WRONGLY.', /' CMVCH WAS CALLED WITH TRANS = ', A1, + $ ' AND RETURNED SAME = ', L1, ' AND ERR = ', F12.3, '.', / + $ ' THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.' + $ , /' ******* TESTS ABANDONED *******' ) + 9984 FORMAT( A12, L2 ) + 9983 FORMAT( 1X,A12, ' WAS NOT TESTED' ) + 9982 FORMAT( /' END OF TESTS' ) + 9981 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) + 9980 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) +* +* End of ZBLAT2. +* + END + SUBROUTINE ZCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, + $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, + $ XS, Y, YY, YS, YT, G, IORDER ) +* +* Tests CGEMV and CGBMV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX*16 ZERO, HALF + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ HALF = ( 0.5D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, + $ NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. 
+ COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), BET( NBET ), X( NMAX ), + $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), + $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, ALS, BETA, BLS, TRANSL + DOUBLE PRECISION ERR, ERRMAX + INTEGER I, IA, IB, IC, IKU, IM, IN, INCX, INCXS, INCY, + $ INCYS, IX, IY, KL, KLS, KU, KUS, LAA, LDA, + $ LDAS, LX, LY, M, ML, MS, N, NARGS, NC, ND, NK, + $ NL, NS + LOGICAL BANDED, FULL, NULL, RESET, SAME, TRAN + CHARACTER*1 TRANS, TRANSS + CHARACTER*14 CTRANS + CHARACTER*3 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL CZGBMV, CZGEMV, ZMAKE, ZMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICH/'NTC'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'e' + BANDED = SNAME( 9: 9 ).EQ.'b' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 11 + ELSE IF( BANDED )THEN + NARGS = 13 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 120 IN = 1, NIDIM + N = IDIM( IN ) + ND = N/2 + 1 +* + DO 110 IM = 1, 2 + IF( IM.EQ.1 ) + $ M = MAX( N - ND, 0 ) + IF( IM.EQ.2 ) + $ M = MIN( N + ND, NMAX ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IKU = 1, NK + IF( BANDED )THEN + KU = KB( IKU ) + KL = MAX( KU - 1, 0 ) + ELSE + KU = N - 1 + KL = M - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = KL + KU + 1 + ELSE + LDA = M + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + LAA = LDA*N + NULL = N.LE.0.OR.M.LE.0 +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL ZMAKE( SNAME( 8: 9 ), ' ', ' ', M, N, A, NMAX, AA, + $ LDA, KL, KU, RESET, TRANSL ) +* + DO 90 IC = 1, 3 + TRANS = ICH( IC: IC ) + IF (TRANS.EQ.'N')THEN + CTRANS = ' CblasNoTrans' + ELSE IF (TRANS.EQ.'T')THEN + CTRANS = ' CblasTrans' + ELSE + CTRANS = 'CblasConjTrans' + END IF + TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' +* + IF( TRAN )THEN + ML = N + NL = M + ELSE + ML = M + NL = N + END IF +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*NL +* +* Generate the vector X. +* + TRANSL = HALF + CALL ZMAKE( 'ge', ' ', ' ', 1, NL, X, 1, XX, + $ ABS( INCX ), 0, NL - 1, RESET, TRANSL ) + IF( NL.GT.1 )THEN + X( NL/2 ) = ZERO + XX( 1 + ABS( INCX )*( NL/2 - 1 ) ) = ZERO + END IF +* + DO 70 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*ML +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL ZMAKE( 'ge', ' ', ' ', 1, ML, Y, 1, + $ YY, ABS( INCY ), 0, ML - 1, + $ RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + TRANSS = TRANS + MS = M + NS = N + KLS = KL + KUS = KU + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + BLS = BETA + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. 
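+*                          The CZGEMV and CZGBMV test wrappers take IORDER
+*                          first, so the same checks cover the column-major
+*                          and row-major interfaces.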
+* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ CTRANS, M, N, ALPHA, LDA, INCX, BETA, + $ INCY + IF( REWI ) + $ REWIND NTRA + CALL CZGEMV( IORDER, TRANS, M, N, + $ ALPHA, AA, LDA, XX, INCX, + $ BETA, YY, INCY ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ CTRANS, M, N, KL, KU, ALPHA, LDA, + $ INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL CZGBMV( IORDER, TRANS, M, N, KL, + $ KU, ALPHA, AA, LDA, XX, + $ INCX, BETA, YY, INCY ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 130 + END IF +* +* See what data changed inside subroutines. +* +* IF(TRANS .NE. 'C' .OR. (INCX .GT. 0 .AND. INCY .GT. 0)) THEN + ISAME( 1 ) = TRANS.EQ.TRANSS + ISAME( 2 ) = MS.EQ.M + ISAME( 3 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 4 ) = ALS.EQ.ALPHA + ISAME( 5 ) = LZE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + ISAME( 7 ) = LZE( XS, XX, LX ) + ISAME( 8 ) = INCXS.EQ.INCX + ISAME( 9 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 10 ) = LZE( YS, YY, LY ) + ELSE + ISAME( 10 ) = LZERES( 'ge', ' ', 1, + $ ML, YS, YY, + $ ABS( INCY ) ) + END IF + ISAME( 11 ) = INCYS.EQ.INCY + ELSE IF( BANDED )THEN + ISAME( 4 ) = KLS.EQ.KL + ISAME( 5 ) = KUS.EQ.KU + ISAME( 6 ) = ALS.EQ.ALPHA + ISAME( 7 ) = LZE( AS, AA, LAA ) + ISAME( 8 ) = LDAS.EQ.LDA + ISAME( 9 ) = LZE( XS, XX, LX ) + ISAME( 10 ) = INCXS.EQ.INCX + ISAME( 11 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 12 ) = LZE( YS, YY, LY ) + ELSE + ISAME( 12 ) = LZERES( 'ge', ' ', 1, + $ ML, YS, YY, + $ ABS( INCY ) ) + END IF + ISAME( 13 ) = INCYS.EQ.INCY + END IF +* +* If data was incorrectly changed, report +* and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 130 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL ZMVCH( TRANS, M, N, ALPHA, A, + $ NMAX, X, INCX, BETA, Y, + $ INCY, YT, G, YY, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 130 + ELSE +* Avoid repeating tests with M.le.0 or +* N.le.0. + GO TO 110 + END IF +* END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 140 +* + 130 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CTRANS, M, N, ALPHA, LDA, + $ INCX, BETA, INCY + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, CTRANS, M, N, KL, KU, + $ ALPHA, LDA, INCX, BETA, INCY + END IF +* + 140 CONTINUE + RETURN +* + 9999 FORMAT(' ',A12, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT(' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', 4( I3, ',' ), '(', + $ F4.1, ',', F4.1, '), A,',/ 10x, I3, ', X,', I2, ',(', + $ F4.1, ',', F4.1, '), Y,', I2, ') .' 
) + 9994 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', 2( I3, ',' ), '(', + $ F4.1, ',', F4.1, '), A,',/ 10x, I3, ', X,', I2, ',(', + $ F4.1, ',', F4.1, '), Y,', I2, ') .' ) + 9993 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of ZCHK1. +* + END + SUBROUTINE ZCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, + $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, + $ XS, Y, YY, YS, YT, G, IORDER ) +* +* Tests CHEMV, CHBMV and CHPMV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX*16 ZERO, HALF + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ HALF = ( 0.5D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, + $ NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), BET( NBET ), X( NMAX ), + $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), + $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, ALS, BETA, BLS, TRANSL + DOUBLE PRECISION ERR, ERRMAX + INTEGER I, IA, IB, IC, IK, IN, INCX, INCXS, INCY, + $ INCYS, IX, IY, K, KS, LAA, LDA, LDAS, LX, LY, + $ N, NARGS, NC, NK, NS + LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME + CHARACTER*1 UPLO, UPLOS + CHARACTER*14 CUPLO + CHARACTER*2 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL CZHBMV, CZHEMV, CZHPMV, ZMAKE, ZMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'e' + BANDED = SNAME( 9: 9 ).EQ.'b' + PACKED = SNAME( 9: 9 ).EQ.'p' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 10 + ELSE IF( BANDED )THEN + NARGS = 11 + ELSE IF( PACKED )THEN + NARGS = 9 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 110 IN = 1, NIDIM + N = IDIM( IN ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IK = 1, NK + IF( BANDED )THEN + K = KB( IK ) + ELSE + K = N - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = K + 1 + ELSE + LDA = N + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF + NULL = N.LE.0 +* + DO 90 IC = 1, 2 + UPLO = ICH( IC: IC ) + IF (UPLO.EQ.'U')THEN + CUPLO = ' CblasUpper' + ELSE + CUPLO = ' CblasLower' + END IF +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL ZMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, A, NMAX, AA, + $ LDA, K, K, RESET, TRANSL ) +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. 
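+*                When N is greater than one, the midpoint of X and the
+*                matching entry of the strided copy XX are zeroed so a
+*                zero element is always exercised.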
+* + TRANSL = HALF + CALL ZMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, + $ ABS( INCX ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 70 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL ZMAKE( 'ge', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, + $ TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + UPLOS = UPLO + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + BLS = BETA + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ CUPLO, N, ALPHA, LDA, INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL CZHEMV( IORDER, UPLO, N, ALPHA, AA, + $ LDA, XX, INCX, BETA, YY, + $ INCY ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ CUPLO, N, K, ALPHA, LDA, INCX, BETA, + $ INCY + IF( REWI ) + $ REWIND NTRA + CALL CZHBMV( IORDER, UPLO, N, K, ALPHA, + $ AA, LDA, XX, INCX, BETA, + $ YY, INCY ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ CUPLO, N, ALPHA, INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL CZHPMV( IORDER, UPLO, N, ALPHA, AA, + $ XX, INCX, BETA, YY, INCY ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LZE( AS, AA, LAA ) + ISAME( 5 ) = LDAS.EQ.LDA + ISAME( 6 ) = LZE( XS, XX, LX ) + ISAME( 7 ) = INCXS.EQ.INCX + ISAME( 8 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 9 ) = LZE( YS, YY, LY ) + ELSE + ISAME( 9 ) = LZERES( 'ge', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 10 ) = INCYS.EQ.INCY + ELSE IF( BANDED )THEN + ISAME( 3 ) = KS.EQ.K + ISAME( 4 ) = ALS.EQ.ALPHA + ISAME( 5 ) = LZE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + ISAME( 7 ) = LZE( XS, XX, LX ) + ISAME( 8 ) = INCXS.EQ.INCX + ISAME( 9 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 10 ) = LZE( YS, YY, LY ) + ELSE + ISAME( 10 ) = LZERES( 'ge', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 11 ) = INCYS.EQ.INCY + ELSE IF( PACKED )THEN + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LZE( AS, AA, LAA ) + ISAME( 5 ) = LZE( XS, XX, LX ) + ISAME( 6 ) = INCXS.EQ.INCX + ISAME( 7 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 8 ) = LZE( YS, YY, LY ) + ELSE + ISAME( 8 ) = LZERES( 'ge', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 9 ) = INCYS.EQ.INCY + END IF +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL ZMVCH( 'N', N, N, ALPHA, A, NMAX, X, + $ INCX, BETA, Y, INCY, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. 
+ IF( FATAL ) + $ GO TO 120 + ELSE +* Avoid repeating tests with N.le.0 + GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, N, ALPHA, LDA, INCX, + $ BETA, INCY + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, N, K, ALPHA, LDA, + $ INCX, BETA, INCY + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, CUPLO, N, ALPHA, INCX, + $ BETA, INCY + END IF +* + 130 CONTINUE + RETURN +* + 9999 FORMAT(' ',A12, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT(' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',(', F4.1, ',', + $ F4.1, '), AP, X,',/ 10x, I2, ',(', F4.1, ',', F4.1, + $ '), Y,', I2, ') .' ) + 9994 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', 2( I3, ',' ), '(', + $ F4.1, ',', F4.1, '), A,', I3, ', X,',/ 10x, I2, ',(', + $ F4.1, ',', F4.1, '), Y,', I2, ') .' ) + 9993 FORMAT( 1X, I6, ': ',A12, '(', A14, ',', I3, ',(', F4.1, ',', + $ F4.1, '), A,', I3, ', X,',/ 10x, I2, ',(', F4.1, ',', + $ F4.1, '), ', 'Y,', I2, ') .' ) + 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CZHK2. +* + END + SUBROUTINE ZCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, XT, G, Z, IORDER ) +* +* Tests ZTRMV, ZTBMV, ZTPMV, ZTRSV, ZTBSV and ZTPSV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX*16 ZERO, HALF, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ HALF = ( 0.5D0, 0.0D0 ), + $ ONE = ( 1.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NIDIM, NINC, NKB, NMAX, NOUT, NTRA, + $ IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), + $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), + $ XT( NMAX ), XX( NMAX*INCMAX ), Z( NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + COMPLEX*16 TRANSL + DOUBLE PRECISION ERR, ERRMAX + INTEGER I, ICD, ICT, ICU, IK, IN, INCX, INCXS, IX, K, + $ KS, LAA, LDA, LDAS, LX, N, NARGS, NC, NK, NS + LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME + CHARACTER*1 DIAG, DIAGS, TRANS, TRANSS, UPLO, UPLOS + CHARACTER*14 CUPLO,CTRANS,CDIAG + CHARACTER*2 ICHD, ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL ZMAKE, ZMVCH, CZTBMV, CZTBSV, CZTPMV, + $ CZTPSV, CZTRMV, CZTRSV +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. 
+ COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'r' + BANDED = SNAME( 9: 9 ).EQ.'b' + PACKED = SNAME( 9: 9 ).EQ.'p' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 8 + ELSE IF( BANDED )THEN + NARGS = 9 + ELSE IF( PACKED )THEN + NARGS = 7 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* Set up zero vector for ZMVCH. + DO 10 I = 1, NMAX + Z( I ) = ZERO + 10 CONTINUE +* + DO 110 IN = 1, NIDIM + N = IDIM( IN ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IK = 1, NK + IF( BANDED )THEN + K = KB( IK ) + ELSE + K = N - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = K + 1 + ELSE + LDA = N + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF + NULL = N.LE.0 +* + DO 90 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + IF (UPLO.EQ.'U')THEN + CUPLO = ' CblasUpper' + ELSE + CUPLO = ' CblasLower' + END IF +* + DO 80 ICT = 1, 3 + TRANS = ICHT( ICT: ICT ) + IF (TRANS.EQ.'N')THEN + CTRANS = ' CblasNoTrans' + ELSE IF (TRANS.EQ.'T')THEN + CTRANS = ' CblasTrans' + ELSE + CTRANS = 'CblasConjTrans' + END IF +* + DO 70 ICD = 1, 2 + DIAG = ICHD( ICD: ICD ) + IF (DIAG.EQ.'N')THEN + CDIAG = ' CblasNonUnit' + ELSE + CDIAG = ' CblasUnit' + END IF +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL ZMAKE( SNAME( 8: 9 ), UPLO, DIAG, N, N, A, + $ NMAX, AA, LDA, K, K, RESET, TRANSL ) +* + DO 60 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL ZMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, + $ ABS( INCX ), 0, N - 1, RESET, + $ TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + DIAGS = DIAG + NS = N + KS = K + DO 20 I = 1, LAA + AS( I ) = AA( I ) + 20 CONTINUE + LDAS = LDA + DO 30 I = 1, LX + XS( I ) = XX( I ) + 30 CONTINUE + INCXS = INCX +* +* Call the subroutine. 
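+*                     The 'mv' branch drives the CZTRMV/CZTBMV/CZTPMV
+*                     wrappers and the 'sv' branch the corresponding
+*                     triangular solves, for the storage class chosen above.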
+* + IF( SNAME( 4: 5 ).EQ.'mv' )THEN + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CZTRMV( IORDER, UPLO, TRANS, DIAG, + $ N, AA, LDA, XX, INCX ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, K, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CZTBMV( IORDER, UPLO, TRANS, DIAG, + $ N, K, AA, LDA, XX, INCX ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, INCX + IF( REWI ) + $ REWIND NTRA + CALL CZTPMV( IORDER, UPLO, TRANS, DIAG, + $ N, AA, XX, INCX ) + END IF + ELSE IF( SNAME( 4: 5 ).EQ.'sv' )THEN + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CZTRSV( IORDER, UPLO, TRANS, DIAG, + $ N, AA, LDA, XX, INCX ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, K, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CZTBSV( IORDER, UPLO, TRANS, DIAG, + $ N, K, AA, LDA, XX, INCX ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ CUPLO, CTRANS, CDIAG, N, INCX + IF( REWI ) + $ REWIND NTRA + CALL CZTPSV( IORDER, UPLO, TRANS, DIAG, + $ N, AA, XX, INCX ) + END IF + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = TRANS.EQ.TRANSS + ISAME( 3 ) = DIAG.EQ.DIAGS + ISAME( 4 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 5 ) = LZE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 7 ) = LZE( XS, XX, LX ) + ELSE + ISAME( 7 ) = LZERES( 'ge', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 8 ) = INCXS.EQ.INCX + ELSE IF( BANDED )THEN + ISAME( 5 ) = KS.EQ.K + ISAME( 6 ) = LZE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 8 ) = LZE( XS, XX, LX ) + ELSE + ISAME( 8 ) = LZERES( 'ge', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 9 ) = INCXS.EQ.INCX + ELSE IF( PACKED )THEN + ISAME( 5 ) = LZE( AS, AA, LAA ) + IF( NULL )THEN + ISAME( 6 ) = LZE( XS, XX, LX ) + ELSE + ISAME( 6 ) = LZERES( 'ge', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 7 ) = INCXS.EQ.INCX + END IF +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN + IF( SNAME( 4: 5 ).EQ.'mv' )THEN +* +* Check the result. +* + CALL ZMVCH( TRANS, N, N, ONE, A, NMAX, X, + $ INCX, ZERO, Z, INCX, XT, G, + $ XX, EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + ELSE IF( SNAME( 4: 5 ).EQ.'sv' )THEN +* +* Compute approximation to original vector. +* + DO 50 I = 1, N + Z( I ) = XX( 1 + ( I - 1 )* + $ ABS( INCX ) ) + XX( 1 + ( I - 1 )*ABS( INCX ) ) + $ = X( I ) + 50 CONTINUE + CALL ZMVCH( TRANS, N, N, ONE, A, NMAX, Z, + $ INCX, ZERO, X, INCX, XT, G, + $ XX, EPS, ERR, FATAL, NOUT, + $ .FALSE. ) + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 120 + ELSE +* Avoid repeating tests with N.le.0. + GO TO 110 + END IF +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. 
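+* ERRMAX holds the largest test ratio seen; a value below THRESH reports
+* the routine as passed, otherwise it is flagged as suspect.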
+* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, CTRANS, CDIAG, N, + $ LDA, INCX + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, CTRANS, CDIAG, N, K, + $ LDA, INCX + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, CUPLO, CTRANS, CDIAG, N, + $ INCX + END IF +* + 130 CONTINUE + RETURN +* + 9999 FORMAT(' ',A12, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT(' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT(1X, I6, ': ',A12, '(', 3( A14, ',' ),/ 10x, I3, ', AP, ', + $ 'X,', I2, ') .' ) + 9994 FORMAT(1X, I6, ': ',A12, '(', 3( A14, ',' ),/ 10x, 2( I3, ',' ), + $ ' A,', I3, ', X,', I2, ') .' ) + 9993 FORMAT( 1X, I6, ': ',A12, '(', 3( A14, ',' ),/ 10x, I3, ', A,', + $ I3, ', X,', I2, ') .' ) + 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of ZCHK3. +* + END + SUBROUTINE ZCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z, IORDER ) +* +* Tests ZGERC and ZGERU. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX*16 ZERO, HALF, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ HALF = ( 0.5D0, 0.0D0 ), + $ ONE = ( 1.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA, + $ IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, ALS, TRANSL + DOUBLE PRECISION ERR, ERRMAX + INTEGER I, IA, IM, IN, INCX, INCXS, INCY, INCYS, IX, + $ IY, J, LAA, LDA, LDAS, LX, LY, M, MS, N, NARGS, + $ NC, ND, NS + LOGICAL CONJ, NULL, RESET, SAME +* .. Local Arrays .. + COMPLEX*16 W( 1 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL CZGERC, CZGERU, ZMAKE, ZMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, DCONJG, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Executable Statements .. + CONJ = SNAME( 5: 5 ).EQ.'c' +* Define the number of arguments. + NARGS = 9 +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 120 IN = 1, NIDIM + N = IDIM( IN ) + ND = N/2 + 1 +* + DO 110 IM = 1, 2 + IF( IM.EQ.1 ) + $ M = MAX( N - ND, 0 ) + IF( IM.EQ.2 ) + $ M = MIN( N + ND, NMAX ) +* +* Set LDA to 1 more than minimum value if room. + LDA = M + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. 
+ IF( LDA.GT.NMAX ) + $ GO TO 110 + LAA = LDA*N + NULL = N.LE.0.OR.M.LE.0 +* + DO 100 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*M +* +* Generate the vector X. +* + TRANSL = HALF + CALL ZMAKE( 'ge', ' ', ' ', 1, M, X, 1, XX, ABS( INCX ), + $ 0, M - 1, RESET, TRANSL ) + IF( M.GT.1 )THEN + X( M/2 ) = ZERO + XX( 1 + ABS( INCX )*( M/2 - 1 ) ) = ZERO + END IF +* + DO 90 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL ZMAKE( 'ge', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + Y( N/2 ) = ZERO + YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 80 IA = 1, NALF + ALPHA = ALF( IA ) +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL ZMAKE(SNAME( 8: 9 ), ' ', ' ', M, N, A, NMAX, + $ AA, LDA, M - 1, N - 1, RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + MS = M + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, M, N, + $ ALPHA, INCX, INCY, LDA + IF( CONJ )THEN + IF( REWI ) + $ REWIND NTRA + CALL CZGERC( IORDER, M, N, ALPHA, XX, INCX, + $ YY, INCY, AA, LDA ) + ELSE + IF( REWI ) + $ REWIND NTRA + CALL CZGERU( IORDER, M, N, ALPHA, XX, INCX, + $ YY, INCY, AA, LDA ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 140 + END IF +* +* See what data changed inside subroutine. +* + ISAME( 1 ) = MS.EQ.M + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LZE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + ISAME( 6 ) = LZE( YS, YY, LY ) + ISAME( 7 ) = INCYS.EQ.INCY + IF( NULL )THEN + ISAME( 8 ) = LZE( AS, AA, LAA ) + ELSE + ISAME( 8 ) = LZERES( 'ge', ' ', M, N, AS, AA, + $ LDA ) + END IF + ISAME( 9 ) = LDAS.EQ.LDA +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 140 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( INCX.GT.0 )THEN + DO 50 I = 1, M + Z( I ) = X( I ) + 50 CONTINUE + ELSE + DO 60 I = 1, M + Z( I ) = X( M - I + 1 ) + 60 CONTINUE + END IF + DO 70 J = 1, N + IF( INCY.GT.0 )THEN + W( 1 ) = Y( J ) + ELSE + W( 1 ) = Y( N - J + 1 ) + END IF + IF( CONJ ) + $ W( 1 ) = DCONJG( W( 1 ) ) + CALL ZMVCH( 'N', M, 1, ALPHA, Z, NMAX, W, 1, + $ ONE, A( 1, J ), 1, YT, G, + $ AA( 1 + ( J - 1 )*LDA ), EPS, + $ ERR, FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 130 + 70 CONTINUE + ELSE +* Avoid repeating tests with M.le.0 or N.le.0. + GO TO 110 + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 150 +* + 130 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 140 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9994 )NC, SNAME, M, N, ALPHA, INCX, INCY, LDA +* + 150 CONTINUE + RETURN +* + 9999 FORMAT(' ',A12, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT(' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT(1X, I6, ': ',A12, '(', 2( I3, ',' ), '(', F4.1, ',', F4.1, + $ '), X,', I2, ', Y,', I2, ', A,', I3, ') .' ) + 9993 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of ZCHK4. +* + END + SUBROUTINE ZCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z, IORDER ) +* +* Tests ZHER and ZHPR. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX*16 ZERO, HALF, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ HALF = ( 0.5D0, 0.0D0 ), + $ ONE = ( 1.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA, + $ IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, TRANSL + DOUBLE PRECISION ERR, ERRMAX, RALPHA, RALS + INTEGER I, IA, IC, IN, INCX, INCXS, IX, J, JA, JJ, LAA, + $ LDA, LDAS, LJ, LX, N, NARGS, NC, NS + LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER + CHARACTER*1 UPLO, UPLOS + CHARACTER*14 CUPLO + CHARACTER*2 ICH +* .. Local Arrays .. + COMPLEX*16 W( 1 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL CZHER, CZHPR, ZMAKE, ZMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, DCMPLX, DCONJG, MAX, DBLE +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'e' + PACKED = SNAME( 9: 9 ).EQ.'p' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 7 + ELSE IF( PACKED )THEN + NARGS = 6 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDA to 1 more than minimum value if room. + LDA = N + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. 
+ IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF +* + DO 90 IC = 1, 2 + UPLO = ICH( IC: IC ) + IF (UPLO.EQ.'U')THEN + CUPLO = ' CblasUpper' + ELSE + CUPLO = ' CblasLower' + END IF + UPPER = UPLO.EQ.'U' +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL ZMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), + $ 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 70 IA = 1, NALF + RALPHA = DBLE( ALF( IA ) ) + ALPHA = DCMPLX( RALPHA, RZERO ) + NULL = N.LE.0.OR.RALPHA.EQ.RZERO +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL ZMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, A, NMAX, + $ AA, LDA, N - 1, N - 1, RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + NS = N + RALS = RALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, CUPLO, N, + $ RALPHA, INCX, LDA + IF( REWI ) + $ REWIND NTRA + CALL CZHER( IORDER, UPLO, N, RALPHA, XX, + $ INCX, AA, LDA ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, CUPLO, N, + $ RALPHA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CZHPR( IORDER, UPLO, N, RALPHA, + $ XX, INCX, AA ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = RALS.EQ.RALPHA + ISAME( 4 ) = LZE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + IF( NULL )THEN + ISAME( 6 ) = LZE( AS, AA, LAA ) + ELSE + ISAME( 6 ) = LZERES( SNAME( 8: 9 ), UPLO, N, N, AS, + $ AA, LDA ) + END IF + IF( .NOT.PACKED )THEN + ISAME( 7 ) = LDAS.EQ.LDA + END IF +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 30 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 30 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( INCX.GT.0 )THEN + DO 40 I = 1, N + Z( I ) = X( I ) + 40 CONTINUE + ELSE + DO 50 I = 1, N + Z( I ) = X( N - I + 1 ) + 50 CONTINUE + END IF + JA = 1 + DO 60 J = 1, N + W( 1 ) = DCONJG( Z( J ) ) + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + CALL ZMVCH( 'N', LJ, 1, ALPHA, Z( JJ ), LJ, W, + $ 1, ONE, A( JJ, J ), 1, YT, G, + $ AA( JA ), EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + IF( FULL )THEN + IF( UPPER )THEN + JA = JA + LDA + ELSE + JA = JA + LDA + 1 + END IF + ELSE + JA = JA + LJ + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 110 + 60 CONTINUE + ELSE +* Avoid repeating tests if N.le.0. + IF( N.LE.0 ) + $ GO TO 100 + END IF +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 110 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, N, RALPHA, INCX, LDA + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, N, RALPHA, INCX + END IF +* + 130 CONTINUE + RETURN +* + 9999 FORMAT(' ',A12, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT(' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT(1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', X,', + $ I2, ', AP) .' ) + 9993 FORMAT(1X, I6, ': ',A12, '(', A14, ',', I3, ',', F4.1, ', X,', + $ I2, ', A,', I3, ') .' ) + 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CZHK5. +* + END + SUBROUTINE ZCHK6( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z, IORDER ) +* +* Tests ZHER2 and ZHPR2. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX*16 ZERO, HALF, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ HALF = ( 0.5D0, 0.0D0 ), + $ ONE = ( 1.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA, + $ IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX, 2 ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, ALS, TRANSL + DOUBLE PRECISION ERR, ERRMAX + INTEGER I, IA, IC, IN, INCX, INCXS, INCY, INCYS, IX, + $ IY, J, JA, JJ, LAA, LDA, LDAS, LJ, LX, LY, N, + $ NARGS, NC, NS + LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER + CHARACTER*1 UPLO, UPLOS + CHARACTER*14 CUPLO + CHARACTER*2 ICH +* .. Local Arrays .. + COMPLEX*16 W( 2 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL CZHER2, CZHPR2, ZMAKE, ZMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, DCONJG, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 9: 9 ).EQ.'e' + PACKED = SNAME( 9: 9 ).EQ.'p' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 9 + ELSE IF( PACKED )THEN + NARGS = 8 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 140 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDA to 1 more than minimum value if room. + LDA = N + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. 
+ IF( LDA.GT.NMAX ) + $ GO TO 140 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF +* + DO 130 IC = 1, 2 + UPLO = ICH( IC: IC ) + IF (UPLO.EQ.'U')THEN + CUPLO = ' CblasUpper' + ELSE + CUPLO = ' CblasLower' + END IF + UPPER = UPLO.EQ.'U' +* + DO 120 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL ZMAKE( 'ge', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), + $ 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 110 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL ZMAKE( 'ge', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + Y( N/2 ) = ZERO + YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 100 IA = 1, NALF + ALPHA = ALF( IA ) + NULL = N.LE.0.OR.ALPHA.EQ.ZERO +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL ZMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, A, + $ NMAX, AA, LDA, N - 1, N - 1, RESET, + $ TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, CUPLO, N, + $ ALPHA, INCX, INCY, LDA + IF( REWI ) + $ REWIND NTRA + CALL CZHER2( IORDER, UPLO, N, ALPHA, XX, INCX, + $ YY, INCY, AA, LDA ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, CUPLO, N, + $ ALPHA, INCX, INCY + IF( REWI ) + $ REWIND NTRA + CALL CZHPR2( IORDER, UPLO, N, ALPHA, XX, INCX, + $ YY, INCY, AA ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 160 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LZE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + ISAME( 6 ) = LZE( YS, YY, LY ) + ISAME( 7 ) = INCYS.EQ.INCY + IF( NULL )THEN + ISAME( 8 ) = LZE( AS, AA, LAA ) + ELSE + ISAME( 8 ) = LZERES( SNAME( 8: 9 ), UPLO, N, N, + $ AS, AA, LDA ) + END IF + IF( .NOT.PACKED )THEN + ISAME( 9 ) = LDAS.EQ.LDA + END IF +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 160 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( INCX.GT.0 )THEN + DO 50 I = 1, N + Z( I, 1 ) = X( I ) + 50 CONTINUE + ELSE + DO 60 I = 1, N + Z( I, 1 ) = X( N - I + 1 ) + 60 CONTINUE + END IF + IF( INCY.GT.0 )THEN + DO 70 I = 1, N + Z( I, 2 ) = Y( I ) + 70 CONTINUE + ELSE + DO 80 I = 1, N + Z( I, 2 ) = Y( N - I + 1 ) + 80 CONTINUE + END IF + JA = 1 + DO 90 J = 1, N + W( 1 ) = ALPHA*DCONJG( Z( J, 2 ) ) + W( 2 ) = DCONJG( ALPHA )*DCONJG( Z( J, 1 ) ) + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + CALL ZMVCH( 'N', LJ, 2, ONE, Z( JJ, 1 ), + $ NMAX, W, 1, ONE, A( JJ, J ), 1, + $ YT, G, AA( JA ), EPS, ERR, FATAL, + $ NOUT, .TRUE. 
) + IF( FULL )THEN + IF( UPPER )THEN + JA = JA + LDA + ELSE + JA = JA + LDA + 1 + END IF + ELSE + JA = JA + LJ + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 150 + 90 CONTINUE + ELSE +* Avoid repeating tests with N.le.0. + IF( N.LE.0 ) + $ GO TO 140 + END IF +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* + 140 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 170 +* + 150 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 160 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, CUPLO, N, ALPHA, INCX, + $ INCY, LDA + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, CUPLO, N, ALPHA, INCX, INCY + END IF +* + 170 CONTINUE + RETURN +* + 9999 FORMAT(' ',A12, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT(' ',A12, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ',A12, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT(1X, I6, ': ',A12, '(', A14, ',', I3, ',(', F4.1, ',', + $ F4.1, '), X,', I2, ', Y,', I2, ', AP) .' ) + 9993 FORMAT(1X, I6, ': ',A12, '(', A14, ',', I3, ',(', F4.1, ',', + $ F4.1, '), X,', I2, ', Y,', I2, ', A,', I3, ') .' ) + 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of ZCHK6. +* + END + SUBROUTINE ZMVCH( TRANS, M, N, ALPHA, A, NMAX, X, INCX, BETA, Y, + $ INCY, YT, G, YY, EPS, ERR, FATAL, NOUT, MV ) +* +* Checks the results of the computational tests. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO, RONE + PARAMETER ( RZERO = 0.0D0, RONE = 1.0D0 ) +* .. Scalar Arguments .. + COMPLEX*16 ALPHA, BETA + DOUBLE PRECISION EPS, ERR + INTEGER INCX, INCY, M, N, NMAX, NOUT + LOGICAL FATAL, MV + CHARACTER*1 TRANS +* .. Array Arguments .. + COMPLEX*16 A( NMAX, * ), X( * ), Y( * ), YT( * ), YY( * ) + DOUBLE PRECISION G( * ) +* .. Local Scalars .. + COMPLEX*16 C + DOUBLE PRECISION ERRI + INTEGER I, INCXL, INCYL, IY, J, JX, KX, KY, ML, NL + LOGICAL CTRAN, TRAN +* .. Intrinsic Functions .. + INTRINSIC ABS, DIMAG, DCONJG, MAX, DBLE, SQRT +* .. Statement Functions .. + DOUBLE PRECISION ABS1 +* .. Statement Function definitions .. + ABS1( C ) = ABS( DBLE( C ) ) + ABS( DIMAG( C ) ) +* .. Executable Statements .. + TRAN = TRANS.EQ.'T' + CTRAN = TRANS.EQ.'C' + IF( TRAN.OR.CTRAN )THEN + ML = N + NL = M + ELSE + ML = M + NL = N + END IF + IF( INCX.LT.0 )THEN + KX = NL + INCXL = -1 + ELSE + KX = 1 + INCXL = 1 + END IF + IF( INCY.LT.0 )THEN + KY = ML + INCYL = -1 + ELSE + KY = 1 + INCYL = 1 + END IF +* +* Compute expected result in YT using data in A, X and Y. +* Compute gauges in G. 
+* + IY = KY + DO 40 I = 1, ML + YT( IY ) = ZERO + G( IY ) = RZERO + JX = KX + IF( TRAN )THEN + DO 10 J = 1, NL + YT( IY ) = YT( IY ) + A( J, I )*X( JX ) + G( IY ) = G( IY ) + ABS1( A( J, I ) )*ABS1( X( JX ) ) + JX = JX + INCXL + 10 CONTINUE + ELSE IF( CTRAN )THEN + DO 20 J = 1, NL + YT( IY ) = YT( IY ) + DCONJG( A( J, I ) )*X( JX ) + G( IY ) = G( IY ) + ABS1( A( J, I ) )*ABS1( X( JX ) ) + JX = JX + INCXL + 20 CONTINUE + ELSE + DO 30 J = 1, NL + YT( IY ) = YT( IY ) + A( I, J )*X( JX ) + G( IY ) = G( IY ) + ABS1( A( I, J ) )*ABS1( X( JX ) ) + JX = JX + INCXL + 30 CONTINUE + END IF + YT( IY ) = ALPHA*YT( IY ) + BETA*Y( IY ) + G( IY ) = ABS1( ALPHA )*G( IY ) + ABS1( BETA )*ABS1( Y( IY ) ) + IY = IY + INCYL + 40 CONTINUE +* +* Compute the error ratio for this result. +* + ERR = ZERO + DO 50 I = 1, ML + ERRI = ABS( YT( I ) - YY( 1 + ( I - 1 )*ABS( INCY ) ) )/EPS + IF( G( I ).NE.RZERO ) + $ ERRI = ERRI/G( I ) + ERR = MAX( ERR, ERRI ) + IF( ERR*SQRT( EPS ).GE.RONE ) + $ GO TO 60 + 50 CONTINUE +* If the loop completes, all results are at least half accurate. + GO TO 80 +* +* Report fatal error. +* + 60 FATAL = .TRUE. + WRITE( NOUT, FMT = 9999 ) + DO 70 I = 1, ML + IF( MV )THEN + WRITE( NOUT, FMT = 9998 )I, YT( I ), + $ YY( 1 + ( I - 1 )*ABS( INCY ) ) + ELSE + WRITE( NOUT, FMT = 9998 )I, + $ YY( 1 + ( I - 1 )*ABS( INCY ) ), YT( I ) + END IF + 70 CONTINUE +* + 80 CONTINUE + RETURN +* + 9999 FORMAT(' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', + $ 'F ACCURATE *******', /' EXPECTED RE', + $ 'SULT COMPUTED RESULT' ) + 9998 FORMAT( 1X, I7, 2( ' (', G15.6, ',', G15.6, ')' ) ) +* +* End of ZMVCH. +* + END + LOGICAL FUNCTION LZE( RI, RJ, LR ) +* +* Tests if two arrays are identical. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER LR +* .. Array Arguments .. + COMPLEX*16 RI( * ), RJ( * ) +* .. Local Scalars .. + INTEGER I +* .. Executable Statements .. + DO 10 I = 1, LR + IF( RI( I ).NE.RJ( I ) ) + $ GO TO 20 + 10 CONTINUE + LZE = .TRUE. + GO TO 30 + 20 CONTINUE + LZE = .FALSE. + 30 RETURN +* +* End of LZE. +* + END + LOGICAL FUNCTION LZERES( TYPE, UPLO, M, N, AA, AS, LDA ) +* +* Tests if selected elements in two arrays are equal. +* +* TYPE is 'ge', 'he' or 'hp'. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER LDA, M, N + CHARACTER*1 UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + COMPLEX*16 AA( LDA, * ), AS( LDA, * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J + LOGICAL UPPER +* .. Executable Statements .. + UPPER = UPLO.EQ.'U' + IF( TYPE.EQ.'ge' )THEN + DO 20 J = 1, N + DO 10 I = M + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 10 CONTINUE + 20 CONTINUE + ELSE IF( TYPE.EQ.'he' )THEN + DO 50 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 30 I = 1, IBEG - 1 + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 30 CONTINUE + DO 40 I = IEND + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 40 CONTINUE + 50 CONTINUE + END IF +* + 60 CONTINUE + LZERES = .TRUE. + GO TO 80 + 70 CONTINUE + LZERES = .FALSE. + 80 RETURN +* +* End of LZERES. +* + END + COMPLEX*16 FUNCTION ZBEG( RESET ) +* +* Generates complex numbers as pairs of random numbers uniformly +* distributed between -0.5 and 0.5. 
+* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + LOGICAL RESET +* .. Local Scalars .. + INTEGER I, IC, J, MI, MJ +* .. Save statement .. + SAVE I, IC, J, MI, MJ +* .. Intrinsic Functions .. + INTRINSIC DCMPLX +* .. Executable Statements .. + IF( RESET )THEN +* Initialize local variables. + MI = 891 + MJ = 457 + I = 7 + J = 7 + IC = 0 + RESET = .FALSE. + END IF +* +* The sequence of values of I or J is bounded between 1 and 999. +* If initial I or J = 1,2,3,6,7 or 9, the period will be 50. +* If initial I or J = 4 or 8, the period will be 25. +* If initial I or J = 5, the period will be 10. +* IC is used to break up the period by skipping 1 value of I or J +* in 6. +* + IC = IC + 1 + 10 I = I*MI + J = J*MJ + I = I - 1000*( I/1000 ) + J = J - 1000*( J/1000 ) + IF( IC.GE.5 )THEN + IC = 0 + GO TO 10 + END IF + ZBEG = DCMPLX( ( I - 500 )/1001.0, ( J - 500 )/1001.0 ) + RETURN +* +* End of ZBEG. +* + END + DOUBLE PRECISION FUNCTION DDIFF( X, Y ) +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* +* .. Scalar Arguments .. + DOUBLE PRECISION X, Y +* .. Executable Statements .. + DDIFF = X - Y + RETURN +* +* End of DDIFF. +* + END + SUBROUTINE ZMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, KL, + $ KU, RESET, TRANSL ) +* +* Generates values for an M by N matrix A within the bandwidth +* defined by KL and KU. +* Stores the values in the array AA in the data structure required +* by the routine, with unwanted elements set to rogue value. +* +* TYPE is 'ge', 'gb', 'he', 'hb', 'hp', 'tr', 'tb' OR 'tp'. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX*16 ZERO, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ ONE = ( 1.0D0, 0.0D0 ) ) + COMPLEX*16 ROGUE + PARAMETER ( ROGUE = ( -1.0D10, 1.0D10 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) + DOUBLE PRECISION RROGUE + PARAMETER ( RROGUE = -1.0D10 ) +* .. Scalar Arguments .. + COMPLEX*16 TRANSL + INTEGER KL, KU, LDA, M, N, NMAX + LOGICAL RESET + CHARACTER*1 DIAG, UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + COMPLEX*16 A( NMAX, * ), AA( * ) +* .. Local Scalars .. + INTEGER I, I1, I2, I3, IBEG, IEND, IOFF, J, JJ, KK + LOGICAL GEN, LOWER, SYM, TRI, UNIT, UPPER +* .. External Functions .. + COMPLEX*16 ZBEG + EXTERNAL ZBEG +* .. Intrinsic Functions .. + INTRINSIC DCMPLX, DCONJG, MAX, MIN, DBLE +* .. Executable Statements .. + GEN = TYPE( 1: 1 ).EQ.'g' + SYM = TYPE( 1: 1 ).EQ.'h' + TRI = TYPE( 1: 1 ).EQ.'t' + UPPER = ( SYM.OR.TRI ).AND.UPLO.EQ.'U' + LOWER = ( SYM.OR.TRI ).AND.UPLO.EQ.'L' + UNIT = TRI.AND.DIAG.EQ.'U' +* +* Generate data in array A. +* + DO 20 J = 1, N + DO 10 I = 1, M + IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) + $ THEN + IF( ( I.LE.J.AND.J - I.LE.KU ).OR. 
+ $ ( I.GE.J.AND.I - J.LE.KL ) )THEN + A( I, J ) = ZBEG( RESET ) + TRANSL + ELSE + A( I, J ) = ZERO + END IF + IF( I.NE.J )THEN + IF( SYM )THEN + A( J, I ) = DCONJG( A( I, J ) ) + ELSE IF( TRI )THEN + A( J, I ) = ZERO + END IF + END IF + END IF + 10 CONTINUE + IF( SYM ) + $ A( J, J ) = DCMPLX( DBLE( A( J, J ) ), RZERO ) + IF( TRI ) + $ A( J, J ) = A( J, J ) + ONE + IF( UNIT ) + $ A( J, J ) = ONE + 20 CONTINUE +* +* Store elements in array AS in data structure required by routine. +* + IF( TYPE.EQ.'ge' )THEN + DO 50 J = 1, N + DO 30 I = 1, M + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 30 CONTINUE + DO 40 I = M + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 40 CONTINUE + 50 CONTINUE + ELSE IF( TYPE.EQ.'gb' )THEN + DO 90 J = 1, N + DO 60 I1 = 1, KU + 1 - J + AA( I1 + ( J - 1 )*LDA ) = ROGUE + 60 CONTINUE + DO 70 I2 = I1, MIN( KL + KU + 1, KU + 1 + M - J ) + AA( I2 + ( J - 1 )*LDA ) = A( I2 + J - KU - 1, J ) + 70 CONTINUE + DO 80 I3 = I2, LDA + AA( I3 + ( J - 1 )*LDA ) = ROGUE + 80 CONTINUE + 90 CONTINUE + ELSE IF( TYPE.EQ.'he'.OR.TYPE.EQ.'tr' )THEN + DO 130 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IF( UNIT )THEN + IEND = J - 1 + ELSE + IEND = J + END IF + ELSE + IF( UNIT )THEN + IBEG = J + 1 + ELSE + IBEG = J + END IF + IEND = N + END IF + DO 100 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 100 CONTINUE + DO 110 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 110 CONTINUE + DO 120 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 120 CONTINUE + IF( SYM )THEN + JJ = J + ( J - 1 )*LDA + AA( JJ ) = DCMPLX( DBLE( AA( JJ ) ), RROGUE ) + END IF + 130 CONTINUE + ELSE IF( TYPE.EQ.'hb'.OR.TYPE.EQ.'tb' )THEN + DO 170 J = 1, N + IF( UPPER )THEN + KK = KL + 1 + IBEG = MAX( 1, KL + 2 - J ) + IF( UNIT )THEN + IEND = KL + ELSE + IEND = KL + 1 + END IF + ELSE + KK = 1 + IF( UNIT )THEN + IBEG = 2 + ELSE + IBEG = 1 + END IF + IEND = MIN( KL + 1, 1 + M - J ) + END IF + DO 140 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 140 CONTINUE + DO 150 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I + J - KK, J ) + 150 CONTINUE + DO 160 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 160 CONTINUE + IF( SYM )THEN + JJ = KK + ( J - 1 )*LDA + AA( JJ ) = DCMPLX( DBLE( AA( JJ ) ), RROGUE ) + END IF + 170 CONTINUE + ELSE IF( TYPE.EQ.'hp'.OR.TYPE.EQ.'tp' )THEN + IOFF = 0 + DO 190 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 180 I = IBEG, IEND + IOFF = IOFF + 1 + AA( IOFF ) = A( I, J ) + IF( I.EQ.J )THEN + IF( UNIT ) + $ AA( IOFF ) = ROGUE + IF( SYM ) + $ AA( IOFF ) = DCMPLX( DBLE( AA( IOFF ) ), RROGUE ) + END IF + 180 CONTINUE + 190 CONTINUE + END IF + RETURN +* +* End of ZMAKE. +* + END diff --git a/ctest/c_zblat3.f b/ctest/c_zblat3.f new file mode 100644 index 0000000000..6e9dbbd8c0 --- /dev/null +++ b/ctest/c_zblat3.f @@ -0,0 +1,2791 @@ + PROGRAM ZBLAT3 +* +* Test program for the COMPLEX*16 Level 3 Blas. +* +* The program must be driven by a short data file. The first 13 records +* of the file are read using list-directed input, the last 9 records +* are read using the format ( A12,L2 ). An annotated example of a data +* file can be obtained by deleting the first 3 characters from the +* following 22 lines: +* 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +* F LOGICAL FLAG, T TO STOP ON FAILURES. +* T LOGICAL FLAG, T TO TEST ERROR EXITS. 
+* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH +* 16.0 THRESHOLD VALUE OF TEST RATIO +* 6 NUMBER OF VALUES OF N +* 0 1 2 3 5 9 VALUES OF N +* 3 NUMBER OF VALUES OF ALPHA +* (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +* 3 NUMBER OF VALUES OF BETA +* (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +* ZGEMM T PUT F FOR NO TEST. SAME COLUMNS. +* ZHEMM T PUT F FOR NO TEST. SAME COLUMNS. +* ZSYMM T PUT F FOR NO TEST. SAME COLUMNS. +* ZTRMM T PUT F FOR NO TEST. SAME COLUMNS. +* ZTRSM T PUT F FOR NO TEST. SAME COLUMNS. +* ZHERK T PUT F FOR NO TEST. SAME COLUMNS. +* ZSYRK T PUT F FOR NO TEST. SAME COLUMNS. +* ZHER2K T PUT F FOR NO TEST. SAME COLUMNS. +* ZSYR2K T PUT F FOR NO TEST. SAME COLUMNS. +* +* See: +* +* Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. +* A Set of Level 3 Basic Linear Algebra Subprograms. +* +* Technical Memorandum No.88 (Revision 1), Mathematics and +* Computer Science Division, Argonne National Laboratory, 9700 +* South Cass Avenue, Argonne, Illinois 60439, US. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + INTEGER NIN, NOUT + PARAMETER ( NIN = 5, NOUT = 6 ) + INTEGER NSUBS + PARAMETER ( NSUBS = 9 ) + COMPLEX*16 ZERO, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ ONE = ( 1.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO, RHALF, RONE + PARAMETER ( RZERO = 0.0D0, RHALF = 0.5D0, RONE = 1.0D0 ) + INTEGER NMAX + PARAMETER ( NMAX = 65 ) + INTEGER NIDMAX, NALMAX, NBEMAX + PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) +* .. Local Scalars .. + DOUBLE PRECISION EPS, ERR, THRESH + INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NTRA, + $ LAYOUT + LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, + $ TSTERR, CORDER, RORDER + CHARACTER*1 TRANSA, TRANSB + CHARACTER*12 SNAMET + CHARACTER*32 SNAPS +* .. Local Arrays .. + COMPLEX*16 AA( NMAX*NMAX ), AB( NMAX, 2*NMAX ), + $ ALF( NALMAX ), AS( NMAX*NMAX ), + $ BB( NMAX*NMAX ), BET( NBEMAX ), + $ BS( NMAX*NMAX ), C( NMAX, NMAX ), + $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), + $ W( 2*NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDMAX ) + LOGICAL LTEST( NSUBS ) + CHARACTER*12 SNAMES( NSUBS ) +* .. External Functions .. + DOUBLE PRECISION DDIFF + LOGICAL LZE + EXTERNAL DDIFF, LZE +* .. External Subroutines .. + EXTERNAL ZCHK1, ZCHK2, ZCHK3, ZCHK4, ZCHK5,ZMMCH +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK + CHARACTER*12 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR + COMMON /SRNAMC/SRNAMT +* .. Data statements .. + DATA SNAMES/'cblas_zgemm ', 'cblas_zhemm ', + $ 'cblas_zsymm ', 'cblas_ztrmm ', 'cblas_ztrsm ', + $ 'cblas_zherk ', 'cblas_zsyrk ', 'cblas_zher2k', + $ 'cblas_zsyr2k'/ +* .. Executable Statements .. +* + NOUTC = NOUT +* +* Read name and unit number for snapshot output file and open file. +* + READ( NIN, FMT = * )SNAPS + READ( NIN, FMT = * )NTRA + TRACE = NTRA.GE.0 + IF( TRACE )THEN + OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + END IF +* Read the flag that directs rewinding of the snapshot file. + READ( NIN, FMT = * )REWI + REWI = REWI.AND.TRACE +* Read the flag that directs stopping on any failure. + READ( NIN, FMT = * )SFATAL +* Read the flag that indicates whether error exits are to be tested. + READ( NIN, FMT = * )TSTERR +* Read the flag that indicates whether row-major data layout to be tested. 
+ READ( NIN, FMT = * )LAYOUT +* Read the threshold value of the test ratio + READ( NIN, FMT = * )THRESH +* +* Read and check the parameter values for the tests. +* +* Values of N + READ( NIN, FMT = * )NIDIM + IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN + WRITE( NOUT, FMT = 9997 )'N', NIDMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) + DO 10 I = 1, NIDIM + IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN + WRITE( NOUT, FMT = 9996 )NMAX + GO TO 220 + END IF + 10 CONTINUE +* Values of ALPHA + READ( NIN, FMT = * )NALF + IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN + WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) +* Values of BETA + READ( NIN, FMT = * )NBET + IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN + WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) +* +* Report values of parameters. +* + WRITE( NOUT, FMT = 9995 ) + WRITE( NOUT, FMT = 9994 )( IDIM( I ), I = 1, NIDIM ) + WRITE( NOUT, FMT = 9993 )( ALF( I ), I = 1, NALF ) + WRITE( NOUT, FMT = 9992 )( BET( I ), I = 1, NBET ) + IF( .NOT.TSTERR )THEN + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9984 ) + END IF + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9999 )THRESH + WRITE( NOUT, FMT = * ) + + RORDER = .FALSE. + CORDER = .FALSE. + IF (LAYOUT.EQ.2) THEN + RORDER = .TRUE. + CORDER = .TRUE. + WRITE( *, FMT = 10002 ) + ELSE IF (LAYOUT.EQ.1) THEN + RORDER = .TRUE. + WRITE( *, FMT = 10001 ) + ELSE IF (LAYOUT.EQ.0) THEN + CORDER = .TRUE. + WRITE( *, FMT = 10000 ) + END IF + WRITE( *, FMT = * ) + +* +* Read names of subroutines and flags which indicate +* whether they are to be tested. +* + DO 20 I = 1, NSUBS + LTEST( I ) = .FALSE. + 20 CONTINUE + 30 READ( NIN, FMT = 9988, END = 60 )SNAMET, LTESTT + DO 40 I = 1, NSUBS + IF( SNAMET.EQ.SNAMES( I ) ) + $ GO TO 50 + 40 CONTINUE + WRITE( NOUT, FMT = 9990 )SNAMET + STOP + 50 LTEST( I ) = LTESTT + GO TO 30 +* + 60 CONTINUE + CLOSE ( NIN ) +* +* Compute EPS (the machine precision). +* + EPS = RONE + 70 CONTINUE + IF( DDIFF( RONE + EPS, RONE ).EQ.RZERO ) + $ GO TO 80 + EPS = RHALF*EPS + GO TO 70 + 80 CONTINUE + EPS = EPS + EPS + WRITE( NOUT, FMT = 9998 )EPS +* +* Check the reliability of ZMMCH using exact data. +* + N = MIN( 32, NMAX ) + DO 100 J = 1, N + DO 90 I = 1, N + AB( I, J ) = MAX( I - J + 1, 0 ) + 90 CONTINUE + AB( J, NMAX + 1 ) = J + AB( 1, NMAX + J ) = J + C( J, 1 ) = ZERO + 100 CONTINUE + DO 110 J = 1, N + CC( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 + 110 CONTINUE +* CC holds the exact result. On exit from ZMMCH CT holds +* the result computed by ZMMCH. + TRANSA = 'N' + TRANSB = 'N' + CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LZE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + TRANSB = 'C' + CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. 
) + SAME = LZE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + DO 120 J = 1, N + AB( J, NMAX + 1 ) = N - J + 1 + AB( 1, NMAX + J ) = N - J + 1 + 120 CONTINUE + DO 130 J = 1, N + CC( N - J + 1 ) = J*( ( J + 1 )*J )/2 - + $ ( ( J + 1 )*J*( J - 1 ) )/3 + 130 CONTINUE + TRANSA = 'C' + TRANSB = 'N' + CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LZE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + TRANSB = 'C' + CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LZE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF +* +* Test each subroutine in turn. +* + DO 200 ISNUM = 1, NSUBS + WRITE( NOUT, FMT = * ) + IF( .NOT.LTEST( ISNUM ) )THEN +* Subprogram is not to be tested. + WRITE( NOUT, FMT = 9987 )SNAMES( ISNUM ) + ELSE + SRNAMT = SNAMES( ISNUM ) +* Test error exits. + IF( TSTERR )THEN + CALL CZ3CHKE( SNAMES( ISNUM ) ) + WRITE( NOUT, FMT = * ) + END IF +* Test computations. + INFOT = 0 + OK = .TRUE. + FATAL = .FALSE. + GO TO ( 140, 150, 150, 160, 160, 170, 170, + $ 180, 180 )ISNUM +* Test ZGEMM, 01. + 140 IF (CORDER) THEN + CALL ZCHK1(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 0 ) + END IF + IF (RORDER) THEN + CALL ZCHK1(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 1 ) + END IF + GO TO 190 +* Test ZHEMM, 02, ZSYMM, 03. + 150 IF (CORDER) THEN + CALL ZCHK2(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 0 ) + END IF + IF (RORDER) THEN + CALL ZCHK2(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 1 ) + END IF + GO TO 190 +* Test ZTRMM, 04, ZTRSM, 05. + 160 IF (CORDER) THEN + CALL ZCHK3(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, + $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C, + $ 0 ) + END IF + IF (RORDER) THEN + CALL ZCHK3(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, + $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C, + $ 1 ) + END IF + GO TO 190 +* Test ZHERK, 06, ZSYRK, 07. + 170 IF (CORDER) THEN + CALL ZCHK4(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 0 ) + END IF + IF (RORDER) THEN + CALL ZCHK4(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G, 1 ) + END IF + GO TO 190 +* Test ZHER2K, 08, ZSYR2K, 09. 
+ 180 IF (CORDER) THEN + CALL ZCHK5(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, + $ 0 ) + END IF + IF (RORDER) THEN + CALL ZCHK5(SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, + $ 1 ) + END IF + GO TO 190 +* + 190 IF( FATAL.AND.SFATAL ) + $ GO TO 210 + END IF + 200 CONTINUE + WRITE( NOUT, FMT = 9986 ) + GO TO 230 +* + 210 CONTINUE + WRITE( NOUT, FMT = 9985 ) + GO TO 230 +* + 220 CONTINUE + WRITE( NOUT, FMT = 9991 ) +* + 230 CONTINUE + IF( TRACE ) + $ CLOSE ( NTRA ) + CLOSE ( NOUT ) + STOP +* +10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) +10001 FORMAT(' ROW-MAJOR DATA LAYOUT IS TESTED' ) +10000 FORMAT(' COLUMN-MAJOR DATA LAYOUT IS TESTED' ) + 9999 FORMAT(' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', + $ 'S THAN', F8.2 ) + 9998 FORMAT(' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, E9.1 ) + 9997 FORMAT(' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', + $ 'THAN ', I2 ) + 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) + 9995 FORMAT('TESTS OF THE COMPLEX*16 LEVEL 3 BLAS', //' THE F', + $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) + 9994 FORMAT( ' FOR N ', 9I6 ) + 9993 FORMAT( ' FOR ALPHA ', + $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) + 9992 FORMAT( ' FOR BETA ', + $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) + 9991 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', + $ /' ******* TESTS ABANDONED *******' ) + 9990 FORMAT(' SUBPROGRAM NAME ', A12,' NOT RECOGNIZED', /' ******* T', + $ 'ESTS ABANDONED *******' ) + 9989 FORMAT(' ERROR IN ZMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', + $ 'ATED WRONGLY.', /' ZMMCH WAS CALLED WITH TRANSA = ', A1, + $ 'AND TRANSB = ', A1, /' AND RETURNED SAME = ', L1, ' AND ', + $ ' ERR = ', F12.3, '.', /' THIS MAY BE DUE TO FAULTS IN THE ', + $ 'ARITHMETIC OR THE COMPILER.', /' ******* TESTS ABANDONED ', + $ '*******' ) + 9988 FORMAT( A12,L2 ) + 9987 FORMAT( 1X, A12,' WAS NOT TESTED' ) + 9986 FORMAT( /' END OF TESTS' ) + 9985 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) + 9984 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) +* +* End of ZBLAT3. +* + END + SUBROUTINE ZCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, + $ IORDER ) +* +* Tests ZGEMM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0, 0.0 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. 
+ COMPLEX*16 ALPHA, ALS, BETA, BLS + DOUBLE PRECISION ERR, ERRMAX + INTEGER I, IA, IB, ICA, ICB, IK, IM, IN, K, KS, LAA, + $ LBB, LCC, LDA, LDAS, LDB, LDBS, LDC, LDCS, M, + $ MA, MB, MS, N, NA, NARGS, NB, NC, NS + LOGICAL NULL, RESET, SAME, TRANA, TRANB + CHARACTER*1 TRANAS, TRANBS, TRANSA, TRANSB + CHARACTER*3 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL CZGEMM, ZMAKE, ZMMCH +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICH/'NTC'/ +* .. Executable Statements .. +* + NARGS = 13 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 110 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = M + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 100 + LCC = LDC*N + NULL = N.LE.0.OR.M.LE.0 +* + DO 90 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 80 ICA = 1, 3 + TRANSA = ICH( ICA: ICA ) + TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' +* + IF( TRANA )THEN + MA = K + NA = M + ELSE + MA = M + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* +* Generate the matrix A. +* + CALL ZMAKE( 'ge', ' ', ' ', MA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 70 ICB = 1, 3 + TRANSB = ICH( ICB: ICB ) + TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' +* + IF( TRANB )THEN + MB = N + NB = K + ELSE + MB = K + NB = N + END IF +* Set LDB to 1 more than minimum value if room. + LDB = MB + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 70 + LBB = LDB*NB +* +* Generate the matrix B. +* + CALL ZMAKE( 'ge', ' ', ' ', MB, NB, B, NMAX, BB, + $ LDB, RESET, ZERO ) +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL ZMAKE( 'ge', ' ', ' ', M, N, C, NMAX, + $ CC, LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + TRANAS = TRANSA + TRANBS = TRANSB + MS = M + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BLS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ CALL ZPRCN1(NTRA, NC, SNAME, IORDER, + $ TRANSA, TRANSB, M, N, K, ALPHA, LDA, + $ LDB, BETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CZGEMM( IORDER, TRANSA, TRANSB, M, N, + $ K, ALPHA, AA, LDA, BB, LDB, + $ BETA, CC, LDC ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. 
+* + ISAME( 1 ) = TRANSA.EQ.TRANAS + ISAME( 2 ) = TRANSB.EQ.TRANBS + ISAME( 3 ) = MS.EQ.M + ISAME( 4 ) = NS.EQ.N + ISAME( 5 ) = KS.EQ.K + ISAME( 6 ) = ALS.EQ.ALPHA + ISAME( 7 ) = LZE( AS, AA, LAA ) + ISAME( 8 ) = LDAS.EQ.LDA + ISAME( 9 ) = LZE( BS, BB, LBB ) + ISAME( 10 ) = LDBS.EQ.LDB + ISAME( 11 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 12 ) = LZE( CS, CC, LCC ) + ELSE + ISAME( 12 ) = LZERES( 'ge', ' ', M, N, CS, + $ CC, LDC ) + END IF + ISAME( 13 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report +* and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL ZMMCH( TRANSA, TRANSB, M, N, K, + $ ALPHA, A, NMAX, B, NMAX, BETA, + $ C, NMAX, CT, G, CC, LDC, EPS, + $ ERR, FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 120 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + CALL ZPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB, + $ M, N, K, ALPHA, LDA, LDB, BETA, LDC) +* + 130 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A12,'(''', A1, ''',''', A1, ''',', + $ 3( I3, ',' ), '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, + $ ',(', F4.1, ',', F4.1, '), C,', I3, ').' ) + 9994 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of ZCHK1. 
+* + END +* + SUBROUTINE ZPRCN1(NOUT, NC, SNAME, IORDER, TRANSA, TRANSB, M, N, + $ K, ALPHA, LDA, LDB, BETA, LDC) + INTEGER NOUT, NC, IORDER, M, N, K, LDA, LDB, LDC + DOUBLE COMPLEX ALPHA, BETA + CHARACTER*1 TRANSA, TRANSB + CHARACTER*12 SNAME + CHARACTER*14 CRC, CTA,CTB + + IF (TRANSA.EQ.'N')THEN + CTA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CTA = ' CblasTrans' + ELSE + CTA = 'CblasConjTrans' + END IF + IF (TRANSB.EQ.'N')THEN + CTB = ' CblasNoTrans' + ELSE IF (TRANSB.EQ.'T')THEN + CTB = ' CblasTrans' + ELSE + CTB = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CTA,CTB + WRITE(NOUT, FMT = 9994)M, N, K, ALPHA, LDA, LDB, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', A14, ',', A14, ',', A14, ',') + 9994 FORMAT( 10X, 3( I3, ',' ) ,' (', F4.1,',',F4.1,') , A,', + $ I3, ', B,', I3, ', (', F4.1,',',F4.1,') , C,', I3, ').' ) + END +* + SUBROUTINE ZCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, + $ IORDER ) +* +* Tests ZHEMM and ZSYMM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, ALS, BETA, BLS + DOUBLE PRECISION ERR, ERRMAX + INTEGER I, IA, IB, ICS, ICU, IM, IN, LAA, LBB, LCC, + $ LDA, LDAS, LDB, LDBS, LDC, LDCS, M, MS, N, NA, + $ NARGS, NC, NS + LOGICAL CONJ, LEFT, NULL, RESET, SAME + CHARACTER*1 SIDE, SIDES, UPLO, UPLOS + CHARACTER*2 ICHS, ICHU +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL CZHEMM, ZMAKE, ZMMCH, CZSYMM +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHS/'LR'/, ICHU/'UL'/ +* .. Executable Statements .. + CONJ = SNAME( 8: 9 ).EQ.'he' +* + NARGS = 12 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 100 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 90 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = M + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 90 + LCC = LDC*N + NULL = N.LE.0.OR.M.LE.0 +* Set LDB to 1 more than minimum value if room. + LDB = M + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 90 + LBB = LDB*N +* +* Generate the matrix B. 
+* + CALL ZMAKE( 'ge', ' ', ' ', M, N, B, NMAX, BB, LDB, RESET, + $ ZERO ) +* + DO 80 ICS = 1, 2 + SIDE = ICHS( ICS: ICS ) + LEFT = SIDE.EQ.'L' +* + IF( LEFT )THEN + NA = M + ELSE + NA = N + END IF +* Set LDA to 1 more than minimum value if room. + LDA = NA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* + DO 70 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* +* Generate the hermitian or symmetric matrix A. +* + CALL ZMAKE(SNAME( 8: 9 ), UPLO, ' ', NA, NA, A, NMAX, + $ AA, LDA, RESET, ZERO ) +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL ZMAKE( 'ge', ' ', ' ', M, N, C, NMAX, CC, + $ LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + SIDES = SIDE + UPLOS = UPLO + MS = M + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BLS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ CALL ZPRCN2(NTRA, NC, SNAME, IORDER, + $ SIDE, UPLO, M, N, ALPHA, LDA, LDB, + $ BETA, LDC) + IF( REWI ) + $ REWIND NTRA + IF( CONJ )THEN + CALL CZHEMM( IORDER, SIDE, UPLO, M, N, + $ ALPHA, AA, LDA, BB, LDB, BETA, + $ CC, LDC ) + ELSE + CALL CZSYMM( IORDER, SIDE, UPLO, M, N, + $ ALPHA, AA, LDA, BB, LDB, BETA, + $ CC, LDC ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 110 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = SIDES.EQ.SIDE + ISAME( 2 ) = UPLOS.EQ.UPLO + ISAME( 3 ) = MS.EQ.M + ISAME( 4 ) = NS.EQ.N + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LZE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = LZE( BS, BB, LBB ) + ISAME( 9 ) = LDBS.EQ.LDB + ISAME( 10 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 11 ) = LZE( CS, CC, LCC ) + ELSE + ISAME( 11 ) = LZERES( 'ge', ' ', M, N, CS, + $ CC, LDC ) + END IF + ISAME( 12 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 110 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + IF( LEFT )THEN + CALL ZMMCH( 'N', 'N', M, N, M, ALPHA, A, + $ NMAX, B, NMAX, BETA, C, NMAX, + $ CT, G, CC, LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL ZMMCH( 'N', 'N', M, N, N, ALPHA, B, + $ NMAX, A, NMAX, BETA, C, NMAX, + $ CT, G, CC, LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 120 +* + 110 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + CALL ZPRCN2(NOUT, NC, SNAME, IORDER, SIDE, UPLO, M, N, ALPHA, LDA, + $ LDB, BETA, LDC) +* + 120 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, + $ ',', F4.1, '), C,', I3, ') .' ) + 9994 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of ZCHK2. +* + END +* + SUBROUTINE ZPRCN2(NOUT, NC, SNAME, IORDER, SIDE, UPLO, M, N, + $ ALPHA, LDA, LDB, BETA, LDC) + INTEGER NOUT, NC, IORDER, M, N, LDA, LDB, LDC + DOUBLE COMPLEX ALPHA, BETA + CHARACTER*1 SIDE, UPLO + CHARACTER*12 SNAME + CHARACTER*14 CRC, CS,CU + + IF (SIDE.EQ.'L')THEN + CS = ' CblasLeft' + ELSE + CS = ' CblasRight' + END IF + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU + WRITE(NOUT, FMT = 9994)M, N, ALPHA, LDA, LDB, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', A14, ',', A14, ',', A14, ',') + 9994 FORMAT( 10X, 2( I3, ',' ),' (',F4.1,',',F4.1, '), A,', I3, + $ ', B,', I3, ', (',F4.1,',',F4.1, '), ', 'C,', I3, ').' ) + END +* + SUBROUTINE ZCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NMAX, A, AA, AS, + $ B, BB, BS, CT, G, C, IORDER ) +* +* Tests ZTRMM and ZTRSM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX*16 ZERO, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), ONE = ( 1.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CT( NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. 
+ COMPLEX*16 ALPHA, ALS + DOUBLE PRECISION ERR, ERRMAX + INTEGER I, IA, ICD, ICS, ICT, ICU, IM, IN, J, LAA, LBB, + $ LDA, LDAS, LDB, LDBS, M, MS, N, NA, NARGS, NC, + $ NS + LOGICAL LEFT, NULL, RESET, SAME + CHARACTER*1 DIAG, DIAGS, SIDE, SIDES, TRANAS, TRANSA, UPLO, + $ UPLOS + CHARACTER*2 ICHD, ICHS, ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL ZMAKE, ZMMCH, CZTRMM, CZTRSM +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/, ICHS/'LR'/ +* .. Executable Statements .. +* + NARGS = 11 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* Set up zero matrix for ZMMCH. + DO 20 J = 1, NMAX + DO 10 I = 1, NMAX + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE +* + DO 140 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 130 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDB to 1 more than minimum value if room. + LDB = M + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 130 + LBB = LDB*N + NULL = M.LE.0.OR.N.LE.0 +* + DO 120 ICS = 1, 2 + SIDE = ICHS( ICS: ICS ) + LEFT = SIDE.EQ.'L' + IF( LEFT )THEN + NA = M + ELSE + NA = N + END IF +* Set LDA to 1 more than minimum value if room. + LDA = NA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 130 + LAA = LDA*NA +* + DO 110 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* + DO 100 ICT = 1, 3 + TRANSA = ICHT( ICT: ICT ) +* + DO 90 ICD = 1, 2 + DIAG = ICHD( ICD: ICD ) +* + DO 80 IA = 1, NALF + ALPHA = ALF( IA ) +* +* Generate the matrix A. +* + CALL ZMAKE( 'tr', UPLO, DIAG, NA, NA, A, + $ NMAX, AA, LDA, RESET, ZERO ) +* +* Generate the matrix B. +* + CALL ZMAKE( 'ge', ' ', ' ', M, N, B, NMAX, + $ BB, LDB, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + SIDES = SIDE + UPLOS = UPLO + TRANAS = TRANSA + DIAGS = DIAG + MS = M + NS = N + ALS = ALPHA + DO 30 I = 1, LAA + AS( I ) = AA( I ) + 30 CONTINUE + LDAS = LDA + DO 40 I = 1, LBB + BS( I ) = BB( I ) + 40 CONTINUE + LDBS = LDB +* +* Call the subroutine. +* + IF( SNAME( 10: 11 ).EQ.'mm' )THEN + IF( TRACE ) + $ CALL ZPRCN3( NTRA, NC, SNAME, IORDER, + $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, + $ LDA, LDB) + IF( REWI ) + $ REWIND NTRA + CALL CZTRMM(IORDER, SIDE, UPLO, TRANSA, + $ DIAG, M, N, ALPHA, AA, LDA, + $ BB, LDB ) + ELSE IF( SNAME( 10: 11 ).EQ.'sm' )THEN + IF( TRACE ) + $ CALL ZPRCN3( NTRA, NC, SNAME, IORDER, + $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, + $ LDA, LDB) + IF( REWI ) + $ REWIND NTRA + CALL CZTRSM(IORDER, SIDE, UPLO, TRANSA, + $ DIAG, M, N, ALPHA, AA, LDA, + $ BB, LDB ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 150 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = SIDES.EQ.SIDE + ISAME( 2 ) = UPLOS.EQ.UPLO + ISAME( 3 ) = TRANAS.EQ.TRANSA + ISAME( 4 ) = DIAGS.EQ.DIAG + ISAME( 5 ) = MS.EQ.M + ISAME( 6 ) = NS.EQ.N + ISAME( 7 ) = ALS.EQ.ALPHA + ISAME( 8 ) = LZE( AS, AA, LAA ) + ISAME( 9 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 10 ) = LZE( BS, BB, LBB ) + ELSE + ISAME( 10 ) = LZERES( 'ge', ' ', M, N, BS, + $ BB, LDB ) + END IF + ISAME( 11 ) = LDBS.EQ.LDB +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. 
+ DO 50 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 50 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 150 + END IF +* + IF( .NOT.NULL )THEN + IF( SNAME( 10: 11 ).EQ.'mm' )THEN +* +* Check the result. +* + IF( LEFT )THEN + CALL ZMMCH( TRANSA, 'N', M, N, M, + $ ALPHA, A, NMAX, B, NMAX, + $ ZERO, C, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL ZMMCH( 'N', TRANSA, M, N, N, + $ ALPHA, B, NMAX, A, NMAX, + $ ZERO, C, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + ELSE IF( SNAME( 10: 11 ).EQ.'sm' )THEN +* +* Compute approximation to original +* matrix. +* + DO 70 J = 1, N + DO 60 I = 1, M + C( I, J ) = BB( I + ( J - 1 )* + $ LDB ) + BB( I + ( J - 1 )*LDB ) = ALPHA* + $ B( I, J ) + 60 CONTINUE + 70 CONTINUE +* + IF( LEFT )THEN + CALL ZMMCH( TRANSA, 'N', M, N, M, + $ ONE, A, NMAX, C, NMAX, + $ ZERO, B, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .FALSE. ) + ELSE + CALL ZMMCH( 'N', TRANSA, M, N, N, + $ ONE, C, NMAX, A, NMAX, + $ ZERO, B, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .FALSE. ) + END IF + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 150 + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* + 140 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 160 +* + 150 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + CALL ZPRCN3( NTRA, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, DIAG, + $ M, N, ALPHA, LDA, LDB) +* + 160 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT(' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT(1X, I6, ': ', A12,'(', 4( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ') ', + $ ' .' ) + 9994 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of ZCHK3. 
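The 'sm' branch above is how ZCHK3 verifies ZTRSM without a reference triangular solver: the computed solution X (left in BB) is multiplied back by op( A ) through ZMMCH and compared against ALPHA*B, so the call passes only if op( A )*X reproduces ALPHA*B to within the usual rounding bound. Below is a minimal standalone sketch of that forward check in C, with a hand-written 3-by-3 lower-triangular solve standing in for the cblas_ztrsm call it illustrates; the matrix and right-hand side are arbitrary.

/* Sketch of the ZCHK3 'sm' check: solve, then multiply back and compare. */
#include <complex.h>
#include <math.h>
#include <stdio.h>

#define N 3

int main(void)
{
    /* Lower-triangular A, right-hand side b and scaling alpha (made up). */
    double complex A[N][N] = {
        { 2.0 + 1.0*I, 0.0,          0.0          },
        { 1.0 - 1.0*I, 3.0 + 0.0*I,  0.0          },
        { 0.5 + 0.5*I, 1.0 + 2.0*I,  4.0 - 1.0*I  }
    };
    double complex b[N]  = { 1.0 + 2.0*I, -1.0 + 0.5*I, 3.0 - 1.0*I };
    double complex alpha = 0.7 - 0.9*I;
    double complex x[N];

    /* Stand-in for the routine under test: solve A*x = alpha*b. */
    for (int i = 0; i < N; i++) {
        double complex s = alpha * b[i];
        for (int j = 0; j < i; j++)
            s -= A[i][j] * x[j];
        x[i] = s / A[i][i];
    }

    /* ZCHK3-style check: multiply the solution back and compare. */
    double err = 0.0;
    for (int i = 0; i < N; i++) {
        double complex r = 0.0;
        for (int j = 0; j <= i; j++)
            r += A[i][j] * x[j];
        err = fmax(err, cabs(r - alpha * b[i]));
    }
    printf("max | A*x - alpha*b | = %g\n", err);
    return err < 1.0e-12 ? 0 : 1;
}

The harness covers the right-sided and transposed cases in the same way, simply swapping the operand order it passes to ZMMCH.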
+* + END +* + SUBROUTINE ZPRCN3(NOUT, NC, SNAME, IORDER, SIDE, UPLO, TRANSA, + $ DIAG, M, N, ALPHA, LDA, LDB) + INTEGER NOUT, NC, IORDER, M, N, LDA, LDB + DOUBLE COMPLEX ALPHA + CHARACTER*1 SIDE, UPLO, TRANSA, DIAG + CHARACTER*12 SNAME + CHARACTER*14 CRC, CS, CU, CA, CD + + IF (SIDE.EQ.'L')THEN + CS = ' CblasLeft' + ELSE + CS = ' CblasRight' + END IF + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (DIAG.EQ.'N')THEN + CD = ' CblasNonUnit' + ELSE + CD = ' CblasUnit' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC,SNAME,CRC, CS,CU + WRITE(NOUT, FMT = 9994)CA, CD, M, N, ALPHA, LDA, LDB + + 9995 FORMAT( 1X, I6, ': ', A12,'(', A14, ',', A14, ',', A14, ',') + 9994 FORMAT( 10X, 2( A14, ',') , 2( I3, ',' ), ' (', F4.1, ',', + $ F4.1, '), A,', I3, ', B,', I3, ').' ) + END +* + SUBROUTINE ZCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G, + $ IORDER ) +* +* Tests ZHERK and ZSYRK. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ) ) + DOUBLE PRECISION RONE, RZERO + PARAMETER ( RONE = 1.0D0, RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, ALS, BETA, BETS + DOUBLE PRECISION ERR, ERRMAX, RALPHA, RALS, RBETA, RBETS + INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, K, KS, + $ LAA, LCC, LDA, LDAS, LDC, LDCS, LJ, MA, N, NA, + $ NARGS, NC, NS + LOGICAL CONJ, NULL, RESET, SAME, TRAN, UPPER + CHARACTER*1 TRANS, TRANSS, TRANST, UPLO, UPLOS + CHARACTER*2 ICHT, ICHU +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL CZHERK, ZMAKE, ZMMCH, CZSYRK +* .. Intrinsic Functions .. + INTRINSIC DCMPLX, MAX, DBLE +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHT/'NC'/, ICHU/'UL'/ +* .. Executable Statements .. + CONJ = SNAME( 8: 9 ).EQ.'he' +* + NARGS = 10 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = N + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 100 + LCC = LDC*N +* + DO 90 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 80 ICT = 1, 2 + TRANS = ICHT( ICT: ICT ) + TRAN = TRANS.EQ.'C' + IF( TRAN.AND..NOT.CONJ ) + $ TRANS = 'T' + IF( TRAN )THEN + MA = K + NA = N + ELSE + MA = N + NA = K + END IF +* Set LDA to 1 more than minimum value if room. 
+ LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* +* Generate the matrix A. +* + CALL ZMAKE( 'ge', ' ', ' ', MA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 70 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + UPPER = UPLO.EQ.'U' +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) + IF( CONJ )THEN + RALPHA = DBLE( ALPHA ) + ALPHA = DCMPLX( RALPHA, RZERO ) + END IF +* + DO 50 IB = 1, NBET + BETA = BET( IB ) + IF( CONJ )THEN + RBETA = DBLE( BETA ) + BETA = DCMPLX( RBETA, RZERO ) + END IF + NULL = N.LE.0 + IF( CONJ ) + $ NULL = NULL.OR.( ( K.LE.0.OR.RALPHA.EQ. + $ RZERO ).AND.RBETA.EQ.RONE ) +* +* Generate the matrix C. +* + CALL ZMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, C, + $ NMAX, CC, LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + NS = N + KS = K + IF( CONJ )THEN + RALS = RALPHA + ELSE + ALS = ALPHA + END IF + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + IF( CONJ )THEN + RBETS = RBETA + ELSE + BETS = BETA + END IF + DO 20 I = 1, LCC + CS( I ) = CC( I ) + 20 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( CONJ )THEN + IF( TRACE ) + $ CALL ZPRCN6( NTRA, NC, SNAME, IORDER, + $ UPLO, TRANS, N, K, RALPHA, LDA, RBETA, + $ LDC) + IF( REWI ) + $ REWIND NTRA + CALL CZHERK( IORDER, UPLO, TRANS, N, K, + $ RALPHA, AA, LDA, RBETA, CC, + $ LDC ) + ELSE + IF( TRACE ) + $ CALL ZPRCN4( NTRA, NC, SNAME, IORDER, + $ UPLO, TRANS, N, K, ALPHA, LDA, BETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CZSYRK( IORDER, UPLO, TRANS, N, K, + $ ALPHA, AA, LDA, BETA, CC, LDC ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLOS.EQ.UPLO + ISAME( 2 ) = TRANSS.EQ.TRANS + ISAME( 3 ) = NS.EQ.N + ISAME( 4 ) = KS.EQ.K + IF( CONJ )THEN + ISAME( 5 ) = RALS.EQ.RALPHA + ELSE + ISAME( 5 ) = ALS.EQ.ALPHA + END IF + ISAME( 6 ) = LZE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + IF( CONJ )THEN + ISAME( 8 ) = RBETS.EQ.RBETA + ELSE + ISAME( 8 ) = BETS.EQ.BETA + END IF + IF( NULL )THEN + ISAME( 9 ) = LZE( CS, CC, LCC ) + ELSE + ISAME( 9 ) = LZERES( SNAME( 8: 9 ), UPLO, N, + $ N, CS, CC, LDC ) + END IF + ISAME( 10 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 30 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 30 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( CONJ )THEN + TRANST = 'C' + ELSE + TRANST = 'T' + END IF + JC = 1 + DO 40 J = 1, N + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + IF( TRAN )THEN + CALL ZMMCH( TRANST, 'N', LJ, 1, K, + $ ALPHA, A( 1, JJ ), NMAX, + $ A( 1, J ), NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL ZMMCH( 'N', TRANST, LJ, 1, K, + $ ALPHA, A( JJ, 1 ), NMAX, + $ A( J, 1 ), NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + IF( UPPER )THEN + JC = JC + LDC + ELSE + JC = JC + LDC + 1 + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. 
+ IF( FATAL ) + $ GO TO 110 + 40 CONTINUE + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 110 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9995 )J +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( CONJ )THEN + CALL ZPRCN6( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, RALPHA, + $ LDA, rBETA, LDC) + ELSE + CALL ZPRCN4( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, ALPHA, + $ LDA, BETA, LDC) + END IF +* + 130 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ') ', + $ ' .' ) + 9993 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, ') , A,', I3, ',(', F4.1, ',', F4.1, + $ '), C,', I3, ') .' ) + 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK4. +* + END +* + SUBROUTINE ZPRCN4(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, + $ N, K, ALPHA, LDA, BETA, LDC) + INTEGER NOUT, NC, IORDER, N, K, LDA, LDC + DOUBLE COMPLEX ALPHA, BETA + CHARACTER*1 UPLO, TRANSA + CHARACTER*12 SNAME + CHARACTER*14 CRC, CU, CA + + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA + WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') ) + 9994 FORMAT( 10X, 2( I3, ',' ), ' (', F4.1, ',', F4.1 ,'), A,', + $ I3, ', (', F4.1,',', F4.1, '), C,', I3, ').' 
) + END +* +* + SUBROUTINE ZPRCN6(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, + $ N, K, ALPHA, LDA, BETA, LDC) + INTEGER NOUT, NC, IORDER, N, K, LDA, LDC + DOUBLE PRECISION ALPHA, BETA + CHARACTER*1 UPLO, TRANSA + CHARACTER*12 SNAME + CHARACTER*14 CRC, CU, CA + + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA + WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') ) + 9994 FORMAT( 10X, 2( I3, ',' ), + $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ').' ) + END +* + SUBROUTINE ZCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ AB, AA, AS, BB, BS, C, CC, CS, CT, G, W, + $ IORDER ) +* +* Tests ZHER2K and ZSYR2K. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX*16 ZERO, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), ONE = ( 1.0D0, 0.0D0 ) ) + DOUBLE PRECISION RONE, RZERO + PARAMETER ( RONE = 1.0D0, RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA, IORDER + LOGICAL FATAL, REWI, TRACE + CHARACTER*12 SNAME +* .. Array Arguments .. + COMPLEX*16 AA( NMAX*NMAX ), AB( 2*NMAX*NMAX ), + $ ALF( NALF ), AS( NMAX*NMAX ), BB( NMAX*NMAX ), + $ BET( NBET ), BS( NMAX*NMAX ), C( NMAX, NMAX ), + $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), + $ W( 2*NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, ALS, BETA, BETS + DOUBLE PRECISION ERR, ERRMAX, RBETA, RBETS + INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, JJAB, + $ K, KS, LAA, LBB, LCC, LDA, LDAS, LDB, LDBS, + $ LDC, LDCS, LJ, MA, N, NA, NARGS, NC, NS + LOGICAL CONJ, NULL, RESET, SAME, TRAN, UPPER + CHARACTER*1 TRANS, TRANSS, TRANST, UPLO, UPLOS + CHARACTER*2 ICHT, ICHU +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL CZHER2K, ZMAKE, ZMMCH, CZSYR2K +* .. Intrinsic Functions .. + INTRINSIC DCMPLX, DCONJG, MAX, DBLE +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHT/'NC'/, ICHU/'UL'/ +* .. Executable Statements .. + CONJ = SNAME( 8: 9 ).EQ.'he' +* + NARGS = 12 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 130 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = N + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 130 + LCC = LDC*N +* + DO 120 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 110 ICT = 1, 2 + TRANS = ICHT( ICT: ICT ) + TRAN = TRANS.EQ.'C' + IF( TRAN.AND..NOT.CONJ ) + $ TRANS = 'T' + IF( TRAN )THEN + MA = K + NA = N + ELSE + MA = N + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 110 + LAA = LDA*NA +* +* Generate the matrix A. 
+* + IF( TRAN )THEN + CALL ZMAKE( 'ge', ' ', ' ', MA, NA, AB, 2*NMAX, AA, + $ LDA, RESET, ZERO ) + ELSE + CALL ZMAKE( 'ge', ' ', ' ', MA, NA, AB, NMAX, AA, LDA, + $ RESET, ZERO ) + END IF +* +* Generate the matrix B. +* + LDB = LDA + LBB = LAA + IF( TRAN )THEN + CALL ZMAKE( 'ge', ' ', ' ', MA, NA, AB( K + 1 ), + $ 2*NMAX, BB, LDB, RESET, ZERO ) + ELSE + CALL ZMAKE( 'ge', ' ', ' ', MA, NA, AB( K*NMAX + 1 ), + $ NMAX, BB, LDB, RESET, ZERO ) + END IF +* + DO 100 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + UPPER = UPLO.EQ.'U' +* + DO 90 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 80 IB = 1, NBET + BETA = BET( IB ) + IF( CONJ )THEN + RBETA = DBLE( BETA ) + BETA = DCMPLX( RBETA, RZERO ) + END IF + NULL = N.LE.0 + IF( CONJ ) + $ NULL = NULL.OR.( ( K.LE.0.OR.ALPHA.EQ. + $ ZERO ).AND.RBETA.EQ.RONE ) +* +* Generate the matrix C. +* + CALL ZMAKE( SNAME( 8: 9 ), UPLO, ' ', N, N, C, + $ NMAX, CC, LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + IF( CONJ )THEN + RBETS = RBETA + ELSE + BETS = BETA + END IF + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( CONJ )THEN + IF( TRACE ) + $ CALL ZPRCN7( NTRA, NC, SNAME, IORDER, + $ UPLO, TRANS, N, K, ALPHA, LDA, LDB, + $ RBETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CZHER2K( IORDER, UPLO, TRANS, N, K, + $ ALPHA, AA, LDA, BB, LDB, RBETA, + $ CC, LDC ) + ELSE + IF( TRACE ) + $ CALL ZPRCN5( NTRA, NC, SNAME, IORDER, + $ UPLO, TRANS, N, K, ALPHA, LDA, LDB, + $ BETA, LDC) + IF( REWI ) + $ REWIND NTRA + CALL CZSYR2K( IORDER, UPLO, TRANS, N, K, + $ ALPHA, AA, LDA, BB, LDB, BETA, + $ CC, LDC ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 150 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLOS.EQ.UPLO + ISAME( 2 ) = TRANSS.EQ.TRANS + ISAME( 3 ) = NS.EQ.N + ISAME( 4 ) = KS.EQ.K + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LZE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = LZE( BS, BB, LBB ) + ISAME( 9 ) = LDBS.EQ.LDB + IF( CONJ )THEN + ISAME( 10 ) = RBETS.EQ.RBETA + ELSE + ISAME( 10 ) = BETS.EQ.BETA + END IF + IF( NULL )THEN + ISAME( 11 ) = LZE( CS, CC, LCC ) + ELSE + ISAME( 11 ) = LZERES( 'he', UPLO, N, N, CS, + $ CC, LDC ) + END IF + ISAME( 12 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 150 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( CONJ )THEN + TRANST = 'C' + ELSE + TRANST = 'T' + END IF + JJAB = 1 + JC = 1 + DO 70 J = 1, N + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + IF( TRAN )THEN + DO 50 I = 1, K + W( I ) = ALPHA*AB( ( J - 1 )*2* + $ NMAX + K + I ) + IF( CONJ )THEN + W( K + I ) = DCONJG( ALPHA )* + $ AB( ( J - 1 )*2* + $ NMAX + I ) + ELSE + W( K + I ) = ALPHA* + $ AB( ( J - 1 )*2* + $ NMAX + I ) + END IF + 50 CONTINUE + CALL ZMMCH( TRANST, 'N', LJ, 1, 2*K, + $ ONE, AB( JJAB ), 2*NMAX, W, + $ 2*NMAX, BETA, C( JJ, J ), + $ NMAX, CT, G, CC( JC ), LDC, + $ EPS, ERR, FATAL, NOUT, + $ .TRUE. 
) + ELSE + DO 60 I = 1, K + IF( CONJ )THEN + W( I ) = ALPHA*DCONJG( AB( ( K + + $ I - 1 )*NMAX + J ) ) + W( K + I ) = DCONJG( ALPHA* + $ AB( ( I - 1 )*NMAX + + $ J ) ) + ELSE + W( I ) = ALPHA*AB( ( K + I - 1 )* + $ NMAX + J ) + W( K + I ) = ALPHA* + $ AB( ( I - 1 )*NMAX + + $ J ) + END IF + 60 CONTINUE + CALL ZMMCH( 'N', 'N', LJ, 1, 2*K, ONE, + $ AB( JJ ), NMAX, W, 2*NMAX, + $ BETA, C( JJ, J ), NMAX, CT, + $ G, CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + IF( UPPER )THEN + JC = JC + LDC + ELSE + JC = JC + LDC + 1 + IF( TRAN ) + $ JJAB = JJAB + 2*NMAX + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 140 + 70 CONTINUE + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10000 )SNAME, NC + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10001 )SNAME, NC + ELSE + IF ( IORDER.EQ.0) WRITE( NOUT, FMT = 10002 )SNAME, NC, ERRMAX + IF ( IORDER.EQ.1) WRITE( NOUT, FMT = 10003 )SNAME, NC, ERRMAX + END IF + GO TO 160 +* + 140 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9995 )J +* + 150 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( CONJ )THEN + CALL ZPRCN7( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, + $ ALPHA, LDA, LDB, RBETA, LDC) + ELSE + CALL ZPRCN5( NOUT, NC, SNAME, IORDER, UPLO, TRANS, N, K, + $ ALPHA, LDA, LDB, BETA, LDC) + END IF +* + 160 CONTINUE + RETURN +* +10003 FORMAT( ' ', A12,' COMPLETED THE ROW-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10002 FORMAT( ' ', A12,' COMPLETED THE COLUMN-MAJOR COMPUTATIONAL ', + $ 'TESTS (', I6, ' CALLS)', /' ******* BUT WITH MAXIMUM TEST ', + $ 'RATIO ', F8.2, ' - SUSPECT *******' ) +10001 FORMAT( ' ', A12,' PASSED THE ROW-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) +10000 FORMAT( ' ', A12,' PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS', + $ ' (', I6, ' CALL', 'S)' ) + 9998 FORMAT(' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9996 FORMAT( ' ******* ', A12,' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',', F4.1, + $ ', C,', I3, ') .' ) + 9993 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, + $ ',', F4.1, '), C,', I3, ') .' ) + 9992 FORMAT(' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of ZCHK5. 
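ZCHK5 checks ZHER2K one column at a time. For TRANS = 'N', column j of C := ALPHA*A*B**H + DCONJG( ALPHA )*B*A**H + BETA*C equals [ A | B ]*w + BETA*C( :, j ), where w stacks ALPHA*DCONJG( B( j, : ) ) on top of DCONJG( ALPHA )*DCONJG( A( j, : ) ); the loop labelled 60 builds exactly this w and hands the 2*K-term product to ZMMCH. The sketch below only checks that identity numerically on a small example; it does not call the CBLAS routine, and the sizes and values are made up for illustration.

/* Check: column j of alpha*A*B^H + conj(alpha)*B*A^H + beta*C
 * equals [A | B] * w + beta*C(:,j) with the stacked workspace w. */
#include <complex.h>
#include <math.h>
#include <stdio.h>

#define N 3
#define K 2

int main(void)
{
    double complex A[N][K] = { {1+2*I, -1+0*I}, {0.5-1*I, 2+1*I}, {-2+0.5*I, 1-1*I} };
    double complex B[N][K] = { {2-1*I,  0+1*I}, {1+1*I, -1-2*I}, {0.5+0*I, 3+1*I} };
    double complex Cmat[N][N] = { {1,0,0}, {0,1,0}, {0,0,1} };
    double complex alpha = 0.7 - 0.9*I, beta = 1.3 - 1.1*I;
    int j = 1;                          /* column to check (0-based) */

    /* Direct evaluation of column j from the definition. */
    double complex direct[N];
    for (int i = 0; i < N; i++) {
        double complex s = beta * Cmat[i][j];
        for (int k = 0; k < K; k++)
            s += alpha * A[i][k] * conj(B[j][k])
               + conj(alpha) * B[i][k] * conj(A[j][k]);
        direct[i] = s;
    }

    /* ZCHK5's formulation: [A | B] * w + beta*C(:,j). */
    double complex w[2*K], stacked[N];
    for (int k = 0; k < K; k++) {
        w[k]     = alpha * conj(B[j][k]);
        w[K + k] = conj(alpha) * conj(A[j][k]);
    }
    for (int i = 0; i < N; i++) {
        double complex s = beta * Cmat[i][j];
        for (int k = 0; k < K; k++)
            s += A[i][k] * w[k] + B[i][k] * w[K + k];
        stacked[i] = s;
    }

    double err = 0.0;
    for (int i = 0; i < N; i++)
        err = fmax(err, cabs(direct[i] - stacked[i]));
    printf("max difference between the two formulations: %g\n", err);
    return err < 1.0e-12 ? 0 : 1;
}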
+* + END +* + SUBROUTINE ZPRCN5(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, + $ N, K, ALPHA, LDA, LDB, BETA, LDC) + INTEGER NOUT, NC, IORDER, N, K, LDA, LDB, LDC + DOUBLE COMPLEX ALPHA, BETA + CHARACTER*1 UPLO, TRANSA + CHARACTER*12 SNAME + CHARACTER*14 CRC, CU, CA + + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA + WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, LDB, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') ) + 9994 FORMAT( 10X, 2( I3, ',' ), ' (', F4.1, ',', F4.1, '), A,', + $ I3, ', B', I3, ', (', F4.1, ',', F4.1, '), C,', I3, ').' ) + END +* +* + SUBROUTINE ZPRCN7(NOUT, NC, SNAME, IORDER, UPLO, TRANSA, + $ N, K, ALPHA, LDA, LDB, BETA, LDC) + INTEGER NOUT, NC, IORDER, N, K, LDA, LDB, LDC + DOUBLE COMPLEX ALPHA + DOUBLE PRECISION BETA + CHARACTER*1 UPLO, TRANSA + CHARACTER*12 SNAME + CHARACTER*14 CRC, CU, CA + + IF (UPLO.EQ.'U')THEN + CU = ' CblasUpper' + ELSE + CU = ' CblasLower' + END IF + IF (TRANSA.EQ.'N')THEN + CA = ' CblasNoTrans' + ELSE IF (TRANSA.EQ.'T')THEN + CA = ' CblasTrans' + ELSE + CA = 'CblasConjTrans' + END IF + IF (IORDER.EQ.1)THEN + CRC = ' CblasRowMajor' + ELSE + CRC = ' CblasColMajor' + END IF + WRITE(NOUT, FMT = 9995)NC, SNAME, CRC, CU, CA + WRITE(NOUT, FMT = 9994)N, K, ALPHA, LDA, LDB, BETA, LDC + + 9995 FORMAT( 1X, I6, ': ', A12,'(', 3( A14, ',') ) + 9994 FORMAT( 10X, 2( I3, ',' ), ' (', F4.1, ',', F4.1, '), A,', + $ I3, ', B', I3, ',', F4.1, ', C,', I3, ').' ) + END +* + SUBROUTINE ZMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, RESET, + $ TRANSL ) +* +* Generates values for an M by N matrix A. +* Stores the values in the array AA in the data structure required +* by the routine, with unwanted elements set to rogue value. +* +* TYPE is 'ge', 'he', 'sy' or 'tr'. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX*16 ZERO, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ ONE = ( 1.0D0, 0.0D0 ) ) + COMPLEX*16 ROGUE + PARAMETER ( ROGUE = ( -1.0D10, 1.0D10 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) + DOUBLE PRECISION RROGUE + PARAMETER ( RROGUE = -1.0D10 ) +* .. Scalar Arguments .. + COMPLEX*16 TRANSL + INTEGER LDA, M, N, NMAX + LOGICAL RESET + CHARACTER*1 DIAG, UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + COMPLEX*16 A( NMAX, * ), AA( * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J, JJ + LOGICAL GEN, HER, LOWER, SYM, TRI, UNIT, UPPER +* .. External Functions .. + COMPLEX*16 ZBEG + EXTERNAL ZBEG +* .. Intrinsic Functions .. + INTRINSIC DCMPLX, DCONJG, DBLE +* .. Executable Statements .. + GEN = TYPE.EQ.'ge' + HER = TYPE.EQ.'he' + SYM = TYPE.EQ.'sy' + TRI = TYPE.EQ.'tr' + UPPER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'U' + LOWER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'L' + UNIT = TRI.AND.DIAG.EQ.'U' +* +* Generate data in array A. 
+* + DO 20 J = 1, N + DO 10 I = 1, M + IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) + $ THEN + A( I, J ) = ZBEG( RESET ) + TRANSL + IF( I.NE.J )THEN +* Set some elements to zero + IF( N.GT.3.AND.J.EQ.N/2 ) + $ A( I, J ) = ZERO + IF( HER )THEN + A( J, I ) = DCONJG( A( I, J ) ) + ELSE IF( SYM )THEN + A( J, I ) = A( I, J ) + ELSE IF( TRI )THEN + A( J, I ) = ZERO + END IF + END IF + END IF + 10 CONTINUE + IF( HER ) + $ A( J, J ) = DCMPLX( DBLE( A( J, J ) ), RZERO ) + IF( TRI ) + $ A( J, J ) = A( J, J ) + ONE + IF( UNIT ) + $ A( J, J ) = ONE + 20 CONTINUE +* +* Store elements in array AS in data structure required by routine. +* + IF( TYPE.EQ.'ge' )THEN + DO 50 J = 1, N + DO 30 I = 1, M + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 30 CONTINUE + DO 40 I = M + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 40 CONTINUE + 50 CONTINUE + ELSE IF( TYPE.EQ.'he'.OR.TYPE.EQ.'sy'.OR.TYPE.EQ.'tr' )THEN + DO 90 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IF( UNIT )THEN + IEND = J - 1 + ELSE + IEND = J + END IF + ELSE + IF( UNIT )THEN + IBEG = J + 1 + ELSE + IBEG = J + END IF + IEND = N + END IF + DO 60 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 60 CONTINUE + DO 70 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 70 CONTINUE + DO 80 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 80 CONTINUE + IF( HER )THEN + JJ = J + ( J - 1 )*LDA + AA( JJ ) = DCMPLX( DBLE( AA( JJ ) ), RROGUE ) + END IF + 90 CONTINUE + END IF + RETURN +* +* End of ZMAKE. +* + END + SUBROUTINE ZMMCH( TRANSA, TRANSB, M, N, KK, ALPHA, A, LDA, B, LDB, + $ BETA, C, LDC, CT, G, CC, LDCC, EPS, ERR, FATAL, + $ NOUT, MV ) +* +* Checks the results of the computational tests. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO, RONE + PARAMETER ( RZERO = 0.0D0, RONE = 1.0D0 ) +* .. Scalar Arguments .. + COMPLEX*16 ALPHA, BETA + DOUBLE PRECISION EPS, ERR + INTEGER KK, LDA, LDB, LDC, LDCC, M, N, NOUT + LOGICAL FATAL, MV + CHARACTER*1 TRANSA, TRANSB +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), B( LDB, * ), C( LDC, * ), + $ CC( LDCC, * ), CT( * ) + DOUBLE PRECISION G( * ) +* .. Local Scalars .. + COMPLEX*16 CL + DOUBLE PRECISION ERRI + INTEGER I, J, K + LOGICAL CTRANA, CTRANB, TRANA, TRANB +* .. Intrinsic Functions .. + INTRINSIC ABS, DIMAG, DCONJG, MAX, DBLE, SQRT +* .. Statement Functions .. + DOUBLE PRECISION ABS1 +* .. Statement Function definitions .. + ABS1( CL ) = ABS( DBLE( CL ) ) + ABS( DIMAG( CL ) ) +* .. Executable Statements .. + TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' + TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' + CTRANA = TRANSA.EQ.'C' + CTRANB = TRANSB.EQ.'C' +* +* Compute expected result, one column at a time, in CT using data +* in A, B and C. +* Compute gauges in G. 
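+* The gauge G( I ) accumulates ABS1( ALPHA )*SUM ABS1( A )*ABS1( B )
+* plus ABS1( BETA )*ABS1( C( I, J ) ), so ERRI below is the error in
+* units of the expected rounding level; ERR*SQRT( EPS ).GE.RONE marks
+* a result that is less than half accurate.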
+* + DO 220 J = 1, N +* + DO 10 I = 1, M + CT( I ) = ZERO + G( I ) = RZERO + 10 CONTINUE + IF( .NOT.TRANA.AND..NOT.TRANB )THEN + DO 30 K = 1, KK + DO 20 I = 1, M + CT( I ) = CT( I ) + A( I, K )*B( K, J ) + G( I ) = G( I ) + ABS1( A( I, K ) )*ABS1( B( K, J ) ) + 20 CONTINUE + 30 CONTINUE + ELSE IF( TRANA.AND..NOT.TRANB )THEN + IF( CTRANA )THEN + DO 50 K = 1, KK + DO 40 I = 1, M + CT( I ) = CT( I ) + DCONJG( A( K, I ) )*B( K, J ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( K, J ) ) + 40 CONTINUE + 50 CONTINUE + ELSE + DO 70 K = 1, KK + DO 60 I = 1, M + CT( I ) = CT( I ) + A( K, I )*B( K, J ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( K, J ) ) + 60 CONTINUE + 70 CONTINUE + END IF + ELSE IF( .NOT.TRANA.AND.TRANB )THEN + IF( CTRANB )THEN + DO 90 K = 1, KK + DO 80 I = 1, M + CT( I ) = CT( I ) + A( I, K )*DCONJG( B( J, K ) ) + G( I ) = G( I ) + ABS1( A( I, K ) )* + $ ABS1( B( J, K ) ) + 80 CONTINUE + 90 CONTINUE + ELSE + DO 110 K = 1, KK + DO 100 I = 1, M + CT( I ) = CT( I ) + A( I, K )*B( J, K ) + G( I ) = G( I ) + ABS1( A( I, K ) )* + $ ABS1( B( J, K ) ) + 100 CONTINUE + 110 CONTINUE + END IF + ELSE IF( TRANA.AND.TRANB )THEN + IF( CTRANA )THEN + IF( CTRANB )THEN + DO 130 K = 1, KK + DO 120 I = 1, M + CT( I ) = CT( I ) + DCONJG( A( K, I ) )* + $ DCONJG( B( J, K ) ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 120 CONTINUE + 130 CONTINUE + ELSE + DO 150 K = 1, KK + DO 140 I = 1, M + CT( I ) = CT( I ) + DCONJG( A( K, I ) )* + $ B( J, K ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 140 CONTINUE + 150 CONTINUE + END IF + ELSE + IF( CTRANB )THEN + DO 170 K = 1, KK + DO 160 I = 1, M + CT( I ) = CT( I ) + A( K, I )* + $ DCONJG( B( J, K ) ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 160 CONTINUE + 170 CONTINUE + ELSE + DO 190 K = 1, KK + DO 180 I = 1, M + CT( I ) = CT( I ) + A( K, I )*B( J, K ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 180 CONTINUE + 190 CONTINUE + END IF + END IF + END IF + DO 200 I = 1, M + CT( I ) = ALPHA*CT( I ) + BETA*C( I, J ) + G( I ) = ABS1( ALPHA )*G( I ) + + $ ABS1( BETA )*ABS1( C( I, J ) ) + 200 CONTINUE +* +* Compute the error ratio for this result. +* + ERR = ZERO + DO 210 I = 1, M + ERRI = ABS1( CT( I ) - CC( I, J ) )/EPS + IF( G( I ).NE.RZERO ) + $ ERRI = ERRI/G( I ) + ERR = MAX( ERR, ERRI ) + IF( ERR*SQRT( EPS ).GE.RONE ) + $ GO TO 230 + 210 CONTINUE +* + 220 CONTINUE +* +* If the loop completes, all results are at least half accurate. + GO TO 250 +* +* Report fatal error. +* + 230 FATAL = .TRUE. + WRITE( NOUT, FMT = 9999 ) + DO 240 I = 1, M + IF( MV )THEN + WRITE( NOUT, FMT = 9998 )I, CT( I ), CC( I, J ) + ELSE + WRITE( NOUT, FMT = 9998 )I, CC( I, J ), CT( I ) + END IF + 240 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9997 )J +* + 250 CONTINUE + RETURN +* + 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', + $ 'F ACCURATE *******', /' EXPECTED RE', + $ 'SULT COMPUTED RESULT' ) + 9998 FORMAT( 1X, I7, 2( ' (', G15.6, ',', G15.6, ')' ) ) + 9997 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) +* +* End of ZMMCH. +* + END + LOGICAL FUNCTION LZE( RI, RJ, LR ) +* +* Tests if two arrays are identical. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER LR +* .. Array Arguments .. 
+ COMPLEX*16 RI( * ), RJ( * ) +* .. Local Scalars .. + INTEGER I +* .. Executable Statements .. + DO 10 I = 1, LR + IF( RI( I ).NE.RJ( I ) ) + $ GO TO 20 + 10 CONTINUE + LZE = .TRUE. + GO TO 30 + 20 CONTINUE + LZE = .FALSE. + 30 RETURN +* +* End of LZE. +* + END + LOGICAL FUNCTION LZERES( TYPE, UPLO, M, N, AA, AS, LDA ) +* +* Tests if selected elements in two arrays are equal. +* +* TYPE is 'ge' or 'he' or 'sy'. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER LDA, M, N + CHARACTER*1 UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + COMPLEX*16 AA( LDA, * ), AS( LDA, * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J + LOGICAL UPPER +* .. Executable Statements .. + UPPER = UPLO.EQ.'U' + IF( TYPE.EQ.'ge' )THEN + DO 20 J = 1, N + DO 10 I = M + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 10 CONTINUE + 20 CONTINUE + ELSE IF( TYPE.EQ.'he'.OR.TYPE.EQ.'sy' )THEN + DO 50 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 30 I = 1, IBEG - 1 + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 30 CONTINUE + DO 40 I = IEND + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 40 CONTINUE + 50 CONTINUE + END IF +* + 60 CONTINUE + LZERES = .TRUE. + GO TO 80 + 70 CONTINUE + LZERES = .FALSE. + 80 RETURN +* +* End of LZERES. +* + END + COMPLEX*16 FUNCTION ZBEG( RESET ) +* +* Generates complex numbers as pairs of random numbers uniformly +* distributed between -0.5 and 0.5. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + LOGICAL RESET +* .. Local Scalars .. + INTEGER I, IC, J, MI, MJ +* .. Save statement .. + SAVE I, IC, J, MI, MJ +* .. Intrinsic Functions .. + INTRINSIC DCMPLX +* .. Executable Statements .. + IF( RESET )THEN +* Initialize local variables. + MI = 891 + MJ = 457 + I = 7 + J = 7 + IC = 0 + RESET = .FALSE. + END IF +* +* The sequence of values of I or J is bounded between 1 and 999. +* If initial I or J = 1,2,3,6,7 or 9, the period will be 50. +* If initial I or J = 4 or 8, the period will be 25. +* If initial I or J = 5, the period will be 10. +* IC is used to break up the period by skipping 1 value of I or J +* in 6. +* + IC = IC + 1 + 10 I = I*MI + J = J*MJ + I = I - 1000*( I/1000 ) + J = J - 1000*( J/1000 ) + IF( IC.GE.5 )THEN + IC = 0 + GO TO 10 + END IF + ZBEG = DCMPLX( ( I - 500 )/1001.0D0, ( J - 500 )/1001.0D0 ) + RETURN +* +* End of ZBEG. +* + END + DOUBLE PRECISION FUNCTION DDIFF( X, Y ) +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + DOUBLE PRECISION X, Y +* .. Executable Statements .. + DDIFF = X - Y + RETURN +* +* End of DDIFF. 
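ZBEG above is the tester's only source of data: two coupled multiplicative sequences modulo 1000, mapped into ( -0.5, 0.5 ), with an extra advance every fifth call (the IC counter) to break up the short periods noted in its comments. Because RESET re-seeds the sequences at the start of each check routine, every run generates the same matrices, so failures are reproducible. The following is a direct C transcription, offered only as an illustration; the variable names mirror the Fortran.

/* Illustrative C transcription of ZBEG. */
#include <complex.h>
#include <stdio.h>

static double complex zbeg(int *reset)
{
    static int i, j, ic, mi, mj;

    if (*reset) {               /* mirror the Fortran initialization */
        mi = 891;
        mj = 457;
        i  = 7;
        j  = 7;
        ic = 0;
        *reset = 0;
    }
    ic = ic + 1;
    do {
        i = (i * mi) % 1000;    /* sequences stay in 1..999 */
        j = (j * mj) % 1000;
        if (ic < 5)
            break;
        ic = 0;                 /* every fifth call: skip one extra value */
    } while (1);
    return ((i - 500) / 1001.0) + ((j - 500) / 1001.0) * I;
}

int main(void)
{
    int reset = 1;
    for (int n = 0; n < 6; n++) {
        double complex z = zbeg(&reset);
        printf("(%8.5f, %8.5f)\n", creal(z), cimag(z));
    }
    return 0;
}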
+* + END + diff --git a/ctest/cblas_test.h b/ctest/cblas_test.h new file mode 100644 index 0000000000..53cb99f9ea --- /dev/null +++ b/ctest/cblas_test.h @@ -0,0 +1,514 @@ +/* + * cblas_test.h + * Written by Keita Teranishi + */ +#ifndef CBLAS_TEST_H +#define CBLAS_TEST_H +#include "cblas.h" + +#ifdef USE64BITINT +#define int long +#endif + +#define TRUE 1 +#define PASSED 1 +#define TEST_ROW_MJR 1 + +#define FALSE 0 +#define FAILED 0 +#define TEST_COL_MJR 0 + +#define INVALID -1 +#define UNDEFINED -1 + +typedef struct { float real; float imag; } CBLAS_TEST_COMPLEX; +typedef struct { double real; double imag; } CBLAS_TEST_ZOMPLEX; + +#if defined(ADD_) +/* + * Level 1 BLAS + */ + #define F77_srotg srotgtest_ + #define F77_srotmg srotmgtest_ + #define F77_srot srottest_ + #define F77_srotm srotmtest_ + #define F77_drotg drotgtest_ + #define F77_drotmg drotmgtest_ + #define F77_drot drottest_ + #define F77_drotm drotmtest_ + #define F77_sswap sswaptest_ + #define F77_scopy scopytest_ + #define F77_saxpy saxpytest_ + #define F77_isamax isamaxtest_ + #define F77_dswap dswaptest_ + #define F77_dcopy dcopytest_ + #define F77_daxpy daxpytest_ + #define F77_idamax idamaxtest_ + #define F77_cswap cswaptest_ + #define F77_ccopy ccopytest_ + #define F77_caxpy caxpytest_ + #define F77_icamax icamaxtest_ + #define F77_zswap zswaptest_ + #define F77_zcopy zcopytest_ + #define F77_zaxpy zaxpytest_ + #define F77_izamax izamaxtest_ + #define F77_sdot sdottest_ + #define F77_ddot ddottest_ + #define F77_dsdot dsdottest_ + #define F77_sscal sscaltest_ + #define F77_dscal dscaltest_ + #define F77_cscal cscaltest_ + #define F77_zscal zscaltest_ + #define F77_csscal csscaltest_ + #define F77_zdscal zdscaltest_ + #define F77_cdotu cdotutest_ + #define F77_cdotc cdotctest_ + #define F77_zdotu zdotutest_ + #define F77_zdotc zdotctest_ + #define F77_snrm2 snrm2test_ + #define F77_sasum sasumtest_ + #define F77_dnrm2 dnrm2test_ + #define F77_dasum dasumtest_ + #define F77_scnrm2 scnrm2test_ + #define F77_scasum scasumtest_ + #define F77_dznrm2 dznrm2test_ + #define F77_dzasum dzasumtest_ + #define F77_sdsdot sdsdottest_ +/* + * Level 2 BLAS + */ + #define F77_s2chke cs2chke_ + #define F77_d2chke cd2chke_ + #define F77_c2chke cc2chke_ + #define F77_z2chke cz2chke_ + #define F77_ssymv cssymv_ + #define F77_ssbmv cssbmv_ + #define F77_sspmv csspmv_ + #define F77_sger csger_ + #define F77_ssyr cssyr_ + #define F77_sspr csspr_ + #define F77_ssyr2 cssyr2_ + #define F77_sspr2 csspr2_ + #define F77_dsymv cdsymv_ + #define F77_dsbmv cdsbmv_ + #define F77_dspmv cdspmv_ + #define F77_dger cdger_ + #define F77_dsyr cdsyr_ + #define F77_dspr cdspr_ + #define F77_dsyr2 cdsyr2_ + #define F77_dspr2 cdspr2_ + #define F77_chemv cchemv_ + #define F77_chbmv cchbmv_ + #define F77_chpmv cchpmv_ + #define F77_cgeru ccgeru_ + #define F77_cgerc ccgerc_ + #define F77_cher ccher_ + #define F77_chpr cchpr_ + #define F77_cher2 ccher2_ + #define F77_chpr2 cchpr2_ + #define F77_zhemv czhemv_ + #define F77_zhbmv czhbmv_ + #define F77_zhpmv czhpmv_ + #define F77_zgeru czgeru_ + #define F77_zgerc czgerc_ + #define F77_zher czher_ + #define F77_zhpr czhpr_ + #define F77_zher2 czher2_ + #define F77_zhpr2 czhpr2_ + #define F77_sgemv csgemv_ + #define F77_sgbmv csgbmv_ + #define F77_strmv cstrmv_ + #define F77_stbmv cstbmv_ + #define F77_stpmv cstpmv_ + #define F77_strsv cstrsv_ + #define F77_stbsv cstbsv_ + #define F77_stpsv cstpsv_ + #define F77_dgemv cdgemv_ + #define F77_dgbmv cdgbmv_ + #define F77_dtrmv cdtrmv_ + #define F77_dtbmv cdtbmv_ + #define 
F77_dtpmv cdtpmv_ + #define F77_dtrsv cdtrsv_ + #define F77_dtbsv cdtbsv_ + #define F77_dtpsv cdtpsv_ + #define F77_cgemv ccgemv_ + #define F77_cgbmv ccgbmv_ + #define F77_ctrmv cctrmv_ + #define F77_ctbmv cctbmv_ + #define F77_ctpmv cctpmv_ + #define F77_ctrsv cctrsv_ + #define F77_ctbsv cctbsv_ + #define F77_ctpsv cctpsv_ + #define F77_zgemv czgemv_ + #define F77_zgbmv czgbmv_ + #define F77_ztrmv cztrmv_ + #define F77_ztbmv cztbmv_ + #define F77_ztpmv cztpmv_ + #define F77_ztrsv cztrsv_ + #define F77_ztbsv cztbsv_ + #define F77_ztpsv cztpsv_ +/* + * Level 3 BLAS + */ + #define F77_s3chke cs3chke_ + #define F77_d3chke cd3chke_ + #define F77_c3chke cc3chke_ + #define F77_z3chke cz3chke_ + #define F77_chemm cchemm_ + #define F77_cherk ccherk_ + #define F77_cher2k ccher2k_ + #define F77_zhemm czhemm_ + #define F77_zherk czherk_ + #define F77_zher2k czher2k_ + #define F77_sgemm csgemm_ + #define F77_ssymm cssymm_ + #define F77_ssyrk cssyrk_ + #define F77_ssyr2k cssyr2k_ + #define F77_strmm cstrmm_ + #define F77_strsm cstrsm_ + #define F77_dgemm cdgemm_ + #define F77_dsymm cdsymm_ + #define F77_dsyrk cdsyrk_ + #define F77_dsyr2k cdsyr2k_ + #define F77_dtrmm cdtrmm_ + #define F77_dtrsm cdtrsm_ + #define F77_cgemm ccgemm_ + #define F77_csymm ccsymm_ + #define F77_csyrk ccsyrk_ + #define F77_csyr2k ccsyr2k_ + #define F77_ctrmm cctrmm_ + #define F77_ctrsm cctrsm_ + #define F77_zgemm czgemm_ + #define F77_zsymm czsymm_ + #define F77_zsyrk czsyrk_ + #define F77_zsyr2k czsyr2k_ + #define F77_ztrmm cztrmm_ + #define F77_ztrsm cztrsm_ +#elif defined(UPCASE) +/* + * Level 1 BLAS + */ + #define F77_srotg SROTGTEST + #define F77_srotmg SROTMGTEST + #define F77_srot SROTCTEST + #define F77_srotm SROTMTEST + #define F77_drotg DROTGTEST + #define F77_drotmg DROTMGTEST + #define F77_drot DROTTEST + #define F77_drotm DROTMTEST + #define F77_sswap SSWAPTEST + #define F77_scopy SCOPYTEST + #define F77_saxpy SAXPYTEST + #define F77_isamax ISAMAXTEST + #define F77_dswap DSWAPTEST + #define F77_dcopy DCOPYTEST + #define F77_daxpy DAXPYTEST + #define F77_idamax IDAMAXTEST + #define F77_cswap CSWAPTEST + #define F77_ccopy CCOPYTEST + #define F77_caxpy CAXPYTEST + #define F77_icamax ICAMAXTEST + #define F77_zswap ZSWAPTEST + #define F77_zcopy ZCOPYTEST + #define F77_zaxpy ZAXPYTEST + #define F77_izamax IZAMAXTEST + #define F77_sdot SDOTTEST + #define F77_ddot DDOTTEST + #define F77_dsdot DSDOTTEST + #define F77_sscal SSCALTEST + #define F77_dscal DSCALTEST + #define F77_cscal CSCALTEST + #define F77_zscal ZSCALTEST + #define F77_csscal CSSCALTEST + #define F77_zdscal ZDSCALTEST + #define F77_cdotu CDOTUTEST + #define F77_cdotc CDOTCTEST + #define F77_zdotu ZDOTUTEST + #define F77_zdotc ZDOTCTEST + #define F77_snrm2 SNRM2TEST + #define F77_sasum SASUMTEST + #define F77_dnrm2 DNRM2TEST + #define F77_dasum DASUMTEST + #define F77_scnrm2 SCNRM2TEST + #define F77_scasum SCASUMTEST + #define F77_dznrm2 DZNRM2TEST + #define F77_dzasum DZASUMTEST + #define F77_sdsdot SDSDOTTEST +/* + * Level 2 BLAS + */ + #define F77_s2chke CS2CHKE + #define F77_d2chke CD2CHKE + #define F77_c2chke CC2CHKE + #define F77_z2chke CZ2CHKE + #define F77_ssymv CSSYMV + #define F77_ssbmv CSSBMV + #define F77_sspmv CSSPMV + #define F77_sger CSGER + #define F77_ssyr CSSYR + #define F77_sspr CSSPR + #define F77_ssyr2 CSSYR2 + #define F77_sspr2 CSSPR2 + #define F77_dsymv CDSYMV + #define F77_dsbmv CDSBMV + #define F77_dspmv CDSPMV + #define F77_dger CDGER + #define F77_dsyr CDSYR + #define F77_dspr CDSPR + #define F77_dsyr2 CDSYR2 + #define F77_dspr2 
CDSPR2 + #define F77_chemv CCHEMV + #define F77_chbmv CCHBMV + #define F77_chpmv CCHPMV + #define F77_cgeru CCGERU + #define F77_cgerc CCGERC + #define F77_cher CCHER + #define F77_chpr CCHPR + #define F77_cher2 CCHER2 + #define F77_chpr2 CCHPR2 + #define F77_zhemv CZHEMV + #define F77_zhbmv CZHBMV + #define F77_zhpmv CZHPMV + #define F77_zgeru CZGERU + #define F77_zgerc CZGERC + #define F77_zher CZHER + #define F77_zhpr CZHPR + #define F77_zher2 CZHER2 + #define F77_zhpr2 CZHPR2 + #define F77_sgemv CSGEMV + #define F77_sgbmv CSGBMV + #define F77_strmv CSTRMV + #define F77_stbmv CSTBMV + #define F77_stpmv CSTPMV + #define F77_strsv CSTRSV + #define F77_stbsv CSTBSV + #define F77_stpsv CSTPSV + #define F77_dgemv CDGEMV + #define F77_dgbmv CDGBMV + #define F77_dtrmv CDTRMV + #define F77_dtbmv CDTBMV + #define F77_dtpmv CDTPMV + #define F77_dtrsv CDTRSV + #define F77_dtbsv CDTBSV + #define F77_dtpsv CDTPSV + #define F77_cgemv CCGEMV + #define F77_cgbmv CCGBMV + #define F77_ctrmv CCTRMV + #define F77_ctbmv CCTBMV + #define F77_ctpmv CCTPMV + #define F77_ctrsv CCTRSV + #define F77_ctbsv CCTBSV + #define F77_ctpsv CCTPSV + #define F77_zgemv CZGEMV + #define F77_zgbmv CZGBMV + #define F77_ztrmv CZTRMV + #define F77_ztbmv CZTBMV + #define F77_ztpmv CZTPMV + #define F77_ztrsv CZTRSV + #define F77_ztbsv CZTBSV + #define F77_ztpsv CZTPSV +/* + * Level 3 BLAS + */ + #define F77_s3chke CS3CHKE + #define F77_d3chke CD3CHKE + #define F77_c3chke CC3CHKE + #define F77_z3chke CZ3CHKE + #define F77_chemm CCHEMM + #define F77_cherk CCHERK + #define F77_cher2k CCHER2K + #define F77_zhemm CZHEMM + #define F77_zherk CZHERK + #define F77_zher2k CZHER2K + #define F77_sgemm CSGEMM + #define F77_ssymm CSSYMM + #define F77_ssyrk CSSYRK + #define F77_ssyr2k CSSYR2K + #define F77_strmm CSTRMM + #define F77_strsm CSTRSM + #define F77_dgemm CDGEMM + #define F77_dsymm CDSYMM + #define F77_dsyrk CDSYRK + #define F77_dsyr2k CDSYR2K + #define F77_dtrmm CDTRMM + #define F77_dtrsm CDTRSM + #define F77_cgemm CCGEMM + #define F77_csymm CCSYMM + #define F77_csyrk CCSYRK + #define F77_csyr2k CCSYR2K + #define F77_ctrmm CCTRMM + #define F77_ctrsm CCTRSM + #define F77_zgemm CZGEMM + #define F77_zsymm CZSYMM + #define F77_zsyrk CZSYRK + #define F77_zsyr2k CZSYR2K + #define F77_ztrmm CZTRMM + #define F77_ztrsm CZTRSM +#elif defined(NOCHANGE) +/* + * Level 1 BLAS + */ + #define F77_srotg srotgtest + #define F77_srotmg srotmgtest + #define F77_srot srottest + #define F77_srotm srotmtest + #define F77_drotg drotgtest + #define F77_drotmg drotmgtest + #define F77_drot drottest + #define F77_drotm drotmtest + #define F77_sswap sswaptest + #define F77_scopy scopytest + #define F77_saxpy saxpytest + #define F77_isamax isamaxtest + #define F77_dswap dswaptest + #define F77_dcopy dcopytest + #define F77_daxpy daxpytest + #define F77_idamax idamaxtest + #define F77_cswap cswaptest + #define F77_ccopy ccopytest + #define F77_caxpy caxpytest + #define F77_icamax icamaxtest + #define F77_zswap zswaptest + #define F77_zcopy zcopytest + #define F77_zaxpy zaxpytest + #define F77_izamax izamaxtest + #define F77_sdot sdottest + #define F77_ddot ddottest + #define F77_dsdot dsdottest + #define F77_sscal sscaltest + #define F77_dscal dscaltest + #define F77_cscal cscaltest + #define F77_zscal zscaltest + #define F77_csscal csscaltest + #define F77_zdscal zdscaltest + #define F77_cdotu cdotutest + #define F77_cdotc cdotctest + #define F77_zdotu zdotutest + #define F77_zdotc zdotctest + #define F77_snrm2 snrm2test + #define F77_sasum sasumtest + #define 
F77_dnrm2 dnrm2test + #define F77_dasum dasumtest + #define F77_scnrm2 scnrm2test + #define F77_scasum scasumtest + #define F77_dznrm2 dznrm2test + #define F77_dzasum dzasumtest + #define F77_sdsdot sdsdottest +/* + * Level 2 BLAS + */ + #define F77_s2chke cs2chke + #define F77_d2chke cd2chke + #define F77_c2chke cc2chke + #define F77_z2chke cz2chke + #define F77_ssymv cssymv + #define F77_ssbmv cssbmv + #define F77_sspmv csspmv + #define F77_sger csger + #define F77_ssyr cssyr + #define F77_sspr csspr + #define F77_ssyr2 cssyr2 + #define F77_sspr2 csspr2 + #define F77_dsymv cdsymv + #define F77_dsbmv cdsbmv + #define F77_dspmv cdspmv + #define F77_dger cdger + #define F77_dsyr cdsyr + #define F77_dspr cdspr + #define F77_dsyr2 cdsyr2 + #define F77_dspr2 cdspr2 + #define F77_chemv cchemv + #define F77_chbmv cchbmv + #define F77_chpmv cchpmv + #define F77_cgeru ccgeru + #define F77_cgerc ccgerc + #define F77_cher ccher + #define F77_chpr cchpr + #define F77_cher2 ccher2 + #define F77_chpr2 cchpr2 + #define F77_zhemv czhemv + #define F77_zhbmv czhbmv + #define F77_zhpmv czhpmv + #define F77_zgeru czgeru + #define F77_zgerc czgerc + #define F77_zher czher + #define F77_zhpr czhpr + #define F77_zher2 czher2 + #define F77_zhpr2 czhpr2 + #define F77_sgemv csgemv + #define F77_sgbmv csgbmv + #define F77_strmv cstrmv + #define F77_stbmv cstbmv + #define F77_stpmv cstpmv + #define F77_strsv cstrsv + #define F77_stbsv cstbsv + #define F77_stpsv cstpsv + #define F77_dgemv cdgemv + #define F77_dgbmv cdgbmv + #define F77_dtrmv cdtrmv + #define F77_dtbmv cdtbmv + #define F77_dtpmv cdtpmv + #define F77_dtrsv cdtrsv + #define F77_dtbsv cdtbsv + #define F77_dtpsv cdtpsv + #define F77_cgemv ccgemv + #define F77_cgbmv ccgbmv + #define F77_ctrmv cctrmv + #define F77_ctbmv cctbmv + #define F77_ctpmv cctpmv + #define F77_ctrsv cctrsv + #define F77_ctbsv cctbsv + #define F77_ctpsv cctpsv + #define F77_zgemv czgemv + #define F77_zgbmv czgbmv + #define F77_ztrmv cztrmv + #define F77_ztbmv cztbmv + #define F77_ztpmv cztpmv + #define F77_ztrsv cztrsv + #define F77_ztbsv cztbsv + #define F77_ztpsv cztpsv +/* + * Level 3 BLAS + */ + #define F77_s3chke cs3chke + #define F77_d3chke cd3chke + #define F77_c3chke cc3chke + #define F77_z3chke cz3chke + #define F77_chemm cchemm + #define F77_cherk ccherk + #define F77_cher2k ccher2k + #define F77_zhemm czhemm + #define F77_zherk czherk + #define F77_zher2k czher2k + #define F77_sgemm csgemm + #define F77_ssymm cssymm + #define F77_ssyrk cssyrk + #define F77_ssyr2k cssyr2k + #define F77_strmm cstrmm + #define F77_strsm cstrsm + #define F77_dgemm cdgemm + #define F77_dsymm cdsymm + #define F77_dsyrk cdsyrk + #define F77_dsyr2k cdsyr2k + #define F77_dtrmm cdtrmm + #define F77_dtrsm cdtrsm + #define F77_cgemm ccgemm + #define F77_csymm ccsymm + #define F77_csyrk ccsyrk + #define F77_csyr2k ccsyr2k + #define F77_ctrmm cctrmm + #define F77_ctrsm cctrsm + #define F77_zgemm czgemm + #define F77_zsymm czsymm + #define F77_zsyrk czsyrk + #define F77_zsyr2k czsyr2k + #define F77_ztrmm cztrmm + #define F77_ztrsm cztrsm +#endif + +void get_transpose_type(char *type, enum CBLAS_TRANSPOSE *trans); +void get_uplo_type(char *type, enum CBLAS_UPLO *uplo); +void get_diag_type(char *type, enum CBLAS_DIAG *diag); +void get_side_type(char *type, enum CBLAS_SIDE *side); + +#endif /* CBLAS_TEST_H */ diff --git a/ctest/cin2 b/ctest/cin2 new file mode 100644 index 0000000000..032fcbb395 --- /dev/null +++ b/ctest/cin2 @@ -0,0 +1,34 @@ +'CBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE +-1 UNIT NUMBER OF 
SNAPSHOT FILE (NOT USED IF .LT. 0) +F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO TEST ERROR EXITS. +2 LOGICAL FLAG, T TO TEST ROW-MAJOR (IF FALSE COLUMN-MAJOR IS TESTED) +16.0 THRESHOLD VALUE OF TEST RATIO +7 NUMBER OF VALUES OF N +0 1 2 3 5 9 63 VALUES OF N +4 NUMBER OF VALUES OF K +0 1 2 4 VALUES OF K +4 NUMBER OF VALUES OF INCX AND INCY +1 2 -1 -2 VALUES OF INCX AND INCY +3 NUMBER OF VALUES OF ALPHA +(0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +3 NUMBER OF VALUES OF BETA +(0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +cblas_cgemv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_cgbmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_chemv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_chbmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_chpmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ctrmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ctbmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ctpmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ctrsv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ctbsv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ctpsv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_cgerc T PUT F FOR NO TEST. SAME COLUMNS. +cblas_cgeru T PUT F FOR NO TEST. SAME COLUMNS. +cblas_cher T PUT F FOR NO TEST. SAME COLUMNS. +cblas_chpr T PUT F FOR NO TEST. SAME COLUMNS. +cblas_cher2 T PUT F FOR NO TEST. SAME COLUMNS. +cblas_chpr2 T PUT F FOR NO TEST. SAME COLUMNS. diff --git a/ctest/cin3 b/ctest/cin3 new file mode 100644 index 0000000000..223d165db6 --- /dev/null +++ b/ctest/cin3 @@ -0,0 +1,22 @@ +'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO TEST ERROR EXITS. +2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH +16.0 THRESHOLD VALUE OF TEST RATIO +6 NUMBER OF VALUES OF N +0 1 2 3 5 9 35 VALUES OF N +3 NUMBER OF VALUES OF ALPHA +(0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +3 NUMBER OF VALUES OF BETA +(0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +cblas_cgemm T PUT F FOR NO TEST. SAME COLUMNS. +cblas_chemm T PUT F FOR NO TEST. SAME COLUMNS. +cblas_csymm T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ctrmm T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ctrsm T PUT F FOR NO TEST. SAME COLUMNS. +cblas_cherk T PUT F FOR NO TEST. SAME COLUMNS. +cblas_csyrk T PUT F FOR NO TEST. SAME COLUMNS. +cblas_cher2k T PUT F FOR NO TEST. SAME COLUMNS. +cblas_csyr2k T PUT F FOR NO TEST. SAME COLUMNS. diff --git a/ctest/constant.c b/ctest/constant.c new file mode 100644 index 0000000000..861d70bcc8 --- /dev/null +++ b/ctest/constant.c @@ -0,0 +1,3 @@ +int CBLAS_CallFromC; +int RowMajorStrg; + diff --git a/ctest/din2 b/ctest/din2 new file mode 100644 index 0000000000..6f42b27929 --- /dev/null +++ b/ctest/din2 @@ -0,0 +1,33 @@ +'DBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE +-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO TEST ERROR EXITS. +2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH +16.0 THRESHOLD VALUE OF TEST RATIO +7 NUMBER OF VALUES OF N +0 1 2 3 5 9 63 VALUES OF N +4 NUMBER OF VALUES OF K +0 1 2 4 VALUES OF K +4 NUMBER OF VALUES OF INCX AND INCY +1 2 -1 -2 VALUES OF INCX AND INCY +3 NUMBER OF VALUES OF ALPHA +0.0 1.0 0.7 VALUES OF ALPHA +3 NUMBER OF VALUES OF BETA +0.0 1.0 0.9 VALUES OF BETA +cblas_dgemv T PUT F FOR NO TEST. SAME COLUMNS. 
+cblas_dgbmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dsymv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dsbmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dspmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dtrmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dtbmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dtpmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dtrsv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dtbsv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dtpsv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dger T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dsyr T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dspr T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dsyr2 T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dspr2 T PUT F FOR NO TEST. SAME COLUMNS. diff --git a/ctest/din3 b/ctest/din3 new file mode 100644 index 0000000000..cbbcc22aba --- /dev/null +++ b/ctest/din3 @@ -0,0 +1,19 @@ +'DBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO TEST ERROR EXITS. +2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH +16.0 THRESHOLD VALUE OF TEST RATIO +6 NUMBER OF VALUES OF N +1 2 3 5 7 9 35 VALUES OF N +3 NUMBER OF VALUES OF ALPHA +0.0 1.0 0.7 VALUES OF ALPHA +3 NUMBER OF VALUES OF BETA +0.0 1.0 1.3 VALUES OF BETA +cblas_dgemm T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dsymm T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dtrmm T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dtrsm T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dsyrk T PUT F FOR NO TEST. SAME COLUMNS. +cblas_dsyr2k T PUT F FOR NO TEST. SAME COLUMNS. diff --git a/ctest/sin2 b/ctest/sin2 new file mode 100644 index 0000000000..3eee5c2f9f --- /dev/null +++ b/ctest/sin2 @@ -0,0 +1,33 @@ +'SBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE +-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO TEST ERROR EXITS. +2 LOGICAL FLAG, T TO TEST ROW-MAJOR (IF FALSE COLUMN-MAJOR IS TESTED) +16.0 THRESHOLD VALUE OF TEST RATIO +7 NUMBER OF VALUES OF N +0 1 2 3 5 9 63 VALUES OF N +4 NUMBER OF VALUES OF K +0 1 2 4 VALUES OF K +4 NUMBER OF VALUES OF INCX AND INCY +1 2 -1 -2 VALUES OF INCX AND INCY +3 NUMBER OF VALUES OF ALPHA +0.0 1.0 0.7 VALUES OF ALPHA +3 NUMBER OF VALUES OF BETA +0.0 1.0 0.9 VALUES OF BETA +cblas_sgemv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_sgbmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ssymv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ssbmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_sspmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_strmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_stbmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_stpmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_strsv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_stbsv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_stpsv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_sger T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ssyr T PUT F FOR NO TEST. SAME COLUMNS. +cblas_sspr T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ssyr2 T PUT F FOR NO TEST. SAME COLUMNS. +cblas_sspr2 T PUT F FOR NO TEST. SAME COLUMNS. diff --git a/ctest/sin3 b/ctest/sin3 new file mode 100644 index 0000000000..01e32d6ee9 --- /dev/null +++ b/ctest/sin3 @@ -0,0 +1,19 @@ +'SBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO TEST ERROR EXITS. 
+2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH +16.0 THRESHOLD VALUE OF TEST RATIO +6 NUMBER OF VALUES OF N +0 1 2 3 5 9 35 VALUES OF N +3 NUMBER OF VALUES OF ALPHA +0.0 1.0 0.7 VALUES OF ALPHA +3 NUMBER OF VALUES OF BETA +0.0 1.0 1.3 VALUES OF BETA +cblas_sgemm T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ssymm T PUT F FOR NO TEST. SAME COLUMNS. +cblas_strmm T PUT F FOR NO TEST. SAME COLUMNS. +cblas_strsm T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ssyrk T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ssyr2k T PUT F FOR NO TEST. SAME COLUMNS. diff --git a/ctest/zin2 b/ctest/zin2 new file mode 100644 index 0000000000..4c0affe92d --- /dev/null +++ b/ctest/zin2 @@ -0,0 +1,34 @@ +'ZBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE +-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO TEST ERROR EXITS. +2 LOGICAL FLAG, T TO TEST ROW-MAJOR (IF FALSE COLUMN-MAJOR IS TESTED) +16.0 THRESHOLD VALUE OF TEST RATIO +7 NUMBER OF VALUES OF N +0 1 2 3 5 9 63 VALUES OF N +4 NUMBER OF VALUES OF K +0 1 2 4 VALUES OF K +4 NUMBER OF VALUES OF INCX AND INCY +1 2 -1 -2 VALUES OF INCX AND INCY +3 NUMBER OF VALUES OF ALPHA +(0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +3 NUMBER OF VALUES OF BETA +(0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +cblas_zgemv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_zgbmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_zhemv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_zhbmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_zhpmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ztrmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ztbmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ztpmv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ztrsv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ztbsv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ztpsv T PUT F FOR NO TEST. SAME COLUMNS. +cblas_zgerc T PUT F FOR NO TEST. SAME COLUMNS. +cblas_zgeru T PUT F FOR NO TEST. SAME COLUMNS. +cblas_zher T PUT F FOR NO TEST. SAME COLUMNS. +cblas_zhpr T PUT F FOR NO TEST. SAME COLUMNS. +cblas_zher2 T PUT F FOR NO TEST. SAME COLUMNS. +cblas_zhpr2 T PUT F FOR NO TEST. SAME COLUMNS. diff --git a/ctest/zin3 b/ctest/zin3 new file mode 100644 index 0000000000..70050b6937 --- /dev/null +++ b/ctest/zin3 @@ -0,0 +1,22 @@ +'ZBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO TEST ERROR EXITS. +2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH +16.0 THRESHOLD VALUE OF TEST RATIO +7 NUMBER OF VALUES OF N +0 1 2 3 5 9 35 VALUES OF N +3 NUMBER OF VALUES OF ALPHA +(0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +3 NUMBER OF VALUES OF BETA +(0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +cblas_zgemm T PUT F FOR NO TEST. SAME COLUMNS. +cblas_zhemm T PUT F FOR NO TEST. SAME COLUMNS. +cblas_zsymm T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ztrmm T PUT F FOR NO TEST. SAME COLUMNS. +cblas_ztrsm T PUT F FOR NO TEST. SAME COLUMNS. +cblas_zherk T PUT F FOR NO TEST. SAME COLUMNS. +cblas_zsyrk T PUT F FOR NO TEST. SAME COLUMNS. +cblas_zher2k T PUT F FOR NO TEST. SAME COLUMNS. +cblas_zsyr2k T PUT F FOR NO TEST. SAME COLUMNS. 
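The ctest/*in2 and *in3 files above are parameter files for the CBLAS level-2 and level-3 testers: each line pairs a value with a free-form description, the 0/1/2 switch selects column-major, row-major, or both storage orders, and the trailing lines turn individual routines on (T) or off (F). As an illustration only (not part of the imported sources), the sketch below shows what that storage-order switch exercises, using the standard cblas_dgemm prototype declared in cblas.h; the matrix values are arbitrary.

    /* Illustrative sketch, not part of GotoBLAS2: the same 2x2 GEMM issued
       through both storage orders that the ctest inputs can select. */
    #include <stdio.h>
    #include "cblas.h"

    int main(void) {
      double A[4] = {1, 2, 3, 4};
      double B[4] = {5, 6, 7, 8};
      double C[4] = {0, 0, 0, 0};

      /* Column-major pass (input flag 0, or 2 for both). */
      cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
                  2, 2, 2, 1.0, A, 2, B, 2, 0.0, C, 2);

      /* Row-major pass (input flag 1, or 2 for both). */
      cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                  2, 2, 2, 1.0, A, 2, B, 2, 0.0, C, 2);

      printf("C[0] = %g\n", C[0]);
      return 0;
    }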
diff --git a/ctest1.c b/ctest1.c new file mode 100644 index 0000000000..5ab6338d1e --- /dev/null +++ b/ctest1.c @@ -0,0 +1 @@ +int hogehoge(void){return 0;} diff --git a/ctest2.c b/ctest2.c new file mode 100644 index 0000000000..f7e582f825 --- /dev/null +++ b/ctest2.c @@ -0,0 +1 @@ +int main(void){return 0;} diff --git a/driver/level2/Makefile b/driver/level2/Makefile new file mode 100644 index 0000000000..7043e52e14 --- /dev/null +++ b/driver/level2/Makefile @@ -0,0 +1,3618 @@ +TOPDIR = ../.. +include ../../Makefile.system + +SBLASOBJS = \ + sgbmv_n.$(SUFFIX) sgbmv_t.$(SUFFIX) \ + ssbmv_U.$(SUFFIX) ssbmv_L.$(SUFFIX) sspmv_U.$(SUFFIX) sspmv_L.$(SUFFIX) \ + sspr_U.$(SUFFIX) sspr_L.$(SUFFIX) sspr2_U.$(SUFFIX) sspr2_L.$(SUFFIX) \ + ssyr_U.$(SUFFIX) ssyr_L.$(SUFFIX) ssyr2_U.$(SUFFIX) ssyr2_L.$(SUFFIX) \ + stbmv_NUU.$(SUFFIX) stbmv_NUN.$(SUFFIX) stbmv_NLU.$(SUFFIX) stbmv_NLN.$(SUFFIX) \ + stbmv_TUU.$(SUFFIX) stbmv_TUN.$(SUFFIX) stbmv_TLU.$(SUFFIX) stbmv_TLN.$(SUFFIX) \ + stbsv_NUU.$(SUFFIX) stbsv_NUN.$(SUFFIX) stbsv_NLU.$(SUFFIX) stbsv_NLN.$(SUFFIX) \ + stbsv_TUU.$(SUFFIX) stbsv_TUN.$(SUFFIX) stbsv_TLU.$(SUFFIX) stbsv_TLN.$(SUFFIX) \ + stpmv_NUU.$(SUFFIX) stpmv_NUN.$(SUFFIX) stpmv_NLU.$(SUFFIX) stpmv_NLN.$(SUFFIX) \ + stpmv_TUU.$(SUFFIX) stpmv_TUN.$(SUFFIX) stpmv_TLU.$(SUFFIX) stpmv_TLN.$(SUFFIX) \ + stpsv_NUU.$(SUFFIX) stpsv_NUN.$(SUFFIX) stpsv_NLU.$(SUFFIX) stpsv_NLN.$(SUFFIX) \ + stpsv_TUU.$(SUFFIX) stpsv_TUN.$(SUFFIX) stpsv_TLU.$(SUFFIX) stpsv_TLN.$(SUFFIX) \ + strmv_NUU.$(SUFFIX) strmv_NUN.$(SUFFIX) strmv_NLU.$(SUFFIX) strmv_NLN.$(SUFFIX) \ + strmv_TUU.$(SUFFIX) strmv_TUN.$(SUFFIX) strmv_TLU.$(SUFFIX) strmv_TLN.$(SUFFIX) \ + strsv_NUU.$(SUFFIX) strsv_NUN.$(SUFFIX) strsv_NLU.$(SUFFIX) strsv_NLN.$(SUFFIX) \ + strsv_TUU.$(SUFFIX) strsv_TUN.$(SUFFIX) strsv_TLU.$(SUFFIX) strsv_TLN.$(SUFFIX) + +DBLASOBJS = \ + dgbmv_n.$(SUFFIX) dgbmv_t.$(SUFFIX) \ + dsbmv_U.$(SUFFIX) dsbmv_L.$(SUFFIX) dspmv_U.$(SUFFIX) dspmv_L.$(SUFFIX) \ + dspr_U.$(SUFFIX) dspr_L.$(SUFFIX) dspr2_U.$(SUFFIX) dspr2_L.$(SUFFIX) \ + dsyr_U.$(SUFFIX) dsyr_L.$(SUFFIX) dsyr2_U.$(SUFFIX) dsyr2_L.$(SUFFIX) \ + dtbmv_NUU.$(SUFFIX) dtbmv_NUN.$(SUFFIX) dtbmv_NLU.$(SUFFIX) dtbmv_NLN.$(SUFFIX) \ + dtbmv_TUU.$(SUFFIX) dtbmv_TUN.$(SUFFIX) dtbmv_TLU.$(SUFFIX) dtbmv_TLN.$(SUFFIX) \ + dtbsv_NUU.$(SUFFIX) dtbsv_NUN.$(SUFFIX) dtbsv_NLU.$(SUFFIX) dtbsv_NLN.$(SUFFIX) \ + dtbsv_TUU.$(SUFFIX) dtbsv_TUN.$(SUFFIX) dtbsv_TLU.$(SUFFIX) dtbsv_TLN.$(SUFFIX) \ + dtpmv_NUU.$(SUFFIX) dtpmv_NUN.$(SUFFIX) dtpmv_NLU.$(SUFFIX) dtpmv_NLN.$(SUFFIX) \ + dtpmv_TUU.$(SUFFIX) dtpmv_TUN.$(SUFFIX) dtpmv_TLU.$(SUFFIX) dtpmv_TLN.$(SUFFIX) \ + dtpsv_NUU.$(SUFFIX) dtpsv_NUN.$(SUFFIX) dtpsv_NLU.$(SUFFIX) dtpsv_NLN.$(SUFFIX) \ + dtpsv_TUU.$(SUFFIX) dtpsv_TUN.$(SUFFIX) dtpsv_TLU.$(SUFFIX) dtpsv_TLN.$(SUFFIX) \ + dtrmv_NUU.$(SUFFIX) dtrmv_NUN.$(SUFFIX) dtrmv_NLU.$(SUFFIX) dtrmv_NLN.$(SUFFIX) \ + dtrmv_TUU.$(SUFFIX) dtrmv_TUN.$(SUFFIX) dtrmv_TLU.$(SUFFIX) dtrmv_TLN.$(SUFFIX) \ + dtrsv_NUU.$(SUFFIX) dtrsv_NUN.$(SUFFIX) dtrsv_NLU.$(SUFFIX) dtrsv_NLN.$(SUFFIX) \ + dtrsv_TUU.$(SUFFIX) dtrsv_TUN.$(SUFFIX) dtrsv_TLU.$(SUFFIX) dtrsv_TLN.$(SUFFIX) + +QBLASOBJS = \ + qgbmv_n.$(SUFFIX) qgbmv_t.$(SUFFIX) \ + qsbmv_U.$(SUFFIX) qsbmv_L.$(SUFFIX) qspmv_U.$(SUFFIX) qspmv_L.$(SUFFIX) \ + qspr_U.$(SUFFIX) qspr_L.$(SUFFIX) qspr2_U.$(SUFFIX) qspr2_L.$(SUFFIX) \ + qsyr_U.$(SUFFIX) qsyr_L.$(SUFFIX) qsyr2_U.$(SUFFIX) qsyr2_L.$(SUFFIX) \ + qtbmv_NUU.$(SUFFIX) qtbmv_NUN.$(SUFFIX) qtbmv_NLU.$(SUFFIX) qtbmv_NLN.$(SUFFIX) \ + qtbmv_TUU.$(SUFFIX) qtbmv_TUN.$(SUFFIX) qtbmv_TLU.$(SUFFIX) qtbmv_TLN.$(SUFFIX) \ + 
qtbsv_NUU.$(SUFFIX) qtbsv_NUN.$(SUFFIX) qtbsv_NLU.$(SUFFIX) qtbsv_NLN.$(SUFFIX) \ + qtbsv_TUU.$(SUFFIX) qtbsv_TUN.$(SUFFIX) qtbsv_TLU.$(SUFFIX) qtbsv_TLN.$(SUFFIX) \ + qtpmv_NUU.$(SUFFIX) qtpmv_NUN.$(SUFFIX) qtpmv_NLU.$(SUFFIX) qtpmv_NLN.$(SUFFIX) \ + qtpmv_TUU.$(SUFFIX) qtpmv_TUN.$(SUFFIX) qtpmv_TLU.$(SUFFIX) qtpmv_TLN.$(SUFFIX) \ + qtpsv_NUU.$(SUFFIX) qtpsv_NUN.$(SUFFIX) qtpsv_NLU.$(SUFFIX) qtpsv_NLN.$(SUFFIX) \ + qtpsv_TUU.$(SUFFIX) qtpsv_TUN.$(SUFFIX) qtpsv_TLU.$(SUFFIX) qtpsv_TLN.$(SUFFIX) \ + qtrmv_NUU.$(SUFFIX) qtrmv_NUN.$(SUFFIX) qtrmv_NLU.$(SUFFIX) qtrmv_NLN.$(SUFFIX) \ + qtrmv_TUU.$(SUFFIX) qtrmv_TUN.$(SUFFIX) qtrmv_TLU.$(SUFFIX) qtrmv_TLN.$(SUFFIX) \ + qtrsv_NUU.$(SUFFIX) qtrsv_NUN.$(SUFFIX) qtrsv_NLU.$(SUFFIX) qtrsv_NLN.$(SUFFIX) \ + qtrsv_TUU.$(SUFFIX) qtrsv_TUN.$(SUFFIX) qtrsv_TLU.$(SUFFIX) qtrsv_TLN.$(SUFFIX) + +CBLASOBJS += \ + cgbmv_n.$(SUFFIX) cgbmv_t.$(SUFFIX) cgbmv_r.$(SUFFIX) cgbmv_c.$(SUFFIX) \ + cgbmv_o.$(SUFFIX) cgbmv_u.$(SUFFIX) cgbmv_s.$(SUFFIX) cgbmv_d.$(SUFFIX) \ + chbmv_U.$(SUFFIX) chbmv_L.$(SUFFIX) chbmv_V.$(SUFFIX) chbmv_M.$(SUFFIX) \ + cher_U.$(SUFFIX) cher_L.$(SUFFIX) cher_V.$(SUFFIX) cher_M.$(SUFFIX) \ + cher2_U.$(SUFFIX) cher2_L.$(SUFFIX) cher2_V.$(SUFFIX) cher2_M.$(SUFFIX) \ + chpmv_U.$(SUFFIX) chpmv_L.$(SUFFIX) chpmv_V.$(SUFFIX) chpmv_M.$(SUFFIX) \ + chpr_U.$(SUFFIX) chpr_L.$(SUFFIX) chpr_V.$(SUFFIX) chpr_M.$(SUFFIX) \ + chpr2_U.$(SUFFIX) chpr2_L.$(SUFFIX) chpr2_V.$(SUFFIX) chpr2_M.$(SUFFIX) \ + csbmv_U.$(SUFFIX) csbmv_L.$(SUFFIX) cspmv_U.$(SUFFIX) cspmv_L.$(SUFFIX) \ + cspr_U.$(SUFFIX) cspr_L.$(SUFFIX) cspr2_U.$(SUFFIX) cspr2_L.$(SUFFIX) \ + csyr_U.$(SUFFIX) csyr_L.$(SUFFIX) csyr2_U.$(SUFFIX) csyr2_L.$(SUFFIX) \ + ctbmv_NUU.$(SUFFIX) ctbmv_NUN.$(SUFFIX) ctbmv_NLU.$(SUFFIX) ctbmv_NLN.$(SUFFIX) \ + ctbmv_TUU.$(SUFFIX) ctbmv_TUN.$(SUFFIX) ctbmv_TLU.$(SUFFIX) ctbmv_TLN.$(SUFFIX) \ + ctbmv_RUU.$(SUFFIX) ctbmv_RUN.$(SUFFIX) ctbmv_RLU.$(SUFFIX) ctbmv_RLN.$(SUFFIX) \ + ctbmv_CUU.$(SUFFIX) ctbmv_CUN.$(SUFFIX) ctbmv_CLU.$(SUFFIX) ctbmv_CLN.$(SUFFIX) \ + ctbsv_NUU.$(SUFFIX) ctbsv_NUN.$(SUFFIX) ctbsv_NLU.$(SUFFIX) ctbsv_NLN.$(SUFFIX) \ + ctbsv_TUU.$(SUFFIX) ctbsv_TUN.$(SUFFIX) ctbsv_TLU.$(SUFFIX) ctbsv_TLN.$(SUFFIX) \ + ctbsv_RUU.$(SUFFIX) ctbsv_RUN.$(SUFFIX) ctbsv_RLU.$(SUFFIX) ctbsv_RLN.$(SUFFIX) \ + ctbsv_CUU.$(SUFFIX) ctbsv_CUN.$(SUFFIX) ctbsv_CLU.$(SUFFIX) ctbsv_CLN.$(SUFFIX) \ + ctpmv_NUU.$(SUFFIX) ctpmv_NUN.$(SUFFIX) ctpmv_NLU.$(SUFFIX) ctpmv_NLN.$(SUFFIX) \ + ctpmv_TUU.$(SUFFIX) ctpmv_TUN.$(SUFFIX) ctpmv_TLU.$(SUFFIX) ctpmv_TLN.$(SUFFIX) \ + ctpmv_RUU.$(SUFFIX) ctpmv_RUN.$(SUFFIX) ctpmv_RLU.$(SUFFIX) ctpmv_RLN.$(SUFFIX) \ + ctpmv_CUU.$(SUFFIX) ctpmv_CUN.$(SUFFIX) ctpmv_CLU.$(SUFFIX) ctpmv_CLN.$(SUFFIX) \ + ctpsv_NUU.$(SUFFIX) ctpsv_NUN.$(SUFFIX) ctpsv_NLU.$(SUFFIX) ctpsv_NLN.$(SUFFIX) \ + ctpsv_TUU.$(SUFFIX) ctpsv_TUN.$(SUFFIX) ctpsv_TLU.$(SUFFIX) ctpsv_TLN.$(SUFFIX) \ + ctpsv_RUU.$(SUFFIX) ctpsv_RUN.$(SUFFIX) ctpsv_RLU.$(SUFFIX) ctpsv_RLN.$(SUFFIX) \ + ctpsv_CUU.$(SUFFIX) ctpsv_CUN.$(SUFFIX) ctpsv_CLU.$(SUFFIX) ctpsv_CLN.$(SUFFIX) \ + ctrmv_NUU.$(SUFFIX) ctrmv_NUN.$(SUFFIX) ctrmv_NLU.$(SUFFIX) ctrmv_NLN.$(SUFFIX) \ + ctrmv_TUU.$(SUFFIX) ctrmv_TUN.$(SUFFIX) ctrmv_TLU.$(SUFFIX) ctrmv_TLN.$(SUFFIX) \ + ctrmv_RUU.$(SUFFIX) ctrmv_RUN.$(SUFFIX) ctrmv_RLU.$(SUFFIX) ctrmv_RLN.$(SUFFIX) \ + ctrmv_CUU.$(SUFFIX) ctrmv_CUN.$(SUFFIX) ctrmv_CLU.$(SUFFIX) ctrmv_CLN.$(SUFFIX) \ + ctrsv_NUU.$(SUFFIX) ctrsv_NUN.$(SUFFIX) ctrsv_NLU.$(SUFFIX) ctrsv_NLN.$(SUFFIX) \ + ctrsv_TUU.$(SUFFIX) ctrsv_TUN.$(SUFFIX) ctrsv_TLU.$(SUFFIX) ctrsv_TLN.$(SUFFIX) \ + ctrsv_RUU.$(SUFFIX) 
ctrsv_RUN.$(SUFFIX) ctrsv_RLU.$(SUFFIX) ctrsv_RLN.$(SUFFIX) \ + ctrsv_CUU.$(SUFFIX) ctrsv_CUN.$(SUFFIX) ctrsv_CLU.$(SUFFIX) ctrsv_CLN.$(SUFFIX) + +ZBLASOBJS += \ + zgbmv_n.$(SUFFIX) zgbmv_t.$(SUFFIX) zgbmv_r.$(SUFFIX) zgbmv_c.$(SUFFIX) \ + zgbmv_o.$(SUFFIX) zgbmv_u.$(SUFFIX) zgbmv_s.$(SUFFIX) zgbmv_d.$(SUFFIX) \ + zhbmv_U.$(SUFFIX) zhbmv_L.$(SUFFIX) zhbmv_V.$(SUFFIX) zhbmv_M.$(SUFFIX) \ + zher_U.$(SUFFIX) zher_L.$(SUFFIX) zher_V.$(SUFFIX) zher_M.$(SUFFIX) \ + zher2_U.$(SUFFIX) zher2_L.$(SUFFIX) zher2_V.$(SUFFIX) zher2_M.$(SUFFIX) \ + zhpmv_U.$(SUFFIX) zhpmv_L.$(SUFFIX) zhpmv_V.$(SUFFIX) zhpmv_M.$(SUFFIX) \ + zhpr_U.$(SUFFIX) zhpr_L.$(SUFFIX) zhpr_V.$(SUFFIX) zhpr_M.$(SUFFIX) \ + zhpr2_U.$(SUFFIX) zhpr2_L.$(SUFFIX) zhpr2_V.$(SUFFIX) zhpr2_M.$(SUFFIX) \ + zsbmv_U.$(SUFFIX) zsbmv_L.$(SUFFIX) zspmv_U.$(SUFFIX) zspmv_L.$(SUFFIX) \ + zspr_U.$(SUFFIX) zspr_L.$(SUFFIX) zspr2_U.$(SUFFIX) zspr2_L.$(SUFFIX) \ + zsyr_U.$(SUFFIX) zsyr_L.$(SUFFIX) zsyr2_U.$(SUFFIX) zsyr2_L.$(SUFFIX) \ + ztbmv_NUU.$(SUFFIX) ztbmv_NUN.$(SUFFIX) ztbmv_NLU.$(SUFFIX) ztbmv_NLN.$(SUFFIX) \ + ztbmv_TUU.$(SUFFIX) ztbmv_TUN.$(SUFFIX) ztbmv_TLU.$(SUFFIX) ztbmv_TLN.$(SUFFIX) \ + ztbmv_RUU.$(SUFFIX) ztbmv_RUN.$(SUFFIX) ztbmv_RLU.$(SUFFIX) ztbmv_RLN.$(SUFFIX) \ + ztbmv_CUU.$(SUFFIX) ztbmv_CUN.$(SUFFIX) ztbmv_CLU.$(SUFFIX) ztbmv_CLN.$(SUFFIX) \ + ztbsv_NUU.$(SUFFIX) ztbsv_NUN.$(SUFFIX) ztbsv_NLU.$(SUFFIX) ztbsv_NLN.$(SUFFIX) \ + ztbsv_TUU.$(SUFFIX) ztbsv_TUN.$(SUFFIX) ztbsv_TLU.$(SUFFIX) ztbsv_TLN.$(SUFFIX) \ + ztbsv_RUU.$(SUFFIX) ztbsv_RUN.$(SUFFIX) ztbsv_RLU.$(SUFFIX) ztbsv_RLN.$(SUFFIX) \ + ztbsv_CUU.$(SUFFIX) ztbsv_CUN.$(SUFFIX) ztbsv_CLU.$(SUFFIX) ztbsv_CLN.$(SUFFIX) \ + ztpmv_NUU.$(SUFFIX) ztpmv_NUN.$(SUFFIX) ztpmv_NLU.$(SUFFIX) ztpmv_NLN.$(SUFFIX) \ + ztpmv_TUU.$(SUFFIX) ztpmv_TUN.$(SUFFIX) ztpmv_TLU.$(SUFFIX) ztpmv_TLN.$(SUFFIX) \ + ztpmv_RUU.$(SUFFIX) ztpmv_RUN.$(SUFFIX) ztpmv_RLU.$(SUFFIX) ztpmv_RLN.$(SUFFIX) \ + ztpmv_CUU.$(SUFFIX) ztpmv_CUN.$(SUFFIX) ztpmv_CLU.$(SUFFIX) ztpmv_CLN.$(SUFFIX) \ + ztpsv_NUU.$(SUFFIX) ztpsv_NUN.$(SUFFIX) ztpsv_NLU.$(SUFFIX) ztpsv_NLN.$(SUFFIX) \ + ztpsv_TUU.$(SUFFIX) ztpsv_TUN.$(SUFFIX) ztpsv_TLU.$(SUFFIX) ztpsv_TLN.$(SUFFIX) \ + ztpsv_RUU.$(SUFFIX) ztpsv_RUN.$(SUFFIX) ztpsv_RLU.$(SUFFIX) ztpsv_RLN.$(SUFFIX) \ + ztpsv_CUU.$(SUFFIX) ztpsv_CUN.$(SUFFIX) ztpsv_CLU.$(SUFFIX) ztpsv_CLN.$(SUFFIX) \ + ztrmv_NUU.$(SUFFIX) ztrmv_NUN.$(SUFFIX) ztrmv_NLU.$(SUFFIX) ztrmv_NLN.$(SUFFIX) \ + ztrmv_TUU.$(SUFFIX) ztrmv_TUN.$(SUFFIX) ztrmv_TLU.$(SUFFIX) ztrmv_TLN.$(SUFFIX) \ + ztrmv_RUU.$(SUFFIX) ztrmv_RUN.$(SUFFIX) ztrmv_RLU.$(SUFFIX) ztrmv_RLN.$(SUFFIX) \ + ztrmv_CUU.$(SUFFIX) ztrmv_CUN.$(SUFFIX) ztrmv_CLU.$(SUFFIX) ztrmv_CLN.$(SUFFIX) \ + ztrsv_NUU.$(SUFFIX) ztrsv_NUN.$(SUFFIX) ztrsv_NLU.$(SUFFIX) ztrsv_NLN.$(SUFFIX) \ + ztrsv_TUU.$(SUFFIX) ztrsv_TUN.$(SUFFIX) ztrsv_TLU.$(SUFFIX) ztrsv_TLN.$(SUFFIX) \ + ztrsv_RUU.$(SUFFIX) ztrsv_RUN.$(SUFFIX) ztrsv_RLU.$(SUFFIX) ztrsv_RLN.$(SUFFIX) \ + ztrsv_CUU.$(SUFFIX) ztrsv_CUN.$(SUFFIX) ztrsv_CLU.$(SUFFIX) ztrsv_CLN.$(SUFFIX) + +XBLASOBJS += \ + xgbmv_n.$(SUFFIX) xgbmv_t.$(SUFFIX) xgbmv_r.$(SUFFIX) xgbmv_c.$(SUFFIX) \ + xgbmv_o.$(SUFFIX) xgbmv_u.$(SUFFIX) xgbmv_s.$(SUFFIX) xgbmv_d.$(SUFFIX) \ + xhbmv_U.$(SUFFIX) xhbmv_L.$(SUFFIX) xhbmv_V.$(SUFFIX) xhbmv_M.$(SUFFIX) \ + xher_U.$(SUFFIX) xher_L.$(SUFFIX) xher_V.$(SUFFIX) xher_M.$(SUFFIX) \ + xher2_U.$(SUFFIX) xher2_L.$(SUFFIX) xher2_V.$(SUFFIX) xher2_M.$(SUFFIX) \ + xhpmv_U.$(SUFFIX) xhpmv_L.$(SUFFIX) xhpmv_V.$(SUFFIX) xhpmv_M.$(SUFFIX) \ + xhpr_U.$(SUFFIX) xhpr_L.$(SUFFIX) xhpr_V.$(SUFFIX) xhpr_M.$(SUFFIX) \ + 
xhpr2_U.$(SUFFIX) xhpr2_L.$(SUFFIX) xhpr2_V.$(SUFFIX) xhpr2_M.$(SUFFIX) \ + xsbmv_U.$(SUFFIX) xsbmv_L.$(SUFFIX) xspmv_U.$(SUFFIX) xspmv_L.$(SUFFIX) \ + xspr_U.$(SUFFIX) xspr_L.$(SUFFIX) xspr2_U.$(SUFFIX) xspr2_L.$(SUFFIX) \ + xsyr_U.$(SUFFIX) xsyr_L.$(SUFFIX) xsyr2_U.$(SUFFIX) xsyr2_L.$(SUFFIX) \ + xtbmv_NUU.$(SUFFIX) xtbmv_NUN.$(SUFFIX) xtbmv_NLU.$(SUFFIX) xtbmv_NLN.$(SUFFIX) \ + xtbmv_TUU.$(SUFFIX) xtbmv_TUN.$(SUFFIX) xtbmv_TLU.$(SUFFIX) xtbmv_TLN.$(SUFFIX) \ + xtbmv_RUU.$(SUFFIX) xtbmv_RUN.$(SUFFIX) xtbmv_RLU.$(SUFFIX) xtbmv_RLN.$(SUFFIX) \ + xtbmv_CUU.$(SUFFIX) xtbmv_CUN.$(SUFFIX) xtbmv_CLU.$(SUFFIX) xtbmv_CLN.$(SUFFIX) \ + xtbsv_NUU.$(SUFFIX) xtbsv_NUN.$(SUFFIX) xtbsv_NLU.$(SUFFIX) xtbsv_NLN.$(SUFFIX) \ + xtbsv_TUU.$(SUFFIX) xtbsv_TUN.$(SUFFIX) xtbsv_TLU.$(SUFFIX) xtbsv_TLN.$(SUFFIX) \ + xtbsv_RUU.$(SUFFIX) xtbsv_RUN.$(SUFFIX) xtbsv_RLU.$(SUFFIX) xtbsv_RLN.$(SUFFIX) \ + xtbsv_CUU.$(SUFFIX) xtbsv_CUN.$(SUFFIX) xtbsv_CLU.$(SUFFIX) xtbsv_CLN.$(SUFFIX) \ + xtpmv_NUU.$(SUFFIX) xtpmv_NUN.$(SUFFIX) xtpmv_NLU.$(SUFFIX) xtpmv_NLN.$(SUFFIX) \ + xtpmv_TUU.$(SUFFIX) xtpmv_TUN.$(SUFFIX) xtpmv_TLU.$(SUFFIX) xtpmv_TLN.$(SUFFIX) \ + xtpmv_RUU.$(SUFFIX) xtpmv_RUN.$(SUFFIX) xtpmv_RLU.$(SUFFIX) xtpmv_RLN.$(SUFFIX) \ + xtpmv_CUU.$(SUFFIX) xtpmv_CUN.$(SUFFIX) xtpmv_CLU.$(SUFFIX) xtpmv_CLN.$(SUFFIX) \ + xtpsv_NUU.$(SUFFIX) xtpsv_NUN.$(SUFFIX) xtpsv_NLU.$(SUFFIX) xtpsv_NLN.$(SUFFIX) \ + xtpsv_TUU.$(SUFFIX) xtpsv_TUN.$(SUFFIX) xtpsv_TLU.$(SUFFIX) xtpsv_TLN.$(SUFFIX) \ + xtpsv_RUU.$(SUFFIX) xtpsv_RUN.$(SUFFIX) xtpsv_RLU.$(SUFFIX) xtpsv_RLN.$(SUFFIX) \ + xtpsv_CUU.$(SUFFIX) xtpsv_CUN.$(SUFFIX) xtpsv_CLU.$(SUFFIX) xtpsv_CLN.$(SUFFIX) \ + xtrmv_NUU.$(SUFFIX) xtrmv_NUN.$(SUFFIX) xtrmv_NLU.$(SUFFIX) xtrmv_NLN.$(SUFFIX) \ + xtrmv_TUU.$(SUFFIX) xtrmv_TUN.$(SUFFIX) xtrmv_TLU.$(SUFFIX) xtrmv_TLN.$(SUFFIX) \ + xtrmv_RUU.$(SUFFIX) xtrmv_RUN.$(SUFFIX) xtrmv_RLU.$(SUFFIX) xtrmv_RLN.$(SUFFIX) \ + xtrmv_CUU.$(SUFFIX) xtrmv_CUN.$(SUFFIX) xtrmv_CLU.$(SUFFIX) xtrmv_CLN.$(SUFFIX) \ + xtrsv_NUU.$(SUFFIX) xtrsv_NUN.$(SUFFIX) xtrsv_NLU.$(SUFFIX) xtrsv_NLN.$(SUFFIX) \ + xtrsv_TUU.$(SUFFIX) xtrsv_TUN.$(SUFFIX) xtrsv_TLU.$(SUFFIX) xtrsv_TLN.$(SUFFIX) \ + xtrsv_RUU.$(SUFFIX) xtrsv_RUN.$(SUFFIX) xtrsv_RLU.$(SUFFIX) xtrsv_RLN.$(SUFFIX) \ + xtrsv_CUU.$(SUFFIX) xtrsv_CUN.$(SUFFIX) xtrsv_CLU.$(SUFFIX) xtrsv_CLN.$(SUFFIX) + +HPLOBJS = \ + dtrsv_NLU.$(SUFFIX) dtrsv_NUN.$(SUFFIX) dtrsv_NUU.$(SUFFIX) dtrsv_NLN.$(SUFFIX) \ + dtrsv_TLN.$(SUFFIX) dtrsv_TLU.$(SUFFIX) dtrsv_TUN.$(SUFFIX) dtrsv_TUU.$(SUFFIX) + +ifdef SMP +SBLASOBJS += \ + sgemv_thread_n.$(SUFFIX) sgemv_thread_t.$(SUFFIX) \ + sger_thread.$(SUFFIX) \ + ssymv_thread_U.$(SUFFIX) ssymv_thread_L.$(SUFFIX) \ + ssyr_thread_U.$(SUFFIX) ssyr_thread_L.$(SUFFIX) \ + ssyr2_thread_U.$(SUFFIX) ssyr2_thread_L.$(SUFFIX) \ + sspr_thread_U.$(SUFFIX) sspr_thread_L.$(SUFFIX) \ + sspr2_thread_U.$(SUFFIX) sspr2_thread_L.$(SUFFIX) \ + strmv_thread_NUU.$(SUFFIX) strmv_thread_NUN.$(SUFFIX) \ + strmv_thread_NLU.$(SUFFIX) strmv_thread_NLN.$(SUFFIX) \ + strmv_thread_TUU.$(SUFFIX) strmv_thread_TUN.$(SUFFIX) \ + strmv_thread_TLU.$(SUFFIX) strmv_thread_TLN.$(SUFFIX) \ + sspmv_thread_U.$(SUFFIX) sspmv_thread_L.$(SUFFIX) \ + stpmv_thread_NUU.$(SUFFIX) stpmv_thread_NUN.$(SUFFIX) \ + stpmv_thread_NLU.$(SUFFIX) stpmv_thread_NLN.$(SUFFIX) \ + stpmv_thread_TUU.$(SUFFIX) stpmv_thread_TUN.$(SUFFIX) \ + stpmv_thread_TLU.$(SUFFIX) stpmv_thread_TLN.$(SUFFIX) \ + sgbmv_thread_n.$(SUFFIX) sgbmv_thread_t.$(SUFFIX) \ + ssbmv_thread_U.$(SUFFIX) ssbmv_thread_L.$(SUFFIX) \ + stbmv_thread_NUU.$(SUFFIX) stbmv_thread_NUN.$(SUFFIX) \ + 
stbmv_thread_NLU.$(SUFFIX) stbmv_thread_NLN.$(SUFFIX) \ + stbmv_thread_TUU.$(SUFFIX) stbmv_thread_TUN.$(SUFFIX) \ + stbmv_thread_TLU.$(SUFFIX) stbmv_thread_TLN.$(SUFFIX) \ + +DBLASOBJS += \ + dgemv_thread_n.$(SUFFIX) dgemv_thread_t.$(SUFFIX) \ + dger_thread.$(SUFFIX) \ + dsymv_thread_U.$(SUFFIX) dsymv_thread_L.$(SUFFIX) \ + dsyr_thread_U.$(SUFFIX) dsyr_thread_L.$(SUFFIX) \ + dsyr2_thread_U.$(SUFFIX) dsyr2_thread_L.$(SUFFIX) \ + dspr_thread_U.$(SUFFIX) dspr_thread_L.$(SUFFIX) \ + dspr2_thread_U.$(SUFFIX) dspr2_thread_L.$(SUFFIX) \ + dtrmv_thread_NUU.$(SUFFIX) dtrmv_thread_NUN.$(SUFFIX) \ + dtrmv_thread_NLU.$(SUFFIX) dtrmv_thread_NLN.$(SUFFIX) \ + dtrmv_thread_TUU.$(SUFFIX) dtrmv_thread_TUN.$(SUFFIX) \ + dtrmv_thread_TLU.$(SUFFIX) dtrmv_thread_TLN.$(SUFFIX) \ + dspmv_thread_U.$(SUFFIX) dspmv_thread_L.$(SUFFIX) \ + dtpmv_thread_NUU.$(SUFFIX) dtpmv_thread_NUN.$(SUFFIX) \ + dtpmv_thread_NLU.$(SUFFIX) dtpmv_thread_NLN.$(SUFFIX) \ + dtpmv_thread_TUU.$(SUFFIX) dtpmv_thread_TUN.$(SUFFIX) \ + dtpmv_thread_TLU.$(SUFFIX) dtpmv_thread_TLN.$(SUFFIX) \ + dgbmv_thread_n.$(SUFFIX) dgbmv_thread_t.$(SUFFIX) \ + dsbmv_thread_U.$(SUFFIX) dsbmv_thread_L.$(SUFFIX) \ + dtbmv_thread_NUU.$(SUFFIX) dtbmv_thread_NUN.$(SUFFIX) \ + dtbmv_thread_NLU.$(SUFFIX) dtbmv_thread_NLN.$(SUFFIX) \ + dtbmv_thread_TUU.$(SUFFIX) dtbmv_thread_TUN.$(SUFFIX) \ + dtbmv_thread_TLU.$(SUFFIX) dtbmv_thread_TLN.$(SUFFIX) \ + +QBLASOBJS += \ + qgemv_thread_n.$(SUFFIX) qgemv_thread_t.$(SUFFIX) \ + qger_thread.$(SUFFIX) \ + qsymv_thread_U.$(SUFFIX) qsymv_thread_L.$(SUFFIX) \ + qsyr_thread_U.$(SUFFIX) qsyr_thread_L.$(SUFFIX) \ + qsyr2_thread_U.$(SUFFIX) qsyr2_thread_L.$(SUFFIX) \ + qspr_thread_U.$(SUFFIX) qspr_thread_L.$(SUFFIX) \ + qspr2_thread_U.$(SUFFIX) qspr2_thread_L.$(SUFFIX) \ + qtrmv_thread_NUU.$(SUFFIX) qtrmv_thread_NUN.$(SUFFIX) \ + qtrmv_thread_NLU.$(SUFFIX) qtrmv_thread_NLN.$(SUFFIX) \ + qtrmv_thread_TUU.$(SUFFIX) qtrmv_thread_TUN.$(SUFFIX) \ + qtrmv_thread_TLU.$(SUFFIX) qtrmv_thread_TLN.$(SUFFIX) \ + qspmv_thread_U.$(SUFFIX) qspmv_thread_L.$(SUFFIX) \ + qtpmv_thread_NUU.$(SUFFIX) qtpmv_thread_NUN.$(SUFFIX) \ + qtpmv_thread_NLU.$(SUFFIX) qtpmv_thread_NLN.$(SUFFIX) \ + qtpmv_thread_TUU.$(SUFFIX) qtpmv_thread_TUN.$(SUFFIX) \ + qtpmv_thread_TLU.$(SUFFIX) qtpmv_thread_TLN.$(SUFFIX) \ + qgbmv_thread_n.$(SUFFIX) qgbmv_thread_t.$(SUFFIX) \ + qsbmv_thread_U.$(SUFFIX) qsbmv_thread_L.$(SUFFIX) \ + qtbmv_thread_NUU.$(SUFFIX) qtbmv_thread_NUN.$(SUFFIX) \ + qtbmv_thread_NLU.$(SUFFIX) qtbmv_thread_NLN.$(SUFFIX) \ + qtbmv_thread_TUU.$(SUFFIX) qtbmv_thread_TUN.$(SUFFIX) \ + qtbmv_thread_TLU.$(SUFFIX) qtbmv_thread_TLN.$(SUFFIX) \ + +CBLASOBJS += \ + cgemv_thread_n.$(SUFFIX) cgemv_thread_t.$(SUFFIX) \ + cgemv_thread_r.$(SUFFIX) cgemv_thread_c.$(SUFFIX) \ + cgemv_thread_o.$(SUFFIX) cgemv_thread_u.$(SUFFIX) \ + cgemv_thread_s.$(SUFFIX) cgemv_thread_d.$(SUFFIX) \ + cger_thread_U.$(SUFFIX) cger_thread_C.$(SUFFIX) \ + cger_thread_V.$(SUFFIX) cger_thread_D.$(SUFFIX) \ + csymv_thread_U.$(SUFFIX) csymv_thread_L.$(SUFFIX) \ + chemv_thread_U.$(SUFFIX) chemv_thread_L.$(SUFFIX) \ + chemv_thread_V.$(SUFFIX) chemv_thread_M.$(SUFFIX) \ + csyr_thread_U.$(SUFFIX) csyr_thread_L.$(SUFFIX) \ + cher_thread_U.$(SUFFIX) cher_thread_L.$(SUFFIX) \ + cher_thread_V.$(SUFFIX) cher_thread_M.$(SUFFIX) \ + csyr2_thread_U.$(SUFFIX) csyr2_thread_L.$(SUFFIX) \ + cher2_thread_U.$(SUFFIX) cher2_thread_L.$(SUFFIX) \ + cher2_thread_V.$(SUFFIX) cher2_thread_M.$(SUFFIX) \ + cspr_thread_U.$(SUFFIX) cspr_thread_L.$(SUFFIX) \ + chpr_thread_U.$(SUFFIX) chpr_thread_L.$(SUFFIX) \ + 
chpr_thread_V.$(SUFFIX) chpr_thread_M.$(SUFFIX) \ + cspr2_thread_U.$(SUFFIX) cspr2_thread_L.$(SUFFIX) \ + chpr2_thread_U.$(SUFFIX) chpr2_thread_L.$(SUFFIX) \ + chpr2_thread_V.$(SUFFIX) chpr2_thread_M.$(SUFFIX) \ + ctrmv_thread_NUU.$(SUFFIX) ctrmv_thread_NUN.$(SUFFIX) \ + ctrmv_thread_NLU.$(SUFFIX) ctrmv_thread_NLN.$(SUFFIX) \ + ctrmv_thread_TUU.$(SUFFIX) ctrmv_thread_TUN.$(SUFFIX) \ + ctrmv_thread_TLU.$(SUFFIX) ctrmv_thread_TLN.$(SUFFIX) \ + ctrmv_thread_RUU.$(SUFFIX) ctrmv_thread_RUN.$(SUFFIX) \ + ctrmv_thread_RLU.$(SUFFIX) ctrmv_thread_RLN.$(SUFFIX) \ + ctrmv_thread_CUU.$(SUFFIX) ctrmv_thread_CUN.$(SUFFIX) \ + ctrmv_thread_CLU.$(SUFFIX) ctrmv_thread_CLN.$(SUFFIX) \ + cspmv_thread_U.$(SUFFIX) cspmv_thread_L.$(SUFFIX) \ + chpmv_thread_U.$(SUFFIX) chpmv_thread_L.$(SUFFIX) \ + chpmv_thread_V.$(SUFFIX) chpmv_thread_M.$(SUFFIX) \ + ctpmv_thread_NUU.$(SUFFIX) ctpmv_thread_NUN.$(SUFFIX) \ + ctpmv_thread_NLU.$(SUFFIX) ctpmv_thread_NLN.$(SUFFIX) \ + ctpmv_thread_TUU.$(SUFFIX) ctpmv_thread_TUN.$(SUFFIX) \ + ctpmv_thread_TLU.$(SUFFIX) ctpmv_thread_TLN.$(SUFFIX) \ + ctpmv_thread_RUU.$(SUFFIX) ctpmv_thread_RUN.$(SUFFIX) \ + ctpmv_thread_RLU.$(SUFFIX) ctpmv_thread_RLN.$(SUFFIX) \ + ctpmv_thread_CUU.$(SUFFIX) ctpmv_thread_CUN.$(SUFFIX) \ + ctpmv_thread_CLU.$(SUFFIX) ctpmv_thread_CLN.$(SUFFIX) \ + cgbmv_thread_n.$(SUFFIX) cgbmv_thread_t.$(SUFFIX) \ + cgbmv_thread_r.$(SUFFIX) cgbmv_thread_c.$(SUFFIX) \ + cgbmv_thread_o.$(SUFFIX) cgbmv_thread_u.$(SUFFIX) \ + cgbmv_thread_s.$(SUFFIX) cgbmv_thread_d.$(SUFFIX) \ + csbmv_thread_U.$(SUFFIX) csbmv_thread_L.$(SUFFIX) \ + chbmv_thread_U.$(SUFFIX) chbmv_thread_L.$(SUFFIX) \ + chbmv_thread_V.$(SUFFIX) chbmv_thread_M.$(SUFFIX) \ + ctbmv_thread_NUU.$(SUFFIX) ctbmv_thread_NUN.$(SUFFIX) \ + ctbmv_thread_NLU.$(SUFFIX) ctbmv_thread_NLN.$(SUFFIX) \ + ctbmv_thread_TUU.$(SUFFIX) ctbmv_thread_TUN.$(SUFFIX) \ + ctbmv_thread_TLU.$(SUFFIX) ctbmv_thread_TLN.$(SUFFIX) \ + ctbmv_thread_RUU.$(SUFFIX) ctbmv_thread_RUN.$(SUFFIX) \ + ctbmv_thread_RLU.$(SUFFIX) ctbmv_thread_RLN.$(SUFFIX) \ + ctbmv_thread_CUU.$(SUFFIX) ctbmv_thread_CUN.$(SUFFIX) \ + ctbmv_thread_CLU.$(SUFFIX) ctbmv_thread_CLN.$(SUFFIX) \ + + +ZBLASOBJS += \ + zgemv_thread_n.$(SUFFIX) zgemv_thread_t.$(SUFFIX) \ + zgemv_thread_r.$(SUFFIX) zgemv_thread_c.$(SUFFIX) \ + zgemv_thread_o.$(SUFFIX) zgemv_thread_u.$(SUFFIX) \ + zgemv_thread_s.$(SUFFIX) zgemv_thread_d.$(SUFFIX) \ + zger_thread_U.$(SUFFIX) zger_thread_C.$(SUFFIX) \ + zger_thread_V.$(SUFFIX) zger_thread_D.$(SUFFIX) \ + zsymv_thread_U.$(SUFFIX) zsymv_thread_L.$(SUFFIX) \ + zhemv_thread_U.$(SUFFIX) zhemv_thread_L.$(SUFFIX) \ + zhemv_thread_V.$(SUFFIX) zhemv_thread_M.$(SUFFIX) \ + zsyr_thread_U.$(SUFFIX) zsyr_thread_L.$(SUFFIX) \ + zher_thread_U.$(SUFFIX) zher_thread_L.$(SUFFIX) \ + zher_thread_V.$(SUFFIX) zher_thread_M.$(SUFFIX) \ + zsyr2_thread_U.$(SUFFIX) zsyr2_thread_L.$(SUFFIX) \ + zher2_thread_U.$(SUFFIX) zher2_thread_L.$(SUFFIX) \ + zher2_thread_V.$(SUFFIX) zher2_thread_M.$(SUFFIX) \ + zspr_thread_U.$(SUFFIX) zspr_thread_L.$(SUFFIX) \ + zhpr_thread_U.$(SUFFIX) zhpr_thread_L.$(SUFFIX) \ + zhpr_thread_V.$(SUFFIX) zhpr_thread_M.$(SUFFIX) \ + zspr2_thread_U.$(SUFFIX) zspr2_thread_L.$(SUFFIX) \ + zhpr2_thread_U.$(SUFFIX) zhpr2_thread_L.$(SUFFIX) \ + zhpr2_thread_V.$(SUFFIX) zhpr2_thread_M.$(SUFFIX) \ + ztrmv_thread_NUU.$(SUFFIX) ztrmv_thread_NUN.$(SUFFIX) \ + ztrmv_thread_NLU.$(SUFFIX) ztrmv_thread_NLN.$(SUFFIX) \ + ztrmv_thread_TUU.$(SUFFIX) ztrmv_thread_TUN.$(SUFFIX) \ + ztrmv_thread_TLU.$(SUFFIX) ztrmv_thread_TLN.$(SUFFIX) \ + ztrmv_thread_RUU.$(SUFFIX) 
ztrmv_thread_RUN.$(SUFFIX) \ + ztrmv_thread_RLU.$(SUFFIX) ztrmv_thread_RLN.$(SUFFIX) \ + ztrmv_thread_CUU.$(SUFFIX) ztrmv_thread_CUN.$(SUFFIX) \ + ztrmv_thread_CLU.$(SUFFIX) ztrmv_thread_CLN.$(SUFFIX) \ + zspmv_thread_U.$(SUFFIX) zspmv_thread_L.$(SUFFIX) \ + zhpmv_thread_U.$(SUFFIX) zhpmv_thread_L.$(SUFFIX) \ + zhpmv_thread_V.$(SUFFIX) zhpmv_thread_M.$(SUFFIX) \ + ztpmv_thread_NUU.$(SUFFIX) ztpmv_thread_NUN.$(SUFFIX) \ + ztpmv_thread_NLU.$(SUFFIX) ztpmv_thread_NLN.$(SUFFIX) \ + ztpmv_thread_TUU.$(SUFFIX) ztpmv_thread_TUN.$(SUFFIX) \ + ztpmv_thread_TLU.$(SUFFIX) ztpmv_thread_TLN.$(SUFFIX) \ + ztpmv_thread_RUU.$(SUFFIX) ztpmv_thread_RUN.$(SUFFIX) \ + ztpmv_thread_RLU.$(SUFFIX) ztpmv_thread_RLN.$(SUFFIX) \ + ztpmv_thread_CUU.$(SUFFIX) ztpmv_thread_CUN.$(SUFFIX) \ + ztpmv_thread_CLU.$(SUFFIX) ztpmv_thread_CLN.$(SUFFIX) \ + zgbmv_thread_n.$(SUFFIX) zgbmv_thread_t.$(SUFFIX) \ + zgbmv_thread_r.$(SUFFIX) zgbmv_thread_c.$(SUFFIX) \ + zgbmv_thread_o.$(SUFFIX) zgbmv_thread_u.$(SUFFIX) \ + zgbmv_thread_s.$(SUFFIX) zgbmv_thread_d.$(SUFFIX) \ + zsbmv_thread_U.$(SUFFIX) zsbmv_thread_L.$(SUFFIX) \ + zhbmv_thread_U.$(SUFFIX) zhbmv_thread_L.$(SUFFIX) \ + zhbmv_thread_V.$(SUFFIX) zhbmv_thread_M.$(SUFFIX) \ + ztbmv_thread_NUU.$(SUFFIX) ztbmv_thread_NUN.$(SUFFIX) \ + ztbmv_thread_NLU.$(SUFFIX) ztbmv_thread_NLN.$(SUFFIX) \ + ztbmv_thread_TUU.$(SUFFIX) ztbmv_thread_TUN.$(SUFFIX) \ + ztbmv_thread_TLU.$(SUFFIX) ztbmv_thread_TLN.$(SUFFIX) \ + ztbmv_thread_RUU.$(SUFFIX) ztbmv_thread_RUN.$(SUFFIX) \ + ztbmv_thread_RLU.$(SUFFIX) ztbmv_thread_RLN.$(SUFFIX) \ + ztbmv_thread_CUU.$(SUFFIX) ztbmv_thread_CUN.$(SUFFIX) \ + ztbmv_thread_CLU.$(SUFFIX) ztbmv_thread_CLN.$(SUFFIX) \ + +XBLASOBJS += \ + xgemv_thread_n.$(SUFFIX) xgemv_thread_t.$(SUFFIX) \ + xgemv_thread_r.$(SUFFIX) xgemv_thread_c.$(SUFFIX) \ + xgemv_thread_o.$(SUFFIX) xgemv_thread_u.$(SUFFIX) \ + xgemv_thread_s.$(SUFFIX) xgemv_thread_d.$(SUFFIX) \ + xger_thread_U.$(SUFFIX) xger_thread_C.$(SUFFIX) \ + xger_thread_V.$(SUFFIX) xger_thread_D.$(SUFFIX) \ + xsymv_thread_U.$(SUFFIX) xsymv_thread_L.$(SUFFIX) \ + xhemv_thread_U.$(SUFFIX) xhemv_thread_L.$(SUFFIX) \ + xhemv_thread_V.$(SUFFIX) xhemv_thread_M.$(SUFFIX) \ + xsyr_thread_U.$(SUFFIX) xsyr_thread_L.$(SUFFIX) \ + xher_thread_U.$(SUFFIX) xher_thread_L.$(SUFFIX) \ + xher_thread_V.$(SUFFIX) xher_thread_M.$(SUFFIX) \ + xsyr2_thread_U.$(SUFFIX) xsyr2_thread_L.$(SUFFIX) \ + xher2_thread_U.$(SUFFIX) xher2_thread_L.$(SUFFIX) \ + xher2_thread_V.$(SUFFIX) xher2_thread_M.$(SUFFIX) \ + xspr_thread_U.$(SUFFIX) xspr_thread_L.$(SUFFIX) \ + xhpr_thread_U.$(SUFFIX) xhpr_thread_L.$(SUFFIX) \ + xhpr_thread_V.$(SUFFIX) xhpr_thread_M.$(SUFFIX) \ + xspr2_thread_U.$(SUFFIX) xspr2_thread_L.$(SUFFIX) \ + xhpr2_thread_U.$(SUFFIX) xhpr2_thread_L.$(SUFFIX) \ + xhpr2_thread_V.$(SUFFIX) xhpr2_thread_M.$(SUFFIX) \ + xtrmv_thread_NUU.$(SUFFIX) xtrmv_thread_NUN.$(SUFFIX) \ + xtrmv_thread_NLU.$(SUFFIX) xtrmv_thread_NLN.$(SUFFIX) \ + xtrmv_thread_TUU.$(SUFFIX) xtrmv_thread_TUN.$(SUFFIX) \ + xtrmv_thread_TLU.$(SUFFIX) xtrmv_thread_TLN.$(SUFFIX) \ + xtrmv_thread_RUU.$(SUFFIX) xtrmv_thread_RUN.$(SUFFIX) \ + xtrmv_thread_RLU.$(SUFFIX) xtrmv_thread_RLN.$(SUFFIX) \ + xtrmv_thread_CUU.$(SUFFIX) xtrmv_thread_CUN.$(SUFFIX) \ + xtrmv_thread_CLU.$(SUFFIX) xtrmv_thread_CLN.$(SUFFIX) \ + xspmv_thread_U.$(SUFFIX) xspmv_thread_L.$(SUFFIX) \ + xhpmv_thread_U.$(SUFFIX) xhpmv_thread_L.$(SUFFIX) \ + xhpmv_thread_V.$(SUFFIX) xhpmv_thread_M.$(SUFFIX) \ + xtpmv_thread_NUU.$(SUFFIX) xtpmv_thread_NUN.$(SUFFIX) \ + xtpmv_thread_NLU.$(SUFFIX) xtpmv_thread_NLN.$(SUFFIX) \ + 
xtpmv_thread_TUU.$(SUFFIX) xtpmv_thread_TUN.$(SUFFIX) \ + xtpmv_thread_TLU.$(SUFFIX) xtpmv_thread_TLN.$(SUFFIX) \ + xtpmv_thread_RUU.$(SUFFIX) xtpmv_thread_RUN.$(SUFFIX) \ + xtpmv_thread_RLU.$(SUFFIX) xtpmv_thread_RLN.$(SUFFIX) \ + xtpmv_thread_CUU.$(SUFFIX) xtpmv_thread_CUN.$(SUFFIX) \ + xtpmv_thread_CLU.$(SUFFIX) xtpmv_thread_CLN.$(SUFFIX) \ + xgbmv_thread_n.$(SUFFIX) xgbmv_thread_t.$(SUFFIX) \ + xgbmv_thread_r.$(SUFFIX) xgbmv_thread_c.$(SUFFIX) \ + xgbmv_thread_o.$(SUFFIX) xgbmv_thread_u.$(SUFFIX) \ + xgbmv_thread_s.$(SUFFIX) xgbmv_thread_d.$(SUFFIX) \ + xsbmv_thread_U.$(SUFFIX) xsbmv_thread_L.$(SUFFIX) \ + xhbmv_thread_U.$(SUFFIX) xhbmv_thread_L.$(SUFFIX) \ + xhbmv_thread_V.$(SUFFIX) xhbmv_thread_M.$(SUFFIX) \ + xtbmv_thread_NUU.$(SUFFIX) xtbmv_thread_NUN.$(SUFFIX) \ + xtbmv_thread_NLU.$(SUFFIX) xtbmv_thread_NLN.$(SUFFIX) \ + xtbmv_thread_TUU.$(SUFFIX) xtbmv_thread_TUN.$(SUFFIX) \ + xtbmv_thread_TLU.$(SUFFIX) xtbmv_thread_TLN.$(SUFFIX) \ + xtbmv_thread_RUU.$(SUFFIX) xtbmv_thread_RUN.$(SUFFIX) \ + xtbmv_thread_RLU.$(SUFFIX) xtbmv_thread_RLN.$(SUFFIX) \ + xtbmv_thread_CUU.$(SUFFIX) xtbmv_thread_CUN.$(SUFFIX) \ + xtbmv_thread_CLU.$(SUFFIX) xtbmv_thread_CLN.$(SUFFIX) \ + +endif + +all :: + +sgbmv_n.$(SUFFIX) sgbmv_n.$(PSUFFIX) : gbmv_k.c + $(CC) -c -UCOMPLEX -UDOUBLE -UTRANS $(CFLAGS) -o $(@F) $< + +sgbmv_t.$(SUFFIX) sgbmv_t.$(PSUFFIX) : gbmv_k.c + $(CC) -c -UCOMPLEX -UDOUBLE -DTRANS $(CFLAGS) -o $(@F) $< + +dgbmv_n.$(SUFFIX) dgbmv_n.$(PSUFFIX) : gbmv_k.c + $(CC) -c -UCOMPLEX -DDOUBLE -UTRANS $(CFLAGS) -o $(@F) $< + +dgbmv_t.$(SUFFIX) dgbmv_t.$(PSUFFIX) : gbmv_k.c + $(CC) -c -UCOMPLEX -DDOUBLE -DTRANS $(CFLAGS) -o $(@F) $< + +qgbmv_n.$(SUFFIX) qgbmv_n.$(PSUFFIX) : gbmv_k.c + $(CC) -c -UCOMPLEX -DXDOUBLE -UTRANS $(CFLAGS) -o $(@F) $< + +qgbmv_t.$(SUFFIX) qgbmv_t.$(PSUFFIX) : gbmv_k.c + $(CC) -c -UCOMPLEX -DXDOUBLE -DTRANS $(CFLAGS) -o $(@F) $< + +cgbmv_n.$(SUFFIX) cgbmv_n.$(PSUFFIX) : zgbmv_k.c + $(CC) -c -DCOMPLEX -UDOUBLE -UTRANS -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< + +cgbmv_t.$(SUFFIX) cgbmv_t.$(PSUFFIX) : zgbmv_k.c + $(CC) -c -DCOMPLEX -UDOUBLE -DTRANS -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< + +cgbmv_r.$(SUFFIX) cgbmv_r.$(PSUFFIX) : zgbmv_k.c + $(CC) -c -DCOMPLEX -UDOUBLE -UTRANS -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $< + +cgbmv_c.$(SUFFIX) cgbmv_c.$(PSUFFIX) : zgbmv_k.c + $(CC) -c -DCOMPLEX -UDOUBLE -DTRANS -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $< + +cgbmv_o.$(SUFFIX) cgbmv_o.$(PSUFFIX) : zgbmv_k.c + $(CC) -c -DCOMPLEX -UDOUBLE -UTRANS -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $< + +cgbmv_u.$(SUFFIX) cgbmv_u.$(PSUFFIX) : zgbmv_k.c + $(CC) -c -DCOMPLEX -UDOUBLE -DTRANS -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $< + +cgbmv_s.$(SUFFIX) cgbmv_s.$(PSUFFIX) : zgbmv_k.c + $(CC) -c -DCOMPLEX -UDOUBLE -UTRANS -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< + +cgbmv_d.$(SUFFIX) cgbmv_d.$(PSUFFIX) : zgbmv_k.c + $(CC) -c -DCOMPLEX -UDOUBLE -DTRANS -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< + +zgbmv_n.$(SUFFIX) zgbmv_n.$(PSUFFIX) : zgbmv_k.c + $(CC) -c -DCOMPLEX -DDOUBLE -UTRANS -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< + +zgbmv_t.$(SUFFIX) zgbmv_t.$(PSUFFIX) : zgbmv_k.c + $(CC) -c -DCOMPLEX -DDOUBLE -DTRANS -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< + +zgbmv_r.$(SUFFIX) zgbmv_r.$(PSUFFIX) : zgbmv_k.c + $(CC) -c -DCOMPLEX -DDOUBLE -UTRANS -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $< + +zgbmv_c.$(SUFFIX) zgbmv_c.$(PSUFFIX) : zgbmv_k.c + $(CC) -c -DCOMPLEX -DDOUBLE -DTRANS -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $< + +zgbmv_o.$(SUFFIX) zgbmv_o.$(PSUFFIX) : zgbmv_k.c + $(CC) -c -DCOMPLEX -DDOUBLE -UTRANS -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $< + +zgbmv_u.$(SUFFIX) 
zgbmv_u.$(PSUFFIX) : zgbmv_k.c + $(CC) -c -DCOMPLEX -DDOUBLE -DTRANS -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $< + +zgbmv_s.$(SUFFIX) zgbmv_s.$(PSUFFIX) : zgbmv_k.c + $(CC) -c -DCOMPLEX -DDOUBLE -UTRANS -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< + +zgbmv_d.$(SUFFIX) zgbmv_d.$(PSUFFIX) : zgbmv_k.c + $(CC) -c -DCOMPLEX -DDOUBLE -DTRANS -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< + +xgbmv_n.$(SUFFIX) xgbmv_n.$(PSUFFIX) : zgbmv_k.c + $(CC) -c -DCOMPLEX -DXDOUBLE -UTRANS -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< + +xgbmv_t.$(SUFFIX) xgbmv_t.$(PSUFFIX) : zgbmv_k.c + $(CC) -c -DCOMPLEX -DXDOUBLE -DTRANS -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< + +xgbmv_r.$(SUFFIX) xgbmv_r.$(PSUFFIX) : zgbmv_k.c + $(CC) -c -DCOMPLEX -DXDOUBLE -UTRANS -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $< + +xgbmv_c.$(SUFFIX) xgbmv_c.$(PSUFFIX) : zgbmv_k.c + $(CC) -c -DCOMPLEX -DXDOUBLE -DTRANS -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $< + +xgbmv_o.$(SUFFIX) xgbmv_o.$(PSUFFIX) : zgbmv_k.c + $(CC) -c -DCOMPLEX -DXDOUBLE -UTRANS -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $< + +xgbmv_u.$(SUFFIX) xgbmv_u.$(PSUFFIX) : zgbmv_k.c + $(CC) -c -DCOMPLEX -DXDOUBLE -DTRANS -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $< + +xgbmv_s.$(SUFFIX) xgbmv_s.$(PSUFFIX) : zgbmv_k.c + $(CC) -c -DCOMPLEX -DXDOUBLE -UTRANS -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< + +xgbmv_d.$(SUFFIX) xgbmv_d.$(PSUFFIX) : zgbmv_k.c + $(CC) -c -DCOMPLEX -DXDOUBLE -DTRANS -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< + +sgbmv_thread_n.$(SUFFIX) sgbmv_thread_n.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -UCOMPLEX -UDOUBLE -UTRANSA $(CFLAGS) -o $(@F) $< + +sgbmv_thread_t.$(SUFFIX) sgbmv_thread_t.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -UCOMPLEX -UDOUBLE -DTRANSA $(CFLAGS) -o $(@F) $< + +dgbmv_thread_n.$(SUFFIX) dgbmv_thread_n.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -UCOMPLEX -DDOUBLE -UTRANSA $(CFLAGS) -o $(@F) $< + +dgbmv_thread_t.$(SUFFIX) dgbmv_thread_t.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -UCOMPLEX -DDOUBLE -DTRANSA $(CFLAGS) -o $(@F) $< + +qgbmv_thread_n.$(SUFFIX) qgbmv_thread_n.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -UCOMPLEX -DXDOUBLE -UTRANSA $(CFLAGS) -o $(@F) $< + +qgbmv_thread_t.$(SUFFIX) qgbmv_thread_t.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -UCOMPLEX -DXDOUBLE -DTRANSA $(CFLAGS) -o $(@F) $< + +cgbmv_thread_n.$(SUFFIX) cgbmv_thread_n.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -DCOMPLEX -UDOUBLE -UTRANSA -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< + +cgbmv_thread_t.$(SUFFIX) cgbmv_thread_t.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -DCOMPLEX -UDOUBLE -DTRANSA -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< + +cgbmv_thread_r.$(SUFFIX) cgbmv_thread_r.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -DCOMPLEX -UDOUBLE -UTRANSA -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $< + +cgbmv_thread_c.$(SUFFIX) cgbmv_thread_c.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -DCOMPLEX -UDOUBLE -DTRANSA -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $< + +cgbmv_thread_o.$(SUFFIX) cgbmv_thread_o.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -DCOMPLEX -UDOUBLE -UTRANSA -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $< + +cgbmv_thread_u.$(SUFFIX) cgbmv_thread_u.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -DCOMPLEX -UDOUBLE -DTRANSA -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $< + +cgbmv_thread_s.$(SUFFIX) cgbmv_thread_s.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -DCOMPLEX -UDOUBLE -UTRANSA -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< + +cgbmv_thread_d.$(SUFFIX) cgbmv_thread_d.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -DCOMPLEX -UDOUBLE -DTRANSA -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< + +zgbmv_thread_n.$(SUFFIX) zgbmv_thread_n.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -DCOMPLEX -DDOUBLE -UTRANSA -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< + +zgbmv_thread_t.$(SUFFIX) zgbmv_thread_t.$(PSUFFIX) : 
gbmv_thread.c + $(CC) -c -DCOMPLEX -DDOUBLE -DTRANSA -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< + +zgbmv_thread_r.$(SUFFIX) zgbmv_thread_r.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -DCOMPLEX -DDOUBLE -UTRANSA -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $< + +zgbmv_thread_c.$(SUFFIX) zgbmv_thread_c.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -DCOMPLEX -DDOUBLE -DTRANSA -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $< + +zgbmv_thread_o.$(SUFFIX) zgbmv_thread_o.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -DCOMPLEX -DDOUBLE -UTRANSA -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $< + +zgbmv_thread_u.$(SUFFIX) zgbmv_thread_u.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -DCOMPLEX -DDOUBLE -DTRANSA -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $< + +zgbmv_thread_s.$(SUFFIX) zgbmv_thread_s.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -DCOMPLEX -DDOUBLE -UTRANSA -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< + +zgbmv_thread_d.$(SUFFIX) zgbmv_thread_d.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -DCOMPLEX -DDOUBLE -DTRANSA -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< + +xgbmv_thread_n.$(SUFFIX) xgbmv_thread_n.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -DCOMPLEX -DXDOUBLE -UTRANSA -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< + +xgbmv_thread_t.$(SUFFIX) xgbmv_thread_t.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -DCOMPLEX -DXDOUBLE -DTRANSA -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< + +xgbmv_thread_r.$(SUFFIX) xgbmv_thread_r.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -DCOMPLEX -DXDOUBLE -UTRANSA -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $< + +xgbmv_thread_c.$(SUFFIX) xgbmv_thread_c.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -DCOMPLEX -DXDOUBLE -DTRANSA -DCONJ -UXCONJ $(CFLAGS) -o $(@F) $< + +xgbmv_thread_o.$(SUFFIX) xgbmv_thread_o.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -DCOMPLEX -DXDOUBLE -UTRANSA -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $< + +xgbmv_thread_u.$(SUFFIX) xgbmv_thread_u.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -DCOMPLEX -DXDOUBLE -DTRANSA -UCONJ -DXCONJ $(CFLAGS) -o $(@F) $< + +xgbmv_thread_s.$(SUFFIX) xgbmv_thread_s.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -DCOMPLEX -DXDOUBLE -UTRANSA -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< + +xgbmv_thread_d.$(SUFFIX) xgbmv_thread_d.$(PSUFFIX) : gbmv_thread.c + $(CC) -c -DCOMPLEX -DXDOUBLE -DTRANSA -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< + +sgemv_thread_n.$(SUFFIX) sgemv_thread_n.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UCONJ -UXCONJ $< -o $(@F) + +sgemv_thread_t.$(SUFFIX) sgemv_thread_t.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UCONJ -UXCONJ $< -o $(@F) + +dgemv_thread_n.$(SUFFIX) dgemv_thread_n.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UCONJ -UXCONJ $< -o $(@F) + +dgemv_thread_t.$(SUFFIX) dgemv_thread_t.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UCONJ -UXCONJ $< -o $(@F) + +qgemv_thread_n.$(SUFFIX) qgemv_thread_n.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UCONJ -UXCONJ $< -o $(@F) + +qgemv_thread_t.$(SUFFIX) qgemv_thread_t.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UCONJ -UXCONJ $< -o $(@F) + +cgemv_thread_n.$(SUFFIX) cgemv_thread_n.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UCONJ -UXCONJ $< -o $(@F) + +cgemv_thread_t.$(SUFFIX) cgemv_thread_t.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UCONJ -UXCONJ $< -o $(@F) + +cgemv_thread_r.$(SUFFIX) cgemv_thread_r.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX 
-UDOUBLE -UTRANSA -DCONJ -UXCONJ $< -o $(@F) + +cgemv_thread_c.$(SUFFIX) cgemv_thread_c.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DCONJ -UXCONJ $< -o $(@F) + +cgemv_thread_o.$(SUFFIX) cgemv_thread_o.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UCONJ -DXCONJ $< -o $(@F) + +cgemv_thread_u.$(SUFFIX) cgemv_thread_u.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UCONJ -DXCONJ $< -o $(@F) + +cgemv_thread_s.$(SUFFIX) cgemv_thread_s.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DCONJ -DXCONJ $< -o $(@F) + +cgemv_thread_d.$(SUFFIX) cgemv_thread_d.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DCONJ -DXCONJ $< -o $(@F) + +zgemv_thread_n.$(SUFFIX) zgemv_thread_n.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UCONJ -UXCONJ $< -o $(@F) + +zgemv_thread_t.$(SUFFIX) zgemv_thread_t.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UCONJ -UXCONJ $< -o $(@F) + +zgemv_thread_r.$(SUFFIX) zgemv_thread_r.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DCONJ -UXCONJ $< -o $(@F) + +zgemv_thread_c.$(SUFFIX) zgemv_thread_c.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DCONJ -UXCONJ $< -o $(@F) + +zgemv_thread_o.$(SUFFIX) zgemv_thread_o.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UCONJ -DXCONJ $< -o $(@F) + +zgemv_thread_u.$(SUFFIX) zgemv_thread_u.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UCONJ -DXCONJ $< -o $(@F) + +zgemv_thread_s.$(SUFFIX) zgemv_thread_s.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DCONJ -DXCONJ $< -o $(@F) + +zgemv_thread_d.$(SUFFIX) zgemv_thread_d.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DCONJ -DXCONJ $< -o $(@F) + +xgemv_thread_n.$(SUFFIX) xgemv_thread_n.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UCONJ -UXCONJ $< -o $(@F) + +xgemv_thread_t.$(SUFFIX) xgemv_thread_t.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UCONJ -UXCONJ $< -o $(@F) + +xgemv_thread_r.$(SUFFIX) xgemv_thread_r.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DCONJ -UXCONJ $< -o $(@F) + +xgemv_thread_c.$(SUFFIX) xgemv_thread_c.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DCONJ -UXCONJ $< -o $(@F) + +xgemv_thread_o.$(SUFFIX) xgemv_thread_o.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UCONJ -DXCONJ $< -o $(@F) + +xgemv_thread_u.$(SUFFIX) xgemv_thread_u.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UCONJ -DXCONJ $< -o $(@F) + +xgemv_thread_s.$(SUFFIX) xgemv_thread_s.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DCONJ -DXCONJ $< -o $(@F) + +xgemv_thread_d.$(SUFFIX) xgemv_thread_d.$(PSUFFIX) : gemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DCONJ -DXCONJ $< -o $(@F) + +sger_thread.$(SUFFIX) sger_thread.$(PSUFFIX) : ger_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE 
-UCONJ -UXCONJ $< -o $(@F) + +dger_thread.$(SUFFIX) dger_thread.$(PSUFFIX) : ger_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UCONJ -UXCONJ $< -o $(@F) + +qger_thread.$(SUFFIX) qger_thread.$(PSUFFIX) : ger_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UCONJ -UXCONJ $< -o $(@F) + +cger_thread_U.$(SUFFIX) cger_thread_U.$(PSUFFIX) : ger_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UCONJ -UXCONJ $< -o $(@F) + +cger_thread_C.$(SUFFIX) cger_thread_C.$(PSUFFIX) : ger_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DCONJ -UXCONJ $< -o $(@F) + +cger_thread_V.$(SUFFIX) cger_thread_V.$(PSUFFIX) : ger_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UCONJ -DXCONJ $< -o $(@F) + +cger_thread_D.$(SUFFIX) cger_thread_D.$(PSUFFIX) : ger_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DCONJ -DXCONJ $< -o $(@F) + +zger_thread_U.$(SUFFIX) zger_thread_U.$(PSUFFIX) : ger_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UCONJ -UXCONJ $< -o $(@F) + +zger_thread_C.$(SUFFIX) zger_thread_C.$(PSUFFIX) : ger_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DCONJ -UXCONJ $< -o $(@F) + +zger_thread_V.$(SUFFIX) zger_thread_V.$(PSUFFIX) : ger_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UCONJ -DXCONJ $< -o $(@F) + +zger_thread_D.$(SUFFIX) zger_thread_D.$(PSUFFIX) : ger_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DCONJ -DXCONJ $< -o $(@F) + +xger_thread_U.$(SUFFIX) xger_thread_U.$(PSUFFIX) : ger_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UCONJ -UXCONJ $< -o $(@F) + +xger_thread_C.$(SUFFIX) xger_thread_C.$(PSUFFIX) : ger_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DCONJ -UXCONJ $< -o $(@F) + +xger_thread_V.$(SUFFIX) xger_thread_V.$(PSUFFIX) : ger_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UCONJ -DXCONJ $< -o $(@F) + +xger_thread_D.$(SUFFIX) xger_thread_D.$(PSUFFIX) : ger_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DCONJ -DXCONJ $< -o $(@F) + +ssymv_thread_U.$(SUFFIX) ssymv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +ssymv_thread_L.$(SUFFIX) ssymv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +dsymv_thread_U.$(SUFFIX) dsymv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +dsymv_thread_L.$(SUFFIX) dsymv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + +qsymv_thread_U.$(SUFFIX) qsymv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +qsymv_thread_L.$(SUFFIX) qsymv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +csymv_thread_U.$(SUFFIX) csymv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +csymv_thread_L.$(SUFFIX) csymv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +zsymv_thread_U.$(SUFFIX) zsymv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +zsymv_thread_L.$(SUFFIX) zsymv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + 
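Throughout this Makefile a single generic kernel source (gbmv_k.c, gemv_thread.c, symv_thread.c, and so on) is compiled many times with different -D/-U switches, and each compilation is written to an object whose name encodes the chosen switches: the leading letter gives the type (s/d/q for single, double, and extended-precision real; c/z/x for their complex counterparts), while the tail tracks flags such as TRANS(A), CONJ, XCONJ, or LOWER. The fragment below is a simplified sketch of that technique, not the actual kernel source; the function name and the dense reference loop are illustrative only.

    /* Sketch: one source, many objects.  Compiling this file with, say,
       "-UCOMPLEX -DDOUBLE -DTRANS" versus "-UCOMPLEX -UDOUBLE -UTRANS"
       yields two differently specialized objects, mirroring the rules above. */
    #ifdef DOUBLE
    typedef double FLOAT;
    #else
    typedef float FLOAT;
    #endif

    /* y += alpha * op(A) * x for a column-major m x n matrix A. */
    void gemv_sketch(int m, int n, FLOAT alpha,
                     const FLOAT *a, int lda, const FLOAT *x, FLOAT *y) {
      for (int j = 0; j < n; j++) {
        for (int i = 0; i < m; i++) {
    #ifndef TRANS
          y[i] += alpha * a[i + j * lda] * x[j];   /* op(A) = A   */
    #else
          y[j] += alpha * a[i + j * lda] * x[i];   /* op(A) = A^T */
    #endif
        }
      }
    }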
+xsymv_thread_U.$(SUFFIX) xsymv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +xsymv_thread_L.$(SUFFIX) xsymv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +chemv_thread_U.$(SUFFIX) chemv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMV $< -o $(@F) + +chemv_thread_L.$(SUFFIX) chemv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHEMV $< -o $(@F) + +chemv_thread_V.$(SUFFIX) chemv_thread_V.$(PSUFFIX) : symv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMVREV $< -o $(@F) + +chemv_thread_M.$(SUFFIX) chemv_thread_M.$(PSUFFIX) : symv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHEMVREV $< -o $(@F) + +zhemv_thread_U.$(SUFFIX) zhemv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHEMV $< -o $(@F) + +zhemv_thread_L.$(SUFFIX) zhemv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHEMV $< -o $(@F) + +zhemv_thread_V.$(SUFFIX) zhemv_thread_V.$(PSUFFIX) : symv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHEMVREV $< -o $(@F) + +zhemv_thread_M.$(SUFFIX) zhemv_thread_M.$(PSUFFIX) : symv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHEMVREV $< -o $(@F) + +xhemv_thread_U.$(SUFFIX) xhemv_thread_U.$(PSUFFIX) : symv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHEMV $< -o $(@F) + +xhemv_thread_L.$(SUFFIX) xhemv_thread_L.$(PSUFFIX) : symv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMV $< -o $(@F) + +xhemv_thread_V.$(SUFFIX) xhemv_thread_V.$(PSUFFIX) : symv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHEMVREV $< -o $(@F) + +xhemv_thread_M.$(SUFFIX) xhemv_thread_M.$(PSUFFIX) : symv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMVREV $< -o $(@F) + +ssyr_thread_U.$(SUFFIX) ssyr_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +ssyr_thread_L.$(SUFFIX) ssyr_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +dsyr_thread_U.$(SUFFIX) dsyr_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +dsyr_thread_L.$(SUFFIX) dsyr_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + +qsyr_thread_U.$(SUFFIX) qsyr_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +qsyr_thread_L.$(SUFFIX) qsyr_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +csyr_thread_U.$(SUFFIX) csyr_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +csyr_thread_L.$(SUFFIX) csyr_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +zsyr_thread_U.$(SUFFIX) zsyr_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +zsyr_thread_L.$(SUFFIX) zsyr_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + 
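The symv_thread.c and syr_thread.c drivers above also serve the hermitian routines: adding -DHEMV or -DHER builds the chemv/zhemv and cher/zher threaded objects, and the V/M variants built with -DHEMVREV or -DHERREV appear to be conjugated counterparts of the U/L builds (presumably what the interface layer selects for row-major hermitian arguments; that reading is inferred from the compile lines, not taken from the kernel sources). A minimal sketch of such a conjugation toggle, with hypothetical names:

    /* Hypothetical sketch: HEMVREV flips the conjugation of the stored
       element; LOWER would pick the triangle (not shown). */
    typedef struct { double r, i; } cplx;

    static void axpy_elem(cplx a, cplx x, cplx *y) {
    #ifdef HEMVREV
      a.i = -a.i;                      /* use conj(A(i,j)) instead of A(i,j) */
    #endif
      y->r += a.r * x.r - a.i * x.i;
      y->i += a.r * x.i + a.i * x.r;
    }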
+xsyr_thread_U.$(SUFFIX) xsyr_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +xsyr_thread_L.$(SUFFIX) xsyr_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +cher_thread_U.$(SUFFIX) cher_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHER $< -o $(@F) + +cher_thread_L.$(SUFFIX) cher_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHER $< -o $(@F) + +cher_thread_V.$(SUFFIX) cher_thread_V.$(PSUFFIX) : syr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHERREV $< -o $(@F) + +cher_thread_M.$(SUFFIX) cher_thread_M.$(PSUFFIX) : syr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHERREV $< -o $(@F) + +zher_thread_U.$(SUFFIX) zher_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHER $< -o $(@F) + +zher_thread_L.$(SUFFIX) zher_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHER $< -o $(@F) + +zher_thread_V.$(SUFFIX) zher_thread_V.$(PSUFFIX) : syr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHERREV $< -o $(@F) + +zher_thread_M.$(SUFFIX) zher_thread_M.$(PSUFFIX) : syr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHERREV $< -o $(@F) + +xher_thread_U.$(SUFFIX) xher_thread_U.$(PSUFFIX) : syr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHER $< -o $(@F) + +xher_thread_L.$(SUFFIX) xher_thread_L.$(PSUFFIX) : syr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHER $< -o $(@F) + +xher_thread_V.$(SUFFIX) xher_thread_V.$(PSUFFIX) : syr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHERREV $< -o $(@F) + +xher_thread_M.$(SUFFIX) xher_thread_M.$(PSUFFIX) : syr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHERREV $< -o $(@F) + +ssyr2_thread_U.$(SUFFIX) ssyr2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +ssyr2_thread_L.$(SUFFIX) ssyr2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +dsyr2_thread_U.$(SUFFIX) dsyr2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +dsyr2_thread_L.$(SUFFIX) dsyr2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + +qsyr2_thread_U.$(SUFFIX) qsyr2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +qsyr2_thread_L.$(SUFFIX) qsyr2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +csyr2_thread_U.$(SUFFIX) csyr2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +csyr2_thread_L.$(SUFFIX) csyr2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +zsyr2_thread_U.$(SUFFIX) zsyr2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +zsyr2_thread_L.$(SUFFIX) zsyr2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + +xsyr2_thread_U.$(SUFFIX) 
xsyr2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +xsyr2_thread_L.$(SUFFIX) xsyr2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +cher2_thread_U.$(SUFFIX) cher2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHER $< -o $(@F) + +cher2_thread_L.$(SUFFIX) cher2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHER $< -o $(@F) + +cher2_thread_V.$(SUFFIX) cher2_thread_V.$(PSUFFIX) : syr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHERREV $< -o $(@F) + +cher2_thread_M.$(SUFFIX) cher2_thread_M.$(PSUFFIX) : syr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHERREV $< -o $(@F) + +zher2_thread_U.$(SUFFIX) zher2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHER $< -o $(@F) + +zher2_thread_L.$(SUFFIX) zher2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHER $< -o $(@F) + +zher2_thread_V.$(SUFFIX) zher2_thread_V.$(PSUFFIX) : syr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHERREV $< -o $(@F) + +zher2_thread_M.$(SUFFIX) zher2_thread_M.$(PSUFFIX) : syr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHERREV $< -o $(@F) + +xher2_thread_U.$(SUFFIX) xher2_thread_U.$(PSUFFIX) : syr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHER $< -o $(@F) + +xher2_thread_L.$(SUFFIX) xher2_thread_L.$(PSUFFIX) : syr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHER $< -o $(@F) + +xher2_thread_V.$(SUFFIX) xher2_thread_V.$(PSUFFIX) : syr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHERREV $< -o $(@F) + +xher2_thread_M.$(SUFFIX) xher2_thread_M.$(PSUFFIX) : syr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHERREV $< -o $(@F) + +chbmv_U.$(SUFFIX) chbmv_U.$(PSUFFIX) : zhbmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +chbmv_L.$(SUFFIX) chbmv_L.$(PSUFFIX) : zhbmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +chbmv_V.$(SUFFIX) chbmv_V.$(PSUFFIX) : zhbmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMVREV $< -o $(@F) + +chbmv_M.$(SUFFIX) chbmv_M.$(PSUFFIX) : zhbmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHEMVREV $< -o $(@F) + +zhbmv_U.$(SUFFIX) zhbmv_U.$(PSUFFIX) : zhbmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +zhbmv_L.$(SUFFIX) zhbmv_L.$(PSUFFIX) : zhbmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + +zhbmv_V.$(SUFFIX) zhbmv_V.$(PSUFFIX) : zhbmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHEMVREV $< -o $(@F) + +zhbmv_M.$(SUFFIX) zhbmv_M.$(PSUFFIX) : zhbmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHEMVREV $< -o $(@F) + +xhbmv_U.$(SUFFIX) xhbmv_U.$(PSUFFIX) : zhbmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +xhbmv_L.$(SUFFIX) xhbmv_L.$(PSUFFIX) : zhbmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +xhbmv_V.$(SUFFIX) xhbmv_V.$(PSUFFIX) : zhbmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHEMVREV $< -o $(@F) + +xhbmv_M.$(SUFFIX) 
xhbmv_M.$(PSUFFIX) : zhbmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMVREV $< -o $(@F) + +chbmv_thread_U.$(SUFFIX) chbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMV $< -o $(@F) + +chbmv_thread_L.$(SUFFIX) chbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHEMV $< -o $(@F) + +chbmv_thread_V.$(SUFFIX) chbmv_thread_V.$(PSUFFIX) : sbmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMVREV $< -o $(@F) + +chbmv_thread_M.$(SUFFIX) chbmv_thread_M.$(PSUFFIX) : sbmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHEMVREV $< -o $(@F) + +zhbmv_thread_U.$(SUFFIX) zhbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHEMV $< -o $(@F) + +zhbmv_thread_L.$(SUFFIX) zhbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHEMV $< -o $(@F) + +zhbmv_thread_V.$(SUFFIX) zhbmv_thread_V.$(PSUFFIX) : sbmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHEMVREV $< -o $(@F) + +zhbmv_thread_M.$(SUFFIX) zhbmv_thread_M.$(PSUFFIX) : sbmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHEMVREV $< -o $(@F) + +xhbmv_thread_U.$(SUFFIX) xhbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHEMV $< -o $(@F) + +xhbmv_thread_L.$(SUFFIX) xhbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMV $< -o $(@F) + +xhbmv_thread_V.$(SUFFIX) xhbmv_thread_V.$(PSUFFIX) : sbmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHEMVREV $< -o $(@F) + +xhbmv_thread_M.$(SUFFIX) xhbmv_thread_M.$(PSUFFIX) : sbmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMVREV $< -o $(@F) + +cher_U.$(SUFFIX) cher_U.$(PSUFFIX) : zher_k.c ../../common.h + $(CC) -c $(CFLAGS) -UDOUBLE -ULOWER $< -o $(@F) + +cher_L.$(SUFFIX) cher_L.$(PSUFFIX) : zher_k.c ../../common.h + $(CC) -c $(CFLAGS) -UDOUBLE -DLOWER $< -o $(@F) + +cher_V.$(SUFFIX) cher_V.$(PSUFFIX) : zher_k.c ../../common.h + $(CC) -c $(CFLAGS) -UDOUBLE -ULOWER -DHEMVREV $< -o $(@F) + +cher_M.$(SUFFIX) cher_M.$(PSUFFIX) : zher_k.c ../../common.h + $(CC) -c $(CFLAGS) -UDOUBLE -DLOWER -DHEMVREV $< -o $(@F) + +zher_U.$(SUFFIX) zher_U.$(PSUFFIX) : zher_k.c ../../common.h + $(CC) -c $(CFLAGS) -DDOUBLE -ULOWER $< -o $(@F) + +zher_L.$(SUFFIX) zher_L.$(PSUFFIX) : zher_k.c ../../common.h + $(CC) -c $(CFLAGS) -DDOUBLE -DLOWER $< -o $(@F) + +zher_V.$(SUFFIX) zher_V.$(PSUFFIX) : zher_k.c ../../common.h + $(CC) -c $(CFLAGS) -DDOUBLE -ULOWER -DHEMVREV $< -o $(@F) + +zher_M.$(SUFFIX) zher_M.$(PSUFFIX) : zher_k.c ../../common.h + $(CC) -c $(CFLAGS) -DDOUBLE -DLOWER -DHEMVREV $< -o $(@F) + +xher_U.$(SUFFIX) xher_U.$(PSUFFIX) : zher_k.c ../../common.h + $(CC) -c $(CFLAGS) -DXDOUBLE -ULOWER $< -o $(@F) + +xher_L.$(SUFFIX) xher_L.$(PSUFFIX) : zher_k.c ../../common.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DLOWER $< -o $(@F) + +xher_V.$(SUFFIX) xher_V.$(PSUFFIX) : zher_k.c ../../common.h + $(CC) -c $(CFLAGS) -DXDOUBLE -ULOWER -DHEMVREV $< -o $(@F) + +xher_M.$(SUFFIX) xher_M.$(PSUFFIX) : zher_k.c ../../common.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DLOWER -DHEMVREV $< -o $(@F) + +cher2_U.$(SUFFIX) cher2_U.$(PSUFFIX) : zher2_k.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) + +cher2_L.$(SUFFIX) cher2_L.$(PSUFFIX) : 
zher2_k.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) + +cher2_V.$(SUFFIX) cher2_V.$(PSUFFIX) : zher2_k.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DHEMVREV $< -o $(@F) + +cher2_M.$(SUFFIX) cher2_M.$(PSUFFIX) : zher2_k.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DHEMVREV $< -o $(@F) + +zher2_U.$(SUFFIX) zher2_U.$(PSUFFIX) : zher2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) + +zher2_L.$(SUFFIX) zher2_L.$(PSUFFIX) : zher2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) + +zher2_V.$(SUFFIX) zher2_V.$(PSUFFIX) : zher2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DHEMVREV $< -o $(@F) + +zher2_M.$(SUFFIX) zher2_M.$(PSUFFIX) : zher2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DHEMVREV $< -o $(@F) + +xher2_U.$(SUFFIX) xher2_U.$(PSUFFIX) : zher2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) + +xher2_L.$(SUFFIX) xher2_L.$(PSUFFIX) : zher2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) + +xher2_V.$(SUFFIX) xher2_V.$(PSUFFIX) : zher2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER $< -DHEMVREV -o $(@F) + +xher2_M.$(SUFFIX) xher2_M.$(PSUFFIX) : zher2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER $< -DHEMVREV -o $(@F) + +chpmv_U.$(SUFFIX) chpmv_U.$(PSUFFIX) : zhpmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +chpmv_L.$(SUFFIX) chpmv_L.$(PSUFFIX) : zhpmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +chpmv_V.$(SUFFIX) chpmv_V.$(PSUFFIX) : zhpmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMVREV $< -o $(@F) + +chpmv_M.$(SUFFIX) chpmv_M.$(PSUFFIX) : zhpmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHEMVREV $< -o $(@F) + +zhpmv_U.$(SUFFIX) zhpmv_U.$(PSUFFIX) : zhpmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +zhpmv_L.$(SUFFIX) zhpmv_L.$(PSUFFIX) : zhpmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + +zhpmv_V.$(SUFFIX) zhpmv_V.$(PSUFFIX) : zhpmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHEMVREV $< -o $(@F) + +zhpmv_M.$(SUFFIX) zhpmv_M.$(PSUFFIX) : zhpmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHEMVREV $< -o $(@F) + +xhpmv_U.$(SUFFIX) xhpmv_U.$(PSUFFIX) : zhpmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +xhpmv_L.$(SUFFIX) xhpmv_L.$(PSUFFIX) : zhpmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +xhpmv_V.$(SUFFIX) xhpmv_V.$(PSUFFIX) : zhpmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHEMVREV $< -o $(@F) + +xhpmv_M.$(SUFFIX) xhpmv_M.$(PSUFFIX) : zhpmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMVREV $< -o $(@F) + +chpmv_thread_U.$(SUFFIX) chpmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMV $< -o $(@F) + +chpmv_thread_L.$(SUFFIX) chpmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHEMV $< -o $(@F) + +chpmv_thread_V.$(SUFFIX) chpmv_thread_V.$(PSUFFIX) : spmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMVREV $< -o $(@F) + +chpmv_thread_M.$(SUFFIX) chpmv_thread_M.$(PSUFFIX) : spmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX 
-UDOUBLE -DLOWER -DHEMVREV $< -o $(@F) + +zhpmv_thread_U.$(SUFFIX) zhpmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHEMV $< -o $(@F) + +zhpmv_thread_L.$(SUFFIX) zhpmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHEMV $< -o $(@F) + +zhpmv_thread_V.$(SUFFIX) zhpmv_thread_V.$(PSUFFIX) : spmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHEMVREV $< -o $(@F) + +zhpmv_thread_M.$(SUFFIX) zhpmv_thread_M.$(PSUFFIX) : spmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHEMVREV $< -o $(@F) + +xhpmv_thread_U.$(SUFFIX) xhpmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHEMV $< -o $(@F) + +xhpmv_thread_L.$(SUFFIX) xhpmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMV $< -o $(@F) + +xhpmv_thread_V.$(SUFFIX) xhpmv_thread_V.$(PSUFFIX) : spmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHEMVREV $< -o $(@F) + +xhpmv_thread_M.$(SUFFIX) xhpmv_thread_M.$(PSUFFIX) : spmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMVREV $< -o $(@F) + +chpr_U.$(SUFFIX) chpr_U.$(PSUFFIX) : zhpr_k.c ../../common.h + $(CC) -c $(CFLAGS) -UDOUBLE -ULOWER $< -o $(@F) + +chpr_L.$(SUFFIX) chpr_L.$(PSUFFIX) : zhpr_k.c ../../common.h + $(CC) -c $(CFLAGS) -UDOUBLE -DLOWER $< -o $(@F) + +chpr_V.$(SUFFIX) chpr_V.$(PSUFFIX) : zhpr_k.c ../../common.h + $(CC) -c $(CFLAGS) -UDOUBLE -ULOWER -DHEMVREV $< -o $(@F) + +chpr_M.$(SUFFIX) chpr_M.$(PSUFFIX) : zhpr_k.c ../../common.h + $(CC) -c $(CFLAGS) -UDOUBLE -DLOWER -DHEMVREV $< -o $(@F) + +zhpr_U.$(SUFFIX) zhpr_U.$(PSUFFIX) : zhpr_k.c ../../common.h + $(CC) -c $(CFLAGS) -DDOUBLE -ULOWER $< -o $(@F) + +zhpr_L.$(SUFFIX) zhpr_L.$(PSUFFIX) : zhpr_k.c ../../common.h + $(CC) -c $(CFLAGS) -DDOUBLE -DLOWER $< -o $(@F) + +zhpr_V.$(SUFFIX) zhpr_V.$(PSUFFIX) : zhpr_k.c ../../common.h + $(CC) -c $(CFLAGS) -DDOUBLE -ULOWER -DHEMVREV $< -o $(@F) + +zhpr_M.$(SUFFIX) zhpr_M.$(PSUFFIX) : zhpr_k.c ../../common.h + $(CC) -c $(CFLAGS) -DDOUBLE -DLOWER -DHEMVREV $< -o $(@F) + +xhpr_U.$(SUFFIX) xhpr_U.$(PSUFFIX) : zhpr_k.c ../../common.h + $(CC) -c $(CFLAGS) -DXDOUBLE -ULOWER $< -o $(@F) + +xhpr_L.$(SUFFIX) xhpr_L.$(PSUFFIX) : zhpr_k.c ../../common.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DLOWER $< -o $(@F) + +xhpr_V.$(SUFFIX) xhpr_V.$(PSUFFIX) : zhpr_k.c ../../common.h + $(CC) -c $(CFLAGS) -DXDOUBLE -ULOWER -DHEMVREV $< -o $(@F) + +xhpr_M.$(SUFFIX) xhpr_M.$(PSUFFIX) : zhpr_k.c ../../common.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DLOWER -DHEMVREV $< -o $(@F) + +chpr_thread_U.$(SUFFIX) chpr_thread_U.$(PSUFFIX) : spr_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UDOUBLE -ULOWER -DHEMV $< -o $(@F) + +chpr_thread_L.$(SUFFIX) chpr_thread_L.$(PSUFFIX) : spr_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UDOUBLE -DLOWER -DHEMV $< -o $(@F) + +chpr_thread_V.$(SUFFIX) chpr_thread_V.$(PSUFFIX) : spr_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UDOUBLE -ULOWER -DHEMVREV $< -o $(@F) + +chpr_thread_M.$(SUFFIX) chpr_thread_M.$(PSUFFIX) : spr_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UDOUBLE -DLOWER -DHEMVREV $< -o $(@F) + +zhpr_thread_U.$(SUFFIX) zhpr_thread_U.$(PSUFFIX) : spr_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DDOUBLE -ULOWER -DHEMV $< -o $(@F) + +zhpr_thread_L.$(SUFFIX) zhpr_thread_L.$(PSUFFIX) : spr_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DDOUBLE -DLOWER -DHEMV $< -o $(@F) + 
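+# The Hermitian (he*) targets in this file follow a common naming scheme: _U and
+# _L select the upper- or lower-storage kernel (-ULOWER / -DLOWER), while _V and
+# _M appear to be their conjugated counterparts, built with -DHEMVREV (or
+# -DHERREV in the her2_thread rules).  The c/z/x prefixes select single, double
+# and extended-precision complex via -UDOUBLE, -DDOUBLE and -DXDOUBLE, and
+# $(SUFFIX) / $(PSUFFIX) name the regular and (presumably profiling) objects.
+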
+zhpr_thread_V.$(SUFFIX) zhpr_thread_V.$(PSUFFIX) : spr_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DDOUBLE -ULOWER -DHEMVREV $< -o $(@F) + +zhpr_thread_M.$(SUFFIX) zhpr_thread_M.$(PSUFFIX) : spr_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DDOUBLE -DLOWER -DHEMVREV $< -o $(@F) + +xhpr_thread_U.$(SUFFIX) xhpr_thread_U.$(PSUFFIX) : spr_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DXDOUBLE -ULOWER -DHEMV $< -o $(@F) + +xhpr_thread_L.$(SUFFIX) xhpr_thread_L.$(PSUFFIX) : spr_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DLOWER -DHEMV $< -o $(@F) + +xhpr_thread_V.$(SUFFIX) xhpr_thread_V.$(PSUFFIX) : spr_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DXDOUBLE -ULOWER -DHEMVREV $< -o $(@F) + +xhpr_thread_M.$(SUFFIX) xhpr_thread_M.$(PSUFFIX) : spr_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DLOWER -DHEMVREV $< -o $(@F) + +chpr2_U.$(SUFFIX) chpr2_U.$(PSUFFIX) : zhpr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) + +chpr2_L.$(SUFFIX) chpr2_L.$(PSUFFIX) : zhpr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) + +chpr2_V.$(SUFFIX) chpr2_V.$(PSUFFIX) : zhpr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DHEMVREV $< -o $(@F) + +chpr2_M.$(SUFFIX) chpr2_M.$(PSUFFIX) : zhpr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DHEMVREV $< -o $(@F) + +zhpr2_U.$(SUFFIX) zhpr2_U.$(PSUFFIX) : zhpr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) + +zhpr2_L.$(SUFFIX) zhpr2_L.$(PSUFFIX) : zhpr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) + +zhpr2_V.$(SUFFIX) zhpr2_V.$(PSUFFIX) : zhpr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DHEMVREV $< -o $(@F) + +zhpr2_M.$(SUFFIX) zhpr2_M.$(PSUFFIX) : zhpr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DHEMVREV $< -o $(@F) + +xhpr2_U.$(SUFFIX) xhpr2_U.$(PSUFFIX) : zhpr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) + +xhpr2_L.$(SUFFIX) xhpr2_L.$(PSUFFIX) : zhpr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) + +xhpr2_V.$(SUFFIX) xhpr2_V.$(PSUFFIX) : zhpr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DHEMVREV $< -o $(@F) + +xhpr2_M.$(SUFFIX) xhpr2_M.$(PSUFFIX) : zhpr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER $< -DHEMVREV -o $(@F) + +chpr2_thread_U.$(SUFFIX) chpr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DHEMV $< -o $(@F) + +chpr2_thread_L.$(SUFFIX) chpr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DHEMV $< -o $(@F) + +chpr2_thread_V.$(SUFFIX) chpr2_thread_V.$(PSUFFIX) : spr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DHEMVREV $< -o $(@F) + +chpr2_thread_M.$(SUFFIX) chpr2_thread_M.$(PSUFFIX) : spr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DHEMVREV $< -o $(@F) + +zhpr2_thread_U.$(SUFFIX) zhpr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DHEMV $< -o $(@F) + +zhpr2_thread_L.$(SUFFIX) zhpr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DHEMV $< -o $(@F) + +zhpr2_thread_V.$(SUFFIX) zhpr2_thread_V.$(PSUFFIX) : spr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DHEMVREV $< -o $(@F) + +zhpr2_thread_M.$(SUFFIX) zhpr2_thread_M.$(PSUFFIX) : 
spr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DHEMVREV $< -o $(@F) + +xhpr2_thread_U.$(SUFFIX) xhpr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DHEMV $< -o $(@F) + +xhpr2_thread_L.$(SUFFIX) xhpr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DHEMV $< -o $(@F) + +xhpr2_thread_V.$(SUFFIX) xhpr2_thread_V.$(PSUFFIX) : spr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DHEMVREV $< -o $(@F) + +xhpr2_thread_M.$(SUFFIX) xhpr2_thread_M.$(PSUFFIX) : spr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER $< -DHEMVREV -o $(@F) + +ssbmv_U.$(SUFFIX) ssbmv_U.$(PSUFFIX) : sbmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +ssbmv_L.$(SUFFIX) ssbmv_L.$(PSUFFIX) : sbmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +dsbmv_U.$(SUFFIX) dsbmv_U.$(PSUFFIX) : sbmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +dsbmv_L.$(SUFFIX) dsbmv_L.$(PSUFFIX) : sbmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + +qsbmv_U.$(SUFFIX) qsbmv_U.$(PSUFFIX) : sbmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +qsbmv_L.$(SUFFIX) qsbmv_L.$(PSUFFIX) : sbmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +csbmv_U.$(SUFFIX) csbmv_U.$(PSUFFIX) : zsbmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +csbmv_L.$(SUFFIX) csbmv_L.$(PSUFFIX) : zsbmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +zsbmv_U.$(SUFFIX) zsbmv_U.$(PSUFFIX) : zsbmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +zsbmv_L.$(SUFFIX) zsbmv_L.$(PSUFFIX) : zsbmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + +xsbmv_U.$(SUFFIX) xsbmv_U.$(PSUFFIX) : zsbmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +xsbmv_L.$(SUFFIX) xsbmv_L.$(PSUFFIX) : zsbmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +ssbmv_thread_U.$(SUFFIX) ssbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +ssbmv_thread_L.$(SUFFIX) ssbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +dsbmv_thread_U.$(SUFFIX) dsbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +dsbmv_thread_L.$(SUFFIX) dsbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + +qsbmv_thread_U.$(SUFFIX) qsbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +qsbmv_thread_L.$(SUFFIX) qsbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +csbmv_thread_U.$(SUFFIX) csbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +csbmv_thread_L.$(SUFFIX) csbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +zsbmv_thread_U.$(SUFFIX) zsbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +zsbmv_thread_L.$(SUFFIX) 
zsbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + +xsbmv_thread_U.$(SUFFIX) xsbmv_thread_U.$(PSUFFIX) : sbmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +xsbmv_thread_L.$(SUFFIX) xsbmv_thread_L.$(PSUFFIX) : sbmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +sspmv_U.$(SUFFIX) sspmv_U.$(PSUFFIX) : spmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +sspmv_L.$(SUFFIX) sspmv_L.$(PSUFFIX) : spmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +dspmv_U.$(SUFFIX) dspmv_U.$(PSUFFIX) : spmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +dspmv_L.$(SUFFIX) dspmv_L.$(PSUFFIX) : spmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + +qspmv_U.$(SUFFIX) qspmv_U.$(PSUFFIX) : spmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +qspmv_L.$(SUFFIX) qspmv_L.$(PSUFFIX) : spmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +cspmv_U.$(SUFFIX) cspmv_U.$(PSUFFIX) : zspmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +cspmv_L.$(SUFFIX) cspmv_L.$(PSUFFIX) : zspmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +zspmv_U.$(SUFFIX) zspmv_U.$(PSUFFIX) : zspmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +zspmv_L.$(SUFFIX) zspmv_L.$(PSUFFIX) : zspmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + +xspmv_U.$(SUFFIX) xspmv_U.$(PSUFFIX) : zspmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +xspmv_L.$(SUFFIX) xspmv_L.$(PSUFFIX) : zspmv_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +sspmv_thread_U.$(SUFFIX) sspmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +sspmv_thread_L.$(SUFFIX) sspmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +dspmv_thread_U.$(SUFFIX) dspmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +dspmv_thread_L.$(SUFFIX) dspmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + +qspmv_thread_U.$(SUFFIX) qspmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +qspmv_thread_L.$(SUFFIX) qspmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +cspmv_thread_U.$(SUFFIX) cspmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +cspmv_thread_L.$(SUFFIX) cspmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +zspmv_thread_U.$(SUFFIX) zspmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +zspmv_thread_L.$(SUFFIX) zspmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + +xspmv_thread_U.$(SUFFIX) xspmv_thread_U.$(PSUFFIX) : spmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +xspmv_thread_L.$(SUFFIX) 
xspmv_thread_L.$(PSUFFIX) : spmv_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +sspr_U.$(SUFFIX) sspr_U.$(PSUFFIX) : spr_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +sspr_L.$(SUFFIX) sspr_L.$(PSUFFIX) : spr_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +dspr_U.$(SUFFIX) dspr_U.$(PSUFFIX) : spr_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +dspr_L.$(SUFFIX) dspr_L.$(PSUFFIX) : spr_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + +qspr_U.$(SUFFIX) qspr_U.$(PSUFFIX) : spr_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +qspr_L.$(SUFFIX) qspr_L.$(PSUFFIX) : spr_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +cspr_U.$(SUFFIX) cspr_U.$(PSUFFIX) : zspr_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +cspr_L.$(SUFFIX) cspr_L.$(PSUFFIX) : zspr_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +zspr_U.$(SUFFIX) zspr_U.$(PSUFFIX) : zspr_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +zspr_L.$(SUFFIX) zspr_L.$(PSUFFIX) : zspr_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + +xspr_U.$(SUFFIX) xspr_U.$(PSUFFIX) : zspr_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +xspr_L.$(SUFFIX) xspr_L.$(PSUFFIX) : zspr_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +sspr_thread_U.$(SUFFIX) sspr_thread_U.$(PSUFFIX) : spr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +sspr_thread_L.$(SUFFIX) sspr_thread_L.$(PSUFFIX) : spr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +dspr_thread_U.$(SUFFIX) dspr_thread_U.$(PSUFFIX) : spr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +dspr_thread_L.$(SUFFIX) dspr_thread_L.$(PSUFFIX) : spr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + +qspr_thread_U.$(SUFFIX) qspr_thread_U.$(PSUFFIX) : spr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +qspr_thread_L.$(SUFFIX) qspr_thread_L.$(PSUFFIX) : spr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +cspr_thread_U.$(SUFFIX) cspr_thread_U.$(PSUFFIX) : spr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +cspr_thread_L.$(SUFFIX) cspr_thread_L.$(PSUFFIX) : spr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +zspr_thread_U.$(SUFFIX) zspr_thread_U.$(PSUFFIX) : spr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +zspr_thread_L.$(SUFFIX) zspr_thread_L.$(PSUFFIX) : spr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + +xspr_thread_U.$(SUFFIX) xspr_thread_U.$(PSUFFIX) : spr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +xspr_thread_L.$(SUFFIX) xspr_thread_L.$(PSUFFIX) : spr_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +sspr2_U.$(SUFFIX) sspr2_U.$(PSUFFIX) : spr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +sspr2_L.$(SUFFIX) sspr2_L.$(PSUFFIX) : spr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + 
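+# Precision prefixes map one-to-one onto preprocessor flags throughout this
+# Makefile: s = -UCOMPLEX -UDOUBLE, d = -UCOMPLEX -DDOUBLE, q = -UCOMPLEX
+# -DXDOUBLE (extended precision), and c/z/x are the matching -DCOMPLEX variants.
+# Every rule simply recompiles one generic C kernel with a different flag set.
+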
+dspr2_U.$(SUFFIX) dspr2_U.$(PSUFFIX) : spr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +dspr2_L.$(SUFFIX) dspr2_L.$(PSUFFIX) : spr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + +qspr2_U.$(SUFFIX) qspr2_U.$(PSUFFIX) : spr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +qspr2_L.$(SUFFIX) qspr2_L.$(PSUFFIX) : spr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +cspr2_U.$(SUFFIX) cspr2_U.$(PSUFFIX) : zspr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +cspr2_L.$(SUFFIX) cspr2_L.$(PSUFFIX) : zspr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +zspr2_U.$(SUFFIX) zspr2_U.$(PSUFFIX) : zspr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +zspr2_L.$(SUFFIX) zspr2_L.$(PSUFFIX) : zspr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + +xspr2_U.$(SUFFIX) xspr2_U.$(PSUFFIX) : zspr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +xspr2_L.$(SUFFIX) xspr2_L.$(PSUFFIX) : zspr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +sspr2_thread_U.$(SUFFIX) sspr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +sspr2_thread_L.$(SUFFIX) sspr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +dspr2_thread_U.$(SUFFIX) dspr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +dspr2_thread_L.$(SUFFIX) dspr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + +qspr2_thread_U.$(SUFFIX) qspr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +qspr2_thread_L.$(SUFFIX) qspr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +cspr2_thread_U.$(SUFFIX) cspr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +cspr2_thread_L.$(SUFFIX) cspr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +zspr2_thread_U.$(SUFFIX) zspr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +zspr2_thread_L.$(SUFFIX) zspr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + +xspr2_thread_U.$(SUFFIX) xspr2_thread_U.$(PSUFFIX) : spr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +xspr2_thread_L.$(SUFFIX) xspr2_thread_L.$(PSUFFIX) : spr2_thread.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +ssyr_U.$(SUFFIX) ssyr_U.$(PSUFFIX) : syr_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +ssyr_L.$(SUFFIX) ssyr_L.$(PSUFFIX) : syr_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +dsyr_U.$(SUFFIX) dsyr_U.$(PSUFFIX) : syr_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +dsyr_L.$(SUFFIX) dsyr_L.$(PSUFFIX) : syr_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + +qsyr_U.$(SUFFIX) qsyr_U.$(PSUFFIX) : syr_k.c ../../param.h + $(CC) -c 
$(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +qsyr_L.$(SUFFIX) qsyr_L.$(PSUFFIX) : syr_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +csyr_U.$(SUFFIX) csyr_U.$(PSUFFIX) : zsyr_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +csyr_L.$(SUFFIX) csyr_L.$(PSUFFIX) : zsyr_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +zsyr_U.$(SUFFIX) zsyr_U.$(PSUFFIX) : zsyr_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +zsyr_L.$(SUFFIX) zsyr_L.$(PSUFFIX) : zsyr_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + +xsyr_U.$(SUFFIX) xsyr_U.$(PSUFFIX) : zsyr_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +xsyr_L.$(SUFFIX) xsyr_L.$(PSUFFIX) : zsyr_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +ssyr2_U.$(SUFFIX) ssyr2_U.$(PSUFFIX) : syr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +ssyr2_L.$(SUFFIX) ssyr2_L.$(PSUFFIX) : syr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +dsyr2_U.$(SUFFIX) dsyr2_U.$(PSUFFIX) : syr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +dsyr2_L.$(SUFFIX) dsyr2_L.$(PSUFFIX) : syr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + +qsyr2_U.$(SUFFIX) qsyr2_U.$(PSUFFIX) : syr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +qsyr2_L.$(SUFFIX) qsyr2_L.$(PSUFFIX) : syr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +csyr2_U.$(SUFFIX) csyr2_U.$(PSUFFIX) : zsyr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $(@F) + +csyr2_L.$(SUFFIX) csyr2_L.$(PSUFFIX) : zsyr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $(@F) + +zsyr2_U.$(SUFFIX) zsyr2_U.$(PSUFFIX) : zsyr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $(@F) + +zsyr2_L.$(SUFFIX) zsyr2_L.$(PSUFFIX) : zsyr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $(@F) + +xsyr2_U.$(SUFFIX) xsyr2_U.$(PSUFFIX) : zsyr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $(@F) + +xsyr2_L.$(SUFFIX) xsyr2_L.$(PSUFFIX) : zsyr2_k.c ../../param.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $(@F) + +stbmv_NUU.$(SUFFIX) stbmv_NUU.$(PSUFFIX) : tbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +stbmv_NUN.$(SUFFIX) stbmv_NUN.$(PSUFFIX) : tbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +stbmv_TLU.$(SUFFIX) stbmv_TLU.$(PSUFFIX) : tbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +stbmv_TLN.$(SUFFIX) stbmv_TLN.$(PSUFFIX) : tbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +stbmv_NLU.$(SUFFIX) stbmv_NLU.$(PSUFFIX) : tbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +stbmv_NLN.$(SUFFIX) stbmv_NLN.$(PSUFFIX) : tbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +stbmv_TUU.$(SUFFIX) stbmv_TUU.$(PSUFFIX) : tbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +stbmv_TUN.$(SUFFIX) stbmv_TUN.$(PSUFFIX) : tbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +dtbmv_NUU.$(SUFFIX) 
dtbmv_NUU.$(PSUFFIX) : tbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +dtbmv_NUN.$(SUFFIX) dtbmv_NUN.$(PSUFFIX) : tbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +dtbmv_TLU.$(SUFFIX) dtbmv_TLU.$(PSUFFIX) : tbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +dtbmv_TLN.$(SUFFIX) dtbmv_TLN.$(PSUFFIX) : tbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +dtbmv_NLU.$(SUFFIX) dtbmv_NLU.$(PSUFFIX) : tbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +dtbmv_NLN.$(SUFFIX) dtbmv_NLN.$(PSUFFIX) : tbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +dtbmv_TUU.$(SUFFIX) dtbmv_TUU.$(PSUFFIX) : tbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +dtbmv_TUN.$(SUFFIX) dtbmv_TUN.$(PSUFFIX) : tbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +qtbmv_NUU.$(SUFFIX) qtbmv_NUU.$(PSUFFIX) : tbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +qtbmv_NUN.$(SUFFIX) qtbmv_NUN.$(PSUFFIX) : tbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +qtbmv_TLU.$(SUFFIX) qtbmv_TLU.$(PSUFFIX) : tbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +qtbmv_TLN.$(SUFFIX) qtbmv_TLN.$(PSUFFIX) : tbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +qtbmv_NLU.$(SUFFIX) qtbmv_NLU.$(PSUFFIX) : tbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +qtbmv_NLN.$(SUFFIX) qtbmv_NLN.$(PSUFFIX) : tbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +qtbmv_TUU.$(SUFFIX) qtbmv_TUU.$(PSUFFIX) : tbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +qtbmv_TUN.$(SUFFIX) qtbmv_TUN.$(PSUFFIX) : tbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +ctbmv_NUU.$(SUFFIX) ctbmv_NUU.$(PSUFFIX) : ztbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) + +ctbmv_NUN.$(SUFFIX) ctbmv_NUN.$(PSUFFIX) : ztbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) + +ctbmv_TLU.$(SUFFIX) ctbmv_TLU.$(PSUFFIX) : ztbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) + +ctbmv_TLN.$(SUFFIX) ctbmv_TLN.$(PSUFFIX) : ztbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) + +ctbmv_RLU.$(SUFFIX) ctbmv_RLU.$(PSUFFIX) : ztbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) + +ctbmv_RLN.$(SUFFIX) ctbmv_RLN.$(PSUFFIX) : ztbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) + +ctbmv_CLU.$(SUFFIX) ctbmv_CLU.$(PSUFFIX) : ztbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) + +ctbmv_CLN.$(SUFFIX) ctbmv_CLN.$(PSUFFIX) : ztbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) + +ctbmv_NLU.$(SUFFIX) ctbmv_NLU.$(PSUFFIX) : ztbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) + +ctbmv_NLN.$(SUFFIX) ctbmv_NLN.$(PSUFFIX) : ztbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) 
-DCOMPLEX -UDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) + +ctbmv_TUU.$(SUFFIX) ctbmv_TUU.$(PSUFFIX) : ztbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) + +ctbmv_TUN.$(SUFFIX) ctbmv_TUN.$(PSUFFIX) : ztbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) + +ctbmv_RUU.$(SUFFIX) ctbmv_RUU.$(PSUFFIX) : ztbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) + +ctbmv_RUN.$(SUFFIX) ctbmv_RUN.$(PSUFFIX) : ztbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) + +ctbmv_CUU.$(SUFFIX) ctbmv_CUU.$(PSUFFIX) : ztbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) + +ctbmv_CUN.$(SUFFIX) ctbmv_CUN.$(PSUFFIX) : ztbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) + +ztbmv_NUU.$(SUFFIX) ztbmv_NUU.$(PSUFFIX) : ztbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) + +ztbmv_NUN.$(SUFFIX) ztbmv_NUN.$(PSUFFIX) : ztbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) + +ztbmv_TLU.$(SUFFIX) ztbmv_TLU.$(PSUFFIX) : ztbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) + +ztbmv_TLN.$(SUFFIX) ztbmv_TLN.$(PSUFFIX) : ztbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) + +ztbmv_RLU.$(SUFFIX) ztbmv_RLU.$(PSUFFIX) : ztbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) + +ztbmv_RLN.$(SUFFIX) ztbmv_RLN.$(PSUFFIX) : ztbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) + +ztbmv_CLU.$(SUFFIX) ztbmv_CLU.$(PSUFFIX) : ztbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) + +ztbmv_CLN.$(SUFFIX) ztbmv_CLN.$(PSUFFIX) : ztbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) + +ztbmv_NLU.$(SUFFIX) ztbmv_NLU.$(PSUFFIX) : ztbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) + +ztbmv_NLN.$(SUFFIX) ztbmv_NLN.$(PSUFFIX) : ztbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) + +ztbmv_TUU.$(SUFFIX) ztbmv_TUU.$(PSUFFIX) : ztbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) + +ztbmv_TUN.$(SUFFIX) ztbmv_TUN.$(PSUFFIX) : ztbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) + +ztbmv_RUU.$(SUFFIX) ztbmv_RUU.$(PSUFFIX) : ztbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) + +ztbmv_RUN.$(SUFFIX) ztbmv_RUN.$(PSUFFIX) : ztbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) + +ztbmv_CUU.$(SUFFIX) ztbmv_CUU.$(PSUFFIX) : ztbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) + +ztbmv_CUN.$(SUFFIX) ztbmv_CUN.$(PSUFFIX) : ztbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) + +xtbmv_NUU.$(SUFFIX) xtbmv_NUU.$(PSUFFIX) : ztbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) + +xtbmv_NUN.$(SUFFIX) xtbmv_NUN.$(PSUFFIX) : ztbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) + +xtbmv_TLU.$(SUFFIX) xtbmv_TLU.$(PSUFFIX) : ztbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE 
-DTRANSA=2 -DUNIT $< -o $(@F) + +xtbmv_TLN.$(SUFFIX) xtbmv_TLN.$(PSUFFIX) : ztbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) + +xtbmv_RLU.$(SUFFIX) xtbmv_RLU.$(PSUFFIX) : ztbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) + +xtbmv_RLN.$(SUFFIX) xtbmv_RLN.$(PSUFFIX) : ztbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) + +xtbmv_CLU.$(SUFFIX) xtbmv_CLU.$(PSUFFIX) : ztbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) + +xtbmv_CLN.$(SUFFIX) xtbmv_CLN.$(PSUFFIX) : ztbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) + +xtbmv_NLU.$(SUFFIX) xtbmv_NLU.$(PSUFFIX) : ztbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) + +xtbmv_NLN.$(SUFFIX) xtbmv_NLN.$(PSUFFIX) : ztbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) + +xtbmv_TUU.$(SUFFIX) xtbmv_TUU.$(PSUFFIX) : ztbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) + +xtbmv_TUN.$(SUFFIX) xtbmv_TUN.$(PSUFFIX) : ztbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) + +xtbmv_RUU.$(SUFFIX) xtbmv_RUU.$(PSUFFIX) : ztbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) + +xtbmv_RUN.$(SUFFIX) xtbmv_RUN.$(PSUFFIX) : ztbmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) + +xtbmv_CUU.$(SUFFIX) xtbmv_CUU.$(PSUFFIX) : ztbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) + +xtbmv_CUN.$(SUFFIX) xtbmv_CUN.$(PSUFFIX) : ztbmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) + +stbmv_thread_NUU.$(SUFFIX) stbmv_thread_NUU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -UTRANSA -DUNIT $< -o $(@F) + +stbmv_thread_NUN.$(SUFFIX) stbmv_thread_NUN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -UTRANSA -UUNIT $< -o $(@F) + +stbmv_thread_TLU.$(SUFFIX) stbmv_thread_TLU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -DTRANSA -DUNIT $< -o $(@F) + +stbmv_thread_TLN.$(SUFFIX) stbmv_thread_TLN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -DTRANSA -UUNIT $< -o $(@F) + +stbmv_thread_NLU.$(SUFFIX) stbmv_thread_NLU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -UTRANSA -DUNIT $< -o $(@F) + +stbmv_thread_NLN.$(SUFFIX) stbmv_thread_NLN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -UTRANSA -UUNIT $< -o $(@F) + +stbmv_thread_TUU.$(SUFFIX) stbmv_thread_TUU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -DTRANSA -DUNIT $< -o $(@F) + +stbmv_thread_TUN.$(SUFFIX) stbmv_thread_TUN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -DTRANSA -UUNIT $< -o $(@F) + +dtbmv_thread_NUU.$(SUFFIX) dtbmv_thread_NUU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -UTRANSA -DUNIT $< -o $(@F) + +dtbmv_thread_NUN.$(SUFFIX) dtbmv_thread_NUN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -UTRANSA -UUNIT $< -o $(@F) + 
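+# Triangular (tb*/tp*/tr*) targets carry a three-letter suffix: transpose (N/T,
+# plus R/C for the conjugated complex forms), storage (U/L) and diagonal
+# (U = unit, N = non-unit).  The threaded rules pass -DLOWER / -DTRANSA
+# explicitly, while the single-threaded rules instead pick the tbmv_U.c or
+# tbmv_L.c source, apparently whichever matches the access pattern of the
+# requested triangle/transpose combination.
+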
+dtbmv_thread_TLU.$(SUFFIX) dtbmv_thread_TLU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -DTRANSA -DUNIT $< -o $(@F) + +dtbmv_thread_TLN.$(SUFFIX) dtbmv_thread_TLN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -DTRANSA -UUNIT $< -o $(@F) + +dtbmv_thread_NLU.$(SUFFIX) dtbmv_thread_NLU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -UTRANSA -DUNIT $< -o $(@F) + +dtbmv_thread_NLN.$(SUFFIX) dtbmv_thread_NLN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -UTRANSA -UUNIT $< -o $(@F) + +dtbmv_thread_TUU.$(SUFFIX) dtbmv_thread_TUU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -DTRANSA -DUNIT $< -o $(@F) + +dtbmv_thread_TUN.$(SUFFIX) dtbmv_thread_TUN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -DTRANSA -UUNIT $< -o $(@F) + +qtbmv_thread_NUU.$(SUFFIX) qtbmv_thread_NUU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -UTRANSA -DUNIT $< -o $(@F) + +qtbmv_thread_NUN.$(SUFFIX) qtbmv_thread_NUN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -UTRANSA -UUNIT $< -o $(@F) + +qtbmv_thread_TLU.$(SUFFIX) qtbmv_thread_TLU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -DTRANSA -DUNIT $< -o $(@F) + +qtbmv_thread_TLN.$(SUFFIX) qtbmv_thread_TLN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -DTRANSA -UUNIT $< -o $(@F) + +qtbmv_thread_NLU.$(SUFFIX) qtbmv_thread_NLU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -UTRANSA -DUNIT $< -o $(@F) + +qtbmv_thread_NLN.$(SUFFIX) qtbmv_thread_NLN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -UTRANSA -UUNIT $< -o $(@F) + +qtbmv_thread_TUU.$(SUFFIX) qtbmv_thread_TUU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -DTRANSA -DUNIT $< -o $(@F) + +qtbmv_thread_TUN.$(SUFFIX) qtbmv_thread_TUN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -DTRANSA -UUNIT $< -o $(@F) + +ctbmv_thread_NUU.$(SUFFIX) ctbmv_thread_NUU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=1 -DUNIT $< -o $(@F) + +ctbmv_thread_NUN.$(SUFFIX) ctbmv_thread_NUN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=1 -UUNIT $< -o $(@F) + +ctbmv_thread_TLU.$(SUFFIX) ctbmv_thread_TLU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=2 -DUNIT $< -o $(@F) + +ctbmv_thread_TLN.$(SUFFIX) ctbmv_thread_TLN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=2 -UUNIT $< -o $(@F) + +ctbmv_thread_RLU.$(SUFFIX) ctbmv_thread_RLU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=3 -DUNIT $< -o $(@F) + +ctbmv_thread_RLN.$(SUFFIX) ctbmv_thread_RLN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=3 -UUNIT $< -o $(@F) + +ctbmv_thread_CLU.$(SUFFIX) ctbmv_thread_CLU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=4 -DUNIT $< -o $(@F) + +ctbmv_thread_CLN.$(SUFFIX) ctbmv_thread_CLN.$(PSUFFIX) : 
tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=4 -UUNIT $< -o $(@F) + +ctbmv_thread_NLU.$(SUFFIX) ctbmv_thread_NLU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=1 -DUNIT $< -o $(@F) + +ctbmv_thread_NLN.$(SUFFIX) ctbmv_thread_NLN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=1 -UUNIT $< -o $(@F) + +ctbmv_thread_TUU.$(SUFFIX) ctbmv_thread_TUU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=2 -DUNIT $< -o $(@F) + +ctbmv_thread_TUN.$(SUFFIX) ctbmv_thread_TUN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=2 -UUNIT $< -o $(@F) + +ctbmv_thread_RUU.$(SUFFIX) ctbmv_thread_RUU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=3 -DUNIT $< -o $(@F) + +ctbmv_thread_RUN.$(SUFFIX) ctbmv_thread_RUN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=3 -UUNIT $< -o $(@F) + +ctbmv_thread_CUU.$(SUFFIX) ctbmv_thread_CUU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=4 -DUNIT $< -o $(@F) + +ctbmv_thread_CUN.$(SUFFIX) ctbmv_thread_CUN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=4 -UUNIT $< -o $(@F) + +ztbmv_thread_NUU.$(SUFFIX) ztbmv_thread_NUU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=1 -DUNIT $< -o $(@F) + +ztbmv_thread_NUN.$(SUFFIX) ztbmv_thread_NUN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=1 -UUNIT $< -o $(@F) + +ztbmv_thread_TLU.$(SUFFIX) ztbmv_thread_TLU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=2 -DUNIT $< -o $(@F) + +ztbmv_thread_TLN.$(SUFFIX) ztbmv_thread_TLN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=2 -UUNIT $< -o $(@F) + +ztbmv_thread_RLU.$(SUFFIX) ztbmv_thread_RLU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=3 -DUNIT $< -o $(@F) + +ztbmv_thread_RLN.$(SUFFIX) ztbmv_thread_RLN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=3 -UUNIT $< -o $(@F) + +ztbmv_thread_CLU.$(SUFFIX) ztbmv_thread_CLU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=4 -DUNIT $< -o $(@F) + +ztbmv_thread_CLN.$(SUFFIX) ztbmv_thread_CLN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=4 -UUNIT $< -o $(@F) + +ztbmv_thread_NLU.$(SUFFIX) ztbmv_thread_NLU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=1 -DUNIT $< -o $(@F) + +ztbmv_thread_NLN.$(SUFFIX) ztbmv_thread_NLN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=1 -UUNIT $< -o $(@F) + +ztbmv_thread_TUU.$(SUFFIX) ztbmv_thread_TUU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=2 -DUNIT $< -o $(@F) + +ztbmv_thread_TUN.$(SUFFIX) ztbmv_thread_TUN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=2 -UUNIT $< -o $(@F) + +ztbmv_thread_RUU.$(SUFFIX) ztbmv_thread_RUU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c 
$(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=3 -DUNIT $< -o $(@F) + +ztbmv_thread_RUN.$(SUFFIX) ztbmv_thread_RUN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=3 -UUNIT $< -o $(@F) + +ztbmv_thread_CUU.$(SUFFIX) ztbmv_thread_CUU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=4 -DUNIT $< -o $(@F) + +ztbmv_thread_CUN.$(SUFFIX) ztbmv_thread_CUN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=4 -UUNIT $< -o $(@F) + +xtbmv_thread_NUU.$(SUFFIX) xtbmv_thread_NUU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=1 -DUNIT $< -o $(@F) + +xtbmv_thread_NUN.$(SUFFIX) xtbmv_thread_NUN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=1 -UUNIT $< -o $(@F) + +xtbmv_thread_TLU.$(SUFFIX) xtbmv_thread_TLU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=2 -DUNIT $< -o $(@F) + +xtbmv_thread_TLN.$(SUFFIX) xtbmv_thread_TLN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=2 -UUNIT $< -o $(@F) + +xtbmv_thread_RLU.$(SUFFIX) xtbmv_thread_RLU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=3 -DUNIT $< -o $(@F) + +xtbmv_thread_RLN.$(SUFFIX) xtbmv_thread_RLN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=3 -UUNIT $< -o $(@F) + +xtbmv_thread_CLU.$(SUFFIX) xtbmv_thread_CLU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=4 -DUNIT $< -o $(@F) + +xtbmv_thread_CLN.$(SUFFIX) xtbmv_thread_CLN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=4 -UUNIT $< -o $(@F) + +xtbmv_thread_NLU.$(SUFFIX) xtbmv_thread_NLU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=1 -DUNIT $< -o $(@F) + +xtbmv_thread_NLN.$(SUFFIX) xtbmv_thread_NLN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=1 -UUNIT $< -o $(@F) + +xtbmv_thread_TUU.$(SUFFIX) xtbmv_thread_TUU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=2 -DUNIT $< -o $(@F) + +xtbmv_thread_TUN.$(SUFFIX) xtbmv_thread_TUN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=2 -UUNIT $< -o $(@F) + +xtbmv_thread_RUU.$(SUFFIX) xtbmv_thread_RUU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=3 -DUNIT $< -o $(@F) + +xtbmv_thread_RUN.$(SUFFIX) xtbmv_thread_RUN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=3 -UUNIT $< -o $(@F) + +xtbmv_thread_CUU.$(SUFFIX) xtbmv_thread_CUU.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=4 -DUNIT $< -o $(@F) + +xtbmv_thread_CUN.$(SUFFIX) xtbmv_thread_CUN.$(PSUFFIX) : tbmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=4 -UUNIT $< -o $(@F) + +stbsv_NUU.$(SUFFIX) stbsv_NUU.$(PSUFFIX) : tbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +stbsv_NUN.$(SUFFIX) stbsv_NUN.$(PSUFFIX) : tbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +stbsv_TLU.$(SUFFIX) 
stbsv_TLU.$(PSUFFIX) : tbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +stbsv_TLN.$(SUFFIX) stbsv_TLN.$(PSUFFIX) : tbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +stbsv_NLU.$(SUFFIX) stbsv_NLU.$(PSUFFIX) : tbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +stbsv_NLN.$(SUFFIX) stbsv_NLN.$(PSUFFIX) : tbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +stbsv_TUU.$(SUFFIX) stbsv_TUU.$(PSUFFIX) : tbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +stbsv_TUN.$(SUFFIX) stbsv_TUN.$(PSUFFIX) : tbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +dtbsv_NUU.$(SUFFIX) dtbsv_NUU.$(PSUFFIX) : tbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +dtbsv_NUN.$(SUFFIX) dtbsv_NUN.$(PSUFFIX) : tbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +dtbsv_TLU.$(SUFFIX) dtbsv_TLU.$(PSUFFIX) : tbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +dtbsv_TLN.$(SUFFIX) dtbsv_TLN.$(PSUFFIX) : tbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +dtbsv_NLU.$(SUFFIX) dtbsv_NLU.$(PSUFFIX) : tbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +dtbsv_NLN.$(SUFFIX) dtbsv_NLN.$(PSUFFIX) : tbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +dtbsv_TUU.$(SUFFIX) dtbsv_TUU.$(PSUFFIX) : tbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +dtbsv_TUN.$(SUFFIX) dtbsv_TUN.$(PSUFFIX) : tbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +qtbsv_NUU.$(SUFFIX) qtbsv_NUU.$(PSUFFIX) : tbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +qtbsv_NUN.$(SUFFIX) qtbsv_NUN.$(PSUFFIX) : tbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +qtbsv_TLU.$(SUFFIX) qtbsv_TLU.$(PSUFFIX) : tbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +qtbsv_TLN.$(SUFFIX) qtbsv_TLN.$(PSUFFIX) : tbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +qtbsv_NLU.$(SUFFIX) qtbsv_NLU.$(PSUFFIX) : tbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +qtbsv_NLN.$(SUFFIX) qtbsv_NLN.$(PSUFFIX) : tbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +qtbsv_TUU.$(SUFFIX) qtbsv_TUU.$(PSUFFIX) : tbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +qtbsv_TUN.$(SUFFIX) qtbsv_TUN.$(PSUFFIX) : tbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +ctbsv_NUU.$(SUFFIX) ctbsv_NUU.$(PSUFFIX) : ztbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) + +ctbsv_NUN.$(SUFFIX) ctbsv_NUN.$(PSUFFIX) : ztbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) + +ctbsv_TLU.$(SUFFIX) ctbsv_TLU.$(PSUFFIX) : ztbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) + +ctbsv_TLN.$(SUFFIX) ctbsv_TLN.$(PSUFFIX) : ztbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE 
-DTRANSA=2 -UUNIT $< -o $(@F) + +ctbsv_RLU.$(SUFFIX) ctbsv_RLU.$(PSUFFIX) : ztbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) + +ctbsv_RLN.$(SUFFIX) ctbsv_RLN.$(PSUFFIX) : ztbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) + +ctbsv_CLU.$(SUFFIX) ctbsv_CLU.$(PSUFFIX) : ztbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) + +ctbsv_CLN.$(SUFFIX) ctbsv_CLN.$(PSUFFIX) : ztbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) + +ctbsv_NLU.$(SUFFIX) ctbsv_NLU.$(PSUFFIX) : ztbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) + +ctbsv_NLN.$(SUFFIX) ctbsv_NLN.$(PSUFFIX) : ztbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) + +ctbsv_TUU.$(SUFFIX) ctbsv_TUU.$(PSUFFIX) : ztbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) + +ctbsv_TUN.$(SUFFIX) ctbsv_TUN.$(PSUFFIX) : ztbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) + +ctbsv_RUU.$(SUFFIX) ctbsv_RUU.$(PSUFFIX) : ztbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) + +ctbsv_RUN.$(SUFFIX) ctbsv_RUN.$(PSUFFIX) : ztbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) + +ctbsv_CUU.$(SUFFIX) ctbsv_CUU.$(PSUFFIX) : ztbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) + +ctbsv_CUN.$(SUFFIX) ctbsv_CUN.$(PSUFFIX) : ztbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) + +ztbsv_NUU.$(SUFFIX) ztbsv_NUU.$(PSUFFIX) : ztbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) + +ztbsv_NUN.$(SUFFIX) ztbsv_NUN.$(PSUFFIX) : ztbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) + +ztbsv_TLU.$(SUFFIX) ztbsv_TLU.$(PSUFFIX) : ztbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) + +ztbsv_TLN.$(SUFFIX) ztbsv_TLN.$(PSUFFIX) : ztbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) + +ztbsv_RLU.$(SUFFIX) ztbsv_RLU.$(PSUFFIX) : ztbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) + +ztbsv_RLN.$(SUFFIX) ztbsv_RLN.$(PSUFFIX) : ztbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) + +ztbsv_CLU.$(SUFFIX) ztbsv_CLU.$(PSUFFIX) : ztbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) + +ztbsv_CLN.$(SUFFIX) ztbsv_CLN.$(PSUFFIX) : ztbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) + +ztbsv_NLU.$(SUFFIX) ztbsv_NLU.$(PSUFFIX) : ztbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) + +ztbsv_NLN.$(SUFFIX) ztbsv_NLN.$(PSUFFIX) : ztbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) + +ztbsv_TUU.$(SUFFIX) ztbsv_TUU.$(PSUFFIX) : ztbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) + +ztbsv_TUN.$(SUFFIX) ztbsv_TUN.$(PSUFFIX) : ztbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) + +ztbsv_RUU.$(SUFFIX) ztbsv_RUU.$(PSUFFIX) : ztbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) 
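+
+# For the complex triangular kernels TRANSA is numeric; judging from the target
+# names, -DTRANSA=1..4 select the N, T, R and C variants respectively (no
+# transpose, transpose, and the two conjugated forms).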
+ +ztbsv_RUN.$(SUFFIX) ztbsv_RUN.$(PSUFFIX) : ztbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) + +ztbsv_CUU.$(SUFFIX) ztbsv_CUU.$(PSUFFIX) : ztbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) + +ztbsv_CUN.$(SUFFIX) ztbsv_CUN.$(PSUFFIX) : ztbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) + +xtbsv_NUU.$(SUFFIX) xtbsv_NUU.$(PSUFFIX) : ztbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) + +xtbsv_NUN.$(SUFFIX) xtbsv_NUN.$(PSUFFIX) : ztbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) + +xtbsv_TLU.$(SUFFIX) xtbsv_TLU.$(PSUFFIX) : ztbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) + +xtbsv_TLN.$(SUFFIX) xtbsv_TLN.$(PSUFFIX) : ztbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) + +xtbsv_RLU.$(SUFFIX) xtbsv_RLU.$(PSUFFIX) : ztbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) + +xtbsv_RLN.$(SUFFIX) xtbsv_RLN.$(PSUFFIX) : ztbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) + +xtbsv_CLU.$(SUFFIX) xtbsv_CLU.$(PSUFFIX) : ztbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) + +xtbsv_CLN.$(SUFFIX) xtbsv_CLN.$(PSUFFIX) : ztbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) + +xtbsv_NLU.$(SUFFIX) xtbsv_NLU.$(PSUFFIX) : ztbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) + +xtbsv_NLN.$(SUFFIX) xtbsv_NLN.$(PSUFFIX) : ztbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) + +xtbsv_TUU.$(SUFFIX) xtbsv_TUU.$(PSUFFIX) : ztbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) + +xtbsv_TUN.$(SUFFIX) xtbsv_TUN.$(PSUFFIX) : ztbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) + +xtbsv_RUU.$(SUFFIX) xtbsv_RUU.$(PSUFFIX) : ztbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) + +xtbsv_RUN.$(SUFFIX) xtbsv_RUN.$(PSUFFIX) : ztbsv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) + +xtbsv_CUU.$(SUFFIX) xtbsv_CUU.$(PSUFFIX) : ztbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) + +xtbsv_CUN.$(SUFFIX) xtbsv_CUN.$(PSUFFIX) : ztbsv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) + +stpmv_NUU.$(SUFFIX) stpmv_NUU.$(PSUFFIX) : tpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +stpmv_NUN.$(SUFFIX) stpmv_NUN.$(PSUFFIX) : tpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +stpmv_TLU.$(SUFFIX) stpmv_TLU.$(PSUFFIX) : tpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +stpmv_TLN.$(SUFFIX) stpmv_TLN.$(PSUFFIX) : tpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +stpmv_NLU.$(SUFFIX) stpmv_NLU.$(PSUFFIX) : tpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +stpmv_NLN.$(SUFFIX) stpmv_NLN.$(PSUFFIX) : tpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +stpmv_TUU.$(SUFFIX) 
stpmv_TUU.$(PSUFFIX) : tpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +stpmv_TUN.$(SUFFIX) stpmv_TUN.$(PSUFFIX) : tpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +dtpmv_NUU.$(SUFFIX) dtpmv_NUU.$(PSUFFIX) : tpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +dtpmv_NUN.$(SUFFIX) dtpmv_NUN.$(PSUFFIX) : tpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +dtpmv_TLU.$(SUFFIX) dtpmv_TLU.$(PSUFFIX) : tpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +dtpmv_TLN.$(SUFFIX) dtpmv_TLN.$(PSUFFIX) : tpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +dtpmv_NLU.$(SUFFIX) dtpmv_NLU.$(PSUFFIX) : tpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +dtpmv_NLN.$(SUFFIX) dtpmv_NLN.$(PSUFFIX) : tpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +dtpmv_TUU.$(SUFFIX) dtpmv_TUU.$(PSUFFIX) : tpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +dtpmv_TUN.$(SUFFIX) dtpmv_TUN.$(PSUFFIX) : tpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +qtpmv_NUU.$(SUFFIX) qtpmv_NUU.$(PSUFFIX) : tpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +qtpmv_NUN.$(SUFFIX) qtpmv_NUN.$(PSUFFIX) : tpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +qtpmv_TLU.$(SUFFIX) qtpmv_TLU.$(PSUFFIX) : tpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +qtpmv_TLN.$(SUFFIX) qtpmv_TLN.$(PSUFFIX) : tpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +qtpmv_NLU.$(SUFFIX) qtpmv_NLU.$(PSUFFIX) : tpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +qtpmv_NLN.$(SUFFIX) qtpmv_NLN.$(PSUFFIX) : tpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +qtpmv_TUU.$(SUFFIX) qtpmv_TUU.$(PSUFFIX) : tpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +qtpmv_TUN.$(SUFFIX) qtpmv_TUN.$(PSUFFIX) : tpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +ctpmv_NUU.$(SUFFIX) ctpmv_NUU.$(PSUFFIX) : ztpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) + +ctpmv_NUN.$(SUFFIX) ctpmv_NUN.$(PSUFFIX) : ztpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) + +ctpmv_TLU.$(SUFFIX) ctpmv_TLU.$(PSUFFIX) : ztpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) + +ctpmv_TLN.$(SUFFIX) ctpmv_TLN.$(PSUFFIX) : ztpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) + +ctpmv_RLU.$(SUFFIX) ctpmv_RLU.$(PSUFFIX) : ztpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) + +ctpmv_RLN.$(SUFFIX) ctpmv_RLN.$(PSUFFIX) : ztpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) + +ctpmv_CLU.$(SUFFIX) ctpmv_CLU.$(PSUFFIX) : ztpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) + +ctpmv_CLN.$(SUFFIX) ctpmv_CLN.$(PSUFFIX) : ztpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) 
-DCOMPLEX -UDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) + +ctpmv_NLU.$(SUFFIX) ctpmv_NLU.$(PSUFFIX) : ztpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) + +ctpmv_NLN.$(SUFFIX) ctpmv_NLN.$(PSUFFIX) : ztpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) + +ctpmv_TUU.$(SUFFIX) ctpmv_TUU.$(PSUFFIX) : ztpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) + +ctpmv_TUN.$(SUFFIX) ctpmv_TUN.$(PSUFFIX) : ztpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) + +ctpmv_RUU.$(SUFFIX) ctpmv_RUU.$(PSUFFIX) : ztpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) + +ctpmv_RUN.$(SUFFIX) ctpmv_RUN.$(PSUFFIX) : ztpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) + +ctpmv_CUU.$(SUFFIX) ctpmv_CUU.$(PSUFFIX) : ztpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) + +ctpmv_CUN.$(SUFFIX) ctpmv_CUN.$(PSUFFIX) : ztpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) + +ztpmv_NUU.$(SUFFIX) ztpmv_NUU.$(PSUFFIX) : ztpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) + +ztpmv_NUN.$(SUFFIX) ztpmv_NUN.$(PSUFFIX) : ztpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) + +ztpmv_TLU.$(SUFFIX) ztpmv_TLU.$(PSUFFIX) : ztpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) + +ztpmv_TLN.$(SUFFIX) ztpmv_TLN.$(PSUFFIX) : ztpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) + +ztpmv_RLU.$(SUFFIX) ztpmv_RLU.$(PSUFFIX) : ztpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) + +ztpmv_RLN.$(SUFFIX) ztpmv_RLN.$(PSUFFIX) : ztpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) + +ztpmv_CLU.$(SUFFIX) ztpmv_CLU.$(PSUFFIX) : ztpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) + +ztpmv_CLN.$(SUFFIX) ztpmv_CLN.$(PSUFFIX) : ztpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) + +ztpmv_NLU.$(SUFFIX) ztpmv_NLU.$(PSUFFIX) : ztpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) + +ztpmv_NLN.$(SUFFIX) ztpmv_NLN.$(PSUFFIX) : ztpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) + +ztpmv_TUU.$(SUFFIX) ztpmv_TUU.$(PSUFFIX) : ztpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) + +ztpmv_TUN.$(SUFFIX) ztpmv_TUN.$(PSUFFIX) : ztpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) + +ztpmv_RUU.$(SUFFIX) ztpmv_RUU.$(PSUFFIX) : ztpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) + +ztpmv_RUN.$(SUFFIX) ztpmv_RUN.$(PSUFFIX) : ztpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) + +ztpmv_CUU.$(SUFFIX) ztpmv_CUU.$(PSUFFIX) : ztpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) + +ztpmv_CUN.$(SUFFIX) ztpmv_CUN.$(PSUFFIX) : ztpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) + +xtpmv_NUU.$(SUFFIX) xtpmv_NUU.$(PSUFFIX) : ztpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE 
-DTRANSA=1 -DUNIT $< -o $(@F) + +xtpmv_NUN.$(SUFFIX) xtpmv_NUN.$(PSUFFIX) : ztpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) + +xtpmv_TLU.$(SUFFIX) xtpmv_TLU.$(PSUFFIX) : ztpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) + +xtpmv_TLN.$(SUFFIX) xtpmv_TLN.$(PSUFFIX) : ztpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) + +xtpmv_RLU.$(SUFFIX) xtpmv_RLU.$(PSUFFIX) : ztpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) + +xtpmv_RLN.$(SUFFIX) xtpmv_RLN.$(PSUFFIX) : ztpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) + +xtpmv_CLU.$(SUFFIX) xtpmv_CLU.$(PSUFFIX) : ztpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) + +xtpmv_CLN.$(SUFFIX) xtpmv_CLN.$(PSUFFIX) : ztpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) + +xtpmv_NLU.$(SUFFIX) xtpmv_NLU.$(PSUFFIX) : ztpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) + +xtpmv_NLN.$(SUFFIX) xtpmv_NLN.$(PSUFFIX) : ztpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) + +xtpmv_TUU.$(SUFFIX) xtpmv_TUU.$(PSUFFIX) : ztpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) + +xtpmv_TUN.$(SUFFIX) xtpmv_TUN.$(PSUFFIX) : ztpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) + +xtpmv_RUU.$(SUFFIX) xtpmv_RUU.$(PSUFFIX) : ztpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) + +xtpmv_RUN.$(SUFFIX) xtpmv_RUN.$(PSUFFIX) : ztpmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) + +xtpmv_CUU.$(SUFFIX) xtpmv_CUU.$(PSUFFIX) : ztpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) + +xtpmv_CUN.$(SUFFIX) xtpmv_CUN.$(PSUFFIX) : ztpmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) + + +stpmv_thread_NUU.$(SUFFIX) stpmv_thread_NUU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -UTRANSA -DUNIT $< -o $(@F) + +stpmv_thread_NUN.$(SUFFIX) stpmv_thread_NUN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -UTRANSA -UUNIT $< -o $(@F) + +stpmv_thread_TLU.$(SUFFIX) stpmv_thread_TLU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -DTRANSA -DUNIT $< -o $(@F) + +stpmv_thread_TLN.$(SUFFIX) stpmv_thread_TLN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -DTRANSA -UUNIT $< -o $(@F) + +stpmv_thread_NLU.$(SUFFIX) stpmv_thread_NLU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -UTRANSA -DUNIT $< -o $(@F) + +stpmv_thread_NLN.$(SUFFIX) stpmv_thread_NLN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -UTRANSA -UUNIT $< -o $(@F) + +stpmv_thread_TUU.$(SUFFIX) stpmv_thread_TUU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -DTRANSA -DUNIT $< -o $(@F) + +stpmv_thread_TUN.$(SUFFIX) stpmv_thread_TUN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -DTRANSA -UUNIT $< -o $(@F) + +dtpmv_thread_NUU.$(SUFFIX) dtpmv_thread_NUU.$(PSUFFIX) : 
tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -UTRANSA -DUNIT $< -o $(@F) + +dtpmv_thread_NUN.$(SUFFIX) dtpmv_thread_NUN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -UTRANSA -UUNIT $< -o $(@F) + +dtpmv_thread_TLU.$(SUFFIX) dtpmv_thread_TLU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -DTRANSA -DUNIT $< -o $(@F) + +dtpmv_thread_TLN.$(SUFFIX) dtpmv_thread_TLN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -DTRANSA -UUNIT $< -o $(@F) + +dtpmv_thread_NLU.$(SUFFIX) dtpmv_thread_NLU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -UTRANSA -DUNIT $< -o $(@F) + +dtpmv_thread_NLN.$(SUFFIX) dtpmv_thread_NLN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -UTRANSA -UUNIT $< -o $(@F) + +dtpmv_thread_TUU.$(SUFFIX) dtpmv_thread_TUU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -DTRANSA -DUNIT $< -o $(@F) + +dtpmv_thread_TUN.$(SUFFIX) dtpmv_thread_TUN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -DTRANSA -UUNIT $< -o $(@F) + +qtpmv_thread_NUU.$(SUFFIX) qtpmv_thread_NUU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -UTRANSA -DUNIT $< -o $(@F) + +qtpmv_thread_NUN.$(SUFFIX) qtpmv_thread_NUN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -UTRANSA -UUNIT $< -o $(@F) + +qtpmv_thread_TLU.$(SUFFIX) qtpmv_thread_TLU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -DTRANSA -DUNIT $< -o $(@F) + +qtpmv_thread_TLN.$(SUFFIX) qtpmv_thread_TLN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -DTRANSA -UUNIT $< -o $(@F) + +qtpmv_thread_NLU.$(SUFFIX) qtpmv_thread_NLU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -UTRANSA -DUNIT $< -o $(@F) + +qtpmv_thread_NLN.$(SUFFIX) qtpmv_thread_NLN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -UTRANSA -UUNIT $< -o $(@F) + +qtpmv_thread_TUU.$(SUFFIX) qtpmv_thread_TUU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -DTRANSA -DUNIT $< -o $(@F) + +qtpmv_thread_TUN.$(SUFFIX) qtpmv_thread_TUN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -DTRANSA -UUNIT $< -o $(@F) + +ctpmv_thread_NUU.$(SUFFIX) ctpmv_thread_NUU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=1 -DUNIT $< -o $(@F) + +ctpmv_thread_NUN.$(SUFFIX) ctpmv_thread_NUN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=1 -UUNIT $< -o $(@F) + +ctpmv_thread_TLU.$(SUFFIX) ctpmv_thread_TLU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=2 -DUNIT $< -o $(@F) + +ctpmv_thread_TLN.$(SUFFIX) ctpmv_thread_TLN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=2 -UUNIT $< -o $(@F) + +ctpmv_thread_RLU.$(SUFFIX) ctpmv_thread_RLU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=3 -DUNIT $< -o $(@F) + +ctpmv_thread_RLN.$(SUFFIX) ctpmv_thread_RLN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE 
-DLOWER -DTRANSA=3 -UUNIT $< -o $(@F) + +ctpmv_thread_CLU.$(SUFFIX) ctpmv_thread_CLU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=4 -DUNIT $< -o $(@F) + +ctpmv_thread_CLN.$(SUFFIX) ctpmv_thread_CLN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=4 -UUNIT $< -o $(@F) + +ctpmv_thread_NLU.$(SUFFIX) ctpmv_thread_NLU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=1 -DUNIT $< -o $(@F) + +ctpmv_thread_NLN.$(SUFFIX) ctpmv_thread_NLN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=1 -UUNIT $< -o $(@F) + +ctpmv_thread_TUU.$(SUFFIX) ctpmv_thread_TUU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=2 -DUNIT $< -o $(@F) + +ctpmv_thread_TUN.$(SUFFIX) ctpmv_thread_TUN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=2 -UUNIT $< -o $(@F) + +ctpmv_thread_RUU.$(SUFFIX) ctpmv_thread_RUU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=3 -DUNIT $< -o $(@F) + +ctpmv_thread_RUN.$(SUFFIX) ctpmv_thread_RUN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=3 -UUNIT $< -o $(@F) + +ctpmv_thread_CUU.$(SUFFIX) ctpmv_thread_CUU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=4 -DUNIT $< -o $(@F) + +ctpmv_thread_CUN.$(SUFFIX) ctpmv_thread_CUN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=4 -UUNIT $< -o $(@F) + +ztpmv_thread_NUU.$(SUFFIX) ztpmv_thread_NUU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=1 -DUNIT $< -o $(@F) + +ztpmv_thread_NUN.$(SUFFIX) ztpmv_thread_NUN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=1 -UUNIT $< -o $(@F) + +ztpmv_thread_TLU.$(SUFFIX) ztpmv_thread_TLU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=2 -DUNIT $< -o $(@F) + +ztpmv_thread_TLN.$(SUFFIX) ztpmv_thread_TLN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=2 -UUNIT $< -o $(@F) + +ztpmv_thread_RLU.$(SUFFIX) ztpmv_thread_RLU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=3 -DUNIT $< -o $(@F) + +ztpmv_thread_RLN.$(SUFFIX) ztpmv_thread_RLN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=3 -UUNIT $< -o $(@F) + +ztpmv_thread_CLU.$(SUFFIX) ztpmv_thread_CLU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=4 -DUNIT $< -o $(@F) + +ztpmv_thread_CLN.$(SUFFIX) ztpmv_thread_CLN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=4 -UUNIT $< -o $(@F) + +ztpmv_thread_NLU.$(SUFFIX) ztpmv_thread_NLU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=1 -DUNIT $< -o $(@F) + +ztpmv_thread_NLN.$(SUFFIX) ztpmv_thread_NLN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=1 -UUNIT $< -o $(@F) + +ztpmv_thread_TUU.$(SUFFIX) ztpmv_thread_TUU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=2 -DUNIT $< -o $(@F) + 
+ztpmv_thread_TUN.$(SUFFIX) ztpmv_thread_TUN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=2 -UUNIT $< -o $(@F) + +ztpmv_thread_RUU.$(SUFFIX) ztpmv_thread_RUU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=3 -DUNIT $< -o $(@F) + +ztpmv_thread_RUN.$(SUFFIX) ztpmv_thread_RUN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=3 -UUNIT $< -o $(@F) + +ztpmv_thread_CUU.$(SUFFIX) ztpmv_thread_CUU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=4 -DUNIT $< -o $(@F) + +ztpmv_thread_CUN.$(SUFFIX) ztpmv_thread_CUN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=4 -UUNIT $< -o $(@F) + +xtpmv_thread_NUU.$(SUFFIX) xtpmv_thread_NUU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=1 -DUNIT $< -o $(@F) + +xtpmv_thread_NUN.$(SUFFIX) xtpmv_thread_NUN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=1 -UUNIT $< -o $(@F) + +xtpmv_thread_TLU.$(SUFFIX) xtpmv_thread_TLU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=2 -DUNIT $< -o $(@F) + +xtpmv_thread_TLN.$(SUFFIX) xtpmv_thread_TLN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=2 -UUNIT $< -o $(@F) + +xtpmv_thread_RLU.$(SUFFIX) xtpmv_thread_RLU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=3 -DUNIT $< -o $(@F) + +xtpmv_thread_RLN.$(SUFFIX) xtpmv_thread_RLN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=3 -UUNIT $< -o $(@F) + +xtpmv_thread_CLU.$(SUFFIX) xtpmv_thread_CLU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=4 -DUNIT $< -o $(@F) + +xtpmv_thread_CLN.$(SUFFIX) xtpmv_thread_CLN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=4 -UUNIT $< -o $(@F) + +xtpmv_thread_NLU.$(SUFFIX) xtpmv_thread_NLU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=1 -DUNIT $< -o $(@F) + +xtpmv_thread_NLN.$(SUFFIX) xtpmv_thread_NLN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=1 -UUNIT $< -o $(@F) + +xtpmv_thread_TUU.$(SUFFIX) xtpmv_thread_TUU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=2 -DUNIT $< -o $(@F) + +xtpmv_thread_TUN.$(SUFFIX) xtpmv_thread_TUN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=2 -UUNIT $< -o $(@F) + +xtpmv_thread_RUU.$(SUFFIX) xtpmv_thread_RUU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=3 -DUNIT $< -o $(@F) + +xtpmv_thread_RUN.$(SUFFIX) xtpmv_thread_RUN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=3 -UUNIT $< -o $(@F) + +xtpmv_thread_CUU.$(SUFFIX) xtpmv_thread_CUU.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=4 -DUNIT $< -o $(@F) + +xtpmv_thread_CUN.$(SUFFIX) xtpmv_thread_CUN.$(PSUFFIX) : tpmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=4 -UUNIT $< -o $(@F) + +stpsv_NUU.$(SUFFIX) 
stpsv_NUU.$(PSUFFIX) : tpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +stpsv_NUN.$(SUFFIX) stpsv_NUN.$(PSUFFIX) : tpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +stpsv_TLU.$(SUFFIX) stpsv_TLU.$(PSUFFIX) : tpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +stpsv_TLN.$(SUFFIX) stpsv_TLN.$(PSUFFIX) : tpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +stpsv_NLU.$(SUFFIX) stpsv_NLU.$(PSUFFIX) : tpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +stpsv_NLN.$(SUFFIX) stpsv_NLN.$(PSUFFIX) : tpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +stpsv_TUU.$(SUFFIX) stpsv_TUU.$(PSUFFIX) : tpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +stpsv_TUN.$(SUFFIX) stpsv_TUN.$(PSUFFIX) : tpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +dtpsv_NUU.$(SUFFIX) dtpsv_NUU.$(PSUFFIX) : tpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +dtpsv_NUN.$(SUFFIX) dtpsv_NUN.$(PSUFFIX) : tpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +dtpsv_TLU.$(SUFFIX) dtpsv_TLU.$(PSUFFIX) : tpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +dtpsv_TLN.$(SUFFIX) dtpsv_TLN.$(PSUFFIX) : tpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +dtpsv_NLU.$(SUFFIX) dtpsv_NLU.$(PSUFFIX) : tpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +dtpsv_NLN.$(SUFFIX) dtpsv_NLN.$(PSUFFIX) : tpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +dtpsv_TUU.$(SUFFIX) dtpsv_TUU.$(PSUFFIX) : tpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +dtpsv_TUN.$(SUFFIX) dtpsv_TUN.$(PSUFFIX) : tpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +qtpsv_NUU.$(SUFFIX) qtpsv_NUU.$(PSUFFIX) : tpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +qtpsv_NUN.$(SUFFIX) qtpsv_NUN.$(PSUFFIX) : tpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +qtpsv_TLU.$(SUFFIX) qtpsv_TLU.$(PSUFFIX) : tpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +qtpsv_TLN.$(SUFFIX) qtpsv_TLN.$(PSUFFIX) : tpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +qtpsv_NLU.$(SUFFIX) qtpsv_NLU.$(PSUFFIX) : tpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +qtpsv_NLN.$(SUFFIX) qtpsv_NLN.$(PSUFFIX) : tpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +qtpsv_TUU.$(SUFFIX) qtpsv_TUU.$(PSUFFIX) : tpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +qtpsv_TUN.$(SUFFIX) qtpsv_TUN.$(PSUFFIX) : tpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +ctpsv_NUU.$(SUFFIX) ctpsv_NUU.$(PSUFFIX) : ztpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F) + +ctpsv_NUN.$(SUFFIX) ctpsv_NUN.$(PSUFFIX) : ztpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F) + +ctpsv_TLU.$(SUFFIX) ctpsv_TLU.$(PSUFFIX) : ztpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F) + +ctpsv_TLN.$(SUFFIX) ctpsv_TLN.$(PSUFFIX) : ztpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE 
-DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F) + +ctpsv_RLU.$(SUFFIX) ctpsv_RLU.$(PSUFFIX) : ztpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F) + +ctpsv_RLN.$(SUFFIX) ctpsv_RLN.$(PSUFFIX) : ztpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F) + +ctpsv_CLU.$(SUFFIX) ctpsv_CLU.$(PSUFFIX) : ztpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F) + +ctpsv_CLN.$(SUFFIX) ctpsv_CLN.$(PSUFFIX) : ztpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F) + +ctpsv_NLU.$(SUFFIX) ctpsv_NLU.$(PSUFFIX) : ztpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F) + +ctpsv_NLN.$(SUFFIX) ctpsv_NLN.$(PSUFFIX) : ztpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F) + +ctpsv_TUU.$(SUFFIX) ctpsv_TUU.$(PSUFFIX) : ztpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F) + +ctpsv_TUN.$(SUFFIX) ctpsv_TUN.$(PSUFFIX) : ztpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F) + +ctpsv_RUU.$(SUFFIX) ctpsv_RUU.$(PSUFFIX) : ztpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F) + +ctpsv_RUN.$(SUFFIX) ctpsv_RUN.$(PSUFFIX) : ztpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F) + +ctpsv_CUU.$(SUFFIX) ctpsv_CUU.$(PSUFFIX) : ztpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F) + +ctpsv_CUN.$(SUFFIX) ctpsv_CUN.$(PSUFFIX) : ztpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F) + +ztpsv_NUU.$(SUFFIX) ztpsv_NUU.$(PSUFFIX) : ztpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F) + +ztpsv_NUN.$(SUFFIX) ztpsv_NUN.$(PSUFFIX) : ztpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F) + +ztpsv_TLU.$(SUFFIX) ztpsv_TLU.$(PSUFFIX) : ztpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F) + +ztpsv_TLN.$(SUFFIX) ztpsv_TLN.$(PSUFFIX) : ztpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F) + +ztpsv_RLU.$(SUFFIX) ztpsv_RLU.$(PSUFFIX) : ztpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F) + +ztpsv_RLN.$(SUFFIX) ztpsv_RLN.$(PSUFFIX) : ztpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F) + +ztpsv_CLU.$(SUFFIX) ztpsv_CLU.$(PSUFFIX) : ztpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F) + +ztpsv_CLN.$(SUFFIX) ztpsv_CLN.$(PSUFFIX) : ztpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F) + +ztpsv_NLU.$(SUFFIX) ztpsv_NLU.$(PSUFFIX) : ztpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F) + +ztpsv_NLN.$(SUFFIX) ztpsv_NLN.$(PSUFFIX) : ztpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F) + +ztpsv_TUU.$(SUFFIX) ztpsv_TUU.$(PSUFFIX) : ztpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F) + +ztpsv_TUN.$(SUFFIX) ztpsv_TUN.$(PSUFFIX) : ztpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F) + +ztpsv_RUU.$(SUFFIX) ztpsv_RUU.$(PSUFFIX) : ztpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F) + 
+ztpsv_RUN.$(SUFFIX) ztpsv_RUN.$(PSUFFIX) : ztpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F) + +ztpsv_CUU.$(SUFFIX) ztpsv_CUU.$(PSUFFIX) : ztpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F) + +ztpsv_CUN.$(SUFFIX) ztpsv_CUN.$(PSUFFIX) : ztpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F) + +xtpsv_NUU.$(SUFFIX) xtpsv_NUU.$(PSUFFIX) : ztpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F) + +xtpsv_NUN.$(SUFFIX) xtpsv_NUN.$(PSUFFIX) : ztpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F) + +xtpsv_TLU.$(SUFFIX) xtpsv_TLU.$(PSUFFIX) : ztpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F) + +xtpsv_TLN.$(SUFFIX) xtpsv_TLN.$(PSUFFIX) : ztpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F) + +xtpsv_RLU.$(SUFFIX) xtpsv_RLU.$(PSUFFIX) : ztpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F) + +xtpsv_RLN.$(SUFFIX) xtpsv_RLN.$(PSUFFIX) : ztpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F) + +xtpsv_CLU.$(SUFFIX) xtpsv_CLU.$(PSUFFIX) : ztpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F) + +xtpsv_CLN.$(SUFFIX) xtpsv_CLN.$(PSUFFIX) : ztpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F) + +xtpsv_NLU.$(SUFFIX) xtpsv_NLU.$(PSUFFIX) : ztpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F) + +xtpsv_NLN.$(SUFFIX) xtpsv_NLN.$(PSUFFIX) : ztpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F) + +xtpsv_TUU.$(SUFFIX) xtpsv_TUU.$(PSUFFIX) : ztpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F) + +xtpsv_TUN.$(SUFFIX) xtpsv_TUN.$(PSUFFIX) : ztpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F) + +xtpsv_RUU.$(SUFFIX) xtpsv_RUU.$(PSUFFIX) : ztpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F) + +xtpsv_RUN.$(SUFFIX) xtpsv_RUN.$(PSUFFIX) : ztpsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F) + +xtpsv_CUU.$(SUFFIX) xtpsv_CUU.$(PSUFFIX) : ztpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F) + +xtpsv_CUN.$(SUFFIX) xtpsv_CUN.$(PSUFFIX) : ztpsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F) + +strmv_NUU.$(SUFFIX) strmv_NUU.$(PSUFFIX) : trmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +strmv_NUN.$(SUFFIX) strmv_NUN.$(PSUFFIX) : trmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +strmv_TLU.$(SUFFIX) strmv_TLU.$(PSUFFIX) : trmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +strmv_TLN.$(SUFFIX) strmv_TLN.$(PSUFFIX) : trmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +strmv_NLU.$(SUFFIX) strmv_NLU.$(PSUFFIX) : trmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +strmv_NLN.$(SUFFIX) strmv_NLN.$(PSUFFIX) : trmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +strmv_TUU.$(SUFFIX) strmv_TUU.$(PSUFFIX) : 
trmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +strmv_TUN.$(SUFFIX) strmv_TUN.$(PSUFFIX) : trmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +dtrmv_NUU.$(SUFFIX) dtrmv_NUU.$(PSUFFIX) : trmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +dtrmv_NUN.$(SUFFIX) dtrmv_NUN.$(PSUFFIX) : trmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +dtrmv_TLU.$(SUFFIX) dtrmv_TLU.$(PSUFFIX) : trmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +dtrmv_TLN.$(SUFFIX) dtrmv_TLN.$(PSUFFIX) : trmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +dtrmv_NLU.$(SUFFIX) dtrmv_NLU.$(PSUFFIX) : trmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +dtrmv_NLN.$(SUFFIX) dtrmv_NLN.$(PSUFFIX) : trmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +dtrmv_TUU.$(SUFFIX) dtrmv_TUU.$(PSUFFIX) : trmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +dtrmv_TUN.$(SUFFIX) dtrmv_TUN.$(PSUFFIX) : trmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +qtrmv_NUU.$(SUFFIX) qtrmv_NUU.$(PSUFFIX) : trmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +qtrmv_NUN.$(SUFFIX) qtrmv_NUN.$(PSUFFIX) : trmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +qtrmv_TLU.$(SUFFIX) qtrmv_TLU.$(PSUFFIX) : trmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +qtrmv_TLN.$(SUFFIX) qtrmv_TLN.$(PSUFFIX) : trmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +qtrmv_NLU.$(SUFFIX) qtrmv_NLU.$(PSUFFIX) : trmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +qtrmv_NLN.$(SUFFIX) qtrmv_NLN.$(PSUFFIX) : trmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +qtrmv_TUU.$(SUFFIX) qtrmv_TUU.$(PSUFFIX) : trmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +qtrmv_TUN.$(SUFFIX) qtrmv_TUN.$(PSUFFIX) : trmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +ctrmv_NUU.$(SUFFIX) ctrmv_NUU.$(PSUFFIX) : ztrmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) + +ctrmv_NUN.$(SUFFIX) ctrmv_NUN.$(PSUFFIX) : ztrmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) + +ctrmv_TLU.$(SUFFIX) ctrmv_TLU.$(PSUFFIX) : ztrmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) + +ctrmv_TLN.$(SUFFIX) ctrmv_TLN.$(PSUFFIX) : ztrmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) + +ctrmv_RLU.$(SUFFIX) ctrmv_RLU.$(PSUFFIX) : ztrmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) + +ctrmv_RLN.$(SUFFIX) ctrmv_RLN.$(PSUFFIX) : ztrmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) + +ctrmv_CLU.$(SUFFIX) ctrmv_CLU.$(PSUFFIX) : ztrmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) + +ctrmv_CLN.$(SUFFIX) ctrmv_CLN.$(PSUFFIX) : ztrmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 
-UUNIT $< -o $(@F) + +ctrmv_NLU.$(SUFFIX) ctrmv_NLU.$(PSUFFIX) : ztrmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) + +ctrmv_NLN.$(SUFFIX) ctrmv_NLN.$(PSUFFIX) : ztrmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) + +ctrmv_TUU.$(SUFFIX) ctrmv_TUU.$(PSUFFIX) : ztrmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) + +ctrmv_TUN.$(SUFFIX) ctrmv_TUN.$(PSUFFIX) : ztrmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) + +ctrmv_RUU.$(SUFFIX) ctrmv_RUU.$(PSUFFIX) : ztrmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) + +ctrmv_RUN.$(SUFFIX) ctrmv_RUN.$(PSUFFIX) : ztrmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) + +ctrmv_CUU.$(SUFFIX) ctrmv_CUU.$(PSUFFIX) : ztrmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) + +ctrmv_CUN.$(SUFFIX) ctrmv_CUN.$(PSUFFIX) : ztrmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) + +ztrmv_NUU.$(SUFFIX) ztrmv_NUU.$(PSUFFIX) : ztrmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) + +ztrmv_NUN.$(SUFFIX) ztrmv_NUN.$(PSUFFIX) : ztrmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) + +ztrmv_TLU.$(SUFFIX) ztrmv_TLU.$(PSUFFIX) : ztrmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) + +ztrmv_TLN.$(SUFFIX) ztrmv_TLN.$(PSUFFIX) : ztrmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) + +ztrmv_RLU.$(SUFFIX) ztrmv_RLU.$(PSUFFIX) : ztrmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) + +ztrmv_RLN.$(SUFFIX) ztrmv_RLN.$(PSUFFIX) : ztrmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) + +ztrmv_CLU.$(SUFFIX) ztrmv_CLU.$(PSUFFIX) : ztrmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) + +ztrmv_CLN.$(SUFFIX) ztrmv_CLN.$(PSUFFIX) : ztrmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) + +ztrmv_NLU.$(SUFFIX) ztrmv_NLU.$(PSUFFIX) : ztrmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) + +ztrmv_NLN.$(SUFFIX) ztrmv_NLN.$(PSUFFIX) : ztrmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) + +ztrmv_TUU.$(SUFFIX) ztrmv_TUU.$(PSUFFIX) : ztrmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) + +ztrmv_TUN.$(SUFFIX) ztrmv_TUN.$(PSUFFIX) : ztrmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) + +ztrmv_RUU.$(SUFFIX) ztrmv_RUU.$(PSUFFIX) : ztrmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) + +ztrmv_RUN.$(SUFFIX) ztrmv_RUN.$(PSUFFIX) : ztrmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) + +ztrmv_CUU.$(SUFFIX) ztrmv_CUU.$(PSUFFIX) : ztrmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) + +ztrmv_CUN.$(SUFFIX) ztrmv_CUN.$(PSUFFIX) : ztrmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) + +xtrmv_NUU.$(SUFFIX) xtrmv_NUU.$(PSUFFIX) : ztrmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) + 
+xtrmv_NUN.$(SUFFIX) xtrmv_NUN.$(PSUFFIX) : ztrmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) + +xtrmv_TLU.$(SUFFIX) xtrmv_TLU.$(PSUFFIX) : ztrmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) + +xtrmv_TLN.$(SUFFIX) xtrmv_TLN.$(PSUFFIX) : ztrmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) + +xtrmv_RLU.$(SUFFIX) xtrmv_RLU.$(PSUFFIX) : ztrmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) + +xtrmv_RLN.$(SUFFIX) xtrmv_RLN.$(PSUFFIX) : ztrmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) + +xtrmv_CLU.$(SUFFIX) xtrmv_CLU.$(PSUFFIX) : ztrmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) + +xtrmv_CLN.$(SUFFIX) xtrmv_CLN.$(PSUFFIX) : ztrmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) + +xtrmv_NLU.$(SUFFIX) xtrmv_NLU.$(PSUFFIX) : ztrmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -DUNIT $< -o $(@F) + +xtrmv_NLN.$(SUFFIX) xtrmv_NLN.$(PSUFFIX) : ztrmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=1 -UUNIT $< -o $(@F) + +xtrmv_TUU.$(SUFFIX) xtrmv_TUU.$(PSUFFIX) : ztrmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -DUNIT $< -o $(@F) + +xtrmv_TUN.$(SUFFIX) xtrmv_TUN.$(PSUFFIX) : ztrmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=2 -UUNIT $< -o $(@F) + +xtrmv_RUU.$(SUFFIX) xtrmv_RUU.$(PSUFFIX) : ztrmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -DUNIT $< -o $(@F) + +xtrmv_RUN.$(SUFFIX) xtrmv_RUN.$(PSUFFIX) : ztrmv_U.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=3 -UUNIT $< -o $(@F) + +xtrmv_CUU.$(SUFFIX) xtrmv_CUU.$(PSUFFIX) : ztrmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -DUNIT $< -o $(@F) + +xtrmv_CUN.$(SUFFIX) xtrmv_CUN.$(PSUFFIX) : ztrmv_L.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA=4 -UUNIT $< -o $(@F) + +strmv_thread_NUU.$(SUFFIX) strmv_thread_NUU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -UTRANSA -DUNIT $< -o $(@F) + +strmv_thread_NUN.$(SUFFIX) strmv_thread_NUN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -UTRANSA -UUNIT $< -o $(@F) + +strmv_thread_TLU.$(SUFFIX) strmv_thread_TLU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -DTRANSA -DUNIT $< -o $(@F) + +strmv_thread_TLN.$(SUFFIX) strmv_thread_TLN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -DTRANSA -UUNIT $< -o $(@F) + +strmv_thread_NLU.$(SUFFIX) strmv_thread_NLU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -UTRANSA -DUNIT $< -o $(@F) + +strmv_thread_NLN.$(SUFFIX) strmv_thread_NLN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER -UTRANSA -UUNIT $< -o $(@F) + +strmv_thread_TUU.$(SUFFIX) strmv_thread_TUU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -DTRANSA -DUNIT $< -o $(@F) + +strmv_thread_TUN.$(SUFFIX) strmv_thread_TUN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER -DTRANSA -UUNIT $< -o $(@F) + +dtrmv_thread_NUU.$(SUFFIX) dtrmv_thread_NUU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c 
$(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -UTRANSA -DUNIT $< -o $(@F) + +dtrmv_thread_NUN.$(SUFFIX) dtrmv_thread_NUN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -UTRANSA -UUNIT $< -o $(@F) + +dtrmv_thread_TLU.$(SUFFIX) dtrmv_thread_TLU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -DTRANSA -DUNIT $< -o $(@F) + +dtrmv_thread_TLN.$(SUFFIX) dtrmv_thread_TLN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -DTRANSA -UUNIT $< -o $(@F) + +dtrmv_thread_NLU.$(SUFFIX) dtrmv_thread_NLU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -UTRANSA -DUNIT $< -o $(@F) + +dtrmv_thread_NLN.$(SUFFIX) dtrmv_thread_NLN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER -UTRANSA -UUNIT $< -o $(@F) + +dtrmv_thread_TUU.$(SUFFIX) dtrmv_thread_TUU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -DTRANSA -DUNIT $< -o $(@F) + +dtrmv_thread_TUN.$(SUFFIX) dtrmv_thread_TUN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER -DTRANSA -UUNIT $< -o $(@F) + +qtrmv_thread_NUU.$(SUFFIX) qtrmv_thread_NUU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -UTRANSA -DUNIT $< -o $(@F) + +qtrmv_thread_NUN.$(SUFFIX) qtrmv_thread_NUN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -UTRANSA -UUNIT $< -o $(@F) + +qtrmv_thread_TLU.$(SUFFIX) qtrmv_thread_TLU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -DTRANSA -DUNIT $< -o $(@F) + +qtrmv_thread_TLN.$(SUFFIX) qtrmv_thread_TLN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -DTRANSA -UUNIT $< -o $(@F) + +qtrmv_thread_NLU.$(SUFFIX) qtrmv_thread_NLU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -UTRANSA -DUNIT $< -o $(@F) + +qtrmv_thread_NLN.$(SUFFIX) qtrmv_thread_NLN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER -UTRANSA -UUNIT $< -o $(@F) + +qtrmv_thread_TUU.$(SUFFIX) qtrmv_thread_TUU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -DTRANSA -DUNIT $< -o $(@F) + +qtrmv_thread_TUN.$(SUFFIX) qtrmv_thread_TUN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER -DTRANSA -UUNIT $< -o $(@F) + +ctrmv_thread_NUU.$(SUFFIX) ctrmv_thread_NUU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=1 -DUNIT $< -o $(@F) + +ctrmv_thread_NUN.$(SUFFIX) ctrmv_thread_NUN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=1 -UUNIT $< -o $(@F) + +ctrmv_thread_TLU.$(SUFFIX) ctrmv_thread_TLU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=2 -DUNIT $< -o $(@F) + +ctrmv_thread_TLN.$(SUFFIX) ctrmv_thread_TLN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=2 -UUNIT $< -o $(@F) + +ctrmv_thread_RLU.$(SUFFIX) ctrmv_thread_RLU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=3 -DUNIT $< -o $(@F) + +ctrmv_thread_RLN.$(SUFFIX) ctrmv_thread_RLN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=3 -UUNIT $< -o $(@F) + 
+ctrmv_thread_CLU.$(SUFFIX) ctrmv_thread_CLU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=4 -DUNIT $< -o $(@F) + +ctrmv_thread_CLN.$(SUFFIX) ctrmv_thread_CLN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=4 -UUNIT $< -o $(@F) + +ctrmv_thread_NLU.$(SUFFIX) ctrmv_thread_NLU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=1 -DUNIT $< -o $(@F) + +ctrmv_thread_NLN.$(SUFFIX) ctrmv_thread_NLN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DTRANSA=1 -UUNIT $< -o $(@F) + +ctrmv_thread_TUU.$(SUFFIX) ctrmv_thread_TUU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=2 -DUNIT $< -o $(@F) + +ctrmv_thread_TUN.$(SUFFIX) ctrmv_thread_TUN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=2 -UUNIT $< -o $(@F) + +ctrmv_thread_RUU.$(SUFFIX) ctrmv_thread_RUU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=3 -DUNIT $< -o $(@F) + +ctrmv_thread_RUN.$(SUFFIX) ctrmv_thread_RUN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=3 -UUNIT $< -o $(@F) + +ctrmv_thread_CUU.$(SUFFIX) ctrmv_thread_CUU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=4 -DUNIT $< -o $(@F) + +ctrmv_thread_CUN.$(SUFFIX) ctrmv_thread_CUN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DTRANSA=4 -UUNIT $< -o $(@F) + +ztrmv_thread_NUU.$(SUFFIX) ztrmv_thread_NUU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=1 -DUNIT $< -o $(@F) + +ztrmv_thread_NUN.$(SUFFIX) ztrmv_thread_NUN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=1 -UUNIT $< -o $(@F) + +ztrmv_thread_TLU.$(SUFFIX) ztrmv_thread_TLU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=2 -DUNIT $< -o $(@F) + +ztrmv_thread_TLN.$(SUFFIX) ztrmv_thread_TLN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=2 -UUNIT $< -o $(@F) + +ztrmv_thread_RLU.$(SUFFIX) ztrmv_thread_RLU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=3 -DUNIT $< -o $(@F) + +ztrmv_thread_RLN.$(SUFFIX) ztrmv_thread_RLN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=3 -UUNIT $< -o $(@F) + +ztrmv_thread_CLU.$(SUFFIX) ztrmv_thread_CLU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=4 -DUNIT $< -o $(@F) + +ztrmv_thread_CLN.$(SUFFIX) ztrmv_thread_CLN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=4 -UUNIT $< -o $(@F) + +ztrmv_thread_NLU.$(SUFFIX) ztrmv_thread_NLU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=1 -DUNIT $< -o $(@F) + +ztrmv_thread_NLN.$(SUFFIX) ztrmv_thread_NLN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DTRANSA=1 -UUNIT $< -o $(@F) + +ztrmv_thread_TUU.$(SUFFIX) ztrmv_thread_TUU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=2 -DUNIT $< -o $(@F) + +ztrmv_thread_TUN.$(SUFFIX) 
ztrmv_thread_TUN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=2 -UUNIT $< -o $(@F) + +ztrmv_thread_RUU.$(SUFFIX) ztrmv_thread_RUU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=3 -DUNIT $< -o $(@F) + +ztrmv_thread_RUN.$(SUFFIX) ztrmv_thread_RUN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=3 -UUNIT $< -o $(@F) + +ztrmv_thread_CUU.$(SUFFIX) ztrmv_thread_CUU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=4 -DUNIT $< -o $(@F) + +ztrmv_thread_CUN.$(SUFFIX) ztrmv_thread_CUN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DTRANSA=4 -UUNIT $< -o $(@F) + +xtrmv_thread_NUU.$(SUFFIX) xtrmv_thread_NUU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=1 -DUNIT $< -o $(@F) + +xtrmv_thread_NUN.$(SUFFIX) xtrmv_thread_NUN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=1 -UUNIT $< -o $(@F) + +xtrmv_thread_TLU.$(SUFFIX) xtrmv_thread_TLU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=2 -DUNIT $< -o $(@F) + +xtrmv_thread_TLN.$(SUFFIX) xtrmv_thread_TLN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=2 -UUNIT $< -o $(@F) + +xtrmv_thread_RLU.$(SUFFIX) xtrmv_thread_RLU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=3 -DUNIT $< -o $(@F) + +xtrmv_thread_RLN.$(SUFFIX) xtrmv_thread_RLN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=3 -UUNIT $< -o $(@F) + +xtrmv_thread_CLU.$(SUFFIX) xtrmv_thread_CLU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=4 -DUNIT $< -o $(@F) + +xtrmv_thread_CLN.$(SUFFIX) xtrmv_thread_CLN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=4 -UUNIT $< -o $(@F) + +xtrmv_thread_NLU.$(SUFFIX) xtrmv_thread_NLU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=1 -DUNIT $< -o $(@F) + +xtrmv_thread_NLN.$(SUFFIX) xtrmv_thread_NLN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DTRANSA=1 -UUNIT $< -o $(@F) + +xtrmv_thread_TUU.$(SUFFIX) xtrmv_thread_TUU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=2 -DUNIT $< -o $(@F) + +xtrmv_thread_TUN.$(SUFFIX) xtrmv_thread_TUN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=2 -UUNIT $< -o $(@F) + +xtrmv_thread_RUU.$(SUFFIX) xtrmv_thread_RUU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=3 -DUNIT $< -o $(@F) + +xtrmv_thread_RUN.$(SUFFIX) xtrmv_thread_RUN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=3 -UUNIT $< -o $(@F) + +xtrmv_thread_CUU.$(SUFFIX) xtrmv_thread_CUU.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=4 -DUNIT $< -o $(@F) + +xtrmv_thread_CUN.$(SUFFIX) xtrmv_thread_CUN.$(PSUFFIX) : trmv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DTRANSA=4 -UUNIT $< -o $(@F) + +strsv_NUU.$(SUFFIX) strsv_NUU.$(PSUFFIX) : trsv_U.c 
../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +strsv_NUN.$(SUFFIX) strsv_NUN.$(PSUFFIX) : trsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +strsv_TLU.$(SUFFIX) strsv_TLU.$(PSUFFIX) : trsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +strsv_TLN.$(SUFFIX) strsv_TLN.$(PSUFFIX) : trsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +strsv_NLU.$(SUFFIX) strsv_NLU.$(PSUFFIX) : trsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +strsv_NLN.$(SUFFIX) strsv_NLN.$(PSUFFIX) : trsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +strsv_TUU.$(SUFFIX) strsv_TUU.$(PSUFFIX) : trsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +strsv_TUN.$(SUFFIX) strsv_TUN.$(PSUFFIX) : trsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +dtrsv_NUU.$(SUFFIX) dtrsv_NUU.$(PSUFFIX) : trsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +dtrsv_NUN.$(SUFFIX) dtrsv_NUN.$(PSUFFIX) : trsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +dtrsv_TLU.$(SUFFIX) dtrsv_TLU.$(PSUFFIX) : trsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +dtrsv_TLN.$(SUFFIX) dtrsv_TLN.$(PSUFFIX) : trsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +dtrsv_NLU.$(SUFFIX) dtrsv_NLU.$(PSUFFIX) : trsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +dtrsv_NLN.$(SUFFIX) dtrsv_NLN.$(PSUFFIX) : trsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +dtrsv_TUU.$(SUFFIX) dtrsv_TUU.$(PSUFFIX) : trsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +dtrsv_TUN.$(SUFFIX) dtrsv_TUN.$(PSUFFIX) : trsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +qtrsv_NUU.$(SUFFIX) qtrsv_NUU.$(PSUFFIX) : trsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +qtrsv_NUN.$(SUFFIX) qtrsv_NUN.$(PSUFFIX) : trsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +qtrsv_TLU.$(SUFFIX) qtrsv_TLU.$(PSUFFIX) : trsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +qtrsv_TLN.$(SUFFIX) qtrsv_TLN.$(PSUFFIX) : trsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +qtrsv_NLU.$(SUFFIX) qtrsv_NLU.$(PSUFFIX) : trsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -UTRANSA -DUNIT $< -o $(@F) + +qtrsv_NLN.$(SUFFIX) qtrsv_NLN.$(PSUFFIX) : trsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -UTRANSA -UUNIT $< -o $(@F) + +qtrsv_TUU.$(SUFFIX) qtrsv_TUU.$(PSUFFIX) : trsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DTRANSA -DUNIT $< -o $(@F) + +qtrsv_TUN.$(SUFFIX) qtrsv_TUN.$(PSUFFIX) : trsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DTRANSA -UUNIT $< -o $(@F) + +ctrsv_NUU.$(SUFFIX) ctrsv_NUU.$(PSUFFIX) : ztrsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F) + +ctrsv_NUN.$(SUFFIX) ctrsv_NUN.$(PSUFFIX) : ztrsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F) + +ctrsv_TLU.$(SUFFIX) ctrsv_TLU.$(PSUFFIX) : ztrsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F) + +ctrsv_TLN.$(SUFFIX) ctrsv_TLN.$(PSUFFIX) : ztrsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o 
$(@F) + +ctrsv_RLU.$(SUFFIX) ctrsv_RLU.$(PSUFFIX) : ztrsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F) + +ctrsv_RLN.$(SUFFIX) ctrsv_RLN.$(PSUFFIX) : ztrsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F) + +ctrsv_CLU.$(SUFFIX) ctrsv_CLU.$(PSUFFIX) : ztrsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F) + +ctrsv_CLN.$(SUFFIX) ctrsv_CLN.$(PSUFFIX) : ztrsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F) + +ctrsv_NLU.$(SUFFIX) ctrsv_NLU.$(PSUFFIX) : ztrsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F) + +ctrsv_NLN.$(SUFFIX) ctrsv_NLN.$(PSUFFIX) : ztrsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F) + +ctrsv_TUU.$(SUFFIX) ctrsv_TUU.$(PSUFFIX) : ztrsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F) + +ctrsv_TUN.$(SUFFIX) ctrsv_TUN.$(PSUFFIX) : ztrsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F) + +ctrsv_RUU.$(SUFFIX) ctrsv_RUU.$(PSUFFIX) : ztrsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F) + +ctrsv_RUN.$(SUFFIX) ctrsv_RUN.$(PSUFFIX) : ztrsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F) + +ctrsv_CUU.$(SUFFIX) ctrsv_CUU.$(PSUFFIX) : ztrsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F) + +ctrsv_CUN.$(SUFFIX) ctrsv_CUN.$(PSUFFIX) : ztrsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F) + +ztrsv_NUU.$(SUFFIX) ztrsv_NUU.$(PSUFFIX) : ztrsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F) + +ztrsv_NUN.$(SUFFIX) ztrsv_NUN.$(PSUFFIX) : ztrsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F) + +ztrsv_TLU.$(SUFFIX) ztrsv_TLU.$(PSUFFIX) : ztrsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F) + +ztrsv_TLN.$(SUFFIX) ztrsv_TLN.$(PSUFFIX) : ztrsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F) + +ztrsv_RLU.$(SUFFIX) ztrsv_RLU.$(PSUFFIX) : ztrsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F) + +ztrsv_RLN.$(SUFFIX) ztrsv_RLN.$(PSUFFIX) : ztrsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F) + +ztrsv_CLU.$(SUFFIX) ztrsv_CLU.$(PSUFFIX) : ztrsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F) + +ztrsv_CLN.$(SUFFIX) ztrsv_CLN.$(PSUFFIX) : ztrsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F) + +ztrsv_NLU.$(SUFFIX) ztrsv_NLU.$(PSUFFIX) : ztrsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F) + +ztrsv_NLN.$(SUFFIX) ztrsv_NLN.$(PSUFFIX) : ztrsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F) + +ztrsv_TUU.$(SUFFIX) ztrsv_TUU.$(PSUFFIX) : ztrsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F) + +ztrsv_TUN.$(SUFFIX) ztrsv_TUN.$(PSUFFIX) : ztrsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F) + +ztrsv_RUU.$(SUFFIX) ztrsv_RUU.$(PSUFFIX) : ztrsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F) + +ztrsv_RUN.$(SUFFIX) ztrsv_RUN.$(PSUFFIX) : 
ztrsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F) + +ztrsv_CUU.$(SUFFIX) ztrsv_CUU.$(PSUFFIX) : ztrsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F) + +ztrsv_CUN.$(SUFFIX) ztrsv_CUN.$(PSUFFIX) : ztrsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F) + +xtrsv_NUU.$(SUFFIX) xtrsv_NUU.$(PSUFFIX) : ztrsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F) + +xtrsv_NUN.$(SUFFIX) xtrsv_NUN.$(PSUFFIX) : ztrsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F) + +xtrsv_TLU.$(SUFFIX) xtrsv_TLU.$(PSUFFIX) : ztrsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F) + +xtrsv_TLN.$(SUFFIX) xtrsv_TLN.$(PSUFFIX) : ztrsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F) + +xtrsv_RLU.$(SUFFIX) xtrsv_RLU.$(PSUFFIX) : ztrsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F) + +xtrsv_RLN.$(SUFFIX) xtrsv_RLN.$(PSUFFIX) : ztrsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F) + +xtrsv_CLU.$(SUFFIX) xtrsv_CLU.$(PSUFFIX) : ztrsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F) + +xtrsv_CLN.$(SUFFIX) xtrsv_CLN.$(PSUFFIX) : ztrsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F) + +xtrsv_NLU.$(SUFFIX) xtrsv_NLU.$(PSUFFIX) : ztrsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=1 -DUNIT $< -o $(@F) + +xtrsv_NLN.$(SUFFIX) xtrsv_NLN.$(PSUFFIX) : ztrsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=1 -UUNIT $< -o $(@F) + +xtrsv_TUU.$(SUFFIX) xtrsv_TUU.$(PSUFFIX) : ztrsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=2 -DUNIT $< -o $(@F) + +xtrsv_TUN.$(SUFFIX) xtrsv_TUN.$(PSUFFIX) : ztrsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=2 -UUNIT $< -o $(@F) + +xtrsv_RUU.$(SUFFIX) xtrsv_RUU.$(PSUFFIX) : ztrsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=3 -DUNIT $< -o $(@F) + +xtrsv_RUN.$(SUFFIX) xtrsv_RUN.$(PSUFFIX) : ztrsv_U.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=3 -UUNIT $< -o $(@F) + +xtrsv_CUU.$(SUFFIX) xtrsv_CUU.$(PSUFFIX) : ztrsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=4 -DUNIT $< -o $(@F) + +xtrsv_CUN.$(SUFFIX) xtrsv_CUN.$(PSUFFIX) : ztrsv_L.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F) + +include ../../Makefile.tail diff --git a/driver/level2/gbmv_k.c b/driver/level2/gbmv_k.c new file mode 100644 index 0000000000..317d420471 --- /dev/null +++ b/driver/level2/gbmv_k.c @@ -0,0 +1,105 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
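In the trsv rules above the target names encode the compile-time variant: the s/d/q/c/z/x prefix selects -UDOUBLE, -DDOUBLE or -DXDOUBLE (plus -DCOMPLEX for c/z/x), the first letter of the _XYZ suffix tracks the transpose flag (-UTRANSA or -DTRANSA for the real kernels, -DTRANSA=1/2/3/4 for the complex N/T/R/C targets), the U/L letter decides whether trsv_U.c/ztrsv_U.c or trsv_L.c/ztrsv_L.c is compiled (swapped for the transposed targets, since a transposed lower solve traverses the matrix like an upper one), and the trailing U/N maps to -DUNIT or -UUNIT. A toy illustration of the mechanism only, not the real trsv source; the N/T/R/C meanings are read off the target names above:

/* Toy illustration of the compile-time dispatch used by the rules above:
   one source file, many objects, selected entirely by -D/-U flags. */
#include <stdio.h>

#ifndef TRANSA
#define TRANSA 1
#endif

int main(void) {
#if   TRANSA == 1
  const char *trans = "N";          /* plain solve                       */
#elif TRANSA == 2
  const char *trans = "T";          /* transposed solve                  */
#elif TRANSA == 3
  const char *trans = "R";          /* conjugated, not transposed        */
#else
  const char *trans = "C";          /* conjugate-transposed              */
#endif
#ifdef UNIT
  const char *diag = "unit";
#else
  const char *diag = "non-unit";
#endif
  printf("variant: trans=%s, %s diagonal\n", trans, diag);
  return 0;
}

Building this toy file with, say, -DTRANSA=4 -UUNIT mirrors the flag set of the ctrsv_C?N targets above (file name and flags here are illustrative only).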
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +#ifndef TRANS +#define M m +#define N n +#else +#define N m +#define M n +#endif + +void CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT alpha, + FLOAT *a, BLASLONG lda, + FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, void *buffer){ + + BLASLONG i, offset_u, offset_l, start, end, length; + FLOAT *X = x; + FLOAT *Y = y; + FLOAT *gemvbuffer = (FLOAT *)buffer; + FLOAT *bufferY = gemvbuffer; + FLOAT *bufferX = gemvbuffer; + + if (incy != 1) { + Y = bufferY; + bufferX = (FLOAT *)(((BLASLONG)bufferY + M * sizeof(FLOAT) + 4095) & ~4095); + gemvbuffer = bufferX; + COPY_K(M, y, incy, Y, 1); + } + + if (incx != 1) { + X = bufferX; + gemvbuffer = (FLOAT *)(((BLASLONG)bufferX + N * sizeof(FLOAT) + 4095) & ~4095); + COPY_K(N, x, incx, X, 1); + } + + offset_u = ku; + offset_l = ku + m; + + for (i = 0; i < MIN(n, m + ku); i++) { + + start = MAX(offset_u, 0); + end = MIN(offset_l, ku + kl + 1); + + length = end - start; + +#ifndef TRANS + AXPYU_K(length, 0, 0, + alpha * X[i], + a + start, 1, Y + start - offset_u, 1, NULL, 0); +#else + Y[i] += alpha * DOTU_K(length, a + start, 1, X + start - offset_u, 1); +#endif + + offset_u --; + offset_l --; + + a += lda; + } + + if (incy != 1) { + COPY_K(M, Y, 1, y, incy); + } + + return; +} + diff --git a/driver/level2/gbmv_thread.c b/driver/level2/gbmv_thread.c new file mode 100644 index 0000000000..18aae26aea --- /dev/null +++ b/driver/level2/gbmv_thread.c @@ -0,0 +1,294 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
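gbmv_k.c above walks the band one column at a time: for column i the stored entries occupy band rows max(ku - i, 0) through min(ku + m - i, ku + kl + 1) - 1, which the loop tracks with the decrementing offset_u/offset_l pair, and the first of those entries corresponds to matrix row max(i - ku, 0). A self-contained sketch of that index arithmetic; the helper name is illustrative and not part of the library:

#include <stdio.h>

/* For column i of an m x n band matrix with ku superdiagonals and kl
   subdiagonals, compute which slice of the stored band column is in use
   and which matrix row that slice starts at (mirrors the offset_u /
   offset_l bookkeeping in gbmv_k.c). */
static void band_column_segment(int m, int ku, int kl, int i,
                                int *start, int *length, int *first_row) {
  int offset_u = ku - i;           /* band row holding matrix row 0        */
  int offset_l = ku - i + m;       /* one past band row of matrix row m-1  */
  int end;

  *start     = offset_u > 0 ? offset_u : 0;
  end        = offset_l < ku + kl + 1 ? offset_l : ku + kl + 1;
  *length    = end - *start;
  *first_row = *start - offset_u;  /* = max(i - ku, 0) */
}

int main(void) {
  int m = 6, n = 6, ku = 2, kl = 1, i;

  for (i = 0; i < n; i++) {
    int start, length, first_row;
    band_column_segment(m, ku, kl, i, &start, &length, &first_row);
    printf("col %d: band rows [%d, %d), matrix rows [%d, %d)\n",
           i, start, start + length, first_row, first_row + length);
  }
  return 0;
}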
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +#if !defined(CONJ) && !defined(XCONJ) +#define MYAXPY AXPYU_K +#define MYDOT DOTU_K +#elif defined(CONJ) && !defined(XCONJ) +#define MYAXPY AXPYC_K +#define MYDOT DOTC_K +#elif !defined(CONJ) && defined(XCONJ) +#define MYAXPY AXPYU_K +#define MYDOT DOTC_K +#else +#define MYAXPY AXPYC_K +#define MYDOT DOTU_K +#endif + +static int gbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ + + FLOAT *a, *x, *y; + BLASLONG lda, incx; + BLASLONG n_from, n_to; + BLASLONG i, offset_l, offset_u, uu, ll, ku, kl; +#ifdef TRANSA +#ifndef COMPLEX + FLOAT result; +#else + FLOAT _Complex result; +#endif +#endif + + a = (FLOAT *)args -> a; + x = (FLOAT *)args -> b; + y = (FLOAT *)args -> c; + + lda = args -> lda; + incx = args -> ldb; + ku = args -> ldc; + kl = args -> ldd; + + n_from = 0; + n_to = args -> n; + + if (range_m) y += *range_m * COMPSIZE; + + if (range_n) { + n_from = *(range_n + 0); + n_to = *(range_n + 1); + + a += n_from * lda * COMPSIZE; + } + + n_to = MIN(n_to, args -> m + ku); + +#ifdef TRANSA + if (incx != 1) { + COPY_K(args -> m, x, incx, buffer, 1); + + x = buffer; + buffer += ((COMPSIZE * args -> m + 1023) & ~1023); + } +#endif + + SCAL_K( +#ifndef TRANSA + args -> m, +#else + args -> n, +#endif + 0, 0, ZERO, +#ifdef COMPLEX + ZERO, +#endif + y, 1, NULL, 0, NULL, 0); + + offset_u = ku - n_from; + offset_l = ku - n_from + args -> m; + +#ifndef TRANSA + x += n_from * incx * COMPSIZE; + y -= offset_u * COMPSIZE; +#else + x -= offset_u * COMPSIZE; + y += n_from * COMPSIZE; +#endif + + for (i = n_from; i < n_to; i++) { + + uu = MAX(offset_u, 0); + ll = MIN(offset_l, ku + kl + 1); + +#ifndef TRANSA + MYAXPY(ll - uu, 0, 0, + *(x + 0), +#ifdef COMPLEX +#ifndef XCONJ + *(x + 1), +#else + -*(x + 1), +#endif +#endif + a + uu * COMPSIZE, 1, y + uu * COMPSIZE, 1, NULL, 0); + + x += incx * COMPSIZE; +#else + result = MYDOT(ll - uu, a + uu * COMPSIZE, 1, x + uu * COMPSIZE, 1); + +#ifndef COMPLEX + *y = result; +#else + *(y + 0) += CREAL(result); +#ifndef XCONJ + *(y + 1) += CIMAG(result); 
+#else + *(y + 1) -= CIMAG(result); +#endif +#endif + + x += COMPSIZE; +#endif + + y += COMPSIZE; + + offset_u --; + offset_l --; + + a += lda * COMPSIZE; + } + + return 0; +} + +#ifndef COMPLEX +int CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer, int nthreads){ +#else +int CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer, int nthreads){ +#endif + + blas_arg_t args; + blas_queue_t queue[MAX_CPU_NUMBER]; + BLASLONG range_m[MAX_CPU_NUMBER]; + BLASLONG range_n[MAX_CPU_NUMBER + 1]; + + BLASLONG width, i, num_cpu; + +#ifdef SMP +#ifndef COMPLEX +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_REAL; +#else + int mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + int mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif +#endif + + args.m = m; + args.n = n; + + args.a = (void *)a; + args.b = (void *)x; + args.c = (void *)buffer; + + args.lda = lda; + args.ldb = incx; + args.ldc = ku; + args.ldd = kl; + + num_cpu = 0; + + range_n[0] = 0; + i = n; + + while (i > 0){ + + width = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu); + + if (width < 4) width = 4; + if (i < width) width = i; + + range_n[num_cpu + 1] = range_n[num_cpu] + width; + +#ifndef TRANSA + range_m[num_cpu] = num_cpu * ((m + 15) & ~15); +#else + range_m[num_cpu] = num_cpu * ((n + 15) & ~15); +#endif + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = gbmv_kernel; + queue[num_cpu].args = &args; + queue[num_cpu].range_m = &range_m[num_cpu]; + queue[num_cpu].range_n = &range_n[num_cpu]; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i -= width; + } + + if (num_cpu) { + queue[0].sa = NULL; +#ifndef TRANSA + queue[0].sb = buffer + num_cpu * (((m + 255) & ~255) + 16) * COMPSIZE; +#else + queue[0].sb = buffer + num_cpu * (((n + 255) & ~255) + 16) * COMPSIZE; +#endif + + queue[num_cpu - 1].next = NULL; + + exec_blas(num_cpu, queue); + } + + for (i = 1; i < num_cpu; i ++) { + AXPYU_K( +#ifndef TRANSA + m, +#else + n, +#endif + 0, 0, +#ifndef COMPLEX + ONE, +#else + ONE, ZERO, +#endif + buffer + range_m[i] * COMPSIZE, 1, buffer, 1, NULL, 0); + } + + AXPYU_K( +#ifndef TRANSA + m, +#else + n, +#endif + 0, 0, +#ifndef COMPLEX + alpha, +#else + alpha[0], alpha[1], +#endif + buffer, 1, y, incy, NULL, 0); + + return 0; +} diff --git a/driver/level2/gemv_thread.c b/driver/level2/gemv_thread.c new file mode 100644 index 0000000000..5f8abf26f4 --- /dev/null +++ b/driver/level2/gemv_thread.c @@ -0,0 +1,210 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +#ifndef TRANSA +#if !defined(CONJ) && !defined(XCONJ) +#define GEMV GEMV_N +#elif defined(CONJ) && !defined(XCONJ) +#define GEMV GEMV_R +#elif !defined(CONJ) && defined(XCONJ) +#define GEMV GEMV_O +#else +#define GEMV GEMV_S +#endif +#else +#if !defined(CONJ) && !defined(XCONJ) +#define GEMV GEMV_T +#elif defined(CONJ) && !defined(XCONJ) +#define GEMV GEMV_C +#elif !defined(CONJ) && defined(XCONJ) +#define GEMV GEMV_U +#else +#define GEMV GEMV_D +#endif +#endif + +static int gemv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ + + FLOAT *a, *x, *y; + BLASLONG lda, incx, incy; + BLASLONG m_from, m_to, n_from, n_to; + + a = (FLOAT *)args -> a; + x = (FLOAT *)args -> b; + y = (FLOAT *)args -> c; + + lda = args -> lda; + incx = args -> ldb; + incy = args -> ldc; + + m_from = 0; + m_to = args -> m; + + if (range_m) { + m_from = *(range_m + 0); + m_to = *(range_m + 1); + + a += m_from * COMPSIZE; +#ifndef TRANSA + y += m_from * incy * COMPSIZE; +#endif + } + + n_from = 0; + n_to = args -> n; + + if (range_n) { + n_from = *(range_n + 0); + n_to = *(range_n + 1); + + a += n_from * lda * COMPSIZE; +#ifdef TRANSA + y += n_from * incy * COMPSIZE; +#endif + } + + // fprintf(stderr, "M_From = %d M_To = %d N_From = %d N_To = %d\n", m_from, m_to, n_from, n_to); + + GEMV(m_to - m_from, n_to - n_from, 0, + *((FLOAT *)args -> alpha + 0), +#ifdef COMPLEX + *((FLOAT *)args -> alpha + 1), +#endif + a, lda, x, incx, y, incy, buffer); + + return 0; +} + +#ifndef COMPLEX +int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer, int nthreads){ +#else +int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer, int nthreads){ +#endif + + blas_arg_t args; + blas_queue_t queue[MAX_CPU_NUMBER]; + BLASLONG range[MAX_CPU_NUMBER + 1]; + + BLASLONG width, i, num_cpu; + +#ifdef SMP +#ifndef COMPLEX +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_REAL; +#elif 
defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_REAL; +#else + int mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + int mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif +#endif + + args.m = m; + args.n = n; + + args.a = (void *)a; + args.b = (void *)x; + args.c = (void *)y; + + args.lda = lda; + args.ldb = incx; + args.ldc = incy; + +#ifndef COMPLEX + args.alpha = (void *)α +#else + args.alpha = (void *) alpha; +#endif + + num_cpu = 0; + + range[0] = 0; +#ifndef TRANSA + i = m; +#else + i = n; +#endif + + while (i > 0){ + + width = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu); + if (width < 4) width = 4; + if (i < width) width = i; + + range[num_cpu + 1] = range[num_cpu] + width; + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = gemv_kernel; + queue[num_cpu].args = &args; +#ifndef TRANSA + queue[num_cpu].range_m = &range[num_cpu]; + queue[num_cpu].range_n = NULL; +#else + queue[num_cpu].range_m = NULL; + queue[num_cpu].range_n = &range[num_cpu]; +#endif + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i -= width; + } + + if (num_cpu) { + queue[0].sa = NULL; + queue[0].sb = buffer; + queue[num_cpu - 1].next = NULL; + + exec_blas(num_cpu, queue); + } + + return 0; +} diff --git a/driver/level2/ger_thread.c b/driver/level2/ger_thread.c new file mode 100644 index 0000000000..9e2f520ef8 --- /dev/null +++ b/driver/level2/ger_thread.c @@ -0,0 +1,197 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
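gemv_thread.c, like gbmv_thread.c above and ger_thread.c below, carves the work into contiguous ranges with the same loop: the rows still unassigned are divided by the threads still unassigned, rounded up, with a floor of 4 so no thread receives a uselessly small slice. A standalone sketch of that splitting, with plain integer division standing in for the internal blas_quickdivide helper and an example row count:

#include <stdio.h>

#define MAX_CPU_NUMBER 8

int main(void) {
  long range[MAX_CPU_NUMBER + 1];
  long m = 1000;                   /* rows to distribute (example value)  */
  int  nthreads = 6, num_cpu = 0;
  long i = m, width;

  range[0] = 0;
  while (i > 0) {
    /* ceiling((remaining rows) / (remaining threads)) */
    width = (i + nthreads - num_cpu - 1) / (nthreads - num_cpu);
    if (width < 4) width = 4;      /* minimum useful chunk                */
    if (i < width) width = i;      /* last chunk takes whatever is left   */

    range[num_cpu + 1] = range[num_cpu] + width;
    num_cpu++;
    i -= width;
  }

  for (int t = 0; t < num_cpu; t++)
    printf("thread %d: rows [%ld, %ld)\n", t, range[t], range[t + 1]);
  return 0;
}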
*/ +/*********************************************************************/ + +#include +#include +#include "common.h" +#include "symcopy.h" + +#ifndef XCONJ +#define AXPY AXPYU_K +#else +#define AXPY AXPYC_K +#endif + +static int ger_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ + + FLOAT *a, *x, *y; + FLOAT alpha_r; +#ifdef COMPLEX + FLOAT alpha_i; +#endif + BLASLONG lda, incx, incy; + BLASLONG m, n_from, n_to; + BLASLONG i; + + x = (FLOAT *)args -> a; + y = (FLOAT *)args -> b; + a = (FLOAT *)args -> c; + + incx = args -> lda; + incy = args -> ldb; + lda = args -> ldc; + + m = args -> m; + + alpha_r = *((FLOAT *)args -> alpha + 0); +#ifdef COMPLEX + alpha_i = *((FLOAT *)args -> alpha + 1); +#endif + + n_from = 0; + n_to = args -> n; + + if (range_n) { + n_from = *(range_n + 0); + n_to = *(range_n + 1); + + y += n_from * incy * COMPSIZE; + a += n_from * lda * COMPSIZE; + } + + if (incx != 1) { + COPY_K(m, x, incx, buffer, 1); + x = buffer; + } + + for (i = n_from; i < n_to; i ++) { + + AXPY(m, 0, 0, +#ifndef COMPLEX + alpha_r * *y, +#else +#ifndef CONJ + alpha_r * *(y + 0) - alpha_i * *(y + 1), alpha_r * *(y + 1) + alpha_i * *(y + 0), +#else + alpha_r * *(y + 0) + alpha_i * *(y + 1), - alpha_r * *(y + 1) + alpha_i * *(y + 0), +#endif +#endif + x, 1, a, 1, NULL, 0); + + y += incy * COMPSIZE; + a += lda * COMPSIZE; + } + + return 0; +} + +#ifndef COMPLEX +int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *a, BLASLONG lda, FLOAT *buffer, int nthreads){ +#else +int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *a, BLASLONG lda, FLOAT *buffer, int nthreads){ +#endif + + blas_arg_t args; + blas_queue_t queue[MAX_CPU_NUMBER]; + BLASLONG range_n[MAX_CPU_NUMBER + 1]; + + BLASLONG width, i, num_cpu; + +#ifdef SMP +#ifndef COMPLEX +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_REAL; +#else + int mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + int mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif +#endif + + args.m = m; + args.n = n; + + args.a = (void *)x; + args.b = (void *)y; + args.c = (void *)a; + + args.lda = incx; + args.ldb = incy; + args.ldc = lda; + +#ifndef COMPLEX + args.alpha = (void *)α +#else + args.alpha = (void *) alpha; +#endif + + num_cpu = 0; + + range_n[0] = 0; + i = n; + + while (i > 0){ + + width = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu); + if (width < 4) width = 4; + if (i < width) width = i; + + range_n[num_cpu + 1] = range_n[num_cpu] + width; + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = ger_kernel; + queue[num_cpu].args = &args; + queue[num_cpu].range_n = &range_n[num_cpu]; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i -= width; + } + + if (num_cpu) { + queue[0].sa = NULL; + queue[0].sb = buffer; + + queue[num_cpu - 1].next = NULL; + + exec_blas(num_cpu, queue); + } + + return 0; +} diff --git a/driver/level2/sbmv_k.c b/driver/level2/sbmv_k.c new file mode 100644 index 0000000000..d0adc678a4 --- /dev/null +++ b/driver/level2/sbmv_k.c @@ -0,0 +1,97 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. 
*/ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +int CNAME(BLASLONG n, BLASLONG k, FLOAT alpha, + FLOAT *a, BLASLONG lda, + FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, void *buffer){ + + BLASLONG i, length; + + FLOAT *X = x; + FLOAT *Y = y; + FLOAT *sbmvbuffer = (FLOAT *)buffer; + FLOAT *bufferY = sbmvbuffer; + FLOAT *bufferX = sbmvbuffer; + + if (incy != 1) { + Y = bufferY; + bufferX = (FLOAT *)(((BLASLONG)bufferY + n * sizeof(FLOAT) + 4095) & ~4095); + sbmvbuffer = bufferX; + COPY_K(n, y, incy, Y, 1); + } + + if (incx != 1) { + X = bufferX; + sbmvbuffer = (FLOAT *)(((BLASLONG)bufferX + n * sizeof(FLOAT) + 4095) & ~4095); + COPY_K(n, x, incx, X, 1); + } + + for (i = 0; i < n; i++) { + +#ifndef LOWER + length = i; + if (length > k) length = k; + + AXPYU_K(length + 1, 0, 0, + alpha * X[i], + a + k - length, 1, Y + i - length, 1, NULL, 0); + Y[i] += alpha * DOTU_K(length, a + k - length, 1, X + i - length, 1); +#else + length = k; + if (n - i - 1 < k) length = n - i - 1; + + AXPYU_K(length + 1, 0, 0, + alpha * X[i], + a, 1, Y + i, 1, NULL, 0); + Y[i] += alpha * DOTU_K(length, a + 1, 1, X + i + 1, 1); +#endif + + a += lda; + } + + if (incy != 1) { + COPY_K(n, Y, 1, y, incy); + } + + return 0; +} + diff --git a/driver/level2/sbmv_thread.c b/driver/level2/sbmv_thread.c new file mode 100644 index 0000000000..222734d5e6 --- /dev/null +++ b/driver/level2/sbmv_thread.c @@ -0,0 +1,359 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
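In sbmv_k.c above each stored band column is applied twice: an AXPY adds alpha*x[i] times the column (diagonal included) into y, and a DOT over the strictly off-diagonal part adds the mirrored row contribution into y[i], so the diagonal is counted exactly once. A small self-contained check of that decomposition for the upper-storage case, with plain loops in place of AXPYU_K/DOTU_K and an illustrative tridiagonal example:

#include <stdio.h>

int main(void) {
  enum { N = 4, K = 1 };                     /* order and bandwidth        */
  /* Upper band storage, column-major: a[j][K + (row - col)], lda = K + 1. */
  double a[N][K + 1] = {
    {0.0, 2.0},        /* col 0: (unused), diag 2        */
    {1.0, 2.0},        /* col 1: super 1,  diag 2        */
    {1.0, 2.0},        /* col 2: super 1,  diag 2        */
    {1.0, 2.0},        /* col 3: super 1,  diag 2        */
  };
  double x[N] = {1.0, 2.0, 3.0, 4.0}, y[N] = {0.0}, alpha = 1.0;

  for (int i = 0; i < N; i++) {
    int length = i < K ? i : K;              /* rows above the diagonal    */

    /* AXPY part: y[i-length .. i] += alpha * x[i] * a[.., i]              */
    for (int j = 0; j <= length; j++)
      y[i - length + j] += alpha * x[i] * a[i][K - length + j];

    /* DOT part: y[i] += alpha * (mirrored strict upper part of row i) . x */
    for (int j = 0; j < length; j++)
      y[i] += alpha * a[i][K - length + j] * x[i - length + j];
  }

  for (int i = 0; i < N; i++) printf("y[%d] = %g\n", i, y[i]);
  return 0;                                  /* prints 4, 8, 12, 11        */
}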
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +#if !defined(HEMV) && !defined(HEMVREV) +#define MYAXPY AXPYU_K +#define MYDOT DOTU_K +#elif defined HEMV +#define MYAXPY AXPYU_K +#define MYDOT DOTC_K +#else +#define MYAXPY AXPYC_K +#define MYDOT DOTU_K +#endif + +static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ + + FLOAT *a, *x, *y; + BLASLONG lda, incx; + BLASLONG n, k, n_from, n_to; + BLASLONG i, length; +#ifndef COMPLEX + FLOAT result; +#else + FLOAT _Complex result; +#endif + + a = (FLOAT *)args -> a; + x = (FLOAT *)args -> b; + y = (FLOAT *)args -> c; + + lda = args -> lda; + incx = args -> ldb; + + n = args -> n; + k = args -> k; + + n_from = 0; + n_to = n; + + if (range_m) { + n_from = *(range_m + 0); + n_to = *(range_m + 1); + + a += n_from * lda * COMPSIZE; + } + + if (range_n) y += *range_n * COMPSIZE; + + if (incx != 1) { + COPY_K(n, x, incx, buffer, 1); + + x = buffer; + buffer += ((COMPSIZE * n + 1023) & ~1023); + } + + SCAL_K(n, 0, 0, ZERO, +#ifdef COMPLEX + ZERO, +#endif + y, 1, NULL, 0, NULL, 0); + + for (i = n_from; i < n_to; i++) { + +#ifndef LOWER + + length = i; + if (length > k) length = k; + + MYAXPY(length, 0, 0, + *(x + i * COMPSIZE + 0), +#ifdef COMPLEX + *(x + i * COMPSIZE + 1), +#endif + a + (k - length) * COMPSIZE, 1, y + (i - length) * COMPSIZE, 1, NULL, 0); + +#if !defined(HEMV) && !defined(HEMVREV) + result = MYDOT(length + 1, a + (k - length) * COMPSIZE, 1, x + (i - length) * COMPSIZE, 1); +#else + result = MYDOT(length , a + (k - length) * COMPSIZE, 1, x + (i - length) * COMPSIZE, 1); +#endif + +#ifndef COMPLEX + *(y + i * COMPSIZE + 0) += result; +#else +#if !defined(HEMV) && !defined(HEMVREV) + *(y + i * COMPSIZE + 0) += CREAL(result); + *(y 
+ i * COMPSIZE + 1) += CIMAG(result); +#else + *(y + i * COMPSIZE + 0) += CREAL(result) + *(a + k * COMPSIZE) * *(x + i * COMPSIZE + 0); + *(y + i * COMPSIZE + 1) += CIMAG(result) + *(a + k * COMPSIZE) * *(x + i * COMPSIZE + 1); +#endif +#endif + +#else + + length = k; + if (n - i - 1 < k) length = n - i - 1; + + MYAXPY(length, 0, 0, + *(x + i * COMPSIZE + 0), +#ifdef COMPLEX + *(x + i * COMPSIZE + 1), +#endif + a + COMPSIZE, 1, y + (i + 1) * COMPSIZE, 1, NULL, 0); + +#if !defined(HEMV) && !defined(HEMVREV) + result = MYDOT(length + 1, a, 1, x + i * COMPSIZE, 1); +#else + result = MYDOT(length , a + COMPSIZE, 1, x + (i + 1) * COMPSIZE, 1) ; +#endif + +#ifndef COMPLEX + *(y + i * COMPSIZE + 0) += result; +#else +#if !defined(HEMV) && !defined(HEMVREV) + *(y + i * COMPSIZE + 0) += CREAL(result); + *(y + i * COMPSIZE + 1) += CIMAG(result); +#else + *(y + i * COMPSIZE + 0) += CREAL(result) + *a * *(x + i * COMPSIZE + 0); + *(y + i * COMPSIZE + 1) += CIMAG(result) + *a * *(x + i * COMPSIZE + 1); +#endif +#endif + +#endif + + a += lda * COMPSIZE; + } + + return 0; +} + +#ifndef COMPLEX +int CNAME(BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer, int nthreads){ +#else +int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer, int nthreads){ +#endif + + blas_arg_t args; + blas_queue_t queue[MAX_CPU_NUMBER]; + BLASLONG range_m[MAX_CPU_NUMBER + 1]; + BLASLONG range_n[MAX_CPU_NUMBER]; + + BLASLONG width, i, num_cpu; + double dnum; + int mask = 7; + +#ifdef SMP +#ifndef COMPLEX +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_REAL; +#else + int mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + int mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif +#endif + + args.n = n; + args.k = k; + + args.a = (void *)a; + args.b = (void *)x; + args.c = (void *)buffer; + + args.lda = lda; + args.ldb = incx; + args.ldc = incy; + + dnum = (double)n * (double)n / (double)nthreads; + num_cpu = 0; + + if (n < 2 * k) { + +#ifndef LOWER + + range_m[MAX_CPU_NUMBER] = n; + i = 0; + + while (i < n){ + + if (nthreads - num_cpu > 1) { + + double di = (double)(n - i); + if (di * di - dnum > 0) { + width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; + } else { + width = n - i; + } + + if (width < 16) width = 16; + if (width > n - i) width = n - i; + + } else { + width = n - i; + } + + range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; + range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = sbmv_kernel; + queue[num_cpu].args = &args; + queue[num_cpu].range_m = &range_m[MAX_CPU_NUMBER - num_cpu - 1]; + queue[num_cpu].range_n = &range_n[num_cpu]; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + +#else + + range_m[0] = 0; + i = 0; + + while (i < n){ + + if (nthreads - num_cpu > 1) { + + double di = (double)(n - i); + if (di * di - dnum > 0) { + width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; + } else { + width = n - i; + } + + if (width < 16) width = 16; + if (width > n - i) width = n - i; + + } else { + width = n - i; + } + + range_m[num_cpu + 1] = range_m[num_cpu] + width; + 
range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = sbmv_kernel; + queue[num_cpu].args = &args; + queue[num_cpu].range_m = &range_m[num_cpu]; + queue[num_cpu].range_n = &range_n[num_cpu]; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + +#endif + + } else { + + range_m[0] = 0; + i = n; + + while (i > 0){ + + width = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu); + + if (width < 4) width = 4; + if (i < width) width = i; + + range_m[num_cpu + 1] = range_m[num_cpu] + width; + + range_n[num_cpu] = num_cpu * ((n + 15) & ~15); + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = sbmv_kernel; + queue[num_cpu].args = &args; + queue[num_cpu].range_m = &range_m[num_cpu]; + queue[num_cpu].range_n = &range_n[num_cpu]; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i -= width; + } + } + + if (num_cpu) { + queue[0].sa = NULL; + queue[0].sb = buffer + num_cpu * (((n + 255) & ~255) + 16) * COMPSIZE; + queue[num_cpu - 1].next = NULL; + + exec_blas(num_cpu, queue); + } + + for (i = 1; i < num_cpu; i ++) { + AXPYU_K(n, 0, 0, +#ifndef COMPLEX + ONE, +#else + ONE, ZERO, +#endif + buffer + range_n[i] * COMPSIZE, 1, buffer, 1, NULL, 0); + } + + AXPYU_K(n, 0, 0, +#ifndef COMPLEX + alpha, +#else + alpha[0], alpha[1], +#endif + buffer, 1, y, incy, NULL, 0); + + return 0; +} diff --git a/driver/level2/spmv_k.c b/driver/level2/spmv_k.c new file mode 100644 index 0000000000..07ec660953 --- /dev/null +++ b/driver/level2/spmv_k.c @@ -0,0 +1,86 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
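The sqrt() expression in sbmv_thread.c above (and again in spmv_thread.c and spr2_thread.c below) balances a triangular workload: with dnum = n*n/nthreads, solving di*di - (di - width)*(di - width) = dnum for width gives width = di - sqrt(di*di - dnum), so each chunk of rows covers roughly the same number of stored elements even though the rows or columns of the triangle have very different lengths. A standalone sketch of the resulting partition, keeping the 16-row floor and the 8-row rounding mask used by the drivers; n and nthreads are example values:

#include <math.h>
#include <stdio.h>

int main(void) {
  long n = 1000;
  int  nthreads = 4, mask = 7;
  double dnum = (double)n * (double)n / nthreads;
  long i = 0;

  for (int t = 0; i < n; t++) {
    long width;
    if (t < nthreads - 1) {
      double di = (double)(n - i);
      width = di * di > dnum
            ? (((long)(di - sqrt(di * di - dnum)) + mask) & ~mask)
            : n - i;
      if (width < 16) width = 16;
      if (width > n - i) width = n - i;
    } else {
      width = n - i;                         /* last thread takes the rest */
    }
    printf("thread %d: %ld rows, ~%ld stored elements\n",
           t, width,
           ((n - i) * (n - i) - (n - i - width) * (n - i - width)) / 2);
    i += width;
  }
  return 0;
}

For n = 1000 and 4 threads this yields chunks of 136, 160, 208 and 496 rows, each covering on the order of 125000 elements of the triangle.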
*/ +/*********************************************************************/ + +#include +#include +#include "common.h" + +int CNAME(BLASLONG m, FLOAT alpha, FLOAT *a, + FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, void *buffer){ + + BLASLONG i; + FLOAT *X = x; + FLOAT *Y = y; + FLOAT *gemvbuffer = (FLOAT *)buffer; + FLOAT *bufferY = gemvbuffer; + FLOAT *bufferX = gemvbuffer; + + if (incy != 1) { + Y = bufferY; + bufferX = (FLOAT *)(((BLASLONG)bufferY + m * sizeof(FLOAT) + 4095) & ~4095); + gemvbuffer = bufferX; + COPY_K(m, y, incy, Y, 1); + } + + if (incx != 1) { + X = bufferX; + gemvbuffer = (FLOAT *)(((BLASLONG)bufferX + m * sizeof(FLOAT) + 4095) & ~4095); + COPY_K(m, x, incx, X, 1); + } + + for (i = 0; i < m; i++) { +#ifndef LOWER + if (i > 0) Y[i] += alpha * DOTU_K(i, a, 1, X, 1); + AXPYU_K(i + 1, 0, 0, alpha * X[i], a, 1, Y, 1, NULL, 0); + a += i + 1; + +#else + Y[i] += alpha * DOTU_K(m - i, a + i, 1, X + i, 1); + if (m - i > 1) AXPYU_K(m - i - 1, 0, 0, alpha * X[i], + a + i + 1, 1, Y + i + 1, 1, NULL, 0); + a += m - i - 1; +#endif + } + + if (incy != 1) { + COPY_K(m, Y, 1, y, incy); + } + + return 0; +} + diff --git a/driver/level2/spmv_thread.c b/driver/level2/spmv_thread.c new file mode 100644 index 0000000000..7717bbf2bc --- /dev/null +++ b/driver/level2/spmv_thread.c @@ -0,0 +1,345 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#include "symcopy.h" + +#if! 
defined(HEMV) && !defined(HEMVREV) +#define MYDOT DOTU_K +#define MYAXPY AXPYU_K +#elif defined HEMV +#define MYDOT DOTC_K +#define MYAXPY AXPYU_K +#else +#define MYDOT DOTU_K +#define MYAXPY AXPYC_K +#endif + +static int spmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ + + FLOAT *a, *x, *y; + BLASLONG incx, incy; + BLASLONG m_from, m_to, i; +#ifndef COMPLEX + FLOAT result; +#else + FLOAT _Complex result; +#endif + + a = (FLOAT *)args -> a; + x = (FLOAT *)args -> b; + y = (FLOAT *)args -> c; + + incx = args -> ldb; + incy = args -> ldc; + + m_from = 0; + m_to = args -> m; + + if (range_m) { + m_from = *(range_m + 0); + m_to = *(range_m + 1); + } + + if (range_n) y += *range_n * COMPSIZE; + + if (incx != 1) { +#ifndef LOWER + COPY_K(m_to, x, incx, buffer, 1); +#else + COPY_K(args -> m - m_from, x + m_from * incx * COMPSIZE, incx, buffer + m_from * COMPSIZE, 1); +#endif + + x = buffer; + } + +#ifndef LOWER + SCAL_K(m_to, 0, 0, ZERO, +#ifdef COMPLEX + ZERO, +#endif + y, 1, NULL, 0, NULL, 0); +#else + SCAL_K(args -> m - m_from, 0, 0, ZERO, +#ifdef COMPLEX + ZERO, +#endif + y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0); +#endif + +#ifndef LOWER + a += (m_from + 1) * m_from / 2 * COMPSIZE; +#else + a += (2 * args -> m - m_from - 1) * m_from / 2 * COMPSIZE; +#endif + + for (i = m_from; i < m_to; i++) { +#ifndef LOWER + +#if !defined(HEMV) && !defined(HEMVREV) + result = MYDOT(i + 1, a, 1, x, 1); +#else + result = MYDOT(i , a, 1, x, 1); +#endif + +#ifndef COMPLEX + *(y + i * COMPSIZE) += result; +#else +#if !defined(HEMV) && !defined(HEMVREV) + *(y + i * COMPSIZE + 0) += CREAL(result); + *(y + i * COMPSIZE + 1) += CIMAG(result); +#else + *(y + i * COMPSIZE + 0) += CREAL(result) + *(a + i * COMPSIZE) * *(x + i * COMPSIZE + 0); + *(y + i * COMPSIZE + 1) += CIMAG(result) + *(a + i * COMPSIZE) * *(x + i * COMPSIZE + 1); +#endif +#endif + + MYAXPY(i, 0, 0, + *(x + i * COMPSIZE + 0), +#ifdef COMPLEX + *(x + i * COMPSIZE + 1), +#endif + a, 1, y, 1, NULL, 0); + + a += (i + 1) * COMPSIZE; + +#else +#if !defined(HEMV) && !defined(HEMVREV) + result = MYDOT(args -> m - i , a + i * COMPSIZE, 1, x + i * COMPSIZE, 1); +#else + result = MYDOT(args -> m - i - 1, a + (i + 1) * COMPSIZE, 1, x + (i + 1) * COMPSIZE, 1); +#endif + +#ifndef COMPLEX + *(y + i * COMPSIZE) += result; +#else +#if !defined(HEMV) && !defined(HEMVREV) + *(y + i * COMPSIZE + 0) += CREAL(result); + *(y + i * COMPSIZE + 1) += CIMAG(result); +#else + *(y + i * COMPSIZE + 0) += CREAL(result) + *(a + i * COMPSIZE) * *(x + i * COMPSIZE + 0); + *(y + i * COMPSIZE + 1) += CIMAG(result) + *(a + i * COMPSIZE) * *(x + i * COMPSIZE + 1); +#endif +#endif + + MYAXPY(args -> m - i - 1, 0, 0, + *(x + i * COMPSIZE + 0), +#ifdef COMPLEX + *(x + i * COMPSIZE + 1), +#endif + a + (i + 1) * COMPSIZE, 1, y + (i + 1) * COMPSIZE, 1, NULL, 0); + + a += (args -> m - i - 1) * COMPSIZE; + +#endif + } + + return 0; +} + +#ifndef COMPLEX +int CNAME(BLASLONG m, FLOAT alpha, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer, int nthreads){ +#else +int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer, int nthreads){ +#endif + + blas_arg_t args; + blas_queue_t queue[MAX_CPU_NUMBER]; + BLASLONG range_m[MAX_CPU_NUMBER + 1]; + BLASLONG range_n[MAX_CPU_NUMBER]; + + BLASLONG width, i, num_cpu; + + double dnum; + int mask = 7; + +#ifdef SMP +#ifndef COMPLEX +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + int mode 
= BLAS_DOUBLE | BLAS_REAL; +#else + int mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + int mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif +#endif + + args.m = m; + + args.a = (void *)a; + args.b = (void *)x; + args.c = (void *)buffer; + + args.ldb = incx; + args.ldc = incy; + + dnum = (double)m * (double)m / (double)nthreads; + num_cpu = 0; + +#ifndef LOWER + + range_m[MAX_CPU_NUMBER] = m; + i = 0; + + while (i < m){ + + if (nthreads - num_cpu > 1) { + + double di = (double)(m - i); + if (di * di - dnum > 0) { + width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; + } else { + width = m - i; + } + + if (width < 16) width = 16; + if (width > m - i) width = m - i; + + } else { + width = m - i; + } + + range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; + range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = spmv_kernel; + queue[num_cpu].args = &args; + queue[num_cpu].range_m = &range_m[MAX_CPU_NUMBER - num_cpu - 1]; + queue[num_cpu].range_n = &range_n[num_cpu]; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + +#else + + range_m[0] = 0; + i = 0; + + while (i < m){ + + if (nthreads - num_cpu > 1) { + + double di = (double)(m - i); + if (di * di - dnum > 0) { + width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; + } else { + width = m - i; + } + + if (width < 16) width = 16; + if (width > m - i) width = m - i; + + } else { + width = m - i; + } + + range_m[num_cpu + 1] = range_m[num_cpu] + width; + range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = spmv_kernel; + queue[num_cpu].args = &args; + queue[num_cpu].range_m = &range_m[num_cpu]; + queue[num_cpu].range_n = &range_n[num_cpu]; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + +#endif + + if (num_cpu) { + queue[0].sa = NULL; + queue[0].sb = buffer + num_cpu * (((m + 255) & ~255) + 16) * COMPSIZE; + + queue[num_cpu - 1].next = NULL; + + exec_blas(num_cpu, queue); + } + + for (i = 1; i < num_cpu; i ++) { + +#ifndef LOWER + + AXPYU_K(range_m[MAX_CPU_NUMBER - i], 0, 0, ONE, +#ifdef COMPLEX + ZERO, +#endif + buffer + range_n[i] * COMPSIZE, 1, buffer, 1, NULL, 0); + +#else + + AXPYU_K(m - range_m[i], 0, 0, ONE, +#ifdef COMPLEX + ZERO, +#endif + buffer + (range_n[i] + range_m[i]) * COMPSIZE, 1, buffer + range_m[i] * COMPSIZE, 1, NULL, 0); + +#endif + + } + + AXPYU_K(m, 0, 0, +#ifndef COMPLEX + alpha, +#else + alpha[0], alpha[1], +#endif + buffer, 1, y, incy, NULL, 0); + + return 0; +} diff --git a/driver/level2/spr2_k.c b/driver/level2/spr2_k.c new file mode 100644 index 0000000000..58e14ebe2f --- /dev/null +++ b/driver/level2/spr2_k.c @@ -0,0 +1,75 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
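spmv_k.c and spmv_thread.c above address the packed triangle with running pointer arithmetic rather than a closed-form index: in upper packed storage column j starts at offset j*(j+1)/2, while in the lower case the drivers keep the pointer at j*m - j*(j+1)/2 and read the column through a + j, which places a[j] on the diagonal. A small sketch of those offsets; helper names are illustrative and COMPSIZE is omitted:

#include <stdio.h>

static long upper_col_start(long j)          { return j * (j + 1) / 2; }
static long lower_ptr_offset(long m, long j) { return j * m - j * (j + 1) / 2; }

int main(void) {
  long m = 5;

  for (long j = 0; j < m; j++)
    printf("col %ld: upper start %ld, lower diagonal at %ld + %ld = %ld\n",
           j, upper_col_start(j),
           lower_ptr_offset(m, j), j, lower_ptr_offset(m, j) + j);
  return 0;
}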
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT *x, BLASLONG incx, + FLOAT *y, BLASLONG incy, FLOAT *a, FLOAT *buffer){ + + BLASLONG i; + FLOAT *X, *Y; + + X = x; + Y = y; + + if (incx != 1) { + COPY_K(m, x, incx, buffer, 1); + X = buffer; + } + + if (incy != 1) { + COPY_K(m, y, incy, (FLOAT *)((BLASLONG)buffer + (BUFFER_SIZE / 2)), 1); + Y = (FLOAT *)((BLASLONG)buffer + (BUFFER_SIZE / 2)); + } + + for (i = 0; i < m; i++){ +#ifndef LOWER + AXPYU_K(i + 1, 0, 0, alpha_r * X[i], Y, 1, a, 1, NULL, 0); + AXPYU_K(i + 1, 0, 0, alpha_r * Y[i], X, 1, a, 1, NULL, 0); + a += i + 1; +#else + AXPYU_K(m - i, 0, 0, alpha_r * X[i], Y + i, 1, a, 1, NULL, 0); + AXPYU_K(m - i, 0, 0, alpha_r * Y[i], X + i, 1, a, 1, NULL, 0); + a += m - i; +#endif + } + + return 0; +} diff --git a/driver/level2/spr2_thread.c b/driver/level2/spr2_thread.c new file mode 100644 index 0000000000..b20eb055ae --- /dev/null +++ b/driver/level2/spr2_thread.c @@ -0,0 +1,356 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
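spr2_k.c above realizes the packed rank-2 update A += alpha*(x*y^T + y*x^T) as two AXPY passes per stored column, one scaled by x[i] and one by y[i]. A self-contained sketch of the real upper-packed case, with plain loops in place of AXPYU_K and example data:

#include <stdio.h>

int main(void) {
  enum { M = 3 };
  double x[M] = {1.0, 2.0, 3.0}, y[M] = {4.0, 5.0, 6.0}, alpha = 1.0;
  double a[M * (M + 1) / 2] = {0.0};     /* upper packed, column-major     */
  double *col = a;

  for (int i = 0; i < M; i++) {          /* column i holds rows 0..i       */
    for (int j = 0; j <= i; j++) {
      col[j] += alpha * x[i] * y[j];     /* first AXPYU_K pass (scaled x[i]) */
      col[j] += alpha * y[i] * x[j];     /* second AXPYU_K pass (scaled y[i]) */
    }
    col += i + 1;                        /* advance to the next packed column */
  }

  for (int k = 0; k < M * (M + 1) / 2; k++) printf("a[%d] = %g\n", k, a[k]);
  return 0;
}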
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ + + FLOAT *a, *x, *y; + BLASLONG lda, incx, incy; + BLASLONG i, m_from, m_to; + FLOAT alpha_r; +#ifdef COMPLEX + FLOAT alpha_i; +#endif + + x = (FLOAT *)args -> a; + y = (FLOAT *)args -> b; + a = (FLOAT *)args -> c; + + incx = args -> lda; + incy = args -> ldb; + lda = args -> ldc; + + alpha_r = *((FLOAT *)args -> alpha + 0); +#ifdef COMPLEX + alpha_i = *((FLOAT *)args -> alpha + 1); +#endif + + m_from = 0; + m_to = args -> m; + + if (range_m) { + m_from = *(range_m + 0); + m_to = *(range_m + 1); + } + + if (incx != 1) { +#ifndef LOWER + COPY_K(m_to, x, incx, buffer, 1); +#else + COPY_K(args -> m - m_from, x + m_from * incx * COMPSIZE, incx, buffer + m_from * COMPSIZE, 1); +#endif + x = buffer; + buffer += ((COMPSIZE * args -> m + 1023) & ~1023); + } + + if (incy != 1) { +#ifndef LOWER + COPY_K(m_to, y, incy, buffer, 1); +#else + COPY_K(args -> m - m_from, y + m_from * incy * COMPSIZE, incy, buffer + m_from * COMPSIZE, 1); +#endif + y = buffer; + } + +#ifndef LOWER + a += (m_from + 1) * m_from / 2 * COMPSIZE; +#else + a += (2 * args -> m - m_from + 1) * m_from / 2 * COMPSIZE; +#endif + + for (i = m_from; i < m_to; i++){ +#if !defined(HEMV) && !defined(HEMVREV) +#ifndef COMPLEX + if (x[i] != ZERO) { +#ifndef LOWER + AXPYU_K(i + 1, 0, 0, alpha_r * x[i], y, 1, a, 1, NULL, 0); +#else + AXPYU_K(args -> m - i, 0, 0, alpha_r * x[i], y + i, 1, a, 1, NULL, 0); +#endif + } + if (y[i] != ZERO) { +#ifndef LOWER + AXPYU_K(i + 1, 0, 0, alpha_r * y[i], x, 1, a, 1, NULL, 0); +#else + AXPYU_K(args -> m - i, 0, 0, alpha_r * y[i], x + i, 1, a, 1, NULL, 0); +#endif + } +#else + if ((x[i * COMPSIZE + 0] != ZERO) || (x[i * COMPSIZE + 1] != ZERO)) { +#ifndef LOWER + AXPYU_K(i + 1, 0, 0, + alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], + alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1], + y, 1, a, 1, NULL, 0); +#else + AXPYU_K(args -> m - i, 0, 0, + alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], + alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1], + y + i * COMPSIZE, 1, a, 1, NULL, 0); +#endif + } + if ((y[i * COMPSIZE + 0] != ZERO) || (y[i * COMPSIZE + 1] != ZERO)) { +#ifndef LOWER + AXPYU_K(i + 1, 0, 0, + alpha_r * y[i * COMPSIZE + 0] - alpha_i * y[i * COMPSIZE + 1], + alpha_i * y[i * COMPSIZE + 0] + alpha_r * y[i * COMPSIZE + 1], + x, 1, a, 1, NULL, 0); +#else + AXPYU_K(args -> m - i, 0, 0, + alpha_r * y[i * COMPSIZE + 0] - alpha_i * y[i * COMPSIZE + 1], + alpha_i * y[i * 
COMPSIZE + 0] + alpha_r * y[i * COMPSIZE + 1], + x + i * COMPSIZE, 1, a, 1, NULL, 0); +#endif + } +#endif +#else + if ((x[i * COMPSIZE + 0] != ZERO) || (x[i * COMPSIZE + 1] != ZERO)) { +#ifndef HEMVREV +#ifndef LOWER + AXPYU_K(i + 1, 0, 0, + alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], + - alpha_i * x[i * COMPSIZE + 0] - alpha_r * x[i * COMPSIZE + 1], + y, 1, a, 1, NULL, 0); +#else + AXPYU_K(args -> m - i, 0, 0, + alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], + - alpha_i * x[i * COMPSIZE + 0] - alpha_r * x[i * COMPSIZE + 1], + y + i * COMPSIZE, 1, a, 1, NULL, 0); +#endif +#else +#ifndef LOWER + AXPYC_K(i + 1, 0, 0, + alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], + alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1], + y, 1, a, 1, NULL, 0); +#else + AXPYC_K(args -> m - i, 0, 0, + alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], + alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1], + y + i * COMPSIZE, 1, a, 1, NULL, 0); +#endif +#endif + } + if ((y[i * COMPSIZE + 0] != ZERO) || (y[i * COMPSIZE + 1] != ZERO)) { +#ifndef HEMVREV +#ifndef LOWER + AXPYU_K(i + 1, 0, 0, + alpha_r * y[i * COMPSIZE + 0] + alpha_i * y[i * COMPSIZE + 1], + alpha_i * y[i * COMPSIZE + 0] - alpha_r * y[i * COMPSIZE + 1], + x, 1, a, 1, NULL, 0); +#else + AXPYU_K(args -> m - i, 0, 0, + alpha_r * y[i * COMPSIZE + 0] + alpha_i * y[i * COMPSIZE + 1], + alpha_i * y[i * COMPSIZE + 0] - alpha_r * y[i * COMPSIZE + 1], + x + i * COMPSIZE, 1, a, 1, NULL, 0); +#endif +#else +#ifndef LOWER + AXPYC_K(i + 1, 0, 0, + alpha_r * y[i * COMPSIZE + 0] + alpha_i * y[i * COMPSIZE + 1], + - alpha_i * y[i * COMPSIZE + 0] + alpha_r * y[i * COMPSIZE + 1], + x, 1, a, 1, NULL, 0); +#else + AXPYC_K(args -> m - i, 0, 0, + alpha_r * y[i * COMPSIZE + 0] + alpha_i * y[i * COMPSIZE + 1], + - alpha_i * y[i * COMPSIZE + 0] + alpha_r * y[i * COMPSIZE + 1], + x + i * COMPSIZE, 1, a, 1, NULL, 0); +#endif +#endif + } +#ifndef LOWER + a[i * COMPSIZE + 1] = ZERO; +#else + a[ 1] = ZERO; +#endif +#endif + +#ifndef LOWER + a += (i + 1) * COMPSIZE; +#else + a += (args -> m - i) * COMPSIZE; +#endif + } + + return 0; +} + +#ifndef COMPLEX +int CNAME(BLASLONG m, FLOAT alpha, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *a, FLOAT *buffer, int nthreads){ +#else +int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *a, FLOAT *buffer, int nthreads){ +#endif + + blas_arg_t args; + blas_queue_t queue[MAX_CPU_NUMBER]; + BLASLONG range_m[MAX_CPU_NUMBER + 1]; + + BLASLONG width, i, num_cpu; + + double dnum; + int mask = 7; + +#ifdef SMP +#ifndef COMPLEX +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_REAL; +#else + int mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + int mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif +#endif + + args.m = m; + + args.a = (void *)x; + args.b = (void *)y; + args.c = (void *)a; + + args.lda = incx; + args.ldb = incy; +#ifndef COMPLEX + args.alpha = (void *)α +#else + args.alpha = (void *)alpha; +#endif + + dnum = (double)m * (double)m / (double)nthreads; + num_cpu = 0; + +#ifndef LOWER + + range_m[MAX_CPU_NUMBER] = m; + i = 0; + + while (i < m){ + + if (nthreads - num_cpu > 1) { + + double di = (double)(m - i); + if (di * di - dnum > 0) { + width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; + } else { + width 
= m - i; + } + + if (width < 16) width = 16; + if (width > m - i) width = m - i; + + } else { + width = m - i; + } + + range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = syr_kernel; + queue[num_cpu].args = &args; + queue[num_cpu].range_m = &range_m[MAX_CPU_NUMBER - num_cpu - 1]; + queue[num_cpu].range_n = NULL; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + +#else + + range_m[0] = 0; + i = 0; + + while (i < m){ + + if (nthreads - num_cpu > 1) { + + double di = (double)(m - i); + if (di * di - dnum > 0) { + width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; + } else { + width = m - i; + } + + if (width < 16) width = 16; + if (width > m - i) width = m - i; + + } else { + width = m - i; + } + + range_m[num_cpu + 1] = range_m[num_cpu] + width; + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = syr_kernel; + queue[num_cpu].args = &args; + queue[num_cpu].range_m = &range_m[num_cpu]; + queue[num_cpu].range_n = NULL; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + +#endif + + if (num_cpu) { + queue[0].sa = NULL; + queue[0].sb = buffer; + + queue[num_cpu - 1].next = NULL; + + exec_blas(num_cpu, queue); + } + + return 0; +} diff --git a/driver/level2/spr_k.c b/driver/level2/spr_k.c new file mode 100644 index 0000000000..996d9257e9 --- /dev/null +++ b/driver/level2/spr_k.c @@ -0,0 +1,69 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
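
Throughout these threaded drivers the row range handed to each worker comes from the same closed-form rule: give every thread an equal share of the triangle's area (and therefore roughly equal flops), round the width up to a multiple of eight and never go below 16 rows (some of the later drivers use a 4-row granularity instead). A standalone sketch of that rule, with illustrative names and demo sizes (equal_area_split, the values in main), not part of the patch itself:

#include <math.h>
#include <stdio.h>

/* Fill split[0..count] with row boundaries so that every band covers
   roughly the same share of an m x m triangle (equal flop count).
   split must have at least nthreads + 1 slots. */
static int equal_area_split(long m, int nthreads, long *split) {
  double dnum = (double)m * (double)m / (double)nthreads;  /* twice the per-thread area */
  long mask = 7;
  long i = 0;
  int count = 0;

  split[0] = 0;
  while (i < m) {
    long width;
    double di = (double)(m - i);

    if (count < nthreads - 1 && di * di - dnum > 0) {
      width = ((long)(di - sqrt(di * di - dnum)) + mask) & ~mask;  /* round up to 8 */
    } else {
      width = m - i;                        /* last worker takes the rest */
    }
    if (width < 16) width = 16;
    if (width > m - i) width = m - i;

    i += width;
    split[++count] = i;
  }
  return count;                             /* number of bands actually used */
}

int main(void) {
  long split[64];
  int n = equal_area_split(1000, 4, split);
  for (int t = 0; t < n; t++)
    printf("band %d: rows [%ld, %ld)\n", t, split[t], split[t + 1]);
  return 0;
}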
*/ +/*********************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG m, FLOAT alpha_r, + FLOAT *x, BLASLONG incx, FLOAT *a, FLOAT *buffer){ + + BLASLONG i; + FLOAT *X; + + X = x; + + if (incx != 1) { + COPY_K(m, x, incx, buffer, 1); + X = buffer; + } + + for (i = 0; i < m; i++){ +#ifndef LOWER + if (X[i] != ZERO) { + AXPYU_K(i + 1, 0, 0, alpha_r * X[i], X, 1, a, 1, NULL, 0); + } + a += i + 1; +#else + if (X[i] != ZERO) { + AXPYU_K(m - i, 0, 0, alpha_r * X[i], X + i, 1, a, 1, NULL, 0); + } + a += m - i; +#endif + } + + return 0; +} diff --git a/driver/level2/spr_thread.c b/driver/level2/spr_thread.c new file mode 100644 index 0000000000..f889506989 --- /dev/null +++ b/driver/level2/spr_thread.c @@ -0,0 +1,291 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
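
spr_k.c above is the packed variant of the rank-1 update: column i of the upper triangle holds exactly i + 1 entries, so the pointer simply advances by i + 1 after each column. A plain-loop reference of the same computation (the AXPYU_K call written out, real arithmetic only), for comparison:

/* A := A + alpha * x * x^T with A stored as a packed upper triangle:
   column i keeps its i + 1 entries contiguously. */
static void spr_upper_ref(long m, double alpha, const double *x, double *ap) {
  for (long i = 0; i < m; i++) {
    if (x[i] != 0.0) {
      double t = alpha * x[i];
      for (long j = 0; j <= i; j++)       /* rows 0..i of packed column i */
        ap[j] += t * x[j];
    }
    ap += i + 1;                          /* next packed column starts here */
  }
}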
*/ +/*********************************************************************/ + +#include +#include +#include "common.h" + +static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ + + FLOAT *a, *x; + BLASLONG incx; + BLASLONG i, m_from, m_to; + FLOAT alpha_r; +#if defined(COMPLEX) && !defined(HER) && !defined(HERREV) + FLOAT alpha_i; +#endif + + x = (FLOAT *)args -> a; + a = (FLOAT *)args -> b; + + incx = args -> lda; + + alpha_r = *((FLOAT *)args -> alpha + 0); +#if defined(COMPLEX) && !defined(HER) && !defined(HERREV) + alpha_i = *((FLOAT *)args -> alpha + 1); +#endif + + m_from = 0; + m_to = args -> m; + + if (range_m) { + m_from = *(range_m + 0); + m_to = *(range_m + 1); + } + + if (incx != 1) { +#ifndef LOWER + COPY_K(m_to, x, incx, buffer, 1); +#else + COPY_K(args -> m - m_from, x + m_from * incx * COMPSIZE, incx, buffer + m_from * COMPSIZE, 1); +#endif + x = buffer; + } + +#ifndef LOWER + a += (m_from + 1) * m_from / 2 * COMPSIZE; +#else + a += (2 * args -> m - m_from + 1) * m_from / 2 * COMPSIZE; +#endif + + for (i = m_from; i < m_to; i++){ +#if !defined(HEMV) && !defined(HEMVREV) +#ifndef COMPLEX + if (x[i] != ZERO) { +#ifndef LOWER + AXPYU_K(i + 1, 0, 0, alpha_r * x[i], x, 1, a, 1, NULL, 0); +#else + AXPYU_K(args -> m - i, 0, 0, alpha_r * x[i], x + i, 1, a, 1, NULL, 0); +#endif + } +#else + if ((x[i * COMPSIZE + 0] != ZERO) || (x[i * COMPSIZE + 1] != ZERO)) { +#ifndef LOWER + AXPYU_K(i + 1, 0, 0, + alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], + alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1], + x, 1, a, 1, NULL, 0); +#else + AXPYU_K(args -> m - i, 0, 0, + alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], + alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1], + x + i * COMPSIZE, 1, a, 1, NULL, 0); +#endif + } +#endif +#else + if ((x[i * COMPSIZE + 0] != ZERO) || (x[i * COMPSIZE + 1] != ZERO)) { +#ifndef HEMVREV +#ifndef LOWER + AXPYU_K(i + 1, 0, 0, + alpha_r * x[i * COMPSIZE + 0], - alpha_r * x[i * COMPSIZE + 1], + x, 1, a, 1, NULL, 0); +#else + AXPYU_K(args -> m - i, 0, 0, + alpha_r * x[i * COMPSIZE + 0], - alpha_r * x[i * COMPSIZE + 1], + x + i * COMPSIZE, 1, a, 1, NULL, 0); +#endif +#else +#ifndef LOWER + AXPYC_K(i + 1, 0, 0, + alpha_r * x[i * COMPSIZE + 0], alpha_r * x[i * COMPSIZE + 1], + x, 1, a, 1, NULL, 0); +#else + AXPYC_K(args -> m - i, 0, 0, + alpha_r * x[i * COMPSIZE + 0], alpha_r * x[i * COMPSIZE + 1], + x + i * COMPSIZE, 1, a, 1, NULL, 0); +#endif +#endif + } +#ifndef LOWER + a[i * COMPSIZE + 1] = ZERO; +#else + a[ 1] = ZERO; +#endif +#endif + +#ifndef LOWER + a += (i + 1) * COMPSIZE; +#else + a += (args -> m - i) * COMPSIZE; +#endif + } + + return 0; +} + +#if !defined(COMPLEX) || defined(HEMV) || defined(HEMVREV) +int CNAME(BLASLONG m, FLOAT alpha, FLOAT *x, BLASLONG incx, FLOAT *a, FLOAT *buffer, int nthreads){ +#else +int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *a, FLOAT *buffer, int nthreads){ +#endif + + blas_arg_t args; + blas_queue_t queue[MAX_CPU_NUMBER]; + BLASLONG range_m[MAX_CPU_NUMBER + 1]; + + BLASLONG width, i, num_cpu; + + double dnum; + int mask = 7; + +#ifdef SMP +#ifndef COMPLEX +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_REAL; +#else + int mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + int mode = BLAS_SINGLE | 
BLAS_COMPLEX; +#endif +#endif +#endif + + args.m = m; + + args.a = (void *)x; + args.b = (void *)a; + + args.lda = incx; + +#if !defined(COMPLEX) || defined(HEMV) || defined(HEMVREV) + args.alpha = (void *)α +#else + args.alpha = (void *)alpha; +#endif + + dnum = (double)m * (double)m / (double)nthreads; + num_cpu = 0; + +#ifndef LOWER + + range_m[MAX_CPU_NUMBER] = m; + i = 0; + + while (i < m){ + + if (nthreads - num_cpu > 1) { + + double di = (double)(m - i); + if (di * di - dnum > 0) { + width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; + } else { + width = m - i; + } + + if (width < 16) width = 16; + if (width > m - i) width = m - i; + + } else { + width = m - i; + } + + range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = syr_kernel; + queue[num_cpu].args = &args; + queue[num_cpu].range_m = &range_m[MAX_CPU_NUMBER - num_cpu - 1]; + queue[num_cpu].range_n = NULL; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + +#else + + range_m[0] = 0; + i = 0; + + while (i < m){ + + if (nthreads - num_cpu > 1) { + + double di = (double)(m - i); + if (di * di - dnum > 0) { + width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; + } else { + width = m - i; + } + + if (width < 16) width = 16; + if (width > m - i) width = m - i; + + } else { + width = m - i; + } + + range_m[num_cpu + 1] = range_m[num_cpu] + width; + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = syr_kernel; + queue[num_cpu].args = &args; + queue[num_cpu].range_m = &range_m[num_cpu]; + queue[num_cpu].range_n = NULL; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + +#endif + + if (num_cpu) { + queue[0].sa = NULL; + queue[0].sb = buffer; + + queue[num_cpu - 1].next = NULL; + + exec_blas(num_cpu, queue); + } + + return 0; +} diff --git a/driver/level2/symv_thread.c b/driver/level2/symv_thread.c new file mode 100644 index 0000000000..cf0e2d0c05 --- /dev/null +++ b/driver/level2/symv_thread.c @@ -0,0 +1,295 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
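
The threaded packed kernel above does not walk to its starting column; it jumps there with a closed-form element count: m_from(m_from + 1)/2 packed entries precede column m_from in the upper triangle, and m_from(2m - m_from + 1)/2 in the lower one. A small self-check of those two formulas (illustrative only):

#include <assert.h>

/* Elements stored before column m_from in a packed triangle of order m. */
static long packed_offset_upper(long m_from) {
  return (m_from + 1) * m_from / 2;            /* columns 0..m_from-1 hold 1, 2, ..., m_from entries */
}
static long packed_offset_lower(long m, long m_from) {
  return (2 * m - m_from + 1) * m_from / 2;    /* columns hold m, m-1, ..., m-m_from+1 entries */
}

int main(void) {
  long m = 37;
  long up = 0, lo = 0;
  for (long j = 0; j < m; j++) {
    assert(up == packed_offset_upper(j));
    assert(lo == packed_offset_lower(m, j));
    up += j + 1;                               /* upper column j has j + 1 entries */
    lo += m - j;                               /* lower column j has m - j entries */
  }
  return 0;
}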
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#include "symcopy.h" + +#if! defined(HEMV) && !defined(HEMVREV) +#define MYSYMV_U SYMV_U +#define MYSYMV_L SYMV_L +#elif defined HEMV +#define MYSYMV_U HEMV_U +#define MYSYMV_L HEMV_L +#else +#define MYSYMV_U HEMV_V +#define MYSYMV_L HEMV_M +#endif + +static int symv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ + + FLOAT *a, *x, *y; + BLASLONG lda, incx, incy; + BLASLONG m_from, m_to; + + a = (FLOAT *)args -> a; + x = (FLOAT *)args -> b; + y = (FLOAT *)args -> c; + + lda = args -> lda; + incx = args -> ldb; + incy = args -> ldc; + + m_from = 0; + m_to = args -> m; + + if (range_m) { + m_from = *(range_m + 0); + m_to = *(range_m + 1); + } + + if (range_n) y += *range_n * COMPSIZE; + +#ifndef LOWER + + SCAL_K(m_to, 0, 0, ZERO, +#ifdef COMPLEX + ZERO, +#endif + y, 1, NULL, 0, NULL, 0); + + MYSYMV_U (m_to, m_to - m_from, ONE, +#ifdef COMPLEX + ZERO, +#endif + a, lda, x, incx, y, 1, buffer); + +#else + + SCAL_K(args -> m - m_from, 0, 0, ZERO, +#ifdef COMPLEX + ZERO, +#endif + y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0); + + MYSYMV_L (args -> m - m_from, m_to - m_from, ONE, +#ifdef COMPLEX + ZERO, +#endif + a + m_from * (lda + 1) * COMPSIZE, lda, x + m_from * incx * COMPSIZE, incx, y + m_from * COMPSIZE, 1, buffer); +#endif + + return 0; +} + +#ifndef COMPLEX +int CNAME(BLASLONG m, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer, int nthreads){ +#else +int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer, int nthreads){ +#endif + + blas_arg_t args; + blas_queue_t queue[MAX_CPU_NUMBER]; + BLASLONG range_m[MAX_CPU_NUMBER + 1]; + BLASLONG range_n[MAX_CPU_NUMBER]; + + BLASLONG width, i, num_cpu; + + double dnum; + int mask = 3; + +#ifdef SMP +#ifndef COMPLEX +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_REAL; +#else + int mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + int mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif +#endif + + args.m = m; + + args.a = (void *)a; + args.b = (void *)x; + args.c = (void *)buffer; + + args.lda = lda; + args.ldb = incx; + args.ldc = incy; + + dnum = (double)m * (double)m / (double)nthreads; + num_cpu = 0; + +#ifndef LOWER + + range_m[0] = 0; + i = 0; + + while (i < m){ + + if (nthreads - num_cpu > 1) { + + double di = (double)i; + 
width = ((BLASLONG)(sqrt(di * di + dnum) - di) + mask) & ~mask; + + if (width < 4) width = 4; + if (width > m - i) width = m - i; + + } else { + width = m - i; + } + + range_m[num_cpu + 1] = range_m[num_cpu] + width; + range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); + + queue[MAX_CPU_NUMBER - num_cpu - 1].mode = mode; + queue[MAX_CPU_NUMBER - num_cpu - 1].routine = symv_kernel; + queue[MAX_CPU_NUMBER - num_cpu - 1].args = &args; + queue[MAX_CPU_NUMBER - num_cpu - 1].range_m = &range_m[num_cpu]; + queue[MAX_CPU_NUMBER - num_cpu - 1].range_n = &range_n[num_cpu]; + queue[MAX_CPU_NUMBER - num_cpu - 1].sa = NULL; + queue[MAX_CPU_NUMBER - num_cpu - 1].sb = NULL; + queue[MAX_CPU_NUMBER - num_cpu - 1].next = &queue[MAX_CPU_NUMBER - num_cpu]; + + num_cpu ++; + i += width; + } + + if (num_cpu) { + queue[MAX_CPU_NUMBER - num_cpu].sa = NULL; + queue[MAX_CPU_NUMBER - num_cpu].sb = buffer + num_cpu * (((m + 255) & ~255) + 16) * COMPSIZE; + + queue[MAX_CPU_NUMBER - 1].next = NULL; + + exec_blas(num_cpu, &queue[MAX_CPU_NUMBER - num_cpu]); + } + +#else + + range_m[0] = 0; + i = 0; + + while (i < m){ + + if (nthreads - num_cpu > 1) { + + double di = (double)(m - i); + if (di * di - dnum > 0) { + width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; + } else { + width = m - i; + } + + if (width < 4) width = 4; + if (width > m - i) width = m - i; + + } else { + width = m - i; + } + + range_m[num_cpu + 1] = range_m[num_cpu] + width; + range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = symv_kernel; + queue[num_cpu].args = &args; + queue[num_cpu].range_m = &range_m[num_cpu]; + queue[num_cpu].range_n = &range_n[num_cpu]; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + + if (num_cpu) { + queue[0].sa = NULL; + queue[0].sb = buffer + num_cpu * (((m + 255) & ~255) + 16) * COMPSIZE; + + queue[num_cpu - 1].next = NULL; + + exec_blas(num_cpu, queue); + } + +#endif + +#ifndef LOWER + + for (i = 0; i < num_cpu - 1; i ++) { + + AXPYU_K(range_m[i + 1], 0, 0, ONE, +#ifdef COMPLEX + ZERO, +#endif + buffer + range_n[i] * COMPSIZE, 1, buffer + range_n[num_cpu - 1] * COMPSIZE, 1, NULL, 0); + } + + AXPYU_K(m, 0, 0, +#ifndef COMPLEX + alpha, +#else + alpha[0], alpha[1], +#endif + buffer + range_n[num_cpu - 1] * COMPSIZE, 1, y, incy, NULL, 0); + +#else + + for (i = 1; i < num_cpu; i ++) { + + AXPYU_K(m - range_m[i], 0, 0, ONE, +#ifdef COMPLEX + ZERO, +#endif + buffer + (range_n[i] + range_m[i]) * COMPSIZE, 1, buffer + range_m[i] * COMPSIZE, 1, NULL, 0); + } + + AXPYU_K(m, 0, 0, +#ifndef COMPLEX + alpha, +#else + alpha[0], alpha[1], +#endif + buffer, 1, y, incy, NULL, 0); + +#endif + + return 0; +} diff --git a/driver/level2/syr2_k.c b/driver/level2/syr2_k.c new file mode 100644 index 0000000000..bca8b3bca0 --- /dev/null +++ b/driver/level2/syr2_k.c @@ -0,0 +1,75 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT *x, BLASLONG incx, + FLOAT *y, BLASLONG incy, FLOAT *a, BLASLONG lda, FLOAT *buffer){ + + BLASLONG i; + FLOAT *X, *Y; + + X = x; + Y = y; + + if (incx != 1) { + COPY_K(m, x, incx, buffer, 1); + X = buffer; + } + + if (incy != 1) { + COPY_K(m, y, incy, (FLOAT *)((BLASLONG)buffer + (BUFFER_SIZE / 2)), 1); + Y = (FLOAT *)((BLASLONG)buffer + (BUFFER_SIZE / 2)); + } + + for (i = 0; i < m; i++){ +#ifndef LOWER + AXPYU_K(i + 1, 0, 0, alpha_r * X[i], Y, 1, a, 1, NULL, 0); + AXPYU_K(i + 1, 0, 0, alpha_r * Y[i], X, 1, a, 1, NULL, 0); + a += lda; +#else + AXPYU_K(m - i, 0, 0, alpha_r * X[i], Y + i, 1, a, 1, NULL, 0); + AXPYU_K(m - i, 0, 0, alpha_r * Y[i], X + i, 1, a, 1, NULL, 0); + a += 1 + lda; +#endif + } + + return 0; +} diff --git a/driver/level2/syr2_thread.c b/driver/level2/syr2_thread.c new file mode 100644 index 0000000000..130a62d3e2 --- /dev/null +++ b/driver/level2/syr2_thread.c @@ -0,0 +1,345 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
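
symv_thread.c above avoids races on y by giving each worker a private slice of the scratch buffer (offset t * (((m + 15) & ~15) + 16)) and merging only after exec_blas() returns: the partial vectors are summed with unit-alpha AXPY calls, and the final AXPY applies alpha as it accumulates into y. The merge step, sketched serially with made-up names:

/* Combine per-worker partial results into y: y += alpha * sum of partials.
   scratch holds nworkers partial vectors of length m, spaced by stride. */
static void reduce_partials(long m, int nworkers, double alpha,
                            const double *scratch, long stride,
                            double *y, long incy) {
  for (long i = 0; i < m; i++) {
    double sum = 0.0;
    for (int t = 0; t < nworkers; t++)    /* one private partial per worker */
      sum += scratch[(long)t * stride + i];
    y[i * incy] += alpha * sum;           /* the final alpha * AXPY into y */
  }
}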
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ + + FLOAT *a, *x, *y; + BLASLONG lda, incx, incy; + BLASLONG i, m_from, m_to; + FLOAT alpha_r; +#ifdef COMPLEX + FLOAT alpha_i; +#endif + + x = (FLOAT *)args -> a; + y = (FLOAT *)args -> b; + a = (FLOAT *)args -> c; + + incx = args -> lda; + incy = args -> ldb; + lda = args -> ldc; + + alpha_r = *((FLOAT *)args -> alpha + 0); +#ifdef COMPLEX + alpha_i = *((FLOAT *)args -> alpha + 1); +#endif + + m_from = 0; + m_to = args -> m; + + if (range_m) { + m_from = *(range_m + 0); + m_to = *(range_m + 1); + } + + if (incx != 1) { +#ifndef LOWER + COPY_K(m_to, x, incx, buffer, 1); +#else + COPY_K(args -> m - m_from, x + m_from * incx * COMPSIZE, incx, buffer + m_from * COMPSIZE, 1); +#endif + x = buffer; + buffer += ((COMPSIZE * args -> m + 1023) & ~1023); + } + + if (incy != 1) { +#ifndef LOWER + COPY_K(m_to, y, incy, buffer, 1); +#else + COPY_K(args -> m - m_from, y + m_from * incy * COMPSIZE, incy, buffer + m_from * COMPSIZE, 1); +#endif + y = buffer; + } + + a += m_from * lda * COMPSIZE; + + for (i = m_from; i < m_to; i++){ +#if !defined(HER) && !defined(HERREV) +#ifndef COMPLEX + if (x[i] != ZERO) { +#ifndef LOWER + AXPYU_K(i + 1, 0, 0, alpha_r * x[i], y, 1, a, 1, NULL, 0); +#else + AXPYU_K(args -> m - i, 0, 0, alpha_r * x[i], y + i, 1, a + i, 1, NULL, 0); +#endif + } + if (y[i] != ZERO) { +#ifndef LOWER + AXPYU_K(i + 1, 0, 0, alpha_r * y[i], x, 1, a, 1, NULL, 0); +#else + AXPYU_K(args -> m - i, 0, 0, alpha_r * y[i], x + i, 1, a + i, 1, NULL, 0); +#endif + } +#else + if ((x[i * COMPSIZE + 0] != ZERO) || (x[i * COMPSIZE + 1] != ZERO)) { +#ifndef LOWER + AXPYU_K(i + 1, 0, 0, + alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], + alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1], + y, 1, a, 1, NULL, 0); +#else + AXPYU_K(args -> m - i, 0, 0, + alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], + alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1], + y + i * COMPSIZE, 1, a + i * COMPSIZE, 1, NULL, 0); +#endif + } + if ((y[i * COMPSIZE + 0] != ZERO) || (y[i * COMPSIZE + 1] != ZERO)) { +#ifndef LOWER + AXPYU_K(i + 1, 0, 0, + alpha_r * y[i * COMPSIZE + 0] - alpha_i * y[i * COMPSIZE + 1], + alpha_i * y[i * COMPSIZE + 0] + alpha_r * y[i * COMPSIZE + 1], + x, 1, a, 1, NULL, 0); +#else + AXPYU_K(args -> m - i, 0, 0, + alpha_r * y[i * COMPSIZE + 0] - alpha_i * y[i * COMPSIZE + 1], + alpha_i * y[i * COMPSIZE + 0] + alpha_r * y[i * COMPSIZE + 1], + x + i * COMPSIZE, 1, a + i * COMPSIZE, 
1, NULL, 0); +#endif + } +#endif +#else + if ((x[i * COMPSIZE + 0] != ZERO) || (x[i * COMPSIZE + 1] != ZERO)) { +#ifndef HERREV +#ifndef LOWER + AXPYU_K(i + 1, 0, 0, + alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], + - alpha_i * x[i * COMPSIZE + 0] - alpha_r * x[i * COMPSIZE + 1], + y, 1, a, 1, NULL, 0); +#else + AXPYU_K(args -> m - i, 0, 0, + alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], + - alpha_i * x[i * COMPSIZE + 0] - alpha_r * x[i * COMPSIZE + 1], + y + i * COMPSIZE, 1, a + i * COMPSIZE, 1, NULL, 0); +#endif +#else +#ifndef LOWER + AXPYC_K(i + 1, 0, 0, + alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], + alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1], + y, 1, a, 1, NULL, 0); +#else + AXPYC_K(args -> m - i, 0, 0, + alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], + alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1], + y + i * COMPSIZE, 1, a + i * COMPSIZE, 1, NULL, 0); +#endif +#endif + } + if ((y[i * COMPSIZE + 0] != ZERO) || (y[i * COMPSIZE + 1] != ZERO)) { +#ifndef HERREV +#ifndef LOWER + AXPYU_K(i + 1, 0, 0, + alpha_r * y[i * COMPSIZE + 0] + alpha_i * y[i * COMPSIZE + 1], + alpha_i * y[i * COMPSIZE + 0] - alpha_r * y[i * COMPSIZE + 1], + x, 1, a, 1, NULL, 0); +#else + AXPYU_K(args -> m - i, 0, 0, + alpha_r * y[i * COMPSIZE + 0] + alpha_i * y[i * COMPSIZE + 1], + alpha_i * y[i * COMPSIZE + 0] - alpha_r * y[i * COMPSIZE + 1], + x + i * COMPSIZE, 1, a + i * COMPSIZE, 1, NULL, 0); +#endif +#else +#ifndef LOWER + AXPYC_K(i + 1, 0, 0, + alpha_r * y[i * COMPSIZE + 0] + alpha_i * y[i * COMPSIZE + 1], + - alpha_i * y[i * COMPSIZE + 0] + alpha_r * y[i * COMPSIZE + 1], + x, 1, a, 1, NULL, 0); +#else + AXPYC_K(args -> m - i, 0, 0, + alpha_r * y[i * COMPSIZE + 0] + alpha_i * y[i * COMPSIZE + 1], + - alpha_i * y[i * COMPSIZE + 0] + alpha_r * y[i * COMPSIZE + 1], + x + i * COMPSIZE, 1, a + i * COMPSIZE, 1, NULL, 0); +#endif +#endif + } + a[i * COMPSIZE + 1] = ZERO; +#endif + a += lda * COMPSIZE; + + } + + return 0; +} + +#ifndef COMPLEX +int CNAME(BLASLONG m, FLOAT alpha, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *a, BLASLONG lda, FLOAT *buffer, int nthreads){ +#else +int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *a, BLASLONG lda, FLOAT *buffer, int nthreads){ +#endif + + blas_arg_t args; + blas_queue_t queue[MAX_CPU_NUMBER]; + BLASLONG range_m[MAX_CPU_NUMBER + 1]; + + BLASLONG width, i, num_cpu; + + double dnum; + int mask = 7; + +#ifdef SMP +#ifndef COMPLEX +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_REAL; +#else + int mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + int mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif +#endif + + args.m = m; + + args.a = (void *)x; + args.b = (void *)y; + args.c = (void *)a; + + args.lda = incx; + args.ldb = incy; + args.ldc = lda; +#ifndef COMPLEX + args.alpha = (void *)α +#else + args.alpha = (void *)alpha; +#endif + + dnum = (double)m * (double)m / (double)nthreads; + num_cpu = 0; + +#ifndef LOWER + + range_m[MAX_CPU_NUMBER] = m; + i = 0; + + while (i < m){ + + if (nthreads - num_cpu > 1) { + + double di = (double)(m - i); + if (di * di - dnum > 0) { + width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; + } else { + width = m - i; + } + + if (width < 16) width = 16; + if (width > m - i) width = m - i; + + 
} else { + width = m - i; + } + + range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = syr_kernel; + queue[num_cpu].args = &args; + queue[num_cpu].range_m = &range_m[MAX_CPU_NUMBER - num_cpu - 1]; + queue[num_cpu].range_n = NULL; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + +#else + + range_m[0] = 0; + i = 0; + + while (i < m){ + + if (nthreads - num_cpu > 1) { + + double di = (double)(m - i); + if (di * di - dnum > 0) { + width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; + } else { + width = m - i; + } + + if (width < 16) width = 16; + if (width > m - i) width = m - i; + + } else { + width = m - i; + } + + range_m[num_cpu + 1] = range_m[num_cpu] + width; + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = syr_kernel; + queue[num_cpu].args = &args; + queue[num_cpu].range_m = &range_m[num_cpu]; + queue[num_cpu].range_n = NULL; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + +#endif + + if (num_cpu) { + queue[0].sa = NULL; + queue[0].sb = buffer; + + queue[num_cpu - 1].next = NULL; + + exec_blas(num_cpu, queue); + } + + return 0; +} diff --git a/driver/level2/syr_k.c b/driver/level2/syr_k.c new file mode 100644 index 0000000000..a0d9a2fa0d --- /dev/null +++ b/driver/level2/syr_k.c @@ -0,0 +1,69 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
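
Every CNAME driver in these files dispatches work the same way: one blas_queue_t per worker, linked through .next, the shared scratch attached to the first entry, the last .next set to NULL, then a single exec_blas(num_cpu, queue) call. The mock below only mirrors that wiring; mock_queue_t, run_chain and demo_kernel are stand-ins, not the real types from common.h:

#include <stdio.h>

typedef struct mock_queue {
  int (*routine)(void *args, long *range_m);   /* kernel to run */
  void *args;                                  /* shared argument block in the real code */
  long *range_m;                               /* [from, to) rows for this worker */
  struct mock_queue *next;                     /* NULL terminates the chain */
} mock_queue_t;

static int run_chain(mock_queue_t *q) {        /* serial stand-in for exec_blas */
  int n = 0;
  for (; q; q = q->next, n++) q->routine(q->args, q->range_m);
  return n;
}

static int demo_kernel(void *args, long *range_m) {
  (void)args;
  printf("rows [%ld, %ld)\n", range_m[0], range_m[1]);
  return 0;
}

int main(void) {
  long range[3] = {0, 8, 16};
  mock_queue_t q[2] = {
    { demo_kernel, NULL, &range[0], &q[1] },   /* q[t].next = &q[t + 1] */
    { demo_kernel, NULL, &range[1], NULL  },   /* last ->next = NULL    */
  };
  return run_chain(q) == 2 ? 0 : 1;
}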
*/ +/*********************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG m, FLOAT alpha_r, + FLOAT *x, BLASLONG incx, FLOAT *a, BLASLONG lda, FLOAT *buffer){ + + BLASLONG i; + FLOAT *X; + + X = x; + + if (incx != 1) { + COPY_K(m, x, incx, buffer, 1); + X = buffer; + } + + for (i = 0; i < m; i++){ +#ifndef LOWER + if (X[i] != ZERO) { + AXPYU_K(i + 1, 0, 0, alpha_r * X[i], X, 1, a, 1, NULL, 0); + } + a += lda; +#else + if (X[i] != ZERO) { + AXPYU_K(m - i, 0, 0, alpha_r * X[i], X + i, 1, a, 1, NULL, 0); + } + a += 1 + lda; +#endif + } + + return 0; +} diff --git a/driver/level2/syr_thread.c b/driver/level2/syr_thread.c new file mode 100644 index 0000000000..250e8c0063 --- /dev/null +++ b/driver/level2/syr_thread.c @@ -0,0 +1,283 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
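
syr_k.c above is the dense counterpart of the packed update: the same per-column AXPY, but with an explicit leading dimension, and in the lower case the pointer steps by lda + 1 so that it always sits on the diagonal of the next column. A direct reference of that lower path:

/* A := A + alpha * x * x^T on the lower triangle of a column-major matrix
   with leading dimension lda; a enters pointing at A(0,0). */
static void syr_lower_ref(long m, double alpha, const double *x,
                          double *a, long lda) {
  for (long i = 0; i < m; i++) {
    if (x[i] != 0.0) {
      double t = alpha * x[i];
      for (long j = 0; j < m - i; j++)    /* rows i..m-1 of column i */
        a[j] += t * x[i + j];
    }
    a += lda + 1;                         /* step to the next diagonal element */
  }
}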
*/ +/*********************************************************************/ + +#include +#include +#include "common.h" + +static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ + + FLOAT *a, *x; + BLASLONG lda, incx; + BLASLONG i, m_from, m_to; + FLOAT alpha_r; +#if defined(COMPLEX) && !defined(HER) && !defined(HERREV) + FLOAT alpha_i; +#endif + + + x = (FLOAT *)args -> a; + a = (FLOAT *)args -> b; + + incx = args -> lda; + lda = args -> ldb; + + alpha_r = *((FLOAT *)args -> alpha + 0); +#if defined(COMPLEX) && !defined(HER) && !defined(HERREV) + alpha_i = *((FLOAT *)args -> alpha + 1); +#endif + + m_from = 0; + m_to = args -> m; + + if (range_m) { + m_from = *(range_m + 0); + m_to = *(range_m + 1); + } + + if (incx != 1) { +#ifndef LOWER + COPY_K(m_to, x, incx, buffer, 1); +#else + COPY_K(args -> m - m_from, x + m_from * incx * COMPSIZE, incx, buffer + m_from * COMPSIZE, 1); +#endif + + x = buffer; + } + + a += m_from * lda * COMPSIZE; + + for (i = m_from; i < m_to; i++){ +#if !defined(HER) && !defined(HERREV) +#ifndef COMPLEX + if (x[i * COMPSIZE] != ZERO) { +#ifndef LOWER + AXPYU_K(i + 1, 0, 0, alpha_r * x[i], x, 1, a, 1, NULL, 0); +#else + AXPYU_K(args -> m - i, 0, 0, alpha_r * x[i], x + i, 1, a + i, 1, NULL, 0); +#endif + } +#else + if ((x[i * COMPSIZE + 0] != ZERO) || (x[i * COMPSIZE + 1] != ZERO)) { +#ifndef LOWER + AXPYU_K(i + 1, 0, 0, + alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], + alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1], + x, 1, a, 1, NULL, 0); +#else + AXPYU_K(args -> m - i, 0, 0, + alpha_r * x[i * COMPSIZE + 0] - alpha_i * x[i * COMPSIZE + 1], + alpha_i * x[i * COMPSIZE + 0] + alpha_r * x[i * COMPSIZE + 1], + x + i * COMPSIZE, 1, a + i * COMPSIZE, 1, NULL, 0); +#endif + } +#endif +#else + if ((x[i * COMPSIZE + 0] != ZERO) || (x[i * COMPSIZE + 1] != ZERO)) { +#ifndef HERREV +#ifndef LOWER + AXPYU_K(i + 1, 0, 0, + alpha_r * x[i * COMPSIZE + 0], -alpha_r * x[i * COMPSIZE + 1], + x, 1, a, 1, NULL, 0); +#else + AXPYU_K(args -> m - i, 0, 0, + alpha_r * x[i * COMPSIZE + 0], -alpha_r * x[i * COMPSIZE + 1], + x + i * COMPSIZE, 1, a + i * COMPSIZE, 1, NULL, 0); +#endif +#else +#ifndef LOWER + AXPYC_K(i + 1, 0, 0, + alpha_r * x[i * COMPSIZE + 0], alpha_r * x[i * COMPSIZE + 1], + x, 1, a, 1, NULL, 0); +#else + AXPYC_K(args -> m - i, 0, 0, + alpha_r * x[i * COMPSIZE + 0], alpha_r * x[i * COMPSIZE + 1], + x + i * COMPSIZE, 1, a + i * COMPSIZE, 1, NULL, 0); +#endif +#endif + + } + a[i * COMPSIZE + 1] = ZERO; +#endif + a += lda * COMPSIZE; + + } + + return 0; +} + +#if !defined(COMPLEX) || defined(HER) || defined(HERREV) +int CNAME(BLASLONG m, FLOAT alpha, FLOAT *x, BLASLONG incx, FLOAT *a, BLASLONG lda, FLOAT *buffer, int nthreads){ +#else +int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *x, BLASLONG incx, FLOAT *a, BLASLONG lda, FLOAT *buffer, int nthreads){ +#endif + + blas_arg_t args; + blas_queue_t queue[MAX_CPU_NUMBER]; + BLASLONG range_m[MAX_CPU_NUMBER + 1]; + + BLASLONG width, i, num_cpu; + + double dnum; + int mask = 7; + +#ifdef SMP +#ifndef COMPLEX +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_REAL; +#else + int mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + int mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif +#endif + + args.m = m; + + args.a = (void *)x; + args.b = (void *)a; + + 
args.lda = incx; + args.ldb = lda; +#if !defined(COMPLEX) || defined(HER) || defined(HERREV) + args.alpha = (void *)α +#else + args.alpha = (void *)alpha; +#endif + + dnum = (double)m * (double)m / (double)nthreads; + num_cpu = 0; + +#ifndef LOWER + + range_m[MAX_CPU_NUMBER] = m; + i = 0; + + while (i < m){ + + if (nthreads - num_cpu > 1) { + + double di = (double)(m - i); + if (di * di - dnum > 0) { + width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; + } else { + width = m - i; + } + + if (width < 16) width = 16; + if (width > m - i) width = m - i; + + } else { + width = m - i; + } + + range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = syr_kernel; + queue[num_cpu].args = &args; + queue[num_cpu].range_m = &range_m[MAX_CPU_NUMBER - num_cpu - 1]; + queue[num_cpu].range_n = NULL; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + +#else + + range_m[0] = 0; + i = 0; + + while (i < m){ + + if (nthreads - num_cpu > 1) { + + double di = (double)(m - i); + if (di * di - dnum > 0) { + width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; + } else { + width = m - i; + } + + if (width < 16) width = 16; + if (width > m - i) width = m - i; + + } else { + width = m - i; + } + + range_m[num_cpu + 1] = range_m[num_cpu] + width; + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = syr_kernel; + queue[num_cpu].args = &args; + queue[num_cpu].range_m = &range_m[num_cpu]; + queue[num_cpu].range_n = NULL; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + +#endif + + if (num_cpu) { + queue[0].sa = NULL; + queue[0].sb = buffer; + + queue[num_cpu - 1].next = NULL; + + exec_blas(num_cpu, queue); + } + + return 0; +} diff --git a/driver/level2/tbmv_L.c b/driver/level2/tbmv_L.c new file mode 100644 index 0000000000..05e7cf869b --- /dev/null +++ b/driver/level2/tbmv_L.c @@ -0,0 +1,99 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +const static FLOAT dp1 = 1.; + +int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ + + BLASLONG i; + FLOAT *gemvbuffer = (FLOAT *)buffer; + FLOAT *B = b; + BLASLONG length; + + if (incb != 1) { + B = buffer; + gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) + 4095) & ~4095); + COPY_K(n, b, incb, buffer, 1); + } + + a += (n - 1) * lda; + + for (i = n - 1; i >= 0; i--) { + +#ifndef TRANSA + length = n - i - 1; + if (length > k) length = k; + + if (length > 0) { + AXPYU_K(length, 0, 0, + B[i], + a + 1, 1, B + i + 1, 1, NULL, 0); + } +#endif + +#ifndef UNIT +#ifndef TRANSA + B[i] *= a[0]; +#else + B[i] *= a[k]; +#endif +#endif + +#ifdef TRANSA + length = i; + if (length > k) length = k; + + if (length > 0) { + B[i] += DOTU_K(length, a + k - length, 1, B + i - length, 1); + } +#endif + + a -= lda; + } + + if (incb != 1) { + COPY_K(n, buffer, 1, b, incb); + } + + return 0; +} + diff --git a/driver/level2/tbmv_U.c b/driver/level2/tbmv_U.c new file mode 100644 index 0000000000..49d28dcf57 --- /dev/null +++ b/driver/level2/tbmv_U.c @@ -0,0 +1,97 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +const static FLOAT dp1 = 1.; + +int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ + + BLASLONG i; + FLOAT *gemvbuffer = (FLOAT *)buffer; + FLOAT *B = b; + BLASLONG length; + + if (incb != 1) { + B = buffer; + gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) + 4095) & ~4095); + COPY_K(n, b, incb, buffer, 1); + } + + for (i = 0; i < n; i++) { + +#ifndef TRANSA + length = i; + if (length > k) length = k; + + if (length > 0) { + AXPYU_K(length, 0, 0, + B[i], + a + k - length, 1, B + i - length, 1, NULL, 0); + } +#endif + +#ifndef UNIT +#ifndef TRANSA + B[i] *= a[k]; +#else + B[i] *= a[0]; +#endif +#endif + +#ifdef TRANSA + length = n - i - 1; + if (length > k) length = k; + + if (length > 0) { + B[i] += DOTU_K(length, a + 1, 1, B + i + 1, 1); + } +#endif + + a += lda; + } + + if (incb != 1) { + COPY_K(n, buffer, 1, b, incb); + } + + return 0; +} + diff --git a/driver/level2/tbmv_thread.c b/driver/level2/tbmv_thread.c new file mode 100644 index 0000000000..e3d0588262 --- /dev/null +++ b/driver/level2/tbmv_thread.c @@ -0,0 +1,396 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
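
The tbmv kernels above walk the band storage column by column: each column occupies lda entries, the diagonal of an upper band sits at offset k, and the number of off-diagonal entries touched is clamped to min(i, k) (or min(n - i - 1, k) on the other side). A reference for the non-transposed upper path of tbmv_U.c, under the usual BLAS band layout:

/* x := A * x where A is upper triangular with k super-diagonals, stored
   band style: column i keeps A(i - j, i) at a[k - j], diagonal at a[k],
   column stride lda.  Processing columns left to right keeps it in place. */
static void tbmv_upper_ref(long n, long k, const double *a, long lda, double *x) {
  for (long i = 0; i < n; i++, a += lda) {
    long len = i < k ? i : k;             /* entries strictly above the diagonal */
    for (long j = 0; j < len; j++)        /* x[i-len .. i-1] += x[i] * band column */
      x[i - len + j] += x[i] * a[k - len + j];
    x[i] *= a[k];                         /* scale the diagonal last, so the update
                                             above used the original x[i] */
  }
}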
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#include "symcopy.h" + +#ifndef COMPLEX +#ifndef TRANSA +#undef TRANS +#else +#define TRANS +#endif +#define MYDOT DOTU_K +#define MYAXPY AXPYU_K +#else +#if (TRANSA == 1) || (TRANSA == 3) +#undef TRANS +#else +#define TRANS +#endif +#if (TRANSA == 1) || (TRANSA == 2) +#define MYAXPY AXPYU_K +#define MYDOT DOTU_K +#else +#define MYAXPY AXPYC_K +#define MYDOT DOTC_K +#endif +#endif + +static int trmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ + + FLOAT *a, *x, *y; + + BLASLONG k, lda, incx; + BLASLONG n_from, n_to; + BLASLONG i, length; + +#ifdef TRANS +#ifndef COMPLEX + FLOAT result; +#else + FLOAT _Complex result; +#endif +#endif + +#if defined(COMPLEX) && !defined(UNIT) + FLOAT ar, ai, xr, xi; +#endif + + a = (FLOAT *)args -> a; + x = (FLOAT *)args -> b; + y = (FLOAT *)args -> c; + + k = args -> k; + n_from = 0; + n_to = args -> n; + + lda = args -> lda; + incx = args -> ldb; + + if (range_m) { + n_from = *(range_m + 0); + n_to = *(range_m + 1); + + a += n_from * lda * COMPSIZE; + } + + if (incx != 1) { + + COPY_K(args -> n, x, incx, buffer, 1); + + x = buffer; + buffer += ((args -> n * COMPSIZE + 1023) & ~1023); + } + + if (range_n) y += *range_n * COMPSIZE; + + SCAL_K(args -> n, 0, 0, ZERO, +#ifdef COMPLEX + ZERO, +#endif + y, 1, NULL, 0, NULL, 0); + + for (i = n_from; i < n_to; i++) { + +#ifndef LOWER + length = i; +#else + length = args -> n - i - 1; +#endif + if (length > k) length = k; + +#ifndef LOWER + if (length > 0) { +#ifndef TRANS + MYAXPY(length, 0, 0, + *(x + i * COMPSIZE + 0), +#ifdef COMPLEX + *(x + i * COMPSIZE + 1), +#endif + a + (k - length) * COMPSIZE, 1, y + (i - length) * COMPSIZE, 1, NULL, 0); +#else + result = MYDOT(length, a + (k - length) * COMPSIZE, 1, x + (i - length) * COMPSIZE, 1); + +#ifndef COMPLEX + *(y + i * COMPSIZE + 0) += result; +#else + *(y + i * COMPSIZE + 0) += CREAL(result); + *(y + i * COMPSIZE + 1) += CIMAG(result); +#endif +#endif + } +#endif + +#ifndef COMPLEX +#ifdef UNIT + *(y + i * COMPSIZE) += *(x + i * COMPSIZE); +#else +#ifndef LOWER + *(y + i * COMPSIZE) += *(a + k * COMPSIZE) * *(x + i * COMPSIZE); +#else + *(y + i * COMPSIZE) += *(a + 0 * COMPSIZE) * *(x + i * COMPSIZE); +#endif +#endif +#else +#ifdef UNIT + *(y + i * COMPSIZE + 0) += *(x + i * COMPSIZE + 0); + *(y + i * COMPSIZE + 1) += *(x + i * COMPSIZE + 1); +#else +#ifndef LOWER + ar = *(a + k * COMPSIZE + 0); + ai = *(a + k * COMPSIZE + 1); +#else + ar = *(a + 0); + ai = *(a + 1); +#endif + xr = *(x + i * COMPSIZE + 0); + xi = *(x + i * COMPSIZE + 1); + +#if 
(TRANSA == 1) || (TRANSA == 2) + *(y + i * COMPSIZE + 0) += ar * xr - ai * xi; + *(y + i * COMPSIZE + 1) += ar * xi + ai * xr; +#else + *(y + i * COMPSIZE + 0) += ar * xr + ai * xi; + *(y + i * COMPSIZE + 1) += ar * xi - ai * xr; +#endif +#endif +#endif + +#ifdef LOWER + if (length > 0) { +#ifndef TRANS + MYAXPY(length, 0, 0, + *(x + i * COMPSIZE + 0), +#ifdef COMPLEX + *(x + i * COMPSIZE + 1), +#endif + a + COMPSIZE, 1, y + (i + 1) * COMPSIZE, 1, NULL, 0); +#else + result = MYDOT(length, a + COMPSIZE, 1, x + (i + 1) * COMPSIZE, 1); + +#ifndef COMPLEX + *(y + i * COMPSIZE + 0) += result; +#else + *(y + i * COMPSIZE + 0) += CREAL(result); + *(y + i * COMPSIZE + 1) += CIMAG(result); +#endif +#endif + } +#endif + + a += lda * COMPSIZE; + } + + return 0; +} + +#ifndef COMPLEX +int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthreads){ +#else +int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthreads){ +#endif + + blas_arg_t args; + blas_queue_t queue[MAX_CPU_NUMBER]; + BLASLONG range_m[MAX_CPU_NUMBER + 1]; + BLASLONG range_n[MAX_CPU_NUMBER]; + + BLASLONG width, i, num_cpu; + + double dnum; + int mask = 7; + +#ifdef SMP +#ifndef COMPLEX +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_REAL; +#else + int mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + int mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif +#endif + + args.n = n; + args.k = k; + + args.a = (void *)a; + args.b = (void *)x; + args.c = (void *)(buffer); + + args.lda = lda; + args.ldb = incx; + + dnum = (double)n * (double)n / (double)nthreads; + num_cpu = 0; + + if (n < 2 * k) { + +#ifndef LOWER + + range_m[MAX_CPU_NUMBER] = n; + i = 0; + + while (i < n){ + + if (nthreads - num_cpu > 1) { + + double di = (double)(n - i); + if (di * di - dnum > 0) { + width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; + } else { + width = n - i; + } + + if (width < 16) width = 16; + if (width > n - i) width = n - i; + + } else { + width = n - i; + } + + range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; + range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = trmv_kernel; + queue[num_cpu].args = &args; + queue[num_cpu].range_m = &range_m[MAX_CPU_NUMBER - num_cpu - 1]; + queue[num_cpu].range_n = &range_n[num_cpu]; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + +#else + + range_m[0] = 0; + i = 0; + + while (i < n){ + + if (nthreads - num_cpu > 1) { + + double di = (double)(n - i); + if (di * di - dnum > 0) { + width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; + } else { + width = n - i; + } + + if (width < 16) width = 16; + if (width > n - i) width = n - i; + + } else { + width = n - i; + } + + range_m[num_cpu + 1] = range_m[num_cpu] + width; + range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = trmv_kernel; + queue[num_cpu].args = &args; + queue[num_cpu].range_m = &range_m[num_cpu]; + queue[num_cpu].range_n = &range_n[num_cpu]; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + +#endif + } else { + + 
range_m[0] = 0; + i = n; + + while (i > 0){ + + width = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu); + + if (width < 4) width = 4; + if (i < width) width = i; + + range_m[num_cpu + 1] = range_m[num_cpu] + width; + range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16); + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = trmv_kernel; + queue[num_cpu].args = &args; + queue[num_cpu].range_m = &range_m[num_cpu]; + queue[num_cpu].range_n = &range_n[num_cpu]; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i -= width; + } + } + + + + if (num_cpu) { + queue[0].sa = NULL; + queue[0].sb = buffer + num_cpu * (((n + 255) & ~255) + 16) * COMPSIZE; + + queue[num_cpu - 1].next = NULL; + + exec_blas(num_cpu, queue); + } + + for (i = 1; i < num_cpu; i ++) { + AXPYU_K(n, 0, 0, ONE, +#ifdef COMPLEX + ZERO, +#endif + buffer + range_n[i] * COMPSIZE, 1, buffer, 1, NULL, 0); + } + + COPY_K(n, buffer, 1, x, incx); + + return 0; +} diff --git a/driver/level2/tbsv_L.c b/driver/level2/tbsv_L.c new file mode 100644 index 0000000000..e9c9158e4f --- /dev/null +++ b/driver/level2/tbsv_L.c @@ -0,0 +1,97 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
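
tbmv_thread.c above picks its partitioning by bandwidth: for n < 2k the per-column cost still grows roughly linearly, so it reuses the equal-area rule, while for n >= 2k almost every column costs about k and an even split of the rows is enough. A sketch of that even split, with plain integer division standing in for blas_quickdivide and assuming nthreads >= 1 and split has nthreads + 1 slots:

/* Divide n rows into at most nthreads bands of nearly equal size,
   never smaller than 4 rows, ceiling-dividing what is left each time. */
static int even_split(long n, int nthreads, long *split) {
  long left = n;
  int t = 0;
  split[0] = 0;
  while (left > 0) {
    long width = (left + (nthreads - t) - 1) / (nthreads - t);  /* ceil(left / remaining) */
    if (width < 4) width = 4;             /* same 4-row floor as the driver */
    if (width > left) width = left;
    left -= width;
    split[t + 1] = split[t] + width;
    t++;
  }
  return t;                               /* number of bands, at most nthreads */
}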
*/ +/*********************************************************************/ + +#include +#include +#include "common.h" + +const static FLOAT dp1 = 1.; + +int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ + + BLASLONG i; + FLOAT *gemvbuffer = (FLOAT *)buffer; + FLOAT *B = b; + BLASLONG length; + + if (incb != 1) { + B = buffer; + gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) + 4095) & ~4095); + COPY_K(n, b, incb, buffer, 1); + } + + for (i = 0; i < n; i++) { + +#ifdef TRANSA + length = i; + if (length > k) length = k; + + if (length > 0) { + B[i] -= DOTU_K(length, a + k - length, 1, B + i - length, 1); + } +#endif + +#ifndef UNIT +#ifdef TRANSA + B[i] /= a[k]; +#else + B[i] /= a[0]; +#endif +#endif + +#ifndef TRANSA + length = n - i - 1; + if (length > k) length = k; + + if (length > 0) { + AXPYU_K(length, 0, 0, + -B[i], + a + 1, 1, B + i + 1, 1, NULL, 0); + } +#endif + + a += lda; + } + + if (incb != 1) { + COPY_K(n, buffer, 1, b, incb); + } + + return 0; +} + diff --git a/driver/level2/tbsv_U.c b/driver/level2/tbsv_U.c new file mode 100644 index 0000000000..0b1fca8f02 --- /dev/null +++ b/driver/level2/tbsv_U.c @@ -0,0 +1,99 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
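tbsv_L.c above performs forward substitution on a lower-triangular band matrix with k sub-diagonals, where band column j starts at a + j*lda with the diagonal element first; AXPYU_K applies the (at most k long) column update and DOTU_K handles the transposed case. A plain reference version of the non-transposed, non-unit path under those storage assumptions; the names are illustrative only.

    #include <stddef.h>

    /* b := inv(L) * b for a lower-triangular band matrix with k sub-diagonals. */
    static void tbsv_lower_ref(size_t n, size_t k,
                               const double *a, size_t lda, double *b)
    {
        for (size_t j = 0; j < n; j++) {
            const double *col = a + j * lda;      /* band column j */

            b[j] /= col[0];                       /* divide by diagonal A(j,j) */

            size_t len = n - j - 1;               /* rows below the diagonal */
            if (len > k) len = k;                 /* the band limits the update */

            for (size_t t = 1; t <= len; t++)     /* b[j+t] -= A(j+t,j) * b[j] */
                b[j + t] -= col[t] * b[j];
        }
    }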
*/ +/*********************************************************************/ + +#include +#include +#include "common.h" + +const static FLOAT dp1 = 1.; + +int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ + + BLASLONG i; + FLOAT *gemvbuffer = (FLOAT *)buffer; + FLOAT *B = b; + BLASLONG length; + + if (incb != 1) { + B = buffer; + gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) + 4095) & ~4095); + COPY_K(n, b, incb, buffer, 1); + } + + a += (n - 1) * lda; + + for (i = n - 1; i >= 0; i--) { + +#ifdef TRANSA + length = n - i - 1; + if (length > k) length = k; + + if (length > 0) { + B[i] -= DOTU_K(length, a + 1, 1, B + i + 1, 1); + } +#endif + +#ifndef UNIT +#ifdef TRANSA + B[i] /= a[0]; +#else + B[i] /= a[k]; +#endif +#endif + +#ifndef TRANSA + length = i; + if (length > k) length = k; + + if (length > 0) { + AXPYU_K(length, 0, 0, + - B[i], + a + k - length, 1, B + i - length, 1, NULL, 0); + } +#endif + + a -= lda; + } + + if (incb != 1) { + COPY_K(n, buffer, 1, b, incb); + } + + return 0; +} + diff --git a/driver/level2/tpmv_L.c b/driver/level2/tpmv_L.c new file mode 100644 index 0000000000..c139eb79dd --- /dev/null +++ b/driver/level2/tpmv_L.c @@ -0,0 +1,83 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
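tbsv_U.c is the mirror image for an upper-triangular band matrix with k super-diagonals: band column j keeps the diagonal at offset k, and the solve runs backwards from the last column. A reference version of the non-transposed, non-unit path under the same assumptions (names are illustrative):

    #include <stddef.h>

    static void tbsv_upper_ref(size_t n, size_t k,
                               const double *a, size_t lda, double *b)
    {
        for (size_t jj = n; jj-- > 0; ) {          /* columns n-1 .. 0 */
            const double *col = a + jj * lda;

            b[jj] /= col[k];                       /* diagonal A(j,j) sits at offset k */

            size_t len = jj < k ? jj : k;          /* rows above the diagonal */
            for (size_t t = 1; t <= len; t++)      /* b[j-t] -= A(j-t,j) * b[j] */
                b[jj - t] -= col[k - t] * b[jj];
        }
    }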
*/ +/*********************************************************************/ + +#include +#include +#include "common.h" + +const static FLOAT dp1 = 1.; + +int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ + + BLASLONG i; + FLOAT *B = b; + + if (incb != 1) { + B = buffer; + COPY_K(m, b, incb, buffer, 1); + } + + a += (m + 1) * m / 2 - 1; + + for (i = 0; i < m; i++) { +#ifndef TRANSA + if (i > 0) AXPYU_K(i, 0, 0, B[m - i - 1], a + 1, 1, B + m - i, 1, NULL, 0); +#endif + +#ifndef UNIT + B[m - i - 1] *= a[0]; +#endif + +#ifdef TRANSA + if (i < m - 1) B[m - i - 1] += DOTU_K(m - i - 1, a - (m - i - 1), 1, B, 1); +#endif + +#ifndef TRANSA + a -= (i + 2); +#else + a -= (m - i); +#endif + } + + if (incb != 1) { + COPY_K(m, buffer, 1, b, incb); + } + + return 0; +} + diff --git a/driver/level2/tpmv_U.c b/driver/level2/tpmv_U.c new file mode 100644 index 0000000000..6d69df6f01 --- /dev/null +++ b/driver/level2/tpmv_U.c @@ -0,0 +1,86 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
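tpmv_L.c computes x := L*x (or the transposed product) for a lower-triangular matrix in packed storage, stepping a raw pointer through the packed columns from the end so the update can stay in place. As a reminder of the layout it assumes, element A(i,j) with i >= j lives at ap[i + j*(2*m - j - 1)/2] in 0-based indexing. A naive in-place reference for the non-transposed, non-unit case, using the closed-form index instead of pointer stepping; names are illustrative.

    static void tpmv_lower_ref(int m, const double *ap, double *x)
    {
        /* Bottom-up order: row i only reads x[0..i], which are still unmodified. */
        for (int i = m - 1; i >= 0; i--) {
            double s = 0.0;
            for (int j = 0; j <= i; j++)          /* row i of L, diagonal included */
                s += ap[i + j * (2 * m - j - 1) / 2] * x[j];
            x[i] = s;
        }
    }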
*/ +/*********************************************************************/ + +#include +#include +#include "common.h" + +const static FLOAT dp1 = 1.; + +int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ + + BLASLONG i; + FLOAT *B = b; + + if (incb != 1) { + B = buffer; + COPY_K(m, b, incb, buffer, 1); + } + + for (i = 0; i < m; i++) { + +#ifndef TRANSA + if (i > 0) AXPYU_K(i, 0, 0, B[i], a, 1, B, 1, NULL, 0); +#endif + +#ifndef UNIT +#ifndef TRANSA + B[i] *= a[i]; +#else + B[i] *= a[0]; +#endif +#endif + +#ifdef TRANSA + if (i < m - 1) B[i] += DOTU_K(m - i - 1, a + 1, 1, B + i + 1, 1); +#endif + +#ifndef TRANSA + a += (i + 1); +#else + a += (m - i); +#endif + } + + if (incb != 1) { + COPY_K(m, buffer, 1, b, incb); + } + + return 0; +} + diff --git a/driver/level2/tpmv_thread.c b/driver/level2/tpmv_thread.c new file mode 100644 index 0000000000..64b725f865 --- /dev/null +++ b/driver/level2/tpmv_thread.c @@ -0,0 +1,401 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
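tpmv_U.c is the packed upper-triangular counterpart; there A(i,j) with i <= j lives at ap[i + j*(j+1)/2], and processing rows top-down keeps the in-place update valid. A matching reference sketch, again with illustrative names:

    static void tpmv_upper_ref(int m, const double *ap, double *x)
    {
        for (int i = 0; i < m; i++) {             /* top-down: row i only reads x[i..m-1] */
            double s = 0.0;
            for (int j = i; j < m; j++)
                s += ap[i + j * (j + 1) / 2] * x[j];
            x[i] = s;
        }
    }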
*/ +/*********************************************************************/ + +#include +#include +#include "common.h" +#include "symcopy.h" + +#ifndef COMPLEX +#ifndef TRANSA +#undef TRANS +#else +#define TRANS +#endif +#define MYDOT DOTU_K +#define MYAXPY AXPYU_K +#else +#if TRANSA == 1 +#undef TRANS +#define MYDOT DOTU_K +#define MYAXPY AXPYU_K +#elif TRANSA == 2 +#define TRANS +#define MYDOT DOTU_K +#define MYAXPY AXPYU_K +#elif TRANSA == 3 +#undef TRANS +#define MYDOT DOTC_K +#define MYAXPY AXPYC_K +#else +#define TRANS +#define MYDOT DOTC_K +#define MYAXPY AXPYC_K +#endif +#endif + +static int tpmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ + + FLOAT *a, *x, *y; + + BLASLONG incx; + BLASLONG m_from, m_to; + BLASLONG i; + +#ifdef TRANS +#ifndef COMPLEX + FLOAT result; +#else + FLOAT _Complex result; +#endif +#endif + +#if defined(COMPLEX) && !defined(UNIT) + FLOAT ar, ai, xr, xi; +#endif + + a = (FLOAT *)args -> a; + x = (FLOAT *)args -> b; + y = (FLOAT *)args -> c; + + incx = args -> ldb; + + m_from = 0; + m_to = args -> m; + + if (range_m) { + m_from = *(range_m + 0); + m_to = *(range_m + 1); + } + + if (incx != 1) { + +#ifndef LOWER + COPY_K(m_to, x, incx, buffer, 1); +#else + COPY_K(args -> m - m_from, x + m_from * incx * COMPSIZE, incx, buffer + m_from * COMPSIZE, 1); +#endif + + x = buffer; + buffer += ((COMPSIZE * args -> m + 1023) & ~1023); + } + +#ifndef TRANS + if (range_n) y += *range_n * COMPSIZE; + +#ifndef LOWER + SCAL_K(m_to, 0, 0, ZERO, +#ifdef COMPLEX + ZERO, +#endif + y, 1, NULL, 0, NULL, 0); +#else + SCAL_K(args -> m - m_from, 0, 0, ZERO, +#ifdef COMPLEX + ZERO, +#endif + y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0); +#endif + +#else + + SCAL_K(m_to - m_from, 0, 0, ZERO, +#ifdef COMPLEX + ZERO, +#endif + y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0); + +#endif + +#ifndef LOWER + a += (m_from + 1) * m_from / 2 * COMPSIZE; +#else + a += (2 * args -> m - m_from - 1) * m_from / 2 * COMPSIZE; +#endif + + for (i = m_from; i < m_to; i++) { + +#ifndef LOWER + if (i > 0) { +#ifndef TRANS + MYAXPY(i, 0, 0, + *(x + i * COMPSIZE + 0), +#ifdef COMPLEX + *(x + i * COMPSIZE + 1), +#endif + a, 1, y, 1, NULL, 0); +#else + result = MYDOT(i, a, 1, x, 1); + +#ifndef COMPLEX + *(y + i * COMPSIZE + 0) += result; +#else + *(y + i * COMPSIZE + 0) += CREAL(result); + *(y + i * COMPSIZE + 1) += CIMAG(result); +#endif + +#endif + } +#endif + +#ifndef COMPLEX +#ifdef UNIT + *(y + i * COMPSIZE) += *(x + i * COMPSIZE); +#else + *(y + i * COMPSIZE) += *(a + i * COMPSIZE) * *(x + i * COMPSIZE); +#endif +#else +#ifdef UNIT + *(y + i * COMPSIZE + 0) += *(x + i * COMPSIZE + 0); + *(y + i * COMPSIZE + 1) += *(x + i * COMPSIZE + 1); +#else + ar = *(a + i * COMPSIZE + 0); + ai = *(a + i * COMPSIZE + 1); + xr = *(x + i * COMPSIZE + 0); + xi = *(x + i * COMPSIZE + 1); + +#if (TRANSA == 1) || (TRANSA == 2) + *(y + i * COMPSIZE + 0) += ar * xr - ai * xi; + *(y + i * COMPSIZE + 1) += ar * xi + ai * xr; +#else + *(y + i * COMPSIZE + 0) += ar * xr + ai * xi; + *(y + i * COMPSIZE + 1) += ar * xi - ai * xr; +#endif +#endif +#endif + +#ifdef LOWER + if (args -> m > i + 1) { +#ifndef TRANS + MYAXPY(args -> m - i - 1, 0, 0, + *(x + i * COMPSIZE + 0), +#ifdef COMPLEX + *(x + i * COMPSIZE + 1), +#endif + a + (i + 1 ) * COMPSIZE, 1, y + (i + 1) * COMPSIZE, 1, NULL, 0); +#else + + result = MYDOT(args -> m - i - 1, a + (i + 1) * COMPSIZE, 1, x + (i + 1) * COMPSIZE, 1); + +#ifndef COMPLEX + *(y + i * COMPSIZE + 0) += result; +#else + *(y + i * COMPSIZE + 0) += 
CREAL(result); + *(y + i * COMPSIZE + 1) += CIMAG(result); +#endif + +#endif + } +#endif + +#ifndef LOWER + a += (i + 1) * COMPSIZE; +#else + a += (args -> m - i - 1) * COMPSIZE; +#endif + + } + + return 0; +} + +#ifndef COMPLEX +int CNAME(BLASLONG m, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthreads){ +#else +int CNAME(BLASLONG m, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthreads){ +#endif + + blas_arg_t args; + blas_queue_t queue[MAX_CPU_NUMBER]; + BLASLONG range_m[MAX_CPU_NUMBER + 1]; + BLASLONG range_n[MAX_CPU_NUMBER]; + + BLASLONG width, i, num_cpu; + + double dnum; + int mask = 7; + +#ifdef SMP +#ifndef COMPLEX +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_REAL; +#else + int mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + int mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif +#endif + + args.m = m; + + args.a = (void *)a; + args.b = (void *)x; + args.c = (void *)(buffer); + + args.ldb = incx; + args.ldc = incx; + + dnum = (double)m * (double)m / (double)nthreads; + num_cpu = 0; + +#ifndef LOWER + + range_m[MAX_CPU_NUMBER] = m; + i = 0; + + while (i < m){ + + if (nthreads - num_cpu > 1) { + + double di = (double)(m - i); + if (di * di - dnum > 0) { + width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; + } else { + width = m - i; + } + + if (width < 16) width = 16; + if (width > m - i) width = m - i; + + } else { + width = m - i; + } + + range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; + range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = tpmv_kernel; + queue[num_cpu].args = &args; + queue[num_cpu].range_m = &range_m[MAX_CPU_NUMBER - num_cpu - 1]; + queue[num_cpu].range_n = &range_n[num_cpu]; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + +#else + + range_m[0] = 0; + i = 0; + + while (i < m){ + + if (nthreads - num_cpu > 1) { + + double di = (double)(m - i); + if (di * di - dnum > 0) { + width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; + } else { + width = m - i; + } + + if (width < 16) width = 16; + if (width > m - i) width = m - i; + + } else { + width = m - i; + } + + range_m[num_cpu + 1] = range_m[num_cpu] + width; + range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = tpmv_kernel; + queue[num_cpu].args = &args; + queue[num_cpu].range_m = &range_m[num_cpu]; + queue[num_cpu].range_n = &range_n[num_cpu]; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + +#endif + + if (num_cpu) { + queue[0].sa = NULL; + queue[0].sb = buffer + num_cpu * (((m + 255) & ~255) + 16) * COMPSIZE; + + queue[num_cpu - 1].next = NULL; + + exec_blas(num_cpu, queue); + } + +#ifndef TRANS + for (i = 1; i < num_cpu; i ++) { + +#ifndef LOWER + + AXPYU_K(range_m[MAX_CPU_NUMBER - i], 0, 0, ONE, +#ifdef COMPLEX + ZERO, +#endif + buffer + range_n[i] * COMPSIZE, 1, buffer, 1, NULL, 0); + +#else + + AXPYU_K(m - range_m[i], 0, 0, ONE, +#ifdef COMPLEX + ZERO, +#endif + buffer + (range_n[i] + range_m[i]) * COMPSIZE, 1, buffer + range_m[i] * COMPSIZE, 1, NULL, 0); + +#endif + + } +#endif + + COPY_K(m, buffer, 1, x, incx); + + return 0; +} diff --git 
a/driver/level2/tpsv_L.c b/driver/level2/tpsv_L.c new file mode 100644 index 0000000000..9f76181e1a --- /dev/null +++ b/driver/level2/tpsv_L.c @@ -0,0 +1,87 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ + + BLASLONG i; + FLOAT *gemvbuffer = (FLOAT *)buffer; + FLOAT *B = b; + + if (incb != 1) { + B = buffer; + gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) + 4095) & ~4095); + COPY_K(m, b, incb, buffer, 1); + } + + for (i = 0; i < m; i++) { +#ifdef TRANSA + if (i > 0) B[i] -= DOTU_K(i, a, 1, B, 1); +#endif + +#ifndef UNIT +#ifndef TRANSA + B[i] /= a[0]; +#else + B[i] /= a[i]; +#endif +#endif + +#ifndef TRANSA + if (i < m - 1) { + AXPYU_K(m - i - 1 , 0, 0, - B[i], + a + 1, 1, B + i + 1, 1, NULL, 0); + } +#endif + +#ifndef TRANSA + a += (m - i); +#else + a += (i + 1); +#endif + } + + if (incb != 1) { + COPY_K(m, buffer, 1, b, incb); + } + + return 0; +} diff --git a/driver/level2/tpsv_U.c b/driver/level2/tpsv_U.c new file mode 100644 index 0000000000..7a0958021d --- /dev/null +++ b/driver/level2/tpsv_U.c @@ -0,0 +1,83 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. 
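tpsv_L.c above, like the other solvers in this batch, switches between two formulations of the same triangular solve: the non-transposed path pushes each solved component into the remaining right-hand side with AXPYU_K, while the transposed path gathers a row with DOTU_K before dividing. Dense, non-unit reference versions of both recurrences, using plain column-major storage for clarity; the packed routines apply the same recurrences to packed columns, and the names here are illustrative.

    /* Solve L*x = b in place: column updates, the AXPY formulation. */
    static void trsv_lower_axpy(int m, const double *a, int lda, double *b)
    {
        for (int j = 0; j < m; j++) {
            b[j] /= a[j + j * lda];
            for (int i = j + 1; i < m; i++)
                b[i] -= a[i + j * lda] * b[j];
        }
    }

    /* Solve (L transposed)*x = b in place: row reductions, the DOT formulation. */
    static void trsv_lower_trans_dot(int m, const double *a, int lda, double *b)
    {
        for (int j = m - 1; j >= 0; j--) {
            double s = 0.0;
            for (int i = j + 1; i < m; i++)
                s += a[i + j * lda] * b[i];
            b[j] = (b[j] - s) / a[j + j * lda];
        }
    }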
*/ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ + + BLASLONG i; + FLOAT *gemvbuffer = (FLOAT *)buffer; + FLOAT *B = b; + + if (incb != 1) { + B = buffer; + gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) + 4095) & ~4095); + COPY_K(m, b, incb, buffer, 1); + } + + a += (m + 1) * m / 2 - 1; + + for (i = 0; i < m; i++) { +#ifdef TRANSA + if (i > 0) B[m - i - 1] -= DOTU_K(i, a + 1, 1, B + m - i, 1); +#endif + +#ifndef UNIT + B[m - i - 1] /= a[0]; +#endif + +#ifndef TRANSA + if (i < m - 1) AXPYU_K(m - i - 1, 0, 0, -B[m - i - 1], a - (m - i - 1), 1, B, 1, NULL, 0); +#endif + +#ifndef TRANSA + a -= (m - i); +#else + a -= (i + 2); +#endif + } + + if (incb != 1) { + COPY_K(m, buffer, 1, b, incb); + } + + return 0; +} + diff --git a/driver/level2/trmv_L.c b/driver/level2/trmv_L.c new file mode 100644 index 0000000000..e515ba60be --- /dev/null +++ b/driver/level2/trmv_L.c @@ -0,0 +1,103 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +const static FLOAT dp1 = 1.; + +int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *buffer){ + + BLASLONG i, is, min_i; + FLOAT *gemvbuffer = (FLOAT *)buffer; + FLOAT *B = b; + + if (incb != 1) { + B = buffer; + gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) + 4095) & ~4095); + COPY_K(m, b, incb, buffer, 1); + } + + for (is = m; is > 0; is -= DTB_ENTRIES){ + + min_i = MIN(is, DTB_ENTRIES); + +#ifndef TRANSA + if (m - is > 0){ + GEMV_N(m - is, min_i, 0, dp1, + a + is + (is - min_i) * lda, lda, + B + is - min_i, 1, + B + is, 1, gemvbuffer); + } +#endif + + for (i = 0; i < min_i; i++) { + FLOAT *AA = a + (is - i - 1) + (is - i - 1) * lda; + FLOAT *BB = B + (is - i - 1); + +#ifndef TRANSA + if (i > 0) AXPYU_K(i, 0, 0, BB[0], AA + 1, 1, BB + 1, 1, NULL, 0); +#endif + +#ifndef UNIT + BB[0] *= AA[0]; +#endif + +#ifdef TRANSA + if (i < min_i - 1) BB[0] += DOTU_K(min_i - i - 1, AA - (min_i - i - 1), 1, BB - (min_i - i - 1), 1); +#endif + } + +#ifdef TRANSA + if (is - min_i > 0){ + GEMV_T(is - min_i, min_i, 0, dp1, + a + (is - min_i) * lda, lda, + B, 1, + B + is - min_i, 1, gemvbuffer); + } +#endif + } + + if (incb != 1) { + COPY_K(m, buffer, 1, b, incb); + } + + return 0; +} + diff --git a/driver/level2/trmv_U.c b/driver/level2/trmv_U.c new file mode 100644 index 0000000000..3c36f77d98 --- /dev/null +++ b/driver/level2/trmv_U.c @@ -0,0 +1,104 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
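trmv_L.c blocks x := L*x into panels of DTB_ENTRIES columns: the contribution of each panel to the rows below it is a single GEMV_N call, so most of the work goes through the tuned GEMV kernel, while the small diagonal block is finished with AXPYU_K and a diagonal scale. A dense reference of that blocking, with BLOCK standing in for DTB_ENTRIES and names chosen for illustration only:

    #define BLOCK 64   /* stand-in for DTB_ENTRIES; the real value is CPU dependent */

    static void trmv_lower_blocked(int m, const double *a, int lda, double *b)
    {
        for (int is = m; is > 0; is -= BLOCK) {
            int nb = is < BLOCK ? is : BLOCK;     /* this panel: columns is-nb .. is-1 */

            /* Off-diagonal panel, one GEMV_N in the driver:
               b[is..m-1] += A(is.., is-nb..is-1) * b[is-nb..is-1]. */
            for (int j = is - nb; j < is; j++)
                for (int i = is; i < m; i++)
                    b[i] += a[i + j * lda] * b[j];

            /* Diagonal block, columns last to first so the update stays in place. */
            for (int j = is - 1; j >= is - nb; j--) {
                for (int i = j + 1; i < is; i++)
                    b[i] += a[i + j * lda] * b[j];
                b[j] *= a[j + j * lda];           /* non-unit diagonal */
            }
        }
    }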
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +const static FLOAT dp1 = 1.; + +int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *buffer){ + + BLASLONG i, is, min_i; + FLOAT *gemvbuffer = (FLOAT *)buffer; + FLOAT *B = b; + + if (incb != 1) { + B = buffer; + gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) + 4095) & ~4095); + COPY_K(m, b, incb, buffer, 1); + } + + for (is = 0; is < m; is += DTB_ENTRIES){ + + min_i = MIN(m - is, DTB_ENTRIES); + +#ifndef TRANSA + if (is > 0){ + GEMV_N(is, min_i, 0, dp1, + a + is * lda, lda, + B + is, 1, + B, 1, gemvbuffer); + } +#endif + + for (i = 0; i < min_i; i++) { + FLOAT *AA = a + is + (i + is) * lda; + FLOAT *BB = B + is; + +#ifndef TRANSA + if (i > 0) AXPYU_K(i, 0, 0, BB[i], AA, 1, BB, 1, NULL, 0); +#endif + +#ifndef UNIT + BB[i] *= AA[i]; +#endif + +#ifdef TRANSA + if (i < min_i - 1) BB[i] += DOTU_K(min_i - i - 1, AA + i + 1, 1, BB + i + 1, 1); +#endif + } + +#ifdef TRANSA + if (m - is > min_i){ + GEMV_T(m - is - min_i, min_i, 0, dp1, + a + is + min_i + is * lda, lda, + B + is + min_i, 1, + B + is, 1, gemvbuffer); + } +#endif + + } + + if (incb != 1) { + COPY_K(m, buffer, 1, b, incb); + } + + return 0; +} + diff --git a/driver/level2/trmv_thread.c b/driver/level2/trmv_thread.c new file mode 100644 index 0000000000..4f5b27c692 --- /dev/null +++ b/driver/level2/trmv_thread.c @@ -0,0 +1,440 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#include "symcopy.h" + +#ifndef COMPLEX +#ifndef TRANSA +#define MYGEMV GEMV_N +#undef TRANS +#else +#define MYGEMV GEMV_T +#define TRANS +#endif +#define MYDOT DOTU_K +#define MYAXPY AXPYU_K +#else +#if TRANSA == 1 +#define MYGEMV GEMV_N +#undef TRANS +#define MYDOT DOTU_K +#define MYAXPY AXPYU_K +#elif TRANSA == 2 +#define MYGEMV GEMV_T +#define TRANS +#define MYDOT DOTU_K +#define MYAXPY AXPYU_K +#elif TRANSA == 3 +#define MYGEMV GEMV_R +#undef TRANS +#define MYDOT DOTC_K +#define MYAXPY AXPYC_K +#else +#define MYGEMV GEMV_C +#define TRANS +#define MYDOT DOTC_K +#define MYAXPY AXPYC_K +#endif +#endif + +static int trmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ + + FLOAT *a, *x, *y; + + BLASLONG lda, incx; + BLASLONG m_from, m_to; + BLASLONG i, is, min_i; + +#ifdef TRANS +#ifndef COMPLEX + FLOAT result; +#else + FLOAT _Complex result; +#endif +#endif + +#if defined(COMPLEX) && !defined(UNIT) + FLOAT ar, ai, xr, xi; +#endif + + a = (FLOAT *)args -> a; + x = (FLOAT *)args -> b; + y = (FLOAT *)args -> c; + + lda = args -> lda; + incx = args -> ldb; + + m_from = 0; + m_to = args -> m; + + if (range_m) { + m_from = *(range_m + 0); + m_to = *(range_m + 1); + } + + if (incx != 1) { + +#ifndef LOWER + COPY_K(m_to, x, incx, buffer, 1); +#else + COPY_K(args -> m - m_from, x + m_from * incx * COMPSIZE, incx, buffer + m_from * COMPSIZE, 1); +#endif + + x = buffer; + buffer += ((COMPSIZE * args -> m + 1023) & ~1023); + } + +#ifndef TRANS + if (range_n) y += *range_n * COMPSIZE; + +#ifndef LOWER + SCAL_K(m_to, 0, 0, ZERO, +#ifdef COMPLEX + ZERO, +#endif + y, 1, NULL, 0, NULL, 0); +#else + SCAL_K(args -> m - m_from, 0, 0, ZERO, +#ifdef COMPLEX + ZERO, +#endif + y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0); +#endif + +#else + + SCAL_K(m_to - m_from, 0, 0, ZERO, +#ifdef COMPLEX + ZERO, +#endif + y + m_from * COMPSIZE, 1, NULL, 0, NULL, 0); + +#endif + + for (is = m_from; is < m_to; is += DTB_ENTRIES){ + + min_i = MIN(m_to - is, DTB_ENTRIES); + +#ifndef LOWER + if (is > 0){ + MYGEMV(is, min_i, 0, + ONE, +#ifdef COMPLEX + ZERO, +#endif + a + is * lda * COMPSIZE, lda, +#ifndef TRANS + x + is * COMPSIZE, 1, + y, 1, +#else + x, 1, + y + is * COMPSIZE, 1, +#endif + buffer); + } +#endif + + for (i = is; i < is + min_i; i++) { + +#ifndef LOWER + if (i - is > 0) { +#ifndef TRANS + MYAXPY(i - is, 0, 0, + *(x + i * COMPSIZE + 0), +#ifdef COMPLEX + *(x + i * COMPSIZE + 1), +#endif + a + (is + i * lda) * COMPSIZE, 1, y + is * COMPSIZE, 1, NULL, 0); +#else + + result = MYDOT(i - is, a + (is + i * 
lda) * COMPSIZE, 1, x + is * COMPSIZE, 1); + +#ifndef COMPLEX + *(y + i * COMPSIZE + 0) += result; +#else + *(y + i * COMPSIZE + 0) += CREAL(result); + *(y + i * COMPSIZE + 1) += CIMAG(result); +#endif + +#endif + } +#endif + +#ifndef COMPLEX +#ifdef UNIT + *(y + i * COMPSIZE) += *(x + i * COMPSIZE); +#else + *(y + i * COMPSIZE) += *(a + (i + i * lda) * COMPSIZE) * *(x + i * COMPSIZE); +#endif +#else +#ifdef UNIT + *(y + i * COMPSIZE + 0) += *(x + i * COMPSIZE + 0); + *(y + i * COMPSIZE + 1) += *(x + i * COMPSIZE + 1); +#else + ar = *(a + (i + i * lda) * COMPSIZE + 0); + ai = *(a + (i + i * lda) * COMPSIZE + 1); + xr = *(x + i * COMPSIZE + 0); + xi = *(x + i * COMPSIZE + 1); + +#if (TRANSA == 1) || (TRANSA == 2) + *(y + i * COMPSIZE + 0) += ar * xr - ai * xi; + *(y + i * COMPSIZE + 1) += ar * xi + ai * xr; +#else + *(y + i * COMPSIZE + 0) += ar * xr + ai * xi; + *(y + i * COMPSIZE + 1) += ar * xi - ai * xr; +#endif +#endif +#endif + +#ifdef LOWER + if (is + min_i > i + 1) { +#ifndef TRANS + MYAXPY(is + min_i - i - 1, 0, 0, + *(x + i * COMPSIZE + 0), +#ifdef COMPLEX + *(x + i * COMPSIZE + 1), +#endif + a + (i + 1 + i * lda) * COMPSIZE, 1, y + (i + 1) * COMPSIZE, 1, NULL, 0); +#else + + result = MYDOT(is + min_i - i - 1, a + (i + 1 + i * lda) * COMPSIZE, 1, x + (i + 1) * COMPSIZE, 1); + +#ifndef COMPLEX + *(y + i * COMPSIZE + 0) += result; +#else + *(y + i * COMPSIZE + 0) += CREAL(result); + *(y + i * COMPSIZE + 1) += CIMAG(result); +#endif + +#endif + } +#endif + } + +#ifdef LOWER + if (args -> m > is + min_i){ + MYGEMV(args -> m - is - min_i, min_i, 0, + ONE, +#ifdef COMPLEX + ZERO, +#endif + a + (is + min_i + is * lda) * COMPSIZE, lda, +#ifndef TRANS + x + is * COMPSIZE, 1, + y + (is + min_i) * COMPSIZE, 1, +#else + x + (is + min_i) * COMPSIZE, 1, + y + is * COMPSIZE, 1, +#endif + buffer); + } +#endif + } + + return 0; +} + +#ifndef COMPLEX +int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthreads){ +#else +int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthreads){ +#endif + + blas_arg_t args; + blas_queue_t queue[MAX_CPU_NUMBER]; + BLASLONG range_m[MAX_CPU_NUMBER + 1]; + BLASLONG range_n[MAX_CPU_NUMBER]; + + BLASLONG width, i, num_cpu; + + double dnum; + int mask = 7; + +#ifdef SMP +#ifndef COMPLEX +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_REAL; +#else + int mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + int mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif +#endif + + args.m = m; + + args.a = (void *)a; + args.b = (void *)x; + args.c = (void *)(buffer); + + args.lda = lda; + args.ldb = incx; + args.ldc = incx; + + dnum = (double)m * (double)m / (double)nthreads; + num_cpu = 0; + +#ifndef LOWER + + range_m[MAX_CPU_NUMBER] = m; + i = 0; + + while (i < m){ + + if (nthreads - num_cpu > 1) { + + double di = (double)(m - i); + if (di * di - dnum > 0) { + width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; + } else { + width = m - i; + } + + if (width < 16) width = 16; + if (width > m - i) width = m - i; + + } else { + width = m - i; + } + + range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; + range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = trmv_kernel; + queue[num_cpu].args = &args; + 
queue[num_cpu].range_m = &range_m[MAX_CPU_NUMBER - num_cpu - 1]; + queue[num_cpu].range_n = &range_n[num_cpu]; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + +#else + + range_m[0] = 0; + i = 0; + + while (i < m){ + + if (nthreads - num_cpu > 1) { + + double di = (double)(m - i); + if (di * di - dnum > 0) { + width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask; + } else { + width = m - i; + } + + if (width < 16) width = 16; + if (width > m - i) width = m - i; + + } else { + width = m - i; + } + + range_m[num_cpu + 1] = range_m[num_cpu] + width; + range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = trmv_kernel; + queue[num_cpu].args = &args; + queue[num_cpu].range_m = &range_m[num_cpu]; + queue[num_cpu].range_n = &range_n[num_cpu]; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + +#endif + + if (num_cpu) { + queue[0].sa = NULL; + queue[0].sb = buffer + num_cpu * (((m + 255) & ~255) + 16) * COMPSIZE; + + queue[num_cpu - 1].next = NULL; + + exec_blas(num_cpu, queue); + } + +#ifndef TRANS + for (i = 1; i < num_cpu; i ++) { + +#ifndef LOWER + + AXPYU_K(range_m[MAX_CPU_NUMBER - i], 0, 0, ONE, +#ifdef COMPLEX + ZERO, +#endif + buffer + range_n[i] * COMPSIZE, 1, buffer, 1, NULL, 0); + +#else + + AXPYU_K(m - range_m[i], 0, 0, ONE, +#ifdef COMPLEX + ZERO, +#endif + buffer + (range_n[i] + range_m[i]) * COMPSIZE, 1, buffer + range_m[i] * COMPSIZE, 1, NULL, 0); + +#endif + + } +#endif + + COPY_K(m, buffer, 1, x, incx); + + return 0; +} diff --git a/driver/level2/trsv_L.c b/driver/level2/trsv_L.c new file mode 100644 index 0000000000..44bcfe3982 --- /dev/null +++ b/driver/level2/trsv_L.c @@ -0,0 +1,109 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
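The threaded drivers (trmv_thread.c just above, and the tpmv/tbmv variants before it) split the triangle by giving each worker a band of rows whose share of the total work, which grows quadratically with the number of remaining rows, is roughly n*n/nthreads; widths are rounded up to a multiple of 8 (mask = 7) and clamped to at least 16 rows. A sketch of that width computation, factored into a helper with illustrative names:

    #include <math.h>

    static long next_width(long n, long remaining, int nthreads)
    {
        double dnum = (double)n * (double)n / (double)nthreads; /* target work per thread */
        double di   = (double)remaining;
        long   width;

        if (di * di > dnum)
            width = ((long)(di - sqrt(di * di - dnum)) + 7) & ~7L; /* round up to 8 */
        else
            width = remaining;

        if (width < 16) width = 16;               /* keep slices worth dispatching */
        if (width > remaining) width = remaining; /* the driver also hands the last
                                                     thread whatever rows remain */
        return width;
    }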
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +const static FLOAT dm1 = -1.; + +#undef GEMV_UNROLL +#define GEMV_UNROLL DTB_ENTRIES + +int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ + + BLASLONG i, is, min_i; + FLOAT *gemvbuffer = (FLOAT *)buffer; + FLOAT *B = b; + + if (incb != 1) { + B = buffer; + gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) + 4095) & ~4095); + COPY_K(m, b, incb, buffer, 1); + } + + for (is = 0; is < m; is += GEMV_UNROLL){ + + min_i = MIN(m - is, GEMV_UNROLL); + +#ifdef TRANSA + if (is > 0){ + GEMV_T(is, min_i, 0, dm1, + a + is * lda , lda, + B, 1, + B + is, 1, gemvbuffer); + } +#endif + + for (i = 0; i < min_i; i++) { + FLOAT *AA = a + is + (i + is) * lda; + FLOAT *BB = B + is; + +#ifdef TRANSA + if (i > 0) BB[i] -= DOTU_K(i, AA, 1, BB, 1); +#endif + +#ifndef UNIT + BB[i] /= AA[i]; +#endif + +#ifndef TRANSA + if (i < min_i - 1) { + AXPYU_K(min_i - i - 1 , 0, 0, - BB[i], + AA + i + 1, 1, BB + i + 1, 1, NULL, 0); + } +#endif + } + +#ifndef TRANSA + if (m - is > min_i){ + GEMV_N(m - is - min_i, min_i, 0, dm1, + a + is + min_i + is * lda, lda, + B + is, 1, + B + (is + min_i), 1, gemvbuffer); + } +#endif + + } + + if (incb != 1) { + COPY_K(m, buffer, 1, b, incb); + } + + return 0; +} diff --git a/driver/level2/trsv_U.c b/driver/level2/trsv_U.c new file mode 100644 index 0000000000..f02512bbbe --- /dev/null +++ b/driver/level2/trsv_U.c @@ -0,0 +1,104 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
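trsv_L.c applies the same panel idea to the solve: the diagonal block is solved with the scalar/AXPYU_K loop, and a single GEMV_N with coefficient -1 (dm1) subtracts that block's contribution from everything below it. A dense reference of the non-transposed blocking, with PANEL standing in for the driver's GEMV_UNROLL/DTB_ENTRIES panel size and names chosen for illustration:

    #define PANEL 64   /* stand-in for GEMV_UNROLL / DTB_ENTRIES; illustrative */

    static void trsv_lower_blocked(int m, const double *a, int lda, double *b)
    {
        for (int is = 0; is < m; is += PANEL) {
            int nb = (m - is < PANEL) ? m - is : PANEL;

            /* Forward-substitute the nb x nb diagonal block in place. */
            for (int j = is; j < is + nb; j++) {
                b[j] /= a[j + j * lda];
                for (int i = j + 1; i < is + nb; i++)
                    b[i] -= a[i + j * lda] * b[j];
            }

            /* One GEMV_N with -1 updates all rows below the block:
               b[is+nb..m-1] -= A(is+nb.., is..is+nb-1) * b[is..is+nb-1]. */
            for (int j = is; j < is + nb; j++)
                for (int i = is + nb; i < m; i++)
                    b[i] -= a[i + j * lda] * b[j];
        }
    }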
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +const static FLOAT dm1 = -1.; + +int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ + + BLASLONG i, is, min_i; + FLOAT *gemvbuffer = (FLOAT *)buffer; + FLOAT *B = b; + + if (incb != 1) { + B = buffer; + gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) + 4095) & ~4095); + COPY_K(m, b, incb, buffer, 1); + } + + for (is = m; is > 0; is -= DTB_ENTRIES){ + + min_i = MIN(is, DTB_ENTRIES); + +#ifdef TRANSA + if (m - is > 0){ + GEMV_T(m - is, min_i, 0, dm1, + a + is + (is - min_i) * lda, lda, + B + is, 1, + B + is - min_i, 1, gemvbuffer); + } +#endif + + for (i = 0; i < min_i; i++) { + FLOAT *AA = a + (is - i - 1) + (is - i - 1) * lda; + FLOAT *BB = B + (is - i - 1); + +#ifdef TRANSA + if (i > 0) BB[0] -= DOTU_K(i, AA + 1, 1, BB + 1, 1); +#endif + +#ifndef UNIT + BB[0] /= AA[0]; +#endif + +#ifndef TRANSA + if (i < min_i - 1) AXPYU_K(min_i - i - 1, 0, 0, -BB[0], AA - (min_i - i - 1), 1, BB - (min_i - i - 1), 1, NULL, 0); +#endif + } + +#ifndef TRANSA + if (is - min_i > 0){ + GEMV_N(is - min_i, min_i, 0, dm1, + a + (is - min_i) * lda, lda, + B + is - min_i, 1, + B, 1, gemvbuffer); + } +#endif + + } + + if (incb != 1) { + COPY_K(m, buffer, 1, b, incb); + } + + return 0; +} + diff --git a/driver/level2/zgbmv_k.c b/driver/level2/zgbmv_k.c new file mode 100644 index 0000000000..7832a7ea5a --- /dev/null +++ b/driver/level2/zgbmv_k.c @@ -0,0 +1,145 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +#ifndef XCONJ +#ifndef CONJ +#define ZAXPY AXPYU_K +#define ZDOT DOTU_K +#else +#define ZAXPY AXPYC_K +#define ZDOT DOTC_K +#endif +#else +#ifndef CONJ +#define ZAXPY AXPYU_K +#define ZDOT DOTC_K +#else +#define ZAXPY AXPYC_K +#define ZDOT DOTU_K +#endif +#endif + +#ifndef TRANS +#define M m +#define N n +#else +#define N m +#define M n +#endif + +void CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *a, BLASLONG lda, + FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, void *buffer){ + + BLASLONG i, offset_u, offset_l, start, end, length; + FLOAT *X = x; + FLOAT *Y = y; + FLOAT *gemvbuffer = (FLOAT *)buffer; + FLOAT *bufferY = gemvbuffer; + FLOAT *bufferX = gemvbuffer; +#ifdef TRANS + FLOAT _Complex temp; +#endif + + if (incy != 1) { + Y = bufferY; + bufferX = (FLOAT *)(((BLASLONG)bufferY + M * sizeof(FLOAT) * 2 + 4095) & ~4095); + gemvbuffer = bufferX; + COPY_K(M, y, incy, Y, 1); + } + + if (incx != 1) { + X = bufferX; + gemvbuffer = (FLOAT *)(((BLASLONG)bufferX + N * sizeof(FLOAT) * 2 + 4095) & ~4095); + COPY_K(N, x, incx, X, 1); + } + + offset_u = ku; + offset_l = ku + m; + + for (i = 0; i < MIN(n, m + ku); i++) { + + start = MAX(offset_u, 0); + end = MIN(offset_l, ku + kl + 1); + + length = end - start; + +#ifndef TRANS + ZAXPY(length, 0, 0, +#ifndef XCONJ + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1], +#else + alpha_r * X[i * 2 + 0] + alpha_i * X[i * 2 + 1], + alpha_i * X[i * 2 + 0] - alpha_r * X[i * 2 + 1], +#endif + a + start * 2, 1, Y + (start - offset_u) * 2, 1, NULL, 0); +#else + +#ifndef XCONJ + temp = ZDOT(length, a + start * 2, 1, X + (start - offset_u) * 2, 1); +#else + temp = ZDOT(length, X + (start - offset_u) * 2, 1, a + start * 2, 1); +#endif + +#if !defined(XCONJ) || !defined(CONJ) + Y[i * 2 + 0] += alpha_r * CREAL(temp) - alpha_i * CIMAG(temp); + Y[i * 2 + 1] += alpha_i * CREAL(temp) + alpha_r * CIMAG(temp); +#else + Y[i * 2 + 0] += alpha_r * CREAL(temp) + alpha_i * CIMAG(temp); + Y[i * 2 + 1] += alpha_i * CREAL(temp) - alpha_r * CIMAG(temp); +#endif +#endif + + offset_u --; + offset_l --; + + a += lda * 2; + } + + if (incy != 1) { + COPY_K(M, Y, 1, y, incy); + } + + return; +} + diff --git a/driver/level2/zhbmv_k.c b/driver/level2/zhbmv_k.c new file mode 100644 index 0000000000..8771942d06 --- /dev/null +++ b/driver/level2/zhbmv_k.c @@ -0,0 +1,189 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
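zgbmv_k.c walks the band storage with two running offsets (offset_u starts at ku, offset_l at ku + m, both decreasing per column) to find which slots of the current band column actually fall inside the matrix. The equivalent closed-form clipping, written out as a small helper with illustrative names:

    /* Band column i stores rows i-ku .. i+kl of A in slots 0 .. ku+kl. */
    static void band_column_segment(long m, long ku, long kl, long i,
                                    long *start, long *len, long *first_row)
    {
        long s = ku - i;                    /* first slot still inside the matrix */
        if (s < 0) s = 0;

        long e = ku + m - i;                /* one past the last in-range slot */
        if (e > ku + kl + 1) e = ku + kl + 1;

        *start     = s;
        *len       = e - s;                 /* can be <= 0 past the matrix edge */
        *first_row = i - ku + s;            /* row of A (and of y) for slot s */
    }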
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +int CNAME(BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *a, BLASLONG lda, + FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, void *buffer){ + + BLASLONG i, length; +#ifndef LOWER + BLASLONG offset; +#endif + + FLOAT *X = x; + FLOAT *Y = y; + FLOAT *sbmvbuffer = (FLOAT *)buffer; + FLOAT *bufferY = sbmvbuffer; + FLOAT *bufferX = sbmvbuffer; + FLOAT temp[2]; + + if (incy != 1) { + Y = bufferY; + bufferX = (FLOAT *)(((BLASLONG)bufferY + n * sizeof(FLOAT) * COMPSIZE + 4095) & ~4095); + sbmvbuffer = bufferX; + COPY_K(n, y, incy, Y, 1); + } + + if (incx != 1) { + X = bufferX; + sbmvbuffer = (FLOAT *)(((BLASLONG)bufferX + n * sizeof(FLOAT) * COMPSIZE + 4095) & ~4095); + COPY_K(n, x, incx, X, 1); + } + +#ifndef LOWER + offset = k; +#endif + + for (i = 0; i < n; i++) { + +#ifndef HEMVREV +#ifndef LOWER + length = k - offset; + + if (length > 0) { + AXPYU_K(length, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], + a + offset * COMPSIZE, 1, Y + (i - length) * COMPSIZE, 1, NULL, 0); + } + + temp[0] = a[k * 2 + 0] * X[i * 2 + 0]; + temp[1] = a[k * 2 + 0] * X[i * 2 + 1]; + + Y[i * 2 + 0] += alpha_r * temp[0] - alpha_i * temp[1]; + Y[i * 2 + 1] += alpha_r * temp[1] + alpha_i * temp[0]; + + if (length > 0) { + FLOAT _Complex result = DOTC_K(length, a + offset * COMPSIZE, 1, X + (i - length) * COMPSIZE, 1); + + Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); + Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); + } + + if (offset > 0) offset --; +#else + length = k; + if (n - i - 1 < k) length = n - i - 1; + + if (length > 0) { + AXPYU_K(length, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], + a + COMPSIZE, 1, Y + (i + 1) * COMPSIZE, 1, NULL, 0); + } + + temp[0] = a[0] * X[i * 2 + 0]; + temp[1] = a[0] * X[i * 2 + 1]; + + Y[i * 2 + 0] += alpha_r * temp[0] - alpha_i * temp[1]; + Y[i * 2 + 1] += alpha_r * temp[1] + alpha_i * temp[0]; + + if (length > 0) { + FLOAT _Complex result = DOTC_K(length, a + COMPSIZE, 1, X + (i + 1) * COMPSIZE, 1); + + Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); + Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); + } +#endif +#else +#ifndef LOWER 
+ + length = k - offset; + + if (length > 0) { + AXPYC_K(length, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], + a + offset * COMPSIZE, 1, Y + (i - length) * COMPSIZE, 1, NULL, 0); + } + + temp[0] = a[k * 2 + 0] * X[i * 2 + 0]; + temp[1] = a[k * 2 + 0] * X[i * 2 + 1]; + + Y[i * 2 + 0] += alpha_r * temp[0] - alpha_i * temp[1]; + Y[i * 2 + 1] += alpha_r * temp[1] + alpha_i * temp[0]; + + if (length > 0) { + FLOAT _Complex result = DOTU_K(length, a + offset * COMPSIZE, 1, X + (i - length) * COMPSIZE, 1); + + Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); + Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); + } + + if (offset > 0) offset --; +#else + length = k; + if (n - i - 1 < k) length = n - i - 1; + + if (length > 0) { + AXPYC_K(length, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], + a + COMPSIZE, 1, Y + (i + 1) * COMPSIZE, 1, NULL, 0); + } + + temp[0] = a[0] * X[i * 2 + 0]; + temp[1] = a[0] * X[i * 2 + 1]; + + Y[i * 2 + 0] += alpha_r * temp[0] - alpha_i * temp[1]; + Y[i * 2 + 1] += alpha_r * temp[1] + alpha_i * temp[0]; + + if (length > 0) { + FLOAT _Complex result = DOTU_K(length, a + COMPSIZE, 1, X + (i + 1) * COMPSIZE, 1); + + Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); + Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); + } +#endif +#endif + + + a += lda * 2; + } + + if (incy != 1) { + COPY_K(n, Y, 1, y, incy); + } + + return 0; +} + diff --git a/driver/level2/zher2_k.c b/driver/level2/zher2_k.c new file mode 100644 index 0000000000..3e924582f1 --- /dev/null +++ b/driver/level2/zher2_k.c @@ -0,0 +1,120 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
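[Editor's note, not part of the imported patch] The zhbmv_k.c driver above, like most of the complex level-2 drivers that follow, copies strided x and y vectors into one scratch buffer and starts each sub-buffer on the next 4096-byte boundary using the ((ptr + size + 4095) & ~4095) idiom. The stand-alone sketch below only illustrates that alignment trick; align_up_4k, workspace, n and compsize are made-up names, not the library's API.

#include <stdint.h>
#include <stdio.h>

/* Round a pointer up to the next 4096-byte (page) boundary, mirroring the
   "(... + 4095) & ~4095" expression used to carve per-vector scratch areas
   out of a single workspace buffer. */
static void *align_up_4k(void *p) {
    return (void *)(((uintptr_t)p + 4095) & ~(uintptr_t)4095);
}

int main(void) {
    unsigned char workspace[3 * 4096];
    size_t n = 100, compsize = 2;              /* n complex doubles per vector */
    unsigned char *bufferY = workspace;        /* copy of y goes first */
    unsigned char *bufferX =                   /* copy of x starts on a page boundary */
        align_up_4k(bufferY + n * sizeof(double) * compsize);
    printf("Y at %p, X at %p\n", (void *)bufferY, (void *)bufferX);
    return 0;
}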
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *x, BLASLONG incx, + FLOAT *y, BLASLONG incy, FLOAT *a, BLASLONG lda, FLOAT *buffer){ + + BLASLONG i; + FLOAT *X, *Y; + + X = x; + Y = y; + + lda *= 2; + + if (incx != 1) { + COPY_K(m, x, incx, buffer, 1); + X = buffer; + } + + if (incy != 1) { + COPY_K(m, y, incy, (FLOAT *)((BLASLONG)buffer + (BUFFER_SIZE / 2)), 1); + Y = (FLOAT *)((BLASLONG)buffer + (BUFFER_SIZE / 2)); + } + + for (i = 0; i < m; i++){ +#ifndef HEMVREV +#ifndef LOWER + AXPYU_K(i + 1, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + - alpha_i * X[i * 2 + 0] - alpha_r * X[i * 2 + 1], + Y, 1, a, 1, NULL, 0); + AXPYU_K(i + 1, 0, 0, + alpha_r * Y[i * 2 + 0] + alpha_i * Y[i * 2 + 1], + alpha_i * Y[i * 2 + 0] - alpha_r * Y[i * 2 + 1], + X, 1, a, 1, NULL, 0); + a[i * 2 + 1] = ZERO; + a += lda; +#else + AXPYU_K(m - i, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + - alpha_i * X[i * 2 + 0] - alpha_r * X[i * 2 + 1], + Y + i * 2, 1, a, 1, NULL, 0); + AXPYU_K(m - i, 0, 0, + alpha_r * Y[i * 2 + 0] + alpha_i * Y[i * 2 + 1], + alpha_i * Y[i * 2 + 0] - alpha_r * Y[i * 2 + 1], + X + i * 2, 1, a, 1, NULL, 0); + a[1] = ZERO; + a += 2 + lda; +#endif +#else +#ifndef LOWER + AXPYC_K(i + 1, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1], + Y, 1, a, 1, NULL, 0); + AXPYC_K(i + 1, 0, 0, + alpha_r * Y[i * 2 + 0] + alpha_i * Y[i * 2 + 1], + - alpha_i * Y[i * 2 + 0] + alpha_r * Y[i * 2 + 1], + X, 1, a, 1, NULL, 0); + a[i * 2 + 1] = ZERO; + a += lda; +#else + AXPYC_K(m - i, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1], + Y + i * 2, 1, a, 1, NULL, 0); + AXPYC_K(m - i, 0, 0, + alpha_r * Y[i * 2 + 0] + alpha_i * Y[i * 2 + 1], + - alpha_i * Y[i * 2 + 0] + alpha_r * Y[i * 2 + 1], + X + i * 2, 1, a, 1, NULL, 0); + a[1] = ZERO; + a += 2 + lda; +#endif +#endif + + } + + + return 0; +} diff --git a/driver/level2/zher_k.c b/driver/level2/zher_k.c new file mode 100644 index 0000000000..772034f1da --- /dev/null +++ b/driver/level2/zher_k.c @@ -0,0 +1,80 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
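[Editor's note, not part of the imported patch] zher2_k.c above realizes the BLAS Hermitian rank-2 update A := alpha*x*y^H + conj(alpha)*y*x^H + A one column at a time as a pair of AXPYU_K/AXPYC_K calls, then zeroes the imaginary part of the diagonal entry. For reference, a minimal dense sketch of the same update on the upper triangle is given below; her2_upper and the 2x2 data in main are illustrative only, not library code.

#include <complex.h>
#include <stdio.h>

/* A := alpha*x*y^H + conj(alpha)*y*x^H + A, upper triangle, column major. */
static void her2_upper(int n, double complex alpha,
                       const double complex *x, const double complex *y,
                       double complex *a, int lda) {
    for (int j = 0; j < n; j++) {
        double complex t1 = alpha * conj(y[j]);
        double complex t2 = conj(alpha) * conj(x[j]);
        for (int i = 0; i <= j; i++)
            a[i + j * lda] += x[i] * t1 + y[i] * t2;
        /* the diagonal of a Hermitian matrix is real, as the driver enforces */
        a[j + j * lda] = creal(a[j + j * lda]);
    }
}

int main(void) {
    double complex a[4] = {1, 0, 0, 2}, x[2] = {1 + I, 2}, y[2] = {3, 1 - I};
    her2_upper(2, 0.5 + 0.25 * I, x, y, a, 2);
    printf("a(0,1) = %g%+gi\n", creal(a[2]), cimag(a[2]));
    return 0;
}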
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG m, FLOAT alpha, FLOAT *x, + BLASLONG incx, FLOAT *a, BLASLONG lda, FLOAT *buffer){ + + BLASLONG i; + FLOAT *X; + + X = x; + lda *= 2; + + if (incx != 1) { + COPY_K(m, x, incx, buffer, 1); + X = buffer; + } + + for (i = 0; i < m; i++){ +#ifndef HEMVREV +#ifndef LOWER + AXPYU_K(i + 1, 0, 0, alpha * X[i * 2 + 0], -alpha * X[i * 2 + 1], X, 1, a, 1, NULL, 0); + a[i * 2 + 1] = ZERO; + a += lda; +#else + AXPYU_K(m - i, 0, 0, alpha * X[i * 2 + 0], -alpha * X[i * 2 + 1], X + i * 2, 1, a, 1, NULL, 0); + a[1] = ZERO; + a += 2 + lda; +#endif +#else +#ifndef LOWER + AXPYC_K(i + 1, 0, 0, alpha * X[i * 2 + 0], alpha * X[i * 2 + 1], X, 1, a, 1, NULL, 0); + a[i * 2 + 1] = ZERO; + a += lda; +#else + AXPYC_K(m - i, 0, 0, alpha * X[i * 2 + 0], alpha * X[i * 2 + 1], X + i * 2, 1, a, 1, NULL, 0); + a[1] = ZERO; + a += 2 + lda; +#endif +#endif + } + + return 0; +} diff --git a/driver/level2/zhpmv_k.c b/driver/level2/zhpmv_k.c new file mode 100644 index 0000000000..5f95ce7bdf --- /dev/null +++ b/driver/level2/zhpmv_k.c @@ -0,0 +1,177 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, void *buffer){ + + BLASLONG i; + FLOAT *X = x; + FLOAT *Y = y; + FLOAT *gemvbuffer = (FLOAT *)buffer; + FLOAT *bufferY = gemvbuffer; + FLOAT *bufferX = gemvbuffer; + FLOAT temp[2]; + + if (incy != 1) { + Y = bufferY; + bufferX = (FLOAT *)(((BLASLONG)bufferY + m * sizeof(FLOAT) * 2 + 4095) & ~4095); + gemvbuffer = bufferX; + COPY_K(m, y, incy, Y, 1); + } + + if (incx != 1) { + X = bufferX; + gemvbuffer = (FLOAT *)(((BLASLONG)bufferX + m * sizeof(FLOAT) * 2 + 4095) & ~4095); + COPY_K(m, x, incx, X, 1); + } + + for (i = 0; i < m; i++) { + +#ifndef HEMVREV +#ifndef LOWER + if (i > 0) { + FLOAT _Complex result = DOTC_K(i, a, 1, X, 1); + + Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); + Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); + } + + temp[0] = a[i * 2 + 0] * X[i * 2 + 0]; + temp[1] = a[i * 2 + 0] * X[i * 2 + 1]; + + Y[i * 2 + 0] += alpha_r * temp[0] - alpha_i * temp[1]; + Y[i * 2 + 1] += alpha_r * temp[1] + alpha_i * temp[0]; + + if (i > 0) { + AXPYU_K(i, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], + a, 1, Y, 1, NULL, 0); + } + + a += (i + 1) * 2; + +#else + + if (m - i > 1) { + FLOAT _Complex result = DOTC_K(m - i - 1, a + (i + 1) * 2, 1, X + (i + 1) * 2, 1); + + Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); + Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); + } + + temp[0] = a[i * 2 + 0] * X[i * 2 + 0]; + temp[1] = a[i * 2 + 0] * X[i * 2 + 1]; + + Y[i * 2 + 0] += alpha_r * temp[0] - alpha_i * temp[1]; + Y[i * 2 + 1] += alpha_r * temp[1] + alpha_i * temp[0]; + + if (m - i > 1) { + AXPYU_K(m - i - 1, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], + a + (i + 1) * 2, 1, Y + (i + 1) * 2, 1, NULL, 0); + } + + a += (m - i - 1) * 2; + +#endif +#else +#ifndef LOWER + if (i > 0) { + FLOAT _Complex result = DOTU_K(i, a, 1, X, 1); + + Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); + Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); + } + + temp[0] = a[i * 2 + 0] * X[i * 2 + 0]; + temp[1] = a[i * 2 + 0] * X[i * 2 + 1]; + + Y[i * 2 + 0] += alpha_r * temp[0] - alpha_i * temp[1]; + Y[i * 2 + 1] += alpha_r * temp[1] + alpha_i * temp[0]; + + if (i > 0) { + AXPYC_K(i, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], + a, 1, Y, 1, NULL, 0); + 
} + + a += (i + 1) * 2; + +#else + + if (m - i > 1) { + FLOAT _Complex result = DOTU_K(m - i - 1, a + (i + 1) * 2, 1, X + (i + 1) * 2, 1); + + Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); + Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); + } + + temp[0] = a[i * 2 + 0] * X[i * 2 + 0]; + temp[1] = a[i * 2 + 0] * X[i * 2 + 1]; + + Y[i * 2 + 0] += alpha_r * temp[0] - alpha_i * temp[1]; + Y[i * 2 + 1] += alpha_r * temp[1] + alpha_i * temp[0]; + + if (m - i > 1) { + AXPYC_K(m - i - 1, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], + a + (i + 1) * 2, 1, Y + (i + 1) * 2, 1, NULL, 0); + } + + a += (m - i - 1) * 2; + +#endif +#endif + + } + + if (incy != 1) { + COPY_K(m, Y, 1, y, incy); + } + + return 0; +} + diff --git a/driver/level2/zhpr2_k.c b/driver/level2/zhpr2_k.c new file mode 100644 index 0000000000..f4608ff9dd --- /dev/null +++ b/driver/level2/zhpr2_k.c @@ -0,0 +1,117 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
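[Editor's note, not part of the imported patch] zhpmv_k.c above computes y += alpha*A*x for a Hermitian matrix in packed storage by combining, per column, a DOTC_K/DOTU_K over the off-diagonal part, a diagonal term built from the real part a[i * 2 + 0] only, and an AXPYU_K/AXPYC_K of the scaled column. A compact reference version of the same operation for upper packed storage is sketched below under the assumption of double complex data; hpmv_upper and the values in main are illustrative names and test data, not the library's kernels.

#include <complex.h>
#include <stdio.h>

/* y += alpha*A*x, A Hermitian in packed upper storage:
   column j occupies ap[j*(j+1)/2 .. j*(j+1)/2 + j]. */
static void hpmv_upper(int n, double complex alpha,
                       const double complex *ap,
                       const double complex *x, double complex *y) {
    const double complex *col = ap;
    for (int j = 0; j < n; j++) {
        double complex sum = 0;
        /* strictly-upper entries: a(i,j) feeds row i, conj(a(i,j)) feeds row j */
        for (int i = 0; i < j; i++) {
            y[i] += alpha * col[i] * x[j];
            sum  += conj(col[i]) * x[i];
        }
        /* the diagonal is treated as real, as in the driver */
        y[j] += alpha * (creal(col[j]) * x[j] + sum);
        col += j + 1;
    }
}

int main(void) {
    double complex ap[3] = {2, 1 + I, 3};   /* packed upper of [[2, 1+i], [1-i, 3]] */
    double complex x[2] = {1, I}, y[2] = {0, 0};
    hpmv_upper(2, 1.0, ap, x, y);
    printf("y = (%g%+gi, %g%+gi)\n", creal(y[0]), cimag(y[0]), creal(y[1]), cimag(y[1]));
    return 0;
}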
*/ +/*********************************************************************/ + +#include +#include +#include "common.h" + +int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *x, BLASLONG incx, + FLOAT *y, BLASLONG incy, FLOAT *a, FLOAT *buffer){ + + BLASLONG i; + FLOAT *X, *Y; + + X = x; + Y = y; + + if (incx != 1) { + COPY_K(m, x, incx, buffer, 1); + X = buffer; + } + + if (incy != 1) { + COPY_K(m, y, incy, (FLOAT *)((BLASLONG)buffer + (BUFFER_SIZE / 2)), 1); + Y = (FLOAT *)((BLASLONG)buffer + (BUFFER_SIZE / 2)); + } + + for (i = 0; i < m; i++){ +#ifndef HEMVREV +#ifndef LOWER + AXPYU_K(i + 1, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + - alpha_i * X[i * 2 + 0] - alpha_r * X[i * 2 + 1], + Y, 1, a, 1, NULL, 0); + AXPYU_K(i + 1, 0, 0, + alpha_r * Y[i * 2 + 0] + alpha_i * Y[i * 2 + 1], + alpha_i * Y[i * 2 + 0] - alpha_r * Y[i * 2 + 1], + X, 1, a, 1, NULL, 0); + a[i * 2 + 1] = ZERO; + a += (i + 1) * 2; +#else + AXPYU_K(m - i, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + - alpha_i * X[i * 2 + 0] - alpha_r * X[i * 2 + 1], + Y + i * 2, 1, a, 1, NULL, 0); + AXPYU_K(m - i, 0, 0, + alpha_r * Y[i * 2 + 0] + alpha_i * Y[i * 2 + 1], + alpha_i * Y[i * 2 + 0] - alpha_r * Y[i * 2 + 1], + X + i * 2, 1, a, 1, NULL, 0); + a[1] = ZERO; + a += (m - i) * 2; +#endif +#else +#ifndef LOWER + AXPYC_K(i + 1, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1], + Y, 1, a, 1, NULL, 0); + AXPYC_K(i + 1, 0, 0, + alpha_r * Y[i * 2 + 0] + alpha_i * Y[i * 2 + 1], + - alpha_i * Y[i * 2 + 0] + alpha_r * Y[i * 2 + 1], + X, 1, a, 1, NULL, 0); + a[i * 2 + 1] = ZERO; + a += (i + 1) * 2; +#else + AXPYC_K(m - i, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1], + Y + i * 2, 1, a, 1, NULL, 0); + AXPYC_K(m - i, 0, 0, + alpha_r * Y[i * 2 + 0] + alpha_i * Y[i * 2 + 1], + - alpha_i * Y[i * 2 + 0] + alpha_r * Y[i * 2 + 1], + X + i * 2, 1, a, 1, NULL, 0); + a[1] = ZERO; + a += (m - i) * 2; +#endif +#endif + } + + + return 0; +} diff --git a/driver/level2/zhpr_k.c b/driver/level2/zhpr_k.c new file mode 100644 index 0000000000..c564d499e7 --- /dev/null +++ b/driver/level2/zhpr_k.c @@ -0,0 +1,79 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG m, FLOAT alpha, FLOAT *x, + BLASLONG incx, FLOAT *a, FLOAT *buffer){ + + BLASLONG i; + FLOAT *X; + + X = x; + + if (incx != 1) { + COPY_K(m, x, incx, buffer, 1); + X = buffer; + } + + for (i = 0; i < m; i++){ +#ifndef HEMVREV +#ifndef LOWER + AXPYU_K(i + 1, 0, 0, alpha * X[i * 2 + 0], -alpha * X[i * 2 + 1], X, 1, a, 1, NULL, 0); + a[i * 2 + 1] = ZERO; + a += (i + 1) * 2; +#else + AXPYU_K(m - i, 0, 0, alpha * X[i * 2 + 0], -alpha * X[i * 2 + 1], X + i * 2, 1, a, 1, NULL, 0); + a[1] = ZERO; + a += (m - i) * 2; +#endif +#else +#ifndef LOWER + AXPYC_K(i + 1, 0, 0, alpha * X[i * 2 + 0], alpha * X[i * 2 + 1], X, 1, a, 1, NULL, 0); + a[i * 2 + 1] = ZERO; + a += (i + 1) * 2; +#else + AXPYC_K(m - i, 0, 0, alpha * X[i * 2 + 0], alpha * X[i * 2 + 1], X + i * 2, 1, a, 1, NULL, 0); + a[1] = ZERO; + a += (m - i) * 2; +#endif +#endif + } + + return 0; +} diff --git a/driver/level2/zsbmv_k.c b/driver/level2/zsbmv_k.c new file mode 100644 index 0000000000..de5dfdde22 --- /dev/null +++ b/driver/level2/zsbmv_k.c @@ -0,0 +1,119 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
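[Editor's note, not part of the imported patch] The packed drivers above (zhpmv_k.c, zhpr2_k.c, zhpr_k.c) never compute a column offset explicitly: they advance the matrix pointer by (i + 1) * 2 FLOATs per column in the upper case and (m - i) * 2 in the lower case. The small self-check below illustrates that this incremental walk matches the usual closed-form packed-storage offsets; all identifiers here are made up for the example.

#include <assert.h>
#include <stdio.h>

int main(void) {
    int n = 7;
    long upper = 0, lower = 0;
    for (int j = 0; j < n; j++) {
        assert(upper == (long)j * (j + 1) / 2);               /* start of upper column j */
        assert(lower == (long)j * n - (long)j * (j - 1) / 2); /* start of lower column j */
        upper += j + 1;   /* matches "a += (i + 1) * 2" (two FLOATs per complex entry) */
        lower += n - j;   /* matches "a += (m - i) * 2" */
    }
    printf("both walks cover %ld = n(n+1)/2 = %d elements\n", upper, n * (n + 1) / 2);
    return 0;
}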
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +int CNAME(BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *a, BLASLONG lda, + FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, void *buffer){ + + BLASLONG i, length; +#ifndef LOWER + BLASLONG offset; +#endif + + FLOAT *X = x; + FLOAT *Y = y; + FLOAT *sbmvbuffer = (FLOAT *)buffer; + FLOAT *bufferY = sbmvbuffer; + FLOAT *bufferX = sbmvbuffer; + + if (incy != 1) { + Y = bufferY; + bufferX = (FLOAT *)(((BLASLONG)bufferY + n * sizeof(FLOAT) * COMPSIZE + 4095) & ~4095); + sbmvbuffer = bufferX; + COPY_K(n, y, incy, Y, 1); + } + + if (incx != 1) { + X = bufferX; + sbmvbuffer = (FLOAT *)(((BLASLONG)bufferX + n * sizeof(FLOAT) * COMPSIZE + 4095) & ~4095); + COPY_K(n, x, incx, X, 1); + } + +#ifndef LOWER + offset = k; +#endif + + for (i = 0; i < n; i++) { + +#ifndef LOWER + length = k - offset; + + AXPYU_K(length + 1, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], + a + offset * COMPSIZE, 1, Y + (i - length) * COMPSIZE, 1, NULL, 0); + + if (length > 0) { + FLOAT _Complex result = DOTU_K(length, a + offset * COMPSIZE, 1, X + (i - length) * COMPSIZE, 1); + + Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); + Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); + } + + if (offset > 0) offset --; +#else + length = k; + if (n - i - 1 < k) length = n - i - 1; + + AXPYU_K(length + 1, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], + a, 1, Y + i * COMPSIZE, 1, NULL, 0); + + if (length > 0) { + FLOAT _Complex result = DOTU_K(length, a + COMPSIZE, 1, X + (i + 1) * COMPSIZE, 1); + + Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); + Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); + } +#endif + + a += lda * 2; + } + + if (incy != 1) { + COPY_K(n, Y, 1, y, incy); + } + + return 0; +} + diff --git a/driver/level2/zspmv_k.c b/driver/level2/zspmv_k.c new file mode 100644 index 0000000000..c93b1e17e0 --- /dev/null +++ b/driver/level2/zspmv_k.c @@ -0,0 +1,108 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, void *buffer){ + + BLASLONG i; + FLOAT *X = x; + FLOAT *Y = y; + FLOAT *gemvbuffer = (FLOAT *)buffer; + FLOAT *bufferY = gemvbuffer; + FLOAT *bufferX = gemvbuffer; + FLOAT _Complex result; + + if (incy != 1) { + Y = bufferY; + bufferX = (FLOAT *)(((BLASLONG)bufferY + m * sizeof(FLOAT) * 2 + 4095) & ~4095); + gemvbuffer = bufferX; + COPY_K(m, y, incy, Y, 1); + } + + if (incx != 1) { + X = bufferX; + gemvbuffer = (FLOAT *)(((BLASLONG)bufferX + m * sizeof(FLOAT) * 2 + 4095) & ~4095); + COPY_K(m, x, incx, X, 1); + } + + for (i = 0; i < m; i++) { +#ifndef LOWER + + if (i > 0) { + result = DOTU_K(i, a, 1, X, 1); + + Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); + Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); + } + + AXPYU_K(i + 1, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], + a, 1, Y, 1, NULL, 0); + + a += (i + 1) * 2; + +#else + + result = DOTU_K(m - i, a + i * 2, 1, X + i * 2, 1); + + Y[i * 2 + 0] += alpha_r * CREAL(result) - alpha_i * CIMAG(result); + Y[i * 2 + 1] += alpha_r * CIMAG(result) + alpha_i * CREAL(result); + + if (m - i > 1) + AXPYU_K(m - i - 1, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_r * X[i * 2 + 1] + alpha_i * X[i * 2 + 0], + a + (i + 1) * 2, 1, Y + (i + 1) * 2, 1, NULL, 0); + + a += (m - i - 1) * 2; + +#endif + } + + if (incy != 1) { + COPY_K(m, Y, 1, y, incy); + } + + return 0; +} + diff --git a/driver/level2/zspr2_k.c b/driver/level2/zspr2_k.c new file mode 100644 index 0000000000..48c81a366e --- /dev/null +++ b/driver/level2/zspr2_k.c @@ -0,0 +1,87 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG incx, + FLOAT *y, BLASLONG incy, FLOAT *a, FLOAT *buffer){ + + BLASLONG i; + FLOAT *X, *Y; + + X = x; + Y = y; + + if (incx != 1) { + COPY_K(m, x, incx, buffer, 1); + X = buffer; + } + + if (incy != 1) { + COPY_K(m, y, incy, (FLOAT *)((BLASLONG)buffer + (BUFFER_SIZE / 2)), 1); + Y = (FLOAT *)((BLASLONG)buffer + (BUFFER_SIZE / 2)); + } + + for (i = 0; i < m; i++){ +#ifndef LOWER + AXPYU_K(i + 1, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1], + Y, 1, a, 1, NULL, 0); + AXPYU_K(i + 1, 0, 0, + alpha_r * Y[i * 2 + 0] - alpha_i * Y[i * 2 + 1], + alpha_i * Y[i * 2 + 0] + alpha_r * Y[i * 2 + 1], + X, 1, a, 1, NULL, 0); + a += (i + 1) * 2; +#else + AXPYU_K(m - i, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1], + Y + i * 2, 1, a, 1, NULL, 0); + AXPYU_K(m - i, 0, 0, + alpha_r * Y[i * 2 + 0] - alpha_i * Y[i * 2 + 1], + alpha_i * Y[i * 2 + 0] + alpha_r * Y[i * 2 + 1], + X + i * 2, 1, a, 1, NULL, 0); + a += (m - i) * 2; +#endif + } + + return 0; +} diff --git a/driver/level2/zspr_k.c b/driver/level2/zspr_k.c new file mode 100644 index 0000000000..a187bdbfad --- /dev/null +++ b/driver/level2/zspr_k.c @@ -0,0 +1,75 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *x, BLASLONG incx, FLOAT *a, FLOAT *buffer){ + + BLASLONG i; + FLOAT *X; + + X = x; + + if (incx != 1) { + COPY_K(m, x, incx, buffer, 1); + X = buffer; + } + + for (i = 0; i < m; i++){ +#ifndef LOWER + if ((X[i * 2 + 0] != ZERO) && (X[i * 2 + 1] != ZERO)) { + AXPYU_K(i + 1, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1], + X, 1, a, 1, NULL, 0); + } + a += (i + 1) * 2; +#else + if ((X[i * 2 + 0] != ZERO) && (X[i * 2 + 1] != ZERO)) { + AXPYU_K(m - i, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1], + X + i * 2, 1, a, 1, NULL, 0); + } + a += (m - i) * 2; +#endif + } + + return 0; +} diff --git a/driver/level2/zsyr2_k.c b/driver/level2/zsyr2_k.c new file mode 100644 index 0000000000..f7bbbb2f2e --- /dev/null +++ b/driver/level2/zsyr2_k.c @@ -0,0 +1,89 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG incx, + FLOAT *y, BLASLONG incy, FLOAT *a, BLASLONG lda, FLOAT *buffer){ + + BLASLONG i; + FLOAT *X, *Y; + + X = x; + Y = y; + + lda *= 2; + + if (incx != 1) { + COPY_K(m, x, incx, buffer, 1); + X = buffer; + } + + if (incy != 1) { + COPY_K(m, y, incy, (FLOAT *)((BLASLONG)buffer + (BUFFER_SIZE / 2)), 1); + Y = (FLOAT *)((BLASLONG)buffer + (BUFFER_SIZE / 2)); + } + + for (i = 0; i < m; i++){ +#ifndef LOWER + AXPYU_K(i + 1, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1], + Y, 1, a, 1, NULL, 0); + AXPYU_K(i + 1, 0, 0, + alpha_r * Y[i * 2 + 0] - alpha_i * Y[i * 2 + 1], + alpha_i * Y[i * 2 + 0] + alpha_r * Y[i * 2 + 1], + X, 1, a, 1, NULL, 0); + a += lda; +#else + AXPYU_K(m - i, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1], + Y + i * 2, 1, a, 1, NULL, 0); + AXPYU_K(m - i, 0, 0, + alpha_r * Y[i * 2 + 0] - alpha_i * Y[i * 2 + 1], + alpha_i * Y[i * 2 + 0] + alpha_r * Y[i * 2 + 1], + X + i * 2, 1, a, 1, NULL, 0); + a += 2 + lda; +#endif + } + + return 0; +} diff --git a/driver/level2/zsyr_k.c b/driver/level2/zsyr_k.c new file mode 100644 index 0000000000..9d800d37da --- /dev/null +++ b/driver/level2/zsyr_k.c @@ -0,0 +1,76 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG m, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *x, BLASLONG incx, FLOAT *a, BLASLONG lda, FLOAT *buffer){ + + BLASLONG i; + FLOAT *X; + + X = x; + lda *= 2; + + if (incx != 1) { + COPY_K(m, x, incx, buffer, 1); + X = buffer; + } + + for (i = 0; i < m; i++){ +#ifndef LOWER + if ((X[i * 2 + 0] != ZERO) || (X[i * 2 + 1] != ZERO)) { + AXPYU_K(i + 1, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1], + X, 1, a, 1, NULL, 0); + } + a += lda; +#else + if ((X[i * 2 + 0] != ZERO) || (X[i * 2 + 1] != ZERO)) { + AXPYU_K(m - i, 0, 0, + alpha_r * X[i * 2 + 0] - alpha_i * X[i * 2 + 1], + alpha_i * X[i * 2 + 0] + alpha_r * X[i * 2 + 1], + X + i * 2, 1, a, 1, NULL, 0); + } + a += 2 + lda; +#endif + } + + return 0; +} diff --git a/driver/level2/ztbmv_L.c b/driver/level2/ztbmv_L.c new file mode 100644 index 0000000000..9b604c04f0 --- /dev/null +++ b/driver/level2/ztbmv_L.c @@ -0,0 +1,131 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +const static FLOAT dp1 = 1.; + +int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ + + BLASLONG i; + FLOAT *gemvbuffer = (FLOAT *)buffer; + FLOAT *B = b; + BLASLONG length; +#if (TRANSA == 2) || (TRANSA == 4) + FLOAT _Complex temp; +#endif +#ifndef UNIT + FLOAT atemp1, atemp2, btemp1, btemp2; +#endif + + if (incb != 1) { + B = buffer; + gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE+ 4095) & ~4095); + COPY_K(n, b, incb, buffer, 1); + } + + a += (n - 1) * lda * COMPSIZE; + + for (i = n - 1; i >= 0; i--) { + +#if (TRANSA == 1) || (TRANSA == 3) + length = n - i - 1; + if (length > k) length = k; + + if (length > 0) { +#if TRANSA == 1 + AXPYU_K(length, 0, 0, + B[i * 2 + 0], B[i * 2 + 1], + a + COMPSIZE, 1, B + (i + 1) * COMPSIZE, 1, NULL, 0); +#else + AXPYC_K(length, 0, 0, + B[i * 2 + 0], B[i * 2 + 1], + a + COMPSIZE, 1, B + (i + 1) * COMPSIZE, 1, NULL, 0); +#endif + } +#endif + +#ifndef UNIT +#if (TRANSA == 1) || (TRANSA == 3) + atemp1 = a[0]; + atemp2 = a[1]; +#else + atemp1 = a[k * 2 + 0]; + atemp2 = a[k * 2 + 1]; +#endif + + btemp1 = B[i * 2 + 0]; + btemp2 = B[i * 2 + 1]; + +#if (TRANSA == 1) || (TRANSA == 2) + B[i * 2 + 0] = atemp1 * btemp1 - atemp2 * btemp2; + B[i * 2 + 1] = atemp1 * btemp2 + atemp2 * btemp1; +#else + B[i * 2 + 0] = atemp1 * btemp1 + atemp2 * btemp2; + B[i * 2 + 1] = atemp1 * btemp2 - atemp2 * btemp1; +#endif +#endif + +#if (TRANSA == 2) || (TRANSA == 4) + length = i; + if (length > k) length = k; + + if (length > 0) { +#if TRANSA == 2 + temp = DOTU_K(length, a + (k - length) * COMPSIZE, 1, B + (i - length) * COMPSIZE, 1); +#else + temp = DOTC_K(length, a + (k - length) * COMPSIZE, 1, B + (i - length) * COMPSIZE, 1); +#endif + + B[i * 2 + 0] += CREAL(temp); + B[i * 2 + 1] += CIMAG(temp); + } +#endif + + a -= lda * COMPSIZE; + } + + if (incb != 1) { + COPY_K(n, buffer, 1, b, incb); + } + + return 0; +} + diff --git a/driver/level2/ztbmv_U.c b/driver/level2/ztbmv_U.c new file mode 100644 index 0000000000..4e86f4fb13 --- /dev/null +++ b/driver/level2/ztbmv_U.c @@ -0,0 +1,130 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
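[Editor's note, not part of the imported patch] The banded drivers in this part of the patch (ztbmv_*.c, ztbsv_*.c) walk standard BLAS band storage, where each column stores only its in-band entries: for an upper triangular band with k superdiagonals the diagonal of column j sits at band row k (hence the a[k * 2 + 0] reads), while for a lower band it sits at band row 0 (the a[0] reads). The sketch below packs a small upper banded matrix into that layout; the array names and test values are illustrative only.

#include <stdio.h>

int main(void) {
    int n = 5, k = 2, ldab = k + 1;
    int ab[3 * 5];                       /* band array, column major, ldab = k + 1 */
    for (int x = 0; x < ldab * n; x++) ab[x] = -1;

    /* pack an upper banded matrix A(i,j) = 10*i + j:
       A(i,j) with j-k <= i <= j is stored at ab[(k + i - j) + j*ldab] */
    for (int j = 0; j < n; j++)
        for (int i = (j - k > 0 ? j - k : 0); i <= j; i++)
            ab[(k + i - j) + j * ldab] = 10 * i + j;

    for (int j = 0; j < n; j++)
        printf("diag A(%d,%d) = %d, stored at band row %d of column %d\n",
               j, j, ab[k + j * ldab], k, j);
    return 0;
}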
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +const static FLOAT dp1 = 1.; + +int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ + + BLASLONG i; + FLOAT *gemvbuffer = (FLOAT *)buffer; + FLOAT *B = b; + BLASLONG length; +#if (TRANSA == 2) || (TRANSA == 4) + FLOAT _Complex temp; +#endif +#ifndef UNIT + FLOAT atemp1, atemp2, btemp1, btemp2; +#endif + + if (incb != 1) { + B = buffer; + gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE + 4095) & ~4095); + COPY_K(n, b, incb, buffer, 1); + } + + for (i = 0; i < n; i++) { + +#if (TRANSA == 1) || (TRANSA == 3) + length = i; + if (length > k) length = k; + + if (length > 0) { +#if TRANSA == 1 + AXPYU_K(length, 0, 0, + B[i * 2 + 0], B[i * 2 + 1], + a + (k - length) * COMPSIZE, 1, B + (i - length) * COMPSIZE, 1, NULL, 0); +#else + AXPYC_K(length, 0, 0, + B[i * 2 + 0], B[i * 2 + 1], + a + (k - length) * COMPSIZE, 1, B + (i - length) * COMPSIZE, 1, NULL, 0); +#endif + + } +#endif + +#ifndef UNIT +#if (TRANSA == 1) || (TRANSA == 3) + atemp1 = a[k * 2 + 0]; + atemp2 = a[k * 2 + 1]; +#else + atemp1 = a[0]; + atemp2 = a[1]; +#endif + + btemp1 = B[i * 2 + 0]; + btemp2 = B[i * 2 + 1]; + +#if (TRANSA == 1) || (TRANSA == 2) + B[i * 2 + 0] = atemp1 * btemp1 - atemp2 * btemp2; + B[i * 2 + 1] = atemp1 * btemp2 + atemp2 * btemp1; +#else + B[i * 2 + 0] = atemp1 * btemp1 + atemp2 * btemp2; + B[i * 2 + 1] = atemp1 * btemp2 - atemp2 * btemp1; +#endif +#endif + +#if (TRANSA == 2) || (TRANSA == 4) + length = n - i - 1; + if (length > k) length = k; + + if (length > 0) { +#if TRANSA == 2 + temp = DOTU_K(length, a + COMPSIZE, 1, B + (i + 1) * COMPSIZE, 1); +#else + temp = DOTC_K(length, a + COMPSIZE, 1, B + (i + 1) * COMPSIZE, 1); +#endif + + B[i * 2 + 0] += CREAL(temp); + B[i * 2 + 1] += CIMAG(temp); + } +#endif + + a += lda * COMPSIZE; + } + + if (incb != 1) { + COPY_K(n, buffer, 1, b, incb); + } + + return 0; +} + diff --git 
a/driver/level2/ztbsv_L.c b/driver/level2/ztbsv_L.c new file mode 100644 index 0000000000..f32ddff249 --- /dev/null +++ b/driver/level2/ztbsv_L.c @@ -0,0 +1,145 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include +#include "common.h" + +const static FLOAT dp1 = 1.; + +int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ + + BLASLONG i; + FLOAT *gemvbuffer = (FLOAT *)buffer; + FLOAT *B = b; + BLASLONG length; +#if (TRANSA == 2) || (TRANSA == 4) + FLOAT _Complex temp; +#endif +#ifndef UNIT + FLOAT ar, ai, br, bi, ratio, den; +#endif + + if (incb != 1) { + B = buffer; + gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE + 4095) & ~4095); + COPY_K(n, b, incb, buffer, 1); + } + + for (i = 0; i < n; i++) { + +#if (TRANSA == 2) || (TRANSA == 4) + length = i; + if (length > k) length = k; + + if (length > 0) { +#if TRANSA == 2 + temp = DOTU_K(length, a + (k - length) * COMPSIZE, 1, B + (i - length) * COMPSIZE, 1); +#else + temp = DOTC_K(length, a + (k - length) * COMPSIZE, 1, B + (i - length) * COMPSIZE, 1); +#endif + + B[i * 2 + 0] -= CREAL(temp); + B[i * 2 + 1] -= CIMAG(temp); + } +#endif + +#ifndef UNIT +#if (TRANSA == 1) || (TRANSA == 3) + ar = a[0]; + ai = a[1]; +#else + ar = a[k * 2 + 0]; + ai = a[k * 2 + 1]; +#endif + + if (fabs(ar) >= fabs(ai)){ + ratio = ai / ar; + den = 1./(ar * ( 1 + ratio * ratio)); + + ar = den; +#if TRANSA < 3 + ai = -ratio * den; +#else + ai = ratio * den; +#endif + } else { + ratio = ar / ai; + den = 1./(ai * ( 1 + ratio * ratio)); + ar = ratio * den; +#if TRANSA < 3 + ai = -den; +#else + ai = den; +#endif + } + + br = B[i * 2 + 0]; + bi = B[i * 2 + 1]; + + B[i * 2 + 0] = ar*br - ai*bi; + B[i * 2 + 1] = ar*bi + ai*br; +#endif + +#if (TRANSA == 1) || (TRANSA == 3) + length = n - i - 1; + if (length > k) length = k; + + if (length > 0) { +#if TRANSA == 1 + AXPYU_K(length, 0, 0, + -B[i * 2 + 0], -B[i * 2 + 1], + a + COMPSIZE, 1, B + (i + 1) * COMPSIZE, 1, NULL, 0); +#else + AXPYC_K(length, 0, 0, + -B[i * 2 + 0], -B[i * 2 + 1], + a + COMPSIZE, 1, B + (i + 1) * COMPSIZE, 1, NULL, 0); +#endif + } +#endif + + a += lda * COMPSIZE; + } + + if (incb != 1) { + COPY_K(n, buffer, 1, b, incb); + } + + return 0; +} + diff --git a/driver/level2/ztbsv_U.c b/driver/level2/ztbsv_U.c new file mode 100644 index 0000000000..252f3bace8 --- /dev/null +++ b/driver/level2/ztbsv_U.c @@ -0,0 +1,148 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
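[Editor's note, not part of the imported patch] ztbsv_L.c above divides by the complex diagonal through a scaled reciprocal: it takes the ratio of the smaller to the larger of |ar| and |ai|, divides by ar * (1 + ratio^2) (or the symmetric expression), and flips the sign of the imaginary part in the conjugate-transpose branches (TRANSA >= 3). This is the classic overflow-avoiding scheme commonly attributed to Smith; the stand-alone sketch below reproduces only the plain (non-conjugated) reciprocal, and recip_smith is an illustrative name rather than a library routine.

#include <math.h>
#include <stdio.h>

/* Compute 1/(ar + ai*i) by scaling with the larger component, avoiding the
   overflow/underflow that the textbook (ar - ai*i)/(ar^2 + ai^2) can hit. */
static void recip_smith(double ar, double ai, double *rr, double *ri) {
    if (fabs(ar) >= fabs(ai)) {
        double ratio = ai / ar;
        double den = 1.0 / (ar * (1.0 + ratio * ratio));
        *rr = den;
        *ri = -ratio * den;
    } else {
        double ratio = ar / ai;
        double den = 1.0 / (ai * (1.0 + ratio * ratio));
        *rr = ratio * den;
        *ri = -den;
    }
}

int main(void) {
    double rr, ri;
    recip_smith(3.0, 4.0, &rr, &ri);      /* 1/(3+4i) = 0.12 - 0.16i */
    printf("1/(3+4i) = %g%+gi\n", rr, ri);
    recip_smith(1e200, 1e200, &rr, &ri);  /* naive ar*ar + ai*ai would overflow */
    printf("1/(1e200+1e200i) = %g%+gi\n", rr, ri);
    return 0;
}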
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +const static FLOAT dp1 = 1.; + +int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ + + BLASLONG i; + FLOAT *gemvbuffer = (FLOAT *)buffer; + FLOAT *B = b; + BLASLONG length; +#if (TRANSA == 2) || (TRANSA == 4) + FLOAT _Complex temp; +#endif +#ifndef UNIT + FLOAT ar, ai, br, bi, ratio, den; +#endif + + if (incb != 1) { + B = buffer; + gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE+ 4095) & ~4095); + COPY_K(n, b, incb, buffer, 1); + } + + a += (n - 1) * lda * COMPSIZE; + + for (i = n - 1; i >= 0; i--) { + +#if (TRANSA == 2) || (TRANSA == 4) + length = n - i - 1; + if (length > k) length = k; + + if (length > 0) { +#if TRANSA == 2 + temp = DOTU_K(length, a + COMPSIZE, 1, B + (i + 1) * COMPSIZE, 1); +#else + temp = DOTC_K(length, a + COMPSIZE, 1, B + (i + 1) * COMPSIZE, 1); +#endif + + B[i * 2 + 0] -= CREAL(temp); + B[i * 2 + 1] -= CIMAG(temp); + } +#endif + +#ifndef UNIT +#if (TRANSA == 1) || (TRANSA == 3) + ar = a[k * 2 + 0]; + ai = a[k * 2 + 1]; +#else + ar = a[0]; + ai = a[1]; +#endif + + if (fabs(ar) >= fabs(ai)){ + ratio = ai / ar; + den = 1./(ar * ( 1 + ratio * ratio)); + + ar = den; +#if TRANSA < 3 + ai = -ratio * den; +#else + ai = ratio * den; +#endif + } else { + ratio = ar / ai; + den = 1./(ai * ( 1 + ratio * ratio)); + ar = ratio * den; +#if TRANSA < 3 + ai = -den; +#else + ai = den; +#endif + } + + br = B[i * 2 + 0]; + bi = B[i * 2 + 1]; + + B[i * 2 + 0] = ar*br - ai*bi; + B[i * 2 + 1] = ar*bi + ai*br; +#endif + +#if (TRANSA == 1) || (TRANSA == 3) + length = i; + if (length > k) length = k; + + if (length > 0) { +#if TRANSA == 1 + AXPYU_K(length, 0, 0, + -B[i * 2 + 0], -B[i * 2 + 1], + a + (k - length) * COMPSIZE, 1, B + (i - length) * COMPSIZE, 1, NULL, 0); +#else + AXPYC_K(length, 0, 0, + -B[i * 2 + 0], -B[i * 2 + 1], + a + (k - length) * COMPSIZE, 1, B + (i - length) * COMPSIZE, 1, NULL, 0); +#endif + + } +#endif + + a -= lda * COMPSIZE; + } + + if (incb != 1) { + COPY_K(n, buffer, 1, b, incb); + } + + return 0; +} + diff --git a/driver/level2/ztpmv_L.c b/driver/level2/ztpmv_L.c new file mode 100644 index 0000000000..62b9dc6ce6 --- /dev/null +++ b/driver/level2/ztpmv_L.c @@ -0,0 +1,121 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ + + BLASLONG i; +#if (TRANSA == 2) || (TRANSA == 4) + FLOAT _Complex temp; +#endif +#ifndef UNIT + FLOAT atemp1, atemp2, btemp1, btemp2; +#endif + FLOAT *gemvbuffer = (FLOAT *)buffer; + FLOAT *B = b; + + if (incb != 1) { + B = buffer; + gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095); + COPY_K(m, b, incb, buffer, 1); + } + + a += (m + 1) * m - 2; + + for (i = 0; i < m; i++) { + +#if (TRANSA == 1) || (TRANSA == 3) +#if TRANSA == 1 + if (i > 0) AXPYU_K (i, 0, 0, + B[(m - i - 1) * 2 + 0], B[(m - i - 1) * 2 + 1], + a + 2, 1, B + (m - i) * 2, 1, NULL, 0); +#else + if (i > 0) AXPYC_K(i, 0, 0, + B[(m - i - 1) * 2 + 0], B[(m - i - 1) * 2 + 1], + a + 2, 1, B + (m - i) * 2, 1, NULL, 0); +#endif +#endif + +#ifndef UNIT + atemp1 = a[0]; + atemp2 = a[1]; + + btemp1 = B[(m - i - 1) * 2 + 0]; + btemp2 = B[(m - i - 1) * 2 + 1]; + +#if (TRANSA == 1) || (TRANSA == 2) + B[(m - i - 1) * 2 + 0] = atemp1 * btemp1 - atemp2 * btemp2; + B[(m - i - 1) * 2 + 1] = atemp1 * btemp2 + atemp2 * btemp1; +#else + B[(m - i - 1) * 2 + 0] = atemp1 * btemp1 + atemp2 * btemp2; + B[(m - i - 1) * 2 + 1] = atemp1 * btemp2 - atemp2 * btemp1; +#endif +#endif + +#if (TRANSA == 2) || (TRANSA == 4) + if (i < m - 1) { +#if TRANSA == 2 + temp = DOTU_K(m - i - 1, a - (m - i - 1) * 2, 1, B, 1); +#else + temp = DOTC_K(m - i - 1, a - (m - i - 1) * 2, 1, B, 1); +#endif + + B[(m - i - 1) * 2 + 0] += CREAL(temp); + B[(m - i - 1) * 2 + 1] += CIMAG(temp); + } +#endif + +#if (TRANSA == 1) || (TRANSA == 3) + a -= (i + 2) * 2; +#else + a -= (m - i) * 2; +#endif + + } + + + if (incb != 1) { + COPY_K(m, buffer, 1, b, incb); + } + + return 0; +} + diff --git a/driver/level2/ztpmv_U.c b/driver/level2/ztpmv_U.c new file mode 100644 index 0000000000..2ff3bfb563 --- /dev/null +++ b/driver/level2/ztpmv_U.c @@ -0,0 +1,124 @@ 
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin.
*/ +/*********************************************************************/ + +#include +#include +#include "common.h" + +int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ + + BLASLONG i; +#if (TRANSA == 2) || (TRANSA == 4) + FLOAT _Complex temp; +#endif +#ifndef UNIT + FLOAT atemp1, atemp2, btemp1, btemp2; +#endif + FLOAT *gemvbuffer = (FLOAT *)buffer; + FLOAT *B = b; + + if (incb != 1) { + B = buffer; + gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095); + COPY_K(m, b, incb, buffer, 1); + } + + for (i = 0; i < m; i++) { + +#if (TRANSA == 1) || (TRANSA == 3) +#if TRANSA == 1 + if (i > 0) AXPYU_K (i, 0, 0, B[i * 2 + 0], B[i * 2 + 1], + a, 1, B, 1, NULL, 0); +#else + if (i > 0) AXPYC_K(i, 0, 0, B[i * 2 + 0], B[i * 2 + 1], + a, 1, B, 1, NULL, 0); +#endif +#endif + +#ifndef UNIT +#if (TRANSA == 1) || (TRANSA == 3) + atemp1 = a[i * 2 + 0]; + atemp2 = a[i * 2 + 1]; +#else + atemp1 = a[0]; + atemp2 = a[1]; +#endif + + btemp1 = B[i * 2 + 0]; + btemp2 = B[i * 2 + 1]; + +#if (TRANSA == 1) || (TRANSA == 2) + B[i * 2 + 0] = atemp1 * btemp1 - atemp2 * btemp2; + B[i * 2 + 1] = atemp1 * btemp2 + atemp2 * btemp1; +#else + B[i * 2 + 0] = atemp1 * btemp1 + atemp2 * btemp2; + B[i * 2 + 1] = atemp1 * btemp2 - atemp2 * btemp1; +#endif +#endif + +#if (TRANSA == 2) || (TRANSA == 4) + if (i < m - 1) { +#if TRANSA == 2 + temp = DOTU_K(m - i - 1, + a + 2, 1, + B + (i + 1) * 2, 1); +#else + temp = DOTC_K(m - i - 1, + a + 2, 1, + B + (i + 1) * 2, 1); +#endif + + B[i * 2 + 0] += CREAL(temp); + B[i * 2 + 1] += CIMAG(temp); + } +#endif + +#if (TRANSA == 1) || (TRANSA == 3) + a += (i + 1) * 2; +#else + a += (m - i) * 2; +#endif + } + + if (incb != 1) { + COPY_K(m, buffer, 1, b, incb); + } + + return 0; +} + diff --git a/driver/level2/ztpsv_L.c b/driver/level2/ztpsv_L.c new file mode 100644 index 0000000000..e9317fbdd7 --- /dev/null +++ b/driver/level2/ztpsv_L.c @@ -0,0 +1,142 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +const static FLOAT dm1 = -1.; + +int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ + + BLASLONG i; +#if (TRANSA == 2) || (TRANSA == 4) + FLOAT _Complex result; +#endif +#ifndef UNIT + FLOAT ar, ai, br, bi, ratio, den; +#endif + FLOAT *gemvbuffer = (FLOAT *)buffer; + FLOAT *B = b; + + if (incb != 1) { + B = buffer; + gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095); + COPY_K(m, b, incb, buffer, 1); + } + + for (i = 0; i < m; i++) { + +#if (TRANSA == 2) || (TRANSA == 4) + if (i > 0) { +#if TRANSA == 2 + result = DOTU_K(i, a, 1, B, 1); +#else + result = DOTC_K(i, a, 1, B, 1); +#endif + + B[i * COMPSIZE + 0] -= CREAL(result); + B[i * COMPSIZE + 1] -= CIMAG(result); + } +#endif + +#ifndef UNIT +#if (TRANSA == 1) || (TRANSA == 3) + ar = a[0]; + ai = a[1]; +#else + ar = a[i * COMPSIZE + 0]; + ai = a[i * COMPSIZE + 1]; +#endif + + if (fabs(ar) >= fabs(ai)){ + ratio = ai / ar; + den = 1./(ar * ( 1 + ratio * ratio)); + + ar = den; +#if TRANSA < 3 + ai = -ratio * den; +#else + ai = ratio * den; +#endif + } else { + ratio = ar / ai; + den = 1./(ai * ( 1 + ratio * ratio)); + ar = ratio * den; +#if TRANSA < 3 + ai = -den; +#else + ai = den; +#endif + } + + br = B[i * COMPSIZE + 0]; + bi = B[i * COMPSIZE + 1]; + + B[i * COMPSIZE + 0] = ar*br - ai*bi; + B[i * COMPSIZE + 1] = ar*bi + ai*br; +#endif + +#if (TRANSA == 1) || (TRANSA == 3) + if (i < m - 1) { +#if TRANSA == 1 + AXPYU_K(m - i - 1 , 0, 0, + - B[i * COMPSIZE + 0], - B[i * COMPSIZE + 1], + a + COMPSIZE, 1, B + (i + 1) * COMPSIZE, 1, NULL, 0); +#else + AXPYC_K(m - i - 1 , 0, 0, + - B[i * COMPSIZE + 0], - B[i * COMPSIZE + 1], + a + COMPSIZE, 1, B + (i + 1) * COMPSIZE, 1, NULL, 0); +#endif + } +#endif + +#if (TRANSA == 1) || (TRANSA == 3) + a += (m - i) * 2; +#else + a += (i + 1) * 2; +#endif + } + + if (incb != 1) { + COPY_K(m, buffer, 1, b, incb); + } + + return 0; +} + diff --git a/driver/level2/ztpsv_U.c b/driver/level2/ztpsv_U.c new file mode 100644 index 0000000000..54903dc305 --- /dev/null +++ b/driver/level2/ztpsv_U.c @@ -0,0 +1,135 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ + + BLASLONG i; +#if (TRANSA == 2) || (TRANSA == 4) + FLOAT _Complex result; +#endif +#ifndef UNIT + FLOAT ar, ai, br, bi, ratio, den; +#endif + FLOAT *gemvbuffer = (FLOAT *)buffer; + FLOAT *B = b; + + if (incb != 1) { + B = buffer; + gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095); + COPY_K(m, b, incb, buffer, 1); + } + + a += (m + 1) * m - 2; + + for (i = 0; i < m; i++) { + +#if (TRANSA == 2) || (TRANSA == 4) + if (i > 0) { +#if TRANSA == 2 + result = DOTU_K(i, a + 2, 1, B + (m - i) * 2, 1); +#else + result = DOTC_K(i, a + 2, 1, B + (m - i) * 2, 1); +#endif + + B[(m - i - 1) * 2 + 0] -= CREAL(result); + B[(m - i - 1) * 2 + 1] -= CIMAG(result); + } +#endif + +#ifndef UNIT + ar = a[0]; + ai = a[1]; + + if (fabs(ar) >= fabs(ai)){ + ratio = ai / ar; + den = 1./(ar * ( 1 + ratio * ratio)); + + ar = den; +#if (TRANSA == 1) || (TRANSA == 2) + ai = -ratio * den; +#else + ai = ratio * den; +#endif + } else { + ratio = ar / ai; + den = 1./(ai * ( 1 + ratio * ratio)); + ar = ratio * den; +#if (TRANSA == 1) || (TRANSA == 2) + ai = -den; +#else + ai = den; +#endif + } + + br = B[(m - i - 1) * 2 + 0]; + bi = B[(m - i - 1) * 2 + 1]; + + B[(m - i - 1) * 2 + 0] = ar*br - ai*bi; + B[(m - i - 1) * 2 + 1] = ar*bi + ai*br; +#endif + +#if (TRANSA == 1) || (TRANSA == 3) + if (i < m - 1) { +#if TRANSA == 1 + AXPYU_K (m - i - 1, 0, 0, - B[(m - i - 1) * 2 + 0], -B[(m - i - 1) * 2 + 1], + a - (m - i - 1) * COMPSIZE, 1, B, 1, NULL, 0); +#else + AXPYC_K (m - i - 1, 0, 0, - B[(m - i - 1) * 2 + 0], -B[(m - i - 1) * 2 + 1], + a - (m - i - 1) * COMPSIZE, 1, B, 1, NULL, 0); +#endif + } +#endif + +#if (TRANSA == 1) || (TRANSA == 3) + a -= (m - i) * 2; +#else + a -= (i + 2) * 2; +#endif + } + + if (incb != 1) { + COPY_K(m, buffer, 1, b, incb); + } + + return 0; +} + diff --git a/driver/level2/ztrmv_L.c b/driver/level2/ztrmv_L.c new file mode 100644 index 0000000000..3688f588e5 --- /dev/null +++ b/driver/level2/ztrmv_L.c @@ -0,0 +1,149 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +static FLOAT dp1 = 1.; + +int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *buffer){ + + BLASLONG i, is, min_i; +#if (TRANSA == 2) || (TRANSA == 4) + FLOAT _Complex temp; +#endif +#ifndef UNIT + FLOAT atemp1, atemp2, btemp1, btemp2; +#endif + FLOAT *gemvbuffer = (FLOAT *)buffer; + FLOAT *B = b; + + if (incb != 1) { + B = buffer; + gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095); + COPY_K(m, b, incb, buffer, 1); + } + + for (is = m; is > 0; is -= DTB_ENTRIES){ + + min_i = MIN(is, DTB_ENTRIES); + +#if (TRANSA == 1) || (TRANSA == 3) + if (m - is > 0){ +#if TRANSA == 1 + GEMV_N(m - is, min_i, 0, dp1, ZERO, + a + (is + (is - min_i) * lda) * 2, lda, + B + (is - min_i) * 2, 1, + B + is * 2, 1, gemvbuffer); +#else + GEMV_R(m - is, min_i, 0, dp1, ZERO, + a + (is + (is - min_i) * lda) * 2, lda, + B + (is - min_i) * 2, 1, + B + is * 2, 1, gemvbuffer); +#endif + } +#endif + + for (i = 0; i < min_i; i++) { + FLOAT *AA = a + ((is - i - 1) + (is - i - 1) * lda) * 2; + FLOAT *BB = B + (is - i - 1) * 2; + +#if (TRANSA == 1) || (TRANSA == 3) +#if TRANSA == 1 + if (i > 0) AXPYU_K (i, 0, 0, BB[0], BB[1], AA + 2, 1, BB + 2, 1, NULL, 0); +#else + if (i > 0) AXPYC_K(i, 0, 0, BB[0], BB[1], AA + 2, 1, BB + 2, 1, NULL, 0); +#endif +#endif + +#ifndef UNIT + atemp1 = AA[0]; + atemp2 = AA[1]; + + btemp1 = BB[0]; + btemp2 = BB[1]; + +#if (TRANSA == 1) || (TRANSA == 2) + BB[0] = atemp1 * btemp1 - atemp2 * btemp2; + BB[1] = atemp1 * btemp2 + atemp2 * btemp1; +#else + BB[0] = atemp1 * btemp1 + atemp2 * btemp2; + BB[1] = atemp1 * btemp2 - atemp2 * btemp1; +#endif +#endif + +#if (TRANSA == 2) || (TRANSA == 4) + if (i < min_i - 1) { +#if TRANSA == 2 + temp = DOTU_K(min_i - i - 1, AA - (min_i - i - 1) * 2, 1, BB - (min_i - i - 1) * 2, 1); +#else + temp = DOTC_K(min_i - i - 1, AA - (min_i - i - 1) * 2, 1, BB - (min_i - i - 1) * 2, 1); +#endif + + BB[0] += CREAL(temp); + BB[1] += CIMAG(temp); + } +#endif + + } + +#if (TRANSA == 2) || (TRANSA == 4) + if (is - min_i > 0){ +#if TRANSA == 2 + GEMV_T(is - min_i, min_i, 0, 
dp1, ZERO, + a + (is - min_i) * lda * 2, lda, + B, 1, + B + (is - min_i) * 2, 1, gemvbuffer); +#else + GEMV_C(is - min_i, min_i, 0, dp1, ZERO, + a + (is - min_i) * lda * 2, lda, + B, 1, + B + (is - min_i) * 2, 1, gemvbuffer); +#endif + } +#endif + } + + if (incb != 1) { + COPY_K(m, buffer, 1, b, incb); + } + + return 0; +} + diff --git a/driver/level2/ztrmv_U.c b/driver/level2/ztrmv_U.c new file mode 100644 index 0000000000..a9fb6d1d08 --- /dev/null +++ b/driver/level2/ztrmv_U.c @@ -0,0 +1,155 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include +#include "common.h" + +static FLOAT dp1 = 1.; + +int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *buffer){ + + BLASLONG i, is, min_i; +#if (TRANSA == 2) || (TRANSA == 4) + FLOAT _Complex temp; +#endif +#ifndef UNIT + FLOAT atemp1, atemp2, btemp1, btemp2; +#endif + FLOAT *gemvbuffer = (FLOAT *)buffer; + FLOAT *B = b; + + if (incb != 1) { + B = buffer; + gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095); + COPY_K(m, b, incb, buffer, 1); + } + + for (is =0; is < m; is += DTB_ENTRIES){ + + min_i = MIN(m - is, DTB_ENTRIES); + +#if (TRANSA) == 1 || (TRANSA == 3) + if (is > 0){ +#if TRANSA == 1 + GEMV_N(is, min_i, 0, dp1, ZERO, + a + is * lda * 2, lda, + B + is * 2, 1, + B, 1, gemvbuffer); +#else + GEMV_R(is, min_i, 0, dp1, ZERO, + a + is * lda * 2, lda, + B + is * 2, 1, + B, 1, gemvbuffer); +#endif + } +#endif + + for (i = 0; i < min_i; i++) { + FLOAT *AA = a + (is + (i + is) * lda) * 2; + FLOAT *BB = B + is * 2; + +#if (TRANSA == 1) || (TRANSA == 3) +#if TRANSA == 1 + if (i > 0) AXPYU_K (i, 0, 0, BB[i * 2 + 0], BB[i * 2 + 1], + AA, 1, BB, 1, NULL, 0); +#else + if (i > 0) AXPYC_K(i, 0, 0, BB[i * 2 + 0], BB[i * 2 + 1], + AA, 1, BB, 1, NULL, 0); +#endif +#endif + +#ifndef UNIT + atemp1 = AA[i * 2 + 0]; + atemp2 = AA[i * 2 + 1]; + + btemp1 = BB[i * 2 + 0]; + btemp2 = BB[i * 2 + 1]; + +#if (TRANSA == 1) || (TRANSA == 2) + BB[i * 2 + 0] = atemp1 * btemp1 - atemp2 * btemp2; + BB[i * 2 + 1] = atemp1 * btemp2 + atemp2 * btemp1; +#else + BB[i * 2 + 0] = atemp1 * btemp1 + atemp2 * btemp2; + BB[i * 2 + 1] = atemp1 * btemp2 - atemp2 * btemp1; +#endif +#endif + +#if (TRANSA == 2) || (TRANSA == 4) + if (i < min_i - 1) { +#if TRANSA == 2 + temp = DOTU_K(min_i - i - 1, + AA + (i + 1) * 2, 1, + BB + (i + 1) * 2, 1); +#else + temp = DOTC_K(min_i - i - 1, + AA + (i + 1) * 2, 1, + BB + (i + 1) * 2, 1); +#endif + + BB[i * 2 + 0] += CREAL(temp); + BB[i * 2 + 1] += CIMAG(temp); + } +#endif + + } + +#if (TRANSA) == 2 || (TRANSA == 4) + if (m - is > min_i){ +#if TRANSA == 2 + GEMV_T(m - is - min_i, min_i, 0, dp1, ZERO, + a + (is + min_i + is * lda) * 2, lda, + B + (is + min_i) * 2, 1, + B + is * 2, 1, gemvbuffer); +#else + GEMV_C(m - is - min_i, min_i, 0, dp1, ZERO, + a + (is + min_i + is * lda) * 2, lda, + B + (is + min_i) * 2, 1, + B + is * 2, 1, gemvbuffer); +#endif + } +#endif + } + + if (incb != 1) { + COPY_K(m, buffer, 1, b, incb); + } + + return 0; +} + diff --git a/driver/level2/ztrsv_L.c b/driver/level2/ztrsv_L.c new file mode 100644 index 0000000000..f825c61f5c --- /dev/null +++ b/driver/level2/ztrsv_L.c @@ -0,0 +1,171 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +const static FLOAT dm1 = -1.; + +int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ + + BLASLONG i, is, min_i; +#if (TRANSA == 2) || (TRANSA == 4) + FLOAT _Complex result; +#endif +#ifndef UNIT + FLOAT ar, ai, br, bi, ratio, den; +#endif + FLOAT *gemvbuffer = (FLOAT *)buffer; + FLOAT *B = b; + + if (incb != 1) { + B = buffer; + gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095); + COPY_K(m, b, incb, buffer, 1); + } + + for (is =0; is < m; is += DTB_ENTRIES){ + + min_i = MIN(m - is, DTB_ENTRIES); + +#if (TRANSA == 2) || (TRANSA == 4) + if (is > 0){ +#if TRANSA == 2 + GEMV_T(is, min_i, 0, dm1, ZERO, + a + is * lda * COMPSIZE, lda, + B, 1, + B + is * COMPSIZE, 1, gemvbuffer); +#else + GEMV_C(is, min_i, 0, dm1, ZERO, + a + is * lda * COMPSIZE, lda, + B, 1, + B + is * COMPSIZE, 1, gemvbuffer); +#endif + } +#endif + + for (i = 0; i < min_i; i++) { + FLOAT *AA = a + (is + (i + is) * lda) * COMPSIZE; + FLOAT *BB = B + is * COMPSIZE; + +#if (TRANSA == 2) || (TRANSA == 4) + if (i > 0) { +#if TRANSA == 2 + result = DOTU_K(i, AA, 1, BB, 1); +#else + result = DOTC_K(i, AA, 1, BB, 1); +#endif + + BB[i * COMPSIZE + 0] -= CREAL(result); + BB[i * COMPSIZE + 1] -= CIMAG(result); + } +#endif + +#ifndef UNIT + ar = AA[i * COMPSIZE + 0]; + ai = AA[i * COMPSIZE + 1]; + + if (fabs(ar) >= fabs(ai)){ + ratio = ai / ar; + den = 1./(ar * ( 1 + ratio * ratio)); + + ar = den; +#if TRANSA < 3 + ai = -ratio * den; +#else + ai = ratio * den; +#endif + } else { + ratio = ar / ai; + den = 1./(ai * ( 1 + ratio * ratio)); + ar = ratio * den; +#if TRANSA < 3 + ai = -den; +#else + ai = den; +#endif + } + + br = BB[i * COMPSIZE + 0]; + bi = BB[i * COMPSIZE + 1]; + + BB[i * COMPSIZE + 0] = ar*br - ai*bi; + BB[i * COMPSIZE + 1] = ar*bi + ai*br; +#endif + + +#if (TRANSA == 1) || (TRANSA == 3) + if (i < min_i - 1) { +#if TRANSA == 1 + AXPYU_K(min_i - i - 1 , 0, 0, + - BB[i * COMPSIZE + 0], - BB[i * COMPSIZE + 1], + AA + (i + 1) * COMPSIZE, 1, BB + (i + 1) * COMPSIZE, 1, NULL, 0); +#else + AXPYC_K(min_i - i - 1 , 0, 0, + - BB[i * COMPSIZE + 0], - BB[i * COMPSIZE + 1], + AA + (i + 1) * COMPSIZE, 1, BB + (i + 1) * COMPSIZE, 1, NULL, 0); +#endif + } +#endif + } + +#if (TRANSA == 1) || (TRANSA == 3) + if (m - is > min_i){ +#if TRANSA == 1 + 
GEMV_N(m - is - min_i, min_i, 0, dm1, ZERO, + a + (is + min_i + is * lda) * COMPSIZE, lda, + B + is * COMPSIZE, 1, + B + (is + min_i) * COMPSIZE, 1, gemvbuffer); +#else + GEMV_R(m - is - min_i, min_i, 0, dm1, ZERO, + a + (is + min_i + is * lda) * COMPSIZE, lda, + B + is * COMPSIZE, 1, + B + (is + min_i) * COMPSIZE, 1, gemvbuffer); +#endif + } +#endif + } + + if (incb != 1) { + COPY_K(m, buffer, 1, b, incb); + } + + return 0; +} + diff --git a/driver/level2/ztrsv_U.c b/driver/level2/ztrsv_U.c new file mode 100644 index 0000000000..3b750a29fb --- /dev/null +++ b/driver/level2/ztrsv_U.c @@ -0,0 +1,168 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include +#include "common.h" + +const static FLOAT dm1 = -1.; + +int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ + + BLASLONG i, is, min_i; +#if (TRANSA == 2) || (TRANSA == 4) + FLOAT _Complex result; +#endif +#ifndef UNIT + FLOAT ar, ai, br, bi, ratio, den; +#endif + FLOAT *gemvbuffer = (FLOAT *)buffer; + FLOAT *B = b; + + if (incb != 1) { + B = buffer; + gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095); + COPY_K(m, b, incb, buffer, 1); + } + + for (is = m; is > 0; is -= DTB_ENTRIES){ + + min_i = MIN(is, DTB_ENTRIES); + +#if (TRANSA == 2) || (TRANSA == 4) + if (m - is > 0){ +#if TRANSA == 2 + GEMV_T(m - is, min_i, 0, dm1, ZERO, + a + (is + (is - min_i) * lda) * COMPSIZE, lda, + B + is * COMPSIZE, 1, + B + (is - min_i) * COMPSIZE, 1, gemvbuffer); +#else + GEMV_C(m - is, min_i, 0, dm1, ZERO, + a + (is + (is - min_i) * lda) * COMPSIZE, lda, + B + is * COMPSIZE, 1, + B + (is - min_i) * COMPSIZE, 1, gemvbuffer); +#endif + } +#endif + + for (i = 0; i < min_i; i++) { + FLOAT *AA = a + ((is - i - 1) + (is - i - 1) * lda) * COMPSIZE; + FLOAT *BB = B + (is - i - 1) * COMPSIZE; + +#if (TRANSA == 2) || (TRANSA == 4) + if (i > 0) { +#if TRANSA == 2 + result = DOTU_K(i, AA + 2, 1, BB + 2, 1); +#else + result = DOTC_K(i, AA + 2, 1, BB + 2, 1); +#endif + + BB[0] -= CREAL(result); + BB[1] -= CIMAG(result); + } +#endif + +#ifndef UNIT + ar = AA[0]; + ai = AA[1]; + + if (fabs(ar) >= fabs(ai)){ + ratio = ai / ar; + den = 1./(ar * ( 1 + ratio * ratio)); + + ar = den; +#if TRANSA < 3 + ai = -ratio * den; +#else + ai = ratio * den; +#endif + } else { + ratio = ar / ai; + den = 1./(ai * ( 1 + ratio * ratio)); + ar = ratio * den; +#if TRANSA < 3 + ai = -den; +#else + ai = den; +#endif + } + + br = BB[0]; + bi = BB[1]; + + BB[0] = ar*br - ai*bi; + BB[1] = ar*bi + ai*br; +#endif + +#if (TRANSA == 1) || (TRANSA == 3) + if (i < min_i - 1) { +#if TRANSA == 1 + AXPYU_K (min_i - i - 1, 0, 0, - BB[0], -BB[1], + AA - (min_i - i - 1) * COMPSIZE, 1, BB - (min_i - i - 1) * COMPSIZE, 1, NULL, 0); +#else + AXPYC_K(min_i - i - 1, 0, 0, - BB[0], -BB[1], + AA - (min_i - i - 1) * COMPSIZE, 1, BB - (min_i - i - 1) * COMPSIZE, 1, NULL, 0); +#endif + } +#endif + } + +#if (TRANSA == 1) || (TRANSA == 3) + if (is - min_i > 0){ +#if TRANSA == 1 + GEMV_N(is - min_i, min_i, 0, dm1, ZERO, + a + (is - min_i) * lda * COMPSIZE, lda, + B + (is - min_i) * COMPSIZE, 1, + B, 1, gemvbuffer); +#else + GEMV_R(is - min_i, min_i, 0, dm1, ZERO, + a + (is - min_i) * lda * COMPSIZE, lda, + B + (is - min_i) * COMPSIZE, 1, + B, 1, gemvbuffer); +#endif + } +#endif + } + + if (incb != 1) { + COPY_K(m, buffer, 1, b, incb); + } + + return 0; +} + diff --git a/driver/level3/Makefile b/driver/level3/Makefile new file mode 100644 index 0000000000..7d7d723398 --- /dev/null +++ b/driver/level3/Makefile @@ -0,0 +1,5022 @@ +TOPDIR = ../.. 
+include ../../Makefile.system + +ifeq ($(ARCH), x86) +USE_GEMM3M = 1 +endif + +ifeq ($(ARCH), x86_64) +USE_GEMM3M = 1 +endif + +ifeq ($(ARCH), ia64) +USE_GEMM3M = 1 +endif + +ifeq ($(ARCH), MIPS) +USE_GEMM3M = 1 +endif + +SBLASOBJS += \ + sgemm_nn.$(SUFFIX) sgemm_nt.$(SUFFIX) sgemm_tn.$(SUFFIX) sgemm_tt.$(SUFFIX) \ + strmm_LNUU.$(SUFFIX) strmm_LNUN.$(SUFFIX) strmm_LNLU.$(SUFFIX) strmm_LNLN.$(SUFFIX) \ + strmm_LTUU.$(SUFFIX) strmm_LTUN.$(SUFFIX) strmm_LTLU.$(SUFFIX) strmm_LTLN.$(SUFFIX) \ + strmm_RNUU.$(SUFFIX) strmm_RNUN.$(SUFFIX) strmm_RNLU.$(SUFFIX) strmm_RNLN.$(SUFFIX) \ + strmm_RTUU.$(SUFFIX) strmm_RTUN.$(SUFFIX) strmm_RTLU.$(SUFFIX) strmm_RTLN.$(SUFFIX) \ + strsm_LNUU.$(SUFFIX) strsm_LNUN.$(SUFFIX) strsm_LNLU.$(SUFFIX) strsm_LNLN.$(SUFFIX) \ + strsm_LTUU.$(SUFFIX) strsm_LTUN.$(SUFFIX) strsm_LTLU.$(SUFFIX) strsm_LTLN.$(SUFFIX) \ + strsm_RNUU.$(SUFFIX) strsm_RNUN.$(SUFFIX) strsm_RNLU.$(SUFFIX) strsm_RNLN.$(SUFFIX) \ + strsm_RTUU.$(SUFFIX) strsm_RTUN.$(SUFFIX) strsm_RTLU.$(SUFFIX) strsm_RTLN.$(SUFFIX) \ + ssymm_LU.$(SUFFIX) ssymm_LL.$(SUFFIX) ssymm_RU.$(SUFFIX) ssymm_RL.$(SUFFIX) \ + ssyrk_UN.$(SUFFIX) ssyrk_UT.$(SUFFIX) ssyrk_LN.$(SUFFIX) ssyrk_LT.$(SUFFIX) \ + ssyr2k_UN.$(SUFFIX) ssyr2k_UT.$(SUFFIX) ssyr2k_LN.$(SUFFIX) ssyr2k_LT.$(SUFFIX) \ + ssyrk_kernel_U.$(SUFFIX) ssyrk_kernel_L.$(SUFFIX) \ + ssyr2k_kernel_U.$(SUFFIX) ssyr2k_kernel_L.$(SUFFIX) + +DBLASOBJS += \ + dgemm_nn.$(SUFFIX) dgemm_nt.$(SUFFIX) dgemm_tn.$(SUFFIX) dgemm_tt.$(SUFFIX) \ + dtrmm_LNUU.$(SUFFIX) dtrmm_LNUN.$(SUFFIX) dtrmm_LNLU.$(SUFFIX) dtrmm_LNLN.$(SUFFIX) \ + dtrmm_LTUU.$(SUFFIX) dtrmm_LTUN.$(SUFFIX) dtrmm_LTLU.$(SUFFIX) dtrmm_LTLN.$(SUFFIX) \ + dtrmm_RNUU.$(SUFFIX) dtrmm_RNUN.$(SUFFIX) dtrmm_RNLU.$(SUFFIX) dtrmm_RNLN.$(SUFFIX) \ + dtrmm_RTUU.$(SUFFIX) dtrmm_RTUN.$(SUFFIX) dtrmm_RTLU.$(SUFFIX) dtrmm_RTLN.$(SUFFIX) \ + dtrsm_LNUU.$(SUFFIX) dtrsm_LNUN.$(SUFFIX) dtrsm_LNLU.$(SUFFIX) dtrsm_LNLN.$(SUFFIX) \ + dtrsm_LTUU.$(SUFFIX) dtrsm_LTUN.$(SUFFIX) dtrsm_LTLU.$(SUFFIX) dtrsm_LTLN.$(SUFFIX) \ + dtrsm_RNUU.$(SUFFIX) dtrsm_RNUN.$(SUFFIX) dtrsm_RNLU.$(SUFFIX) dtrsm_RNLN.$(SUFFIX) \ + dtrsm_RTUU.$(SUFFIX) dtrsm_RTUN.$(SUFFIX) dtrsm_RTLU.$(SUFFIX) dtrsm_RTLN.$(SUFFIX) \ + dsymm_LU.$(SUFFIX) dsymm_LL.$(SUFFIX) dsymm_RU.$(SUFFIX) dsymm_RL.$(SUFFIX) \ + dsyrk_UN.$(SUFFIX) dsyrk_UT.$(SUFFIX) dsyrk_LN.$(SUFFIX) dsyrk_LT.$(SUFFIX) \ + dsyr2k_UN.$(SUFFIX) dsyr2k_UT.$(SUFFIX) dsyr2k_LN.$(SUFFIX) dsyr2k_LT.$(SUFFIX) \ + dsyrk_kernel_U.$(SUFFIX) dsyrk_kernel_L.$(SUFFIX) \ + dsyr2k_kernel_U.$(SUFFIX) dsyr2k_kernel_L.$(SUFFIX) + +QBLASOBJS += \ + qgemm_nn.$(SUFFIX) qgemm_nt.$(SUFFIX) qgemm_tn.$(SUFFIX) qgemm_tt.$(SUFFIX) \ + qtrmm_LNUU.$(SUFFIX) qtrmm_LNUN.$(SUFFIX) qtrmm_LNLU.$(SUFFIX) qtrmm_LNLN.$(SUFFIX) \ + qtrmm_LTUU.$(SUFFIX) qtrmm_LTUN.$(SUFFIX) qtrmm_LTLU.$(SUFFIX) qtrmm_LTLN.$(SUFFIX) \ + qtrmm_RNUU.$(SUFFIX) qtrmm_RNUN.$(SUFFIX) qtrmm_RNLU.$(SUFFIX) qtrmm_RNLN.$(SUFFIX) \ + qtrmm_RTUU.$(SUFFIX) qtrmm_RTUN.$(SUFFIX) qtrmm_RTLU.$(SUFFIX) qtrmm_RTLN.$(SUFFIX) \ + qtrsm_LNUU.$(SUFFIX) qtrsm_LNUN.$(SUFFIX) qtrsm_LNLU.$(SUFFIX) qtrsm_LNLN.$(SUFFIX) \ + qtrsm_LTUU.$(SUFFIX) qtrsm_LTUN.$(SUFFIX) qtrsm_LTLU.$(SUFFIX) qtrsm_LTLN.$(SUFFIX) \ + qtrsm_RNUU.$(SUFFIX) qtrsm_RNUN.$(SUFFIX) qtrsm_RNLU.$(SUFFIX) qtrsm_RNLN.$(SUFFIX) \ + qtrsm_RTUU.$(SUFFIX) qtrsm_RTUN.$(SUFFIX) qtrsm_RTLU.$(SUFFIX) qtrsm_RTLN.$(SUFFIX) \ + qsymm_LU.$(SUFFIX) qsymm_LL.$(SUFFIX) qsymm_RU.$(SUFFIX) qsymm_RL.$(SUFFIX) \ + qsyrk_UN.$(SUFFIX) qsyrk_UT.$(SUFFIX) qsyrk_LN.$(SUFFIX) qsyrk_LT.$(SUFFIX) \ + qsyr2k_UN.$(SUFFIX) qsyr2k_UT.$(SUFFIX) qsyr2k_LN.$(SUFFIX) 
qsyr2k_LT.$(SUFFIX) \ + qsyrk_kernel_U.$(SUFFIX) qsyrk_kernel_L.$(SUFFIX) \ + qsyr2k_kernel_U.$(SUFFIX) qsyr2k_kernel_L.$(SUFFIX) + +CBLASOBJS += \ + cgemm_nn.$(SUFFIX) cgemm_cn.$(SUFFIX) cgemm_tn.$(SUFFIX) cgemm_nc.$(SUFFIX) \ + cgemm_nt.$(SUFFIX) cgemm_cc.$(SUFFIX) cgemm_ct.$(SUFFIX) cgemm_tc.$(SUFFIX) \ + cgemm_tt.$(SUFFIX) cgemm_nr.$(SUFFIX) cgemm_tr.$(SUFFIX) cgemm_cr.$(SUFFIX) \ + cgemm_rn.$(SUFFIX) cgemm_rt.$(SUFFIX) cgemm_rc.$(SUFFIX) cgemm_rr.$(SUFFIX) \ + ctrmm_LNUU.$(SUFFIX) ctrmm_LNUN.$(SUFFIX) ctrmm_LNLU.$(SUFFIX) ctrmm_LNLN.$(SUFFIX) \ + ctrmm_LTUU.$(SUFFIX) ctrmm_LTUN.$(SUFFIX) ctrmm_LTLU.$(SUFFIX) ctrmm_LTLN.$(SUFFIX) \ + ctrmm_LRUU.$(SUFFIX) ctrmm_LRUN.$(SUFFIX) ctrmm_LRLU.$(SUFFIX) ctrmm_LRLN.$(SUFFIX) \ + ctrmm_LCUU.$(SUFFIX) ctrmm_LCUN.$(SUFFIX) ctrmm_LCLU.$(SUFFIX) ctrmm_LCLN.$(SUFFIX) \ + ctrmm_RNUU.$(SUFFIX) ctrmm_RNUN.$(SUFFIX) ctrmm_RNLU.$(SUFFIX) ctrmm_RNLN.$(SUFFIX) \ + ctrmm_RTUU.$(SUFFIX) ctrmm_RTUN.$(SUFFIX) ctrmm_RTLU.$(SUFFIX) ctrmm_RTLN.$(SUFFIX) \ + ctrmm_RRUU.$(SUFFIX) ctrmm_RRUN.$(SUFFIX) ctrmm_RRLU.$(SUFFIX) ctrmm_RRLN.$(SUFFIX) \ + ctrmm_RCUU.$(SUFFIX) ctrmm_RCUN.$(SUFFIX) ctrmm_RCLU.$(SUFFIX) ctrmm_RCLN.$(SUFFIX) \ + ctrsm_LNUU.$(SUFFIX) ctrsm_LNUN.$(SUFFIX) ctrsm_LNLU.$(SUFFIX) ctrsm_LNLN.$(SUFFIX) \ + ctrsm_LTUU.$(SUFFIX) ctrsm_LTUN.$(SUFFIX) ctrsm_LTLU.$(SUFFIX) ctrsm_LTLN.$(SUFFIX) \ + ctrsm_LRUU.$(SUFFIX) ctrsm_LRUN.$(SUFFIX) ctrsm_LRLU.$(SUFFIX) ctrsm_LRLN.$(SUFFIX) \ + ctrsm_LCUU.$(SUFFIX) ctrsm_LCUN.$(SUFFIX) ctrsm_LCLU.$(SUFFIX) ctrsm_LCLN.$(SUFFIX) \ + ctrsm_RNUU.$(SUFFIX) ctrsm_RNUN.$(SUFFIX) ctrsm_RNLU.$(SUFFIX) ctrsm_RNLN.$(SUFFIX) \ + ctrsm_RTUU.$(SUFFIX) ctrsm_RTUN.$(SUFFIX) ctrsm_RTLU.$(SUFFIX) ctrsm_RTLN.$(SUFFIX) \ + ctrsm_RRUU.$(SUFFIX) ctrsm_RRUN.$(SUFFIX) ctrsm_RRLU.$(SUFFIX) ctrsm_RRLN.$(SUFFIX) \ + ctrsm_RCUU.$(SUFFIX) ctrsm_RCUN.$(SUFFIX) ctrsm_RCLU.$(SUFFIX) ctrsm_RCLN.$(SUFFIX) \ + csymm_LU.$(SUFFIX) csymm_LL.$(SUFFIX) csymm_RU.$(SUFFIX) csymm_RL.$(SUFFIX) \ + chemm_LU.$(SUFFIX) chemm_LL.$(SUFFIX) chemm_RU.$(SUFFIX) chemm_RL.$(SUFFIX) \ + csyrk_UN.$(SUFFIX) csyrk_UT.$(SUFFIX) csyrk_LN.$(SUFFIX) csyrk_LT.$(SUFFIX) \ + cherk_UN.$(SUFFIX) cherk_UC.$(SUFFIX) cherk_LN.$(SUFFIX) cherk_LC.$(SUFFIX) \ + csyr2k_UN.$(SUFFIX) csyr2k_UT.$(SUFFIX) csyr2k_LN.$(SUFFIX) csyr2k_LT.$(SUFFIX) \ + cher2k_UN.$(SUFFIX) cher2k_UC.$(SUFFIX) cher2k_LN.$(SUFFIX) cher2k_LC.$(SUFFIX) \ + csyrk_kernel_U.$(SUFFIX) csyrk_kernel_L.$(SUFFIX) \ + cherk_kernel_UN.$(SUFFIX) cherk_kernel_UC.$(SUFFIX) \ + cherk_kernel_LN.$(SUFFIX) cherk_kernel_LC.$(SUFFIX) \ + csyr2k_kernel_U.$(SUFFIX) csyr2k_kernel_L.$(SUFFIX) \ + cher2k_kernel_UN.$(SUFFIX) cher2k_kernel_UC.$(SUFFIX) \ + cher2k_kernel_LN.$(SUFFIX) cher2k_kernel_LC.$(SUFFIX) + +ZBLASOBJS += \ + zgemm_nn.$(SUFFIX) zgemm_cn.$(SUFFIX) zgemm_tn.$(SUFFIX) zgemm_nc.$(SUFFIX) \ + zgemm_nt.$(SUFFIX) zgemm_cc.$(SUFFIX) zgemm_ct.$(SUFFIX) zgemm_tc.$(SUFFIX) \ + zgemm_tt.$(SUFFIX) zgemm_nr.$(SUFFIX) zgemm_tr.$(SUFFIX) zgemm_cr.$(SUFFIX) \ + zgemm_rn.$(SUFFIX) zgemm_rt.$(SUFFIX) zgemm_rc.$(SUFFIX) zgemm_rr.$(SUFFIX) \ + ztrmm_LNUU.$(SUFFIX) ztrmm_LNUN.$(SUFFIX) ztrmm_LNLU.$(SUFFIX) ztrmm_LNLN.$(SUFFIX) \ + ztrmm_LTUU.$(SUFFIX) ztrmm_LTUN.$(SUFFIX) ztrmm_LTLU.$(SUFFIX) ztrmm_LTLN.$(SUFFIX) \ + ztrmm_LRUU.$(SUFFIX) ztrmm_LRUN.$(SUFFIX) ztrmm_LRLU.$(SUFFIX) ztrmm_LRLN.$(SUFFIX) \ + ztrmm_LCUU.$(SUFFIX) ztrmm_LCUN.$(SUFFIX) ztrmm_LCLU.$(SUFFIX) ztrmm_LCLN.$(SUFFIX) \ + ztrmm_RNUU.$(SUFFIX) ztrmm_RNUN.$(SUFFIX) ztrmm_RNLU.$(SUFFIX) ztrmm_RNLN.$(SUFFIX) \ + ztrmm_RTUU.$(SUFFIX) ztrmm_RTUN.$(SUFFIX) ztrmm_RTLU.$(SUFFIX) 
ztrmm_RTLN.$(SUFFIX) \ + ztrmm_RRUU.$(SUFFIX) ztrmm_RRUN.$(SUFFIX) ztrmm_RRLU.$(SUFFIX) ztrmm_RRLN.$(SUFFIX) \ + ztrmm_RCUU.$(SUFFIX) ztrmm_RCUN.$(SUFFIX) ztrmm_RCLU.$(SUFFIX) ztrmm_RCLN.$(SUFFIX) \ + ztrsm_LNUU.$(SUFFIX) ztrsm_LNUN.$(SUFFIX) ztrsm_LNLU.$(SUFFIX) ztrsm_LNLN.$(SUFFIX) \ + ztrsm_LTUU.$(SUFFIX) ztrsm_LTUN.$(SUFFIX) ztrsm_LTLU.$(SUFFIX) ztrsm_LTLN.$(SUFFIX) \ + ztrsm_LRUU.$(SUFFIX) ztrsm_LRUN.$(SUFFIX) ztrsm_LRLU.$(SUFFIX) ztrsm_LRLN.$(SUFFIX) \ + ztrsm_LCUU.$(SUFFIX) ztrsm_LCUN.$(SUFFIX) ztrsm_LCLU.$(SUFFIX) ztrsm_LCLN.$(SUFFIX) \ + ztrsm_RNUU.$(SUFFIX) ztrsm_RNUN.$(SUFFIX) ztrsm_RNLU.$(SUFFIX) ztrsm_RNLN.$(SUFFIX) \ + ztrsm_RTUU.$(SUFFIX) ztrsm_RTUN.$(SUFFIX) ztrsm_RTLU.$(SUFFIX) ztrsm_RTLN.$(SUFFIX) \ + ztrsm_RRUU.$(SUFFIX) ztrsm_RRUN.$(SUFFIX) ztrsm_RRLU.$(SUFFIX) ztrsm_RRLN.$(SUFFIX) \ + ztrsm_RCUU.$(SUFFIX) ztrsm_RCUN.$(SUFFIX) ztrsm_RCLU.$(SUFFIX) ztrsm_RCLN.$(SUFFIX) \ + zsymm_LU.$(SUFFIX) zsymm_LL.$(SUFFIX) zsymm_RU.$(SUFFIX) zsymm_RL.$(SUFFIX) \ + zhemm_LU.$(SUFFIX) zhemm_LL.$(SUFFIX) zhemm_RU.$(SUFFIX) zhemm_RL.$(SUFFIX) \ + zsyrk_UN.$(SUFFIX) zsyrk_UT.$(SUFFIX) zsyrk_LN.$(SUFFIX) zsyrk_LT.$(SUFFIX) \ + zherk_UN.$(SUFFIX) zherk_UC.$(SUFFIX) zherk_LN.$(SUFFIX) zherk_LC.$(SUFFIX) \ + zsyr2k_UN.$(SUFFIX) zsyr2k_UT.$(SUFFIX) zsyr2k_LN.$(SUFFIX) zsyr2k_LT.$(SUFFIX) \ + zher2k_UN.$(SUFFIX) zher2k_UC.$(SUFFIX) zher2k_LN.$(SUFFIX) zher2k_LC.$(SUFFIX) \ + zsyrk_kernel_U.$(SUFFIX) zsyrk_kernel_L.$(SUFFIX) \ + zherk_kernel_UN.$(SUFFIX) zherk_kernel_UC.$(SUFFIX) \ + zherk_kernel_LN.$(SUFFIX) zherk_kernel_LC.$(SUFFIX) \ + zsyr2k_kernel_U.$(SUFFIX) zsyr2k_kernel_L.$(SUFFIX) \ + zher2k_kernel_UN.$(SUFFIX) zher2k_kernel_UC.$(SUFFIX) \ + zher2k_kernel_LN.$(SUFFIX) zher2k_kernel_LC.$(SUFFIX) + + +XBLASOBJS += \ + xgemm_nn.$(SUFFIX) xgemm_cn.$(SUFFIX) xgemm_tn.$(SUFFIX) xgemm_nc.$(SUFFIX) \ + xgemm_nt.$(SUFFIX) xgemm_cc.$(SUFFIX) xgemm_ct.$(SUFFIX) xgemm_tc.$(SUFFIX) \ + xgemm_tt.$(SUFFIX) xgemm_nr.$(SUFFIX) xgemm_tr.$(SUFFIX) xgemm_cr.$(SUFFIX) \ + xgemm_rn.$(SUFFIX) xgemm_rt.$(SUFFIX) xgemm_rc.$(SUFFIX) xgemm_rr.$(SUFFIX) \ + xtrmm_LNUU.$(SUFFIX) xtrmm_LNUN.$(SUFFIX) xtrmm_LNLU.$(SUFFIX) xtrmm_LNLN.$(SUFFIX) \ + xtrmm_LTUU.$(SUFFIX) xtrmm_LTUN.$(SUFFIX) xtrmm_LTLU.$(SUFFIX) xtrmm_LTLN.$(SUFFIX) \ + xtrmm_LRUU.$(SUFFIX) xtrmm_LRUN.$(SUFFIX) xtrmm_LRLU.$(SUFFIX) xtrmm_LRLN.$(SUFFIX) \ + xtrmm_LCUU.$(SUFFIX) xtrmm_LCUN.$(SUFFIX) xtrmm_LCLU.$(SUFFIX) xtrmm_LCLN.$(SUFFIX) \ + xtrmm_RNUU.$(SUFFIX) xtrmm_RNUN.$(SUFFIX) xtrmm_RNLU.$(SUFFIX) xtrmm_RNLN.$(SUFFIX) \ + xtrmm_RTUU.$(SUFFIX) xtrmm_RTUN.$(SUFFIX) xtrmm_RTLU.$(SUFFIX) xtrmm_RTLN.$(SUFFIX) \ + xtrmm_RRUU.$(SUFFIX) xtrmm_RRUN.$(SUFFIX) xtrmm_RRLU.$(SUFFIX) xtrmm_RRLN.$(SUFFIX) \ + xtrmm_RCUU.$(SUFFIX) xtrmm_RCUN.$(SUFFIX) xtrmm_RCLU.$(SUFFIX) xtrmm_RCLN.$(SUFFIX) \ + xtrsm_LNUU.$(SUFFIX) xtrsm_LNUN.$(SUFFIX) xtrsm_LNLU.$(SUFFIX) xtrsm_LNLN.$(SUFFIX) \ + xtrsm_LTUU.$(SUFFIX) xtrsm_LTUN.$(SUFFIX) xtrsm_LTLU.$(SUFFIX) xtrsm_LTLN.$(SUFFIX) \ + xtrsm_LRUU.$(SUFFIX) xtrsm_LRUN.$(SUFFIX) xtrsm_LRLU.$(SUFFIX) xtrsm_LRLN.$(SUFFIX) \ + xtrsm_LCUU.$(SUFFIX) xtrsm_LCUN.$(SUFFIX) xtrsm_LCLU.$(SUFFIX) xtrsm_LCLN.$(SUFFIX) \ + xtrsm_RNUU.$(SUFFIX) xtrsm_RNUN.$(SUFFIX) xtrsm_RNLU.$(SUFFIX) xtrsm_RNLN.$(SUFFIX) \ + xtrsm_RTUU.$(SUFFIX) xtrsm_RTUN.$(SUFFIX) xtrsm_RTLU.$(SUFFIX) xtrsm_RTLN.$(SUFFIX) \ + xtrsm_RRUU.$(SUFFIX) xtrsm_RRUN.$(SUFFIX) xtrsm_RRLU.$(SUFFIX) xtrsm_RRLN.$(SUFFIX) \ + xtrsm_RCUU.$(SUFFIX) xtrsm_RCUN.$(SUFFIX) xtrsm_RCLU.$(SUFFIX) xtrsm_RCLN.$(SUFFIX) \ + xsymm_LU.$(SUFFIX) xsymm_LL.$(SUFFIX) xsymm_RU.$(SUFFIX) xsymm_RL.$(SUFFIX) \ 
+ xhemm_LU.$(SUFFIX) xhemm_LL.$(SUFFIX) xhemm_RU.$(SUFFIX) xhemm_RL.$(SUFFIX) \ + xsyrk_UN.$(SUFFIX) xsyrk_UT.$(SUFFIX) xsyrk_LN.$(SUFFIX) xsyrk_LT.$(SUFFIX) \ + xherk_UN.$(SUFFIX) xherk_UC.$(SUFFIX) xherk_LN.$(SUFFIX) xherk_LC.$(SUFFIX) \ + xsyr2k_UN.$(SUFFIX) xsyr2k_UT.$(SUFFIX) xsyr2k_LN.$(SUFFIX) xsyr2k_LT.$(SUFFIX) \ + xher2k_UN.$(SUFFIX) xher2k_UC.$(SUFFIX) xher2k_LN.$(SUFFIX) xher2k_LC.$(SUFFIX) \ + xsyrk_kernel_U.$(SUFFIX) xsyrk_kernel_L.$(SUFFIX) \ + xherk_kernel_UN.$(SUFFIX) xherk_kernel_UC.$(SUFFIX) \ + xherk_kernel_LN.$(SUFFIX) xherk_kernel_LC.$(SUFFIX) \ + xsyr2k_kernel_U.$(SUFFIX) xsyr2k_kernel_L.$(SUFFIX) \ + xher2k_kernel_UN.$(SUFFIX) xher2k_kernel_UC.$(SUFFIX) \ + xher2k_kernel_LN.$(SUFFIX) xher2k_kernel_LC.$(SUFFIX) + +ifdef USE_GEMM3M + +CBLASOBJS += \ + cgemm3m_nn.$(SUFFIX) cgemm3m_cn.$(SUFFIX) cgemm3m_tn.$(SUFFIX) cgemm3m_nc.$(SUFFIX) \ + cgemm3m_nt.$(SUFFIX) cgemm3m_cc.$(SUFFIX) cgemm3m_ct.$(SUFFIX) cgemm3m_tc.$(SUFFIX) \ + cgemm3m_tt.$(SUFFIX) cgemm3m_nr.$(SUFFIX) cgemm3m_tr.$(SUFFIX) cgemm3m_cr.$(SUFFIX) \ + cgemm3m_rn.$(SUFFIX) cgemm3m_rt.$(SUFFIX) cgemm3m_rc.$(SUFFIX) cgemm3m_rr.$(SUFFIX) \ + csymm3m_LU.$(SUFFIX) csymm3m_LL.$(SUFFIX) csymm3m_RU.$(SUFFIX) csymm3m_RL.$(SUFFIX) \ + chemm3m_LU.$(SUFFIX) chemm3m_LL.$(SUFFIX) chemm3m_RU.$(SUFFIX) chemm3m_RL.$(SUFFIX) + +ZBLASOBJS += \ + zgemm3m_nn.$(SUFFIX) zgemm3m_cn.$(SUFFIX) zgemm3m_tn.$(SUFFIX) zgemm3m_nc.$(SUFFIX) \ + zgemm3m_nt.$(SUFFIX) zgemm3m_cc.$(SUFFIX) zgemm3m_ct.$(SUFFIX) zgemm3m_tc.$(SUFFIX) \ + zgemm3m_tt.$(SUFFIX) zgemm3m_nr.$(SUFFIX) zgemm3m_tr.$(SUFFIX) zgemm3m_cr.$(SUFFIX) \ + zgemm3m_rn.$(SUFFIX) zgemm3m_rt.$(SUFFIX) zgemm3m_rc.$(SUFFIX) zgemm3m_rr.$(SUFFIX) \ + zsymm3m_LU.$(SUFFIX) zsymm3m_LL.$(SUFFIX) zsymm3m_RU.$(SUFFIX) zsymm3m_RL.$(SUFFIX) \ + zhemm3m_LU.$(SUFFIX) zhemm3m_LL.$(SUFFIX) zhemm3m_RU.$(SUFFIX) zhemm3m_RL.$(SUFFIX) + +XBLASOBJS += \ + xgemm3m_nn.$(SUFFIX) xgemm3m_cn.$(SUFFIX) xgemm3m_tn.$(SUFFIX) xgemm3m_nc.$(SUFFIX) \ + xgemm3m_nt.$(SUFFIX) xgemm3m_cc.$(SUFFIX) xgemm3m_ct.$(SUFFIX) xgemm3m_tc.$(SUFFIX) \ + xgemm3m_tt.$(SUFFIX) xgemm3m_nr.$(SUFFIX) xgemm3m_tr.$(SUFFIX) xgemm3m_cr.$(SUFFIX) \ + xgemm3m_rn.$(SUFFIX) xgemm3m_rt.$(SUFFIX) xgemm3m_rc.$(SUFFIX) xgemm3m_rr.$(SUFFIX) \ + xsymm3m_LU.$(SUFFIX) xsymm3m_LL.$(SUFFIX) xsymm3m_RU.$(SUFFIX) xsymm3m_RL.$(SUFFIX) \ + xhemm3m_LU.$(SUFFIX) xhemm3m_LL.$(SUFFIX) xhemm3m_RU.$(SUFFIX) xhemm3m_RL.$(SUFFIX) + +endif + +ifdef SMP +COMMONOBJS += gemm_thread_m.$(SUFFIX) gemm_thread_n.$(SUFFIX) gemm_thread_mn.$(SUFFIX) gemm_thread_variable.$(SUFFIX) +COMMONOBJS += syrk_thread.$(SUFFIX) + +ifndef USE_SIMPLE_THREADED_LEVEL3 + +SBLASOBJS += sgemm_thread_nn.$(SUFFIX) sgemm_thread_nt.$(SUFFIX) sgemm_thread_tn.$(SUFFIX) sgemm_thread_tt.$(SUFFIX) +DBLASOBJS += dgemm_thread_nn.$(SUFFIX) dgemm_thread_nt.$(SUFFIX) dgemm_thread_tn.$(SUFFIX) dgemm_thread_tt.$(SUFFIX) +QBLASOBJS += qgemm_thread_nn.$(SUFFIX) qgemm_thread_nt.$(SUFFIX) qgemm_thread_tn.$(SUFFIX) qgemm_thread_tt.$(SUFFIX) +CBLASOBJS += cgemm_thread_nn.$(SUFFIX) cgemm_thread_nt.$(SUFFIX) cgemm_thread_nr.$(SUFFIX) cgemm_thread_nc.$(SUFFIX) +CBLASOBJS += cgemm_thread_tn.$(SUFFIX) cgemm_thread_tt.$(SUFFIX) cgemm_thread_tr.$(SUFFIX) cgemm_thread_tc.$(SUFFIX) +CBLASOBJS += cgemm_thread_rn.$(SUFFIX) cgemm_thread_rt.$(SUFFIX) cgemm_thread_rr.$(SUFFIX) cgemm_thread_rc.$(SUFFIX) +CBLASOBJS += cgemm_thread_cn.$(SUFFIX) cgemm_thread_ct.$(SUFFIX) cgemm_thread_cr.$(SUFFIX) cgemm_thread_cc.$(SUFFIX) +ZBLASOBJS += zgemm_thread_nn.$(SUFFIX) zgemm_thread_nt.$(SUFFIX) zgemm_thread_nr.$(SUFFIX) 
zgemm_thread_nc.$(SUFFIX) +ZBLASOBJS += zgemm_thread_tn.$(SUFFIX) zgemm_thread_tt.$(SUFFIX) zgemm_thread_tr.$(SUFFIX) zgemm_thread_tc.$(SUFFIX) +ZBLASOBJS += zgemm_thread_rn.$(SUFFIX) zgemm_thread_rt.$(SUFFIX) zgemm_thread_rr.$(SUFFIX) zgemm_thread_rc.$(SUFFIX) +ZBLASOBJS += zgemm_thread_cn.$(SUFFIX) zgemm_thread_ct.$(SUFFIX) zgemm_thread_cr.$(SUFFIX) zgemm_thread_cc.$(SUFFIX) +XBLASOBJS += xgemm_thread_nn.$(SUFFIX) xgemm_thread_nt.$(SUFFIX) xgemm_thread_nr.$(SUFFIX) xgemm_thread_nc.$(SUFFIX) +XBLASOBJS += xgemm_thread_tn.$(SUFFIX) xgemm_thread_tt.$(SUFFIX) xgemm_thread_tr.$(SUFFIX) xgemm_thread_tc.$(SUFFIX) +XBLASOBJS += xgemm_thread_rn.$(SUFFIX) xgemm_thread_rt.$(SUFFIX) xgemm_thread_rr.$(SUFFIX) xgemm_thread_rc.$(SUFFIX) +XBLASOBJS += xgemm_thread_cn.$(SUFFIX) xgemm_thread_ct.$(SUFFIX) xgemm_thread_cr.$(SUFFIX) xgemm_thread_cc.$(SUFFIX) + +SBLASOBJS += ssymm_thread_LU.$(SUFFIX) ssymm_thread_LL.$(SUFFIX) ssymm_thread_RU.$(SUFFIX) ssymm_thread_RL.$(SUFFIX) +DBLASOBJS += dsymm_thread_LU.$(SUFFIX) dsymm_thread_LL.$(SUFFIX) dsymm_thread_RU.$(SUFFIX) dsymm_thread_RL.$(SUFFIX) +QBLASOBJS += qsymm_thread_LU.$(SUFFIX) qsymm_thread_LL.$(SUFFIX) qsymm_thread_RU.$(SUFFIX) qsymm_thread_RL.$(SUFFIX) +CBLASOBJS += csymm_thread_LU.$(SUFFIX) csymm_thread_LL.$(SUFFIX) csymm_thread_RU.$(SUFFIX) csymm_thread_RL.$(SUFFIX) +ZBLASOBJS += zsymm_thread_LU.$(SUFFIX) zsymm_thread_LL.$(SUFFIX) zsymm_thread_RU.$(SUFFIX) zsymm_thread_RL.$(SUFFIX) +XBLASOBJS += xsymm_thread_LU.$(SUFFIX) xsymm_thread_LL.$(SUFFIX) xsymm_thread_RU.$(SUFFIX) xsymm_thread_RL.$(SUFFIX) + +CBLASOBJS += chemm_thread_LU.$(SUFFIX) chemm_thread_LL.$(SUFFIX) chemm_thread_RU.$(SUFFIX) chemm_thread_RL.$(SUFFIX) +ZBLASOBJS += zhemm_thread_LU.$(SUFFIX) zhemm_thread_LL.$(SUFFIX) zhemm_thread_RU.$(SUFFIX) zhemm_thread_RL.$(SUFFIX) +XBLASOBJS += xhemm_thread_LU.$(SUFFIX) xhemm_thread_LL.$(SUFFIX) xhemm_thread_RU.$(SUFFIX) xhemm_thread_RL.$(SUFFIX) + +SBLASOBJS += ssyrk_thread_UN.$(SUFFIX) ssyrk_thread_UT.$(SUFFIX) ssyrk_thread_LN.$(SUFFIX) ssyrk_thread_LT.$(SUFFIX) +DBLASOBJS += dsyrk_thread_UN.$(SUFFIX) dsyrk_thread_UT.$(SUFFIX) dsyrk_thread_LN.$(SUFFIX) dsyrk_thread_LT.$(SUFFIX) +QBLASOBJS += qsyrk_thread_UN.$(SUFFIX) qsyrk_thread_UT.$(SUFFIX) qsyrk_thread_LN.$(SUFFIX) qsyrk_thread_LT.$(SUFFIX) +CBLASOBJS += csyrk_thread_UN.$(SUFFIX) csyrk_thread_UT.$(SUFFIX) csyrk_thread_LN.$(SUFFIX) csyrk_thread_LT.$(SUFFIX) +ZBLASOBJS += zsyrk_thread_UN.$(SUFFIX) zsyrk_thread_UT.$(SUFFIX) zsyrk_thread_LN.$(SUFFIX) zsyrk_thread_LT.$(SUFFIX) +XBLASOBJS += xsyrk_thread_UN.$(SUFFIX) xsyrk_thread_UT.$(SUFFIX) xsyrk_thread_LN.$(SUFFIX) xsyrk_thread_LT.$(SUFFIX) +CBLASOBJS += cherk_thread_UN.$(SUFFIX) cherk_thread_UC.$(SUFFIX) cherk_thread_LN.$(SUFFIX) cherk_thread_LC.$(SUFFIX) +ZBLASOBJS += zherk_thread_UN.$(SUFFIX) zherk_thread_UC.$(SUFFIX) zherk_thread_LN.$(SUFFIX) zherk_thread_LC.$(SUFFIX) +XBLASOBJS += xherk_thread_UN.$(SUFFIX) xherk_thread_UC.$(SUFFIX) xherk_thread_LN.$(SUFFIX) xherk_thread_LC.$(SUFFIX) + +ifdef USE_GEMM3M + +CBLASOBJS += cgemm3m_thread_nn.$(SUFFIX) cgemm3m_thread_nt.$(SUFFIX) cgemm3m_thread_nr.$(SUFFIX) cgemm3m_thread_nc.$(SUFFIX) +CBLASOBJS += cgemm3m_thread_tn.$(SUFFIX) cgemm3m_thread_tt.$(SUFFIX) cgemm3m_thread_tr.$(SUFFIX) cgemm3m_thread_tc.$(SUFFIX) +CBLASOBJS += cgemm3m_thread_rn.$(SUFFIX) cgemm3m_thread_rt.$(SUFFIX) cgemm3m_thread_rr.$(SUFFIX) cgemm3m_thread_rc.$(SUFFIX) +CBLASOBJS += cgemm3m_thread_cn.$(SUFFIX) cgemm3m_thread_ct.$(SUFFIX) cgemm3m_thread_cr.$(SUFFIX) cgemm3m_thread_cc.$(SUFFIX) +ZBLASOBJS += zgemm3m_thread_nn.$(SUFFIX) 
zgemm3m_thread_nt.$(SUFFIX) zgemm3m_thread_nr.$(SUFFIX) zgemm3m_thread_nc.$(SUFFIX) +ZBLASOBJS += zgemm3m_thread_tn.$(SUFFIX) zgemm3m_thread_tt.$(SUFFIX) zgemm3m_thread_tr.$(SUFFIX) zgemm3m_thread_tc.$(SUFFIX) +ZBLASOBJS += zgemm3m_thread_rn.$(SUFFIX) zgemm3m_thread_rt.$(SUFFIX) zgemm3m_thread_rr.$(SUFFIX) zgemm3m_thread_rc.$(SUFFIX) +ZBLASOBJS += zgemm3m_thread_cn.$(SUFFIX) zgemm3m_thread_ct.$(SUFFIX) zgemm3m_thread_cr.$(SUFFIX) zgemm3m_thread_cc.$(SUFFIX) +XBLASOBJS += xgemm3m_thread_nn.$(SUFFIX) xgemm3m_thread_nt.$(SUFFIX) xgemm3m_thread_nr.$(SUFFIX) xgemm3m_thread_nc.$(SUFFIX) +XBLASOBJS += xgemm3m_thread_tn.$(SUFFIX) xgemm3m_thread_tt.$(SUFFIX) xgemm3m_thread_tr.$(SUFFIX) xgemm3m_thread_tc.$(SUFFIX) +XBLASOBJS += xgemm3m_thread_rn.$(SUFFIX) xgemm3m_thread_rt.$(SUFFIX) xgemm3m_thread_rr.$(SUFFIX) xgemm3m_thread_rc.$(SUFFIX) +XBLASOBJS += xgemm3m_thread_cn.$(SUFFIX) xgemm3m_thread_ct.$(SUFFIX) xgemm3m_thread_cr.$(SUFFIX) xgemm3m_thread_cc.$(SUFFIX) + +CBLASOBJS += csymm3m_thread_LU.$(SUFFIX) csymm3m_thread_LL.$(SUFFIX) csymm3m_thread_RU.$(SUFFIX) csymm3m_thread_RL.$(SUFFIX) +ZBLASOBJS += zsymm3m_thread_LU.$(SUFFIX) zsymm3m_thread_LL.$(SUFFIX) zsymm3m_thread_RU.$(SUFFIX) zsymm3m_thread_RL.$(SUFFIX) +XBLASOBJS += xsymm3m_thread_LU.$(SUFFIX) xsymm3m_thread_LL.$(SUFFIX) xsymm3m_thread_RU.$(SUFFIX) xsymm3m_thread_RL.$(SUFFIX) + +CBLASOBJS += chemm3m_thread_LU.$(SUFFIX) chemm3m_thread_LL.$(SUFFIX) chemm3m_thread_RU.$(SUFFIX) chemm3m_thread_RL.$(SUFFIX) +ZBLASOBJS += zhemm3m_thread_LU.$(SUFFIX) zhemm3m_thread_LL.$(SUFFIX) zhemm3m_thread_RU.$(SUFFIX) zhemm3m_thread_RL.$(SUFFIX) +XBLASOBJS += xhemm3m_thread_LU.$(SUFFIX) xhemm3m_thread_LL.$(SUFFIX) xhemm3m_thread_RU.$(SUFFIX) xhemm3m_thread_RL.$(SUFFIX) + +endif + +endif +endif + +HPLOBJS = \ + dgemm_nn.$(SUFFIX) dgemm_nt.$(SUFFIX) dgemm_tn.$(SUFFIX) dgemm_tt.$(SUFFIX) \ + dtrsm_LNUU.$(SUFFIX) dtrsm_LNUN.$(SUFFIX) dtrsm_LNLU.$(SUFFIX) dtrsm_LNLN.$(SUFFIX) \ + dtrsm_LTUU.$(SUFFIX) dtrsm_LTUN.$(SUFFIX) dtrsm_LTLU.$(SUFFIX) dtrsm_LTLN.$(SUFFIX) \ + dtrsm_RNUU.$(SUFFIX) dtrsm_RNUN.$(SUFFIX) dtrsm_RNLU.$(SUFFIX) dtrsm_RNLN.$(SUFFIX) \ + dtrsm_RTUU.$(SUFFIX) dtrsm_RTUN.$(SUFFIX) dtrsm_RTLU.$(SUFFIX) dtrsm_RTLN.$(SUFFIX) + +ifndef USE_SIMPLE_THREADED_LEVEL3 +HPLOBJS += dgemm_thread_nn.$(SUFFIX) dgemm_thread_nt.$(SUFFIX) \ + dgemm_thread_tn.$(SUFFIX) dgemm_thread_tt.$(SUFFIX) +endif + +all :: + +sgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) + +sgemm_nt.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) + +sgemm_tn.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) + +sgemm_tt.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) + +dgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -UCOMPLEX -DNN $< -o $(@F) + +dgemm_nt.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -UCOMPLEX -DNT $< -o $(@F) + +dgemm_tn.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -UCOMPLEX -DTN $< -o $(@F) + +dgemm_tt.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -UCOMPLEX -DTT $< -o $(@F) + +qgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -UCOMPLEX -DNN $< -o $(@F) + +qgemm_nt.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE 
-UCOMPLEX -DNT $< -o $(@F) + +qgemm_tn.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -UCOMPLEX -DTN $< -o $(@F) + +qgemm_tt.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -UCOMPLEX -DTT $< -o $(@F) + +cgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $(@F) + +cgemm_nt.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNT $< -o $(@F) + +cgemm_nr.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNR $< -o $(@F) + +cgemm_nc.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $(@F) + +cgemm_tn.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTN $< -o $(@F) + +cgemm_tt.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTT $< -o $(@F) + +cgemm_tr.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTR $< -o $(@F) + +cgemm_tc.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTC $< -o $(@F) + +cgemm_rn.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRN $< -o $(@F) + +cgemm_rt.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRT $< -o $(@F) + +cgemm_rr.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) + +cgemm_rc.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) + +cgemm_cn.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) + +cgemm_ct.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) + +cgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) + +cgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) + +zgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $(@F) + +zgemm_nt.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNT $< -o $(@F) + +zgemm_nr.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNR $< -o $(@F) + +zgemm_nc.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $(@F) + +zgemm_tn.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTN $< -o $(@F) + +zgemm_tt.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTT $< -o $(@F) + +zgemm_tr.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTR $< -o $(@F) + +zgemm_tc.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTC $< -o $(@F) + +zgemm_rn.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRN $< -o $(@F) + +zgemm_rt.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRT $< -o $(@F) + +zgemm_rr.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) + 
+zgemm_rc.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) + +zgemm_cn.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) + +zgemm_ct.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) + +zgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) + +zgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) + +xgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $(@F) + +xgemm_nt.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DNT $< -o $(@F) + +xgemm_nr.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DNR $< -o $(@F) + +xgemm_nc.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DNC $< -o $(@F) + +xgemm_tn.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DTN $< -o $(@F) + +xgemm_tt.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DTT $< -o $(@F) + +xgemm_tr.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DTR $< -o $(@F) + +xgemm_tc.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DTC $< -o $(@F) + +xgemm_rn.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRN $< -o $(@F) + +xgemm_rt.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRT $< -o $(@F) + +xgemm_rr.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) + +xgemm_rc.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) + +xgemm_cn.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) + +xgemm_ct.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) + +xgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) + +xgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) + +gemm_thread_m.$(SUFFIX) : gemm_thread_m.c ../../common.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +gemm_thread_n.$(SUFFIX) : gemm_thread_n.c ../../common.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +gemm_thread_mn.$(SUFFIX) : gemm_thread_mn.c ../../common.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +gemm_thread_variable.$(SUFFIX) : gemm_thread_variable.c ../../common.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +beta_thread.$(SUFFIX) : beta_thread.c ../../common.h + $(CC) -c $(CFLAGS) $< -o $(@F) + + +sgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) + +sgemm_thread_nt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) + +sgemm_thread_tn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) + +sgemm_thread_tt.$(SUFFIX) : gemm.c level3_thread.c 
../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) + +dgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -UCOMPLEX -DNN $< -o $(@F) + +dgemm_thread_nt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -UCOMPLEX -DNT $< -o $(@F) + +dgemm_thread_tn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -UCOMPLEX -DTN $< -o $(@F) + +dgemm_thread_tt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -UCOMPLEX -DTT $< -o $(@F) + +qgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -UCOMPLEX -DNN $< -o $(@F) + +qgemm_thread_nt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -UCOMPLEX -DNT $< -o $(@F) + +qgemm_thread_tn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -UCOMPLEX -DTN $< -o $(@F) + +qgemm_thread_tt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -UCOMPLEX -DTT $< -o $(@F) + +cgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DNN $< -o $(@F) + +cgemm_thread_nt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DNT $< -o $(@F) + +cgemm_thread_nr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DNR $< -o $(@F) + +cgemm_thread_nc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DNC $< -o $(@F) + +cgemm_thread_tn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DTN $< -o $(@F) + +cgemm_thread_tt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DTT $< -o $(@F) + +cgemm_thread_tr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DTR $< -o $(@F) + +cgemm_thread_tc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DTC $< -o $(@F) + +cgemm_thread_rn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRN $< -o $(@F) + +cgemm_thread_rt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRT $< -o $(@F) + +cgemm_thread_rr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) + +cgemm_thread_rc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) + +cgemm_thread_cn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) + +cgemm_thread_ct.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) + +cgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) 
$(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) + +cgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) + +zgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DNN $< -o $(@F) + +zgemm_thread_nt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DNT $< -o $(@F) + +zgemm_thread_nr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DNR $< -o $(@F) + +zgemm_thread_nc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DNC $< -o $(@F) + +zgemm_thread_tn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DTN $< -o $(@F) + +zgemm_thread_tt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DTT $< -o $(@F) + +zgemm_thread_tr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DTR $< -o $(@F) + +zgemm_thread_tc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DTC $< -o $(@F) + +zgemm_thread_rn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRN $< -o $(@F) + +zgemm_thread_rt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRT $< -o $(@F) + +zgemm_thread_rr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) + +zgemm_thread_rc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) + +zgemm_thread_cn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) + +zgemm_thread_ct.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) + +zgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) + +zgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) + +xgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DNN $< -o $(@F) + +xgemm_thread_nt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DNT $< -o $(@F) + +xgemm_thread_nr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DNR $< -o $(@F) + +xgemm_thread_nc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DNC $< -o $(@F) + +xgemm_thread_tn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DTN $< -o $(@F) + +xgemm_thread_tt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 
-DXDOUBLE -DCOMPLEX -DTT $< -o $(@F) + +xgemm_thread_tr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DTR $< -o $(@F) + +xgemm_thread_tc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DTC $< -o $(@F) + +xgemm_thread_rn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRN $< -o $(@F) + +xgemm_thread_rt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRT $< -o $(@F) + +xgemm_thread_rr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) + +xgemm_thread_rc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) + +xgemm_thread_cn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) + +xgemm_thread_ct.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) + +xgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) + +xgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) + +strmm_LNUU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) + +strmm_LNUN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) + +strmm_LNLU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) + +strmm_LNLN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) + +strmm_LTUU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) + +strmm_LTUN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) + +strmm_LTLU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) + +strmm_LTLN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) + +strmm_RNUU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) + +strmm_RNUN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) + +strmm_RNLU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) + +strmm_RNLN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) + +strmm_RTUU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) + +strmm_RTUN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) + +strmm_RTLU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) + +strmm_RTLN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) + +dtrmm_LNUU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) + +dtrmm_LNUN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA 
-DUPPER -UUNIT $< -o $(@F) + +dtrmm_LNLU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) + +dtrmm_LNLN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) + +dtrmm_LTUU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) + +dtrmm_LTUN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) + +dtrmm_LTLU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) + +dtrmm_LTLN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) + +dtrmm_RNUU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) + +dtrmm_RNUN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) + +dtrmm_RNLU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) + +dtrmm_RNLN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) + +dtrmm_RTUU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) + +dtrmm_RTUN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) + +dtrmm_RTLU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) + +dtrmm_RTLN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) + +qtrmm_LNUU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) + +qtrmm_LNUN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) + +qtrmm_LNLU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) + +qtrmm_LNLN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) + +qtrmm_LTUU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) + +qtrmm_LTUN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) + +qtrmm_LTLU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) + +qtrmm_LTLN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) + +qtrmm_RNUU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) + +qtrmm_RNUN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) + +qtrmm_RNLU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) + +qtrmm_RNLN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) + +qtrmm_RTUU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) + +qtrmm_RTUN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) + +qtrmm_RTLU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) + +qtrmm_RTLN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) + +ctrmm_LNUU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrmm_LNUN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) 
-DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrmm_LNLU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrmm_LNLN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrmm_LTUU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrmm_LTUN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrmm_LTLU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrmm_LTLN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrmm_LRUU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrmm_LRUN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrmm_LRLU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrmm_LRLN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrmm_LCUU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrmm_LCUN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrmm_LCLU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrmm_LCLN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrmm_RNUU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrmm_RNUN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrmm_RNLU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrmm_RNLN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrmm_RTUU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrmm_RTUN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrmm_RTLU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrmm_RTLN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrmm_RRUU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrmm_RRUN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrmm_RRLU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrmm_RRLN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrmm_RCUU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrmm_RCUN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrmm_RCLU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrmm_RCLN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX 
-UDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrmm_LNUU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrmm_LNUN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrmm_LNLU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrmm_LNLN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrmm_LTUU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrmm_LTUN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrmm_LTLU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrmm_LTLN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrmm_LRUU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrmm_LRUN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrmm_LRLU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrmm_LRLN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrmm_LCUU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrmm_LCUN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrmm_LCLU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrmm_LCLN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrmm_RNUU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrmm_RNUN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrmm_RNLU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrmm_RNLN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrmm_RTUU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrmm_RTUN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrmm_RTLU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrmm_RTLN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrmm_RRUU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrmm_RRUN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrmm_RRLU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrmm_RRLN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrmm_RCUU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrmm_RCUN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE 
-DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrmm_RCLU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrmm_RCLN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrmm_LNUU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrmm_LNUN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrmm_LNLU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrmm_LNLN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrmm_LTUU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrmm_LTUN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrmm_LTLU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrmm_LTLN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrmm_LRUU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrmm_LRUN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrmm_LRLU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrmm_LRLN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrmm_LCUU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrmm_LCUN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrmm_LCLU.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrmm_LCLN.$(SUFFIX) : trmm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrmm_RNUU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrmm_RNUN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrmm_RNLU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrmm_RNLN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrmm_RTUU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrmm_RTUN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrmm_RTLU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrmm_RTLN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrmm_RRUU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrmm_RRUN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrmm_RRLU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrmm_RRLN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) 
-DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrmm_RCUU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrmm_RCUN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrmm_RCLU.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrmm_RCLN.$(SUFFIX) : trmm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ssymm_LU.$(SUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +ssymm_LL.$(SUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +ssymm_RU.$(SUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +ssymm_RL.$(SUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +dsymm_LU.$(SUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +dsymm_LL.$(SUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +dsymm_RU.$(SUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +dsymm_RL.$(SUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +qsymm_LU.$(SUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +qsymm_LL.$(SUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +qsymm_RU.$(SUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +qsymm_RL.$(SUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +csymm_LU.$(SUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +csymm_LL.$(SUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +csymm_RU.$(SUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +csymm_RL.$(SUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +zsymm_LU.$(SUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +zsymm_LL.$(SUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +zsymm_RU.$(SUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +zsymm_RL.$(SUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +xsymm_LU.$(SUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +xsymm_LL.$(SUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +xsymm_RU.$(SUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +xsymm_RL.$(SUFFIX) : symm_k.c level3.c 
../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +ssymm_thread_LU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +ssymm_thread_LL.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +ssymm_thread_RU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +ssymm_thread_RL.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +dsymm_thread_LU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +dsymm_thread_LL.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +dsymm_thread_RU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +dsymm_thread_RL.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +qsymm_thread_LU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +qsymm_thread_LL.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +qsymm_thread_RU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +qsymm_thread_RL.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +csymm_thread_LU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +csymm_thread_LL.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +csymm_thread_RU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +csymm_thread_RL.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +zsymm_thread_LU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +zsymm_thread_LL.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +zsymm_thread_RU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +zsymm_thread_RL.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +xsymm_thread_LU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +xsymm_thread_LL.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c 
-DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +xsymm_thread_RU.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +xsymm_thread_RL.$(SUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +ssyrk_UN.$(SUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +ssyrk_UT.$(SUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +ssyrk_LN.$(SUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +ssyrk_LT.$(SUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +dsyrk_UN.$(SUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +dsyrk_UT.$(SUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +dsyrk_LN.$(SUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +dsyrk_LT.$(SUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +qsyrk_UN.$(SUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +qsyrk_UT.$(SUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +qsyrk_LN.$(SUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +qsyrk_LT.$(SUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +csyrk_UN.$(SUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +csyrk_UT.$(SUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +csyrk_LN.$(SUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +csyrk_LT.$(SUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +zsyrk_UN.$(SUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +zsyrk_UT.$(SUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +zsyrk_LN.$(SUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +zsyrk_LT.$(SUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +xsyrk_UN.$(SUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +xsyrk_UT.$(SUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +xsyrk_LN.$(SUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +xsyrk_LT.$(SUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +ssyrk_thread_UN.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +ssyrk_thread_UT.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +ssyrk_thread_LN.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c 
-DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +ssyrk_thread_LT.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +dsyrk_thread_UN.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +dsyrk_thread_UT.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +dsyrk_thread_LN.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +dsyrk_thread_LT.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +qsyrk_thread_UN.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +qsyrk_thread_UT.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +qsyrk_thread_LN.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +qsyrk_thread_LT.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +csyrk_thread_UN.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +csyrk_thread_UT.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +csyrk_thread_LN.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +csyrk_thread_LT.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +zsyrk_thread_UN.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +zsyrk_thread_UT.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +zsyrk_thread_LN.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +zsyrk_thread_LT.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +xsyrk_thread_UN.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +xsyrk_thread_UT.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +xsyrk_thread_LN.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +xsyrk_thread_LT.$(SUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +ssyrk_kernel_U.$(SUFFIX) : syrk_kernel.c + $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -ULOWER $< -o $(@F) + +ssyrk_kernel_L.$(SUFFIX) : syrk_kernel.c + $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -DLOWER $< -o $(@F) + +dsyrk_kernel_U.$(SUFFIX) : syrk_kernel.c + $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -ULOWER $< -o $(@F) + +dsyrk_kernel_L.$(SUFFIX) : 
syrk_kernel.c + $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -DLOWER $< -o $(@F) + +qsyrk_kernel_U.$(SUFFIX) : syrk_kernel.c + $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER $< -o $(@F) + +qsyrk_kernel_L.$(SUFFIX) : syrk_kernel.c + $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER $< -o $(@F) + +csyrk_kernel_U.$(SUFFIX) : syrk_kernel.c + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) + +csyrk_kernel_L.$(SUFFIX) : syrk_kernel.c + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) + +zsyrk_kernel_U.$(SUFFIX) : syrk_kernel.c + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) + +zsyrk_kernel_L.$(SUFFIX) : syrk_kernel.c + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) + +xsyrk_kernel_U.$(SUFFIX) : syrk_kernel.c + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) + +xsyrk_kernel_L.$(SUFFIX) : syrk_kernel.c + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) + +syrk_thread.$(SUFFIX) : syrk_thread.c ../../common.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +ssyr2k_UN.$(SUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +ssyr2k_UT.$(SUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +ssyr2k_LN.$(SUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +ssyr2k_LT.$(SUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +dsyr2k_UN.$(SUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +dsyr2k_UT.$(SUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +dsyr2k_LN.$(SUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +dsyr2k_LT.$(SUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +qsyr2k_UN.$(SUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +qsyr2k_UT.$(SUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +qsyr2k_LN.$(SUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +qsyr2k_LT.$(SUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +csyr2k_UN.$(SUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +csyr2k_UT.$(SUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +csyr2k_LN.$(SUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +csyr2k_LT.$(SUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +zsyr2k_UN.$(SUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +zsyr2k_UT.$(SUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +zsyr2k_LN.$(SUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +zsyr2k_LT.$(SUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +xsyr2k_UN.$(SUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +xsyr2k_UT.$(SUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(CFLAGS) 
-DXDOUBLE -DCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +xsyr2k_LN.$(SUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +xsyr2k_LT.$(SUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +ssyr2k_kernel_U.$(SUFFIX) : syr2k_kernel.c + $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -ULOWER $< -o $(@F) + +ssyr2k_kernel_L.$(SUFFIX) : syr2k_kernel.c + $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -DLOWER $< -o $(@F) + +dsyr2k_kernel_U.$(SUFFIX) : syr2k_kernel.c + $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -ULOWER $< -o $(@F) + +dsyr2k_kernel_L.$(SUFFIX) : syr2k_kernel.c + $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -DLOWER $< -o $(@F) + +qsyr2k_kernel_U.$(SUFFIX) : syr2k_kernel.c + $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER $< -o $(@F) + +qsyr2k_kernel_L.$(SUFFIX) : syr2k_kernel.c + $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER $< -o $(@F) + +csyr2k_kernel_U.$(SUFFIX) : syr2k_kernel.c + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) + +csyr2k_kernel_L.$(SUFFIX) : syr2k_kernel.c + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) + +zsyr2k_kernel_U.$(SUFFIX) : syr2k_kernel.c + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) + +zsyr2k_kernel_L.$(SUFFIX) : syr2k_kernel.c + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) + +xsyr2k_kernel_U.$(SUFFIX) : syr2k_kernel.c + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) + +xsyr2k_kernel_L.$(SUFFIX) : syr2k_kernel.c + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) + +chemm_LU.$(SUFFIX) : zhemm_k.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +chemm_LL.$(SUFFIX) : zhemm_k.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +chemm_RU.$(SUFFIX) : zhemm_k.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F) + +chemm_RL.$(SUFFIX) : zhemm_k.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNC $< -o $(@F) + +zhemm_LU.$(SUFFIX) : zhemm_k.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +zhemm_LL.$(SUFFIX) : zhemm_k.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +zhemm_RU.$(SUFFIX) : zhemm_k.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F) + +zhemm_RL.$(SUFFIX) : zhemm_k.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNC $< -o $(@F) + +xhemm_LU.$(SUFFIX) : zhemm_k.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +xhemm_LL.$(SUFFIX) : zhemm_k.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +xhemm_RU.$(SUFFIX) : zhemm_k.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F) + +xhemm_RL.$(SUFFIX) : zhemm_k.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNC $< -o $(@F) + +chemm_thread_LU.$(SUFFIX) : zhemm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +chemm_thread_LL.$(SUFFIX) : zhemm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +chemm_thread_RU.$(SUFFIX) : zhemm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F) + +chemm_thread_RL.$(SUFFIX) : zhemm_k.c 
level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNC $< -o $(@F) + +zhemm_thread_LU.$(SUFFIX) : zhemm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +zhemm_thread_LL.$(SUFFIX) : zhemm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +zhemm_thread_RU.$(SUFFIX) : zhemm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F) + +zhemm_thread_RL.$(SUFFIX) : zhemm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNC $< -o $(@F) + +xhemm_thread_LU.$(SUFFIX) : zhemm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +xhemm_thread_LL.$(SUFFIX) : zhemm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +xhemm_thread_RU.$(SUFFIX) : zhemm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F) + +xhemm_thread_RL.$(SUFFIX) : zhemm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNC $< -o $(@F) + +cherk_UN.$(SUFFIX) : zherk_k.c level3_syrk.c ../../common.h + $(CC) -c $(CFLAGS) -DHERK -UDOUBLE -ULOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) + +cherk_UC.$(SUFFIX) : zherk_k.c level3_syrk.c ../../common.h + $(CC) -c $(CFLAGS) -DHERK -UDOUBLE -ULOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) + +cherk_LN.$(SUFFIX) : zherk_k.c level3_syrk.c ../../common.h + $(CC) -c $(CFLAGS) -DHERK -UDOUBLE -DLOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) + +cherk_LC.$(SUFFIX) : zherk_k.c level3_syrk.c ../../common.h + $(CC) -c $(CFLAGS) -DHERK -UDOUBLE -DLOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) + +zherk_UN.$(SUFFIX) : zherk_k.c level3_syrk.c ../../common.h + $(CC) -c $(CFLAGS) -DHERK -DDOUBLE -ULOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) + +zherk_UC.$(SUFFIX) : zherk_k.c level3_syrk.c ../../common.h + $(CC) -c $(CFLAGS) -DHERK -DDOUBLE -ULOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) + +zherk_LN.$(SUFFIX) : zherk_k.c level3_syrk.c ../../common.h + $(CC) -c $(CFLAGS) -DHERK -DDOUBLE -DLOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) + +zherk_LC.$(SUFFIX) : zherk_k.c level3_syrk.c ../../common.h + $(CC) -c $(CFLAGS) -DHERK -DDOUBLE -DLOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) + +xherk_UN.$(SUFFIX) : zherk_k.c level3_syrk.c ../../common.h + $(CC) -c $(CFLAGS) -DHERK -DXDOUBLE -ULOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) + +xherk_UC.$(SUFFIX) : zherk_k.c level3_syrk.c ../../common.h + $(CC) -c $(CFLAGS) -DHERK -DXDOUBLE -ULOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) + +xherk_LN.$(SUFFIX) : zherk_k.c level3_syrk.c ../../common.h + $(CC) -c $(CFLAGS) -DHERK -DXDOUBLE -DLOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) + +xherk_LC.$(SUFFIX) : zherk_k.c level3_syrk.c ../../common.h + $(CC) -c $(CFLAGS) -DHERK -DXDOUBLE -DLOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) + +cherk_kernel_UN.$(SUFFIX) : zherk_kernel.c + $(CC) -c $(CFLAGS) -DHERK -UDOUBLE -DCOMPLEX -ULOWER -UCONJ $< -o $(@F) + +cherk_kernel_UC.$(SUFFIX) : zherk_kernel.c + $(CC) -c $(CFLAGS) -DHERK -UDOUBLE -DCOMPLEX -ULOWER -DCONJ $< -o $(@F) + +cherk_kernel_LN.$(SUFFIX) : zherk_kernel.c + $(CC) -c $(CFLAGS) -DHERK -UDOUBLE -DCOMPLEX -DLOWER -UCONJ $< -o $(@F) + +cherk_kernel_LC.$(SUFFIX) 
: zherk_kernel.c + $(CC) -c $(CFLAGS) -DHERK -UDOUBLE -DCOMPLEX -DLOWER -DCONJ $< -o $(@F) + +zherk_kernel_UN.$(SUFFIX) : zherk_kernel.c + $(CC) -c $(CFLAGS) -DHERK -DDOUBLE -DCOMPLEX -ULOWER -UCONJ $< -o $(@F) + +zherk_kernel_UC.$(SUFFIX) : zherk_kernel.c + $(CC) -c $(CFLAGS) -DHERK -DDOUBLE -DCOMPLEX -ULOWER -DCONJ $< -o $(@F) + +zherk_kernel_LN.$(SUFFIX) : zherk_kernel.c + $(CC) -c $(CFLAGS) -DHERK -DDOUBLE -DCOMPLEX -DLOWER -UCONJ $< -o $(@F) + +zherk_kernel_LC.$(SUFFIX) : zherk_kernel.c + $(CC) -c $(CFLAGS) -DHERK -DDOUBLE -DCOMPLEX -DLOWER -DCONJ $< -o $(@F) + +xherk_kernel_UN.$(SUFFIX) : zherk_kernel.c + $(CC) -c $(CFLAGS) -DHERK -DXDOUBLE -DCOMPLEX -ULOWER -UCONJ $< -o $(@F) + +xherk_kernel_UC.$(SUFFIX) : zherk_kernel.c + $(CC) -c $(CFLAGS) -DHERK -DXDOUBLE -DCOMPLEX -ULOWER -DCONJ $< -o $(@F) + +xherk_kernel_LN.$(SUFFIX) : zherk_kernel.c + $(CC) -c $(CFLAGS) -DHERK -DXDOUBLE -DCOMPLEX -DLOWER -UCONJ $< -o $(@F) + +xherk_kernel_LC.$(SUFFIX) : zherk_kernel.c + $(CC) -c $(CFLAGS) -DHERK -DXDOUBLE -DCOMPLEX -DLOWER -DCONJ $< -o $(@F) + +cherk_thread_UN.$(SUFFIX) : zherk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DHERK -UDOUBLE -DCOMPLEX -ULOWER -UTRANS -UCONJ $< -o $(@F) + +cherk_thread_UC.$(SUFFIX) : zherk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DHERK -UDOUBLE -DCOMPLEX -ULOWER -DTRANS -DCONJ $< -o $(@F) + +cherk_thread_LN.$(SUFFIX) : zherk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DHERK -UDOUBLE -DCOMPLEX -DLOWER -UTRANS -UCONJ $< -o $(@F) + +cherk_thread_LC.$(SUFFIX) : zherk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DHERK -UDOUBLE -DCOMPLEX -DLOWER -DTRANS -DCONJ $< -o $(@F) + +zherk_thread_UN.$(SUFFIX) : zherk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DHERK -DDOUBLE -DCOMPLEX -ULOWER -UTRANS -UCONJ $< -o $(@F) + +zherk_thread_UC.$(SUFFIX) : zherk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DHERK -DDOUBLE -DCOMPLEX -ULOWER -DTRANS -DCONJ $< -o $(@F) + +zherk_thread_LN.$(SUFFIX) : zherk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DHERK -DDOUBLE -DCOMPLEX -DLOWER -UTRANS -UCONJ $< -o $(@F) + +zherk_thread_LC.$(SUFFIX) : zherk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DHERK -DDOUBLE -DCOMPLEX -DLOWER -DTRANS -DCONJ $< -o $(@F) + +xherk_thread_UN.$(SUFFIX) : zherk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DHERK -DXDOUBLE -DCOMPLEX -ULOWER -UTRANS -UCONJ $< -o $(@F) + +xherk_thread_UC.$(SUFFIX) : zherk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DHERK -DXDOUBLE -DCOMPLEX -ULOWER -DTRANS -DCONJ $< -o $(@F) + +xherk_thread_LN.$(SUFFIX) : zherk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DHERK -DXDOUBLE -DCOMPLEX -DLOWER -UTRANS -UCONJ $< -o $(@F) + +xherk_thread_LC.$(SUFFIX) : zherk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DHERK -DXDOUBLE -DCOMPLEX -DLOWER -DTRANS -DCONJ $< -o $(@F) + +cher2k_UN.$(SUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h + $(CC) -c $(CFLAGS) -DHER2K -UDOUBLE -ULOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) + +cher2k_UC.$(SUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h + $(CC) -c $(CFLAGS) -DHER2K -UDOUBLE -ULOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) + +cher2k_LN.$(SUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h + $(CC) -c $(CFLAGS) -DHER2K -UDOUBLE -DLOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) + +cher2k_LC.$(SUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h + $(CC) -c 
$(CFLAGS) -DHER2K -UDOUBLE -DLOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) + +zher2k_UN.$(SUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h + $(CC) -c $(CFLAGS) -DHER2K -DDOUBLE -ULOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) + +zher2k_UC.$(SUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h + $(CC) -c $(CFLAGS) -DHER2K -DDOUBLE -ULOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) + +zher2k_LN.$(SUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h + $(CC) -c $(CFLAGS) -DHER2K -DDOUBLE -DLOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) + +zher2k_LC.$(SUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h + $(CC) -c $(CFLAGS) -DHER2K -DDOUBLE -DLOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) + +xher2k_UN.$(SUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h + $(CC) -c $(CFLAGS) -DHER2K -DXDOUBLE -ULOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) + +xher2k_UC.$(SUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h + $(CC) -c $(CFLAGS) -DHER2K -DXDOUBLE -ULOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) + +xher2k_LN.$(SUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h + $(CC) -c $(CFLAGS) -DHER2K -DXDOUBLE -DLOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) + +xher2k_LC.$(SUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h + $(CC) -c $(CFLAGS) -DHER2K -DXDOUBLE -DLOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) + +cher2k_kernel_UN.$(SUFFIX) : zher2k_kernel.c + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -UCONJ $< -o $(@F) + +cher2k_kernel_UC.$(SUFFIX) : zher2k_kernel.c + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DCONJ $< -o $(@F) + +cher2k_kernel_LN.$(SUFFIX) : zher2k_kernel.c + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -UCONJ $< -o $(@F) + +cher2k_kernel_LC.$(SUFFIX) : zher2k_kernel.c + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DCONJ $< -o $(@F) + +zher2k_kernel_UN.$(SUFFIX) : zher2k_kernel.c + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -UCONJ $< -o $(@F) + +zher2k_kernel_UC.$(SUFFIX) : zher2k_kernel.c + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DCONJ $< -o $(@F) + +zher2k_kernel_LN.$(SUFFIX) : zher2k_kernel.c + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -UCONJ $< -o $(@F) + +zher2k_kernel_LC.$(SUFFIX) : zher2k_kernel.c + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DCONJ $< -o $(@F) + +xher2k_kernel_UN.$(SUFFIX) : zher2k_kernel.c + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -UCONJ $< -o $(@F) + +xher2k_kernel_UC.$(SUFFIX) : zher2k_kernel.c + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DCONJ $< -o $(@F) + +xher2k_kernel_LN.$(SUFFIX) : zher2k_kernel.c + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -UCONJ $< -o $(@F) + +xher2k_kernel_LC.$(SUFFIX) : zher2k_kernel.c + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DCONJ $< -o $(@F) + +cgemm3m_nn.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $(@F) + +cgemm3m_nt.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNT $< -o $(@F) + +cgemm3m_nr.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNR $< -o $(@F) + +cgemm3m_nc.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $(@F) + +cgemm3m_tn.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTN $< -o $(@F) + +cgemm3m_tt.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTT $< -o $(@F) + +cgemm3m_tr.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTR $< -o $(@F) + +cgemm3m_tc.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) 
$(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTC $< -o $(@F) + +cgemm3m_rn.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRN $< -o $(@F) + +cgemm3m_rt.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRT $< -o $(@F) + +cgemm3m_rr.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) + +cgemm3m_rc.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) + +cgemm3m_cn.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) + +cgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) + +cgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) + +cgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) + +zgemm3m_nn.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $(@F) + +zgemm3m_nt.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNT $< -o $(@F) + +zgemm3m_nr.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNR $< -o $(@F) + +zgemm3m_nc.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $(@F) + +zgemm3m_tn.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTN $< -o $(@F) + +zgemm3m_tt.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTT $< -o $(@F) + +zgemm3m_tr.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTR $< -o $(@F) + +zgemm3m_tc.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTC $< -o $(@F) + +zgemm3m_rn.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRN $< -o $(@F) + +zgemm3m_rt.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRT $< -o $(@F) + +zgemm3m_rr.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) + +zgemm3m_rc.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) + +zgemm3m_cn.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) + +zgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) + +zgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) + +zgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) + +xgemm3m_nn.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $(@F) + +xgemm3m_nt.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DNT $< -o $(@F) + +xgemm3m_nr.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DNR $< -o $(@F) + +xgemm3m_nc.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DNC $< -o $(@F) + +xgemm3m_tn.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DTN $< -o $(@F) + 
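# [Editorial sketch -- not part of the imported GotoBLAS2 Makefile or of this
# patch.]  The cgemm3m_*/zgemm3m_*/xgemm3m_* rules above and below all compile
# the same pair of sources, gemm3m.c and gemm3m_level3.c; the c/z/x prefix
# picks the precision (-UDOUBLE, -DDOUBLE, -DXDOUBLE, always with -DCOMPLEX),
# and the two-letter suffix names how A and B are accessed (n = no transpose,
# t = transpose, r = conjugate, c = conjugate transpose), passed to the
# preprocessor as -DNN ... -DCC.  The sixteen codes are simply the cross
# product of the four access modes, e.g. in GNU make (A_MODES, B_MODES and
# GEMM3M_CODES are hypothetical names used only for this illustration):
A_MODES := N T R C
B_MODES := N T R C
GEMM3M_CODES := $(foreach a,$(A_MODES),$(foreach b,$(B_MODES),$(a)$(b)))
# GEMM3M_CODES now expands to "NN NT NR NC TN TT TR TC RN RT RR RC CN CT CR CC";
# each code corresponds to one of the rules here, compiled with that code as a
# -D flag on the command line.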
+xgemm3m_tt.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DTT $< -o $(@F) + +xgemm3m_tr.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DTR $< -o $(@F) + +xgemm3m_tc.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DTC $< -o $(@F) + +xgemm3m_rn.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRN $< -o $(@F) + +xgemm3m_rt.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRT $< -o $(@F) + +xgemm3m_rr.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) + +xgemm3m_rc.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) + +xgemm3m_cn.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) + +xgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) + +xgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) + +xgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) + +cgemmf.$(SUFFIX) : zgemmf.c + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX $< -o $(@F) + +zgemmf.$(SUFFIX) : zgemmf.c + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX $< -o $(@F) + +xgemmf.$(SUFFIX) : zgemmf.c + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX $< -o $(@F) + +cgemm3m_thread_nn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DNN $< -o $(@F) + +cgemm3m_thread_nt.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DNT $< -o $(@F) + +cgemm3m_thread_nr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DNR $< -o $(@F) + +cgemm3m_thread_nc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DNC $< -o $(@F) + +cgemm3m_thread_tn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DTN $< -o $(@F) + +cgemm3m_thread_tt.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DTT $< -o $(@F) + +cgemm3m_thread_tr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DTR $< -o $(@F) + +cgemm3m_thread_tc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DTC $< -o $(@F) + +cgemm3m_thread_rn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRN $< -o $(@F) + +cgemm3m_thread_rt.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRT $< -o $(@F) + +cgemm3m_thread_rr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) + +cgemm3m_thread_rc.$(SUFFIX) : 
gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) + +cgemm3m_thread_cn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) + +cgemm3m_thread_ct.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) + +cgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) + +cgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) + +zgemm3m_thread_nn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DNN $< -o $(@F) + +zgemm3m_thread_nt.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DNT $< -o $(@F) + +zgemm3m_thread_nr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DNR $< -o $(@F) + +zgemm3m_thread_nc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DNC $< -o $(@F) + +zgemm3m_thread_tn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DTN $< -o $(@F) + +zgemm3m_thread_tt.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DTT $< -o $(@F) + +zgemm3m_thread_tr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DTR $< -o $(@F) + +zgemm3m_thread_tc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DTC $< -o $(@F) + +zgemm3m_thread_rn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRN $< -o $(@F) + +zgemm3m_thread_rt.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRT $< -o $(@F) + +zgemm3m_thread_rr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) + +zgemm3m_thread_rc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) + +zgemm3m_thread_cn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) + +zgemm3m_thread_ct.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) + +zgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) + +zgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) + +xgemm3m_thread_nn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE 
-DCOMPLEX -DNN $< -o $(@F) + +xgemm3m_thread_nt.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DNT $< -o $(@F) + +xgemm3m_thread_nr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DNR $< -o $(@F) + +xgemm3m_thread_nc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DNC $< -o $(@F) + +xgemm3m_thread_tn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DTN $< -o $(@F) + +xgemm3m_thread_tt.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DTT $< -o $(@F) + +xgemm3m_thread_tr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DTR $< -o $(@F) + +xgemm3m_thread_tc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DTC $< -o $(@F) + +xgemm3m_thread_rn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRN $< -o $(@F) + +xgemm3m_thread_rt.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRT $< -o $(@F) + +xgemm3m_thread_rr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) + +xgemm3m_thread_rc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) + +xgemm3m_thread_cn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) + +xgemm3m_thread_ct.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) + +xgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) + +xgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) + +csymm3m_LU.$(SUFFIX) : symm3m_k.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +csymm3m_LL.$(SUFFIX) : symm3m_k.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +csymm3m_RU.$(SUFFIX) : symm3m_k.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +csymm3m_RL.$(SUFFIX) : symm3m_k.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +zsymm3m_LU.$(SUFFIX) : symm3m_k.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +zsymm3m_LL.$(SUFFIX) : symm3m_k.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +zsymm3m_RU.$(SUFFIX) : symm3m_k.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +zsymm3m_RL.$(SUFFIX) : symm3m_k.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +xsymm3m_LU.$(SUFFIX) : symm3m_k.c ../../param.h + $(CC) 
-c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +xsymm3m_LL.$(SUFFIX) : symm3m_k.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +xsymm3m_RU.$(SUFFIX) : symm3m_k.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +xsymm3m_RL.$(SUFFIX) : symm3m_k.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +csymm3m_thread_LU.$(SUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +csymm3m_thread_LL.$(SUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +csymm3m_thread_RU.$(SUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +csymm3m_thread_RL.$(SUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +zsymm3m_thread_LU.$(SUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +zsymm3m_thread_LL.$(SUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +zsymm3m_thread_RU.$(SUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +zsymm3m_thread_RL.$(SUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +xsymm3m_thread_LU.$(SUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +xsymm3m_thread_LL.$(SUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +xsymm3m_thread_RU.$(SUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +xsymm3m_thread_RL.$(SUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +chemm3m_LU.$(SUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +chemm3m_LL.$(SUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +chemm3m_RU.$(SUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +chemm3m_RL.$(SUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +zhemm3m_LU.$(SUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +zhemm3m_LL.$(SUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +zhemm3m_RU.$(SUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +zhemm3m_RL.$(SUFFIX) : hemm3m_k.c gemm3m_level3.c 
../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +xhemm3m_LU.$(SUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +xhemm3m_LL.$(SUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +xhemm3m_RU.$(SUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +xhemm3m_RL.$(SUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +chemm3m_thread_LU.$(SUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +chemm3m_thread_LL.$(SUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +chemm3m_thread_RU.$(SUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +chemm3m_thread_RL.$(SUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +zhemm3m_thread_LU.$(SUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +zhemm3m_thread_LL.$(SUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +zhemm3m_thread_RU.$(SUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +zhemm3m_thread_RL.$(SUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +xhemm3m_thread_LU.$(SUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +xhemm3m_thread_LL.$(SUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +xhemm3m_thread_RU.$(SUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +xhemm3m_thread_RL.$(SUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(CFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +strsm_LNUU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) + +strsm_LNUN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) + +strsm_LNLU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) + +strsm_LNLN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) + +strsm_LTUU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) + +strsm_LTUN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) + +strsm_LTLU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) + +strsm_LTLN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) 
-UCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) + +strsm_RNUU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) + +strsm_RNUN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) + +strsm_RNLU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) + +strsm_RNLN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) + +strsm_RTUU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) + +strsm_RTUN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) + +strsm_RTLU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) + +strsm_RTLN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) + +dtrsm_LNUU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) + +dtrsm_LNUN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) + +dtrsm_LNLU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) + +dtrsm_LNLN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) + +dtrsm_LTUU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) + +dtrsm_LTUN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) + +dtrsm_LTLU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) + +dtrsm_LTLN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) + +dtrsm_RNUU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) + +dtrsm_RNUN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) + +dtrsm_RNLU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) + +dtrsm_RNLN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) + +dtrsm_RTUU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) + +dtrsm_RTUN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) + +dtrsm_RTLU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) + +dtrsm_RTLN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) + +qtrsm_LNUU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) + +qtrsm_LNUN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) + +qtrsm_LNLU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) + +qtrsm_LNLN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) + +qtrsm_LTUU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) + +qtrsm_LTUN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) + +qtrsm_LTLU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) + +qtrsm_LTLN.$(SUFFIX) : trsm_L.c + $(CC) -c 
$(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) + +qtrsm_RNUU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) + +qtrsm_RNUN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) + +qtrsm_RNLU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) + +qtrsm_RNLN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) + +qtrsm_RTUU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) + +qtrsm_RTUN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) + +qtrsm_RTLU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) + +qtrsm_RTLN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) + +ctrsm_LNUU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrsm_LNUN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrsm_LNLU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrsm_LNLN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrsm_LTUU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrsm_LTUN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrsm_LTLU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrsm_LTLN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrsm_LRUU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrsm_LRUN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrsm_LRLU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrsm_LRLN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrsm_LCUU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrsm_LCUN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrsm_LCLU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrsm_LCLN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrsm_RNUU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrsm_RNUN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrsm_RNLU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrsm_RNLN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrsm_RTUU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrsm_RTUN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o 
$(@F) + +ctrsm_RTLU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrsm_RTLN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrsm_RRUU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrsm_RRUN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrsm_RRLU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrsm_RRLN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrsm_RCUU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrsm_RCUN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrsm_RCLU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrsm_RCLN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + + +ztrsm_LNUU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrsm_LNUN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrsm_LNLU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrsm_LNLN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrsm_LTUU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrsm_LTUN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrsm_LTLU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrsm_LTLN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrsm_LRUU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrsm_LRUN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrsm_LRLU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrsm_LRLN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrsm_LCUU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrsm_LCUN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrsm_LCLU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrsm_LCLN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrsm_RNUU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrsm_RNUN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrsm_RNLU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrsm_RNLN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + 
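# [Editorial sketch -- not part of the imported GotoBLAS2 Makefile or of this
# patch.]  Every *trsm_* object above and below is built from one of two
# generic sources, trsm_L.c (left side) or trsm_R.c (right side).  The prefix
# selects the precision (s/d/q real, c/z/x complex, via -UCOMPLEX/-DCOMPLEX
# and -UDOUBLE/-DDOUBLE/-DXDOUBLE), and the four-letter suffix encodes Side
# (L/R, i.e. which source file), TransA (N/T/R/C, mapped onto the
# -DTRANSA/-DCONJ flag pair), Uplo (U/L -> -DUPPER/-UUPPER) and Diag
# (U/N -> -DUNIT/-UUNIT).  Below is a minimal GNU make sketch of that pattern
# for the single-precision complex set only; the helper and variable names are
# hypothetical, the imported Makefile spells all 32 rules out explicitly, and
# the recipe line must start with a tab.
SIDES   := L R
TRANSAS := N T R C
UPLOS   := U L
DIAGS   := U N
transa_flags = $(if $(filter T C,$(1)),-DTRANSA,-UTRANSA) $(if $(filter R C,$(1)),-DCONJ,-UCONJ)
uplo_flag    = $(if $(filter U,$(1)),-DUPPER,-UUPPER)
diag_flag    = $(if $(filter U,$(1)),-DUNIT,-UUNIT)
define CTRSM_RULE
ctrsm_$(1)$(2)$(3)$(4).$$(SUFFIX) : trsm_$(1).c
	$$(CC) -c $$(CFLAGS) -DCOMPLEX -UDOUBLE $(call transa_flags,$(2)) $(call uplo_flag,$(3)) $(call diag_flag,$(4)) $$< -o $$(@F)
endef
$(foreach s,$(SIDES),$(foreach t,$(TRANSAS),$(foreach u,$(UPLOS),$(foreach d,$(DIAGS),$(eval $(call CTRSM_RULE,$(s),$(t),$(u),$(d)))))))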
+ztrsm_RTUU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrsm_RTUN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrsm_RTLU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrsm_RTLN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrsm_RRUU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrsm_RRUN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrsm_RRLU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrsm_RRLN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrsm_RCUU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrsm_RCUN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrsm_RCLU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrsm_RCLN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrsm_LNUU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrsm_LNUN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrsm_LNLU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrsm_LNLN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrsm_LTUU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrsm_LTUN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrsm_LTLU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrsm_LTLN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrsm_LRUU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrsm_LRUN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrsm_LRLU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrsm_LRLN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrsm_LCUU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrsm_LCUN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrsm_LCLU.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrsm_LCLN.$(SUFFIX) : trsm_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrsm_RNUU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrsm_RNUN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o 
$(@F) + +xtrsm_RNLU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrsm_RNLN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrsm_RTUU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrsm_RTUN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrsm_RTLU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrsm_RTLN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrsm_RRUU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrsm_RRUN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrsm_RRLU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrsm_RRLN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrsm_RCUU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrsm_RCUN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrsm_RCLU.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrsm_RCLN.$(SUFFIX) : trsm_R.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +sgemm_nn.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) + +sgemm_nt.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) + +sgemm_tn.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) + +sgemm_tt.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) + +dgemm_nn.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -UCOMPLEX -DNN $< -o $(@F) + +dgemm_nt.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -UCOMPLEX -DNT $< -o $(@F) + +dgemm_tn.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -UCOMPLEX -DTN $< -o $(@F) + +dgemm_tt.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -UCOMPLEX -DTT $< -o $(@F) + +qgemm_nn.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -UCOMPLEX -DNN $< -o $(@F) + +qgemm_nt.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -UCOMPLEX -DNT $< -o $(@F) + +qgemm_tn.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -UCOMPLEX -DTN $< -o $(@F) + +qgemm_tt.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -UCOMPLEX -DTT $< -o $(@F) + +cgemm_nn.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $(@F) + +cgemm_nt.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNT $< -o $(@F) + +cgemm_nr.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNR $< -o $(@F) + +cgemm_nc.$(PSUFFIX) : gemm.c level3.c 
../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $(@F) + +cgemm_tn.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTN $< -o $(@F) + +cgemm_tt.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTT $< -o $(@F) + +cgemm_tr.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTR $< -o $(@F) + +cgemm_tc.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTC $< -o $(@F) + +cgemm_rn.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRN $< -o $(@F) + +cgemm_rt.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRT $< -o $(@F) + +cgemm_rr.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) + +cgemm_rc.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) + +cgemm_cn.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) + +cgemm_ct.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) + +cgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) + +cgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) + +zgemm_nn.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $(@F) + +zgemm_nt.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNT $< -o $(@F) + +zgemm_nr.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNR $< -o $(@F) + +zgemm_nc.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $(@F) + +zgemm_tn.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTN $< -o $(@F) + +zgemm_tt.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTT $< -o $(@F) + +zgemm_tr.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTR $< -o $(@F) + +zgemm_tc.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTC $< -o $(@F) + +zgemm_rn.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRN $< -o $(@F) + +zgemm_rt.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRT $< -o $(@F) + +zgemm_rr.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) + +zgemm_rc.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) + +zgemm_cn.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) + +zgemm_ct.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) + +zgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) + +zgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) + +xgemm_nn.$(PSUFFIX) : gemm.c level3.c ../../param.h + 
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $(@F) + +xgemm_nt.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DNT $< -o $(@F) + +xgemm_nr.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DNR $< -o $(@F) + +xgemm_nc.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DNC $< -o $(@F) + +xgemm_tn.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DTN $< -o $(@F) + +xgemm_tt.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DTT $< -o $(@F) + +xgemm_tr.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DTR $< -o $(@F) + +xgemm_tc.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DTC $< -o $(@F) + +xgemm_rn.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRN $< -o $(@F) + +xgemm_rt.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRT $< -o $(@F) + +xgemm_rr.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) + +xgemm_rc.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) + +xgemm_cn.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) + +xgemm_ct.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) + +xgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) + +xgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) + +gemm_thread_m.$(PSUFFIX) : gemm_thread_m.c ../../common.h + $(CC) -c $(PFLAGS) $< -o $(@F) + +gemm_thread_n.$(PSUFFIX) : gemm_thread_n.c ../../common.h + $(CC) -c $(PFLAGS) $< -o $(@F) + +gemm_thread_mn.$(PSUFFIX) : gemm_thread_mn.c ../../common.h + $(CC) -c $(PFLAGS) $< -o $(@F) + +gemm_thread_variable.$(PSUFFIX) : gemm_thread_variable.c ../../common.h + $(CC) -c $(PFLAGS) $< -o $(@F) + +beta_thread.$(PSUFFIX) : beta_thread.c ../../common.h + $(CC) -c $(PFLAGS) $< -o $(@F) + + +sgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) + +sgemm_thread_nt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) + +sgemm_thread_tn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) + +sgemm_thread_tt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) + +dgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -UCOMPLEX -DNN $< -o $(@F) + +dgemm_thread_nt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -UCOMPLEX -DNT $< -o $(@F) + +dgemm_thread_tn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -UCOMPLEX -DTN $< -o $(@F) + +dgemm_thread_tt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) 
$(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -UCOMPLEX -DTT $< -o $(@F) + +qgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -UCOMPLEX -DNN $< -o $(@F) + +qgemm_thread_nt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -UCOMPLEX -DNT $< -o $(@F) + +qgemm_thread_tn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -UCOMPLEX -DTN $< -o $(@F) + +qgemm_thread_tt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -UCOMPLEX -DTT $< -o $(@F) + +cgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DNN $< -o $(@F) + +cgemm_thread_nt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DNT $< -o $(@F) + +cgemm_thread_nr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DNR $< -o $(@F) + +cgemm_thread_nc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DNC $< -o $(@F) + +cgemm_thread_tn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DTN $< -o $(@F) + +cgemm_thread_tt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DTT $< -o $(@F) + +cgemm_thread_tr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DTR $< -o $(@F) + +cgemm_thread_tc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DTC $< -o $(@F) + +cgemm_thread_rn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRN $< -o $(@F) + +cgemm_thread_rt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRT $< -o $(@F) + +cgemm_thread_rr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) + +cgemm_thread_rc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) + +cgemm_thread_cn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) + +cgemm_thread_ct.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) + +cgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) + +cgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) + +zgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DNN $< -o $(@F) + +zgemm_thread_nt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DNT $< -o $(@F) + +zgemm_thread_nr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) 
$(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DNR $< -o $(@F) + +zgemm_thread_nc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DNC $< -o $(@F) + +zgemm_thread_tn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DTN $< -o $(@F) + +zgemm_thread_tt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DTT $< -o $(@F) + +zgemm_thread_tr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DTR $< -o $(@F) + +zgemm_thread_tc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DTC $< -o $(@F) + +zgemm_thread_rn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRN $< -o $(@F) + +zgemm_thread_rt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRT $< -o $(@F) + +zgemm_thread_rr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) + +zgemm_thread_rc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) + +zgemm_thread_cn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) + +zgemm_thread_ct.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) + +zgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) + +zgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) + +xgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DNN $< -o $(@F) + +xgemm_thread_nt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DNT $< -o $(@F) + +xgemm_thread_nr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DNR $< -o $(@F) + +xgemm_thread_nc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DNC $< -o $(@F) + +xgemm_thread_tn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DTN $< -o $(@F) + +xgemm_thread_tt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DTT $< -o $(@F) + +xgemm_thread_tr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DTR $< -o $(@F) + +xgemm_thread_tc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DTC $< -o $(@F) + +xgemm_thread_rn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRN $< -o $(@F) + +xgemm_thread_rt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) 
$(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRT $< -o $(@F) + +xgemm_thread_rr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) + +xgemm_thread_rc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) + +xgemm_thread_cn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) + +xgemm_thread_ct.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) + +xgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) + +xgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) + +strmm_LNUU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) + +strmm_LNUN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) + +strmm_LNLU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) + +strmm_LNLN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) + +strmm_LTUU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) + +strmm_LTUN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) + +strmm_LTLU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) + +strmm_LTLN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) + +strmm_RNUU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) + +strmm_RNUN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) + +strmm_RNLU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) + +strmm_RNLN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) + +strmm_RTUU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) + +strmm_RTUN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) + +strmm_RTLU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) + +strmm_RTLN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) + +dtrmm_LNUU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) + +dtrmm_LNUN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) + +dtrmm_LNLU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) + +dtrmm_LNLN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) + +dtrmm_LTUU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) + +dtrmm_LTUN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) + +dtrmm_LTLU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE 
-DTRANSA -UUPPER -DUNIT $< -o $(@F) + +dtrmm_LTLN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) + +dtrmm_RNUU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) + +dtrmm_RNUN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) + +dtrmm_RNLU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) + +dtrmm_RNLN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) + +dtrmm_RTUU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) + +dtrmm_RTUN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) + +dtrmm_RTLU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) + +dtrmm_RTLN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) + +qtrmm_LNUU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) + +qtrmm_LNUN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) + +qtrmm_LNLU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) + +qtrmm_LNLN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) + +qtrmm_LTUU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) + +qtrmm_LTUN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) + +qtrmm_LTLU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) + +qtrmm_LTLN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) + +qtrmm_RNUU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) + +qtrmm_RNUN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) + +qtrmm_RNLU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) + +qtrmm_RNLN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) + +qtrmm_RTUU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) + +qtrmm_RTUN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) + +qtrmm_RTLU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) + +qtrmm_RTLN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) + +ctrmm_LNUU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrmm_LNUN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrmm_LNLU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrmm_LNLN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrmm_LTUU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrmm_LTUN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ 
$< -o $(@F) + +ctrmm_LTLU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrmm_LTLN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrmm_LRUU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrmm_LRUN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrmm_LRLU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrmm_LRLN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrmm_LCUU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrmm_LCUN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrmm_LCLU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrmm_LCLN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrmm_RNUU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrmm_RNUN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrmm_RNLU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrmm_RNLN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrmm_RTUU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrmm_RTUN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrmm_RTLU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrmm_RTLN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrmm_RRUU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrmm_RRUN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrmm_RRLU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrmm_RRLN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrmm_RCUU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrmm_RCUN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrmm_RCLU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrmm_RCLN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrmm_LNUU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrmm_LNUN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrmm_LNLU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrmm_LNLN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA 
-UUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrmm_LTUU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrmm_LTUN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrmm_LTLU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrmm_LTLN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrmm_LRUU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrmm_LRUN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrmm_LRLU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrmm_LRLN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrmm_LCUU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrmm_LCUN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrmm_LCLU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrmm_LCLN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrmm_RNUU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrmm_RNUN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrmm_RNLU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrmm_RNLN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrmm_RTUU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrmm_RTUN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrmm_RTLU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrmm_RTLN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrmm_RRUU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrmm_RRUN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrmm_RRLU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrmm_RRLN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrmm_RCUU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrmm_RCUN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrmm_RCLU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrmm_RCLN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrmm_LNUU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrmm_LNUN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX 
-DXDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrmm_LNLU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrmm_LNLN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrmm_LTUU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrmm_LTUN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrmm_LTLU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrmm_LTLN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrmm_LRUU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrmm_LRUN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrmm_LRLU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrmm_LRLN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrmm_LCUU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrmm_LCUN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrmm_LCLU.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrmm_LCLN.$(PSUFFIX) : trmm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrmm_RNUU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrmm_RNUN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrmm_RNLU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrmm_RNLN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrmm_RTUU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrmm_RTUN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrmm_RTLU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrmm_RTLN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrmm_RRUU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrmm_RRUN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrmm_RRLU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrmm_RRLN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrmm_RCUU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrmm_RCUN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrmm_RCLU.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + 
+xtrmm_RCLN.$(PSUFFIX) : trmm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ssymm_LU.$(PSUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +ssymm_LL.$(PSUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +ssymm_RU.$(PSUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +ssymm_RL.$(PSUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +dsymm_LU.$(PSUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +dsymm_LL.$(PSUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +dsymm_RU.$(PSUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +dsymm_RL.$(PSUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +qsymm_LU.$(PSUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +qsymm_LL.$(PSUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +qsymm_RU.$(PSUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +qsymm_RL.$(PSUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +csymm_LU.$(PSUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +csymm_LL.$(PSUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +csymm_RU.$(PSUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +csymm_RL.$(PSUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +zsymm_LU.$(PSUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +zsymm_LL.$(PSUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +zsymm_RU.$(PSUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +zsymm_RL.$(PSUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +xsymm_LU.$(PSUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +xsymm_LL.$(PSUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +xsymm_RU.$(PSUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +xsymm_RL.$(PSUFFIX) : symm_k.c level3.c ../../param.h + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +ssymm_thread_LU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +ssymm_thread_LL.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -URSIDE 
-DNN $< -o $(@F) + +ssymm_thread_RU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +ssymm_thread_RL.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +dsymm_thread_LU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +dsymm_thread_LL.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +dsymm_thread_RU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +dsymm_thread_RL.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +qsymm_thread_LU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +qsymm_thread_LL.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +qsymm_thread_RU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +qsymm_thread_RL.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +csymm_thread_LU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +csymm_thread_LL.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +csymm_thread_RU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +csymm_thread_RL.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +zsymm_thread_LU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +zsymm_thread_LL.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +zsymm_thread_RU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +zsymm_thread_RL.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +xsymm_thread_LU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +xsymm_thread_LL.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +xsymm_thread_RU.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +xsymm_thread_RL.$(PSUFFIX) : symm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE 
-DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +ssyrk_UN.$(PSUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +ssyrk_UT.$(PSUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +ssyrk_LN.$(PSUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +ssyrk_LT.$(PSUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +dsyrk_UN.$(PSUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +dsyrk_UT.$(PSUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +dsyrk_LN.$(PSUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +dsyrk_LT.$(PSUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +qsyrk_UN.$(PSUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +qsyrk_UT.$(PSUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +qsyrk_LN.$(PSUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +qsyrk_LT.$(PSUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +csyrk_UN.$(PSUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +csyrk_UT.$(PSUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +csyrk_LN.$(PSUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +csyrk_LT.$(PSUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +zsyrk_UN.$(PSUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +zsyrk_UT.$(PSUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +zsyrk_LN.$(PSUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +zsyrk_LT.$(PSUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +xsyrk_UN.$(PSUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +xsyrk_UT.$(PSUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +xsyrk_LN.$(PSUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +xsyrk_LT.$(PSUFFIX) : syrk_k.c level3_syrk.c + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +ssyrk_thread_UN.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +ssyrk_thread_UT.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +ssyrk_thread_LN.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +ssyrk_thread_LT.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +dsyrk_thread_UN.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE 
-UCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +dsyrk_thread_UT.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +dsyrk_thread_LN.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +dsyrk_thread_LT.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +qsyrk_thread_UN.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +qsyrk_thread_UT.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +qsyrk_thread_LN.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +qsyrk_thread_LT.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +csyrk_thread_UN.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +csyrk_thread_UT.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +csyrk_thread_LN.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +csyrk_thread_LT.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +zsyrk_thread_UN.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +zsyrk_thread_UT.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +zsyrk_thread_LN.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +zsyrk_thread_LT.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +xsyrk_thread_UN.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +xsyrk_thread_UT.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +xsyrk_thread_LN.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +xsyrk_thread_LT.$(PSUFFIX) : syrk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +ssyrk_kernel_U.$(PSUFFIX) : syrk_kernel.c + $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -ULOWER $< -o $(@F) + +ssyrk_kernel_L.$(PSUFFIX) : syrk_kernel.c + $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -DLOWER $< -o $(@F) + +dsyrk_kernel_U.$(PSUFFIX) : syrk_kernel.c + $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -ULOWER $< -o $(@F) + +dsyrk_kernel_L.$(PSUFFIX) : syrk_kernel.c + $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -DLOWER $< -o $(@F) + +qsyrk_kernel_U.$(PSUFFIX) : syrk_kernel.c + $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER $< -o $(@F) + +qsyrk_kernel_L.$(PSUFFIX) : syrk_kernel.c + $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER $< -o $(@F) + 
+csyrk_kernel_U.$(PSUFFIX) : syrk_kernel.c + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) + +csyrk_kernel_L.$(PSUFFIX) : syrk_kernel.c + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) + +zsyrk_kernel_U.$(PSUFFIX) : syrk_kernel.c + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) + +zsyrk_kernel_L.$(PSUFFIX) : syrk_kernel.c + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) + +xsyrk_kernel_U.$(PSUFFIX) : syrk_kernel.c + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) + +xsyrk_kernel_L.$(PSUFFIX) : syrk_kernel.c + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) + +syrk_thread.$(PSUFFIX) : syrk_thread.c ../../common.h + $(CC) -c $(PFLAGS) $< -o $(@F) + +ssyr2k_UN.$(PSUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +ssyr2k_UT.$(PSUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +ssyr2k_LN.$(PSUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +ssyr2k_LT.$(PSUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +dsyr2k_UN.$(PSUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +dsyr2k_UT.$(PSUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +dsyr2k_LN.$(PSUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +dsyr2k_LT.$(PSUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +qsyr2k_UN.$(PSUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +qsyr2k_UT.$(PSUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +qsyr2k_LN.$(PSUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +qsyr2k_LT.$(PSUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +csyr2k_UN.$(PSUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +csyr2k_UT.$(PSUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +csyr2k_LN.$(PSUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +csyr2k_LT.$(PSUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +zsyr2k_UN.$(PSUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +zsyr2k_UT.$(PSUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +zsyr2k_LN.$(PSUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +zsyr2k_LT.$(PSUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +xsyr2k_UN.$(PSUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -UTRANS $< -o $(@F) + +xsyr2k_UT.$(PSUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DTRANS $< -o $(@F) + +xsyr2k_LN.$(PSUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -UTRANS $< -o $(@F) + +xsyr2k_LT.$(PSUFFIX) : syr2k_k.c level3_syr2k.c + $(CC) -c $(PFLAGS) -DXDOUBLE 
-DCOMPLEX -DLOWER -DTRANS $< -o $(@F) + +ssyr2k_kernel_U.$(PSUFFIX) : syr2k_kernel.c + $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -ULOWER $< -o $(@F) + +ssyr2k_kernel_L.$(PSUFFIX) : syr2k_kernel.c + $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -DLOWER $< -o $(@F) + +dsyr2k_kernel_U.$(PSUFFIX) : syr2k_kernel.c + $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -ULOWER $< -o $(@F) + +dsyr2k_kernel_L.$(PSUFFIX) : syr2k_kernel.c + $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -DLOWER $< -o $(@F) + +qsyr2k_kernel_U.$(PSUFFIX) : syr2k_kernel.c + $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -ULOWER $< -o $(@F) + +qsyr2k_kernel_L.$(PSUFFIX) : syr2k_kernel.c + $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -DLOWER $< -o $(@F) + +csyr2k_kernel_U.$(PSUFFIX) : syr2k_kernel.c + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) + +csyr2k_kernel_L.$(PSUFFIX) : syr2k_kernel.c + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) + +zsyr2k_kernel_U.$(PSUFFIX) : syr2k_kernel.c + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) + +zsyr2k_kernel_L.$(PSUFFIX) : syr2k_kernel.c + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) + +xsyr2k_kernel_U.$(PSUFFIX) : syr2k_kernel.c + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER $< -o $(@F) + +xsyr2k_kernel_L.$(PSUFFIX) : syr2k_kernel.c + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER $< -o $(@F) + +chemm_LU.$(PSUFFIX) : zhemm_k.c ../../param.h + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +chemm_LL.$(PSUFFIX) : zhemm_k.c ../../param.h + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +chemm_RU.$(PSUFFIX) : zhemm_k.c ../../param.h + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F) + +chemm_RL.$(PSUFFIX) : zhemm_k.c ../../param.h + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNC $< -o $(@F) + +zhemm_LU.$(PSUFFIX) : zhemm_k.c ../../param.h + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +zhemm_LL.$(PSUFFIX) : zhemm_k.c ../../param.h + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +zhemm_RU.$(PSUFFIX) : zhemm_k.c ../../param.h + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F) + +zhemm_RL.$(PSUFFIX) : zhemm_k.c ../../param.h + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNC $< -o $(@F) + +xhemm_LU.$(PSUFFIX) : zhemm_k.c ../../param.h + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +xhemm_LL.$(PSUFFIX) : zhemm_k.c ../../param.h + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +xhemm_RU.$(PSUFFIX) : zhemm_k.c ../../param.h + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F) + +xhemm_RL.$(PSUFFIX) : zhemm_k.c ../../param.h + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNC $< -o $(@F) + +chemm_thread_LU.$(PSUFFIX) : zhemm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +chemm_thread_LL.$(PSUFFIX) : zhemm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +chemm_thread_RU.$(PSUFFIX) : zhemm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F) + +chemm_thread_RL.$(PSUFFIX) : zhemm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNC $< -o $(@F) + +zhemm_thread_LU.$(PSUFFIX) : zhemm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 
$(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +zhemm_thread_LL.$(PSUFFIX) : zhemm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +zhemm_thread_RU.$(PSUFFIX) : zhemm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F) + +zhemm_thread_RL.$(PSUFFIX) : zhemm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNC $< -o $(@F) + +xhemm_thread_LU.$(PSUFFIX) : zhemm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +xhemm_thread_LL.$(PSUFFIX) : zhemm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +xhemm_thread_RU.$(PSUFFIX) : zhemm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNC $< -o $(@F) + +xhemm_thread_RL.$(PSUFFIX) : zhemm_k.c level3_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNC $< -o $(@F) + +cherk_UN.$(PSUFFIX) : zherk_k.c level3_syrk.c ../../common.h + $(CC) -c $(PFLAGS) -DHERK -UDOUBLE -ULOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) + +cherk_UC.$(PSUFFIX) : zherk_k.c level3_syrk.c ../../common.h + $(CC) -c $(PFLAGS) -DHERK -UDOUBLE -ULOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) + +cherk_LN.$(PSUFFIX) : zherk_k.c level3_syrk.c ../../common.h + $(CC) -c $(PFLAGS) -DHERK -UDOUBLE -DLOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) + +cherk_LC.$(PSUFFIX) : zherk_k.c level3_syrk.c ../../common.h + $(CC) -c $(PFLAGS) -DHERK -UDOUBLE -DLOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) + +zherk_UN.$(PSUFFIX) : zherk_k.c level3_syrk.c ../../common.h + $(CC) -c $(PFLAGS) -DHERK -DDOUBLE -ULOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) + +zherk_UC.$(PSUFFIX) : zherk_k.c level3_syrk.c ../../common.h + $(CC) -c $(PFLAGS) -DHERK -DDOUBLE -ULOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) + +zherk_LN.$(PSUFFIX) : zherk_k.c level3_syrk.c ../../common.h + $(CC) -c $(PFLAGS) -DHERK -DDOUBLE -DLOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) + +zherk_LC.$(PSUFFIX) : zherk_k.c level3_syrk.c ../../common.h + $(CC) -c $(PFLAGS) -DHERK -DDOUBLE -DLOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) + +xherk_UN.$(PSUFFIX) : zherk_k.c level3_syrk.c ../../common.h + $(CC) -c $(PFLAGS) -DHERK -DXDOUBLE -ULOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) + +xherk_UC.$(PSUFFIX) : zherk_k.c level3_syrk.c ../../common.h + $(CC) -c $(PFLAGS) -DHERK -DXDOUBLE -ULOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) + +xherk_LN.$(PSUFFIX) : zherk_k.c level3_syrk.c ../../common.h + $(CC) -c $(PFLAGS) -DHERK -DXDOUBLE -DLOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) + +xherk_LC.$(PSUFFIX) : zherk_k.c level3_syrk.c ../../common.h + $(CC) -c $(PFLAGS) -DHERK -DXDOUBLE -DLOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) + +cherk_kernel_UN.$(PSUFFIX) : zherk_kernel.c + $(CC) -c $(PFLAGS) -DHERK -UDOUBLE -DCOMPLEX -ULOWER -UCONJ $< -o $(@F) + +cherk_kernel_UC.$(PSUFFIX) : zherk_kernel.c + $(CC) -c $(PFLAGS) -DHERK -UDOUBLE -DCOMPLEX -ULOWER -DCONJ $< -o $(@F) + +cherk_kernel_LN.$(PSUFFIX) : zherk_kernel.c + $(CC) -c $(PFLAGS) -DHERK -UDOUBLE -DCOMPLEX -DLOWER -UCONJ $< -o $(@F) + +cherk_kernel_LC.$(PSUFFIX) : zherk_kernel.c + $(CC) -c $(PFLAGS) -DHERK -UDOUBLE -DCOMPLEX -DLOWER -DCONJ $< -o $(@F) + +zherk_kernel_UN.$(PSUFFIX) : zherk_kernel.c + $(CC) -c $(PFLAGS) -DHERK -DDOUBLE -DCOMPLEX -ULOWER 
-UCONJ $< -o $(@F) + +zherk_kernel_UC.$(PSUFFIX) : zherk_kernel.c + $(CC) -c $(PFLAGS) -DHERK -DDOUBLE -DCOMPLEX -ULOWER -DCONJ $< -o $(@F) + +zherk_kernel_LN.$(PSUFFIX) : zherk_kernel.c + $(CC) -c $(PFLAGS) -DHERK -DDOUBLE -DCOMPLEX -DLOWER -UCONJ $< -o $(@F) + +zherk_kernel_LC.$(PSUFFIX) : zherk_kernel.c + $(CC) -c $(PFLAGS) -DHERK -DDOUBLE -DCOMPLEX -DLOWER -DCONJ $< -o $(@F) + +xherk_kernel_UN.$(PSUFFIX) : zherk_kernel.c + $(CC) -c $(PFLAGS) -DHERK -DXDOUBLE -DCOMPLEX -ULOWER -UCONJ $< -o $(@F) + +xherk_kernel_UC.$(PSUFFIX) : zherk_kernel.c + $(CC) -c $(PFLAGS) -DHERK -DXDOUBLE -DCOMPLEX -ULOWER -DCONJ $< -o $(@F) + +xherk_kernel_LN.$(PSUFFIX) : zherk_kernel.c + $(CC) -c $(PFLAGS) -DHERK -DXDOUBLE -DCOMPLEX -DLOWER -UCONJ $< -o $(@F) + +xherk_kernel_LC.$(PSUFFIX) : zherk_kernel.c + $(CC) -c $(PFLAGS) -DHERK -DXDOUBLE -DCOMPLEX -DLOWER -DCONJ $< -o $(@F) + +cherk_thread_UN.$(PSUFFIX) : zherk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DHERK -UDOUBLE -DCOMPLEX -ULOWER -UTRANS -UCONJ $< -o $(@F) + +cherk_thread_UC.$(PSUFFIX) : zherk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DHERK -UDOUBLE -DCOMPLEX -ULOWER -DTRANS -DCONJ $< -o $(@F) + +cherk_thread_LN.$(PSUFFIX) : zherk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DHERK -UDOUBLE -DCOMPLEX -DLOWER -UTRANS -UCONJ $< -o $(@F) + +cherk_thread_LC.$(PSUFFIX) : zherk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DHERK -UDOUBLE -DCOMPLEX -DLOWER -DTRANS -DCONJ $< -o $(@F) + +zherk_thread_UN.$(PSUFFIX) : zherk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DHERK -DDOUBLE -DCOMPLEX -ULOWER -UTRANS -UCONJ $< -o $(@F) + +zherk_thread_UC.$(PSUFFIX) : zherk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DHERK -DDOUBLE -DCOMPLEX -ULOWER -DTRANS -DCONJ $< -o $(@F) + +zherk_thread_LN.$(PSUFFIX) : zherk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DHERK -DDOUBLE -DCOMPLEX -DLOWER -UTRANS -UCONJ $< -o $(@F) + +zherk_thread_LC.$(PSUFFIX) : zherk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DHERK -DDOUBLE -DCOMPLEX -DLOWER -DTRANS -DCONJ $< -o $(@F) + +xherk_thread_UN.$(PSUFFIX) : zherk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DHERK -DXDOUBLE -DCOMPLEX -ULOWER -UTRANS -UCONJ $< -o $(@F) + +xherk_thread_UC.$(PSUFFIX) : zherk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DHERK -DXDOUBLE -DCOMPLEX -ULOWER -DTRANS -DCONJ $< -o $(@F) + +xherk_thread_LN.$(PSUFFIX) : zherk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DHERK -DXDOUBLE -DCOMPLEX -DLOWER -UTRANS -UCONJ $< -o $(@F) + +xherk_thread_LC.$(PSUFFIX) : zherk_k.c level3_syrk_threaded.c + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DHERK -DXDOUBLE -DCOMPLEX -DLOWER -DTRANS -DCONJ $< -o $(@F) + +cher2k_UN.$(PSUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h + $(CC) -c $(PFLAGS) -DHER2K -UDOUBLE -ULOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) + +cher2k_UC.$(PSUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h + $(CC) -c $(PFLAGS) -DHER2K -UDOUBLE -ULOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) + +cher2k_LN.$(PSUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h + $(CC) -c $(PFLAGS) -DHER2K -UDOUBLE -DLOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) + +cher2k_LC.$(PSUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h + $(CC) -c $(PFLAGS) -DHER2K -UDOUBLE -DLOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) + +zher2k_UN.$(PSUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h + $(CC) -c $(PFLAGS) -DHER2K 
-DDOUBLE -ULOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) + +zher2k_UC.$(PSUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h + $(CC) -c $(PFLAGS) -DHER2K -DDOUBLE -ULOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) + +zher2k_LN.$(PSUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h + $(CC) -c $(PFLAGS) -DHER2K -DDOUBLE -DLOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) + +zher2k_LC.$(PSUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h + $(CC) -c $(PFLAGS) -DHER2K -DDOUBLE -DLOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) + +xher2k_UN.$(PSUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h + $(CC) -c $(PFLAGS) -DHER2K -DXDOUBLE -ULOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) + +xher2k_UC.$(PSUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h + $(CC) -c $(PFLAGS) -DHER2K -DXDOUBLE -ULOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) + +xher2k_LN.$(PSUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h + $(CC) -c $(PFLAGS) -DHER2K -DXDOUBLE -DLOWER -UTRANS -DCOMPLEX -UCONJ $< -o $(@F) + +xher2k_LC.$(PSUFFIX) : zher2k_k.c level3_syr2k.c ../../common.h + $(CC) -c $(PFLAGS) -DHER2K -DXDOUBLE -DLOWER -DTRANS -DCOMPLEX -DCONJ $< -o $(@F) + +cher2k_kernel_UN.$(PSUFFIX) : zher2k_kernel.c + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -UCONJ $< -o $(@F) + +cher2k_kernel_UC.$(PSUFFIX) : zher2k_kernel.c + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DCONJ $< -o $(@F) + +cher2k_kernel_LN.$(PSUFFIX) : zher2k_kernel.c + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -UCONJ $< -o $(@F) + +cher2k_kernel_LC.$(PSUFFIX) : zher2k_kernel.c + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DCONJ $< -o $(@F) + +zher2k_kernel_UN.$(PSUFFIX) : zher2k_kernel.c + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -UCONJ $< -o $(@F) + +zher2k_kernel_UC.$(PSUFFIX) : zher2k_kernel.c + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DCONJ $< -o $(@F) + +zher2k_kernel_LN.$(PSUFFIX) : zher2k_kernel.c + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -UCONJ $< -o $(@F) + +zher2k_kernel_LC.$(PSUFFIX) : zher2k_kernel.c + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DCONJ $< -o $(@F) + +xher2k_kernel_UN.$(PSUFFIX) : zher2k_kernel.c + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -UCONJ $< -o $(@F) + +xher2k_kernel_UC.$(PSUFFIX) : zher2k_kernel.c + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DCONJ $< -o $(@F) + +xher2k_kernel_LN.$(PSUFFIX) : zher2k_kernel.c + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -UCONJ $< -o $(@F) + +xher2k_kernel_LC.$(PSUFFIX) : zher2k_kernel.c + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DCONJ $< -o $(@F) + +cgemm3m_nn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $(@F) + +cgemm3m_nt.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNT $< -o $(@F) + +cgemm3m_nr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNR $< -o $(@F) + +cgemm3m_nc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $(@F) + +cgemm3m_tn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTN $< -o $(@F) + +cgemm3m_tt.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTT $< -o $(@F) + +cgemm3m_tr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTR $< -o $(@F) + +cgemm3m_tc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DTC $< -o $(@F) + +cgemm3m_rn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE 
-DCOMPLEX -DRN $< -o $(@F) + +cgemm3m_rt.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRT $< -o $(@F) + +cgemm3m_rr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) + +cgemm3m_rc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) + +cgemm3m_cn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) + +cgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) + +cgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) + +cgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) + +zgemm3m_nn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $(@F) + +zgemm3m_nt.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNT $< -o $(@F) + +zgemm3m_nr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNR $< -o $(@F) + +zgemm3m_nc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $(@F) + +zgemm3m_tn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTN $< -o $(@F) + +zgemm3m_tt.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTT $< -o $(@F) + +zgemm3m_tr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTR $< -o $(@F) + +zgemm3m_tc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DTC $< -o $(@F) + +zgemm3m_rn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRN $< -o $(@F) + +zgemm3m_rt.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRT $< -o $(@F) + +zgemm3m_rr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) + +zgemm3m_rc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) + +zgemm3m_cn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) + +zgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) + +zgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) + +zgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c + $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) + +xgemm3m_nn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $(@F) + +xgemm3m_nt.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DNT $< -o $(@F) + +xgemm3m_nr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DNR $< -o $(@F) + +xgemm3m_nc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DNC $< -o $(@F) + +xgemm3m_tn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DTN $< -o $(@F) + +xgemm3m_tt.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX 
-DTT $< -o $(@F) + +xgemm3m_tr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DTR $< -o $(@F) + +xgemm3m_tc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DTC $< -o $(@F) + +xgemm3m_rn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRN $< -o $(@F) + +xgemm3m_rt.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRT $< -o $(@F) + +xgemm3m_rr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) + +xgemm3m_rc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) + +xgemm3m_cn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) + +xgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) + +xgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) + +xgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) + +cgemmf.$(PSUFFIX) : zgemmf.c + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX $< -o $(@F) + +zgemmf.$(PSUFFIX) : zgemmf.c + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX $< -o $(@F) + +xgemmf.$(PSUFFIX) : zgemmf.c + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX $< -o $(@F) + +cgemm3m_thread_nn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DNN $< -o $(@F) + +cgemm3m_thread_nt.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DNT $< -o $(@F) + +cgemm3m_thread_nr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DNR $< -o $(@F) + +cgemm3m_thread_nc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DNC $< -o $(@F) + +cgemm3m_thread_tn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DTN $< -o $(@F) + +cgemm3m_thread_tt.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DTT $< -o $(@F) + +cgemm3m_thread_tr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DTR $< -o $(@F) + +cgemm3m_thread_tc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DTC $< -o $(@F) + +cgemm3m_thread_rn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRN $< -o $(@F) + +cgemm3m_thread_rt.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRT $< -o $(@F) + +cgemm3m_thread_rr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) + +cgemm3m_thread_rc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 
-UDOUBLE -DCOMPLEX -DRC $< -o $(@F) + +cgemm3m_thread_cn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) + +cgemm3m_thread_ct.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) + +cgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) + +cgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) + +zgemm3m_thread_nn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DNN $< -o $(@F) + +zgemm3m_thread_nt.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DNT $< -o $(@F) + +zgemm3m_thread_nr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DNR $< -o $(@F) + +zgemm3m_thread_nc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DNC $< -o $(@F) + +zgemm3m_thread_tn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DTN $< -o $(@F) + +zgemm3m_thread_tt.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DTT $< -o $(@F) + +zgemm3m_thread_tr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DTR $< -o $(@F) + +zgemm3m_thread_tc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DTC $< -o $(@F) + +zgemm3m_thread_rn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRN $< -o $(@F) + +zgemm3m_thread_rt.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRT $< -o $(@F) + +zgemm3m_thread_rr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) + +zgemm3m_thread_rc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) + +zgemm3m_thread_cn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) + +zgemm3m_thread_ct.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) + +zgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) + +zgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) + +xgemm3m_thread_nn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DNN $< -o $(@F) + +xgemm3m_thread_nt.$(PSUFFIX) : gemm3m.c 
level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DNT $< -o $(@F) + +xgemm3m_thread_nr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DNR $< -o $(@F) + +xgemm3m_thread_nc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DNC $< -o $(@F) + +xgemm3m_thread_tn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DTN $< -o $(@F) + +xgemm3m_thread_tt.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DTT $< -o $(@F) + +xgemm3m_thread_tr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DTR $< -o $(@F) + +xgemm3m_thread_tc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DTC $< -o $(@F) + +xgemm3m_thread_rn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRN $< -o $(@F) + +xgemm3m_thread_rt.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRT $< -o $(@F) + +xgemm3m_thread_rr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) + +xgemm3m_thread_rc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) + +xgemm3m_thread_cn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) + +xgemm3m_thread_ct.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) + +xgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) + +xgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) + +csymm3m_LU.$(PSUFFIX) : symm3m_k.c ../../param.h + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +csymm3m_LL.$(PSUFFIX) : symm3m_k.c ../../param.h + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +csymm3m_RU.$(PSUFFIX) : symm3m_k.c ../../param.h + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +csymm3m_RL.$(PSUFFIX) : symm3m_k.c ../../param.h + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +zsymm3m_LU.$(PSUFFIX) : symm3m_k.c ../../param.h + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +zsymm3m_LL.$(PSUFFIX) : symm3m_k.c ../../param.h + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +zsymm3m_RU.$(PSUFFIX) : symm3m_k.c ../../param.h + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +zsymm3m_RL.$(PSUFFIX) : symm3m_k.c ../../param.h + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +xsymm3m_LU.$(PSUFFIX) : symm3m_k.c ../../param.h + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER 
-URSIDE -DNN $< -o $(@F) + +xsymm3m_LL.$(PSUFFIX) : symm3m_k.c ../../param.h + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +xsymm3m_RU.$(PSUFFIX) : symm3m_k.c ../../param.h + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +xsymm3m_RL.$(PSUFFIX) : symm3m_k.c ../../param.h + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +csymm3m_thread_LU.$(PSUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +csymm3m_thread_LL.$(PSUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +csymm3m_thread_RU.$(PSUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +csymm3m_thread_RL.$(PSUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +zsymm3m_thread_LU.$(PSUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +zsymm3m_thread_LL.$(PSUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +zsymm3m_thread_RU.$(PSUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +zsymm3m_thread_RL.$(PSUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +xsymm3m_thread_LU.$(PSUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +xsymm3m_thread_LL.$(PSUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +xsymm3m_thread_RU.$(PSUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +xsymm3m_thread_RL.$(PSUFFIX) : symm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +chemm3m_LU.$(PSUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +chemm3m_LL.$(PSUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +chemm3m_RU.$(PSUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +chemm3m_RL.$(PSUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +zhemm3m_LU.$(PSUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +zhemm3m_LL.$(PSUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +zhemm3m_RU.$(PSUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +zhemm3m_RL.$(PSUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h + $(CC) -c 
$(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +xhemm3m_LU.$(PSUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +xhemm3m_LL.$(PSUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +xhemm3m_RU.$(PSUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +xhemm3m_RL.$(PSUFFIX) : hemm3m_k.c gemm3m_level3.c ../../param.h + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +chemm3m_thread_LU.$(PSUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +chemm3m_thread_LL.$(PSUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +chemm3m_thread_RU.$(PSUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +chemm3m_thread_RL.$(PSUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -UDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +zhemm3m_thread_LU.$(PSUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +zhemm3m_thread_LL.$(PSUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +zhemm3m_thread_RU.$(PSUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +zhemm3m_thread_RL.$(PSUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +xhemm3m_thread_LU.$(PSUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -URSIDE -DNN $< -o $(@F) + +xhemm3m_thread_LL.$(PSUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -URSIDE -DNN $< -o $(@F) + +xhemm3m_thread_RU.$(PSUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -ULOWER -DRSIDE -DNN $< -o $(@F) + +xhemm3m_thread_RL.$(PSUFFIX) : hemm3m_k.c level3_gemm3m_thread.c ../../param.h + $(CC) -c -DTHREADED_LEVEL3 $(PFLAGS) -DXDOUBLE -DCOMPLEX -DLOWER -DRSIDE -DNN $< -o $(@F) + +strsm_LNUU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) + +strsm_LNUN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) + +strsm_LNLU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) + +strsm_LNLN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) + +strsm_LTUU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) + +strsm_LTUN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) + +strsm_LTLU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) + +strsm_LTLN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) 
-UCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) + +strsm_RNUU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) + +strsm_RNUN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) + +strsm_RNLU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) + +strsm_RNLN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) + +strsm_RTUU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) + +strsm_RTUN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) + +strsm_RTLU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) + +strsm_RTLN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) + +dtrsm_LNUU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) + +dtrsm_LNUN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) + +dtrsm_LNLU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) + +dtrsm_LNLN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) + +dtrsm_LTUU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) + +dtrsm_LTUN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) + +dtrsm_LTLU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) + +dtrsm_LTLN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) + +dtrsm_RNUU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) + +dtrsm_RNUN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) + +dtrsm_RNLU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) + +dtrsm_RNLN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) + +dtrsm_RTUU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) + +dtrsm_RTUN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) + +dtrsm_RTLU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) + +dtrsm_RTLN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) + +qtrsm_LNUU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) + +qtrsm_LNUN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) + +qtrsm_LNLU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) + +qtrsm_LNLN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) + +qtrsm_LTUU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) + +qtrsm_LTUN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) + +qtrsm_LTLU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) + 
+qtrsm_LTLN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) + +qtrsm_RNUU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT $< -o $(@F) + +qtrsm_RNUN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT $< -o $(@F) + +qtrsm_RNLU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT $< -o $(@F) + +qtrsm_RNLN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT $< -o $(@F) + +qtrsm_RTUU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT $< -o $(@F) + +qtrsm_RTUN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT $< -o $(@F) + +qtrsm_RTLU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT $< -o $(@F) + +qtrsm_RTLN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT $< -o $(@F) + +ctrsm_LNUU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrsm_LNUN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrsm_LNLU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrsm_LNLN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrsm_LTUU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrsm_LTUN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrsm_LTLU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrsm_LTLN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrsm_LRUU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrsm_LRUN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrsm_LRLU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrsm_LRLN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrsm_LCUU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrsm_LCUN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrsm_LCLU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrsm_LCLN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrsm_RNUU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrsm_RNUN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrsm_RNLU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrsm_RNLN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrsm_RTUU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrsm_RTUN.$(PSUFFIX) : trsm_R.c + 
$(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrsm_RTLU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ctrsm_RTLN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ctrsm_RRUU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrsm_RRUN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrsm_RRLU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrsm_RRLN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrsm_RCUU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrsm_RCUN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ctrsm_RCLU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ctrsm_RCLN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + + +ztrsm_LNUU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrsm_LNUN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrsm_LNLU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrsm_LNLN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrsm_LTUU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrsm_LTUN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrsm_LTLU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrsm_LTLN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrsm_LRUU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrsm_LRUN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrsm_LRLU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrsm_LRLN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrsm_LCUU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrsm_LCUN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrsm_LCLU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrsm_LCLN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrsm_RNUU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrsm_RNUN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrsm_RNLU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + 
+ztrsm_RNLN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrsm_RTUU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrsm_RTUN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrsm_RTLU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +ztrsm_RTLN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +ztrsm_RRUU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrsm_RRUN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrsm_RRLU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrsm_RRLN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrsm_RCUU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrsm_RCUN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +ztrsm_RCLU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +ztrsm_RCLN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrsm_LNUU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrsm_LNUN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrsm_LNLU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrsm_LNLN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrsm_LTUU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrsm_LTUN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrsm_LTLU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrsm_LTLN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrsm_LRUU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrsm_LRUN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrsm_LRLU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrsm_LRLN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrsm_LCUU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrsm_LCUN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrsm_LCLU.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrsm_LCLN.$(PSUFFIX) : trsm_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrsm_RNUU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA 
-DUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrsm_RNUN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrsm_RNLU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrsm_RNLN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrsm_RTUU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrsm_RTUN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrsm_RTLU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT -UCONJ $< -o $(@F) + +xtrsm_RTLN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -UCONJ $< -o $(@F) + +xtrsm_RRUU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrsm_RRUN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrsm_RRLU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrsm_RRLN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrsm_RCUU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrsm_RCUN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -DUPPER -UUNIT -DCONJ $< -o $(@F) + +xtrsm_RCLU.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -DUNIT -DCONJ $< -o $(@F) + +xtrsm_RCLN.$(PSUFFIX) : trsm_R.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) + +include ../../Makefile.tail diff --git a/driver/level3/gemm.c b/driver/level3/gemm.c new file mode 100644 index 0000000000..2b13da7d70 --- /dev/null +++ b/driver/level3/gemm.c @@ -0,0 +1,66 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#undef TIMING + +#ifdef PARAMTEST +#undef GEMM_P +#undef GEMM_Q +#undef GEMM_R + +#define GEMM_P (args -> gemm_p) +#define GEMM_Q (args -> gemm_q) +#define GEMM_R (args -> gemm_r) +#endif + +#if 0 +#undef GEMM_P +#undef GEMM_Q + +#define GEMM_P 504 +#define GEMM_Q 128 +#endif + +#ifdef THREADED_LEVEL3 +#include "level3_thread.c" +#else +#include "level3.c" +#endif diff --git a/driver/level3/gemm3m.c b/driver/level3/gemm3m.c new file mode 100644 index 0000000000..8f31cf5b27 --- /dev/null +++ b/driver/level3/gemm3m.c @@ -0,0 +1,58 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#undef TIMING + +#ifdef PARAMTEST +#undef GEMM_P +#undef GEMM_Q +#undef GEMM_R + +#define GEMM_P (args -> gemm_p) +#define GEMM_Q (args -> gemm_q) +#define GEMM_R (args -> gemm_r) +#endif + +#ifdef THREADED_LEVEL3 +#include "level3_gemm3m_thread.c" +#else +#include "gemm3m_level3.c" +#endif diff --git a/driver/level3/gemm3m_level3.c b/driver/level3/gemm3m_level3.c new file mode 100644 index 0000000000..8c5473c037 --- /dev/null +++ b/driver/level3/gemm3m_level3.c @@ -0,0 +1,531 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef BETA_OPERATION +#define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \ + GEMM_BETA((M_TO) - (M_FROM), (N_TO - N_FROM), 0, \ + BETA[0], BETA[1], NULL, 0, NULL, 0, \ + (FLOAT *)(C) + (M_FROM) + (N_FROM) * (LDC) * COMPSIZE, LDC) +#endif + +#ifndef ICOPYB_OPERATION +#if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \ + defined(RN) || defined(RT) || defined(RC) || defined(RR) +#define ICOPYB_OPERATION(M, N, A, LDA, X, Y, BUFFER) \ + GEMM3M_ITCOPYB(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER) +#else +#define ICOPYB_OPERATION(M, N, A, LDA, X, Y, BUFFER) \ + GEMM3M_INCOPYB(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER) +#endif +#endif + +#ifndef ICOPYR_OPERATION +#if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \ + defined(RN) || defined(RT) || defined(RC) || defined(RR) +#define ICOPYR_OPERATION(M, N, A, LDA, X, Y, BUFFER) \ + GEMM3M_ITCOPYR(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER) +#else +#define ICOPYR_OPERATION(M, N, A, LDA, X, Y, BUFFER) \ + GEMM3M_INCOPYR(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER) +#endif +#endif + +#ifndef ICOPYI_OPERATION +#if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \ + defined(RN) || defined(RT) || defined(RC) || defined(RR) +#define ICOPYI_OPERATION(M, N, A, LDA, X, Y, BUFFER) \ + GEMM3M_ITCOPYI(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER) +#else +#define ICOPYI_OPERATION(M, N, A, LDA, X, Y, BUFFER) \ + GEMM3M_INCOPYI(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER) +#endif +#endif + + +#ifndef OCOPYB_OPERATION +#if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \ + defined(NR) || defined(TR) || defined(CR) || defined(RR) +#define OCOPYB_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ + GEMM3M_ONCOPYB(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER) +#else +#define OCOPYB_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ + GEMM3M_OTCOPYB(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER) +#endif +#endif + +#ifndef OCOPYR_OPERATION +#if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \ + defined(NR) || defined(TR) || defined(CR) || defined(RR) +#define OCOPYR_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ + GEMM3M_ONCOPYR(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER) +#else +#define OCOPYR_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ + GEMM3M_OTCOPYR(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER) +#endif +#endif + + +#ifndef OCOPYI_OPERATION +#if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \ + defined(NR) || defined(TR) || defined(CR) || defined(RR) +#define OCOPYI_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ + GEMM3M_ONCOPYI(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER) +#else +#define OCOPYI_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ + GEMM3M_OTCOPYI(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER) +#endif +#endif + +#ifndef KERNEL_FUNC +#define KERNEL_FUNC GEMM3M_KERNEL +#endif + +#ifndef KERNEL_OPERATION +#define KERNEL_OPERATION(M, N, K, ALPHA_R, ALPHA_I, SA, SB, C, LDC, X, Y) \ + KERNEL_FUNC(M, N, K, ALPHA_R, ALPHA_I, SA, SB, (FLOAT *)(C) + 
((X) + (Y) * LDC) * COMPSIZE, LDC) +#endif + +#ifndef A +#define A args -> a +#endif +#ifndef LDA +#define LDA args -> lda +#endif +#ifndef B +#define B args -> b +#endif +#ifndef LDB +#define LDB args -> ldb +#endif +#ifndef C +#define C args -> c +#endif +#ifndef LDC +#define LDC args -> ldc +#endif +#ifndef M +#define M args -> m +#endif +#ifndef N +#define N args -> n +#endif +#ifndef K +#define K args -> k +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define ALPHA1 ONE +#define ALPHA2 ONE +#define ALPHA5 ZERO +#define ALPHA6 ONE + +#define ALPHA7 ONE +#define ALPHA8 ZERO +#define ALPHA11 ONE +#define ALPHA12 -ONE + +#define ALPHA13 ZERO +#define ALPHA14 ONE +#define ALPHA17 -ONE +#define ALPHA18 -ONE +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define ALPHA1 ONE +#define ALPHA2 ONE +#define ALPHA5 ONE +#define ALPHA6 ZERO + +#define ALPHA7 ZERO +#define ALPHA8 ONE +#define ALPHA11 -ONE +#define ALPHA12 -ONE + +#define ALPHA13 ONE +#define ALPHA14 ZERO +#define ALPHA17 -ONE +#define ALPHA18 ONE +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define ALPHA1 ONE +#define ALPHA2 ONE +#define ALPHA5 ONE +#define ALPHA6 ZERO + +#define ALPHA7 ZERO +#define ALPHA8 ONE +#define ALPHA11 -ONE +#define ALPHA12 ONE + +#define ALPHA13 ONE +#define ALPHA14 ZERO +#define ALPHA17 -ONE +#define ALPHA18 -ONE +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define ALPHA1 ONE +#define ALPHA2 ONE +#define ALPHA5 ZERO +#define ALPHA6 -ONE + +#define ALPHA7 ONE +#define ALPHA8 ZERO +#define ALPHA11 ONE +#define ALPHA12 ONE + +#define ALPHA13 ZERO +#define ALPHA14 ONE +#define ALPHA17 -ONE +#define ALPHA18 ONE +#endif + +#ifdef TIMING +#define START_RPCC() rpcc_counter = rpcc() +#define STOP_RPCC(COUNTER) COUNTER += rpcc() - rpcc_counter +#else +#define START_RPCC() +#define STOP_RPCC(COUNTER) +#endif + +int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, + FLOAT *sa, FLOAT *sb, BLASLONG dummy){ + BLASLONG k, lda, ldb, ldc; + FLOAT *alpha, *beta; + FLOAT *a, *b, *c; + BLASLONG m_from, m_to, n_from, n_to; + + BLASLONG ls, is, js, jjs; + BLASLONG min_l, min_i, min_j, min_jj; + +#ifdef TIMING + BLASULONG rpcc_counter; + BLASULONG BLASLONG innercost = 0; + BLASULONG BLASLONG outercost = 0; + BLASULONG BLASLONG kernelcost = 0; + double total; +#endif + + k = K; + + a = (FLOAT *)A; + b = (FLOAT *)B; + c = (FLOAT *)C; + + lda = LDA; + ldb = LDB; + ldc = LDC; + + alpha = (FLOAT *)args -> alpha; + beta = (FLOAT *)args -> beta; + + m_from = 0; + m_to = M; + + if (range_m) { + m_from = *(((BLASLONG *)range_m) + 0); + m_to = *(((BLASLONG *)range_m) + 1); + } + + n_from = 0; + n_to = N; + + if (range_n) { + n_from = *(((BLASLONG *)range_n) + 0); + n_to = *(((BLASLONG *)range_n) + 1); + } + + if (beta) { +#ifndef COMPLEX + if (beta[0] != ONE) +#else + if ((beta[0] != ONE) || (beta[1] != ZERO)) +#endif + BETA_OPERATION(m_from, m_to, n_from, n_to, beta, c, ldc); + } + + if ((k == 0) || (alpha == NULL)) return 0; + + if ((alpha[0] == ZERO) +#ifdef COMPLEX + && (alpha[1] == ZERO) +#endif + ) return 0; + +#if 0 + printf("GEMM: M_from : %ld M_to : %ld N_from : %ld N_to : %ld k : %ld\n", m_from, m_to, n_from, n_to, k); + printf("GEMM: P = %4ld Q = %4ld R = %4ld\n", (BLASLONG)GEMM3M_P, (BLASLONG)GEMM3M_Q, (BLASLONG)GEMM3M_R); + printf("GEMM: SA .. %p SB .. 
%p\n", sa, sb); +#endif + +#ifdef DEBUG + innercost = 0; + outercost = 0; + kernelcost = 0; +#endif + + for(js = n_from; js < n_to; js += GEMM3M_R){ + min_j = n_to - js; + if (min_j > GEMM3M_R) min_j = GEMM3M_R; + + for(ls = 0; ls < k; ls += min_l){ + min_l = k - ls; + + if (min_l >= GEMM3M_Q * 2) { + min_l = GEMM3M_Q; + } else { + if (min_l > GEMM3M_Q) { + min_l = (min_l + 1) / 2; +#ifdef UNROLL_X + min_l = (min_l + UNROLL_X - 1) & ~(UNROLL_X - 1); +#endif + } + } + + min_i = m_to - m_from; + if (min_i >= GEMM3M_P * 2) { + min_i = GEMM3M_P; + } else { + if (min_i > GEMM3M_P) { + min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1); + } + } + + START_RPCC(); + + ICOPYB_OPERATION(min_l, min_i, a, lda, ls, m_from, sa); + + STOP_RPCC(innercost); + + for(jjs = js; jjs < js + min_j; jjs += min_jj){ + min_jj = min_j + js - jjs; + if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N; + + START_RPCC(); + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || defined(RN) || defined(RT) || defined(CN) || defined(CT) + OCOPYB_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, sb + min_l * (jjs - js)); +#else + OCOPYB_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, sb + min_l * (jjs - js)); +#endif + + STOP_RPCC(outercost); + + START_RPCC(); + + KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA5, ALPHA6, + sa, sb + min_l * (jjs - js), c, ldc, m_from, jjs); + + STOP_RPCC(kernelcost); + + } + + for(is = m_from + min_i; is < m_to; is += min_i){ + min_i = m_to - is; + if (min_i >= GEMM3M_P * 2) { + min_i = GEMM3M_P; + } else + if (min_i > GEMM3M_P) { + min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1); + } + + START_RPCC(); + + ICOPYB_OPERATION(min_l, min_i, a, lda, ls, is, sa); + + STOP_RPCC(innercost); + + START_RPCC(); + + KERNEL_OPERATION(min_i, min_j, min_l, ALPHA5, ALPHA6, sa, sb, c, ldc, is, js); + + STOP_RPCC(kernelcost); + } + + min_i = m_to - m_from; + if (min_i >= GEMM3M_P * 2) { + min_i = GEMM3M_P; + } else { + if (min_i > GEMM3M_P) { + min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1); + } + } + + START_RPCC(); + + ICOPYR_OPERATION(min_l, min_i, a, lda, ls, m_from, sa); + + STOP_RPCC(innercost); + + for(jjs = js; jjs < js + min_j; jjs += min_jj){ + min_jj = min_j + js - jjs; + if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N; + + START_RPCC(); + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, sb + min_l * (jjs - js)); +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) + OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, sb + min_l * (jjs - js)); +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, sb + min_l * (jjs - js)); +#else + OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, sb + min_l * (jjs - js)); +#endif + + STOP_RPCC(outercost); + + START_RPCC(); + + KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA11, ALPHA12, + sa, sb + min_l * (jjs - js), c, ldc, m_from, jjs); + + STOP_RPCC(kernelcost); + + } + + for(is = m_from + min_i; is < m_to; is += min_i){ + min_i = m_to - is; + if (min_i >= GEMM3M_P * 2) { + min_i = GEMM3M_P; + } else + if (min_i > GEMM3M_P) { + min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1); + } + + START_RPCC(); + + ICOPYR_OPERATION(min_l, min_i, a, lda, ls, is, sa); + + STOP_RPCC(innercost); + + START_RPCC(); + + KERNEL_OPERATION(min_i, min_j, min_l, 
ALPHA11, ALPHA12, sa, sb, c, ldc, is, js); + + STOP_RPCC(kernelcost); + + } + + min_i = m_to - m_from; + if (min_i >= GEMM3M_P * 2) { + min_i = GEMM3M_P; + } else { + if (min_i > GEMM3M_P) { + min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1); + } + } + + START_RPCC(); + + ICOPYI_OPERATION(min_l, min_i, a, lda, ls, m_from, sa); + + STOP_RPCC(innercost); + + for(jjs = js; jjs < js + min_j; jjs += min_jj){ + min_jj = min_j + js - jjs; + if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N; + + START_RPCC(); + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, sb + min_l * (jjs - js)); +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) + OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, sb + min_l * (jjs - js)); +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, sb + min_l * (jjs - js)); +#else + OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, sb + min_l * (jjs - js)); +#endif + + STOP_RPCC(outercost); + + START_RPCC(); + + KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA17, ALPHA18, + sa, sb + min_l * (jjs - js), c, ldc, m_from, jjs); + + STOP_RPCC(kernelcost); + + } + + for(is = m_from + min_i; is < m_to; is += min_i){ + min_i = m_to - is; + if (min_i >= GEMM3M_P * 2) { + min_i = GEMM3M_P; + } else + if (min_i > GEMM3M_P) { + min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1); + } + + START_RPCC(); + + ICOPYI_OPERATION(min_l, min_i, a, lda, ls, is, sa); + + STOP_RPCC(innercost); + + START_RPCC(); + + KERNEL_OPERATION(min_i, min_j, min_l, ALPHA17, ALPHA18, sa, sb, c, ldc, is, js); + + STOP_RPCC(kernelcost); + + } + + } /* end of js */ + } /* end of ls */ + + +#ifdef TIMING + total = (double)outercost + (double)innercost + (double)kernelcost; + + printf( "Copy A : %5.2f Copy B: %5.2f Kernel : %5.2f\n", + innercost / total * 100., outercost / total * 100., + kernelcost / total * 100.); + + printf( " Total %10.3f%% %10.3f MFlops\n", + ((double)(m_to - m_from) * (double)(n_to - n_from) * (double)k) / (double)kernelcost / 2 * 100, + 2400. * (2. * (double)(m_to - m_from) * (double)(n_to - n_from) * (double)k) / (double)kernelcost); +#endif + + return 0; +} diff --git a/driver/level3/gemm_thread_m.c b/driver/level3/gemm_thread_m.c new file mode 100644 index 0000000000..52c9b2d3e1 --- /dev/null +++ b/driver/level3/gemm_thread_m.c @@ -0,0 +1,90 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (*function)(), void *sa, void *sb, BLASLONG nthreads) { + + blas_queue_t queue[MAX_CPU_NUMBER]; + BLASLONG range[MAX_CPU_NUMBER + 1]; + + BLASLONG width, i, num_cpu; + + if (!range_m) { + range[0] = 0; + i = arg -> m; + } else { + range[0] = range_m[0]; + i = range_m[1] - range_m[0]; + } + + num_cpu = 0; + + while (i > 0){ + + width = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu); + + i -= width; + if (i < 0) width = width + i; + + range[num_cpu + 1] = range[num_cpu] + width; + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = function; + queue[num_cpu].args = arg; + queue[num_cpu].range_m = &range[num_cpu]; + queue[num_cpu].range_n = range_n; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + num_cpu ++; + } + + if (num_cpu) { + queue[0].sa = sa; + queue[0].sb = sb; + + queue[num_cpu - 1].next = NULL; + + exec_blas(num_cpu, queue); + } + + return 0; +} diff --git a/driver/level3/gemm_thread_mn.c b/driver/level3/gemm_thread_mn.c new file mode 100644 index 0000000000..321e88f0cd --- /dev/null +++ b/driver/level3/gemm_thread_mn.c @@ -0,0 +1,148 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +static const int divide_rule[][2] = + {{ 0, 0}, + { 1, 1}, { 1, 2}, { 1, 3}, { 2, 2}, + { 1, 5}, { 2, 3}, { 1, 7}, { 2, 4}, + { 3, 3}, { 2, 5}, { 1, 11}, { 2, 6}, + { 1, 13}, { 2, 7}, { 3, 5}, { 4, 4}, + { 1, 17}, { 3, 6}, { 1, 19}, { 4, 5}, + { 3, 7}, { 2, 11}, { 1, 23}, { 4, 6}, + { 5, 5}, { 2, 13}, { 3, 9}, { 4, 7}, + { 1, 29}, { 5, 6}, { 1, 31}, { 4, 8}, + { 3, 11}, { 2, 17}, { 5, 7}, { 6, 6}, + { 1, 37}, { 2, 19}, { 3, 13}, { 5, 8}, + { 1, 41}, { 6, 7}, { 1, 43}, { 4, 11}, + { 5, 9}, { 2, 23}, { 1, 47}, { 6, 8}, + { 7, 7}, { 5, 10}, { 3, 17}, { 4, 13}, + { 1, 53}, { 6, 9}, { 5, 11}, { 7, 8}, + { 3, 19}, { 2, 29}, { 1, 59}, { 6, 10}, + { 1, 61}, { 2, 31}, { 7, 9}, { 8, 8}, +}; + +int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (*function)(), void *sa, void *sb, BLASLONG nthreads) { + + blas_queue_t queue[MAX_CPU_NUMBER]; + + BLASLONG range_M[MAX_CPU_NUMBER + 1], range_N[MAX_CPU_NUMBER + 1]; + BLASLONG procs, total_procs, num_cpu_m, num_cpu_n; + + BLASLONG width, i, j; + BLASLONG divM, divN; + + divM = divide_rule[nthreads][0]; + divN = divide_rule[nthreads][1]; + + if (!range_m) { + range_M[0] = 0; + i = arg -> m; + } else { + range_M[0] = range_M[0]; + i = range_M[1] - range_M[0]; + } + + num_cpu_m = 0; + + while (i > 0){ + + width = blas_quickdivide(i + divM - num_cpu_m - 1, divM - num_cpu_m); + + i -= width; + if (i < 0) width = width + i; + + range_M[num_cpu_m + 1] = range_M[num_cpu_m] + width; + + num_cpu_m ++; + } + + if (!range_n) { + range_N[0] = 0; + i = arg -> n; + } else { + range_N[0] = range_n[0]; + i = range_n[1] - range_n[0]; + } + + num_cpu_n = 0; + + while (i > 0){ + + width = blas_quickdivide(i + divN - num_cpu_n - 1, divN - num_cpu_n); + + i -= width; + if (i < 0) width = width + i; + + range_N[num_cpu_n + 1] = range_N[num_cpu_n] + width; + + num_cpu_n ++; + } + + procs = 0; + + for (j = 0; j < num_cpu_n; j++) { + for (i = 0; i < num_cpu_m; i++) { + + queue[procs].mode = mode; + queue[procs].routine = function; + queue[procs].args = arg; + queue[procs].range_m = &range_M[i]; + queue[procs].range_n = &range_N[j]; + queue[procs].sa = NULL; + queue[procs].sb = NULL; + queue[procs].next = &queue[procs + 1]; + + procs ++; + } + } + + if (procs) { + queue[0].sa = sa; + queue[0].sb = sb; + + queue[procs - 1].next = NULL; + + exec_blas(procs, queue); + } + + return 0; +} diff --git a/driver/level3/gemm_thread_n.c b/driver/level3/gemm_thread_n.c new file mode 100644 index 0000000000..ba54612ebf --- /dev/null +++ b/driver/level3/gemm_thread_n.c @@ -0,0 +1,91 @@ 
+/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (*function)(), void *sa, void *sb, BLASLONG nthreads) { + + blas_queue_t queue[MAX_CPU_NUMBER]; + BLASLONG range[MAX_CPU_NUMBER + 1]; + + BLASLONG width, i, num_cpu; + + if (!range_n) { + range[0] = 0; + i = arg -> n; + } else { + range[0] = range_n[0]; + i = range_n[1] - range_n[0]; + } + + num_cpu = 0; + + while (i > 0){ + + width = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu); + + i -= width; + if (i < 0) width = width + i; + + range[num_cpu + 1] = range[num_cpu] + width; + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = function; + queue[num_cpu].args = arg; + queue[num_cpu].range_m = range_m; + queue[num_cpu].range_n = &range[num_cpu]; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + num_cpu ++; + } + + if (num_cpu) { + queue[0].sa = sa; + queue[0].sb = sb; + + queue[num_cpu - 1].next = NULL; + + exec_blas(num_cpu, + queue); + } + + return 0; +} diff --git a/driver/level3/gemm_thread_variable.c b/driver/level3/gemm_thread_variable.c new file mode 100644 index 0000000000..9d83e950af --- /dev/null +++ b/driver/level3/gemm_thread_variable.c @@ -0,0 +1,127 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +int CNAME(int mode, + blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, + int (*function)(), void *sa, void *sb, BLASLONG divM, BLASLONG divN) { + + blas_queue_t queue[MAX_CPU_NUMBER]; + + BLASLONG range_M[MAX_CPU_NUMBER + 1], range_N[MAX_CPU_NUMBER + 1]; + BLASLONG procs, num_cpu_m, num_cpu_n; + + BLASLONG width, i, j; + + if (!range_m) { + range_M[0] = 0; + i = arg -> m; + } else { + range_M[0] = range_M[0]; + i = range_M[1] - range_M[0]; + } + + num_cpu_m = 0; + + while (i > 0){ + + width = blas_quickdivide(i + divM - num_cpu_m - 1, divM - num_cpu_m); + + i -= width; + if (i < 0) width = width + i; + + range_M[num_cpu_m + 1] = range_M[num_cpu_m] + width; + + num_cpu_m ++; + } + + if (!range_n) { + range_N[0] = 0; + i = arg -> n; + } else { + range_N[0] = range_n[0]; + i = range_n[1] - range_n[0]; + } + + num_cpu_n = 0; + + while (i > 0){ + + width = blas_quickdivide(i + divN - num_cpu_n - 1, divN - num_cpu_n); + + i -= width; + if (i < 0) width = width + i; + + range_N[num_cpu_n + 1] = range_N[num_cpu_n] + width; + + num_cpu_n ++; + } + + procs = 0; + + for (j = 0; j < num_cpu_n; j++) { + for (i = 0; i < num_cpu_m; i++) { + + queue[procs].mode = mode; + queue[procs].routine = function; + queue[procs].args = arg; + queue[procs].range_m = &range_M[i]; + queue[procs].range_n = &range_N[j]; + queue[procs].sa = NULL; + queue[procs].sb = NULL; + queue[procs].next = &queue[procs + 1]; + + procs ++; + } + } + + if (procs) { + queue[0].sa = sa; + queue[0].sb = sb; + + queue[procs - 1].next = NULL; + + exec_blas(procs, queue); + } + + return 0; +} + diff --git a/driver/level3/hemm3m_k.c b/driver/level3/hemm3m_k.c new file mode 100644 index 0000000000..2f3cf820dc --- /dev/null +++ 
b/driver/level3/hemm3m_k.c @@ -0,0 +1,99 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#undef TIMING + +#define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \ + GEMM_BETA((M_TO) - (M_FROM), (N_TO - N_FROM), 0, \ + BETA[0], BETA[1], NULL, 0, NULL, 0, \ + (FLOAT *)(C) + (M_FROM) + (N_FROM) * (LDC) * COMPSIZE, LDC) + +#ifndef RSIDE +#ifndef LOWER +#define ICOPYB_OPERATION(M, N, A, LDA, X, Y, BUFFER) HEMM3M_IUCOPYB(M, N, A, LDA, Y, X, BUFFER) +#define ICOPYR_OPERATION(M, N, A, LDA, X, Y, BUFFER) HEMM3M_IUCOPYR(M, N, A, LDA, Y, X, BUFFER) +#define ICOPYI_OPERATION(M, N, A, LDA, X, Y, BUFFER) HEMM3M_IUCOPYI(M, N, A, LDA, Y, X, BUFFER) +#else +#define ICOPYB_OPERATION(M, N, A, LDA, X, Y, BUFFER) HEMM3M_ILCOPYB(M, N, A, LDA, Y, X, BUFFER) +#define ICOPYR_OPERATION(M, N, A, LDA, X, Y, BUFFER) HEMM3M_ILCOPYR(M, N, A, LDA, Y, X, BUFFER) +#define ICOPYI_OPERATION(M, N, A, LDA, X, Y, BUFFER) HEMM3M_ILCOPYI(M, N, A, LDA, Y, X, BUFFER) +#endif +#endif + +#ifdef RSIDE +#ifndef LOWER +#define OCOPYB_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ + HEMM3M_OUCOPYB(M, N, A, LDA, Y, X, ALPHA_R, ALPHA_I, BUFFER) +#define OCOPYR_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ + HEMM3M_OUCOPYR(M, N, A, LDA, Y, X, ALPHA_R, ALPHA_I, BUFFER) +#define OCOPYI_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ + HEMM3M_OUCOPYI(M, N, A, LDA, Y, X, ALPHA_R, ALPHA_I, BUFFER) +#else +#define OCOPYB_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ + HEMM3M_OLCOPYB(M, N, A, LDA, Y, X, ALPHA_R, ALPHA_I, BUFFER) +#define OCOPYR_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ + HEMM3M_OLCOPYR(M, N, A, LDA, Y, X, ALPHA_R, ALPHA_I, BUFFER) +#define OCOPYI_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ + HEMM3M_OLCOPYI(M, N, A, LDA, Y, X, ALPHA_R, ALPHA_I, BUFFER) +#endif +#endif + +#ifndef RSIDE +#define K args -> m +#ifndef LOWER +#define GEMM3M_LOCAL HEMM3M_LU +#else +#define GEMM3M_LOCAL HEMM3M_LL +#endif +#else +#define K args -> n +#ifndef LOWER +#define GEMM3M_LOCAL HEMM3M_RU +#else +#define GEMM3M_LOCAL HEMM3M_RL +#endif +#endif + +#ifdef THREADED_LEVEL3 +#include "level3_gemm3m_thread.c" +#else +#include "gemm3m_level3.c" +#endif diff --git a/driver/level3/level3.c b/driver/level3/level3.c new file mode 100644 index 0000000000..62b310aba2 --- /dev/null +++ b/driver/level3/level3.c @@ -0,0 +1,401 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +/* This file is a template for level 3 operation */ + +#ifndef BETA_OPERATION +#if !defined(XDOUBLE) || !defined(QUAD_PRECISION) +#ifndef COMPLEX +#define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \ + GEMM_BETA((M_TO) - (M_FROM), (N_TO - N_FROM), 0, \ + BETA[0], NULL, 0, NULL, 0, \ + (FLOAT *)(C) + ((M_FROM) + (N_FROM) * (LDC)) * COMPSIZE, LDC) +#else +#define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \ + GEMM_BETA((M_TO) - (M_FROM), (N_TO - N_FROM), 0, \ + BETA[0], BETA[1], NULL, 0, NULL, 0, \ + (FLOAT *)(C) + ((M_FROM) + (N_FROM) * (LDC)) * COMPSIZE, LDC) +#endif +#else +#define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \ + GEMM_BETA((M_TO) - (M_FROM), (N_TO - N_FROM), 0, \ + BETA, NULL, 0, NULL, 0, \ + (FLOAT *)(C) + ((M_FROM) + (N_FROM) * (LDC)) * COMPSIZE, LDC) +#endif +#endif + +#ifndef ICOPY_OPERATION +#if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \ + defined(RN) || defined(RT) || defined(RC) || defined(RR) +#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); +#else +#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); +#endif +#endif + +#ifndef OCOPY_OPERATION +#if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \ + defined(NR) || defined(TR) || defined(CR) || defined(RR) +#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); +#else +#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); +#endif +#endif + +#ifndef KERNEL_FUNC +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define KERNEL_FUNC GEMM_KERNEL_N +#endif +#if defined(CN) || defined(CT) || defined(RN) || defined(RT) +#define KERNEL_FUNC GEMM_KERNEL_L +#endif +#if defined(NC) || defined(TC) || defined(NR) || defined(TR) +#define KERNEL_FUNC GEMM_KERNEL_R +#endif +#if defined(CC) || defined(CR) || defined(RC) || defined(RR) +#define KERNEL_FUNC GEMM_KERNEL_B +#endif +#endif + +#ifndef KERNEL_OPERATION +#if !defined(XDOUBLE) || !defined(QUAD_PRECISION) +#ifndef COMPLEX +#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ + KERNEL_FUNC(M, N, K, ALPHA[0], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC) +#else +#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ + KERNEL_FUNC(M, N, K, ALPHA[0], ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC) +#endif +#else +#define 
KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ + KERNEL_FUNC(M, N, K, ALPHA, SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC) +#endif +#endif + +#ifndef FUSED_KERNEL_OPERATION +#if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \ + defined(NR) || defined(TR) || defined(CR) || defined(RR) +#ifndef COMPLEX +#define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \ + FUSED_GEMM_KERNEL_N(M, N, K, ALPHA[0], SA, SB, \ + (FLOAT *)(B) + ((L) + (J) * LDB) * COMPSIZE, LDB, (FLOAT *)(C) + ((I) + (J) * LDC) * COMPSIZE, LDC) +#else +#define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \ + FUSED_GEMM_KERNEL_N(M, N, K, ALPHA[0], ALPHA[1], SA, SB, \ + (FLOAT *)(B) + ((L) + (J) * LDB) * COMPSIZE, LDB, (FLOAT *)(C) + ((I) + (J) * LDC) * COMPSIZE, LDC) + +#endif +#else +#ifndef COMPLEX +#define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \ + FUSED_GEMM_KERNEL_T(M, N, K, ALPHA[0], SA, SB, \ + (FLOAT *)(B) + ((J) + (L) * LDB) * COMPSIZE, LDB, (FLOAT *)(C) + ((I) + (J) * LDC) * COMPSIZE, LDC) +#else +#define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \ + FUSED_GEMM_KERNEL_T(M, N, K, ALPHA[0], ALPHA[1], SA, SB, \ + (FLOAT *)(B) + ((J) + (L) * LDB) * COMPSIZE, LDB, (FLOAT *)(C) + ((I) + (J) * LDC) * COMPSIZE, LDC) +#endif +#endif +#endif + +#ifndef A +#define A args -> a +#endif +#ifndef LDA +#define LDA args -> lda +#endif +#ifndef B +#define B args -> b +#endif +#ifndef LDB +#define LDB args -> ldb +#endif +#ifndef C +#define C args -> c +#endif +#ifndef LDC +#define LDC args -> ldc +#endif +#ifndef M +#define M args -> m +#endif +#ifndef N +#define N args -> n +#endif +#ifndef K +#define K args -> k +#endif + +#ifdef TIMING +#define START_RPCC() rpcc_counter = rpcc() +#define STOP_RPCC(COUNTER) COUNTER += rpcc() - rpcc_counter +#else +#define START_RPCC() +#define STOP_RPCC(COUNTER) +#endif + +int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, + XFLOAT *sa, XFLOAT *sb, BLASLONG dummy){ + BLASLONG k, lda, ldb, ldc; + FLOAT *alpha, *beta; + FLOAT *a, *b, *c; + BLASLONG m_from, m_to, n_from, n_to; + + BLASLONG ls, is, js; + BLASLONG min_l, min_i, min_j; +#if !defined(FUSED_GEMM) || defined(TIMING) + BLASLONG jjs, min_jj; +#endif + + BLASLONG l1stride, gemm_p, l2size; + +#if defined(XDOUBLE) && defined(QUAD_PRECISION) + xidouble xalpha; +#endif + +#ifdef TIMING + unsigned long long rpcc_counter; + unsigned long long innercost = 0; + unsigned long long outercost = 0; + unsigned long long kernelcost = 0; + double total; +#endif + + k = K; + + a = (FLOAT *)A; + b = (FLOAT *)B; + c = (FLOAT *)C; + + lda = LDA; + ldb = LDB; + ldc = LDC; + + alpha = (FLOAT *)args -> alpha; + beta = (FLOAT *)args -> beta; + + m_from = 0; + m_to = M; + + if (range_m) { + m_from = *(((BLASLONG *)range_m) + 0); + m_to = *(((BLASLONG *)range_m) + 1); + } + + n_from = 0; + n_to = N; + + if (range_n) { + n_from = *(((BLASLONG *)range_n) + 0); + n_to = *(((BLASLONG *)range_n) + 1); + } + + if (beta) { +#if !defined(XDOUBLE) || !defined(QUAD_PRECISION) +#ifndef COMPLEX + if (beta[0] != ONE +#else + if ((beta[0] != ONE) || (beta[1] != ZERO) +#endif +#else + if (((beta[0].x[1] != 0x3fff000000000000UL) || beta[0].x[0] != 0) +#ifdef COMPLEX + &&(((beta[1].x[0] | beta[1].x[1]) << 1) != 0) +#endif +#endif + ) { +#if defined(XDOUBLE) && defined(QUAD_PRECISION) + xidouble xbeta; + + qtox(&xbeta, beta); +#endif + BETA_OPERATION(m_from, m_to, n_from, n_to, beta, c, ldc); + } + } + + if ((k == 0) || 
(alpha == NULL)) return 0; + +#if !defined(XDOUBLE) || !defined(QUAD_PRECISION) + if ((alpha[0] == ZERO) +#ifdef COMPLEX + && (alpha[1] == ZERO) +#endif + ) return 0; +#else + if (((alpha[0].x[0] | alpha[0].x[1] +#ifdef COMPLEX + | alpha[1].x[0] | alpha[1].x[1] +#endif + ) << 1) == 0) return 0; +#endif + +#if defined(XDOUBLE) && defined(QUAD_PRECISION) + qtox(&xalpha, alpha); +#endif + + l2size = GEMM_P * GEMM_Q; + +#if 0 + fprintf(stderr, "GEMM(Single): M_from : %ld M_to : %ld N_from : %ld N_to : %ld k : %ld\n", m_from, m_to, n_from, n_to, k); + fprintf(stderr, "GEMM(Single):: P = %4ld Q = %4ld R = %4ld\n", (BLASLONG)GEMM_P, (BLASLONG)GEMM_Q, (BLASLONG)GEMM_R); + // fprintf(stderr, "GEMM: SA .. %p SB .. %p\n", sa, sb); + + // fprintf(stderr, "A = %p B = %p C = %p\n\tlda = %ld ldb = %ld ldc = %ld\n", a, b, c, lda, ldb, ldc); +#endif + +#ifdef DEBUG + innercost = 0; + outercost = 0; + kernelcost = 0; +#endif + + for(js = n_from; js < n_to; js += GEMM_R){ + min_j = n_to - js; + if (min_j > GEMM_R) min_j = GEMM_R; + + for(ls = 0; ls < k; ls += min_l){ + + min_l = k - ls; + + if (min_l >= GEMM_Q * 2) { + gemm_p = GEMM_P; + min_l = GEMM_Q; + } else { + if (min_l > GEMM_Q) { + min_l = (min_l / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1); + } + gemm_p = ((l2size / min_l + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1)); + while (gemm_p * min_l > l2size) gemm_p -= GEMM_UNROLL_M; + } + + /* First, we have to move data A to L2 cache */ + min_i = m_to - m_from; + l1stride = 1; + + if (min_i >= GEMM_P * 2) { + min_i = GEMM_P; + } else { + if (min_i > GEMM_P) { + min_i = (min_i / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1); + } else { + l1stride = 0; + } + } + + START_RPCC(); + + ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_from, sa); + + STOP_RPCC(innercost); + +#if defined(FUSED_GEMM) && !defined(TIMING) + + FUSED_KERNEL_OPERATION(min_i, min_j, min_l, alpha, + sa, sb, b, ldb, c, ldc, m_from, js, ls); + + +#else + for(jjs = js; jjs < js + min_j; jjs += min_jj){ + min_jj = min_j + js - jjs; + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + + START_RPCC(); + + OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, + sb + min_l * (jjs - js) * COMPSIZE * l1stride); + + STOP_RPCC(outercost); + + START_RPCC(); + +#if !defined(XDOUBLE) || !defined(QUAD_PRECISION) + KERNEL_OPERATION(min_i, min_jj, min_l, alpha, + sa, sb + min_l * (jjs - js) * COMPSIZE * l1stride, c, ldc, m_from, jjs); +#else + KERNEL_OPERATION(min_i, min_jj, min_l, (void *)&xalpha, + sa, sb + min_l * (jjs - js) * COMPSIZE * l1stride, c, ldc, m_from, jjs); +#endif + + STOP_RPCC(kernelcost); + } +#endif + + for(is = m_from + min_i; is < m_to; is += min_i){ + min_i = m_to - is; + + if (min_i >= GEMM_P * 2) { + min_i = GEMM_P; + } else + if (min_i > GEMM_P) { + min_i = (min_i / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1); + } + + START_RPCC(); + + ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa); + + STOP_RPCC(innercost); + + START_RPCC(); + +#if !defined(XDOUBLE) || !defined(QUAD_PRECISION) + KERNEL_OPERATION(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js); +#else + KERNEL_OPERATION(min_i, min_j, min_l, (void *)&xalpha, sa, sb, c, ldc, is, js); +#endif + + STOP_RPCC(kernelcost); + + } /* end of is */ + } /* end of js */ + } /* end of ls */ + + +#ifdef TIMING + total = (double)outercost + (double)innercost + (double)kernelcost; + + printf( "Copy A : %5.2f Copy B: %5.2f Kernel : %5.2f kernel Effi. : %5.2f Total Effi. 
: %5.2f\n", + innercost / total * 100., outercost / total * 100., + kernelcost / total * 100., + (double)(m_to - m_from) * (double)(n_to - n_from) * (double)k / (double)kernelcost * 100. * (double)COMPSIZE / 2., + (double)(m_to - m_from) * (double)(n_to - n_from) * (double)k / total * 100. * (double)COMPSIZE / 2.); + +#endif + + return 0; +} diff --git a/driver/level3/level3_gemm3m_thread.c b/driver/level3/level3_gemm3m_thread.c new file mode 100644 index 0000000000..bddb5eb878 --- /dev/null +++ b/driver/level3/level3_gemm3m_thread.c @@ -0,0 +1,1015 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#ifndef CACHE_LINE_SIZE +#define CACHE_LINE_SIZE 8 +#endif + +#ifndef DIVIDE_RATE +#define DIVIDE_RATE 2 +#endif + +#ifndef SWITCH_RATIO +#define SWITCH_RATIO 2 +#endif + +#ifndef GEMM3M_LOCAL +#if defined(NN) +#define GEMM3M_LOCAL GEMM3M_NN +#elif defined(NT) +#define GEMM3M_LOCAL GEMM3M_NT +#elif defined(NR) +#define GEMM3M_LOCAL GEMM3M_NR +#elif defined(NC) +#define GEMM3M_LOCAL GEMM3M_NC +#elif defined(TN) +#define GEMM3M_LOCAL GEMM3M_TN +#elif defined(TT) +#define GEMM3M_LOCAL GEMM3M_TT +#elif defined(TR) +#define GEMM3M_LOCAL GEMM3M_TR +#elif defined(TC) +#define GEMM3M_LOCAL GEMM3M_TC +#elif defined(RN) +#define GEMM3M_LOCAL GEMM3M_RN +#elif defined(RT) +#define GEMM3M_LOCAL GEMM3M_RT +#elif defined(RR) +#define GEMM3M_LOCAL GEMM3M_RR +#elif defined(RC) +#define GEMM3M_LOCAL GEMM3M_RC +#elif defined(CN) +#define GEMM3M_LOCAL GEMM3M_CN +#elif defined(CT) +#define GEMM3M_LOCAL GEMM3M_CT +#elif defined(CR) +#define GEMM3M_LOCAL GEMM3M_CR +#elif defined(CC) +#define GEMM3M_LOCAL GEMM3M_CC +#endif +#endif + +typedef struct { + volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; +} job_t; + + +#ifndef BETA_OPERATION +#define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \ + GEMM_BETA((M_TO) - (M_FROM), (N_TO - N_FROM), 0, \ + BETA[0], BETA[1], NULL, 0, NULL, 0, \ + (FLOAT *)(C) + (M_FROM) + (N_FROM) * (LDC) * COMPSIZE, LDC) +#endif + +#ifndef ICOPYB_OPERATION +#if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \ + defined(RN) || defined(RT) || defined(RC) || defined(RR) +#define ICOPYB_OPERATION(M, N, A, LDA, X, Y, BUFFER) \ + GEMM3M_ITCOPYB(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); +#else +#define ICOPYB_OPERATION(M, N, A, LDA, X, Y, BUFFER) \ + GEMM3M_INCOPYB(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); +#endif +#endif + +#ifndef ICOPYR_OPERATION +#if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \ + defined(RN) || defined(RT) || defined(RC) || defined(RR) +#define ICOPYR_OPERATION(M, N, A, LDA, X, Y, BUFFER) \ + GEMM3M_ITCOPYR(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); +#else +#define ICOPYR_OPERATION(M, N, A, LDA, X, Y, BUFFER) \ + GEMM3M_INCOPYR(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); +#endif +#endif + +#ifndef ICOPYI_OPERATION +#if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \ + defined(RN) || defined(RT) || defined(RC) || defined(RR) +#define ICOPYI_OPERATION(M, N, A, LDA, X, Y, BUFFER) \ + GEMM3M_ITCOPYI(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); +#else +#define ICOPYI_OPERATION(M, N, A, LDA, X, Y, BUFFER) \ + GEMM3M_INCOPYI(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); +#endif +#endif + + +#ifndef OCOPYB_OPERATION +#if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \ + defined(NR) || defined(TR) || defined(CR) || defined(RR) +#define OCOPYB_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ + GEMM3M_ONCOPYB(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER); +#else +#define OCOPYB_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ + GEMM3M_OTCOPYB(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER); +#endif +#endif + +#ifndef OCOPYR_OPERATION +#if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \ + defined(NR) || defined(TR) || defined(CR) || defined(RR) +#define OCOPYR_OPERATION(M, N, 
A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ + GEMM3M_ONCOPYR(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER); +#else +#define OCOPYR_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ + GEMM3M_OTCOPYR(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER); +#endif +#endif + + +#ifndef OCOPYI_OPERATION +#if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \ + defined(NR) || defined(TR) || defined(CR) || defined(RR) +#define OCOPYI_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ + GEMM3M_ONCOPYI(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER); +#else +#define OCOPYI_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ + GEMM3M_OTCOPYI(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, ALPHA_R, ALPHA_I, BUFFER); +#endif +#endif + +#ifndef KERNEL_FUNC +#define KERNEL_FUNC GEMM3M_KERNEL +#endif + +#ifndef KERNEL_OPERATION +#define KERNEL_OPERATION(M, N, K, ALPHA_R, ALPHA_I, SA, SB, C, LDC, X, Y) \ + KERNEL_FUNC(M, N, K, ALPHA_R, ALPHA_I, SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC) +#endif + +#ifndef A +#define A args -> a +#endif +#ifndef LDA +#define LDA args -> lda +#endif +#ifndef B +#define B args -> b +#endif +#ifndef LDB +#define LDB args -> ldb +#endif +#ifndef C +#define C args -> c +#endif +#ifndef LDC +#define LDC args -> ldc +#endif +#ifndef M +#define M args -> m +#endif +#ifndef N +#define N args -> n +#endif +#ifndef K +#define K args -> k +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define ALPHA1 ONE +#define ALPHA2 ONE +#define ALPHA5 ZERO +#define ALPHA6 ONE + +#define ALPHA7 ONE +#define ALPHA8 ZERO +#define ALPHA11 ONE +#define ALPHA12 -ONE + +#define ALPHA13 ZERO +#define ALPHA14 ONE +#define ALPHA17 -ONE +#define ALPHA18 -ONE +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define ALPHA1 ONE +#define ALPHA2 ONE +#define ALPHA5 ONE +#define ALPHA6 ZERO + +#define ALPHA7 ZERO +#define ALPHA8 ONE +#define ALPHA11 -ONE +#define ALPHA12 -ONE + +#define ALPHA13 ONE +#define ALPHA14 ZERO +#define ALPHA17 -ONE +#define ALPHA18 ONE +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define ALPHA1 ONE +#define ALPHA2 ONE +#define ALPHA5 ONE +#define ALPHA6 ZERO + +#define ALPHA7 ZERO +#define ALPHA8 ONE +#define ALPHA11 -ONE +#define ALPHA12 ONE + +#define ALPHA13 ONE +#define ALPHA14 ZERO +#define ALPHA17 -ONE +#define ALPHA18 -ONE +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define ALPHA1 ONE +#define ALPHA2 ONE +#define ALPHA5 ZERO +#define ALPHA6 -ONE + +#define ALPHA7 ONE +#define ALPHA8 ZERO +#define ALPHA11 ONE +#define ALPHA12 ONE + +#define ALPHA13 ZERO +#define ALPHA14 ONE +#define ALPHA17 -ONE +#define ALPHA18 ONE +#endif + +#ifdef TIMING +#define START_RPCC() rpcc_counter = rpcc() +#define STOP_RPCC(COUNTER) COUNTER += rpcc() - rpcc_counter +#else +#define START_RPCC() +#define STOP_RPCC(COUNTER) +#endif + +static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ + + BLASLONG k, lda, ldb, ldc; + BLASLONG m_from, m_to, n_from, n_to, N_from, N_to; + + FLOAT *alpha, *beta; + FLOAT *a, *b, *c; + job_t *job = (job_t *)args -> common; + BLASLONG xxx, bufferside; + FLOAT *buffer[DIVIDE_RATE]; + + BLASLONG ls, min_l, jjs, min_jj; + BLASLONG is, min_i, div_n; + BLASLONG i, current; + +#ifdef TIMING + BLASLONG rpcc_counter; + BLASLONG copy_A = 0; + BLASLONG copy_B = 
0; + BLASLONG kernel = 0; + BLASLONG waiting1 = 0; + BLASLONG waiting2 = 0; + BLASLONG waiting3 = 0; + BLASLONG waiting6[MAX_CPU_NUMBER]; + BLASLONG ops = 0; + + for (i = 0; i < args -> nthreads; i++) waiting6[i] = 0; +#endif + + k = K; + + a = (FLOAT *)A; + b = (FLOAT *)B; + c = (FLOAT *)C; + + lda = LDA; + ldb = LDB; + ldc = LDC; + + alpha = (FLOAT *)args -> alpha; + beta = (FLOAT *)args -> beta; + + m_from = 0; + m_to = M; + + if (range_m) { + m_from = range_m[0]; + m_to = range_m[1]; + } + + n_from = 0; + n_to = N; + + N_from = 0; + N_to = N; + + if (range_n) { + n_from = range_n[mypos + 0]; + n_to = range_n[mypos + 1]; + + N_from = range_n[0]; + N_to = range_n[args -> nthreads]; + } + + if (beta) { + if ((beta[0] != ONE) || (beta[1] != ZERO)) + BETA_OPERATION(m_from, m_to, N_from, N_to, beta, c, ldc); + } + + if ((k == 0) || (alpha == NULL)) return 0; + + if ((alpha[0] == ZERO) && (alpha[1] == ZERO)) return 0; + +#if 0 + fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld n_from : %ld n_to : %ld N_from : %ld N_to : %ld\n", + mypos, m_from, m_to, n_from, n_to, N_from, N_to); +#endif + + div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE; + + buffer[0] = sb; + for (i = 1; i < DIVIDE_RATE; i++) { + buffer[i] = buffer[i - 1] + GEMM3M_Q * ((div_n + GEMM3M_UNROLL_N - 1) & ~(GEMM3M_UNROLL_N - 1)); + } + + for(ls = 0; ls < k; ls += min_l){ + min_l = k - ls; + if (min_l >= GEMM3M_Q * 2) { + min_l = GEMM3M_Q; + } else { + if (min_l > GEMM3M_Q) { + min_l = (min_l + 1) / 2; + } + } + + min_i = m_to - m_from; + + if (min_i >= GEMM3M_P * 2) { + min_i = GEMM3M_P; + } else { + if (min_i > GEMM3M_P) { + min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1); + } + } + + + START_RPCC(); + + ICOPYB_OPERATION(min_l, min_i, a, lda, ls, m_from, sa); + + STOP_RPCC(copy_A); + + div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE; + + for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) { + + START_RPCC(); + + /* Make sure if no one is using another buffer */ + for (i = 0; i < args -> nthreads; i++) + while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;}; + + STOP_RPCC(waiting1); + + for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){ + min_jj = MIN(n_to, xxx + div_n) - jjs; + if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N; + + START_RPCC(); + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || defined(RN) || defined(RT) || defined(CN) || defined(CT) + OCOPYB_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx)); +#else + OCOPYB_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx)); +#endif + + STOP_RPCC(copy_B); + + START_RPCC(); + + KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA5, ALPHA6, + sa, buffer[bufferside] + min_l * (jjs - xxx), + c, ldc, m_from, jjs); + + STOP_RPCC(kernel); +#ifdef TIMING + ops += 2 * min_i * min_jj * min_l; +#endif + + } + + for (i = 0; i < args -> nthreads; i++) + job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; + } + + current = mypos; + + do { + current ++; + if (current >= args -> nthreads) current = 0; + + div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE; + + for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { + + if (current != mypos) { + + START_RPCC(); + + /* thread has to wait */ + while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; + + 
STOP_RPCC(waiting2); + + START_RPCC(); + + + KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA5, ALPHA6, + sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], + c, ldc, m_from, xxx); + + STOP_RPCC(kernel); +#ifdef TIMING + ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l; +#endif + } + + if (m_to - m_from == min_i) { + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; + } + } + } while (current != mypos); + + for(is = m_from + min_i; is < m_to; is += min_i){ + min_i = m_to - is; + if (min_i >= GEMM3M_P * 2) { + min_i = GEMM3M_P; + } else + if (min_i > GEMM3M_P) { + min_i = ((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1); + } + + START_RPCC(); + + ICOPYB_OPERATION(min_l, min_i, a, lda, ls, is, sa); + + STOP_RPCC(copy_A); + + current = mypos; + do { + + div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE; + + for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { + + START_RPCC(); + + + KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA5, ALPHA6, + sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], + c, ldc, is, xxx); + + STOP_RPCC(kernel); +#ifdef TIMING + ops += 2 * min_i * (range_n[current + 1] - range_n[current] - div_n) * min_l; +#endif + if (is + min_i >= m_to) { + /* Thread doesn't need this buffer any more */ + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; + } + } + + current ++; + if (current >= args -> nthreads) current = 0; + + } while (current != mypos); + + } /* end of is */ + + START_RPCC(); + + ICOPYR_OPERATION(min_l, min_i, a, lda, ls, m_from, sa); + + STOP_RPCC(copy_A); + + div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE; + + for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) { + + START_RPCC(); + + /* Make sure if no one is using another buffer */ + for (i = 0; i < args -> nthreads; i++) + while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;}; + + STOP_RPCC(waiting1); + + for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){ + min_jj = MIN(n_to, xxx + div_n) - jjs; + if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N; + + START_RPCC(); + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx)); +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) + OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx)); +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx)); +#else + OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx)); +#endif + + STOP_RPCC(copy_B); + + START_RPCC(); + + KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA11, ALPHA12, + sa, buffer[bufferside] + min_l * (jjs - xxx), + c, ldc, m_from, jjs); + + STOP_RPCC(kernel); +#ifdef TIMING + ops += 2 * min_i * min_jj * min_l; +#endif + + } + + for (i = 0; i < args -> nthreads; i++) + job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; + } + + current = mypos; + + do { + current ++; + if (current >= args -> nthreads) current = 0; + + div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE; + + for (xxx = 
range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { + + if (current != mypos) { + + START_RPCC(); + + /* thread has to wait */ + while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; + + STOP_RPCC(waiting2); + + START_RPCC(); + + KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA11, ALPHA12, + sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], + c, ldc, m_from, xxx); + + STOP_RPCC(kernel); +#ifdef TIMING + ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l; +#endif + } + + if (m_to - m_from == min_i) { + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; + } + } + } while (current != mypos); + + for(is = m_from + min_i; is < m_to; is += min_i){ + min_i = m_to - is; + if (min_i >= GEMM3M_P * 2) { + min_i = GEMM3M_P; + } else + if (min_i > GEMM3M_P) { + min_i = ((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1); + } + + START_RPCC(); + + ICOPYR_OPERATION(min_l, min_i, a, lda, ls, is, sa); + + STOP_RPCC(copy_A); + + current = mypos; + do { + + div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE; + + for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { + + START_RPCC(); + + KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA11, ALPHA12, + sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], + c, ldc, is, xxx); + + STOP_RPCC(kernel); +#ifdef TIMING + ops += 2 * min_i * (range_n[current + 1] - range_n[current] - div_n) * min_l; +#endif + if (is + min_i >= m_to) { + /* Thread doesn't need this buffer any more */ + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; + } + } + + current ++; + if (current >= args -> nthreads) current = 0; + + } while (current != mypos); + + } /* end of is */ + + + START_RPCC(); + + ICOPYI_OPERATION(min_l, min_i, a, lda, ls, m_from, sa); + + STOP_RPCC(copy_A); + + div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE; + + for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) { + + START_RPCC(); + + /* Make sure if no one is using another buffer */ + for (i = 0; i < args -> nthreads; i++) + while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;}; + + STOP_RPCC(waiting1); + + for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){ + min_jj = MIN(n_to, xxx + div_n) - jjs; + if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N; + + START_RPCC(); + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx)); +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) + OCOPYI_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx)); +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx)); +#else + OCOPYR_OPERATION(min_l, min_jj, b, ldb, alpha[0], -alpha[1], ls, jjs, buffer[bufferside] + min_l * (jjs - xxx)); +#endif + + STOP_RPCC(copy_B); + + START_RPCC(); + + KERNEL_OPERATION(min_i, min_jj, min_l, ALPHA17, ALPHA18, + sa, buffer[bufferside] + min_l * (jjs - xxx), + c, ldc, m_from, jjs); + + STOP_RPCC(kernel); +#ifdef TIMING + ops += 2 * min_i * min_jj * min_l; +#endif + + } + + for (i = 0; i < args -> nthreads; i++) + job[mypos].working[i][CACHE_LINE_SIZE 
* bufferside] = (BLASLONG)buffer[bufferside]; + } + + current = mypos; + + do { + current ++; + if (current >= args -> nthreads) current = 0; + + div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE; + + for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { + + if (current != mypos) { + + START_RPCC(); + + /* thread has to wait */ + while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; + + STOP_RPCC(waiting2); + + START_RPCC(); + + KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA17, ALPHA18, + sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], + c, ldc, m_from, xxx); + + STOP_RPCC(kernel); +#ifdef TIMING + ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l; +#endif + } + + if (m_to - m_from == min_i) { + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; + } + } + } while (current != mypos); + + for(is = m_from + min_i; is < m_to; is += min_i){ + min_i = m_to - is; + if (min_i >= GEMM3M_P * 2) { + min_i = GEMM3M_P; + } else + if (min_i > GEMM3M_P) { + min_i = ((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1); + } + + START_RPCC(); + + ICOPYI_OPERATION(min_l, min_i, a, lda, ls, is, sa); + + STOP_RPCC(copy_A); + + current = mypos; + do { + + div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE; + + for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { + + START_RPCC(); + + KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, ALPHA17, ALPHA18, + sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], + c, ldc, is, xxx); + + STOP_RPCC(kernel); +#ifdef TIMING + ops += 2 * min_i * (range_n[current + 1] - range_n[current] - div_n) * min_l; +#endif + if (is + min_i >= m_to) { + /* Thread doesn't need this buffer any more */ + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; + } + } + + current ++; + if (current >= args -> nthreads) current = 0; + + } while (current != mypos); + + } /* end of is */ + + } + + START_RPCC(); + + for (i = 0; i < args -> nthreads; i++) { + for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { + while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;}; + } + } + + STOP_RPCC(waiting3); + +#ifdef TIMING + BLASLONG waiting = waiting1 + waiting2 + waiting3; + BLASLONG total = copy_A + copy_B + kernel + waiting; + + fprintf(stderr, "GEMM [%2ld] Copy_A : %6.2f Copy_B : %6.2f Wait : %6.2f Kernel : %6.2f\n", + mypos, (double)copy_A /(double)total * 100., (double)copy_B /(double)total * 100., + (double)waiting /(double)total * 100., + (double)ops/(double)kernel / 2. 
* 100.); + + fprintf(stderr, "GEMM [%2ld] Copy_A : %6.2ld Copy_B : %6.2ld Wait : %6.2ld\n", + mypos, copy_A, copy_B, waiting); + +#if 0 + fprintf(stderr, "Waiting[%2ld] %6.2f %6.2f %6.2f\n", + mypos, + (double)waiting1/(double)waiting * 100., + (double)waiting2/(double)waiting * 100., + (double)waiting3/(double)waiting * 100.); +#endif + fprintf(stderr, "\n"); +#endif + + + + return 0; +} + +static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG + *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ + + blas_arg_t newarg; + + blas_queue_t queue[MAX_CPU_NUMBER]; + + BLASLONG range_M[MAX_CPU_NUMBER + 1]; + BLASLONG range_N[MAX_CPU_NUMBER + 1]; + + job_t job[MAX_CPU_NUMBER]; + + BLASLONG num_cpu_m, num_cpu_n; + + BLASLONG nthreads = args -> nthreads; + + BLASLONG width, i, j, k, js; + BLASLONG m, n, n_from, n_to; + int mode; + +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_REAL | BLAS_NODE; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_REAL | BLAS_NODE; +#else + mode = BLAS_SINGLE | BLAS_REAL | BLAS_NODE; +#endif + + newarg.m = args -> m; + newarg.n = args -> n; + newarg.k = args -> k; + newarg.a = args -> a; + newarg.b = args -> b; + newarg.c = args -> c; + newarg.lda = args -> lda; + newarg.ldb = args -> ldb; + newarg.ldc = args -> ldc; + newarg.alpha = args -> alpha; + newarg.beta = args -> beta; + newarg.nthreads = args -> nthreads; + newarg.common = (void *)job; + + if (!range_m) { + range_M[0] = 0; + m = args -> m; + } else { + range_M[0] = range_m[0]; + m = range_m[1] - range_m[0]; + } + + num_cpu_m = 0; + + while (m > 0){ + + width = blas_quickdivide(m + nthreads - num_cpu_m - 1, nthreads - num_cpu_m); + + m -= width; + if (m < 0) width = width + m; + + range_M[num_cpu_m + 1] = range_M[num_cpu_m] + width; + + num_cpu_m ++; + } + + for (i = 0; i < num_cpu_m; i++) { + queue[i].mode = mode; + queue[i].routine = inner_thread; + queue[i].args = &newarg; + queue[i].range_m = &range_M[i]; + queue[i].range_n = &range_N[0]; + queue[i].sa = NULL; + queue[i].sb = NULL; + queue[i].next = &queue[i + 1]; + } + + queue[0].sa = sa; + queue[0].sb = sb; + + if (!range_n) { + n_from = 0; + n_to = args -> n; + } else { + n_from = range_n[0]; + n_to = range_n[1]; + } + + for(js = n_from; js < n_to; js += GEMM_R * nthreads){ + n = n_to - js; + if (n > GEMM_R * nthreads) n = GEMM_R * nthreads; + + range_N[0] = js; + + num_cpu_n = 0; + + while (n > 0){ + + width = blas_quickdivide(n + nthreads - num_cpu_n - 1, nthreads - num_cpu_n); + + n -= width; + if (n < 0) width = width + n; + + range_N[num_cpu_n + 1] = range_N[num_cpu_n] + width; + + num_cpu_n ++; + } + + for (j = 0; j < num_cpu_m; j++) { + for (i = 0; i < num_cpu_m; i++) { + for (k = 0; k < DIVIDE_RATE; k++) { + job[j].working[i][CACHE_LINE_SIZE * k] = 0; + } + } + } + + queue[num_cpu_m - 1].next = NULL; + + exec_blas(num_cpu_m, queue); + } + + return 0; +} + +int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ + + BLASLONG m = args -> m; + BLASLONG n = args -> n; + BLASLONG nthreads = args -> nthreads; + BLASLONG divN, divT; + int mode; + + if (range_m) { + BLASLONG m_from = *(((BLASLONG *)range_m) + 0); + BLASLONG m_to = *(((BLASLONG *)range_m) + 1); + + m = m_to - m_from; + } + + if (range_n) { + BLASLONG n_from = *(((BLASLONG *)range_n) + 0); + BLASLONG n_to = *(((BLASLONG *)range_n) + 1); + + n = n_to - n_from; + } + + if ((args -> m < nthreads * SWITCH_RATIO) || (args -> n < nthreads * SWITCH_RATIO)) { + GEMM3M_LOCAL(args, range_m, range_n, sa, sb, 0); + return 0; + } + + divT = 
nthreads; + divN = 1; + + while ((GEMM3M_P * divT > m * SWITCH_RATIO) && (divT > 1)) { + do { + divT --; + divN = 1; + while (divT * divN < nthreads) divN ++; + } while ((divT * divN != nthreads) && (divT > 1)); + } + + args -> nthreads = divT; + + if (divN == 1){ + gemm_driver(args, range_m, range_n, sa, sb, 0); + } else { +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif + +#if defined(TN) || defined(TT) || defined(TR) || defined(TC) || \ + defined(CN) || defined(CT) || defined(CR) || defined(CC) + mode |= (BLAS_TRANSA_T); +#endif +#if defined(NT) || defined(TT) || defined(RT) || defined(CT) || \ + defined(NC) || defined(TC) || defined(RC) || defined(CC) + mode |= (BLAS_TRANSB_T); +#endif + + gemm_thread_n(mode, args, range_m, range_n, gemm_driver, sa, sb, divN); + } + + return 0; +} diff --git a/driver/level3/level3_syr2k.c b/driver/level3/level3_syr2k.c new file mode 100644 index 0000000000..2db18578b9 --- /dev/null +++ b/driver/level3/level3_syr2k.c @@ -0,0 +1,418 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#ifndef KERNEL_OPERATION +#ifndef COMPLEX +#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y, FLAG) \ + KERNEL_FUNC(M, N, K, ALPHA[0], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y), FLAG) +#else +#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y, FLAG) \ + KERNEL_FUNC(M, N, K, ALPHA[0], ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y), FLAG) +#endif +#endif + +#ifndef KERNEL_OPERATION_C +#define KERNEL_OPERATION_C KERNEL_OPERATION +#endif + +#ifndef ICOPY_OPERATION +#ifndef TRANS +#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); +#else +#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); +#endif +#endif + +#ifndef OCOPY_OPERATION +#ifdef TRANS +#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); +#else +#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); +#endif +#endif + +#ifndef M +#define M args -> n +#endif + +#ifndef N +#define N args -> n +#endif + +#ifndef K +#define K args -> k +#endif + +#ifndef A +#define A args -> a +#endif + +#ifndef B +#define B args -> b +#endif + +#ifndef C +#define C args -> c +#endif + +#ifndef LDA +#define LDA args -> lda +#endif + +#ifndef LDB +#define LDB args -> ldb +#endif + +#ifndef LDC +#define LDC args -> ldc +#endif + +int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG dummy) { + + BLASLONG m_from, m_to, n_from, n_to, k, lda, ldb, ldc; + FLOAT *a, *b, *c, *alpha, *beta; + + BLASLONG ls, is, js; + BLASLONG min_l, min_i, min_j; + BLASLONG jjs, min_jj; + BLASLONG m_start, m_end; + + FLOAT *aa; + + k = K; + + a = (FLOAT *)A; + b = (FLOAT *)B; + c = (FLOAT *)C; + + lda = LDA; + ldb = LDB; + ldc = LDC; + + alpha = (FLOAT *)args -> alpha; + beta = (FLOAT *)args -> beta; + + m_from = 0; + m_to = M; + + if (range_m) { + m_from = *(((BLASLONG *)range_m) + 0); + m_to = *(((BLASLONG *)range_m) + 1); + } + + n_from = 0; + n_to = N; + + if (range_n) { + n_from = *(((BLASLONG *)range_n) + 0); + n_to = *(((BLASLONG *)range_n) + 1); + } + + if (beta) { +#if !defined(COMPLEX) || defined(HER2K) + if (beta[0] != ONE) +#else + if ((beta[0] != ONE) || (beta[1] != ZERO)) +#endif + syrk_beta(m_from, m_to, n_from, n_to, beta, c, ldc); + } + + if ((k == 0) || (alpha == NULL)) return 0; + + if ((alpha[0] == ZERO) +#ifdef COMPLEX + && (alpha[1] == ZERO) +#endif + ) return 0; + + for(js = n_from; js < n_to; js += GEMM_R){ + min_j = n_to - js; + if (min_j > GEMM_R) min_j = GEMM_R; + +#ifndef LOWER + m_start = m_from; + m_end = js + min_j; + if (m_end > m_to) m_end = m_to; +#else + m_start = m_from; + m_end = m_to; + if (m_start < js) m_start = js; +#endif + + for(ls = 0; ls < k; ls += min_l){ + min_l = k - ls; + if (min_l >= GEMM_Q * 2) { + min_l = GEMM_Q; + } else + if (min_l > GEMM_Q) { + min_l = (min_l + 1) / 2; + } + + min_i = m_end - m_start; + + if (min_i >= GEMM_P * 2) { + min_i = GEMM_P; + } else + if (min_i > GEMM_P) { + min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); + } + +#ifndef LOWER + + if (m_start >= js) { + + ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, sa); + + aa = sb + min_l * (m_start - js) * COMPSIZE; + + 
OCOPY_OPERATION(min_l, min_i, b, ldb, ls, m_start, aa); + + KERNEL_OPERATION(min_i, min_i, min_l, alpha, sa, aa, c, ldc, m_start, m_start, 1); + + jjs = m_start + min_i; + + } else { + + ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, sa); + + jjs = js; + } + + for(; jjs < js + min_j; jjs += GEMM_UNROLL_MN){ + min_jj = min_j + js - jjs; + if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN; + + OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE); + + KERNEL_OPERATION(min_i, min_jj, min_l, alpha, + sa, sb + min_l * (jjs - js) * COMPSIZE, + c, ldc, m_start, jjs, 1); + } + + for(is = m_start + min_i; is < m_end; is += min_i){ + min_i = m_end - is; + if (min_i >= GEMM_P * 2) { + min_i = GEMM_P; + } else + if (min_i > GEMM_P) { + min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); + } + + ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa); + + KERNEL_OPERATION(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js, 1); + + } + + min_i = m_end - m_start; + + if (min_i >= GEMM_P * 2) { + min_i = GEMM_P; + } else + if (min_i > GEMM_P) { + min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); + } + + if (m_start >= js) { + + ICOPY_OPERATION(min_l, min_i, b, ldb, ls, m_start, sa); + + aa = sb + min_l * (m_start - js) * COMPSIZE; + + OCOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, aa); + + KERNEL_OPERATION_C(min_i, min_i, min_l, alpha, sa, aa, c, ldc, m_start, m_start, 0); + + jjs = m_start + min_i; + + } else { + + ICOPY_OPERATION(min_l, min_i, b, ldb, ls, m_start, sa); + + jjs = js; + } + + for(; jjs < js + min_j; jjs += GEMM_UNROLL_MN){ + min_jj = min_j + js - jjs; + if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN; + + OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE); + + KERNEL_OPERATION_C(min_i, min_jj, min_l, alpha, + sa, sb + min_l * (jjs - js) * COMPSIZE, + c, ldc, m_start, jjs, 0); + } + + for(is = m_start + min_i; is < m_end; is += min_i){ + min_i = m_end - is; + if (min_i >= GEMM_P * 2) { + min_i = GEMM_P; + } else + if (min_i > GEMM_P) { + min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); + } + + ICOPY_OPERATION(min_l, min_i, b, ldb, ls, is, sa); + + KERNEL_OPERATION_C(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js, 0); + + } + +#else + + aa = sb + min_l * (m_start - js) * COMPSIZE; + + ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, sa); + + OCOPY_OPERATION(min_l, min_i, b, ldb, ls, m_start, aa); + + KERNEL_OPERATION(min_i, MIN(min_i, min_j + js - m_start), min_l, alpha, + sa, aa, c, ldc, m_start, m_start, 1); + + for(jjs = js; jjs < m_start; jjs += GEMM_UNROLL_MN){ + min_jj = m_start - jjs; + if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN; + + OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE); + + KERNEL_OPERATION(min_i, min_jj, min_l, alpha, + sa, sb + min_l * (jjs - js) * COMPSIZE, c, ldc, m_start, jjs, 1); + } + + for(is = m_start + min_i; is < m_end; is += min_i){ + + min_i = m_end - is; + + if (min_i >= GEMM_P * 2) { + min_i = GEMM_P; + } else + if (min_i > GEMM_P) { + min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); + } + + aa = sb + min_l * (is - js) * COMPSIZE; + + if (is < js + min_j) { + + ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa); + + OCOPY_OPERATION(min_l, min_i, b, ldb, ls, is, aa); + + KERNEL_OPERATION(min_i, MIN(min_i, min_j - is + js), min_l, alpha, sa, aa, c, ldc, is, is, 1); + + KERNEL_OPERATION(min_i, is - js, min_l, alpha, sa, sb, c, ldc, is, js, 1); + + } else { + + 
ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa); + + KERNEL_OPERATION(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js, 1); + + } + + } + + min_i = m_end - m_start; + + if (min_i >= GEMM_P * 2) { + min_i = GEMM_P; + } else + if (min_i > GEMM_P) { + min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); + } + + aa = sb + min_l * (m_start - js) * COMPSIZE; + + ICOPY_OPERATION(min_l, min_i, b, ldb, ls, m_start, sa); + + OCOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, aa); + + KERNEL_OPERATION_C(min_i, MIN(min_i, min_j + js - m_start), min_l, alpha, + sa, aa, c, ldc, m_start, m_start, 0); + + for(jjs = js; jjs < m_start; jjs += GEMM_UNROLL_MN){ + min_jj = m_start - jjs; + if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN; + + OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE); + + KERNEL_OPERATION_C(min_i, min_jj, min_l, alpha, + sa, sb + min_l * (jjs - js) * COMPSIZE, c, ldc, m_start, jjs, 0); + } + + for(is = m_start + min_i; is < m_end; is += min_i){ + + min_i = m_end - is; + + if (min_i >= GEMM_P * 2) { + min_i = GEMM_P; + } else + if (min_i > GEMM_P) { + min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); + } + + aa = sb + min_l * (is - js) * COMPSIZE; + + if (is < js + min_j) { + + ICOPY_OPERATION(min_l, min_i, b, ldb, ls, is, sa); + + OCOPY_OPERATION(min_l, min_i, a, lda, ls, is, aa); + + KERNEL_OPERATION_C(min_i, MIN(min_i, min_j - is + js), min_l, alpha, sa, aa, c, ldc, is, is, 0); + + KERNEL_OPERATION_C(min_i, is - js, min_l, alpha, sa, sb, c, ldc, is, js, 0); + + } else { + + ICOPY_OPERATION(min_l, min_i, b, ldb, ls, is, sa); + + KERNEL_OPERATION_C(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js, 0); + + } + + } + + + +#endif + } + } + + return 0; +} diff --git a/driver/level3/level3_syrk.c b/driver/level3/level3_syrk.c new file mode 100644 index 0000000000..249c140cda --- /dev/null +++ b/driver/level3/level3_syrk.c @@ -0,0 +1,495 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef KERNEL_OPERATION +#ifndef COMPLEX +#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ + KERNEL_FUNC(M, N, K, ALPHA[0], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y)) +#else +#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ + KERNEL_FUNC(M, N, K, ALPHA[0], ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y)) +#endif +#endif + +#ifndef ICOPY_OPERATION +#ifndef TRANS +#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); +#else +#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); +#endif +#endif + +#ifndef OCOPY_OPERATION +#ifdef TRANS +#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); +#else +#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); +#endif +#endif + +#ifndef M +#define M args -> n +#endif + +#ifndef N +#define N args -> n +#endif + +#ifndef K +#define K args -> k +#endif + +#ifndef A +#define A args -> a +#endif + +#ifndef C +#define C args -> c +#endif + +#ifndef LDA +#define LDA args -> lda +#endif + +#ifndef LDC +#define LDC args -> ldc +#endif + +#ifdef TIMING +#define START_RPCC() rpcc_counter = rpcc() +#define STOP_RPCC(COUNTER) COUNTER += rpcc() - rpcc_counter +#else +#define START_RPCC() +#define STOP_RPCC(COUNTER) +#endif + +int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG dummy) { + + BLASLONG m_from, m_to, n_from, n_to, k, lda, ldc; + FLOAT *a, *c, *alpha, *beta; + + BLASLONG ls, is, js; + BLASLONG min_l, min_i, min_j; + BLASLONG jjs, min_jj; + BLASLONG m_start, m_end; + + int shared = ((GEMM_UNROLL_M == GEMM_UNROLL_N) && !HAVE_EX_L2); + + FLOAT *aa; + +#ifdef TIMING + unsigned long long rpcc_counter; + unsigned long long innercost = 0; + unsigned long long outercost = 0; + unsigned long long kernelcost = 0; + double total; +#endif + + k = K; + + a = (FLOAT *)A; + c = (FLOAT *)C; + + lda = LDA; + ldc = LDC; + + alpha = (FLOAT *)args -> alpha; + beta = (FLOAT *)args -> beta; + + m_from = 0; + m_to = M; + + if (range_m) { + m_from = *(((BLASLONG *)range_m) + 0); + m_to = *(((BLASLONG *)range_m) + 1); + } + + n_from = 0; + n_to = N; + + if (range_n) { + n_from = *(((BLASLONG *)range_n) + 0); + n_to = *(((BLASLONG *)range_n) + 1); + } + + if (beta) { +#if !defined(COMPLEX) || defined(HERK) + if (beta[0] != ONE) +#else + if ((beta[0] != ONE) || (beta[1] != ZERO)) +#endif + syrk_beta(m_from, m_to, n_from, n_to, beta, c, ldc); + } + + if ((k == 0) || (alpha == NULL)) return 0; + + if ((alpha[0] == ZERO) +#if defined(COMPLEX) && !defined(HERK) + && (alpha[1] == ZERO) +#endif + ) return 0; + +#if 0 + fprintf(stderr, "m_from : %ld m_to : %ld n_from : %ld n_to : %ld\n", + m_from, m_to, n_from, n_to); +#endif + + for(js = n_from; js < n_to; js += GEMM_R){ + min_j = n_to - js; + if (min_j > GEMM_R) min_j = GEMM_R; + +#ifndef LOWER + m_start = m_from; + m_end = js + min_j; + if (m_end > m_to) m_end = m_to; 
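+ /* Only one triangle of C is referenced: with the upper triangle stored (this branch) a column panel starting at js touches row blocks up to js + min_j only, so m_end is clamped there; in the LOWER branch below the row range instead begins at m_start = js. */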
+#else + m_start = m_from; + m_end = m_to; + if (m_start < js) m_start = js; +#endif + + for(ls = 0; ls < k; ls += min_l){ + min_l = k - ls; + if (min_l >= GEMM_Q * 2) { + min_l = GEMM_Q; + } else + if (min_l > GEMM_Q) { + min_l = (min_l + 1) / 2; + } + + min_i = m_end - m_start; + + if (min_i >= GEMM_P * 2) { + min_i = GEMM_P; + } else + if (min_i > GEMM_P) { + min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); + } + +#ifndef LOWER + + if (m_end >= js) { + + aa = sb + min_l * MAX(m_start - js, 0) * COMPSIZE; + if (!shared) aa = sa; + + for(jjs = MAX(m_start, js); jjs < js + min_j; jjs += min_jj){ + min_jj = js + min_j - jjs; + if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN; + + if (!shared && (jjs - MAX(m_start, js) < min_i)) { + START_RPCC(); + + ICOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, sa + min_l * (jjs - js) * COMPSIZE); + + STOP_RPCC(innercost); + } + + START_RPCC(); + + OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE); + + STOP_RPCC(outercost); + + START_RPCC(); + + KERNEL_OPERATION(min_i, min_jj, min_l, alpha, aa, sb + min_l * (jjs - js) * COMPSIZE, c, ldc, MAX(m_start, js), jjs); + + STOP_RPCC(kernelcost); + } + + for(is = MAX(m_start, js) + min_i; is < m_end; is += min_i){ + min_i = m_end - is; + if (min_i >= GEMM_P * 2) { + min_i = GEMM_P; + } else + if (min_i > GEMM_P) { + min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); + } + + aa = sb + min_l * (is - js) * COMPSIZE; + + if (!shared) { + + START_RPCC(); + + ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa); + + STOP_RPCC(innercost); + + aa = sa; + } + + START_RPCC(); + + KERNEL_OPERATION(min_i, min_j, min_l, alpha, aa, sb, c, ldc, is, js); + + STOP_RPCC(kernelcost); + + } + + } + + if (m_start < js) { + + if (m_end < js) { + + START_RPCC(); + + ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, sa); + + STOP_RPCC(innercost); + + for(jjs = js; jjs < js + min_j; jjs += GEMM_UNROLL_MN){ + min_jj = min_j + js - jjs; + if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN; + + START_RPCC(); + + OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE); + + STOP_RPCC(outercost); + + START_RPCC(); + + KERNEL_OPERATION(min_i, min_jj, min_l, alpha, sa, sb + min_l * (jjs - js) * COMPSIZE, c, ldc, m_start, jjs); + + STOP_RPCC(kernelcost); + + } + } else { + min_i = 0; + } + + for(is = m_start + min_i; is < MIN(m_end, js); is += min_i){ + + min_i = MIN(m_end, js)- is; + if (min_i >= GEMM_P * 2) { + min_i = GEMM_P; + } else + if (min_i > GEMM_P) { + min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); + } + + START_RPCC(); + + ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa); + + STOP_RPCC(innercost); + + START_RPCC(); + + KERNEL_OPERATION(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js); + + STOP_RPCC(kernelcost); + + } + } + +#else + + if (m_start < js + min_j) { + + aa = sb + min_l * (m_start - js) * COMPSIZE; + + if (!shared) { + + START_RPCC(); + + ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, sa); + + STOP_RPCC(innercost); + + } + + START_RPCC(); + + OCOPY_OPERATION(min_l, (shared? (min_i) : MIN(min_i, min_j + js - m_start)), a, lda, ls, m_start, aa); + + STOP_RPCC(outercost); + + START_RPCC(); + + KERNEL_OPERATION(min_i, MIN(min_i, min_j + js - m_start), min_l, alpha, (shared? 
(aa) : (sa)), aa, c, ldc, m_start, m_start); + + STOP_RPCC(kernelcost); + + for(jjs = js; jjs < m_start; jjs += GEMM_UNROLL_N){ + min_jj = m_start - jjs; + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + + START_RPCC(); + + OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE); + + STOP_RPCC(outercost); + + START_RPCC(); + + KERNEL_OPERATION(min_i, min_jj, min_l, alpha, (shared? (aa) : (sa)), sb + min_l * (jjs - js) * COMPSIZE, c, ldc, m_start, jjs); + + STOP_RPCC(kernelcost); + + } + + for(is = m_start + min_i; is < m_end; is += min_i){ + + min_i = m_end - is; + + if (min_i >= GEMM_P * 2) { + min_i = GEMM_P; + } else + if (min_i > GEMM_P) { + min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); + } + + if (is < js + min_j) { + + if (!shared) { + START_RPCC(); + + ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa); + + STOP_RPCC(innercost); + } + + aa = sb + min_l * (is - js) * COMPSIZE; + + START_RPCC(); + + OCOPY_OPERATION(min_l, (shared? (min_i) : MIN(min_i, min_j - is + js)), a, lda, ls, is, aa); + + STOP_RPCC(outercost); + + START_RPCC(); + + KERNEL_OPERATION(min_i, MIN(min_i, min_j - is + js), min_l, alpha, (shared? (aa) : (sa)), aa, c, ldc, is, is); + + STOP_RPCC(kernelcost); + + START_RPCC(); + + KERNEL_OPERATION(min_i, is - js, min_l, alpha, (shared? (aa) : (sa)), sb, c, ldc, is, js); + + STOP_RPCC(kernelcost); + + } else { + + START_RPCC(); + + ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa); + + STOP_RPCC(innercost); + + START_RPCC(); + + KERNEL_OPERATION(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js); + + STOP_RPCC(kernelcost); + + } + + } + + } else { + + START_RPCC(); + + ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_start, sa); + + STOP_RPCC(innercost); + + for(jjs = js; jjs < min_j; jjs += GEMM_UNROLL_N){ + min_jj = min_j - jjs; + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + + START_RPCC(); + + OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, sb + min_l * (jjs - js) * COMPSIZE); + + STOP_RPCC(outercost); + + START_RPCC(); + + KERNEL_OPERATION(min_i, min_jj, min_l, alpha, sa, sb + min_l * (jjs - js) * COMPSIZE, c, ldc, m_start, jjs); + + STOP_RPCC(kernelcost); + + } + + for(is = m_start + min_i; is < m_end; is += min_i){ + + min_i = m_end - is; + + if (min_i >= GEMM_P * 2) { + min_i = GEMM_P; + } else + if (min_i > GEMM_P) { + min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); + } + + START_RPCC(); + + ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa); + + STOP_RPCC(innercost); + + START_RPCC(); + + KERNEL_OPERATION(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js); + + STOP_RPCC(kernelcost); + + } + } +#endif + } + } + +#ifdef TIMING + total = (double)outercost + (double)innercost + (double)kernelcost; + + printf( "Copy A : %5.2f Copy B: %5.2f Kernel : %5.2f kernel Effi. : %5.2f Total Effi. : %5.2f\n", + innercost / total * 100., outercost / total * 100., kernelcost / total * 100., + (double)(m_to - m_from) * (double)(n_to - n_from) * (double)k / (double)kernelcost * 100. * (double)COMPSIZE / (double)DNUMOPT, + (double)(m_to - m_from) * (double)(n_to - n_from) * (double)k / total * 100. * (double)COMPSIZE / (double)DNUMOPT); + +#endif + + return 0; +} diff --git a/driver/level3/level3_syrk_threaded.c b/driver/level3/level3_syrk_threaded.c new file mode 100644 index 0000000000..9d1f4d2a04 --- /dev/null +++ b/driver/level3/level3_syrk_threaded.c @@ -0,0 +1,673 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. 
*/ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef CACHE_LINE_SIZE +#define CACHE_LINE_SIZE 8 +#endif + +#ifndef DIVIDE_RATE +#define DIVIDE_RATE 2 +#endif + +#ifndef SWITCH_RATIO +#define SWITCH_RATIO 2 +#endif + +#ifndef SYRK_LOCAL +#if !defined(LOWER) && !defined(TRANS) +#define SYRK_LOCAL SYRK_UN +#elif !defined(LOWER) && defined(TRANS) +#define SYRK_LOCAL SYRK_UT +#elif defined(LOWER) && !defined(TRANS) +#define SYRK_LOCAL SYRK_LN +#else +#define SYRK_LOCAL SYRK_LT +#endif +#endif + +typedef struct { + volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; +} job_t; + + +#ifndef KERNEL_OPERATION +#ifndef COMPLEX +#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ + KERNEL_FUNC(M, N, K, ALPHA[0], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y)) +#else +#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ + KERNEL_FUNC(M, N, K, ALPHA[0], ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y)) +#endif +#endif + +#ifndef ICOPY_OPERATION +#ifndef TRANS +#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); +#else +#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); +#endif +#endif + +#ifndef OCOPY_OPERATION +#ifdef TRANS +#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); +#else +#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); +#endif +#endif + +#ifndef A +#define A args -> a +#endif +#ifndef LDA +#define LDA args -> lda +#endif +#ifndef C +#define C args -> c 
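+ /* unless overridden by the including file, operands, leading dimensions */
+ /* and problem sizes default to the fields of the blas_arg_t descriptor */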
+#endif +#ifndef LDC +#define LDC args -> ldc +#endif +#ifndef M +#define M args -> m +#endif +#ifndef N +#define N args -> n +#endif +#ifndef K +#define K args -> k +#endif + +#undef TIMING + +#ifdef TIMING +#define START_RPCC() rpcc_counter = rpcc() +#define STOP_RPCC(COUNTER) COUNTER += rpcc() - rpcc_counter +#else +#define START_RPCC() +#define STOP_RPCC(COUNTER) +#endif + +static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ + + FLOAT *buffer[DIVIDE_RATE]; + + BLASLONG k, lda, ldc; + BLASLONG m_from, m_to, n_from, n_to; + + FLOAT *alpha, *beta; + FLOAT *a, *c; + job_t *job = (job_t *)args -> common; + BLASLONG xxx, bufferside; + + BLASLONG ls, min_l, jjs, min_jj; + BLASLONG is, min_i, div_n; + + BLASLONG i, current; +#ifdef LOWER + BLASLONG start_i; +#endif + +#ifdef TIMING + BLASLONG rpcc_counter; + BLASLONG copy_A = 0; + BLASLONG copy_B = 0; + BLASLONG kernel = 0; + BLASLONG waiting1 = 0; + BLASLONG waiting2 = 0; + BLASLONG waiting3 = 0; + BLASLONG waiting6[MAX_CPU_NUMBER]; + BLASLONG ops = 0; + + for (i = 0; i < args -> nthreads; i++) waiting6[i] = 0; +#endif + + k = K; + + a = (FLOAT *)A; + c = (FLOAT *)C; + + lda = LDA; + ldc = LDC; + + alpha = (FLOAT *)args -> alpha; + beta = (FLOAT *)args -> beta; + + m_from = 0; + m_to = N; + + /* Global Range */ + n_from = 0; + n_to = N; + + if (range_n) { + m_from = range_n[mypos + 0]; + m_to = range_n[mypos + 1]; + + n_from = range_n[0]; + n_to = range_n[args -> nthreads]; + } + + if (beta) { +#if !defined(COMPLEX) || defined(HERK) + if (beta[0] != ONE) +#else + if ((beta[0] != ONE) || (beta[1] != ZERO)) +#endif + syrk_beta(m_from, m_to, n_from, n_to, beta, c, ldc); + } + + if ((k == 0) || (alpha == NULL)) return 0; + + if ((alpha[0] == ZERO) +#if defined(COMPLEX) && !defined(HERK) + && (alpha[1] == ZERO) +#endif + ) return 0; + +#if 0 + fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld n_from : %ld n_to : %ld\n", mypos, m_from, m_to, n_from, n_to); +#endif + + div_n = ((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE + + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); + + buffer[0] = sb; + for (i = 1; i < DIVIDE_RATE; i++) { + buffer[i] = buffer[i - 1] + GEMM_Q * div_n * COMPSIZE; + } + + for(ls = 0; ls < k; ls += min_l){ + + min_l = k - ls; + if (min_l >= GEMM_Q * 2) { + min_l = GEMM_Q; + } else { + if (min_l > GEMM_Q) min_l = (min_l + 1) / 2; + } + + min_i = m_to - m_from; + + if (min_i >= GEMM_P * 2) { + min_i = GEMM_P; + } else { + if (min_i > GEMM_P) { + min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); + } + } + +#ifdef LOWER + xxx = (m_to - m_from - min_i) % GEMM_P; + + if (xxx) min_i -= GEMM_P - xxx; +#endif + + START_RPCC(); + +#ifndef LOWER + ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_from, sa); +#else + ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_to - min_i, sa); +#endif + + STOP_RPCC(copy_A); + + div_n = ((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE + + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); + + for (xxx = m_from, bufferside = 0; xxx < m_to; xxx += div_n, bufferside ++) { + + START_RPCC(); + + /* Make sure if no one is using buffer */ +#ifndef LOWER + for (i = 0; i < mypos; i++) +#else + for (i = mypos + 1; i < args -> nthreads; i++) +#endif + while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;}; + + STOP_RPCC(waiting1); + +#ifndef LOWER + + for(jjs = xxx; jjs < MIN(m_to, xxx + div_n); jjs += min_jj){ + + min_jj = MIN(m_to, xxx + div_n) - jjs; + + if (xxx == m_from) { + if (min_jj > min_i) min_jj = min_i; + } else { + 
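+ /* sub-panels after the first are packed GEMM_UNROLL_MN columns at a time */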
if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN; + } + + START_RPCC(); + + OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, + buffer[bufferside] + min_l * (jjs - xxx) * COMPSIZE); + + STOP_RPCC(copy_B); + + START_RPCC(); + + KERNEL_OPERATION(min_i, min_jj, min_l, alpha, + sa, buffer[bufferside] + min_l * (jjs - xxx) * COMPSIZE, + c, ldc, m_from, jjs); + + STOP_RPCC(kernel); + +#ifdef TIMING + ops += 2 * min_i * min_jj * min_l; +#endif + + } + +#else + + for(jjs = xxx; jjs < MIN(m_to, xxx + div_n); jjs += min_jj){ + + min_jj = MIN(m_to, xxx + div_n) - jjs; + + if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN; + + START_RPCC(); + + OCOPY_OPERATION(min_l, min_jj, a, lda, ls, jjs, + buffer[bufferside] + min_l * (jjs - xxx) * COMPSIZE); + + STOP_RPCC(copy_B); + + START_RPCC(); + + KERNEL_OPERATION(min_i, min_jj, min_l, alpha, + sa, buffer[bufferside] + min_l * (jjs - xxx) * COMPSIZE, + c, ldc, m_to - min_i, jjs); + + STOP_RPCC(kernel); + +#ifdef TIMING + ops += 2 * min_i * min_jj * min_l; +#endif + + } + +#endif + +#ifndef LOWER + for (i = 0; i <= mypos; i++) +#else + for (i = mypos; i < args -> nthreads; i++) +#endif + job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; + + WMB; + } + + +#ifndef LOWER + current = mypos + 1; + while (current < args -> nthreads) { +#else + current = mypos - 1; + while (current >= 0) { +#endif + + div_n = ((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); + + for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { + + START_RPCC(); + + /* thread has to wait */ + while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; + + STOP_RPCC(waiting2); + + START_RPCC(); + +#ifndef LOWER + KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, alpha, + sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], + c, ldc, + m_from, + xxx); +#else + KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, alpha, + sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], + c, ldc, + m_to - min_i, + xxx); +#endif + + STOP_RPCC(kernel); +#ifdef TIMING + ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l; +#endif + + if (m_to - m_from == min_i) { + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; + } + } + +#ifndef LOWER + current ++; +#else + current --; +#endif + } + +#ifndef LOWER + for(is = m_from + min_i; is < m_to; is += min_i){ + min_i = m_to - is; +#else + start_i = min_i; + + for(is = m_from; is < m_to - start_i; is += min_i){ + min_i = m_to - start_i - is; +#endif + + if (min_i >= GEMM_P * 2) { + min_i = GEMM_P; + } else + if (min_i > GEMM_P) { + min_i = ((min_i + 1) / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); + } + + START_RPCC(); + + ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa); + + STOP_RPCC(copy_A); + + current = mypos; + + do { + + div_n = ((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); + + for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { + + START_RPCC(); + + KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, alpha, + sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], + c, ldc, is, xxx); + + STOP_RPCC(kernel); + +#ifdef TIMING + ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l; +#endif + +#ifndef 
LOWER + if (is + min_i >= m_to) { +#else + if (is + min_i >= m_to - start_i) { +#endif + /* Thread doesn't need this buffer any more */ + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; + WMB; + } + } + +#ifndef LOWER + current ++; + } while (current != args -> nthreads); +#else + current --; + } while (current >= 0); +#endif + + + } + } + + START_RPCC(); + + for (i = 0; i < args -> nthreads; i++) { + if (i != mypos) { + for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { + while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;}; + } + } + } + + STOP_RPCC(waiting3); + +#ifdef TIMING + BLASLONG waiting = waiting1 + waiting2 + waiting3; + BLASLONG total = copy_A + copy_B + kernel + waiting; + + fprintf(stderr, "GEMM [%2ld] Copy_A : %6.2f Copy_B : %6.2f Wait1 : %6.2f Wait2 : %6.2f Wait3 : %6.2f Kernel : %6.2f", + mypos, (double)copy_A /(double)total * 100., (double)copy_B /(double)total * 100., + (double)waiting1 /(double)total * 100., + (double)waiting2 /(double)total * 100., + (double)waiting3 /(double)total * 100., + (double)ops/(double)kernel / 4. * 100.); + +#if 0 + fprintf(stderr, "GEMM [%2ld] Copy_A : %6.2ld Copy_B : %6.2ld Wait : %6.2ld\n", + mypos, copy_A, copy_B, waiting); + + fprintf(stderr, "Waiting[%2ld] %6.2f %6.2f %6.2f\n", + mypos, + (double)waiting1/(double)waiting * 100., + (double)waiting2/(double)waiting * 100., + (double)waiting3/(double)waiting * 100.); +#endif + fprintf(stderr, "\n"); +#endif + + return 0; +} + +int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ + + blas_arg_t newarg; + + job_t job[MAX_CPU_NUMBER]; + blas_queue_t queue[MAX_CPU_NUMBER]; + + BLASLONG range[MAX_CPU_NUMBER + 100]; + + BLASLONG num_cpu; + + BLASLONG nthreads = args -> nthreads; + + BLASLONG width, i, j, k; + BLASLONG n, n_from, n_to; + int mode, mask; + double dnum; + + if ((nthreads == 1) || (args -> n < nthreads * SWITCH_RATIO)) { + SYRK_LOCAL(args, range_m, range_n, sa, sb, 0); + return 0; + } + +#ifndef COMPLEX +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_REAL; + mask = MAX(QGEMM_UNROLL_M, QGEMM_UNROLL_N) - 1; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_REAL; + mask = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1; +#else + mode = BLAS_SINGLE | BLAS_REAL; + mask = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1; +#endif +#else +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_COMPLEX; + mask = MAX(XGEMM_UNROLL_M, XGEMM_UNROLL_N) - 1; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_COMPLEX; + mask = MAX(ZGEMM_UNROLL_M, ZGEMM_UNROLL_N) - 1; +#else + mode = BLAS_SINGLE | BLAS_COMPLEX; + mask = MAX(CGEMM_UNROLL_M, CGEMM_UNROLL_N) - 1; +#endif +#endif + + newarg.m = args -> m; + newarg.n = args -> n; + newarg.k = args -> k; + newarg.a = args -> a; + newarg.b = args -> b; + newarg.c = args -> c; + newarg.lda = args -> lda; + newarg.ldb = args -> ldb; + newarg.ldc = args -> ldc; + newarg.alpha = args -> alpha; + newarg.beta = args -> beta; + newarg.common = (void *)job; + + if (!range_n) { + n_from = 0; + n_to = args -> n; + } else { + n_from = range_n[0]; + n_to = range_n[1] - range_n[0]; + } + +#ifndef LOWER + + range[MAX_CPU_NUMBER] = n_to - n_from; + range[0] = 0; + num_cpu = 0; + i = 0; + n = n_to - n_from; + + dnum = (double)n * (double)n /(double)nthreads; + + while (i < n){ + + if (nthreads - num_cpu > 1) { + + double di = (double)i; + + width = (((BLASLONG)(sqrt(di * di + dnum) - di) + mask) & ~mask); + + if (num_cpu == 0) width = n - ((n - width) & ~mask); + + if ((width > n - i) || (width < mask)) width = n - i; + + } else { 
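+ /* the last thread simply takes every remaining column */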
+ width = n - i; + } + + range[MAX_CPU_NUMBER - num_cpu - 1] = range[MAX_CPU_NUMBER - num_cpu] - width; + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = inner_thread; + queue[num_cpu].args = &newarg; + queue[num_cpu].range_m = range_m; + + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + + for (i = 0; i < num_cpu; i ++) queue[i].range_n = &range[MAX_CPU_NUMBER - num_cpu]; + +#else + + range[0] = 0; + num_cpu = 0; + i = 0; + n = n_to - n_from; + + dnum = (double)n * (double)n /(double)nthreads; + + while (i < n){ + + if (nthreads - num_cpu > 1) { + + double di = (double)i; + + width = (((BLASLONG)(sqrt(di * di + dnum) - di) + mask) & ~mask); + + if ((width > n - i) || (width < mask)) width = n - i; + + } else { + width = n - i; + } + + range[num_cpu + 1] = range[num_cpu] + width; + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = inner_thread; + queue[num_cpu].args = &newarg; + queue[num_cpu].range_m = range_m; + queue[num_cpu].range_n = range; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + +#endif + + newarg.nthreads = num_cpu; + + if (num_cpu) { + + for (j = 0; j < num_cpu; j++) { + for (i = 0; i < num_cpu; i++) { + for (k = 0; k < DIVIDE_RATE; k++) { + job[j].working[i][CACHE_LINE_SIZE * k] = 0; + } + } + } + + queue[0].sa = sa; + queue[0].sb = sb; + queue[num_cpu - 1].next = NULL; + + exec_blas(num_cpu, queue); + } + + + return 0; +} diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c new file mode 100644 index 0000000000..000d423974 --- /dev/null +++ b/driver/level3/level3_thread.c @@ -0,0 +1,743 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef CACHE_LINE_SIZE +#define CACHE_LINE_SIZE 8 +#endif + +#ifndef DIVIDE_RATE +#define DIVIDE_RATE 2 +#endif + +#ifndef SWITCH_RATIO +#define SWITCH_RATIO 2 +#endif + +#ifndef GEMM_LOCAL +#if defined(NN) +#define GEMM_LOCAL GEMM_NN +#elif defined(NT) +#define GEMM_LOCAL GEMM_NT +#elif defined(NR) +#define GEMM_LOCAL GEMM_NR +#elif defined(NC) +#define GEMM_LOCAL GEMM_NC +#elif defined(TN) +#define GEMM_LOCAL GEMM_TN +#elif defined(TT) +#define GEMM_LOCAL GEMM_TT +#elif defined(TR) +#define GEMM_LOCAL GEMM_TR +#elif defined(TC) +#define GEMM_LOCAL GEMM_TC +#elif defined(RN) +#define GEMM_LOCAL GEMM_RN +#elif defined(RT) +#define GEMM_LOCAL GEMM_RT +#elif defined(RR) +#define GEMM_LOCAL GEMM_RR +#elif defined(RC) +#define GEMM_LOCAL GEMM_RC +#elif defined(CN) +#define GEMM_LOCAL GEMM_CN +#elif defined(CT) +#define GEMM_LOCAL GEMM_CT +#elif defined(CR) +#define GEMM_LOCAL GEMM_CR +#elif defined(CC) +#define GEMM_LOCAL GEMM_CC +#endif +#endif + +typedef struct { + volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; +} job_t; + + +#ifndef BETA_OPERATION +#ifndef COMPLEX +#define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \ + GEMM_BETA((M_TO) - (M_FROM), (N_TO - N_FROM), 0, \ + BETA[0], NULL, 0, NULL, 0, \ + (FLOAT *)(C) + ((M_FROM) + (N_FROM) * (LDC)) * COMPSIZE, LDC) +#else +#define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \ + GEMM_BETA((M_TO) - (M_FROM), (N_TO - N_FROM), 0, \ + BETA[0], BETA[1], NULL, 0, NULL, 0, \ + (FLOAT *)(C) + ((M_FROM) + (N_FROM) * (LDC)) * COMPSIZE, LDC) +#endif +#endif + +#ifndef ICOPY_OPERATION +#if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \ + defined(RN) || defined(RT) || defined(RC) || defined(RR) +#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); +#else +#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); +#endif +#endif + +#ifndef OCOPY_OPERATION +#if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \ + defined(NR) || defined(TR) || defined(CR) || defined(RR) +#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); +#else +#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); +#endif +#endif + +#ifndef KERNEL_FUNC +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define KERNEL_FUNC GEMM_KERNEL_N +#endif +#if defined(CN) || defined(CT) || defined(RN) || defined(RT) +#define KERNEL_FUNC GEMM_KERNEL_L +#endif +#if defined(NC) || defined(TC) || defined(NR) || defined(TR) +#define KERNEL_FUNC GEMM_KERNEL_R +#endif +#if defined(CC) || defined(CR) || defined(RC) || defined(RR) +#define KERNEL_FUNC GEMM_KERNEL_B +#endif +#endif + +#ifndef KERNEL_OPERATION +#ifndef COMPLEX +#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ + KERNEL_FUNC(M, N, K, ALPHA[0], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC) +#else +#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ + KERNEL_FUNC(M, N, K, ALPHA[0], 
ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC) +#endif +#endif + +#ifndef FUSED_KERNEL_OPERATION +#if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \ + defined(NR) || defined(TR) || defined(CR) || defined(RR) +#ifndef COMPLEX +#define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \ + FUSED_GEMM_KERNEL_N(M, N, K, ALPHA[0], SA, SB, \ + (FLOAT *)(B) + ((L) + (J) * LDB) * COMPSIZE, LDB, (FLOAT *)(C) + ((I) + (J) * LDC) * COMPSIZE, LDC) +#else +#define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \ + FUSED_GEMM_KERNEL_N(M, N, K, ALPHA[0], ALPHA[1], SA, SB, \ + (FLOAT *)(B) + ((L) + (J) * LDB) * COMPSIZE, LDB, (FLOAT *)(C) + ((I) + (J) * LDC) * COMPSIZE, LDC) + +#endif +#else +#ifndef COMPLEX +#define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \ + FUSED_GEMM_KERNEL_T(M, N, K, ALPHA[0], SA, SB, \ + (FLOAT *)(B) + ((J) + (L) * LDB) * COMPSIZE, LDB, (FLOAT *)(C) + ((I) + (J) * LDC) * COMPSIZE, LDC) +#else +#define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \ + FUSED_GEMM_KERNEL_T(M, N, K, ALPHA[0], ALPHA[1], SA, SB, \ + (FLOAT *)(B) + ((J) + (L) * LDB) * COMPSIZE, LDB, (FLOAT *)(C) + ((I) + (J) * LDC) * COMPSIZE, LDC) +#endif +#endif +#endif + +#ifndef A +#define A args -> a +#endif +#ifndef LDA +#define LDA args -> lda +#endif +#ifndef B +#define B args -> b +#endif +#ifndef LDB +#define LDB args -> ldb +#endif +#ifndef C +#define C args -> c +#endif +#ifndef LDC +#define LDC args -> ldc +#endif +#ifndef M +#define M args -> m +#endif +#ifndef N +#define N args -> n +#endif +#ifndef K +#define K args -> k +#endif + +#ifdef TIMING +#define START_RPCC() rpcc_counter = rpcc() +#define STOP_RPCC(COUNTER) COUNTER += rpcc() - rpcc_counter +#else +#define START_RPCC() +#define STOP_RPCC(COUNTER) +#endif + +static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ + + FLOAT *buffer[DIVIDE_RATE]; + + BLASLONG k, lda, ldb, ldc; + BLASLONG m_from, m_to, n_from, n_to, N_from, N_to; + + FLOAT *alpha, *beta; + FLOAT *a, *b, *c; + job_t *job = (job_t *)args -> common; + BLASLONG xxx, bufferside; + + BLASLONG ls, min_l, jjs, min_jj; + BLASLONG is, min_i, div_n; + + BLASLONG i, current; + BLASLONG l1stride, l2size; + +#ifdef TIMING + BLASULONG rpcc_counter; + BLASULONG copy_A = 0; + BLASULONG copy_B = 0; + BLASULONG kernel = 0; + BLASULONG waiting1 = 0; + BLASULONG waiting2 = 0; + BLASULONG waiting3 = 0; + BLASULONG waiting6[MAX_CPU_NUMBER]; + BLASULONG ops = 0; + + for (i = 0; i < args -> nthreads; i++) waiting6[i] = 0; +#endif + + k = K; + + a = (FLOAT *)A; + b = (FLOAT *)B; + c = (FLOAT *)C; + + lda = LDA; + ldb = LDB; + ldc = LDC; + + alpha = (FLOAT *)args -> alpha; + beta = (FLOAT *)args -> beta; + + m_from = 0; + m_to = M; + + if (range_m) { + m_from = range_m[0]; + m_to = range_m[1]; + } + + n_from = 0; + n_to = N; + + N_from = 0; + N_to = N; + + if (range_n) { + n_from = range_n[mypos + 0]; + n_to = range_n[mypos + 1]; + + N_from = range_n[0]; + N_to = range_n[args -> nthreads]; + } + + if (beta) { +#ifndef COMPLEX + if (beta[0] != ONE) +#else + if ((beta[0] != ONE) || (beta[1] != ZERO)) +#endif + BETA_OPERATION(m_from, m_to, N_from, N_to, beta, c, ldc); + } + + if ((k == 0) || (alpha == NULL)) return 0; + + if ((alpha[0] == ZERO) +#ifdef COMPLEX + && (alpha[1] == ZERO) +#endif + ) return 0; + + l2size = GEMM_P * GEMM_Q; + +#if 0 + fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld n_from : %ld n_to : 
%ld N_from : %ld N_to : %ld\n", + mypos, m_from, m_to, n_from, n_to, N_from, N_to); + + fprintf(stderr, "GEMM: P = %4ld Q = %4ld R = %4ld\n", (BLASLONG)GEMM_P, (BLASLONG)GEMM_Q, (BLASLONG)GEMM_R); + +#endif + + div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE; + + buffer[0] = sb; + for (i = 1; i < DIVIDE_RATE; i++) { + buffer[i] = buffer[i - 1] + GEMM_Q * ((div_n + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1)) * COMPSIZE; + } + + + for(ls = 0; ls < k; ls += min_l){ + + min_l = k - ls; + + if (min_l >= GEMM_Q * 2) { + min_l = GEMM_Q; + } else { + if (min_l > GEMM_Q) min_l = (min_l + 1) / 2; + } + + l1stride = 1; + min_i = m_to - m_from; + + if (min_i >= GEMM_P * 2) { + min_i = GEMM_P; + } else { + if (min_i > GEMM_P) { + min_i = (min_i / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1); + } else { + if (args -> nthreads == 1) l1stride = 0; + } + } + + START_RPCC(); + + ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_from, sa); + + STOP_RPCC(copy_A); + + div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE; + + for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) { + + START_RPCC(); + + /* Make sure if no one is using buffer */ + for (i = 0; i < args -> nthreads; i++) + while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;}; + + STOP_RPCC(waiting1); + +#if defined(FUSED_GEMM) && !defined(TIMING) + + FUSED_KERNEL_OPERATION(min_i, MIN(n_to, xxx + div_n) - xxx, min_l, alpha, + sa, buffer[bufferside], b, ldb, c, ldc, m_from, xxx, ls); + +#else + + for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){ + min_jj = MIN(n_to, xxx + div_n) - jjs; + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + + START_RPCC(); + + OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, + buffer[bufferside] + min_l * (jjs - xxx) * COMPSIZE * l1stride); + + STOP_RPCC(copy_B); + + START_RPCC(); + + KERNEL_OPERATION(min_i, min_jj, min_l, alpha, + sa, buffer[bufferside] + min_l * (jjs - xxx) * COMPSIZE * l1stride, + c, ldc, m_from, jjs); + + STOP_RPCC(kernel); + +#ifdef TIMING + ops += 2 * min_i * min_jj * min_l; +#endif + + } +#endif + + for (i = 0; i < args -> nthreads; i++) job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; + WMB; + } + + current = mypos; + + do { + current ++; + if (current >= args -> nthreads) current = 0; + + div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE; + + for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { + + if (current != mypos) { + + START_RPCC(); + + /* thread has to wait */ + while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; + + STOP_RPCC(waiting2); + + START_RPCC(); + + KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, alpha, + sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], + c, ldc, m_from, xxx); + + STOP_RPCC(kernel); +#ifdef TIMING + ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l; +#endif + } + + if (m_to - m_from == min_i) { + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; + } + } + } while (current != mypos); + + + for(is = m_from + min_i; is < m_to; is += min_i){ + min_i = m_to - is; + + if (min_i >= GEMM_P * 2) { + min_i = GEMM_P; + } else + if (min_i > GEMM_P) { + min_i = ((min_i + 1) / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1); + } + + START_RPCC(); + + ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa); + + STOP_RPCC(copy_A); + + current = mypos; + do { + + div_n = (range_n[current + 1] - 
range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE; + + for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { + + START_RPCC(); + + KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), min_l, alpha, + sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], + c, ldc, is, xxx); + + STOP_RPCC(kernel); + +#ifdef TIMING + ops += 2 * min_i * MIN(range_n[current + 1] - xxx, div_n) * min_l; +#endif + + if (is + min_i >= m_to) { + /* Thread doesn't need this buffer any more */ + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; + WMB; + } + } + + current ++; + if (current >= args -> nthreads) current = 0; + + } while (current != mypos); + + } + + } + + START_RPCC(); + + for (i = 0; i < args -> nthreads; i++) { + for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { + while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;}; + } + } + + STOP_RPCC(waiting3); + +#ifdef TIMING + BLASLONG waiting = waiting1 + waiting2 + waiting3; + BLASLONG total = copy_A + copy_B + kernel + waiting; + + fprintf(stderr, "GEMM [%2ld] Copy_A : %6.2f Copy_B : %6.2f Wait1 : %6.2f Wait2 : %6.2f Wait3 : %6.2f Kernel : %6.2f", + mypos, (double)copy_A /(double)total * 100., (double)copy_B /(double)total * 100., + (double)waiting1 /(double)total * 100., + (double)waiting2 /(double)total * 100., + (double)waiting3 /(double)total * 100., + (double)ops/(double)kernel / 4. * 100.); + +#if 0 + fprintf(stderr, "GEMM [%2ld] Copy_A : %6.2ld Copy_B : %6.2ld Wait : %6.2ld\n", + mypos, copy_A, copy_B, waiting); + + fprintf(stderr, "Waiting[%2ld] %6.2f %6.2f %6.2f\n", + mypos, + (double)waiting1/(double)waiting * 100., + (double)waiting2/(double)waiting * 100., + (double)waiting3/(double)waiting * 100.); +#endif + fprintf(stderr, "\n"); +#endif + + return 0; +} + +static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG + *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ + + blas_arg_t newarg; + + job_t job[MAX_CPU_NUMBER]; + blas_queue_t queue[MAX_CPU_NUMBER]; + + BLASLONG range_M[MAX_CPU_NUMBER + 1]; + BLASLONG range_N[MAX_CPU_NUMBER + 1]; + + BLASLONG num_cpu_m, num_cpu_n; + + BLASLONG nthreads = args -> nthreads; + + BLASLONG width, i, j, k, js; + BLASLONG m, n, n_from, n_to; + int mode; + +#ifndef COMPLEX +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_REAL | BLAS_NODE; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_REAL | BLAS_NODE; +#else + mode = BLAS_SINGLE | BLAS_REAL | BLAS_NODE; +#endif +#else +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_COMPLEX | BLAS_NODE; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_COMPLEX | BLAS_NODE; +#else + mode = BLAS_SINGLE | BLAS_COMPLEX | BLAS_NODE; +#endif +#endif + + newarg.m = args -> m; + newarg.n = args -> n; + newarg.k = args -> k; + newarg.a = args -> a; + newarg.b = args -> b; + newarg.c = args -> c; + newarg.lda = args -> lda; + newarg.ldb = args -> ldb; + newarg.ldc = args -> ldc; + newarg.alpha = args -> alpha; + newarg.beta = args -> beta; + newarg.nthreads = args -> nthreads; + newarg.common = (void *)job; + +#ifdef PARAMTEST + newarg.gemm_p = args -> gemm_p; + newarg.gemm_q = args -> gemm_q; + newarg.gemm_r = args -> gemm_r; +#endif + + if (!range_m) { + range_M[0] = 0; + m = args -> m; + } else { + range_M[0] = range_m[0]; + m = range_m[1] - range_m[0]; + } + + num_cpu_m = 0; + + while (m > 0){ + + width = blas_quickdivide(m + nthreads - num_cpu_m - 1, nthreads - num_cpu_m); + + m -= width; + if (m < 0) width = width + m; + + range_M[num_cpu_m + 1] = range_M[num_cpu_m] + width; + + 
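+ /* rows are split as evenly as possible over the remaining threads */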
num_cpu_m ++; + } + + for (i = 0; i < num_cpu_m; i++) { + queue[i].mode = mode; + queue[i].routine = inner_thread; + queue[i].args = &newarg; + queue[i].range_m = &range_M[i]; + queue[i].range_n = &range_N[0]; + queue[i].sa = NULL; + queue[i].sb = NULL; + queue[i].next = &queue[i + 1]; + } + + queue[0].sa = sa; + queue[0].sb = sb; + + if (!range_n) { + n_from = 0; + n_to = args -> n; + } else { + n_from = range_n[0]; + n_to = range_n[1]; + } + + for(js = n_from; js < n_to; js += GEMM_R * nthreads){ + n = n_to - js; + if (n > GEMM_R * nthreads) n = GEMM_R * nthreads; + + range_N[0] = js; + + num_cpu_n = 0; + + while (n > 0){ + + width = blas_quickdivide(n + nthreads - num_cpu_n - 1, nthreads - num_cpu_n); + + n -= width; + if (n < 0) width = width + n; + + range_N[num_cpu_n + 1] = range_N[num_cpu_n] + width; + + num_cpu_n ++; + } + + for (j = 0; j < num_cpu_m; j++) { + for (i = 0; i < num_cpu_m; i++) { + for (k = 0; k < DIVIDE_RATE; k++) { + job[j].working[i][CACHE_LINE_SIZE * k] = 0; + } + } + } + + queue[num_cpu_m - 1].next = NULL; + + exec_blas(num_cpu_m, queue); + } + + return 0; +} + +int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ + + BLASLONG m = args -> m; + BLASLONG n = args -> n; + BLASLONG nthreads = args -> nthreads; + BLASLONG divN, divT; + int mode; + + if (nthreads == 1) { + GEMM_LOCAL(args, range_m, range_n, sa, sb, 0); + return 0; + } + + if (range_m) { + BLASLONG m_from = *(((BLASLONG *)range_m) + 0); + BLASLONG m_to = *(((BLASLONG *)range_m) + 1); + + m = m_to - m_from; + } + + if (range_n) { + BLASLONG n_from = *(((BLASLONG *)range_n) + 0); + BLASLONG n_to = *(((BLASLONG *)range_n) + 1); + + n = n_to - n_from; + } + + if ((args -> m < nthreads * SWITCH_RATIO) || (args -> n < nthreads * SWITCH_RATIO)) { + GEMM_LOCAL(args, range_m, range_n, sa, sb, 0); + return 0; + } + + divT = nthreads; + divN = 1; + +#if 0 + while ((GEMM_P * divT > m * SWITCH_RATIO) && (divT > 1)) { + do { + divT --; + divN = 1; + while (divT * divN < nthreads) divN ++; + } while ((divT * divN != nthreads) && (divT > 1)); + } +#endif + + // fprintf(stderr, "divN = %4ld divT = %4ld\n", divN, divT); + + args -> nthreads = divT; + + if (divN == 1){ + + gemm_driver(args, range_m, range_n, sa, sb, 0); + } else { +#ifndef COMPLEX +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_REAL; +#else + mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif + +#if defined(TN) || defined(TT) || defined(TR) || defined(TC) || \ + defined(CN) || defined(CT) || defined(CR) || defined(CC) + mode |= (BLAS_TRANSA_T); +#endif +#if defined(NT) || defined(TT) || defined(RT) || defined(CT) || \ + defined(NC) || defined(TC) || defined(RC) || defined(CC) + mode |= (BLAS_TRANSB_T); +#endif + +#ifdef OS_WINDOWS + gemm_thread_n(mode, args, range_m, range_n, GEMM_LOCAL, sa, sb, divN); +#else + gemm_thread_n(mode, args, range_m, range_n, gemm_driver, sa, sb, divN); +#endif + + } + + return 0; +} diff --git a/driver/level3/symm3m_k.c b/driver/level3/symm3m_k.c new file mode 100644 index 0000000000..764c2ff338 --- /dev/null +++ b/driver/level3/symm3m_k.c @@ -0,0 +1,100 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#undef TIMING + +#define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \ + GEMM_BETA((M_TO) - (M_FROM), (N_TO - N_FROM), 0, \ + BETA[0], BETA[1], NULL, 0, NULL, 0, \ + (FLOAT *)(C) + (M_FROM) + (N_FROM) * (LDC) * COMPSIZE, LDC) + +#ifndef RSIDE +#ifndef LOWER +#define ICOPYB_OPERATION(M, N, A, LDA, X, Y, BUFFER) SYMM3M_IUCOPYB(M, N, A, LDA, Y, X, BUFFER) +#define ICOPYR_OPERATION(M, N, A, LDA, X, Y, BUFFER) SYMM3M_IUCOPYR(M, N, A, LDA, Y, X, BUFFER) +#define ICOPYI_OPERATION(M, N, A, LDA, X, Y, BUFFER) SYMM3M_IUCOPYI(M, N, A, LDA, Y, X, BUFFER) +#else +#define ICOPYB_OPERATION(M, N, A, LDA, X, Y, BUFFER) SYMM3M_ILCOPYB(M, N, A, LDA, Y, X, BUFFER) +#define ICOPYR_OPERATION(M, N, A, LDA, X, Y, BUFFER) SYMM3M_ILCOPYR(M, N, A, LDA, Y, X, BUFFER) +#define ICOPYI_OPERATION(M, N, A, LDA, X, Y, BUFFER) SYMM3M_ILCOPYI(M, N, A, LDA, Y, X, BUFFER) +#endif +#endif + +#ifdef RSIDE +#ifndef LOWER +#define OCOPYB_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ + SYMM3M_OUCOPYB(M, N, A, LDA, Y, X, ALPHA_R, ALPHA_I, BUFFER) +#define OCOPYR_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ + SYMM3M_OUCOPYR(M, N, A, LDA, Y, X, ALPHA_R, ALPHA_I, BUFFER) +#define OCOPYI_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ + SYMM3M_OUCOPYI(M, N, A, LDA, Y, X, ALPHA_R, ALPHA_I, BUFFER) +#else +#define OCOPYB_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ + SYMM3M_OLCOPYB(M, N, A, LDA, Y, X, ALPHA_R, ALPHA_I, BUFFER) +#define OCOPYR_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ + SYMM3M_OLCOPYR(M, N, A, LDA, Y, X, ALPHA_R, ALPHA_I, BUFFER) +#define OCOPYI_OPERATION(M, N, A, LDA, ALPHA_R, ALPHA_I, X, Y, BUFFER) \ + SYMM3M_OLCOPYI(M, N, A, LDA, Y, X, ALPHA_R, ALPHA_I, BUFFER) 
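+ /* the B/R/I packing variants feed the 3M scheme, which assembles the */
+ /* complex product from three real GEMM passes */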
+#endif +#endif + +#ifndef RSIDE +#define K args -> m +#ifndef LOWER +#define GEMM3M_LOCAL SYMM3M_LU +#else +#define GEMM3M_LOCAL SYMM3M_LL +#endif +#else +#define K args -> n +#ifndef LOWER +#define GEMM3M_LOCAL SYMM3M_RU +#else +#define GEMM3M_LOCAL SYMM3M_RL +#endif +#endif + +#ifdef THREADED_LEVEL3 +#include "level3_gemm3m_thread.c" +#else +#include "gemm3m_level3.c" +#endif + diff --git a/driver/level3/symm_k.c b/driver/level3/symm_k.c new file mode 100644 index 0000000000..567896a436 --- /dev/null +++ b/driver/level3/symm_k.c @@ -0,0 +1,80 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#undef TIMING + +#ifndef RSIDE +#ifndef LOWER +#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) SYMM_IUTCOPY(M, N, A, LDA, Y, X, BUFFER); +#else +#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) SYMM_ILTCOPY(M, N, A, LDA, Y, X, BUFFER); +#endif +#endif + +#ifdef RSIDE +#ifndef LOWER +#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) SYMM_OUTCOPY(M, N, A, LDA, Y, X, BUFFER); +#else +#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) SYMM_OLTCOPY(M, N, A, LDA, Y, X, BUFFER); +#endif +#endif + +#ifndef RSIDE +#define K args -> m +#ifndef LOWER +#define GEMM_LOCAL SYMM_LU +#else +#define GEMM_LOCAL SYMM_LL +#endif +#else +#define K args -> n +#ifndef LOWER +#define GEMM_LOCAL SYMM_RU +#else +#define GEMM_LOCAL SYMM_RL +#endif +#endif + +#ifdef THREADED_LEVEL3 +#include "level3_thread.c" +#else +#include "level3.c" +#endif diff --git a/driver/level3/syr2k_k.c b/driver/level3/syr2k_k.c new file mode 100644 index 0000000000..01251d4836 --- /dev/null +++ b/driver/level3/syr2k_k.c @@ -0,0 +1,103 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef KERNEL_FUNC +#ifndef LOWER +#define KERNEL_FUNC SYR2K_KERNEL_U +#else +#define KERNEL_FUNC SYR2K_KERNEL_L +#endif +#endif + +static inline int syrk_beta(BLASLONG m_from, BLASLONG m_to, BLASLONG n_from, BLASLONG n_to, FLOAT *alpha, FLOAT *c, BLASLONG ldc) { + + BLASLONG i; + +#ifndef LOWER + if (m_from > n_from) n_from = m_from; + if (m_to > n_to ) m_to = n_to; +#else + if (m_from < n_from) m_from = n_from; + if (m_to < n_to ) n_to = m_to; +#endif + + c += (m_from + n_from * ldc) * COMPSIZE; + + m_to -= m_from; + n_to -= n_from; + + for (i = 0; i < n_to; i++){ + +#ifndef LOWER + + SCAL_K(MIN(i + n_from - m_from + 1, m_to), 0, 0, alpha[0], +#ifdef COMPLEX + alpha[1], +#endif + c, 1, NULL, 0, NULL, 0); + + c += ldc * COMPSIZE; + +#else + + SCAL_K(MIN(m_to - i + m_from - n_from, m_to), 0, 0, alpha[0], +#ifdef COMPLEX + alpha[1], +#endif + c, 1, NULL, 0, NULL, 0); + + if (i < m_from - n_from) { + c += ldc * COMPSIZE; + } else { + c += (1 + ldc) * COMPSIZE; + } +#endif + + } + + return 0; +} + +#ifdef THREADED_LEVEL3 +#include "level3_syr2k_threaded.c" +#else +#include "level3_syr2k.c" +#endif diff --git a/driver/level3/syr2k_kernel.c b/driver/level3/syr2k_kernel.c new file mode 100644 index 0000000000..8c476f50c3 --- /dev/null +++ b/driver/level3/syr2k_kernel.c @@ -0,0 +1,217 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, +#ifdef COMPLEX + FLOAT alpha_i, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset, int flag){ + + BLASLONG i, j; + BLASLONG loop; + FLOAT subbuffer[GEMM_UNROLL_MN * GEMM_UNROLL_MN * COMPSIZE]; + + if (m + offset < 0) { +#ifndef LOWER + GEMM_KERNEL_N(m, n, k, + alpha_r, +#ifdef COMPLEX + alpha_i, +#endif + a, b, c, ldc); +#endif + return 0; + } + + if (n < offset) { +#ifdef LOWER + GEMM_KERNEL_N(m, n, k, + alpha_r, +#ifdef COMPLEX + alpha_i, +#endif + a, b, c, ldc); +#endif + return 0; + } + + + if (offset > 0) { +#ifdef LOWER + GEMM_KERNEL_N(m, offset, k, + alpha_r, +#ifdef COMPLEX + alpha_i, +#endif + a, b, c, ldc); +#endif + b += offset * k * COMPSIZE; + c += offset * ldc * COMPSIZE; + n -= offset; + offset = 0; + + if (n <= 0) return 0; + } + + if (n > m + offset) { +#ifndef LOWER + GEMM_KERNEL_N(m, n - m - offset, k, + alpha_r, +#ifdef COMPLEX + alpha_i, +#endif + a, + b + (m + offset) * k * COMPSIZE, + c + (m + offset) * ldc * COMPSIZE, ldc); +#endif + + n = m + offset; + if (n <= 0) return 0; + } + + + if (offset < 0) { +#ifndef LOWER + GEMM_KERNEL_N(-offset, n, k, + alpha_r, +#ifdef COMPLEX + alpha_i, +#endif + a, b, c, ldc); +#endif + a -= offset * k * COMPSIZE; + c -= offset * COMPSIZE; + m += offset; + offset = 0; + + if (m <= 0) return 0; + } + + if (m > n - offset) { +#ifdef LOWER + GEMM_KERNEL_N(m - n + offset, n, k, + alpha_r, +#ifdef COMPLEX + alpha_i, +#endif + a + (n - offset) * k * COMPSIZE, + b, + c + (n - offset) * COMPSIZE, ldc); +#endif + m = n + offset; + if (m <= 0) return 0; + } + + for (loop = 0; loop < n; loop += GEMM_UNROLL_MN) { + + int mm, nn; + + mm = (loop & ~(GEMM_UNROLL_MN - 1)); + nn = MIN(GEMM_UNROLL_MN, n - loop); + +#ifndef LOWER + GEMM_KERNEL_N(mm, nn, k, + alpha_r, +#ifdef COMPLEX + alpha_i, +#endif + a, b + loop * k * COMPSIZE, c + loop * ldc * COMPSIZE, ldc); +#endif + + if (flag) { + GEMM_BETA(nn, nn, 0, ZERO, +#ifdef COMPLEX + ZERO, +#endif + NULL, 0, NULL, 0, subbuffer, nn); + + GEMM_KERNEL_N(nn, nn, k, + alpha_r, +#ifdef COMPLEX + alpha_i, +#endif + a + loop * k * COMPSIZE, b + loop * k * COMPSIZE, subbuffer, nn); + +#ifndef LOWER + + for (j = 0; j < nn; j ++) { + for (i = 0; i <= j; i ++) { +#ifndef COMPLEX + c[i + loop + (j + loop) * ldc] += + subbuffer[i + j * nn] + subbuffer[j + i * nn]; +#else + c[(i + loop + (j + loop) * ldc) * 2 + 0] += + subbuffer[(i + j * nn) * 2 + 0] + subbuffer[(j + i * nn) * 2 + 0]; + c[(i + loop + (j + loop) * ldc) * 2 + 1] += + subbuffer[(i + j * nn) * 2 + 1] + subbuffer[(j + i * nn) * 2 + 1]; +#endif + } + } +#else + for (j = 0; j < nn; j ++) { + for (i = j; i < nn; i ++) { +#ifndef COMPLEX + c[i + loop + (j + loop) * ldc] += + subbuffer[i + j * nn] + subbuffer[j + i * nn]; +#else + c[(i + loop + (j + loop) * ldc) * 2 + 0] += + subbuffer[(i + j * nn) * 2 + 0] + subbuffer[(j + i * nn) * 2 + 0]; + c[(i + loop + (j + loop) * ldc) * 2 + 1] += + subbuffer[(i + j * nn) * 2 + 1] + subbuffer[(j + i * nn) * 2 + 1]; +#endif + } + } +#endif + } + +#ifdef LOWER + GEMM_KERNEL_N(m - mm - nn, nn, k, + alpha_r, +#ifdef COMPLEX + alpha_i, +#endif + a + (mm + nn) * k * COMPSIZE, b + loop * k * COMPSIZE, + c + (mm + nn + loop * ldc) * COMPSIZE, ldc); +#endif + } + + return 0; +} diff --git a/driver/level3/syrk_k.c b/driver/level3/syrk_k.c new file mode 100644 index 0000000000..9c9700ef3f --- /dev/null +++ b/driver/level3/syrk_k.c @@ -0,0 
+1,105 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#undef TIMING + +#ifndef KERNEL_FUNC +#ifndef LOWER +#define KERNEL_FUNC SYRK_KERNEL_U +#else +#define KERNEL_FUNC SYRK_KERNEL_L +#endif +#endif + +static inline int syrk_beta(BLASLONG m_from, BLASLONG m_to, BLASLONG n_from, BLASLONG n_to, FLOAT *alpha, FLOAT *c, BLASLONG ldc) { + + BLASLONG i; + +#ifndef LOWER + if (m_from > n_from) n_from = m_from; + if (m_to > n_to ) m_to = n_to; +#else + if (m_from < n_from) m_from = n_from; + if (m_to < n_to ) n_to = m_to; +#endif + + c += (m_from + n_from * ldc) * COMPSIZE; + + m_to -= m_from; + n_to -= n_from; + + for (i = 0; i < n_to; i++){ + +#ifndef LOWER + + SCAL_K(MIN(i + n_from - m_from + 1, m_to), 0, 0, alpha[0], +#ifdef COMPLEX + alpha[1], +#endif + c, 1, NULL, 0, NULL, 0); + + c += ldc * COMPSIZE; + +#else + + SCAL_K(MIN(m_to - i + m_from - n_from, m_to), 0, 0, alpha[0], +#ifdef COMPLEX + alpha[1], +#endif + c, 1, NULL, 0, NULL, 0); + + if (i < m_from - n_from) { + c += ldc * COMPSIZE; + } else { + c += (1 + ldc) * COMPSIZE; + } +#endif + + } + + return 0; +} + +#ifdef THREADED_LEVEL3 +#include "level3_syrk_threaded.c" +#else +#include "level3_syrk.c" +#endif diff --git a/driver/level3/syrk_kernel.c b/driver/level3/syrk_kernel.c new file mode 100644 index 0000000000..65d108a49e --- /dev/null +++ b/driver/level3/syrk_kernel.c @@ -0,0 +1,230 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef CONJA +#ifndef CONJB +#define GEMM_KERNEL GEMM_KERNEL_N +#else +#define GEMM_KERNEL GEMM_KERNEL_R +#endif +#else +#ifndef CONJB +#define GEMM_KERNEL GEMM_KERNEL_L +#else +#define GEMM_KERNEL GEMM_KERNEL_B +#endif +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, +#ifdef COMPLEX + FLOAT alpha_i, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + BLASLONG loop; + FLOAT *cc, *ss; + FLOAT subbuffer[GEMM_UNROLL_MN * (GEMM_UNROLL_MN + 1) * COMPSIZE]; + + if (m + offset < 0) { +#ifndef LOWER + GEMM_KERNEL(m, n, k, + alpha_r, +#ifdef COMPLEX + alpha_i, +#endif + a, b, c, ldc); +#endif + return 0; + } + + if (n < offset) { +#ifdef LOWER + GEMM_KERNEL(m, n, k, + alpha_r, +#ifdef COMPLEX + alpha_i, +#endif + a, b, c, ldc); +#endif + return 0; + } + + if (offset > 0) { +#ifdef LOWER + GEMM_KERNEL(m, offset, k, + alpha_r, +#ifdef COMPLEX + alpha_i, +#endif + a, b, c, ldc); +#endif + b += offset * k * COMPSIZE; + c += offset * ldc * COMPSIZE; + n -= offset; + offset = 0; + + if (n <= 0) return 0; + } + + if (n > m + offset) { +#ifndef LOWER + GEMM_KERNEL(m, n - m - offset, k, + alpha_r, +#ifdef COMPLEX + alpha_i, +#endif + a, + b + (m + offset) * k * COMPSIZE, + c + (m + offset) * ldc * COMPSIZE, ldc); +#endif + + n = m + offset; + if (n <= 0) return 0; + } + + if (offset < 0) { +#ifndef LOWER + GEMM_KERNEL(-offset, n, k, + alpha_r, +#ifdef COMPLEX + alpha_i, +#endif + a, b, c, ldc); +#endif + a -= offset * k * COMPSIZE; + c -= offset * COMPSIZE; + m += offset; + offset = 0; + + if (m <= 0) return 0; + } + + if (m > n - offset) { +#ifdef LOWER + GEMM_KERNEL(m - n + offset, n, k, + alpha_r, +#ifdef COMPLEX + alpha_i, +#endif + a + (n - offset) * k * 
COMPSIZE, + b, + c + (n - offset) * COMPSIZE, ldc); +#endif + m = n + offset; + + if (m <= 0) return 0; + } + + for (loop = 0; loop < n; loop += GEMM_UNROLL_MN) { + + int mm, nn; + + mm = (loop & ~(GEMM_UNROLL_MN - 1)); + nn = MIN(GEMM_UNROLL_MN, n - loop); + +#ifndef LOWER + GEMM_KERNEL(mm, nn, k, + alpha_r, +#ifdef COMPLEX + alpha_i, +#endif + a, b + loop * k * COMPSIZE, c + loop * ldc * COMPSIZE, ldc); +#endif + + GEMM_BETA(nn, nn, 0, ZERO, +#ifdef COMPLEX + ZERO, +#endif + NULL, 0, NULL, 0, subbuffer, nn); + + GEMM_KERNEL(nn, nn, k, + alpha_r, +#ifdef COMPLEX + alpha_i, +#endif + a + loop * k * COMPSIZE, b + loop * k * COMPSIZE, subbuffer, nn); + + cc = c + (loop + loop * ldc) * COMPSIZE; + ss = subbuffer; + +#ifndef LOWER + for (j = 0; j < nn; j ++) { + for (i = 0; i <= j; i ++) { +#ifndef COMPLEX + cc[i] += ss[i]; +#else + cc[i * 2 + 0] += ss[i * 2 + 0]; + cc[i * 2 + 1] += ss[i * 2 + 1]; +#endif + } + ss += nn * COMPSIZE; + cc += ldc * COMPSIZE; + } +#else + for (j = 0; j < nn; j ++) { + for (i = j; i < nn; i ++) { +#ifndef COMPLEX + cc[i] += ss[i]; +#else + cc[i * 2 + 0] += ss[i * 2 + 0]; + cc[i * 2 + 1] += ss[i * 2 + 1]; +#endif + } + ss += nn * COMPSIZE; + cc += ldc * COMPSIZE; + } +#endif + +#ifdef LOWER + GEMM_KERNEL(m - mm - nn, nn, k, + alpha_r, +#ifdef COMPLEX + alpha_i, +#endif + a + (mm + nn) * k * COMPSIZE, b + loop * k * COMPSIZE, + c + (mm + nn + loop * ldc) * COMPSIZE, ldc); +#endif + + } + + return 0; +} diff --git a/driver/level3/syrk_thread.c b/driver/level3/syrk_thread.c new file mode 100644 index 0000000000..837670b9f5 --- /dev/null +++ b/driver/level3/syrk_thread.c @@ -0,0 +1,186 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
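syrk_kernel.c above splits the update into strictly off-diagonal panels, which go straight to GEMM_KERNEL for the stored side only, and GEMM_UNROLL_MN-sized diagonal blocks, which are first computed in full into the on-stack subbuffer (after clearing it with GEMM_BETA and a zero scalar) and then folded back into C one stored triangle at a time. A stand-alone sketch of that final accumulation for the real, upper case (buffer and function names are illustrative):

#include <stddef.h>

/* Accumulate only the upper triangle of an nn x nn work buffer (column major,
   leading dimension nn) into the diagonal block of C that starts at
   row/column "loop", matching the cc/ss loop in the kernel above. */
static void add_upper_block(size_t nn, size_t loop,
                            const double *subbuffer, double *c, size_t ldc) {
  for (size_t j = 0; j < nn; j++)
    for (size_t i = 0; i <= j; i++)
      c[(i + loop) + (j + loop) * ldc] += subbuffer[i + j * nn];
}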
*/ +/*********************************************************************/ + +#include +#include +#include +#include "common.h" + +int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (*function)(), void *sa, void *sb, BLASLONG nthreads) { + + blas_queue_t queue[MAX_CPU_NUMBER]; + BLASLONG range[MAX_CPU_NUMBER + 1]; + + BLASLONG width, i; + BLASLONG n_from, n_to; + double dnum, nf, nt, di; + + int num_cpu; + int mask = 0; + + if (!(mode & BLAS_COMPLEX)) { + + switch (mode & BLAS_PREC) { + case BLAS_SINGLE: + mask = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1; + break; + case BLAS_DOUBLE: + mask = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1; + break; +#ifdef EXPRECISION + case BLAS_XDOUBLE: + mask = MAX(QGEMM_UNROLL_M, QGEMM_UNROLL_N) - 1; + break; +#endif + } + } else { + switch (mode & BLAS_PREC) { + case BLAS_SINGLE: + mask = MAX(CGEMM_UNROLL_M, CGEMM_UNROLL_N) - 1; + break; + case BLAS_DOUBLE: + mask = MAX(ZGEMM_UNROLL_M, ZGEMM_UNROLL_N) - 1; + break; +#ifdef EXPRECISION + case BLAS_XDOUBLE: + mask = MAX(XGEMM_UNROLL_M, XGEMM_UNROLL_N) - 1; + break; +#endif + } + } + + n_from = 0; + n_to = arg -> n; + + if (range_n) { + n_from = *(range_n + 0); + n_to = *(range_n + 1); + } + + if (!(mode & BLAS_UPLO)) { + + nf = (double)(n_from); + nt = (double)(n_to); + + dnum = (nt * nt - nf * nf) / (double)nthreads; + + num_cpu = 0; + + range[0] = n_from; + i = n_from; + + while (i < n_to){ + + if (nthreads - num_cpu > 1) { + + di = (double)i; + width = ((BLASLONG)( sqrt(di * di + dnum) - di) + mask) & ~mask; + + if ((width <= 0) || (width > n_to - i)) width = n_to - i; + + } else { + width = n_to - i; + } + + range[num_cpu + 1] = range[num_cpu] + width; + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = function; + queue[num_cpu].args = arg; + queue[num_cpu].range_m = range_m; + queue[num_cpu].range_n = &range[num_cpu]; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + + } else { + + nf = (double)(arg -> n - n_from); + nt = (double)(arg -> n - n_to); + + dnum = (nt * nt - nf * nf) / (double)nthreads; + + num_cpu = 0; + + range[0] = n_from; + i = n_from; + + while (i < n_to){ + + if (nthreads - num_cpu > 1) { + + di = (double)(arg -> n - i); + width = ((BLASLONG)(-sqrt(di * di + dnum) + di) + mask) & ~mask; + + if ((width <= 0) || (width > n_to - i)) width = n_to - i; + + } else { + width = n_to - i; + } + + range[num_cpu + 1] = range[num_cpu] + width; + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = function; + queue[num_cpu].args = arg; + queue[num_cpu].range_m = range_m; + queue[num_cpu].range_n = &range[num_cpu]; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + + } + + if (num_cpu) { + queue[0].sa = sa; + queue[0].sb = sb; + queue[num_cpu - 1].next = NULL; + + exec_blas(num_cpu, queue); + } + + return 0; +} diff --git a/driver/level3/trmm_L.c b/driver/level3/trmm_L.c new file mode 100644 index 0000000000..9e46df05c1 --- /dev/null +++ b/driver/level3/trmm_L.c @@ -0,0 +1,444 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +const static FLOAT dp1 = 1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_L +#define TRMM_KERNEL_N TRMM_KERNEL_LR +#define TRMM_KERNEL_T TRMM_KERNEL_LC +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#define TRMM_KERNEL_N TRMM_KERNEL_LN +#define TRMM_KERNEL_T TRMM_KERNEL_LT +#endif + +#undef TIMING + +#ifdef TIMING +#define START_RPCC() rpcc_counter = rpcc() +#define STOP_RPCC(COUNTER) COUNTER += rpcc() - rpcc_counter +#else +#define START_RPCC() +#define STOP_RPCC(COUNTER) +#endif + +int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG dummy) { + + BLASLONG m, n, lda, ldb; + FLOAT *beta, *a, *b; + + BLASLONG ls, is, js; + BLASLONG min_l, min_i, min_j; + BLASLONG jjs, min_jj; + +#ifdef TIMING + unsigned long long rpcc_counter; + unsigned long long innercost = 0; + unsigned long long outercost = 0; + unsigned long long gemmcost = 0; + unsigned long long trmmcost = 0; + double total; +#endif + + m = args -> m; + n = args -> n; + + a = (FLOAT *)args -> a; + b = (FLOAT *)args -> b; + + lda = args -> lda; + ldb = args -> ldb; + + beta = (FLOAT *)args -> beta; + + if (range_n) { + BLASLONG n_from = *(((BLASLONG *)range_n) + 0); + BLASLONG n_to = *(((BLASLONG *)range_n) + 1); + + n = n_to - n_from; + + b += n_from * ldb * COMPSIZE; + } + + if (beta) { +#ifndef COMPLEX + if (beta[0] != ONE) + GEMM_BETA(m, n, 0, beta[0], NULL, 0, NULL, 0, b, ldb); + if (beta[0] == ZERO) return 0; +#else + if ((beta[0] != ONE) || (beta[1] != ZERO)) + GEMM_BETA(m, n, 0, beta[0], beta[1], NULL, 0, NULL, 0, b, ldb); + if ((beta[0] == ZERO) && (beta[1] == ZERO)) return 0; +#endif + } + + for(js = 0; js < n; js += GEMM_R){ + min_j = n - js; + if (min_j > GEMM_R) min_j = GEMM_R; + +#if (defined(UPPER) && !defined(TRANSA)) || (!defined(UPPER) && defined(TRANSA)) + + min_l = m; + if (min_l > GEMM_Q) min_l = GEMM_Q; + min_i = min_l; + if (min_i > GEMM_P) min_i = GEMM_P; + + 
START_RPCC(); + +#ifndef TRANSA + TRMM_IUTCOPY(min_l, min_i, a, lda, 0, 0, sa); +#else + TRMM_ILNCOPY(min_l, min_i, a, lda, 0, 0, sa); +#endif + + STOP_RPCC(innercost); + + for(jjs = js; jjs < js + min_j; jjs += min_jj){ + min_jj = min_j + js - jjs; + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + + START_RPCC(); + + GEMM_ONCOPY(min_l, min_jj, b + (jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE); + + STOP_RPCC(outercost); + + START_RPCC(); + + TRMM_KERNEL_N(min_i, min_jj, min_l, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb + min_l * (jjs - js) * COMPSIZE, b + (jjs * ldb) * COMPSIZE, ldb, 0); + + STOP_RPCC(trmmcost); + } + + + for(is = min_i; is < min_l; is += GEMM_P){ + min_i = min_l - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + START_RPCC(); + +#ifndef TRANSA + TRMM_IUTCOPY(min_l, min_i, a, lda, 0, is, sa); +#else + TRMM_ILNCOPY(min_l, min_i, a, lda, 0, is, sa); +#endif + + STOP_RPCC(innercost); + + START_RPCC(); + + TRMM_KERNEL_N(min_i, min_j, min_l, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb, b + (is + js * ldb) * COMPSIZE, ldb, is); + + STOP_RPCC(trmmcost); + + } + + for(ls = min_l; ls < m; ls += GEMM_Q){ + min_l = m - ls; + if (min_l > GEMM_Q) min_l = GEMM_Q; + min_i = ls; + if (min_i > GEMM_P) min_i = GEMM_P; + + START_RPCC(); + +#ifndef TRANSA + GEMM_ITCOPY(min_l, min_i, a + (ls * lda) * COMPSIZE, lda, sa); +#else + GEMM_INCOPY(min_l, min_i, a + (ls ) * COMPSIZE, lda, sa); +#endif + + STOP_RPCC(innercost); + + for(jjs = js; jjs < js + min_j; jjs += min_jj){ + min_jj = min_j + js - jjs; + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + + START_RPCC(); + + GEMM_ONCOPY(min_l, min_jj, b + (ls + jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE); + + STOP_RPCC(gemmcost); + + START_RPCC(); + + GEMM_KERNEL(min_i, min_jj, min_l, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb + min_l * (jjs - js) * COMPSIZE, + b + (jjs * ldb) * COMPSIZE, ldb); + + STOP_RPCC(gemmcost); + } + + for(is = min_i; is < ls; is += GEMM_P){ + min_i = ls - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + START_RPCC(); + +#ifndef TRANSA + GEMM_ITCOPY(min_l, min_i, a + (is + ls * lda) * COMPSIZE, lda, sa); +#else + GEMM_INCOPY(min_l, min_i, a + (ls + is * lda) * COMPSIZE, lda, sa); +#endif + + STOP_RPCC(innercost); + + START_RPCC(); + + GEMM_KERNEL(min_i, min_j, min_l, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb, b + (is + js * ldb) * COMPSIZE, ldb); + + STOP_RPCC(gemmcost); + } + + for(is = ls; is < ls + min_l; is += GEMM_P){ + min_i = ls + min_l - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + START_RPCC(); + +#ifndef TRANSA + TRMM_IUTCOPY(min_l, min_i, a, lda, ls, is, sa); +#else + TRMM_ILNCOPY(min_l, min_i, a, lda, ls, is, sa); +#endif + + STOP_RPCC(innercost); + + START_RPCC(); + + TRMM_KERNEL_N(min_i, min_j, min_l, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb, b + (is + js * ldb) * COMPSIZE, ldb, is - ls); + + STOP_RPCC(trmmcost); + } + } + +#else + min_l = m; + if (min_l > GEMM_Q) min_l = GEMM_Q; + min_i = min_l; + if (min_i > GEMM_P) min_i = GEMM_P; + + START_RPCC(); + +#ifndef TRANSA + TRMM_ILTCOPY(min_l, min_i, a, lda, m - min_l, m - min_l, sa); +#else + TRMM_IUNCOPY(min_l, min_i, a, lda, m - min_l, m - min_l, sa); +#endif + + STOP_RPCC(innercost); + + for(jjs = js; jjs < js + min_j; jjs += min_jj){ + min_jj = min_j + js - jjs; + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + + START_RPCC(); + + GEMM_ONCOPY(min_l, min_jj, b + (m - min_l + jjs * ldb) * COMPSIZE, ldb, + sb + min_l * (jjs - js) * COMPSIZE); + + STOP_RPCC(outercost); + + START_RPCC(); + + 
TRMM_KERNEL_T(min_i, min_jj, min_l, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb + min_l * (jjs - js) * COMPSIZE, + b + (m - min_l + jjs * ldb) * COMPSIZE, ldb, 0); + + STOP_RPCC(trmmcost); + } + + for(is = m - min_l + min_i; is < m; is += GEMM_P){ + min_i = m - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + START_RPCC(); + +#ifndef TRANSA + TRMM_ILTCOPY(min_l, min_i, a, lda, m - min_l, is, sa); +#else + TRMM_IUNCOPY(min_l, min_i, a, lda, m - min_l, is, sa); +#endif + + STOP_RPCC(innercost); + + START_RPCC(); + + TRMM_KERNEL_T(min_i, min_j, min_l, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb, b + (is + js * ldb) * COMPSIZE, ldb, is - m + min_l); + + STOP_RPCC(trmmcost); + } + + for(ls = m - min_l; ls > 0; ls -= GEMM_Q){ + min_l = ls; + if (min_l > GEMM_Q) min_l = GEMM_Q; + min_i = min_l; + if (min_i > GEMM_P) min_i = GEMM_P; + + START_RPCC(); + +#ifndef TRANSA + TRMM_ILTCOPY(min_l, min_i, a, lda, ls - min_l, ls - min_l, sa); +#else + TRMM_IUNCOPY(min_l, min_i, a, lda, ls - min_l, ls - min_l, sa); +#endif + + STOP_RPCC(innercost); + + for(jjs = js; jjs < js + min_j; jjs += min_jj){ + min_jj = min_j + js - jjs; + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + + START_RPCC(); + + GEMM_ONCOPY(min_l, min_jj, b + (ls - min_l + jjs * ldb) * COMPSIZE, ldb, + sb + min_l * (jjs - js) * COMPSIZE); + + STOP_RPCC(outercost); + + START_RPCC(); + + TRMM_KERNEL_T(min_i, min_jj, min_l, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb + min_l * (jjs - js) * COMPSIZE, + b + (ls - min_l + jjs * ldb) * COMPSIZE, ldb, 0); + + STOP_RPCC(trmmcost); + } + + for(is = ls - min_l + min_i; is < ls; is += GEMM_P){ + min_i = ls - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + START_RPCC(); + +#ifndef TRANSA + TRMM_ILTCOPY(min_l, min_i, a, lda, ls - min_l, is, sa); +#else + TRMM_IUNCOPY(min_l, min_i, a, lda, ls - min_l, is, sa); +#endif + + STOP_RPCC(innercost); + + START_RPCC(); + + TRMM_KERNEL_T(min_i, min_j, min_l, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb, b + (is + js * ldb) * COMPSIZE, ldb, is - ls + min_l); + + STOP_RPCC(trmmcost); + } + + + for(is = ls; is < m; is += GEMM_P){ + min_i = m - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + START_RPCC(); + +#ifndef TRANSA + GEMM_ITCOPY(min_l, min_i, a + (is + (ls - min_l) * lda) * COMPSIZE, lda, sa); +#else + GEMM_INCOPY(min_l, min_i, a + ((ls - min_l) + is * lda) * COMPSIZE, lda, sa); +#endif + + STOP_RPCC(innercost); + + START_RPCC(); + + GEMM_KERNEL(min_i, min_j, min_l, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb, b + (is + js * ldb) * COMPSIZE, ldb); + + STOP_RPCC(gemmcost); + } + } + +#endif + + } + +#ifdef TIMING + total = (double)outercost + (double)innercost + (double)gemmcost + (double)trmmcost; + + printf( "Copy A : %5.2f Copy B: %5.2f GEMM Kernel : %5.2f TRMM Kerlnel : %5.2f kernel Effi. : %5.2f Total Effi. : %5.2f\n", + innercost / total * 100., outercost / total * 100., + gemmcost / total * 100., trmmcost / total * 100., + (double)n * (double)n * (double)n / (double)(trmmcost + gemmcost) * 100. * (double)COMPSIZE / 2., + (double)n * (double)n * (double)n / total * 100. * (double)COMPSIZE / 2.); + +#endif + + return 0; +} diff --git a/driver/level3/trmm_R.c b/driver/level3/trmm_R.c new file mode 100644 index 0000000000..e46553c3f7 --- /dev/null +++ b/driver/level3/trmm_R.c @@ -0,0 +1,350 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
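trmm_L.c above uses the same three-level cache blocking as the GEMM drivers: GEMM_R-wide column panels of B, GEMM_Q-deep slabs of the triangular matrix, and GEMM_P-tall row strips, with the A strip packed into sa and each B panel packed GEMM_UNROLL_N columns at a time into sb; diagonal slabs go through the TRMM kernels and everything else through GEMM_KERNEL. The loop skeleton, reduced to a plain C += A*B with naive inner loops standing in for the packing and the architecture-specific kernels (block sizes and names are illustrative):

#include <stddef.h>

#define BLOCK_P 64    /* rows of A per strip    (GEMM_P role) */
#define BLOCK_Q 128   /* depth of the k slab    (GEMM_Q role) */
#define BLOCK_R 512   /* columns of B per panel (GEMM_R role) */

static size_t min_sz(size_t a, size_t b) { return a < b ? a : b; }

/* C(m x n) += A(m x k) * B(k x n), all column major with leading dimensions
   lda, ldb, ldc.  The real drivers pack each block and call a kernel
   instead of running the innermost triple loop. */
static void blocked_gemm(size_t m, size_t n, size_t k,
                         const double *a, size_t lda,
                         const double *b, size_t ldb,
                         double *c, size_t ldc) {
  for (size_t js = 0; js < n; js += BLOCK_R) {
    size_t jn = min_sz(BLOCK_R, n - js);
    for (size_t ls = 0; ls < k; ls += BLOCK_Q) {
      size_t ln = min_sz(BLOCK_Q, k - ls);
      for (size_t is = 0; is < m; is += BLOCK_P) {
        size_t in = min_sz(BLOCK_P, m - is);
        for (size_t j = 0; j < jn; j++)
          for (size_t l = 0; l < ln; l++)
            for (size_t i = 0; i < in; i++)
              c[(is + i) + (js + j) * ldc] +=
                  a[(is + i) + (ls + l) * lda] * b[(ls + l) + (js + j) * ldb];
      }
    }
  }
}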
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +const static FLOAT dp1 = 1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#define TRMM_KERNEL_N TRMM_KERNEL_RR +#define TRMM_KERNEL_T TRMM_KERNEL_RC +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#define TRMM_KERNEL_N TRMM_KERNEL_RN +#define TRMM_KERNEL_T TRMM_KERNEL_RT +#endif + +#if 0 +#undef GEMM_P +#undef GEMM_Q +#undef GEMM_R + +#define GEMM_P 8 +#define GEMM_Q 12 +#define GEMM_R 16 +#endif + +int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG dummy) { + + BLASLONG m, n, lda, ldb; + FLOAT *beta, *a, *b; + + BLASLONG ls, is, js; + BLASLONG min_l, min_i, min_j; + BLASLONG jjs, min_jj; + + m = args -> m; + n = args -> n; + + a = (FLOAT *)args -> a; + b = (FLOAT *)args -> b; + + lda = args -> lda; + ldb = args -> ldb; + + beta = (FLOAT *)args -> beta; + + if (range_m) { + BLASLONG m_from = *(((BLASLONG *)range_m) + 0); + BLASLONG m_to = *(((BLASLONG *)range_m) + 1); + + m = m_to - m_from; + + b += m_from * COMPSIZE; + } + + if (beta) { +#ifndef COMPLEX + if (beta[0] != ONE) + GEMM_BETA(m, n, 0, beta[0], NULL, 0, NULL, 0, b, ldb); + if (beta[0] == ZERO) return 0; +#else + if ((beta[0] != ONE) || (beta[1] != ZERO)) + GEMM_BETA(m, n, 0, beta[0], beta[1], NULL, 0, NULL, 0, b, ldb); + if ((beta[0] == ZERO) && (beta[1] == ZERO)) return 0; +#endif + } + +#if (!defined(UPPER) && !defined(TRANSA)) || (defined(UPPER) && defined(TRANSA)) + + for(js = 0; js < n; js += GEMM_R){ + min_j = n - js; + if (min_j > GEMM_R) min_j = GEMM_R; + + for(ls = js; ls < js + min_j; ls += GEMM_Q){ + min_l = js + min_j - ls; + if (min_l > GEMM_Q) min_l = GEMM_Q; + min_i = m; + if (min_i > GEMM_P) min_i = GEMM_P; + + GEMM_ITCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, sa); + + 
for(jjs = 0; jjs < ls - js; jjs += min_jj){ + min_jj = ls - js - jjs; + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + +#ifndef TRANSA + GEMM_ONCOPY(min_l, min_jj, a + (ls + (js + jjs) * lda) * COMPSIZE, lda, sb + min_l * jjs * COMPSIZE); +#else + GEMM_OTCOPY(min_l, min_jj, a + ((js + jjs) + ls * lda) * COMPSIZE, lda, sb + min_l * jjs * COMPSIZE); +#endif + + GEMM_KERNEL(min_i, min_jj, min_l, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb + min_l * jjs * COMPSIZE, + b + ((js + jjs) * ldb) * COMPSIZE, ldb); + } + + for(jjs = 0; jjs < min_l; jjs += min_jj){ + min_jj = min_l - jjs; + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + +#ifndef TRANSA + TRMM_OLNCOPY(min_l, min_jj, a, lda, ls, ls + jjs, sb + min_l * (ls - js + jjs) * COMPSIZE); +#else + TRMM_OUTCOPY(min_l, min_jj, a, lda, ls, ls + jjs, sb + min_l * (ls - js + jjs) * COMPSIZE); +#endif + + TRMM_KERNEL_T(min_i, min_jj, min_l, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa, + sb + (ls - js + jjs) * min_l * COMPSIZE, + b + ((ls + jjs) * ldb) * COMPSIZE, ldb, -jjs); + } + + for(is = min_i; is < m; is += GEMM_P){ + min_i = m - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + GEMM_ITCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa); + + GEMM_KERNEL(min_i, ls - js, min_l, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb, + b + (is + js * ldb) * COMPSIZE, ldb); + + TRMM_KERNEL_T(min_i, min_l, min_l, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa, + sb + (ls - js) * min_l * COMPSIZE, + b + (is + ls * ldb) * COMPSIZE, ldb, 0); + } + } + + + for(ls = js + min_j; ls < n; ls += GEMM_Q){ + min_l = n - ls; + if (min_l > GEMM_Q) min_l = GEMM_Q; + min_i = m; + if (min_i > GEMM_P) min_i = GEMM_P; + + GEMM_ITCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, sa); + + for(jjs = js; jjs < js + min_j; jjs += min_jj){ + min_jj = min_j + js - jjs; + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + +#ifndef TRANSA + GEMM_ONCOPY(min_l, min_jj, a + (ls + jjs * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE); +#else + GEMM_OTCOPY(min_l, min_jj, a + (jjs + ls * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE); +#endif + + GEMM_KERNEL(min_i, min_jj, min_l, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb + min_l * (jjs - js) * COMPSIZE, + b + (jjs * ldb) * COMPSIZE, ldb); + } + + for(is = min_i; is < m; is += GEMM_P){ + min_i = m - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + GEMM_ITCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa); + + GEMM_KERNEL(min_i, min_j, min_l, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb, b + (is + js * ldb) * COMPSIZE, ldb); + } + } + } + +#else + BLASLONG start_ls; + + for(js = n; js > 0; js -= GEMM_R){ + min_j = js; + if (min_j > GEMM_R) min_j = GEMM_R; + + start_ls = js - min_j; + while (start_ls + GEMM_Q < js) start_ls += GEMM_Q; + + for(ls = start_ls; ls >= js - min_j; ls -= GEMM_Q){ + min_l = js - ls; + if (min_l > GEMM_Q) min_l = GEMM_Q; + min_i = m; + if (min_i > GEMM_P) min_i = GEMM_P; + + GEMM_ITCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, sa); + + for(jjs = 0; jjs < min_l; jjs += min_jj){ + min_jj = min_l - jjs; + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + +#ifndef TRANSA + TRMM_OUNCOPY(min_l, min_jj, a, lda, ls, ls + jjs, sb + min_l * jjs * COMPSIZE); +#else + TRMM_OLTCOPY(min_l, min_jj, a, lda, ls, ls + jjs, sb + min_l * jjs * COMPSIZE); +#endif + + TRMM_KERNEL_N(min_i, min_jj, min_l, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa, + sb + min_l * jjs * COMPSIZE, + b + ((ls + jjs) * ldb) * COMPSIZE, ldb, -jjs); + } + + for(jjs = 0; jjs < js - ls - min_l; jjs += min_jj){ + 
min_jj = js - ls - min_l - jjs; + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + +#ifndef TRANSA + GEMM_ONCOPY(min_l, min_jj, a + (ls + (ls + min_l + jjs) * lda) * COMPSIZE, lda, + sb + min_l * (min_l + jjs) * COMPSIZE); +#else + GEMM_OTCOPY(min_l, min_jj, a + ((ls + min_l + jjs) + ls * lda) * COMPSIZE, lda, + sb + min_l * (min_l + jjs) * COMPSIZE); +#endif + + GEMM_KERNEL(min_i, min_jj, min_l, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa, + sb + min_l * (min_l + jjs) * COMPSIZE, + b + ((ls + min_l + jjs) * ldb) * COMPSIZE, ldb); + } + + for(is = min_i; is < m; is += GEMM_P){ + min_i = m - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + GEMM_ITCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa); + + TRMM_KERNEL_N(min_i, min_l, min_l, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa, + sb, + b + (is + ls * ldb) * COMPSIZE, ldb, 0); + + if (js - ls - min_l > 0) { + GEMM_KERNEL(min_i, js - ls - min_l, min_l, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa, + sb + min_l * min_l * COMPSIZE, + b + (is + (ls + min_l) * ldb) * COMPSIZE, ldb); + } + } + } + + for(ls = 0; ls < js - min_j; ls += GEMM_Q){ + min_l = js - min_j - ls; + if (min_l > GEMM_Q) min_l = GEMM_Q; + min_i = m; + if (min_i > GEMM_P) min_i = GEMM_P; + + GEMM_ITCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, sa); + + for(jjs = js; jjs < js + min_j; jjs += min_jj){ + min_jj = min_j + js - jjs; + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + +#ifndef TRANSA + GEMM_ONCOPY(min_l, min_jj, a + (ls + (jjs - min_j) * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE); +#else + GEMM_OTCOPY(min_l, min_jj, a + ((jjs - min_j) + ls * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE); +#endif + + GEMM_KERNEL(min_i, min_jj, min_l, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb + min_l * (jjs - js) * COMPSIZE, + b + ((jjs - min_j) * ldb) * COMPSIZE, ldb); + } + + for(is = min_i; is < m; is += GEMM_P){ + min_i = m - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + GEMM_ITCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa); + + GEMM_KERNEL(min_i, min_j, min_l, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb, b + (is + (js - min_j) * ldb) * COMPSIZE, ldb); + } + } + } + +#endif + + return 0; +} diff --git a/driver/level3/trsm_L.c b/driver/level3/trsm_L.c new file mode 100644 index 0000000000..2c3006f091 --- /dev/null +++ b/driver/level3/trsm_L.c @@ -0,0 +1,249 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
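One idiom that recurs in trmm_R.c above and in every other driver in this patch is the jjs/min_jj loop: a wide panel is consumed in slivers of at most GEMM_UNROLL_N columns, so each packed sliver of B is used by its kernel call while it is still resident in cache. Schematically (the unroll width and the callback are illustrative):

#include <stddef.h>

#define UNROLL_N 4  /* stands in for GEMM_UNROLL_N */

/* Walk a panel of width min_j starting at column js in slivers of at most
   UNROLL_N columns, the same way the jjs loops above do; body() would pack
   the sliver and then run the kernel on it. */
static void for_each_sliver(size_t js, size_t min_j,
                            void (*body)(size_t jjs, size_t min_jj)) {
  for (size_t jjs = js; jjs < js + min_j; ) {
    size_t min_jj = js + min_j - jjs;
    if (min_jj > UNROLL_N) min_jj = UNROLL_N;
    body(jjs, min_jj);
    jjs += min_jj;
  }
}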
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +const static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_L +#if (!defined(TRANSA) && defined(UPPER)) || (defined(TRANSA) && !defined(UPPER)) +#define TRSM_KERNEL TRSM_KERNEL_LR +#else +#define TRSM_KERNEL TRSM_KERNEL_LC +#endif +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#if (!defined(TRANSA) && defined(UPPER)) || (defined(TRANSA) && !defined(UPPER)) +#define TRSM_KERNEL TRSM_KERNEL_LN +#else +#define TRSM_KERNEL TRSM_KERNEL_LT +#endif +#endif + +#if 0 +#undef GEMM_P +#undef GEMM_Q +#undef GEMM_R + +#define GEMM_P 8 +#define GEMM_Q 12 +#define GEMM_R 1600 +#endif + +int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG dummy) { + + BLASLONG m, n, lda, ldb; + FLOAT *beta, *a, *b; + + BLASLONG ls, is, js; + BLASLONG min_l, min_i, min_j; + BLASLONG jjs, min_jj; + + m = args -> m; + n = args -> n; + + a = (FLOAT *)args -> a; + b = (FLOAT *)args -> b; + + lda = args -> lda; + ldb = args -> ldb; + + beta = (FLOAT *)args -> beta; + + if (range_n) { + BLASLONG n_from = *(((BLASLONG *)range_n) + 0); + BLASLONG n_to = *(((BLASLONG *)range_n) + 1); + + n = n_to - n_from; + + b += n_from * ldb * COMPSIZE; + } + + if (beta) { +#ifndef COMPLEX + if (beta[0] != ONE) + GEMM_BETA(m, n, 0, beta[0], NULL, 0, NULL, 0, b, ldb); + if (beta[0] == ZERO) return 0; +#else + if ((beta[0] != ONE) || (beta[1] != ZERO)) + GEMM_BETA(m, n, 0, beta[0], beta[1], NULL, 0, NULL, 0, b, ldb); + if ((beta[0] == ZERO) && (beta[1] == ZERO)) return 0; +#endif + } + + for(js = 0; js < n; js += GEMM_R){ + min_j = n - js; + if (min_j > GEMM_R) min_j = GEMM_R; + +#if (!defined(UPPER) && !defined(TRANSA)) || (defined(UPPER) && defined(TRANSA)) + for(ls = 0; ls < m; ls += GEMM_Q){ + min_l = m - ls; + if (min_l > GEMM_Q) min_l = GEMM_Q; + min_i = min_l; + if (min_i > GEMM_P) min_i = GEMM_P; + +#ifndef TRANSA + TRSM_ILTCOPY(min_l, min_i, a + (ls + ls * lda) * COMPSIZE, lda, 0, sa); +#else + TRSM_IUNCOPY(min_l, min_i, a + (ls + ls * lda) * COMPSIZE, lda, 0, sa); +#endif + + for(jjs = js; jjs < js + min_j; jjs += min_jj){ + min_jj = min_j + js - jjs; + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + + GEMM_ONCOPY(min_l, min_jj, b + (ls + jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE); + + TRSM_KERNEL(min_i, min_jj, min_l, dm1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb + min_l * (jjs - js) * COMPSIZE, + b + (ls + jjs * ldb) * COMPSIZE, ldb, 0); + } + + for(is = ls + min_i; is < ls + min_l; is += GEMM_P){ + min_i = ls + min_l - is; + if (min_i > GEMM_P) min_i = GEMM_P; + +#ifndef TRANSA + 
TRSM_ILTCOPY(min_l, min_i, a + (is + ls * lda) * COMPSIZE, lda, is - ls, sa); +#else + TRSM_IUNCOPY(min_l, min_i, a + (ls + is * lda) * COMPSIZE, lda, is - ls, sa); +#endif + + TRSM_KERNEL(min_i, min_j, min_l, dm1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb, b + (is + js * ldb) * COMPSIZE, ldb, is - ls); + } + + + for(is = ls + min_l; is < m; is += GEMM_P){ + min_i = m - is; + if (min_i > GEMM_P) min_i = GEMM_P; + +#ifndef TRANSA + GEMM_ITCOPY(min_l, min_i, a + (is + ls * lda) * COMPSIZE, lda, sa); +#else + GEMM_INCOPY(min_l, min_i, a + (ls + is * lda) * COMPSIZE, lda, sa); +#endif + + GEMM_KERNEL(min_i, min_j, min_l, dm1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb, b + (is + js * ldb) * COMPSIZE, ldb); + } + } +#else + BLASLONG start_is; + + for(ls = m; ls > 0; ls -= GEMM_Q){ + min_l = ls; + if (min_l > GEMM_Q) min_l = GEMM_Q; + start_is = ls - min_l; + while (start_is + GEMM_P < ls) start_is += GEMM_P; + min_i = ls - start_is; + if (min_i > GEMM_P) min_i = GEMM_P; + +#ifndef TRANSA + TRSM_IUTCOPY(min_l, min_i, a + (start_is + (ls - min_l) * lda) * COMPSIZE, lda, start_is - (ls - min_l), sa); +#else + TRSM_ILNCOPY(min_l, min_i, a + ((ls - min_l) + start_is * lda) * COMPSIZE, lda, start_is - (ls - min_l), sa); +#endif + + for(jjs = js; jjs < js + min_j; jjs += min_jj){ + min_jj = min_j + js - jjs; + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + + GEMM_ONCOPY(min_l, min_jj, b + (ls - min_l + jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE); + + TRSM_KERNEL(min_i, min_jj, min_l, dm1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb + min_l * (jjs - js) * COMPSIZE, + b + (start_is + jjs * ldb) * COMPSIZE, ldb, start_is - ls + min_l); + } + + for(is = start_is - GEMM_P; is >= ls - min_l; is -= GEMM_P){ + min_i = ls - is; + if (min_i > GEMM_P) min_i = GEMM_P; + +#ifndef TRANSA + TRSM_IUTCOPY(min_l, min_i, a + (is + (ls - min_l) * lda) * COMPSIZE, lda, is - (ls - min_l), sa); +#else + TRSM_ILNCOPY(min_l, min_i, a + ((ls - min_l) + is * lda) * COMPSIZE, lda, is - (ls - min_l), sa); +#endif + TRSM_KERNEL(min_i, min_j, min_l, dm1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb, + b + (is + js * ldb) * COMPSIZE, ldb, + is - (ls - min_l) ); + } + + + for(is = 0; is < ls - min_l; is += GEMM_P){ + min_i = ls - min_l - is; + if (min_i > GEMM_P) min_i = GEMM_P; + +#ifndef TRANSA + GEMM_ITCOPY(min_l, min_i, a + (is + (ls - min_l) * lda) * COMPSIZE, lda, sa); +#else + GEMM_INCOPY(min_l, min_i, a + ((ls - min_l) + is * lda) * COMPSIZE, lda, sa); +#endif + + GEMM_KERNEL(min_i, min_j, min_l, dm1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb, b + (is + js * ldb) * COMPSIZE, ldb); + } + } + +#endif + } + + return 0; +} diff --git a/driver/level3/trsm_R.c b/driver/level3/trsm_R.c new file mode 100644 index 0000000000..0964d78605 --- /dev/null +++ b/driver/level3/trsm_R.c @@ -0,0 +1,348 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
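trsm_L.c above solves op(A)*X = B in place in B: each GEMM_Q diagonal slab is handled by a TRSM kernel and the remaining rows of B are then updated with GEMM_KERNEL and the constant dm1 = -1. For one right-hand side in the lower, non-transposed, non-unit case this blocked scheme reduces to ordinary forward substitution (a minimal sketch, not the packed kernel):

#include <stddef.h>

/* Solve L * x = b in place, with L lower triangular (column major, leading
   dimension lda), overwriting b with x. */
static void forward_subst(size_t n, const double *l, size_t lda, double *b) {
  for (size_t j = 0; j < n; j++) {
    b[j] /= l[j + j * lda];              /* the diagonal-block solve        */
    for (size_t i = j + 1; i < n; i++)   /* the trailing update, i.e. the   */
      b[i] -= l[i + j * lda] * b[j];     /* GEMM call with alpha = -1       */
  }
}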
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +const static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#if (!defined(TRANSA) && defined(UPPER)) || (defined(TRANSA) && !defined(UPPER)) +#define TRSM_KERNEL TRSM_KERNEL_RR +#else +#define TRSM_KERNEL TRSM_KERNEL_RC +#endif +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#if (!defined(TRANSA) && defined(UPPER)) || (defined(TRANSA) && !defined(UPPER)) +#define TRSM_KERNEL TRSM_KERNEL_RN +#else +#define TRSM_KERNEL TRSM_KERNEL_RT +#endif +#endif + +#if 0 +#undef GEMM_P +#undef GEMM_Q +#undef GEMM_R + +#define GEMM_P 16 +#define GEMM_Q 20 +#define GEMM_R 24 +#endif + +int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG dummy) { + + BLASLONG m, n, lda, ldb; + FLOAT *beta, *a, *b; + BLASLONG ls, is, js; + BLASLONG min_l, min_i, min_j; + BLASLONG jjs, min_jj; + + m = args -> m; + n = args -> n; + + a = (FLOAT *)args -> a; + b = (FLOAT *)args -> b; + + lda = args -> lda; + ldb = args -> ldb; + + beta = (FLOAT *)args -> beta; + + if (range_m) { + BLASLONG m_from = *(((BLASLONG *)range_m) + 0); + BLASLONG m_to = *(((BLASLONG *)range_m) + 1); + + m = m_to - m_from; + + b += m_from * COMPSIZE; + } + + if (beta) { +#ifndef COMPLEX + if (beta[0] != ONE) + GEMM_BETA(m, n, 0, beta[0], NULL, 0, NULL, 0, b, ldb); + if (beta[0] == ZERO) return 0; +#else + if ((beta[0] != ONE) || (beta[1] != ZERO)) + GEMM_BETA(m, n, 0, beta[0], beta[1], NULL, 0, NULL, 0, b, ldb); + if ((beta[0] == ZERO) && (beta[1] == ZERO)) return 0; +#endif + } + +#if (defined(UPPER) && !defined(TRANSA)) || (!defined(UPPER) && defined(TRANSA)) + for(js = 0; js < n; js += GEMM_R){ + min_j = n - js; + if (min_j > GEMM_R) min_j = GEMM_R; + + for(ls = 0; ls < js; ls += GEMM_Q){ + min_l = js - ls; + if (min_l > GEMM_Q) min_l = GEMM_Q; + min_i = m; + if (min_i > GEMM_P) min_i = GEMM_P; + + GEMM_ITCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, sa); + + for(jjs = js; jjs < js + min_j; jjs += min_jj){ + min_jj = min_j + js - jjs; + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + +#ifndef TRANSA + GEMM_ONCOPY(min_l, min_jj, a + (ls + jjs * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE); +#else + GEMM_OTCOPY(min_l, min_jj, a + (jjs + ls * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE); +#endif + + GEMM_KERNEL(min_i, min_jj, 
min_l, dm1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb + min_l * (jjs - js) * COMPSIZE, + b + (jjs * ldb) * COMPSIZE, ldb); + } + + for(is = min_i; is < m; is += GEMM_P){ + min_i = m - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + GEMM_ITCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa); + + GEMM_KERNEL(min_i, min_j, min_l, dm1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb, b + (is + js * ldb) * COMPSIZE, ldb); + } + } + + for(ls = js; ls < js + min_j; ls += GEMM_Q){ + min_l = js + min_j - ls; + if (min_l > GEMM_Q) min_l = GEMM_Q; + min_i = m; + if (min_i > GEMM_P) min_i = GEMM_P; + + GEMM_ITCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, sa); + +#ifndef TRANSA + TRSM_OUNCOPY(min_l, min_l, a + (ls + ls * lda) * COMPSIZE, lda, 0, sb); +#else + TRSM_OLTCOPY(min_l, min_l, a + (ls + ls * lda) * COMPSIZE, lda, 0, sb); +#endif + + TRSM_KERNEL(min_i, min_l, min_l, dm1, +#ifdef COMPLEX + ZERO, +#endif + sa, + sb, + b + (ls * ldb) * COMPSIZE, ldb, 0); + + for(jjs = 0; jjs < min_j - min_l - ls + js; jjs += min_jj){ + min_jj = min_j - min_l - ls + js - jjs; + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + +#ifndef TRANSA + GEMM_ONCOPY (min_l, min_jj, a + (ls + (ls + min_l + jjs) * lda) * COMPSIZE, lda, + sb + min_l * (min_l + jjs) * COMPSIZE); +#else + GEMM_OTCOPY (min_l, min_jj, a + ((ls + min_l + jjs) + ls * lda) * COMPSIZE, lda, + sb + min_l * (min_l + jjs) * COMPSIZE); +#endif + + GEMM_KERNEL(min_i, min_jj, min_l, dm1, +#ifdef COMPLEX + ZERO, +#endif + sa, + sb + min_l * (min_l + jjs) * COMPSIZE, + b + (min_l + ls + jjs) * ldb * COMPSIZE, ldb); + } + + for(is = min_i; is < m; is += GEMM_P){ + min_i = m - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + GEMM_ITCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa); + + TRSM_KERNEL(min_i, min_l, min_l, dm1, +#ifdef COMPLEX + ZERO, +#endif + sa, + sb, + b + (is + ls * ldb) * COMPSIZE, ldb, 0); + + GEMM_KERNEL(min_i, min_j - min_l + js - ls, min_l, dm1, +#ifdef COMPLEX + ZERO, +#endif + sa, + sb + min_l * min_l * COMPSIZE, + b + (is + ( min_l + ls) * ldb) * COMPSIZE, ldb); + } + } + } + +#else + BLASLONG start_ls; + + for(js = n; js > 0; js -= GEMM_R){ + min_j = js; + if (min_j > GEMM_R) min_j = GEMM_R; + + for (ls = js; ls < n; ls += GEMM_Q) { + min_l = n - ls; + if (min_l > GEMM_Q) min_l = GEMM_Q; + min_i = m; + if (min_i > GEMM_P) min_i = GEMM_P; + + GEMM_ITCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, sa); + + for(jjs = js; jjs < js + min_j; jjs += min_jj){ + min_jj = min_j + js - jjs; + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + +#ifndef TRANSA + GEMM_ONCOPY(min_l, min_jj, a + (ls + (jjs - min_j) * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE); +#else + GEMM_OTCOPY(min_l, min_jj, a + ((jjs - min_j) + ls * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE); +#endif + + GEMM_KERNEL(min_i, min_jj, min_l, dm1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb + min_l * (jjs - js) * COMPSIZE, + b + (jjs - min_j) * ldb * COMPSIZE, ldb); + } + + for(is = min_i; is < m; is += GEMM_P){ + min_i = m - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + GEMM_ITCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa); + + GEMM_KERNEL(min_i, min_j, min_l, dm1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb, b + (is + (js - min_j) * ldb) * COMPSIZE, ldb); + } + } + + start_ls = js - min_j; + while (start_ls + GEMM_Q < js) start_ls += GEMM_Q; + + for(ls = start_ls; ls >= js - min_j; ls -= GEMM_Q){ + min_l = js - ls; + if (min_l > GEMM_Q) min_l = GEMM_Q; + min_i = m; + if (min_i > GEMM_P) min_i = GEMM_P; + + GEMM_ITCOPY(min_l, 
min_i, b + (ls * ldb) * COMPSIZE, ldb, sa); + +#ifndef TRANSA + TRSM_OLNCOPY(min_l, min_l, a + (ls + ls * lda) * COMPSIZE, lda, + 0, sb + min_l * (min_j - js + ls) * COMPSIZE); +#else + TRSM_OUTCOPY(min_l, min_l, a + (ls + ls * lda) * COMPSIZE, lda, + 0, sb + min_l * (min_j - js + ls) * COMPSIZE); +#endif + + TRSM_KERNEL(min_i, min_l, min_l, dm1, +#ifdef COMPLEX + ZERO, +#endif + sa, + sb + min_l * (min_j - js + ls) * COMPSIZE, + b + (ls * ldb) * COMPSIZE, ldb, 0); + + for(jjs = 0; jjs < min_j - js + ls; jjs += min_jj){ + min_jj = min_j - js + ls - jjs; + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + +#ifndef TRANSA + GEMM_ONCOPY (min_l, min_jj, a + (ls + (js - min_j + jjs) * lda) * COMPSIZE, lda, + sb + min_l * jjs * COMPSIZE); +#else + GEMM_OTCOPY (min_l, min_jj, a + ((js - min_j + jjs) + ls * lda) * COMPSIZE, lda, + sb + min_l * jjs * COMPSIZE); +#endif + + GEMM_KERNEL(min_i, min_jj, min_l, dm1, +#ifdef COMPLEX + ZERO, +#endif + sa, + sb + min_l * jjs * COMPSIZE, + b + (js - min_j + jjs) * ldb * COMPSIZE, ldb); + } + + for(is = min_i; is < m; is += GEMM_P){ + min_i = m - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + GEMM_ITCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa); + + TRSM_KERNEL(min_i, min_l, min_l, dm1, +#ifdef COMPLEX + ZERO, +#endif + sa, + sb + min_l * (min_j - js + ls) * COMPSIZE, + b + (is + ls * ldb) * COMPSIZE, ldb, 0); + + GEMM_KERNEL(min_i, min_j - js + ls, min_l, dm1, +#ifdef COMPLEX + ZERO, +#endif + sa, + sb, + b + (is + (js - min_j) * ldb) * COMPSIZE, ldb); + } + + } + } + +#endif + + return 0; +} diff --git a/driver/level3/zhemm_k.c b/driver/level3/zhemm_k.c new file mode 100644 index 0000000000..50da97a35b --- /dev/null +++ b/driver/level3/zhemm_k.c @@ -0,0 +1,80 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
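trsm_R.c above is the right-side counterpart: X*op(A) = B is solved in place in B, with TRSM kernels on the diagonal blocks of A and GEMM_KERNEL (again with dm1 = -1) propagating each solved column panel into the rest. Written out without blocking for the upper, non-transposed, non-unit case (a sketch only):

#include <stddef.h>

/* Solve X * U = B in place in B, with U upper triangular and non-unit
   (order n, leading dimension ldu) and B of size m x n (leading dimension
   ldb), everything column major. */
static void right_upper_solve(size_t m, size_t n,
                              const double *u, size_t ldu,
                              double *b, size_t ldb) {
  for (size_t j = 0; j < n; j++) {
    for (size_t l = 0; l < j; l++)          /* subtract already-solved      */
      for (size_t i = 0; i < m; i++)        /* columns: the GEMM update     */
        b[i + j * ldb] -= b[i + l * ldb] * u[l + j * ldu];
    for (size_t i = 0; i < m; i++)
      b[i + j * ldb] /= u[j + j * ldu];     /* the diagonal-block solve     */
  }
}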
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#undef TIMING + +#ifndef RSIDE +#ifndef LOWER +#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) HEMM_IUTCOPY(M, N, A, LDA, Y, X, BUFFER); +#else +#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) HEMM_ILTCOPY(M, N, A, LDA, Y, X, BUFFER); +#endif +#endif + +#ifdef RSIDE +#ifndef LOWER +#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) HEMM_OUTCOPY(M, N, A, LDA, Y, X, BUFFER); +#else +#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) HEMM_OLTCOPY(M, N, A, LDA, Y, X, BUFFER); +#endif +#endif + +#ifndef RSIDE +#define K args -> m +#ifndef LOWER +#define GEMM_LOCAL HEMM_LU +#else +#define GEMM_LOCAL HEMM_LL +#endif +#else +#define K args -> n +#ifndef LOWER +#define GEMM_LOCAL HEMM_RU +#else +#define GEMM_LOCAL HEMM_RL +#endif +#endif + +#ifdef THREADED_LEVEL3 +#include "level3_thread.c" +#else +#include "level3.c" +#endif diff --git a/driver/level3/zher2k_k.c b/driver/level3/zher2k_k.c new file mode 100644 index 0000000000..93bb781f1f --- /dev/null +++ b/driver/level3/zher2k_k.c @@ -0,0 +1,160 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef LOWER + +#ifndef CONJ +#ifdef XDOUBLE +#define KERNEL_FUNC xher2k_kernel_UN +#elif defined(DOUBLE) +#define KERNEL_FUNC zher2k_kernel_UN +#else +#define KERNEL_FUNC cher2k_kernel_UN +#endif +#else +#ifdef XDOUBLE +#define KERNEL_FUNC xher2k_kernel_UC +#elif defined(DOUBLE) +#define KERNEL_FUNC zher2k_kernel_UC +#else +#define KERNEL_FUNC cher2k_kernel_UC +#endif +#endif + +#else + +#ifndef CONJ +#ifdef XDOUBLE +#define KERNEL_FUNC xher2k_kernel_LN +#elif defined(DOUBLE) +#define KERNEL_FUNC zher2k_kernel_LN +#else +#define KERNEL_FUNC cher2k_kernel_LN +#endif +#else +#ifdef XDOUBLE +#define KERNEL_FUNC xher2k_kernel_LC +#elif defined(DOUBLE) +#define KERNEL_FUNC zher2k_kernel_LC +#else +#define KERNEL_FUNC cher2k_kernel_LC +#endif +#endif + +#endif + +#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y, FLAG) \ + KERNEL_FUNC(M, N, K, ALPHA[0], ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y), FLAG) + +#define KERNEL_OPERATION_C(M, N, K, ALPHA, SA, SB, C, LDC, X, Y, FLAG) \ + KERNEL_FUNC(M, N, K, ALPHA[0], -ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y), FLAG) + +#if !defined(LOWER) && !defined(TRANS) +#define SYRK_LOCAL HER2K_UN +#elif !defined(LOWER) && defined(TRANS) +#define SYRK_LOCAL HER2K_UC +#elif defined(LOWER) && !defined(TRANS) +#define SYRK_LOCAL HER2K_LN +#else +#define SYRK_LOCAL HER2K_LC +#endif + +#undef SCAL_K + +#ifdef XDOUBLE +#define SCAL_K QSCAL_K +#elif defined(DOUBLE) +#define SCAL_K DSCAL_K +#else +#define SCAL_K SSCAL_K +#endif + +static inline int syrk_beta(BLASLONG m_from, BLASLONG m_to, BLASLONG n_from, BLASLONG n_to, FLOAT *alpha, FLOAT *c, BLASLONG ldc) { + + BLASLONG i; + +#ifndef LOWER + if (m_from > n_from) n_from = m_from; + if (m_to > n_to ) m_to = n_to; +#else + if (m_from < n_from) m_from = n_from; + if (m_to < n_to ) n_to = m_to; +#endif + + c += (m_from + n_from * ldc) * COMPSIZE; + + m_to -= m_from; + n_to -= n_from; + + for (i = 0; i < n_to; i++){ + +#ifndef LOWER + + SCAL_K(MIN(i + n_from - m_from + 1, m_to) * COMPSIZE, 0, 0, alpha[0], c, 1, NULL, 0, NULL, 0); + + if (i + n_from - m_from + 1 <= m_to) + *(c + (i + n_from - m_from) * COMPSIZE + 1) = ZERO; + + c += ldc * COMPSIZE; + +#else + + SCAL_K(MIN(m_to - i + m_from - n_from, m_to) * COMPSIZE, 0, 0, alpha[0], c, 1, NULL, 0, NULL, 0); + + if (i < m_from - n_from) { + c += ldc * COMPSIZE; + } else { + *(c + 1) = ZERO; + c += (1 + ldc) * COMPSIZE; + } + +#endif + + } + + return 0; +} + +#ifdef THREADED_LEVEL3 +#include "level3_syr2k_threaded.c" +#else +#include "level3_syr2k.c" +#endif diff --git a/driver/level3/zher2k_kernel.c b/driver/level3/zher2k_kernel.c new file mode 100644 index 0000000000..9b4c450336 --- /dev/null +++ b/driver/level3/zher2k_kernel.c @@ -0,0 +1,221 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#define GEMM_KERNEL_B0 GEMM_KERNEL_R_B0 +#else +#define GEMM_KERNEL GEMM_KERNEL_L +#define GEMM_KERNEL_B0 GEMM_KERNEL_L_B0 +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset, int flag){ + + BLASLONG i, j; + BLASLONG loop; + FLOAT subbuffer[GEMM_UNROLL_MN * GEMM_UNROLL_MN * COMPSIZE]; + + if (m + offset < 0) { +#ifndef LOWER + GEMM_KERNEL(m, n, k, + alpha_r, +#ifdef COMPLEX + alpha_i, +#endif + a, b, c, ldc); +#endif + return 0; + } + + if (n < offset) { +#ifdef LOWER + GEMM_KERNEL(m, n, k, + alpha_r, +#ifdef COMPLEX + alpha_i, +#endif + a, b, c, ldc); +#endif + return 0; + } + + + if (offset > 0) { +#ifdef LOWER + GEMM_KERNEL(m, offset, k, + alpha_r, +#ifdef COMPLEX + alpha_i, +#endif + a, b, c, ldc); +#endif + b += offset * k * COMPSIZE; + c += offset * ldc * COMPSIZE; + n -= offset; + offset = 0; + + if (n <= 0) return 0; + } + + if (n > m + offset) { +#ifndef LOWER + GEMM_KERNEL(m, n - m - offset, k, + alpha_r, +#ifdef COMPLEX + alpha_i, +#endif + a, + b + (m + offset) * k * COMPSIZE, + c + (m + offset) * ldc * COMPSIZE, ldc); +#endif + + n = m + offset; + if (n <= 0) return 0; + } + + + if (offset < 0) { +#ifndef LOWER + GEMM_KERNEL(-offset, n, k, + alpha_r, +#ifdef COMPLEX + alpha_i, +#endif + a, b, c, ldc); +#endif + a -= offset * k * COMPSIZE; + c -= offset * COMPSIZE; + m += offset; + offset = 0; + + if (m <= 0) return 0; + } + + if (m > n - offset) { +#ifdef LOWER + GEMM_KERNEL(m - n + offset, n, k, + alpha_r, +#ifdef COMPLEX + alpha_i, +#endif + a + (n - offset) * k * COMPSIZE, + b, + c + (n - offset) * COMPSIZE, ldc); +#endif + m = n + offset; + if (m <= 0) return 0; + } + + for (loop = 0; loop < n; loop += GEMM_UNROLL_MN) { + + int mm, nn; + + mm = (loop & ~(GEMM_UNROLL_MN - 1)); + nn = MIN(GEMM_UNROLL_MN, n - loop); + +#ifndef LOWER + GEMM_KERNEL(mm, nn, k, + alpha_r, +#ifdef COMPLEX + alpha_i, +#endif + a, b + loop * k * COMPSIZE, c + loop * ldc * COMPSIZE, ldc); +#endif + + if (flag) { + 
GEMM_BETA(nn, nn, 0, ZERO, +#ifdef COMPLEX + ZERO, +#endif + NULL, 0, NULL, 0, subbuffer, nn); + + GEMM_KERNEL(nn, nn, k, + alpha_r, +#ifdef COMPLEX + alpha_i, +#endif + a + loop * k * COMPSIZE, b + loop * k * COMPSIZE, subbuffer, nn); + + +#ifndef LOWER + + for (j = 0; j < nn; j ++) { + for (i = 0; i <= j; i ++) { + c[(i + loop + (j + loop) * ldc) * 2 + 0] += + subbuffer[(i + j * nn) * 2 + 0] + subbuffer[(j + i * nn) * 2 + 0]; + if (i != j) { + c[(i + loop + (j + loop) * ldc) * 2 + 1] += + subbuffer[(i + j * nn) * 2 + 1] - subbuffer[(j + i * nn) * 2 + 1]; + } else { + c[(i + loop + (j + loop) * ldc) * 2 + 1] = ZERO; + } + } + } +#else + for (j = 0; j < nn; j ++) { + for (i = j; i < nn; i ++) { + c[(i + loop + (j + loop) * ldc) * 2 + 0] += + subbuffer[(i + j * nn) * 2 + 0] + subbuffer[(j + i * nn) * 2 + 0]; + if (i != j) { + c[(i + loop + (j + loop) * ldc) * 2 + 1] += + subbuffer[(i + j * nn) * 2 + 1] - subbuffer[(j + i * nn) * 2 + 1]; + } else { + c[(i + loop + (j + loop) * ldc) * 2 + 1] = ZERO; + } + } + } +#endif + } + +#ifdef LOWER + GEMM_KERNEL(m - mm - nn, nn, k, + alpha_r, +#ifdef COMPLEX + alpha_i, +#endif + a + (mm + nn) * k * COMPSIZE, b + loop * k * COMPSIZE, + c + (mm + nn + loop * ldc) * COMPSIZE, ldc); +#endif + } + + return 0; +} diff --git a/driver/level3/zherk_beta.c b/driver/level3/zherk_beta.c new file mode 100644 index 0000000000..6867cc010a --- /dev/null +++ b/driver/level3/zherk_beta.c @@ -0,0 +1,75 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include +#include "common.h" + +int CNAME(BLASLONG dummy1, BLASLONG n, BLASLONG dummy2, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *dummy3, BLASLONG dummy4, FLOAT *dummy5, BLASLONG dummy6, + FLOAT *c, BLASLONG ldc, + FLOAT *dummy7, FLOAT *dummy8, BLASLONG from, BLASLONG to){ + + BLASLONG i; + +#ifndef LOWER + for (i = from; i < to; i++){ + SCAL_K(i * 2, 0, 0, alpha_r, c + i * ldc * 2, 1, NULL, 0, NULL, 0); + if (alpha_r == ZERO ){ + c[i * 2 + 0 + i * ldc * 2] = ZERO; + c[i * 2 + 1 + i * ldc * 2] = ZERO; + } else { + c[i * 2 + 0 + i * ldc * 2] *= alpha_r; + c[i * 2 + 1 + i * ldc * 2] = ZERO; + } + } +#else + for (i = from; i < to; i++){ + if (alpha_r == ZERO) { + c[i * 2 + 0 + i * ldc * 2] = ZERO; + c[i * 2 + 1 + i * ldc * 2] = ZERO; + } else { + c[i * 2 + 0 + i * ldc * 2] *= alpha_r; + c[i * 2 + 1 + i * ldc * 2] = ZERO; + } + SCAL_K((n - i - 1) * 2, 0, 0, alpha_r, c + 2 + i * (ldc + 1) * 2, 1, NULL, 0, NULL, 0); + } +#endif + + return 0; +} diff --git a/driver/level3/zherk_k.c b/driver/level3/zherk_k.c new file mode 100644 index 0000000000..d1ffbdb125 --- /dev/null +++ b/driver/level3/zherk_k.c @@ -0,0 +1,158 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef LOWER + +#ifndef CONJ +#ifdef XDOUBLE +#define KERNEL_FUNC xherk_kernel_UN +#elif defined(DOUBLE) +#define KERNEL_FUNC zherk_kernel_UN +#else +#define KERNEL_FUNC cherk_kernel_UN +#endif +#else +#ifdef XDOUBLE +#define KERNEL_FUNC xherk_kernel_UC +#elif defined(DOUBLE) +#define KERNEL_FUNC zherk_kernel_UC +#else +#define KERNEL_FUNC cherk_kernel_UC +#endif +#endif + +#else + +#ifndef CONJ +#ifdef XDOUBLE +#define KERNEL_FUNC xherk_kernel_LN +#elif defined(DOUBLE) +#define KERNEL_FUNC zherk_kernel_LN +#else +#define KERNEL_FUNC cherk_kernel_LN +#endif +#else +#ifdef XDOUBLE +#define KERNEL_FUNC xherk_kernel_LC +#elif defined(DOUBLE) +#define KERNEL_FUNC zherk_kernel_LC +#else +#define KERNEL_FUNC cherk_kernel_LC +#endif +#endif + +#endif + +#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ + KERNEL_FUNC(M, N, K, ALPHA[0], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y)) + +#if !defined(LOWER) && !defined(TRANS) +#define SYRK_LOCAL HERK_UN +#elif !defined(LOWER) && defined(TRANS) +#define SYRK_LOCAL HERK_UC +#elif defined(LOWER) && !defined(TRANS) +#define SYRK_LOCAL HERK_LN +#else +#define SYRK_LOCAL HERK_LC +#endif + +#undef SCAL_K + +#ifdef XDOUBLE +#define SCAL_K QSCAL_K +#elif defined(DOUBLE) +#define SCAL_K DSCAL_K +#else +#define SCAL_K SSCAL_K +#endif + + +static inline int syrk_beta(BLASLONG m_from, BLASLONG m_to, BLASLONG n_from, BLASLONG n_to, FLOAT *alpha, FLOAT *c, BLASLONG ldc) { + + BLASLONG i; + +#ifndef LOWER + if (m_from > n_from) n_from = m_from; + if (m_to > n_to ) m_to = n_to; +#else + if (m_from < n_from) m_from = n_from; + if (m_to < n_to ) n_to = m_to; +#endif + + c += (m_from + n_from * ldc) * COMPSIZE; + + m_to -= m_from; + n_to -= n_from; + + for (i = 0; i < n_to; i++){ + +#ifndef LOWER + + SCAL_K(MIN(i + n_from - m_from + 1, m_to) * COMPSIZE, 0, 0, alpha[0], c, 1, NULL, 0, NULL, 0); + + if (i + n_from - m_from + 1 <= m_to) + *(c + (i + n_from - m_from) * COMPSIZE + 1) = ZERO; + + c += ldc * COMPSIZE; + +#else + + SCAL_K(MIN(m_to - i + m_from - n_from, m_to) * COMPSIZE, 0, 0, alpha[0], c, 1, NULL, 0, NULL, 0); + + if (i < m_from - n_from) { + c += ldc * COMPSIZE; + } else { + *(c + 1) = ZERO; + c += (1 + ldc) * COMPSIZE; + } + +#endif + + } + + return 0; +} + +#ifdef THREADED_LEVEL3 +#include "level3_syrk_threaded.c" +#else +#include "level3_syrk.c" +#endif diff --git a/driver/level3/zherk_kernel.c b/driver/level3/zherk_kernel.c new file mode 100644 index 0000000000..fd8ff9cf31 --- /dev/null +++ b/driver/level3/zherk_kernel.c @@ -0,0 +1,194 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#define GEMM_KERNEL_B0 GEMM_KERNEL_R_B0 +#else +#define GEMM_KERNEL GEMM_KERNEL_L +#define GEMM_KERNEL_B0 GEMM_KERNEL_L_B0 +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + BLASLONG loop; + FLOAT *cc, *ss; + FLOAT subbuffer[GEMM_UNROLL_MN * (GEMM_UNROLL_MN + 1) * COMPSIZE]; + + if (m + offset < 0) { +#ifndef LOWER + GEMM_KERNEL(m, n, k, + alpha_r, ZERO, + a, b, c, ldc); +#endif + return 0; + } + + if (n < offset) { +#ifdef LOWER + GEMM_KERNEL(m, n, k, + alpha_r, ZERO, + a, b, c, ldc); +#endif + return 0; + } + + + if (offset > 0) { +#ifdef LOWER + GEMM_KERNEL(m, offset, k, + alpha_r, ZERO, + a, b, c, ldc); +#endif + b += offset * k * COMPSIZE; + c += offset * ldc * COMPSIZE; + n -= offset; + offset = 0; + + if (n <= 0) return 0; + } + + if (n > m + offset) { +#ifndef LOWER + GEMM_KERNEL(m, n - m - offset, k, + alpha_r, ZERO, + a, + b + (m + offset) * k * COMPSIZE, + c + (m + offset) * ldc * COMPSIZE, ldc); +#endif + + n = m + offset; + if (n <= 0) return 0; + } + + + if (offset < 0) { +#ifndef LOWER + GEMM_KERNEL(-offset, n, k, + alpha_r, ZERO, + a, b, c, ldc); +#endif + a -= offset * k * COMPSIZE; + c -= offset * COMPSIZE; + m += offset; + offset = 0; + + if (m <= 0) return 0; + } + + if (m > n - offset) { +#ifdef LOWER + GEMM_KERNEL(m - n + offset, n, k, + alpha_r, ZERO, + a + (n - offset) * k * COMPSIZE, + b, + c + (n - offset) * COMPSIZE, ldc); +#endif + m = n + offset; + if (m <= 0) return 0; + } + + for (loop = 0; loop < n; loop += GEMM_UNROLL_MN) { + + int mm, nn; + + mm = (loop & ~(GEMM_UNROLL_MN - 1)); + nn = MIN(GEMM_UNROLL_MN, n - loop); + +#ifndef LOWER + GEMM_KERNEL(mm, nn, k, + alpha_r, ZERO, + a, b + loop * k * COMPSIZE, c + loop * ldc * COMPSIZE, ldc); +#endif + + GEMM_BETA(nn, nn, 0, ZERO, ZERO, + NULL, 0, NULL, 0, subbuffer, nn); + + GEMM_KERNEL(nn, nn, k, + alpha_r, ZERO, + a + loop * k * COMPSIZE, b + loop * k * COMPSIZE, subbuffer, nn); + + cc = c + (loop + loop * ldc) * COMPSIZE; + ss = subbuffer; + +#ifndef LOWER + for (j = 0; j < nn; j ++) { + + for (i = 0; i +#include +#include "common.h" + +int CNAME(BLASLONG dummy1, BLASLONG n, BLASLONG dummy2, FLOAT alpha_r, FLOAT alpha_i, + FLOAT 
*dummy3, BLASLONG dummy4, FLOAT *dummy5, BLASLONG dummy6, + FLOAT *c, BLASLONG ldc, + FLOAT *dummy7, FLOAT *dummy8, BLASLONG from, BLASLONG to){ + + BLASLONG i; + +#ifndef LOWER + for (i = from; i < to; i++){ + ZSCAL_K(i + 1, 0, 0, alpha_r, alpha_i, c + i * ldc * 2, 1, NULL, 0, NULL, 0); + } +#else + for (i = from; i < to; i++){ + ZSCAL_K(n - i, 0, 0, alpha_r, alpha_i, c + i * (ldc + 1) * 2, 1, NULL, 0, NULL, 0); + } +#endif + return 0; +} diff --git a/driver/mapper/Makefile b/driver/mapper/Makefile new file mode 100644 index 0000000000..67e7e03ee1 --- /dev/null +++ b/driver/mapper/Makefile @@ -0,0 +1,25 @@ +MODULENAME := mapper + +KDIR := /lib/modules/$(shell uname -r)/build +PWD := $(shell pwd) + +CC := gcc -Wall + +ifeq ($(KERNELRELEASE),) +all :: + $(MAKE) -C $(KDIR) SUBDIRS=$(PWD) modules +else + obj-m := $(MODULENAME).o +endif + +load: + insmod ./$(MODULENAME).ko + +unload: + rmmod $(MODULENAME) + +setup: + ./device_setup + +clean: + rm -rf *.o *.ko Module.symvers *.mod.c .tmp_versions .mapper* modules.order diff --git a/driver/mapper/device_setup b/driver/mapper/device_setup new file mode 100644 index 0000000000..0afbdebe3d --- /dev/null +++ b/driver/mapper/device_setup @@ -0,0 +1,11 @@ +#!/bin/sh + +drivername=mapper + +devicename=/dev/$drivername +major=`cat /proc/devices | grep $drivername | awk '{print $1;}'` + +rm -f $devicename +mknod $devicename c $major 0 +chmod go+rw $devicename + diff --git a/driver/mapper/mapper.c b/driver/mapper/mapper.c new file mode 100644 index 0000000000..83805fb1e5 --- /dev/null +++ b/driver/mapper/mapper.c @@ -0,0 +1,252 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_BIGPHYS_AREA +#include +#endif +#include +#ifdef MODVERSIONS +#include +#endif +#include + +typedef struct { + pid_t pid; +#ifndef CONFIG_BIGPHYS_AREA + long size; +#endif + caddr_t address; + +} buffer_t; + +#define MAX_BUFF_SIZE 1024 +#define MAX_LENGTH (4UL << 20) + +static spinlock_t lock __attribute__((aligned(64))); + +static buffer_t buffer[MAX_BUFF_SIZE]; + +static dev_t mapper_dev; +static struct cdev mapper_cdev; + +static int mapper_open (struct inode *inode, struct file *fp){ return 0;} + +static int mapper_release(struct inode *inode, struct file *fp){ + + int pos; +#ifndef CONFIG_BIGPHYS_AREA + caddr_t addr; +#endif + + // printk("Releasing memory... %d\n", current -> tgid); + + spin_lock(&lock); + + for (pos = 0; pos < MAX_BUFF_SIZE; pos ++) { + if (buffer[pos].pid == (pid_t) current -> tgid) { + +#ifdef CONFIG_BIGPHYS_AREA + bigphysarea_free_pages(buffer[pos].address); +#else + + for (addr = buffer[pos].address; addr < buffer[pos].address + buffer[pos].size; addr += PAGE_SIZE) { + ClearPageReserved(virt_to_page(addr)); + } + + kfree(buffer[pos].address); + buffer[pos].size = 0; +#endif + buffer[pos].pid = 0; + buffer[pos].address = 0; + } + } + + spin_unlock(&lock); + + return 0; +} + +int mapper_mapper(struct file *fp, struct vm_area_struct *vma){ + + int ret, pos; + caddr_t alloc_addr; +#ifndef CONFIG_BIGPHYS_AREA + caddr_t addr; +#endif + long all_length, length, current_addr; + + all_length = vma->vm_end - vma->vm_start; + current_addr = vma -> vm_start; + + spin_lock(&lock); + + while (all_length > 0) { + length = all_length; + if (length > MAX_LENGTH) length = MAX_LENGTH; + all_length -= MAX_LENGTH; + + // printk("Allocating memory... 
%d\n", length); + + pos = 0; + while ((pos < MAX_BUFF_SIZE) && (buffer[pos].address != 0)) pos ++; + + if (pos >= MAX_BUFF_SIZE) { + + printk("Memory Allocator : too much memory allocation requested.\n"); + + spin_unlock(&lock); + + return -EIO; + } + +#ifdef CONFIG_BIGPHYS_AREA + alloc_addr = (caddr_t)bigphysarea_alloc_pages(length >> PAGE_SHIFT, 1, GFP_KERNEL); +#else + alloc_addr = (caddr_t)kmalloc(length, GFP_KERNEL); +#endif + + if (alloc_addr == (caddr_t)NULL) { + + spin_unlock(&lock); + + return -EIO; + } + +#ifndef CONFIG_BIGPHYS_AREA + for (addr = alloc_addr; addr < alloc_addr + length; addr += PAGE_SIZE) { + clear_page(addr); + SetPageReserved(virt_to_page(addr)); + } +#endif + + if ((ret = remap_pfn_range(vma, + current_addr, + virt_to_phys((void *)alloc_addr) >> PAGE_SHIFT, + length, + PAGE_SHARED)) < 0) { + +#ifdef CONFIG_BIGPHYS_AREA + bigphysarea_free_pages((caddr_t)alloc_addr); +#else + + for (addr = alloc_addr; addr < alloc_addr + length; addr += PAGE_SIZE) ClearPageReserved(virt_to_page(addr)); + + kfree((caddr_t)alloc_addr); +#endif + + spin_unlock(&lock); + + return ret; + } + + buffer[pos].pid = current -> tgid; + buffer[pos].address = alloc_addr; +#ifndef CONFIG_BIGPHYS_AREA + buffer[pos].size = length; +#endif + + current_addr += length; + } + + spin_unlock(&lock); + + return 0; +} + +static struct file_operations mapper_fops = { + .open = mapper_open, + .release = mapper_release, + .mmap = mapper_mapper, + .owner = THIS_MODULE, +}; + +static int __init mapper_init(void){ + + int ret, i; + + ret = alloc_chrdev_region(&mapper_dev, 0, 1, "mapper"); + + cdev_init(&mapper_cdev, &mapper_fops); + + ret = cdev_add(&mapper_cdev, mapper_dev, 1); + + spin_lock_init(&lock); + + for (i = 0; i < MAX_BUFF_SIZE; i++) { + buffer[i].pid = 0; +#ifndef CONFIG_BIGPHYS_AREA + buffer[i].size = 0; +#endif + buffer[i].address = 0; + } + + return ret; +} + +static void __exit mapper_exit(void){ + + int pos; + + for (pos = 0; pos < MAX_BUFF_SIZE; pos ++) { + if (buffer[pos].address != 0) { +#ifdef CONFIG_BIGPHYS_AREA + bigphysarea_free_pages(buffer[pos].address); +#else + kfree(buffer[pos].address); +#endif + } + } + + cdev_del(&mapper_cdev); + + unregister_chrdev_region(mapper_dev, 1); +} + +module_init(mapper_init); +module_exit(mapper_exit); +MODULE_DESCRIPTION("BigPhysArea User Mapping Driver"); +MODULE_LICENSE("Unknown"); diff --git a/driver/others/Makefile b/driver/others/Makefile new file mode 100644 index 0000000000..bc5de38486 --- /dev/null +++ b/driver/others/Makefile @@ -0,0 +1,218 @@ +TOPDIR = ../.. 
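+# Support objects shared by the driver layer: the memory allocator, xerbla,
+# the complex absolute-value helpers, the LAPACK machine-parameter routines
+# and, when SMP is enabled, the threading server.  The object list and the
+# server implementation are selected below from the settings in Makefile.system.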
+include ../../Makefile.system + +COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) + +COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX) + +ifdef SMP +COMMONOBJS += blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX) +ifndef NO_AFFINITY +COMMONOBJS += init.$(SUFFIX) +endif +endif + +# COMMONOBJS += info.$(SUFFIX) + +ifdef DYNAMIC_ARCH +COMMONOBJS += dynamic.$(SUFFIX) +else +COMMONOBJS += parameter.$(SUFFIX) +endif + +ifdef EXPRECISION +COMMONOBJS += x_abs.$(SUFFIX) qlamch.$(SUFFIX) qlamc3.$(SUFFIX) +endif + +ifdef QUAD_PRECISION +COMMONOBJS += addx.$(SUFFIX) mulx.$(SUFFIX) +endif + +ifeq ($(OSNAME), CYGWIN_NT) +ifeq ($(C_COMPILER), PGI) +# COMMONOBJS += __builtin_stinit.$(SUFFIX) +endif +endif + +ifdef USE_CUDA +COMMONOBJS += cuda_init.$(SUFFIX) +endif + +ifdef FUNCTION_PROFILE +COMMONOBJS += profile.$(SUFFIX) +endif + +LIBOTHERS = libothers.$(LIBSUFFIX) + +ifeq ($(CORE), PPC440) +MEMORY = memory_qalloc.c +endif + +ifndef MEMORY +MEMORY = memory.c +endif + +ifeq ($(USE_OPENMP), 1) +BLAS_SERVER = blas_server_omp.c +else +ifeq ($(OSNAME), WINNT) +BLAS_SERVER = blas_server_win32.c +endif +ifeq ($(OSNAME), CYGWIN_NT) +BLAS_SERVER = blas_server_win32.c +endif +ifeq ($(OSNAME), Interix) +BLAS_SERVER = blas_server_win32.c +endif +endif + +ifndef BLAS_SERVER +BLAS_SERVER = blas_server.c +endif + +ifdef DYNAMIC_ARCH +HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX) +else +HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX) +endif + +xerbla.$(SUFFIX) : xerbla.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +dynamic.$(SUFFIX) : dynamic.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +dynamic.$(PSUFFIX) : dynamic.c + $(CC) $(PFLAGS) -c $< -o $(@F) + +parameter.$(SUFFIX) : parameter.c ../../param.h + $(CC) $(CFLAGS) -c $< -o $(@F) + +init.$(SUFFIX) : init.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +profile.$(SUFFIX) : profile.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +memory.$(SUFFIX) : $(MEMORY) ../../common.h ../../param.h + $(CC) $(CFLAGS) -c $< -o $(@F) + +blas_server.$(SUFFIX) : $(BLAS_SERVER) ../../common.h ../../common_thread.h ../../param.h + $(CC) $(CFLAGS) -c $< -o $(@F) + +blasL1thread.$(SUFFIX) : blas_l1_thread.c ../../common.h ../../common_thread.h + $(CC) $(CFLAGS) -c $< -o $(@F) + +cuda_init.$(SUFFIX) : cuda_init.c + $(CUCC) $(COMMON_OPT) -I$(TOPDIR) $(CUFLAGS) -DCNAME=$(*F) -c $< -o $(@F) + +c_abs.$(SUFFIX) : abs.c + $(CC) $(CFLAGS) -c -UDOUBLE $< -o $(@F) + +z_abs.$(SUFFIX) : abs.c + $(CC) $(CFLAGS) -c -DDOUBLE $< -o $(@F) + +x_abs.$(SUFFIX) : abs.c + $(CC) $(CFLAGS) -c -DXDOUBLE $< -o $(@F) + +slamch.$(SUFFIX) : lamch.c + $(CC) $(CFLAGS) -c -UDOUBLE $< -o $(@F) + +dlamch.$(SUFFIX) : lamch.c + $(CC) $(CFLAGS) -c -DDOUBLE $< -o $(@F) + +qlamch.$(SUFFIX) : lamch.c + $(CC) $(CFLAGS) -c -DXDOUBLE $< -o $(@F) + +slamc3.$(SUFFIX) : lamc3.c + $(CC) $(CFLAGS) -c -UDOUBLE $< -o $(@F) + +dlamc3.$(SUFFIX) : lamc3.c + $(CC) $(CFLAGS) -c -DDOUBLE $< -o $(@F) + +qlamc3.$(SUFFIX) : lamc3.c + $(CC) $(CFLAGS) -c -DXDOUBLE $< -o $(@F) + +divtable.$(SUFFIX) : divtable.c + $(CC) $(CFLAGS) -c -UDOUBLE $< -o $(@F) + +__builtin_stinit.$(SUFFIX) : $(ARCH)/builtin_stinit.S + $(CC) $(CFLAGS) -c -UDOUBLE $< -o $(@F) + +addx.$(SUFFIX) : $(ARCH)/addx.c + $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $(@F) + +mulx.$(SUFFIX) : $(ARCH)/mulx.c + $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $(@F) + +xerbla.$(PSUFFIX) : xerbla.c + $(CC) $(PFLAGS) -c $< -o $(@F) + +parameter.$(PSUFFIX) : parameter.c ../../param.h + $(CC) $(PFLAGS) -c $< -o $(@F) + 
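+# Profiled objects ($(PSUFFIX)) are built from the same sources as the
+# $(SUFFIX) rules above, with $(PFLAGS) used in place of $(CFLAGS).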
+init.$(PSUFFIX) : init.c + $(CC) $(PFLAGS) -c $< -o $(@F) + +profile.$(PSUFFIX) : profile.c + $(CC) $(PFLAGS) -c $< -o $(@F) + +memory.$(PSUFFIX) : $(MEMORY) ../../common.h ../../param.h + $(CC) $(PFLAGS) -c $< -o $(@F) + +blas_server.$(PSUFFIX) : $(BLAS_SERVER) ../../common.h ../../common_thread.h + $(CC) $(PFLAGS) -c $< -o $(@F) + +blasL1thread.$(PSUFFIX) : blas_l1_thread.c ../../common.h ../../common_thread.h + $(CC) $(PFLAGS) -c $< -o $(@F) + +cuda_init.$(PSUFFIX) : cuda_init.c + $(CUCC) $(COMMON_OPT) -I$(TOPDIR) $(CUFLAGS) -DCNAME=$(*F) -c $< -o $(@F) + +c_abs.$(PSUFFIX) : abs.c + $(CC) $(PFLAGS) -c -UDOUBLE $< -o $(@F) + +z_abs.$(PSUFFIX) : abs.c + $(CC) $(PFLAGS) -c -DDOUBLE $< -o $(@F) + +x_abs.$(PSUFFIX) : abs.c + $(CC) $(PFLAGS) -c -DXDOUBLE $< -o $(@F) + +slamch.$(PUFFIX) : lamch.c + $(CC) $(PFLAGS) -c -UDOUBLE $< -o $(@F) + +dlamch.$(PUFFIX) : lamch.c + $(CC) $(PFLAGS) -c -DDOUBLE $< -o $(@F) + +qlamch.$(PUFFIX) : lamch.c + $(CC) $(PFLAGS) -c -DXDOUBLE $< -o $(@F) + +slamc3.$(PUFFIX) : lamc3.c + $(CC) $(PFLAGS) -c -UDOUBLE $< -o $(@F) + +dlamc3.$(PUFFIX) : lamc3.c + $(CC) $(PFLAGS) -c -DDOUBLE $< -o $(@F) + +qlamc3.$(PUFFIX) : lamc3.c + $(CC) $(PFLAGS) -c -DXDOUBLE $< -o $(@F) + +divtable.$(PSUFFIX) : divtable.c + $(CC) $(PFLAGS) -c -UDOUBLE $< -o $(@F) + +__builtin_stinit.$(PPSUFFIX) : $(ARCH)/builtin_stinit.S + $(CC) $(PFLAGS) -c -UDOUBLE $< -o $(@F) + +addx.$(PSUFFIX) : $(ARCH)/addx.c + $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $(@F) + +mulx.$(PSUFFIX) : $(ARCH)/mulx.c + $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $(@F) + +info.$(SUFFIX) : info.c info.h ../../common.h ../../param.h + $(CC) $(CFLAGS) -c $< -o $(@F) + + +hpl : CFLAGS += -DHPL +hpl_p : CFLAGS += -DHPL + +include $(TOPDIR)/Makefile.tail diff --git a/driver/others/abs.c b/driver/others/abs.c new file mode 100644 index 0000000000..e3ce16113b --- /dev/null +++ b/driver/others/abs.c @@ -0,0 +1,71 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +double fabs(double); +double sqrt(double); + +#ifdef NEED_F2CCONV +double +#else +FLOAT +#endif + CNAME(FLOAT *z){ + + FLOAT real = z[0]; + FLOAT imag = z[1]; + double temp; + + real = fabs(real); + imag = fabs(imag); + +if(imag > real){ + temp = real; + real = imag; + imag = temp; +} + + if (imag == 0.) return real; + + temp = imag/real; + temp = real * sqrt(1.0 + temp*temp); + + return temp; + +} diff --git a/driver/others/blas_l1_thread.c b/driver/others/blas_l1_thread.c new file mode 100644 index 0000000000..851135b107 --- /dev/null +++ b/driver/others/blas_l1_thread.c @@ -0,0 +1,112 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include +#include "common.h" + +int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, + void *a, BLASLONG lda, + void *b, BLASLONG ldb, + void *c, BLASLONG ldc, int (*function)(), int nthreads){ + + blas_queue_t queue[MAX_CPU_NUMBER]; + blas_arg_t args [MAX_CPU_NUMBER]; + + BLASLONG i, width, astride, bstride; + int num_cpu, calc_type; + + calc_type = (mode & BLAS_PREC) + ((mode & BLAS_COMPLEX) != 0) + 2; + + mode |= BLAS_LEGACY; + + for (i = 0; i < nthreads; i++) blas_queue_init(&queue[i]); + + num_cpu = 0; + i = m; + + while (i > 0){ + + /* Adjust Parameters */ + width = blas_quickdivide(i + nthreads - num_cpu - 1, + nthreads - num_cpu); + + i -= width; + if (i < 0) width = width + i; + + astride = width * lda; + + if (!(mode & BLAS_TRANSB_T)) { + bstride = width * ldb; + } else { + bstride = width; + } + + astride <<= calc_type; + bstride <<= calc_type; + + args[num_cpu].m = width; + args[num_cpu].n = n; + args[num_cpu].k = k; + args[num_cpu].a = (void *)a; + args[num_cpu].b = (void *)b; + args[num_cpu].c = (void *)c; + args[num_cpu].lda = lda; + args[num_cpu].ldb = ldb; + args[num_cpu].ldc = ldc; + args[num_cpu].alpha = alpha; + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = function; + queue[num_cpu].args = &args[num_cpu]; + queue[num_cpu].next = &queue[num_cpu + 1]; + + a = (void *)((BLASULONG)a + astride); + b = (void *)((BLASULONG)b + bstride); + + num_cpu ++; + } + + if (num_cpu) { + queue[num_cpu - 1].next = NULL; + + exec_blas(num_cpu, queue); + } + + return 0; +} diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c new file mode 100644 index 0000000000..62aefe9798 --- /dev/null +++ b/driver/others/blas_server.c @@ -0,0 +1,848 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" +#ifdef OS_LINUX +#include +#include +#endif + +#ifdef SMP_SERVER + +#undef MONITOR +#undef TIMING +#undef TIMING_DEBUG +#undef NEED_STACKATTR + +#define ATTRIBUTE_SIZE 128 + +/* This is a thread server model implementation. The threads are */ +/* spawned at first access to blas library, and still remains until */ +/* destruction routine is called. The number of threads are */ +/* equal to "OMP_NUM_THREADS - 1" and thread only wakes up when */ +/* jobs is queued. */ + +/* We need this grobal for cheking if initialization is finished. */ +int blas_server_avail __attribute__((aligned(ATTRIBUTE_SIZE))) = 0; + +/* Local Variables */ +#if defined(USE_PTHREAD_LOCK) +static pthread_mutex_t server_lock = PTHREAD_MUTEX_INITIALIZER; +#elif defined(USE_PTHREAD_SPINLOCK) +static pthread_spinlock_t server_lock = 0; +#else +static unsigned long server_lock = 0; +#endif + +#define THREAD_STATUS_SLEEP 2 +#define THREAD_STATUS_WAKEUP 4 + +static pthread_t blas_threads [MAX_CPU_NUMBER]; + +typedef struct { + blas_queue_t * volatile queue __attribute__((aligned(ATTRIBUTE_SIZE))); + +#if defined(OS_LINUX) && !defined(NO_AFFINITY) + int node; +#endif + + volatile long status; + + pthread_mutex_t lock; + pthread_cond_t wakeup; + +} thread_status_t; + +static thread_status_t thread_status[MAX_CPU_NUMBER] __attribute__((aligned(ATTRIBUTE_SIZE))); + +#ifndef THREAD_TIMEOUT +#define THREAD_TIMEOUT 28 +#endif + +static unsigned int thread_timeout = (1U << (THREAD_TIMEOUT)); + +#ifdef MONITOR + +/* Monitor is a function to see thread's status for every seconds. */ +/* Usually it turns off and it's for debugging. 
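*/ +/* When enabled, blas_monitor() prints each worker's main_status code once per second. 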
*/ + +static pthread_t monitor_thread; +static int main_status[MAX_CPU_NUMBER]; +#define MAIN_ENTER 0x01 +#define MAIN_EXIT 0x02 +#define MAIN_TRYLOCK 0x03 +#define MAIN_LOCKSUCCESS 0x04 +#define MAIN_QUEUING 0x05 +#define MAIN_RECEIVING 0x06 +#define MAIN_RUNNING1 0x07 +#define MAIN_RUNNING2 0x08 +#define MAIN_RUNNING3 0x09 +#define MAIN_WAITING 0x0a +#define MAIN_SLEEPING 0x0b +#define MAIN_FINISH 0x0c +#define MAIN_DONE 0x0d +#endif + +#define BLAS_QUEUE_FINISHED 3 +#define BLAS_QUEUE_RUNNING 4 + +#ifdef TIMING +BLASLONG exit_time[MAX_CPU_NUMBER]; +#endif + +static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ + + if (!(mode & BLAS_COMPLEX)){ +#ifdef EXPRECISION + if (mode & BLAS_XDOUBLE){ + /* REAL / Extended Double */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, + xdouble *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((xdouble *)args -> alpha)[0], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + } else +#endif + if (mode & BLAS_DOUBLE){ + /* REAL / Double */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, + double *, BLASLONG, double *, BLASLONG, + double *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((double *)args -> alpha)[0], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + } else { + /* REAL / Single */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, + float *, BLASLONG, float *, BLASLONG, + float *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((float *)args -> alpha)[0], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + } + } else { +#ifdef EXPRECISION + if (mode & BLAS_XDOUBLE){ + /* COMPLEX / Extended Double */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, + xdouble *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((xdouble *)args -> alpha)[0], + ((xdouble *)args -> alpha)[1], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + } else +#endif + if (mode & BLAS_DOUBLE){ + /* COMPLEX / Double */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double, + double *, BLASLONG, double *, BLASLONG, + double *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((double *)args -> alpha)[0], + ((double *)args -> alpha)[1], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + } else { + /* COMPLEX / Single */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float, + float *, BLASLONG, float *, BLASLONG, + float *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((float *)args -> alpha)[0], + ((float *)args -> alpha)[1], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + } + } +} + +#if defined(OS_LINUX) && !defined(NO_AFFINITY) +int gotoblas_set_affinity(int); +int gotoblas_set_affinity2(int); +int get_node(void); +#endif + +static int increased_threads = 0; + +static int blas_thread_server(void *arg){ + + /* Thread identifier */ + BLASLONG cpu = (BLASLONG)arg; + unsigned int last_tick; + void *buffer, *sa, *sb; + blas_queue_t *queue; +#ifdef TIMING_DEBUG + unsigned long start, stop; +#endif + +#if defined(OS_LINUX) && !defined(NO_AFFINITY) + if (!increased_threads) + thread_status[cpu].node = gotoblas_set_affinity(cpu + 1); + else + thread_status[cpu].node = 
gotoblas_set_affinity(-1); +#endif + +#ifdef MONITOR + main_status[cpu] = MAIN_ENTER; +#endif + + buffer = blas_memory_alloc(2); + +#ifdef SMP_DEBUG + fprintf(STDERR, "Server[%2ld] Thread has just been spawned!\n", cpu); +#endif + + while (1){ + +#ifdef MONITOR + main_status[cpu] = MAIN_QUEUING; +#endif + +#ifdef TIMING + exit_time[cpu] = rpcc(); +#endif + + last_tick = (unsigned int)rpcc(); + + while (!thread_status[cpu].queue) { + + YIELDING; + + if ((unsigned int)rpcc() - last_tick > thread_timeout) { + + pthread_mutex_lock (&thread_status[cpu].lock); + + if (!thread_status[cpu].queue) { + thread_status[cpu].status = THREAD_STATUS_SLEEP; + while (thread_status[cpu].status == THREAD_STATUS_SLEEP) { + +#ifdef MONITOR + main_status[cpu] = MAIN_SLEEPING; +#endif + + pthread_cond_wait(&thread_status[cpu].wakeup, &thread_status[cpu].lock); + } + } + + pthread_mutex_unlock(&thread_status[cpu].lock); + + last_tick = (unsigned int)rpcc(); + } + + } + + queue = thread_status[cpu].queue; + + if ((long)queue == -1) break; + +#ifdef MONITOR + main_status[cpu] = MAIN_RECEIVING; +#endif + +#ifdef TIMING_DEBUG + start = rpcc(); +#endif + + if (queue) { + int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine; + + thread_status[cpu].queue = (blas_queue_t *)1; + + sa = queue -> sa; + sb = queue -> sb; + +#ifdef SMP_DEBUG + if (queue -> args) { + fprintf(STDERR, "Server[%2ld] Calculation started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n", + cpu, queue->mode, queue-> args ->m, queue->args->n, queue->args->k); + } +#endif + +#ifdef CONSISTENT_FPCSR + __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode)); + __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); +#endif + +#ifdef MONITOR + main_status[cpu] = MAIN_RUNNING1; +#endif + + if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A); + + if (sb == NULL) { + if (!(queue -> mode & BLAS_COMPLEX)){ +#ifdef EXPRECISION + if (queue -> mode & BLAS_XDOUBLE){ + sb = (void *)(((BLASLONG)sa + ((QGEMM_P * QGEMM_Q * sizeof(xdouble) + + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + } else +#endif + if (queue -> mode & BLAS_DOUBLE){ + sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double) + + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + + } else { + sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float) + + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + } + } else { +#ifdef EXPRECISION + if (queue -> mode & BLAS_XDOUBLE){ + sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * 2 * sizeof(xdouble) + + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + } else +#endif + if (queue -> mode & BLAS_DOUBLE){ + sb = (void *)(((BLASLONG)sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof(double) + + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + } else { + sb = (void *)(((BLASLONG)sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof(float) + + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + } + } + } + +#ifdef MONITOR + main_status[cpu] = MAIN_RUNNING2; +#endif + + if (queue -> mode & BLAS_LEGACY) { + legacy_exec(routine, queue -> mode, queue -> args, sb); + } else + if (queue -> mode & BLAS_PTHREAD) { + void (*pthreadcompat)(void *) = queue -> routine; + (pthreadcompat)(queue -> args); + } else + (routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position); + +#ifdef SMP_DEBUG + fprintf(STDERR, "Server[%2ld] Calculation finished!\n", cpu); +#endif + +#ifdef MONITOR + main_status[cpu] = MAIN_FINISH; +#endif + + thread_status[cpu].queue = (blas_queue_t * volatile) ((long)thread_status[cpu].queue & 
0); /* Need a trick */ + WMB; + + } + +#ifdef MONITOR + main_status[cpu] = MAIN_DONE; +#endif + +#ifdef TIMING_DEBUG + stop = rpcc(); + + fprintf(STDERR, "Thread[%ld] : %16lu %16lu (%8lu cycles)\n", cpu + 1, + start, stop, + stop - start); +#endif + + } + + /* Shutdown procedure */ + +#ifdef SMP_DEBUG + fprintf(STDERR, "Server[%2ld] Shutdown!\n", cpu); +#endif + + blas_memory_free(buffer); + + pthread_exit(NULL); + + return 0; +} + +#ifdef MONITOR + +static BLASLONG num_suspend = 0; + +static int blas_monitor(void *arg){ + int i; + + while(1){ + for (i = 0; i < blas_num_threads - 1; i++){ + switch (main_status[i]) { + case MAIN_ENTER : + fprintf(STDERR, "THREAD[%2d] : Entering.\n", i); + break; + case MAIN_EXIT : + fprintf(STDERR, "THREAD[%2d] : Exiting.\n", i); + break; + case MAIN_TRYLOCK : + fprintf(STDERR, "THREAD[%2d] : Trying lock operation.\n", i); + break; + case MAIN_QUEUING : + fprintf(STDERR, "THREAD[%2d] : Queuing.\n", i); + break; + case MAIN_RECEIVING : + fprintf(STDERR, "THREAD[%2d] : Receiving.\n", i); + break; + case MAIN_RUNNING1 : + fprintf(STDERR, "THREAD[%2d] : Running1.\n", i); + break; + case MAIN_RUNNING2 : + fprintf(STDERR, "THREAD[%2d] : Running2.\n", i); + break; + case MAIN_RUNNING3 : + fprintf(STDERR, "THREAD[%2d] : Running3.\n", i); + break; + case MAIN_WAITING : + fprintf(STDERR, "THREAD[%2d] : Waiting.\n", i); + break; + case MAIN_SLEEPING : + fprintf(STDERR, "THREAD[%2d] : Sleeping.\n", i); + break; + case MAIN_FINISH : + fprintf(STDERR, "THREAD[%2d] : Finishing.\n", i); + break; + case MAIN_DONE : + fprintf(STDERR, "THREAD[%2d] : Job is done.\n", i); + break; + } + + fprintf(stderr, "Total number of suspended ... %ld\n", num_suspend); + } + sleep(1); + } + + return 0; +} +#endif + +/* Initializing routine */ +int blas_thread_init(void){ + BLASLONG i; +#ifdef NEED_STACKATTR + pthread_attr_t attr; +#endif + + if (blas_server_avail) return 0; + +#ifdef NEED_STACKATTR + pthread_attr_init(&attr); + pthread_attr_setguardsize(&attr, 0x1000U); + pthread_attr_setstacksize( &attr, 0x1000U); +#endif + + LOCK_COMMAND(&server_lock); + + if (!blas_server_avail){ + + char *p; + + p = getenv("GOTO_THREAD_TIMEOUT"); + + if (p) { + thread_timeout = atoi(p); + if (thread_timeout < 4) thread_timeout = 4; + if (thread_timeout > 30) thread_timeout = 30; + thread_timeout = (1 << thread_timeout); + } + + for(i = 0; i < blas_num_threads - 1; i++){ + + thread_status[i].queue = (blas_queue_t *)NULL; + thread_status[i].status = THREAD_STATUS_WAKEUP; + + pthread_mutex_init(&thread_status[i].lock, NULL); + pthread_cond_init (&thread_status[i].wakeup, NULL); + +#ifdef NEED_STACKATTR + pthread_create(&blas_threads[i], &attr, + (void *)&blas_thread_server, (void *)i); +#else + pthread_create(&blas_threads[i], NULL, + (void *)&blas_thread_server, (void *)i); +#endif + } + +#ifdef MONITOR + pthread_create(&monitor_thread, NULL, + (void *)&blas_monitor, (void *)NULL); +#endif + + blas_server_avail = 1; + } + + UNLOCK_COMMAND(&server_lock); + + return 0; +} + +/* + User can call one of two routines. + + exec_blas_async ... immediately returns after jobs are queued. + + exec_blas ... returns after jobs are finished. +*/ + +static BLASULONG exec_queue_lock = 0; + +int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ + + BLASLONG i = 0; + blas_queue_t *current = queue; +#if defined(OS_LINUX) && !defined(NO_AFFINITY) && !defined(PARAMTEST) + int node = get_node(); + int nodes = get_num_nodes(); +#endif + +#ifdef SMP_DEBUG + int exec_count = 0; + fprintf(STDERR, "Exec_blas_async is called. 
Position = %d\n", pos); +#endif + + blas_lock(&exec_queue_lock); + + while (queue) { + queue -> position = pos; + +#ifdef CONSISTENT_FPCSR + __asm__ __volatile__ ("fnstcw %0" : "=m" (queue -> x87_mode)); + __asm__ __volatile__ ("stmxcsr %0" : "=m" (queue -> sse_mode)); +#endif + +#if defined(OS_LINUX) && !defined(NO_AFFINITY) && !defined(PARAMTEST) + + /* Node Mapping Mode */ + + if (queue -> mode & BLAS_NODE) { + + do { + while((thread_status[i].node != node || thread_status[i].queue) && (i < blas_num_threads - 1)) i ++; + + if (i < blas_num_threads - 1) break; + + i ++; + if (i >= blas_num_threads - 1) { + i = 0; + node ++; + if (node >= nodes) node = 0; + } + + } while (1); + + } else { + while(thread_status[i].queue) { + i ++; + if (i >= blas_num_threads - 1) i = 0; + } + } +#else + while(thread_status[i].queue) { + i ++; + if (i >= blas_num_threads - 1) i = 0; + } +#endif + + queue -> assigned = i; + WMB; + thread_status[i].queue = queue; + WMB; + + queue = queue -> next; + pos ++; +#ifdef SMP_DEBUG + exec_count ++; +#endif + + } + + blas_unlock(&exec_queue_lock); + +#ifdef SMP_DEBUG + fprintf(STDERR, "Done(Number of threads = %2ld).\n", exec_count); +#endif + + while (current) { + + pos = current -> assigned; + + if ((BLASULONG)thread_status[pos].queue > 1) { + + if (thread_status[pos].status == THREAD_STATUS_SLEEP) { + + pthread_mutex_lock (&thread_status[pos].lock); + +#ifdef MONITOR + num_suspend ++; +#endif + + if (thread_status[pos].status == THREAD_STATUS_SLEEP) { + thread_status[pos].status = THREAD_STATUS_WAKEUP; + pthread_cond_signal(&thread_status[pos].wakeup); + } + pthread_mutex_unlock(&thread_status[pos].lock); + } + } + + current = current -> next; + } + + return 0; +} + +int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){ + + while ((num > 0) && queue) { + + while(thread_status[queue -> assigned].queue) { + YIELDING; + }; + + queue = queue -> next; + num --; + } + +#ifdef SMP_DEBUG + fprintf(STDERR, "Done.\n\n"); +#endif + + return 0; +} + +/* Execute Threads */ +int exec_blas(BLASLONG num, blas_queue_t *queue){ + + int (*routine)(blas_arg_t *, void *, void *, double *, double *, BLASLONG); + +#ifdef TIMING_DEBUG + BLASULONG start, stop; +#endif + + if ((num <= 0) || (queue == NULL)) return 0; + +#ifdef SMP_DEBUG + fprintf(STDERR, "Exec_blas is called. Number of executing threads : %ld\n", num); +#endif + +#ifdef __ELF__ + if (omp_in_parallel && (num > 1)) { + if (omp_in_parallel() > 0) { + fprintf(stderr, + "GotoBLAS Warning : Detect OpenMP Loop and this application may hang. 
" + "Please rebuild the library with USE_OPENMP=1 option.\n"); + } + } +#endif + + if ((num > 1) && queue -> next) exec_blas_async(1, queue -> next); + +#ifdef TIMING_DEBUG + start = rpcc(); + + fprintf(STDERR, "\n"); +#endif + + routine = queue -> routine; + + if (queue -> mode & BLAS_LEGACY) { + legacy_exec(routine, queue -> mode, queue -> args, queue -> sb); + } else + if (queue -> mode & BLAS_PTHREAD) { + void (*pthreadcompat)(void *) = queue -> routine; + (pthreadcompat)(queue -> args); + } else + (routine)(queue -> args, queue -> range_m, queue -> range_n, + queue -> sa, queue -> sb, 0); + +#ifdef TIMING_DEBUG + stop = rpcc(); +#endif + + if ((num > 1) && queue -> next) exec_blas_async_wait(num - 1, queue -> next); + +#ifdef TIMING_DEBUG + fprintf(STDERR, "Thread[0] : %16lu %16lu (%8lu cycles)\n", + start, stop, + stop - start); +#endif + + return 0; +} + +void goto_set_num_threads(int num_threads) { + + long i; + + if (num_threads < 1) num_threads = blas_num_threads; + + if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER; + + if (num_threads > blas_num_threads) { + + LOCK_COMMAND(&server_lock); + + increased_threads = 1; + + for(i = blas_num_threads - 1; i < num_threads - 1; i++){ + + thread_status[i].queue = (blas_queue_t *)NULL; + thread_status[i].status = THREAD_STATUS_WAKEUP; + + pthread_mutex_init(&thread_status[i].lock, NULL); + pthread_cond_init (&thread_status[i].wakeup, NULL); + +#ifdef NEED_STACKATTR + pthread_create(&blas_threads[i], &attr, + (void *)&blas_thread_server, (void *)i); +#else + pthread_create(&blas_threads[i], NULL, + (void *)&blas_thread_server, (void *)i); +#endif + } + + blas_num_threads = num_threads; + + UNLOCK_COMMAND(&server_lock); + } + + blas_cpu_number = num_threads; + +} + +/* Compatible function with pthread_create / join */ + +int gotoblas_pthread(int numthreads, void *function, void *args, int stride) { + + blas_queue_t queue[MAX_CPU_NUMBER]; + int i; + + if (numthreads <= 0) return 0; + +#ifdef SMP + if (blas_cpu_number == 0) blas_get_cpu_number(); +#ifdef SMP_SERVER + if (blas_server_avail == 0) blas_thread_init(); +#endif +#endif + + for (i = 0; i < numthreads; i ++) { + + queue[i].mode = BLAS_PTHREAD; + queue[i].routine = function; + queue[i].args = args; + queue[i].range_m = NULL; + queue[i].range_n = NULL; + queue[i].sa = args; + queue[i].sb = args; + queue[i].next = &queue[i + 1]; + + args += stride; + } + + queue[numthreads - 1].next = NULL; + + exec_blas(numthreads, queue); + + return 0; +} + +/* Shutdown procedure, but user don't have to call this routine. The */ +/* kernel automatically kill threads. 
*/ + +int BLASFUNC(blas_thread_shutdown)(void){ + + int i; + + if (!blas_server_avail) return 0; + + LOCK_COMMAND(&server_lock); + + for (i = 0; i < blas_num_threads - 1; i++) { + + blas_lock(&exec_queue_lock); + + thread_status[i].queue = (blas_queue_t *)-1; + + blas_unlock(&exec_queue_lock); + + pthread_mutex_lock (&thread_status[i].lock); + + thread_status[i].status = THREAD_STATUS_WAKEUP; + + pthread_cond_signal (&thread_status[i].wakeup); + + pthread_mutex_unlock(&thread_status[i].lock); + + } + + for(i = 0; i < blas_num_threads - 1; i++){ + pthread_join(blas_threads[i], NULL); + } + + for(i = 0; i < blas_num_threads - 1; i++){ + pthread_mutex_destroy(&thread_status[i].lock); + pthread_cond_destroy (&thread_status[i].wakeup); + } + +#ifdef NEED_STACKATTR + pthread_attr_destory(&attr); +#endif + + blas_server_avail = 0; + + UNLOCK_COMMAND(&server_lock); + + return 0; +} + +#endif + diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c new file mode 100644 index 0000000000..3e70d8549b --- /dev/null +++ b/driver/others/blas_server_omp.c @@ -0,0 +1,249 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include +#include +#include "common.h" + +#ifndef USE_OPENMP + +#include "blas_server.c" + +#else + +int blas_server_avail = 0; + +int blas_thread_init(void){ + + blas_get_cpu_number(); + + blas_server_avail = 1; + + return 0; +} + +int BLASFUNC(blas_thread_shutdown)(void){ + + blas_server_avail = 0; + + return 0; +} + +static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ + + if (!(mode & BLAS_COMPLEX)){ +#ifdef EXPRECISION + if (mode & BLAS_XDOUBLE){ + /* REAL / Extended Double */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, + xdouble *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((xdouble *)args -> alpha)[0], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + } else +#endif + if (mode & BLAS_DOUBLE){ + /* REAL / Double */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, + double *, BLASLONG, double *, BLASLONG, + double *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((double *)args -> alpha)[0], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + } else { + /* REAL / Single */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, + float *, BLASLONG, float *, BLASLONG, + float *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((float *)args -> alpha)[0], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + } + } else { +#ifdef EXPRECISION + if (mode & BLAS_XDOUBLE){ + /* COMPLEX / Extended Double */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, + xdouble *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((xdouble *)args -> alpha)[0], + ((xdouble *)args -> alpha)[1], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + } else +#endif + if (mode & BLAS_DOUBLE){ + /* COMPLEX / Double */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double, + double *, BLASLONG, double *, BLASLONG, + double *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((double *)args -> alpha)[0], + ((double *)args -> alpha)[1], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + } else { + /* COMPLEX / Single */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float, + float *, BLASLONG, float *, BLASLONG, + float *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((float *)args -> alpha)[0], + ((float *)args -> alpha)[1], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + } + } +} + +static void exec_threads(blas_queue_t *queue){ + + void *buffer, *sa, *sb; + + buffer = NULL; + sa = queue -> sa; + sb = queue -> sb; + +#ifdef CONSISTENT_FPCSR + __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode)); + __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); +#endif + + if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) { + + buffer = blas_memory_alloc(2); + + if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A); + + if (sb == NULL) { + if (!(queue -> mode & BLAS_COMPLEX)){ +#ifdef EXPRECISION + if (queue -> mode & BLAS_XDOUBLE){ + sb = (void *)(((BLASLONG)sa + ((QGEMM_P * QGEMM_Q * sizeof(xdouble) + + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + } else +#endif + if (queue -> mode & 
BLAS_DOUBLE){ + sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double) + + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + + } else { + sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float) + + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + } + } else { +#ifdef EXPRECISION + if (queue -> mode & BLAS_XDOUBLE){ + sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * 2 * sizeof(xdouble) + + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + } else +#endif + if (queue -> mode & BLAS_DOUBLE){ + sb = (void *)(((BLASLONG)sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof(double) + + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + } else { + sb = (void *)(((BLASLONG)sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof(float) + + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + } + } + } + } + + if (queue -> mode & BLAS_LEGACY) { + legacy_exec(queue -> routine, queue -> mode, queue -> args, sb); + } else + if (queue -> mode & BLAS_PTHREAD) { + void (*pthreadcompat)(void *) = queue -> routine; + (pthreadcompat)(queue -> args); + + } else { + int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine; + + (routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position); + + } + + if (buffer != NULL) blas_memory_free(buffer); + +} + +int exec_blas(BLASLONG num, blas_queue_t *queue){ + + BLASLONG i; + + if ((num <= 0) || (queue == NULL)) return 0; + +#ifdef CONSISTENT_FPCSR + for (i = 0; i < num; i ++) { + __asm__ __volatile__ ("fnstcw %0" : "=m" (queue[i].x87_mode)); + __asm__ __volatile__ ("stmxcsr %0" : "=m" (queue[i].sse_mode)); + } +#endif + +#pragma omp parallel for schedule(static) + for (i = 0; i < num; i ++) { + +#ifndef USE_SIMPLE_THREADED_LEVEL3 + queue[i].position = i; +#endif + + exec_threads(&queue[i]); + } + + return 0; +} + +#endif diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c new file mode 100644 index 0000000000..6708509e12 --- /dev/null +++ b/driver/others/blas_server_win32.c @@ -0,0 +1,450 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +/* This is a thread implementation for Win32 lazy implementation */ + +/* Thread server common infomation */ +typedef struct{ + CRITICAL_SECTION lock; + HANDLE filled; + HANDLE killed; + + blas_queue_t *queue; /* Parameter Pointer */ + int shutdown; /* server shutdown flag */ + +} blas_pool_t; + +/* We need this grobal for cheking if initialization is finished. */ +int blas_server_avail = 0; + +/* Local Variables */ +static BLASULONG server_lock = 0; + +static blas_pool_t pool; +static HANDLE blas_threads [MAX_CPU_NUMBER]; +static DWORD blas_threads_id[MAX_CPU_NUMBER]; + +static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ + + if (!(mode & BLAS_COMPLEX)){ +#ifdef EXPRECISION + if (mode & BLAS_XDOUBLE){ + /* REAL / Extended Double */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, + xdouble *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((xdouble *)args -> alpha)[0], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + } else +#endif + if (mode & BLAS_DOUBLE){ + /* REAL / Double */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, + double *, BLASLONG, double *, BLASLONG, + double *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((double *)args -> alpha)[0], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + } else { + /* REAL / Single */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, + float *, BLASLONG, float *, BLASLONG, + float *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((float *)args -> alpha)[0], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + } + } else { +#ifdef EXPRECISION + if (mode & BLAS_XDOUBLE){ + /* COMPLEX / Extended Double */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, + xdouble *, BLASLONG, xdouble *, BLASLONG, + xdouble *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((xdouble *)args -> alpha)[0], + ((xdouble *)args -> alpha)[1], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + } else +#endif + if (mode & BLAS_DOUBLE){ + /* COMPLEX / Double */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double, + double *, BLASLONG, double *, BLASLONG, + double *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((double *)args -> alpha)[0], + ((double *)args -> alpha)[1], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + 
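+      /* Illustrative note, a sketch rather than part of the imported
+         sources: legacy_exec() recovers the precision-specific GEMM-style
+         entry point from the generic `void *func` by testing the
+         BLAS_COMPLEX, BLAS_XDOUBLE and BLAS_DOUBLE bits of `mode`; a real
+         alpha is passed as one scalar, a complex alpha as two.  A caller
+         composes the mode word from the same flags, e.g. (hypothetical
+         names, shown only to illustrate the flag usage):
+
+           int mode = BLAS_LEGACY | BLAS_COMPLEX | BLAS_DOUBLE;
+           legacy_exec((void *)my_zgemm_kernel, mode, &my_args, my_sb);
+
+         where my_zgemm_kernel and my_args stand for any routine and
+         argument block matching the signature selected above. */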
} else { + /* COMPLEX / Single */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float, + float *, BLASLONG, float *, BLASLONG, + float *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((float *)args -> alpha)[0], + ((float *)args -> alpha)[1], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + } + } +} + +/* This is a main routine of threads. Each thread waits until job is */ +/* queued. */ + +static DWORD WINAPI blas_thread_server(void *arg){ + + /* Thread identifier */ +#ifdef SMP_DEBUG + BLASLONG cpu = (BLASLONG)arg; +#endif + + void *buffer, *sa, *sb; + blas_queue_t *queue; + DWORD action; + HANDLE handles[] = {pool.filled, pool.killed}; + + /* Each server needs each buffer */ + buffer = blas_memory_alloc(2); + +#ifdef SMP_DEBUG + fprintf(STDERR, "Server[%2ld] Thread is started!\n", cpu); +#endif + + while (1){ + + /* Waiting for Queue */ + +#ifdef SMP_DEBUG + fprintf(STDERR, "Server[%2ld] Waiting for Queue.\n", cpu); +#endif + + do { + action = WaitForMultipleObjects(2, handles, FALSE, INFINITE); + } while ((action != WAIT_OBJECT_0) && (action == WAIT_OBJECT_0 + 1)); + + if (action == WAIT_OBJECT_0 + 1) break; + +#ifdef SMP_DEBUG + fprintf(STDERR, "Server[%2ld] Got it.\n", cpu); +#endif + + EnterCriticalSection(&pool.lock); + + queue = pool.queue; + if (queue) pool.queue = queue->next; + + LeaveCriticalSection(&pool.lock); + + if (queue) { + int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine; + + if (pool.queue) SetEvent(pool.filled); + + sa = queue -> sa; + sb = queue -> sb; + +#ifdef CONSISTENT_FPCSR + __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode)); + __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); +#endif + +#ifdef SMP_DEBUG + fprintf(STDERR, "Server[%2ld] Started. 
Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n", + cpu, queue->mode, queue-> args ->m, queue->args->n, queue->args->k); +#endif + + // fprintf(stderr, "queue start[%ld]!!!\n", cpu); + +#ifdef MONITOR + main_status[cpu] = MAIN_RUNNING1; +#endif + + if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A); + + if (sb == NULL) { + if (!(queue -> mode & BLAS_COMPLEX)){ +#ifdef EXPRECISION + if (queue -> mode & BLAS_XDOUBLE){ + sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * sizeof(xdouble) + + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + } else +#endif + if (queue -> mode & BLAS_DOUBLE){ + sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double) + + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + + } else { + sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float) + + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + } + } else { +#ifdef EXPRECISION + if (queue -> mode & BLAS_XDOUBLE){ + sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * 2 * sizeof(xdouble) + + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + } else +#endif + if (queue -> mode & BLAS_DOUBLE){ + sb = (void *)(((BLASLONG)sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof(double) + + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + } else { + sb = (void *)(((BLASLONG)sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof(float) + + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + } + } + } + +#ifdef MONITOR + main_status[cpu] = MAIN_RUNNING2; +#endif + + if (!(queue -> mode & BLAS_LEGACY)) { + + (routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position); + } else { + legacy_exec(routine, queue -> mode, queue -> args, sb); + } + } + +#ifdef SMP_DEBUG + fprintf(STDERR, "Server[%2ld] Finished!\n", cpu); +#endif + + EnterCriticalSection(&queue->lock); + + queue -> status = BLAS_STATUS_FINISHED; + + LeaveCriticalSection(&queue->lock); + + SetEvent(queue->finish); + } + + /* Shutdown procedure */ + +#ifdef SMP_DEBUG + fprintf(STDERR, "Server[%2ld] Shutdown!\n", cpu); +#endif + + blas_memory_free(buffer); + + return 0; + } + +/* Initializing routine */ +int blas_thread_init(void){ + BLASLONG i; + + if (blas_server_avail || (blas_cpu_number <= 1)) return 0; + + LOCK_COMMAND(&server_lock); + +#ifdef SMP_DEBUG + fprintf(STDERR, "Initializing Thread(Num. threads = %d)\n", + blas_cpu_number); +#endif + + if (!blas_server_avail){ + + InitializeCriticalSection(&pool.lock); + pool.filled = CreateEvent(NULL, FALSE, FALSE, NULL); + pool.killed = CreateEvent(NULL, TRUE, FALSE, NULL); + + pool.shutdown = 0; + pool.queue = NULL; + + for(i = 0; i < blas_cpu_number - 1; i++){ + blas_threads[i] = CreateThread(NULL, 0, + blas_thread_server, (void *)i, + 0, &blas_threads_id[i]); + } + + blas_server_avail = 1; + } + + UNLOCK_COMMAND(&server_lock); + + return 0; +} + +/* + User can call one of two routines. + + exec_blas_async ... immediately returns after jobs are queued. + + exec_blas ... returns after jobs are finished. 
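+
+  An illustrative sketch of the intended call pattern (a hypothetical
+  example: my_routine, args0 and args1 are placeholder names, and the
+  remaining queue fields are filled the way the level-3 drivers fill
+  them):
+
+    blas_queue_t q[2];
+
+    memset(q, 0, sizeof(q));
+    q[0].routine = my_routine;  q[0].args = &args0;
+    q[1].routine = my_routine;  q[1].args = &args1;
+    q[0].next    = &q[1];       q[1].next = NULL;
+
+    exec_blas(2, q);             runs both jobs, returns when finished
+
+  or, asynchronously:
+
+    exec_blas_async(0, q);
+    exec_blas_async_wait(2, q);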
+*/ + +int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ + + blas_queue_t *current; + + current = queue; + + while (current) { + InitializeCriticalSection(¤t -> lock); + current -> finish = CreateEvent(NULL, FALSE, FALSE, NULL); + current -> position = pos; + +#ifdef CONSISTENT_FPCSR + __asm__ __volatile__ ("fnstcw %0" : "=m" (current -> x87_mode)); + __asm__ __volatile__ ("stmxcsr %0" : "=m" (current -> sse_mode)); +#endif + + current = current -> next; + pos ++; + } + + EnterCriticalSection(&pool.lock); + + if (pool.queue) { + current = pool.queue; + while (current -> next) current = current -> next; + current -> next = queue; + } else { + pool.queue = queue; + } + + LeaveCriticalSection(&pool.lock); + + SetEvent(pool.filled); + + return 0; +} + +int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){ + +#ifdef SMP_DEBUG + fprintf(STDERR, "Synchronization Waiting.\n"); +#endif + + while (num){ +#ifdef SMP_DEBUG + fprintf(STDERR, "Waiting Queue ..\n"); +#endif + + WaitForSingleObject(queue->finish, INFINITE); + + CloseHandle(queue->finish); + DeleteCriticalSection(&queue -> lock); + + queue = queue -> next; + num --; + } + +#ifdef SMP_DEBUG + fprintf(STDERR, "Completely Done.\n\n"); +#endif + + return 0; +} + +/* Execute Threads */ +int exec_blas(BLASLONG num, blas_queue_t *queue){ + +#ifndef ALL_THREADED + int (*routine)(blas_arg_t *, void *, void *, double *, double *, BLASLONG); +#endif + + if ((num <= 0) || (queue == NULL)) return 0; + + if ((num > 1) && queue -> next) exec_blas_async(1, queue -> next); + + routine = queue -> routine; + + if (!(queue -> mode & BLAS_LEGACY)) { + (routine)(queue -> args, queue -> range_m, queue -> range_n, + queue -> sa, queue -> sb, 0); + } else { + legacy_exec(routine, queue -> mode, queue -> args, queue -> sb); + } + + if ((num > 1) && queue -> next) exec_blas_async_wait(num - 1, queue -> next); + + return 0; +} + +/* Shutdown procedure, but user don't have to call this routine. The */ +/* kernel automatically kill threads. */ + +int blas_thread_shutdown_(void){ + + int i; + + if (!blas_server_avail) return 0; + + LOCK_COMMAND(&server_lock); + + if (blas_server_avail){ + + SetEvent(pool.killed); + + for(i = 0; i < blas_cpu_number - 1; i++){ + WaitForSingleObject(blas_threads[i], INFINITE); + } + + blas_server_avail = 0; + } + + UNLOCK_COMMAND(&server_lock); + + return 0; +} diff --git a/driver/others/divtable.c b/driver/others/divtable.c new file mode 100644 index 0000000000..7a191dbe29 --- /dev/null +++ b/driver/others/divtable.c @@ -0,0 +1,83 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +#ifdef SMP +#ifndef USE64BITINT +unsigned int blas_quick_divide_table[] = { + 0x00000000, 0x00000001, 0x80000001, 0x55555556, + 0x40000001, 0x33333334, 0x2aaaaaab, 0x24924925, + 0x20000001, 0x1c71c71d, 0x1999999a, 0x1745d175, + 0x15555556, 0x13b13b14, 0x12492493, 0x11111112, + 0x10000001, 0x0f0f0f10, 0x0e38e38f, 0x0d79435f, + 0x0ccccccd, 0x0c30c30d, 0x0ba2e8bb, 0x0b21642d, + 0x0aaaaaab, 0x0a3d70a4, 0x09d89d8a, 0x097b425f, + 0x0924924a, 0x08d3dcb1, 0x08888889, 0x08421085, + 0x08000001, 0x07c1f07d, 0x07878788, 0x07507508, + 0x071c71c8, 0x06eb3e46, 0x06bca1b0, 0x06906907, + 0x06666667, 0x063e7064, 0x06186187, 0x05f417d1, + 0x05d1745e, 0x05b05b06, 0x0590b217, 0x0572620b, + 0x05555556, 0x0539782a, 0x051eb852, 0x05050506, + 0x04ec4ec5, 0x04d4873f, 0x04bda130, 0x04a7904b, + 0x04924925, 0x047dc120, 0x0469ee59, 0x0456c798, + 0x04444445, 0x04325c54, 0x04210843, 0x04104105, + 0x04000001, +}; +#else +BLASULONG blas_quick_divide_table[] = { +0x0000000000000000, 0x0000000000000001, 0x8000000000000001, 0x5555555555555557, +0x4000000000000001, 0x3333333333333335, 0x2aaaaaaaaaaaaaac, 0x2492492492492494, +0x2000000000000001, 0x1c71c71c71c71c73, 0x199999999999999b, 0x1745d1745d1745d3, +0x1555555555555557, 0x13b13b13b13b13b3, 0x124924924924924b, 0x1111111111111113, +0x1000000000000001, 0x0f0f0f0f0f0f0f11, 0x0e38e38e38e38e3a, 0x0d79435e50d79437, +0x0cccccccccccccce, 0x0c30c30c30c30c32, 0x0ba2e8ba2e8ba2ea, 0x0b21642c8590b218, +0x0aaaaaaaaaaaaaac, 0x0a3d70a3d70a3d72, 0x09d89d89d89d89da, 0x097b425ed097b427, +0x0924924924924926, 0x08d3dcb08d3dcb0a, 0x088888888888888a, 0x0842108421084212, +0x0800000000000001, 0x07c1f07c1f07c1f2, 0x0787878787878789, 0x0750750750750752, +0x071c71c71c71c71e, 0x06eb3e45306eb3e6, 0x06bca1af286bca1c, 0x0690690690690692, +0x0666666666666668, 0x063e7063e7063e72, 0x061861861861861a, 0x05f417d05f417d07, +0x05d1745d1745d176, 0x05b05b05b05b05b2, 0x0590b21642c8590d, 0x0572620ae4c415cb, +0x0555555555555557, 0x05397829cbc14e60, 0x051eb851eb851eba, 0x0505050505050507, +0x04ec4ec4ec4ec4ee, 0x04d4873ecade304f, 0x04bda12f684bda14, 0x04a7904a7904a792, +0x0492492492492494, 0x047dc11f7047dc13, 0x0469ee58469ee586, 0x0456c797dd49c343, +0x0444444444444446, 0x04325c53ef368eb2, 0x042108421084210a, 0x0410410410410412, +0x0400000000000001, +}; +#endif +#endif diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c new file mode 100644 index 0000000000..eef3db94f9 --- /dev/null +++ b/driver/others/dynamic.c @@ -0,0 +1,219 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. 
*/ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +#ifdef ARCH_X86 +#define EXTERN extern +#else +#define EXTERN +#endif + +EXTERN gotoblas_t gotoblas_KATMAI; +EXTERN gotoblas_t gotoblas_COPPERMINE; +EXTERN gotoblas_t gotoblas_NORTHWOOD; +EXTERN gotoblas_t gotoblas_BANIAS; +EXTERN gotoblas_t gotoblas_ATHLON; + +extern gotoblas_t gotoblas_PRESCOTT; +extern gotoblas_t gotoblas_ATOM; +extern gotoblas_t gotoblas_NANO; +extern gotoblas_t gotoblas_CORE2; +extern gotoblas_t gotoblas_PENRYN; +extern gotoblas_t gotoblas_DUNNINGTON; +extern gotoblas_t gotoblas_NEHALEM; +extern gotoblas_t gotoblas_OPTERON; +extern gotoblas_t gotoblas_OPTERON_SSE3; +extern gotoblas_t gotoblas_BARCELONA; + +#define VENDOR_INTEL 1 +#define VENDOR_AMD 2 +#define VENDOR_CENTAUR 3 +#define VENDOR_UNKNOWN 99 + +#define BITMASK(a, b, c) ((((a) >> (b)) & (c))) + +static int get_vendor(void){ + int eax, ebx, ecx, edx; + char vendor[13]; + + cpuid(0, &eax, &ebx, &ecx, &edx); + + *(int *)(&vendor[0]) = ebx; + *(int *)(&vendor[4]) = edx; + *(int *)(&vendor[8]) = ecx; + vendor[12] = (char)0; + + if (!strcmp(vendor, "GenuineIntel")) return VENDOR_INTEL; + if (!strcmp(vendor, "AuthenticAMD")) return VENDOR_AMD; + if (!strcmp(vendor, "CentaurHauls")) return VENDOR_CENTAUR; + + if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL; + + return VENDOR_UNKNOWN; +} + +static gotoblas_t *get_coretype(void){ + + int eax, ebx, ecx, edx; + int family, exfamily, model, vendor, exmodel; + + cpuid(1, &eax, &ebx, &ecx, &edx); + + family = BITMASK(eax, 8, 0x0f); + exfamily = BITMASK(eax, 20, 0xff); + model = BITMASK(eax, 4, 0x0f); + exmodel = BITMASK(eax, 16, 0x0f); + + vendor = get_vendor(); + + if (vendor == VENDOR_INTEL){ + switch (family) { + case 0x6: + switch (exmodel) { + case 0: + if 
(model <= 0x7) return &gotoblas_KATMAI; + if ((model == 0x8) || (model == 0xa) || (model == 0xb)) return &gotoblas_COPPERMINE; + if ((model == 0x9) || (model == 0xd)) return &gotoblas_BANIAS; + if (model == 14) return &gotoblas_BANIAS; + if (model == 15) return &gotoblas_CORE2; + return NULL; + + case 1: + if (model == 6) return &gotoblas_CORE2; + if (model == 7) return &gotoblas_PENRYN; + if (model == 13) return &gotoblas_DUNNINGTON; + if ((model == 10) || (model == 11) || (model == 14) || (model == 15)) return &gotoblas_NEHALEM; + if (model == 12) return &gotoblas_ATOM; + return NULL; + } + case 0xf: + if (model <= 0x2) return &gotoblas_NORTHWOOD; + return &gotoblas_PRESCOTT; + } + } + + if (vendor == VENDOR_AMD){ + if (family <= 0xe) return &gotoblas_ATHLON; + if (family == 0xf){ + if ((exfamily == 0) || (exfamily == 2)) { + if (ecx & (1 << 0)) return &gotoblas_OPTERON_SSE3; + else return &gotoblas_OPTERON; + } else { + return &gotoblas_BARCELONA; + } + } + } + + if (vendor == VENDOR_CENTAUR) { + switch (family) { + case 0x6: + return &gotoblas_NANO; + break; + } + } + + return NULL; +} + +static char *corename[] = { + "Unknown", + "Katmai", + "Coppermine", + "Northwood", + "Prescott", + "Banias", + "Atom", + "Core2", + "Penryn", + "Dunnington", + "Nehalem", + "Athlon", + "Opteron", + "Opteron(SSE3)", + "Barcelona", + "Nano", +}; + +char *gotoblas_corename(void) { + + if (gotoblas == &gotoblas_KATMAI) return corename[ 1]; + if (gotoblas == &gotoblas_COPPERMINE) return corename[ 2]; + if (gotoblas == &gotoblas_NORTHWOOD) return corename[ 3]; + if (gotoblas == &gotoblas_PRESCOTT) return corename[ 4]; + if (gotoblas == &gotoblas_BANIAS) return corename[ 5]; + if (gotoblas == &gotoblas_ATOM) return corename[ 6]; + if (gotoblas == &gotoblas_CORE2) return corename[ 7]; + if (gotoblas == &gotoblas_PENRYN) return corename[ 8]; + if (gotoblas == &gotoblas_DUNNINGTON) return corename[ 9]; + if (gotoblas == &gotoblas_NEHALEM) return corename[10]; + if (gotoblas == &gotoblas_ATHLON) return corename[11]; + if (gotoblas == &gotoblas_OPTERON_SSE3) return corename[12]; + if (gotoblas == &gotoblas_OPTERON) return corename[13]; + if (gotoblas == &gotoblas_BARCELONA) return corename[14]; + if (gotoblas == &gotoblas_NANO) return corename[15]; + + return corename[0]; +} + +void gotoblas_dynamic_init(void) { + + if (gotoblas) return; + + gotoblas = get_coretype(); + +#ifdef ARCH_X86 + if (gotoblas == NULL) gotoblas = gotoblas_KATMAI; +#else + if (gotoblas == NULL) gotoblas = gotoblas_PRESCOTT; +#endif + + if (gotoblas && gotoblas -> init) { + gotoblas -> init(); + } else { + fprintf(stderr, "GotoBLAS : Architecture Initialization failed. No initialization function found.\n"); + exit(1); + } + +} + +void gotoblas_dynamic_quit(void) { + + gotoblas = NULL; + +} diff --git a/driver/others/init.c b/driver/others/init.c new file mode 100644 index 0000000000..657e8dd3f1 --- /dev/null +++ b/driver/others/init.c @@ -0,0 +1,697 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +#if defined(OS_LINUX) && defined(SMP) + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include + +#define MAX_NODES 16 +#define MAX_CPUS 256 + +#define SH_MAGIC 0x510510 + +#define CPUMAP_NAME "/sys/devices/system/node/node%d/cpumap" +#define SHARE_NAME "/sys/devices/system/cpu/cpu%d/cache/index%d/shared_cpu_map" +#define NODE_DIR "/sys/devices/system/node" + +#undef DEBUG + +/* Private variables */ +typedef struct { + unsigned long lock; + unsigned int magic; + unsigned int shmid; + + int num_nodes; + int num_procs; + int final_num_procs; + unsigned long avail; + + unsigned long cpu_info [MAX_CPUS]; + unsigned long node_info [MAX_NODES]; + int cpu_use[MAX_CPUS]; + +} shm_t; + +static cpu_set_t cpu_orig_mask[4]; + +static int cpu_mapping[MAX_CPUS]; +static int node_mapping[MAX_CPUS * 4]; +static int cpu_sub_mapping[MAX_CPUS]; +static int disable_mapping; + +/* Number of cores per nodes */ +static int node_cpu[MAX_NODES]; +static int node_equal = 0; + +static shm_t *common = (void *)-1; +static int shmid, pshmid; +static void *paddr; + +static unsigned long lprocmask, lnodemask; +static int numprocs = 1; +static int numnodes = 1; + +#if 1 +#define READ_CPU(x) ( (x) & 0xff) +#define READ_NODE(x) (((x) >> 8) & 0xff) +#define READ_CORE(x) (((x) >> 16) & 0xff) + +#define WRITE_CPU(x) (x) +#define WRITE_NODE(x) ((x) << 8) +#define WRITE_CORE(x) ((x) << 16) +#else +#define READ_CPU(x) ( (x) & 0xff) +#define READ_CORE(x) (((x) >> 8) & 0xff) +#define READ_NODE(x) (((x) >> 16) & 0xff) + +#define WRITE_CPU(x) (x) +#define WRITE_CORE(x) ((x) << 8) +#define WRITE_NODE(x) ((x) << 16) +#endif + +static inline int popcount(unsigned long number) { + + int count = 0; + + while (number > 0) { + if (number & 1) count ++; + number >>= 1; + } + + return count; +} + +static inline int rcount(unsigned long number) { + + int count = -1; + + while ((number > 0) && ((number & 0)) == 0) { + count ++; + number >>= 1; + } + + return count; +} + +static inline unsigned long get_cpumap(int node) { + + int infile; + unsigned long affinity; + char 
name[160]; + char *p, *dummy; + + sprintf(name, CPUMAP_NAME, node); + + infile = open(name, O_RDONLY); + + affinity = 0; + + if (infile != -1) { + + read(infile, name, sizeof(name)); + + p = name; + + while ((*p == '0') || (*p == ',')) p++; + + affinity = strtol(p, &dummy, 16); + + close(infile); + } + + return affinity; +} + +static inline unsigned long get_share(int cpu, int level) { + + int infile; + unsigned long affinity; + char name[160]; + char *p; + + sprintf(name, SHARE_NAME, cpu, level); + + infile = open(name, O_RDONLY); + + affinity = (1UL << cpu); + + if (infile != -1) { + + read(infile, name, sizeof(name)); + + p = name; + + while ((*p == '0') || (*p == ',')) p++; + + affinity = strtol(p, &p, 16); + + close(infile); + } + + return affinity; +} + +static int numa_check(void) { + + DIR *dp; + struct dirent *dir; + int node; + + common -> num_nodes = 0; + + dp = opendir(NODE_DIR); + + if (dp == NULL) { + common -> num_nodes = 1; + return 0; + } + + for (node = 0; node < MAX_NODES; node ++) common -> node_info[node] = 0; + + while ((dir = readdir(dp)) != NULL) { + if (*(unsigned int *) dir -> d_name == 0x065646f6eU) { + + node = atoi(&dir -> d_name[4]); + + if (node > MAX_NODES) { + fprintf(stderr, "\nGotoBLAS Warining : MAX_NODES (NUMA) is too small. Terminated.\n"); + exit(1); + } + + common -> num_nodes ++; + common -> node_info[node] = get_cpumap(node); + + } + } + + closedir(dp); + + if (common -> num_nodes == 1) return 1; + +#ifdef DEBUG + fprintf(stderr, "Numa found : number of Nodes = %2d\n", common -> num_nodes); + + for (node = 0; node < common -> num_nodes; node ++) + fprintf(stderr, "MASK (%2d) : %08lx\n", node, common -> node_info[node]); +#endif + + return common -> num_nodes; +} + +static void numa_mapping(void) { + + int node, cpu, core; + int i, j, h; + unsigned long work, bit; + int count = 0; + + for (node = 0; node < common -> num_nodes; node ++) { + core = 0; + for (cpu = 0; cpu < common -> num_procs; cpu ++) { + if (common -> node_info[node] & common -> avail & (1UL << cpu)) { + common -> cpu_info[count] = WRITE_CORE(core) | WRITE_NODE(node) | WRITE_CPU(cpu); + count ++; + core ++; + } + + } + } + +#ifdef DEBUG + fprintf(stderr, "\nFrom /sys ...\n\n"); + + for (cpu = 0; cpu < count; cpu++) + fprintf(stderr, "CPU (%2d) : %08lx\n", cpu, common -> cpu_info[cpu]); +#endif + + h = 1; + + while (h < count) h = 2 * h + 1; + + while (h > 1) { + h /= 2; + for (i = h; i < count; i++) { + work = common -> cpu_info[i]; + bit = CPU_ISSET(i, &cpu_orig_mask[0]); + j = i - h; + while (work < common -> cpu_info[j]) { + common -> cpu_info[j + h] = common -> cpu_info[j]; + if (CPU_ISSET(j, &cpu_orig_mask[0])) { + CPU_SET(j + h, &cpu_orig_mask[0]); + } else { + CPU_CLR(j + h, &cpu_orig_mask[0]); + } + j -= h; + if (j < 0) break; + } + common -> cpu_info[j + h] = work; + if (bit) { + CPU_SET(j + h, &cpu_orig_mask[0]); + } else { + CPU_CLR(j + h, &cpu_orig_mask[0]); + } + + } + } + +#ifdef DEBUG + fprintf(stderr, "\nSorting ...\n\n"); + + for (cpu = 0; cpu < count; cpu++) + fprintf(stderr, "CPU (%2d) : %08lx\n", cpu, common -> cpu_info[cpu]); +#endif + +} + +static void disable_hyperthread(void) { + + unsigned long share; + int cpu; + + common -> avail = (1UL << common -> num_procs) - 1; + +#ifdef DEBUG + fprintf(stderr, "\nAvail CPUs : %04lx.\n", common -> avail); +#endif + + for (cpu = 0; cpu < common -> num_procs; cpu ++) { + + share = (get_share(cpu, 1) & common -> avail); + + if (popcount(share) > 1) { + +#ifdef DEBUG + fprintf(stderr, "Detected Hyper Threading on CPU %4x; 
disabled CPU %04lx.\n", + cpu, share & ~(1UL << cpu)); +#endif + + common -> avail &= ~((share & ~(1UL << cpu))); + } + } +} + +static void disable_affinity(void) { + +#ifdef DEBUG + fprintf(stderr, "Final all available CPUs : %04lx.\n\n", common -> avail); + fprintf(stderr, "CPU mask : %04lx.\n\n", *(unsigned long *)&cpu_orig_mask[0]); +#endif + + lprocmask = (1UL << common -> final_num_procs) - 1; + +#ifndef USE_OPENMP + lprocmask &= *(unsigned long *)&cpu_orig_mask[0]; +#endif + +#ifdef DEBUG + fprintf(stderr, "I choose these CPUs : %04lx.\n\n", lprocmask); +#endif + +} + +static void setup_mempolicy(void) { + + int cpu, mynode, maxcpu; + + for (cpu = 0; cpu < MAX_NODES; cpu ++) node_cpu[cpu] = 0; + + maxcpu = 0; + + for (cpu = 0; cpu < numprocs; cpu ++) { + mynode = READ_NODE(common -> cpu_info[cpu_sub_mapping[cpu]]); + + lnodemask |= (1UL << mynode); + + node_cpu[mynode] ++; + + if (maxcpu < node_cpu[mynode]) maxcpu = node_cpu[mynode]; + } + + node_equal = 1; + + for (cpu = 0; cpu < MAX_NODES; cpu ++) if ((node_cpu[cpu] != 0) && (node_cpu[cpu] != maxcpu)) node_equal = 0; + + if (lnodemask) { + +#ifdef DEBUG + fprintf(stderr, "Node mask = %lx\n", lnodemask); +#endif + + my_set_mempolicy(MPOL_INTERLEAVE, &lnodemask, sizeof(lnodemask) * 8); + + numnodes = popcount(lnodemask); + } +} + +static inline int is_dead(int id) { + + struct shmid_ds ds; + + return shmctl(id, IPC_STAT, &ds); +} +static void open_shmem(void) { + + int try = 0; + + do { + + shmid = shmget(SH_MAGIC, 4096, 0666); + + if (shmid == -1) { + shmid = shmget(SH_MAGIC, 4096, IPC_CREAT | 0666); + } + + try ++; + + } while ((try < 10) && (shmid == -1)); + + if (shmid == -1) { + fprintf(stderr, "GotoBLAS : Can't open shared memory. Terminated.\n"); + exit(1); + } + + if (shmid != -1) common = (shm_t *)shmat(shmid, NULL, 0); + +#ifdef DEBUG + fprintf(stderr, "Shared Memory id = %x Address = %p\n", shmid, common); +#endif + +} + +static void create_pshmem(void) { + + pshmid = shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0666); + + paddr = shmat(pshmid, NULL, 0); + + shmctl(pshmid, IPC_RMID, 0); + +#ifdef DEBUG + fprintf(stderr, "Private Shared Memory id = %x Address = %p\n", pshmid, paddr); +#endif +} + +static void local_cpu_map(void) { + + int cpu, id, mapping; + + cpu = 0; + mapping = 0; + + do { + id = common -> cpu_use[cpu]; + + if (id > 0) { + if (is_dead(id)) common -> cpu_use[cpu] = 0; + } + + if ((common -> cpu_use[cpu] == 0) && (lprocmask & (1UL << cpu))) { + + common -> cpu_use[cpu] = pshmid; + cpu_mapping[mapping] = READ_CPU(common -> cpu_info[cpu]); + cpu_sub_mapping[mapping] = cpu; + + mapping ++; + } + + cpu ++; + + } while ((mapping < numprocs) && (cpu < common -> final_num_procs)); + + disable_mapping = 0; + + if ((mapping < numprocs) || (numprocs == 1)) { + for (cpu = 0; cpu < common -> final_num_procs; cpu ++) { + if (common -> cpu_use[cpu] == pshmid) common -> cpu_use[cpu] = 0; + } + disable_mapping = 1; + } + +#ifdef DEBUG + for (cpu = 0; cpu < numprocs; cpu ++) { + fprintf(stderr, "Local Mapping : %2d --> %2d (%2d)\n", cpu, cpu_mapping[cpu], cpu_sub_mapping[cpu]); + } +#endif +} + +/* Public Functions */ + +int get_num_procs(void) { return numprocs; } +int get_num_nodes(void) { return numnodes; } +int get_node_equal(void) { + + return (((blas_cpu_number % numnodes) == 0) && node_equal); + +} + +int gotoblas_set_affinity(int pos) { + + cpu_set_t cpu_mask; + + int mynode = 1; + + /* if number of threads is larger than inital condition */ + if (pos < 0) { + sched_setaffinity(0, sizeof(cpu_orig_mask), 
&cpu_orig_mask[0]); + return 0; + } + + if (!disable_mapping) { + + mynode = READ_NODE(common -> cpu_info[cpu_sub_mapping[pos]]); + +#ifdef DEBUG + fprintf(stderr, "Giving Affinity[%4d %3d] --> %3d My node = %3d\n", getpid(), pos, cpu_mapping[pos], mynode); +#endif + + CPU_ZERO(&cpu_mask); + CPU_SET (cpu_mapping[pos], &cpu_mask); + + sched_setaffinity(0, sizeof(cpu_mask), &cpu_mask); + + node_mapping[WhereAmI()] = mynode; + + } + + return mynode; +} + +int get_node(void) { + + if (!disable_mapping) return node_mapping[WhereAmI()]; + + return 1; +} + +static int initialized = 0; + +void gotoblas_affinity_init(void) { + + int cpu, num_avail; +#ifndef USE_OPENMP + cpu_set_t cpu_mask; +#endif + + if (initialized) return; + + initialized = 1; + + sched_getaffinity(0, sizeof(cpu_orig_mask), &cpu_orig_mask[0]); + +#ifdef USE_OPENMP + numprocs = 0; +#else + numprocs = readenv("GOTO_NUM_THREADS"); +#endif + + if (numprocs == 0) numprocs = readenv("OMP_NUM_THREADS"); + + numnodes = 1; + + if (numprocs == 1) { + disable_mapping = 1; + return; + } + + create_pshmem(); + + open_shmem(); + + while ((common -> lock) && (common -> magic != SH_MAGIC)) { + if (is_dead(common -> shmid)) { + common -> lock = 0; + common -> shmid = 0; + common -> magic = 0; + } else { + sched_yield(); + } + } + + blas_lock(&common -> lock); + + if ((common -> shmid) && is_dead(common -> shmid)) common -> magic = 0; + + common -> shmid = pshmid; + + if (common -> magic != SH_MAGIC) { + +#ifdef DEBUG + fprintf(stderr, "Shared Memory Initialization.\n"); +#endif + + common -> num_procs = get_nprocs(); + + for (cpu = 0; cpu < common -> num_procs; cpu++) common -> cpu_info[cpu] = cpu; + + numa_check(); + + disable_hyperthread(); + + if (common -> num_nodes > 1) numa_mapping(); + + common -> final_num_procs = popcount(common -> avail); + + for (cpu = 0; cpu < common -> final_num_procs; cpu ++) common -> cpu_use[cpu] = 0; + + common -> magic = SH_MAGIC; + + } + + disable_affinity(); + + num_avail = popcount(lprocmask); + + if ((numprocs <= 0) || (numprocs > num_avail)) numprocs = num_avail; + +#ifdef DEBUG + fprintf(stderr, "Number of threads = %d\n", numprocs); +#endif + + local_cpu_map(); + + blas_unlock(&common -> lock); + +#ifndef USE_OPENMP + if (!disable_mapping) { + +#ifdef DEBUG + fprintf(stderr, "Giving Affinity[%3d] --> %3d\n", 0, cpu_mapping[0]); +#endif + + CPU_ZERO(&cpu_mask); + CPU_SET (cpu_mapping[0], &cpu_mask); + + sched_setaffinity(0, sizeof(cpu_mask), &cpu_mask); + + node_mapping[WhereAmI()] = READ_NODE(common -> cpu_info[cpu_sub_mapping[0]]); + + setup_mempolicy(); + + if (readenv("GOTOBLAS_MAIN_FREE")) { + sched_setaffinity(0, sizeof(cpu_orig_mask), &cpu_orig_mask[0]); + } + + } +#endif + +#ifdef DEBUG + fprintf(stderr, "Initialization is done.\n"); +#endif +} + +void gotoblas_affinity_quit(void) { + + int i; + struct shmid_ds ds; + +#ifdef DEBUG + fprintf(stderr, "Terminating ..\n"); +#endif + + if ((numprocs == 1) || (initialized == 0)) return; + + if (!disable_mapping) { + + blas_lock(&common -> lock); + + for (i = 0; i < numprocs; i ++) common -> cpu_use[cpu_mapping[i]] = -1; + + blas_unlock(&common -> lock); + + } + + shmctl(shmid, IPC_STAT, &ds); + + if (ds.shm_nattch == 1) shmctl(shmid, IPC_RMID, 0); + + shmdt(common); + + shmdt(paddr); + + initialized = 0; +} + +#else + +void gotoblas_affinity_init(void) {}; + +void gotoblas_set_affinity(int threads) {}; + +void gotoblas_set_affinity2(int threads) {}; + +void gotoblas_affinity_reschedule(void) {}; + +int get_num_procs(void) { return get_nprocs(); } + +int 
get_num_nodes(void) { return 1; } + +int get_node(void) { return 1;} +#endif + + diff --git a/driver/others/lamc3.c b/driver/others/lamc3.c new file mode 100644 index 0000000000..439ef6e3dc --- /dev/null +++ b/driver/others/lamc3.c @@ -0,0 +1,50 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +#ifdef NEED_F2CCONV +double +#else +FLOAT +#endif +NAME(FLOAT *a, FLOAT *b){ + + return *a + *b; + +} diff --git a/driver/others/lamch.c b/driver/others/lamch.c new file mode 100644 index 0000000000..b04450024e --- /dev/null +++ b/driver/others/lamch.c @@ -0,0 +1,200 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +#if 0 +static FLOAT hdata[] __attribute__((aligned(128))) = { +#ifdef XDOUBLE + +0x1.0000000000000000P-00064L, + +0x1.0000000000000000P-16382L, + +0x1.0000000000000000P+00001L, + +0x1.0000000000000000P-00063L, + +0x1.0000000000000000P+00006L, + +0x1.0000000000000000P+00000L, + -0x1.ffe8000000000000P+00013L, + +0x1.0000000000000000P-16382L, + +0x1.0004000000000000P+00014L, + +0x1.fffffffffffffffeP+16383L, +#elif defined DOUBLE + +0x1.0000000000000P-0053, + +0x1.0000000000000P-1022, + +0x1.0000000000000P+0001, + +0x1.0000000000000P-0052, + +0x1.a800000000000P+0005, + +0x1.0000000000000P+0000, + -0x1.fe80000000000P+0009, + +0x1.0000000000000P-1022, + +0x1.0000000000000P+0010, + +0x1.fffffffffffffP+1023, +#else + +0x1.000000P-024f, + +0x1.000000P-126f, + +0x1.000000P+001f, + +0x1.000000P-023f, + +0x1.800000P+004f, + +0x1.000000P+000f, + -0x1.f40000P+006f, + +0x1.000000P-126f, + +0x1.000000P+007f, + +0x1.fffffeP+127f, +#endif +}; + +#endif + +static unsigned int idata[] __attribute__((aligned(128))) = { + +#if defined XDOUBLE +#ifndef __BIG_ENDIAN__ + 0x00000000, 0x80000000, 0x00003fbf, 0x00000000, + 0x00000000, 0x80000000, 0x00000001, 0x00000000, + 0x00000000, 0x80000000, 0x00004000, 0x00000000, + 0x00000000, 0x80000000, 0x00003fc0, 0x00000000, + 0x00000000, 0x80000000, 0x00004005, 0x00000000, + 0x00000000, 0x80000000, 0x00003fff, 0x00000000, + 0x00000000, 0xff400000, 0x0000c00c, 0x00000000, + 0x00000000, 0x80000000, 0x00000001, 0x00000000, + 0x00000000, 0x80200000, 0x0000400d, 0x00000000, + 0xffffffff, 0xffffffff, 0x00007ffe, 0x00000000, +#else + 0x00000000, 0x00003fbf, 0x80000000, 0x00000000, + 0x00000000, 0x00000001, 0x80000000, 0x00000000, + 0x00000000, 0x00004000, 0x80000000, 0x00000000, + 0x00000000, 0x00003fc0, 0x80000000, 0x00000000, + 0x00000000, 0x00004005, 0x80000000, 0x00000000, + 0x00000000, 0x00003fff, 0x80000000, 0x00000000, + 0x00000000, 0x0000c00c, 0xff400000, 0x00000000, + 0x00000000, 0x00000001, 0x80000000, 0x00000000, + 0x00000000, 0x0000400d, 0x80200000, 0x00000000, + 0x00000000, 0x00007ffe, 0xffffffff, 0xffffffff, + +#endif +#elif defined DOUBLE +#ifndef __BIG_ENDIAN__ + 0x00000000, 0x3ca00000, + 0x00000000, 0x00100000, + 0x00000000, 0x40000000, + 0x00000000, 0x3cb00000, + 0x00000000, 0x404a8000, + 0x00000000, 0x3ff00000, + 0x00000000, 0xc08fe800, + 0x00000000, 0x00100000, + 0x00000000, 0x40900000, + 0xffffffff, 0x7fefffff, +#else + 0x3ca00000, 0x00000000, + 0x00100000, 0x00000000, + 0x40000000, 0x00000000, + 0x3cb00000, 0x00000000, + 0x404a8000, 0x00000000, + 0x3ff00000, 0x00000000, + 0xc08fe800, 0x00000000, + 0x00100000, 0x00000000, 
+ 0x40900000, 0x00000000, + 0x7fefffff, 0xffffffff, +#endif +#else + + 0x33800000, + 0x00800000, + 0x40000000, + 0x34000000, + 0x41c00000, + 0x3f800000, + 0xc2fa0000, + 0x00800000, + 0x43000000, + 0x7f7fffff, + +#endif +}; + + +#ifdef NEED_F2CCONV +double +#else +FLOAT +#endif +NAME(char *P){ + + char p = *P; + int pos; + FLOAT *hdata = (FLOAT *)idata; + + TOUPPER(p); + + switch (p) { + case 'E': + pos = 0; + break; + case 'S': + pos = 1; + break; + case 'B': + pos = 2; + break; + case 'P': + pos = 3; + break; + case 'N': + pos = 4; + break; + case 'R': + pos = 5; + break; + case 'M': + pos = 6; + break; + case 'U': + pos = 7; + break; + case 'L': + pos = 8; + break; + case 'O': + pos = 9; + break; + default: + pos = 0; + break; + } + + return hdata[pos]; + +} diff --git a/driver/others/lsame.c b/driver/others/lsame.c new file mode 100644 index 0000000000..cae8b4ae3d --- /dev/null +++ b/driver/others/lsame.c @@ -0,0 +1,50 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include + +int NAME(char *A, char *B){ + + char a = *A; + char b = *B; + + if (a > 96) a -= 32; + if (b > 96) b -= 32; + + return (a == b); +} diff --git a/driver/others/memory.c b/driver/others/memory.c new file mode 100644 index 0000000000..1983931824 --- /dev/null +++ b/driver/others/memory.c @@ -0,0 +1,1257 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#undef DEBUG + +#include "common.h" + +#ifdef OS_WINDOWS +#define ALLOC_WINDOWS +#ifndef MEM_LARGE_PAGES +#define MEM_LARGE_PAGES 0x20000000 +#endif +#else +#define ALLOC_MMAP +#define ALLOC_MALLOC +#endif + +#include +#include +#include + +#ifndef OS_WINDOWS +#include +#include +#include +#endif + +#include + +#ifdef OS_LINUX +#include +#include +#include +#include +#include +#endif + +#if defined(OS_FreeBSD) || defined(OS_Darwin) +#include +#endif + +#if defined(OS_WINDOWS) && (defined(__MINGW32__) || defined(__MINGW64__)) +#include +#undef printf +#define printf _cprintf +#endif + +#ifdef OS_LINUX + +#ifndef MPOL_PREFERRED +#define MPOL_PREFERRED 1 +#endif + +#endif + +#if (defined(PPC440) || !defined(OS_LINUX) || defined(HPL)) && !defined(NO_WARMUP) +#define NO_WARMUP +#endif + +#ifdef ALLOC_HUGETLB +#define SHM_HUGETLB 04000 +#endif + +#ifndef FIXED_PAGESIZE +#define FIXED_PAGESIZE 4096 +#endif + +#define BITMASK(a, b, c) ((((a) >> (b)) & (c))) + +#define CONSTRUCTOR __attribute__ ((constructor)) +#define DESTRUCTOR __attribute__ ((destructor)) + +#ifdef DYNAMIC_ARCH +gotoblas_t *gotoblas = NULL; +#endif + +#ifndef SMP + +#define blas_cpu_number 1 +#define blas_num_threads 1 + +/* Dummy Function */ +int goto_get_num_procs (void) { return 1;}; +void goto_set_num_threads(int num_threads) {}; + +#else + +#ifdef OS_LINUX +#ifndef NO_AFFINITY +int get_num_procs(void); +#else +int get_num_procs(void) { + static int nums = 0; + if (!nums) nums = get_nprocs(); + return nums; +} +#endif +#endif + +#ifdef OS_WINDOWS + +int get_num_procs(void) { + + static int nums = 0; + + if (nums == 0) { + + SYSTEM_INFO sysinfo; + + GetSystemInfo(&sysinfo); + + nums = sysinfo.dwNumberOfProcessors; + } + + return nums; +} + +#endif + +#if defined(OS_FreeBSD) || defined(OS_Darwin) + +int get_num_procs(void) { + + static int nums = 0; + + int m[2]; + size_t len; + + if (nums == 0) { + m[0] = CTL_HW; + m[1] = HW_NCPU; + len = sizeof(int); + sysctl(m, 
2, &nums, &len, NULL, 0); + } + + return nums; +} + +#endif + +int blas_cpu_number = 0; +int blas_num_threads = 0; + +int goto_get_num_procs (void) { + return blas_cpu_number; +} + +int blas_get_cpu_number(void){ + char *p; +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FreeBSD) || defined(OS_Darwin) + int max_num; +#endif + int blas_goto_num = 0; + int blas_omp_num = 0; + + if (blas_num_threads) return blas_num_threads; + +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FreeBSD) || defined(OS_Darwin) + max_num = get_num_procs(); +#endif + + blas_goto_num = 0; +#ifndef USE_OPENMP + p = getenv("GOTO_NUM_THREADS"); + if (p) blas_goto_num = atoi(p); + if (blas_goto_num < 0) blas_goto_num = 0; +#endif + + blas_omp_num = 0; + p = getenv("OMP_NUM_THREADS"); + if (p) blas_omp_num = atoi(p); + if (blas_omp_num < 0) blas_omp_num = 0; + + if (blas_goto_num > 0) blas_num_threads = blas_goto_num; + else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; + else blas_num_threads = MAX_CPU_NUMBER; + +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FreeBSD) || defined(OS_Darwin) + if (blas_num_threads > max_num) blas_num_threads = max_num; +#endif + + if (blas_num_threads > MAX_CPU_NUMBER) blas_num_threads = MAX_CPU_NUMBER; + +#ifdef DEBUG + printf( "Adjusted number of threads : %3d\n", blas_num_threads); +#endif + + blas_cpu_number = blas_num_threads; + + return blas_num_threads; +} +#endif + +struct release_t { + void *address; + void (*func)(struct release_t *); + long attr; +}; + +int hugetlb_allocated = 0; + +static struct release_t release_info[NUM_BUFFERS]; +static int release_pos = 0; + +#if defined(OS_LINUX) && !defined(NO_WARMUP) +static int hot_alloc = 0; +#endif + +#ifdef ALLOC_MMAP + +static void alloc_mmap_free(struct release_t *release){ + + if (munmap(release -> address, BUFFER_SIZE)) { + printf("GotoBLAS : munmap failed\n"); + } +} + +#ifdef NO_WARMUP + +static void *alloc_mmap(void *address){ + void *map_address; + + if (address){ + map_address = mmap(address, + BUFFER_SIZE, + MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0); + } else { + map_address = mmap(address, + BUFFER_SIZE, + MMAP_ACCESS, MMAP_POLICY, -1, 0); + } + + if (map_address != (void *)-1) { + release_info[release_pos].address = map_address; + release_info[release_pos].func = alloc_mmap_free; + release_pos ++; + } + +#ifdef OS_LINUX + my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); +#endif + + return map_address; +} + +#else + +#define BENCH_ITERATION 4 +#define SCALING 2 + +static inline BLASULONG run_bench(BLASULONG address, long size) { + + BLASULONG original, *p; + BLASULONG start, stop, min; + int iter, i, count; + + min = (BLASULONG)-1; + + original = *(BLASULONG *)(address + size - PAGESIZE); + + *(BLASULONG *)(address + size - PAGESIZE) = (BLASULONG)address; + + for (iter = 0; iter < BENCH_ITERATION; iter ++ ) { + + p = (BLASULONG *)address; + + count = size / PAGESIZE; + + start = rpcc(); + + for (i = 0; i < count; i ++) { + p = (BLASULONG *)(*p); + } + + stop = rpcc(); + + if (min > stop - start) min = stop - start; + } + + *(BLASULONG *)(address + size - PAGESIZE + 0) = original; + *(BLASULONG *)(address + size - PAGESIZE + 8) = (BLASULONG)p; + + return min; +} + +static void *alloc_mmap(void *address){ + void *map_address, *best_address; + BLASULONG best, start, current; + BLASULONG allocsize; + + if (address){ + /* Just give up use advanced operation */ + map_address = mmap(address, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0); + +#ifdef OS_LINUX 
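+    /* Editorial sketch of what the call below relies on: my_mbind() is
+       used throughout this file as a thin wrapper around the Linux
+       mbind() system call, and MPOL_PREFERRED with an empty node mask
+       asks the kernel to place the buffer's pages on the node of the
+       CPU that first touches them.  The warm-up path below builds on
+       this: run_bench() (defined above) links one pointer per page,
+       times a pointer-chasing walk over every BUFFER_SIZE window inside
+       the SCALING * BUFFER_SIZE over-allocation, and only the fastest
+       window is kept while the remainder is unmapped. */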
+ my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); +#endif + + } else { +#if defined(OS_LINUX) && !defined(NO_WARMUP) + if (hot_alloc == 0) { + map_address = mmap(NULL, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY, -1, 0); + +#ifdef OS_LINUX + my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); +#endif + + } else { +#endif + + map_address = mmap(NULL, BUFFER_SIZE * SCALING, + MMAP_ACCESS, MMAP_POLICY, -1, 0); + + if (map_address != (void *)-1) { + +#ifdef OS_LINUX + my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0); +#endif + + allocsize = DGEMM_P * DGEMM_Q * sizeof(double); + + start = (BLASULONG)map_address; + current = (SCALING - 1) * BUFFER_SIZE; + + while(current > 0) { + *(long *)start = (long)start + PAGESIZE; + start += PAGESIZE; + current -= PAGESIZE; + } + + *(long *)(start - PAGESIZE) = (BLASULONG)map_address; + + start = (BLASULONG)map_address; + + best = (BLASULONG)-1; + best_address = map_address; + + while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) * BUFFER_SIZE)) { + + current = run_bench(start, allocsize); + + if (best > current) { + best = current; + best_address = (void *)start; + } + + start += PAGESIZE; + + } + + if ((BLASULONG)best_address > (BLASULONG)map_address) + munmap(map_address, (BLASULONG)best_address - (BLASULONG)map_address); + + munmap((void *)((BLASULONG)best_address + BUFFER_SIZE), (SCALING - 1) * BUFFER_SIZE + (BLASULONG)map_address - (BLASULONG)best_address); + + map_address = best_address; + +#if defined(OS_LINUX) && !defined(NO_WARMUP) + hot_alloc = 2; +#endif + } + } +#if defined(OS_LINUX) && !defined(NO_WARMUP) + } +#endif + + if (map_address != (void *)-1) { + release_info[release_pos].address = map_address; + release_info[release_pos].func = alloc_mmap_free; + release_pos ++; + } + + return map_address; +} + +#endif + +#endif + + +#ifdef ALLOC_MALLOC + +static void alloc_malloc_free(struct release_t *release){ + + free(release -> address); + +} + +static void *alloc_malloc(void *address){ + + void *map_address; + + map_address = (void *)malloc(BUFFER_SIZE + FIXED_PAGESIZE); + + if (map_address == (void *)NULL) map_address = (void *)-1; + + if (map_address != (void *)-1) { + release_info[release_pos].address = map_address; + release_info[release_pos].func = alloc_malloc_free; + release_pos ++; + } + + return map_address; + +} + +#endif + +#ifdef ALLOC_QALLOC + +void *qalloc(int flags, size_t bytes); +void *qfree (void *address); + +#define QNONCACHE 0x1 +#define QCOMMS 0x2 +#define QFAST 0x4 + +static void alloc_qalloc_free(struct release_t *release){ + + qfree(release -> address); + +} + +static void *alloc_qalloc(void *address){ + void *map_address; + + map_address = (void *)qalloc(QCOMMS | QFAST, BUFFER_SIZE + FIXED_PAGESIZE); + + if (map_address == (void *)NULL) map_address = (void *)-1; + + if (map_address != (void *)-1) { + release_info[release_pos].address = map_address; + release_info[release_pos].func = alloc_qalloc_free; + release_pos ++; + } + + return (void *)(((BLASULONG)map_address + FIXED_PAGESIZE - 1) & ~(FIXED_PAGESIZE - 1)); +} + +#endif + +#ifdef ALLOC_WINDOWS + +static void alloc_windows_free(struct release_t *release){ + + VirtualFree(release -> address, BUFFER_SIZE, MEM_DECOMMIT); + +} + +static void *alloc_windows(void *address){ + void *map_address; + + map_address = VirtualAlloc(address, + BUFFER_SIZE, + MEM_RESERVE | MEM_COMMIT, + PAGE_READWRITE); + + if (map_address == (void *)NULL) map_address = (void *)-1; + + if (map_address != (void *)-1) { + 
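+  /* Same bookkeeping as the other allocators: record the mapping in
+     release_info[] together with its matching free routine
+     (alloc_windows_free, i.e. VirtualFree) so that blas_shutdown() can
+     later release every buffer without knowing which backend created it. */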
release_info[release_pos].address = map_address; + release_info[release_pos].func = alloc_windows_free; + release_pos ++; + } + + return map_address; +} + +#endif + +#ifdef ALLOC_DEVICEDRIVER +#ifndef DEVICEDRIVER_NAME +#define DEVICEDRIVER_NAME "/dev/mapper" +#endif + +static void alloc_devicedirver_free(struct release_t *release){ + + if (munmap(release -> address, BUFFER_SIZE)) { + printf("GotoBLAS : Bugphysarea unmap failed.\n"); + } + + if (close(release -> attr)) { + printf("GotoBLAS : Bugphysarea close failed.\n"); + } + +} + +static void *alloc_devicedirver(void *address){ + + int fd; + void *map_address; + + if ((fd = open(DEVICEDRIVER_NAME, O_RDWR | O_SYNC)) < 0) { + + return (void *)-1; + + } + + map_address = mmap(address, BUFFER_SIZE, + PROT_READ | PROT_WRITE, + MAP_FILE | MAP_SHARED, + fd, 0); + + if (map_address != (void *)-1) { + release_info[release_pos].address = map_address; + release_info[release_pos].attr = fd; + release_info[release_pos].func = alloc_devicedirver_free; + release_pos ++; + } + + return map_address; +} + +#endif + +#ifdef ALLOC_SHM + +static void alloc_shm_free(struct release_t *release){ + + if (shmdt(release -> address)) { + printf("GotoBLAS : Shared memory unmap failed.\n"); + } +} + +static void *alloc_shm(void *address){ + void *map_address; + int shmid; + + shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,IPC_CREAT | 0600); + + map_address = (void *)shmat(shmid, address, 0); + + if (map_address != (void *)-1){ + +#ifdef OS_LINUX + my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); +#endif + + shmctl(shmid, IPC_RMID, 0); + + release_info[release_pos].address = map_address; + release_info[release_pos].attr = shmid; + release_info[release_pos].func = alloc_shm_free; + release_pos ++; + } + + return map_address; +} + +#endif + +#if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS + +static void alloc_hugetlb_free(struct release_t *release){ + +#if defined(OS_LINUX) || defined(OS_AIX) + if (shmdt(release -> address)) { + printf("GotoBLAS : Hugepage unmap failed.\n"); + } +#endif + +#ifdef __sun__ + + munmap(release -> address, BUFFER_SIZE); + +#endif + +#ifdef OS_WINDOWS + + VirtualFree(release -> address, BUFFER_SIZE, MEM_LARGE_PAGES | MEM_DECOMMIT); + +#endif + +} + +static void *alloc_hugetlb(void *address){ + + void *map_address = (void *)-1; + +#if defined(OS_LINUX) || defined(OS_AIX) + int shmid; + + shmid = shmget(IPC_PRIVATE, BUFFER_SIZE, +#ifdef OS_LINUX + SHM_HUGETLB | +#endif +#ifdef OS_AIX + SHM_LGPAGE | SHM_PIN | +#endif + IPC_CREAT | SHM_R | SHM_W); + + if (shmid != -1) { + map_address = (void *)shmat(shmid, address, SHM_RND); + +#ifdef OS_LINUX + my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0); +#endif + + if (map_address != (void *)-1){ + shmctl(shmid, IPC_RMID, 0); + } + } +#endif + +#ifdef __sun__ + struct memcntl_mha mha; + + mha.mha_cmd = MHA_MAPSIZE_BSSBRK; + mha.mha_flags = 0; + mha.mha_pagesize = HUGE_PAGESIZE; + memcntl(NULL, 0, MC_HAT_ADVISE, (char *)&mha, 0, 0); + + map_address = (BLASULONG)memalign(HUGE_PAGESIZE, BUFFER_SIZE); +#endif + +#ifdef OS_WINDOWS + + HANDLE hToken; + TOKEN_PRIVILEGES tp; + + if (OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES, &hToken) != TRUE) return (void *) -1; + + tp.PrivilegeCount = 1; + tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; + + if (LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &tp.Privileges[0].Luid) != TRUE) return (void *) -1; + + if (AdjustTokenPrivileges(hToken, FALSE, (PTOKEN_PRIVILEGES)&tp, 0, NULL, NULL) != TRUE) 
return (void *) -1; + + map_address = (void *)VirtualAlloc(address, + BUFFER_SIZE, + MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT, + PAGE_READWRITE); + + AdjustTokenPrivileges(hToken, TRUE, &tp, 0, (PTOKEN_PRIVILEGES)NULL, NULL); + + if (map_address == (void *)NULL) map_address = (void *)-1; + +#endif + + if (map_address != (void *)-1){ + release_info[release_pos].address = map_address; + release_info[release_pos].func = alloc_hugetlb_free; + release_pos ++; + } + + return map_address; +} +#endif + +#ifdef ALLOC_HUGETLBFILE + +static int hugetlb_pid = 0; + +static void alloc_hugetlbfile_free(struct release_t *release){ + + if (munmap(release -> address, BUFFER_SIZE)) { + printf("GotoBLAS : HugeTLBfs unmap failed.\n"); + } + + if (close(release -> attr)) { + printf("GotoBLAS : HugeTLBfs close failed.\n"); + } +} + +static void *alloc_hugetlbfile(void *address){ + + void *map_address = (void *)-1; + int fd; + char filename[64]; + + if (!hugetlb_pid) hugetlb_pid = getpid(); + + sprintf(filename, "%s/gotoblas.%d", HUGETLB_FILE_NAME, hugetlb_pid); + + if ((fd = open(filename, O_RDWR | O_CREAT, 0700)) < 0) { + return (void *)-1; + } + + unlink(filename); + + map_address = mmap(address, BUFFER_SIZE, + PROT_READ | PROT_WRITE, + MAP_SHARED, + fd, 0); + + if (map_address != (void *)-1) { + release_info[release_pos].address = map_address; + release_info[release_pos].attr = fd; + release_info[release_pos].func = alloc_hugetlbfile_free; + release_pos ++; + } + + return map_address; +} +#endif + +/* Global lock for memory allocation */ + +#if defined(USE_PTHREAD_LOCK) +static pthread_mutex_t alloc_lock = PTHREAD_MUTEX_INITIALIZER; +#elif defined(USE_PTHREAD_SPINLOCK) +static pthread_spinlock_t alloc_lock = 0; +#else +static BLASULONG alloc_lock = 0UL; +#endif + +#ifdef SEEK_ADDRESS +static BLASULONG base_address = 0UL; +#else +static BLASULONG base_address = BASE_ADDRESS; +#endif + +static volatile struct { + BLASULONG lock; + void *addr; +#if defined(WHEREAMI) && !defined(USE_OPENMP) + int pos; +#endif + int used; +#ifndef __64BIT__ + char dummy[48]; +#else + char dummy[40]; +#endif + +} memory[NUM_BUFFERS]; + +static int memory_initialized = 0; +static void gotoblas_memory_init(void); + +/* Memory allocation routine */ +/* procpos ... 
indicates where it comes from */ +/* 0 : Level 3 functions */ +/* 1 : Level 2 functions */ +/* 2 : Thread */ + +void *blas_memory_alloc(int procpos){ + + int position; +#if defined(WHEREAMI) && !defined(USE_OPENMP) + int mypos; +#endif + + void *map_address; + + void *(*memoryalloc[])(void *address) = { +#ifdef ALLOC_DEVICEDRIVER + alloc_devicedirver, +#endif +#if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS + alloc_hugetlb, +#endif +#ifdef ALLOC_SHM + alloc_shm, +#endif +#ifdef ALLOC_MMAP + alloc_mmap, +#endif +#ifdef ALLOC_QALLOC + alloc_qalloc, +#endif +#ifdef ALLOC_WINDOWS + alloc_windows, +#endif +#ifdef ALLOC_MALLOC + alloc_malloc, +#endif + NULL, + }; + void *(**func)(void *address); + + if (!memory_initialized) { + + LOCK_COMMAND(&alloc_lock); + + if (!memory_initialized) { + +#if defined(WHEREAMI) && !defined(USE_OPENMP) + for (position = 0; position < NUM_BUFFERS; position ++){ + memory[position].addr = (void *)0; + memory[position].pos = -1; + memory[position].used = 0; + memory[position].lock = 0; + } +#endif + +#ifdef DYNAMIC_ARCH + gotoblas_dynamic_init(); +#endif + +#if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY) + gotoblas_affinity_init(); +#endif + +#ifdef SMP + if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number(); +#endif + +#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) +#ifndef DYNAMIC_ARCH + blas_set_parameter(); +#endif +#endif + + memory_initialized = 1; + } + + UNLOCK_COMMAND(&alloc_lock); + } + +#ifdef DEBUG + printf("Alloc Start ...\n"); +#endif + +#if defined(WHEREAMI) && !defined(USE_OPENMP) + + mypos = WhereAmI(); + + position = mypos; + while (position > NUM_BUFFERS) position >>= 1; + + do { + if (!memory[position].used && (memory[position].pos == mypos)) { + + blas_lock(&memory[position].lock); + + if (!memory[position].used) goto allocation; + + blas_unlock(&memory[position].lock); + } + + position ++; + + } while (position < NUM_BUFFERS); + + +#endif + + position = 0; + + do { + if (!memory[position].used) { + + blas_lock(&memory[position].lock); + + if (!memory[position].used) goto allocation; + + blas_unlock(&memory[position].lock); + } + + position ++; + + } while (position < NUM_BUFFERS); + + goto error; + + allocation : + +#ifdef DEBUG + printf(" Position -> %d\n", position); +#endif + + memory[position].used = 1; + + blas_unlock(&memory[position].lock); + + if (!memory[position].addr) { + do { +#ifdef DEBUG + printf("Allocation Start : %lx\n", base_address); +#endif + + map_address = (void *)-1; + + func = &memoryalloc[0]; + + while ((func != NULL) && (map_address == (void *) -1)) { + + map_address = (*func)((void *)base_address); + +#ifdef ALLOC_DEVICEDRIVER + if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) { + fprintf(stderr, "GotoBLAS Warning ... Physically contigous allocation was failed.\n"); + } +#endif + +#ifdef ALLOC_HUGETLBFILE + if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) { +#ifndef OS_WINDOWS + fprintf(stderr, "GotoBLAS Warning ... 
HugeTLB(File) allocation was failed.\n"); +#endif + } +#endif + +#if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS + if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1; +#endif + + func ++; + } + +#ifdef DEBUG + printf(" Success -> %08lx\n", map_address); +#endif + if (((BLASLONG) map_address) == -1) base_address = 0UL; + + if (base_address) base_address += BUFFER_SIZE + FIXED_PAGESIZE; + + } while ((BLASLONG)map_address == -1); + + memory[position].addr = map_address; + +#ifdef DEBUG + printf(" Mapping Succeeded. %p(%d)\n", (void *)alloc_area[position], position); +#endif + } + +#if defined(WHEREAMI) && !defined(USE_OPENMP) + + if (memory[position].pos == -1) memory[position].pos = mypos; + +#endif + +#ifdef DYNAMIC_ARCH + + if (memory_initialized == 1) { + + LOCK_COMMAND(&alloc_lock); + + if (memory_initialized == 1) { + + if (!gotoblas) gotoblas_dynamic_init(); + + memory_initialized = 2; + } + + UNLOCK_COMMAND(&alloc_lock); + + } +#endif + + +#ifdef DEBUG + printf("Mapped : %p %3d\n\n", + (void *)alloc_area[position], position); +#endif + + return (void *)memory[position].addr; + + error: + printf("BLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n"); + + return NULL; +} + +void blas_memory_free(void *free_area){ + + int position; + +#ifdef DEBUG + printf("Unmapped Start : %p ...\n", free_area); +#endif + + position = 0; + + while ((memory[position].addr != free_area) + && (position < NUM_BUFFERS)) position++; + + if (memory[position].addr != free_area) goto error; + +#ifdef DEBUG + printf(" Position : %d\n", position); +#endif + + memory[position].used = 0; + +#ifdef DEBUG + printf("Unmap Succeeded.\n\n"); +#endif + + return; + + error: + printf("BLAS : Bad memory unallocation! 
: %4d %p\n", position, free_area); + +#ifdef DEBUG + for (position = 0; position < NUM_BUFFERS; position++) + printf("%4ld %p : %d\n", position, alloc_area[position], alloc_used[position]); +#endif + + return; +} + +void blas_shutdown(void){ + + int pos; + +#ifdef SMP + BLASFUNC(blas_thread_shutdown)(); +#endif + + LOCK_COMMAND(&alloc_lock); + + for (pos = 0; pos < release_pos; pos ++) { + release_info[pos].func(&release_info[pos]); + } + +#ifdef SEEK_ADDRESS + base_address = 0UL; +#else + base_address = BASE_ADDRESS; +#endif + + for (pos = 0; pos < NUM_BUFFERS; pos ++){ + memory[pos].addr = (void *)0; + memory[pos].used = 0; +#if defined(WHEREAMI) && !defined(USE_OPENMP) + memory[pos].pos = -1; +#endif + memory[pos].lock = 0; + } + + UNLOCK_COMMAND(&alloc_lock); + + return; +} + +#if defined(OS_LINUX) && !defined(NO_WARMUP) + +#ifdef SMP +#if defined(USE_PTHREAD_LOCK) +static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER; +#elif defined(USE_PTHREAD_SPINLOCK) +static pthread_spinlock_t init_lock = 0; +#else +static BLASULONG init_lock = 0UL; +#endif +#endif + +static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, + void *sa, void *sb, BLASLONG pos) { + +#ifndef ARCH_POWER + + long size; + BLASULONG buffer; + + size = BUFFER_SIZE - PAGESIZE; + buffer = (BLASULONG)sa + GEMM_OFFSET_A; + +#if defined(OS_LINUX) && !defined(NO_WARMUP) + if (hot_alloc != 2) { +#endif + +#ifdef SMP + LOCK_COMMAND(&init_lock); +#endif + + while (size > 0) { + *(int *)buffer = size; + buffer += PAGESIZE; + size -= PAGESIZE; + } + +#ifdef SMP + UNLOCK_COMMAND(&init_lock); +#endif + + size = MIN((BUFFER_SIZE - PAGESIZE), L2_SIZE); + buffer = (BLASULONG)sa + GEMM_OFFSET_A; + + while (size > 0) { + *(int *)buffer = size; + buffer += 64; + size -= 64; + } + +#if defined(OS_LINUX) && !defined(NO_WARMUP) + } +#endif + +#endif +} + +#ifdef SMP + +static void _init_thread_memory(void *buffer) { + + blas_queue_t queue[MAX_CPU_NUMBER]; + int num_cpu; + + for (num_cpu = 0; num_cpu < blas_num_threads; num_cpu++) { + + blas_queue_init(&queue[num_cpu]); + queue[num_cpu].mode = BLAS_DOUBLE | BLAS_REAL; + queue[num_cpu].routine = &_touch_memory; + queue[num_cpu].args = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + } + + queue[num_cpu - 1].next = NULL; + queue[0].sa = buffer; + + exec_blas(num_cpu, queue); + +} +#endif + +static void gotoblas_memory_init(void) { + + void *buffer; + + hot_alloc = 1; + + buffer = (void *)blas_memory_alloc(0); + +#ifdef SMP + if (blas_cpu_number == 0) blas_get_cpu_number(); +#ifdef SMP_SERVER + if (blas_server_avail == 0) blas_thread_init(); +#endif + + _init_thread_memory((void *)((BLASULONG)buffer + GEMM_OFFSET_A)); + +#else + + _touch_memory(NULL, NULL, NULL, (void *)((BLASULONG)buffer + GEMM_OFFSET_A), NULL, 0); + +#endif + + blas_memory_free(buffer); +} +#endif + +/* Initialization for all function; this function should be called before main */ + +static int gotoblas_initialized = 0; + +void CONSTRUCTOR gotoblas_init(void) { + + if (gotoblas_initialized) return; + +#ifdef PROFILE + moncontrol (0); +#endif + +#ifdef DYNAMIC_ARCH + gotoblas_dynamic_init(); +#endif + +#if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY) + gotoblas_affinity_init(); +#endif + +#if defined(OS_LINUX) && !defined(NO_WARMUP) + gotoblas_memory_init(); +#endif + +#ifdef SMP + if (blas_cpu_number == 0) blas_get_cpu_number(); +#ifdef SMP_SERVER + if (blas_server_avail == 0) blas_thread_init(); +#endif +#endif + +#ifdef FUNCTION_PROFILE + gotoblas_profile_init(); +#endif + + 
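+  /* Everything above -- dynamic-arch dispatch, CPU affinity, the Linux
+     buffer warm-up, thread-count detection / server start-up and the
+     optional function profiler -- runs from this constructor before
+     main(), presumably so the first BLAS call does not pay the set-up
+     cost; gotoblas_quit() below tears the profiler, affinity and
+     dynamic-arch state back down. */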
gotoblas_initialized = 1; + +#ifdef PROFILE + moncontrol (1); +#endif + +} + +void DESTRUCTOR gotoblas_quit(void) { + + if (gotoblas_initialized == 0) return; + +#ifdef PROFILE + moncontrol (0); +#endif + +#ifdef FUNCTION_PROFILE + gotoblas_profile_quit(); +#endif + +#if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY) + gotoblas_affinity_quit(); +#endif + +#ifdef DYNAMIC_ARCH + gotoblas_dynamic_quit(); +#endif + + gotoblas_initialized = 0; + +#ifdef PROFILE + moncontrol (1); +#endif + +} + +#if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64)) +/* Don't call me; this is just work around for PGI / Sun bug */ +void gotoblas_dummy_for_PGI(void) { + + gotoblas_init(); + gotoblas_quit(); + +#if 0 + asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text"); + asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text"); +#else + asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text"); + asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text"); +#endif +} +#endif diff --git a/driver/others/memory_qalloc.c b/driver/others/memory_qalloc.c new file mode 100644 index 0000000000..10b35aa31f --- /dev/null +++ b/driver/others/memory_qalloc.c @@ -0,0 +1,77 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef SMP +#define blas_cpu_number 1 +#else + +int blas_cpu_number = 1; + +int blas_get_cpu_number(void){ + + return blas_cpu_number; +} +#endif + +#define FIXED_PAGESIZE 4096 + +void *sa = NULL; +void *sb = NULL; +static double static_buffer[BUFFER_SIZE/sizeof(double)]; + +void *blas_memory_alloc(int numproc){ + + if (sa == NULL){ +#if 1 + sa = (void *)qalloc(QFAST, BUFFER_SIZE); +#else + sa = (void *)malloc(BUFFER_SIZE); +#endif + sb = (void *)&static_buffer[0]; + } + + return sa; +} + +void blas_memory_free(void *free_area){ + return; +} + diff --git a/driver/others/parameter.c b/driver/others/parameter.c new file mode 100644 index 0000000000..9e72fd24f4 --- /dev/null +++ b/driver/others/parameter.c @@ -0,0 +1,668 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include +#include "common.h" + +int get_L2_size(void); + +#define DEFAULT_GEMM_P 128 +#define DEFAULT_GEMM_Q 128 +#define DEFAULT_GEMM_R 128 + +/* Global Parameter */ +#if SGEMM_P == sgemm_p +BLASLONG sgemm_p = DEFAULT_GEMM_P; +#else +BLASLONG sgemm_p = SGEMM_P; +#endif +#if DGEMM_P == dgemm_p +BLASLONG dgemm_p = DEFAULT_GEMM_P; +#else +BLASLONG dgemm_p = DGEMM_P; +#endif +#if CGEMM_P == cgemm_p +BLASLONG cgemm_p = DEFAULT_GEMM_P; +#else +BLASLONG cgemm_p = CGEMM_P; +#endif +#if ZGEMM_P == zgemm_p +BLASLONG zgemm_p = DEFAULT_GEMM_P; +#else +BLASLONG zgemm_p = ZGEMM_P; +#endif + +#if SGEMM_Q == sgemm_q +BLASLONG sgemm_q = DEFAULT_GEMM_Q; +#else +BLASLONG sgemm_q = SGEMM_Q; +#endif +#if DGEMM_Q == dgemm_q +BLASLONG dgemm_q = DEFAULT_GEMM_Q; +#else +BLASLONG dgemm_q = DGEMM_Q; +#endif +#if CGEMM_Q == cgemm_q +BLASLONG cgemm_q = DEFAULT_GEMM_Q; +#else +BLASLONG cgemm_q = CGEMM_Q; +#endif +#if ZGEMM_Q == zgemm_q +BLASLONG zgemm_q = DEFAULT_GEMM_Q; +#else +BLASLONG zgemm_q = ZGEMM_Q; +#endif + +#if SGEMM_R == sgemm_r +BLASLONG sgemm_r = DEFAULT_GEMM_R; +#else +BLASLONG sgemm_r = SGEMM_R; +#endif +#if DGEMM_R == dgemm_r +BLASLONG dgemm_r = DEFAULT_GEMM_R; +#else +BLASLONG dgemm_r = DGEMM_R; +#endif +#if CGEMM_R == cgemm_r +BLASLONG cgemm_r = DEFAULT_GEMM_R; +#else +BLASLONG cgemm_r = CGEMM_R; +#endif +#if ZGEMM_R == zgemm_r +BLASLONG zgemm_r = DEFAULT_GEMM_R; +#else +BLASLONG zgemm_r = ZGEMM_R; +#endif + +#if defined(EXPRECISION) || defined(QUAD_PRECISION) +#if QGEMM_P == qgemm_p +BLASLONG qgemm_p = DEFAULT_GEMM_P; +#else +BLASLONG qgemm_p = QGEMM_P; +#endif +#if XGEMM_P == xgemm_p +BLASLONG xgemm_p = DEFAULT_GEMM_P; +#else +BLASLONG xgemm_p = XGEMM_P; +#endif +#if QGEMM_Q == qgemm_q +BLASLONG qgemm_q = DEFAULT_GEMM_Q; +#else +BLASLONG qgemm_q = QGEMM_Q; +#endif +#if XGEMM_Q == xgemm_q +BLASLONG xgemm_q = DEFAULT_GEMM_Q; +#else +BLASLONG xgemm_q = XGEMM_Q; +#endif +#if QGEMM_R == qgemm_r +BLASLONG qgemm_r = DEFAULT_GEMM_R; +#else +BLASLONG qgemm_r = QGEMM_R; +#endif +#if XGEMM_R == xgemm_r +BLASLONG xgemm_r = DEFAULT_GEMM_R; +#else +BLASLONG xgemm_r = XGEMM_R; +#endif +#endif + +#if defined(ARCH_X86) || defined(ARCH_X86_64) + +int get_L2_size(void){ + + int eax, ebx, ecx, edx; + +#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || \ + defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ + defined(CORE_NEHALEM) || defined(ATOM) || defined(GENERIC) + + cpuid(0x80000006, &eax, &ebx, &ecx, &edx); + + return BITMASK(ecx, 16, 0xffff); + +#else + + int info[15]; + int i; + + cpuid(2, &eax, &ebx, &ecx, &edx); + + info[ 0] = BITMASK(eax, 8, 0xff); + info[ 1] = BITMASK(eax, 16, 0xff); + info[ 2] = BITMASK(eax, 24, 0xff); + + info[ 3] = BITMASK(ebx, 0, 0xff); + info[ 4] = BITMASK(ebx, 8, 0xff); + info[ 5] = BITMASK(ebx, 16, 0xff); + info[ 6] = BITMASK(ebx, 24, 0xff); + + info[ 7] = BITMASK(ecx, 0, 0xff); + info[ 8] = BITMASK(ecx, 8, 0xff); + info[ 9] = BITMASK(ecx, 16, 0xff); + info[10] = BITMASK(ecx, 24, 0xff); + + info[11] = BITMASK(edx, 0, 0xff); + info[12] = BITMASK(edx, 8, 0xff); + info[13] = BITMASK(edx, 16, 0xff); + info[14] = BITMASK(edx, 24, 0xff); + + for (i = 0; i < 15; i++){ + + switch (info[i]){ + case 0x3b : + case 0x41 : + case 0x79 : + return 128; + break; + + case 0x3c : + case 0x42 : + case 0x7a : + case 0x7e : + case 0x82 : + return 256; + break; + + case 0x43 : + case 0x7b : + case 0x7f : + case 0x83 : + case 0x86 : + return 512; + break; + + case 
0x44 : + case 0x78 : + case 0x7c : + case 0x84 : + case 0x87 : + return 1024; + break; + + case 0x45 : + case 0x7d : + case 0x85 : + return 2048; + + case 0x49 : + return 4096; + break; + } + } + + /* Never reached */ + return 0; +#endif +} + +void blas_set_parameter(void){ + + char *p; + int factor; + int size = get_L2_size(); + +#if defined(CORE_KATMAI) || defined(CORE_COPPERMINE) || defined(CORE_BANIAS) + size >>= 7; + +#if defined(CORE_BANIAS) && (HAVE_HIT > 1) + sgemm_p = 64 / HAVE_HIT * size; + dgemm_p = 32 / HAVE_HIT * size; + cgemm_p = 32 / HAVE_HIT * size; + zgemm_p = 16 / HAVE_HIT * size; +#ifdef EXPRECISION + qgemm_p = 16 / HAVE_HIT * size; + xgemm_p = 8 / HAVE_HIT * size; +#endif +#ifdef QUAD_PRECISION + qgemm_p = 8 / HAVE_HIT * size; + xgemm_p = 4 / HAVE_HIT * size; +#endif +#else + sgemm_p = 64 * size; + dgemm_p = 32 * size; + cgemm_p = 32 * size; + zgemm_p = 16 * size; +#ifdef EXPRECISION + qgemm_p = 16 * size; + xgemm_p = 8 * size; +#endif +#ifdef QUAD_PRECISION + qgemm_p = 8 * size; + xgemm_p = 4 * size; +#endif +#endif +#endif + +#if defined(CORE_NORTHWOOD) + size >>= 7; + +#ifdef ALLOC_HUGETLB + sgemm_p = 128 * size; + dgemm_p = 64 * size; + cgemm_p = 64 * size; + zgemm_p = 32 * size; +#ifdef EXPRECISION + qgemm_p = 32 * size; + xgemm_p = 16 * size; +#endif +#ifdef QUAD_PRECISION + qgemm_p = 16 * size; + xgemm_p = 8 * size; +#endif +#else + sgemm_p = 96 * size; + dgemm_p = 48 * size; + cgemm_p = 48 * size; + zgemm_p = 24 * size; +#ifdef EXPRECISION + qgemm_p = 24 * size; + xgemm_p = 12 * size; +#endif +#ifdef QUAD_PRECISION + qgemm_p = 12 * size; + xgemm_p = 6 * size; +#endif +#endif +#endif + +#if defined(CORE_CORE2) + + size >>= 9; + + sgemm_p = 92 * size; + dgemm_p = 46 * size; + cgemm_p = 46 * size; + zgemm_p = 23 * size; + +#ifdef EXPRECISION + qgemm_p = 23 * size; + xgemm_p = 11 * size; +#endif +#ifdef QUAD_PRECISION + qgemm_p = 11 * size; + xgemm_p = 5 * size; +#endif +#endif + +#if defined(PENRYN) + + size >>= 9; + + sgemm_p = 1024; + dgemm_p = 512; + cgemm_p = 512; + zgemm_p = 256; + +#ifdef EXPRECISION + qgemm_p = 256; + xgemm_p = 128; +#endif +#ifdef QUAD_PRECISION + qgemm_p = 21 * size + 4; + xgemm_p = 10 * size + 2; +#endif +#endif + +#if defined(DUNNINGTON) + + size >>= 9; + + sgemm_p = 384; + dgemm_p = 384; + cgemm_p = 384; + zgemm_p = 384; + +#ifdef EXPRECISION + qgemm_p = 384; + xgemm_p = 384; +#endif +#ifdef QUAD_PRECISION + qgemm_p = 21 * size + 4; + xgemm_p = 10 * size + 2; +#endif +#endif + +#if defined(NEHALEM) + sgemm_p = 1024; + dgemm_p = 512; + cgemm_p = 512; + zgemm_p = 256; +#ifdef EXPRECISION + qgemm_p = 256; + xgemm_p = 128; +#endif +#endif + +#if defined(CORE_PRESCOTT) || defined(GENERIC) + size >>= 6; + + if (size > 16) size = 16; + + sgemm_p = 56 * size; + dgemm_p = 28 * size; + cgemm_p = 28 * size; + zgemm_p = 14 * size; +#ifdef EXPRECISION + qgemm_p = 14 * size; + xgemm_p = 7 * size; +#endif +#ifdef QUAD_PRECISION + qgemm_p = 7 * size; + xgemm_p = 3 * size; +#endif +#endif + +#if defined(CORE_OPTERON) + sgemm_p = 224 + 14 * (size >> 5); + dgemm_p = 112 + 14 * (size >> 6); + cgemm_p = 116 + 14 * (size >> 6); + zgemm_p = 58 + 14 * (size >> 7); +#ifdef EXPRECISION + qgemm_p = 58 + 14 * (size >> 7); + xgemm_p = 29 + 14 * (size >> 8); +#endif +#ifdef QUAD_PRECISION + qgemm_p = 29 + 14 * (size >> 8); + xgemm_p = 15 + 14 * (size >> 9); +#endif +#endif + +#if defined(ATOM) + size >>= 8; + + sgemm_p = 256; + dgemm_p = 128; + cgemm_p = 128; + zgemm_p = 64; +#ifdef EXPRECISION + qgemm_p = 64; + xgemm_p = 32; +#endif +#ifdef QUAD_PRECISION + qgemm_p 
= 32; + xgemm_p = 16; +#endif +#endif + +#if defined(CORE_BARCELONA) + size >>= 8; + + sgemm_p = 232 * size; + dgemm_p = 116 * size; + cgemm_p = 116 * size; + zgemm_p = 58 * size; +#ifdef EXPRECISION + qgemm_p = 58 * size; + xgemm_p = 26 * size; +#endif +#ifdef QUAD_PRECISION + qgemm_p = 26 * size; + xgemm_p = 13 * size; +#endif +#endif + + p = getenv("GOTO_BLOCK_FACTOR"); + + if (p) { + factor = atoi(p); + if (factor < 10) factor = 10; + if (factor > 200) factor = 200; + + sgemm_p = ((long)((double)sgemm_p * (double)factor * 1.e-2)) & ~7L; + dgemm_p = ((long)((double)dgemm_p * (double)factor * 1.e-2)) & ~7L; + cgemm_p = ((long)((double)cgemm_p * (double)factor * 1.e-2)) & ~7L; + zgemm_p = ((long)((double)zgemm_p * (double)factor * 1.e-2)) & ~7L; +#ifdef EXPRECISION + qgemm_p = ((long)((double)qgemm_p * (double)factor * 1.e-2)) & ~7L; + xgemm_p = ((long)((double)xgemm_p * (double)factor * 1.e-2)) & ~7L; +#endif + } + + if (sgemm_p == 0) sgemm_p = 64; + if (dgemm_p == 0) dgemm_p = 64; + if (cgemm_p == 0) cgemm_p = 64; + if (zgemm_p == 0) zgemm_p = 64; +#ifdef EXPRECISION + if (qgemm_p == 0) qgemm_p = 64; + if (xgemm_p == 0) xgemm_p = 64; +#endif + +#ifdef QUAD_PRECISION + if (qgemm_p == 0) qgemm_p = 64; + if (xgemm_p == 0) xgemm_p = 64; +#endif + + sgemm_p = (sgemm_p + SGEMM_UNROLL_M - 1) & ~(SGEMM_UNROLL_M - 1); + dgemm_p = (dgemm_p + DGEMM_UNROLL_M - 1) & ~(DGEMM_UNROLL_M - 1); + cgemm_p = (cgemm_p + CGEMM_UNROLL_M - 1) & ~(CGEMM_UNROLL_M - 1); + zgemm_p = (zgemm_p + ZGEMM_UNROLL_M - 1) & ~(ZGEMM_UNROLL_M - 1); +#ifdef QUAD_PRECISION + qgemm_p = (qgemm_p + QGEMM_UNROLL_M - 1) & ~(QGEMM_UNROLL_M - 1); + xgemm_p = (xgemm_p + XGEMM_UNROLL_M - 1) & ~(XGEMM_UNROLL_M - 1); +#endif + + sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15; + dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15; + cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15; + zgemm_r = (((BUFFER_SIZE - ((ZGEMM_P * ZGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (ZGEMM_Q * 16)) - 15) & ~15; +#if defined(EXPRECISION) || defined(QUAD_PRECISION) + qgemm_r = (((BUFFER_SIZE - ((QGEMM_P * QGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (QGEMM_Q * 16)) - 15) & ~15; + xgemm_r = (((BUFFER_SIZE - ((XGEMM_P * XGEMM_Q * 32 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (XGEMM_Q * 32)) - 15) & ~15; +#endif + +#if 0 + fprintf(stderr, "SGEMM ... %3d, %3d, %3d\n", SGEMM_P, SGEMM_Q, SGEMM_R); + fprintf(stderr, "DGEMM ... %3d, %3d, %3d\n", DGEMM_P, DGEMM_Q, DGEMM_R); + fprintf(stderr, "CGEMM ... %3d, %3d, %3d\n", CGEMM_P, CGEMM_Q, CGEMM_R); + fprintf(stderr, "ZGEMM ... 
%3d, %3d, %3d\n", ZGEMM_P, ZGEMM_Q, ZGEMM_R); +#endif + + return; +} + +#if 0 + +int get_current_cpu_info(void){ + + int nlprocs, ncores, cmplegacy; + int htt = 0; + int apicid = 0; + +#if defined(CORE_PRESCOTT) || defined(CORE_OPTERON) + int eax, ebx, ecx, edx; + + cpuid(1, &eax, &ebx, &ecx, &edx); + nlprocs = BITMASK(ebx, 16, 0xff); + apicid = BITMASK(ebx, 24, 0xff); + htt = BITMASK(edx, 28, 0x01); +#endif + +#if defined(CORE_PRESCOTT) + cpuid(4, &eax, &ebx, &ecx, &edx); + ncores = BITMASK(eax, 26, 0x3f); + + if (htt == 0) nlprocs = 0; +#endif + +#if defined(CORE_OPTERON) + cpuid(0x80000008, &eax, &ebx, &ecx, &edx); + ncores = BITMASK(ecx, 0, 0xff); + + cpuid(0x80000001, &eax, &ebx, &ecx, &edx); + cmplegacy = BITMASK(ecx, 1, 0x01); + + if (htt == 0) { + nlprocs = 0; + ncores = 0; + cmplegacy = 0; + } +#endif + + ncores ++; + + fprintf(stderr, "APICID = %d Number of core = %d\n", apicid, ncores); + + return 0; +} +#endif + +#endif + +#if defined(ARCH_IA64) + +static inline BLASULONG cpuid(BLASULONG regnum){ + BLASULONG value; + +#ifndef __ECC + asm ("mov %0=cpuid[%r1]" : "=r"(value) : "rO"(regnum)); +#else + value = __getIndReg(_IA64_REG_INDR_CPUID, regnum); +#endif + + return value; +} + +#if 1 + +void blas_set_parameter(void){ + + BLASULONG cpuid3, size; + + cpuid3 = cpuid(3); + + size = BITMASK(cpuid3, 16, 0xff); + + sgemm_p = 192 * (size + 1); + dgemm_p = 96 * (size + 1); + cgemm_p = 96 * (size + 1); + zgemm_p = 48 * (size + 1); +#ifdef EXPRECISION + qgemm_p = 64 * (size + 1); + xgemm_p = 32 * (size + 1); +#endif +#ifdef QUAD_PRECISION + qgemm_p = 32 * (size + 1); + xgemm_p = 16 * (size + 1); +#endif + + sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15; + dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15; + cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15; + zgemm_r = (((BUFFER_SIZE - ((ZGEMM_P * ZGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (ZGEMM_Q * 16)) - 15) & ~15; +#if defined(EXPRECISION) || defined(QUAD_PRECISION) + qgemm_r = (((BUFFER_SIZE - ((QGEMM_P * QGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (QGEMM_Q * 16)) - 15) & ~15; + xgemm_r = (((BUFFER_SIZE - ((XGEMM_P * XGEMM_Q * 32 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (XGEMM_Q * 32)) - 15) & ~15; +#endif + + return; +} + +#else + +#define IA64_SYS_NAME "/sys/devices/system/cpu/cpu0/cache/index3/size" +#define IA64_PROC_NAME "/proc/pal/cpu0/cache_info" + +void blas_set_parameter(void){ + + BLASULONG cpuid3; + int size = 0; + +#if 1 + char buffer[128]; + FILE *infile; + + if ((infile = fopen(IA64_SYS_NAME, "r")) != NULL) { + + fgets(buffer, sizeof(buffer), infile); + fclose(infile); + + size = atoi(buffer) / 1536; + } + + if (size <= 0) { + if ((infile = fopen(IA64_PROC_NAME, "r")) != NULL) { + + while(fgets(buffer, sizeof(buffer), infile) != NULL) { + if ((!strncmp("Data/Instruction Cache level 3", buffer, 30))) break; + } + + fgets(buffer, sizeof(buffer), infile); + + fclose(infile); + + *strstr(buffer, "bytes") = (char)NULL; + + size = atoi(strchr(buffer, ':') + 1) / 1572864; + } + } +#endif + + /* The last resort */ + + if (size <= 0) { + cpuid3 = cpuid(3); + + size = BITMASK(cpuid3, 16, 0xff) + 1; + } + + sgemm_p = 320 * size; + dgemm_p = 160 * size; + cgemm_p = 160 * size; + zgemm_p = 80 * size; +#ifdef EXPRECISION + qgemm_p = 80 * size; + xgemm_p = 40 * 
size; +#endif + + sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15; + dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15; + cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15; + zgemm_r = (((BUFFER_SIZE - ((ZGEMM_P * ZGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (ZGEMM_Q * 16)) - 15) & ~15; +#ifdef EXPRECISION + qgemm_r = (((BUFFER_SIZE - ((QGEMM_P * QGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (QGEMM_Q * 16)) - 15) & ~15; + xgemm_r = (((BUFFER_SIZE - ((XGEMM_P * XGEMM_Q * 32 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (XGEMM_Q * 32)) - 15) & ~15; +#endif + + return; +} + +#endif + +#endif diff --git a/driver/others/profile.c b/driver/others/profile.c new file mode 100644 index 0000000000..f65550c9f8 --- /dev/null +++ b/driver/others/profile.c @@ -0,0 +1,139 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include "common.h" +#include +#include +#define USE_FUNCTABLE +#include "../../interface/functable.h" + +func_profile_t function_profile_table[MAX_PROF_TABLE]; + +int gotoblas_profile = 1; + +static struct sigaction sa, ig; + +void gotoblas_profile_quit(void) { + + int i; + unsigned long long calls, fops, cycles, tcycles, area; + + sigaction(SIGPROF, &ig, NULL); + + calls = 0; + fops = 0; + cycles = 0; + tcycles = 0; + area = 0; + + for (i = 0; i < MAX_PROF_TABLE; i ++) { + if (function_profile_table[i].calls) { + calls += function_profile_table[i].calls; + cycles += function_profile_table[i].cycles; + tcycles += function_profile_table[i].tcycles; + area += function_profile_table[i].area; + fops += function_profile_table[i].fops; + } + } + + if (cycles > 0) { + + fprintf(stderr, "\n\t====== BLAS Profiling Result =======\n\n"); + fprintf(stderr, " Function No. of Calls Time Consumption Efficiency Bytes/cycle\n"); + + for (i = 0; i < MAX_PROF_TABLE; i ++) { + if (function_profile_table[i].calls) { +#ifndef OS_WINDOWS + fprintf(stderr, "%-12s : %10Ld %8.2f%% %10.3f%% %8.2f\n", +#else + fprintf(stderr, "%-12s : %10lld %8.2f%% %10.3f%% %8.2f\n", +#endif + func_table[i], + function_profile_table[i].calls, + (double)function_profile_table[i].cycles / (double)cycles * 100., + (double)function_profile_table[i].fops / (double)function_profile_table[i].tcycles * 100., + (double)function_profile_table[i].area / (double)function_profile_table[i].cycles + ); + } + } + + fprintf(stderr, " --------------------------------------------------------------------\n"); + +#ifndef OS_WINDOWS + fprintf(stderr, "%-12s : %10Ld %10.3f%% %8.2f\n", +#else + fprintf(stderr, "%-12s : %10lld %10.3f%% %8.2f\n", +#endif + "Total", + calls, + (double)fops / (double)tcycles * 100., + (double)area / (double)cycles); + } + + sigaction(SIGPROF, &sa, NULL); +} + +void gotoblas_profile_clear(void) { + + int i; + + for (i = 0; i < MAX_PROF_TABLE; i ++) { + function_profile_table[i].calls = 0; + function_profile_table[i].cycles = 0; + function_profile_table[i].tcycles = 0; + function_profile_table[i].area = 0; + function_profile_table[i].fops = 0; + } + +} + +void gotoblas_profile_init(void) { + + gotoblas_profile_clear(); + + bzero(&sa, sizeof(struct sigaction)); + sa.sa_handler = (void *)gotoblas_profile_quit; + sa.sa_flags = SA_NODEFER | SA_RESETHAND; + + bzero(&ig, sizeof(struct sigaction)); + ig.sa_handler = SIG_IGN; + ig.sa_flags |= SA_NODEFER | SA_RESETHAND; + + sigaction(SIGPROF, &sa, NULL); + +} diff --git a/driver/others/xerbla.c b/driver/others/xerbla.c new file mode 100644 index 0000000000..6f5170ef17 --- /dev/null +++ b/driver/others/xerbla.c @@ -0,0 +1,70 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +#if defined(OS_WINDOWS) && (defined(__MINGW32__) || defined(__MINGW64__)) +#include +#undef printf +#define printf _cprintf +#endif + +#ifdef __ELF__ +int __xerbla(char *message, blasint *info, blasint length){ + + printf(" ** On entry to %6s parameter number %2d had an illegal value\n", + message, *info); + + return 0; +} + +int BLASFUNC(xerbla)(char *, blasint *, blasint) __attribute__ ((weak, alias ("__xerbla"))); + +#else + +int BLASFUNC(xerbla)(char *message, blasint *info, blasint length){ + + printf(" ** On entry to %6s parameter number %2d had an illegal value\n", + message, *info); + + return 0; +} + +#endif diff --git a/exports/Makefile b/exports/Makefile new file mode 100644 index 0000000000..00e6fed464 --- /dev/null +++ b/exports/Makefile @@ -0,0 +1,188 @@ +TOPDIR = .. 
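+# This Makefile packages ../$(LIBNAME) into the platform deliverables:
+# libgoto2.dll / libgoto2_shared.dll on Windows, ../$(LIBSONAME) on
+# Linux / FreeBSD / SunOS / OSF1, $(LIBDYNNAME) on OS X and the goto32 /
+# goto64 shared objects on AIX.  The export lists handed to the linker
+# (libgoto2.def, linux.def, osx.def, aix.def) and the linktest.c link
+# check are generated by the gensymbol perl script.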
+ +include ../Makefile.system + +ifndef EXPRECISION +EXPRECISION = 0 +endif + +ifndef NO_CBLAS +NO_CBLAS = 0 +endif + +ifeq ($(OSNAME), WINNT) +ifeq ($(F_COMPILER), GFORTRAN) +EXTRALIB += -lgfortran +endif +endif + +ifeq ($(OSNAME), CYGWIN_NT) +ifeq ($(F_COMPILER), GFORTRAN) +EXTRALIB += -lgfortran +endif +endif + +all:: + +libs:: + +prof:: + +hpl:: libgoto_hpl.$(LIBSUFFIX) + +hpl_p:: libgoto_hpl_p.$(LIBSUFFIX) + +libgoto_hpl.$(LIBSUFFIX) : ../$(LIBNAME) + rm -f $(@F) + $(LD) -r $(LDFLAGS) -o goto.$(SUFFIX) --whole-archive $< --no-whole-archive + $(AR) cq $(@F) goto.$(SUFFIX) + $(RANLIB) libgoto_hpl.$(LIBSUFFIX) + +libgoto_hpl_p.$(LIBSUFFIX) : ../$(LIBNAME_P) + rm -f $(@F) + $(LD) -r $(LDFLAGS) -o goto.$(PSUFFIX) --whole-archive $< --no-whole-archive + $(AR) cq $(@F) goto.$(PSUFFIX) + $(RANLIB) libgoto_hpl_p.$(LIBSUFFIX) + +libgoto_hpl.dll : libgoto_hpl.$(LIBSUFFIX) dllinit.$(SUFFIX) libgoto_hpl.def + $(DLLWRAP) -o $(@F) --def libgoto_hpl.def --entry _dllinit -s dllinit.$(SUFFIX) --dllname libgoto_hpl.dll libgoto_hpl.$(LIBSUFFIX) + lib /machine:X64 /def:libgoto_hpl.def + +dyn : $(LIBDYNNAME) + +zip : dll + zip $(LIBZIPNAME) $(LIBDLLNAME) $(LIBNAME) + +dll : libgoto2.dll + +dll2 : libgoto2_shared.dll + +libgoto2.dll : ../$(LIBNAME) libgoto2.def dllinit.$(SUFFIX) + $(RANLIB) ../$(LIBNAME) +ifeq ($(BINARY32), 1) + $(DLLWRAP) -o $(@F) --def libgoto2.def \ + --entry _dllinit@12 -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB) + -lib /machine:i386 /def:libgoto2.def +else + $(DLLWRAP) -o $(@F) --def libgoto2.def \ + --entry _dllinit -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB) + -lib /machine:X64 /def:libgoto2.def +endif + +libgoto2_shared.dll : ../$(LIBNAME) libgoto2_shared.def + $(CC) $(CFLAGS) libgoto2_shared.def -shared -o $(@F) \ + -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \ + -Wl,--out-implib,libgoto2_shared.lib $(FEXTRALIB) + +libgoto2.def : gensymbol + perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) > $(@F) + +libgoto2_shared.def : gensymbol + perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) > $(@F) + +libgoto_hpl.def : gensymbol + perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) > $(@F) + +$(LIBDYNNAME) : ../$(LIBNAME) osx.def + $(PREFIX)gcc $(CFLAGS) -all_load -dynamiclib -o $(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) + +symbol.$(SUFFIX) : symbol.S + $(CC) $(CFLAGS) -c -o $(@F) $^ + +dllinit.$(SUFFIX) : dllinit.c + $(CC) $(CFLAGS) -c -o $(@F) -s $< + +ifeq ($(OSNAME), Linux) + +so : ../$(LIBSONAME) + +../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c + $(CC) $(CFLAGS) -shared -o ../$(LIBSONAME) \ + -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \ + -Wl,--retain-symbols-file=linux.def $(EXTRALIB) + $(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. + rm -f linktest + +endif + +ifeq ($(OSNAME), FreeBSD) + +so : ../$(LIBSONAME) + +../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c + $(CC) $(CFLAGS) -shared -o ../$(LIBSONAME) \ + -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \ + -Wl,--retain-symbols-file=linux.def $(EXTRALIB) + $(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. 
+ rm -f linktest + +endif + +ifeq ($(OSNAME), OSF1) + +so : ../$(LIBSONAME) + +../$(LIBSONAME) : + $(CC) -shared -o ../$(LIBSONAME) ../$(LIBNAME) +endif + +ifeq ($(OSNAME), SunOS) + +so : ../$(LIBSONAME) + $(CC) $(CFLAGS) -shared -o ../$(LIBSONAME) \ + -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive $(EXTRALIB) + $(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. + rm -f linktest + +endif + +ifeq ($(OSNAME), AIX) + +ifeq ($(COMPILER_F77), xlf) + +goto32.$(SUFFIX) : ../$(LIBNAME) aix.def + ld -o $(@F) ../$(LIBNAME) -bE:aix.def -bM:SRE -bnoexpall -bnoentry -L$(HOME)/misc/lib -lxlf90 -lc -lm -lpthread + +goto64.$(SUFFIX) : ../$(LIBNAME) aix.def + ld -b64 -o $(@F) ../$(LIBNAME) -bE:aix.def -bM:SRE -bnoexpall -bnoentry -L$(HOME)/misc/lib/ppc64 -lxlf90 -lc -lm -lpthread +else +goto32.$(SUFFIX) : ../$(LIBNAME) aix.def + ld -o $(@F) ../$(LIBNAME) -bE:aix.def -bM:SRE -bnoexpall -bnoentry -L$(HOME)/misc/lib -lg2c -lc -lm + +goto64.$(SUFFIX) : ../$(LIBNAME) aix.def + ld -b64 -o $(@F) ../$(LIBNAME) -bE:aix.def -bM:SRE -bnoexpall -bnoentry -L$(HOME)/misc/lib/ppc64 -lg2c -lc -lm +endif +endif + +static : ../$(LIBNAME) + $(LD) $(LDFLAGS) -r -o goto.$(SUFFIX) \ + --whole-archive ../$(LIBNAME) --no-whole-archive + rm -f ../$(LIBNAME) + $(AR) -cq ../$(LIBNAME) goto.$(SUFFIX) + rm -f goto.$(SUFFIX) + +linux.def : gensymbol ../Makefile.system ../getarch.c + perl ./gensymbol linux $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) > $(@F) + +osx.def : gensymbol ../Makefile.system ../getarch.c + perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) > $(@F) + +aix.def : gensymbol ../Makefile.system ../getarch.c + perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) > $(@F) + +symbol.S : gensymbol + perl ./gensymbol win2kasm noarch dummy $(EXPRECISION) $(NO_CBLAS) > symbol.S + +test : linktest.c + $(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK. + rm -f linktest + +linktest.c : gensymbol ../Makefile.system ../getarch.c + perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) > linktest.c + +clean :: + @rm -f *.def *.dylib __.SYMDEF* + +include ../Makefile.tail + + diff --git a/exports/dllinit.c b/exports/dllinit.c new file mode 100644 index 0000000000..54ec1c36cb --- /dev/null +++ b/exports/dllinit.c @@ -0,0 +1,55 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +void gotoblas_init(void); +void gotoblas_quit(void); + +BOOL APIENTRY dllinit(HINSTANCE hInst, DWORD reason, LPVOID reserved) { + + if (reason == DLL_PROCESS_ATTACH) { + gotoblas_init(); + } + + if (reason == DLL_PROCESS_DETACH) { + gotoblas_quit(); + } + + return TRUE; +} diff --git a/exports/gensymbol b/exports/gensymbol new file mode 100644 index 0000000000..8455e51b6c --- /dev/null +++ b/exports/gensymbol @@ -0,0 +1,462 @@ +#!/usr/bin/perl + +@blasobjs = ( + caxpy,ccopy,cdotc,cdotu,cgbmv,cgemm,cgemv,cgerc,cgeru, + chbmv,chemm,chemv,cher2,cher2k,cher,cherk, + chpmv,chpr2,chpr,crotg,cscal,csrot,csscal,cswap, + csymm,csyr2k,csyrk,ctbmv,ctbsv,ctpmv,ctpsv,ctrmm,ctrmv,ctrsm, + ctrsv, csymv, + damax,damin,dasum,daxpy,dcabs1,dcopy,ddot,dgbmv,dgemm, + dgemv,dger,dmax,dmin,dnrm2,drot,drotg,drotm,drotmg,dsbmv, + dscal,dsdot,dspmv,dspr2, + dspr,dswap,dsymm,dsymv,dsyr2,dsyr2k,dsyr,dsyrk,dtbmv,dtbsv, + dtpmv,dtpsv,dtrmm,dtrmv,dtrsm,dtrsv,dzamax,dzamin,dzasum,dznrm2, + icamax,icamin,idamax,idamin,idmax,idmin,isamax,isamin,ismax,ismin, + izamax,izamin,lsame,samax,samin,sasum,saxpy,scabs1,scamax, + scamin,scasum,scnrm2,scopy,sdot,sdsdot,sgbmv,sgemm,sgemv,sger, + smax,smin,snrm2, + srot,srotg,srotm,srotmg,ssbmv,sscal,sspmv,sspr2,sspr,sswap, + ssymm,ssymv,ssyr2,ssyr2k,ssyr,ssyrk,stbmv,stbsv,stpmv,stpsv, + strmm,strmv,strsm,strsv,zaxpy,zcopy,zdotc,zdotu,zdrot, + zdscal,zgbmv,zgemm,zgemv,zgerc,zgeru, + zhbmv,zhemm,zhemv,zher2,zher2k,zher,zherk,zhpmv,zhpr2, + zhpr,zrotg,zscal,zswap,zsymm,zsyr2k,zsyrk,ztbmv, + ztbsv,ztpmv,ztpsv,ztrmm,ztrmv,ztrsm,ztrsv, zsymv, + xerbla); + +@cblasobjs = ( + cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv, + cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k, + cblas_cher, cblas_cherk, cblas_chpmv, cblas_chpr2, cblas_chpr, cblas_cscal, + cblas_csscal, cblas_cswap, cblas_csymm, cblas_csyr2k, cblas_csyrk, cblas_ctbmv, + cblas_ctbsv, cblas_ctpmv, cblas_ctpsv, cblas_ctrmm, cblas_ctrmv, cblas_ctrsm, cblas_ctrsv, + cblas_dasum, cblas_daxpy, cblas_dcopy, cblas_ddot, + cblas_dgbmv, cblas_dgemm, cblas_dgemv, cblas_dger, cblas_dnrm2, + cblas_drot, cblas_drotg, cblas_drotm, cblas_drotmg, cblas_dsbmv, cblas_dscal, cblas_dsdot, + cblas_dspmv, cblas_dspr2, cblas_dspr, cblas_dswap, cblas_dsymm, cblas_dsymv, cblas_dsyr2, + cblas_dsyr2k, cblas_dsyr, cblas_dsyrk, cblas_dtbmv, cblas_dtbsv, cblas_dtpmv, cblas_dtpsv, + cblas_dtrmm, cblas_dtrmv, cblas_dtrsm, cblas_dtrsv, cblas_dzasum, + cblas_dznrm2, cblas_icamax, cblas_idamax, + cblas_isamax, cblas_izamax, + cblas_sasum, 
cblas_saxpy, + cblas_scasum, cblas_scnrm2, cblas_scopy, cblas_sdot, cblas_sdsdot, cblas_sgbmv, cblas_sgemm, + cblas_sgemv, cblas_sger, cblas_snrm2, cblas_srot, cblas_srotg, + cblas_srotm, cblas_srotmg, cblas_ssbmv, cblas_sscal, cblas_sspmv, cblas_sspr2, cblas_sspr, + cblas_sswap, cblas_ssymm, cblas_ssymv, cblas_ssyr2, cblas_ssyr2k, cblas_ssyr, cblas_ssyrk, + cblas_stbmv, cblas_stbsv, cblas_stpmv, cblas_stpsv, cblas_strmm, cblas_strmv, cblas_strsm, + cblas_strsv, cblas_zaxpy, cblas_zcopy, cblas_zdotc, cblas_zdotu, cblas_zdscal, + cblas_zgbmv, cblas_zgemm, cblas_zgemv, cblas_zgerc, cblas_zgeru, cblas_zhbmv, cblas_zhemm, + cblas_zhemv, cblas_zher2, cblas_zher2k, cblas_zher, cblas_zherk, cblas_zhpmv, cblas_zhpr2, + cblas_zhpr, cblas_zscal, cblas_zswap, cblas_zsymm, cblas_zsyr2k, cblas_zsyrk, + cblas_ztbmv, cblas_ztbsv, cblas_ztpmv, cblas_ztpsv, cblas_ztrmm, cblas_ztrmv, cblas_ztrsm, + cblas_ztrsv); + +@exblasobjs = ( + qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm, + qgemv,qger,qmax,qmin, + qnrm2, + qsbmv,qscal,qspmv,qspr2, + qspr,qswap,qsymm,qsymv,qsyr2,qsyr2k,qsyr,qsyrk,qtbmv,qtbsv, + qtpmv,qtpsv,qtrmm,qtrmv,qtrsm,qtrsv, + qxamax,qxamin,qxasum,qxnrm2, + xaxpy,xcopy,xdotc,xdotu, + xqscal,xgbmv,xgemm,xgemv,xgerc,xgeru, + xhbmv,xhemm,xhemv,xher2,xher2k,xher,xherk,xhpmv,xhpr2, + xhpr,xscal,xswap,xsymm,xsyr2k,xsyrk,xtbmv, + xtbsv,xtpmv,xtpsv,xtrmm,xtrmv,xtrsm,xtrsv, +# qrot,qrotg,qrotm,qrotmg, +# xdrot,xrotg, + ); + +@gemm3mobjs = ( + zgemm3m, cgemm3m, zsymm3m, csymm3m, zhemm3m, chemm3m, + ); + +@lapackobjs = ( + sgesv, dgesv, cgesv, zgesv, + sgetf2, dgetf2, cgetf2, zgetf2, + sgetrf, dgetrf, cgetrf, zgetrf, + slaswp, dlaswp, claswp, zlaswp, + sgetrs, dgetrs, cgetrs, zgetrs, + slauu2, dlauu2, clauu2, zlauu2, + slauum, dlauum, clauum, zlauum, + spotf2, dpotf2, cpotf2, zpotf2, + spotrf, dpotrf, cpotrf, zpotrf, + strti2, dtrti2, ctrti2, ztrti2, + strtri, dtrtri, ctrtri, ztrtri, + spotri, dpotri, cpotri, zpotri, + ); + +@lapackobjs2 = ( + sgbbrd, sgbcon, sgbequ, sgbrfs, sgbsv, + sgbsvx, sgbtf2, sgbtrf, sgbtrs, sgebak, sgebal, sgebd2, + sgebrd, sgecon, sgeequ, sgees, sgeesx, sgeev, sgeevx, + sgegs, sgegv, sgehd2, sgehrd, sgelq2, sgelqf, + sgels, sgelsd, sgelss, sgelsx, sgelsy, sgeql2, sgeqlf, + sgeqp3, sgeqpf, sgeqr2, sgeqrf, sgerfs, sgerq2, sgerqf, + sgesc2, sgesdd, sgesvd, sgesvx, sgetc2, + sgetri, + sggbak, sggbal, sgges, sggesx, sggev, sggevx, + sggglm, sgghrd, sgglse, sggqrf, + sggrqf, sggsvd, sggsvp, sgtcon, sgtrfs, sgtsv, + sgtsvx, sgttrf, sgttrs, sgtts2, shgeqz, + shsein, shseqr, slabrd, slacon, slacn2, + slaein, slaexc, slag2, slags2, slagtm, slagv2, slahqr, + slahrd, slahr2, slaic1, slaln2, slals0, slalsa, slalsd, + slangb, slange, slangt, slanhs, slansb, slansp, + slansy, slantb, slantp, slantr, slanv2, + slapll, slapmt, + slaqgb, slaqge, slaqp2, slaqps, slaqsb, slaqsp, slaqsy, + slaqr0, slaqr1, slaqr2, slaqr3, slaqr4, slaqr5, + slaqtr, slar1v, slar2v, + slarf, slarfb, slarfg, slarft, slarfx, slargv, + slarrv, slartv, + slarz, slarzb, slarzt, slasy2, slasyf, + slatbs, slatdf, slatps, slatrd, slatrs, slatrz, slatzm, + sopgtr, sopmtr, sorg2l, sorg2r, + sorgbr, sorghr, sorgl2, sorglq, sorgql, sorgqr, sorgr2, + sorgrq, sorgtr, sorm2l, sorm2r, + sormbr, sormhr, sorml2, sormlq, sormql, sormqr, sormr2, + sormr3, sormrq, sormrz, sormtr, spbcon, spbequ, spbrfs, + spbstf, spbsv, spbsvx, + spbtf2, spbtrf, spbtrs, spocon, spoequ, sporfs, sposv, + sposvx, spotrs, sppcon, sppequ, + spprfs, sppsv, sppsvx, spptrf, spptri, spptrs, sptcon, + spteqr, sptrfs, sptsv, sptsvx, spttrs, sptts2, srscl, + 
ssbev, ssbevd, ssbevx, ssbgst, ssbgv, ssbgvd, ssbgvx, + ssbtrd, sspcon, sspev, sspevd, sspevx, sspgst, + sspgv, sspgvd, sspgvx, ssprfs, sspsv, sspsvx, ssptrd, + ssptrf, ssptri, ssptrs, sstegr, sstein, sstev, sstevd, sstevr, + sstevx, ssycon, ssyev, ssyevd, ssyevr, ssyevx, ssygs2, + ssygst, ssygv, ssygvd, ssygvx, ssyrfs, ssysv, ssysvx, + ssytd2, ssytf2, ssytrd, ssytrf, ssytri, ssytrs, stbcon, + stbrfs, stbtrs, stgevc, stgex2, stgexc, stgsen, + stgsja, stgsna, stgsy2, stgsyl, stpcon, stprfs, stptri, + stptrs, + strcon, strevc, strexc, strrfs, strsen, strsna, strsyl, + strtrs, stzrqf, stzrzf, sstemr, + + cbdsqr, cgbbrd, cgbcon, cgbequ, cgbrfs, cgbsv, cgbsvx, + cgbtf2, cgbtrf, cgbtrs, cgebak, cgebal, cgebd2, cgebrd, + cgecon, cgeequ, cgees, cgeesx, cgeev, cgeevx, + cgegs, cgegv, cgehd2, cgehrd, cgelq2, cgelqf, + cgels, cgelsd, cgelss, cgelsx, cgelsy, cgeql2, cgeqlf, cgeqp3, + cgeqpf, cgeqr2, cgeqrf, cgerfs, cgerq2, cgerqf, + cgesc2, cgesdd, cgesvd, cgesvx, cgetc2, + cgetri, + cggbak, cggbal, cgges, cggesx, cggev, cggevx, cggglm, + cgghrd, cgglse, cggqrf, cggrqf, + cggsvd, cggsvp, + cgtcon, cgtrfs, cgtsv, cgtsvx, cgttrf, cgttrs, cgtts2, chbev, + chbevd, chbevx, chbgst, chbgv, chbgvd, chbgvx, chbtrd, + checon, cheev, cheevd, cheevr, cheevx, chegs2, chegst, + chegv, chegvd, chegvx, cherfs, chesv, chesvx, chetd2, + chetf2, chetrd, + chetrf, chetri, chetrs, chgeqz, chpcon, chpev, chpevd, + chpevx, chpgst, chpgv, chpgvd, chpgvx, chprfs, chpsv, + chpsvx, + chptrd, chptrf, chptri, chptrs, chsein, chseqr, clabrd, + clacgv, clacon, clacn2, clacp2, clacpy, clacrm, clacrt, cladiv, + claed0, claed7, claed8, + claein, claesy, claev2, clags2, clagtm, + clahef, clahqr, + clahrd, clahr2, claic1, clals0, clalsa, clalsd, clangb, clange, clangt, + clanhb, clanhe, + clanhp, clanhs, clanht, clansb, clansp, clansy, clantb, + clantp, clantr, clapll, clapmt, clarcm, claqgb, claqge, + claqhb, claqhe, claqhp, claqp2, claqps, claqsb, + claqr0, claqr1, claqr2, claqr3, claqr4, claqr5, + claqsp, claqsy, clar1v, clar2v, clarf, clarfb, clarfg, clarft, + clarfx, clargv, clarnv, clarrv, clartg, clartv, + clarz, clarzb, clarzt, clascl, claset, clasr, classq, + clasyf, clatbs, clatdf, clatps, clatrd, clatrs, clatrz, + clatzm, cpbcon, cpbequ, cpbrfs, cpbstf, cpbsv, + cpbsvx, cpbtf2, cpbtrf, cpbtrs, cpocon, cpoequ, cporfs, + cposv, cposvx, cpotrs, cppcon, + cppequ, cpprfs, cppsv, cppsvx, cpptrf, cpptri, cpptrs, + cptcon, cpteqr, cptrfs, cptsv, cptsvx, cpttrf, cpttrs, cptts2, + crot, cspcon, cspmv, cspr, csprfs, cspsv, + cspsvx, csptrf, csptri, csptrs, csrscl, cstedc, + cstegr, cstein, csteqr, csycon, + csyr, csyrfs, csysv, csysvx, csytf2, csytrf, csytri, + csytrs, ctbcon, ctbrfs, ctbtrs, ctgevc, ctgex2, + ctgexc, ctgsen, ctgsja, ctgsna, ctgsy2, ctgsyl, ctpcon, + ctprfs, ctptri, + ctptrs, ctrcon, ctrevc, ctrexc, ctrrfs, ctrsen, ctrsna, + ctrsyl, ctrtrs, ctzrqf, ctzrzf, cung2l, cung2r, + cungbr, cunghr, cungl2, cunglq, cungql, cungqr, cungr2, + cungrq, cungtr, cunm2l, cunm2r, cunmbr, cunmhr, cunml2, + cunmlq, cunmql, cunmqr, cunmr2, cunmr3, cunmrq, cunmrz, + cunmtr, cupgtr, cupmtr, icmax1, scsum1, cstemr, + + dgbbrd, dgbcon, dgbequ, dgbrfs, dgbsv, + dgbsvx, dgbtf2, dgbtrf, dgbtrs, dgebak, dgebal, dgebd2, + dgebrd, dgecon, dgeequ, dgees, dgeesx, dgeev, dgeevx, + dgegs, dgegv, dgehd2, dgehrd, dgelq2, dgelqf, + dgels, dgelsd, dgelss, dgelsx, dgelsy, dgeql2, dgeqlf, + dgeqp3, dgeqpf, dgeqr2, dgeqrf, dgerfs, dgerq2, dgerqf, + dgesc2, dgesdd, dgesvd, dgesvx, dgetc2, + dgetri, + dggbak, dggbal, dgges, dggesx, dggev, dggevx, + dggglm, 
dgghrd, dgglse, dggqrf, + dggrqf, dggsvd, dggsvp, dgtcon, dgtrfs, dgtsv, + dgtsvx, dgttrf, dgttrs, dgtts2, dhgeqz, + dhsein, dhseqr, dlabrd, dlacon, dlacn2, + dlaein, dlaexc, dlag2, dlags2, dlagtm, dlagv2, dlahqr, + dlahrd, dlahr2, dlaic1, dlaln2, dlals0, dlalsa, dlalsd, + dlangb, dlange, dlangt, dlanhs, dlansb, dlansp, + dlansy, dlantb, dlantp, dlantr, dlanv2, + dlapll, dlapmt, + dlaqgb, dlaqge, dlaqp2, dlaqps, dlaqsb, dlaqsp, dlaqsy, + dlaqr0, dlaqr1, dlaqr2, dlaqr3, dlaqr4, dlaqr5, + dlaqtr, dlar1v, dlar2v, + dlarf, dlarfb, dlarfg, dlarft, dlarfx, dlargv, + dlarrv, dlartv, + dlarz, dlarzb, dlarzt, dlasy2, dlasyf, + dlatbs, dlatdf, dlatps, dlatrd, dlatrs, dlatrz, dlatzm, + dopgtr, dopmtr, dorg2l, dorg2r, + dorgbr, dorghr, dorgl2, dorglq, dorgql, dorgqr, dorgr2, + dorgrq, dorgtr, dorm2l, dorm2r, + dormbr, dormhr, dorml2, dormlq, dormql, dormqr, dormr2, + dormr3, dormrq, dormrz, dormtr, dpbcon, dpbequ, dpbrfs, + dpbstf, dpbsv, dpbsvx, + dpbtf2, dpbtrf, dpbtrs, dpocon, dpoequ, dporfs, dposv, + dposvx, dpotrs, dppcon, dppequ, + dpprfs, dppsv, dppsvx, dpptrf, dpptri, dpptrs, dptcon, + dpteqr, dptrfs, dptsv, dptsvx, dpttrs, dptts2, drscl, + dsbev, dsbevd, dsbevx, dsbgst, dsbgv, dsbgvd, dsbgvx, + dsbtrd, dspcon, dspev, dspevd, dspevx, dspgst, + dspgv, dspgvd, dspgvx, dsprfs, dspsv, dspsvx, dsptrd, + dsptrf, dsptri, dsptrs, dstegr, dstein, dstev, dstevd, dstevr, + dstevx, dsycon, dsyev, dsyevd, dsyevr, + dsyevx, dsygs2, dsygst, dsygv, dsygvd, dsygvx, dsyrfs, + dsysv, dsysvx, + dsytd2, dsytf2, dsytrd, dsytrf, dsytri, dsytrs, dtbcon, + dtbrfs, dtbtrs, dtgevc, dtgex2, dtgexc, dtgsen, + dtgsja, dtgsna, dtgsy2, dtgsyl, dtpcon, dtprfs, dtptri, + dtptrs, + dtrcon, dtrevc, dtrexc, dtrrfs, dtrsen, dtrsna, dtrsyl, + dtrtrs, dtzrqf, dtzrzf, dstemr, + dsgesv, dlag2s, slag2d, + + zbdsqr, zgbbrd, zgbcon, zgbequ, zgbrfs, zgbsv, zgbsvx, + zgbtf2, zgbtrf, zgbtrs, zgebak, zgebal, zgebd2, zgebrd, + zgecon, zgeequ, zgees, zgeesx, zgeev, zgeevx, + zgegs, zgegv, zgehd2, zgehrd, zgelq2, zgelqf, + zgels, zgelsd, zgelss, zgelsx, zgelsy, zgeql2, zgeqlf, zgeqp3, + zgeqpf, zgeqr2, zgeqrf, zgerfs, zgerq2, zgerqf, + zgesc2, zgesdd, zgesvd, zgesvx, zgetc2, + zgetri, + zggbak, zggbal, zgges, zggesx, zggev, zggevx, zggglm, + zgghrd, zgglse, zggqrf, zggrqf, + zggsvd, zggsvp, + zgtcon, zgtrfs, zgtsv, zgtsvx, zgttrf, zgttrs, zgtts2, zhbev, + zhbevd, zhbevx, zhbgst, zhbgv, zhbgvd, zhbgvx, zhbtrd, + zhecon, zheev, zheevd, zheevr, zheevx, zhegs2, zhegst, + zhegv, zhegvd, zhegvx, zherfs, zhesv, zhesvx, zhetd2, + zhetf2, zhetrd, + zhetrf, zhetri, zhetrs, zhgeqz, zhpcon, zhpev, zhpevd, + zhpevx, zhpgst, zhpgv, zhpgvd, zhpgvx, zhprfs, zhpsv, + zhpsvx, + zhptrd, zhptrf, zhptri, zhptrs, zhsein, zhseqr, zlabrd, + zlacgv, zlacon, zlacn2, zlacp2, zlacpy, zlacrm, zlacrt, zladiv, + zlaed0, zlaed7, zlaed8, + zlaein, zlaesy, zlaev2, zlags2, zlagtm, + zlahef, zlahqr, + zlahrd, zlahr2, zlaic1, zlals0, zlalsa, zlalsd, zlangb, zlange, + zlangt, zlanhb, + zlanhe, + zlanhp, zlanhs, zlanht, zlansb, zlansp, zlansy, zlantb, + zlantp, zlantr, zlapll, zlapmt, zlaqgb, zlaqge, + zlaqhb, zlaqhe, zlaqhp, zlaqp2, zlaqps, zlaqsb, + zlaqr0, zlaqr1, zlaqr2, zlaqr3, zlaqr4, zlaqr5, + zlaqsp, zlaqsy, zlar1v, zlar2v, zlarcm, zlarf, zlarfb, + zlarfg, zlarft, + zlarfx, zlargv, zlarnv, zlarrv, zlartg, zlartv, + zlarz, zlarzb, zlarzt, zlascl, zlaset, zlasr, + zlassq, zlasyf, + zlatbs, zlatdf, zlatps, zlatrd, zlatrs, zlatrz, zlatzm, + zpbcon, zpbequ, zpbrfs, zpbstf, zpbsv, + zpbsvx, zpbtf2, zpbtrf, zpbtrs, zpocon, zpoequ, zporfs, + zposv, zposvx, zpotrs, zppcon, + zppequ, 
zpprfs, zppsv, zppsvx, zpptrf, zpptri, zpptrs, + zptcon, zpteqr, zptrfs, zptsv, zptsvx, zpttrf, zpttrs, zptts2, + zrot, zspcon, zspmv, zspr, zsprfs, zspsv, + zspsvx, zsptrf, zsptri, zsptrs, zdrscl, zstedc, + zstegr, zstein, zsteqr, zsycon, + zsyr, zsyrfs, zsysv, zsysvx, zsytf2, zsytrf, zsytri, + zsytrs, ztbcon, ztbrfs, ztbtrs, ztgevc, ztgex2, + ztgexc, ztgsen, ztgsja, ztgsna, ztgsy2, ztgsyl, ztpcon, + ztprfs, ztptri, + ztptrs, ztrcon, ztrevc, ztrexc, ztrrfs, ztrsen, ztrsna, + ztrsyl, ztrtrs, ztzrqf, ztzrzf, zung2l, + zung2r, zungbr, zunghr, zungl2, zunglq, zungql, zungqr, zungr2, + zungrq, zungtr, zunm2l, zunm2r, zunmbr, zunmhr, zunml2, + zunmlq, zunmql, zunmqr, zunmr2, zunmr3, zunmrq, zunmrz, + zunmtr, zupgtr, + zupmtr, izmax1, dzsum1, zstemr, + zcgesv, zlag2c, clag2z, + ); + +if (-d "../lapack-3.1.1") { + @objs = (@blasobjs, @lapackobjs, @lapackobjs2); +} else { + @objs = (@blasobjs, @lapackobjs); +} + +if ($ARGV[3] == 1){ @objs = (@objs, @exblasobjs); }; + +if ($ARGV[1] eq "X86_64"){ @objs = (@objs, @gemm3mobjs); }; + +if ($ARGV[1] eq "x86"){ @objs = (@objs, @gemm3mobjs); }; + +if ($ARGV[1] eq "ia64"){ @objs = (@objs, @gemm3mobjs); }; + +if ($ARGV[1] eq "MIPS"){ @objs = (@objs, @gemm3mobjs); }; + +@linuxobjs = ('__strtol_internal', 'exit', 'free', 'getenv', 'malloc', + 'mmap', 'printf', 'sqrt', + 'pthread_cond_broadcast', 'pthread_cond_destroy', + 'pthread_cond_init', 'pthread_cond_signal', 'pthread_cond_wait', + 'pthread_create', 'pthread_exit', 'pthread_join', + 'pthread_mutex_destroy', 'pthread_mutex_init', + 'pthread_mutex_lock', 'pthread_mutex_unlock'); + +@hplobjs = (daxpy, dcopy, dscal, idamax, dgemv, dtrsv, dger, dgemm, dtrsm); +@hplobjs2 = (HPL_dlaswp00N, HPL_dlaswp01N, HPL_dlaswp01T); + +$bu = $ARGV[2]; + +$bu = "" if (($bu eq "0") || ($bu eq "1")); + +if ($ARGV[0] eq "linux"){ + foreach $objs (@objs) { + print $objs, $bu, "\n"; + } + + if ($ARGV[4] == 0) { + foreach $objs (@cblasobjs) { + print $objs, "\n"; + } + } + + foreach $objs (@linuxobjs) { + print $objs, "\n"; + } + exit(0); +} + +if ($ARGV[0] eq "osx"){ + foreach $objs (@objs) { + print "_", $objs, $bu, "\n"; + } + + if ($ARGV[4] == 0) { + foreach $objs (@cblasobjs) { + print "_", $objs, "\n"; + } + } + exit(0); +} + +if ($ARGV[0] eq "aix"){ + foreach $objs (@objs) { + print $objs, $bu, "\n"; + } + + if ($ARGV[4] == 0) { + foreach $objs (@cblasobjs) { + print $objs, "\n"; + } + } + exit(0); +} + +if ($ARGV[0] eq "win2k"){ + print "EXPORTS\n"; + $count = 1; + foreach $objs (@objs) { + $uppercase = $objs; + $uppercase =~ tr/[a-z]/[A-Z]/; + print "\t$objs=$objs","_ \@", $count, "\n"; + $count ++; + print "\t",$objs, "_=$objs","_ \@", $count, "\n"; + $count ++; + print "\t$uppercase=$objs", "_ \@", $count, "\n"; + $count ++; + } + + exit(0); +} + +if ($ARGV[0] eq "win2khpl"){ + print "EXPORTS\n"; + $count = 1; + foreach $objs (@hplobjs) { + $uppercase = $objs; + $uppercase =~ tr/[a-z]/[A-Z]/; + print "\t$objs=$objs","_ \@", $count, "\n"; + $count ++; + print "\t",$objs, "_=$objs","_ \@", $count, "\n"; + $count ++; + print "\t$uppercase=$objs", "_ \@", $count, "\n"; + $count ++; + } + +# foreach $objs (@hplobjs2) { +# print "\t$objs=$objs"," \@", $count, "\n"; +# $count ++; +# } + + exit(0); +} + +if ($ARGV[0] eq "microsoft"){ + print "EXPORTS\n"; + $count = 1; + foreach $objs (@objs) { + $uppercase = $objs; + $uppercase =~ tr/[a-z]/[A-Z]/; + print "\t$objs = $objs","_\n"; + $count ++; + print "\t$objs\_ = $objs","_\n"; + $count ++; + print "\t$uppercase = $objs","_\n"; + $count ++; + print "\t$uppercase\_ = $objs","_\n"; 
+ $count ++; + } + exit(0); +} + +if ($ARGV[0] eq "win2kasm"){ + print "\t.text\n"; + foreach $objs (@objs) { + $uppercase = $objs; + $uppercase =~ tr/[a-z]/[A-Z]/; + print "\t.align 16\n"; + print "\t.globl _", $uppercase, "_\n"; + print "_", $uppercase, "_:\n"; + print "\tjmp\t_", $objs, "_\n"; + } + exit(0); +} + +if ($ARGV[0] eq "linktest"){ + print "int main(void){\n"; + foreach $objs (@objs) { + print $objs, $bu, "();\n" if $objs ne "xerbla"; + } + if ($ARGV[4] == 0) { + foreach $objs (@cblasobjs) { + print $objs, "();\n"; + } + } + + + + + print "return 0;}\n"; + exit(0); +} + diff --git a/f_check b/f_check new file mode 100644 index 0000000000..26c57bcc9b --- /dev/null +++ b/f_check @@ -0,0 +1,302 @@ +#!/usr/bin/perl + +# +# 1. Not specified +# 1.1 Automatically detect, then check compiler +# 1.2 If no fortran compiler is detected, g77 is default with NOFORTRAN definition +# 2. Specified +# 2.1 If path is correct, check compiler +# 2.2 If path is not correct, but still valid compiler name, force setting +# 2.2.2 Path is not correct, invalid compiler name, then g77 is default with NOFORTRAN definition +# + +$makefile = shift(@ARGV); +$config = shift(@ARGV); + +$nofortran = 0; + +$compiler = join(" ", @ARGV); + +# f77 is too ambiguous +$compiler = "" if $compiler eq "f77"; + +@path = split(/:/, $ENV{"PATH"}); + +if ($compiler eq "") { + + @lists = ("f77", "g77", "g95", "gfortran", "frt", "fort", "openf90", "openf95", + "sunf77", "sunf90", "sunf95", + "xlf95", "xlf90", "xlf", + "ppuf77", "ppuf95", "ppuf90", "ppuxlf", + "pathf90", "pathf95", + "pgf95", "pgf90", "pgf77", + "ifort"); + + foreach $lists (@lists) { + foreach $path (@path) { + if (-f $path . "/" . $lists) { + $compiler = $lists; + break; + } + } + } + +} + +if ($compiler eq "") { + + $nofortran = 1; + $compiler = "g77"; + $vendor = G77; + $bu = "_"; + +} else { + + $data = `which $compiler > /dev/null 2> /dev/null`; + $vendor = ""; + + if (!$?) 
{ + + $data = `$compiler -O2 -S ftest.f > /dev/null 2>&1 && cat ftest.s && rm -f ftest.s`; + + if ($data =~ /zhoge_/) { + $bu = "_"; + } + + if ($data =~ /GNU/) { + + $data =~ /(\d)\.(\d).(\d)/; + $major = $1; + $minor = $2; + + if ($major >= 4) { + $vendor = GFORTRAN; + $openmp = "-fopenmp"; + } else { + $vendor = G77; + $openmp = ""; + } + + } + + if ($data =~ /g95/) { + $vendor = G95; + $openmp = ""; + } + + if ($data =~ /Intel/) { + $vendor = INTEL; + $openmp = "-openmp"; + } + + if ($data =~ /Sun Fortran/) { + $vendor = SUN; + $openmp = "-xopenmp=parallel"; + } + + if ($data =~ /PathScale/) { + $vendor = PATHSCALE; + $openmp = "-openmp"; + } + + if ($data =~ /Open64/) { + $vendor = OPEN64; + $openmp = "-mp"; + } + + if ($data =~ /PGF/) { + $vendor = PGI; + $openmp = "-mp"; + } + + if ($data =~ /IBM/) { + $vendor = IBM; + $openmp = "-openmp"; + } + } + + if ($vendor eq "") { + + if ($compiler =~ /g77/) { + $vendor = G77; + $bu = "_"; + $openmp = ""; + } + + if ($compiler =~ /g95/) { + $vendor = G95; + $bu = "_"; + $openmp = ""; + } + + if ($compiler =~ /gfortran/) { + $vendor = GFORTRAN; + $bu = "_"; + $openmp = "-fopenmp"; + } + + if ($compiler =~ /ifort/) { + $vendor = INTEL; + $bu = "_"; + $openmp = "-openmp"; + } + + if ($compiler =~ /pathf/) { + $vendor = PATHSCALE; + $bu = "_"; + $openmp = "-mp"; + } + + if ($compiler =~ /pgf/) { + $vendor = PGI; + $bu = "_"; + $openmp = "-mp"; + } + + if ($compiler =~ /ftn/) { + $vendor = PGI; + $bu = "_"; + $openmp = "-openmp"; + } + + if ($compiler =~ /frt/) { + $vendor = FUJITSU; + $bu = "_"; + $openmp = "-openmp"; + } + + if ($compiler =~ /sunf77|sunf90|sunf95/) { + $vendor = SUN; + $bu = "_"; + $openmp = "-xopenmp=parallel"; + } + + if ($compiler =~ /ppuf/) { + $vendor = IBM; + $openmp = "-openmp"; + } + + if ($compiler =~ /xlf/) { + $vendor = IBM; + $openmp = "-openmp"; + } + + if ($compiler =~ /open64/) { + $vendor = OPEN64; + $openmp = "-mp"; + } + + if ($vendor eq "") { + $nofortran = 1; + $compiler = "g77"; + $vendor = G77; + $bu = "_"; + $openmp = ""; + } + + } +} + +$data = `which $compiler > /dev/null 2> /dev/null`; + +if (!$?) { + + $binary = $ENV{"BINARY"}; + + $openmp = "" if $ENV{USE_OPENMP} != 1; + + if ($binary == 32) { + $link = `$compiler $openmp -m32 -v ftest2.f 2>&1 && rm -f a.out a.exe`; + if ($?) { + $link = `$compiler $openmp -q32 -v ftest2.f 2>&1 && rm -f a.out a.exe`; + } + $binary = "" if ($?); + } + + if ($binary == 64) { + $link = `$compiler $openmp -m64 -v ftest2.f 2>&1 && rm -f a.out a.exe`; + if ($?) { + $link = `$compiler $openmp -q64 -v ftest2.f 2>&1 && rm -f a.out a.exe`; + } + $binary = "" if ($?); + } + + if ($binary eq "") { + $link = `$compiler $openmp -v ftest2.f 2>&1 && rm -f a.out a.exe`; + } +} + +$linker_L = ""; +$linker_l = ""; +$linker_a = ""; + +if ($link ne "") { + + $link =~ s/\-Y\sP\,/\-Y/g; + + $link =~ s/\-rpath\s+/\-rpath\@/g; + + @flags = split(/[\s\,\n]/, $link); + + foreach $flags (@flags) { + if ( + ($flags =~ /^\-L/) + && ($flags !~ /^-LIST:/) + && ($flags !~ /^-LANG:/) + ) { + if ($vendor eq "PGI") { + $flags =~ s/lib$/libso/; + } + $linker_L .= $flags . " "; + } + + if ($flags =~ /^\-Y/) { + $linker_L .= "-Wl,". $flags . " "; + } + + if ($flags =~ /^\-rpath/) { + $flags =~ s/\@/\,/g; + if ($vendor eq "PGI") { + $flags =~ s/lib$/libso/; + } + $linker_L .= "-Wl,". $flags . 
" " ; + } + + if ( + ($flags =~ /^\-l/) + && ($flags !~ /gfortranbegin/) + && ($flags !~ /frtbegin/) + && ($flags !~ /pathfstart/) + && ($flags !~ /numa/) + && ($flags !~ /crt[0-9]/) + && ($flags !~ /gcc/) + && ($flags !~ /user32/) + && ($flags !~ /kernel32/) + && ($flags !~ /advapi32/) + && ($flags !~ /shell32/) + ) { + $linker_l .= $flags . " "; + } + + $linker_a .= $flags . " " if $flags =~ /\.a$/; + } + +} + +open(MAKEFILE, ">> $makefile") || die "Can't append $makefile"; +open(CONFFILE, ">> $config" ) || die "Can't append $config"; + +print MAKEFILE "F_COMPILER=$vendor\n"; +print MAKEFILE "FC=$compiler\n"; +print MAKEFILE "BU=$bu\n" if $bu ne ""; +print MAKEFILE "NOFORTRAN=1\n" if $nofortran == 1; + +print CONFFILE "#define BUNDERSCORE\t$bu\n" if $bu ne ""; +print CONFFILE "#define NEEDBUNDERSCORE\t1\n" if $bu ne ""; + +if (($linker_l ne "") || ($linker_a ne "")) { + print MAKEFILE "FEXTRALIB=$linker_L $linker_l $linker_a\n"; +} + +close(MAKEFILE); +close(CONFFILE); diff --git a/ftest.f b/ftest.f new file mode 100644 index 0000000000..94ba566f55 --- /dev/null +++ b/ftest.f @@ -0,0 +1,6 @@ + double complex function zhoge() + + zhoge = (0.0d0,0.0d0) + + return + end diff --git a/ftest2.f b/ftest2.f new file mode 100644 index 0000000000..1d9a11489f --- /dev/null +++ b/ftest2.f @@ -0,0 +1,3 @@ + program main + + end diff --git a/getarch.c b/getarch.c new file mode 100644 index 0000000000..347dbb1de4 --- /dev/null +++ b/getarch.c @@ -0,0 +1,732 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#if defined(__WIN32__) || defined(__WIN64__) || defined(__CYGWIN32__) || defined(__CYGWIN64__) +#define OS_WINDOWS +#endif + +#include <stdio.h> +#include <string.h> +#ifdef OS_WINDOWS +#include <windows.h> +#endif +#if defined(__FreeBSD__) || defined(__APPLE__) +#include <sys/types.h> +#include <sys/sysctl.h> +#endif +#ifdef linux +#include <sys/sysinfo.h> +#endif + +/* #define FORCE_P2 */ +/* #define FORCE_KATMAI */ +/* #define FORCE_COPPERMINE */ +/* #define FORCE_NORTHWOOD */ +/* #define FORCE_PRESCOTT */ +/* #define FORCE_BANIAS */ +/* #define FORCE_YONAH */ +/* #define FORCE_CORE2 */ +/* #define FORCE_PENRYN */ +/* #define FORCE_DUNNINGTON */ +/* #define FORCE_NEHALEM */ +/* #define FORCE_ATHLON */ +/* #define FORCE_OPTERON */ +/* #define FORCE_OPTERON_SSE3 */ +/* #define FORCE_BARCELONA */ +/* #define FORCE_SHANGHAI */ +/* #define FORCE_ISTANBUL */ +/* #define FORCE_SSE_GENERIC */ +/* #define FORCE_VIAC3 */ +/* #define FORCE_NANO */ +/* #define FORCE_POWER3 */ +/* #define FORCE_POWER4 */ +/* #define FORCE_POWER5 */ +/* #define FORCE_POWER6 */ +/* #define FORCE_PPCG4 */ +/* #define FORCE_PPC970 */ +/* #define FORCE_PPC970MP */ +/* #define FORCE_PPC440 */ +/* #define FORCE_PPC440FP2 */ +/* #define FORCE_CELL */ +/* #define FORCE_SICORTEX */ +/* #define FORCE_ITANIUM2 */ +/* #define FORCE_GENERIC */ +/* #define FORCE_SPARC */ +/* #define FORCE_SPARCV7 */ + +#ifdef FORCE_P2 +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "PENTIUM2" +#define ARCHCONFIG "-DPENTIUM2 " \ + "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ + "-DDTB_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX" +#define LIBNAME "p2" +#define CORENAME "P5" +#endif + +#ifdef FORCE_COPPERMINE +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "PENTIUM3" +#define ARCHCONFIG "-DPENTIUM3 " \ + "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \ + "-DDTB_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE " +#define LIBNAME "coppermine" +#define CORENAME "COPPERMINE" +#endif + +#ifdef FORCE_KATMAI +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "PENTIUM3" +#define ARCHCONFIG "-DPENTIUM3 " \ + "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=524288 -DL2_LINESIZE=32 " \ + "-DDTB_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE " +#define LIBNAME "katmai" +#define CORENAME "KATMAI" +#endif + +#ifdef FORCE_NORTHWOOD +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "PENTIUM4" +#define ARCHCONFIG "-DPENTIUM4 " \ + "-DL1_DATA_SIZE=8192 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=524288 -DL2_LINESIZE=64 " \ + "-DDTB_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 " +#define LIBNAME "northwood" +#define CORENAME "NORTHWOOD" +#endif + +#ifdef FORCE_PRESCOTT +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "PENTIUM4" +#define ARCHCONFIG "-DPENTIUM4 " \ + "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \ + "-DDTB_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3" +#define LIBNAME "prescott" +#define CORENAME "PRESCOTT" +#endif + +#ifdef FORCE_BANIAS +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "BANIAS" +#define ARCHCONFIG
"-DPENTIUMM " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \ + "-DDTB_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 " +#define LIBNAME "banias" +#define CORENAME "BANIAS" +#endif + +#ifdef FORCE_YONAH +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "YONAH" +#define ARCHCONFIG "-DPENTIUMM " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \ + "-DDTB_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 " +#define LIBNAME "yonah" +#define CORENAME "YONAH" +#endif + +#ifdef FORCE_CORE2 +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "CONRORE" +#define ARCHCONFIG "-DCORE2 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \ + "-DDTB_ENTRIES=256 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3" +#define LIBNAME "core2" +#define CORENAME "CORE2" +#endif + +#ifdef FORCE_PENRYN +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "PENRYN" +#define ARCHCONFIG "-DPENRYN " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \ + "-DDTB_ENTRIES=256 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1" +#define LIBNAME "penryn" +#define CORENAME "PENRYN" +#endif + +#ifdef FORCE_DUNNINGTON +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "DUNNINGTON" +#define ARCHCONFIG "-DDUNNINGTON " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \ + "-DL3_SIZE=16777216 -DL3_LINESIZE=64 " \ + "-DDTB_ENTRIES=256 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1" +#define LIBNAME "dunnington" +#define CORENAME "DUNNINGTON" +#endif + +#ifdef FORCE_NEHALEM +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "NEHALEM" +#define ARCHCONFIG "-DNEHALEM " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2" +#define LIBNAME "nehalem" +#define CORENAME "NEHALEM" +#endif + +#ifdef FORCE_ATOM +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "ATOM" +#define ARCHCONFIG "-DATOM " \ + "-DL1_DATA_SIZE=24576 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=524288 -DL2_LINESIZE=64 " \ + "-DDTB_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3" +#define LIBNAME "atom" +#define CORENAME "ATOM" +#endif + +#ifdef FORCE_ATHLON +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "ATHLON" +#define ARCHCONFIG "-DATHLON " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \ + "-DDTB_ENTRIES=32 -DDTB_SIZE=4096 -DHAVE_3DNOW " \ + "-DHAVE_3DNOWEX -DHAVE_MMX -DHAVE_SSE " +#define LIBNAME "athlon" +#define CORENAME "ATHLON" +#endif + +#ifdef FORCE_OPTERON +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "OPTERON" +#define ARCHCONFIG "-DOPTERON " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=64 " 
\ + "-DDTB_ENTRIES=32 -DDTB_SIZE=4096 -DHAVE_3DNOW " \ + "-DHAVE_3DNOWEX -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 " +#define LIBNAME "opteron" +#define CORENAME "OPTERON" +#endif + +#ifdef FORCE_OPTERON_SSE3 +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "OPTERON" +#define ARCHCONFIG "-DOPTERON " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \ + "-DDTB_ENTRIES=32 -DDTB_SIZE=4096 -DHAVE_3DNOW " \ + "-DHAVE_3DNOWEX -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3" +#define LIBNAME "opteron" +#define CORENAME "OPTERON" +#endif + +#if defined(FORCE_BARCELONA) || defined(FORCE_SHANGHAI) || defined(FORCE_ISTANBUL) +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "BARCELONA" +#define ARCHCONFIG "-DBARCELONA " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=524288 -DL2_LINESIZE=64 -DL3_SIZE=2097152 " \ + "-DDTB_ENTRIES=48 -DDTB_SIZE=4096 -DHAVE_3DNOW " \ + "-DHAVE_3DNOWEX -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 " \ + "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU" +#define LIBNAME "barcelona" +#define CORENAME "BARCELONA" +#endif + +#ifdef FORCE_SSE_GENERIC +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "GENERIC" +#define ARCHCONFIG "-DGENERIC " \ + "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=524288 -DL2_LINESIZE=64 " \ + "-DDTB_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2" +#define LIBNAME "generic" +#define CORENAME "GENERIC" +#endif + +#ifdef FORCE_VIAC3 +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "VIAC3" +#define ARCHCONFIG "-DVIAC3 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=65536 -DL2_LINESIZE=32 " \ + "-DDTB_ENTRIES=128 -DDTB_SIZE=4096 " \ + "-DHAVE_MMX -DHAVE_SSE " +#define LIBNAME "viac3" +#define CORENAME "VIAC3" +#endif + +#ifdef FORCE_NANO +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "NANO" +#define ARCHCONFIG "-DNANO " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \ + "-DDTB_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3" +#define LIBNAME "nano" +#define CORENAME "NANO" +#endif + +#ifdef FORCE_POWER3 +#define FORCE +#define ARCHITECTURE "POWER" +#define SUBARCHITECTURE "POWER3" +#define SUBDIRNAME "power" +#define ARCHCONFIG "-DPOWER3 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=128 " \ + "-DL2_SIZE=2097152 -DL2_LINESIZE=128 " \ + "-DDTB_ENTRIES=256 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " +#define LIBNAME "power3" +#define CORENAME "POWER3" +#endif + +#ifdef FORCE_POWER4 +#define FORCE +#define ARCHITECTURE "POWER" +#define SUBARCHITECTURE "POWER4" +#define SUBDIRNAME "power" +#define ARCHCONFIG "-DPOWER4 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \ + "-DL2_SIZE=1509949 -DL2_LINESIZE=128 " \ + "-DDTB_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=6 " +#define LIBNAME "power4" +#define CORENAME "POWER4" +#endif + +#ifdef FORCE_POWER5 +#define FORCE +#define ARCHITECTURE "POWER" +#define SUBARCHITECTURE "POWER5" +#define SUBDIRNAME "power" +#define ARCHCONFIG "-DPOWER5 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \ + "-DL2_SIZE=1509949 -DL2_LINESIZE=128 " \ + "-DDTB_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=6 " +#define LIBNAME "power5" +#define CORENAME "POWER5" 
+#endif + +#ifdef FORCE_POWER6 +#define FORCE +#define ARCHITECTURE "POWER" +#define SUBARCHITECTURE "POWER6" +#define SUBDIRNAME "power" +#define ARCHCONFIG "-DPOWER6 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=128 " \ + "-DL2_SIZE=4194304 -DL2_LINESIZE=128 " \ + "-DDTB_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " +#define LIBNAME "power6" +#define CORENAME "POWER6" +#endif + +#ifdef FORCE_PPCG4 +#define FORCE +#define ARCHITECTURE "POWER" +#define SUBARCHITECTURE "PPCG4" +#define SUBDIRNAME "power" +#define ARCHCONFIG "-DPPCG4 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \ + "-DDTB_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " +#define LIBNAME "ppcg4" +#define CORENAME "PPCG4" +#endif + +#ifdef FORCE_PPC970 +#define FORCE +#define ARCHITECTURE "POWER" +#define SUBARCHITECTURE "PPC970" +#define SUBDIRNAME "power" +#define ARCHCONFIG "-DPPC970 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \ + "-DL2_SIZE=512488 -DL2_LINESIZE=128 " \ + "-DDTB_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " +#define LIBNAME "ppc970" +#define CORENAME "PPC970" +#endif + +#ifdef FORCE_PPC970MP +#define FORCE +#define ARCHITECTURE "POWER" +#define SUBARCHITECTURE "PPC970" +#define SUBDIRNAME "power" +#define ARCHCONFIG "-DPPC970 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \ + "-DL2_SIZE=1024976 -DL2_LINESIZE=128 " \ + "-DDTB_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " +#define LIBNAME "ppc970mp" +#define CORENAME "PPC970" +#endif + +#ifdef FORCE_PPC440 +#define FORCE +#define ARCHITECTURE "POWER" +#define SUBARCHITECTURE "PPC440" +#define SUBDIRNAME "power" +#define ARCHCONFIG "-DPPC440 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=16384 -DL2_LINESIZE=128 " \ + "-DDTB_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 " +#define LIBNAME "ppc440" +#define CORENAME "PPC440" +#endif + +#ifdef FORCE_PPC440FP2 +#define FORCE +#define ARCHITECTURE "POWER" +#define SUBARCHITECTURE "PPC440FP2" +#define SUBDIRNAME "power" +#define ARCHCONFIG "-DPPC440FP2 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=16384 -DL2_LINESIZE=128 " \ + "-DDTB_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 " +#define LIBNAME "ppc440FP2" +#define CORENAME "PPC440FP2" +#endif + +#ifdef FORCE_CELL +#define FORCE +#define ARCHITECTURE "POWER" +#define SUBARCHITECTURE "CELL" +#define SUBDIRNAME "power" +#define ARCHCONFIG "-DCELL " \ + "-DL1_DATA_SIZE=262144 -DL1_DATA_LINESIZE=128 " \ + "-DL2_SIZE=512488 -DL2_LINESIZE=128 " \ + "-DDTB_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " +#define LIBNAME "cell" +#define CORENAME "CELL" +#endif + +#ifdef FORCE_SICORTEX +#define FORCE +#define ARCHITECTURE "MIPS" +#define SUBARCHITECTURE "SICORTEX" +#define SUBDIRNAME "mips" +#define ARCHCONFIG "-DSICORTEX " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ + "-DDTB_ENTRIES=32 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " +#define LIBNAME "mips" +#define CORENAME "sicortex" +#endif + +#ifdef FORCE_ITANIUM2 +#define FORCE +#define ARCHITECTURE "IA64" +#define SUBARCHITECTURE "ITANIUM2" +#define SUBDIRNAME "ia64" +#define ARCHCONFIG "-DITANIUM2 " \ + "-DL1_DATA_SIZE=262144 -DL1_DATA_LINESIZE=128 " \ + "-DL2_SIZE=1572864 -DL2_LINESIZE=128 -DDTB_SIZE=16384 -DDTB_ENTRIES=128 " +#define LIBNAME "itanium2" +#define CORENAME "itanium2" +#endif + +#ifdef FORCE_SPARC +#define FORCE +#define ARCHITECTURE "SPARC" +#define SUBARCHITECTURE "SPARC" +#define SUBDIRNAME "sparc" +#define ARCHCONFIG "-DSPARC -DV9 " \ + 
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=1572864 -DL2_LINESIZE=64 -DDTB_SIZE=8192 -DDTB_ENTRIES=64 " +#define LIBNAME "sparc" +#define CORENAME "sparc" +#endif + +#ifdef FORCE_SPARCV7 +#define FORCE +#define ARCHITECTURE "SPARC" +#define SUBARCHITECTURE "SPARC" +#define SUBDIRNAME "sparc" +#define ARCHCONFIG "-DSPARC -DV7 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=1572864 -DL2_LINESIZE=64 -DDTB_SIZE=8192 -DDTB_ENTRIES=64 " +#define LIBNAME "sparcv7" +#define CORENAME "sparcv7" +#endif + +#ifdef FORCE_GENERIC +#define FORCE +#define ARCHITECTURE "GENERIC" +#define SUBARCHITECTURE "GENERIC" +#define SUBDIRNAME "generic" +#define ARCHCONFIG "-DGENERIC " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \ + "-DL2_SIZE=512488 -DL2_LINESIZE=128 " \ + "-DDTB_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " +#define LIBNAME "generic" +#define CORENAME "generic" +#endif + +#ifndef FORCE + +#if defined(__powerpc__) || defined(__powerpc) || defined(powerpc) || \ + defined(__PPC__) || defined(PPC) || defined(_POWER) || defined(__POWERPC__) +#ifndef POWER +#define POWER +#endif +#endif + +#if defined(__i386__) || (__x86_64__) +#include "cpuid_x86.c" +#endif + +#ifdef __ia64__ +#include "cpuid_ia64.c" +#endif + +#ifdef __alpha +#include "cpuid_alpha.c" +#endif + +#ifdef POWER +#include "cpuid_power.c" +#endif + +#ifdef sparc +#include "cpuid_sparc.c" +#endif + +#ifdef __mips__ +#include "cpuid_mips.c" +#endif + +#else + +#endif + +static int get_num_cores(void) { + +#ifdef OS_WINDOWS + SYSTEM_INFO sysinfo; +#elif defined(__FreeBSD__) || defined(__APPLE__) + int m[2], count; + size_t len; +#endif + +#ifdef linux + return get_nprocs(); + +#elif defined(OS_WINDOWS) + + GetSystemInfo(&sysinfo); + return sysinfo.dwNumberOfProcessors; + +#elif defined(__FreeBSD__) || defined(__APPLE__) + m[0] = CTL_HW; + m[1] = HW_NCPU; + len = sizeof(int); + sysctl(m, 2, &count, &len, NULL, 0); + + return count; +#else + return 2; +#endif +} + +int main(int argc, char *argv[]){ + +#ifdef FORCE + char buffer[8192], *p, *q; + int length; +#endif + + if (argc == 1) return 0; + + switch (argv[1][0]) { + + case '0' : /* for Makefile */ + +#ifdef FORCE + printf("CORE=%s\n", CORENAME); +#else +#if defined(__i386__) || defined(__x86_64__) || defined(POWER) + printf("CORE=%s\n", get_corename()); +#endif +#endif + +#ifdef FORCE + printf("LIBCORE=%s\n", LIBNAME); +#else + printf("LIBCORE="); + get_libname(); + printf("\n"); +#endif + + printf("NUM_CORES=%d\n", get_num_cores()); + +#if defined(__i386__) || defined(__x86_64__) +#ifndef FORCE + get_sse(); +#else + + sprintf(buffer, "%s", ARCHCONFIG); + + p = &buffer[0]; + + while (*p) { + if ((*p == '-') && (*(p + 1) == 'D')) { + p += 2; + + while ((*p != ' ') && (*p != '\0')) { + + if (*p == '=') { + printf("="); + p ++; + while ((*p != ' ') && (*p != '\0')) { + printf("%c", *p); + p ++; + } + } else { + printf("%c", *p); + p ++; + if ((*p == ' ') || (*p =='\0')) printf("=1"); + } + } + + printf("\n"); + } else p ++; + } +#endif +#endif + +#ifndef OS_WINDOWS + printf("MAKE += -j %d\n", get_num_cores()); +#endif + + break; + + case '1' : /* For config.h */ +#ifdef FORCE + sprintf(buffer, "%s -DCORE_%s\n", ARCHCONFIG, CORENAME); + + p = &buffer[0]; + while (*p) { + if ((*p == '-') && (*(p + 1) == 'D')) { + p += 2; + printf("#define "); + + while ((*p != ' ') && (*p != '\0')) { + + if (*p == '=') { + printf(" "); + p ++; + while ((*p != ' ') && (*p != '\0')) { + printf("%c", *p); + p ++; + } + } else { + printf("%c", *p); + p ++; + } + } + + 
printf("\n"); + } else p ++; + } +#else + get_cpuconfig(); +#endif + break; + + case '2' : /* SMP */ + if (get_num_cores() > 1) printf("SMP=1\n"); + break; + } + + fflush(stdout); + + return 0; +} + diff --git a/getarch_2nd.c b/getarch_2nd.c new file mode 100644 index 0000000000..31babd28a5 --- /dev/null +++ b/getarch_2nd.c @@ -0,0 +1,36 @@ +#include +#ifndef BUILD_KERNEL +#include "config.h" +#else +#include "config_kernel.h" +#endif +#include "param.h" + +int main(int argc, char **argv) { + + if ((argc < 1) || (*argv[1] == '0')) { + printf("SGEMM_UNROLL_M=%d\n", SGEMM_DEFAULT_UNROLL_M); + printf("SGEMM_UNROLL_N=%d\n", SGEMM_DEFAULT_UNROLL_N); + printf("DGEMM_UNROLL_M=%d\n", DGEMM_DEFAULT_UNROLL_M); + printf("DGEMM_UNROLL_N=%d\n", DGEMM_DEFAULT_UNROLL_N); + printf("QGEMM_UNROLL_M=%d\n", QGEMM_DEFAULT_UNROLL_M); + printf("QGEMM_UNROLL_N=%d\n", QGEMM_DEFAULT_UNROLL_N); + + printf("CGEMM_UNROLL_M=%d\n", CGEMM_DEFAULT_UNROLL_M); + printf("CGEMM_UNROLL_N=%d\n", CGEMM_DEFAULT_UNROLL_N); + printf("ZGEMM_UNROLL_M=%d\n", ZGEMM_DEFAULT_UNROLL_M); + printf("ZGEMM_UNROLL_N=%d\n", ZGEMM_DEFAULT_UNROLL_N); + printf("XGEMM_UNROLL_M=%d\n", XGEMM_DEFAULT_UNROLL_M); + printf("XGEMM_UNROLL_N=%d\n", XGEMM_DEFAULT_UNROLL_N); + } + + + if ((argc >= 1) && (*argv[1] == '1')) { + printf("#define SLOCAL_BUFFER_SIZE\t%ld\n", (SGEMM_DEFAULT_Q * SGEMM_DEFAULT_UNROLL_N * 4 * 1 * sizeof(float))); + printf("#define DLOCAL_BUFFER_SIZE\t%ld\n", (DGEMM_DEFAULT_Q * DGEMM_DEFAULT_UNROLL_N * 2 * 1 * sizeof(double))); + printf("#define CLOCAL_BUFFER_SIZE\t%ld\n", (CGEMM_DEFAULT_Q * CGEMM_DEFAULT_UNROLL_N * 4 * 2 * sizeof(float))); + printf("#define ZLOCAL_BUFFER_SIZE\t%ld\n", (ZGEMM_DEFAULT_Q * ZGEMM_DEFAULT_UNROLL_N * 2 * 2 * sizeof(double))); + } + + return 0; +} diff --git a/interface/Makefile b/interface/Makefile new file mode 100644 index 0000000000..5bfc5f389d --- /dev/null +++ b/interface/Makefile @@ -0,0 +1,1942 @@ +TOPDIR = .. 
+include $(TOPDIR)/Makefile.system + +ifeq ($(ARCH), x86) +SUPPORT_GEMM3M = 1 +endif + +ifeq ($(ARCH), x86_64) +SUPPORT_GEMM3M = 1 +endif + +ifeq ($(ARCH), ia64) +SUPPORT_GEMM3M = 1 +endif + +ifeq ($(ARCH), MIPS) +SUPPORT_GEMM3M = 1 +endif + +ifndef NO_FBLAS + +SBLAS1OBJS = \ + saxpy.$(SUFFIX) sswap.$(SUFFIX) \ + scopy.$(SUFFIX) sscal.$(SUFFIX) \ + sdot.$(SUFFIX) sdsdot.$(SUFFIX) dsdot.$(SUFFIX) \ + sasum.$(SUFFIX) snrm2.$(SUFFIX) \ + smax.$(SUFFIX) samax.$(SUFFIX) ismax.$(SUFFIX) isamax.$(SUFFIX) \ + smin.$(SUFFIX) samin.$(SUFFIX) ismin.$(SUFFIX) isamin.$(SUFFIX) \ + srot.$(SUFFIX) srotg.$(SUFFIX) srotm.$(SUFFIX) srotmg.$(SUFFIX) \ + +SBLAS2OBJS = \ + sgemv.$(SUFFIX) sger.$(SUFFIX) \ + strsv.$(SUFFIX) strmv.$(SUFFIX) ssymv.$(SUFFIX) \ + ssyr.$(SUFFIX) ssyr2.$(SUFFIX) sgbmv.$(SUFFIX) \ + ssbmv.$(SUFFIX) sspmv.$(SUFFIX) \ + sspr.$(SUFFIX) sspr2.$(SUFFIX) \ + stbsv.$(SUFFIX) stbmv.$(SUFFIX) \ + stpsv.$(SUFFIX) stpmv.$(SUFFIX) + +SBLAS3OBJS = \ + sgemm.$(SUFFIX) ssymm.$(SUFFIX) strmm.$(SUFFIX) \ + strsm.$(SUFFIX) ssyrk.$(SUFFIX) ssyr2k.$(SUFFIX) + +DBLAS1OBJS = \ + daxpy.$(SUFFIX) dswap.$(SUFFIX) \ + dcopy.$(SUFFIX) dscal.$(SUFFIX) \ + ddot.$(SUFFIX) \ + dasum.$(SUFFIX) dnrm2.$(SUFFIX) \ + dmax.$(SUFFIX) damax.$(SUFFIX) idmax.$(SUFFIX) idamax.$(SUFFIX) \ + dmin.$(SUFFIX) damin.$(SUFFIX) idmin.$(SUFFIX) idamin.$(SUFFIX) \ + drot.$(SUFFIX) drotg.$(SUFFIX) drotm.$(SUFFIX) drotmg.$(SUFFIX) \ + +DBLAS2OBJS = \ + dgemv.$(SUFFIX) dger.$(SUFFIX) \ + dtrsv.$(SUFFIX) dtrmv.$(SUFFIX) dsymv.$(SUFFIX) \ + dsyr.$(SUFFIX) dsyr2.$(SUFFIX) dgbmv.$(SUFFIX) \ + dsbmv.$(SUFFIX) dspmv.$(SUFFIX) \ + dspr.$(SUFFIX) dspr2.$(SUFFIX) \ + dtbsv.$(SUFFIX) dtbmv.$(SUFFIX) \ + dtpsv.$(SUFFIX) dtpmv.$(SUFFIX) + +DBLAS3OBJS = \ + dgemm.$(SUFFIX) dsymm.$(SUFFIX) dtrmm.$(SUFFIX) \ + dtrsm.$(SUFFIX) dsyrk.$(SUFFIX) dsyr2k.$(SUFFIX) + +CBLAS1OBJS = \ + caxpy.$(SUFFIX) caxpyc.$(SUFFIX) cswap.$(SUFFIX) \ + ccopy.$(SUFFIX) cscal.$(SUFFIX) csscal.$(SUFFIX) \ + cdotc.$(SUFFIX) cdotu.$(SUFFIX) \ + scasum.$(SUFFIX) scnrm2.$(SUFFIX) \ + scamax.$(SUFFIX) icamax.$(SUFFIX) \ + scamin.$(SUFFIX) icamin.$(SUFFIX) \ + csrot.$(SUFFIX) crotg.$(SUFFIX) \ + +CBLAS2OBJS = \ + cgemv.$(SUFFIX) cgeru.$(SUFFIX) cgerc.$(SUFFIX) \ + ctrsv.$(SUFFIX) ctrmv.$(SUFFIX) csymv.$(SUFFIX) \ + csyr.$(SUFFIX) csyr2.$(SUFFIX) cgbmv.$(SUFFIX) \ + csbmv.$(SUFFIX) cspmv.$(SUFFIX) \ + cspr.$(SUFFIX) cspr2.$(SUFFIX) \ + ctbsv.$(SUFFIX) ctbmv.$(SUFFIX) \ + ctpsv.$(SUFFIX) ctpmv.$(SUFFIX) \ + chemv.$(SUFFIX) chbmv.$(SUFFIX) \ + cher.$(SUFFIX) cher2.$(SUFFIX) \ + chpmv.$(SUFFIX) chpr.$(SUFFIX) chpr2.$(SUFFIX) + +CBLAS3OBJS = \ + cgemm.$(SUFFIX) csymm.$(SUFFIX) ctrmm.$(SUFFIX) \ + ctrsm.$(SUFFIX) csyrk.$(SUFFIX) csyr2k.$(SUFFIX) \ + chemm.$(SUFFIX) cherk.$(SUFFIX) cher2k.$(SUFFIX) + +ZBLAS1OBJS = \ + zaxpy.$(SUFFIX) zaxpyc.$(SUFFIX) zswap.$(SUFFIX) \ + zcopy.$(SUFFIX) zscal.$(SUFFIX) zdscal.$(SUFFIX) \ + zdotc.$(SUFFIX) zdotu.$(SUFFIX) \ + dzasum.$(SUFFIX) dznrm2.$(SUFFIX) \ + dzamax.$(SUFFIX) izamax.$(SUFFIX) \ + dzamin.$(SUFFIX) izamin.$(SUFFIX) \ + zdrot.$(SUFFIX) zrotg.$(SUFFIX) \ + +ZBLAS2OBJS = \ + zgemv.$(SUFFIX) zgeru.$(SUFFIX) zgerc.$(SUFFIX) \ + ztrsv.$(SUFFIX) ztrmv.$(SUFFIX) zsymv.$(SUFFIX) \ + zsyr.$(SUFFIX) zsyr2.$(SUFFIX) zgbmv.$(SUFFIX) \ + zsbmv.$(SUFFIX) zspmv.$(SUFFIX) \ + zspr.$(SUFFIX) zspr2.$(SUFFIX) \ + ztbsv.$(SUFFIX) ztbmv.$(SUFFIX) \ + ztpsv.$(SUFFIX) ztpmv.$(SUFFIX) \ + zhemv.$(SUFFIX) zhbmv.$(SUFFIX) \ + zher.$(SUFFIX) zher2.$(SUFFIX) \ + zhpmv.$(SUFFIX) zhpr.$(SUFFIX) zhpr2.$(SUFFIX) + +ZBLAS3OBJS = \ + zgemm.$(SUFFIX) zsymm.$(SUFFIX) 
ztrmm.$(SUFFIX) \ + ztrsm.$(SUFFIX) zsyrk.$(SUFFIX) zsyr2k.$(SUFFIX) \ + zhemm.$(SUFFIX) zherk.$(SUFFIX) zher2k.$(SUFFIX) + +ifdef SUPPORT_GEMM3M + +CBLAS3OBJS += cgemm3m.$(SUFFIX) csymm3m.$(SUFFIX) chemm3m.$(SUFFIX) + +ZBLAS3OBJS += zgemm3m.$(SUFFIX) zsymm3m.$(SUFFIX) zhemm3m.$(SUFFIX) + +endif + +ifdef EXPRECISION + +QBLAS1OBJS = \ + qaxpy.$(SUFFIX) qswap.$(SUFFIX) \ + qcopy.$(SUFFIX) qscal.$(SUFFIX) \ + qdot.$(SUFFIX) \ + qasum.$(SUFFIX) qnrm2.$(SUFFIX) \ + qmax.$(SUFFIX) qamax.$(SUFFIX) iqmax.$(SUFFIX) iqamax.$(SUFFIX) \ + qmin.$(SUFFIX) qamin.$(SUFFIX) iqmin.$(SUFFIX) iqamin.$(SUFFIX) \ + qrot.$(SUFFIX) qrotg.$(SUFFIX) qrotm.$(SUFFIX) qrotmg.$(SUFFIX) \ + +QBLAS2OBJS = \ + qgemv.$(SUFFIX) qger.$(SUFFIX) \ + qtrsv.$(SUFFIX) qtrmv.$(SUFFIX) qsymv.$(SUFFIX) \ + qsyr.$(SUFFIX) qsyr2.$(SUFFIX) qgbmv.$(SUFFIX) \ + qsbmv.$(SUFFIX) qspmv.$(SUFFIX) \ + qspr.$(SUFFIX) qspr2.$(SUFFIX) \ + qtbsv.$(SUFFIX) qtbmv.$(SUFFIX) \ + qtpsv.$(SUFFIX) qtpmv.$(SUFFIX) + +QBLAS3OBJS = \ + qgemm.$(SUFFIX) qsymm.$(SUFFIX) qtrmm.$(SUFFIX) \ + qtrsm.$(SUFFIX) qsyrk.$(SUFFIX) qsyr2k.$(SUFFIX) + +XBLAS1OBJS = \ + xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \ + xcopy.$(SUFFIX) xscal.$(SUFFIX) xqscal.$(SUFFIX) \ + xdotc.$(SUFFIX) xdotu.$(SUFFIX) \ + qxasum.$(SUFFIX) qxnrm2.$(SUFFIX) \ + qxamax.$(SUFFIX) ixamax.$(SUFFIX) \ + qxamin.$(SUFFIX) ixamin.$(SUFFIX) \ + xqrot.$(SUFFIX) xrotg.$(SUFFIX) \ + +XBLAS2OBJS = \ + xgemv.$(SUFFIX) xgeru.$(SUFFIX) xgerc.$(SUFFIX) \ + xtrsv.$(SUFFIX) xtrmv.$(SUFFIX) xsymv.$(SUFFIX) \ + xsyr.$(SUFFIX) xsyr2.$(SUFFIX) xgbmv.$(SUFFIX) \ + xsbmv.$(SUFFIX) xspmv.$(SUFFIX) \ + xspr.$(SUFFIX) xspr2.$(SUFFIX) \ + xtbsv.$(SUFFIX) xtbmv.$(SUFFIX) \ + xtpsv.$(SUFFIX) xtpmv.$(SUFFIX) \ + xhemv.$(SUFFIX) xhbmv.$(SUFFIX) \ + xher.$(SUFFIX) xher2.$(SUFFIX) \ + xhpmv.$(SUFFIX) xhpr.$(SUFFIX) xhpr2.$(SUFFIX) + +XBLAS3OBJS = \ + xgemm.$(SUFFIX) xsymm.$(SUFFIX) xtrmm.$(SUFFIX) \ + xtrsm.$(SUFFIX) xsyrk.$(SUFFIX) xsyr2k.$(SUFFIX) \ + xhemm.$(SUFFIX) xherk.$(SUFFIX) xher2k.$(SUFFIX) + +ifdef SUPPORT_GEMM3M + +XBLAS3OBJS += xgemm3m.$(SUFFIX) xsymm3m.$(SUFFIX) xhemm3m.$(SUFFIX) + +endif + +endif + +ifdef QUAD_PRECISION + +QBLAS1OBJS = \ + qaxpy.$(SUFFIX) qswap.$(SUFFIX) \ + qcopy.$(SUFFIX) qscal.$(SUFFIX) \ + qasum.$(SUFFIX) qnrm2.$(SUFFIX) \ + qmax.$(SUFFIX) qamax.$(SUFFIX) iqmax.$(SUFFIX) iqamax.$(SUFFIX) \ + qmin.$(SUFFIX) qamin.$(SUFFIX) iqmin.$(SUFFIX) iqamin.$(SUFFIX) \ + qrot.$(SUFFIX) qrotg.$(SUFFIX) qrotm.$(SUFFIX) qrotmg.$(SUFFIX) \ + +QBLAS2OBJS = \ + qgemv.$(SUFFIX) qger.$(SUFFIX) \ + qtrsv.$(SUFFIX) qtrmv.$(SUFFIX) qsymv.$(SUFFIX) \ + qsyr.$(SUFFIX) qsyr2.$(SUFFIX) qgbmv.$(SUFFIX) \ + qsbmv.$(SUFFIX) qspmv.$(SUFFIX) \ + qspr.$(SUFFIX) qspr2.$(SUFFIX) \ + qtbsv.$(SUFFIX) qtbmv.$(SUFFIX) \ + qtpsv.$(SUFFIX) qtpmv.$(SUFFIX) + +QBLAS3OBJS = \ + qgemm.$(SUFFIX) qsymm.$(SUFFIX) qtrmm.$(SUFFIX) \ + qtrsm.$(SUFFIX) qsyrk.$(SUFFIX) qsyr2k.$(SUFFIX) + +XBLAS1OBJS = \ + xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \ + xcopy.$(SUFFIX) xscal.$(SUFFIX) xqscal.$(SUFFIX) \ + qxasum.$(SUFFIX) qxnrm2.$(SUFFIX) \ + qxamax.$(SUFFIX) ixamax.$(SUFFIX) \ + qxamin.$(SUFFIX) ixamin.$(SUFFIX) \ + xqrot.$(SUFFIX) xrotg.$(SUFFIX) \ + +XBLAS2OBJS = \ + xgemv.$(SUFFIX) xgeru.$(SUFFIX) xgerc.$(SUFFIX) \ + xtrsv.$(SUFFIX) xtrmv.$(SUFFIX) xsymv.$(SUFFIX) \ + xsyr.$(SUFFIX) xsyr2.$(SUFFIX) xgbmv.$(SUFFIX) \ + xsbmv.$(SUFFIX) xspmv.$(SUFFIX) \ + xspr.$(SUFFIX) xspr2.$(SUFFIX) \ + xtbsv.$(SUFFIX) xtbmv.$(SUFFIX) \ + xtpsv.$(SUFFIX) xtpmv.$(SUFFIX) \ + xhemv.$(SUFFIX) xhbmv.$(SUFFIX) \ + xher.$(SUFFIX) xher2.$(SUFFIX) \ + 
xhpmv.$(SUFFIX) xhpr.$(SUFFIX) xhpr2.$(SUFFIX) + +XBLAS3OBJS = \ + xgemm.$(SUFFIX) xsymm.$(SUFFIX) xtrmm.$(SUFFIX) \ + xtrsm.$(SUFFIX) xsyrk.$(SUFFIX) xsyr2k.$(SUFFIX) \ + xhemm.$(SUFFIX) xherk.$(SUFFIX) xher2k.$(SUFFIX) + +ifdef SUPPORT_GEMM3M + +XBLAS3OBJS += xgemm3m.$(SUFFIX) xsymm3m.$(SUFFIX) xhemm3m.$(SUFFIX) + +endif +endif + +endif + +HPLOBJS = dgemm.$(SUFFIX) dtrsm.$(SUFFIX) \ + dgemv.$(SUFFIX) dtrsv.$(SUFFIX) dger.$(SUFFIX) \ + idamax.$(SUFFIX) daxpy.$(SUFFIX) dcopy.$(SUFFIX) dscal.$(SUFFIX) + +CSBLAS1OBJS = \ + cblas_isamax.$(SUFFIX) cblas_sasum.$(SUFFIX) cblas_saxpy.$(SUFFIX) \ + cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \ + cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \ + cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) + +CSBLAS2OBJS = \ + cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \ + cblas_strsv.$(SUFFIX) cblas_ssyr.$(SUFFIX) cblas_ssyr2.$(SUFFIX) cblas_sgbmv.$(SUFFIX) \ + cblas_ssbmv.$(SUFFIX) cblas_sspmv.$(SUFFIX) cblas_sspr.$(SUFFIX) cblas_sspr2.$(SUFFIX) \ + cblas_stbmv.$(SUFFIX) cblas_stbsv.$(SUFFIX) cblas_stpmv.$(SUFFIX) cblas_stpsv.$(SUFFIX) + +CSBLAS3OBJS = \ + cblas_sgemm.$(SUFFIX) cblas_ssymm.$(SUFFIX) cblas_strmm.$(SUFFIX) cblas_strsm.$(SUFFIX) \ + cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) + +CDBLAS1OBJS = \ + cblas_idamax.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \ + cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \ + cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \ + cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) + +CDBLAS2OBJS = \ + cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \ + cblas_dtrsv.$(SUFFIX) cblas_dsyr.$(SUFFIX) cblas_dsyr2.$(SUFFIX) cblas_dgbmv.$(SUFFIX) \ + cblas_dsbmv.$(SUFFIX) cblas_dspmv.$(SUFFIX) cblas_dspr.$(SUFFIX) cblas_dspr2.$(SUFFIX) \ + cblas_dtbmv.$(SUFFIX) cblas_dtbsv.$(SUFFIX) cblas_dtpmv.$(SUFFIX) cblas_dtpsv.$(SUFFIX) + +CDBLAS3OBJS += \ + cblas_dgemm.$(SUFFIX) cblas_dsymm.$(SUFFIX) cblas_dtrmm.$(SUFFIX) cblas_dtrsm.$(SUFFIX) \ + cblas_dsyrk.$(SUFFIX) cblas_dsyr2k.$(SUFFIX) + +CCBLAS1OBJS = \ + cblas_icamax.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \ + cblas_ccopy.$(SUFFIX) \ + cblas_cdotc.$(SUFFIX) cblas_cdotu.$(SUFFIX) \ + cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \ + cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \ + cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) + +CCBLAS2OBJS = \ + cblas_cgemv.$(SUFFIX) cblas_cgerc.$(SUFFIX) cblas_cgeru.$(SUFFIX) \ + cblas_cgbmv.$(SUFFIX) cblas_chbmv.$(SUFFIX) cblas_chemv.$(SUFFIX) \ + cblas_cher.$(SUFFIX) cblas_cher2.$(SUFFIX) cblas_chpmv.$(SUFFIX) \ + cblas_chpr.$(SUFFIX) cblas_chpr2.$(SUFFIX) cblas_ctbmv.$(SUFFIX) \ + cblas_ctbsv.$(SUFFIX) cblas_ctpmv.$(SUFFIX) cblas_ctpsv.$(SUFFIX) \ + cblas_ctrmv.$(SUFFIX) cblas_ctrsv.$(SUFFIX) + +CCBLAS3OBJS = \ + cblas_cgemm.$(SUFFIX) cblas_csymm.$(SUFFIX) cblas_ctrmm.$(SUFFIX) cblas_ctrsm.$(SUFFIX) \ + cblas_csyrk.$(SUFFIX) cblas_csyr2k.$(SUFFIX) \ + cblas_chemm.$(SUFFIX) cblas_cherk.$(SUFFIX) cblas_cher2k.$(SUFFIX) + +CZBLAS1OBJS = \ + cblas_izamax.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \ + cblas_zcopy.$(SUFFIX) \ + cblas_zdotc.$(SUFFIX) cblas_zdotu.$(SUFFIX) \ + cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \ + cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \ + cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) + +CZBLAS2OBJS = \ + cblas_zgemv.$(SUFFIX) 
cblas_zgerc.$(SUFFIX) cblas_zgeru.$(SUFFIX) \ + cblas_zgbmv.$(SUFFIX) cblas_zhbmv.$(SUFFIX) cblas_zhemv.$(SUFFIX) \ + cblas_zher.$(SUFFIX) cblas_zher2.$(SUFFIX) cblas_zhpmv.$(SUFFIX) \ + cblas_zhpr.$(SUFFIX) cblas_zhpr2.$(SUFFIX) cblas_ztbmv.$(SUFFIX) \ + cblas_ztbsv.$(SUFFIX) cblas_ztpmv.$(SUFFIX) cblas_ztpsv.$(SUFFIX) \ + cblas_ztrmv.$(SUFFIX) cblas_ztrsv.$(SUFFIX) + +CZBLAS3OBJS = \ + cblas_zgemm.$(SUFFIX) cblas_zsymm.$(SUFFIX) cblas_ztrmm.$(SUFFIX) cblas_ztrsm.$(SUFFIX) \ + cblas_zsyrk.$(SUFFIX) cblas_zsyr2k.$(SUFFIX) \ + cblas_zhemm.$(SUFFIX) cblas_zherk.$(SUFFIX) cblas_zher2k.$(SUFFIX) + +ifndef NO_CBLAS + +CFLAGS += -I. + +SBLAS1OBJS += $(CSBLAS1OBJS) +SBLAS2OBJS += $(CSBLAS2OBJS) +SBLAS3OBJS += $(CSBLAS3OBJS) +DBLAS1OBJS += $(CDBLAS1OBJS) +DBLAS2OBJS += $(CDBLAS2OBJS) +DBLAS3OBJS += $(CDBLAS3OBJS) +CBLAS1OBJS += $(CCBLAS1OBJS) +CBLAS2OBJS += $(CCBLAS2OBJS) +CBLAS3OBJS += $(CCBLAS3OBJS) +ZBLAS1OBJS += $(CZBLAS1OBJS) +ZBLAS2OBJS += $(CZBLAS2OBJS) +ZBLAS3OBJS += $(CZBLAS3OBJS) + +endif + +SBLASOBJS = $(SBLAS1OBJS) $(SBLAS2OBJS) $(SBLAS3OBJS) +DBLASOBJS = $(DBLAS1OBJS) $(DBLAS2OBJS) $(DBLAS3OBJS) +QBLASOBJS = $(QBLAS1OBJS) $(QBLAS2OBJS) $(QBLAS3OBJS) +CBLASOBJS = $(CBLAS1OBJS) $(CBLAS2OBJS) $(CBLAS3OBJS) +ZBLASOBJS = $(ZBLAS1OBJS) $(ZBLAS2OBJS) $(ZBLAS3OBJS) +XBLASOBJS = $(XBLAS1OBJS) $(XBLAS2OBJS) $(XBLAS3OBJS) + +SBLASOBJS += \ + sgetf2.$(SUFFIX) sgetrf.$(SUFFIX) slauu2.$(SUFFIX) slauum.$(SUFFIX) \ + spotf2.$(SUFFIX) spotrf.$(SUFFIX) strti2.$(SUFFIX) strtri.$(SUFFIX) \ + slaswp.$(SUFFIX) sgetrs.$(SUFFIX) sgesv.$(SUFFIX) spotri.$(SUFFIX) \ + +DBLASOBJS += \ + dgetf2.$(SUFFIX) dgetrf.$(SUFFIX) dlauu2.$(SUFFIX) dlauum.$(SUFFIX) \ + dpotf2.$(SUFFIX) dpotrf.$(SUFFIX) dtrti2.$(SUFFIX) dtrtri.$(SUFFIX) \ + dlaswp.$(SUFFIX) dgetrs.$(SUFFIX) dgesv.$(SUFFIX) dpotri.$(SUFFIX) \ + +QBLASOBJS += \ + qgetf2.$(SUFFIX) qgetrf.$(SUFFIX) qlauu2.$(SUFFIX) qlauum.$(SUFFIX) \ + qpotf2.$(SUFFIX) qpotrf.$(SUFFIX) qtrti2.$(SUFFIX) qtrtri.$(SUFFIX) \ + qlaswp.$(SUFFIX) qgetrs.$(SUFFIX) qgesv.$(SUFFIX) qpotri.$(SUFFIX) \ + +CBLASOBJS += \ + cgetf2.$(SUFFIX) cgetrf.$(SUFFIX) clauu2.$(SUFFIX) clauum.$(SUFFIX) \ + cpotf2.$(SUFFIX) cpotrf.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX) \ + claswp.$(SUFFIX) cgetrs.$(SUFFIX) cgesv.$(SUFFIX) cpotri.$(SUFFIX) \ + +ZBLASOBJS += \ + zgetf2.$(SUFFIX) zgetrf.$(SUFFIX) zlauu2.$(SUFFIX) zlauum.$(SUFFIX) \ + zpotf2.$(SUFFIX) zpotrf.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX) \ + zlaswp.$(SUFFIX) zgetrs.$(SUFFIX) zgesv.$(SUFFIX) zpotri.$(SUFFIX) \ + +XBLASOBJS += \ + xgetf2.$(SUFFIX) xgetrf.$(SUFFIX) xlauu2.$(SUFFIX) xlauum.$(SUFFIX) \ + xpotf2.$(SUFFIX) xpotrf.$(SUFFIX) xtrti2.$(SUFFIX) xtrtri.$(SUFFIX) \ + xlaswp.$(SUFFIX) xgetrs.$(SUFFIX) xgesv.$(SUFFIX) xpotri.$(SUFFIX) \ + + +FUNCOBJS = $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) + +ifdef EXPRECISION +FUNCOBJS += $(QBLASOBJS) $(XBLASOBJS) +endif + +ifdef QUAD_PRECISION +FUNCOBJS += $(QBLASOBJS) $(XBLASOBJS) +endif + +FUNCALLFILES = $(FUNCOBJS:.$(SUFFIX)=) + +include $(TOPDIR)/Makefile.tail + +all :: libs + +ifdef FUNCTION_PROFILE +$(BLASOBJS) $(BLASOBJS_P) : functable.h +$(BLASOBJS) $(BLASOBJS_P) : CFLAGS += -DPROFILE_FUNC_NAME=interface_$(*F) + +functable.h : Makefile + ./create $(FUNCALLFILES) > functable.h + +endif + +clean :: + @rm -f functable.h + +level1 : $(SBLAS1OBJS) $(DBLAS1OBJS) $(QBLAS1OBJS) $(CBLAS1OBJS) $(ZBLAS1OBJS) $(XBLAS1OBJS) + $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ + +level2 : $(SBLAS2OBJS) $(DBLAS2OBJS) $(QBLAS2OBJS) $(CBLAS2OBJS) $(ZBLAS2OBJS) $(XBLAS2OBJS) + $(AR) $(ARFLAGS) -ru 
$(TOPDIR)/$(LIBNAME) $^ + +level3 : $(SBLAS3OBJS) $(DBLAS3OBJS) $(QBLAS3OBJS) $(CBLAS3OBJS) $(ZBLAS3OBJS) $(XBLAS3OBJS) + $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ + +$(CSBLASOBJS) $(CSBLASOBJS_P) $(CDBLASOBJS) $(CDBLASOBJS_P) $(CQBLASOBJS) $(CQBLASOBJS_P) \ +$(CCBLASOBJS) $(CCBLASOBJS_P) $(CZBLASOBJS) $(CZBLASOBJS_P) $(CXBLASOBJS) $(CXBLASOBJS_P) : CFLAGS += -DCBLAS + +srot.$(SUFFIX) srot.$(PSUFFIX) : rot.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +drot.$(SUFFIX) drot.$(PSUFFIX) : rot.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +qrot.$(SUFFIX) qrot.$(PSUFFIX) : rot.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +csrot.$(SUFFIX) csrot.$(PSUFFIX) : zrot.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +zdrot.$(SUFFIX) zdrot.$(PSUFFIX) : zrot.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +xqrot.$(SUFFIX) xqrot.$(PSUFFIX) : zrot.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +srotm.$(SUFFIX) srotm.$(PSUFFIX): rotm.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +drotm.$(SUFFIX) drotm.$(PSUFFIX): rotm.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qrotm.$(SUFFIX) qrotm.$(PSUFFIX): rotm.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +srotmg.$(SUFFIX) srotmg.$(PSUFFIX): rotmg.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +drotmg.$(SUFFIX) drotmg.$(PSUFFIX): rotmg.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qrotmg.$(SUFFIX) qrotmg.$(PSUFFIX): rotmg.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +srotg.$(SUFFIX) srotg.$(PSUFFIX): rotg.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +drotg.$(SUFFIX) drotg.$(PSUFFIX): rotg.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qrotg.$(SUFFIX) qrotg.$(PSUFFIX): rotg.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +crotg.$(SUFFIX) crotg.$(PSUFFIX): zrotg.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zrotg.$(SUFFIX) zrotg.$(PSUFFIX): zrotg.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xrotg.$(SUFFIX) xrotg.$(PSUFFIX): zrotg.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +sasum.$(SUFFIX) sasum.$(PSUFFIX) : asum.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +dasum.$(SUFFIX) dasum.$(PSUFFIX) : asum.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +qasum.$(SUFFIX) qasum.$(PSUFFIX) : asum.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +scasum.$(SUFFIX) scasum.$(PSUFFIX) : asum.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +dzasum.$(SUFFIX) dzasum.$(PSUFFIX) : asum.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +qxasum.$(SUFFIX) qxasum.$(PSUFFIX) : asum.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +snrm2.$(SUFFIX) snrm2.$(PSUFFIX) : nrm2.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +dnrm2.$(SUFFIX) dnrm2.$(PSUFFIX) : nrm2.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +qnrm2.$(SUFFIX) qnrm2.$(PSUFFIX) : nrm2.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +scnrm2.$(SUFFIX) scnrm2.$(PSUFFIX) : nrm2.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +dznrm2.$(SUFFIX) dznrm2.$(PSUFFIX) : nrm2.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +qxnrm2.$(SUFFIX) qxnrm2.$(PSUFFIX) : nrm2.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +samax.$(SUFFIX) samax.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +damax.$(SUFFIX) damax.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +qamax.$(SUFFIX) qamax.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +scamax.$(SUFFIX) scamax.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +dzamax.$(SUFFIX) dzamax.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +qxamax.$(SUFFIX) qxamax.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +samin.$(SUFFIX) samin.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +damin.$(SUFFIX) damin.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +qamin.$(SUFFIX) qamin.$(PSUFFIX) : max.c + $(CC) 
$(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +scamin.$(SUFFIX) scamin.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +dzamin.$(SUFFIX) dzamin.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +qxamin.$(SUFFIX) qxamin.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +smax.$(SUFFIX) smax.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -c -UUSE_ABS -UUSE_MIN $< -o $(@F) + +dmax.$(SUFFIX) dmax.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -c -UUSE_ABS -UUSE_MIN $< -o $(@F) + +qmax.$(SUFFIX) qmax.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -c -UUSE_ABS -UUSE_MIN $< -o $(@F) + +smin.$(SUFFIX) smin.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -c -UUSE_ABS -DUSE_MIN $< -o $(@F) + +dmin.$(SUFFIX) dmin.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -c -UUSE_ABS -DUSE_MIN $< -o $(@F) + +qmin.$(SUFFIX) qmin.$(PSUFFIX) : max.c + $(CC) $(CFLAGS) -c -UUSE_ABS -DUSE_MIN $< -o $(@F) + +isamax.$(SUFFIX) isamax.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +idamax.$(SUFFIX) idamax.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +iqamax.$(SUFFIX) iqamax.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +icamax.$(SUFFIX) icamax.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +izamax.$(SUFFIX) izamax.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +ixamax.$(SUFFIX) ixamax.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +isamin.$(SUFFIX) isamin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +idamin.$(SUFFIX) idamin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +iqamin.$(SUFFIX) iqamin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +icamin.$(SUFFIX) icamin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +izamin.$(SUFFIX) izamin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +ixamin.$(SUFFIX) ixamin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -c -DUSE_ABS -DUSE_MIN $< -o $(@F) + +ismax.$(SUFFIX) ismax.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -c -UUSE_ABS -UUSE_MIN $< -o $(@F) + +idmax.$(SUFFIX) idmax.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -c -UUSE_ABS -UUSE_MIN $< -o $(@F) + +iqmax.$(SUFFIX) iqmax.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -c -UUSE_ABS -UUSE_MIN $< -o $(@F) + +ismin.$(SUFFIX) ismin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -c -UUSE_ABS -DUSE_MIN $< -o $(@F) + +idmin.$(SUFFIX) idmin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -c -UUSE_ABS -DUSE_MIN $< -o $(@F) + +iqmin.$(SUFFIX) iqmin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -c -UUSE_ABS -DUSE_MIN $< -o $(@F) + +sdsdot.$(SUFFIX) sdsdot.$(PSUFFIX) : sdsdot.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +dsdot.$(SUFFIX) dsdot.$(PSUFFIX) : dsdot.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +sdot.$(SUFFIX) sdot.$(PSUFFIX) : dot.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +ddot.$(SUFFIX) ddot.$(PSUFFIX) : dot.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +qdot.$(SUFFIX) qdot.$(PSUFFIX) : dot.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +cdotu.$(SUFFIX) cdotu.$(PSUFFIX) : zdot.c + $(CC) $(CFLAGS) -c -UCONJ $< -o $(@F) + +cdotc.$(SUFFIX) cdotc.$(PSUFFIX) : zdot.c + $(CC) $(CFLAGS) -c -DCONJ $< -o $(@F) + +zdotu.$(SUFFIX) zdotu.$(PSUFFIX) : zdot.c + $(CC) $(CFLAGS) -c -UCONJ $< -o $(@F) + +zdotc.$(SUFFIX) zdotc.$(PSUFFIX) : zdot.c + $(CC) $(CFLAGS) -c -DCONJ $< -o $(@F) + +xdotu.$(SUFFIX) xdotu.$(PSUFFIX) : zdot.c + $(CC) $(CFLAGS) -c -UCONJ $< -o $(@F) + +xdotc.$(SUFFIX) xdotc.$(PSUFFIX) : 
zdot.c + $(CC) $(CFLAGS) -c -DCONJ $< -o $(@F) + +saxpy.$(SUFFIX) saxpy.$(PSUFFIX) : axpy.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +daxpy.$(SUFFIX) daxpy.$(PSUFFIX) : axpy.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +qaxpy.$(SUFFIX) qaxpy.$(PSUFFIX) : axpy.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +caxpy.$(SUFFIX) caxpy.$(PSUFFIX) : zaxpy.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +zaxpy.$(SUFFIX) zaxpy.$(PSUFFIX) : zaxpy.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +xaxpy.$(SUFFIX) xaxpy.$(PSUFFIX) : zaxpy.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +caxpyc.$(SUFFIX) caxpyc.$(PSUFFIX) : zaxpy.c + $(CC) $(CFLAGS) -c -DCONJ $< -o $(@F) + +zaxpyc.$(SUFFIX) zaxpyc.$(PSUFFIX) : zaxpy.c + $(CC) $(CFLAGS) -c -DCONJ $< -o $(@F) + +xaxpyc.$(SUFFIX) xaxpyc.$(PSUFFIX) : zaxpy.c + $(CC) $(CFLAGS) -c -DCONJ $< -o $(@F) + +sscal.$(SUFFIX) sscal.$(PSUFFIX) : scal.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +dscal.$(SUFFIX) dscal.$(PSUFFIX) : scal.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +qscal.$(SUFFIX) qscal.$(PSUFFIX) : scal.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +cscal.$(SUFFIX) cscal.$(PSUFFIX) : zscal.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +zscal.$(SUFFIX) zscal.$(PSUFFIX) : zscal.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +xscal.$(SUFFIX) xscal.$(PSUFFIX) : zscal.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +csscal.$(SUFFIX) csscal.$(PSUFFIX) : zscal.c + $(CC) $(CFLAGS) -c -DSSCAL $< -o $(@F) + +zdscal.$(SUFFIX) zdscal.$(PSUFFIX) : zscal.c + $(CC) $(CFLAGS) -c -DSSCAL $< -o $(@F) + +xqscal.$(SUFFIX) xqscal.$(PSUFFIX) : zscal.c + $(CC) $(CFLAGS) -c -DSSCAL $< -o $(@F) + +scopy.$(SUFFIX) scopy.$(PSUFFIX) : copy.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +dcopy.$(SUFFIX) dcopy.$(PSUFFIX) : copy.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +qcopy.$(SUFFIX) qcopy.$(PSUFFIX) : copy.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +ccopy.$(SUFFIX) ccopy.$(PSUFFIX) : copy.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +zcopy.$(SUFFIX) zcopy.$(PSUFFIX) : copy.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +xcopy.$(SUFFIX) xcopy.$(PSUFFIX) : copy.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +sswap.$(SUFFIX) sswap.$(PSUFFIX) : swap.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +dswap.$(SUFFIX) dswap.$(PSUFFIX) : swap.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +qswap.$(SUFFIX) qswap.$(PSUFFIX) : swap.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +cswap.$(SUFFIX) cswap.$(PSUFFIX) : zswap.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +zswap.$(SUFFIX) zswap.$(PSUFFIX) : zswap.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +xswap.$(SUFFIX) xswap.$(PSUFFIX) : zswap.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +sger.$(SUFFIX) sger.$(PSUFFIX) : ger.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dger.$(SUFFIX) dger.$(PSUFFIX) : ger.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qger.$(SUFFIX) qger.$(PSUFFIX) : ger.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +cgeru.$(SUFFIX) cgeru.$(PSUFFIX) : zger.c + $(CC) -c $(CFLAGS) -UCONJ $< -o $(@F) + +cgerc.$(SUFFIX) cgerc.$(PSUFFIX) : zger.c + $(CC) -c $(CFLAGS) -DCONJ $< -o $(@F) + +zgeru.$(SUFFIX) zgeru.$(PSUFFIX) : zger.c + $(CC) -c $(CFLAGS) -UCONJ $< -o $(@F) + +zgerc.$(SUFFIX) zgerc.$(PSUFFIX) : zger.c + $(CC) -c $(CFLAGS) -DCONJ $< -o $(@F) + +xgeru.$(SUFFIX) xgeru.$(PSUFFIX) : zger.c + $(CC) -c $(CFLAGS) -UCONJ $< -o $(@F) + +xgerc.$(SUFFIX) xgerc.$(PSUFFIX) : zger.c + $(CC) -c $(CFLAGS) -DCONJ $< -o $(@F) + +sgemv.$(SUFFIX) sgemv.$(PSUFFIX): gemv.c + $(CC) -c $(CFLAGS) -o $(@F) $< + +dgemv.$(SUFFIX) dgemv.$(PSUFFIX): gemv.c + $(CC) -c $(CFLAGS) -o $(@F) $< + +qgemv.$(SUFFIX) qgemv.$(PSUFFIX): gemv.c + $(CC) -c $(CFLAGS) -o $(@F) $< + +cgemv.$(SUFFIX) cgemv.$(PSUFFIX): zgemv.c + $(CC) -c $(CFLAGS) -o $(@F) $< + +zgemv.$(SUFFIX) zgemv.$(PSUFFIX): zgemv.c + 
$(CC) -c $(CFLAGS) -o $(@F) $< + +xgemv.$(SUFFIX) xgemv.$(PSUFFIX): zgemv.c + $(CC) -c $(CFLAGS) -o $(@F) $< + +strsv.$(SUFFIX) strsv.$(PSUFFIX) : trsv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dtrsv.$(SUFFIX) dtrsv.$(PSUFFIX) : trsv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qtrsv.$(SUFFIX) qtrsv.$(PSUFFIX) : trsv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ctrsv.$(SUFFIX) ctrsv.$(PSUFFIX) : ztrsv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ztrsv.$(SUFFIX) ztrsv.$(PSUFFIX) : ztrsv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xtrsv.$(SUFFIX) xtrsv.$(PSUFFIX) : ztrsv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +strmv.$(SUFFIX) strmv.$(PSUFFIX) : trmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dtrmv.$(SUFFIX) dtrmv.$(PSUFFIX) : trmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qtrmv.$(SUFFIX) qtrmv.$(PSUFFIX) : trmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ctrmv.$(SUFFIX) ctrmv.$(PSUFFIX) : ztrmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ztrmv.$(SUFFIX) ztrmv.$(PSUFFIX) : ztrmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xtrmv.$(SUFFIX) xtrmv.$(PSUFFIX) : ztrmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ssymv.$(SUFFIX) ssymv.$(PSUFFIX) : symv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dsymv.$(SUFFIX) dsymv.$(PSUFFIX) : symv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qsymv.$(SUFFIX) qsymv.$(PSUFFIX) : symv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +csymv.$(SUFFIX) csymv.$(PSUFFIX) : zsymv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zsymv.$(SUFFIX) zsymv.$(PSUFFIX) : zsymv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xsymv.$(SUFFIX) xsymv.$(PSUFFIX) : zsymv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ssyr.$(SUFFIX) ssyr.$(PSUFFIX) : syr.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dsyr.$(SUFFIX) dsyr.$(PSUFFIX) : syr.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qsyr.$(SUFFIX) qsyr.$(PSUFFIX) : syr.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +csyr.$(SUFFIX) csyr.$(PSUFFIX) : zsyr.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zsyr.$(SUFFIX) zsyr.$(PSUFFIX) : zsyr.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xsyr.$(SUFFIX) xsyr.$(PSUFFIX) : zsyr.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ssyr2.$(SUFFIX) ssyr2.$(PSUFFIX) : syr2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dsyr2.$(SUFFIX) dsyr2.$(PSUFFIX) : syr2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qsyr2.$(SUFFIX) qsyr2.$(PSUFFIX) : syr2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +csyr2.$(SUFFIX) csyr2.$(PSUFFIX) : zsyr2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zsyr2.$(SUFFIX) zsyr2.$(PSUFFIX) : zsyr2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xsyr2.$(SUFFIX) xsyr2.$(PSUFFIX) : zsyr2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +sgbmv.$(SUFFIX) sgbmv.$(PSUFFIX): gbmv.c + $(CC) -c $(CFLAGS) -o $(@F) $< + +dgbmv.$(SUFFIX) dgbmv.$(PSUFFIX): gbmv.c + $(CC) -c $(CFLAGS) -o $(@F) $< + +qgbmv.$(SUFFIX) qgbmv.$(PSUFFIX): gbmv.c + $(CC) -c $(CFLAGS) -o $(@F) $< + +cgbmv.$(SUFFIX) cgbmv.$(PSUFFIX): zgbmv.c + $(CC) -c $(CFLAGS) -o $(@F) $< + +zgbmv.$(SUFFIX) zgbmv.$(PSUFFIX): zgbmv.c + $(CC) -c $(CFLAGS) -o $(@F) $< + +xgbmv.$(SUFFIX) xgbmv.$(PSUFFIX): zgbmv.c + $(CC) -c $(CFLAGS) -o $(@F) $< + +ssbmv.$(SUFFIX) ssbmv.$(PSUFFIX) : sbmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dsbmv.$(SUFFIX) dsbmv.$(PSUFFIX) : sbmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qsbmv.$(SUFFIX) qsbmv.$(PSUFFIX) : sbmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +csbmv.$(SUFFIX) csbmv.$(PSUFFIX) : zsbmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zsbmv.$(SUFFIX) zsbmv.$(PSUFFIX) : zsbmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xsbmv.$(SUFFIX) xsbmv.$(PSUFFIX) : zsbmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +sspmv.$(SUFFIX) sspmv.$(PSUFFIX) : spmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dspmv.$(SUFFIX) dspmv.$(PSUFFIX) : spmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + 
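Each wrapper object in these rules is compiled from a small set of shared C templates: the s/d/q targets come from the real-valued template (rot.c, ger.c, trsv.c, spmv.c, ...), the c/z/x targets from the corresponding z-template, and behavioral variants are selected purely by extra -D flags (-DUSE_ABS/-DUSE_MIN for the max/amax family above, -UCONJ/-DCONJ for the geru/gerc pair, -DTRMM and -DHEMM further down). The snippet below is a minimal, self-contained sketch of that pattern, not a file from this patch; the names cger_demo and FLOAT_T are hypothetical stand-ins, and the real templates additionally take their precision macros and NAME/CNAME mangling from CFLAGS set elsewhere in the build.

/* Sketch only: one generic source built twice, with -UCONJ and -DCONJ,
   the same way zger.c yields both cgeru and cgerc in the rules above.
   Assumes unit strides and column-major, interleaved complex storage. */

#include <stdio.h>

typedef float FLOAT_T;                /* stand-in for the precision macro */

/* Complex rank-1 update A += alpha * x * y^T (or y^H with -DCONJ). */
static void cger_demo(int m, int n,
                      FLOAT_T ar, FLOAT_T ai,
                      const FLOAT_T *x, const FLOAT_T *y,
                      FLOAT_T *a, int lda) {
  for (int j = 0; j < n; j++) {
    FLOAT_T yr = y[2 * j], yi = y[2 * j + 1];
#ifdef CONJ
    yi = -yi;                         /* -DCONJ: use conj(y), i.e. gerc  */
#endif
    FLOAT_T tr = ar * yr - ai * yi;   /* t = alpha * y_j                 */
    FLOAT_T ti = ar * yi + ai * yr;
    for (int i = 0; i < m; i++) {
      FLOAT_T xr = x[2 * i], xi = x[2 * i + 1];
      a[2 * (i + j * lda)]     += xr * tr - xi * ti;
      a[2 * (i + j * lda) + 1] += xr * ti + xi * tr;
    }
  }
}

int main(void) {
  FLOAT_T a[8] = {0};                 /* 2x2 complex matrix, column-major */
  FLOAT_T x[4] = {1, 1, 2, 0};        /* x = (1+i, 2)                     */
  FLOAT_T y[4] = {1, -1, 0, 1};       /* y = (1-i, i)                     */
  cger_demo(2, 2, 1, 0, x, y, a, 2);  /* alpha = 1                        */
  /* a[0][0] = (1+i)(1-i) = 2+0i, or 0+2i when compiled with -DCONJ */
  printf("a[0][0] = %g%+gi\n", a[0], a[1]);
  return 0;
}
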
+qspmv.$(SUFFIX) qspmv.$(PSUFFIX) : spmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +cspmv.$(SUFFIX) cspmv.$(PSUFFIX) : zspmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zspmv.$(SUFFIX) zspmv.$(PSUFFIX) : zspmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xspmv.$(SUFFIX) xspmv.$(PSUFFIX) : zspmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +sspr.$(SUFFIX) sspr.$(PSUFFIX) : spr.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dspr.$(SUFFIX) dspr.$(PSUFFIX) : spr.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qspr.$(SUFFIX) qspr.$(PSUFFIX) : spr.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +cspr.$(SUFFIX) cspr.$(PSUFFIX) : zspr.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zspr.$(SUFFIX) zspr.$(PSUFFIX) : zspr.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xspr.$(SUFFIX) xspr.$(PSUFFIX) : zspr.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +sspr2.$(SUFFIX) sspr2.$(PSUFFIX) : spr2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dspr2.$(SUFFIX) dspr2.$(PSUFFIX) : spr2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qspr2.$(SUFFIX) qspr2.$(PSUFFIX) : spr2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +cspr2.$(SUFFIX) cspr2.$(PSUFFIX) : zspr2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zspr2.$(SUFFIX) zspr2.$(PSUFFIX) : zspr2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xspr2.$(SUFFIX) xspr2.$(PSUFFIX) : zspr2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +stbmv.$(SUFFIX) stbmv.$(PSUFFIX) : tbmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dtbmv.$(SUFFIX) dtbmv.$(PSUFFIX) : tbmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qtbmv.$(SUFFIX) qtbmv.$(PSUFFIX) : tbmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ctbmv.$(SUFFIX) ctbmv.$(PSUFFIX) : ztbmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ztbmv.$(SUFFIX) ztbmv.$(PSUFFIX) : ztbmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xtbmv.$(SUFFIX) xtbmv.$(PSUFFIX) : ztbmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +stbsv.$(SUFFIX) stbsv.$(PSUFFIX) : tbsv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dtbsv.$(SUFFIX) dtbsv.$(PSUFFIX) : tbsv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qtbsv.$(SUFFIX) qtbsv.$(PSUFFIX) : tbsv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ctbsv.$(SUFFIX) ctbsv.$(PSUFFIX) : ztbsv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ztbsv.$(SUFFIX) ztbsv.$(PSUFFIX) : ztbsv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xtbsv.$(SUFFIX) xtbsv.$(PSUFFIX) : ztbsv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +stpsv.$(SUFFIX) stpsv.$(PSUFFIX) : tpsv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dtpsv.$(SUFFIX) dtpsv.$(PSUFFIX) : tpsv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qtpsv.$(SUFFIX) qtpsv.$(PSUFFIX) : tpsv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ctpsv.$(SUFFIX) ctpsv.$(PSUFFIX) : ztpsv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ztpsv.$(SUFFIX) ztpsv.$(PSUFFIX) : ztpsv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xtpsv.$(SUFFIX) xtpsv.$(PSUFFIX) : ztpsv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +stpmv.$(SUFFIX) stpmv.$(PSUFFIX) : tpmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dtpmv.$(SUFFIX) dtpmv.$(PSUFFIX) : tpmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qtpmv.$(SUFFIX) qtpmv.$(PSUFFIX) : tpmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ctpmv.$(SUFFIX) ctpmv.$(PSUFFIX) : ztpmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ztpmv.$(SUFFIX) ztpmv.$(PSUFFIX) : ztpmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xtpmv.$(SUFFIX) xtpmv.$(PSUFFIX) : ztpmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +chemv.$(SUFFIX) chemv.$(PSUFFIX) : zhemv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zhemv.$(SUFFIX) zhemv.$(PSUFFIX) : zhemv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xhemv.$(SUFFIX) xhemv.$(PSUFFIX) : zhemv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +chbmv.$(SUFFIX) chbmv.$(PSUFFIX) : zhbmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zhbmv.$(SUFFIX) zhbmv.$(PSUFFIX) : zhbmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xhbmv.$(SUFFIX) xhbmv.$(PSUFFIX) : 
zhbmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +cher.$(SUFFIX) cher.$(PSUFFIX) : zher.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zher.$(SUFFIX) zher.$(PSUFFIX) : zher.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xher.$(SUFFIX) xher.$(PSUFFIX) : zher.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +cher2.$(SUFFIX) cher2.$(PSUFFIX) : zher2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zher2.$(SUFFIX) zher2.$(PSUFFIX) : zher2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xher2.$(SUFFIX) xher2.$(PSUFFIX) : zher2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +chpmv.$(SUFFIX) chpmv.$(PSUFFIX) : zhpmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zhpmv.$(SUFFIX) zhpmv.$(PSUFFIX) : zhpmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xhpmv.$(SUFFIX) xhpmv.$(PSUFFIX) : zhpmv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +chpr.$(SUFFIX) chpr.$(PSUFFIX) : zhpr.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zhpr.$(SUFFIX) zhpr.$(PSUFFIX) : zhpr.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xhpr.$(SUFFIX) xhpr.$(PSUFFIX) : zhpr.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +chpr2.$(SUFFIX) chpr2.$(PSUFFIX) : zhpr2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zhpr2.$(SUFFIX) zhpr2.$(PSUFFIX) : zhpr2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xhpr2.$(SUFFIX) xhpr2.$(PSUFFIX) : zhpr2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +sgemm.$(SUFFIX) sgemm.$(PSUFFIX) : gemm.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +dgemm.$(SUFFIX) dgemm.$(PSUFFIX) : gemm.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +qgemm.$(SUFFIX) qgemm.$(PSUFFIX) : gemm.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +cgemm.$(SUFFIX) cgemm.$(PSUFFIX) : gemm.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +zgemm.$(SUFFIX) zgemm.$(PSUFFIX) : gemm.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +xgemm.$(SUFFIX) xgemm.$(PSUFFIX) : gemm.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +ssymm.$(SUFFIX) ssymm.$(PSUFFIX) : symm.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dsymm.$(SUFFIX) dsymm.$(PSUFFIX) : symm.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qsymm.$(SUFFIX) qsymm.$(PSUFFIX) : symm.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +csymm.$(SUFFIX) csymm.$(PSUFFIX) : symm.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zsymm.$(SUFFIX) zsymm.$(PSUFFIX) : symm.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xsymm.$(SUFFIX) xsymm.$(PSUFFIX) : symm.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +strmm.$(SUFFIX) strmm.$(PSUFFIX) : trsm.c + $(CC) -c $(CFLAGS) -DTRMM $< -o $(@F) + +dtrmm.$(SUFFIX) dtrmm.$(PSUFFIX) : trsm.c + $(CC) -c $(CFLAGS) -DTRMM $< -o $(@F) + +qtrmm.$(SUFFIX) qtrmm.$(PSUFFIX) : trsm.c + $(CC) -c $(CFLAGS) -DTRMM $< -o $(@F) + +ctrmm.$(SUFFIX) ctrmm.$(PSUFFIX) : trsm.c + $(CC) -c $(CFLAGS) -DTRMM $< -o $(@F) + +ztrmm.$(SUFFIX) ztrmm.$(PSUFFIX) : trsm.c + $(CC) -c $(CFLAGS) -DTRMM $< -o $(@F) + +xtrmm.$(SUFFIX) xtrmm.$(PSUFFIX) : trsm.c + $(CC) -c $(CFLAGS) -DTRMM $< -o $(@F) + +strsm.$(SUFFIX) strsm.$(PSUFFIX) : trsm.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dtrsm.$(SUFFIX) dtrsm.$(PSUFFIX) : trsm.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qtrsm.$(SUFFIX) qtrsm.$(PSUFFIX) : trsm.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ctrsm.$(SUFFIX) ctrsm.$(PSUFFIX) : trsm.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ztrsm.$(SUFFIX) ztrsm.$(PSUFFIX) : trsm.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xtrsm.$(SUFFIX) xtrsm.$(PSUFFIX) : trsm.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ssyrk.$(SUFFIX) ssyrk.$(PSUFFIX) : syrk.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dsyrk.$(SUFFIX) dsyrk.$(PSUFFIX) : syrk.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qsyrk.$(SUFFIX) qsyrk.$(PSUFFIX) : syrk.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +csyrk.$(SUFFIX) csyrk.$(PSUFFIX) : syrk.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zsyrk.$(SUFFIX) zsyrk.$(PSUFFIX) : syrk.c + $(CC) 
-c $(CFLAGS) $< -o $(@F) + +xsyrk.$(SUFFIX) xsyrk.$(PSUFFIX) : syrk.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ssyr2k.$(SUFFIX) ssyr2k.$(PSUFFIX) : syr2k.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dsyr2k.$(SUFFIX) dsyr2k.$(PSUFFIX) : syr2k.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qsyr2k.$(SUFFIX) qsyr2k.$(PSUFFIX) : syr2k.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +csyr2k.$(SUFFIX) csyr2k.$(PSUFFIX) : syr2k.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zsyr2k.$(SUFFIX) zsyr2k.$(PSUFFIX) : syr2k.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xsyr2k.$(SUFFIX) xsyr2k.$(PSUFFIX) : syr2k.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +chemm.$(SUFFIX) chemm.$(PSUFFIX) : symm.c + $(CC) -c $(CFLAGS) -DHEMM $< -o $(@F) + +zhemm.$(SUFFIX) zhemm.$(PSUFFIX) : symm.c + $(CC) -c $(CFLAGS) -DHEMM $< -o $(@F) + +xhemm.$(SUFFIX) xhemm.$(PSUFFIX) : symm.c + $(CC) -c $(CFLAGS) -DHEMM $< -o $(@F) + +cherk.$(SUFFIX) cherk.$(PSUFFIX) : syrk.c + $(CC) -c $(CFLAGS) -DHEMM $< -o $(@F) + +zherk.$(SUFFIX) zherk.$(PSUFFIX) : syrk.c + $(CC) -c $(CFLAGS) -DHEMM $< -o $(@F) + +xherk.$(SUFFIX) xherk.$(PSUFFIX) : syrk.c + $(CC) -c $(CFLAGS) -DHEMM $< -o $(@F) + +cher2k.$(SUFFIX) cher2k.$(PSUFFIX) : syr2k.c + $(CC) -c $(CFLAGS) -DHEMM $< -o $(@F) + +zher2k.$(SUFFIX) zher2k.$(PSUFFIX) : syr2k.c + $(CC) -c $(CFLAGS) -DHEMM $< -o $(@F) + +xher2k.$(SUFFIX) xher2k.$(PSUFFIX) : syr2k.c + $(CC) -c $(CFLAGS) -DHEMM $< -o $(@F) + +cgemm3m.$(SUFFIX) cgemm3m.$(PSUFFIX) : gemm.c + $(CC) -c $(CFLAGS) -DGEMM3M $< -o $(@F) + +zgemm3m.$(SUFFIX) zgemm3m.$(PSUFFIX) : gemm.c + $(CC) -c $(CFLAGS) -DGEMM3M $< -o $(@F) + +xgemm3m.$(SUFFIX) xgemm3m.$(PSUFFIX) : gemm.c + $(CC) -c $(CFLAGS) -DGEMM3M $< -o $(@F) + +csymm3m.$(SUFFIX) csymm3m.$(PSUFFIX) : symm.c + $(CC) -c $(CFLAGS) -DGEMM3M $< -o $(@F) + +zsymm3m.$(SUFFIX) zsymm3m.$(PSUFFIX) : symm.c + $(CC) -c $(CFLAGS) -DGEMM3M $< -o $(@F) + +xsymm3m.$(SUFFIX) xsymm3m.$(PSUFFIX) : symm.c + $(CC) -c $(CFLAGS) -DGEMM3M $< -o $(@F) + +chemm3m.$(SUFFIX) chemm3m.$(PSUFFIX) : symm.c + $(CC) -c $(CFLAGS) -DGEMM3M -DHEMM $< -o $(@F) + +zhemm3m.$(SUFFIX) zhemm3m.$(PSUFFIX) : symm.c + $(CC) -c $(CFLAGS) -DGEMM3M -DHEMM $< -o $(@F) + +xhemm3m.$(SUFFIX) xhemm3m.$(PSUFFIX) : symm.c + $(CC) -c $(CFLAGS) -DGEMM3M -DHEMM $< -o $(@F) + +cblas_isamax.$(SUFFIX) cblas_isamax.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +cblas_idamax.$(SUFFIX) cblas_idamax.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +cblas_icamax.$(SUFFIX) cblas_icamax.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +cblas_izamax.$(SUFFIX) cblas_izamax.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F) + +cblas_ismax.$(SUFFIX) cblas_ismax.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F) + +cblas_idmax.$(SUFFIX) cblas_idmax.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F) + +cblas_ismin.$(SUFFIX) cblas_ismin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F) + +cblas_idmin.$(SUFFIX) cblas_idmin.$(PSUFFIX) : imax.c + $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F) + +cblas_sasum.$(SUFFIX) cblas_sasum.$(PSUFFIX) : asum.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_dasum.$(SUFFIX) cblas_dasum.$(PSUFFIX) : asum.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_scasum.$(SUFFIX) cblas_scasum.$(PSUFFIX) : asum.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_dzasum.$(SUFFIX) cblas_dzasum.$(PSUFFIX) : asum.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + 
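The cblas_* rules recompile the very same interface sources with -DCBLAS added to CFLAGS. Inside each source that macro selects between two entry points: without it the file defines the Fortran-callable NAME symbol, which receives every argument by reference, and with it the CNAME symbol with CBLAS value arguments (the pattern is visible in asum.c, axpy.c and the other interface files added later in this patch). A minimal sketch of the arrangement follows; the names demo_asum.c, sasum_demo_ and cblas_sasum_demo are hypothetical, it assumes incx > 0, and it leaves out the library's blasint/FLOAT typedefs and profiling hooks.

/* demo_asum.c (hypothetical): one file, two ABIs, chosen by -DCBLAS. */

#include <stdio.h>

#ifndef CBLAS

/* Fortran-style entry: all arguments passed by reference. */
float sasum_demo_(int *n, float *x, int *incx) {
  float s = 0;
  for (int i = 0; i < *n; i++) {
    float v = x[i * *incx];
    s += (v < 0) ? -v : v;            /* sum of absolute values */
  }
  return s;
}

#else

/* C-style entry, built from the same file with -DCBLAS: arguments by value. */
float cblas_sasum_demo(int n, const float *x, int incx) {
  float s = 0;
  for (int i = 0; i < n; i++) {
    float v = x[i * incx];
    s += (v < 0) ? -v : v;
  }
  return s;
}

#endif

/* Built twice, mirroring the rules above:
     cc -c demo_asum.c          -o sasum_demo.o
     cc -DCBLAS -c demo_asum.c  -o cblas_sasum_demo.o */
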
+cblas_sdsdot.$(SUFFIX) cblas_sdsdot.$(PSUFFIX) : sdsdot.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_dsdot.$(SUFFIX) cblas_dsdot.$(PSUFFIX) : dsdot.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_sdot.$(SUFFIX) cblas_sdot.$(PSUFFIX) : dot.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_ddot.$(SUFFIX) cblas_ddot.$(PSUFFIX) : dot.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_cdotu.$(SUFFIX) cblas_cdotu.$(PSUFFIX) : zdot.c + $(CC) $(CFLAGS) -DCBLAS -c -UCONJ $< -o $(@F) + +cblas_cdotc.$(SUFFIX) cblas_cdotc.$(PSUFFIX) : zdot.c + $(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F) + +cblas_zdotu.$(SUFFIX) cblas_zdotu.$(PSUFFIX) : zdot.c + $(CC) $(CFLAGS) -DCBLAS -c -UCONJ $< -o $(@F) + +cblas_zdotc.$(SUFFIX) cblas_zdotc.$(PSUFFIX) : zdot.c + $(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F) + +cblas_cdotu_sub.$(SUFFIX) cblas_cdotu_sub.$(PSUFFIX) : zdot.c + $(CC) $(CFLAGS) -DCBLAS -DFORCE_USE_STACK -c -UCONJ $< -o $(@F) + +cblas_cdotc_sub.$(SUFFIX) cblas_cdotc_sub.$(PSUFFIX) : zdot.c + $(CC) $(CFLAGS) -DCBLAS -DFORCE_USE_STACK -c -DCONJ $< -o $(@F) + +cblas_zdotu_sub.$(SUFFIX) cblas_zdotu_sub.$(PSUFFIX) : zdot.c + $(CC) $(CFLAGS) -DCBLAS -DFORCE_USE_STACK -c -UCONJ $< -o $(@F) + +cblas_zdotc_sub.$(SUFFIX) cblas_zdotc_sub.$(PSUFFIX) : zdot.c + $(CC) $(CFLAGS) -DCBLAS -DFORCE_USE_STACK -c -DCONJ $< -o $(@F) + +cblas_snrm2.$(SUFFIX) cblas_snrm2.$(PSUFFIX) : nrm2.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_dnrm2.$(SUFFIX) cblas_dnrm2.$(PSUFFIX) : nrm2.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_scnrm2.$(SUFFIX) cblas_scnrm2.$(PSUFFIX) : nrm2.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_dznrm2.$(SUFFIX) cblas_dznrm2.$(PSUFFIX) : nrm2.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_saxpy.$(SUFFIX) cblas_saxpy.$(PSUFFIX) : axpy.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_daxpy.$(SUFFIX) cblas_daxpy.$(PSUFFIX) : axpy.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_caxpy.$(SUFFIX) cblas_caxpy.$(PSUFFIX) : zaxpy.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_zaxpy.$(SUFFIX) cblas_zaxpy.$(PSUFFIX) : zaxpy.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_scopy.$(SUFFIX) cblas_scopy.$(PSUFFIX) : copy.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_dcopy.$(SUFFIX) cblas_dcopy.$(PSUFFIX) : copy.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_ccopy.$(SUFFIX) cblas_ccopy.$(PSUFFIX) : copy.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_zcopy.$(SUFFIX) cblas_zcopy.$(PSUFFIX) : copy.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_sswap.$(SUFFIX) cblas_sswap.$(PSUFFIX) : swap.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_dswap.$(SUFFIX) cblas_dswap.$(PSUFFIX) : swap.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_cswap.$(SUFFIX) cblas_cswap.$(PSUFFIX) : zswap.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_zswap.$(SUFFIX) cblas_zswap.$(PSUFFIX) : zswap.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_srot.$(SUFFIX) cblas_srot.$(PSUFFIX) : rot.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_drot.$(SUFFIX) cblas_drot.$(PSUFFIX) : rot.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_srotg.$(SUFFIX) cblas_srotg.$(PSUFFIX): rotg.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_drotg.$(SUFFIX) cblas_drotg.$(PSUFFIX): rotg.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_srotm.$(SUFFIX) cblas_srotm.$(PSUFFIX): rotm.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_drotm.$(SUFFIX) cblas_drotm.$(PSUFFIX): rotm.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_srotmg.$(SUFFIX) cblas_srotmg.$(PSUFFIX): rotmg.c + $(CC) 
$(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_drotmg.$(SUFFIX) cblas_drotmg.$(PSUFFIX): rotmg.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_sscal.$(SUFFIX) cblas_sscal.$(PSUFFIX) : scal.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_dscal.$(SUFFIX) cblas_dscal.$(PSUFFIX) : scal.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_cscal.$(SUFFIX) cblas_cscal.$(PSUFFIX) : zscal.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_zscal.$(SUFFIX) cblas_zscal.$(PSUFFIX) : zscal.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + +cblas_csscal.$(SUFFIX) cblas_csscal.$(PSUFFIX) : zscal.c + $(CC) $(CFLAGS) -DCBLAS -c -DSSCAL $< -o $(@F) + +cblas_zdscal.$(SUFFIX) cblas_zdscal.$(PSUFFIX) : zscal.c + $(CC) $(CFLAGS) -DCBLAS -c -DSSCAL $< -o $(@F) + +cblas_sgemv.$(SUFFIX) cblas_sgemv.$(PSUFFIX): gemv.c + $(CC) -DCBLAS -c $(CFLAGS) -o $(@F) $< + +cblas_dgemv.$(SUFFIX) cblas_dgemv.$(PSUFFIX): gemv.c + $(CC) -DCBLAS -c $(CFLAGS) -o $(@F) $< + +cblas_cgemv.$(SUFFIX) cblas_cgemv.$(PSUFFIX): zgemv.c + $(CC) -DCBLAS -c $(CFLAGS) -o $(@F) $< + +cblas_zgemv.$(SUFFIX) cblas_zgemv.$(PSUFFIX): zgemv.c + $(CC) -DCBLAS -c $(CFLAGS) -o $(@F) $< + +cblas_sger.$(SUFFIX) cblas_sger.$(PSUFFIX) : ger.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_dger.$(SUFFIX) cblas_dger.$(PSUFFIX) : ger.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_cgeru.$(SUFFIX) cblas_cgeru.$(PSUFFIX) : zger.c + $(CC) -DCBLAS -c $(CFLAGS) -UCONJ $< -o $(@F) + +cblas_cgerc.$(SUFFIX) cblas_cgerc.$(PSUFFIX) : zger.c + $(CC) -DCBLAS -c $(CFLAGS) -DCONJ $< -o $(@F) + +cblas_zgeru.$(SUFFIX) cblas_zgeru.$(PSUFFIX) : zger.c + $(CC) -DCBLAS -c $(CFLAGS) -UCONJ $< -o $(@F) + +cblas_zgerc.$(SUFFIX) cblas_zgerc.$(PSUFFIX) : zger.c + $(CC) -DCBLAS -c $(CFLAGS) -DCONJ $< -o $(@F) + +cblas_strsv.$(SUFFIX) cblas_strsv.$(PSUFFIX) : trsv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_dtrsv.$(SUFFIX) cblas_dtrsv.$(PSUFFIX) : trsv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ctrsv.$(SUFFIX) cblas_ctrsv.$(PSUFFIX) : ztrsv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ztrsv.$(SUFFIX) cblas_ztrsv.$(PSUFFIX) : ztrsv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_strmv.$(SUFFIX) cblas_strmv.$(PSUFFIX) : trmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_dtrmv.$(SUFFIX) cblas_dtrmv.$(PSUFFIX) : trmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ctrmv.$(SUFFIX) cblas_ctrmv.$(PSUFFIX) : ztrmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ztrmv.$(SUFFIX) cblas_ztrmv.$(PSUFFIX) : ztrmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ssyr.$(SUFFIX) cblas_ssyr.$(PSUFFIX) : syr.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_dsyr.$(SUFFIX) cblas_dsyr.$(PSUFFIX) : syr.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_cher.$(SUFFIX) cblas_cher.$(PSUFFIX) : zher.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_zher.$(SUFFIX) cblas_zher.$(PSUFFIX) : zher.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ssyr2.$(SUFFIX) cblas_ssyr2.$(PSUFFIX) : syr2.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_dsyr2.$(SUFFIX) cblas_dsyr2.$(PSUFFIX) : syr2.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_cher2.$(SUFFIX) cblas_cher2.$(PSUFFIX) : zher2.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_zher2.$(SUFFIX) cblas_zher2.$(PSUFFIX) : zher2.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_sgbmv.$(SUFFIX) cblas_sgbmv.$(PSUFFIX): gbmv.c + $(CC) -DCBLAS -c $(CFLAGS) -o $(@F) $< + +cblas_dgbmv.$(SUFFIX) cblas_dgbmv.$(PSUFFIX): gbmv.c + $(CC) -DCBLAS -c $(CFLAGS) -o $(@F) $< + +cblas_cgbmv.$(SUFFIX) cblas_cgbmv.$(PSUFFIX): 
zgbmv.c + $(CC) -DCBLAS -c $(CFLAGS) -o $(@F) $< + +cblas_zgbmv.$(SUFFIX) cblas_zgbmv.$(PSUFFIX): zgbmv.c + $(CC) -DCBLAS -c $(CFLAGS) -o $(@F) $< + +cblas_ssbmv.$(SUFFIX) cblas_ssbmv.$(PSUFFIX) : sbmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_dsbmv.$(SUFFIX) cblas_dsbmv.$(PSUFFIX) : sbmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_chbmv.$(SUFFIX) cblas_chbmv.$(PSUFFIX) : zhbmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_zhbmv.$(SUFFIX) cblas_zhbmv.$(PSUFFIX) : zhbmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_sspmv.$(SUFFIX) cblas_sspmv.$(PSUFFIX) : spmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_dspmv.$(SUFFIX) cblas_dspmv.$(PSUFFIX) : spmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_sspr.$(SUFFIX) cblas_sspr.$(PSUFFIX) : spr.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_dspr.$(SUFFIX) cblas_dspr.$(PSUFFIX) : spr.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_chpr.$(SUFFIX) cblas_chpr.$(PSUFFIX) : zhpr.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_zhpr.$(SUFFIX) cblas_zhpr.$(PSUFFIX) : zhpr.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_sspr2.$(SUFFIX) cblas_sspr2.$(PSUFFIX) : spr2.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_dspr2.$(SUFFIX) cblas_dspr2.$(PSUFFIX) : spr2.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_chpr2.$(SUFFIX) cblas_chpr2.$(PSUFFIX) : zhpr2.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_zhpr2.$(SUFFIX) cblas_zhpr2.$(PSUFFIX) : zhpr2.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_stbmv.$(SUFFIX) cblas_stbmv.$(PSUFFIX) : tbmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_dtbmv.$(SUFFIX) cblas_dtbmv.$(PSUFFIX) : tbmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ctbmv.$(SUFFIX) cblas_ctbmv.$(PSUFFIX) : ztbmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ztbmv.$(SUFFIX) cblas_ztbmv.$(PSUFFIX) : ztbmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_stbsv.$(SUFFIX) cblas_stbsv.$(PSUFFIX) : tbsv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_dtbsv.$(SUFFIX) cblas_dtbsv.$(PSUFFIX) : tbsv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ctbsv.$(SUFFIX) cblas_ctbsv.$(PSUFFIX) : ztbsv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ztbsv.$(SUFFIX) cblas_ztbsv.$(PSUFFIX) : ztbsv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_stpmv.$(SUFFIX) cblas_stpmv.$(PSUFFIX) : tpmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_dtpmv.$(SUFFIX) cblas_dtpmv.$(PSUFFIX) : tpmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ctpmv.$(SUFFIX) cblas_ctpmv.$(PSUFFIX) : ztpmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ztpmv.$(SUFFIX) cblas_ztpmv.$(PSUFFIX) : ztpmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_chpmv.$(SUFFIX) cblas_chpmv.$(PSUFFIX) : zhpmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_zhpmv.$(SUFFIX) cblas_zhpmv.$(PSUFFIX) : zhpmv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_stpsv.$(SUFFIX) cblas_stpsv.$(PSUFFIX) : tpsv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_dtpsv.$(SUFFIX) cblas_dtpsv.$(PSUFFIX) : tpsv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ctpsv.$(SUFFIX) cblas_ctpsv.$(PSUFFIX) : ztpsv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ztpsv.$(SUFFIX) cblas_ztpsv.$(PSUFFIX) : ztpsv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ssymv.$(SUFFIX) cblas_ssymv.$(PSUFFIX) : symv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_dsymv.$(SUFFIX) cblas_dsymv.$(PSUFFIX) : symv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_chemv.$(SUFFIX) cblas_chemv.$(PSUFFIX) : zhemv.c + $(CC) 
-DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_zhemv.$(SUFFIX) cblas_zhemv.$(PSUFFIX) : zhemv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_sgemm.$(SUFFIX) cblas_sgemm.$(PSUFFIX) : gemm.c ../param.h + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_dgemm.$(SUFFIX) cblas_dgemm.$(PSUFFIX) : gemm.c ../param.h + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_cgemm.$(SUFFIX) cblas_cgemm.$(PSUFFIX) : gemm.c ../param.h + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_zgemm.$(SUFFIX) cblas_zgemm.$(PSUFFIX) : gemm.c ../param.h + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ssymm.$(SUFFIX) cblas_ssymm.$(PSUFFIX) : symm.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_dsymm.$(SUFFIX) cblas_dsymm.$(PSUFFIX) : symm.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_csymm.$(SUFFIX) cblas_csymm.$(PSUFFIX) : symm.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_zsymm.$(SUFFIX) cblas_zsymm.$(PSUFFIX) : symm.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ssyrk.$(SUFFIX) cblas_ssyrk.$(PSUFFIX) : syrk.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_dsyrk.$(SUFFIX) cblas_dsyrk.$(PSUFFIX) : syrk.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_csyrk.$(SUFFIX) cblas_csyrk.$(PSUFFIX) : syrk.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_zsyrk.$(SUFFIX) cblas_zsyrk.$(PSUFFIX) : syrk.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ssyr2k.$(SUFFIX) cblas_ssyr2k.$(PSUFFIX) : syr2k.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_dsyr2k.$(SUFFIX) cblas_dsyr2k.$(PSUFFIX) : syr2k.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_csyr2k.$(SUFFIX) cblas_csyr2k.$(PSUFFIX) : syr2k.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_zsyr2k.$(SUFFIX) cblas_zsyr2k.$(PSUFFIX) : syr2k.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_strmm.$(SUFFIX) cblas_strmm.$(PSUFFIX) : trsm.c + $(CC) -DCBLAS -c $(CFLAGS) -DTRMM $< -o $(@F) + +cblas_dtrmm.$(SUFFIX) cblas_dtrmm.$(PSUFFIX) : trsm.c + $(CC) -DCBLAS -c $(CFLAGS) -DTRMM $< -o $(@F) + +cblas_ctrmm.$(SUFFIX) cblas_ctrmm.$(PSUFFIX) : trsm.c + $(CC) -DCBLAS -c $(CFLAGS) -DTRMM $< -o $(@F) + +cblas_ztrmm.$(SUFFIX) cblas_ztrmm.$(PSUFFIX) : trsm.c + $(CC) -DCBLAS -c $(CFLAGS) -DTRMM $< -o $(@F) + +cblas_strsm.$(SUFFIX) cblas_strsm.$(PSUFFIX) : trsm.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_dtrsm.$(SUFFIX) cblas_dtrsm.$(PSUFFIX) : trsm.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ctrsm.$(SUFFIX) cblas_ctrsm.$(PSUFFIX) : trsm.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_ztrsm.$(SUFFIX) cblas_ztrsm.$(PSUFFIX) : trsm.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_chemm.$(SUFFIX) cblas_chemm.$(PSUFFIX) : symm.c + $(CC) -DCBLAS -c $(CFLAGS) -DHEMM $< -o $(@F) + +cblas_zhemm.$(SUFFIX) cblas_zhemm.$(PSUFFIX) : symm.c + $(CC) -DCBLAS -c $(CFLAGS) -DHEMM $< -o $(@F) + +cblas_cherk.$(SUFFIX) cblas_cherk.$(PSUFFIX) : syrk.c + $(CC) -DCBLAS -c $(CFLAGS) -DHEMM $< -o $(@F) + +cblas_zherk.$(SUFFIX) cblas_zherk.$(PSUFFIX) : syrk.c + $(CC) -DCBLAS -c $(CFLAGS) -DHEMM $< -o $(@F) + +cblas_cher2k.$(SUFFIX) cblas_cher2k.$(PSUFFIX) : syr2k.c + $(CC) -DCBLAS -c $(CFLAGS) -DHEMM $< -o $(@F) + +cblas_zher2k.$(SUFFIX) cblas_zher2k.$(PSUFFIX) : syr2k.c + $(CC) -DCBLAS -c $(CFLAGS) -DHEMM $< -o $(@F) + +sgetf2.$(SUFFIX) sgetf2.$(PSUFFIX) : getf2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dgetf2.$(SUFFIX) dgetf2.$(PSUFFIX) : getf2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qgetf2.$(SUFFIX) qgetf2.$(PSUFFIX) : getf2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +cgetf2.$(SUFFIX) cgetf2.$(PSUFFIX) : zgetf2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zgetf2.$(SUFFIX) 
zgetf2.$(PSUFFIX) : zgetf2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xgetf2.$(SUFFIX) xgetf2.$(PSUFFIX) : zgetf2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +sgetrf.$(SUFFIX) sgetrf.$(PSUFFIX) : getrf.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dgetrf.$(SUFFIX) dgetrf.$(PSUFFIX) : getrf.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qgetrf.$(SUFFIX) qgetrf.$(PSUFFIX) : getrf.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +cgetrf.$(SUFFIX) cgetrf.$(PSUFFIX) : zgetrf.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zgetrf.$(SUFFIX) zgetrf.$(PSUFFIX) : zgetrf.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xgetrf.$(SUFFIX) xgetrf.$(PSUFFIX) : zgetrf.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +slauu2.$(SUFFIX) slauu2.$(PSUFFIX) : lauu2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dlauu2.$(SUFFIX) dlauu2.$(PSUFFIX) : lauu2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qlauu2.$(SUFFIX) qlauu2.$(PSUFFIX) : lauu2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +clauu2.$(SUFFIX) clauu2.$(PSUFFIX) : zlauu2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zlauu2.$(SUFFIX) zlauu2.$(PSUFFIX) : zlauu2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xlauu2.$(SUFFIX) xlauu2.$(PSUFFIX) : zlauu2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +slauum.$(SUFFIX) slauum.$(PSUFFIX) : lauum.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dlauum.$(SUFFIX) dlauum.$(PSUFFIX) : lauum.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qlauum.$(SUFFIX) qlauum.$(PSUFFIX) : lauum.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +clauum.$(SUFFIX) clauum.$(PSUFFIX) : zlauum.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zlauum.$(SUFFIX) zlauum.$(PSUFFIX) : zlauum.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xlauum.$(SUFFIX) xlauum.$(PSUFFIX) : zlauum.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +spotf2.$(SUFFIX) spotf2.$(PSUFFIX) : potf2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dpotf2.$(SUFFIX) dpotf2.$(PSUFFIX) : potf2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qpotf2.$(SUFFIX) qpotf2.$(PSUFFIX) : potf2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +cpotf2.$(SUFFIX) cpotf2.$(PSUFFIX) : zpotf2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zpotf2.$(SUFFIX) zpotf2.$(PSUFFIX) : zpotf2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xpotf2.$(SUFFIX) xpotf2.$(PSUFFIX) : zpotf2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +spotrf.$(SUFFIX) spotrf.$(PSUFFIX) : potrf.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dpotrf.$(SUFFIX) dpotrf.$(PSUFFIX) : potrf.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qpotrf.$(SUFFIX) qpotrf.$(PSUFFIX) : potrf.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +cpotrf.$(SUFFIX) cpotrf.$(PSUFFIX) : zpotrf.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zpotrf.$(SUFFIX) zpotrf.$(PSUFFIX) : zpotrf.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xpotrf.$(SUFFIX) xpotrf.$(PSUFFIX) : zpotrf.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +strti2.$(SUFFIX) strti2.$(PSUFFIX) : trti2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dtrti2.$(SUFFIX) dtrti2.$(PSUFFIX) : trti2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qtrti2.$(SUFFIX) qtrti2.$(PSUFFIX) : trti2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ctrti2.$(SUFFIX) ctrti2.$(PSUFFIX) : ztrti2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ztrti2.$(SUFFIX) ztrti2.$(PSUFFIX) : ztrti2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xtrti2.$(SUFFIX) xtrti2.$(PSUFFIX) : ztrti2.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +strtri.$(SUFFIX) strtri.$(PSUFFIX) : trtri.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dtrtri.$(SUFFIX) dtrtri.$(PSUFFIX) : trtri.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qtrtri.$(SUFFIX) qtrtri.$(PSUFFIX) : trtri.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ctrtri.$(SUFFIX) ctrtri.$(PSUFFIX) : ztrtri.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +ztrtri.$(SUFFIX) ztrtri.$(PSUFFIX) : ztrtri.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xtrtri.$(SUFFIX) xtrtri.$(PSUFFIX) : ztrtri.c + $(CC) -c $(CFLAGS) 
$< -o $(@F) + +slaswp.$(SUFFIX) slaswp.$(PSUFFIX) : laswp.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dlaswp.$(SUFFIX) dlaswp.$(PSUFFIX) : laswp.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qlaswp.$(SUFFIX) qlaswp.$(PSUFFIX) : laswp.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +claswp.$(SUFFIX) claswp.$(PSUFFIX) : zlaswp.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zlaswp.$(SUFFIX) zlaswp.$(PSUFFIX) : zlaswp.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xlaswp.$(SUFFIX) xlaswp.$(PSUFFIX) : zlaswp.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +sgetrs.$(SUFFIX) sgetrs.$(PSUFFIX) : getrs.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dgetrs.$(SUFFIX) dgetrs.$(PSUFFIX) : getrs.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qgetrs.$(SUFFIX) qgetrs.$(PSUFFIX) : getrs.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +cgetrs.$(SUFFIX) cgetrs.$(PSUFFIX) : zgetrs.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zgetrs.$(SUFFIX) zgetrs.$(PSUFFIX) : zgetrs.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xgetrs.$(SUFFIX) xgetrs.$(PSUFFIX) : zgetrs.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +sgesv.$(SUFFIX) sgesv.$(PSUFFIX) : gesv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dgesv.$(SUFFIX) dgesv.$(PSUFFIX) : gesv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qgesv.$(SUFFIX) qgesv.$(PSUFFIX) : gesv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +cgesv.$(SUFFIX) cgesv.$(PSUFFIX) : gesv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zgesv.$(SUFFIX) zgesv.$(PSUFFIX) : gesv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xgesv.$(SUFFIX) xgesv.$(PSUFFIX) : gesv.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +spotri.$(SUFFIX) spotri.$(PSUFFIX) : potri.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dpotri.$(SUFFIX) dpotri.$(PSUFFIX) : potri.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qpotri.$(SUFFIX) qpotri.$(PSUFFIX) : potri.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +cpotri.$(SUFFIX) cpotri.$(PSUFFIX) : zpotri.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zpotri.$(SUFFIX) zpotri.$(PSUFFIX) : zpotri.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xpotri.$(SUFFIX) xpotri.$(PSUFFIX) : zpotri.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +slarf.$(SUFFIX) slarf.$(PSUFFIX) : larf.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dlarf.$(SUFFIX) dlarf.$(PSUFFIX) : larf.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +qlarf.$(SUFFIX) qlarf.$(PSUFFIX) : larf.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +clarf.$(SUFFIX) clarf.$(PSUFFIX) : larf.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zlarf.$(SUFFIX) zlarf.$(PSUFFIX) : larf.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +xlarf.$(SUFFIX) xlarf.$(PSUFFIX) : larf.c + $(CC) -c $(CFLAGS) $< -o $(@F) + diff --git a/interface/asum.c b/interface/asum.c new file mode 100644 index 0000000000..634836e28a --- /dev/null +++ b/interface/asum.c @@ -0,0 +1,93 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifndef CBLAS + +FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){ + + BLASLONG n = *N; + BLASLONG incx = *INCX; + FLOATRET ret; + + PRINT_DEBUG_NAME; + + if (n <= 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + ret = (FLOATRET)ASUM_K(n, x, incx); + + FUNCTION_PROFILE_END(COMPSIZE, n, n); + + IDEBUG_END; + + return ret; +} + +#else + +FLOAT CNAME(blasint n, FLOAT *x, blasint incx){ + + FLOAT ret; + + PRINT_DEBUG_CNAME; + + if (n <= 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + ret = ASUM_K(n, x, incx); + + FUNCTION_PROFILE_END(COMPSIZE, n, n); + + IDEBUG_END; + + return ret; +} + +#endif diff --git a/interface/axpy.c b/interface/axpy.c new file mode 100644 index 0000000000..03b981985a --- /dev/null +++ b/interface/axpy.c @@ -0,0 +1,112 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifndef CBLAS + +void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ + + BLASLONG n = *N; + BLASLONG incx = *INCX; + BLASLONG incy = *INCY; + FLOAT alpha = *ALPHA; + +#else + +void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint incy){ + +#endif + +#ifdef SMP + int mode, nthreads; +#endif + +#ifndef CBLAS + PRINT_DEBUG_NAME; +#else + PRINT_DEBUG_CNAME; +#endif + + if (n <= 0) return; + + if (alpha == ZERO) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0) x -= (n - 1) * incx; + if (incy < 0) y -= (n - 1) * incy; + +#ifdef SMP + nthreads = num_cpu_avail(1); + + if (nthreads == 1) { +#endif + + AXPYU_K(n, 0, 0, alpha, x, incx, y, incy, NULL, 0); + +#ifdef SMP + } else { + +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_REAL; +#else + mode = BLAS_SINGLE | BLAS_REAL; +#endif + + blas_level1_thread(mode, n, 0, 0, &alpha, + x, incx, y, incy, NULL, 0, (void *)AXPYU_K, nthreads); + + } +#endif + + FUNCTION_PROFILE_END(1, 2 * n, 2 * n); + + IDEBUG_END; + + return; + +} diff --git a/interface/copy.c b/interface/copy.c new file mode 100644 index 0000000000..6965682ec1 --- /dev/null +++ b/interface/copy.c @@ -0,0 +1,80 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifndef CBLAS + +void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ + + BLASLONG n = *N; + BLASLONG incx = *INCX; + BLASLONG incy = *INCY; + + PRINT_DEBUG_NAME; + +#else + +void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){ + + PRINT_DEBUG_CNAME; + +#endif + + if (n <= 0) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0) x -= (n - 1) * incx * COMPSIZE; + if (incy < 0) y -= (n - 1) * incy * COMPSIZE; + + COPY_K(n, x, incx, y, incy); + + FUNCTION_PROFILE_END(COMPSIZE, COMPSIZE * n, 0); + + IDEBUG_END; + + return; + +} diff --git a/interface/create b/interface/create new file mode 100644 index 0000000000..b7be8ab6e4 --- /dev/null +++ b/interface/create @@ -0,0 +1,22 @@ +#!/usr/bin/perl + +$count = 0; + +foreach (@ARGV) { + print "#define\tinterface_", $_, "\t\t", $count, "\n"; + $count ++; +} + +print "#ifdef USE_FUNCTABLE\n"; + +print "#define MAX_PROF_TABLE ", $count, "\n"; + +print "static char *func_table[] = {\n"; + +foreach (@ARGV) { + print "\"", $_, "\",\n"; +} + +print "};\n"; +print "#endif\n"; + diff --git a/interface/dot.c b/interface/dot.c new file mode 100644 index 0000000000..3744db5eac --- /dev/null +++ b/interface/dot.c @@ -0,0 +1,101 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifndef CBLAS + +FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ + + BLASLONG n = *N; + BLASLONG incx = *INCX; + BLASLONG incy = *INCY; + FLOATRET ret; + + PRINT_DEBUG_NAME; + + if (n <= 0) return 0.; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0) x -= (n - 1) * incx; + if (incy < 0) y -= (n - 1) * incy; + + ret = (FLOATRET)DOTU_K(n, x, incx, y, incy); + + FUNCTION_PROFILE_END(1, 2 * n, 2 * n); + + IDEBUG_END; + + return ret; +} + +#else + +FLOAT CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){ + + FLOAT ret; + + PRINT_DEBUG_CNAME; + + if (n <= 0) return 0.; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0) x -= (n - 1) * incx; + if (incy < 0) y -= (n - 1) * incy; + + ret = DOTU_K(n, x, incx, y, incy); + + FUNCTION_PROFILE_END(1, 2 * n, 2 * n); + + IDEBUG_END; + + return ret; + +} + +#endif diff --git a/interface/dsdot.c b/interface/dsdot.c new file mode 100644 index 0000000000..66f7917d57 --- /dev/null +++ b/interface/dsdot.c @@ -0,0 +1,99 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifndef CBLAS + +double NAME(blasint *N, float *x, blasint *INCX, float *y, blasint *INCY){ + + BLASLONG n = *N; + BLASLONG incx = *INCX; + BLASLONG incy = *INCY; + + PRINT_DEBUG_NAME; + + if (n <= 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0) x -= (n - 1) * incx; + if (incy < 0) y -= (n - 1) * incy; + + return DSDOT_K(n, x, incx, y, incy); + + FUNCTION_PROFILE_END(1, n, n); + + IDEBUG_END; + + return 0; + +} + +#else + +double CNAME(blasint n, float *x, blasint incx, float *y, blasint incy){ + + PRINT_DEBUG_CNAME; + + if (n <= 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0) x -= (n - 1) * incx; + if (incy < 0) y -= (n - 1) * incy; + + return DSDOT_K(n, x, incx, y, incy); + + FUNCTION_PROFILE_END(1, n, n); + + IDEBUG_END; + + return 0; + +} + +#endif diff --git a/interface/gbmv.c b/interface/gbmv.c new file mode 100644 index 0000000000..a76c48d045 --- /dev/null +++ b/interface/gbmv.c @@ -0,0 +1,252 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QGBMV " +#elif defined(DOUBLE) +#define ERROR_NAME "DGBMV " +#else +#define ERROR_NAME "SGBMV " +#endif + +static void (*gbmv[])(BLASLONG, BLASLONG, BLASLONG, BLASLONG, FLOAT, + FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, void *) = { +#ifdef XDOUBLE + qgbmv_n, qgbmv_t, +#elif defined(DOUBLE) + dgbmv_n, dgbmv_t, +#else + sgbmv_n, sgbmv_t, +#endif +}; + +#ifdef SMP +static int (*gbmv_thread[])(BLASLONG, BLASLONG, BLASLONG, BLASLONG, FLOAT, + FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { +#ifdef XDOUBLE + qgbmv_thread_n, qgbmv_thread_t, +#elif defined(DOUBLE) + dgbmv_thread_n, dgbmv_thread_t, +#else + sgbmv_thread_n, sgbmv_thread_t, +#endif +}; +#endif + +#ifndef CBLAS + +void NAME(char *TRANS, blasint *M, blasint *N, + blasint *KU, blasint *KL, + FLOAT *ALPHA, FLOAT *a, blasint *LDA, + FLOAT *x, blasint *INCX, + FLOAT *BETA, FLOAT *y, blasint *INCY){ + + char trans = *TRANS; + blasint m = *M; + blasint n = *N; + blasint ku = *KU; + blasint kl = *KL; + blasint lda = *LDA; + blasint incx = *INCX; + blasint incy = *INCY; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + FLOAT alpha = *ALPHA; + FLOAT beta = *BETA; + + blasint info; + blasint lenx, leny; + blasint i; + + PRINT_DEBUG_NAME; + + TOUPPER(trans); + + info = 0; + + i = -1; + + if (trans == 'N') i = 0; + if (trans == 'T') i = 1; + if (trans == 'R') i = 0; + if (trans == 'C') i = 1; + + if (incy == 0) info = 13; + if (incx == 0) info = 10; + if (lda < kl + ku + 1) info = 8; + if (kl < 0) info = 5; + if (ku < 0) info = 4; + if (n < 0) info = 3; + if (m < 0) info = 2; + if (i < 0) info = 1; + + trans = i; + + if (info != 0){ + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, + enum CBLAS_TRANSPOSE TransA, + blasint m, blasint n, + blasint ku, blasint kl, + FLOAT alpha, + FLOAT *a, blasint lda, + FLOAT *x, blasint incx, + FLOAT beta, + FLOAT *y, blasint incy){ + + FLOAT *buffer; + blasint lenx, leny, info, t; + int trans; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_CNAME; + + trans = -1; + info = 0; + + if (order == CblasColMajor) { + if (TransA == CblasNoTrans) trans = 0; + if (TransA == CblasTrans) trans = 1; + if (TransA == CblasConjNoTrans) trans = 0; + if (TransA == CblasConjTrans) trans = 1; + + info = -1; + + if (incy == 0) info = 13; + if (incx == 0) info = 10; + if (lda < kl + ku + 1) info = 8; + if (kl < 0) info = 5; + if (ku < 0) info = 4; + if (n < 0) info = 3; + if 
(m < 0) info = 2; + if (trans < 0) info = 1; + } + + if (order == CblasRowMajor) { + if (TransA == CblasNoTrans) trans = 1; + if (TransA == CblasTrans) trans = 0; + if (TransA == CblasConjNoTrans) trans = 1; + if (TransA == CblasConjTrans) trans = 0; + + info = -1; + + t = n; + n = m; + m = t; + + t = ku; + ku = kl; + kl = t; + + if (incy == 0) info = 13; + if (incx == 0) info = 10; + if (lda < kl + ku + 1) info = 8; + if (kl < 0) info = 5; + if (ku < 0) info = 4; + if (n < 0) info = 3; + if (m < 0) info = 2; + if (trans < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if ((m==0) || (n==0)) return; + + lenx = n; + leny = m; + if (trans) lenx = m; + if (trans) leny = n; + + if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); + + if (alpha == ZERO) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0) x -= (lenx-1)*incx; + if (incy < 0) y -= (leny-1)*incy; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (gbmv[(int)trans])(m, n, kl, ku, alpha, a, lda, x, incx, y, incy, buffer); + +#ifdef SMP + } else { + + (gbmv_thread[(int)trans])(m, n, kl, ku, alpha, a, lda, x, incx, y, incy, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(1, m * n / 2 + n, m * n); + + IDEBUG_END; + + return; +} diff --git a/interface/gemm.c b/interface/gemm.c new file mode 100644 index 0000000000..7919f822e9 --- /dev/null +++ b/interface/gemm.c @@ -0,0 +1,452 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
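/* Editorial usage sketch (not part of the imported sources): the gbmv
 * wrapper above expects BLAS band storage, with element A(i,j) held at
 * AB(ku+1+i-j, j) and lda >= kl+ku+1.  The trailing-underscore symbol
 * name sgbmv_ and 32-bit integers are assumptions about the build. */
#include <stdio.h>

extern void sgbmv_(char *trans, int *m, int *n, int *kl, int *ku,
                   float *alpha, float *a, int *lda, float *x, int *incx,
                   float *beta, float *y, int *incy);

int main(void) {
  /* 4x4 tridiagonal matrix (kl = ku = 1) packed column-major, lda = 3 */
  float ab[] = { 0, 2, 1,   1, 2, 1,   1, 2, 1,   1, 2, 0 };
  float x[]  = { 1, 1, 1, 1 };
  float y[4];
  int m = 4, n = 4, kl = 1, ku = 1, lda = 3, inc = 1;
  float alpha = 1.0f, beta = 0.0f;

  sgbmv_("N", &m, &n, &kl, &ku, &alpha, ab, &lda, x, &inc, &beta, y, &inc);
  printf("%g %g %g %g\n", y[0], y[1], y[2], y[3]);  /* expected: 3 4 4 3 */
  return 0;
}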
*/ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifndef COMPLEX +#ifdef XDOUBLE +#define ERROR_NAME "QGEMM " +#elif defined(DOUBLE) +#define ERROR_NAME "DGEMM " +#else +#define ERROR_NAME "SGEMM " +#endif +#else +#ifndef GEMM3M +#ifdef XDOUBLE +#define ERROR_NAME "XGEMM " +#elif defined(DOUBLE) +#define ERROR_NAME "ZGEMM " +#else +#define ERROR_NAME "CGEMM " +#endif +#else +#ifdef XDOUBLE +#define ERROR_NAME "XGEMM3M " +#elif defined(DOUBLE) +#define ERROR_NAME "ZGEMM3M " +#else +#define ERROR_NAME "CGEMM3M " +#endif +#endif +#endif + +static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { +#ifndef GEMM3M + GEMM_NN, GEMM_TN, GEMM_RN, GEMM_CN, + GEMM_NT, GEMM_TT, GEMM_RT, GEMM_CT, + GEMM_NR, GEMM_TR, GEMM_RR, GEMM_CR, + GEMM_NC, GEMM_TC, GEMM_RC, GEMM_CC, +#if defined(SMP) && !defined(USE_SIMPLE_THREADED_LEVEL3) + GEMM_THREAD_NN, GEMM_THREAD_TN, GEMM_THREAD_RN, GEMM_THREAD_CN, + GEMM_THREAD_NT, GEMM_THREAD_TT, GEMM_THREAD_RT, GEMM_THREAD_CT, + GEMM_THREAD_NR, GEMM_THREAD_TR, GEMM_THREAD_RR, GEMM_THREAD_CR, + GEMM_THREAD_NC, GEMM_THREAD_TC, GEMM_THREAD_RC, GEMM_THREAD_CC, +#endif +#else + GEMM3M_NN, GEMM3M_TN, GEMM3M_RN, GEMM3M_CN, + GEMM3M_NT, GEMM3M_TT, GEMM3M_RT, GEMM3M_CT, + GEMM3M_NR, GEMM3M_TR, GEMM3M_RR, GEMM3M_CR, + GEMM3M_NC, GEMM3M_TC, GEMM3M_RC, GEMM3M_CC, +#if defined(SMP) && !defined(USE_SIMPLE_THREADED_LEVEL3) + GEMM3M_THREAD_NN, GEMM3M_THREAD_TN, GEMM3M_THREAD_RN, GEMM3M_THREAD_CN, + GEMM3M_THREAD_NT, GEMM3M_THREAD_TT, GEMM3M_THREAD_RT, GEMM3M_THREAD_CT, + GEMM3M_THREAD_NR, GEMM3M_THREAD_TR, GEMM3M_THREAD_RR, GEMM3M_THREAD_CR, + GEMM3M_THREAD_NC, GEMM3M_THREAD_TC, GEMM3M_THREAD_RC, GEMM3M_THREAD_CC, +#endif +#endif +}; + +#ifndef CBLAS + +void NAME(char *TRANSA, char *TRANSB, + blasint *M, blasint *N, blasint *K, + FLOAT *alpha, + FLOAT *a, blasint *ldA, + FLOAT *b, blasint *ldB, + FLOAT *beta, + FLOAT *c, blasint *ldC){ + + blas_arg_t args; + + int transa, transb, nrowa, nrowb; + blasint info; + + char transA, transB; + FLOAT *buffer; + FLOAT *sa, *sb; + +#ifdef SMP +#ifndef COMPLEX +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_REAL; +#else + int mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + int mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif +#endif + +#if defined(SMP) && !defined(NO_AFFINITY) && !defined(USE_SIMPLE_THREADED_LEVEL3) + int nodes; +#endif + + PRINT_DEBUG_NAME; + + args.m = *M; + args.n = *N; + args.k = *K; + + args.a = (void *)a; + args.b = (void *)b; + args.c = (void *)c; + + args.lda = *ldA; + args.ldb = *ldB; + args.ldc = *ldC; + + args.alpha = (void *)alpha; + args.beta = (void *)beta; + + transA = *TRANSA; + transB = *TRANSB; + + TOUPPER(transA); + TOUPPER(transB); + + transa = -1; + transb = -1; + + if (transA == 'N') transa = 0; + if (transA == 'T') transa = 1; +#ifndef COMPLEX + if (transA == 'R') transa = 0; + if (transA == 'C') transa = 1; +#else + if (transA == 'R') transa = 2; + if (transA == 'C') transa = 3; +#endif + + if (transB == 'N') transb = 0; + if (transB == 'T') transb = 1; +#ifndef COMPLEX + if (transB == 'R') transb = 0; + if (transB == 'C') transb = 1; +#else + if (transB == 'R') transb = 2; + if (transB == 'C') transb = 3; +#endif + + nrowa = args.m; + if (transa & 1) nrowa = 
args.k; + nrowb = args.k; + if (transb & 1) nrowb = args.n; + + info = 0; + + if (args.ldc < args.m) info = 13; + if (args.ldb < nrowb) info = 10; + if (args.lda < nrowa) info = 8; + if (args.k < 0) info = 5; + if (args.n < 0) info = 4; + if (args.m < 0) info = 3; + if (transb < 0) info = 2; + if (transa < 0) info = 1; + + if (info){ + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, + blasint m, blasint n, blasint k, +#ifndef COMPLEX + FLOAT alpha, +#else + FLOAT *alpha, +#endif + FLOAT *a, blasint lda, + FLOAT *b, blasint ldb, +#ifndef COMPLEX + FLOAT beta, +#else + FLOAT *beta, +#endif + FLOAT *c, blasint ldc) { + + blas_arg_t args; + int transa, transb; + blasint nrowa, nrowb, info; + + XFLOAT *buffer; + XFLOAT *sa, *sb; + +#ifdef SMP +#ifndef COMPLEX +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_REAL; +#else + int mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + int mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif +#endif + +#if defined(SMP) && !defined(NO_AFFINITY) && !defined(USE_SIMPLE_THREADED_LEVEL3) + int nodes; +#endif + + PRINT_DEBUG_CNAME; + +#ifndef COMPLEX + args.alpha = (void *)α + args.beta = (void *)β +#else + args.alpha = (void *)alpha; + args.beta = (void *)beta; +#endif + + transa = -1; + transb = -1; + info = 0; + + if (order == CblasColMajor) { + args.m = m; + args.n = n; + args.k = k; + + args.a = (void *)a; + args.b = (void *)b; + args.c = (void *)c; + + args.lda = lda; + args.ldb = ldb; + args.ldc = ldc; + + if (TransA == CblasNoTrans) transa = 0; + if (TransA == CblasTrans) transa = 1; +#ifndef COMPLEX + if (TransA == CblasConjNoTrans) transa = 0; + if (TransA == CblasConjTrans) transa = 1; +#else + if (TransA == CblasConjNoTrans) transa = 2; + if (TransA == CblasConjTrans) transa = 3; +#endif + if (TransB == CblasNoTrans) transb = 0; + if (TransB == CblasTrans) transb = 1; +#ifndef COMPLEX + if (TransB == CblasConjNoTrans) transb = 0; + if (TransB == CblasConjTrans) transb = 1; +#else + if (TransB == CblasConjNoTrans) transb = 2; + if (TransB == CblasConjTrans) transb = 3; +#endif + + nrowa = args.m; + if (transa & 1) nrowa = args.k; + nrowb = args.k; + if (transb & 1) nrowb = args.n; + + info = -1; + + if (args.ldc < args.m) info = 13; + if (args.ldb < nrowb) info = 10; + if (args.lda < nrowa) info = 8; + if (args.k < 0) info = 5; + if (args.n < 0) info = 4; + if (args.m < 0) info = 3; + if (transb < 0) info = 2; + if (transa < 0) info = 1; + } + + if (order == CblasRowMajor) { + args.m = n; + args.n = m; + args.k = k; + + args.a = (void *)b; + args.b = (void *)a; + args.c = (void *)c; + + args.lda = ldb; + args.ldb = lda; + args.ldc = ldc; + + if (TransB == CblasNoTrans) transa = 0; + if (TransB == CblasTrans) transa = 1; +#ifndef COMPLEX + if (TransB == CblasConjNoTrans) transa = 0; + if (TransB == CblasConjTrans) transa = 1; +#else + if (TransB == CblasConjNoTrans) transa = 2; + if (TransB == CblasConjTrans) transa = 3; +#endif + if (TransA == CblasNoTrans) transb = 0; + if (TransA == CblasTrans) transb = 1; +#ifndef COMPLEX + if (TransA == CblasConjNoTrans) transb = 0; + if (TransA == CblasConjTrans) transb = 1; +#else + if (TransA == CblasConjNoTrans) transb = 2; + if (TransA == CblasConjTrans) transb = 3; +#endif + + nrowa = args.m; + if 
(transa & 1) nrowa = args.k; + nrowb = args.k; + if (transb & 1) nrowb = args.n; + + info = -1; + + if (args.ldc < args.m) info = 13; + if (args.ldb < nrowb) info = 10; + if (args.lda < nrowa) info = 8; + if (args.k < 0) info = 5; + if (args.n < 0) info = 4; + if (args.m < 0) info = 3; + if (transb < 0) info = 2; + if (transa < 0) info = 1; + + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if ((args.m == 0) || (args.n == 0)) return; + +#if 0 + fprintf(stderr, "m = %4d n = %d k = %d lda = %4d ldb = %4d ldc = %4d\n", + args.m, args.n, args.k, args.lda, args.ldb, args.ldc); +#endif + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + buffer = (XFLOAT *)blas_memory_alloc(0); + + sa = (XFLOAT *)((BLASLONG)buffer +GEMM_OFFSET_A); + sb = (XFLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + +#ifdef SMP + mode |= (transa << BLAS_TRANSA_SHIFT); + mode |= (transb << BLAS_TRANSB_SHIFT); + + args.common = NULL; + args.nthreads = num_cpu_avail(3); + + if (args.nthreads == 1) { +#endif + + (gemm[(transb << 2) | transa])(&args, NULL, NULL, sa, sb, 0); + +#ifdef SMP + + } else { + +#ifndef USE_SIMPLE_THREADED_LEVEL3 + +#ifndef NO_AFFINITY + nodes = get_num_nodes(); + + if ((nodes > 1) && get_node_equal()) { + + args.nthreads /= nodes; + + gemm_thread_mn(mode, &args, NULL, NULL, gemm[16 | (transb << 2) | transa], sa, sb, nodes); + + } else { +#endif + + (gemm[16 | (transb << 2) | transa])(&args, NULL, NULL, sa, sb, 0); + +#else + + GEMM_THREAD(mode, &args, NULL, NULL, gemm[(transb << 2) | transa], sa, sb, args.nthreads); + +#endif + +#ifndef USE_SIMPLE_THREADED_LEVEL3 +#ifndef NO_AFFINITY + } +#endif +#endif + +#endif + +#ifdef SMP + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, args.m * args.k + args.k * args.n + args.m * args.n, 2 * args.m * args.n * args.k); + + IDEBUG_END; + + return; +} diff --git a/interface/gemv.c b/interface/gemv.c new file mode 100644 index 0000000000..9ea8aa8959 --- /dev/null +++ b/interface/gemv.c @@ -0,0 +1,237 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
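/* Editorial usage sketch (not part of the imported sources): for
 * CblasRowMajor the gemm wrapper above swaps A with B and m with n,
 * because a row-major C = A*B occupies the same memory as a column-major
 * C^T = B^T * A^T; the kernel is then selected from the gemm[] table by
 * the index (transb << 2) | transa.  (In the real-valued CBLAS path the
 * alpha/beta fields hold the addresses &alpha and &beta.)  cblas.h on
 * the include path is assumed. */
#include <stdio.h>
#include <cblas.h>

int main(void) {
  float A[2*3] = { 1, 2, 3,
                   4, 5, 6 };          /* 2x3, row-major, lda = 3 */
  float B[3*2] = { 7,  8,
                   9, 10,
                  11, 12 };            /* 3x2, row-major, ldb = 2 */
  float C[2*2] = { 0 };

  cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
              2, 2, 3, 1.0f, A, 3, B, 2, 0.0f, C, 2);

  printf("%g %g\n%g %g\n", C[0], C[1], C[2], C[3]);
  /* expected: 58 64 / 139 154 */
  return 0;
}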
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QGEMV " +#elif defined(DOUBLE) +#define ERROR_NAME "DGEMV " +#else +#define ERROR_NAME "SGEMV " +#endif + +#ifdef SMP +static int (*gemv_thread[])(BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT * , BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { +#ifdef XDOUBLE + qgemv_thread_n, qgemv_thread_t, +#elif defined DOUBLE + dgemv_thread_n, dgemv_thread_t, +#else + sgemv_thread_n, sgemv_thread_t, +#endif +}; +#endif + +#ifndef CBLAS + +void NAME(char *TRANS, blasint *M, blasint *N, + FLOAT *ALPHA, FLOAT *a, blasint *LDA, + FLOAT *x, blasint *INCX, + FLOAT *BETA, FLOAT *y, blasint *INCY){ + + char trans = *TRANS; + blasint m = *M; + blasint n = *N; + blasint lda = *LDA; + blasint incx = *INCX; + blasint incy = *INCY; + FLOAT alpha = *ALPHA; + FLOAT beta = *BETA; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT * , BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { + GEMV_N, GEMV_T, + }; + + blasint info; + blasint lenx, leny; + blasint i; + + PRINT_DEBUG_NAME; + + TOUPPER(trans); + + info = 0; + + i = -1; + + if (trans == 'N') i = 0; + if (trans == 'T') i = 1; + if (trans == 'R') i = 0; + if (trans == 'C') i = 1; + + if (incy == 0) info = 11; + if (incx == 0) info = 8; + if (lda < MAX(1, m)) info = 6; + if (n < 0) info = 3; + if (m < 0) info = 2; + if (i < 0) info = 1; + + trans = i; + + if (info != 0){ + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, + enum CBLAS_TRANSPOSE TransA, + blasint m, blasint n, + FLOAT alpha, + FLOAT *a, blasint lda, + FLOAT *x, blasint incx, + FLOAT beta, + FLOAT *y, blasint incy){ + + FLOAT *buffer; + blasint lenx, leny; + int trans; + blasint info, t; +#ifdef SMP + int nthreads; +#endif + + int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT * , BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { + GEMV_N, GEMV_T, + }; + + PRINT_DEBUG_CNAME; + + trans = -1; + info = 0; + + if (order == CblasColMajor) { + if (TransA == CblasNoTrans) trans = 0; + if (TransA == CblasTrans) trans = 1; + if (TransA == CblasConjNoTrans) trans = 0; + if (TransA == CblasConjTrans) trans = 1; + + info = -1; + + if (incy == 0) info = 11; + if (incx == 0) info = 8; + if (lda < MAX(1, m)) info = 6; + if (n < 0) info = 3; + if (m < 0) info = 2; + if (trans < 0) info = 1; + + } + + if (order == CblasRowMajor) { + if (TransA == CblasNoTrans) trans = 1; + if (TransA == CblasTrans) trans = 0; + if 
(TransA == CblasConjNoTrans) trans = 1; + if (TransA == CblasConjTrans) trans = 0; + + info = -1; + + t = n; + n = m; + m = t; + + if (incy == 0) info = 11; + if (incx == 0) info = 8; + if (lda < MAX(1, m)) info = 6; + if (n < 0) info = 3; + if (m < 0) info = 2; + if (trans < 0) info = 1; + + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if ((m==0) || (n==0)) return; + + lenx = n; + leny = m; + if (trans) lenx = m; + if (trans) leny = n; + + if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); + + if (alpha == ZERO) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0) x -= (lenx - 1) * incx; + if (incy < 0) y -= (leny - 1) * incy; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (gemv[(int)trans])(m, n, 0, alpha, a, lda, x, incx, y, incy, buffer); + +#ifdef SMP + } else { + + (gemv_thread[(int)trans])(m, n, alpha, a, lda, x, incx, y, incy, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n); + + IDEBUG_END; + + return; + +} diff --git a/interface/ger.c b/interface/ger.c new file mode 100644 index 0000000000..0218d94dd7 --- /dev/null +++ b/interface/ger.c @@ -0,0 +1,193 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
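/* Editorial usage sketch (not part of the imported sources): the CBLAS
 * gemv wrapper above serves CblasRowMajor by flipping the transpose flag
 * and swapping m and n, so the same column-major kernels do the work.
 * cblas.h on the include path is assumed. */
#include <stdio.h>
#include <cblas.h>

int main(void) {
  float A[2*3] = { 1, 2, 3,
                   4, 5, 6 };     /* 2x3, row-major, lda = 3 */
  float x[3] = { 1, 1, 1 };
  float y[2] = { 0, 0 };

  /* y = 1.0 * A * x + 0.0 * y */
  cblas_sgemv(CblasRowMajor, CblasNoTrans, 2, 3,
              1.0f, A, 3, x, 1, 0.0f, y, 1);

  printf("%g %g\n", y[0], y[1]);   /* expected: 6 15 */
  return 0;
}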
*/ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QGER " +#elif defined DOUBLE +#define ERROR_NAME "DGER " +#else +#define ERROR_NAME "SGER " +#endif + +#define GER GERU_K + +#if defined XDOUBLE +#define GER_THREAD qger_thread +#elif defined DOUBLE +#define GER_THREAD dger_thread +#else +#define GER_THREAD sger_thread +#endif + + +#ifndef CBLAS + +void NAME(blasint *M, blasint *N, FLOAT *Alpha, + FLOAT *x, blasint *INCX, + FLOAT *y, blasint *INCY, + FLOAT *a, blasint *LDA){ + + blasint m = *M; + blasint n = *N; + FLOAT alpha = *Alpha; + blasint incx = *INCX; + blasint incy = *INCY; + blasint lda = *LDA; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + blasint info; + + PRINT_DEBUG_NAME; + + info = 0; + + if (lda < MAX(1,m)) info = 9; + if (incy == 0) info = 7; + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (m < 0) info = 1; + + if (info){ + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, + blasint m, blasint n, + FLOAT alpha, + FLOAT *x, blasint incx, + FLOAT *y, blasint incy, + FLOAT *a, blasint lda) { + + FLOAT *buffer; + blasint info, t; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_CNAME; + + info = 0; + + if (order == CblasColMajor) { + info = -1; + + if (lda < MAX(1,m)) info = 9; + if (incy == 0) info = 7; + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (m < 0) info = 1; + } + + if (order == CblasRowMajor) { + info = -1; + + t = n; + n = m; + m = t; + + t = incx; + incx = incy; + incy = t; + + buffer = x; + x = y; + y = buffer; + + if (lda < MAX(1,m)) info = 9; + if (incy == 0) info = 7; + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (m < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + /* Quick return if possible. */ + if (m == 0 || n == 0) return; + if (alpha == 0.) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incy < 0) y -= (n - 1) * incy; + if (incx < 0) x -= (m - 1) * incx; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + GER(m, n, 0, alpha, x, incx, y, incy, a, lda, buffer); + +#ifdef SMP + } else { + + GER_THREAD(m, n, alpha, x, incx, y, incy, a, lda, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n); + + IDEBUG_END; + + return; +} diff --git a/interface/gesv.c b/interface/gesv.c new file mode 100644 index 0000000000..ce6bcbd0bb --- /dev/null +++ b/interface/gesv.c @@ -0,0 +1,154 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
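/* Editorial usage sketch (not part of the imported sources): the CBLAS
 * ger wrapper above maps CblasRowMajor onto the column-major kernel by
 * swapping m with n, incx with incy, and the x/y pointers, since a
 * row-major A += alpha*x*y^T is the column-major update
 * A^T += alpha*y*x^T.  cblas.h on the include path is assumed. */
#include <stdio.h>
#include <cblas.h>

int main(void) {
  float x[2] = { 1, 2 };
  float y[3] = { 1, 10, 100 };
  float A[2*3] = { 0 };              /* 2x3, row-major, lda = 3 */

  cblas_sger(CblasRowMajor, 2, 3, 1.0f, x, 1, y, 1, A, 3);

  printf("%g %g %g\n%g %g %g\n", A[0], A[1], A[2], A[3], A[4], A[5]);
  /* expected: 1 10 100 / 2 20 200 */
  return 0;
}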
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifndef COMPLEX +#ifdef XDOUBLE +#define ERROR_NAME "QGESV " +#elif defined(DOUBLE) +#define ERROR_NAME "DGESV " +#else +#define ERROR_NAME "SGESV " +#endif +#else +#ifdef XDOUBLE +#define ERROR_NAME "XGESV " +#elif defined(DOUBLE) +#define ERROR_NAME "ZGESV " +#else +#define ERROR_NAME "CGESV " +#endif +#endif + +int NAME(blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA, blasint *ipiv, + FLOAT *b, blasint *ldB, blasint *Info){ + + blas_arg_t args; + + blasint info; + FLOAT *buffer; +#ifdef PPC440 + extern +#endif + FLOAT *sa, *sb; + + PRINT_DEBUG_NAME; + + args.m = *N; + args.n = *NRHS; + args.a = (void *)a; + args.lda = *ldA; + args.b = (void *)b; + args.ldb = *ldB; + args.c = (void *)ipiv; + + info = 0; + if (args.ldb < MAX(1,args.m)) info = 7; + if (args.lda < MAX(1,args.m)) info = 4; + if (args.n < 0) info = 2; + if (args.m < 0) info = 1; + + if (info) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + *Info = - info; + return 0; + } + + args.alpha = NULL; + args.beta = NULL; + + *Info = 0; + + if (args.m == 0 || args.n == 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + +#ifndef PPC440 + buffer = (FLOAT *)blas_memory_alloc(1); + + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); + sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif + +#ifdef SMP + args.common = NULL; + args.nthreads = num_cpu_avail(4); + + if (args.nthreads == 1) { +#endif + + args.n = *N; + info = GETRF_SINGLE(&args, NULL, NULL, sa, sb, 0); + + if (info == 0){ + args.n = *NRHS; + GETRS_N_SINGLE(&args, NULL, NULL, sa, sb, 0); + } + +#ifdef SMP + } else { + + args.n = *N; + info = GETRF_PARALLEL(&args, NULL, NULL, sa, sb, 0); + + if (info == 0){ + args.n = *NRHS; + GETRS_N_PARALLEL(&args, NULL, NULL, sa, sb, 0); + } + } +#endif + +#ifndef PPC440 + blas_memory_free(buffer); +#endif + + *Info = info; + + FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, *N * *N, 2. / 3. 
* *N * *N * *N + *N * *N); + + IDEBUG_END; + + return 0; +} diff --git a/interface/getf2.c b/interface/getf2.c new file mode 100644 index 0000000000..cae15953b7 --- /dev/null +++ b/interface/getf2.c @@ -0,0 +1,109 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QGETF2" +#elif defined(DOUBLE) +#define ERROR_NAME "DGETF2" +#else +#define ERROR_NAME "SGETF2" +#endif + +int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint *Info){ + + blas_arg_t args; + + blasint info; + FLOAT *buffer; +#ifdef PPC440 + extern +#endif + FLOAT *sa, *sb; + + PRINT_DEBUG_NAME; + + args.m = *M; + args.n = *N; + args.a = (void *)a; + args.lda = *ldA; + args.c = (void *)ipiv; + + info = 0; + if (args.lda < MAX(1,args.m)) info = 4; + if (args.n < 0) info = 2; + if (args.m < 0) info = 1; + if (info) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + *Info = - info; + return 0; + } + + *Info = 0; + if (args.m == 0 || args.n == 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + +#ifndef PPC440 + buffer = (FLOAT *)blas_memory_alloc(1); + + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); + sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif + + info = GETF2(&args, NULL, NULL, sa, sb, 0); + + *Info = info; + +#ifndef PPC440 + blas_memory_free(buffer); +#endif + + FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, args.m * args.n, 2. / 3. 
* args.m * args.n * args.n); + + IDEBUG_END; + + return 0; +} diff --git a/interface/getrf.c b/interface/getrf.c new file mode 100644 index 0000000000..aa799e8d3e --- /dev/null +++ b/interface/getrf.c @@ -0,0 +1,121 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
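/* Editorial usage sketch (not part of the imported sources): gesv above
 * factors A with getrf and then forward/back-substitutes with getrs, so
 * A is overwritten by its LU factors and b by the solution.  The
 * trailing-underscore symbol name and 32-bit integers are assumptions
 * about the build. */
#include <stdio.h>

extern int sgesv_(int *n, int *nrhs, float *a, int *lda,
                  int *ipiv, float *b, int *ldb, int *info);

int main(void) {
  /* Solve  3x +  y = 9
             x + 2y = 8   ->  x = 2, y = 3 */
  float a[4] = { 3, 1,      /* column 1 (column-major storage) */
                 1, 2 };    /* column 2 */
  float b[2] = { 9, 8 };
  int n = 2, nrhs = 1, lda = 2, ldb = 2, ipiv[2], info;

  sgesv_(&n, &nrhs, a, &lda, ipiv, b, &ldb, &info);
  printf("info=%d  x=%g  y=%g\n", info, b[0], b[1]);  /* x=2, y=3 */
  return 0;
}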
*/ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QGETRF" +#elif defined(DOUBLE) +#define ERROR_NAME "DGETRF" +#else +#define ERROR_NAME "SGETRF" +#endif + +int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint *Info){ + + blas_arg_t args; + + blasint info; + FLOAT *buffer; +#ifdef PPC440 + extern +#endif + FLOAT *sa, *sb; + + PRINT_DEBUG_NAME; + + args.m = *M; + args.n = *N; + args.a = (void *)a; + args.lda = *ldA; + args.c = (void *)ipiv; + + info = 0; + if (args.lda < MAX(1,args.m)) info = 4; + if (args.n < 0) info = 2; + if (args.m < 0) info = 1; + if (info) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + *Info = - info; + return 0; + } + + *Info = 0; + if (args.m == 0 || args.n == 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + +#ifndef PPC440 + buffer = (FLOAT *)blas_memory_alloc(1); + + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); + sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif + +#ifdef SMP + args.common = NULL; + args.nthreads = num_cpu_avail(4); + + if (args.nthreads == 1) { +#endif + + *Info = GETRF_SINGLE(&args, NULL, NULL, sa, sb, 0); + +#ifdef SMP + } else { + + *Info = GETRF_PARALLEL(&args, NULL, NULL, sa, sb, 0); + } +#endif + +#ifndef PPC440 + blas_memory_free(buffer); +#endif + + FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, args.m * args.n, 2. / 3. * args.m * args.n * args.n); + + IDEBUG_END; + + return 0; +} diff --git a/interface/getrs.c b/interface/getrs.c new file mode 100644 index 0000000000..761a00160e --- /dev/null +++ b/interface/getrs.c @@ -0,0 +1,152 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QGETRS" +#elif defined(DOUBLE) +#define ERROR_NAME "DGETRS" +#else +#define ERROR_NAME "SGETRS" +#endif + +static blasint (*getrs_single[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { + GETRS_N_SINGLE, GETRS_T_SINGLE, +}; + +#ifdef SMP +static blasint (*getrs_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { + GETRS_N_PARALLEL, GETRS_T_PARALLEL, +}; +#endif + +int NAME(char *TRANS, blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA, + blasint *ipiv, FLOAT *b, blasint *ldB, blasint *Info){ + + char trans_arg = *TRANS; + + blas_arg_t args; + + blasint info; + int trans; + FLOAT *buffer; +#ifdef PPC440 + extern +#endif + FLOAT *sa, *sb; + + PRINT_DEBUG_NAME; + + args.m = *N; + args.n = *NRHS; + args.a = (void *)a; + args.lda = *ldA; + args.b = (void *)b; + args.ldb = *ldB; + args.c = (void *)ipiv; + + info = 0; + + TOUPPER(trans_arg); + trans = -1; + + if (trans_arg == 'N') trans = 0; + if (trans_arg == 'T') trans = 1; + if (trans_arg == 'R') trans = 0; + if (trans_arg == 'C') trans = 1; + + if (args.ldb < MAX(1, args.m)) info = 8; + if (args.lda < MAX(1, args.m)) info = 5; + if (args.n < 0) info = 3; + if (args.m < 0) info = 2; + if (trans < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return 0; + } + + args.alpha = NULL; + args.beta = NULL; + + *Info = info; + + if (args.m == 0 || args.n == 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + +#ifndef PPC440 + buffer = (FLOAT *)blas_memory_alloc(1); + + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); + sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif + +#ifdef SMP + args.common = NULL; + args.nthreads = num_cpu_avail(4); + + if (args.nthreads == 1) { +#endif + + (getrs_single[trans])(&args, NULL, NULL, sa, sb, 0); + +#ifdef SMP + } else { + (getrs_parallel[trans])(&args, NULL, NULL, sa, sb, 0); + } +#endif + +#ifndef PPC440 + blas_memory_free(buffer); +#endif + + FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, args.m * args.n, 2 * args.m * args.m * args.n); + + IDEBUG_END; + + return 0; + +} diff --git a/interface/imax.c b/interface/imax.c new file mode 100644 index 0000000000..37396c7f82 --- /dev/null +++ b/interface/imax.c @@ -0,0 +1,171 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
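/* Editorial usage sketch (not part of the imported sources): getrf and
 * getrs above can also be called directly when one factorization serves
 * several right-hand sides; getrs dispatches on TRANS ('N'/'R' select
 * the non-transposed solver, 'T'/'C' the transposed one).  Trailing
 * underscores and 32-bit integers are assumptions about the build. */
#include <stdio.h>

extern int sgetrf_(int *m, int *n, float *a, int *lda, int *ipiv, int *info);
extern int sgetrs_(char *trans, int *n, int *nrhs, float *a, int *lda,
                   int *ipiv, float *b, int *ldb, int *info);

int main(void) {
  float a[4]  = { 3, 1, 1, 2 };        /* 2x2, column-major */
  float b1[2] = { 9, 8 };              /* -> x = 2, y = 3 */
  float b2[2] = { 4, 3 };              /* -> x = 1, y = 1 */
  int n = 2, nrhs = 1, lda = 2, ldb = 2, ipiv[2], info;

  sgetrf_(&n, &n, a, &lda, ipiv, &info);                    /* factor once */
  sgetrs_("N", &n, &nrhs, a, &lda, ipiv, b1, &ldb, &info);  /* solve #1    */
  sgetrs_("N", &n, &nrhs, a, &lda, ipiv, b2, &ldb, &info);  /* solve #2    */
  printf("%g %g  |  %g %g\n", b1[0], b1[1], b2[0], b2[1]);
  return 0;
}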
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#undef MAX_K + +#ifdef USE_ABS + +#ifndef USE_MIN + +/* ABS & MAX */ +#ifndef COMPLEX +#ifdef XDOUBLE +#define MAX_K IQAMAX_K +#elif defined(DOUBLE) +#define MAX_K IDAMAX_K +#else +#define MAX_K ISAMAX_K +#endif +#else +#ifdef XDOUBLE +#define MAX_K IXAMAX_K +#elif defined(DOUBLE) +#define MAX_K IZAMAX_K +#else +#define MAX_K ICAMAX_K +#endif +#endif + +#else + +/* ABS & MIN */ +#ifndef COMPLEX +#ifdef XDOUBLE +#define MAX_K IQAMIN_K +#elif defined(DOUBLE) +#define MAX_K IDAMIN_K +#else +#define MAX_K ISAMIN_K +#endif +#else +#ifdef XDOUBLE +#define MAX_K IXAMIN_K +#elif defined(DOUBLE) +#define MAX_K IZAMIN_K +#else +#define MAX_K ICAMIN_K +#endif +#endif + +#endif + +#else + +#ifndef USE_MIN + +/* MAX */ +#ifdef XDOUBLE +#define MAX_K IQMAX_K +#elif defined(DOUBLE) +#define MAX_K IDMAX_K +#else +#define MAX_K ISMAX_K +#endif + +#else + +/* MIN */ +#ifdef XDOUBLE +#define MAX_K IQMIN_K +#elif defined(DOUBLE) +#define MAX_K IDMIN_K +#else +#define MAX_K ISMIN_K +#endif + +#endif + +#endif + +#ifndef CBLAS + +blasint NAME(blasint *N, FLOAT *x, blasint *INCX){ + + BLASLONG n = *N; + BLASLONG incx = *INCX; + blasint ret; + + PRINT_DEBUG_NAME; + + if (n <= 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + ret = (blasint)MAX_K(n, x, incx); + + FUNCTION_PROFILE_END(COMPSIZE, n, 0); + + IDEBUG_END; + + return ret; +} + +#else + +CBLAS_INDEX CNAME(blasint n, FLOAT *x, blasint incx){ + + CBLAS_INDEX ret; + + PRINT_DEBUG_CNAME; + + if (n <= 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + ret = MAX_K(n, x, incx); + + if (ret) ret --; + + FUNCTION_PROFILE_END(COMPSIZE, n, 0); + + IDEBUG_END; + + return ret; +} + +#endif diff --git a/interface/larf.c b/interface/larf.c new file mode 100644 index 0000000000..3b538c4a20 --- /dev/null +++ b/interface/larf.c @@ -0,0 +1,109 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +static int (*larf[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { + LARF_L, LARF_R, +}; + +int NAME(char *SIDE, blasint *M, blasint *N, FLOAT *v, blasint *incV, FLOAT *tau, FLOAT *c, blasint *ldC, FLOAT *work){ + + blas_arg_t args; + + FLOAT *buffer; +#ifdef PPC440 + extern +#endif + FLOAT *sa, *sb; + + char side_arg = *SIDE; + int side; + + PRINT_DEBUG_NAME; + + TOUPPER(side_arg); + + args.m = *M; + args.n = *N; + args.a = (void *)v; + args.lda = *incV; + args.c = (void *)c; + args.ldc = *ldC; + + args.alpha = (void *)tau; + + side = -1; + if (side_arg == 'L') side = 0; + if (side_arg == 'R') side = 1; + + if (args.m == 0 || args.n == 0) return 0; + +#ifndef COMPLEX + if (*tau == ZERO) return 0; +#else + if ((*(tau + 0) == ZERO) && (*(tau + 1) == ZERO)) return 0; +#endif + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + +#ifndef PPC440 + buffer = (FLOAT *)blas_memory_alloc(1); + + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); + sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif + + larf[side](&args, NULL, NULL, sa, sb, 0); + +#ifndef PPC440 + blas_memory_free(buffer); +#endif + + FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, args.m * args.n, 2. / 3. * args.m * args.n * args.n); + + IDEBUG_END; + + return 0; +} diff --git a/interface/laswp.c b/interface/laswp.c new file mode 100644 index 0000000000..026b5156f4 --- /dev/null +++ b/interface/laswp.c @@ -0,0 +1,110 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +static int (*laswp[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, blasint *, BLASLONG) = { +#ifdef XDOUBLE + qlaswp_plus, qlaswp_minus, +#elif defined(DOUBLE) + dlaswp_plus, dlaswp_minus, +#else + slaswp_plus, slaswp_minus, +#endif +}; + +int NAME(blasint *N, FLOAT *a, blasint *LDA, blasint *K1, blasint *K2, blasint *ipiv, blasint *INCX){ + + blasint n = *N; + blasint lda = *LDA; + blasint k1 = *K1; + blasint k2 = *K2; + blasint incx = *INCX; + int flag; + +#ifdef SMP + int mode, nthreads; + FLOAT dummyalpha[2] = {ZERO, ZERO}; +#endif + + PRINT_DEBUG_NAME; + + if (incx == 0 || n <= 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + flag = (incx < 0); + +#ifdef SMP + nthreads = num_cpu_avail(1); + + if (nthreads == 1) { +#endif + + (laswp[flag])(n, k1, k2, ZERO, a, lda, NULL, 0, ipiv, incx); + +#ifdef SMP + } else { + +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_REAL; +#else + mode = BLAS_SINGLE | BLAS_REAL; +#endif + + blas_level1_thread(mode, n, k1, k2, dummyalpha, + a, lda, NULL, 0, ipiv, incx, + laswp[flag], nthreads); + } +#endif + + FUNCTION_PROFILE_END(COMPSIZE, n * (k2 - k1), 0); + + IDEBUG_END; + + return 0; + +} diff --git a/interface/lauu2.c b/interface/lauu2.c new file mode 100644 index 0000000000..14417e9867 --- /dev/null +++ b/interface/lauu2.c @@ -0,0 +1,128 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QLAUU2" +#elif defined(DOUBLE) +#define ERROR_NAME "DLAUU2" +#else +#define ERROR_NAME "SLAUU2" +#endif + +static blasint (*lauu2[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { +#ifdef XDOUBLE + qlauu2_U, qlauu2_L, +#elif defined(DOUBLE) + dlauu2_U, dlauu2_L, +#else + slauu2_U, slauu2_L, +#endif + }; + +int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ + + blas_arg_t args; + + blasint uplo_arg = *UPLO; + blasint uplo; + blasint info; + FLOAT *buffer; +#ifdef PPC440 + extern +#endif + FLOAT *sa, *sb; + + PRINT_DEBUG_NAME; + + args.n = *N; + args.a = (void *)a; + args.lda = *ldA; + + TOUPPER(uplo_arg); + + uplo = -1; + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + if (args.lda < MAX(1,args.n)) info = 4; + if (args.n < 0) info = 2; + if (uplo < 0) info = 1; + if (info) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + *Info = - info; + return 0; + } + + *Info = 0; + + if (args.n <= 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + +#ifndef PPC440 + buffer = (FLOAT *)blas_memory_alloc(1); + + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); + sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif + + info = (lauu2[uplo])(&args, NULL, NULL, sa, sb, 0); + + *Info = info; + +#ifndef PPC440 + blas_memory_free(buffer); +#endif + + FUNCTION_PROFILE_END(1, .5 * args.n * args.n, + args.n * (1./3. + args.n * ( 1./2. + args.n * 1./6.)) + + 1./6. 
* args.n * (args.n * args.n - 1)); + + IDEBUG_END; + + return 0; +} diff --git a/interface/lauum.c b/interface/lauum.c new file mode 100644 index 0000000000..e5b593f306 --- /dev/null +++ b/interface/lauum.c @@ -0,0 +1,139 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
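/* Editorial usage sketch (not part of the imported sources): the
 * i?amax/i?amin wrappers above return a 1-based index through the
 * Fortran-style entry point, while the CBLAS entry point subtracts one
 * ("if (ret) ret--") and is therefore 0-based.  cblas.h, a
 * trailing-underscore Fortran symbol, and 32-bit integers are assumed. */
#include <stdio.h>
#include <cblas.h>

extern int isamax_(int *n, float *x, int *incx);

int main(void) {
  float x[3] = { 1.0f, -7.0f, 3.0f };
  int n = 3, inc = 1;

  printf("isamax_      = %d\n", isamax_(&n, x, &inc));        /* 2 (1-based) */
  printf("cblas_isamax = %d\n", (int)cblas_isamax(n, x, 1));  /* 1 (0-based) */
  return 0;
}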
*/ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QLAUUM" +#elif defined(DOUBLE) +#define ERROR_NAME "DLAUUM" +#else +#define ERROR_NAME "SLAUUM" +#endif + +static blasint (*lauum_single[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { + LAUUM_U_SINGLE, LAUUM_L_SINGLE, +}; + +#ifdef SMP +static blasint (*lauum_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { + LAUUM_U_PARALLEL, LAUUM_L_PARALLEL, +}; +#endif + +int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ + + blas_arg_t args; + + blasint uplo_arg = *UPLO; + blasint uplo; + blasint info; + FLOAT *buffer; +#ifdef PPC440 + extern +#endif + FLOAT *sa, *sb; + + PRINT_DEBUG_NAME; + + args.n = *N; + args.a = (void *)a; + args.lda = *ldA; + + TOUPPER(uplo_arg); + + uplo = -1; + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + if (args.lda < MAX(1,args.n)) info = 4; + if (args.n < 0) info = 2; + if (uplo < 0) info = 1; + if (info) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + *Info = - info; + return 0; + } + + *Info = 0; + + if (args.n == 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + +#ifndef PPC440 + buffer = (FLOAT *)blas_memory_alloc(1); + + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); + sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif + +#ifdef SMP + args.common = NULL; + args.nthreads = num_cpu_avail(4); + + if (args.nthreads == 1) { +#endif + + *Info = (lauum_single[uplo])(&args, NULL, NULL, sa, sb, 0); + +#ifdef SMP + } else { + *Info = (lauum_parallel[uplo])(&args, NULL, NULL, sa, sb, 0); + } +#endif + +#ifndef PPC440 + blas_memory_free(buffer); +#endif + + FUNCTION_PROFILE_END(1, .5 * args.n * args.n, + args.n * (1./3. + args.n * ( 1./2. + args.n * 1./6.)) + + 1./6. * args.n * (args.n * args.n - 1)); + + IDEBUG_END; + + return 0; +} diff --git a/interface/max.c b/interface/max.c new file mode 100644 index 0000000000..9bedaddd0b --- /dev/null +++ b/interface/max.c @@ -0,0 +1,169 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#undef MAX_K + +#ifdef USE_ABS + +#ifndef USE_MIN + +/* ABS & MAX */ +#ifndef COMPLEX +#ifdef XDOUBLE +#define MAX_K QAMAX_K +#elif defined(DOUBLE) +#define MAX_K DAMAX_K +#else +#define MAX_K SAMAX_K +#endif +#else +#ifdef XDOUBLE +#define MAX_K XAMAX_K +#elif defined(DOUBLE) +#define MAX_K ZAMAX_K +#else +#define MAX_K CAMAX_K +#endif +#endif + +#else + +/* ABS & MIN */ +#ifndef COMPLEX +#ifdef XDOUBLE +#define MAX_K QAMIN_K +#elif defined(DOUBLE) +#define MAX_K DAMIN_K +#else +#define MAX_K SAMIN_K +#endif +#else +#ifdef XDOUBLE +#define MAX_K XAMIN_K +#elif defined(DOUBLE) +#define MAX_K ZAMIN_K +#else +#define MAX_K CAMIN_K +#endif +#endif + +#endif + +#else + +#ifndef USE_MIN + +/* MAX */ +#ifdef XDOUBLE +#define MAX_K QMAX_K +#elif defined(DOUBLE) +#define MAX_K DMAX_K +#else +#define MAX_K SMAX_K +#endif + +#else + +/* MIN */ +#ifdef XDOUBLE +#define MAX_K QMIN_K +#elif defined(DOUBLE) +#define MAX_K DMIN_K +#else +#define MAX_K SMIN_K +#endif + +#endif + +#endif + +#ifndef CBLAS + +FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){ + + BLASLONG n = *N; + BLASLONG incx = *INCX; + FLOATRET ret; + + PRINT_DEBUG_NAME; + + if (n <= 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + ret = (FLOATRET)MAX_K(n, x, incx); + + FUNCTION_PROFILE_END(COMPSIZE, n, 0); + + IDEBUG_END; + + return ret; +} + +#else + +FLOAT CNAME(blasint n, FLOAT *x, blasint incx){ + + FLOAT ret; + + PRINT_DEBUG_CNAME; + + if (n <= 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + ret = MAX_K(n, x, incx); + + FUNCTION_PROFILE_END(COMPSIZE, n, 0); + + IDEBUG_END; + + return ret; +} + +#endif diff --git a/interface/nrm2.c b/interface/nrm2.c new file mode 100644 index 0000000000..ff8ef6d0dd --- /dev/null +++ b/interface/nrm2.c @@ -0,0 +1,93 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifndef CBLAS + +FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){ + + BLASLONG n = *N; + BLASLONG incx = *INCX; + FLOATRET ret; + + PRINT_DEBUG_NAME; + + if (n <= 0) return 0.; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + ret = (FLOATRET)NRM2_K(n, x, incx); + + FUNCTION_PROFILE_END(COMPSIZE, n, 2 * n); + + IDEBUG_END; + + return ret; +} + +#else + +FLOAT CNAME(blasint n, FLOAT *x, blasint incx){ + + FLOAT ret; + + PRINT_DEBUG_CNAME; + + if (n <= 0) return 0.; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + ret = NRM2_K(n, x, incx); + + FUNCTION_PROFILE_END(COMPSIZE, n, 2 * n); + + IDEBUG_END; + + return ret; +} + +#endif diff --git a/interface/potf2.c b/interface/potf2.c new file mode 100644 index 0000000000..76822a49c7 --- /dev/null +++ b/interface/potf2.c @@ -0,0 +1,128 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QPOTF2" +#elif defined(DOUBLE) +#define ERROR_NAME "DPOTF2" +#else +#define ERROR_NAME "SPOTF2" +#endif + +static blasint (*potf2[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { +#ifdef XDOUBLE + qpotf2_U, qpotf2_L, +#elif defined(DOUBLE) + dpotf2_U, dpotf2_L, +#else + spotf2_U, spotf2_L, +#endif + }; + +int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ + + blas_arg_t args; + + blasint uplo_arg = *UPLO; + blasint uplo; + blasint info; + FLOAT *buffer; +#ifdef PPC440 + extern +#endif + FLOAT *sa, *sb; + + PRINT_DEBUG_NAME; + + args.n = *N; + args.a = (void *)a; + args.lda = *ldA; + + TOUPPER(uplo_arg); + + uplo = -1; + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + if (args.lda < MAX(1,args.n)) info = 4; + if (args.n < 0) info = 2; + if (uplo < 0) info = 1; + if (info) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + *Info = - info; + return 0; + } + + *Info = 0; + + if (args.n <= 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + +#ifndef PPC440 + buffer = (FLOAT *)blas_memory_alloc(1); + + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); + sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif + + info = (potf2[uplo])(&args, NULL, NULL, sa, sb, 0); + + *Info = info; + +#ifndef PPC440 + blas_memory_free(buffer); +#endif + + FUNCTION_PROFILE_END(1, .5 * args.n * args.n, + args.n * (1./3. + args.n * ( 1./2. + args.n * 1./6.)) + + 1./6. * args.n * (args.n * args.n - 1)); + + IDEBUG_END; + + return 0; +} diff --git a/interface/potrf.c b/interface/potrf.c new file mode 100644 index 0000000000..9a15012d3a --- /dev/null +++ b/interface/potrf.c @@ -0,0 +1,139 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QPOTRF" +#elif defined(DOUBLE) +#define ERROR_NAME "DPOTRF" +#else +#define ERROR_NAME "SPOTRF" +#endif + +static blasint (*potrf_single[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { + POTRF_U_SINGLE, POTRF_L_SINGLE, +}; + +#ifdef SMP +static blasint (*potrf_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { + POTRF_U_PARALLEL, POTRF_L_PARALLEL, +}; +#endif + +int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ + + blas_arg_t args; + + blasint uplo_arg = *UPLO; + blasint uplo; + blasint info; + FLOAT *buffer; +#ifdef PPC440 + extern +#endif + FLOAT *sa, *sb; + + PRINT_DEBUG_NAME; + + args.n = *N; + args.a = (void *)a; + args.lda = *ldA; + + TOUPPER(uplo_arg); + + uplo = -1; + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + if (args.lda < MAX(1,args.n)) info = 4; + if (args.n < 0) info = 2; + if (uplo < 0) info = 1; + if (info) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + *Info = - info; + return 0; + } + + *Info = 0; + + if (args.n == 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + +#ifndef PPC440 + buffer = (FLOAT *)blas_memory_alloc(1); + + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); + sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif + +#ifdef SMP + args.common = NULL; + args.nthreads = num_cpu_avail(4); + + if (args.nthreads == 1) { +#endif + + *Info = (potrf_single[uplo])(&args, NULL, NULL, sa, sb, 0); + +#ifdef SMP + } else { + *Info = (potrf_parallel[uplo])(&args, NULL, NULL, sa, sb, 0); + } +#endif + +#ifndef PPC440 + blas_memory_free(buffer); +#endif + + FUNCTION_PROFILE_END(1, .5 * args.n * args.n, + args.n * (1./3. + args.n * ( 1./2. + args.n * 1./6.)) + + 1./6. * args.n * (args.n * args.n - 1)); + + IDEBUG_END; + + return 0; +} diff --git a/interface/potri.c b/interface/potri.c new file mode 100644 index 0000000000..a4f33221a8 --- /dev/null +++ b/interface/potri.c @@ -0,0 +1,160 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QPOTRI" +#elif defined(DOUBLE) +#define ERROR_NAME "DPOTRI" +#else +#define ERROR_NAME "SPOTRI" +#endif + +static blasint (*trtri_single[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) ={ + TRTRI_UN_SINGLE, TRTRI_LN_SINGLE, +}; + +static blasint (*lauum_single[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) ={ + LAUUM_U_SINGLE, LAUUM_L_SINGLE, +}; + +#ifdef SMP +static blasint (*trtri_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) ={ + TRTRI_UN_PARALLEL, TRTRI_LN_PARALLEL, +}; + +static blasint (*lauum_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) ={ + LAUUM_U_PARALLEL, LAUUM_L_PARALLEL, +}; +#endif + +int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ + + blas_arg_t args; + + blasint uplo_arg = *UPLO; + blasint uplo; + blasint info; + FLOAT *buffer; +#ifdef PPC440 + extern +#endif + FLOAT *sa, *sb; + + PRINT_DEBUG_NAME; + + args.n = *N; + args.a = (void *)a; + args.lda = *ldA; + + TOUPPER(uplo_arg); + + uplo = -1; + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + if (args.lda < MAX(1,args.n)) info = 4; + if (args.n < 0) info = 2; + if (uplo < 0) info = 1; + + if (info) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + *Info = - info; + return 0; + } + + *Info = 0; + + if (args.n == 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + +#ifndef PPC440 + buffer = (FLOAT *)blas_memory_alloc(1); + + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); + sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) 
+ GEMM_OFFSET_B); +#endif + +#ifdef SMP + args.common = NULL; + args.nthreads = num_cpu_avail(4); + + if (args.nthreads == 1) { +#endif + + info = (trtri_single[uplo])(&args, NULL, NULL, sa, sb, 0); + + if (!info) { + info = (lauum_single[uplo])(&args, NULL, NULL, sa, sb, 0); + } + + *Info = info; + +#ifdef SMP + } else { + info = (trtri_parallel[uplo])(&args, NULL, NULL, sa, sb, 0); + + if (!info) { + info = (lauum_parallel[uplo])(&args, NULL, NULL, sa, sb, 0); + } + + *Info = info; + } +#endif + +#ifndef PPC440 + blas_memory_free(buffer); +#endif + + FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, .5 * args.n * args.n, + args.n * (1./3. + args.n * ( 1./2. + args.n * 1./6.)) + + args.n * (1./3. + args.n * (-1./2. + args.n * 1./6.))); + + IDEBUG_END; + + return 0; +} diff --git a/interface/rot.c b/interface/rot.c new file mode 100644 index 0000000000..2e458b12d5 --- /dev/null +++ b/interface/rot.c @@ -0,0 +1,82 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifndef CBLAS + +void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *C, FLOAT *S){ + + BLASLONG n = *N; + BLASLONG incx = *INCX; + BLASLONG incy = *INCY; + FLOAT c = *C; + FLOAT s = *S; + + PRINT_DEBUG_NAME; + +#else + +void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy, FLOAT c, FLOAT s){ + + PRINT_DEBUG_CNAME; + +#endif + + if (n <= 0) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0) x -= (n - 1) * incx; + if (incy < 0) y -= (n - 1) * incy; + + ROT_K(n, x, incx, y, incy, c, s); + + FUNCTION_PROFILE_END(1, n, n); + + IDEBUG_END; + + return; + +} diff --git a/interface/rotg.c b/interface/rotg.c new file mode 100644 index 0000000000..49088ab020 --- /dev/null +++ b/interface/rotg.c @@ -0,0 +1,109 @@ +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifndef CBLAS + +void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ + +#else + +void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ + +#endif + + +#if defined(__i386__) || defined(__x86_64__) || defined(__ia64__) + + long double da = *DA; + long double db = *DB; + long double c; + long double s; + long double r, roe, z; + + long double ada = fabs(da); + long double adb = fabs(db); + long double scale = ada + adb; + +#ifndef CBLAS + PRINT_DEBUG_NAME; +#else + PRINT_DEBUG_CNAME; +#endif + + roe = db; + if (ada > adb) roe = da; + + if (scale == ZERO) { + *C = ONE; + *S = ZERO; + *DA = ZERO; + *DB = ZERO; + } else { + r = sqrt(da * da + db * db); + if (roe < 0) r = -r; + c = da / r; + s = db / r; + z = ONE; + if (da != ZERO) { + if (ada > adb){ + z = s; + } else { + z = ONE / c; + } + } + + *C = c; + *S = s; + *DA = r; + *DB = z; + } + +#else + FLOAT da = *DA; + FLOAT db = *DB; + FLOAT c = *C; + FLOAT s = *S; + FLOAT r, roe, z; + + FLOAT ada = fabs(da); + FLOAT adb = fabs(db); + FLOAT scale = ada + adb; + +#ifndef CBLAS + PRINT_DEBUG_NAME; +#else + PRINT_DEBUG_CNAME; +#endif + + roe = db; + if (ada > adb) roe = da; + + if (scale == ZERO) { + *C = ONE; + *S = ZERO; + *DA = ZERO; + *DB = ZERO; + } else { + FLOAT aa = da / scale; + FLOAT bb = db / scale; + + r = scale * sqrt(aa * aa + bb * bb); + if (roe < 0) r = -r; + c = da / r; + s = db / r; + z = ONE; + if (ada > adb) z = s; + if ((ada < adb) && (c != ZERO)) z = ONE / c; + + *C = c; + *S = s; + *DA = r; + *DB = z; + } +#endif + + return; +} diff --git a/interface/rotm.c b/interface/rotm.c new file mode 100644 index 0000000000..4f026c75d6 --- /dev/null +++ b/interface/rotm.c @@ -0,0 +1,155 @@ +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifndef CBLAS + +void NAME(blasint *N, FLOAT *dx, blasint *INCX, FLOAT *dy, blasint *INCY, FLOAT *dparam){ + + blasint n = *N; + blasint incx = *INCX; + blasint incy = *INCY; + +#else + +void CNAME(blasint n, FLOAT *dx, blasint incx, FLOAT *dy, blasint incy, FLOAT *dparam){ + +#endif + + blasint i__1, i__2; + + blasint i__; + FLOAT w, z__; + blasint kx, ky; + FLOAT dh11, dh12, dh22, dh21, dflag; + blasint nsteps; + +#ifndef CBLAS + PRINT_DEBUG_CNAME; +#else + PRINT_DEBUG_CNAME; +#endif + + --dparam; + --dy; + --dx; + + dflag = dparam[1]; + if (n <= 0 || dflag == - 2.0) goto L140; + + if (! (incx == incy && incx > 0)) goto L70; + + nsteps = n * incx; + if (dflag < 0.) 
{ + goto L50; + } else if (dflag == 0) { + goto L10; + } else { + goto L30; + } +L10: + dh12 = dparam[4]; + dh21 = dparam[3]; + i__1 = nsteps; + i__2 = incx; + for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) { + w = dx[i__]; + z__ = dy[i__]; + dx[i__] = w + z__ * dh12; + dy[i__] = w * dh21 + z__; +/* L20: */ + } + goto L140; +L30: + dh11 = dparam[2]; + dh22 = dparam[5]; + i__2 = nsteps; + i__1 = incx; + for (i__ = 1; i__1 < 0 ? i__ >= i__2 : i__ <= i__2; i__ += i__1) { + w = dx[i__]; + z__ = dy[i__]; + dx[i__] = w * dh11 + z__; + dy[i__] = -w + dh22 * z__; +/* L40: */ + } + goto L140; +L50: + dh11 = dparam[2]; + dh12 = dparam[4]; + dh21 = dparam[3]; + dh22 = dparam[5]; + i__1 = nsteps; + i__2 = incx; + for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) { + w = dx[i__]; + z__ = dy[i__]; + dx[i__] = w * dh11 + z__ * dh12; + dy[i__] = w * dh21 + z__ * dh22; +/* L60: */ + } + goto L140; +L70: + kx = 1; + ky = 1; + if (incx < 0) { + kx = (1 - n) * incx + 1; + } + if (incy < 0) { + ky = (1 - n) * incy + 1; + } + + if (dflag < 0.) { + goto L120; + } else if (dflag == 0) { + goto L80; + } else { + goto L100; + } +L80: + dh12 = dparam[4]; + dh21 = dparam[3]; + i__2 = n; + for (i__ = 1; i__ <= i__2; ++i__) { + w = dx[kx]; + z__ = dy[ky]; + dx[kx] = w + z__ * dh12; + dy[ky] = w * dh21 + z__; + kx += incx; + ky += incy; +/* L90: */ + } + goto L140; +L100: + dh11 = dparam[2]; + dh22 = dparam[5]; + i__2 = n; + for (i__ = 1; i__ <= i__2; ++i__) { + w = dx[kx]; + z__ = dy[ky]; + dx[kx] = w * dh11 + z__; + dy[ky] = -w + dh22 * z__; + kx += incx; + ky += incy; +/* L110: */ + } + goto L140; +L120: + dh11 = dparam[2]; + dh12 = dparam[4]; + dh21 = dparam[3]; + dh22 = dparam[5]; + i__2 = n; + for (i__ = 1; i__ <= i__2; ++i__) { + w = dx[kx]; + z__ = dy[ky]; + dx[kx] = w * dh11 + z__ * dh12; + dy[ky] = w * dh21 + z__ * dh22; + kx += incx; + ky += incy; +/* L130: */ + } +L140: + return; +} + diff --git a/interface/rotmg.c b/interface/rotmg.c new file mode 100644 index 0000000000..c37c099140 --- /dev/null +++ b/interface/rotmg.c @@ -0,0 +1,199 @@ +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#define GAM 4096.e0 +#define GAMSQ 16777216.e0 +#define RGAMSQ 5.9604645e-8 + +#ifndef CBLAS + +void NAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT *DY1, FLOAT *dparam){ + + FLOAT dy1 = *DY1; + +#else + +void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){ + +#endif + + FLOAT du, dp1, dp2, dq2, dq1, dh11, dh21, dh12, dh22; + int igo, flag; + FLOAT dtemp; + +#ifndef CBLAS + PRINT_DEBUG_NAME; +#else + PRINT_DEBUG_CNAME; +#endif + + dh11 = ZERO; + dh12 = ZERO; + dh21 = ZERO; + dh22 = ZERO; + + if (*dd1 < ZERO) goto L60; + + dp2 = *dd2 * dy1; + + if (dp2 == ZERO) { + flag = -2; + goto L260; + } + + dp1 = *dd1 * *dx1; + dq2 = dp2 * dy1; + dq1 = dp1 * *dx1; + + if (! 
(abs(dq1) > abs(dq2))) goto L40; + + dh21 = -(dy1) / *dx1; + dh12 = dp2 / dp1; + + du = ONE - dh12 * dh21; + + if (du <= ZERO) goto L60; + + flag = 0; + *dd1 /= du; + *dd2 /= du; + *dx1 *= du; + + goto L100; + +L40: + if (dq2 < ZERO) goto L60; + + flag = 1; + dh11 = dp1 / dp2; + dh22 = *dx1 / dy1; + du = ONE + dh11 * dh22; + dtemp = *dd2 / du; + *dd2 = *dd1 / du; + *dd1 = dtemp; + *dx1 = dy1 * du; + goto L100; + +L60: + flag = -1; + dh11 = ZERO; + dh12 = ZERO; + dh21 = ZERO; + dh22 = ZERO; + + *dd1 = ZERO; + *dd2 = ZERO; + *dx1 = ZERO; + goto L220; + + +L70: + if (flag < 0) goto L90; + + if (flag > 0) goto L80; + + dh11 = ONE; + dh22 = ONE; + flag = -1; + goto L90; + +L80: + dh21 = -ONE; + dh12 = ONE; + flag = -1; + +L90: + switch (igo) { + case 0: goto L120; + case 1: goto L150; + case 2: goto L180; + case 3: goto L210; + } + +L100: + if (!(*dd1 <= RGAMSQ)) goto L130; + if (*dd1 == ZERO) goto L160; + igo = 0; + goto L70; + +L120: + *dd1 *= GAM * GAM; + *dx1 /= GAM; + dh11 /= GAM; + dh12 /= GAM; + goto L100; + +L130: + if (! (*dd1 >= GAMSQ)) { + goto L160; + } + igo = 1; + goto L70; + +L150: + *dd1 /= GAM * GAM; + *dx1 *= GAM; + dh11 *= GAM; + dh12 *= GAM; + goto L130; + +L160: + if (! (abs(*dd2) <= RGAMSQ)) { + goto L190; + } + if (*dd2 == ZERO) { + goto L220; + } + igo = 2; + goto L70; + +L180: +/* Computing 2nd power */ + *dd2 *= GAM * GAM; + dh21 /= GAM; + dh22 /= GAM; + goto L160; + +L190: + if (! (abs(*dd2) >= GAMSQ)) { + goto L220; + } + igo = 3; + goto L70; + +L210: +/* Computing 2nd power */ + *dd2 /= GAM * GAM; + dh21 *= GAM; + dh22 *= GAM; + goto L190; + +L220: + if (flag < 0) { + goto L250; + } else if (flag == 0) { + goto L230; + } else { + goto L240; + } +L230: + dparam[2] = dh21; + dparam[3] = dh12; + goto L260; +L240: + dparam[2] = dh11; + dparam[4] = dh22; + goto L260; +L250: + dparam[1] = dh11; + dparam[2] = dh21; + dparam[3] = dh12; + dparam[4] = dh22; +L260: + dparam[0] = (FLOAT) flag; + return; +} + + diff --git a/interface/sbmv.c b/interface/sbmv.c new file mode 100644 index 0000000000..2ffe7f166b --- /dev/null +++ b/interface/sbmv.c @@ -0,0 +1,215 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QSBMV " +#elif defined(DOUBLE) +#define ERROR_NAME "DSBMV " +#else +#define ERROR_NAME "SSBMV " +#endif + +static int (*sbmv[])(BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, void *) = { +#ifdef XDOUBLE + qsbmv_U, qsbmv_L, +#elif defined(DOUBLE) + dsbmv_U, dsbmv_L, +#else + ssbmv_U, ssbmv_L, +#endif +}; + +#ifdef SMP +static int (*sbmv_thread[])(BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { +#ifdef XDOUBLE + qsbmv_thread_U, qsbmv_thread_L, +#elif defined(DOUBLE) + dsbmv_thread_U, dsbmv_thread_L, +#else + ssbmv_thread_U, ssbmv_thread_L, +#endif +}; +#endif + +#ifndef CBLAS + +void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint *LDA, + FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY){ + + char uplo_arg = *UPLO; + blasint n = *N; + blasint k = *K; + FLOAT alpha = *ALPHA; + blasint lda = *LDA; + blasint incx = *INCX; + FLOAT beta = *BETA; + blasint incy = *INCY; + + blasint info; + int uplo; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + uplo = -1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (incy == 0) info = 11; + if (incx == 0) info = 8; + if (lda < k + 1) info = 6; + if (k < 0) info = 3; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, + enum CBLAS_UPLO Uplo, + blasint n, blasint k, + FLOAT alpha, + FLOAT *a, blasint lda, + FLOAT *x, blasint incx, + FLOAT beta, + FLOAT *y, blasint incy){ + + FLOAT *buffer; + int uplo; + blasint info; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_CNAME; + + uplo = -1; + info = 0; + + if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + info = -1; + + if (incy == 0) info = 11; + if (incx == 0) info = 8; + if (lda < k + 1) info = 6; + if (k < 0) info = 3; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (order == CblasRowMajor) { + if (Uplo == CblasUpper) uplo = 1; + if (Uplo == CblasLower) uplo = 0; + + info = -1; + + if (incy == 0) info = 11; + if (incx == 0) info = 8; + if (lda < k + 1) info = 6; + if (k < 0) info = 3; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + 
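A minimal usage sketch of the CBLAS entry point whose argument checking ends just above, assuming the cblas.h prototypes and the CblasColMajor/CblasUpper enum values and linking against the built library; the 3-by-3 tridiagonal matrix, its band storage, and the expected result are illustrative only, not taken from the imported sources:

#include <stdio.h>
#include "cblas.h"

int main(void) {
  /* 3x3 symmetric tridiagonal matrix in upper band storage (k = 1, lda = k + 1 = 2):
     A = [ 2 1 0 ; 1 2 1 ; 0 1 2 ], stored column-major as { *, 2, 1, 2, 1, 2 }
     (the leading element of the first column is never referenced). */
  double a[6] = { 0.0, 2.0, 1.0, 2.0, 1.0, 2.0 };
  double x[3] = { 1.0, 1.0, 1.0 };
  double y[3] = { 0.0, 0.0, 0.0 };

  /* y := 1.0 * A * x + 0.0 * y, which gives y = { 3, 4, 3 } */
  cblas_dsbmv(CblasColMajor, CblasUpper, 3, 1, 1.0, a, 2, x, 1, 0.0, y, 1);

  printf("%g %g %g\n", y[0], y[1], y[2]);
  return 0;
}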
+ if (n == 0) return; + + if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); + + if (alpha == ZERO) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx; + if (incy < 0 ) y -= (n - 1) * incy; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (sbmv[uplo])(n, k, alpha, a, lda, x, incx, y, incy, buffer); + +#ifdef SMP + } else { + + (sbmv_thread[uplo])(n, k, alpha, a, lda, x, incx, y, incy, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(1, n * k / 2 + n, n * k); + + IDEBUG_END; + + return; +} diff --git a/interface/scal.c b/interface/scal.c new file mode 100644 index 0000000000..7b72ca01c1 --- /dev/null +++ b/interface/scal.c @@ -0,0 +1,112 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifndef CBLAS + +void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX){ + + blasint n = *N; + blasint incx = *INCX; + FLOAT alpha = *ALPHA; + +#else + +void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx){ + +#endif + +#ifdef SMP + int mode, nthreads; +#endif + +#ifndef CBLAS + PRINT_DEBUG_NAME; +#else + PRINT_DEBUG_CNAME; +#endif + + if (incx <= 0 || n <= 0) return; + + if (alpha == ONE) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + +#ifdef SMP + nthreads = num_cpu_avail(1); + + if (nthreads == 1) { +#endif + + SCAL_K(n, 0, 0, alpha, x, incx, NULL, 0, NULL, 0); + +#ifdef SMP + } else { + +#ifdef DOUBLE + mode = BLAS_DOUBLE | BLAS_REAL; +#else + mode = BLAS_SINGLE | BLAS_REAL; +#endif + + blas_level1_thread(mode, n, 0, 0, +#ifndef CBLAS + ALPHA, +#else + &alpha, +#endif + x, incx, NULL, 0, NULL, 0, (void *)SCAL_K, nthreads); + + } +#endif + + FUNCTION_PROFILE_END(1, n, n); + + IDEBUG_END; + + return; + +} diff --git a/interface/sdsdot.c b/interface/sdsdot.c new file mode 100644 index 0000000000..8540be6609 --- /dev/null +++ b/interface/sdsdot.c @@ -0,0 +1,101 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifndef CBLAS + +FLOATRET NAME(blasint *N, FLOAT *a, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ + + BLASLONG n = *N; + BLASLONG incx = *INCX; + BLASLONG incy = *INCY; + FLOATRET ret; + + PRINT_DEBUG_NAME; + + if (n <= 0) return 0.; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0) x -= (n - 1) * incx; + if (incy < 0) y -= (n - 1) * incy; + + ret = (FLOATRET)(SDSDOT_K(n, x, incx, y, incy) + *a); + + FUNCTION_PROFILE_END(1, 2 * n, 2 * n); + + IDEBUG_END; + + return ret; + +} + +#else + +FLOAT CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint incy){ + + FLOAT ret; + + PRINT_DEBUG_CNAME; + + if (n <= 0) return 0.; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0) x -= (n - 1) * incx; + if (incy < 0) y -= (n - 1) * incy; + + ret = SDSDOT_K(n, x, incx, y, incy) + alpha; + + FUNCTION_PROFILE_END(1, 2 * n, 2 * n); + + IDEBUG_END; + + return ret; +} + +#endif diff --git a/interface/spmv.c b/interface/spmv.c new file mode 100644 index 0000000000..8d89027633 --- /dev/null +++ b/interface/spmv.c @@ -0,0 +1,207 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QSPMV " +#elif defined(DOUBLE) +#define ERROR_NAME "DSPMV " +#else +#define ERROR_NAME "SSPMV " +#endif + +static int (*spmv[])(BLASLONG, FLOAT, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, void *) = { +#ifdef XDOUBLE + qspmv_U, qspmv_L, +#elif defined(DOUBLE) + dspmv_U, dspmv_L, +#else + sspmv_U, sspmv_L, +#endif +}; + +#ifdef SMP +static int (*spmv_thread[])(BLASLONG, FLOAT, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { +#ifdef XDOUBLE + qspmv_thread_U, qspmv_thread_L, +#elif defined(DOUBLE) + dspmv_thread_U, dspmv_thread_L, +#else + sspmv_thread_U, sspmv_thread_L, +#endif +}; +#endif + +#ifndef CBLAS + +void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, + FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY){ + + char uplo_arg = *UPLO; + blasint n = *N; + FLOAT alpha = *ALPHA; + blasint incx = *INCX; + FLOAT beta = *BETA; + blasint incy = *INCY; + + blasint info; + int uplo; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + uplo = -1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (incy == 0) info = 9; + if (incx == 0) info = 6; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, + enum CBLAS_UPLO Uplo, + blasint n, + FLOAT alpha, + FLOAT *a, + FLOAT *x, blasint incx, + FLOAT beta, + FLOAT *y, blasint incy){ + + FLOAT *buffer; + int uplo; + blasint info; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_CNAME; + + uplo = -1; + info = 0; + + if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + info = -1; + + if (incy == 0) info = 9; + if (incx == 0) info = 6; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (order == CblasRowMajor) { + if (Uplo == CblasUpper) uplo = 1; + if (Uplo == CblasLower) uplo = 0; + + info = -1; + + if (incy == 0) info = 9; + if (incx == 0) info = 6; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (n == 0) return; + + if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); + + if (alpha == ZERO) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx; + if (incy < 0 ) y -= (n - 1) * incy; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (spmv[uplo])(n, alpha, a, x, incx, y, incy, buffer); + +#ifdef SMP + } else { + + (spmv_thread[uplo])(n, alpha, a, x, incx, y, incy, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(1, n * n / 2 + n, n * n); + + IDEBUG_END; + + return; +} diff --git a/interface/spr.c b/interface/spr.c new file mode 100644 index 0000000000..aa2ff8f3f3 --- /dev/null +++ b/interface/spr.c @@ -0,0 +1,197 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QSPR " +#elif defined(DOUBLE) +#define ERROR_NAME "DSPR " +#else +#define ERROR_NAME "SSPR " +#endif + +static int (*spr[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, FLOAT *) = { +#ifdef XDOUBLE + qspr_U, qspr_L, +#elif defined(DOUBLE) + dspr_U, dspr_L, +#else + sspr_U, sspr_L, +#endif +}; + +#ifdef SMP +static int (*spr_thread[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, FLOAT *, int) = { +#ifdef XDOUBLE + qspr_thread_U, qspr_thread_L, +#elif defined(DOUBLE) + dspr_thread_U, dspr_thread_L, +#else + sspr_thread_U, sspr_thread_L, +#endif +}; +#endif + +#ifndef CBLAS + +void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, + FLOAT *x, blasint *INCX, FLOAT *a){ + + char uplo_arg = *UPLO; + blasint n = *N; + FLOAT alpha = *ALPHA; + blasint incx = *INCX; + + blasint info; + int uplo; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + uplo = -1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, + enum CBLAS_UPLO Uplo, + blasint n, + FLOAT alpha, + FLOAT *x, blasint incx, + FLOAT *a) { + + FLOAT *buffer; + int uplo; + blasint info; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_CNAME; + + uplo = -1; + info = 0; + + if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + info = -1; + + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + } + + if 
(order == CblasRowMajor) { + if (Uplo == CblasUpper) uplo = 1; + if (Uplo == CblasLower) uplo = 0; + + info = -1; + + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (n == 0) return; + + if (alpha == ZERO) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (spr[uplo])(n, alpha, x, incx, a, buffer); + +#ifdef SMP + } else { + + (spr_thread[uplo])(n, alpha, x, incx, a, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(1, n * n / 2 + n, n * n); + + IDEBUG_END; + + return; +} diff --git a/interface/spr2.c b/interface/spr2.c new file mode 100644 index 0000000000..e556d3fa8d --- /dev/null +++ b/interface/spr2.c @@ -0,0 +1,203 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QSPR2 " +#elif defined(DOUBLE) +#define ERROR_NAME "DSPR2 " +#else +#define ERROR_NAME "SSPR2 " +#endif + +static int (*spr2[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, FLOAT *) = { +#ifdef XDOUBLE + qspr2_U, qspr2_L, +#elif defined(DOUBLE) + dspr2_U, dspr2_L, +#else + sspr2_U, sspr2_L, +#endif +}; + +#ifdef SMP +static int (*spr2_thread[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, FLOAT *, int) = { +#ifdef XDOUBLE + qspr2_thread_U, qspr2_thread_L, +#elif defined(DOUBLE) + dspr2_thread_U, dspr2_thread_L, +#else + sspr2_thread_U, sspr2_thread_L, +#endif +}; +#endif + +#ifndef CBLAS + +void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, + FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *a){ + + char uplo_arg = *UPLO; + blasint n = *N; + FLOAT alpha = *ALPHA; + blasint incx = *INCX; + blasint incy = *INCY; + + blasint info; + int uplo; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + uplo = -1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (incy == 0) info = 7; + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, + enum CBLAS_UPLO Uplo, + blasint n, + FLOAT alpha, + FLOAT *x, blasint incx, + FLOAT *y, blasint incy, + FLOAT *a) { + + FLOAT *buffer; + int uplo; + blasint info; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_CNAME; + + uplo = -1; + info = 0; + + if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + info = -1; + + if (incy == 0) info = 7; + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (order == CblasRowMajor) { + if (Uplo == CblasUpper) uplo = 1; + if (Uplo == CblasLower) uplo = 0; + + info = -1; + + if (incy == 0) info = 7; + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (n == 0) return; + + if (alpha == ZERO) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx; + if (incy < 0 ) y -= (n - 1) * incy; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (spr2[uplo])(n, alpha, x, incx, y, incy, a, buffer); + +#ifdef SMP + } else { + + (spr2_thread[uplo])(n, alpha, x, incx, y, incy, a, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(1, n * n / 2 + 2 * n, 2 * n * n); + + IDEBUG_END; + + return; +} diff --git a/interface/swap.c b/interface/swap.c new file mode 100644 index 0000000000..7676246f92 --- /dev/null +++ b/interface/swap.c @@ -0,0 +1,110 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifndef CBLAS + +void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ + + blasint n = *N; + blasint incx = *INCX; + blasint incy = *INCY; + +#else + +void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){ + +#endif + +#ifdef SMP + int mode, nthreads; + FLOAT dummyalpha[2] = {ZERO, ZERO}; +#endif + +#ifndef CBLAS + PRINT_DEBUG_NAME; +#else + PRINT_DEBUG_CNAME; +#endif + + if (n <= 0) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0) x -= (n - 1) * incx; + if (incy < 0) y -= (n - 1) * incy; + +#ifdef SMP + nthreads = num_cpu_avail(1); + + if (nthreads == 1) { +#endif + + SWAP_K(n, 0, 0, ZERO, x, incx, y, incy, NULL, 0); + +#ifdef SMP + } else { + +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_REAL; +#else + mode = BLAS_SINGLE | BLAS_REAL; +#endif + + blas_level1_thread(mode, n, 0, 0, dummyalpha, + x, incx, y, incy, NULL, 0, (void *)SWAP_K, nthreads); + } + +#endif + + FUNCTION_PROFILE_END(1, 2 * n, 0); + + IDEBUG_END; + + return; + +} diff --git a/interface/symm.c b/interface/symm.c new file mode 100644 index 0000000000..a0d52c49dc --- /dev/null +++ b/interface/symm.c @@ -0,0 +1,422 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifndef COMPLEX +#ifdef XDOUBLE +#define ERROR_NAME "QSYMM " +#elif defined(DOUBLE) +#define ERROR_NAME "DSYMM " +#else +#define ERROR_NAME "SSYMM " +#endif +#else +#ifndef GEMM3M +#ifndef HEMM +#ifdef XDOUBLE +#define ERROR_NAME "XSYMM " +#elif defined(DOUBLE) +#define ERROR_NAME "ZSYMM " +#else +#define ERROR_NAME "CSYMM " +#endif +#else +#ifdef XDOUBLE +#define ERROR_NAME "XHEMM " +#elif defined(DOUBLE) +#define ERROR_NAME "ZHEMM " +#else +#define ERROR_NAME "CHEMM " +#endif +#endif +#else +#ifndef HEMM +#ifdef XDOUBLE +#define ERROR_NAME "XSYMM3M " +#elif defined(DOUBLE) +#define ERROR_NAME "ZSYMM3M " +#else +#define ERROR_NAME "CSYMM3M " +#endif +#else +#ifdef XDOUBLE +#define ERROR_NAME "XHEMM3M " +#elif defined(DOUBLE) +#define ERROR_NAME "ZHEMM3M " +#else +#define ERROR_NAME "CHEMM3M " +#endif +#endif +#endif +#endif + +static int (*symm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { +#ifndef GEMM3M +#ifndef HEMM + SYMM_LU, SYMM_LL, SYMM_RU, SYMM_RL, +#if defined(SMP) && !defined(USE_SIMPLE_THREADED_LEVEL3) + SYMM_THREAD_LU, SYMM_THREAD_LL, SYMM_THREAD_RU, SYMM_THREAD_RL, +#endif +#else + HEMM_LU, HEMM_LL, HEMM_RU, HEMM_RL, +#if defined(SMP) && !defined(USE_SIMPLE_THREADED_LEVEL3) + HEMM_THREAD_LU, HEMM_THREAD_LL, HEMM_THREAD_RU, HEMM_THREAD_RL, +#endif +#endif +#else +#ifndef HEMM + SYMM3M_LU, SYMM3M_LL, SYMM3M_RU, SYMM3M_RL, +#if defined(SMP) && !defined(USE_SIMPLE_THREADED_LEVEL3) + SYMM3M_THREAD_LU, SYMM3M_THREAD_LL, SYMM3M_THREAD_RU, SYMM3M_THREAD_RL, +#endif +#else + HEMM3M_LU, HEMM3M_LL, HEMM3M_RU, HEMM3M_RL, +#if defined(SMP) && !defined(USE_SIMPLE_THREADED_LEVEL3) + HEMM3M_THREAD_LU, HEMM3M_THREAD_LL, HEMM3M_THREAD_RU, HEMM3M_THREAD_RL, +#endif +#endif +#endif +}; + +#ifndef CBLAS + +void NAME(char *SIDE, char *UPLO, + blasint *M, blasint *N, + FLOAT *alpha, FLOAT *a, blasint *ldA, + FLOAT *b, blasint *ldB, + FLOAT *beta, FLOAT *c, blasint *ldC){ + + char side_arg = *SIDE; + char uplo_arg = *UPLO; + + blas_arg_t args; + + FLOAT *buffer; + 
FLOAT *sa, *sb; + +#ifdef SMP +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_REAL; +#else + int mode = BLAS_SINGLE | BLAS_REAL; +#endif +#endif + +#if defined(SMP) && !defined(NO_AFFINITY) + int nodes; +#endif + + blasint info; + int side; + int uplo; + + PRINT_DEBUG_NAME; + + args.alpha = (void *)alpha; + args.beta = (void *)beta; + + TOUPPER(side_arg); + TOUPPER(uplo_arg); + + side = -1; + uplo = -1; + + if (side_arg == 'L') side = 0; + if (side_arg == 'R') side = 1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + args.m = *M; + args.n = *N; + + args.c = (void *)c; + args.ldc = *ldC; + + info = 0; + + if (args.ldc < MAX(1, args.m)) info = 12; + + if (!side) { + args.a = (void *)a; + args.b = (void *)b; + + args.lda = *ldA; + args.ldb = *ldB; + + if (args.ldb < MAX(1, args.m)) info = 9; + if (args.lda < MAX(1, args.m)) info = 7; + + } else { + args.a = (void *)b; + args.b = (void *)a; + + args.lda = *ldB; + args.ldb = *ldA; + + if (args.lda < MAX(1, args.m)) info = 9; + if (args.ldb < MAX(1, args.n)) info = 7; + } + + if (args.n < 0) info = 4; + if (args.m < 0) info = 3; + if (uplo < 0) info = 2; + if (side < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, + blasint m, blasint n, +#ifndef COMPLEX + FLOAT alpha, +#else + FLOAT *alpha, +#endif + FLOAT *a, blasint lda, + FLOAT *b, blasint ldb, +#ifndef COMPLEX + FLOAT beta, +#else + FLOAT *beta, +#endif + FLOAT *c, blasint ldc) { + + blas_arg_t args; + int side, uplo; + blasint info; + + FLOAT *buffer; + FLOAT *sa, *sb; + +#ifdef SMP +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_REAL; +#else + int mode = BLAS_SINGLE | BLAS_REAL; +#endif +#endif + +#if defined(SMP) && !defined(NO_AFFINITY) + int nodes; +#endif + + PRINT_DEBUG_CNAME; + +#ifndef COMPLEX + args.alpha = (void *)α + args.beta = (void *)β +#else + args.alpha = (void *)alpha; + args.beta = (void *)beta; +#endif + + args.c = (void *)c; + args.ldc = ldc; + + side = -1; + uplo = -1; + info = 0; + + if (order == CblasColMajor) { + if (Side == CblasLeft) side = 0; + if (Side == CblasRight) side = 1; + + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + info = -1; + + args.m = m; + args.n = n; + + if (args.ldc < MAX(1, args.m)) info = 12; + + if (!side) { + args.a = (void *)a; + args.b = (void *)b; + + args.lda = lda; + args.ldb = ldb; + + if (args.ldb < MAX(1, args.m)) info = 9; + if (args.lda < MAX(1, args.m)) info = 7; + + } else { + args.a = (void *)b; + args.b = (void *)a; + + args.lda = ldb; + args.ldb = lda; + + if (args.lda < MAX(1, args.m)) info = 9; + if (args.ldb < MAX(1, args.n)) info = 7; + } + + if (args.n < 0) info = 4; + if (args.m < 0) info = 3; + if (uplo < 0) info = 2; + if (side < 0) info = 1; + } + + if (order == CblasRowMajor) { + if (Side == CblasLeft) side = 1; + if (Side == CblasRight) side = 0; + + if (Uplo == CblasUpper) uplo = 1; + if (Uplo == CblasLower) uplo = 0; + + info = -1; + + args.m = n; + args.n = m; + + if (args.ldc < MAX(1, args.m)) info = 12; + + if (!side) { + args.a = (void *)a; + args.b = (void *)b; + + args.lda = lda; + args.ldb = ldb; + + if (args.ldb < MAX(1, args.m)) info = 9; + if (args.lda < MAX(1, args.m)) info = 7; + + } else { + args.a = (void *)b; + args.b = (void *)a; + + args.lda = ldb; + args.ldb = lda; + + if 
(args.lda < MAX(1, args.m)) info = 9; + if (args.ldb < MAX(1, args.n)) info = 7; + } + + if (args.n < 0) info = 4; + if (args.m < 0) info = 3; + if (uplo < 0) info = 2; + if (side < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (args.m == 0 || args.n == 0) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + buffer = (FLOAT *)blas_memory_alloc(0); + + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); + sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + +#ifdef SMP + args.common = NULL; + args.nthreads = num_cpu_avail(3); + + if (args.nthreads == 1) { +#endif + + (symm[(side << 1) | uplo ])(&args, NULL, NULL, sa, sb, 0); + +#ifdef SMP + + } else { + +#ifndef NO_AFFINITY + nodes = get_num_nodes(); + + if (nodes > 1) { + + args.nthreads /= nodes; + + gemm_thread_mn(mode, &args, NULL, NULL, + symm[4 | (side << 1) | uplo ], sa, sb, nodes); + + } else { +#endif + +#ifndef USE_SIMPLE_THREADED_LEVEL3 + + (symm[4 | (side << 1) | uplo ])(&args, NULL, NULL, sa, sb, 0); + +#else + + GEMM_THREAD(mode, &args, NULL, NULL, symm[(side << 1) | uplo ], sa, sb, args.nthreads); + +#endif + +#ifndef NO_AFFINITY + } +#endif + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, + (!side)? args.m * (args.m / 2 + args.n) : args.n * (args.m + args.n / 2), + (!side)? 2 * args.m * args.m * args.n : 2 * args.m * args.n * args.n); + + IDEBUG_END; + + return; +} diff --git a/interface/symv.c b/interface/symv.c new file mode 100644 index 0000000000..e8c24df66f --- /dev/null +++ b/interface/symv.c @@ -0,0 +1,205 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
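The symm/hemm wrapper above reduces SIDE and UPLO to 0/1 flags and indexes its kernel table with (side << 1) | uplo, adding 4 to reach the threaded variants; for CblasRowMajor it also swaps m with n and flips both flags so the column-major kernels can be reused. A minimal standalone sketch of that indexing (illustration only; the enum names below are stand-ins, not the real CBLAS types):

#include <stdio.h>

/* Hypothetical 0/1 stand-ins for the flags the wrapper derives from
 * SIDE and UPLO; these are not the real CBLAS enum values. */
enum { SIDE_LEFT = 0, SIDE_RIGHT = 1 };
enum { UPLO_UPPER = 0, UPLO_LOWER = 1 };

/* Index into a table laid out {LU, LL, RU, RL, THREAD_LU, THREAD_LL, ...}. */
static int symm_index(int side, int uplo, int threaded) {
  return (threaded ? 4 : 0) | (side << 1) | uplo;
}

int main(void) {
  /* Right side, lower triangle, single threaded -> slot 3 (RL). */
  printf("%d\n", symm_index(SIDE_RIGHT, UPLO_LOWER, 0));
  /* Same case, threaded table -> slot 7 (THREAD_RL). */
  printf("%d\n", symm_index(SIDE_RIGHT, UPLO_LOWER, 1));
  return 0;
}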
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QSYMV " +#elif defined(DOUBLE) +#define ERROR_NAME "DSYMV " +#else +#define ERROR_NAME "SSYMV " +#endif + +#ifndef CBLAS + +void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA, + FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY){ + + char uplo_arg = *UPLO; + blasint n = *N; + FLOAT alpha = *ALPHA; + blasint lda = *LDA; + blasint incx = *INCX; + FLOAT beta = *BETA; + blasint incy = *INCY; + + int (*symv[])(BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { + SYMV_U, SYMV_L, + }; + +#ifdef SMP + int (*symv_thread[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { + SYMV_THREAD_U, SYMV_THREAD_L, + }; +#endif + + blasint info; + int uplo; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + uplo = -1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (incy == 0) info = 10; + if (incx == 0) info = 7; + if (lda < MAX(1, n)) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, + FLOAT *a, blasint lda, FLOAT *x, blasint incx, FLOAT beta, FLOAT *y, blasint incy) { + + FLOAT *buffer; + int uplo; + blasint info; +#ifdef SMP + int nthreads; +#endif + + int (*symv[])(BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { + SYMV_U, SYMV_L, + }; + +#ifdef SMP + int (*symv_thread[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { + SYMV_THREAD_U, SYMV_THREAD_L, + }; +#endif + + PRINT_DEBUG_CNAME; + + uplo = -1; + info = 0; + + if (order == CblasColMajor) { + + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + info = -1; + + if (incy == 0) info = 10; + if (incx == 0) info = 7; + if (lda < MAX(1, n)) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (order == CblasRowMajor) { + + if (Uplo == CblasUpper) uplo = 1; + if (Uplo == CblasLower) uplo = 0; + + info = -1; + + if (incy == 0) info = 10; + if (incx == 0) info = 7; + if (lda < MAX(1, n)) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (n == 0) return; + + if (beta != ONE) SCAL_K(n, 0, 0, beta, y, abs(incy), NULL, 0, NULL, 0); + + if (alpha == ZERO) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx; + if (incy < 0 ) y -= (n - 1) * incy; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (symv[uplo])(n, n, alpha, a, lda, x, incx, y, incy, buffer); + +#ifdef SMP + } else { + + (symv_thread[uplo])(n, alpha, a, lda, x, incx, y, incy, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(1, n * n 
/ 2 + 2 * n, 2 * n * n); + + IDEBUG_END; + + return; +} diff --git a/interface/syr.c b/interface/syr.c new file mode 100644 index 0000000000..2b2d3d1e21 --- /dev/null +++ b/interface/syr.c @@ -0,0 +1,200 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
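symv, like the other level-2 wrappers above, first scales y by beta with SCAL_K, returns early when alpha is zero, and rebases x and y when an increment is negative so the kernel can start from a fixed pointer. A standalone illustration of that rebase (a plain C sketch, not the actual kernel):

#include <stdio.h>

/* Walk n elements with stride incx after the same rebase the wrappers
 * apply: for incx < 0 the pointer is moved to the last stored element,
 * which BLAS treats as the first logical element of the vector. */
static void walk(const double *x, int n, int incx) {
  if (incx < 0) x -= (n - 1) * incx;   /* incx is negative: moves x forward */
  for (int i = 0; i < n; i++) printf("%g ", x[(long)i * incx]);
  printf("\n");
}

int main(void) {
  double v[5] = {1, 2, 3, 4, 5};
  walk(v, 5, 1);    /* prints 1 2 3 4 5 */
  walk(v, 5, -1);   /* prints 5 4 3 2 1 */
  return 0;
}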
*/ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QSYR " +#elif defined(DOUBLE) +#define ERROR_NAME "DSYR " +#else +#define ERROR_NAME "SSYR " +#endif + +static int (*syr[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { +#ifdef XDOUBLE + qsyr_U, qsyr_L, +#elif defined(DOUBLE) + dsyr_U, dsyr_L, +#else + ssyr_U, ssyr_L, +#endif +}; + +#ifdef SMP +static int (*syr_thread[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { +#ifdef XDOUBLE + qsyr_thread_U, qsyr_thread_L, +#elif defined(DOUBLE) + dsyr_thread_U, dsyr_thread_L, +#else + ssyr_thread_U, ssyr_thread_L, +#endif +}; +#endif + +#ifndef CBLAS + +void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, + FLOAT *x, blasint *INCX, FLOAT *a, blasint *LDA){ + + char uplo_arg = *UPLO; + blasint n = *N; + FLOAT alpha = *ALPHA; + blasint lda = *LDA; + blasint incx = *INCX; + + blasint info; + int uplo; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + uplo = -1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (lda < MAX(1, n)) info = 7; + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *a, blasint lda) { + + FLOAT *buffer; + int trans, uplo; + blasint info; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_CNAME; + + trans = -1; + uplo = -1; + info = 0; + + if (order == CblasColMajor) { + + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + info = -1; + + if (lda < MAX(1, n)) info = 7; + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + + } + + if (order == CblasRowMajor) { + + if (Uplo == CblasUpper) uplo = 1; + if (Uplo == CblasLower) uplo = 0; + + info = -1; + + if (lda < MAX(1, n)) info = 7; + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (n == 0) return; + + if (alpha == ZERO) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (syr[uplo])(n, alpha, x, incx, a, lda, buffer); + +#ifdef SMP + } else { + + (syr_thread[uplo])(n, alpha, x, incx, a, lda, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(1, n * n / 2 + n, n * n); + + IDEBUG_END; + + return; +} diff --git a/interface/syr2.c b/interface/syr2.c new file mode 100644 index 0000000000..15dbae4bd7 --- /dev/null +++ b/interface/syr2.c @@ -0,0 +1,204 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QSYR2 " +#elif defined(DOUBLE) +#define ERROR_NAME "DSYR2 " +#else +#define ERROR_NAME "SSYR2 " +#endif + +static int (*syr2[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { +#ifdef XDOUBLE + qsyr2_U, qsyr2_L, +#elif defined(DOUBLE) + dsyr2_U, dsyr2_L, +#else + ssyr2_U, ssyr2_L, +#endif +}; + +#ifdef SMP +static int (*syr2_thread[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { +#ifdef XDOUBLE + qsyr2_thread_U, qsyr2_thread_L, +#elif defined(DOUBLE) + dsyr2_thread_U, dsyr2_thread_L, +#else + ssyr2_thread_U, ssyr2_thread_L, +#endif +}; +#endif + +#ifndef CBLAS + +void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, + FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *a, blasint *LDA){ + + char uplo_arg = *UPLO; + blasint n = *N; + FLOAT alpha = *ALPHA; + blasint lda = *LDA; + blasint incx = *INCX; + blasint incy = *INCY; + + blasint info; + int uplo; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + uplo = -1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (lda < MAX(1, n)) info = 9; + if (incy == 0) info = 7; + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint incy, FLOAT *a, blasint lda) { + + FLOAT *buffer; + int trans, uplo; + blasint info; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_CNAME; + + trans = -1; + uplo = -1; + info = 0; + + if (order == CblasColMajor) { + + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + info = -1; + + if (lda < MAX(1, n)) info = 9; + if (incy == 0) info = 7; + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (order == 
CblasRowMajor) { + + if (Uplo == CblasUpper) uplo = 1; + if (Uplo == CblasLower) uplo = 0; + + info = -1; + + if (lda < MAX(1, n)) info = 9; + if (incy == 0) info = 7; + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (n == 0) return; + + if (alpha == ZERO) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx; + if (incy < 0 ) y -= (n - 1) * incy; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (syr2[uplo])(n, alpha, x, incx, y, incy, a, lda, buffer); + +#ifdef SMP + } else { + + (syr2_thread[uplo])(n, alpha, x, incx, y, incy, a, lda, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(1, n * n / 2 + 2 * n, 2 * n * n); + + IDEBUG_END; + + return; +} diff --git a/interface/syr2k.c b/interface/syr2k.c new file mode 100644 index 0000000000..70b8409552 --- /dev/null +++ b/interface/syr2k.c @@ -0,0 +1,366 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
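The argument checks in these wrappers run from the highest parameter number down, so each failing test overwrites info and the lowest-numbered bad argument is what finally reaches xerbla. A compact standalone version of the same idiom, using the syr2 parameter numbering above:

#include <stdio.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))

/* Mirror of the syr2 argument checks: the last assignment wins, so
 * info ends up holding the lowest-numbered offending argument. */
static int check_syr2_args(int uplo, int n, int incx, int incy, int lda) {
  int info = 0;
  if (lda < MAX(1, n)) info = 9;
  if (incy == 0)       info = 7;
  if (incx == 0)       info = 5;
  if (n < 0)           info = 2;
  if (uplo < 0)        info = 1;   /* uplo == -1: UPLO was not 'U' or 'L' */
  return info;
}

int main(void) {
  /* n and lda are both invalid; the cascade reports argument 2 (n). */
  printf("info = %d\n", check_syr2_args(0, -3, 1, 1, 0));
  return 0;
}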
*/ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifndef COMPLEX +#ifdef XDOUBLE +#define ERROR_NAME "QSYR2K" +#elif defined(DOUBLE) +#define ERROR_NAME "DSYR2K" +#else +#define ERROR_NAME "SSYR2K" +#endif +#else +#ifndef HEMM +#ifdef XDOUBLE +#define ERROR_NAME "XSYR2K" +#elif defined(DOUBLE) +#define ERROR_NAME "ZSYR2K" +#else +#define ERROR_NAME "CSYR2K" +#endif +#else +#ifdef XDOUBLE +#define ERROR_NAME "XHER2K" +#elif defined(DOUBLE) +#define ERROR_NAME "ZHER2K" +#else +#define ERROR_NAME "CHER2K" +#endif +#endif +#endif + +static int (*syr2k[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { +#ifndef HEMM + SYR2K_UN, SYR2K_UC, SYR2K_LN, SYR2K_LC, +#else + HER2K_UN, HER2K_UC, HER2K_LN, HER2K_LC, +#endif +}; + +#ifndef CBLAS + +void NAME(char *UPLO, char *TRANS, + blasint *N, blasint *K, + FLOAT *alpha, FLOAT *a, blasint *ldA, + FLOAT *b, blasint *ldB, + FLOAT *beta, FLOAT *c, blasint *ldC){ + + char uplo_arg = *UPLO; + char trans_arg = *TRANS; + + blas_arg_t args; + + FLOAT *buffer; + FLOAT *sa, *sb; + +#ifdef SMP +#ifndef COMPLEX +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_REAL; +#else + int mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + int mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif +#endif + + blasint info; + int uplo; + int trans; + int nrowa; + + PRINT_DEBUG_NAME; + + args.n = *N; + args.k = *K; + + args.a = (void *)a; + args.b = (void *)b; + args.c = (void *)c; + + args.lda = *ldA; + args.ldb = *ldB; + args.ldc = *ldC; + + args.alpha = (void *)alpha; + args.beta = (void *)beta; + + TOUPPER(uplo_arg); + TOUPPER(trans_arg); + + uplo = -1; + trans = -1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + if (trans_arg == 'N') trans = 0; + if (trans_arg == 'T') trans = 1; + if (trans_arg == 'R') trans = 0; + if (trans_arg == 'C') trans = 1; + + nrowa = args.n; + if (trans & 1) nrowa = args.k; + + info = 0; + + if (args.ldc < MAX(1,args.n)) info = 12; + if (args.ldb < MAX(1,nrowa)) info = 9; + if (args.lda < MAX(1,nrowa)) info = 7; + if (args.k < 0) info = 4; + if (args.n < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, + blasint n, blasint k, +#ifndef COMPLEX + FLOAT alpha, +#else + FLOAT *alpha, +#endif + FLOAT *a, blasint lda, + FLOAT *b, blasint ldb, +#if !defined(COMPLEX) || defined(HEMM) + FLOAT beta, +#else + FLOAT *beta, +#endif + FLOAT *c, blasint ldc) { + + blas_arg_t args; + int uplo, trans; + blasint info, nrowa; + + FLOAT *buffer; + FLOAT *sa, *sb; + +#ifdef HEMM + FLOAT CAlpha[2]; +#endif + +#ifdef SMP +#ifndef COMPLEX +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_REAL; +#else + int mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + int mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif +#endif + + PRINT_DEBUG_CNAME; + + args.n = n; + args.k = k; + + args.a = (void *)a; + args.b = (void *)b; + args.c = 
(void *)c; + + args.lda = lda; + args.ldb = ldb; + args.ldc = ldc; + +#ifndef COMPLEX + args.alpha = (void *)α +#else + args.alpha = (void *)alpha; +#endif + +#if !defined(COMPLEX) || defined(HEMM) + args.beta = (void *)β +#else + args.beta = (void *)beta; +#endif + + trans = -1; + uplo = -1; + info = 0; + + if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + if (Trans == CblasNoTrans) trans = 0; +#ifndef COMPLEX + if (Trans == CblasTrans) trans = 1; + if (Trans == CblasConjNoTrans) trans = 0; + if (Trans == CblasConjTrans) trans = 1; +#elif !defined(HEMM) + if (Trans == CblasTrans) trans = 1; +#else + if (Trans == CblasConjTrans) trans = 1; +#endif + + info = -1; + + nrowa = args.n; + if (trans & 1) nrowa = args.k; + + if (args.ldc < MAX(1,args.n)) info = 12; + if (args.ldb < MAX(1,nrowa)) info = 9; + if (args.lda < MAX(1,nrowa)) info = 7; + if (args.k < 0) info = 4; + if (args.n < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (order == CblasRowMajor) { + +#ifdef HEMM + CAlpha[0] = alpha[0]; + CAlpha[1] = -alpha[1]; + + args.alpha = (void *)CAlpha; +#endif + + if (Uplo == CblasUpper) uplo = 1; + if (Uplo == CblasLower) uplo = 0; + + if (Trans == CblasNoTrans) trans = 1; +#ifndef COMPLEX + if (Trans == CblasTrans) trans = 0; + if (Trans == CblasConjNoTrans) trans = 1; + if (Trans == CblasConjTrans) trans = 0; +#elif !defined(HEMM) + if (Trans == CblasTrans) trans = 0; +#else + if (Trans == CblasConjTrans) trans = 0; +#endif + + info = -1; + + nrowa = args.n; + if (trans & 1) nrowa = args.k; + + if (args.ldc < MAX(1,args.n)) info = 12; + if (args.ldb < MAX(1,nrowa)) info = 9; + if (args.lda < MAX(1,nrowa)) info = 7; + if (args.k < 0) info = 4; + if (args.n < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (args.n == 0) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + buffer = (FLOAT *)blas_memory_alloc(0); + + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); + sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + +#ifdef SMP + if (!trans){ + mode |= (BLAS_TRANSA_N | BLAS_TRANSB_T); + } else { + mode |= (BLAS_TRANSA_T | BLAS_TRANSB_N); + } + + mode |= (uplo << BLAS_UPLO_SHIFT); + + args.common = NULL; + args.nthreads = num_cpu_avail(3); + + if (args.nthreads == 1) { +#endif + + (syr2k[(uplo << 1) | trans ])(&args, NULL, NULL, sa, sb, 0); + +#ifdef SMP + + } else { + + syrk_thread(mode, &args, NULL, NULL, syr2k[(uplo << 1) | trans ], sa, sb, args.nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, 2 * args.n * args.k + args.n * args.n, 2 * args.n * args.n * args.k); + + IDEBUG_END; + + return; +} diff --git a/interface/syrk.c b/interface/syrk.c new file mode 100644 index 0000000000..a0cc64180e --- /dev/null +++ b/interface/syrk.c @@ -0,0 +1,355 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifndef COMPLEX +#ifdef XDOUBLE +#define ERROR_NAME "QSYRK " +#elif defined(DOUBLE) +#define ERROR_NAME "DSYRK " +#else +#define ERROR_NAME "SSYRK " +#endif +#else +#ifndef HEMM +#ifdef XDOUBLE +#define ERROR_NAME "XSYRK " +#elif defined(DOUBLE) +#define ERROR_NAME "ZSYRK " +#else +#define ERROR_NAME "CSYRK " +#endif +#else +#ifdef XDOUBLE +#define ERROR_NAME "XHERK " +#elif defined(DOUBLE) +#define ERROR_NAME "ZHERK " +#else +#define ERROR_NAME "CHERK " +#endif +#endif +#endif + +static int (*syrk[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { +#ifndef HEMM + SYRK_UN, SYRK_UC, SYRK_LN, SYRK_LC, +#if defined(SMP) && !defined(USE_SIMPLE_THREADED_LEVEL3) + SYRK_THREAD_UN, SYRK_THREAD_UC, SYRK_THREAD_LN, SYRK_THREAD_LC, +#endif +#else + HERK_UN, HERK_UC, HERK_LN, HERK_LC, +#if defined(SMP) && !defined(USE_SIMPLE_THREADED_LEVEL3) + HERK_THREAD_UN, HERK_THREAD_UC, HERK_THREAD_LN, HERK_THREAD_LC, +#endif +#endif +}; + +#ifndef CBLAS + +void NAME(char *UPLO, char *TRANS, + blasint *N, blasint *K, + FLOAT *alpha, FLOAT *a, blasint *ldA, + FLOAT *beta, FLOAT *c, blasint *ldC){ + + char uplo_arg = *UPLO; + char trans_arg = *TRANS; + + blas_arg_t args; + + FLOAT *buffer; + FLOAT *sa, *sb; + +#ifdef SMP +#ifndef COMPLEX +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_REAL; +#else + int mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + int mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif +#endif + + blasint info; + int uplo; + int trans; + int nrowa; + + PRINT_DEBUG_NAME; + + args.n = *N; + args.k = *K; + + args.a = (void *)a; + args.c = (void *)c; + + args.lda = *ldA; + args.ldc = *ldC; + + args.alpha = (void *)alpha; + args.beta = (void *)beta; + + TOUPPER(uplo_arg); + TOUPPER(trans_arg); + + uplo = -1; + trans = -1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 
1; + + if (trans_arg == 'N') trans = 0; + if (trans_arg == 'T') trans = 1; + if (trans_arg == 'R') trans = 0; + if (trans_arg == 'C') trans = 1; + + nrowa = args.n; + if (trans & 1) nrowa = args.k; + + info = 0; + + if (args.ldc < MAX(1,args.n)) info = 10; + if (args.lda < MAX(1,nrowa)) info = 7; + if (args.k < 0) info = 4; + if (args.n < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, + blasint n, blasint k, +#if !defined(COMPLEX) || defined(HEMM) + FLOAT alpha, +#else + FLOAT *alpha, +#endif + FLOAT *a, blasint lda, +#if !defined(COMPLEX) || defined(HEMM) + FLOAT beta, +#else + FLOAT *beta, +#endif + FLOAT *c, blasint ldc) { + + blas_arg_t args; + int uplo, trans; + blasint info, nrowa; + + FLOAT *buffer; + FLOAT *sa, *sb; + +#ifdef SMP +#ifndef COMPLEX +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_REAL; +#else + int mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + int mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif +#endif + + PRINT_DEBUG_CNAME; + + args.n = n; + args.k = k; + + args.a = (void *)a; + args.c = (void *)c; + + args.lda = lda; + args.ldc = ldc; + +#if !defined(COMPLEX) || defined(HEMM) + args.alpha = (void *)α + args.beta = (void *)β +#else + args.alpha = (void *)alpha; + args.beta = (void *)beta; +#endif + + trans = -1; + uplo = -1; + info = 0; + + if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + if (Trans == CblasNoTrans) trans = 0; +#ifndef COMPLEX + if (Trans == CblasTrans) trans = 1; + if (Trans == CblasConjNoTrans) trans = 0; + if (Trans == CblasConjTrans) trans = 1; +#elif !defined(HEMM) + if (Trans == CblasTrans) trans = 1; +#else + if (Trans == CblasConjTrans) trans = 1; +#endif + + info = -1; + + nrowa = args.n; + if (trans & 1) nrowa = args.k; + + if (args.ldc < MAX(1,args.n)) info = 10; + if (args.lda < MAX(1,nrowa)) info = 7; + if (args.k < 0) info = 4; + if (args.n < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (order == CblasRowMajor) { + if (Uplo == CblasUpper) uplo = 1; + if (Uplo == CblasLower) uplo = 0; + + if (Trans == CblasNoTrans) trans = 1; +#ifndef COMPLEX + if (Trans == CblasTrans) trans = 0; + if (Trans == CblasConjNoTrans) trans = 1; + if (Trans == CblasConjTrans) trans = 0; +#elif !defined(HEMM) + if (Trans == CblasTrans) trans = 0; +#else + if (Trans == CblasConjTrans) trans = 0; +#endif + + info = -1; + + nrowa = args.n; + if (trans & 1) nrowa = args.k; + + if (args.ldc < MAX(1,args.n)) info = 10; + if (args.lda < MAX(1,nrowa)) info = 7; + if (args.k < 0) info = 4; + if (args.n < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (args.n == 0) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + buffer = (FLOAT *)blas_memory_alloc(0); + + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); + sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + +#ifdef SMP + if (!trans){ + mode |= (BLAS_TRANSA_N | BLAS_TRANSB_T); + } else { + mode |= (BLAS_TRANSA_T | BLAS_TRANSB_N); 
+ } + + mode |= (uplo << BLAS_UPLO_SHIFT); + + args.common = NULL; + args.nthreads = num_cpu_avail(3); + + if (args.nthreads == 1) { +#endif + + (syrk[(uplo << 1) | trans ])(&args, NULL, NULL, sa, sb, 0); + +#ifdef SMP + + } else { + +#ifndef USE_SIMPLE_THREADED_LEVEL3 + + (syrk[4 | (uplo << 1) | trans ])(&args, NULL, NULL, sa, sb, 0); + +#else + + syrk_thread(mode, &args, NULL, NULL, syrk[(uplo << 1) | trans ], sa, sb, args.nthreads); + +#endif + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, args.n * args.k + args.n * args.n / 2, args.n * args.n * args.k); + + IDEBUG_END; + + return; +} diff --git a/interface/tbmv.c b/interface/tbmv.c new file mode 100644 index 0000000000..cec2be465c --- /dev/null +++ b/interface/tbmv.c @@ -0,0 +1,248 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
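The FUNCTION_PROFILE_END call at the end of syrk records COMPSIZE * COMPSIZE, n*k + n*n/2 and n*n*k; reading those as a complex-scaling factor, elements touched and multiply-add count is an assumption here, but it matches the shape of the computation (a rank-k update of one triangle of C). A standalone sketch of that cost model:

#include <stdio.h>

/* Rough cost model matching the syrk profile counters above, assuming
 * they mean "elements touched" and "multiply-adds" for real data:
 * A contributes n*k elements, half of C another n*n/2, and updating
 * one triangle of C costs about n*n*k multiply-adds. */
static void syrk_cost(long n, long k, long *elems, long *madds) {
  *elems = n * k + n * n / 2;
  *madds = n * n * k;
}

int main(void) {
  long elems, madds;
  syrk_cost(1000, 200, &elems, &madds);
  printf("elements ~ %ld, multiply-adds ~ %ld\n", elems, madds);
  return 0;
}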
*/ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QTBMV " +#elif defined(DOUBLE) +#define ERROR_NAME "DTBMV " +#else +#define ERROR_NAME "STBMV " +#endif + +static int (*tbmv[])(BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, void *) = { +#ifdef XDOUBLE + qtbmv_NUU, qtbmv_NUN, qtbmv_NLU, qtbmv_NLN, + qtbmv_TUU, qtbmv_TUN, qtbmv_TLU, qtbmv_TLN, +#elif defined(DOUBLE) + dtbmv_NUU, dtbmv_NUN, dtbmv_NLU, dtbmv_NLN, + dtbmv_TUU, dtbmv_TUN, dtbmv_TLU, dtbmv_TLN, +#else + stbmv_NUU, stbmv_NUN, stbmv_NLU, stbmv_NLN, + stbmv_TUU, stbmv_TUN, stbmv_TLU, stbmv_TLN, +#endif +}; + +#ifdef SMP +static int (*tbmv_thread[])(BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { +#ifdef XDOUBLE + qtbmv_thread_NUU, qtbmv_thread_NUN, qtbmv_thread_NLU, qtbmv_thread_NLN, + qtbmv_thread_TUU, qtbmv_thread_TUN, qtbmv_thread_TLU, qtbmv_thread_TLN, +#elif defined(DOUBLE) + dtbmv_thread_NUU, dtbmv_thread_NUN, dtbmv_thread_NLU, dtbmv_thread_NLN, + dtbmv_thread_TUU, dtbmv_thread_TUN, dtbmv_thread_TLU, dtbmv_thread_TLN, +#else + stbmv_thread_NUU, stbmv_thread_NUN, stbmv_thread_NLU, stbmv_thread_NLN, + stbmv_thread_TUU, stbmv_thread_TUN, stbmv_thread_TLU, stbmv_thread_TLN, +#endif +}; +#endif + +#ifndef CBLAS + +void NAME(char *UPLO, char *TRANS, char *DIAG, + blasint *N, blasint *K, + FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX){ + + char uplo_arg = *UPLO; + char trans_arg = *TRANS; + char diag_arg = *DIAG; + + blasint n = *N; + blasint k = *K; + blasint lda = *LDA; + blasint incx = *INCX; + + blasint info; + int uplo; + int unit; + int trans; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + TOUPPER(trans_arg); + TOUPPER(diag_arg); + + trans = -1; + unit = -1; + uplo = -1; + + if (trans_arg == 'N') trans = 0; + if (trans_arg == 'T') trans = 1; + if (trans_arg == 'R') trans = 0; + if (trans_arg == 'C') trans = 1; + + if (diag_arg == 'U') unit = 0; + if (diag_arg == 'N') unit = 1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (incx == 0) info = 9; + if (lda < k + 1) info = 7; + if (k < 0) info = 5; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, + enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint n, blasint k, FLOAT *a, blasint lda, FLOAT *x, blasint incx) { + + int trans, uplo, unit; + blasint info; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_CNAME; + + unit = -1; + uplo = -1; + trans = -1; + info = 0; + + if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + if (TransA == CblasNoTrans) trans = 0; + if (TransA == CblasTrans) trans = 1; + if (TransA == CblasConjNoTrans) trans = 0; + if (TransA == CblasConjTrans) trans = 1; + + if (Diag == CblasUnit) unit = 0; + if (Diag == CblasNonUnit) unit = 1; + + info = -1; + + if (incx == 0) info = 9; + if (lda < k + 1) info = 7; + if (k < 0) info = 5; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (order == CblasRowMajor) { + if (Uplo == CblasUpper) uplo = 1; + if (Uplo == CblasLower) uplo = 0; + + if (TransA == CblasNoTrans) 
trans = 1; + if (TransA == CblasTrans) trans = 0; + if (TransA == CblasConjNoTrans) trans = 1; + if (TransA == CblasConjTrans) trans = 0; + + if (Diag == CblasUnit) unit = 0; + if (Diag == CblasNonUnit) unit = 1; + + info = -1; + + if (incx == 0) info = 9; + if (lda < k + 1) info = 7; + if (k < 0) info = 5; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (n == 0) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (tbmv[(trans<<2) | (uplo<<1) | unit])(n, k, a, lda, x, incx, buffer); + +#ifdef SMP + } else { + + (tbmv_thread[(trans<<2) | (uplo<<1) | unit])(n, k, a, lda, x, incx, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(1, n * k / 2 + n, n * k); + + IDEBUG_END; + + return; +} diff --git a/interface/tbsv.c b/interface/tbsv.c new file mode 100644 index 0000000000..a07c4c584d --- /dev/null +++ b/interface/tbsv.c @@ -0,0 +1,213 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
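tbmv, and the tbsv/tpmv/tpsv wrappers that follow, dispatch through an eight-entry table indexed by (trans << 2) | (uplo << 1) | unit, laid out as {NUU, NUN, NLU, NLN, TUU, TUN, TLU, TLN}. A standalone sketch of the decoding (note that in these wrappers unit == 1 means a non-unit diagonal):

#include <stdio.h>

static const char *kernel_name[8] = {
  "NUU", "NUN", "NLU", "NLN",   /* no transpose: {upper,lower} x {unit,non-unit} */
  "TUU", "TUN", "TLU", "TLN",   /* transpose */
};

/* trans, uplo, unit are the 0/1 flags the wrapper derives from TRANS,
 * UPLO and DIAG; unit == 1 stands for a non-unit diagonal here. */
static int tbmv_index(int trans, int uplo, int unit) {
  return (trans << 2) | (uplo << 1) | unit;
}

int main(void) {
  printf("%s\n", kernel_name[tbmv_index(1, 0, 1)]);   /* prints TUN */
  return 0;
}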
*/ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QTBSV " +#elif defined(DOUBLE) +#define ERROR_NAME "DTBSV " +#else +#define ERROR_NAME "STBSV " +#endif + +static int (*tbsv[])(BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, void *) = { +#ifdef XDOUBLE + qtbsv_NUU, qtbsv_NUN, qtbsv_NLU, qtbsv_NLN, + qtbsv_TUU, qtbsv_TUN, qtbsv_TLU, qtbsv_TLN, +#elif defined(DOUBLE) + dtbsv_NUU, dtbsv_NUN, dtbsv_NLU, dtbsv_NLN, + dtbsv_TUU, dtbsv_TUN, dtbsv_TLU, dtbsv_TLN, +#else + stbsv_NUU, stbsv_NUN, stbsv_NLU, stbsv_NLN, + stbsv_TUU, stbsv_TUN, stbsv_TLU, stbsv_TLN, +#endif +}; + +#ifndef CBLAS + +void NAME(char *UPLO, char *TRANS, char *DIAG, + blasint *N, blasint *K, + FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX){ + + char uplo_arg = *UPLO; + char trans_arg = *TRANS; + char diag_arg = *DIAG; + + blasint n = *N; + blasint k = *K; + blasint lda = *LDA; + blasint incx = *INCX; + + blasint info; + int uplo; + int unit; + int trans; + FLOAT *buffer; + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + TOUPPER(trans_arg); + TOUPPER(diag_arg); + + trans = -1; + unit = -1; + uplo = -1; + + if (trans_arg == 'N') trans = 0; + if (trans_arg == 'T') trans = 1; + if (trans_arg == 'R') trans = 0; + if (trans_arg == 'C') trans = 1; + + if (diag_arg == 'U') unit = 0; + if (diag_arg == 'N') unit = 1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (incx == 0) info = 9; + if (lda < k + 1) info = 7; + if (k < 0) info = 5; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, + enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint n, blasint k, FLOAT *a, blasint lda, FLOAT *x, blasint incx) { + + int trans, uplo, unit; + blasint info; + FLOAT *buffer; + + PRINT_DEBUG_CNAME; + + unit = -1; + uplo = -1; + trans = -1; + info = 0; + + if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + if (TransA == CblasNoTrans) trans = 0; + if (TransA == CblasTrans) trans = 1; + if (TransA == CblasConjNoTrans) trans = 0; + if (TransA == CblasConjTrans) trans = 1; + + if (Diag == CblasUnit) unit = 0; + if (Diag == CblasNonUnit) unit = 1; + + info = -1; + + if (incx == 0) info = 9; + if (lda < k + 1) info = 7; + if (k < 0) info = 5; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (order == CblasRowMajor) { + if (Uplo == CblasUpper) uplo = 1; + if (Uplo == CblasLower) uplo = 0; + + if (TransA == CblasNoTrans) trans = 1; + if (TransA == CblasTrans) trans = 0; + if (TransA == CblasConjNoTrans) trans = 1; + if (TransA == CblasConjTrans) trans = 0; + + if (Diag == CblasUnit) unit = 0; + if (Diag == CblasNonUnit) unit = 1; + + info = -1; + + if (incx == 0) info = 9; + if (lda < k + 1) info = 7; + if (k < 0) info = 5; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (n == 0) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx; + + buffer = (FLOAT *)blas_memory_alloc(1); + + (tbsv[(trans<<2) | 
(uplo<<1) | unit])(n, k, a, lda, x, incx, buffer); + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(1, n * k / 2 + n, n * k); + + IDEBUG_END; + + return; +} diff --git a/interface/tpmv.c b/interface/tpmv.c new file mode 100644 index 0000000000..f0fc4f71cb --- /dev/null +++ b/interface/tpmv.c @@ -0,0 +1,239 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
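tbsv has no SMP path, presumably because the forward/back substitution carries a dependency from one element to the next, and its only size check on the matrix is lda >= k + 1, which is exactly what the BLAS band format needs: an order-n triangular band matrix with k off-diagonals fits in a (k+1) x n array. A standalone sketch of the standard upper-triangular band indexing that format implies (the convention, not code from this patch):

#include <stdio.h>

/* 0-based, column-major index of A(i,j) for an upper triangular band
 * matrix with k superdiagonals stored in a (k+1) x n array with
 * leading dimension lda >= k + 1; valid for j - k <= i <= j. */
static long band_upper_index(long i, long j, long k, long lda) {
  return (k + i - j) + j * lda;
}

int main(void) {
  long k = 2, lda = k + 1;
  /* The diagonal A(j,j) always lands in row k of the band array. */
  printf("A(3,3) -> %ld, A(1,3) -> %ld\n",
         band_upper_index(3, 3, k, lda), band_upper_index(1, 3, k, lda));
  return 0;
}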
*/ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QTPMV " +#elif defined(DOUBLE) +#define ERROR_NAME "DTPMV " +#else +#define ERROR_NAME "STPMV " +#endif + +static int (*tpmv[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, void *) = { +#ifdef XDOUBLE + qtpmv_NUU, qtpmv_NUN, qtpmv_NLU, qtpmv_NLN, + qtpmv_TUU, qtpmv_TUN, qtpmv_TLU, qtpmv_TLN, +#elif defined(DOUBLE) + dtpmv_NUU, dtpmv_NUN, dtpmv_NLU, dtpmv_NLN, + dtpmv_TUU, dtpmv_TUN, dtpmv_TLU, dtpmv_TLN, +#else + stpmv_NUU, stpmv_NUN, stpmv_NLU, stpmv_NLN, + stpmv_TUU, stpmv_TUN, stpmv_TLU, stpmv_TLN, +#endif +}; + +#ifdef SMP +static int (*tpmv_thread[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, int) = { +#ifdef XDOUBLE + qtpmv_thread_NUU, qtpmv_thread_NUN, qtpmv_thread_NLU, qtpmv_thread_NLN, + qtpmv_thread_TUU, qtpmv_thread_TUN, qtpmv_thread_TLU, qtpmv_thread_TLN, +#elif defined(DOUBLE) + dtpmv_thread_NUU, dtpmv_thread_NUN, dtpmv_thread_NLU, dtpmv_thread_NLN, + dtpmv_thread_TUU, dtpmv_thread_TUN, dtpmv_thread_TLU, dtpmv_thread_TLN, +#else + stpmv_thread_NUU, stpmv_thread_NUN, stpmv_thread_NLU, stpmv_thread_NLN, + stpmv_thread_TUU, stpmv_thread_TUN, stpmv_thread_TLU, stpmv_thread_TLN, +#endif +}; +#endif + +#ifndef CBLAS + +void NAME(char *UPLO, char *TRANS, char *DIAG, + blasint *N, FLOAT *a, FLOAT *x, blasint *INCX){ + + char uplo_arg = *UPLO; + char trans_arg = *TRANS; + char diag_arg = *DIAG; + + blasint n = *N; + blasint incx = *INCX; + + blasint info; + int uplo; + int unit; + int trans; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + TOUPPER(trans_arg); + TOUPPER(diag_arg); + + trans = -1; + unit = -1; + uplo = -1; + + if (trans_arg == 'N') trans = 0; + if (trans_arg == 'T') trans = 1; + if (trans_arg == 'R') trans = 0; + if (trans_arg == 'C') trans = 1; + + if (diag_arg == 'U') unit = 0; + if (diag_arg == 'N') unit = 1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (incx == 0) info = 7; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, + enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint n, FLOAT *a, FLOAT *x, blasint incx) { + + int trans, uplo, unit; + blasint info; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_CNAME; + + unit = -1; + uplo = -1; + trans = -1; + info = 0; + + if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + if (TransA == CblasNoTrans) trans = 0; + if (TransA == CblasTrans) trans = 1; + if (TransA == CblasConjNoTrans) trans = 0; + if (TransA == CblasConjTrans) trans = 1; + + if (Diag == CblasUnit) unit = 0; + if (Diag == CblasNonUnit) unit = 1; + + info = -1; + + if (incx == 0) info = 7; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (order == CblasRowMajor) { + if (Uplo == CblasUpper) uplo = 1; + if (Uplo == CblasLower) uplo = 0; + + if (TransA == CblasNoTrans) trans = 1; + if (TransA == CblasTrans) trans = 0; + if (TransA == CblasConjNoTrans) trans = 1; + if (TransA == CblasConjTrans) trans = 0; + + if (Diag == CblasUnit) unit = 0; + if (Diag == CblasNonUnit) unit = 1; + + info = -1; + + if (incx 
== 0) info = 7; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (n == 0) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (tpmv[(trans<<2) | (uplo<<1) | unit])(n, a, x, incx, buffer); + +#ifdef SMP + } else { + + (tpmv_thread[(trans<<2) | (uplo<<1) | unit])(n, a, x, incx, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(1, n * n / 2 + n, n * n); + + IDEBUG_END; + + return; +} diff --git a/interface/tpsv.c b/interface/tpsv.c new file mode 100644 index 0000000000..9dafd0b684 --- /dev/null +++ b/interface/tpsv.c @@ -0,0 +1,204 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
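For reference, a small standalone sketch (not part of the imported file) of how the UPLO/TRANS/DIAG characters decoded above are folded into the 3-bit index that selects an entry of the tpmv[] table, (trans<<2) | (uplo<<1) | unit; the helper name tpmv_index is illustrative only.

#include <stdio.h>
#include <ctype.h>

/* Mirrors the argument decoding in tpmv.c: 'N'/'R' -> trans 0, 'T'/'C' -> 1,
   uplo 'U' -> 0, 'L' -> 1, diag 'U' (unit) -> 0, 'N' -> 1.                  */
static int tpmv_index(char uplo, char trans, char diag) {
  int t = (toupper(trans) == 'T' || toupper(trans) == 'C');
  int u = (toupper(uplo)  == 'L');
  int d = (toupper(diag)  == 'N');
  return (t << 2) | (u << 1) | d;
}

int main(void) {
  /* ('U','N','N') -> 1, i.e. stpmv_NUN / dtpmv_NUN / qtpmv_NUN. */
  printf("%d\n", tpmv_index('U', 'N', 'N'));
  return 0;
}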
*/ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QTPSV " +#elif defined(DOUBLE) +#define ERROR_NAME "DTPSV " +#else +#define ERROR_NAME "STPSV " +#endif + +static int (*tpsv[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, void *) = { +#ifdef XDOUBLE + qtpsv_NUU, qtpsv_NUN, qtpsv_NLU, qtpsv_NLN, + qtpsv_TUU, qtpsv_TUN, qtpsv_TLU, qtpsv_TLN, +#elif defined(DOUBLE) + dtpsv_NUU, dtpsv_NUN, dtpsv_NLU, dtpsv_NLN, + dtpsv_TUU, dtpsv_TUN, dtpsv_TLU, dtpsv_TLN, +#else + stpsv_NUU, stpsv_NUN, stpsv_NLU, stpsv_NLN, + stpsv_TUU, stpsv_TUN, stpsv_TLU, stpsv_TLN, +#endif +}; + +#ifndef CBLAS + +void NAME(char *UPLO, char *TRANS, char *DIAG, + blasint *N, FLOAT *a, FLOAT *x, blasint *INCX){ + + char uplo_arg = *UPLO; + char trans_arg = *TRANS; + char diag_arg = *DIAG; + + blasint n = *N; + blasint incx = *INCX; + + blasint info; + int uplo; + int unit; + int trans; + FLOAT *buffer; + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + TOUPPER(trans_arg); + TOUPPER(diag_arg); + + trans = -1; + unit = -1; + uplo = -1; + + if (trans_arg == 'N') trans = 0; + if (trans_arg == 'T') trans = 1; + if (trans_arg == 'R') trans = 0; + if (trans_arg == 'C') trans = 1; + + if (diag_arg == 'U') unit = 0; + if (diag_arg == 'N') unit = 1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (incx == 0) info = 7; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, + enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint n, FLOAT *a, FLOAT *x, blasint incx) { + + int trans, uplo, unit; + blasint info; + FLOAT *buffer; + + PRINT_DEBUG_CNAME; + + unit = -1; + uplo = -1; + trans = -1; + info = 0; + + if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + if (TransA == CblasNoTrans) trans = 0; + if (TransA == CblasTrans) trans = 1; + if (TransA == CblasConjNoTrans) trans = 0; + if (TransA == CblasConjTrans) trans = 1; + + if (Diag == CblasUnit) unit = 0; + if (Diag == CblasNonUnit) unit = 1; + + info = -1; + + if (incx == 0) info = 7; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (order == CblasRowMajor) { + if (Uplo == CblasUpper) uplo = 1; + if (Uplo == CblasLower) uplo = 0; + + if (TransA == CblasNoTrans) trans = 1; + if (TransA == CblasTrans) trans = 0; + if (TransA == CblasConjNoTrans) trans = 1; + if (TransA == CblasConjTrans) trans = 0; + + if (Diag == CblasUnit) unit = 0; + if (Diag == CblasNonUnit) unit = 1; + + info = -1; + + if (incx == 0) info = 7; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (n == 0) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx; + + buffer = (FLOAT *)blas_memory_alloc(1); + + (tpsv[(trans<<2) | (uplo<<1) | unit])(n, a, x, incx, buffer); + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(1, n * n / 2 + n, n * n); + + IDEBUG_END; + + return; +} diff --git a/interface/trmv.c b/interface/trmv.c new file mode 100644 index 0000000000..ed23cedc6a --- /dev/null +++ 
b/interface/trmv.c @@ -0,0 +1,243 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QTRMV " +#elif defined(DOUBLE) +#define ERROR_NAME "DTRMV " +#else +#define ERROR_NAME "STRMV " +#endif + +static int (*trmv[])(BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { +#ifdef XDOUBLE + qtrmv_NUU, qtrmv_NUN, qtrmv_NLU, qtrmv_NLN, + qtrmv_TUU, qtrmv_TUN, qtrmv_TLU, qtrmv_TLN, +#elif defined(DOUBLE) + dtrmv_NUU, dtrmv_NUN, dtrmv_NLU, dtrmv_NLN, + dtrmv_TUU, dtrmv_TUN, dtrmv_TLU, dtrmv_TLN, +#else + strmv_NUU, strmv_NUN, strmv_NLU, strmv_NLN, + strmv_TUU, strmv_TUN, strmv_TLU, strmv_TLN, +#endif +}; + +#ifdef SMP +static int (*trmv_thread[])(BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { +#ifdef XDOUBLE + qtrmv_thread_NUU, qtrmv_thread_NUN, qtrmv_thread_NLU, qtrmv_thread_NLN, + qtrmv_thread_TUU, qtrmv_thread_TUN, qtrmv_thread_TLU, qtrmv_thread_TLN, +#elif defined(DOUBLE) + dtrmv_thread_NUU, dtrmv_thread_NUN, dtrmv_thread_NLU, dtrmv_thread_NLN, + dtrmv_thread_TUU, dtrmv_thread_TUN, dtrmv_thread_TLU, dtrmv_thread_TLN, +#else + strmv_thread_NUU, strmv_thread_NUN, strmv_thread_NLU, strmv_thread_NLN, + strmv_thread_TUU, strmv_thread_TUN, strmv_thread_TLU, strmv_thread_TLN, +#endif +}; +#endif + +#ifndef CBLAS + +void NAME(char *UPLO, char *TRANS, char *DIAG, + blasint *N, FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX){ + + char uplo_arg = *UPLO; + char trans_arg = *TRANS; + char diag_arg = *DIAG; + + blasint n = *N; + blasint lda = *LDA; + blasint incx = *INCX; + + blasint info; + int uplo; + int unit; + int trans; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + TOUPPER(trans_arg); + TOUPPER(diag_arg); + + trans = -1; + unit = -1; + uplo = -1; + + if (trans_arg == 'N') trans = 0; + if (trans_arg == 'T') trans = 1; + if (trans_arg == 'R') trans = 0; + if (trans_arg == 'C') trans = 1; + + if (diag_arg == 'U') unit = 0; + if (diag_arg == 'N') unit = 1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (incx == 0) info = 8; + if (lda < MAX(1, n)) info = 6; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, + enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint n, FLOAT *a, blasint lda, FLOAT *x, blasint incx) { + + int trans, uplo, unit; + blasint info; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_CNAME; + + unit = -1; + uplo = -1; + trans = -1; + info = 0; + + if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + if (TransA == CblasNoTrans) trans = 0; + if (TransA == CblasTrans) trans = 1; + if (TransA == CblasConjNoTrans) trans = 0; + if (TransA == CblasConjTrans) trans = 1; + + if (Diag == CblasUnit) unit = 0; + if (Diag == CblasNonUnit) unit = 1; + + info = -1; + + if (incx == 0) info = 8; + if (lda < MAX(1, n)) info = 6; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (order == CblasRowMajor) { + if (Uplo == CblasUpper) uplo = 1; + if (Uplo == CblasLower) uplo = 0; + + if (TransA == CblasNoTrans) trans = 1; + if (TransA == CblasTrans) trans = 0; + if (TransA == CblasConjNoTrans) trans = 1; + if 
(TransA == CblasConjTrans) trans = 0; + + if (Diag == CblasUnit) unit = 0; + if (Diag == CblasNonUnit) unit = 1; + + info = -1; + + if (incx == 0) info = 8; + if (lda < MAX(1, n)) info = 6; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (n == 0) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (trmv[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer); + +#ifdef SMP + } else { + + (trmv_thread[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(1, n * n / 2 + n, n * n); + + IDEBUG_END; + + return; +} diff --git a/interface/trsm.c b/interface/trsm.c new file mode 100644 index 0000000000..5836ce2f00 --- /dev/null +++ b/interface/trsm.c @@ -0,0 +1,391 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
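A compact sketch of the row-major remapping used by the CNAME entry of trmv.c above: a row-major triangular matrix is the transpose of the same buffer read column-major, so Uplo is flipped and TransA inverted and the eight column-major kernels are reused unchanged. The local enum definitions are minimal stand-ins assumed to match the reference cblas.h values (the CblasConjNoTrans extension is omitted for brevity); the helper itself is illustrative.

#include <stdio.h>

enum CBLAS_ORDER     {CblasRowMajor = 101, CblasColMajor = 102};
enum CBLAS_UPLO      {CblasUpper = 121, CblasLower = 122};
enum CBLAS_TRANSPOSE {CblasNoTrans = 111, CblasTrans = 112, CblasConjTrans = 113};

/* Same uplo/trans integers the wrapper computes before indexing trmv[]. */
static void trmv_remap(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
                       enum CBLAS_TRANSPOSE TransA, int *uplo, int *trans) {
  *uplo  = (Uplo == CblasLower);
  *trans = (TransA == CblasTrans || TransA == CblasConjTrans);
  if (order == CblasRowMajor) { *uplo = !*uplo; *trans = !*trans; }
}

int main(void) {
  int uplo, trans;
  trmv_remap(CblasRowMajor, CblasUpper, CblasNoTrans, &uplo, &trans);
  printf("uplo=%d trans=%d\n", uplo, trans);   /* 1 1: lower, transposed kernel */
  return 0;
}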
*/ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifndef TRMM +#ifndef COMPLEX +#ifdef XDOUBLE +#define ERROR_NAME "QTRSM " +#elif defined(DOUBLE) +#define ERROR_NAME "DTRSM " +#else +#define ERROR_NAME "STRSM " +#endif +#else +#ifdef XDOUBLE +#define ERROR_NAME "XTRSM " +#elif defined(DOUBLE) +#define ERROR_NAME "ZTRSM " +#else +#define ERROR_NAME "CTRSM " +#endif +#endif +#else +#ifndef COMPLEX +#ifdef XDOUBLE +#define ERROR_NAME "QTRMM " +#elif defined(DOUBLE) +#define ERROR_NAME "DTRMM " +#else +#define ERROR_NAME "STRMM " +#endif +#else +#ifdef XDOUBLE +#define ERROR_NAME "XTRMM " +#elif defined(DOUBLE) +#define ERROR_NAME "ZTRMM " +#else +#define ERROR_NAME "CTRMM " +#endif +#endif +#endif + +static int (*trsm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { +#ifndef TRMM + TRSM_LNUU, TRSM_LNUN, TRSM_LNLU, TRSM_LNLN, + TRSM_LTUU, TRSM_LTUN, TRSM_LTLU, TRSM_LTLN, + TRSM_LRUU, TRSM_LRUN, TRSM_LRLU, TRSM_LRLN, + TRSM_LCUU, TRSM_LCUN, TRSM_LCLU, TRSM_LCLN, + TRSM_RNUU, TRSM_RNUN, TRSM_RNLU, TRSM_RNLN, + TRSM_RTUU, TRSM_RTUN, TRSM_RTLU, TRSM_RTLN, + TRSM_RRUU, TRSM_RRUN, TRSM_RRLU, TRSM_RRLN, + TRSM_RCUU, TRSM_RCUN, TRSM_RCLU, TRSM_RCLN, +#else + TRMM_LNUU, TRMM_LNUN, TRMM_LNLU, TRMM_LNLN, + TRMM_LTUU, TRMM_LTUN, TRMM_LTLU, TRMM_LTLN, + TRMM_LRUU, TRMM_LRUN, TRMM_LRLU, TRMM_LRLN, + TRMM_LCUU, TRMM_LCUN, TRMM_LCLU, TRMM_LCLN, + TRMM_RNUU, TRMM_RNUN, TRMM_RNLU, TRMM_RNLN, + TRMM_RTUU, TRMM_RTUN, TRMM_RTLU, TRMM_RTLN, + TRMM_RRUU, TRMM_RRUN, TRMM_RRLU, TRMM_RRLN, + TRMM_RCUU, TRMM_RCUN, TRMM_RCLU, TRMM_RCLN, +#endif +}; + +#ifndef CBLAS + +void NAME(char *SIDE, char *UPLO, char *TRANS, char *DIAG, + blasint *M, blasint *N, FLOAT *alpha, + FLOAT *a, blasint *ldA, FLOAT *b, blasint *ldB){ + + char side_arg = *SIDE; + char uplo_arg = *UPLO; + char trans_arg = *TRANS; + char diag_arg = *DIAG; + + blas_arg_t args; + + FLOAT *buffer; + FLOAT *sa, *sb; + +#ifdef SMP +#ifndef COMPLEX +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_REAL; +#else + int mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + int mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif +#endif + + blasint info; + int side; + int uplo; + int unit; + int trans; + int nrowa; + + PRINT_DEBUG_NAME; + + args.m = *M; + args.n = *N; + + args.a = (void *)a; + args.b = (void *)b; + + args.lda = *ldA; + args.ldb = *ldB; + + args.beta = (void *)alpha; + + TOUPPER(side_arg); + TOUPPER(uplo_arg); + TOUPPER(trans_arg); + TOUPPER(diag_arg); + + side = -1; + trans = -1; + unit = -1; + uplo = -1; + + if (side_arg == 'L') side = 0; + if (side_arg == 'R') side = 1; + + if (trans_arg == 'N') trans = 0; + if (trans_arg == 'T') trans = 1; + if (trans_arg == 'R') trans = 2; + if (trans_arg == 'C') trans = 3; + + if (diag_arg == 'U') unit = 0; + if (diag_arg == 'N') unit = 1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + nrowa = args.m; + if (side & 1) nrowa = args.n; + + info = 0; + + if (args.ldb < MAX(1,args.m)) info = 11; + if (args.lda < MAX(1,nrowa)) info = 9; + if (args.n < 0) info = 6; + if (args.m < 0) info = 5; + if (unit < 0) info = 4; + if (trans < 0) info = 3; + if (uplo < 0) info = 2; + if (side < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, 
sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, + enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, + enum CBLAS_TRANSPOSE Trans, enum CBLAS_DIAG Diag, + blasint m, blasint n, +#ifndef COMPLEX + FLOAT alpha, +#else + FLOAT *alpha, +#endif + FLOAT *a, blasint lda, + FLOAT *b, blasint ldb) { + + blas_arg_t args; + int side, uplo, trans, unit; + blasint info, nrowa; + + XFLOAT *buffer; + XFLOAT *sa, *sb; + +#ifdef SMP +#ifndef COMPLEX +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_REAL; +#else + int mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + int mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + int mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif +#endif + + PRINT_DEBUG_CNAME; + + args.a = (void *)a; + args.b = (void *)b; + + args.lda = lda; + args.ldb = ldb; + +#ifndef COMPLEX + args.beta = (void *)&alpha; +#else + args.beta = (void *)alpha; +#endif + + side = -1; + uplo = -1; + trans = -1; + unit = -1; + info = 0; + + if (order == CblasColMajor) { + args.m = m; + args.n = n; + + if (Side == CblasLeft) side = 0; + if (Side == CblasRight) side = 1; + + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + if (Trans == CblasNoTrans) trans = 0; + if (Trans == CblasTrans) trans = 1; +#ifndef COMPLEX + if (Trans == CblasConjNoTrans) trans = 0; + if (Trans == CblasConjTrans) trans = 1; +#else + if (Trans == CblasConjNoTrans) trans = 2; + if (Trans == CblasConjTrans) trans = 3; +#endif + + if (Diag == CblasUnit) unit = 0; + if (Diag == CblasNonUnit) unit = 1; + + info = -1; + + nrowa = args.m; + if (side & 1) nrowa = args.n; + + if (args.ldb < MAX(1,args.m)) info = 11; + if (args.lda < MAX(1,nrowa)) info = 9; + if (args.n < 0) info = 6; + if (args.m < 0) info = 5; + if (unit < 0) info = 4; + if (trans < 0) info = 3; + if (uplo < 0) info = 2; + if (side < 0) info = 1; + } + + if (order == CblasRowMajor) { + args.m = n; + args.n = m; + + if (Side == CblasLeft) side = 1; + if (Side == CblasRight) side = 0; + + if (Uplo == CblasUpper) uplo = 1; + if (Uplo == CblasLower) uplo = 0; + + if (Trans == CblasNoTrans) trans = 0; + if (Trans == CblasTrans) trans = 1; +#ifndef COMPLEX + if (Trans == CblasConjNoTrans) trans = 0; + if (Trans == CblasConjTrans) trans = 1; +#else + if (Trans == CblasConjNoTrans) trans = 2; + if (Trans == CblasConjTrans) trans = 3; +#endif + + if (Diag == CblasUnit) unit = 0; + if (Diag == CblasNonUnit) unit = 1; + + info = -1; + + nrowa = args.m; + if (side & 1) nrowa = args.n; + + if (args.ldb < MAX(1,args.m)) info = 11; + if (args.lda < MAX(1,nrowa)) info = 9; + if (args.n < 0) info = 6; + if (args.m < 0) info = 5; + if (unit < 0) info = 4; + if (trans < 0) info = 3; + if (uplo < 0) info = 2; + if (side < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if ((args.m == 0) || (args.n == 0)) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + buffer = (FLOAT *)blas_memory_alloc(0); + + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); + sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + +#ifdef SMP + mode |= (trans << BLAS_TRANSA_SHIFT); + mode |= (side << BLAS_RSIDE_SHIFT); + + args.nthreads = num_cpu_avail(3); + + if (args.nthreads == 1) { +#endif + + (trsm[(side<<4) | (trans<<2) | (uplo<<1) | unit])(&args, NULL, NULL, sa, sb, 0); + +#ifdef SMP + }
else { + if (!side) { + gemm_thread_n(mode, &args, NULL, NULL, trsm[(side<<4) | (trans<<2) | (uplo<<1) | unit], sa, sb, args.nthreads); + } else { + gemm_thread_m(mode, &args, NULL, NULL, trsm[(side<<4) | (trans<<2) | (uplo<<1) | unit], sa, sb, args.nthreads); + } + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, + (!side) ? args.m * (args.m + args.n) : args.n * (args.m + args.n), + (!side) ? args.m * args.m * args.n : args.m * args.n * args.n); + + IDEBUG_END; + + return; +} + diff --git a/interface/trsv.c b/interface/trsv.c new file mode 100644 index 0000000000..8ef6998db1 --- /dev/null +++ b/interface/trsv.c @@ -0,0 +1,208 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
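Similarly, a minimal sketch (illustrative helper, not part of the imported sources) of how the SIDE/UPLO/TRANSA/DIAG characters decoded in trsm.c address the 32-entry trsm[] table via (side<<4) | (trans<<2) | (uplo<<1) | unit.

#include <stdio.h>
#include <ctype.h>

/* side: L=0 R=1; trans: N=0 T=1 R=2 C=3; uplo: U=0 L=1; diag: U=0 N=1.
   An unrecognized character leaves the field at -1, which the real wrapper
   reports through xerbla.                                                   */
static int trsm_index(char side, char uplo, char transa, char diag) {
  int s = (toupper(side) == 'R');
  int u = (toupper(uplo) == 'L');
  int d = (toupper(diag) == 'N');
  int t = -1;
  switch (toupper(transa)) {
    case 'N': t = 0; break;
    case 'T': t = 1; break;
    case 'R': t = 2; break;
    case 'C': t = 3; break;
  }
  return (s << 4) | (t << 2) | (u << 1) | d;
}

int main(void) {
  /* ('L','U','N','N') -> 1, i.e. TRSM_LNUN: left, upper, no transpose, non-unit. */
  printf("%d\n", trsm_index('L', 'U', 'N', 'N'));
  return 0;
}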
*/ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QTRSV " +#elif defined(DOUBLE) +#define ERROR_NAME "DTRSV " +#else +#define ERROR_NAME "STRSV " +#endif + +static int (*trsv[])(BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, void *) = { +#ifdef XDOUBLE + qtrsv_NUU, qtrsv_NUN, qtrsv_NLU, qtrsv_NLN, + qtrsv_TUU, qtrsv_TUN, qtrsv_TLU, qtrsv_TLN, +#elif defined(DOUBLE) + dtrsv_NUU, dtrsv_NUN, dtrsv_NLU, dtrsv_NLN, + dtrsv_TUU, dtrsv_TUN, dtrsv_TLU, dtrsv_TLN, +#else + strsv_NUU, strsv_NUN, strsv_NLU, strsv_NLN, + strsv_TUU, strsv_TUN, strsv_TLU, strsv_TLN, +#endif +}; + +#ifndef CBLAS + +void NAME(char *UPLO, char *TRANS, char *DIAG, + blasint *N, FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX){ + + char uplo_arg = *UPLO; + char trans_arg = *TRANS; + char diag_arg = *DIAG; + + blasint n = *N; + blasint lda = *LDA; + blasint incx = *INCX; + + blasint info; + int uplo; + int unit; + int trans; + FLOAT *buffer; + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + TOUPPER(trans_arg); + TOUPPER(diag_arg); + + trans = -1; + unit = -1; + uplo = -1; + + if (trans_arg == 'N') trans = 0; + if (trans_arg == 'T') trans = 1; + if (trans_arg == 'R') trans = 0; + if (trans_arg == 'C') trans = 1; + + if (diag_arg == 'U') unit = 0; + if (diag_arg == 'N') unit = 1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (incx == 0) info = 8; + if (lda < MAX(1, n)) info = 6; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, + enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint n, FLOAT *a, blasint lda, FLOAT *x, blasint incx) { + + int trans, uplo, unit; + blasint info; + FLOAT *buffer; + + PRINT_DEBUG_CNAME; + + unit = -1; + uplo = -1; + trans = -1; + info = 0; + + if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + if (TransA == CblasNoTrans) trans = 0; + if (TransA == CblasTrans) trans = 1; + if (TransA == CblasConjNoTrans) trans = 0; + if (TransA == CblasConjTrans) trans = 1; + + if (Diag == CblasUnit) unit = 0; + if (Diag == CblasNonUnit) unit = 1; + + info = -1; + + if (incx == 0) info = 8; + if (lda < MAX(1, n)) info = 6; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (order == CblasRowMajor) { + if (Uplo == CblasUpper) uplo = 1; + if (Uplo == CblasLower) uplo = 0; + + if (TransA == CblasNoTrans) trans = 1; + if (TransA == CblasTrans) trans = 0; + if (TransA == CblasConjNoTrans) trans = 1; + if (TransA == CblasConjTrans) trans = 0; + + if (Diag == CblasUnit) unit = 0; + if (Diag == CblasNonUnit) unit = 1; + + info = -1; + + if (incx == 0) info = 8; + if (lda < MAX(1, n)) info = 6; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (n == 0) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx; + + buffer = (FLOAT *)blas_memory_alloc(1); + + (trsv[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer); + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(1, n * n / 2 + 
n, n * n); + + IDEBUG_END; + + return; +} diff --git a/interface/trti2.c b/interface/trti2.c new file mode 100644 index 0000000000..e119b45af8 --- /dev/null +++ b/interface/trti2.c @@ -0,0 +1,134 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
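A short standalone demo of the negative-increment adjustment performed above (and by the other level-2 wrappers in this patch) before the kernel walks the vector: with incx < 0 the logical first element sits at the highest-addressed slot, so the pointer is moved there first.

#include <stdio.h>

int main(void) {
  double xbuf[5] = {10, 11, 12, 13, 14};   /* storage, lowest address first */
  int n = 3, incx = -2;
  double *x = xbuf;

  if (incx < 0) x -= (n - 1) * incx;       /* x now points at xbuf[4]       */

  for (int i = 0; i < n; i++)
    printf("element %d = %g\n", i, x[i * incx]);   /* prints 14, 12, 10     */
  return 0;
}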
*/ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QTRTI2" +#elif defined(DOUBLE) +#define ERROR_NAME "DTRTI2" +#else +#define ERROR_NAME "STRTI2" +#endif + +static blasint (*trti2[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { +#ifdef XDOUBLE + qtrti2_UU, qtrti2_UN, qtrti2_LU, qtrti2_LN, +#elif defined(DOUBLE) + dtrti2_UU, dtrti2_UN, dtrti2_LU, dtrti2_LN, +#else + strti2_UU, strti2_UN, strti2_LU, strti2_LN, +#endif + }; + +int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ + + blas_arg_t args; + + blasint uplo_arg = *UPLO; + blasint diag_arg = *DIAG; + blasint uplo, diag; + blasint info; + FLOAT *buffer; +#ifdef PPC440 + extern +#endif + FLOAT *sa, *sb; + + PRINT_DEBUG_NAME; + + args.n = *N; + args.a = (void *)a; + args.lda = *ldA; + + TOUPPER(uplo_arg); + TOUPPER(diag_arg); + + uplo = -1; + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + diag = -1; + if (diag_arg == 'U') diag = 0; + if (diag_arg == 'N') diag = 1; + + info = 0; + if (args.lda < MAX(1,args.n)) info = 5; + if (args.n < 0) info = 3; + if (diag < 0) info = 2; + if (uplo < 0) info = 1; + if (info) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + *Info = - info; + return 0; + } + + *Info = 0; + + if (args.n <= 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + +#ifndef PPC440 + buffer = (FLOAT *)blas_memory_alloc(1); + + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); + sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif + + info = (trti2[(uplo << 1) | diag])(&args, NULL, NULL, sa, sb, 0); + + *Info = info; + +#ifndef PPC440 + blas_memory_free(buffer); +#endif + + FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, .5 * args.n * args.n, + args.n * (1./3. + args.n * ( 1./2. + args.n * 1./6.)) + + args.n * (1./3. + args.n * (-1./2. + args.n * 1./6.))); + + IDEBUG_END; + + return 0; +} diff --git a/interface/trtri.c b/interface/trtri.c new file mode 100644 index 0000000000..9e31905df0 --- /dev/null +++ b/interface/trtri.c @@ -0,0 +1,153 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
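Side note on the operation count that trti2.c passes to FUNCTION_PROFILE_END: the two Horner polynomials sum to (n^3 + 2n)/3, consistent with the familiar n^3/3 leading-order cost of inverting an n x n triangular matrix. A quick numerical check, for illustration only:

#include <stdio.h>

int main(void) {
  for (double n = 1; n <= 4; n++) {
    double ops = n * (1./3. + n * ( 1./2. + n * 1./6.))
               + n * (1./3. + n * (-1./2. + n * 1./6.));
    printf("n=%g: %g vs %g\n", n, ops, (n * n * n + 2 * n) / 3);
  }
  return 0;
}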
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QTRTRI" +#elif defined(DOUBLE) +#define ERROR_NAME "DTRTRI" +#else +#define ERROR_NAME "STRTRI" +#endif + +static blasint (*trtri_single[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) ={ + TRTRI_UU_SINGLE, TRTRI_UN_SINGLE, TRTRI_LU_SINGLE, TRTRI_LN_SINGLE, +}; + +#ifdef SMP +static blasint (*trtri_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) ={ + TRTRI_UU_PARALLEL, TRTRI_UN_PARALLEL, TRTRI_LU_PARALLEL, TRTRI_LN_PARALLEL, +}; +#endif + +int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ + + blas_arg_t args; + + blasint uplo_arg = *UPLO; + blasint diag_arg = *DIAG; + blasint uplo, diag; + blasint info; + FLOAT *buffer; +#ifdef PPC440 + extern +#endif + FLOAT *sa, *sb; + + PRINT_DEBUG_NAME; + + args.n = *N; + args.a = (void *)a; + args.lda = *ldA; + + TOUPPER(uplo_arg); + TOUPPER(diag_arg); + + uplo = -1; + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + diag = -1; + if (diag_arg == 'U') diag = 0; + if (diag_arg == 'N') diag = 1; + + info = 0; + if (args.lda < MAX(1,args.n)) info = 5; + if (args.n < 0) info = 3; + if (diag < 0) info = 2; + if (uplo < 0) info = 1; + if (info) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + *Info = - info; + return 0; + } + + *Info = 0; + + if (args.n == 0) return 0; + + if (diag) { + if (AMIN_K(args.n, args.a, args.lda + 1) == ZERO) { + *Info = IAMIN_K(args.n, args.a, args.lda + 1); + return 0; + } + } + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + +#ifndef PPC440 + buffer = (FLOAT *)blas_memory_alloc(1); + + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); + sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif + +#ifdef SMP + args.nthreads = num_cpu_avail(4); + + if (args.nthreads == 1) { +#endif + + *Info = (trtri_single[(uplo << 1) | diag])(&args, NULL, NULL, sa, sb, 0); + +#ifdef SMP + } else { + + *Info = (trtri_parallel[(uplo << 1) | diag])(&args, NULL, NULL, sa, sb, 0); + + } +#endif + +#ifndef PPC440 + blas_memory_free(buffer); +#endif + + FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, .5 * args.n * args.n, + args.n * (1./3. + args.n * ( 1./2. + args.n * 1./6.)) + + args.n * (1./3. + args.n * (-1./2. 
+ args.n * 1./6.))); + + IDEBUG_END; + + return 0; +} diff --git a/interface/zaxpy.c b/interface/zaxpy.c new file mode 100644 index 0000000000..d3355ea570 --- /dev/null +++ b/interface/zaxpy.c @@ -0,0 +1,122 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
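A small illustration of why trtri.c can detect a zero diagonal entry with the strided AMIN_K/IAMIN_K calls above: in column-major storage A(i,j) lives at a[i + j*lda], so the diagonal entries form a vector with increment lda + 1. The array contents below are arbitrary example data.

#include <stdio.h>

int main(void) {
  int n = 3, lda = 4;                    /* lda may exceed n, as in LAPACK */
  double a[12] = {0};
  for (int j = 0; j < n; j++)
    for (int i = 0; i < n; i++)
      a[i + j * lda] = 10 * (i + 1) + (j + 1);

  for (int i = 0; i < n; i++)
    printf("diag %d = %g\n", i, a[i * (lda + 1)]);  /* prints 11, 22, 33 */
  return 0;
}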
*/ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifndef CBLAS + +void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ + + blasint n = *N; + blasint incx = *INCX; + blasint incy = *INCY; + +#else + +void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint incy){ + +#endif + + FLOAT alpha_r = *(ALPHA + 0); + FLOAT alpha_i = *(ALPHA + 1); + +#ifdef SMP + int mode, nthreads; +#endif + +#ifndef CBLAS + PRINT_DEBUG_CNAME; +#else + PRINT_DEBUG_CNAME; +#endif + + if (n <= 0) return; + + if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0) x -= (n - 1) * incx * 2; + if (incy < 0) y -= (n - 1) * incy * 2; + +#ifdef SMP + nthreads = num_cpu_avail(1); + + if (nthreads == 1) { +#endif + +#ifndef CONJ + AXPYU_K (n, 0, 0, alpha_r, alpha_i, x, incx, y, incy, NULL, 0); +#else + AXPYC_K(n, 0, 0, alpha_r, alpha_i, x, incx, y, incy, NULL, 0); +#endif + +#ifdef SMP + } else { + +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif + + blas_level1_thread(mode, n, 0, 0, ALPHA, x, incx, y, incy, NULL, 0, +#ifndef CONJ + (void *)AXPYU_K, +#else + (void *)AXPYC_K, +#endif + nthreads); + } +#endif + + FUNCTION_PROFILE_END(4, 2 * n, 2 * n); + + IDEBUG_END; + + return; + +} diff --git a/interface/zdot.c b/interface/zdot.c new file mode 100644 index 0000000000..1380ce2925 --- /dev/null +++ b/interface/zdot.c @@ -0,0 +1,202 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
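For reference, a plain (unoptimized) sketch of what the unconjugated AXPYU_K call in zaxpy.c computes with interleaved (re,im) storage, including the factor-of-2 pointer fix-up for negative increments; zaxpy_ref is an illustrative name, not the library kernel.

#include <stdio.h>

static void zaxpy_ref(int n, double ar, double ai,
                      const double *x, int incx, double *y, int incy) {
  if (incx < 0) x -= (n - 1) * incx * 2;   /* each complex element is 2 slots */
  if (incy < 0) y -= (n - 1) * incy * 2;
  for (int i = 0; i < n; i++) {
    const double *xi = x + 2 * i * incx;
    double       *yi = y + 2 * i * incy;
    double xr = xi[0], xim = xi[1];
    yi[0] += ar * xr - ai * xim;     /* real part of alpha*x      */
    yi[1] += ar * xim + ai * xr;     /* imaginary part of alpha*x */
  }
}

int main(void) {
  double x[4] = {1, 2, 3, 4};          /* x = (1+2i, 3+4i) */
  double y[4] = {0, 0, 0, 0};
  zaxpy_ref(2, 0, 1, x, 1, y, 1);      /* alpha = i: y = (-2+i, -4+3i) */
  printf("%g %g %g %g\n", y[0], y[1], y[2], y[3]);
  return 0;
}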
*/ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef RETURN_BY_STRUCT +#ifdef XDOUBLE +#define MYTYPE myxcomplex_t +#elif defined DOUBLE +#define MYTYPE myzcomplex_t +#else +#define MYTYPE myccomplex_t +#endif +#endif + +#ifndef CBLAS + +#ifdef RETURN_BY_STRUCT +MYTYPE NAME( blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY) { +#elif defined RETURN_BY_STACK +void NAME(FLOAT _Complex *result, blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY) { +#else +FLOAT _Complex NAME( blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY) { +#endif + + BLASLONG n = *N; + BLASLONG incx = *INCX; + BLASLONG incy = *INCY; +#ifndef RETURN_BY_STACK + FLOAT _Complex ret; +#endif +#ifdef RETURN_BY_STRUCT + MYTYPE myret; +#endif + + PRINT_DEBUG_NAME; + + if (n <= 0) { +#ifdef RETURN_BY_STRUCT + myret.r = 0.; + myret.i = 0.; + return myret; +#elif defined RETURN_BY_STACK + *result = ZERO; + return; +#else + return ZERO; +#endif + } + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0) x -= (n - 1) * incx * 2; + if (incy < 0) y -= (n - 1) * incy * 2; + +#ifdef RETURN_BY_STRUCT + +#ifndef CONJ + ret = DOTU_K(n, x, incx, y, incy); +#else + ret = DOTC_K(n, x, incx, y, incy); +#endif + + myret.r = CREAL ret; + myret.i = CIMAG ret; + + FUNCTION_PROFILE_END(4, 2 * n, 2 * n); + + IDEBUG_END; + + return myret; + +#elif defined RETURN_BY_STACK + +#ifndef CONJ + *result = DOTU_K(n, x, incx, y, incy); +#else + *result = DOTC_K(n, x, incx, y, incy); +#endif + + FUNCTION_PROFILE_END(4, 2 * n, 2 * n); + + IDEBUG_END; + +#else + +#ifndef CONJ + ret = DOTU_K(n, x, incx, y, incy); +#else + ret = DOTC_K(n, x, incx, y, incy); +#endif + + FUNCTION_PROFILE_END(4, 2 * n, 2 * n); + + IDEBUG_END; + + return ret; + +#endif + +} + +#else + +#ifdef FORCE_USE_STACK +void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy, FLOAT _Complex *result){ +#else +FLOAT _Complex CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){ + + FLOAT _Complex ret; +#endif + + PRINT_DEBUG_CNAME; + + if (n <= 0) { +#ifdef FORCE_USE_STACK + *result = ZERO; + return; +#else + return ZERO; +#endif + } + + if (incx < 0) x -= (n - 1) * incx * 2; + if (incy < 0) y -= (n - 1) * incy * 2; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + +#ifdef FORCE_USE_STACK + +#ifndef CONJ + *result = DOTU_K(n, x, incx, y, incy); +#else + *result = DOTC_K(n, x, incx, y, incy); +#endif + + FUNCTION_PROFILE_END(4, 2 * n, 2 * n); + + IDEBUG_END; + +#else + +#ifndef CONJ + ret = DOTU_K(n, x, incx, y, incy); +#else + ret = DOTC_K(n, x, incx, y, incy); +#endif + + FUNCTION_PROFILE_END(4, 2 * n, 2 * n); + + IDEBUG_END; + + return ret; + +#endif + +} + +#endif diff --git a/interface/zgbmv.c b/interface/zgbmv.c new file mode 100644 index 0000000000..ae1fd24bff --- /dev/null +++ b/interface/zgbmv.c @@ -0,0 +1,271 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
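In standard BLAS terms the two kernels dispatched in zdot.c compute the unconjugated product sum x[i]*y[i] (DOTU_K, as in ZDOTU) and the conjugated sum conj(x[i])*y[i] (DOTC_K, as in ZDOTC). A minimal reference sketch with illustrative names and interleaved (re,im) storage:

#include <stdio.h>

static void zdot_ref(int n, const double *x, const double *y,
                     int conj_x, double *out_re, double *out_im) {
  double re = 0, im = 0;
  for (int i = 0; i < n; i++) {
    double xr = x[2*i], xi = conj_x ? -x[2*i+1] : x[2*i+1];
    double yr = y[2*i], yi = y[2*i+1];
    re += xr * yr - xi * yi;
    im += xr * yi + xi * yr;
  }
  *out_re = re; *out_im = im;
}

int main(void) {
  double x[2] = {1, 2}, y[2] = {3, 4}, re, im;   /* x = 1+2i, y = 3+4i */
  zdot_ref(1, x, y, 0, &re, &im);  printf("dotu = %g%+gi\n", re, im); /* -5+10i */
  zdot_ref(1, x, y, 1, &re, &im);  printf("dotc = %g%+gi\n", re, im); /* 11-2i  */
  return 0;
}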
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XGBMV " +#elif defined(DOUBLE) +#define ERROR_NAME "ZGBMV " +#else +#define ERROR_NAME "CGBMV " +#endif + +static void (*gbmv[])(BLASLONG, BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT, + FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, void *) = { +#ifdef XDOUBLE + xgbmv_n, xgbmv_t, xgbmv_r, xgbmv_c, + xgbmv_o, xgbmv_u, xgbmv_s, xgbmv_d, +#elif defined(DOUBLE) + zgbmv_n, zgbmv_t, zgbmv_r, zgbmv_c, + zgbmv_o, zgbmv_u, zgbmv_s, zgbmv_d, +#else + cgbmv_n, cgbmv_t, cgbmv_r, cgbmv_c, + cgbmv_o, cgbmv_u, cgbmv_s, cgbmv_d, +#endif +}; + +#ifdef SMP +static int (*gbmv_thread[])(BLASLONG, BLASLONG, BLASLONG, BLASLONG, FLOAT *, + FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { +#ifdef XDOUBLE + xgbmv_thread_n, xgbmv_thread_t, xgbmv_thread_r, xgbmv_thread_c, + xgbmv_thread_o, xgbmv_thread_u, xgbmv_thread_s, xgbmv_thread_d, +#elif defined(DOUBLE) + zgbmv_thread_n, zgbmv_thread_t, zgbmv_thread_r, zgbmv_thread_c, + zgbmv_thread_o, zgbmv_thread_u, zgbmv_thread_s, zgbmv_thread_d, +#else + cgbmv_thread_n, cgbmv_thread_t, cgbmv_thread_r, cgbmv_thread_c, + cgbmv_thread_o, cgbmv_thread_u, cgbmv_thread_s, cgbmv_thread_d, +#endif +}; +#endif + +#ifndef CBLAS + +void NAME(char *TRANS, blasint *M, blasint *N, + blasint *KU, blasint *KL, + FLOAT *ALPHA, FLOAT *a, blasint *LDA, + FLOAT *x, blasint *INCX, + FLOAT *BETA, FLOAT *y, blasint *INCY){ + + char trans = *TRANS; + blasint m = *M; + blasint n = *N; + blasint ku = *KU; + blasint kl = *KL; + blasint lda = *LDA; + blasint incx = *INCX; + blasint incy = *INCY; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + FLOAT alpha_r = ALPHA[0]; + FLOAT alpha_i = ALPHA[1]; + FLOAT beta_r = BETA[0]; + FLOAT beta_i = BETA[1]; + + blasint info; + blasint lenx, leny; + blasint i; + + PRINT_DEBUG_NAME; + + TOUPPER(trans); + + info = 0; + + i = -1; + + if (trans == 'N') i = 0; + if (trans == 'T') i = 1; + if (trans == 'R') i = 2; + if (trans == 'C') i = 3; + if (trans == 'O') i = 4; + if (trans == 'U') i = 5; 
+ if (trans == 'S') i = 6; + if (trans == 'D') i = 7; + + if (incy == 0) info = 13; + if (incx == 0) info = 10; + if (lda < kl + ku + 1) info = 8; + if (kl < 0) info = 5; + if (ku < 0) info = 4; + if (n < 0) info = 3; + if (m < 0) info = 2; + if (i < 0) info = 1; + + trans = i; + + if (info != 0){ + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, + enum CBLAS_TRANSPOSE TransA, + blasint m, blasint n, + blasint ku, blasint kl, + FLOAT *ALPHA, + FLOAT *a, blasint lda, + FLOAT *x, blasint incx, + FLOAT *BETA, + FLOAT *y, blasint incy){ + + FLOAT alpha_r = ALPHA[0]; + FLOAT alpha_i = ALPHA[1]; + FLOAT beta_r = BETA[0]; + FLOAT beta_i = BETA[1]; + + FLOAT *buffer; + blasint lenx, leny; + int trans; + blasint info, t; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_CNAME; + + trans = -1; + info = 0; + + if (order == CblasColMajor) { + if (TransA == CblasNoTrans) trans = 0; + if (TransA == CblasTrans) trans = 1; + if (TransA == CblasConjNoTrans) trans = 2; + if (TransA == CblasConjTrans) trans = 3; + + info = -1; + + if (incy == 0) info = 13; + if (incx == 0) info = 10; + if (lda < kl + ku + 1) info = 8; + if (kl < 0) info = 5; + if (ku < 0) info = 4; + if (n < 0) info = 3; + if (m < 0) info = 2; + if (trans < 0) info = 1; + } + + if (order == CblasRowMajor) { + if (TransA == CblasNoTrans) trans = 1; + if (TransA == CblasTrans) trans = 0; + if (TransA == CblasConjNoTrans) trans = 3; + if (TransA == CblasConjTrans) trans = 2; + + info = -1; + + t = n; + n = m; + m = t; + + t = ku; + ku = kl; + kl = t; + + if (incy == 0) info = 13; + if (incx == 0) info = 10; + if (lda < kl + ku + 1) info = 8; + if (kl < 0) info = 5; + if (ku < 0) info = 4; + if (n < 0) info = 3; + if (m < 0) info = 2; + if (trans < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if ((m==0) || (n==0)) return; + + lenx = n; + leny = m; + if (trans & 1) lenx = m; + if (trans & 1) leny = n; + + if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); + + if (alpha_r == ZERO && alpha_i == ZERO) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0) x -= (lenx - 1) * incx * 2; + if (incy < 0) y -= (leny - 1) * incy * 2; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (gbmv[(int)trans])(m, n, kl, ku, alpha_r, alpha_i, a, lda, x, incx, y, incy, buffer); + +#ifdef SMP + + } else { + + (gbmv_thread[(int)trans])(m, n, kl, ku, ALPHA, a, lda, x, incx, y, incy, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(4, m * n / 2 + n, m * n); + + IDEBUG_END; + + return; +} diff --git a/interface/zgemv.c b/interface/zgemv.c new file mode 100644 index 0000000000..fb4784202a --- /dev/null +++ b/interface/zgemv.c @@ -0,0 +1,259 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
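The "lda < kl + ku + 1" check in zgbmv.c reflects the conventional BLAS band layout these wrappers assume: column j of the band matrix occupies column j of the array, with A(i,j) stored at a[(ku + i - j) + j*lda] for the in-band rows. A small real-valued packing sketch (the complex case additionally interleaves re/im pairs):

#include <stdio.h>

int main(void) {
  enum { M = 4, N = 4, KL = 1, KU = 1, LDA = KL + KU + 1 };
  double ab[LDA * N] = {0};

  for (int j = 0; j < N; j++) {
    int lo = (j - KU > 0     ? j - KU : 0);
    int hi = (j + KL < M - 1 ? j + KL : M - 1);
    for (int i = lo; i <= hi; i++)
      ab[(KU + i - j) + j * LDA] = 10 * (i + 1) + (j + 1);   /* A(i,j) */
  }

  /* Row KU of ab[] now holds the main diagonal: 11 22 33 44. */
  for (int j = 0; j < N; j++) printf("%g ", ab[KU + j * LDA]);
  printf("\n");
  return 0;
}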
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XGEMV " +#elif defined(DOUBLE) +#define ERROR_NAME "ZGEMV " +#else +#define ERROR_NAME "CGEMV " +#endif + +#ifdef SMP +static int (*gemv_thread[])(BLASLONG, BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT * , BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { +#ifdef XDOUBLE + xgemv_thread_n, xgemv_thread_t, xgemv_thread_r, xgemv_thread_c, xgemv_thread_o, xgemv_thread_u, xgemv_thread_s, xgemv_thread_d, +#elif defined DOUBLE + zgemv_thread_n, zgemv_thread_t, zgemv_thread_r, zgemv_thread_c, zgemv_thread_o, zgemv_thread_u, zgemv_thread_s, zgemv_thread_d, +#else + cgemv_thread_n, cgemv_thread_t, cgemv_thread_r, cgemv_thread_c, cgemv_thread_o, cgemv_thread_u, cgemv_thread_s, cgemv_thread_d, +#endif +}; +#endif + +#ifndef CBLAS + +void NAME(char *TRANS, blasint *M, blasint *N, + FLOAT *ALPHA, FLOAT *a, blasint *LDA, + FLOAT *x, blasint *INCX, + FLOAT *BETA, FLOAT *y, blasint *INCY){ + + char trans = *TRANS; + blasint m = *M; + blasint n = *N; + blasint lda = *LDA; + blasint incx = *INCX; + blasint incy = *INCY; + + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, + FLOAT * , BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { + GEMV_N, GEMV_T, GEMV_R, GEMV_C, + GEMV_O, GEMV_U, GEMV_S, GEMV_D, + }; + + blasint info; + blasint lenx, leny; + blasint i; + + PRINT_DEBUG_NAME; + + FLOAT alpha_r = *(ALPHA + 0); + FLOAT alpha_i = *(ALPHA + 1); + + FLOAT beta_r = *(BETA + 0); + FLOAT beta_i = *(BETA + 1); + + TOUPPER(trans); + + info = 0; + + i = -1; + + if (trans == 'N') i = 0; + if (trans == 'T') i = 1; + if (trans == 'R') i = 2; + if (trans == 'C') i = 3; + if (trans == 'O') i = 4; + if (trans == 'U') i = 5; + if (trans == 'S') i = 6; + if (trans == 'D') i = 7; + + if (incy == 0) info = 11; + if (incx == 0) info = 8; + if (lda < MAX(1,m)) info = 6; + if (n < 0) info = 3; + if (m < 0) info = 2; + if (i < 0) info = 1; + + trans = i; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, 
sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, + enum CBLAS_TRANSPOSE TransA, + blasint m, blasint n, + FLOAT *ALPHA, + FLOAT *a, blasint lda, + FLOAT *x, blasint incx, + FLOAT *BETA, + FLOAT *y, blasint incy){ + + FLOAT *buffer; + blasint lenx, leny; + int trans; + blasint info, t; +#ifdef SMP + int nthreads; +#endif + + int (*gemv[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, + FLOAT * , BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { + GEMV_N, GEMV_T, GEMV_R, GEMV_C, + GEMV_O, GEMV_U, GEMV_S, GEMV_D, + }; + + PRINT_DEBUG_CNAME; + + FLOAT alpha_r = *(ALPHA + 0); + FLOAT alpha_i = *(ALPHA + 1); + + FLOAT beta_r = *(BETA + 0); + FLOAT beta_i = *(BETA + 1); + + trans = -1; + info = 0; + + if (order == CblasColMajor) { + if (TransA == CblasNoTrans) trans = 0; + if (TransA == CblasTrans) trans = 1; + if (TransA == CblasConjNoTrans) trans = 2; + if (TransA == CblasConjTrans) trans = 3; + + info = -1; + + if (incy == 0) info = 11; + if (incx == 0) info = 8; + if (lda < MAX(1, m)) info = 6; + if (n < 0) info = 3; + if (m < 0) info = 2; + if (trans < 0) info = 1; + + } + + if (order == CblasRowMajor) { + if (TransA == CblasNoTrans) trans = 1; + if (TransA == CblasTrans) trans = 0; + if (TransA == CblasConjNoTrans) trans = 3; + if (TransA == CblasConjTrans) trans = 2; + + info = -1; + + t = n; + n = m; + m = t; + + if (incy == 0) info = 11; + if (incx == 0) info = 8; + if (lda < MAX(1, m)) info = 6; + if (n < 0) info = 3; + if (m < 0) info = 2; + if (trans < 0) info = 1; + + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + /* Quick return if possible. */ + + if (m == 0 || n == 0) return; + + lenx = n; + leny = m; + + if (trans & 1) lenx = m; + if (trans & 1) leny = n; + + if (beta_r != ONE || beta_i != ZERO) SCAL_K(leny, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); + + if (alpha_r == ZERO && alpha_i == ZERO) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0) x -= (lenx - 1) * incx * 2; + if (incy < 0) y -= (leny - 1) * incy * 2; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (gemv[(int)trans])(m, n, 0, alpha_r, alpha_i, a, lda, x, incx, y, incy, buffer); + +#ifdef SMP + + } else { + + (gemv_thread[(int)trans])(m, n, ALPHA, a, lda, x, incx, y, incy, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(4, m * n + m + n, 2 * m * n); + + IDEBUG_END; + + return; +} diff --git a/interface/zger.c b/interface/zger.c new file mode 100644 index 0000000000..ad52f40bb8 --- /dev/null +++ b/interface/zger.c @@ -0,0 +1,249 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
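A note on the ordering used by zgemv.c above: y is scaled by beta via SCAL_K before the alpha == 0 quick return, so y := beta*y is still honoured when alpha is zero. Below, a minimal real-valued sketch of that ordering together with the untransposed update the GEMV_N kernel then performs; the function name is illustrative only.

#include <stdio.h>

static void gemv_n_ref(int m, int n, double alpha, const double *a, int lda,
                       const double *x, double beta, double *y) {
  for (int i = 0; i < m; i++) y[i] *= beta;          /* y := beta*y        */
  if (alpha == 0.0) return;                          /* quick return       */
  for (int j = 0; j < n; j++)                        /* y += alpha*A*x     */
    for (int i = 0; i < m; i++)
      y[i] += alpha * a[i + j * lda] * x[j];
}

int main(void) {
  double a[4] = {1, 2, 3, 4};          /* column-major 2x2: [1 3; 2 4] */
  double x[2] = {1, 1}, y[2] = {1, 1};
  gemv_n_ref(2, 2, 1.0, a, 2, x, 2.0, y);
  printf("%g %g\n", y[0], y[1]);       /* prints 6 8 */
  return 0;
}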
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#ifndef CONJ +#define ERROR_NAME "XGERU " +#else +#define ERROR_NAME "XGERC " +#endif +#elif defined DOUBLE +#ifndef CONJ +#define ERROR_NAME "ZGERU " +#else +#define ERROR_NAME "ZGERC " +#endif +#else +#ifndef CONJ +#define ERROR_NAME "CGERU " +#else +#define ERROR_NAME "CGERC " +#endif +#endif + +#if defined XDOUBLE +#ifndef CONJ +#define GER GERU_K +#define GER_THREAD xger_thread_U +#else +#define GER GERC_K +#define GER_THREAD xger_thread_C +#define GERV GERV_K +#define GERV_THREAD xger_thread_V +#endif +#elif defined DOUBLE +#ifndef CONJ +#define GER GERU_K +#define GER_THREAD zger_thread_U +#else +#define GER GERC_K +#define GER_THREAD zger_thread_C +#define GERV GERV_K +#define GERV_THREAD zger_thread_V +#endif +#else +#ifndef CONJ +#define GER GERU_K +#define GER_THREAD cger_thread_U +#else +#define GER GERC_K +#define GER_THREAD cger_thread_C +#define GERV GERV_K +#define GERV_THREAD cger_thread_V +#endif +#endif + +#ifndef CBLAS + +void NAME(blasint *M, blasint *N, FLOAT *Alpha, + FLOAT *x, blasint *INCX, + FLOAT *y, blasint *INCY, + FLOAT *a, blasint *LDA){ + + blasint m = *M; + blasint n = *N; + FLOAT alpha_r = Alpha[0]; + FLOAT alpha_i = Alpha[1]; + blasint incx = *INCX; + blasint incy = *INCY; + blasint lda = *LDA; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + blasint info; + + PRINT_DEBUG_NAME; + + info = 0; + + if (lda < MAX(1,m)) info = 9; + if (incy == 0) info = 7; + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (m < 0) info = 1; + + if (info){ + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, + blasint m, blasint n, + FLOAT *Alpha, + FLOAT *x, blasint incx, + FLOAT *y, blasint incy, + FLOAT *a, blasint lda) { + + FLOAT alpha_r = Alpha[0]; + FLOAT alpha_i = Alpha[1]; + + FLOAT *buffer; + blasint info, t; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_CNAME; + + info = 0; + + if (order == CblasColMajor) { + info = -1; + + if (lda < MAX(1,m)) info = 9; + if (incy == 0) info = 7; + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (m < 0) info = 1; + } + + if (order == CblasRowMajor) { + info = -1; + + t = n; + n = m; + m = t; + + t = incx; + incx = incy; + incy = t; + + buffer = x; + x = 
y; + y = buffer; + + if (lda < MAX(1,m)) info = 9; + if (incy == 0) info = 7; + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (m < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + /* Quick return if possible. */ + if (m == 0 || n == 0) return; + + if ((alpha_r == 0.) && (alpha_i == 0.)) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incy < 0) y -= (n - 1) * incy * 2; + if (incx < 0) x -= (m - 1) * incx * 2; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + +#if !defined(CBLAS) || !defined(CONJ) + GER(m, n, 0, alpha_r, alpha_i, x, incx, y, incy, a, lda, buffer); +#else + if (order == CblasColMajor) { + GER(m, n, 0, alpha_r, alpha_i, x, incx, y, incy, a, lda, buffer); + } else { + GERV(m, n, 0, alpha_r, alpha_i, x, incx, y, incy, a, lda, buffer); + } +#endif + +#ifdef SMP + + } else { + +#if !defined(CBLAS) || !defined(CONJ) + GER_THREAD(m, n, Alpha, x, incx, y, incy, a, lda, buffer, nthreads); +#else + if (order == CblasColMajor) { + GER_THREAD(m, n, Alpha, x, incx, y, incy, a, lda, buffer, nthreads); + } else { + GERV_THREAD(m, n, Alpha, x, incx, y, incy, a, lda, buffer, nthreads); + } +#endif + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(4, m * n + m + n, 2 * m * n); + + IDEBUG_END; + + return; + +} diff --git a/interface/zgetf2.c b/interface/zgetf2.c new file mode 100644 index 0000000000..950ef46e9e --- /dev/null +++ b/interface/zgetf2.c @@ -0,0 +1,109 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
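The row-major branch of the ZGER wrapper above relies on the identity (x * conj(y)^T)^T = conj(y) * x^T: a row-major buffer read column-major is the transpose, so the conjugated update can be done by swapping the two vectors and conjugating the first one instead of the second, which is what the dispatch to the GERV kernel does. A small CBLAS caller that exercises that branch, assuming the cblas.h shipped with this import is on the include path:

    #include <stdio.h>
    #include "cblas.h"   /* header shipped with this import */

    int main(void) {
      /* A is 2x2, row-major, interleaved (re,im), initially zero */
      double A[8] = {0};
      double x[4] = {0, 1,  1, 0};   /* x = (i, 1) */
      double y[4] = {1, 0,  0, 1};   /* y = (1, i) */
      double alpha[2] = {1, 0};

      /* A := A + alpha * x * conj(y)^T, taking the row-major branch above */
      cblas_zgerc(CblasRowMajor, 2, 2, alpha, x, 1, y, 1, A, 2);

      /* Expect A = [ i  1 ; 1  -i ] */
      for (int i = 0; i < 2; i++)
        printf("%g%+gi  %g%+gi\n", A[4*i], A[4*i+1], A[4*i+2], A[4*i+3]);
      return 0;
    }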
*/ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XGETF2" +#elif defined(DOUBLE) +#define ERROR_NAME "ZGETF2" +#else +#define ERROR_NAME "CGETF2" +#endif + +int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint *Info){ + + blas_arg_t args; + + blasint info; + FLOAT *buffer; +#ifdef PPC440 + extern +#endif + FLOAT *sa, *sb; + + PRINT_DEBUG_NAME; + + args.m = *M; + args.n = *N; + args.a = (void *)a; + args.lda = *ldA; + args.c = (void *)ipiv; + + info = 0; + if (args.lda < MAX(1,args.m)) info = 4; + if (args.n < 0) info = 2; + if (args.m < 0) info = 1; + if (info) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + *Info = - info; + return 0; + } + + *Info = 0; + if (args.m == 0 || args.n == 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + +#ifndef PPC440 + buffer = (FLOAT *)blas_memory_alloc(1); + + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); + sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif + + info = GETF2(&args, NULL, NULL, sa, sb, 0); + + *Info = info; + +#ifndef PPC440 + blas_memory_free(buffer); +#endif + + FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, args.m * args.n, 2. / 3. * args.m * args.n * args.n); + + IDEBUG_END; + + return 0; +} diff --git a/interface/zgetrf.c b/interface/zgetrf.c new file mode 100644 index 0000000000..9f041d9bdb --- /dev/null +++ b/interface/zgetrf.c @@ -0,0 +1,122 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
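The unblocked factorization above carves its two work panels out of a single blas_memory_alloc() block: sa starts at GEMM_OFFSET_A and sb follows the A panel, rounded up to an alignment boundary. A sketch of that rounding arithmetic with made-up numbers; the real GEMM_P, GEMM_Q, SIZE and GEMM_ALIGN values come from the per-architecture parameter headers, and the idiom assumes the mask has the form 2^k - 1:

    #include <stdio.h>

    /* Round-up idiom used by the sa/sb split above: with a mask of the
       form 2^k - 1, (size + mask) & ~mask rounds size up to the next
       multiple of 2^k.  All numbers below are hypothetical. */
    int main(void) {
      unsigned long align_mask = 0x3fffUL;            /* hypothetical 16 KB - 1 mask   */
      unsigned long panel_a    = 200UL * 200 * 2 * 8; /* hypothetical P*Q*COMPSIZE*SIZE */

      unsigned long rounded = (panel_a + align_mask) & ~align_mask;

      printf("A panel bytes: %lu\n", panel_a);
      printf("rounded up   : %lu (multiple of %lu)\n", rounded, align_mask + 1);
      return 0;
    }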
*/ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XGETRF" +#elif defined(DOUBLE) +#define ERROR_NAME "ZGETRF" +#else +#define ERROR_NAME "CGETRF" +#endif + +int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint *Info){ + + blas_arg_t args; + + blasint info; + FLOAT *buffer; +#ifdef PPC440 + extern +#endif + FLOAT *sa, *sb; + + PRINT_DEBUG_NAME; + + args.m = *M; + args.n = *N; + args.a = (void *)a; + args.lda = *ldA; + args.c = (void *)ipiv; + + info = 0; + if (args.lda < MAX(1,args.m)) info = 4; + if (args.n < 0) info = 2; + if (args.m < 0) info = 1; + if (info) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + *Info = - info; + return 0; + } + + *Info = 0; + if (args.m == 0 || args.n == 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + +#ifndef PPC440 + buffer = (FLOAT *)blas_memory_alloc(1); + + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); + sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif + +#ifdef SMP + args.common = NULL; + args.nthreads = num_cpu_avail(4); + + if (args.nthreads == 1) { +#endif + + *Info = GETRF_SINGLE(&args, NULL, NULL, sa, sb, 0); + +#ifdef SMP + } else { + + *Info = GETRF_PARALLEL(&args, NULL, NULL, sa, sb, 0); + + } +#endif + +#ifndef PPC440 + blas_memory_free(buffer); +#endif + + FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, args.m * args.n, 2. / 3. * args.m * args.n * args.n); + + IDEBUG_END; + + return 0; +} diff --git a/interface/zgetrs.c b/interface/zgetrs.c new file mode 100644 index 0000000000..81d50e34f3 --- /dev/null +++ b/interface/zgetrs.c @@ -0,0 +1,153 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
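Together with the ZGETRS wrapper that follows, the ZGETRF interface above is enough to solve a complex linear system from C. A minimal sketch; the trailing-underscore symbol names and the use of int for blasint are assumptions about the build:

    #include <stdio.h>

    /* Hypothetical prototypes for the Fortran-style entry points defined
       in zgetrf.c above and zgetrs.c just below. */
    extern int zgetrf_(int *m, int *n, double *a, int *lda, int *ipiv, int *info);
    extern int zgetrs_(char *trans, int *n, int *nrhs, double *a, int *lda,
                       int *ipiv, double *b, int *ldb, int *info);

    int main(void) {
      /* Column-major 2x2 complex matrix with zero imaginary parts:
           A = [ 3 1 ]
               [ 1 2 ]                                               */
      double a[8] = {3,0, 1,0,   /* column 0 */
                     1,0, 2,0};  /* column 1 */
      double b[4] = {5,0, 5,0};  /* right-hand side */
      int n = 2, nrhs = 1, lda = 2, ldb = 2, ipiv[2], info;
      char trans = 'N';

      zgetrf_(&n, &n, a, &lda, ipiv, &info);   /* LU factorization with pivoting */
      if (info != 0) { printf("zgetrf info = %d\n", info); return 1; }

      zgetrs_(&trans, &n, &nrhs, a, &lda, ipiv, b, &ldb, &info);

      /* Expect x = (1, 2) */
      printf("x = (%g%+gi, %g%+gi)\n", b[0], b[1], b[2], b[3]);
      return 0;
    }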
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XGETRS" +#elif defined(DOUBLE) +#define ERROR_NAME "ZGETRS" +#else +#define ERROR_NAME "CGETRS" +#endif + +static blasint (*getrs_single[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) ={ + GETRS_N_SINGLE, GETRS_T_SINGLE, GETRS_R_SINGLE, GETRS_C_SINGLE, +}; + +#ifdef SMP +static blasint (*getrs_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) ={ + GETRS_N_PARALLEL, GETRS_T_PARALLEL, GETRS_R_PARALLEL, GETRS_C_PARALLEL, +}; +#endif + +int NAME(char *TRANS, blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA, + blasint *ipiv, FLOAT *b, blasint *ldB, blasint *Info){ + + char trans_arg = *TRANS; + + blas_arg_t args; + + blasint info; + int trans; + FLOAT *buffer; +#ifdef PPC440 + extern +#endif + FLOAT *sa, *sb; + + PRINT_DEBUG_NAME; + + args.m = *N; + args.n = *NRHS; + args.a = (void *)a; + args.lda = *ldA; + args.b = (void *)b; + args.ldb = *ldB; + args.c = (void *)ipiv; + + info = 0; + + TOUPPER(trans_arg); + trans = -1; + + if (trans_arg == 'N') trans = 0; + if (trans_arg == 'T') trans = 1; + if (trans_arg == 'R') trans = 2; + if (trans_arg == 'C') trans = 3; + + if (args.ldb < MAX(1, args.m)) info = 8; + if (args.lda < MAX(1, args.m)) info = 5; + if (args.n < 0) info = 3; + if (args.m < 0) info = 2; + if (trans < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return 0; + } + + args.alpha = NULL; + args.beta = NULL; + + *Info = info; + + if (args.m == 0 || args.n == 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + +#ifndef PPC440 + buffer = (FLOAT *)blas_memory_alloc(1); + + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); + sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif + +#ifdef SMP + args.nthreads = num_cpu_avail(4); + + if (args.nthreads == 1) { +#endif + + (getrs_single[trans])(&args, NULL, NULL, sa, sb, 0); + +#ifdef SMP + } else { + + (getrs_parallel[trans])(&args, NULL, NULL, sa, sb, 0); + + } +#endif + +#ifndef PPC440 + blas_memory_free(buffer); +#endif + + FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, args.m * args.n, 2 * args.m * args.m * args.n); + + IDEBUG_END; + + return 0; + +} diff --git a/interface/zhbmv.c b/interface/zhbmv.c new file mode 100644 index 0000000000..c14ad98595 --- /dev/null +++ b/interface/zhbmv.c @@ -0,0 +1,223 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XHBMV " +#elif defined(DOUBLE) +#define ERROR_NAME "ZHBMV " +#else +#define ERROR_NAME "CHBMV " +#endif + +static int (*hbmv[])(BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, void *) = { +#ifdef XDOUBLE + xhbmv_U, xhbmv_L, xhbmv_V, xhbmv_M, +#elif defined(DOUBLE) + zhbmv_U, zhbmv_L, zhbmv_V, zhbmv_M, +#else + chbmv_U, chbmv_L, chbmv_V, chbmv_M, +#endif +}; + +#ifdef SMP +static int (*hbmv_thread[])(BLASLONG, BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { +#ifdef XDOUBLE + xhbmv_thread_U, xhbmv_thread_L, xhbmv_thread_V, xhbmv_thread_M, +#elif defined(DOUBLE) + zhbmv_thread_U, zhbmv_thread_L, zhbmv_thread_V, zhbmv_thread_M, +#else + chbmv_thread_U, chbmv_thread_L, chbmv_thread_V, chbmv_thread_M, +#endif +}; +#endif + +#ifndef CBLAS + +void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint *LDA, + FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY){ + + char uplo_arg = *UPLO; + blasint n = *N; + blasint k = *K; + FLOAT alpha_r = ALPHA[0]; + FLOAT alpha_i = ALPHA[1]; + blasint lda = *LDA; + blasint incx = *INCX; + FLOAT beta_r = BETA[0]; + FLOAT beta_i = BETA[1]; + blasint incy = *INCY; + + blasint info; + int uplo; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + uplo = -1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + if (uplo_arg == 'V') uplo = 2; + if (uplo_arg == 'M') uplo = 3; + + info = 0; + + if (incy == 0) info = 11; + if (incx == 0) info = 8; + if (lda < k + 1) info = 6; + if (k < 0) info = 3; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, + enum CBLAS_UPLO Uplo, + blasint n, blasint k, + FLOAT *ALPHA, + FLOAT *a, blasint lda, + FLOAT *x, blasint incx, + FLOAT *BETA, + FLOAT *y, blasint incy){ + + FLOAT 
alpha_r = ALPHA[0]; + FLOAT alpha_i = ALPHA[1]; + FLOAT beta_r = BETA[0]; + FLOAT beta_i = BETA[1]; + FLOAT *buffer; + int uplo; + blasint info; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_CNAME; + + uplo = -1; + info = 0; + + if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + info = -1; + + if (incy == 0) info = 11; + if (incx == 0) info = 8; + if (lda < k + 1) info = 6; + if (k < 0) info = 3; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (order == CblasRowMajor) { + if (Uplo == CblasUpper) uplo = 3; + if (Uplo == CblasLower) uplo = 2; + + info = -1; + + if (incy == 0) info = 11; + if (incx == 0) info = 8; + if (lda < k + 1) info = 6; + if (k < 0) info = 3; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (n == 0) return; + + if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); + + if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx * COMPSIZE; + if (incy < 0 ) y -= (n - 1) * incy * COMPSIZE; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (hbmv[uplo])(n, k, alpha_r, alpha_i, a, lda, x, incx, y, incy, buffer); + +#ifdef SMP + } else { + + (hbmv_thread[uplo])(n, k, ALPHA, a, lda, x, incx, y, incy, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(4, n * k / 2 + n, n * k); + + IDEBUG_END; + + return; +} diff --git a/interface/zhemv.c b/interface/zhemv.c new file mode 100644 index 0000000000..3cba445c22 --- /dev/null +++ b/interface/zhemv.c @@ -0,0 +1,215 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
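The ZHBMV wrapper above expects LAPACK-style Hermitian band storage, which is why its only leading-dimension check is lda < k + 1. A small caller for a matrix with one superdiagonal; the zhbmv_ symbol name and int-for-blasint are build assumptions:

    #include <stdio.h>

    /* Hypothetical prototype for the Fortran-style entry defined above. */
    extern void zhbmv_(char *uplo, int *n, int *k, double *alpha,
                       double *ab, int *lda, double *x, int *incx,
                       double *beta, double *y, int *incy);

    int main(void) {
      /* Hermitian 2x2 example:
           A = [ 2    i ]
               [ -i   3 ]
         Upper band storage with k = 1 superdiagonal, lda = k + 1:
         column j holds A(j-k..j, j) with the diagonal in the last row. */
      double ab[8] = {0,0, 2,0,   /* column 0: padding, A(0,0)       */
                      0,1, 3,0};  /* column 1: A(0,1) = i, A(1,1)    */
      double x[4] = {1,0, 1,0};
      double y[4] = {0,0, 0,0};
      double alpha[2] = {1,0}, beta[2] = {0,0};
      int n = 2, k = 1, lda = 2, incx = 1, incy = 1;
      char uplo = 'U';

      zhbmv_(&uplo, &n, &k, alpha, ab, &lda, x, &incx, beta, y, &incy);

      /* Expect y = A*x = (2+i, 3-i) */
      printf("y = (%g%+gi, %g%+gi)\n", y[0], y[1], y[2], y[3]);
      return 0;
    }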
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XHEMV " +#elif defined(DOUBLE) +#define ERROR_NAME "ZHEMV " +#else +#define ERROR_NAME "CHEMV " +#endif + +#ifndef CBLAS + +void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA, + FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY){ + + char uplo_arg = *UPLO; + blasint n = *N; + FLOAT alpha_r = ALPHA[0]; + FLOAT alpha_i = ALPHA[1]; + blasint lda = *LDA; + blasint incx = *INCX; + FLOAT beta_r = BETA[0]; + FLOAT beta_i = BETA[1]; + blasint incy = *INCY; +#ifdef SMP + int nthreads; +#endif + + int (*hemv[])(BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { + HEMV_U, HEMV_L, HEMV_V, HEMV_M, + }; + +#ifdef SMP + int (*hemv_thread[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { + HEMV_THREAD_U, HEMV_THREAD_L, HEMV_THREAD_V, HEMV_THREAD_M, + }; +#endif + + blasint info; + int uplo; + FLOAT *buffer; + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + uplo = -1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + if (uplo_arg == 'V') uplo = 2; + if (uplo_arg == 'M') uplo = 3; + + info = 0; + + if (incy == 0) info = 10; + if (incx == 0) info = 7; + if (lda < MAX(1, n)) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA, + FLOAT *a, blasint lda, FLOAT *x, blasint incx, FLOAT *BETA, FLOAT *y, blasint incy) { + + FLOAT alpha_r = ALPHA[0]; + FLOAT alpha_i = ALPHA[1]; + FLOAT beta_r = BETA[0]; + FLOAT beta_i = BETA[1]; + + FLOAT *buffer; + int trans, uplo; + blasint info; +#ifdef SMP + int nthreads; +#endif + + int (*hemv[])(BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { + HEMV_U, HEMV_L, HEMV_V, HEMV_M, + }; + +#ifdef SMP + int (*hemv_thread[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { + HEMV_THREAD_U, HEMV_THREAD_L, HEMV_THREAD_V, HEMV_THREAD_M, + }; +#endif + + PRINT_DEBUG_CNAME; + + trans = -1; + uplo = -1; + info = 0; + + if (order == CblasColMajor) { + + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + info = -1; + + if (incy == 0) info = 10; + if (incx == 0) info = 7; + if (lda < MAX(1, n)) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (order == CblasRowMajor) { + + if (Uplo == CblasUpper) uplo = 3; + if (Uplo == CblasLower) uplo = 2; + + info = -1; + + if (incy == 0) info = 10; + if (incx == 0) info = 7; + if (lda < MAX(1, n)) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (n == 0) return; + + if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); + + if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 
) x -= (n - 1) * incx * 2; + if (incy < 0 ) y -= (n - 1) * incy * 2; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (hemv[uplo])(n, n, alpha_r, alpha_i, a, lda, x, incx, y, incy, buffer); + +#ifdef SMP + } else { + + (hemv_thread[uplo])(n, ALPHA, a, lda, x, incx, y, incy, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(4, n * n / 2 + n, 2 * n * n); + + IDEBUG_END; + + return; +} diff --git a/interface/zher.c b/interface/zher.c new file mode 100644 index 0000000000..ad982dd685 --- /dev/null +++ b/interface/zher.c @@ -0,0 +1,200 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
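For the ZHEMV wrapper above, the row-major CBLAS path maps CblasUpper to the conjugating 'M' kernel and CblasLower to 'V': reading a row-major Hermitian matrix as column-major yields its transpose, i.e. its conjugate, so the kernels that conjugate the stored triangle are used. A short caller that takes that path, again assuming the cblas.h from this import:

    #include <stdio.h>
    #include "cblas.h"   /* header shipped with this import */

    int main(void) {
      /* Hermitian A = [ 2   i ]
                       [ -i  3 ], row-major; only the upper triangle is read. */
      double A[8] = {2,0,   0,1,    /* row 0: 2, i         */
                     -9,-9, 3,0};   /* row 1: (ignored), 3 */
      double x[4] = {1,0, 1,0};
      double y[4] = {0,0, 0,0};
      double alpha[2] = {1,0}, beta[2] = {0,0};

      cblas_zhemv(CblasRowMajor, CblasUpper, 2, alpha, A, 2, x, 1, beta, y, 1);

      /* Expect y = A*x = (2+i, 3-i) */
      printf("y = (%g%+gi, %g%+gi)\n", y[0], y[1], y[2], y[3]);
      return 0;
    }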
*/ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XHER " +#elif defined(DOUBLE) +#define ERROR_NAME "ZHER " +#else +#define ERROR_NAME "CHER " +#endif + +static int (*her[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { +#ifdef XDOUBLE + xher_U, xher_L, xher_V, xher_M, +#elif defined(DOUBLE) + zher_U, zher_L, zher_V, zher_M, +#else + cher_U, cher_L, cher_V, cher_M, +#endif +}; + +#ifdef SMP +static int (*her_thread[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { +#ifdef XDOUBLE + xher_thread_U, xher_thread_L, xher_thread_V, xher_thread_M, +#elif defined(DOUBLE) + zher_thread_U, zher_thread_L, zher_thread_V, zher_thread_M, +#else + cher_thread_U, cher_thread_L, cher_thread_V, cher_thread_M, +#endif +}; +#endif + +#ifndef CBLAS + +void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, + FLOAT *x, blasint *INCX, FLOAT *a, blasint *LDA){ + + char uplo_arg = *UPLO; + blasint n = *N; + FLOAT alpha = *ALPHA; + blasint lda = *LDA; + blasint incx = *INCX; + + blasint info; + int uplo; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + uplo = -1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (lda < MAX(1, n)) info = 7; + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *a, blasint lda) { + + FLOAT *buffer; + int trans, uplo; + blasint info; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_CNAME; + + trans = -1; + uplo = -1; + info = 0; + + if (order == CblasColMajor) { + + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + info = -1; + + if (lda < MAX(1, n)) info = 7; + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + + } + + if (order == CblasRowMajor) { + + if (Uplo == CblasUpper) uplo = 3; + if (Uplo == CblasLower) uplo = 2; + + info = -1; + + if (lda < MAX(1, n)) info = 7; + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (n == 0) return; + + if (alpha == ZERO) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx * 2; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (her[uplo])(n, alpha, x, incx, a, lda, buffer); + +#ifdef SMP + } else { + + (her_thread[uplo])(n, alpha, x, incx, a, lda, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(4, n * n / 2 + n, n * n); + + IDEBUG_END; + + return; +} diff --git a/interface/zher2.c b/interface/zher2.c new file mode 100644 index 0000000000..88fececf73 --- /dev/null +++ b/interface/zher2.c @@ -0,0 +1,207 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XHER2 " +#elif defined(DOUBLE) +#define ERROR_NAME "ZHER2 " +#else +#define ERROR_NAME "CHER2 " +#endif + +static int (*her2[])(BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { +#ifdef XDOUBLE + xher2_U, xher2_L, xher2_V, xher2_M, +#elif defined(DOUBLE) + zher2_U, zher2_L, zher2_V, zher2_M, +#else + cher2_U, cher2_L, cher2_V, cher2_M, +#endif +}; + +#ifdef SMP +static int (*her2_thread[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { +#ifdef XDOUBLE + xher2_thread_U, xher2_thread_L, xher2_thread_V, xher2_thread_M, +#elif defined(DOUBLE) + zher2_thread_U, zher2_thread_L, zher2_thread_V, zher2_thread_M, +#else + cher2_thread_U, cher2_thread_L, cher2_thread_V, cher2_thread_M, +#endif +}; +#endif + +#ifndef CBLAS + +void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, + FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *a, blasint *LDA){ + + char uplo_arg = *UPLO; + blasint n = *N; + FLOAT alpha_r = ALPHA[0]; + FLOAT alpha_i = ALPHA[1]; + blasint lda = *LDA; + blasint incx = *INCX; + blasint incy = *INCY; + + blasint info; + int uplo; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + uplo = -1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (lda < MAX(1, n)) info = 9; + if (incy == 0) info = 7; + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint incy, FLOAT *a, blasint lda) { + + FLOAT alpha_r = ALPHA[0]; + FLOAT alpha_i = ALPHA[1]; + FLOAT *buffer; + int trans, uplo; 
+ blasint info; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_CNAME; + + trans = -1; + uplo = -1; + info = 0; + + if (order == CblasColMajor) { + + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + info = -1; + + if (lda < MAX(1, n)) info = 9; + if (incy == 0) info = 7; + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (order == CblasRowMajor) { + + if (Uplo == CblasUpper) uplo = 3; + if (Uplo == CblasLower) uplo = 2; + + info = -1; + + if (lda < MAX(1, n)) info = 9; + if (incx == 0) info = 7; + if (incy == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (n == 0) return; + + if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx * 2; + if (incy < 0 ) y -= (n - 1) * incy * 2; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (her2[uplo])(n, alpha_r, alpha_i, x, incx, y, incy, a, lda, buffer); + +#ifdef SMP + } else { + + (her2_thread[uplo])(n, ALPHA, x, incx, y, incy, a, lda, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(4, n * n / 2 + 2 * n, 2 * n * n); + + IDEBUG_END; + + return; +} diff --git a/interface/zhpmv.c b/interface/zhpmv.c new file mode 100644 index 0000000000..d7013e6685 --- /dev/null +++ b/interface/zhpmv.c @@ -0,0 +1,213 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
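The HER and HER2 wrappers above take a real alpha for the rank-1 update and a complex alpha for the rank-2 one, and only touch the selected triangle. A minimal rank-1 example against the Fortran-style entry from zher.c; the zher_ symbol name and int-for-blasint are build assumptions:

    #include <stdio.h>

    /* Hypothetical prototype for the Fortran-style entry in zher.c above;
       note that alpha is a single real value for HER, unlike HER2/GERC. */
    extern void zher_(char *uplo, int *n, double *alpha,
                      double *x, int *incx, double *a, int *lda);

    int main(void) {
      double a[8] = {0};           /* 2x2 complex, column-major, zeroed */
      double x[4] = {1,0, 0,1};    /* x = (1, i) */
      double alpha = 1.0;
      int n = 2, incx = 1, lda = 2;
      char uplo = 'L';

      /* A := A + alpha * x * conj(x)^T, lower triangle only */
      zher_(&uplo, &n, &alpha, x, &incx, a, &lda);

      /* Expect A(0,0)=1, A(1,0)=i, A(1,1)=1; A(0,1) is left untouched */
      printf("A(0,0)=%g%+gi  A(1,0)=%g%+gi  A(1,1)=%g%+gi\n",
             a[0], a[1], a[2], a[3], a[6], a[7]);
      return 0;
    }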
*/ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XHPMV " +#elif defined(DOUBLE) +#define ERROR_NAME "ZHPMV " +#else +#define ERROR_NAME "CHPMV " +#endif + +static int (*hpmv[])(BLASLONG, FLOAT, FLOAT, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, void *) = { +#ifdef XDOUBLE + xhpmv_U, xhpmv_L, xhpmv_V, xhpmv_M, +#elif defined(DOUBLE) + zhpmv_U, zhpmv_L, zhpmv_V, zhpmv_M, +#else + chpmv_U, chpmv_L, chpmv_V, chpmv_M, +#endif +}; + +#ifdef SMP +static int (*hpmv_thread[])(BLASLONG, FLOAT *, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { +#ifdef XDOUBLE + xhpmv_thread_U, xhpmv_thread_L, xhpmv_thread_V, xhpmv_thread_M, +#elif defined(DOUBLE) + zhpmv_thread_U, zhpmv_thread_L, zhpmv_thread_V, zhpmv_thread_M, +#else + chpmv_thread_U, chpmv_thread_L, chpmv_thread_V, chpmv_thread_M, +#endif +}; +#endif + +#ifndef CBLAS + +void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, + FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY){ + + char uplo_arg = *UPLO; + blasint n = *N; + FLOAT alpha_r = ALPHA[0]; + FLOAT alpha_i = ALPHA[1]; + blasint incx = *INCX; + FLOAT beta_r = BETA[0]; + FLOAT beta_i = BETA[1]; + blasint incy = *INCY; + + blasint info; + int uplo; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + uplo = -1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (incy == 0) info = 9; + if (incx == 0) info = 6; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, + enum CBLAS_UPLO Uplo, + blasint n, + FLOAT *ALPHA, + FLOAT *a, + FLOAT *x, blasint incx, + FLOAT *BETA, + FLOAT *y, blasint incy){ + + FLOAT alpha_r = ALPHA[0]; + FLOAT alpha_i = ALPHA[1]; + FLOAT beta_r = BETA[0]; + FLOAT beta_i = BETA[1]; + FLOAT *buffer; + int uplo; + blasint info; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_CNAME; + + uplo = -1; + info = 0; + + if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + info = -1; + + if (incy == 0) info = 9; + if (incx == 0) info = 6; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (order == CblasRowMajor) { + if (Uplo == CblasUpper) uplo = 3; + if (Uplo == CblasLower) uplo = 2; + + info = -1; + + if (incy == 0) info = 9; + if (incx == 0) info = 6; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (n == 0) return; + + if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, y, abs(incy), NULL, 0, NULL, 0); + + if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx * 2; + if (incy < 0 ) y -= (n - 1) * incy * 2; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (hpmv[uplo])(n, alpha_r, alpha_i, a, x, incx, y, incy, buffer); + +#ifdef SMP + } else { + + (hpmv_thread[uplo])(n, ALPHA, a, x, incx, y, incy, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(4, n * n / 2 + n, n * n); + + IDEBUG_END; + + return; +} diff --git a/interface/zhpr.c b/interface/zhpr.c new file mode 100644 
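The ZHPMV wrapper above works on packed Hermitian storage: there is no lda argument, and the columns of the selected triangle are stored back to back in a single array. A small caller; the zhpmv_ symbol name and int-for-blasint are build assumptions:

    #include <stdio.h>

    /* Hypothetical prototype for the Fortran-style entry defined above. */
    extern void zhpmv_(char *uplo, int *n, double *alpha, double *ap,
                       double *x, int *incx, double *beta, double *y, int *incy);

    int main(void) {
      /* Packed upper storage of the Hermitian matrix
           A = [ 2   i ]
               [ -i  3 ]
         AP holds the upper-triangle columns back to back:
         A(0,0), A(0,1), A(1,1). */
      double ap[6] = {2,0,  0,1,  3,0};
      double x[4]  = {1,0, 1,0};
      double y[4]  = {0,0, 0,0};
      double alpha[2] = {1,0}, beta[2] = {0,0};
      int n = 2, incx = 1, incy = 1;
      char uplo = 'U';

      zhpmv_(&uplo, &n, alpha, ap, x, &incx, beta, y, &incy);

      /* Expect y = A*x = (2+i, 3-i) */
      printf("y = (%g%+gi, %g%+gi)\n", y[0], y[1], y[2], y[3]);
      return 0;
    }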
index 0000000000..c48e352383 --- /dev/null +++ b/interface/zhpr.c @@ -0,0 +1,198 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XHPR " +#elif defined(DOUBLE) +#define ERROR_NAME "ZHPR " +#else +#define ERROR_NAME "CHPR " +#endif + +static int (*hpr[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, FLOAT *) = { +#ifdef XDOUBLE + xhpr_U, xhpr_L, xhpr_V, xhpr_M, +#elif defined(DOUBLE) + zhpr_U, zhpr_L, zhpr_V, zhpr_M, +#else + chpr_U, chpr_L, chpr_V, chpr_M, +#endif +}; + +#ifdef SMP +static int (*hpr_thread[])(BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, FLOAT *, int) = { +#ifdef XDOUBLE + xhpr_thread_U, xhpr_thread_L, xhpr_thread_V, xhpr_thread_M, +#elif defined(DOUBLE) + zhpr_thread_U, zhpr_thread_L, zhpr_thread_V, zhpr_thread_M, +#else + chpr_thread_U, chpr_thread_L, chpr_thread_V, chpr_thread_M, +#endif +}; +#endif + +#ifndef CBLAS + +void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, + FLOAT *x, blasint *INCX, FLOAT *a){ + + char uplo_arg = *UPLO; + blasint n = *N; + FLOAT alpha = *ALPHA; + blasint incx = *INCX; + + blasint info; + int uplo; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + uplo = -1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, + enum CBLAS_UPLO Uplo, + blasint n, + FLOAT alpha, + FLOAT *x, blasint incx, + FLOAT *a) { + + FLOAT *buffer; + int uplo; + blasint info; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_CNAME; + + uplo = -1; + info = 0; + + if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + info = -1; + + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (order == CblasRowMajor) { + if (Uplo == CblasUpper) uplo = 3; + if (Uplo == CblasLower) uplo = 2; + + info = -1; + + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (n == 0) return; + + if (alpha == ZERO) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx * 2; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (hpr[uplo])(n, alpha, x, incx, a, buffer); + +#ifdef SMP + + } else { + + (hpr_thread[uplo])(n, alpha, x, incx, a, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(4, n * n / 2 + n, n * n); + + IDEBUG_END; + + return; +} diff --git a/interface/zhpr2.c b/interface/zhpr2.c new file mode 100644 index 0000000000..cf1d5f9fc3 --- /dev/null +++ b/interface/zhpr2.c @@ -0,0 +1,207 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XHPR2 " +#elif defined(DOUBLE) +#define ERROR_NAME "ZHPR2 " +#else +#define ERROR_NAME "CHPR2 " +#endif + +static int (*hpr2[])(BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, FLOAT *) = { +#ifdef XDOUBLE + xhpr2_U, xhpr2_L, xhpr2_V, xhpr2_M, +#elif defined(DOUBLE) + zhpr2_U, zhpr2_L, zhpr2_V, zhpr2_M, +#else + chpr2_U, chpr2_L, chpr2_V, chpr2_M, +#endif +}; + +#ifdef SMP +static int (*hpr2_thread[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, FLOAT *, int) = { +#ifdef XDOUBLE + xhpr2_thread_U, xhpr2_thread_L, xhpr2_thread_V, xhpr2_thread_M, +#elif defined(DOUBLE) + zhpr2_thread_U, zhpr2_thread_L, zhpr2_thread_V, zhpr2_thread_M, +#else + chpr2_thread_U, chpr2_thread_L, chpr2_thread_V, chpr2_thread_M, +#endif +}; +#endif + +#ifndef CBLAS + +void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, + FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *a){ + + char uplo_arg = *UPLO; + blasint n = *N; + FLOAT alpha_r = ALPHA[0]; + FLOAT alpha_i = ALPHA[1]; + blasint incx = *INCX; + blasint incy = *INCY; + + blasint info; + int uplo; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + uplo = -1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (incy == 0) info = 7; + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, + enum CBLAS_UPLO Uplo, + blasint n, + FLOAT *ALPHA, + FLOAT *x, blasint incx, + FLOAT *y, blasint incy, + FLOAT *a) { + + FLOAT alpha_r = ALPHA[0]; + FLOAT alpha_i = ALPHA[1]; + FLOAT *buffer; + int uplo; + blasint info; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_CNAME; + + uplo = -1; + info = 0; + + if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + info = -1; + + if (incy == 0) info = 7; 
+ if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (order == CblasRowMajor) { + if (Uplo == CblasUpper) uplo = 3; + if (Uplo == CblasLower) uplo = 2; + + info = -1; + + if (incx == 0) info = 7; + if (incy == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (n == 0) return; + + if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; + + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx * 2; + if (incy < 0 ) y -= (n - 1) * incy * 2; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (hpr2[uplo])(n, alpha_r, alpha_i, x, incx, y, incy, a, buffer); + +#ifdef SMP + } else { + + (hpr2_thread[uplo])(n, ALPHA, x, incx, y, incy, a, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(4, n * n / 2 + 2 * n, 2 * n * n); + + IDEBUG_END; + + return; +} diff --git a/interface/zlaswp.c b/interface/zlaswp.c new file mode 100644 index 0000000000..85ead2c86a --- /dev/null +++ b/interface/zlaswp.c @@ -0,0 +1,108 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
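Every wrapper in this group rewinds its vector pointers with the same "x -= (n - 1) * incx * 2" pattern before calling a kernel. For incx < 0 that subtraction of a negative quantity moves the base pointer forward to the last stored element, which is where BLAS defines the first logical element to live; the factor 2 converts complex elements to doubles. A standalone sketch of the arithmetic:

    #include <stdio.h>

    /* For incx < 0, BLAS stores logical element i of an n-element vector
       at X + (n - 1 - i) * |incx| complex elements.  The wrappers above
       therefore shift the base pointer once and let the kernels index
       with the (negative) stride. */
    int main(void) {
      double x[6] = {1,10, 2,20, 3,30};   /* three complex elements */
      int n = 3, incx = -1;

      double *base = x;
      base -= (n - 1) * incx * 2;         /* now points at the last stored element */

      for (int i = 0; i < n; i++) {
        double *e = base + i * incx * 2;  /* logical element i under BLAS rules */
        printf("element %d = (%g, %g)\n", i, e[0], e[1]);
      }
      return 0;
    }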
*/ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +static int (*laswp[])(BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, blasint *, BLASLONG) = { +#ifdef XDOUBLE + xlaswp_plus, xlaswp_minus, +#elif defined(DOUBLE) + zlaswp_plus, zlaswp_minus, +#else + claswp_plus, claswp_minus, +#endif +}; + +int NAME(blasint *N, FLOAT *a, blasint *LDA, blasint *K1, blasint *K2, blasint *ipiv, blasint *INCX){ + + blasint n = *N; + blasint lda = *LDA; + blasint k1 = *K1; + blasint k2 = *K2; + blasint incx = *INCX; + int flag; + +#ifdef SMP + int mode; + FLOAT dummyalpha[2] = {ZERO, ZERO}; + int nthreads; +#endif + + PRINT_DEBUG_NAME; + + if (incx == 0 || n <= 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + flag = (incx < 0); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (laswp[flag])(n, k1, k2, ZERO, ZERO, a, lda, NULL, 0, ipiv, incx); + +#ifdef SMP + } else { + +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif + + blas_level1_thread(mode, n, k1, k2, dummyalpha, a, lda, NULL, 0, ipiv, incx, laswp[flag], nthreads); + } +#endif + + FUNCTION_PROFILE_END(COMPSIZE, n * (k2 - k1), 0); + + IDEBUG_END; + + return 0; +} diff --git a/interface/zlauu2.c b/interface/zlauu2.c new file mode 100644 index 0000000000..05603fe1b3 --- /dev/null +++ b/interface/zlauu2.c @@ -0,0 +1,129 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
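The ZLASWP wrapper above applies the row interchanges recorded in ipiv by ZGETRF, choosing the plus or minus kernel from the sign of incx and handing the work to blas_level1_thread when more than one CPU is available. A plain reference loop for the incx = 1 case; this describes the operation and is not the optimized kernel:

    #include <stdio.h>

    /* Reference description of LASWP (incx = 1): for rows k1..k2, swap
       row i of the n-column complex matrix A with row ipiv[i].  Indices
       are 1-based to match the Fortran interface above. */
    static void laswp_ref(int n, double *a, int lda, int k1, int k2,
                          const int *ipiv) {
      for (int i = k1; i <= k2; i++) {
        int p = ipiv[i - 1];
        if (p != i) {
          for (int j = 0; j < n; j++) {           /* swap complex rows i and p */
            double *ri = a + 2 * ((i - 1) + j * lda);
            double *rp = a + 2 * ((p - 1) + j * lda);
            double tr = ri[0], ti = ri[1];
            ri[0] = rp[0]; ri[1] = rp[1];
            rp[0] = tr;    rp[1] = ti;
          }
        }
      }
    }

    int main(void) {
      /* 3x1 complex column (1, 2, 3) with pivots that rotate it to (2, 3, 1) */
      double a[6] = {1,0, 2,0, 3,0};
      int ipiv[3] = {2, 3, 3};
      laswp_ref(1, a, 3, 1, 3, ipiv);
      printf("(%g, %g, %g)\n", a[0], a[2], a[4]);
      return 0;
    }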
*/ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QLAUU2" +#elif defined(DOUBLE) +#define ERROR_NAME "ZLAUU2" +#else +#define ERROR_NAME "CLAUU2" +#endif + +static blasint (*lauu2[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { + +#ifdef XDOUBLE + xlauu2_U, xlauu2_L, +#elif defined(DOUBLE) + zlauu2_U, zlauu2_L, +#else + clauu2_U, clauu2_L, +#endif + }; + +int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ + + blas_arg_t args; + + blasint uplo_arg = *UPLO; + blasint uplo; + blasint info; + FLOAT *buffer; +#ifdef PPC440 + extern +#endif + FLOAT *sa, *sb; + + PRINT_DEBUG_NAME; + + args.n = *N; + args.a = (void *)a; + args.lda = *ldA; + + TOUPPER(uplo_arg); + + uplo = -1; + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + if (args.lda < MAX(1,args.n)) info = 4; + if (args.n < 0) info = 2; + if (uplo < 0) info = 1; + if (info) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + *Info = - info; + return 0; + } + + *Info = 0; + + if (args.n <= 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + +#ifndef PPC440 + buffer = (FLOAT *)blas_memory_alloc(1); + + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); + sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif + + info = (lauu2[uplo])(&args, NULL, NULL, sa, sb, 0); + + *Info = info; + +#ifndef PPC440 + blas_memory_free(buffer); +#endif + + FUNCTION_PROFILE_END(1, .5 * args.n * args.n, + 2. * args.n * (1./3. + args.n * ( 1./2. + args.n * 1./6.)) + + 6. * 1./6. * args.n * (args.n * args.n - 1)); + + IDEBUG_END; + + return 0; +} diff --git a/interface/zlauum.c b/interface/zlauum.c new file mode 100644 index 0000000000..23990e8e47 --- /dev/null +++ b/interface/zlauum.c @@ -0,0 +1,141 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XLAUUM" +#elif defined(DOUBLE) +#define ERROR_NAME "ZLAUUM" +#else +#define ERROR_NAME "CLAUUM" +#endif + +static blasint (*lauum_single[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { + LAUUM_U_SINGLE, LAUUM_L_SINGLE, +}; + +#ifdef SMP +static blasint (*lauum_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { + LAUUM_U_PARALLEL, LAUUM_L_PARALLEL, +}; +#endif + +int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ + + blas_arg_t args; + + blasint uplo_arg = *UPLO; + blasint uplo; + blasint info; + FLOAT *buffer; +#ifdef PPC440 + extern +#endif + FLOAT *sa, *sb; + + PRINT_DEBUG_NAME; + + args.n = *N; + args.a = (void *)a; + args.lda = *ldA; + + TOUPPER(uplo_arg); + + uplo = -1; + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + if (args.lda < MAX(1,args.n)) info = 4; + if (args.n < 0) info = 2; + if (uplo < 0) info = 1; + if (info) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + *Info = - info; + return 0; + } + + *Info = 0; + + if (args.n == 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + +#ifndef PPC440 + buffer = (FLOAT *)blas_memory_alloc(1); + + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); + sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif + +#ifdef SMP + args.common = NULL; + args.nthreads = num_cpu_avail(4); + + if (args.nthreads == 1) { +#endif + + *Info = (lauum_single[uplo])(&args, NULL, NULL, sa, sb, 0); + +#ifdef SMP + } else { + + *Info = (lauum_parallel[uplo])(&args, NULL, NULL, sa, sb, 0); + + } +#endif + +#ifndef PPC440 + blas_memory_free(buffer); +#endif + + FUNCTION_PROFILE_END(1, .5 * args.n * args.n, + 2. * args.n * (1./3. + args.n * ( 1./2. + args.n * 1./6.)) + + args.n * (args.n * args.n - 1)); + + IDEBUG_END; + + return 0; +} diff --git a/interface/zpotf2.c b/interface/zpotf2.c new file mode 100644 index 0000000000..f8f81e2c5a --- /dev/null +++ b/interface/zpotf2.c @@ -0,0 +1,129 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XPOTF2" +#elif defined(DOUBLE) +#define ERROR_NAME "ZPOTF2" +#else +#define ERROR_NAME "CPOTF2" +#endif + +static blasint (*potf2[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { + +#ifdef XDOUBLE + xpotf2_U, xpotf2_L, +#elif defined(DOUBLE) + zpotf2_U, zpotf2_L, +#else + cpotf2_U, cpotf2_L, +#endif + }; + +int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ + + blas_arg_t args; + + blasint uplo_arg = *UPLO; + blasint uplo; + blasint info; + FLOAT *buffer; +#ifdef PPC440 + extern +#endif + FLOAT *sa, *sb; + + PRINT_DEBUG_NAME; + + args.n = *N; + args.a = (void *)a; + args.lda = *ldA; + + TOUPPER(uplo_arg); + + uplo = -1; + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + if (args.lda < MAX(1,args.n)) info = 4; + if (args.n < 0) info = 2; + if (uplo < 0) info = 1; + if (info) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + *Info = - info; + return 0; + } + + *Info = 0; + + if (args.n <= 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + +#ifndef PPC440 + buffer = (FLOAT *)blas_memory_alloc(1); + + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); + sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif + + info = (potf2[uplo])(&args, NULL, NULL, sa, sb, 0); + + *Info = info; + +#ifndef PPC440 + blas_memory_free(buffer); +#endif + + FUNCTION_PROFILE_END(1, .5 * args.n * args.n, + 2. * args.n * (1./3. + args.n * ( 1./2. + args.n * 1./6.)) + + 6. * 1./6. 
* args.n * (args.n * args.n - 1)); + + IDEBUG_END; + + return 0; +} diff --git a/interface/zpotrf.c b/interface/zpotrf.c new file mode 100644 index 0000000000..e2004d7444 --- /dev/null +++ b/interface/zpotrf.c @@ -0,0 +1,141 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XPOTRF" +#elif defined(DOUBLE) +#define ERROR_NAME "ZPOTRF" +#else +#define ERROR_NAME "CPOTRF" +#endif + +static blasint (*potrf_single[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) ={ + POTRF_U_SINGLE, POTRF_L_SINGLE, +}; + +#ifdef SMP +static blasint (*potrf_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) ={ + POTRF_U_PARALLEL, POTRF_L_PARALLEL, +}; +#endif + +int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ + + blas_arg_t args; + + blasint uplo_arg = *UPLO; + blasint uplo; + blasint info; + FLOAT *buffer; +#ifdef PPC440 + extern +#endif + FLOAT *sa, *sb; + + PRINT_DEBUG_NAME; + + args.n = *N; + args.a = (void *)a; + args.lda = *ldA; + + TOUPPER(uplo_arg); + + uplo = -1; + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + if (args.lda < MAX(1,args.n)) info = 4; + if (args.n < 0) info = 2; + if (uplo < 0) info = 1; + if (info) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + *Info = - info; + return 0; + } + + *Info = 0; + + if (args.n == 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + +#ifndef PPC440 + buffer = (FLOAT *)blas_memory_alloc(1); + + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); + sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif + +#ifdef SMP + args.common = NULL; + args.nthreads = num_cpu_avail(4); + + if (args.nthreads == 1) { +#endif + + *Info = (potrf_single[uplo])(&args, NULL, NULL, sa, sb, 0); + +#ifdef SMP + } else { + + *Info = (potrf_parallel[uplo])(&args, NULL, NULL, sa, sb, 0); + + } +#endif + +#ifndef PPC440 + blas_memory_free(buffer); +#endif + + FUNCTION_PROFILE_END(1, .5 * args.n * args.n, + 2. * args.n * (1./3. + args.n * ( 1./2. + args.n * 1./6.)) + + 6. * 1./6. * args.n * (args.n * args.n - 1)); + + IDEBUG_END; + + return 0; +} diff --git a/interface/zpotri.c b/interface/zpotri.c new file mode 100644 index 0000000000..df325424eb --- /dev/null +++ b/interface/zpotri.c @@ -0,0 +1,157 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XPOTRI" +#elif defined(DOUBLE) +#define ERROR_NAME "ZPOTRI" +#else +#define ERROR_NAME "CPOTRI" +#endif + +static blasint (*trtri_single[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) ={ + TRTRI_UN_SINGLE, TRTRI_LN_SINGLE, +}; + +static blasint (*lauum_single[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) ={ + LAUUM_U_SINGLE, LAUUM_L_SINGLE, +}; + +#ifdef SMP +static blasint (*trtri_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) ={ + TRTRI_UN_PARALLEL, TRTRI_LN_PARALLEL, +}; + +static blasint (*lauum_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) ={ + LAUUM_U_PARALLEL, LAUUM_L_PARALLEL, +}; +#endif + +int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ + + blas_arg_t args; + + blasint uplo_arg = *UPLO; + blasint uplo; + blasint info; + FLOAT *buffer; +#ifdef PPC440 + extern +#endif + FLOAT *sa, *sb; + + PRINT_DEBUG_NAME; + + args.n = *N; + args.a = (void *)a; + args.lda = *ldA; + + TOUPPER(uplo_arg); + + uplo = -1; + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + if (args.lda < MAX(1,args.n)) info = 4; + if (args.n < 0) info = 2; + if (uplo < 0) info = 1; + + if (info) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + *Info = - info; + return 0; + } + + *Info = 0; + + if (args.n == 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + +#ifndef PPC440 + buffer = (FLOAT *)blas_memory_alloc(1); + + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); + sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif + +#ifdef SMP + args.nthreads = num_cpu_avail(4); + + if (args.nthreads == 1) { +#endif + + info = (trtri_single[uplo])(&args, NULL, NULL, sa, sb, 0); + + if (!info) { + info = (lauum_single[uplo])(&args, NULL, NULL, sa, sb, 0); + } + + *Info = info; + +#ifdef SMP + } else { + info = (trtri_parallel[uplo])(&args, NULL, NULL, sa, sb, 0); + + if (!info) { + info = (lauum_parallel[uplo])(&args, NULL, NULL, sa, sb, 0); + } + + *Info = info; + } +#endif + +#ifndef PPC440 + blas_memory_free(buffer); +#endif + + FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, args.m * args.n, 2. / 3. 
* args.m * args.n * args.n); + + IDEBUG_END; + + return 0; +} diff --git a/interface/zrot.c b/interface/zrot.c new file mode 100644 index 0000000000..f18bbc6d16 --- /dev/null +++ b/interface/zrot.c @@ -0,0 +1,72 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *C, FLOAT *S){ + + BLASLONG n = *N; + BLASLONG incx = *INCX; + BLASLONG incy = *INCY; + FLOAT c = *C; + FLOAT s = *S; + + PRINT_DEBUG_NAME; + + if (n <= 0) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0) x -= (n - 1) * 2 * incx; + if (incy < 0) y -= (n - 1) * 2 * incy; + + ROT_K(n, x, incx, y, incy, c, s); + + FUNCTION_PROFILE_END(4, n, n); + + IDEBUG_END; + + return; + +} diff --git a/interface/zrotg.c b/interface/zrotg.c new file mode 100644 index 0000000000..e9e8a11dfe --- /dev/null +++ b/interface/zrotg.c @@ -0,0 +1,115 @@ +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ + + PRINT_DEBUG_NAME; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + +#if defined(__i386__) || defined(__x86_64__) || defined(__ia64__) + + long double da_r = *(DA + 0); + long double da_i = *(DA + 1); + long double db_r = *(DB + 0); + long double db_i = *(DB + 1); + long double r; + + long double ada = fabs(da_r) + fabs(da_i); + + if (ada == ZERO) { + *C = ZERO; + *(S + 0) = ONE; + *(S + 1) = ZERO; + *(DA + 0) = db_r; + *(DA + 1) = db_i; + } else { + long double alpha_r, alpha_i; + + ada = sqrt(da_r * da_r + da_i * da_i); + + r = sqrt(da_r * da_r + da_i * da_i + db_r * db_r + db_i * db_i); + + alpha_r = da_r / ada; + alpha_i = da_i / ada; + + *(C + 0) = ada / r; + *(S + 0) = (alpha_r * db_r + alpha_i *db_i) / r; + *(S + 1) = (alpha_i * db_r - alpha_r *db_i) / r; + *(DA + 0) = alpha_r * r; + *(DA + 1) = alpha_i * r; + } +#else + FLOAT da_r = *(DA + 0); + FLOAT da_i = *(DA + 1); + FLOAT db_r = *(DB + 0); + FLOAT db_i = *(DB + 1); + FLOAT r; + + FLOAT ada = fabs(da_r) + fabs(da_i); + FLOAT adb; + + if (ada == ZERO) { + *C = ZERO; + *(S + 0) = ONE; + *(S + 1) = ZERO; + *(DA + 0) = db_r; + *(DA + 1) = db_i; + } else { + FLOAT scale; + FLOAT aa_r, aa_i, bb_r, bb_i; + FLOAT alpha_r, alpha_i; + + aa_r = fabs(da_r); + aa_i = fabs(da_i); + + if (aa_i > aa_r) { + aa_r = fabs(da_i); + aa_i = fabs(da_r); + } + + scale = (aa_i / aa_r); + ada = aa_r * sqrt(ONE + scale * scale); + + bb_r = fabs(db_r); + bb_i = fabs(db_i); + + if (bb_i > bb_r) { + bb_r = fabs(bb_i); + bb_i = fabs(bb_r); + } + + scale = (bb_i / bb_r); + adb = bb_r * sqrt(ONE + scale * scale); + + scale = ada + adb; + + aa_r = da_r / scale; + aa_i = da_i / scale; + bb_r = db_r / scale; + bb_i = db_i / scale; + + r = scale * sqrt(aa_r * aa_r + aa_i * aa_i + bb_r * bb_r + bb_i * bb_i); + + alpha_r = da_r / ada; + alpha_i = da_i / ada; + + *(C + 0) = ada / r; + *(S + 0) = (alpha_r * db_r + alpha_i *db_i) / r; + *(S + 1) = (alpha_i * db_r - alpha_r *db_i) / r; + *(DA + 0) = alpha_r * r; + *(DA + 1) = alpha_i * r; + } +#endif + + FUNCTION_PROFILE_END(4, 4, 4); + + IDEBUG_END; + + return; +} diff --git a/interface/zsbmv.c b/interface/zsbmv.c new file mode 100644 index 0000000000..71c03a660d --- /dev/null +++ b/interface/zsbmv.c @@ -0,0 +1,157 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XSBMV " +#elif defined(DOUBLE) +#define ERROR_NAME "ZSBMV " +#else +#define ERROR_NAME "CSBMV " +#endif + +static int (*sbmv[])(BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, void *) = { +#ifdef XDOUBLE + xsbmv_U, xsbmv_L, +#elif defined(DOUBLE) + zsbmv_U, zsbmv_L, +#else + csbmv_U, csbmv_L, +#endif +}; + +#ifdef SMP +static int (*sbmv_thread[])(BLASLONG, BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { +#ifdef XDOUBLE + xsbmv_thread_U, xsbmv_thread_L, +#elif defined(DOUBLE) + zsbmv_thread_U, zsbmv_thread_L, +#else + csbmv_thread_U, csbmv_thread_L, +#endif +}; +#endif + +void NAME(char *UPLO, blasint *N, blasint *K, FLOAT *ALPHA, FLOAT *a, blasint *LDA, + FLOAT *b, blasint *INCX, FLOAT *BETA, FLOAT *c, blasint *INCY){ + + char uplo_arg = *UPLO; + blasint n = *N; + blasint k = *K; + FLOAT alpha_r = ALPHA[0]; + FLOAT alpha_i = ALPHA[1]; + blasint lda = *LDA; + blasint incx = *INCX; + FLOAT beta_r = BETA[0]; + FLOAT beta_i = BETA[1]; + blasint incy = *INCY; + + blasint info; + int uplo; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + uplo = -1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (incy == 0) info = 11; + if (incx == 0) info = 8; + if (lda < k + 1) info = 6; + if (k < 0) info = 3; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + + if (n == 0) return; + + if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, c, abs(incy), NULL, 0, NULL, 0); + + if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) b -= (n - 1) * incx * COMPSIZE; + if 
(incy < 0 ) c -= (n - 1) * incy * COMPSIZE; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (sbmv[uplo])(n, k, alpha_r, alpha_i, a, lda, b, incx, c, incy, buffer); + +#ifdef SMP + } else { + + (sbmv_thread[uplo])(n, k, ALPHA, a, lda, b, incx, c, incy, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(4, n * k / 2 + n, n * k); + + IDEBUG_END; + + return; +} diff --git a/interface/zscal.c b/interface/zscal.c new file mode 100644 index 0000000000..ad99874dc1 --- /dev/null +++ b/interface/zscal.c @@ -0,0 +1,117 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifndef CBLAS + +void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX){ + + blasint n = *N; + blasint incx = *INCX; + +#ifndef SSCAL + FLOAT *alpha=ALPHA; +#else + FLOAT alpha[2] = {ALPHA[0], ZERO}; +#endif + +#else + +#ifndef SSCAL +void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx){ + + FLOAT *alpha=ALPHA; +#else +void CNAME(blasint n, FLOAT alpha_r, FLOAT *x, blasint incx){ + + FLOAT alpha[2] = {alpha_r, ZERO}; +#endif +#endif + +#ifdef SMP + int mode; + int nthreads; +#endif + +#ifndef CBLAS + PRINT_DEBUG_NAME; +#else + PRINT_DEBUG_CNAME; +#endif + + if (incx <= 0 || n <= 0) return; + + if ((alpha[0] == ONE) && (alpha[1] == ZERO)) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + +#ifdef SMP + nthreads = num_cpu_avail(1); + + if (nthreads == 1) { +#endif + + SCAL_K(n, 0, 0, alpha[0], alpha[1], x, incx, NULL, 0, NULL, 0); + +#ifdef SMP + } else { +#ifdef DOUBLE + mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif + + blas_level1_thread(mode, n, 0, 0, alpha, x, incx, NULL, 0, NULL, 0, (void *)SCAL_K, nthreads); + + } +#endif + + FUNCTION_PROFILE_END(4, n, n); + + IDEBUG_END; + + return; + +} diff --git a/interface/zspmv.c b/interface/zspmv.c new file mode 100644 index 0000000000..ecf1af586f --- /dev/null +++ b/interface/zspmv.c @@ -0,0 +1,154 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "ZSPMV " +#elif defined(DOUBLE) +#define ERROR_NAME "ZSPMV " +#else +#define ERROR_NAME "CSPMV " +#endif + +static int (*spmv[])(BLASLONG, FLOAT, FLOAT, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, void *) = { +#ifdef XDOUBLE + xspmv_U, xspmv_L, +#elif defined(DOUBLE) + zspmv_U, zspmv_L, +#else + cspmv_U, cspmv_L, +#endif +}; + +#ifdef SMP +static int (*spmv_thread[])(BLASLONG, FLOAT *, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { +#ifdef XDOUBLE + xspmv_thread_U, xspmv_thread_L, +#elif defined(DOUBLE) + zspmv_thread_U, zspmv_thread_L, +#else + cspmv_thread_U, cspmv_thread_L, +#endif +}; +#endif + +void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, + FLOAT *b, blasint *INCX, FLOAT *BETA, FLOAT *c, blasint *INCY){ + + char uplo_arg = *UPLO; + blasint n = *N; + FLOAT alpha_r = ALPHA[0]; + FLOAT alpha_i = ALPHA[1]; + blasint incx = *INCX; + FLOAT beta_r = BETA[0]; + FLOAT beta_i = BETA[1]; + blasint incy = *INCY; + + blasint info; + int uplo; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + uplo = -1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (incy == 0) info = 9; + if (incx == 0) info = 6; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + + if (n == 0) return; + + if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, c, abs(incy), NULL, 0, NULL, 0); + + if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) b -= (n - 1) * incx * COMPSIZE; + if (incy < 0 ) c -= (n - 1) * incy * COMPSIZE; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (spmv[uplo])(n, alpha_r, alpha_i, a, b, incx, c, incy, buffer); + +#ifdef SMP + + } else { + + (spmv_thread[uplo])(n, ALPHA, a, b, incx, c, incy, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(4, n * n / 2 + n, n * n); + + IDEBUG_END; + + return; +} diff --git a/interface/zspr.c b/interface/zspr.c new file mode 100644 index 0000000000..0021bcda4f --- /dev/null +++ b/interface/zspr.c @@ -0,0 +1,146 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XSPR " +#elif defined(DOUBLE) +#define ERROR_NAME "ZSPR " +#else +#define ERROR_NAME "CSPR " +#endif + +static int (*spr[])(BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, FLOAT *, FLOAT *) = { +#ifdef XDOUBLE + xspr_U, xspr_L, +#elif defined(DOUBLE) + zspr_U, zspr_L, +#else + cspr_U, cspr_L, +#endif +}; + +#ifdef SMP +static int (*spr_thread[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, FLOAT *, int) = { +#ifdef XDOUBLE + xspr_thread_U, xspr_thread_L, +#elif defined(DOUBLE) + zspr_thread_U, zspr_thread_L, +#else + cspr_thread_U, cspr_thread_L, +#endif +}; +#endif + +void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, + FLOAT *x, blasint *INCX, FLOAT *a){ + + char uplo_arg = *UPLO; + blasint n = *N; + FLOAT alpha_r = ALPHA[0]; + FLOAT alpha_i = ALPHA[1]; + blasint incx = *INCX; + + blasint info; + int uplo; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + uplo = -1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + + if (n == 0) return; + + if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (spr[uplo])(n, alpha_r, alpha_i, x, incx, a, buffer); + +#ifdef SMP + } else { + + (spr_thread[uplo])(n, ALPHA, x, incx, a, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(4, n * n / 2 + n, n * n); + + IDEBUG_END; + + return; +} diff --git a/interface/zspr2.c b/interface/zspr2.c new file mode 100644 index 0000000000..b54e1651aa --- /dev/null +++ b/interface/zspr2.c @@ -0,0 +1,149 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XSPR2 " +#elif defined(DOUBLE) +#define ERROR_NAME "ZSPR2 " +#else +#define ERROR_NAME "CSPR2 " +#endif + +static int (*spr2[])(BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, FLOAT *) = { +#ifdef XDOUBLE + xspr2_U, xspr2_L, +#elif defined(DOUBLE) + zspr2_U, zspr2_L, +#else + cspr2_U, cspr2_L, +#endif +}; + +#ifdef SMP +static int (*spr2_thread[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, FLOAT *, int) = { +#ifdef XDOUBLE + xspr2_thread_U, xspr2_thread_L, +#elif defined(DOUBLE) + zspr2_thread_U, zspr2_thread_L, +#else + cspr2_thread_U, cspr2_thread_L, +#endif +}; +#endif + +void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, + FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *a){ + + char uplo_arg = *UPLO; + blasint n = *N; + FLOAT alpha_r = ALPHA[0]; + FLOAT alpha_i = ALPHA[1]; + blasint incx = *INCX; + blasint incy = *INCY; + + blasint info; + int uplo; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + uplo = -1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (incy == 0) info = 7; + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + + if (n == 0) return; + + if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx; + if (incy < 0 ) y -= (n - 1) * incy; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (spr2[uplo])(n, alpha_r, alpha_i, x, incx, y, incy, a, buffer); + +#ifdef SMP + } else { + + (spr2_thread[uplo])(n, ALPHA, x, incx, y, incy, a, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(4, n * n / 2 + 2 * n, 2 * n * n); + + IDEBUG_END; + + return; +} diff --git a/interface/zswap.c 
b/interface/zswap.c new file mode 100644 index 0000000000..f4a03a5508 --- /dev/null +++ b/interface/zswap.c @@ -0,0 +1,111 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifndef CBLAS + +void NAME(blasint *N, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){ + + blasint n = *N; + blasint incx = *INCX; + blasint incy = *INCY; + +#else + +void CNAME(blasint n, FLOAT *x, blasint incx, FLOAT *y, blasint incy){ + +#endif + +#ifdef SMP + int mode; + FLOAT dummyalpha[2] = {ZERO, ZERO}; + int nthreads; +#endif + +#ifndef CBLAS + PRINT_DEBUG_NAME; +#else + PRINT_DEBUG_CNAME; +#endif + + if (n <= 0) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0) x -= (n - 1) * incx * 2; + if (incy < 0) y -= (n - 1) * incy * 2; + +#ifdef SMP + nthreads = num_cpu_avail(1); + + if (nthreads == 1) { +#endif + + SWAP_K(n, 0, 0, ZERO, ZERO, x, incx, y, incy, NULL, 0); + +#ifdef SMP + } else { + +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif + + blas_level1_thread(mode, n, 0, 0, dummyalpha, + x, incx, y, incy, NULL, 0, (void *)SWAP_K, nthreads); + + } +#endif + + FUNCTION_PROFILE_END(2, 2 * n, 0); + + IDEBUG_END; + + return; + +} diff --git a/interface/zsymv.c b/interface/zsymv.c new file mode 100644 index 0000000000..afb2c17344 --- /dev/null +++ b/interface/zsymv.c @@ -0,0 +1,143 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. 
*/ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XSYMV " +#elif defined(DOUBLE) +#define ERROR_NAME "ZSYMV " +#else +#define ERROR_NAME "CSYMV " +#endif + +void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA, + FLOAT *b, blasint *INCX, FLOAT *BETA, FLOAT *c, blasint *INCY){ + + char uplo_arg = *UPLO; + blasint n = *N; + FLOAT alpha_r = ALPHA[0]; + FLOAT alpha_i = ALPHA[1]; + blasint lda = *LDA; + blasint incx = *INCX; + FLOAT beta_r = BETA[0]; + FLOAT beta_i = BETA[1]; + blasint incy = *INCY; + + int (*symv[])(BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { + SYMV_U, SYMV_L, + }; + +#ifdef SMP + int (*symv_thread[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { + SYMV_THREAD_U, SYMV_THREAD_L, + }; +#endif + + blasint info; + int uplo; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + uplo = -1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (incy == 0) info = 10; + if (incx == 0) info = 7; + if (lda < MAX(1, n)) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + + if (n == 0) return; + + if ((beta_r != ONE) || (beta_i != ZERO)) SCAL_K(n, 0, 0, beta_r, beta_i, c, abs(incy), NULL, 0, NULL, 0); + + if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) b -= (n - 1) * incx * COMPSIZE; + if (incy < 0 ) c -= (n - 1) * incy * COMPSIZE; + + buffer = (FLOAT 
*)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (symv[uplo])(n, n, alpha_r, alpha_i, a, lda, b, incx, c, incy, buffer); + +#ifdef SMP + } else { + + (symv_thread[uplo])(n, ALPHA, a, lda, b, incx, c, incy, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(4, n * n / 2 + 2 * n, 2 * n * n); + + IDEBUG_END; + + return; +} diff --git a/interface/zsyr.c b/interface/zsyr.c new file mode 100644 index 0000000000..b6b5202ec6 --- /dev/null +++ b/interface/zsyr.c @@ -0,0 +1,203 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XSYR " +#elif defined(DOUBLE) +#define ERROR_NAME "ZSYR " +#else +#define ERROR_NAME "CSYR " +#endif + +static int (*syr[])(BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { +#ifdef XDOUBLE + xsyr_U, xsyr_L, +#elif defined(DOUBLE) + zsyr_U, zsyr_L, +#else + csyr_U, csyr_L, +#endif +}; + +#ifdef SMP +static int (*syr_thread[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { +#ifdef XDOUBLE + xsyr_thread_U, xsyr_thread_L, +#elif defined(DOUBLE) + zsyr_thread_U, zsyr_thread_L, +#else + csyr_thread_U, csyr_thread_L, +#endif +}; +#endif + + +#ifndef CBLAS + +void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, + FLOAT *x, blasint *INCX, FLOAT *a, blasint *LDA){ + + char uplo_arg = *UPLO; + blasint n = *N; + FLOAT alpha_r = ALPHA[0]; + FLOAT alpha_i = ALPHA[1]; + blasint lda = *LDA; + blasint incx = *INCX; + + blasint info; + int uplo; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + uplo = -1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (lda < MAX(1, n)) info = 7; + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + + +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLOAT *x, int incx, FLOAT *a, int lda) { + + FLOAT *buffer; + int trans, uplo; + blasint info; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_CNAME; + + trans = -1; + uplo = -1; + info = 0; + + if (order == CblasColMajor) { + + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + info = -1; + + if (lda < MAX(1, n)) info = 7; + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + + } + + if (order == CblasRowMajor) { + + if (Uplo == CblasUpper) uplo = 1; + if (Uplo == CblasLower) uplo = 0; + + info = -1; + + if (lda < MAX(1, n)) info = 7; + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (n == 0) return; + + if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (syr[uplo])(n, alpha_r, alpha_i, x, incx, a, lda, buffer); + +#ifdef SMP + } else { + + (syr_thread[uplo])(n, ALPHA, x, incx, a, lda, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(4, n * n / 2 + n, n * n); + + IDEBUG_END; + + return; +} diff --git a/interface/zsyr2.c b/interface/zsyr2.c new file mode 100644 index 0000000000..0c705cb12a --- /dev/null +++ b/interface/zsyr2.c @@ -0,0 +1,151 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "QSYR2 " +#elif defined(DOUBLE) +#define ERROR_NAME "ZSYR2 " +#else +#define ERROR_NAME "CSYR2 " +#endif + +static int (*syr2[])(BLASLONG, FLOAT, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { +#ifdef XDOUBLE + xsyr2_U, xsyr2_L, +#elif defined(DOUBLE) + zsyr2_U, zsyr2_L, +#else + csyr2_U, csyr2_L, +#endif +}; + +#ifdef SMP +static int (*syr2_thread[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { +#ifdef XDOUBLE + xsyr2_thread_U, xsyr2_thread_L, +#elif defined(DOUBLE) + zsyr2_thread_U, zsyr2_thread_L, +#else + csyr2_thread_U, csyr2_thread_L, +#endif +}; +#endif + +void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, + FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY, FLOAT *a, blasint *LDA){ + + char uplo_arg = *UPLO; + blasint n = *N; + FLOAT alpha_r = ALPHA[0]; + FLOAT alpha_i = ALPHA[1]; + blasint lda = *LDA; + blasint incx = *INCX; + blasint incy = *INCY; + + blasint info; + int uplo; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + uplo = -1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (lda < MAX(1, n)) info = 9; + if (incy == 0) info = 7; + if (incx == 0) info = 5; + if (n < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + + if (n == 0) return; + + if ((alpha_r == ZERO) && (alpha_i == ZERO)) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx; + if (incy < 0 ) y -= (n - 1) * incy; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (syr2[uplo])(n, alpha_r, alpha_i, x, incx, y, incy, a, lda, buffer); + +#ifdef SMP + } else { + 
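+      /* SMP path: hand the rank-2 update to the per-uplo worker (zsyr2_thread_U/L and friends), which splits the work across nthreads; alpha is forwarded as the original (real, imaginary) pair through ALPHA. */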
+ (syr2_thread[uplo])(n, ALPHA, x, incx, y, incy, a, lda, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(4, n * n / 2 + 2 * n, 2 * n * n); + + IDEBUG_END; + + return; +} diff --git a/interface/ztbmv.c b/interface/ztbmv.c new file mode 100644 index 0000000000..85f53c4bed --- /dev/null +++ b/interface/ztbmv.c @@ -0,0 +1,260 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XTBMV " +#elif defined(DOUBLE) +#define ERROR_NAME "ZTBMV " +#else +#define ERROR_NAME "CTBMV " +#endif + +static int (*tbmv[])(BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, void *) = { +#ifdef XDOUBLE + xtbmv_NUU, xtbmv_NUN, xtbmv_NLU, xtbmv_NLN, + xtbmv_TUU, xtbmv_TUN, xtbmv_TLU, xtbmv_TLN, + xtbmv_RUU, xtbmv_RUN, xtbmv_RLU, xtbmv_RLN, + xtbmv_CUU, xtbmv_CUN, xtbmv_CLU, xtbmv_CLN, +#elif defined(DOUBLE) + ztbmv_NUU, ztbmv_NUN, ztbmv_NLU, ztbmv_NLN, + ztbmv_TUU, ztbmv_TUN, ztbmv_TLU, ztbmv_TLN, + ztbmv_RUU, ztbmv_RUN, ztbmv_RLU, ztbmv_RLN, + ztbmv_CUU, ztbmv_CUN, ztbmv_CLU, ztbmv_CLN, +#else + ctbmv_NUU, ctbmv_NUN, ctbmv_NLU, ctbmv_NLN, + ctbmv_TUU, ctbmv_TUN, ctbmv_TLU, ctbmv_TLN, + ctbmv_RUU, ctbmv_RUN, ctbmv_RLU, ctbmv_RLN, + ctbmv_CUU, ctbmv_CUN, ctbmv_CLU, ctbmv_CLN, +#endif +}; + +#ifdef SMP +static int (*tbmv_thread[])(BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { +#ifdef XDOUBLE + xtbmv_thread_NUU, xtbmv_thread_NUN, xtbmv_thread_NLU, xtbmv_thread_NLN, + xtbmv_thread_TUU, xtbmv_thread_TUN, xtbmv_thread_TLU, xtbmv_thread_TLN, + xtbmv_thread_RUU, xtbmv_thread_RUN, xtbmv_thread_RLU, xtbmv_thread_RLN, + xtbmv_thread_CUU, xtbmv_thread_CUN, xtbmv_thread_CLU, xtbmv_thread_CLN, +#elif defined(DOUBLE) + ztbmv_thread_NUU, ztbmv_thread_NUN, ztbmv_thread_NLU, ztbmv_thread_NLN, + ztbmv_thread_TUU, ztbmv_thread_TUN, ztbmv_thread_TLU, ztbmv_thread_TLN, + ztbmv_thread_RUU, ztbmv_thread_RUN, ztbmv_thread_RLU, ztbmv_thread_RLN, + ztbmv_thread_CUU, ztbmv_thread_CUN, ztbmv_thread_CLU, ztbmv_thread_CLN, +#else + ctbmv_thread_NUU, ctbmv_thread_NUN, ctbmv_thread_NLU, ctbmv_thread_NLN, + ctbmv_thread_TUU, ctbmv_thread_TUN, ctbmv_thread_TLU, ctbmv_thread_TLN, + ctbmv_thread_RUU, ctbmv_thread_RUN, ctbmv_thread_RLU, ctbmv_thread_RLN, + ctbmv_thread_CUU, ctbmv_thread_CUN, ctbmv_thread_CLU, ctbmv_thread_CLN, +#endif +}; +#endif + +#ifndef CBLAS + +void NAME(char *UPLO, char *TRANS, char *DIAG, + blasint *N, blasint *K, + FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX){ + + char uplo_arg = *UPLO; + char trans_arg = *TRANS; + char diag_arg = *DIAG; + + blasint n = *N; + blasint k = *K; + blasint lda = *LDA; + blasint incx = *INCX; + + blasint info; + int uplo; + int unit; + int trans; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + TOUPPER(trans_arg); + TOUPPER(diag_arg); + + trans = -1; + unit = -1; + uplo = -1; + + if (trans_arg == 'N') trans = 0; + if (trans_arg == 'T') trans = 1; + if (trans_arg == 'R') trans = 2; + if (trans_arg == 'C') trans = 3; + + if (diag_arg == 'U') unit = 0; + if (diag_arg == 'N') unit = 1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (incx == 0) info = 9; + if (lda < k + 1) info = 7; + if (k < 0) info = 5; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, + enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint n, blasint k, FLOAT *a, blasint lda, FLOAT *x, blasint incx) { + + int trans, uplo, unit; + blasint info; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_CNAME; + + unit = -1; + uplo = -1; + 
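+  /* Map the CBLAS enums onto the integer codes used below to index the kernel tables; any flag still left at -1 afterwards marks an invalid argument and is reported through xerbla. */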
trans = -1; + info = 0; + + if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + if (TransA == CblasNoTrans) trans = 0; + if (TransA == CblasTrans) trans = 1; + if (TransA == CblasConjNoTrans) trans = 2; + if (TransA == CblasConjTrans) trans = 3; + + if (Diag == CblasUnit) unit = 0; + if (Diag == CblasNonUnit) unit = 1; + + info = -1; + + if (incx == 0) info = 9; + if (lda < k + 1) info = 7; + if (k < 0) info = 5; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (order == CblasRowMajor) { + if (Uplo == CblasUpper) uplo = 1; + if (Uplo == CblasLower) uplo = 0; + + if (TransA == CblasNoTrans) trans = 1; + if (TransA == CblasTrans) trans = 0; + if (TransA == CblasConjNoTrans) trans = 3; + if (TransA == CblasConjTrans) trans = 2; + + if (Diag == CblasUnit) unit = 0; + if (Diag == CblasNonUnit) unit = 1; + + info = -1; + + if (incx == 0) info = 9; + if (lda < k + 1) info = 7; + if (k < 0) info = 5; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (n == 0) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx * 2; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (tbmv[(trans<<2) | (uplo<<1) | unit])(n, k, a, lda, x, incx, buffer); + +#ifdef SMP + } else { + + (tbmv_thread[(trans<<2) | (uplo<<1) | unit])(n, k, a, lda, x, incx, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(4, n * k / 2 + n, n * k); + + IDEBUG_END; + + return; +} diff --git a/interface/ztbsv.c b/interface/ztbsv.c new file mode 100644 index 0000000000..3846a4b3d2 --- /dev/null +++ b/interface/ztbsv.c @@ -0,0 +1,219 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XTBSV " +#elif defined(DOUBLE) +#define ERROR_NAME "ZTBSV " +#else +#define ERROR_NAME "CTBSV " +#endif + +static int (*tbsv[])(BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, void *) = { +#ifdef XDOUBLE + xtbsv_NUU, xtbsv_NUN, xtbsv_NLU, xtbsv_NLN, + xtbsv_TUU, xtbsv_TUN, xtbsv_TLU, xtbsv_TLN, + xtbsv_RUU, xtbsv_RUN, xtbsv_RLU, xtbsv_RLN, + xtbsv_CUU, xtbsv_CUN, xtbsv_CLU, xtbsv_CLN, +#elif defined(DOUBLE) + ztbsv_NUU, ztbsv_NUN, ztbsv_NLU, ztbsv_NLN, + ztbsv_TUU, ztbsv_TUN, ztbsv_TLU, ztbsv_TLN, + ztbsv_RUU, ztbsv_RUN, ztbsv_RLU, ztbsv_RLN, + ztbsv_CUU, ztbsv_CUN, ztbsv_CLU, ztbsv_CLN, +#else + ctbsv_NUU, ctbsv_NUN, ctbsv_NLU, ctbsv_NLN, + ctbsv_TUU, ctbsv_TUN, ctbsv_TLU, ctbsv_TLN, + ctbsv_RUU, ctbsv_RUN, ctbsv_RLU, ctbsv_RLN, + ctbsv_CUU, ctbsv_CUN, ctbsv_CLU, ctbsv_CLN, +#endif +}; + +#ifndef CBLAS + +void NAME(char *UPLO, char *TRANS, char *DIAG, + blasint *N, blasint *K, + FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX){ + + char uplo_arg = *UPLO; + char trans_arg = *TRANS; + char diag_arg = *DIAG; + + blasint n = *N; + blasint k = *K; + blasint lda = *LDA; + blasint incx = *INCX; + + blasint info; + int uplo; + int unit; + int trans; + FLOAT *buffer; + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + TOUPPER(trans_arg); + TOUPPER(diag_arg); + + trans = -1; + unit = -1; + uplo = -1; + + if (trans_arg == 'N') trans = 0; + if (trans_arg == 'T') trans = 1; + if (trans_arg == 'R') trans = 2; + if (trans_arg == 'C') trans = 3; + + if (diag_arg == 'U') unit = 0; + if (diag_arg == 'N') unit = 1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (incx == 0) info = 9; + if (lda < k + 1) info = 7; + if (k < 0) info = 5; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, + enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint n, blasint k, FLOAT *a, blasint lda, FLOAT *x, blasint incx) { + + int trans, uplo, unit; + blasint info; + FLOAT *buffer; + + PRINT_DEBUG_CNAME; + + unit = -1; + uplo = -1; + trans = -1; + info = 0; + + if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + if (TransA == CblasNoTrans) trans = 0; + if (TransA == CblasTrans) trans = 1; + if (TransA == CblasConjNoTrans) trans = 2; + if (TransA == CblasConjTrans) trans = 3; + + if (Diag == CblasUnit) unit = 0; + if (Diag == CblasNonUnit) unit = 1; + + info = -1; + + if (incx == 0) info = 9; + if (lda < k + 1) info = 7; + if (k < 0) info = 5; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (order == CblasRowMajor) { + if (Uplo == CblasUpper) uplo = 1; + if (Uplo == CblasLower) uplo = 0; + + if (TransA == CblasNoTrans) trans = 1; + if (TransA == CblasTrans) trans = 0; + if (TransA == CblasConjNoTrans) trans = 3; + if (TransA == CblasConjTrans) trans = 2; + + if (Diag == CblasUnit) 
unit = 0; + if (Diag == CblasNonUnit) unit = 1; + + info = -1; + + if (incx == 0) info = 9; + if (lda < k + 1) info = 7; + if (k < 0) info = 5; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (n == 0) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx * 2; + + buffer = (FLOAT *)blas_memory_alloc(1); + + (tbsv[(trans<<2) | (uplo<<1) | unit])(n, k, a, lda, x, incx, buffer); + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(4, n * k / 2 + n, n * k); + + IDEBUG_END; + + return; +} diff --git a/interface/ztpmv.c b/interface/ztpmv.c new file mode 100644 index 0000000000..2f9c48f5a8 --- /dev/null +++ b/interface/ztpmv.c @@ -0,0 +1,252 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XTPMV " +#elif defined(DOUBLE) +#define ERROR_NAME "ZTPMV " +#else +#define ERROR_NAME "CTPMV " +#endif + +static int (*tpmv[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, void *) = { +#ifdef XDOUBLE + xtpmv_NUU, xtpmv_NUN, xtpmv_NLU, xtpmv_NLN, + xtpmv_TUU, xtpmv_TUN, xtpmv_TLU, xtpmv_TLN, + xtpmv_RUU, xtpmv_RUN, xtpmv_RLU, xtpmv_RLN, + xtpmv_CUU, xtpmv_CUN, xtpmv_CLU, xtpmv_CLN, +#elif defined(DOUBLE) + ztpmv_NUU, ztpmv_NUN, ztpmv_NLU, ztpmv_NLN, + ztpmv_TUU, ztpmv_TUN, ztpmv_TLU, ztpmv_TLN, + ztpmv_RUU, ztpmv_RUN, ztpmv_RLU, ztpmv_RLN, + ztpmv_CUU, ztpmv_CUN, ztpmv_CLU, ztpmv_CLN, +#else + ctpmv_NUU, ctpmv_NUN, ctpmv_NLU, ctpmv_NLN, + ctpmv_TUU, ctpmv_TUN, ctpmv_TLU, ctpmv_TLN, + ctpmv_RUU, ctpmv_RUN, ctpmv_RLU, ctpmv_RLN, + ctpmv_CUU, ctpmv_CUN, ctpmv_CLU, ctpmv_CLN, +#endif +}; + +#ifdef SMP +static int (*tpmv_thread[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, FLOAT *, int) = { +#ifdef XDOUBLE + xtpmv_thread_NUU, xtpmv_thread_NUN, xtpmv_thread_NLU, xtpmv_thread_NLN, + xtpmv_thread_TUU, xtpmv_thread_TUN, xtpmv_thread_TLU, xtpmv_thread_TLN, + xtpmv_thread_RUU, xtpmv_thread_RUN, xtpmv_thread_RLU, xtpmv_thread_RLN, + xtpmv_thread_CUU, xtpmv_thread_CUN, xtpmv_thread_CLU, xtpmv_thread_CLN, +#elif defined(DOUBLE) + ztpmv_thread_NUU, ztpmv_thread_NUN, ztpmv_thread_NLU, ztpmv_thread_NLN, + ztpmv_thread_TUU, ztpmv_thread_TUN, ztpmv_thread_TLU, ztpmv_thread_TLN, + ztpmv_thread_RUU, ztpmv_thread_RUN, ztpmv_thread_RLU, ztpmv_thread_RLN, + ztpmv_thread_CUU, ztpmv_thread_CUN, ztpmv_thread_CLU, ztpmv_thread_CLN, +#else + ctpmv_thread_NUU, ctpmv_thread_NUN, ctpmv_thread_NLU, ctpmv_thread_NLN, + ctpmv_thread_TUU, ctpmv_thread_TUN, ctpmv_thread_TLU, ctpmv_thread_TLN, + ctpmv_thread_RUU, ctpmv_thread_RUN, ctpmv_thread_RLU, ctpmv_thread_RLN, + ctpmv_thread_CUU, ctpmv_thread_CUN, ctpmv_thread_CLU, ctpmv_thread_CLN, +#endif +}; +#endif + +#ifndef CBLAS + +void NAME(char *UPLO, char *TRANS, char *DIAG, + blasint *N, FLOAT *a, FLOAT *x, blasint *INCX){ + + char uplo_arg = *UPLO; + char trans_arg = *TRANS; + char diag_arg = *DIAG; + + blasint n = *N; + blasint incx = *INCX; + + blasint info; + int uplo; + int unit; + int trans; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + TOUPPER(trans_arg); + TOUPPER(diag_arg); + + trans = -1; + unit = -1; + uplo = -1; + + if (trans_arg == 'N') trans = 0; + if (trans_arg == 'T') trans = 1; + if (trans_arg == 'R') trans = 2; + if (trans_arg == 'C') trans = 3; + + if (diag_arg == 'U') unit = 0; + if (diag_arg == 'N') unit = 1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (incx == 0) info = 7; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, + enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint n, FLOAT *a, FLOAT *x, blasint incx) { + + int trans, uplo, unit; + blasint info; + FLOAT *buffer; + + PRINT_DEBUG_CNAME; + + unit = -1; + uplo = -1; + trans = -1; + info = 0; +#ifdef SMP + int nthreads; +#endif + + if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + if (TransA == CblasNoTrans) trans = 0; + if 
(TransA == CblasTrans) trans = 1; + if (TransA == CblasConjNoTrans) trans = 2; + if (TransA == CblasConjTrans) trans = 3; + + if (Diag == CblasUnit) unit = 0; + if (Diag == CblasNonUnit) unit = 1; + + info = -1; + + if (incx == 0) info = 7; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (order == CblasRowMajor) { + if (Uplo == CblasUpper) uplo = 1; + if (Uplo == CblasLower) uplo = 0; + + if (TransA == CblasNoTrans) trans = 1; + if (TransA == CblasTrans) trans = 0; + if (TransA == CblasConjNoTrans) trans = 3; + if (TransA == CblasConjTrans) trans = 2; + + if (Diag == CblasUnit) unit = 0; + if (Diag == CblasNonUnit) unit = 1; + + info = -1; + + if (incx == 0) info = 7; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (n == 0) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx * 2; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (tpmv[(trans<<2) | (uplo<<1) | unit])(n, a, x, incx, buffer); + +#ifdef SMP + + } else { + + (tpmv_thread[(trans<<2) | (uplo<<1) | unit])(n, a, x, incx, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(4, n * n / 2 + n, n * n); + + IDEBUG_END; + + return; +} diff --git a/interface/ztpsv.c b/interface/ztpsv.c new file mode 100644 index 0000000000..fde500e376 --- /dev/null +++ b/interface/ztpsv.c @@ -0,0 +1,210 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XTPSV " +#elif defined(DOUBLE) +#define ERROR_NAME "ZTPSV " +#else +#define ERROR_NAME "CTPSV " +#endif + +static int (*tpsv[])(BLASLONG, FLOAT *, FLOAT *, BLASLONG, void *) = { +#ifdef XDOUBLE + xtpsv_NUU, xtpsv_NUN, xtpsv_NLU, xtpsv_NLN, + xtpsv_TUU, xtpsv_TUN, xtpsv_TLU, xtpsv_TLN, + xtpsv_RUU, xtpsv_RUN, xtpsv_RLU, xtpsv_RLN, + xtpsv_CUU, xtpsv_CUN, xtpsv_CLU, xtpsv_CLN, +#elif defined(DOUBLE) + ztpsv_NUU, ztpsv_NUN, ztpsv_NLU, ztpsv_NLN, + ztpsv_TUU, ztpsv_TUN, ztpsv_TLU, ztpsv_TLN, + ztpsv_RUU, ztpsv_RUN, ztpsv_RLU, ztpsv_RLN, + ztpsv_CUU, ztpsv_CUN, ztpsv_CLU, ztpsv_CLN, +#else + ctpsv_NUU, ctpsv_NUN, ctpsv_NLU, ctpsv_NLN, + ctpsv_TUU, ctpsv_TUN, ctpsv_TLU, ctpsv_TLN, + ctpsv_RUU, ctpsv_RUN, ctpsv_RLU, ctpsv_RLN, + ctpsv_CUU, ctpsv_CUN, ctpsv_CLU, ctpsv_CLN, +#endif +}; + +#ifndef CBLAS + +void NAME(char *UPLO, char *TRANS, char *DIAG, + blasint *N, FLOAT *a, FLOAT *x, blasint *INCX){ + + char uplo_arg = *UPLO; + char trans_arg = *TRANS; + char diag_arg = *DIAG; + + blasint n = *N; + blasint incx = *INCX; + + blasint info; + int uplo; + int unit; + int trans; + FLOAT *buffer; + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + TOUPPER(trans_arg); + TOUPPER(diag_arg); + + trans = -1; + unit = -1; + uplo = -1; + + if (trans_arg == 'N') trans = 0; + if (trans_arg == 'T') trans = 1; + if (trans_arg == 'R') trans = 2; + if (trans_arg == 'C') trans = 3; + + if (diag_arg == 'U') unit = 0; + if (diag_arg == 'N') unit = 1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (incx == 0) info = 7; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, + enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint n, FLOAT *a, FLOAT *x, blasint incx) { + + int trans, uplo, unit; + blasint info; + FLOAT *buffer; + + PRINT_DEBUG_CNAME; + + unit = -1; + uplo = -1; + trans = -1; + info = 0; + + if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + if (TransA == CblasNoTrans) trans = 0; + if (TransA == CblasTrans) trans = 1; + if (TransA == CblasConjNoTrans) trans = 2; + if (TransA == CblasConjTrans) trans = 3; + + if (Diag == CblasUnit) unit = 0; + if (Diag == CblasNonUnit) unit = 1; + + info = -1; + + if (incx == 0) info = 7; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (order == CblasRowMajor) { + if (Uplo == CblasUpper) uplo = 1; + if (Uplo == CblasLower) uplo = 0; + + if (TransA == CblasNoTrans) trans = 1; + if (TransA == CblasTrans) trans = 0; + if (TransA == CblasConjNoTrans) trans = 3; + if (TransA == CblasConjTrans) trans = 2; + + if (Diag == CblasUnit) unit = 0; + if (Diag == CblasNonUnit) unit = 1; + + info = -1; + + if (incx == 0) info = 7; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (n == 0) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx * 2; + + buffer = (FLOAT *)blas_memory_alloc(1); + + 
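+  /* Select one of the 16 specialized solvers: index = (trans << 2) | (uplo << 1) | unit, i.e. transpose mode (N/T/R/C), upper/lower storage, and unit/non-unit diagonal. */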
(tpsv[(trans<<2) | (uplo<<1) | unit])(n, a, x, incx, buffer); + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(4, n * n / 2 + n, n * n); + + IDEBUG_END; + + return; +} diff --git a/interface/ztrmv.c b/interface/ztrmv.c new file mode 100644 index 0000000000..5a18a85b15 --- /dev/null +++ b/interface/ztrmv.c @@ -0,0 +1,255 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XTRMV " +#elif defined(DOUBLE) +#define ERROR_NAME "ZTRMV " +#else +#define ERROR_NAME "CTRMV " +#endif + +static int (*trmv[])(BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { +#ifdef XDOUBLE + xtrmv_NUU, xtrmv_NUN, xtrmv_NLU, xtrmv_NLN, + xtrmv_TUU, xtrmv_TUN, xtrmv_TLU, xtrmv_TLN, + xtrmv_RUU, xtrmv_RUN, xtrmv_RLU, xtrmv_RLN, + xtrmv_CUU, xtrmv_CUN, xtrmv_CLU, xtrmv_CLN, +#elif defined(DOUBLE) + ztrmv_NUU, ztrmv_NUN, ztrmv_NLU, ztrmv_NLN, + ztrmv_TUU, ztrmv_TUN, ztrmv_TLU, ztrmv_TLN, + ztrmv_RUU, ztrmv_RUN, ztrmv_RLU, ztrmv_RLN, + ztrmv_CUU, ztrmv_CUN, ztrmv_CLU, ztrmv_CLN, +#else + ctrmv_NUU, ctrmv_NUN, ctrmv_NLU, ctrmv_NLN, + ctrmv_TUU, ctrmv_TUN, ctrmv_TLU, ctrmv_TLN, + ctrmv_RUU, ctrmv_RUN, ctrmv_RLU, ctrmv_RLN, + ctrmv_CUU, ctrmv_CUN, ctrmv_CLU, ctrmv_CLN, +#endif +}; + +#ifdef SMP +static int (*trmv_thread[])(BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *, int) = { +#ifdef XDOUBLE + xtrmv_thread_NUU, xtrmv_thread_NUN, xtrmv_thread_NLU, xtrmv_thread_NLN, + xtrmv_thread_TUU, xtrmv_thread_TUN, xtrmv_thread_TLU, xtrmv_thread_TLN, + xtrmv_thread_RUU, xtrmv_thread_RUN, xtrmv_thread_RLU, xtrmv_thread_RLN, + xtrmv_thread_CUU, xtrmv_thread_CUN, xtrmv_thread_CLU, xtrmv_thread_CLN, +#elif defined(DOUBLE) + ztrmv_thread_NUU, ztrmv_thread_NUN, ztrmv_thread_NLU, ztrmv_thread_NLN, + ztrmv_thread_TUU, ztrmv_thread_TUN, ztrmv_thread_TLU, ztrmv_thread_TLN, + ztrmv_thread_RUU, ztrmv_thread_RUN, ztrmv_thread_RLU, ztrmv_thread_RLN, + ztrmv_thread_CUU, ztrmv_thread_CUN, ztrmv_thread_CLU, ztrmv_thread_CLN, +#else + ctrmv_thread_NUU, ctrmv_thread_NUN, ctrmv_thread_NLU, ctrmv_thread_NLN, + ctrmv_thread_TUU, ctrmv_thread_TUN, ctrmv_thread_TLU, ctrmv_thread_TLN, + ctrmv_thread_RUU, ctrmv_thread_RUN, ctrmv_thread_RLU, ctrmv_thread_RLN, + ctrmv_thread_CUU, ctrmv_thread_CUN, ctrmv_thread_CLU, ctrmv_thread_CLN, +#endif +}; +#endif + +#ifndef CBLAS + +void NAME(char *UPLO, char *TRANS, char *DIAG, + blasint *N, FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX){ + + char uplo_arg = *UPLO; + char trans_arg = *TRANS; + char diag_arg = *DIAG; + + blasint n = *N; + blasint lda = *LDA; + blasint incx = *INCX; + + blasint info; + int uplo; + int unit; + int trans; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + TOUPPER(trans_arg); + TOUPPER(diag_arg); + + trans = -1; + unit = -1; + uplo = -1; + + if (trans_arg == 'N') trans = 0; + if (trans_arg == 'T') trans = 1; + if (trans_arg == 'R') trans = 2; + if (trans_arg == 'C') trans = 3; + + if (diag_arg == 'U') unit = 0; + if (diag_arg == 'N') unit = 1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + info = 0; + + if (incx == 0) info = 8; + if (lda < MAX(1, n)) info = 6; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, + enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint n, FLOAT *a, blasint lda, FLOAT *x, blasint incx) { + + int trans, uplo, unit; + blasint info; + FLOAT *buffer; +#ifdef SMP + int nthreads; +#endif + + PRINT_DEBUG_CNAME; + + unit = -1; + uplo = -1; + trans = -1; + info = 0; + + if (order == CblasColMajor) { + if (Uplo == 
CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + if (TransA == CblasNoTrans) trans = 0; + if (TransA == CblasTrans) trans = 1; + if (TransA == CblasConjNoTrans) trans = 2; + if (TransA == CblasConjTrans) trans = 3; + + if (Diag == CblasUnit) unit = 0; + if (Diag == CblasNonUnit) unit = 1; + + info = -1; + + if (incx == 0) info = 8; + if (lda < MAX(1, n)) info = 6; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (order == CblasRowMajor) { + if (Uplo == CblasUpper) uplo = 1; + if (Uplo == CblasLower) uplo = 0; + + if (TransA == CblasNoTrans) trans = 1; + if (TransA == CblasTrans) trans = 0; + if (TransA == CblasConjNoTrans) trans = 3; + if (TransA == CblasConjTrans) trans = 2; + + if (Diag == CblasUnit) unit = 0; + if (Diag == CblasNonUnit) unit = 1; + + info = -1; + + if (incx == 0) info = 8; + if (lda < MAX(1, n)) info = 6; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (n == 0) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx * 2; + + buffer = (FLOAT *)blas_memory_alloc(1); + +#ifdef SMP + nthreads = num_cpu_avail(2); + + if (nthreads == 1) { +#endif + + (trmv[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer); + +#ifdef SMP + } else { + + (trmv_thread[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer, nthreads); + + } +#endif + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(4, n * n / 2 + n, n * n); + + IDEBUG_END; + + return; +} diff --git a/interface/ztrsv.c b/interface/ztrsv.c new file mode 100644 index 0000000000..08f7dc68cd --- /dev/null +++ b/interface/ztrsv.c @@ -0,0 +1,216 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XTRSV " +#elif defined(DOUBLE) +#define ERROR_NAME "ZTRSV " +#else +#define ERROR_NAME "CTRSV " +#endif + +static int (*trsv[])(BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, void *) = { +#ifdef XDOUBLE + xtrsv_NUU, xtrsv_NUN, xtrsv_NLU, xtrsv_NLN, + xtrsv_TUU, xtrsv_TUN, xtrsv_TLU, xtrsv_TLN, + xtrsv_RUU, xtrsv_RUN, xtrsv_RLU, xtrsv_RLN, + xtrsv_CUU, xtrsv_CUN, xtrsv_CLU, xtrsv_CLN, +#elif defined(DOUBLE) + ztrsv_NUU, ztrsv_NUN, ztrsv_NLU, ztrsv_NLN, + ztrsv_TUU, ztrsv_TUN, ztrsv_TLU, ztrsv_TLN, + ztrsv_RUU, ztrsv_RUN, ztrsv_RLU, ztrsv_RLN, + ztrsv_CUU, ztrsv_CUN, ztrsv_CLU, ztrsv_CLN, +#else + ctrsv_NUU, ctrsv_NUN, ctrsv_NLU, ctrsv_NLN, + ctrsv_TUU, ctrsv_TUN, ctrsv_TLU, ctrsv_TLN, + ctrsv_RUU, ctrsv_RUN, ctrsv_RLU, ctrsv_RLN, + ctrsv_CUU, ctrsv_CUN, ctrsv_CLU, ctrsv_CLN, +#endif +}; + +#ifndef CBLAS + +void NAME(char *UPLO, char *TRANS, char *DIAG, + blasint *N, FLOAT *a, blasint *LDA, FLOAT *x, blasint *INCX){ + + char uplo_arg = *UPLO; + char trans_arg = *TRANS; + char diag_arg = *DIAG; + + blasint n = *N; + blasint lda = *LDA; + blasint incx = *INCX; + + blasint info; + int uplo; + int unit; + int trans; + FLOAT *buffer; + + PRINT_DEBUG_NAME; + + TOUPPER(uplo_arg); + TOUPPER(trans_arg); + TOUPPER(diag_arg); + + trans = -1; + unit = -1; + uplo = -1; + + if (trans_arg == 'N') trans = 0; + if (trans_arg == 'T') trans = 1; + if (trans_arg == 'R') trans = 2; + if (trans_arg == 'C') trans = 3; + + if (diag_arg == 'U') unit = 0; + if (diag_arg == 'N') unit = 1; + + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + + + info = 0; + + if (incx == 0) info = 8; + if (lda < MAX(1, n)) info = 6; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + + +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, + enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, + blasint n, FLOAT *a, blasint lda, FLOAT *x, blasint incx) { + + int trans, uplo, unit; + blasint info; + FLOAT *buffer; + + PRINT_DEBUG_CNAME; + + unit = -1; + uplo = -1; + trans = -1; + info = 0; + + if (order == CblasColMajor) { + if (Uplo == CblasUpper) uplo = 0; + if (Uplo == CblasLower) uplo = 1; + + if (TransA == CblasNoTrans) trans = 0; + if (TransA == CblasTrans) trans = 1; + if (TransA == CblasConjNoTrans) trans = 2; + if (TransA == CblasConjTrans) trans = 3; + + if (Diag == CblasUnit) unit = 0; + if (Diag == CblasNonUnit) unit = 1; + + info = -1; + + if (incx == 0) info = 8; + if (lda < MAX(1, n)) info = 6; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (order == CblasRowMajor) { + if (Uplo == CblasUpper) uplo = 1; + if (Uplo == CblasLower) uplo = 0; + + if (TransA == CblasNoTrans) trans = 1; + if (TransA == CblasTrans) trans = 0; + if (TransA == CblasConjNoTrans) trans = 3; + if (TransA == CblasConjTrans) trans = 2; + + if (Diag == CblasUnit) unit = 0; + if (Diag == CblasNonUnit) unit = 1; + + info = -1; + + if (incx == 0) info = 
8; + if (lda < MAX(1, n)) info = 6; + if (n < 0) info = 4; + if (unit < 0) info = 3; + if (trans < 0) info = 2; + if (uplo < 0) info = 1; + } + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if (n == 0) return; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + if (incx < 0 ) x -= (n - 1) * incx * 2; + + buffer = (FLOAT *)blas_memory_alloc(1); + + (trsv[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer); + + blas_memory_free(buffer); + + FUNCTION_PROFILE_END(4, n * n / 2 + n, n * n); + + IDEBUG_END; + + return; +} diff --git a/interface/ztrti2.c b/interface/ztrti2.c new file mode 100644 index 0000000000..017374c376 --- /dev/null +++ b/interface/ztrti2.c @@ -0,0 +1,134 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XTRTI2" +#elif defined(DOUBLE) +#define ERROR_NAME "ZTRTI2" +#else +#define ERROR_NAME "CTRTI2" +#endif + +static blasint (*trti2[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { +#ifdef XDOUBLE + xtrti2_UU, xtrti2_UN, xtrti2_LU, xtrti2_LN, +#elif defined(DOUBLE) + ztrti2_UU, ztrti2_UN, ztrti2_LU, ztrti2_LN, +#else + ctrti2_UU, ctrti2_UN, ctrti2_LU, ctrti2_LN, +#endif + }; + +int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ + + blas_arg_t args; + + blasint uplo_arg = *UPLO; + blasint diag_arg = *DIAG; + blasint uplo, diag; + blasint info; + FLOAT *buffer; +#ifdef PPC440 + extern +#endif + FLOAT *sa, *sb; + + PRINT_DEBUG_NAME; + + args.n = *N; + args.a = (void *)a; + args.lda = *ldA; + + TOUPPER(uplo_arg); + TOUPPER(diag_arg); + + uplo = -1; + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + diag = -1; + if (diag_arg == 'U') diag = 0; + if (diag_arg == 'N') diag = 1; + + info = 0; + if (args.lda < MAX(1,args.n)) info = 5; + if (args.n < 0) info = 3; + if (diag < 0) info = 2; + if (uplo < 0) info = 1; + if (info) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + *Info = - info; + return 0; + } + + *Info = 0; + + if (args.n <= 0) return 0; + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + +#ifndef PPC440 + buffer = (FLOAT *)blas_memory_alloc(1); + + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); + sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif + + info = (trti2[(uplo << 1) | diag])(&args, NULL, NULL, sa, sb, 0); + + *Info = info; + +#ifndef PPC440 + blas_memory_free(buffer); +#endif + + FUNCTION_PROFILE_END(1, .5 * args.n * args.n, + 2. * args.n * (1./3. + args.n * ( 1./2. + args.n * 1./6.)) + + 6. * args.n * (1./3. + args.n * (-1./2. + args.n * 1./6.))); + + IDEBUG_END; + + return 0; +} diff --git a/interface/ztrtri.c b/interface/ztrtri.c new file mode 100644 index 0000000000..89caf80d38 --- /dev/null +++ b/interface/ztrtri.c @@ -0,0 +1,154 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifdef XDOUBLE +#define ERROR_NAME "XTRTRI" +#elif defined(DOUBLE) +#define ERROR_NAME "ZTRTRI" +#else +#define ERROR_NAME "CTRTRI" +#endif + +static blasint (*trtri_single[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) ={ + TRTRI_UU_SINGLE, TRTRI_UN_SINGLE, TRTRI_LU_SINGLE, TRTRI_LN_SINGLE, +}; + +#ifdef SMP +static blasint (*trtri_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) ={ + TRTRI_UU_PARALLEL, TRTRI_UN_PARALLEL, TRTRI_LU_PARALLEL, TRTRI_LN_PARALLEL, +}; +#endif + +int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){ + + blas_arg_t args; + + blasint uplo_arg = *UPLO; + blasint diag_arg = *DIAG; + blasint uplo, diag; + blasint info; + FLOAT *buffer; +#ifdef PPC440 + extern +#endif + FLOAT *sa, *sb; + + PRINT_DEBUG_NAME; + + args.n = *N; + args.a = (void *)a; + args.lda = *ldA; + + TOUPPER(uplo_arg); + TOUPPER(diag_arg); + + uplo = -1; + if (uplo_arg == 'U') uplo = 0; + if (uplo_arg == 'L') uplo = 1; + diag = -1; + if (diag_arg == 'U') diag = 0; + if (diag_arg == 'N') diag = 1; + + info = 0; + if (args.lda < MAX(1,args.n)) info = 5; + if (args.n < 0) info = 3; + if (diag < 0) info = 2; + if (uplo < 0) info = 1; + if (info) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + *Info = - info; + return 0; + } + + *Info = 0; + + if (args.n == 0) return 0; + + if (diag) { + if (AMIN_K(args.n, args.a, args.lda + 1) == ZERO) { + *Info = IAMIN_K(args.n, args.a, args.lda + 1); + return 0; + } + } + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + +#ifndef PPC440 + buffer = (FLOAT *)blas_memory_alloc(1); + + sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); + sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif + +#ifdef SMP + args.common = NULL; + args.nthreads = num_cpu_avail(4); + + if (args.nthreads == 1) { +#endif + + *Info = (trtri_single[(uplo << 1) | diag])(&args, NULL, NULL, sa, sb, 0); + +#ifdef SMP + } else { + + *Info = (trtri_parallel[(uplo << 1) | diag])(&args, NULL, NULL, sa, sb, 0); + + } +#endif + +#ifndef PPC440 + blas_memory_free(buffer); +#endif + + FUNCTION_PROFILE_END(1, .5 * args.n * args.n, + 2. * args.n * (1./3. + args.n * ( 1./2. + args.n * 1./6.)) + + 6. * args.n * (1./3. + args.n * (-1./2. 
+ args.n * 1./6.))); + + IDEBUG_END; + + return 0; +} diff --git a/kernel/Makefile b/kernel/Makefile new file mode 100644 index 0000000000..6084cbc3f5 --- /dev/null +++ b/kernel/Makefile @@ -0,0 +1,121 @@ +ifdef TARGET_CORE +TARGET = $(TARGET_CORE) +endif + +TOPDIR = .. +include $(TOPDIR)/Makefile.system + +ifdef TARGET_CORE +CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) +BUILD_KERNEL = 1 +KDIR = +TSUFFIX = _$(TARGET_CORE) +else +TARGET_CORE = $(CORE) +KDIR = +TSUFFIX = +endif + +-include $(KERNELDIR)/KERNEL.$(TARGET_CORE) + +include $(KERNELDIR)/KERNEL + +include Makefile.L1 + +include Makefile.L2 + +include Makefile.L3 + +include Makefile.LA + +HPLOBJS = \ + dgemm_kernel.$(SUFFIX) \ + $(DGEMMINCOPYOBJ) $(DGEMMITCOPYOBJ) \ + $(DGEMMONCOPYOBJ) $(DGEMMOTCOPYOBJ) \ + dtrsm_kernel_LN.$(SUFFIX) dtrsm_kernel_LT.$(SUFFIX) \ + dtrsm_kernel_RN.$(SUFFIX) dtrsm_kernel_RT.$(SUFFIX) \ + daxpy_k.$(SUFFIX) dcopy_k.$(SUFFIX) ddot_k.$(SUFFIX) \ + dger_k.$(SUFFIX) dscal_k.$(SUFFIX) idamax_k.$(SUFFIX) \ + dgemv_n.$(SUFFIX) dgemv_t.$(SUFFIX) dgemm_beta.$(SUFFIX) \ + dtrsm_iunucopy.$(SUFFIX) dtrsm_iunncopy.$(SUFFIX) \ + dtrsm_ilnucopy.$(SUFFIX) dtrsm_ilnncopy.$(SUFFIX) \ + dtrsm_iutucopy.$(SUFFIX) dtrsm_iutncopy.$(SUFFIX) \ + dtrsm_iltucopy.$(SUFFIX) dtrsm_iltncopy.$(SUFFIX) \ + dtrsm_ounucopy.$(SUFFIX) dtrsm_ounncopy.$(SUFFIX) \ + dtrsm_olnucopy.$(SUFFIX) dtrsm_olnncopy.$(SUFFIX) \ + dtrsm_outucopy.$(SUFFIX) dtrsm_outncopy.$(SUFFIX) \ + dtrsm_oltucopy.$(SUFFIX) dtrsm_oltncopy.$(SUFFIX) + +COMMONOBJS += lsame.$(SUFFIX) scabs1.$(SUFFIX) dcabs1.$(SUFFIX) + +ifdef DYNAMIC_ARCH +SBLASOBJS += setparam$(TSUFFIX).$(SUFFIX) +CCOMMON_OPT += -DTS=$(TSUFFIX) +endif + +ifeq ($(ARCH), x86) +COMMONOBJS += cpuid.$(SUFFIX) +endif + +ifdef EXPRECISION +COMMONOBJS += qconjg.$(SUFFIX) qcabs1.$(SUFFIX) +endif + +ifdef QUAD_PRECISION +COMMONOBJS += qconjg.$(SUFFIX) qcabs1.$(SUFFIX) +endif + +all : libs + +scabs1.$(SUFFIX): $(KERNELDIR)/$(SCABS_KERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DF_INTERFACE $< -o $(@F) + +dcabs1.$(SUFFIX): $(KERNELDIR)/$(DCABS_KERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DF_INTERFACE $< -o $(@F) + +qcabs1.$(SUFFIX): $(KERNELDIR)/$(QCABS_KERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DF_INTERFACE $< -o $(@F) + +qconjg.$(SUFFIX): $(KERNELDIR)/qconjg.S + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DF_INTERFACE $< -o $(@F) + +lsame.$(SUFFIX): $(KERNELDIR)/$(LSAME_KERNEL) + $(CC) -c $(CFLAGS) -DF_INTERFACE $< -o $(@F) + +setparam$(TSUFFIX).$(SUFFIX): setparam$(TSUFFIX).c kernel$(TSUFFIX).h + $(CC) -c $(CFLAGS) $< -o $@ + +setparam$(TSUFFIX).c : setparam-ref.c + sed 's/TS/$(TSUFFIX)/g' $< > $(@F) + +kernel$(TSUFFIX).h : ../common_level1.h ../common_level2.h ../common_level3.h ../common_lapack.h + sed 's/\ *(/$(TSUFFIX)(/g' $^ > $(@F) + +cpuid.$(SUFFIX): $(KERNELDIR)/cpuid.S + $(CC) -c $(CFLAGS) $< -o $(@F) + +scabs1.$(PSUFFIX): $(KERNELDIR)/$(SCABS_KERNEL) + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DF_INTERFACE $< -o $(@F) + +dcabs1.$(PSUFFIX): $(KERNELDIR)/$(DCABS_KERNEL) + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DF_INTERFACE $< -o $(@F) + +qcabs1.$(PSUFFIX): $(KERNELDIR)/$(QCABS_KERNEL) + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DF_INTERFACE $< -o $(@F) + +qconjg.$(PSUFFIX): $(KERNELDIR)/qconjg.S + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DF_INTERFACE $< -o $(@F) + +lsame.$(PSUFFIX): $(KERNELDIR)/$(LSAME_KERNEL) + $(CC) -c $(PFLAGS) -DF_INTERFACE $< -o $(@F) + +cpuid.$(PSUFFIX): $(KERNELDIR)/cpuid.S + $(CC) -c $(PFLAGS) $< -o $(@F) + +ifdef DYNAMIC_ARCH +clean :: + @rm -f setparam_*.c 
kernel_*.h setparam.h kernel.h + +endif + +include $(TOPDIR)/Makefile.tail diff --git a/kernel/Makefile.L1 b/kernel/Makefile.L1 new file mode 100644 index 0000000000..317f143631 --- /dev/null +++ b/kernel/Makefile.L1 @@ -0,0 +1,767 @@ +### AMAX ### + +ifndef SAMAXKERNEL +SAMAXKERNEL = amax.S +endif + +ifndef DAMAXKERNEL +DAMAXKERNEL = amax.S +endif + +ifndef QAMAXKERNEL +QAMAXKERNEL = amax.S +endif + +ifndef CAMAXKERNEL +CAMAXKERNEL = zamax.S +endif + +ifndef ZAMAXKERNEL +ZAMAXKERNEL = zamax.S +endif + +ifndef XAMAXKERNEL +XAMAXKERNEL = zamax.S +endif + +### AMIN ### + +ifndef SAMINKERNEL +SAMINKERNEL = amin.S +endif + +ifndef DAMINKERNEL +DAMINKERNEL = amin.S +endif + +ifndef QAMINKERNEL +QAMINKERNEL = amin.S +endif + +ifndef CAMINKERNEL +CAMINKERNEL = zamin.S +endif + +ifndef ZAMINKERNEL +ZAMINKERNEL = zamin.S +endif + +ifndef XAMINKERNEL +XAMINKERNEL = zamin.S +endif + +### MAX ### + +ifndef SMAXKERNEL +SMAXKERNEL = max.S +endif + +ifndef DMAXKERNEL +DMAXKERNEL = max.S +endif + +ifndef QMAXKERNEL +QMAXKERNEL = max.S +endif + +### MIN ### + +ifndef SMINKERNEL +SMINKERNEL = min.S +endif + +ifndef DMINKERNEL +DMINKERNEL = min.S +endif + +ifndef QMINKERNEL +QMINKERNEL = min.S +endif + +### IAMAX ### + +ifndef ISAMAXKERNEL +ISAMAXKERNEL = iamax.S +endif + +ifndef IDAMAXKERNEL +IDAMAXKERNEL = iamax.S +endif + +ifndef IQAMAXKERNEL +IQAMAXKERNEL = iamax.S +endif + +ifndef ICAMAXKERNEL +ICAMAXKERNEL = izamax.S +endif + +ifndef IZAMAXKERNEL +IZAMAXKERNEL = izamax.S +endif + +ifndef IXAMAXKERNEL +IXAMAXKERNEL = izamax.S +endif + +### IAMIN ### + +ifndef ISAMINKERNEL +ISAMINKERNEL = iamin.S +endif + +ifndef IDAMINKERNEL +IDAMINKERNEL = iamin.S +endif + +ifndef IQAMINKERNEL +IQAMINKERNEL = iamin.S +endif + +ifndef ICAMINKERNEL +ICAMINKERNEL = izamin.S +endif + +ifndef IZAMINKERNEL +IZAMINKERNEL = izamin.S +endif + +ifndef IXAMINKERNEL +IXAMINKERNEL = izamin.S +endif + +### IMAX ### + +ifndef ISMAXKERNEL +ISMAXKERNEL = iamax.S +endif + +ifndef IDMAXKERNEL +IDMAXKERNEL = iamax.S +endif + +ifndef IQMAXKERNEL +IQMAXKERNEL = iamax.S +endif + +### IMIN ### + +ifndef ISMINKERNEL +ISMINKERNEL = iamin.S +endif + +ifndef IDMINKERNEL +IDMINKERNEL = iamin.S +endif + +ifndef IQMINKERNEL +IQMINKERNEL = iamin.S +endif + +### ASUM ### + +ifndef SASUMKERNEL +SASUMKERNEL = asum.S +endif + +ifndef DASUMKERNEL +DASUMKERNEL = asum.S +endif + +ifndef CASUMKERNEL +CASUMKERNEL = zasum.S +endif + +ifndef ZASUMKERNEL +ZASUMKERNEL = zasum.S +endif + +ifndef QASUMKERNEL +QASUMKERNEL = asum.S +endif + +ifndef XASUMKERNEL +XASUMKERNEL = zasum.S +endif + +### AXPY ### + +ifndef SAXPYKERNEL +SAXPYKERNEL = axpy.S +endif + +ifndef DAXPYKERNEL +DAXPYKERNEL = axpy.S +endif + +ifndef CAXPYKERNEL +CAXPYKERNEL = zaxpy.S +endif + +ifndef ZAXPYKERNEL +ZAXPYKERNEL = zaxpy.S +endif + +ifndef QAXPYKERNEL +QAXPYKERNEL = axpy.S +endif + +ifndef XAXPYKERNEL +XAXPYKERNEL = zaxpy.S +endif + +### COPY ### + +ifndef SCOPYKERNEL +SCOPYKERNEL = copy.S +endif + +ifndef DCOPYKERNEL +DCOPYKERNEL = copy.S +endif + +ifndef CCOPYKERNEL +CCOPYKERNEL = zcopy.S +endif + +ifndef ZCOPYKERNEL +ZCOPYKERNEL = zcopy.S +endif + +ifndef QCOPYKERNEL +QCOPYKERNEL = copy.S +endif + +ifndef XCOPYKERNEL +XCOPYKERNEL = zcopy.S +endif + +### DOT ### + +ifndef SDOTKERNEL +SDOTKERNEL = dot.S +endif + +ifndef DDOTKERNEL +DDOTKERNEL = dot.S +endif + +ifndef CDOTKERNEL +CDOTKERNEL = zdot.S +endif + +ifndef ZDOTKERNEL +ZDOTKERNEL = zdot.S +endif + +ifndef QDOTKERNEL +QDOTKERNEL = dot.S +endif + +ifndef XDOTKERNEL +XDOTKERNEL = zdot.S +endif + +### NRM2 ### + +ifndef SNRM2KERNEL 
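+# Default NRM2 kernel source, used when the per-core KERNEL.$(TARGET_CORE) file does not override SNRM2KERNEL.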
+SNRM2KERNEL = nrm2.S +endif + +ifndef DNRM2KERNEL +DNRM2KERNEL = nrm2.S +endif + +ifndef QNRM2KERNEL +QNRM2KERNEL = nrm2.S +endif + +ifndef CNRM2KERNEL +CNRM2KERNEL = znrm2.S +endif + +ifndef ZNRM2KERNEL +ZNRM2KERNEL = znrm2.S +endif + +ifndef XNRM2KERNEL +XNRM2KERNEL = znrm2.S +endif + +### ROT ### + +ifndef SROTKERNEL +SROTKERNEL = rot.S +endif + +ifndef DROTKERNEL +DROTKERNEL = rot.S +endif + +ifndef QROTKERNEL +QROTKERNEL = rot.S +endif + +ifndef CROTKERNEL +CROTKERNEL = zrot.S +endif + +ifndef ZROTKERNEL +ZROTKERNEL = zrot.S +endif + +ifndef XROTKERNEL +XROTKERNEL = zrot.S +endif + +### SCAL ### + +ifndef SSCALKERNEL +SSCALKERNEL = scal.S +endif + +ifndef DSCALKERNEL +DSCALKERNEL = scal.S +endif + +ifndef CSCALKERNEL +CSCALKERNEL = zscal.S +endif + +ifndef ZSCALKERNEL +ZSCALKERNEL = zscal.S +endif + +ifndef QSCALKERNEL +QSCALKERNEL = scal.S +endif + +ifndef XSCALKERNEL +XSCALKERNEL = zscal.S +endif + +### SWAP ### + +ifndef SSWAPKERNEL +SSWAPKERNEL = swap.S +endif + +ifndef DSWAPKERNEL +DSWAPKERNEL = swap.S +endif + +ifndef CSWAPKERNEL +CSWAPKERNEL = zswap.S +endif + +ifndef ZSWAPKERNEL +ZSWAPKERNEL = zswap.S +endif + +ifndef QSWAPKERNEL +QSWAPKERNEL = swap.S +endif + +ifndef XSWAPKERNEL +XSWAPKERNEL = zswap.S +endif + +### GEMV ### + +ifndef SGEMVNKERNEL +SGEMVNKERNEL = gemv_n.S +endif + +ifndef SGEMVTKERNEL +SGEMVTKERNEL = gemv_t.S +endif + +ifndef DGEMVNKERNEL +DGEMVNKERNEL = gemv_n.S +endif + +ifndef DGEMVTKERNEL +DGEMVTKERNEL = gemv_t.S +endif + +ifndef CGEMVNKERNEL +CGEMVNKERNEL = zgemv_n.S +endif + +ifndef CGEMVTKERNEL +CGEMVTKERNEL = zgemv_t.S +endif + +ifndef ZGEMVNKERNEL +ZGEMVNKERNEL = zgemv_n.S +endif + +ifndef ZGEMVTKERNEL +ZGEMVTKERNEL = zgemv_t.S +endif + +ifndef QGEMVNKERNEL +QGEMVNKERNEL = gemv_n.S +endif + +ifndef QGEMVTKERNEL +QGEMVTKERNEL = gemv_t.S +endif + +ifndef XGEMVNKERNEL +XGEMVNKERNEL = zgemv_n.S +endif + +ifndef XGEMVTKERNEL +XGEMVTKERNEL = zgemv_t.S +endif + +ifndef SCABS_KERNEL +SCABS_KERNEL = cabs.S +endif + +ifndef DCABS_KERNEL +DCABS_KERNEL = cabs.S +endif + +ifndef QCABS_KERNEL +QCABS_KERNEL = cabs.S +endif + +ifndef LSAME_KERNEL +LSAME_KERNEL = lsame.S +endif + +SBLASOBJS += \ + samax_k$(TSUFFIX).$(SUFFIX) samin_k$(TSUFFIX).$(SUFFIX) smax_k$(TSUFFIX).$(SUFFIX) smin_k$(TSUFFIX).$(SUFFIX) \ + isamax_k$(TSUFFIX).$(SUFFIX) isamin_k$(TSUFFIX).$(SUFFIX) ismax_k$(TSUFFIX).$(SUFFIX) ismin_k$(TSUFFIX).$(SUFFIX) \ + sasum_k$(TSUFFIX).$(SUFFIX) saxpy_k$(TSUFFIX).$(SUFFIX) scopy_k$(TSUFFIX).$(SUFFIX) \ + sdot_k$(TSUFFIX).$(SUFFIX) sdsdot_k$(TSUFFIX).$(SUFFIX) dsdot_k$(TSUFFIX).$(SUFFIX) \ + snrm2_k$(TSUFFIX).$(SUFFIX) srot_k$(TSUFFIX).$(SUFFIX) sscal_k$(TSUFFIX).$(SUFFIX) sswap_k$(TSUFFIX).$(SUFFIX) + +DBLASOBJS += \ + damax_k$(TSUFFIX).$(SUFFIX) damin_k$(TSUFFIX).$(SUFFIX) dmax_k$(TSUFFIX).$(SUFFIX) dmin_k$(TSUFFIX).$(SUFFIX) \ + idamax_k$(TSUFFIX).$(SUFFIX) idamin_k$(TSUFFIX).$(SUFFIX) idmax_k$(TSUFFIX).$(SUFFIX) idmin_k$(TSUFFIX).$(SUFFIX) \ + dasum_k$(TSUFFIX).$(SUFFIX) daxpy_k$(TSUFFIX).$(SUFFIX) dcopy_k$(TSUFFIX).$(SUFFIX) ddot_k$(TSUFFIX).$(SUFFIX) \ + dnrm2_k$(TSUFFIX).$(SUFFIX) drot_k$(TSUFFIX).$(SUFFIX) dscal_k$(TSUFFIX).$(SUFFIX) dswap_k$(TSUFFIX).$(SUFFIX) + +QBLASOBJS += \ + qamax_k$(TSUFFIX).$(SUFFIX) qamin_k$(TSUFFIX).$(SUFFIX) qmax_k$(TSUFFIX).$(SUFFIX) qmin_k$(TSUFFIX).$(SUFFIX) \ + iqamax_k$(TSUFFIX).$(SUFFIX) iqamin_k$(TSUFFIX).$(SUFFIX) iqmax_k$(TSUFFIX).$(SUFFIX) iqmin_k$(TSUFFIX).$(SUFFIX) \ + qasum_k$(TSUFFIX).$(SUFFIX) qaxpy_k$(TSUFFIX).$(SUFFIX) qcopy_k$(TSUFFIX).$(SUFFIX) qdot_k$(TSUFFIX).$(SUFFIX) \ + qnrm2_k$(TSUFFIX).$(SUFFIX) 
qrot_k$(TSUFFIX).$(SUFFIX) qscal_k$(TSUFFIX).$(SUFFIX) qswap_k$(TSUFFIX).$(SUFFIX) + +CBLASOBJS += \ + camax_k$(TSUFFIX).$(SUFFIX) camin_k$(TSUFFIX).$(SUFFIX) icamax_k$(TSUFFIX).$(SUFFIX) icamin_k$(TSUFFIX).$(SUFFIX) \ + casum_k$(TSUFFIX).$(SUFFIX) caxpy_k$(TSUFFIX).$(SUFFIX) caxpyc_k$(TSUFFIX).$(SUFFIX) ccopy_k$(TSUFFIX).$(SUFFIX) \ + cdotc_k$(TSUFFIX).$(SUFFIX) cdotu_k$(TSUFFIX).$(SUFFIX) cnrm2_k$(TSUFFIX).$(SUFFIX) csrot_k$(TSUFFIX).$(SUFFIX) \ + cscal_k$(TSUFFIX).$(SUFFIX) cswap_k$(TSUFFIX).$(SUFFIX) + +ZBLASOBJS += \ + zamax_k$(TSUFFIX).$(SUFFIX) zamin_k$(TSUFFIX).$(SUFFIX) izamax_k$(TSUFFIX).$(SUFFIX) izamin_k$(TSUFFIX).$(SUFFIX) \ + zasum_k$(TSUFFIX).$(SUFFIX) zaxpy_k$(TSUFFIX).$(SUFFIX) zaxpyc_k$(TSUFFIX).$(SUFFIX) zcopy_k$(TSUFFIX).$(SUFFIX) \ + zdotc_k$(TSUFFIX).$(SUFFIX) zdotu_k$(TSUFFIX).$(SUFFIX) znrm2_k$(TSUFFIX).$(SUFFIX) zdrot_k$(TSUFFIX).$(SUFFIX) \ + zscal_k$(TSUFFIX).$(SUFFIX) zswap_k$(TSUFFIX).$(SUFFIX) + +XBLASOBJS += \ + xamax_k$(TSUFFIX).$(SUFFIX) xamin_k$(TSUFFIX).$(SUFFIX) ixamax_k$(TSUFFIX).$(SUFFIX) ixamin_k$(TSUFFIX).$(SUFFIX) \ + xasum_k$(TSUFFIX).$(SUFFIX) xaxpy_k$(TSUFFIX).$(SUFFIX) xaxpyc_k$(TSUFFIX).$(SUFFIX) xcopy_k$(TSUFFIX).$(SUFFIX) \ + xdotc_k$(TSUFFIX).$(SUFFIX) xdotu_k$(TSUFFIX).$(SUFFIX) xnrm2_k$(TSUFFIX).$(SUFFIX) xqrot_k$(TSUFFIX).$(SUFFIX) \ + xscal_k$(TSUFFIX).$(SUFFIX) xswap_k$(TSUFFIX).$(SUFFIX) + +### AMAX ### + + + + +$(KDIR)samax_k$(TSUFFIX).$(SUFFIX) $(KDIR)samax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAMAXKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@ + +$(KDIR)damax_k$(TSUFFIX).$(SUFFIX) $(KDIR)damax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DAMAXKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@ + +$(KDIR)qamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)qamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QAMAXKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@ + +$(KDIR)camax_k$(TSUFFIX).$(SUFFIX) $(KDIR)camax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAMAXKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@ + +$(KDIR)zamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)zamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAMAXKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@ + +$(KDIR)xamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)xamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XAMAXKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@ + +### AMIN ### + +$(KDIR)samin_k$(TSUFFIX).$(SUFFIX) $(KDIR)samin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAMINKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@ + +$(KDIR)damin_k$(TSUFFIX).$(SUFFIX) $(KDIR)damin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DAMINKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@ + +$(KDIR)qamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)qamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QAMINKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@ + +$(KDIR)camin_k$(TSUFFIX).$(SUFFIX) $(KDIR)camin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAMINKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@ + +$(KDIR)zamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)zamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAMINKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@ + +$(KDIR)xamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)xamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XAMINKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@ + +### MAX ### + +$(KDIR)smax_k$(TSUFFIX).$(SUFFIX) 
$(KDIR)smax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SMAXKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUSE_ABS -UUSE_MIN $< -o $@ + +$(KDIR)dmax_k$(TSUFFIX).$(SUFFIX) $(KDIR)dmax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DMAXKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUSE_ABS -UUSE_MIN $< -o $@ + +$(KDIR)qmax_k$(TSUFFIX).$(SUFFIX) $(KDIR)qmax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QMAXKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUSE_ABS -UUSE_MIN $< -o $@ + +### MIN ### + +$(KDIR)smin_k$(TSUFFIX).$(SUFFIX) $(KDIR)smin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SMINKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUSE_ABS -DUSE_MIN $< -o $@ + +$(KDIR)dmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)dmin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DMINKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUSE_ABS -DUSE_MIN $< -o $@ + +$(KDIR)qmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)qmin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QMINKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUSE_ABS -DUSE_MIN $< -o $@ + + +### IAMAX ### + +$(KDIR)isamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)isamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ISAMAXKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@ + +$(KDIR)idamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)idamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IDAMAXKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@ + +$(KDIR)iqamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)iqamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IQAMAXKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@ + +$(KDIR)icamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)icamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ICAMAXKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@ + +$(KDIR)izamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)izamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IZAMAXKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@ + +$(KDIR)ixamax_k$(TSUFFIX).$(SUFFIX) $(KDIR)ixamax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IXAMAXKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUSE_ABS -UUSE_MIN $< -o $@ + +### IAMIN ### + +$(KDIR)isamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)isamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ISAMINKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@ + +$(KDIR)idamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)idamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IDAMINKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@ + +$(KDIR)iqamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)iqamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IQAMINKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@ + +$(KDIR)icamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)icamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ICAMINKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@ + +$(KDIR)izamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)izamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IZAMINKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@ + +$(KDIR)ixamin_k$(TSUFFIX).$(SUFFIX) $(KDIR)ixamin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IXAMINKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUSE_ABS -DUSE_MIN $< -o $@ + +### IMAX ### + +$(KDIR)ismax_k$(TSUFFIX).$(SUFFIX) $(KDIR)ismax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ISMAXKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUSE_ABS -UUSE_MIN $< -o $@ + +$(KDIR)idmax_k$(TSUFFIX).$(SUFFIX) $(KDIR)idmax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IDMAXKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUSE_ABS -UUSE_MIN $< -o $@ + 
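Across the MAX, MIN, AMAX and AMIN rules and their index-returning I* counterparts, the four behaviours are selected by just two preprocessor switches: USE_ABS makes the comparison act on absolute values, and USE_MIN searches for the minimum instead of the maximum. A minimal sketch of that grid, assuming a hypothetical shared source minmax.c (the per-architecture KERNEL files may map the four kernels to one file or to several):

# Sketch only: the USE_ABS / USE_MIN grid behind the (i)amax/amin/max/min rules.
DEFS_amax = -DUSE_ABS -UUSE_MIN
DEFS_amin = -DUSE_ABS -DUSE_MIN
DEFS_max  = -UUSE_ABS -UUSE_MIN
DEFS_min  = -UUSE_ABS -DUSE_MIN

s%_k.o: minmax.c
	$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $(DEFS_$*) $< -o $@

The complex rules (c/z/x prefixes) add -DCOMPLEX on top of the same two switches.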
+$(KDIR)iqmax_k$(TSUFFIX).$(SUFFIX) $(KDIR)iqmax_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IQMAXKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUSE_ABS -UUSE_MIN $< -o $@ + +### IMIN ### + +$(KDIR)ismin_k$(TSUFFIX).$(SUFFIX) $(KDIR)ismin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ISMINKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUSE_ABS -DUSE_MIN $< -o $@ + +$(KDIR)idmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)idmin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IDMINKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUSE_ABS -DUSE_MIN $< -o $@ + +$(KDIR)iqmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)iqmin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IQMINKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUSE_ABS -DUSE_MIN $< -o $@ + + +$(KDIR)sasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)sasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SASUMKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ + +$(KDIR)dasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)dasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DASUMKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@ + +$(KDIR)qasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)qasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QASUMKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@ + +$(KDIR)casum_k$(TSUFFIX).$(SUFFIX) $(KDIR)casum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CASUMKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $@ + +$(KDIR)zasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)zasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZASUMKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $@ + +$(KDIR)xasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)xasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XASUMKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@ + +$(KDIR)saxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)saxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAXPYKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@ + +$(KDIR)daxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)daxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DAXPYKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@ + +$(KDIR)qaxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)qaxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QAXPYKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@ + +$(KDIR)caxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)caxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAXPYKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -UDOUBLE $< -o $@ + +$(KDIR)zaxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)zaxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAXPYKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -DDOUBLE $< -o $@ + +$(KDIR)xaxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)xaxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XAXPYKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -DXDOUBLE $< -o $@ + +$(KDIR)caxpyc_k$(TSUFFIX).$(SUFFIX) $(KDIR)caxpyc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAXPYKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -DCONJ -UDOUBLE $< -o $@ + +$(KDIR)zaxpyc_k$(TSUFFIX).$(SUFFIX) $(KDIR)zaxpyc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAXPYKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -DCONJ -DDOUBLE $< -o $@ + +$(KDIR)xaxpyc_k$(TSUFFIX).$(SUFFIX) $(KDIR)xaxpyc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XAXPYKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -DCONJ -DXDOUBLE $< -o $@ + +$(KDIR)scopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)scopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SCOPYKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UCOMPLEX -DC_INTERFACE $< -o $@ + +$(KDIR)dcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)dcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DCOPYKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UCOMPLEX -DC_INTERFACE $< -o $@ + 
+$(KDIR)qcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)qcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QCOPYKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UCOMPLEX -DC_INTERFACE $< -o $@ + +$(KDIR)ccopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)ccopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CCOPYKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DCOMPLEX -DC_INTERFACE $< -o $@ + +$(KDIR)zcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)zcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZCOPYKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DCOMPLEX -DC_INTERFACE $< -o $@ + +$(KDIR)xcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)xcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XCOPYKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DCOMPLEX -DC_INTERFACE $< -o $@ + +$(KDIR)ddot_k$(TSUFFIX).$(SUFFIX) $(KDIR)ddot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DDOTKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@ + +$(KDIR)qdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QDOTKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@ + +$(KDIR)dsdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)dsdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ + +$(KDIR)sdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)sdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ + +$(KDIR)sdsdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)sdsdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ + +$(KDIR)zdotu_k$(TSUFFIX).$(SUFFIX) $(KDIR)zdotu_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZDOTKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UCONJ $< -o $@ + +$(KDIR)zdotc_k$(TSUFFIX).$(SUFFIX) $(KDIR)zdotc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZDOTKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DCONJ $< -o $@ + +$(KDIR)xdotu_k$(TSUFFIX).$(SUFFIX) $(KDIR)xdotu_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XDOTKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UCONJ $< -o $@ + +$(KDIR)xdotc_k$(TSUFFIX).$(SUFFIX) $(KDIR)xdotc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XDOTKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DCONJ $< -o $@ + +$(KDIR)cdotu_k$(TSUFFIX).$(SUFFIX) $(KDIR)cdotu_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CDOTKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UCONJ $< -o $@ + +$(KDIR)cdotc_k$(TSUFFIX).$(SUFFIX) $(KDIR)cdotc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CDOTKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DCONJ $< -o $@ + +$(KDIR)snrm2_k$(TSUFFIX).$(SUFFIX) $(KDIR)snrm2_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SNRM2KERNEL) + $(CC) $(CFLAGS) -UCOMPLEX -c -UDOUBLE $< -o $@ + +$(KDIR)dnrm2_k$(TSUFFIX).$(SUFFIX) $(KDIR)dnrm2_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DNRM2KERNEL) + $(CC) $(CFLAGS) -UCOMPLEX -c -DDOUBLE $< -o $@ + +$(KDIR)qnrm2_k$(TSUFFIX).$(SUFFIX) $(KDIR)qnrm2_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QNRM2KERNEL) + $(CC) $(CFLAGS) -UCOMPLEX -c -DXDOUBLE $< -o $@ + +$(KDIR)cnrm2_k$(TSUFFIX).$(SUFFIX) $(KDIR)cnrm2_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CNRM2KERNEL) + $(CC) $(CFLAGS) -DCOMPLEX -c -UDOUBLE $< -o $@ + +$(KDIR)znrm2_k$(TSUFFIX).$(SUFFIX) $(KDIR)znrm2_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZNRM2KERNEL) + $(CC) $(CFLAGS) -DCOMPLEX -c -DDOUBLE $< -o $@ + +$(KDIR)xnrm2_k$(TSUFFIX).$(SUFFIX) $(KDIR)xnrm2_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XNRM2KERNEL) + $(CC) $(CFLAGS) -DCOMPLEX -c -DXDOUBLE $< -o $@ + +$(KDIR)srot_k$(TSUFFIX).$(SUFFIX) $(KDIR)srot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SROTKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@ + 
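The ASUM, AXPY, COPY, DOT, NRM2, ROT, SCAL and SWAP rules in this file all follow one pattern: a single generic kernel source per operation is compiled once per numeric type, with -UDOUBLE/-DDOUBLE/-DXDOUBLE selecting single, double or extended precision, -UCOMPLEX/-DCOMPLEX selecting the real or complex field, and CONJ distinguishing the conjugated complex variants. A minimal sketch of the pattern, assuming a hypothetical generic source kernel.c and a made-up variant table:

# Sketch only: one generic source, several objects, variant picked by defines.
VARIANTS  = s d q
DEFINES_s = -UCOMPLEX -UDOUBLE
DEFINES_d = -UCOMPLEX -DDOUBLE
DEFINES_q = -UCOMPLEX -DXDOUBLE

all: $(foreach p,$(VARIANTS),$(p)kernel.o)

%kernel.o: kernel.c
	$(CC) -c $(CFLAGS) $(DEFINES_$*) $< -o $@

In the rules here, the kernel source itself comes from the *KERNEL variables: a per-architecture KERNEL.$(TARGET_CORE) file may override any of them, and whatever it leaves unset falls back to the ifndef defaults at the top of this Makefile.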
+$(KDIR)drot_k$(TSUFFIX).$(SUFFIX) $(KDIR)drot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DROTKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@ + +$(KDIR)qrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QROTKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@ + +$(KDIR)csrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)csrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CROTKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UDOUBLE $< -o $@ + +$(KDIR)zdrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)zdrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZROTKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -DDOUBLE $< -o $@ + +$(KDIR)xqrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)xqrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XROTKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -DXDOUBLE $< -o $@ + +$(KDIR)sscal_k$(TSUFFIX).$(SUFFIX) $(KDIR)sscal_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SSCALKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ + +$(KDIR)dscal_k$(TSUFFIX).$(SUFFIX) $(KDIR)dscal_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSCALKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@ + +$(KDIR)qscal_k$(TSUFFIX).$(SUFFIX) $(KDIR)qscal_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QSCALKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@ + +$(KDIR)cscal_k$(TSUFFIX).$(SUFFIX) $(KDIR)cscal_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CSCALKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $@ + +$(KDIR)zscal_k$(TSUFFIX).$(SUFFIX) $(KDIR)zscal_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZSCALKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $@ + +$(KDIR)xscal_k$(TSUFFIX).$(SUFFIX) $(KDIR)xscal_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XSCALKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@ + +$(KDIR)sswap_k$(TSUFFIX).$(SUFFIX) $(KDIR)sswap_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SSWAPKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ + +$(KDIR)dswap_k$(TSUFFIX).$(SUFFIX) $(KDIR)dswap_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSWAPKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@ + +$(KDIR)qswap_k$(TSUFFIX).$(SUFFIX) $(KDIR)qswap_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QSWAPKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@ + +$(KDIR)cswap_k$(TSUFFIX).$(SUFFIX) $(KDIR)cswap_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CSWAPKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $@ + +$(KDIR)zswap_k$(TSUFFIX).$(SUFFIX) $(KDIR)zswap_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZSWAPKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $@ + +$(KDIR)xswap_k$(TSUFFIX).$(SUFFIX) $(KDIR)xswap_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XSWAPKERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@ diff --git a/kernel/Makefile.L2 b/kernel/Makefile.L2 new file mode 100644 index 0000000000..f26292d8f9 --- /dev/null +++ b/kernel/Makefile.L2 @@ -0,0 +1,428 @@ +### GEMV ### + +ifndef SGEMVNKERNEL +SGEMVNKERNEL = gemv_n.S +endif + +ifndef SGEMVTKERNEL +SGEMVTKERNEL = gemv_t.S +endif + +ifndef DGEMVNKERNEL +DGEMVNKERNEL = gemv_n.S +endif + +ifndef DGEMVTKERNEL +DGEMVTKERNEL = gemv_t.S +endif + +ifndef CGEMVNKERNEL +CGEMVNKERNEL = zgemv_n.S +endif + +ifndef CGEMVTKERNEL +CGEMVTKERNEL = zgemv_t.S +endif + +ifndef ZGEMVNKERNEL +ZGEMVNKERNEL = zgemv_n.S +endif + +ifndef ZGEMVTKERNEL +ZGEMVTKERNEL = zgemv_t.S +endif + +ifndef QGEMVNKERNEL +QGEMVNKERNEL = gemv_n.S +endif + +ifndef QGEMVTKERNEL +QGEMVTKERNEL = gemv_t.S +endif + +ifndef XGEMVNKERNEL +XGEMVNKERNEL = zgemv_n.S +endif + +ifndef XGEMVTKERNEL +XGEMVTKERNEL = zgemv_t.S +endif + +### GER ### + +ifndef SGERKERNEL +SGERKERNEL = 
../generic/ger.c +endif + +ifndef DGERKERNEL +DGERKERNEL = ../generic/ger.c +endif + +ifndef QGERKERNEL +QGERKERNEL = ../generic/ger.c +endif + +ifndef CGERUKERNEL +CGERUKERNEL = ../generic/zger.c +endif + +ifndef CGERCKERNEL +CGERCKERNEL = ../generic/zger.c +endif + +ifndef ZGERUKERNEL +ZGERUKERNEL = ../generic/zger.c +endif + +ifndef ZGERCKERNEL +ZGERCKERNEL = ../generic/zger.c +endif + +ifndef XGERUKERNEL +XGERUKERNEL = ../generic/zger.c +endif + +ifndef XGERCKERNEL +XGERCKERNEL = ../generic/zger.c +endif + +### SYMV ### + +ifndef SSYMV_U_KERNEL +SSYMV_U_KERNEL = ../generic/symv_k.c +endif + +ifndef SSYMV_L_KERNEL +SSYMV_L_KERNEL = ../generic/symv_k.c +endif + +ifndef DSYMV_U_KERNEL +DSYMV_U_KERNEL = ../generic/symv_k.c +endif + +ifndef DSYMV_L_KERNEL +DSYMV_L_KERNEL = ../generic/symv_k.c +endif + +ifndef QSYMV_U_KERNEL +QSYMV_U_KERNEL = ../generic/symv_k.c +endif + +ifndef QSYMV_L_KERNEL +QSYMV_L_KERNEL = ../generic/symv_k.c +endif + +ifndef CSYMV_U_KERNEL +CSYMV_U_KERNEL = ../generic/zsymv_k.c +endif + +ifndef CSYMV_L_KERNEL +CSYMV_L_KERNEL = ../generic/zsymv_k.c +endif + +ifndef ZSYMV_U_KERNEL +ZSYMV_U_KERNEL = ../generic/zsymv_k.c +endif + +ifndef ZSYMV_L_KERNEL +ZSYMV_L_KERNEL = ../generic/zsymv_k.c +endif + +ifndef XSYMV_U_KERNEL +XSYMV_U_KERNEL = ../generic/zsymv_k.c +endif + +ifndef XSYMV_L_KERNEL +XSYMV_L_KERNEL = ../generic/zsymv_k.c +endif + +### HEMV ### + +ifndef CHEMV_U_KERNEL +CHEMV_U_KERNEL = ../generic/zhemv_k.c +endif + +ifndef CHEMV_L_KERNEL +CHEMV_L_KERNEL = ../generic/zhemv_k.c +endif + +ifndef CHEMV_V_KERNEL +CHEMV_V_KERNEL = ../generic/zhemv_k.c +endif + +ifndef CHEMV_M_KERNEL +CHEMV_M_KERNEL = ../generic/zhemv_k.c +endif + +ifndef ZHEMV_U_KERNEL +ZHEMV_U_KERNEL = ../generic/zhemv_k.c +endif + +ifndef ZHEMV_L_KERNEL +ZHEMV_L_KERNEL = ../generic/zhemv_k.c +endif + +ifndef ZHEMV_V_KERNEL +ZHEMV_V_KERNEL = ../generic/zhemv_k.c +endif + +ifndef ZHEMV_M_KERNEL +ZHEMV_M_KERNEL = ../generic/zhemv_k.c +endif + +ifndef XHEMV_U_KERNEL +XHEMV_U_KERNEL = ../generic/zhemv_k.c +endif + +ifndef XHEMV_L_KERNEL +XHEMV_L_KERNEL = ../generic/zhemv_k.c +endif + +ifndef XHEMV_V_KERNEL +XHEMV_V_KERNEL = ../generic/zhemv_k.c +endif + +ifndef XHEMV_M_KERNEL +XHEMV_M_KERNEL = ../generic/zhemv_k.c +endif + +SBLASOBJS += \ + sgemv_n$(TSUFFIX).$(SUFFIX) sgemv_t$(TSUFFIX).$(SUFFIX) ssymv_U$(TSUFFIX).$(SUFFIX) ssymv_L$(TSUFFIX).$(SUFFIX) \ + sger_k$(TSUFFIX).$(SUFFIX) + +DBLASOBJS += \ + dgemv_n$(TSUFFIX).$(SUFFIX) dgemv_t$(TSUFFIX).$(SUFFIX) dsymv_U$(TSUFFIX).$(SUFFIX) dsymv_L$(TSUFFIX).$(SUFFIX) \ + dger_k$(TSUFFIX).$(SUFFIX) + +QBLASOBJS += \ + qgemv_n$(TSUFFIX).$(SUFFIX) qgemv_t$(TSUFFIX).$(SUFFIX) qsymv_U$(TSUFFIX).$(SUFFIX) qsymv_L$(TSUFFIX).$(SUFFIX) \ + qger_k$(TSUFFIX).$(SUFFIX) + +CBLASOBJS += \ + cgemv_n$(TSUFFIX).$(SUFFIX) cgemv_t$(TSUFFIX).$(SUFFIX) cgemv_r$(TSUFFIX).$(SUFFIX) cgemv_c$(TSUFFIX).$(SUFFIX) \ + cgemv_o$(TSUFFIX).$(SUFFIX) cgemv_u$(TSUFFIX).$(SUFFIX) cgemv_s$(TSUFFIX).$(SUFFIX) cgemv_d$(TSUFFIX).$(SUFFIX) \ + csymv_U$(TSUFFIX).$(SUFFIX) csymv_L$(TSUFFIX).$(SUFFIX) \ + chemv_U$(TSUFFIX).$(SUFFIX) chemv_L$(TSUFFIX).$(SUFFIX) chemv_V$(TSUFFIX).$(SUFFIX) chemv_M$(TSUFFIX).$(SUFFIX) \ + cgeru_k$(TSUFFIX).$(SUFFIX) cgerc_k$(TSUFFIX).$(SUFFIX) cgerv_k$(TSUFFIX).$(SUFFIX) cgerd_k$(TSUFFIX).$(SUFFIX) + +ZBLASOBJS += \ + zgemv_n$(TSUFFIX).$(SUFFIX) zgemv_t$(TSUFFIX).$(SUFFIX) zgemv_r$(TSUFFIX).$(SUFFIX) zgemv_c$(TSUFFIX).$(SUFFIX) \ + zgemv_o$(TSUFFIX).$(SUFFIX) zgemv_u$(TSUFFIX).$(SUFFIX) zgemv_s$(TSUFFIX).$(SUFFIX) zgemv_d$(TSUFFIX).$(SUFFIX) \ + zsymv_U$(TSUFFIX).$(SUFFIX) 
zsymv_L$(TSUFFIX).$(SUFFIX) \ + zhemv_U$(TSUFFIX).$(SUFFIX) zhemv_L$(TSUFFIX).$(SUFFIX) zhemv_V$(TSUFFIX).$(SUFFIX) zhemv_M$(TSUFFIX).$(SUFFIX) \ + zgeru_k$(TSUFFIX).$(SUFFIX) zgerc_k$(TSUFFIX).$(SUFFIX) zgerv_k$(TSUFFIX).$(SUFFIX) zgerd_k$(TSUFFIX).$(SUFFIX) + +XBLASOBJS += \ + xgemv_n$(TSUFFIX).$(SUFFIX) xgemv_t$(TSUFFIX).$(SUFFIX) xgemv_r$(TSUFFIX).$(SUFFIX) xgemv_c$(TSUFFIX).$(SUFFIX) \ + xgemv_o$(TSUFFIX).$(SUFFIX) xgemv_u$(TSUFFIX).$(SUFFIX) xgemv_s$(TSUFFIX).$(SUFFIX) xgemv_d$(TSUFFIX).$(SUFFIX) \ + xsymv_U$(TSUFFIX).$(SUFFIX) xsymv_L$(TSUFFIX).$(SUFFIX) \ + xhemv_U$(TSUFFIX).$(SUFFIX) xhemv_L$(TSUFFIX).$(SUFFIX) xhemv_V$(TSUFFIX).$(SUFFIX) xhemv_M$(TSUFFIX).$(SUFFIX) \ + xgeru_k$(TSUFFIX).$(SUFFIX) xgerc_k$(TSUFFIX).$(SUFFIX) xgerv_k$(TSUFFIX).$(SUFFIX) xgerd_k$(TSUFFIX).$(SUFFIX) + +$(KDIR)sgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)sgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) + $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -UTRANS $< -o $@ + +$(KDIR)sgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)sgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) + $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -DTRANS $< -o $@ + +$(KDIR)dgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)dgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) + $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -UTRANS $< -o $@ + +$(KDIR)dgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)dgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) + $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -DTRANS $< -o $@ + +$(KDIR)qgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)qgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMVNKERNEL) + $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -UTRANS $< -o $@ + +$(KDIR)qgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)qgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMVTKERNEL) + $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -DTRANS $< -o $@ + +$(KDIR)cgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -UTRANS -UCONJ -UXCONJ $< -o $@ + +$(KDIR)cgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANS -UCONJ -UXCONJ $< -o $@ + +$(KDIR)cgemv_r$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -UTRANS -DCONJ -UXCONJ $< -o $@ + +$(KDIR)cgemv_c$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_c$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANS -DCONJ -UXCONJ $< -o $@ + +$(KDIR)cgemv_o$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_o$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -UTRANS -UCONJ -DXCONJ $< -o $@ + +$(KDIR)cgemv_u$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_u$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANS -UCONJ -DXCONJ $< -o $@ + +$(KDIR)cgemv_s$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_s$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -UTRANS -DCONJ -DXCONJ $< -o $@ + +$(KDIR)cgemv_d$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_d$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANS -DCONJ -DXCONJ $< -o $@ + 
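The eight cgemv_* objects built just above all come from the two complex GEMV kernel sources; what distinguishes them is only the combination of the three binary switches TRANS, CONJ and XCONJ, and the suffixes n, t, r, c, o, u, s and d enumerate all eight cases (this can be read directly off the -U/-D flags in each rule). A hedged sketch of the same mapping as a table of defines, with a single hypothetical source zgemv.c standing in for the two real kernel files:

# Sketch only (hypothetical single source): suffix -> TRANS/CONJ/XCONJ defines.
GEMV_DEFS_n = -UTRANS -UCONJ -UXCONJ
GEMV_DEFS_t = -DTRANS -UCONJ -UXCONJ
GEMV_DEFS_r = -UTRANS -DCONJ -UXCONJ
GEMV_DEFS_c = -DTRANS -DCONJ -UXCONJ
GEMV_DEFS_o = -UTRANS -UCONJ -DXCONJ
GEMV_DEFS_u = -DTRANS -UCONJ -DXCONJ
GEMV_DEFS_s = -UTRANS -DCONJ -DXCONJ
GEMV_DEFS_d = -DTRANS -DCONJ -DXCONJ

cgemv_%.o: zgemv.c
	$(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX $(GEMV_DEFS_$*) $< -o $@

The zgemv_* and xgemv_* rules that follow repeat the same eight-way pattern for double and extended precision.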
+$(KDIR)zgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -UTRANS -UCONJ -UXCONJ $< -o $@ + +$(KDIR)zgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANS -UCONJ -UXCONJ $< -o $@ + +$(KDIR)zgemv_r$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -UTRANS -DCONJ -UXCONJ $< -o $@ + +$(KDIR)zgemv_c$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_c$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANS -DCONJ -UXCONJ $< -o $@ + +$(KDIR)zgemv_o$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_o$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -UTRANS -UCONJ -DXCONJ $< -o $@ + +$(KDIR)zgemv_u$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_u$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANS -UCONJ -DXCONJ $< -o $@ + +$(KDIR)zgemv_s$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_s$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -UTRANS -DCONJ -DXCONJ $< -o $@ + +$(KDIR)zgemv_d$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_d$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANS -DCONJ -DXCONJ $< -o $@ + +$(KDIR)xgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVNKERNEL) + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -UTRANS -UCONJ -UXCONJ $< -o $@ + +$(KDIR)xgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVTKERNEL) + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANS -UCONJ -UXCONJ $< -o $@ + +$(KDIR)xgemv_r$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVNKERNEL) + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -UTRANS -DCONJ -UXCONJ $< -o $@ + +$(KDIR)xgemv_c$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_c$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVTKERNEL) + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANS -DCONJ -UXCONJ $< -o $@ + +$(KDIR)xgemv_o$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_o$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVNKERNEL) + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -UTRANS -UCONJ -DXCONJ $< -o $@ + +$(KDIR)xgemv_u$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_u$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVTKERNEL) + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANS -UCONJ -DXCONJ $< -o $@ + +$(KDIR)xgemv_s$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_s$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVNKERNEL) + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -UTRANS -DCONJ -DXCONJ $< -o $@ + +$(KDIR)xgemv_d$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_d$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVTKERNEL) + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANS -DCONJ -DXCONJ $< -o $@ + +$(KDIR)ssymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)ssymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SSYMV_U_KERNEL) $(SSYMV_U_PARAM) + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $@ + +$(KDIR)ssymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)ssymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SSYMV_L_KERNEL) $(SSYMV_L_PARAM) + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $@ + +$(KDIR)dsymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)dsymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSYMV_U_KERNEL) 
$(DSYMV_U_PARAM) + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $@ + +$(KDIR)dsymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)dsymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSYMV_L_KERNEL) $(DSYMV_L_PARAM) + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DLOWER $< -o $@ + +$(KDIR)qsymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)qsymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QSYMV_U_KERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $@ + +$(KDIR)qsymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)qsymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QSYMV_L_KERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $@ + +$(KDIR)csymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)csymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CSYMV_U_KERNEL) $(CSYMV_U_PARAM) + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $@ + +$(KDIR)csymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)csymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CSYMV_L_KERNEL) $(CSYMV_L_PARAM) + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $@ + +$(KDIR)zsymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)zsymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZSYMV_U_KERNEL) $(ZSYMV_U_PARAM) + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $@ + +$(KDIR)zsymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)zsymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZSYMV_L_KERNEL) $(ZSYMV_L_PARAM) + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $@ + +$(KDIR)xsymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)xsymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XSYMV_U_KERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $@ + +$(KDIR)xsymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)xsymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XSYMV_L_KERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $@ + +$(KDIR)sger_k$(TSUFFIX).$(SUFFIX) $(KDIR)sger_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGERKERNEL) $(SGERPARAM) + $(CC) -c $(CFLAGS) -UDOUBLE $< -o $@ + +$(KDIR)dger_k$(TSUFFIX).$(SUFFIX) $(KDIR)dger_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGERKERNEL) $(DGERPARAM) + $(CC) -c $(CFLAGS) -DDOUBLE $< -o $@ + +$(KDIR)qger_k$(TSUFFIX).$(SUFFIX) $(KDIR)qger_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGERKERNEL) $(QGERPARAM) + $(CC) -c $(CFLAGS) -DXDOUBLE $< -o $@ + +$(KDIR)cgeru_k$(TSUFFIX).$(SUFFIX) $(KDIR)cgeru_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGERUKERNEL) $(CGERPARAM) + $(CC) -c $(CFLAGS) -UDOUBLE -UCONJ $< -o $@ + +$(KDIR)cgerc_k$(TSUFFIX).$(SUFFIX) $(KDIR)cgerc_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGERCKERNEL) $(CGERPARAM) + $(CC) -c $(CFLAGS) -UDOUBLE -DCONJ $< -o $@ + +$(KDIR)cgerv_k$(TSUFFIX).$(SUFFIX) $(KDIR)cgerv_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGERUKERNEL) $(CGERPARAM) + $(CC) -c $(CFLAGS) -UDOUBLE -UCONJ -DXCONJ $< -o $@ + +$(KDIR)cgerd_k$(TSUFFIX).$(SUFFIX) $(KDIR)cgerd_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGERCKERNEL) $(CGERPARAM) + $(CC) -c $(CFLAGS) -UDOUBLE -DCONJ -DXCONJ $< -o $@ + +$(KDIR)zgeru_k$(TSUFFIX).$(SUFFIX) $(KDIR)zgeru_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGERUKERNEL) $(ZGERPARAM) + $(CC) -c $(CFLAGS) -DDOUBLE -UCONJ $< -o $@ + +$(KDIR)zgerc_k$(TSUFFIX).$(SUFFIX) $(KDIR)zgerc_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGERCKERNEL) $(ZGERPARAM) + $(CC) -c $(CFLAGS) -DDOUBLE -DCONJ $< -o $@ + +$(KDIR)zgerv_k$(TSUFFIX).$(SUFFIX) $(KDIR)zgerv_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGERUKERNEL) $(ZGERPARAM) + $(CC) -c $(CFLAGS) -DDOUBLE -UCONJ -DXCONJ $< -o $@ + +$(KDIR)zgerd_k$(TSUFFIX).$(SUFFIX) $(KDIR)zgerd_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGERCKERNEL) $(ZGERPARAM) + $(CC) -c $(CFLAGS) -DDOUBLE -DCONJ -DXCONJ $< -o $@ + +$(KDIR)xgeru_k$(TSUFFIX).$(SUFFIX) $(KDIR)xgeru_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGERUKERNEL) $(XGERPARAM) + 
$(CC) -c $(CFLAGS) -DXDOUBLE -UCONJ $< -o $@ + +$(KDIR)xgerc_k$(TSUFFIX).$(SUFFIX) $(KDIR)xgerc_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGERCKERNEL) $(XGERPARAM) + $(CC) -c $(CFLAGS) -DXDOUBLE -DCONJ $< -o $@ + +$(KDIR)xgerv_k$(TSUFFIX).$(SUFFIX) $(KDIR)xgerv_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGERUKERNEL) $(XGERPARAM) + $(CC) -c $(CFLAGS) -DXDOUBLE -UCONJ -DXCONJ $< -o $@ + +$(KDIR)xgerd_k$(TSUFFIX).$(SUFFIX) $(KDIR)xgerd_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGERCKERNEL) $(XGERPARAM) + $(CC) -c $(CFLAGS) -DXDOUBLE -DCONJ -DXCONJ $< -o $@ + +$(KDIR)chemv_U$(TSUFFIX).$(SUFFIX) $(KDIR)chemv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CHEMV_U_KERNEL) $(CHEMV_U_PARAM) + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMV $< -o $@ + +$(KDIR)chemv_L$(TSUFFIX).$(SUFFIX) $(KDIR)chemv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CHEMV_L_KERNEL) $(CHEMV_L_PARAM) + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHEMV $< -o $@ + +$(KDIR)chemv_V$(TSUFFIX).$(SUFFIX) $(KDIR)chemv_V$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CHEMV_V_KERNEL) $(CHEMV_U_PARAM) ../symcopy.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMV -DHEMVREV $< -o $@ + +$(KDIR)chemv_M$(TSUFFIX).$(SUFFIX) $(KDIR)chemv_M$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CHEMV_M_KERNEL) $(CHEMV_L_PARAM) ../symcopy.h + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER -DHEMV -DHEMVREV $< -o $@ + +$(KDIR)zhemv_U$(TSUFFIX).$(SUFFIX) $(KDIR)zhemv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZHEMV_U_KERNEL) $(ZHEMV_U_PARAM) + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHEMV $< -o $@ + +$(KDIR)zhemv_L$(TSUFFIX).$(SUFFIX) $(KDIR)zhemv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZHEMV_L_KERNEL) $(ZHEMV_L_PARAM) + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHEMV $< -o $@ + +$(KDIR)zhemv_V$(TSUFFIX).$(SUFFIX) $(KDIR)zhemv_V$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZHEMV_V_KERNEL) $(ZHEMV_U_PARAM) ../symcopy.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHEMV -DHEMVREV $< -o $@ + +$(KDIR)zhemv_M$(TSUFFIX).$(SUFFIX) $(KDIR)zhemv_M$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZHEMV_M_KERNEL) $(ZHEMV_L_PARAM) ../symcopy.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHEMV -DHEMVREV $< -o $@ + +$(KDIR)xhemv_U$(TSUFFIX).$(SUFFIX) $(KDIR)xhemv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XHEMV_U_KERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHEMV $< -o $@ + +$(KDIR)xhemv_L$(TSUFFIX).$(SUFFIX) $(KDIR)xhemv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XHEMV_L_KERNEL) + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMV $< -o $@ + +$(KDIR)xhemv_V$(TSUFFIX).$(SUFFIX) $(KDIR)xhemv_V$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XHEMV_V_KERNEL) ../symcopy.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHEMV -DHEMVREV $< -o $@ + +$(KDIR)xhemv_M$(TSUFFIX).$(SUFFIX) $(KDIR)xhemv_M$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XHEMV_M_KERNEL) ../symcopy.h + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMV -DHEMVREV $< -o $@ + diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 new file mode 100644 index 0000000000..4e331a445f --- /dev/null +++ b/kernel/Makefile.L3 @@ -0,0 +1,3135 @@ +ifeq ($(ARCH), x86) +USE_GEMM3M = 1 +endif + +ifeq ($(ARCH), x86_64) +USE_GEMM3M = 1 +endif + +ifeq ($(ARCH), ia64) +USE_GEMM3M = 1 +endif + +ifeq ($(ARCH), MIPS) +USE_GEMM3M = 1 +endif + +SKERNELOBJS += \ + sgemm_kernel$(TSUFFIX).$(SUFFIX) \ + $(SGEMMINCOPYOBJ) $(SGEMMITCOPYOBJ) \ + $(SGEMMONCOPYOBJ) $(SGEMMOTCOPYOBJ) + +DKERNELOBJS += \ + dgemm_kernel$(TSUFFIX).$(SUFFIX) \ + $(DGEMMINCOPYOBJ) $(DGEMMITCOPYOBJ) \ + $(DGEMMONCOPYOBJ) $(DGEMMOTCOPYOBJ) + +QKERNELOBJS += \ + 
qgemm_kernel$(TSUFFIX).$(SUFFIX) \ + $(QGEMMINCOPYOBJ) $(QGEMMITCOPYOBJ) \ + $(QGEMMONCOPYOBJ) $(QGEMMOTCOPYOBJ) + +CKERNELOBJS += \ + cgemm_kernel_n$(TSUFFIX).$(SUFFIX) cgemm_kernel_r$(TSUFFIX).$(SUFFIX) \ + cgemm_kernel_l$(TSUFFIX).$(SUFFIX) cgemm_kernel_b$(TSUFFIX).$(SUFFIX) \ + $(CGEMMINCOPYOBJ) $(CGEMMITCOPYOBJ) \ + $(CGEMMONCOPYOBJ) $(CGEMMOTCOPYOBJ) + +ZKERNELOBJS += \ + zgemm_kernel_n$(TSUFFIX).$(SUFFIX) zgemm_kernel_r$(TSUFFIX).$(SUFFIX) \ + zgemm_kernel_l$(TSUFFIX).$(SUFFIX) zgemm_kernel_b$(TSUFFIX).$(SUFFIX) \ + $(ZGEMMINCOPYOBJ) $(ZGEMMITCOPYOBJ) \ + $(ZGEMMONCOPYOBJ) $(ZGEMMOTCOPYOBJ) + +XKERNELOBJS += \ + xgemm_kernel_n$(TSUFFIX).$(SUFFIX) xgemm_kernel_r$(TSUFFIX).$(SUFFIX) \ + xgemm_kernel_l$(TSUFFIX).$(SUFFIX) xgemm_kernel_b$(TSUFFIX).$(SUFFIX) \ + $(XGEMMINCOPYOBJ) $(XGEMMITCOPYOBJ) \ + $(XGEMMONCOPYOBJ) $(XGEMMOTCOPYOBJ) + +SBLASOBJS += $(SKERNELOBJS) +DBLASOBJS += $(DKERNELOBJS) +QBLASOBJS += $(QKERNELOBJS) +CBLASOBJS += $(CKERNELOBJS) +ZBLASOBJS += $(ZKERNELOBJS) +XBLASOBJS += $(XKERNELOBJS) + +SBLASOBJS += \ + sgemm_beta$(TSUFFIX).$(SUFFIX) \ + strmm_kernel_LN$(TSUFFIX).$(SUFFIX) strmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ + strmm_kernel_RN$(TSUFFIX).$(SUFFIX) strmm_kernel_RT$(TSUFFIX).$(SUFFIX) \ + strsm_kernel_LN$(TSUFFIX).$(SUFFIX) strsm_kernel_LT$(TSUFFIX).$(SUFFIX) \ + strsm_kernel_RN$(TSUFFIX).$(SUFFIX) strsm_kernel_RT$(TSUFFIX).$(SUFFIX) \ + +DBLASOBJS += \ + dgemm_beta$(TSUFFIX).$(SUFFIX) \ + dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ + dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) \ + dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \ + dtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) dtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) \ + +QBLASOBJS += \ + qgemm_beta$(TSUFFIX).$(SUFFIX) \ + qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) qtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ + qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) \ + qtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) qtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \ + qtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) qtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) \ + +CBLASOBJS += \ + cgemm_beta$(TSUFFIX).$(SUFFIX) \ + ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ + ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) \ + ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) \ + ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) \ + ctrsm_kernel_LN$(TSUFFIX).$(SUFFIX) ctrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \ + ctrsm_kernel_LR$(TSUFFIX).$(SUFFIX) ctrsm_kernel_LC$(TSUFFIX).$(SUFFIX) \ + ctrsm_kernel_RN$(TSUFFIX).$(SUFFIX) ctrsm_kernel_RT$(TSUFFIX).$(SUFFIX) \ + ctrsm_kernel_RR$(TSUFFIX).$(SUFFIX) ctrsm_kernel_RC$(TSUFFIX).$(SUFFIX) \ + +ZBLASOBJS += \ + zgemm_beta$(TSUFFIX).$(SUFFIX) \ + ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ + ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) \ + ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) \ + ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) \ + ztrsm_kernel_LN$(TSUFFIX).$(SUFFIX) ztrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \ + ztrsm_kernel_LR$(TSUFFIX).$(SUFFIX) ztrsm_kernel_LC$(TSUFFIX).$(SUFFIX) \ + ztrsm_kernel_RN$(TSUFFIX).$(SUFFIX) ztrsm_kernel_RT$(TSUFFIX).$(SUFFIX) \ + ztrsm_kernel_RR$(TSUFFIX).$(SUFFIX) ztrsm_kernel_RC$(TSUFFIX).$(SUFFIX) \ + +XBLASOBJS += \ + xgemm_beta$(TSUFFIX).$(SUFFIX) \ + xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) xtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ + 
xtrmm_kernel_LR$(TSUFFIX).$(SUFFIX) xtrmm_kernel_LC$(TSUFFIX).$(SUFFIX) \ + xtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) xtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) \ + xtrmm_kernel_RR$(TSUFFIX).$(SUFFIX) xtrmm_kernel_RC$(TSUFFIX).$(SUFFIX) \ + xtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) xtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \ + xtrsm_kernel_LR$(TSUFFIX).$(SUFFIX) xtrsm_kernel_LC$(TSUFFIX).$(SUFFIX) \ + xtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) xtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) \ + xtrsm_kernel_RR$(TSUFFIX).$(SUFFIX) xtrsm_kernel_RC$(TSUFFIX).$(SUFFIX) \ + +ifdef USE_GEMM3M + +CBLASOBJS += cgemm3m_kernel$(TSUFFIX).$(SUFFIX) +ZBLASOBJS += zgemm3m_kernel$(TSUFFIX).$(SUFFIX) +XBLASOBJS += xgemm3m_kernel$(TSUFFIX).$(SUFFIX) + +endif + +SBLASOBJS += \ + strmm_iunucopy$(TSUFFIX).$(SUFFIX) strmm_iunncopy$(TSUFFIX).$(SUFFIX) \ + strmm_ilnucopy$(TSUFFIX).$(SUFFIX) strmm_ilnncopy$(TSUFFIX).$(SUFFIX) \ + strmm_iutucopy$(TSUFFIX).$(SUFFIX) strmm_iutncopy$(TSUFFIX).$(SUFFIX) \ + strmm_iltucopy$(TSUFFIX).$(SUFFIX) strmm_iltncopy$(TSUFFIX).$(SUFFIX) \ + strmm_ounucopy$(TSUFFIX).$(SUFFIX) strmm_ounncopy$(TSUFFIX).$(SUFFIX) \ + strmm_olnucopy$(TSUFFIX).$(SUFFIX) strmm_olnncopy$(TSUFFIX).$(SUFFIX) \ + strmm_outucopy$(TSUFFIX).$(SUFFIX) strmm_outncopy$(TSUFFIX).$(SUFFIX) \ + strmm_oltucopy$(TSUFFIX).$(SUFFIX) strmm_oltncopy$(TSUFFIX).$(SUFFIX) \ + strsm_iunucopy$(TSUFFIX).$(SUFFIX) strsm_iunncopy$(TSUFFIX).$(SUFFIX) \ + strsm_ilnucopy$(TSUFFIX).$(SUFFIX) strsm_ilnncopy$(TSUFFIX).$(SUFFIX) \ + strsm_iutucopy$(TSUFFIX).$(SUFFIX) strsm_iutncopy$(TSUFFIX).$(SUFFIX) \ + strsm_iltucopy$(TSUFFIX).$(SUFFIX) strsm_iltncopy$(TSUFFIX).$(SUFFIX) \ + strsm_ounucopy$(TSUFFIX).$(SUFFIX) strsm_ounncopy$(TSUFFIX).$(SUFFIX) \ + strsm_olnucopy$(TSUFFIX).$(SUFFIX) strsm_olnncopy$(TSUFFIX).$(SUFFIX) \ + strsm_outucopy$(TSUFFIX).$(SUFFIX) strsm_outncopy$(TSUFFIX).$(SUFFIX) \ + strsm_oltucopy$(TSUFFIX).$(SUFFIX) strsm_oltncopy$(TSUFFIX).$(SUFFIX) \ + ssymm_iutcopy$(TSUFFIX).$(SUFFIX) ssymm_iltcopy$(TSUFFIX).$(SUFFIX) \ + ssymm_outcopy$(TSUFFIX).$(SUFFIX) ssymm_oltcopy$(TSUFFIX).$(SUFFIX) + +DBLASOBJS += \ + dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) \ + dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) \ + dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) \ + dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) \ + dtrmm_ounucopy$(TSUFFIX).$(SUFFIX) dtrmm_ounncopy$(TSUFFIX).$(SUFFIX) \ + dtrmm_olnucopy$(TSUFFIX).$(SUFFIX) dtrmm_olnncopy$(TSUFFIX).$(SUFFIX) \ + dtrmm_outucopy$(TSUFFIX).$(SUFFIX) dtrmm_outncopy$(TSUFFIX).$(SUFFIX) \ + dtrmm_oltucopy$(TSUFFIX).$(SUFFIX) dtrmm_oltncopy$(TSUFFIX).$(SUFFIX) \ + dtrsm_iunucopy$(TSUFFIX).$(SUFFIX) dtrsm_iunncopy$(TSUFFIX).$(SUFFIX) \ + dtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) dtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) \ + dtrsm_iutucopy$(TSUFFIX).$(SUFFIX) dtrsm_iutncopy$(TSUFFIX).$(SUFFIX) \ + dtrsm_iltucopy$(TSUFFIX).$(SUFFIX) dtrsm_iltncopy$(TSUFFIX).$(SUFFIX) \ + dtrsm_ounucopy$(TSUFFIX).$(SUFFIX) dtrsm_ounncopy$(TSUFFIX).$(SUFFIX) \ + dtrsm_olnucopy$(TSUFFIX).$(SUFFIX) dtrsm_olnncopy$(TSUFFIX).$(SUFFIX) \ + dtrsm_outucopy$(TSUFFIX).$(SUFFIX) dtrsm_outncopy$(TSUFFIX).$(SUFFIX) \ + dtrsm_oltucopy$(TSUFFIX).$(SUFFIX) dtrsm_oltncopy$(TSUFFIX).$(SUFFIX) \ + dsymm_iutcopy$(TSUFFIX).$(SUFFIX) dsymm_iltcopy$(TSUFFIX).$(SUFFIX) \ + dsymm_outcopy$(TSUFFIX).$(SUFFIX) dsymm_oltcopy$(TSUFFIX).$(SUFFIX) + +QBLASOBJS += \ + qtrmm_iunucopy$(TSUFFIX).$(SUFFIX) qtrmm_iunncopy$(TSUFFIX).$(SUFFIX) \ + qtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) 
qtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) \ + qtrmm_iutucopy$(TSUFFIX).$(SUFFIX) qtrmm_iutncopy$(TSUFFIX).$(SUFFIX) \ + qtrmm_iltucopy$(TSUFFIX).$(SUFFIX) qtrmm_iltncopy$(TSUFFIX).$(SUFFIX) \ + qtrmm_ounucopy$(TSUFFIX).$(SUFFIX) qtrmm_ounncopy$(TSUFFIX).$(SUFFIX) \ + qtrmm_olnucopy$(TSUFFIX).$(SUFFIX) qtrmm_olnncopy$(TSUFFIX).$(SUFFIX) \ + qtrmm_outucopy$(TSUFFIX).$(SUFFIX) qtrmm_outncopy$(TSUFFIX).$(SUFFIX) \ + qtrmm_oltucopy$(TSUFFIX).$(SUFFIX) qtrmm_oltncopy$(TSUFFIX).$(SUFFIX) \ + qtrsm_iunucopy$(TSUFFIX).$(SUFFIX) qtrsm_iunncopy$(TSUFFIX).$(SUFFIX) \ + qtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) qtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) \ + qtrsm_iutucopy$(TSUFFIX).$(SUFFIX) qtrsm_iutncopy$(TSUFFIX).$(SUFFIX) \ + qtrsm_iltucopy$(TSUFFIX).$(SUFFIX) qtrsm_iltncopy$(TSUFFIX).$(SUFFIX) \ + qtrsm_ounucopy$(TSUFFIX).$(SUFFIX) qtrsm_ounncopy$(TSUFFIX).$(SUFFIX) \ + qtrsm_olnucopy$(TSUFFIX).$(SUFFIX) qtrsm_olnncopy$(TSUFFIX).$(SUFFIX) \ + qtrsm_outucopy$(TSUFFIX).$(SUFFIX) qtrsm_outncopy$(TSUFFIX).$(SUFFIX) \ + qtrsm_oltucopy$(TSUFFIX).$(SUFFIX) qtrsm_oltncopy$(TSUFFIX).$(SUFFIX) \ + qsymm_iutcopy$(TSUFFIX).$(SUFFIX) qsymm_iltcopy$(TSUFFIX).$(SUFFIX) \ + qsymm_outcopy$(TSUFFIX).$(SUFFIX) qsymm_oltcopy$(TSUFFIX).$(SUFFIX) \ + +CBLASOBJS += \ + ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) \ + ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) \ + ctrmm_iutucopy$(TSUFFIX).$(SUFFIX) ctrmm_iutncopy$(TSUFFIX).$(SUFFIX) \ + ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) \ + ctrmm_ounucopy$(TSUFFIX).$(SUFFIX) ctrmm_ounncopy$(TSUFFIX).$(SUFFIX) \ + ctrmm_olnucopy$(TSUFFIX).$(SUFFIX) ctrmm_olnncopy$(TSUFFIX).$(SUFFIX) \ + ctrmm_outucopy$(TSUFFIX).$(SUFFIX) ctrmm_outncopy$(TSUFFIX).$(SUFFIX) \ + ctrmm_oltucopy$(TSUFFIX).$(SUFFIX) ctrmm_oltncopy$(TSUFFIX).$(SUFFIX) \ + ctrsm_iunucopy$(TSUFFIX).$(SUFFIX) ctrsm_iunncopy$(TSUFFIX).$(SUFFIX) \ + ctrsm_ilnucopy$(TSUFFIX).$(SUFFIX) ctrsm_ilnncopy$(TSUFFIX).$(SUFFIX) \ + ctrsm_iutucopy$(TSUFFIX).$(SUFFIX) ctrsm_iutncopy$(TSUFFIX).$(SUFFIX) \ + ctrsm_iltucopy$(TSUFFIX).$(SUFFIX) ctrsm_iltncopy$(TSUFFIX).$(SUFFIX) \ + ctrsm_ounucopy$(TSUFFIX).$(SUFFIX) ctrsm_ounncopy$(TSUFFIX).$(SUFFIX) \ + ctrsm_olnucopy$(TSUFFIX).$(SUFFIX) ctrsm_olnncopy$(TSUFFIX).$(SUFFIX) \ + ctrsm_outucopy$(TSUFFIX).$(SUFFIX) ctrsm_outncopy$(TSUFFIX).$(SUFFIX) \ + ctrsm_oltucopy$(TSUFFIX).$(SUFFIX) ctrsm_oltncopy$(TSUFFIX).$(SUFFIX) \ + csymm_iutcopy$(TSUFFIX).$(SUFFIX) csymm_iltcopy$(TSUFFIX).$(SUFFIX) \ + csymm_outcopy$(TSUFFIX).$(SUFFIX) csymm_oltcopy$(TSUFFIX).$(SUFFIX) \ + chemm_iutcopy$(TSUFFIX).$(SUFFIX) chemm_iltcopy$(TSUFFIX).$(SUFFIX) \ + chemm_outcopy$(TSUFFIX).$(SUFFIX) chemm_oltcopy$(TSUFFIX).$(SUFFIX) + +ZBLASOBJS += \ + ztrmm_iunucopy$(TSUFFIX).$(SUFFIX) ztrmm_iunncopy$(TSUFFIX).$(SUFFIX) \ + ztrmm_ilnucopy$(TSUFFIX).$(SUFFIX) ztrmm_ilnncopy$(TSUFFIX).$(SUFFIX) \ + ztrmm_iutucopy$(TSUFFIX).$(SUFFIX) ztrmm_iutncopy$(TSUFFIX).$(SUFFIX) \ + ztrmm_iltucopy$(TSUFFIX).$(SUFFIX) ztrmm_iltncopy$(TSUFFIX).$(SUFFIX) \ + ztrmm_ounucopy$(TSUFFIX).$(SUFFIX) ztrmm_ounncopy$(TSUFFIX).$(SUFFIX) \ + ztrmm_olnucopy$(TSUFFIX).$(SUFFIX) ztrmm_olnncopy$(TSUFFIX).$(SUFFIX) \ + ztrmm_outucopy$(TSUFFIX).$(SUFFIX) ztrmm_outncopy$(TSUFFIX).$(SUFFIX) \ + ztrmm_oltucopy$(TSUFFIX).$(SUFFIX) ztrmm_oltncopy$(TSUFFIX).$(SUFFIX) \ + ztrsm_iunucopy$(TSUFFIX).$(SUFFIX) ztrsm_iunncopy$(TSUFFIX).$(SUFFIX) \ + ztrsm_ilnucopy$(TSUFFIX).$(SUFFIX) ztrsm_ilnncopy$(TSUFFIX).$(SUFFIX) \ + ztrsm_iutucopy$(TSUFFIX).$(SUFFIX) ztrsm_iutncopy$(TSUFFIX).$(SUFFIX) \ + 
ztrsm_iltucopy$(TSUFFIX).$(SUFFIX) ztrsm_iltncopy$(TSUFFIX).$(SUFFIX) \ + ztrsm_ounucopy$(TSUFFIX).$(SUFFIX) ztrsm_ounncopy$(TSUFFIX).$(SUFFIX) \ + ztrsm_olnucopy$(TSUFFIX).$(SUFFIX) ztrsm_olnncopy$(TSUFFIX).$(SUFFIX) \ + ztrsm_outucopy$(TSUFFIX).$(SUFFIX) ztrsm_outncopy$(TSUFFIX).$(SUFFIX) \ + ztrsm_oltucopy$(TSUFFIX).$(SUFFIX) ztrsm_oltncopy$(TSUFFIX).$(SUFFIX) \ + zsymm_iutcopy$(TSUFFIX).$(SUFFIX) zsymm_iltcopy$(TSUFFIX).$(SUFFIX) \ + zsymm_outcopy$(TSUFFIX).$(SUFFIX) zsymm_oltcopy$(TSUFFIX).$(SUFFIX) \ + zhemm_iutcopy$(TSUFFIX).$(SUFFIX) zhemm_iltcopy$(TSUFFIX).$(SUFFIX) \ + zhemm_outcopy$(TSUFFIX).$(SUFFIX) zhemm_oltcopy$(TSUFFIX).$(SUFFIX) + +XBLASOBJS += \ + xtrmm_iunucopy$(TSUFFIX).$(SUFFIX) xtrmm_iunncopy$(TSUFFIX).$(SUFFIX) \ + xtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) xtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) \ + xtrmm_iutucopy$(TSUFFIX).$(SUFFIX) xtrmm_iutncopy$(TSUFFIX).$(SUFFIX) \ + xtrmm_iltucopy$(TSUFFIX).$(SUFFIX) xtrmm_iltncopy$(TSUFFIX).$(SUFFIX) \ + xtrmm_ounucopy$(TSUFFIX).$(SUFFIX) xtrmm_ounncopy$(TSUFFIX).$(SUFFIX) \ + xtrmm_olnucopy$(TSUFFIX).$(SUFFIX) xtrmm_olnncopy$(TSUFFIX).$(SUFFIX) \ + xtrmm_outucopy$(TSUFFIX).$(SUFFIX) xtrmm_outncopy$(TSUFFIX).$(SUFFIX) \ + xtrmm_oltucopy$(TSUFFIX).$(SUFFIX) xtrmm_oltncopy$(TSUFFIX).$(SUFFIX) \ + xtrsm_iunucopy$(TSUFFIX).$(SUFFIX) xtrsm_iunncopy$(TSUFFIX).$(SUFFIX) \ + xtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) xtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) \ + xtrsm_iutucopy$(TSUFFIX).$(SUFFIX) xtrsm_iutncopy$(TSUFFIX).$(SUFFIX) \ + xtrsm_iltucopy$(TSUFFIX).$(SUFFIX) xtrsm_iltncopy$(TSUFFIX).$(SUFFIX) \ + xtrsm_ounucopy$(TSUFFIX).$(SUFFIX) xtrsm_ounncopy$(TSUFFIX).$(SUFFIX) \ + xtrsm_olnucopy$(TSUFFIX).$(SUFFIX) xtrsm_olnncopy$(TSUFFIX).$(SUFFIX) \ + xtrsm_outucopy$(TSUFFIX).$(SUFFIX) xtrsm_outncopy$(TSUFFIX).$(SUFFIX) \ + xtrsm_oltucopy$(TSUFFIX).$(SUFFIX) xtrsm_oltncopy$(TSUFFIX).$(SUFFIX) \ + xsymm_iutcopy$(TSUFFIX).$(SUFFIX) xsymm_iltcopy$(TSUFFIX).$(SUFFIX) \ + xsymm_outcopy$(TSUFFIX).$(SUFFIX) xsymm_oltcopy$(TSUFFIX).$(SUFFIX) \ + xhemm_iutcopy$(TSUFFIX).$(SUFFIX) xhemm_iltcopy$(TSUFFIX).$(SUFFIX) \ + xhemm_outcopy$(TSUFFIX).$(SUFFIX) xhemm_oltcopy$(TSUFFIX).$(SUFFIX) + +ifdef USE_GEMM3M + +CBLASOBJS += \ + cgemm3m_incopyb$(TSUFFIX).$(SUFFIX) cgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) \ + cgemm3m_incopyr$(TSUFFIX).$(SUFFIX) cgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) \ + cgemm3m_incopyi$(TSUFFIX).$(SUFFIX) cgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) \ + cgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) cgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) \ + cgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) cgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) \ + cgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) cgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) \ + csymm3m_iucopyb$(TSUFFIX).$(SUFFIX) csymm3m_oucopyb$(TSUFFIX).$(SUFFIX) \ + csymm3m_iucopyr$(TSUFFIX).$(SUFFIX) csymm3m_oucopyr$(TSUFFIX).$(SUFFIX) \ + csymm3m_iucopyi$(TSUFFIX).$(SUFFIX) csymm3m_oucopyi$(TSUFFIX).$(SUFFIX) \ + csymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) csymm3m_olcopyb$(TSUFFIX).$(SUFFIX) \ + csymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) csymm3m_olcopyr$(TSUFFIX).$(SUFFIX) \ + csymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) csymm3m_olcopyi$(TSUFFIX).$(SUFFIX) \ + chemm3m_iucopyb$(TSUFFIX).$(SUFFIX) chemm3m_oucopyb$(TSUFFIX).$(SUFFIX) \ + chemm3m_iucopyr$(TSUFFIX).$(SUFFIX) chemm3m_oucopyr$(TSUFFIX).$(SUFFIX) \ + chemm3m_iucopyi$(TSUFFIX).$(SUFFIX) chemm3m_oucopyi$(TSUFFIX).$(SUFFIX) \ + chemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) chemm3m_olcopyb$(TSUFFIX).$(SUFFIX) \ + chemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) chemm3m_olcopyr$(TSUFFIX).$(SUFFIX) \ + chemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) chemm3m_olcopyi$(TSUFFIX).$(SUFFIX) + 
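The *gemm3m kernels and the three copy flavours per operand listed here (the *copyb, *copyr and *copyi objects) are built only when USE_GEMM3M is enabled for the architecture. The 3M idea behind them replaces one complex matrix multiplication with three real ones; writing A = A_r + i A_i and B = B_r + i B_i, the standard identity is

    T_1 = A_r B_r, \qquad T_2 = A_i B_i, \qquad T_3 = (A_r + A_i)(B_r + B_i),
    \operatorname{Re}(AB) = T_1 - T_2, \qquad \operatorname{Im}(AB) = T_3 - T_1 - T_2,

so three real GEMM invocations plus additions recover the complex product. The separate real, imaginary and combined copy buffers plausibly correspond to the three factors, though the Makefile itself does not spell that out.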
+ZBLASOBJS += \ + zgemm3m_incopyb$(TSUFFIX).$(SUFFIX) zgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) \ + zgemm3m_incopyr$(TSUFFIX).$(SUFFIX) zgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) \ + zgemm3m_incopyi$(TSUFFIX).$(SUFFIX) zgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) \ + zgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) zgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) \ + zgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) zgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) \ + zgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) zgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) \ + zsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) zsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) \ + zsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) zsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) \ + zsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) zsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) \ + zsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) zsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) \ + zsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) zsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) \ + zsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) zsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) \ + zhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) zhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) \ + zhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) zhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) \ + zhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) zhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) \ + zhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) zhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) \ + zhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) zhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) \ + zhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) zhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) + +XBLASOBJS += \ + xgemm3m_incopyb$(TSUFFIX).$(SUFFIX) xgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) \ + xgemm3m_incopyr$(TSUFFIX).$(SUFFIX) xgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) \ + xgemm3m_incopyi$(TSUFFIX).$(SUFFIX) xgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) \ + xgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) xgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) \ + xgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) xgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) \ + xgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) xgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) \ + xsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) xsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) \ + xsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) xsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) \ + xsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) xsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) \ + xsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) xsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) \ + xsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) xsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) \ + xsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) xsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) \ + xhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) xhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) \ + xhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) xhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) \ + xhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) xhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) \ + xhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) xhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) \ + xhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) xhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) \ + xhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) xhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) + +endif + +SGEMMINCOPYOBJ_P = $(SGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +SGEMMITCOPYOBJ_P = $(SGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +SGEMMONCOPYOBJ_P = $(SGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +SGEMMOTCOPYOBJ_P = $(SGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +DGEMMINCOPYOBJ_P = $(DGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +DGEMMITCOPYOBJ_P = $(DGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +DGEMMONCOPYOBJ_P = $(DGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +DGEMMOTCOPYOBJ_P = $(DGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +QGEMMINCOPYOBJ_P = $(QGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +QGEMMITCOPYOBJ_P = $(QGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +QGEMMONCOPYOBJ_P = $(QGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +QGEMMOTCOPYOBJ_P = $(QGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +CGEMMINCOPYOBJ_P = 
$(CGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +CGEMMITCOPYOBJ_P = $(CGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +CGEMMONCOPYOBJ_P = $(CGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +CGEMMOTCOPYOBJ_P = $(CGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +ZGEMMINCOPYOBJ_P = $(ZGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +ZGEMMITCOPYOBJ_P = $(ZGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +ZGEMMONCOPYOBJ_P = $(ZGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +ZGEMMOTCOPYOBJ_P = $(ZGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +XGEMMINCOPYOBJ_P = $(XGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +XGEMMITCOPYOBJ_P = $(XGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +XGEMMONCOPYOBJ_P = $(XGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +XGEMMOTCOPYOBJ_P = $(XGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) + +$(KDIR)sgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_BETA) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)dgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_BETA) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)qgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMM_BETA) + $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)cgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_BETA) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX $< -o $@ + +$(KDIR)zgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_BETA) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX $< -o $@ + +$(KDIR)xgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMM_BETA) + $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@ + +$(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)$(SGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SGEMMOTCOPY) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) + +$(KDIR)$(SGEMMINCOPYOBJ) : $(KERNELDIR)/$(SGEMMINCOPY) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)$(SGEMMITCOPYOBJ) : $(KERNELDIR)/$(SGEMMITCOPY) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +endif + +$(KDIR)$(DGEMMONCOPYOBJ) : $(KERNELDIR)/$(DGEMMONCOPY) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)$(DGEMMOTCOPYOBJ) : $(KERNELDIR)/$(DGEMMOTCOPY) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) + +$(KDIR)$(DGEMMINCOPYOBJ) : $(KERNELDIR)/$(DGEMMINCOPY) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)$(DGEMMITCOPYOBJ) : $(KERNELDIR)/$(DGEMMITCOPY) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +endif + +ifdef EXPRECISION + +$(KDIR)$(QGEMMONCOPYOBJ) : $(KERNELDIR)/$(QGEMMONCOPY) + $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)$(QGEMMOTCOPYOBJ) : $(KERNELDIR)/$(QGEMMOTCOPY) + $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +ifneq ($(QGEMM_UNROLL_M), $(QGEMM_UNROLL_N)) + +$(KDIR)$(QGEMMINCOPYOBJ) : $(KERNELDIR)/$(QGEMMINCOPY) + $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)$(QGEMMITCOPYOBJ) : $(KERNELDIR)/$(QGEMMITCOPY) + $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +endif + +endif + +$(KDIR)$(CGEMMONCOPYOBJ) : $(KERNELDIR)/$(CGEMMONCOPY) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)$(CGEMMOTCOPYOBJ) : $(KERNELDIR)/$(CGEMMOTCOPY) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) + +$(KDIR)$(CGEMMINCOPYOBJ) : $(KERNELDIR)/$(CGEMMINCOPY) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)$(CGEMMITCOPYOBJ) : $(KERNELDIR)/$(CGEMMITCOPY) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +endif + +$(KDIR)$(ZGEMMONCOPYOBJ) : $(KERNELDIR)/$(ZGEMMONCOPY) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)$(ZGEMMOTCOPYOBJ) : 
$(KERNELDIR)/$(ZGEMMOTCOPY) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) + +$(KDIR)$(ZGEMMINCOPYOBJ) : $(KERNELDIR)/$(ZGEMMINCOPY) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)$(ZGEMMITCOPYOBJ) : $(KERNELDIR)/$(ZGEMMITCOPY) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +endif + +ifdef EXPRECISION + +$(KDIR)$(XGEMMONCOPYOBJ) : $(KERNELDIR)/$(XGEMMONCOPY) + $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)$(XGEMMOTCOPYOBJ) : $(KERNELDIR)/$(XGEMMOTCOPY) + $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +ifneq ($(XGEMM_UNROLL_M), $(XGEMM_UNROLL_N)) + +$(KDIR)$(XGEMMINCOPYOBJ) : $(KERNELDIR)/$(XGEMMINCOPY) + $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)$(XGEMMITCOPYOBJ) : $(KERNELDIR)/$(XGEMMITCOPY) + $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +endif + +endif + +$(KDIR)sgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)qgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(QGEMMDEPEND) + $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)cgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ + +$(KDIR)cgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@ + +$(KDIR)cgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@ + +$(KDIR)cgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@ + +$(KDIR)zgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ + +$(KDIR)zgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@ + +$(KDIR)zgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@ + +$(KDIR)zgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ + +$(KDIR)xgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) + $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $@ + +$(KDIR)xgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) + $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $@ + +$(KDIR)xgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) + $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DNC $< -o $@ + +$(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) + $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $@ + +$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ + +$(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ + +$(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ + +$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) + 
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ + +$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ + +$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ + +$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ + +$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ + +$(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ + +$(KDIR)qtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ + +$(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ + +$(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ + +$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ + +$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ + +$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ + +$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ + +$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ + +$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ + +$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) + $(CC) 
$(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ + +$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ + +$(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)xtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)xtrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ + +$(KDIR)xtrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ + +$(KDIR)xtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)xtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)xtrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ + +$(KDIR)xtrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ + +$(KDIR)cgemm3m_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM3MKERNEL) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ + +$(KDIR)zgemm3m_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM3MKERNEL) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ + +$(KDIR)xgemm3m_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMM3MKERNEL) + $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $@ + +$(KDIR)strsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_LN) $(STRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -DUPPER -DLN -UCONJ $< -o $@ + +$(KDIR)strsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_LT) $(STRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -UUPPER -DLT -UCONJ $< -o $@ + +$(KDIR)strsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_RN) $(STRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -DUPPER -DRN -UCONJ $< -o $@ + +$(KDIR)strsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_RT) $(STRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -UUPPER -DRT -UCONJ $< -o $@ + +$(KDIR)dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LN) $(DTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DLN -UCONJ $< -o $@ + +$(KDIR)dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LT) $(DTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o $@ + +$(KDIR)dtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_RN) $(DTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DRN -UCONJ $< -o $@ + +$(KDIR)dtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_RT) $(DTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DRT -UCONJ $< -o $@ + 
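Every compile rule in this part of the kernel Makefile selects its floating-point type purely through -D/-U preprocessor pairs rather than through separate sources; the same convention recurs in the GEMM, TRMM and TRSM rules above and below. As a quick reference, the mapping from the routine-name prefixes to those flag pairs is sketched here; the PRECFLAGS_* variable names are illustrative only and are not used by the imported build.

# Illustrative flag table (variable names are not part of the imported Makefile):
PRECFLAGS_s := -UDOUBLE  -UCOMPLEX   # single real
PRECFLAGS_d := -DDOUBLE  -UCOMPLEX   # double real
PRECFLAGS_q := -DXDOUBLE -UCOMPLEX   # extended ("quad") real
PRECFLAGS_c := -UDOUBLE  -DCOMPLEX   # single complex
PRECFLAGS_z := -DDOUBLE  -DCOMPLEX   # double complex
PRECFLAGS_x := -DXDOUBLE -DCOMPLEX   # extended complex

# Example of the pattern, equivalent to the explicit dgemm_beta rule above:
# $(KDIR)dgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_BETA)
# 	$(CC) $(CFLAGS) -c $(PRECFLAGS_d) $< -o $@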
+$(KDIR)qtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_LN) $(QTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -DUPPER -DLN -UCONJ $< -o $@ + +$(KDIR)qtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_LT) $(QTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -UUPPER -DLT -UCONJ $< -o $@ + +$(KDIR)qtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_RN) $(QTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -DUPPER -DRN -UCONJ $< -o $@ + +$(KDIR)qtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_RT) $(QTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -UUPPER -DRT -UCONJ $< -o $@ + +$(KDIR)ctrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LN) $(CTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DLN -UCONJ $< -o $@ + +$(KDIR)ctrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LT) $(CTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DLT -UCONJ $< -o $@ + +$(KDIR)ctrsm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LN) $(CTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DLN -DCONJ $< -o $@ + +$(KDIR)ctrsm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LT) $(CTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DLT -DCONJ $< -o $@ + +$(KDIR)ctrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RN) $(CTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DRN -UCONJ $< -o $@ + +$(KDIR)ctrsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RT) $(CTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DRT -UCONJ $< -o $@ + +$(KDIR)ctrsm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RN) $(CTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DRN -DCONJ $< -o $@ + +$(KDIR)ctrsm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RT) $(CTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DRT -DCONJ $< -o $@ + +$(KDIR)ztrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LN) $(ZTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DLN -UCONJ $< -o $@ + +$(KDIR)ztrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LT) $(ZTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o $@ + +$(KDIR)ztrsm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LN) $(ZTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DLN -DCONJ $< -o $@ + +$(KDIR)ztrsm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LT) $(ZTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DLT -DCONJ $< -o $@ + +$(KDIR)ztrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RN) $(ZTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DRN -UCONJ $< -o $@ + +$(KDIR)ztrsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RT) $(ZTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DRT -UCONJ $< -o $@ + +$(KDIR)ztrsm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RN) $(ZTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DRN -DCONJ $< -o $@ + +$(KDIR)ztrsm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RT) $(ZTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DRT -DCONJ $< -o $@ + 
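The complex TRSM rules just above also show how the conjugated cases are obtained: LR, LC, RR and RC are compiled from the same kernel sources as LN, LT, RN and RT, only with -DCONJ switched on, so no extra kernel files are needed. A compact, equivalent way of generating the eight ztrsm objects is sketched below purely for orientation (it is not meant to be added next to the explicit rules, which it would duplicate); ZTRSM_CASES, ZTRSM_SRC_*, ZTRSM_FLAGS_* and ZTRSM_KERNEL_RULE are hypothetical names, and the flags are copied from the rules above.

ZTRSM_CASES := LN LT RN RT LR LC RR RC

ZTRSM_SRC_LN := $(ZTRSMKERNEL_LN)
ZTRSM_SRC_LT := $(ZTRSMKERNEL_LT)
ZTRSM_SRC_RN := $(ZTRSMKERNEL_RN)
ZTRSM_SRC_RT := $(ZTRSMKERNEL_RT)
ZTRSM_SRC_LR := $(ZTRSMKERNEL_LN)   # conjugated cases reuse the
ZTRSM_SRC_LC := $(ZTRSMKERNEL_LT)   # non-conjugated kernel sources
ZTRSM_SRC_RR := $(ZTRSMKERNEL_RN)
ZTRSM_SRC_RC := $(ZTRSMKERNEL_RT)

ZTRSM_FLAGS_LN := -DUPPER -DLN -UCONJ
ZTRSM_FLAGS_LT := -UUPPER -DLT -UCONJ
ZTRSM_FLAGS_RN := -DUPPER -DRN -UCONJ
ZTRSM_FLAGS_RT := -UUPPER -DRT -UCONJ
ZTRSM_FLAGS_LR := -DUPPER -DLN -DCONJ
ZTRSM_FLAGS_LC := -UUPPER -DLT -DCONJ
ZTRSM_FLAGS_RR := -DUPPER -DRN -DCONJ
ZTRSM_FLAGS_RC := -UUPPER -DRT -DCONJ

define ZTRSM_KERNEL_RULE
$(KDIR)ztrsm_kernel_$(1)$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSM_SRC_$(1)) $(ZTRSMDEPEND)
	$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE $(ZTRSM_FLAGS_$(1)) $$< -o $$@
endef

$(foreach case,$(ZTRSM_CASES),$(eval $(call ZTRSM_KERNEL_RULE,$(case))))

The imported Makefile instead writes every rule out explicitly, which arguably keeps each target easy to grep for.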
+$(KDIR)xtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LN) $(XTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DLN -UCONJ $< -o $@ + +$(KDIR)xtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LT) $(XTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DLT -UCONJ $< -o $@ + +$(KDIR)xtrsm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LN) $(XTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DLN -DCONJ $< -o $@ + +$(KDIR)xtrsm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LT) $(XTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DLT -DCONJ $< -o $@ + +$(KDIR)xtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RN) $(XTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DRN -UCONJ $< -o $@ + +$(KDIR)xtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RT) $(XTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DRT -UCONJ $< -o $@ + +$(KDIR)xtrsm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RN) $(XTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DRN -DCONJ $< -o $@ + +$(KDIR)xtrsm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RT) $(XTRSMDEPEND) + $(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DRT -DCONJ $< -o $@ + + +$(KDIR)strmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)strmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)strmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)strmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)strmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strmm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)strmm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strmm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) 
$(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)strmm_outucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strmm_outncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)strmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)dtrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)dtrmm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)dtrmm_outucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_outncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)dtrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER 
-DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)qtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)qtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)qtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)qtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)qtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)qtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)qtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)qtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)qtrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)qtrmm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)qtrmm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)qtrmm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)qtrmm_outucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)qtrmm_outncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)qtrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)qtrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + 
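The packing-routine names used above and below follow a fixed four-letter code between the precision prefix and "copy". Taking ctrmm_iunucopy as an example: the first letter picks the blocking dimension (i = inner, generic source parameterized by GEMM_UNROLL_M and compiled with -UOUTER; o = outer, GEMM_UNROLL_N and -DOUTER); the second picks the triangle (u = upper with -ULOWER, l = lower with -DLOWER, together with the matching *_uncopy/*_utcopy or *_lncopy/*_ltcopy generic source); the third says whether the source is packed non-transposed (n) or transposed (t); and the last whether the diagonal is taken as unit (u, -DUNIT) or non-unit (n, -UUNIT). The enumeration below is only a reference; TRMM_COPY_VARIANTS is an illustrative name, not used by the build.

# All sixteen variants per precision, generated from the four binary choices:
TRMM_COPY_VARIANTS := $(foreach io,i o,$(foreach ul,u l,$(foreach nt,n t,$(foreach un,u n,$(io)$(ul)$(nt)$(un)copy))))
# -> iunucopy iunncopy iutucopy iutncopy ilnucopy ilnncopy iltucopy iltncopy
#    ounucopy ounncopy outucopy outncopy olnucopy olnncopy oltucopy oltncopy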
+$(KDIR)ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ctrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ctrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ctrmm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ctrmm_outucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_outncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ctrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ztrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ztrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ztrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + 
+$(KDIR)ztrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ztrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ztrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ztrmm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ztrmm_outucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_outncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ztrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)xtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)xtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)xtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)xtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)xtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)xtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)xtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)xtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + 
+$(KDIR)xtrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)xtrmm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)xtrmm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)xtrmm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)xtrmm_outucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)xtrmm_outncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)xtrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)xtrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ssymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@ + +$(KDIR)ssymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@ + +$(KDIR)ssymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ + +$(KDIR)ssymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ + +$(KDIR)dsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@ + +$(KDIR)dsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@ + +$(KDIR)dsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ + +$(KDIR)dsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ + +$(KDIR)qsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@ + +$(KDIR)qsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@ + +$(KDIR)qsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ + +$(KDIR)qsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE 
-UCOMPLEX -UOUTER -DLOWER $< -o $@ + +$(KDIR)csymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@ + +$(KDIR)csymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@ + +$(KDIR)csymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ + +$(KDIR)csymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ + +$(KDIR)zsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@ + +$(KDIR)zsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@ + +$(KDIR)zsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ + +$(KDIR)zsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ + +$(KDIR)xsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@ + +$(KDIR)xsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@ + +$(KDIR)xsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ + +$(KDIR)xsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ + +$(KDIR)chemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@ + +$(KDIR)chemm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@ + +$(KDIR)chemm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ + +$(KDIR)chemm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ + +$(KDIR)zhemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@ + +$(KDIR)zhemm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@ + +$(KDIR)zhemm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ + +$(KDIR)zhemm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< 
-DLOWER -o $@ + +$(KDIR)xhemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@ + +$(KDIR)xhemm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@ + +$(KDIR)xhemm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ + +$(KDIR)xhemm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ + +$(KDIR)cgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_N).c + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)cgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_N).c + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)cgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_N).c + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)cgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_N).c + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)cgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_N).c + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)cgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_N).c + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)cgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_M).c + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ + +$(KDIR)cgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_M).c + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)cgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_M).c + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)cgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_M).c + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ + +$(KDIR)cgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_M).c + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)cgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_M).c + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_N).c + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)zgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_N).c + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_N).c + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_N).c + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)zgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_N).c + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : 
generic/zgemm3m_tcopy_$(DGEMM_UNROLL_N).c + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_M).c + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ + +$(KDIR)zgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_M).c + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_M).c + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_M).c + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ + +$(KDIR)zgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_M).c + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_M).c + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_N).c + $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)xgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_N).c + $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_N).c + $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_N).c + $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)xgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_N).c + $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_N).c + $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_M).c + $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ + +$(KDIR)xgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_M).c + $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_M).c + $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_M).c + $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ + +$(KDIR)xgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_M).c + $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_M).c + $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)csymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)csymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)csymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_N).c + $(CC) $(CFLAGS) 
$(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)csymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)csymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)csymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)csymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)csymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)csymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)csymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)csymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)csymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)zsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)zsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)zsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)zsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX 
-UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)xsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)xsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)xsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)xsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)chemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)chemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)chemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)chemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)chemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + 
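The *3m packing rules in this stretch all repeat one pattern: each generic copy source is compiled three times, producing a full copy (...copyb), a real-part-only copy (...copyr, -DREAL_ONLY) and an imaginary-part-only copy (...copyi, -DIMAGE_ONLY). The outer (o...) copies additionally get -DUSE_ALPHA, apparently folding the alpha scaling into the packing step, while the inner (i...) copies get -UUSE_ALPHA (and, for gemm3m, -DICOPY). The sketch below regenerates the three chemm3m_oucopy* rules spelled out above, only to make the pattern explicit and not to be added alongside them; PART_FLAGS_* and CHEMM3M_OUCOPY_RULE are hypothetical names.

PART_FLAGS_b :=                  # full complex copy      (...copyb)
PART_FLAGS_r := -DREAL_ONLY      # real part only         (...copyr)
PART_FLAGS_i := -DIMAGE_ONLY     # imaginary part only    (...copyi)

define CHEMM3M_OUCOPY_RULE
$(KDIR)chemm3m_oucopy$(1)$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_N).c
	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $(PART_FLAGS_$(1)) $$< -o $$@
endef

$(foreach part,b r i,$(eval $(call CHEMM3M_OUCOPY_RULE,$(part))))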
+$(KDIR)chemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)chemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)chemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)chemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)chemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)chemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)chemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)zhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)zhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)zhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)zhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : 
generic/zhemm3m_ucopy_$(QGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)xhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)xhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_N).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)xhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)xhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_M).c + $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)strsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)strsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)strsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)strsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_M).c + 
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)strsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strsm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)strsm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strsm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)strsm_outucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strsm_outncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)strsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)dtrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)dtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)dtrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)dtrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)dtrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)dtrsm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE 
-UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)dtrsm_outucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_outncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)dtrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)qtrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)qtrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)qtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)qtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)qtrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)qtrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)qtrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)qtrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)qtrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)qtrsm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)qtrsm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)qtrsm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)qtrsm_outucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)qtrsm_outncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + 
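+# Naming scheme for the trsm copy objects: after the precision letter, the
+# first letter selects inner or outer packing (i = GEMM_UNROLL_M / -UOUTER,
+# o = GEMM_UNROLL_N / -DOUTER), the next two select the source triangle and
+# storage (u/l = upper/lower, n/t = normal/transposed), and the final letter
+# selects the diagonal (u = -DUNIT, n = -UUNIT).
+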
+$(KDIR)qtrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)qtrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ctrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ctrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ctrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ctrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ctrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ctrsm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ctrsm_outucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_outncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ctrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ztrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + 
+$(KDIR)ztrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ztrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ztrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ztrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ztrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ztrsm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ztrsm_outucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_outncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ztrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)xtrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)xtrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)xtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)xtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + 
+$(KDIR)xtrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)xtrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)xtrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)xtrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)xtrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)xtrsm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)xtrsm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)xtrsm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)xtrsm_outucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)xtrsm_outncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)xtrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)xtrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + + +$(KDIR)sgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMM_BETA) + $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)dgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMM_BETA) + $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)qgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMM_BETA) + $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)cgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMM_BETA) + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX $< -o $@ + +$(KDIR)zgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMM_BETA) + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX $< -o $@ + +$(KDIR)xgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMM_BETA) + $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@ + +$(SGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMONCOPY) + $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(SGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMOTCOPY) + $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) + +$(SGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMINCOPY) + $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(SGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMITCOPY) + $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +endif + +$(DGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMONCOPY) + $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(DGEMMOTCOPYOBJ_P) : 
$(KERNELDIR)/$(DGEMMOTCOPY) + $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) + +$(DGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMINCOPY) + $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(DGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMITCOPY) + $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +endif + +ifdef EXPRECISION + +$(QGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(QGEMMONCOPY) + $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +$(QGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(QGEMMOTCOPY) + $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +ifneq ($(QGEMM_UNROLL_M), $(QGEMM_UNROLL_N)) + +$(QGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(QGEMMINCOPY) + $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +$(QGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(QGEMMITCOPY) + $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +endif + +endif + +$(CGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(CGEMMONCOPY) + $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(CGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(CGEMMOTCOPY) + $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) + +$(CGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(CGEMMINCOPY) + $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(CGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(CGEMMITCOPY) + $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +endif + +$(ZGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(ZGEMMONCOPY) + $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(ZGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(ZGEMMOTCOPY) + $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) + +$(ZGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(ZGEMMINCOPY) + $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(ZGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(ZGEMMITCOPY) + $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +endif + +ifdef EXPRECISION + +$(XGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(XGEMMONCOPY) + $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +$(XGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(XGEMMOTCOPY) + $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +ifneq ($(XGEMM_UNROLL_M), $(XGEMM_UNROLL_N)) + +$(XGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(XGEMMINCOPY) + $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +$(XGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(XGEMMITCOPY) + $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +endif + +endif + +$(KDIR)sgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND) + $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)dgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND) + $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)qgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(QGEMMDEPEND) + $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ + +$(KDIR)cgemm_kernel_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ + +$(KDIR)cgemm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@ + +$(KDIR)cgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@ + +$(KDIR)cgemm_kernel_b$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@ + +$(KDIR)zgemm_kernel_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ + +$(KDIR)zgemm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@ + 
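+# Each complex gemm kernel is compiled four times from the same source:
+# the _n object with -DNN, _l with -DCN, _r with -DNC and _b with -DCC,
+# which select whether the left operand, the right operand, or both are
+# conjugated.
+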
+$(KDIR)zgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@ + +$(KDIR)zgemm_kernel_b$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ + +$(KDIR)xgemm_kernel_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) + $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $@ + +$(KDIR)xgemm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) + $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $@ + +$(KDIR)xgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) + $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DNC $< -o $@ + +$(KDIR)xgemm_kernel_b$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) + $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $@ + +$(KDIR)strmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ + +$(KDIR)strmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ + +$(KDIR)strmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ + +$(KDIR)strmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ + +$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ + +$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ + +$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ + +$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ + +$(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ + +$(KDIR)qtrmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ + +$(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ + +$(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ + +$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ + +$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ + +$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA 
-UCONJ -DNN $< -o $@ + +$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ + +$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ + +$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ + +$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ + +$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ + +$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ + +$(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)xtrmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)xtrmm_kernel_LR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ + +$(KDIR)xtrmm_kernel_LC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ + +$(KDIR)xtrmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)xtrmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ + +$(KDIR)xtrmm_kernel_RR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ + +$(KDIR)xtrmm_kernel_RC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) + $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ + +$(KDIR)cgemm3m_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMM3MKERNEL) + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ + +$(KDIR)zgemm3m_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMM3MKERNEL) + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ + +$(KDIR)xgemm3m_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMM3MKERNEL) + $(CC) 
$(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $@ + +$(KDIR)strsm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_LN) $(STRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -DUPPER -DLN -UCONJ $< -o $@ + +$(KDIR)strsm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_LT) $(STRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -UUPPER -DLT -UCONJ $< -o $@ + +$(KDIR)strsm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_RN) $(STRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -DUPPER -DRN -UCONJ $< -o $@ + +$(KDIR)strsm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_RT) $(STRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -UUPPER -DRT -UCONJ $< -o $@ + +$(KDIR)dtrsm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LN) $(DTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DLN -UCONJ $< -o $@ + +$(KDIR)dtrsm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LT) $(DTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o $@ + +$(KDIR)dtrsm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_RN) $(DTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DRN -UCONJ $< -o $@ + +$(KDIR)dtrsm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_RT) $(DTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DRT -UCONJ $< -o $@ + +$(KDIR)qtrsm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_LN) $(QTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -DUPPER -DLN -UCONJ $< -o $@ + +$(KDIR)qtrsm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_LT) $(QTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -UUPPER -DLT -UCONJ $< -o $@ + +$(KDIR)qtrsm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_RN) $(QTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -DUPPER -DRN -UCONJ $< -o $@ + +$(KDIR)qtrsm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_RT) $(QTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -UUPPER -DRT -UCONJ $< -o $@ + +$(KDIR)ctrsm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LN) $(CTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DLN -UCONJ $< -o $@ + +$(KDIR)ctrsm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LT) $(CTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DLT -UCONJ $< -o $@ + +$(KDIR)ctrsm_kernel_LR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LN) $(CTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DLN -DCONJ $< -o $@ + +$(KDIR)ctrsm_kernel_LC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LT) $(CTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DLT -DCONJ $< -o $@ + +$(KDIR)ctrsm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RN) $(CTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DRN -UCONJ $< -o $@ + +$(KDIR)ctrsm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RT) $(CTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DRT -UCONJ $< -o $@ + +$(KDIR)ctrsm_kernel_RR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RN) $(CTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DRN -DCONJ $< -o $@ + +$(KDIR)ctrsm_kernel_RC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RT) $(CTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX 
-UDOUBLE -UUPPER -DRT -DCONJ $< -o $@ + +$(KDIR)ztrsm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LN) $(ZTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DLN -UCONJ $< -o $@ + +$(KDIR)ztrsm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LT) $(ZTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o $@ + +$(KDIR)ztrsm_kernel_LR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LN) $(ZTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DLN -DCONJ $< -o $@ + +$(KDIR)ztrsm_kernel_LC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LT) $(ZTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DLT -DCONJ $< -o $@ + +$(KDIR)ztrsm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RN) $(ZTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DRN -UCONJ $< -o $@ + +$(KDIR)ztrsm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RT) $(ZTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DRT -UCONJ $< -o $@ + +$(KDIR)ztrsm_kernel_RR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RN) $(ZTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DRN -DCONJ $< -o $@ + +$(KDIR)ztrsm_kernel_RC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RT) $(ZTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DRT -DCONJ $< -o $@ + +$(KDIR)xtrsm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LN) $(XTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DLN -UCONJ $< -o $@ + +$(KDIR)xtrsm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LT) $(XTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DLT -UCONJ $< -o $@ + +$(KDIR)xtrsm_kernel_LR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LN) $(XTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DLN -DCONJ $< -o $@ + +$(KDIR)xtrsm_kernel_LC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LT) $(XTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DLT -DCONJ $< -o $@ + +$(KDIR)xtrsm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RN) $(XTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DRN -UCONJ $< -o $@ + +$(KDIR)xtrsm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RT) $(XTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DRT -UCONJ $< -o $@ + +$(KDIR)xtrsm_kernel_RR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RN) $(XTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DRN -DCONJ $< -o $@ + +$(KDIR)xtrsm_kernel_RC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RT) $(XTRSMDEPEND) + $(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DRT -DCONJ $< -o $@ + + +$(KDIR)strmm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strmm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)strmm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strmm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) 
$(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)strmm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strmm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)strmm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strmm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)strmm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strmm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)strmm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strmm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)strmm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strmm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)strmm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strmm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)dtrmm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)dtrmm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)dtrmm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)dtrmm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE 
-UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)dtrmm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)dtrmm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)dtrmm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)dtrmm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrmm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)qtrmm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)qtrmm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)qtrmm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)qtrmm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)qtrmm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)qtrmm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)qtrmm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)qtrmm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)qtrmm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)qtrmm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER 
-UUNIT $< -o $@ + +$(KDIR)qtrmm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)qtrmm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)qtrmm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)qtrmm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)qtrmm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)qtrmm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ctrmm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ctrmm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ctrmm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ctrmm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ctrmm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ctrmm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ctrmm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + 
+$(KDIR)ctrmm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ctrmm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrmm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ztrmm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ztrmm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ztrmm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ztrmm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ztrmm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ztrmm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ztrmm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ztrmm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrmm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + 
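+# The .$(PSUFFIX) objects are the profiled builds, compiled with $(PFLAGS)
+# instead of $(CFLAGS).  The trmm copy names follow the same i/o (inner/outer),
+# u/l (upper/lower), n/t (normal/transposed) and u/n (unit/non-unit diagonal)
+# scheme as the trsm copies.
+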
+$(KDIR)xtrmm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)xtrmm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)xtrmm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)xtrmm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)xtrmm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)xtrmm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)xtrmm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)xtrmm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)xtrmm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)xtrmm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)xtrmm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)xtrmm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)xtrmm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)xtrmm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)xtrmm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)xtrmm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ssymm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@ + +$(KDIR)ssymm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@ + +$(KDIR)ssymm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ + 
+$(KDIR)ssymm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ + +$(KDIR)dsymm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@ + +$(KDIR)dsymm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@ + +$(KDIR)dsymm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ + +$(KDIR)dsymm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ + +$(KDIR)qsymm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_ucopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@ + +$(KDIR)qsymm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_lcopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@ + +$(KDIR)qsymm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_ucopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@ + +$(KDIR)qsymm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_lcopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@ + +$(KDIR)csymm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_ucopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@ + +$(KDIR)csymm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_lcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@ + +$(KDIR)csymm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_ucopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ + +$(KDIR)csymm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_lcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ + +$(KDIR)zsymm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@ + +$(KDIR)zsymm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_lcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@ + +$(KDIR)zsymm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ + +$(KDIR)zsymm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_lcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ + +$(KDIR)xsymm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_ucopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@ + +$(KDIR)xsymm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_lcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@ + +$(KDIR)xsymm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_ucopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ + 
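+# symm/hemm copy objects: o/i selects outer (GEMM_UNROLL_N) or inner
+# (GEMM_UNROLL_M) blocking, ut/lt selects the upper- or lower-triangular copy
+# source.  The hemm recipes pass $< ahead of the trailing -ULOWER/-DLOWER
+# switch; the compiler applies the define regardless of its position.
+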
+$(KDIR)xsymm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_lcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ + +$(KDIR)chemm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@ + +$(KDIR)chemm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@ + +$(KDIR)chemm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ + +$(KDIR)chemm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ + +$(KDIR)zhemm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@ + +$(KDIR)zhemm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@ + +$(KDIR)zhemm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ + +$(KDIR)zhemm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ + +$(KDIR)xhemm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@ + +$(KDIR)xhemm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@ + +$(KDIR)xhemm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ + +$(KDIR)xhemm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ + +$(KDIR)cgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_N).c + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)cgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_N).c + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)cgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_N).c + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)cgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_N).c + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)cgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_N).c + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)cgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_N).c + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)cgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_M).c + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ + +$(KDIR)cgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_M).c + $(CC) $(PFLAGS) -c 
-UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)cgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_M).c + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)cgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_M).c + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ + +$(KDIR)cgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_M).c + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)cgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_M).c + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_N).c + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)zgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_N).c + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_N).c + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_N).c + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)zgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_N).c + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_N).c + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_M).c + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ + +$(KDIR)zgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_M).c + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_M).c + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_M).c + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ + +$(KDIR)zgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_M).c + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_M).c + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_N).c + $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)xgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_N).c + $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_N).c + $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_N).c + $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)xgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_N).c + $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + 
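The *gemm3m copy rules build three variants of each packing routine (the b, r and i suffixes) because the 3M scheme forms a complex product from three real multiplications. Writing A = Ar + i*Ai and B = Br + i*Bi, one standard formulation computes Ar*Br, Ai*Bi and (Ar+Ai)*(Br+Bi) and recombines them, which is presumably why the copies can be built to extract the real plane (REAL_ONLY), the imaginary plane (IMAGE_ONLY) or a combined plane. A scalar sketch of that identity, as an illustration only, not the blocked kernel:

    #include <complex.h>

    /* 3M identity for a single scalar product; the copies above pack whole
       planes so that three real GEMM passes can do the same job. */
    double complex mul3m(double ar, double ai, double br, double bi) {
      double t1 = ar * br;               /* real plane product      */
      double t2 = ai * bi;               /* imaginary plane product */
      double t3 = (ar + ai) * (br + bi); /* combined plane product  */
      return (t1 - t2) + (t3 - t1 - t2) * I;
    }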
+$(KDIR)xgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_N).c + $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_M).c + $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ + +$(KDIR)xgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_M).c + $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_M).c + $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_M).c + $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@ + +$(KDIR)xgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_M).c + $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_M).c + $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)csymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)csymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)csymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)csymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)csymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)csymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)csymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)csymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)csymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)csymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)csymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)csymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zsymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE 
-DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)zsymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)zsymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zsymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zsymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zsymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zsymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)zsymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)zsymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zsymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zsymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zsymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xsymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)xsymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)xsymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xsymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xsymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xsymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xsymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)xsymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + 
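Throughout these 3m rules the outer ("o") copies are built with -DUSE_ALPHA while the inner ("i") copies use -UUSE_ALPHA (plus -DICOPY for the plain gemm3m copies), so the scalar appears to be folded into the UNROLL_N-blocked operand while it is packed, letting the downstream real kernels run with a unit scalar. A trivial sketch of alpha-folding during a packing pass (hypothetical helper, not the generic source):

    /* Fold alpha into the packed buffer; with the scaling switched off this
       degenerates to a plain copy. */
    void pack_scaled(long m, long n, const double *a, long lda,
                     double alpha, double *b) {
      for (long j = 0; j < n; j++)
        for (long i = 0; i < m; i++)
          *b++ = alpha * a[i + j * lda];
    }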
+$(KDIR)xsymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xsymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xsymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xsymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)chemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)chemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)chemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)chemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)chemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)chemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)chemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)chemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)chemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)chemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)chemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)chemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zhemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)zhemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)zhemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + 
+$(KDIR)zhemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zhemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zhemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zhemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)zhemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)zhemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zhemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)zhemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)zhemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xhemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)xhemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@ + +$(KDIR)xhemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xhemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xhemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xhemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_N).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xhemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)xhemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@ + +$(KDIR)xhemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + +$(KDIR)xhemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@ + 
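The *hemm3m copies (like the zhemm_*copy rules further up) pack Hermitian rather than symmetric matrices, so an element mirrored across the diagonal has to be conjugated and the diagonal is treated as real. A sketch of that access rule for an upper-stored Hermitian matrix, with an illustrative signature rather than the actual generic code:

    #include <complex.h>

    /* Fetch element (x,y) of a Hermitian matrix whose upper triangle is
       stored column major; the lower triangle is its conjugate mirror. */
    double complex herm_fetch(const double complex *a, long lda,
                              long x, long y) {
      if (x == y) return creal(a[x + x * lda]); /* imaginary part is zero */
      return (x < y) ? a[x + y * lda] : conj(a[y + x * lda]);
    }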
+$(KDIR)xhemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)xhemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_M).c + $(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ + +$(KDIR)strsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strsm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)strsm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strsm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)strsm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strsm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)strsm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strsm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)strsm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strsm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)strsm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strsm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)strsm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)strsm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)strsm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)strsm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)dtrsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + 
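The trsm copy rules add two more switches: UNIT marks a unit-diagonal triangular factor (the extra u/n letter in the object name), and the generic source name records which triangle is stored and whether the operand is transposed (uncopy = upper non-transposed, ltcopy = lower transposed, and so on). The interesting part of a TRSM pack is the diagonal: in the non-unit case the generic copies appear to store reciprocals of the diagonal entries so the solve kernel can multiply instead of divide. The fragment below only illustrates that idea and is not the generic trsm_*copy file:

    /* Diagonal handling while packing a triangular operand. */
    void trsm_pack_diag(long n, const double *a, long lda, double *diag) {
      for (long j = 0; j < n; j++) {
    #ifdef UNIT
        diag[j] = 1.0;                  /* unit diagonal, nothing to invert */
    #else
        diag[j] = 1.0 / a[j + j * lda]; /* store the reciprocal */
    #endif
      }
    }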
+$(KDIR)dtrsm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)dtrsm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)dtrsm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)dtrsm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)dtrsm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)dtrsm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)dtrsm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)dtrsm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)dtrsm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)qtrsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)qtrsm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)qtrsm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)qtrsm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + 
+$(KDIR)qtrsm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)qtrsm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)qtrsm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)qtrsm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)qtrsm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)qtrsm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)qtrsm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)qtrsm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)qtrsm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)qtrsm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)qtrsm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)qtrsm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ctrsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ctrsm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ctrsm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ctrsm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + 
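For the complex trsm copies (ctrsm/ztrsm/xtrsm, built from the generic ztrsm_*copy sources) the same diagonal treatment needs a complex reciprocal. Purely as an illustration of the arithmetic, using the textbook formula; the library's own routines may arrange the computation differently:

    #include <complex.h>

    /* 1 / (ar + i*ai) = (ar - i*ai) / (ar^2 + ai^2) */
    double complex zinv(double ar, double ai) {
      double d = ar * ar + ai * ai;
      return (ar / d) - (ai / d) * I;
    }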
+$(KDIR)ctrsm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ctrsm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ctrsm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ctrsm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ctrsm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ctrsm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ztrsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ztrsm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ztrsm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ztrsm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ztrsm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + 
+$(KDIR)ztrsm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)ztrsm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)ztrsm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)ztrsm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)xtrsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)xtrsm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)xtrsm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)xtrsm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)xtrsm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)xtrsm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)xtrsm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)xtrsm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)xtrsm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + +$(KDIR)xtrsm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)xtrsm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)xtrsm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ + +$(KDIR)xtrsm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ + 
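Each operand in this file is packed in two layouts, one blocked by GEMM_UNROLL_M (the "i" copies, for the side the micro-kernel walks along M) and one by GEMM_UNROLL_N (the "o" copies, for the side walked along N). The sketch below is only a rough picture of how a blocked level-3 driver might consume the two flavours; every name in it is hypothetical, and the real drivers also block over K and distribute the loops across threads:

    #define M_BLOCK 128
    #define N_BLOCK 256

    /* Stubs standing in for the packing routines and the compute kernel. */
    void ocopy_pack_B(long k, long n) { (void)k; (void)n; } /* UNROLL_N layout */
    void icopy_pack_A(long m, long k) { (void)m; (void)k; } /* UNROLL_M layout */
    void micro_kernel(long m, long n, long k) { (void)m; (void)n; (void)k; }

    void blocked_driver_sketch(long m, long n, long k) {
      for (long jj = 0; jj < n; jj += N_BLOCK) {
        long nb = (n - jj < N_BLOCK) ? n - jj : N_BLOCK;
        ocopy_pack_B(k, nb);              /* pack a panel of B once...    */
        for (long ii = 0; ii < m; ii += M_BLOCK) {
          long mb = (m - ii < M_BLOCK) ? m - ii : M_BLOCK;
          icopy_pack_A(mb, k);            /* ...and blocks of A inside it */
          micro_kernel(mb, nb, k);
        }
      }
    }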
+$(KDIR)xtrsm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@ + +$(KDIR)xtrsm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@ + +$(KDIR)xtrsm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ diff --git a/kernel/Makefile.LA b/kernel/Makefile.LA new file mode 100644 index 0000000000..496d05cf63 --- /dev/null +++ b/kernel/Makefile.LA @@ -0,0 +1,48 @@ +SBLASOBJS += sneg_tcopy$(TSUFFIX).$(SUFFIX) slaswp_ncopy$(TSUFFIX).$(SUFFIX) + +DBLASOBJS += dneg_tcopy$(TSUFFIX).$(SUFFIX) dlaswp_ncopy$(TSUFFIX).$(SUFFIX) + +QBLASOBJS += qneg_tcopy$(TSUFFIX).$(SUFFIX) qlaswp_ncopy$(TSUFFIX).$(SUFFIX) + +CBLASOBJS += cneg_tcopy$(TSUFFIX).$(SUFFIX) claswp_ncopy$(TSUFFIX).$(SUFFIX) + +ZBLASOBJS += zneg_tcopy$(TSUFFIX).$(SUFFIX) zlaswp_ncopy$(TSUFFIX).$(SUFFIX) + +XBLASOBJS += xneg_tcopy$(TSUFFIX).$(SUFFIX) xlaswp_ncopy$(TSUFFIX).$(SUFFIX) + +$(KDIR)sneg_tcopy$(TSUFFIX).$(SUFFIX) $(KDIR)sneg_tcopy$(TSUFFIX).$(PSUFFIX) : generic/neg_tcopy_$(SGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $< -o $@ + +$(KDIR)dneg_tcopy$(TSUFFIX).$(SUFFIX) $(KDIR)dneg_tcopy$(TSUFFIX).$(PSUFFIX) : generic/neg_tcopy_$(DGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $< -o $@ + +$(KDIR)qneg_tcopy$(TSUFFIX).$(SUFFIX) $(KDIR)qneg_tcopy$(TSUFFIX).$(PSUFFIX) : generic/neg_tcopy_$(QGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $< -o $@ + +$(KDIR)cneg_tcopy$(TSUFFIX).$(SUFFIX) $(KDIR)cneg_tcopy$(TSUFFIX).$(PSUFFIX) : generic/zneg_tcopy_$(CGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $< -o $@ + +$(KDIR)zneg_tcopy$(TSUFFIX).$(SUFFIX) $(KDIR)zneg_tcopy$(TSUFFIX).$(PSUFFIX) : generic/zneg_tcopy_$(ZGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $< -o $@ + +$(KDIR)xneg_tcopy$(TSUFFIX).$(SUFFIX) $(KDIR)xneg_tcopy$(TSUFFIX).$(PSUFFIX) : generic/zneg_tcopy_$(XGEMM_UNROLL_M).c + $(CC) -c $(CFLAGS) $< -o $@ + +$(KDIR)slaswp_ncopy$(TSUFFIX).$(SUFFIX) $(KDIR)slaswp_ncopy$(TSUFFIX).$(PSUFFIX) : generic/laswp_ncopy_$(SGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $< -o $@ + +$(KDIR)dlaswp_ncopy$(TSUFFIX).$(SUFFIX) $(KDIR)dlaswp_ncopy$(TSUFFIX).$(PSUFFIX) : generic/laswp_ncopy_$(DGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $< -o $@ + +$(KDIR)qlaswp_ncopy$(TSUFFIX).$(SUFFIX) $(KDIR)qlaswp_ncopy$(TSUFFIX).$(PSUFFIX) : generic/laswp_ncopy_$(QGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $< -o $@ + +$(KDIR)claswp_ncopy$(TSUFFIX).$(SUFFIX) $(KDIR)claswp_ncopy$(TSUFFIX).$(PSUFFIX) : generic/zlaswp_ncopy_$(CGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $< -o $@ + +$(KDIR)zlaswp_ncopy$(TSUFFIX).$(SUFFIX) $(KDIR)zlaswp_ncopy$(TSUFFIX).$(PSUFFIX) : generic/zlaswp_ncopy_$(ZGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $< -o $@ + +$(KDIR)xlaswp_ncopy$(TSUFFIX).$(SUFFIX) $(KDIR)xlaswp_ncopy$(TSUFFIX).$(PSUFFIX) : generic/zlaswp_ncopy_$(XGEMM_UNROLL_N).c + $(CC) -c $(CFLAGS) $< -o $@ + diff --git a/kernel/alpha/KERNEL b/kernel/alpha/KERNEL new file mode 100644 index 0000000000..a39ccd536e --- /dev/null +++ b/kernel/alpha/KERNEL @@ -0,0 +1,124 @@ +ifndef SAMINKERNEL +SAMINKERNEL = amax.S +endif + +ifndef DAMINKERNEL +DAMINKERNEL = amax.S +endif + +ifndef CAMINKERNEL +CAMINKERNEL = zamax.S +endif + +ifndef ZAMINKERNEL +ZAMINKERNEL = zamax.S +endif + +ifndef SMINKERNEL +SMINKERNEL = max.S +endif + +ifndef DMINKERNEL +DMINKERNEL = max.S +endif + +ifndef ISAMINKERNEL +ISAMINKERNEL = iamax.S 
+endif + +ifndef IDAMINKERNEL +IDAMINKERNEL = iamax.S +endif + +ifndef ICAMINKERNEL +ICAMINKERNEL = izamax.S +endif + +ifndef IZAMINKERNEL +IZAMINKERNEL = izamax.S +endif + +ifndef ISMINKERNEL +ISMINKERNEL = iamax.S +endif + +ifndef IDMINKERNEL +IDMINKERNEL = iamax.S +endif + +ifndef CCOPYKERNEL +CCOPYKERNEL = copy.S +endif + +ifndef ZCOPYKERNEL +ZCOPYKERNEL = copy.S +endif + +ifndef SNRM2KERNEL +SNRM2KERNEL = snrm2.S +endif + +ifndef DNRM2KERNEL +DNRM2KERNEL = dnrm2.S +endif + +ifndef CNRM2KERNEL +CNRM2KERNEL = cnrm2.S +endif + +ifndef ZNRM2KERNEL +ZNRM2KERNEL = znrm2.S +endif + +SGEMMKERNEL = gemm_kernel_4x4.S +SGEMM_BETA = gemm_beta.S +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMONCOPYOBJ = sgemm_oncopy.$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy.$(SUFFIX) + +DGEMMKERNEL = gemm_kernel_4x4.S +DGEMM_BETA = gemm_beta.S +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMONCOPYOBJ = dgemm_oncopy.$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy.$(SUFFIX) + +CGEMMKERNEL = zgemm_kernel_2x2.S +CGEMM_BETA = zgemm_beta.S +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy.$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy.$(SUFFIX) + +ZGEMMKERNEL = zgemm_kernel_2x2.S +ZGEMM_BETA = zgemm_beta.S +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy.$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy.$(SUFFIX) + +SGEMM_BETA = gemm_beta.S +DGEMM_BETA = gemm_beta.S +CGEMM_BETA = zgemm_beta.S +ZGEMM_BETA = zgemm_beta.S + +STRSMKERNEL_LN = trsm_kernel_4x4_LN.S +STRSMKERNEL_LT = trsm_kernel_4x4_LT.S +STRSMKERNEL_RN = trsm_kernel_4x4_LT.S +STRSMKERNEL_RT = trsm_kernel_4x4_RT.S + +DTRSMKERNEL_LN = trsm_kernel_4x4_LN.S +DTRSMKERNEL_LT = trsm_kernel_4x4_LT.S +DTRSMKERNEL_RN = trsm_kernel_4x4_LT.S +DTRSMKERNEL_RT = trsm_kernel_4x4_RT.S + +CTRSMKERNEL_LN = ztrsm_kernel_2x2_LN.S +CTRSMKERNEL_LT = ztrsm_kernel_2x2_LT.S +CTRSMKERNEL_RN = ztrsm_kernel_2x2_LT.S +CTRSMKERNEL_RT = ztrsm_kernel_2x2_RT.S + +ZTRSMKERNEL_LN = ztrsm_kernel_2x2_LN.S +ZTRSMKERNEL_LT = ztrsm_kernel_2x2_LT.S +ZTRSMKERNEL_RN = ztrsm_kernel_2x2_LT.S +ZTRSMKERNEL_RT = ztrsm_kernel_2x2_RT.S diff --git a/kernel/alpha/Makefile b/kernel/alpha/Makefile new file mode 100644 index 0000000000..efae70d7b7 --- /dev/null +++ b/kernel/alpha/Makefile @@ -0,0 +1,2 @@ +clean :: + diff --git a/kernel/alpha/amax.S b/kernel/alpha/amax.S new file mode 100644 index 0000000000..e528adc072 --- /dev/null +++ b/kernel/alpha/amax.S @@ -0,0 +1,283 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define N $16 +#define X $17 +#define INCX $18 + +#ifndef USE_MIN +#define CMPLT(a, b) cmptlt a, b +#else +#define CMPLT(a, b) cmptlt b, a +#endif + +#define STACKSIZE 6 * 8 + + PROLOGUE + PROFCODE + .frame $sp, STACKSIZE, $26, 0 + + lda $sp, -STACKSIZE($sp) + nop + .align 4 + + stt $f2, 0($sp) + fclr $f16 + cmplt $31, N, $2 + unop + + stt $f3, 8($sp) + fclr $f17 + cmplt $31, INCX, $3 + unop + + stt $f4, 16($sp) + fclr $f18 + SXADDQ INCX, $31, INCX + unop + + stt $f5, 24($sp) + fclr $f19 + and $2, $3, $0 + unop + + stt $f6, 32($sp) + fclr $f0 + sra N, 3, $1 + beq $0, $End # if (n <= 0) or (incx <= 0) return + .align 4 + + LD $f20, 0 * SIZE(X) + unop + fabs $f20, $f0 + ble $1, $L15 + .align 4 + + fabs $f20, $f1 + unop + addq X, INCX, X + unop + + LD $f21, 0 * SIZE(X) + fabs $f20, $f2 + addq X, INCX, X + unop + + LD $f22, 0 * SIZE(X) + fabs $f20, $f3 + addq X, INCX, X + unop + + LD $f23, 0 * SIZE(X) + fabs $f20, $f4 + addq X, INCX, X + unop + + LD $f24, 0 * SIZE(X) + addq X, INCX, X + fabs $f20, $f5 + unop + + LD $f25, 0 * SIZE(X) + fabs $f20, $f6 + addq X, INCX, X + unop + + LD $f26, 0 * SIZE(X) + fabs $f20, $f28 + addq X, INCX, X + lda $1, -1($1) + + LD $f27, 0 * SIZE(X) + unop + addq X, INCX, X + ble $1, $L13 + .align 4 + +$L12: + fcmovne $f16, $f12, $f4 + unop + fabs $f20, $f29 + ldl $31, 56 * SIZE(X) + + fcmovne $f17, $f13, $f5 + LD $f20, 0 * SIZE(X) + fabs $f21, $f30 + addq X, INCX, X + + fcmovne $f18, $f14, $f6 + LD $f21, 0 * SIZE(X) + fabs $f22, $f10 + addq X, INCX, X + + fcmovne $f19, $f15, $f28 + LD $f22, 0 * SIZE(X) + fabs $f23, $f11 + addq X, INCX, X + + fabs $f24, $f12 + LD $f23, 0 * SIZE(X) + CMPLT($f0, $f29), $f16 + addq X, INCX, X + + fabs $f25, $f13 + LD $f24, 0 * SIZE(X) + CMPLT($f1, $f30), $f17 + addq X, INCX, X + + fabs $f26, $f14 + LD $f25, 0 * SIZE(X) + CMPLT($f2, $f10), $f18 + addq X, INCX, X + + fabs $f27, $f15 + LD $f26, 0 * SIZE(X) + CMPLT($f3, $f11), $f19 + addq X, INCX, X + + fcmovne $f16, $f29, $f0 + LD $f27, 0 * SIZE(X) + CMPLT($f4, $f12), $f16 + addq X, INCX, X + + fcmovne $f17, $f30, $f1 + unop + CMPLT($f5, $f13), $f17 + lda $1, -1($1) # i -- + + fcmovne $f18, $f10, $f2 + unop + CMPLT($f6, $f14), $f18 + unop + + fcmovne $f19, $f11, $f3 + unop + CMPLT($f28, $f15), $f19 + bgt $1,$L12 + .align 4 + +$L13: + 
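+/* Loop drain: the last in-flight absolute values are folded in, then the
+   eight partial results held in $f0-$f6 and $f28 are reduced pairwise
+   until the answer is left in $f0 (with USE_MIN the reversed CMPLT turns
+   the same code into a minimum search). */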
fcmovne $f16, $f12, $f4 + fabs $f20, $f29 + fcmovne $f17, $f13, $f5 + fabs $f21, $f30 + + fcmovne $f18, $f14, $f6 + fabs $f22, $f10 + fcmovne $f19, $f15, $f28 + fabs $f23, $f11 + + fabs $f24, $f12 + CMPLT($f0, $f29), $f16 + fabs $f25, $f13 + CMPLT($f1, $f30), $f17 + + fabs $f26, $f14 + CMPLT($f2, $f10), $f18 + fabs $f27, $f15 + CMPLT($f3, $f11), $f19 + + fcmovne $f16, $f29, $f0 + CMPLT($f4, $f12), $f16 + fcmovne $f17, $f30, $f1 + CMPLT($f5, $f13), $f17 + + fcmovne $f18, $f10, $f2 + CMPLT($f6, $f14), $f18 + fcmovne $f19, $f11, $f3 + CMPLT($f28, $f15), $f19 + + fcmovne $f16, $f12, $f4 + CMPLT($f0, $f1), $f16 + fcmovne $f17, $f13, $f5 + CMPLT($f2, $f3), $f17 + + fcmovne $f18, $f14, $f6 + CMPLT($f4, $f5), $f18 + fcmovne $f19, $f15, $f28 + CMPLT($f6, $f28), $f19 + + fcmovne $f16, $f1, $f0 + fcmovne $f17, $f3, $f2 + fcmovne $f18, $f5, $f4 + fcmovne $f19, $f28, $f6 + + CMPLT($f0, $f2), $f16 + CMPLT($f4, $f6), $f17 + + fcmovne $f16, $f2, $f0 + fcmovne $f17, $f6, $f4 + + CMPLT($f0, $f4), $f16 + fcmovne $f16, $f4, $f0 + .align 4 + +$L15: + and N, 7, $1 + unop + unop + ble $1, $End + .align 4 + +$L16: + LD $f20, 0 * SIZE(X) + addq X, INCX, X + + fabs $f20, $f29 + CMPLT($f0, $f29), $f16 + fcmovne $f16, $f29, $f0 + + lda $1, -1($1) # i -- + bgt $1, $L16 + .align 4 + +$End: + ldt $f2, 0($sp) + ldt $f3, 8($sp) + ldt $f4, 16($sp) + ldt $f5, 24($sp) + + ldt $f6, 32($sp) + lda $sp, STACKSIZE($sp) + ret + + EPILOGUE diff --git a/kernel/alpha/asum.S b/kernel/alpha/asum.S new file mode 100644 index 0000000000..b312d064b9 --- /dev/null +++ b/kernel/alpha/asum.S @@ -0,0 +1,206 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define PREFETCHSIZE 88 + +#define N $16 +#define X $17 +#define INCX $18 +#define I $19 + +#define s0 $f0 +#define s1 $f1 +#define s2 $f10 +#define s3 $f11 + +#define a0 $f12 +#define a1 $f13 +#define a2 $f14 +#define a3 $f15 +#define a4 $f16 +#define a5 $f17 +#define a6 $f18 +#define a7 $f19 + +#define t0 $f20 +#define t1 $f21 +#define t2 $f22 +#define t3 $f23 + + PROLOGUE + PROFCODE + + fclr s0 + unop + fclr t0 + ble N, $L999 + + sra N, 3, I + fclr s1 + fclr s2 + ble I, $L15 + + LD a0, 0 * SIZE(X) + fclr t1 + SXADDQ INCX, X, X + fclr t2 + + LD a1, 0 * SIZE(X) + fclr t3 + SXADDQ INCX, X, X + fclr s3 + + LD a2, 0 * SIZE(X) + SXADDQ INCX, X, X + LD a3, 0 * SIZE(X) + SXADDQ INCX, X, X + + LD a4, 0 * SIZE(X) + SXADDQ INCX, X, X + LD a5, 0 * SIZE(X) + SXADDQ INCX, X, X + + lda I, -1(I) + ble I, $L13 + .align 4 + +$L12: + ADD s0, t0, s0 + ldl $31, PREFETCHSIZE * 2 * SIZE(X) + fabs a0, t0 + lda I, -1(I) + + ADD s1, t1, s1 + LD a6, 0 * SIZE(X) + fabs a1, t1 + SXADDQ INCX, X, X + + ADD s2, t2, s2 + LD a7, 0 * SIZE(X) + fabs a2, t2 + SXADDQ INCX, X, X + + ADD s3, t3, s3 + LD a0, 0 * SIZE(X) + fabs a3, t3 + SXADDQ INCX, X, X + + ADD s0, t0, s0 + LD a1, 0 * SIZE(X) + fabs a4, t0 + SXADDQ INCX, X, X + + ADD s1, t1, s1 + LD a2, 0 * SIZE(X) + fabs a5, t1 + SXADDQ INCX, X, X + + ADD s2, t2, s2 + LD a3, 0 * SIZE(X) + fabs a6, t2 + SXADDQ INCX, X, X + + ADD s3, t3, s3 + LD a4, 0 * SIZE(X) + fabs a7, t3 + SXADDQ INCX, X, X + + LD a5, 0 * SIZE(X) + unop + SXADDQ INCX, X, X + bne I, $L12 + .align 4 + +$L13: + ADD s0, t0, s0 + LD a6, 0 * SIZE(X) + fabs a0, t0 + SXADDQ INCX, X, X + + ADD s1, t1, s1 + LD a7, 0 * SIZE(X) + fabs a1, t1 + SXADDQ INCX, X, X + + ADD s2, t2, s2 + fabs a2, t2 + ADD s3, t3, s3 + fabs a3, t3 + + ADD s0, t0, s0 + fabs a4, t0 + ADD s1, t1, s1 + fabs a5, t1 + ADD s2, t2, s2 + fabs a6, t2 + ADD s3, t3, s3 + fabs a7, t3 + + ADD s1, t1, s1 + ADD s2, t2, s2 + ADD s3, t3, s3 + + ADD s0, s1, s0 + ADD s2, s3, s2 + .align 4 + +$L15: + and N, 7, I + ADD s0, s2, s0 + unop + ble I, $L999 + .align 4 + +$L17: + ADD s0, t0, s0 + LD a0, 0 * SIZE(X) + SXADDQ INCX, X, X + fabs a0, t0 + + lda I, -1(I) + bne I, $L17 + .align 4 + +$L999: + ADD s0, t0, s0 + ret + EPILOGUE diff --git a/kernel/alpha/axpy.S b/kernel/alpha/axpy.S new file mode 100644 index 0000000000..1007b063b6 --- /dev/null +++ b/kernel/alpha/axpy.S @@ -0,0 +1,428 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define PREFETCHSIZE 40 + + PROLOGUE + PROFCODE + .frame $sp, 16, $26, 0 + + ldq $24, 0($sp) + fmov $f19, $f30 + ldl $23, 8($sp) + lda $sp, -16($sp) +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + nop + sra $16, 3, $1 + stt $f2, 0($sp) + cmpeq $21, 1, $3 + + stt $f3, 8($sp) + cmpeq $23, 1, $4 + and $16, 7, $2 + ble $16, $End + + and $3, $4, $3 + fbeq $f30, $End + + beq $3, $Sub + ble $1, $Remain + .align 4 + + LD $f10, 0*SIZE($20) + LD $f11, 1*SIZE($20) + LD $f12, 2*SIZE($20) + LD $f13, 3*SIZE($20) + + LD $f18, 0*SIZE($24) + LD $f19, 1*SIZE($24) + LD $f20, 2*SIZE($24) + LD $f21, 3*SIZE($24) + + LD $f14, 4*SIZE($20) + LD $f15, 5*SIZE($20) + LD $f16, 6*SIZE($20) + LD $f17, 7*SIZE($20) + + LD $f22, 4*SIZE($24) + LD $f23, 5*SIZE($24) + LD $f24, 6*SIZE($24) + LD $f25, 7*SIZE($24) + + subq $1, 1, $1 + addq $20, 8*SIZE, $20 + unop + ble $1, $LoopEnd + .align 4 + +$Loop: + ldt $f31, PREFETCHSIZE * SIZE($24) + ldl $31, PREFETCHSIZE * SIZE($20) + + MUL $f30, $f10, $f26 # ctemp1 = da * atemp1 + LD $f10, 0*SIZE($20) + MUL $f30, $f11, $f27 + LD $f11, 1*SIZE($20) + + MUL $f30, $f12, $f28 + LD $f12, 2*SIZE($20) + MUL $f30, $f13, $f29 + LD $f13, 3*SIZE($20) + + ADD $f18, $f26, $f0 + LD $f18, 8*SIZE($24) + MUL $f30, $f14, $f26 # ctemp1 = da * atemp1 + LD $f14, 4*SIZE($20) + + ADD $f19, $f27, $f1 + LD $f19, 9*SIZE($24) + MUL $f30, $f15, $f27 + LD $f15, 5*SIZE($20) + + ADD $f20, $f28, $f2 + LD $f20, 10*SIZE($24) + MUL $f30, $f16, $f28 + LD $f16, 6*SIZE($20) + + ADD $f21, $f29, $f3 + LD $f21, 11*SIZE($24) + MUL $f30, $f17, $f29 + LD $f17, 7*SIZE($20) + + ST $f0, 0*SIZE($24) + ADD $f22, $f26, $f0 + ST $f1, 1*SIZE($24) + ADD $f23, $f27, $f1 + + ST $f2, 2*SIZE($24) + ADD $f24, $f28, $f2 + ST $f3, 3*SIZE($24) + ADD $f25, $f29, $f3 + + LD $f22, 12*SIZE($24) + LD $f23, 13*SIZE($24) + LD $f24, 14*SIZE($24) + LD $f25, 15*SIZE($24) + + ST $f0, 4*SIZE($24) + ST $f1, 5*SIZE($24) + ST $f2, 6*SIZE($24) + ST $f3, 7*SIZE($24) + + subq $1, 1, $1 + addq $24, 8*SIZE, $24 + addq $20, 8*SIZE, $20 + bgt $1, $Loop + .align 4 + +$LoopEnd: + MUL $f30, $f10, $f26 # ctemp1 = da * atemp1 + MUL $f30, $f11, $f27 + MUL $f30, $f12, $f28 + MUL $f30, $f13, $f29 + + ADD $f18, $f26, $f0 + MUL $f30, $f14, $f26 # ctemp1 = da * atemp1 + ADD $f19, $f27, $f1 + MUL $f30, $f15, $f27 + + ADD $f20, $f28, $f2 + MUL $f30, $f16, $f28 + ADD $f21, $f29, $f3 + MUL $f30, $f17, $f29 + + ST $f0, 0*SIZE($24) + ADD $f22, $f26, $f0 + ST $f1, 1*SIZE($24) + ADD $f23, $f27, $f1 + + ST $f2, 2*SIZE($24) + ADD $f24, $f28, $f2 + ST $f3, 3*SIZE($24) + ADD $f25, $f29, $f3 + + ST 
$f0, 4*SIZE($24) + ST $f1, 5*SIZE($24) + ST $f2, 6*SIZE($24) + ST $f3, 7*SIZE($24) + addq $24, 8*SIZE, $24 + .align 4 + +$Remain: + ble $2, $End + .align 4 + +$RemainLoop: + LD $f10, 0*SIZE($20) + LD $f11, 0*SIZE($24) + addq $20, SIZE, $20 + addq $24, SIZE, $24 + + MUL $f30, $f10, $f12 + subq $2, 1, $2 + ADD $f11, $f12, $f13 + ST $f13, -1*SIZE($24) + bgt $2, $RemainLoop + .align 4 + +$End: + ldt $f2, 0($sp) + ldt $f3, 8($sp) + lda $sp, 16($sp) + ret + .align 4 + +$Sub: + SXSUBL $16, SIZE, $22 + subq $1, 1, $4 + ble $1, $SubRemain + .align 4 + + LD $f10, 0($20) + SXADDQ $21, $20, $20 + + LD $f11, 0($20) + SXADDQ $21, $20, $20 + LD $f12, 0($20) + SXADDQ $21, $20, $20 + + LD $f13, 0($20) + SXADDQ $21, $20, $20 + LD $f18, 0($24) + SXADDQ $23, $24, $22 + + LD $f19, 0($22) + SXADDQ $23, $22, $22 + LD $f20, 0($22) + SXADDQ $23, $22, $22 + + LD $f21, 0($22) + SXADDQ $23, $22, $22 + LD $f14, 0($20) + SXADDQ $21, $20, $20 + + LD $f15, 0($20) + SXADDQ $21, $20, $20 + LD $f16, 0($20) + SXADDQ $21, $20, $20 + + LD $f17, 0($20) + SXADDQ $21, $20, $20 + LD $f22, 0($22) + SXADDQ $23, $22, $22 + + LD $f23, 0($22) + SXADDQ $23, $22, $22 + LD $f24, 0($22) + SXADDQ $23, $22, $22 + + LD $f25, 0($22) + SXADDQ $23, $22, $22 + unop + ble $4, $SubLoopEnd + .align 4 + +$SubLoop: + MUL $f30, $f10, $f26 # ctemp1 = da * atemp1 + LD $f10, 0($20) + unop + SXADDQ $21, $20, $20 + + MUL $f30, $f11, $f27 + LD $f11, 0($20) + unop + SXADDQ $21, $20, $20 + + MUL $f30, $f12, $f28 + LD $f12, 0($20) + unop + SXADDQ $21, $20, $20 + + MUL $f30, $f13, $f29 + LD $f13, 0($20) + unop + SXADDQ $21, $20, $20 + + ADD $f18, $f26, $f0 + MUL $f30, $f14, $f26 # ctemp1 = da * atemp1 + LD $f14, 0($20) + SXADDQ $21, $20, $20 + + ADD $f19, $f27, $f1 + MUL $f30, $f15, $f27 + LD $f15, 0($20) + SXADDQ $21, $20, $20 + + ADD $f20, $f28, $f2 + MUL $f30, $f16, $f28 + LD $f16, 0($20) + SXADDQ $21, $20, $20 + + ADD $f21, $f29, $f3 + MUL $f30, $f17, $f29 + LD $f17, 0($20) + SXADDQ $21, $20, $20 + + ST $f0, 0($24) + SXADDQ $23, $24, $24 + ADD $f22, $f26, $f0 + unop + + ST $f1, 0($24) + SXADDQ $23, $24, $24 + ADD $f23, $f27, $f1 + unop + + ST $f2, 0($24) + SXADDQ $23, $24, $24 + ADD $f24, $f28, $f2 + unop + + ST $f3, 0($24) + SXADDQ $23, $24, $24 + ADD $f25, $f29, $f3 + unop + + LD $f18, 0($22) + SXADDQ $23, $22, $22 + LD $f19, 0($22) + SXADDQ $23, $22, $22 + + LD $f20, 0($22) + SXADDQ $23, $22, $22 + LD $f21, 0($22) + SXADDQ $23, $22, $22 + + LD $f22, 0($22) + SXADDQ $23, $22, $22 + LD $f23, 0($22) + SXADDQ $23, $22, $22 + + LD $f24, 0($22) + SXADDQ $23, $22, $22 + LD $f25, 0($22) + SXADDQ $23, $22, $22 + + ST $f0, 0($24) + SXADDQ $23, $24, $24 + ST $f1, 0($24) + SXADDQ $23, $24, $24 + ST $f2, 0($24) + SXADDQ $23, $24, $24 + ST $f3, 0($24) + SXADDQ $23, $24, $24 + + subq $4, 1, $4 + bgt $4, $SubLoop + .align 4 + +$SubLoopEnd: + MUL $f30, $f10, $f26 # ctemp1 = da * atemp1 + MUL $f30, $f11, $f27 + MUL $f30, $f12, $f28 + MUL $f30, $f13, $f29 + + ADD $f18, $f26, $f0 + MUL $f30, $f14, $f26 # ctemp1 = da * atemp1 + ADD $f19, $f27, $f1 + MUL $f30, $f15, $f27 + + ADD $f20, $f28, $f2 + MUL $f30, $f16, $f28 + ADD $f21, $f29, $f3 + MUL $f30, $f17, $f29 + + ST $f0, 0($24) + SXADDQ $23, $24, $24 + ST $f1, 0($24) + SXADDQ $23, $24, $24 + + ST $f2, 0($24) + SXADDQ $23, $24, $24 + ST $f3, 0($24) + SXADDQ $23, $24, $24 + + ADD $f22, $f26, $f0 + ADD $f23, $f27, $f1 + ADD $f24, $f28, $f2 + ADD $f25, $f29, $f3 + + ST $f0, 0($24) + SXADDQ $23, $24, $24 + ST $f1, 0($24) + SXADDQ $23, $24, $24 + + ST $f2, 0($24) + SXADDQ $23, $24, $24 + ST $f3, 0($24) + SXADDQ $23, $24, $24 + 
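+/* Strided tail: the remaining n % 8 elements are processed one at a time
+   (load x, multiply by alpha in $f30, add to y, store, step by INCX/INCY). */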
.align 4 + +$SubRemain: + ble $2, $SubEnd + .align 4 + +$SubRemainLoop: + LD $f10, 0($20) + LD $f11, 0($24) + SXADDQ $21, $20, $20 + + MUL $f30, $f10, $f12 + subq $2, 1, $2 + ADD $f11, $f12, $f13 + ST $f13, 0($24) + SXADDQ $23, $24, $24 + + bgt $2, $SubRemainLoop + .align 4 + +$SubEnd: + ldt $f2, 0($sp) + ldt $f3, 8($sp) + lda $sp, 16($sp) + ret + EPILOGUE diff --git a/kernel/alpha/cabs.S b/kernel/alpha/cabs.S new file mode 100644 index 0000000000..5fa27af53e --- /dev/null +++ b/kernel/alpha/cabs.S @@ -0,0 +1,71 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + + .set noat + .set noreorder +.text + .align 5 + .globl NAME + .ent NAME +NAME: + .frame $sp, 0, $26, 0 + +#ifdef PROFILE + ldgp $gp, 0($27) + lda $28, _mcount + jsr $28, ($28), _mcount +#endif + + LD $f10, 0($16) + LD $f11, SIZE($16) +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + fabs $f10, $f12 + fabs $f11, $f0 + ADD $f12, $f0, $f0 + ret + .end NAME + .ident VERSION diff --git a/kernel/alpha/cnrm2.S b/kernel/alpha/cnrm2.S new file mode 100644 index 0000000000..03343b2aec --- /dev/null +++ b/kernel/alpha/cnrm2.S @@ -0,0 +1,426 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#include "version.h" + +#define PREFETCH_SIZE 80 + +#define N $16 +#define X $17 +#define INCX $18 +#define XX $19 + +#define I $0 + +#define a0 $f0 +#define a1 $f1 +#define a2 $f10 +#define a3 $f11 +#define t0 $f12 +#define t1 $f13 +#define t2 $f14 +#define t3 $f15 + +#define x0 $f16 +#define x1 $f17 +#define x2 $f18 +#define x3 $f19 +#define x4 $f20 +#define x5 $f21 +#define x6 $f22 +#define x7 $f23 + + PROLOGUE + +#if defined(EV4) || defined(EV5) + .frame $30,16,$26,0 + .mask 0x4000000,-16 + ldah $29, 0($27) !gpdisp!1 + lda $29, 0($29) !gpdisp!1 + + lda $sp, -16($sp) + ldq $27, sqrt($29) !literal!2 + stq $26, 0($sp) + + PROFCODE + .prologue 1 +#else + PROFCODE +#endif + + fclr a0 + sll INCX, ZBASE_SHIFT, INCX + fclr a1 + ble N, $L999 + + fclr a2 + cmpeq INCX, 2 * SIZE, $0 + fclr a3 + beq $0, $L20 + + fclr t0 + sra N, 3, I + fclr t1 + ble I, $L15 + + fclr t2 + LD x0, 0 * SIZE(X) + fclr t3 + LD x1, 1 * SIZE(X) + + LD x2, 2 * SIZE(X) + LD x3, 3 * SIZE(X) + LD x4, 4 * SIZE(X) + LD x5, 5 * SIZE(X) + LD x6, 6 * SIZE(X) + LD x7, 7 * SIZE(X) + + lda I, -1(I) + ble I, $L12 + .align 4 + +$L11: + addt a0, t0, a0 + ldl $31, (PREFETCH_SIZE) * SIZE(X) + mult x0, x0, t0 + LD x0, 8 * SIZE(X) + + addt a1, t1, a1 + mov X, XX + mult x1, x1, t1 + LD x1, 9 * SIZE(X) + + addt a2, t2, a2 + unop + mult x2, x2, t2 + LD x2, 10 * SIZE(X) + + addt a3, t3, a3 + unop + mult x3, x3, t3 + LD x3, 11 * SIZE(X) + + addt a0, t0, a0 + unop + mult x4, x4, t0 + LD x4, 12 * SIZE(X) + + addt a1, t1, a1 + unop + mult x5, x5, t1 + LD x5, 13 * SIZE(X) + + addt a2, t2, a2 + unop + mult x6, x6, t2 + LD x6, 14 * SIZE(X) + + addt a3, t3, a3 + unop + mult x7, x7, t3 + LD x7, 15 * SIZE(X) + + addt a0, t0, a0 + unop + mult x0, x0, t0 + LD x0, 16 * SIZE(X) + + addt a1, t1, a1 + lda X, 16 * SIZE(X) + mult x1, x1, t1 + LD x1, 17 * SIZE(XX) + + addt a2, t2, a2 + unop + mult x2, x2, t2 + LD x2, 18 * SIZE(XX) + + addt a3, t3, a3 + unop + 
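As a reading aid (a hedged sketch, not from the GotoBLAS sources), the complex 2-norm this kernel computes reduces to the C below: single-precision real/imaginary pairs are squared and accumulated in double precision, and one square root is taken at the end. The assembly keeps four independent partial sums to hide multiply latency and performs no overflow/underflow scaling; the sketch shows only the arithmetic.

#include <math.h>
#include <stddef.h>

/* Reference: sqrt(sum_i re_i^2 + im_i^2) over n interleaved complex values.
   Two accumulators stand in for the four (a0..a3) used by the assembly. */
static float cnrm2_ref(ptrdiff_t n, const float *x, ptrdiff_t incx)
{
    double sr = 0.0, si = 0.0;                 /* double accumulation, as addt/mult do */
    for (ptrdiff_t i = 0; i < n; i++) {
        sr += (double)x[0] * x[0];             /* real part      */
        si += (double)x[1] * x[1];             /* imaginary part */
        x  += 2 * incx;
    }
    return (float)sqrt(sr + si);
}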
mult x3, x3, t3 + LD x3, 19 * SIZE(XX) + + addt a0, t0, a0 + unop + mult x4, x4, t0 + LD x4, 20 * SIZE(XX) + + addt a1, t1, a1 + lda I, -1(I) + mult x5, x5, t1 + LD x5, 21 * SIZE(XX) + + addt a2, t2, a2 + unop + mult x6, x6, t2 + LD x6, 22 * SIZE(XX) + + addt a3, t3, a3 + mult x7, x7, t3 + LD x7, 23 * SIZE(XX) + bgt I, $L11 + .align 4 + +$L12: + addt a0, t0, a0 + mov X, XX + mult x0, x0, t0 + LD x0, 8 * SIZE(X) + + addt a1, t1, a1 + unop + mult x1, x1, t1 + LD x1, 9 * SIZE(X) + + addt a2, t2, a2 + unop + mult x2, x2, t2 + LD x2, 10 * SIZE(X) + + addt a3, t3, a3 + unop + mult x3, x3, t3 + LD x3, 11 * SIZE(X) + + addt a0, t0, a0 + unop + mult x4, x4, t0 + LD x4, 12 * SIZE(XX) + + addt a1, t1, a1 + unop + mult x5, x5, t1 + LD x5, 13 * SIZE(XX) + + addt a2, t2, a2 + unop + mult x6, x6, t2 + LD x6, 14 * SIZE(XX) + + addt a3, t3, a3 + lda X, 16 * SIZE(X) + mult x7, x7, t3 + LD x7, 15 * SIZE(XX) + + addt a0, t0, a0 + mult x0, x0, t0 + addt a1, t1, a1 + mult x1, x1, t1 + + addt a2, t2, a2 + mult x2, x2, t2 + addt a3, t3, a3 + mult x3, x3, t3 + + addt a0, t0, a0 + mult x4, x4, t0 + addt a1, t1, a1 + mult x5, x5, t1 + + addt a2, t2, a2 + mult x6, x6, t2 + addt a3, t3, a3 + mult x7, x7, t3 + + addt a2, t2, a2 + addt a3, t3, a3 + .align 4 + +$L15: + and N, 7, I + ble I, $L998 + .align 4 + +$L16: + LD x0, 0 * SIZE(X) + LD x1, 1 * SIZE(X) + + lda X, 2 * SIZE(X) + + addt a0, t0, a0 + mult x0, x0, t0 + addt a1, t1, a1 + mult x1, x1, t1 + + lda I, -1(I) + bgt I, $L16 + bsr $31, $L998 + .align 4 + +$L20: + fclr t0 + sra N, 2, I + fclr t1 + ble I, $L25 + + LD x0, 0 * SIZE(X) + fclr t2 + LD x1, 1 * SIZE(X) + addq X, INCX, X + LD x2, 0 * SIZE(X) + fclr t3 + LD x3, 1 * SIZE(X) + addq X, INCX, X + + LD x4, 0 * SIZE(X) + lda I, -1(I) + LD x5, 1 * SIZE(X) + addq X, INCX, X + + LD x6, 0 * SIZE(X) + ble I, $L22 + .align 4 + +$L21: + addt a0, t0, a0 + LD x7, 1 * SIZE(X) + mult x0, x0, t0 + addq X, INCX, X + + addt a1, t1, a1 + LD x0, 0 * SIZE(X) + mult x1, x1, t1 + unop + + addt a2, t2, a2 + LD x1, 1 * SIZE(X) + mult x2, x2, t2 + addq X, INCX, X + + addt a3, t3, a3 + LD x2, 0 * SIZE(X) + mult x3, x3, t3 + unop + + addt a0, t0, a0 + LD x3, 1 * SIZE(X) + mult x4, x4, t0 + addq X, INCX, X + + addt a1, t1, a1 + LD x4, 0 * SIZE(X) + mult x5, x5, t1 + lda I, -1(I) + + addt a2, t2, a2 + LD x5, 1 * SIZE(X) + mult x6, x6, t2 + addq X, INCX, X + + addt a3, t3, a3 + LD x6, 0 * SIZE(X) + mult x7, x7, t3 + bgt I, $L21 + .align 4 + +$L22: + addt a0, t0, a0 + LD x7, 1 * SIZE(X) + mult x0, x0, t0 + addq X, INCX, X + + addt a1, t1, a1 + mult x1, x1, t1 + addt a2, t2, a2 + mult x2, x2, t2 + + addt a3, t3, a3 + mult x3, x3, t3 + addt a0, t0, a0 + mult x4, x4, t0 + + addt a1, t1, a1 + mult x5, x5, t1 + addt a2, t2, a2 + mult x6, x6, t2 + + addt a3, t3, a3 + mult x7, x7, t3 + addt a2, t2, a2 + addt a3, t3, a3 + .align 4 + +$L25: + and N, 3, I + ble I, $L998 + .align 4 + +$L26: + LD x0, 0 * SIZE(X) + lda I, -1(I) + LD x1, 1 * SIZE(X) + addq X, INCX, X + + addt a0, t0, a0 + mult x0, x0, t0 + addt a1, t1, a1 + mult x1, x1, t1 + + bgt I, $L26 + .align 4 + + +$L998: + addt a0, t0, a0 + addt a1, t1, a1 + + addt a0, a1, a0 + addt a2, a3, a2 + +#if defined(EV4) || defined(EV5) + addt a0, a2, $f16 + jsr $26, ($27), sqrt !lituse_jsr!2 + + ldah $29, 0($26) !gpdisp!3 + lda $29, 0($29) !gpdisp!3 +#else + addt a0, a2, a0 + sqrtt a0, a0 +#endif + .align 4 + +$L999: +#if defined(EV4) || defined(EV5) + ldq $26, 0($sp) + lda $sp, 16($sp) +#endif + ret + EPILOGUE diff --git a/kernel/alpha/copy.S b/kernel/alpha/copy.S new file mode 100644 index 
0000000000..749039c9ea --- /dev/null +++ b/kernel/alpha/copy.S @@ -0,0 +1,379 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define N $16 +#define X $17 +#define INCX $18 +#define Y $19 +#define INCY $20 + + PROLOGUE + PROFCODE + .frame $sp, 0, $26, 0 + +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + cmpeq INCX, 1, $0 + ble N, $End +#ifndef COMPLEX + sra N, 4, $4 +#else + sra N, 3, $4 +#endif + cmpeq INCY, 1, $1 + + and $0, $1, $0 + beq $0, $Sub +#ifndef COMPLEX + and N, 15, $5 +#else + and N, 7, $5 +#endif + ble $4, $Remain + + LD $f10, 0*SIZE(X) + LD $f11, 1*SIZE(X) + LD $f12, 2*SIZE(X) + LD $f13, 3*SIZE(X) + LD $f14, 4*SIZE(X) + LD $f15, 5*SIZE(X) + LD $f16, 6*SIZE(X) + LD $f17, 7*SIZE(X) + + LD $f18, 8*SIZE(X) + LD $f19, 9*SIZE(X) + LD $f20, 10*SIZE(X) + LD $f21, 11*SIZE(X) + LD $f22, 12*SIZE(X) + LD $f23, 13*SIZE(X) + LD $f24, 14*SIZE(X) + LD $f25, 15*SIZE(X) + + subq $4, 1, $4 + lda X, 16*SIZE(X) + ble $4, $MainLoopEnd + .align 4 + +$MainLoop: + ST $f10, 0*SIZE(Y) + ST $f11, 1*SIZE(Y) + ST $f12, 2*SIZE(Y) + ST $f13, 3*SIZE(Y) + + LD $f10, 0*SIZE(X) + LD $f11, 1*SIZE(X) + LD $f12, 2*SIZE(X) + LD $f13, 3*SIZE(X) + + ST $f14, 4*SIZE(Y) + ST $f15, 5*SIZE(Y) + ST $f16, 6*SIZE(Y) + ST $f17, 7*SIZE(Y) + + LD $f14, 4*SIZE(X) + LD $f15, 5*SIZE(X) + LD $f16, 6*SIZE(X) + LD $f17, 7*SIZE(X) + + ST $f18, 8*SIZE(Y) + ST $f19, 9*SIZE(Y) + ST $f20, 10*SIZE(Y) + ST $f21, 11*SIZE(Y) + + LD $f18, 8*SIZE(X) + LD $f19, 9*SIZE(X) + LD $f20, 10*SIZE(X) + LD $f21, 11*SIZE(X) + + ST $f22, 12*SIZE(Y) + ST $f23, 13*SIZE(Y) + ST $f24, 14*SIZE(Y) + ST $f25, 15*SIZE(Y) + + LD $f22, 12*SIZE(X) + LD $f23, 13*SIZE(X) + LD $f24, 14*SIZE(X) + LD $f25, 15*SIZE(X) + + subq $4, 1, $4 + lda Y, 16*SIZE(Y) + lda X, 16*SIZE(X) + bgt $4, $MainLoop + .align 4 + +$MainLoopEnd: + ST $f10, 0*SIZE(Y) + ST $f11, 1*SIZE(Y) + ST $f12, 2*SIZE(Y) + ST $f13, 3*SIZE(Y) + ST $f14, 4*SIZE(Y) + ST $f15, 5*SIZE(Y) + ST $f16, 6*SIZE(Y) + ST $f17, 7*SIZE(Y) + + ST $f18, 8*SIZE(Y) + ST $f19, 9*SIZE(Y) + ST $f20, 10*SIZE(Y) + ST $f21, 11*SIZE(Y) + ST $f22, 12*SIZE(Y) + ST $f23, 13*SIZE(Y) + ST $f24, 14*SIZE(Y) + ST $f25, 15*SIZE(Y) + + lda Y, 16*SIZE(Y) + .align 4 + +$Remain: + ble $5, $End + .align 4 + +$RemainLoop: +#ifndef COMPLEX + LD $f10, 0*SIZE(X) + lda X, 1*SIZE(X) + ST $f10, 0*SIZE(Y) + lda Y, 1*SIZE(Y) +#else + LD $f10, 0*SIZE(X) + LD $f11, 1*SIZE(X) + lda X, 2*SIZE(X) + ST $f10, 0*SIZE(Y) + ST $f11, 1*SIZE(Y) + lda Y, 2*SIZE(Y) +#endif + subq $5, 1, $5 + bgt $5, $RemainLoop + .align 4 +$End: + ret + .align 4 + +$Sub: +#ifdef COMPLEX + addq INCX, INCX, INCX + addq INCY, INCY, INCY + and N, 7, $5 +#else + and N, 15, $5 +#endif + ble $4, $SubRemain + .align 4 + +$SubMainLoop: +#ifndef COMPLEX + LD $f10, 0(X) + SXADDQ INCX, X, X + LD $f11, 0(X) + SXADDQ INCX, X, X + + LD $f12, 0(X) + SXADDQ INCX, X, X + LD $f13, 0(X) + SXADDQ INCX, X, X + + LD $f14, 0(X) + SXADDQ INCX, X, X + LD $f15, 0(X) + SXADDQ INCX, X, X + + LD $f16, 0(X) + SXADDQ INCX, X, X + LD $f17, 0(X) + SXADDQ INCX, X, X + + LD $f18, 0(X) + SXADDQ INCX, X, X + LD $f19, 0(X) + SXADDQ INCX, X, X + + LD $f20, 0(X) + SXADDQ INCX, X, X + LD $f21, 0(X) + SXADDQ INCX, X, X + + LD $f22, 0(X) + SXADDQ INCX, X, X + LD $f23, 0(X) + SXADDQ INCX, X, X + + LD $f24, 0(X) + SXADDQ INCX, X, X + LD $f25, 0(X) + SXADDQ INCX, X, X + + ST $f10, 0(Y) + SXADDQ INCY, Y, Y + ST $f11, 0(Y) + SXADDQ INCY, Y, Y + + ST $f12, 0(Y) + SXADDQ INCY, Y, Y + ST $f13, 0(Y) + SXADDQ INCY, Y, Y + + ST $f14, 0(Y) + SXADDQ INCY, Y, Y + ST $f15, 0(Y) + SXADDQ INCY, Y, Y + + ST $f16, 
0(Y) + SXADDQ INCY, Y, Y + ST $f17, 0(Y) + SXADDQ INCY, Y, Y + + ST $f18, 0(Y) + SXADDQ INCY, Y, Y + ST $f19, 0(Y) + SXADDQ INCY, Y, Y + + ST $f20, 0(Y) + SXADDQ INCY, Y, Y + ST $f21, 0(Y) + SXADDQ INCY, Y, Y + + ST $f22, 0(Y) + SXADDQ INCY, Y, Y + ST $f23, 0(Y) + SXADDQ INCY, Y, Y + + ST $f24, 0(Y) + SXADDQ INCY, Y, Y + ST $f25, 0(Y) + SXADDQ INCY, Y, Y +#else + LD $f10, 0(X) + LD $f11, SIZE(X) + SXADDQ INCX, X, X + + LD $f12, 0(X) + LD $f13, SIZE(X) + SXADDQ INCX, X, X + + LD $f14, 0(X) + LD $f15, SIZE(X) + SXADDQ INCX, X, X + + LD $f16, 0(X) + LD $f17, SIZE(X) + SXADDQ INCX, X, X + + LD $f18, 0(X) + LD $f19, SIZE(X) + SXADDQ INCX, X, X + + LD $f20, 0(X) + LD $f21, SIZE(X) + SXADDQ INCX, X, X + + LD $f22, 0(X) + LD $f23, SIZE(X) + SXADDQ INCX, X, X + + LD $f24, 0(X) + LD $f25, SIZE(X) + SXADDQ INCX, X, X + + ST $f10, 0(Y) + ST $f11, SIZE(Y) + SXADDQ INCY, Y, Y + + ST $f12, 0(Y) + ST $f13, SIZE(Y) + SXADDQ INCY, Y, Y + + ST $f14, 0(Y) + ST $f15, SIZE(Y) + SXADDQ INCY, Y, Y + + ST $f16, 0(Y) + ST $f17, SIZE(Y) + SXADDQ INCY, Y, Y + + ST $f18, 0(Y) + ST $f19, SIZE(Y) + SXADDQ INCY, Y, Y + + ST $f20, 0(Y) + ST $f21, SIZE(Y) + SXADDQ INCY, Y, Y + + ST $f22, 0(Y) + ST $f23, SIZE(Y) + SXADDQ INCY, Y, Y + + ST $f24, 0(Y) + ST $f25, SIZE(Y) + SXADDQ INCY, Y, Y +#endif + subq $4, 1, $4 + bgt $4, $SubMainLoop + .align 4 + +$SubRemain: + ble $5, $SubEnd + .align 4 + + $SubRemainLoop: +#ifndef COMPLEX + LD $f10, 0(X) + SXADDQ INCX, X, X + ST $f10, 0(Y) + SXADDQ INCY, Y, Y +#else + LD $f10, 0(X) + LD $f11, SIZE(X) + SXADDQ INCX, X, X + ST $f10, 0(Y) + ST $f11, SIZE(Y) + SXADDQ INCY, Y, Y +#endif + subq $5, 1, $5 + bgt $5, $SubRemainLoop + .align 4 + +$SubEnd: + ret + EPILOGUE diff --git a/kernel/alpha/cscal.S b/kernel/alpha/cscal.S new file mode 100644 index 0000000000..bba3137a9b --- /dev/null +++ b/kernel/alpha/cscal.S @@ -0,0 +1,217 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + + .set noat + .set noreorder + +#define ASSEMBLER + +#include "common.h" +#include "version.h" + + .globl NAME + .ent NAME + +NAME: +#ifdef PROFILE + ldgp $gp, 0($27) + lda $28, _mcount + jsr $28, ($28), _mcount +#endif + +#ifndef C_INTERFACE + ldl $16, 0($16) # n + mov $18, $20 # Store Address + ldl $19, 0($19) # incx + nop + + LD $f1, 0($17) # alpha +#else + mov $18, $20 # Store Address + fmov $f17, $f1 # alpha +#endif + +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + sra $16, 1, $21 # 4-unrolling + ble $16, $End + + lda $23, -1($19) + ble $19, $End + + bgt $23, $INC_NOT_1 + .align 4 + + ble $21, $Sub + lda $21, -1($21) + LD $f10, 0*SIZE($18) + LD $f11, 1*SIZE($18) + + LD $f12, 2*SIZE($18) + LD $f13, 3*SIZE($18) + lda $18, 4*SIZE($18) + ble $21, $MainRemain + .align 4 + +$MainLoop: + MUL $f10, $f1, $f20 + LD $f10, 0*SIZE($18) + MUL $f11, $f1, $f21 + LD $f11, 1*SIZE($18) + + MUL $f12, $f1, $f22 + LD $f12, 2*SIZE($18) + MUL $f13, $f1, $f23 + LD $f13, 3*SIZE($18) + + lda $18, 4*SIZE($18) + lda $21, -1($21) + + ST $f20, 0*SIZE($20) + ST $f21, 1*SIZE($20) + ST $f22, 2*SIZE($20) + ST $f23, 3*SIZE($20) + lda $20, 4*SIZE($20) + + bgt $21, $MainLoop + .align 4 + +$MainRemain: + MUL $f10, $f1, $f20 + MUL $f11, $f1, $f21 + MUL $f12, $f1, $f22 + MUL $f13, $f1, $f23 + + ST $f20, 0*SIZE($20) + ST $f21, 1*SIZE($20) + ST $f22, 2*SIZE($20) + ST $f23, 3*SIZE($20) + lda $20, 4*SIZE($20) + .align 4 + +$Sub: + blbc $16, $End + LD $f10, 0*SIZE($18) + LD $f11, 1*SIZE($18) + MUL $f10, $f1, $f20 + MUL $f11, $f1, $f21 + ST $f20, 0*SIZE($20) + ST $f21, 1*SIZE($20) + .align 4 + +$End: + ret + .align 4 + +$INC_NOT_1: + addl $19, $19, $19 + ble $21, $INC_Sub + lda $21, -1($21) + + LD $f10, 0*SIZE($18) + LD $f11, 1*SIZE($18) + SXADDQ $19, $18, $18 + + LD $f12, 0*SIZE($18) + LD $f13, 1*SIZE($18) + SXADDQ $19, $18, $18 + ble $21, $INC_MainRemain + .align 4 + +$INC_MainLoop: + MUL $f10, $f1, $f20 + LD $f10, 0*SIZE($18) + MUL $f11, $f1, $f21 + LD $f11, 1*SIZE($18) + + SXADDQ $19, $18, $18 + + MUL $f12, $f1, $f22 + LD $f12, 0*SIZE($18) + MUL $f13, $f1, $f23 + LD $f13, 1*SIZE($18) + + SXADDQ $19, $18, $18 + + ST $f20, 0*SIZE($20) + lda $21, -1($21) + ST $f21, 1*SIZE($20) + SXADDQ $19, $20, $20 + + ST $f22, 0*SIZE($20) + ST $f23, 1*SIZE($20) + SXADDQ $19, $20, $20 + unop + bgt $21, $INC_MainLoop + .align 4 + +$INC_MainRemain: + MUL $f10, $f1, $f20 + MUL $f11, $f1, $f21 + MUL $f12, $f1, $f22 + MUL $f13, $f1, $f23 + + ST $f20, 0*SIZE($20) + ST $f21, 1*SIZE($20) + SXADDQ $19, $20, $20 + + ST $f22, 0*SIZE($20) + ST $f23, 1*SIZE($20) + SXADDQ $19, $20, $20 + .align 4 + +$INC_Sub: + blbc $16, $INC_End + + LD $f10, 0*SIZE($18) + LD $f11, 1*SIZE($18) + MUL $f10, $f1, $f20 + MUL $f11, $f1, $f21 + + ST $f20, 0*SIZE($20) + ST $f21, 1*SIZE($20) + .align 4 + +$INC_End: + ret + .end NAME + .ident VERSION diff --git a/kernel/alpha/dnrm2.S b/kernel/alpha/dnrm2.S new file mode 100644 index 0000000000..b8ccc75f6e --- /dev/null +++ b/kernel/alpha/dnrm2.S @@ -0,0 +1,431 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#include "version.h" + +#define PREFETCH_SIZE 80 + +#define N $16 +#define X $17 +#define INCX $18 +#define XX $19 + +#define I $0 + +#define a0 $f0 +#define a1 $f1 +#define a2 $f10 +#define a3 $f11 +#define t0 $f12 +#define t1 $f13 +#define t2 $f14 +#define t3 $f15 + +#define x0 $f16 +#define x1 $f17 +#define x2 $f18 +#define x3 $f19 +#define x4 $f20 +#define x5 $f21 +#define x6 $f22 +#define x7 $f23 + + PROLOGUE + +#if defined(EV4) || defined(EV5) + .frame $30,16,$26,0 + .mask 0x4000000,-16 + ldah $29, 0($27) !gpdisp!1 + lda $29, 0($29) !gpdisp!1 + + lda $sp, -16($sp) + ldq $27, sqrt($29) !literal!2 + stq $26, 0($sp) + + PROFCODE + .prologue 1 +#else + PROFCODE +#endif + + fclr a0 + SXADDQ INCX, 0, INCX + fclr a1 + ble N, $L999 + + fclr a2 + cmpeq INCX, SIZE, $0 + fclr a3 + beq $0, $L20 + + fclr t0 + sra N, 4, I + fclr t1 + ble I, $L15 + + fclr t2 + LD x0, 0 * SIZE(X) + fclr t3 + LD x1, 1 * SIZE(X) + + LD x2, 2 * SIZE(X) + LD x3, 3 * SIZE(X) + LD x4, 4 * SIZE(X) + LD x5, 5 * SIZE(X) + LD x6, 6 * SIZE(X) + LD x7, 7 * SIZE(X) + + lda I, -1(I) + ble I, $L12 + .align 4 + +$L11: + addt a0, t0, a0 + ldl $31, (PREFETCH_SIZE) * SIZE(X) + mult x0, x0, t0 + LD x0, 8 * SIZE(X) + + addt a1, t1, a1 + mov X, XX + mult x1, x1, t1 + LD x1, 9 * SIZE(X) + + addt a2, t2, a2 + unop + mult x2, x2, t2 + LD x2, 10 * SIZE(X) + + addt a3, t3, a3 + unop + mult x3, x3, t3 + LD x3, 11 * SIZE(X) + + addt a0, t0, a0 + unop + mult x4, x4, t0 + LD x4, 12 * SIZE(X) + + addt a1, t1, a1 + unop + mult x5, x5, t1 + LD x5, 13 * SIZE(X) + + addt a2, t2, a2 + unop + mult x6, x6, t2 + LD x6, 14 * SIZE(X) + + addt a3, t3, a3 + unop + mult x7, x7, t3 + LD x7, 15 * SIZE(X) + + addt a0, t0, a0 + unop + mult x0, x0, t0 + LD x0, 16 * SIZE(X) + + 
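The real-valued nrm2 kernel around this point uses the same addt/mult pairing as the complex one: each addt folds in the product started by the mult of the previous step, and the reduction at $L998 flushes the last pending products. A hedged C model of that software-pipelined accumulation (illustrative names, one accumulator instead of four; not from the GotoBLAS sources):

#include <math.h>
#include <stddef.h>

/* One-accumulator model of the pipelined sum of squares: a += t runs one
   step behind t = x*x, so the final pending product is added after the
   loop, just as the trailing addt at $L998 does in the assembly. */
static double dnrm2_ref(ptrdiff_t n, const double *x, ptrdiff_t incx)
{
    double a = 0.0, t = 0.0;
    for (ptrdiff_t i = 0; i < n; i++) {
        a += t;              /* addt a0, t0, a0  (previous product) */
        t  = (*x) * (*x);    /* mult x0, x0, t0  (current product)  */
        x += incx;
    }
    a += t;                  /* $L998: fold in the last product */
    return sqrt(a);
}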
addt a1, t1, a1 + lda X, 16 * SIZE(X) + mult x1, x1, t1 + LD x1, 17 * SIZE(XX) + + addt a2, t2, a2 + unop + mult x2, x2, t2 + LD x2, 18 * SIZE(XX) + + addt a3, t3, a3 + unop + mult x3, x3, t3 + LD x3, 19 * SIZE(XX) + + addt a0, t0, a0 + unop + mult x4, x4, t0 + LD x4, 20 * SIZE(XX) + + addt a1, t1, a1 + lda I, -1(I) + mult x5, x5, t1 + LD x5, 21 * SIZE(XX) + + addt a2, t2, a2 + unop + mult x6, x6, t2 + LD x6, 22 * SIZE(XX) + + addt a3, t3, a3 + mult x7, x7, t3 + LD x7, 23 * SIZE(XX) + bgt I, $L11 + .align 4 + +$L12: + addt a0, t0, a0 + mov X, XX + mult x0, x0, t0 + LD x0, 8 * SIZE(X) + + addt a1, t1, a1 + unop + mult x1, x1, t1 + LD x1, 9 * SIZE(X) + + addt a2, t2, a2 + unop + mult x2, x2, t2 + LD x2, 10 * SIZE(X) + + addt a3, t3, a3 + unop + mult x3, x3, t3 + LD x3, 11 * SIZE(X) + + addt a0, t0, a0 + unop + mult x4, x4, t0 + LD x4, 12 * SIZE(XX) + + addt a1, t1, a1 + unop + mult x5, x5, t1 + LD x5, 13 * SIZE(XX) + + addt a2, t2, a2 + unop + mult x6, x6, t2 + LD x6, 14 * SIZE(XX) + + addt a3, t3, a3 + lda X, 16 * SIZE(X) + mult x7, x7, t3 + LD x7, 15 * SIZE(XX) + + addt a0, t0, a0 + mult x0, x0, t0 + addt a1, t1, a1 + mult x1, x1, t1 + + addt a2, t2, a2 + mult x2, x2, t2 + addt a3, t3, a3 + mult x3, x3, t3 + + addt a0, t0, a0 + mult x4, x4, t0 + addt a1, t1, a1 + mult x5, x5, t1 + + addt a2, t2, a2 + mult x6, x6, t2 + addt a3, t3, a3 + mult x7, x7, t3 + + addt a1, t1, a1 + addt a2, t2, a2 + addt a3, t3, a3 + .align 4 + +$L15: + and N, 15, I + ble I, $L998 + .align 4 + +$L16: + LD x0, 0 * SIZE(X) + lda X, 1 * SIZE(X) + + addt a0, t0, a0 + mult x0, x0, t0 + + lda I, -1(I) + bgt I, $L16 + bsr $31, $L998 + .align 4 + +$L20: + fclr t0 + sra N, 3, I + fclr t1 + ble I, $L25 + + fclr t2 + fclr t3 + + LD x0, 0 * SIZE(X) + addq X, INCX, X + LD x1, 0 * SIZE(X) + addq X, INCX, X + LD x2, 0 * SIZE(X) + addq X, INCX, X + LD x3, 0 * SIZE(X) + addq X, INCX, X + + LD x4, 0 * SIZE(X) + addq X, INCX, X + LD x5, 0 * SIZE(X) + addq X, INCX, X + LD x6, 0 * SIZE(X) + addq X, INCX, X + + lda I, -1(I) + ble I, $L22 + .align 4 + +$L21: + addt a0, t0, a0 + LD x7, 0 * SIZE(X) + mult x0, x0, t0 + addq X, INCX, X + + addt a1, t1, a1 + LD x0, 0 * SIZE(X) + mult x1, x1, t1 + addq X, INCX, X + + addt a2, t2, a2 + LD x1, 0 * SIZE(X) + mult x2, x2, t2 + addq X, INCX, X + + addt a3, t3, a3 + LD x2, 0 * SIZE(X) + mult x3, x3, t3 + addq X, INCX, X + + addt a0, t0, a0 + LD x3, 0 * SIZE(X) + mult x4, x4, t0 + addq X, INCX, X + + addt a1, t1, a1 + LD x4, 0 * SIZE(X) + mult x5, x5, t1 + addq X, INCX, X + + addt a2, t2, a2 + LD x5, 0 * SIZE(X) + mult x6, x6, t2 + addq X, INCX, X + + addt a3, t3, a3 + LD x6, 0 * SIZE(X) + mult x7, x7, t3 + addq X, INCX, X + + lda I, -1(I) + bgt I, $L21 + .align 4 + +$L22: + addt a0, t0, a0 + LD x7, 0 * SIZE(X) + mult x0, x0, t0 + addq X, INCX, X + + addt a1, t1, a1 + unop + mult x1, x1, t1 + unop + + addt a2, t2, a2 + mult x2, x2, t2 + addt a3, t3, a3 + mult x3, x3, t3 + + addt a0, t0, a0 + mult x4, x4, t0 + addt a1, t1, a1 + mult x5, x5, t1 + + addt a2, t2, a2 + mult x6, x6, t2 + addt a3, t3, a3 + mult x7, x7, t3 + + addt a1, t1, a1 + addt a2, t2, a2 + addt a3, t3, a3 + .align 4 + +$L25: + and N, 7, I + ble I, $L998 + .align 4 + +$L26: + LD x0, 0 * SIZE(X) + addq X, INCX, X + + addt a0, t0, a0 + mult x0, x0, t0 + + lda I, -1(I) + bgt I, $L26 + .align 4 + + +$L998: + addt a0, t0, a0 + + addt a0, a1, a0 + addt a2, a3, a2 + +#if defined(EV4) || defined(EV5) + addt a0, a2, $f16 + jsr $26, ($27), sqrt !lituse_jsr!2 + + ldah $29, 0($26) !gpdisp!3 + lda $29, 0($29) !gpdisp!3 +#else + addt a0, a2, a0 + 
sqrtt a0, a0 +#endif + .align 4 + +$L999: +#if defined(EV4) || defined(EV5) + ldq $26, 0($sp) + lda $sp, 16($sp) +#endif + ret + EPILOGUE diff --git a/kernel/alpha/dot.S b/kernel/alpha/dot.S new file mode 100644 index 0000000000..330196c78f --- /dev/null +++ b/kernel/alpha/dot.S @@ -0,0 +1,530 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define PREFETCHSIZE 88 + +#define N $16 +#define X $17 +#define INCX $18 +#define Y $19 +#define INCY $20 + +#define I $5 + +#define s0 $f0 +#define s1 $f30 +#define s2 $f1 +#define s3 $f2 + +#define a0 $f10 +#define a1 $f11 +#define a2 $f12 +#define a3 $f13 +#define a4 $f14 +#define a5 $f15 +#define a6 $f16 +#define a7 $f17 + +#define b0 $f18 +#define b1 $f19 +#define b2 $f20 +#define b3 $f21 +#define b4 $f22 +#define b5 $f23 +#define b6 $f24 +#define b7 $f25 + +#define t0 $f26 +#define t1 $f27 +#define t2 $f28 +#define t3 $f29 + + + PROLOGUE + PROFCODE + .frame $sp, 16, $26, 0 + + lda $sp, -16($sp) + fclr s0 + stt $f2, 0($sp) + fclr s1 + + fclr s2 + nop + fclr s3 + ble N, $L999 + + fclr t0 + cmpeq INCX, 1, $21 + fclr t1 + cmpeq INCY, 1, $22 + fclr t2 + and $21, $22, $22 + fclr t3 + beq $22, $L20 + +#ifndef DOUBLE + srl N, 4, I + ble I, $L15 + + LD a0, 0 * SIZE(X) + LD a1, 1 * SIZE(X) + LD b0, 0 * SIZE(Y) + LD b1, 1 * SIZE(Y) + + LD a2, 2 * SIZE(X) + LD a3, 3 * SIZE(X) + LD b2, 2 * SIZE(Y) + LD b3, 3 * SIZE(Y) + + LD a4, 4 * SIZE(X) + LD a5, 5 * SIZE(X) + LD b4, 4 * SIZE(Y) + LD b5, 5 * SIZE(Y) + + LD a6, 6 * SIZE(X) + LD a7, 7 * SIZE(X) + addq X, 16 * SIZE, X + subq I, 1, I + + addq Y, 16 * SIZE, Y + ble I, $L13 + .align 4 + +$L12: + ldl $31, PREFETCHSIZE * 2 * SIZE(X) + subq I, 1, I + ldl $31, PREFETCHSIZE * 2 * SIZE(Y) + addq X, 16 * SIZE, X + + ADD s0, t0, s0 + LD b6, -10 * SIZE(Y) + MUL a0, b0, t0 + LD b7, -9 * SIZE(Y) + + ADD s1, t1, s1 + LD a0, -24 * SIZE(X) + MUL a1, b1, t1 + LD a1, -23 * SIZE(X) + + ADD s2, t2, s2 + LD b0, -8 * SIZE(Y) + MUL a2, b2, t2 + LD b1, -7 * SIZE(Y) + + ADD s3, t3, s3 + LD a2, -22 * SIZE(X) + MUL a3, b3, t3 + LD a3, -21 * SIZE(X) + + ADD s0, t0, s0 + LD b2, -6 * SIZE(Y) + MUL a4, b4, t0 + LD b3, -5 * SIZE(Y) + + ADD s1, t1, s1 + LD a4, -20 * SIZE(X) + MUL a5, b5, t1 + LD a5, -19 * SIZE(X) + + ADD s2, t2, s2 + LD b4, -4 * SIZE(Y) + MUL a6, b6, t2 + LD b5, -3 * SIZE(Y) + + ADD s3, t3, s3 + LD a6, -18 * SIZE(X) + MUL a7, b7, t3 + LD a7, -17 * SIZE(X) + + ADD s0, t0, s0 + LD b6, -2 * SIZE(Y) + MUL a0, b0, t0 + LD b7, -1 * SIZE(Y) + + ADD s1, t1, s1 + LD a0, -16 * SIZE(X) + MUL a1, b1, t1 + LD a1, -15 * SIZE(X) + + ADD s2, t2, s2 + LD b0, 0 * SIZE(Y) + MUL a2, b2, t2 + LD b1, 1 * SIZE(Y) + + ADD s3, t3, s3 + LD a2, -14 * SIZE(X) + MUL a3, b3, t3 + LD a3, -13 * SIZE(X) + + ADD s0, t0, s0 + LD b2, 2 * SIZE(Y) + MUL a4, b4, t0 + LD b3, 3 * SIZE(Y) + + ADD s1, t1, s1 + LD a4, -12 * SIZE(X) + MUL a5, b5, t1 + LD a5, -11 * SIZE(X) + + ADD s2, t2, s2 + LD b4, 4 * SIZE(Y) + MUL a6, b6, t2 + LD b5, 5 * SIZE(Y) + + ADD s3, t3, s3 + LD a6, -10 * SIZE(X) + MUL a7, b7, t3 + LD a7, -9 * SIZE(X) + + addq Y, 16 * SIZE, Y + bgt I, $L12 + nop + fnop + .align 4 + +$L13: + ADD s0, t0, s0 + LD b6,-10 * SIZE(Y) + MUL a0, b0, t0 + LD b7, -9 * SIZE(Y) + + ADD s1, t1, s1 + LD a0, -8 * SIZE(X) + MUL a1, b1, t1 + LD a1, -7 * SIZE(X) + + ADD s2, t2, s2 + LD b0, -8 * SIZE(Y) + MUL a2, b2, t2 + LD b1, -7 * SIZE(Y) + + ADD s3, t3, s3 + LD a2, -6 * SIZE(X) + MUL a3, b3, t3 + LD a3, -5 * SIZE(X) + + ADD s0, t0, s0 + LD b2, -6 * SIZE(Y) + MUL a4, b4, t0 + LD b3, -5 * SIZE(Y) + + ADD s1, t1, s1 + LD a4, -4 * SIZE(X) + MUL a5, b5, t1 + LD a5, -3 * SIZE(X) + + ADD s2, t2, s2 + LD b4, -4 * SIZE(Y) + MUL a6, b6, t2 + LD b5, -3 * SIZE(Y) + + ADD s3, t3, s3 + LD a6, -2 * SIZE(X) + MUL a7, b7, t3 + LD a7, -1 * SIZE(X) + + ADD s0, t0, s0 + LD b6, -2 * SIZE(Y) + 
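For reference while reading this kernel, a hedged C sketch (illustrative names, not from the GotoBLAS sources): both the unit-stride and strided paths compute a plain dot product spread over four independent partial sums s0..s3 so loads and multiplies can overlap, and leftover elements fall through a scalar tail that accumulates into s2.

#include <stddef.h>

/* Reference: s = sum_i x[i]*y[i], kept as four partial sums like the assembly. */
static double dot_ref(ptrdiff_t n,
                      const double *x, ptrdiff_t incx,
                      const double *y, ptrdiff_t incy)
{
    double s0 = 0.0, s1 = 0.0, s2 = 0.0, s3 = 0.0;
    ptrdiff_t i = 0;
    for (; i + 4 <= n; i += 4) {               /* unrolled body ($L12/$L22) */
        s0 += x[0 * incx] * y[0 * incy];
        s1 += x[1 * incx] * y[1 * incy];
        s2 += x[2 * incx] * y[2 * incy];
        s3 += x[3 * incx] * y[3 * incy];
        x += 4 * incx;
        y += 4 * incy;
    }
    for (; i < n; i++) {                       /* tail ($L16/$L26) adds into s2 */
        s2 += (*x) * (*y);
        x += incx;
        y += incy;
    }
    return (s0 + s1) + (s2 + s3);              /* final reduction ($L999) */
}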
MUL a0, b0, t0 + LD b7, -1 * SIZE(Y) + ADD s1, t1, s1 + MUL a1, b1, t1 + + ADD s2, t2, s2 + MUL a2, b2, t2 + ADD s3, t3, s3 + MUL a3, b3, t3 + + ADD s0, t0, s0 + MUL a4, b4, t0 + ADD s1, t1, s1 + MUL a5, b5, t1 + ADD s2, t2, s2 + MUL a6, b6, t2 + ADD s3, t3, s3 + MUL a7, b7, t3 + .align 4 + +$L15: + ADD s0, t0, s0 + and N, 15, I + ADD s1, t1, s1 + ble I, $L18 + .align 4 + +#else + + srl N, 3, I + ble I, $L15 + + LD a0, 0 * SIZE(X) + LD a1, 1 * SIZE(X) + LD b0, 0 * SIZE(Y) + LD b1, 1 * SIZE(Y) + + LD a2, 2 * SIZE(X) + LD a3, 3 * SIZE(X) + LD b2, 2 * SIZE(Y) + LD b3, 3 * SIZE(Y) + + LD a4, 4 * SIZE(X) + LD a5, 5 * SIZE(X) + LD b4, 4 * SIZE(Y) + LD b5, 5 * SIZE(Y) + + LD a6, 6 * SIZE(X) + LD a7, 7 * SIZE(X) + addq X, 8 * SIZE, X + subq I, 1, I + + addq Y, 8 * SIZE, Y + ble I, $L13 + .align 4 + +$L12: + ldl $31, PREFETCHSIZE * SIZE(X) + subq I, 1, I + ldl $31, PREFETCHSIZE * SIZE(Y) + addq X, 8 * SIZE, X + + ADD s0, t0, s0 + LD b6, -2 * SIZE(Y) + MUL a0, b0, t0 + LD b7, -1 * SIZE(Y) + + ADD s1, t1, s1 + LD a0, -8 * SIZE(X) + MUL a1, b1, t1 + LD a1, -7 * SIZE(X) + + ADD s2, t2, s2 + LD b0, 0 * SIZE(Y) + MUL a2, b2, t2 + LD b1, 1 * SIZE(Y) + + ADD s3, t3, s3 + LD a2, -6 * SIZE(X) + MUL a3, b3, t3 + LD a3, -5 * SIZE(X) + + ADD s0, t0, s0 + LD b2, 2 * SIZE(Y) + MUL a4, b4, t0 + LD b3, 3 * SIZE(Y) + + ADD s1, t1, s1 + LD a4, -4 * SIZE(X) + MUL a5, b5, t1 + LD a5, -3 * SIZE(X) + + ADD s2, t2, s2 + LD b4, 4 * SIZE(Y) + MUL a6, b6, t2 + LD b5, 5 * SIZE(Y) + + ADD s3, t3, s3 + LD a6, -2 * SIZE(X) + MUL a7, b7, t3 + LD a7, -1 * SIZE(X) + + addq Y, 8 * SIZE, Y + bgt I, $L12 + nop + fnop + .align 4 + +$L13: + ADD s0, t0, s0 + LD b6, -2 * SIZE(Y) + MUL a0, b0, t0 + LD b7, -1 * SIZE(Y) + ADD s1, t1, s1 + MUL a1, b1, t1 + + ADD s2, t2, s2 + MUL a2, b2, t2 + ADD s3, t3, s3 + MUL a3, b3, t3 + + ADD s0, t0, s0 + MUL a4, b4, t0 + ADD s1, t1, s1 + MUL a5, b5, t1 + ADD s2, t2, s2 + MUL a6, b6, t2 + ADD s3, t3, s3 + MUL a7, b7, t3 + .align 4 + +$L15: + ADD s0, t0, s0 + and N, 7, I + ADD s1, t1, s1 + ble I, $L18 + .align 4 + +#endif + +$L16: + LD a0, 0 * SIZE(X) + addq X, SIZE, X + LD b0, 0 * SIZE(Y) + addq Y, SIZE, Y + + ADD s2, t2, s2 + MUL a0, b0, t2 + subq I, 1, I + bgt I, $L16 + .align 4 + +$L18: + ADD s2, t2, s2 + ADD s3, t3, s3 + br $L999 + .align 4 + +$L20: + srl N, 2, I + ble I, $L25 + + LD a0, 0 * SIZE(X) + SXADDQ INCX, X, X + LD b0, 0 * SIZE(Y) + SXADDQ INCY, Y, Y + LD a1, 0 * SIZE(X) + SXADDQ INCX, X, X + LD b1, 0 * SIZE(Y) + SXADDQ INCY, Y, Y + + LD a2, 0 * SIZE(X) + SXADDQ INCX, X, X + LD b2, 0 * SIZE(Y) + SXADDQ INCY, Y, Y + LD a3, 0 * SIZE(X) + SXADDQ INCX, X, X + LD b3, 0 * SIZE(Y) + subq I, 1, I + + SXADDQ INCY, Y, Y + ble I, $L23 + .align 4 + +$L22: + ADD s0, t0, s0 + MUL a0, b0, t0 + ADD s1, t1, s1 + MUL a1, b1, t1 + ADD s2, t2, s2 + MUL a2, b2, t2 + ADD s3, t3, s3 + MUL a3, b3, t3 + + LD a0, 0 * SIZE(X) + SXADDQ INCX, X, X + LD b0, 0 * SIZE(Y) + SXADDQ INCY, Y, Y + LD a1, 0 * SIZE(X) + SXADDQ INCX, X, X + LD b1, 0 * SIZE(Y) + SXADDQ INCY, Y, Y + + LD a2, 0 * SIZE(X) + SXADDQ INCX, X, X + LD b2, 0 * SIZE(Y) + SXADDQ INCY, Y, Y + LD a3, 0 * SIZE(X) + SXADDQ INCX, X, X + LD b3, 0 * SIZE(Y) + SXADDQ INCY, Y, Y + + subq I, 1, I + bgt I, $L22 + nop + fnop + .align 4 + +$L23: + ADD s0, t0, s0 + MUL a0, b0, t0 + ADD s1, t1, s1 + MUL a1, b1, t1 + ADD s2, t2, s2 + MUL a2, b2, t2 + ADD s3, t3, s3 + MUL a3, b3, t3 + .align 4 + +$L25: + ADD s0, t0, s0 + and N, 3, I + ADD s1, t1, s1 + ble I, $L28 + .align 4 + +$L26: + LD a0, 0 * SIZE(X) + SXADDQ INCX, X, X + LD b0, 0 * SIZE(Y) + SXADDQ INCY, Y, Y + + ADD s2, 
t2, s2 + MUL a0, b0, t2 + subq I, 1, I + bgt I, $L26 + .align 4 + +$L28: + ADD s2, t2, s2 + ADD s3, t3, s3 + .align 4 + +$L999: + ADD s2, s3, s2 + ldt $f2, 0($sp) + ADD s0, s1, s0 + lda $sp, 16($sp) + + ADD s0, s2, s0 + ret + + EPILOGUE diff --git a/kernel/alpha/gemm_beta.S b/kernel/alpha/gemm_beta.S new file mode 100644 index 0000000000..44b2fada16 --- /dev/null +++ b/kernel/alpha/gemm_beta.S @@ -0,0 +1,179 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + + .set noat + .set noreorder +.text + .align 5 + .globl CNAME + .ent CNAME +CNAME: + .frame $sp, 0, $26, 0 + +#ifdef PROFILE + ldgp $gp, 0($27) + lda $28, _mcount + jsr $28, ($28), _mcount +#endif + + ldq $18, 16($sp) + ble $16, $End + ldl $19, 24($sp) + ble $17, $End +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + fbeq $f19, $BETA_EQ_ZERO # if (beta == ZERO) + .align 4 + +$BETA_NE_ZERO: + sra $16, 3, $2 # i = (m >> 3) + mov $18, $1 # c_offset = c + lda $17, -1($17) # j -- + ble $2,$L52 + .align 4 + +$L51: + lds $f31, 64($1) + lda $2, -1($2) + + LD $f14, 0*SIZE($1) + LD $f15, 1*SIZE($1) + LD $f16, 2*SIZE($1) + LD $f17, 3*SIZE($1) + LD $f18, 4*SIZE($1) + LD $f11, 5*SIZE($1) + LD $f21, 6*SIZE($1) + LD $f22, 7*SIZE($1) + + MUL $f19, $f14, $f23 + MUL $f19, $f15, $f24 + MUL $f19, $f16, $f25 + MUL $f19, $f17, $f26 + MUL $f19, $f18, $f27 + MUL $f19, $f11, $f28 + MUL $f19, $f21, $f29 + MUL $f19, $f22, $f30 + + ST $f23, 0*SIZE($1) + ST $f24, 1*SIZE($1) + ST $f25, 2*SIZE($1) + ST $f26, 3*SIZE($1) + ST $f27, 4*SIZE($1) + ST $f28, 5*SIZE($1) + ST $f29, 6*SIZE($1) + ST $f30, 7*SIZE($1) + + lda $1,8*SIZE($1) + bgt $2,$L51 + .align 4 + +$L52: + and $16, 7, $2 + ble $2,$L54 + .align 4 + +$L53: + LD $f12, 0($1) + lda $2, -1($2) + MUL $f19, $f12, $f23 + ST $f23, 0($1) + lda $1, SIZE($1) + bgt $2,$L53 + .align 4 + +$L54: + SXADDQ $19, $18, $18 # c += ldc + bgt $17,$BETA_NE_ZERO + clr $0 + ret + .align 4 + +$BETA_EQ_ZERO: + sra $16, 3, $2 # i = (m >> 3) + lda $4, 8*SIZE($18) + mov $18, $1 # c_offset = c + lda $17, -1($17) # j -- + ble $2,$L42 + .align 4 + +$L41: + ST $f31, 0*SIZE($1) + ST $f31, 1*SIZE($1) + ST $f31, 2*SIZE($1) + ST $f31, 3*SIZE($1) + ST $f31, 4*SIZE($1) + ST $f31, 5*SIZE($1) + ST $f31, 6*SIZE($1) + ST $f31, 7*SIZE($1) + lda $2, -1($2) + + lda $4, 8*SIZE($4) + lda $1, 8*SIZE($1) + bgt $2,$L41 + .align 4 + +$L42: + and $16, 7, $2 + ble $2,$L44 + .align 4 + +$L43: + lda $2, -1($2) + ST $f31, 0($1) + lda $1, SIZE($1) + bgt $2, $L43 + .align 4 + +$L44: + SXADDQ $19, $18, $18 # c += ldc + bgt $17,$BETA_EQ_ZERO + clr $0 + .align 4 + +$End: + ret + .ident VERSION + .end CNAME diff --git a/kernel/alpha/gemm_kernel_4x4.S b/kernel/alpha/gemm_kernel_4x4.S new file mode 100644 index 0000000000..4e9253488b --- /dev/null +++ b/kernel/alpha/gemm_kernel_4x4.S @@ -0,0 +1,2852 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#if !defined(EV4) && !defined(EV5) && !defined(EV6) +#error "Architecture is not specified." +#endif + +#ifdef EV6 +#define PREFETCHSIZE 56 +#define UNOP unop +#endif + +#ifdef EV5 +#define PREFETCHSIZE 56 +#define UNOP +#endif + +#ifdef EV4 +#define UNOP +#endif + +#define STACKSIZE 80 + +#define M $16 +#define N $17 +#define K $18 +#define A $20 +#define B $21 +#define C $22 +#define LDC $23 + +#define C1 $19 +#define C2 $24 +#define C3 $25 +#define C4 $27 + +#define AO $at +#define BO $5 +#define I $6 +#define J $7 +#define L $8 + +#define a1 $f16 +#define a2 $f17 +#define a3 $f18 +#define a4 $f19 + +#define b1 $f20 +#define b2 $f21 +#define b3 $f22 +#define b4 $f23 + +#define t1 $f24 +#define t2 $f25 +#define t3 $f26 +#define t4 $f27 + +#define a5 $f28 +#define a6 $f30 +#define b5 $f29 + +#define alpha $f30 + +#define c01 $f0 +#define c02 $f1 +#define c03 $f2 +#define c04 $f3 + +#define c05 $f4 +#define c06 $f5 +#define c07 $f6 +#define c08 $f7 + +#define c09 $f8 +#define c10 $f9 +#define c11 $f10 +#define c12 $f11 + +#define c13 $f12 +#define c14 $f13 +#define c15 $f14 +#define c16 $f15 + +#define TMP1 $0 +#define TMP2 $1 +#define KK $2 +#define BB $3 +#define OFFSET $4 + +#define ALPHA 64($sp) + + PROLOGUE + PROFCODE + .frame $sp, STACKSIZE, $26, 0 + + lda $sp, -STACKSIZE($sp) + + ldq C, 0 + STACKSIZE($sp) + ldq LDC, 8 + STACKSIZE($sp) +#ifdef TRMMKERNEL + ldq OFFSET, 16 + STACKSIZE($sp) +#endif + + SXADDQ LDC, 0, LDC + + stt $f2, 0($sp) + stt $f3, 8($sp) + stt $f4, 16($sp) + stt $f5, 24($sp) + stt $f6, 32($sp) + stt $f7, 40($sp) + stt $f8, 48($sp) + stt $f9, 56($sp) + stt $f19, ALPHA + + cmple M, 0, $0 + cmple N, 0, $1 + cmple K, 0, $2 + + or $0, $1, $0 + or $0, $2, $0 + bne $0, $L999 + +#if defined(TRMMKERNEL) && !defined(LEFT) + subq $31, OFFSET, KK +#endif + + sra N, 2, J + ble J, $L40 + .align 4 + +$L01: + mov C, C1 + addq C, LDC, C2 + mov A, AO + s4addq K, 0, BB + +#if defined(TRMMKERNEL) && defined(LEFT) + mov OFFSET, KK +#endif + + addq C2, LDC, C3 + s4addq LDC, C, C + + SXADDQ BB, B, BB + fclr t1 + addq C3, LDC, C4 + fclr t2 + + sra M, 2, I + fclr t3 + fclr t4 + ble I, $L20 + .align 4 + +$L11: +#if defined(EV5) || defined(EV6) + ldl $31, 0 * SIZE(BB) + ldl $31, 8 * SIZE(BB) + unop + lda BB, 16 * SIZE(BB) +#endif + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef TRMMKERNEL +#ifdef LEFT + addq KK, 4, TMP1 +#else + addq KK, 4, TMP1 +#endif +#endif + + LD a1, 0 * SIZE(AO) + fclr c11 + 
LD a2, 1 * SIZE(AO) + fclr c12 + + LD a3, 2 * SIZE(AO) + fclr c16 + LD a4, 3 * SIZE(AO) + fclr c15 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + + LD b3, 2 * SIZE(B) + fclr c06 + LD b4, 3 * SIZE(B) + fclr c05 + + lds $f31, 4 * SIZE(C1) + fclr c03 +#ifndef TRMMKERNEL + lda L, -2(K) +#else + lda L, -2(TMP1) +#endif + fclr c04 + + lds $f31, 7 * SIZE(C2) + fclr c08 + lda BO, 4 * SIZE(B) + fclr c13 + + lds $f31, 4 * SIZE(C3) + fclr c09 + lda AO, 4 * SIZE(AO) + fclr c10 + +#else + sll KK, BASE_SHIFT + 2, TMP1 + addq AO, TMP1, AO + addq B, TMP1, BO + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c11 + LD a2, 1 * SIZE(AO) + fclr c12 + + LD a3, 2 * SIZE(AO) + fclr c16 + LD a4, 3 * SIZE(AO) + fclr c15 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + + LD b3, 2 * SIZE(BO) + fclr c06 + LD b4, 3 * SIZE(BO) + fclr c05 + + lds $f31, 4 * SIZE(C1) + fclr c03 + lda L, -2(TMP1) + fclr c04 + + lds $f31, 7 * SIZE(C2) + fclr c08 + lda BO, 4 * SIZE(BO) + fclr c13 + + lds $f31, 4 * SIZE(C3) + fclr c09 + lda AO, 4 * SIZE(AO) + fclr c10 +#endif + + lds $f31, 7 * SIZE(C4) + fclr c14 + fclr c07 + ble L, $L15 + .align 5 + +$L12: +/* 1 */ + ADD c11, t1, c11 +#ifndef EV4 + ldq $31, PREFETCHSIZE * SIZE(AO) +#else + unop +#endif + MUL b1, a1, t1 +#ifndef EV4 + ldl $31, PREFETCHSIZE * SIZE(BO) +#else + unop +#endif + + ADD c12, t2, c12 + unop + MUL b1, a2, t2 + unop + + ADD c16, t3, c16 + unop + MUL b2, a2, t3 + LD a5, 0 * SIZE(AO) + + ADD c15, t4, c15 + unop + MUL b2, a1, t4 + LD b5, 0 * SIZE(BO) + +/* 2 */ + ADD c01, t1, c01 + UNOP + MUL b1, a3, t1 + UNOP + + ADD c02, t2, c02 + UNOP + MUL b1, a4, t2 + UNOP + + ADD c06, t3, c06 + unop + MUL b2, a4, t3 + unop + + ADD c05, t4, c05 + unop + MUL b4, a1, t4 + unop + +/* 3 */ + ADD c03, t1, c03 + unop + MUL b3, a1, t1 + unop + + ADD c04, t2, c04 + unop + MUL b3, a2, t2 + unop + + ADD c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD c13, t4, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + +/* 4 */ + ADD c09, t1, c09 + unop + MUL b3, a3, t1 + LD a6, 2 * SIZE(AO) + + ADD c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, 3 * SIZE(AO) + + ADD c07, t4, c07 + unop + MUL b4, a3, t4 + LD b4, 3 * SIZE(BO) + +/* 5 */ + ADD c11, t1, c11 + unop + MUL b5, a5, t1 + LD a1, 4 * SIZE(AO) + + ADD c12, t2, c12 + lda L, -2(L) + MUL b5, a2, t2 + LD b1, 4 * SIZE(BO) + + ADD c16, t3, c16 + unop + MUL b2, a2, t3 + unop + + ADD c15, t4, c15 + unop + MUL b2, a5, t4 + unop + +/* 6 */ + ADD c01, t1, c01 + unop + MUL b5, a6, t1 + unop + + ADD c02, t2, c02 + unop + MUL b5, a4, t2 + unop + + ADD c06, t3, c06 + unop + MUL b2, a4, t3 + unop + + ADD c05, t4, c05 + unop + MUL b4, a5, t4 + unop + +/* 7 */ + ADD c03, t1, c03 + lda AO, 8 * SIZE(AO) + MUL b3, a5, t1 + unop + + ADD c04, t2, c04 + lda BO, 8 * SIZE(BO) + MUL b3, a2, t2 + unop + + ADD c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, -3 * SIZE(AO) + + ADD c13, t4, c13 + unop + MUL b2, a6, t4 + LD b2, -3 * SIZE(BO) + +/* 8 */ + ADD c09, t1, c09 + unop + MUL b3, a6, t1 + LD a3, -2 * SIZE(AO) + + ADD c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, -2 * SIZE(BO) + + ADD c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD c07, t4, c07 + MUL b4, a6, t4 + LD b4, -1 * SIZE(BO) + bgt L, $L12 + .align 4 + +$L15: + ADD c11, t1, c11 + ldt alpha, ALPHA + MUL b1, a1, t1 +#ifndef TRMMKERNEL + blbs K, $L18 +#else + blbs TMP1, $L18 +#endif + .align 4 + + ADD c12, t2, c12 + MUL b1, a2, t2 + ADD c16, t3, c16 + MUL b2, a2, t3 + + ADD 
c15, t4, c15 + MUL b2, a1, t4 + ADD c01, t1, c01 + MUL b1, a3, t1 + + ADD c02, t2, c02 + unop + MUL b1, a4, t2 + LD b1, 0 * SIZE(BO) + + ADD c06, t3, c06 + MUL b2, a4, t3 + ADD c05, t4, c05 + MUL b4, a1, t4 + + ADD c03, t1, c03 + unop + MUL b3, a1, t1 + LD a1, 0 * SIZE(AO) + + ADD c04, t2, c04 + unop + MUL b3, a2, t2 + unop + + ADD c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD c13, t4, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + + ADD c09, t1, c09 + unop + MUL b3, a3, t1 + lda AO, 4 * SIZE(AO) + + ADD c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD c07, t4, c07 + unop + MUL b4, a3, t4 + LD a3, -2 * SIZE(AO) + + ADD c11, t1, c11 + LD b4, 3 * SIZE(BO) + MUL b1, a1, t1 + lda BO, 4 * SIZE(BO) + .align 4 + +$L18: + ADD c12, t2, c12 + unop + MUL b1, a2, t2 +#ifndef TRMMKERNEL + LD a5, 0 * SIZE(C1) +#else + unop +#endif + + ADD c16, t3, c16 + unop + MUL b2, a2, t3 + unop + + ADD c15, t4, c15 + unop + MUL b2, a1, t4 +#ifndef TRMMKERNEL + LD b5, 1 * SIZE(C1) +#else + unop +#endif + + ADD c01, t1, c01 + unop + MUL b1, a3, t1 + unop + + ADD c02, t2, c02 + unop + MUL b1, a4, t2 +#ifndef TRMMKERNEL + LD b1, 0 * SIZE(C2) +#else + unop +#endif + + ADD c06, t3, c06 + unop + MUL b2, a4, t3 + unop + + ADD c05, t4, c05 + unop + MUL b4, a1, t4 + unop + + ADD c03, t1, c03 + unop + MUL b3, a1, t1 + unop + + ADD c04, t2, c04 + unop + MUL b3, a2, t2 +#ifndef TRMMKERNEL + LD a1, 0 * SIZE(C3) +#else + unop +#endif + + ADD c08, t3, c08 + unop + MUL b4, a2, t3 +#ifndef TRMMKERNEL + LD a2, 2 * SIZE(C1) +#else + unop +#endif + + ADD c13, t4, c13 + unop + MUL b2, a3, t4 +#ifndef TRMMKERNEL + LD b2, 3 * SIZE(C1) +#else + unop +#endif + + ADD c09, t1, c09 + lda I, -1(I) + MUL b3, a3, t1 + unop + + ADD c10, t2, c10 + unop + MUL b3, a4, t2 +#ifndef TRMMKERNEL + LD b3, 0 * SIZE(C4) +#else + unop +#endif + + ADD c14, t3, c14 + unop + MUL b4, a4, t3 +#ifndef TRMMKERNEL + LD a4, 1 * SIZE(C2) +#else + unop +#endif + + ADD c07, t4, c07 + unop + MUL b4, a3, t4 +#ifndef TRMMKERNEL + LD a3, 2 * SIZE(C2) +#else + unop +#endif + + ADD c11, t1, c11 + unop + MUL alpha, c01, c01 +#ifndef TRMMKERNEL + LD b4, 3 * SIZE(C2) +#else + unop +#endif + + ADD c12, t2, c12 + unop + MUL alpha, c02, c02 +#ifndef TRMMKERNEL + LD t1, 1 * SIZE(C3) +#else + unop +#endif + + ADD c16, t3, c16 + unop + MUL alpha, c03, c03 +#ifndef TRMMKERNEL + LD t2, 2 * SIZE(C3) +#else + unop +#endif + + ADD c15, t4, c15 + unop + MUL alpha, c04, c04 +#ifndef TRMMKERNEL + LD t3, 3 * SIZE(C3) +#else + unop +#endif + + MUL alpha, c05, c05 + unop +#ifndef TRMMKERNEL + ADD c01, a5, c01 + LD t4, 1 * SIZE(C4) +#else + unop + unop +#endif + + MUL alpha, c06, c06 +#ifndef TRMMKERNEL + unop + ADD c02, b5, c02 + LD a5, 2 * SIZE(C4) +#endif + + MUL alpha, c07, c07 +#ifndef TRMMKERNEL + unop + ADD c03, a2, c03 + LD b5, 3 * SIZE(C4) +#endif + + MUL alpha, c08, c08 +#ifndef TRMMKERNEL + unop + ADD c04, b2, c04 + unop +#endif + + MUL alpha, c09, c09 + ST c01, 0 * SIZE(C1) +#ifndef TRMMKERNEL + ADD c05, b1, c05 + unop +#endif + + MUL alpha, c10, c10 + ST c02, 1 * SIZE(C1) +#ifndef TRMMKERNEL + ADD c06, a4, c06 + unop +#endif + + MUL alpha, c11, c11 + ST c03, 2 * SIZE(C1) +#ifndef TRMMKERNEL + ADD c07, a3, c07 + unop +#endif + + MUL alpha, c12, c12 + ST c04, 3 * SIZE(C1) +#ifndef TRMMKERNEL + ADD c08, b4, c08 +#else + unop +#endif + lda C1, 4 * SIZE(C1) + + MUL alpha, c13, c13 + ST c05, 0 * SIZE(C2) +#ifndef TRMMKERNEL + ADD c09, a1, c09 + unop +#endif + + MUL alpha, c14, c14 
+ ST c06, 1 * SIZE(C2) +#ifndef TRMMKERNEL + ADD c10, t1, c10 + unop +#endif + + MUL alpha, c15, c15 + ST c07, 2 * SIZE(C2) +#ifndef TRMMKERNEL + ADD c11, t2, c11 + unop +#endif + + MUL alpha, c16, c16 + ST c08, 3 * SIZE(C2) +#ifndef TRMMKERNEL + ADD c12, t3, c12 +#else + unop +#endif + lda C2, 4 * SIZE(C2) + +#ifndef TRMMKERNEL + ADD c13, b3, c13 +#else + unop +#endif + ST c09, 0 * SIZE(C3) + fclr t1 + lda C4, 4 * SIZE(C4) + +#ifndef TRMMKERNEL + ADD c14, t4, c14 +#else + unop +#endif + ST c10, 1 * SIZE(C3) + fclr t2 + unop + +#ifndef TRMMKERNEL + ADD c15, a5, c15 +#else + unop +#endif + ST c11, 2 * SIZE(C3) + fclr t3 + unop + +#ifndef TRMMKERNEL + ADD c16, b5, c16 +#else + unop +#endif + ST c12, 3 * SIZE(C3) + fclr t4 + lda C3, 4 * SIZE(C3) + + ST c13, -4 * SIZE(C4) + ST c14, -3 * SIZE(C4) + ST c15, -2 * SIZE(C4) + ST c16, -1 * SIZE(C4) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + subq K, KK, TMP1 +#ifdef LEFT + subq TMP1, 4, TMP1 +#else + subq TMP1, 4, TMP1 +#endif + sll TMP1, BASE_SHIFT + 2, TMP1 + addq AO, TMP1, AO + addq BO, TMP1, BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq KK, 4, KK +#endif + + bgt I, $L11 + .align 4 + +$L20: + and M, 2, I + ble I, $L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef TRMMKERNEL +#ifdef LEFT + addq KK, 2, TMP1 +#else + addq KK, 4, TMP1 +#endif +#endif + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c10 + LD a4, 3 * SIZE(AO) + fclr c14 + + LD b1, 0 * SIZE(B) +#ifndef TRMMKERNEL + lda L, -2(K) +#else + lda L, -2(TMP1) +#endif + LD b2, 1 * SIZE(B) + lda AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(B) + fclr c01 + LD b4, 3 * SIZE(B) + fclr c05 + + lda BO, 4 * SIZE(B) + fclr c02 + fclr c06 + ble L, $L25 + +#else + sll KK, BASE_SHIFT + 1, TMP1 + addq AO, TMP1, AO + sll KK, BASE_SHIFT + 2, TMP2 + addq B, TMP2, BO + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c10 + LD a4, 3 * SIZE(AO) + fclr c14 + + LD b1, 0 * SIZE(BO) + lda L, -2(TMP1) + LD b2, 1 * SIZE(BO) + lda AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + fclr c01 + LD b4, 3 * SIZE(BO) + fclr c05 + + lda BO, 4 * SIZE(BO) + fclr c02 + fclr c06 + ble L, $L25 +#endif + .align 4 + +$L22: + ADD c09, t1, c09 + unop + MUL a1, b1, t1 + unop + + ADD c10, t2, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD c13, t3, c13 + unop + MUL a1, b2, t3 + lda BO, 8 * SIZE(BO) + + ADD c14, t4, c14 + unop + MUL a2, b2, t4 + LD b2, -7 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b3, t1 + unop + + ADD c02, t2, c02 + unop + MUL a2, b3, t2 + LD b3, -6 * SIZE(BO) + + ADD c05, t3, c05 + unop + MUL a1, b4, t3 + LD a1, 2 * SIZE(AO) + + ADD c06, t4, c06 + MUL a2, b4, t4 + LD b5, -5 * SIZE(BO) + + ADD c09, t1, c09 + unop + MUL a3, b1, t1 + LD a2, 3 * SIZE(AO) + + ADD c10, t2, c10 + unop + MUL a4, b1, t2 + LD b1, -4 * SIZE(BO) + + ADD c13, t3, c13 + unop + MUL a3, b2, t3 + lda AO, 4 * SIZE(AO) + + ADD c14, t4, c14 + MUL a4, b2, t4 + LD b2, -3 * SIZE(BO) + + ADD c01, t1, c01 + lda L, -2(L) + MUL a3, b3, t1 + LD b4, -1 * SIZE(BO) + + ADD c02, t2, c02 + unop + MUL a4, b3, t2 + LD b3, -2 * SIZE(BO) + + ADD c05, t3, c05 + unop + MUL a3, b5, t3 + LD a3, 0 * SIZE(AO) + + ADD c06, t4, c06 + MUL a4, b5, t4 + LD a4, 1 * SIZE(AO) + bgt L, $L22 + .align 4 + +$L25: + ADD c09, t1, c09 + ldt alpha, ALPHA 
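To keep the register bookkeeping above readable, here is a hedged C model (illustrative names and packing convention, not from the GotoBLAS sources) of what one invocation of this micro-kernel computes in the plain GEMM case: a 4x4 block of C gains alpha times the product of a packed 4-by-k panel of A and a packed k-by-4 panel of B; c01..c16 are the sixteen accumulators, C1..C4 the four target columns, and the beta scaling of C is handled separately by gemm_beta. TRMM offsets and edge cases are omitted.

#include <stddef.h>

/* acc[][] stands in for the sixteen c01..c16 accumulators: row i of the A
   panel times column j of the B panel, summed over k, then merged into C as
   C[i + j*ldc] += alpha * acc[j][i]. */
static void gemm_kernel_4x4_ref(ptrdiff_t k, double alpha,
                                const double *a,  /* packed: 4 values per k step */
                                const double *b,  /* packed: 4 values per k step */
                                double *c, ptrdiff_t ldc)
{
    double acc[4][4] = {{0.0}};
    for (ptrdiff_t l = 0; l < k; l++)
        for (int j = 0; j < 4; j++)
            for (int i = 0; i < 4; i++)
                acc[j][i] += a[4 * l + i] * b[4 * l + j];
    for (int j = 0; j < 4; j++)
        for (int i = 0; i < 4; i++)
            c[i + j * ldc] += alpha * acc[j][i];
}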
+ MUL a1, b1, t1 +#ifndef TRMMKERNEL + blbs K, $L28 +#else + blbs TMP1, $L28 +#endif + + ADD c10, t2, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD c13, t3, c13 + unop + MUL a1, b2, t3 + unop + + ADD c14, t4, c14 + unop + MUL a2, b2, t4 + LD b2, 1 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b3, t1 + lda AO, 2 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b3, t2 + LD b3, 2 * SIZE(BO) + + ADD c05, t3, c05 + unop + MUL a1, b4, t3 + LD a1, -2 * SIZE(AO) + + ADD c06, t4, c06 + unop + MUL a2, b4, t4 + LD a2, -1 * SIZE(AO) + + ADD c09, t1, c09 + LD b4, 3 * SIZE(BO) + MUL a1, b1, t1 + lda BO, 4 * SIZE(BO) + .align 4 + +$L28: + ADD c10, t2, c10 + unop + MUL a2, b1, t2 +#ifndef TRMMKERNEL + LD a3, 0 * SIZE(C1) +#else + unop +#endif + + ADD c13, t3, c13 + unop + MUL a1, b2, t3 +#ifndef TRMMKERNEL + LD a4, 1 * SIZE(C1) +#else + unop +#endif + + ADD c14, t4, c14 + unop + MUL a2, b2, t4 +#ifndef TRMMKERNEL + LD a5, 0 * SIZE(C2) +#else + unop +#endif + + ADD c01, t1, c01 + unop + MUL a1, b3, t1 +#ifndef TRMMKERNEL + LD b5, 1 * SIZE(C2) +#else + unop +#endif + + ADD c02, t2, c02 + unop + MUL a2, b3, t2 +#ifndef TRMMKERNEL + LD b1, 0 * SIZE(C3) +#else + unop +#endif + + ADD c05, t3, c05 + unop + MUL a1, b4, t3 +#ifndef TRMMKERNEL + LD b2, 1 * SIZE(C3) +#else + unop +#endif + + ADD c06, t4, c06 + unop + MUL a2, b4, t4 +#ifndef TRMMKERNEL + LD b3, 0 * SIZE(C4) +#else + unop +#endif + + ADD c09, t1, c09 + unop + MUL alpha, c01, c01 +#ifndef TRMMKERNEL + LD b4, 1 * SIZE(C4) +#else + unop +#endif + + ADD c10, t2, c10 + unop + MUL alpha, c02, c02 + unop + + ADD c13, t3, c13 + MUL alpha, c05, c05 + ADD c14, t4, c14 + MUL alpha, c06, c06 + + MUL alpha, c09, c09 +#ifndef TRMMKERNEL + ADD c01, a3, c01 +#endif + MUL alpha, c10, c10 +#ifndef TRMMKERNEL + ADD c02, a4, c02 +#endif + + MUL alpha, c13, c13 +#ifndef TRMMKERNEL + ADD c05, a5, c05 +#endif + MUL alpha, c14, c14 +#ifndef TRMMKERNEL + ADD c06, b5, c06 +#endif + +#ifndef TRMMKERNEL + ADD c09, b1, c09 + unop +#endif + ST c01, 0 * SIZE(C1) + fclr t1 + +#ifndef TRMMKERNEL + ADD c10, b2, c10 + unop +#endif + ST c02, 1 * SIZE(C1) + fclr t2 + +#ifndef TRMMKERNEL + ADD c13, b3, c13 + unop +#endif + ST c05, 0 * SIZE(C2) + fclr t3 + +#ifndef TRMMKERNEL + ADD c14, b4, c14 + unop +#endif + ST c06, 1 * SIZE(C2) + fclr t4 + + ST c09, 0 * SIZE(C3) + lda C1, 2 * SIZE(C1) + ST c10, 1 * SIZE(C3) + lda C2, 2 * SIZE(C2) + + ST c13, 0 * SIZE(C4) + lda C3, 2 * SIZE(C3) + ST c14, 1 * SIZE(C4) + lda C4, 2 * SIZE(C4) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + subq K, KK, TMP1 +#ifdef LEFT + subq TMP1, 2, TMP1 +#else + subq TMP1, 4, TMP1 +#endif + sll TMP1, BASE_SHIFT + 1, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addq BO, TMP2, BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq KK, 2, KK +#endif + .align 4 + +$L30: + and M, 1, I + ble I, $L39 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef TRMMKERNEL +#ifdef LEFT + addq KK, 1, TMP1 +#else + addq KK, 4, TMP1 +#endif +#endif + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(B) +#ifndef TRMMKERNEL + lda L, -2(K) +#else + lda L, -2(TMP1) +#endif + LD b2, 1 * SIZE(B) + lda AO, 1 * SIZE(AO) + + LD b3, 2 * SIZE(B) + fclr c09 + LD b4, 3 * SIZE(B) + fclr c13 + + lda BO, 4 * SIZE(B) + ble L, $L35 +#else + sll KK, BASE_SHIFT + 0, TMP1 + addq AO, TMP1, 
AO + sll KK, BASE_SHIFT + 2, TMP2 + addq B, TMP2, BO + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(BO) + lda L, -2(TMP1) + LD b2, 1 * SIZE(BO) + lda AO, 1 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + fclr c09 + LD b4, 3 * SIZE(BO) + fclr c13 + + lda BO, 4 * SIZE(BO) + ble L, $L35 +#endif + .align 4 + +$L32: + ADD c01, t1, c01 + lda L, -2(L) + MUL a1, b1, t1 + LD b1, 0 * SIZE(BO) + + ADD c05, t2, c05 + lda AO, 2 * SIZE(AO) + MUL a1, b2, t2 + LD b2, 1 * SIZE(BO) + + ADD c09, t3, c09 + LD b5, 3 * SIZE(BO) + MUL a1, b3, t3 + LD b3, 2 * SIZE(BO) + + ADD c13, t4, c13 + MUL a1, b4, t4 + LD a1, -1 * SIZE(AO) + + ADD c01, t1, c01 + MUL a2, b1, t1 + LD b1, 4 * SIZE(BO) + lda BO, 8 * SIZE(BO) + + ADD c05, t2, c05 + MUL a2, b2, t2 + LD b2, -3 * SIZE(BO) + + ADD c09, t3, c09 + LD b4, -1 * SIZE(BO) + MUL a2, b3, t3 + LD b3, -2 * SIZE(BO) + + ADD c13, t4, c13 + MUL a2, b5, t4 + LD a2, 0 * SIZE(AO) + bgt L, $L32 + .align 4 + +$L35: + ADD c01, t1, c01 + ldt alpha, ALPHA + MUL a1, b1, t1 +#ifndef TRMMKERNEL + blbs K, $L38 +#else + blbs TMP1, $L38 +#endif + .align 4 + + ADD c05, t2, c05 + LD b1, 0 * SIZE(BO) + MUL a1, b2, t2 + LD b2, 1 * SIZE(BO) + + ADD c09, t3, c09 + MUL a1, b3, t3 + LD b3, 2 * SIZE(BO) + + ADD c13, t4, c13 + MUL a1, b4, t4 + LD a1, 0 * SIZE(AO) + lda AO, 1 * SIZE(AO) + + ADD c01, t1, c01 + LD b4, 3 * SIZE(BO) + MUL a1, b1, t1 + lda BO, 4 * SIZE(BO) + .align 4 + +$L38: + ADD c05, t2, c05 + unop + MUL a1, b2, t2 +#ifndef TRMMKERNEL + LD a5, 0 * SIZE(C1) +#else + unop +#endif + + ADD c09, t3, c09 + unop + MUL a1, b3, t3 +#ifndef TRMMKERNEL + LD b5, 0 * SIZE(C2) +#else + unop +#endif + + ADD c13, t4, c13 + unop + MUL a1, b4, t4 +#ifndef TRMMKERNEL + LD a2, 0 * SIZE(C3) +#else + unop +#endif + + ADD c01, t1, c01 + unop + MUL alpha, c01, c01 +#ifndef TRMMKERNEL + LD a3, 0 * SIZE(C4) +#else + unop +#endif + + ADD c05, t2, c05 + unop + MUL alpha, c05, c05 + unop + + ADD c09, t3, c09 + MUL alpha, c09, c09 + ADD c13, t4, c13 + MUL alpha, c13, c13 + +#ifndef TRMMKERNEL + ADD c01, a5, c01 + ADD c05, b5, c05 + ADD c09, a2, c09 + ADD c13, a3, c13 +#endif + + ST c01, 0 * SIZE(C1) + ST c05, 0 * SIZE(C2) + ST c09, 0 * SIZE(C3) + ST c13, 0 * SIZE(C4) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + subq K, KK, TMP1 +#ifdef LEFT + subq TMP1, 1, TMP1 +#else + subq TMP1, 4, TMP1 +#endif + sll TMP1, BASE_SHIFT + 0, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addq BO, TMP2, BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq KK, 1, KK +#endif + .align 4 + +$L39: + mov BO, B + lda J, -1(J) +#if defined(TRMMKERNEL) && !defined(LEFT) + addq KK, 4, KK +#else + unop +#endif + bgt J, $L01 + .align 4 + +$L40: + and N, 2, J + ble J, $L80 + + mov C, C1 + addq C, LDC, C2 + mov A, AO + fclr t1 + addq C2, LDC, C + fclr t2 + +#if defined(TRMMKERNEL) && defined(LEFT) + mov OFFSET, KK +#endif + + sra M, 2, I + fclr t3 + fclr t4 + ble I, $L60 + .align 4 + +$L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef TRMMKERNEL +#ifdef LEFT + addq KK, 4, TMP1 +#else + addq KK, 2, TMP1 +#endif +#endif + + LD a1, 0 * SIZE(AO) + fclr c03 + LD a2, 1 * SIZE(AO) + fclr c07 + LD a3, 2 * SIZE(AO) + fclr c04 + LD a4, 3 * SIZE(AO) + fclr c08 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c05 + LD b3, 2 * SIZE(B) + fclr c02 + LD b4, 3 * SIZE(B) + fclr 
c06 + +#ifndef TRMMKERNEL + lda L, -2(K) +#else + lda L, -2(TMP1) +#endif + lda BO, 2 * SIZE(B) + lda AO, 4 * SIZE(AO) + ble L, $L55 +#else + sll KK, BASE_SHIFT + 2, TMP1 + addq AO, TMP1, AO + sll KK, BASE_SHIFT + 1, TMP2 + addq B, TMP2, BO + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c03 + LD a2, 1 * SIZE(AO) + fclr c07 + LD a3, 2 * SIZE(AO) + fclr c04 + LD a4, 3 * SIZE(AO) + fclr c08 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c05 + LD b3, 2 * SIZE(BO) + fclr c02 + LD b4, 3 * SIZE(BO) + fclr c06 + + lda L, -2(TMP1) + lda BO, 2 * SIZE(BO) + lda AO, 4 * SIZE(AO) + ble L, $L55 +#endif + .align 4 + +$L52: + ADD c05, t1, c05 + unop + MUL a1, b1, t1 + unop + + ADD c06, t2, c06 + lda L, -2(L) + MUL a2, b1, t2 + unop + + ADD c07, t3, c07 + unop + MUL a3, b1, t3 + unop + + ADD c08, t4, c08 + unop + MUL a4, b1, t4 + LD b1, 2 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD c02, t2, c02 + lda BO, 4 * SIZE(BO) + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, c04 + unop + MUL a4, b2, t4 + LD a5, 3 * SIZE(AO) + + ADD c05, t1, c05 + unop + MUL a1, b3, t1 + LD b2, -1 * SIZE(BO) + + ADD c06, t2, c06 + unop + MUL a2, b3, t2 + unop + + ADD c07, t3, c07 + unop + MUL a3, b3, t3 + lda AO, 8 * SIZE(AO) + + ADD c08, t4, c08 + unop + MUL a5, b3, t4 + LD b3, 0 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b4, t1 + LD a1, -4 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b4, t2 + LD a2, -3 * SIZE(AO) + + ADD c03, t3, c03 + LD a4, -1 * SIZE(AO) + MUL a3, b4, t3 + LD a3, -2 * SIZE(AO) + + ADD c04, t4, c04 + MUL a5, b4, t4 + LD b4, 1 * SIZE(BO) + bgt L, $L52 + .align 4 + +$L55: + ADD c05, t1, c05 + ldt alpha, ALPHA + MUL a1, b1, t1 +#ifndef TRMMKERNEL + blbs K, $L58 +#else + blbs TMP1, $L58 +#endif + .align 4 + + ADD c06, t2, c06 + MUL a2, b1, t2 + ADD c07, t3, c07 + MUL a3, b1, t3 + + ADD c08, t4, c08 + unop + MUL a4, b1, t4 + LD b1, 0 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b2, t4 + LD a4, 3 * SIZE(AO) + lda AO, 4 * SIZE(AO) + + ADD c05, t1, c05 + LD b2, 1 * SIZE(BO) + MUL a1, b1, t1 + lda BO, 2 * SIZE(BO) + .align 4 + +$L58: + ADD c06, t2, c06 + unop + MUL a2, b1, t2 +#ifndef TRMMKERNEL + LD c09, 0 * SIZE(C1) +#else + unop +#endif + + ADD c07, t3, c07 + unop + MUL a3, b1, t3 +#ifndef TRMMKERNEL + LD c10, 1 * SIZE(C1) +#else + unop +#endif + + ADD c08, t4, c08 + unop + MUL a4, b1, t4 +#ifndef TRMMKERNEL + LD c11, 2 * SIZE(C1) +#else + unop +#endif + + ADD c01, t1, c01 + unop + MUL a1, b2, t1 +#ifndef TRMMKERNEL + LD c12, 3 * SIZE(C1) +#else + unop +#endif + + ADD c02, t2, c02 + unop + MUL a2, b2, t2 +#ifndef TRMMKERNEL + LD c13, 0 * SIZE(C2) + unop +#endif + + ADD c03, t3, c03 + unop + MUL a3, b2, t3 +#ifndef TRMMKERNEL + LD c14, 1 * SIZE(C2) +#else + unop +#endif + + ADD c04, t4, c04 + unop + MUL a4, b2, t4 +#ifndef TRMMKERNEL + LD c15, 2 * SIZE(C2) +#else + unop +#endif + + ADD c05, t1, c05 + unop + MUL alpha, c01, c01 +#ifndef TRMMKERNEL + LD c16, 3 * SIZE(C2) +#else + unop +#endif + + ADD c06, t2, c06 + lda I, -1(I) + MUL alpha, c02, c02 + unop + + ADD c07, t3, c07 + MUL alpha, c03, c03 + ADD c08, t4, c08 + MUL alpha, c04, c04 + + MUL alpha, c05, c05 +#ifndef TRMMKERNEL + ADD c01, c09, c01 +#endif + MUL alpha, c06, c06 +#ifndef TRMMKERNEL + ADD c02, c10, c02 +#endif + + MUL 
alpha, c07, c07 +#ifndef TRMMKERNEL + ADD c03, c11, c03 +#endif + MUL alpha, c08, c08 +#ifndef TRMMKERNEL + ADD c04, c12, c04 +#endif + +#ifndef TRMMKERNEL + ADD c05, c13, c05 +#endif + ST c01, 0 * SIZE(C1) +#ifndef TRMMKERNEL + ADD c06, c14, c06 +#endif + ST c02, 1 * SIZE(C1) + +#ifndef TRMMKERNEL + ADD c07, c15, c07 +#endif + ST c03, 2 * SIZE(C1) +#ifndef TRMMKERNEL + ADD c08, c16, c08 +#endif + ST c04, 3 * SIZE(C1) + + ST c05, 0 * SIZE(C2) + fclr t1 + ST c06, 1 * SIZE(C2) + fclr t2 + ST c07, 2 * SIZE(C2) + fclr t3 + ST c08, 3 * SIZE(C2) + fclr t4 + + lda C1, 4 * SIZE(C1) + lda C2, 4 * SIZE(C2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + subq K, KK, TMP1 +#ifdef LEFT + subq TMP1, 4, TMP1 +#else + subq TMP1, 2, TMP1 +#endif + sll TMP1, BASE_SHIFT + 2, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addq BO, TMP2, BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq KK, 4, KK +#endif + bgt I, $L51 + .align 4 + +$L60: + and M, 2, I + ble I, $L70 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef TRMMKERNEL +#ifdef LEFT + addq KK, 2, TMP1 +#else + addq KK, 2, TMP1 +#endif +#endif + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(B) +#ifndef TRMMKERNEL + lda L, -2(K) +#else + lda L, -2(TMP1) +#endif + LD b2, 1 * SIZE(B) + lda AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + lda BO, 2 * SIZE(B) + ble L, $L65 +#else + sll KK, BASE_SHIFT + 1, TMP1 + addq AO, TMP1, AO + sll KK, BASE_SHIFT + 1, TMP2 + addq B, TMP2, BO + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(BO) + lda L, -2(TMP1) + LD b2, 1 * SIZE(BO) + lda AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + lda BO, 2 * SIZE(BO) + ble L, $L65 +#endif + .align 4 + +$L62: + ADD c01, t1, c01 + unop + MUL a1, b1, t1 + unop + + ADD c02, t2, c02 + lda AO, 4 * SIZE(AO) + MUL a2, b1, t2 + LD b1, 2 * SIZE(BO) + + ADD c05, t3, c05 + lda L, -2(L) + MUL a1, b2, t3 + LD a1, -2 * SIZE(AO) + + ADD c06, t4, c06 + unop + MUL a2, b2, t4 + LD a2, -1 * SIZE(AO) + + ADD c01, t1, c01 + LD b2, 3 * SIZE(BO) + MUL a3, b3, t1 + lda BO, 4 * SIZE(BO) + + ADD c02, t2, c02 + unop + MUL a4, b3, t2 + LD b3, 0 * SIZE(BO) + + ADD c05, t3, c05 + unop + MUL a3, b4, t3 + LD a3, 0 * SIZE(AO) + + ADD c06, t4, c06 + MUL a4, b4, t4 + LD b4, 1 * SIZE(BO) + unop + + LD a4, 1 * SIZE(AO) + unop + unop + bgt L, $L62 + .align 4 + +$L65: + ADD c01, t1, c01 + ldt alpha, ALPHA + MUL a1, b1, t1 +#ifndef TRMMKERNEL + blbs K, $L68 +#else + blbs TMP1, $L68 +#endif + .align 4 + + ADD c02, t2, c02 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD c05, t3, c05 + lda BO, 2 * SIZE(BO) + MUL a1, b2, t3 + LD a1, 0 * SIZE(AO) + + ADD c06, t4, c06 + unop + MUL a2, b2, t4 + LD a2, 1 * SIZE(AO) + + ADD c01, t1, c01 + LD b2, -1 * SIZE(BO) + MUL a1, b1, t1 + lda AO, 2 * SIZE(AO) + .align 4 + +$L68: + ADD c02, t2, c02 + unop + MUL a2, b1, t2 +#ifndef TRMMKERNEL + LD c09, 0 * SIZE(C1) +#else + unop +#endif + + ADD c05, t3, c05 + unop + MUL a1, b2, t3 +#ifndef TRMMKERNEL + LD c10, 1 * SIZE(C1) +#else + unop +#endif + + ADD c06, t4, c06 + unop + MUL a2, b2, t4 +#ifndef TRMMKERNEL + LD c11, 0 * SIZE(C2) +#else + unop +#endif + + 
ADD c01, t1, c01 + unop + MUL alpha, c01, c01 +#ifndef TRMMKERNEL + LD c12, 1 * SIZE(C2) +#else + unop +#endif + + ADD c02, t2, c02 + lda C1, 2 * SIZE(C1) + MUL alpha, c02, c02 + lda C2, 2 * SIZE(C2) + + ADD c05, t3, c05 + MUL alpha, c05, c05 + ADD c06, t4, c06 + MUL alpha, c06, c06 + +#ifndef TRMMKERNEL + ADD c01, c09, c01 + ADD c02, c10, c02 + ADD c05, c11, c05 + ADD c06, c12, c06 +#endif + + ST c01, -2 * SIZE(C1) + fclr t1 + ST c02, -1 * SIZE(C1) + fclr t2 + ST c05, -2 * SIZE(C2) + fclr t3 + ST c06, -1 * SIZE(C2) + fclr t4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + subq K, KK, TMP1 +#ifdef LEFT + subq TMP1, 2, TMP1 +#else + subq TMP1, 2, TMP1 +#endif + sll TMP1, BASE_SHIFT + 1, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addq BO, TMP2, BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq KK, 2, KK +#endif + .align 4 + +$L70: + and M, 1, I + ble I, $L79 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef TRMMKERNEL +#ifdef LEFT + addq KK, 1, TMP1 +#else + addq KK, 2, TMP1 +#endif +#endif + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(B) + fclr c02 + LD b2, 1 * SIZE(B) + fclr c06 + +#ifndef TRMMKERNEL + lda L, -2(K) +#else + lda L, -2(TMP1) +#endif + + LD b3, 2 * SIZE(B) + lda AO, 1 * SIZE(AO) + LD b4, 3 * SIZE(B) + lda BO, 2 * SIZE(B) + ble L, $L75 +#else + sll KK, BASE_SHIFT + 0, TMP1 + addq AO, TMP1, AO + sll KK, BASE_SHIFT + 1, TMP2 + addq B, TMP2, BO + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(BO) + fclr c02 + LD b2, 1 * SIZE(BO) + fclr c06 + +#ifndef TRMMKERNEL + lda L, -2(K) +#else + lda L, -2(TMP1) +#endif + + LD b3, 2 * SIZE(BO) + lda AO, 1 * SIZE(AO) + LD b4, 3 * SIZE(BO) + lda BO, 2 * SIZE(BO) + ble L, $L75 +#endif + .align 4 + +$L72: + ADD c01, t1, c01 + lda L, -2(L) + MUL a1, b1, t1 + LD b1, 2 * SIZE(BO) + + ADD c05, t2, c05 + MUL a1, b2, t2 + LD a1, 1 * SIZE(AO) + LD b2, 3 * SIZE(BO) + + ADD c02, t3, c02 + lda AO, 2 * SIZE(AO) + MUL a2, b3, t3 + LD b3, 4 * SIZE(BO) + + ADD c06, t4, c06 + MUL a2, b4, t4 + LD a2, 0 * SIZE(AO) + LD b4, 5 * SIZE(BO) + + lda BO, 4 * SIZE(BO) + unop + unop + bgt L, $L72 + .align 4 + +$L75: + ADD c01, t1, c01 + ldt alpha, ALPHA + MUL a1, b1, t1 +#ifndef TRMMKERNEL + blbs K, $L78 +#else + blbs TMP1, $L78 +#endif + .align 4 + + ADD c05, t2, c05 + MUL a1, b2, t2 + LD a1, 0 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + ADD c01, t1, c01 + LD b2, 1 * SIZE(BO) + lda AO, 1 * SIZE(AO) + MUL a1, b1, t1 + lda BO, 2 * SIZE(BO) + .align 4 + +$L78: + ADD c05, t2, c05 + MUL a1, b2, t2 +#ifndef TRMMKERNEL + LD a5, 0 * SIZE(C1) +#else + unop +#endif + + ADD c02, t3, c02 + ADD c06, t4, c06 +#ifndef TRMMKERNEL + LD b5, 0 * SIZE(C2) +#else + unop +#endif + + ADD c01, c02, c01 + ADD c05, c06, c05 + + ADD c01, t1, c01 + ADD c05, t2, c05 + + MUL alpha, c01, c01 + MUL alpha, c05, c05 + +#ifndef TRMMKERNEL + ADD c01, a5, c01 + ADD c05, b5, c05 +#endif + + ST c01, 0 * SIZE(C1) + ST c05, 0 * SIZE(C2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + subq K, KK, TMP1 +#ifdef LEFT + subq TMP1, 1, TMP1 +#else + subq TMP1, 2, TMP1 +#endif + sll TMP1, BASE_SHIFT + 0, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addq BO, TMP2, BO +#endif + +#if 
defined(TRMMKERNEL) && defined(LEFT) + addq KK, 1, KK +#endif + .align 4 + +$L79: + mov BO, B +#if defined(TRMMKERNEL) && !defined(LEFT) + addq KK, 2, KK +#else + unop +#endif + unop + unop + .align 4 + +$L80: + and N, 1, J + ble J, $L999 + + mov C, C1 + mov A, AO + +#if defined(TRMMKERNEL) && defined(LEFT) + mov OFFSET, KK +#endif + + sra M, 2, I + ble I, $L100 + .align 4 + +$L91: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef TRMMKERNEL +#ifdef LEFT + addq KK, 4, TMP1 +#else + addq KK, 1, TMP1 +#endif +#endif + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c04 + +#ifndef TRMMKERNEL + sra K, 2, L +#else + sra TMP1, 2, L +#endif + mov B, BO + unop + ble L, $L95 +#else + sll KK, BASE_SHIFT + 2, TMP1 + addq AO, TMP1, AO + sll KK, BASE_SHIFT + 0, TMP2 + addq B, TMP2, BO + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c04 + +#ifndef TRMMKERNEL + sra K, 2, L +#else + sra TMP1, 2, L +#endif + unop + ble L, $L95 +#endif + .align 5 + +$L92: + ADD c01, t1, c01 + unop + MUL a1, b1, t1 + LD a1, 4 * SIZE(AO) + + ADD c02, t2, c02 + lda L, -1(L) + MUL a2, b1, t2 + LD a2, 5 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b1, t3 + LD a3, 6 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b1, t4 + LD a4, 7 * SIZE(AO) + LD b1, 4 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 8 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b2, t2 + LD a2, 9 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 10 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b2, t4 + LD a4, 11 * SIZE(AO) + LD b2, 5 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b3, t1 + LD a1, 12 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b3, t2 + LD a2, 13 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b3, t3 + LD a3, 14 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b3, t4 + LD a5, 15 * SIZE(AO) + LD b3, 6 * SIZE(BO) + + ADD c01, t1, c01 + MUL a1, b4, t1 + LD a1, 16 * SIZE(AO) + lda AO, 16 * SIZE(AO) + + ADD c02, t2, c02 + lda BO, 4 * SIZE(BO) + MUL a2, b4, t2 + LD a2, 1 * SIZE(AO) + + ADD c03, t3, c03 + LD a4, 3 * SIZE(AO) + MUL a3, b4, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, c04 + MUL a5, b4, t4 + LD b4, 3 * SIZE(BO) + bgt L, $L92 + .align 4 + +$L95: +#ifndef TRMMKERNEL + and K, 3, L +#else + and TMP1, 3, L +#endif + ldt alpha, ALPHA + unop + ble L, $L98 + .align 4 + +$L96: + ADD c01, t1, c01 + lda L, -1(L) + MUL a1, b1, t1 + LD a1, 4 * SIZE(AO) + + ADD c02, t2, c02 + lda BO, 1 * SIZE(BO) + MUL a2, b1, t2 + LD a2, 5 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b1, t3 + LD a3, 6 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b1, t4 + LD a4, 7 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + lda AO, 4 * SIZE(AO) + bgt L, $L96 + .align 4 + +$L98: +#ifndef TRMMKERNEL + ADD c01, t1, c01 + LD c05, 0 * SIZE(C1) + ADD c02, t2, c02 + LD c06, 1 * SIZE(C1) + ADD c03, t3, c03 + LD c07, 2 * SIZE(C1) + ADD c04, t4, c04 + LD c08, 3 * SIZE(C1) +#else + ADD c01, t1, c01 + ADD c02, t2, c02 + ADD c03, t3, c03 + ADD c04, t4, c04 +#endif + + MUL alpha, c01, c01 + MUL alpha, c02, c02 + MUL alpha, c03, c03 + MUL 
alpha, c04, c04 + +#ifndef TRMMKERNEL + ADD c01, c05, c01 + ADD c02, c06, c02 + ADD c03, c07, c03 + ADD c04, c08, c04 +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + + lda C1, 4 * SIZE(C1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + subq K, KK, TMP1 +#ifdef LEFT + subq TMP1, 4, TMP1 +#else + subq TMP1, 1, TMP1 +#endif + sll TMP1, BASE_SHIFT + 2, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addq BO, TMP2, BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq KK, 4, KK +#endif + + lda I, -1(I) + bgt I, $L91 + .align 4 + +$L100: + and M, 2, I + unop + unop + ble I, $L110 + .align 4 + +$L101: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef TRMMKERNEL +#ifdef LEFT + addq KK, 2, TMP1 +#else + addq KK, 1, TMP1 +#endif +#endif + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c04 + +#ifndef TRMMKERNEL + sra K, 2, L +#else + sra TMP1, 2, L +#endif + mov B, BO + unop + ble L, $L105 +#else + sll KK, BASE_SHIFT + 1, TMP1 + addq AO, TMP1, AO + sll KK, BASE_SHIFT + 0, TMP2 + addq B, TMP2, BO + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c04 + +#ifndef TRMMKERNEL + sra K, 2, L +#else + sra TMP1, 2, L +#endif + unop + ble L, $L105 +#endif + .align 5 + +$L102: + ADD c01, t1, c01 + lda L, -1(L) + MUL a1, b1, t1 + LD a1, 4 * SIZE(AO) + + ADD c02, t2, c02 + MUL a2, b1, t2 + LD a2, 5 * SIZE(AO) + LD b1, 4 * SIZE(BO) + + ADD c03, t3, c03 + lda BO, 4 * SIZE(BO) + MUL a3, b2, t3 + LD a3, 6 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b2, t4 + LD a5, 7 * SIZE(AO) + LD b2, 1 * SIZE(BO) + + ADD c01, t1, c01 + MUL a1, b3, t1 + LD a1, 8 * SIZE(AO) + lda AO, 8 * SIZE(AO) + + ADD c02, t2, c02 + MUL a2, b3, t2 + LD b3, 2 * SIZE(BO) + LD a2, 1 * SIZE(AO) + + ADD c03, t3, c03 + LD a4, 3 * SIZE(AO) + MUL a3, b4, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, c04 + MUL a5, b4, t4 + LD b4, 3 * SIZE(BO) + bgt L, $L102 + .align 4 + +$L105: +#ifndef TRMMKERNEL + and K, 3, L +#else + and TMP1, 3, L +#endif + ldt alpha, ALPHA +#ifndef TRMMKERNEL + LD a3, 0 * SIZE(C1) + LD a4, 1 * SIZE(C1) +#endif + ble L, $L108 + .align 4 + +$L106: + ADD c01, t1, c01 + lda L, -1(L) + MUL a1, b1, t1 + LD a1, 2 * SIZE(AO) + + ADD c02, t2, c02 + MUL a2, b1, t2 + LD a2, 3 * SIZE(AO) + LD b1, 1 * SIZE(BO) + + lda AO, 2 * SIZE(AO) + unop + lda BO, 1 * SIZE(BO) + bgt L, $L106 + .align 4 + +$L108: + ADD c01, t1, c01 + fclr t1 + ADD c02, t2, c02 + fclr t2 + ADD c03, t3, c03 + fclr t3 + ADD c04, t4, c04 + fclr t4 + + ADD c01, c03, c01 + ADD c02, c04, c02 + + MUL alpha, c01, c01 + MUL alpha, c02, c02 + +#ifndef TRMMKERNEL + ADD c01, a3, c01 + ADD c02, a4, c02 +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + lda C1, 2 * SIZE(C1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + subq K, KK, TMP1 +#ifdef LEFT + subq TMP1, 2, TMP1 +#else + subq TMP1, 1, TMP1 
+#endif + sll TMP1, BASE_SHIFT + 1, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addq BO, TMP2, BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq KK, 2, KK +#endif + .align 4 + +$L110: + and M, 1, I + ble I, $L999 + .align 4 + +$L111: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef TRMMKERNEL +#ifdef LEFT + addq KK, 1, TMP1 +#else + addq KK, 1, TMP1 +#endif +#endif + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c04 + +#ifndef TRMMKERNEL + sra K, 2, L +#else + sra TMP1, 2, L +#endif + mov B, BO + unop + ble L, $L115 +#else + sll KK, BASE_SHIFT + 0, TMP1 + addq AO, TMP1, AO + sll KK, BASE_SHIFT + 0, TMP2 + addq B, TMP2, BO + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c04 + +#ifndef TRMMKERNEL + sra K, 2, L +#else + sra TMP1, 2, L +#endif + unop + ble L, $L115 +#endif + .align 4 + +$L112: + ADD c01, t1, c01 + MUL a1, b1, t1 + LD a1, 4 * SIZE(AO) + LD b1, 4 * SIZE(BO) + + ADD c02, t2, c02 + MUL a2, b2, t2 + LD a2, 5 * SIZE(AO) + LD b2, 5 * SIZE(BO) + + ADD c03, t3, c03 + MUL a3, b3, t3 + LD a3, 6 * SIZE(AO) + LD b3, 6 * SIZE(BO) + + ADD c04, t4, c04 + MUL a4, b4, t4 + LD a4, 7 * SIZE(AO) + LD b4, 7 * SIZE(BO) + + lda L, -1(L) + lda AO, 4 * SIZE(AO) + lda BO, 4 * SIZE(BO) + bgt L, $L112 + .align 4 + +$L115: +#ifndef TRMMKERNEL + and K, 3, L +#else + and TMP1, 3, L +#endif + ldt alpha, ALPHA +#ifndef TRMMKERNEL + LD a2, 0 * SIZE(C1) +#endif + ble L, $L118 + .align 4 + +$L116: + ADD c01, t1, c01 + MUL a1, b1, t1 + LD a1, 1 * SIZE(AO) + LD b1, 1 * SIZE(BO) + + lda L, -1(L) + lda AO, 1 * SIZE(AO) + lda BO, 1 * SIZE(BO) + bgt L, $L116 + .align 4 + +$L118: + ADD c01, t1, c01 + ADD c02, t2, c02 + ADD c03, t3, c03 + ADD c04, t4, c04 + + ADD c01, c02, c01 + ADD c03, c04, c03 + ADD c01, c03, c01 + + MUL alpha, c01, c01 +#ifndef TRMMKERNEL + ADD c01, a2, c01 +#endif + ST c01, 0 * SIZE(C1) + .align 4 + +$L999: + ldt $f2, 0($sp) + ldt $f3, 8($sp) + ldt $f4, 16($sp) + ldt $f5, 24($sp) + ldt $f6, 32($sp) + ldt $f7, 40($sp) + ldt $f8, 48($sp) + ldt $f9, 56($sp) + clr $0 + lda $sp, STACKSIZE($sp) + ret + EPILOGUE diff --git a/kernel/alpha/gemv_n.S b/kernel/alpha/gemv_n.S new file mode 100644 index 0000000000..665b217a30 --- /dev/null +++ b/kernel/alpha/gemv_n.S @@ -0,0 +1,1307 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define STACKSIZE 64 +#define PREFETCHSIZE 32 + +#define M $16 +#define N $17 +#define A $20 +#define LDA $21 + +#define X $18 +#define INCX $19 +#define Y $22 +#define INCY $23 + +#define BUFFER $24 + +#define I $25 +#define J $27 + +#define Y1 $4 + +#define A1 $5 +#define A2 $6 +#define A3 $7 +#define A4 $8 + +#define alpha $f19 + +#define alpha1 $f0 +#define alpha2 $f1 +#define alpha3 $f10 +#define alpha4 $f11 + +#define y0 $f12 +#define y1 $f13 +#define y2 $f14 +#define y3 $f15 + +#define y4 $f16 +#define y5 $f17 +#define y6 $f18 +#define y7 $f21 + +#define a0 $f22 +#define a1 $f23 +#define a2 $f24 +#define a3 $f25 +#define a4 $f26 +#define a5 $f27 +#define a6 $f28 +#define a7 $f29 + +#define a8 $f2 +#define a9 $f3 +#define a10 $f4 +#define a11 $f5 +#define a12 $f6 +#define a13 $f7 +#define a14 $f8 +#define a15 $f9 + + PROLOGUE + + lda $sp, -STACKSIZE($sp) + ldq X, 0 + STACKSIZE($sp) + ldq INCX, 8 + STACKSIZE($sp) + ldq Y, 16 + STACKSIZE($sp) + ldq INCY, 24 + STACKSIZE($sp) + ldq BUFFER, 32 + STACKSIZE($sp) + + stt $f2, 0($sp) + stt $f3, 8($sp) + stt $f4, 16($sp) + stt $f5, 24($sp) + stt $f6, 32($sp) + stt $f7, 40($sp) + stt $f8, 48($sp) + stt $f9, 56($sp) + + PROFCODE + + cmple M, 0, $0 + SXADDQ INCX, 0, INCX + cmple N, 0, $1 + SXADDQ INCY, 0, INCY + + or $0, $1, $0 + bne $0, $L999 + + SXADDQ LDA, 0, LDA + + cmpeq INCY, SIZE, $0 + bne $0, $L10 + + mov BUFFER, Y1 + + mov Y, BUFFER + mov Y1, Y + + sra M, 3, I + ble I, $L05 + .align 4 + +$L02: + ST $f31, 0 * SIZE(Y1) + ST $f31, 1 * SIZE(Y1) + ST $f31, 2 * SIZE(Y1) + ST $f31, 3 * SIZE(Y1) + ST $f31, 4 * SIZE(Y1) + ST $f31, 5 * SIZE(Y1) + ST $f31, 6 * SIZE(Y1) + ST $f31, 7 * SIZE(Y1) + + lda Y1, 8 * SIZE(Y1) + lda I, -1(I) + bgt I, $L02 + .align 4 + +$L05: + and M, 7, I + ble I, $L10 + .align 4 + +$L06: + ST $f31, 0 * SIZE(Y1) + addq Y1, SIZE, Y1 + + lda I, -1(I) + bgt I, $L06 + .align 4 + +$L10: + sra N, 2, J + ble J, $L20 + .align 4 + +$L11: + LD alpha1, 0 * SIZE(X) + addq X, INCX, X + LD alpha2, 0 * SIZE(X) + addq X, INCX, X + LD alpha3, 0 * SIZE(X) + addq X, INCX, X + LD alpha4, 0 * SIZE(X) + addq X, INCX, X + + MUL alpha, alpha1, alpha1 + MUL alpha, alpha2, alpha2 + MUL alpha, alpha3, alpha3 + MUL alpha, alpha4, alpha4 + + mov A, A1 + addq A, LDA, A2 + addq A2, LDA, A3 + addq A3, 
LDA, A4 + s4addq LDA, A, A + + mov Y, Y1 + ldl $31, 4 * SIZE(X) + + sra M, 3, I + ble I, $L15 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 2 * SIZE(A1) + LD a3, 3 * SIZE(A1) + + LD a4, 0 * SIZE(A2) + LD a5, 1 * SIZE(A2) + LD a6, 2 * SIZE(A2) + LD a7, 3 * SIZE(A2) + + LD y0, 0 * SIZE(Y1) + LD y1, 1 * SIZE(Y1) + LD y2, 2 * SIZE(Y1) + LD y3, 3 * SIZE(Y1) + + LD a8, 0 * SIZE(A3) + LD a9, 1 * SIZE(A3) + LD a10, 2 * SIZE(A3) + LD a11, 3 * SIZE(A3) + + LD y4, 4 * SIZE(Y1) + LD y5, 5 * SIZE(Y1) + LD y6, 6 * SIZE(Y1) + LD y7, 7 * SIZE(Y1) + + MUL alpha1, a0, a0 + LD a12, 0 * SIZE(A4) + MUL alpha1, a1, a1 + LD a13, 1 * SIZE(A4) + MUL alpha1, a2, a2 + LD a14, 2 * SIZE(A4) + MUL alpha1, a3, a3 + LD a15, 3 * SIZE(A4) + + ADD y0, a0, y0 + LD a0, 4 * SIZE(A1) + MUL alpha2, a4, a4 + unop + + ADD y1, a1, y1 + LD a1, 5 * SIZE(A1) + MUL alpha2, a5, a5 + unop + + ADD y2, a2, y2 + LD a2, 6 * SIZE(A1) + MUL alpha2, a6, a6 + unop + + ADD y3, a3, y3 + LD a3, 7 * SIZE(A1) + MUL alpha2, a7, a7 + unop + + ADD y0, a4, y0 + LD a4, 4 * SIZE(A2) + MUL alpha3, a8, a8 + unop + + ADD y1, a5, y1 + LD a5, 5 * SIZE(A2) + MUL alpha3, a9, a9 + lda I, -1(I) + + ADD y2, a6, y2 + LD a6, 6 * SIZE(A2) + MUL alpha3, a10, a10 + unop + + ADD y3, a7, y3 + LD a7, 7 * SIZE(A2) + MUL alpha3, a11, a11 + unop + + ADD y0, a8, y0 + LD a8, 4 * SIZE(A3) + MUL alpha4, a12, a12 + ble I, $L13 + .align 4 + +$L12: + ADD y1, a9, y1 + LD a9, 5 * SIZE(A3) + MUL alpha4, a13, a13 + ldl $31, (PREFETCHSIZE + 0) * SIZE(A1) + + ADD y2, a10, y2 + LD a10, 6 * SIZE(A3) + MUL alpha4, a14, a14 + unop + + ADD y3, a11, y3 + LD a11, 7 * SIZE(A3) + MUL alpha4, a15, a15 + lda I, -1(I) + + ADD y0, a12, y0 + LD a12, 4 * SIZE(A4) + MUL alpha1, a0, a0 + lds $f31, (PREFETCHSIZE + 0) * SIZE(Y1) + + ADD y1, a13, y1 + LD a13, 5 * SIZE(A4) + MUL alpha1, a1, a1 + unop + + ADD y2, a14, y2 + LD a14, 6 * SIZE(A4) + MUL alpha1, a2, a2 + unop + + ADD y3, a15, y3 + LD a15, 7 * SIZE(A4) + MUL alpha1, a3, a3 + ldl $31, (PREFETCHSIZE + 0) * SIZE(A2) + + ADD y4, a0, y4 + ST y0, 0 * SIZE(Y1) + MUL alpha2, a4, a4 + LD a0, 8 * SIZE(A1) + + ADD y5, a1, y5 + ST y1, 1 * SIZE(Y1) + MUL alpha2, a5, a5 + LD a1, 9 * SIZE(A1) + + ADD y6, a2, y6 + ST y2, 2 * SIZE(Y1) + MUL alpha2, a6, a6 + LD a2, 10 * SIZE(A1) + + ADD y7, a3, y7 + ST y3, 3 * SIZE(Y1) + MUL alpha2, a7, a7 + LD a3, 11 * SIZE(A1) + + ADD y4, a4, y4 + LD a4, 8 * SIZE(A2) + MUL alpha3, a8, a8 + LD y0, 8 * SIZE(Y1) + + ADD y5, a5, y5 + LD a5, 9 * SIZE(A2) + MUL alpha3, a9, a9 + LD y1, 9 * SIZE(Y1) + + ADD y6, a6, y6 + LD a6, 10 * SIZE(A2) + MUL alpha3, a10, a10 + LD y2, 10 * SIZE(Y1) + + ADD y7, a7, y7 + LD a7, 11 * SIZE(A2) + MUL alpha3, a11, a11 + LD y3, 11 * SIZE(Y1) + + ADD y4, a8, y4 + LD a8, 8 * SIZE(A3) + MUL alpha4, a12, a12 + ldl $31, (PREFETCHSIZE + 0) * SIZE(A3) + + ADD y5, a9, y5 + LD a9, 9 * SIZE(A3) + MUL alpha4, a13, a13 + lda A1, 8 * SIZE(A1) + + ADD y6, a10, y6 + LD a10, 10 * SIZE(A3) + MUL alpha4, a14, a14 + lda A2, 8 * SIZE(A2) + + ADD y7, a11, y7 + LD a11, 11 * SIZE(A3) + MUL alpha4, a15, a15 + lda Y1, 8 * SIZE(Y1) + + ADD y4, a12, y4 + LD a12, 8 * SIZE(A4) + MUL alpha1, a0, a0 + unop + + ADD y5, a13, y5 + LD a13, 9 * SIZE(A4) + MUL alpha1, a1, a1 + lda A3, 8 * SIZE(A3) + + ADD y6, a14, y6 + LD a14, 10 * SIZE(A4) + MUL alpha1, a2, a2 + ldl $31, (PREFETCHSIZE + 0) * SIZE(A4) + + ADD y7, a15, y7 + LD a15, 11 * SIZE(A4) + MUL alpha1, a3, a3 + lda A4, 8 * SIZE(A4) + + ADD y0, a0, y0 + LD a0, 4 * SIZE(A1) + MUL alpha2, a4, a4 + ST y4, -4 * SIZE(Y1) + + ADD y1, a1, y1 + LD a1, 5 * SIZE(A1) + MUL alpha2, a5, a5 + ST 
y5, -3 * SIZE(Y1) + + ADD y2, a2, y2 + LD a2, 6 * SIZE(A1) + MUL alpha2, a6, a6 + ST y6, -2 * SIZE(Y1) + + ADD y3, a3, y3 + LD a3, 7 * SIZE(A1) + MUL alpha2, a7, a7 + ST y7, -1 * SIZE(Y1) + + ADD y0, a4, y0 + LD a4, 4 * SIZE(A2) + MUL alpha3, a8, a8 + LD y4, 4 * SIZE(Y1) + + ADD y1, a5, y1 + LD a5, 5 * SIZE(A2) + MUL alpha3, a9, a9 + LD y5, 5 * SIZE(Y1) + + ADD y2, a6, y2 + LD a6, 6 * SIZE(A2) + MUL alpha3, a10, a10 + LD y6, 6 * SIZE(Y1) + + ADD y3, a7, y3 + LD a7, 7 * SIZE(A2) + MUL alpha3, a11, a11 + LD y7, 7 * SIZE(Y1) + + ADD y0, a8, y0 + LD a8, 4 * SIZE(A3) + MUL alpha4, a12, a12 + bgt I, $L12 + .align 4 + +$L13: + ADD y1, a9, y1 + LD a9, 5 * SIZE(A3) + MUL alpha4, a13, a13 + unop + + ADD y2, a10, y2 + LD a10, 6 * SIZE(A3) + MUL alpha4, a14, a14 + unop + + ADD y3, a11, y3 + LD a11, 7 * SIZE(A3) + MUL alpha4, a15, a15 + unop + + ADD y0, a12, y0 + LD a12, 4 * SIZE(A4) + MUL alpha1, a0, a0 + unop + + ADD y1, a13, y1 + LD a13, 5 * SIZE(A4) + MUL alpha1, a1, a1 + unop + + ADD y2, a14, y2 + LD a14, 6 * SIZE(A4) + MUL alpha1, a2, a2 + unop + + ADD y3, a15, y3 + LD a15, 7 * SIZE(A4) + MUL alpha1, a3, a3 + unop + + ST y0, 0 * SIZE(Y1) + ADD y4, a0, y4 + unop + MUL alpha2, a4, a4 + + ST y1, 1 * SIZE(Y1) + ADD y5, a1, y5 + unop + MUL alpha2, a5, a5 + + ST y2, 2 * SIZE(Y1) + ADD y6, a2, y6 + unop + MUL alpha2, a6, a6 + + ST y3, 3 * SIZE(Y1) + ADD y7, a3, y7 + lda Y1, 8 * SIZE(Y1) + MUL alpha2, a7, a7 + + ADD y4, a4, y4 + MUL alpha3, a8, a8 + ADD y5, a5, y5 + MUL alpha3, a9, a9 + ADD y6, a6, y6 + MUL alpha3, a10, a10 + ADD y7, a7, y7 + MUL alpha3, a11, a11 + + ADD y4, a8, y4 + MUL alpha4, a12, a12 + ADD y5, a9, y5 + MUL alpha4, a13, a13 + ADD y6, a10, y6 + MUL alpha4, a14, a14 + ADD y7, a11, y7 + MUL alpha4, a15, a15 + + ADD y4, a12, y4 + ADD y5, a13, y5 + ADD y6, a14, y6 + ADD y7, a15, y7 + + ST y4, -4 * SIZE(Y1) + lda A1, 8 * SIZE(A1) + ST y5, -3 * SIZE(Y1) + lda A2, 8 * SIZE(A2) + ST y6, -2 * SIZE(Y1) + lda A3, 8 * SIZE(A3) + ST y7, -1 * SIZE(Y1) + lda A4, 8 * SIZE(A4) + .align 4 + +$L15: + and M, 4, I + ble I, $L16 + + LD y0, 0 * SIZE(Y1) + LD y1, 1 * SIZE(Y1) + LD y2, 2 * SIZE(Y1) + LD y3, 3 * SIZE(Y1) + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 2 * SIZE(A1) + LD a3, 3 * SIZE(A1) + + LD a4, 0 * SIZE(A2) + LD a5, 1 * SIZE(A2) + LD a6, 2 * SIZE(A2) + LD a7, 3 * SIZE(A2) + + LD a8, 0 * SIZE(A3) + LD a9, 1 * SIZE(A3) + LD a10, 2 * SIZE(A3) + LD a11, 3 * SIZE(A3) + + MUL alpha1, a0, a0 + LD a12, 0 * SIZE(A4) + MUL alpha1, a1, a1 + LD a13, 1 * SIZE(A4) + MUL alpha1, a2, a2 + LD a14, 2 * SIZE(A4) + MUL alpha1, a3, a3 + LD a15, 3 * SIZE(A4) + + ADD y0, a0, y0 + MUL alpha2, a4, a4 + ADD y1, a1, y1 + MUL alpha2, a5, a5 + ADD y2, a2, y2 + MUL alpha2, a6, a6 + ADD y3, a3, y3 + MUL alpha2, a7, a7 + + ADD y0, a4, y0 + MUL alpha3, a8, a8 + ADD y1, a5, y1 + MUL alpha3, a9, a9 + ADD y2, a6, y2 + MUL alpha3, a10, a10 + ADD y3, a7, y3 + MUL alpha3, a11, a11 + + ADD y0, a8, y0 + MUL alpha4, a12, a12 + ADD y1, a9, y1 + MUL alpha4, a13, a13 + ADD y2, a10, y2 + MUL alpha4, a14, a14 + ADD y3, a11, y3 + MUL alpha4, a15, a15 + + ADD y0, a12, y0 + lda Y1, 4 * SIZE(Y1) + ADD y1, a13, y1 + unop + + ADD y2, a14, y2 + unop + ADD y3, a15, y3 + unop + + ST y0, -4 * SIZE(Y1) + lda A1, 4 * SIZE(A1) + ST y1, -3 * SIZE(Y1) + lda A2, 4 * SIZE(A2) + ST y2, -2 * SIZE(Y1) + lda A3, 4 * SIZE(A3) + ST y3, -1 * SIZE(Y1) + lda A4, 4 * SIZE(A4) + .align 4 + +$L16: + and M, 2, I + ble I, $L17 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 0 * SIZE(A2) + LD a3, 1 * SIZE(A2) + + LD y0, 0 * SIZE(Y1) + LD y1, 1 * SIZE(Y1) + 
+ LD a4, 0 * SIZE(A3) + MUL alpha1, a0, a0 + LD a5, 1 * SIZE(A3) + MUL alpha1, a1, a1 + LD a6, 0 * SIZE(A4) + MUL alpha2, a2, a2 + LD a7, 1 * SIZE(A4) + MUL alpha2, a3, a3 + + ADD y0, a0, y0 + MUL alpha3, a4, a4 + ADD y1, a1, y1 + MUL alpha3, a5, a5 + ADD y0, a2, y0 + MUL alpha4, a6, a6 + ADD y1, a3, y1 + MUL alpha4, a7, a7 + + ADD y0, a4, y0 + lda A1, 2 * SIZE(A1) + ADD y1, a5, y1 + lda A2, 2 * SIZE(A2) + ADD y0, a6, y0 + lda A3, 2 * SIZE(A3) + ADD y1, a7, y1 + lda A4, 2 * SIZE(A4) + + ST y0, 0 * SIZE(Y1) + unop + ST y1, 1 * SIZE(Y1) + lda Y1, 2 * SIZE(Y1) + .align 4 + +$L17: + blbc M, $L18 + + LD y0, 0 * SIZE(Y1) + + LD a0, 0 * SIZE(A1) + LD a1, 0 * SIZE(A2) + LD a2, 0 * SIZE(A3) + LD a3, 0 * SIZE(A4) + + MUL alpha1, a0, a0 + MUL alpha2, a1, a1 + MUL alpha3, a2, a2 + MUL alpha4, a3, a3 + + ADD y0, a0, y0 + ADD y0, a1, y0 + ADD y0, a2, y0 + ADD y0, a3, y0 + + ST y0, 0 * SIZE(Y1) + .align 4 + +$L18: + lda J, -1(J) + bgt J, $L11 + .align 4 + +$L20: + and N, 2, J + ble J, $L30 + + LD alpha1, 0 * SIZE(X) + addq X, INCX, X + LD alpha2, 0 * SIZE(X) + addq X, INCX, X + + mov A, A1 + MUL alpha, alpha1, alpha1 + addq A, LDA, A2 + MUL alpha, alpha2, alpha2 + + addq A2, LDA, A + mov Y, Y1 + + sra M, 3, I + ble I, $L25 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 2 * SIZE(A1) + LD a3, 3 * SIZE(A1) + + LD a4, 0 * SIZE(A2) + LD a5, 1 * SIZE(A2) + LD a6, 2 * SIZE(A2) + LD a7, 3 * SIZE(A2) + + LD y0, 0 * SIZE(Y1) + LD y1, 1 * SIZE(Y1) + LD y2, 2 * SIZE(Y1) + LD y3, 3 * SIZE(Y1) + + MUL alpha1, a0, a0 + LD y4, 4 * SIZE(Y1) + MUL alpha1, a1, a1 + LD y5, 5 * SIZE(Y1) + MUL alpha1, a2, a2 + LD y6, 6 * SIZE(Y1) + MUL alpha1, a3, a3 + LD y7, 7 * SIZE(Y1) + + ADD y0, a0, y0 + LD a0, 4 * SIZE(A1) + MUL alpha2, a4, a4 + + ADD y1, a1, y1 + LD a1, 5 * SIZE(A1) + MUL alpha2, a5, a5 + + ADD y2, a2, y2 + LD a2, 6 * SIZE(A1) + MUL alpha2, a6, a6 + + ADD y3, a3, y3 + LD a3, 7 * SIZE(A1) + MUL alpha2, a7, a7 + + ADD y0, a4, y0 + LD a4, 4 * SIZE(A2) + MUL alpha1, a0, a0 + + ADD y1, a5, y1 + LD a5, 5 * SIZE(A2) + MUL alpha1, a1, a1 + + ADD y2, a6, y2 + LD a6, 6 * SIZE(A2) + MUL alpha1, a2, a2 + + ADD y3, a7, y3 + LD a7, 7 * SIZE(A2) + MUL alpha1, a3, a3 + + lda I, -1(I) + ble I, $L23 + .align 4 + +$L22: + ldl $31, (PREFETCHSIZE + 0) * SIZE(A1) + lda I, -1(I) + ldl $31, (PREFETCHSIZE + 0) * SIZE(A2) + lda A2, 8 * SIZE(A2) + + ADD y4, a0, y4 + ST y0, 0 * SIZE(Y1) + MUL alpha2, a4, a4 + LD a0, 8 * SIZE(A1) + + ADD y5, a1, y5 + ST y1, 1 * SIZE(Y1) + MUL alpha2, a5, a5 + LD a1, 9 * SIZE(A1) + + ADD y6, a2, y6 + ST y2, 2 * SIZE(Y1) + MUL alpha2, a6, a6 + LD a2, 10 * SIZE(A1) + + ADD y7, a3, y7 + ST y3, 3 * SIZE(Y1) + MUL alpha2, a7, a7 + LD a3, 11 * SIZE(A1) + + ADD y4, a4, y4 + LD a4, 0 * SIZE(A2) + MUL alpha1, a0, a0 + LD y0, 8 * SIZE(Y1) + + ADD y5, a5, y5 + LD a5, 1 * SIZE(A2) + MUL alpha1, a1, a1 + LD y1, 9 * SIZE(Y1) + + ADD y6, a6, y6 + LD a6, 2 * SIZE(A2) + MUL alpha1, a2, a2 + LD y2, 10 * SIZE(Y1) + + ADD y7, a7, y7 + LD a7, 3 * SIZE(A2) + MUL alpha1, a3, a3 + LD y3, 11 * SIZE(Y1) + + ADD y0, a0, y0 + ST y4, 4 * SIZE(Y1) + MUL alpha2, a4, a4 + LD a0, 12 * SIZE(A1) + + ADD y1, a1, y1 + ST y5, 5 * SIZE(Y1) + MUL alpha2, a5, a5 + LD a1, 13 * SIZE(A1) + + ADD y2, a2, y2 + ST y6, 6 * SIZE(Y1) + MUL alpha2, a6, a6 + LD a2, 14 * SIZE(A1) + + ADD y3, a3, y3 + ST y7, 7 * SIZE(Y1) + MUL alpha2, a7, a7 + LD a3, 15 * SIZE(A1) + + ADD y0, a4, y0 + LD a4, 4 * SIZE(A2) + MUL alpha1, a0, a0 + LD y4, 12 * SIZE(Y1) + + ADD y1, a5, y1 + LD a5, 5 * SIZE(A2) + MUL alpha1, a1, a1 + LD y5, 13 * SIZE(Y1) + + ADD y2, a6, y2 + LD a6, 6 
* SIZE(A2) + MUL alpha1, a2, a2 + LD y6, 14 * SIZE(Y1) + + ADD y3, a7, y3 + LD a7, 7 * SIZE(A2) + MUL alpha1, a3, a3 + LD y7, 15 * SIZE(Y1) + + lds $f31, (PREFETCHSIZE + 0) * SIZE(Y1) + lda A1, 8 * SIZE(A1) + lda Y1, 8 * SIZE(Y1) + bgt I, $L22 + .align 4 + +$L23: + ADD y4, a0, y4 + ST y0, 0 * SIZE(Y1) + MUL alpha2, a4, a4 + unop + + ADD y5, a1, y5 + ST y1, 1 * SIZE(Y1) + MUL alpha2, a5, a5 + unop + + ADD y6, a2, y6 + ST y2, 2 * SIZE(Y1) + MUL alpha2, a6, a6 + unop + + ADD y7, a3, y7 + ST y3, 3 * SIZE(Y1) + MUL alpha2, a7, a7 + unop + + ADD y4, a4, y4 + ADD y5, a5, y5 + ADD y6, a6, y6 + ADD y7, a7, y7 + + ST y4, 4 * SIZE(Y1) + lda A1, 8 * SIZE(A1) + ST y5, 5 * SIZE(Y1) + lda A2, 8 * SIZE(A2) + + ST y6, 6 * SIZE(Y1) + unop + ST y7, 7 * SIZE(Y1) + lda Y1, 8 * SIZE(Y1) + .align 4 + +$L25: + and M, 4, I + ble I, $L26 + + LD y0, 0 * SIZE(Y1) + LD y1, 1 * SIZE(Y1) + LD y2, 2 * SIZE(Y1) + LD y3, 3 * SIZE(Y1) + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 2 * SIZE(A1) + LD a3, 3 * SIZE(A1) + + MUL alpha1, a0, a0 + LD a4, 0 * SIZE(A2) + MUL alpha1, a1, a1 + LD a5, 1 * SIZE(A2) + MUL alpha1, a2, a2 + LD a6, 2 * SIZE(A2) + MUL alpha1, a3, a3 + LD a7, 3 * SIZE(A2) + + ADD y0, a0, y0 + MUL alpha2, a4, a4 + ADD y1, a1, y1 + MUL alpha2, a5, a5 + ADD y2, a2, y2 + MUL alpha2, a6, a6 + ADD y3, a3, y3 + MUL alpha2, a7, a7 + + ADD y0, a4, y0 + lda Y1, 4 * SIZE(Y1) + ADD y1, a5, y1 + unop + ADD y2, a6, y2 + unop + ADD y3, a7, y3 + unop + + ST y0, -4 * SIZE(Y1) + lda A1, 4 * SIZE(A1) + ST y1, -3 * SIZE(Y1) + lda A2, 4 * SIZE(A2) + ST y2, -2 * SIZE(Y1) + lda A3, 4 * SIZE(A3) + ST y3, -1 * SIZE(Y1) + lda A4, 4 * SIZE(A4) + .align 4 + +$L26: + and M, 2, I + ble I, $L27 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 0 * SIZE(A2) + LD a3, 1 * SIZE(A2) + + LD y0, 0 * SIZE(Y1) + LD y1, 1 * SIZE(Y1) + + MUL alpha1, a0, a0 + MUL alpha1, a1, a1 + MUL alpha2, a2, a2 + MUL alpha2, a3, a3 + + ADD y0, a0, y0 + lda A1, 2 * SIZE(A1) + ADD y1, a1, y1 + lda A2, 2 * SIZE(A2) + ADD y0, a2, y0 + unop + ADD y1, a3, y1 + unop + + ST y0, 0 * SIZE(Y1) + unop + ST y1, 1 * SIZE(Y1) + lda Y1, 2 * SIZE(Y1) + .align 4 + +$L27: + blbc M, $L30 + + LD y0, 0 * SIZE(Y1) + + LD a0, 0 * SIZE(A1) + LD a1, 0 * SIZE(A2) + + MUL alpha1, a0, a0 + MUL alpha2, a1, a1 + + ADD y0, a0, y0 + ADD y0, a1, y0 + + ST y0, 0 * SIZE(Y1) + .align 4 + +$L30: + blbc N, $L990 + + LD alpha1, 0 * SIZE(X) + mov A, A1 + MUL alpha, alpha1, alpha1 + mov Y, Y1 + + sra M, 3, I + ble I, $L35 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 2 * SIZE(A1) + LD a3, 3 * SIZE(A1) + LD a4, 4 * SIZE(A1) + LD a5, 5 * SIZE(A1) + LD a6, 6 * SIZE(A1) + LD a7, 7 * SIZE(A1) + + LD y0, 0 * SIZE(Y1) + LD y1, 1 * SIZE(Y1) + LD y2, 2 * SIZE(Y1) + LD y3, 3 * SIZE(Y1) + LD y4, 4 * SIZE(Y1) + LD y5, 5 * SIZE(Y1) + LD y6, 6 * SIZE(Y1) + LD y7, 7 * SIZE(Y1) + + MUL alpha1, a0, a0 + MUL alpha1, a1, a1 + MUL alpha1, a2, a2 + MUL alpha1, a3, a3 + + lda I, -1(I) + ble I, $L33 + .align 4 + +$L32: + ADD y0, a0, y0 + LD y4, 4 * SIZE(Y1) + MUL alpha1, a4, a4 + LD a0, 8 * SIZE(A1) + + ADD y1, a1, y1 + LD y5, 5 * SIZE(Y1) + MUL alpha1, a5, a5 + LD a1, 9 * SIZE(A1) + + ADD y2, a2, y2 + LD y6, 6 * SIZE(Y1) + MUL alpha1, a6, a6 + LD a2, 10 * SIZE(A1) + + ADD y3, a3, y3 + LD y7, 7 * SIZE(Y1) + MUL alpha1, a7, a7 + LD a3, 11 * SIZE(A1) + + ST y0, 0 * SIZE(Y1) + ST y1, 1 * SIZE(Y1) + ST y2, 2 * SIZE(Y1) + ST y3, 3 * SIZE(Y1) + + ADD y4, a4, y4 + LD y0, 8 * SIZE(Y1) + MUL alpha1, a0, a0 + LD a4, 12 * SIZE(A1) + + ADD y5, a5, y5 + LD y1, 9 * SIZE(Y1) + MUL alpha1, a1, a1 + LD a5, 13 * SIZE(A1) + + 
ADD y6, a6, y6 + LD y2, 10 * SIZE(Y1) + MUL alpha1, a2, a2 + LD a6, 14 * SIZE(A1) + + ADD y7, a7, y7 + LD y3, 11 * SIZE(Y1) + MUL alpha1, a3, a3 + LD a7, 15 * SIZE(A1) + + ST y4, 4 * SIZE(Y1) + lda I, -1(I) + ST y5, 5 * SIZE(Y1) + lda A1, 8 * SIZE(A1) + + ST y6, 6 * SIZE(Y1) + ldl $31, (PREFETCHSIZE + 0) * SIZE(A1) + ST y7, 7 * SIZE(Y1) + lds $f31, (PREFETCHSIZE + 0) * SIZE(Y1) + + lda Y1, 8 * SIZE(Y1) + bgt I, $L32 + .align 4 + +$L33: + ADD y0, a0, y0 + LD y4, 4 * SIZE(Y1) + MUL alpha1, a4, a4 + unop + + ADD y1, a1, y1 + LD y5, 5 * SIZE(Y1) + MUL alpha1, a5, a5 + unop + + ADD y2, a2, y2 + LD y6, 6 * SIZE(Y1) + MUL alpha1, a6, a6 + unop + + ADD y3, a3, y3 + LD y7, 7 * SIZE(Y1) + MUL alpha1, a7, a7 + unop + + ADD y4, a4, y4 + ST y0, 0 * SIZE(Y1) + ADD y5, a5, y5 + ST y1, 1 * SIZE(Y1) + ADD y6, a6, y6 + ST y2, 2 * SIZE(Y1) + ADD y7, a7, y7 + ST y3, 3 * SIZE(Y1) + + ST y4, 4 * SIZE(Y1) + unop + ST y5, 5 * SIZE(Y1) + unop + + ST y6, 6 * SIZE(Y1) + lda A1, 8 * SIZE(A1) + ST y7, 7 * SIZE(Y1) + lda Y1, 8 * SIZE(Y1) + .align 4 + +$L35: + and M, 4, I + ble I, $L36 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 2 * SIZE(A1) + LD a3, 3 * SIZE(A1) + + MUL alpha1, a0, a0 + LD y0, 0 * SIZE(Y1) + MUL alpha1, a1, a1 + LD y1, 1 * SIZE(Y1) + MUL alpha1, a2, a2 + LD y2, 2 * SIZE(Y1) + MUL alpha1, a3, a3 + LD y3, 3 * SIZE(Y1) + + ADD y0, a0, y0 + ADD y1, a1, y1 + ADD y2, a2, y2 + ADD y3, a3, y3 + + ST y0, 0 * SIZE(Y1) + lda A1, 4 * SIZE(A1) + ST y1, 1 * SIZE(Y1) + lda A2, 4 * SIZE(A2) + ST y2, 2 * SIZE(Y1) + unop + ST y3, 3 * SIZE(Y1) + lda Y1, 4 * SIZE(Y1) + .align 4 + +$L36: + and M, 2, I + ble I, $L37 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + + LD y0, 0 * SIZE(Y1) + MUL alpha1, a0, a0 + LD y1, 1 * SIZE(Y1) + MUL alpha1, a1, a1 + + ADD y0, a0, y0 + ADD y1, a1, y1 + + ST y0, 0 * SIZE(Y1) + lda A1, 2 * SIZE(A1) + ST y1, 1 * SIZE(Y1) + lda Y1, 2 * SIZE(Y1) + .align 4 + +$L37: + blbc M, $L990 + + LD y0, 0 * SIZE(Y1) + LD a0, 0 * SIZE(A1) + + MUL alpha1, a0, a0 + + ADD y0, a0, y0 + ST y0, 0 * SIZE(Y1) + .align 4 + +$L990: + cmpeq INCY, SIZE, $0 + bne $0, $L999 + + mov BUFFER, Y1 + + sra M, 3, I + ble I, $L995 + .align 4 + +$L992: + LD a0, 0 * SIZE(BUFFER) + addq BUFFER, INCY, BUFFER + LD a1, 0 * SIZE(BUFFER) + addq BUFFER, INCY, BUFFER + LD a2, 0 * SIZE(BUFFER) + addq BUFFER, INCY, BUFFER + LD a3, 0 * SIZE(BUFFER) + addq BUFFER, INCY, BUFFER + + LD y0, 0 * SIZE(Y) + LD y1, 1 * SIZE(Y) + LD y2, 2 * SIZE(Y) + LD y3, 3 * SIZE(Y) + + LD a4, 0 * SIZE(BUFFER) + addq BUFFER, INCY, BUFFER + LD a5, 0 * SIZE(BUFFER) + addq BUFFER, INCY, BUFFER + LD a6, 0 * SIZE(BUFFER) + addq BUFFER, INCY, BUFFER + LD a7, 0 * SIZE(BUFFER) + addq BUFFER, INCY, BUFFER + + LD y4, 4 * SIZE(Y) + LD y5, 5 * SIZE(Y) + LD y6, 6 * SIZE(Y) + LD y7, 7 * SIZE(Y) + + ADD a0, y0, a0 + ADD a1, y1, a1 + ADD a2, y2, a2 + ADD a3, y3, a3 + ADD a4, y4, a4 + ADD a5, y5, a5 + ADD a6, y6, a6 + ADD a7, y7, a7 + + ST a0, 0 * SIZE(Y1) + addq Y1, INCY, Y1 + ST a1, 0 * SIZE(Y1) + addq Y1, INCY, Y1 + ST a2, 0 * SIZE(Y1) + addq Y1, INCY, Y1 + ST a3, 0 * SIZE(Y1) + addq Y1, INCY, Y1 + + ST a4, 0 * SIZE(Y1) + addq Y1, INCY, Y1 + ST a5, 0 * SIZE(Y1) + addq Y1, INCY, Y1 + ST a6, 0 * SIZE(Y1) + addq Y1, INCY, Y1 + ST a7, 0 * SIZE(Y1) + addq Y1, INCY, Y1 + + lda I, -1(I) + lda Y, 8 * SIZE(Y) + bgt I, $L992 + .align 4 + +$L995: + and M, 7, I + ble I, $L999 + .align 4 + +$L996: + LD a0, 0 * SIZE(BUFFER) + addq BUFFER, INCY, BUFFER + + LD y0, 0 * SIZE(Y) + lda Y, 1 * SIZE(Y) + + ADD a0, y0, a0 + + ST a0, 0 * SIZE(Y1) + addq Y1, INCY, Y1 + + lda I, -1(I) + bgt I, 
$L996
+ .align 4
+
+$L999:
+ ldt $f2, 0($sp)
+ ldt $f3, 8($sp)
+ ldt $f4, 16($sp)
+ ldt $f5, 24($sp)
+ ldt $f6, 32($sp)
+ ldt $f7, 40($sp)
+ ldt $f8, 48($sp)
+ ldt $f9, 56($sp)
+
+ lda $sp, STACKSIZE($sp)
+ ret
+ EPILOGUE
diff --git a/kernel/alpha/gemv_t.S b/kernel/alpha/gemv_t.S
new file mode 100644
index 0000000000..ea95546e87
--- /dev/null
+++ b/kernel/alpha/gemv_t.S
@@ -0,0 +1,1061 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#include "version.h"
+
+#define STACKSIZE 64
+#define PREFETCHSIZE 32
+
+#define M $16
+#define N $17
+#define A $20
+#define LDA $21
+
+#define X $18
+#define INCX $19
+#define Y $22
+#define INCY $23
+
+#define BUFFER $24
+
+#define I $25
+#define J $27
+
+#define X1 $3
+#define Y1 $4
+
+#define A1 $5
+#define A2 $6
+#define A3 $7
+#define A4 $8
+
+#define alpha $f19
+
+#define s0 $f0
+#define s1 $f1
+#define s2 $f10
+#define s3 $f11
+
+#define t0 $f12
+#define t1 $f13
+#define t2 $f14
+#define t3 $f15
+
+#define x0 $f16
+#define x1 $f17
+#define x2 $f18
+#define x3 $f21
+
+#define a0 $f22
+#define a1 $f23
+#define a2 $f24
+#define a3 $f25
+#define a4 $f26
+#define a5 $f27
+#define a6 $f28
+#define a7 $f29
+
+#define a8 $f2
+#define a9 $f3
+#define a10 $f4
+#define a11 $f5
+#define a12 $f6
+#define a13 $f7
+#define a14 $f8
+#define a15 $f9
+
+ PROLOGUE
+
+ lda $sp, -STACKSIZE($sp)
+ ldq X, 0 + STACKSIZE($sp)
+ ldq INCX, 8 + STACKSIZE($sp)
+ ldq Y, 16 + STACKSIZE($sp)
+ ldq INCY, 24 + STACKSIZE($sp)
+ ldq BUFFER, 32 + STACKSIZE($sp)
+
+ stt $f2, 0($sp)
+ stt $f3, 8($sp)
+ stt $f4, 16($sp)
+ stt $f5, 24($sp)
+ stt $f6, 32($sp)
+ stt $f7, 40($sp)
+ stt $f8, 48($sp)
+ stt $f9, 56($sp)
+
+ PROFCODE
+
+ cmple M, 0, $0
+ SXADDQ INCX, 0, INCX
+ cmple N, 0, $1
+ SXADDQ INCY, 0, INCY
+
+ or $0, $1, $0
+ bne $0, $L999
+
+ cmpeq INCX, SIZE, $0
+ mov X, X1
+ SXADDQ LDA, 0, LDA
+ bne $0, $L10
+
+ sra M, 3, I
+ mov BUFFER, Y1
+ mov BUFFER, X
+ ble I, $L05
+ .align 4
+
+$L02:
+ ldl $31, (PREFETCHSIZE + 0) * SIZE(X1)
+ lda I, -1(I)
+
+ LD a0, 0 * SIZE(X1)
+ addq X1, INCX, X1
+ LD a1, 0 * SIZE(X1)
+ addq X1, INCX, X1
+ LD a2, 0 * SIZE(X1)
+ addq X1, INCX, X1
+ LD a3, 0 * SIZE(X1)
+ addq X1, INCX, X1
+
+ ST a0, 0 * SIZE(Y1)
+ ST a1, 1 * SIZE(Y1)
+ ST a2, 2 * SIZE(Y1)
+ ST a3, 3 * SIZE(Y1)
+
+ LD a4, 0 * SIZE(X1)
+ addq X1, INCX, X1
+ LD a5, 0 * SIZE(X1)
+ addq X1, INCX, X1
+ LD a6, 0 * SIZE(X1)
+ addq X1, INCX, X1
+ LD a7, 0 * SIZE(X1)
+ addq X1, INCX, X1
+
+ ST a4, 4 * SIZE(Y1)
+ ST a5, 5 * SIZE(Y1)
+ ST a6, 6 * SIZE(Y1)
+ ST a7, 7 * SIZE(Y1)
+
+ lda Y1, 8 * SIZE(Y1)
+ bgt I, $L02
+ .align 4
+
+$L05:
+ and M, 7, I
+ ble I, $L10
+ .align 4
+
+$L06:
+ LD a0, 0 * SIZE(X1)
+ addq X1, INCX, X1
+ ST a0, 0 * SIZE(Y1)
+ addq Y1, SIZE, Y1
+
+ lda I, -1(I)
+ bgt I, $L06
+ .align 4
+
+$L10:
+ mov Y, Y1
+ fclr t0
+ unop
+ fclr t1
+
+ sra N, 2, J
+ fclr t2
+ fclr t3
+ ble J, $L20
+ .align 4
+
+$L11:
+ mov A, A1
+ fclr s0
+ addq A, LDA, A2
+ fclr s1
+
+ addq A2, LDA, A3
+ fclr s2
+ addq A3, LDA, A4
+ fclr s3
+
+ s4addq LDA, A, A
+ unop
+ mov X, X1
+ lds $f31, 3 * SIZE(Y)
+
+ sra M, 3, I
+ ble I, $L15
+
+ LD x0, 0 * SIZE(X1)
+ LD x1, 1 * SIZE(X1)
+ LD x2, 2 * SIZE(X1)
+
+ LD a0, 0 * SIZE(A1)
+ LD a1, 0 * SIZE(A2)
+ LD a2, 0 * SIZE(A3)
+ LD a3, 0 * SIZE(A4)
+ LD a4, 1 * SIZE(A1)
+ LD a5, 1 * SIZE(A2)
+ LD a6, 1 * SIZE(A3)
+ LD a7, 1 * SIZE(A4)
+ LD a8, 2 * SIZE(A1)
+ LD a9, 2 * SIZE(A2)
+ LD a10, 2 * SIZE(A3)
+ LD a11, 2 * SIZE(A4)
+ LD a12, 3 * SIZE(A1)
+ LD a13, 3 * SIZE(A2)
+ LD a14, 3 * SIZE(A3)
+ LD a15, 3 * SIZE(A4)
+
+ lda I, -1(I)
+ ble I, $L13
+ .align 4
+
+$L12:
+ ADD s0, t0, s0
+ LD x3, 3 * SIZE(X1)
+ MUL x0, a0, t0
+ LD a0, 4 * SIZE(A1)
+
+ ADD s1, t1, s1
+ ldl $31, (PREFETCHSIZE + 0) * SIZE(A1)
+ MUL x0, a1, t1
+ LD a1, 4 * SIZE(A2)
+
+ ADD s2, t2, s2
+ unop
+ MUL x0, a2, t2
+ LD a2, 4 * SIZE(A3)
+
+ ADD s3, t3, s3
+ unop
+ MUL x0, a3, t3
+ LD a3, 4 * SIZE(A4)
+
+ ADD s0, t0, s0
+ LD x0, 4 *
SIZE(X1) + MUL x1, a4, t0 + LD a4, 5 * SIZE(A1) + + ADD s1, t1, s1 + lda A1, 8 * SIZE(A1) + MUL x1, a5, t1 + LD a5, 5 * SIZE(A2) + + ADD s2, t2, s2 + unop + MUL x1, a6, t2 + LD a6, 5 * SIZE(A3) + + ADD s3, t3, s3 + unop + MUL x1, a7, t3 + LD a7, 5 * SIZE(A4) + + ADD s0, t0, s0 + LD x1, 5 * SIZE(X1) + MUL x2, a8, t0 + LD a8, -2 * SIZE(A1) + + ADD s1, t1, s1 + ldl $31, (PREFETCHSIZE + 0) * SIZE(A2) + MUL x2, a9, t1 + LD a9, 6 * SIZE(A2) + + ADD s2, t2, s2 + lda A2, 8 * SIZE(A2) + MUL x2, a10, t2 + LD a10, 6 * SIZE(A3) + + ADD s3, t3, s3 + lda A3, 8 * SIZE(A3) + MUL x2, a11, t3 + LD a11, 6 * SIZE(A4) + + ADD s0, t0, s0 + LD x2, 6 * SIZE(X1) + MUL x3, a12, t0 + LD a12, -1 * SIZE(A1) + + ADD s1, t1, s1 + lda A4, 8 * SIZE(A4) + MUL x3, a13, t1 + LD a13, -1 * SIZE(A2) + + ADD s2, t2, s2 + unop + MUL x3, a14, t2 + LD a14, -1 * SIZE(A3) + + ADD s3, t3, s3 + unop + MUL x3, a15, t3 + LD a15, -1 * SIZE(A4) + + ADD s0, t0, s0 + LD x3, 7 * SIZE(X1) + MUL x0, a0, t0 + LD a0, 0 * SIZE(A1) + + ADD s1, t1, s1 + ldl $31, (PREFETCHSIZE - 8) * SIZE(A3) + MUL x0, a1, t1 + LD a1, 0 * SIZE(A2) + + ADD s2, t2, s2 + unop + MUL x0, a2, t2 + LD a2, 0 * SIZE(A3) + + ADD s3, t3, s3 + unop + MUL x0, a3, t3 + LD a3, 0 * SIZE(A4) + + ADD s0, t0, s0 + LD x0, 8 * SIZE(X1) + MUL x1, a4, t0 + LD a4, 1 * SIZE(A1) + + ADD s1, t1, s1 + unop + MUL x1, a5, t1 + LD a5, 1 * SIZE(A2) + + ADD s2, t2, s2 + unop + MUL x1, a6, t2 + LD a6, 1 * SIZE(A3) + + ADD s3, t3, s3 + unop + MUL x1, a7, t3 + LD a7, 1 * SIZE(A4) + + ADD s0, t0, s0 + LD x1, 9 * SIZE(X1) + MUL x2, a8, t0 + LD a8, 2 * SIZE(A1) + + ADD s1, t1, s1 + ldl $31, (PREFETCHSIZE - 8) * SIZE(A4) + MUL x2, a9, t1 + LD a9, 2 * SIZE(A2) + + ADD s2, t2, s2 + lda X1, 8 * SIZE(X1) + MUL x2, a10, t2 + LD a10, 2 * SIZE(A3) + + ADD s3, t3, s3 + lda I, -1(I) + MUL x2, a11, t3 + LD a11, 2 * SIZE(A4) + + ADD s0, t0, s0 + LD x2, 2 * SIZE(X1) + MUL x3, a12, t0 + LD a12, 3 * SIZE(A1) + + ADD s1, t1, s1 + ldl $31, (PREFETCHSIZE - 8) * SIZE(X1) + MUL x3, a13, t1 + LD a13, 3 * SIZE(A2) + + ADD s2, t2, s2 + unop + MUL x3, a14, t2 + LD a14, 3 * SIZE(A3) + + ADD s3, t3, s3 + MUL x3, a15, t3 + LD a15, 3 * SIZE(A4) + bgt I, $L12 + .align 4 + +$L13: + ADD s0, t0, s0 + LD x3, 3 * SIZE(X1) + MUL x0, a0, t0 + LD a0, 4 * SIZE(A1) + + ADD s1, t1, s1 + unop + MUL x0, a1, t1 + LD a1, 4 * SIZE(A2) + + ADD s2, t2, s2 + unop + MUL x0, a2, t2 + LD a2, 4 * SIZE(A3) + + ADD s3, t3, s3 + unop + MUL x0, a3, t3 + LD a3, 4 * SIZE(A4) + + ADD s0, t0, s0 + LD x0, 4 * SIZE(X1) + MUL x1, a4, t0 + LD a4, 5 * SIZE(A1) + + ADD s1, t1, s1 + unop + MUL x1, a5, t1 + LD a5, 5 * SIZE(A2) + + ADD s2, t2, s2 + unop + MUL x1, a6, t2 + LD a6, 5 * SIZE(A3) + + ADD s3, t3, s3 + unop + MUL x1, a7, t3 + LD a7, 5 * SIZE(A4) + + ADD s0, t0, s0 + LD x1, 5 * SIZE(X1) + MUL x2, a8, t0 + LD a8, 6 * SIZE(A1) + + ADD s1, t1, s1 + unop + MUL x2, a9, t1 + LD a9, 6 * SIZE(A2) + + ADD s2, t2, s2 + unop + MUL x2, a10, t2 + LD a10, 6 * SIZE(A3) + + ADD s3, t3, s3 + unop + MUL x2, a11, t3 + LD a11, 6 * SIZE(A4) + + ADD s0, t0, s0 + LD x2, 6 * SIZE(X1) + MUL x3, a12, t0 + LD a12, 7 * SIZE(A1) + + ADD s1, t1, s1 + lda A1, 8 * SIZE(A1) + MUL x3, a13, t1 + LD a13, 7 * SIZE(A2) + + ADD s2, t2, s2 + lda A2, 8 * SIZE(A2) + MUL x3, a14, t2 + LD a14, 7 * SIZE(A3) + + ADD s3, t3, s3 + lda A3, 8 * SIZE(A3) + MUL x3, a15, t3 + LD a15, 7 * SIZE(A4) + + ADD s0, t0, s0 + LD x3, 7 * SIZE(X1) + MUL x0, a0, t0 + unop + + ADD s1, t1, s1 + lda X1, 8 * SIZE(X1) + MUL x0, a1, t1 + lda A4, 8 * SIZE(A4) + + ADD s2, t2, s2 + MUL x0, a2, t2 + ADD s3, t3, s3 + MUL x0, a3, t3 + + 
ADD s0, t0, s0 + MUL x1, a4, t0 + ADD s1, t1, s1 + MUL x1, a5, t1 + + ADD s2, t2, s2 + MUL x1, a6, t2 + ADD s3, t3, s3 + MUL x1, a7, t3 + + ADD s0, t0, s0 + MUL x2, a8, t0 + ADD s1, t1, s1 + MUL x2, a9, t1 + + ADD s2, t2, s2 + MUL x2, a10, t2 + ADD s3, t3, s3 + MUL x2, a11, t3 + + ADD s0, t0, s0 + MUL x3, a12, t0 + ADD s1, t1, s1 + MUL x3, a13, t1 + + ADD s2, t2, s2 + MUL x3, a14, t2 + ADD s3, t3, s3 + MUL x3, a15, t3 + .align 4 + +$L15: + and M, 7, I + ble I, $L18 + + LD x0, 0 * SIZE(X1) + + LD a0, 0 * SIZE(A1) + LD a1, 0 * SIZE(A2) + LD a2, 0 * SIZE(A3) + LD a3, 0 * SIZE(A4) + + lda I, -1(I) + ble I, $L17 + .align 4 + +$L16: + ADD s0, t0, s0 + lda A4, 1 * SIZE(A4) + MUL x0, a0, t0 + LD a0, 1 * SIZE(A1) + + ADD s1, t1, s1 + lda A1, 1 * SIZE(A1) + MUL x0, a1, t1 + LD a1, 1 * SIZE(A2) + + ADD s2, t2, s2 + lda A2, 1 * SIZE(A2) + MUL x0, a2, t2 + LD a2, 1 * SIZE(A3) + + ADD s3, t3, s3 + lda A3, 1 * SIZE(A3) + MUL x0, a3, t3 + LD a3, 0 * SIZE(A4) + + LD x0, 1 * SIZE(X1) + lda X1, 1 * SIZE(X1) + lda I, -1(I) + bgt I, $L16 + .align 4 + +$L17: + ADD s0, t0, s0 + MUL x0, a0, t0 + ADD s1, t1, s1 + MUL x0, a1, t1 + + ADD s2, t2, s2 + MUL x0, a2, t2 + ADD s3, t3, s3 + MUL x0, a3, t3 + .align 4 + +$L18: + LD a0, 0 * SIZE(Y) + addq Y, INCY, Y + LD a1, 0 * SIZE(Y) + addq Y, INCY, Y + LD a2, 0 * SIZE(Y) + addq Y, INCY, Y + LD a3, 0 * SIZE(Y) + addq Y, INCY, Y + + ADD s0, t0, s0 + ADD s1, t1, s1 + ADD s2, t2, s2 + ADD s3, t3, s3 + + MUL alpha, s0, s0 + MUL alpha, s1, s1 + MUL alpha, s2, s2 + MUL alpha, s3, s3 + + ADD a0, s0, a0 + fclr t0 + ADD a1, s1, a1 + fclr t1 + ADD a2, s2, a2 + fclr t2 + ADD a3, s3, a3 + fclr t3 + + ST a0, 0 * SIZE(Y1) + addq Y1, INCY, Y1 + ST a1, 0 * SIZE(Y1) + addq Y1, INCY, Y1 + ST a2, 0 * SIZE(Y1) + addq Y1, INCY, Y1 + ST a3, 0 * SIZE(Y1) + addq Y1, INCY, Y1 + + lda J, -1(J) + bgt J, $L11 + .align 4 + +$L20: + and N, 2, J + ble J, $L30 + mov A, A1 + addq A, LDA, A2 + + addq A2, LDA, A + fclr s0 + mov X, X1 + fclr s1 + + sra M, 3, I + fclr s2 + fclr s3 + ble I, $L25 + + LD a0, 0 * SIZE(A1) + LD a1, 0 * SIZE(A2) + LD a2, 1 * SIZE(A1) + LD a3, 1 * SIZE(A2) + LD a4, 2 * SIZE(A1) + LD a5, 2 * SIZE(A2) + LD a6, 3 * SIZE(A1) + LD a7, 3 * SIZE(A2) + + LD a8, 4 * SIZE(A1) + LD a9, 4 * SIZE(A2) + LD a10, 5 * SIZE(A1) + LD a11, 5 * SIZE(A2) + LD a12, 6 * SIZE(A1) + LD a13, 6 * SIZE(A2) + LD a14, 7 * SIZE(A1) + LD a15, 7 * SIZE(A2) + + LD x0, 0 * SIZE(X1) + LD x1, 1 * SIZE(X1) + LD x2, 2 * SIZE(X1) + + lda I, -1(I) + ble I, $L23 + .align 4 + +$L22: + ADD s0, t0, s0 + LD x3, 3 * SIZE(X1) + MUL x0, a0, t0 + LD a0, 8 * SIZE(A1) + + ADD s1, t1, s1 + ldl $31, (PREFETCHSIZE + 0) * SIZE(A1) + MUL x0, a1, t1 + LD a1, 8 * SIZE(A2) + + ADD s0, t2, s0 + LD x0, 4 * SIZE(X1) + MUL x1, a2, t2 + LD a2, 9 * SIZE(A1) + + ADD s1, t3, s1 + unop + MUL x1, a3, t3 + LD a3, 9 * SIZE(A2) + + ADD s0, t0, s0 + LD x1, 5 * SIZE(X1) + MUL x2, a4, t0 + LD a4, 10 * SIZE(A1) + + ADD s1, t1, s1 + lda I, -1(I) + MUL x2, a5, t1 + LD a5, 10 * SIZE(A2) + + ADD s0, t2, s0 + LD x2, 6 * SIZE(X1) + MUL x3, a6, t2 + LD a6, 11 * SIZE(A1) + + ADD s1, t3, s1 + lda X1, 8 * SIZE(X1) + MUL x3, a7, t3 + LD a7, 11 * SIZE(A2) + + ADD s0, t0, s0 + LD x3, -1 * SIZE(X1) + MUL x0, a8, t0 + LD a8, 12 * SIZE(A1) + + ADD s1, t1, s1 + ldl $31, (PREFETCHSIZE + 0) * SIZE(A2) + MUL x0, a9, t1 + LD a9, 12 * SIZE(A2) + + ADD s0, t0, s0 + LD x0, 0 * SIZE(X1) + MUL x1, a10, t0 + LD a10, 13 * SIZE(A1) + + ADD s1, t1, s1 + lda A1, 8 * SIZE(A1) + MUL x1, a11, t1 + LD a11, 13 * SIZE(A2) + + ADD s0, t0, s0 + LD x1, 1 * SIZE(X1) + MUL x2, a12, t0 + LD a12, 6 * 
SIZE(A1) + + ADD s1, t1, s1 + MUL x2, a13, t1 + LD a13, 14 * SIZE(A2) + lda A2, 8 * SIZE(A2) + + ADD s0, t0, s0 + LD x2, 2 * SIZE(X1) + MUL x3, a14, t0 + LD a14, 7 * SIZE(A1) + + ADD s1, t1, s1 + MUL x3, a15, t1 + LD a15, 7 * SIZE(A2) + bgt I, $L22 + .align 4 + +$L23: + ADD s0, t0, s0 + LD x3, 3 * SIZE(X1) + MUL x0, a0, t0 + lda A1, 8 * SIZE(A1) + + ADD s1, t1, s1 + unop + MUL x0, a1, t1 + unop + + ADD s0, t2, s0 + LD x0, 4 * SIZE(X1) + MUL x1, a2, t2 + lda A2, 8 * SIZE(A2) + + ADD s1, t3, s1 + unop + MUL x1, a3, t3 + unop + + ADD s0, t0, s0 + LD x1, 5 * SIZE(X1) + MUL x2, a4, t0 + unop + + ADD s1, t1, s1 + unop + MUL x2, a5, t1 + unop + + ADD s0, t2, s0 + LD x2, 6 * SIZE(X1) + MUL x3, a6, t2 + unop + + ADD s1, t3, s1 + unop + MUL x3, a7, t3 + unop + + ADD s0, t0, s0 + LD x3, 7 * SIZE(X1) + MUL x0, a8, t0 + lda X1, 8 * SIZE(X1) + + ADD s1, t1, s1 + unop + MUL x0, a9, t1 + unop + + ADD s0, t0, s0 + MUL x1, a10, t0 + ADD s1, t1, s1 + MUL x1, a11, t1 + + ADD s0, t0, s0 + MUL x2, a12, t0 + ADD s1, t1, s1 + MUL x2, a13, t1 + + ADD s0, t0, s0 + MUL x3, a14, t0 + ADD s1, t1, s1 + MUL x3, a15, t1 + .align 4 + +$L25: + and M, 7, I + ble I, $L28 + + LD a0, 0 * SIZE(A1) + LD a1, 0 * SIZE(A2) + LD x0, 0 * SIZE(X1) + + lda I, -1(I) + ble I, $L27 + .align 4 + +$L26: + ADD s0, t0, s0 + lda A2, 1 * SIZE(A2) + MUL x0, a0, t0 + LD a0, 1 * SIZE(A1) + + ADD s1, t1, s1 + lda A1, 1 * SIZE(A1) + MUL x0, a1, t1 + LD a1, 0 * SIZE(A2) + + LD x0, 1 * SIZE(X1) + lda X1, 1 * SIZE(X1) + lda I, -1(I) + bgt I, $L26 + .align 4 + +$L27: + ADD s0, t0, s0 + MUL x0, a0, t0 + ADD s1, t1, s1 + MUL x0, a1, t1 + .align 4 + +$L28: + LD a0, 0 * SIZE(Y) + addq Y, INCY, Y + LD a1, 0 * SIZE(Y) + addq Y, INCY, Y + + ADD s0, t0, s0 + ADD s1, t1, s1 + ADD s2, t2, s2 + ADD s3, t3, s3 + + ADD s0, s2, s0 + ADD s1, s3, s1 + + MUL alpha, s0, s0 + MUL alpha, s1, s1 + + ADD a0, s0, a0 + ADD a1, s1, a1 + + ST a0, 0 * SIZE(Y1) + fclr t0 + addq Y1, INCY, Y1 + fclr t1 + + ST a1, 0 * SIZE(Y1) + fclr t2 + addq Y1, INCY, Y1 + fclr t3 + .align 4 + +$L30: + blbc N, $L999 + + mov A, A1 + fclr s0 + mov X, X1 + fclr s1 + + sra M, 3, I + fclr s2 + fclr s3 + ble I, $L35 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a8, 0 * SIZE(X1) + LD a9, 1 * SIZE(X1) + + LD a2, 2 * SIZE(A1) + LD a3, 3 * SIZE(A1) + LD a10, 2 * SIZE(X1) + LD a11, 3 * SIZE(X1) + + LD a4, 4 * SIZE(A1) + LD a5, 5 * SIZE(A1) + LD a12, 4 * SIZE(X1) + LD a13, 5 * SIZE(X1) + + LD a6, 6 * SIZE(A1) + LD a7, 7 * SIZE(A1) + LD a14, 6 * SIZE(X1) + + lda I, -1(I) + ble I, $L33 + .align 4 + +$L32: + ADD s0, t0, s0 + LD a15, 7 * SIZE(X1) + MUL a0, a8, t0 + LD a0, 8 * SIZE(A1) + + ADD s1, t1, s1 + LD a8, 8 * SIZE(X1) + MUL a1, a9, t1 + LD a1, 9 * SIZE(A1) + + ADD s2, t2, s2 + LD a9, 9 * SIZE(X1) + MUL a2, a10, t2 + LD a2, 10 * SIZE(A1) + + ADD s3, t3, s3 + LD a10, 10 * SIZE(X1) + MUL a3, a11, t3 + LD a3, 11 * SIZE(A1) + + ADD s0, t0, s0 + LD a11, 11 * SIZE(X1) + MUL a4, a12, t0 + LD a4, 12 * SIZE(A1) + + ADD s1, t1, s1 + LD a12, 12 * SIZE(X1) + MUL a5, a13, t1 + LD a5, 13 * SIZE(A1) + + ADD s2, t2, s2 + LD a13, 13 * SIZE(X1) + MUL a6, a14, t2 + LD a6, 14 * SIZE(A1) + + ADD s3, t3, s3 + LD a14, 14 * SIZE(X1) + MUL a7, a15, t3 + LD a7, 15 * SIZE(A1) + + lda A1, 8 * SIZE(A1) + lda I, -1(I) + lda X1, 8 * SIZE(X1) + bgt I, $L32 + .align 4 + +$L33: + ADD s0, t0, s0 + LD a15, 7 * SIZE(X1) + MUL a0, a8, t0 + lda A1, 8 * SIZE(A1) + + ADD s1, t1, s1 + unop + MUL a1, a9, t1 + lda X1, 8 * SIZE(X1) + + ADD s2, t2, s2 + MUL a2, a10, t2 + ADD s3, t3, s3 + MUL a3, a11, t3 + + ADD s0, t0, s0 + MUL a4, a12, t0 + ADD 
s1, t1, s1 + MUL a5, a13, t1 + + ADD s2, t2, s2 + MUL a6, a14, t2 + ADD s3, t3, s3 + MUL a7, a15, t3 + .align 4 + +$L35: + and M, 7, I + ble I, $L38 + + LD a0, 0 * SIZE(A1) + LD x0, 0 * SIZE(X1) + + lda I, -1(I) + ble I, $L37 + .align 4 + +$L36: + ADD s0, t0, s0 + MUL x0, a0, t0 + LD a0, 1 * SIZE(A1) + LD x0, 1 * SIZE(X1) + + lda A1, 1 * SIZE(A1) + lda X1, 1 * SIZE(X1) + lda I, -1(I) + bgt I, $L36 + .align 4 + +$L37: + ADD s0, t0, s0 + MUL x0, a0, t0 + .align 4 + +$L38: + LD a0, 0 * SIZE(Y) + + ADD s0, t0, s0 + ADD s1, t1, s1 + ADD s2, t2, s2 + ADD s3, t3, s3 + + ADD s0, s2, s0 + ADD s1, s3, s1 + ADD s0, s1, s0 + + MUL alpha, s0, s0 + ADD a0, s0, a0 + + ST a0, 0 * SIZE(Y1) + .align 4 + +$L999: + ldt $f2, 0($sp) + ldt $f3, 8($sp) + ldt $f4, 16($sp) + ldt $f5, 24($sp) + ldt $f6, 32($sp) + ldt $f7, 40($sp) + ldt $f8, 48($sp) + ldt $f9, 56($sp) + + lda $sp, STACKSIZE($sp) + ret + EPILOGUE diff --git a/kernel/alpha/iamax.S b/kernel/alpha/iamax.S new file mode 100644 index 0000000000..cb87632900 --- /dev/null +++ b/kernel/alpha/iamax.S @@ -0,0 +1,440 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
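For reference, the transposed-GEMV style kernel that ends above accumulates, for each group of columns, a dot product of the column with x and then adds alpha times that sum into y; when INCX is not unit stride it first copies x into BUFFER so the unrolled inner loop can run with unit stride and software prefetch. A minimal C sketch of that computation follows; the function name and the plain-C form are assumptions for exposition only, not code from this patch.

    /* Illustrative reference: y[j] += alpha * dot(A(:,j), x), column-major A. */
    static void gemv_t_ref(long m, long n, double alpha,
                           const double *a, long lda,
                           const double *x, long incx,
                           double *y, long incy)
    {
        for (long j = 0; j < n; j++) {
            double s = 0.0;
            for (long i = 0; i < m; i++)
                s += a[j * lda + i] * x[i * incx];   /* column j dot x */
            y[j * incy] += alpha * s;
        }
    }

The assembly realizes the same loop with 4 columns processed per outer iteration and 8 rows per inner iteration, which is why the scalar tails at $L15/$L16 and $L25-$L37 handle the leftover M mod 8 and N mod 4 pieces.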
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define N $16 +#define X $17 +#define INCX $18 +#define XX $19 + +#ifndef USE_MIN +#define CMPLT(a, b) cmptlt a, b +#else +#define CMPLT(a, b) cmptlt b, a +#endif + +#define STACKSIZE 6 * 8 + + PROLOGUE + PROFCODE + .frame $sp, STACKSIZE, $26, 0 + +#ifdef F_INTERFACE + ldl N, 0(N) # n + ldl INCX, 0(INCX) # incx +#endif + lda $sp, -STACKSIZE($sp) + mov X, XX + .align 4 + + stt $f2, 0($sp) + fclr $f16 + cmplt $31, N, $2 + unop + + stt $f3, 8($sp) + fclr $f17 + cmplt $31, INCX, $3 + unop + + stt $f4, 16($sp) + fclr $f18 + SXADDQ INCX, $31, INCX + unop + + stt $f5, 24($sp) + fclr $f19 + and $2, $3, $2 + clr $0 + + stt $f6, 32($sp) + fclr $f0 + sra N, 3, $1 + beq $2, $End # if (n <= 0) or (incx <= 0) return + .align 4 + + LD $f20, 0 * SIZE(X) + unop + fabs $f20, $f0 + ble $1, $L15 + .align 4 + + fabs $f20, $f1 + unop + addq X, INCX, X + unop + + LD $f21, 0 * SIZE(X) + fabs $f20, $f2 + addq X, INCX, X + unop + + LD $f22, 0 * SIZE(X) + fabs $f20, $f3 + addq X, INCX, X + unop + + LD $f23, 0 * SIZE(X) + fabs $f20, $f4 + addq X, INCX, X + unop + + LD $f24, 0 * SIZE(X) + addq X, INCX, X + fabs $f20, $f5 + unop + + LD $f25, 0 * SIZE(X) + fabs $f20, $f6 + addq X, INCX, X + unop + + LD $f26, 0 * SIZE(X) + fabs $f20, $f28 + addq X, INCX, X + lda $1, -1($1) + + LD $f27, 0 * SIZE(X) + unop + addq X, INCX, X + ble $1, $L13 + .align 4 + +$L12: + fcmovne $f16, $f12, $f4 + unop + fabs $f20, $f29 + ldl $31, 56 * SIZE(X) + + fcmovne $f17, $f13, $f5 + LD $f20, 0 * SIZE(X) + fabs $f21, $f30 + addq X, INCX, X + + fcmovne $f18, $f14, $f6 + LD $f21, 0 * SIZE(X) + fabs $f22, $f10 + addq X, INCX, X + + fcmovne $f19, $f15, $f28 + LD $f22, 0 * SIZE(X) + fabs $f23, $f11 + addq X, INCX, X + + fabs $f24, $f12 + LD $f23, 0 * SIZE(X) + CMPLT($f0, $f29), $f16 + addq X, INCX, X + + fabs $f25, $f13 + LD $f24, 0 * SIZE(X) + CMPLT($f1, $f30), $f17 + addq X, INCX, X + + fabs $f26, $f14 + LD $f25, 0 * SIZE(X) + CMPLT($f2, $f10), $f18 + addq X, INCX, X + + fabs $f27, $f15 + LD $f26, 0 * SIZE(X) + CMPLT($f3, $f11), $f19 + addq X, INCX, X + + fcmovne $f16, $f29, $f0 + LD $f27, 0 * SIZE(X) + CMPLT($f4, $f12), $f16 + addq X, INCX, X + + fcmovne $f17, $f30, $f1 + unop + CMPLT($f5, $f13), $f17 + lda $1, -1($1) # i -- + + fcmovne $f18, $f10, $f2 + unop + CMPLT($f6, $f14), $f18 + unop + + fcmovne $f19, $f11, $f3 + unop + CMPLT($f28, $f15), $f19 + bgt $1,$L12 + .align 4 + +$L13: + fcmovne $f16, $f12, $f4 + fabs $f20, $f29 + fcmovne $f17, $f13, $f5 + fabs $f21, $f30 + + fcmovne $f18, $f14, $f6 + fabs $f22, $f10 + fcmovne $f19, $f15, $f28 + fabs $f23, $f11 + + fabs $f24, $f12 + CMPLT($f0, $f29), $f16 + fabs $f25, $f13 + CMPLT($f1, $f30), $f17 + + fabs $f26, $f14 + CMPLT($f2, $f10), $f18 + fabs $f27, $f15 + CMPLT($f3, $f11), $f19 + + fcmovne $f16, $f29, $f0 + CMPLT($f4, $f12), $f16 + fcmovne $f17, $f30, $f1 + CMPLT($f5, $f13), $f17 + + fcmovne $f18, $f10, $f2 + CMPLT($f6, $f14), $f18 + fcmovne $f19, $f11, $f3 + CMPLT($f28, $f15), $f19 + + fcmovne $f16, $f12, $f4 + CMPLT($f0, $f1), $f16 + fcmovne $f17, $f13, $f5 + CMPLT($f2, $f3), $f17 + + fcmovne $f18, $f14, $f6 + CMPLT($f4, $f5), $f18 + fcmovne $f19, $f15, $f28 + CMPLT($f6, $f28), $f19 + + fcmovne $f16, $f1, $f0 + fcmovne $f17, $f3, $f2 + fcmovne $f18, $f5, $f4 + fcmovne $f19, $f28, $f6 + + CMPLT($f0, $f2), $f16 + CMPLT($f4, $f6), $f17 + + fcmovne $f16, $f2, $f0 + fcmovne $f17, $f6, $f4 + + CMPLT($f0, $f4), $f16 + fcmovne $f16, $f4, $f0 + .align 4 + 
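The iamax kernel above appears to use a two-pass scheme: the first pass (the 8-way unrolled fabs/CMPLT/fcmovne loop just finished) reduces the vector to the maximum absolute value in $f0, and the second pass that follows rescans from XX, counting elements until one matches that maximum, so the 1-based index comes back in $0. The CMPLT macro reverses the comparison under -DUSE_MIN, turning the same source into the iamin variant. A small C sketch of the idea, with an assumed function name and plain-C form:

    #include <math.h>

    /* Illustrative two-pass i?amax reference: find max |x[i]|, then its index. */
    static long iamax_ref(long n, const double *x, long incx)
    {
        if (n <= 0 || incx <= 0) return 0;           /* kernel returns 0 here too */
        double m = fabs(x[0]);
        for (long i = 1; i < n; i++) {               /* pass 1: max magnitude */
            double v = fabs(x[i * incx]);
            if (m < v) m = v;
        }
        for (long i = 0; i < n; i++)                 /* pass 2: first matching index */
            if (fabs(x[i * incx]) == m)
                return i + 1;                        /* 1-based, as BLAS expects */
        return n;                                    /* not reached */
    }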
+$L15: + and N, 7, $1 + unop + unop + ble $1, $L20 + .align 4 + +$L16: + LD $f20, 0 * SIZE(X) + addq X, INCX, X + + fabs $f20, $f29 + CMPLT($f0, $f29), $f16 + fcmovne $f16, $f29, $f0 + + lda $1, -1($1) # i -- + bgt $1, $L16 + .align 4 + +$L20: + sra N, 3, $1 + ble $1, $L40 + .align 4 + + LD $f10, 0 * SIZE(XX) + addq XX, INCX, XX + LD $f11, 0 * SIZE(XX) + addq XX, INCX, XX + + LD $f12, 0 * SIZE(XX) + addq XX, INCX, XX + LD $f13, 0 * SIZE(XX) + addq XX, INCX, XX + + LD $f14, 0 * SIZE(XX) + addq XX, INCX, XX + LD $f15, 0 * SIZE(XX) + addq XX, INCX, XX + + LD $f16, 0 * SIZE(XX) + addq XX, INCX, XX + LD $f17, 0 * SIZE(XX) + addq XX, INCX, XX + + fabs $f10, $f18 + fabs $f11, $f19 + fabs $f12, $f20 + fabs $f13, $f21 + + lda $1, -1($1) + ble $1, $L23 + .align 4 + +$L22: + LD $f10, 0 * SIZE(XX) + fabs $f14, $f22 + addq XX, INCX, XX + cmpteq $f0, $f18, $f2 + + LD $f11, 0 * SIZE(XX) + fabs $f15, $f23 + addq XX, INCX, XX + cmpteq $f0, $f19, $f3 + + LD $f12, 0 * SIZE(XX) + fabs $f16, $f24 + addq XX, INCX, XX + cmpteq $f0, $f20, $f4 + + LD $f13, 0 * SIZE(XX) + fabs $f17, $f25 + addq XX, INCX, XX + cmpteq $f0, $f21, $f5 + + LD $f14, 0 * SIZE(XX) + lda $1, -1($1) # i -- + cmpteq $f0, $f22, $f26 + addq XX, INCX, XX + + lda $0, 1($0) + fbne $f2, $End + + LD $f15, 0 * SIZE(XX) + cmpteq $f0, $f23, $f27 + lda $0, 1($0) + fbne $f3, $End + + addq XX, INCX, XX + cmpteq $f0, $f24, $f28 + lda $0, 1($0) + fbne $f4, $End + + LD $f16, 0 * SIZE(XX) + cmpteq $f0, $f25, $f29 + lda $0, 1($0) + fbne $f5, $End + + addq XX, INCX, XX + lda $0, 1($0) + fabs $f10, $f18 + fbne $f26, $End + + LD $f17, 0 * SIZE(XX) + lda $0, 1($0) + fabs $f11, $f19 + fbne $f27, $End + + addq XX, INCX, XX + lda $0, 1($0) + fabs $f12, $f20 + fbne $f28, $End + + lda $0, 1($0) + fabs $f13, $f21 + fbne $f29, $End + bgt $1, $L22 + .align 4 + +$L23: + fabs $f14, $f22 + cmpteq $f0, $f18, $f2 + fabs $f15, $f23 + cmpteq $f0, $f19, $f3 + + fabs $f16, $f24 + cmpteq $f0, $f20, $f4 + fabs $f17, $f25 + cmpteq $f0, $f21, $f5 + + cmpteq $f0, $f22, $f26 + lda $0, 1($0) + unop + fbne $f2, $End + + cmpteq $f0, $f23, $f27 + lda $0, 1($0) + unop + fbne $f3, $End + + cmpteq $f0, $f24, $f28 + lda $0, 1($0) + unop + fbne $f4, $End + + cmpteq $f0, $f25, $f29 + lda $0, 1($0) + unop + fbne $f5, $End + + lda $0, 1($0) + fbne $f26, $End + lda $0, 1($0) + fbne $f27, $End + lda $0, 1($0) + fbne $f28, $End + lda $0, 1($0) + fbne $f29, $End + .align 4 + +$L40: + LD $f20, 0 * SIZE(XX) + addq XX, INCX, XX + + fabs $f20, $f25 + cmpteq $f0, $f25, $f29 + + lda $0, 1($0) + fbne $f29, $End + br $31, $L40 + .align 4 + +$End: + ldt $f2, 0($sp) + ldt $f3, 8($sp) + ldt $f4, 16($sp) + ldt $f5, 24($sp) + + ldt $f6, 32($sp) + lda $sp, STACKSIZE($sp) + ret + + EPILOGUE diff --git a/kernel/alpha/imax.S b/kernel/alpha/imax.S new file mode 100644 index 0000000000..b0cf5c8abf --- /dev/null +++ b/kernel/alpha/imax.S @@ -0,0 +1,351 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define N $16 +#define X $17 +#define INCX $18 +#define XX $19 + +#ifndef USE_MIN +#define CMPLT(a, b) cmptlt a, b +#else +#define CMPLT(a, b) cmptlt b, a +#endif + +#define STACKSIZE 8 * 8 + + PROLOGUE + PROFCODE + + clr $0 + mov X, XX + .align 4 + + cmplt $31, N, $2 + cmplt $31, INCX, $3 + SXADDQ INCX, $31, INCX + and $2, $3, $2 + + sra N, 3, $1 + fclr $f0 + unop + beq $2, $End # if (n <= 0) or (incx <= 0) return + .align 4 + + LD $f0, 0 * SIZE(X) + unop + unop + ble $1, $L15 + .align 4 + + fmov $f0, $f1 + addq X, INCX, X + fmov $f0, $f10 + lda $1, -1($1) + + LD $f21, 0 * SIZE(X) + fmov $f0, $f11 + addq X, INCX, X + fmov $f0, $f12 + + LD $f22, 0 * SIZE(X) + fmov $f0, $f13 + addq X, INCX, X + fmov $f0, $f14 + + LD $f23, 0 * SIZE(X) + fmov $f0, $f15 + addq X, INCX, X + fmov $f0, $f20 + + LD $f24, 0 * SIZE(X) + addq X, INCX, X + LD $f25, 0 * SIZE(X) + addq X, INCX, X + LD $f26, 0 * SIZE(X) + addq X, INCX, X + LD $f27, 0 * SIZE(X) + addq X, INCX, X + + CMPLT($f0, $f20), $f16 + CMPLT($f1, $f21), $f17 + CMPLT($f10, $f22), $f18 + CMPLT($f11, $f23), $f19 + + ble $1, $L13 + .align 4 + +$L12: + fcmovne $f16, $f20, $f0 + LD $f20, 0 * SIZE(X) + CMPLT($f12, $f24), $f16 + addq X, INCX, X + + fcmovne $f17, $f21, $f1 + LD $f21, 0 * SIZE(X) + CMPLT($f13, $f25), $f17 + addq X, INCX, X + + fcmovne $f18, $f22, $f10 + LD $f22, 0 * SIZE(X) + CMPLT($f14, $f26), $f18 + addq X, INCX, X + + fcmovne $f19, $f23, $f11 + LD $f23, 0 * SIZE(X) + CMPLT($f15, $f27), $f19 + addq X, INCX, X + + fcmovne $f16, $f24, $f12 + LD $f24, 0 * SIZE(X) + CMPLT($f0, $f20), $f16 + addq X, INCX, X + + fcmovne $f17, $f25, $f13 + LD $f25, 0 * SIZE(X) + CMPLT($f1, $f21), $f17 + addq X, INCX, X + + fcmovne $f18, $f26, $f14 + LD $f26, 0 * SIZE(X) + CMPLT($f10, $f22), $f18 + addq X, INCX, X + + fcmovne $f19, $f27, $f15 + LD $f27, 0 * SIZE(X) + CMPLT($f11, $f23), $f19 + lda $1, -1($1) # i -- + + addq X, INCX, X + unop + unop + bgt $1,$L12 + .align 4 + +$L13: + fcmovne $f16, $f20, $f0 + CMPLT($f12, $f24), $f16 + + fcmovne $f17, $f21, $f1 + CMPLT($f13, $f25), $f17 + + 
fcmovne $f18, $f22, $f10 + CMPLT($f14, $f26), $f18 + + fcmovne $f19, $f23, $f11 + CMPLT($f15, $f27), $f19 + + fcmovne $f16, $f24, $f12 + CMPLT($f0, $f1), $f16 + fcmovne $f17, $f25, $f13 + CMPLT($f10, $f11), $f17 + + fcmovne $f18, $f26, $f14 + CMPLT($f12, $f13), $f18 + fcmovne $f19, $f27, $f15 + CMPLT($f14, $f15), $f19 + + fcmovne $f16, $f1, $f0 + fcmovne $f17, $f11, $f10 + fcmovne $f18, $f13, $f12 + fcmovne $f19, $f15, $f14 + + CMPLT($f0, $f10), $f16 + CMPLT($f12, $f14), $f17 + + fcmovne $f16, $f10, $f0 + fcmovne $f17, $f14, $f12 + + CMPLT($f0, $f12), $f16 + fcmovne $f16, $f12, $f0 + .align 4 + +$L15: + and N, 7, $1 + unop + unop + ble $1, $L20 + .align 4 + +$L16: + LD $f20, 0 * SIZE(X) + addq X, INCX, X + + CMPLT($f0, $f20), $f16 + fcmovne $f16, $f20, $f0 + lda $1, -1($1) # i -- + bgt $1, $L16 + .align 4 + +$L20: + sra N, 3, $1 + ble $1, $L40 + .align 4 + + LD $f10, 0 * SIZE(XX) + addq XX, INCX, XX + LD $f11, 0 * SIZE(XX) + addq XX, INCX, XX + + LD $f12, 0 * SIZE(XX) + addq XX, INCX, XX + LD $f13, 0 * SIZE(XX) + addq XX, INCX, XX + + LD $f14, 0 * SIZE(XX) + addq XX, INCX, XX + LD $f15, 0 * SIZE(XX) + addq XX, INCX, XX + + LD $f16, 0 * SIZE(XX) + addq XX, INCX, XX + LD $f17, 0 * SIZE(XX) + addq XX, INCX, XX + + cmpteq $f0, $f10, $f20 + cmpteq $f0, $f11, $f21 + cmpteq $f0, $f12, $f22 + cmpteq $f0, $f13, $f23 + + lda $1, -1($1) + ble $1, $L23 + .align 4 + +$L22: + LD $f10, 0 * SIZE(XX) + cmpteq $f0, $f14, $f24 + lda $0, 1($0) + addq XX, INCX, XX + fbne $f20, $End + + LD $f11, 0 * SIZE(XX) + cmpteq $f0, $f15, $f25 + lda $0, 1($0) + addq XX, INCX, XX + fbne $f21, $End + + LD $f12, 0 * SIZE(XX) + cmpteq $f0, $f16, $f26 + lda $0, 1($0) + addq XX, INCX, XX + fbne $f22, $End + + LD $f13, 0 * SIZE(XX) + cmpteq $f0, $f17, $f27 + lda $0, 1($0) + addq XX, INCX, XX + fbne $f23, $End + + LD $f14, 0 * SIZE(XX) + cmpteq $f0, $f10, $f20 + lda $0, 1($0) + addq XX, INCX, XX + fbne $f24, $End + + LD $f15, 0 * SIZE(XX) + cmpteq $f0, $f11, $f21 + lda $0, 1($0) + addq XX, INCX, XX + fbne $f25, $End + + LD $f16, 0 * SIZE(XX) + lda $1, -1($1) # i -- + cmpteq $f0, $f12, $f22 + lda $0, 1($0) + addq XX, INCX, XX + fbne $f26, $End + + LD $f17, 0 * SIZE(XX) + cmpteq $f0, $f13, $f23 + lda $0, 1($0) + addq XX, INCX, XX + fbne $f27, $End + + bgt $1, $L22 + .align 4 + +$L23: + lda $0, 1($0) + cmpteq $f0, $f14, $f24 + unop + fbne $f20, $End + + lda $0, 1($0) + cmpteq $f0, $f15, $f25 + unop + fbne $f21, $End + + lda $0, 1($0) + cmpteq $f0, $f16, $f26 + unop + fbne $f22, $End + + lda $0, 1($0) + cmpteq $f0, $f17, $f27 + unop + fbne $f23, $End + + lda $0, 1($0) + fbne $f24, $End + lda $0, 1($0) + fbne $f25, $End + lda $0, 1($0) + fbne $f26, $End + lda $0, 1($0) + fbne $f27, $End + .align 4 + +$L40: + LD $f20, 0 * SIZE(XX) + addq XX, INCX, XX + + cmpteq $f0, $f20, $f29 + + lda $0, 1($0) + fbne $f29, $End + br $31, $L40 + .align 4 + +$End: + ret + + EPILOGUE diff --git a/kernel/alpha/izamax.S b/kernel/alpha/izamax.S new file mode 100644 index 0000000000..2269b12cc0 --- /dev/null +++ b/kernel/alpha/izamax.S @@ -0,0 +1,427 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define N $16 +#define X $17 +#define INCX $18 +#define XX $19 + +#ifndef USE_MIN +#define CMPLT(a, b) cmptlt a, b +#else +#define CMPLT(a, b) cmptlt b, a +#endif + +#define STACKSIZE 8 * 8 + + PROLOGUE + PROFCODE + + lda $sp, -STACKSIZE($sp) + + stt $f2, 0($sp) + fclr $f16 + cmplt $31, N, $2 + unop + + stt $f3, 8($sp) + fclr $f17 + cmplt $31, INCX, $3 + unop + + stt $f4, 16($sp) + fclr $f18 + SXADDQ INCX, $31, INCX + unop + + stt $f5, 24($sp) + fclr $f19 + and $2, $3, $2 + clr $0 + + stt $f6, 32($sp) + mov X, XX + + stt $f7, 40($sp) + stt $f8, 48($sp) + stt $f9, 56($sp) + + fclr $f0 + beq $2, $End # if (n <= 0) or (incx <= 0) return + .align 4 + + LD $f20, 0 * SIZE(X) + LD $f21, 1 * SIZE(X) + sra N, 2, $1 + addq INCX, INCX, INCX + + fabs $f20, $f20 + fabs $f21, $f21 + addt $f20, $f21, $f0 + ble $1, $L15 + .align 4 + + lda $1, -1($1) + unop + addq X, INCX, X + unop + + LD $f22, 0 * SIZE(X) + fmov $f0, $f1 + LD $f23, 1 * SIZE(X) + addq X, INCX, X + + LD $f24, 0 * SIZE(X) + fmov $f0, $f2 + LD $f25, 1 * SIZE(X) + addq X, INCX, X + + LD $f26, 0 * SIZE(X) + fmov $f0, $f3 + LD $f27, 1 * SIZE(X) + addq X, INCX, X + + fabs $f20, $f8 + fabs $f21, $f9 + fabs $f22, $f10 + fabs $f23, $f11 + + fabs $f24, $f12 + fabs $f25, $f13 + fabs $f26, $f14 + fabs $f27, $f15 + + ble $1, $L14 + .align 4 + + LD $f20, 0 * SIZE(X) + LD $f21, 1 * SIZE(X) + lda $1, -1($1) + addq X, INCX, X + + LD $f22, 0 * SIZE(X) + LD $f23, 1 * SIZE(X) + unop + addq X, INCX, X + + LD $f24, 0 * SIZE(X) + LD $f25, 1 * SIZE(X) + unop + addq X, INCX, X + + LD $f26, 0 * SIZE(X) + LD $f27, 1 * SIZE(X) + addq X, INCX, X + ble $1, $L13 + .align 4 + +$L12: + addt $f8, $f9, $f16 + unop + fabs $f20, $f8 + ldl $31, 64 * SIZE(X) + + addt $f10, $f11, $f17 + unop + fabs $f21, $f9 + LD $f20, 0 * SIZE(X) + + addt $f12, $f13, $f18 + LD $f21, 1 * SIZE(X) + fabs $f22, $f10 + addq X, INCX, X + + addt $f14, $f15, $f19 + LD $f22, 0 * SIZE(X) + fabs $f23, $f11 + unop + + CMPLT($f0, $f16), $f4 + LD $f23, 1 * SIZE(X) + fabs $f24, $f12 + addq X, INCX, X + + CMPLT($f1, $f17), $f5 + LD $f24, 0 
* SIZE(X) + fabs $f25, $f13 + unop + + CMPLT($f2, $f18), $f6 + LD $f25, 1 * SIZE(X) + fabs $f26, $f14 + addq X, INCX, X + + CMPLT($f3, $f19), $f7 + LD $f26, 0 * SIZE(X) + fabs $f27, $f15 + unop + + fcmovne $f4, $f16, $f0 + LD $f27, 1 * SIZE(X) + addq X, INCX, X + lda $1, -1($1) # i -- + + fcmovne $f5, $f17, $f1 + fcmovne $f6, $f18, $f2 + fcmovne $f7, $f19, $f3 + bgt $1,$L12 + .align 4 + +$L13: + addt $f8, $f9, $f16 + fabs $f20, $f8 + + addt $f10, $f11, $f17 + fabs $f21, $f9 + + addt $f12, $f13, $f18 + fabs $f22, $f10 + + addt $f14, $f15, $f19 + fabs $f23, $f11 + + CMPLT($f0, $f16), $f4 + fabs $f24, $f12 + + CMPLT($f1, $f17), $f5 + fabs $f25, $f13 + + CMPLT($f2, $f18), $f6 + fabs $f26, $f14 + CMPLT($f3, $f19), $f7 + fabs $f27, $f15 + + fcmovne $f4, $f16, $f0 + fcmovne $f5, $f17, $f1 + fcmovne $f6, $f18, $f2 + fcmovne $f7, $f19, $f3 + .align 4 + +$L14: + addt $f8, $f9, $f16 + addt $f10, $f11, $f17 + addt $f12, $f13, $f18 + addt $f14, $f15, $f19 + + CMPLT($f0, $f16), $f4 + CMPLT($f1, $f17), $f5 + CMPLT($f2, $f18), $f6 + CMPLT($f3, $f19), $f7 + + fcmovne $f4, $f16, $f0 + fcmovne $f5, $f17, $f1 + fcmovne $f6, $f18, $f2 + fcmovne $f7, $f19, $f3 + + CMPLT($f0, $f1), $f16 + CMPLT($f2, $f3), $f17 + + fcmovne $f16, $f1, $f0 + fcmovne $f17, $f3, $f2 + + CMPLT($f0, $f2), $f16 + fcmovne $f16, $f2, $f0 + .align 4 + +$L15: + and N, 3, $1 + unop + unop + ble $1, $L20 + .align 4 + +$L16: + LD $f20, 0 * SIZE(X) + LD $f21, 1 * SIZE(X) + unop + addq X, INCX, X + + fabs $f20, $f29 + fabs $f21, $f30 + addt $f29, $f30, $f29 + + CMPLT($f0, $f29), $f16 + fcmovne $f16, $f29, $f0 + + lda $1, -1($1) # i -- + bgt $1, $L16 + .align 4 + +$L20: + sra N, 2, $1 + ble $1, $L40 + .align 4 + + LD $f10, 0 * SIZE(XX) + LD $f11, 1 * SIZE(XX) + addq XX, INCX, XX + + LD $f12, 0 * SIZE(XX) + LD $f13, 1 * SIZE(XX) + addq XX, INCX, XX + + LD $f14, 0 * SIZE(XX) + LD $f15, 1 * SIZE(XX) + addq XX, INCX, XX + + LD $f16, 0 * SIZE(XX) + LD $f17, 1 * SIZE(XX) + addq XX, INCX, XX + + fabs $f10, $f18 + fabs $f11, $f19 + fabs $f12, $f20 + fabs $f13, $f21 + + lda $1, -1($1) + ble $1, $L23 + .align 4 + +$L22: + LD $f10, 0 * SIZE(XX) + fabs $f14, $f22 + LD $f11, 1 * SIZE(XX) + addq XX, INCX, XX + + LD $f12, 0 * SIZE(XX) + fabs $f15, $f23 + LD $f13, 1 * SIZE(XX) + addq XX, INCX, XX + + LD $f14, 0 * SIZE(XX) + fabs $f16, $f24 + LD $f15, 1 * SIZE(XX) + addq XX, INCX, XX + + LD $f16, 0 * SIZE(XX) + fabs $f17, $f25 + LD $f17, 1 * SIZE(XX) + addq XX, INCX, XX + + addt $f18, $f19, $f4 + addt $f20, $f21, $f5 + addt $f22, $f23, $f6 + addt $f24, $f25, $f7 + + cmpteq $f0, $f4, $f26 + cmpteq $f0, $f5, $f27 + cmpteq $f0, $f6, $f28 + cmpteq $f0, $f7, $f29 + + fabs $f10, $f18 + lda $0, 1($0) + lda $1, -1($1) # i -- + fbne $f26, $End + + fabs $f11, $f19 + lda $0, 1($0) + unop + fbne $f27, $End + + fabs $f12, $f20 + lda $0, 1($0) + unop + fbne $f28, $End + + fabs $f13, $f21 + lda $0, 1($0) + fbne $f29, $End + bgt $1, $L22 + .align 4 + +$L23: + fabs $f14, $f22 + fabs $f15, $f23 + fabs $f16, $f24 + fabs $f17, $f25 + + addt $f18, $f19, $f4 + addt $f20, $f21, $f5 + addt $f22, $f23, $f6 + addt $f24, $f25, $f7 + + cmpteq $f0, $f4, $f26 + cmpteq $f0, $f5, $f27 + cmpteq $f0, $f6, $f28 + cmpteq $f0, $f7, $f29 + + lda $0, 1($0) + fbne $f26, $End + lda $0, 1($0) + fbne $f27, $End + lda $0, 1($0) + fbne $f28, $End + lda $0, 1($0) + fbne $f29, $End + .align 4 + +$L40: + LD $f10, 0 * SIZE(XX) + LD $f11, 1 * SIZE(XX) + + addq XX, INCX, XX + + fabs $f10, $f18 + fabs $f11, $f19 + + addt $f18, $f19, $f18 + cmpteq $f0, $f18, $f2 + + lda $0, 1($0) + fbne $f2, $End + br $31, $L40 + 
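The complex variant follows the same max-then-index structure, but measures each element by |Re| + |Im| (the usual BLAS icamax/izamax convention) rather than the true modulus, which is what the paired fabs/addt sequences compute; INCX is doubled up front because each complex element occupies two reals. A hedged C sketch, with the function name and plain-C form assumed for illustration:

    #include <math.h>

    /* Illustrative izamax reference: magnitude is |Re| + |Im|. */
    static long izamax_ref(long n, const double *x, long incx)
    {
        if (n <= 0 || incx <= 0) return 0;
        double m = fabs(x[0]) + fabs(x[1]);
        for (long i = 1; i < n; i++) {               /* pass 1: max |Re|+|Im| */
            double v = fabs(x[2 * i * incx]) + fabs(x[2 * i * incx + 1]);
            if (m < v) m = v;
        }
        for (long i = 0; i < n; i++)                 /* pass 2: first matching index */
            if (fabs(x[2 * i * incx]) + fabs(x[2 * i * incx + 1]) == m)
                return i + 1;
        return n;                                    /* not reached */
    }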
.align 4 + +$End: + ldt $f2, 0($sp) + ldt $f3, 8($sp) + ldt $f4, 16($sp) + ldt $f5, 24($sp) + + ldt $f6, 32($sp) + ldt $f7, 40($sp) + ldt $f8, 48($sp) + ldt $f9, 56($sp) + lda $sp, STACKSIZE($sp) + ret + + EPILOGUE diff --git a/kernel/alpha/lsame.S b/kernel/alpha/lsame.S new file mode 100644 index 0000000000..082f790829 --- /dev/null +++ b/kernel/alpha/lsame.S @@ -0,0 +1,76 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "version.h" + + .set noat + .set noreorder +.text + .align 5 + .globl lsame_ + .ent lsame_ +lsame_: + .frame $sp,0,$26,0 +#ifdef PROFILE + ldgp $gp, 0($27) + lda $28, _mcount + jsr $28, ($28), _mcount + .prologue 1 +#else + .prologue 0 +#endif + + ldq_u $5, 0($16) + ldq_u $6, 0($17) + extbl $5, $16, $5 + extbl $6, $17, $6 + + subl $5, 96, $1 + subl $6, 96, $2 + subl $5, 32, $3 + subl $6, 32, $4 + + cmovgt $1, $3, $5 + cmovgt $2, $4, $6 + cmpeq $5, $6, $0 + .align 4 + +$End: + ret + .end lsame_ + .ident VERSION diff --git a/kernel/alpha/max.S b/kernel/alpha/max.S new file mode 100644 index 0000000000..af1b8fb850 --- /dev/null +++ b/kernel/alpha/max.S @@ -0,0 +1,227 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define N $16 +#define X $17 +#define INCX $18 + +#ifndef USE_MIN +#define CMPLT(a, b) cmptlt a, b +#else +#define CMPLT(a, b) cmptlt b, a +#endif + +#define STACKSIZE 8 * 8 + + PROLOGUE + PROFCODE + .frame $sp, STACKSIZE, $26, 0 + +#ifdef F_INTERFACE + ldl N, 0(N) # n + ldl INCX, 0(INCX) # incx +#endif + lda $sp, -STACKSIZE($sp) + nop + .align 4 + + cmplt $31, N, $2 + cmplt $31, INCX, $3 + SXADDQ INCX, $31, INCX + and $2, $3, $0 + + sra N, 3, $1 + fclr $f0 + unop + beq $0, $End # if (n <= 0) or (incx <= 0) return + .align 4 + + LD $f0, 0 * SIZE(X) + unop + unop + ble $1, $L15 + .align 4 + + fmov $f0, $f1 + addq X, INCX, X + fmov $f0, $f10 + lda $1, -1($1) + + LD $f21, 0 * SIZE(X) + fmov $f0, $f11 + addq X, INCX, X + fmov $f0, $f12 + + LD $f22, 0 * SIZE(X) + fmov $f0, $f13 + addq X, INCX, X + fmov $f0, $f14 + + LD $f23, 0 * SIZE(X) + fmov $f0, $f15 + addq X, INCX, X + fmov $f0, $f20 + + LD $f24, 0 * SIZE(X) + addq X, INCX, X + LD $f25, 0 * SIZE(X) + addq X, INCX, X + LD $f26, 0 * SIZE(X) + addq X, INCX, X + LD $f27, 0 * SIZE(X) + addq X, INCX, X + + CMPLT($f0, $f20), $f16 + CMPLT($f1, $f21), $f17 + CMPLT($f10, $f22), $f18 + CMPLT($f11, $f23), $f19 + + ble $1, $L13 + .align 4 + +$L12: + fcmovne $f16, $f20, $f0 + LD $f20, 0 * SIZE(X) + CMPLT($f12, $f24), $f16 + addq X, INCX, X + + fcmovne $f17, $f21, $f1 + LD $f21, 0 * SIZE(X) + CMPLT($f13, $f25), $f17 + addq X, INCX, X + + fcmovne $f18, $f22, $f10 + LD $f22, 0 * SIZE(X) + CMPLT($f14, $f26), $f18 + addq X, INCX, X + + fcmovne $f19, $f23, $f11 + LD $f23, 0 * SIZE(X) + CMPLT($f15, $f27), $f19 + addq X, INCX, X + + fcmovne $f16, $f24, $f12 + LD $f24, 0 * SIZE(X) + CMPLT($f0, $f20), $f16 + addq X, INCX, X + + fcmovne $f17, $f25, $f13 + LD $f25, 0 * SIZE(X) + CMPLT($f1, $f21), $f17 + addq X, INCX, X + + fcmovne $f18, $f26, $f14 + LD $f26, 0 * SIZE(X) + CMPLT($f10, $f22), $f18 + addq X, INCX, X + + fcmovne $f19, $f27, $f15 + LD $f27, 0 * SIZE(X) + CMPLT($f11, $f23), $f19 + lda $1, -1($1) # i -- + + addq X, INCX, X + unop + unop + bgt $1,$L12 + .align 4 + +$L13: + fcmovne $f16, 
$f20, $f0 + CMPLT($f12, $f24), $f16 + + fcmovne $f17, $f21, $f1 + CMPLT($f13, $f25), $f17 + + fcmovne $f18, $f22, $f10 + CMPLT($f14, $f26), $f18 + + fcmovne $f19, $f23, $f11 + CMPLT($f15, $f27), $f19 + + fcmovne $f16, $f24, $f12 + CMPLT($f0, $f1), $f16 + fcmovne $f17, $f25, $f13 + CMPLT($f10, $f11), $f17 + + fcmovne $f18, $f26, $f14 + CMPLT($f12, $f13), $f18 + fcmovne $f19, $f27, $f15 + CMPLT($f14, $f15), $f19 + + fcmovne $f16, $f1, $f0 + fcmovne $f17, $f11, $f10 + fcmovne $f18, $f13, $f12 + fcmovne $f19, $f15, $f14 + + CMPLT($f0, $f10), $f16 + CMPLT($f12, $f14), $f17 + + fcmovne $f16, $f10, $f0 + fcmovne $f17, $f14, $f12 + + CMPLT($f0, $f12), $f16 + fcmovne $f16, $f12, $f0 + .align 4 + +$L15: + and N, 7, $1 + unop + unop + ble $1, $End + .align 4 + +$L16: + LD $f20, 0 * SIZE(X) + addq X, INCX, X + + CMPLT($f0, $f20), $f16 + fcmovne $f16, $f20, $f0 + lda $1, -1($1) # i -- + bgt $1, $L16 + .align 4 + +$End: + lda $sp, STACKSIZE($sp) + ret + + EPILOGUE diff --git a/kernel/alpha/rot.S b/kernel/alpha/rot.S new file mode 100644 index 0000000000..d1656d7e3e --- /dev/null +++ b/kernel/alpha/rot.S @@ -0,0 +1,624 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
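The max kernel that ends above reduces the vector to its largest value (no absolute value is taken, unlike the amax family), again with an 8-way unrolled CMPLT/fcmovne reduction and a scalar tail, and again -DUSE_MIN flips the comparison to produce the minimum. The equivalent scalar loop, as an illustrative sketch with an assumed name:

    /* Illustrative reference for the value-max kernel; USE_MIN yields the min. */
    static double max_ref(long n, const double *x, long incx)
    {
        if (n <= 0 || incx <= 0) return 0.0;         /* kernel also returns 0.0 here */
        double m = x[0];
        for (long i = 1; i < n; i++)
            if (m < x[i * incx]) m = x[i * incx];
        return m;
    }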
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define N $16 +#define X $17 +#define INCX $18 +#define Y $19 +#define INCY $20 +#define I $21 +#define XX $23 +#define YY $24 + +#define C $f10 +#define S $f11 + +#define PREFETCH_SIZE 80 + + PROLOGUE + PROFCODE + .frame $sp, 0, $26, 0 + +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + fmov $f21, C + LD S, 0($sp) + + cmpeq INCX, 1, $23 + cmpeq INCY, 1, $24 + ble N, $L998 + + and $23, $24, $23 + beq $23, $L50 + + sra N, 3, I + ble I, $L15 + + LD $f12, 0*SIZE(X) + LD $f13, 0*SIZE(Y) + LD $f14, 1*SIZE(X) + LD $f15, 1*SIZE(Y) + + LD $f16, 2*SIZE(X) + LD $f17, 2*SIZE(Y) + LD $f18, 3*SIZE(X) + LD $f19, 3*SIZE(Y) + + MUL C, $f12, $f21 + unop + MUL S, $f13, $f22 + MUL C, $f13, $f23 + + LD $f13, 4*SIZE(Y) + MUL S, $f12, $f24 + LD $f12, 4*SIZE(X) + MUL C, $f14, $f25 + + lda I, -1(I) + MUL S, $f15, $f26 + ADD $f21, $f22, $f22 + MUL C, $f15, $f27 + + LD $f15, 5*SIZE(Y) + MUL S, $f14, $f28 + SUB $f23, $f24, $f24 + ble I, $L13 + .align 4 + +$L12: + MUL C, $f16, $f21 + lds $f31, (PREFETCH_SIZE) * SIZE(X) + unop + LD $f14, 5*SIZE(X) + + ST $f22, 0*SIZE(X) + MUL S, $f17, $f22 + unop + ADD $f25, $f26, $f26 + + MUL C, $f17, $f23 + lds $f31, (PREFETCH_SIZE) * SIZE(Y) + unop + LD $f17, 6*SIZE(Y) + + ST $f24, 0*SIZE(Y) + MUL S, $f16, $f24 + unop + SUB $f27, $f28, $f28 + + MUL C, $f18, $f25 + LD $f16, 6*SIZE(X) + unop + unop + + ST $f26, 1*SIZE(X) + MUL S, $f19, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f19, $f27 + unop + unop + LD $f19, 7*SIZE(Y) + + ST $f28, 1*SIZE(Y) + MUL S, $f18, $f28 + unop + SUB $f23, $f24, $f24 + + MUL C, $f12, $f21 + LD $f18, 7*SIZE(X) + unop + unop + + ST $f22, 2*SIZE(X) + unop + MUL S, $f13, $f22 + ADD $f25, $f26, $f26 + + MUL C, $f13, $f23 + LD $f13, 8*SIZE(Y) + unop + unop + + ST $f24, 2*SIZE(Y) + MUL S, $f12, $f24 + unop + SUB $f27, $f28, $f28 + + MUL C, $f14, $f25 + LD $f12, 8*SIZE(X) + unop + unop + + ST $f26, 3*SIZE(X) + MUL S, $f15, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f15, $f27 + LD $f15, 9*SIZE(Y) + unop + unop + + ST $f28, 3*SIZE(Y) + MUL S, $f14, $f28 + unop + SUB $f23, $f24, $f24 + + MUL C, $f16, $f21 + LD $f14, 9*SIZE(X) + unop + unop + + ST $f22, 4*SIZE(X) + MUL S, $f17, $f22 + unop + ADD $f25, $f26, $f26 + + MUL C, $f17, $f23 + LD $f17, 10*SIZE(Y) + unop + unop + + ST $f24, 4*SIZE(Y) + MUL S, $f16, $f24 + unop + SUB $f27, $f28, $f28 + + MUL C, $f18, $f25 + LD $f16, 10*SIZE(X) + unop + unop + + ST $f26, 5*SIZE(X) + MUL S, $f19, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f19, $f27 + LD $f19, 11*SIZE(Y) + unop + unop + + ST $f28, 5*SIZE(Y) + MUL S, $f18, $f28 + lda I, -1(I) + SUB $f23, $f24, $f24 + + MUL C, $f12, $f21 + LD $f18, 11*SIZE(X) + unop + unop + + ST $f22, 6*SIZE(X) + MUL S, $f13, $f22 + unop + ADD $f25, $f26, $f26 + + MUL C, $f13, $f23 + LD $f13, 12*SIZE(Y) + lda X, 8*SIZE(X) + unop + + ST $f24, 6*SIZE(Y) + MUL S, $f12, $f24 + unop + SUB $f27, $f28, $f28 + + MUL C, $f14, $f25 + LD $f12, 4*SIZE(X) + lda Y, 8*SIZE(Y) + unop + + ST $f26, -1*SIZE(X) + MUL S, $f15, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f15, $f27 + LD $f15, 5*SIZE(Y) + unop + unop + + ST $f28, -1*SIZE(Y) + MUL S, $f14, $f28 + SUB $f23, $f24, $f24 + bgt I, $L12 + .align 4 + +$L13: + MUL C, $f16, $f21 + LD $f14, 5*SIZE(X) + unop + unop + + ST $f22, 0*SIZE(X) + MUL S, $f17, $f22 + unop + ADD $f25, $f26, $f26 + + MUL C, $f17, $f23 + unop + unop + LD $f17, 6*SIZE(Y) + + ST $f24, 0*SIZE(Y) + MUL S, $f16, $f24 + LD $f16, 
6*SIZE(X) + SUB $f27, $f28, $f28 + + MUL C, $f18, $f25 + unop + unop + unop + + ST $f26, 1*SIZE(X) + MUL S, $f19, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f19, $f27 + unop + unop + LD $f19, 7*SIZE(Y) + + ST $f28, 1*SIZE(Y) + MUL S, $f18, $f28 + LD $f18, 7*SIZE(X) + SUB $f23, $f24, $f24 + + MUL C, $f12, $f21 + unop + unop + unop + + ST $f22, 2*SIZE(X) + unop + MUL S, $f13, $f22 + ADD $f25, $f26, $f26 + + MUL C, $f13, $f23 + unop + unop + unop + + ST $f24, 2*SIZE(Y) + MUL S, $f12, $f24 + unop + SUB $f27, $f28, $f28 + + MUL C, $f14, $f25 + unop + unop + unop + + ST $f26, 3*SIZE(X) + MUL S, $f15, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f15, $f27 + unop + unop + unop + + ST $f28, 3*SIZE(Y) + MUL S, $f14, $f28 + unop + SUB $f23, $f24, $f24 + + MUL C, $f16, $f21 + unop + unop + unop + + ST $f22, 4*SIZE(X) + MUL S, $f17, $f22 + unop + ADD $f25, $f26, $f26 + + MUL C, $f17, $f23 + unop + unop + unop + + ST $f24, 4*SIZE(Y) + MUL S, $f16, $f24 + unop + SUB $f27, $f28, $f28 + + MUL C, $f18, $f25 + unop + unop + unop + + ST $f26, 5*SIZE(X) + MUL S, $f19, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f19, $f27 + unop + unop + unop + + ST $f28, 5*SIZE(Y) + MUL S, $f18, $f28 + unop + SUB $f23, $f24, $f24 + + ST $f22, 6*SIZE(X) + ADD $f25, $f26, $f26 + ST $f24, 6*SIZE(Y) + SUB $f27, $f28, $f28 + + ST $f26, 7*SIZE(X) + lda X, 8*SIZE(X) + ST $f28, 7*SIZE(Y) + lda Y, 8*SIZE(Y) + .align 4 + + +$L15: + and N, 7, I + ble I, $L998 + .align 4 + +$L16: + LD $f12, 0*SIZE(X) + LD $f13, 0*SIZE(Y) + + MUL C, $f12, $f21 + MUL S, $f13, $f22 + MUL C, $f13, $f23 + MUL S, $f12, $f24 + + ADD $f21, $f22, $f25 + SUB $f23, $f24, $f26 + lda I, -1(I) + + ST $f25, 0*SIZE(X) + lda X, 1 * SIZE(X) + ST $f26, 0*SIZE(Y) + lda Y, 1 * SIZE(Y) + + bgt I, $L16 + .align 4 + +$L998: + clr $0 + ret + .align 4 + +$L50: + mov X, XX + mov Y, YY + + sra N, 3, I + ble I, $L55 + .align 4 + +$L51: + LD $f12, 0*SIZE(X) + SXADDQ INCX, X, X + LD $f13, 0*SIZE(Y) + SXADDQ INCY, Y, Y + + LD $f14, 0*SIZE(X) + SXADDQ INCX, X, X + LD $f15, 0*SIZE(Y) + SXADDQ INCY, Y, Y + + LD $f16, 0*SIZE(X) + SXADDQ INCX, X, X + LD $f17, 0*SIZE(Y) + SXADDQ INCY, Y, Y + + LD $f18, 0*SIZE(X) + SXADDQ INCX, X, X + LD $f19, 0*SIZE(Y) + SXADDQ INCY, Y, Y + + MUL C, $f12, $f21 + MUL S, $f13, $f22 + MUL C, $f13, $f23 + MUL S, $f12, $f24 + + ADD $f21, $f22, $f22 + SUB $f23, $f24, $f24 + + ST $f22, 0*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f24, 0*SIZE(YY) + SXADDQ INCY, YY, YY + + MUL C, $f14, $f25 + MUL S, $f15, $f26 + MUL C, $f15, $f27 + MUL S, $f14, $f28 + + ADD $f25, $f26, $f26 + SUB $f27, $f28, $f28 + + ST $f26, 0*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f28, 0*SIZE(YY) + SXADDQ INCY, YY, YY + + MUL C, $f16, $f21 + MUL S, $f17, $f22 + MUL C, $f17, $f23 + MUL S, $f16, $f24 + + ADD $f21, $f22, $f22 + SUB $f23, $f24, $f24 + + ST $f22, 0*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f24, 0*SIZE(YY) + SXADDQ INCY, YY, YY + + MUL C, $f18, $f25 + MUL S, $f19, $f26 + MUL C, $f19, $f27 + MUL S, $f18, $f28 + + ADD $f25, $f26, $f26 + SUB $f27, $f28, $f28 + + ST $f26, 0*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f28, 0*SIZE(YY) + SXADDQ INCY, YY, YY + + + LD $f12, 0*SIZE(X) + SXADDQ INCX, X, X + LD $f13, 0*SIZE(Y) + SXADDQ INCY, Y, Y + + LD $f14, 0*SIZE(X) + SXADDQ INCX, X, X + LD $f15, 0*SIZE(Y) + SXADDQ INCY, Y, Y + + LD $f16, 0*SIZE(X) + SXADDQ INCX, X, X + LD $f17, 0*SIZE(Y) + SXADDQ INCY, Y, Y + + LD $f18, 0*SIZE(X) + SXADDQ INCX, X, X + LD $f19, 0*SIZE(Y) + SXADDQ INCY, Y, Y + + MUL C, $f12, $f21 + MUL S, $f13, $f22 + MUL C, $f13, $f23 + MUL S, $f12, $f24 + + ADD $f21, $f22, $f22 + SUB $f23, 
$f24, $f24 + + ST $f22, 0*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f24, 0*SIZE(YY) + SXADDQ INCY, YY, YY + + MUL C, $f14, $f25 + MUL S, $f15, $f26 + MUL C, $f15, $f27 + MUL S, $f14, $f28 + + ADD $f25, $f26, $f26 + SUB $f27, $f28, $f28 + + ST $f26, 0*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f28, 0*SIZE(YY) + SXADDQ INCY, YY, YY + + MUL C, $f16, $f21 + MUL S, $f17, $f22 + MUL C, $f17, $f23 + MUL S, $f16, $f24 + + ADD $f21, $f22, $f22 + SUB $f23, $f24, $f24 + + ST $f22, 0*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f24, 0*SIZE(YY) + SXADDQ INCY, YY, YY + + MUL C, $f18, $f25 + MUL S, $f19, $f26 + MUL C, $f19, $f27 + MUL S, $f18, $f28 + + ADD $f25, $f26, $f26 + SUB $f27, $f28, $f28 + + ST $f26, 0*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f28, 0*SIZE(YY) + SXADDQ INCY, YY, YY + + lda I, -1(I) + bgt I, $L51 + .align 4 + +$L55: + and N, 7, I + ble I, $L999 + .align 4 + +$L56: + LD $f12, 0*SIZE(X) + LD $f13, 0*SIZE(Y) + + MUL C, $f12, $f21 + MUL S, $f13, $f22 + MUL C, $f13, $f23 + MUL S, $f12, $f24 + + ADD $f21, $f22, $f25 + SUB $f23, $f24, $f26 + lda I, -1(I) + + ST $f25, 0*SIZE(X) + SXADDQ INCX, X, X + ST $f26, 0*SIZE(Y) + SXADDQ INCY, Y, Y + + bgt I, $L56 + .align 4 + +$L999: + clr $0 + ret + EPILOGUE diff --git a/kernel/alpha/scal.S b/kernel/alpha/scal.S new file mode 100644 index 0000000000..2d95801c83 --- /dev/null +++ b/kernel/alpha/scal.S @@ -0,0 +1,480 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
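The rot kernel above applies a plane (Givens) rotation to the pair of vectors: each element pair is replaced by x' = c*x + s*y and y' = c*y - s*x, exactly the MUL/ADD/SUB pattern in both the unit-stride fast path and the SXADDQ-based strided path (s arrives on the stack as the seventh floating-point argument). A minimal C sketch of the same update; the function name and plain-C form are assumptions:

    /* Illustrative reference for the plane-rotation kernel (BLAS *ROT). */
    static void rot_ref(long n, double *x, long incx,
                        double *y, long incy, double c, double s)
    {
        for (long i = 0; i < n; i++) {
            double xi = x[i * incx], yi = y[i * incy];
            x[i * incx] = c * xi + s * yi;           /* x' = c*x + s*y */
            y[i * incy] = c * yi - s * xi;           /* y' = c*y - s*x */
        }
    }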
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define PREFETCHSIZE 88 + +#define N $16 +#define X $20 +#define INCX $21 + +#define XX $18 +#define I $19 + +#define ALPHA $f19 + +#define s0 $f0 +#define s1 $f1 +#define s2 $f10 +#define s3 $f11 + +#define a0 $f12 +#define a1 $f13 +#define a2 $f14 +#define a3 $f15 +#define a4 $f16 +#define a5 $f17 +#define a6 $f18 +#define a7 $f21 + +#define t0 $f22 +#define t1 $f23 +#define t2 $f24 +#define t3 $f25 + + PROLOGUE + PROFCODE + + mov X, XX + ble N, $L999 + + cmpeq INCX, 1, $0 + beq $0, $L20 + +#ifndef DOUBLE + sra N, 4, I + ble I, $L15 + + LD a0, 0 * SIZE(X) + LD a1, 1 * SIZE(X) + LD a2, 2 * SIZE(X) + LD a3, 3 * SIZE(X) + + LD a4, 4 * SIZE(X) + MUL a0, ALPHA, t0 + LD a5, 5 * SIZE(X) + MUL a1, ALPHA, t1 + LD a6, 6 * SIZE(X) + MUL a2, ALPHA, t2 + LD a7, 7 * SIZE(X) + MUL a3, ALPHA, t3 + + ST t0, 0 * SIZE(X) + MUL a4, ALPHA, t0 + ST t1, 1 * SIZE(X) + MUL a5, ALPHA, t1 + + ST t2, 2 * SIZE(X) + MUL a6, ALPHA, t2 + ST t3, 3 * SIZE(X) + MUL a7, ALPHA, t3 + + LD a0, 8 * SIZE(X) + LD a1, 9 * SIZE(X) + LD a2, 10 * SIZE(X) + LD a3, 11 * SIZE(X) + + ST t0, 4 * SIZE(X) + MUL a0, ALPHA, t0 + ST t1, 5 * SIZE(X) + MUL a1, ALPHA, t1 + + ST t2, 6 * SIZE(X) + MUL a2, ALPHA, t2 + ST t3, 7 * SIZE(X) + MUL a3, ALPHA, t3 + + LD a4, 12 * SIZE(X) + LD a5, 13 * SIZE(X) + LD a6, 14 * SIZE(X) + LD a7, 15 * SIZE(X) + + lda I, -1(I) + ble I, $L13 + .align 4 + +$L12: + ST t0, 8 * SIZE(X) + MUL a4, ALPHA, t0 + ST t1, 9 * SIZE(X) + MUL a5, ALPHA, t1 + + ST t2, 10 * SIZE(X) + MUL a6, ALPHA, t2 + ST t3, 11 * SIZE(X) + MUL a7, ALPHA, t3 + + LD a0, 16 * SIZE(X) + LD a1, 17 * SIZE(X) + LD a2, 18 * SIZE(X) + LD a3, 19 * SIZE(X) + + ST t0, 12 * SIZE(X) + MUL a0, ALPHA, t0 + ST t1, 13 * SIZE(X) + MUL a1, ALPHA, t1 + + ST t2, 14 * SIZE(X) + MUL a2, ALPHA, t2 + ST t3, 15 * SIZE(X) + MUL a3, ALPHA, t3 + + LD a4, 20 * SIZE(X) + LD a5, 21 * SIZE(X) + LD a6, 22 * SIZE(X) + LD a7, 23 * SIZE(X) + + ST t0, 16 * SIZE(X) + MUL a4, ALPHA, t0 + ST t1, 17 * SIZE(X) + MUL a5, ALPHA, t1 + + ST t2, 18 * SIZE(X) + MUL a6, ALPHA, t2 + ST t3, 19 * SIZE(X) + MUL a7, ALPHA, t3 + + LD a0, 24 * SIZE(X) + LD a1, 25 * SIZE(X) + LD a2, 26 * SIZE(X) + LD a3, 27 * SIZE(X) + + ST t0, 20 * SIZE(X) + MUL a0, ALPHA, t0 + ST t1, 21 * SIZE(X) + MUL a1, ALPHA, t1 + + ST t2, 22 * SIZE(X) + MUL a2, ALPHA, t2 + ST t3, 23 * SIZE(X) + MUL a3, ALPHA, t3 + + LD a4, 28 * SIZE(X) + LD a5, 29 * SIZE(X) + LD a6, 30 * SIZE(X) + LD a7, 31 * SIZE(X) + + lds $f31, PREFETCHSIZE * SIZE(X) + lda I, -1(I) + addq X, 16 * SIZE, X + bne I, $L12 + .align 4 + +$L13: + ST t0, 8 * SIZE(X) + MUL a4, ALPHA, t0 + ST t1, 9 * SIZE(X) + MUL a5, ALPHA, t1 + + ST t2, 10 * SIZE(X) + MUL a6, ALPHA, t2 + ST t3, 11 * SIZE(X) + MUL a7, ALPHA, t3 + + ST t0, 12 * SIZE(X) + ST t1, 13 * SIZE(X) + ST t2, 14 * SIZE(X) + ST t3, 15 * SIZE(X) + addq X, 16 * SIZE, X + .align 4 + +$L15: + and N, 15, I + +#else + + sra N, 3, I + ble I, $L15 + + LD a0, 0 * SIZE(X) + LD a1, 1 * SIZE(X) + LD a2, 2 * SIZE(X) + LD a3, 3 * SIZE(X) + + LD a4, 4 * SIZE(X) + MUL a0, ALPHA, t0 + LD a5, 5 * SIZE(X) + MUL a1, ALPHA, t1 + + LD a6, 6 * SIZE(X) + MUL a2, ALPHA, t2 + LD a7, 7 * SIZE(X) + MUL a3, ALPHA, t3 + + lda I, -1(I) + ble I, $L13 + .align 4 + +$L12: + ST t0, 0 * SIZE(X) + MUL a4, ALPHA, t0 + ST t1, 1 * SIZE(X) + MUL a5, ALPHA, t1 + + ST t2, 2 * SIZE(X) + MUL a6, ALPHA, t2 + ST t3, 3 * SIZE(X) + MUL a7, ALPHA, t3 + + LD a0, 8 * SIZE(X) + lda I, -1(I) + LD a1, 9 * SIZE(X) + addq X, 8 * 
SIZE, X + + LD a2, 2 * SIZE(X) + LD a3, 3 * SIZE(X) + + ST t0, -4 * SIZE(X) + MUL a0, ALPHA, t0 + ST t1, -3 * SIZE(X) + MUL a1, ALPHA, t1 + + ST t2, -2 * SIZE(X) + MUL a2, ALPHA, t2 + ST t3, -1 * SIZE(X) + MUL a3, ALPHA, t3 + + LD a4, 4 * SIZE(X) + LD a5, 5 * SIZE(X) + + LD a6, 6 * SIZE(X) + LD a7, 7 * SIZE(X) + lds $f31, PREFETCHSIZE * SIZE(X) + bne I, $L12 + .align 4 + +$L13: + ST t0, 0 * SIZE(X) + MUL a4, ALPHA, t0 + ST t1, 1 * SIZE(X) + MUL a5, ALPHA, t1 + + ST t2, 2 * SIZE(X) + MUL a6, ALPHA, t2 + ST t3, 3 * SIZE(X) + MUL a7, ALPHA, t3 + + ST t0, 4 * SIZE(X) + ST t1, 5 * SIZE(X) + ST t2, 6 * SIZE(X) + ST t3, 7 * SIZE(X) + addq X, 8 * SIZE, X + .align 4 + +$L15: + and N, 7, I + +#endif + + unop + unop + ble I, $L999 + .align 4 + +$L17: + LD a0, 0 * SIZE(X) + + MUL a0, ALPHA, t0 + + ST t0, 0 * SIZE(X) + + addq X, SIZE, X + + lda I, -1(I) + bne I, $L17 + ret + .align 4 + +$L20: + sra N, 3, I + ble I, $L25 + + LD a0, 0 * SIZE(X) + SXADDQ INCX, X, X + LD a1, 0 * SIZE(X) + SXADDQ INCX, X, X + LD a2, 0 * SIZE(X) + SXADDQ INCX, X, X + LD a3, 0 * SIZE(X) + SXADDQ INCX, X, X + + LD a4, 0 * SIZE(X) + MUL a0, ALPHA, t0 + lda I, -1(I) + SXADDQ INCX, X, X + + LD a5, 0 * SIZE(X) + MUL a1, ALPHA, t1 + SXADDQ INCX, X, X + unop + + LD a6, 0 * SIZE(X) + MUL a2, ALPHA, t2 + SXADDQ INCX, X, X + unop + + LD a7, 0 * SIZE(X) + MUL a3, ALPHA, t3 + SXADDQ INCX, X, X + ble I, $L23 + .align 4 + +$L22: + ST t0, 0 * SIZE(XX) + MUL a4, ALPHA, t0 + lds $f31, PREFETCHSIZE * SIZE(X) + SXADDQ INCX, XX, XX + + LD a0, 0 * SIZE(X) + SXADDQ INCX, X, X + lda I, -1(I) + unop + + ST t1, 0 * SIZE(XX) + MUL a5, ALPHA, t1 + SXADDQ INCX, XX, XX + unop + + LD a1, 0 * SIZE(X) + SXADDQ INCX, X, X + + ST t2, 0 * SIZE(XX) + MUL a6, ALPHA, t2 + SXADDQ INCX, XX, XX + unop + + LD a2, 0 * SIZE(X) + SXADDQ INCX, X, X + + ST t3, 0 * SIZE(XX) + MUL a7, ALPHA, t3 + SXADDQ INCX, XX, XX + unop + + LD a3, 0 * SIZE(X) + SXADDQ INCX, X, X + + ST t0, 0 * SIZE(XX) + MUL a0, ALPHA, t0 + SXADDQ INCX, XX, XX + unop + + LD a4, 0 * SIZE(X) + SXADDQ INCX, X, X + + ST t1, 0 * SIZE(XX) + MUL a1, ALPHA, t1 + SXADDQ INCX, XX, XX + unop + + LD a5, 0 * SIZE(X) + SXADDQ INCX, X, X + + ST t2, 0 * SIZE(XX) + MUL a2, ALPHA, t2 + SXADDQ INCX, XX, XX + unop + + LD a6, 0 * SIZE(X) + SXADDQ INCX, X, X + + ST t3, 0 * SIZE(XX) + MUL a3, ALPHA, t3 + SXADDQ INCX, XX, XX + unop + + LD a7, 0 * SIZE(X) + SXADDQ INCX, X, X + unop + bne I, $L22 + .align 4 + +$L23: + ST t0, 0 * SIZE(XX) + MUL a4, ALPHA, t0 + SXADDQ INCX, XX, XX + + ST t1, 0 * SIZE(XX) + MUL a5, ALPHA, t1 + SXADDQ INCX, XX, XX + + ST t2, 0 * SIZE(XX) + MUL a6, ALPHA, t2 + SXADDQ INCX, XX, XX + + ST t3, 0 * SIZE(XX) + MUL a7, ALPHA, t3 + SXADDQ INCX, XX, XX + + ST t0, 0 * SIZE(XX) + SXADDQ INCX, XX, XX + ST t1, 0 * SIZE(XX) + SXADDQ INCX, XX, XX + ST t2, 0 * SIZE(XX) + SXADDQ INCX, XX, XX + ST t3, 0 * SIZE(XX) + SXADDQ INCX, XX, XX + .align 4 + +$L25: + and N, 7, I + unop + unop + ble I, $L999 + .align 4 + +$L27: + LD a0, 0 * SIZE(X) + + MUL a0, ALPHA, t0 + + ST t0, 0 * SIZE(XX) + + SXADDQ INCX, X, X + SXADDQ INCX, XX, XX + + lda I, -1(I) + bne I, $L27 + .align 4 + +$L999: + ret + EPILOGUE diff --git a/kernel/alpha/snrm2.S b/kernel/alpha/snrm2.S new file mode 100644 index 0000000000..b8ccc75f6e --- /dev/null +++ b/kernel/alpha/snrm2.S @@ -0,0 +1,431 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
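The scal kernel above performs the in-place scaling x := alpha * x. Its unit-stride path is unrolled 16-fold for single precision and 8-fold for double (the #ifndef DOUBLE split), with software prefetch via the dummy lds to $f31, while the strided path reads through X and writes back through XX, both advancing by INCX. The underlying operation is just the following loop; the name and plain-C form are illustrative assumptions:

    /* Illustrative reference for the SCAL kernel: x <- alpha * x. */
    static void scal_ref(long n, double alpha, double *x, long incx)
    {
        for (long i = 0; i < n; i++)
            x[i * incx] *= alpha;
    }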
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#include "version.h" + +#define PREFETCH_SIZE 80 + +#define N $16 +#define X $17 +#define INCX $18 +#define XX $19 + +#define I $0 + +#define a0 $f0 +#define a1 $f1 +#define a2 $f10 +#define a3 $f11 +#define t0 $f12 +#define t1 $f13 +#define t2 $f14 +#define t3 $f15 + +#define x0 $f16 +#define x1 $f17 +#define x2 $f18 +#define x3 $f19 +#define x4 $f20 +#define x5 $f21 +#define x6 $f22 +#define x7 $f23 + + PROLOGUE + +#if defined(EV4) || defined(EV5) + .frame $30,16,$26,0 + .mask 0x4000000,-16 + ldah $29, 0($27) !gpdisp!1 + lda $29, 0($29) !gpdisp!1 + + lda $sp, -16($sp) + ldq $27, sqrt($29) !literal!2 + stq $26, 0($sp) + + PROFCODE + .prologue 1 +#else + PROFCODE +#endif + + fclr a0 + SXADDQ INCX, 0, INCX + fclr a1 + ble N, $L999 + + fclr a2 + cmpeq INCX, SIZE, $0 + fclr a3 + beq $0, $L20 + + fclr t0 + sra N, 4, I + fclr t1 + ble I, $L15 + + fclr t2 + LD x0, 0 * SIZE(X) + fclr t3 + LD x1, 1 * SIZE(X) + + LD x2, 2 * SIZE(X) + LD x3, 3 * SIZE(X) + LD x4, 4 * SIZE(X) + LD x5, 5 * SIZE(X) + LD x6, 6 * SIZE(X) + LD x7, 7 * SIZE(X) + + lda I, -1(I) + ble I, $L12 + .align 4 + +$L11: + addt a0, t0, a0 + ldl $31, (PREFETCH_SIZE) * SIZE(X) + mult x0, x0, t0 + LD x0, 8 * SIZE(X) + + addt a1, t1, a1 + mov X, XX + mult x1, x1, t1 + LD x1, 9 * SIZE(X) + + addt a2, t2, a2 + unop + mult x2, x2, t2 + LD x2, 10 * SIZE(X) + + addt a3, t3, a3 + unop + mult x3, x3, t3 + LD x3, 11 * SIZE(X) + + addt a0, t0, a0 + unop + mult x4, x4, t0 + LD x4, 12 * SIZE(X) + + addt a1, t1, a1 + unop + mult x5, x5, t1 + LD x5, 13 * SIZE(X) + + addt a2, t2, a2 + unop + mult x6, x6, t2 + LD x6, 14 * SIZE(X) + + addt a3, t3, a3 + unop + mult x7, x7, t3 + LD x7, 15 * SIZE(X) + + addt a0, t0, a0 + unop + mult x0, x0, t0 + LD x0, 16 * SIZE(X) + + 
addt a1, t1, a1 + lda X, 16 * SIZE(X) + mult x1, x1, t1 + LD x1, 17 * SIZE(XX) + + addt a2, t2, a2 + unop + mult x2, x2, t2 + LD x2, 18 * SIZE(XX) + + addt a3, t3, a3 + unop + mult x3, x3, t3 + LD x3, 19 * SIZE(XX) + + addt a0, t0, a0 + unop + mult x4, x4, t0 + LD x4, 20 * SIZE(XX) + + addt a1, t1, a1 + lda I, -1(I) + mult x5, x5, t1 + LD x5, 21 * SIZE(XX) + + addt a2, t2, a2 + unop + mult x6, x6, t2 + LD x6, 22 * SIZE(XX) + + addt a3, t3, a3 + mult x7, x7, t3 + LD x7, 23 * SIZE(XX) + bgt I, $L11 + .align 4 + +$L12: + addt a0, t0, a0 + mov X, XX + mult x0, x0, t0 + LD x0, 8 * SIZE(X) + + addt a1, t1, a1 + unop + mult x1, x1, t1 + LD x1, 9 * SIZE(X) + + addt a2, t2, a2 + unop + mult x2, x2, t2 + LD x2, 10 * SIZE(X) + + addt a3, t3, a3 + unop + mult x3, x3, t3 + LD x3, 11 * SIZE(X) + + addt a0, t0, a0 + unop + mult x4, x4, t0 + LD x4, 12 * SIZE(XX) + + addt a1, t1, a1 + unop + mult x5, x5, t1 + LD x5, 13 * SIZE(XX) + + addt a2, t2, a2 + unop + mult x6, x6, t2 + LD x6, 14 * SIZE(XX) + + addt a3, t3, a3 + lda X, 16 * SIZE(X) + mult x7, x7, t3 + LD x7, 15 * SIZE(XX) + + addt a0, t0, a0 + mult x0, x0, t0 + addt a1, t1, a1 + mult x1, x1, t1 + + addt a2, t2, a2 + mult x2, x2, t2 + addt a3, t3, a3 + mult x3, x3, t3 + + addt a0, t0, a0 + mult x4, x4, t0 + addt a1, t1, a1 + mult x5, x5, t1 + + addt a2, t2, a2 + mult x6, x6, t2 + addt a3, t3, a3 + mult x7, x7, t3 + + addt a1, t1, a1 + addt a2, t2, a2 + addt a3, t3, a3 + .align 4 + +$L15: + and N, 15, I + ble I, $L998 + .align 4 + +$L16: + LD x0, 0 * SIZE(X) + lda X, 1 * SIZE(X) + + addt a0, t0, a0 + mult x0, x0, t0 + + lda I, -1(I) + bgt I, $L16 + bsr $31, $L998 + .align 4 + +$L20: + fclr t0 + sra N, 3, I + fclr t1 + ble I, $L25 + + fclr t2 + fclr t3 + + LD x0, 0 * SIZE(X) + addq X, INCX, X + LD x1, 0 * SIZE(X) + addq X, INCX, X + LD x2, 0 * SIZE(X) + addq X, INCX, X + LD x3, 0 * SIZE(X) + addq X, INCX, X + + LD x4, 0 * SIZE(X) + addq X, INCX, X + LD x5, 0 * SIZE(X) + addq X, INCX, X + LD x6, 0 * SIZE(X) + addq X, INCX, X + + lda I, -1(I) + ble I, $L22 + .align 4 + +$L21: + addt a0, t0, a0 + LD x7, 0 * SIZE(X) + mult x0, x0, t0 + addq X, INCX, X + + addt a1, t1, a1 + LD x0, 0 * SIZE(X) + mult x1, x1, t1 + addq X, INCX, X + + addt a2, t2, a2 + LD x1, 0 * SIZE(X) + mult x2, x2, t2 + addq X, INCX, X + + addt a3, t3, a3 + LD x2, 0 * SIZE(X) + mult x3, x3, t3 + addq X, INCX, X + + addt a0, t0, a0 + LD x3, 0 * SIZE(X) + mult x4, x4, t0 + addq X, INCX, X + + addt a1, t1, a1 + LD x4, 0 * SIZE(X) + mult x5, x5, t1 + addq X, INCX, X + + addt a2, t2, a2 + LD x5, 0 * SIZE(X) + mult x6, x6, t2 + addq X, INCX, X + + addt a3, t3, a3 + LD x6, 0 * SIZE(X) + mult x7, x7, t3 + addq X, INCX, X + + lda I, -1(I) + bgt I, $L21 + .align 4 + +$L22: + addt a0, t0, a0 + LD x7, 0 * SIZE(X) + mult x0, x0, t0 + addq X, INCX, X + + addt a1, t1, a1 + unop + mult x1, x1, t1 + unop + + addt a2, t2, a2 + mult x2, x2, t2 + addt a3, t3, a3 + mult x3, x3, t3 + + addt a0, t0, a0 + mult x4, x4, t0 + addt a1, t1, a1 + mult x5, x5, t1 + + addt a2, t2, a2 + mult x6, x6, t2 + addt a3, t3, a3 + mult x7, x7, t3 + + addt a1, t1, a1 + addt a2, t2, a2 + addt a3, t3, a3 + .align 4 + +$L25: + and N, 7, I + ble I, $L998 + .align 4 + +$L26: + LD x0, 0 * SIZE(X) + addq X, INCX, X + + addt a0, t0, a0 + mult x0, x0, t0 + + lda I, -1(I) + bgt I, $L26 + .align 4 + + +$L998: + addt a0, t0, a0 + + addt a0, a1, a0 + addt a2, a3, a2 + +#if defined(EV4) || defined(EV5) + addt a0, a2, $f16 + jsr $26, ($27), sqrt !lituse_jsr!2 + + ldah $29, 0($26) !gpdisp!3 + lda $29, 0($29) !gpdisp!3 +#else + addt a0, a2, a0 + 
sqrtt a0, a0 +#endif + .align 4 + +$L999: +#if defined(EV4) || defined(EV5) + ldq $26, 0($sp) + lda $sp, 16($sp) +#endif + ret + EPILOGUE diff --git a/kernel/alpha/staticbuffer.S b/kernel/alpha/staticbuffer.S new file mode 100644 index 0000000000..7bbd23d891 --- /dev/null +++ b/kernel/alpha/staticbuffer.S @@ -0,0 +1,45 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef ALLOC_STATIC + .align 8 + .comm alloc_area, (NUM_BUFFERS * BUFFER_SIZE), 16384 +#endif diff --git a/kernel/alpha/swap.S b/kernel/alpha/swap.S new file mode 100644 index 0000000000..9e21990c44 --- /dev/null +++ b/kernel/alpha/swap.S @@ -0,0 +1,249 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
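The snrm2 kernel above computes the unscaled Euclidean norm: it squares the elements into four independent partial sums (registers a0..a3 fed by the temporaries t0..t3) so the multiply/add latencies overlap, reduces the partial sums at $L998, and takes the square root, calling the external sqrt() on EV4/EV5 and using the sqrtt instruction otherwise. The kernel unrolls by 16 for unit stride and 8 otherwise; the four-way unrolling in the sketch below is enough to show the idea. The function name and types here are illustrative, and like the kernel it performs no overflow/underflow scaling:

#include <math.h>

/* Reference sketch of the accumulation pattern used by the kernel above:
 * four independent partial sums, reduced at the end, then a square root. */
static float snrm2_ref(long n, const float *x, long incx)
{
    float s0 = 0.0f, s1 = 0.0f, s2 = 0.0f, s3 = 0.0f;
    long i = 0;

    if (incx == 1) {
        for (; i + 4 <= n; i += 4) {      /* unrolled with independent accumulators */
            s0 += x[i + 0] * x[i + 0];
            s1 += x[i + 1] * x[i + 1];
            s2 += x[i + 2] * x[i + 2];
            s3 += x[i + 3] * x[i + 3];
        }
    }
    for (; i < n; i++)                    /* remainder, and the general strided case */
        s0 += x[i * incx] * x[i * incx];

    return sqrtf((s0 + s1) + (s2 + s3));  /* no scaling, as in the kernel */
}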
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + + PROLOGUE + PROFCODE + .frame $sp, 0, $26, 0 + + mov $20, $17 + mov $21, $18 + ldq $19, 0($sp) + ldl $20, 8($sp) +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + subl $18, 1, $1 + subl $20, 1, $2 + ble $16, $SubEnd # if n <= 0 goto $End + or $1, $2, $1 + + sra $16, 3, $21 + + and $16, 7, $22 + bne $1, $Sub + ble $21, $MainRemain + .align 4 + +$MainLoop: + LD $f10, 0*SIZE($19) + LD $f11, 1*SIZE($19) + LD $f12, 2*SIZE($19) + LD $f13, 3*SIZE($19) + LD $f14, 4*SIZE($19) + LD $f15, 5*SIZE($19) + LD $f16, 6*SIZE($19) + LD $f17, 7*SIZE($19) + + LD $f20, 0*SIZE($17) + LD $f21, 1*SIZE($17) + LD $f22, 2*SIZE($17) + LD $f23, 3*SIZE($17) + LD $f24, 4*SIZE($17) + LD $f25, 5*SIZE($17) + LD $f26, 6*SIZE($17) + LD $f27, 7*SIZE($17) + + lds $f31, 32*SIZE($17) + unop + lds $f31, 32*SIZE($19) + subl $21, 1, $21 + + ST $f10, 0*SIZE($17) + ST $f11, 1*SIZE($17) + ST $f12, 2*SIZE($17) + ST $f13, 3*SIZE($17) + ST $f14, 4*SIZE($17) + ST $f15, 5*SIZE($17) + ST $f16, 6*SIZE($17) + ST $f17, 7*SIZE($17) + + ST $f20, 0*SIZE($19) + ST $f21, 1*SIZE($19) + ST $f22, 2*SIZE($19) + ST $f23, 3*SIZE($19) + ST $f24, 4*SIZE($19) + ST $f25, 5*SIZE($19) + ST $f26, 6*SIZE($19) + ST $f27, 7*SIZE($19) + + lda $17, 8*SIZE($17) + lda $19, 8*SIZE($19) + bgt $21, $MainLoop + .align 4 + +$MainRemain: + ble $22, $MainEnd + .align 4 + +$MainRemainLoop: + LD $f10, 0*SIZE($19) + LD $f20, 0*SIZE($17) + lda $17, 1*SIZE($17) + lda $19, 1*SIZE($19) + subl $22, 1, $22 + ST $f10, -1*SIZE($17) + ST $f20, -1*SIZE($19) + bgt $22, $MainRemainLoop + .align 4 + +$MainEnd: + clr $0 + ret + .align 4 + +$Sub: + mov $17, $23 + mov $19, $24 + + ble $21, $SubRemain + .align 4 + +$SubLoop: + LD $f10, 0*SIZE($19) + SXADDQ $20, $19, $19 + LD $f11, 0*SIZE($19) + SXADDQ $20, $19, $19 + + LD $f12, 0*SIZE($19) + SXADDQ $20, $19, $19 + LD $f13, 0*SIZE($19) + SXADDQ $20, $19, $19 + + LD $f14, 0*SIZE($19) + SXADDQ $20, $19, $19 + LD $f15, 0*SIZE($19) + SXADDQ $20, $19, $19 + + LD $f16, 0*SIZE($19) + SXADDQ $20, $19, $19 + LD $f17, 0*SIZE($19) + SXADDQ $20, $19, $19 + + LD $f20, 0*SIZE($17) + SXADDQ $18, $17, $17 + LD $f21, 0*SIZE($17) + SXADDQ $18, $17, $17 + + LD $f22, 0*SIZE($17) + SXADDQ $18, $17, $17 + LD $f23, 0*SIZE($17) + SXADDQ $18, $17, $17 + + LD $f24, 0*SIZE($17) + SXADDQ $18, $17, $17 + LD 
$f25, 0*SIZE($17) + SXADDQ $18, $17, $17 + + LD $f26, 0*SIZE($17) + SXADDQ $18, $17, $17 + LD $f27, 0*SIZE($17) + SXADDQ $18, $17, $17 + + ST $f10, 0*SIZE($23) + SXADDQ $18, $23, $23 + ST $f11, 0*SIZE($23) + SXADDQ $18, $23, $23 + + ST $f12, 0*SIZE($23) + SXADDQ $18, $23, $23 + ST $f13, 0*SIZE($23) + SXADDQ $18, $23, $23 + + ST $f14, 0*SIZE($23) + SXADDQ $18, $23, $23 + ST $f15, 0*SIZE($23) + SXADDQ $18, $23, $23 + + ST $f16, 0*SIZE($23) + SXADDQ $18, $23, $23 + ST $f17, 0*SIZE($23) + SXADDQ $18, $23, $23 + + ST $f20, 0*SIZE($24) + SXADDQ $20, $24, $24 + ST $f21, 0*SIZE($24) + SXADDQ $20, $24, $24 + + ST $f22, 0*SIZE($24) + SXADDQ $20, $24, $24 + ST $f23, 0*SIZE($24) + SXADDQ $20, $24, $24 + + ST $f24, 0*SIZE($24) + SXADDQ $20, $24, $24 + ST $f25, 0*SIZE($24) + SXADDQ $20, $24, $24 + + ST $f26, 0*SIZE($24) + SXADDQ $20, $24, $24 + ST $f27, 0*SIZE($24) + SXADDQ $20, $24, $24 + + subl $21, 1, $21 + bgt $21, $SubLoop + .align 4 + +$SubRemain: + ble $22, $SubEnd + .align 4 + +$SubRemainLoop: + LD $f10, 0*SIZE($19) + LD $f20, 0*SIZE($17) + + subl $22, 1, $22 + + ST $f10, 0*SIZE($17) + ST $f20, 0*SIZE($19) + + SXADDQ $18, $17, $17 + SXADDQ $20, $19, $19 + bgt $22, $SubRemainLoop + .align 4 + +$SubEnd: + clr $0 + ret + EPILOGUE diff --git a/kernel/alpha/trsm_kernel_4x4_LN.S b/kernel/alpha/trsm_kernel_4x4_LN.S new file mode 100644 index 0000000000..a1760c6f6d --- /dev/null +++ b/kernel/alpha/trsm_kernel_4x4_LN.S @@ -0,0 +1,4068 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#if !defined(EV4) && !defined(EV5) && !defined(EV6) +#error "Architecture is not specified." 
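The swap kernel above exchanges the contents of two vectors: when both increments are 1 it copies eight elements of each vector per iteration (with loads to $f31 acting as prefetches), and otherwise it walks both vectors with SXADDQ-scaled strides. A minimal C sketch of the operation; the function name and types are illustrative:

/* Reference sketch only: exchange two strided vectors element by element.
 * The kernel above additionally unrolls the unit-stride case by 8. */
static void swap_ref(long n, float *x, long incx, float *y, long incy)
{
    for (long i = 0; i < n; i++) {
        float tmp   = x[i * incx];
        x[i * incx] = y[i * incy];
        y[i * incy] = tmp;
    }
}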
+#endif + +#ifdef EV6 +#define PREFETCHSIZE 56 +#define UNOP unop +#endif + +#ifdef EV5 +#define PREFETCHSIZE 56 +#define UNOP +#endif + +#ifdef EV4 +#define UNOP +#endif + +#define STACKSIZE 80 + +#define M $16 +#define N $17 +#define K $18 +#define A $20 +#define B $21 +#define C $22 +#define LDC $23 + +#define C1 $19 +#define C2 $24 +#define C3 $25 +#define C4 $27 + +#define AO $at +#define BO $5 +#define I $6 +#define J $7 +#define L $8 + +#define a1 $f16 +#define a2 $f17 +#define a3 $f18 +#define a4 $f19 + +#define b1 $f20 +#define b2 $f21 +#define b3 $f22 +#define b4 $f23 + +#define t1 $f24 +#define t2 $f25 +#define t3 $f26 +#define t4 $f27 + +#define a5 $f28 +#define a6 $f30 +#define b5 $f29 + +#define alpha $f30 + +#define c01 $f0 +#define c02 $f1 +#define c03 $f2 +#define c04 $f3 + +#define c05 $f4 +#define c06 $f5 +#define c07 $f6 +#define c08 $f7 + +#define c09 $f8 +#define c10 $f9 +#define c11 $f10 +#define c12 $f11 + +#define c13 $f12 +#define c14 $f13 +#define c15 $f14 +#define c16 $f15 + +#define TMP1 $0 +#define TMP2 $1 +#define KK $2 +#define AORIG $3 +#define OFFSET $4 + + PROLOGUE + PROFCODE + .frame $sp, STACKSIZE, $26, 0 + + lda $sp, -STACKSIZE($sp) + + ldq C, 0 + STACKSIZE($sp) + ldq LDC, 8 + STACKSIZE($sp) + ldq OFFSET, 16 + STACKSIZE($sp) + + SXADDQ LDC, 0, LDC + + stt $f2, 0($sp) + stt $f3, 8($sp) + stt $f4, 16($sp) + stt $f5, 24($sp) + stt $f6, 32($sp) + stt $f7, 40($sp) + stt $f8, 48($sp) + stt $f9, 56($sp) + + cmple M, 0, $0 + cmple N, 0, $1 + cmple K, 0, $2 + + or $0, $1, $0 + or $0, $2, $0 + bne $0, $L999 + +#ifdef LN + mulq M, K, TMP1 + SXADDQ TMP1, A, A + SXADDQ M, C, C +#endif + +#ifdef RN + negq OFFSET, KK +#endif + +#ifdef RT + mulq N, K, TMP1 + SXADDQ TMP1, B, B + + mulq N, LDC, TMP1 + addq TMP1, C, C + + subq N, OFFSET, KK +#endif + + sra N, 2, J + ble J, $L40 + .align 4 + +$L01: +#ifdef RT + sll K, 2 + BASE_SHIFT, TMP1 + subq B, TMP1, B + + s4addq LDC, 0, TMP1 + subq C, TMP1, C +#endif + + mov C, C1 + addq C, LDC, C2 + addq C2, LDC, C3 +#ifndef RT + s4addq LDC, C, C +#endif + + fclr t1 + addq C3, LDC, C4 + fclr t2 + +#ifdef LN + addq M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + fclr t3 + fclr t4 + + and M, 1, I + ble I, $L20 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(B) + lda L, -2(KK) + LD b2, 1 * SIZE(B) + lda AO, 1 * SIZE(AO) + + LD b3, 2 * SIZE(B) + fclr c09 + LD b4, 3 * SIZE(B) + fclr c13 + + lda BO, 4 * SIZE(B) + ble KK, $L38 + + ble L, $L35 +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TMP1 + addq AORIG, TMP1, AO + sll KK, BASE_SHIFT + 2, TMP2 + addq B, TMP2, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(BO) + lda L, -2(TMP1) + LD b2, 1 * SIZE(BO) + lda AO, 1 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + fclr c09 + LD b4, 3 * SIZE(BO) + fclr c13 + + lda BO, 4 * SIZE(BO) + ble TMP1, $L38 + + ble L, $L35 +#endif + .align 4 + +$L32: + ADD c01, t1, c01 + lda L, -2(L) + MUL a1, b1, t1 + LD b1, 0 * SIZE(BO) + + ADD c05, t2, c05 + lda AO, 2 * SIZE(AO) + MUL a1, b2, t2 + LD b2, 1 * SIZE(BO) + + ADD c09, t3, c09 + LD b5, 3 * SIZE(BO) + MUL a1, b3, t3 + LD b3, 2 * SIZE(BO) + + ADD c13, t4, c13 + MUL a1, b4, t4 + LD a1, -1 * SIZE(AO) + + ADD c01, t1, c01 + MUL a2, b1, t1 + LD b1, 4 * SIZE(BO) + lda BO, 8 * SIZE(BO) + + ADD c05, t2, c05 + MUL a2, b2, t2 + LD b2, -3 * 
SIZE(BO) + + ADD c09, t3, c09 + LD b4, -1 * SIZE(BO) + MUL a2, b3, t3 + LD b3, -2 * SIZE(BO) + + ADD c13, t4, c13 + MUL a2, b5, t4 + LD a2, 0 * SIZE(AO) + bgt L, $L32 + .align 4 + +$L35: + ADD c01, t1, c01 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L37 +#else + blbs TMP1, $L37 +#endif + .align 4 + + ADD c05, t2, c05 + LD b1, 0 * SIZE(BO) + MUL a1, b2, t2 + LD b2, 1 * SIZE(BO) + + ADD c09, t3, c09 + MUL a1, b3, t3 + LD b3, 2 * SIZE(BO) + + ADD c13, t4, c13 + MUL a1, b4, t4 + LD a1, 0 * SIZE(AO) + lda AO, 1 * SIZE(AO) + + ADD c01, t1, c01 + LD b4, 3 * SIZE(BO) + MUL a1, b1, t1 + lda BO, 4 * SIZE(BO) + .align 4 + +$L37: + ADD c05, t2, c05 + MUL a1, b2, t2 + ADD c09, t3, c09 + MUL a1, b3, t3 + + ADD c13, t4, c13 + lda AO, 1 * SIZE(AO) + MUL a1, b4, t4 + lda BO, 4 * SIZE(BO) + + ADD c01, t1, c01 + ADD c05, t2, c05 + ADD c09, t3, c09 + ADD c13, t4, c13 + +$L38: +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 1, TMP1 +#else + subq KK, 4, TMP1 +#endif + sll TMP1, BASE_SHIFT + 0, TMP2 + addq AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addq B, TMP2, BO +#else + lda AO, -1 * SIZE(AO) + lda BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c05, c05 + SUB a3, c09, c09 + SUB a4, c13, c13 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c05, c05 + SUB a3, c09, c09 + SUB a4, c13, c13 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + + MUL a1, c01, c01 + MUL a1, c05, c05 + MUL a1, c09, c09 + MUL a1, c13, c13 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a1, c01, c01 + MUL a2, c01, t1 + SUB c05, t1, c05 + MUL a3, c01, t1 + SUB c09, t1, c09 + MUL a4, c01, t1 + SUB c13, t1, c13 + + LD b1, 5 * SIZE(BO) + LD b2, 6 * SIZE(BO) + LD b3, 7 * SIZE(BO) + + MUL b1, c05, c05 + MUL b2, c05, t1 + SUB c09, t1, c09 + MUL b3, c05, t1 + SUB c13, t1, c13 + + LD a1, 10 * SIZE(BO) + LD a2, 11 * SIZE(BO) + LD a3, 15 * SIZE(BO) + + MUL a1, c09, c09 + MUL a2, c09, t1 + SUB c13, t1, c13 + MUL a3, c13, c13 +#endif + +#ifdef RT + LD a1, 15 * SIZE(BO) + LD a2, 14 * SIZE(BO) + LD a3, 13 * SIZE(BO) + LD a4, 12 * SIZE(BO) + + MUL a1, c13, c13 + MUL a2, c13, t1 + SUB c09, t1, c09 + MUL a3, c13, t1 + SUB c05, t1, c05 + MUL a4, c13, t1 + SUB c01, t1, c01 + + LD b1, 10 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 8 * SIZE(BO) + + MUL b1, c09, c09 + MUL b2, c09, t1 + SUB c05, t1, c05 + MUL b3, c09, t1 + SUB c01, t1, c01 + + LD a1, 5 * SIZE(BO) + LD a2, 4 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, c05 + MUL a2, c05, t1 + SUB c01, t1, c01 + MUL a3, c01, c01 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c13, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c05, 1 * SIZE(AO) + ST c09, 2 * SIZE(AO) + ST c13, 3 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -1 * SIZE(C1) + lda C2, -1 * SIZE(C2) + lda C3, -1 * SIZE(C3) + lda C4, -1 * SIZE(C4) +#endif + + ST c01, 0 * SIZE(C1) + ST c05, 0 * SIZE(C2) + ST c09, 0 * SIZE(C3) + ST c13, 0 * SIZE(C4) + +#ifdef RT + sll K, 0 + BASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, BASE_SHIFT + 0, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 1, KK +#endif + +#ifdef LN + subq KK, 1, KK +#endif + .align 
4 + +$L20: + and M, 2, I + ble I, $L30 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c10 + LD a4, 3 * SIZE(AO) + fclr c14 + + LD b1, 0 * SIZE(B) + lda L, -2(KK) + LD b2, 1 * SIZE(B) + lda AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(B) + fclr c01 + LD b4, 3 * SIZE(B) + fclr c05 + + lda BO, 4 * SIZE(B) + fclr c02 + fclr c06 + ble KK, $L28 + + ble L, $L25 + +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TMP1 + addq AORIG, TMP1, AO + sll KK, BASE_SHIFT + 2, TMP2 + addq B, TMP2, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c10 + LD a4, 3 * SIZE(AO) + fclr c14 + + LD b1, 0 * SIZE(BO) + lda L, -2(TMP1) + LD b2, 1 * SIZE(BO) + lda AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + fclr c01 + LD b4, 3 * SIZE(BO) + fclr c05 + + lda BO, 4 * SIZE(BO) + fclr c02 + fclr c06 + ble TMP1, $L28 + + ble L, $L25 +#endif + .align 4 + +$L22: + ADD c09, t1, c09 + unop + MUL a1, b1, t1 + unop + + ADD c10, t2, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD c13, t3, c13 + unop + MUL a1, b2, t3 + lda BO, 8 * SIZE(BO) + + ADD c14, t4, c14 + unop + MUL a2, b2, t4 + LD b2, -7 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b3, t1 + unop + + ADD c02, t2, c02 + unop + MUL a2, b3, t2 + LD b3, -6 * SIZE(BO) + + ADD c05, t3, c05 + unop + MUL a1, b4, t3 + LD a1, 2 * SIZE(AO) + + ADD c06, t4, c06 + MUL a2, b4, t4 + LD b5, -5 * SIZE(BO) + + ADD c09, t1, c09 + unop + MUL a3, b1, t1 + LD a2, 3 * SIZE(AO) + + ADD c10, t2, c10 + unop + MUL a4, b1, t2 + LD b1, -4 * SIZE(BO) + + ADD c13, t3, c13 + unop + MUL a3, b2, t3 + lda AO, 4 * SIZE(AO) + + ADD c14, t4, c14 + MUL a4, b2, t4 + LD b2, -3 * SIZE(BO) + + ADD c01, t1, c01 + lda L, -2(L) + MUL a3, b3, t1 + LD b4, -1 * SIZE(BO) + + ADD c02, t2, c02 + unop + MUL a4, b3, t2 + LD b3, -2 * SIZE(BO) + + ADD c05, t3, c05 + unop + MUL a3, b5, t3 + LD a3, 0 * SIZE(AO) + + ADD c06, t4, c06 + MUL a4, b5, t4 + LD a4, 1 * SIZE(AO) + bgt L, $L22 + .align 4 + +$L25: + ADD c09, t1, c09 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L27 +#else + blbs TMP1, $L27 +#endif + + ADD c10, t2, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD c13, t3, c13 + unop + MUL a1, b2, t3 + unop + + ADD c14, t4, c14 + unop + MUL a2, b2, t4 + LD b2, 1 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b3, t1 + lda AO, 2 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b3, t2 + LD b3, 2 * SIZE(BO) + + ADD c05, t3, c05 + unop + MUL a1, b4, t3 + LD a1, -2 * SIZE(AO) + + ADD c06, t4, c06 + unop + MUL a2, b4, t4 + LD a2, -1 * SIZE(AO) + + ADD c09, t1, c09 + LD b4, 3 * SIZE(BO) + MUL a1, b1, t1 + lda BO, 4 * SIZE(BO) + .align 4 + +$L27: + ADD c10, t2, c10 + MUL a2, b1, t2 + ADD c13, t3, c13 + MUL a1, b2, t3 + + ADD c14, t4, c14 + MUL a2, b2, t4 + ADD c01, t1, c01 + MUL a1, b3, t1 + + ADD c02, t2, c02 + MUL a2, b3, t2 + ADD c05, t3, c05 + MUL a1, b4, t3 + + ADD c06, t4, c06 + lda AO, 2 * SIZE(AO) + MUL a2, b4, t4 + lda BO, 4 * SIZE(BO) + + ADD c09, t1, c09 + ADD c10, t2, c10 + ADD c13, t3, c13 + ADD c14, t4, c14 + .align 4 + +$L28: +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 2, TMP1 +#else + subq KK, 4, TMP1 +#endif + sll TMP1, BASE_SHIFT + 1, TMP2 + addq AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addq B, TMP2, BO +#else + lda AO, -2 * SIZE(AO) + lda BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * 
SIZE(BO) + LD a4, 3 * SIZE(BO) + + LD b1, 4 * SIZE(BO) + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c05, c05 + SUB a3, c09, c09 + SUB a4, c13, c13 + + SUB b1, c02, c02 + SUB b2, c06, c06 + SUB b3, c10, c10 + SUB b4, c14, c14 + +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 4 * SIZE(AO) + LD b2, 5 * SIZE(AO) + LD b3, 6 * SIZE(AO) + LD b4, 7 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c05, c05 + SUB a4, c06, c06 + + SUB b1, c09, c09 + SUB b2, c10, c10 + SUB b3, c13, c13 + SUB b4, c14, c14 +#endif + +#ifdef LN + LD a1, 3 * SIZE(AO) + LD a2, 2 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, c02 + MUL a1, c06, c06 + MUL a1, c10, c10 + MUL a1, c14, c14 + + MUL a2, c02, t1 + MUL a2, c06, t2 + MUL a2, c10, t3 + MUL a2, c14, t4 + + SUB c01, t1, c01 + SUB c05, t2, c05 + SUB c09, t3, c09 + SUB c13, t4, c13 + + MUL a3, c01, c01 + MUL a3, c05, c05 + MUL a3, c09, c09 + MUL a3, c13, c13 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 3 * SIZE(AO) + + MUL a1, c01, c01 + MUL a1, c05, c05 + MUL a1, c09, c09 + MUL a1, c13, c13 + + MUL a2, c01, t1 + MUL a2, c05, t2 + MUL a2, c09, t3 + MUL a2, c13, t4 + + SUB c02, t1, c02 + SUB c06, t2, c06 + SUB c10, t3, c10 + SUB c14, t4, c14 + + MUL a3, c02, c02 + MUL a3, c06, c06 + MUL a3, c10, c10 + MUL a3, c14, c14 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a1, c01, c01 + MUL a1, c02, c02 + + MUL a2, c01, t1 + MUL a2, c02, t2 + + SUB c05, t1, c05 + SUB c06, t2, c06 + + MUL a3, c01, t1 + MUL a3, c02, t2 + + SUB c09, t1, c09 + SUB c10, t2, c10 + + MUL a4, c01, t1 + MUL a4, c02, t2 + + SUB c13, t1, c13 + SUB c14, t2, c14 + + LD b1, 5 * SIZE(BO) + LD b2, 6 * SIZE(BO) + LD b3, 7 * SIZE(BO) + + MUL b1, c05, c05 + MUL b1, c06, c06 + + MUL b2, c05, t1 + MUL b2, c06, t2 + + SUB c09, t1, c09 + SUB c10, t2, c10 + + MUL b3, c05, t1 + MUL b3, c06, t2 + + SUB c13, t1, c13 + SUB c14, t2, c14 + + LD a1, 10 * SIZE(BO) + LD a2, 11 * SIZE(BO) + LD a3, 15 * SIZE(BO) + + MUL a1, c09, c09 + MUL a1, c10, c10 + + MUL a2, c09, t1 + MUL a2, c10, t2 + + SUB c13, t1, c13 + SUB c14, t2, c14 + + MUL a3, c13, c13 + MUL a3, c14, c14 +#endif + +#ifdef RT + LD a1, 15 * SIZE(BO) + LD a2, 14 * SIZE(BO) + LD a3, 13 * SIZE(BO) + LD a4, 12 * SIZE(BO) + + MUL a1, c13, c13 + MUL a1, c14, c14 + + MUL a2, c13, t1 + MUL a2, c14, t2 + + SUB c09, t1, c09 + SUB c10, t2, c10 + + MUL a3, c13, t1 + MUL a3, c14, t2 + + SUB c05, t1, c05 + SUB c06, t2, c06 + + MUL a4, c13, t1 + MUL a4, c14, t2 + + SUB c01, t1, c01 + SUB c02, t2, c02 + + LD b1, 10 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 8 * SIZE(BO) + + MUL b1, c09, c09 + MUL b1, c10, c10 + + MUL b2, c09, t1 + MUL b2, c10, t2 + + SUB c05, t1, c05 + SUB c06, t2, c06 + + MUL b3, c09, t1 + MUL b3, c10, t2 + + SUB c01, t1, c01 + SUB c02, t2, c02 + + LD a1, 5 * SIZE(BO) + LD a2, 4 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, c05 + MUL a1, c06, c06 + + MUL a2, c05, t1 + MUL a2, c06, t2 + + SUB c01, t1, c01 + SUB c02, t2, c02 + + MUL a3, c01, c01 + MUL a3, c02, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c13, 3 * SIZE(BO) + + ST c02, 4 * SIZE(BO) + ST c06, 5 * SIZE(BO) + ST c10, 6 * SIZE(BO) + ST c14, 7 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c05, 2 * SIZE(AO) + ST c06, 3 * SIZE(AO) + + ST c09, 4 * SIZE(AO) + ST c10, 5 * SIZE(AO) + ST c13, 6 * 
SIZE(AO) + ST c14, 7 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -2 * SIZE(C1) + lda C2, -2 * SIZE(C2) + lda C3, -2 * SIZE(C3) + lda C4, -2 * SIZE(C4) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c05, 0 * SIZE(C2) + ST c06, 1 * SIZE(C2) + + ST c09, 0 * SIZE(C3) + ST c10, 1 * SIZE(C3) + ST c13, 0 * SIZE(C4) + ST c14, 1 * SIZE(C4) + +#ifndef LN + lda C1, 2 * SIZE(C1) + lda C2, 2 * SIZE(C2) + lda C3, 2 * SIZE(C3) + lda C4, 2 * SIZE(C4) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 1 + BASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, BASE_SHIFT + 1, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 2, KK +#endif + +#ifdef LN + subq KK, 2, KK +#endif + .align 4 + +$L30: + sra M, 2, I + ble I, $L39 + .align 4 + +$L11: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c11 + LD a2, 1 * SIZE(AO) + fclr c12 + + LD a3, 2 * SIZE(AO) + fclr c16 + LD a4, 3 * SIZE(AO) + fclr c15 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + + LD b3, 2 * SIZE(B) + fclr c06 + LD b4, 3 * SIZE(B) + fclr c05 + + lds $f31, 4 * SIZE(C1) + fclr c03 + lda L, -2(KK) + fclr c04 + + lds $f31, 7 * SIZE(C2) + fclr c08 + lda BO, 4 * SIZE(B) + fclr c13 + + lds $f31, 4 * SIZE(C3) + fclr c09 + lda AO, 4 * SIZE(AO) + fclr c10 + + lds $f31, 7 * SIZE(C4) + fclr c14 + fclr c07 + ble KK, $L18 +#else + +#ifdef LN + sll K, BASE_SHIFT + 2, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 2, TMP1 + addq AORIG, TMP1, AO + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c11 + LD a2, 1 * SIZE(AO) + fclr c12 + + LD a3, 2 * SIZE(AO) + fclr c16 + LD a4, 3 * SIZE(AO) + fclr c15 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + + LD b3, 2 * SIZE(BO) + fclr c06 + LD b4, 3 * SIZE(BO) + fclr c05 + + lds $f31, 4 * SIZE(C1) + fclr c03 + lda L, -2(TMP1) + fclr c04 + + lds $f31, 7 * SIZE(C2) + fclr c08 + lda BO, 4 * SIZE(BO) + fclr c13 + + lds $f31, 4 * SIZE(C3) + fclr c09 + lda AO, 4 * SIZE(AO) + fclr c10 + + lds $f31, 7 * SIZE(C4) + fclr c14 + fclr c07 + ble TMP1, $L18 +#endif + + ble L, $L15 + .align 5 + +$L12: +/* 1 */ + ADD c11, t1, c11 +#ifndef EV4 + ldq $31, PREFETCHSIZE * SIZE(AO) +#else + unop +#endif + MUL b1, a1, t1 +#ifndef EV4 + ldl $31, PREFETCHSIZE * SIZE(BO) +#else + unop +#endif + + ADD c12, t2, c12 + unop + MUL b1, a2, t2 + unop + + ADD c16, t3, c16 + unop + MUL b2, a2, t3 + LD a5, 0 * SIZE(AO) + + ADD c15, t4, c15 + unop + MUL b2, a1, t4 + LD b5, 0 * SIZE(BO) + +/* 2 */ + ADD c01, t1, c01 + UNOP + MUL b1, a3, t1 + UNOP + + ADD c02, t2, c02 + UNOP + MUL b1, a4, t2 + UNOP + + ADD c06, t3, c06 + unop + MUL b2, a4, t3 + unop + + ADD c05, t4, c05 + unop + MUL b4, a1, t4 + unop + +/* 3 */ + ADD c03, t1, c03 + unop + MUL b3, a1, t1 + unop + + ADD c04, t2, c04 + unop + MUL b3, a2, t2 + unop + + ADD c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD c13, t4, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + +/* 4 */ + ADD c09, t1, c09 + unop + MUL b3, a3, t1 + LD a6, 2 * SIZE(AO) + + ADD c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, 3 * SIZE(AO) + + ADD c07, t4, c07 + unop + MUL b4, a3, t4 + LD b4, 3 * SIZE(BO) + +/* 5 */ + ADD c11, t1, c11 + unop + MUL b5, a5, t1 + LD a1, 4 * SIZE(AO) + + ADD c12, t2, c12 + lda L, -2(L) + MUL b5, a2, t2 + LD b1, 4 * SIZE(BO) + + ADD c16, t3, c16 + unop + MUL b2, a2, t3 + unop + + 
ADD c15, t4, c15 + unop + MUL b2, a5, t4 + unop + +/* 6 */ + ADD c01, t1, c01 + unop + MUL b5, a6, t1 + unop + + ADD c02, t2, c02 + unop + MUL b5, a4, t2 + unop + + ADD c06, t3, c06 + unop + MUL b2, a4, t3 + unop + + ADD c05, t4, c05 + unop + MUL b4, a5, t4 + unop + +/* 7 */ + ADD c03, t1, c03 + lda AO, 8 * SIZE(AO) + MUL b3, a5, t1 + unop + + ADD c04, t2, c04 + lda BO, 8 * SIZE(BO) + MUL b3, a2, t2 + unop + + ADD c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, -3 * SIZE(AO) + + ADD c13, t4, c13 + unop + MUL b2, a6, t4 + LD b2, -3 * SIZE(BO) + +/* 8 */ + ADD c09, t1, c09 + unop + MUL b3, a6, t1 + LD a3, -2 * SIZE(AO) + + ADD c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, -2 * SIZE(BO) + + ADD c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD c07, t4, c07 + MUL b4, a6, t4 + LD b4, -1 * SIZE(BO) + bgt L, $L12 + .align 4 + +$L15: + ADD c11, t1, c11 + MUL b1, a1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L17 +#else + blbs TMP1, $L17 +#endif + .align 4 + + ADD c12, t2, c12 + MUL b1, a2, t2 + ADD c16, t3, c16 + MUL b2, a2, t3 + + ADD c15, t4, c15 + MUL b2, a1, t4 + ADD c01, t1, c01 + MUL b1, a3, t1 + + ADD c02, t2, c02 + unop + MUL b1, a4, t2 + LD b1, 0 * SIZE(BO) + + ADD c06, t3, c06 + MUL b2, a4, t3 + ADD c05, t4, c05 + MUL b4, a1, t4 + + ADD c03, t1, c03 + unop + MUL b3, a1, t1 + LD a1, 0 * SIZE(AO) + + ADD c04, t2, c04 + unop + MUL b3, a2, t2 + unop + + ADD c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD c13, t4, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + + ADD c09, t1, c09 + unop + MUL b3, a3, t1 + lda AO, 4 * SIZE(AO) + + ADD c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD c07, t4, c07 + unop + MUL b4, a3, t4 + LD a3, -2 * SIZE(AO) + + ADD c11, t1, c11 + LD b4, 3 * SIZE(BO) + MUL b1, a1, t1 + lda BO, 4 * SIZE(BO) + .align 4 + +$L17: + ADD c12, t2, c12 + MUL b1, a2, t2 + ADD c16, t3, c16 + MUL b2, a2, t3 + + ADD c15, t4, c15 + MUL b2, a1, t4 + ADD c01, t1, c01 + MUL b1, a3, t1 + + ADD c02, t2, c02 + MUL b1, a4, t2 + ADD c06, t3, c06 + MUL b2, a4, t3 + + ADD c05, t4, c05 + MUL b4, a1, t4 + ADD c03, t1, c03 + MUL b3, a1, t1 + + ADD c04, t2, c04 + MUL b3, a2, t2 + ADD c08, t3, c08 + MUL b4, a2, t3 + + ADD c13, t4, c13 + MUL b2, a3, t4 + ADD c09, t1, c09 + MUL b3, a3, t1 + + ADD c10, t2, c10 + MUL b3, a4, t2 + ADD c14, t3, c14 + MUL b4, a4, t3 + + ADD c07, t4, c07 + lda AO, 4 * SIZE(AO) + MUL b4, a3, t4 + lda BO, 4 * SIZE(BO) + + ADD c11, t1, c11 + ADD c12, t2, c12 + ADD c16, t3, c16 + ADD c15, t4, c15 + .align 4 + +$L18: +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 4, TMP1 +#else + subq KK, 4, TMP1 +#endif + sll TMP1, BASE_SHIFT + 2, TMP2 + addq AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addq B, TMP2, BO +#else + lda AO, -4 * SIZE(AO) + lda BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + LD b1, 4 * SIZE(BO) + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c05, c05 + SUB a3, c09, c09 + SUB a4, c13, c13 + + SUB b1, c02, c02 + SUB b2, c06, c06 + SUB b3, c10, c10 + SUB b4, c14, c14 + + LD a1, 8 * SIZE(BO) + LD a2, 9 * SIZE(BO) + LD a3, 10 * SIZE(BO) + LD a4, 11 * SIZE(BO) + + LD b1, 12 * SIZE(BO) + LD b2, 13 * SIZE(BO) + LD b3, 14 * SIZE(BO) + LD b4, 15 * SIZE(BO) + + SUB a1, c03, c03 + SUB a2, c07, c07 + SUB a3, c11, c11 + SUB a4, c15, c15 + + SUB b1, c04, c04 + SUB b2, c08, c08 + SUB b3, c12, c12 
+ SUB b4, c16, c16 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 4 * SIZE(AO) + LD b2, 5 * SIZE(AO) + LD b3, 6 * SIZE(AO) + LD b4, 7 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 + + SUB b1, c05, c05 + SUB b2, c06, c06 + SUB b3, c07, c07 + SUB b4, c08, c08 + + LD a1, 8 * SIZE(AO) + LD a2, 9 * SIZE(AO) + LD a3, 10 * SIZE(AO) + LD a4, 11 * SIZE(AO) + + LD b1, 12 * SIZE(AO) + LD b2, 13 * SIZE(AO) + LD b3, 14 * SIZE(AO) + LD b4, 15 * SIZE(AO) + + SUB a1, c09, c09 + SUB a2, c10, c10 + SUB a3, c11, c11 + SUB a4, c12, c12 + + SUB b1, c13, c13 + SUB b2, c14, c14 + SUB b3, c15, c15 + SUB b4, c16, c16 +#endif + +#ifdef LN + LD a1, 15 * SIZE(AO) + LD a2, 14 * SIZE(AO) + LD a3, 13 * SIZE(AO) + LD a4, 12 * SIZE(AO) + + MUL a1, c04, c04 + MUL a1, c08, c08 + MUL a1, c12, c12 + MUL a1, c16, c16 + + MUL a2, c04, t1 + MUL a2, c08, t2 + MUL a2, c12, t3 + MUL a2, c16, t4 + + SUB c03, t1, c03 + SUB c07, t2, c07 + SUB c11, t3, c11 + SUB c15, t4, c15 + + MUL a3, c04, t1 + MUL a3, c08, t2 + MUL a3, c12, t3 + MUL a3, c16, t4 + + SUB c02, t1, c02 + SUB c06, t2, c06 + SUB c10, t3, c10 + SUB c14, t4, c14 + + MUL a4, c04, t1 + MUL a4, c08, t2 + MUL a4, c12, t3 + MUL a4, c16, t4 + + SUB c01, t1, c01 + SUB c05, t2, c05 + SUB c09, t3, c09 + SUB c13, t4, c13 + + LD b1, 10 * SIZE(AO) + LD b2, 9 * SIZE(AO) + LD b3, 8 * SIZE(AO) + + MUL b1, c03, c03 + MUL b1, c07, c07 + MUL b1, c11, c11 + MUL b1, c15, c15 + + MUL b2, c03, t1 + MUL b2, c07, t2 + MUL b2, c11, t3 + MUL b2, c15, t4 + + SUB c02, t1, c02 + SUB c06, t2, c06 + SUB c10, t3, c10 + SUB c14, t4, c14 + + MUL b3, c03, t1 + MUL b3, c07, t2 + MUL b3, c11, t3 + MUL b3, c15, t4 + + SUB c01, t1, c01 + SUB c05, t2, c05 + SUB c09, t3, c09 + SUB c13, t4, c13 + + LD a1, 5 * SIZE(AO) + LD a2, 4 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, c02 + MUL a1, c06, c06 + MUL a1, c10, c10 + MUL a1, c14, c14 + + MUL a2, c02, t1 + MUL a2, c06, t2 + MUL a2, c10, t3 + MUL a2, c14, t4 + + SUB c01, t1, c01 + SUB c05, t2, c05 + SUB c09, t3, c09 + SUB c13, t4, c13 + + MUL a3, c01, c01 + MUL a3, c05, c05 + MUL a3, c09, c09 + MUL a3, c13, c13 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a1, c01, c01 + MUL a1, c05, c05 + MUL a1, c09, c09 + MUL a1, c13, c13 + + MUL a2, c01, t1 + MUL a2, c05, t2 + MUL a2, c09, t3 + MUL a2, c13, t4 + + SUB c02, t1, c02 + SUB c06, t2, c06 + SUB c10, t3, c10 + SUB c14, t4, c14 + + MUL a3, c01, t1 + MUL a3, c05, t2 + MUL a3, c09, t3 + MUL a3, c13, t4 + + SUB c03, t1, c03 + SUB c07, t2, c07 + SUB c11, t3, c11 + SUB c15, t4, c15 + + MUL a4, c01, t1 + MUL a4, c05, t2 + MUL a4, c09, t3 + MUL a4, c13, t4 + + SUB c04, t1, c04 + SUB c08, t2, c08 + SUB c12, t3, c12 + SUB c16, t4, c16 + + LD b1, 5 * SIZE(AO) + LD b2, 6 * SIZE(AO) + LD b3, 7 * SIZE(AO) + + MUL b1, c02, c02 + MUL b1, c06, c06 + MUL b1, c10, c10 + MUL b1, c14, c14 + + MUL b2, c02, t1 + MUL b2, c06, t2 + MUL b2, c10, t3 + MUL b2, c14, t4 + + SUB c03, t1, c03 + SUB c07, t2, c07 + SUB c11, t3, c11 + SUB c15, t4, c15 + + MUL b3, c02, t1 + MUL b3, c06, t2 + MUL b3, c10, t3 + MUL b3, c14, t4 + + SUB c04, t1, c04 + SUB c08, t2, c08 + SUB c12, t3, c12 + SUB c16, t4, c16 + + LD a1, 10 * SIZE(AO) + LD a2, 11 * SIZE(AO) + LD a3, 15 * SIZE(AO) + + MUL a1, c03, c03 + MUL a1, c07, c07 + MUL a1, c11, c11 + MUL a1, c15, c15 + + MUL a2, c03, t1 + MUL a2, c07, t2 + MUL a2, c11, t3 + MUL a2, c15, t4 + + SUB c04, t1, c04 + SUB c08, t2, c08 + SUB c12, t3, c12 + 
SUB c16, t4, c16 + + MUL a3, c04, c04 + MUL a3, c08, c08 + MUL a3, c12, c12 + MUL a3, c16, c16 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 + + MUL a2, c01, t1 + MUL a2, c02, t2 + MUL a2, c03, t3 + MUL a2, c04, t4 + + SUB c05, t1, c05 + SUB c06, t2, c06 + SUB c07, t3, c07 + SUB c08, t4, c08 + + MUL a3, c01, t1 + MUL a3, c02, t2 + MUL a3, c03, t3 + MUL a3, c04, t4 + + SUB c09, t1, c09 + SUB c10, t2, c10 + SUB c11, t3, c11 + SUB c12, t4, c12 + + MUL a4, c01, t1 + MUL a4, c02, t2 + MUL a4, c03, t3 + MUL a4, c04, t4 + + SUB c13, t1, c13 + SUB c14, t2, c14 + SUB c15, t3, c15 + SUB c16, t4, c16 + + LD b1, 5 * SIZE(BO) + LD b2, 6 * SIZE(BO) + LD b3, 7 * SIZE(BO) + + MUL b1, c05, c05 + MUL b1, c06, c06 + MUL b1, c07, c07 + MUL b1, c08, c08 + + MUL b2, c05, t1 + MUL b2, c06, t2 + MUL b2, c07, t3 + MUL b2, c08, t4 + + SUB c09, t1, c09 + SUB c10, t2, c10 + SUB c11, t3, c11 + SUB c12, t4, c12 + + MUL b3, c05, t1 + MUL b3, c06, t2 + MUL b3, c07, t3 + MUL b3, c08, t4 + + SUB c13, t1, c13 + SUB c14, t2, c14 + SUB c15, t3, c15 + SUB c16, t4, c16 + + LD a1, 10 * SIZE(BO) + LD a2, 11 * SIZE(BO) + LD a3, 15 * SIZE(BO) + + MUL a1, c09, c09 + MUL a1, c10, c10 + MUL a1, c11, c11 + MUL a1, c12, c12 + + MUL a2, c09, t1 + MUL a2, c10, t2 + MUL a2, c11, t3 + MUL a2, c12, t4 + + SUB c13, t1, c13 + SUB c14, t2, c14 + SUB c15, t3, c15 + SUB c16, t4, c16 + + MUL a3, c13, c13 + MUL a3, c14, c14 + MUL a3, c15, c15 + MUL a3, c16, c16 +#endif + +#ifdef RT + LD a1, 15 * SIZE(BO) + LD a2, 14 * SIZE(BO) + LD a3, 13 * SIZE(BO) + LD a4, 12 * SIZE(BO) + + MUL a1, c13, c13 + MUL a1, c14, c14 + MUL a1, c15, c15 + MUL a1, c16, c16 + + MUL a2, c13, t1 + MUL a2, c14, t2 + MUL a2, c15, t3 + MUL a2, c16, t4 + + SUB c09, t1, c09 + SUB c10, t2, c10 + SUB c11, t3, c11 + SUB c12, t4, c12 + + MUL a3, c13, t1 + MUL a3, c14, t2 + MUL a3, c15, t3 + MUL a3, c16, t4 + + SUB c05, t1, c05 + SUB c06, t2, c06 + SUB c07, t3, c07 + SUB c08, t4, c08 + + MUL a4, c13, t1 + MUL a4, c14, t2 + MUL a4, c15, t3 + MUL a4, c16, t4 + + SUB c01, t1, c01 + SUB c02, t2, c02 + SUB c03, t3, c03 + SUB c04, t4, c04 + + LD b1, 10 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 8 * SIZE(BO) + + MUL b1, c09, c09 + MUL b1, c10, c10 + MUL b1, c11, c11 + MUL b1, c12, c12 + + MUL b2, c09, t1 + MUL b2, c10, t2 + MUL b2, c11, t3 + MUL b2, c12, t4 + + SUB c05, t1, c05 + SUB c06, t2, c06 + SUB c07, t3, c07 + SUB c08, t4, c08 + + MUL b3, c09, t1 + MUL b3, c10, t2 + MUL b3, c11, t3 + MUL b3, c12, t4 + + SUB c01, t1, c01 + SUB c02, t2, c02 + SUB c03, t3, c03 + SUB c04, t4, c04 + + LD a1, 5 * SIZE(BO) + LD a2, 4 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, c05 + MUL a1, c06, c06 + MUL a1, c07, c07 + MUL a1, c08, c08 + + MUL a2, c05, t1 + MUL a2, c06, t2 + MUL a2, c07, t3 + MUL a2, c08, t4 + + SUB c01, t1, c01 + SUB c02, t2, c02 + SUB c03, t3, c03 + SUB c04, t4, c04 + + MUL a3, c01, c01 + MUL a3, c02, c02 + MUL a3, c03, c03 + MUL a3, c04, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c13, 3 * SIZE(BO) + + ST c02, 4 * SIZE(BO) + ST c06, 5 * SIZE(BO) + ST c10, 6 * SIZE(BO) + ST c14, 7 * SIZE(BO) + + ST c03, 8 * SIZE(BO) + ST c07, 9 * SIZE(BO) + ST c11, 10 * SIZE(BO) + ST c15, 11 * SIZE(BO) + + ST c04, 12 * SIZE(BO) + ST c08, 13 * SIZE(BO) + ST c12, 14 * SIZE(BO) + ST c16, 15 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * 
SIZE(AO) + + ST c05, 4 * SIZE(AO) + ST c06, 5 * SIZE(AO) + ST c07, 6 * SIZE(AO) + ST c08, 7 * SIZE(AO) + + ST c09, 8 * SIZE(AO) + ST c10, 9 * SIZE(AO) + ST c11, 10 * SIZE(AO) + ST c12, 11 * SIZE(AO) + + ST c13, 12 * SIZE(AO) + ST c14, 13 * SIZE(AO) + ST c15, 14 * SIZE(AO) + ST c16, 15 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -4 * SIZE(C1) + lda C2, -4 * SIZE(C2) + lda C3, -4 * SIZE(C3) + lda C4, -4 * SIZE(C4) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + + ST c05, 0 * SIZE(C2) + ST c06, 1 * SIZE(C2) + ST c07, 2 * SIZE(C2) + ST c08, 3 * SIZE(C2) + + ST c09, 0 * SIZE(C3) + ST c10, 1 * SIZE(C3) + ST c11, 2 * SIZE(C3) + ST c12, 3 * SIZE(C3) + + ST c13, 0 * SIZE(C4) + ST c14, 1 * SIZE(C4) + ST c15, 2 * SIZE(C4) + ST c16, 3 * SIZE(C4) + +#ifndef LN + lda C1, 4 * SIZE(C1) + lda C2, 4 * SIZE(C2) + lda C3, 4 * SIZE(C3) + lda C4, 4 * SIZE(C4) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 2 + BASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, BASE_SHIFT + 2, TMP1 + addq AO, TMP1, AO + addq BO, TMP1, BO +#endif + +#ifdef LT + addq KK, 4, KK +#endif + +#ifdef LN + subq KK, 4, KK +#endif + + lda I, -1(I) + + bgt I, $L11 + .align 4 + +$L39: +#ifdef LN + sll K, 2 + BASE_SHIFT, TMP1 + addq B, TMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addq KK, 4, KK +#endif + +#ifdef RT + subq KK, 4, KK +#endif + lda J, -1(J) + bgt J, $L01 + .align 4 + +$L40: + and N, 2, J + ble J, $L80 + +#ifdef RT + sll K, 1 + BASE_SHIFT, TMP1 + subq B, TMP1, B + + addq LDC, LDC, TMP1 + subq C, TMP1, C +#endif + + mov C, C1 + addq C, LDC, C2 + fclr t1 +#ifndef RT + addq C2, LDC, C +#endif + fclr t2 + +#ifdef LN + addq M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + fclr t3 + fclr t4 + + and M, 1, I + ble I, $L60 + +#if defined(LT) || defined(RN) + + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(B) + fclr c02 + LD b2, 1 * SIZE(B) + fclr c06 + + lda L, -2(KK) + + LD b3, 2 * SIZE(B) + lda AO, 1 * SIZE(AO) + LD b4, 3 * SIZE(B) + lda BO, 2 * SIZE(B) + + ble KK, $L78 + + ble L, $L75 +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TMP1 + addq AORIG, TMP1, AO + sll KK, BASE_SHIFT + 1, TMP1 + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(BO) + fclr c02 + LD b2, 1 * SIZE(BO) + fclr c06 + + lda L, -2(TMP1) + + LD b3, 2 * SIZE(BO) + lda AO, 1 * SIZE(AO) + LD b4, 3 * SIZE(BO) + lda BO, 2 * SIZE(BO) + + ble TMP1, $L78 + + ble L, $L75 +#endif + .align 4 + +$L72: + ADD c01, t1, c01 + lda L, -2(L) + MUL a1, b1, t1 + LD b1, 2 * SIZE(BO) + + ADD c05, t2, c05 + MUL a1, b2, t2 + LD a1, 1 * SIZE(AO) + LD b2, 3 * SIZE(BO) + + ADD c02, t3, c02 + lda AO, 2 * SIZE(AO) + MUL a2, b3, t3 + LD b3, 4 * SIZE(BO) + + ADD c06, t4, c06 + MUL a2, b4, t4 + LD a2, 0 * SIZE(AO) + LD b4, 5 * SIZE(BO) + + lda BO, 4 * SIZE(BO) + unop + unop + bgt L, $L72 + .align 4 + +$L75: + ADD c01, t1, c01 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L77 +#else + blbs TMP1, $L77 +#endif + .align 4 + + ADD c05, t2, c05 + MUL a1, b2, t2 + LD a1, 0 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + ADD c01, t1, c01 + LD b2, 1 * SIZE(BO) + lda AO, 1 * SIZE(AO) + MUL a1, b1, t1 + lda BO, 2 * SIZE(BO) + .align 4 + +$L77: + ADD c05, t2, c05 
+ MUL a1, b2, t2 + ADD c02, t3, c02 + ADD c06, t4, c06 + + ADD c01, c02, c01 + lda AO, 1 * SIZE(AO) + ADD c05, c06, c05 + lda BO, 2 * SIZE(BO) + + ADD c01, t1, c01 + ADD c05, t2, c05 + + .align 4 + +$L78: +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 1, TMP1 +#else + subq KK, 2, TMP1 +#endif + sll TMP1, BASE_SHIFT + 0, TMP2 + addq AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addq B, TMP2, BO +#else + lda AO, -1 * SIZE(AO) + lda BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c05, c05 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c05, c05 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + + MUL a1, c01, c01 + MUL a1, c05, c05 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 3 * SIZE(BO) + + MUL a1, c01, c01 + MUL a2, c01, t1 + SUB c05, t1, c05 + MUL a3, c05, c05 +#endif + +#ifdef RT + LD a1, 3 * SIZE(BO) + LD a2, 2 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, c05 + MUL a2, c05, t1 + SUB c01, t1, c01 + MUL a3, c01, c01 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c05, 1 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -1 * SIZE(C1) + lda C2, -1 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c05, 0 * SIZE(C2) + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 0 + BASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, BASE_SHIFT + 0, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 1, KK +#endif + +#ifdef LN + subq KK, 1, KK +#endif + .align 4 + +$L60: + and M, 2, I + ble I, $L70 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(B) + lda L, -2(KK) + LD b2, 1 * SIZE(B) + lda AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + lda BO, 2 * SIZE(B) + + ble KK, $L68 + + ble L, $L65 +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TMP1 + addq AORIG, TMP1, AO + sll KK, BASE_SHIFT + 1, TMP1 + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(BO) + lda L, -2(TMP1) + LD b2, 1 * SIZE(BO) + lda AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + lda BO, 2 * SIZE(BO) + + ble TMP1, $L68 + + ble L, $L65 +#endif + .align 4 + +$L62: + ADD c01, t1, c01 + unop + MUL a1, b1, t1 + unop + + ADD c02, t2, c02 + lda AO, 4 * SIZE(AO) + MUL a2, b1, t2 + LD b1, 2 * SIZE(BO) + + ADD c05, t3, c05 + lda L, -2(L) + MUL a1, b2, t3 + LD a1, -2 * SIZE(AO) + + ADD c06, t4, c06 + unop + MUL a2, b2, t4 + LD a2, -1 * SIZE(AO) + + ADD c01, t1, c01 + LD b2, 3 * SIZE(BO) + MUL a3, b3, t1 + lda BO, 4 * SIZE(BO) + + ADD c02, t2, c02 + unop + MUL a4, b3, t2 + LD b3, 0 * SIZE(BO) + + ADD c05, t3, c05 + unop + MUL a3, b4, t3 + LD a3, 0 * SIZE(AO) + + ADD c06, t4, c06 + MUL a4, b4, t4 + LD b4, 1 * SIZE(BO) + unop + + LD a4, 1 * SIZE(AO) + unop + unop + bgt L, $L62 + .align 4 + +$L65: + ADD c01, t1, c01 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L67 +#else + blbs TMP1, $L67 +#endif + .align 4 + + ADD c02, t2, c02 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD c05, t3, c05 + 
lda BO, 2 * SIZE(BO) + MUL a1, b2, t3 + LD a1, 0 * SIZE(AO) + + ADD c06, t4, c06 + unop + MUL a2, b2, t4 + LD a2, 1 * SIZE(AO) + + ADD c01, t1, c01 + LD b2, -1 * SIZE(BO) + MUL a1, b1, t1 + lda AO, 2 * SIZE(AO) + .align 4 + +$L67: + ADD c02, t2, c02 + MUL a2, b1, t2 + ADD c05, t3, c05 + MUL a1, b2, t3 + + ADD c06, t4, c06 + lda AO, 2 * SIZE(AO) + MUL a2, b2, t4 + lda BO, 2 * SIZE(BO) + + ADD c01, t1, c01 + ADD c02, t2, c02 + ADD c05, t3, c05 + ADD c06, t4, c06 + .align 4 + +$L68: +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 2, TMP1 +#else + subq KK, 2, TMP1 +#endif + sll TMP1, BASE_SHIFT + 1, TMP2 + addq AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addq B, TMP2, BO +#else + lda AO, -2 * SIZE(AO) + lda BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c05, c05 + SUB a3, c02, c02 + SUB a4, c06, c06 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c05, c05 + SUB a4, c06, c06 +#endif + +#ifdef LN + LD a1, 3 * SIZE(AO) + LD a2, 2 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, c02 + MUL a1, c06, c06 + + MUL a2, c02, t1 + MUL a2, c06, t2 + + SUB c01, t1, c01 + SUB c05, t2, c05 + + MUL a3, c01, c01 + MUL a3, c05, c05 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 3 * SIZE(AO) + + MUL a1, c01, c01 + MUL a1, c05, c05 + + MUL a2, c01, t1 + MUL a2, c05, t2 + + SUB c02, t1, c02 + SUB c06, t2, c06 + + MUL a3, c02, c02 + MUL a3, c06, c06 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 3 * SIZE(BO) + + MUL a1, c01, c01 + MUL a1, c02, c02 + + MUL a2, c01, t1 + MUL a2, c02, t2 + + SUB c05, t1, c05 + SUB c06, t2, c06 + + MUL a3, c05, c05 + MUL a3, c06, c06 +#endif + +#ifdef RT + LD a1, 3 * SIZE(BO) + LD a2, 2 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, c05 + MUL a1, c06, c06 + + MUL a2, c05, t1 + MUL a2, c06, t2 + + SUB c01, t1, c01 + SUB c02, t2, c02 + + MUL a3, c01, c01 + MUL a3, c02, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c02, 2 * SIZE(BO) + ST c06, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c05, 2 * SIZE(AO) + ST c06, 3 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -2 * SIZE(C1) + lda C2, -2 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c05, 0 * SIZE(C2) + ST c06, 1 * SIZE(C2) + +#ifndef LN + lda C1, 2 * SIZE(C1) + lda C2, 2 * SIZE(C2) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 1 + BASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, BASE_SHIFT + 1, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 2, KK +#endif + +#ifdef LN + subq KK, 2, KK +#endif + .align 4 + +$L70: + sra M, 2, I + ble I, $L79 + .align 4 + +$L51: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c03 + LD a2, 1 * SIZE(AO) + fclr c07 + LD a3, 2 * SIZE(AO) + fclr c04 + LD a4, 3 * SIZE(AO) + fclr c08 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c05 + LD b3, 2 * SIZE(B) + fclr c02 + LD b4, 3 * SIZE(B) + fclr c06 + + lda L, -2(KK) + + lda BO, 2 * SIZE(B) + lda AO, 4 * SIZE(AO) + + ble KK, $L58 + + ble L, $L55 +#else +#ifdef LN + sll K, BASE_SHIFT + 2, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 2, TMP1 + addq AORIG, TMP1, AO + sll KK, 
BASE_SHIFT + 1, TMP1 + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c03 + LD a2, 1 * SIZE(AO) + fclr c07 + LD a3, 2 * SIZE(AO) + fclr c04 + LD a4, 3 * SIZE(AO) + fclr c08 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c05 + LD b3, 2 * SIZE(BO) + fclr c02 + LD b4, 3 * SIZE(BO) + fclr c06 + + lda L, -2(TMP1) + lda BO, 2 * SIZE(BO) + lda AO, 4 * SIZE(AO) + + ble TMP1, $L58 + + ble L, $L55 +#endif + .align 4 + +$L52: + ADD c05, t1, c05 + unop + MUL a1, b1, t1 + unop + + ADD c06, t2, c06 + lda L, -2(L) + MUL a2, b1, t2 + unop + + ADD c07, t3, c07 + unop + MUL a3, b1, t3 + unop + + ADD c08, t4, c08 + unop + MUL a4, b1, t4 + LD b1, 2 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD c02, t2, c02 + lda BO, 4 * SIZE(BO) + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, c04 + unop + MUL a4, b2, t4 + LD a5, 3 * SIZE(AO) + + ADD c05, t1, c05 + unop + MUL a1, b3, t1 + LD b2, -1 * SIZE(BO) + + ADD c06, t2, c06 + unop + MUL a2, b3, t2 + unop + + ADD c07, t3, c07 + unop + MUL a3, b3, t3 + lda AO, 8 * SIZE(AO) + + ADD c08, t4, c08 + unop + MUL a5, b3, t4 + LD b3, 0 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b4, t1 + LD a1, -4 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b4, t2 + LD a2, -3 * SIZE(AO) + + ADD c03, t3, c03 + LD a4, -1 * SIZE(AO) + MUL a3, b4, t3 + LD a3, -2 * SIZE(AO) + + ADD c04, t4, c04 + MUL a5, b4, t4 + LD b4, 1 * SIZE(BO) + bgt L, $L52 + .align 4 + +$L55: + ADD c05, t1, c05 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L57 +#else + blbs TMP1, $L57 +#endif + .align 4 + + ADD c06, t2, c06 + MUL a2, b1, t2 + ADD c07, t3, c07 + MUL a3, b1, t3 + + ADD c08, t4, c08 + unop + MUL a4, b1, t4 + LD b1, 0 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b2, t4 + LD a4, 3 * SIZE(AO) + lda AO, 4 * SIZE(AO) + + ADD c05, t1, c05 + LD b2, 1 * SIZE(BO) + MUL a1, b1, t1 + lda BO, 2 * SIZE(BO) + .align 4 + +$L57: + ADD c06, t2, c06 + MUL a2, b1, t2 + ADD c07, t3, c07 + MUL a3, b1, t3 + + ADD c08, t4, c08 + MUL a4, b1, t4 + ADD c01, t1, c01 + MUL a1, b2, t1 + + ADD c02, t2, c02 + MUL a2, b2, t2 + ADD c03, t3, c03 + MUL a3, b2, t3 + + ADD c04, t4, c04 + lda AO, 4 * SIZE(AO) + MUL a4, b2, t4 + lda BO, 2 * SIZE(BO) + + ADD c05, t1, c05 + ADD c06, t2, c06 + ADD c07, t3, c07 + ADD c08, t4, c08 + .align 4 + +$L58: +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 4, TMP1 +#else + subq KK, 2, TMP1 +#endif + sll TMP1, BASE_SHIFT + 2, TMP2 + addq AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addq B, TMP2, BO +#else + lda AO, -4 * SIZE(AO) + lda BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + LD b1, 4 * SIZE(BO) + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c05, c05 + SUB a3, c02, c02 + SUB a4, c06, c06 + + SUB b1, c03, c03 + SUB b2, c07, c07 + SUB b3, c04, c04 + SUB b4, c08, c08 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 4 * SIZE(AO) + LD b2, 5 * SIZE(AO) + LD b3, 6 * SIZE(AO) + LD b4, 7 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 + + SUB b1, c05, c05 + SUB b2, c06, c06 + SUB b3, c07, c07 
+ SUB b4, c08, c08 +#endif + +#ifdef LN + LD a1, 15 * SIZE(AO) + LD a2, 14 * SIZE(AO) + LD a3, 13 * SIZE(AO) + LD a4, 12 * SIZE(AO) + + MUL a1, c04, c04 + MUL a1, c08, c08 + + MUL a2, c04, t1 + MUL a2, c08, t2 + + SUB c03, t1, c03 + SUB c07, t2, c07 + + MUL a3, c04, t1 + MUL a3, c08, t2 + + SUB c02, t1, c02 + SUB c06, t2, c06 + + MUL a4, c04, t1 + MUL a4, c08, t2 + + SUB c01, t1, c01 + SUB c05, t2, c05 + + LD b1, 10 * SIZE(AO) + LD b2, 9 * SIZE(AO) + LD b3, 8 * SIZE(AO) + + MUL b1, c03, c03 + MUL b1, c07, c07 + + MUL b2, c03, t1 + MUL b2, c07, t2 + + SUB c02, t1, c02 + SUB c06, t2, c06 + + MUL b3, c03, t1 + MUL b3, c07, t2 + + SUB c01, t1, c01 + SUB c05, t2, c05 + + LD a1, 5 * SIZE(AO) + LD a2, 4 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, c02 + MUL a1, c06, c06 + + MUL a2, c02, t1 + MUL a2, c06, t2 + + SUB c01, t1, c01 + SUB c05, t2, c05 + + MUL a3, c01, c01 + MUL a3, c05, c05 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a1, c01, c01 + MUL a1, c05, c05 + + MUL a2, c01, t1 + MUL a2, c05, t2 + + SUB c02, t1, c02 + SUB c06, t2, c06 + + MUL a3, c01, t1 + MUL a3, c05, t2 + + SUB c03, t1, c03 + SUB c07, t2, c07 + + MUL a4, c01, t1 + MUL a4, c05, t2 + + SUB c04, t1, c04 + SUB c08, t2, c08 + + LD b1, 5 * SIZE(AO) + LD b2, 6 * SIZE(AO) + LD b3, 7 * SIZE(AO) + + MUL b1, c02, c02 + MUL b1, c06, c06 + + MUL b2, c02, t1 + MUL b2, c06, t2 + + SUB c03, t1, c03 + SUB c07, t2, c07 + + MUL b3, c02, t1 + MUL b3, c06, t2 + + SUB c04, t1, c04 + SUB c08, t2, c08 + + LD a1, 10 * SIZE(AO) + LD a2, 11 * SIZE(AO) + LD a3, 15 * SIZE(AO) + + MUL a1, c03, c03 + MUL a1, c07, c07 + + MUL a2, c03, t1 + MUL a2, c07, t2 + + SUB c04, t1, c04 + SUB c08, t2, c08 + + MUL a3, c04, c04 + MUL a3, c08, c08 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 3 * SIZE(BO) + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 + + MUL a2, c01, t1 + MUL a2, c02, t2 + MUL a2, c03, t3 + MUL a2, c04, t4 + + SUB c05, t1, c05 + SUB c06, t2, c06 + SUB c07, t3, c07 + SUB c08, t4, c08 + + MUL a3, c05, c05 + MUL a3, c06, c06 + MUL a3, c07, c07 + MUL a3, c08, c08 +#endif + +#ifdef RT + LD a1, 3 * SIZE(BO) + LD a2, 2 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, c05 + MUL a1, c06, c06 + MUL a1, c07, c07 + MUL a1, c08, c08 + + MUL a2, c05, t1 + MUL a2, c06, t2 + MUL a2, c07, t3 + MUL a2, c08, t4 + + SUB c01, t1, c01 + SUB c02, t2, c02 + SUB c03, t3, c03 + SUB c04, t4, c04 + + MUL a3, c01, c01 + MUL a3, c02, c02 + MUL a3, c03, c03 + MUL a3, c04, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c02, 2 * SIZE(BO) + ST c06, 3 * SIZE(BO) + + ST c03, 4 * SIZE(BO) + ST c07, 5 * SIZE(BO) + ST c04, 6 * SIZE(BO) + ST c08, 7 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) + + ST c05, 4 * SIZE(AO) + ST c06, 5 * SIZE(AO) + ST c07, 6 * SIZE(AO) + ST c08, 7 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -4 * SIZE(C1) + lda C2, -4 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + + ST c05, 0 * SIZE(C2) + ST c06, 1 * SIZE(C2) + ST c07, 2 * SIZE(C2) + ST c08, 3 * SIZE(C2) + +#ifndef LN + lda C1, 4 * SIZE(C1) + lda C2, 4 * SIZE(C2) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 2 + BASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, BASE_SHIFT + 2, TMP2 + addq AO, TMP2, AO + sll TMP1, 
BASE_SHIFT + 1, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 4, KK +#endif + +#ifdef LN + subq KK, 4, KK +#endif + + lda I, -1(I) + + bgt I, $L51 + .align 4 + +$L79: +#ifdef LN + sll K, 1 + BASE_SHIFT, TMP1 + addq B, TMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addq KK, 2, KK +#endif + +#ifdef RT + subq KK, 2, KK +#endif + .align 4 + +$L80: + and N, 1, J + ble J, $L999 + +#ifdef RT + sll K, BASE_SHIFT, TMP1 + subq B, TMP1, B + + subq C, LDC, C +#endif + + mov C, C1 +#ifndef RT + addq C, LDC, C +#endif + +#ifdef LN + addq M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + and M, 1, I + ble I, $L100 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c04 + + sra KK, 2, L + mov B, BO + unop + ble L, $L115 +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TMP1 + addq AORIG, TMP1, AO + sll KK, BASE_SHIFT + 0, TMP1 + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c04 + + sra TMP1, 2, L + unop + ble L, $L115 +#endif + .align 4 + +$L112: + ADD c01, t1, c01 + MUL a1, b1, t1 + LD a1, 4 * SIZE(AO) + LD b1, 4 * SIZE(BO) + + ADD c02, t2, c02 + MUL a2, b2, t2 + LD a2, 5 * SIZE(AO) + LD b2, 5 * SIZE(BO) + + ADD c03, t3, c03 + MUL a3, b3, t3 + LD a3, 6 * SIZE(AO) + LD b3, 6 * SIZE(BO) + + ADD c04, t4, c04 + MUL a4, b4, t4 + LD a4, 7 * SIZE(AO) + LD b4, 7 * SIZE(BO) + + lda L, -1(L) + lda AO, 4 * SIZE(AO) + lda BO, 4 * SIZE(BO) + bgt L, $L112 + .align 4 + +$L115: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TMP1, 3, L +#endif + ble L, $L118 + .align 4 + +$L116: + ADD c01, t1, c01 + MUL a1, b1, t1 + LD a1, 1 * SIZE(AO) + LD b1, 1 * SIZE(BO) + + lda L, -1(L) + lda AO, 1 * SIZE(AO) + lda BO, 1 * SIZE(BO) + bgt L, $L116 + .align 4 + +$L118: + ADD c01, t1, c01 + ADD c02, t2, c02 + ADD c03, t3, c03 + ADD c04, t4, c04 + + ADD c01, c02, c01 + ADD c03, c04, c03 + ADD c01, c03, c01 + +#if defined(LN) || defined(RT) + subq KK, 1, TMP1 + sll TMP1, BASE_SHIFT + 0, TMP2 + addq AORIG, TMP2, AO + addq B, TMP2, BO +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + + SUB a1, c01, c01 +#else + LD a1, 0 * SIZE(AO) + + SUB a1, c01, c01 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + + MUL a1, c01, c01 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + + MUL a1, c01, c01 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -1 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + +#ifndef LN + lda C1, 1 * SIZE(C1) +#endif + +#ifdef RT + SXADDQ K, AORIG, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, BASE_SHIFT + 0, TMP2 + addq AO, TMP2, AO + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 1, KK +#endif + +#ifdef LN + subq KK, 1, KK +#endif + .align 4 + +$L100: + and M, 2, I + ble I, $L110 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD 
a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c04 + + sra KK, 2, L + mov B, BO + ble L, $L105 +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TMP1 + addq AORIG, TMP1, AO + sll KK, BASE_SHIFT + 0, TMP1 + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c04 + + sra TMP1, 2, L + ble L, $L105 +#endif + .align 5 + +$L102: + ADD c01, t1, c01 + lda L, -1(L) + MUL a1, b1, t1 + LD a1, 4 * SIZE(AO) + + ADD c02, t2, c02 + MUL a2, b1, t2 + LD a2, 5 * SIZE(AO) + LD b1, 4 * SIZE(BO) + + ADD c03, t3, c03 + lda BO, 4 * SIZE(BO) + MUL a3, b2, t3 + LD a3, 6 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b2, t4 + LD a5, 7 * SIZE(AO) + LD b2, 1 * SIZE(BO) + + ADD c01, t1, c01 + MUL a1, b3, t1 + LD a1, 8 * SIZE(AO) + lda AO, 8 * SIZE(AO) + + ADD c02, t2, c02 + MUL a2, b3, t2 + LD b3, 2 * SIZE(BO) + LD a2, 1 * SIZE(AO) + + ADD c03, t3, c03 + LD a4, 3 * SIZE(AO) + MUL a3, b4, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, c04 + MUL a5, b4, t4 + LD b4, 3 * SIZE(BO) + bgt L, $L102 + .align 4 + +$L105: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TMP1, 3, L +#endif + ble L, $L108 + .align 4 + +$L106: + ADD c01, t1, c01 + lda L, -1(L) + MUL a1, b1, t1 + LD a1, 2 * SIZE(AO) + + ADD c02, t2, c02 + MUL a2, b1, t2 + LD a2, 3 * SIZE(AO) + LD b1, 1 * SIZE(BO) + + lda AO, 2 * SIZE(AO) + unop + lda BO, 1 * SIZE(BO) + bgt L, $L106 + .align 4 + +$L108: + ADD c01, t1, c01 + ADD c02, t2, c02 + ADD c03, t3, c03 + ADD c04, t4, c04 + + ADD c01, c03, c01 + ADD c02, c04, c02 + +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 2, TMP1 +#else + subq KK, 1, TMP1 +#endif + sll TMP1, BASE_SHIFT + 1, TMP2 + addq AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addq B, TMP2, BO +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, c02 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 +#endif + +#ifdef LN + LD a1, 3 * SIZE(AO) + LD a2, 2 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, c02 + MUL a2, c02, t1 + SUB c01, t1, c01 + MUL a3, c01, c01 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 3 * SIZE(AO) + + MUL a1, c01, c01 + MUL a2, c01, t1 + SUB c02, t1, c02 + MUL a3, c02, c02 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + + MUL a1, c01, c01 + MUL a1, c02, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -2 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + +#ifndef LN + lda C1, 2 * SIZE(C1) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 1 + BASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, BASE_SHIFT + 1, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 2, KK +#endif + +#ifdef LN + subq KK, 2, KK +#endif + .align 4 + +$L110: + sra M, 2, I + ble I, $L119 + .align 4 + +$L91: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + 
LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c04 + + sra KK, 2, L + mov B, BO + ble L, $L95 + +#else +#ifdef LN + sll K, BASE_SHIFT + 2, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 2, TMP1 + addq AORIG, TMP1, AO + sll KK, BASE_SHIFT + 0, TMP1 + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c04 + + sra TMP1, 2, L + unop + ble L, $L95 +#endif + .align 5 + +$L92: + ADD c01, t1, c01 + unop + MUL a1, b1, t1 + LD a1, 4 * SIZE(AO) + + ADD c02, t2, c02 + lda L, -1(L) + MUL a2, b1, t2 + LD a2, 5 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b1, t3 + LD a3, 6 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b1, t4 + LD a4, 7 * SIZE(AO) + LD b1, 4 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 8 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b2, t2 + LD a2, 9 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 10 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b2, t4 + LD a4, 11 * SIZE(AO) + LD b2, 5 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b3, t1 + LD a1, 12 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b3, t2 + LD a2, 13 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b3, t3 + LD a3, 14 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b3, t4 + LD a5, 15 * SIZE(AO) + LD b3, 6 * SIZE(BO) + + ADD c01, t1, c01 + MUL a1, b4, t1 + LD a1, 16 * SIZE(AO) + lda AO, 16 * SIZE(AO) + + ADD c02, t2, c02 + lda BO, 4 * SIZE(BO) + MUL a2, b4, t2 + LD a2, 1 * SIZE(AO) + + ADD c03, t3, c03 + LD a4, 3 * SIZE(AO) + MUL a3, b4, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, c04 + MUL a5, b4, t4 + LD b4, 3 * SIZE(BO) + bgt L, $L92 + .align 4 + +$L95: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TMP1, 3, L +#endif + unop + ble L, $L98 + .align 4 + +$L96: + ADD c01, t1, c01 + lda L, -1(L) + MUL a1, b1, t1 + LD a1, 4 * SIZE(AO) + + ADD c02, t2, c02 + lda BO, 1 * SIZE(BO) + MUL a2, b1, t2 + LD a2, 5 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b1, t3 + LD a3, 6 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b1, t4 + LD a4, 7 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + lda AO, 4 * SIZE(AO) + bgt L, $L96 + .align 4 + +$L98: + ADD c01, t1, c01 + ADD c02, t2, c02 + ADD c03, t3, c03 + ADD c04, t4, c04 + +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 4, TMP1 +#else + subq KK, 1, TMP1 +#endif + sll TMP1, BASE_SHIFT + 2, TMP2 + addq AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addq B, TMP2, BO +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 +#endif + +#ifdef LN + LD a1, 15 * SIZE(AO) + LD a2, 14 * SIZE(AO) + LD a3, 13 * SIZE(AO) + LD a4, 12 * SIZE(AO) + + MUL a1, c04, c04 + MUL a2, c04, t1 + SUB c03, t1, c03 + MUL a3, c04, t1 + SUB c02, t1, c02 + MUL a4, c04, t1 + SUB c01, t1, c01 + + LD b1, 10 * SIZE(AO) + LD b2, 9 * SIZE(AO) + LD b3, 8 * SIZE(AO) + + MUL b1, c03, c03 + MUL b2, c03, t1 + SUB c02, t1, c02 + MUL b3, c03, t1 + 
SUB c01, t1, c01 + + LD a1, 5 * SIZE(AO) + LD a2, 4 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, c02 + MUL a2, c02, t1 + SUB c01, t1, c01 + MUL a3, c01, c01 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a1, c01, c01 + MUL a2, c01, t1 + SUB c02, t1, c02 + MUL a3, c01, t1 + SUB c03, t1, c03 + MUL a4, c01, t1 + SUB c04, t1, c04 + + LD b1, 5 * SIZE(AO) + LD b2, 6 * SIZE(AO) + LD b3, 7 * SIZE(AO) + + MUL b1, c02, c02 + MUL b2, c02, t1 + SUB c03, t1, c03 + MUL b3, c02, t1 + SUB c04, t1, c04 + + LD a1, 10 * SIZE(AO) + LD a2, 11 * SIZE(AO) + LD a3, 15 * SIZE(AO) + + MUL a1, c03, c03 + MUL a2, c03, t1 + SUB c04, t1, c04 + MUL a3, c04, c04 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) + ST c03, 2 * SIZE(BO) + ST c04, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -4 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + +#ifndef LN + lda C1, 4 * SIZE(C1) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 2 + BASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, BASE_SHIFT + 2, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 4, KK +#endif + +#ifdef LN + subq KK, 4, KK +#endif + + lda I, -1(I) + bgt I, $L91 + .align 4 + +$L119: +#ifdef LN + SXADDQ K, B, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addq KK, 1, KK +#endif + +#ifdef RT + subq KK, 1, KK +#endif + .align 4 + +$L999: + ldt $f2, 0($sp) + ldt $f3, 8($sp) + ldt $f4, 16($sp) + ldt $f5, 24($sp) + ldt $f6, 32($sp) + ldt $f7, 40($sp) + ldt $f8, 48($sp) + ldt $f9, 56($sp) + clr $0 + lda $sp, STACKSIZE($sp) + ret + EPILOGUE diff --git a/kernel/alpha/trsm_kernel_4x4_LT.S b/kernel/alpha/trsm_kernel_4x4_LT.S new file mode 100644 index 0000000000..2848d26652 --- /dev/null +++ b/kernel/alpha/trsm_kernel_4x4_LT.S @@ -0,0 +1,4066 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#if !defined(EV4) && !defined(EV5) && !defined(EV6) +#error "Architecture is not specified." +#endif + +#ifdef EV6 +#define PREFETCHSIZE 56 +#define UNOP unop +#endif + +#ifdef EV5 +#define PREFETCHSIZE 56 +#define UNOP +#endif + +#ifdef EV4 +#define UNOP +#endif + +#define STACKSIZE 80 + +#define M $16 +#define N $17 +#define K $18 +#define A $20 +#define B $21 +#define C $22 +#define LDC $23 + +#define C1 $19 +#define C2 $24 +#define C3 $25 +#define C4 $27 + +#define AO $at +#define BO $5 +#define I $6 +#define J $7 +#define L $8 + +#define a1 $f16 +#define a2 $f17 +#define a3 $f18 +#define a4 $f19 + +#define b1 $f20 +#define b2 $f21 +#define b3 $f22 +#define b4 $f23 + +#define t1 $f24 +#define t2 $f25 +#define t3 $f26 +#define t4 $f27 + +#define a5 $f28 +#define a6 $f30 +#define b5 $f29 + +#define alpha $f30 + +#define c01 $f0 +#define c02 $f1 +#define c03 $f2 +#define c04 $f3 + +#define c05 $f4 +#define c06 $f5 +#define c07 $f6 +#define c08 $f7 + +#define c09 $f8 +#define c10 $f9 +#define c11 $f10 +#define c12 $f11 + +#define c13 $f12 +#define c14 $f13 +#define c15 $f14 +#define c16 $f15 + +#define TMP1 $0 +#define TMP2 $1 +#define KK $2 +#define AORIG $3 +#define OFFSET $4 + + PROLOGUE + PROFCODE + .frame $sp, STACKSIZE, $26, 0 + + lda $sp, -STACKSIZE($sp) + + ldq C, 0 + STACKSIZE($sp) + ldq LDC, 8 + STACKSIZE($sp) + ldq OFFSET, 16 + STACKSIZE($sp) + + SXADDQ LDC, 0, LDC + + stt $f2, 0($sp) + stt $f3, 8($sp) + stt $f4, 16($sp) + stt $f5, 24($sp) + stt $f6, 32($sp) + stt $f7, 40($sp) + stt $f8, 48($sp) + stt $f9, 56($sp) + + cmple M, 0, $0 + cmple N, 0, $1 + cmple K, 0, $2 + + or $0, $1, $0 + or $0, $2, $0 + bne $0, $L999 + +#ifdef LN + mulq M, K, TMP1 + SXADDQ TMP1, A, A + SXADDQ M, C, C +#endif + +#ifdef RN + negq OFFSET, KK +#endif + +#ifdef RT + mulq N, K, TMP1 + SXADDQ TMP1, B, B + + mulq N, LDC, TMP1 + addq TMP1, C, C + + subq N, OFFSET, KK +#endif + + sra N, 2, J + ble J, $L40 + .align 4 + +$L01: +#ifdef RT + sll K, 2 + BASE_SHIFT, TMP1 + subq B, TMP1, B + + s4addq LDC, 0, TMP1 + subq C, TMP1, C +#endif + + mov C, C1 + addq C, LDC, C2 + addq C2, LDC, C3 +#ifndef RT + s4addq LDC, C, C +#endif + + fclr t1 + addq C3, LDC, C4 + fclr t2 + +#ifdef LN + addq M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + sra M, 2, I + fclr t3 + fclr t4 + ble I, $L20 + .align 4 + +$L11: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c11 + LD a2, 1 * SIZE(AO) + fclr c12 + + LD a3, 2 * SIZE(AO) + fclr c16 + LD a4, 3 * 
SIZE(AO) + fclr c15 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + + LD b3, 2 * SIZE(B) + fclr c06 + LD b4, 3 * SIZE(B) + fclr c05 + + lds $f31, 4 * SIZE(C1) + fclr c03 + lda L, -2(KK) + fclr c04 + + lds $f31, 7 * SIZE(C2) + fclr c08 + lda BO, 4 * SIZE(B) + fclr c13 + + lds $f31, 4 * SIZE(C3) + fclr c09 + lda AO, 4 * SIZE(AO) + fclr c10 + + lds $f31, 7 * SIZE(C4) + fclr c14 + fclr c07 + ble KK, $L18 +#else + +#ifdef LN + sll K, BASE_SHIFT + 2, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 2, TMP1 + addq AORIG, TMP1, AO + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c11 + LD a2, 1 * SIZE(AO) + fclr c12 + + LD a3, 2 * SIZE(AO) + fclr c16 + LD a4, 3 * SIZE(AO) + fclr c15 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + + LD b3, 2 * SIZE(BO) + fclr c06 + LD b4, 3 * SIZE(BO) + fclr c05 + + lds $f31, 4 * SIZE(C1) + fclr c03 + lda L, -2(TMP1) + fclr c04 + + lds $f31, 7 * SIZE(C2) + fclr c08 + lda BO, 4 * SIZE(BO) + fclr c13 + + lds $f31, 4 * SIZE(C3) + fclr c09 + lda AO, 4 * SIZE(AO) + fclr c10 + + lds $f31, 7 * SIZE(C4) + fclr c14 + fclr c07 + ble TMP1, $L18 +#endif + + ble L, $L15 + .align 5 + +$L12: +/* 1 */ + ADD c11, t1, c11 +#ifndef EV4 + ldq $31, PREFETCHSIZE * SIZE(AO) +#else + unop +#endif + MUL b1, a1, t1 +#ifndef EV4 + ldl $31, PREFETCHSIZE * SIZE(BO) +#else + unop +#endif + + ADD c12, t2, c12 + unop + MUL b1, a2, t2 + unop + + ADD c16, t3, c16 + unop + MUL b2, a2, t3 + LD a5, 0 * SIZE(AO) + + ADD c15, t4, c15 + unop + MUL b2, a1, t4 + LD b5, 0 * SIZE(BO) + +/* 2 */ + ADD c01, t1, c01 + UNOP + MUL b1, a3, t1 + UNOP + + ADD c02, t2, c02 + UNOP + MUL b1, a4, t2 + UNOP + + ADD c06, t3, c06 + unop + MUL b2, a4, t3 + unop + + ADD c05, t4, c05 + unop + MUL b4, a1, t4 + unop + +/* 3 */ + ADD c03, t1, c03 + unop + MUL b3, a1, t1 + unop + + ADD c04, t2, c04 + unop + MUL b3, a2, t2 + unop + + ADD c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD c13, t4, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + +/* 4 */ + ADD c09, t1, c09 + unop + MUL b3, a3, t1 + LD a6, 2 * SIZE(AO) + + ADD c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, 3 * SIZE(AO) + + ADD c07, t4, c07 + unop + MUL b4, a3, t4 + LD b4, 3 * SIZE(BO) + +/* 5 */ + ADD c11, t1, c11 + unop + MUL b5, a5, t1 + LD a1, 4 * SIZE(AO) + + ADD c12, t2, c12 + lda L, -2(L) + MUL b5, a2, t2 + LD b1, 4 * SIZE(BO) + + ADD c16, t3, c16 + unop + MUL b2, a2, t3 + unop + + ADD c15, t4, c15 + unop + MUL b2, a5, t4 + unop + +/* 6 */ + ADD c01, t1, c01 + unop + MUL b5, a6, t1 + unop + + ADD c02, t2, c02 + unop + MUL b5, a4, t2 + unop + + ADD c06, t3, c06 + unop + MUL b2, a4, t3 + unop + + ADD c05, t4, c05 + unop + MUL b4, a5, t4 + unop + +/* 7 */ + ADD c03, t1, c03 + lda AO, 8 * SIZE(AO) + MUL b3, a5, t1 + unop + + ADD c04, t2, c04 + lda BO, 8 * SIZE(BO) + MUL b3, a2, t2 + unop + + ADD c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, -3 * SIZE(AO) + + ADD c13, t4, c13 + unop + MUL b2, a6, t4 + LD b2, -3 * SIZE(BO) + +/* 8 */ + ADD c09, t1, c09 + unop + MUL b3, a6, t1 + LD a3, -2 * SIZE(AO) + + ADD c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, -2 * SIZE(BO) + + ADD c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD c07, t4, c07 + MUL b4, a6, t4 + LD b4, -1 * SIZE(BO) + bgt L, $L12 + .align 4 + +$L15: + ADD c11, t1, c11 + MUL b1, a1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L17 +#else + blbs TMP1, $L17 +#endif + .align 4 + + ADD c12, t2, c12 + MUL b1, a2, t2 + ADD c16, t3, c16 
+ MUL b2, a2, t3 + + ADD c15, t4, c15 + MUL b2, a1, t4 + ADD c01, t1, c01 + MUL b1, a3, t1 + + ADD c02, t2, c02 + unop + MUL b1, a4, t2 + LD b1, 0 * SIZE(BO) + + ADD c06, t3, c06 + MUL b2, a4, t3 + ADD c05, t4, c05 + MUL b4, a1, t4 + + ADD c03, t1, c03 + unop + MUL b3, a1, t1 + LD a1, 0 * SIZE(AO) + + ADD c04, t2, c04 + unop + MUL b3, a2, t2 + unop + + ADD c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD c13, t4, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + + ADD c09, t1, c09 + unop + MUL b3, a3, t1 + lda AO, 4 * SIZE(AO) + + ADD c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD c07, t4, c07 + unop + MUL b4, a3, t4 + LD a3, -2 * SIZE(AO) + + ADD c11, t1, c11 + LD b4, 3 * SIZE(BO) + MUL b1, a1, t1 + lda BO, 4 * SIZE(BO) + .align 4 + +$L17: + ADD c12, t2, c12 + MUL b1, a2, t2 + ADD c16, t3, c16 + MUL b2, a2, t3 + + ADD c15, t4, c15 + MUL b2, a1, t4 + ADD c01, t1, c01 + MUL b1, a3, t1 + + ADD c02, t2, c02 + MUL b1, a4, t2 + ADD c06, t3, c06 + MUL b2, a4, t3 + + ADD c05, t4, c05 + MUL b4, a1, t4 + ADD c03, t1, c03 + MUL b3, a1, t1 + + ADD c04, t2, c04 + MUL b3, a2, t2 + ADD c08, t3, c08 + MUL b4, a2, t3 + + ADD c13, t4, c13 + MUL b2, a3, t4 + ADD c09, t1, c09 + MUL b3, a3, t1 + + ADD c10, t2, c10 + MUL b3, a4, t2 + ADD c14, t3, c14 + MUL b4, a4, t3 + + ADD c07, t4, c07 + lda AO, 4 * SIZE(AO) + MUL b4, a3, t4 + lda BO, 4 * SIZE(BO) + + ADD c11, t1, c11 + ADD c12, t2, c12 + ADD c16, t3, c16 + ADD c15, t4, c15 + .align 4 + +$L18: +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 4, TMP1 +#else + subq KK, 4, TMP1 +#endif + sll TMP1, BASE_SHIFT + 2, TMP2 + addq AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addq B, TMP2, BO +#else + lda AO, -4 * SIZE(AO) + lda BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + LD b1, 4 * SIZE(BO) + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c05, c05 + SUB a3, c09, c09 + SUB a4, c13, c13 + + SUB b1, c02, c02 + SUB b2, c06, c06 + SUB b3, c10, c10 + SUB b4, c14, c14 + + LD a1, 8 * SIZE(BO) + LD a2, 9 * SIZE(BO) + LD a3, 10 * SIZE(BO) + LD a4, 11 * SIZE(BO) + + LD b1, 12 * SIZE(BO) + LD b2, 13 * SIZE(BO) + LD b3, 14 * SIZE(BO) + LD b4, 15 * SIZE(BO) + + SUB a1, c03, c03 + SUB a2, c07, c07 + SUB a3, c11, c11 + SUB a4, c15, c15 + + SUB b1, c04, c04 + SUB b2, c08, c08 + SUB b3, c12, c12 + SUB b4, c16, c16 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 4 * SIZE(AO) + LD b2, 5 * SIZE(AO) + LD b3, 6 * SIZE(AO) + LD b4, 7 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 + + SUB b1, c05, c05 + SUB b2, c06, c06 + SUB b3, c07, c07 + SUB b4, c08, c08 + + LD a1, 8 * SIZE(AO) + LD a2, 9 * SIZE(AO) + LD a3, 10 * SIZE(AO) + LD a4, 11 * SIZE(AO) + + LD b1, 12 * SIZE(AO) + LD b2, 13 * SIZE(AO) + LD b3, 14 * SIZE(AO) + LD b4, 15 * SIZE(AO) + + SUB a1, c09, c09 + SUB a2, c10, c10 + SUB a3, c11, c11 + SUB a4, c12, c12 + + SUB b1, c13, c13 + SUB b2, c14, c14 + SUB b3, c15, c15 + SUB b4, c16, c16 +#endif + +#ifdef LN + LD a1, 15 * SIZE(AO) + LD a2, 14 * SIZE(AO) + LD a3, 13 * SIZE(AO) + LD a4, 12 * SIZE(AO) + + MUL a1, c04, c04 + MUL a1, c08, c08 + MUL a1, c12, c12 + MUL a1, c16, c16 + + MUL a2, c04, t1 + MUL a2, c08, t2 + MUL a2, c12, t3 + MUL a2, c16, t4 + + SUB c03, t1, c03 + SUB c07, t2, c07 + SUB c11, t3, c11 + SUB c15, 
t4, c15 + + MUL a3, c04, t1 + MUL a3, c08, t2 + MUL a3, c12, t3 + MUL a3, c16, t4 + + SUB c02, t1, c02 + SUB c06, t2, c06 + SUB c10, t3, c10 + SUB c14, t4, c14 + + MUL a4, c04, t1 + MUL a4, c08, t2 + MUL a4, c12, t3 + MUL a4, c16, t4 + + SUB c01, t1, c01 + SUB c05, t2, c05 + SUB c09, t3, c09 + SUB c13, t4, c13 + + LD b1, 10 * SIZE(AO) + LD b2, 9 * SIZE(AO) + LD b3, 8 * SIZE(AO) + + MUL b1, c03, c03 + MUL b1, c07, c07 + MUL b1, c11, c11 + MUL b1, c15, c15 + + MUL b2, c03, t1 + MUL b2, c07, t2 + MUL b2, c11, t3 + MUL b2, c15, t4 + + SUB c02, t1, c02 + SUB c06, t2, c06 + SUB c10, t3, c10 + SUB c14, t4, c14 + + MUL b3, c03, t1 + MUL b3, c07, t2 + MUL b3, c11, t3 + MUL b3, c15, t4 + + SUB c01, t1, c01 + SUB c05, t2, c05 + SUB c09, t3, c09 + SUB c13, t4, c13 + + LD a1, 5 * SIZE(AO) + LD a2, 4 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, c02 + MUL a1, c06, c06 + MUL a1, c10, c10 + MUL a1, c14, c14 + + MUL a2, c02, t1 + MUL a2, c06, t2 + MUL a2, c10, t3 + MUL a2, c14, t4 + + SUB c01, t1, c01 + SUB c05, t2, c05 + SUB c09, t3, c09 + SUB c13, t4, c13 + + MUL a3, c01, c01 + MUL a3, c05, c05 + MUL a3, c09, c09 + MUL a3, c13, c13 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a1, c01, c01 + MUL a1, c05, c05 + MUL a1, c09, c09 + MUL a1, c13, c13 + + MUL a2, c01, t1 + MUL a2, c05, t2 + MUL a2, c09, t3 + MUL a2, c13, t4 + + SUB c02, t1, c02 + SUB c06, t2, c06 + SUB c10, t3, c10 + SUB c14, t4, c14 + + MUL a3, c01, t1 + MUL a3, c05, t2 + MUL a3, c09, t3 + MUL a3, c13, t4 + + SUB c03, t1, c03 + SUB c07, t2, c07 + SUB c11, t3, c11 + SUB c15, t4, c15 + + MUL a4, c01, t1 + MUL a4, c05, t2 + MUL a4, c09, t3 + MUL a4, c13, t4 + + SUB c04, t1, c04 + SUB c08, t2, c08 + SUB c12, t3, c12 + SUB c16, t4, c16 + + LD b1, 5 * SIZE(AO) + LD b2, 6 * SIZE(AO) + LD b3, 7 * SIZE(AO) + + MUL b1, c02, c02 + MUL b1, c06, c06 + MUL b1, c10, c10 + MUL b1, c14, c14 + + MUL b2, c02, t1 + MUL b2, c06, t2 + MUL b2, c10, t3 + MUL b2, c14, t4 + + SUB c03, t1, c03 + SUB c07, t2, c07 + SUB c11, t3, c11 + SUB c15, t4, c15 + + MUL b3, c02, t1 + MUL b3, c06, t2 + MUL b3, c10, t3 + MUL b3, c14, t4 + + SUB c04, t1, c04 + SUB c08, t2, c08 + SUB c12, t3, c12 + SUB c16, t4, c16 + + LD a1, 10 * SIZE(AO) + LD a2, 11 * SIZE(AO) + LD a3, 15 * SIZE(AO) + + MUL a1, c03, c03 + MUL a1, c07, c07 + MUL a1, c11, c11 + MUL a1, c15, c15 + + MUL a2, c03, t1 + MUL a2, c07, t2 + MUL a2, c11, t3 + MUL a2, c15, t4 + + SUB c04, t1, c04 + SUB c08, t2, c08 + SUB c12, t3, c12 + SUB c16, t4, c16 + + MUL a3, c04, c04 + MUL a3, c08, c08 + MUL a3, c12, c12 + MUL a3, c16, c16 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 + + MUL a2, c01, t1 + MUL a2, c02, t2 + MUL a2, c03, t3 + MUL a2, c04, t4 + + SUB c05, t1, c05 + SUB c06, t2, c06 + SUB c07, t3, c07 + SUB c08, t4, c08 + + MUL a3, c01, t1 + MUL a3, c02, t2 + MUL a3, c03, t3 + MUL a3, c04, t4 + + SUB c09, t1, c09 + SUB c10, t2, c10 + SUB c11, t3, c11 + SUB c12, t4, c12 + + MUL a4, c01, t1 + MUL a4, c02, t2 + MUL a4, c03, t3 + MUL a4, c04, t4 + + SUB c13, t1, c13 + SUB c14, t2, c14 + SUB c15, t3, c15 + SUB c16, t4, c16 + + LD b1, 5 * SIZE(BO) + LD b2, 6 * SIZE(BO) + LD b3, 7 * SIZE(BO) + + MUL b1, c05, c05 + MUL b1, c06, c06 + MUL b1, c07, c07 + MUL b1, c08, c08 + + MUL b2, c05, t1 + MUL b2, c06, t2 + MUL b2, c07, t3 + MUL b2, c08, t4 + + SUB c09, t1, c09 + SUB c10, t2, c10 + SUB c11, t3, c11 + SUB c12, t4, c12 + 
+ MUL b3, c05, t1 + MUL b3, c06, t2 + MUL b3, c07, t3 + MUL b3, c08, t4 + + SUB c13, t1, c13 + SUB c14, t2, c14 + SUB c15, t3, c15 + SUB c16, t4, c16 + + LD a1, 10 * SIZE(BO) + LD a2, 11 * SIZE(BO) + LD a3, 15 * SIZE(BO) + + MUL a1, c09, c09 + MUL a1, c10, c10 + MUL a1, c11, c11 + MUL a1, c12, c12 + + MUL a2, c09, t1 + MUL a2, c10, t2 + MUL a2, c11, t3 + MUL a2, c12, t4 + + SUB c13, t1, c13 + SUB c14, t2, c14 + SUB c15, t3, c15 + SUB c16, t4, c16 + + MUL a3, c13, c13 + MUL a3, c14, c14 + MUL a3, c15, c15 + MUL a3, c16, c16 +#endif + +#ifdef RT + LD a1, 15 * SIZE(BO) + LD a2, 14 * SIZE(BO) + LD a3, 13 * SIZE(BO) + LD a4, 12 * SIZE(BO) + + MUL a1, c13, c13 + MUL a1, c14, c14 + MUL a1, c15, c15 + MUL a1, c16, c16 + + MUL a2, c13, t1 + MUL a2, c14, t2 + MUL a2, c15, t3 + MUL a2, c16, t4 + + SUB c09, t1, c09 + SUB c10, t2, c10 + SUB c11, t3, c11 + SUB c12, t4, c12 + + MUL a3, c13, t1 + MUL a3, c14, t2 + MUL a3, c15, t3 + MUL a3, c16, t4 + + SUB c05, t1, c05 + SUB c06, t2, c06 + SUB c07, t3, c07 + SUB c08, t4, c08 + + MUL a4, c13, t1 + MUL a4, c14, t2 + MUL a4, c15, t3 + MUL a4, c16, t4 + + SUB c01, t1, c01 + SUB c02, t2, c02 + SUB c03, t3, c03 + SUB c04, t4, c04 + + LD b1, 10 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 8 * SIZE(BO) + + MUL b1, c09, c09 + MUL b1, c10, c10 + MUL b1, c11, c11 + MUL b1, c12, c12 + + MUL b2, c09, t1 + MUL b2, c10, t2 + MUL b2, c11, t3 + MUL b2, c12, t4 + + SUB c05, t1, c05 + SUB c06, t2, c06 + SUB c07, t3, c07 + SUB c08, t4, c08 + + MUL b3, c09, t1 + MUL b3, c10, t2 + MUL b3, c11, t3 + MUL b3, c12, t4 + + SUB c01, t1, c01 + SUB c02, t2, c02 + SUB c03, t3, c03 + SUB c04, t4, c04 + + LD a1, 5 * SIZE(BO) + LD a2, 4 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, c05 + MUL a1, c06, c06 + MUL a1, c07, c07 + MUL a1, c08, c08 + + MUL a2, c05, t1 + MUL a2, c06, t2 + MUL a2, c07, t3 + MUL a2, c08, t4 + + SUB c01, t1, c01 + SUB c02, t2, c02 + SUB c03, t3, c03 + SUB c04, t4, c04 + + MUL a3, c01, c01 + MUL a3, c02, c02 + MUL a3, c03, c03 + MUL a3, c04, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c13, 3 * SIZE(BO) + + ST c02, 4 * SIZE(BO) + ST c06, 5 * SIZE(BO) + ST c10, 6 * SIZE(BO) + ST c14, 7 * SIZE(BO) + + ST c03, 8 * SIZE(BO) + ST c07, 9 * SIZE(BO) + ST c11, 10 * SIZE(BO) + ST c15, 11 * SIZE(BO) + + ST c04, 12 * SIZE(BO) + ST c08, 13 * SIZE(BO) + ST c12, 14 * SIZE(BO) + ST c16, 15 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) + + ST c05, 4 * SIZE(AO) + ST c06, 5 * SIZE(AO) + ST c07, 6 * SIZE(AO) + ST c08, 7 * SIZE(AO) + + ST c09, 8 * SIZE(AO) + ST c10, 9 * SIZE(AO) + ST c11, 10 * SIZE(AO) + ST c12, 11 * SIZE(AO) + + ST c13, 12 * SIZE(AO) + ST c14, 13 * SIZE(AO) + ST c15, 14 * SIZE(AO) + ST c16, 15 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -4 * SIZE(C1) + lda C2, -4 * SIZE(C2) + lda C3, -4 * SIZE(C3) + lda C4, -4 * SIZE(C4) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + + ST c05, 0 * SIZE(C2) + ST c06, 1 * SIZE(C2) + ST c07, 2 * SIZE(C2) + ST c08, 3 * SIZE(C2) + + ST c09, 0 * SIZE(C3) + ST c10, 1 * SIZE(C3) + ST c11, 2 * SIZE(C3) + ST c12, 3 * SIZE(C3) + + ST c13, 0 * SIZE(C4) + ST c14, 1 * SIZE(C4) + ST c15, 2 * SIZE(C4) + ST c16, 3 * SIZE(C4) + +#ifndef LN + lda C1, 4 * SIZE(C1) + lda C2, 4 * SIZE(C2) + lda C3, 4 * SIZE(C3) + lda C4, 4 * SIZE(C4) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 2 + BASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if 
defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, BASE_SHIFT + 2, TMP1 + addq AO, TMP1, AO + addq BO, TMP1, BO +#endif + +#ifdef LT + addq KK, 4, KK +#endif + +#ifdef LN + subq KK, 4, KK +#endif + + lda I, -1(I) + + bgt I, $L11 + .align 4 + +$L20: + and M, 2, I + ble I, $L30 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c10 + LD a4, 3 * SIZE(AO) + fclr c14 + + LD b1, 0 * SIZE(B) + lda L, -2(KK) + LD b2, 1 * SIZE(B) + lda AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(B) + fclr c01 + LD b4, 3 * SIZE(B) + fclr c05 + + lda BO, 4 * SIZE(B) + fclr c02 + fclr c06 + ble KK, $L28 + + ble L, $L25 + +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TMP1 + addq AORIG, TMP1, AO + sll KK, BASE_SHIFT + 2, TMP2 + addq B, TMP2, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c10 + LD a4, 3 * SIZE(AO) + fclr c14 + + LD b1, 0 * SIZE(BO) + lda L, -2(TMP1) + LD b2, 1 * SIZE(BO) + lda AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + fclr c01 + LD b4, 3 * SIZE(BO) + fclr c05 + + lda BO, 4 * SIZE(BO) + fclr c02 + fclr c06 + ble TMP1, $L28 + + ble L, $L25 +#endif + .align 4 + +$L22: + ADD c09, t1, c09 + unop + MUL a1, b1, t1 + unop + + ADD c10, t2, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD c13, t3, c13 + unop + MUL a1, b2, t3 + lda BO, 8 * SIZE(BO) + + ADD c14, t4, c14 + unop + MUL a2, b2, t4 + LD b2, -7 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b3, t1 + unop + + ADD c02, t2, c02 + unop + MUL a2, b3, t2 + LD b3, -6 * SIZE(BO) + + ADD c05, t3, c05 + unop + MUL a1, b4, t3 + LD a1, 2 * SIZE(AO) + + ADD c06, t4, c06 + MUL a2, b4, t4 + LD b5, -5 * SIZE(BO) + + ADD c09, t1, c09 + unop + MUL a3, b1, t1 + LD a2, 3 * SIZE(AO) + + ADD c10, t2, c10 + unop + MUL a4, b1, t2 + LD b1, -4 * SIZE(BO) + + ADD c13, t3, c13 + unop + MUL a3, b2, t3 + lda AO, 4 * SIZE(AO) + + ADD c14, t4, c14 + MUL a4, b2, t4 + LD b2, -3 * SIZE(BO) + + ADD c01, t1, c01 + lda L, -2(L) + MUL a3, b3, t1 + LD b4, -1 * SIZE(BO) + + ADD c02, t2, c02 + unop + MUL a4, b3, t2 + LD b3, -2 * SIZE(BO) + + ADD c05, t3, c05 + unop + MUL a3, b5, t3 + LD a3, 0 * SIZE(AO) + + ADD c06, t4, c06 + MUL a4, b5, t4 + LD a4, 1 * SIZE(AO) + bgt L, $L22 + .align 4 + +$L25: + ADD c09, t1, c09 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L27 +#else + blbs TMP1, $L27 +#endif + + ADD c10, t2, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD c13, t3, c13 + unop + MUL a1, b2, t3 + unop + + ADD c14, t4, c14 + unop + MUL a2, b2, t4 + LD b2, 1 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b3, t1 + lda AO, 2 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b3, t2 + LD b3, 2 * SIZE(BO) + + ADD c05, t3, c05 + unop + MUL a1, b4, t3 + LD a1, -2 * SIZE(AO) + + ADD c06, t4, c06 + unop + MUL a2, b4, t4 + LD a2, -1 * SIZE(AO) + + ADD c09, t1, c09 + LD b4, 3 * SIZE(BO) + MUL a1, b1, t1 + lda BO, 4 * SIZE(BO) + .align 4 + +$L27: + ADD c10, t2, c10 + MUL a2, b1, t2 + ADD c13, t3, c13 + MUL a1, b2, t3 + + ADD c14, t4, c14 + MUL a2, b2, t4 + ADD c01, t1, c01 + MUL a1, b3, t1 + + ADD c02, t2, c02 + MUL a2, b3, t2 + ADD c05, t3, c05 + MUL a1, b4, t3 + + ADD c06, t4, c06 + lda AO, 2 * SIZE(AO) + MUL a2, b4, t4 + lda BO, 4 * SIZE(BO) + + ADD c09, t1, c09 + ADD c10, t2, c10 + ADD c13, t3, c13 + ADD c14, t4, c14 + .align 4 + +$L28: +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 2, TMP1 +#else + subq KK, 4, TMP1 +#endif + sll TMP1, 
BASE_SHIFT + 1, TMP2 + addq AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addq B, TMP2, BO +#else + lda AO, -2 * SIZE(AO) + lda BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + LD b1, 4 * SIZE(BO) + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c05, c05 + SUB a3, c09, c09 + SUB a4, c13, c13 + + SUB b1, c02, c02 + SUB b2, c06, c06 + SUB b3, c10, c10 + SUB b4, c14, c14 + +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 4 * SIZE(AO) + LD b2, 5 * SIZE(AO) + LD b3, 6 * SIZE(AO) + LD b4, 7 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c05, c05 + SUB a4, c06, c06 + + SUB b1, c09, c09 + SUB b2, c10, c10 + SUB b3, c13, c13 + SUB b4, c14, c14 +#endif + +#ifdef LN + LD a1, 3 * SIZE(AO) + LD a2, 2 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, c02 + MUL a1, c06, c06 + MUL a1, c10, c10 + MUL a1, c14, c14 + + MUL a2, c02, t1 + MUL a2, c06, t2 + MUL a2, c10, t3 + MUL a2, c14, t4 + + SUB c01, t1, c01 + SUB c05, t2, c05 + SUB c09, t3, c09 + SUB c13, t4, c13 + + MUL a3, c01, c01 + MUL a3, c05, c05 + MUL a3, c09, c09 + MUL a3, c13, c13 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 3 * SIZE(AO) + + MUL a1, c01, c01 + MUL a1, c05, c05 + MUL a1, c09, c09 + MUL a1, c13, c13 + + MUL a2, c01, t1 + MUL a2, c05, t2 + MUL a2, c09, t3 + MUL a2, c13, t4 + + SUB c02, t1, c02 + SUB c06, t2, c06 + SUB c10, t3, c10 + SUB c14, t4, c14 + + MUL a3, c02, c02 + MUL a3, c06, c06 + MUL a3, c10, c10 + MUL a3, c14, c14 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a1, c01, c01 + MUL a1, c02, c02 + + MUL a2, c01, t1 + MUL a2, c02, t2 + + SUB c05, t1, c05 + SUB c06, t2, c06 + + MUL a3, c01, t1 + MUL a3, c02, t2 + + SUB c09, t1, c09 + SUB c10, t2, c10 + + MUL a4, c01, t1 + MUL a4, c02, t2 + + SUB c13, t1, c13 + SUB c14, t2, c14 + + LD b1, 5 * SIZE(BO) + LD b2, 6 * SIZE(BO) + LD b3, 7 * SIZE(BO) + + MUL b1, c05, c05 + MUL b1, c06, c06 + + MUL b2, c05, t1 + MUL b2, c06, t2 + + SUB c09, t1, c09 + SUB c10, t2, c10 + + MUL b3, c05, t1 + MUL b3, c06, t2 + + SUB c13, t1, c13 + SUB c14, t2, c14 + + LD a1, 10 * SIZE(BO) + LD a2, 11 * SIZE(BO) + LD a3, 15 * SIZE(BO) + + MUL a1, c09, c09 + MUL a1, c10, c10 + + MUL a2, c09, t1 + MUL a2, c10, t2 + + SUB c13, t1, c13 + SUB c14, t2, c14 + + MUL a3, c13, c13 + MUL a3, c14, c14 +#endif + +#ifdef RT + LD a1, 15 * SIZE(BO) + LD a2, 14 * SIZE(BO) + LD a3, 13 * SIZE(BO) + LD a4, 12 * SIZE(BO) + + MUL a1, c13, c13 + MUL a1, c14, c14 + + MUL a2, c13, t1 + MUL a2, c14, t2 + + SUB c09, t1, c09 + SUB c10, t2, c10 + + MUL a3, c13, t1 + MUL a3, c14, t2 + + SUB c05, t1, c05 + SUB c06, t2, c06 + + MUL a4, c13, t1 + MUL a4, c14, t2 + + SUB c01, t1, c01 + SUB c02, t2, c02 + + LD b1, 10 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 8 * SIZE(BO) + + MUL b1, c09, c09 + MUL b1, c10, c10 + + MUL b2, c09, t1 + MUL b2, c10, t2 + + SUB c05, t1, c05 + SUB c06, t2, c06 + + MUL b3, c09, t1 + MUL b3, c10, t2 + + SUB c01, t1, c01 + SUB c02, t2, c02 + + LD a1, 5 * SIZE(BO) + LD a2, 4 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, c05 + MUL a1, c06, c06 + + MUL a2, c05, t1 + MUL a2, c06, t2 + + SUB c01, t1, c01 + SUB c02, t2, c02 + + MUL a3, c01, c01 + MUL a3, c02, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c13, 3 * SIZE(BO) + + ST 
c02, 4 * SIZE(BO) + ST c06, 5 * SIZE(BO) + ST c10, 6 * SIZE(BO) + ST c14, 7 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c05, 2 * SIZE(AO) + ST c06, 3 * SIZE(AO) + + ST c09, 4 * SIZE(AO) + ST c10, 5 * SIZE(AO) + ST c13, 6 * SIZE(AO) + ST c14, 7 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -2 * SIZE(C1) + lda C2, -2 * SIZE(C2) + lda C3, -2 * SIZE(C3) + lda C4, -2 * SIZE(C4) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c05, 0 * SIZE(C2) + ST c06, 1 * SIZE(C2) + + ST c09, 0 * SIZE(C3) + ST c10, 1 * SIZE(C3) + ST c13, 0 * SIZE(C4) + ST c14, 1 * SIZE(C4) + +#ifndef LN + lda C1, 2 * SIZE(C1) + lda C2, 2 * SIZE(C2) + lda C3, 2 * SIZE(C3) + lda C4, 2 * SIZE(C4) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 1 + BASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, BASE_SHIFT + 1, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 2, KK +#endif + +#ifdef LN + subq KK, 2, KK +#endif + .align 4 + +$L30: + and M, 1, I + ble I, $L39 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(B) + lda L, -2(KK) + LD b2, 1 * SIZE(B) + lda AO, 1 * SIZE(AO) + + LD b3, 2 * SIZE(B) + fclr c09 + LD b4, 3 * SIZE(B) + fclr c13 + + lda BO, 4 * SIZE(B) + ble KK, $L38 + + ble L, $L35 +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TMP1 + addq AORIG, TMP1, AO + sll KK, BASE_SHIFT + 2, TMP2 + addq B, TMP2, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(BO) + lda L, -2(TMP1) + LD b2, 1 * SIZE(BO) + lda AO, 1 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + fclr c09 + LD b4, 3 * SIZE(BO) + fclr c13 + + lda BO, 4 * SIZE(BO) + ble TMP1, $L38 + + ble L, $L35 +#endif + .align 4 + +$L32: + ADD c01, t1, c01 + lda L, -2(L) + MUL a1, b1, t1 + LD b1, 0 * SIZE(BO) + + ADD c05, t2, c05 + lda AO, 2 * SIZE(AO) + MUL a1, b2, t2 + LD b2, 1 * SIZE(BO) + + ADD c09, t3, c09 + LD b5, 3 * SIZE(BO) + MUL a1, b3, t3 + LD b3, 2 * SIZE(BO) + + ADD c13, t4, c13 + MUL a1, b4, t4 + LD a1, -1 * SIZE(AO) + + ADD c01, t1, c01 + MUL a2, b1, t1 + LD b1, 4 * SIZE(BO) + lda BO, 8 * SIZE(BO) + + ADD c05, t2, c05 + MUL a2, b2, t2 + LD b2, -3 * SIZE(BO) + + ADD c09, t3, c09 + LD b4, -1 * SIZE(BO) + MUL a2, b3, t3 + LD b3, -2 * SIZE(BO) + + ADD c13, t4, c13 + MUL a2, b5, t4 + LD a2, 0 * SIZE(AO) + bgt L, $L32 + .align 4 + +$L35: + ADD c01, t1, c01 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L37 +#else + blbs TMP1, $L37 +#endif + .align 4 + + ADD c05, t2, c05 + LD b1, 0 * SIZE(BO) + MUL a1, b2, t2 + LD b2, 1 * SIZE(BO) + + ADD c09, t3, c09 + MUL a1, b3, t3 + LD b3, 2 * SIZE(BO) + + ADD c13, t4, c13 + MUL a1, b4, t4 + LD a1, 0 * SIZE(AO) + lda AO, 1 * SIZE(AO) + + ADD c01, t1, c01 + LD b4, 3 * SIZE(BO) + MUL a1, b1, t1 + lda BO, 4 * SIZE(BO) + .align 4 + +$L37: + ADD c05, t2, c05 + MUL a1, b2, t2 + ADD c09, t3, c09 + MUL a1, b3, t3 + + ADD c13, t4, c13 + lda AO, 1 * SIZE(AO) + MUL a1, b4, t4 + lda BO, 4 * SIZE(BO) + + ADD c01, t1, c01 + ADD c05, t2, c05 + ADD c09, t3, c09 + ADD c13, t4, c13 + +$L38: +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 1, TMP1 +#else + subq KK, 4, TMP1 +#endif + sll TMP1, BASE_SHIFT + 0, TMP2 + addq AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addq B, TMP2, BO +#else + lda AO, -1 * SIZE(AO) + lda BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || 
defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c05, c05 + SUB a3, c09, c09 + SUB a4, c13, c13 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c05, c05 + SUB a3, c09, c09 + SUB a4, c13, c13 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + + MUL a1, c01, c01 + MUL a1, c05, c05 + MUL a1, c09, c09 + MUL a1, c13, c13 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a1, c01, c01 + MUL a2, c01, t1 + SUB c05, t1, c05 + MUL a3, c01, t1 + SUB c09, t1, c09 + MUL a4, c01, t1 + SUB c13, t1, c13 + + LD b1, 5 * SIZE(BO) + LD b2, 6 * SIZE(BO) + LD b3, 7 * SIZE(BO) + + MUL b1, c05, c05 + MUL b2, c05, t1 + SUB c09, t1, c09 + MUL b3, c05, t1 + SUB c13, t1, c13 + + LD a1, 10 * SIZE(BO) + LD a2, 11 * SIZE(BO) + LD a3, 15 * SIZE(BO) + + MUL a1, c09, c09 + MUL a2, c09, t1 + SUB c13, t1, c13 + MUL a3, c13, c13 +#endif + +#ifdef RT + LD a1, 15 * SIZE(BO) + LD a2, 14 * SIZE(BO) + LD a3, 13 * SIZE(BO) + LD a4, 12 * SIZE(BO) + + MUL a1, c13, c13 + MUL a2, c13, t1 + SUB c09, t1, c09 + MUL a3, c13, t1 + SUB c05, t1, c05 + MUL a4, c13, t1 + SUB c01, t1, c01 + + LD b1, 10 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 8 * SIZE(BO) + + MUL b1, c09, c09 + MUL b2, c09, t1 + SUB c05, t1, c05 + MUL b3, c09, t1 + SUB c01, t1, c01 + + LD a1, 5 * SIZE(BO) + LD a2, 4 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, c05 + MUL a2, c05, t1 + SUB c01, t1, c01 + MUL a3, c01, c01 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c13, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c05, 1 * SIZE(AO) + ST c09, 2 * SIZE(AO) + ST c13, 3 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -1 * SIZE(C1) + lda C2, -1 * SIZE(C2) + lda C3, -1 * SIZE(C3) + lda C4, -1 * SIZE(C4) +#endif + + ST c01, 0 * SIZE(C1) + ST c05, 0 * SIZE(C2) + ST c09, 0 * SIZE(C3) + ST c13, 0 * SIZE(C4) + +#ifdef RT + sll K, 0 + BASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, BASE_SHIFT + 0, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 1, KK +#endif + +#ifdef LN + subq KK, 1, KK +#endif + .align 4 + +$L39: +#ifdef LN + sll K, 2 + BASE_SHIFT, TMP1 + addq B, TMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addq KK, 4, KK +#endif + +#ifdef RT + subq KK, 4, KK +#endif + lda J, -1(J) + bgt J, $L01 + .align 4 + +$L40: + and N, 2, J + ble J, $L80 + +#ifdef RT + sll K, 1 + BASE_SHIFT, TMP1 + subq B, TMP1, B + + addq LDC, LDC, TMP1 + subq C, TMP1, C +#endif + + mov C, C1 + addq C, LDC, C2 + fclr t1 +#ifndef RT + addq C2, LDC, C +#endif + fclr t2 + +#ifdef LN + addq M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + sra M, 2, I + fclr t3 + fclr t4 + ble I, $L60 + .align 4 + +$L51: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c03 + LD a2, 1 * SIZE(AO) + fclr c07 + LD a3, 2 * SIZE(AO) + fclr c04 + LD a4, 3 * SIZE(AO) + fclr c08 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c05 + LD b3, 2 * SIZE(B) + fclr c02 + LD b4, 3 * SIZE(B) + fclr c06 + + lda L, -2(KK) + + lda BO, 2 * SIZE(B) + lda AO, 4 * SIZE(AO) + + ble KK, $L58 + + ble L, $L55 +#else +#ifdef LN + sll K, BASE_SHIFT + 2, TMP1 + subq AORIG, TMP1, 
AORIG +#endif + + sll KK, BASE_SHIFT + 2, TMP1 + addq AORIG, TMP1, AO + sll KK, BASE_SHIFT + 1, TMP1 + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c03 + LD a2, 1 * SIZE(AO) + fclr c07 + LD a3, 2 * SIZE(AO) + fclr c04 + LD a4, 3 * SIZE(AO) + fclr c08 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c05 + LD b3, 2 * SIZE(BO) + fclr c02 + LD b4, 3 * SIZE(BO) + fclr c06 + + lda L, -2(TMP1) + lda BO, 2 * SIZE(BO) + lda AO, 4 * SIZE(AO) + + ble TMP1, $L58 + + ble L, $L55 +#endif + .align 4 + +$L52: + ADD c05, t1, c05 + unop + MUL a1, b1, t1 + unop + + ADD c06, t2, c06 + lda L, -2(L) + MUL a2, b1, t2 + unop + + ADD c07, t3, c07 + unop + MUL a3, b1, t3 + unop + + ADD c08, t4, c08 + unop + MUL a4, b1, t4 + LD b1, 2 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD c02, t2, c02 + lda BO, 4 * SIZE(BO) + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, c04 + unop + MUL a4, b2, t4 + LD a5, 3 * SIZE(AO) + + ADD c05, t1, c05 + unop + MUL a1, b3, t1 + LD b2, -1 * SIZE(BO) + + ADD c06, t2, c06 + unop + MUL a2, b3, t2 + unop + + ADD c07, t3, c07 + unop + MUL a3, b3, t3 + lda AO, 8 * SIZE(AO) + + ADD c08, t4, c08 + unop + MUL a5, b3, t4 + LD b3, 0 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b4, t1 + LD a1, -4 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b4, t2 + LD a2, -3 * SIZE(AO) + + ADD c03, t3, c03 + LD a4, -1 * SIZE(AO) + MUL a3, b4, t3 + LD a3, -2 * SIZE(AO) + + ADD c04, t4, c04 + MUL a5, b4, t4 + LD b4, 1 * SIZE(BO) + bgt L, $L52 + .align 4 + +$L55: + ADD c05, t1, c05 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L57 +#else + blbs TMP1, $L57 +#endif + .align 4 + + ADD c06, t2, c06 + MUL a2, b1, t2 + ADD c07, t3, c07 + MUL a3, b1, t3 + + ADD c08, t4, c08 + unop + MUL a4, b1, t4 + LD b1, 0 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b2, t4 + LD a4, 3 * SIZE(AO) + lda AO, 4 * SIZE(AO) + + ADD c05, t1, c05 + LD b2, 1 * SIZE(BO) + MUL a1, b1, t1 + lda BO, 2 * SIZE(BO) + .align 4 + +$L57: + ADD c06, t2, c06 + MUL a2, b1, t2 + ADD c07, t3, c07 + MUL a3, b1, t3 + + ADD c08, t4, c08 + MUL a4, b1, t4 + ADD c01, t1, c01 + MUL a1, b2, t1 + + ADD c02, t2, c02 + MUL a2, b2, t2 + ADD c03, t3, c03 + MUL a3, b2, t3 + + ADD c04, t4, c04 + lda AO, 4 * SIZE(AO) + MUL a4, b2, t4 + lda BO, 2 * SIZE(BO) + + ADD c05, t1, c05 + ADD c06, t2, c06 + ADD c07, t3, c07 + ADD c08, t4, c08 + .align 4 + +$L58: +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 4, TMP1 +#else + subq KK, 2, TMP1 +#endif + sll TMP1, BASE_SHIFT + 2, TMP2 + addq AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addq B, TMP2, BO +#else + lda AO, -4 * SIZE(AO) + lda BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + LD b1, 4 * SIZE(BO) + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c05, c05 + SUB a3, c02, c02 + SUB a4, c06, c06 + + SUB b1, c03, c03 + SUB b2, c07, c07 + SUB b3, c04, c04 + SUB b4, c08, c08 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 4 * SIZE(AO) + LD b2, 5 * SIZE(AO) + LD b3, 6 * SIZE(AO) + LD b4, 7 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, 
c03 + SUB a4, c04, c04 + + SUB b1, c05, c05 + SUB b2, c06, c06 + SUB b3, c07, c07 + SUB b4, c08, c08 +#endif + +#ifdef LN + LD a1, 15 * SIZE(AO) + LD a2, 14 * SIZE(AO) + LD a3, 13 * SIZE(AO) + LD a4, 12 * SIZE(AO) + + MUL a1, c04, c04 + MUL a1, c08, c08 + + MUL a2, c04, t1 + MUL a2, c08, t2 + + SUB c03, t1, c03 + SUB c07, t2, c07 + + MUL a3, c04, t1 + MUL a3, c08, t2 + + SUB c02, t1, c02 + SUB c06, t2, c06 + + MUL a4, c04, t1 + MUL a4, c08, t2 + + SUB c01, t1, c01 + SUB c05, t2, c05 + + LD b1, 10 * SIZE(AO) + LD b2, 9 * SIZE(AO) + LD b3, 8 * SIZE(AO) + + MUL b1, c03, c03 + MUL b1, c07, c07 + + MUL b2, c03, t1 + MUL b2, c07, t2 + + SUB c02, t1, c02 + SUB c06, t2, c06 + + MUL b3, c03, t1 + MUL b3, c07, t2 + + SUB c01, t1, c01 + SUB c05, t2, c05 + + LD a1, 5 * SIZE(AO) + LD a2, 4 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, c02 + MUL a1, c06, c06 + + MUL a2, c02, t1 + MUL a2, c06, t2 + + SUB c01, t1, c01 + SUB c05, t2, c05 + + MUL a3, c01, c01 + MUL a3, c05, c05 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a1, c01, c01 + MUL a1, c05, c05 + + MUL a2, c01, t1 + MUL a2, c05, t2 + + SUB c02, t1, c02 + SUB c06, t2, c06 + + MUL a3, c01, t1 + MUL a3, c05, t2 + + SUB c03, t1, c03 + SUB c07, t2, c07 + + MUL a4, c01, t1 + MUL a4, c05, t2 + + SUB c04, t1, c04 + SUB c08, t2, c08 + + LD b1, 5 * SIZE(AO) + LD b2, 6 * SIZE(AO) + LD b3, 7 * SIZE(AO) + + MUL b1, c02, c02 + MUL b1, c06, c06 + + MUL b2, c02, t1 + MUL b2, c06, t2 + + SUB c03, t1, c03 + SUB c07, t2, c07 + + MUL b3, c02, t1 + MUL b3, c06, t2 + + SUB c04, t1, c04 + SUB c08, t2, c08 + + LD a1, 10 * SIZE(AO) + LD a2, 11 * SIZE(AO) + LD a3, 15 * SIZE(AO) + + MUL a1, c03, c03 + MUL a1, c07, c07 + + MUL a2, c03, t1 + MUL a2, c07, t2 + + SUB c04, t1, c04 + SUB c08, t2, c08 + + MUL a3, c04, c04 + MUL a3, c08, c08 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 3 * SIZE(BO) + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 + + MUL a2, c01, t1 + MUL a2, c02, t2 + MUL a2, c03, t3 + MUL a2, c04, t4 + + SUB c05, t1, c05 + SUB c06, t2, c06 + SUB c07, t3, c07 + SUB c08, t4, c08 + + MUL a3, c05, c05 + MUL a3, c06, c06 + MUL a3, c07, c07 + MUL a3, c08, c08 +#endif + +#ifdef RT + LD a1, 3 * SIZE(BO) + LD a2, 2 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, c05 + MUL a1, c06, c06 + MUL a1, c07, c07 + MUL a1, c08, c08 + + MUL a2, c05, t1 + MUL a2, c06, t2 + MUL a2, c07, t3 + MUL a2, c08, t4 + + SUB c01, t1, c01 + SUB c02, t2, c02 + SUB c03, t3, c03 + SUB c04, t4, c04 + + MUL a3, c01, c01 + MUL a3, c02, c02 + MUL a3, c03, c03 + MUL a3, c04, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c02, 2 * SIZE(BO) + ST c06, 3 * SIZE(BO) + + ST c03, 4 * SIZE(BO) + ST c07, 5 * SIZE(BO) + ST c04, 6 * SIZE(BO) + ST c08, 7 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) + + ST c05, 4 * SIZE(AO) + ST c06, 5 * SIZE(AO) + ST c07, 6 * SIZE(AO) + ST c08, 7 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -4 * SIZE(C1) + lda C2, -4 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + + ST c05, 0 * SIZE(C2) + ST c06, 1 * SIZE(C2) + ST c07, 2 * SIZE(C2) + ST c08, 3 * SIZE(C2) + +#ifndef LN + lda C1, 4 * SIZE(C1) + lda C2, 4 * SIZE(C2) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 2 + BASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + 
subq K, KK, TMP1 + sll TMP1, BASE_SHIFT + 2, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 4, KK +#endif + +#ifdef LN + subq KK, 4, KK +#endif + + lda I, -1(I) + + bgt I, $L51 + .align 4 + +$L60: + and M, 2, I + ble I, $L70 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(B) + lda L, -2(KK) + LD b2, 1 * SIZE(B) + lda AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + lda BO, 2 * SIZE(B) + + ble KK, $L68 + + ble L, $L65 +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TMP1 + addq AORIG, TMP1, AO + sll KK, BASE_SHIFT + 1, TMP1 + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(BO) + lda L, -2(TMP1) + LD b2, 1 * SIZE(BO) + lda AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + lda BO, 2 * SIZE(BO) + + ble TMP1, $L68 + + ble L, $L65 +#endif + .align 4 + +$L62: + ADD c01, t1, c01 + unop + MUL a1, b1, t1 + unop + + ADD c02, t2, c02 + lda AO, 4 * SIZE(AO) + MUL a2, b1, t2 + LD b1, 2 * SIZE(BO) + + ADD c05, t3, c05 + lda L, -2(L) + MUL a1, b2, t3 + LD a1, -2 * SIZE(AO) + + ADD c06, t4, c06 + unop + MUL a2, b2, t4 + LD a2, -1 * SIZE(AO) + + ADD c01, t1, c01 + LD b2, 3 * SIZE(BO) + MUL a3, b3, t1 + lda BO, 4 * SIZE(BO) + + ADD c02, t2, c02 + unop + MUL a4, b3, t2 + LD b3, 0 * SIZE(BO) + + ADD c05, t3, c05 + unop + MUL a3, b4, t3 + LD a3, 0 * SIZE(AO) + + ADD c06, t4, c06 + MUL a4, b4, t4 + LD b4, 1 * SIZE(BO) + unop + + LD a4, 1 * SIZE(AO) + unop + unop + bgt L, $L62 + .align 4 + +$L65: + ADD c01, t1, c01 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L67 +#else + blbs TMP1, $L67 +#endif + .align 4 + + ADD c02, t2, c02 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD c05, t3, c05 + lda BO, 2 * SIZE(BO) + MUL a1, b2, t3 + LD a1, 0 * SIZE(AO) + + ADD c06, t4, c06 + unop + MUL a2, b2, t4 + LD a2, 1 * SIZE(AO) + + ADD c01, t1, c01 + LD b2, -1 * SIZE(BO) + MUL a1, b1, t1 + lda AO, 2 * SIZE(AO) + .align 4 + +$L67: + ADD c02, t2, c02 + MUL a2, b1, t2 + ADD c05, t3, c05 + MUL a1, b2, t3 + + ADD c06, t4, c06 + lda AO, 2 * SIZE(AO) + MUL a2, b2, t4 + lda BO, 2 * SIZE(BO) + + ADD c01, t1, c01 + ADD c02, t2, c02 + ADD c05, t3, c05 + ADD c06, t4, c06 + .align 4 + +$L68: +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 2, TMP1 +#else + subq KK, 2, TMP1 +#endif + sll TMP1, BASE_SHIFT + 1, TMP2 + addq AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addq B, TMP2, BO +#else + lda AO, -2 * SIZE(AO) + lda BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c05, c05 + SUB a3, c02, c02 + SUB a4, c06, c06 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c05, c05 + SUB a4, c06, c06 +#endif + +#ifdef LN + LD a1, 3 * SIZE(AO) + LD a2, 2 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, c02 + MUL a1, c06, c06 + + MUL a2, c02, t1 + MUL a2, c06, t2 + + SUB c01, t1, c01 + SUB c05, t2, c05 + + MUL a3, c01, c01 + MUL a3, c05, c05 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 3 * SIZE(AO) + + MUL a1, c01, c01 + MUL a1, c05, c05 + + 
MUL a2, c01, t1 + MUL a2, c05, t2 + + SUB c02, t1, c02 + SUB c06, t2, c06 + + MUL a3, c02, c02 + MUL a3, c06, c06 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 3 * SIZE(BO) + + MUL a1, c01, c01 + MUL a1, c02, c02 + + MUL a2, c01, t1 + MUL a2, c02, t2 + + SUB c05, t1, c05 + SUB c06, t2, c06 + + MUL a3, c05, c05 + MUL a3, c06, c06 +#endif + +#ifdef RT + LD a1, 3 * SIZE(BO) + LD a2, 2 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, c05 + MUL a1, c06, c06 + + MUL a2, c05, t1 + MUL a2, c06, t2 + + SUB c01, t1, c01 + SUB c02, t2, c02 + + MUL a3, c01, c01 + MUL a3, c02, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c02, 2 * SIZE(BO) + ST c06, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c05, 2 * SIZE(AO) + ST c06, 3 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -2 * SIZE(C1) + lda C2, -2 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c05, 0 * SIZE(C2) + ST c06, 1 * SIZE(C2) + +#ifndef LN + lda C1, 2 * SIZE(C1) + lda C2, 2 * SIZE(C2) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 1 + BASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, BASE_SHIFT + 1, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 2, KK +#endif + +#ifdef LN + subq KK, 2, KK +#endif + .align 4 + +$L70: + and M, 1, I + ble I, $L79 + +#if defined(LT) || defined(RN) + + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(B) + fclr c02 + LD b2, 1 * SIZE(B) + fclr c06 + + lda L, -2(KK) + + LD b3, 2 * SIZE(B) + lda AO, 1 * SIZE(AO) + LD b4, 3 * SIZE(B) + lda BO, 2 * SIZE(B) + + ble KK, $L78 + + ble L, $L75 +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TMP1 + addq AORIG, TMP1, AO + sll KK, BASE_SHIFT + 1, TMP1 + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(BO) + fclr c02 + LD b2, 1 * SIZE(BO) + fclr c06 + + lda L, -2(TMP1) + + LD b3, 2 * SIZE(BO) + lda AO, 1 * SIZE(AO) + LD b4, 3 * SIZE(BO) + lda BO, 2 * SIZE(BO) + + ble TMP1, $L78 + + ble L, $L75 +#endif + .align 4 + +$L72: + ADD c01, t1, c01 + lda L, -2(L) + MUL a1, b1, t1 + LD b1, 2 * SIZE(BO) + + ADD c05, t2, c05 + MUL a1, b2, t2 + LD a1, 1 * SIZE(AO) + LD b2, 3 * SIZE(BO) + + ADD c02, t3, c02 + lda AO, 2 * SIZE(AO) + MUL a2, b3, t3 + LD b3, 4 * SIZE(BO) + + ADD c06, t4, c06 + MUL a2, b4, t4 + LD a2, 0 * SIZE(AO) + LD b4, 5 * SIZE(BO) + + lda BO, 4 * SIZE(BO) + unop + unop + bgt L, $L72 + .align 4 + +$L75: + ADD c01, t1, c01 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L77 +#else + blbs TMP1, $L77 +#endif + .align 4 + + ADD c05, t2, c05 + MUL a1, b2, t2 + LD a1, 0 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + ADD c01, t1, c01 + LD b2, 1 * SIZE(BO) + lda AO, 1 * SIZE(AO) + MUL a1, b1, t1 + lda BO, 2 * SIZE(BO) + .align 4 + +$L77: + ADD c05, t2, c05 + MUL a1, b2, t2 + ADD c02, t3, c02 + ADD c06, t4, c06 + + ADD c01, c02, c01 + lda AO, 1 * SIZE(AO) + ADD c05, c06, c05 + lda BO, 2 * SIZE(BO) + + ADD c01, t1, c01 + ADD c05, t2, c05 + + .align 4 + +$L78: +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 1, TMP1 +#else + subq KK, 2, TMP1 +#endif + sll TMP1, BASE_SHIFT + 0, TMP2 + addq AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addq B, TMP2, BO +#else + lda AO, -1 * SIZE(AO) + lda BO, -2 * SIZE(BO) +#endif + +#if defined(LN) 
|| defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c05, c05 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c05, c05 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + + MUL a1, c01, c01 + MUL a1, c05, c05 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 3 * SIZE(BO) + + MUL a1, c01, c01 + MUL a2, c01, t1 + SUB c05, t1, c05 + MUL a3, c05, c05 +#endif + +#ifdef RT + LD a1, 3 * SIZE(BO) + LD a2, 2 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, c05 + MUL a2, c05, t1 + SUB c01, t1, c01 + MUL a3, c01, c01 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c05, 1 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -1 * SIZE(C1) + lda C2, -1 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c05, 0 * SIZE(C2) + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 0 + BASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, BASE_SHIFT + 0, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 1, KK +#endif + +#ifdef LN + subq KK, 1, KK +#endif + .align 4 + +$L79: +#ifdef LN + sll K, 1 + BASE_SHIFT, TMP1 + addq B, TMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addq KK, 2, KK +#endif + +#ifdef RT + subq KK, 2, KK +#endif + .align 4 + +$L80: + and N, 1, J + ble J, $L999 + +#ifdef RT + sll K, BASE_SHIFT, TMP1 + subq B, TMP1, B + + subq C, LDC, C +#endif + + mov C, C1 +#ifndef RT + addq C, LDC, C +#endif + +#ifdef LN + addq M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + sra M, 2, I + ble I, $L100 + .align 4 + +$L91: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c04 + + sra KK, 2, L + mov B, BO + ble L, $L95 + +#else +#ifdef LN + sll K, BASE_SHIFT + 2, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 2, TMP1 + addq AORIG, TMP1, AO + sll KK, BASE_SHIFT + 0, TMP1 + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c04 + + sra TMP1, 2, L + unop + ble L, $L95 +#endif + .align 5 + +$L92: + ADD c01, t1, c01 + unop + MUL a1, b1, t1 + LD a1, 4 * SIZE(AO) + + ADD c02, t2, c02 + lda L, -1(L) + MUL a2, b1, t2 + LD a2, 5 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b1, t3 + LD a3, 6 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b1, t4 + LD a4, 7 * SIZE(AO) + LD b1, 4 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 8 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b2, t2 + LD a2, 9 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 10 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b2, t4 + LD a4, 11 * SIZE(AO) + LD b2, 5 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b3, t1 + LD a1, 12 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b3, t2 + LD a2, 13 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b3, t3 + LD a3, 14 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b3, t4 + 
LD a5, 15 * SIZE(AO) + LD b3, 6 * SIZE(BO) + + ADD c01, t1, c01 + MUL a1, b4, t1 + LD a1, 16 * SIZE(AO) + lda AO, 16 * SIZE(AO) + + ADD c02, t2, c02 + lda BO, 4 * SIZE(BO) + MUL a2, b4, t2 + LD a2, 1 * SIZE(AO) + + ADD c03, t3, c03 + LD a4, 3 * SIZE(AO) + MUL a3, b4, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, c04 + MUL a5, b4, t4 + LD b4, 3 * SIZE(BO) + bgt L, $L92 + .align 4 + +$L95: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TMP1, 3, L +#endif + unop + ble L, $L98 + .align 4 + +$L96: + ADD c01, t1, c01 + lda L, -1(L) + MUL a1, b1, t1 + LD a1, 4 * SIZE(AO) + + ADD c02, t2, c02 + lda BO, 1 * SIZE(BO) + MUL a2, b1, t2 + LD a2, 5 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b1, t3 + LD a3, 6 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b1, t4 + LD a4, 7 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + lda AO, 4 * SIZE(AO) + bgt L, $L96 + .align 4 + +$L98: + ADD c01, t1, c01 + ADD c02, t2, c02 + ADD c03, t3, c03 + ADD c04, t4, c04 + +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 4, TMP1 +#else + subq KK, 1, TMP1 +#endif + sll TMP1, BASE_SHIFT + 2, TMP2 + addq AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addq B, TMP2, BO +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 +#endif + +#ifdef LN + LD a1, 15 * SIZE(AO) + LD a2, 14 * SIZE(AO) + LD a3, 13 * SIZE(AO) + LD a4, 12 * SIZE(AO) + + MUL a1, c04, c04 + MUL a2, c04, t1 + SUB c03, t1, c03 + MUL a3, c04, t1 + SUB c02, t1, c02 + MUL a4, c04, t1 + SUB c01, t1, c01 + + LD b1, 10 * SIZE(AO) + LD b2, 9 * SIZE(AO) + LD b3, 8 * SIZE(AO) + + MUL b1, c03, c03 + MUL b2, c03, t1 + SUB c02, t1, c02 + MUL b3, c03, t1 + SUB c01, t1, c01 + + LD a1, 5 * SIZE(AO) + LD a2, 4 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, c02 + MUL a2, c02, t1 + SUB c01, t1, c01 + MUL a3, c01, c01 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a1, c01, c01 + MUL a2, c01, t1 + SUB c02, t1, c02 + MUL a3, c01, t1 + SUB c03, t1, c03 + MUL a4, c01, t1 + SUB c04, t1, c04 + + LD b1, 5 * SIZE(AO) + LD b2, 6 * SIZE(AO) + LD b3, 7 * SIZE(AO) + + MUL b1, c02, c02 + MUL b2, c02, t1 + SUB c03, t1, c03 + MUL b3, c02, t1 + SUB c04, t1, c04 + + LD a1, 10 * SIZE(AO) + LD a2, 11 * SIZE(AO) + LD a3, 15 * SIZE(AO) + + MUL a1, c03, c03 + MUL a2, c03, t1 + SUB c04, t1, c04 + MUL a3, c04, c04 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) + ST c03, 2 * SIZE(BO) + ST c04, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -4 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + +#ifndef LN + lda C1, 4 * SIZE(C1) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 2 + BASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, BASE_SHIFT + 2, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 4, KK +#endif + +#ifdef LN + subq KK, 
4, KK +#endif + + lda I, -1(I) + bgt I, $L91 + .align 4 + +$L100: + and M, 2, I + ble I, $L110 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c04 + + sra KK, 2, L + mov B, BO + ble L, $L105 +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TMP1 + addq AORIG, TMP1, AO + sll KK, BASE_SHIFT + 0, TMP1 + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c04 + + sra TMP1, 2, L + ble L, $L105 +#endif + .align 5 + +$L102: + ADD c01, t1, c01 + lda L, -1(L) + MUL a1, b1, t1 + LD a1, 4 * SIZE(AO) + + ADD c02, t2, c02 + MUL a2, b1, t2 + LD a2, 5 * SIZE(AO) + LD b1, 4 * SIZE(BO) + + ADD c03, t3, c03 + lda BO, 4 * SIZE(BO) + MUL a3, b2, t3 + LD a3, 6 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b2, t4 + LD a5, 7 * SIZE(AO) + LD b2, 1 * SIZE(BO) + + ADD c01, t1, c01 + MUL a1, b3, t1 + LD a1, 8 * SIZE(AO) + lda AO, 8 * SIZE(AO) + + ADD c02, t2, c02 + MUL a2, b3, t2 + LD b3, 2 * SIZE(BO) + LD a2, 1 * SIZE(AO) + + ADD c03, t3, c03 + LD a4, 3 * SIZE(AO) + MUL a3, b4, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, c04 + MUL a5, b4, t4 + LD b4, 3 * SIZE(BO) + bgt L, $L102 + .align 4 + +$L105: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TMP1, 3, L +#endif + ble L, $L108 + .align 4 + +$L106: + ADD c01, t1, c01 + lda L, -1(L) + MUL a1, b1, t1 + LD a1, 2 * SIZE(AO) + + ADD c02, t2, c02 + MUL a2, b1, t2 + LD a2, 3 * SIZE(AO) + LD b1, 1 * SIZE(BO) + + lda AO, 2 * SIZE(AO) + unop + lda BO, 1 * SIZE(BO) + bgt L, $L106 + .align 4 + +$L108: + ADD c01, t1, c01 + ADD c02, t2, c02 + ADD c03, t3, c03 + ADD c04, t4, c04 + + ADD c01, c03, c01 + ADD c02, c04, c02 + +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 2, TMP1 +#else + subq KK, 1, TMP1 +#endif + sll TMP1, BASE_SHIFT + 1, TMP2 + addq AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addq B, TMP2, BO +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, c02 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 +#endif + +#ifdef LN + LD a1, 3 * SIZE(AO) + LD a2, 2 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, c02 + MUL a2, c02, t1 + SUB c01, t1, c01 + MUL a3, c01, c01 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 3 * SIZE(AO) + + MUL a1, c01, c01 + MUL a2, c01, t1 + SUB c02, t1, c02 + MUL a3, c02, c02 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + + MUL a1, c01, c01 + MUL a1, c02, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -2 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + +#ifndef LN + lda C1, 2 * SIZE(C1) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 1 + BASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, BASE_SHIFT + 1, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + 
addq KK, 2, KK +#endif + +#ifdef LN + subq KK, 2, KK +#endif + .align 4 + +$L110: + and M, 1, I + ble I, $L119 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c04 + + sra KK, 2, L + mov B, BO + unop + ble L, $L115 +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TMP1 + addq AORIG, TMP1, AO + sll KK, BASE_SHIFT + 0, TMP1 + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c04 + + sra TMP1, 2, L + unop + ble L, $L115 +#endif + .align 4 + +$L112: + ADD c01, t1, c01 + MUL a1, b1, t1 + LD a1, 4 * SIZE(AO) + LD b1, 4 * SIZE(BO) + + ADD c02, t2, c02 + MUL a2, b2, t2 + LD a2, 5 * SIZE(AO) + LD b2, 5 * SIZE(BO) + + ADD c03, t3, c03 + MUL a3, b3, t3 + LD a3, 6 * SIZE(AO) + LD b3, 6 * SIZE(BO) + + ADD c04, t4, c04 + MUL a4, b4, t4 + LD a4, 7 * SIZE(AO) + LD b4, 7 * SIZE(BO) + + lda L, -1(L) + lda AO, 4 * SIZE(AO) + lda BO, 4 * SIZE(BO) + bgt L, $L112 + .align 4 + +$L115: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TMP1, 3, L +#endif + ble L, $L118 + .align 4 + +$L116: + ADD c01, t1, c01 + MUL a1, b1, t1 + LD a1, 1 * SIZE(AO) + LD b1, 1 * SIZE(BO) + + lda L, -1(L) + lda AO, 1 * SIZE(AO) + lda BO, 1 * SIZE(BO) + bgt L, $L116 + .align 4 + +$L118: + ADD c01, t1, c01 + ADD c02, t2, c02 + ADD c03, t3, c03 + ADD c04, t4, c04 + + ADD c01, c02, c01 + ADD c03, c04, c03 + ADD c01, c03, c01 + +#if defined(LN) || defined(RT) + subq KK, 1, TMP1 + sll TMP1, BASE_SHIFT + 0, TMP2 + addq AORIG, TMP2, AO + addq B, TMP2, BO +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + + SUB a1, c01, c01 +#else + LD a1, 0 * SIZE(AO) + + SUB a1, c01, c01 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + + MUL a1, c01, c01 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + + MUL a1, c01, c01 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -1 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + +#ifndef LN + lda C1, 1 * SIZE(C1) +#endif + +#ifdef RT + SXADDQ K, AORIG, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, BASE_SHIFT + 0, TMP2 + addq AO, TMP2, AO + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 1, KK +#endif + +#ifdef LN + subq KK, 1, KK +#endif + .align 4 + +$L119: +#ifdef LN + SXADDQ K, B, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addq KK, 1, KK +#endif + +#ifdef RT + subq KK, 1, KK +#endif + .align 4 + +$L999: + ldt $f2, 0($sp) + ldt $f3, 8($sp) + ldt $f4, 16($sp) + ldt $f5, 24($sp) + ldt $f6, 32($sp) + ldt $f7, 40($sp) + ldt $f8, 48($sp) + ldt $f9, 56($sp) + clr $0 + lda $sp, STACKSIZE($sp) + ret + EPILOGUE diff --git a/kernel/alpha/trsm_kernel_4x4_RT.S b/kernel/alpha/trsm_kernel_4x4_RT.S new file mode 100644 index 0000000000..6d3d2e39a2 --- /dev/null +++ b/kernel/alpha/trsm_kernel_4x4_RT.S @@ -0,0 +1,4066 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#if !defined(EV4) && !defined(EV5) && !defined(EV6) +#error "Architecture is not specified." 
+#endif + +#ifdef EV6 +#define PREFETCHSIZE 56 +#define UNOP unop +#endif + +#ifdef EV5 +#define PREFETCHSIZE 56 +#define UNOP +#endif + +#ifdef EV4 +#define UNOP +#endif + +#define STACKSIZE 80 + +#define M $16 +#define N $17 +#define K $18 +#define A $20 +#define B $21 +#define C $22 +#define LDC $23 + +#define C1 $19 +#define C2 $24 +#define C3 $25 +#define C4 $27 + +#define AO $at +#define BO $5 +#define I $6 +#define J $7 +#define L $8 + +#define a1 $f16 +#define a2 $f17 +#define a3 $f18 +#define a4 $f19 + +#define b1 $f20 +#define b2 $f21 +#define b3 $f22 +#define b4 $f23 + +#define t1 $f24 +#define t2 $f25 +#define t3 $f26 +#define t4 $f27 + +#define a5 $f28 +#define a6 $f30 +#define b5 $f29 + +#define alpha $f30 + +#define c01 $f0 +#define c02 $f1 +#define c03 $f2 +#define c04 $f3 + +#define c05 $f4 +#define c06 $f5 +#define c07 $f6 +#define c08 $f7 + +#define c09 $f8 +#define c10 $f9 +#define c11 $f10 +#define c12 $f11 + +#define c13 $f12 +#define c14 $f13 +#define c15 $f14 +#define c16 $f15 + +#define TMP1 $0 +#define TMP2 $1 +#define KK $2 +#define AORIG $3 +#define OFFSET $4 + + PROLOGUE + PROFCODE + .frame $sp, STACKSIZE, $26, 0 + + lda $sp, -STACKSIZE($sp) + + ldq C, 0 + STACKSIZE($sp) + ldq LDC, 8 + STACKSIZE($sp) + ldq OFFSET, 16 + STACKSIZE($sp) + + SXADDQ LDC, 0, LDC + + stt $f2, 0($sp) + stt $f3, 8($sp) + stt $f4, 16($sp) + stt $f5, 24($sp) + stt $f6, 32($sp) + stt $f7, 40($sp) + stt $f8, 48($sp) + stt $f9, 56($sp) + + cmple M, 0, $0 + cmple N, 0, $1 + cmple K, 0, $2 + + or $0, $1, $0 + or $0, $2, $0 + bne $0, $L999 + +#ifdef LN + mulq M, K, TMP1 + SXADDQ TMP1, A, A + SXADDQ M, C, C +#endif + +#ifdef RN + negq OFFSET, KK +#endif + +#ifdef RT + mulq N, K, TMP1 + SXADDQ TMP1, B, B + + mulq N, LDC, TMP1 + addq TMP1, C, C + + subq N, OFFSET, KK +#endif + + and N, 1, J + ble J, $L40 + +#ifdef RT + sll K, BASE_SHIFT, TMP1 + subq B, TMP1, B + + subq C, LDC, C +#endif + + mov C, C1 +#ifndef RT + addq C, LDC, C +#endif + +#ifdef LN + addq M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + sra M, 2, I + ble I, $L100 + .align 4 + +$L91: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c04 + + sra KK, 2, L + mov B, BO + ble L, $L95 + +#else +#ifdef LN + sll K, BASE_SHIFT + 2, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 2, TMP1 + addq AORIG, TMP1, AO + sll KK, BASE_SHIFT + 0, TMP1 + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c04 + + sra TMP1, 2, L + unop + ble L, $L95 +#endif + .align 5 + +$L92: + ADD c01, t1, c01 + unop + MUL a1, b1, t1 + LD a1, 4 * SIZE(AO) + + ADD c02, t2, c02 + lda L, -1(L) + MUL a2, b1, t2 + LD a2, 5 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b1, t3 + LD a3, 6 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b1, t4 + LD a4, 7 * SIZE(AO) + LD b1, 4 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 8 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b2, t2 + LD a2, 9 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 10 * SIZE(AO) + + ADD c04, 
t4, c04 + MUL a4, b2, t4 + LD a4, 11 * SIZE(AO) + LD b2, 5 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b3, t1 + LD a1, 12 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b3, t2 + LD a2, 13 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b3, t3 + LD a3, 14 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b3, t4 + LD a5, 15 * SIZE(AO) + LD b3, 6 * SIZE(BO) + + ADD c01, t1, c01 + MUL a1, b4, t1 + LD a1, 16 * SIZE(AO) + lda AO, 16 * SIZE(AO) + + ADD c02, t2, c02 + lda BO, 4 * SIZE(BO) + MUL a2, b4, t2 + LD a2, 1 * SIZE(AO) + + ADD c03, t3, c03 + LD a4, 3 * SIZE(AO) + MUL a3, b4, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, c04 + MUL a5, b4, t4 + LD b4, 3 * SIZE(BO) + bgt L, $L92 + .align 4 + +$L95: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TMP1, 3, L +#endif + unop + ble L, $L98 + .align 4 + +$L96: + ADD c01, t1, c01 + lda L, -1(L) + MUL a1, b1, t1 + LD a1, 4 * SIZE(AO) + + ADD c02, t2, c02 + lda BO, 1 * SIZE(BO) + MUL a2, b1, t2 + LD a2, 5 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b1, t3 + LD a3, 6 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b1, t4 + LD a4, 7 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + lda AO, 4 * SIZE(AO) + bgt L, $L96 + .align 4 + +$L98: + ADD c01, t1, c01 + ADD c02, t2, c02 + ADD c03, t3, c03 + ADD c04, t4, c04 + +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 4, TMP1 +#else + subq KK, 1, TMP1 +#endif + sll TMP1, BASE_SHIFT + 2, TMP2 + addq AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addq B, TMP2, BO +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 +#endif + +#ifdef LN + LD a1, 15 * SIZE(AO) + LD a2, 14 * SIZE(AO) + LD a3, 13 * SIZE(AO) + LD a4, 12 * SIZE(AO) + + MUL a1, c04, c04 + MUL a2, c04, t1 + SUB c03, t1, c03 + MUL a3, c04, t1 + SUB c02, t1, c02 + MUL a4, c04, t1 + SUB c01, t1, c01 + + LD b1, 10 * SIZE(AO) + LD b2, 9 * SIZE(AO) + LD b3, 8 * SIZE(AO) + + MUL b1, c03, c03 + MUL b2, c03, t1 + SUB c02, t1, c02 + MUL b3, c03, t1 + SUB c01, t1, c01 + + LD a1, 5 * SIZE(AO) + LD a2, 4 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, c02 + MUL a2, c02, t1 + SUB c01, t1, c01 + MUL a3, c01, c01 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a1, c01, c01 + MUL a2, c01, t1 + SUB c02, t1, c02 + MUL a3, c01, t1 + SUB c03, t1, c03 + MUL a4, c01, t1 + SUB c04, t1, c04 + + LD b1, 5 * SIZE(AO) + LD b2, 6 * SIZE(AO) + LD b3, 7 * SIZE(AO) + + MUL b1, c02, c02 + MUL b2, c02, t1 + SUB c03, t1, c03 + MUL b3, c02, t1 + SUB c04, t1, c04 + + LD a1, 10 * SIZE(AO) + LD a2, 11 * SIZE(AO) + LD a3, 15 * SIZE(AO) + + MUL a1, c03, c03 + MUL a2, c03, t1 + SUB c04, t1, c04 + MUL a3, c04, c04 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) + ST c03, 2 * SIZE(BO) + ST c04, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -4 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + +#ifndef LN + lda C1, 4 * SIZE(C1) +#endif + + fclr t1 + fclr t2 + fclr t3 + 
fclr t4 + +#ifdef RT + sll K, 2 + BASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, BASE_SHIFT + 2, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 4, KK +#endif + +#ifdef LN + subq KK, 4, KK +#endif + + lda I, -1(I) + bgt I, $L91 + .align 4 + +$L100: + and M, 2, I + ble I, $L110 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c04 + + sra KK, 2, L + mov B, BO + ble L, $L105 +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TMP1 + addq AORIG, TMP1, AO + sll KK, BASE_SHIFT + 0, TMP1 + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c04 + + sra TMP1, 2, L + ble L, $L105 +#endif + .align 5 + +$L102: + ADD c01, t1, c01 + lda L, -1(L) + MUL a1, b1, t1 + LD a1, 4 * SIZE(AO) + + ADD c02, t2, c02 + MUL a2, b1, t2 + LD a2, 5 * SIZE(AO) + LD b1, 4 * SIZE(BO) + + ADD c03, t3, c03 + lda BO, 4 * SIZE(BO) + MUL a3, b2, t3 + LD a3, 6 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b2, t4 + LD a5, 7 * SIZE(AO) + LD b2, 1 * SIZE(BO) + + ADD c01, t1, c01 + MUL a1, b3, t1 + LD a1, 8 * SIZE(AO) + lda AO, 8 * SIZE(AO) + + ADD c02, t2, c02 + MUL a2, b3, t2 + LD b3, 2 * SIZE(BO) + LD a2, 1 * SIZE(AO) + + ADD c03, t3, c03 + LD a4, 3 * SIZE(AO) + MUL a3, b4, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, c04 + MUL a5, b4, t4 + LD b4, 3 * SIZE(BO) + bgt L, $L102 + .align 4 + +$L105: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TMP1, 3, L +#endif + ble L, $L108 + .align 4 + +$L106: + ADD c01, t1, c01 + lda L, -1(L) + MUL a1, b1, t1 + LD a1, 2 * SIZE(AO) + + ADD c02, t2, c02 + MUL a2, b1, t2 + LD a2, 3 * SIZE(AO) + LD b1, 1 * SIZE(BO) + + lda AO, 2 * SIZE(AO) + unop + lda BO, 1 * SIZE(BO) + bgt L, $L106 + .align 4 + +$L108: + ADD c01, t1, c01 + ADD c02, t2, c02 + ADD c03, t3, c03 + ADD c04, t4, c04 + + ADD c01, c03, c01 + ADD c02, c04, c02 + +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 2, TMP1 +#else + subq KK, 1, TMP1 +#endif + sll TMP1, BASE_SHIFT + 1, TMP2 + addq AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addq B, TMP2, BO +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, c02 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 +#endif + +#ifdef LN + LD a1, 3 * SIZE(AO) + LD a2, 2 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, c02 + MUL a2, c02, t1 + SUB c01, t1, c01 + MUL a3, c01, c01 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 3 * SIZE(AO) + + MUL a1, c01, c01 + MUL a2, c01, t1 + SUB c02, t1, c02 + MUL a3, c02, c02 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + + MUL a1, c01, c01 + MUL a1, c02, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -2 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + +#ifndef LN + lda C1, 2 * 
SIZE(C1) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 1 + BASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, BASE_SHIFT + 1, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 0, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 2, KK +#endif + +#ifdef LN + subq KK, 2, KK +#endif + .align 4 + +$L110: + and M, 1, I + ble I, $L119 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c04 + + sra KK, 2, L + mov B, BO + unop + ble L, $L115 +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TMP1 + addq AORIG, TMP1, AO + sll KK, BASE_SHIFT + 0, TMP1 + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c04 + + sra TMP1, 2, L + unop + ble L, $L115 +#endif + .align 4 + +$L112: + ADD c01, t1, c01 + MUL a1, b1, t1 + LD a1, 4 * SIZE(AO) + LD b1, 4 * SIZE(BO) + + ADD c02, t2, c02 + MUL a2, b2, t2 + LD a2, 5 * SIZE(AO) + LD b2, 5 * SIZE(BO) + + ADD c03, t3, c03 + MUL a3, b3, t3 + LD a3, 6 * SIZE(AO) + LD b3, 6 * SIZE(BO) + + ADD c04, t4, c04 + MUL a4, b4, t4 + LD a4, 7 * SIZE(AO) + LD b4, 7 * SIZE(BO) + + lda L, -1(L) + lda AO, 4 * SIZE(AO) + lda BO, 4 * SIZE(BO) + bgt L, $L112 + .align 4 + +$L115: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TMP1, 3, L +#endif + ble L, $L118 + .align 4 + +$L116: + ADD c01, t1, c01 + MUL a1, b1, t1 + LD a1, 1 * SIZE(AO) + LD b1, 1 * SIZE(BO) + + lda L, -1(L) + lda AO, 1 * SIZE(AO) + lda BO, 1 * SIZE(BO) + bgt L, $L116 + .align 4 + +$L118: + ADD c01, t1, c01 + ADD c02, t2, c02 + ADD c03, t3, c03 + ADD c04, t4, c04 + + ADD c01, c02, c01 + ADD c03, c04, c03 + ADD c01, c03, c01 + +#if defined(LN) || defined(RT) + subq KK, 1, TMP1 + sll TMP1, BASE_SHIFT + 0, TMP2 + addq AORIG, TMP2, AO + addq B, TMP2, BO +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + + SUB a1, c01, c01 +#else + LD a1, 0 * SIZE(AO) + + SUB a1, c01, c01 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + + MUL a1, c01, c01 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + + MUL a1, c01, c01 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -1 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + +#ifndef LN + lda C1, 1 * SIZE(C1) +#endif + +#ifdef RT + SXADDQ K, AORIG, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, BASE_SHIFT + 0, TMP2 + addq AO, TMP2, AO + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 1, KK +#endif + +#ifdef LN + subq KK, 1, KK +#endif + .align 4 + +$L119: +#ifdef LN + SXADDQ K, B, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addq KK, 1, KK +#endif + +#ifdef RT + subq KK, 1, KK +#endif + .align 4 + +$L40: + and N, 2, J + ble J, $L80 + +#ifdef RT + sll K, 1 + BASE_SHIFT, TMP1 + subq B, TMP1, B + + addq LDC, LDC, TMP1 + subq C, TMP1, C +#endif + + mov C, C1 + addq C, LDC, C2 + fclr t1 +#ifndef RT + addq C2, LDC, C +#endif + fclr t2 + +#ifdef LN + addq M, OFFSET, KK 
+#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + sra M, 2, I + fclr t3 + fclr t4 + ble I, $L60 + .align 4 + +$L51: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c03 + LD a2, 1 * SIZE(AO) + fclr c07 + LD a3, 2 * SIZE(AO) + fclr c04 + LD a4, 3 * SIZE(AO) + fclr c08 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c05 + LD b3, 2 * SIZE(B) + fclr c02 + LD b4, 3 * SIZE(B) + fclr c06 + + lda L, -2(KK) + + lda BO, 2 * SIZE(B) + lda AO, 4 * SIZE(AO) + + ble KK, $L58 + + ble L, $L55 +#else +#ifdef LN + sll K, BASE_SHIFT + 2, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 2, TMP1 + addq AORIG, TMP1, AO + sll KK, BASE_SHIFT + 1, TMP1 + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c03 + LD a2, 1 * SIZE(AO) + fclr c07 + LD a3, 2 * SIZE(AO) + fclr c04 + LD a4, 3 * SIZE(AO) + fclr c08 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c05 + LD b3, 2 * SIZE(BO) + fclr c02 + LD b4, 3 * SIZE(BO) + fclr c06 + + lda L, -2(TMP1) + lda BO, 2 * SIZE(BO) + lda AO, 4 * SIZE(AO) + + ble TMP1, $L58 + + ble L, $L55 +#endif + .align 4 + +$L52: + ADD c05, t1, c05 + unop + MUL a1, b1, t1 + unop + + ADD c06, t2, c06 + lda L, -2(L) + MUL a2, b1, t2 + unop + + ADD c07, t3, c07 + unop + MUL a3, b1, t3 + unop + + ADD c08, t4, c08 + unop + MUL a4, b1, t4 + LD b1, 2 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD c02, t2, c02 + lda BO, 4 * SIZE(BO) + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, c04 + unop + MUL a4, b2, t4 + LD a5, 3 * SIZE(AO) + + ADD c05, t1, c05 + unop + MUL a1, b3, t1 + LD b2, -1 * SIZE(BO) + + ADD c06, t2, c06 + unop + MUL a2, b3, t2 + unop + + ADD c07, t3, c07 + unop + MUL a3, b3, t3 + lda AO, 8 * SIZE(AO) + + ADD c08, t4, c08 + unop + MUL a5, b3, t4 + LD b3, 0 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b4, t1 + LD a1, -4 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b4, t2 + LD a2, -3 * SIZE(AO) + + ADD c03, t3, c03 + LD a4, -1 * SIZE(AO) + MUL a3, b4, t3 + LD a3, -2 * SIZE(AO) + + ADD c04, t4, c04 + MUL a5, b4, t4 + LD b4, 1 * SIZE(BO) + bgt L, $L52 + .align 4 + +$L55: + ADD c05, t1, c05 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L57 +#else + blbs TMP1, $L57 +#endif + .align 4 + + ADD c06, t2, c06 + MUL a2, b1, t2 + ADD c07, t3, c07 + MUL a3, b1, t3 + + ADD c08, t4, c08 + unop + MUL a4, b1, t4 + LD b1, 0 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD c04, t4, c04 + MUL a4, b2, t4 + LD a4, 3 * SIZE(AO) + lda AO, 4 * SIZE(AO) + + ADD c05, t1, c05 + LD b2, 1 * SIZE(BO) + MUL a1, b1, t1 + lda BO, 2 * SIZE(BO) + .align 4 + +$L57: + ADD c06, t2, c06 + MUL a2, b1, t2 + ADD c07, t3, c07 + MUL a3, b1, t3 + + ADD c08, t4, c08 + MUL a4, b1, t4 + ADD c01, t1, c01 + MUL a1, b2, t1 + + ADD c02, t2, c02 + MUL a2, b2, t2 + ADD c03, t3, c03 + MUL a3, b2, t3 + + ADD c04, t4, c04 + lda AO, 4 * SIZE(AO) + MUL a4, b2, t4 + lda BO, 2 * SIZE(BO) + + ADD c05, t1, c05 + ADD c06, t2, c06 + ADD c07, t3, c07 + ADD c08, t4, c08 + .align 4 + +$L58: +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 4, TMP1 +#else + subq KK, 2, TMP1 +#endif + sll TMP1, BASE_SHIFT + 2, TMP2 + addq AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addq B, TMP2, BO +#else + lda AO, 
-4 * SIZE(AO) + lda BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + LD b1, 4 * SIZE(BO) + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c05, c05 + SUB a3, c02, c02 + SUB a4, c06, c06 + + SUB b1, c03, c03 + SUB b2, c07, c07 + SUB b3, c04, c04 + SUB b4, c08, c08 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 4 * SIZE(AO) + LD b2, 5 * SIZE(AO) + LD b3, 6 * SIZE(AO) + LD b4, 7 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 + + SUB b1, c05, c05 + SUB b2, c06, c06 + SUB b3, c07, c07 + SUB b4, c08, c08 +#endif + +#ifdef LN + LD a1, 15 * SIZE(AO) + LD a2, 14 * SIZE(AO) + LD a3, 13 * SIZE(AO) + LD a4, 12 * SIZE(AO) + + MUL a1, c04, c04 + MUL a1, c08, c08 + + MUL a2, c04, t1 + MUL a2, c08, t2 + + SUB c03, t1, c03 + SUB c07, t2, c07 + + MUL a3, c04, t1 + MUL a3, c08, t2 + + SUB c02, t1, c02 + SUB c06, t2, c06 + + MUL a4, c04, t1 + MUL a4, c08, t2 + + SUB c01, t1, c01 + SUB c05, t2, c05 + + LD b1, 10 * SIZE(AO) + LD b2, 9 * SIZE(AO) + LD b3, 8 * SIZE(AO) + + MUL b1, c03, c03 + MUL b1, c07, c07 + + MUL b2, c03, t1 + MUL b2, c07, t2 + + SUB c02, t1, c02 + SUB c06, t2, c06 + + MUL b3, c03, t1 + MUL b3, c07, t2 + + SUB c01, t1, c01 + SUB c05, t2, c05 + + LD a1, 5 * SIZE(AO) + LD a2, 4 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, c02 + MUL a1, c06, c06 + + MUL a2, c02, t1 + MUL a2, c06, t2 + + SUB c01, t1, c01 + SUB c05, t2, c05 + + MUL a3, c01, c01 + MUL a3, c05, c05 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a1, c01, c01 + MUL a1, c05, c05 + + MUL a2, c01, t1 + MUL a2, c05, t2 + + SUB c02, t1, c02 + SUB c06, t2, c06 + + MUL a3, c01, t1 + MUL a3, c05, t2 + + SUB c03, t1, c03 + SUB c07, t2, c07 + + MUL a4, c01, t1 + MUL a4, c05, t2 + + SUB c04, t1, c04 + SUB c08, t2, c08 + + LD b1, 5 * SIZE(AO) + LD b2, 6 * SIZE(AO) + LD b3, 7 * SIZE(AO) + + MUL b1, c02, c02 + MUL b1, c06, c06 + + MUL b2, c02, t1 + MUL b2, c06, t2 + + SUB c03, t1, c03 + SUB c07, t2, c07 + + MUL b3, c02, t1 + MUL b3, c06, t2 + + SUB c04, t1, c04 + SUB c08, t2, c08 + + LD a1, 10 * SIZE(AO) + LD a2, 11 * SIZE(AO) + LD a3, 15 * SIZE(AO) + + MUL a1, c03, c03 + MUL a1, c07, c07 + + MUL a2, c03, t1 + MUL a2, c07, t2 + + SUB c04, t1, c04 + SUB c08, t2, c08 + + MUL a3, c04, c04 + MUL a3, c08, c08 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 3 * SIZE(BO) + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 + + MUL a2, c01, t1 + MUL a2, c02, t2 + MUL a2, c03, t3 + MUL a2, c04, t4 + + SUB c05, t1, c05 + SUB c06, t2, c06 + SUB c07, t3, c07 + SUB c08, t4, c08 + + MUL a3, c05, c05 + MUL a3, c06, c06 + MUL a3, c07, c07 + MUL a3, c08, c08 +#endif + +#ifdef RT + LD a1, 3 * SIZE(BO) + LD a2, 2 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, c05 + MUL a1, c06, c06 + MUL a1, c07, c07 + MUL a1, c08, c08 + + MUL a2, c05, t1 + MUL a2, c06, t2 + MUL a2, c07, t3 + MUL a2, c08, t4 + + SUB c01, t1, c01 + SUB c02, t2, c02 + SUB c03, t3, c03 + SUB c04, t4, c04 + + MUL a3, c01, c01 + MUL a3, c02, c02 + MUL a3, c03, c03 + MUL a3, c04, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c02, 2 * SIZE(BO) + ST c06, 3 * SIZE(BO) + + ST c03, 4 * SIZE(BO) + ST c07, 5 * SIZE(BO) + ST c04, 6 * SIZE(BO) + ST c08, 7 * SIZE(BO) +#else + ST c01, 0 * 
SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) + + ST c05, 4 * SIZE(AO) + ST c06, 5 * SIZE(AO) + ST c07, 6 * SIZE(AO) + ST c08, 7 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -4 * SIZE(C1) + lda C2, -4 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + + ST c05, 0 * SIZE(C2) + ST c06, 1 * SIZE(C2) + ST c07, 2 * SIZE(C2) + ST c08, 3 * SIZE(C2) + +#ifndef LN + lda C1, 4 * SIZE(C1) + lda C2, 4 * SIZE(C2) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 2 + BASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, BASE_SHIFT + 2, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 4, KK +#endif + +#ifdef LN + subq KK, 4, KK +#endif + + lda I, -1(I) + + bgt I, $L51 + .align 4 + +$L60: + and M, 2, I + ble I, $L70 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(B) + lda L, -2(KK) + LD b2, 1 * SIZE(B) + lda AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + lda BO, 2 * SIZE(B) + + ble KK, $L68 + + ble L, $L65 +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TMP1 + addq AORIG, TMP1, AO + sll KK, BASE_SHIFT + 1, TMP1 + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(BO) + lda L, -2(TMP1) + LD b2, 1 * SIZE(BO) + lda AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + lda BO, 2 * SIZE(BO) + + ble TMP1, $L68 + + ble L, $L65 +#endif + .align 4 + +$L62: + ADD c01, t1, c01 + unop + MUL a1, b1, t1 + unop + + ADD c02, t2, c02 + lda AO, 4 * SIZE(AO) + MUL a2, b1, t2 + LD b1, 2 * SIZE(BO) + + ADD c05, t3, c05 + lda L, -2(L) + MUL a1, b2, t3 + LD a1, -2 * SIZE(AO) + + ADD c06, t4, c06 + unop + MUL a2, b2, t4 + LD a2, -1 * SIZE(AO) + + ADD c01, t1, c01 + LD b2, 3 * SIZE(BO) + MUL a3, b3, t1 + lda BO, 4 * SIZE(BO) + + ADD c02, t2, c02 + unop + MUL a4, b3, t2 + LD b3, 0 * SIZE(BO) + + ADD c05, t3, c05 + unop + MUL a3, b4, t3 + LD a3, 0 * SIZE(AO) + + ADD c06, t4, c06 + MUL a4, b4, t4 + LD b4, 1 * SIZE(BO) + unop + + LD a4, 1 * SIZE(AO) + unop + unop + bgt L, $L62 + .align 4 + +$L65: + ADD c01, t1, c01 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L67 +#else + blbs TMP1, $L67 +#endif + .align 4 + + ADD c02, t2, c02 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD c05, t3, c05 + lda BO, 2 * SIZE(BO) + MUL a1, b2, t3 + LD a1, 0 * SIZE(AO) + + ADD c06, t4, c06 + unop + MUL a2, b2, t4 + LD a2, 1 * SIZE(AO) + + ADD c01, t1, c01 + LD b2, -1 * SIZE(BO) + MUL a1, b1, t1 + lda AO, 2 * SIZE(AO) + .align 4 + +$L67: + ADD c02, t2, c02 + MUL a2, b1, t2 + ADD c05, t3, c05 + MUL a1, b2, t3 + + ADD c06, t4, c06 + lda AO, 2 * SIZE(AO) + MUL a2, b2, t4 + lda BO, 2 * SIZE(BO) + + ADD c01, t1, c01 + ADD c02, t2, c02 + ADD c05, t3, c05 + ADD c06, t4, c06 + .align 4 + +$L68: +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 2, TMP1 +#else + subq KK, 2, TMP1 +#endif + sll TMP1, BASE_SHIFT + 1, TMP2 + addq AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addq B, TMP2, BO +#else + lda AO, -2 * SIZE(AO) + lda BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * 
SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c05, c05 + SUB a3, c02, c02 + SUB a4, c06, c06 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c05, c05 + SUB a4, c06, c06 +#endif + +#ifdef LN + LD a1, 3 * SIZE(AO) + LD a2, 2 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, c02 + MUL a1, c06, c06 + + MUL a2, c02, t1 + MUL a2, c06, t2 + + SUB c01, t1, c01 + SUB c05, t2, c05 + + MUL a3, c01, c01 + MUL a3, c05, c05 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 3 * SIZE(AO) + + MUL a1, c01, c01 + MUL a1, c05, c05 + + MUL a2, c01, t1 + MUL a2, c05, t2 + + SUB c02, t1, c02 + SUB c06, t2, c06 + + MUL a3, c02, c02 + MUL a3, c06, c06 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 3 * SIZE(BO) + + MUL a1, c01, c01 + MUL a1, c02, c02 + + MUL a2, c01, t1 + MUL a2, c02, t2 + + SUB c05, t1, c05 + SUB c06, t2, c06 + + MUL a3, c05, c05 + MUL a3, c06, c06 +#endif + +#ifdef RT + LD a1, 3 * SIZE(BO) + LD a2, 2 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, c05 + MUL a1, c06, c06 + + MUL a2, c05, t1 + MUL a2, c06, t2 + + SUB c01, t1, c01 + SUB c02, t2, c02 + + MUL a3, c01, c01 + MUL a3, c02, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c02, 2 * SIZE(BO) + ST c06, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c05, 2 * SIZE(AO) + ST c06, 3 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -2 * SIZE(C1) + lda C2, -2 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c05, 0 * SIZE(C2) + ST c06, 1 * SIZE(C2) + +#ifndef LN + lda C1, 2 * SIZE(C1) + lda C2, 2 * SIZE(C2) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 1 + BASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, BASE_SHIFT + 1, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 2, KK +#endif + +#ifdef LN + subq KK, 2, KK +#endif + .align 4 + +$L70: + and M, 1, I + ble I, $L79 + +#if defined(LT) || defined(RN) + + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(B) + fclr c02 + LD b2, 1 * SIZE(B) + fclr c06 + + lda L, -2(KK) + + LD b3, 2 * SIZE(B) + lda AO, 1 * SIZE(AO) + LD b4, 3 * SIZE(B) + lda BO, 2 * SIZE(B) + + ble KK, $L78 + + ble L, $L75 +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TMP1 + addq AORIG, TMP1, AO + sll KK, BASE_SHIFT + 1, TMP1 + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(BO) + fclr c02 + LD b2, 1 * SIZE(BO) + fclr c06 + + lda L, -2(TMP1) + + LD b3, 2 * SIZE(BO) + lda AO, 1 * SIZE(AO) + LD b4, 3 * SIZE(BO) + lda BO, 2 * SIZE(BO) + + ble TMP1, $L78 + + ble L, $L75 +#endif + .align 4 + +$L72: + ADD c01, t1, c01 + lda L, -2(L) + MUL a1, b1, t1 + LD b1, 2 * SIZE(BO) + + ADD c05, t2, c05 + MUL a1, b2, t2 + LD a1, 1 * SIZE(AO) + LD b2, 3 * SIZE(BO) + + ADD c02, t3, c02 + lda AO, 2 * SIZE(AO) + MUL a2, b3, t3 + LD b3, 4 * SIZE(BO) + + ADD c06, t4, c06 + MUL a2, b4, t4 + LD a2, 0 * SIZE(AO) + LD b4, 5 * SIZE(BO) + + lda BO, 4 * SIZE(BO) + unop + unop + bgt L, $L72 + .align 4 + +$L75: + ADD c01, t1, c01 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L77 +#else + blbs TMP1, $L77 +#endif + .align 4 + + ADD c05, t2, c05 + MUL a1, b2, t2 + LD a1, 0 * SIZE(AO) 
+ LD b1, 0 * SIZE(BO) + + ADD c01, t1, c01 + LD b2, 1 * SIZE(BO) + lda AO, 1 * SIZE(AO) + MUL a1, b1, t1 + lda BO, 2 * SIZE(BO) + .align 4 + +$L77: + ADD c05, t2, c05 + MUL a1, b2, t2 + ADD c02, t3, c02 + ADD c06, t4, c06 + + ADD c01, c02, c01 + lda AO, 1 * SIZE(AO) + ADD c05, c06, c05 + lda BO, 2 * SIZE(BO) + + ADD c01, t1, c01 + ADD c05, t2, c05 + + .align 4 + +$L78: +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 1, TMP1 +#else + subq KK, 2, TMP1 +#endif + sll TMP1, BASE_SHIFT + 0, TMP2 + addq AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addq B, TMP2, BO +#else + lda AO, -1 * SIZE(AO) + lda BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c05, c05 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c05, c05 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + + MUL a1, c01, c01 + MUL a1, c05, c05 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 3 * SIZE(BO) + + MUL a1, c01, c01 + MUL a2, c01, t1 + SUB c05, t1, c05 + MUL a3, c05, c05 +#endif + +#ifdef RT + LD a1, 3 * SIZE(BO) + LD a2, 2 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, c05 + MUL a2, c05, t1 + SUB c01, t1, c01 + MUL a3, c01, c01 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c05, 1 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -1 * SIZE(C1) + lda C2, -1 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c05, 0 * SIZE(C2) + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 0 + BASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, BASE_SHIFT + 0, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 1, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 1, KK +#endif + +#ifdef LN + subq KK, 1, KK +#endif + .align 4 + +$L79: +#ifdef LN + sll K, 1 + BASE_SHIFT, TMP1 + addq B, TMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addq KK, 2, KK +#endif + +#ifdef RT + subq KK, 2, KK +#endif + .align 4 + +$L80: + sra N, 2, J + ble J, $L999 + .align 4 + +$L01: +#ifdef RT + sll K, 2 + BASE_SHIFT, TMP1 + subq B, TMP1, B + + s4addq LDC, 0, TMP1 + subq C, TMP1, C +#endif + + mov C, C1 + addq C, LDC, C2 + addq C2, LDC, C3 +#ifndef RT + s4addq LDC, C, C +#endif + + fclr t1 + addq C3, LDC, C4 + fclr t2 + +#ifdef LN + addq M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + sra M, 2, I + fclr t3 + fclr t4 + ble I, $L20 + .align 4 + +$L11: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c11 + LD a2, 1 * SIZE(AO) + fclr c12 + + LD a3, 2 * SIZE(AO) + fclr c16 + LD a4, 3 * SIZE(AO) + fclr c15 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c02 + + LD b3, 2 * SIZE(B) + fclr c06 + LD b4, 3 * SIZE(B) + fclr c05 + + lds $f31, 4 * SIZE(C1) + fclr c03 + lda L, -2(KK) + fclr c04 + + lds $f31, 7 * SIZE(C2) + fclr c08 + lda BO, 4 * SIZE(B) + fclr c13 + + lds $f31, 4 * SIZE(C3) + fclr c09 + lda AO, 4 * SIZE(AO) + fclr c10 + + lds $f31, 7 * SIZE(C4) + fclr c14 + fclr c07 + ble KK, $L18 +#else + +#ifdef LN + sll K, BASE_SHIFT + 2, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 2, TMP1 + addq AORIG, TMP1, AO + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c11 + LD a2, 1 * SIZE(AO) + fclr c12 + + LD a3, 2 * SIZE(AO) + fclr c16 + LD a4, 3 * SIZE(AO) + fclr c15 + + LD 
b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c02 + + LD b3, 2 * SIZE(BO) + fclr c06 + LD b4, 3 * SIZE(BO) + fclr c05 + + lds $f31, 4 * SIZE(C1) + fclr c03 + lda L, -2(TMP1) + fclr c04 + + lds $f31, 7 * SIZE(C2) + fclr c08 + lda BO, 4 * SIZE(BO) + fclr c13 + + lds $f31, 4 * SIZE(C3) + fclr c09 + lda AO, 4 * SIZE(AO) + fclr c10 + + lds $f31, 7 * SIZE(C4) + fclr c14 + fclr c07 + ble TMP1, $L18 +#endif + + ble L, $L15 + .align 5 + +$L12: +/* 1 */ + ADD c11, t1, c11 +#ifndef EV4 + ldq $31, PREFETCHSIZE * SIZE(AO) +#else + unop +#endif + MUL b1, a1, t1 +#ifndef EV4 + ldl $31, PREFETCHSIZE * SIZE(BO) +#else + unop +#endif + + ADD c12, t2, c12 + unop + MUL b1, a2, t2 + unop + + ADD c16, t3, c16 + unop + MUL b2, a2, t3 + LD a5, 0 * SIZE(AO) + + ADD c15, t4, c15 + unop + MUL b2, a1, t4 + LD b5, 0 * SIZE(BO) + +/* 2 */ + ADD c01, t1, c01 + UNOP + MUL b1, a3, t1 + UNOP + + ADD c02, t2, c02 + UNOP + MUL b1, a4, t2 + UNOP + + ADD c06, t3, c06 + unop + MUL b2, a4, t3 + unop + + ADD c05, t4, c05 + unop + MUL b4, a1, t4 + unop + +/* 3 */ + ADD c03, t1, c03 + unop + MUL b3, a1, t1 + unop + + ADD c04, t2, c04 + unop + MUL b3, a2, t2 + unop + + ADD c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD c13, t4, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + +/* 4 */ + ADD c09, t1, c09 + unop + MUL b3, a3, t1 + LD a6, 2 * SIZE(AO) + + ADD c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, 3 * SIZE(AO) + + ADD c07, t4, c07 + unop + MUL b4, a3, t4 + LD b4, 3 * SIZE(BO) + +/* 5 */ + ADD c11, t1, c11 + unop + MUL b5, a5, t1 + LD a1, 4 * SIZE(AO) + + ADD c12, t2, c12 + lda L, -2(L) + MUL b5, a2, t2 + LD b1, 4 * SIZE(BO) + + ADD c16, t3, c16 + unop + MUL b2, a2, t3 + unop + + ADD c15, t4, c15 + unop + MUL b2, a5, t4 + unop + +/* 6 */ + ADD c01, t1, c01 + unop + MUL b5, a6, t1 + unop + + ADD c02, t2, c02 + unop + MUL b5, a4, t2 + unop + + ADD c06, t3, c06 + unop + MUL b2, a4, t3 + unop + + ADD c05, t4, c05 + unop + MUL b4, a5, t4 + unop + +/* 7 */ + ADD c03, t1, c03 + lda AO, 8 * SIZE(AO) + MUL b3, a5, t1 + unop + + ADD c04, t2, c04 + lda BO, 8 * SIZE(BO) + MUL b3, a2, t2 + unop + + ADD c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, -3 * SIZE(AO) + + ADD c13, t4, c13 + unop + MUL b2, a6, t4 + LD b2, -3 * SIZE(BO) + +/* 8 */ + ADD c09, t1, c09 + unop + MUL b3, a6, t1 + LD a3, -2 * SIZE(AO) + + ADD c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, -2 * SIZE(BO) + + ADD c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD c07, t4, c07 + MUL b4, a6, t4 + LD b4, -1 * SIZE(BO) + bgt L, $L12 + .align 4 + +$L15: + ADD c11, t1, c11 + MUL b1, a1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L17 +#else + blbs TMP1, $L17 +#endif + .align 4 + + ADD c12, t2, c12 + MUL b1, a2, t2 + ADD c16, t3, c16 + MUL b2, a2, t3 + + ADD c15, t4, c15 + MUL b2, a1, t4 + ADD c01, t1, c01 + MUL b1, a3, t1 + + ADD c02, t2, c02 + unop + MUL b1, a4, t2 + LD b1, 0 * SIZE(BO) + + ADD c06, t3, c06 + MUL b2, a4, t3 + ADD c05, t4, c05 + MUL b4, a1, t4 + + ADD c03, t1, c03 + unop + MUL b3, a1, t1 + LD a1, 0 * SIZE(AO) + + ADD c04, t2, c04 + unop + MUL b3, a2, t2 + unop + + ADD c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD c13, t4, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + + ADD c09, t1, c09 + unop + MUL b3, a3, t1 + lda AO, 4 * SIZE(AO) + + ADD c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD c07, t4, c07 + unop + MUL b4, a3, t4 + LD a3, -2 * 
SIZE(AO) + + ADD c11, t1, c11 + LD b4, 3 * SIZE(BO) + MUL b1, a1, t1 + lda BO, 4 * SIZE(BO) + .align 4 + +$L17: + ADD c12, t2, c12 + MUL b1, a2, t2 + ADD c16, t3, c16 + MUL b2, a2, t3 + + ADD c15, t4, c15 + MUL b2, a1, t4 + ADD c01, t1, c01 + MUL b1, a3, t1 + + ADD c02, t2, c02 + MUL b1, a4, t2 + ADD c06, t3, c06 + MUL b2, a4, t3 + + ADD c05, t4, c05 + MUL b4, a1, t4 + ADD c03, t1, c03 + MUL b3, a1, t1 + + ADD c04, t2, c04 + MUL b3, a2, t2 + ADD c08, t3, c08 + MUL b4, a2, t3 + + ADD c13, t4, c13 + MUL b2, a3, t4 + ADD c09, t1, c09 + MUL b3, a3, t1 + + ADD c10, t2, c10 + MUL b3, a4, t2 + ADD c14, t3, c14 + MUL b4, a4, t3 + + ADD c07, t4, c07 + lda AO, 4 * SIZE(AO) + MUL b4, a3, t4 + lda BO, 4 * SIZE(BO) + + ADD c11, t1, c11 + ADD c12, t2, c12 + ADD c16, t3, c16 + ADD c15, t4, c15 + .align 4 + +$L18: +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 4, TMP1 +#else + subq KK, 4, TMP1 +#endif + sll TMP1, BASE_SHIFT + 2, TMP2 + addq AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addq B, TMP2, BO +#else + lda AO, -4 * SIZE(AO) + lda BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + LD b1, 4 * SIZE(BO) + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c05, c05 + SUB a3, c09, c09 + SUB a4, c13, c13 + + SUB b1, c02, c02 + SUB b2, c06, c06 + SUB b3, c10, c10 + SUB b4, c14, c14 + + LD a1, 8 * SIZE(BO) + LD a2, 9 * SIZE(BO) + LD a3, 10 * SIZE(BO) + LD a4, 11 * SIZE(BO) + + LD b1, 12 * SIZE(BO) + LD b2, 13 * SIZE(BO) + LD b3, 14 * SIZE(BO) + LD b4, 15 * SIZE(BO) + + SUB a1, c03, c03 + SUB a2, c07, c07 + SUB a3, c11, c11 + SUB a4, c15, c15 + + SUB b1, c04, c04 + SUB b2, c08, c08 + SUB b3, c12, c12 + SUB b4, c16, c16 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 4 * SIZE(AO) + LD b2, 5 * SIZE(AO) + LD b3, 6 * SIZE(AO) + LD b4, 7 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 + + SUB b1, c05, c05 + SUB b2, c06, c06 + SUB b3, c07, c07 + SUB b4, c08, c08 + + LD a1, 8 * SIZE(AO) + LD a2, 9 * SIZE(AO) + LD a3, 10 * SIZE(AO) + LD a4, 11 * SIZE(AO) + + LD b1, 12 * SIZE(AO) + LD b2, 13 * SIZE(AO) + LD b3, 14 * SIZE(AO) + LD b4, 15 * SIZE(AO) + + SUB a1, c09, c09 + SUB a2, c10, c10 + SUB a3, c11, c11 + SUB a4, c12, c12 + + SUB b1, c13, c13 + SUB b2, c14, c14 + SUB b3, c15, c15 + SUB b4, c16, c16 +#endif + +#ifdef LN + LD a1, 15 * SIZE(AO) + LD a2, 14 * SIZE(AO) + LD a3, 13 * SIZE(AO) + LD a4, 12 * SIZE(AO) + + MUL a1, c04, c04 + MUL a1, c08, c08 + MUL a1, c12, c12 + MUL a1, c16, c16 + + MUL a2, c04, t1 + MUL a2, c08, t2 + MUL a2, c12, t3 + MUL a2, c16, t4 + + SUB c03, t1, c03 + SUB c07, t2, c07 + SUB c11, t3, c11 + SUB c15, t4, c15 + + MUL a3, c04, t1 + MUL a3, c08, t2 + MUL a3, c12, t3 + MUL a3, c16, t4 + + SUB c02, t1, c02 + SUB c06, t2, c06 + SUB c10, t3, c10 + SUB c14, t4, c14 + + MUL a4, c04, t1 + MUL a4, c08, t2 + MUL a4, c12, t3 + MUL a4, c16, t4 + + SUB c01, t1, c01 + SUB c05, t2, c05 + SUB c09, t3, c09 + SUB c13, t4, c13 + + LD b1, 10 * SIZE(AO) + LD b2, 9 * SIZE(AO) + LD b3, 8 * SIZE(AO) + + MUL b1, c03, c03 + MUL b1, c07, c07 + MUL b1, c11, c11 + MUL b1, c15, c15 + + MUL b2, c03, t1 + MUL b2, c07, t2 + MUL b2, c11, t3 + MUL b2, c15, t4 + + SUB c02, t1, c02 + SUB c06, t2, c06 + SUB c10, t3, c10 + SUB c14, t4, c14 + + MUL b3, c03, t1 + MUL b3, c07, t2 + MUL b3, c11, t3 + MUL b3, c15, t4 + + SUB c01, t1, c01 + SUB c05, t2, c05 + SUB c09, t3, c09 + SUB 
c13, t4, c13 + + LD a1, 5 * SIZE(AO) + LD a2, 4 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, c02 + MUL a1, c06, c06 + MUL a1, c10, c10 + MUL a1, c14, c14 + + MUL a2, c02, t1 + MUL a2, c06, t2 + MUL a2, c10, t3 + MUL a2, c14, t4 + + SUB c01, t1, c01 + SUB c05, t2, c05 + SUB c09, t3, c09 + SUB c13, t4, c13 + + MUL a3, c01, c01 + MUL a3, c05, c05 + MUL a3, c09, c09 + MUL a3, c13, c13 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a1, c01, c01 + MUL a1, c05, c05 + MUL a1, c09, c09 + MUL a1, c13, c13 + + MUL a2, c01, t1 + MUL a2, c05, t2 + MUL a2, c09, t3 + MUL a2, c13, t4 + + SUB c02, t1, c02 + SUB c06, t2, c06 + SUB c10, t3, c10 + SUB c14, t4, c14 + + MUL a3, c01, t1 + MUL a3, c05, t2 + MUL a3, c09, t3 + MUL a3, c13, t4 + + SUB c03, t1, c03 + SUB c07, t2, c07 + SUB c11, t3, c11 + SUB c15, t4, c15 + + MUL a4, c01, t1 + MUL a4, c05, t2 + MUL a4, c09, t3 + MUL a4, c13, t4 + + SUB c04, t1, c04 + SUB c08, t2, c08 + SUB c12, t3, c12 + SUB c16, t4, c16 + + LD b1, 5 * SIZE(AO) + LD b2, 6 * SIZE(AO) + LD b3, 7 * SIZE(AO) + + MUL b1, c02, c02 + MUL b1, c06, c06 + MUL b1, c10, c10 + MUL b1, c14, c14 + + MUL b2, c02, t1 + MUL b2, c06, t2 + MUL b2, c10, t3 + MUL b2, c14, t4 + + SUB c03, t1, c03 + SUB c07, t2, c07 + SUB c11, t3, c11 + SUB c15, t4, c15 + + MUL b3, c02, t1 + MUL b3, c06, t2 + MUL b3, c10, t3 + MUL b3, c14, t4 + + SUB c04, t1, c04 + SUB c08, t2, c08 + SUB c12, t3, c12 + SUB c16, t4, c16 + + LD a1, 10 * SIZE(AO) + LD a2, 11 * SIZE(AO) + LD a3, 15 * SIZE(AO) + + MUL a1, c03, c03 + MUL a1, c07, c07 + MUL a1, c11, c11 + MUL a1, c15, c15 + + MUL a2, c03, t1 + MUL a2, c07, t2 + MUL a2, c11, t3 + MUL a2, c15, t4 + + SUB c04, t1, c04 + SUB c08, t2, c08 + SUB c12, t3, c12 + SUB c16, t4, c16 + + MUL a3, c04, c04 + MUL a3, c08, c08 + MUL a3, c12, c12 + MUL a3, c16, c16 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 + + MUL a2, c01, t1 + MUL a2, c02, t2 + MUL a2, c03, t3 + MUL a2, c04, t4 + + SUB c05, t1, c05 + SUB c06, t2, c06 + SUB c07, t3, c07 + SUB c08, t4, c08 + + MUL a3, c01, t1 + MUL a3, c02, t2 + MUL a3, c03, t3 + MUL a3, c04, t4 + + SUB c09, t1, c09 + SUB c10, t2, c10 + SUB c11, t3, c11 + SUB c12, t4, c12 + + MUL a4, c01, t1 + MUL a4, c02, t2 + MUL a4, c03, t3 + MUL a4, c04, t4 + + SUB c13, t1, c13 + SUB c14, t2, c14 + SUB c15, t3, c15 + SUB c16, t4, c16 + + LD b1, 5 * SIZE(BO) + LD b2, 6 * SIZE(BO) + LD b3, 7 * SIZE(BO) + + MUL b1, c05, c05 + MUL b1, c06, c06 + MUL b1, c07, c07 + MUL b1, c08, c08 + + MUL b2, c05, t1 + MUL b2, c06, t2 + MUL b2, c07, t3 + MUL b2, c08, t4 + + SUB c09, t1, c09 + SUB c10, t2, c10 + SUB c11, t3, c11 + SUB c12, t4, c12 + + MUL b3, c05, t1 + MUL b3, c06, t2 + MUL b3, c07, t3 + MUL b3, c08, t4 + + SUB c13, t1, c13 + SUB c14, t2, c14 + SUB c15, t3, c15 + SUB c16, t4, c16 + + LD a1, 10 * SIZE(BO) + LD a2, 11 * SIZE(BO) + LD a3, 15 * SIZE(BO) + + MUL a1, c09, c09 + MUL a1, c10, c10 + MUL a1, c11, c11 + MUL a1, c12, c12 + + MUL a2, c09, t1 + MUL a2, c10, t2 + MUL a2, c11, t3 + MUL a2, c12, t4 + + SUB c13, t1, c13 + SUB c14, t2, c14 + SUB c15, t3, c15 + SUB c16, t4, c16 + + MUL a3, c13, c13 + MUL a3, c14, c14 + MUL a3, c15, c15 + MUL a3, c16, c16 +#endif + +#ifdef RT + LD a1, 15 * SIZE(BO) + LD a2, 14 * SIZE(BO) + LD a3, 13 * SIZE(BO) + LD a4, 12 * SIZE(BO) + + MUL a1, c13, c13 + MUL a1, c14, c14 + MUL a1, c15, c15 + MUL a1, c16, c16 + + MUL a2, c13, t1 + MUL a2, 
c14, t2 + MUL a2, c15, t3 + MUL a2, c16, t4 + + SUB c09, t1, c09 + SUB c10, t2, c10 + SUB c11, t3, c11 + SUB c12, t4, c12 + + MUL a3, c13, t1 + MUL a3, c14, t2 + MUL a3, c15, t3 + MUL a3, c16, t4 + + SUB c05, t1, c05 + SUB c06, t2, c06 + SUB c07, t3, c07 + SUB c08, t4, c08 + + MUL a4, c13, t1 + MUL a4, c14, t2 + MUL a4, c15, t3 + MUL a4, c16, t4 + + SUB c01, t1, c01 + SUB c02, t2, c02 + SUB c03, t3, c03 + SUB c04, t4, c04 + + LD b1, 10 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 8 * SIZE(BO) + + MUL b1, c09, c09 + MUL b1, c10, c10 + MUL b1, c11, c11 + MUL b1, c12, c12 + + MUL b2, c09, t1 + MUL b2, c10, t2 + MUL b2, c11, t3 + MUL b2, c12, t4 + + SUB c05, t1, c05 + SUB c06, t2, c06 + SUB c07, t3, c07 + SUB c08, t4, c08 + + MUL b3, c09, t1 + MUL b3, c10, t2 + MUL b3, c11, t3 + MUL b3, c12, t4 + + SUB c01, t1, c01 + SUB c02, t2, c02 + SUB c03, t3, c03 + SUB c04, t4, c04 + + LD a1, 5 * SIZE(BO) + LD a2, 4 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, c05 + MUL a1, c06, c06 + MUL a1, c07, c07 + MUL a1, c08, c08 + + MUL a2, c05, t1 + MUL a2, c06, t2 + MUL a2, c07, t3 + MUL a2, c08, t4 + + SUB c01, t1, c01 + SUB c02, t2, c02 + SUB c03, t3, c03 + SUB c04, t4, c04 + + MUL a3, c01, c01 + MUL a3, c02, c02 + MUL a3, c03, c03 + MUL a3, c04, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c13, 3 * SIZE(BO) + + ST c02, 4 * SIZE(BO) + ST c06, 5 * SIZE(BO) + ST c10, 6 * SIZE(BO) + ST c14, 7 * SIZE(BO) + + ST c03, 8 * SIZE(BO) + ST c07, 9 * SIZE(BO) + ST c11, 10 * SIZE(BO) + ST c15, 11 * SIZE(BO) + + ST c04, 12 * SIZE(BO) + ST c08, 13 * SIZE(BO) + ST c12, 14 * SIZE(BO) + ST c16, 15 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) + + ST c05, 4 * SIZE(AO) + ST c06, 5 * SIZE(AO) + ST c07, 6 * SIZE(AO) + ST c08, 7 * SIZE(AO) + + ST c09, 8 * SIZE(AO) + ST c10, 9 * SIZE(AO) + ST c11, 10 * SIZE(AO) + ST c12, 11 * SIZE(AO) + + ST c13, 12 * SIZE(AO) + ST c14, 13 * SIZE(AO) + ST c15, 14 * SIZE(AO) + ST c16, 15 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -4 * SIZE(C1) + lda C2, -4 * SIZE(C2) + lda C3, -4 * SIZE(C3) + lda C4, -4 * SIZE(C4) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + + ST c05, 0 * SIZE(C2) + ST c06, 1 * SIZE(C2) + ST c07, 2 * SIZE(C2) + ST c08, 3 * SIZE(C2) + + ST c09, 0 * SIZE(C3) + ST c10, 1 * SIZE(C3) + ST c11, 2 * SIZE(C3) + ST c12, 3 * SIZE(C3) + + ST c13, 0 * SIZE(C4) + ST c14, 1 * SIZE(C4) + ST c15, 2 * SIZE(C4) + ST c16, 3 * SIZE(C4) + +#ifndef LN + lda C1, 4 * SIZE(C1) + lda C2, 4 * SIZE(C2) + lda C3, 4 * SIZE(C3) + lda C4, 4 * SIZE(C4) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, 2 + BASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, BASE_SHIFT + 2, TMP1 + addq AO, TMP1, AO + addq BO, TMP1, BO +#endif + +#ifdef LT + addq KK, 4, KK +#endif + +#ifdef LN + subq KK, 4, KK +#endif + + lda I, -1(I) + + bgt I, $L11 + .align 4 + +$L20: + and M, 2, I + ble I, $L30 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c10 + LD a4, 3 * SIZE(AO) + fclr c14 + + LD b1, 0 * SIZE(B) + lda L, -2(KK) + LD b2, 1 * SIZE(B) + lda AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(B) + fclr c01 + LD b4, 3 * SIZE(B) + fclr c05 + + lda BO, 4 * SIZE(B) + fclr c02 + fclr c06 + ble KK, $L28 + + ble L, $L25 + +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TMP1 + subq AORIG, TMP1, 
AORIG +#endif + + sll KK, BASE_SHIFT + 1, TMP1 + addq AORIG, TMP1, AO + sll KK, BASE_SHIFT + 2, TMP2 + addq B, TMP2, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c10 + LD a4, 3 * SIZE(AO) + fclr c14 + + LD b1, 0 * SIZE(BO) + lda L, -2(TMP1) + LD b2, 1 * SIZE(BO) + lda AO, 2 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + fclr c01 + LD b4, 3 * SIZE(BO) + fclr c05 + + lda BO, 4 * SIZE(BO) + fclr c02 + fclr c06 + ble TMP1, $L28 + + ble L, $L25 +#endif + .align 4 + +$L22: + ADD c09, t1, c09 + unop + MUL a1, b1, t1 + unop + + ADD c10, t2, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD c13, t3, c13 + unop + MUL a1, b2, t3 + lda BO, 8 * SIZE(BO) + + ADD c14, t4, c14 + unop + MUL a2, b2, t4 + LD b2, -7 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b3, t1 + unop + + ADD c02, t2, c02 + unop + MUL a2, b3, t2 + LD b3, -6 * SIZE(BO) + + ADD c05, t3, c05 + unop + MUL a1, b4, t3 + LD a1, 2 * SIZE(AO) + + ADD c06, t4, c06 + MUL a2, b4, t4 + LD b5, -5 * SIZE(BO) + + ADD c09, t1, c09 + unop + MUL a3, b1, t1 + LD a2, 3 * SIZE(AO) + + ADD c10, t2, c10 + unop + MUL a4, b1, t2 + LD b1, -4 * SIZE(BO) + + ADD c13, t3, c13 + unop + MUL a3, b2, t3 + lda AO, 4 * SIZE(AO) + + ADD c14, t4, c14 + MUL a4, b2, t4 + LD b2, -3 * SIZE(BO) + + ADD c01, t1, c01 + lda L, -2(L) + MUL a3, b3, t1 + LD b4, -1 * SIZE(BO) + + ADD c02, t2, c02 + unop + MUL a4, b3, t2 + LD b3, -2 * SIZE(BO) + + ADD c05, t3, c05 + unop + MUL a3, b5, t3 + LD a3, 0 * SIZE(AO) + + ADD c06, t4, c06 + MUL a4, b5, t4 + LD a4, 1 * SIZE(AO) + bgt L, $L22 + .align 4 + +$L25: + ADD c09, t1, c09 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L27 +#else + blbs TMP1, $L27 +#endif + + ADD c10, t2, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD c13, t3, c13 + unop + MUL a1, b2, t3 + unop + + ADD c14, t4, c14 + unop + MUL a2, b2, t4 + LD b2, 1 * SIZE(BO) + + ADD c01, t1, c01 + unop + MUL a1, b3, t1 + lda AO, 2 * SIZE(AO) + + ADD c02, t2, c02 + unop + MUL a2, b3, t2 + LD b3, 2 * SIZE(BO) + + ADD c05, t3, c05 + unop + MUL a1, b4, t3 + LD a1, -2 * SIZE(AO) + + ADD c06, t4, c06 + unop + MUL a2, b4, t4 + LD a2, -1 * SIZE(AO) + + ADD c09, t1, c09 + LD b4, 3 * SIZE(BO) + MUL a1, b1, t1 + lda BO, 4 * SIZE(BO) + .align 4 + +$L27: + ADD c10, t2, c10 + MUL a2, b1, t2 + ADD c13, t3, c13 + MUL a1, b2, t3 + + ADD c14, t4, c14 + MUL a2, b2, t4 + ADD c01, t1, c01 + MUL a1, b3, t1 + + ADD c02, t2, c02 + MUL a2, b3, t2 + ADD c05, t3, c05 + MUL a1, b4, t3 + + ADD c06, t4, c06 + lda AO, 2 * SIZE(AO) + MUL a2, b4, t4 + lda BO, 4 * SIZE(BO) + + ADD c09, t1, c09 + ADD c10, t2, c10 + ADD c13, t3, c13 + ADD c14, t4, c14 + .align 4 + +$L28: +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 2, TMP1 +#else + subq KK, 4, TMP1 +#endif + sll TMP1, BASE_SHIFT + 1, TMP2 + addq AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addq B, TMP2, BO +#else + lda AO, -2 * SIZE(AO) + lda BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + LD b1, 4 * SIZE(BO) + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c05, c05 + SUB a3, c09, c09 + SUB a4, c13, c13 + + SUB b1, c02, c02 + SUB b2, c06, c06 + SUB b3, c10, c10 + SUB b4, c14, c14 + +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 4 * SIZE(AO) + LD b2, 5 * SIZE(AO) + LD b3, 6 * SIZE(AO) + LD b4, 7 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + 
SUB a3, c05, c05 + SUB a4, c06, c06 + + SUB b1, c09, c09 + SUB b2, c10, c10 + SUB b3, c13, c13 + SUB b4, c14, c14 +#endif + +#ifdef LN + LD a1, 3 * SIZE(AO) + LD a2, 2 * SIZE(AO) + LD a3, 0 * SIZE(AO) + + MUL a1, c02, c02 + MUL a1, c06, c06 + MUL a1, c10, c10 + MUL a1, c14, c14 + + MUL a2, c02, t1 + MUL a2, c06, t2 + MUL a2, c10, t3 + MUL a2, c14, t4 + + SUB c01, t1, c01 + SUB c05, t2, c05 + SUB c09, t3, c09 + SUB c13, t4, c13 + + MUL a3, c01, c01 + MUL a3, c05, c05 + MUL a3, c09, c09 + MUL a3, c13, c13 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 3 * SIZE(AO) + + MUL a1, c01, c01 + MUL a1, c05, c05 + MUL a1, c09, c09 + MUL a1, c13, c13 + + MUL a2, c01, t1 + MUL a2, c05, t2 + MUL a2, c09, t3 + MUL a2, c13, t4 + + SUB c02, t1, c02 + SUB c06, t2, c06 + SUB c10, t3, c10 + SUB c14, t4, c14 + + MUL a3, c02, c02 + MUL a3, c06, c06 + MUL a3, c10, c10 + MUL a3, c14, c14 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a1, c01, c01 + MUL a1, c02, c02 + + MUL a2, c01, t1 + MUL a2, c02, t2 + + SUB c05, t1, c05 + SUB c06, t2, c06 + + MUL a3, c01, t1 + MUL a3, c02, t2 + + SUB c09, t1, c09 + SUB c10, t2, c10 + + MUL a4, c01, t1 + MUL a4, c02, t2 + + SUB c13, t1, c13 + SUB c14, t2, c14 + + LD b1, 5 * SIZE(BO) + LD b2, 6 * SIZE(BO) + LD b3, 7 * SIZE(BO) + + MUL b1, c05, c05 + MUL b1, c06, c06 + + MUL b2, c05, t1 + MUL b2, c06, t2 + + SUB c09, t1, c09 + SUB c10, t2, c10 + + MUL b3, c05, t1 + MUL b3, c06, t2 + + SUB c13, t1, c13 + SUB c14, t2, c14 + + LD a1, 10 * SIZE(BO) + LD a2, 11 * SIZE(BO) + LD a3, 15 * SIZE(BO) + + MUL a1, c09, c09 + MUL a1, c10, c10 + + MUL a2, c09, t1 + MUL a2, c10, t2 + + SUB c13, t1, c13 + SUB c14, t2, c14 + + MUL a3, c13, c13 + MUL a3, c14, c14 +#endif + +#ifdef RT + LD a1, 15 * SIZE(BO) + LD a2, 14 * SIZE(BO) + LD a3, 13 * SIZE(BO) + LD a4, 12 * SIZE(BO) + + MUL a1, c13, c13 + MUL a1, c14, c14 + + MUL a2, c13, t1 + MUL a2, c14, t2 + + SUB c09, t1, c09 + SUB c10, t2, c10 + + MUL a3, c13, t1 + MUL a3, c14, t2 + + SUB c05, t1, c05 + SUB c06, t2, c06 + + MUL a4, c13, t1 + MUL a4, c14, t2 + + SUB c01, t1, c01 + SUB c02, t2, c02 + + LD b1, 10 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 8 * SIZE(BO) + + MUL b1, c09, c09 + MUL b1, c10, c10 + + MUL b2, c09, t1 + MUL b2, c10, t2 + + SUB c05, t1, c05 + SUB c06, t2, c06 + + MUL b3, c09, t1 + MUL b3, c10, t2 + + SUB c01, t1, c01 + SUB c02, t2, c02 + + LD a1, 5 * SIZE(BO) + LD a2, 4 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, c05 + MUL a1, c06, c06 + + MUL a2, c05, t1 + MUL a2, c06, t2 + + SUB c01, t1, c01 + SUB c02, t2, c02 + + MUL a3, c01, c01 + MUL a3, c02, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c13, 3 * SIZE(BO) + + ST c02, 4 * SIZE(BO) + ST c06, 5 * SIZE(BO) + ST c10, 6 * SIZE(BO) + ST c14, 7 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c05, 2 * SIZE(AO) + ST c06, 3 * SIZE(AO) + + ST c09, 4 * SIZE(AO) + ST c10, 5 * SIZE(AO) + ST c13, 6 * SIZE(AO) + ST c14, 7 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -2 * SIZE(C1) + lda C2, -2 * SIZE(C2) + lda C3, -2 * SIZE(C3) + lda C4, -2 * SIZE(C4) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c05, 0 * SIZE(C2) + ST c06, 1 * SIZE(C2) + + ST c09, 0 * SIZE(C3) + ST c10, 1 * SIZE(C3) + ST c13, 0 * SIZE(C4) + ST c14, 1 * SIZE(C4) + +#ifndef LN + lda C1, 2 * SIZE(C1) + lda C2, 2 * SIZE(C2) + lda C3, 2 * SIZE(C3) + lda C4, 2 * SIZE(C4) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + 
+#ifdef RT + sll K, 1 + BASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, BASE_SHIFT + 1, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 2, KK +#endif + +#ifdef LN + subq KK, 2, KK +#endif + .align 4 + +$L30: + and M, 1, I + ble I, $L39 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(B) + lda L, -2(KK) + LD b2, 1 * SIZE(B) + lda AO, 1 * SIZE(AO) + + LD b3, 2 * SIZE(B) + fclr c09 + LD b4, 3 * SIZE(B) + fclr c13 + + lda BO, 4 * SIZE(B) + ble KK, $L38 + + ble L, $L35 +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TMP1 + addq AORIG, TMP1, AO + sll KK, BASE_SHIFT + 2, TMP2 + addq B, TMP2, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c01 + LD a2, 1 * SIZE(AO) + fclr c05 + + LD b1, 0 * SIZE(BO) + lda L, -2(TMP1) + LD b2, 1 * SIZE(BO) + lda AO, 1 * SIZE(AO) + + LD b3, 2 * SIZE(BO) + fclr c09 + LD b4, 3 * SIZE(BO) + fclr c13 + + lda BO, 4 * SIZE(BO) + ble TMP1, $L38 + + ble L, $L35 +#endif + .align 4 + +$L32: + ADD c01, t1, c01 + lda L, -2(L) + MUL a1, b1, t1 + LD b1, 0 * SIZE(BO) + + ADD c05, t2, c05 + lda AO, 2 * SIZE(AO) + MUL a1, b2, t2 + LD b2, 1 * SIZE(BO) + + ADD c09, t3, c09 + LD b5, 3 * SIZE(BO) + MUL a1, b3, t3 + LD b3, 2 * SIZE(BO) + + ADD c13, t4, c13 + MUL a1, b4, t4 + LD a1, -1 * SIZE(AO) + + ADD c01, t1, c01 + MUL a2, b1, t1 + LD b1, 4 * SIZE(BO) + lda BO, 8 * SIZE(BO) + + ADD c05, t2, c05 + MUL a2, b2, t2 + LD b2, -3 * SIZE(BO) + + ADD c09, t3, c09 + LD b4, -1 * SIZE(BO) + MUL a2, b3, t3 + LD b3, -2 * SIZE(BO) + + ADD c13, t4, c13 + MUL a2, b5, t4 + LD a2, 0 * SIZE(AO) + bgt L, $L32 + .align 4 + +$L35: + ADD c01, t1, c01 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L37 +#else + blbs TMP1, $L37 +#endif + .align 4 + + ADD c05, t2, c05 + LD b1, 0 * SIZE(BO) + MUL a1, b2, t2 + LD b2, 1 * SIZE(BO) + + ADD c09, t3, c09 + MUL a1, b3, t3 + LD b3, 2 * SIZE(BO) + + ADD c13, t4, c13 + MUL a1, b4, t4 + LD a1, 0 * SIZE(AO) + lda AO, 1 * SIZE(AO) + + ADD c01, t1, c01 + LD b4, 3 * SIZE(BO) + MUL a1, b1, t1 + lda BO, 4 * SIZE(BO) + .align 4 + +$L37: + ADD c05, t2, c05 + MUL a1, b2, t2 + ADD c09, t3, c09 + MUL a1, b3, t3 + + ADD c13, t4, c13 + lda AO, 1 * SIZE(AO) + MUL a1, b4, t4 + lda BO, 4 * SIZE(BO) + + ADD c01, t1, c01 + ADD c05, t2, c05 + ADD c09, t3, c09 + ADD c13, t4, c13 + +$L38: +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 1, TMP1 +#else + subq KK, 4, TMP1 +#endif + sll TMP1, BASE_SHIFT + 0, TMP2 + addq AORIG, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addq B, TMP2, BO +#else + lda AO, -1 * SIZE(AO) + lda BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c05, c05 + SUB a3, c09, c09 + SUB a4, c13, c13 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c05, c05 + SUB a3, c09, c09 + SUB a4, c13, c13 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + + MUL a1, c01, c01 + MUL a1, c05, c05 + MUL a1, c09, c09 + MUL a1, c13, c13 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a1, c01, c01 + MUL a2, c01, t1 + SUB c05, t1, c05 + MUL a3, c01, t1 + SUB c09, t1, c09 + MUL a4, c01, t1 + SUB c13, t1, c13 + + LD b1, 5 * 
SIZE(BO) + LD b2, 6 * SIZE(BO) + LD b3, 7 * SIZE(BO) + + MUL b1, c05, c05 + MUL b2, c05, t1 + SUB c09, t1, c09 + MUL b3, c05, t1 + SUB c13, t1, c13 + + LD a1, 10 * SIZE(BO) + LD a2, 11 * SIZE(BO) + LD a3, 15 * SIZE(BO) + + MUL a1, c09, c09 + MUL a2, c09, t1 + SUB c13, t1, c13 + MUL a3, c13, c13 +#endif + +#ifdef RT + LD a1, 15 * SIZE(BO) + LD a2, 14 * SIZE(BO) + LD a3, 13 * SIZE(BO) + LD a4, 12 * SIZE(BO) + + MUL a1, c13, c13 + MUL a2, c13, t1 + SUB c09, t1, c09 + MUL a3, c13, t1 + SUB c05, t1, c05 + MUL a4, c13, t1 + SUB c01, t1, c01 + + LD b1, 10 * SIZE(BO) + LD b2, 9 * SIZE(BO) + LD b3, 8 * SIZE(BO) + + MUL b1, c09, c09 + MUL b2, c09, t1 + SUB c05, t1, c05 + MUL b3, c09, t1 + SUB c01, t1, c01 + + LD a1, 5 * SIZE(BO) + LD a2, 4 * SIZE(BO) + LD a3, 0 * SIZE(BO) + + MUL a1, c05, c05 + MUL a2, c05, t1 + SUB c01, t1, c01 + MUL a3, c01, c01 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c05, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c13, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c05, 1 * SIZE(AO) + ST c09, 2 * SIZE(AO) + ST c13, 3 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -1 * SIZE(C1) + lda C2, -1 * SIZE(C2) + lda C3, -1 * SIZE(C3) + lda C4, -1 * SIZE(C4) +#endif + + ST c01, 0 * SIZE(C1) + ST c05, 0 * SIZE(C2) + ST c09, 0 * SIZE(C3) + ST c13, 0 * SIZE(C4) + +#ifdef RT + sll K, 0 + BASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, BASE_SHIFT + 0, TMP2 + addq AO, TMP2, AO + sll TMP1, BASE_SHIFT + 2, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 1, KK +#endif + +#ifdef LN + subq KK, 1, KK +#endif + .align 4 + +$L39: +#ifdef LN + sll K, 2 + BASE_SHIFT, TMP1 + addq B, TMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addq KK, 4, KK +#endif + +#ifdef RT + subq KK, 4, KK +#endif + lda J, -1(J) + bgt J, $L01 + .align 4 + +$L999: + ldt $f2, 0($sp) + ldt $f3, 8($sp) + ldt $f4, 16($sp) + ldt $f5, 24($sp) + ldt $f6, 32($sp) + ldt $f7, 40($sp) + ldt $f8, 48($sp) + ldt $f9, 56($sp) + clr $0 + lda $sp, STACKSIZE($sp) + ret + EPILOGUE diff --git a/kernel/alpha/zamax.S b/kernel/alpha/zamax.S new file mode 100644 index 0000000000..01fb4e1181 --- /dev/null +++ b/kernel/alpha/zamax.S @@ -0,0 +1,301 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define N $16 +#define X $17 +#define INCX $18 + +#ifndef USE_MIN +#define CMPLT(a, b) cmptlt a, b +#else +#define CMPLT(a, b) cmptlt b, a +#endif + +#define STACKSIZE 8 * 8 + + PROLOGUE + PROFCODE + .frame $sp, STACKSIZE, $26, 0 + + lda $sp, -STACKSIZE($sp) + + stt $f2, 0($sp) + fclr $f16 + cmplt $31, N, $2 + + stt $f3, 8($sp) + fclr $f17 + cmplt $31, INCX, $3 + unop + + stt $f4, 16($sp) + fclr $f18 + SXADDQ INCX, $31, INCX + unop + + stt $f5, 24($sp) + fclr $f19 + and $2, $3, $0 + unop + + stt $f6, 32($sp) + unop + + stt $f7, 40($sp) + stt $f8, 48($sp) + stt $f9, 56($sp) + + fclr $f0 + beq $0, $End # if (n <= 0) or (incx <= 0) return + .align 4 + + LD $f20, 0 * SIZE(X) + LD $f21, 1 * SIZE(X) + sra N, 2, $1 + addq INCX, INCX, INCX + + fabs $f20, $f20 + fabs $f21, $f21 + addt $f20, $f21, $f0 + ble $1, $L15 + .align 4 + + lda $1, -1($1) + unop + addq X, INCX, X + unop + + LD $f22, 0 * SIZE(X) + fmov $f0, $f1 + LD $f23, 1 * SIZE(X) + addq X, INCX, X + + LD $f24, 0 * SIZE(X) + fmov $f0, $f2 + LD $f25, 1 * SIZE(X) + addq X, INCX, X + + LD $f26, 0 * SIZE(X) + fmov $f0, $f3 + LD $f27, 1 * SIZE(X) + addq X, INCX, X + + fabs $f20, $f8 + fabs $f21, $f9 + fabs $f22, $f10 + fabs $f23, $f11 + + fabs $f24, $f12 + fabs $f25, $f13 + fabs $f26, $f14 + fabs $f27, $f15 + + ble $1, $L14 + .align 4 + + LD $f20, 0 * SIZE(X) + LD $f21, 1 * SIZE(X) + lda $1, -1($1) + addq X, INCX, X + + LD $f22, 0 * SIZE(X) + LD $f23, 1 * SIZE(X) + unop + addq X, INCX, X + + LD $f24, 0 * SIZE(X) + LD $f25, 1 * SIZE(X) + unop + addq X, INCX, X + + LD $f26, 0 * SIZE(X) + LD $f27, 1 * SIZE(X) + addq X, INCX, X + ble $1, $L13 + .align 4 + +$L12: + addt $f8, $f9, $f16 + unop + fabs $f20, $f8 + ldl $31, 64 * SIZE(X) + + addt $f10, $f11, $f17 + unop + fabs $f21, $f9 + LD $f20, 0 * SIZE(X) + + addt $f12, $f13, $f18 + LD $f21, 1 * SIZE(X) + fabs $f22, $f10 + addq X, INCX, X + + addt $f14, $f15, $f19 + LD $f22, 0 * SIZE(X) + fabs $f23, $f11 + unop + + CMPLT($f0, $f16), $f4 + LD $f23, 1 * SIZE(X) + fabs $f24, $f12 + addq X, INCX, X + + CMPLT($f1, $f17), $f5 + LD $f24, 0 * SIZE(X) + fabs $f25, $f13 + unop + + CMPLT($f2, $f18), $f6 + LD $f25, 1 * SIZE(X) + fabs $f26, $f14 + addq X, INCX, X + + CMPLT($f3, $f19), $f7 + LD $f26, 0 * SIZE(X) + fabs $f27, $f15 + unop + + fcmovne $f4, $f16, $f0 + LD $f27, 1 * SIZE(X) + addq X, INCX, X + lda $1, -1($1) # i -- + + fcmovne $f5, $f17, $f1 + fcmovne $f6, $f18, $f2 + fcmovne $f7, $f19, $f3 + bgt $1,$L12 + .align 4 + +$L13: + addt $f8, $f9, $f16 + fabs $f20, $f8 + + addt $f10, $f11, $f17 + fabs $f21, $f9 + + addt $f12, $f13, 
$f18 + fabs $f22, $f10 + + addt $f14, $f15, $f19 + fabs $f23, $f11 + + CMPLT($f0, $f16), $f4 + fabs $f24, $f12 + + CMPLT($f1, $f17), $f5 + fabs $f25, $f13 + + CMPLT($f2, $f18), $f6 + fabs $f26, $f14 + CMPLT($f3, $f19), $f7 + fabs $f27, $f15 + + fcmovne $f4, $f16, $f0 + fcmovne $f5, $f17, $f1 + fcmovne $f6, $f18, $f2 + fcmovne $f7, $f19, $f3 + .align 4 + +$L14: + addt $f8, $f9, $f16 + addt $f10, $f11, $f17 + addt $f12, $f13, $f18 + addt $f14, $f15, $f19 + + CMPLT($f0, $f16), $f4 + CMPLT($f1, $f17), $f5 + CMPLT($f2, $f18), $f6 + CMPLT($f3, $f19), $f7 + + fcmovne $f4, $f16, $f0 + fcmovne $f5, $f17, $f1 + fcmovne $f6, $f18, $f2 + fcmovne $f7, $f19, $f3 + + CMPLT($f0, $f1), $f16 + CMPLT($f2, $f3), $f17 + + fcmovne $f16, $f1, $f0 + fcmovne $f17, $f3, $f2 + + CMPLT($f0, $f2), $f16 + fcmovne $f16, $f2, $f0 + .align 4 + +$L15: + and N, 3, $1 + unop + unop + ble $1, $End + .align 4 + +$L16: + LD $f20, 0 * SIZE(X) + LD $f21, 1 * SIZE(X) + unop + addq X, INCX, X + + fabs $f20, $f29 + fabs $f21, $f30 + addt $f29, $f30, $f29 + + CMPLT($f0, $f29), $f16 + fcmovne $f16, $f29, $f0 + + lda $1, -1($1) # i -- + bgt $1, $L16 + .align 4 + +$End: + ldt $f2, 0($sp) + ldt $f3, 8($sp) + ldt $f4, 16($sp) + ldt $f5, 24($sp) + + ldt $f6, 32($sp) + ldt $f7, 40($sp) + ldt $f8, 48($sp) + ldt $f9, 56($sp) + lda $sp, STACKSIZE($sp) + ret + + EPILOGUE diff --git a/kernel/alpha/zasum.S b/kernel/alpha/zasum.S new file mode 100644 index 0000000000..67ed785846 --- /dev/null +++ b/kernel/alpha/zasum.S @@ -0,0 +1,208 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define PREFETCHSIZE 88 + +#define N $16 +#define X $17 +#define INCX $18 +#define I $19 + +#define s0 $f0 +#define s1 $f1 +#define s2 $f10 +#define s3 $f11 + +#define a0 $f12 +#define a1 $f13 +#define a2 $f14 +#define a3 $f15 +#define a4 $f16 +#define a5 $f17 +#define a6 $f18 +#define a7 $f19 + +#define t0 $f20 +#define t1 $f21 +#define t2 $f22 +#define t3 $f23 + + PROLOGUE + PROFCODE + + fclr s0 + unop + fclr t0 + addq INCX, INCX, INCX + + fclr s1 + unop + fclr t1 + ble N, $L999 + + fclr s2 + sra N, 2, I + fclr s3 + ble I, $L15 + + LD a0, 0 * SIZE(X) + fclr t2 + LD a1, 1 * SIZE(X) + SXADDQ INCX, X, X + + LD a2, 0 * SIZE(X) + fclr t3 + LD a3, 1 * SIZE(X) + SXADDQ INCX, X, X + + LD a4, 0 * SIZE(X) + LD a5, 1 * SIZE(X) + SXADDQ INCX, X, X + lda I, -1(I) + + ble I, $L13 + .align 4 + +$L12: + ADD s0, t0, s0 + ldl $31, PREFETCHSIZE * SIZE(X) + fabs a0, t0 + lda I, -1(I) + + ADD s1, t1, s1 + LD a6, 0 * SIZE(X) + fabs a1, t1 + unop + + ADD s2, t2, s2 + LD a7, 1 * SIZE(X) + fabs a2, t2 + SXADDQ INCX, X, X + + ADD s3, t3, s3 + LD a0, 0 * SIZE(X) + fabs a3, t3 + unop + + ADD s0, t0, s0 + LD a1, 1 * SIZE(X) + fabs a4, t0 + SXADDQ INCX, X, X + + ADD s1, t1, s1 + LD a2, 0 * SIZE(X) + fabs a5, t1 + unop + + ADD s2, t2, s2 + LD a3, 1 * SIZE(X) + fabs a6, t2 + SXADDQ INCX, X, X + + ADD s3, t3, s3 + LD a4, 0 * SIZE(X) + fabs a7, t3 + unop + + LD a5, 1 * SIZE(X) + unop + SXADDQ INCX, X, X + bne I, $L12 + .align 4 + +$L13: + ADD s0, t0, s0 + LD a6, 0 * SIZE(X) + fabs a0, t0 + + ADD s1, t1, s1 + LD a7, 1 * SIZE(X) + fabs a1, t1 + SXADDQ INCX, X, X + + ADD s2, t2, s2 + fabs a2, t2 + ADD s3, t3, s3 + fabs a3, t3 + + ADD s0, t0, s0 + fabs a4, t0 + ADD s1, t1, s1 + fabs a5, t1 + ADD s2, t2, s2 + fabs a6, t2 + ADD s3, t3, s3 + fabs a7, t3 + + ADD s2, t2, s2 + ADD s3, t3, s3 + + .align 4 + +$L15: + ADD s0, s2, s0 + and N, 3, I + ADD s1, s3, s1 + ble I, $L999 + .align 4 + +$L17: + ADD s0, t0, s0 + LD a0, 0 * SIZE(X) + fabs a0, t0 + lda I, -1(I) + + ADD s1, t1, s1 + LD a1, 1 * SIZE(X) + fabs a1, t1 + SXADDQ INCX, X, X + + bne I, $L17 + .align 4 + +$L999: + ADD s0, t0, s0 + ADD s1, t1, s1 + + ADD s0, s1, s0 + ret + EPILOGUE diff --git a/kernel/alpha/zaxpy.S b/kernel/alpha/zaxpy.S new file mode 100644 index 0000000000..a6f3c1d2fe --- /dev/null +++ b/kernel/alpha/zaxpy.S @@ -0,0 +1,611 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define PREFETCHSIZE 40 + +#ifndef CONJ +#define ADD1 SUB +#define ADD2 ADD +#else +#define ADD1 ADD +#define ADD2 SUB +#endif + + + PROLOGUE + PROFCODE + .frame $sp, 16, $26, 0 + + ldl $19, 0($sp) + fmov $f19, $f29 + ldq $20, 8($sp) + fmov $f20, $f30 + + mov $21, $18 + ldl $21, 16($sp) + lda $sp, -64($sp) + nop + + stt $f2, 0($sp) + cmpeq $19, 1, $1 + stt $f3, 8($sp) + cmpeq $21, 1, $2 + + stt $f4, 16($sp) + and $16, 3, $5 + stt $f5, 24($sp) + stt $f6, 32($sp) + + stt $f7, 40($sp) + stt $f8, 48($sp) +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + and $1, $2, $1 + ble $16, $End + sra $16, 2, $4 + beq $1, $Sub + + ble $4, $Remain + subq $4, 1, $4 + + LD $f0, 0*SIZE($18) + LD $f1, 1*SIZE($18) + LD $f2, 2*SIZE($18) + LD $f3, 3*SIZE($18) + LD $f4, 4*SIZE($18) + LD $f5, 5*SIZE($18) + LD $f6, 6*SIZE($18) + LD $f7, 7*SIZE($18) + + LD $f8, 0*SIZE($20) + LD $f28, 1*SIZE($20) + LD $f10, 2*SIZE($20) + LD $f11, 3*SIZE($20) + LD $f12, 4*SIZE($20) + LD $f13, 5*SIZE($20) + LD $f14, 6*SIZE($20) + LD $f15, 7*SIZE($20) + + addq $18, 8*SIZE, $18 + ble $4, $MainLoopEnd + .align 4 + +$MainLoop: + ldt $f31, PREFETCHSIZE * SIZE($20) + ldl $31, PREFETCHSIZE * SIZE($18) + + MUL $f29, $f0, $f20 + LD $f31, 9*SIZE($18) + MUL $f30, $f1, $f21 + unop + + MUL $f30, $f0, $f22 + LD $f0, 0*SIZE($18) + MUL $f29, $f1, $f23 + LD $f1, 1*SIZE($18) + + MUL $f29, $f2, $f24 + unop + MUL $f30, $f3, $f25 + nop + + MUL $f30, $f2, $f26 + LD $f2, 2*SIZE($18) + MUL $f29, $f3, $f27 + LD $f3, 3*SIZE($18) + + ADD1 $f20, $f21, $f16 + MUL $f29, $f4, $f20 + ADD2 $f22, $f23, $f17 + MUL $f30, $f5, $f21 + + ADD1 $f24, $f25, $f18 + unop + MUL $f30, $f4, $f22 + LD $f4, 4*SIZE($18) + + ADD2 $f26, $f27, $f19 + addq $20, 8*SIZE, $20 + MUL $f29, $f5, $f23 + LD $f5, 5*SIZE($18) + + ADD $f16, $f8, $f16 + LD $f8, 0*SIZE($20) + MUL $f29, $f6, $f24 + unop + + ADD $f17, $f28, $f17 + LD $f28, 1*SIZE($20) + MUL $f30, $f7, $f25 + unop + + ADD $f18, $f10, $f18 + LD $f10, 2*SIZE($20) + MUL $f30, $f6, $f26 + LD $f6, 6*SIZE($18) + + ADD $f19, $f11, $f19 + LD $f11, 3*SIZE($20) + MUL $f29, $f7, $f27 + LD $f7, 7*SIZE($18) + + ST $f16,-8*SIZE($20) + ADD1 $f20, $f21, $f16 + ST $f17,-7*SIZE($20) + ADD2 $f22, $f23, $f17 + + ST $f18,-6*SIZE($20) + ADD1 $f24, $f25, $f18 + ST $f19,-5*SIZE($20) + ADD2 $f26, $f27, $f19 + + ADD $f16, $f12, $f16 + LD $f12, 4*SIZE($20) + ADD $f17, $f13, $f17 + LD $f13, 5*SIZE($20) + ADD $f18, $f14, $f18 + LD $f14, 6*SIZE($20) + ADD $f19, $f15, $f19 + LD $f15, 7*SIZE($20) + + ST $f16,-4*SIZE($20) + addq $18, 8*SIZE, $18 + ST $f17,-3*SIZE($20) + subq $4, 
1, $4 + + ST $f18,-2*SIZE($20) + nop + ST $f19,-1*SIZE($20) + bgt $4, $MainLoop + .align 4 + +$MainLoopEnd: + MUL $f29, $f0, $f20 + MUL $f30, $f1, $f21 + MUL $f30, $f0, $f22 + MUL $f29, $f1, $f23 + + MUL $f29, $f2, $f24 + MUL $f30, $f3, $f25 + MUL $f30, $f2, $f26 + MUL $f29, $f3, $f27 + + ADD1 $f20, $f21, $f16 + MUL $f29, $f4, $f20 + ADD2 $f22, $f23, $f17 + MUL $f30, $f5, $f21 + + ADD1 $f24, $f25, $f18 + MUL $f30, $f4, $f22 + ADD2 $f26, $f27, $f19 + MUL $f29, $f5, $f23 + + ADD $f16, $f8, $f16 + MUL $f29, $f6, $f24 + ADD $f17, $f28, $f17 + MUL $f30, $f7, $f25 + + ADD $f18, $f10, $f18 + MUL $f30, $f6, $f26 + ADD $f19, $f11, $f19 + MUL $f29, $f7, $f27 + + ST $f16, 0*SIZE($20) + ADD1 $f20, $f21, $f16 + ST $f17, 1*SIZE($20) + ADD2 $f22, $f23, $f17 + + ST $f18, 2*SIZE($20) + ADD1 $f24, $f25, $f18 + ST $f19, 3*SIZE($20) + ADD2 $f26, $f27, $f19 + + ADD $f16, $f12, $f16 + ADD $f17, $f13, $f17 + ADD $f18, $f14, $f18 + ADD $f19, $f15, $f19 + + ST $f16, 4*SIZE($20) + ST $f17, 5*SIZE($20) + ST $f18, 6*SIZE($20) + ST $f19, 7*SIZE($20) + + unop + addq $20, 8*SIZE, $20 + unop + ble $5, $End + .align 4 + +$Remain: + subq $5, 1, $6 + ble $5, $End + LD $f0, 0*SIZE($18) + LD $f1, 1*SIZE($18) + + LD $f8, 0*SIZE($20) + LD $f28, 1*SIZE($20) + addq $18, 2*SIZE, $18 + ble $6, $RemainLoopEnd + .align 4 + +$RemainLoop: + MUL $f29, $f0, $f20 + subq $6, 1, $6 + MUL $f30, $f1, $f21 + addq $20, 2*SIZE, $20 + + MUL $f30, $f0, $f22 + LD $f0, 0*SIZE($18) + MUL $f29, $f1, $f23 + LD $f1, 1*SIZE($18) + + ADD1 $f20, $f21, $f16 + ADD2 $f22, $f23, $f17 + ADD $f16, $f8, $f16 + LD $f8, 0*SIZE($20) + ADD $f17, $f28, $f17 + LD $f28, 1*SIZE($20) + + ST $f16,-2*SIZE($20) + addq $18, 2*SIZE, $18 + ST $f17,-1*SIZE($20) + bgt $6, $RemainLoop + .align 4 + +$RemainLoopEnd: + MUL $f29, $f0, $f20 + MUL $f30, $f1, $f21 + MUL $f30, $f0, $f22 + MUL $f29, $f1, $f23 + + ADD1 $f20, $f21, $f16 + ADD2 $f22, $f23, $f17 + ADD $f16, $f8, $f16 + ADD $f17, $f28, $f17 + + ST $f16, 0*SIZE($20) + nop + ST $f17, 1*SIZE($20) + nop + .align 4 + +$End: + ldt $f2, 0($sp) + ldt $f3, 8($sp) + ldt $f4, 16($sp) + ldt $f5, 24($sp) + ldt $f6, 32($sp) + ldt $f7, 40($sp) + ldt $f8, 48($sp) + lda $sp, 64($sp) + ret + .align 4 + +$Sub: + SXSUBL $16, SIZE, $22 + addq $22, $22, $22 # Complex + .align 4 + + addq $19, $19, $19 # Complex + addq $21, $21, $21 # Complex + + ble $4, $SubRemain + LD $f0, 0*SIZE($18) + LD $f1, 1*SIZE($18) + SXADDQ $19, $18, $18 + + LD $f2, 0*SIZE($18) + LD $f3, 1*SIZE($18) + SXADDQ $19, $18, $18 + + LD $f4, 0*SIZE($18) + LD $f5, 1*SIZE($18) + SXADDQ $19, $18, $18 + + LD $f6, 0*SIZE($18) + LD $f7, 1*SIZE($18) + SXADDQ $19, $18, $18 + + LD $f8, 0*SIZE($20) + LD $f28, 1*SIZE($20) + SXADDQ $21, $20, $24 + + LD $f10, 0*SIZE($24) + LD $f11, 1*SIZE($24) + SXADDQ $21, $24, $24 + + LD $f12, 0*SIZE($24) + LD $f13, 1*SIZE($24) + SXADDQ $21, $24, $24 + + LD $f14, 0*SIZE($24) + LD $f15, 1*SIZE($24) + SXADDQ $21, $24, $24 + + subq $4, 1, $4 + ble $4, $SubMainLoopEnd + .align 4 + +$SubMainLoop: + MUL $f29, $f0, $f20 + unop + MUL $f30, $f1, $f21 + unop + + MUL $f30, $f0, $f22 + LD $f0, 0*SIZE($18) + MUL $f29, $f1, $f23 + LD $f1, 1*SIZE($18) + + MUL $f29, $f2, $f24 + SXADDQ $19, $18, $18 + MUL $f30, $f3, $f25 + unop + + MUL $f30, $f2, $f26 + LD $f2, 0*SIZE($18) + MUL $f29, $f3, $f27 + LD $f3, 1*SIZE($18) + + ADD1 $f20, $f21, $f16 + SXADDQ $19, $18, $18 + MUL $f29, $f4, $f20 + unop + + ADD2 $f22, $f23, $f17 + unop + MUL $f30, $f5, $f21 + unop + + ADD1 $f24, $f25, $f18 + unop + MUL $f30, $f4, $f22 + LD $f4, 0*SIZE($18) + + ADD2 $f26, $f27, $f19 + unop + MUL $f29, 
$f5, $f23 + LD $f5, 1*SIZE($18) + + ADD $f16, $f8, $f16 + LD $f8, 0*SIZE($24) + MUL $f29, $f6, $f24 + SXADDQ $19, $18, $18 + + ADD $f17, $f28, $f17 + LD $f28, 1*SIZE($24) + MUL $f30, $f7, $f25 + SXADDQ $21, $24, $24 + + ADD $f18, $f10, $f18 + LD $f10, 0*SIZE($24) + MUL $f30, $f6, $f26 + LD $f6, 0*SIZE($18) + + ADD $f19, $f11, $f19 + LD $f11, 1*SIZE($24) + MUL $f29, $f7, $f27 + LD $f7, 1*SIZE($18) + + ST $f16, 0*SIZE($20) + SXADDQ $19, $18, $18 + ADD1 $f20, $f21, $f16 + unop + + ST $f17, 1*SIZE($20) + SXADDQ $21, $20, $20 + ADD2 $f22, $f23, $f17 + unop + + ST $f18, 0*SIZE($20) + SXADDQ $21, $24, $24 + ADD1 $f24, $f25, $f18 + unop + + ST $f19, 1*SIZE($20) + unop + ADD2 $f26, $f27, $f19 + SXADDQ $21, $20, $20 + + ADD $f16, $f12, $f16 + unop + LD $f12, 0*SIZE($24) + unop + + ADD $f17, $f13, $f17 + unop + LD $f13, 1*SIZE($24) + SXADDQ $21, $24, $24 + + ADD $f18, $f14, $f18 + subq $4, 1, $4 + LD $f14, 0*SIZE($24) + unop + + ADD $f19, $f15, $f19 + unop + LD $f15, 1*SIZE($24) + SXADDQ $21, $24, $24 + + ST $f16, 0*SIZE($20) + ST $f17, 1*SIZE($20) + SXADDQ $21, $20, $20 + unop + + ST $f18, 0*SIZE($20) + ST $f19, 1*SIZE($20) + SXADDQ $21, $20, $20 + bgt $4, $SubMainLoop + .align 4 + +$SubMainLoopEnd: + MUL $f29, $f0, $f20 + MUL $f30, $f1, $f21 + MUL $f30, $f0, $f22 + MUL $f29, $f1, $f23 + + MUL $f29, $f2, $f24 + MUL $f30, $f3, $f25 + MUL $f30, $f2, $f26 + MUL $f29, $f3, $f27 + + ADD1 $f20, $f21, $f16 + MUL $f29, $f4, $f20 + ADD2 $f22, $f23, $f17 + MUL $f30, $f5, $f21 + + ADD1 $f24, $f25, $f18 + MUL $f30, $f4, $f22 + ADD2 $f26, $f27, $f19 + MUL $f29, $f5, $f23 + + ADD $f16, $f8, $f16 + MUL $f29, $f6, $f24 + ADD $f17, $f28, $f17 + MUL $f30, $f7, $f25 + + ADD $f18, $f10, $f18 + MUL $f30, $f6, $f26 + ADD $f19, $f11, $f19 + MUL $f29, $f7, $f27 + + ST $f16, 0*SIZE($20) + ADD1 $f20, $f21, $f16 + ST $f17, 1*SIZE($20) + ADD2 $f22, $f23, $f17 + + SXADDQ $21, $20, $20 + nop + ST $f18, 0*SIZE($20) + ADD1 $f24, $f25, $f18 + + ST $f19, 1*SIZE($20) + ADD2 $f26, $f27, $f19 + SXADDQ $21, $20, $20 + ADD $f16, $f12, $f16 + + ADD $f17, $f13, $f17 + ADD $f18, $f14, $f18 + ADD $f19, $f15, $f19 + + ST $f16, 0*SIZE($20) + ST $f17, 1*SIZE($20) + SXADDQ $21, $20, $20 + + ST $f18, 0*SIZE($20) + ST $f19, 1*SIZE($20) + SXADDQ $21, $20, $20 + ble $5, $SubEnd + .align 4 + +$SubRemain: + subq $5, 1, $6 + ble $5, $SubEnd + LD $f0, 0*SIZE($18) + LD $f1, 1*SIZE($18) + + LD $f8, 0*SIZE($20) + LD $f28, 1*SIZE($20) + SXADDQ $19, $18, $18 + SXADDQ $21, $20, $24 + ble $6, $SubRemainLoopEnd + .align 4 + +$SubRemainLoop: + MUL $f29, $f0, $f20 + MUL $f30, $f1, $f21 + MUL $f30, $f0, $f22 + LD $f0, 0*SIZE($18) + + MUL $f29, $f1, $f23 + LD $f1, 1*SIZE($18) + ADD1 $f20, $f21, $f16 + SXADDQ $19, $18, $18 + + ADD2 $f22, $f23, $f17 + nop + ADD $f16, $f8, $f16 + LD $f8, 0*SIZE($24) + + ADD $f17, $f28, $f17 + LD $f28, 1*SIZE($24) + SXADDQ $21, $24, $24 + subq $6, 1, $6 + + ST $f16, 0*SIZE($20) + ST $f17, 1*SIZE($20) + SXADDQ $21, $20, $20 + bgt $6, $SubRemainLoop + .align 4 + +$SubRemainLoopEnd: + MUL $f29, $f0, $f20 + MUL $f30, $f1, $f21 + MUL $f30, $f0, $f22 + MUL $f29, $f1, $f23 + + ADD1 $f20, $f21, $f16 + ADD2 $f22, $f23, $f17 + ADD $f16, $f8, $f16 + ADD $f17, $f28, $f17 + + ST $f16, 0*SIZE($20) + nop + ST $f17, 1*SIZE($20) + nop + .align 4 + +$SubEnd: + ldt $f2, 0($sp) + ldt $f3, 8($sp) + ldt $f4, 16($sp) + ldt $f5, 24($sp) + ldt $f6, 32($sp) + ldt $f7, 40($sp) + ldt $f8, 48($sp) + lda $sp, 64($sp) + ret + EPILOGUE diff --git a/kernel/alpha/zdot.S b/kernel/alpha/zdot.S new file mode 100644 index 0000000000..78dcae6681 --- /dev/null +++ 
b/kernel/alpha/zdot.S @@ -0,0 +1,500 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define PREFETCHSIZE 88 + +#define N $16 +#define X $17 +#define INCX $18 +#define Y $19 +#define INCY $20 +#define XX $21 +#define YY $23 + +#define I $5 + +#define s0 $f0 +#define s1 $f1 +#define s2 $f2 +#define s3 $f30 + +#define a0 $f10 +#define a1 $f11 +#define a2 $f12 +#define a3 $f13 +#define a4 $f14 +#define a5 $f15 +#define a6 $f16 +#define a7 $f17 + +#define b0 $f18 +#define b1 $f19 +#define b2 $f20 +#define b3 $f21 +#define b4 $f22 +#define b5 $f23 +#define b6 $f24 +#define b7 $f25 + +#define t0 $f26 +#define t1 $f27 +#define t2 $f28 +#define t3 $f29 + + PROLOGUE + PROFCODE + .frame $sp, 16, $26, 0 + + lda $sp, -16($sp) + fclr s0 + stt $f2, 0($sp) + fclr s1 + + fclr s2 + addq INCX, INCX, INCX + fclr s3 + ble N, $L999 + + addq INCY, INCY, INCY + fclr t0 + fclr t1 + fclr t2 + fclr t3 + + srl N, 3, I + ble I, $L25 + + LD a0, 0 * SIZE(X) + LD a1, 1 * SIZE(X) + LD b0, 0 * SIZE(Y) + LD b1, 1 * SIZE(Y) + + SXADDQ INCX, X, X + SXADDQ INCY, Y, Y + + LD a2, 0 * SIZE(X) + LD a3, 1 * SIZE(X) + LD b2, 0 * SIZE(Y) + LD b3, 1 * SIZE(Y) + + SXADDQ INCX, X, X + SXADDQ INCY, Y, Y + + LD a4, 0 * SIZE(X) + LD a5, 1 * SIZE(X) + LD b4, 0 * SIZE(Y) + LD b5, 1 * SIZE(Y) + + SXADDQ INCX, X, X + SXADDQ INCY, Y, Y + + LD a6, 0 * SIZE(X) + LD b6, 0 * SIZE(Y) + + subq I, 1, I + ble I, $L23 + .align 4 + +$L22: + ADD s0, t0, s0 + LD a7, 1 * SIZE(X) + MUL a0, b0, t0 + LD b7, 1 * SIZE(Y) + + ADD s1, t1, s1 + ldl $31, PREFETCHSIZE * SIZE(X) + MUL a0, b1, t1 + SXADDQ INCX, X, X + + ADD s2, t2, s2 + ldl $31, PREFETCHSIZE * SIZE(Y) + MUL a1, b0, t2 + SXADDQ INCY, Y, Y + + ADD s3, t3, s3 + LD a0, 0 * SIZE(X) + MUL a1, b1, t3 + LD a1, 1 * SIZE(X) + + ADD s0, t0, s0 + LD b0, 0 * SIZE(Y) + MUL a2, b2, t0 + LD b1, 1 * SIZE(Y) + + ADD s1, t1, s1 + SXADDQ INCX, X, X + MUL a2, b3, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s2 + unop + MUL a3, b2, t2 + unop + + ADD s3, t3, s3 + LD a2, 0 * SIZE(X) + MUL a3, b3, t3 + LD a3, 1 * SIZE(X) + + ADD s0, t0, s0 + LD b2, 0 * SIZE(Y) + MUL a4, b4, t0 + LD b3, 1 * SIZE(Y) + + ADD s1, t1, s1 + SXADDQ INCX, X, X + MUL a4, b5, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s2 + unop + MUL a5, b4, t2 + unop + + ADD s3, t3, s3 + LD a4, 0 * SIZE(X) + MUL a5, b5, t3 + LD a5, 1 * SIZE(X) + + ADD s0, t0, s0 + LD b4, 0 * SIZE(Y) + MUL a6, b6, t0 + LD b5, 1 * SIZE(Y) + + ADD s1, t1, s1 + SXADDQ INCX, X, X + MUL a6, b7, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s2 + unop + MUL a7, b6, t2 + unop + + ADD s3, t3, s3 + LD a6, 0 * SIZE(X) + MUL a7, b7, t3 + LD a7, 1 * SIZE(X) + + ADD s0, t0, s0 + LD b6, 0 * SIZE(Y) + MUL a0, b0, t0 + LD b7, 1 * SIZE(Y) + + ADD s1, t1, s1 + SXADDQ INCX, X, X + MUL a0, b1, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s2 + unop + MUL a1, b0, t2 + unop + + ADD s3, t3, s3 + LD a0, 0 * SIZE(X) + MUL a1, b1, t3 + LD a1, 1 * SIZE(X) + + ADD s0, t0, s0 + LD b0, 0 * SIZE(Y) + MUL a2, b2, t0 + LD b1, 1 * SIZE(Y) + + ADD s1, t1, s1 + SXADDQ INCX, X, X + MUL a2, b3, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s2 + unop + MUL a3, b2, t2 + unop + + ADD s3, t3, s3 + LD a2, 0 * SIZE(X) + MUL a3, b3, t3 + LD a3, 1 * SIZE(X) + + ADD s0, t0, s0 + LD b2, 0 * SIZE(Y) + MUL a4, b4, t0 + LD b3, 1 * SIZE(Y) + + ADD s1, t1, s1 + SXADDQ INCX, X, X + MUL a4, b5, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s2 + unop + MUL a5, b4, t2 + subq I, 1, I + + ADD s3, t3, s3 + LD a4, 0 * SIZE(X) + MUL a5, b5, t3 + LD a5, 1 * SIZE(X) + + ADD s0, t0, s0 + LD b4, 0 * SIZE(Y) + MUL a6, b6, t0 + LD 
b5, 1 * SIZE(Y) + + ADD s1, t1, s1 + SXADDQ INCX, X, X + MUL a6, b7, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s2 + LD a6, 0 * SIZE(X) + MUL a7, b6, t2 + unop + + ADD s3, t3, s3 + LD b6, 0 * SIZE(Y) + MUL a7, b7, t3 + bgt I, $L22 + .align 4 + +$L23: + ADD s0, t0, s0 + LD a7, 1 * SIZE(X) + MUL a0, b0, t0 + LD b7, 1 * SIZE(Y) + + ADD s1, t1, s1 + SXADDQ INCX, X, X + MUL a0, b1, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s2 + unop + MUL a1, b0, t2 + unop + + ADD s3, t3, s3 + LD a0, 0 * SIZE(X) + MUL a1, b1, t3 + LD a1, 1 * SIZE(X) + + ADD s0, t0, s0 + LD b0, 0 * SIZE(Y) + MUL a2, b2, t0 + LD b1, 1 * SIZE(Y) + + ADD s1, t1, s1 + SXADDQ INCX, X, X + MUL a2, b3, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s2 + unop + MUL a3, b2, t2 + unop + + ADD s3, t3, s3 + LD a2, 0 * SIZE(X) + MUL a3, b3, t3 + LD a3, 1 * SIZE(X) + + ADD s0, t0, s0 + LD b2, 0 * SIZE(Y) + MUL a4, b4, t0 + LD b3, 1 * SIZE(Y) + + ADD s1, t1, s1 + SXADDQ INCX, X, X + MUL a4, b5, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s2 + unop + MUL a5, b4, t2 + unop + + ADD s3, t3, s3 + LD a4, 0 * SIZE(X) + MUL a5, b5, t3 + LD a5, 1 * SIZE(X) + + ADD s0, t0, s0 + LD b4, 0 * SIZE(Y) + MUL a6, b6, t0 + LD b5, 1 * SIZE(Y) + + ADD s1, t1, s1 + SXADDQ INCX, X, X + MUL a6, b7, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s2 + unop + MUL a7, b6, t2 + unop + + ADD s3, t3, s3 + LD a6, 0 * SIZE(X) + MUL a7, b7, t3 + LD a7, 1 * SIZE(X) + + ADD s0, t0, s0 + LD b6, 0 * SIZE(Y) + MUL a0, b0, t0 + LD b7, 1 * SIZE(Y) + + ADD s1, t1, s1 + SXADDQ INCX, X, X + MUL a0, b1, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s2 + MUL a1, b0, t2 + ADD s3, t3, s3 + MUL a1, b1, t3 + + ADD s0, t0, s0 + MUL a2, b2, t0 + ADD s1, t1, s1 + MUL a2, b3, t1 + + ADD s2, t2, s2 + MUL a3, b2, t2 + ADD s3, t3, s3 + MUL a3, b3, t3 + + ADD s0, t0, s0 + MUL a4, b4, t0 + ADD s1, t1, s1 + MUL a4, b5, t1 + + ADD s2, t2, s2 + MUL a5, b4, t2 + ADD s3, t3, s3 + MUL a5, b5, t3 + + ADD s0, t0, s0 + MUL a6, b6, t0 + ADD s1, t1, s1 + MUL a6, b7, t1 + + ADD s2, t2, s2 + MUL a7, b6, t2 + ADD s3, t3, s3 + MUL a7, b7, t3 + .align 4 + +$L25: + and N, 7, I + unop + unop + ble I, $L998 + + LD a0, 0 * SIZE(X) + LD a1, 1 * SIZE(X) + LD b0, 0 * SIZE(Y) + LD b1, 1 * SIZE(Y) + + SXADDQ INCX, X, X + subq I, 1, I + SXADDQ INCY, Y, Y + ble I, $L28 + .align 4 + +$L26: + ADD s0, t0, s0 + mov X, XX + MUL a0, b0, t0 + mov Y, YY + + ADD s1, t1, s1 + SXADDQ INCX, X, X + MUL a0, b1, t1 + SXADDQ INCY, Y, Y + + ADD s2, t2, s2 + LD a0, 0 * SIZE(XX) + MUL a1, b0, t2 + LD b0, 0 * SIZE(YY) + + ADD s3, t3, s3 + subq I, 1, I + MUL a1, b1, t3 + LD a1, 1 * SIZE(XX) + + LD b1, 1 * SIZE(YY) + bgt I, $L26 + .align 4 + +$L28: + ADD s0, t0, s0 + MUL a0, b0, t0 + ADD s1, t1, s1 + MUL a0, b1, t1 + + ADD s2, t2, s2 + MUL a1, b0, t2 + ADD s3, t3, s3 + MUL a1, b1, t3 + .align 4 + +$L998: + ADD s0, t0, s0 + ADD s1, t1, s1 + ADD s2, t2, s2 + ADD s3, t3, s3 + +#ifndef CONJ + SUB s0, s3, s0 + ADD s1, s2, s1 +#else + ADD s0, s3, s0 + SUB s1, s2, s1 +#endif + .align 4 + +$L999: + ldt $f2, 0($sp) + lda $sp, 16($sp) + ret + + EPILOGUE diff --git a/kernel/alpha/zgemm_beta.S b/kernel/alpha/zgemm_beta.S new file mode 100644 index 0000000000..f7ca347f13 --- /dev/null +++ b/kernel/alpha/zgemm_beta.S @@ -0,0 +1,192 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + + .set noat + .set noreorder +.text + .align 5 + .globl CNAME + .ent CNAME +CNAME: + .frame $sp, 0, $26, 0 + +#ifdef PROFILE + ldgp $gp, 0($27) + lda $28, _mcount + jsr $28, ($28), _mcount + .prologue 1 +#else + .prologue 0 +#endif + + ldq $18, 24($sp) + ble $16, $End + ldl $19, 32($sp) + ble $17, $End + + addq $19, $19, $19 + fbne $f19,$Main + fbne $f20,$Main + .align 4 + +$L13: + mov $18, $1 + lda $17, -1($17) + SXADDQ $19, $18, $18 + mov $16, $2 + .align 4 + +$L12: + ST $f31, 0*SIZE($1) + ST $f31, 1*SIZE($1) + lda $2, -1($2) + lda $1, 2*SIZE($1) + bgt $2, $L12 + bgt $17,$L13 + clr $0 + ret + .align 4 + +/* Main Routine */ +$Main: + sra $16, 1, $2 # $2 = (m >> 1) + mov $18, $1 # c_offset = c + lda $17, -1($17) # n -- + SXADDQ $19, $18, $18 # c += ldc + beq $2, $L18 + + LD $f14, 0*SIZE($1) + LD $f15, 1*SIZE($1) + LD $f24, 2*SIZE($1) + LD $f25, 3*SIZE($1) + lda $2, -1($2) # $2 -- + ble $2, $L19 + .align 4 + + +$L23: + MUL $f19, $f14, $f10 + lds $f31, 9*SIZE($1) + MUL $f20, $f15, $f11 + lda $2, -1($2) + + MUL $f19, $f15, $f12 + LD $f15, 5*SIZE($1) + MUL $f20, $f14, $f13 + LD $f14, 4*SIZE($1) + + MUL $f19, $f24, $f16 + unop + MUL $f20, $f25, $f17 + unop + + MUL $f19, $f25, $f18 + LD $f25, 7*SIZE($1) + SUB $f10, $f11, $f22 + unop + + MUL $f20, $f24, $f21 + LD $f24, 6*SIZE($1) + ADD $f12, $f13, $f23 + lda $1, 4*SIZE($1) + + SUB $f16, $f17, $f26 + ADD $f18, $f21, $f27 + ST $f22,-4*SIZE($1) + ST $f23,-3*SIZE($1) + + ST $f26,-2*SIZE($1) + ST $f27,-1*SIZE($1) + unop + bgt $2,$L23 + .align 4 + +$L19: + MUL $f19, $f14, $f10 + MUL $f20, $f15, $f11 + MUL $f19, $f15, $f12 + MUL $f20, $f14, $f13 + + MUL $f19, $f24, $f16 + MUL $f20, $f25, $f17 + MUL $f19, $f25, $f18 + MUL $f20, $f24, $f21 + + SUB $f10, $f11, $f22 + ADD $f12, $f13, $f23 + SUB $f16, $f17, $f26 + ADD $f18, $f21, $f27 + lda $1, 4*SIZE($1) + + ST $f22, -4*SIZE($1) + ST $f23, -3*SIZE($1) + ST $f26, -2*SIZE($1) + ST $f27, 
-1*SIZE($1) + + blbs $16, $L18 + bgt $17, $Main + clr $0 + ret + .align 4 + +$L18: + LD $f14, 0*SIZE($1) + LD $f15, 1*SIZE($1) + MUL $f19, $f15, $f13 + MUL $f20, $f14, $f10 + + MUL $f19, $f14, $f12 + MUL $f20, $f15, $f11 + ADD $f13, $f10, $f26 + SUB $f12, $f11, $f27 + + ST $f26, 1*SIZE($1) + ST $f27, 0*SIZE($1) + lda $1, 2*SIZE($1) + bgt $17, $Main + .align 4 + +$End: + clr $0 + ret + .ident VERSION + .end CNAME diff --git a/kernel/alpha/zgemm_kernel_2x2.S b/kernel/alpha/zgemm_kernel_2x2.S new file mode 100644 index 0000000000..33c50ddf85 --- /dev/null +++ b/kernel/alpha/zgemm_kernel_2x2.S @@ -0,0 +1,1712 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#if !defined(EV4) && !defined(EV5) && !defined(EV6) +#error "Architecture is not specified." 
+#endif + +#ifdef EV6 +#define PREFETCHSIZE 56 +#define UNOP unop +#endif + +#ifdef EV5 +#define PREFETCHSIZE 48 +#define UNOP +#endif + +#ifdef EV4 +#define UNOP +#endif + + .set noat + .set noreorder + .arch ev6 + +.text + .align 5 + .globl CNAME + .ent CNAME + +#define STACKSIZE 80 + +#define M $16 +#define N $17 +#define K $18 +#define A $21 +#define B $22 +#define C $20 +#define LDC $23 + +#define C1 $19 +#define C2 $24 + +#define AO $at +#define BO $5 +#define I $6 +#define J $7 +#define L $8 + +#define a1 $f16 +#define a2 $f17 +#define a3 $f18 +#define a4 $f19 + +#define b1 $f20 +#define b2 $f21 +#define b3 $f22 +#define b4 $f23 + +#define t1 $f24 +#define t2 $f25 +#define t3 $f26 +#define t4 $f27 + +#define a5 $f28 +#define a6 $f30 +#define b5 $f29 + +#define alpha_i $f29 +#define alpha_r $f30 + +#define c01 $f0 +#define c02 $f1 +#define c03 $f2 +#define c04 $f3 + +#define c05 $f4 +#define c06 $f5 +#define c07 $f6 +#define c08 $f7 + +#define c09 $f8 +#define c10 $f9 +#define c11 $f10 +#define c12 $f11 + +#define c13 $f12 +#define c14 $f13 +#define c15 $f14 +#define c16 $f15 + +#define TMP1 $0 +#define TMP2 $1 +#define KK $2 +#define BB $3 +#define OFFSET $4 + +#define ALPHA_R 64($sp) +#define ALPHA_I 72($sp) + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define ADD1 ADD +#define ADD2 SUB +#define ADD3 ADD +#define ADD4 ADD +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define ADD1 ADD +#define ADD2 ADD +#define ADD3 SUB +#define ADD4 ADD +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define ADD1 ADD +#define ADD2 ADD +#define ADD3 ADD +#define ADD4 SUB +#else +#define ADD1 ADD +#define ADD2 SUB +#define ADD3 SUB +#define ADD4 SUB +#endif + +CNAME: + .frame $sp, STACKSIZE, $26, 0 + +#ifdef PROFILE + ldgp $gp, 0($27) + lda $at, _mcount + jsr $at, ($at), _mcount +#endif + +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + lda $sp, -STACKSIZE($sp) + + ldq B, 0 + STACKSIZE($sp) + ldq C, 8 + STACKSIZE($sp) + ldq LDC, 16 + STACKSIZE($sp) +#ifdef TRMMKERNEL + ldq OFFSET, 24 + STACKSIZE($sp) +#endif + + sll LDC, ZBASE_SHIFT, LDC + + stt $f2, 0($sp) + stt $f3, 8($sp) + stt $f4, 16($sp) + stt $f5, 24($sp) + stt $f6, 32($sp) + stt $f7, 40($sp) + stt $f8, 48($sp) + stt $f9, 56($sp) + stt $f19, ALPHA_R + stt $f20, ALPHA_I + + cmple M, 0, $0 + cmple N, 0, $1 + cmple K, 0, $2 + + or $0, $1, $0 + or $0, $2, $0 + bne $0, $L999 + +#if defined(TRMMKERNEL) && !defined(LEFT) + subq $31, OFFSET, KK +#endif + + sra N, 1, J + ble J, $L30 + .align 4 + +$L01: + mov C, C1 + addq C, LDC, C2 + mov A, AO + s4addq K, 0, BB + + +#if defined(TRMMKERNEL) && defined(LEFT) + mov OFFSET, KK +#endif + + SXADDQ BB, B, BB + addq C2, LDC, C + unop + + sra M, 1, I + fclr t1 + fclr t2 + fclr t3 + fclr t4 + + fclr c01 + fclr c05 + + ble I, $L20 + .align 4 + +$L11: +#ifndef EV4 + ldl $31, 0 * SIZE(BB) + ldl $31, 8 * SIZE(BB) + unop + lda BB, 16 * SIZE(BB) +#endif + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef TRMMKERNEL +#ifdef LEFT + addq KK, 2, TMP1 +#else + addq KK, 2, TMP1 +#endif +#endif + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(B) + fclr c10 + LD b2, 1 * SIZE(B) + fclr c14 + + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c07 + + lda BO, 4 * SIZE(B) + fclr c11 + lda AO, 4 * SIZE(AO) + fclr c15 + + lds $f31, 
4 * SIZE(C1) + fclr c04 +#ifndef TRMMKERNEL + lda L, -2(K) +#else + lda L, -2(TMP1) +#endif + fclr c08 + + lds $f31, 4 * SIZE(C2) + fclr c12 + fclr c16 + ble L, $L15 +#else + sll KK, ZBASE_SHIFT + 1, TMP1 + addq AO, TMP1, AO + addq B, TMP1, BO + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(BO) + fclr c10 + LD b2, 1 * SIZE(BO) + fclr c14 + + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c07 + + lda BO, 4 * SIZE(BO) + fclr c11 + lda AO, 4 * SIZE(AO) + fclr c15 + + lds $f31, 4 * SIZE(C1) + fclr c04 + lda L, -2(TMP1) + fclr c08 + + lds $f31, 4 * SIZE(C2) + fclr c12 + fclr c16 + ble L, $L15 +#endif + .align 5 + +$L12: +/* 1 */ + ADD1 c11, t1, c11 +#ifndef EV4 + ldq $31, PREFETCHSIZE * SIZE(AO) +#else + unop +#endif + MUL b1, a1, t1 +#ifndef EV4 + ldl $31, PREFETCHSIZE * SIZE(BO) +#else + unop +#endif + + ADD3 c12, t2, c12 + unop + MUL b1, a2, t2 + unop + + ADD2 c16, t3, c16 + unop + MUL b2, a2, t3 + LD a5, 0 * SIZE(AO) + + ADD4 c15, t4, c15 + unop + MUL b2, a1, t4 + LD b5, 0 * SIZE(BO) + +/* 2 */ + ADD1 c01, t1, c01 + UNOP + MUL b1, a3, t1 + UNOP + + ADD3 c02, t2, c02 + UNOP + MUL b1, a4, t2 + UNOP + + ADD2 c06, t3, c06 + unop + MUL b2, a4, t3 + unop + + ADD4 c05, t4, c05 + unop + MUL b4, a1, t4 + unop + +/* 3 */ + ADD1 c03, t1, c03 + unop + MUL b3, a1, t1 + unop + + ADD3 c04, t2, c04 + unop + MUL b3, a2, t2 + unop + + ADD2 c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD4 c13, t4, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + +/* 4 */ + ADD1 c09, t1, c09 + unop + MUL b3, a3, t1 + LD a6, 2 * SIZE(AO) + + ADD3 c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD2 c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, 3 * SIZE(AO) + + ADD4 c07, t4, c07 + unop + MUL b4, a3, t4 + LD b4, 3 * SIZE(BO) + +/* 5 */ + ADD1 c11, t1, c11 + unop + MUL b5, a5, t1 + LD a1, 4 * SIZE(AO) + + ADD3 c12, t2, c12 + lda L, -2(L) + MUL b5, a2, t2 + LD b1, 4 * SIZE(BO) + + ADD2 c16, t3, c16 + unop + MUL b2, a2, t3 + unop + + ADD4 c15, t4, c15 + unop + MUL b2, a5, t4 + unop + +/* 6 */ + ADD1 c01, t1, c01 + unop + MUL b5, a6, t1 + unop + + ADD3 c02, t2, c02 + unop + MUL b5, a4, t2 + unop + + ADD2 c06, t3, c06 + unop + MUL b2, a4, t3 + unop + + ADD4 c05, t4, c05 + unop + MUL b4, a5, t4 + unop + +/* 7 */ + ADD1 c03, t1, c03 + lda AO, 8 * SIZE(AO) + MUL b3, a5, t1 + unop + + ADD3 c04, t2, c04 + lda BO, 8 * SIZE(BO) + MUL b3, a2, t2 + unop + + ADD2 c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, -3 * SIZE(AO) + + ADD4 c13, t4, c13 + unop + MUL b2, a6, t4 + LD b2, -3 * SIZE(BO) + +/* 8 */ + ADD1 c09, t1, c09 + unop + MUL b3, a6, t1 + LD a3, -2 * SIZE(AO) + + ADD3 c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, -2 * SIZE(BO) + + ADD2 c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD4 c07, t4, c07 + MUL b4, a6, t4 + LD b4, -1 * SIZE(BO) + bgt L, $L12 + .align 4 + +$L15: + ADD1 c11, t1, c11 + ldt alpha_r, ALPHA_R + MUL b1, a1, t1 +#ifndef TRMMKERNEL + blbs K, $L18 +#else + blbs TMP1, $L18 +#endif + .align 4 + + ADD3 c12, t2, c12 + MUL b1, a2, t2 + ADD2 c16, t3, c16 + MUL b2, a2, t3 + + ADD4 c15, t4, c15 + MUL b2, a1, t4 + ADD1 c01, t1, c01 + MUL b1, a3, t1 + + ADD3 c02, t2, c02 + unop + MUL b1, a4, t2 + LD b1, 0 * SIZE(BO) + + ADD2 c06, t3, c06 + MUL b2, a4, t3 + ADD4 c05, t4, c05 + MUL b4, a1, t4 + + ADD1 c03, t1, c03 + unop + MUL b3, a1, t1 + LD a1, 0 * SIZE(AO) + + ADD3 c04, t2, c04 + unop + MUL b3, a2, t2 + unop + + ADD2 c08, t3, c08 + unop + 
MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD4 c13, t4, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + + ADD1 c09, t1, c09 + unop + MUL b3, a3, t1 + lda AO, 4 * SIZE(AO) + + ADD3 c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD2 c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD4 c07, t4, c07 + unop + MUL b4, a3, t4 + LD a3, -2 * SIZE(AO) + + ADD1 c11, t1, c11 + LD b4, 3 * SIZE(BO) + MUL b1, a1, t1 + lda BO, 4 * SIZE(BO) + .align 4 + +$L18: + ADD3 c12, t2, c12 + unop + MUL b1, a2, t2 + ldt alpha_i, ALPHA_I + + ADD2 c16, t3, c16 + unop + MUL b2, a2, t3 +#ifndef TRMMKERNEL + LD a5, 0 * SIZE(C1) +#else + unop +#endif + + ADD4 c15, t4, c15 + MUL b2, a1, t4 + ADD1 c01, t1, c01 + MUL b1, a3, t1 + + ADD3 c02, t2, c02 + unop + MUL b1, a4, t2 +#ifndef TRMMKERNEL + LD b1, 1 * SIZE(C1) +#else + unop +#endif + + ADD2 c06, t3, c06 + MUL b2, a4, t3 + ADD4 c05, t4, c05 + MUL b4, a1, t4 + + ADD1 c03, t1, c03 + unop + MUL b3, a1, t1 +#ifndef TRMMKERNEL + LD a1, 2 * SIZE(C1) +#else + unop +#endif + + ADD3 c04, t2, c04 + unop + MUL b3, a2, t2 + unop + + ADD2 c08, t3, c08 + unop + MUL b4, a2, t3 +#ifndef TRMMKERNEL + LD a2, 3 * SIZE(C1) +#else + unop +#endif + + ADD4 c13, t4, c13 + unop + MUL b2, a3, t4 +#ifndef TRMMKERNEL + LD b2, 0 * SIZE(C2) +#else + unop +#endif + + ADD1 c09, t1, c09 + lda I, -1(I) + MUL b3, a3, t1 + unop + + ADD3 c10, t2, c10 + unop + MUL b3, a4, t2 +#ifndef TRMMKERNEL + LD b3, 1 * SIZE(C2) +#else + unop +#endif + + ADD2 c14, t3, c14 + unop + MUL b4, a4, t3 +#ifndef TRMMKERNEL + LD a4, 2 * SIZE(C2) +#else + unop +#endif + + ADD4 c07, t4, c07 + unop + MUL b4, a3, t4 +#ifndef TRMMKERNEL + LD a3, 3 * SIZE(C2) +#else + unop +#endif + + ADD1 c11, t1, c11 + ADD3 c12, t2, c12 + ADD2 c16, t3, c16 + ADD4 c15, t4, c15 + + ADD c01, c06, c01 + ADD c02, c05, c02 + ADD c03, c08, c03 + ADD c04, c07, c04 + + ADD c09, c14, c09 + MUL alpha_r, c01, t1 + ADD c10, c13, c10 + MUL alpha_r, c02, t2 + + ADD c11, c16, c11 + MUL alpha_r, c03, t3 + ADD c12, c15, c12 + MUL alpha_r, c04, t4 + +#ifndef TRMMKERNEL + ADD a5, t1, a5 + MUL alpha_i, c02, t1 + ADD b1, t2, b1 + MUL alpha_i, c01, t2 + + ADD a1, t3, a1 + MUL alpha_i, c04, t3 + ADD a2, t4, a2 + MUL alpha_i, c03, t4 +#else + ADD $f31, t1, a5 + MUL alpha_i, c02, t1 + ADD $f31, t2, b1 + MUL alpha_i, c01, t2 + + ADD $f31, t3, a1 + MUL alpha_i, c04, t3 + ADD $f31, t4, a2 + MUL alpha_i, c03, t4 +#endif + + SUB a5, t1, a5 + MUL alpha_r, c09, t1 + ADD b1, t2, b1 + MUL alpha_r, c10, t2 + + SUB a1, t3, a1 + MUL alpha_r, c11, t3 + ADD a2, t4, a2 + MUL alpha_r, c12, t4 + +#ifndef TRMMKERNEL + ADD b2, t1, b2 + MUL alpha_i, c10, t1 + ADD b3, t2, b3 + MUL alpha_i, c09, t2 + + ADD a4, t3, a4 + MUL alpha_i, c12, t3 + ADD a3, t4, a3 + MUL alpha_i, c11, t4 +#else + ADD $f31, t1, b2 + MUL alpha_i, c10, t1 + ADD $f31, t2, b3 + MUL alpha_i, c09, t2 + + ADD $f31, t3, a4 + MUL alpha_i, c12, t3 + ADD $f31, t4, a3 + MUL alpha_i, c11, t4 +#endif + + SUB b2, t1, b2 + ST a5, 0 * SIZE(C1) + fclr t1 + unop + + ADD b3, t2, b3 + ST b1, 1 * SIZE(C1) + fclr t2 + unop + + SUB a4, t3, a4 + ST a1, 2 * SIZE(C1) + fclr t3 + unop + + ADD a3, t4, a3 + ST a2, 3 * SIZE(C1) + fclr t4 + unop + + ST b2, 0 * SIZE(C2) + fclr c01 + ST b3, 1 * SIZE(C2) + fclr c05 + + ST a4, 2 * SIZE(C2) + lda C1, 4 * SIZE(C1) + ST a3, 3 * SIZE(C2) + lda C2, 4 * SIZE(C2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + subq K, KK, TMP1 +#ifdef LEFT + subq TMP1, 2, TMP1 +#else + subq TMP1, 2, TMP1 +#endif + 
sll TMP1, ZBASE_SHIFT + 1, TMP1 + addq AO, TMP1, AO + addq BO, TMP1, BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq KK, 2, KK +#endif + bgt I, $L11 + .align 4 + +$L20: + and M, 1, I + ble I, $L29 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef TRMMKERNEL +#ifdef LEFT + addq KK, 1, TMP1 +#else + addq KK, 2, TMP1 +#endif +#endif + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(B) + fclr c10 + LD b2, 1 * SIZE(B) + fclr c14 + + LD b3, 2 * SIZE(B) + lda AO, 2 * SIZE(AO) + LD b4, 3 * SIZE(B) + lda BO, 4 * SIZE(B) + +#ifndef TRMMKERNEL + lda L, -2(K) +#else + lda L, -2(TMP1) +#endif + ble L, $L25 +#else + sll KK, ZBASE_SHIFT + 0, TMP1 + addq AO, TMP1, AO + sll KK, ZBASE_SHIFT + 1, TMP1 + addq B, TMP1, BO + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(BO) + fclr c10 + LD b2, 1 * SIZE(BO) + fclr c14 + + LD b3, 2 * SIZE(BO) + lda AO, 2 * SIZE(AO) + LD b4, 3 * SIZE(BO) + lda BO, 4 * SIZE(BO) + + lda L, -2(TMP1) + ble L, $L25 +#endif + .align 5 + +$L22: + ADD1 c09, t1, c09 + unop + MUL a1, b1, t1 + unop + + ADD3 c10, t2, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD4 c13, t3, c13 + unop + MUL a1, b2, t3 + lda BO, 8 * SIZE(BO) + + ADD2 c14, t4, c14 + unop + MUL a2, b2, t4 + LD b2, -7 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b3, t1 + unop + + ADD3 c02, t2, c02 + unop + MUL a2, b3, t2 + LD b3, -6 * SIZE(BO) + + ADD4 c05, t3, c05 + unop + MUL a1, b4, t3 + LD a1, 2 * SIZE(AO) + + ADD2 c06, t4, c06 + MUL a2, b4, t4 + LD b5, -5 * SIZE(BO) + + ADD1 c09, t1, c09 + unop + MUL a3, b1, t1 + LD a2, 3 * SIZE(AO) + + ADD3 c10, t2, c10 + unop + MUL a4, b1, t2 + LD b1, -4 * SIZE(BO) + + ADD4 c13, t3, c13 + unop + MUL a3, b2, t3 + lda AO, 4 * SIZE(AO) + + ADD2 c14, t4, c14 + MUL a4, b2, t4 + LD b2, -3 * SIZE(BO) + + ADD1 c01, t1, c01 + lda L, -2(L) + MUL a3, b3, t1 + LD b4, -1 * SIZE(BO) + + ADD3 c02, t2, c02 + unop + MUL a4, b3, t2 + LD b3, -2 * SIZE(BO) + + ADD4 c05, t3, c05 + unop + MUL a3, b5, t3 + LD a3, 0 * SIZE(AO) + + ADD2 c06, t4, c06 + MUL a4, b5, t4 + LD a4, 1 * SIZE(AO) + bgt L, $L22 + .align 4 + +$L25: + ADD1 c09, t1, c09 + ldt alpha_r, ALPHA_R + MUL a1, b1, t1 +#ifndef TRMMKERNEL + blbs K, $L28 +#else + blbs TMP1, $L28 +#endif + .align 4 + + ADD3 c10, t2, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD4 c13, t3, c13 + unop + MUL a1, b2, t3 + unop + + ADD2 c14, t4, c14 + unop + MUL a2, b2, t4 + LD b2, 1 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b3, t1 + lda AO, 2 * SIZE(AO) + + ADD3 c02, t2, c02 + unop + MUL a2, b3, t2 + LD b3, 2 * SIZE(BO) + + ADD4 c05, t3, c05 + unop + MUL a1, b4, t3 + LD a1, -2 * SIZE(AO) + + ADD2 c06, t4, c06 + unop + MUL a2, b4, t4 + LD a2, -1 * SIZE(AO) + + ADD1 c09, t1, c09 + LD b4, 3 * SIZE(BO) + MUL a1, b1, t1 + lda BO, 4 * SIZE(BO) + .align 4 + +$L28: + ADD3 c10, t2, c10 + unop + MUL a2, b1, t2 + ldt alpha_i, ALPHA_I + + ADD4 c13, t3, c13 + unop + MUL a1, b2, t3 +#ifndef TRMMKERNEL + LD c03, 0 * SIZE(C1) +#else + unop +#endif + + ADD2 c14, t4, c14 + unop + MUL a2, b2, t4 +#ifndef TRMMKERNEL + LD c04, 1 * SIZE(C1) +#else + unop +#endif + + ADD1 c01, t1, c01 + unop + MUL a1, b3, t1 +#ifndef TRMMKERNEL + LD c11, 0 * SIZE(C2) +#else + unop +#endif + + ADD3 c02, t2, c02 + unop + 
MUL a2, b3, t2 +#ifndef TRMMKERNEL + LD c12, 1 * SIZE(C2) +#else + unop +#endif + + ADD4 c05, t3, c05 + MUL a1, b4, t3 + ADD2 c06, t4, c06 + MUL a2, b4, t4 + + ADD1 c09, t1, c09 + ADD3 c10, t2, c10 + ADD4 c13, t3, c13 + ADD2 c14, t4, c14 + + ADD c01, c06, c01 + ADD c02, c05, c02 + ADD c09, c14, c09 + ADD c10, c13, c10 + + MUL alpha_r, c01, t1 + MUL alpha_r, c02, t2 + MUL alpha_r, c09, t3 + MUL alpha_r, c10, t4 + +#ifndef TRMMKERNEL + ADD c03, t1, c03 + MUL alpha_i, c02, t1 + ADD c04, t2, c04 + MUL alpha_i, c01, t2 + + ADD c11, t3, c11 + MUL alpha_i, c10, t3 + ADD c12, t4, c12 + MUL alpha_i, c09, t4 +#else + ADD $f31, t1, c03 + MUL alpha_i, c02, t1 + ADD $f31, t2, c04 + MUL alpha_i, c01, t2 + + ADD $f31, t3, c11 + MUL alpha_i, c10, t3 + ADD $f31, t4, c12 + MUL alpha_i, c09, t4 +#endif + + SUB c03, t1, c03 + ADD c04, t2, c04 + SUB c11, t3, c11 + ADD c12, t4, c12 + + ST c03, 0 * SIZE(C1) + ST c04, 1 * SIZE(C1) + ST c11, 0 * SIZE(C2) + ST c12, 1 * SIZE(C2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + subq K, KK, TMP1 +#ifdef LEFT + subq TMP1, 1, TMP1 +#else + subq TMP1, 2, TMP1 +#endif + sll TMP1, ZBASE_SHIFT + 0, TMP2 + addq AO, TMP2, AO + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addq BO, TMP2, BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq KK, 1, KK +#endif + .align 4 + +$L29: + mov BO, B + lda J, -1(J) +#if defined(TRMMKERNEL) && !defined(LEFT) + addq KK, 2, KK +#else + unop +#endif + bgt J, $L01 + .align 4 + +$L30: + and N, 1, J + ble J, $L999 + + mov C, C1 + mov A, AO + +#if defined(TRMMKERNEL) && defined(LEFT) + mov OFFSET, KK +#endif + + sra M, 1, I + ble I, $L50 + .align 4 + +$L41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef TRMMKERNEL +#ifdef LEFT + addq KK, 2, TMP1 +#else + addq KK, 1, TMP1 +#endif +#endif + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c05 + LD b3, 2 * SIZE(B) + fclr c02 + LD b4, 3 * SIZE(B) + fclr c06 + + lda BO, 2 * SIZE(B) + fclr c03 + lda AO, 4 * SIZE(AO) + fclr c07 + +#ifndef TRMMKERNEL + lda L, -2(K) +#else + lda L, -2(TMP1) +#endif + fclr c04 + fclr c08 + ble L, $L45 +#else + sll KK, ZBASE_SHIFT + 1, TMP1 + addq AO, TMP1, AO + sll KK, ZBASE_SHIFT + 0, TMP1 + addq B, TMP1, BO + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c05 + LD b3, 2 * SIZE(BO) + fclr c02 + LD b4, 3 * SIZE(BO) + fclr c06 + + lda BO, 2 * SIZE(BO) + fclr c03 + lda AO, 4 * SIZE(AO) + fclr c07 + + lda L, -2(TMP1) + fclr c04 + fclr c08 + ble L, $L45 +#endif + .align 5 + +$L42: + ADD4 c05, t1, c05 + unop + MUL a1, b1, t1 + unop + + ADD2 c06, t2, c06 + lda L, -2(L) + MUL a2, b1, t2 + unop + + ADD4 c07, t3, c07 + unop + MUL a3, b1, t3 + unop + + ADD2 c08, t4, c08 + unop + MUL a4, b1, t4 + LD b1, 2 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD3 c02, t2, c02 + lda BO, 4 * SIZE(BO) + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD1 c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD3 c04, t4, c04 + unop + MUL a4, b2, t4 + LD a5, 3 * SIZE(AO) + + ADD4 c05, t1, c05 + unop + MUL a1, b3, t1 + LD b2, -1 * SIZE(BO) + + ADD2 c06, t2, 
c06 + unop + MUL a2, b3, t2 + unop + + ADD4 c07, t3, c07 + unop + MUL a3, b3, t3 + lda AO, 8 * SIZE(AO) + + ADD2 c08, t4, c08 + unop + MUL a5, b3, t4 + LD b3, 0 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b4, t1 + LD a1, -4 * SIZE(AO) + + ADD3 c02, t2, c02 + unop + MUL a2, b4, t2 + LD a2, -3 * SIZE(AO) + + ADD1 c03, t3, c03 + LD a4, -1 * SIZE(AO) + MUL a3, b4, t3 + LD a3, -2 * SIZE(AO) + + ADD3 c04, t4, c04 + MUL a5, b4, t4 + LD b4, 1 * SIZE(BO) + bgt L, $L42 + .align 4 + +$L45: + ADD4 c05, t1, c05 + ldt alpha_r, ALPHA_R + MUL b1, a1, t1 +#ifndef TRMMKERNEL + blbs K, $L48 +#else + blbs TMP1, $L48 +#endif + .align 4 + + ADD2 c06, t2, c06 + MUL a2, b1, t2 + ADD4 c07, t3, c07 + MUL a3, b1, t3 + + ADD2 c08, t4, c08 + unop + MUL a4, b1, t4 + LD b1, 0 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD3 c02, t2, c02 + unop + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD1 c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD3 c04, t4, c04 + MUL a4, b2, t4 + LD a4, 3 * SIZE(AO) + lda AO, 4 * SIZE(AO) + + ADD4 c05, t1, c05 + LD b2, 1 * SIZE(BO) + MUL a1, b1, t1 + lda BO, 2 * SIZE(BO) + .align 4 + +$L48: + ADD2 c06, t2, c06 + unop + MUL a2, b1, t2 + ldt alpha_i, ALPHA_I + + ADD4 c07, t3, c07 + lda I, -1(I) + MUL a3, b1, t3 +#ifndef TRMMKERNEL + LD c09, 0 * SIZE(C1) +#else + unop +#endif + + ADD2 c08, t4, c08 + unop + MUL a4, b1, t4 +#ifndef TRMMKERNEL + LD c10, 1 * SIZE(C1) +#else + unop +#endif + + ADD1 c01, t1, c01 + unop + MUL a1, b2, t1 +#ifndef TRMMKERNEL + LD c11, 2 * SIZE(C1) +#else + unop +#endif + + ADD3 c02, t2, c02 + unop + MUL a2, b2, t2 +#ifndef TRMMKERNEL + LD c12, 3 * SIZE(C1) +#else + unop +#endif + + ADD1 c03, t3, c03 + MUL a3, b2, t3 + ADD3 c04, t4, c04 + MUL a4, b2, t4 + + ADD4 c05, t1, c05 + ADD2 c06, t2, c06 + ADD4 c07, t3, c07 + ADD2 c08, t4, c08 + + ADD c01, c06, c01 + ADD c02, c05, c02 + ADD c03, c08, c03 + ADD c04, c07, c04 + + MUL alpha_r, c01, t1 + MUL alpha_r, c02, t2 + MUL alpha_r, c03, t3 + MUL alpha_r, c04, t4 + +#ifndef TRMMKERNEL + ADD c09, t1, c09 + MUL alpha_i, c02, t1 + ADD c10, t2, c10 + MUL alpha_i, c01, t2 + + ADD c11, t3, c11 + MUL alpha_i, c04, t3 + ADD c12, t4, c12 + MUL alpha_i, c03, t4 +#else + ADD $f31, t1, c09 + MUL alpha_i, c02, t1 + ADD $f31, t2, c10 + MUL alpha_i, c01, t2 + + ADD $f31, t3, c11 + MUL alpha_i, c04, t3 + ADD $f31, t4, c12 + MUL alpha_i, c03, t4 +#endif + + SUB c09, t1, c09 + ADD c10, t2, c10 + SUB c11, t3, c11 + ADD c12, t4, c12 + + ST c09, 0 * SIZE(C1) + ST c10, 1 * SIZE(C1) + ST c11, 2 * SIZE(C1) + ST c12, 3 * SIZE(C1) + + lda C1, 4 * SIZE(C1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + subq K, KK, TMP1 +#ifdef LEFT + subq TMP1, 2, TMP1 +#else + subq TMP1, 1, TMP1 +#endif + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addq AO, TMP2, AO + sll TMP1, ZBASE_SHIFT + 0, TMP2 + addq BO, TMP2, BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq KK, 2, KK +#endif + + bgt I, $L41 + .align 4 + +$L50: + and M, 1, I + ble I, $L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef TRMMKERNEL +#ifdef LEFT + addq KK, 1, TMP1 +#else + addq KK, 1, TMP1 +#endif +#endif + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c05 + + LD b3, 2 * SIZE(B) + fclr c02 + 
LD b4, 3 * SIZE(B) + fclr c06 + + lda AO, 2 * SIZE(AO) + lda BO, 2 * SIZE(B) + +#ifndef TRMMKERNEL + lda L, -2(K) +#else + lda L, -2(TMP1) +#endif + ble L, $L55 +#else + sll KK, ZBASE_SHIFT + 0, TMP1 + addq AO, TMP1, AO + addq B, TMP1, BO + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c05 + + LD b3, 2 * SIZE(BO) + fclr c02 + LD b4, 3 * SIZE(BO) + fclr c06 + + lda AO, 2 * SIZE(AO) + lda BO, 2 * SIZE(BO) + + lda L, -2(TMP1) + ble L, $L55 +#endif + .align 5 + +$L52: + ADD1 c01, t1, c01 + unop + MUL a1, b1, t1 + unop + + ADD3 c02, t2, c02 + lda AO, 4 * SIZE(AO) + MUL a2, b1, t2 + LD b1, 2 * SIZE(BO) + + ADD4 c05, t3, c05 + lda L, -2(L) + MUL a1, b2, t3 + LD a1, -2 * SIZE(AO) + + ADD2 c06, t4, c06 + unop + MUL a2, b2, t4 + LD a2, -1 * SIZE(AO) + + ADD1 c01, t1, c01 + LD b2, 3 * SIZE(BO) + MUL a3, b3, t1 + lda BO, 4 * SIZE(BO) + + ADD3 c02, t2, c02 + unop + MUL a4, b3, t2 + LD b3, 0 * SIZE(BO) + + ADD4 c05, t3, c05 + unop + MUL a3, b4, t3 + LD a3, 0 * SIZE(AO) + + ADD2 c06, t4, c06 + MUL a4, b4, t4 + LD b4, 1 * SIZE(BO) + unop + + LD a4, 1 * SIZE(AO) + unop + unop + bgt L, $L52 + .align 4 + +$L55: + ADD1 c01, t1, c01 + ldt alpha_r, ALPHA_R + MUL a1, b1, t1 +#ifndef TRMMKERNEL + blbs K, $L58 +#else + blbs TMP1, $L58 +#endif + .align 4 + + ADD3 c02, t2, c02 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD4 c05, t3, c05 + lda BO, 2 * SIZE(BO) + MUL a1, b2, t3 + LD a1, 0 * SIZE(AO) + + ADD2 c06, t4, c06 + unop + MUL a2, b2, t4 + LD a2, 1 * SIZE(AO) + + ADD1 c01, t1, c01 + LD b2, -1 * SIZE(BO) + MUL a1, b1, t1 + lda AO, 2 * SIZE(AO) + .align 4 + +$L58: + ADD3 c02, t2, c02 + unop + MUL a2, b1, t2 + ldt alpha_i, ALPHA_I + + ADD4 c05, t3, c05 + unop + MUL a1, b2, t3 +#ifndef TRMMKERNEL + LD c03, 0 * SIZE(C1) +#else + unop +#endif + + ADD2 c06, t4, c06 + unop + MUL a2, b2, t4 +#ifndef TRMMKERNEL + LD c04, 1 * SIZE(C1) +#else + unop +#endif + + ADD1 c01, t1, c01 + ADD3 c02, t2, c02 + ADD4 c05, t3, c05 + ADD2 c06, t4, c06 + + ADD c01, c06, c01 + ADD c02, c05, c02 + + MUL alpha_r, c01, t1 + MUL alpha_r, c02, t2 + MUL alpha_i, c02, t3 + MUL alpha_i, c01, t4 + +#ifndef TRMMKERNEL + ADD c03, t1, c03 + ADD c04, t2, c04 +#else + ADD $f31, t1, c03 + ADD $f31, t2, c04 +#endif + + SUB c03, t3, c03 + ADD c04, t4, c04 + + ST c03, 0 * SIZE(C1) + ST c04, 1 * SIZE(C1) + .align 4 + +$L999: + ldt $f2, 0($sp) + ldt $f3, 8($sp) + ldt $f4, 16($sp) + ldt $f5, 24($sp) + ldt $f6, 32($sp) + ldt $f7, 40($sp) + ldt $f8, 48($sp) + ldt $f9, 56($sp) + clr $0 + lda $sp, STACKSIZE($sp) + ret + .ident VERSION + .end CNAME diff --git a/kernel/alpha/zgemv_n.S b/kernel/alpha/zgemv_n.S new file mode 100644 index 0000000000..fd602a3eb2 --- /dev/null +++ b/kernel/alpha/zgemv_n.S @@ -0,0 +1,1027 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define STACKSIZE 64 +#define PREFETCHSIZE 32 + +#define M $16 +#define N $17 +#define A $21 +#define LDA $18 + +#define X $19 +#define INCX $20 +#define Y $22 +#define INCY $23 + +#define BUFFER $24 + +#define I $25 +#define J $27 + +#define Y1 $4 +#define A1 $5 +#define A2 $6 + +#define alpha_r $f19 +#define alpha_i $f20 + +#define alpha1 $f0 +#define alpha2 $f1 +#define alpha3 $f10 +#define alpha4 $f11 + +#define y0 $f12 +#define y1 $f13 +#define y2 $f14 +#define y3 $f15 + +#define y4 $f16 +#define y5 $f17 +#define y6 $f18 +#define y7 $f21 + +#define a0 $f22 +#define a1 $f23 +#define a2 $f24 +#define a3 $f25 +#define a4 $f26 +#define a5 $f27 +#define a6 $f28 +#define a7 $f29 + +#define t0 $f2 +#define t1 $f3 +#define t2 $f4 +#define t3 $f5 + +#if !defined(CONJ) && !defined(XCONJ) +#define ADD1 ADD +#define ADD2 ADD +#define ADD3 SUB +#define ADD4 ADD +#elif defined(CONJ) && !defined(XCONJ) +#define ADD1 ADD +#define ADD2 SUB +#define ADD3 ADD +#define ADD4 ADD +#elif !defined(CONJ) && defined(XCONJ) +#define ADD1 ADD +#define ADD2 ADD +#define ADD3 ADD +#define ADD4 SUB +#else +#define ADD1 ADD +#define ADD2 SUB +#define ADD3 SUB +#define ADD4 SUB +#endif + + PROLOGUE + + lda $sp, -STACKSIZE($sp) + ldq LDA, 0 + STACKSIZE($sp) + ldq X, 8 + STACKSIZE($sp) + ldq INCX, 16 + STACKSIZE($sp) + ldq Y, 24 + STACKSIZE($sp) + ldq INCY, 32 + STACKSIZE($sp) + ldq BUFFER, 40 + STACKSIZE($sp) + + stt $f2, 0($sp) + stt $f3, 8($sp) + stt $f4, 16($sp) + stt $f5, 24($sp) + stt $f6, 32($sp) + stt $f7, 40($sp) + stt $f8, 48($sp) + stt $f9, 56($sp) + + PROFCODE + + cmple M, 0, $0 + sll INCX, ZBASE_SHIFT, INCX + cmple N, 0, $1 + sll INCY, ZBASE_SHIFT, INCY + + or $0, $1, $0 + bne $0, $L999 + + cmpeq INCY, 2 * SIZE, $0 + sll LDA, ZBASE_SHIFT,LDA + bne $0, $L10 + + mov BUFFER, Y1 + + mov Y, BUFFER + mov Y1, Y + + sra M, 2, I + ble I, $L05 + .align 4 + +$L02: + ST $f31, 0 * SIZE(Y1) + ST $f31, 1 * SIZE(Y1) + ST $f31, 2 * SIZE(Y1) + ST $f31, 3 * SIZE(Y1) + ST $f31, 4 * SIZE(Y1) + ST $f31, 5 * SIZE(Y1) + ST $f31, 6 * SIZE(Y1) + ST $f31, 7 * 
SIZE(Y1) + + lda Y1, 8 * SIZE(Y1) + lda I, -1(I) + bgt I, $L02 + .align 4 + +$L05: + and M, 3, I + ble I, $L10 + .align 4 + +$L06: + ST $f31, 0 * SIZE(Y1) + ST $f31, 1 * SIZE(Y1) + addq Y1, 2 * SIZE, Y1 + + lda I, -1(I) + bgt I, $L06 + .align 4 + +$L10: + sra N, 1, J + ble J, $L20 + .align 4 + +$L11: + LD alpha1, 0 * SIZE(X) + LD alpha2, 1 * SIZE(X) + addq X, INCX, X + LD alpha3, 0 * SIZE(X) + LD alpha4, 1 * SIZE(X) + addq X, INCX, X + + MUL alpha_r, alpha1, y0 + MUL alpha_r, alpha2, y1 + MUL alpha_r, alpha3, y2 + MUL alpha_r, alpha4, y3 + + MUL alpha_i, alpha2, t0 + mov A, A1 + MUL alpha_i, alpha1, t1 + addq A, LDA, A2 + MUL alpha_i, alpha4, t2 + addq A2, LDA, A + MUL alpha_i, alpha3, t3 + mov Y, Y1 + +#ifndef XCONJ + SUB y0, t0, alpha1 + ADD y1, t1, alpha2 + SUB y2, t2, alpha3 + ADD y3, t3, alpha4 +#else + ADD y0, t0, alpha1 + SUB y1, t1, alpha2 + ADD y2, t2, alpha3 + SUB y3, t3, alpha4 +#endif + + ldl $31, 4 * SIZE(X) + + sra M, 2, I + ble I, $L15 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 2 * SIZE(A1) + LD a3, 3 * SIZE(A1) + + LD a4, 0 * SIZE(A2) + LD a5, 1 * SIZE(A2) + LD a6, 2 * SIZE(A2) + LD a7, 3 * SIZE(A2) + + MUL alpha1, a0, t0 + LD y0, 0 * SIZE(Y1) + MUL alpha1, a1, t1 + LD y1, 1 * SIZE(Y1) + + MUL alpha1, a2, t2 + LD y2, 2 * SIZE(Y1) + MUL alpha1, a3, t3 + LD y3, 3 * SIZE(Y1) + + ADD1 y0, t0, y0 + unop + MUL alpha3, a4, t0 + LD y4, 4 * SIZE(Y1) + + ADD2 y1, t1, y1 + unop + MUL alpha3, a5, t1 + LD y5, 5 * SIZE(Y1) + + ADD1 y2, t2, y2 + unop + MUL alpha3, a6, t2 + LD y6, 6 * SIZE(Y1) + + ADD2 y3, t3, y3 + unop + MUL alpha3, a7, t3 + LD y7, 7 * SIZE(Y1) + + ADD1 y0, t0, y0 + unop + MUL alpha2, a1, t0 + LD a1, 5 * SIZE(A1) + + ADD2 y1, t1, y1 + unop + MUL alpha2, a0, t1 + LD a0, 4 * SIZE(A1) + + ADD1 y2, t2, y2 + unop + MUL alpha2, a3, t2 + LD a3, 7 * SIZE(A1) + + ADD2 y3, t3, y3 + unop + MUL alpha2, a2, t3 + LD a2, 6 * SIZE(A1) + + ADD3 y0, t0, y0 + unop + MUL alpha4, a5, t0 + LD a5, 5 * SIZE(A2) + + ADD4 y1, t1, y1 + unop + MUL alpha4, a4, t1 + LD a4, 4 * SIZE(A2) + + ADD3 y2, t2, y2 + unop + MUL alpha4, a7, t2 + LD a7, 7 * SIZE(A2) + + ADD4 y3, t3, y3 + unop + MUL alpha4, a6, t3 + LD a6, 6 * SIZE(A2) + + ADD3 y0, t0, y0 + MUL alpha1, a0, t0 + ADD4 y1, t1, y1 + MUL alpha1, a1, t1 + + ADD3 y2, t2, y2 + unop + MUL alpha1, a2, t2 + unop + + ADD4 y3, t3, y3 + lda I, -1(I) + MUL alpha1, a3, t3 + ble I, $L13 + .align 4 + +$L12: + ADD1 y4, t0, y4 + ST y0, 0 * SIZE(Y1) + MUL alpha3, a4, t0 + ldl $31, (PREFETCHSIZE + 0) * SIZE(A1) + + ADD2 y5, t1, y5 + ST y1, 1 * SIZE(Y1) + MUL alpha3, a5, t1 + lda I, -1(I) + + ADD1 y6, t2, y6 + ST y2, 2 * SIZE(Y1) + MUL alpha3, a6, t2 + unop + + ADD2 y7, t3, y7 + ST y3, 3 * SIZE(Y1) + MUL alpha3, a7, t3 + unop + + ADD1 y4, t0, y4 + unop + MUL alpha2, a1, t0 + LD a1, 9 * SIZE(A1) + + ADD2 y5, t1, y5 + unop + MUL alpha2, a0, t1 + LD a0, 8 * SIZE(A1) + + ADD1 y6, t2, y6 + unop + MUL alpha2, a3, t2 + LD a3, 11 * SIZE(A1) + + ADD2 y7, t3, y7 + unop + MUL alpha2, a2, t3 + LD a2, 10 * SIZE(A1) + + ADD3 y4, t0, y4 + lds $f31, (PREFETCHSIZE + 0) * SIZE(Y1) + MUL alpha4, a5, t0 + LD a5, 9 * SIZE(A2) + + ADD4 y5, t1, y5 + unop + MUL alpha4, a4, t1 + LD a4, 8 * SIZE(A2) + + ADD3 y6, t2, y6 + unop + MUL alpha4, a7, t2 + LD a7, 11 * SIZE(A2) + + ADD4 y7, t3, y7 + unop + MUL alpha4, a6, t3 + LD a6, 10 * SIZE(A2) + + ADD3 y4, t0, y4 + unop + MUL alpha1, a0, t0 + LD y0, 8 * SIZE(Y1) + + ADD4 y5, t1, y5 + unop + MUL alpha1, a1, t1 + LD y1, 9 * SIZE(Y1) + + ADD3 y6, t2, y6 + unop + MUL alpha1, a2, t2 + LD y2, 10 * SIZE(Y1) + + ADD4 y7, t3, y7 + unop + MUL 
alpha1, a3, t3 + LD y3, 11 * SIZE(Y1) + + ADD1 y0, t0, y0 + ST y4, 4 * SIZE(Y1) + MUL alpha3, a4, t0 + ldl $31, (PREFETCHSIZE + 0) * SIZE(A2) + + ADD2 y1, t1, y1 + ST y5, 5 * SIZE(Y1) + MUL alpha3, a5, t1 + unop + + ADD1 y2, t2, y2 + ST y6, 6 * SIZE(Y1) + MUL alpha3, a6, t2 + unop + + ADD2 y3, t3, y3 + ST y7, 7 * SIZE(Y1) + MUL alpha3, a7, t3 + lda Y1, 8 * SIZE(Y1) + + ADD1 y0, t0, y0 + unop + MUL alpha2, a1, t0 + LD a1, 13 * SIZE(A1) + + ADD2 y1, t1, y1 + unop + MUL alpha2, a0, t1 + LD a0, 12 * SIZE(A1) + + ADD1 y2, t2, y2 + unop + MUL alpha2, a3, t2 + LD a3, 15 * SIZE(A1) + + ADD2 y3, t3, y3 + unop + MUL alpha2, a2, t3 + LD a2, 14 * SIZE(A1) + + ADD3 y0, t0, y0 + unop + MUL alpha4, a5, t0 + LD a5, 13 * SIZE(A2) + + ADD4 y1, t1, y1 + unop + MUL alpha4, a4, t1 + LD a4, 12 * SIZE(A2) + + ADD3 y2, t2, y2 + unop + MUL alpha4, a7, t2 + LD a7, 15 * SIZE(A2) + + ADD4 y3, t3, y3 + unop + MUL alpha4, a6, t3 + LD a6, 14 * SIZE(A2) + + ADD3 y0, t0, y0 + unop + MUL alpha1, a0, t0 + LD y4, 4 * SIZE(Y1) + + ADD4 y1, t1, y1 + lda A2, 8 * SIZE(A2) + MUL alpha1, a1, t1 + LD y5, 5 * SIZE(Y1) + + ADD3 y2, t2, y2 + lda A1, 8 * SIZE(A1) + MUL alpha1, a2, t2 + LD y6, 6 * SIZE(Y1) + + ADD4 y3, t3, y3 + MUL alpha1, a3, t3 + LD y7, 7 * SIZE(Y1) + bgt I, $L12 + .align 4 + +$L13: + ADD1 y4, t0, y4 + ST y0, 0 * SIZE(Y1) + MUL alpha3, a4, t0 + unop + + ADD2 y5, t1, y5 + ST y1, 1 * SIZE(Y1) + MUL alpha3, a5, t1 + unop + + ADD1 y6, t2, y6 + ST y2, 2 * SIZE(Y1) + MUL alpha3, a6, t2 + unop + + ADD2 y7, t3, y7 + ST y3, 3 * SIZE(Y1) + MUL alpha3, a7, t3 + unop + + ADD1 y4, t0, y4 + MUL alpha2, a1, t0 + ADD2 y5, t1, y5 + MUL alpha2, a0, t1 + + ADD1 y6, t2, y6 + MUL alpha2, a3, t2 + ADD2 y7, t3, y7 + MUL alpha2, a2, t3 + + ADD3 y4, t0, y4 + MUL alpha4, a5, t0 + ADD4 y5, t1, y5 + MUL alpha4, a4, t1 + + ADD3 y6, t2, y6 + MUL alpha4, a7, t2 + ADD4 y7, t3, y7 + MUL alpha4, a6, t3 + + ADD3 y4, t0, y4 + ADD4 y5, t1, y5 + ADD3 y6, t2, y6 + ADD4 y7, t3, y7 + + ST y4, 4 * SIZE(Y1) + lda A1, 8 * SIZE(A1) + ST y5, 5 * SIZE(Y1) + lda A2, 8 * SIZE(A2) + + ST y6, 6 * SIZE(Y1) + unop + ST y7, 7 * SIZE(Y1) + lda Y1, 8 * SIZE(Y1) + .align 4 + +$L15: + and M, 2, I + ble I, $L17 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 2 * SIZE(A1) + LD a3, 3 * SIZE(A1) + + LD a4, 0 * SIZE(A2) + LD a5, 1 * SIZE(A2) + LD a6, 2 * SIZE(A2) + LD a7, 3 * SIZE(A2) + + MUL alpha1, a0, t0 + LD y0, 0 * SIZE(Y1) + MUL alpha1, a1, t1 + LD y1, 1 * SIZE(Y1) + MUL alpha1, a2, t2 + LD y2, 2 * SIZE(Y1) + MUL alpha1, a3, t3 + LD y3, 3 * SIZE(Y1) + + ADD1 y0, t0, y0 + MUL alpha3, a4, t0 + ADD2 y1, t1, y1 + MUL alpha3, a5, t1 + ADD1 y2, t2, y2 + MUL alpha3, a6, t2 + ADD2 y3, t3, y3 + MUL alpha3, a7, t3 + + ADD1 y0, t0, y0 + MUL alpha2, a1, t0 + ADD2 y1, t1, y1 + MUL alpha2, a0, t1 + + ADD1 y2, t2, y2 + MUL alpha2, a3, t2 + ADD2 y3, t3, y3 + MUL alpha2, a2, t3 + + ADD3 y0, t0, y0 + MUL alpha4, a5, t0 + ADD4 y1, t1, y1 + MUL alpha4, a4, t1 + + ADD3 y2, t2, y2 + MUL alpha4, a7, t2 + ADD4 y3, t3, y3 + MUL alpha4, a6, t3 + + ADD3 y0, t0, y0 + ADD4 y1, t1, y1 + ADD3 y2, t2, y2 + ADD4 y3, t3, y3 + + ST y0, 0 * SIZE(Y1) + lda A1, 4 * SIZE(A1) + ST y1, 1 * SIZE(Y1) + lda A2, 4 * SIZE(A2) + + ST y2, 2 * SIZE(Y1) + unop + ST y3, 3 * SIZE(Y1) + lda Y1, 4 * SIZE(Y1) + .align 4 + +$L17: + blbc M, $L18 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 0 * SIZE(A2) + LD a3, 1 * SIZE(A2) + + LD y0, 0 * SIZE(Y1) + LD y1, 1 * SIZE(Y1) + + MUL alpha1, a0, t0 + MUL alpha1, a1, t1 + + ADD1 y0, t0, y0 + MUL alpha3, a2, t0 + ADD2 y1, t1, y1 + MUL alpha3, a3, t1 + + ADD1 y0, t0, y0 + 
MUL alpha2, a1, t0 + ADD2 y1, t1, y1 + MUL alpha2, a0, t1 + + ADD3 y0, t0, y0 + MUL alpha4, a3, t0 + ADD4 y1, t1, y1 + MUL alpha4, a2, t1 + + ADD3 y0, t0, y0 + ADD4 y1, t1, y1 + + ST y0, 0 * SIZE(Y1) + ST y1, 1 * SIZE(Y1) + .align 4 + +$L18: + lda J, -1(J) + bgt J, $L11 + .align 4 + +$L20: + blbc N, $L990 + + LD alpha1, 0 * SIZE(X) + LD alpha2, 1 * SIZE(X) + + MUL alpha_r, alpha1, y0 + MUL alpha_r, alpha2, y1 + + MUL alpha_i, alpha2, t0 + mov A, A1 + MUL alpha_i, alpha1, t1 + mov Y, Y1 + +#ifndef XCONJ + SUB y0, t0, alpha1 + ADD y1, t1, alpha2 +#else + ADD y0, t0, alpha1 + SUB y1, t1, alpha2 +#endif + + sra M, 2, I + ble I, $L25 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 2 * SIZE(A1) + LD a3, 3 * SIZE(A1) + + LD y0, 0 * SIZE(Y1) + LD y1, 1 * SIZE(Y1) + LD y2, 2 * SIZE(Y1) + LD y3, 3 * SIZE(Y1) + + MUL alpha1, a0, t0 + LD a4, 4 * SIZE(A1) + MUL alpha1, a1, t1 + LD a5, 5 * SIZE(A1) + MUL alpha1, a2, t2 + LD a6, 6 * SIZE(A1) + MUL alpha1, a3, t3 + LD a7, 7 * SIZE(A1) + + ADD1 y0, t0, y0 + unop + MUL alpha2, a1, t0 + LD a1, 9 * SIZE(A1) + + ADD2 y1, t1, y1 + unop + MUL alpha2, a0, t1 + LD a0, 8 * SIZE(A1) + + ADD1 y2, t2, y2 + unop + MUL alpha2, a3, t2 + LD a3, 11 * SIZE(A1) + + ADD2 y3, t3, y3 + unop + MUL alpha2, a2, t3 + LD a2, 10 * SIZE(A1) + + ADD3 y0, t0, y0 + unop + LD y4, 4 * SIZE(Y1) + MUL alpha1, a4, t0 + + ADD4 y1, t1, y1 + unop + LD y5, 5 * SIZE(Y1) + MUL alpha1, a5, t1 + + ADD3 y2, t2, y2 + LD y6, 6 * SIZE(Y1) + MUL alpha1, a6, t2 + lda I, -1(I) + + ADD4 y3, t3, y3 + LD y7, 7 * SIZE(Y1) + MUL alpha1, a7, t3 + ble I, $L23 + .align 4 + +$L22: + ADD1 y4, t0, y4 + ST y0, 0 * SIZE(Y1) + MUL alpha2, a5, t0 + LD a5, 13 * SIZE(A1) + + ADD2 y5, t1, y5 + ST y1, 1 * SIZE(Y1) + MUL alpha2, a4, t1 + LD a4, 12 * SIZE(A1) + + ADD1 y6, t2, y6 + ST y2, 2 * SIZE(Y1) + MUL alpha2, a7, t2 + LD a7, 15 * SIZE(A1) + + ADD2 y7, t3, y7 + ST y3, 3 * SIZE(Y1) + MUL alpha2, a6, t3 + LD a6, 14 * SIZE(A1) + + ADD3 y4, t0, y4 + LD y0, 8 * SIZE(Y1) + MUL alpha1, a0, t0 + ldl $31, (PREFETCHSIZE + 0) * SIZE(A1) + + ADD4 y5, t1, y5 + LD y1, 9 * SIZE(Y1) + MUL alpha1, a1, t1 + lda I, -1(I) + + ADD3 y6, t2, y6 + LD y2, 10 * SIZE(Y1) + MUL alpha1, a2, t2 + unop + + ADD4 y7, t3, y7 + LD y3, 11 * SIZE(Y1) + MUL alpha1, a3, t3 + unop + + ADD1 y0, t0, y0 + ST y4, 4 * SIZE(Y1) + MUL alpha2, a1, t0 + LD a1, 17 * SIZE(A1) + + ADD2 y1, t1, y1 + ST y5, 5 * SIZE(Y1) + MUL alpha2, a0, t1 + LD a0, 16 * SIZE(A1) + + ADD1 y2, t2, y2 + ST y6, 6 * SIZE(Y1) + MUL alpha2, a3, t2 + LD a3, 19 * SIZE(A1) + + ADD2 y3, t3, y3 + ST y7, 7 * SIZE(Y1) + MUL alpha2, a2, t3 + LD a2, 18 * SIZE(A1) + + ADD3 y0, t0, y0 + LD y4, 12 * SIZE(Y1) + MUL alpha1, a4, t0 + ldl $31, (PREFETCHSIZE + 0) * SIZE(Y1) + + ADD4 y1, t1, y1 + LD y5, 13 * SIZE(Y1) + MUL alpha1, a5, t1 + lda A1, 8 * SIZE(A1) + + ADD3 y2, t2, y2 + LD y6, 14 * SIZE(Y1) + MUL alpha1, a6, t2 + lda Y1, 8 * SIZE(Y1) + + ADD4 y3, t3, y3 + LD y7, 7 * SIZE(Y1) + MUL alpha1, a7, t3 + bgt I, $L22 + .align 4 + +$L23: + ADD1 y4, t0, y4 + ST y0, 0 * SIZE(Y1) + MUL alpha2, a5, t0 + unop + + ADD2 y5, t1, y5 + ST y1, 1 * SIZE(Y1) + MUL alpha2, a4, t1 + unop + + ADD1 y6, t2, y6 + ST y2, 2 * SIZE(Y1) + MUL alpha2, a7, t2 + unop + + ADD2 y7, t3, y7 + ST y3, 3 * SIZE(Y1) + MUL alpha2, a6, t3 + unop + + ADD3 y4, t0, y4 + ADD4 y5, t1, y5 + ADD3 y6, t2, y6 + ADD4 y7, t3, y7 + + ST y4, 4 * SIZE(Y1) + unop + ST y5, 5 * SIZE(Y1) + unop + + ST y6, 6 * SIZE(Y1) + lda A1, 8 * SIZE(A1) + ST y7, 7 * SIZE(Y1) + lda Y1, 8 * SIZE(Y1) + .align 4 + +$L25: + and M, 2, I + ble I, $L27 + + LD a0, 0 * SIZE(A1) + LD 
a1, 1 * SIZE(A1) + LD a2, 2 * SIZE(A1) + LD a3, 3 * SIZE(A1) + + MUL alpha1, a0, t0 + LD y0, 0 * SIZE(Y1) + MUL alpha1, a1, t1 + LD y1, 1 * SIZE(Y1) + MUL alpha1, a2, t2 + LD y2, 2 * SIZE(Y1) + MUL alpha1, a3, t3 + LD y3, 3 * SIZE(Y1) + + ADD1 y0, t0, y0 + MUL alpha2, a1, t0 + ADD2 y1, t1, y1 + MUL alpha2, a0, t1 + ADD1 y2, t2, y2 + MUL alpha2, a3, t2 + ADD2 y3, t3, y3 + MUL alpha2, a2, t3 + + ADD3 y0, t0, y0 + ADD4 y1, t1, y1 + ADD3 y2, t2, y2 + ADD4 y3, t3, y3 + + ST y0, 0 * SIZE(Y1) + ST y1, 1 * SIZE(Y1) + + ST y2, 2 * SIZE(Y1) + lda A1, 4 * SIZE(A1) + ST y3, 3 * SIZE(Y1) + lda Y1, 4 * SIZE(Y1) + .align 4 + +$L27: + blbc M, $L990 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + + MUL alpha1, a0, t0 + LD y0, 0 * SIZE(Y1) + MUL alpha1, a1, t1 + LD y1, 1 * SIZE(Y1) + + ADD1 y0, t0, y0 + MUL alpha2, a1, t0 + ADD2 y1, t1, y1 + MUL alpha2, a0, t1 + + ADD3 y0, t0, y0 + ADD4 y1, t1, y1 + + ST y0, 0 * SIZE(Y1) + ST y1, 1 * SIZE(Y1) + .align 4 + +$L990: + cmpeq INCY, 2 * SIZE, $0 + bne $0, $L999 + + mov BUFFER, Y1 + + sra M, 2, I + ble I, $L995 + .align 4 + +$L992: + LD a0, 0 * SIZE(BUFFER) + LD a1, 1 * SIZE(BUFFER) + addq BUFFER, INCY, BUFFER + LD a2, 0 * SIZE(BUFFER) + LD a3, 1 * SIZE(BUFFER) + addq BUFFER, INCY, BUFFER + + LD y0, 0 * SIZE(Y) + LD y1, 1 * SIZE(Y) + LD y2, 2 * SIZE(Y) + LD y3, 3 * SIZE(Y) + + LD a4, 0 * SIZE(BUFFER) + LD a5, 1 * SIZE(BUFFER) + addq BUFFER, INCY, BUFFER + LD a6, 0 * SIZE(BUFFER) + LD a7, 1 * SIZE(BUFFER) + addq BUFFER, INCY, BUFFER + + LD y4, 4 * SIZE(Y) + LD y5, 5 * SIZE(Y) + LD y6, 6 * SIZE(Y) + LD y7, 7 * SIZE(Y) + + ADD a0, y0, a0 + ADD a1, y1, a1 + ADD a2, y2, a2 + ADD a3, y3, a3 + + ST a0, 0 * SIZE(Y1) + ADD a4, y4, a4 + ST a1, 1 * SIZE(Y1) + ADD a5, y5, a5 + addq Y1, INCY, Y1 + + ST a2, 0 * SIZE(Y1) + ADD a6, y6, a6 + ST a3, 1 * SIZE(Y1) + ADD a7, y7, a7 + addq Y1, INCY, Y1 + + ST a4, 0 * SIZE(Y1) + ST a5, 1 * SIZE(Y1) + addq Y1, INCY, Y1 + ST a6, 0 * SIZE(Y1) + ST a7, 1 * SIZE(Y1) + addq Y1, INCY, Y1 + + lda I, -1(I) + lda Y, 8 * SIZE(Y) + bgt I, $L992 + .align 4 + +$L995: + and M, 3, I + ble I, $L999 + .align 4 + +$L996: + LD a0, 0 * SIZE(BUFFER) + LD a1, 1 * SIZE(BUFFER) + addq BUFFER, INCY, BUFFER + + LD y0, 0 * SIZE(Y) + LD y1, 1 * SIZE(Y) + lda Y, 2 * SIZE(Y) + + ADD a0, y0, a0 + ADD a1, y1, a1 + + ST a0, 0 * SIZE(Y1) + ST a1, 1 * SIZE(Y1) + addq Y1, INCY, Y1 + + lda I, -1(I) + bgt I, $L996 + .align 4 + +$L999: + ldt $f2, 0($sp) + ldt $f3, 8($sp) + ldt $f4, 16($sp) + ldt $f5, 24($sp) + ldt $f6, 32($sp) + ldt $f7, 40($sp) + ldt $f8, 48($sp) + ldt $f9, 56($sp) + + lda $sp, STACKSIZE($sp) + ret + EPILOGUE diff --git a/kernel/alpha/zgemv_t.S b/kernel/alpha/zgemv_t.S new file mode 100644 index 0000000000..bac56eb3fe --- /dev/null +++ b/kernel/alpha/zgemv_t.S @@ -0,0 +1,922 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define STACKSIZE 64 +#define PREFETCHSIZE 32 + +#define M $16 +#define N $17 +#define A $21 +#define LDA $18 + +#define X $19 +#define INCX $20 +#define Y $22 +#define INCY $23 + +#define BUFFER $24 + +#define I $25 +#define J $27 + +#define X1 $3 +#define Y1 $4 +#define A1 $5 +#define A2 $6 + +#define alpha_r $f19 +#define alpha_i $f20 + +#define s0 $f0 +#define s1 $f1 +#define s2 $f10 +#define s3 $f11 + +#define t0 $f12 +#define t1 $f13 +#define t2 $f14 +#define t3 $f15 + +#define x0 $f16 +#define x1 $f17 +#define x2 $f18 +#define x3 $f21 + +#define a0 $f22 +#define a1 $f23 +#define a2 $f24 +#define a3 $f25 +#define a4 $f26 +#define a5 $f27 +#define a6 $f28 +#define a7 $f29 + +#define a8 $f2 +#define a9 $f3 +#define a10 $f4 +#define a11 $f5 +#define a12 $f6 +#define a13 $f7 +#define a14 $f8 +#define a15 $f9 + +#if !defined(CONJ) && !defined(XCONJ) +#define ADD1 ADD +#define ADD2 ADD +#define ADD3 SUB +#define ADD4 ADD +#elif !defined(CONJ) && defined(XCONJ) +#define ADD1 ADD +#define ADD2 ADD +#define ADD3 ADD +#define ADD4 SUB +#elif defined(CONJ) && !defined(XCONJ) +#define ADD1 ADD +#define ADD2 SUB +#define ADD3 ADD +#define ADD4 ADD +#else +#define ADD1 ADD +#define ADD2 SUB +#define ADD3 SUB +#define ADD4 SUB +#endif + + PROLOGUE + + lda $sp, -STACKSIZE($sp) + ldq LDA, 0 + STACKSIZE($sp) + ldq X, 8 + STACKSIZE($sp) + ldq INCX, 16 + STACKSIZE($sp) + ldq Y, 24 + STACKSIZE($sp) + ldq INCY, 32 + STACKSIZE($sp) + ldq BUFFER, 40 + STACKSIZE($sp) + + stt $f2, 0($sp) + stt $f3, 8($sp) + stt $f4, 16($sp) + stt $f5, 24($sp) + stt $f6, 32($sp) + stt $f7, 40($sp) + stt $f8, 48($sp) + stt $f9, 56($sp) + + PROFCODE + + cmple M, 0, $0 + sll INCX, ZBASE_SHIFT, INCX + cmple N, 0, $1 + sll INCY, ZBASE_SHIFT, INCY + + or $0, $1, $0 + bne $0, $L999 + + cmpeq INCX, 2 * SIZE, $0 + mov X, X1 + sll LDA, ZBASE_SHIFT,LDA + bne $0, $L10 + + sra M, 2, I + mov BUFFER, Y1 + mov BUFFER, X + ble I, $L05 + .align 4 + +$L02: + ldl $31, (PREFETCHSIZE + 0) * SIZE(X1) + lda I, -1(I) + + LD a0, 0 * SIZE(X1) + LD a1, 1 * SIZE(X1) + addq X1, INCX, X1 + LD a2, 0 * SIZE(X1) + LD a3, 1 * SIZE(X1) + addq X1, INCX, X1 + + ST a0, 0 * SIZE(Y1) + ST a1, 1 * SIZE(Y1) + ST a2, 2 * SIZE(Y1) + ST a3, 3 * SIZE(Y1) + + LD a4, 0 * SIZE(X1) + LD a5, 1 * SIZE(X1) + addq X1, INCX, X1 + 
LD a6, 0 * SIZE(X1) + LD a7, 1 * SIZE(X1) + addq X1, INCX, X1 + + ST a4, 4 * SIZE(Y1) + ST a5, 5 * SIZE(Y1) + ST a6, 6 * SIZE(Y1) + ST a7, 7 * SIZE(Y1) + + lda Y1, 8 * SIZE(Y1) + bgt I, $L02 + .align 4 + +$L05: + and M, 3, I + ble I, $L10 + .align 4 + +$L06: + LD a0, 0 * SIZE(X1) + LD a1, 1 * SIZE(X1) + addq X1, INCX, X1 + + ST a0, 0 * SIZE(Y1) + ST a1, 1 * SIZE(Y1) + lda Y1, 2 * SIZE(Y1) + + lda I, -1(I) + bgt I, $L06 + .align 4 + +$L10: + mov Y, Y1 + fclr t0 + unop + fclr t1 + + sra N, 1, J + fclr t2 + fclr t3 + ble J, $L20 + .align 4 + +$L11: + mov A, A1 + fclr s0 + addq A, LDA, A2 + fclr s1 + + addq A2, LDA, A + unop + mov X, X1 + lds $f31, 3 * SIZE(Y) + + sra M, 2, I + fclr s2 + fclr s3 + ble I, $L15 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 0 * SIZE(A2) + LD a3, 1 * SIZE(A2) + LD a4, 2 * SIZE(A1) + LD a5, 3 * SIZE(A1) + LD a6, 2 * SIZE(A2) + LD a7, 3 * SIZE(A2) + + LD a8, 4 * SIZE(A1) + LD a9, 5 * SIZE(A1) + LD a10, 4 * SIZE(A2) + LD a11, 5 * SIZE(A2) + LD a12, 6 * SIZE(A1) + LD a13, 7 * SIZE(A1) + LD a14, 6 * SIZE(A2) + LD a15, 7 * SIZE(A2) + + LD x0, 0 * SIZE(X1) + LD x1, 1 * SIZE(X1) + LD x2, 2 * SIZE(X1) + + lda I, -1(I) + ble I, $L13 + .align 4 + +$L12: + ADD3 s0, t0, s0 + unop + MUL x0, a0, t0 + LD x3, 3 * SIZE(X1) + + ADD4 s1, t1, s1 + ldl $31, (PREFETCHSIZE + 0) * SIZE(A1) + MUL x0, a1, t1 + unop + + ADD3 s2, t2, s2 + unop + MUL x0, a2, t2 + unop + + ADD4 s3, t3, s3 + unop + MUL x0, a3, t3 + LD x0, 4 * SIZE(X1) + + ADD1 s0, t0, s0 + unop + MUL x1, a1, t0 + LD a1, 9 * SIZE(A1) + + ADD2 s1, t1, s1 + unop + MUL x1, a0, t1 + LD a0, 8 * SIZE(A1) + + ADD1 s2, t2, s2 + unop + MUL x1, a3, t2 + LD a3, 9 * SIZE(A2) + + ADD2 s3, t3, s3 + unop + MUL x1, a2, t3 + LD a2, 8 * SIZE(A2) + + ADD3 s0, t0, s0 + unop + MUL x2, a4, t0 + LD x1, 5 * SIZE(X1) + + ADD4 s1, t1, s1 + MUL x2, a5, t1 + ADD3 s2, t2, s2 + MUL x2, a6, t2 + + ADD4 s3, t3, s3 + unop + MUL x2, a7, t3 + LD x2, 6 * SIZE(X1) + + ADD1 s0, t0, s0 + unop + MUL x3, a5, t0 + LD a5, 11 * SIZE(A1) + + ADD2 s1, t1, s1 + unop + MUL x3, a4, t1 + LD a4, 10 * SIZE(A1) + + ADD1 s2, t2, s2 + unop + MUL x3, a7, t2 + LD a7, 11 * SIZE(A2) + + ADD2 s3, t3, s3 + unop + MUL x3, a6, t3 + LD a6, 10 * SIZE(A2) + + ADD3 s0, t0, s0 + unop + MUL x0, a8, t0 + LD x3, 7 * SIZE(X1) + + ADD4 s1, t1, s1 + ldl $31, (PREFETCHSIZE + 0) * SIZE(A2) + MUL x0, a9, t1 + unop + + ADD3 s2, t2, s2 + lda I, -1(I) + MUL x0, a10, t2 + unop + + ADD4 s3, t3, s3 + unop + MUL x0, a11, t3 + LD x0, 8 * SIZE(X1) + + ADD1 s0, t0, s0 + unop + MUL x1, a9, t0 + LD a9, 13 * SIZE(A1) + + ADD2 s1, t1, s1 + unop + MUL x1, a8, t1 + LD a8, 12 * SIZE(A1) + + ADD1 s2, t2, s2 + lda A1, 8 * SIZE(A1) + MUL x1, a11, t2 + LD a11, 13 * SIZE(A2) + + ADD2 s3, t3, s3 + unop + MUL x1, a10, t3 + LD a10, 12 * SIZE(A2) + + ADD3 s0, t0, s0 + unop + MUL x2, a12, t0 + LD x1, 9 * SIZE(X1) + + ADD4 s1, t1, s1 + ldl $31, (PREFETCHSIZE + 0) * SIZE(X1) + MUL x2, a13, t1 + lda A2, 8 * SIZE(A2) + + ADD3 s2, t2, s2 + unop + MUL x2, a14, t2 + unop + + ADD4 s3, t3, s3 + unop + MUL x2, a15, t3 + LD x2, 10 * SIZE(X1) + + ADD1 s0, t0, s0 + unop + MUL x3, a13, t0 + LD a13, 7 * SIZE(A1) + + ADD2 s1, t1, s1 + lda X1, 8 * SIZE(X1) + MUL x3, a12, t1 + LD a12, 6 * SIZE(A1) + + ADD1 s2, t2, s2 + unop + MUL x3, a15, t2 + LD a15, 7 * SIZE(A2) + + ADD2 s3, t3, s3 + MUL x3, a14, t3 + LD a14, 6 * SIZE(A2) + bgt I, $L12 + .align 4 + +$L13: + ADD3 s0, t0, s0 + unop + MUL x0, a0, t0 + LD x3, 3 * SIZE(X1) + + ADD4 s1, t1, s1 + MUL x0, a1, t1 + ADD3 s2, t2, s2 + MUL x0, a2, t2 + + ADD4 s3, t3, s3 + unop + MUL x0, a3, t3 + LD 
x0, 4 * SIZE(X1) + + ADD1 s0, t0, s0 + MUL x1, a1, t0 + ADD2 s1, t1, s1 + MUL x1, a0, t1 + + ADD1 s2, t2, s2 + unop + MUL x1, a3, t2 + unop + + ADD2 s3, t3, s3 + lda A1, 8 * SIZE(A1) + MUL x1, a2, t3 + LD x1, 5 * SIZE(X1) + + ADD3 s0, t0, s0 + MUL x2, a4, t0 + ADD4 s1, t1, s1 + MUL x2, a5, t1 + + ADD3 s2, t2, s2 + unop + MUL x2, a6, t2 + unop + + ADD4 s3, t3, s3 + lda A2, 8 * SIZE(A2) + MUL x2, a7, t3 + LD x2, 6 * SIZE(X1) + + ADD1 s0, t0, s0 + MUL x3, a5, t0 + ADD2 s1, t1, s1 + MUL x3, a4, t1 + + ADD1 s2, t2, s2 + unop + MUL x3, a7, t2 + lda X1, 8 * SIZE(X1) + + ADD2 s3, t3, s3 + unop + MUL x3, a6, t3 + LD x3, -1 * SIZE(X1) + + ADD3 s0, t0, s0 + MUL x0, a8, t0 + ADD4 s1, t1, s1 + MUL x0, a9, t1 + + ADD3 s2, t2, s2 + MUL x0, a10, t2 + ADD4 s3, t3, s3 + MUL x0, a11, t3 + + ADD1 s0, t0, s0 + MUL x1, a9, t0 + ADD2 s1, t1, s1 + MUL x1, a8, t1 + + ADD1 s2, t2, s2 + MUL x1, a11, t2 + ADD2 s3, t3, s3 + MUL x1, a10, t3 + + ADD3 s0, t0, s0 + MUL x2, a12, t0 + ADD4 s1, t1, s1 + MUL x2, a13, t1 + + ADD3 s2, t2, s2 + MUL x2, a14, t2 + ADD4 s3, t3, s3 + MUL x2, a15, t3 + + ADD1 s0, t0, s0 + MUL x3, a13, t0 + ADD2 s1, t1, s1 + MUL x3, a12, t1 + + ADD1 s2, t2, s2 + MUL x3, a15, t2 + ADD2 s3, t3, s3 + MUL x3, a14, t3 + .align 4 + +$L15: + and M, 3, I + ble I, $L18 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a2, 0 * SIZE(A2) + LD a3, 1 * SIZE(A2) + + LD x0, 0 * SIZE(X1) + + lda I, -1(I) + ble I, $L17 + .align 4 + +$L16: + ADD3 s0, t0, s0 + lda I, -1(I) + MUL x0, a0, t0 + LD x1, 1 * SIZE(X1) + + ADD4 s1, t1, s1 + MUL x0, a1, t1 + ADD3 s2, t2, s2 + MUL x0, a2, t2 + + ADD4 s3, t3, s3 + unop + MUL x0, a3, t3 + LD x0, 2 * SIZE(X1) + + ADD1 s0, t0, s0 + lda A2, 2 * SIZE(A2) + MUL x1, a1, t0 + LD a1, 3 * SIZE(A1) + + ADD2 s1, t1, s1 + lda X1, 2 * SIZE(X1) + MUL x1, a0, t1 + LD a0, 2 * SIZE(A1) + + ADD1 s2, t2, s2 + lda A1, 2 * SIZE(A1) + MUL x1, a3, t2 + LD a3, 1 * SIZE(A2) + + ADD2 s3, t3, s3 + MUL x1, a2, t3 + LD a2, 0 * SIZE(A2) + bgt I, $L16 + .align 4 + +$L17: + ADD3 s0, t0, s0 + unop + MUL x0, a0, t0 + LD x1, 1 * SIZE(X1) + + ADD4 s1, t1, s1 + unop + MUL x0, a1, t1 + unop + + ADD3 s2, t2, s2 + MUL x0, a2, t2 + ADD4 s3, t3, s3 + MUL x0, a3, t3 + + ADD1 s0, t0, s0 + MUL x1, a1, t0 + ADD2 s1, t1, s1 + MUL x1, a0, t1 + + ADD1 s2, t2, s2 + MUL x1, a3, t2 + ADD2 s3, t3, s3 + MUL x1, a2, t3 + .align 4 + +$L18: + LD a0, 0 * SIZE(Y) + unop + LD a1, 1 * SIZE(Y) + addq Y, INCY, Y + + LD a2, 0 * SIZE(Y) + unop + LD a3, 1 * SIZE(Y) + addq Y, INCY, Y + + ADD3 s0, t0, s0 + ADD4 s1, t1, s1 + ADD3 s2, t2, s2 + ADD4 s3, t3, s3 + + MUL alpha_r, s0, t0 + MUL alpha_r, s1, t1 + MUL alpha_r, s2, t2 + MUL alpha_r, s3, t3 + + ADD a0, t0, a0 + MUL alpha_i, s1, t0 + ADD a1, t1, a1 + MUL alpha_i, s0, t1 + ADD a2, t2, a2 + MUL alpha_i, s3, t2 + ADD a3, t3, a3 + MUL alpha_i, s2, t3 + + SUB a0, t0, a0 + ADD a1, t1, a1 + SUB a2, t2, a2 + ADD a3, t3, a3 + + ST a0, 0 * SIZE(Y1) + fclr t0 + ST a1, 1 * SIZE(Y1) + addq Y1, INCY, Y1 + + ST a2, 0 * SIZE(Y1) + fclr t1 + ST a3, 1 * SIZE(Y1) + addq Y1, INCY, Y1 + + fclr t2 + lda J, -1(J) + fclr t3 + bgt J, $L11 + .align 4 + +$L20: + blbc N, $L999 + + mov A, A1 + fclr s0 + fclr s1 + mov X, X1 + + sra M, 2, I + fclr s2 + fclr s3 + ble I, $L25 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + LD a4, 2 * SIZE(A1) + LD a5, 3 * SIZE(A1) + LD a8, 4 * SIZE(A1) + LD a9, 5 * SIZE(A1) + LD a12, 6 * SIZE(A1) + LD a13, 7 * SIZE(A1) + + LD x0, 0 * SIZE(X1) + LD x1, 1 * SIZE(X1) + LD x2, 2 * SIZE(X1) + + lda I, -1(I) + ble I, $L23 + .align 4 + +$L22: + ADD3 s0, t0, s0 + ldl $31, (PREFETCHSIZE + 0) * SIZE(A1) 
+ MUL x0, a0, t0 + LD x3, 3 * SIZE(X1) + + ADD4 s1, t1, s1 + unop + MUL x0, a1, t1 + LD x0, 4 * SIZE(X1) + + ADD1 s2, t0, s2 + lda I, -1(I) + MUL x1, a1, t0 + LD a1, 9 * SIZE(A1) + + ADD2 s3, t1, s3 + unop + MUL x1, a0, t1 + LD a0, 8 * SIZE(A1) + + ADD3 s0, t0, s0 + unop + MUL x2, a4, t0 + LD x1, 5 * SIZE(X1) + + ADD4 s1, t1, s1 + unop + MUL x2, a5, t1 + LD x2, 6 * SIZE(X1) + + ADD1 s2, t0, s2 + unop + MUL x3, a5, t0 + LD a5, 11 * SIZE(A1) + + ADD2 s3, t1, s3 + unop + MUL x3, a4, t1 + LD a4, 10 * SIZE(A1) + + ADD3 s0, t0, s0 + unop + MUL x0, a8, t0 + LD x3, 7 * SIZE(X1) + + ADD4 s1, t1, s1 + unop + MUL x0, a9, t1 + LD x0, 8 * SIZE(X1) + + ADD1 s2, t0, s2 + unop + MUL x1, a9, t0 + LD a9, 13 * SIZE(A1) + + ADD2 s3, t1, s3 + unop + MUL x1, a8, t1 + LD a8, 12 * SIZE(A1) + + ADD3 s0, t0, s0 + unop + MUL x2, a12, t0 + LD x1, 9 * SIZE(X1) + + ADD4 s1, t1, s1 + lda A1, 8 * SIZE(A1) + MUL x2, a13, t1 + LD x2, 10 * SIZE(X1) + + ADD1 s2, t0, s2 + lda X1, 8 * SIZE(X1) + MUL x3, a13, t0 + LD a13, 7 * SIZE(A1) + + ADD2 s3, t1, s3 + MUL x3, a12, t1 + LD a12, 6 * SIZE(A1) + bgt I, $L22 + .align 4 + +$L23: + ADD3 s0, t0, s0 + unop + MUL x0, a0, t0 + LD x3, 3 * SIZE(X1) + + ADD4 s1, t1, s1 + unop + MUL x0, a1, t1 + LD x0, 4 * SIZE(X1) + + ADD1 s2, t0, s2 + unop + MUL x1, a1, t0 + lda A1, 8 * SIZE(A1) + + ADD2 s3, t1, s3 + unop + MUL x1, a0, t1 + LD x1, 5 * SIZE(X1) + + ADD3 s0, t0, s0 + unop + MUL x2, a4, t0 + unop + + ADD4 s1, t1, s1 + unop + MUL x2, a5, t1 + LD x2, 6 * SIZE(X1) + + ADD1 s2, t0, s2 + unop + MUL x3, a5, t0 + lda X1, 8 * SIZE(X1) + + ADD2 s3, t1, s3 + unop + MUL x3, a4, t1 + LD x3, -1 * SIZE(X1) + + ADD3 s0, t0, s0 + MUL x0, a8, t0 + ADD4 s1, t1, s1 + MUL x0, a9, t1 + + ADD1 s2, t0, s2 + MUL x1, a9, t0 + ADD2 s3, t1, s3 + MUL x1, a8, t1 + + ADD3 s0, t0, s0 + MUL x2, a12, t0 + ADD4 s1, t1, s1 + MUL x2, a13, t1 + + ADD1 s2, t0, s2 + MUL x3, a13, t0 + ADD2 s3, t1, s3 + MUL x3, a12, t1 + .align 4 + +$L25: + and M, 3, I + ble I, $L28 + + LD a0, 0 * SIZE(A1) + LD a1, 1 * SIZE(A1) + + LD x0, 0 * SIZE(X1) + + lda I, -1(I) + ble I, $L27 + .align 4 + +$L26: + ADD3 s0, t0, s0 + lda A1, 2 * SIZE(A1) + MUL x0, a0, t0 + LD x1, 1 * SIZE(X1) + + ADD4 s1, t1, s1 + lda I, -1(I) + MUL x0, a1, t1 + LD x0, 2 * SIZE(X1) + + ADD1 s0, t0, s0 + lda X1, 2 * SIZE(X1) + MUL x1, a1, t0 + LD a1, 1 * SIZE(A1) + + ADD2 s1, t1, s1 + MUL x1, a0, t1 + LD a0, 0 * SIZE(A1) + bgt I, $L26 + .align 4 + +$L27: + ADD3 s0, t0, s0 + unop + MUL x0, a0, t0 + LD x1, 1 * SIZE(X1) + + ADD4 s1, t1, s1 + unop + MUL x0, a1, t1 + unop + + ADD1 s0, t0, s0 + MUL x1, a1, t0 + ADD2 s1, t1, s1 + MUL x1, a0, t1 + .align 4 + +$L28: + LD a0, 0 * SIZE(Y) + LD a1, 1 * SIZE(Y) + + ADD3 s0, t0, s0 + ADD4 s1, t1, s1 + ADD3 s2, t2, s2 + ADD4 s3, t3, s3 + + ADD s0, s2, s0 + ADD s1, s3, s1 + + MUL alpha_r, s0, t0 + MUL alpha_r, s1, t1 + + ADD a0, t0, a0 + MUL alpha_i, s1, t0 + ADD a1, t1, a1 + MUL alpha_i, s0, t1 + + SUB a0, t0, a0 + ADD a1, t1, a1 + + ST a0, 0 * SIZE(Y1) + ST a1, 1 * SIZE(Y1) + .align 4 + +$L999: + ldt $f2, 0($sp) + ldt $f3, 8($sp) + ldt $f4, 16($sp) + ldt $f5, 24($sp) + ldt $f6, 32($sp) + ldt $f7, 40($sp) + ldt $f8, 48($sp) + ldt $f9, 56($sp) + + lda $sp, STACKSIZE($sp) + ret + EPILOGUE diff --git a/kernel/alpha/znrm2.S b/kernel/alpha/znrm2.S new file mode 100644 index 0000000000..03343b2aec --- /dev/null +++ b/kernel/alpha/znrm2.S @@ -0,0 +1,426 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER + +#include "common.h" +#include "version.h" + +#define PREFETCH_SIZE 80 + +#define N $16 +#define X $17 +#define INCX $18 +#define XX $19 + +#define I $0 + +#define a0 $f0 +#define a1 $f1 +#define a2 $f10 +#define a3 $f11 +#define t0 $f12 +#define t1 $f13 +#define t2 $f14 +#define t3 $f15 + +#define x0 $f16 +#define x1 $f17 +#define x2 $f18 +#define x3 $f19 +#define x4 $f20 +#define x5 $f21 +#define x6 $f22 +#define x7 $f23 + + PROLOGUE + +#if defined(EV4) || defined(EV5) + .frame $30,16,$26,0 + .mask 0x4000000,-16 + ldah $29, 0($27) !gpdisp!1 + lda $29, 0($29) !gpdisp!1 + + lda $sp, -16($sp) + ldq $27, sqrt($29) !literal!2 + stq $26, 0($sp) + + PROFCODE + .prologue 1 +#else + PROFCODE +#endif + + fclr a0 + sll INCX, ZBASE_SHIFT, INCX + fclr a1 + ble N, $L999 + + fclr a2 + cmpeq INCX, 2 * SIZE, $0 + fclr a3 + beq $0, $L20 + + fclr t0 + sra N, 3, I + fclr t1 + ble I, $L15 + + fclr t2 + LD x0, 0 * SIZE(X) + fclr t3 + LD x1, 1 * SIZE(X) + + LD x2, 2 * SIZE(X) + LD x3, 3 * SIZE(X) + LD x4, 4 * SIZE(X) + LD x5, 5 * SIZE(X) + LD x6, 6 * SIZE(X) + LD x7, 7 * SIZE(X) + + lda I, -1(I) + ble I, $L12 + .align 4 + +$L11: + addt a0, t0, a0 + ldl $31, (PREFETCH_SIZE) * SIZE(X) + mult x0, x0, t0 + LD x0, 8 * SIZE(X) + + addt a1, t1, a1 + mov X, XX + mult x1, x1, t1 + LD x1, 9 * SIZE(X) + + addt a2, t2, a2 + unop + mult x2, x2, t2 + LD x2, 10 * SIZE(X) + + addt a3, t3, a3 + unop + mult x3, x3, t3 + LD x3, 11 * SIZE(X) + + addt a0, t0, a0 + unop + mult x4, x4, t0 + LD x4, 12 * SIZE(X) + + addt a1, t1, a1 + unop + mult x5, x5, t1 + LD x5, 13 * SIZE(X) + + addt a2, t2, a2 + unop + mult x6, x6, t2 + LD x6, 14 * SIZE(X) + + addt a3, t3, a3 + unop + mult x7, x7, t3 + LD x7, 15 * SIZE(X) + + addt a0, t0, a0 + unop + mult x0, x0, t0 + LD x0, 16 * 
SIZE(X) + + addt a1, t1, a1 + lda X, 16 * SIZE(X) + mult x1, x1, t1 + LD x1, 17 * SIZE(XX) + + addt a2, t2, a2 + unop + mult x2, x2, t2 + LD x2, 18 * SIZE(XX) + + addt a3, t3, a3 + unop + mult x3, x3, t3 + LD x3, 19 * SIZE(XX) + + addt a0, t0, a0 + unop + mult x4, x4, t0 + LD x4, 20 * SIZE(XX) + + addt a1, t1, a1 + lda I, -1(I) + mult x5, x5, t1 + LD x5, 21 * SIZE(XX) + + addt a2, t2, a2 + unop + mult x6, x6, t2 + LD x6, 22 * SIZE(XX) + + addt a3, t3, a3 + mult x7, x7, t3 + LD x7, 23 * SIZE(XX) + bgt I, $L11 + .align 4 + +$L12: + addt a0, t0, a0 + mov X, XX + mult x0, x0, t0 + LD x0, 8 * SIZE(X) + + addt a1, t1, a1 + unop + mult x1, x1, t1 + LD x1, 9 * SIZE(X) + + addt a2, t2, a2 + unop + mult x2, x2, t2 + LD x2, 10 * SIZE(X) + + addt a3, t3, a3 + unop + mult x3, x3, t3 + LD x3, 11 * SIZE(X) + + addt a0, t0, a0 + unop + mult x4, x4, t0 + LD x4, 12 * SIZE(XX) + + addt a1, t1, a1 + unop + mult x5, x5, t1 + LD x5, 13 * SIZE(XX) + + addt a2, t2, a2 + unop + mult x6, x6, t2 + LD x6, 14 * SIZE(XX) + + addt a3, t3, a3 + lda X, 16 * SIZE(X) + mult x7, x7, t3 + LD x7, 15 * SIZE(XX) + + addt a0, t0, a0 + mult x0, x0, t0 + addt a1, t1, a1 + mult x1, x1, t1 + + addt a2, t2, a2 + mult x2, x2, t2 + addt a3, t3, a3 + mult x3, x3, t3 + + addt a0, t0, a0 + mult x4, x4, t0 + addt a1, t1, a1 + mult x5, x5, t1 + + addt a2, t2, a2 + mult x6, x6, t2 + addt a3, t3, a3 + mult x7, x7, t3 + + addt a2, t2, a2 + addt a3, t3, a3 + .align 4 + +$L15: + and N, 7, I + ble I, $L998 + .align 4 + +$L16: + LD x0, 0 * SIZE(X) + LD x1, 1 * SIZE(X) + + lda X, 2 * SIZE(X) + + addt a0, t0, a0 + mult x0, x0, t0 + addt a1, t1, a1 + mult x1, x1, t1 + + lda I, -1(I) + bgt I, $L16 + bsr $31, $L998 + .align 4 + +$L20: + fclr t0 + sra N, 2, I + fclr t1 + ble I, $L25 + + LD x0, 0 * SIZE(X) + fclr t2 + LD x1, 1 * SIZE(X) + addq X, INCX, X + LD x2, 0 * SIZE(X) + fclr t3 + LD x3, 1 * SIZE(X) + addq X, INCX, X + + LD x4, 0 * SIZE(X) + lda I, -1(I) + LD x5, 1 * SIZE(X) + addq X, INCX, X + + LD x6, 0 * SIZE(X) + ble I, $L22 + .align 4 + +$L21: + addt a0, t0, a0 + LD x7, 1 * SIZE(X) + mult x0, x0, t0 + addq X, INCX, X + + addt a1, t1, a1 + LD x0, 0 * SIZE(X) + mult x1, x1, t1 + unop + + addt a2, t2, a2 + LD x1, 1 * SIZE(X) + mult x2, x2, t2 + addq X, INCX, X + + addt a3, t3, a3 + LD x2, 0 * SIZE(X) + mult x3, x3, t3 + unop + + addt a0, t0, a0 + LD x3, 1 * SIZE(X) + mult x4, x4, t0 + addq X, INCX, X + + addt a1, t1, a1 + LD x4, 0 * SIZE(X) + mult x5, x5, t1 + lda I, -1(I) + + addt a2, t2, a2 + LD x5, 1 * SIZE(X) + mult x6, x6, t2 + addq X, INCX, X + + addt a3, t3, a3 + LD x6, 0 * SIZE(X) + mult x7, x7, t3 + bgt I, $L21 + .align 4 + +$L22: + addt a0, t0, a0 + LD x7, 1 * SIZE(X) + mult x0, x0, t0 + addq X, INCX, X + + addt a1, t1, a1 + mult x1, x1, t1 + addt a2, t2, a2 + mult x2, x2, t2 + + addt a3, t3, a3 + mult x3, x3, t3 + addt a0, t0, a0 + mult x4, x4, t0 + + addt a1, t1, a1 + mult x5, x5, t1 + addt a2, t2, a2 + mult x6, x6, t2 + + addt a3, t3, a3 + mult x7, x7, t3 + addt a2, t2, a2 + addt a3, t3, a3 + .align 4 + +$L25: + and N, 3, I + ble I, $L998 + .align 4 + +$L26: + LD x0, 0 * SIZE(X) + lda I, -1(I) + LD x1, 1 * SIZE(X) + addq X, INCX, X + + addt a0, t0, a0 + mult x0, x0, t0 + addt a1, t1, a1 + mult x1, x1, t1 + + bgt I, $L26 + .align 4 + + +$L998: + addt a0, t0, a0 + addt a1, t1, a1 + + addt a0, a1, a0 + addt a2, a3, a2 + +#if defined(EV4) || defined(EV5) + addt a0, a2, $f16 + jsr $26, ($27), sqrt !lituse_jsr!2 + + ldah $29, 0($26) !gpdisp!3 + lda $29, 0($29) !gpdisp!3 +#else + addt a0, a2, a0 + sqrtt a0, a0 +#endif + .align 4 + +$L999: 
+#if defined(EV4) || defined(EV5) + ldq $26, 0($sp) + lda $sp, 16($sp) +#endif + ret + EPILOGUE diff --git a/kernel/alpha/zrot.S b/kernel/alpha/zrot.S new file mode 100644 index 0000000000..afcdf12b4d --- /dev/null +++ b/kernel/alpha/zrot.S @@ -0,0 +1,631 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define N $16 +#define X $17 +#define INCX $18 +#define Y $19 +#define INCY $20 +#define I $21 +#define XX $23 +#define YY $24 + +#define C $f10 +#define S $f11 + +#define PREFETCH_SIZE 80 + + PROLOGUE + PROFCODE + .frame $sp, 0, $26, 0 + +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + fmov $f21, C + LD S, 0($sp) + + addq INCX, INCX, INCX + addq INCY, INCY, INCY + + cmpeq INCX, 2, $23 + cmpeq INCY, 2, $24 + ble N, $L998 + + and $23, $24, $23 + beq $23, $L50 + + sra N, 2, I + ble I, $L15 + + LD $f12, 0*SIZE(X) + LD $f13, 0*SIZE(Y) + LD $f14, 1*SIZE(X) + LD $f15, 1*SIZE(Y) + + LD $f16, 2*SIZE(X) + LD $f17, 2*SIZE(Y) + LD $f18, 3*SIZE(X) + LD $f19, 3*SIZE(Y) + + MUL C, $f12, $f21 + unop + MUL S, $f13, $f22 + MUL C, $f13, $f23 + + LD $f13, 4*SIZE(Y) + MUL S, $f12, $f24 + LD $f12, 4*SIZE(X) + MUL C, $f14, $f25 + + lda I, -1(I) + MUL S, $f15, $f26 + ADD $f21, $f22, $f22 + MUL C, $f15, $f27 + + LD $f15, 5*SIZE(Y) + MUL S, $f14, $f28 + SUB $f23, $f24, $f24 + ble I, $L13 + .align 4 + +$L12: + MUL C, $f16, $f21 + lds $f31, (PREFETCH_SIZE) * SIZE(X) + unop + LD $f14, 5*SIZE(X) + + ST $f22, 0*SIZE(X) + MUL S, $f17, $f22 + unop + ADD $f25, $f26, $f26 + + MUL C, $f17, $f23 + lds $f31, (PREFETCH_SIZE) * SIZE(Y) + unop + LD $f17, 6*SIZE(Y) + + ST $f24, 0*SIZE(Y) + MUL S, $f16, $f24 + unop + SUB $f27, $f28, $f28 + + MUL C, $f18, $f25 + LD $f16, 6*SIZE(X) + unop + unop + + ST $f26, 1*SIZE(X) + MUL S, $f19, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f19, $f27 + unop + unop + LD $f19, 7*SIZE(Y) + + ST $f28, 1*SIZE(Y) + MUL S, $f18, $f28 + unop + SUB $f23, $f24, $f24 + + MUL C, $f12, $f21 + LD $f18, 7*SIZE(X) + unop + unop + + ST $f22, 2*SIZE(X) + unop + MUL S, $f13, $f22 + ADD $f25, $f26, $f26 + + MUL C, $f13, $f23 + LD $f13, 8*SIZE(Y) + unop + unop + + ST $f24, 2*SIZE(Y) + MUL S, $f12, $f24 + unop + SUB $f27, $f28, $f28 + + MUL C, $f14, $f25 + LD $f12, 8*SIZE(X) + unop + unop + + ST $f26, 3*SIZE(X) + MUL S, $f15, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f15, $f27 + LD $f15, 9*SIZE(Y) + unop + unop + + ST $f28, 3*SIZE(Y) + MUL S, $f14, $f28 + unop + SUB $f23, $f24, $f24 + + MUL C, $f16, $f21 + LD $f14, 9*SIZE(X) + unop + unop + + ST $f22, 4*SIZE(X) + MUL S, $f17, $f22 + unop + ADD $f25, $f26, $f26 + + MUL C, $f17, $f23 + LD $f17, 10*SIZE(Y) + unop + unop + + ST $f24, 4*SIZE(Y) + MUL S, $f16, $f24 + unop + SUB $f27, $f28, $f28 + + MUL C, $f18, $f25 + LD $f16, 10*SIZE(X) + unop + unop + + ST $f26, 5*SIZE(X) + MUL S, $f19, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f19, $f27 + LD $f19, 11*SIZE(Y) + unop + unop + + ST $f28, 5*SIZE(Y) + MUL S, $f18, $f28 + lda I, -1(I) + SUB $f23, $f24, $f24 + + MUL C, $f12, $f21 + LD $f18, 11*SIZE(X) + unop + unop + + ST $f22, 6*SIZE(X) + MUL S, $f13, $f22 + unop + ADD $f25, $f26, $f26 + + MUL C, $f13, $f23 + LD $f13, 12*SIZE(Y) + lda X, 8*SIZE(X) + unop + + ST $f24, 6*SIZE(Y) + MUL S, $f12, $f24 + unop + SUB $f27, $f28, $f28 + + MUL C, $f14, $f25 + LD $f12, 4*SIZE(X) + lda Y, 8*SIZE(Y) + unop + + ST $f26, -1*SIZE(X) + MUL S, $f15, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f15, $f27 + LD $f15, 5*SIZE(Y) + unop + unop + + ST $f28, -1*SIZE(Y) + MUL S, $f14, $f28 + SUB $f23, $f24, $f24 + bgt I, $L12 + .align 4 + +$L13: + MUL C, $f16, $f21 + LD $f14, 5*SIZE(X) + unop + unop + + ST $f22, 0*SIZE(X) + MUL S, $f17, $f22 + unop + ADD $f25, $f26, $f26 + + MUL C, $f17, $f23 + unop + unop + LD $f17, 6*SIZE(Y) + + ST 
$f24, 0*SIZE(Y) + MUL S, $f16, $f24 + LD $f16, 6*SIZE(X) + SUB $f27, $f28, $f28 + + MUL C, $f18, $f25 + unop + unop + unop + + ST $f26, 1*SIZE(X) + MUL S, $f19, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f19, $f27 + unop + unop + LD $f19, 7*SIZE(Y) + + ST $f28, 1*SIZE(Y) + MUL S, $f18, $f28 + LD $f18, 7*SIZE(X) + SUB $f23, $f24, $f24 + + MUL C, $f12, $f21 + unop + unop + unop + + ST $f22, 2*SIZE(X) + unop + MUL S, $f13, $f22 + ADD $f25, $f26, $f26 + + MUL C, $f13, $f23 + unop + unop + unop + + ST $f24, 2*SIZE(Y) + MUL S, $f12, $f24 + unop + SUB $f27, $f28, $f28 + + MUL C, $f14, $f25 + unop + unop + unop + + ST $f26, 3*SIZE(X) + MUL S, $f15, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f15, $f27 + unop + unop + unop + + ST $f28, 3*SIZE(Y) + MUL S, $f14, $f28 + unop + SUB $f23, $f24, $f24 + + MUL C, $f16, $f21 + unop + unop + unop + + ST $f22, 4*SIZE(X) + MUL S, $f17, $f22 + unop + ADD $f25, $f26, $f26 + + MUL C, $f17, $f23 + unop + unop + unop + + ST $f24, 4*SIZE(Y) + MUL S, $f16, $f24 + unop + SUB $f27, $f28, $f28 + + MUL C, $f18, $f25 + unop + unop + unop + + ST $f26, 5*SIZE(X) + MUL S, $f19, $f26 + unop + ADD $f21, $f22, $f22 + + MUL C, $f19, $f27 + unop + unop + unop + + ST $f28, 5*SIZE(Y) + MUL S, $f18, $f28 + unop + SUB $f23, $f24, $f24 + + ST $f22, 6*SIZE(X) + ADD $f25, $f26, $f26 + ST $f24, 6*SIZE(Y) + SUB $f27, $f28, $f28 + + ST $f26, 7*SIZE(X) + lda X, 8*SIZE(X) + ST $f28, 7*SIZE(Y) + lda Y, 8*SIZE(Y) + .align 4 + + +$L15: + and N, 3, I + ble I, $L998 + .align 4 + +$L16: + LD $f12, 0*SIZE(X) + LD $f13, 0*SIZE(Y) + LD $f14, 1*SIZE(X) + LD $f15, 1*SIZE(Y) + + MUL C, $f12, $f21 + MUL S, $f13, $f22 + MUL C, $f13, $f23 + MUL S, $f12, $f24 + + ADD $f21, $f22, $f22 + SUB $f23, $f24, $f24 + + MUL C, $f14, $f25 + MUL S, $f15, $f26 + MUL C, $f15, $f27 + MUL S, $f14, $f28 + + ADD $f25, $f26, $f26 + SUB $f27, $f28, $f28 + + ST $f22, 0*SIZE(X) + ST $f24, 0*SIZE(Y) + lda I, -1(I) + + ST $f26, 1*SIZE(X) + lda X, 2 * SIZE(X) + ST $f28, 1*SIZE(Y) + lda Y, 2 * SIZE(Y) + + bgt I, $L16 + .align 4 + +$L998: + clr $0 + ret + .align 4 + +$L50: + mov X, XX + mov Y, YY + + sra N, 2, I + ble I, $L55 + .align 4 + +$L51: + LD $f12, 0*SIZE(X) + LD $f13, 0*SIZE(Y) + LD $f14, 1*SIZE(X) + SXADDQ INCX, X, X + LD $f15, 1*SIZE(Y) + SXADDQ INCY, Y, Y + + MUL C, $f12, $f21 + MUL S, $f13, $f22 + MUL C, $f13, $f23 + MUL S, $f12, $f24 + + ADD $f21, $f22, $f22 + SUB $f23, $f24, $f24 + + MUL C, $f14, $f25 + MUL S, $f15, $f26 + MUL C, $f15, $f27 + MUL S, $f14, $f28 + + ADD $f25, $f26, $f26 + SUB $f27, $f28, $f28 + + ST $f22, 0*SIZE(XX) + ST $f24, 0*SIZE(YY) + ST $f26, 1*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f28, 1*SIZE(YY) + SXADDQ INCY, YY, YY + + + LD $f12, 0*SIZE(X) + LD $f13, 0*SIZE(Y) + LD $f14, 1*SIZE(X) + SXADDQ INCX, X, X + LD $f15, 1*SIZE(Y) + SXADDQ INCY, Y, Y + + MUL C, $f12, $f21 + MUL S, $f13, $f22 + MUL C, $f13, $f23 + MUL S, $f12, $f24 + + ADD $f21, $f22, $f22 + SUB $f23, $f24, $f24 + + MUL C, $f14, $f25 + MUL S, $f15, $f26 + MUL C, $f15, $f27 + MUL S, $f14, $f28 + + ADD $f25, $f26, $f26 + SUB $f27, $f28, $f28 + + ST $f22, 0*SIZE(XX) + ST $f24, 0*SIZE(YY) + ST $f26, 1*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f28, 1*SIZE(YY) + SXADDQ INCY, YY, YY + + + LD $f12, 0*SIZE(X) + LD $f13, 0*SIZE(Y) + LD $f14, 1*SIZE(X) + SXADDQ INCX, X, X + LD $f15, 1*SIZE(Y) + SXADDQ INCY, Y, Y + + MUL C, $f12, $f21 + MUL S, $f13, $f22 + MUL C, $f13, $f23 + MUL S, $f12, $f24 + + ADD $f21, $f22, $f22 + SUB $f23, $f24, $f24 + + MUL C, $f14, $f25 + MUL S, $f15, $f26 + MUL C, $f15, $f27 + MUL S, $f14, $f28 + + ADD $f25, $f26, $f26 + 
SUB $f27, $f28, $f28 + + ST $f22, 0*SIZE(XX) + ST $f24, 0*SIZE(YY) + ST $f26, 1*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f28, 1*SIZE(YY) + SXADDQ INCY, YY, YY + + + LD $f12, 0*SIZE(X) + LD $f13, 0*SIZE(Y) + LD $f14, 1*SIZE(X) + SXADDQ INCX, X, X + LD $f15, 1*SIZE(Y) + SXADDQ INCY, Y, Y + + MUL C, $f12, $f21 + MUL S, $f13, $f22 + MUL C, $f13, $f23 + MUL S, $f12, $f24 + + ADD $f21, $f22, $f22 + SUB $f23, $f24, $f24 + + MUL C, $f14, $f25 + MUL S, $f15, $f26 + MUL C, $f15, $f27 + MUL S, $f14, $f28 + + ADD $f25, $f26, $f26 + SUB $f27, $f28, $f28 + + ST $f22, 0*SIZE(XX) + ST $f24, 0*SIZE(YY) + ST $f26, 1*SIZE(XX) + SXADDQ INCX, XX, XX + ST $f28, 1*SIZE(YY) + SXADDQ INCY, YY, YY + + lda I, -1(I) + bgt I, $L51 + .align 4 + +$L55: + and N, 3, I + ble I, $L999 + .align 4 + +$L56: + LD $f12, 0*SIZE(X) + LD $f13, 0*SIZE(Y) + LD $f14, 1*SIZE(X) + LD $f15, 1*SIZE(Y) + + MUL C, $f12, $f21 + MUL S, $f13, $f22 + MUL C, $f13, $f23 + MUL S, $f12, $f24 + + ADD $f21, $f22, $f22 + SUB $f23, $f24, $f24 + + MUL C, $f14, $f25 + MUL S, $f15, $f26 + MUL C, $f15, $f27 + MUL S, $f14, $f28 + + ADD $f25, $f26, $f26 + SUB $f27, $f28, $f28 + + ST $f22, 0*SIZE(X) + ST $f24, 0*SIZE(Y) + lda I, -1(I) + + ST $f26, 1*SIZE(X) + ST $f28, 1*SIZE(Y) + SXADDQ INCX, X, X + SXADDQ INCY, Y, Y + + bgt I, $L56 + .align 4 + +$L999: + clr $0 + ret + EPILOGUE diff --git a/kernel/alpha/zscal.S b/kernel/alpha/zscal.S new file mode 100644 index 0000000000..1a2ac10b32 --- /dev/null +++ b/kernel/alpha/zscal.S @@ -0,0 +1,255 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#define PREFETCHSIZE 88 + +#define N $16 +#define X $21 +#define INCX $17 + +#define XX $18 +#define I $19 + +#define ALPHA_R $f19 +#define ALPHA_I $f20 + +#define s0 $f0 +#define s1 $f1 +#define s2 $f10 +#define s3 $f11 + +#define a0 $f12 +#define a1 $f13 +#define a2 $f14 +#define a3 $f15 +#define a4 $f16 +#define a5 $f17 +#define a6 $f18 +#define a7 $f21 + +#define t0 $f22 +#define t1 $f23 +#define t2 $f24 +#define t3 $f25 + +#define t4 $f26 +#define t5 $f27 +#define t6 $f28 +#define t7 $f29 + + PROLOGUE + PROFCODE + + ldq INCX, 0($sp) + mov X, XX + ble N, $L999 + + addq INCX, INCX, INCX + + sra N, 2, I + ble I, $L15 + + LD a0, 0 * SIZE(X) + LD a1, 1 * SIZE(X) + SXADDQ INCX, X, X + LD a2, 0 * SIZE(X) + LD a3, 1 * SIZE(X) + SXADDQ INCX, X, X + LD a4, 0 * SIZE(X) + LD a5, 1 * SIZE(X) + SXADDQ INCX, X, X + LD a6, 0 * SIZE(X) + LD a7, 1 * SIZE(X) + SXADDQ INCX, X, X + + MUL a0, ALPHA_R, t0 + MUL a1, ALPHA_I, t1 + MUL a0, ALPHA_I, t2 + MUL a1, ALPHA_R, t3 + + SUB t0, t1, t4 + ADD t2, t3, t5 + + lda I, -1(I) + ble I, $L13 + .align 4 + +$L12: + ST t4, 0 * SIZE(XX) + MUL a2, ALPHA_R, t0 + ST t5, 1 * SIZE(XX) + MUL a3, ALPHA_I, t1 + + MUL a2, ALPHA_I, t2 + LD a0, 0 * SIZE(X) + MUL a3, ALPHA_R, t3 + LD a1, 1 * SIZE(X) + + SUB t0, t1, t6 + SXADDQ INCX, XX, XX + ADD t2, t3, t7 + SXADDQ INCX, X, X + + MUL a4, ALPHA_R, t0 + ST t6, 0 * SIZE(XX) + MUL a5, ALPHA_I, t1 + ST t7, 1 * SIZE(XX) + + MUL a4, ALPHA_I, t2 + LD a2, 0 * SIZE(X) + MUL a5, ALPHA_R, t3 + LD a3, 1 * SIZE(X) + + SUB t0, t1, t4 + SXADDQ INCX, XX, XX + ADD t2, t3, t5 + SXADDQ INCX, X, X + + MUL a6, ALPHA_R, t0 + ST t4, 0 * SIZE(XX) + MUL a7, ALPHA_I, t1 + ST t5, 1 * SIZE(XX) + + MUL a6, ALPHA_I, t2 + LD a4, 0 * SIZE(X) + MUL a7, ALPHA_R, t3 + LD a5, 1 * SIZE(X) + + SUB t0, t1, t6 + SXADDQ INCX, XX, XX + ADD t2, t3, t7 + SXADDQ INCX, X, X + + MUL a0, ALPHA_R, t0 + ST t6, 0 * SIZE(XX) + MUL a1, ALPHA_I, t1 + ST t7, 1 * SIZE(XX) + + MUL a0, ALPHA_I, t2 + LD a6, 0 * SIZE(X) + MUL a1, ALPHA_R, t3 + LD a7, 1 * SIZE(X) + + SUB t0, t1, t4 + lda I, -1(I) + ADD t2, t3, t5 + SXADDQ INCX, XX, XX + + lds $f31, PREFETCHSIZE * SIZE(X) + unop + SXADDQ INCX, X, X + bne I, $L12 + .align 4 + +$L13: + MUL a2, ALPHA_R, t0 + MUL a3, ALPHA_I, t1 + ST t4, 0 * SIZE(XX) + MUL a2, ALPHA_I, t2 + ST t5, 1 * SIZE(XX) + MUL a3, ALPHA_R, t3 + + SUB t0, t1, t6 + SXADDQ INCX, XX, XX + ADD t2, t3, t7 + unop + + ST t6, 0 * SIZE(XX) + MUL a4, ALPHA_R, t0 + ST t7, 1 * SIZE(XX) + MUL a5, ALPHA_I, t1 + MUL a4, ALPHA_I, t2 + MUL a5, ALPHA_R, t3 + + SUB t0, t1, t4 + SXADDQ INCX, XX, XX + ADD t2, t3, t5 + unop + + MUL a6, ALPHA_R, t0 + ST t4, 0 * SIZE(XX) + MUL a7, ALPHA_I, t1 + ST t5, 1 * SIZE(XX) + + MUL a6, ALPHA_I, t2 + MUL a7, ALPHA_R, t3 + + SUB t0, t1, t6 + SXADDQ INCX, XX, XX + ADD t2, t3, t7 + + ST t6, 0 * SIZE(XX) + ST t7, 1 * SIZE(XX) + SXADDQ INCX, XX, XX + .align 4 + +$L15: + and N, 3, I + unop + unop + ble I, $L999 + .align 4 + +$L17: + LD a0, 0 * SIZE(X) + LD a1, 1 * SIZE(X) + SXADDQ INCX, X, X + + MUL a0, ALPHA_R, t0 + MUL a1, ALPHA_I, t1 + MUL a0, ALPHA_I, t2 + MUL a1, ALPHA_R, t3 + + SUB t0, t1, t4 + ADD t2, t3, t5 + + ST t4, 0 * SIZE(XX) + ST t5, 1 * SIZE(XX) + SXADDQ INCX, XX, XX + + lda I, -1(I) + bne I, $L17 + .align 4 + +$L999: + ret + EPILOGUE diff --git a/kernel/alpha/zswap.S b/kernel/alpha/zswap.S new file mode 100644 index 0000000000..a12a2c7a73 --- /dev/null +++ b/kernel/alpha/zswap.S @@ -0,0 +1,244 @@ 
+/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + + PROLOGUE + PROFCODE + .frame $sp, 0, $26, 0 + + mov $21, $17 + ldl $18, 0($sp) + ldq $19, 8($sp) + ldl $20, 16($sp) +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + ble $16, $SubEnd # if n <= 0 goto $End + + cmpeq $18, 1, $1 + addq $18, $18, $18 + cmpeq $20, 1, $2 + addq $20, $20, $20 + + sra $16, 2, $21 + and $1, $2, $1 + and $16, 3, $22 + beq $1, $Sub + + ble $21, $MainRemain + .align 4 + +$MainLoop: + LD $f10, 0*SIZE($19) + LD $f11, 1*SIZE($19) + LD $f12, 2*SIZE($19) + LD $f13, 3*SIZE($19) + LD $f14, 4*SIZE($19) + LD $f15, 5*SIZE($19) + LD $f16, 6*SIZE($19) + LD $f17, 7*SIZE($19) + + LD $f20, 0*SIZE($17) + LD $f21, 1*SIZE($17) + LD $f22, 2*SIZE($17) + LD $f23, 3*SIZE($17) + LD $f24, 4*SIZE($17) + LD $f25, 5*SIZE($17) + LD $f26, 6*SIZE($17) + LD $f27, 7*SIZE($17) + + lds $f31, 16*SIZE($17) + unop + lds $f31, 16*SIZE($19) + subl $21, 1, $21 + + ST $f10, 0*SIZE($17) + ST $f11, 1*SIZE($17) + ST $f12, 2*SIZE($17) + ST $f13, 3*SIZE($17) + ST $f14, 4*SIZE($17) + ST $f15, 5*SIZE($17) + ST $f16, 6*SIZE($17) + ST $f17, 7*SIZE($17) + + ST $f20, 0*SIZE($19) + ST $f21, 1*SIZE($19) + ST $f22, 2*SIZE($19) + ST $f23, 3*SIZE($19) + ST $f24, 4*SIZE($19) + ST $f25, 5*SIZE($19) + ST $f26, 6*SIZE($19) + ST $f27, 7*SIZE($19) + + lda $17, 8*SIZE($17) + lda $19, 8*SIZE($19) + bgt $21, $MainLoop + .align 4 + +$MainRemain: + ble $22, $MainEnd + .align 4 + +$MainRemainLoop: + LD $f10, 0*SIZE($19) + LD $f11, 1*SIZE($19) + LD $f20, 0*SIZE($17) + LD $f21, 1*SIZE($17) + + lda $17, 2*SIZE($17) + lda $19, 2*SIZE($19) + subl $22, 1, $22 + ST $f10, -2*SIZE($17) + ST $f11, -1*SIZE($17) + ST $f20, -2*SIZE($19) + ST $f21, -1*SIZE($19) + bgt $22, $MainRemainLoop + .align 4 + +$MainEnd: + clr $0 + ret + .align 4 + +$Sub: + mov $17, $23 + mov $19, $24 + ble $21, $SubRemain + .align 4 + +$SubLoop: + LD $f10, 0*SIZE($19) + LD $f11, 1*SIZE($19) + SXADDQ $20, $19, $19 + + LD $f12, 0*SIZE($19) + LD $f13, 1*SIZE($19) + SXADDQ $20, $19, $19 + + LD $f14, 0*SIZE($19) + LD $f15, 1*SIZE($19) + SXADDQ $20, $19, $19 + + LD $f16, 0*SIZE($19) + LD $f17, 1*SIZE($19) + SXADDQ $20, $19, $19 + + LD $f20, 0*SIZE($17) + LD $f21, 1*SIZE($17) + SXADDQ $18, $17, $17 + + LD $f22, 0*SIZE($17) + LD $f23, 1*SIZE($17) + SXADDQ $18, $17, $17 + + LD $f24, 0*SIZE($17) + LD $f25, 1*SIZE($17) + SXADDQ $18, $17, $17 + + LD $f26, 0*SIZE($17) + LD $f27, 1*SIZE($17) + SXADDQ $18, $17, $17 + + ST $f10, 0*SIZE($23) + ST $f11, 1*SIZE($23) + SXADDQ $18, $23, $23 + + ST $f12, 0*SIZE($23) + ST $f13, 1*SIZE($23) + SXADDQ $18, $23, $23 + + ST $f14, 0*SIZE($23) + ST $f15, 1*SIZE($23) + SXADDQ $18, $23, $23 + + ST $f16, 0*SIZE($23) + ST $f17, 1*SIZE($23) + SXADDQ $18, $23, $23 + + ST $f20, 0*SIZE($24) + ST $f21, 1*SIZE($24) + SXADDQ $20, $24, $24 + + ST $f22, 0*SIZE($24) + ST $f23, 1*SIZE($24) + SXADDQ $20, $24, $24 + + ST $f24, 0*SIZE($24) + ST $f25, 1*SIZE($24) + SXADDQ $20, $24, $24 + + ST $f26, 0*SIZE($24) + ST $f27, 1*SIZE($24) + SXADDQ $20, $24, $24 + + subl $21, 1, $21 + bgt $21, $SubLoop + .align 4 + +$SubRemain: + ble $22, $SubEnd + .align 4 + +$SubRemainLoop: + LD $f10, 0*SIZE($19) + LD $f11, 1*SIZE($19) + LD $f20, 0*SIZE($17) + LD $f21, 1*SIZE($17) + + subl $22, 1, $22 + + ST $f10, 0*SIZE($17) + ST $f11, 1*SIZE($17) + ST $f20, 0*SIZE($19) + ST $f21, 1*SIZE($19) + + SXADDQ $18, $17, $17 + SXADDQ $20, $19, $19 + bgt $22, $SubRemainLoop + .align 4 + +$SubEnd: + clr $0 + ret + EPILOGUE diff --git 
a/kernel/alpha/ztrsm_kernel_2x2_LN.S b/kernel/alpha/ztrsm_kernel_2x2_LN.S new file mode 100644 index 0000000000..2921f9e807 --- /dev/null +++ b/kernel/alpha/ztrsm_kernel_2x2_LN.S @@ -0,0 +1,2237 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#if !defined(EV4) && !defined(EV5) && !defined(EV6) +#error "Architecture is not specified." 
+#endif + +#ifdef EV6 +#define PREFETCHSIZE 56 +#define UNOP unop +#endif + +#ifdef EV5 +#define PREFETCHSIZE 48 +#define UNOP +#endif + +#ifdef EV4 +#define UNOP +#endif + + .set noat + .set noreorder + .arch ev6 + +.text + .align 5 + .globl CNAME + .ent CNAME + +#define STACKSIZE 80 + +#define M $16 +#define N $17 +#define K $18 +#define A $21 +#define B $22 +#define C $20 +#define LDC $23 + +#define C1 $19 +#define C2 $24 + +#define AO $at +#define BO $5 +#define I $6 +#define J $7 +#define L $8 + +#define a1 $f16 +#define a2 $f17 +#define a3 $f18 +#define a4 $f19 + +#define b1 $f20 +#define b2 $f21 +#define b3 $f22 +#define b4 $f23 + +#define t1 $f24 +#define t2 $f25 +#define t3 $f26 +#define t4 $f27 + +#define a5 $f28 +#define a6 $f30 +#define b5 $f29 + +#define alpha_i $f29 +#define alpha_r $f30 + +#define c01 $f0 +#define c02 $f1 +#define c03 $f2 +#define c04 $f3 + +#define c05 $f4 +#define c06 $f5 +#define c07 $f6 +#define c08 $f7 + +#define c09 $f8 +#define c10 $f9 +#define c11 $f10 +#define c12 $f11 + +#define c13 $f12 +#define c14 $f13 +#define c15 $f14 +#define c16 $f15 + +#define TMP1 $0 +#define TMP2 $1 +#define KK $2 +#define AORIG $3 +#define OFFSET $4 + +#if defined(LN) || defined(LT) +#ifndef CONJ +#define ADD1 ADD +#define ADD2 SUB +#define ADD3 ADD +#define ADD4 ADD +#define ADD5 SUB +#define ADD6 ADD +#else +#define ADD1 ADD +#define ADD2 ADD +#define ADD3 SUB +#define ADD4 ADD +#define ADD5 ADD +#define ADD6 SUB +#endif +#else +#ifndef CONJ +#define ADD1 ADD +#define ADD2 SUB +#define ADD3 ADD +#define ADD4 ADD +#define ADD5 SUB +#define ADD6 ADD +#else +#define ADD1 ADD +#define ADD2 ADD +#define ADD3 ADD +#define ADD4 SUB +#define ADD5 ADD +#define ADD6 SUB +#endif +#endif + + +CNAME: + .frame $sp, STACKSIZE, $26, 0 + +#ifdef PROFILE + ldgp $gp, 0($27) + lda $at, _mcount + jsr $at, ($at), _mcount +#endif + +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + lda $sp, -STACKSIZE($sp) + + ldq B, 0 + STACKSIZE($sp) + ldq C, 8 + STACKSIZE($sp) + ldq LDC, 16 + STACKSIZE($sp) + ldq OFFSET, 24 + STACKSIZE($sp) + + sll LDC, ZBASE_SHIFT, LDC + + stt $f2, 0($sp) + stt $f3, 8($sp) + stt $f4, 16($sp) + stt $f5, 24($sp) + stt $f6, 32($sp) + stt $f7, 40($sp) + stt $f8, 48($sp) + stt $f9, 56($sp) + + cmple M, 0, $0 + cmple N, 0, $1 + cmple K, 0, $2 + + or $0, $1, $0 + or $0, $2, $0 + bne $0, $L999 + +#ifdef LN + addq M, M, TMP2 + mulq TMP2, K, TMP1 + SXADDQ TMP1, A, A + SXADDQ TMP2, C, C +#endif + +#ifdef RN + negq OFFSET, KK +#endif + +#ifdef RT + mulq N, K, TMP1 + addq TMP1, TMP1, TMP1 + SXADDQ TMP1, B, B + + mulq N, LDC, TMP1 + addq TMP1, C, C + + subq N, OFFSET, KK +#endif + + sra N, 1, J + ble J, $L30 + .align 4 + +$L01: +#ifdef RT + sll K, ZBASE_SHIFT + 1, TMP1 + subq B, TMP1, B + + subq C, LDC, C2 + subq C2, LDC, C1 + subq C2, LDC, C +#else + mov C, C1 + addq C, LDC, C2 + addq C2, LDC, C +#endif + +#ifdef LN + addq M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + and M, 1, I + fclr t1 + fclr t2 + fclr t3 + fclr t4 + + fclr c01 + fclr c05 + ble I, $L20 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(B) + fclr c10 + LD b2, 1 * SIZE(B) + fclr c14 + + LD b3, 2 * SIZE(B) + lda AO, 2 * SIZE(AO) + LD b4, 3 * SIZE(B) + lda BO, 4 * SIZE(B) + + lda L, -2(KK) + + ble KK, $L28 + ble L, $L25 +#else +#ifdef LN + sll K, ZBASE_SHIFT + 0, TMP1 + subq 
AORIG, TMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT + 0, TMP1 + addq AORIG, TMP1, AO + sll KK, ZBASE_SHIFT + 1, TMP1 + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(BO) + fclr c10 + LD b2, 1 * SIZE(BO) + fclr c14 + + LD b3, 2 * SIZE(BO) + lda AO, 2 * SIZE(AO) + LD b4, 3 * SIZE(BO) + lda BO, 4 * SIZE(BO) + + lda L, -2(TMP1) + + ble TMP1, $L28 + ble L, $L25 +#endif + .align 5 + +$L22: + ADD1 c09, t1, c09 + unop + MUL a1, b1, t1 + unop + + ADD3 c10, t2, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD4 c13, t3, c13 + unop + MUL a1, b2, t3 + lda BO, 8 * SIZE(BO) + + ADD2 c14, t4, c14 + unop + MUL a2, b2, t4 + LD b2, -7 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b3, t1 + unop + + ADD3 c02, t2, c02 + unop + MUL a2, b3, t2 + LD b3, -6 * SIZE(BO) + + ADD4 c05, t3, c05 + unop + MUL a1, b4, t3 + LD a1, 2 * SIZE(AO) + + ADD2 c06, t4, c06 + MUL a2, b4, t4 + LD b5, -5 * SIZE(BO) + + ADD1 c09, t1, c09 + unop + MUL a3, b1, t1 + LD a2, 3 * SIZE(AO) + + ADD3 c10, t2, c10 + unop + MUL a4, b1, t2 + LD b1, -4 * SIZE(BO) + + ADD4 c13, t3, c13 + unop + MUL a3, b2, t3 + lda AO, 4 * SIZE(AO) + + ADD2 c14, t4, c14 + MUL a4, b2, t4 + LD b2, -3 * SIZE(BO) + + ADD1 c01, t1, c01 + lda L, -2(L) + MUL a3, b3, t1 + LD b4, -1 * SIZE(BO) + + ADD3 c02, t2, c02 + unop + MUL a4, b3, t2 + LD b3, -2 * SIZE(BO) + + ADD4 c05, t3, c05 + unop + MUL a3, b5, t3 + LD a3, 0 * SIZE(AO) + + ADD2 c06, t4, c06 + MUL a4, b5, t4 + LD a4, 1 * SIZE(AO) + bgt L, $L22 + .align 4 + +$L25: + ADD1 c09, t1, c09 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L27 +#else + blbs TMP1, $L27 +#endif + .align 4 + + ADD3 c10, t2, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD4 c13, t3, c13 + unop + MUL a1, b2, t3 + unop + + ADD2 c14, t4, c14 + unop + MUL a2, b2, t4 + LD b2, 1 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b3, t1 + lda AO, 2 * SIZE(AO) + + ADD3 c02, t2, c02 + unop + MUL a2, b3, t2 + LD b3, 2 * SIZE(BO) + + ADD4 c05, t3, c05 + unop + MUL a1, b4, t3 + LD a1, -2 * SIZE(AO) + + ADD2 c06, t4, c06 + unop + MUL a2, b4, t4 + LD a2, -1 * SIZE(AO) + + ADD1 c09, t1, c09 + LD b4, 3 * SIZE(BO) + MUL a1, b1, t1 + lda BO, 4 * SIZE(BO) + .align 4 + +$L27: + ADD3 c10, t2, c10 + MUL a2, b1, t2 + ADD4 c13, t3, c13 + MUL a1, b2, t3 + + ADD2 c14, t4, c14 + MUL a2, b2, t4 + ADD1 c01, t1, c01 + MUL a1, b3, t1 + + ADD3 c02, t2, c02 + MUL a2, b3, t2 + ADD4 c05, t3, c05 + MUL a1, b4, t3 + + ADD2 c06, t4, c06 + lda AO, 2 * SIZE(AO) + MUL a2, b4, t4 + lda BO, 4 * SIZE(BO) + + ADD1 c09, t1, c09 + ADD3 c10, t2, c10 + ADD4 c13, t3, c13 + ADD2 c14, t4, c14 + + ADD c01, c06, c01 + ADD c02, c05, c02 + ADD c09, c14, c09 + ADD c10, c13, c10 + .align 4 + +$L28: +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 1, TMP1 +#else + subq KK, 2, TMP1 +#endif + sll TMP1, ZBASE_SHIFT + 0, TMP2 + addq AORIG, TMP2, AO + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addq B, TMP2, BO +#else + lda AO, -2 * SIZE(AO) + lda BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c09, c09 + SUB a4, c10, c10 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c09, c09 + SUB a4, c10, c10 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + MUL a2, 
c02, t1 + MUL a2, c01, t2 + MUL a2, c10, t3 + MUL a2, c09, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c09, c09 + MUL a1, c10, c10 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + ADD5 c09, t3, c09 + ADD6 c10, t4, c10 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + + MUL a3, c01, t1 + MUL a3, c02, t2 + SUB c09, t1, c09 + SUB c10, t2, c10 + + MUL a4, c02, t1 + MUL a4, c01, t2 + ADD6 c09, t1, c09 + ADD5 c10, t2, c10 + + LD a1, 6 * SIZE(BO) + LD a2, 7 * SIZE(BO) + + MUL a2, c10, t1 + MUL a2, c09, t2 + MUL a1, c09, c09 + MUL a1, c10, c10 + + ADD5 c09, t1, c09 + ADD6 c10, t2, c10 +#endif + +#ifdef RT + LD a1, 6 * SIZE(BO) + LD a2, 7 * SIZE(BO) + LD a3, 4 * SIZE(BO) + LD a4, 5 * SIZE(BO) + + MUL a2, c10, t1 + MUL a2, c09, t2 + MUL a1, c09, c09 + MUL a1, c10, c10 + + ADD5 c09, t1, c09 + ADD6 c10, t2, c10 + + MUL a3, c09, t1 + MUL a3, c10, t2 + SUB c01, t1, c01 + SUB c02, t2, c02 + + MUL a4, c10, t1 + MUL a4, c09, t2 + ADD6 c01, t1, c01 + ADD5 c02, t2, c02 + + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c10, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c09, 2 * SIZE(AO) + ST c10, 3 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -2 * SIZE(C1) + lda C2, -2 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c09, 0 * SIZE(C2) + ST c10, 1 * SIZE(C2) + +#ifndef LN + lda C1, 2 * SIZE(C1) + lda C2, 2 * SIZE(C2) +#endif + +#ifdef RT + sll K, ZBASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, ZBASE_SHIFT + 0, TMP2 + addq AO, TMP2, AO + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 1, KK +#endif + +#ifdef LN + subq KK, 1, KK +#endif + .align 4 + +$L20: + sra M, 1, I + fclr t1 + fclr t2 + fclr t3 + fclr t4 + + fclr c01 + fclr c05 + + ble I, $L29 + .align 4 + +$L11: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(B) + fclr c10 + LD b2, 1 * SIZE(B) + fclr c14 + + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c07 + + lda BO, 4 * SIZE(B) + fclr c11 + lda AO, 4 * SIZE(AO) + fclr c15 + + lds $f31, 4 * SIZE(C1) + fclr c04 + lda L, -2(KK) + fclr c08 + + lds $f31, 4 * SIZE(C2) + fclr c12 + fclr c16 + ble KK, $L18 + ble L, $L15 +#else +#ifdef LN + sll K, ZBASE_SHIFT + 1, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT + 1, TMP1 + addq AORIG, TMP1, AO + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(BO) + fclr c10 + LD b2, 1 * SIZE(BO) + fclr c14 + + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c07 + + lda BO, 4 * SIZE(BO) + fclr c11 + lda AO, 4 * SIZE(AO) + fclr c15 + + lds $f31, 4 * SIZE(C1) + fclr c04 + lda L, -2(TMP1) + fclr c08 + + lds $f31, 4 * SIZE(C2) + fclr c12 + fclr c16 + ble TMP1, $L18 + ble L, $L15 +#endif + .align 5 + +$L12: +/* 1 */ + ADD1 c11, t1, c11 +#ifndef EV4 + ldq $31, PREFETCHSIZE * SIZE(AO) +#else + unop +#endif + MUL b1, 
a1, t1 +#ifndef EV4 + ldl $31, PREFETCHSIZE * SIZE(BO) +#else + unop +#endif + + ADD3 c12, t2, c12 + unop + MUL b1, a2, t2 + unop + + ADD2 c16, t3, c16 + unop + MUL b2, a2, t3 + LD a5, 0 * SIZE(AO) + + ADD4 c15, t4, c15 + unop + MUL b2, a1, t4 + LD b5, 0 * SIZE(BO) + +/* 2 */ + ADD1 c01, t1, c01 + UNOP + MUL b1, a3, t1 + UNOP + + ADD3 c02, t2, c02 + UNOP + MUL b1, a4, t2 + UNOP + + ADD2 c06, t3, c06 + unop + MUL b2, a4, t3 + unop + + ADD4 c05, t4, c05 + unop + MUL b4, a1, t4 + unop + +/* 3 */ + ADD1 c03, t1, c03 + unop + MUL b3, a1, t1 + unop + + ADD3 c04, t2, c04 + unop + MUL b3, a2, t2 + unop + + ADD2 c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD4 c13, t4, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + +/* 4 */ + ADD1 c09, t1, c09 + unop + MUL b3, a3, t1 + LD a6, 2 * SIZE(AO) + + ADD3 c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD2 c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, 3 * SIZE(AO) + + ADD4 c07, t4, c07 + unop + MUL b4, a3, t4 + LD b4, 3 * SIZE(BO) + +/* 5 */ + ADD1 c11, t1, c11 + unop + MUL b5, a5, t1 + LD a1, 4 * SIZE(AO) + + ADD3 c12, t2, c12 + lda L, -2(L) + MUL b5, a2, t2 + LD b1, 4 * SIZE(BO) + + ADD2 c16, t3, c16 + unop + MUL b2, a2, t3 + unop + + ADD4 c15, t4, c15 + unop + MUL b2, a5, t4 + unop + +/* 6 */ + ADD1 c01, t1, c01 + unop + MUL b5, a6, t1 + unop + + ADD3 c02, t2, c02 + unop + MUL b5, a4, t2 + unop + + ADD2 c06, t3, c06 + unop + MUL b2, a4, t3 + unop + + ADD4 c05, t4, c05 + unop + MUL b4, a5, t4 + unop + +/* 7 */ + ADD1 c03, t1, c03 + lda AO, 8 * SIZE(AO) + MUL b3, a5, t1 + unop + + ADD3 c04, t2, c04 + lda BO, 8 * SIZE(BO) + MUL b3, a2, t2 + unop + + ADD2 c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, -3 * SIZE(AO) + + ADD4 c13, t4, c13 + unop + MUL b2, a6, t4 + LD b2, -3 * SIZE(BO) + +/* 8 */ + ADD1 c09, t1, c09 + unop + MUL b3, a6, t1 + LD a3, -2 * SIZE(AO) + + ADD3 c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, -2 * SIZE(BO) + + ADD2 c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD4 c07, t4, c07 + MUL b4, a6, t4 + LD b4, -1 * SIZE(BO) + bgt L, $L12 + .align 4 + +$L15: + ADD1 c11, t1, c11 + unop + MUL b1, a1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L17 +#else + blbs TMP1, $L17 +#endif + .align 4 + + ADD3 c12, t2, c12 + MUL b1, a2, t2 + ADD2 c16, t3, c16 + MUL b2, a2, t3 + + ADD4 c15, t4, c15 + MUL b2, a1, t4 + ADD1 c01, t1, c01 + MUL b1, a3, t1 + + ADD3 c02, t2, c02 + unop + MUL b1, a4, t2 + LD b1, 0 * SIZE(BO) + + ADD2 c06, t3, c06 + MUL b2, a4, t3 + ADD4 c05, t4, c05 + MUL b4, a1, t4 + + ADD1 c03, t1, c03 + unop + MUL b3, a1, t1 + LD a1, 0 * SIZE(AO) + + ADD3 c04, t2, c04 + unop + MUL b3, a2, t2 + unop + + ADD2 c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD4 c13, t4, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + + ADD1 c09, t1, c09 + unop + MUL b3, a3, t1 + lda AO, 4 * SIZE(AO) + + ADD3 c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD2 c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD4 c07, t4, c07 + unop + MUL b4, a3, t4 + LD a3, -2 * SIZE(AO) + + ADD1 c11, t1, c11 + LD b4, 3 * SIZE(BO) + MUL b1, a1, t1 + lda BO, 4 * SIZE(BO) + .align 4 + +$L17: + ADD3 c12, t2, c12 + MUL b1, a2, t2 + ADD2 c16, t3, c16 + MUL b2, a2, t3 + + ADD4 c15, t4, c15 + MUL b2, a1, t4 + ADD1 c01, t1, c01 + MUL b1, a3, t1 + + ADD3 c02, t2, c02 + MUL b1, a4, t2 + ADD2 c06, t3, c06 + MUL b2, a4, t3 + + ADD4 c05, t4, c05 + MUL b4, a1, t4 + ADD1 c03, t1, c03 + MUL b3, a1, t1 + + ADD3 c04, t2, c04 + MUL b3, a2, t2 + ADD2 c08, t3, c08 + MUL b4, a2, t3 + + ADD4 
c13, t4, c13 + MUL b2, a3, t4 + ADD1 c09, t1, c09 + MUL b3, a3, t1 + + ADD3 c10, t2, c10 + MUL b3, a4, t2 + ADD2 c14, t3, c14 + MUL b4, a4, t3 + + ADD4 c07, t4, c07 + lda AO, 4 * SIZE(AO) + MUL b4, a3, t4 + lda BO, 4 * SIZE(BO) + + ADD1 c11, t1, c11 + ADD3 c12, t2, c12 + ADD2 c16, t3, c16 + ADD4 c15, t4, c15 + + ADD c01, c06, c01 + ADD c02, c05, c02 + ADD c03, c08, c03 + ADD c04, c07, c04 + + ADD c09, c14, c09 + ADD c10, c13, c10 + ADD c11, c16, c11 + ADD c12, c15, c12 + .align 4 + +$L18: +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 2, TMP1 +#else + subq KK, 2, TMP1 +#endif + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addq AORIG, TMP2, AO + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addq B, TMP2, BO +#else + lda AO, -4 * SIZE(AO) + lda BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + LD b1, 4 * SIZE(BO) + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c09, c09 + SUB a4, c10, c10 + + SUB b1, c03, c03 + SUB b2, c04, c04 + SUB b3, c11, c11 + SUB b4, c12, c12 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 4 * SIZE(AO) + LD b2, 5 * SIZE(AO) + LD b3, 6 * SIZE(AO) + LD b4, 7 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 + + SUB b1, c09, c09 + SUB b2, c10, c10 + SUB b3, c11, c11 + SUB b4, c12, c12 +#endif + +#ifdef LN + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + LD a3, 4 * SIZE(AO) + LD a4, 5 * SIZE(AO) + + MUL a2, c04, t1 + MUL a2, c03, t2 + MUL a2, c12, t3 + MUL a2, c11, t4 + + MUL a1, c03, c03 + MUL a1, c04, c04 + MUL a1, c11, c11 + MUL a1, c12, c12 + + ADD5 c03, t1, c03 + ADD6 c04, t2, c04 + ADD5 c11, t3, c11 + ADD6 c12, t4, c12 + + MUL a3, c03, t1 + MUL a3, c04, t2 + MUL a3, c11, t3 + MUL a3, c12, t4 + + SUB c01, t1, c01 + SUB c02, t2, c02 + SUB c09, t3, c09 + SUB c10, t4, c10 + + MUL a4, c04, t1 + MUL a4, c03, t2 + MUL a4, c12, t3 + MUL a4, c11, t4 + + ADD6 c01, t1, c01 + ADD5 c02, t2, c02 + ADD6 c09, t3, c09 + ADD5 c10, t4, c10 + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c10, t3 + MUL a2, c09, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c09, c09 + MUL a1, c10, c10 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + ADD5 c09, t3, c09 + ADD6 c10, t4, c10 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c10, t3 + MUL a2, c09, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c09, c09 + MUL a1, c10, c10 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + ADD5 c09, t3, c09 + ADD6 c10, t4, c10 + + MUL a3, c01, t1 + MUL a3, c02, t2 + MUL a3, c09, t3 + MUL a3, c10, t4 + + SUB c03, t1, c03 + SUB c04, t2, c04 + SUB c11, t3, c11 + SUB c12, t4, c12 + + MUL a4, c02, t1 + MUL a4, c01, t2 + MUL a4, c10, t3 + MUL a4, c09, t4 + + ADD6 c03, t1, c03 + ADD5 c04, t2, c04 + ADD6 c11, t3, c11 + ADD5 c12, t4, c12 + + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + + MUL a2, c04, t1 + MUL a2, c03, t2 + MUL a2, c12, t3 + MUL a2, c11, t4 + + MUL a1, c03, c03 + MUL a1, c04, c04 + MUL a1, c11, c11 + MUL a1, c12, c12 + + ADD5 c03, t1, c03 + ADD6 c04, t2, c04 + ADD5 c11, t3, c11 + ADD6 c12, t4, c12 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c04, t3 + MUL a2, c03, t4 + + MUL a1, c01, c01 + MUL a1, c02, 
c02 + MUL a1, c03, c03 + MUL a1, c04, c04 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + ADD5 c03, t3, c03 + ADD6 c04, t4, c04 + + MUL a3, c01, t1 + MUL a3, c02, t2 + MUL a3, c03, t3 + MUL a3, c04, t4 + + SUB c09, t1, c09 + SUB c10, t2, c10 + SUB c11, t3, c11 + SUB c12, t4, c12 + + MUL a4, c02, t1 + MUL a4, c01, t2 + MUL a4, c04, t3 + MUL a4, c03, t4 + + ADD6 c09, t1, c09 + ADD5 c10, t2, c10 + ADD6 c11, t3, c11 + ADD5 c12, t4, c12 + + LD a1, 6 * SIZE(BO) + LD a2, 7 * SIZE(BO) + + MUL a2, c10, t1 + MUL a2, c09, t2 + MUL a2, c12, t3 + MUL a2, c11, t4 + + MUL a1, c09, c09 + MUL a1, c10, c10 + MUL a1, c11, c11 + MUL a1, c12, c12 + + ADD5 c09, t1, c09 + ADD6 c10, t2, c10 + ADD5 c11, t3, c11 + ADD6 c12, t4, c12 +#endif + +#ifdef RT + LD a1, 6 * SIZE(BO) + LD a2, 7 * SIZE(BO) + LD a3, 4 * SIZE(BO) + LD a4, 5 * SIZE(BO) + + MUL a2, c10, t1 + MUL a2, c09, t2 + MUL a2, c12, t3 + MUL a2, c11, t4 + + MUL a1, c09, c09 + MUL a1, c10, c10 + MUL a1, c11, c11 + MUL a1, c12, c12 + + ADD5 c09, t1, c09 + ADD6 c10, t2, c10 + ADD5 c11, t3, c11 + ADD6 c12, t4, c12 + + MUL a3, c09, t1 + MUL a3, c10, t2 + MUL a3, c11, t3 + MUL a3, c12, t4 + + SUB c01, t1, c01 + SUB c02, t2, c02 + SUB c03, t3, c03 + SUB c04, t4, c04 + + MUL a4, c10, t1 + MUL a4, c09, t2 + MUL a4, c12, t3 + MUL a4, c11, t4 + + ADD6 c01, t1, c01 + ADD5 c02, t2, c02 + ADD6 c03, t3, c03 + ADD5 c04, t4, c04 + + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c04, t3 + MUL a2, c03, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + ADD5 c03, t3, c03 + ADD6 c04, t4, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c10, 3 * SIZE(BO) + + ST c03, 4 * SIZE(BO) + ST c04, 5 * SIZE(BO) + ST c11, 6 * SIZE(BO) + ST c12, 7 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) + + ST c09, 4 * SIZE(AO) + ST c10, 5 * SIZE(AO) + ST c11, 6 * SIZE(AO) + ST c12, 7 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -4 * SIZE(C1) + lda C2, -4 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + + ST c09, 0 * SIZE(C2) + ST c10, 1 * SIZE(C2) + ST c11, 2 * SIZE(C2) + ST c12, 3 * SIZE(C2) + +#ifndef LN + lda C1, 4 * SIZE(C1) + lda C2, 4 * SIZE(C2) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, ZBASE_SHIFT + 1, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, ZBASE_SHIFT + 1, TMP1 + addq AO, TMP1, AO + addq BO, TMP1, BO +#endif + +#ifdef LT + addq KK, 2, KK +#endif + +#ifdef LN + subq KK, 2, KK +#endif + fclr c01 + fclr c05 + + lda I, -1(I) + bgt I, $L11 + .align 4 + +$L29: +#ifdef LN + sll K, ZBASE_SHIFT + 1, TMP1 + addq B, TMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addq KK, 2, KK +#endif + +#ifdef RT + subq KK, 2, KK +#endif + + lda J, -1(J) + bgt J, $L01 + .align 4 + +$L30: + and N, 1, J + ble J, $L999 + +#ifdef RT + sll K, ZBASE_SHIFT, TMP1 + subq B, TMP1, B + + subq C, LDC, C1 + subq C, LDC, C +#else + mov C, C1 + addq C, LDC, C +#endif + +#ifdef LN + addq M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + and M, 1, I + ble I, $L50 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * 
SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c05 + + LD b3, 2 * SIZE(B) + fclr c02 + LD b4, 3 * SIZE(B) + fclr c06 + + lda AO, 2 * SIZE(AO) + lda BO, 2 * SIZE(B) + + lda L, -2(KK) + + ble KK, $L58 + ble L, $L55 +#else +#ifdef LN + sll K, ZBASE_SHIFT, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT, TMP1 + addq AORIG, TMP1, AO + sll KK, ZBASE_SHIFT, TMP1 + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c05 + + LD b3, 2 * SIZE(BO) + fclr c02 + LD b4, 3 * SIZE(BO) + fclr c06 + + lda AO, 2 * SIZE(AO) + lda BO, 2 * SIZE(BO) + + lda L, -2(TMP1) + + ble TMP1, $L58 + ble L, $L55 +#endif + .align 5 + +$L52: + ADD1 c01, t1, c01 + unop + MUL a1, b1, t1 + unop + + ADD3 c02, t2, c02 + lda AO, 4 * SIZE(AO) + MUL a2, b1, t2 + LD b1, 2 * SIZE(BO) + + ADD4 c05, t3, c05 + lda L, -2(L) + MUL a1, b2, t3 + LD a1, -2 * SIZE(AO) + + ADD2 c06, t4, c06 + unop + MUL a2, b2, t4 + LD a2, -1 * SIZE(AO) + + ADD1 c01, t1, c01 + LD b2, 3 * SIZE(BO) + MUL a3, b3, t1 + lda BO, 4 * SIZE(BO) + + ADD3 c02, t2, c02 + unop + MUL a4, b3, t2 + LD b3, 0 * SIZE(BO) + + ADD4 c05, t3, c05 + unop + MUL a3, b4, t3 + LD a3, 0 * SIZE(AO) + + ADD2 c06, t4, c06 + MUL a4, b4, t4 + LD b4, 1 * SIZE(BO) + unop + + LD a4, 1 * SIZE(AO) + unop + unop + bgt L, $L52 + .align 4 + +$L55: + ADD1 c01, t1, c01 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L57 +#else + blbs TMP1, $L57 +#endif + .align 4 + + ADD3 c02, t2, c02 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD4 c05, t3, c05 + lda BO, 2 * SIZE(BO) + MUL a1, b2, t3 + LD a1, 0 * SIZE(AO) + + ADD2 c06, t4, c06 + unop + MUL a2, b2, t4 + LD a2, 1 * SIZE(AO) + + ADD1 c01, t1, c01 + LD b2, -1 * SIZE(BO) + MUL a1, b1, t1 + lda AO, 2 * SIZE(AO) + .align 4 + +$L57: + ADD3 c02, t2, c02 + MUL a2, b1, t2 + ADD4 c05, t3, c05 + MUL a1, b2, t3 + + ADD2 c06, t4, c06 + lda AO, 2 * SIZE(AO) + MUL a2, b2, t4 + lda BO, 2 * SIZE(BO) + + ADD1 c01, t1, c01 + ADD3 c02, t2, c02 + ADD4 c05, t3, c05 + ADD2 c06, t4, c06 + + ADD c01, c06, c01 + ADD c02, c05, c02 + +$L58: +#if defined(LN) || defined(RT) + subq KK, 1, TMP1 + + sll TMP1, ZBASE_SHIFT, TMP2 + addq AORIG, TMP2, AO + sll TMP1, ZBASE_SHIFT, TMP2 + addq B, TMP2, BO +#else + lda AO, -2 * SIZE(AO) + lda BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, c02 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -2 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + +#ifndef LN + lda C1, 2 * SIZE(C1) +#endif + +#ifdef RT + sll K, ZBASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, ZBASE_SHIFT, TMP2 + addq AO, TMP2, AO + sll TMP1, ZBASE_SHIFT, 
TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 1, KK +#endif + +#ifdef LN + subq KK, 1, KK +#endif + .align 4 + +$L50: + sra M, 1, I + ble I, $L59 + .align 4 + +$L41: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c05 + LD b3, 2 * SIZE(B) + fclr c02 + LD b4, 3 * SIZE(B) + fclr c06 + + lda BO, 2 * SIZE(B) + fclr c03 + lda AO, 4 * SIZE(AO) + fclr c07 + + lda L, -2(KK) + fclr c04 + fclr c08 + + ble KK, $L48 + ble L, $L45 +#else +#ifdef LN + sll K, ZBASE_SHIFT + 1, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT + 1, TMP1 + addq AORIG, TMP1, AO + sll KK, ZBASE_SHIFT, TMP1 + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c05 + LD b3, 2 * SIZE(BO) + fclr c02 + LD b4, 3 * SIZE(BO) + fclr c06 + + lda BO, 2 * SIZE(BO) + fclr c03 + lda AO, 4 * SIZE(AO) + fclr c07 + + lda L, -2(TMP1) + fclr c04 + fclr c08 + + ble TMP1, $L48 + ble L, $L45 +#endif + .align 5 + +$L42: + ADD4 c05, t1, c05 + unop + MUL a1, b1, t1 + unop + + ADD2 c06, t2, c06 + lda L, -2(L) + MUL a2, b1, t2 + unop + + ADD4 c07, t3, c07 + unop + MUL a3, b1, t3 + unop + + ADD2 c08, t4, c08 + unop + MUL a4, b1, t4 + LD b1, 2 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD3 c02, t2, c02 + lda BO, 4 * SIZE(BO) + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD1 c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD3 c04, t4, c04 + unop + MUL a4, b2, t4 + LD a5, 3 * SIZE(AO) + + ADD4 c05, t1, c05 + unop + MUL a1, b3, t1 + LD b2, -1 * SIZE(BO) + + ADD2 c06, t2, c06 + unop + MUL a2, b3, t2 + unop + + ADD4 c07, t3, c07 + unop + MUL a3, b3, t3 + lda AO, 8 * SIZE(AO) + + ADD2 c08, t4, c08 + unop + MUL a5, b3, t4 + LD b3, 0 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b4, t1 + LD a1, -4 * SIZE(AO) + + ADD3 c02, t2, c02 + unop + MUL a2, b4, t2 + LD a2, -3 * SIZE(AO) + + ADD1 c03, t3, c03 + LD a4, -1 * SIZE(AO) + MUL a3, b4, t3 + LD a3, -2 * SIZE(AO) + + ADD3 c04, t4, c04 + MUL a5, b4, t4 + LD b4, 1 * SIZE(BO) + bgt L, $L42 + .align 4 + +$L45: + ADD4 c05, t1, c05 + MUL b1, a1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L47 +#else + blbs TMP1, $L47 +#endif + .align 4 + + ADD2 c06, t2, c06 + MUL a2, b1, t2 + ADD4 c07, t3, c07 + MUL a3, b1, t3 + + ADD2 c08, t4, c08 + unop + MUL a4, b1, t4 + LD b1, 0 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD3 c02, t2, c02 + unop + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD1 c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD3 c04, t4, c04 + MUL a4, b2, t4 + LD a4, 3 * SIZE(AO) + lda AO, 4 * SIZE(AO) + + ADD4 c05, t1, c05 + LD b2, 1 * SIZE(BO) + MUL a1, b1, t1 + lda BO, 2 * SIZE(BO) + .align 4 + +$L47: + ADD2 c06, t2, c06 + MUL a2, b1, t2 + ADD4 c07, t3, c07 + MUL a3, b1, t3 + + ADD2 c08, t4, c08 + MUL a4, b1, t4 + ADD1 c01, t1, c01 + MUL a1, b2, t1 + + ADD3 c02, t2, c02 + MUL a2, b2, t2 + ADD1 c03, t3, c03 + MUL a3, b2, t3 + + ADD3 c04, t4, c04 + lda AO, 4 * SIZE(AO) + MUL a4, b2, t4 + lda BO, 2 * SIZE(BO) + + ADD4 c05, t1, c05 + ADD2 c06, t2, c06 + ADD4 c07, t3, c07 + ADD2 c08, t4, c08 + + ADD c01, c06, c01 + ADD c02, c05, c02 + ADD c03, c08, c03 + ADD c04, c07, c04 + +$L48: +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 
2, TMP1 +#else + subq KK, 1, TMP1 +#endif + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addq AORIG, TMP2, AO + sll TMP1, ZBASE_SHIFT, TMP2 + addq B, TMP2, BO +#else + lda AO, -4 * SIZE(AO) + lda BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 +#endif + +#ifdef LN + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + LD a3, 4 * SIZE(AO) + LD a4, 5 * SIZE(AO) + + MUL a2, c04, t1 + MUL a2, c03, t2 + MUL a1, c03, c03 + MUL a1, c04, c04 + + ADD5 c03, t1, c03 + ADD6 c04, t2, c04 + MUL a3, c03, t1 + MUL a3, c04, t2 + + SUB c01, t1, c01 + SUB c02, t2, c02 + MUL a4, c04, t1 + MUL a4, c03, t2 + + ADD6 c01, t1, c01 + ADD5 c02, t2, c02 + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + MUL a3, c01, t1 + MUL a3, c02, t2 + + SUB c03, t1, c03 + SUB c04, t2, c04 + + MUL a4, c02, t1 + MUL a4, c01, t2 + ADD6 c03, t1, c03 + ADD5 c04, t2, c04 + + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + + MUL a2, c04, t1 + MUL a2, c03, t2 + MUL a1, c03, c03 + MUL a1, c04, c04 + + ADD5 c03, t1, c03 + ADD6 c04, t2, c04 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c04, t3 + MUL a2, c03, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + ADD5 c03, t3, c03 + ADD6 c04, t4, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) + ST c03, 2 * SIZE(BO) + ST c04, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -4 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + +#ifndef LN + lda C1, 4 * SIZE(C1) +#endif + +#ifdef RT + sll K, ZBASE_SHIFT + 1, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addq AO, TMP2, AO + sll TMP1, ZBASE_SHIFT, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 2, KK +#endif + +#ifdef LN + subq KK, 2, KK +#endif + + lda I, -1(I) + bgt I, $L41 + .align 4 + +$L59: +#ifdef LN + sll K, ZBASE_SHIFT, TMP1 + addq B, TMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addq KK, 1, KK +#endif + +#ifdef RT + subq KK, 1, KK +#endif + .align 4 + +$L999: + ldt $f2, 0($sp) + ldt $f3, 8($sp) + ldt $f4, 16($sp) + ldt $f5, 24($sp) + ldt $f6, 32($sp) + ldt $f7, 40($sp) + ldt $f8, 48($sp) + ldt $f9, 56($sp) + clr $0 + lda $sp, STACKSIZE($sp) + ret + .ident VERSION + .end CNAME diff --git a/kernel/alpha/ztrsm_kernel_2x2_LT.S b/kernel/alpha/ztrsm_kernel_2x2_LT.S new file mode 100644 index 0000000000..e6ffc0f928 --- /dev/null +++ b/kernel/alpha/ztrsm_kernel_2x2_LT.S @@ -0,0 +1,2230 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of 
Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#if !defined(EV4) && !defined(EV5) && !defined(EV6) +#error "Architecture is not specified." 
+#endif + +#ifdef EV6 +#define PREFETCHSIZE 56 +#define UNOP unop +#endif + +#ifdef EV5 +#define PREFETCHSIZE 48 +#define UNOP +#endif + +#ifdef EV4 +#define UNOP +#endif + + .set noat + .set noreorder + .arch ev6 + +.text + .align 5 + .globl CNAME + .ent CNAME + +#define STACKSIZE 80 + +#define M $16 +#define N $17 +#define K $18 +#define A $21 +#define B $22 +#define C $20 +#define LDC $23 + +#define C1 $19 +#define C2 $24 + +#define AO $at +#define BO $5 +#define I $6 +#define J $7 +#define L $8 + +#define a1 $f16 +#define a2 $f17 +#define a3 $f18 +#define a4 $f19 + +#define b1 $f20 +#define b2 $f21 +#define b3 $f22 +#define b4 $f23 + +#define t1 $f24 +#define t2 $f25 +#define t3 $f26 +#define t4 $f27 + +#define a5 $f28 +#define a6 $f30 +#define b5 $f29 + +#define alpha_i $f29 +#define alpha_r $f30 + +#define c01 $f0 +#define c02 $f1 +#define c03 $f2 +#define c04 $f3 + +#define c05 $f4 +#define c06 $f5 +#define c07 $f6 +#define c08 $f7 + +#define c09 $f8 +#define c10 $f9 +#define c11 $f10 +#define c12 $f11 + +#define c13 $f12 +#define c14 $f13 +#define c15 $f14 +#define c16 $f15 + +#define TMP1 $0 +#define TMP2 $1 +#define KK $2 +#define AORIG $3 +#define OFFSET $4 + +#if defined(LN) || defined(LT) +#ifndef CONJ +#define ADD1 ADD +#define ADD2 SUB +#define ADD3 ADD +#define ADD4 ADD +#define ADD5 SUB +#define ADD6 ADD +#else +#define ADD1 ADD +#define ADD2 ADD +#define ADD3 SUB +#define ADD4 ADD +#define ADD5 ADD +#define ADD6 SUB +#endif +#else +#ifndef CONJ +#define ADD1 ADD +#define ADD2 SUB +#define ADD3 ADD +#define ADD4 ADD +#define ADD5 SUB +#define ADD6 ADD +#else +#define ADD1 ADD +#define ADD2 ADD +#define ADD3 ADD +#define ADD4 SUB +#define ADD5 ADD +#define ADD6 SUB +#endif +#endif + + +CNAME: + .frame $sp, STACKSIZE, $26, 0 + +#ifdef PROFILE + ldgp $gp, 0($27) + lda $at, _mcount + jsr $at, ($at), _mcount +#endif + +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + lda $sp, -STACKSIZE($sp) + + ldq B, 0 + STACKSIZE($sp) + ldq C, 8 + STACKSIZE($sp) + ldq LDC, 16 + STACKSIZE($sp) + ldq OFFSET, 24 + STACKSIZE($sp) + + sll LDC, ZBASE_SHIFT, LDC + + stt $f2, 0($sp) + stt $f3, 8($sp) + stt $f4, 16($sp) + stt $f5, 24($sp) + stt $f6, 32($sp) + stt $f7, 40($sp) + stt $f8, 48($sp) + stt $f9, 56($sp) + + cmple M, 0, $0 + cmple N, 0, $1 + cmple K, 0, $2 + + or $0, $1, $0 + or $0, $2, $0 + bne $0, $L999 + +#ifdef LN + addq M, M, TMP2 + mulq TMP2, K, TMP1 + SXADDQ TMP1, A, A + SXADDQ TMP2, C, C +#endif + +#ifdef RN + negq OFFSET, KK +#endif + +#ifdef RT + mulq N, K, TMP1 + addq TMP1, TMP1, TMP1 + SXADDQ TMP1, B, B + + mulq N, LDC, TMP1 + addq TMP1, C, C + + subq N, OFFSET, KK +#endif + + sra N, 1, J + ble J, $L30 + .align 4 + +$L01: +#ifdef RT + sll K, ZBASE_SHIFT + 1, TMP1 + subq B, TMP1, B + + subq C, LDC, C2 + subq C2, LDC, C1 + subq C2, LDC, C +#else + mov C, C1 + addq C, LDC, C2 + addq C2, LDC, C +#endif + +#ifdef LN + addq M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + sra M, 1, I + fclr t1 + fclr t2 + fclr t3 + fclr t4 + + fclr c01 + fclr c05 + + ble I, $L20 + .align 4 + +$L11: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(B) + fclr c10 + LD b2, 1 * SIZE(B) + fclr c14 + + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c07 + + lda BO, 4 * SIZE(B) + fclr c11 + lda AO, 4 * SIZE(AO) + fclr c15 + + lds $f31, 4 * SIZE(C1) + fclr c04 + 
lda L, -2(KK) + fclr c08 + + lds $f31, 4 * SIZE(C2) + fclr c12 + fclr c16 + ble KK, $L18 + ble L, $L15 +#else +#ifdef LN + sll K, ZBASE_SHIFT + 1, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT + 1, TMP1 + addq AORIG, TMP1, AO + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(BO) + fclr c10 + LD b2, 1 * SIZE(BO) + fclr c14 + + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c07 + + lda BO, 4 * SIZE(BO) + fclr c11 + lda AO, 4 * SIZE(AO) + fclr c15 + + lds $f31, 4 * SIZE(C1) + fclr c04 + lda L, -2(TMP1) + fclr c08 + + lds $f31, 4 * SIZE(C2) + fclr c12 + fclr c16 + ble TMP1, $L18 + ble L, $L15 +#endif + .align 5 + +$L12: +/* 1 */ + ADD1 c11, t1, c11 +#ifndef EV4 + ldq $31, PREFETCHSIZE * SIZE(AO) +#else + unop +#endif + MUL b1, a1, t1 +#ifndef EV4 + ldl $31, PREFETCHSIZE * SIZE(BO) +#else + unop +#endif + + ADD3 c12, t2, c12 + unop + MUL b1, a2, t2 + unop + + ADD2 c16, t3, c16 + unop + MUL b2, a2, t3 + LD a5, 0 * SIZE(AO) + + ADD4 c15, t4, c15 + unop + MUL b2, a1, t4 + LD b5, 0 * SIZE(BO) + +/* 2 */ + ADD1 c01, t1, c01 + UNOP + MUL b1, a3, t1 + UNOP + + ADD3 c02, t2, c02 + UNOP + MUL b1, a4, t2 + UNOP + + ADD2 c06, t3, c06 + unop + MUL b2, a4, t3 + unop + + ADD4 c05, t4, c05 + unop + MUL b4, a1, t4 + unop + +/* 3 */ + ADD1 c03, t1, c03 + unop + MUL b3, a1, t1 + unop + + ADD3 c04, t2, c04 + unop + MUL b3, a2, t2 + unop + + ADD2 c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD4 c13, t4, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + +/* 4 */ + ADD1 c09, t1, c09 + unop + MUL b3, a3, t1 + LD a6, 2 * SIZE(AO) + + ADD3 c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD2 c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, 3 * SIZE(AO) + + ADD4 c07, t4, c07 + unop + MUL b4, a3, t4 + LD b4, 3 * SIZE(BO) + +/* 5 */ + ADD1 c11, t1, c11 + unop + MUL b5, a5, t1 + LD a1, 4 * SIZE(AO) + + ADD3 c12, t2, c12 + lda L, -2(L) + MUL b5, a2, t2 + LD b1, 4 * SIZE(BO) + + ADD2 c16, t3, c16 + unop + MUL b2, a2, t3 + unop + + ADD4 c15, t4, c15 + unop + MUL b2, a5, t4 + unop + +/* 6 */ + ADD1 c01, t1, c01 + unop + MUL b5, a6, t1 + unop + + ADD3 c02, t2, c02 + unop + MUL b5, a4, t2 + unop + + ADD2 c06, t3, c06 + unop + MUL b2, a4, t3 + unop + + ADD4 c05, t4, c05 + unop + MUL b4, a5, t4 + unop + +/* 7 */ + ADD1 c03, t1, c03 + lda AO, 8 * SIZE(AO) + MUL b3, a5, t1 + unop + + ADD3 c04, t2, c04 + lda BO, 8 * SIZE(BO) + MUL b3, a2, t2 + unop + + ADD2 c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, -3 * SIZE(AO) + + ADD4 c13, t4, c13 + unop + MUL b2, a6, t4 + LD b2, -3 * SIZE(BO) + +/* 8 */ + ADD1 c09, t1, c09 + unop + MUL b3, a6, t1 + LD a3, -2 * SIZE(AO) + + ADD3 c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, -2 * SIZE(BO) + + ADD2 c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD4 c07, t4, c07 + MUL b4, a6, t4 + LD b4, -1 * SIZE(BO) + bgt L, $L12 + .align 4 + +$L15: + ADD1 c11, t1, c11 + unop + MUL b1, a1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L17 +#else + blbs TMP1, $L17 +#endif + .align 4 + + ADD3 c12, t2, c12 + MUL b1, a2, t2 + ADD2 c16, t3, c16 + MUL b2, a2, t3 + + ADD4 c15, t4, c15 + MUL b2, a1, t4 + ADD1 c01, t1, c01 + MUL b1, a3, t1 + + ADD3 c02, t2, c02 + unop + MUL b1, a4, t2 + LD b1, 0 * SIZE(BO) + + ADD2 c06, t3, c06 + MUL b2, a4, t3 + ADD4 c05, t4, c05 + MUL b4, a1, t4 + + ADD1 c03, t1, c03 + unop + MUL b3, a1, t1 + LD a1, 0 * SIZE(AO) + + ADD3 c04, t2, c04 + unop + MUL b3, a2, t2 + 
unop + + ADD2 c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD4 c13, t4, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + + ADD1 c09, t1, c09 + unop + MUL b3, a3, t1 + lda AO, 4 * SIZE(AO) + + ADD3 c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD2 c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD4 c07, t4, c07 + unop + MUL b4, a3, t4 + LD a3, -2 * SIZE(AO) + + ADD1 c11, t1, c11 + LD b4, 3 * SIZE(BO) + MUL b1, a1, t1 + lda BO, 4 * SIZE(BO) + .align 4 + +$L17: + ADD3 c12, t2, c12 + MUL b1, a2, t2 + ADD2 c16, t3, c16 + MUL b2, a2, t3 + + ADD4 c15, t4, c15 + MUL b2, a1, t4 + ADD1 c01, t1, c01 + MUL b1, a3, t1 + + ADD3 c02, t2, c02 + MUL b1, a4, t2 + ADD2 c06, t3, c06 + MUL b2, a4, t3 + + ADD4 c05, t4, c05 + MUL b4, a1, t4 + ADD1 c03, t1, c03 + MUL b3, a1, t1 + + ADD3 c04, t2, c04 + MUL b3, a2, t2 + ADD2 c08, t3, c08 + MUL b4, a2, t3 + + ADD4 c13, t4, c13 + MUL b2, a3, t4 + ADD1 c09, t1, c09 + MUL b3, a3, t1 + + ADD3 c10, t2, c10 + MUL b3, a4, t2 + ADD2 c14, t3, c14 + MUL b4, a4, t3 + + ADD4 c07, t4, c07 + lda AO, 4 * SIZE(AO) + MUL b4, a3, t4 + lda BO, 4 * SIZE(BO) + + ADD1 c11, t1, c11 + ADD3 c12, t2, c12 + ADD2 c16, t3, c16 + ADD4 c15, t4, c15 + + ADD c01, c06, c01 + ADD c02, c05, c02 + ADD c03, c08, c03 + ADD c04, c07, c04 + + ADD c09, c14, c09 + ADD c10, c13, c10 + ADD c11, c16, c11 + ADD c12, c15, c12 + .align 4 + +$L18: +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 2, TMP1 +#else + subq KK, 2, TMP1 +#endif + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addq AORIG, TMP2, AO + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addq B, TMP2, BO +#else + lda AO, -4 * SIZE(AO) + lda BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + LD b1, 4 * SIZE(BO) + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c09, c09 + SUB a4, c10, c10 + + SUB b1, c03, c03 + SUB b2, c04, c04 + SUB b3, c11, c11 + SUB b4, c12, c12 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 4 * SIZE(AO) + LD b2, 5 * SIZE(AO) + LD b3, 6 * SIZE(AO) + LD b4, 7 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 + + SUB b1, c09, c09 + SUB b2, c10, c10 + SUB b3, c11, c11 + SUB b4, c12, c12 +#endif + +#ifdef LN + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + LD a3, 4 * SIZE(AO) + LD a4, 5 * SIZE(AO) + + MUL a2, c04, t1 + MUL a2, c03, t2 + MUL a2, c12, t3 + MUL a2, c11, t4 + + MUL a1, c03, c03 + MUL a1, c04, c04 + MUL a1, c11, c11 + MUL a1, c12, c12 + + ADD5 c03, t1, c03 + ADD6 c04, t2, c04 + ADD5 c11, t3, c11 + ADD6 c12, t4, c12 + + MUL a3, c03, t1 + MUL a3, c04, t2 + MUL a3, c11, t3 + MUL a3, c12, t4 + + SUB c01, t1, c01 + SUB c02, t2, c02 + SUB c09, t3, c09 + SUB c10, t4, c10 + + MUL a4, c04, t1 + MUL a4, c03, t2 + MUL a4, c12, t3 + MUL a4, c11, t4 + + ADD6 c01, t1, c01 + ADD5 c02, t2, c02 + ADD6 c09, t3, c09 + ADD5 c10, t4, c10 + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c10, t3 + MUL a2, c09, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c09, c09 + MUL a1, c10, c10 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + ADD5 c09, t3, c09 + ADD6 c10, t4, c10 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c10, t3 + MUL a2, c09, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c09, c09 + 
MUL a1, c10, c10 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + ADD5 c09, t3, c09 + ADD6 c10, t4, c10 + + MUL a3, c01, t1 + MUL a3, c02, t2 + MUL a3, c09, t3 + MUL a3, c10, t4 + + SUB c03, t1, c03 + SUB c04, t2, c04 + SUB c11, t3, c11 + SUB c12, t4, c12 + + MUL a4, c02, t1 + MUL a4, c01, t2 + MUL a4, c10, t3 + MUL a4, c09, t4 + + ADD6 c03, t1, c03 + ADD5 c04, t2, c04 + ADD6 c11, t3, c11 + ADD5 c12, t4, c12 + + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + + MUL a2, c04, t1 + MUL a2, c03, t2 + MUL a2, c12, t3 + MUL a2, c11, t4 + + MUL a1, c03, c03 + MUL a1, c04, c04 + MUL a1, c11, c11 + MUL a1, c12, c12 + + ADD5 c03, t1, c03 + ADD6 c04, t2, c04 + ADD5 c11, t3, c11 + ADD6 c12, t4, c12 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c04, t3 + MUL a2, c03, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + ADD5 c03, t3, c03 + ADD6 c04, t4, c04 + + MUL a3, c01, t1 + MUL a3, c02, t2 + MUL a3, c03, t3 + MUL a3, c04, t4 + + SUB c09, t1, c09 + SUB c10, t2, c10 + SUB c11, t3, c11 + SUB c12, t4, c12 + + MUL a4, c02, t1 + MUL a4, c01, t2 + MUL a4, c04, t3 + MUL a4, c03, t4 + + ADD6 c09, t1, c09 + ADD5 c10, t2, c10 + ADD6 c11, t3, c11 + ADD5 c12, t4, c12 + + LD a1, 6 * SIZE(BO) + LD a2, 7 * SIZE(BO) + + MUL a2, c10, t1 + MUL a2, c09, t2 + MUL a2, c12, t3 + MUL a2, c11, t4 + + MUL a1, c09, c09 + MUL a1, c10, c10 + MUL a1, c11, c11 + MUL a1, c12, c12 + + ADD5 c09, t1, c09 + ADD6 c10, t2, c10 + ADD5 c11, t3, c11 + ADD6 c12, t4, c12 +#endif + +#ifdef RT + LD a1, 6 * SIZE(BO) + LD a2, 7 * SIZE(BO) + LD a3, 4 * SIZE(BO) + LD a4, 5 * SIZE(BO) + + MUL a2, c10, t1 + MUL a2, c09, t2 + MUL a2, c12, t3 + MUL a2, c11, t4 + + MUL a1, c09, c09 + MUL a1, c10, c10 + MUL a1, c11, c11 + MUL a1, c12, c12 + + ADD5 c09, t1, c09 + ADD6 c10, t2, c10 + ADD5 c11, t3, c11 + ADD6 c12, t4, c12 + + MUL a3, c09, t1 + MUL a3, c10, t2 + MUL a3, c11, t3 + MUL a3, c12, t4 + + SUB c01, t1, c01 + SUB c02, t2, c02 + SUB c03, t3, c03 + SUB c04, t4, c04 + + MUL a4, c10, t1 + MUL a4, c09, t2 + MUL a4, c12, t3 + MUL a4, c11, t4 + + ADD6 c01, t1, c01 + ADD5 c02, t2, c02 + ADD6 c03, t3, c03 + ADD5 c04, t4, c04 + + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c04, t3 + MUL a2, c03, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + ADD5 c03, t3, c03 + ADD6 c04, t4, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c10, 3 * SIZE(BO) + + ST c03, 4 * SIZE(BO) + ST c04, 5 * SIZE(BO) + ST c11, 6 * SIZE(BO) + ST c12, 7 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) + + ST c09, 4 * SIZE(AO) + ST c10, 5 * SIZE(AO) + ST c11, 6 * SIZE(AO) + ST c12, 7 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -4 * SIZE(C1) + lda C2, -4 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + + ST c09, 0 * SIZE(C2) + ST c10, 1 * SIZE(C2) + ST c11, 2 * SIZE(C2) + ST c12, 3 * SIZE(C2) + +#ifndef LN + lda C1, 4 * SIZE(C1) + lda C2, 4 * SIZE(C2) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, ZBASE_SHIFT + 1, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, ZBASE_SHIFT + 1, TMP1 + addq AO, TMP1, AO + addq BO, TMP1, BO 
+#endif + +#ifdef LT + addq KK, 2, KK +#endif + +#ifdef LN + subq KK, 2, KK +#endif + fclr c01 + fclr c05 + + lda I, -1(I) + bgt I, $L11 + .align 4 + +$L20: + and M, 1, I + ble I, $L29 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(B) + fclr c10 + LD b2, 1 * SIZE(B) + fclr c14 + + LD b3, 2 * SIZE(B) + lda AO, 2 * SIZE(AO) + LD b4, 3 * SIZE(B) + lda BO, 4 * SIZE(B) + + lda L, -2(KK) + + ble KK, $L28 + ble L, $L25 +#else +#ifdef LN + sll K, ZBASE_SHIFT + 0, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT + 0, TMP1 + addq AORIG, TMP1, AO + sll KK, ZBASE_SHIFT + 1, TMP1 + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(BO) + fclr c10 + LD b2, 1 * SIZE(BO) + fclr c14 + + LD b3, 2 * SIZE(BO) + lda AO, 2 * SIZE(AO) + LD b4, 3 * SIZE(BO) + lda BO, 4 * SIZE(BO) + + lda L, -2(TMP1) + + ble TMP1, $L28 + ble L, $L25 +#endif + .align 5 + +$L22: + ADD1 c09, t1, c09 + unop + MUL a1, b1, t1 + unop + + ADD3 c10, t2, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD4 c13, t3, c13 + unop + MUL a1, b2, t3 + lda BO, 8 * SIZE(BO) + + ADD2 c14, t4, c14 + unop + MUL a2, b2, t4 + LD b2, -7 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b3, t1 + unop + + ADD3 c02, t2, c02 + unop + MUL a2, b3, t2 + LD b3, -6 * SIZE(BO) + + ADD4 c05, t3, c05 + unop + MUL a1, b4, t3 + LD a1, 2 * SIZE(AO) + + ADD2 c06, t4, c06 + MUL a2, b4, t4 + LD b5, -5 * SIZE(BO) + + ADD1 c09, t1, c09 + unop + MUL a3, b1, t1 + LD a2, 3 * SIZE(AO) + + ADD3 c10, t2, c10 + unop + MUL a4, b1, t2 + LD b1, -4 * SIZE(BO) + + ADD4 c13, t3, c13 + unop + MUL a3, b2, t3 + lda AO, 4 * SIZE(AO) + + ADD2 c14, t4, c14 + MUL a4, b2, t4 + LD b2, -3 * SIZE(BO) + + ADD1 c01, t1, c01 + lda L, -2(L) + MUL a3, b3, t1 + LD b4, -1 * SIZE(BO) + + ADD3 c02, t2, c02 + unop + MUL a4, b3, t2 + LD b3, -2 * SIZE(BO) + + ADD4 c05, t3, c05 + unop + MUL a3, b5, t3 + LD a3, 0 * SIZE(AO) + + ADD2 c06, t4, c06 + MUL a4, b5, t4 + LD a4, 1 * SIZE(AO) + bgt L, $L22 + .align 4 + +$L25: + ADD1 c09, t1, c09 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L27 +#else + blbs TMP1, $L27 +#endif + .align 4 + + ADD3 c10, t2, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD4 c13, t3, c13 + unop + MUL a1, b2, t3 + unop + + ADD2 c14, t4, c14 + unop + MUL a2, b2, t4 + LD b2, 1 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b3, t1 + lda AO, 2 * SIZE(AO) + + ADD3 c02, t2, c02 + unop + MUL a2, b3, t2 + LD b3, 2 * SIZE(BO) + + ADD4 c05, t3, c05 + unop + MUL a1, b4, t3 + LD a1, -2 * SIZE(AO) + + ADD2 c06, t4, c06 + unop + MUL a2, b4, t4 + LD a2, -1 * SIZE(AO) + + ADD1 c09, t1, c09 + LD b4, 3 * SIZE(BO) + MUL a1, b1, t1 + lda BO, 4 * SIZE(BO) + .align 4 + +$L27: + ADD3 c10, t2, c10 + MUL a2, b1, t2 + ADD4 c13, t3, c13 + MUL a1, b2, t3 + + ADD2 c14, t4, c14 + MUL a2, b2, t4 + ADD1 c01, t1, c01 + MUL a1, b3, t1 + + ADD3 c02, t2, c02 + MUL a2, b3, t2 + ADD4 c05, t3, c05 + MUL a1, b4, t3 + + ADD2 c06, t4, c06 + lda AO, 2 * SIZE(AO) + MUL a2, b4, t4 + lda BO, 4 * SIZE(BO) + + ADD1 c09, t1, c09 + ADD3 c10, t2, c10 + ADD4 c13, t3, c13 + ADD2 c14, t4, c14 + + ADD c01, c06, c01 + ADD c02, c05, c02 + ADD c09, c14, c09 + ADD c10, c13, c10 + .align 4 + +$L28: +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 1, TMP1 +#else + subq KK, 2, TMP1 +#endif + sll TMP1, ZBASE_SHIFT + 0, 
TMP2 + addq AORIG, TMP2, AO + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addq B, TMP2, BO +#else + lda AO, -2 * SIZE(AO) + lda BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c09, c09 + SUB a4, c10, c10 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c09, c09 + SUB a4, c10, c10 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c10, t3 + MUL a2, c09, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c09, c09 + MUL a1, c10, c10 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + ADD5 c09, t3, c09 + ADD6 c10, t4, c10 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + + MUL a3, c01, t1 + MUL a3, c02, t2 + SUB c09, t1, c09 + SUB c10, t2, c10 + + MUL a4, c02, t1 + MUL a4, c01, t2 + ADD6 c09, t1, c09 + ADD5 c10, t2, c10 + + LD a1, 6 * SIZE(BO) + LD a2, 7 * SIZE(BO) + + MUL a2, c10, t1 + MUL a2, c09, t2 + MUL a1, c09, c09 + MUL a1, c10, c10 + + ADD5 c09, t1, c09 + ADD6 c10, t2, c10 +#endif + +#ifdef RT + LD a1, 6 * SIZE(BO) + LD a2, 7 * SIZE(BO) + LD a3, 4 * SIZE(BO) + LD a4, 5 * SIZE(BO) + + MUL a2, c10, t1 + MUL a2, c09, t2 + MUL a1, c09, c09 + MUL a1, c10, c10 + + ADD5 c09, t1, c09 + ADD6 c10, t2, c10 + + MUL a3, c09, t1 + MUL a3, c10, t2 + SUB c01, t1, c01 + SUB c02, t2, c02 + + MUL a4, c10, t1 + MUL a4, c09, t2 + ADD6 c01, t1, c01 + ADD5 c02, t2, c02 + + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c10, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c09, 2 * SIZE(AO) + ST c10, 3 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -2 * SIZE(C1) + lda C2, -2 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c09, 0 * SIZE(C2) + ST c10, 1 * SIZE(C2) + +#ifndef LN + lda C1, 2 * SIZE(C1) + lda C2, 2 * SIZE(C2) +#endif + +#ifdef RT + sll K, ZBASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, ZBASE_SHIFT + 0, TMP2 + addq AO, TMP2, AO + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 1, KK +#endif + +#ifdef LN + subq KK, 1, KK +#endif + .align 4 + +$L29: +#ifdef LN + sll K, ZBASE_SHIFT + 1, TMP1 + addq B, TMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addq KK, 2, KK +#endif + +#ifdef RT + subq KK, 2, KK +#endif + + lda J, -1(J) + bgt J, $L01 + .align 4 + +$L30: + and N, 1, J + ble J, $L999 + +#ifdef RT + sll K, ZBASE_SHIFT, TMP1 + subq B, TMP1, B + + subq C, LDC, C1 + subq C, LDC, C +#else + mov C, C1 + addq C, LDC, C +#endif + +#ifdef LN + addq M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + sra M, 1, I + ble I, $L50 + .align 4 + +$L41: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr 
c01 + LD b2, 1 * SIZE(B) + fclr c05 + LD b3, 2 * SIZE(B) + fclr c02 + LD b4, 3 * SIZE(B) + fclr c06 + + lda BO, 2 * SIZE(B) + fclr c03 + lda AO, 4 * SIZE(AO) + fclr c07 + + lda L, -2(KK) + fclr c04 + fclr c08 + + ble KK, $L48 + ble L, $L45 +#else +#ifdef LN + sll K, ZBASE_SHIFT + 1, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT + 1, TMP1 + addq AORIG, TMP1, AO + sll KK, ZBASE_SHIFT, TMP1 + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c05 + LD b3, 2 * SIZE(BO) + fclr c02 + LD b4, 3 * SIZE(BO) + fclr c06 + + lda BO, 2 * SIZE(BO) + fclr c03 + lda AO, 4 * SIZE(AO) + fclr c07 + + lda L, -2(TMP1) + fclr c04 + fclr c08 + + ble TMP1, $L48 + ble L, $L45 +#endif + .align 5 + +$L42: + ADD4 c05, t1, c05 + unop + MUL a1, b1, t1 + unop + + ADD2 c06, t2, c06 + lda L, -2(L) + MUL a2, b1, t2 + unop + + ADD4 c07, t3, c07 + unop + MUL a3, b1, t3 + unop + + ADD2 c08, t4, c08 + unop + MUL a4, b1, t4 + LD b1, 2 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD3 c02, t2, c02 + lda BO, 4 * SIZE(BO) + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD1 c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD3 c04, t4, c04 + unop + MUL a4, b2, t4 + LD a5, 3 * SIZE(AO) + + ADD4 c05, t1, c05 + unop + MUL a1, b3, t1 + LD b2, -1 * SIZE(BO) + + ADD2 c06, t2, c06 + unop + MUL a2, b3, t2 + unop + + ADD4 c07, t3, c07 + unop + MUL a3, b3, t3 + lda AO, 8 * SIZE(AO) + + ADD2 c08, t4, c08 + unop + MUL a5, b3, t4 + LD b3, 0 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b4, t1 + LD a1, -4 * SIZE(AO) + + ADD3 c02, t2, c02 + unop + MUL a2, b4, t2 + LD a2, -3 * SIZE(AO) + + ADD1 c03, t3, c03 + LD a4, -1 * SIZE(AO) + MUL a3, b4, t3 + LD a3, -2 * SIZE(AO) + + ADD3 c04, t4, c04 + MUL a5, b4, t4 + LD b4, 1 * SIZE(BO) + bgt L, $L42 + .align 4 + +$L45: + ADD4 c05, t1, c05 + MUL b1, a1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L47 +#else + blbs TMP1, $L47 +#endif + .align 4 + + ADD2 c06, t2, c06 + MUL a2, b1, t2 + ADD4 c07, t3, c07 + MUL a3, b1, t3 + + ADD2 c08, t4, c08 + unop + MUL a4, b1, t4 + LD b1, 0 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD3 c02, t2, c02 + unop + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD1 c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD3 c04, t4, c04 + MUL a4, b2, t4 + LD a4, 3 * SIZE(AO) + lda AO, 4 * SIZE(AO) + + ADD4 c05, t1, c05 + LD b2, 1 * SIZE(BO) + MUL a1, b1, t1 + lda BO, 2 * SIZE(BO) + .align 4 + +$L47: + ADD2 c06, t2, c06 + MUL a2, b1, t2 + ADD4 c07, t3, c07 + MUL a3, b1, t3 + + ADD2 c08, t4, c08 + MUL a4, b1, t4 + ADD1 c01, t1, c01 + MUL a1, b2, t1 + + ADD3 c02, t2, c02 + MUL a2, b2, t2 + ADD1 c03, t3, c03 + MUL a3, b2, t3 + + ADD3 c04, t4, c04 + lda AO, 4 * SIZE(AO) + MUL a4, b2, t4 + lda BO, 2 * SIZE(BO) + + ADD4 c05, t1, c05 + ADD2 c06, t2, c06 + ADD4 c07, t3, c07 + ADD2 c08, t4, c08 + + ADD c01, c06, c01 + ADD c02, c05, c02 + ADD c03, c08, c03 + ADD c04, c07, c04 + +$L48: +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 2, TMP1 +#else + subq KK, 1, TMP1 +#endif + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addq AORIG, TMP2, AO + sll TMP1, ZBASE_SHIFT, TMP2 + addq B, TMP2, BO +#else + lda AO, -4 * SIZE(AO) + lda BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, 
c02 + SUB a3, c03, c03 + SUB a4, c04, c04 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 +#endif + +#ifdef LN + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + LD a3, 4 * SIZE(AO) + LD a4, 5 * SIZE(AO) + + MUL a2, c04, t1 + MUL a2, c03, t2 + MUL a1, c03, c03 + MUL a1, c04, c04 + + ADD5 c03, t1, c03 + ADD6 c04, t2, c04 + MUL a3, c03, t1 + MUL a3, c04, t2 + + SUB c01, t1, c01 + SUB c02, t2, c02 + MUL a4, c04, t1 + MUL a4, c03, t2 + + ADD6 c01, t1, c01 + ADD5 c02, t2, c02 + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + MUL a3, c01, t1 + MUL a3, c02, t2 + + SUB c03, t1, c03 + SUB c04, t2, c04 + + MUL a4, c02, t1 + MUL a4, c01, t2 + ADD6 c03, t1, c03 + ADD5 c04, t2, c04 + + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + + MUL a2, c04, t1 + MUL a2, c03, t2 + MUL a1, c03, c03 + MUL a1, c04, c04 + + ADD5 c03, t1, c03 + ADD6 c04, t2, c04 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c04, t3 + MUL a2, c03, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + ADD5 c03, t3, c03 + ADD6 c04, t4, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) + ST c03, 2 * SIZE(BO) + ST c04, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -4 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + +#ifndef LN + lda C1, 4 * SIZE(C1) +#endif + +#ifdef RT + sll K, ZBASE_SHIFT + 1, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addq AO, TMP2, AO + sll TMP1, ZBASE_SHIFT, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 2, KK +#endif + +#ifdef LN + subq KK, 2, KK +#endif + + lda I, -1(I) + bgt I, $L41 + .align 4 + +$L50: + and M, 1, I + ble I, $L59 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c05 + + LD b3, 2 * SIZE(B) + fclr c02 + LD b4, 3 * SIZE(B) + fclr c06 + + lda AO, 2 * SIZE(AO) + lda BO, 2 * SIZE(B) + + lda L, -2(KK) + + ble KK, $L58 + ble L, $L55 +#else +#ifdef LN + sll K, ZBASE_SHIFT, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT, TMP1 + addq AORIG, TMP1, AO + sll KK, ZBASE_SHIFT, TMP1 + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c05 + + LD b3, 2 * SIZE(BO) + fclr c02 + LD b4, 3 * SIZE(BO) + fclr c06 + + lda AO, 2 * SIZE(AO) + lda BO, 2 * SIZE(BO) + + lda L, -2(TMP1) + + ble TMP1, $L58 + ble L, $L55 +#endif + .align 5 + +$L52: + ADD1 c01, t1, c01 + unop + MUL a1, b1, t1 + unop + + ADD3 c02, t2, c02 + lda AO, 4 * SIZE(AO) + MUL a2, b1, t2 + LD 
b1, 2 * SIZE(BO) + + ADD4 c05, t3, c05 + lda L, -2(L) + MUL a1, b2, t3 + LD a1, -2 * SIZE(AO) + + ADD2 c06, t4, c06 + unop + MUL a2, b2, t4 + LD a2, -1 * SIZE(AO) + + ADD1 c01, t1, c01 + LD b2, 3 * SIZE(BO) + MUL a3, b3, t1 + lda BO, 4 * SIZE(BO) + + ADD3 c02, t2, c02 + unop + MUL a4, b3, t2 + LD b3, 0 * SIZE(BO) + + ADD4 c05, t3, c05 + unop + MUL a3, b4, t3 + LD a3, 0 * SIZE(AO) + + ADD2 c06, t4, c06 + MUL a4, b4, t4 + LD b4, 1 * SIZE(BO) + unop + + LD a4, 1 * SIZE(AO) + unop + unop + bgt L, $L52 + .align 4 + +$L55: + ADD1 c01, t1, c01 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L57 +#else + blbs TMP1, $L57 +#endif + .align 4 + + ADD3 c02, t2, c02 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD4 c05, t3, c05 + lda BO, 2 * SIZE(BO) + MUL a1, b2, t3 + LD a1, 0 * SIZE(AO) + + ADD2 c06, t4, c06 + unop + MUL a2, b2, t4 + LD a2, 1 * SIZE(AO) + + ADD1 c01, t1, c01 + LD b2, -1 * SIZE(BO) + MUL a1, b1, t1 + lda AO, 2 * SIZE(AO) + .align 4 + +$L57: + ADD3 c02, t2, c02 + MUL a2, b1, t2 + ADD4 c05, t3, c05 + MUL a1, b2, t3 + + ADD2 c06, t4, c06 + lda AO, 2 * SIZE(AO) + MUL a2, b2, t4 + lda BO, 2 * SIZE(BO) + + ADD1 c01, t1, c01 + ADD3 c02, t2, c02 + ADD4 c05, t3, c05 + ADD2 c06, t4, c06 + + ADD c01, c06, c01 + ADD c02, c05, c02 + +$L58: +#if defined(LN) || defined(RT) + subq KK, 1, TMP1 + + sll TMP1, ZBASE_SHIFT, TMP2 + addq AORIG, TMP2, AO + sll TMP1, ZBASE_SHIFT, TMP2 + addq B, TMP2, BO +#else + lda AO, -2 * SIZE(AO) + lda BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, c02 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -2 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + +#ifndef LN + lda C1, 2 * SIZE(C1) +#endif + +#ifdef RT + sll K, ZBASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, ZBASE_SHIFT, TMP2 + addq AO, TMP2, AO + sll TMP1, ZBASE_SHIFT, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 1, KK +#endif + +#ifdef LN + subq KK, 1, KK +#endif + .align 4 + +$L59: +#ifdef LN + sll K, ZBASE_SHIFT, TMP1 + addq B, TMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addq KK, 1, KK +#endif + +#ifdef RT + subq KK, 1, KK +#endif + .align 4 + +$L999: + ldt $f2, 0($sp) + ldt $f3, 8($sp) + ldt $f4, 16($sp) + ldt $f5, 24($sp) + ldt $f6, 32($sp) + ldt $f7, 40($sp) + ldt $f8, 48($sp) + ldt $f9, 56($sp) + clr $0 + lda $sp, STACKSIZE($sp) + ret + .ident VERSION + .end CNAME diff --git a/kernel/alpha/ztrsm_kernel_2x2_RT.S b/kernel/alpha/ztrsm_kernel_2x2_RT.S new file mode 100644 index 0000000000..4c490fc763 --- /dev/null +++ b/kernel/alpha/ztrsm_kernel_2x2_RT.S @@ -0,0 +1,2230 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "version.h" + +#if !defined(EV4) && !defined(EV5) && !defined(EV6) +#error "Architecture is not specified." 
+#endif + +#ifdef EV6 +#define PREFETCHSIZE 56 +#define UNOP unop +#endif + +#ifdef EV5 +#define PREFETCHSIZE 48 +#define UNOP +#endif + +#ifdef EV4 +#define UNOP +#endif + + .set noat + .set noreorder + .arch ev6 + +.text + .align 5 + .globl CNAME + .ent CNAME + +#define STACKSIZE 80 + +#define M $16 +#define N $17 +#define K $18 +#define A $21 +#define B $22 +#define C $20 +#define LDC $23 + +#define C1 $19 +#define C2 $24 + +#define AO $at +#define BO $5 +#define I $6 +#define J $7 +#define L $8 + +#define a1 $f16 +#define a2 $f17 +#define a3 $f18 +#define a4 $f19 + +#define b1 $f20 +#define b2 $f21 +#define b3 $f22 +#define b4 $f23 + +#define t1 $f24 +#define t2 $f25 +#define t3 $f26 +#define t4 $f27 + +#define a5 $f28 +#define a6 $f30 +#define b5 $f29 + +#define alpha_i $f29 +#define alpha_r $f30 + +#define c01 $f0 +#define c02 $f1 +#define c03 $f2 +#define c04 $f3 + +#define c05 $f4 +#define c06 $f5 +#define c07 $f6 +#define c08 $f7 + +#define c09 $f8 +#define c10 $f9 +#define c11 $f10 +#define c12 $f11 + +#define c13 $f12 +#define c14 $f13 +#define c15 $f14 +#define c16 $f15 + +#define TMP1 $0 +#define TMP2 $1 +#define KK $2 +#define AORIG $3 +#define OFFSET $4 + +#if defined(LN) || defined(LT) +#ifndef CONJ +#define ADD1 ADD +#define ADD2 SUB +#define ADD3 ADD +#define ADD4 ADD +#define ADD5 SUB +#define ADD6 ADD +#else +#define ADD1 ADD +#define ADD2 ADD +#define ADD3 SUB +#define ADD4 ADD +#define ADD5 ADD +#define ADD6 SUB +#endif +#else +#ifndef CONJ +#define ADD1 ADD +#define ADD2 SUB +#define ADD3 ADD +#define ADD4 ADD +#define ADD5 SUB +#define ADD6 ADD +#else +#define ADD1 ADD +#define ADD2 ADD +#define ADD3 ADD +#define ADD4 SUB +#define ADD5 ADD +#define ADD6 SUB +#endif +#endif + + +CNAME: + .frame $sp, STACKSIZE, $26, 0 + +#ifdef PROFILE + ldgp $gp, 0($27) + lda $at, _mcount + jsr $at, ($at), _mcount +#endif + +#ifndef PROFILE + .prologue 0 +#else + .prologue 1 +#endif + + lda $sp, -STACKSIZE($sp) + + ldq B, 0 + STACKSIZE($sp) + ldq C, 8 + STACKSIZE($sp) + ldq LDC, 16 + STACKSIZE($sp) + ldq OFFSET, 24 + STACKSIZE($sp) + + sll LDC, ZBASE_SHIFT, LDC + + stt $f2, 0($sp) + stt $f3, 8($sp) + stt $f4, 16($sp) + stt $f5, 24($sp) + stt $f6, 32($sp) + stt $f7, 40($sp) + stt $f8, 48($sp) + stt $f9, 56($sp) + + cmple M, 0, $0 + cmple N, 0, $1 + cmple K, 0, $2 + + or $0, $1, $0 + or $0, $2, $0 + bne $0, $L999 + +#ifdef LN + addq M, M, TMP2 + mulq TMP2, K, TMP1 + SXADDQ TMP1, A, A + SXADDQ TMP2, C, C +#endif + +#ifdef RN + negq OFFSET, KK +#endif + +#ifdef RT + mulq N, K, TMP1 + addq TMP1, TMP1, TMP1 + SXADDQ TMP1, B, B + + mulq N, LDC, TMP1 + addq TMP1, C, C + + subq N, OFFSET, KK +#endif + + and N, 1, J + ble J, $L30 + +#ifdef RT + sll K, ZBASE_SHIFT, TMP1 + subq B, TMP1, B + + subq C, LDC, C1 + subq C, LDC, C +#else + mov C, C1 + addq C, LDC, C +#endif + +#ifdef LN + addq M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + sra M, 1, I + ble I, $L50 + .align 4 + +$L41: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c05 + LD b3, 2 * SIZE(B) + fclr c02 + LD b4, 3 * SIZE(B) + fclr c06 + + lda BO, 2 * SIZE(B) + fclr c03 + lda AO, 4 * SIZE(AO) + fclr c07 + + lda L, -2(KK) + fclr c04 + fclr c08 + + ble KK, $L48 + ble L, $L45 +#else +#ifdef LN + sll K, ZBASE_SHIFT + 1, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, 
ZBASE_SHIFT + 1, TMP1 + addq AORIG, TMP1, AO + sll KK, ZBASE_SHIFT, TMP1 + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c05 + LD b3, 2 * SIZE(BO) + fclr c02 + LD b4, 3 * SIZE(BO) + fclr c06 + + lda BO, 2 * SIZE(BO) + fclr c03 + lda AO, 4 * SIZE(AO) + fclr c07 + + lda L, -2(TMP1) + fclr c04 + fclr c08 + + ble TMP1, $L48 + ble L, $L45 +#endif + .align 5 + +$L42: + ADD4 c05, t1, c05 + unop + MUL a1, b1, t1 + unop + + ADD2 c06, t2, c06 + lda L, -2(L) + MUL a2, b1, t2 + unop + + ADD4 c07, t3, c07 + unop + MUL a3, b1, t3 + unop + + ADD2 c08, t4, c08 + unop + MUL a4, b1, t4 + LD b1, 2 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD3 c02, t2, c02 + lda BO, 4 * SIZE(BO) + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD1 c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD3 c04, t4, c04 + unop + MUL a4, b2, t4 + LD a5, 3 * SIZE(AO) + + ADD4 c05, t1, c05 + unop + MUL a1, b3, t1 + LD b2, -1 * SIZE(BO) + + ADD2 c06, t2, c06 + unop + MUL a2, b3, t2 + unop + + ADD4 c07, t3, c07 + unop + MUL a3, b3, t3 + lda AO, 8 * SIZE(AO) + + ADD2 c08, t4, c08 + unop + MUL a5, b3, t4 + LD b3, 0 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b4, t1 + LD a1, -4 * SIZE(AO) + + ADD3 c02, t2, c02 + unop + MUL a2, b4, t2 + LD a2, -3 * SIZE(AO) + + ADD1 c03, t3, c03 + LD a4, -1 * SIZE(AO) + MUL a3, b4, t3 + LD a3, -2 * SIZE(AO) + + ADD3 c04, t4, c04 + MUL a5, b4, t4 + LD b4, 1 * SIZE(BO) + bgt L, $L42 + .align 4 + +$L45: + ADD4 c05, t1, c05 + MUL b1, a1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L47 +#else + blbs TMP1, $L47 +#endif + .align 4 + + ADD2 c06, t2, c06 + MUL a2, b1, t2 + ADD4 c07, t3, c07 + MUL a3, b1, t3 + + ADD2 c08, t4, c08 + unop + MUL a4, b1, t4 + LD b1, 0 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b2, t1 + LD a1, 0 * SIZE(AO) + + ADD3 c02, t2, c02 + unop + MUL a2, b2, t2 + LD a2, 1 * SIZE(AO) + + ADD1 c03, t3, c03 + unop + MUL a3, b2, t3 + LD a3, 2 * SIZE(AO) + + ADD3 c04, t4, c04 + MUL a4, b2, t4 + LD a4, 3 * SIZE(AO) + lda AO, 4 * SIZE(AO) + + ADD4 c05, t1, c05 + LD b2, 1 * SIZE(BO) + MUL a1, b1, t1 + lda BO, 2 * SIZE(BO) + .align 4 + +$L47: + ADD2 c06, t2, c06 + MUL a2, b1, t2 + ADD4 c07, t3, c07 + MUL a3, b1, t3 + + ADD2 c08, t4, c08 + MUL a4, b1, t4 + ADD1 c01, t1, c01 + MUL a1, b2, t1 + + ADD3 c02, t2, c02 + MUL a2, b2, t2 + ADD1 c03, t3, c03 + MUL a3, b2, t3 + + ADD3 c04, t4, c04 + lda AO, 4 * SIZE(AO) + MUL a4, b2, t4 + lda BO, 2 * SIZE(BO) + + ADD4 c05, t1, c05 + ADD2 c06, t2, c06 + ADD4 c07, t3, c07 + ADD2 c08, t4, c08 + + ADD c01, c06, c01 + ADD c02, c05, c02 + ADD c03, c08, c03 + ADD c04, c07, c04 + +$L48: +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 2, TMP1 +#else + subq KK, 1, TMP1 +#endif + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addq AORIG, TMP2, AO + sll TMP1, ZBASE_SHIFT, TMP2 + addq B, TMP2, BO +#else + lda AO, -4 * SIZE(AO) + lda BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 +#endif + +#ifdef LN + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + LD a3, 4 * SIZE(AO) + LD a4, 5 * SIZE(AO) + + MUL 
a2, c04, t1 + MUL a2, c03, t2 + MUL a1, c03, c03 + MUL a1, c04, c04 + + ADD5 c03, t1, c03 + ADD6 c04, t2, c04 + MUL a3, c03, t1 + MUL a3, c04, t2 + + SUB c01, t1, c01 + SUB c02, t2, c02 + MUL a4, c04, t1 + MUL a4, c03, t2 + + ADD6 c01, t1, c01 + ADD5 c02, t2, c02 + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + MUL a3, c01, t1 + MUL a3, c02, t2 + + SUB c03, t1, c03 + SUB c04, t2, c04 + + MUL a4, c02, t1 + MUL a4, c01, t2 + ADD6 c03, t1, c03 + ADD5 c04, t2, c04 + + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + + MUL a2, c04, t1 + MUL a2, c03, t2 + MUL a1, c03, c03 + MUL a1, c04, c04 + + ADD5 c03, t1, c03 + ADD6 c04, t2, c04 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c04, t3 + MUL a2, c03, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + ADD5 c03, t3, c03 + ADD6 c04, t4, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) + ST c03, 2 * SIZE(BO) + ST c04, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -4 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + +#ifndef LN + lda C1, 4 * SIZE(C1) +#endif + +#ifdef RT + sll K, ZBASE_SHIFT + 1, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addq AO, TMP2, AO + sll TMP1, ZBASE_SHIFT, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 2, KK +#endif + +#ifdef LN + subq KK, 2, KK +#endif + + lda I, -1(I) + bgt I, $L41 + .align 4 + +$L50: + and M, 1, I + ble I, $L59 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(B) + fclr c01 + LD b2, 1 * SIZE(B) + fclr c05 + + LD b3, 2 * SIZE(B) + fclr c02 + LD b4, 3 * SIZE(B) + fclr c06 + + lda AO, 2 * SIZE(AO) + lda BO, 2 * SIZE(B) + + lda L, -2(KK) + + ble KK, $L58 + ble L, $L55 +#else +#ifdef LN + sll K, ZBASE_SHIFT, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT, TMP1 + addq AORIG, TMP1, AO + sll KK, ZBASE_SHIFT, TMP1 + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr t1 + LD a2, 1 * SIZE(AO) + fclr t2 + LD a3, 2 * SIZE(AO) + fclr t3 + LD a4, 3 * SIZE(AO) + fclr t4 + + LD b1, 0 * SIZE(BO) + fclr c01 + LD b2, 1 * SIZE(BO) + fclr c05 + + LD b3, 2 * SIZE(BO) + fclr c02 + LD b4, 3 * SIZE(BO) + fclr c06 + + lda AO, 2 * SIZE(AO) + lda BO, 2 * SIZE(BO) + + lda L, -2(TMP1) + + ble TMP1, $L58 + ble L, $L55 +#endif + .align 5 + +$L52: + ADD1 c01, t1, c01 + unop + MUL a1, b1, t1 + unop + + ADD3 c02, t2, c02 + lda AO, 4 * SIZE(AO) + MUL a2, b1, t2 + LD b1, 2 * SIZE(BO) + + ADD4 c05, t3, c05 + lda L, -2(L) + MUL a1, b2, t3 + LD a1, -2 * SIZE(AO) + + ADD2 c06, t4, c06 + unop + MUL a2, b2, t4 + LD a2, -1 * SIZE(AO) + + ADD1 c01, t1, c01 + LD b2, 3 * SIZE(BO) + MUL a3, b3, t1 + lda BO, 4 * SIZE(BO) + + ADD3 c02, t2, c02 + unop + MUL a4, b3, t2 + LD b3, 0 * SIZE(BO) + + ADD4 c05, t3, 
c05 + unop + MUL a3, b4, t3 + LD a3, 0 * SIZE(AO) + + ADD2 c06, t4, c06 + MUL a4, b4, t4 + LD b4, 1 * SIZE(BO) + unop + + LD a4, 1 * SIZE(AO) + unop + unop + bgt L, $L52 + .align 4 + +$L55: + ADD1 c01, t1, c01 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L57 +#else + blbs TMP1, $L57 +#endif + .align 4 + + ADD3 c02, t2, c02 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD4 c05, t3, c05 + lda BO, 2 * SIZE(BO) + MUL a1, b2, t3 + LD a1, 0 * SIZE(AO) + + ADD2 c06, t4, c06 + unop + MUL a2, b2, t4 + LD a2, 1 * SIZE(AO) + + ADD1 c01, t1, c01 + LD b2, -1 * SIZE(BO) + MUL a1, b1, t1 + lda AO, 2 * SIZE(AO) + .align 4 + +$L57: + ADD3 c02, t2, c02 + MUL a2, b1, t2 + ADD4 c05, t3, c05 + MUL a1, b2, t3 + + ADD2 c06, t4, c06 + lda AO, 2 * SIZE(AO) + MUL a2, b2, t4 + lda BO, 2 * SIZE(BO) + + ADD1 c01, t1, c01 + ADD3 c02, t2, c02 + ADD4 c05, t3, c05 + ADD2 c06, t4, c06 + + ADD c01, c06, c01 + ADD c02, c05, c02 + +$L58: +#if defined(LN) || defined(RT) + subq KK, 1, TMP1 + + sll TMP1, ZBASE_SHIFT, TMP2 + addq AORIG, TMP2, AO + sll TMP1, ZBASE_SHIFT, TMP2 + addq B, TMP2, BO +#else + lda AO, -2 * SIZE(AO) + lda BO, -2 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, c02 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 +#endif + +#if defined(RN) || defined(RT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -2 * SIZE(C1) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + +#ifndef LN + lda C1, 2 * SIZE(C1) +#endif + +#ifdef RT + sll K, ZBASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, ZBASE_SHIFT, TMP2 + addq AO, TMP2, AO + sll TMP1, ZBASE_SHIFT, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 1, KK +#endif + +#ifdef LN + subq KK, 1, KK +#endif + .align 4 + +$L59: +#ifdef LN + sll K, ZBASE_SHIFT, TMP1 + addq B, TMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addq KK, 1, KK +#endif + +#ifdef RT + subq KK, 1, KK +#endif + .align 4 + +$L30: + sra N, 1, J + ble J, $L999 + .align 4 + +$L01: +#ifdef RT + sll K, ZBASE_SHIFT + 1, TMP1 + subq B, TMP1, B + + subq C, LDC, C2 + subq C2, LDC, C1 + subq C2, LDC, C +#else + mov C, C1 + addq C, LDC, C2 + addq C2, LDC, C +#endif + +#ifdef LN + addq M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + sra M, 1, I + fclr t1 + fclr t2 + fclr t3 + fclr t4 + + fclr c01 + fclr c05 + + ble I, $L20 + .align 4 + +$L11: +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(B) + fclr c10 + LD b2, 1 * SIZE(B) + fclr c14 + + LD b3, 2 * SIZE(B) + fclr c03 + LD b4, 3 * SIZE(B) + fclr c07 + + lda BO, 4 * SIZE(B) + fclr c11 + lda AO, 4 * SIZE(AO) + fclr c15 + + lds $f31, 4 * SIZE(C1) + fclr c04 + lda L, -2(KK) + fclr c08 + + lds $f31, 4 
* SIZE(C2) + fclr c12 + fclr c16 + ble KK, $L18 + ble L, $L15 +#else +#ifdef LN + sll K, ZBASE_SHIFT + 1, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT + 1, TMP1 + addq AORIG, TMP1, AO + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(BO) + fclr c10 + LD b2, 1 * SIZE(BO) + fclr c14 + + LD b3, 2 * SIZE(BO) + fclr c03 + LD b4, 3 * SIZE(BO) + fclr c07 + + lda BO, 4 * SIZE(BO) + fclr c11 + lda AO, 4 * SIZE(AO) + fclr c15 + + lds $f31, 4 * SIZE(C1) + fclr c04 + lda L, -2(TMP1) + fclr c08 + + lds $f31, 4 * SIZE(C2) + fclr c12 + fclr c16 + ble TMP1, $L18 + ble L, $L15 +#endif + .align 5 + +$L12: +/* 1 */ + ADD1 c11, t1, c11 +#ifndef EV4 + ldq $31, PREFETCHSIZE * SIZE(AO) +#else + unop +#endif + MUL b1, a1, t1 +#ifndef EV4 + ldl $31, PREFETCHSIZE * SIZE(BO) +#else + unop +#endif + + ADD3 c12, t2, c12 + unop + MUL b1, a2, t2 + unop + + ADD2 c16, t3, c16 + unop + MUL b2, a2, t3 + LD a5, 0 * SIZE(AO) + + ADD4 c15, t4, c15 + unop + MUL b2, a1, t4 + LD b5, 0 * SIZE(BO) + +/* 2 */ + ADD1 c01, t1, c01 + UNOP + MUL b1, a3, t1 + UNOP + + ADD3 c02, t2, c02 + UNOP + MUL b1, a4, t2 + UNOP + + ADD2 c06, t3, c06 + unop + MUL b2, a4, t3 + unop + + ADD4 c05, t4, c05 + unop + MUL b4, a1, t4 + unop + +/* 3 */ + ADD1 c03, t1, c03 + unop + MUL b3, a1, t1 + unop + + ADD3 c04, t2, c04 + unop + MUL b3, a2, t2 + unop + + ADD2 c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD4 c13, t4, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + +/* 4 */ + ADD1 c09, t1, c09 + unop + MUL b3, a3, t1 + LD a6, 2 * SIZE(AO) + + ADD3 c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD2 c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, 3 * SIZE(AO) + + ADD4 c07, t4, c07 + unop + MUL b4, a3, t4 + LD b4, 3 * SIZE(BO) + +/* 5 */ + ADD1 c11, t1, c11 + unop + MUL b5, a5, t1 + LD a1, 4 * SIZE(AO) + + ADD3 c12, t2, c12 + lda L, -2(L) + MUL b5, a2, t2 + LD b1, 4 * SIZE(BO) + + ADD2 c16, t3, c16 + unop + MUL b2, a2, t3 + unop + + ADD4 c15, t4, c15 + unop + MUL b2, a5, t4 + unop + +/* 6 */ + ADD1 c01, t1, c01 + unop + MUL b5, a6, t1 + unop + + ADD3 c02, t2, c02 + unop + MUL b5, a4, t2 + unop + + ADD2 c06, t3, c06 + unop + MUL b2, a4, t3 + unop + + ADD4 c05, t4, c05 + unop + MUL b4, a5, t4 + unop + +/* 7 */ + ADD1 c03, t1, c03 + lda AO, 8 * SIZE(AO) + MUL b3, a5, t1 + unop + + ADD3 c04, t2, c04 + lda BO, 8 * SIZE(BO) + MUL b3, a2, t2 + unop + + ADD2 c08, t3, c08 + unop + MUL b4, a2, t3 + LD a2, -3 * SIZE(AO) + + ADD4 c13, t4, c13 + unop + MUL b2, a6, t4 + LD b2, -3 * SIZE(BO) + +/* 8 */ + ADD1 c09, t1, c09 + unop + MUL b3, a6, t1 + LD a3, -2 * SIZE(AO) + + ADD3 c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, -2 * SIZE(BO) + + ADD2 c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD4 c07, t4, c07 + MUL b4, a6, t4 + LD b4, -1 * SIZE(BO) + bgt L, $L12 + .align 4 + +$L15: + ADD1 c11, t1, c11 + unop + MUL b1, a1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L17 +#else + blbs TMP1, $L17 +#endif + .align 4 + + ADD3 c12, t2, c12 + MUL b1, a2, t2 + ADD2 c16, t3, c16 + MUL b2, a2, t3 + + ADD4 c15, t4, c15 + MUL b2, a1, t4 + ADD1 c01, t1, c01 + MUL b1, a3, t1 + + ADD3 c02, t2, c02 + unop + MUL b1, a4, t2 + LD b1, 0 * SIZE(BO) + + ADD2 c06, t3, c06 + MUL b2, a4, t3 + ADD4 c05, t4, c05 + MUL b4, a1, t4 + + ADD1 c03, t1, c03 + unop + MUL b3, a1, t1 + LD a1, 0 * SIZE(AO) + + ADD3 c04, t2, c04 + unop + MUL b3, a2, t2 + unop + + ADD2 c08, t3, c08 + unop + MUL 
b4, a2, t3 + LD a2, 1 * SIZE(AO) + + ADD4 c13, t4, c13 + unop + MUL b2, a3, t4 + LD b2, 1 * SIZE(BO) + + ADD1 c09, t1, c09 + unop + MUL b3, a3, t1 + lda AO, 4 * SIZE(AO) + + ADD3 c10, t2, c10 + unop + MUL b3, a4, t2 + LD b3, 2 * SIZE(BO) + + ADD2 c14, t3, c14 + unop + MUL b4, a4, t3 + LD a4, -1 * SIZE(AO) + + ADD4 c07, t4, c07 + unop + MUL b4, a3, t4 + LD a3, -2 * SIZE(AO) + + ADD1 c11, t1, c11 + LD b4, 3 * SIZE(BO) + MUL b1, a1, t1 + lda BO, 4 * SIZE(BO) + .align 4 + +$L17: + ADD3 c12, t2, c12 + MUL b1, a2, t2 + ADD2 c16, t3, c16 + MUL b2, a2, t3 + + ADD4 c15, t4, c15 + MUL b2, a1, t4 + ADD1 c01, t1, c01 + MUL b1, a3, t1 + + ADD3 c02, t2, c02 + MUL b1, a4, t2 + ADD2 c06, t3, c06 + MUL b2, a4, t3 + + ADD4 c05, t4, c05 + MUL b4, a1, t4 + ADD1 c03, t1, c03 + MUL b3, a1, t1 + + ADD3 c04, t2, c04 + MUL b3, a2, t2 + ADD2 c08, t3, c08 + MUL b4, a2, t3 + + ADD4 c13, t4, c13 + MUL b2, a3, t4 + ADD1 c09, t1, c09 + MUL b3, a3, t1 + + ADD3 c10, t2, c10 + MUL b3, a4, t2 + ADD2 c14, t3, c14 + MUL b4, a4, t3 + + ADD4 c07, t4, c07 + lda AO, 4 * SIZE(AO) + MUL b4, a3, t4 + lda BO, 4 * SIZE(BO) + + ADD1 c11, t1, c11 + ADD3 c12, t2, c12 + ADD2 c16, t3, c16 + ADD4 c15, t4, c15 + + ADD c01, c06, c01 + ADD c02, c05, c02 + ADD c03, c08, c03 + ADD c04, c07, c04 + + ADD c09, c14, c09 + ADD c10, c13, c10 + ADD c11, c16, c11 + ADD c12, c15, c12 + .align 4 + +$L18: +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 2, TMP1 +#else + subq KK, 2, TMP1 +#endif + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addq AORIG, TMP2, AO + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addq B, TMP2, BO +#else + lda AO, -4 * SIZE(AO) + lda BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + LD b1, 4 * SIZE(BO) + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c09, c09 + SUB a4, c10, c10 + + SUB b1, c03, c03 + SUB b2, c04, c04 + SUB b3, c11, c11 + SUB b4, c12, c12 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 4 * SIZE(AO) + LD b2, 5 * SIZE(AO) + LD b3, 6 * SIZE(AO) + LD b4, 7 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c03, c03 + SUB a4, c04, c04 + + SUB b1, c09, c09 + SUB b2, c10, c10 + SUB b3, c11, c11 + SUB b4, c12, c12 +#endif + +#ifdef LN + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + LD a3, 4 * SIZE(AO) + LD a4, 5 * SIZE(AO) + + MUL a2, c04, t1 + MUL a2, c03, t2 + MUL a2, c12, t3 + MUL a2, c11, t4 + + MUL a1, c03, c03 + MUL a1, c04, c04 + MUL a1, c11, c11 + MUL a1, c12, c12 + + ADD5 c03, t1, c03 + ADD6 c04, t2, c04 + ADD5 c11, t3, c11 + ADD6 c12, t4, c12 + + MUL a3, c03, t1 + MUL a3, c04, t2 + MUL a3, c11, t3 + MUL a3, c12, t4 + + SUB c01, t1, c01 + SUB c02, t2, c02 + SUB c09, t3, c09 + SUB c10, t4, c10 + + MUL a4, c04, t1 + MUL a4, c03, t2 + MUL a4, c12, t3 + MUL a4, c11, t4 + + ADD6 c01, t1, c01 + ADD5 c02, t2, c02 + ADD6 c09, t3, c09 + ADD5 c10, t4, c10 + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c10, t3 + MUL a2, c09, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c09, c09 + MUL a1, c10, c10 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + ADD5 c09, t3, c09 + ADD6 c10, t4, c10 +#endif + +#ifdef LT + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c10, t3 + MUL a2, c09, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c09, c09 + MUL a1, c10, c10 + + ADD5 c01, t1, c01 + 
ADD6 c02, t2, c02 + ADD5 c09, t3, c09 + ADD6 c10, t4, c10 + + MUL a3, c01, t1 + MUL a3, c02, t2 + MUL a3, c09, t3 + MUL a3, c10, t4 + + SUB c03, t1, c03 + SUB c04, t2, c04 + SUB c11, t3, c11 + SUB c12, t4, c12 + + MUL a4, c02, t1 + MUL a4, c01, t2 + MUL a4, c10, t3 + MUL a4, c09, t4 + + ADD6 c03, t1, c03 + ADD5 c04, t2, c04 + ADD6 c11, t3, c11 + ADD5 c12, t4, c12 + + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + + MUL a2, c04, t1 + MUL a2, c03, t2 + MUL a2, c12, t3 + MUL a2, c11, t4 + + MUL a1, c03, c03 + MUL a1, c04, c04 + MUL a1, c11, c11 + MUL a1, c12, c12 + + ADD5 c03, t1, c03 + ADD6 c04, t2, c04 + ADD5 c11, t3, c11 + ADD6 c12, t4, c12 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c04, t3 + MUL a2, c03, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + ADD5 c03, t3, c03 + ADD6 c04, t4, c04 + + MUL a3, c01, t1 + MUL a3, c02, t2 + MUL a3, c03, t3 + MUL a3, c04, t4 + + SUB c09, t1, c09 + SUB c10, t2, c10 + SUB c11, t3, c11 + SUB c12, t4, c12 + + MUL a4, c02, t1 + MUL a4, c01, t2 + MUL a4, c04, t3 + MUL a4, c03, t4 + + ADD6 c09, t1, c09 + ADD5 c10, t2, c10 + ADD6 c11, t3, c11 + ADD5 c12, t4, c12 + + LD a1, 6 * SIZE(BO) + LD a2, 7 * SIZE(BO) + + MUL a2, c10, t1 + MUL a2, c09, t2 + MUL a2, c12, t3 + MUL a2, c11, t4 + + MUL a1, c09, c09 + MUL a1, c10, c10 + MUL a1, c11, c11 + MUL a1, c12, c12 + + ADD5 c09, t1, c09 + ADD6 c10, t2, c10 + ADD5 c11, t3, c11 + ADD6 c12, t4, c12 +#endif + +#ifdef RT + LD a1, 6 * SIZE(BO) + LD a2, 7 * SIZE(BO) + LD a3, 4 * SIZE(BO) + LD a4, 5 * SIZE(BO) + + MUL a2, c10, t1 + MUL a2, c09, t2 + MUL a2, c12, t3 + MUL a2, c11, t4 + + MUL a1, c09, c09 + MUL a1, c10, c10 + MUL a1, c11, c11 + MUL a1, c12, c12 + + ADD5 c09, t1, c09 + ADD6 c10, t2, c10 + ADD5 c11, t3, c11 + ADD6 c12, t4, c12 + + MUL a3, c09, t1 + MUL a3, c10, t2 + MUL a3, c11, t3 + MUL a3, c12, t4 + + SUB c01, t1, c01 + SUB c02, t2, c02 + SUB c03, t3, c03 + SUB c04, t4, c04 + + MUL a4, c10, t1 + MUL a4, c09, t2 + MUL a4, c12, t3 + MUL a4, c11, t4 + + ADD6 c01, t1, c01 + ADD5 c02, t2, c02 + ADD6 c03, t3, c03 + ADD5 c04, t4, c04 + + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c04, t3 + MUL a2, c03, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c03, c03 + MUL a1, c04, c04 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + ADD5 c03, t3, c03 + ADD6 c04, t4, c04 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c10, 3 * SIZE(BO) + + ST c03, 4 * SIZE(BO) + ST c04, 5 * SIZE(BO) + ST c11, 6 * SIZE(BO) + ST c12, 7 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c03, 2 * SIZE(AO) + ST c04, 3 * SIZE(AO) + + ST c09, 4 * SIZE(AO) + ST c10, 5 * SIZE(AO) + ST c11, 6 * SIZE(AO) + ST c12, 7 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -4 * SIZE(C1) + lda C2, -4 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c03, 2 * SIZE(C1) + ST c04, 3 * SIZE(C1) + + ST c09, 0 * SIZE(C2) + ST c10, 1 * SIZE(C2) + ST c11, 2 * SIZE(C2) + ST c12, 3 * SIZE(C2) + +#ifndef LN + lda C1, 4 * SIZE(C1) + lda C2, 4 * SIZE(C2) +#endif + + fclr t1 + fclr t2 + fclr t3 + fclr t4 + +#ifdef RT + sll K, ZBASE_SHIFT + 1, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, ZBASE_SHIFT + 1, TMP1 + addq AO, TMP1, AO + addq BO, TMP1, BO +#endif + +#ifdef LT + addq KK, 2, KK +#endif 
+ +#ifdef LN + subq KK, 2, KK +#endif + fclr c01 + fclr c05 + + lda I, -1(I) + bgt I, $L11 + .align 4 + +$L20: + and M, 1, I + ble I, $L29 + +#if defined(LT) || defined(RN) + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(B) + fclr c10 + LD b2, 1 * SIZE(B) + fclr c14 + + LD b3, 2 * SIZE(B) + lda AO, 2 * SIZE(AO) + LD b4, 3 * SIZE(B) + lda BO, 4 * SIZE(B) + + lda L, -2(KK) + + ble KK, $L28 + ble L, $L25 +#else +#ifdef LN + sll K, ZBASE_SHIFT + 0, TMP1 + subq AORIG, TMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT + 0, TMP1 + addq AORIG, TMP1, AO + sll KK, ZBASE_SHIFT + 1, TMP1 + addq B, TMP1, BO + + subq K, KK, TMP1 + + LD a1, 0 * SIZE(AO) + fclr c09 + LD a2, 1 * SIZE(AO) + fclr c13 + + LD a3, 2 * SIZE(AO) + fclr c02 + LD a4, 3 * SIZE(AO) + fclr c06 + + LD b1, 0 * SIZE(BO) + fclr c10 + LD b2, 1 * SIZE(BO) + fclr c14 + + LD b3, 2 * SIZE(BO) + lda AO, 2 * SIZE(AO) + LD b4, 3 * SIZE(BO) + lda BO, 4 * SIZE(BO) + + lda L, -2(TMP1) + + ble TMP1, $L28 + ble L, $L25 +#endif + .align 5 + +$L22: + ADD1 c09, t1, c09 + unop + MUL a1, b1, t1 + unop + + ADD3 c10, t2, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD4 c13, t3, c13 + unop + MUL a1, b2, t3 + lda BO, 8 * SIZE(BO) + + ADD2 c14, t4, c14 + unop + MUL a2, b2, t4 + LD b2, -7 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b3, t1 + unop + + ADD3 c02, t2, c02 + unop + MUL a2, b3, t2 + LD b3, -6 * SIZE(BO) + + ADD4 c05, t3, c05 + unop + MUL a1, b4, t3 + LD a1, 2 * SIZE(AO) + + ADD2 c06, t4, c06 + MUL a2, b4, t4 + LD b5, -5 * SIZE(BO) + + ADD1 c09, t1, c09 + unop + MUL a3, b1, t1 + LD a2, 3 * SIZE(AO) + + ADD3 c10, t2, c10 + unop + MUL a4, b1, t2 + LD b1, -4 * SIZE(BO) + + ADD4 c13, t3, c13 + unop + MUL a3, b2, t3 + lda AO, 4 * SIZE(AO) + + ADD2 c14, t4, c14 + MUL a4, b2, t4 + LD b2, -3 * SIZE(BO) + + ADD1 c01, t1, c01 + lda L, -2(L) + MUL a3, b3, t1 + LD b4, -1 * SIZE(BO) + + ADD3 c02, t2, c02 + unop + MUL a4, b3, t2 + LD b3, -2 * SIZE(BO) + + ADD4 c05, t3, c05 + unop + MUL a3, b5, t3 + LD a3, 0 * SIZE(AO) + + ADD2 c06, t4, c06 + MUL a4, b5, t4 + LD a4, 1 * SIZE(AO) + bgt L, $L22 + .align 4 + +$L25: + ADD1 c09, t1, c09 + MUL a1, b1, t1 +#if defined(LT) || defined(RN) + blbs KK, $L27 +#else + blbs TMP1, $L27 +#endif + .align 4 + + ADD3 c10, t2, c10 + unop + MUL a2, b1, t2 + LD b1, 0 * SIZE(BO) + + ADD4 c13, t3, c13 + unop + MUL a1, b2, t3 + unop + + ADD2 c14, t4, c14 + unop + MUL a2, b2, t4 + LD b2, 1 * SIZE(BO) + + ADD1 c01, t1, c01 + unop + MUL a1, b3, t1 + lda AO, 2 * SIZE(AO) + + ADD3 c02, t2, c02 + unop + MUL a2, b3, t2 + LD b3, 2 * SIZE(BO) + + ADD4 c05, t3, c05 + unop + MUL a1, b4, t3 + LD a1, -2 * SIZE(AO) + + ADD2 c06, t4, c06 + unop + MUL a2, b4, t4 + LD a2, -1 * SIZE(AO) + + ADD1 c09, t1, c09 + LD b4, 3 * SIZE(BO) + MUL a1, b1, t1 + lda BO, 4 * SIZE(BO) + .align 4 + +$L27: + ADD3 c10, t2, c10 + MUL a2, b1, t2 + ADD4 c13, t3, c13 + MUL a1, b2, t3 + + ADD2 c14, t4, c14 + MUL a2, b2, t4 + ADD1 c01, t1, c01 + MUL a1, b3, t1 + + ADD3 c02, t2, c02 + MUL a2, b3, t2 + ADD4 c05, t3, c05 + MUL a1, b4, t3 + + ADD2 c06, t4, c06 + lda AO, 2 * SIZE(AO) + MUL a2, b4, t4 + lda BO, 4 * SIZE(BO) + + ADD1 c09, t1, c09 + ADD3 c10, t2, c10 + ADD4 c13, t3, c13 + ADD2 c14, t4, c14 + + ADD c01, c06, c01 + ADD c02, c05, c02 + ADD c09, c14, c09 + ADD c10, c13, c10 + .align 4 + +$L28: +#if defined(LN) || defined(RT) +#ifdef LN + subq KK, 1, TMP1 +#else + subq KK, 2, TMP1 +#endif + sll TMP1, ZBASE_SHIFT + 0, TMP2 + addq AORIG, TMP2, AO + sll TMP1, 
ZBASE_SHIFT + 1, TMP2 + addq B, TMP2, BO +#else + lda AO, -2 * SIZE(AO) + lda BO, -4 * SIZE(BO) +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c09, c09 + SUB a4, c10, c10 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + SUB a1, c01, c01 + SUB a2, c02, c02 + SUB a3, c09, c09 + SUB a4, c10, c10 +#endif + +#if defined(LN) || defined(LT) + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a2, c10, t3 + MUL a2, c09, t4 + + MUL a1, c01, c01 + MUL a1, c02, c02 + MUL a1, c09, c09 + MUL a1, c10, c10 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + ADD5 c09, t3, c09 + ADD6 c10, t4, c10 +#endif + +#ifdef RN + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + LD a3, 2 * SIZE(BO) + LD a4, 3 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 + + MUL a3, c01, t1 + MUL a3, c02, t2 + SUB c09, t1, c09 + SUB c10, t2, c10 + + MUL a4, c02, t1 + MUL a4, c01, t2 + ADD6 c09, t1, c09 + ADD5 c10, t2, c10 + + LD a1, 6 * SIZE(BO) + LD a2, 7 * SIZE(BO) + + MUL a2, c10, t1 + MUL a2, c09, t2 + MUL a1, c09, c09 + MUL a1, c10, c10 + + ADD5 c09, t1, c09 + ADD6 c10, t2, c10 +#endif + +#ifdef RT + LD a1, 6 * SIZE(BO) + LD a2, 7 * SIZE(BO) + LD a3, 4 * SIZE(BO) + LD a4, 5 * SIZE(BO) + + MUL a2, c10, t1 + MUL a2, c09, t2 + MUL a1, c09, c09 + MUL a1, c10, c10 + + ADD5 c09, t1, c09 + ADD6 c10, t2, c10 + + MUL a3, c09, t1 + MUL a3, c10, t2 + SUB c01, t1, c01 + SUB c02, t2, c02 + + MUL a4, c10, t1 + MUL a4, c09, t2 + ADD6 c01, t1, c01 + ADD5 c02, t2, c02 + + LD a1, 0 * SIZE(BO) + LD a2, 1 * SIZE(BO) + + MUL a2, c02, t1 + MUL a2, c01, t2 + MUL a1, c01, c01 + MUL a1, c02, c02 + + ADD5 c01, t1, c01 + ADD6 c02, t2, c02 +#endif + +#if defined(LN) || defined(LT) + ST c01, 0 * SIZE(BO) + ST c02, 1 * SIZE(BO) + ST c09, 2 * SIZE(BO) + ST c10, 3 * SIZE(BO) +#else + ST c01, 0 * SIZE(AO) + ST c02, 1 * SIZE(AO) + ST c09, 2 * SIZE(AO) + ST c10, 3 * SIZE(AO) +#endif + +#ifdef LN + lda C1, -2 * SIZE(C1) + lda C2, -2 * SIZE(C2) +#endif + + ST c01, 0 * SIZE(C1) + ST c02, 1 * SIZE(C1) + ST c09, 0 * SIZE(C2) + ST c10, 1 * SIZE(C2) + +#ifndef LN + lda C1, 2 * SIZE(C1) + lda C2, 2 * SIZE(C2) +#endif + +#ifdef RT + sll K, ZBASE_SHIFT, TMP1 + addq AORIG, TMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + subq K, KK, TMP1 + sll TMP1, ZBASE_SHIFT + 0, TMP2 + addq AO, TMP2, AO + sll TMP1, ZBASE_SHIFT + 1, TMP2 + addq BO, TMP2, BO +#endif + +#ifdef LT + addq KK, 1, KK +#endif + +#ifdef LN + subq KK, 1, KK +#endif + .align 4 + +$L29: +#ifdef LN + sll K, ZBASE_SHIFT + 1, TMP1 + addq B, TMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + addq KK, 2, KK +#endif + +#ifdef RT + subq KK, 2, KK +#endif + + lda J, -1(J) + bgt J, $L01 + .align 4 + +$L999: + ldt $f2, 0($sp) + ldt $f3, 8($sp) + ldt $f4, 16($sp) + ldt $f5, 24($sp) + ldt $f6, 32($sp) + ldt $f7, 40($sp) + ldt $f8, 48($sp) + ldt $f9, 56($sp) + clr $0 + lda $sp, STACKSIZE($sp) + ret + .ident VERSION + .end CNAME diff --git a/kernel/generic/cabs.c b/kernel/generic/cabs.c new file mode 100644 index 0000000000..f76f69b20b --- /dev/null +++ b/kernel/generic/cabs.c @@ -0,0 +1,44 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#include <math.h>
+#include "common.h"
+
+FLOAT NAME(FLOAT *a){
+  return fabs(a[0]) + fabs(a[1]);
+}
diff --git a/kernel/generic/gemm_beta.c b/kernel/generic/gemm_beta.c
new file mode 100644
index 0000000000..525ff9495e
--- /dev/null
+++ b/kernel/generic/gemm_beta.c
@@ -0,0 +1,142 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED.
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, + FLOAT *dummy2, BLASLONG dummy3, FLOAT *dummy4, BLASLONG dummy5, + FLOAT *c, BLASLONG ldc){ + + BLASLONG i, j; + FLOAT *c_offset1, *c_offset; + FLOAT ctemp1, ctemp2, ctemp3, ctemp4; + FLOAT ctemp5, ctemp6, ctemp7, ctemp8; + + c_offset = c; + + if (beta == ZERO){ + + j = n; + do { + c_offset1 = c_offset; + c_offset += ldc; + + i = (m >> 3); + if (i > 0){ + do { + *(c_offset1 + 0) = ZERO; + *(c_offset1 + 1) = ZERO; + *(c_offset1 + 2) = ZERO; + *(c_offset1 + 3) = ZERO; + *(c_offset1 + 4) = ZERO; + *(c_offset1 + 5) = ZERO; + *(c_offset1 + 6) = ZERO; + *(c_offset1 + 7) = ZERO; + c_offset1 += 8; + i --; + } while (i > 0); + } + + i = (m & 7); + if (i > 0){ + do { + *c_offset1 = ZERO; + c_offset1 ++; + i --; + } while (i > 0); + } + j --; + } while (j > 0); + + } else { + + j = n; + do { + c_offset1 = c_offset; + c_offset += ldc; + + i = (m >> 3); + if (i > 0){ + do { + ctemp1 = *(c_offset1 + 0); + ctemp2 = *(c_offset1 + 1); + ctemp3 = *(c_offset1 + 2); + ctemp4 = *(c_offset1 + 3); + ctemp5 = *(c_offset1 + 4); + ctemp6 = *(c_offset1 + 5); + ctemp7 = *(c_offset1 + 6); + ctemp8 = *(c_offset1 + 7); + + ctemp1 *= beta; + ctemp2 *= beta; + ctemp3 *= beta; + ctemp4 *= beta; + ctemp5 *= beta; + ctemp6 *= beta; + ctemp7 *= beta; + ctemp8 *= beta; + + *(c_offset1 + 0) = ctemp1; + *(c_offset1 + 1) = ctemp2; + *(c_offset1 + 2) = ctemp3; + *(c_offset1 + 3) = ctemp4; + *(c_offset1 + 4) = ctemp5; + *(c_offset1 + 5) = ctemp6; + *(c_offset1 + 6) = ctemp7; + *(c_offset1 + 7) = ctemp8; + c_offset1 += 8; + i --; + } while (i > 0); + } + + i = (m & 7); + if (i > 0){ + do { + ctemp1 = *c_offset1; + ctemp1 *= beta; + *c_offset1 = ctemp1; + c_offset1 ++; + i --; + } while (i > 0); + } + j --; + } while (j > 0); + + } + return 0; +}; diff --git a/kernel/generic/gemm_ncopy_1.c b/kernel/generic/gemm_ncopy_1.c new file mode 100644 index 0000000000..e990de771b --- /dev/null +++ b/kernel/generic/gemm_ncopy_1.c @@ -0,0 +1,90 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
+  BLASLONG i, j;
+  FLOAT *a_offset, *a_offset1;
+  FLOAT *b_offset;
+
+  a_offset = a;
+  b_offset = b;
+
+  j = n;
+
+  if (j > 0){
+    do {
+      a_offset1 = a_offset;
+      a_offset += lda;
+
+      i = (m >> 3);
+
+      if (i > 0){
+        do {
+          *(b_offset + 0) = *(a_offset1 + 0);
+          *(b_offset + 1) = *(a_offset1 + 1);
+          *(b_offset + 2) = *(a_offset1 + 2);
+          *(b_offset + 3) = *(a_offset1 + 3);
+          *(b_offset + 4) = *(a_offset1 + 4);
+          *(b_offset + 5) = *(a_offset1 + 5);
+          *(b_offset + 6) = *(a_offset1 + 6);
+          *(b_offset + 7) = *(a_offset1 + 7);
+          a_offset1 += 8;
+          b_offset += 8;
+          i --;
+        } while (i > 0);
+      }
+
+      i = (m & 7);
+
+      if (i > 0){
+        do {
+          *(b_offset + 0) = *(a_offset1 + 0);
+          a_offset1 ++;
+          b_offset ++;
+          i --;
+        } while (i > 0);
+      }
+      j --;
+    } while (j > 0);
+  }
+
+  return 0;
+}
diff --git a/kernel/generic/gemm_ncopy_16.c b/kernel/generic/gemm_ncopy_16.c
new file mode 100644
index 0000000000..4a9269ec12
--- /dev/null
+++ b/kernel/generic/gemm_ncopy_16.c
@@ -0,0 +1,437 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED.
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG i, j; + + FLOAT *aoffset; + FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; + FLOAT *aoffset9, *aoffset10, *aoffset11, *aoffset12; + FLOAT *aoffset13, *aoffset14, *aoffset15, *aoffset16; + + FLOAT *boffset; + FLOAT ctemp01, ctemp02, ctemp03, ctemp04; + FLOAT ctemp05, ctemp06, ctemp07, ctemp08; + FLOAT ctemp09, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + FLOAT ctemp17, ctemp18, ctemp19, ctemp20; + FLOAT ctemp21, ctemp22, ctemp23, ctemp24; + FLOAT ctemp25, ctemp26, ctemp27, ctemp28; + FLOAT ctemp29, ctemp30, ctemp31, ctemp32; + + aoffset = a; + boffset = b; + + j = (n >> 4); + if (j > 0){ + do{ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset5 = aoffset4 + lda; + aoffset6 = aoffset5 + lda; + aoffset7 = aoffset6 + lda; + aoffset8 = aoffset7 + lda; + aoffset9 = aoffset8 + lda; + aoffset10 = aoffset9 + lda; + aoffset11 = aoffset10 + lda; + aoffset12 = aoffset11 + lda; + aoffset13 = aoffset12 + lda; + aoffset14 = aoffset13 + lda; + aoffset15 = aoffset14 + lda; + aoffset16 = aoffset15 + lda; + aoffset += 16 * lda; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + ctemp05 = *(aoffset3 + 0); + ctemp06 = *(aoffset3 + 1); + ctemp07 = *(aoffset4 + 0); + ctemp08 = *(aoffset4 + 1); + + ctemp09 = *(aoffset5 + 0); + ctemp10 = *(aoffset5 + 1); + ctemp11 = *(aoffset6 + 0); + ctemp12 = *(aoffset6 + 1); + + ctemp13 = *(aoffset7 + 0); + ctemp14 = *(aoffset7 + 1); + ctemp15 = *(aoffset8 + 0); + ctemp16 = *(aoffset8 + 1); + + ctemp17 = *(aoffset9 + 0); + ctemp18 = *(aoffset9 + 1); + ctemp19 = *(aoffset10 + 0); + ctemp20 = *(aoffset10 + 1); + + ctemp21 = *(aoffset11 + 0); + ctemp22 = *(aoffset11 + 1); + ctemp23 = *(aoffset12 + 0); + ctemp24 = *(aoffset12 + 1); + + ctemp25 = *(aoffset13 + 0); + ctemp26 = *(aoffset13 + 1); + ctemp27 = *(aoffset14 + 0); + ctemp28 = *(aoffset14 + 1); + + ctemp29 = *(aoffset15 + 0); + ctemp30 = *(aoffset15 + 1); + ctemp31 = *(aoffset16 + 0); + ctemp32 = *(aoffset16 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp05; + *(boffset + 3) = ctemp07; + *(boffset + 4) = ctemp09; + *(boffset + 5) = ctemp11; + *(boffset + 6) = ctemp13; + *(boffset + 7) = ctemp15; + + *(boffset + 8) = ctemp17; + *(boffset + 9) = ctemp19; + *(boffset + 10) = ctemp21; + *(boffset + 11) = 
ctemp23; + *(boffset + 12) = ctemp25; + *(boffset + 13) = ctemp27; + *(boffset + 14) = ctemp29; + *(boffset + 15) = ctemp31; + + *(boffset + 16) = ctemp02; + *(boffset + 17) = ctemp04; + *(boffset + 18) = ctemp06; + *(boffset + 19) = ctemp08; + *(boffset + 20) = ctemp10; + *(boffset + 21) = ctemp12; + *(boffset + 22) = ctemp14; + *(boffset + 23) = ctemp16; + + *(boffset + 24) = ctemp18; + *(boffset + 25) = ctemp20; + *(boffset + 26) = ctemp22; + *(boffset + 27) = ctemp24; + *(boffset + 28) = ctemp26; + *(boffset + 29) = ctemp28; + *(boffset + 30) = ctemp30; + *(boffset + 31) = ctemp32; + + aoffset1 += 2; + aoffset2 += 2; + aoffset3 += 2; + aoffset4 += 2; + aoffset5 += 2; + aoffset6 += 2; + aoffset7 += 2; + aoffset8 += 2; + + aoffset9 += 2; + aoffset10 += 2; + aoffset11 += 2; + aoffset12 += 2; + aoffset13 += 2; + aoffset14 += 2; + aoffset15 += 2; + aoffset16 += 2; + boffset += 32; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp03 = *(aoffset2 + 0); + ctemp05 = *(aoffset3 + 0); + ctemp07 = *(aoffset4 + 0); + ctemp09 = *(aoffset5 + 0); + ctemp11 = *(aoffset6 + 0); + ctemp13 = *(aoffset7 + 0); + ctemp15 = *(aoffset8 + 0); + + ctemp17 = *(aoffset9 + 0); + ctemp19 = *(aoffset10 + 0); + ctemp21 = *(aoffset11 + 0); + ctemp23 = *(aoffset12 + 0); + ctemp25 = *(aoffset13 + 0); + ctemp27 = *(aoffset14 + 0); + ctemp29 = *(aoffset15 + 0); + ctemp31 = *(aoffset16 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp05; + *(boffset + 3) = ctemp07; + *(boffset + 4) = ctemp09; + *(boffset + 5) = ctemp11; + *(boffset + 6) = ctemp13; + *(boffset + 7) = ctemp15; + + *(boffset + 8) = ctemp17; + *(boffset + 9) = ctemp19; + *(boffset + 10) = ctemp21; + *(boffset + 11) = ctemp23; + *(boffset + 12) = ctemp25; + *(boffset + 13) = ctemp27; + *(boffset + 14) = ctemp29; + *(boffset + 15) = ctemp31; + + boffset += 16; + } + j--; + }while(j > 0); + } /* end of if(j > 0) */ + + if (n & 8){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset5 = aoffset4 + lda; + aoffset6 = aoffset5 + lda; + aoffset7 = aoffset6 + lda; + aoffset8 = aoffset7 + lda; + aoffset += 8 * lda; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + ctemp05 = *(aoffset3 + 0); + ctemp06 = *(aoffset3 + 1); + ctemp07 = *(aoffset4 + 0); + ctemp08 = *(aoffset4 + 1); + + ctemp09 = *(aoffset5 + 0); + ctemp10 = *(aoffset5 + 1); + ctemp11 = *(aoffset6 + 0); + ctemp12 = *(aoffset6 + 1); + + ctemp13 = *(aoffset7 + 0); + ctemp14 = *(aoffset7 + 1); + ctemp15 = *(aoffset8 + 0); + ctemp16 = *(aoffset8 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp05; + *(boffset + 3) = ctemp07; + *(boffset + 4) = ctemp09; + *(boffset + 5) = ctemp11; + *(boffset + 6) = ctemp13; + *(boffset + 7) = ctemp15; + + *(boffset + 8) = ctemp02; + *(boffset + 9) = ctemp04; + *(boffset + 10) = ctemp06; + *(boffset + 11) = ctemp08; + *(boffset + 12) = ctemp10; + *(boffset + 13) = ctemp12; + *(boffset + 14) = ctemp14; + *(boffset + 15) = ctemp16; + + aoffset1 += 2; + aoffset2 += 2; + aoffset3 += 2; + aoffset4 += 2; + aoffset5 += 2; + aoffset6 += 2; + aoffset7 += 2; + aoffset8 += 2; + + boffset += 16; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp03 = *(aoffset2 + 0); + ctemp05 = *(aoffset3 + 0); + ctemp07 = *(aoffset4 + 0); + ctemp09 = *(aoffset5 + 0); + ctemp11 = *(aoffset6 + 0); + 
ctemp13 = *(aoffset7 + 0); + ctemp15 = *(aoffset8 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp05; + *(boffset + 3) = ctemp07; + *(boffset + 4) = ctemp09; + *(boffset + 5) = ctemp11; + *(boffset + 6) = ctemp13; + *(boffset + 7) = ctemp15; + + boffset += 8; + } + } + + if (n & 4){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset += 4 * lda; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + ctemp05 = *(aoffset3 + 0); + ctemp06 = *(aoffset3 + 1); + ctemp07 = *(aoffset4 + 0); + ctemp08 = *(aoffset4 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp05; + *(boffset + 3) = ctemp07; + *(boffset + 4) = ctemp02; + *(boffset + 5) = ctemp04; + *(boffset + 6) = ctemp06; + *(boffset + 7) = ctemp08; + + aoffset1 += 2; + aoffset2 += 2; + aoffset3 += 2; + aoffset4 += 2; + boffset += 8; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp03 = *(aoffset2 + 0); + ctemp05 = *(aoffset3 + 0); + ctemp07 = *(aoffset4 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp05; + *(boffset + 3) = ctemp07; + boffset += 4; + } + } + + if (n & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset += 2 * lda; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp02; + *(boffset + 3) = ctemp04; + + aoffset1 += 2; + aoffset2 += 2; + boffset += 4; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp03 = *(aoffset2 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + boffset += 2; + } + } + + if (n & 1){ + aoffset1 = aoffset; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + + aoffset1 += 2; + boffset += 2; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + + *(boffset + 0) = ctemp01; + boffset += 1; + } + } + + return 0; +} diff --git a/kernel/generic/gemm_ncopy_2.c b/kernel/generic/gemm_ncopy_2.c new file mode 100644 index 0000000000..0ec807cc4c --- /dev/null +++ b/kernel/generic/gemm_ncopy_2.c @@ -0,0 +1,126 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG i, j; + FLOAT *a_offset, *a_offset1, *a_offset2; + FLOAT *b_offset; + + a_offset = a; + b_offset = b; + + j = (n >> 1); + + if (j > 0){ + do { + a_offset1 = a_offset; + a_offset2 = a_offset + lda; + a_offset += 2 * lda; + + i = (m >> 2); + + if (i > 0){ + do { + *(b_offset + 0) = *(a_offset1 + 0); + *(b_offset + 1) = *(a_offset2 + 0); + *(b_offset + 2) = *(a_offset1 + 1); + *(b_offset + 3) = *(a_offset2 + 1); + *(b_offset + 4) = *(a_offset1 + 2); + *(b_offset + 5) = *(a_offset2 + 2); + *(b_offset + 6) = *(a_offset1 + 3); + *(b_offset + 7) = *(a_offset2 + 3); + a_offset1 += 4; + a_offset2 += 4; + b_offset += 8; + i --; + } while (i > 0); + } + + i = (m & 3); + + if (i > 0){ + do { + *(b_offset + 0) = *(a_offset1 + 0); + *(b_offset + 1) = *(a_offset2 + 0); + a_offset1 ++; + a_offset2 ++; + b_offset += 2; + i --; + } while (i > 0); + } + j --; + } while (j > 0); + } + + if (n & 1){ + + i = (m >> 3); + if (i > 0){ + do { + *(b_offset + 0) = *(a_offset + 0); + *(b_offset + 1) = *(a_offset + 1); + *(b_offset + 2) = *(a_offset + 2); + *(b_offset + 3) = *(a_offset + 3); + *(b_offset + 4) = *(a_offset + 4); + *(b_offset + 5) = *(a_offset + 5); + *(b_offset + 6) = *(a_offset + 6); + *(b_offset + 7) = *(a_offset + 7); + a_offset += 8; + b_offset += 8; + i --; + } while (i > 0); + } + + i = (m & 7); + + if (i > 0){ + do { + *(b_offset + 0) = *(a_offset + 0); + a_offset ++; + b_offset ++; + i --; + } while (i > 0); + } + } + + return 0; +} + diff --git a/kernel/generic/gemm_ncopy_4.c b/kernel/generic/gemm_ncopy_4.c new file mode 100644 index 0000000000..1ecb93c658 --- /dev/null +++ b/kernel/generic/gemm_ncopy_4.c @@ -0,0 +1,230 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG i, j; + + FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; + FLOAT *b_offset; + FLOAT ctemp1, ctemp2, ctemp3, ctemp4; + FLOAT ctemp5, ctemp6, ctemp7, ctemp8; + FLOAT ctemp9, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + + a_offset = a; + b_offset = b; + + j = (n >> 2); + if (j > 0){ + do{ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset += 4 * lda; + + i = (m >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + ctemp5 = *(a_offset2 + 0); + ctemp6 = *(a_offset2 + 1); + ctemp7 = *(a_offset2 + 2); + ctemp8 = *(a_offset2 + 3); + + ctemp9 = *(a_offset3 + 0); + ctemp10 = *(a_offset3 + 1); + ctemp11 = *(a_offset3 + 2); + ctemp12 = *(a_offset3 + 3); + + ctemp13 = *(a_offset4 + 0); + ctemp14 = *(a_offset4 + 1); + ctemp15 = *(a_offset4 + 2); + ctemp16 = *(a_offset4 + 3); + + *(b_offset + 0) = ctemp1; + *(b_offset + 1) = ctemp5; + *(b_offset + 2) = ctemp9; + *(b_offset + 3) = ctemp13; + + *(b_offset + 4) = ctemp2; + *(b_offset + 5) = ctemp6; + *(b_offset + 6) = ctemp10; + *(b_offset + 7) = ctemp14; + + *(b_offset + 8) = ctemp3; + *(b_offset + 9) = ctemp7; + *(b_offset + 10) = ctemp11; + *(b_offset + 11) = ctemp15; + + *(b_offset + 12) = ctemp4; + *(b_offset + 13) = ctemp8; + *(b_offset + 14) = ctemp12; + *(b_offset + 15) = ctemp16; + + a_offset1 += 4; + a_offset2 += 4; + a_offset3 += 4; + a_offset4 += 4; + + b_offset += 16; + i --; + }while(i > 0); + } + + i = (m & 3); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp5 = *(a_offset2 + 0); + ctemp9 = *(a_offset3 + 0); + ctemp13 = *(a_offset4 + 0); + + *(b_offset + 0) = ctemp1; + *(b_offset + 1) = ctemp5; + *(b_offset + 2) = ctemp9; + *(b_offset + 3) = ctemp13; + + a_offset1 ++; + a_offset2 ++; + a_offset3 ++; + a_offset4 ++; + + b_offset += 4; + i --; + }while(i > 0); + } + j--; + }while(j > 0); + } /* end of if(j > 0) */ + + if (n & 2){ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + i = (m >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 
2); + ctemp4 = *(a_offset1 + 3); + + ctemp5 = *(a_offset2 + 0); + ctemp6 = *(a_offset2 + 1); + ctemp7 = *(a_offset2 + 2); + ctemp8 = *(a_offset2 + 3); + + *(b_offset + 0) = ctemp1; + *(b_offset + 1) = ctemp5; + *(b_offset + 2) = ctemp2; + *(b_offset + 3) = ctemp6; + + *(b_offset + 4) = ctemp3; + *(b_offset + 5) = ctemp7; + *(b_offset + 6) = ctemp4; + *(b_offset + 7) = ctemp8; + + a_offset1 += 4; + a_offset2 += 4; + b_offset += 8; + i --; + }while(i > 0); + } + + i = (m & 3); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp5 = *(a_offset2 + 0); + + *(b_offset + 0) = ctemp1; + *(b_offset + 1) = ctemp5; + + a_offset1 ++; + a_offset2 ++; + b_offset += 2; + i --; + }while(i > 0); + } + } /* end of if(j > 0) */ + + if (n & 1){ + a_offset1 = a_offset; + + i = (m >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + *(b_offset + 0) = ctemp1; + *(b_offset + 1) = ctemp2; + *(b_offset + 2) = ctemp3; + *(b_offset + 3) = ctemp4; + + a_offset1 += 4; + b_offset += 4; + i --; + }while(i > 0); + } + + i = (m & 3); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + *(b_offset + 0) = ctemp1; + a_offset1 ++; + b_offset += 1; + i --; + }while(i > 0); + } + } /* end of if(j > 0) */ + + return 0; +} diff --git a/kernel/generic/gemm_ncopy_8.c b/kernel/generic/gemm_ncopy_8.c new file mode 100644 index 0000000000..bdaaba1135 --- /dev/null +++ b/kernel/generic/gemm_ncopy_8.c @@ -0,0 +1,422 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG i, j; + + FLOAT *aoffset; + FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; + + FLOAT *boffset; + FLOAT ctemp01, ctemp02, ctemp03, ctemp04; + FLOAT ctemp05, ctemp06, ctemp07, ctemp08; + FLOAT ctemp09, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + FLOAT ctemp17, ctemp18, ctemp19, ctemp20; + FLOAT ctemp21, ctemp22, ctemp23, ctemp24; + FLOAT ctemp25, ctemp26, ctemp27, ctemp28; + FLOAT ctemp29, ctemp30, ctemp31, ctemp32; + FLOAT ctemp33, ctemp34, ctemp35, ctemp36; + FLOAT ctemp37, ctemp38, ctemp39, ctemp40; + FLOAT ctemp41, ctemp42, ctemp43, ctemp44; + FLOAT ctemp45, ctemp46, ctemp47, ctemp48; + FLOAT ctemp49, ctemp50, ctemp51, ctemp52; + FLOAT ctemp53, ctemp54, ctemp55, ctemp56; + FLOAT ctemp57, ctemp58, ctemp59, ctemp60; + FLOAT ctemp61, ctemp62, ctemp63, ctemp64; + + + aoffset = a; + boffset = b; + + j = (n >> 3); + if (j > 0){ + do{ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset5 = aoffset4 + lda; + aoffset6 = aoffset5 + lda; + aoffset7 = aoffset6 + lda; + aoffset8 = aoffset7 + lda; + aoffset += 8 * lda; + + i = (m >> 3); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + + ctemp09 = *(aoffset2 + 0); + ctemp10 = *(aoffset2 + 1); + ctemp11 = *(aoffset2 + 2); + ctemp12 = *(aoffset2 + 3); + ctemp13 = *(aoffset2 + 4); + ctemp14 = *(aoffset2 + 5); + ctemp15 = *(aoffset2 + 6); + ctemp16 = *(aoffset2 + 7); + + ctemp17 = *(aoffset3 + 0); + ctemp18 = *(aoffset3 + 1); + ctemp19 = *(aoffset3 + 2); + ctemp20 = *(aoffset3 + 3); + ctemp21 = *(aoffset3 + 4); + ctemp22 = *(aoffset3 + 5); + ctemp23 = *(aoffset3 + 6); + ctemp24 = *(aoffset3 + 7); + + ctemp25 = *(aoffset4 + 0); + ctemp26 = *(aoffset4 + 1); + ctemp27 = *(aoffset4 + 2); + ctemp28 = *(aoffset4 + 3); + ctemp29 = *(aoffset4 + 4); + ctemp30 = *(aoffset4 + 5); + ctemp31 = *(aoffset4 + 6); + ctemp32 = *(aoffset4 + 7); + + ctemp33 = *(aoffset5 + 0); + ctemp34 = *(aoffset5 + 1); + ctemp35 = *(aoffset5 + 2); + ctemp36 = *(aoffset5 + 3); + ctemp37 = *(aoffset5 + 4); + ctemp38 = *(aoffset5 + 5); + ctemp39 = *(aoffset5 + 6); + ctemp40 = *(aoffset5 + 7); + + ctemp41 = *(aoffset6 + 0); + ctemp42 = *(aoffset6 + 1); + ctemp43 = *(aoffset6 + 2); + ctemp44 = *(aoffset6 + 3); + ctemp45 = *(aoffset6 + 4); + ctemp46 = *(aoffset6 + 5); + ctemp47 = *(aoffset6 + 6); + ctemp48 = *(aoffset6 + 7); + + ctemp49 = *(aoffset7 + 0); + ctemp50 = *(aoffset7 + 1); + ctemp51 = *(aoffset7 + 2); + ctemp52 = *(aoffset7 + 3); + ctemp53 = *(aoffset7 + 4); + ctemp54 = *(aoffset7 + 5); + ctemp55 = *(aoffset7 + 6); + ctemp56 = *(aoffset7 + 7); + + ctemp57 = *(aoffset8 + 0); + ctemp58 = *(aoffset8 + 1); + ctemp59 = *(aoffset8 + 2); + ctemp60 = *(aoffset8 + 3); + ctemp61 = *(aoffset8 + 4); + ctemp62 = *(aoffset8 + 5); + ctemp63 = *(aoffset8 + 6); + ctemp64 = *(aoffset8 + 7); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp09; + *(boffset + 2) = ctemp17; + *(boffset + 3) = ctemp25; + *(boffset + 4) = ctemp33; + *(boffset + 5) = ctemp41; + *(boffset + 6) = ctemp49; + *(boffset + 7) = ctemp57; + + *(boffset + 8) = ctemp02; + *(boffset + 9) = 
ctemp10; + *(boffset + 10) = ctemp18; + *(boffset + 11) = ctemp26; + *(boffset + 12) = ctemp34; + *(boffset + 13) = ctemp42; + *(boffset + 14) = ctemp50; + *(boffset + 15) = ctemp58; + + *(boffset + 16) = ctemp03; + *(boffset + 17) = ctemp11; + *(boffset + 18) = ctemp19; + *(boffset + 19) = ctemp27; + *(boffset + 20) = ctemp35; + *(boffset + 21) = ctemp43; + *(boffset + 22) = ctemp51; + *(boffset + 23) = ctemp59; + + *(boffset + 24) = ctemp04; + *(boffset + 25) = ctemp12; + *(boffset + 26) = ctemp20; + *(boffset + 27) = ctemp28; + *(boffset + 28) = ctemp36; + *(boffset + 29) = ctemp44; + *(boffset + 30) = ctemp52; + *(boffset + 31) = ctemp60; + + *(boffset + 32) = ctemp05; + *(boffset + 33) = ctemp13; + *(boffset + 34) = ctemp21; + *(boffset + 35) = ctemp29; + *(boffset + 36) = ctemp37; + *(boffset + 37) = ctemp45; + *(boffset + 38) = ctemp53; + *(boffset + 39) = ctemp61; + + *(boffset + 40) = ctemp06; + *(boffset + 41) = ctemp14; + *(boffset + 42) = ctemp22; + *(boffset + 43) = ctemp30; + *(boffset + 44) = ctemp38; + *(boffset + 45) = ctemp46; + *(boffset + 46) = ctemp54; + *(boffset + 47) = ctemp62; + + *(boffset + 48) = ctemp07; + *(boffset + 49) = ctemp15; + *(boffset + 50) = ctemp23; + *(boffset + 51) = ctemp31; + *(boffset + 52) = ctemp39; + *(boffset + 53) = ctemp47; + *(boffset + 54) = ctemp55; + *(boffset + 55) = ctemp63; + + *(boffset + 56) = ctemp08; + *(boffset + 57) = ctemp16; + *(boffset + 58) = ctemp24; + *(boffset + 59) = ctemp32; + *(boffset + 60) = ctemp40; + *(boffset + 61) = ctemp48; + *(boffset + 62) = ctemp56; + *(boffset + 63) = ctemp64; + + aoffset1 += 8; + aoffset2 += 8; + aoffset3 += 8; + aoffset4 += 8; + aoffset5 += 8; + aoffset6 += 8; + aoffset7 += 8; + aoffset8 += 8; + boffset += 64; + i --; + }while(i > 0); + } + + i = (m & 7); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp09 = *(aoffset2 + 0); + ctemp17 = *(aoffset3 + 0); + ctemp25 = *(aoffset4 + 0); + ctemp33 = *(aoffset5 + 0); + ctemp41 = *(aoffset6 + 0); + ctemp49 = *(aoffset7 + 0); + ctemp57 = *(aoffset8 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp09; + *(boffset + 2) = ctemp17; + *(boffset + 3) = ctemp25; + *(boffset + 4) = ctemp33; + *(boffset + 5) = ctemp41; + *(boffset + 6) = ctemp49; + *(boffset + 7) = ctemp57; + + aoffset1 ++; + aoffset2 ++; + aoffset3 ++; + aoffset4 ++; + aoffset5 ++; + aoffset6 ++; + aoffset7 ++; + aoffset8 ++; + + boffset += 8; + i --; + }while(i > 0); + } + j--; + }while(j > 0); + } /* end of if(j > 0) */ + + if (n & 4){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset += 4 * lda; + + i = (m >> 2); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + + ctemp09 = *(aoffset3 + 0); + ctemp10 = *(aoffset3 + 1); + ctemp11 = *(aoffset3 + 2); + ctemp12 = *(aoffset3 + 3); + + ctemp13 = *(aoffset4 + 0); + ctemp14 = *(aoffset4 + 1); + ctemp15 = *(aoffset4 + 2); + ctemp16 = *(aoffset4 + 3); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp05; + *(boffset + 2) = ctemp09; + *(boffset + 3) = ctemp13; + + *(boffset + 4) = ctemp02; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp10; + *(boffset + 7) = ctemp14; + + *(boffset + 8) = ctemp03; + *(boffset + 9) = ctemp07; + *(boffset + 10) = ctemp11; + *(boffset + 11) = ctemp15; + + *(boffset + 12) = ctemp04; + *(boffset + 13) = ctemp08; + 
*(boffset + 14) = ctemp12; + *(boffset + 15) = ctemp16; + + aoffset1 += 4; + aoffset2 += 4; + aoffset3 += 4; + aoffset4 += 4; + boffset += 16; + i --; + }while(i > 0); + } + + i = (m & 3); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset2 + 0); + ctemp03 = *(aoffset3 + 0); + ctemp04 = *(aoffset4 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + + aoffset1 ++; + aoffset2 ++; + aoffset3 ++; + aoffset4 ++; + + boffset += 4; + i --; + }while(i > 0); + } + } /* end of if(j > 0) */ + + if (n & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset += 2 * lda; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp02; + *(boffset + 3) = ctemp04; + + aoffset1 += 2; + aoffset2 += 2; + boffset += 4; + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset2 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + + aoffset1 ++; + aoffset2 ++; + boffset += 2; + } + } /* end of if(j > 0) */ + + if (n & 1){ + aoffset1 = aoffset; + + i = m; + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + + *(boffset + 0) = ctemp01; + + aoffset1 ++; + boffset ++; + i --; + }while(i > 0); + } + + } /* end of if(j > 0) */ + + return 0; +} diff --git a/kernel/generic/gemm_tcopy_1.c b/kernel/generic/gemm_tcopy_1.c new file mode 100644 index 0000000000..c0c8bd023a --- /dev/null +++ b/kernel/generic/gemm_tcopy_1.c @@ -0,0 +1,75 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
+  BLASLONG i, j;
+
+  FLOAT *a_offset, *a_offset1;
+  FLOAT *b_offset, *b_offset1;
+
+  a_offset = a;
+  b_offset = b;
+
+  i = m;
+
+  if (i > 0) {
+    do {
+      a_offset1 = a_offset;
+      a_offset += lda;
+
+      b_offset1 = b_offset;
+      b_offset ++;
+
+      j = n;
+      if (j > 0) {
+        do {
+          *(b_offset1 + 0) = *(a_offset1 + 0);
+          a_offset1 ++;
+          b_offset1 += m;
+          j --;
+        } while (j > 0);
+      }
+      i --;
+    } while (i > 0);
+  }
+
+  return 0;
+}
diff --git a/kernel/generic/gemm_tcopy_16.c b/kernel/generic/gemm_tcopy_16.c
new file mode 100644
index 0000000000..e5732250f7
--- /dev/null
+++ b/kernel/generic/gemm_tcopy_16.c
@@ -0,0 +1,387 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin.
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + + BLASLONG i, j; + + FLOAT *aoffset; + FLOAT *aoffset1, *aoffset2; + FLOAT *boffset; + + FLOAT ctemp01, ctemp02, ctemp03, ctemp04; + FLOAT ctemp05, ctemp06, ctemp07, ctemp08; + FLOAT ctemp09, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + FLOAT ctemp17, ctemp18, ctemp19, ctemp20; + FLOAT ctemp21, ctemp22, ctemp23, ctemp24; + FLOAT ctemp25, ctemp26, ctemp27, ctemp28; + FLOAT ctemp29, ctemp30, ctemp31, ctemp32; + + aoffset = a; + boffset = b; + +#if 0 + fprintf(stderr, "m = %d n = %d\n", m, n); +#endif + + j = (n >> 4); + if (j > 0){ + do{ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 16; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + ctemp09 = *(aoffset1 + 8); + ctemp10 = *(aoffset1 + 9); + ctemp11 = *(aoffset1 + 10); + ctemp12 = *(aoffset1 + 11); + ctemp13 = *(aoffset1 + 12); + ctemp14 = *(aoffset1 + 13); + ctemp15 = *(aoffset1 + 14); + ctemp16 = *(aoffset1 + 15); + + ctemp17 = *(aoffset2 + 0); + ctemp18 = *(aoffset2 + 1); + ctemp19 = *(aoffset2 + 2); + ctemp20 = *(aoffset2 + 3); + ctemp21 = *(aoffset2 + 4); + ctemp22 = *(aoffset2 + 5); + ctemp23 = *(aoffset2 + 6); + ctemp24 = *(aoffset2 + 7); + ctemp25 = *(aoffset2 + 8); + ctemp26 = *(aoffset2 + 9); + ctemp27 = *(aoffset2 + 10); + ctemp28 = *(aoffset2 + 11); + ctemp29 = *(aoffset2 + 12); + ctemp30 = *(aoffset2 + 13); + ctemp31 = *(aoffset2 + 14); + ctemp32 = *(aoffset2 + 15); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + *(boffset + 4) = ctemp05; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp07; + *(boffset + 7) = ctemp08; + + *(boffset + 8) = ctemp09; + *(boffset + 9) = ctemp10; + *(boffset + 10) = ctemp11; + *(boffset + 11) = ctemp12; + *(boffset + 12) = ctemp13; + *(boffset + 13) = ctemp14; + *(boffset + 14) = ctemp15; + *(boffset + 15) = ctemp16; + + *(boffset + 16) = ctemp17; + *(boffset + 17) = ctemp18; + *(boffset + 18) = ctemp19; + *(boffset + 19) = ctemp20; + *(boffset + 20) = ctemp21; + *(boffset + 21) = ctemp22; + *(boffset + 22) = ctemp23; + *(boffset + 23) = ctemp24; + + *(boffset + 24) = ctemp25; + *(boffset + 25) = ctemp26; + *(boffset + 26) = ctemp27; + *(boffset + 27) = ctemp28; + *(boffset + 28) = ctemp29; + *(boffset + 29) = ctemp30; + *(boffset + 30) = ctemp31; + *(boffset + 31) = ctemp32; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 32; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + ctemp09 = *(aoffset1 + 8); + ctemp10 = *(aoffset1 + 9); + ctemp11 = *(aoffset1 + 10); + ctemp12 = *(aoffset1 + 11); + ctemp13 = *(aoffset1 + 12); + ctemp14 = *(aoffset1 + 13); + ctemp15 = *(aoffset1 + 14); + ctemp16 = *(aoffset1 + 15); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + *(boffset + 4) = ctemp05; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp07; + *(boffset + 7) = ctemp08; + + *(boffset + 8) = 
ctemp09; + *(boffset + 9) = ctemp10; + *(boffset + 10) = ctemp11; + *(boffset + 11) = ctemp12; + *(boffset + 12) = ctemp13; + *(boffset + 13) = ctemp14; + *(boffset + 14) = ctemp15; + *(boffset + 15) = ctemp16; + + boffset += 16; + } + + j--; + }while(j > 0); + } /* end of if(j > 0) */ + + if (n & 8){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 8; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + + ctemp09 = *(aoffset2 + 0); + ctemp10 = *(aoffset2 + 1); + ctemp11 = *(aoffset2 + 2); + ctemp12 = *(aoffset2 + 3); + ctemp13 = *(aoffset2 + 4); + ctemp14 = *(aoffset2 + 5); + ctemp15 = *(aoffset2 + 6); + ctemp16 = *(aoffset2 + 7); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + *(boffset + 4) = ctemp05; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp07; + *(boffset + 7) = ctemp08; + + *(boffset + 8) = ctemp09; + *(boffset + 9) = ctemp10; + *(boffset + 10) = ctemp11; + *(boffset + 11) = ctemp12; + *(boffset + 12) = ctemp13; + *(boffset + 13) = ctemp14; + *(boffset + 14) = ctemp15; + *(boffset + 15) = ctemp16; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 16; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + *(boffset + 4) = ctemp05; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp07; + *(boffset + 7) = ctemp08; + + boffset += 8; + } + } + + if (n & 4){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 4; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + *(boffset + 4) = ctemp05; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp07; + *(boffset + 7) = ctemp08; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 8; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + + boffset += 4; + } + } + + if (n & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 2; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 4; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + boffset += 2; + } + } + + if (n & 1){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + + i = (m >> 
1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset2 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 2; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + *(boffset + 0) = ctemp01; + boffset += 1; + } + } + + return 0; +} diff --git a/kernel/generic/gemm_tcopy_2.c b/kernel/generic/gemm_tcopy_2.c new file mode 100644 index 0000000000..0aa9c2e53d --- /dev/null +++ b/kernel/generic/gemm_tcopy_2.c @@ -0,0 +1,104 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG i, j; + + FLOAT *a_offset, *a_offset1, *a_offset2; + FLOAT *b_offset, *b_offset1, *b_offset2; + + a_offset = a; + b_offset = b; + b_offset2 = b + m * (n & ~1); + + i = (m >> 1); + + if (i > 0) { + do { + a_offset1 = a_offset; + a_offset2 = a_offset + lda; + a_offset += 2 * lda; + + b_offset1 = b_offset; + b_offset += 4; + + j = (n >> 1); + if (j > 0){ + do { + *(b_offset1 + 0) = *(a_offset1 + 0); + *(b_offset1 + 1) = *(a_offset1 + 1); + *(b_offset1 + 2) = *(a_offset2 + 0); + *(b_offset1 + 3) = *(a_offset2 + 1); + a_offset1 += 2; + a_offset2 += 2; + b_offset1 += m * 2; + j--; + } while (j > 0); + } + + if (n & 1){ + *(b_offset2 + 0) = *(a_offset1 + 0); + *(b_offset2 + 1) = *(a_offset2 + 0); + b_offset2 += 2; + } + i --; + } while (i > 0); + } + + if (m & 1) { + j = (n >> 1); + if (j > 0){ + do { + *(b_offset + 0) = *(a_offset + 0); + *(b_offset + 1) = *(a_offset + 1); + a_offset += 2; + b_offset += m * 2; + j--; + } while (j > 0); + } + + if (n & 1){ + *(b_offset2 + 0) = *(a_offset + 0); + } + } + + return 0; +} diff --git a/kernel/generic/gemm_tcopy_4.c b/kernel/generic/gemm_tcopy_4.c new file mode 100644 index 0000000000..bd32090e7a --- /dev/null +++ b/kernel/generic/gemm_tcopy_4.c @@ -0,0 +1,281 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + + BLASLONG i, j; + + FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; + FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3; + FLOAT ctemp1, ctemp2, ctemp3, ctemp4; + FLOAT ctemp5, ctemp6, ctemp7, ctemp8; + FLOAT ctemp9, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + + a_offset = a; + b_offset = b; + + b_offset2 = b + m * (n & ~3); + b_offset3 = b + m * (n & ~1); + + j = (m >> 2); + if (j > 0){ + do{ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset += 4 * lda; + + b_offset1 = b_offset; + b_offset += 16; + + i = (n >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + ctemp5 = *(a_offset2 + 0); + ctemp6 = *(a_offset2 + 1); + ctemp7 = *(a_offset2 + 2); + ctemp8 = *(a_offset2 + 3); + + ctemp9 = *(a_offset3 + 0); + ctemp10 = *(a_offset3 + 1); + ctemp11 = *(a_offset3 + 2); + ctemp12 = *(a_offset3 + 3); + + ctemp13 = *(a_offset4 + 0); + ctemp14 = *(a_offset4 + 1); + ctemp15 = *(a_offset4 + 2); + ctemp16 = *(a_offset4 + 3); + + a_offset1 += 4; + a_offset2 += 4; + a_offset3 += 4; + a_offset4 += 4; + + *(b_offset1 + 0) = ctemp1; + *(b_offset1 + 1) = ctemp2; + *(b_offset1 + 2) = ctemp3; + *(b_offset1 + 3) = ctemp4; + + *(b_offset1 + 4) = ctemp5; + *(b_offset1 + 5) = ctemp6; + *(b_offset1 + 6) = ctemp7; + *(b_offset1 + 7) = ctemp8; + + *(b_offset1 + 8) = ctemp9; + *(b_offset1 + 9) = ctemp10; + *(b_offset1 + 10) = ctemp11; + *(b_offset1 + 11) = ctemp12; + + *(b_offset1 + 12) = ctemp13; + *(b_offset1 + 13) = ctemp14; + *(b_offset1 + 14) = ctemp15; + *(b_offset1 + 15) = ctemp16; + + b_offset1 += m * 4; + i --; + }while(i > 0); + } + + if (n & 2) { + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + + ctemp3 = *(a_offset2 + 0); + ctemp4 = *(a_offset2 + 1); + + ctemp5 = *(a_offset3 + 0); + ctemp6 = *(a_offset3 + 1); + + ctemp7 = *(a_offset4 + 0); + ctemp8 = *(a_offset4 + 1); + + a_offset1 += 2; + a_offset2 += 2; + a_offset3 += 2; + a_offset4 += 2; + + *(b_offset2 + 0) = ctemp1; + *(b_offset2 + 1) = ctemp2; + *(b_offset2 + 2) = ctemp3; + *(b_offset2 + 3) = ctemp4; + + *(b_offset2 + 4) = ctemp5; + *(b_offset2 + 5) = ctemp6; + *(b_offset2 + 6) = ctemp7; + *(b_offset2 + 7) = ctemp8; + + b_offset2 += 8; + } + + if (n & 1) { + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset2 + 0); + ctemp3 = *(a_offset3 + 0); + ctemp4 = *(a_offset4 + 0); + + *(b_offset3 + 0) = ctemp1; + *(b_offset3 + 1) = ctemp2; + *(b_offset3 + 2) = ctemp3; + *(b_offset3 + 3) = ctemp4; + + b_offset3 += 4; + } + + j--; + }while(j > 0); + } + + if (m & 2){ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + b_offset1 = b_offset; + b_offset += 8; + + i = (n >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + ctemp5 = *(a_offset2 + 0); + ctemp6 = *(a_offset2 + 1); + ctemp7 = *(a_offset2 + 2); + ctemp8 = *(a_offset2 + 3); + + a_offset1 += 4; + a_offset2 += 4; + + *(b_offset1 + 0) = ctemp1; + *(b_offset1 + 1) = ctemp2; + *(b_offset1 + 2) = ctemp3; + *(b_offset1 + 3) = ctemp4; + + *(b_offset1 + 4) = ctemp5; + *(b_offset1 + 5) = ctemp6; + *(b_offset1 + 6) = ctemp7; + *(b_offset1 + 7) = ctemp8; + + b_offset1 += m * 4; + i --; + 
}while(i > 0); + } + + if (n & 2) { + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + + ctemp3 = *(a_offset2 + 0); + ctemp4 = *(a_offset2 + 1); + + a_offset1 += 2; + a_offset2 += 2; + + *(b_offset2 + 0) = ctemp1; + *(b_offset2 + 1) = ctemp2; + *(b_offset2 + 2) = ctemp3; + *(b_offset2 + 3) = ctemp4; + + b_offset2 += 4; + } + + if (n & 1) { + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset2 + 0); + + *(b_offset3 + 0) = ctemp1; + *(b_offset3 + 1) = ctemp2; + b_offset3 += 2; + } + } + + if (m & 1){ + a_offset1 = a_offset; + b_offset1 = b_offset; + + i = (n >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + a_offset1 += 4; + + *(b_offset1 + 0) = ctemp1; + *(b_offset1 + 1) = ctemp2; + *(b_offset1 + 2) = ctemp3; + *(b_offset1 + 3) = ctemp4; + + b_offset1 += 4 * m; + + i --; + }while(i > 0); + } + + if (n & 2) { + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + a_offset1 += 2; + + *(b_offset2 + 0) = ctemp1; + *(b_offset2 + 1) = ctemp2; + } + + if (n & 1) { + ctemp1 = *(a_offset1 + 0); + *(b_offset3 + 0) = ctemp1; + } + } + + return 0; +} diff --git a/kernel/generic/gemm_tcopy_8.c b/kernel/generic/gemm_tcopy_8.c new file mode 100644 index 0000000000..8f6e33c8a2 --- /dev/null +++ b/kernel/generic/gemm_tcopy_8.c @@ -0,0 +1,787 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
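
The gemm_tcopy_2 and gemm_tcopy_4 kernels above, and the gemm_tcopy_8 kernel that follows, differ only in unroll width; they all pack the source block into the same family of column panels. As a reading aid, a minimal un-unrolled sketch of that layout is given below (FLOAT_T, BLASLONG_T and tcopy_ref are illustrative stand-ins, not part of the imported sources), assuming the kernels' addressing a[i*lda + j] for element (i, j) of the m-by-n block:

typedef double FLOAT_T;      /* illustrative stand-in for the kernels' FLOAT */
typedef long   BLASLONG_T;   /* illustrative stand-in for BLASLONG           */

/* Un-unrolled sketch: pack the m x n block into column panels of width
 * `unroll`, then remainder panels of the remaining powers of two.  A panel
 * starting at column j0 with width w occupies b[j0*m .. j0*m + w*m), with
 * element (i, j0 + c) stored at b[j0*m + w*i + c], matching the addresses
 * produced by gemm_tcopy_2/_4 above and gemm_tcopy_8 below. */
static void tcopy_ref(BLASLONG_T m, BLASLONG_T n, BLASLONG_T unroll,
                      const FLOAT_T *a, BLASLONG_T lda, FLOAT_T *b) {
  BLASLONG_T j0 = 0;
  for (BLASLONG_T w = unroll; w >= 1; w >>= 1) {   /* unroll: a power of two */
    do {
      if (n - j0 < w) break;                       /* no panel of this width left */
      FLOAT_T *panel = b + j0 * m;
      for (BLASLONG_T i = 0; i < m; i++)
        for (BLASLONG_T c = 0; c < w; c++)
          panel[w * i + c] = a[i * lda + j0 + c];
      j0 += w;
    } while (w == unroll);                         /* only the widest panel repeats */
  }
}

Storing each width-w panel row by row at b + j0*m is what lets the level-3 inner kernels stream the packed operand contiguously.
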
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + + BLASLONG i, j; + + FLOAT *aoffset; + FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; + + FLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4; + + FLOAT ctemp01, ctemp02, ctemp03, ctemp04; + FLOAT ctemp05, ctemp06, ctemp07, ctemp08; + FLOAT ctemp09, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + FLOAT ctemp17, ctemp18, ctemp19, ctemp20; + FLOAT ctemp21, ctemp22, ctemp23, ctemp24; + FLOAT ctemp25, ctemp26, ctemp27, ctemp28; + FLOAT ctemp29, ctemp30, ctemp31, ctemp32; + FLOAT ctemp33, ctemp34, ctemp35, ctemp36; + FLOAT ctemp37, ctemp38, ctemp39, ctemp40; + FLOAT ctemp41, ctemp42, ctemp43, ctemp44; + FLOAT ctemp45, ctemp46, ctemp47, ctemp48; + FLOAT ctemp49, ctemp50, ctemp51, ctemp52; + FLOAT ctemp53, ctemp54, ctemp55, ctemp56; + FLOAT ctemp57, ctemp58, ctemp59, ctemp60; + FLOAT ctemp61, ctemp62, ctemp63, ctemp64; + + aoffset = a; + boffset = b; + +#if 0 + fprintf(stderr, "M = %d N = %d\n", m, n); +#endif + + boffset2 = b + m * (n & ~7); + boffset3 = b + m * (n & ~3); + boffset4 = b + m * (n & ~1); + + j = (m >> 3); + if (j > 0){ + do{ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset5 = aoffset4 + lda; + aoffset6 = aoffset5 + lda; + aoffset7 = aoffset6 + lda; + aoffset8 = aoffset7 + lda; + aoffset += 8 * lda; + + boffset1 = boffset; + boffset += 64; + + i = (n >> 3); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + aoffset1 += 8; + + ctemp09 = *(aoffset2 + 0); + ctemp10 = *(aoffset2 + 1); + ctemp11 = *(aoffset2 + 2); + ctemp12 = *(aoffset2 + 3); + ctemp13 = *(aoffset2 + 4); + ctemp14 = *(aoffset2 + 5); + ctemp15 = *(aoffset2 + 6); + ctemp16 = *(aoffset2 + 7); + aoffset2 += 8; + + ctemp17 = *(aoffset3 + 0); + ctemp18 = *(aoffset3 + 1); + ctemp19 = *(aoffset3 + 2); + ctemp20 = *(aoffset3 + 3); + ctemp21 = *(aoffset3 + 4); + ctemp22 = *(aoffset3 + 5); + ctemp23 = *(aoffset3 + 6); + ctemp24 = *(aoffset3 + 7); + aoffset3 += 8; + + ctemp25 = *(aoffset4 + 0); + ctemp26 = *(aoffset4 + 1); + ctemp27 = *(aoffset4 + 2); + ctemp28 = *(aoffset4 + 3); + ctemp29 = *(aoffset4 + 4); + ctemp30 = *(aoffset4 + 5); + ctemp31 = *(aoffset4 + 6); + ctemp32 = *(aoffset4 + 7); + aoffset4 += 8; + + ctemp33 = *(aoffset5 + 0); + ctemp34 = *(aoffset5 + 1); + ctemp35 = *(aoffset5 + 2); + ctemp36 = *(aoffset5 + 3); + ctemp37 = *(aoffset5 + 4); + ctemp38 = *(aoffset5 + 5); + ctemp39 = *(aoffset5 + 6); + ctemp40 = *(aoffset5 + 7); + aoffset5 += 8; + + ctemp41 = *(aoffset6 + 0); + ctemp42 = *(aoffset6 + 1); + ctemp43 = *(aoffset6 + 2); + ctemp44 = *(aoffset6 + 3); + ctemp45 = *(aoffset6 + 4); + ctemp46 = *(aoffset6 + 5); + ctemp47 = *(aoffset6 + 6); + ctemp48 = *(aoffset6 + 7); + aoffset6 += 8; + + ctemp49 = *(aoffset7 + 0); + ctemp50 = *(aoffset7 + 1); + ctemp51 = *(aoffset7 + 2); + ctemp52 = *(aoffset7 + 3); + ctemp53 = *(aoffset7 + 4); + ctemp54 = *(aoffset7 + 5); + ctemp55 = *(aoffset7 + 6); + ctemp56 = *(aoffset7 + 7); + aoffset7 += 8; + + ctemp57 = *(aoffset8 + 0); + ctemp58 = *(aoffset8 + 1); + ctemp59 = *(aoffset8 + 2); + ctemp60 = *(aoffset8 + 3); + ctemp61 = *(aoffset8 + 4); + 
ctemp62 = *(aoffset8 + 5); + ctemp63 = *(aoffset8 + 6); + ctemp64 = *(aoffset8 + 7); + aoffset8 += 8; + + *(boffset1 + 0) = ctemp01; + *(boffset1 + 1) = ctemp02; + *(boffset1 + 2) = ctemp03; + *(boffset1 + 3) = ctemp04; + *(boffset1 + 4) = ctemp05; + *(boffset1 + 5) = ctemp06; + *(boffset1 + 6) = ctemp07; + *(boffset1 + 7) = ctemp08; + + *(boffset1 + 8) = ctemp09; + *(boffset1 + 9) = ctemp10; + *(boffset1 + 10) = ctemp11; + *(boffset1 + 11) = ctemp12; + *(boffset1 + 12) = ctemp13; + *(boffset1 + 13) = ctemp14; + *(boffset1 + 14) = ctemp15; + *(boffset1 + 15) = ctemp16; + + *(boffset1 + 16) = ctemp17; + *(boffset1 + 17) = ctemp18; + *(boffset1 + 18) = ctemp19; + *(boffset1 + 19) = ctemp20; + *(boffset1 + 20) = ctemp21; + *(boffset1 + 21) = ctemp22; + *(boffset1 + 22) = ctemp23; + *(boffset1 + 23) = ctemp24; + + *(boffset1 + 24) = ctemp25; + *(boffset1 + 25) = ctemp26; + *(boffset1 + 26) = ctemp27; + *(boffset1 + 27) = ctemp28; + *(boffset1 + 28) = ctemp29; + *(boffset1 + 29) = ctemp30; + *(boffset1 + 30) = ctemp31; + *(boffset1 + 31) = ctemp32; + + *(boffset1 + 32) = ctemp33; + *(boffset1 + 33) = ctemp34; + *(boffset1 + 34) = ctemp35; + *(boffset1 + 35) = ctemp36; + *(boffset1 + 36) = ctemp37; + *(boffset1 + 37) = ctemp38; + *(boffset1 + 38) = ctemp39; + *(boffset1 + 39) = ctemp40; + + *(boffset1 + 40) = ctemp41; + *(boffset1 + 41) = ctemp42; + *(boffset1 + 42) = ctemp43; + *(boffset1 + 43) = ctemp44; + *(boffset1 + 44) = ctemp45; + *(boffset1 + 45) = ctemp46; + *(boffset1 + 46) = ctemp47; + *(boffset1 + 47) = ctemp48; + + *(boffset1 + 48) = ctemp49; + *(boffset1 + 49) = ctemp50; + *(boffset1 + 50) = ctemp51; + *(boffset1 + 51) = ctemp52; + *(boffset1 + 52) = ctemp53; + *(boffset1 + 53) = ctemp54; + *(boffset1 + 54) = ctemp55; + *(boffset1 + 55) = ctemp56; + + *(boffset1 + 56) = ctemp57; + *(boffset1 + 57) = ctemp58; + *(boffset1 + 58) = ctemp59; + *(boffset1 + 59) = ctemp60; + *(boffset1 + 60) = ctemp61; + *(boffset1 + 61) = ctemp62; + *(boffset1 + 62) = ctemp63; + *(boffset1 + 63) = ctemp64; + + boffset1 += m * 8; + i --; + }while(i > 0); + } + + if (n & 4){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + aoffset1 += 4; + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + aoffset2 += 4; + + ctemp09 = *(aoffset3 + 0); + ctemp10 = *(aoffset3 + 1); + ctemp11 = *(aoffset3 + 2); + ctemp12 = *(aoffset3 + 3); + aoffset3 += 4; + + ctemp13 = *(aoffset4 + 0); + ctemp14 = *(aoffset4 + 1); + ctemp15 = *(aoffset4 + 2); + ctemp16 = *(aoffset4 + 3); + aoffset4 += 4; + + ctemp17 = *(aoffset5 + 0); + ctemp18 = *(aoffset5 + 1); + ctemp19 = *(aoffset5 + 2); + ctemp20 = *(aoffset5 + 3); + aoffset5 += 4; + + ctemp21 = *(aoffset6 + 0); + ctemp22 = *(aoffset6 + 1); + ctemp23 = *(aoffset6 + 2); + ctemp24 = *(aoffset6 + 3); + aoffset6 += 4; + + ctemp25 = *(aoffset7 + 0); + ctemp26 = *(aoffset7 + 1); + ctemp27 = *(aoffset7 + 2); + ctemp28 = *(aoffset7 + 3); + aoffset7 += 4; + + ctemp29 = *(aoffset8 + 0); + ctemp30 = *(aoffset8 + 1); + ctemp31 = *(aoffset8 + 2); + ctemp32 = *(aoffset8 + 3); + aoffset8 += 4; + + *(boffset2 + 0) = ctemp01; + *(boffset2 + 1) = ctemp02; + *(boffset2 + 2) = ctemp03; + *(boffset2 + 3) = ctemp04; + *(boffset2 + 4) = ctemp05; + *(boffset2 + 5) = ctemp06; + *(boffset2 + 6) = ctemp07; + *(boffset2 + 7) = ctemp08; + *(boffset2 + 8) = ctemp09; + *(boffset2 + 9) = ctemp10; + *(boffset2 + 10) = ctemp11; + *(boffset2 + 11) = ctemp12; + *(boffset2 + 12) = 
ctemp13; + *(boffset2 + 13) = ctemp14; + *(boffset2 + 14) = ctemp15; + *(boffset2 + 15) = ctemp16; + + *(boffset2 + 16) = ctemp17; + *(boffset2 + 17) = ctemp18; + *(boffset2 + 18) = ctemp19; + *(boffset2 + 19) = ctemp20; + *(boffset2 + 20) = ctemp21; + *(boffset2 + 21) = ctemp22; + *(boffset2 + 22) = ctemp23; + *(boffset2 + 23) = ctemp24; + *(boffset2 + 24) = ctemp25; + *(boffset2 + 25) = ctemp26; + *(boffset2 + 26) = ctemp27; + *(boffset2 + 27) = ctemp28; + *(boffset2 + 28) = ctemp29; + *(boffset2 + 29) = ctemp30; + *(boffset2 + 30) = ctemp31; + *(boffset2 + 31) = ctemp32; + + boffset2 += 32; + } + + if (n & 2){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + aoffset1 += 2; + + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + aoffset2 += 2; + + ctemp05 = *(aoffset3 + 0); + ctemp06 = *(aoffset3 + 1); + aoffset3 += 2; + + ctemp07 = *(aoffset4 + 0); + ctemp08 = *(aoffset4 + 1); + aoffset4 += 2; + + ctemp09 = *(aoffset5 + 0); + ctemp10 = *(aoffset5 + 1); + aoffset5 += 2; + + ctemp11 = *(aoffset6 + 0); + ctemp12 = *(aoffset6 + 1); + aoffset6 += 2; + + ctemp13 = *(aoffset7 + 0); + ctemp14 = *(aoffset7 + 1); + aoffset7 += 2; + + ctemp15 = *(aoffset8 + 0); + ctemp16 = *(aoffset8 + 1); + aoffset8 += 2; + + *(boffset3 + 0) = ctemp01; + *(boffset3 + 1) = ctemp02; + *(boffset3 + 2) = ctemp03; + *(boffset3 + 3) = ctemp04; + *(boffset3 + 4) = ctemp05; + *(boffset3 + 5) = ctemp06; + *(boffset3 + 6) = ctemp07; + *(boffset3 + 7) = ctemp08; + *(boffset3 + 8) = ctemp09; + *(boffset3 + 9) = ctemp10; + *(boffset3 + 10) = ctemp11; + *(boffset3 + 11) = ctemp12; + *(boffset3 + 12) = ctemp13; + *(boffset3 + 13) = ctemp14; + *(boffset3 + 14) = ctemp15; + *(boffset3 + 15) = ctemp16; + boffset3 += 16; + } + + if (n & 1){ + ctemp01 = *(aoffset1 + 0); + aoffset1 ++; + ctemp02 = *(aoffset2 + 0); + aoffset2 ++; + ctemp03 = *(aoffset3 + 0); + aoffset3 ++; + ctemp04 = *(aoffset4 + 0); + aoffset4 ++; + ctemp05 = *(aoffset5 + 0); + aoffset5 ++; + ctemp06 = *(aoffset6 + 0); + aoffset6 ++; + ctemp07 = *(aoffset7 + 0); + aoffset7 ++; + ctemp08 = *(aoffset8 + 0); + aoffset8 ++; + + *(boffset4 + 0) = ctemp01; + *(boffset4 + 1) = ctemp02; + *(boffset4 + 2) = ctemp03; + *(boffset4 + 3) = ctemp04; + *(boffset4 + 4) = ctemp05; + *(boffset4 + 5) = ctemp06; + *(boffset4 + 6) = ctemp07; + *(boffset4 + 7) = ctemp08; + boffset4 += 8; + } + + j--; + }while(j > 0); + } + + if (m & 4){ + + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset += 4 * lda; + + boffset1 = boffset; + boffset += 32; + + i = (n >> 3); + if (i > 0){ + + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + aoffset1 += 8; + + ctemp09 = *(aoffset2 + 0); + ctemp10 = *(aoffset2 + 1); + ctemp11 = *(aoffset2 + 2); + ctemp12 = *(aoffset2 + 3); + ctemp13 = *(aoffset2 + 4); + ctemp14 = *(aoffset2 + 5); + ctemp15 = *(aoffset2 + 6); + ctemp16 = *(aoffset2 + 7); + aoffset2 += 8; + + ctemp17 = *(aoffset3 + 0); + ctemp18 = *(aoffset3 + 1); + ctemp19 = *(aoffset3 + 2); + ctemp20 = *(aoffset3 + 3); + ctemp21 = *(aoffset3 + 4); + ctemp22 = *(aoffset3 + 5); + ctemp23 = *(aoffset3 + 6); + ctemp24 = *(aoffset3 + 7); + aoffset3 += 8; + + ctemp25 = *(aoffset4 + 0); + ctemp26 = *(aoffset4 + 1); + ctemp27 = *(aoffset4 + 2); + ctemp28 = *(aoffset4 + 3); + ctemp29 = *(aoffset4 + 4); + ctemp30 = *(aoffset4 + 5); + ctemp31 = 
*(aoffset4 + 6); + ctemp32 = *(aoffset4 + 7); + aoffset4 += 8; + + *(boffset1 + 0) = ctemp01; + *(boffset1 + 1) = ctemp02; + *(boffset1 + 2) = ctemp03; + *(boffset1 + 3) = ctemp04; + *(boffset1 + 4) = ctemp05; + *(boffset1 + 5) = ctemp06; + *(boffset1 + 6) = ctemp07; + *(boffset1 + 7) = ctemp08; + + *(boffset1 + 8) = ctemp09; + *(boffset1 + 9) = ctemp10; + *(boffset1 + 10) = ctemp11; + *(boffset1 + 11) = ctemp12; + *(boffset1 + 12) = ctemp13; + *(boffset1 + 13) = ctemp14; + *(boffset1 + 14) = ctemp15; + *(boffset1 + 15) = ctemp16; + + *(boffset1 + 16) = ctemp17; + *(boffset1 + 17) = ctemp18; + *(boffset1 + 18) = ctemp19; + *(boffset1 + 19) = ctemp20; + *(boffset1 + 20) = ctemp21; + *(boffset1 + 21) = ctemp22; + *(boffset1 + 22) = ctemp23; + *(boffset1 + 23) = ctemp24; + + *(boffset1 + 24) = ctemp25; + *(boffset1 + 25) = ctemp26; + *(boffset1 + 26) = ctemp27; + *(boffset1 + 27) = ctemp28; + *(boffset1 + 28) = ctemp29; + *(boffset1 + 29) = ctemp30; + *(boffset1 + 30) = ctemp31; + *(boffset1 + 31) = ctemp32; + + boffset1 += 8 * m; + i --; + }while(i > 0); + } + + if (n & 4) { + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + aoffset1 += 4; + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + aoffset2 += 4; + + ctemp09 = *(aoffset3 + 0); + ctemp10 = *(aoffset3 + 1); + ctemp11 = *(aoffset3 + 2); + ctemp12 = *(aoffset3 + 3); + aoffset3 += 4; + + ctemp13 = *(aoffset4 + 0); + ctemp14 = *(aoffset4 + 1); + ctemp15 = *(aoffset4 + 2); + ctemp16 = *(aoffset4 + 3); + aoffset4 += 4; + + *(boffset2 + 0) = ctemp01; + *(boffset2 + 1) = ctemp02; + *(boffset2 + 2) = ctemp03; + *(boffset2 + 3) = ctemp04; + *(boffset2 + 4) = ctemp05; + *(boffset2 + 5) = ctemp06; + *(boffset2 + 6) = ctemp07; + *(boffset2 + 7) = ctemp08; + + *(boffset2 + 8) = ctemp09; + *(boffset2 + 9) = ctemp10; + *(boffset2 + 10) = ctemp11; + *(boffset2 + 11) = ctemp12; + *(boffset2 + 12) = ctemp13; + *(boffset2 + 13) = ctemp14; + *(boffset2 + 14) = ctemp15; + *(boffset2 + 15) = ctemp16; + boffset2 += 16; + } + + if (n & 2){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + aoffset1 += 2; + + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + aoffset2 += 2; + + ctemp05 = *(aoffset3 + 0); + ctemp06 = *(aoffset3 + 1); + aoffset3 += 2; + + ctemp07 = *(aoffset4 + 0); + ctemp08 = *(aoffset4 + 1); + aoffset4 += 2; + + *(boffset3 + 0) = ctemp01; + *(boffset3 + 1) = ctemp02; + *(boffset3 + 2) = ctemp03; + *(boffset3 + 3) = ctemp04; + *(boffset3 + 4) = ctemp05; + *(boffset3 + 5) = ctemp06; + *(boffset3 + 6) = ctemp07; + *(boffset3 + 7) = ctemp08; + boffset3 += 8; + } + + if (n & 1){ + ctemp01 = *(aoffset1 + 0); + aoffset1 ++; + ctemp02 = *(aoffset2 + 0); + aoffset2 ++; + ctemp03 = *(aoffset3 + 0); + aoffset3 ++; + ctemp04 = *(aoffset4 + 0); + aoffset4 ++; + + *(boffset4 + 0) = ctemp01; + *(boffset4 + 1) = ctemp02; + *(boffset4 + 2) = ctemp03; + *(boffset4 + 3) = ctemp04; + boffset4 += 4; + } + } + + if (m & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset += 2 * lda; + + boffset1 = boffset; + boffset += 16; + + i = (n >> 3); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + aoffset1 += 8; + + ctemp09 = *(aoffset2 + 0); + ctemp10 = *(aoffset2 + 1); + ctemp11 = *(aoffset2 + 2); + ctemp12 = 
*(aoffset2 + 3); + ctemp13 = *(aoffset2 + 4); + ctemp14 = *(aoffset2 + 5); + ctemp15 = *(aoffset2 + 6); + ctemp16 = *(aoffset2 + 7); + aoffset2 += 8; + + *(boffset1 + 0) = ctemp01; + *(boffset1 + 1) = ctemp02; + *(boffset1 + 2) = ctemp03; + *(boffset1 + 3) = ctemp04; + *(boffset1 + 4) = ctemp05; + *(boffset1 + 5) = ctemp06; + *(boffset1 + 6) = ctemp07; + *(boffset1 + 7) = ctemp08; + + *(boffset1 + 8) = ctemp09; + *(boffset1 + 9) = ctemp10; + *(boffset1 + 10) = ctemp11; + *(boffset1 + 11) = ctemp12; + *(boffset1 + 12) = ctemp13; + *(boffset1 + 13) = ctemp14; + *(boffset1 + 14) = ctemp15; + *(boffset1 + 15) = ctemp16; + + boffset1 += 8 * m; + i --; + }while(i > 0); + } + + if (n & 4){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + aoffset1 += 4; + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + aoffset2 += 4; + + *(boffset2 + 0) = ctemp01; + *(boffset2 + 1) = ctemp02; + *(boffset2 + 2) = ctemp03; + *(boffset2 + 3) = ctemp04; + *(boffset2 + 4) = ctemp05; + *(boffset2 + 5) = ctemp06; + *(boffset2 + 6) = ctemp07; + *(boffset2 + 7) = ctemp08; + boffset2 += 8; + } + + if (n & 2){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + aoffset1 += 2; + + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + aoffset2 += 2; + + *(boffset3 + 0) = ctemp01; + *(boffset3 + 1) = ctemp02; + *(boffset3 + 2) = ctemp03; + *(boffset3 + 3) = ctemp04; + boffset3 += 4; + } + + if (n & 1){ + ctemp01 = *(aoffset1 + 0); + aoffset1 ++; + ctemp02 = *(aoffset2 + 0); + aoffset2 ++; + + *(boffset4 + 0) = ctemp01; + *(boffset4 + 1) = ctemp02; + boffset4 += 2; + } + } + + if (m & 1){ + aoffset1 = aoffset; + aoffset += lda; + + boffset1 = boffset; + boffset += 8; + + i = (n >> 3); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + aoffset1 += 8; + + *(boffset1 + 0) = ctemp01; + *(boffset1 + 1) = ctemp02; + *(boffset1 + 2) = ctemp03; + *(boffset1 + 3) = ctemp04; + *(boffset1 + 4) = ctemp05; + *(boffset1 + 5) = ctemp06; + *(boffset1 + 6) = ctemp07; + *(boffset1 + 7) = ctemp08; + + boffset1 += 8 * m; + i --; + }while(i > 0); + } + + if (n & 4){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + aoffset1 += 4; + + *(boffset2 + 0) = ctemp01; + *(boffset2 + 1) = ctemp02; + *(boffset2 + 2) = ctemp03; + *(boffset2 + 3) = ctemp04; + boffset2 += 4; + } + + if (n & 2){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + aoffset1 += 2; + + *(boffset3 + 0) = ctemp01; + *(boffset3 + 1) = ctemp02; + boffset3 += 2; + } + + if (n & 1){ + ctemp01 = *(aoffset1 + 0); + aoffset1 ++; + *(boffset4 + 0) = ctemp01; + boffset4 ++; + } + } + + return 0; +} diff --git a/kernel/generic/ger.c b/kernel/generic/ger.c new file mode 100644 index 0000000000..2438786a4b --- /dev/null +++ b/kernel/generic/ger.c @@ -0,0 +1,63 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, + FLOAT *x, BLASLONG incx, + FLOAT *y, BLASLONG incy, + FLOAT *a, BLASLONG lda, FLOAT *buffer){ + + FLOAT *X = x; + + if (incx != 1) { + X = buffer; + COPY_K(m, x, incx, X, 1); + } + + while (n > 0) { + AXPYU_K(m, 0, 0, alpha * *y, X, 1, a, 1, NULL, 0); + a += lda; + y += incy; + n --; + } + + return 0; +} + diff --git a/kernel/generic/laswp_ncopy_1.c b/kernel/generic/laswp_ncopy_1.c new file mode 100644 index 0000000000..4394474edd --- /dev/null +++ b/kernel/generic/laswp_ncopy_1.c @@ -0,0 +1,154 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#define a2 (a1 + 1) + +int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *ipiv, FLOAT *buffer){ + + BLASLONG i, j, ip1, ip2; + blasint *piv; + FLOAT *a1; + FLOAT *b1, *b2; + FLOAT A1, A2, B1, B2; + + a--; + k1 --; + + ipiv += k1; + + if (n <= 0) return 0; + + + j = n; + do { + piv = ipiv; + + a1 = a + k1 + 1; + + ip1 = *(piv + 0); + ip2 = *(piv + 1); + piv += 2; + + b1 = a + ip1; + b2 = a + ip2; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *a1; + A2 = *a2; + B1 = *b1; + B2 = *b2; + + ip1 = *(piv + 0); + ip2 = *(piv + 1); + piv += 2; + + if (b1 == a1) { + if (b2 == a2) { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + } else { + *(buffer + 0) = A1; + *(buffer + 1) = B2; + + *b2 = A2; + } + } else + if (b1 == a2) { + if (b2 == a2) { + *(buffer + 0) = A2; + *(buffer + 1) = A1; + } else { + *(buffer + 0) = A2; + *(buffer + 1) = B2; + *b2 = A1; + } + } else { + if (b2 == a2) { + *(buffer + 0) = B1; + *(buffer + 1) = A2; + *b1 = A1; + } else + if (b2 == b1) { + *(buffer + 0) = B1; + *(buffer + 1) = A1; + *b1 = A2; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *b1 = A1; + *b2 = A2; + } + } + + buffer += 2; + + b1 = a + ip1; + b2 = a + ip2; + + a1 += 2; + + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *a1; + B1 = *b1; + + if (a1 == b1) { + *(buffer + 0) = A1; + } else { + *(buffer + 0) = B1; + *b1 = A1; + } + } + a += lda; + j --; + } while (j > 0); + + return 0; +} + diff --git a/kernel/generic/laswp_ncopy_2.c b/kernel/generic/laswp_ncopy_2.c new file mode 100644 index 0000000000..806a1e1091 --- /dev/null +++ b/kernel/generic/laswp_ncopy_2.c @@ -0,0 +1,293 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
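
The generic ger kernel above first packs x into the work buffer when incx != 1 (COPY_K) and then issues one AXPYU_K per column, i.e. it computes the rank-1 update A := A + alpha * x * y^T column by column. A plain-C restatement of that computation, assuming positive increments and using the illustrative name ger_ref rather than the library's macros:

/* A := A + alpha * x * y^T, one scaled-vector add (axpy) per column;
 * column j of A starts at a + j*lda.  Positive increments assumed. */
static void ger_ref(long m, long n, double alpha,
                    const double *x, long incx,
                    const double *y, long incy,
                    double *a, long lda) {
  for (long j = 0; j < n; j++) {
    double t = alpha * y[j * incy];        /* scale factor for column j */
    for (long i = 0; i < m; i++)
      a[j * lda + i] += t * x[i * incx];   /* axpy of x into column j   */
  }
}
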
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#define PREFETCHSIZE 12 + +#define a2 (a1 + 1) +#define a4 (a3 + 1) + +int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *ipiv, FLOAT *buffer){ + + BLASLONG i, j, ip1, ip2; + blasint *piv; + FLOAT *a1, *a3; + FLOAT *b1, *b2, *b3, *b4; + FLOAT A1, A2, B1, B2, A3, A4, B3, B4; + + a--; + k1 --; + + ipiv += k1; + + if (n <= 0) return 0; + + j = (n >> 1); + if (j > 0) { + do { + piv = ipiv; + + a1 = a + k1 + 1; + a3 = a1 + 1 * lda; + + ip1 = *(piv + 0); + ip2 = *(piv + 1); + piv += 2; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + +#ifdef __GNUC__ + __builtin_prefetch(a1 + PREFETCHSIZE, 0, 0); + __builtin_prefetch(a3 + PREFETCHSIZE, 0, 0); +#endif + + A1 = *a1; + A2 = *a2; + A3 = *a3; + A4 = *a4; + + B1 = *b1; + B2 = *b2; + B3 = *b3; + B4 = *b4; + + ip1 = *(piv + 0); + ip2 = *(piv + 1); + piv += 2; + + if (b1 == a1) { + if (b2 == a2) { + *(buffer + 0) = A1; + *(buffer + 1) = A3; + *(buffer + 2) = A2; + *(buffer + 3) = A4; + } else { + *(buffer + 0) = A1; + *(buffer + 1) = A3; + *(buffer + 2) = B2; + *(buffer + 3) = B4; + + *b2 = A2; + *b4 = A4; + } + } else + if (b1 == a2) { + if (b2 == a2) { + *(buffer + 0) = A2; + *(buffer + 1) = A4; + *(buffer + 2) = A1; + *(buffer + 3) = A3; + } else { + *(buffer + 0) = A2; + *(buffer + 1) = A4; + *(buffer + 2) = B2; + *(buffer + 3) = B4; + *b2 = A1; + *b4 = A3; + } + } else { + if (b2 == a2) { + *(buffer + 0) = B1; + *(buffer + 1) = B3; + *(buffer + 2) = A2; + *(buffer + 3) = A4; + *b1 = A1; + *b3 = A3; + } else + if (b2 == b1) { + *(buffer + 0) = B1; + *(buffer + 1) = B3; + *(buffer + 2) = A1; + *(buffer + 3) = A3; + *b1 = A2; + *b3 = A4; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B3; + *(buffer + 2) = B2; + *(buffer + 3) = B4; + *b1 = A1; + *b2 = A2; + *b3 = A3; + *b4 = A4; + } + } + + buffer += 4; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + + a1 += 2; + a3 += 2; + + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *a1; + B1 = *b1; + A3 = *a3; + B3 = *b3; + + if (a1 == b1) { + *(buffer + 0) = A1; + *(buffer + 1) = A3; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B3; + *b1 = A1; + *b3 = A3; + } + buffer += 2; + } + + a += 2 * lda; + j --; + } 
while (j > 0); + } + + if (n & 1) { + piv = ipiv; + + a1 = a + k1 + 1; + + ip1 = *(piv + 0); + ip2 = *(piv + 1); + piv += 2; + + b1 = a + ip1; + b2 = a + ip2; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *a1; + A2 = *a2; + B1 = *b1; + B2 = *b2; + + ip1 = *(piv + 0); + ip2 = *(piv + 1); + piv += 2; + + if (b1 == a1) { + if (b2 == a2) { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + } else { + *(buffer + 0) = A1; + *(buffer + 1) = B2; + + *b2 = A2; + } + } else + if (b1 == a2) { + if (b2 == a2) { + *(buffer + 0) = A2; + *(buffer + 1) = A1; + } else { + *(buffer + 0) = A2; + *(buffer + 1) = B2; + *b2 = A1; + } + } else { + if (b2 == a2) { + *(buffer + 0) = B1; + *(buffer + 1) = A2; + *b1 = A1; + } else + if (b2 == b1) { + *(buffer + 0) = B1; + *(buffer + 1) = A1; + *b1 = A2; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *b1 = A1; + *b2 = A2; + } + } + + buffer += 2; + + b1 = a + ip1; + b2 = a + ip2; + + a1 += 2; + + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *a1; + B1 = *b1; + + if (a1 == b1) { + *(buffer + 0) = A1; + } else { + *(buffer + 0) = B1; + *b1 = A1; + } + } + } + + return 0; +} + diff --git a/kernel/generic/laswp_ncopy_4.c b/kernel/generic/laswp_ncopy_4.c new file mode 100644 index 0000000000..0736f07425 --- /dev/null +++ b/kernel/generic/laswp_ncopy_4.c @@ -0,0 +1,503 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
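
The laswp_ncopy kernels (the 1- and 2-column versions above, the 4- and 8-column versions below) combine LASWP-style row interchanges with a packing copy: for rows k1..k2 of each column they exchange row i with row ipiv[i], send the value that lands in row i straight to the packed buffer, and write the displaced value back into A. The following sequential sketch shows the single-column ordering of laswp_ncopy_1; the wider variants interleave 2/4/8 columns per row in the buffer and add prefetching. Names are illustrative, and the library's blasint is replaced by int:

/* k1, k2 and the pivot values are 1-based as in LAPACK; ipiv[0] is the
 * pivot for row k1. */
static void laswp_ncopy_ref(long n, long k1, long k2,
                            double *a, long lda,
                            const int *ipiv, double *buffer) {
  for (long j = 0; j < n; j++) {
    double *col = a + j * lda;             /* rows of a column are contiguous  */
    for (long i = k1; i <= k2; i++) {
      long   ip = ipiv[i - k1];            /* 1-based pivot row for row i      */
      double ai = col[i - 1];
      double av = col[ip - 1];             /* value that ends up in row i      */
      if (ip != i) col[ip - 1] = ai;       /* displaced element written back   */
      *buffer++ = av;                      /* packed copy gets the swapped row */
    }
  }
}
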
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#define PREFETCHSIZE 8 + +#define a2 (a1 + 1) +#define a4 (a3 + 1) +#define a6 (a5 + 1) +#define a8 (a7 + 1) + +int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *ipiv, FLOAT *buffer){ + + BLASLONG i, j, ip1, ip2; + blasint *piv; + FLOAT *a1, *a3, *a5, *a7; + FLOAT *b1, *b2, *b3, *b4; + FLOAT *b5, *b6, *b7, *b8; + FLOAT A1, A2, B1, B2, A3, A4, B3, B4; + FLOAT A5, A6, B5, B6, A7, A8, B7, B8; + + a--; + k1 --; + + ipiv += k1; + + if (n <= 0) return 0; + + j = (n >> 2); + if (j > 0) { + do { + piv = ipiv; + + a1 = a + k1 + 1; + + a3 = a1 + 1 * lda; + a5 = a1 + 2 * lda; + a7 = a1 + 3 * lda; + + ip1 = *(piv + 0); + ip2 = *(piv + 1); + piv += 2; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + +#ifdef __GNUC__ + __builtin_prefetch(a1 + PREFETCHSIZE, 0, 0); + __builtin_prefetch(a3 + PREFETCHSIZE, 0, 0); + __builtin_prefetch(a5 + PREFETCHSIZE, 0, 0); + __builtin_prefetch(a7 + PREFETCHSIZE, 0, 0); +#endif + + A1 = *a1; + A2 = *a2; + A3 = *a3; + A4 = *a4; + A5 = *a5; + A6 = *a6; + A7 = *a7; + A8 = *a8; + + B1 = *b1; + B2 = *b2; + B3 = *b3; + B4 = *b4; + B5 = *b5; + B6 = *b6; + B7 = *b7; + B8 = *b8; + + ip1 = *(piv + 0); + ip2 = *(piv + 1); + piv += 2; + + if (b1 == a1) { + if (b2 == a2) { + *(buffer + 0) = A1; + *(buffer + 1) = A3; + *(buffer + 2) = A5; + *(buffer + 3) = A7; + + *(buffer + 4) = A2; + *(buffer + 5) = A4; + *(buffer + 6) = A6; + *(buffer + 7) = A8; + } else { + *(buffer + 0) = A1; + *(buffer + 1) = A3; + *(buffer + 2) = A5; + *(buffer + 3) = A7; + + *(buffer + 4) = B2; + *(buffer + 5) = B4; + *(buffer + 6) = B6; + *(buffer + 7) = B8; + + *b2 = A2; + *b4 = A4; + *b6 = A6; + *b8 = A8; + } + } else + if (b1 == a2) { + if (b2 == a2) { + *(buffer + 0) = A2; + *(buffer + 1) = A4; + *(buffer + 2) = A6; + *(buffer + 3) = A8; + *(buffer + 4) = A1; + *(buffer + 5) = A3; + *(buffer + 6) = A5; + *(buffer + 7) = A7; + } else { + *(buffer + 0) = A2; + *(buffer + 1) = A4; + *(buffer + 2) = A6; + *(buffer + 3) = A8; + *(buffer + 4) = B2; + *(buffer + 5) = B4; + *(buffer + 6) = B6; + *(buffer + 7) = B8; + *b2 = A1; + *b4 = A3; + *b6 = A5; + *b8 = A7; + } + } else { + if (b2 == a2) { + *(buffer + 0) = B1; + *(buffer + 1) = B3; + *(buffer + 2) = B5; + *(buffer + 3) = B7; + *(buffer + 4) = A2; + *(buffer + 5) = A4; + *(buffer + 6) = A6; + *(buffer + 7) = A8; + *b1 = A1; + *b3 = A3; + *b5 = A5; + *b7 = A7; + } else + if (b2 == b1) { + *(buffer + 0) = B1; + *(buffer + 1) = B3; + *(buffer + 2) = B5; + *(buffer + 3) = B7; + *(buffer + 4) = A1; + *(buffer + 5) = A3; + *(buffer + 6) = A5; + *(buffer + 7) = A7; + *b1 = A2; + *b3 = A4; + *b5 = A6; + *b7 = A8; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B3; + *(buffer + 2) = B5; + *(buffer + 3) = B7; + *(buffer + 4) = B2; + *(buffer + 5) = B4; + *(buffer + 6) = B6; + *(buffer + 7) = B8; + *b1 = A1; + *b2 = A2; + *b3 = A3; + *b4 = A4; + *b5 = A5; + *b6 = A6; + *b7 = A7; + *b8 = A8; + } + } + + buffer += 8; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; + + a1 += 2; + a3 += 2; + a5 += 2; + a7 += 2; + + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *a1; + B1 = *b1; + A3 = *a3; + B3 = *b3; + A5 = *a5; + B5 = *b5; + 
A7 = *a7; + B7 = *b7; + + if (a1 == b1) { + *(buffer + 0) = A1; + *(buffer + 1) = A3; + *(buffer + 2) = A5; + *(buffer + 3) = A7; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B3; + *(buffer + 2) = B5; + *(buffer + 3) = B7; + *b1 = A1; + *b3 = A3; + *b5 = A5; + *b7 = A7; + } + buffer += 4; + } + + a += 4 * lda; + + j --; + } while (j > 0); + } + + if (n & 2) { + piv = ipiv; + + a1 = a + k1 + 1; + a3 = a1 + 1 * lda; + + ip1 = *(piv + 0); + ip2 = *(piv + 1); + piv += 2; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *a1; + A2 = *a2; + A3 = *a3; + A4 = *a4; + + B1 = *b1; + B2 = *b2; + B3 = *b3; + B4 = *b4; + + ip1 = *(piv + 0); + ip2 = *(piv + 1); + piv += 2; + + if (b1 == a1) { + if (b2 == a2) { + *(buffer + 0) = A1; + *(buffer + 1) = A3; + *(buffer + 2) = A2; + *(buffer + 3) = A4; + } else { + *(buffer + 0) = A1; + *(buffer + 1) = A3; + *(buffer + 2) = B2; + *(buffer + 3) = B4; + + *b2 = A2; + *b4 = A4; + } + } else + if (b1 == a2) { + if (b2 == a2) { + *(buffer + 0) = A2; + *(buffer + 1) = A4; + *(buffer + 2) = A1; + *(buffer + 3) = A3; + } else { + *(buffer + 0) = A2; + *(buffer + 1) = A4; + *(buffer + 2) = B2; + *(buffer + 3) = B4; + *b2 = A1; + *b4 = A3; + } + } else { + if (b2 == a2) { + *(buffer + 0) = B1; + *(buffer + 1) = B3; + *(buffer + 2) = A2; + *(buffer + 3) = A4; + *b1 = A1; + *b3 = A3; + } else + if (b2 == b1) { + *(buffer + 0) = B1; + *(buffer + 1) = B3; + *(buffer + 2) = A1; + *(buffer + 3) = A3; + *b1 = A2; + *b3 = A4; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B3; + *(buffer + 2) = B2; + *(buffer + 3) = B4; + *b1 = A1; + *b2 = A2; + *b3 = A3; + *b4 = A4; + } + } + + buffer += 4; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + + a1 += 2; + a3 += 2; + + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *a1; + B1 = *b1; + A3 = *a3; + B3 = *b3; + + if (a1 == b1) { + *(buffer + 0) = A1; + *(buffer + 1) = A3; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B3; + *b1 = A1; + *b3 = A3; + } + buffer += 2; + } + + a += 2 * lda; + } + + if (n & 1) { + piv = ipiv; + + a1 = a + k1 + 1; + + ip1 = *(piv + 0); + ip2 = *(piv + 1); + piv += 2; + + b1 = a + ip1; + b2 = a + ip2; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *a1; + A2 = *a2; + B1 = *b1; + B2 = *b2; + + ip1 = *(piv + 0); + ip2 = *(piv + 1); + piv += 2; + + if (b1 == a1) { + if (b2 == a2) { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + } else { + *(buffer + 0) = A1; + *(buffer + 1) = B2; + + *b2 = A2; + } + } else + if (b1 == a2) { + if (b2 == a2) { + *(buffer + 0) = A2; + *(buffer + 1) = A1; + } else { + *(buffer + 0) = A2; + *(buffer + 1) = B2; + *b2 = A1; + } + } else { + if (b2 == a2) { + *(buffer + 0) = B1; + *(buffer + 1) = A2; + *b1 = A1; + } else + if (b2 == b1) { + *(buffer + 0) = B1; + *(buffer + 1) = A1; + *b1 = A2; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *b1 = A1; + *b2 = A2; + } + } + + buffer += 2; + + b1 = a + ip1; + b2 = a + ip2; + + a1 += 2; + + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *a1; + B1 = *b1; + + if (a1 == b1) { + *(buffer + 0) = A1; + } else { + *(buffer + 0) = B1; + *b1 = A1; + } + } + } + + return 0; +} + diff --git a/kernel/generic/laswp_ncopy_8.c b/kernel/generic/laswp_ncopy_8.c new file mode 100644 index 0000000000..e08c8ceeb3 --- /dev/null +++ b/kernel/generic/laswp_ncopy_8.c @@ -0,0 +1,296 @@ +/*********************************************************************/ +/* 
Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#define PREFETCHSIZE 4 + +int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *ipiv, FLOAT *buffer){ + + BLASLONG i, j, ip; + blasint *piv; + FLOAT *dx1, *dy1; + FLOAT *dx2, *dy2; + FLOAT *dx3, *dy3; + FLOAT *dx4, *dy4; + FLOAT *dx5, *dy5; + FLOAT *dx6, *dy6; + FLOAT *dx7, *dy7; + FLOAT *dx8, *dy8; + FLOAT atemp1, btemp1; + FLOAT atemp2, btemp2; + FLOAT atemp3, btemp3; + FLOAT atemp4, btemp4; + FLOAT atemp5, btemp5; + FLOAT atemp6, btemp6; + FLOAT atemp7, btemp7; + FLOAT atemp8, btemp8; + + a--; + ipiv += k1 - 1; + + if (n <= 0) return 0; + if (k1 > k2) return 0; + + j = (n >> 3); + if (j > 0) { + do { + piv = ipiv; + i = k1; + + do { + ip = *piv; + piv ++; + + dx1 = a + i; + dy1 = a + ip; + dx2 = a + i + lda * 1; + dy2 = a + ip + lda * 1; + dx3 = a + i + lda * 2; + dy3 = a + ip + lda * 2; + dx4 = a + i + lda * 3; + dy4 = a + ip + lda * 3; + dx5 = a + i + lda * 4; + dy5 = a + ip + lda * 4; + dx6 = a + i + lda * 5; + dy6 = a + ip + lda * 5; + dx7 = a + i + lda * 6; + dy7 = a + ip + lda * 6; + dx8 = a + i + lda * 7; + dy8 = a + ip + lda * 7; + +#ifdef __GNUC__ + __builtin_prefetch(dx1 + PREFETCHSIZE, 0, 1); + __builtin_prefetch(dx2 + PREFETCHSIZE, 0, 1); + __builtin_prefetch(dx3 + PREFETCHSIZE, 0, 1); + __builtin_prefetch(dx4 + PREFETCHSIZE, 0, 1); + __builtin_prefetch(dx5 + PREFETCHSIZE, 0, 1); + __builtin_prefetch(dx6 + PREFETCHSIZE, 0, 1); + __builtin_prefetch(dx7 + PREFETCHSIZE, 0, 1); + __builtin_prefetch(dx8 + PREFETCHSIZE, 0, 1); +#endif + + atemp1 = *dx1; + btemp1 = *dy1; + atemp2 = *dx2; + btemp2 = *dy2; + atemp3 = *dx3; + btemp3 = *dy3; + atemp4 = *dx4; + btemp4 = *dy4; + + atemp5 = *dx5; + btemp5 = *dy5; + atemp6 = *dx6; + btemp6 = *dy6; + atemp7 = *dx7; + btemp7 = *dy7; + atemp8 = *dx8; + btemp8 = *dy8; + + if (ip != i) { + *dy1 = atemp1; + *dy2 = atemp2; + *dy3 = atemp3; + *dy4 = atemp4; + *dy5 = atemp5; + *dy6 = atemp6; + *dy7 = atemp7; + *dy8 = atemp8; + *(buffer + 0) = btemp1; + *(buffer + 1) = btemp2; + *(buffer + 2) = btemp3; + *(buffer + 3) = btemp4; + *(buffer + 4) = btemp5; + *(buffer + 5) = btemp6; + *(buffer + 6) = btemp7; + *(buffer + 7) = btemp8; + } else { + *(buffer + 0) = atemp1; + *(buffer + 1) = atemp2; + *(buffer + 2) = atemp3; + *(buffer + 3) = atemp4; + *(buffer + 4) = atemp5; + *(buffer + 5) = atemp6; + *(buffer + 6) = atemp7; + *(buffer + 7) = atemp8; + } + + buffer += 8; + + i++; + } while (i <= k2); + + a += 8 * lda; + j --; + } while (j > 0); + } + + if (n & 4) { + piv = ipiv; + + ip = *piv; + piv ++; + + dx1 = a + k1; + dy1 = a + ip; + dx2 = a + k1 + lda * 1; + dy2 = a + ip + lda * 1; + dx3 = a + k1 + lda * 2; + dy3 = a + ip + lda * 2; + dx4 = a + k1 + lda * 3; + dy4 = a + ip + lda * 3; + + i = k1; + + do { + atemp1 = *dx1; + atemp2 = *dx2; + atemp3 = *dx3; + atemp4 = *dx4; + + btemp1 = *dy1; + btemp2 = *dy2; + btemp3 = *dy3; + btemp4 = *dy4; + + if (ip != i) { + *dy1 = atemp1; + *dy2 = atemp2; + *dy3 = atemp3; + *dy4 = atemp4; + *(buffer + 0) = btemp1; + *(buffer + 1) = btemp2; + *(buffer + 2) = btemp3; + *(buffer + 3) = btemp4; + } else { + *(buffer + 0) = atemp1; + *(buffer + 1) = atemp2; + *(buffer + 2) = atemp3; + *(buffer + 3) = atemp4; + } + + ip = *piv; + piv ++; + + i++; + dx1 = a + i; + dy1 = a + ip; + dx2 = a + i + lda * 1; + dy2 = a + ip + lda * 1; + dx3 = a + i + lda * 2; + dy3 = a + ip + lda * 2; + dx4 = a + i + lda * 3; + dy4 = a + ip + lda * 3; + + buffer += 4; + + } while (i <= k2); + + a += 4 * 
lda; + } + + if (n & 2) { + piv = ipiv; + + i = k1; + do { + ip = *piv; + piv ++; + + dx1 = a + i; + dy1 = a + ip; + dx2 = a + i + lda; + dy2 = a + ip + lda; + + atemp1 = *dx1; + btemp1 = *dy1; + atemp2 = *dx2; + btemp2 = *dy2; + + if (ip != i) { + *dy1 = atemp1; + *dy2 = atemp2; + *(buffer + 0) = btemp1; + *(buffer + 1) = btemp2; + } else { + *(buffer + 0) = atemp1; + *(buffer + 1) = atemp2; + } + + buffer += 2; + + i++; + } while (i <= k2); + + a += 2 * lda; + } + + + if (n & 1) { + piv = ipiv; + + i = k1; + do { + ip = *piv; + piv ++; + + dx1 = a + i; + dy1 = a + ip; + atemp1 = *dx1; + btemp1 = *dy1; + + if (ip != i) { + *dy1 = atemp1; + *buffer = btemp1; + } else { + *buffer = atemp1; + } + + buffer ++; + + i++; + } while (i <= k2); + + a += lda; + } + + return 0; +} + diff --git a/kernel/generic/lsame.c b/kernel/generic/lsame.c new file mode 100644 index 0000000000..cae8b4ae3d --- /dev/null +++ b/kernel/generic/lsame.c @@ -0,0 +1,50 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include + +int NAME(char *A, char *B){ + + char a = *A; + char b = *B; + + if (a > 96) a -= 32; + if (b > 96) b -= 32; + + return (a == b); +} diff --git a/kernel/generic/neg_tcopy_1.c b/kernel/generic/neg_tcopy_1.c new file mode 100644 index 0000000000..3845f0439e --- /dev/null +++ b/kernel/generic/neg_tcopy_1.c @@ -0,0 +1,75 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG i, j; + + FLOAT *a_offset, *a_offset1; + FLOAT *b_offset, *b_offset1; + + a_offset = a; + b_offset = b; + + i = m; + + if (i > 0) { + do { + a_offset1 = a_offset; + a_offset += lda; + + b_offset1 = b_offset; + b_offset ++; + + j = n; + if (j > 0) { + do { + *(b_offset1 + 0) = -*(a_offset1 + 0); + a_offset1 ++; + b_offset1 += m; + j --; + } while (j > 0); + } + i --; + } while (i > 0); + } + + return 0; +} diff --git a/kernel/generic/neg_tcopy_16.c b/kernel/generic/neg_tcopy_16.c new file mode 100644 index 0000000000..2d47b27649 --- /dev/null +++ b/kernel/generic/neg_tcopy_16.c @@ -0,0 +1,387 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + + BLASLONG i, j; + + FLOAT *aoffset; + FLOAT *aoffset1, *aoffset2; + FLOAT *boffset; + + FLOAT ctemp01, ctemp02, ctemp03, ctemp04; + FLOAT ctemp05, ctemp06, ctemp07, ctemp08; + FLOAT ctemp09, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + FLOAT ctemp17, ctemp18, ctemp19, ctemp20; + FLOAT ctemp21, ctemp22, ctemp23, ctemp24; + FLOAT ctemp25, ctemp26, ctemp27, ctemp28; + FLOAT ctemp29, ctemp30, ctemp31, ctemp32; + + aoffset = a; + boffset = b; + +#if 0 + fprintf(stderr, "m = %d n = %d\n", m, n); +#endif + + j = (n >> 4); + if (j > 0){ + do{ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 16; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + ctemp09 = *(aoffset1 + 8); + ctemp10 = *(aoffset1 + 9); + ctemp11 = *(aoffset1 + 10); + ctemp12 = *(aoffset1 + 11); + ctemp13 = *(aoffset1 + 12); + ctemp14 = *(aoffset1 + 13); + ctemp15 = *(aoffset1 + 14); + ctemp16 = *(aoffset1 + 15); + + ctemp17 = *(aoffset2 + 0); + ctemp18 = *(aoffset2 + 1); + ctemp19 = *(aoffset2 + 2); + ctemp20 = *(aoffset2 + 3); + ctemp21 = *(aoffset2 + 4); + ctemp22 = *(aoffset2 + 5); + ctemp23 = *(aoffset2 + 6); + ctemp24 = *(aoffset2 + 7); + ctemp25 = *(aoffset2 + 8); + ctemp26 = *(aoffset2 + 9); + ctemp27 = *(aoffset2 + 10); + ctemp28 = *(aoffset2 + 11); + ctemp29 = *(aoffset2 + 12); + ctemp30 = *(aoffset2 + 13); + ctemp31 = *(aoffset2 + 14); + ctemp32 = *(aoffset2 + 15); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + *(boffset + 4) = -ctemp05; + *(boffset + 5) = -ctemp06; + *(boffset + 6) = -ctemp07; + *(boffset + 7) = -ctemp08; + + *(boffset + 8) = -ctemp09; + *(boffset + 9) = -ctemp10; + *(boffset + 10) = -ctemp11; + *(boffset + 11) = -ctemp12; + *(boffset + 12) = -ctemp13; + *(boffset + 13) = -ctemp14; + *(boffset + 14) = -ctemp15; + *(boffset + 15) = -ctemp16; + + *(boffset + 16) = -ctemp17; + *(boffset + 17) = -ctemp18; + *(boffset + 18) = -ctemp19; + *(boffset + 19) = -ctemp20; + *(boffset + 20) = -ctemp21; + *(boffset + 21) = -ctemp22; + *(boffset + 22) = -ctemp23; + *(boffset + 23) = -ctemp24; + + *(boffset + 24) = -ctemp25; + *(boffset + 25) = -ctemp26; + *(boffset + 26) = -ctemp27; + *(boffset + 27) = -ctemp28; + *(boffset + 28) = -ctemp29; + *(boffset + 29) = -ctemp30; + 
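+      /* last two negated elements of the second source row: each pass of  */
+      /* this inner loop reads a 2-row x 16-column tile of A and writes 32 */
+      /* sign-flipped values contiguously into the packed buffer b.        */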
*(boffset + 30) = -ctemp31; + *(boffset + 31) = -ctemp32; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 32; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + ctemp09 = *(aoffset1 + 8); + ctemp10 = *(aoffset1 + 9); + ctemp11 = *(aoffset1 + 10); + ctemp12 = *(aoffset1 + 11); + ctemp13 = *(aoffset1 + 12); + ctemp14 = *(aoffset1 + 13); + ctemp15 = *(aoffset1 + 14); + ctemp16 = *(aoffset1 + 15); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + *(boffset + 4) = -ctemp05; + *(boffset + 5) = -ctemp06; + *(boffset + 6) = -ctemp07; + *(boffset + 7) = -ctemp08; + + *(boffset + 8) = -ctemp09; + *(boffset + 9) = -ctemp10; + *(boffset + 10) = -ctemp11; + *(boffset + 11) = -ctemp12; + *(boffset + 12) = -ctemp13; + *(boffset + 13) = -ctemp14; + *(boffset + 14) = -ctemp15; + *(boffset + 15) = -ctemp16; + + boffset += 16; + } + + j--; + }while(j > 0); + } /* end of if(j > 0) */ + + if (n & 8){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 8; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + + ctemp09 = *(aoffset2 + 0); + ctemp10 = *(aoffset2 + 1); + ctemp11 = *(aoffset2 + 2); + ctemp12 = *(aoffset2 + 3); + ctemp13 = *(aoffset2 + 4); + ctemp14 = *(aoffset2 + 5); + ctemp15 = *(aoffset2 + 6); + ctemp16 = *(aoffset2 + 7); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + *(boffset + 4) = -ctemp05; + *(boffset + 5) = -ctemp06; + *(boffset + 6) = -ctemp07; + *(boffset + 7) = -ctemp08; + + *(boffset + 8) = -ctemp09; + *(boffset + 9) = -ctemp10; + *(boffset + 10) = -ctemp11; + *(boffset + 11) = -ctemp12; + *(boffset + 12) = -ctemp13; + *(boffset + 13) = -ctemp14; + *(boffset + 14) = -ctemp15; + *(boffset + 15) = -ctemp16; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 16; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + *(boffset + 4) = -ctemp05; + *(boffset + 5) = -ctemp06; + *(boffset + 6) = -ctemp07; + *(boffset + 7) = -ctemp08; + + boffset += 8; + } + } + + if (n & 4){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 4; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + *(boffset + 4) = -ctemp05; + *(boffset + 5) = -ctemp06; + *(boffset + 6) = -ctemp07; + *(boffset + 7) = -ctemp08; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 8; + + i --; + }while(i > 0); + } + + if (m & 1){ + 
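+      /* odd trailing row for this 4-column remainder: copy its four       */
+      /* entries, negated, into b (4 values, then boffset advances by 4).  */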
ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + + boffset += 4; + } + } + + if (n & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 2; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 4; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + boffset += 2; + } + } + + if (n & 1){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset2 + 0); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 2; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + *(boffset + 0) = -ctemp01; + boffset += 1; + } + } + + return 0; +} diff --git a/kernel/generic/neg_tcopy_2.c b/kernel/generic/neg_tcopy_2.c new file mode 100644 index 0000000000..e4dfa0bce0 --- /dev/null +++ b/kernel/generic/neg_tcopy_2.c @@ -0,0 +1,105 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + + BLASLONG i, j; + + FLOAT *a_offset, *a_offset1, *a_offset2; + FLOAT *b_offset, *b_offset1, *b_offset2; + + a_offset = a; + b_offset = b; + b_offset2 = b + m * (n & ~1); + + i = (m >> 1); + + if (i > 0) { + do { + a_offset1 = a_offset; + a_offset2 = a_offset + lda; + a_offset += 2 * lda; + + b_offset1 = b_offset; + b_offset += 4; + + j = (n >> 1); + if (j > 0){ + do { + *(b_offset1 + 0) = -*(a_offset1 + 0); + *(b_offset1 + 1) = -*(a_offset1 + 1); + *(b_offset1 + 2) = -*(a_offset2 + 0); + *(b_offset1 + 3) = -*(a_offset2 + 1); + a_offset1 += 2; + a_offset2 += 2; + b_offset1 += m * 2; + j--; + } while (j > 0); + } + + if (n & 1){ + *(b_offset2 + 0) = -*(a_offset1 + 0); + *(b_offset2 + 1) = -*(a_offset2 + 0); + b_offset2 += 2; + } + i --; + } while (i > 0); + } + + if (m & 1) { + j = (n >> 1); + if (j > 0){ + do { + *(b_offset + 0) = -*(a_offset + 0); + *(b_offset + 1) = -*(a_offset + 1); + a_offset += 2; + b_offset += m * 2; + j--; + } while (j > 0); + } + + if (n & 1){ + *(b_offset2 + 0) = -*(a_offset + 0); + } + } + + return 0; +} diff --git a/kernel/generic/neg_tcopy_4.c b/kernel/generic/neg_tcopy_4.c new file mode 100644 index 0000000000..9fb1dc7f90 --- /dev/null +++ b/kernel/generic/neg_tcopy_4.c @@ -0,0 +1,281 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + + BLASLONG i, j; + + FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; + FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3; + FLOAT ctemp1, ctemp2, ctemp3, ctemp4; + FLOAT ctemp5, ctemp6, ctemp7, ctemp8; + FLOAT ctemp9, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + + a_offset = a; + b_offset = b; + + b_offset2 = b + m * (n & ~3); + b_offset3 = b + m * (n & ~1); + + j = (m >> 2); + if (j > 0){ + do{ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset += 4 * lda; + + b_offset1 = b_offset; + b_offset += 16; + + i = (n >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + ctemp5 = *(a_offset2 + 0); + ctemp6 = *(a_offset2 + 1); + ctemp7 = *(a_offset2 + 2); + ctemp8 = *(a_offset2 + 3); + + ctemp9 = *(a_offset3 + 0); + ctemp10 = *(a_offset3 + 1); + ctemp11 = *(a_offset3 + 2); + ctemp12 = *(a_offset3 + 3); + + ctemp13 = *(a_offset4 + 0); + ctemp14 = *(a_offset4 + 1); + ctemp15 = *(a_offset4 + 2); + ctemp16 = *(a_offset4 + 3); + + a_offset1 += 4; + a_offset2 += 4; + a_offset3 += 4; + a_offset4 += 4; + + *(b_offset1 + 0) = -ctemp1; + *(b_offset1 + 1) = -ctemp2; + *(b_offset1 + 2) = -ctemp3; + *(b_offset1 + 3) = -ctemp4; + + *(b_offset1 + 4) = -ctemp5; + *(b_offset1 + 5) = -ctemp6; + *(b_offset1 + 6) = -ctemp7; + *(b_offset1 + 7) = -ctemp8; + + *(b_offset1 + 8) = -ctemp9; + *(b_offset1 + 9) = -ctemp10; + *(b_offset1 + 10) = -ctemp11; + *(b_offset1 + 11) = -ctemp12; + + *(b_offset1 + 12) = -ctemp13; + *(b_offset1 + 13) = -ctemp14; + *(b_offset1 + 14) = -ctemp15; + *(b_offset1 + 15) = -ctemp16; + + b_offset1 += m * 4; + i --; + }while(i > 0); + } + + if (n & 2) { + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + + ctemp3 = *(a_offset2 + 0); + ctemp4 = *(a_offset2 + 1); + + ctemp5 = *(a_offset3 + 0); + ctemp6 = *(a_offset3 + 1); + + ctemp7 = *(a_offset4 + 0); + ctemp8 = *(a_offset4 + 1); + + a_offset1 += 2; + a_offset2 += 2; + a_offset3 += 2; + a_offset4 += 2; + + *(b_offset2 + 0) = -ctemp1; + *(b_offset2 + 1) = -ctemp2; + *(b_offset2 + 2) = -ctemp3; + *(b_offset2 + 3) = -ctemp4; + + *(b_offset2 + 4) = -ctemp5; + *(b_offset2 + 5) = -ctemp6; + *(b_offset2 + 6) = -ctemp7; + *(b_offset2 + 7) = -ctemp8; + + b_offset2 += 8; + } + + if (n & 1) { + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset2 + 0); + ctemp3 = *(a_offset3 + 0); + ctemp4 = *(a_offset4 + 0); + + *(b_offset3 + 0) = -ctemp1; + *(b_offset3 + 1) = -ctemp2; + *(b_offset3 + 2) = -ctemp3; + *(b_offset3 + 3) = -ctemp4; + + b_offset3 += 4; + } + + j--; + }while(j > 0); + } + + if (m & 2){ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + b_offset1 = b_offset; + b_offset += 8; + + i = (n >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + ctemp5 = *(a_offset2 + 0); + ctemp6 = *(a_offset2 + 1); + ctemp7 = *(a_offset2 + 2); + ctemp8 = *(a_offset2 + 3); + + a_offset1 += 4; + a_offset2 += 4; + + *(b_offset1 + 0) = -ctemp1; + *(b_offset1 + 1) = -ctemp2; + *(b_offset1 + 2) = -ctemp3; + *(b_offset1 + 3) = -ctemp4; + + *(b_offset1 + 4) = -ctemp5; + *(b_offset1 + 5) = -ctemp6; + *(b_offset1 + 6) = -ctemp7; + *(b_offset1 + 7) = -ctemp8; + 
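+	  /* two remaining rows, four columns per step: 8 negated values are    */
+	  /* written, then the packed pointer jumps by 4*m to the next block.   */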
+ b_offset1 += m * 4; + i --; + }while(i > 0); + } + + if (n & 2) { + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + + ctemp3 = *(a_offset2 + 0); + ctemp4 = *(a_offset2 + 1); + + a_offset1 += 2; + a_offset2 += 2; + + *(b_offset2 + 0) = -ctemp1; + *(b_offset2 + 1) = -ctemp2; + *(b_offset2 + 2) = -ctemp3; + *(b_offset2 + 3) = -ctemp4; + + b_offset2 += 4; + } + + if (n & 1) { + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset2 + 0); + + *(b_offset3 + 0) = -ctemp1; + *(b_offset3 + 1) = -ctemp2; + b_offset3 += 2; + } + } + + if (m & 1){ + a_offset1 = a_offset; + b_offset1 = b_offset; + + i = (n >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + a_offset1 += 4; + + *(b_offset1 + 0) = -ctemp1; + *(b_offset1 + 1) = -ctemp2; + *(b_offset1 + 2) = -ctemp3; + *(b_offset1 + 3) = -ctemp4; + + b_offset1 += 4 * m; + + i --; + }while(i > 0); + } + + if (n & 2) { + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + a_offset1 += 2; + + *(b_offset2 + 0) = -ctemp1; + *(b_offset2 + 1) = -ctemp2; + } + + if (n & 1) { + ctemp1 = *(a_offset1 + 0); + *(b_offset3 + 0) = -ctemp1; + } + } + + return 0; +} diff --git a/kernel/generic/neg_tcopy_8.c b/kernel/generic/neg_tcopy_8.c new file mode 100644 index 0000000000..97fec3bd40 --- /dev/null +++ b/kernel/generic/neg_tcopy_8.c @@ -0,0 +1,787 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + + BLASLONG i, j; + + FLOAT *aoffset; + FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; + + FLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4; + + FLOAT ctemp01, ctemp02, ctemp03, ctemp04; + FLOAT ctemp05, ctemp06, ctemp07, ctemp08; + FLOAT ctemp09, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + FLOAT ctemp17, ctemp18, ctemp19, ctemp20; + FLOAT ctemp21, ctemp22, ctemp23, ctemp24; + FLOAT ctemp25, ctemp26, ctemp27, ctemp28; + FLOAT ctemp29, ctemp30, ctemp31, ctemp32; + FLOAT ctemp33, ctemp34, ctemp35, ctemp36; + FLOAT ctemp37, ctemp38, ctemp39, ctemp40; + FLOAT ctemp41, ctemp42, ctemp43, ctemp44; + FLOAT ctemp45, ctemp46, ctemp47, ctemp48; + FLOAT ctemp49, ctemp50, ctemp51, ctemp52; + FLOAT ctemp53, ctemp54, ctemp55, ctemp56; + FLOAT ctemp57, ctemp58, ctemp59, ctemp60; + FLOAT ctemp61, ctemp62, ctemp63, ctemp64; + + aoffset = a; + boffset = b; + +#if 0 + fprintf(stderr, "M = %d N = %d\n", m, n); +#endif + + boffset2 = b + m * (n & ~7); + boffset3 = b + m * (n & ~3); + boffset4 = b + m * (n & ~1); + + j = (m >> 3); + if (j > 0){ + do{ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset5 = aoffset4 + lda; + aoffset6 = aoffset5 + lda; + aoffset7 = aoffset6 + lda; + aoffset8 = aoffset7 + lda; + aoffset += 8 * lda; + + boffset1 = boffset; + boffset += 64; + + i = (n >> 3); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + aoffset1 += 8; + + ctemp09 = *(aoffset2 + 0); + ctemp10 = *(aoffset2 + 1); + ctemp11 = *(aoffset2 + 2); + ctemp12 = *(aoffset2 + 3); + ctemp13 = *(aoffset2 + 4); + ctemp14 = *(aoffset2 + 5); + ctemp15 = *(aoffset2 + 6); + ctemp16 = *(aoffset2 + 7); + aoffset2 += 8; + + ctemp17 = *(aoffset3 + 0); + ctemp18 = *(aoffset3 + 1); + ctemp19 = *(aoffset3 + 2); + ctemp20 = *(aoffset3 + 3); + ctemp21 = *(aoffset3 + 4); + ctemp22 = *(aoffset3 + 5); + ctemp23 = *(aoffset3 + 6); + ctemp24 = *(aoffset3 + 7); + aoffset3 += 8; + + ctemp25 = *(aoffset4 + 0); + ctemp26 = *(aoffset4 + 1); + ctemp27 = *(aoffset4 + 2); + ctemp28 = *(aoffset4 + 3); + ctemp29 = *(aoffset4 + 4); + ctemp30 = *(aoffset4 + 5); + ctemp31 = *(aoffset4 + 6); + ctemp32 = *(aoffset4 + 7); + aoffset4 += 8; + + ctemp33 = *(aoffset5 + 0); + ctemp34 = *(aoffset5 + 1); + ctemp35 = *(aoffset5 + 2); + ctemp36 = *(aoffset5 + 3); + ctemp37 = *(aoffset5 + 4); + ctemp38 = *(aoffset5 + 5); + ctemp39 = *(aoffset5 + 6); + ctemp40 = *(aoffset5 + 7); + aoffset5 += 8; + + ctemp41 = *(aoffset6 + 0); + ctemp42 = *(aoffset6 + 1); + ctemp43 = *(aoffset6 + 2); + ctemp44 = *(aoffset6 + 3); + ctemp45 = *(aoffset6 + 4); + ctemp46 = *(aoffset6 + 5); + ctemp47 = *(aoffset6 + 6); + ctemp48 = *(aoffset6 + 7); + aoffset6 += 8; + + ctemp49 = *(aoffset7 + 0); + ctemp50 = *(aoffset7 + 1); + ctemp51 = *(aoffset7 + 2); + ctemp52 = *(aoffset7 + 3); + ctemp53 = *(aoffset7 + 4); + ctemp54 = *(aoffset7 + 5); + ctemp55 = *(aoffset7 + 6); + ctemp56 = *(aoffset7 + 7); + aoffset7 += 8; + + ctemp57 = *(aoffset8 + 0); + ctemp58 = *(aoffset8 + 1); + ctemp59 = *(aoffset8 + 2); + ctemp60 = *(aoffset8 + 3); + ctemp61 = *(aoffset8 + 4); + 
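+	  /* tail of the eighth source row; each pass of this loop gathers an   */
+	  /* 8x8 tile of A and stores its 64 entries, sign-flipped, at          */
+	  /* consecutive offsets in b (boffset1 then advances by 8*m).          */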
ctemp62 = *(aoffset8 + 5); + ctemp63 = *(aoffset8 + 6); + ctemp64 = *(aoffset8 + 7); + aoffset8 += 8; + + *(boffset1 + 0) = -ctemp01; + *(boffset1 + 1) = -ctemp02; + *(boffset1 + 2) = -ctemp03; + *(boffset1 + 3) = -ctemp04; + *(boffset1 + 4) = -ctemp05; + *(boffset1 + 5) = -ctemp06; + *(boffset1 + 6) = -ctemp07; + *(boffset1 + 7) = -ctemp08; + + *(boffset1 + 8) = -ctemp09; + *(boffset1 + 9) = -ctemp10; + *(boffset1 + 10) = -ctemp11; + *(boffset1 + 11) = -ctemp12; + *(boffset1 + 12) = -ctemp13; + *(boffset1 + 13) = -ctemp14; + *(boffset1 + 14) = -ctemp15; + *(boffset1 + 15) = -ctemp16; + + *(boffset1 + 16) = -ctemp17; + *(boffset1 + 17) = -ctemp18; + *(boffset1 + 18) = -ctemp19; + *(boffset1 + 19) = -ctemp20; + *(boffset1 + 20) = -ctemp21; + *(boffset1 + 21) = -ctemp22; + *(boffset1 + 22) = -ctemp23; + *(boffset1 + 23) = -ctemp24; + + *(boffset1 + 24) = -ctemp25; + *(boffset1 + 25) = -ctemp26; + *(boffset1 + 26) = -ctemp27; + *(boffset1 + 27) = -ctemp28; + *(boffset1 + 28) = -ctemp29; + *(boffset1 + 29) = -ctemp30; + *(boffset1 + 30) = -ctemp31; + *(boffset1 + 31) = -ctemp32; + + *(boffset1 + 32) = -ctemp33; + *(boffset1 + 33) = -ctemp34; + *(boffset1 + 34) = -ctemp35; + *(boffset1 + 35) = -ctemp36; + *(boffset1 + 36) = -ctemp37; + *(boffset1 + 37) = -ctemp38; + *(boffset1 + 38) = -ctemp39; + *(boffset1 + 39) = -ctemp40; + + *(boffset1 + 40) = -ctemp41; + *(boffset1 + 41) = -ctemp42; + *(boffset1 + 42) = -ctemp43; + *(boffset1 + 43) = -ctemp44; + *(boffset1 + 44) = -ctemp45; + *(boffset1 + 45) = -ctemp46; + *(boffset1 + 46) = -ctemp47; + *(boffset1 + 47) = -ctemp48; + + *(boffset1 + 48) = -ctemp49; + *(boffset1 + 49) = -ctemp50; + *(boffset1 + 50) = -ctemp51; + *(boffset1 + 51) = -ctemp52; + *(boffset1 + 52) = -ctemp53; + *(boffset1 + 53) = -ctemp54; + *(boffset1 + 54) = -ctemp55; + *(boffset1 + 55) = -ctemp56; + + *(boffset1 + 56) = -ctemp57; + *(boffset1 + 57) = -ctemp58; + *(boffset1 + 58) = -ctemp59; + *(boffset1 + 59) = -ctemp60; + *(boffset1 + 60) = -ctemp61; + *(boffset1 + 61) = -ctemp62; + *(boffset1 + 62) = -ctemp63; + *(boffset1 + 63) = -ctemp64; + + boffset1 += m * 8; + i --; + }while(i > 0); + } + + if (n & 4){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + aoffset1 += 4; + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + aoffset2 += 4; + + ctemp09 = *(aoffset3 + 0); + ctemp10 = *(aoffset3 + 1); + ctemp11 = *(aoffset3 + 2); + ctemp12 = *(aoffset3 + 3); + aoffset3 += 4; + + ctemp13 = *(aoffset4 + 0); + ctemp14 = *(aoffset4 + 1); + ctemp15 = *(aoffset4 + 2); + ctemp16 = *(aoffset4 + 3); + aoffset4 += 4; + + ctemp17 = *(aoffset5 + 0); + ctemp18 = *(aoffset5 + 1); + ctemp19 = *(aoffset5 + 2); + ctemp20 = *(aoffset5 + 3); + aoffset5 += 4; + + ctemp21 = *(aoffset6 + 0); + ctemp22 = *(aoffset6 + 1); + ctemp23 = *(aoffset6 + 2); + ctemp24 = *(aoffset6 + 3); + aoffset6 += 4; + + ctemp25 = *(aoffset7 + 0); + ctemp26 = *(aoffset7 + 1); + ctemp27 = *(aoffset7 + 2); + ctemp28 = *(aoffset7 + 3); + aoffset7 += 4; + + ctemp29 = *(aoffset8 + 0); + ctemp30 = *(aoffset8 + 1); + ctemp31 = *(aoffset8 + 2); + ctemp32 = *(aoffset8 + 3); + aoffset8 += 4; + + *(boffset2 + 0) = -ctemp01; + *(boffset2 + 1) = -ctemp02; + *(boffset2 + 2) = -ctemp03; + *(boffset2 + 3) = -ctemp04; + *(boffset2 + 4) = -ctemp05; + *(boffset2 + 5) = -ctemp06; + *(boffset2 + 6) = -ctemp07; + *(boffset2 + 7) = -ctemp08; + *(boffset2 + 8) = -ctemp09; + *(boffset2 + 9) = -ctemp10; + *(boffset2 + 
10) = -ctemp11; + *(boffset2 + 11) = -ctemp12; + *(boffset2 + 12) = -ctemp13; + *(boffset2 + 13) = -ctemp14; + *(boffset2 + 14) = -ctemp15; + *(boffset2 + 15) = -ctemp16; + + *(boffset2 + 16) = -ctemp17; + *(boffset2 + 17) = -ctemp18; + *(boffset2 + 18) = -ctemp19; + *(boffset2 + 19) = -ctemp20; + *(boffset2 + 20) = -ctemp21; + *(boffset2 + 21) = -ctemp22; + *(boffset2 + 22) = -ctemp23; + *(boffset2 + 23) = -ctemp24; + *(boffset2 + 24) = -ctemp25; + *(boffset2 + 25) = -ctemp26; + *(boffset2 + 26) = -ctemp27; + *(boffset2 + 27) = -ctemp28; + *(boffset2 + 28) = -ctemp29; + *(boffset2 + 29) = -ctemp30; + *(boffset2 + 30) = -ctemp31; + *(boffset2 + 31) = -ctemp32; + + boffset2 += 32; + } + + if (n & 2){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + aoffset1 += 2; + + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + aoffset2 += 2; + + ctemp05 = *(aoffset3 + 0); + ctemp06 = *(aoffset3 + 1); + aoffset3 += 2; + + ctemp07 = *(aoffset4 + 0); + ctemp08 = *(aoffset4 + 1); + aoffset4 += 2; + + ctemp09 = *(aoffset5 + 0); + ctemp10 = *(aoffset5 + 1); + aoffset5 += 2; + + ctemp11 = *(aoffset6 + 0); + ctemp12 = *(aoffset6 + 1); + aoffset6 += 2; + + ctemp13 = *(aoffset7 + 0); + ctemp14 = *(aoffset7 + 1); + aoffset7 += 2; + + ctemp15 = *(aoffset8 + 0); + ctemp16 = *(aoffset8 + 1); + aoffset8 += 2; + + *(boffset3 + 0) = -ctemp01; + *(boffset3 + 1) = -ctemp02; + *(boffset3 + 2) = -ctemp03; + *(boffset3 + 3) = -ctemp04; + *(boffset3 + 4) = -ctemp05; + *(boffset3 + 5) = -ctemp06; + *(boffset3 + 6) = -ctemp07; + *(boffset3 + 7) = -ctemp08; + *(boffset3 + 8) = -ctemp09; + *(boffset3 + 9) = -ctemp10; + *(boffset3 + 10) = -ctemp11; + *(boffset3 + 11) = -ctemp12; + *(boffset3 + 12) = -ctemp13; + *(boffset3 + 13) = -ctemp14; + *(boffset3 + 14) = -ctemp15; + *(boffset3 + 15) = -ctemp16; + boffset3 += 16; + } + + if (n & 1){ + ctemp01 = *(aoffset1 + 0); + aoffset1 ++; + ctemp02 = *(aoffset2 + 0); + aoffset2 ++; + ctemp03 = *(aoffset3 + 0); + aoffset3 ++; + ctemp04 = *(aoffset4 + 0); + aoffset4 ++; + ctemp05 = *(aoffset5 + 0); + aoffset5 ++; + ctemp06 = *(aoffset6 + 0); + aoffset6 ++; + ctemp07 = *(aoffset7 + 0); + aoffset7 ++; + ctemp08 = *(aoffset8 + 0); + aoffset8 ++; + + *(boffset4 + 0) = -ctemp01; + *(boffset4 + 1) = -ctemp02; + *(boffset4 + 2) = -ctemp03; + *(boffset4 + 3) = -ctemp04; + *(boffset4 + 4) = -ctemp05; + *(boffset4 + 5) = -ctemp06; + *(boffset4 + 6) = -ctemp07; + *(boffset4 + 7) = -ctemp08; + boffset4 += 8; + } + + j--; + }while(j > 0); + } + + if (m & 4){ + + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset += 4 * lda; + + boffset1 = boffset; + boffset += 32; + + i = (n >> 3); + if (i > 0){ + + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + aoffset1 += 8; + + ctemp09 = *(aoffset2 + 0); + ctemp10 = *(aoffset2 + 1); + ctemp11 = *(aoffset2 + 2); + ctemp12 = *(aoffset2 + 3); + ctemp13 = *(aoffset2 + 4); + ctemp14 = *(aoffset2 + 5); + ctemp15 = *(aoffset2 + 6); + ctemp16 = *(aoffset2 + 7); + aoffset2 += 8; + + ctemp17 = *(aoffset3 + 0); + ctemp18 = *(aoffset3 + 1); + ctemp19 = *(aoffset3 + 2); + ctemp20 = *(aoffset3 + 3); + ctemp21 = *(aoffset3 + 4); + ctemp22 = *(aoffset3 + 5); + ctemp23 = *(aoffset3 + 6); + ctemp24 = *(aoffset3 + 7); + aoffset3 += 8; + + ctemp25 = *(aoffset4 + 0); + ctemp26 = *(aoffset4 + 1); + ctemp27 = 
*(aoffset4 + 2); + ctemp28 = *(aoffset4 + 3); + ctemp29 = *(aoffset4 + 4); + ctemp30 = *(aoffset4 + 5); + ctemp31 = *(aoffset4 + 6); + ctemp32 = *(aoffset4 + 7); + aoffset4 += 8; + + *(boffset1 + 0) = -ctemp01; + *(boffset1 + 1) = -ctemp02; + *(boffset1 + 2) = -ctemp03; + *(boffset1 + 3) = -ctemp04; + *(boffset1 + 4) = -ctemp05; + *(boffset1 + 5) = -ctemp06; + *(boffset1 + 6) = -ctemp07; + *(boffset1 + 7) = -ctemp08; + + *(boffset1 + 8) = -ctemp09; + *(boffset1 + 9) = -ctemp10; + *(boffset1 + 10) = -ctemp11; + *(boffset1 + 11) = -ctemp12; + *(boffset1 + 12) = -ctemp13; + *(boffset1 + 13) = -ctemp14; + *(boffset1 + 14) = -ctemp15; + *(boffset1 + 15) = -ctemp16; + + *(boffset1 + 16) = -ctemp17; + *(boffset1 + 17) = -ctemp18; + *(boffset1 + 18) = -ctemp19; + *(boffset1 + 19) = -ctemp20; + *(boffset1 + 20) = -ctemp21; + *(boffset1 + 21) = -ctemp22; + *(boffset1 + 22) = -ctemp23; + *(boffset1 + 23) = -ctemp24; + + *(boffset1 + 24) = -ctemp25; + *(boffset1 + 25) = -ctemp26; + *(boffset1 + 26) = -ctemp27; + *(boffset1 + 27) = -ctemp28; + *(boffset1 + 28) = -ctemp29; + *(boffset1 + 29) = -ctemp30; + *(boffset1 + 30) = -ctemp31; + *(boffset1 + 31) = -ctemp32; + + boffset1 += 8 * m; + i --; + }while(i > 0); + } + + if (n & 4) { + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + aoffset1 += 4; + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + aoffset2 += 4; + + ctemp09 = *(aoffset3 + 0); + ctemp10 = *(aoffset3 + 1); + ctemp11 = *(aoffset3 + 2); + ctemp12 = *(aoffset3 + 3); + aoffset3 += 4; + + ctemp13 = *(aoffset4 + 0); + ctemp14 = *(aoffset4 + 1); + ctemp15 = *(aoffset4 + 2); + ctemp16 = *(aoffset4 + 3); + aoffset4 += 4; + + *(boffset2 + 0) = -ctemp01; + *(boffset2 + 1) = -ctemp02; + *(boffset2 + 2) = -ctemp03; + *(boffset2 + 3) = -ctemp04; + *(boffset2 + 4) = -ctemp05; + *(boffset2 + 5) = -ctemp06; + *(boffset2 + 6) = -ctemp07; + *(boffset2 + 7) = -ctemp08; + + *(boffset2 + 8) = -ctemp09; + *(boffset2 + 9) = -ctemp10; + *(boffset2 + 10) = -ctemp11; + *(boffset2 + 11) = -ctemp12; + *(boffset2 + 12) = -ctemp13; + *(boffset2 + 13) = -ctemp14; + *(boffset2 + 14) = -ctemp15; + *(boffset2 + 15) = -ctemp16; + boffset2 += 16; + } + + if (n & 2){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + aoffset1 += 2; + + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + aoffset2 += 2; + + ctemp05 = *(aoffset3 + 0); + ctemp06 = *(aoffset3 + 1); + aoffset3 += 2; + + ctemp07 = *(aoffset4 + 0); + ctemp08 = *(aoffset4 + 1); + aoffset4 += 2; + + *(boffset3 + 0) = -ctemp01; + *(boffset3 + 1) = -ctemp02; + *(boffset3 + 2) = -ctemp03; + *(boffset3 + 3) = -ctemp04; + *(boffset3 + 4) = -ctemp05; + *(boffset3 + 5) = -ctemp06; + *(boffset3 + 6) = -ctemp07; + *(boffset3 + 7) = -ctemp08; + boffset3 += 8; + } + + if (n & 1){ + ctemp01 = *(aoffset1 + 0); + aoffset1 ++; + ctemp02 = *(aoffset2 + 0); + aoffset2 ++; + ctemp03 = *(aoffset3 + 0); + aoffset3 ++; + ctemp04 = *(aoffset4 + 0); + aoffset4 ++; + + *(boffset4 + 0) = -ctemp01; + *(boffset4 + 1) = -ctemp02; + *(boffset4 + 2) = -ctemp03; + *(boffset4 + 3) = -ctemp04; + boffset4 += 4; + } + } + + if (m & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset += 2 * lda; + + boffset1 = boffset; + boffset += 16; + + i = (n >> 3); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); 
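+	  /* two leftover rows (m & 2): read eight columns from each row and    */
+	  /* write the 16 negated values to b before stepping boffset1 by 8*m.  */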
+ ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + aoffset1 += 8; + + ctemp09 = *(aoffset2 + 0); + ctemp10 = *(aoffset2 + 1); + ctemp11 = *(aoffset2 + 2); + ctemp12 = *(aoffset2 + 3); + ctemp13 = *(aoffset2 + 4); + ctemp14 = *(aoffset2 + 5); + ctemp15 = *(aoffset2 + 6); + ctemp16 = *(aoffset2 + 7); + aoffset2 += 8; + + *(boffset1 + 0) = -ctemp01; + *(boffset1 + 1) = -ctemp02; + *(boffset1 + 2) = -ctemp03; + *(boffset1 + 3) = -ctemp04; + *(boffset1 + 4) = -ctemp05; + *(boffset1 + 5) = -ctemp06; + *(boffset1 + 6) = -ctemp07; + *(boffset1 + 7) = -ctemp08; + + *(boffset1 + 8) = -ctemp09; + *(boffset1 + 9) = -ctemp10; + *(boffset1 + 10) = -ctemp11; + *(boffset1 + 11) = -ctemp12; + *(boffset1 + 12) = -ctemp13; + *(boffset1 + 13) = -ctemp14; + *(boffset1 + 14) = -ctemp15; + *(boffset1 + 15) = -ctemp16; + + boffset1 += 8 * m; + i --; + }while(i > 0); + } + + if (n & 4){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + aoffset1 += 4; + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + aoffset2 += 4; + + *(boffset2 + 0) = -ctemp01; + *(boffset2 + 1) = -ctemp02; + *(boffset2 + 2) = -ctemp03; + *(boffset2 + 3) = -ctemp04; + *(boffset2 + 4) = -ctemp05; + *(boffset2 + 5) = -ctemp06; + *(boffset2 + 6) = -ctemp07; + *(boffset2 + 7) = -ctemp08; + boffset2 += 8; + } + + if (n & 2){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + aoffset1 += 2; + + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + aoffset2 += 2; + + *(boffset3 + 0) = -ctemp01; + *(boffset3 + 1) = -ctemp02; + *(boffset3 + 2) = -ctemp03; + *(boffset3 + 3) = -ctemp04; + boffset3 += 4; + } + + if (n & 1){ + ctemp01 = *(aoffset1 + 0); + aoffset1 ++; + ctemp02 = *(aoffset2 + 0); + aoffset2 ++; + + *(boffset4 + 0) = -ctemp01; + *(boffset4 + 1) = -ctemp02; + boffset4 += 2; + } + } + + if (m & 1){ + aoffset1 = aoffset; + aoffset += lda; + + boffset1 = boffset; + boffset += 8; + + i = (n >> 3); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + aoffset1 += 8; + + *(boffset1 + 0) = -ctemp01; + *(boffset1 + 1) = -ctemp02; + *(boffset1 + 2) = -ctemp03; + *(boffset1 + 3) = -ctemp04; + *(boffset1 + 4) = -ctemp05; + *(boffset1 + 5) = -ctemp06; + *(boffset1 + 6) = -ctemp07; + *(boffset1 + 7) = -ctemp08; + + boffset1 += 8 * m; + i --; + }while(i > 0); + } + + if (n & 4){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + aoffset1 += 4; + + *(boffset2 + 0) = -ctemp01; + *(boffset2 + 1) = -ctemp02; + *(boffset2 + 2) = -ctemp03; + *(boffset2 + 3) = -ctemp04; + boffset2 += 4; + } + + if (n & 2){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + aoffset1 += 2; + + *(boffset3 + 0) = -ctemp01; + *(boffset3 + 1) = -ctemp02; + boffset3 += 2; + } + + if (n & 1){ + ctemp01 = *(aoffset1 + 0); + aoffset1 ++; + *(boffset4 + 0) = -ctemp01; + boffset4 ++; + } + } + + return 0; +} diff --git a/kernel/generic/symm_lcopy_1.c b/kernel/generic/symm_lcopy_1.c new file mode 100644 index 0000000000..7b6cfea274 --- /dev/null +++ b/kernel/generic/symm_lcopy_1.c @@ -0,0 +1,76 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01; + FLOAT *ao1; + + js = n; + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + + if (offset > 0) ao1 += lda; else ao1 ++; + + b[ 0] = data01; + + b += 1; + + offset --; + i --; + } + + posX += 1; + js --; + } + + return 0; +} diff --git a/kernel/generic/symm_lcopy_16.c b/kernel/generic/symm_lcopy_16.c new file mode 100644 index 0000000000..2c8ad81d04 --- /dev/null +++ b/kernel/generic/symm_lcopy_16.c @@ -0,0 +1,273 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; + FLOAT *ao9, *ao10, *ao11, *ao12, *ao13, *ao14, *ao15, *ao16; + + js = (n >> 4); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; + if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; + if (offset > -2) ao3 = a + posX + 2 + posY * lda; else ao3 = a + posY + (posX + 2) * lda; + if (offset > -3) ao4 = a + posX + 3 + posY * lda; else ao4 = a + posY + (posX + 3) * lda; + if (offset > -4) ao5 = a + posX + 4 + posY * lda; else ao5 = a + posY + (posX + 4) * lda; + if (offset > -5) ao6 = a + posX + 5 + posY * lda; else ao6 = a + posY + (posX + 5) * lda; + if (offset > -6) ao7 = a + posX + 6 + posY * lda; else ao7 = a + posY + (posX + 6) * lda; + if (offset > -7) ao8 = a + posX + 7 + posY * lda; else ao8 = a + posY + (posX + 7) * lda; + if (offset > -8) ao9 = a + posX + 8 + posY * lda; else ao9 = a + posY + (posX + 8) * lda; + if (offset > -9) ao10 = a + posX + 9 + posY * lda; else ao10 = a + posY + (posX + 9) * lda; + if (offset > -10) ao11 = a + posX + 10 + posY * lda; else ao11 = a + posY + (posX + 10) * lda; + if (offset > -11) ao12 = a + posX + 11 + posY * lda; else ao12 = a + posY + (posX + 11) * lda; + if (offset > -12) ao13 = a + posX + 12 + posY * lda; else ao13 = a + posY + (posX + 12) * lda; + if (offset > -13) ao14 = a + posX + 13 + posY * lda; else ao14 = a + posY + (posX + 13) * lda; + if (offset > -14) ao15 = a + posX + 14 + posY * lda; else ao15 = a + posY + (posX + 14) * lda; + if (offset > -15) ao16 = a + posX + 15 + posY * lda; else ao16 = a + posY + (posX + 15) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + data03 = *(ao3 + 0); + data04 = *(ao4 + 0); + data05 = *(ao5 + 0); + data06 = *(ao6 + 0); + data07 = *(ao7 + 0); + data08 = *(ao8 + 0); + data09 = *(ao9 + 0); + data10 = *(ao10 + 0); + data11 = *(ao11 + 0); + data12 = *(ao12 + 0); + data13 = *(ao13 + 0); + data14 = *(ao14 + 0); + data15 = *(ao15 + 0); + data16 = *(ao16 + 0); + + if (offset > 0) ao1 += lda; else ao1 ++; + if (offset > -1) ao2 += lda; else ao2 ++; + if (offset > -2) ao3 += lda; else ao3 ++; + if (offset > -3) ao4 += lda; else ao4 ++; + if (offset > -4) ao5 += lda; else ao5 ++; + 
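+	  /* the remaining column pointers advance the same way: while a column */
+	  /* is on one side of the diagonal (offset > -k) it steps by lda,      */
+	  /* afterwards by 1; the two addressing forms above are mirror images  */
+	  /* across the diagonal, so b receives the full symmetric block while  */
+	  /* only the lower-stored triangle of a is ever read (lcopy variant).  */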
if (offset > -5) ao6 += lda; else ao6 ++; + if (offset > -6) ao7 += lda; else ao7 ++; + if (offset > -7) ao8 += lda; else ao8 ++; + if (offset > -8) ao9 += lda; else ao9 ++; + if (offset > -9) ao10 += lda; else ao10 ++; + if (offset > -10) ao11 += lda; else ao11 ++; + if (offset > -11) ao12 += lda; else ao12 ++; + if (offset > -12) ao13 += lda; else ao13 ++; + if (offset > -13) ao14 += lda; else ao14 ++; + if (offset > -14) ao15 += lda; else ao15 ++; + if (offset > -15) ao16 += lda; else ao16 ++; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + + b += 16; + + offset --; + i --; + } + + posX += 16; + js --; + } + + if (n & 8) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; + if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; + if (offset > -2) ao3 = a + posX + 2 + posY * lda; else ao3 = a + posY + (posX + 2) * lda; + if (offset > -3) ao4 = a + posX + 3 + posY * lda; else ao4 = a + posY + (posX + 3) * lda; + if (offset > -4) ao5 = a + posX + 4 + posY * lda; else ao5 = a + posY + (posX + 4) * lda; + if (offset > -5) ao6 = a + posX + 5 + posY * lda; else ao6 = a + posY + (posX + 5) * lda; + if (offset > -6) ao7 = a + posX + 6 + posY * lda; else ao7 = a + posY + (posX + 6) * lda; + if (offset > -7) ao8 = a + posX + 7 + posY * lda; else ao8 = a + posY + (posX + 7) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + data03 = *(ao3 + 0); + data04 = *(ao4 + 0); + data05 = *(ao5 + 0); + data06 = *(ao6 + 0); + data07 = *(ao7 + 0); + data08 = *(ao8 + 0); + + if (offset > 0) ao1 += lda; else ao1 ++; + if (offset > -1) ao2 += lda; else ao2 ++; + if (offset > -2) ao3 += lda; else ao3 ++; + if (offset > -3) ao4 += lda; else ao4 ++; + if (offset > -4) ao5 += lda; else ao5 ++; + if (offset > -5) ao6 += lda; else ao6 ++; + if (offset > -6) ao7 += lda; else ao7 ++; + if (offset > -7) ao8 += lda; else ao8 ++; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b += 8; + + offset --; + i --; + } + + posX += 8; + } + + if (n & 4) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; + if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; + if (offset > -2) ao3 = a + posX + 2 + posY * lda; else ao3 = a + posY + (posX + 2) * lda; + if (offset > -3) ao4 = a + posX + 3 + posY * lda; else ao4 = a + posY + (posX + 3) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + data03 = *(ao3 + 0); + data04 = *(ao4 + 0); + + if (offset > 0) ao1 += lda; else ao1 ++; + if (offset > -1) ao2 += lda; else ao2 ++; + if (offset > -2) ao3 += lda; else ao3 ++; + if (offset > -3) ao4 += lda; else ao4 ++; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; + i --; + } + + posX += 4; + } + + if (n & 2) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; + if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 
0); + + if (offset > 0) ao1 += lda; else ao1 ++; + if (offset > -1) ao2 += lda; else ao2 ++; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + posX += 2; + } + + if (n & 1) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + + if (offset > 0) ao1 += lda; else ao1 ++; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + } + + return 0; +} diff --git a/kernel/generic/symm_lcopy_2.c b/kernel/generic/symm_lcopy_2.c new file mode 100644 index 0000000000..e7944c4472 --- /dev/null +++ b/kernel/generic/symm_lcopy_2.c @@ -0,0 +1,102 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02; + FLOAT *ao1, *ao2; + + js = (n >> 1); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; + if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + + if (offset > 0) ao1 += lda; else ao1 ++; + if (offset > -1) ao2 += lda; else ao2 ++; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + posX += 2; + js --; + } + + if (n & 1) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + + if (offset > 0) ao1 += lda; else ao1 ++; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + } + + return 0; +} diff --git a/kernel/generic/symm_lcopy_4.c b/kernel/generic/symm_lcopy_4.c new file mode 100644 index 0000000000..ac04943e23 --- /dev/null +++ b/kernel/generic/symm_lcopy_4.c @@ -0,0 +1,138 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04; + FLOAT *ao1, *ao2, *ao3, *ao4; + + js = (n >> 2); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; + if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; + if (offset > -2) ao3 = a + posX + 2 + posY * lda; else ao3 = a + posY + (posX + 2) * lda; + if (offset > -3) ao4 = a + posX + 3 + posY * lda; else ao4 = a + posY + (posX + 3) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + data03 = *(ao3 + 0); + data04 = *(ao4 + 0); + + if (offset > 0) ao1 += lda; else ao1 ++; + if (offset > -1) ao2 += lda; else ao2 ++; + if (offset > -2) ao3 += lda; else ao3 ++; + if (offset > -3) ao4 += lda; else ao4 ++; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; + i --; + } + + posX += 4; + js --; + } + + if (n & 2) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; + if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + + if (offset > 0) ao1 += lda; else ao1 ++; + if (offset > -1) ao2 += lda; else ao2 ++; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + posX += 2; + } + + if (n & 1) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + + if (offset > 0) ao1 += lda; else ao1 ++; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + } + + return 0; +} diff --git a/kernel/generic/symm_lcopy_8.c b/kernel/generic/symm_lcopy_8.c new file mode 100644 index 0000000000..c315574ead --- /dev/null +++ b/kernel/generic/symm_lcopy_8.c @@ -0,0 +1,188 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; + + js = (n >> 3); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; + if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; + if (offset > -2) ao3 = a + posX + 2 + posY * lda; else ao3 = a + posY + (posX + 2) * lda; + if (offset > -3) ao4 = a + posX + 3 + posY * lda; else ao4 = a + posY + (posX + 3) * lda; + if (offset > -4) ao5 = a + posX + 4 + posY * lda; else ao5 = a + posY + (posX + 4) * lda; + if (offset > -5) ao6 = a + posX + 5 + posY * lda; else ao6 = a + posY + (posX + 5) * lda; + if (offset > -6) ao7 = a + posX + 6 + posY * lda; else ao7 = a + posY + (posX + 6) * lda; + if (offset > -7) ao8 = a + posX + 7 + posY * lda; else ao8 = a + posY + (posX + 7) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + data03 = *(ao3 + 0); + data04 = *(ao4 + 0); + data05 = *(ao5 + 0); + data06 = *(ao6 + 0); + data07 = *(ao7 + 0); + data08 = *(ao8 + 0); + + if (offset > 0) ao1 += lda; else ao1 ++; + if (offset > -1) ao2 += lda; else ao2 ++; + if (offset > -2) ao3 += lda; else ao3 ++; + if (offset > -3) ao4 += lda; else ao4 ++; + if (offset > -4) ao5 += lda; else ao5 ++; + if (offset > -5) ao6 += lda; else ao6 ++; + if (offset > -6) ao7 += lda; else ao7 ++; + if (offset > -7) ao8 += lda; else ao8 ++; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b += 8; + + offset --; + i --; + } + + posX += 8; + js --; + } + + if (n & 4) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; + if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; + if (offset > -2) ao3 = a + posX + 2 + posY * lda; else ao3 = a + posY + (posX + 2) * lda; + if (offset > -3) ao4 = a + posX + 3 + posY * lda; else ao4 = a + posY + (posX + 3) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + data03 = *(ao3 + 0); + data04 = *(ao4 + 0); + + if (offset > 0) ao1 += lda; else ao1 ++; + if (offset > -1) ao2 += lda; else ao2 ++; + if (offset > -2) ao3 += lda; else ao3 ++; + if (offset > -3) ao4 += lda; else ao4 ++; + + b[ 0] = data01; + b[ 1] = data02; + 
b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; + i --; + } + + posX += 4; + } + + if (n & 2) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; + if (offset > -1) ao2 = a + posX + 1 + posY * lda; else ao2 = a + posY + (posX + 1) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + + if (offset > 0) ao1 += lda; else ao1 ++; + if (offset > -1) ao2 += lda; else ao2 ++; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + posX += 2; + } + + if (n & 1) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posX + 0 + posY * lda; else ao1 = a + posY + (posX + 0) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + + if (offset > 0) ao1 += lda; else ao1 ++; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + } + + return 0; +} + diff --git a/kernel/generic/symm_ucopy_1.c b/kernel/generic/symm_ucopy_1.c new file mode 100644 index 0000000000..4ab9bb4227 --- /dev/null +++ b/kernel/generic/symm_ucopy_1.c @@ -0,0 +1,76 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01; + FLOAT *ao1; + + js = n; + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + + if (offset > 0) ao1 ++; else ao1 += lda; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + + posX ++; + js --; + } + + return 0; +} diff --git a/kernel/generic/symm_ucopy_16.c b/kernel/generic/symm_ucopy_16.c new file mode 100644 index 0000000000..094810b970 --- /dev/null +++ b/kernel/generic/symm_ucopy_16.c @@ -0,0 +1,274 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; + FLOAT *ao9, *ao10, *ao11, *ao12, *ao13, *ao14, *ao15, *ao16; + + js = (n >> 4); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; + if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; + if (offset > -2) ao3 = a + posY + (posX + 2) * lda; else ao3 = a + posX + 2 + posY * lda; + if (offset > -3) ao4 = a + posY + (posX + 3) * lda; else ao4 = a + posX + 3 + posY * lda; + if (offset > -4) ao5 = a + posY + (posX + 4) * lda; else ao5 = a + posX + 4 + posY * lda; + if (offset > -5) ao6 = a + posY + (posX + 5) * lda; else ao6 = a + posX + 5 + posY * lda; + if (offset > -6) ao7 = a + posY + (posX + 6) * lda; else ao7 = a + posX + 6 + posY * lda; + if (offset > -7) ao8 = a + posY + (posX + 7) * lda; else ao8 = a + posX + 7 + posY * lda; + if (offset > -8) ao9 = a + posY + (posX + 8) * lda; else ao9 = a + posX + 8 + posY * lda; + if (offset > -9) ao10 = a + posY + (posX + 9) * lda; else ao10 = a + posX + 9 + posY * lda; + if (offset > -10) ao11 = a + posY + (posX + 10) * lda; else ao11 = a + posX + 10 + posY * lda; + if (offset > -11) ao12 = a + posY + (posX + 11) * lda; else ao12 = a + posX + 11 + posY * lda; + if (offset > -12) ao13 = a + posY + (posX + 12) * lda; else ao13 = a + posX + 12 + posY * lda; + if (offset > -13) ao14 = a + posY + (posX + 13) * lda; else ao14 = a + posX + 13 + posY * lda; + if (offset > -14) ao15 = a + posY + (posX + 14) * lda; else ao15 = a + posX + 14 + posY * lda; + if (offset > -15) ao16 = a + posY + (posX + 15) * lda; else ao16 = a + posX + 15 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + data03 = *(ao3 + 0); + data04 = *(ao4 + 0); + data05 = *(ao5 + 0); + data06 = *(ao6 + 0); + data07 = *(ao7 + 0); + data08 = *(ao8 + 0); + data09 = *(ao9 + 0); + data10 = *(ao10 + 0); + data11 = *(ao11 + 0); + data12 = *(ao12 + 0); + data13 = *(ao13 + 0); + data14 = *(ao14 + 0); + data15 = *(ao15 + 0); + data16 = *(ao16 + 0); + + if (offset > 0) ao1 ++; else ao1 += lda; + if (offset > -1) ao2 ++; else ao2 += lda; + if (offset > -2) ao3 ++; else ao3 += lda; + if (offset > -3) ao4 ++; else ao4 += lda; + if (offset > -4) ao5 ++; else ao5 += lda; + if (offset > -5) ao6 ++; else ao6 += lda; + if (offset > -6) ao7 ++; else ao7 += lda; + if (offset > -7) ao8 ++; else ao8 += lda; + if (offset > -8) ao9 ++; else ao9 += lda; + if (offset > -9) ao10 ++; else ao10 += lda; + if (offset > -10) ao11 ++; else ao11 += lda; + if (offset > -11) ao12 ++; else ao12 += lda; + if (offset > -12) ao13 ++; else ao13 += lda; + if (offset > -13) ao14 ++; else ao14 += lda; + if (offset > -14) ao15 ++; else ao15 += lda; + if (offset > -15) ao16 ++; else ao16 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + + b += 16; + + offset --; + i --; + } + + posX += 16; + js 
--; + } + + if (n & 8) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; + if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; + if (offset > -2) ao3 = a + posY + (posX + 2) * lda; else ao3 = a + posX + 2 + posY * lda; + if (offset > -3) ao4 = a + posY + (posX + 3) * lda; else ao4 = a + posX + 3 + posY * lda; + if (offset > -4) ao5 = a + posY + (posX + 4) * lda; else ao5 = a + posX + 4 + posY * lda; + if (offset > -5) ao6 = a + posY + (posX + 5) * lda; else ao6 = a + posX + 5 + posY * lda; + if (offset > -6) ao7 = a + posY + (posX + 6) * lda; else ao7 = a + posX + 6 + posY * lda; + if (offset > -7) ao8 = a + posY + (posX + 7) * lda; else ao8 = a + posX + 7 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + data03 = *(ao3 + 0); + data04 = *(ao4 + 0); + data05 = *(ao5 + 0); + data06 = *(ao6 + 0); + data07 = *(ao7 + 0); + data08 = *(ao8 + 0); + + if (offset > 0) ao1 ++; else ao1 += lda; + if (offset > -1) ao2 ++; else ao2 += lda; + if (offset > -2) ao3 ++; else ao3 += lda; + if (offset > -3) ao4 ++; else ao4 += lda; + if (offset > -4) ao5 ++; else ao5 += lda; + if (offset > -5) ao6 ++; else ao6 += lda; + if (offset > -6) ao7 ++; else ao7 += lda; + if (offset > -7) ao8 ++; else ao8 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b += 8; + + offset --; + i --; + } + + posX += 8; + } + + if (n & 4) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; + if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; + if (offset > -2) ao3 = a + posY + (posX + 2) * lda; else ao3 = a + posX + 2 + posY * lda; + if (offset > -3) ao4 = a + posY + (posX + 3) * lda; else ao4 = a + posX + 3 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + data03 = *(ao3 + 0); + data04 = *(ao4 + 0); + + if (offset > 0) ao1 ++; else ao1 += lda; + if (offset > -1) ao2 ++; else ao2 += lda; + if (offset > -2) ao3 ++; else ao3 += lda; + if (offset > -3) ao4 ++; else ao4 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; + i --; + } + + posX += 4; + } + + if (n & 2) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; + if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + + if (offset > 0) ao1 ++; else ao1 += lda; + if (offset > -1) ao2 ++; else ao2 += lda; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + posX += 2; + } + + if (n & 1) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + + if (offset > 0) ao1 ++; else ao1 += lda; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + } + + return 0; +} diff --git a/kernel/generic/symm_ucopy_2.c b/kernel/generic/symm_ucopy_2.c new file mode 100644 index 0000000000..6396b746bf --- /dev/null +++ b/kernel/generic/symm_ucopy_2.c @@ -0,0 +1,101 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. 
*/ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02; + FLOAT *ao1, *ao2; + + js = (n >> 1); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; + if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + + if (offset > 0) ao1 ++; else ao1 += lda; + if (offset > -1) ao2 ++; else ao2 += lda; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + posX += 2; + js --; + } + + if (n & 1) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + + if (offset > 0) ao1 ++; else ao1 += lda; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + } + + return 0; +} diff --git a/kernel/generic/symm_ucopy_4.c b/kernel/generic/symm_ucopy_4.c new file mode 100644 index 0000000000..9b9cff8209 --- /dev/null +++ b/kernel/generic/symm_ucopy_4.c @@ -0,0 +1,136 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04; + FLOAT *ao1, *ao2, *ao3, *ao4; + + js = (n >> 2); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; + if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; + if (offset > -2) ao3 = a + posY + (posX + 2) * lda; else ao3 = a + posX + 2 + posY * lda; + if (offset > -3) ao4 = a + posY + (posX + 3) * lda; else ao4 = a + posX + 3 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + data03 = *(ao3 + 0); + data04 = *(ao4 + 0); + + if (offset > 0) ao1 ++; else ao1 += lda; + if (offset > -1) ao2 ++; else ao2 += lda; + if (offset > -2) ao3 ++; else ao3 += lda; + if (offset > -3) ao4 ++; else ao4 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; + i --; + } + + posX += 4; + js --; + } + + if (n & 2) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; + if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + + if (offset > 0) ao1 ++; else ao1 += lda; + if (offset > -1) ao2 ++; else ao2 += lda; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + posX += 2; + } + + if (n & 1) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + + if (offset > 0) ao1 ++; else ao1 += lda; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + } + + return 0; +} diff --git a/kernel/generic/symm_ucopy_8.c b/kernel/generic/symm_ucopy_8.c new file mode 100644 index 0000000000..411768ba55 --- /dev/null +++ b/kernel/generic/symm_ucopy_8.c @@ -0,0 +1,188 @@ 
+/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; + + js = (n >> 3); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; + if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; + if (offset > -2) ao3 = a + posY + (posX + 2) * lda; else ao3 = a + posX + 2 + posY * lda; + if (offset > -3) ao4 = a + posY + (posX + 3) * lda; else ao4 = a + posX + 3 + posY * lda; + if (offset > -4) ao5 = a + posY + (posX + 4) * lda; else ao5 = a + posX + 4 + posY * lda; + if (offset > -5) ao6 = a + posY + (posX + 5) * lda; else ao6 = a + posX + 5 + posY * lda; + if (offset > -6) ao7 = a + posY + (posX + 6) * lda; else ao7 = a + posX + 6 + posY * lda; + if (offset > -7) ao8 = a + posY + (posX + 7) * lda; else ao8 = a + posX + 7 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + data03 = *(ao3 + 0); + data04 = *(ao4 + 0); + data05 = *(ao5 + 0); + data06 = *(ao6 + 0); + data07 = *(ao7 + 0); + data08 = *(ao8 + 0); + + if (offset > 0) ao1 ++; else ao1 += lda; + if (offset > -1) ao2 ++; else ao2 += lda; + if (offset > -2) ao3 ++; else ao3 += lda; + if (offset > -3) ao4 ++; else ao4 += lda; + if (offset > -4) ao5 ++; else ao5 += lda; + if (offset > -5) ao6 ++; else ao6 += lda; + if (offset > -6) ao7 ++; else ao7 += lda; + if (offset > -7) ao8 ++; else ao8 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b += 8; + + offset --; + i --; + } + + posX += 8; + js --; + } + + if (n & 4) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; + if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; + if (offset > -2) ao3 = a + posY + (posX + 2) * lda; else ao3 = a + posX + 2 + posY * lda; + if (offset > -3) ao4 = a + posY + (posX + 3) * lda; else ao4 = a + posX + 3 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + data03 = *(ao3 + 0); + data04 = *(ao4 + 0); + + if (offset > 0) ao1 ++; else ao1 += lda; + if (offset > -1) ao2 ++; else ao2 += lda; + if (offset > -2) ao3 ++; else ao3 += lda; + if (offset > -3) ao4 ++; else ao4 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; + i --; + } + + posX += 4; + } + + if (n & 2) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; + if (offset > -1) ao2 = a + posY + (posX + 1) * lda; else ao2 = a + posX + 1 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + + if (offset > 0) ao1 ++; else ao1 += lda; + if (offset > -1) ao2 ++; else ao2 += lda; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + posX += 2; + } + + if (n & 1) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posY + (posX + 0) * lda; else ao1 = a + posX + 0 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + + if (offset > 0) ao1 ++; else ao1 += lda; + + b[ 0] = data01; + + b ++; + + 
offset --; + i --; + } + } + + return 0; +} diff --git a/kernel/generic/symv_k.c b/kernel/generic/symv_k.c new file mode 100644 index 0000000000..bd882fe85e --- /dev/null +++ b/kernel/generic/symv_k.c @@ -0,0 +1,123 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include +#include "common.h" +#include "symcopy.h" + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, + FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ + + BLASLONG is, min_i; + FLOAT *X = x; + FLOAT *Y = y; + FLOAT *symbuffer = buffer; + FLOAT *gemvbuffer = (FLOAT *)(((BLASLONG)buffer + SYMV_P * SYMV_P * sizeof(FLOAT) + 4095) & ~4095); + FLOAT *bufferY = gemvbuffer; + FLOAT *bufferX = gemvbuffer; + + if (incy != 1) { + Y = bufferY; + bufferX = (FLOAT *)(((BLASLONG)bufferY + m * sizeof(FLOAT) + 4095) & ~4095); + gemvbuffer = bufferX; + COPY_K(m, y, incy, Y, 1); + } + + if (incx != 1) { + X = bufferX; + gemvbuffer = (FLOAT *)(((BLASLONG)bufferX + m * sizeof(FLOAT) + 4095) & ~4095); + COPY_K(m, x, incx, X, 1); + } + +#ifndef LOWER + for(is = m - offset; is < m; is += SYMV_P){ + min_i = MIN(m - is, SYMV_P); +#else + for(is = 0; is < offset; is += SYMV_P){ + min_i = MIN(offset - is, SYMV_P); +#endif + +#ifndef LOWER + if (is >0){ + GEMV_T(is, min_i, 0, alpha, + a + is * lda, lda, + X, 1, + Y + is, 1, gemvbuffer); + + GEMV_N(is, min_i, 0, alpha, + a + is * lda, lda, + X + is, 1, + Y, 1, gemvbuffer); + } +#endif + +#ifdef LOWER + SYMCOPY_L(min_i, a + is + is * lda, lda, symbuffer); +#else + SYMCOPY_U(min_i, a + is + is * lda, lda, symbuffer); +#endif + + GEMV_N(min_i, min_i, 0, alpha, + symbuffer, min_i, + X + is, 1, + Y + is, 1, gemvbuffer); + +#ifdef LOWER + if (m - is > min_i){ + GEMV_T(m - is - min_i, min_i, 0, alpha, + a + (is + min_i) + is * lda, lda, + X + (is + min_i), 1, + Y + is, 1, gemvbuffer); + + GEMV_N(m - is - min_i, min_i, 0, alpha, + a + (is + min_i) + is * lda, lda, + X + is, 1, + Y + (is + min_i), 1, gemvbuffer); + } +#endif + + } /* end of is */ + + if (incy != 1) { + COPY_K(m, Y, 1, y, incy); + } + + return 0; +} + diff --git a/kernel/generic/trmm_lncopy_1.c b/kernel/generic/trmm_lncopy_1.c new file mode 100644 index 0000000000..66e407f80b --- /dev/null +++ b/kernel/generic/trmm_lncopy_1.c @@ -0,0 +1,92 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, X; + + FLOAT data01; + FLOAT *ao1; + + while (n > 0) { + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + } + + i = m; + if (i > 0) { + do { + + if (X > posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + ao1 += 1; + b += 1; + } else + if (X < posY) { + ao1 += lda; + b += 1; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + data01 = *(ao1 + 0); + b[ 0] = data01; +#endif + b += 1; + ao1 += 1; + } + + X ++; + i --; + } while (i > 0); + } + + posY += 1; + n --; + } + + return 0; +} diff --git a/kernel/generic/trmm_lncopy_16.c b/kernel/generic/trmm_lncopy_16.c new file mode 100644 index 0000000000..a183402979 --- /dev/null +++ b/kernel/generic/trmm_lncopy_16.c @@ -0,0 +1,1543 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X, ii; + + FLOAT *a01, *a02, *a03 ,*a04, *a05, *a06, *a07, *a08; + FLOAT *a09, *a10, *a11, *a12, *a13, *a14, *a15, *a16; + + js = (n >> 4); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + a01 = a + posY + (posX + 0) * lda; + a02 = a + posY + (posX + 1) * lda; + a03 = a + posY + (posX + 2) * lda; + a04 = a + posY + (posX + 3) * lda; + a05 = a + posY + (posX + 4) * lda; + a06 = a + posY + (posX + 5) * lda; + a07 = a + posY + (posX + 6) * lda; + a08 = a + posY + (posX + 7) * lda; + a09 = a + posY + (posX + 8) * lda; + a10 = a + posY + (posX + 9) * lda; + a11 = a + posY + (posX + 10) * lda; + a12 = a + posY + (posX + 11) * lda; + a13 = a + posY + (posX + 12) * lda; + a14 = a + posY + (posX + 13) * lda; + a15 = a + posY + (posX + 14) * lda; + a16 = a + posY + (posX + 15) * lda; + } else { + a01 = a + posX + (posY + 0) * lda; + a02 = a + posX + (posY + 1) * lda; + a03 = a + posX + (posY + 2) * lda; + a04 = a + posX + (posY + 3) * lda; + a05 = a + posX + (posY + 4) * lda; + a06 = a + posX + (posY + 5) * lda; + a07 = a + posX + (posY + 6) * lda; + a08 = a + posX + (posY + 7) * lda; + a09 = a + posX + (posY + 8) * lda; + a10 = a + posX + (posY + 9) * lda; + a11 = a + posX + (posY + 10) * lda; + a12 = a + posX + (posY + 11) * lda; + a13 = a + posX + (posY + 12) * lda; + a14 = a + posX + (posY + 13) * lda; + a15 = a + posX + (posY + 14) * lda; + a16 = a + posX + (posY + 15) * lda; + } + + i = (m >> 4); + if (i > 0) { + do { + if (X > posY) { + for (ii = 0; ii < 16; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a02 + 0); + b[ 2] = *(a03 + 0); + b[ 3] = *(a04 + 0); + b[ 4] = *(a05 + 0); + b[ 5] = *(a06 + 0); + b[ 6] = *(a07 + 0); + b[ 7] = *(a08 + 0); + + b[ 8] = *(a09 + 0); + b[ 9] = *(a10 + 0); + b[ 10] = *(a11 + 0); + b[ 11] = *(a12 + 0); + b[ 12] = *(a13 + 0); + b[ 13] = *(a14 + 0); + b[ 14] = *(a15 + 0); + b[ 15] = *(a16 + 0); + + a01 ++; + a02 ++; + a03 ++; + a04 ++; + a05 ++; + a06 ++; + a07 ++; + a08 ++; + a09 ++; + a10 ++; + a11 ++; + a12 ++; + a13 ++; + a14 ++; + a15 ++; + a16 ++; + b += 16; + } + } else + if (X < posY) { + a01 += 16 * lda; + a02 += 16 * lda; + a03 += 16 * lda; + a04 += 16 * lda; + a05 += 16 * lda; + a06 += 16 * lda; + a07 += 16 * lda; + a08 += 16 * lda; + a09 += 16 * lda; + a10 += 16 * lda; + a11 += 16 * lda; + a12 += 16 * lda; + a13 += 16 * lda; + a14 += 16 * lda; + a15 += 16 * lda; + a16 += 16 * lda; + b += 256; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + + b[ 16] = *(a01 + 1); +#ifdef UNIT + b[ 17] = ONE; +#else + b[ 17] = *(a02 + 1); +#endif + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + + b[ 32] = 
*(a01 + 2); + b[ 33] = *(a02 + 2); +#ifdef UNIT + b[ 34] = ONE; +#else + b[ 34] = *(a03 + 2); +#endif + b[ 35] = ZERO; + b[ 36] = ZERO; + b[ 37] = ZERO; + b[ 38] = ZERO; + b[ 39] = ZERO; + b[ 40] = ZERO; + b[ 41] = ZERO; + b[ 42] = ZERO; + b[ 43] = ZERO; + b[ 44] = ZERO; + b[ 45] = ZERO; + b[ 46] = ZERO; + b[ 47] = ZERO; + + b[ 48] = *(a01 + 3); + b[ 49] = *(a02 + 3); + b[ 50] = *(a03 + 3); +#ifdef UNIT + b[ 51] = ONE; +#else + b[ 51] = *(a04 + 3); +#endif + b[ 52] = ZERO; + b[ 53] = ZERO; + b[ 54] = ZERO; + b[ 55] = ZERO; + b[ 56] = ZERO; + b[ 57] = ZERO; + b[ 58] = ZERO; + b[ 59] = ZERO; + b[ 60] = ZERO; + b[ 61] = ZERO; + b[ 62] = ZERO; + b[ 63] = ZERO; + + b[ 64] = *(a01 + 4); + b[ 65] = *(a02 + 4); + b[ 66] = *(a03 + 4); + b[ 67] = *(a04 + 4); +#ifdef UNIT + b[ 68] = ONE; +#else + b[ 68] = *(a05 + 4); +#endif + b[ 69] = ZERO; + b[ 70] = ZERO; + b[ 71] = ZERO; + b[ 72] = ZERO; + b[ 73] = ZERO; + b[ 74] = ZERO; + b[ 75] = ZERO; + b[ 76] = ZERO; + b[ 77] = ZERO; + b[ 78] = ZERO; + b[ 79] = ZERO; + + b[ 80] = *(a01 + 5); + b[ 81] = *(a02 + 5); + b[ 82] = *(a03 + 5); + b[ 83] = *(a04 + 5); + b[ 84] = *(a05 + 5); +#ifdef UNIT + b[ 85] = ONE; +#else + b[ 85] = *(a06 + 5); +#endif + b[ 86] = ZERO; + b[ 87] = ZERO; + b[ 88] = ZERO; + b[ 89] = ZERO; + b[ 90] = ZERO; + b[ 91] = ZERO; + b[ 92] = ZERO; + b[ 93] = ZERO; + b[ 94] = ZERO; + b[ 95] = ZERO; + + b[ 96] = *(a01 + 6); + b[ 97] = *(a02 + 6); + b[ 98] = *(a03 + 6); + b[ 99] = *(a04 + 6); + b[100] = *(a05 + 6); + b[101] = *(a06 + 6); +#ifdef UNIT + b[102] = ONE; +#else + b[102] = *(a07 + 6); +#endif + b[103] = ZERO; + b[104] = ZERO; + b[105] = ZERO; + b[106] = ZERO; + b[107] = ZERO; + b[108] = ZERO; + b[109] = ZERO; + b[110] = ZERO; + b[111] = ZERO; + + b[112] = *(a01 + 7); + b[113] = *(a02 + 7); + b[114] = *(a03 + 7); + b[115] = *(a04 + 7); + b[116] = *(a05 + 7); + b[117] = *(a06 + 7); + b[118] = *(a07 + 7); +#ifdef UNIT + b[119] = ONE; +#else + b[119] = *(a08 + 7); +#endif + b[120] = ZERO; + b[121] = ZERO; + b[122] = ZERO; + b[123] = ZERO; + b[124] = ZERO; + b[125] = ZERO; + b[126] = ZERO; + b[127] = ZERO; + + b[128] = *(a01 + 8); + b[129] = *(a02 + 8); + b[130] = *(a03 + 8); + b[131] = *(a04 + 8); + b[132] = *(a05 + 8); + b[133] = *(a06 + 8); + b[134] = *(a07 + 8); + b[135] = *(a08 + 8); +#ifdef UNIT + b[136] = ONE; +#else + b[136] = *(a09 + 8); +#endif + b[137] = ZERO; + b[138] = ZERO; + b[139] = ZERO; + b[140] = ZERO; + b[141] = ZERO; + b[142] = ZERO; + b[143] = ZERO; + + b[144] = *(a01 + 9); + b[145] = *(a02 + 9); + b[146] = *(a03 + 9); + b[147] = *(a04 + 9); + b[148] = *(a05 + 9); + b[149] = *(a06 + 9); + b[150] = *(a07 + 9); + b[151] = *(a08 + 9); + b[152] = *(a09 + 9); +#ifdef UNIT + b[153] = ONE; +#else + b[153] = *(a10 + 9); +#endif + b[154] = ZERO; + b[155] = ZERO; + b[156] = ZERO; + b[157] = ZERO; + b[158] = ZERO; + b[159] = ZERO; + + b[160] = *(a01 + 10); + b[161] = *(a02 + 10); + b[162] = *(a03 + 10); + b[163] = *(a04 + 10); + b[164] = *(a05 + 10); + b[165] = *(a06 + 10); + b[166] = *(a07 + 10); + b[167] = *(a08 + 10); + b[168] = *(a09 + 10); + b[169] = *(a10 + 10); +#ifdef UNIT + b[170] = ONE; +#else + b[170] = *(a11 + 10); +#endif + b[171] = ZERO; + b[172] = ZERO; + b[173] = ZERO; + b[174] = ZERO; + b[175] = ZERO; + + b[176] = *(a01 + 11); + b[177] = *(a02 + 11); + b[178] = *(a03 + 11); + b[179] = *(a04 + 11); + b[180] = *(a05 + 11); + b[181] = *(a06 + 11); + b[182] = *(a07 + 11); + b[183] = *(a08 + 11); + b[184] = *(a09 + 11); + b[185] = *(a10 + 11); + b[186] = *(a11 + 11); +#ifdef UNIT + b[187] = ONE; +#else + b[187] = 
*(a12 + 11); +#endif + b[188] = ZERO; + b[189] = ZERO; + b[190] = ZERO; + b[191] = ZERO; + + b[192] = *(a01 + 12); + b[193] = *(a02 + 12); + b[194] = *(a03 + 12); + b[195] = *(a04 + 12); + b[196] = *(a05 + 12); + b[197] = *(a06 + 12); + b[198] = *(a07 + 12); + b[199] = *(a08 + 12); + b[200] = *(a09 + 12); + b[201] = *(a10 + 12); + b[202] = *(a11 + 12); + b[203] = *(a12 + 12); +#ifdef UNIT + b[204] = ONE; +#else + b[204] = *(a13 + 12); +#endif + b[205] = ZERO; + b[206] = ZERO; + b[207] = ZERO; + + b[208] = *(a01 + 13); + b[209] = *(a02 + 13); + b[210] = *(a03 + 13); + b[211] = *(a04 + 13); + b[212] = *(a05 + 13); + b[213] = *(a06 + 13); + b[214] = *(a07 + 13); + b[215] = *(a08 + 13); + b[216] = *(a09 + 13); + b[217] = *(a10 + 13); + b[218] = *(a11 + 13); + b[219] = *(a12 + 13); + b[220] = *(a13 + 13); +#ifdef UNIT + b[221] = ONE; +#else + b[221] = *(a14 + 13); +#endif + b[222] = ZERO; + b[223] = ZERO; + + b[224] = *(a01 + 14); + b[225] = *(a02 + 14); + b[226] = *(a03 + 14); + b[227] = *(a04 + 14); + b[228] = *(a05 + 14); + b[229] = *(a06 + 14); + b[230] = *(a07 + 14); + b[231] = *(a08 + 14); + b[232] = *(a09 + 14); + b[233] = *(a10 + 14); + b[234] = *(a11 + 14); + b[235] = *(a12 + 14); + b[236] = *(a13 + 14); + b[237] = *(a14 + 14); +#ifdef UNIT + b[238] = ONE; +#else + b[238] = *(a15 + 14); +#endif + b[239] = ZERO; + + b[240] = *(a01 + 15); + b[241] = *(a02 + 15); + b[242] = *(a03 + 15); + b[243] = *(a04 + 15); + b[244] = *(a05 + 15); + b[245] = *(a06 + 15); + b[246] = *(a07 + 15); + b[247] = *(a08 + 15); + b[248] = *(a09 + 15); + b[249] = *(a10 + 15); + b[250] = *(a11 + 15); + b[251] = *(a12 + 15); + b[252] = *(a13 + 15); + b[253] = *(a14 + 15); + b[254] = *(a15 + 15); +#ifdef UNIT + b[255] = ONE; +#else + b[255] = *(a16 + 15); +#endif + + a01 += 16; + a02 += 16; + a03 += 16; + a04 += 16; + a05 += 16; + a06 += 16; + a07 += 16; + a08 += 16; + a09 += 16; + a10 += 16; + a11 += 16; + a12 += 16; + a13 += 16; + a14 += 16; + a15 += 16; + a16 += 16; + b += 256; + + } + + X += 16; + i --; + } while (i > 0); + } + + i = (m & 15); + if (i) { + + if (X > posY) { + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a02 + 0); + b[ 2] = *(a03 + 0); + b[ 3] = *(a04 + 0); + b[ 4] = *(a05 + 0); + b[ 5] = *(a06 + 0); + b[ 6] = *(a07 + 0); + b[ 7] = *(a08 + 0); + + b[ 8] = *(a09 + 0); + b[ 9] = *(a10 + 0); + b[ 10] = *(a11 + 0); + b[ 11] = *(a12 + 0); + b[ 12] = *(a13 + 0); + b[ 13] = *(a14 + 0); + b[ 14] = *(a15 + 0); + b[ 15] = *(a16 + 0); + + a01 ++; + a02 ++; + a03 ++; + a04 ++; + a05 ++; + a06 ++; + a07 ++; + a08 ++; + a09 ++; + a10 ++; + a11 ++; + a12 ++; + a13 ++; + a14 ++; + a15 ++; + a16 ++; + b += 16; + } + } else + if (X < posY) { + a01 += i * lda; + a02 += i * lda; + a03 += i * lda; + a04 += i * lda; + a05 += i * lda; + a06 += i * lda; + a07 += i * lda; + a08 += i * lda; + a09 += i * lda; + a10 += i * lda; + a11 += i * lda; + a12 += i * lda; + a13 += i * lda; + a14 += i * lda; + a15 += i * lda; + a16 += i * lda; + b += 16 * i; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b += 16; + + if (i >= 2) { + b[ 0] = *(a01 + 1); +#ifdef UNIT + b[ 1] = ONE; +#else + b[ 1] = *(a02 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + 
b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b += 16; + } + + if (i >= 3) { + b[ 0] = *(a01 + 2); + b[ 1] = *(a02 + 2); +#ifdef UNIT + b[ 2] = ONE; +#else + b[ 2] = *(a03 + 2); +#endif + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b += 16; + } + + if (i >= 4) { + b[ 0] = *(a01 + 3); + b[ 1] = *(a02 + 3); + b[ 2] = *(a03 + 3); +#ifdef UNIT + b[ 3] = ONE; +#else + b[ 3] = *(a04 + 3); +#endif + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b += 16; + } + + if (i >= 5) { + b[ 0] = *(a01 + 4); + b[ 1] = *(a02 + 4); + b[ 2] = *(a03 + 4); + b[ 3] = *(a04 + 4); +#ifdef UNIT + b[ 4] = ONE; +#else + b[ 4] = *(a05 + 4); +#endif + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b += 16; + } + + if (i >= 6) { + b[ 0] = *(a01 + 5); + b[ 1] = *(a02 + 5); + b[ 2] = *(a03 + 5); + b[ 3] = *(a04 + 5); + b[ 4] = *(a05 + 5); +#ifdef UNIT + b[ 5] = ONE; +#else + b[ 5] = *(a06 + 5); +#endif + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b += 16; + } + + if (i >= 7) { + b[ 0] = *(a01 + 6); + b[ 1] = *(a02 + 6); + b[ 2] = *(a03 + 6); + b[ 3] = *(a04 + 6); + b[ 4] = *(a05 + 6); + b[ 5] = *(a06 + 6); +#ifdef UNIT + b[ 6] = ONE; +#else + b[ 6] = *(a07 + 6); +#endif + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b += 16; + } + + if (i >= 8) { + b[ 0] = *(a01 + 7); + b[ 1] = *(a02 + 7); + b[ 2] = *(a03 + 7); + b[ 3] = *(a04 + 7); + b[ 4] = *(a05 + 7); + b[ 5] = *(a06 + 7); + b[ 6] = *(a07 + 7); +#ifdef UNIT + b[ 7] = ONE; +#else + b[ 7] = *(a08 + 7); +#endif + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b += 16; + } + + if (i >= 9) { + b[ 0] = *(a01 + 8); + b[ 1] = *(a02 + 8); + b[ 2] = *(a03 + 8); + b[ 3] = *(a04 + 8); + b[ 4] = *(a05 + 8); + b[ 5] = *(a06 + 8); + b[ 6] = *(a07 + 8); + b[ 7] = *(a08 + 8); +#ifdef UNIT + b[ 8] = ONE; +#else + b[ 8] = *(a09 + 8); +#endif + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b += 16; + } + + if (i >= 10) { + b[ 0] = *(a01 + 9); + b[ 1] = *(a02 + 9); + b[ 2] = *(a03 + 9); + b[ 3] = *(a04 + 9); + b[ 4] = *(a05 + 9); + b[ 5] = *(a06 + 9); + b[ 6] = *(a07 + 9); + b[ 7] = *(a08 + 9); + b[ 8] = *(a09 + 9); +#ifdef UNIT + b[ 9] = ONE; +#else + b[ 9] = *(a10 + 9); +#endif + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b += 16; + } + + if (i >= 11) { + b[ 0] = *(a01 + 10); + b[ 1] = *(a02 + 10); + b[ 2] = *(a03 + 10); + b[ 3] = *(a04 + 10); + b[ 4] = *(a05 + 10); + b[ 5] = *(a06 + 10); + b[ 6] = *(a07 + 10); + b[ 7] = *(a08 + 10); + b[ 8] = *(a09 + 10); + b[ 9] = *(a10 + 10); +#ifdef UNIT + b[ 10] = ONE; +#else + b[ 10] = *(a11 + 10); +#endif + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b += 16; + } + +
if (i >= 12) { + b[ 0] = *(a01 + 11); + b[ 1] = *(a02 + 11); + b[ 2] = *(a03 + 11); + b[ 3] = *(a04 + 11); + b[ 4] = *(a05 + 11); + b[ 5] = *(a06 + 11); + b[ 6] = *(a07 + 11); + b[ 7] = *(a08 + 11); + b[ 8] = *(a09 + 11); + b[ 9] = *(a10 + 11); + b[ 10] = *(a11 + 11); +#ifdef UNIT + b[ 11] = ONE; +#else + b[ 11] = *(a12 + 11); +#endif + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b += 16; + } + + if (i >= 13) { + b[ 0] = *(a01 + 12); + b[ 1] = *(a02 + 12); + b[ 2] = *(a03 + 12); + b[ 3] = *(a04 + 12); + b[ 4] = *(a05 + 12); + b[ 5] = *(a06 + 12); + b[ 6] = *(a07 + 12); + b[ 7] = *(a08 + 12); + b[ 8] = *(a09 + 12); + b[ 9] = *(a10 + 12); + b[ 10] = *(a11 + 12); + b[ 11] = *(a12 + 12); +#ifdef UNIT + b[ 12] = ONE; +#else + b[ 12] = *(a13 + 12); +#endif + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b += 16; + } + + if (i >= 14) { + b[ 0] = *(a01 + 13); + b[ 1] = *(a02 + 13); + b[ 2] = *(a03 + 13); + b[ 3] = *(a04 + 13); + b[ 4] = *(a05 + 13); + b[ 5] = *(a06 + 13); + b[ 6] = *(a07 + 13); + b[ 7] = *(a08 + 13); + b[ 8] = *(a09 + 13); + b[ 9] = *(a10 + 13); + b[ 10] = *(a11 + 13); + b[ 11] = *(a12 + 13); + b[ 12] = *(a13 + 13); +#ifdef UNIT + b[ 13] = ONE; +#else + b[ 13] = *(a14 + 13); +#endif + b[ 14] = ZERO; + b[ 15] = ZERO; + b += 16; + } + + if (i >= 15) { + b[ 0] = *(a01 + 14); + b[ 1] = *(a02 + 14); + b[ 2] = *(a03 + 14); + b[ 3] = *(a04 + 14); + b[ 4] = *(a05 + 14); + b[ 5] = *(a06 + 14); + b[ 6] = *(a07 + 14); + b[ 7] = *(a08 + 14); + b[ 8] = *(a09 + 14); + b[ 9] = *(a10 + 14); + b[ 10] = *(a11 + 14); + b[ 11] = *(a12 + 14); + b[ 12] = *(a13 + 14); + b[ 13] = *(a14 + 14); +#ifdef UNIT + b[ 14] = ONE; +#else + b[ 14] = *(a15 + 14); +#endif + b[ 15] = ZERO; + b += 16; + } + } + } + + posY += 16; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 8){ + X = posX; + + if (posX <= posY) { + a01 = a + posY + (posX + 0) * lda; + a02 = a + posY + (posX + 1) * lda; + a03 = a + posY + (posX + 2) * lda; + a04 = a + posY + (posX + 3) * lda; + a05 = a + posY + (posX + 4) * lda; + a06 = a + posY + (posX + 5) * lda; + a07 = a + posY + (posX + 6) * lda; + a08 = a + posY + (posX + 7) * lda; + } else { + a01 = a + posX + (posY + 0) * lda; + a02 = a + posX + (posY + 1) * lda; + a03 = a + posX + (posY + 2) * lda; + a04 = a + posX + (posY + 3) * lda; + a05 = a + posX + (posY + 4) * lda; + a06 = a + posX + (posY + 5) * lda; + a07 = a + posX + (posY + 6) * lda; + a08 = a + posX + (posY + 7) * lda; + } + + i = (m >> 3); + if (i > 0) { + do { + if (X > posY) { + for (ii = 0; ii < 8; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a02 + 0); + b[ 2] = *(a03 + 0); + b[ 3] = *(a04 + 0); + b[ 4] = *(a05 + 0); + b[ 5] = *(a06 + 0); + b[ 6] = *(a07 + 0); + b[ 7] = *(a08 + 0); + + a01 ++; + a02 ++; + a03 ++; + a04 ++; + a05 ++; + a06 ++; + a07 ++; + a08 ++; + b += 8; + } + } else + if (X < posY) { + a01 += 8 * lda; + a02 += 8 * lda; + a03 += 8 * lda; + a04 += 8 * lda; + a05 += 8 * lda; + a06 += 8 * lda; + a07 += 8 * lda; + a08 += 8 * lda; + b += 64; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = *(a01 + 1); +#ifdef UNIT + b[ 9] = ONE; +#else + b[ 9] = *(a02 + 1); +#endif + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + + b[ 16] = *(a01 + 2); + b[ 17] = *(a02 + 2); +#ifdef UNIT + b[ 18] = ONE; +#else + b[ 18] = *(a03 + 2); +#endif + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 
21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + + b[ 24] = *(a01 + 3); + b[ 25] = *(a02 + 3); + b[ 26] = *(a03 + 3); +#ifdef UNIT + b[ 27] = ONE; +#else + b[ 27] = *(a04 + 3); +#endif + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + + b[ 32] = *(a01 + 4); + b[ 33] = *(a02 + 4); + b[ 34] = *(a03 + 4); + b[ 35] = *(a04 + 4); +#ifdef UNIT + b[ 36] = ONE; +#else + b[ 36] = *(a05 + 4); +#endif + b[ 37] = ZERO; + b[ 38] = ZERO; + b[ 39] = ZERO; + + b[ 40] = *(a01 + 5); + b[ 41] = *(a02 + 5); + b[ 42] = *(a03 + 5); + b[ 43] = *(a04 + 5); + b[ 44] = *(a05 + 5); +#ifdef UNIT + b[ 45] = ONE; +#else + b[ 45] = *(a06 + 5); +#endif + b[ 46] = ZERO; + b[ 47] = ZERO; + + b[ 48] = *(a01 + 6); + b[ 49] = *(a02 + 6); + b[ 50] = *(a03 + 6); + b[ 51] = *(a04 + 6); + b[ 52] = *(a05 + 6); + b[ 53] = *(a06 + 6); +#ifdef UNIT + b[ 54] = ONE; +#else + b[ 54] = *(a07 + 6); +#endif + b[ 55] = ZERO; + + b[ 56] = *(a01 + 7); + b[ 57] = *(a02 + 7); + b[ 58] = *(a03 + 7); + b[ 59] = *(a04 + 7); + b[ 60] = *(a05 + 7); + b[ 61] = *(a06 + 7); + b[ 62] = *(a07 + 7); +#ifdef UNIT + b[ 63] = ONE; +#else + b[ 63] = *(a08 + 7); +#endif + + a01 += 8; + a02 += 8; + a03 += 8; + a04 += 8; + a05 += 8; + a06 += 8; + a07 += 8; + a08 += 8; + b += 64; + } + + X += 8; + i --; + } while (i > 0); + } + + i = (m & 7); + if (i) { + + if (X > posY) { + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a02 + 0); + b[ 2] = *(a03 + 0); + b[ 3] = *(a04 + 0); + b[ 4] = *(a05 + 0); + b[ 5] = *(a06 + 0); + b[ 6] = *(a07 + 0); + b[ 7] = *(a08 + 0); + + a01 ++; + a02 ++; + a03 ++; + a04 ++; + a05 ++; + a06 ++; + a07 ++; + a08 ++; + b += 8; + } + } else + if (X < posY) { + a01 += i * lda; + a02 += i * lda; + a03 += i * lda; + a04 += i * lda; + a05 += i * lda; + a06 += i * lda; + a07 += i * lda; + a08 += i * lda; + b += 8 * i; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + + if (i >= 2) { + b[ 0] = *(a01 + 1); +#ifdef UNIT + b[ 1] = ONE; +#else + b[ 1] = *(a02 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 3) { + b[ 0] = *(a01 + 2); + b[ 1] = *(a02 + 2); +#ifdef UNIT + b[ 2] = ONE; +#else + b[ 2] = *(a03 + 2); +#endif + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 4) { + b[ 0] = *(a01 + 3); + b[ 1] = *(a02 + 3); + b[ 2] = *(a03 + 3); +#ifdef UNIT + b[ 3] = ONE; +#else + b[ 3] = *(a04 + 3); +#endif + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 5) { + b[ 0] = *(a01 + 4); + b[ 1] = *(a02 + 4); + b[ 2] = *(a03 + 4); + b[ 3] = *(a04 + 4); +#ifdef UNIT + b[ 4] = ONE; +#else + b[ 4] = *(a05 + 4); +#endif + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 6) { + b[ 0] = *(a01 + 5); + b[ 1] = *(a02 + 5); + b[ 2] = *(a03 + 5); + b[ 3] = *(a04 + 5); + b[ 4] = *(a05 + 5); +#ifdef UNIT + b[ 5] = ONE; +#else + b[ 5] = *(a06 + 5); +#endif + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 7) { + b[ 0] = *(a01 + 6); + b[ 1] = *(a02 + 6); + b[ 2] = *(a03 + 6); + b[ 3] = *(a04 + 6); + b[ 4] = *(a05 + 6); + b[ 5] = *(a06 + 6); +#ifdef UNIT + b[ 6] = ONE; +#else + b[ 6] = *(a07 + 6); +#endif + b[ 7] = ZERO; + b += 8; + } + } + } + + posY += 8; + } + + + if (n & 4){ + X = posX; + + if (posX <= posY) { + a01 = a + posY + (posX + 0) * lda; + a02 = a + posY + (posX + 1) 
* lda; + a03 = a + posY + (posX + 2) * lda; + a04 = a + posY + (posX + 3) * lda; + } else { + a01 = a + posX + (posY + 0) * lda; + a02 = a + posX + (posY + 1) * lda; + a03 = a + posX + (posY + 2) * lda; + a04 = a + posX + (posY + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X > posY) { + for (ii = 0; ii < 4; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a02 + 0); + b[ 2] = *(a03 + 0); + b[ 3] = *(a04 + 0); + + a01 ++; + a02 ++; + a03 ++; + a04 ++; + b += 4; + } + } else + if (X < posY) { + a01 += 4 * lda; + a02 += 4 * lda; + a03 += 4 * lda; + a04 += 4 * lda; + b += 16; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + + b[ 4] = *(a01 + 1); +#ifdef UNIT + b[ 5] = ONE; +#else + b[ 5] = *(a02 + 1); +#endif + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = *(a01 + 2); + b[ 9] = *(a02 + 2); +#ifdef UNIT + b[ 10] = ONE; +#else + b[ 10] = *(a03 + 2); +#endif + b[ 11] = ZERO; + + b[ 12] = *(a01 + 3); + b[ 13] = *(a02 + 3); + b[ 14] = *(a03 + 3); +#ifdef UNIT + b[ 15] = ONE; +#else + b[ 15] = *(a04 + 3); +#endif + + a01 += 4; + a02 += 4; + a03 += 4; + a04 += 4; + b += 16; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X > posY) { + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a02 + 0); + b[ 2] = *(a03 + 0); + b[ 3] = *(a04 + 0); + + a01 ++; + a02 ++; + a03 ++; + a04 ++; + b += 4; + } + } else + if (X < posY) { + a01 += i * lda; + a02 += i * lda; + a03 += i * lda; + a04 += i * lda; + b += 4 * i; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b += 4; + + if (i >= 2) { + b[ 0] = *(a01 + 1); +#ifdef UNIT + b[ 1] = ONE; +#else + b[ 1] = *(a02 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b += 4; + } + + if (i >= 3) { + b[ 0] = *(a01 + 2); + b[ 1] = *(a02 + 2); +#ifdef UNIT + b[ 2] = ONE; +#else + b[ 2] = *(a03 + 2); +#endif + b[ 3] = ZERO; + b += 4; + } + } + } + + posY += 4; + } + + if (n & 2){ + X = posX; + + if (posX <= posY) { + a01 = a + posY + (posX + 0) * lda; + a02 = a + posY + (posX + 1) * lda; + } else { + a01 = a + posX + (posY + 0) * lda; + a02 = a + posX + (posY + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X > posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a02 + 0); + b[ 2] = *(a01 + 1); + b[ 3] = *(a02 + 1); + a01 += 2; + a02 += 2; + b += 4; + } else + if (X < posY) { + a01 += 2 * lda; + a02 += 2 * lda; + b += 4; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = ZERO; + + b[ 2] = *(a01 + 1); +#ifdef UNIT + b[ 3] = ONE; +#else + b[ 3] = *(a02 + 1); +#endif + + a01 += 2; + a02 += 2; + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + if (m & 1) { + + if (X > posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a02 + 0); + + a01 ++; + a02 ++; + b += 2; + } else + if (X < posY) { + a01 += lda; + a02 += lda; + b += 2; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = ZERO; + b += 2; + } + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + a01 = a + posY + (posX + 0) * lda; + } else { + a01 = a + posX + (posY + 0) * lda; + } + + i = m; + if (m > 0) { + do { + if (X > posY) { + b[ 0] = *(a01 + 0); + a01 += 1; + b += 1; + } else + if (X < posY) { + a01 += lda; + b += 1; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b += 1; + } + + X += 1; + i --; + } while (i > 0); + } + } + + return 0; +} diff --git a/kernel/generic/trmm_lncopy_2.c 
b/kernel/generic/trmm_lncopy_2.c new file mode 100644 index 0000000000..f7fefaaadf --- /dev/null +++ b/kernel/generic/trmm_lncopy_2.c @@ -0,0 +1,198 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02, data03, data04; + FLOAT *ao1, *ao2; + + js = (n >> 1); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data03; + b[ 2] = data02; + b[ 3] = data04; + + ao1 += 2; + ao2 += 2; + b += 4; + } else + if (X < posY) { + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + + } else { +#ifdef UNIT + data02 = *(ao1 + 1); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = data02; + b[ 3] = ONE; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data04 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data02; + b[ 3] = data04; +#endif + ao1 += 2; + ao2 += 2; + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + if (m & 1) { + + if (X > posY) { + data01 = *(ao1 + 0); + data03 = *(ao2 + 0); + + b[ 0] = data01; + b[ 1] = data03; + + ao1 += 1; + ao2 += 1; + b += 2; + } else + if (X < posY) { + ao1 += lda; + b += 2; + } else { +#ifdef UNIT + data03 = *(ao2 + 0); + + b[ 0] = ONE; + b[ 1] = data03; +#else + data01 = *(ao1 + 0); + data03 = *(ao2 + 0); + + b[ 0] = data01; + b[ 1] = data03; +#endif + ao1 += 1; + ao2 += 1; + b += 2; + } + } + + posY += 2; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + } + + i = m; + if (i > 0) { + do { + + if (X > posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + ao1 += 1; + b += 1; + } else + if (X < posY) { + ao1 += lda; + b += 1; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + data01 = *(ao1 + 0); + b[ 0] = data01; +#endif + b += 1; + ao1 += 1; + } + + X ++; + i --; + } while (i > 0); + } + + posY += 1; + } + + return 0; +} diff --git a/kernel/generic/trmm_lncopy_4.c b/kernel/generic/trmm_lncopy_4.c new file mode 100644 index 0000000000..6cd16673a3 --- /dev/null +++ b/kernel/generic/trmm_lncopy_4.c @@ -0,0 +1,484 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT *ao1, *ao2, *ao3, *ao4; + + js = (n >> 2); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX + 3) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + ao4 = a + posX + (posY + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + data11 = *(ao3 + 2); + data12 = *(ao3 + 3); + + data13 = *(ao4 + 0); + data14 = *(ao4 + 1); + data15 = *(ao4 + 2); + data16 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = data05; + b[ 2] = data09; + b[ 3] = data13; + b[ 4] = data02; + b[ 5] = data06; + b[ 6] = data10; + b[ 7] = data14; + + b[ 8] = data03; + b[ 9] = data07; + b[10] = data11; + b[11] = data15; + b[12] = data04; + b[13] = data08; + b[14] = data12; + b[15] = data16; + + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + b += 16; + + } else + if (X < posY) { + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + b += 16; + + } else { +#ifdef UNIT + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + data12 = *(ao3 + 3); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = data02; + b[ 5] = ONE; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = data03; + b[ 9] = data07; + b[10] = ONE; + b[11] = ZERO; + b[12] = data04; + b[13] = data08; + b[14] = data12; + b[15] = ONE; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + data11 = *(ao3 + 2); + data12 = *(ao3 + 3); + + data16 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = data02; + b[ 5] = data06; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = data03; + b[ 9] = data07; + b[10] = data11; + b[11] = ZERO; + b[12] = data04; + b[13] = data08; + b[14] = data12; + b[15] = data16; +#endif + ao1 += 4; + ao2 += 4; 
+ ao3 += 4; + ao4 += 4; + b += 16; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X > posY) { + + if (m & 2) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + + b[ 0] = data01; + b[ 1] = data03; + b[ 2] = data05; + b[ 3] = data07; + b[ 4] = data02; + b[ 5] = data04; + b[ 6] = data06; + b[ 7] = data08; + + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + b += 8; + } + + if (m & 1) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + data03 = *(ao3 + 0); + data04 = *(ao4 + 0); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + ao1 += 1; + ao2 += 1; + ao3 += 1; + ao4 += 1; + b += 4; + } + + } else + if (X < posY) { + if (m & 2) { + ao1 += 2 * lda; + ao2 += 2 * lda; + + b += 8; + } + + if (m & 1) { + ao1 += lda; + b += 4; + } + + } else { +#ifdef UNIT + data05 = *(ao2 + 0); + data09 = *(ao3 + 0); + data13 = *(ao4 + 0); + + if (i >= 2) { + data10 = *(ao3 + 1); + data14 = *(ao4 + 1); + } + + if (i >= 3) { + data15 = *(ao4 + 2); + } + + b[ 0] = ONE; + b[ 1] = data05; + b[ 2] = data09; + b[ 3] = data13; + b += 4; + + if(i >= 2) { + b[ 0] = ZERO; + b[ 1] = ONE; + b[ 2] = data10; + b[ 3] = data14; + b += 4; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ONE; + b[ 3] = data15; + b += 4; + } +#else + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); + data09 = *(ao3 + 0); + data13 = *(ao4 + 0); + + if (i >= 2) { + data06 = *(ao2 + 1); + data10 = *(ao3 + 1); + data14 = *(ao4 + 1); + } + + if (i >= 3) { + data11 = *(ao3 + 2); + data15 = *(ao4 + 2); + } + + b[ 0] = data01; + b[ 1] = data05; + b[ 2] = data09; + b[ 3] = data13; + b += 4; + + if(i >= 2) { + b[ 0] = ZERO; + b[ 1] = data06; + b[ 2] = data10; + b[ 3] = data14; + b += 4; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = data11; + b[ 3] = data15; + b += 4; + } +#endif + } + } + + posY += 4; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 2){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data05; + b[ 2] = data02; + b[ 3] = data06; + + ao1 += 2; + ao2 += 2; + b += 4; + + } else + if (X < posY) { + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + } else { +#ifdef UNIT + data02 = *(ao1 + 1); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = data02; + b[ 3] = ONE; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data02; + b[ 3] = data06; +#endif + ao1 += 2; + ao2 += 2; + + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i) { + + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao2 + 0); + b[ 0] = data01; + b[ 1] = data02; + + ao1 += 1; + ao2 += 1; + b += 2; + } else + if (X < posY) { + ao1 += lda; + b += 2; + } else { +#ifdef UNIT + data05 = *(ao2 + 0); + + b[ 0] = ONE; + b[ 1] = data05; +#else + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); + + b[ 0] = data01; + b[ 1] = data05; +#endif + b += 2; + } + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + } + + i = m; + if (i > 0) { + do { 
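+ /* Final single-column pass: one element per iteration.  Elements with X > posY   */
+ /* (below the diagonal) are copied; elements with X < posY are skipped and only   */
+ /* the pointers advance; at X == posY the diagonal element is stored, forced to   */
+ /* ONE when UNIT is defined.                                                      */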
+ if (X > posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + b += 1; + ao1 += 1; + } else + if (X < posY) { + b += 1; + ao1 += lda; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + data01 = *(ao1 + 0); + b[ 0] = data01; +#endif + b += 1; + ao1 += 1; + } + + X ++; + i --; + } while (i > 0); + } + + posY += 1; + } + + return 0; +} diff --git a/kernel/generic/trmm_lncopy_8.c b/kernel/generic/trmm_lncopy_8.c new file mode 100644 index 0000000000..4a1964bd79 --- /dev/null +++ b/kernel/generic/trmm_lncopy_8.c @@ -0,0 +1,1227 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT data17, data18, data19, data20, data21, data22, data23, data24; + FLOAT data25, data26, data27, data28, data29, data30, data31, data32; + FLOAT data33, data34, data35, data36, data37, data38, data39, data40; + FLOAT data41, data42, data43, data44, data45, data46, data47, data48; + FLOAT data49, data50, data51, data52, data53, data54, data55, data56; + FLOAT data57, data58, data59, data60, data61, data62, data63, data64; + + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; + + js = (n >> 3); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX + 3) * lda; + ao5 = a + posY + (posX + 4) * lda; + ao6 = a + posY + (posX + 5) * lda; + ao7 = a + posY + (posX + 6) * lda; + ao8 = a + posY + (posX + 7) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + ao4 = a + posX + (posY + 3) * lda; + ao5 = a + posX + (posY + 4) * lda; + ao6 = a + posX + (posY + 5) * lda; + ao7 = a + posX + (posY + 6) * lda; + ao8 = a + posX + (posY + 7) * lda; + } + + i = (m >> 3); + if (i > 0) { + do { + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + data13 = *(ao2 + 4); + data14 = *(ao2 + 5); + data15 = *(ao2 + 6); + data16 = *(ao2 + 7); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + data21 = *(ao3 + 4); + data22 = *(ao3 + 5); + data23 = *(ao3 + 6); + data24 = *(ao3 + 7); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + data28 = *(ao4 + 3); + data29 = *(ao4 + 4); + data30 = *(ao4 + 5); + data31 = *(ao4 + 6); + data32 = *(ao4 + 7); + + data33 = *(ao5 + 0); + data34 = *(ao5 + 1); + data35 = *(ao5 + 2); + data36 = *(ao5 + 3); + data37 = *(ao5 + 4); + data38 = *(ao5 + 5); + data39 = *(ao5 + 6); + data40 = *(ao5 + 7); + + data41 = *(ao6 + 0); + data42 = *(ao6 + 1); + data43 = *(ao6 + 2); + data44 = *(ao6 + 3); + data45 = *(ao6 + 4); + data46 = *(ao6 + 5); + data47 = *(ao6 + 6); + data48 = *(ao6 + 7); + + data49 = *(ao7 + 0); + data50 = *(ao7 + 1); + data51 = *(ao7 + 2); + data52 = *(ao7 + 3); + data53 = *(ao7 + 4); + data54 = *(ao7 + 5); + data55 = *(ao7 + 6); + data56 = *(ao7 + 7); + + data57 = *(ao8 + 0); + data58 = *(ao8 + 1); + data59 = *(ao8 + 2); + data60 = *(ao8 + 3); + data61 = *(ao8 + 4); + data62 = *(ao8 + 5); + data63 = *(ao8 + 6); + data64 = *(ao8 + 7); + + b[ 0] = data01; + b[ 1] = data09; + b[ 2] = data17; + b[ 3] = data25; + b[ 4] = data33; + b[ 5] = data41; + b[ 6] = data49; + b[ 7] = data57; + + b[ 8] = data02; + b[ 9] = data10; + b[10] = data18; + b[11] = data26; + b[12] = data34; + b[13] = data42; + b[14] = data50; + b[15] = data58; + + b[16] = data03; + b[17] = data11; + b[18] = data19; + b[19] = data27; + b[20] = data35; + b[21] = data43; + b[22] = data51; + b[23] = data59; + + 
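+ /* Each group of eight stores one row of the 8x8 block: b[8*k .. 8*k+7] receives  */
+ /* the entries at row offset k from the eight source columns ao1..ao8, i.e. the   */
+ /* column-major block is written to b in row-major order.                         */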
b[24] = data04; + b[25] = data12; + b[26] = data20; + b[27] = data28; + b[28] = data36; + b[29] = data44; + b[30] = data52; + b[31] = data60; + + b[32] = data05; + b[33] = data13; + b[34] = data21; + b[35] = data29; + b[36] = data37; + b[37] = data45; + b[38] = data53; + b[39] = data61; + + b[40] = data06; + b[41] = data14; + b[42] = data22; + b[43] = data30; + b[44] = data38; + b[45] = data46; + b[46] = data54; + b[47] = data62; + + b[48] = data07; + b[49] = data15; + b[50] = data23; + b[51] = data31; + b[52] = data39; + b[53] = data47; + b[54] = data55; + b[55] = data63; + + b[56] = data08; + b[57] = data16; + b[58] = data24; + b[59] = data32; + b[60] = data40; + b[61] = data48; + b[62] = data56; + b[63] = data64; + + ao1 += 8; + ao2 += 8; + ao3 += 8; + ao4 += 8; + ao5 += 8; + ao6 += 8; + ao7 += 8; + ao8 += 8; + + b += 64; + + } else + if (X < posY) { + ao1 += 8 * lda; + ao2 += 8 * lda; + ao3 += 8 * lda; + ao4 += 8 * lda; + ao5 += 8 * lda; + ao6 += 8 * lda; + ao7 += 8 * lda; + ao8 += 8 * lda; + + b += 64; + + } else { +#ifndef UNIT + data01 = *(ao1 + 0); +#endif + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + +#ifndef UNIT + data10 = *(ao2 + 1); +#endif + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + data13 = *(ao2 + 4); + data14 = *(ao2 + 5); + data15 = *(ao2 + 6); + data16 = *(ao2 + 7); + +#ifndef UNIT + data19 = *(ao3 + 2); +#endif + data20 = *(ao3 + 3); + data21 = *(ao3 + 4); + data22 = *(ao3 + 5); + data23 = *(ao3 + 6); + data24 = *(ao3 + 7); + +#ifndef UNIT + data28 = *(ao4 + 3); +#endif + data29 = *(ao4 + 4); + data30 = *(ao4 + 5); + data31 = *(ao4 + 6); + data32 = *(ao4 + 7); + +#ifndef UNIT + data37 = *(ao5 + 4); +#endif + data38 = *(ao5 + 5); + data39 = *(ao5 + 6); + data40 = *(ao5 + 7); + +#ifndef UNIT + data46 = *(ao6 + 5); +#endif + data47 = *(ao6 + 6); + data48 = *(ao6 + 7); + +#ifndef UNIT + data55 = *(ao7 + 6); +#endif + data56 = *(ao7 + 7); + +#ifndef UNIT + data64 = *(ao8 + 7); +#endif + +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = data01; +#endif + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = data02; +#ifdef UNIT + b[ 9] = ONE; +#else + b[ 9] = data10; +#endif + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + + b[16] = data03; + b[17] = data11; +#ifdef UNIT + b[18] = ONE; +#else + b[18] = data19; +#endif + b[19] = ZERO; + b[20] = ZERO; + b[21] = ZERO; + b[22] = ZERO; + b[23] = ZERO; + + b[24] = data04; + b[25] = data12; + b[26] = data20; +#ifdef UNIT + b[27] = ONE; +#else + b[27] = data28; +#endif + b[28] = ZERO; + b[29] = ZERO; + b[30] = ZERO; + b[31] = ZERO; + + b[32] = data05; + b[33] = data13; + b[34] = data21; + b[35] = data29; +#ifdef UNIT + b[36] = ONE; +#else + b[36] = data37; +#endif + b[37] = ZERO; + b[38] = ZERO; + b[39] = ZERO; + + b[40] = data06; + b[41] = data14; + b[42] = data22; + b[43] = data30; + b[44] = data38; +#ifdef UNIT + b[45] = ONE; +#else + b[45] = data46; +#endif + b[46] = ZERO; + b[47] = ZERO; + + b[48] = data07; + b[49] = data15; + b[50] = data23; + b[51] = data31; + b[52] = data39; + b[53] = data47; +#ifdef UNIT + b[54] = ONE; +#else + b[54] = data55; +#endif + b[55] = ZERO; + + b[56] = data08; + b[57] = data16; + b[58] = data24; + b[59] = data32; + b[60] = data40; + b[61] = data48; + b[62] = data56; +#ifdef UNIT + b[63] = ONE; +#else + b[63] = data64; +#endif + + ao1 += 8; + ao2 += 8; + ao3 += 8; + ao4 
+= 8; + ao5 += 8; + ao6 += 8; + ao7 += 8; + ao8 += 8; + + b += 64; + } + + X += 8; + i --; + } while (i > 0); + } + + i = (m & 7); + if (i) { + + if (X > posY) { + + if (m & 4) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + data28 = *(ao4 + 3); + + data33 = *(ao5 + 0); + data34 = *(ao5 + 1); + data35 = *(ao5 + 2); + data36 = *(ao5 + 3); + + data41 = *(ao6 + 0); + data42 = *(ao6 + 1); + data43 = *(ao6 + 2); + data44 = *(ao6 + 3); + + data49 = *(ao7 + 0); + data50 = *(ao7 + 1); + data51 = *(ao7 + 2); + data52 = *(ao7 + 3); + + data57 = *(ao8 + 0); + data58 = *(ao8 + 1); + data59 = *(ao8 + 2); + data60 = *(ao8 + 3); + + b[ 0] = data01; + b[ 1] = data09; + b[ 2] = data17; + b[ 3] = data25; + b[ 4] = data33; + b[ 5] = data41; + b[ 6] = data49; + b[ 7] = data57; + + b[ 8] = data02; + b[ 9] = data10; + b[10] = data18; + b[11] = data26; + b[12] = data34; + b[13] = data42; + b[14] = data50; + b[15] = data58; + + b[16] = data03; + b[17] = data11; + b[18] = data19; + b[19] = data27; + b[20] = data35; + b[21] = data43; + b[22] = data51; + b[23] = data59; + + b[24] = data04; + b[25] = data12; + b[26] = data20; + b[27] = data28; + b[28] = data36; + b[29] = data44; + b[30] = data52; + b[31] = data60; + + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + ao5 += 4; + ao6 += 4; + ao7 += 4; + ao8 += 4; + + b += 32; + } + + if (m & 2) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + + data33 = *(ao5 + 0); + data34 = *(ao5 + 1); + + data41 = *(ao6 + 0); + data42 = *(ao6 + 1); + + data49 = *(ao7 + 0); + data50 = *(ao7 + 1); + + data57 = *(ao8 + 0); + data58 = *(ao8 + 1); + + b[ 0] = data01; + b[ 1] = data09; + b[ 2] = data17; + b[ 3] = data25; + b[ 4] = data33; + b[ 5] = data41; + b[ 6] = data49; + b[ 7] = data57; + + b[ 8] = data02; + b[ 9] = data10; + b[10] = data18; + b[11] = data26; + b[12] = data34; + b[13] = data42; + b[14] = data50; + b[15] = data58; + + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + ao5 += 2; + ao6 += 2; + ao7 += 2; + ao8 += 2; + + b += 16; + } + + if (m & 1) { + data01 = *(ao1 + 0); + data09 = *(ao2 + 0); + data17 = *(ao3 + 0); + data25 = *(ao4 + 0); + data33 = *(ao5 + 0); + data41 = *(ao6 + 0); + data49 = *(ao7 + 0); + data57 = *(ao8 + 0); + + b[ 0] = data01; + b[ 1] = data09; + b[ 2] = data17; + b[ 3] = data25; + b[ 4] = data33; + b[ 5] = data41; + b[ 6] = data49; + b[ 7] = data57; + + b += 8; + } + } else + if (X < posY) { + if (m & 4) { + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + + b += 32; + } + + if (m & 2) { + ao1 += 2 * lda; + b += 16; + } + + if (m & 1) { + b += 8; + } + } else { + +#ifndef UNIT + data01 = *(ao1 + 0); +#endif + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + + if (i >= 2) { +#ifndef UNIT + data10 = *(ao2 + 1); +#endif + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + data13 = *(ao2 + 4); + data14 = *(ao2 + 5); + data15 = *(ao2 + 6); + data16 = *(ao2 + 7); + } + + if (i >= 3) { +#ifndef UNIT + data19 = *(ao3 + 2); +#endif + data20 = *(ao3 + 3); + data21 = *(ao3 + 4); + data22 = 
*(ao3 + 5); + data23 = *(ao3 + 6); + data24 = *(ao3 + 7); + } + + if (i >= 4) { +#ifndef UNIT + data28 = *(ao4 + 3); +#endif + data29 = *(ao4 + 4); + data30 = *(ao4 + 5); + data31 = *(ao4 + 6); + data32 = *(ao4 + 7); + } + + if (i >= 5) { +#ifndef UNIT + data37 = *(ao5 + 4); +#endif + data38 = *(ao5 + 5); + data39 = *(ao5 + 6); + data40 = *(ao5 + 7); + } + + if (i >= 6) { +#ifndef UNIT + data46 = *(ao6 + 5); +#endif + data47 = *(ao6 + 6); + data48 = *(ao6 + 7); + } + + if (i >= 7) { +#ifndef UNIT + data55 = *(ao7 + 6); +#endif + data56 = *(ao7 + 7); + } + +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = data01; +#endif + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + + if(i >= 2) { + b[ 0] = data02; +#ifdef UNIT + b[ 1] = ONE; +#else + b[ 1] = data10; +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 3) { + b[ 0] = data03; + b[ 1] = data11; +#ifdef UNIT + b[ 2] = ONE; +#else + b[ 2] = data19; +#endif + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 4) { + b[ 0] = data04; + b[ 1] = data12; + b[ 2] = data20; +#ifdef UNIT + b[ 3] = ONE; +#else + b[ 3] = data28; +#endif + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 5) { + b[ 0] = data05; + b[ 1] = data13; + b[ 2] = data21; + b[ 3] = data29; +#ifdef UNIT + b[ 4] = ONE; +#else + b[ 4] = data37; +#endif + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 6) { + b[ 0] = data06; + b[ 1] = data14; + b[ 2] = data22; + b[ 3] = data30; + b[ 4] = data38; +#ifdef UNIT + b[ 5] = ONE; +#else + b[ 5] = data46; +#endif + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 7) { + b[ 0] = data07; + b[ 1] = data15; + b[ 2] = data23; + b[ 3] = data31; + b[ 4] = data39; + b[ 5] = data47; +#ifdef UNIT + b[ 6] = ONE; +#else + b[ 6] = data55; +#endif + b[ 7] = ZERO; + b += 8; + } + } + } + + posY += 8; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 4){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX + 3) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + ao4 = a + posX + (posY + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + data28 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = data09; + b[ 2] = data17; + b[ 3] = data25; + + b[ 4] = data02; + b[ 5] = data10; + b[ 6] = data18; + b[ 7] = data26; + + b[ 8] = data03; + b[ 9] = data11; + b[10] = data19; + b[11] = data27; + + b[12] = data04; + b[13] = data12; + b[14] = data20; + b[15] = data28; + + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + + b += 16; + + } else + if (X < posY) { + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + b += 16; + + } else { + +#ifdef UNIT + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + + data20 = *(ao3 + 3); + + b[ 0] = ONE; + b[ 
1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + + b[ 4] = data02; + b[ 5] = ONE; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = data03; + b[ 9] = data11; + b[10] = ONE; + b[11] = ZERO; + + b[12] = data04; + b[13] = data12; + b[14] = data20; + b[15] = ONE; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + + data28 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + + b[ 4] = data02; + b[ 5] = data10; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = data03; + b[ 9] = data11; + b[10] = data19; + b[11] = ZERO; + + b[12] = data04; + b[13] = data12; + b[14] = data20; + b[15] = data28; +#endif + + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + + b += 16; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X > posY) { + + if (m & 2) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + + b[ 0] = data01; + b[ 1] = data09; + b[ 2] = data17; + b[ 3] = data25; + + b[ 4] = data02; + b[ 5] = data10; + b[ 6] = data18; + b[ 7] = data26; + + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + + b += 8; + } + + if (m & 1) { + data01 = *(ao1 + 0); + data09 = *(ao2 + 0); + data17 = *(ao3 + 0); + data25 = *(ao4 + 0); + + b[ 0] = data01; + b[ 1] = data09; + b[ 2] = data17; + b[ 3] = data25; + + b += 4; + } + } else + if (X < posY) { + if (m & 2) { + ao1 += 2 * lda; + b += 8; + } + + if (m & 1) { + b += 4; + } + } else { + +#ifndef UNIT + data01 = *(ao1 + 0); +#endif + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + if (i >= 2) { +#ifndef UNIT + data10 = *(ao2 + 1); +#endif + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + + } + + if (i >= 3) { +#ifndef UNIT + data19 = *(ao3 + 2); +#endif + data20 = *(ao3 + 3); + } + +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = data01; +#endif + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b += 4; + + if(i >= 2) { + b[ 0] = data02; +#ifdef UNIT + b[ 1] = ONE; +#else + b[ 1] = data10; +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b += 4; + } + + if (i >= 3) { + b[ 0] = data03; + b[ 1] = data11; +#ifdef UNIT + b[ 2] = ONE; +#else + b[ 2] = data19; +#endif + b[ 3] = ZERO; + b += 4; + } + } + } + + posY += 4; + } + + if (n & 2){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data09; + b[ 2] = data02; + b[ 3] = data10; + + ao1 += 2; + ao2 += 2; + b += 4; + + } else + if (X < posY) { + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + } else { + +#ifdef UNIT + data02 = *(ao1 + 1); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = data02; + b[ 3] = ONE; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + data10 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data02; + b[ 3] = data10; +#endif + ao1 += 2; + ao2 += 2; + + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + if (m & 1) { + + if (X > posY) { + data01 = *(ao1 + 0); + data09 = *(ao2 + 0); + + b[ 0] = data01; + b[ 1] = data09; + b += 2; + } else + if (X < posY) { + b += 2; + } else { +#ifdef UNIT + data09 = *(ao2 + 
0); + b[ 0] = ONE; + b[ 1] = data09; +#else + data01 = *(ao1 + 0); + data09 = *(ao2 + 0); + b[ 0] = data01; + b[ 1] = data09; +#endif + b += 2; + } + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + } + + i = m; + if (m > 0) { + do { + if (X > posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + ao1 += 1; + b += 1; + } else + if (X < posY) { + ao1 += lda; + b += 1; + + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + data01 = *(ao1 + 0); + b[ 0] = data01; +#endif + ao1 ++; + b ++; + } + + X += 1; + i --; + } while (i > 0); + } + } + + return 0; +} diff --git a/kernel/generic/trmm_ltcopy_1.c b/kernel/generic/trmm_ltcopy_1.c new file mode 100644 index 0000000000..ab5e9d8e97 --- /dev/null +++ b/kernel/generic/trmm_ltcopy_1.c @@ -0,0 +1,92 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, X; + + FLOAT data01; + FLOAT *ao1; + + while (n > 0) { + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + } + + i = m; + if (i > 0) { + do { + + if (X > posY) { + ao1 += 1; + b += 1; + } else + if (X < posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + ao1 += lda; + b += 1; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + data01 = *(ao1 + 0); + b[ 0] = data01; +#endif + b += 1; + ao1 += 1; + } + + X ++; + i --; + } while (i > 0); + } + + posY += 1; + n --; + } + + return 0; +} diff --git a/kernel/generic/trmm_ltcopy_16.c b/kernel/generic/trmm_ltcopy_16.c new file mode 100644 index 0000000000..0598de896c --- /dev/null +++ b/kernel/generic/trmm_ltcopy_16.c @@ -0,0 +1,1547 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, ii; + BLASLONG X; + + FLOAT *a01, *a02, *a03 ,*a04, *a05, *a06, *a07, *a08; + FLOAT *a09, *a10, *a11, *a12, *a13, *a14, *a15, *a16; + + js = (n >> 4); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + a01 = a + posY + (posX + 0) * lda; + a02 = a + posY + (posX + 1) * lda; + a03 = a + posY + (posX + 2) * lda; + a04 = a + posY + (posX + 3) * lda; + a05 = a + posY + (posX + 4) * lda; + a06 = a + posY + (posX + 5) * lda; + a07 = a + posY + (posX + 6) * lda; + a08 = a + posY + (posX + 7) * lda; + a09 = a + posY + (posX + 8) * lda; + a10 = a + posY + (posX + 9) * lda; + a11 = a + posY + (posX + 10) * lda; + a12 = a + posY + (posX + 11) * lda; + a13 = a + posY + (posX + 12) * lda; + a14 = a + posY + (posX + 13) * lda; + a15 = a + posY + (posX + 14) * lda; + a16 = a + posY + (posX + 15) * lda; + } else { + a01 = a + posX + (posY + 0) * lda; + a02 = a + posX + (posY + 1) * lda; + a03 = a + posX + (posY + 2) * lda; + a04 = a + posX + (posY + 3) * lda; + a05 = a + posX + (posY + 4) * lda; + a06 = a + posX + (posY + 5) * lda; + a07 = a + posX + (posY + 6) * lda; + a08 = a + posX + (posY + 7) * lda; + a09 = a + posX + (posY + 8) * lda; + a10 = a + posX + (posY + 9) * lda; + a11 = a + posX + (posY + 10) * lda; + a12 = a + posX + (posY + 11) * lda; + a13 = a + posX + (posY + 12) * lda; + a14 = a + posX + (posY + 13) * lda; + a15 = a + posX + (posY + 14) * lda; + a16 = a + posX + (posY + 15) * lda; + } + + i = (m >> 4); + if (i > 0) { + do { + if (X > posY) { + a01 += 16; + a02 += 16; + a03 += 16; + a04 += 16; + a05 += 16; + a06 += 16; + a07 += 16; + a08 += 16; + a09 += 16; + a10 += 16; + a11 += 16; + a12 += 16; + a13 += 16; + a14 += 16; + a15 += 16; + a16 += 16; + b += 256; + } else + if (X < posY) { + + for (ii = 0; ii < 16; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + a01 += lda; + b += 16; + } + + a02 += 16 * lda; + a03 += 16 * lda; + a04 += 16 * lda; + a05 += 16 * lda; + a06 += 16 * lda; + a07 += 16 * lda; + a08 += 16 * lda; + a09 += 16 * lda; + a10 += 16 * lda; + a11 += 16 * lda; + a12 += 16 * lda; + a13 += 16 * lda; + a14 += 16 * lda; + a15 += 16 * lda; + a16 += 16 * lda; + + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + b[ 16] = ZERO; +#ifdef UNIT + b[ 17] = ONE; +#else + b[ 17] = *(a02 + 1); +#endif + b[ 18] = *(a02 + 2); + b[ 19] = *(a02 + 3); + b[ 20] = *(a02 + 4); + b[ 21] = *(a02 + 5); + b[ 22] = *(a02 + 6); + b[ 23] = *(a02 + 7); + b[ 24] = *(a02 + 8); + b[ 25] = *(a02 + 9); + b[ 26] = *(a02 + 10); + b[ 27] = *(a02 + 11); + b[ 28] = *(a02 + 12); + b[ 29] = *(a02 + 13); + b[ 30] = *(a02 + 14); + b[ 31] = *(a02 + 15); + + b[ 32] = ZERO; + b[ 33] = ZERO; +#ifdef 
UNIT + b[ 34] = ONE; +#else + b[ 34] = *(a03 + 2); +#endif + b[ 35] = *(a03 + 3); + b[ 36] = *(a03 + 4); + b[ 37] = *(a03 + 5); + b[ 38] = *(a03 + 6); + b[ 39] = *(a03 + 7); + b[ 40] = *(a03 + 8); + b[ 41] = *(a03 + 9); + b[ 42] = *(a03 + 10); + b[ 43] = *(a03 + 11); + b[ 44] = *(a03 + 12); + b[ 45] = *(a03 + 13); + b[ 46] = *(a03 + 14); + b[ 47] = *(a03 + 15); + + b[ 48] = ZERO; + b[ 49] = ZERO; + b[ 50] = ZERO; +#ifdef UNIT + b[ 51] = ONE; +#else + b[ 51] = *(a04 + 3); +#endif + b[ 52] = *(a04 + 4); + b[ 53] = *(a04 + 5); + b[ 54] = *(a04 + 6); + b[ 55] = *(a04 + 7); + b[ 56] = *(a04 + 8); + b[ 57] = *(a04 + 9); + b[ 58] = *(a04 + 10); + b[ 59] = *(a04 + 11); + b[ 60] = *(a04 + 12); + b[ 61] = *(a04 + 13); + b[ 62] = *(a04 + 14); + b[ 63] = *(a04 + 15); + + b[ 64] = ZERO; + b[ 65] = ZERO; + b[ 66] = ZERO; + b[ 67] = ZERO; +#ifdef UNIT + b[ 68] = ONE; +#else + b[ 68] = *(a05 + 4); +#endif + b[ 69] = *(a05 + 5); + b[ 70] = *(a05 + 6); + b[ 71] = *(a05 + 7); + b[ 72] = *(a05 + 8); + b[ 73] = *(a05 + 9); + b[ 74] = *(a05 + 10); + b[ 75] = *(a05 + 11); + b[ 76] = *(a05 + 12); + b[ 77] = *(a05 + 13); + b[ 78] = *(a05 + 14); + b[ 79] = *(a05 + 15); + + b[ 80] = ZERO; + b[ 81] = ZERO; + b[ 82] = ZERO; + b[ 83] = ZERO; + b[ 84] = ZERO; +#ifdef UNIT + b[ 85] = ONE; +#else + b[ 85] = *(a06 + 5); +#endif + b[ 86] = *(a06 + 6); + b[ 87] = *(a06 + 7); + b[ 88] = *(a06 + 8); + b[ 89] = *(a06 + 9); + b[ 90] = *(a06 + 10); + b[ 91] = *(a06 + 11); + b[ 92] = *(a06 + 12); + b[ 93] = *(a06 + 13); + b[ 94] = *(a06 + 14); + b[ 95] = *(a06 + 15); + + b[ 96] = ZERO; + b[ 97] = ZERO; + b[ 98] = ZERO; + b[ 99] = ZERO; + b[100] = ZERO; + b[101] = ZERO; +#ifdef UNIT + b[102] = ONE; +#else + b[102] = *(a07 + 6); +#endif + b[103] = *(a07 + 7); + b[104] = *(a07 + 8); + b[105] = *(a07 + 9); + b[106] = *(a07 + 10); + b[107] = *(a07 + 11); + b[108] = *(a07 + 12); + b[109] = *(a07 + 13); + b[110] = *(a07 + 14); + b[111] = *(a07 + 15); + + b[112] = ZERO; + b[113] = ZERO; + b[114] = ZERO; + b[115] = ZERO; + b[116] = ZERO; + b[117] = ZERO; + b[118] = ZERO; +#ifdef UNIT + b[119] = ONE; +#else + b[119] = *(a08 + 7); +#endif + b[120] = *(a08 + 8); + b[121] = *(a08 + 9); + b[122] = *(a08 + 10); + b[123] = *(a08 + 11); + b[124] = *(a08 + 12); + b[125] = *(a08 + 13); + b[126] = *(a08 + 14); + b[127] = *(a08 + 15); + + b[128] = ZERO; + b[129] = ZERO; + b[130] = ZERO; + b[131] = ZERO; + b[132] = ZERO; + b[133] = ZERO; + b[134] = ZERO; + b[135] = ZERO; +#ifdef UNIT + b[136] = ONE; +#else + b[136] = *(a09 + 8); +#endif + b[137] = *(a09 + 9); + b[138] = *(a09 + 10); + b[139] = *(a09 + 11); + b[140] = *(a09 + 12); + b[141] = *(a09 + 13); + b[142] = *(a09 + 14); + b[143] = *(a09 + 15); + + b[144] = ZERO; + b[145] = ZERO; + b[146] = ZERO; + b[147] = ZERO; + b[148] = ZERO; + b[149] = ZERO; + b[150] = ZERO; + b[151] = ZERO; + b[152] = ZERO; +#ifdef UNIT + b[153] = ONE; +#else + b[153] = *(a10 + 9); +#endif + b[154] = *(a10 + 10); + b[155] = *(a10 + 11); + b[156] = *(a10 + 12); + b[157] = *(a10 + 13); + b[158] = *(a10 + 14); + b[159] = *(a10 + 15); + + b[160] = ZERO; + b[161] = ZERO; + b[162] = ZERO; + b[163] = ZERO; + b[164] = ZERO; + b[165] = ZERO; + b[166] = ZERO; + b[167] = ZERO; + b[168] = ZERO; + b[169] = ZERO; +#ifdef UNIT + b[170] = ONE; +#else + b[170] = *(a11 + 10); +#endif + b[171] = *(a11 + 11); + b[172] = *(a11 + 12); + b[173] = *(a11 + 13); + b[174] = *(a11 + 14); + b[175] = *(a11 + 15); + + b[176] = ZERO; + b[177] = ZERO; + b[178] = ZERO; + b[179] = ZERO; + b[180] = ZERO; + b[181] = ZERO; + b[182] = ZERO; + b[183] = ZERO; + 
b[184] = ZERO; + b[185] = ZERO; + b[186] = ZERO; +#ifdef UNIT + b[187] = ONE; +#else + b[187] = *(a12 + 11); +#endif + b[188] = *(a12 + 12); + b[189] = *(a12 + 13); + b[190] = *(a12 + 14); + b[191] = *(a12 + 15); + + b[192] = ZERO; + b[193] = ZERO; + b[194] = ZERO; + b[195] = ZERO; + b[196] = ZERO; + b[197] = ZERO; + b[198] = ZERO; + b[199] = ZERO; + b[200] = ZERO; + b[201] = ZERO; + b[202] = ZERO; + b[203] = ZERO; +#ifdef UNIT + b[204] = ONE; +#else + b[204] = *(a13 + 12); +#endif + b[205] = *(a13 + 13); + b[206] = *(a13 + 14); + b[207] = *(a13 + 15); + + b[208] = ZERO; + b[209] = ZERO; + b[210] = ZERO; + b[211] = ZERO; + b[212] = ZERO; + b[213] = ZERO; + b[214] = ZERO; + b[215] = ZERO; + b[216] = ZERO; + b[217] = ZERO; + b[218] = ZERO; + b[219] = ZERO; + b[220] = ZERO; +#ifdef UNIT + b[221] = ONE; +#else + b[221] = *(a14 + 13); +#endif + b[222] = *(a14 + 14); + b[223] = *(a14 + 15); + + b[224] = ZERO; + b[225] = ZERO; + b[226] = ZERO; + b[227] = ZERO; + b[228] = ZERO; + b[229] = ZERO; + b[230] = ZERO; + b[231] = ZERO; + b[232] = ZERO; + b[233] = ZERO; + b[234] = ZERO; + b[235] = ZERO; + b[236] = ZERO; + b[237] = ZERO; +#ifdef UNIT + b[238] = ONE; +#else + b[238] = *(a15 + 14); +#endif + b[239] = *(a15 + 15); + + b[240] = ZERO; + b[241] = ZERO; + b[242] = ZERO; + b[243] = ZERO; + b[244] = ZERO; + b[245] = ZERO; + b[246] = ZERO; + b[247] = ZERO; + b[248] = ZERO; + b[249] = ZERO; + b[250] = ZERO; + b[251] = ZERO; + b[252] = ZERO; + b[253] = ZERO; + b[254] = ZERO; +#ifdef UNIT + b[255] = ONE; +#else + b[255] = *(a16 + 15); +#endif + + a01 += 16; + a02 += 16; + a03 += 16; + a04 += 16; + a05 += 16; + a06 += 16; + a07 += 16; + a08 += 16; + a09 += 16; + a10 += 16; + a11 += 16; + a12 += 16; + a13 += 16; + a14 += 16; + a15 += 16; + a16 += 16; + + b += 256; + } + + X += 16; + i --; + } while (i > 0); + } + + i = (m & 15); + if (i > 0) { + if (X > posY) { + a01 += i; + a02 += i; + a03 += i; + a04 += i; + a05 += i; + a06 += i; + a07 += i; + a08 += i; + a09 += i; + a10 += i; + a11 += i; + a12 += i; + a13 += i; + a14 += i; + a15 += i; + a16 += i; + b += 16 * i; + } else + if (X < posY) { + + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + a01 += lda; + a02 += lda; + a03 += lda; + a04 += lda; + a05 += lda; + a06 += lda; + a07 += lda; + a08 += lda; + a09 += lda; + a10 += lda; + a11 += lda; + a12 += lda; + a13 += lda; + a14 += lda; + a15 += lda; + a16 += lda; + b += 16; + } + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + b += 16; + + if (i >= 2) { + b[ 0] = ZERO; +#ifdef UNIT + b[ 1] = ONE; +#else + b[ 1] = *(a02 + 1); +#endif + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); + b[ 4] = *(a02 + 4); + b[ 5] = *(a02 + 5); + b[ 6] = *(a02 + 6); + b[ 7] = *(a02 + 7); + b[ 8] = *(a02 + 8); + b[ 9] = *(a02 + 9); + b[10] = *(a02 + 10); + b[11] = *(a02 + 11); + b[12] = *(a02 + 12); + b[13] = *(a02 + 13); + 
b[14] = *(a02 + 14); + b[15] = *(a02 + 15); + b += 16; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; +#else + b[ 2] = *(a03 + 2); +#endif + b[ 3] = *(a03 + 3); + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); + b[ 6] = *(a03 + 6); + b[ 7] = *(a03 + 7); + b[ 8] = *(a03 + 8); + b[ 9] = *(a03 + 9); + b[10] = *(a03 + 10); + b[11] = *(a03 + 11); + b[12] = *(a03 + 12); + b[13] = *(a03 + 13); + b[14] = *(a03 + 14); + b[15] = *(a03 + 15); + b += 16; + } + + if (i >= 4) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; +#ifdef UNIT + b[ 3] = ONE; +#else + b[ 3] = *(a04 + 3); +#endif + b[ 4] = *(a04 + 4); + b[ 5] = *(a04 + 5); + b[ 6] = *(a04 + 6); + b[ 7] = *(a04 + 7); + b[ 8] = *(a04 + 8); + b[ 9] = *(a04 + 9); + b[10] = *(a04 + 10); + b[11] = *(a04 + 11); + b[12] = *(a04 + 12); + b[13] = *(a04 + 13); + b[14] = *(a04 + 14); + b[15] = *(a04 + 15); + b += 16; + } + + if (i >= 5) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; +#ifdef UNIT + b[ 4] = ONE; +#else + b[ 4] = *(a05 + 4); +#endif + b[ 5] = *(a05 + 5); + b[ 6] = *(a05 + 6); + b[ 7] = *(a05 + 7); + b[ 8] = *(a05 + 8); + b[ 9] = *(a05 + 9); + b[10] = *(a05 + 10); + b[11] = *(a05 + 11); + b[12] = *(a05 + 12); + b[13] = *(a05 + 13); + b[14] = *(a05 + 14); + b[15] = *(a05 + 15); + b += 16; + } + + if (i >= 6) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; +#ifdef UNIT + b[ 5] = ONE; +#else + b[ 5] = *(a06 + 5); +#endif + b[ 6] = *(a06 + 6); + b[ 7] = *(a06 + 7); + b[ 8] = *(a06 + 8); + b[ 9] = *(a06 + 9); + b[10] = *(a06 + 10); + b[11] = *(a06 + 11); + b[12] = *(a06 + 12); + b[13] = *(a06 + 13); + b[14] = *(a06 + 14); + b[15] = *(a06 + 15); + b += 16; + } + + if (i >= 7) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; +#ifdef UNIT + b[ 6] = ONE; +#else + b[ 6] = *(a07 + 6); +#endif + b[ 7] = *(a07 + 7); + b[ 8] = *(a07 + 8); + b[ 9] = *(a07 + 9); + b[10] = *(a07 + 10); + b[11] = *(a07 + 11); + b[12] = *(a07 + 12); + b[13] = *(a07 + 13); + b[14] = *(a07 + 14); + b[15] = *(a07 + 15); + b += 16; + } + + if (i >= 8) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; +#ifdef UNIT + b[ 7] = ONE; +#else + b[ 7] = *(a08 + 7); +#endif + b[ 8] = *(a08 + 8); + b[ 9] = *(a08 + 9); + b[10] = *(a08 + 10); + b[11] = *(a08 + 11); + b[12] = *(a08 + 12); + b[13] = *(a08 + 13); + b[14] = *(a08 + 14); + b[15] = *(a08 + 15); + b += 16; + } + + if (i >= 9) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; +#ifdef UNIT + b[ 8] = ONE; +#else + b[ 8] = *(a09 + 8); +#endif + b[ 9] = *(a09 + 9); + b[10] = *(a09 + 10); + b[11] = *(a09 + 11); + b[12] = *(a09 + 12); + b[13] = *(a09 + 13); + b[14] = *(a09 + 14); + b[15] = *(a09 + 15); + b += 16; + } + + if (i >= 10) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; +#ifdef UNIT + b[ 9] = ONE; +#else + b[ 9] = *(a10 + 9); +#endif + b[10] = *(a10 + 10); + b[11] = *(a10 + 11); + b[12] = *(a10 + 12); + b[13] = *(a10 + 13); + b[14] = *(a10 + 14); + b[15] = *(a10 + 15); + b += 16; + } + + if (i >= 11) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; +#ifdef UNIT + b[10] = ONE; +#else + b[10] = *(a11 + 10); +#endif + b[11] = *(a11 + 11); + b[12] = *(a11 + 
12); + b[13] = *(a11 + 13); + b[14] = *(a11 + 14); + b[15] = *(a11 + 15); + b += 16; + } + + if (i >= 12) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; +#ifdef UNIT + b[11] = ONE; +#else + b[11] = *(a12 + 11); +#endif + b[12] = *(a12 + 12); + b[13] = *(a12 + 13); + b[14] = *(a12 + 14); + b[15] = *(a12 + 15); + b += 16; + } + + if (i >= 13) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; +#ifdef UNIT + b[12] = ONE; +#else + b[12] = *(a13 + 12); +#endif + b[13] = *(a13 + 13); + b[14] = *(a13 + 14); + b[15] = *(a13 + 15); + b += 16; + } + + if (i >= 14) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; +#ifdef UNIT + b[13] = ONE; +#else + b[13] = *(a14 + 13); +#endif + b[14] = *(a14 + 14); + b[15] = *(a14 + 15); + b += 16; + } + + if (i >= 15) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; +#ifdef UNIT + b[14] = ONE; +#else + b[14] = *(a15 + 14); +#endif + b[15] = *(a15 + 15); + b += 16; + } + } + } + + posY += 16; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 8){ + X = posX; + + if (posX <= posY) { + a01 = a + posY + (posX + 0) * lda; + a02 = a + posY + (posX + 1) * lda; + a03 = a + posY + (posX + 2) * lda; + a04 = a + posY + (posX + 3) * lda; + a05 = a + posY + (posX + 4) * lda; + a06 = a + posY + (posX + 5) * lda; + a07 = a + posY + (posX + 6) * lda; + a08 = a + posY + (posX + 7) * lda; + } else { + a01 = a + posX + (posY + 0) * lda; + a02 = a + posX + (posY + 1) * lda; + a03 = a + posX + (posY + 2) * lda; + a04 = a + posX + (posY + 3) * lda; + a05 = a + posX + (posY + 4) * lda; + a06 = a + posX + (posY + 5) * lda; + a07 = a + posX + (posY + 6) * lda; + a08 = a + posX + (posY + 7) * lda; + } + + i = (m >> 3); + if (i > 0) { + do { + if (X > posY) { + a01 += 8; + a02 += 8; + a03 += 8; + a04 += 8; + a05 += 8; + a06 += 8; + a07 += 8; + a08 += 8; + b += 64; + } else + if (X < posY) { + + for (ii = 0; ii < 8; ii++){ + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + a01 += lda; + b += 8; + } + + a02 += 8 * lda; + a03 += 8 * lda; + a04 += 8 * lda; + a05 += 8 * lda; + a06 += 8 * lda; + a07 += 8 * lda; + a08 += 8 * lda; + } else { + +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + b[ 8] = ZERO; +#ifdef UNIT + b[ 9] = ONE; +#else + b[ 9] = *(a02 + 1); +#endif + b[ 10] = *(a02 + 2); + b[ 11] = *(a02 + 3); + b[ 12] = *(a02 + 4); + b[ 13] = *(a02 + 5); + b[ 14] = *(a02 + 6); + b[ 15] = *(a02 + 7); + + b[ 16] = ZERO; + b[ 17] = ZERO; +#ifdef UNIT + b[ 18] = ONE; +#else + b[ 18] = *(a03 + 2); +#endif + b[ 19] = *(a03 + 3); + b[ 20] = *(a03 + 4); + b[ 21] = *(a03 + 5); + b[ 22] = *(a03 + 6); + b[ 23] = *(a03 + 7); + + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; +#ifdef UNIT + 
b[ 27] = ONE; +#else + b[ 27] = *(a04 + 3); +#endif + b[ 28] = *(a04 + 4); + b[ 29] = *(a04 + 5); + b[ 30] = *(a04 + 6); + b[ 31] = *(a04 + 7); + + b[ 32] = ZERO; + b[ 33] = ZERO; + b[ 34] = ZERO; + b[ 35] = ZERO; +#ifdef UNIT + b[ 36] = ONE; +#else + b[ 36] = *(a05 + 4); +#endif + b[ 37] = *(a05 + 5); + b[ 38] = *(a05 + 6); + b[ 39] = *(a05 + 7); + + b[ 40] = ZERO; + b[ 41] = ZERO; + b[ 42] = ZERO; + b[ 43] = ZERO; + b[ 44] = ZERO; +#ifdef UNIT + b[ 45] = ONE; +#else + b[ 45] = *(a06 + 5); +#endif + b[ 46] = *(a06 + 6); + b[ 47] = *(a06 + 7); + + b[ 48] = ZERO; + b[ 49] = ZERO; + b[ 50] = ZERO; + b[ 51] = ZERO; + b[ 52] = ZERO; + b[ 53] = ZERO; +#ifdef UNIT + b[ 54] = ONE; +#else + b[ 54] = *(a07 + 6); +#endif + b[ 55] = *(a07 + 7); + + b[ 56] = ZERO; + b[ 57] = ZERO; + b[ 58] = ZERO; + b[ 59] = ZERO; + b[ 60] = ZERO; + b[ 61] = ZERO; + b[ 62] = ZERO; +#ifdef UNIT + b[ 63] = ONE; +#else + b[ 63] = *(a08 + 7); +#endif + + a01 += 8; + a02 += 8; + a03 += 8; + a04 += 8; + a05 += 8; + a06 += 8; + a07 += 8; + a08 += 8; + b += 64; + + } + + X += 8; + i --; + } while (i > 0); + } + + i = (m & 7); + if (i > 0) { + if (X > posY) { + a01 += i; + a02 += i; + a03 += i; + a04 += i; + a05 += i; + a06 += i; + a07 += i; + a08 += i; + b += 8 * i; + } else + if (X < posY) { + + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + a01 += lda; + a02 += lda; + a03 += lda; + a04 += lda; + a05 += lda; + a06 += lda; + a07 += lda; + a08 += lda; + b += 8; + } + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + b += 8; + + if (i >= 2) { + b[ 0] = ZERO; +#ifdef UNIT + b[ 1] = ONE; +#else + b[ 1] = *(a02 + 1); +#endif + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); + b[ 4] = *(a02 + 4); + b[ 5] = *(a02 + 5); + b[ 6] = *(a02 + 6); + b[ 7] = *(a02 + 7); + b += 8; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; +#else + b[ 2] = *(a03 + 2); +#endif + b[ 3] = *(a03 + 3); + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); + b[ 6] = *(a03 + 6); + b[ 7] = *(a03 + 7); + b += 8; + } + + if (i >= 4) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; +#ifdef UNIT + b[ 3] = ONE; +#else + b[ 3] = *(a04 + 3); +#endif + b[ 4] = *(a04 + 4); + b[ 5] = *(a04 + 5); + b[ 6] = *(a04 + 6); + b[ 7] = *(a04 + 7); + b += 8; + } + + if (i >= 5) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; +#ifdef UNIT + b[ 4] = ONE; +#else + b[ 4] = *(a05 + 4); +#endif + b[ 5] = *(a05 + 5); + b[ 6] = *(a05 + 6); + b[ 7] = *(a05 + 7); + b += 8; + } + + if (i >= 6) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; +#ifdef UNIT + b[ 5] = ONE; +#else + b[ 5] = *(a06 + 5); +#endif + b[ 6] = *(a06 + 6); + b[ 7] = *(a06 + 7); + b += 8; + } + + if (i >= 7) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; +#ifdef UNIT + b[ 6] = ONE; +#else + b[ 6] = *(a07 + 6); +#endif + b[ 7] = *(a07 + 7); + b += 8; + } + } + } + posY += 8; + } + + if (n & 4){ + X = posX; + + if (posX <= posY) { + a01 = a + posY + (posX + 0) * lda; + a02 = a + posY + (posX + 1) * lda; + a03 = a + posY + (posX + 2) * lda; + a04 = a + posY + (posX + 3) * lda; + } else { + a01 = a + posX + (posY + 0) * lda; + a02 = a + posX + (posY + 1) * lda; + a03 = a 
+ posX + (posY + 2) * lda; + a04 = a + posX + (posY + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X > posY) { + a01 += 4; + a02 += 4; + a03 += 4; + a04 += 4; + b += 16; + } else + if (X < posY) { + + for (ii = 0; ii < 4; ii++){ + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + a01 += lda; + b += 4; + } + + a02 += 4 * lda; + a03 += 4 * lda; + a04 += 4 * lda; + } else { + +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + + b[ 4] = ZERO; +#ifdef UNIT + b[ 5] = ONE; +#else + b[ 5] = *(a02 + 1); +#endif + b[ 6] = *(a02 + 2); + b[ 7] = *(a02 + 3); + + b[ 8] = ZERO; + b[ 9] = ZERO; +#ifdef UNIT + b[ 10] = ONE; +#else + b[ 10] = *(a03 + 2); +#endif + b[ 11] = *(a03 + 3); + + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; +#ifdef UNIT + b[ 15] = ONE; +#else + b[ 15] = *(a04 + 3); +#endif + + a01 += 4; + a02 += 4; + a03 += 4; + a04 += 4; + b += 16; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i > 0) { + if (X > posY) { + a01 += i; + a02 += i; + a03 += i; + a04 += i; + b += 4 * i; + } else + if (X < posY) { + + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + + a01 += lda; + a02 += lda; + a03 += lda; + a04 += lda; + b += 4; + } + } else { + +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b += 4; + + if (i >= 2) { + b[ 0] = ZERO; +#ifdef UNIT + b[ 1] = ONE; +#else + b[ 1] = *(a02 + 1); +#endif + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); + b += 4; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; +#else + b[ 2] = *(a03 + 2); +#endif + b[ 3] = *(a03 + 3); + b += 4; + } + } + } + posY += 4; + } + + if (n & 2){ + X = posX; + + if (posX <= posY) { + a01 = a + posY + (posX + 0) * lda; + a02 = a + posY + (posX + 1) * lda; + } else { + a01 = a + posX + (posY + 0) * lda; + a02 = a + posX + (posY + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X > posY) { + a01 += 2; + a02 += 2; + b += 4; + } else + if (X < posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + a01 += 2 * lda; + a02 += 2 * lda; + b += 4; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = *(a01 + 1); + + b[ 2] = ZERO; +#ifdef UNIT + b[ 3] = ONE; +#else + b[ 3] = *(a02 + 1); +#endif + + a01 += 2; + a02 += 2; + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + if (m & 1) { + if (X > posY) { + a01 ++; + a02 ++; + b += 2; + } else + if (X < posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + a01 += lda; + a02 += lda; + b += 2; + } + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = *(a01 + 1); + b += 2; + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + a01 = a + posY + (posX + 0) * lda; + } else { + a01 = a + posX + (posY + 0) * lda; + } + + i = m; + if (i > 0) { + do { + + if (X > posY) { + b ++; + a01 ++; + } else + if (X < posY) { + b[ 0] = *(a01 + 0); + a01 += lda; + b ++; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + a01 ++; + b ++; + } + X += 1; + i --; + } while (i > 0); + } + posY += 1; + } + + return 0; +} diff --git a/kernel/generic/trmm_ltcopy_2.c b/kernel/generic/trmm_ltcopy_2.c new file mode 100644 index 0000000000..098e16f968 --- /dev/null +++ b/kernel/generic/trmm_ltcopy_2.c @@ -0,0 +1,197 @@ 
+/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02, data03, data04; + FLOAT *ao1, *ao2; + + js = (n >> 1); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X > posY) { + ao1 += 2; + ao2 += 2; + b += 4; + } else + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + + } else { +#ifdef UNIT + data02 = *(ao1 + 1); + + b[ 0] = ONE; + b[ 1] = data02; + b[ 2] = ZERO; + b[ 3] = ONE; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data04 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = ZERO; + b[ 3] = data04; +#endif + + ao1 += 2; + ao2 += 2; + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + if (m & 1) { + + if (X > posY) { + ao1 += 1; + ao2 += 1; + b += 2; + } else + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; + ao1 += lda; + b += 2; + } else { +#ifdef UNIT + data02 = *(ao1 + 1); + + b[ 0] = ONE; + b[ 1] = data02; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; +#endif + ao1 += 2; + b += 2; + } + } + + posY += 2; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + } + + i = m; + if (i > 0) { + do { + + if (X > posY) { + ao1 += 1; + b += 1; + } else + if (X < posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + ao1 += lda; + b += 1; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + data01 = *(ao1 + 0); + b[ 0] = data01; +#endif + b += 1; + ao1 += 1; + } + + X ++; + i --; + } while (i > 0); + } + + posY += 1; + } + + return 0; +} diff --git a/kernel/generic/trmm_ltcopy_4.c b/kernel/generic/trmm_ltcopy_4.c new file mode 100644 index 0000000000..69a233be6e --- /dev/null +++ b/kernel/generic/trmm_ltcopy_4.c @@ -0,0 +1,488 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT *ao1, *ao2, *ao3, *ao4; + + js = (n >> 2); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX + 3) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + ao4 = a + posX + (posY + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X > posY) { + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + b += 16; + + } else + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + data11 = *(ao3 + 2); + data12 = *(ao3 + 3); + + data13 = *(ao4 + 0); + data14 = *(ao4 + 1); + data15 = *(ao4 + 2); + data16 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + b += 16; + + } else { + +#ifdef UNIT + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + data12 = *(ao3 + 3); + + b[ 0] = ONE; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b[ 4] = ZERO; + b[ 5] = ONE; + b[ 6] = data07; + b[ 7] = data08; + + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ONE; + b[11] = data12; + + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ONE; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + data11 = *(ao3 + 2); + data12 = *(ao3 + 3); + + data16 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = ZERO; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = data11; + b[11] = data12; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = data16; +#endif + ao1 += 4; + ao2 
+= 4; + ao3 += 4; + ao4 += 4; + b += 16; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X > posY) { + + if (m & 2) { + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + b += 8; + } + + if (m & 1) { + ao1 += 1; + ao2 += 1; + ao3 += 1; + ao4 += 1; + b += 4; + } + + } else + if (X < posY) { + if (m & 2) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + ao1 += 2 * lda; + ao2 += 2 * lda; + + b += 8; + } + + if (m & 1) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + ao1 += lda; + b += 4; + } + + } else { + +#ifdef UNIT + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + if (i >= 2) { + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + } + + if (i >= 3) { + data12 = *(ao3 + 3); + } + + b[ 0] = ONE; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b += 4; + + if(i >= 2) { + b[ 0] = ZERO; + b[ 1] = ONE; + b[ 2] = data07; + b[ 3] = data08; + b += 4; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ONE; + b[ 3] = data12; + b += 4; + } +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + if (i >= 2) { + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + } + + if (i >= 3) { + data11 = *(ao3 + 2); + data12 = *(ao3 + 3); + } + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b += 4; + + if(i >= 2) { + b[ 0] = ZERO; + b[ 1] = data06; + b[ 2] = data07; + b[ 3] = data08; + b += 4; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = data11; + b[ 3] = data12; + b += 4; + } +#endif + } + } + + posY += 4; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 2){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X > posY) { + ao1 += 2; + ao2 += 2; + b += 4; + + } else + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data05; + b[ 3] = data06; + + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + } else { +#ifdef UNIT + data02 = *(ao1 + 1); + + b[ 0] = ONE; + b[ 1] = data02; + b[ 2] = ZERO; + b[ 3] = ONE; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = ZERO; + b[ 3] = data06; +#endif + ao1 += 2; + ao2 += 2; + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i) { + + if (X > posY) { + ao1 += 1; + ao2 += 1; + + b += 2; + } else + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; + ao1 += lda; + b += 2; + } else { +#ifdef UNIT + data02 = *(ao1 + 1); + + b[ 0] = ONE; + b[ 1] = data02; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; +#endif + b += 2; + } + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + } + + i = m; + if (i > 0) { 
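+	/* (descriptive note) Trailing single column (n & 1): one element per row; rows with X > posY are skipped, rows with X < posY are copied through, and the diagonal row stores ONE when UNIT is defined, otherwise the stored diagonal value. */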
+ do { + if (X > posY) { + b += 1; + ao1 += 1; + } else + if (X < posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + ao1 += lda; + b += 1; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + data01 = *(ao1 + 0); + b[ 0] = data01; +#endif + ao1 += 1; + b += 1; + } + + X ++; + i --; + } while (i > 0); + } + + posY += 1; + } + + return 0; +} diff --git a/kernel/generic/trmm_ltcopy_8.c b/kernel/generic/trmm_ltcopy_8.c new file mode 100644 index 0000000000..64954da406 --- /dev/null +++ b/kernel/generic/trmm_ltcopy_8.c @@ -0,0 +1,1219 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT data17, data18, data19, data20, data21, data22, data23, data24; + FLOAT data25, data26, data27, data28, data29, data30, data31, data32; + FLOAT data33, data34, data35, data36, data37, data38, data39, data40; + FLOAT data41, data42, data43, data44, data45, data46, data47, data48; + FLOAT data49, data50, data51, data52, data53, data54, data55, data56; + FLOAT data57, data58, data59, data60, data61, data62, data63, data64; + + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; + + js = (n >> 3); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX + 3) * lda; + ao5 = a + posY + (posX + 4) * lda; + ao6 = a + posY + (posX + 5) * lda; + ao7 = a + posY + (posX + 6) * lda; + ao8 = a + posY + (posX + 7) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + ao4 = a + posX + (posY + 3) * lda; + ao5 = a + posX + (posY + 4) * lda; + ao6 = a + posX + (posY + 5) * lda; + ao7 = a + posX + (posY + 6) * lda; + ao8 = a + posX + (posY + 7) * lda; + } + + i = (m >> 3); + if (i > 0) { + do { + if (X > posY) { + ao1 += 8; + ao2 += 8; + ao3 += 8; + ao4 += 8; + ao5 += 8; + ao6 += 8; + ao7 += 8; + ao8 += 8; + + b += 64; + + } else + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + data13 = *(ao2 + 4); + data14 = *(ao2 + 5); + data15 = *(ao2 + 6); + data16 = *(ao2 + 7); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + data21 = *(ao3 + 4); + data22 = *(ao3 + 5); + data23 = *(ao3 + 6); + data24 = *(ao3 + 7); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + data28 = *(ao4 + 3); + data29 = *(ao4 + 4); + data30 = *(ao4 + 5); + data31 = *(ao4 + 6); + data32 = *(ao4 + 7); + + data33 = *(ao5 + 0); + data34 = *(ao5 + 1); + data35 = *(ao5 + 2); + data36 = *(ao5 + 3); + data37 = *(ao5 + 4); + data38 = *(ao5 + 5); + data39 = *(ao5 + 6); + data40 = *(ao5 + 7); + + data41 = *(ao6 + 0); + data42 = *(ao6 + 1); + data43 = *(ao6 + 2); + data44 = *(ao6 + 3); + data45 = *(ao6 + 4); + data46 = *(ao6 + 5); + data47 = *(ao6 + 6); + data48 = *(ao6 + 7); + + data49 = *(ao7 + 0); + data50 = *(ao7 + 1); + data51 = *(ao7 + 2); + data52 = *(ao7 + 3); + data53 = *(ao7 + 4); + data54 = *(ao7 + 5); + data55 = *(ao7 + 6); + data56 = *(ao7 + 7); + + data57 = *(ao8 + 0); + data58 = *(ao8 + 1); + data59 = *(ao8 + 2); + data60 = *(ao8 + 3); + data61 = *(ao8 + 4); + data62 = *(ao8 + 5); + data63 = *(ao8 + 6); + data64 = *(ao8 + 7); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + + b[16] = 
data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + + b[32] = data33; + b[33] = data34; + b[34] = data35; + b[35] = data36; + b[36] = data37; + b[37] = data38; + b[38] = data39; + b[39] = data40; + + b[40] = data41; + b[41] = data42; + b[42] = data43; + b[43] = data44; + b[44] = data45; + b[45] = data46; + b[46] = data47; + b[47] = data48; + + b[48] = data49; + b[49] = data50; + b[50] = data51; + b[51] = data52; + b[52] = data53; + b[53] = data54; + b[54] = data55; + b[55] = data56; + + b[56] = data57; + b[57] = data58; + b[58] = data59; + b[59] = data60; + b[60] = data61; + b[61] = data62; + b[62] = data63; + b[63] = data64; + + ao1 += 8 * lda; + ao2 += 8 * lda; + ao3 += 8 * lda; + ao4 += 8 * lda; + ao5 += 8 * lda; + ao6 += 8 * lda; + ao7 += 8 * lda; + ao8 += 8 * lda; + + b += 64; + + } else { + +#ifndef UNIT + data01 = *(ao1 + 0); +#endif + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + +#ifndef UNIT + data10 = *(ao2 + 1); +#endif + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + data13 = *(ao2 + 4); + data14 = *(ao2 + 5); + data15 = *(ao2 + 6); + data16 = *(ao2 + 7); + +#ifndef UNIT + data19 = *(ao3 + 2); +#endif + data20 = *(ao3 + 3); + data21 = *(ao3 + 4); + data22 = *(ao3 + 5); + data23 = *(ao3 + 6); + data24 = *(ao3 + 7); + +#ifndef UNIT + data28 = *(ao4 + 3); +#endif + data29 = *(ao4 + 4); + data30 = *(ao4 + 5); + data31 = *(ao4 + 6); + data32 = *(ao4 + 7); + +#ifndef UNIT + data37 = *(ao5 + 4); +#endif + data38 = *(ao5 + 5); + data39 = *(ao5 + 6); + data40 = *(ao5 + 7); + +#ifndef UNIT + data46 = *(ao6 + 5); +#endif + data47 = *(ao6 + 6); + data48 = *(ao6 + 7); + +#ifndef UNIT + data55 = *(ao7 + 6); +#endif + data56 = *(ao7 + 7); + +#ifndef UNIT + data64 = *(ao8 + 7); +#endif + + +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = data01; +#endif + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b[ 8] = ZERO; +#ifdef UNIT + b[ 9] = ONE; +#else + b[ 9] = data10; +#endif + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + + b[16] = ZERO; + b[17] = ZERO; +#ifdef UNIT + b[18] = ONE; +#else + b[18] = data19; +#endif + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + + b[24] = ZERO; + b[25] = ZERO; + b[26] = ZERO; +#ifdef UNIT + b[27] = ONE; +#else + b[27] = data28; +#endif + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + + b[32] = ZERO; + b[33] = ZERO; + b[34] = ZERO; + b[35] = ZERO; +#ifdef UNIT + b[36] = ONE; +#else + b[36] = data37; +#endif + b[37] = data38; + b[38] = data39; + b[39] = data40; + + b[40] = ZERO; + b[41] = ZERO; + b[42] = ZERO; + b[43] = ZERO; + b[44] = ZERO; +#ifdef UNIT + b[45] = ONE; +#else + b[45] = data46; +#endif + b[46] = data47; + b[47] = data48; + + b[48] = ZERO; + b[49] = ZERO; + b[50] = ZERO; + b[51] = ZERO; + b[52] = ZERO; + b[53] = ZERO; +#ifdef UNIT + b[54] = ONE; +#else + b[54] = data55; +#endif + b[55] = data56; + + b[56] = ZERO; + b[57] = ZERO; + b[58] = ZERO; + b[59] = ZERO; + b[60] = ZERO; + b[61] = ZERO; + b[62] = ZERO; +#ifdef UNIT + b[63] = ONE; +#else + b[63] = data64; +#endif + + ao1 += 8; + ao2 += 8; + ao3 += 8; + 
ao4 += 8; + ao5 += 8; + ao6 += 8; + ao7 += 8; + ao8 += 8; + + b += 64; + } + + X += 8; + i --; + } while (i > 0); + } + + i = (m & 7); + if (i) { + + if (X > posY) { + + if (m & 4) { + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + ao5 += 4; + ao6 += 4; + ao7 += 4; + ao8 += 4; + + b += 32; + } + + if (m & 2) { + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + ao5 += 2; + ao6 += 2; + ao7 += 2; + ao8 += 2; + + b += 16; + } + + if (m & 1) { + b += 8; + } + } else + if (X < posY) { + if (m & 4) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + data13 = *(ao2 + 4); + data14 = *(ao2 + 5); + data15 = *(ao2 + 6); + data16 = *(ao2 + 7); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + data21 = *(ao3 + 4); + data22 = *(ao3 + 5); + data23 = *(ao3 + 6); + data24 = *(ao3 + 7); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + data28 = *(ao4 + 3); + data29 = *(ao4 + 4); + data30 = *(ao4 + 5); + data31 = *(ao4 + 6); + data32 = *(ao4 + 7); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + + b += 32; + } + + if (m & 2) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + data13 = *(ao2 + 4); + data14 = *(ao2 + 5); + data15 = *(ao2 + 6); + data16 = *(ao2 + 7); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + + ao1 += 2 * lda; + b += 16; + } + + if (m & 1) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b += 8; + } + } else { +#ifndef UNIT + data01 = *(ao1 + 0); +#endif + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + + if (i >= 2) { +#ifndef UNIT + data10 = *(ao2 + 1); +#endif + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + data13 = *(ao2 + 4); + data14 = *(ao2 + 5); + data15 = *(ao2 + 6); + data16 = *(ao2 + 7); + } + + if (i >= 3) { +#ifndef UNIT + data19 = *(ao3 + 2); +#endif + data20 = *(ao3 + 3); + data21 = *(ao3 + 4); + data22 = *(ao3 + 5); + 
data23 = *(ao3 + 6); + data24 = *(ao3 + 7); + } + + if (i >= 4) { +#ifndef UNIT + data28 = *(ao4 + 3); +#endif + data29 = *(ao4 + 4); + data30 = *(ao4 + 5); + data31 = *(ao4 + 6); + data32 = *(ao4 + 7); + } + + if (i >= 5) { +#ifndef UNIT + data37 = *(ao5 + 4); +#endif + data38 = *(ao5 + 5); + data39 = *(ao5 + 6); + data40 = *(ao5 + 7); + } + + if (i >= 6) { +#ifndef UNIT + data46 = *(ao6 + 5); +#endif + data47 = *(ao6 + 6); + data48 = *(ao6 + 7); + } + + if (i >= 7) { +#ifndef UNIT + data55 = *(ao7 + 6); +#endif + data56 = *(ao7 + 7); + } + +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = data01; +#endif + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b += 8; + + if(i >= 2) { + b[ 0] = ZERO; +#ifdef UNIT + b[ 1] = ONE; +#else + b[ 1] = data10; +#endif + b[ 2] = data11; + b[ 3] = data12; + b[ 4] = data13; + b[ 5] = data14; + b[ 6] = data15; + b[ 7] = data16; + b += 8; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; +#else + b[ 2] = data19; +#endif + b[ 3] = data20; + b[ 4] = data21; + b[ 5] = data22; + b[ 6] = data23; + b[ 7] = data24; + b += 8; + } + + if (i >= 4) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; +#ifdef UNIT + b[ 3] = ONE; +#else + b[ 3] = data28; +#endif + b[ 4] = data29; + b[ 5] = data30; + b[ 6] = data31; + b[ 7] = data32; + b += 8; + } + + if (i >= 5) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; +#ifdef UNIT + b[ 4] = ONE; +#else + b[ 4] = data37; +#endif + b[ 5] = data38; + b[ 6] = data39; + b[ 7] = data40; + b += 8; + } + + if (i >= 6) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; +#ifdef UNIT + b[ 5] = ONE; +#else + b[ 5] = data46; +#endif + b[ 6] = data47; + b[ 7] = data48; + b += 8; + } + + if (i >= 7) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; +#ifdef UNIT + b[ 6] = ONE; +#else + b[ 6] = data55; +#endif + b[ 7] = data56; + b += 8; + } + } + } + + posY += 8; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 4){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX + 3) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + ao4 = a + posX + (posY + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X > posY) { + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + + b += 16; + + } else + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + data28 = *(ao4 + 3); + + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b[ 4] = data09; + b[ 5] = data10; + b[ 6] = data11; + b[ 7] = data12; + + b[ 8] = data17; + b[ 9] = data18; + b[10] = data19; + b[11] = data20; + + b[12] = data25; + b[13] = data26; + b[14] = data27; + b[15] = data28; + + b += 16; + + } else { + +#ifdef UNIT + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + + data20 = *(ao3 + 3); + + b[ 0] = ONE; + 
b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b[ 4] = ZERO; + b[ 5] = ONE; + b[ 6] = data11; + b[ 7] = data12; + + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ONE; + b[11] = data20; + + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ONE; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + + data28 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b[ 4] = ZERO; + b[ 5] = data10; + b[ 6] = data11; + b[ 7] = data12; + + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = data19; + b[11] = data20; + + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = data28; +#endif + + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + + b += 16; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X > posY) { + + if (m & 2) { + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + + b += 8; + } + + if (m & 1) { + b += 4; + } + } else + if (X < posY) { + if (m & 2) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data09; + b[ 5] = data10; + b[ 6] = data11; + b[ 7] = data12; + + ao1 += 2 * lda; + b += 8; + } + + if (m & 1) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + } + } else { + +#ifndef UNIT + data01 = *(ao1 + 0); +#endif + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + if (i >= 2) { +#ifndef UNIT + data10 = *(ao2 + 1); +#endif + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + + } + + if (i >= 3) { +#ifndef UNIT + data19 = *(ao3 + 2); +#endif + data20 = *(ao3 + 3); + } + +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = data01; +#endif + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b += 4; + + if(i >= 2) { + b[ 0] = ZERO; +#ifdef UNIT + b[ 1] = ONE; +#else + b[ 1] = data10; +#endif + b[ 2] = data11; + b[ 3] = data12; + b += 4; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; +#else + b[ 2] = data19; +#endif + b[ 3] = data20; + b += 4; + } + } + } + + posY += 4; + } + + if (n & 2){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X > posY) { + ao1 += 2; + ao2 += 2; + b += 4; + + } else + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + + ao1 += 2 * lda; + ao2 += 2 * lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data09; + b[ 3] = data10; + + b += 4; + + } else { + +#ifdef UNIT + data02 = *(ao1 + 1); + + b[ 0] = ONE; + b[ 1] = data02; + b[ 2] = ZERO; + b[ 3] = ONE; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + data10 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = ZERO; + b[ 3] = data10; +#endif + + ao1 += 2; + ao2 += 2; + + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + if (m & 1) { + + if (X > posY) { + b += 2; + } else + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b += 2; + } else { +#ifdef UNIT + data09 = 
*(ao2 + 0); + b[ 0] = ONE; + b[ 1] = data09; +#else + data01 = *(ao1 + 0); + data09 = *(ao2 + 0); + b[ 0] = data01; + b[ 1] = data09; +#endif + b += 2; + } + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY + (posX + 0) * lda; + } else { + ao1 = a + posX + (posY + 0) * lda; + } + + i = m; + if (m > 0) { + do { + if (X > posY) { + ao1 += 1; + b += 1; + } else + if (X < posY) { + data01 = *(ao1 + 0); + ao1 += lda; + + b[ 0] = data01; + b += 1; + + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + data01 = *(ao1 + 0); + b[ 0] = data01; +#endif + ao1 ++; + b ++; + } + + X += 1; + i --; + } while (i > 0); + } + } + + return 0; +} diff --git a/kernel/generic/trmm_uncopy_1.c b/kernel/generic/trmm_uncopy_1.c new file mode 100644 index 0000000000..6e75c2fa55 --- /dev/null +++ b/kernel/generic/trmm_uncopy_1.c @@ -0,0 +1,91 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, X; + + FLOAT data01; + FLOAT *ao1; + + while (n > 0) { + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + } + + i = m; + if (m > 0) { + do { + + if (X < posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + ao1 += 1; + b += 1; + } else + if (X > posY) { + ao1 += lda; + b += 1; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + data01 = *(ao1 + 0); + b[ 0] = data01; +#endif + b += 1; + ao1 += lda; + } + + X += 1; + i --; + } while (i > 0); + } + posY ++; + n --; + } + + return 0; +} diff --git a/kernel/generic/trmm_uncopy_16.c b/kernel/generic/trmm_uncopy_16.c new file mode 100644 index 0000000000..6325a26a07 --- /dev/null +++ b/kernel/generic/trmm_uncopy_16.c @@ -0,0 +1,1543 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X, ii; + + FLOAT *a01, *a02, *a03 ,*a04, *a05, *a06, *a07, *a08; + FLOAT *a09, *a10, *a11, *a12, *a13, *a14, *a15, *a16; + + js = (n >> 4); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + a01 = a + posX + (posY + 0) * lda; + a02 = a + posX + (posY + 1) * lda; + a03 = a + posX + (posY + 2) * lda; + a04 = a + posX + (posY + 3) * lda; + a05 = a + posX + (posY + 4) * lda; + a06 = a + posX + (posY + 5) * lda; + a07 = a + posX + (posY + 6) * lda; + a08 = a + posX + (posY + 7) * lda; + a09 = a + posX + (posY + 8) * lda; + a10 = a + posX + (posY + 9) * lda; + a11 = a + posX + (posY + 10) * lda; + a12 = a + posX + (posY + 11) * lda; + a13 = a + posX + (posY + 12) * lda; + a14 = a + posX + (posY + 13) * lda; + a15 = a + posX + (posY + 14) * lda; + a16 = a + posX + (posY + 15) * lda; + } else { + a01 = a + posY + (posX + 0) * lda; + a02 = a + posY + (posX + 1) * lda; + a03 = a + posY + (posX + 2) * lda; + a04 = a + posY + (posX + 3) * lda; + a05 = a + posY + (posX + 4) * lda; + a06 = a + posY + (posX + 5) * lda; + a07 = a + posY + (posX + 6) * lda; + a08 = a + posY + (posX + 7) * lda; + a09 = a + posY + (posX + 8) * lda; + a10 = a + posY + (posX + 9) * lda; + a11 = a + posY + (posX + 10) * lda; + a12 = a + posY + (posX + 11) * lda; + a13 = a + posY + (posX + 12) * lda; + a14 = a + posY + (posX + 13) * lda; + a15 = a + posY + (posX + 14) * lda; + a16 = a + posY + (posX + 15) * lda; + } + + i = (m >> 4); + if (i > 0) { + do { + if (X < posY) { + for (ii = 0; ii < 16; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a02 + 0); + b[ 2] = *(a03 + 0); + b[ 3] = *(a04 + 0); + b[ 4] = *(a05 + 0); + b[ 5] = *(a06 + 0); + b[ 6] = *(a07 + 0); + b[ 7] = *(a08 + 0); + + b[ 8] = *(a09 + 0); + b[ 9] = *(a10 + 0); + b[ 10] = *(a11 + 0); + b[ 11] = *(a12 + 0); + b[ 12] = *(a13 + 0); + b[ 13] = *(a14 + 0); + b[ 14] = *(a15 + 0); + b[ 15] = *(a16 + 0); + + a01 ++; + a02 ++; + a03 ++; + a04 ++; + a05 ++; + a06 ++; + a07 ++; + a08 ++; + a09 ++; + a10 ++; + a11 ++; + a12 ++; + a13 ++; + a14 ++; + a15 ++; + a16 ++; + b += 16; + } + } else + if (X > posY) { + a01 += 16 * lda; + a02 += 16 * lda; + a03 += 16 * lda; + a04 += 16 * lda; + a05 += 16 * lda; + a06 += 16 * lda; + a07 += 16 * lda; + a08 += 16 * lda; + a09 += 16 * lda; + a10 += 16 * lda; + a11 += 16 * lda; + a12 += 16 * lda; + a13 += 16 * lda; + a14 += 16 * lda; + a15 += 16 * lda; + a16 += 16 * lda; + b += 256; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = *(a02 + 0); + b[ 2] = *(a03 + 0); + b[ 3] = *(a04 + 0); + b[ 4] = *(a05 + 0); + b[ 5] = *(a06 + 0); + b[ 6] = *(a07 + 0); + b[ 7] = *(a08 + 0); + b[ 8] = *(a09 + 0); + b[ 9] = *(a10 + 0); + b[ 10] = *(a11 + 0); + b[ 11] = *(a12 + 0); + b[ 12] = *(a13 + 0); + b[ 13] = *(a14 + 0); + b[ 14] = *(a15 + 0); + b[ 15] = *(a16 + 0); + + b[ 16] = ZERO; +#ifdef UNIT + b[ 17] = ONE; +#else + b[ 17] = *(a02 + 1); +#endif + b[ 18] = *(a03 + 1); + b[ 19] = *(a04 + 1); + b[ 20] = *(a05 + 1); + b[ 21] = *(a06 + 1); + b[ 22] = *(a07 + 1); + b[ 23] = *(a08 + 1); + b[ 24] = *(a09 + 1); + b[ 25] = *(a10 + 1); + b[ 26] = *(a11 + 1); + b[ 27] = *(a12 + 1); + b[ 28] = *(a13 + 1); + b[ 29] = *(a14 + 1); + b[ 30] = *(a15 + 1); + b[ 31] = *(a16 + 1); + + b[ 32] = ZERO; + b[ 33] = ZERO; +#ifdef UNIT + b[ 34] = ONE; +#else + b[ 34] = *(a03 + 2); +#endif + b[ 
35] = *(a04 + 2); + b[ 36] = *(a05 + 2); + b[ 37] = *(a06 + 2); + b[ 38] = *(a07 + 2); + b[ 39] = *(a08 + 2); + b[ 40] = *(a09 + 2); + b[ 41] = *(a10 + 2); + b[ 42] = *(a11 + 2); + b[ 43] = *(a12 + 2); + b[ 44] = *(a13 + 2); + b[ 45] = *(a14 + 2); + b[ 46] = *(a15 + 2); + b[ 47] = *(a16 + 2); + + b[ 48] = ZERO; + b[ 49] = ZERO; + b[ 50] = ZERO; +#ifdef UNIT + b[ 51] = ONE; +#else + b[ 51] = *(a04 + 3); +#endif + b[ 52] = *(a05 + 3); + b[ 53] = *(a06 + 3); + b[ 54] = *(a07 + 3); + b[ 55] = *(a08 + 3); + b[ 56] = *(a09 + 3); + b[ 57] = *(a10 + 3); + b[ 58] = *(a11 + 3); + b[ 59] = *(a12 + 3); + b[ 60] = *(a13 + 3); + b[ 61] = *(a14 + 3); + b[ 62] = *(a15 + 3); + b[ 63] = *(a16 + 3); + + b[ 64] = ZERO; + b[ 65] = ZERO; + b[ 66] = ZERO; + b[ 67] = ZERO; +#ifdef UNIT + b[ 68] = ONE; +#else + b[ 68] = *(a05 + 4); +#endif + b[ 69] = *(a06 + 4); + b[ 70] = *(a07 + 4); + b[ 71] = *(a08 + 4); + b[ 72] = *(a09 + 4); + b[ 73] = *(a10 + 4); + b[ 74] = *(a11 + 4); + b[ 75] = *(a12 + 4); + b[ 76] = *(a13 + 4); + b[ 77] = *(a14 + 4); + b[ 78] = *(a15 + 4); + b[ 79] = *(a16 + 4); + + b[ 80] = ZERO; + b[ 81] = ZERO; + b[ 82] = ZERO; + b[ 83] = ZERO; + b[ 84] = ZERO; +#ifdef UNIT + b[ 85] = ONE; +#else + b[ 85] = *(a06 + 5); +#endif + b[ 86] = *(a07 + 5); + b[ 87] = *(a08 + 5); + b[ 88] = *(a09 + 5); + b[ 89] = *(a10 + 5); + b[ 90] = *(a11 + 5); + b[ 91] = *(a12 + 5); + b[ 92] = *(a13 + 5); + b[ 93] = *(a14 + 5); + b[ 94] = *(a15 + 5); + b[ 95] = *(a16 + 5); + + b[ 96] = ZERO; + b[ 97] = ZERO; + b[ 98] = ZERO; + b[ 99] = ZERO; + b[100] = ZERO; + b[101] = ZERO; +#ifdef UNIT + b[102] = ONE; +#else + b[102] = *(a07 + 6); +#endif + b[103] = *(a08 + 6); + b[104] = *(a09 + 6); + b[105] = *(a10 + 6); + b[106] = *(a11 + 6); + b[107] = *(a12 + 6); + b[108] = *(a13 + 6); + b[109] = *(a14 + 6); + b[110] = *(a15 + 6); + b[111] = *(a16 + 6); + + b[112] = ZERO; + b[113] = ZERO; + b[114] = ZERO; + b[115] = ZERO; + b[116] = ZERO; + b[117] = ZERO; + b[118] = ZERO; +#ifdef UNIT + b[119] = ONE; +#else + b[119] = *(a08 + 7); +#endif + b[120] = *(a09 + 7); + b[121] = *(a10 + 7); + b[122] = *(a11 + 7); + b[123] = *(a12 + 7); + b[124] = *(a13 + 7); + b[125] = *(a14 + 7); + b[126] = *(a15 + 7); + b[127] = *(a16 + 7); + + b[128] = ZERO; + b[129] = ZERO; + b[130] = ZERO; + b[131] = ZERO; + b[132] = ZERO; + b[133] = ZERO; + b[134] = ZERO; + b[135] = ZERO; +#ifdef UNIT + b[136] = ONE; +#else + b[136] = *(a09 + 8); +#endif + b[137] = *(a10 + 8); + b[138] = *(a11 + 8); + b[139] = *(a12 + 8); + b[140] = *(a13 + 8); + b[141] = *(a14 + 8); + b[142] = *(a15 + 8); + b[143] = *(a16 + 8); + + b[144] = ZERO; + b[145] = ZERO; + b[146] = ZERO; + b[147] = ZERO; + b[148] = ZERO; + b[149] = ZERO; + b[150] = ZERO; + b[151] = ZERO; + b[152] = ZERO; +#ifdef UNIT + b[153] = ONE; +#else + b[153] = *(a10 + 9); +#endif + b[154] = *(a11 + 9); + b[155] = *(a12 + 9); + b[156] = *(a13 + 9); + b[157] = *(a14 + 9); + b[158] = *(a15 + 9); + b[159] = *(a16 + 9); + + b[160] = ZERO; + b[161] = ZERO; + b[162] = ZERO; + b[163] = ZERO; + b[164] = ZERO; + b[165] = ZERO; + b[166] = ZERO; + b[167] = ZERO; + b[168] = ZERO; + b[169] = ZERO; +#ifdef UNIT + b[170] = ONE; +#else + b[170] = *(a11 + 10); +#endif + b[171] = *(a12 + 10); + b[172] = *(a13 + 10); + b[173] = *(a14 + 10); + b[174] = *(a15 + 10); + b[175] = *(a16 + 10); + + b[176] = ZERO; + b[177] = ZERO; + b[178] = ZERO; + b[179] = ZERO; + b[180] = ZERO; + b[181] = ZERO; + b[182] = ZERO; + b[183] = ZERO; + b[184] = ZERO; + b[185] = ZERO; + b[186] = ZERO; +#ifdef UNIT + b[187] = ONE; +#else + b[187] = *(a12 + 11); 
+#endif + b[188] = *(a13 + 11); + b[189] = *(a14 + 11); + b[190] = *(a15 + 11); + b[191] = *(a16 + 11); + + b[192] = ZERO; + b[193] = ZERO; + b[194] = ZERO; + b[195] = ZERO; + b[196] = ZERO; + b[197] = ZERO; + b[198] = ZERO; + b[199] = ZERO; + b[200] = ZERO; + b[201] = ZERO; + b[202] = ZERO; + b[203] = ZERO; +#ifdef UNIT + b[204] = ONE; +#else + b[204] = *(a13 + 12); +#endif + b[205] = *(a14 + 12); + b[206] = *(a15 + 12); + b[207] = *(a16 + 12); + + b[208] = ZERO; + b[209] = ZERO; + b[210] = ZERO; + b[211] = ZERO; + b[212] = ZERO; + b[213] = ZERO; + b[214] = ZERO; + b[215] = ZERO; + b[216] = ZERO; + b[217] = ZERO; + b[218] = ZERO; + b[219] = ZERO; + b[220] = ZERO; +#ifdef UNIT + b[221] = ONE; +#else + b[221] = *(a14 + 13); +#endif + b[222] = *(a15 + 13); + b[223] = *(a16 + 13); + + b[224] = ZERO; + b[225] = ZERO; + b[226] = ZERO; + b[227] = ZERO; + b[228] = ZERO; + b[229] = ZERO; + b[230] = ZERO; + b[231] = ZERO; + b[232] = ZERO; + b[233] = ZERO; + b[234] = ZERO; + b[235] = ZERO; + b[236] = ZERO; + b[237] = ZERO; +#ifdef UNIT + b[238] = ONE; +#else + b[238] = *(a15 + 14); +#endif + b[239] = *(a16 + 14); + + b[240] = ZERO; + b[241] = ZERO; + b[242] = ZERO; + b[243] = ZERO; + b[244] = ZERO; + b[245] = ZERO; + b[246] = ZERO; + b[247] = ZERO; + b[248] = ZERO; + b[249] = ZERO; + b[250] = ZERO; + b[251] = ZERO; + b[252] = ZERO; + b[253] = ZERO; + b[254] = ZERO; +#ifdef UNIT + b[255] = ONE; +#else + b[255] = *(a16 + 15); +#endif + + a01 += 16 * lda; + a02 += 16 * lda; + a03 += 16 * lda; + a04 += 16 * lda; + a05 += 16 * lda; + a06 += 16 * lda; + a07 += 16 * lda; + a08 += 16 * lda; + a09 += 16 * lda; + a10 += 16 * lda; + a11 += 16 * lda; + a12 += 16 * lda; + a13 += 16 * lda; + a14 += 16 * lda; + a15 += 16 * lda; + a16 += 16 * lda; + + b += 256; + } + + X += 16; + i --; + } while (i > 0); + } + + i = (m & 15); + if (i) { + + if (X < posY) { + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a02 + 0); + b[ 2] = *(a03 + 0); + b[ 3] = *(a04 + 0); + b[ 4] = *(a05 + 0); + b[ 5] = *(a06 + 0); + b[ 6] = *(a07 + 0); + b[ 7] = *(a08 + 0); + + b[ 8] = *(a09 + 0); + b[ 9] = *(a10 + 0); + b[ 10] = *(a11 + 0); + b[ 11] = *(a12 + 0); + b[ 12] = *(a13 + 0); + b[ 13] = *(a14 + 0); + b[ 14] = *(a15 + 0); + b[ 15] = *(a16 + 0); + + a01 ++; + a02 ++; + a03 ++; + a04 ++; + a05 ++; + a06 ++; + a07 ++; + a08 ++; + a09 ++; + a10 ++; + a11 ++; + a12 ++; + a13 ++; + a14 ++; + a15 ++; + a16 ++; + b += 16; + } + } else + if (X > posY) { + a01 += i * lda; + a02 += i * lda; + a03 += i * lda; + a04 += i * lda; + a05 += i * lda; + a06 += i * lda; + a07 += i * lda; + a08 += i * lda; + a09 += i * lda; + a10 += i * lda; + a11 += i * lda; + a12 += i * lda; + a13 += i * lda; + a14 += i * lda; + a15 += i * lda; + a16 += i * lda; + b += 16 * i; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = *(a02 + 0); + b[ 2] = *(a03 + 0); + b[ 3] = *(a04 + 0); + b[ 4] = *(a05 + 0); + b[ 5] = *(a06 + 0); + b[ 6] = *(a07 + 0); + b[ 7] = *(a08 + 0); + b[ 8] = *(a09 + 0); + b[ 9] = *(a10 + 0); + b[ 10] = *(a11 + 0); + b[ 11] = *(a12 + 0); + b[ 12] = *(a13 + 0); + b[ 13] = *(a14 + 0); + b[ 14] = *(a15 + 0); + b[ 15] = *(a16 + 0); + b += 16; + + if (i >= 2) { + b[ 0] = ZERO; +#ifdef UNIT + b[ 1] = ONE; +#else + b[ 1] = *(a02 + 1); +#endif + b[ 2] = *(a03 + 1); + b[ 3] = *(a04 + 1); + b[ 4] = *(a05 + 1); + b[ 5] = *(a06 + 1); + b[ 6] = *(a07 + 1); + b[ 7] = *(a08 + 1); + b[ 8] = *(a09 + 1); + b[ 9] = *(a10 + 1); + b[ 10] = *(a11 + 1); + b[ 11] = *(a12 + 1); + b[ 12] = *(a13 + 1); + b[ 13] = *(a14 + 1); + 
b[ 14] = *(a15 + 1); + b[ 15] = *(a16 + 1); + b += 16; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; +#else + b[ 2] = *(a03 + 2); +#endif + b[ 3] = *(a04 + 2); + b[ 4] = *(a05 + 2); + b[ 5] = *(a06 + 2); + b[ 6] = *(a07 + 2); + b[ 7] = *(a08 + 2); + b[ 8] = *(a09 + 2); + b[ 9] = *(a10 + 2); + b[ 10] = *(a11 + 2); + b[ 11] = *(a12 + 2); + b[ 12] = *(a13 + 2); + b[ 13] = *(a14 + 2); + b[ 14] = *(a15 + 2); + b[ 15] = *(a16 + 2); + b += 16; + } + + if (i >= 4) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; +#ifdef UNIT + b[ 3] = ONE; +#else + b[ 3] = *(a04 + 3); +#endif + b[ 4] = *(a05 + 3); + b[ 5] = *(a06 + 3); + b[ 6] = *(a07 + 3); + b[ 7] = *(a08 + 3); + b[ 8] = *(a09 + 3); + b[ 9] = *(a10 + 3); + b[ 10] = *(a11 + 3); + b[ 11] = *(a12 + 3); + b[ 12] = *(a13 + 3); + b[ 13] = *(a14 + 3); + b[ 14] = *(a15 + 3); + b[ 15] = *(a16 + 3); + b += 16; + } + + if (i >= 5) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; +#ifdef UNIT + b[ 4] = ONE; +#else + b[ 4] = *(a05 + 4); +#endif + b[ 5] = *(a06 + 4); + b[ 6] = *(a07 + 4); + b[ 7] = *(a08 + 4); + b[ 8] = *(a09 + 4); + b[ 9] = *(a10 + 4); + b[ 10] = *(a11 + 4); + b[ 11] = *(a12 + 4); + b[ 12] = *(a13 + 4); + b[ 13] = *(a14 + 4); + b[ 14] = *(a15 + 4); + b[ 15] = *(a16 + 4); + b += 16; + } + + if (i >= 6) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; +#ifdef UNIT + b[ 5] = ONE; +#else + b[ 5] = *(a06 + 5); +#endif + b[ 6] = *(a07 + 5); + b[ 7] = *(a08 + 5); + b[ 8] = *(a09 + 5); + b[ 9] = *(a10 + 5); + b[ 10] = *(a11 + 5); + b[ 11] = *(a12 + 5); + b[ 12] = *(a13 + 5); + b[ 13] = *(a14 + 5); + b[ 14] = *(a15 + 5); + b[ 15] = *(a16 + 5); + b += 16; + } + + if (i >= 7) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; +#ifdef UNIT + b[ 6] = ONE; +#else + b[ 6] = *(a07 + 6); +#endif + b[ 7] = *(a08 + 6); + b[ 8] = *(a09 + 6); + b[ 9] = *(a10 + 6); + b[ 10] = *(a11 + 6); + b[ 11] = *(a12 + 6); + b[ 12] = *(a13 + 6); + b[ 13] = *(a14 + 6); + b[ 14] = *(a15 + 6); + b[ 15] = *(a16 + 6); + b += 16; + } + + if (i >= 8) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; +#ifdef UNIT + b[ 7] = ONE; +#else + b[ 7] = *(a08 + 7); +#endif + b[ 8] = *(a09 + 7); + b[ 9] = *(a10 + 7); + b[ 10] = *(a11 + 7); + b[ 11] = *(a12 + 7); + b[ 12] = *(a13 + 7); + b[ 13] = *(a14 + 7); + b[ 14] = *(a15 + 7); + b[ 15] = *(a16 + 7); + b += 16; + } + + if (i >= 9) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; +#ifdef UNIT + b[ 8] = ONE; +#else + b[ 8] = *(a09 + 8); +#endif + b[ 9] = *(a10 + 8); + b[ 10] = *(a11 + 8); + b[ 11] = *(a12 + 8); + b[ 12] = *(a13 + 8); + b[ 13] = *(a14 + 8); + b[ 14] = *(a15 + 8); + b[ 15] = *(a16 + 8); + b += 16; + } + + if (i >= 10) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; +#ifdef UNIT + b[ 9] = ONE; +#else + b[ 9] = *(a10 + 9); +#endif + b[ 10] = *(a11 + 9); + b[ 11] = *(a12 + 9); + b[ 12] = *(a13 + 9); + b[ 13] = *(a14 + 9); + b[ 14] = *(a15 + 9); + b[ 15] = *(a16 + 9); + b += 16; + } + + if (i >= 11) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; +#ifdef UNIT + b[ 10] = ONE; +#else + b[ 10] = *(a11 + 10); +#endif + b[ 11] = *(a12 + 10); + b[ 12] = 
*(a13 + 10); + b[ 13] = *(a14 + 10); + b[ 14] = *(a15 + 10); + b[ 15] = *(a16 + 10); + b += 16; + } + + if (i >= 12) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; +#ifdef UNIT + b[ 11] = ONE; +#else + b[ 11] = *(a12 + 11); +#endif + b[ 12] = *(a13 + 11); + b[ 13] = *(a14 + 11); + b[ 14] = *(a15 + 11); + b[ 15] = *(a16 + 11); + b += 16; + } + + if (i >= 13) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; +#ifdef UNIT + b[ 12] = ONE; +#else + b[ 12] = *(a13 + 12); +#endif + b[ 13] = *(a14 + 12); + b[ 14] = *(a15 + 12); + b[ 15] = *(a16 + 12); + b += 16; + } + + if (i >= 14) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; +#ifdef UNIT + b[ 13] = ONE; +#else + b[ 13] = *(a14 + 13); +#endif + b[ 14] = *(a15 + 13); + b[ 15] = *(a16 + 13); + b += 16; + } + + if (i >= 15) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; +#ifdef UNIT + b[ 14] = ONE; +#else + b[ 14] = *(a15 + 14); +#endif + b[ 15] = *(a16 + 14); + b += 16; + } + } + } + + posY += 16; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 8){ + X = posX; + + if (posX <= posY) { + a01 = a + posX + (posY + 0) * lda; + a02 = a + posX + (posY + 1) * lda; + a03 = a + posX + (posY + 2) * lda; + a04 = a + posX + (posY + 3) * lda; + a05 = a + posX + (posY + 4) * lda; + a06 = a + posX + (posY + 5) * lda; + a07 = a + posX + (posY + 6) * lda; + a08 = a + posX + (posY + 7) * lda; + } else { + a01 = a + posY + (posX + 0) * lda; + a02 = a + posY + (posX + 1) * lda; + a03 = a + posY + (posX + 2) * lda; + a04 = a + posY + (posX + 3) * lda; + a05 = a + posY + (posX + 4) * lda; + a06 = a + posY + (posX + 5) * lda; + a07 = a + posY + (posX + 6) * lda; + a08 = a + posY + (posX + 7) * lda; + } + + i = (m >> 3); + if (i > 0) { + do { + if (X < posY) { + for (ii = 0; ii < 8; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a02 + 0); + b[ 2] = *(a03 + 0); + b[ 3] = *(a04 + 0); + b[ 4] = *(a05 + 0); + b[ 5] = *(a06 + 0); + b[ 6] = *(a07 + 0); + b[ 7] = *(a08 + 0); + + a01 ++; + a02 ++; + a03 ++; + a04 ++; + a05 ++; + a06 ++; + a07 ++; + a08 ++; + b += 8; + } + } else + if (X > posY) { + a01 += 8 * lda; + a02 += 8 * lda; + a03 += 8 * lda; + a04 += 8 * lda; + a05 += 8 * lda; + a06 += 8 * lda; + a07 += 8 * lda; + a08 += 8 * lda; + b += 64; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = *(a02 + 0); + b[ 2] = *(a03 + 0); + b[ 3] = *(a04 + 0); + b[ 4] = *(a05 + 0); + b[ 5] = *(a06 + 0); + b[ 6] = *(a07 + 0); + b[ 7] = *(a08 + 0); + + b[ 8] = ZERO; +#ifdef UNIT + b[ 9] = ONE; +#else + b[ 9] = *(a02 + 1); +#endif + b[ 10] = *(a03 + 1); + b[ 11] = *(a04 + 1); + b[ 12] = *(a05 + 1); + b[ 13] = *(a06 + 1); + b[ 14] = *(a07 + 1); + b[ 15] = *(a08 + 1); + + b[ 16] = ZERO; + b[ 17] = ZERO; +#ifdef UNIT + b[ 18] = ONE; +#else + b[ 18] = *(a03 + 2); +#endif + b[ 19] = *(a04 + 2); + b[ 20] = *(a05 + 2); + b[ 21] = *(a06 + 2); + b[ 22] = *(a07 + 2); + b[ 23] = *(a08 + 2); + + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 
26] = ZERO; +#ifdef UNIT + b[ 27] = ONE; +#else + b[ 27] = *(a04 + 3); +#endif + b[ 28] = *(a05 + 3); + b[ 29] = *(a06 + 3); + b[ 30] = *(a07 + 3); + b[ 31] = *(a08 + 3); + + b[ 32] = ZERO; + b[ 33] = ZERO; + b[ 34] = ZERO; + b[ 35] = ZERO; +#ifdef UNIT + b[ 36] = ONE; +#else + b[ 36] = *(a05 + 4); +#endif + b[ 37] = *(a06 + 4); + b[ 38] = *(a07 + 4); + b[ 39] = *(a08 + 4); + + b[ 40] = ZERO; + b[ 41] = ZERO; + b[ 42] = ZERO; + b[ 43] = ZERO; + b[ 44] = ZERO; +#ifdef UNIT + b[ 45] = ONE; +#else + b[ 45] = *(a06 + 5); +#endif + b[ 46] = *(a07 + 5); + b[ 47] = *(a08 + 5); + + b[ 48] = ZERO; + b[ 49] = ZERO; + b[ 50] = ZERO; + b[ 51] = ZERO; + b[ 52] = ZERO; + b[ 53] = ZERO; +#ifdef UNIT + b[ 54] = ONE; +#else + b[ 54] = *(a07 + 6); +#endif + b[ 55] = *(a08 + 6); + + b[ 56] = ZERO; + b[ 57] = ZERO; + b[ 58] = ZERO; + b[ 59] = ZERO; + b[ 60] = ZERO; + b[ 61] = ZERO; + b[ 62] = ZERO; +#ifdef UNIT + b[ 63] = ONE; +#else + b[ 63] = *(a08 + 7); +#endif + + a01 += 8 * lda; + a02 += 8 * lda; + a03 += 8 * lda; + a04 += 8 * lda; + a05 += 8 * lda; + a06 += 8 * lda; + a07 += 8 * lda; + a08 += 8 * lda; + b += 64; + } + + X += 8; + i --; + } while (i > 0); + } + + i = (m & 7); + if (i) { + + if (X < posY) { + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a02 + 0); + b[ 2] = *(a03 + 0); + b[ 3] = *(a04 + 0); + b[ 4] = *(a05 + 0); + b[ 5] = *(a06 + 0); + b[ 6] = *(a07 + 0); + b[ 7] = *(a08 + 0); + + a01 ++; + a02 ++; + a03 ++; + a04 ++; + a05 ++; + a06 ++; + a07 ++; + a08 ++; + b += 8; + } + } else + if (X > posY) { + a01 += i * lda; + a02 += i * lda; + a03 += i * lda; + a04 += i * lda; + a05 += i * lda; + a06 += i * lda; + a07 += i * lda; + a08 += i * lda; + b += 8 * i; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = *(a02 + 0); + b[ 2] = *(a03 + 0); + b[ 3] = *(a04 + 0); + b[ 4] = *(a05 + 0); + b[ 5] = *(a06 + 0); + b[ 6] = *(a07 + 0); + b[ 7] = *(a08 + 0); + b += 8; + + if (i >= 2) { + b[ 0] = ZERO; +#ifdef UNIT + b[ 1] = ONE; +#else + b[ 1] = *(a02 + 1); +#endif + b[ 2] = *(a03 + 1); + b[ 3] = *(a04 + 1); + b[ 4] = *(a05 + 1); + b[ 5] = *(a06 + 1); + b[ 6] = *(a07 + 1); + b[ 7] = *(a08 + 1); + b += 8; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; +#else + b[ 2] = *(a03 + 2); +#endif + b[ 3] = *(a04 + 2); + b[ 4] = *(a05 + 2); + b[ 5] = *(a06 + 2); + b[ 6] = *(a07 + 2); + b[ 7] = *(a08 + 2); + b += 8; + } + + if (i >= 4) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; +#ifdef UNIT + b[ 3] = ONE; +#else + b[ 3] = *(a04 + 3); +#endif + b[ 4] = *(a05 + 3); + b[ 5] = *(a06 + 3); + b[ 6] = *(a07 + 3); + b[ 7] = *(a08 + 3); + b += 8; + } + + if (i >= 5) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; +#ifdef UNIT + b[ 4] = ONE; +#else + b[ 4] = *(a05 + 4); +#endif + b[ 5] = *(a06 + 4); + b[ 6] = *(a07 + 4); + b[ 7] = *(a08 + 4); + b += 8; + } + + if (i >= 6) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; +#ifdef UNIT + b[ 5] = ONE; +#else + b[ 5] = *(a06 + 5); +#endif + b[ 6] = *(a07 + 5); + b[ 7] = *(a08 + 5); + b += 8; + } + + if (i >= 7) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; +#ifdef UNIT + b[ 6] = ONE; +#else + b[ 6] = *(a07 + 6); +#endif + b[ 7] = *(a08 + 6); + b += 8; + } + } + } + + posY += 8; + } + + if (n & 4){ + X = posX; + + if (posX <= posY) { + a01 = a + posX + (posY + 0) * lda; + a02 = a + posX + (posY + 1) * lda; + a03 = a + posX + (posY + 2) * lda; + a04 = a + posX + (posY + 3) * lda; + } 
else { + a01 = a + posY + (posX + 0) * lda; + a02 = a + posY + (posX + 1) * lda; + a03 = a + posY + (posX + 2) * lda; + a04 = a + posY + (posX + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X < posY) { + for (ii = 0; ii < 4; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a02 + 0); + b[ 2] = *(a03 + 0); + b[ 3] = *(a04 + 0); + + a01 ++; + a02 ++; + a03 ++; + a04 ++; + b += 4; + } + } else + if (X > posY) { + a01 += 4 * lda; + a02 += 4 * lda; + a03 += 4 * lda; + a04 += 4 * lda; + b += 16; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = *(a02 + 0); + b[ 2] = *(a03 + 0); + b[ 3] = *(a04 + 0); + + b[ 4] = ZERO; +#ifdef UNIT + b[ 5] = ONE; +#else + b[ 5] = *(a02 + 1); +#endif + b[ 6] = *(a03 + 1); + b[ 7] = *(a04 + 1); + + b[ 8] = ZERO; + b[ 9] = ZERO; +#ifdef UNIT + b[ 10] = ONE; +#else + b[ 10] = *(a03 + 2); +#endif + b[ 11] = *(a04 + 2); + + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; +#ifdef UNIT + b[ 15] = ONE; +#else + b[ 15] = *(a04 + 3); +#endif + + a01 += 4 * lda; + a02 += 4 * lda; + a03 += 4 * lda; + a04 += 4 * lda; + b += 16; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X < posY) { + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a02 + 0); + b[ 2] = *(a03 + 0); + b[ 3] = *(a04 + 0); + + a01 ++; + a02 ++; + a03 ++; + a04 ++; + b += 4; + } + } else + if (X > posY) { + a01 += i * lda; + a02 += i * lda; + a03 += i * lda; + a04 += i * lda; + b += 4 * i; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = *(a02 + 0); + b[ 2] = *(a03 + 0); + b[ 3] = *(a04 + 0); + b += 4; + + if (i >= 2) { + b[ 0] = ZERO; +#ifdef UNIT + b[ 1] = ONE; +#else + b[ 1] = *(a02 + 1); +#endif + b[ 2] = *(a03 + 1); + b[ 3] = *(a04 + 1); + b += 4; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; +#else + b[ 2] = *(a03 + 2); +#endif + b[ 3] = *(a04 + 2); + b += 4; + } + } + } + + posY += 4; + } + + if (n & 2){ + X = posX; + + if (posX <= posY) { + a01 = a + posX + (posY + 0) * lda; + a02 = a + posX + (posY + 1) * lda; + } else { + a01 = a + posY + (posX + 0) * lda; + a02 = a + posY + (posX + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X < posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a02 + 0); + b[ 2] = *(a01 + 1); + b[ 3] = *(a02 + 1); + + a01 += 2; + a02 += 2; + b += 4; + } else + if (X > posY) { + a01 += 2 * lda; + a02 += 2 * lda; + b += 4; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = *(a02 + 0); + + b[ 2] = ZERO; +#ifdef UNIT + b[ 3] = ONE; +#else + b[ 3] = *(a02 + 1); +#endif + + a01 += 2 * lda; + a02 += 2 * lda; + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + if (m & 1) { + + if (X < posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a02 + 0); + + a01 ++; + a02 ++; + b += 2; + } else + if (X > posY) { + a01 += lda; + a02 += lda; + b += 2; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = *(a02 + 0); + b += 2; + } + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + a01 = a + posX + (posY + 0) * lda; + } else { + a01 = a + posY + (posX + 0) * lda; + } + + i = m; + if (m > 0) { + do { + if (X < posY) { + b[ 0] = *(a01 + 0); + a01 += 1; + b += 1; + } else + if (X > posY) { + a01 += lda; + b += 1; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b += 1; + } + + X += 1; + i --; + } while (i > 0); + } + } + + return 0; +} diff --git a/kernel/generic/trmm_uncopy_2.c b/kernel/generic/trmm_uncopy_2.c new 
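The generic trmm_uncopy kernels above pack one column panel of an upper triangular matrix into a contiguous buffer b, deciding per row block whether it lies strictly above the diagonal (copy the whole block), strictly below it (only advance the pointers), or on the diagonal itself (triangular fill, with ONE substituted for the diagonal when UNIT is defined). The following is a minimal, non-unrolled sketch of that scheme, not part of the patch: it assumes column-major a with leading dimension lda and block offsets posX/posY, and it uses plain double/long plus a hypothetical unit flag in place of the FLOAT/BLASLONG/UNIT macros from common.h. Unlike the real kernels, which leave the below-diagonal blocks of b unwritten, the sketch zero-fills them so it stands on its own.

/* Reference sketch of the upper, non-transposed TRMM packing (hypothetical names). */
void trmm_uncopy_ref(long m, long n, const double *a, long lda,
                     long posX, long posY, double *b, int unit)
{
  for (long i = 0; i < m; i++) {        /* rows of the packed panel          */
    for (long j = 0; j < n; j++) {      /* columns of the packed panel       */
      long row = posX + i;              /* global row index in A             */
      long col = posY + j;              /* global column index in A          */

      if (row < col)                    /* strictly above the diagonal       */
        b[i * n + j] = a[row + col * lda];
      else if (row > col)               /* below the diagonal: unused region */
        b[i * n + j] = 0.0;
      else                              /* diagonal element                  */
        b[i * n + j] = unit ? 1.0 : a[row + col * lda];
    }
  }
}

The unrolled code above repeats exactly this three-way split for panel widths of 16, 8, 4, 2 and 1 columns, storing each panel row by row in b, which is what the long runs of b[ k] assignments implement.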
file mode 100644 index 0000000000..1b6d2356a4 --- /dev/null +++ b/kernel/generic/trmm_uncopy_2.c @@ -0,0 +1,195 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02, data03, data04; + FLOAT *ao1, *ao2; + + js = (n >> 1); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data03; + b[ 2] = data02; + b[ 3] = data04; + + ao1 += 2; + ao2 += 2; + b += 4; + + } else + if (X > posY) { + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + } else { +#ifdef UNIT + data03 = *(ao2 + 0); + + b[ 0] = ONE; + b[ 1] = data03; + b[ 2] = ZERO; + b[ 3] = ONE; +#else + data01 = *(ao1 + 0); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data03; + b[ 2] = ZERO; + b[ 3] = data04; +#endif + + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + if (m & 1) { + + if (X < posY) { + data01 = *(ao1 + 0); + data03 = *(ao2 + 0); + + b[ 0] = data01; + b[ 1] = data03; + + ao1 += 1; + ao2 += 1; + b += 2; + } else + if (X > posY) { + ao1 += lda; + b += 2; + } else { +#ifdef UNIT + data03 = *(ao2 + 0); + + b[ 0] = ONE; + b[ 1] = data03; +#else + data01 = *(ao1 + 0); + data03 = *(ao2 + 0); + + b[ 0] = data01; + b[ 1] = data03; +#endif + ao1 += lda; + b += 2; + } + } + + posY += 2; + js --; + } while (js > 0); + } /* End of main loop */ + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + } + + i = m; + if (m > 0) { + do { + + if (X < posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + ao1 += 1; + b += 1; + } else + if (X > posY) { + ao1 += lda; + b += 1; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + data01 = *(ao1 + 0); + b[ 0] = data01; +#endif + b += 1; + ao1 += lda; + } + + X += 1; + i --; + } while (i > 0); + } + } + + return 0; +} diff --git a/kernel/generic/trmm_uncopy_4.c b/kernel/generic/trmm_uncopy_4.c new file mode 100644 index 0000000000..4ff6948394 --- /dev/null +++ b/kernel/generic/trmm_uncopy_4.c @@ -0,0 +1,489 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT *ao1, *ao2, *ao3, *ao4; + + js = (n >> 2); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + ao4 = a + posX + (posY + 3) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + data11 = *(ao3 + 2); + data12 = *(ao3 + 3); + + data13 = *(ao4 + 0); + data14 = *(ao4 + 1); + data15 = *(ao4 + 2); + data16 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = data05; + b[ 2] = data09; + b[ 3] = data13; + b[ 4] = data02; + b[ 5] = data06; + b[ 6] = data10; + b[ 7] = data14; + + b[ 8] = data03; + b[ 9] = data07; + b[10] = data11; + b[11] = data15; + b[12] = data04; + b[13] = data08; + b[14] = data12; + b[15] = data16; + + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + b += 16; + } else + if (X > posY) { + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + + b += 16; + } else { +#ifdef UNIT + data05 = *(ao2 + 0); + + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + + data13 = *(ao4 + 0); + data14 = *(ao4 + 1); + data15 = *(ao4 + 2); + + b[ 0] = ONE; + b[ 1] = data05; + b[ 2] = data09; + b[ 3] = data13; + + b[ 4] = ZERO; + b[ 5] = ONE; + b[ 6] = data10; + b[ 7] = data14; + + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ONE; + b[11] = data15; + + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ONE; +#else + data01 = *(ao1 + 0); + + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + data11 = *(ao3 + 2); + + data13 = *(ao4 + 0); + data14 = *(ao4 + 1); + data15 = *(ao4 + 2); + data16 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = data05; + b[ 2] = data09; + b[ 3] = data13; + + b[ 4] = ZERO; + b[ 5] = data06; + b[ 6] = data10; + b[ 7] = data14; + + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = data11; + b[11] = data15; + + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = data16; +#endif + ao1 += 4; + ao2 
+= 4; + ao3 += 4; + ao4 += 4; + + b += 16; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X < posY) { + + if (m & 2) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + + b[ 0] = data01; + b[ 1] = data03; + b[ 2] = data05; + b[ 3] = data07; + b[ 4] = data02; + b[ 5] = data04; + b[ 6] = data06; + b[ 7] = data08; + + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + b += 8; + } + + if (m & 1) { + data01 = *(ao1 + 0); + data03 = *(ao2 + 0); + data05 = *(ao3 + 0); + data07 = *(ao4 + 0); + + b[ 0] = data01; + b[ 1] = data03; + b[ 2] = data05; + b[ 3] = data07; + + ao1 += 1; + ao2 += 1; + ao3 += 1; + ao4 += 1; + b += 4; + } + + } else + if (X > posY) { + if (m & 2) { + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 8; + } + + if (m & 1) { + ao1 += lda; + b += 4; + } + + } else { +#ifdef UNIT + data05 = *(ao2 + 0); + data09 = *(ao3 + 0); + data13 = *(ao4 + 0); + + if (i >= 2) { + data10 = *(ao3 + 1); + data14 = *(ao4 + 1); + } + + if (i >= 3) { + data15 = *(ao4 + 2); + } + + b[ 0] = ONE; + b[ 1] = data05; + b[ 2] = data09; + b[ 3] = data13; + b += 4; + + if(i >= 2) { + b[ 0] = ZERO; + b[ 1] = ONE; + b[ 2] = data10; + b[ 3] = data14; + b += 4; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ONE; + b[ 3] = data15; + b += 4; + } +#else + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); + data09 = *(ao3 + 0); + data13 = *(ao4 + 0); + + if (i >= 2) { + data06 = *(ao2 + 1); + data10 = *(ao3 + 1); + data14 = *(ao4 + 1); + } + + if (i >= 3) { + data11 = *(ao3 + 2); + data15 = *(ao4 + 2); + } + + b[ 0] = data01; + b[ 1] = data05; + b[ 2] = data09; + b[ 3] = data13; + b += 4; + + if(i >= 2) { + b[ 0] = ZERO; + b[ 1] = data06; + b[ 2] = data10; + b[ 3] = data14; + b += 4; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = data11; + b[ 3] = data15; + b += 4; + } +#endif + } + } + + posY += 4; + js --; + } while (js > 0); + } /* End of main loop */ + + if (n & 2){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data05; + b[ 2] = data02; + b[ 3] = data06; + + ao1 += 2; + ao2 += 2; + b += 4; + + } else + if (X > posY) { + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + + } else { +#ifdef UNIT + data05 = *(ao2 + 0); + + b[ 0] = ONE; + b[ 1] = data05; + b[ 2] = ZERO; + b[ 3] = ONE; +#else + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data05; + b[ 2] = ZERO; + b[ 3] = data06; +#endif + + ao1 += 2 * lda; + ao2 += 2 * lda; + + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i) { + + if (X < posY) { + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); + + b[ 0] = data01; + b[ 1] = data05; + ao1 += 1; + ao2 += 1; + b += 2; + } else + if (X > posY) { + ao1 += lda; + ao2 += lda; + b += 2; + } else { +#ifdef UNIT + data05 = *(ao2 + 0); + b[ 0] = ONE; + b[ 1] = data05; +#else + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); + + b[ 0] = data01; + b[ 1] = data05; +#endif + ao1 += lda; + ao2 += lda; + b += 2; + } + } + + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + } else { + ao1 = a + 
posY + (posX + 0) * lda; + } + + i = m; + if (m > 0) { + do { + if (X < posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + ao1 += 1; + b += 1; + } else + if (X > posY) { + ao1 += lda; + b += 1; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + data01 = *(ao1 + 0); + b[ 0] = data01; +#endif + ao1 += lda; + b += 1; + } + + X += 1; + i --; + } while (i > 0); + } + } + + return 0; +} diff --git a/kernel/generic/trmm_uncopy_8.c b/kernel/generic/trmm_uncopy_8.c new file mode 100644 index 0000000000..4e23ffc693 --- /dev/null +++ b/kernel/generic/trmm_uncopy_8.c @@ -0,0 +1,1226 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT data17, data18, data19, data20, data21, data22, data23, data24; + FLOAT data25, data26, data27, data28, data29, data30, data31, data32; + FLOAT data33, data34, data35, data36, data37, data38, data39, data40; + FLOAT data41, data42, data43, data44, data45, data46, data47, data48; + FLOAT data49, data50, data51, data52, data53, data54, data55, data56; + FLOAT data57, data58, data59, data60, data61, data62, data63, data64; + + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; + + js = (n >> 3); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + ao4 = a + posX + (posY + 3) * lda; + ao5 = a + posX + (posY + 4) * lda; + ao6 = a + posX + (posY + 5) * lda; + ao7 = a + posX + (posY + 6) * lda; + ao8 = a + posX + (posY + 7) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX + 3) * lda; + ao5 = a + posY + (posX + 4) * lda; + ao6 = a + posY + (posX + 5) * lda; + ao7 = a + posY + (posX + 6) * lda; + ao8 = a + posY + (posX + 7) * lda; + } + + i = (m >> 3); + if (i > 0) { + do { + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + data13 = *(ao2 + 4); + data14 = *(ao2 + 5); + data15 = *(ao2 + 6); + data16 = *(ao2 + 7); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + data21 = *(ao3 + 4); + data22 = *(ao3 + 5); + data23 = *(ao3 + 6); + data24 = *(ao3 + 7); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + data28 = *(ao4 + 3); + data29 = *(ao4 + 4); + data30 = *(ao4 + 5); + data31 = *(ao4 + 6); + data32 = *(ao4 + 7); + + data33 = *(ao5 + 0); + data34 = *(ao5 + 1); + data35 = *(ao5 + 2); + data36 = *(ao5 + 3); + data37 = *(ao5 + 4); + data38 = *(ao5 + 5); + data39 = *(ao5 + 6); + data40 = *(ao5 + 7); + + data41 = *(ao6 + 0); + data42 = *(ao6 + 1); + data43 = *(ao6 + 2); + data44 = *(ao6 + 3); + data45 = *(ao6 + 4); + data46 = *(ao6 + 5); + data47 = *(ao6 + 6); + data48 = *(ao6 + 7); + + data49 = *(ao7 + 0); + data50 = *(ao7 + 1); + data51 = *(ao7 + 2); + data52 = *(ao7 + 3); + data53 = *(ao7 + 4); + data54 = *(ao7 + 5); + data55 = *(ao7 + 6); + data56 = *(ao7 + 7); + + data57 = *(ao8 + 0); + data58 = *(ao8 + 1); + data59 = *(ao8 + 2); + data60 = *(ao8 + 3); + data61 = *(ao8 + 4); + data62 = *(ao8 + 5); + data63 = *(ao8 + 6); + data64 = *(ao8 + 7); + + b[ 0] = data01; + b[ 1] = data09; + b[ 2] = data17; + b[ 3] = data25; + b[ 4] = data33; + b[ 5] = data41; + b[ 6] = data49; + b[ 7] = data57; + + b[ 8] = data02; + b[ 9] = data10; + b[10] = data18; + b[11] = data26; + b[12] = data34; + b[13] = data42; + b[14] = data50; + b[15] = data58; + + b[16] = data03; + b[17] = data11; + b[18] = data19; + b[19] = data27; + b[20] = data35; + b[21] = data43; + b[22] = data51; + b[23] = data59; + + 
b[24] = data04; + b[25] = data12; + b[26] = data20; + b[27] = data28; + b[28] = data36; + b[29] = data44; + b[30] = data52; + b[31] = data60; + + b[32] = data05; + b[33] = data13; + b[34] = data21; + b[35] = data29; + b[36] = data37; + b[37] = data45; + b[38] = data53; + b[39] = data61; + + b[40] = data06; + b[41] = data14; + b[42] = data22; + b[43] = data30; + b[44] = data38; + b[45] = data46; + b[46] = data54; + b[47] = data62; + + b[48] = data07; + b[49] = data15; + b[50] = data23; + b[51] = data31; + b[52] = data39; + b[53] = data47; + b[54] = data55; + b[55] = data63; + + b[56] = data08; + b[57] = data16; + b[58] = data24; + b[59] = data32; + b[60] = data40; + b[61] = data48; + b[62] = data56; + b[63] = data64; + + ao1 += 8; + ao2 += 8; + ao3 += 8; + ao4 += 8; + ao5 += 8; + ao6 += 8; + ao7 += 8; + ao8 += 8; + + b += 64; + + } else + if (X > posY) { + ao1 += 8 * lda; + ao2 += 8 * lda; + ao3 += 8 * lda; + ao4 += 8 * lda; + ao5 += 8 * lda; + ao6 += 8 * lda; + ao7 += 8 * lda; + ao8 += 8 * lda; + + b += 64; + + } else { + +#ifndef UNIT + data01 = *(ao1 + 0); +#endif + + data09 = *(ao2 + 0); +#ifndef UNIT + data10 = *(ao2 + 1); +#endif + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); +#ifndef UNIT + data19 = *(ao3 + 2); +#endif + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); +#ifndef UNIT + data28 = *(ao4 + 3); +#endif + + data33 = *(ao5 + 0); + data34 = *(ao5 + 1); + data35 = *(ao5 + 2); + data36 = *(ao5 + 3); +#ifndef UNIT + data37 = *(ao5 + 4); +#endif + + data41 = *(ao6 + 0); + data42 = *(ao6 + 1); + data43 = *(ao6 + 2); + data44 = *(ao6 + 3); + data45 = *(ao6 + 4); +#ifndef UNIT + data46 = *(ao6 + 5); +#endif + + data49 = *(ao7 + 0); + data50 = *(ao7 + 1); + data51 = *(ao7 + 2); + data52 = *(ao7 + 3); + data53 = *(ao7 + 4); + data54 = *(ao7 + 5); +#ifndef UNIT + data55 = *(ao7 + 6); +#endif + + data57 = *(ao8 + 0); + data58 = *(ao8 + 1); + data59 = *(ao8 + 2); + data60 = *(ao8 + 3); + data61 = *(ao8 + 4); + data62 = *(ao8 + 5); + data63 = *(ao8 + 6); +#ifndef UNIT + data64 = *(ao8 + 7); +#endif + + +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = data01; +#endif + b[ 1] = data09; + b[ 2] = data17; + b[ 3] = data25; + b[ 4] = data33; + b[ 5] = data41; + b[ 6] = data49; + b[ 7] = data57; + + b[ 8] = ZERO; +#ifdef UNIT + b[ 9] = ONE; +#else + b[ 9] = data10; +#endif + b[10] = data18; + b[11] = data26; + b[12] = data34; + b[13] = data42; + b[14] = data50; + b[15] = data58; + + b[16] = ZERO; + b[17] = ZERO; +#ifdef UNIT + b[18] = ONE; +#else + b[18] = data19; +#endif + b[19] = data27; + b[20] = data35; + b[21] = data43; + b[22] = data51; + b[23] = data59; + + b[24] = ZERO; + b[25] = ZERO; + b[26] = ZERO; +#ifdef UNIT + b[27] = ONE; +#else + b[27] = data28; +#endif + b[28] = data36; + b[29] = data44; + b[30] = data52; + b[31] = data60; + + b[32] = ZERO; + b[33] = ZERO; + b[34] = ZERO; + b[35] = ZERO; +#ifdef UNIT + b[36] = ONE; +#else + b[36] = data37; +#endif + b[37] = data45; + b[38] = data53; + b[39] = data61; + + b[40] = ZERO; + b[41] = ZERO; + b[42] = ZERO; + b[43] = ZERO; + b[44] = ZERO; +#ifdef UNIT + b[45] = ONE; +#else + b[45] = data46; +#endif + b[46] = data54; + b[47] = data62; + + b[48] = ZERO; + b[49] = ZERO; + b[50] = ZERO; + b[51] = ZERO; + b[52] = ZERO; + b[53] = ZERO; +#ifdef UNIT + b[54] = ONE; +#else + b[54] = data55; +#endif + b[55] = data63; + + b[56] = ZERO; + b[57] = ZERO; + b[58] = ZERO; + b[59] = ZERO; + b[60] = ZERO; + b[61] = ZERO; + b[62] = ZERO; +#ifdef UNIT + b[63] = ONE; +#else + b[63] = data64; +#endif + + ao1 += 8 * lda; + ao2 += 8 * lda; + 
ao3 += 8 * lda; + ao4 += 8 * lda; + ao5 += 8 * lda; + ao6 += 8 * lda; + ao7 += 8 * lda; + ao8 += 8 * lda; + + b += 64; + } + + X += 8; + i --; + } while (i > 0); + } + + i = (m & 7); + if (i) { + + if (X < posY) { + + if (m & 4) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + data28 = *(ao4 + 3); + + data33 = *(ao5 + 0); + data34 = *(ao5 + 1); + data35 = *(ao5 + 2); + data36 = *(ao5 + 3); + + data41 = *(ao6 + 0); + data42 = *(ao6 + 1); + data43 = *(ao6 + 2); + data44 = *(ao6 + 3); + + data49 = *(ao7 + 0); + data50 = *(ao7 + 1); + data51 = *(ao7 + 2); + data52 = *(ao7 + 3); + + data57 = *(ao8 + 0); + data58 = *(ao8 + 1); + data59 = *(ao8 + 2); + data60 = *(ao8 + 3); + + b[ 0] = data01; + b[ 1] = data09; + b[ 2] = data17; + b[ 3] = data25; + b[ 4] = data33; + b[ 5] = data41; + b[ 6] = data49; + b[ 7] = data57; + + b[ 8] = data02; + b[ 9] = data10; + b[10] = data18; + b[11] = data26; + b[12] = data34; + b[13] = data42; + b[14] = data50; + b[15] = data58; + + b[16] = data03; + b[17] = data11; + b[18] = data19; + b[19] = data27; + b[20] = data35; + b[21] = data43; + b[22] = data51; + b[23] = data59; + + b[24] = data04; + b[25] = data12; + b[26] = data20; + b[27] = data28; + b[28] = data36; + b[29] = data44; + b[30] = data52; + b[31] = data60; + + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + ao5 += 4; + ao6 += 4; + ao7 += 4; + ao8 += 4; + + b += 32; + } + + if (m & 2) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + + data33 = *(ao5 + 0); + data34 = *(ao5 + 1); + + data41 = *(ao6 + 0); + data42 = *(ao6 + 1); + + data49 = *(ao7 + 0); + data50 = *(ao7 + 1); + + data57 = *(ao8 + 0); + data58 = *(ao8 + 1); + + b[ 0] = data01; + b[ 1] = data09; + b[ 2] = data17; + b[ 3] = data25; + b[ 4] = data33; + b[ 5] = data41; + b[ 6] = data49; + b[ 7] = data57; + + b[ 8] = data02; + b[ 9] = data10; + b[10] = data18; + b[11] = data26; + b[12] = data34; + b[13] = data42; + b[14] = data50; + b[15] = data58; + + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + ao5 += 2; + ao6 += 2; + ao7 += 2; + ao8 += 2; + + b += 16; + } + + if (m & 1) { + data01 = *(ao1 + 0); + data09 = *(ao2 + 0); + data17 = *(ao3 + 0); + data25 = *(ao4 + 0); + data33 = *(ao5 + 0); + data41 = *(ao6 + 0); + data49 = *(ao7 + 0); + data57 = *(ao8 + 0); + + b[ 0] = data01; + b[ 1] = data09; + b[ 2] = data17; + b[ 3] = data25; + b[ 4] = data33; + b[ 5] = data41; + b[ 6] = data49; + b[ 7] = data57; + + b += 8; + } + } else + if (X > posY) { + if (m & 4) { + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + + b += 32; + } + + if (m & 2) { + ao1 += 2 * lda; + b += 16; + } + + if (m & 1) { + b += 8; + } + } else { + +#ifndef UNIT + data01 = *(ao1 + 0); +#endif + data09 = *(ao2 + 0); + data17 = *(ao3 + 0); + data25 = *(ao4 + 0); + data33 = *(ao5 + 0); + data41 = *(ao6 + 0); + data49 = *(ao7 + 0); + data57 = *(ao8 + 0); + + if (i >= 2) { +#ifndef UNIT + data10 = *(ao2 + 1); +#endif + data18 = *(ao3 + 1); + data26 = *(ao4 + 1); + data34 = *(ao5 + 1); + data42 = *(ao6 + 1); + data50 = *(ao7 + 1); + data58 = *(ao8 + 1); + } + + if (i >= 3) { +#ifndef UNIT + data19 = *(ao3 + 2); +#endif + 
data27 = *(ao4 + 2); + data35 = *(ao5 + 2); + data43 = *(ao6 + 2); + data51 = *(ao7 + 2); + data59 = *(ao8 + 2); + } + + if (i >= 4) { +#ifndef UNIT + data28 = *(ao4 + 3); +#endif + data36 = *(ao5 + 3); + data44 = *(ao6 + 3); + data52 = *(ao7 + 3); + data60 = *(ao8 + 3); + } + + if (i >= 5) { +#ifndef UNIT + data37 = *(ao5 + 4); +#endif + data45 = *(ao6 + 4); + data53 = *(ao7 + 4); + data61 = *(ao8 + 4); + } + + if (i >= 6) { +#ifndef UNIT + data46 = *(ao6 + 5); +#endif + data54 = *(ao7 + 5); + data62 = *(ao8 + 5); + } + + if (i >= 7) { +#ifndef UNIT + data55 = *(ao7 + 6); +#endif + data63 = *(ao8 + 6); + } + +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = data01; +#endif + b[ 1] = data09; + b[ 2] = data17; + b[ 3] = data25; + b[ 4] = data33; + b[ 5] = data41; + b[ 6] = data49; + b[ 7] = data57; + b += 8; + + if(i >= 2) { + b[ 0] = ZERO; +#ifdef UNIT + b[ 1] = ONE; +#else + b[ 1] = data10; +#endif + b[ 2] = data18; + b[ 3] = data26; + b[ 4] = data34; + b[ 5] = data42; + b[ 6] = data50; + b[ 7] = data58; + b += 8; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; +#else + b[ 2] = data19; +#endif + b[ 3] = data27; + b[ 4] = data35; + b[ 5] = data43; + b[ 6] = data51; + b[ 7] = data59; + b += 8; + } + + if (i >= 4) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; +#ifdef UNIT + b[ 3] = ONE; +#else + b[ 3] = data28; +#endif + b[ 4] = data36; + b[ 5] = data44; + b[ 6] = data52; + b[ 7] = data60; + b += 8; + } + + if (i >= 5) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; +#ifdef UNIT + b[ 4] = ONE; +#else + b[ 4] = data37; +#endif + b[ 5] = data45; + b[ 6] = data53; + b[ 7] = data61; + b += 8; + } + + if (i >= 6) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; +#ifdef UNIT + b[ 5] = ONE; +#else + b[ 5] = data46; +#endif + b[ 6] = data54; + b[ 7] = data62; + b += 8; + } + + if (i >= 7) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; +#ifdef UNIT + b[ 6] = ONE; +#else + b[ 6] = data55; +#endif + b[ 7] = data63; + b += 8; + } + } + } + + posY += 8; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 4){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + ao4 = a + posX + (posY + 3) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + data28 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = data09; + b[ 2] = data17; + b[ 3] = data25; + + b[ 4] = data02; + b[ 5] = data10; + b[ 6] = data18; + b[ 7] = data26; + + b[ 8] = data03; + b[ 9] = data11; + b[10] = data19; + b[11] = data27; + + b[12] = data04; + b[13] = data12; + b[14] = data20; + b[15] = data28; + + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + + b += 16; + + } else + if (X > posY) { + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + b += 16; + + } else { + +#ifdef UNIT + data09 = *(ao2 + 0); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + + data25 = *(ao4 + 
0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + + b[ 0] = ONE; + b[ 1] = data09; + b[ 2] = data17; + b[ 3] = data25; + + b[ 4] = ZERO; + b[ 5] = ONE; + b[ 6] = data18; + b[ 7] = data26; + + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ONE; + b[11] = data27; + + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ONE; +#else + data01 = *(ao1 + 0); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data19 = *(ao3 + 2); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + data28 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = data09; + b[ 2] = data17; + b[ 3] = data25; + + b[ 4] = ZERO; + b[ 5] = data10; + b[ 6] = data18; + b[ 7] = data26; + + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = data19; + b[11] = data27; + + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = data28; +#endif + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + + b += 16; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X < posY) { + + if (m & 2) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + + b[ 0] = data01; + b[ 1] = data09; + b[ 2] = data17; + b[ 3] = data25; + + b[ 4] = data02; + b[ 5] = data10; + b[ 6] = data18; + b[ 7] = data26; + + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + + b += 8; + } + + if (m & 1) { + data01 = *(ao1 + 0); + data09 = *(ao2 + 0); + data17 = *(ao3 + 0); + data25 = *(ao4 + 0); + + b[ 0] = data01; + b[ 1] = data09; + b[ 2] = data17; + b[ 3] = data25; + + b += 4; + } + } else + if (X > posY) { + if (m & 2) { + ao1 += 2 * lda; + b += 8; + } + + if (m & 1) { + b += 4; + } + } else { + +#ifndef UNIT + data01 = *(ao1 + 0); +#endif + data09 = *(ao2 + 0); + data17 = *(ao3 + 0); + data25 = *(ao4 + 0); + + if (i >= 2) { +#ifndef UNIT + data10 = *(ao2 + 1); +#endif + data18 = *(ao3 + 1); + data26 = *(ao4 + 1); + } + + if (i >= 3) { +#ifndef UNIT + data19 = *(ao3 + 2); +#endif + data27 = *(ao4 + 2); + } + +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = data01; +#endif + b[ 1] = data09; + b[ 2] = data17; + b[ 3] = data25; + b += 4; + + if(i >= 2) { + b[ 0] = ZERO; +#ifdef UNIT + b[ 1] = ONE; +#else + b[ 1] = data10; +#endif + b[ 2] = data18; + b[ 3] = data26; + b += 4; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; +#else + b[ 2] = data19; +#endif + b[ 3] = data27; + b += 4; + } + } + } + + posY += 4; + } + + if (n & 2){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data09; + b[ 2] = data02; + b[ 3] = data10; + + ao1 += 2; + ao2 += 2; + b += 4; + + } else + if (X > posY) { + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + + } else { + +#ifdef UNIT + data09 = *(ao2 + 0); + + b[ 0] = ONE; + b[ 1] = data09; + b[ 2] = ZERO; + b[ 3] = ONE; +#else + data01 = *(ao1 + 0); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data09; + b[ 2] = ZERO; + b[ 3] = data10; +#endif + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + if (m & 1) { + + if (X < posY) { + data01 = *(ao1 + 0); + data09 = *(ao2 + 0); + + b[ 0] = 
data01; + b[ 1] = data09; + b += 2; + } else + if (X > posY) { + b += 2; + } else { +#ifdef UNIT + data09 = *(ao2 + 0); + b[ 0] = ONE; + b[ 1] = data09; +#else + data01 = *(ao1 + 0); + data09 = *(ao2 + 0); + b[ 0] = data01; + b[ 1] = data09; +#endif + b += 2; + } + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + } + + i = m; + if (m > 0) { + do { + if (X < posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + ao1 += 1; + b += 1; + } else + if (X > posY) { + ao1 += lda; + b += 1; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + data01 = *(ao1 + 0); + b[ 0] = data01; +#endif + ao1 += lda; + b += 1; + } + + X += 1; + i --; + } while (i > 0); + } + } + + return 0; +} diff --git a/kernel/generic/trmm_utcopy_1.c b/kernel/generic/trmm_utcopy_1.c new file mode 100644 index 0000000000..92f2da3da4 --- /dev/null +++ b/kernel/generic/trmm_utcopy_1.c @@ -0,0 +1,90 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, X; + + FLOAT data01; + FLOAT *ao1; + + while (n > 0) { + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + } + + i = m; + if (m > 0) { + do { + if (X < posY) { + b += 1; + ao1 += 1; + } else + if (X > posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + b += 1; + ao1 += lda; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + data01 = *(ao1 + 0); + b[ 0] = data01; +#endif + b += 1; + ao1 += lda; + } + + X += 1; + i --; + } while (i > 0); + } + posY ++; + n --; + } + + return 0; +} diff --git a/kernel/generic/trmm_utcopy_16.c b/kernel/generic/trmm_utcopy_16.c new file mode 100644 index 0000000000..a964cd3544 --- /dev/null +++ b/kernel/generic/trmm_utcopy_16.c @@ -0,0 +1,1550 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
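The trmm_utcopy kernels handle the transposed access to the same upper triangular operand: here a row block above the current column position is skipped, a row block below it reads the upper-triangle entry through the transpose, i.e. A(col, row), walking a with stride lda, and the diagonal block is stored transposed with zeros after each diagonal element. An element-wise sketch under the same assumptions as before (hypothetical name and types, zero-fill instead of skipping, not part of the patch):

/* Reference sketch of the upper, transposed TRMM packing (hypothetical names). */
void trmm_utcopy_ref(long m, long n, const double *a, long lda,
                     long posX, long posY, double *b, int unit)
{
  for (long i = 0; i < m; i++) {
    for (long j = 0; j < n; j++) {
      long row = posX + i;
      long col = posY + j;

      if (row > col)                    /* read A(col, row): upper triangle  */
        b[i * n + j] = a[col + row * lda];
      else if (row < col)               /* skipped by the real kernels       */
        b[i * n + j] = 0.0;
      else                              /* diagonal element                  */
        b[i * n + j] = unit ? 1.0 : a[col + row * lda];
    }
  }
}

In both families the positions that the copy routines leave unwritten are never filled with data, which indicates the downstream TRMM kernels do not read them; the explicit ZERO stores only appear inside the diagonal block.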
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, ii; + BLASLONG X; + + FLOAT *a01, *a02, *a03 ,*a04, *a05, *a06, *a07, *a08; + FLOAT *a09, *a10, *a11, *a12, *a13, *a14, *a15, *a16; + + js = (n >> 4); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + a01 = a + posX + (posY + 0) * lda; + a02 = a + posX + (posY + 1) * lda; + a03 = a + posX + (posY + 2) * lda; + a04 = a + posX + (posY + 3) * lda; + a05 = a + posX + (posY + 4) * lda; + a06 = a + posX + (posY + 5) * lda; + a07 = a + posX + (posY + 6) * lda; + a08 = a + posX + (posY + 7) * lda; + a09 = a + posX + (posY + 8) * lda; + a10 = a + posX + (posY + 9) * lda; + a11 = a + posX + (posY + 10) * lda; + a12 = a + posX + (posY + 11) * lda; + a13 = a + posX + (posY + 12) * lda; + a14 = a + posX + (posY + 13) * lda; + a15 = a + posX + (posY + 14) * lda; + a16 = a + posX + (posY + 15) * lda; + } else { + a01 = a + posY + (posX + 0) * lda; + a02 = a + posY + (posX + 1) * lda; + a03 = a + posY + (posX + 2) * lda; + a04 = a + posY + (posX + 3) * lda; + a05 = a + posY + (posX + 4) * lda; + a06 = a + posY + (posX + 5) * lda; + a07 = a + posY + (posX + 6) * lda; + a08 = a + posY + (posX + 7) * lda; + a09 = a + posY + (posX + 8) * lda; + a10 = a + posY + (posX + 9) * lda; + a11 = a + posY + (posX + 10) * lda; + a12 = a + posY + (posX + 11) * lda; + a13 = a + posY + (posX + 12) * lda; + a14 = a + posY + (posX + 13) * lda; + a15 = a + posY + (posX + 14) * lda; + a16 = a + posY + (posX + 15) * lda; + } + + i = (m >> 4); + if (i > 0) { + do { + if (X < posY) { + a01 += 16; + a02 += 16; + a03 += 16; + a04 += 16; + a05 += 16; + a06 += 16; + a07 += 16; + a08 += 16; + a09 += 16; + a10 += 16; + a11 += 16; + a12 += 16; + a13 += 16; + a14 += 16; + a15 += 16; + a16 += 16; + b += 256; + } else + if (X > posY) { + + for (ii = 0; ii < 16; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + a01 += lda; + b += 16; + } + + a02 += 16 * lda; + a03 += 16 * lda; + a04 += 16 * lda; + a05 += 16 * lda; + a06 += 16 * lda; + a07 += 16 * lda; + a08 += 16 * lda; + a09 += 16 * lda; + a10 += 16 * lda; + a11 += 16 * lda; + a12 += 16 * lda; + a13 += 16 * lda; + a14 += 16 * lda; + a15 += 16 * lda; + a16 += 16 * lda; + + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + + b[ 16] = *(a02 + 0); +#ifdef UNIT + b[ 17] = ONE; +#else + b[ 17] = *(a02 + 1); +#endif + b[ 18] = ZERO; + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + + b[ 32] = *(a03 + 0); + b[ 33] = *(a03 + 1); +#ifdef UNIT + b[ 34] = ONE; +#else + b[ 34] = *(a03 + 2); +#endif + b[ 35] = ZERO; + b[ 36] = ZERO; + b[ 37] = ZERO; + b[ 38] = ZERO; + b[ 39] = ZERO; + b[ 40] = ZERO; + b[ 
41] = ZERO; + b[ 42] = ZERO; + b[ 43] = ZERO; + b[ 44] = ZERO; + b[ 45] = ZERO; + b[ 46] = ZERO; + b[ 47] = ZERO; + + b[ 48] = *(a04 + 0); + b[ 49] = *(a04 + 1); + b[ 50] = *(a04 + 2); +#ifdef UNIT + b[ 51] = ONE; +#else + b[ 51] = *(a04 + 3); +#endif + b[ 52] = ZERO; + b[ 53] = ZERO; + b[ 54] = ZERO; + b[ 55] = ZERO; + b[ 56] = ZERO; + b[ 57] = ZERO; + b[ 58] = ZERO; + b[ 59] = ZERO; + b[ 60] = ZERO; + b[ 61] = ZERO; + b[ 62] = ZERO; + b[ 63] = ZERO; + + b[ 64] = *(a05 + 0); + b[ 65] = *(a05 + 1); + b[ 66] = *(a05 + 2); + b[ 67] = *(a05 + 3); +#ifdef UNIT + b[ 68] = ONE; +#else + b[ 68] = *(a05 + 4); +#endif + b[ 69] = ZERO; + b[ 70] = ZERO; + b[ 71] = ZERO; + b[ 72] = ZERO; + b[ 73] = ZERO; + b[ 74] = ZERO; + b[ 75] = ZERO; + b[ 76] = ZERO; + b[ 77] = ZERO; + b[ 78] = ZERO; + b[ 79] = ZERO; + + b[ 80] = *(a06 + 0); + b[ 81] = *(a06 + 1); + b[ 82] = *(a06 + 2); + b[ 83] = *(a06 + 3); + b[ 84] = *(a06 + 4); +#ifdef UNIT + b[ 85] = ONE; +#else + b[ 85] = *(a06 + 5); +#endif + b[ 86] = ZERO; + b[ 87] = ZERO; + b[ 88] = ZERO; + b[ 89] = ZERO; + b[ 90] = ZERO; + b[ 91] = ZERO; + b[ 92] = ZERO; + b[ 93] = ZERO; + b[ 94] = ZERO; + b[ 95] = ZERO; + + b[ 96] = *(a07 + 0); + b[ 97] = *(a07 + 1); + b[ 98] = *(a07 + 2); + b[ 99] = *(a07 + 3); + b[100] = *(a07 + 4); + b[101] = *(a07 + 5); +#ifdef UNIT + b[102] = ONE; +#else + b[102] = *(a07 + 6); +#endif + b[103] = ZERO; + b[104] = ZERO; + b[105] = ZERO; + b[106] = ZERO; + b[107] = ZERO; + b[108] = ZERO; + b[109] = ZERO; + b[110] = ZERO; + b[111] = ZERO; + + b[112] = *(a08 + 0); + b[113] = *(a08 + 1); + b[114] = *(a08 + 2); + b[115] = *(a08 + 3); + b[116] = *(a08 + 4); + b[117] = *(a08 + 5); + b[118] = *(a08 + 6); +#ifdef UNIT + b[119] = ONE; +#else + b[119] = *(a08 + 7); +#endif + b[120] = ZERO; + b[121] = ZERO; + b[122] = ZERO; + b[123] = ZERO; + b[124] = ZERO; + b[125] = ZERO; + b[126] = ZERO; + b[127] = ZERO; + + b[128] = *(a09 + 0); + b[129] = *(a09 + 1); + b[130] = *(a09 + 2); + b[131] = *(a09 + 3); + b[132] = *(a09 + 4); + b[133] = *(a09 + 5); + b[134] = *(a09 + 6); + b[135] = *(a09 + 7); +#ifdef UNIT + b[136] = ONE; +#else + b[136] = *(a09 + 8); +#endif + b[137] = ZERO; + b[138] = ZERO; + b[139] = ZERO; + b[140] = ZERO; + b[141] = ZERO; + b[142] = ZERO; + b[143] = ZERO; + + b[144] = *(a10 + 0); + b[145] = *(a10 + 1); + b[146] = *(a10 + 2); + b[147] = *(a10 + 3); + b[148] = *(a10 + 4); + b[149] = *(a10 + 5); + b[150] = *(a10 + 6); + b[151] = *(a10 + 7); + b[152] = *(a10 + 8); +#ifdef UNIT + b[153] = ONE; +#else + b[153] = *(a10 + 9); +#endif + b[154] = ZERO; + b[155] = ZERO; + b[156] = ZERO; + b[157] = ZERO; + b[158] = ZERO; + b[159] = ZERO; + + b[160] = *(a11 + 0); + b[161] = *(a11 + 1); + b[162] = *(a11 + 2); + b[163] = *(a11 + 3); + b[164] = *(a11 + 4); + b[165] = *(a11 + 5); + b[166] = *(a11 + 6); + b[167] = *(a11 + 7); + b[168] = *(a11 + 8); + b[169] = *(a11 + 9); +#ifdef UNIT + b[170] = ONE; +#else + b[170] = *(a11 + 10); +#endif + b[171] = ZERO; + b[172] = ZERO; + b[173] = ZERO; + b[174] = ZERO; + b[175] = ZERO; + + b[176] = *(a12 + 0); + b[177] = *(a12 + 1); + b[178] = *(a12 + 2); + b[179] = *(a12 + 3); + b[180] = *(a12 + 4); + b[181] = *(a12 + 5); + b[182] = *(a12 + 6); + b[183] = *(a12 + 7); + b[184] = *(a12 + 8); + b[185] = *(a12 + 9); + b[186] = *(a12 + 10); +#ifdef UNIT + b[187] = ONE; +#else + b[187] = *(a12 + 11); +#endif + b[188] = ZERO; + b[189] = ZERO; + b[190] = ZERO; + b[191] = ZERO; + + b[192] = *(a13 + 0); + b[193] = *(a13 + 1); + b[194] = *(a13 + 2); + b[195] = *(a13 + 3); + b[196] = *(a13 + 4); + b[197] = *(a13 + 5); + 
b[198] = *(a13 + 6); + b[199] = *(a13 + 7); + b[200] = *(a13 + 8); + b[201] = *(a13 + 9); + b[202] = *(a13 + 10); + b[203] = *(a13 + 11); +#ifdef UNIT + b[204] = ONE; +#else + b[204] = *(a13 + 12); +#endif + b[205] = ZERO; + b[206] = ZERO; + b[207] = ZERO; + + b[208] = *(a14 + 0); + b[209] = *(a14 + 1); + b[210] = *(a14 + 2); + b[211] = *(a14 + 3); + b[212] = *(a14 + 4); + b[213] = *(a14 + 5); + b[214] = *(a14 + 6); + b[215] = *(a14 + 7); + b[216] = *(a14 + 8); + b[217] = *(a14 + 9); + b[218] = *(a14 + 10); + b[219] = *(a14 + 11); + b[220] = *(a14 + 12); +#ifdef UNIT + b[221] = ONE; +#else + b[221] = *(a14 + 13); +#endif + b[222] = ZERO; + b[223] = ZERO; + + b[224] = *(a15 + 0); + b[225] = *(a15 + 1); + b[226] = *(a15 + 2); + b[227] = *(a15 + 3); + b[228] = *(a15 + 4); + b[229] = *(a15 + 5); + b[230] = *(a15 + 6); + b[231] = *(a15 + 7); + b[232] = *(a15 + 8); + b[233] = *(a15 + 9); + b[234] = *(a15 + 10); + b[235] = *(a15 + 11); + b[236] = *(a15 + 12); + b[237] = *(a15 + 13); +#ifdef UNIT + b[238] = ONE; +#else + b[238] = *(a15 + 14); +#endif + b[239] = ZERO; + + b[240] = *(a16 + 0); + b[241] = *(a16 + 1); + b[242] = *(a16 + 2); + b[243] = *(a16 + 3); + b[244] = *(a16 + 4); + b[245] = *(a16 + 5); + b[246] = *(a16 + 6); + b[247] = *(a16 + 7); + b[248] = *(a16 + 8); + b[249] = *(a16 + 9); + b[250] = *(a16 + 10); + b[251] = *(a16 + 11); + b[252] = *(a16 + 12); + b[253] = *(a16 + 13); + b[254] = *(a16 + 14); +#ifdef UNIT + b[255] = ONE; +#else + b[255] = *(a16 + 15); +#endif + + a01 += 16 * lda; + a02 += 16 * lda; + a03 += 16 * lda; + a04 += 16 * lda; + a05 += 16 * lda; + a06 += 16 * lda; + a07 += 16 * lda; + a08 += 16 * lda; + a09 += 16 * lda; + a10 += 16 * lda; + a11 += 16 * lda; + a12 += 16 * lda; + a13 += 16 * lda; + a14 += 16 * lda; + a15 += 16 * lda; + a16 += 16 * lda; + + b += 256; + } + + X += 16; + i --; + } while (i > 0); + } + + i = (m & 15); + if (i > 0) { + if (X < posY) { + a01 += i; + a02 += i; + a03 += i; + a04 += i; + a05 += i; + a06 += i; + a07 += i; + a08 += i; + a09 += i; + a10 += i; + a11 += i; + a12 += i; + a13 += i; + a14 += i; + a15 += i; + a16 += i; + b += 16 * i; + } else + if (X > posY) { + + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + a01 += lda; + a02 += lda; + a03 += lda; + a04 += lda; + a05 += lda; + a06 += lda; + a07 += lda; + a08 += lda; + a09 += lda; + a10 += lda; + a11 += lda; + a12 += lda; + a13 += lda; + a14 += lda; + a15 += lda; + a16 += lda; + b += 16; + } + + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + + if (i >= 2) { + b[ 0] = *(a02 + 0); +#ifdef UNIT + b[ 1] = ONE; +#else + b[ 1] = *(a02 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 3) { + b[ 0] = *(a03 + 0); + b[ 1] = *(a03 + 1); +#ifdef UNIT + b[ 2] = ONE; +#else + b[ 
2] = *(a03 + 2); +#endif + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 4) { + b[ 0] = *(a04 + 0); + b[ 1] = *(a04 + 1); + b[ 2] = *(a04 + 2); +#ifdef UNIT + b[ 3] = ONE; +#else + b[ 3] = *(a04 + 3); +#endif + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b[16] = ZERO; + b += 16; + } + + + if (i >= 5) { + b[ 0] = *(a05 + 0); + b[ 1] = *(a05 + 1); + b[ 2] = *(a05 + 2); + b[ 3] = *(a05 + 3); +#ifdef UNIT + b[ 4] = ONE; +#else + b[ 4] = *(a05 + 4); +#endif + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 6) { + b[ 0] = *(a06 + 0); + b[ 1] = *(a06 + 1); + b[ 2] = *(a06 + 2); + b[ 3] = *(a06 + 3); + b[ 4] = *(a06 + 4); +#ifdef UNIT + b[ 5] = ONE; +#else + b[ 5] = *(a06 + 5); +#endif + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 7) { + b[ 0] = *(a07 + 0); + b[ 1] = *(a07 + 1); + b[ 2] = *(a07 + 2); + b[ 3] = *(a07 + 3); + b[ 4] = *(a07 + 4); + b[ 5] = *(a07 + 5); +#ifdef UNIT + b[ 6] = ONE; +#else + b[ 6] = *(a07 + 6); +#endif + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 8) { + b[ 0] = *(a08 + 0); + b[ 1] = *(a08 + 1); + b[ 2] = *(a08 + 2); + b[ 3] = *(a08 + 3); + b[ 4] = *(a08 + 4); + b[ 5] = *(a08 + 5); + b[ 6] = *(a08 + 6); +#ifdef UNIT + b[ 7] = ONE; +#else + b[ 7] = *(a08 + 7); +#endif + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b[16] = ZERO; + b += 16; + } + + if (i >= 9) { + b[ 0] = *(a09 + 0); + b[ 1] = *(a09 + 1); + b[ 2] = *(a09 + 2); + b[ 3] = *(a09 + 3); + b[ 4] = *(a09 + 4); + b[ 5] = *(a09 + 5); + b[ 6] = *(a09 + 6); + b[ 7] = *(a09 + 7); +#ifdef UNIT + b[ 8] = ONE; +#else + b[ 8] = *(a09 + 8); +#endif + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 10) { + b[ 0] = *(a10 + 0); + b[ 1] = *(a10 + 1); + b[ 2] = *(a10 + 2); + b[ 3] = *(a10 + 3); + b[ 4] = *(a10 + 4); + b[ 5] = *(a10 + 5); + b[ 6] = *(a10 + 6); + b[ 7] = *(a10 + 7); + b[ 8] = *(a10 + 8); +#ifdef UNIT + b[ 9] = ONE; +#else + b[ 9] = *(a10 + 9); +#endif + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 11) { + b[ 0] = *(a11 + 0); + b[ 1] = *(a11 + 1); + b[ 2] = *(a11 + 2); + b[ 3] = *(a11 + 3); + b[ 4] = *(a11 + 4); + b[ 5] = *(a11 + 5); + b[ 6] = *(a11 + 6); + b[ 7] = *(a11 + 7); + b[ 8] = *(a11 + 8); + b[ 9] = *(a11 + 9); +#ifdef UNIT + b[10] = ONE; +#else + b[10] = *(a11 + 10); +#endif + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 12) { + b[ 0] = *(a12 + 0); + b[ 1] = *(a12 + 1); + b[ 2] = *(a12 + 2); + b[ 3] = *(a12 + 3); + b[ 4] = *(a12 + 4); + b[ 5] = *(a12 + 5); + b[ 6] = *(a12 + 6); + b[ 7] = *(a12 + 7); + b[ 8] = *(a12 + 8); + b[ 9] = *(a12 + 9); + b[10] = 
*(a12 + 10); +#ifdef UNIT + b[11] = ONE; +#else + b[11] = *(a12 + 11); +#endif + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 13) { + b[ 0] = *(a13 + 0); + b[ 1] = *(a13 + 1); + b[ 2] = *(a13 + 2); + b[ 3] = *(a13 + 3); + b[ 4] = *(a13 + 4); + b[ 5] = *(a13 + 5); + b[ 6] = *(a13 + 6); + b[ 7] = *(a13 + 7); + b[ 8] = *(a13 + 8); + b[ 9] = *(a13 + 9); + b[10] = *(a13 + 10); + b[11] = *(a13 + 11); +#ifdef UNIT + b[12] = ONE; +#else + b[12] = *(a13 + 12); +#endif + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 14) { + b[ 0] = *(a14 + 0); + b[ 1] = *(a14 + 1); + b[ 2] = *(a14 + 2); + b[ 3] = *(a14 + 3); + b[ 4] = *(a14 + 4); + b[ 5] = *(a14 + 5); + b[ 6] = *(a14 + 6); + b[ 7] = *(a14 + 7); + b[ 8] = *(a14 + 8); + b[ 9] = *(a14 + 9); + b[10] = *(a14 + 10); + b[11] = *(a14 + 11); + b[12] = *(a14 + 12); +#ifdef UNIT + b[13] = ONE; +#else + b[13] = *(a14 + 13); +#endif + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 15) { + b[ 0] = *(a15 + 0); + b[ 1] = *(a15 + 1); + b[ 2] = *(a15 + 2); + b[ 3] = *(a15 + 3); + b[ 4] = *(a15 + 4); + b[ 5] = *(a15 + 5); + b[ 6] = *(a15 + 6); + b[ 7] = *(a15 + 7); + b[ 8] = *(a15 + 8); + b[ 9] = *(a15 + 9); + b[10] = *(a15 + 10); + b[11] = *(a15 + 11); + b[12] = *(a15 + 12); + b[13] = *(a15 + 13); +#ifdef UNIT + b[14] = ONE; +#else + b[14] = *(a15 + 14); +#endif + b[15] = ZERO; + } + } + } + posY += 16; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 8){ + X = posX; + + if (posX <= posY) { + a01 = a + posX + (posY + 0) * lda; + a02 = a + posX + (posY + 1) * lda; + a03 = a + posX + (posY + 2) * lda; + a04 = a + posX + (posY + 3) * lda; + a05 = a + posX + (posY + 4) * lda; + a06 = a + posX + (posY + 5) * lda; + a07 = a + posX + (posY + 6) * lda; + a08 = a + posX + (posY + 7) * lda; + } else { + a01 = a + posY + (posX + 0) * lda; + a02 = a + posY + (posX + 1) * lda; + a03 = a + posY + (posX + 2) * lda; + a04 = a + posY + (posX + 3) * lda; + a05 = a + posY + (posX + 4) * lda; + a06 = a + posY + (posX + 5) * lda; + a07 = a + posY + (posX + 6) * lda; + a08 = a + posY + (posX + 7) * lda; + } + + i = (m >> 3); + if (i > 0) { + do { + if (X < posY) { + a01 += 8; + a02 += 8; + a03 += 8; + a04 += 8; + a05 += 8; + a06 += 8; + a07 += 8; + a08 += 8; + b += 64; + } else + if (X > posY) { + + for (ii = 0; ii < 8; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + a01 += lda; + b += 8; + } + + a02 += 8 * lda; + a03 += 8 * lda; + a04 += 8 * lda; + a05 += 8 * lda; + a06 += 8 * lda; + a07 += 8 * lda; + a08 += 8 * lda; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = *(a02 + 0); +#ifdef UNIT + b[ 9] = ONE; +#else + b[ 9] = *(a02 + 1); +#endif + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + + b[ 16] = *(a03 + 0); + b[ 17] = *(a03 + 1); +#ifdef UNIT + b[ 18] = ONE; +#else + b[ 18] = *(a03 + 2); +#endif + b[ 19] = ZERO; + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + + b[ 24] = *(a04 + 0); + b[ 25] = *(a04 + 1); + b[ 26] = *(a04 + 2); +#ifdef UNIT + b[ 27] = ONE; +#else + b[ 27] = *(a04 + 3); +#endif + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + + b[ 32] = *(a05 + 0); + b[ 33] = *(a05 + 1); + b[ 34] = 
*(a05 + 2); + b[ 35] = *(a05 + 3); +#ifdef UNIT + b[ 36] = ONE; +#else + b[ 36] = *(a05 + 4); +#endif + b[ 37] = ZERO; + b[ 38] = ZERO; + b[ 39] = ZERO; + + b[ 40] = *(a06 + 0); + b[ 41] = *(a06 + 1); + b[ 42] = *(a06 + 2); + b[ 43] = *(a06 + 3); + b[ 44] = *(a06 + 4); +#ifdef UNIT + b[ 45] = ONE; +#else + b[ 45] = *(a06 + 5); +#endif + b[ 46] = ZERO; + b[ 47] = ZERO; + + b[ 48] = *(a07 + 0); + b[ 49] = *(a07 + 1); + b[ 50] = *(a07 + 2); + b[ 51] = *(a07 + 3); + b[ 52] = *(a07 + 4); + b[ 53] = *(a07 + 5); +#ifdef UNIT + b[ 54] = ONE; +#else + b[ 54] = *(a07 + 6); +#endif + b[ 55] = ZERO; + + b[ 56] = *(a08 + 0); + b[ 57] = *(a08 + 1); + b[ 58] = *(a08 + 2); + b[ 59] = *(a08 + 3); + b[ 60] = *(a08 + 4); + b[ 61] = *(a08 + 5); + b[ 62] = *(a08 + 6); +#ifdef UNIT + b[ 63] = ONE; +#else + b[ 63] = *(a08 + 7); +#endif + + a01 += 8 * lda; + a02 += 8 * lda; + a03 += 8 * lda; + a04 += 8 * lda; + a05 += 8 * lda; + a06 += 8 * lda; + a07 += 8 * lda; + a08 += 8 * lda; + + b += 64; + } + + X += 8; + i --; + } while (i > 0); + } + + i = (m & 7); + if (i > 0) { + if (X < posY) { + a01 += i; + a02 += i; + a03 += i; + a04 += i; + a05 += i; + a06 += i; + a07 += i; + a08 += i; + b += 8 * i; + } else + if (X > posY) { + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + a01 += lda; + b += 8; + } + + a02 += i * lda; + a03 += i * lda; + a04 += i * lda; + a05 += i * lda; + a06 += i * lda; + a07 += i * lda; + a08 += i * lda; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + + if (i >= 2) { + b[ 0] = *(a02 + 0); +#ifdef UNIT + b[ 1] = ONE; +#else + b[ 1] = *(a02 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 3) { + b[ 0] = *(a03 + 0); + b[ 1] = *(a03 + 1); +#ifdef UNIT + b[ 2] = ONE; +#else + b[ 2] = *(a03 + 2); +#endif + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 4) { + b[ 0] = *(a04 + 0); + b[ 1] = *(a04 + 1); + b[ 2] = *(a04 + 2); +#ifdef UNIT + b[ 3] = ONE; +#else + b[ 3] = *(a04 + 3); +#endif + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 5) { + b[ 0] = *(a05 + 0); + b[ 1] = *(a05 + 1); + b[ 2] = *(a05 + 2); + b[ 3] = *(a05 + 3); +#ifdef UNIT + b[ 4] = ONE; +#else + b[ 4] = *(a05 + 4); +#endif + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 6) { + b[ 0] = *(a06 + 0); + b[ 1] = *(a06 + 1); + b[ 2] = *(a06 + 2); + b[ 3] = *(a06 + 3); + b[ 4] = *(a06 + 4); +#ifdef UNIT + b[ 5] = ONE; +#else + b[ 5] = *(a06 + 5); +#endif + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 7) { + b[ 0] = *(a07 + 0); + b[ 1] = *(a07 + 1); + b[ 2] = *(a07 + 2); + b[ 3] = *(a07 + 3); + b[ 4] = *(a07 + 4); + b[ 5] = *(a07 + 5); +#ifdef UNIT + b[ 6] = ONE; +#else + b[ 6] = *(a07 + 6); +#endif + b[ 7] = ZERO; + b += 8; + } + } + } + posY += 8; + } + + if (n & 4){ + X = posX; + + if (posX <= posY) { + a01 = a + posX + (posY + 0) * lda; + a02 = a + posX + (posY + 1) * lda; + a03 = a + posX + (posY + 2) * lda; + a04 = a + posX + (posY + 3) * lda; + } else { + a01 = a + posY + (posX + 0) * lda; + a02 = a + posY + (posX + 1) * lda; + a03 = a + posY + (posX + 2) * lda; + a04 = a + posY + (posX + 3) * lda; + } + + 
i = (m >> 2); + if (i > 0) { + do { + if (X < posY) { + a01 += 4; + a02 += 4; + a03 += 4; + a04 += 4; + b += 16; + } else + if (X > posY) { + + for (ii = 0; ii < 4; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + a01 += lda; + b += 4; + } + + a02 += 4 * lda; + a03 += 4 * lda; + a04 += 4 * lda; + + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + + b[ 4] = *(a02 + 0); +#ifdef UNIT + b[ 5] = ONE; +#else + b[ 5] = *(a02 + 1); +#endif + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = *(a03 + 0); + b[ 9] = *(a03 + 1); +#ifdef UNIT + b[ 10] = ONE; +#else + b[ 10] = *(a03 + 2); +#endif + b[ 11] = ZERO; + + b[ 12] = *(a04 + 0); + b[ 13] = *(a04 + 1); + b[ 14] = *(a04 + 2); +#ifdef UNIT + b[ 15] = ONE; +#else + b[ 15] = *(a04 + 3); +#endif + + a01 += 4 * lda; + a02 += 4 * lda; + a03 += 4 * lda; + a04 += 4 * lda; + b += 16; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i > 0) { + if (X < posY) { + a01 += i; + a02 += i; + a03 += i; + a04 += i; + b += 4 * i; + } else + if (X > posY) { + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + a01 += lda; + b += 4; + } + a02 += lda; + a03 += lda; + a04 += lda; + } else { + +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b += 4; + + if (i >= 2) { + b[ 0] = *(a02 + 0); +#ifdef UNIT + b[ 1] = ONE; +#else + b[ 1] = *(a02 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b += 4; + } + + if (i >= 3) { + b[ 0] = *(a03 + 0); + b[ 1] = *(a03 + 1); +#ifdef UNIT + b[ 2] = ONE; +#else + b[ 2] = *(a03 + 2); +#endif + b[ 3] = ZERO; + b += 4; + } + } + } + posY += 4; + } + + if (n & 2){ + X = posX; + + if (posX <= posY) { + a01 = a + posX + (posY + 0) * lda; + a02 = a + posX + (posY + 1) * lda; + } else { + a01 = a + posY + (posX + 0) * lda; + a02 = a + posY + (posX + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X < posY) { + a01 += 2; + a02 += 2; + b += 4; + } else + if (X > posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + a01 += 2 * lda; + a02 += 2 * lda; + b += 4; + } else { + +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + b[ 1] = ZERO; + + b[ 2] = *(a02 + 0); +#ifdef UNIT + b[ 3] = ONE; +#else + b[ 3] = *(a02 + 1); +#endif + + a01 += 2 * lda; + a02 += 2 * lda; + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + if (m & 1) { + if (X < posY) { + a01 ++; + a02 ++; + b += 2; + } else + if (X > posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + a01 += lda; + b += 2; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = *(a01 + 1); +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b += 2; + } + } + posY += 2; + } + + + if (n & 1){ + X = posX; + + if (posX <= posY) { + a01 = a + posX + (posY + 0) * lda; + } else { + a01 = a + posY + (posX + 0) * lda; + } + + i = m; + if (i > 0) { + do { + if (X < posY) { + a01 += 1; + b ++; + } else + if (X > posY) { + b[ 0] = *(a01 + 0); + a01 += lda; + b ++; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = *(a01 + 0); +#endif + a01 += lda; + b ++; + } + + X += 1; + i --; + } while (i > 0); + } + posY += 1; + } + + return 0; +} diff --git a/kernel/generic/trmm_utcopy_2.c b/kernel/generic/trmm_utcopy_2.c new file mode 100644 index 0000000000..620b06a4f1 --- /dev/null +++ b/kernel/generic/trmm_utcopy_2.c @@ -0,0 +1,191 @@ 
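The trmm_utcopy_2.c, trmm_utcopy_4.c and trmm_utcopy_8.c files that follow, like the 16-wide kernel just above, are hand-unrolled versions of the scalar packing loop that opened this series. As a reading aid only (illustrative code, not part of the imported GotoBLAS2 sources), the control flow they all share can be restated un-unrolled; the typedefs and the ONE macro below stand in for the definitions that common.h supplies in the real build.

/* Illustrative restatement of the trmm_utcopy_* packing pattern. */
typedef long BLASLONG;   /* stand-in for the common.h typedef */
typedef double FLOAT;    /* stand-in; the real kernels are compiled per precision */
#define ONE 1.0

static int trmm_utcopy_ref(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
                           BLASLONG posX, BLASLONG posY, FLOAT *b) {
  BLASLONG i, X;
  FLOAT *ao1;

  while (n > 0) {
    X = posX;

    /* Address the source through the stored (upper) triangle; when the
       window sits on the other side of the diagonal the access is
       transposed via lda, exactly as in the kernels above and below. */
    if (posX <= posY) ao1 = a + posX + posY * lda;
    else              ao1 = a + posY + posX * lda;

    for (i = m; i > 0; i--) {
      if (X < posY) {
        b += 1; ao1 += 1;        /* left unwritten, exactly as the unrolled kernels do */
      } else if (X > posY) {
        b[0] = *ao1;             /* plain copy of an off-diagonal element */
        b += 1; ao1 += lda;
      } else {
#ifdef UNIT
        b[0] = ONE;              /* unit-diagonal TRMM: force 1 on the diagonal */
#else
        b[0] = *ao1;
#endif
        b += 1; ao1 += lda;
      }
      X += 1;
    }
    posY += 1;
    n -= 1;
  }
  return 0;
}

The wider files differ only in unrolling this loop over 2/4/8/16-wide blocks and, inside each diagonal block, writing explicit ZERO padding on the unused side of the diagonal, which the scalar version has no room to do.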
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin.
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02, data03, data04; + FLOAT *ao1, *ao2; + + js = (n >> 1); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X < posY) { + ao1 += 2; + ao2 += 2; + b += 4; + + } else + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + + } else { +#ifdef UNIT + data03 = *(ao2 + 0); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = ONE; +#else + data01 = *(ao1 + 0); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = data04; +#endif + + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + if (m & 1) { + + if (X < posY) { + ao1 += 1; + ao2 += 1; + b += 2; + } else + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; + ao1 += lda; + b += 2; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + data01 = *(ao1 + 0); + + b[ 0] = data01; + b[ 1] = ZERO; +#endif + ao1 += lda; + b += 2; + } + } + + posY += 2; + js --; + } while (js > 0); + } /* End of main loop */ + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + } + + i = m; + if (m > 0) { + do { + if (X < posY) { + b += 1; + ao1 += 1; + } else + if (X > posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + b += 1; + ao1 += lda; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + data01 = *(ao1 + 0); + b[ 0] = data01; +#endif + b += 1; + ao1 += lda; + } + + X += 1; + i --; + } while (i > 0); + } + } + + return 0; +} diff --git a/kernel/generic/trmm_utcopy_4.c b/kernel/generic/trmm_utcopy_4.c new file mode 100644 index 0000000000..7d4dba34b4 --- /dev/null +++ b/kernel/generic/trmm_utcopy_4.c @@ -0,0 +1,472 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT *ao1, *ao2, *ao3, *ao4; + + js = (n >> 2); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + ao4 = a + posX + (posY + 3) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X < posY) { + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + b += 16; + } else + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + data11 = *(ao3 + 2); + data12 = *(ao3 + 3); + + data13 = *(ao4 + 0); + data14 = *(ao4 + 1); + data15 = *(ao4 + 2); + data16 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + b += 16; + + } else { +#ifdef UNIT + data05 = *(ao2 + 0); + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + data13 = *(ao4 + 0); + data14 = *(ao4 + 1); + data15 = *(ao4 + 2); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + + b[ 4] = data05; + b[ 5] = ONE; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = data09; + b[ 9] = data10; + b[10] = ONE; + b[11] = ZERO; + + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = ONE; +#else + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + data11 = *(ao3 + 2); + data13 = *(ao4 + 0); + data14 = *(ao4 + 1); + data15 = *(ao4 + 2); + data16 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = ZERO; + + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; +#endif + + ao1 += 4 * lda; + ao2 += 
4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + + b += 16; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X < posY) { + + if (m & 2) { + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + b += 8; + } + + if (m & 1) { + ao1 += 1; + ao2 += 1; + ao3 += 1; + ao4 += 1; + b += 4; + } + + } else + if (X > posY) { + if (m & 2) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 8; + } + + if (m & 1) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + ao1 += lda; + b += 4; + } + + } else { + +#ifdef UNIT + if (i >= 2) { + data05 = *(ao2 + 0); + } + + if (i >= 3) { + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + } + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b += 4; + + if(i >= 2) { + b[ 0] = data05; + b[ 1] = ONE; + b[ 2] = ZERO; + b[ 3] = ZERO; + b += 4; + } + + if (i >= 3) { + b[ 0] = data09; + b[ 1] = data10; + b[ 2] = ONE; + b[ 3] = ZERO; + b += 4; + } +#else + data01 = *(ao1 + 0); + + if (i >= 2) { + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + } + + if (i >= 3) { + data09 = *(ao3 + 0); + data10 = *(ao3 + 1); + data11 = *(ao3 + 2); + } + + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b += 4; + + if(i >= 2) { + b[ 0] = data05; + b[ 1] = data06; + b[ 2] = ZERO; + b[ 3] = ZERO; + b += 4; + } + + if (i >= 3) { + b[ 0] = data09; + b[ 1] = data10; + b[ 2] = data11; + b[ 3] = ZERO; + b += 4; + } +#endif + } + } + + posY += 4; + js --; + } while (js > 0); + } /* End of main loop */ + + if (n & 2){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X < posY) { + ao1 += 2; + ao2 += 2; + b += 4; + + } else + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data05; + b[ 3] = data06; + + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + } else { +#ifdef UNIT + data05 = *(ao2 + 0); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = data05; + b[ 3] = ONE; +#else + data01 = *(ao1 + 0); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data05; + b[ 3] = data06; + +#endif + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i) { + + if (X < posY) { + ao1 += 2; + b += 2; + } else + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; + + ao1 += lda; + b += 2; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + data01 = *(ao1 + 0); + + b[ 0] = data01; + b[ 1] = ZERO; +#endif + b += 2; + } + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + } + + i = m; + if (m > 0) { + do { + + if (X < posY) { + b += 1; + ao1 += 1; + } else + if (X > posY) { + data01 = *(ao1 + 0); + b[ 0] = data01; + ao1 += lda; + b += 1; + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + 
data01 = *(ao1 + 0); + b[ 0] = data01; +#endif + ao1 += lda; + b += 1; + } + + X += 1; + i --; + } while (i > 0); + } + } + + return 0; +} diff --git a/kernel/generic/trmm_utcopy_8.c b/kernel/generic/trmm_utcopy_8.c new file mode 100644 index 0000000000..6dbf8bd284 --- /dev/null +++ b/kernel/generic/trmm_utcopy_8.c @@ -0,0 +1,1276 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT data17, data18, data19, data20, data21, data22, data23, data24; + FLOAT data25, data26, data27, data28, data29, data30, data31, data32; + FLOAT data33, data34, data35, data36, data37, data38, data39, data40; + FLOAT data41, data42, data43, data44, data45, data46, data47, data48; + FLOAT data49, data50, data51, data52, data53, data54, data55, data56; + FLOAT data57, data58, data59, data60, data61, data62, data63, data64; + + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; + + js = (n >> 3); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + ao4 = a + posX + (posY + 3) * lda; + ao5 = a + posX + (posY + 4) * lda; + ao6 = a + posX + (posY + 5) * lda; + ao7 = a + posX + (posY + 6) * lda; + ao8 = a + posX + (posY + 7) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX + 3) * lda; + ao5 = a + posY + (posX + 4) * lda; + ao6 = a + posY + (posX + 5) * lda; + ao7 = a + posY + (posX + 6) * lda; + ao8 = a + posY + (posX + 7) * lda; + } + + i = (m >> 3); + if (i > 0) { + do { + if (X < posY) { + ao1 += 8; + ao2 += 8; + ao3 += 8; + ao4 += 8; + ao5 += 8; + ao6 += 8; + ao7 += 8; + ao8 += 8; + + b += 64; + } else + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + data13 = *(ao2 + 4); + data14 = *(ao2 + 5); + data15 = *(ao2 + 6); + data16 = *(ao2 + 7); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + data21 = *(ao3 + 4); + data22 = *(ao3 + 5); + data23 = *(ao3 + 6); + data24 = *(ao3 + 7); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + data28 = *(ao4 + 3); + data29 = *(ao4 + 4); + data30 = *(ao4 + 5); + data31 = *(ao4 + 6); + data32 = *(ao4 + 7); + + data33 = *(ao5 + 0); + data34 = *(ao5 + 1); + data35 = *(ao5 + 2); + data36 = *(ao5 + 3); + data37 = *(ao5 + 4); + data38 = *(ao5 + 5); + data39 = *(ao5 + 6); + data40 = *(ao5 + 7); + + data41 = *(ao6 + 0); + data42 = *(ao6 + 1); + data43 = *(ao6 + 2); + data44 = *(ao6 + 3); + data45 = *(ao6 + 4); + data46 = *(ao6 + 5); + data47 = *(ao6 + 6); + data48 = *(ao6 + 7); + + data49 = *(ao7 + 0); + data50 = *(ao7 + 1); + data51 = *(ao7 + 2); + data52 = *(ao7 + 3); + data53 = *(ao7 + 4); + data54 = *(ao7 + 5); + data55 = *(ao7 + 6); + data56 = *(ao7 + 7); + + data57 = *(ao8 + 0); + data58 = *(ao8 + 1); + data59 = *(ao8 + 2); + data60 = *(ao8 + 3); + data61 = *(ao8 + 4); + data62 = *(ao8 + 5); + data63 = *(ao8 + 6); + data64 = *(ao8 + 7); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + + b[16] = 
data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + + b[32] = data33; + b[33] = data34; + b[34] = data35; + b[35] = data36; + b[36] = data37; + b[37] = data38; + b[38] = data39; + b[39] = data40; + + b[40] = data41; + b[41] = data42; + b[42] = data43; + b[43] = data44; + b[44] = data45; + b[45] = data46; + b[46] = data47; + b[47] = data48; + + b[48] = data49; + b[49] = data50; + b[50] = data51; + b[51] = data52; + b[52] = data53; + b[53] = data54; + b[54] = data55; + b[55] = data56; + + b[56] = data57; + b[57] = data58; + b[58] = data59; + b[59] = data60; + b[60] = data61; + b[61] = data62; + b[62] = data63; + b[63] = data64; + + ao1 += 8 * lda; + ao2 += 8 * lda; + ao3 += 8 * lda; + ao4 += 8 * lda; + ao5 += 8 * lda; + ao6 += 8 * lda; + ao7 += 8 * lda; + ao8 += 8 * lda; + + b += 64; + + } else { +#ifdef UNIT + data09 = *(ao2 + 0); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + + data33 = *(ao5 + 0); + data34 = *(ao5 + 1); + data35 = *(ao5 + 2); + data36 = *(ao5 + 3); + + data41 = *(ao6 + 0); + data42 = *(ao6 + 1); + data43 = *(ao6 + 2); + data44 = *(ao6 + 3); + data45 = *(ao6 + 4); + + data49 = *(ao7 + 0); + data50 = *(ao7 + 1); + data51 = *(ao7 + 2); + data52 = *(ao7 + 3); + data53 = *(ao7 + 4); + data54 = *(ao7 + 5); + + data57 = *(ao8 + 0); + data58 = *(ao8 + 1); + data59 = *(ao8 + 2); + data60 = *(ao8 + 3); + data61 = *(ao8 + 4); + data62 = *(ao8 + 5); + data63 = *(ao8 + 6); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = data09; + b[ 9] = ONE; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + + b[16] = data17; + b[17] = data18; + b[18] = ONE; + b[19] = ZERO; + b[20] = ZERO; + b[21] = ZERO; + b[22] = ZERO; + b[23] = ZERO; + + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = ONE; + b[28] = ZERO; + b[29] = ZERO; + b[30] = ZERO; + b[31] = ZERO; + + b[32] = data33; + b[33] = data34; + b[34] = data35; + b[35] = data36; + b[36] = ONE; + b[37] = ZERO; + b[38] = ZERO; + b[39] = ZERO; + + b[40] = data41; + b[41] = data42; + b[42] = data43; + b[43] = data44; + b[44] = data45; + b[45] = ONE; + b[46] = ZERO; + b[47] = ZERO; + + b[48] = data49; + b[49] = data50; + b[50] = data51; + b[51] = data52; + b[52] = data53; + b[53] = data54; + b[54] = ONE; + b[55] = ZERO; + + b[56] = data57; + b[57] = data58; + b[58] = data59; + b[59] = data60; + b[60] = data61; + b[61] = data62; + b[62] = data63; + b[63] = ONE; +#else + data01 = *(ao1 + 0); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data19 = *(ao3 + 2); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + data28 = *(ao4 + 3); + + data33 = *(ao5 + 0); + data34 = *(ao5 + 1); + data35 = *(ao5 + 2); + data36 = *(ao5 + 3); + data37 = *(ao5 + 4); + + data41 = *(ao6 + 0); + data42 = *(ao6 + 1); + data43 = *(ao6 + 2); + data44 = *(ao6 + 3); + data45 = *(ao6 + 4); + data46 = *(ao6 + 5); + + data49 = *(ao7 + 0); + data50 = *(ao7 + 1); + data51 = *(ao7 + 2); + data52 = *(ao7 + 3); + data53 = *(ao7 + 4); + data54 = *(ao7 + 5); + data55 = *(ao7 + 6); + + data57 = *(ao8 + 0); + data58 = *(ao8 + 1); + data59 = *(ao8 + 2); + data60 = *(ao8 + 3); + 
data61 = *(ao8 + 4); + data62 = *(ao8 + 5); + data63 = *(ao8 + 6); + data64 = *(ao8 + 7); + + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = data09; + b[ 9] = data10; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = ZERO; + b[20] = ZERO; + b[21] = ZERO; + b[22] = ZERO; + b[23] = ZERO; + + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = ZERO; + b[29] = ZERO; + b[30] = ZERO; + b[31] = ZERO; + + b[32] = data33; + b[33] = data34; + b[34] = data35; + b[35] = data36; + b[36] = data37; + b[37] = ZERO; + b[38] = ZERO; + b[39] = ZERO; + + b[40] = data41; + b[41] = data42; + b[42] = data43; + b[43] = data44; + b[44] = data45; + b[45] = data46; + b[46] = ZERO; + b[47] = ZERO; + + b[48] = data49; + b[49] = data50; + b[50] = data51; + b[51] = data52; + b[52] = data53; + b[53] = data54; + b[54] = data55; + b[55] = ZERO; + + b[56] = data57; + b[57] = data58; + b[58] = data59; + b[59] = data60; + b[60] = data61; + b[61] = data62; + b[62] = data63; + b[63] = data64; + +#endif + ao1 += 8 * lda; + ao2 += 8 * lda; + ao3 += 8 * lda; + ao4 += 8 * lda; + ao5 += 8 * lda; + ao6 += 8 * lda; + ao7 += 8 * lda; + ao8 += 8 * lda; + + b += 64; + } + + X += 8; + i --; + } while (i > 0); + } + + i = (m & 7); + if (i) { + + if (X < posY) { + + if (m & 4) { + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + ao5 += 4; + ao6 += 4; + ao7 += 4; + ao8 += 4; + + b += 32; + } + + if (m & 2) { + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + ao5 += 2; + ao6 += 2; + ao7 += 2; + ao8 += 2; + + b += 16; + } + + if (m & 1) { + b += 8; + } + } else + if (X > posY) { + if (m & 4) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + data13 = *(ao2 + 4); + data14 = *(ao2 + 5); + data15 = *(ao2 + 6); + data16 = *(ao2 + 7); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + data21 = *(ao3 + 4); + data22 = *(ao3 + 5); + data23 = *(ao3 + 6); + data24 = *(ao3 + 7); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + data28 = *(ao4 + 3); + data29 = *(ao4 + 4); + data30 = *(ao4 + 5); + data31 = *(ao4 + 6); + data32 = *(ao4 + 7); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + + b += 32; + } + + if (m & 2) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + data13 = *(ao2 + 4); + data14 = *(ao2 + 5); 
+ data15 = *(ao2 + 6); + data16 = *(ao2 + 7); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + + ao1 += 2 * lda; + b += 16; + } + + if (m & 1) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b += 8; + } + } else { + +#ifndef UNIT + data01 = *(ao1 + 0); +#endif + data09 = *(ao2 + 0); + data17 = *(ao3 + 0); + data25 = *(ao4 + 0); + data33 = *(ao5 + 0); + data41 = *(ao6 + 0); + data49 = *(ao7 + 0); + data57 = *(ao8 + 0); + + if (i >= 2) { +#ifndef UNIT + data10 = *(ao2 + 1); +#endif + data18 = *(ao3 + 1); + data26 = *(ao4 + 1); + data34 = *(ao5 + 1); + data42 = *(ao6 + 1); + data50 = *(ao7 + 1); + data58 = *(ao8 + 1); + } + + if (i >= 3) { +#ifndef UNIT + data19 = *(ao3 + 2); +#endif + data27 = *(ao4 + 2); + data35 = *(ao5 + 2); + data43 = *(ao6 + 2); + data51 = *(ao7 + 2); + data59 = *(ao8 + 2); + } + + if (i >= 4) { +#ifndef UNIT + data28 = *(ao4 + 3); +#endif + data36 = *(ao5 + 3); + data44 = *(ao6 + 3); + data52 = *(ao7 + 3); + data60 = *(ao8 + 3); + } + + if (i >= 5) { +#ifndef UNIT + data37 = *(ao5 + 4); +#endif + data45 = *(ao6 + 4); + data53 = *(ao7 + 4); + data61 = *(ao8 + 4); + } + + if (i >= 6) { +#ifndef UNIT + data46 = *(ao6 + 5); +#endif + data54 = *(ao7 + 5); + data62 = *(ao8 + 5); + } + + if (i >= 7) { +#ifndef UNIT + data55 = *(ao7 + 6); +#endif + data63 = *(ao8 + 6); + } + +#ifdef UNIT + b[ 0] = ONE; +#else + b[ 0] = data01; +#endif + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + + if(i >= 2) { + b[ 0] = data09; +#ifdef UNIT + b[ 1] = ONE; +#else + b[ 1] = data10; +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 3) { + b[ 0] = data17; + b[ 1] = data18; +#ifdef UNIT + b[ 2] = ONE; +#else + b[ 2] = data19; +#endif + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 4) { + b[ 0] = data25; + b[ 1] = data26; + b[ 2] = data27; +#ifdef UNIT + b[ 3] = ONE; +#else + b[ 3] = data28; +#endif + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 5) { + b[ 0] = data33; + b[ 1] = data34; + b[ 2] = data35; + b[ 3] = data36; +#ifdef UNIT + b[ 4] = ONE; +#else + b[ 4] = data37; +#endif + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 6) { + b[ 0] = data41; + b[ 1] = data42; + b[ 2] = data43; + b[ 3] = data44; + b[ 4] = data45; +#ifdef UNIT + b[ 5] = ONE; +#else + b[ 5] = data46; +#endif + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 7) { + b[ 0] = data49; + b[ 1] = data50; + b[ 2] = data51; + b[ 3] = data52; + b[ 4] = data53; + b[ 5] = data54; +#ifdef UNIT + b[ 6] = ONE; +#else + b[ 6] = data55; +#endif + b[ 7] = ZERO; + b += 8; + } + } + } + + posY += 8; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 4){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + ao3 = a + posX + (posY + 2) * lda; + ao4 = a + 
posX + (posY + 3) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + ao3 = a + posY + (posX + 2) * lda; + ao4 = a + posY + (posX + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X < posY) { + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + + b += 16; + + } else + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + data28 = *(ao4 + 3); + + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b[ 4] = data09; + b[ 5] = data10; + b[ 6] = data11; + b[ 7] = data12; + + b[ 8] = data17; + b[ 9] = data18; + b[10] = data19; + b[11] = data20; + + b[12] = data25; + b[13] = data26; + b[14] = data27; + b[15] = data28; + + b += 16; + + } else { + +#ifdef UNIT + data09 = *(ao2 + 0); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + + b[ 4] = data09; + b[ 5] = ONE; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = data17; + b[ 9] = data18; + b[10] = ONE; + b[11] = ZERO; + + b[12] = data25; + b[13] = data26; + b[14] = data27; + b[15] = ONE; +#else + data01 = *(ao1 + 0); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data19 = *(ao3 + 2); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + data28 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + + b[ 4] = data09; + b[ 5] = data10; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = data17; + b[ 9] = data18; + b[10] = data19; + b[11] = ZERO; + + b[12] = data25; + b[13] = data26; + b[14] = data27; + b[15] = data28; +#endif + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + + b += 16; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X < posY) { + + if (m & 2) { + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + + b += 8; + } + + if (m & 1) { + b += 4; + } + } else + if (X > posY) { + if (m & 2) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data09; + b[ 5] = data10; + b[ 6] = data11; + b[ 7] = data12; + + ao1 += 2 * lda; + b += 8; + } + + if (m & 1) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + } + } else { + +#ifndef UNIT + data01 = *(ao1 + 0); +#endif + data09 = *(ao2 + 0); + data17 = *(ao3 + 0); + data25 = *(ao4 + 0); + + if (i >= 2) { +#ifndef UNIT + data10 = *(ao2 + 1); +#endif + data18 = *(ao3 + 1); + data26 = *(ao4 + 1); + } + + if (i >= 3) { +#ifndef UNIT + data19 = *(ao3 + 2); +#endif + data27 = *(ao4 + 2); + } + +#ifndef UNIT + b[ 0] = ONE; +#else + b[ 0] = data01; +#endif + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b += 4; + + if(i >= 2) { + b[ 0] = data09; +#ifndef UNIT + b[ 1] = ONE; +#else + b[ 1] = data10; +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b 
+= 4; + } + + if (i >= 3) { + b[ 0] = data17; + + b[ 1] = data18; +#ifndef UNIT + b[ 2] = ONE; +#else + b[ 2] = data19; +#endif + b[ 3] = ZERO; + b += 4; + } + } + } + + posY += 4; + } + + if (n & 2){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + ao2 = a + posX + (posY + 1) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + ao2 = a + posY + (posX + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X < posY) { + ao1 += 2; + ao2 += 2; + b += 4; + + } else + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + + ao1 += 2 * lda; + ao2 += 2 * lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data09; + b[ 3] = data10; + + b += 4; + + } else { + +#ifdef UNIT + data09 = *(ao2 + 0); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = data09; + b[ 3] = ONE; +#else + data01 = *(ao1 + 0); + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data09; + b[ 3] = data10; +#endif + + ao1 += 2 * lda; + ao2 += 2 * lda; + + b += 4; + } + + X += 2; + i --; + } while (i > 0); + } + + if (m & 1) { + + if (X < posY) { + b += 2; + } else + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b += 2; + } else { +#ifdef UNIT + data09 = *(ao2 + 0); + b[ 0] = ONE; + b[ 1] = data09; +#else + data01 = *(ao1 + 0); + data09 = *(ao2 + 0); + b[ 0] = data01; + b[ 1] = data09; +#endif + b += 2; + } + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX + (posY + 0) * lda; + } else { + ao1 = a + posY + (posX + 0) * lda; + } + + i = m; + if (m > 0) { + do { + if (X < posY) { + ao1 += 1; + b += 1; + } else + if (X > posY) { + data01 = *(ao1 + 0); + ao1 += lda; + + b[ 0] = data01; + b += 1; + + } else { +#ifdef UNIT + b[ 0] = ONE; +#else + data01 = *(ao1 + 0); + b[ 0] = data01; +#endif + ao1 += lda; + b += 1; + } + + X += 1; + i --; + } while (i > 0); + } + } + + return 0; +} diff --git a/kernel/generic/trsm_kernel_LN.c b/kernel/generic/trsm_kernel_LN.c new file mode 100644 index 0000000000..068a202b8c --- /dev/null +++ b/kernel/generic/trsm_kernel_LN.c @@ -0,0 +1,333 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_L +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_M == 1 +#define GEMM_UNROLL_M_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 2 +#define GEMM_UNROLL_M_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 4 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 8 +#define GEMM_UNROLL_M_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 16 +#define GEMM_UNROLL_M_SHIFT 4 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + a += (m - 1) * m; + b += (m - 1) * n; + + for (i = m - 1; i >= 0; i--) { + + aa = *(a + i); + + for (j = 0; j < n; j ++) { + bb = *(c + i + j * ldc); + bb *= aa; + *b = bb; + *(c + i + j * ldc) = bb; + b ++; + + for (k = 0; k < i; k ++){ + *(c + k + j * ldc) -= bb * *(a + k); + } + + } + a -= m; + b -= 2 * n; + } + +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + a += (m - 1) * m * 2; + b += (m - 1) * n * 2; + + for (i = m - 1; i >= 0; i--) { + + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + + for (j = 0; j < n; j ++) { + bb1 = *(c + i * 2 + 0 + j * ldc); + bb2 = *(c + i * 2 + 1 + j * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = aa1 * bb2 - aa2 * bb1; +#endif + + + *(b + 0) = cc1; + *(b + 1) = cc2; + *(c + i * 2 + 0 + j * ldc) = cc1; + *(c + i * 2 + 1 + j * ldc) = cc2; + b += 2; + + for (k = 0; k < i; k ++){ +#ifndef CONJ + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#else + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= - cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#endif + } + + } + a -= m * 2; + b -= 4 * n; + } + +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT 
*a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + FLOAT *aa, *cc; + BLASLONG kk; + +#if 0 + fprintf(stderr, "TRSM KERNEL LN : m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + j = (n >> GEMM_UNROLL_N_SHIFT); + + while (j > 0) { + + kk = m + offset; + + if (m & (GEMM_UNROLL_M - 1)) { + for (i = 1; i < GEMM_UNROLL_M; i *= 2){ + if (m & i) { + aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE; + cc = c + ((m & ~(i - 1)) - i) * COMPSIZE; + + if (k - kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - i) * i * COMPSIZE, + b + (kk - i) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + kk -= i; + } + } + } + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + aa = a + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * k * COMPSIZE; + cc = c + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * COMPSIZE; + + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa -= GEMM_UNROLL_M * k * COMPSIZE; + cc -= GEMM_UNROLL_M * COMPSIZE; + kk -= GEMM_UNROLL_M; + i --; + } while (i > 0); + } + + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + kk = m + offset; + + if (m & (GEMM_UNROLL_M - 1)) { + for (i = 1; i < GEMM_UNROLL_M; i *= 2){ + if (m & i) { + aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE; + cc = c + ((m & ~(i - 1)) - i) * COMPSIZE; + + if (k - kk > 0) { + GEMM_KERNEL(i, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + + solve(i, j, + aa + (kk - i) * i * COMPSIZE, + b + (kk - i) * j * COMPSIZE, + cc, ldc); + + kk -= i; + } + } + } + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + aa = a + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * k * COMPSIZE; + cc = c + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * COMPSIZE; + + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_M) * j * COMPSIZE, + cc, ldc); + + aa -= GEMM_UNROLL_M * k * COMPSIZE; + cc -= GEMM_UNROLL_M * COMPSIZE; + kk -= GEMM_UNROLL_M; + i --; + } while (i > 0); + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/generic/trsm_kernel_LT.c b/kernel/generic/trsm_kernel_LT.c new file mode 100644 index 0000000000..300fdd4836 --- /dev/null +++ b/kernel/generic/trsm_kernel_LT.c @@ -0,0 +1,317 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_L +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_M == 1 +#define GEMM_UNROLL_M_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 2 +#define GEMM_UNROLL_M_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 4 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 8 +#define GEMM_UNROLL_M_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 16 +#define GEMM_UNROLL_M_SHIFT 4 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + for (i = 0; i < m; i++) { + + aa = *(a + i); + + for (j = 0; j < n; j ++) { + bb = *(c + i + j * ldc); + bb *= aa; + *b = bb; + *(c + i + j * ldc) = bb; + b ++; + + for (k = i + 1; k < m; k ++){ + *(c + k + j * ldc) -= bb * *(a + k); + } + + } + a += m; + } +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + for (i = 0; i < m; i++) { + + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + + for (j = 0; j < n; j ++) { + bb1 = *(c + i * 2 + 0 + j * ldc); + bb2 = *(c + i * 2 + 1 + j * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = aa1 * bb2 - aa2 * bb1; +#endif + + *(b + 0) = cc1; + *(b + 1) = cc2; + *(c + i * 2 + 0 + j * ldc) = cc1; + *(c + i * 2 + 1 + j * ldc) = cc2; + b += 2; + + for (k 
= i + 1; k < m; k ++){ +#ifndef CONJ + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#else + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= -cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#endif + } + + } + a += m * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j, jj; + +#if 0 + fprintf(stderr, "TRSM KERNEL LT : m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + jj = 0; + + j = (n >> GEMM_UNROLL_N_SHIFT); + + while (j > 0) { + + kk = offset; + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + while (i > 0) { + + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + kk += GEMM_UNROLL_M; + i --; + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + kk += i; + } + i >>= 1; + } + } + + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + jj += GEMM_UNROLL_M; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + kk = offset; + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + while (i > 0) { + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + kk += GEMM_UNROLL_M; + i --; + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + kk += i; + } + i >>= 1; + } + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/generic/trsm_kernel_RN.c b/kernel/generic/trsm_kernel_RN.c new file mode 100644 index 0000000000..b85c3c1e93 --- /dev/null +++ b/kernel/generic/trsm_kernel_RN.c @@ -0,0 +1,315 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_M == 1 +#define GEMM_UNROLL_M_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 2 +#define GEMM_UNROLL_M_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 4 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 8 +#define GEMM_UNROLL_M_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 16 +#define GEMM_UNROLL_M_SHIFT 4 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + for (i = 0; i < n; i++) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + for (k = i + 1; k < n; k ++){ + *(c + j + k * ldc) -= aa * *(b + k); + } + + } + b += n; + } +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + for (i = 0; i < n; i++) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + for (j = 0; j < m; j ++) { + aa1 = *(c + j * 2 + 0 + i * ldc); + aa2 = *(c + j * 2 + 1 + i * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = -aa1 * bb2 + aa2 * bb1; +#endif + + *(a + 0) = cc1; + *(a + 1) = cc2; + *(c + j * 2 + 0 + i * ldc) = cc1; + *(c + j * 2 + 1 + i * ldc) = cc2; + a += 2; + + for (k = i + 1; k < n; k ++){ +#ifndef CONJ + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= cc1 
* *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#else + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= - cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#endif + } + + } + b += n * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j, jj; + +#if 0 + fprintf(stderr, "TRSM RN KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + jj = 0; + j = (n >> GEMM_UNROLL_N_SHIFT); + kk = -offset; + + while (j > 0) { + + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + if (i > 0) { + do { + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } while (i > 0); + } + + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + } + i >>= 1; + } + } + + kk += GEMM_UNROLL_N; + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + jj += GEMM_UNROLL_M; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + while (i > 0) { + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + } + i >>= 1; + } + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + kk += j; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/generic/trsm_kernel_RT.c b/kernel/generic/trsm_kernel_RT.c new file mode 100644 index 0000000000..2adb3a4f7a --- /dev/null +++ b/kernel/generic/trsm_kernel_RT.c @@ -0,0 +1,341 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_M == 1 +#define GEMM_UNROLL_M_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 2 +#define GEMM_UNROLL_M_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 4 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 8 +#define GEMM_UNROLL_M_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 16 +#define GEMM_UNROLL_M_SHIFT 4 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + + +#ifndef COMPLEX + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + a += (n - 1) * m; + b += (n - 1) * n; + + for (i = n - 1; i >= 0; i--) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + for (k = 0; k < i; k ++){ + *(c + j + k * ldc) -= aa * *(b + k); + } + + } + b -= n; + a -= 2 * m; + } + +} + +#else + +static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + a += (n - 1) * m * 2; + b += (n - 1) * n * 2; + + for (i = n - 1; i >= 0; i--) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + for (j = 0; j < m; j ++) { + + aa1 = *(c + j * 2 + 0 + i * ldc); + aa2 = *(c + j * 2 + 1 + i * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = - aa1 * bb2 + aa2 * bb1; +#endif + + *(a + 0) = cc1; + *(a + 1) = cc2; + + *(c + j * 2 + 0 + i * ldc) = cc1; + *(c + j * 2 + 1 + i * ldc) = cc2; + a += 2; + + for (k = 0; k < i; k ++){ +#ifndef CONJ + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#else + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) 
+ cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= -cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#endif + } + + } + b -= n * 2; + a -= 4 * m; + } + +} + +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + FLOAT *aa, *cc; + BLASLONG kk; + +#if 0 + fprintf(stderr, "TRSM RT KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + kk = n - offset; + c += n * ldc * COMPSIZE; + b += n * k * COMPSIZE; + + if (n & (GEMM_UNROLL_N - 1)) { + + j = 1; + while (j < GEMM_UNROLL_N) { + if (n & j) { + + aa = a; + b -= j * k * COMPSIZE; + c -= j * ldc* COMPSIZE; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + (kk - j) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } while (i > 0); + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + do { + if (m & i) { + + if (k - kk > 0) { + GEMM_KERNEL(i, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + + solve(i, j, + aa + (kk - j) * i * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + i >>= 1; + } while (i > 0); + } + kk -= j; + } + j <<= 1; + } + } + + j = (n >> GEMM_UNROLL_N_SHIFT); + + if (j > 0) { + + do { + aa = a; + b -= GEMM_UNROLL_N * k * COMPSIZE; + c -= GEMM_UNROLL_N * ldc * COMPSIZE; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } while (i > 0); + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + do { + if (m & i) { + if (k - kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * i * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + } + i >>= 1; + } while (i > 0); + } + + kk -= GEMM_UNROLL_N; + j --; + } while (j > 0); + } + + return 0; +} + + diff --git a/kernel/generic/trsm_lncopy_1.c b/kernel/generic/trsm_lncopy_1.c new file mode 100644 index 0000000000..abad971a6e --- /dev/null +++ b/kernel/generic/trsm_lncopy_1.c @@ -0,0 +1,90 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + +#ifndef UNIT + FLOAT data01; +#endif + FLOAT *a1; + + jj = offset; + + j = n; + while (j > 0){ + + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) *(b + 0) = *(a1 + 0); + + a1 ++; + b ++; + + i --; + ii ++; + } + + a += lda; + jj ++; + j --; + } + + return 0; +} diff --git a/kernel/generic/trsm_lncopy_16.c b/kernel/generic/trsm_lncopy_16.c new file mode 100644 index 0000000000..a7f9cb0b33 --- /dev/null +++ b/kernel/generic/trsm_lncopy_16.c @@ -0,0 +1,271 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj, k; + + FLOAT *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8; + FLOAT *a9, *a10, *a11, *a12, *a13, *a14, *a15, *a16; + + + jj = offset; + + j = (n >> 4); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + a5 = a + 4 * lda; + a6 = a + 5 * lda; + a7 = a + 6 * lda; + a8 = a + 7 * lda; + a9 = a + 8 * lda; + a10 = a + 9 * lda; + a11 = a + 10 * lda; + a12 = a + 11 * lda; + a13 = a + 12 * lda; + a14 = a + 13 * lda; + a15 = a + 14 * lda; + a16 = a + 15 * lda; + + a += 16 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 16)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k) = *(a1 + k * lda); + } + *(b + ii - jj) = INV(*(a1 + (ii - jj) * lda)); + } + + if (ii - jj >= 16) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a2 + 0); + *(b + 2) = *(a3 + 0); + *(b + 3) = *(a4 + 0); + *(b + 4) = *(a5 + 0); + *(b + 5) = *(a6 + 0); + *(b + 6) = *(a7 + 0); + *(b + 7) = *(a8 + 0); + *(b + 8) = *(a9 + 0); + *(b + 9) = *(a10 + 0); + *(b + 10) = *(a11 + 0); + *(b + 11) = *(a12 + 0); + *(b + 12) = *(a13 + 0); + *(b + 13) = *(a14 + 0); + *(b + 14) = *(a15 + 0); + *(b + 15) = *(a16 + 0); + } + + a1 ++; + a2 ++; + a3 ++; + a4 ++; + a5 ++; + a6 ++; + a7 ++; + a8 ++; + a9 ++; + a10 ++; + a11 ++; + a12 ++; + a13 ++; + a14 ++; + a15 ++; + a16 ++; + b += 16; + ii ++; + } + + jj += 16; + j --; + } + + if (n & 8) { + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + a5 = a + 4 * lda; + a6 = a + 5 * lda; + a7 = a + 6 * lda; + a8 = a + 7 * lda; + a += 8 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 8)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k) = *(a1 + k * lda); + } + *(b + ii - jj) = INV(*(a1 + (ii - jj) * lda)); + } + + if (ii - jj >= 8) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a2 + 0); + *(b + 2) = *(a3 + 0); + *(b + 3) = *(a4 + 0); + *(b + 4) = *(a5 + 0); + *(b + 5) = *(a6 + 0); + *(b + 6) = *(a7 + 0); + *(b + 7) = *(a8 + 0); + } + + a1 ++; + a2 ++; + a3 ++; + a4 ++; + a5 ++; + a6 ++; + a7 ++; + a8 ++; + b += 8; + ii ++; + } + + jj += 8; + } + + if (n & 4) { + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + a += 4 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 4)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k) = *(a1 + k * lda); + } + *(b + ii - jj) = INV(*(a1 + (ii - jj) * lda)); + } + + if 
(ii - jj >= 4) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a2 + 0); + *(b + 2) = *(a3 + 0); + *(b + 3) = *(a4 + 0); + } + + a1 ++; + a2 ++; + a3 ++; + a4 ++; + b += 4; + ii ++; + } + + jj += 4; + } + + if (n & 2) { + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a += 2 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 2)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k) = *(a1 + k * lda); + } + *(b + ii - jj) = INV(*(a1 + (ii - jj) * lda)); + } + + if (ii - jj >= 2) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a2 + 0); + } + + a1 ++; + a2 ++; + b += 2; + ii ++; + } + + jj += 2; + } + + if (n & 1) { + + a1 = a + 0 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 1)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k) = *(a1 + k * lda); + } + *(b + ii - jj) = INV(*(a1 + (ii - jj) * lda)); + } + + if (ii - jj >= 1) { + *(b + 0) = *(a1 + 0); + } + + a1 ++; + b += 1; + ii ++; + } + } + + return 0; +} diff --git a/kernel/generic/trsm_lncopy_2.c b/kernel/generic/trsm_lncopy_2.c new file mode 100644 index 0000000000..20cc642536 --- /dev/null +++ b/kernel/generic/trsm_lncopy_2.c @@ -0,0 +1,154 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02, data03, data04; + FLOAT *a1, *a2; + + jj = offset; + + j = (n >> 1); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + + i = (m >> 1); + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + +#ifndef UNIT + data04 = *(a2 + 1); +#endif + + *(b + 0) = INV(data01); + *(b + 2) = data02; + *(b + 3) = INV(data04); + } + + if (ii > jj) { + + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a2 + 0); + data04 = *(a2 + 1); + + *(b + 0) = data01; + *(b + 1) = data03; + *(b + 2) = data02; + *(b + 3) = data04; + } + + a1 += 2; + a2 += 2; + b += 4; + + i --; + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a2 + 0); + + *(b + 0) = data01; + *(b + 1) = data02; + } + b += 2; + } + + a += 2 * lda; + jj += 2; + j --; + } + + if (n & 1) { + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + *(b + 0) = data01; + } + + a1+= 1; + b += 1; + i --; + ii += 1; + } + } + + return 0; +} diff --git a/kernel/generic/trsm_lncopy_4.c b/kernel/generic/trsm_lncopy_4.c new file mode 100644 index 0000000000..9f7bcc2dd5 --- /dev/null +++ b/kernel/generic/trsm_lncopy_4.c @@ -0,0 +1,326 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT *a1, *a2, *a3, *a4; + + jj = offset; + + j = (n >> 2); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + + i = (m >> 2); + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + +#ifndef UNIT + data06 = *(a2 + 1); +#endif + data07 = *(a2 + 2); + data08 = *(a2 + 3); + +#ifndef UNIT + data11 = *(a3 + 2); +#endif + data12 = *(a3 + 3); + +#ifndef UNIT + data16 = *(a4 + 3); +#endif + + *(b + 0) = INV(data01); + + *(b + 4) = data02; + *(b + 5) = INV(data06); + + *(b + 8) = data03; + *(b + 9) = data07; + *(b + 10) = INV(data11); + + *(b + 12) = data04; + *(b + 13) = data08; + *(b + 14) = data12; + *(b + 15) = INV(data16); + } + + if (ii > jj) { + + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + data05 = *(a2 + 0); + data06 = *(a2 + 1); + data07 = *(a2 + 2); + data08 = *(a2 + 3); + + data09 = *(a3 + 0); + data10 = *(a3 + 1); + data11 = *(a3 + 2); + data12 = *(a3 + 3); + + data13 = *(a4 + 0); + data14 = *(a4 + 1); + data15 = *(a4 + 2); + data16 = *(a4 + 3); + + *(b + 0) = data01; + *(b + 1) = data05; + *(b + 2) = data09; + *(b + 3) = data13; + *(b + 4) = data02; + *(b + 5) = data06; + *(b + 6) = data10; + *(b + 7) = data14; + + *(b + 8) = data03; + *(b + 9) = data07; + *(b + 10) = data11; + *(b + 11) = data15; + *(b + 12) = data04; + *(b + 13) = data08; + *(b + 14) = data12; + *(b + 15) = data16; + } + + a1 += 4; + a2 += 4; + a3 += 4; + a4 += 4; + b += 16; + + i --; + ii += 4; + } + + if ((m & 2) != 0) { + + if (ii== jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + +#ifndef UNIT + data06 = *(a2 + 1); +#endif + + *(b + 0) = INV(data01); + + *(b + 4) = data02; + *(b + 5) = INV(data06); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a2 + 0); + data04 = *(a2 + 1); + data05 = *(a3 + 0); + data06 = *(a3 + 1); + data07 = *(a4 + 0); + data08 = *(a4 + 1); + + *(b + 0) = data01; + *(b + 1) = data03; + *(b + 2) = data05; + *(b + 3) = data07; + *(b + 4) = data02; + *(b + 5) = data04; + *(b + 6) = data06; + *(b + 7) = data08; + } + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + b += 8; + + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a2 + 0); + data03 = *(a3 + 0); + data04 = *(a4 + 0); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + b += 4; + } + + a += 4 * lda; + jj += 4; + j --; + } + + if (n & 2) { + a1 = a + 0 * lda; + a2 = a + 1 * lda; + + i = (m >> 1); + ii = 0; + while (i > 0) { + + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + 
+#ifndef UNIT + data04 = *(a2 + 1); +#endif + + *(b + 0) = INV(data01); + *(b + 2) = data02; + *(b + 3) = INV(data04); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a2 + 0); + data04 = *(a2 + 1); + + *(b + 0) = data01; + *(b + 1) = data03; + *(b + 2) = data02; + *(b + 3) = data04; + } + + a1 += 2; + a2 += 2; + b += 4; + + i --; + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a2 + 0); + *(b + 0) = data01; + *(b + 1) = data02; + } + b += 2; + } + a += 2 * lda; + jj += 2; + } + + if (n & 1) { + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + *(b + 0) = data01; + } + + a1+= 1; + b += 1; + i --; + ii += 1; + } + } + + return 0; +} diff --git a/kernel/generic/trsm_lncopy_8.c b/kernel/generic/trsm_lncopy_8.c new file mode 100644 index 0000000000..40feb810f9 --- /dev/null +++ b/kernel/generic/trsm_lncopy_8.c @@ -0,0 +1,841 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT data17, data18, data19, data20, data21, data22, data23, data24; + FLOAT data25, data26, data27, data28, data29, data30, data31, data32; + FLOAT data33, data34, data35, data36, data37, data38, data39, data40; + FLOAT data41, data42, data43, data44, data45, data46, data47, data48; + FLOAT data49, data50, data51, data52, data53, data54, data55, data56; + FLOAT data57, data58, data59, data60, data61, data62, data63, data64; + + FLOAT *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8; + + jj = offset; + + j = (n >> 3); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + a5 = a + 4 * lda; + a6 = a + 5 * lda; + a7 = a + 6 * lda; + a8 = a + 7 * lda; + + ii = 0; + i = (m >> 3); + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + data07 = *(a1 + 6); + data08 = *(a1 + 7); + +#ifndef UNIT + data10 = *(a2 + 1); +#endif + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); + data15 = *(a2 + 6); + data16 = *(a2 + 7); + +#ifndef UNIT + data19 = *(a3 + 2); +#endif + data20 = *(a3 + 3); + data21 = *(a3 + 4); + data22 = *(a3 + 5); + data23 = *(a3 + 6); + data24 = *(a3 + 7); + +#ifndef UNIT + data28 = *(a4 + 3); +#endif + data29 = *(a4 + 4); + data30 = *(a4 + 5); + data31 = *(a4 + 6); + data32 = *(a4 + 7); + +#ifndef UNIT + data37 = *(a5 + 4); +#endif + data38 = *(a5 + 5); + data39 = *(a5 + 6); + data40 = *(a5 + 7); + +#ifndef UNIT + data46 = *(a6 + 5); +#endif + data47 = *(a6 + 6); + data48 = *(a6 + 7); + +#ifndef UNIT + data55 = *(a7 + 6); +#endif + data56 = *(a7 + 7); + +#ifndef UNIT + data64 = *(a8 + 7); +#endif + + *(b + 0) = INV(data01); + + *(b + 8) = data02; + *(b + 9) = INV(data10); + + *(b + 16) = data03; + *(b + 17) = data11; + *(b + 18) = INV(data19); + + *(b + 24) = data04; + *(b + 25) = data12; + *(b + 26) = data20; + *(b + 27) = INV(data28); + + *(b + 32) = data05; + *(b + 33) = data13; + *(b + 34) = data21; + *(b + 35) = data29; + *(b + 36) = INV(data37); + + *(b + 40) = data06; + *(b + 41) = data14; + *(b + 42) = data22; + *(b + 43) = data30; + *(b + 44) = data38; + *(b + 45) = INV(data46); + + *(b + 48) = data07; + *(b + 49) = data15; + *(b + 50) = data23; + *(b + 51) = data31; + *(b + 52) = data39; + *(b + 53) = data47; + *(b + 54) = INV(data55); + + *(b + 56) = data08; + *(b + 57) = data16; + *(b + 58) = data24; + *(b + 59) = data32; + *(b + 60) = data40; + *(b + 61) = data48; + *(b + 62) = data56; + *(b + 63) = INV(data64); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + data07 = *(a1 + 6); + data08 = *(a1 + 7); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); + data15 = *(a2 + 6); + data16 = *(a2 + 7); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); + data21 = *(a3 + 4); + data22 = *(a3 + 5); + 
data23 = *(a3 + 6); + data24 = *(a3 + 7); + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + data29 = *(a4 + 4); + data30 = *(a4 + 5); + data31 = *(a4 + 6); + data32 = *(a4 + 7); + + data33 = *(a5 + 0); + data34 = *(a5 + 1); + data35 = *(a5 + 2); + data36 = *(a5 + 3); + data37 = *(a5 + 4); + data38 = *(a5 + 5); + data39 = *(a5 + 6); + data40 = *(a5 + 7); + + data41 = *(a6 + 0); + data42 = *(a6 + 1); + data43 = *(a6 + 2); + data44 = *(a6 + 3); + data45 = *(a6 + 4); + data46 = *(a6 + 5); + data47 = *(a6 + 6); + data48 = *(a6 + 7); + + data49 = *(a7 + 0); + data50 = *(a7 + 1); + data51 = *(a7 + 2); + data52 = *(a7 + 3); + data53 = *(a7 + 4); + data54 = *(a7 + 5); + data55 = *(a7 + 6); + data56 = *(a7 + 7); + + data57 = *(a8 + 0); + data58 = *(a8 + 1); + data59 = *(a8 + 2); + data60 = *(a8 + 3); + data61 = *(a8 + 4); + data62 = *(a8 + 5); + data63 = *(a8 + 6); + data64 = *(a8 + 7); + + *(b + 0) = data01; + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data33; + *(b + 5) = data41; + *(b + 6) = data49; + *(b + 7) = data57; + + *(b + 8) = data02; + *(b + 9) = data10; + *(b + 10) = data18; + *(b + 11) = data26; + *(b + 12) = data34; + *(b + 13) = data42; + *(b + 14) = data50; + *(b + 15) = data58; + + *(b + 16) = data03; + *(b + 17) = data11; + *(b + 18) = data19; + *(b + 19) = data27; + *(b + 20) = data35; + *(b + 21) = data43; + *(b + 22) = data51; + *(b + 23) = data59; + + *(b + 24) = data04; + *(b + 25) = data12; + *(b + 26) = data20; + *(b + 27) = data28; + *(b + 28) = data36; + *(b + 29) = data44; + *(b + 30) = data52; + *(b + 31) = data60; + + *(b + 32) = data05; + *(b + 33) = data13; + *(b + 34) = data21; + *(b + 35) = data29; + *(b + 36) = data37; + *(b + 37) = data45; + *(b + 38) = data53; + *(b + 39) = data61; + + *(b + 40) = data06; + *(b + 41) = data14; + *(b + 42) = data22; + *(b + 43) = data30; + *(b + 44) = data38; + *(b + 45) = data46; + *(b + 46) = data54; + *(b + 47) = data62; + + *(b + 48) = data07; + *(b + 49) = data15; + *(b + 50) = data23; + *(b + 51) = data31; + *(b + 52) = data39; + *(b + 53) = data47; + *(b + 54) = data55; + *(b + 55) = data63; + + *(b + 56) = data08; + *(b + 57) = data16; + *(b + 58) = data24; + *(b + 59) = data32; + *(b + 60) = data40; + *(b + 61) = data48; + *(b + 62) = data56; + *(b + 63) = data64; + } + + a1 += 8; + a2 += 8; + a3 += 8; + a4 += 8; + a5 += 8; + a6 += 8; + a7 += 8; + a8 += 8; + b += 64; + + i --; + ii += 8; + } + + if (m & 4) { + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + +#ifndef UNIT + data10 = *(a2 + 1); +#endif + data11 = *(a2 + 2); + data12 = *(a2 + 3); + + +#ifndef UNIT + data19 = *(a3 + 2); +#endif + data20 = *(a3 + 3); + +#ifndef UNIT + data28 = *(a4 + 3); +#endif + + *(b + 0) = INV(data01); + + *(b + 8) = data02; + *(b + 9) = INV(data10); + + *(b + 16) = data03; + *(b + 17) = data11; + *(b + 18) = INV(data19); + + *(b + 24) = data04; + *(b + 25) = data12; + *(b + 26) = data20; + *(b + 27) = INV(data28); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + + data33 = *(a5 + 0); + data34 = *(a5 + 1); + data35 = *(a5 + 2); + data36 = *(a5 + 3); + data41 = *(a6 
+ 0); + data42 = *(a6 + 1); + data43 = *(a6 + 2); + data44 = *(a6 + 3); + + data49 = *(a7 + 0); + data50 = *(a7 + 1); + data51 = *(a7 + 2); + data52 = *(a7 + 3); + data57 = *(a8 + 0); + data58 = *(a8 + 1); + data59 = *(a8 + 2); + data60 = *(a8 + 3); + + *(b + 0) = data01; + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data33; + *(b + 5) = data41; + *(b + 6) = data49; + *(b + 7) = data57; + + *(b + 8) = data02; + *(b + 9) = data10; + *(b + 10) = data18; + *(b + 11) = data26; + *(b + 12) = data34; + *(b + 13) = data42; + *(b + 14) = data50; + *(b + 15) = data58; + + *(b + 16) = data03; + *(b + 17) = data11; + *(b + 18) = data19; + *(b + 19) = data27; + *(b + 20) = data35; + *(b + 21) = data43; + *(b + 22) = data51; + *(b + 23) = data59; + + *(b + 24) = data04; + *(b + 25) = data12; + *(b + 26) = data20; + *(b + 27) = data28; + *(b + 28) = data36; + *(b + 29) = data44; + *(b + 30) = data52; + *(b + 31) = data60; + + } + + a1 += 4; + a2 += 4; + a3 += 4; + a4 += 4; + a5 += 4; + a6 += 4; + a7 += 4; + a8 += 4; + b += 32; + ii += 4; + } + + if (m & 2) { + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + +#ifndef UNIT + data10 = *(a2 + 1); +#endif + + *(b + 0) = INV(data01); + + *(b + 8) = data02; + *(b + 9) = INV(data10); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data25 = *(a4 + 0); + data26 = *(a4 + 1); + + data33 = *(a5 + 0); + data34 = *(a5 + 1); + data41 = *(a6 + 0); + data42 = *(a6 + 1); + data49 = *(a7 + 0); + data50 = *(a7 + 1); + data57 = *(a8 + 0); + data58 = *(a8 + 1); + + *(b + 0) = data01; + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data33; + *(b + 5) = data41; + *(b + 6) = data49; + *(b + 7) = data57; + + *(b + 8) = data02; + *(b + 9) = data10; + *(b + 10) = data18; + *(b + 11) = data26; + *(b + 12) = data34; + *(b + 13) = data42; + *(b + 14) = data50; + *(b + 15) = data58; + } + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + a5 += 2; + a6 += 2; + a7 += 2; + a8 += 2; + b += 16; + ii += 2; + } + + if (m & 1) { + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data09 = *(a2 + 0); + data17 = *(a3 + 0); + data25 = *(a4 + 0); + data33 = *(a5 + 0); + data41 = *(a6 + 0); + data49 = *(a7 + 0); + data57 = *(a8 + 0); + + *(b + 0) = data01; + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data33; + *(b + 5) = data41; + *(b + 6) = data49; + *(b + 7) = data57; + } + b += 8; + } + + a += 8 * lda; + jj += 8; + j --; + } + + if (n & 4) { + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + + ii = 0; + i = (m >> 2); + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + +#ifndef UNIT + data10 = *(a2 + 1); +#endif + data11 = *(a2 + 2); + data12 = *(a2 + 3); + +#ifndef UNIT + data19 = *(a3 + 2); +#endif + data20 = *(a3 + 3); + +#ifndef UNIT + data28 = *(a4 + 3); +#endif + + *(b + 0) = INV(data01); + + *(b + 4) = data02; + *(b + 5) = INV(data10); + + *(b + 8) = data03; + *(b + 9) = data11; + *(b + 10) = INV(data19); + + *(b + 12) = data04; + *(b + 13) = data12; + *(b + 14) = data20; + *(b + 15) = INV(data28); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + 
data11 = *(a2 + 2); + data12 = *(a2 + 3); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + + *(b + 0) = data01; + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + + *(b + 4) = data02; + *(b + 5) = data10; + *(b + 6) = data18; + *(b + 7) = data26; + + *(b + 8) = data03; + *(b + 9) = data11; + *(b + 10) = data19; + *(b + 11) = data27; + + *(b + 12) = data04; + *(b + 13) = data12; + *(b + 14) = data20; + *(b + 15) = data28; + } + + a1 += 4; + a2 += 4; + a3 += 4; + a4 += 4; + b += 16; + + i --; + ii += 4; + } + + if (m & 2) { + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + +#ifndef UNIT + data10 = *(a2 + 1); +#endif + + *(b + 0) = INV(data01); + + *(b + 4) = data02; + *(b + 5) = INV(data10); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data25 = *(a4 + 0); + data26 = *(a4 + 1); + + *(b + 0) = data01; + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data02; + *(b + 5) = data10; + *(b + 6) = data18; + *(b + 7) = data26; + } + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + b += 8; + ii += 2; + } + + if (m & 1) { + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data09 = *(a2 + 0); + data17 = *(a3 + 0); + data25 = *(a4 + 0); + + *(b + 0) = data01; + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + } + b += 4; + } + + a += 4 * lda; + jj += 4; + } + + if (n & 2) { + a1 = a + 0 * lda; + a2 = a + 1 * lda; + + ii = 0; + i = (m >> 1); + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + +#ifndef UNIT + data10 = *(a2 + 1); +#endif + + *(b + 0) = INV(data01); + *(b + 2) = data02; + *(b + 3) = INV(data10); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data09 = *(a2 + 0); + data10 = *(a2 + 1); + + *(b + 0) = data01; + *(b + 1) = data09; + *(b + 2) = data02; + *(b + 3) = data10; + } + + a1 += 2; + a2 += 2; + b += 4; + + i --; + ii += 2; + } + + if (m & 1) { + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data09 = *(a2 + 0); + + *(b + 0) = data01; + *(b + 1) = data09; + } + b += 2; + } + + a += 2 * lda; + jj += 2; + } + + if (n & 1) { + a1 = a + 0 * lda; + + ii = 0; + i = m; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + *(b + 0) = data01; + } + + a1 += 1; + b += 1; + + i --; + ii += 1; + } + } + + return 0; +} diff --git a/kernel/generic/trsm_ltcopy_1.c b/kernel/generic/trsm_ltcopy_1.c new file mode 100644 index 0000000000..ea84136fe7 --- /dev/null +++ b/kernel/generic/trsm_ltcopy_1.c @@ -0,0 +1,90 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + +#ifndef UNIT + FLOAT data01; +#endif + FLOAT *a1; + + jj = offset; + + j = n; + while (j > 0){ + + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii < jj) *(b + 0) = *(a1 + 0); + + a1 += lda; + b ++; + + i --; + ii ++; + } + + a ++; + jj ++; + j --; + } + + return 0; +} diff --git a/kernel/generic/trsm_ltcopy_16.c b/kernel/generic/trsm_ltcopy_16.c new file mode 100644 index 0000000000..1203f1bfaf --- /dev/null +++ b/kernel/generic/trsm_ltcopy_16.c @@ -0,0 +1,228 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj, k; + + FLOAT *a1; + + jj = offset; + + j = (n >> 4); + while (j > 0){ + + a1 = a; + a += 16; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 16)) { + + *(b + ii - jj) = INV(*(a1 + ii - jj)); + + for (k = ii - jj + 1; k < 16; k ++) { + *(b + k) = *(a1 + k); + } + + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + *(b + 4) = *(a1 + 4); + *(b + 5) = *(a1 + 5); + *(b + 6) = *(a1 + 6); + *(b + 7) = *(a1 + 7); + *(b + 8) = *(a1 + 8); + *(b + 9) = *(a1 + 9); + *(b + 10) = *(a1 + 10); + *(b + 11) = *(a1 + 11); + *(b + 12) = *(a1 + 12); + *(b + 13) = *(a1 + 13); + *(b + 14) = *(a1 + 14); + *(b + 15) = *(a1 + 15); + } + + b += 16; + a1 += lda; + ii ++; + } + + jj += 16; + j --; + } + + j = (n & 8); + if (j > 0) { + a1 = a; + a += 8; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 8)) { + + *(b + ii - jj) = INV(*(a1 + ii - jj)); + + for (k = ii - jj + 1; k < 8; k ++) { + *(b + k) = *(a1 + k); + } + + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + *(b + 4) = *(a1 + 4); + *(b + 5) = *(a1 + 5); + *(b + 6) = *(a1 + 6); + *(b + 7) = *(a1 + 7); + } + + b += 8; + a1 += lda; + ii ++; + } + + jj += 8; + } + + j = (n & 4); + if (j > 0) { + + a1 = a; + a += 4; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 4)) { + *(b + ii - jj) = INV(*(a1 + ii - jj)); + + for (k = ii - jj + 1; k < 4; k ++) { + *(b + k) = *(a1 + k); + } + + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + } + + b += 4; + a1 += lda; + ii ++; + } + + jj += 4; + } + + j = (n & 2); + if (j > 0) { + + a1 = a; + a += 2; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 2)) { + + *(b + ii - jj) = INV(*(a1 + ii - jj)); + + for (k = ii - jj + 1; k < 2; k ++) { + *(b + k) = *(a1 + k); + } + + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + } + + b += 2; + a1 += lda; + ii ++; + } + + jj += 2; + } + + j = (n & 1); + if (j > 0) { + + a1 = a; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 1)) { + *(b + ii - jj) = INV(*(a1 + ii - jj)); + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + } + + b += 1; + a1 += lda; + ii ++; + } + } + + return 0; +} 
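For orientation: every kernel/generic/trsm_*copy_*.c file imported in this patch follows the same pattern. It packs a triangular panel of A into the contiguous buffer b that the TRSM compute kernels consume, copying the off-diagonal entries and writing each diagonal entry through INV(), that is, pre-inverted when UNIT is not defined and as an implicit 1 in the unit-diagonal case, so the inner solve kernels can multiply rather than divide. The numeric suffix (_1, _2, _4, _8, _16) is the blocking width, the ln/lt/un/ut prefixes correspond to the lower/upper and non-transposed/transposed TRSM cases, and the ii/jj bookkeeping against the offset argument decides whether a block lies on, before, or past the diagonal. The stand-alone sketch below mirrors the 1-by-1 variant shown above (trsm_ltcopy_1.c); BLASLONG, FLOAT, ONE, the demo matrix and main() are stand-ins invented for illustration, not part of the imported sources.

#include <stdio.h>

typedef long   BLASLONG;
typedef double FLOAT;
#define ONE 1.0

#ifndef UNIT
#define INV(a) (ONE / (a))   /* store the diagonal pre-inverted */
#else
#define INV(a) (ONE)         /* unit diagonal: implicitly 1     */
#endif

/* Simplified analogue of trsm_ltcopy_1.c: pack n rows of an m-column,
   column-major panel a (leading dimension lda) into b, one m-long block
   per row.  The diagonal of row j is taken to sit at column offset + j;
   it is stored as INV(diag), entries left of it are copied verbatim, and
   entries right of it are left untouched (the solve kernels do not read
   them). */
static int ltcopy_1(BLASLONG m, BLASLONG n, const FLOAT *a, BLASLONG lda,
                    BLASLONG offset, FLOAT *b) {
  BLASLONG jj = offset;
  for (BLASLONG j = 0; j < n; j++) {
    const FLOAT *a1 = a + j;              /* row j, column 0 of the panel  */
    for (BLASLONG ii = 0; ii < m; ii++) {
      if (ii == jj) *b = INV(*a1);        /* diagonal element              */
      if (ii <  jj) *b = *a1;             /* strictly left of the diagonal */
      a1 += lda;                          /* step to the next column       */
      b++;
    }
    jj++;
  }
  return 0;
}

int main(void) {
  /* 3x3 lower-triangular demo matrix, column-major, lda = 3 (made up):
     row 0: 2 . . / row 1: 4 5 . / row 2: 6 7 8 */
  FLOAT a[9] = { 2.0, 4.0, 6.0,   0.0, 5.0, 7.0,   0.0, 0.0, 8.0 };
  FLOAT b[9] = { 0.0 };

  ltcopy_1(3, 3, a, 3, 0, b);

  for (int k = 0; k < 9; k++) printf("%g ", b[k]);
  printf("\n");   /* prints: 0.5 0 0 4 0.2 0 6 7 0.125 */
  return 0;
}

Compiling the sketch with -DUNIT makes INV() collapse to ONE, which is how the copy routines above handle the unit-diagonal solves.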
diff --git a/kernel/generic/trsm_ltcopy_2.c b/kernel/generic/trsm_ltcopy_2.c new file mode 100644 index 0000000000..4705635173 --- /dev/null +++ b/kernel/generic/trsm_ltcopy_2.c @@ -0,0 +1,160 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02, data03, data04; + FLOAT *a1, *a2; + + jj = offset; + + j = (n >> 1); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + + i = (m >> 1); + ii = 0; + while (i > 0) { + + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + +#ifndef UNIT + data04 = *(a2 + 1); +#endif + + *(b + 0) = INV(data01); + *(b + 1) = data02; + + *(b + 3) = INV(data04); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a2 + 0); + data04 = *(a2 + 1); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + + a1 += 2 * lda; + a2 += 2 * lda; + b += 4; + + i --; + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + + *(b + 0) = INV(data01); + *(b + 1) = data02; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + + *(b + 0) = data01; + *(b + 1) = data02; + } + b += 2; + } + + a += 2; + jj += 2; + j --; + } + + if (n & 1) { + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii < jj) { + data01 = *(a1 + 0); + *(b + 0) = data01; + } + + a1 += 1 * lda; + b += 1; + + i --; + ii += 1; + } + } + + return 0; +} diff --git a/kernel/generic/trsm_ltcopy_4.c b/kernel/generic/trsm_ltcopy_4.c new file mode 100644 index 0000000000..d891468a47 --- /dev/null +++ b/kernel/generic/trsm_ltcopy_4.c @@ -0,0 +1,346 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT *a1, *a2, *a3, *a4; + + jj = offset; + + j = (n >> 2); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + + i = (m >> 2); + ii = 0; + while (i > 0) { + + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + +#ifndef UNIT + data06 = *(a2 + 1); +#endif + data07 = *(a2 + 2); + data08 = *(a2 + 3); + +#ifndef UNIT + data11 = *(a3 + 2); +#endif + data12 = *(a3 + 3); + +#ifndef UNIT + data16 = *(a4 + 3); +#endif + + *(b + 0) = INV(data01); + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + + *(b + 5) = INV(data06); + *(b + 6) = data07; + *(b + 7) = data08; + + *(b + 10) = INV(data11); + *(b + 11) = data12; + + *(b + 15) = INV(data16); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + data05 = *(a2 + 0); + data06 = *(a2 + 1); + data07 = *(a2 + 2); + data08 = *(a2 + 3); + + data09 = *(a3 + 0); + data10 = *(a3 + 1); + data11 = *(a3 + 2); + data12 = *(a3 + 3); + + data13 = *(a4 + 0); + data14 = *(a4 + 1); + data15 = *(a4 + 2); + data16 = *(a4 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + + *(b + 8) = data09; + *(b + 9) = data10; + *(b + 10) = data11; + *(b + 11) = data12; + *(b + 12) = data13; + *(b + 13) = data14; + *(b + 14) = data15; + *(b + 15) = data16; + } + + a1 += 4 * lda; + a2 += 4 * lda; + a3 += 4 * lda; + a4 += 4 * lda; + b += 16; + + i --; + ii += 4; + } + + if ((m & 2) != 0) { + + if (ii== jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + +#ifndef UNIT + data06 = *(a2 + 1); +#endif + data07 = *(a2 + 2); + data08 = *(a2 + 3); + + *(b + 0) = INV(data01); + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + + *(b + 5) = INV(data06); + *(b + 6) = data07; + *(b + 7) = data08; + + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + data05 = *(a2 + 0); + data06 = *(a2 + 1); + data07 = *(a2 + 2); + data08 = *(a2 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + } + + a1 += 2 * lda; + a2 += 2 * lda; + b += 8; + + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + *(b + 0) = INV(data01); + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + *(b + 0) = 
data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + b += 4; + } + + a += 4; + jj += 4; + j --; + } + + if (n & 2) { + a1 = a + 0 * lda; + a2 = a + 1 * lda; + + i = (m >> 1); + ii = 0; + while (i > 0) { + + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + +#ifndef UNIT + data04 = *(a2 + 1); +#endif + + *(b + 0) = INV(data01); + *(b + 1) = data02; + + *(b + 3) = INV(data04); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a2 + 0); + data04 = *(a2 + 1); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + + a1 += 2 * lda; + a2 += 2 * lda; + b += 4; + + i --; + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + *(b + 0) = data01; + *(b + 1) = data02; + } + b += 2; + } + a += 2; + jj += 2; + } + + if (n & 1) { + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii < jj) { + data01 = *(a1 + 0); + *(b + 0) = data01; + } + + a1 += 1 * lda; + b += 1; + + i --; + ii += 1; + } + } + + return 0; +} diff --git a/kernel/generic/trsm_ltcopy_8.c b/kernel/generic/trsm_ltcopy_8.c new file mode 100644 index 0000000000..0925dccd50 --- /dev/null +++ b/kernel/generic/trsm_ltcopy_8.c @@ -0,0 +1,921 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT data17, data18, data19, data20, data21, data22, data23, data24; + FLOAT data25, data26, data27, data28, data29, data30, data31, data32; + FLOAT data33, data34, data35, data36, data37, data38, data39, data40; + FLOAT data41, data42, data43, data44, data45, data46, data47, data48; + FLOAT data49, data50, data51, data52, data53, data54, data55, data56; + FLOAT data57, data58, data59, data60, data61, data62, data63, data64; + + FLOAT *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8; + + jj = offset; + + j = (n >> 3); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + a5 = a + 4 * lda; + a6 = a + 5 * lda; + a7 = a + 6 * lda; + a8 = a + 7 * lda; + + ii = 0; + i = (m >> 3); + while (i > 0) { + + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + data07 = *(a1 + 6); + data08 = *(a1 + 7); + +#ifndef UNIT + data10 = *(a2 + 1); +#endif + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); + data15 = *(a2 + 6); + data16 = *(a2 + 7); + +#ifndef UNIT + data19 = *(a3 + 2); +#endif + data20 = *(a3 + 3); + data21 = *(a3 + 4); + data22 = *(a3 + 5); + data23 = *(a3 + 6); + data24 = *(a3 + 7); + +#ifndef UNIT + data28 = *(a4 + 3); +#endif + data29 = *(a4 + 4); + data30 = *(a4 + 5); + data31 = *(a4 + 6); + data32 = *(a4 + 7); + +#ifndef UNIT + data37 = *(a5 + 4); +#endif + data38 = *(a5 + 5); + data39 = *(a5 + 6); + data40 = *(a5 + 7); + +#ifndef UNIT + data46 = *(a6 + 5); +#endif + data47 = *(a6 + 6); + data48 = *(a6 + 7); + +#ifndef UNIT + data55 = *(a7 + 6); +#endif + data56 = *(a7 + 7); + +#ifndef UNIT + data64 = *(a8 + 7); +#endif + + *(b + 0) = INV(data01); + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + + *(b + 9) = INV(data10); + *(b + 10) = data11; + *(b + 11) = data12; + *(b + 12) = data13; + *(b + 13) = data14; + *(b + 14) = data15; + *(b + 15) = data16; + + *(b + 18) = INV(data19); + *(b + 19) = data20; + *(b + 20) = data21; + *(b + 21) = data22; + *(b + 22) = data23; + *(b + 23) = data24; + + *(b + 27) = INV(data28); + *(b + 28) = data29; + *(b + 29) = data30; + *(b + 30) = data31; + *(b + 31) = data32; + + *(b + 36) = INV(data37); + *(b + 37) = data38; + *(b + 38) = data39; + *(b + 39) = data40; + + *(b + 45) = INV(data46); + *(b + 46) = data47; + *(b + 47) = data48; + + *(b + 54) = INV(data55); + *(b + 55) = data56; + + *(b + 63) = INV(data64); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + data07 = *(a1 + 6); + data08 = *(a1 + 7); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); + data15 = *(a2 + 6); + data16 = *(a2 + 7); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); + data21 = *(a3 + 4); + data22 = *(a3 + 5); + 
data23 = *(a3 + 6); + data24 = *(a3 + 7); + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + data29 = *(a4 + 4); + data30 = *(a4 + 5); + data31 = *(a4 + 6); + data32 = *(a4 + 7); + + data33 = *(a5 + 0); + data34 = *(a5 + 1); + data35 = *(a5 + 2); + data36 = *(a5 + 3); + data37 = *(a5 + 4); + data38 = *(a5 + 5); + data39 = *(a5 + 6); + data40 = *(a5 + 7); + + data41 = *(a6 + 0); + data42 = *(a6 + 1); + data43 = *(a6 + 2); + data44 = *(a6 + 3); + data45 = *(a6 + 4); + data46 = *(a6 + 5); + data47 = *(a6 + 6); + data48 = *(a6 + 7); + + data49 = *(a7 + 0); + data50 = *(a7 + 1); + data51 = *(a7 + 2); + data52 = *(a7 + 3); + data53 = *(a7 + 4); + data54 = *(a7 + 5); + data55 = *(a7 + 6); + data56 = *(a7 + 7); + + data57 = *(a8 + 0); + data58 = *(a8 + 1); + data59 = *(a8 + 2); + data60 = *(a8 + 3); + data61 = *(a8 + 4); + data62 = *(a8 + 5); + data63 = *(a8 + 6); + data64 = *(a8 + 7); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + *(b + 8) = data09; + *(b + 9) = data10; + *(b + 10) = data11; + *(b + 11) = data12; + *(b + 12) = data13; + *(b + 13) = data14; + *(b + 14) = data15; + *(b + 15) = data16; + + *(b + 16) = data17; + *(b + 17) = data18; + *(b + 18) = data19; + *(b + 19) = data20; + *(b + 20) = data21; + *(b + 21) = data22; + *(b + 22) = data23; + *(b + 23) = data24; + *(b + 24) = data25; + *(b + 25) = data26; + *(b + 26) = data27; + *(b + 27) = data28; + *(b + 28) = data29; + *(b + 29) = data30; + *(b + 30) = data31; + *(b + 31) = data32; + + *(b + 32) = data33; + *(b + 33) = data34; + *(b + 34) = data35; + *(b + 35) = data36; + *(b + 36) = data37; + *(b + 37) = data38; + *(b + 38) = data39; + *(b + 39) = data40; + *(b + 40) = data41; + *(b + 41) = data42; + *(b + 42) = data43; + *(b + 43) = data44; + *(b + 44) = data45; + *(b + 45) = data46; + *(b + 46) = data47; + *(b + 47) = data48; + + *(b + 48) = data49; + *(b + 49) = data50; + *(b + 50) = data51; + *(b + 51) = data52; + *(b + 52) = data53; + *(b + 53) = data54; + *(b + 54) = data55; + *(b + 55) = data56; + *(b + 56) = data57; + *(b + 57) = data58; + *(b + 58) = data59; + *(b + 59) = data60; + *(b + 60) = data61; + *(b + 61) = data62; + *(b + 62) = data63; + *(b + 63) = data64; + } + + a1 += 8 * lda; + a2 += 8 * lda; + a3 += 8 * lda; + a4 += 8 * lda; + a5 += 8 * lda; + a6 += 8 * lda; + a7 += 8 * lda; + a8 += 8 * lda; + b += 64; + + i --; + ii += 8; + } + + if (m & 4) { + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + data07 = *(a1 + 6); + data08 = *(a1 + 7); + +#ifndef UNIT + data10 = *(a2 + 1); +#endif + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); + data15 = *(a2 + 6); + data16 = *(a2 + 7); + +#ifndef UNIT + data19 = *(a3 + 2); +#endif + data20 = *(a3 + 3); + data21 = *(a3 + 4); + data22 = *(a3 + 5); + data23 = *(a3 + 6); + data24 = *(a3 + 7); + +#ifndef UNIT + data28 = *(a4 + 3); +#endif + data29 = *(a4 + 4); + data30 = *(a4 + 5); + data31 = *(a4 + 6); + data32 = *(a4 + 7); + + *(b + 0) = INV(data01); + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + + *(b + 9) = INV(data10); + *(b + 10) = data11; + *(b + 11) = data12; + *(b + 12) = data13; + *(b + 13) = data14; + *(b + 14) = data15; + *(b + 15) = data16; 
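+	/* b+16 and b+17 precede the diagonal of packed row 2 of this 4x8 diagonal block and are intentionally left unwritten */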
+ + *(b + 18) = INV(data19); + *(b + 19) = data20; + *(b + 20) = data21; + *(b + 21) = data22; + *(b + 22) = data23; + *(b + 23) = data24; + + *(b + 27) = INV(data28); + *(b + 28) = data29; + *(b + 29) = data30; + *(b + 30) = data31; + *(b + 31) = data32; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + data07 = *(a1 + 6); + data08 = *(a1 + 7); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); + data15 = *(a2 + 6); + data16 = *(a2 + 7); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); + data21 = *(a3 + 4); + data22 = *(a3 + 5); + data23 = *(a3 + 6); + data24 = *(a3 + 7); + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + data29 = *(a4 + 4); + data30 = *(a4 + 5); + data31 = *(a4 + 6); + data32 = *(a4 + 7); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + *(b + 8) = data09; + *(b + 9) = data10; + *(b + 10) = data11; + *(b + 11) = data12; + *(b + 12) = data13; + *(b + 13) = data14; + *(b + 14) = data15; + *(b + 15) = data16; + + *(b + 16) = data17; + *(b + 17) = data18; + *(b + 18) = data19; + *(b + 19) = data20; + *(b + 20) = data21; + *(b + 21) = data22; + *(b + 22) = data23; + *(b + 23) = data24; + *(b + 24) = data25; + *(b + 25) = data26; + *(b + 26) = data27; + *(b + 27) = data28; + *(b + 28) = data29; + *(b + 29) = data30; + *(b + 30) = data31; + *(b + 31) = data32; + } + + a1 += 4 * lda; + a2 += 4 * lda; + a3 += 4 * lda; + a4 += 4 * lda; + b += 32; + + ii += 4; + } + + if (m & 2) { + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + data07 = *(a1 + 6); + data08 = *(a1 + 7); + +#ifndef UNIT + data10 = *(a2 + 1); +#endif + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); + data15 = *(a2 + 6); + data16 = *(a2 + 7); + + *(b + 0) = INV(data01); + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + + *(b + 9) = INV(data10); + *(b + 10) = data11; + *(b + 11) = data12; + *(b + 12) = data13; + *(b + 13) = data14; + *(b + 14) = data15; + *(b + 15) = data16; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + data07 = *(a1 + 6); + data08 = *(a1 + 7); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); + data15 = *(a2 + 6); + data16 = *(a2 + 7); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + *(b + 8) = data09; + *(b + 9) = data10; + *(b + 10) = data11; + *(b + 11) = data12; + *(b + 12) = data13; + *(b + 13) = data14; + *(b + 14) = data15; + *(b + 15) = data16; + } + + a1 += 2 * lda; + a2 += 2 * lda; + b += 16; + + ii += 2; + } + + if (m & 1) { + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + data07 = *(a1 + 6); + data08 = *(a1 
+ 7); + + *(b + 0) = INV(data01); + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + data07 = *(a1 + 6); + data08 = *(a1 + 7); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + } + b += 8; + } + a += 8; + jj += 8; + j --; + } + + if (n & 4) { + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + + ii = 0; + i = (m >> 2); + while (i > 0) { + + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + +#ifndef UNIT + data10 = *(a2 + 1); +#endif + data11 = *(a2 + 2); + data12 = *(a2 + 3); + + +#ifndef UNIT + data19 = *(a3 + 2); +#endif + data20 = *(a3 + 3); + + +#ifndef UNIT + data28 = *(a4 + 3); +#endif + + *(b + 0) = INV(data01); + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + + *(b + 5) = INV(data10); + *(b + 6) = data11; + *(b + 7) = data12; + + *(b + 10) = INV(data19); + *(b + 11) = data20; + *(b + 15) = INV(data28); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data09; + *(b + 5) = data10; + *(b + 6) = data11; + *(b + 7) = data12; + + *(b + 8) = data17; + *(b + 9) = data18; + *(b + 10) = data19; + *(b + 11) = data20; + *(b + 12) = data25; + *(b + 13) = data26; + *(b + 14) = data27; + *(b + 15) = data28; + } + + a1 += 4 * lda; + a2 += 4 * lda; + a3 += 4 * lda; + a4 += 4 * lda; + b += 16; + + i --; + ii += 4; + } + + if (m & 2) { + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + +#ifndef UNIT + data10 = *(a2 + 1); +#endif + data11 = *(a2 + 2); + data12 = *(a2 + 3); + + *(b + 0) = INV(data01); + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + + *(b + 6) = INV(data10); + *(b + 7) = data11; + *(b + 8) = data12; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data09; + *(b + 5) = data10; + *(b + 6) = data11; + *(b + 7) = data12; + } + + a1 += 2 * lda; + a2 += 2 * lda; + b += 8; + ii += 2; + } + + if (m & 1) { + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + *(b + 0) = INV(data01); + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + b += 4; + } + a += 4; + jj += 4; + } + + if (n & 2) { + + a1 = a + 0 * lda; + a2 = a + 1 * lda; 
+ + ii = 0; + i = (m >> 1); + while (i > 0) { + + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + +#ifndef UNIT + data10 = *(a2 + 1); +#endif + + *(b + 0) = INV(data01); + *(b + 1) = data02; + *(b + 3) = INV(data10); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data09 = *(a2 + 0); + data10 = *(a2 + 1); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data09; + *(b + 3) = data10; + } + + a1 += 2 * lda; + a2 += 2 * lda; + b += 4; + + i --; + ii += 2; + } + + if (m & 1) { + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a1 + 1); + + *(b + 0) = INV(data01); + *(b + 1) = data02; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + + *(b + 0) = data01; + *(b + 1) = data02; + } + b += 2; + } + a += 2; + jj += 2; + } + + if (n & 1) { + + a1 = a + 0 * lda; + + ii = 0; + i = m; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii < jj) { + data01 = *(a1 + 0); + *(b + 0) = data01; + } + + a1 += lda; + b += 1; + + i --; + ii += 1; + } + + } + + return 0; +} diff --git a/kernel/generic/trsm_uncopy_1.c b/kernel/generic/trsm_uncopy_1.c new file mode 100644 index 0000000000..3a258609e5 --- /dev/null +++ b/kernel/generic/trsm_uncopy_1.c @@ -0,0 +1,90 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + +#ifndef UNIT + FLOAT data01; +#endif + FLOAT *a1; + + jj = offset; + + j = n; + while (j > 0){ + + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii < jj) *(b + 0) = *(a1 + 0); + + a1 ++; + b ++; + i --; + ii ++; + } + + a += lda; + jj ++; + j --; + } + + return 0; +} diff --git a/kernel/generic/trsm_uncopy_16.c b/kernel/generic/trsm_uncopy_16.c new file mode 100644 index 0000000000..e2b8ce49c4 --- /dev/null +++ b/kernel/generic/trsm_uncopy_16.c @@ -0,0 +1,271 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include "common.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj, k; + + FLOAT *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8; + FLOAT *a9, *a10, *a11, *a12, *a13, *a14, *a15, *a16; + + + jj = offset; + + j = (n >> 4); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + a5 = a + 4 * lda; + a6 = a + 5 * lda; + a7 = a + 6 * lda; + a8 = a + 7 * lda; + a9 = a + 8 * lda; + a10 = a + 9 * lda; + a11 = a + 10 * lda; + a12 = a + 11 * lda; + a13 = a + 12 * lda; + a14 = a + 13 * lda; + a15 = a + 14 * lda; + a16 = a + 15 * lda; + + a += 16 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 16)) { + *(b + ii - jj) = INV(*(a1 + (ii - jj) * lda)); + for (k = ii - jj + 1; k < 16; k ++) { + *(b + k) = *(a1 + k * lda); + } + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a2 + 0); + *(b + 2) = *(a3 + 0); + *(b + 3) = *(a4 + 0); + *(b + 4) = *(a5 + 0); + *(b + 5) = *(a6 + 0); + *(b + 6) = *(a7 + 0); + *(b + 7) = *(a8 + 0); + *(b + 8) = *(a9 + 0); + *(b + 9) = *(a10 + 0); + *(b + 10) = *(a11 + 0); + *(b + 11) = *(a12 + 0); + *(b + 12) = *(a13 + 0); + *(b + 13) = *(a14 + 0); + *(b + 14) = *(a15 + 0); + *(b + 15) = *(a16 + 0); + } + + a1 ++; + a2 ++; + a3 ++; + a4 ++; + a5 ++; + a6 ++; + a7 ++; + a8 ++; + a9 ++; + a10 ++; + a11 ++; + a12 ++; + a13 ++; + a14 ++; + a15 ++; + a16 ++; + b += 16; + ii ++; + } + + jj += 16; + j --; + } + + if (n & 8) { + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + a5 = a + 4 * lda; + a6 = a + 5 * lda; + a7 = a + 6 * lda; + a8 = a + 7 * lda; + a += 8 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 8)) { + *(b + ii - jj) = INV(*(a1 + (ii - jj) * lda)); + for (k = ii - jj + 1; k < 8; k ++) { + *(b + k) = *(a1 + k * lda); + } + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a2 + 0); + *(b + 2) = *(a3 + 0); + *(b + 3) = *(a4 + 0); + *(b + 4) = *(a5 + 0); + *(b + 5) = *(a6 + 0); + *(b + 6) = *(a7 + 0); + *(b + 7) = *(a8 + 0); + } + + a1 ++; + a2 ++; + a3 ++; + a4 ++; + a5 ++; + a6 ++; + a7 ++; + a8 ++; + b += 8; + ii ++; + } + + jj += 8; + } + + if (n & 4) { + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + a += 4 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 4)) { + *(b + ii - jj) = INV(*(a1 + (ii - jj) * lda)); + for (k = ii - jj + 1; k < 4; k ++) { + *(b + k) = *(a1 + k * lda); + } + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a2 + 0); + *(b + 2) = *(a3 + 0); + *(b + 3) = *(a4 + 0); + } + + a1 ++; + a2 ++; + a3 ++; + a4 ++; + b += 4; + ii ++; + } + + jj += 4; + } + + if (n & 2) { + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a += 2 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 2)) { + *(b + ii - jj) = INV(*(a1 + (ii - jj) * lda)); + for (k = ii - jj + 1; k < 2; k ++) { + *(b + k) = *(a1 + k * lda); + } + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a2 + 0); + } + + a1 ++; + a2 ++; + b += 2; + ii ++; + } + + jj += 2; + } + + if (n & 1) { + + a1 = a + 0 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 1)) { + *(b + ii - jj) = INV(*(a1 + (ii - jj) * lda)); + for (k = ii - jj + 1; k < 1; k ++) { + *(b + k) = *(a1 + k * 
lda); + } + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + } + + a1 ++; + b += 1; + ii ++; + } + } + + return 0; +} diff --git a/kernel/generic/trsm_uncopy_2.c b/kernel/generic/trsm_uncopy_2.c new file mode 100644 index 0000000000..f7f3435f9b --- /dev/null +++ b/kernel/generic/trsm_uncopy_2.c @@ -0,0 +1,160 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02, data03, data04; + FLOAT *a1, *a2; + + jj = offset; + + j = (n >> 1); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + + i = (m >> 1); + ii = 0; + while (i > 0) { + + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data03 = *(a2 + 0); +#ifndef UNIT + data04 = *(a2 + 1); +#endif + + + *(b + 0) = INV(data01); + *(b + 1) = data03; + + *(b + 3) = INV(data04); + } + + if (ii < jj) { + + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a2 + 0); + data04 = *(a2 + 1); + + *(b + 0) = data01; + *(b + 1) = data03; + *(b + 2) = data02; + *(b + 3) = data04; + } + + a1 += 2; + a2 += 2; + b += 4; + + i --; + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data02 = *(a2 + 0); + + *(b + 0) = INV(data01); + *(b + 1) = data02; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a2 + 0); + + *(b + 0) = data01; + *(b + 1) = data02; + } + b += 2; + } + + a += 2 * lda; + jj += 2; + j --; + } + + if (n & 1) { + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii < jj) { + data01 = *(a1 + 0); + *(b + 0) = data01; + } + + a1+= 1; + b += 1; + i --; + ii += 1; + } + } + + return 0; +} diff --git a/kernel/generic/trsm_uncopy_4.c b/kernel/generic/trsm_uncopy_4.c new file mode 100644 index 0000000000..837a25019b --- /dev/null +++ b/kernel/generic/trsm_uncopy_4.c @@ -0,0 +1,350 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT *a1, *a2, *a3, *a4; + + jj = offset; + + j = (n >> 2); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + + i = (m >> 2); + ii = 0; + while (i > 0) { + + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data05 = *(a2 + 0); +#ifndef UNIT + data06 = *(a2 + 1); +#endif + + data09 = *(a3 + 0); + data10 = *(a3 + 1); +#ifndef UNIT + data11 = *(a3 + 2); +#endif + + data13 = *(a4 + 0); + data14 = *(a4 + 1); + data15 = *(a4 + 2); +#ifndef UNIT + data16 = *(a4 + 3); +#endif + + *(b + 0) = INV(data01); + *(b + 1) = data05; + *(b + 2) = data09; + *(b + 3) = data13; + + *(b + 5) = INV(data06); + *(b + 6) = data10; + *(b + 7) = data14; + + *(b + 10) = INV(data11); + *(b + 11) = data15; + + *(b + 15) = INV(data16); + } + + if (ii < jj) { + + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + data05 = *(a2 + 0); + data06 = *(a2 + 1); + data07 = *(a2 + 2); + data08 = *(a2 + 3); + + data09 = *(a3 + 0); + data10 = *(a3 + 1); + data11 = *(a3 + 2); + data12 = *(a3 + 3); + + data13 = *(a4 + 0); + data14 = *(a4 + 1); + data15 = *(a4 + 2); + data16 = *(a4 + 3); + + *(b + 0) = data01; + *(b + 1) = data05; + *(b + 2) = data09; + *(b + 3) = data13; + *(b + 4) = data02; + *(b + 5) = data06; + *(b + 6) = data10; + *(b + 7) = data14; + + *(b + 8) = data03; + *(b + 9) = data07; + *(b + 10) = data11; + *(b + 11) = data15; + *(b + 12) = data04; + *(b + 13) = data08; + *(b + 14) = data12; + *(b + 15) = data16; + } + + a1 += 4; + a2 += 4; + a3 += 4; + a4 += 4; + b += 16; + + i --; + ii += 4; + } + + if ((m & 2) != 0) { + + if (ii== jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data05 = *(a2 + 0); +#ifndef UNIT + data06 = *(a2 + 1); +#endif + + data09 = *(a3 + 0); + data10 = *(a3 + 1); + + data13 = *(a4 + 0); + data14 = *(a4 + 1); + + *(b + 0) = INV(data01); + *(b + 1) = data05; + *(b + 2) = data09; + *(b + 3) = data13; + + *(b + 5) = INV(data06); + *(b + 6) = data10; + *(b + 7) = data14; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a2 + 0); + data04 = *(a2 + 1); + data05 = *(a3 + 0); + data06 = *(a3 + 1); + data07 = *(a4 + 0); + data08 = *(a4 + 1); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + } + + a1 += 2; + a2 += 2; + b += 8; + + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data05 = *(a2 + 0); + data09 = *(a3 + 0); + data13 = *(a4 + 0); + + *(b + 0) = INV(data01); + *(b + 1) = data05; + *(b + 2) = data09; + *(b + 3) = data13; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a2 + 0); + data03 = *(a3 + 0); + data04 = *(a4 + 0); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b 
+ 2) = data03; + *(b + 3) = data04; + } + b += 4; + } + + a += 4 * lda; + jj += 4; + j --; + } + + if (n & 2) { + a1 = a + 0 * lda; + a2 = a + 1 * lda; + + i = (m >> 1); + ii = 0; + while (i > 0) { + + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data03 = *(a2 + 0); +#ifndef UNIT + data04 = *(a2 + 1); +#endif + + *(b + 0) = INV(data01); + *(b + 1) = data03; + *(b + 3) = INV(data04); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a2 + 0); + data04 = *(a2 + 1); + + *(b + 0) = data01; + *(b + 1) = data03; + *(b + 2) = data02; + *(b + 3) = data04; + } + + a1 += 2; + a2 += 2; + b += 4; + + i --; + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== jj) { + + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data03 = *(a2 + 0); + + *(b + 0) = INV(data01); + *(b + 1) = data03; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a2 + 0); + *(b + 0) = data01; + *(b + 1) = data02; + } + b += 2; + } + a += 2 * lda; + jj += 2; + } + + if (n & 1) { + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii < jj) { + data01 = *(a1 + 0); + *(b + 0) = data01; + } + + a1+= 1; + b += 1; + i --; + ii += 1; + } + } + + return 0; +} diff --git a/kernel/generic/trsm_uncopy_8.c b/kernel/generic/trsm_uncopy_8.c new file mode 100644 index 0000000000..8c5623dffc --- /dev/null +++ b/kernel/generic/trsm_uncopy_8.c @@ -0,0 +1,946 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include "common.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT data17, data18, data19, data20, data21, data22, data23, data24; + FLOAT data25, data26, data27, data28, data29, data30, data31, data32; + FLOAT data33, data34, data35, data36, data37, data38, data39, data40; + FLOAT data41, data42, data43, data44, data45, data46, data47, data48; + FLOAT data49, data50, data51, data52, data53, data54, data55, data56; + FLOAT data57, data58, data59, data60, data61, data62, data63, data64; + + FLOAT *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8; + + jj = offset; + + j = (n >> 3); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + a5 = a + 4 * lda; + a6 = a + 5 * lda; + a7 = a + 6 * lda; + a8 = a + 7 * lda; + + ii = 0; + + i = (m >> 3); + while (i > 0) { + + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data09 = *(a2 + 0); +#ifndef UNIT + data10 = *(a2 + 1); +#endif + + data17 = *(a3 + 0); + data18 = *(a3 + 1); +#ifndef UNIT + data19 = *(a3 + 2); +#endif + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); +#ifndef UNIT + data28 = *(a4 + 3); +#endif + + data33 = *(a5 + 0); + data34 = *(a5 + 1); + data35 = *(a5 + 2); + data36 = *(a5 + 3); +#ifndef UNIT + data37 = *(a5 + 4); +#endif + + data41 = *(a6 + 0); + data42 = *(a6 + 1); + data43 = *(a6 + 2); + data44 = *(a6 + 3); + data45 = *(a6 + 4); +#ifndef UNIT + data46 = *(a6 + 5); +#endif + + data49 = *(a7 + 0); + data50 = *(a7 + 1); + data51 = *(a7 + 2); + data52 = *(a7 + 3); + data53 = *(a7 + 4); + data54 = *(a7 + 5); +#ifndef UNIT + data55 = *(a7 + 6); +#endif + + data57 = *(a8 + 0); + data58 = *(a8 + 1); + data59 = *(a8 + 2); + data60 = *(a8 + 3); + data61 = *(a8 + 4); + data62 = *(a8 + 5); + data63 = *(a8 + 6); +#ifndef UNIT + data64 = *(a8 + 7); +#endif + + *(b + 0) = INV(data01); + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data33; + *(b + 5) = data41; + *(b + 6) = data49; + *(b + 7) = data57; + + *(b + 9) = INV(data10); + *(b + 10) = data18; + *(b + 11) = data26; + *(b + 12) = data34; + *(b + 13) = data42; + *(b + 14) = data50; + *(b + 15) = data58; + + *(b + 18) = INV(data19); + *(b + 19) = data27; + *(b + 20) = data35; + *(b + 21) = data43; + *(b + 22) = data51; + *(b + 23) = data59; + + *(b + 27) = INV(data28); + *(b + 28) = data36; + *(b + 29) = data44; + *(b + 30) = data52; + *(b + 31) = data60; + + *(b + 36) = INV(data37); + *(b + 37) = data45; + *(b + 38) = data53; + *(b + 39) = data61; + + *(b + 45) = INV(data46); + *(b + 46) = data54; + *(b + 47) = data62; + + *(b + 54) = INV(data55); + *(b + 55) = data63; + + *(b + 63) = INV(data64); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + data07 = *(a1 + 6); + data08 = *(a1 + 7); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); + data15 = *(a2 + 6); + data16 = *(a2 + 7); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); + data21 = *(a3 + 4); + data22 = *(a3 + 5); + data23 = 
*(a3 + 6); + data24 = *(a3 + 7); + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + data29 = *(a4 + 4); + data30 = *(a4 + 5); + data31 = *(a4 + 6); + data32 = *(a4 + 7); + + data33 = *(a5 + 0); + data34 = *(a5 + 1); + data35 = *(a5 + 2); + data36 = *(a5 + 3); + data37 = *(a5 + 4); + data38 = *(a5 + 5); + data39 = *(a5 + 6); + data40 = *(a5 + 7); + + data41 = *(a6 + 0); + data42 = *(a6 + 1); + data43 = *(a6 + 2); + data44 = *(a6 + 3); + data45 = *(a6 + 4); + data46 = *(a6 + 5); + data47 = *(a6 + 6); + data48 = *(a6 + 7); + + data49 = *(a7 + 0); + data50 = *(a7 + 1); + data51 = *(a7 + 2); + data52 = *(a7 + 3); + data53 = *(a7 + 4); + data54 = *(a7 + 5); + data55 = *(a7 + 6); + data56 = *(a7 + 7); + + data57 = *(a8 + 0); + data58 = *(a8 + 1); + data59 = *(a8 + 2); + data60 = *(a8 + 3); + data61 = *(a8 + 4); + data62 = *(a8 + 5); + data63 = *(a8 + 6); + data64 = *(a8 + 7); + + *(b + 0) = data01; + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data33; + *(b + 5) = data41; + *(b + 6) = data49; + *(b + 7) = data57; + + *(b + 8) = data02; + *(b + 9) = data10; + *(b + 10) = data18; + *(b + 11) = data26; + *(b + 12) = data34; + *(b + 13) = data42; + *(b + 14) = data50; + *(b + 15) = data58; + + *(b + 16) = data03; + *(b + 17) = data11; + *(b + 18) = data19; + *(b + 19) = data27; + *(b + 20) = data35; + *(b + 21) = data43; + *(b + 22) = data51; + *(b + 23) = data59; + + *(b + 24) = data04; + *(b + 25) = data12; + *(b + 26) = data20; + *(b + 27) = data28; + *(b + 28) = data36; + *(b + 29) = data44; + *(b + 30) = data52; + *(b + 31) = data60; + + *(b + 32) = data05; + *(b + 33) = data13; + *(b + 34) = data21; + *(b + 35) = data29; + *(b + 36) = data37; + *(b + 37) = data45; + *(b + 38) = data53; + *(b + 39) = data61; + + *(b + 40) = data06; + *(b + 41) = data14; + *(b + 42) = data22; + *(b + 43) = data30; + *(b + 44) = data38; + *(b + 45) = data46; + *(b + 46) = data54; + *(b + 47) = data62; + + *(b + 48) = data07; + *(b + 49) = data15; + *(b + 50) = data23; + *(b + 51) = data31; + *(b + 52) = data39; + *(b + 53) = data47; + *(b + 54) = data55; + *(b + 55) = data63; + + *(b + 56) = data08; + *(b + 57) = data16; + *(b + 58) = data24; + *(b + 59) = data32; + *(b + 60) = data40; + *(b + 61) = data48; + *(b + 62) = data56; + *(b + 63) = data64; + } + + a1 += 8; + a2 += 8; + a3 += 8; + a4 += 8; + a5 += 8; + a6 += 8; + a7 += 8; + a8 += 8; + b += 64; + + i --; + ii += 8; + } + + if (m & 4) { + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data09 = *(a2 + 0); +#ifndef UNIT + data10 = *(a2 + 1); +#endif + + data17 = *(a3 + 0); + data18 = *(a3 + 1); +#ifndef UNIT + data19 = *(a3 + 2); +#endif + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); +#ifndef UNIT + data28 = *(a4 + 3); +#endif + + data33 = *(a5 + 0); + data34 = *(a5 + 1); + data35 = *(a5 + 2); + data36 = *(a5 + 3); + + data41 = *(a6 + 0); + data42 = *(a6 + 1); + data43 = *(a6 + 2); + data44 = *(a6 + 3); + + data49 = *(a7 + 0); + data50 = *(a7 + 1); + data51 = *(a7 + 2); + data52 = *(a7 + 3); + + data57 = *(a8 + 0); + data58 = *(a8 + 1); + data59 = *(a8 + 2); + data60 = *(a8 + 3); + + *(b + 0) = INV(data01); + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data33; + *(b + 5) = data41; + *(b + 6) = data49; + *(b + 7) = data57; + + *(b + 9) = INV(data10); + *(b + 10) = data18; + *(b + 11) = data26; + *(b + 12) = data34; + *(b + 13) = data42; + *(b + 14) = data50; + *(b + 15) = data58; + + *(b + 18) = INV(data19); + *(b + 19) 
= data27; + *(b + 20) = data35; + *(b + 21) = data43; + *(b + 22) = data51; + *(b + 23) = data59; + + *(b + 27) = INV(data28); + *(b + 28) = data36; + *(b + 29) = data44; + *(b + 30) = data52; + *(b + 31) = data60; + + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + + data33 = *(a5 + 0); + data34 = *(a5 + 1); + data35 = *(a5 + 2); + data36 = *(a5 + 3); + data41 = *(a6 + 0); + data42 = *(a6 + 1); + data43 = *(a6 + 2); + data44 = *(a6 + 3); + + data49 = *(a7 + 0); + data50 = *(a7 + 1); + data51 = *(a7 + 2); + data52 = *(a7 + 3); + data57 = *(a8 + 0); + data58 = *(a8 + 1); + data59 = *(a8 + 2); + data60 = *(a8 + 3); + + *(b + 0) = data01; + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data33; + *(b + 5) = data41; + *(b + 6) = data49; + *(b + 7) = data57; + + *(b + 8) = data02; + *(b + 9) = data10; + *(b + 10) = data18; + *(b + 11) = data26; + *(b + 12) = data34; + *(b + 13) = data42; + *(b + 14) = data50; + *(b + 15) = data58; + + *(b + 16) = data03; + *(b + 17) = data11; + *(b + 18) = data19; + *(b + 19) = data27; + *(b + 20) = data35; + *(b + 21) = data43; + *(b + 22) = data51; + *(b + 23) = data59; + + *(b + 24) = data04; + *(b + 25) = data12; + *(b + 26) = data20; + *(b + 27) = data28; + *(b + 28) = data36; + *(b + 29) = data44; + *(b + 30) = data52; + *(b + 31) = data60; + } + + a1 += 4; + a2 += 4; + a3 += 4; + a4 += 4; + a5 += 4; + a6 += 4; + a7 += 4; + a8 += 4; + b += 32; + ii += 4; + } + + if (m & 2) { + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data09 = *(a2 + 0); +#ifndef UNIT + data10 = *(a2 + 1); +#endif + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data25 = *(a4 + 0); + data26 = *(a4 + 1); + + data33 = *(a5 + 0); + data34 = *(a5 + 1); + data41 = *(a6 + 0); + data42 = *(a6 + 1); + + data49 = *(a7 + 0); + data50 = *(a7 + 1); + data57 = *(a8 + 0); + data58 = *(a8 + 1); + + *(b + 0) = INV(data01); + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data33; + *(b + 5) = data41; + *(b + 6) = data49; + *(b + 7) = data57; + + *(b + 9) = INV(data10); + *(b + 10) = data18; + *(b + 11) = data26; + *(b + 12) = data34; + *(b + 13) = data42; + *(b + 14) = data50; + *(b + 15) = data58; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data25 = *(a4 + 0); + data26 = *(a4 + 1); + + data33 = *(a5 + 0); + data34 = *(a5 + 1); + data41 = *(a6 + 0); + data42 = *(a6 + 1); + data49 = *(a7 + 0); + data50 = *(a7 + 1); + data57 = *(a8 + 0); + data58 = *(a8 + 1); + + *(b + 0) = data01; + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data33; + *(b + 5) = data41; + *(b + 6) = data49; + *(b + 7) = data57; + + *(b + 8) = data02; + *(b + 9) = data10; + *(b + 10) = data18; + *(b + 11) = data26; + *(b + 12) = data34; + *(b + 13) = data42; + *(b + 14) = data50; + *(b + 15) = data58; + } + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + a5 += 2; + a6 += 2; + a7 += 2; + a8 += 2; + b += 16; + ii += 2; + } + + if (m & 1) { + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data09 = *(a2 + 0); + data17 = *(a3 + 0); + data25 = *(a4 + 0); + data33 = *(a5 + 0); + data41 = *(a6 + 
0); + data49 = *(a7 + 0); + data57 = *(a8 + 0); + + *(b + 0) = INV(data01); + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data33; + *(b + 5) = data41; + *(b + 6) = data49; + *(b + 7) = data57; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data25 = *(a4 + 0); + data26 = *(a4 + 1); + + *(b + 0) = data01; + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data33; + *(b + 5) = data41; + *(b + 6) = data49; + *(b + 7) = data57; + } + b += 8; + ii += 1; + } + + a += 8 * lda; + jj += 8; + j --; + } + + + if (n & 4) { + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + + ii = 0; + + i = (m >> 2); + while (i > 0) { + + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data09 = *(a2 + 0); +#ifndef UNIT + data10 = *(a2 + 1); +#endif + + data17 = *(a3 + 0); + data18 = *(a3 + 1); +#ifndef UNIT + data19 = *(a3 + 2); +#endif + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); +#ifndef UNIT + data28 = *(a4 + 3); +#endif + + *(b + 0) = INV(data01); + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + + *(b + 5) = INV(data10); + *(b + 6) = data18; + *(b + 7) = data26; + + *(b + 10) = INV(data19); + *(b + 11) = data27; + + *(b + 15) = INV(data28); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + + *(b + 0) = data01; + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data02; + *(b + 5) = data10; + *(b + 6) = data18; + *(b + 7) = data26; + + *(b + 8) = data03; + *(b + 9) = data11; + *(b + 10) = data19; + *(b + 11) = data27; + *(b + 12) = data04; + *(b + 13) = data12; + *(b + 14) = data20; + *(b + 15) = data28; + } + + a1 += 4; + a2 += 4; + a3 += 4; + a4 += 4; + b += 16; + + i --; + ii += 4; + } + + if (m & 2) { + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data09 = *(a2 + 0); +#ifndef UNIT + data10 = *(a2 + 1); +#endif + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data25 = *(a4 + 0); + data26 = *(a4 + 1); + + *(b + 0) = INV(data01); + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + + *(b + 5) = INV(data10); + *(b + 6) = data18; + *(b + 7) = data26; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data25 = *(a4 + 0); + data26 = *(a4 + 1); + + *(b + 0) = data01; + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + *(b + 4) = data02; + *(b + 5) = data10; + *(b + 6) = data18; + *(b + 7) = data26; + } + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + b += 8; + ii += 2; + } + + if (m & 1) { + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data09 = *(a2 + 0); + data17 = *(a3 + 0); + data25 = *(a4 + 0); + + *(b + 0) = INV(data01); + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data09 = *(a2 + 0); + data17 = *(a3 + 0); + data25 = *(a4 + 0); + + *(b + 0) = data01; + *(b + 1) = data09; + *(b + 2) = data17; + *(b + 3) = data25; + } + b += 4; + ii += 1; + } + + a += 4 * lda; + jj += 4; + } + + if (n & 2) { + 
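+	/* Leftover pair of columns (n & 2): the same packing applied to a
+	   2-column panel.  For the diagonal 2x2 block only the upper-triangular
+	   entries are written, with the diagonal stored pre-inverted by INV()
+	   (ONE in the UNIT case) so the solve kernels can multiply by the
+	   reciprocal instead of dividing; blocks below the diagonal are skipped
+	   and b is simply advanced. */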
a1 = a + 0 * lda; + a2 = a + 1 * lda; + + ii = 0; + + i = (m >> 1); + while (i > 0) { + + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data09 = *(a2 + 0); +#ifndef UNIT + data10 = *(a2 + 1); +#endif + + *(b + 0) = INV(data01); + *(b + 1) = data09; + + *(b + 3) = INV(data10); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data09 = *(a2 + 0); + data10 = *(a2 + 1); + + *(b + 0) = data01; + *(b + 1) = data09; + *(b + 2) = data02; + *(b + 3) = data10; + } + + a1 += 2; + a2 += 2; + b += 4; + + i --; + ii += 2; + } + + if (m & 1) { + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data09 = *(a2 + 0); + + *(b + 0) = INV(data01); + *(b + 1) = data09; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data09 = *(a2 + 0); + + *(b + 0) = data01; + *(b + 1) = data09; + } + b += 2; + ii += 1; + } + + a += 2 * lda; + jj += 2; + } + + if (n & 1) { + a1 = a + 0 * lda; + + ii = 0; + + i = m; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii < jj) { + data01 = *(a1 + 0); + *(b + 0) = data01; + } + + a1 += 1; + b += 1; + i --; + ii ++; + } + } + + return 0; +} diff --git a/kernel/generic/trsm_utcopy_1.c b/kernel/generic/trsm_utcopy_1.c new file mode 100644 index 0000000000..ea490d5314 --- /dev/null +++ b/kernel/generic/trsm_utcopy_1.c @@ -0,0 +1,89 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + +#ifndef UNIT + FLOAT data01; +#endif + FLOAT *a1; + + jj = offset; + + j = n; + while (j > 0){ + + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) *(b + 0) = *(a1 + 0); + + a1 += lda; + b ++; + i --; + ii ++; + } + + a ++; + jj ++; + j --; + } + + return 0; +} diff --git a/kernel/generic/trsm_utcopy_16.c b/kernel/generic/trsm_utcopy_16.c new file mode 100644 index 0000000000..546641242c --- /dev/null +++ b/kernel/generic/trsm_utcopy_16.c @@ -0,0 +1,225 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj, k; + + FLOAT *a1; + + jj = offset; + + j = (n >> 4); + while (j > 0){ + + a1 = a; + a += 16; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 16)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k) = *(a1 + k); + } + + *(b + ii - jj) = INV(*(a1 + ii - jj)); + } + + if (ii - jj >= 16) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + *(b + 4) = *(a1 + 4); + *(b + 5) = *(a1 + 5); + *(b + 6) = *(a1 + 6); + *(b + 7) = *(a1 + 7); + *(b + 8) = *(a1 + 8); + *(b + 9) = *(a1 + 9); + *(b + 10) = *(a1 + 10); + *(b + 11) = *(a1 + 11); + *(b + 12) = *(a1 + 12); + *(b + 13) = *(a1 + 13); + *(b + 14) = *(a1 + 14); + *(b + 15) = *(a1 + 15); + } + + b += 16; + a1 += lda; + ii ++; + } + + jj += 16; + j --; + } + + j = (n & 8); + if (j > 0) { + a1 = a; + a += 8; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 8)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k) = *(a1 + k); + } + + *(b + ii - jj) = INV(*(a1 + ii - jj)); + } + + if (ii - jj >= 8) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + *(b + 4) = *(a1 + 4); + *(b + 5) = *(a1 + 5); + *(b + 6) = *(a1 + 6); + *(b + 7) = *(a1 + 7); + } + + b += 8; + a1 += lda; + ii ++; + } + + jj += 8; + } + + j = (n & 4); + if (j > 0) { + + a1 = a; + a += 4; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 4)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k) = *(a1 + k); + } + + *(b + ii - jj) = INV(*(a1 + ii - jj)); + } + + if (ii - jj >= 4) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + } + + b += 4; + a1 += lda; + ii ++; + } + + jj += 4; + } + + j = (n & 2); + if (j > 0) { + + a1 = a; + a += 2; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 2)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k) = *(a1 + k); + } + + *(b + ii - jj) = INV(*(a1 + ii - jj)); + } + + if (ii - jj >= 2) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + } + + b += 2; + a1 += lda; + ii ++; + } + + jj += 2; + } + + j = (n & 1); + if (j > 0) { + + a1 = a; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 1)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k) = *(a1 + k); + } + + *(b + ii - jj) = INV(*(a1 + ii - jj)); + } + + if (ii - jj >= 1) { + *(b + 0) = *(a1 + 0); + } + + b += 1; + a1 += lda; + ii ++; + } + } + + return 0; +} diff --git a/kernel/generic/trsm_utcopy_2.c b/kernel/generic/trsm_utcopy_2.c new file mode 100644 index 0000000000..3def611eb9 --- /dev/null +++ b/kernel/generic/trsm_utcopy_2.c @@ -0,0 +1,155 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02, data03, data04; + FLOAT *a1, *a2; + + jj = offset; + + j = (n >> 1); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + + i = (m >> 1); + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data03 = *(a2 + 0); +#ifndef UNIT + data04 = *(a2 + 1); +#endif + + *(b + 0) = INV(data01); + + *(b + 2) = data03; + *(b + 3) = INV(data04); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a2 + 0); + data04 = *(a2 + 1); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + + a1 += 2 * lda; + a2 += 2 * lda; + b += 4; + + i --; + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + + *(b + 0) = data01; + *(b + 1) = data02; + } + b += 2; + } + + a += 2; + jj += 2; + j --; + } + + if (n & 1) { + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + *(b + 0) = data01; + } + + a1 += 1 * lda; + b += 1; + + i --; + ii += 1; + } + } + + return 0; +} diff --git a/kernel/generic/trsm_utcopy_4.c b/kernel/generic/trsm_utcopy_4.c new file mode 100644 index 0000000000..bbba78d535 --- /dev/null +++ b/kernel/generic/trsm_utcopy_4.c @@ -0,0 +1,322 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT *a1, *a2, *a3, *a4; + + jj = offset; + + j = (n >> 2); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + + i = (m >> 2); + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data05 = *(a2 + 0); +#ifndef UNIT + data06 = *(a2 + 1); +#endif + + data09 = *(a3 + 0); + data10 = *(a3 + 1); +#ifndef UNIT + data11 = *(a3 + 2); +#endif + + data13 = *(a4 + 0); + data14 = *(a4 + 1); + data15 = *(a4 + 2); +#ifndef UNIT + data16 = *(a4 + 3); +#endif + + *(b + 0) = INV(data01); + + *(b + 4) = data05; + *(b + 5) = INV(data06); + + *(b + 8) = data09; + *(b + 9) = data10; + *(b + 10) = INV(data11); + + *(b + 12) = data13; + *(b + 13) = data14; + *(b + 14) = data15; + *(b + 15) = INV(data16); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + data05 = *(a2 + 0); + data06 = *(a2 + 1); + data07 = *(a2 + 2); + data08 = *(a2 + 3); + + data09 = *(a3 + 0); + data10 = *(a3 + 1); + data11 = *(a3 + 2); + data12 = *(a3 + 3); + + data13 = *(a4 + 0); + data14 = *(a4 + 1); + data15 = *(a4 + 2); + data16 = *(a4 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + + *(b + 8) = data09; + *(b + 9) = data10; + *(b + 10) = data11; + *(b + 11) = data12; + *(b + 12) = data13; + *(b + 13) = data14; + *(b + 14) = data15; + *(b + 15) = data16; + } + + a1 += 4 * lda; + a2 += 4 * lda; + a3 += 4 * lda; + a4 += 4 * lda; + b 
+= 16; + + i --; + ii += 4; + } + + if ((m & 2) != 0) { + + if (ii== jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data05 = *(a2 + 0); +#ifndef UNIT + data06 = *(a2 + 1); +#endif + + *(b + 0) = INV(data01); + + *(b + 4) = data05; + *(b + 5) = INV(data06); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + data05 = *(a2 + 0); + data06 = *(a2 + 1); + data07 = *(a2 + 2); + data08 = *(a2 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + } + + a1 += 2 * lda; + a2 += 2 * lda; + b += 8; + + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + b += 4; + } + + a += 4; + jj += 4; + j --; + } + + if (n & 2) { + a1 = a + 0 * lda; + a2 = a + 1 * lda; + + i = (m >> 1); + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + data03 = *(a2 + 0); +#ifndef UNIT + data04 = *(a2 + 1); +#endif + + *(b + 0) = INV(data01); + *(b + 2) = data03; + *(b + 3) = INV(data04); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a2 + 0); + data04 = *(a2 + 1); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + + a1 += 2 * lda; + a2 += 2 * lda; + b += 4; + + i --; + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + *(b + 0) = data01; + *(b + 1) = data02; + } + b += 2; + } + a += 2; + jj += 2; + } + + if (n & 1) { + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + *(b + 0) = data01; + } + + a1 += 1 * lda; + b += 1; + + i --; + ii += 1; + } + } + + return 0; +} diff --git a/kernel/generic/trsm_utcopy_8.c b/kernel/generic/trsm_utcopy_8.c new file mode 100644 index 0000000000..531ac59e4c --- /dev/null +++ b/kernel/generic/trsm_utcopy_8.c @@ -0,0 +1,803 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef UNIT +#define INV(a) (ONE / (a)) +#else +#define INV(a) (ONE) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT data17, data18, data19, data20, data21, data22, data23, data24; + FLOAT data25, data26, data27, data28, data29, data30, data31, data32; + FLOAT data33, data34, data35, data36, data37, data38, data39, data40; + FLOAT data41, data42, data43, data44, data45, data46, data47, data48; + FLOAT data49, data50, data51, data52, data53, data54, data55, data56; + FLOAT data57, data58, data59, data60, data61, data62, data63, data64; + + FLOAT *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8; + + jj = offset; + + j = (n >> 3); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + a5 = a + 4 * lda; + a6 = a + 5 * lda; + a7 = a + 6 * lda; + a8 = a + 7 * lda; + + i = (m >> 3); + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data09 = *(a2 + 0); +#ifndef UNIT + data10 = *(a2 + 1); +#endif + + data17 = *(a3 + 0); + data18 = *(a3 + 1); +#ifndef UNIT + data19 = *(a3 + 2); +#endif + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); +#ifndef UNIT + data28 = *(a4 + 3); +#endif + + data33 = *(a5 + 0); + data34 = *(a5 + 1); + data35 = *(a5 + 2); + data36 = *(a5 + 3); +#ifndef UNIT + data37 = *(a5 + 4); +#endif + + data41 = *(a6 + 0); + data42 = *(a6 + 1); + data43 = *(a6 + 2); + data44 = *(a6 + 3); + data45 = *(a6 + 4); +#ifndef UNIT + data46 = *(a6 + 5); +#endif + + data49 = *(a7 + 0); + data50 = *(a7 + 1); + data51 = *(a7 + 2); + data52 = *(a7 + 3); + data53 = *(a7 + 4); + data54 = *(a7 + 5); +#ifndef UNIT + data55 = *(a7 + 6); +#endif + + data57 = *(a8 + 0); + data58 = *(a8 + 1); + data59 = *(a8 + 2); + data60 = *(a8 + 3); + data61 = *(a8 + 4); + data62 = *(a8 + 5); + data63 = *(a8 + 6); +#ifndef UNIT + data64 = *(a8 + 7); +#endif + + *(b + 0) = INV(data01); + + *(b + 8) = data09; + *(b + 9) = INV(data10); + + *(b + 16) = data17; + *(b + 17) = data18; + *(b + 18) = INV(data19); + + *(b + 24) = data25; + *(b + 25) = data26; + *(b + 26) = data27; + *(b + 27) = INV(data28); + + *(b + 32) = data33; + *(b + 33) = data34; + *(b + 34) = data35; + *(b + 35) = data36; + *(b + 36) = INV(data37); + + *(b + 40) = data41; + *(b + 41) = data42; + *(b + 42) = data43; + *(b + 43) = data44; + *(b + 44) = data45; + *(b + 45) = INV(data46); + + 
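+
+	  /* Diagonal 8x8 block of the transposed copy: row k of the packed
+	     tile receives the leading entries of source column k up to and
+	     including the diagonal element, which is stored as INV() (ONE when
+	     UNIT is defined); the strictly upper part of the tile is left
+	     unwritten. */
+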
*(b + 48) = data49; + *(b + 49) = data50; + *(b + 50) = data51; + *(b + 51) = data52; + *(b + 52) = data53; + *(b + 53) = data54; + *(b + 54) = INV(data55); + + *(b + 56) = data57; + *(b + 57) = data58; + *(b + 58) = data59; + *(b + 59) = data60; + *(b + 60) = data61; + *(b + 61) = data62; + *(b + 62) = data63; + *(b + 63) = INV(data64); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + data07 = *(a1 + 6); + data08 = *(a1 + 7); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); + data15 = *(a2 + 6); + data16 = *(a2 + 7); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); + data21 = *(a3 + 4); + data22 = *(a3 + 5); + data23 = *(a3 + 6); + data24 = *(a3 + 7); + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + data29 = *(a4 + 4); + data30 = *(a4 + 5); + data31 = *(a4 + 6); + data32 = *(a4 + 7); + + data33 = *(a5 + 0); + data34 = *(a5 + 1); + data35 = *(a5 + 2); + data36 = *(a5 + 3); + data37 = *(a5 + 4); + data38 = *(a5 + 5); + data39 = *(a5 + 6); + data40 = *(a5 + 7); + + data41 = *(a6 + 0); + data42 = *(a6 + 1); + data43 = *(a6 + 2); + data44 = *(a6 + 3); + data45 = *(a6 + 4); + data46 = *(a6 + 5); + data47 = *(a6 + 6); + data48 = *(a6 + 7); + + data49 = *(a7 + 0); + data50 = *(a7 + 1); + data51 = *(a7 + 2); + data52 = *(a7 + 3); + data53 = *(a7 + 4); + data54 = *(a7 + 5); + data55 = *(a7 + 6); + data56 = *(a7 + 7); + + data57 = *(a8 + 0); + data58 = *(a8 + 1); + data59 = *(a8 + 2); + data60 = *(a8 + 3); + data61 = *(a8 + 4); + data62 = *(a8 + 5); + data63 = *(a8 + 6); + data64 = *(a8 + 7); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + *(b + 8) = data09; + *(b + 9) = data10; + *(b + 10) = data11; + *(b + 11) = data12; + *(b + 12) = data13; + *(b + 13) = data14; + *(b + 14) = data15; + *(b + 15) = data16; + + *(b + 16) = data17; + *(b + 17) = data18; + *(b + 18) = data19; + *(b + 19) = data20; + *(b + 20) = data21; + *(b + 21) = data22; + *(b + 22) = data23; + *(b + 23) = data24; + *(b + 24) = data25; + *(b + 25) = data26; + *(b + 26) = data27; + *(b + 27) = data28; + *(b + 28) = data29; + *(b + 29) = data30; + *(b + 30) = data31; + *(b + 31) = data32; + + *(b + 32) = data33; + *(b + 33) = data34; + *(b + 34) = data35; + *(b + 35) = data36; + *(b + 36) = data37; + *(b + 37) = data38; + *(b + 38) = data39; + *(b + 39) = data40; + *(b + 40) = data41; + *(b + 41) = data42; + *(b + 42) = data43; + *(b + 43) = data44; + *(b + 44) = data45; + *(b + 45) = data46; + *(b + 46) = data47; + *(b + 47) = data48; + + *(b + 48) = data49; + *(b + 49) = data50; + *(b + 50) = data51; + *(b + 51) = data52; + *(b + 52) = data53; + *(b + 53) = data54; + *(b + 54) = data55; + *(b + 55) = data56; + *(b + 56) = data57; + *(b + 57) = data58; + *(b + 58) = data59; + *(b + 59) = data60; + *(b + 60) = data61; + *(b + 61) = data62; + *(b + 62) = data63; + *(b + 63) = data64; + } + + a1 += 8 * lda; + a2 += 8 * lda; + a3 += 8 * lda; + a4 += 8 * lda; + a5 += 8 * lda; + a6 += 8 * lda; + a7 += 8 * lda; + a8 += 8 * lda; + b += 64; + + i --; + ii += 8; + } + + if (m & 4) { + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data09 = *(a2 + 0); +#ifndef UNIT + data10 = *(a2 + 1); +#endif + + data17 = *(a3 + 0); + data18 = 
*(a3 + 1); +#ifndef UNIT + data19 = *(a3 + 2); +#endif + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); +#ifndef UNIT + data28 = *(a4 + 3); +#endif + + *(b + 0) = INV(data01); + + *(b + 8) = data09; + *(b + 9) = INV(data10); + + *(b + 16) = data17; + *(b + 17) = data18; + *(b + 18) = INV(data19); + + *(b + 24) = data25; + *(b + 25) = data26; + *(b + 26) = data27; + *(b + 27) = INV(data28); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + data07 = *(a1 + 6); + data08 = *(a1 + 7); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); + data15 = *(a2 + 6); + data16 = *(a2 + 7); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); + data21 = *(a3 + 4); + data22 = *(a3 + 5); + data23 = *(a3 + 6); + data24 = *(a3 + 7); + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + data29 = *(a4 + 4); + data30 = *(a4 + 5); + data31 = *(a4 + 6); + data32 = *(a4 + 7); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + *(b + 8) = data09; + *(b + 9) = data10; + *(b + 10) = data11; + *(b + 11) = data12; + *(b + 12) = data13; + *(b + 13) = data14; + *(b + 14) = data15; + *(b + 15) = data16; + + *(b + 16) = data17; + *(b + 17) = data18; + *(b + 18) = data19; + *(b + 19) = data20; + *(b + 20) = data21; + *(b + 21) = data22; + *(b + 22) = data23; + *(b + 23) = data24; + *(b + 24) = data25; + *(b + 25) = data26; + *(b + 26) = data27; + *(b + 27) = data28; + *(b + 28) = data29; + *(b + 29) = data30; + *(b + 30) = data31; + *(b + 31) = data32; + } + + a1 += 4 * lda; + a2 += 4 * lda; + a3 += 4 * lda; + a4 += 4 * lda; + b += 32; + ii += 4; + } + + if (m & 2) { + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data09 = *(a2 + 0); +#ifndef UNIT + data10 = *(a2 + 1); +#endif + + *(b + 0) = INV(data01); + *(b + 8) = data09; + *(b + 9) = INV(data10); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + data07 = *(a1 + 6); + data08 = *(a1 + 7); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); + data15 = *(a2 + 6); + data16 = *(a2 + 7); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + *(b + 8) = data09; + *(b + 9) = data10; + *(b + 10) = data11; + *(b + 11) = data12; + *(b + 12) = data13; + *(b + 13) = data14; + *(b + 14) = data15; + *(b + 15) = data16; + } + + a1 += 2 * lda; + a2 += 2 * lda; + b += 16; + ii += 2; + } + + if (m & 1) { + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + data07 = *(a1 + 6); + data08 = *(a1 + 7); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + } + b += 8; + } + + a += 8; + jj += 8; + j --; + } + + if (n & 4) { + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * 
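+	/* n & 4: the remaining four columns are packed into 4x4 tiles using
+	   the same layout as the eight-column loop above. */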
lda; + a4 = a + 3 * lda; + + i = (m >> 2); + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data09 = *(a2 + 0); +#ifndef UNIT + data10 = *(a2 + 1); +#endif + + data17 = *(a3 + 0); + data18 = *(a3 + 1); +#ifndef UNIT + data19 = *(a3 + 2); +#endif + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); +#ifndef UNIT + data28 = *(a4 + 3); +#endif + + *(b + 0) = INV(data01); + *(b + 4) = data09; + *(b + 5) = INV(data10); + + *(b + 8) = data17; + *(b + 9) = data18; + *(b + 10) = INV(data19); + + *(b + 12) = data25; + *(b + 13) = data26; + *(b + 14) = data27; + *(b + 15) = INV(data28); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data09; + *(b + 5) = data10; + *(b + 6) = data11; + *(b + 7) = data12; + + *(b + 8) = data17; + *(b + 9) = data18; + *(b + 10) = data19; + *(b + 11) = data20; + *(b + 12) = data25; + *(b + 13) = data26; + *(b + 14) = data27; + *(b + 15) = data28; + } + + a1 += 4 * lda; + a2 += 4 * lda; + a3 += 4 * lda; + a4 += 4 * lda; + b += 16; + i --; + ii += 4; + } + + if (m & 2) { + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data09 = *(a2 + 0); +#ifndef UNIT + data10 = *(a2 + 1); +#endif + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data09; + *(b + 5) = data10; + *(b + 6) = data11; + *(b + 7) = data12; + } + + a1 += 2 * lda; + a2 += 2 * lda; + b += 8; + ii += 2; + } + + if (m & 1) { + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + b += 4; + } + a += 4; + jj += 4; + } + + if (n & 2) { + a1 = a + 0 * lda; + a2 = a + 1 * lda; + + i = (m >> 1); + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + + data09 = *(a2 + 0); +#ifndef UNIT + data10 = *(a2 + 1); +#endif + + *(b + 0) = INV(data01); + *(b + 2) = data09; + *(b + 3) = INV(data10); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data09 = *(a2 + 0); + data10 = *(a2 + 1); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data09; + *(b + 3) = data10; + } + + a1 += 2 * lda; + a2 += 2 * lda; + b += 4; + i --; + ii += 2; + } + + if (m & 1) { + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + + *(b + 0) = data01; + *(b + 1) = data02; + } + b += 2; + } + a += 2; + jj += 2; + } + + if (n & 1) { + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); +#endif + *(b + 0) = INV(data01); + } + + if (ii > jj) { + data01 = *(a1 + 0); + *(b + 0) = data01; + } + + a1 += lda; + b += 1; + i --; + ii += 1; + } + } + + return 0; +} diff 
--git a/kernel/generic/zgemm3m_ncopy_1.c b/kernel/generic/zgemm3m_ncopy_1.c new file mode 100644 index 0000000000..7ac734b4c7 --- /dev/null +++ b/kernel/generic/zgemm3m_ncopy_1.c @@ -0,0 +1,89 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef USE_ALPHA +#define REAL_PART(a, b) (a) +#define IMAGE_PART(a, b) (b) +#else +#define REAL_PART(a, b) (alpha_r * (a) - alpha_i * (b)) +#define IMAGE_PART(a, b) (alpha_i * (a) + alpha_r * (b)) +#endif + +#if defined(REAL_ONLY) +#define CMULT(a, b) (REAL_PART(a, b)) +#elif defined(IMAGE_ONLY) +#define CMULT(a, b) (IMAGE_PART(a, b)) +#else +#define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, +#ifdef USE_ALPHA + FLOAT alpha_r, FLOAT alpha_i, +#endif + FLOAT *b){ + + BLASLONG i; + + FLOAT *a_offset, a1, a2; + + lda *= 2; + + while (n > 0) { + a_offset = a; + a += lda; + + for (i = 0; i < m; i ++) { + + a1 = *(a_offset + 0); + a2 = *(a_offset + 1); + + *(b + 0) = CMULT(a1, a2); + + a_offset += 2; + + b ++; + } + n --; + } + + return 0; +} diff --git a/kernel/generic/zgemm3m_ncopy_2.c b/kernel/generic/zgemm3m_ncopy_2.c new file mode 100644 index 0000000000..702524a4e7 --- /dev/null +++ b/kernel/generic/zgemm3m_ncopy_2.c @@ -0,0 +1,120 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef USE_ALPHA +#define REAL_PART(a, b) (a) +#define IMAGE_PART(a, b) (b) +#else +#define REAL_PART(a, b) (alpha_r * (a) - alpha_i * (b)) +#define IMAGE_PART(a, b) (alpha_i * (a) + alpha_r * (b)) +#endif + +#if defined(REAL_ONLY) +#define CMULT(a, b) (REAL_PART(a, b)) +#elif defined(IMAGE_ONLY) +#define CMULT(a, b) (IMAGE_PART(a, b)) +#else +#define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, +#ifdef USE_ALPHA + FLOAT alpha_r, FLOAT alpha_i, +#endif + FLOAT *b){ + + BLASLONG i, j; + + FLOAT *a_offset, *a_offset1, *a_offset2; + FLOAT *b_offset; + FLOAT a1, a2, a3, a4; + + lda *= 2; + + a_offset = a; + b_offset = b; + + j = (n >> 1); + if (j > 0){ + do{ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + for (i = 0; i < m; i ++) { + + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset2 + 0); + a4 = *(a_offset2 + 1); + + *(b_offset + 0) = CMULT(a1, a2); + *(b_offset + 1) = CMULT(a3, a4); + + a_offset1 += 2; + a_offset2 += 2; + + b_offset += 2; + + } + + j--; + }while(j > 0); + } /* end of if(j > 0) */ + + if (n & 1) { + a_offset1 = a_offset; + + for (i = 0; i < m; i ++) { + + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + + *(b_offset + 0) = CMULT(a1, a2); + + a_offset1 += 2; + + b_offset += 1; + } + } + + return 0; +} diff --git a/kernel/generic/zgemm3m_ncopy_4.c b/kernel/generic/zgemm3m_ncopy_4.c new file mode 100644 index 0000000000..1117d77bfb --- /dev/null +++ b/kernel/generic/zgemm3m_ncopy_4.c @@ -0,0 +1,153 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef USE_ALPHA +#define REAL_PART(a, b) (a) +#define IMAGE_PART(a, b) (b) +#else +#define REAL_PART(a, b) (alpha_r * (a) - alpha_i * (b)) +#define IMAGE_PART(a, b) (alpha_i * (a) + alpha_r * (b)) +#endif + +#if defined(REAL_ONLY) +#define CMULT(a, b) (REAL_PART(a, b)) +#elif defined(IMAGE_ONLY) +#define CMULT(a, b) (IMAGE_PART(a, b)) +#else +#define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, +#ifdef USE_ALPHA + FLOAT alpha_r, FLOAT alpha_i, +#endif + FLOAT *b){ + + BLASLONG i, j; + + FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; + FLOAT *b_offset; + FLOAT a1, a2, a3, a4, a5, a6, a7, a8; + + lda *= 2; + + a_offset = a; + b_offset = b; + + j = (n >> 2); + if (j > 0){ + do{ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset += 4 * lda; + + for (i = 0; i < m; i ++) { + + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset2 + 0); + a4 = *(a_offset2 + 1); + a5 = *(a_offset3 + 0); + a6 = *(a_offset3 + 1); + a7 = *(a_offset4 + 0); + a8 = *(a_offset4 + 1); + + *(b_offset + 0) = CMULT(a1, a2); + *(b_offset + 1) = CMULT(a3, a4); + *(b_offset + 2) = CMULT(a5, a6); + *(b_offset + 3) = CMULT(a7, a8); + + a_offset1 += 2; + a_offset2 += 2; + a_offset3 += 2; + a_offset4 += 2; + + b_offset += 4; + + } + + j--; + }while(j > 0); + } /* end of if(j > 0) */ + + if (n & 2) { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + for (i = 0; i < m; i ++) { + + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset2 + 0); + a4 = *(a_offset2 + 1); + + *(b_offset + 0) = CMULT(a1, a2); + *(b_offset + 1) = CMULT(a3, a4); + + a_offset1 += 2; + a_offset2 += 2; + + b_offset += 2; + + } + } + + if (n & 1) { + a_offset1 = a_offset; + + for (i = 0; i < m; i ++) { + + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 
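+
+      /* CMULT() reduces each complex entry (optionally scaled by alpha when
+         USE_ALPHA is defined) to a single real value: its real part, its
+         imaginary part, or their sum, selected by REAL_ONLY / IMAGE_ONLY.
+         These three variants correspond to the three real multiplications
+         used by the 3M complex-GEMM scheme. */
+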
+ 1); + + *(b_offset + 0) = CMULT(a1, a2); + + a_offset1 += 2; + + b_offset += 1; + } + } + + return 0; +} diff --git a/kernel/generic/zgemm3m_ncopy_8.c b/kernel/generic/zgemm3m_ncopy_8.c new file mode 100644 index 0000000000..0c3cb5d767 --- /dev/null +++ b/kernel/generic/zgemm3m_ncopy_8.c @@ -0,0 +1,216 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef USE_ALPHA +#define REAL_PART(a, b) (a) +#define IMAGE_PART(a, b) (b) +#else +#define REAL_PART(a, b) (alpha_r * (a) - alpha_i * (b)) +#define IMAGE_PART(a, b) (alpha_i * (a) + alpha_r * (b)) +#endif + +#if defined(REAL_ONLY) +#define CMULT(a, b) (REAL_PART(a, b)) +#elif defined(IMAGE_ONLY) +#define CMULT(a, b) (IMAGE_PART(a, b)) +#else +#define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, +#ifdef USE_ALPHA + FLOAT alpha_r, FLOAT alpha_i, +#endif + FLOAT *b){ + + BLASLONG i, j; + + FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; + FLOAT *a_offset5, *a_offset6, *a_offset7, *a_offset8; + FLOAT *b_offset; + FLOAT a1, a2, a3, a4, a5, a6, a7, a8; + FLOAT a9, a10, a11, a12, a13, a14, a15, a16; + +#if 0 +#ifdef REAL_ONLY + fprintf(stderr, "NON Real "); +#elif defined(IMAGE_ONLY) + fprintf(stderr, "NON Image "); +#else + fprintf(stderr, "NON Both "); +#endif + +#ifdef ICOPY + fprintf(stderr, " ICOPY %ld x %ld\n", m, n); +#else + fprintf(stderr, " OCOPY %ld x %ld\n", m, n); +#endif +#endif + + lda *= 2; + + a_offset = a; + b_offset = b; + + j = (n >> 3); + if (j > 0){ + do{ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset5 = a_offset4 + lda; + a_offset6 = a_offset5 + lda; + a_offset7 = a_offset6 + lda; + a_offset8 = a_offset7 + lda; + a_offset += 8 * lda; + + for (i = 0; i < m; i ++) { + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset2 + 0); + a4 = *(a_offset2 + 1); + a5 = *(a_offset3 + 0); + a6 = *(a_offset3 + 1); + a7 = *(a_offset4 + 0); + a8 = *(a_offset4 + 1); + a9 = *(a_offset5 + 0); + a10 = *(a_offset5 + 1); + a11 = *(a_offset6 + 0); + a12 = *(a_offset6 + 1); + a13 = *(a_offset7 + 0); + a14 = *(a_offset7 + 1); + a15 = *(a_offset8 + 0); + a16 = *(a_offset8 + 1); + + *(b_offset + 0) = CMULT(a1, a2); + *(b_offset + 1) = CMULT(a3, a4); + *(b_offset + 2) = CMULT(a5, a6); + *(b_offset + 3) = CMULT(a7, a8); + *(b_offset + 4) = CMULT(a9, a10); + *(b_offset + 5) = CMULT(a11, a12); + *(b_offset + 6) = CMULT(a13, a14); + *(b_offset + 7) = CMULT(a15, a16); + + a_offset1 += 2; + a_offset2 += 2; + a_offset3 += 2; + a_offset4 += 2; + a_offset5 += 2; + a_offset6 += 2; + a_offset7 += 2; + a_offset8 += 2; + + b_offset += 8; + } + + j--; + }while(j > 0); + } + + if (n & 4){ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset += 4 * lda; + + for (i = 0; i < m; i ++) { + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset2 + 0); + a4 = *(a_offset2 + 1); + a5 = *(a_offset3 + 0); + a6 = *(a_offset3 + 1); + a7 = *(a_offset4 + 0); + a8 = *(a_offset4 + 1); + + *(b_offset + 0) = CMULT(a1, a2); + *(b_offset + 1) = CMULT(a3, a4); + *(b_offset + 2) = CMULT(a5, a6); + *(b_offset + 3) = CMULT(a7, a8); + + a_offset1 += 2; + a_offset2 += 2; + a_offset3 += 2; + a_offset4 += 2; + + b_offset += 4; + } + } + + if (n & 2){ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + for (i = 0; i < m; i ++) { + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset2 + 0); + a4 = *(a_offset2 + 1); + + *(b_offset + 0) = CMULT(a1, a2); + *(b_offset + 1) = CMULT(a3, a4); + + a_offset1 += 2; + a_offset2 += 2; + + b_offset += 2; + } + } + + if (n & 1){ + a_offset1 = a_offset; + + for (i = 0; i < m; i ++) { + a1 = 
*(a_offset1 + 0); + a2 = *(a_offset1 + 1); + + *(b_offset + 0) = CMULT(a1, a2); + + a_offset1 += 2; + b_offset += 1; + } + } + + return 0; +} diff --git a/kernel/generic/zgemm3m_tcopy_1.c b/kernel/generic/zgemm3m_tcopy_1.c new file mode 100644 index 0000000000..47cf7e58dc --- /dev/null +++ b/kernel/generic/zgemm3m_tcopy_1.c @@ -0,0 +1,89 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef USE_ALPHA +#define REAL_PART(a, b) (a) +#define IMAGE_PART(a, b) (b) +#else +#define REAL_PART(a, b) (alpha_r * (a) - alpha_i * (b)) +#define IMAGE_PART(a, b) (alpha_i * (a) + alpha_r * (b)) +#endif + +#if defined(REAL_ONLY) +#define CMULT(a, b) (REAL_PART(a, b)) +#elif defined(IMAGE_ONLY) +#define CMULT(a, b) (IMAGE_PART(a, b)) +#else +#define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, +#ifdef USE_ALPHA + FLOAT alpha_r, FLOAT alpha_i, +#endif + FLOAT *b){ + + BLASLONG i; + + FLOAT *a_offset, a1, a2; + + lda *= 2; + + while (n > 0) { + a_offset = a; + a += 2; + + for (i = 0; i < m; i ++) { + + a1 = *(a_offset + 0); + a2 = *(a_offset + 1); + + *(b + 0) = CMULT(a1, a2); + + a_offset += lda; + + b ++; + } + n --; + } + + return 0; +} diff --git a/kernel/generic/zgemm3m_tcopy_2.c b/kernel/generic/zgemm3m_tcopy_2.c new file mode 100644 index 0000000000..f6fe10be3c --- /dev/null +++ b/kernel/generic/zgemm3m_tcopy_2.c @@ -0,0 +1,162 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef USE_ALPHA +#define REAL_PART(a, b) (a) +#define IMAGE_PART(a, b) (b) +#else +#define REAL_PART(a, b) (alpha_r * (a) - alpha_i * (b)) +#define IMAGE_PART(a, b) (alpha_i * (a) + alpha_r * (b)) +#endif + +#if defined(REAL_ONLY) +#define CMULT(a, b) (REAL_PART(a, b)) +#elif defined(IMAGE_ONLY) +#define CMULT(a, b) (IMAGE_PART(a, b)) +#else +#define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, +#ifdef USE_ALPHA + FLOAT alpha_r, FLOAT alpha_i, +#endif + FLOAT *b){ + + BLASLONG i, j; + + FLOAT *a_offset, *a_offset1, *a_offset2; + FLOAT *b_offset, *b_offset1, *b_offset2; + FLOAT a1, a2, a3, a4, a5, a6, a7, a8; + + a_offset = a; + b_offset = b; + + lda *= 2; + + b_offset2 = b + m * (n & ~1); + + j = (m >> 1); + if (j > 0){ + do{ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + b_offset1 = b_offset; + b_offset += 4; + + i = (n >> 1); + if (i > 0){ + do{ + + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset1 + 2); + a4 = *(a_offset1 + 3); + a5 = *(a_offset2 + 0); + a6 = *(a_offset2 + 1); + a7 = *(a_offset2 + 2); + a8 = *(a_offset2 + 3); + + *(b_offset1 + 0) = CMULT(a1, a2); + *(b_offset1 + 1) = CMULT(a3, a4); + *(b_offset1 + 2) = CMULT(a5, a6); + *(b_offset1 + 3) = CMULT(a7, a8); + + a_offset1 += 4; + a_offset2 += 4; + + b_offset1 += m * 2; + i --; + }while(i > 0); + } + + if (n & 1) { + + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset2 + 0); + a4 = *(a_offset2 + 1); + + *(b_offset2 + 0) = CMULT(a1, a2); + *(b_offset2 + 1) = CMULT(a3, a4); + + b_offset2 += 2; + } + + j--; + }while(j > 0); + } + + if (m & 1){ + a_offset1 = a_offset; + b_offset1 = b_offset; + + i = (n >> 1); + 
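+      /* Tail row (m odd): each pass of the loop below reads two complex
+         elements (four FLOATs) from the remaining row and stores two packed
+         values. CMULT() reduces each (real, imag) pair to the single real
+         value needed by the GEMM3M decomposition: the real part, the
+         imaginary part, or their sum, scaled by alpha when USE_ALPHA is
+         defined. */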
if (i > 0){ + do{ + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset1 + 2); + a4 = *(a_offset1 + 3); + + *(b_offset1 + 0) = CMULT(a1, a2); + *(b_offset1 + 1) = CMULT(a3, a4); + + a_offset1 += 4; + + b_offset1 += 2 * m; + + i --; + }while(i > 0); + } + + if (n & 1) { + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + + *(b_offset2 + 0) = CMULT(a1, a2); + } + } + + return 0; +} diff --git a/kernel/generic/zgemm3m_tcopy_4.c b/kernel/generic/zgemm3m_tcopy_4.c new file mode 100644 index 0000000000..e0722627e9 --- /dev/null +++ b/kernel/generic/zgemm3m_tcopy_4.c @@ -0,0 +1,352 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef USE_ALPHA +#define REAL_PART(a, b) (a) +#define IMAGE_PART(a, b) (b) +#else +#define REAL_PART(a, b) (alpha_r * (a) - alpha_i * (b)) +#define IMAGE_PART(a, b) (alpha_i * (a) + alpha_r * (b)) +#endif + +#if defined(REAL_ONLY) +#define CMULT(a, b) (REAL_PART(a, b)) +#elif defined(IMAGE_ONLY) +#define CMULT(a, b) (IMAGE_PART(a, b)) +#else +#define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, +#ifdef USE_ALPHA + FLOAT alpha_r, FLOAT alpha_i, +#endif + FLOAT *b){ + + BLASLONG i, j; + + FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; + FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3; + FLOAT a1, a2, a3, a4, a5, a6, a7, a8; + + a_offset = a; + b_offset = b; + + lda *= 2; + + b_offset2 = b + m * (n & ~3); + b_offset3 = b + m * (n & ~1); + + j = (m >> 2); + if (j > 0){ + do{ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset += 4 * lda; + + b_offset1 = b_offset; + b_offset += 16; + + i = (n >> 2); + if (i > 0){ + do{ + + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset1 + 2); + a4 = *(a_offset1 + 3); + a5 = *(a_offset1 + 4); + a6 = *(a_offset1 + 5); + a7 = *(a_offset1 + 6); + a8 = *(a_offset1 + 7); + + *(b_offset1 + 0) = CMULT(a1, a2); + *(b_offset1 + 1) = CMULT(a3, a4); + *(b_offset1 + 2) = CMULT(a5, a6); + *(b_offset1 + 3) = CMULT(a7, a8); + + a1 = *(a_offset2 + 0); + a2 = *(a_offset2 + 1); + a3 = *(a_offset2 + 2); + a4 = *(a_offset2 + 3); + a5 = *(a_offset2 + 4); + a6 = *(a_offset2 + 5); + a7 = *(a_offset2 + 6); + a8 = *(a_offset2 + 7); + + *(b_offset1 + 4) = CMULT(a1, a2); + *(b_offset1 + 5) = CMULT(a3, a4); + *(b_offset1 + 6) = CMULT(a5, a6); + *(b_offset1 + 7) = CMULT(a7, a8); + + a1 = *(a_offset3 + 0); + a2 = *(a_offset3 + 1); + a3 = *(a_offset3 + 2); + a4 = *(a_offset3 + 3); + a5 = *(a_offset3 + 4); + a6 = *(a_offset3 + 5); + a7 = *(a_offset3 + 6); + a8 = *(a_offset3 + 7); + + *(b_offset1 + 8) = CMULT(a1, a2); + *(b_offset1 + 9) = CMULT(a3, a4); + *(b_offset1 + 10) = CMULT(a5, a6); + *(b_offset1 + 11) = CMULT(a7, a8); + + a1 = *(a_offset4 + 0); + a2 = *(a_offset4 + 1); + a3 = *(a_offset4 + 2); + a4 = *(a_offset4 + 3); + a5 = *(a_offset4 + 4); + a6 = *(a_offset4 + 5); + a7 = *(a_offset4 + 6); + a8 = *(a_offset4 + 7); + + *(b_offset1 + 12) = CMULT(a1, a2); + *(b_offset1 + 13) = CMULT(a3, a4); + *(b_offset1 + 14) = CMULT(a5, a6); + *(b_offset1 + 15) = CMULT(a7, a8); + + a_offset1 += 8; + a_offset2 += 8; + a_offset3 += 8; + a_offset4 += 8; + + b_offset1 += m * 4; + i --; + }while(i > 0); + } + + if (n & 2) { + + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset1 + 2); + a4 = *(a_offset1 + 3); + a5 = *(a_offset2 + 0); + a6 = *(a_offset2 + 1); + a7 = *(a_offset2 + 2); + a8 = *(a_offset2 + 3); + + *(b_offset2 + 0) = CMULT(a1, a2); + *(b_offset2 + 1) = CMULT(a3, a4); + *(b_offset2 + 2) = CMULT(a5, a6); + *(b_offset2 + 3) = CMULT(a7, a8); + + a1 = *(a_offset3 + 0); + a2 = *(a_offset3 + 1); + a3 = *(a_offset3 + 2); + a4 = *(a_offset3 + 3); + a5 = *(a_offset4 + 0); + a6 = *(a_offset4 + 1); + a7 = *(a_offset4 + 2); + a8 = *(a_offset4 + 3); + + *(b_offset2 + 4) = CMULT(a1, a2); + *(b_offset2 + 5) = CMULT(a3, a4); + *(b_offset2 + 6) = CMULT(a5, a6); + *(b_offset2 + 7) = CMULT(a7, a8); + + a_offset1 += 4; + a_offset2 += 4; + a_offset3 += 4; + a_offset4 += 4; + + b_offset2 += 8; + } + + 
if (n & 1) { + + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset2 + 0); + a4 = *(a_offset2 + 1); + a5 = *(a_offset3 + 0); + a6 = *(a_offset3 + 1); + a7 = *(a_offset4 + 0); + a8 = *(a_offset4 + 1); + + *(b_offset3 + 0) = CMULT(a1, a2); + *(b_offset3 + 1) = CMULT(a3, a4); + *(b_offset3 + 2) = CMULT(a5, a6); + *(b_offset3 + 3) = CMULT(a7, a8); + + b_offset3 += 4; + } + + j--; + }while(j > 0); + } + + if (m & 2){ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + b_offset1 = b_offset; + b_offset += 8; + + i = (n >> 2); + if (i > 0){ + do{ + + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset1 + 2); + a4 = *(a_offset1 + 3); + a5 = *(a_offset1 + 4); + a6 = *(a_offset1 + 5); + a7 = *(a_offset1 + 6); + a8 = *(a_offset1 + 7); + + *(b_offset1 + 0) = CMULT(a1, a2); + *(b_offset1 + 1) = CMULT(a3, a4); + *(b_offset1 + 2) = CMULT(a5, a6); + *(b_offset1 + 3) = CMULT(a7, a8); + + a1 = *(a_offset2 + 0); + a2 = *(a_offset2 + 1); + a3 = *(a_offset2 + 2); + a4 = *(a_offset2 + 3); + a5 = *(a_offset2 + 4); + a6 = *(a_offset2 + 5); + a7 = *(a_offset2 + 6); + a8 = *(a_offset2 + 7); + + *(b_offset1 + 4) = CMULT(a1, a2); + *(b_offset1 + 5) = CMULT(a3, a4); + *(b_offset1 + 6) = CMULT(a5, a6); + *(b_offset1 + 7) = CMULT(a7, a8); + + a_offset1 += 8; + a_offset2 += 8; + + b_offset1 += m * 4; + i --; + }while(i > 0); + } + + if (n & 2) { + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset1 + 2); + a4 = *(a_offset1 + 3); + a5 = *(a_offset2 + 0); + a6 = *(a_offset2 + 1); + a7 = *(a_offset2 + 2); + a8 = *(a_offset2 + 3); + + *(b_offset2 + 0) = CMULT(a1, a2); + *(b_offset2 + 1) = CMULT(a3, a4); + *(b_offset2 + 2) = CMULT(a5, a6); + *(b_offset2 + 3) = CMULT(a7, a8); + + a_offset1 += 4; + a_offset2 += 4; + b_offset2 += 4; + } + + if (n & 1) { + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset2 + 0); + a4 = *(a_offset2 + 1); + + *(b_offset3 + 0) = CMULT(a1, a2); + *(b_offset3 + 1) = CMULT(a3, a4); + + b_offset3 += 2; + } + } + + if (m & 1){ + a_offset1 = a_offset; + b_offset1 = b_offset; + + i = (n >> 2); + if (i > 0){ + do{ + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset1 + 2); + a4 = *(a_offset1 + 3); + a5 = *(a_offset1 + 4); + a6 = *(a_offset1 + 5); + a7 = *(a_offset1 + 6); + a8 = *(a_offset1 + 7); + + *(b_offset1 + 0) = CMULT(a1, a2); + *(b_offset1 + 1) = CMULT(a3, a4); + *(b_offset1 + 2) = CMULT(a5, a6); + *(b_offset1 + 3) = CMULT(a7, a8); + + a_offset1 += 8; + + b_offset1 += 4 * m; + + i --; + }while(i > 0); + } + + if (n & 2) { + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset1 + 2); + a4 = *(a_offset1 + 3); + + *(b_offset2 + 0) = CMULT(a1, a2); + *(b_offset2 + 1) = CMULT(a3, a4); + + a_offset1 += 4; + } + + if (n & 1) { + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + + *(b_offset3 + 0) = CMULT(a1, a2); + } + } + + return 0; +} diff --git a/kernel/generic/zgemm3m_tcopy_8.c b/kernel/generic/zgemm3m_tcopy_8.c new file mode 100644 index 0000000000..e68bccfbac --- /dev/null +++ b/kernel/generic/zgemm3m_tcopy_8.c @@ -0,0 +1,1072 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef USE_ALPHA +#define REAL_PART(a, b) (a) +#define IMAGE_PART(a, b) (b) +#else +#define REAL_PART(a, b) (alpha_r * (a) - alpha_i * (b)) +#define IMAGE_PART(a, b) (alpha_i * (a) + alpha_r * (b)) +#endif + +#if defined(REAL_ONLY) +#define CMULT(a, b) (REAL_PART(a, b)) +#elif defined(IMAGE_ONLY) +#define CMULT(a, b) (IMAGE_PART(a, b)) +#else +#define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, +#ifdef USE_ALPHA + FLOAT alpha_r, FLOAT alpha_i, +#endif + FLOAT *b){ + + BLASLONG i, j; + + FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; + FLOAT *a_offset5, *a_offset6, *a_offset7, *a_offset8; + FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3, *b_offset4; + FLOAT a1, a2, a3, a4, a5, a6, a7, a8; + FLOAT a9, a10, a11, a12, a13, a14, a15, a16; + +#if 0 +#ifdef REAL_ONLY + fprintf(stderr, "TNS Real "); +#elif defined(IMAGE_ONLY) + fprintf(stderr, "TNS Image "); +#else + fprintf(stderr, "TNS Both "); +#endif + +#ifdef ICOPY + fprintf(stderr, " ICOPY %ld x %ld\n", m, n); +#else + fprintf(stderr, " OCOPY %ld x %ld\n", m, n); +#endif +#endif + + a_offset = a; + b_offset = b; + + lda *= 2; + + b_offset2 = b + m * (n & ~7); + b_offset3 = b + m * (n & ~3); + b_offset4 = b + m * (n & ~1); + + j = (m >> 3); + if (j > 0){ + do{ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset5 = a_offset4 + lda; + a_offset6 = a_offset5 + lda; + a_offset7 = a_offset6 + lda; + a_offset8 = a_offset7 + lda; + + a_offset += 8 * lda; + + b_offset1 = b_offset; + b_offset += 64; + + i = (n >> 3); + if (i > 0){ + do{ + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset1 + 2); + a4 = *(a_offset1 + 3); + a5 = *(a_offset1 + 4); + a6 = *(a_offset1 + 5); + a7 = *(a_offset1 + 6); + a8 = *(a_offset1 + 7); + a9 = *(a_offset1 + 8); + a10 = *(a_offset1 + 9); + a11 
= *(a_offset1 + 10); + a12 = *(a_offset1 + 11); + a13 = *(a_offset1 + 12); + a14 = *(a_offset1 + 13); + a15 = *(a_offset1 + 14); + a16 = *(a_offset1 + 15); + + *(b_offset1 + 0) = CMULT(a1, a2); + *(b_offset1 + 1) = CMULT(a3, a4); + *(b_offset1 + 2) = CMULT(a5, a6); + *(b_offset1 + 3) = CMULT(a7, a8); + *(b_offset1 + 4) = CMULT(a9, a10); + *(b_offset1 + 5) = CMULT(a11, a12); + *(b_offset1 + 6) = CMULT(a13, a14); + *(b_offset1 + 7) = CMULT(a15, a16); + + a1 = *(a_offset2 + 0); + a2 = *(a_offset2 + 1); + a3 = *(a_offset2 + 2); + a4 = *(a_offset2 + 3); + a5 = *(a_offset2 + 4); + a6 = *(a_offset2 + 5); + a7 = *(a_offset2 + 6); + a8 = *(a_offset2 + 7); + a9 = *(a_offset2 + 8); + a10 = *(a_offset2 + 9); + a11 = *(a_offset2 + 10); + a12 = *(a_offset2 + 11); + a13 = *(a_offset2 + 12); + a14 = *(a_offset2 + 13); + a15 = *(a_offset2 + 14); + a16 = *(a_offset2 + 15); + + *(b_offset1 + 8) = CMULT(a1, a2); + *(b_offset1 + 9) = CMULT(a3, a4); + *(b_offset1 + 10) = CMULT(a5, a6); + *(b_offset1 + 11) = CMULT(a7, a8); + *(b_offset1 + 12) = CMULT(a9, a10); + *(b_offset1 + 13) = CMULT(a11, a12); + *(b_offset1 + 14) = CMULT(a13, a14); + *(b_offset1 + 15) = CMULT(a15, a16); + + a1 = *(a_offset3 + 0); + a2 = *(a_offset3 + 1); + a3 = *(a_offset3 + 2); + a4 = *(a_offset3 + 3); + a5 = *(a_offset3 + 4); + a6 = *(a_offset3 + 5); + a7 = *(a_offset3 + 6); + a8 = *(a_offset3 + 7); + a9 = *(a_offset3 + 8); + a10 = *(a_offset3 + 9); + a11 = *(a_offset3 + 10); + a12 = *(a_offset3 + 11); + a13 = *(a_offset3 + 12); + a14 = *(a_offset3 + 13); + a15 = *(a_offset3 + 14); + a16 = *(a_offset3 + 15); + + *(b_offset1 + 16) = CMULT(a1, a2); + *(b_offset1 + 17) = CMULT(a3, a4); + *(b_offset1 + 18) = CMULT(a5, a6); + *(b_offset1 + 19) = CMULT(a7, a8); + *(b_offset1 + 20) = CMULT(a9, a10); + *(b_offset1 + 21) = CMULT(a11, a12); + *(b_offset1 + 22) = CMULT(a13, a14); + *(b_offset1 + 23) = CMULT(a15, a16); + + a1 = *(a_offset4 + 0); + a2 = *(a_offset4 + 1); + a3 = *(a_offset4 + 2); + a4 = *(a_offset4 + 3); + a5 = *(a_offset4 + 4); + a6 = *(a_offset4 + 5); + a7 = *(a_offset4 + 6); + a8 = *(a_offset4 + 7); + a9 = *(a_offset4 + 8); + a10 = *(a_offset4 + 9); + a11 = *(a_offset4 + 10); + a12 = *(a_offset4 + 11); + a13 = *(a_offset4 + 12); + a14 = *(a_offset4 + 13); + a15 = *(a_offset4 + 14); + a16 = *(a_offset4 + 15); + + *(b_offset1 + 24) = CMULT(a1, a2); + *(b_offset1 + 25) = CMULT(a3, a4); + *(b_offset1 + 26) = CMULT(a5, a6); + *(b_offset1 + 27) = CMULT(a7, a8); + *(b_offset1 + 28) = CMULT(a9, a10); + *(b_offset1 + 29) = CMULT(a11, a12); + *(b_offset1 + 30) = CMULT(a13, a14); + *(b_offset1 + 31) = CMULT(a15, a16); + + a1 = *(a_offset5 + 0); + a2 = *(a_offset5 + 1); + a3 = *(a_offset5 + 2); + a4 = *(a_offset5 + 3); + a5 = *(a_offset5 + 4); + a6 = *(a_offset5 + 5); + a7 = *(a_offset5 + 6); + a8 = *(a_offset5 + 7); + a9 = *(a_offset5 + 8); + a10 = *(a_offset5 + 9); + a11 = *(a_offset5 + 10); + a12 = *(a_offset5 + 11); + a13 = *(a_offset5 + 12); + a14 = *(a_offset5 + 13); + a15 = *(a_offset5 + 14); + a16 = *(a_offset5 + 15); + + *(b_offset1 + 32) = CMULT(a1, a2); + *(b_offset1 + 33) = CMULT(a3, a4); + *(b_offset1 + 34) = CMULT(a5, a6); + *(b_offset1 + 35) = CMULT(a7, a8); + *(b_offset1 + 36) = CMULT(a9, a10); + *(b_offset1 + 37) = CMULT(a11, a12); + *(b_offset1 + 38) = CMULT(a13, a14); + *(b_offset1 + 39) = CMULT(a15, a16); + + a1 = *(a_offset6 + 0); + a2 = *(a_offset6 + 1); + a3 = *(a_offset6 + 2); + a4 = *(a_offset6 + 3); + a5 = *(a_offset6 + 4); + a6 = *(a_offset6 + 5); + a7 = *(a_offset6 + 6); + a8 = *(a_offset6 + 7); + a9 = *(a_offset6 + 
8); + a10 = *(a_offset6 + 9); + a11 = *(a_offset6 + 10); + a12 = *(a_offset6 + 11); + a13 = *(a_offset6 + 12); + a14 = *(a_offset6 + 13); + a15 = *(a_offset6 + 14); + a16 = *(a_offset6 + 15); + + *(b_offset1 + 40) = CMULT(a1, a2); + *(b_offset1 + 41) = CMULT(a3, a4); + *(b_offset1 + 42) = CMULT(a5, a6); + *(b_offset1 + 43) = CMULT(a7, a8); + *(b_offset1 + 44) = CMULT(a9, a10); + *(b_offset1 + 45) = CMULT(a11, a12); + *(b_offset1 + 46) = CMULT(a13, a14); + *(b_offset1 + 47) = CMULT(a15, a16); + + a1 = *(a_offset7 + 0); + a2 = *(a_offset7 + 1); + a3 = *(a_offset7 + 2); + a4 = *(a_offset7 + 3); + a5 = *(a_offset7 + 4); + a6 = *(a_offset7 + 5); + a7 = *(a_offset7 + 6); + a8 = *(a_offset7 + 7); + a9 = *(a_offset7 + 8); + a10 = *(a_offset7 + 9); + a11 = *(a_offset7 + 10); + a12 = *(a_offset7 + 11); + a13 = *(a_offset7 + 12); + a14 = *(a_offset7 + 13); + a15 = *(a_offset7 + 14); + a16 = *(a_offset7 + 15); + + *(b_offset1 + 48) = CMULT(a1, a2); + *(b_offset1 + 49) = CMULT(a3, a4); + *(b_offset1 + 50) = CMULT(a5, a6); + *(b_offset1 + 51) = CMULT(a7, a8); + *(b_offset1 + 52) = CMULT(a9, a10); + *(b_offset1 + 53) = CMULT(a11, a12); + *(b_offset1 + 54) = CMULT(a13, a14); + *(b_offset1 + 55) = CMULT(a15, a16); + + a1 = *(a_offset8 + 0); + a2 = *(a_offset8 + 1); + a3 = *(a_offset8 + 2); + a4 = *(a_offset8 + 3); + a5 = *(a_offset8 + 4); + a6 = *(a_offset8 + 5); + a7 = *(a_offset8 + 6); + a8 = *(a_offset8 + 7); + a9 = *(a_offset8 + 8); + a10 = *(a_offset8 + 9); + a11 = *(a_offset8 + 10); + a12 = *(a_offset8 + 11); + a13 = *(a_offset8 + 12); + a14 = *(a_offset8 + 13); + a15 = *(a_offset8 + 14); + a16 = *(a_offset8 + 15); + + *(b_offset1 + 56) = CMULT(a1, a2); + *(b_offset1 + 57) = CMULT(a3, a4); + *(b_offset1 + 58) = CMULT(a5, a6); + *(b_offset1 + 59) = CMULT(a7, a8); + *(b_offset1 + 60) = CMULT(a9, a10); + *(b_offset1 + 61) = CMULT(a11, a12); + *(b_offset1 + 62) = CMULT(a13, a14); + *(b_offset1 + 63) = CMULT(a15, a16); + + a_offset1 += 16; + a_offset2 += 16; + a_offset3 += 16; + a_offset4 += 16; + a_offset5 += 16; + a_offset6 += 16; + a_offset7 += 16; + a_offset8 += 16; + + b_offset1 += m * 8; + i --; + }while(i > 0); + } + + if (n & 4){ + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset1 + 2); + a4 = *(a_offset1 + 3); + a5 = *(a_offset1 + 4); + a6 = *(a_offset1 + 5); + a7 = *(a_offset1 + 6); + a8 = *(a_offset1 + 7); + + *(b_offset2 + 0) = CMULT(a1, a2); + *(b_offset2 + 1) = CMULT(a3, a4); + *(b_offset2 + 2) = CMULT(a5, a6); + *(b_offset2 + 3) = CMULT(a7, a8); + + a1 = *(a_offset2 + 0); + a2 = *(a_offset2 + 1); + a3 = *(a_offset2 + 2); + a4 = *(a_offset2 + 3); + a5 = *(a_offset2 + 4); + a6 = *(a_offset2 + 5); + a7 = *(a_offset2 + 6); + a8 = *(a_offset2 + 7); + + *(b_offset2 + 4) = CMULT(a1, a2); + *(b_offset2 + 5) = CMULT(a3, a4); + *(b_offset2 + 6) = CMULT(a5, a6); + *(b_offset2 + 7) = CMULT(a7, a8); + + a1 = *(a_offset3 + 0); + a2 = *(a_offset3 + 1); + a3 = *(a_offset3 + 2); + a4 = *(a_offset3 + 3); + a5 = *(a_offset3 + 4); + a6 = *(a_offset3 + 5); + a7 = *(a_offset3 + 6); + a8 = *(a_offset3 + 7); + + *(b_offset2 + 8) = CMULT(a1, a2); + *(b_offset2 + 9) = CMULT(a3, a4); + *(b_offset2 + 10) = CMULT(a5, a6); + *(b_offset2 + 11) = CMULT(a7, a8); + + a1 = *(a_offset4 + 0); + a2 = *(a_offset4 + 1); + a3 = *(a_offset4 + 2); + a4 = *(a_offset4 + 3); + a5 = *(a_offset4 + 4); + a6 = *(a_offset4 + 5); + a7 = *(a_offset4 + 6); + a8 = *(a_offset4 + 7); + + *(b_offset2 + 12) = CMULT(a1, a2); + *(b_offset2 + 13) = CMULT(a3, a4); + *(b_offset2 + 14) = CMULT(a5, a6); + *(b_offset2 + 15) = CMULT(a7, 
a8); + + a1 = *(a_offset5 + 0); + a2 = *(a_offset5 + 1); + a3 = *(a_offset5 + 2); + a4 = *(a_offset5 + 3); + a5 = *(a_offset5 + 4); + a6 = *(a_offset5 + 5); + a7 = *(a_offset5 + 6); + a8 = *(a_offset5 + 7); + + *(b_offset2 + 16) = CMULT(a1, a2); + *(b_offset2 + 17) = CMULT(a3, a4); + *(b_offset2 + 18) = CMULT(a5, a6); + *(b_offset2 + 19) = CMULT(a7, a8); + + a1 = *(a_offset6 + 0); + a2 = *(a_offset6 + 1); + a3 = *(a_offset6 + 2); + a4 = *(a_offset6 + 3); + a5 = *(a_offset6 + 4); + a6 = *(a_offset6 + 5); + a7 = *(a_offset6 + 6); + a8 = *(a_offset6 + 7); + + *(b_offset2 + 20) = CMULT(a1, a2); + *(b_offset2 + 21) = CMULT(a3, a4); + *(b_offset2 + 22) = CMULT(a5, a6); + *(b_offset2 + 23) = CMULT(a7, a8); + + a1 = *(a_offset7 + 0); + a2 = *(a_offset7 + 1); + a3 = *(a_offset7 + 2); + a4 = *(a_offset7 + 3); + a5 = *(a_offset7 + 4); + a6 = *(a_offset7 + 5); + a7 = *(a_offset7 + 6); + a8 = *(a_offset7 + 7); + + *(b_offset2 + 24) = CMULT(a1, a2); + *(b_offset2 + 25) = CMULT(a3, a4); + *(b_offset2 + 26) = CMULT(a5, a6); + *(b_offset2 + 27) = CMULT(a7, a8); + + a1 = *(a_offset8 + 0); + a2 = *(a_offset8 + 1); + a3 = *(a_offset8 + 2); + a4 = *(a_offset8 + 3); + a5 = *(a_offset8 + 4); + a6 = *(a_offset8 + 5); + a7 = *(a_offset8 + 6); + a8 = *(a_offset8 + 7); + + *(b_offset2 + 28) = CMULT(a1, a2); + *(b_offset2 + 29) = CMULT(a3, a4); + *(b_offset2 + 30) = CMULT(a5, a6); + *(b_offset2 + 31) = CMULT(a7, a8); + + a_offset1 += 8; + a_offset2 += 8; + a_offset3 += 8; + a_offset4 += 8; + a_offset5 += 8; + a_offset6 += 8; + a_offset7 += 8; + a_offset8 += 8; + + b_offset2 += 32; + } + + if (n & 2){ + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset1 + 2); + a4 = *(a_offset1 + 3); + + *(b_offset3 + 0) = CMULT(a1, a2); + *(b_offset3 + 1) = CMULT(a3, a4); + + a1 = *(a_offset2 + 0); + a2 = *(a_offset2 + 1); + a3 = *(a_offset2 + 2); + a4 = *(a_offset2 + 3); + + *(b_offset3 + 2) = CMULT(a1, a2); + *(b_offset3 + 3) = CMULT(a3, a4); + + a1 = *(a_offset3 + 0); + a2 = *(a_offset3 + 1); + a3 = *(a_offset3 + 2); + a4 = *(a_offset3 + 3); + + *(b_offset3 + 4) = CMULT(a1, a2); + *(b_offset3 + 5) = CMULT(a3, a4); + + a1 = *(a_offset4 + 0); + a2 = *(a_offset4 + 1); + a3 = *(a_offset4 + 2); + a4 = *(a_offset4 + 3); + + *(b_offset3 + 6) = CMULT(a1, a2); + *(b_offset3 + 7) = CMULT(a3, a4); + + a1 = *(a_offset5 + 0); + a2 = *(a_offset5 + 1); + a3 = *(a_offset5 + 2); + a4 = *(a_offset5 + 3); + + *(b_offset3 + 8) = CMULT(a1, a2); + *(b_offset3 + 9) = CMULT(a3, a4); + + a1 = *(a_offset6 + 0); + a2 = *(a_offset6 + 1); + a3 = *(a_offset6 + 2); + a4 = *(a_offset6 + 3); + + *(b_offset3 + 10) = CMULT(a1, a2); + *(b_offset3 + 11) = CMULT(a3, a4); + + a1 = *(a_offset7 + 0); + a2 = *(a_offset7 + 1); + a3 = *(a_offset7 + 2); + a4 = *(a_offset7 + 3); + + *(b_offset3 + 12) = CMULT(a1, a2); + *(b_offset3 + 13) = CMULT(a3, a4); + + a1 = *(a_offset8 + 0); + a2 = *(a_offset8 + 1); + a3 = *(a_offset8 + 2); + a4 = *(a_offset8 + 3); + + *(b_offset3 + 14) = CMULT(a1, a2); + *(b_offset3 + 15) = CMULT(a3, a4); + + a_offset1 += 4; + a_offset2 += 4; + a_offset3 += 4; + a_offset4 += 4; + a_offset5 += 4; + a_offset6 += 4; + a_offset7 += 4; + a_offset8 += 4; + + b_offset3 += 16; + } + + if (n & 1){ + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + + *(b_offset4 + 0) = CMULT(a1, a2); + + a1 = *(a_offset2 + 0); + a2 = *(a_offset2 + 1); + + *(b_offset4 + 1) = CMULT(a1, a2); + + a1 = *(a_offset3 + 0); + a2 = *(a_offset3 + 1); + + *(b_offset4 + 2) = CMULT(a1, a2); + + a1 = *(a_offset4 + 0); + a2 = *(a_offset4 + 1); + + *(b_offset4 + 3) = CMULT(a1, 
a2); + + a1 = *(a_offset5 + 0); + a2 = *(a_offset5 + 1); + + *(b_offset4 + 4) = CMULT(a1, a2); + + a1 = *(a_offset6 + 0); + a2 = *(a_offset6 + 1); + + *(b_offset4 + 5) = CMULT(a1, a2); + + a1 = *(a_offset7 + 0); + a2 = *(a_offset7 + 1); + + *(b_offset4 + 6) = CMULT(a1, a2); + + a1 = *(a_offset8 + 0); + a2 = *(a_offset8 + 1); + + *(b_offset4 + 7) = CMULT(a1, a2); + + b_offset4 += 8; + } + + j--; + }while(j > 0); + } + + if (m & 4){ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset3 = a_offset2 + lda; + a_offset4 = a_offset3 + lda; + a_offset += 4 * lda; + + b_offset1 = b_offset; + b_offset += 32; + + i = (n >> 3); + if (i > 0){ + do{ + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset1 + 2); + a4 = *(a_offset1 + 3); + a5 = *(a_offset1 + 4); + a6 = *(a_offset1 + 5); + a7 = *(a_offset1 + 6); + a8 = *(a_offset1 + 7); + a9 = *(a_offset1 + 8); + a10 = *(a_offset1 + 9); + a11 = *(a_offset1 + 10); + a12 = *(a_offset1 + 11); + a13 = *(a_offset1 + 12); + a14 = *(a_offset1 + 13); + a15 = *(a_offset1 + 14); + a16 = *(a_offset1 + 15); + + *(b_offset1 + 0) = CMULT(a1, a2); + *(b_offset1 + 1) = CMULT(a3, a4); + *(b_offset1 + 2) = CMULT(a5, a6); + *(b_offset1 + 3) = CMULT(a7, a8); + *(b_offset1 + 4) = CMULT(a9, a10); + *(b_offset1 + 5) = CMULT(a11, a12); + *(b_offset1 + 6) = CMULT(a13, a14); + *(b_offset1 + 7) = CMULT(a15, a16); + + a1 = *(a_offset2 + 0); + a2 = *(a_offset2 + 1); + a3 = *(a_offset2 + 2); + a4 = *(a_offset2 + 3); + a5 = *(a_offset2 + 4); + a6 = *(a_offset2 + 5); + a7 = *(a_offset2 + 6); + a8 = *(a_offset2 + 7); + a9 = *(a_offset2 + 8); + a10 = *(a_offset2 + 9); + a11 = *(a_offset2 + 10); + a12 = *(a_offset2 + 11); + a13 = *(a_offset2 + 12); + a14 = *(a_offset2 + 13); + a15 = *(a_offset2 + 14); + a16 = *(a_offset2 + 15); + + *(b_offset1 + 8) = CMULT(a1, a2); + *(b_offset1 + 9) = CMULT(a3, a4); + *(b_offset1 + 10) = CMULT(a5, a6); + *(b_offset1 + 11) = CMULT(a7, a8); + *(b_offset1 + 12) = CMULT(a9, a10); + *(b_offset1 + 13) = CMULT(a11, a12); + *(b_offset1 + 14) = CMULT(a13, a14); + *(b_offset1 + 15) = CMULT(a15, a16); + + a1 = *(a_offset3 + 0); + a2 = *(a_offset3 + 1); + a3 = *(a_offset3 + 2); + a4 = *(a_offset3 + 3); + a5 = *(a_offset3 + 4); + a6 = *(a_offset3 + 5); + a7 = *(a_offset3 + 6); + a8 = *(a_offset3 + 7); + a9 = *(a_offset3 + 8); + a10 = *(a_offset3 + 9); + a11 = *(a_offset3 + 10); + a12 = *(a_offset3 + 11); + a13 = *(a_offset3 + 12); + a14 = *(a_offset3 + 13); + a15 = *(a_offset3 + 14); + a16 = *(a_offset3 + 15); + + *(b_offset1 + 16) = CMULT(a1, a2); + *(b_offset1 + 17) = CMULT(a3, a4); + *(b_offset1 + 18) = CMULT(a5, a6); + *(b_offset1 + 19) = CMULT(a7, a8); + *(b_offset1 + 20) = CMULT(a9, a10); + *(b_offset1 + 21) = CMULT(a11, a12); + *(b_offset1 + 22) = CMULT(a13, a14); + *(b_offset1 + 23) = CMULT(a15, a16); + + a1 = *(a_offset4 + 0); + a2 = *(a_offset4 + 1); + a3 = *(a_offset4 + 2); + a4 = *(a_offset4 + 3); + a5 = *(a_offset4 + 4); + a6 = *(a_offset4 + 5); + a7 = *(a_offset4 + 6); + a8 = *(a_offset4 + 7); + a9 = *(a_offset4 + 8); + a10 = *(a_offset4 + 9); + a11 = *(a_offset4 + 10); + a12 = *(a_offset4 + 11); + a13 = *(a_offset4 + 12); + a14 = *(a_offset4 + 13); + a15 = *(a_offset4 + 14); + a16 = *(a_offset4 + 15); + + *(b_offset1 + 24) = CMULT(a1, a2); + *(b_offset1 + 25) = CMULT(a3, a4); + *(b_offset1 + 26) = CMULT(a5, a6); + *(b_offset1 + 27) = CMULT(a7, a8); + *(b_offset1 + 28) = CMULT(a9, a10); + *(b_offset1 + 29) = CMULT(a11, a12); + *(b_offset1 + 30) = CMULT(a13, a14); + *(b_offset1 + 31) = CMULT(a15, a16); + + a_offset1 += 16; + 
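+          /* Each of the four source rows advances by 16 FLOATs (8 complex
+             elements), matching the 8 packed values written per row above;
+             the destination pointer then steps by m * 8 to the next panel. */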
a_offset2 += 16; + a_offset3 += 16; + a_offset4 += 16; + + b_offset1 += m * 8; + i --; + }while(i > 0); + } + + if (n & 4){ + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset1 + 2); + a4 = *(a_offset1 + 3); + a5 = *(a_offset1 + 4); + a6 = *(a_offset1 + 5); + a7 = *(a_offset1 + 6); + a8 = *(a_offset1 + 7); + + *(b_offset2 + 0) = CMULT(a1, a2); + *(b_offset2 + 1) = CMULT(a3, a4); + *(b_offset2 + 2) = CMULT(a5, a6); + *(b_offset2 + 3) = CMULT(a7, a8); + + a1 = *(a_offset2 + 0); + a2 = *(a_offset2 + 1); + a3 = *(a_offset2 + 2); + a4 = *(a_offset2 + 3); + a5 = *(a_offset2 + 4); + a6 = *(a_offset2 + 5); + a7 = *(a_offset2 + 6); + a8 = *(a_offset2 + 7); + + *(b_offset2 + 4) = CMULT(a1, a2); + *(b_offset2 + 5) = CMULT(a3, a4); + *(b_offset2 + 6) = CMULT(a5, a6); + *(b_offset2 + 7) = CMULT(a7, a8); + + a1 = *(a_offset3 + 0); + a2 = *(a_offset3 + 1); + a3 = *(a_offset3 + 2); + a4 = *(a_offset3 + 3); + a5 = *(a_offset3 + 4); + a6 = *(a_offset3 + 5); + a7 = *(a_offset3 + 6); + a8 = *(a_offset3 + 7); + + *(b_offset2 + 8) = CMULT(a1, a2); + *(b_offset2 + 9) = CMULT(a3, a4); + *(b_offset2 + 10) = CMULT(a5, a6); + *(b_offset2 + 11) = CMULT(a7, a8); + + a1 = *(a_offset4 + 0); + a2 = *(a_offset4 + 1); + a3 = *(a_offset4 + 2); + a4 = *(a_offset4 + 3); + a5 = *(a_offset4 + 4); + a6 = *(a_offset4 + 5); + a7 = *(a_offset4 + 6); + a8 = *(a_offset4 + 7); + + *(b_offset2 + 12) = CMULT(a1, a2); + *(b_offset2 + 13) = CMULT(a3, a4); + *(b_offset2 + 14) = CMULT(a5, a6); + *(b_offset2 + 15) = CMULT(a7, a8); + + a_offset1 += 8; + a_offset2 += 8; + a_offset3 += 8; + a_offset4 += 8; + + b_offset2 += 16; + } + + if (n & 2){ + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset1 + 2); + a4 = *(a_offset1 + 3); + + *(b_offset3 + 0) = CMULT(a1, a2); + *(b_offset3 + 1) = CMULT(a3, a4); + + a1 = *(a_offset2 + 0); + a2 = *(a_offset2 + 1); + a3 = *(a_offset2 + 2); + a4 = *(a_offset2 + 3); + + *(b_offset3 + 2) = CMULT(a1, a2); + *(b_offset3 + 3) = CMULT(a3, a4); + + a1 = *(a_offset3 + 0); + a2 = *(a_offset3 + 1); + a3 = *(a_offset3 + 2); + a4 = *(a_offset3 + 3); + + *(b_offset3 + 4) = CMULT(a1, a2); + *(b_offset3 + 5) = CMULT(a3, a4); + + a1 = *(a_offset4 + 0); + a2 = *(a_offset4 + 1); + a3 = *(a_offset4 + 2); + a4 = *(a_offset4 + 3); + + *(b_offset3 + 6) = CMULT(a1, a2); + *(b_offset3 + 7) = CMULT(a3, a4); + + a_offset1 += 4; + a_offset2 += 4; + a_offset3 += 4; + a_offset4 += 4; + + b_offset3 += 8; + } + + if (n & 1){ + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + + *(b_offset4 + 0) = CMULT(a1, a2); + + a1 = *(a_offset2 + 0); + a2 = *(a_offset2 + 1); + + *(b_offset4 + 1) = CMULT(a1, a2); + + a1 = *(a_offset3 + 0); + a2 = *(a_offset3 + 1); + + *(b_offset4 + 2) = CMULT(a1, a2); + + a1 = *(a_offset4 + 0); + a2 = *(a_offset4 + 1); + + *(b_offset4 + 3) = CMULT(a1, a2); + + b_offset4 += 4; + } + } + + if (m & 2){ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + b_offset1 = b_offset; + b_offset += 16; + + i = (n >> 3); + if (i > 0){ + do{ + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset1 + 2); + a4 = *(a_offset1 + 3); + a5 = *(a_offset1 + 4); + a6 = *(a_offset1 + 5); + a7 = *(a_offset1 + 6); + a8 = *(a_offset1 + 7); + a9 = *(a_offset1 + 8); + a10 = *(a_offset1 + 9); + a11 = *(a_offset1 + 10); + a12 = *(a_offset1 + 11); + a13 = *(a_offset1 + 12); + a14 = *(a_offset1 + 13); + a15 = *(a_offset1 + 14); + a16 = *(a_offset1 + 15); + + *(b_offset1 + 0) = CMULT(a1, a2); + *(b_offset1 + 1) = CMULT(a3, a4); + *(b_offset1 + 2) = CMULT(a5, a6); + *(b_offset1 + 3) 
= CMULT(a7, a8); + *(b_offset1 + 4) = CMULT(a9, a10); + *(b_offset1 + 5) = CMULT(a11, a12); + *(b_offset1 + 6) = CMULT(a13, a14); + *(b_offset1 + 7) = CMULT(a15, a16); + + a1 = *(a_offset2 + 0); + a2 = *(a_offset2 + 1); + a3 = *(a_offset2 + 2); + a4 = *(a_offset2 + 3); + a5 = *(a_offset2 + 4); + a6 = *(a_offset2 + 5); + a7 = *(a_offset2 + 6); + a8 = *(a_offset2 + 7); + a9 = *(a_offset2 + 8); + a10 = *(a_offset2 + 9); + a11 = *(a_offset2 + 10); + a12 = *(a_offset2 + 11); + a13 = *(a_offset2 + 12); + a14 = *(a_offset2 + 13); + a15 = *(a_offset2 + 14); + a16 = *(a_offset2 + 15); + + *(b_offset1 + 8) = CMULT(a1, a2); + *(b_offset1 + 9) = CMULT(a3, a4); + *(b_offset1 + 10) = CMULT(a5, a6); + *(b_offset1 + 11) = CMULT(a7, a8); + *(b_offset1 + 12) = CMULT(a9, a10); + *(b_offset1 + 13) = CMULT(a11, a12); + *(b_offset1 + 14) = CMULT(a13, a14); + *(b_offset1 + 15) = CMULT(a15, a16); + + a_offset1 += 16; + a_offset2 += 16; + + b_offset1 += m * 8; + i --; + }while(i > 0); + } + + if (n & 4){ + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset1 + 2); + a4 = *(a_offset1 + 3); + a5 = *(a_offset1 + 4); + a6 = *(a_offset1 + 5); + a7 = *(a_offset1 + 6); + a8 = *(a_offset1 + 7); + + *(b_offset2 + 0) = CMULT(a1, a2); + *(b_offset2 + 1) = CMULT(a3, a4); + *(b_offset2 + 2) = CMULT(a5, a6); + *(b_offset2 + 3) = CMULT(a7, a8); + + a1 = *(a_offset2 + 0); + a2 = *(a_offset2 + 1); + a3 = *(a_offset2 + 2); + a4 = *(a_offset2 + 3); + a5 = *(a_offset2 + 4); + a6 = *(a_offset2 + 5); + a7 = *(a_offset2 + 6); + a8 = *(a_offset2 + 7); + + *(b_offset2 + 4) = CMULT(a1, a2); + *(b_offset2 + 5) = CMULT(a3, a4); + *(b_offset2 + 6) = CMULT(a5, a6); + *(b_offset2 + 7) = CMULT(a7, a8); + + a_offset1 += 8; + a_offset2 += 8; + + b_offset2 += 8; + } + + if (n & 2){ + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset1 + 2); + a4 = *(a_offset1 + 3); + + *(b_offset3 + 0) = CMULT(a1, a2); + *(b_offset3 + 1) = CMULT(a3, a4); + + a1 = *(a_offset2 + 0); + a2 = *(a_offset2 + 1); + a3 = *(a_offset2 + 2); + a4 = *(a_offset2 + 3); + + *(b_offset3 + 2) = CMULT(a1, a2); + *(b_offset3 + 3) = CMULT(a3, a4); + + a_offset1 += 4; + a_offset2 += 4; + + b_offset3 += 4; + } + + if (n & 1){ + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + + *(b_offset4 + 0) = CMULT(a1, a2); + + a1 = *(a_offset2 + 0); + a2 = *(a_offset2 + 1); + + *(b_offset4 + 1) = CMULT(a1, a2); + + b_offset4 += 2; + } + } + + if (m & 1){ + a_offset1 = a_offset; + b_offset1 = b_offset; + + i = (n >> 3); + if (i > 0){ + do{ + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset1 + 2); + a4 = *(a_offset1 + 3); + a5 = *(a_offset1 + 4); + a6 = *(a_offset1 + 5); + a7 = *(a_offset1 + 6); + a8 = *(a_offset1 + 7); + a9 = *(a_offset1 + 8); + a10 = *(a_offset1 + 9); + a11 = *(a_offset1 + 10); + a12 = *(a_offset1 + 11); + a13 = *(a_offset1 + 12); + a14 = *(a_offset1 + 13); + a15 = *(a_offset1 + 14); + a16 = *(a_offset1 + 15); + + *(b_offset1 + 0) = CMULT(a1, a2); + *(b_offset1 + 1) = CMULT(a3, a4); + *(b_offset1 + 2) = CMULT(a5, a6); + *(b_offset1 + 3) = CMULT(a7, a8); + *(b_offset1 + 4) = CMULT(a9, a10); + *(b_offset1 + 5) = CMULT(a11, a12); + *(b_offset1 + 6) = CMULT(a13, a14); + *(b_offset1 + 7) = CMULT(a15, a16); + + a_offset1 += 16; + + b_offset1 += m * 8; + i --; + }while(i > 0); + } + + if (n & 4){ + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset1 + 2); + a4 = *(a_offset1 + 3); + a5 = *(a_offset1 + 4); + a6 = *(a_offset1 + 5); + a7 = *(a_offset1 + 6); + a8 = *(a_offset1 + 7); + + *(b_offset2 + 0) = CMULT(a1, a2); + *(b_offset2 
+ 1) = CMULT(a3, a4); + *(b_offset2 + 2) = CMULT(a5, a6); + *(b_offset2 + 3) = CMULT(a7, a8); + + a_offset1 += 8; + b_offset2 += 4; + } + + if (n & 2){ + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + a3 = *(a_offset1 + 2); + a4 = *(a_offset1 + 3); + + *(b_offset3 + 0) = CMULT(a1, a2); + *(b_offset3 + 1) = CMULT(a3, a4); + + a_offset1 += 4; + b_offset3 += 2; + } + + if (n & 1){ + a1 = *(a_offset1 + 0); + a2 = *(a_offset1 + 1); + + *(b_offset4 + 0) = CMULT(a1, a2); + } + } + + return 0; +} diff --git a/kernel/generic/zgemm_beta.c b/kernel/generic/zgemm_beta.c new file mode 100644 index 0000000000..b7a77a27aa --- /dev/null +++ b/kernel/generic/zgemm_beta.c @@ -0,0 +1,158 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, + FLOAT beta_r, FLOAT beta_i, + FLOAT *dummy2, BLASLONG dummy3, + FLOAT *dummy4, BLASLONG dummy5, + FLOAT *c, BLASLONG ldc){ + BLASLONG i, j; + + FLOAT *c_offset, *c_offset1; + FLOAT atemp1, atemp2, atemp3, atemp4; + FLOAT btemp1, btemp2, btemp3, btemp4; + FLOAT ctemp1, ctemp2, ctemp3, ctemp4; + + ldc *= 2; + + c_offset = c; + + if (beta_r == 0. && beta_i == 0.) 
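+  /* beta == 0 + 0i: C is cleared without being read, so NaN or Inf values
+     already stored in C are not propagated; otherwise each complex entry
+     is scaled by beta using four real multiplies per element. */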
{ + j = n; + do { + c_offset1 = c_offset; + c_offset += ldc; + + i = (m >> 2); + if (i > 0){ + do { + *(c_offset1 + 0) = ZERO; + *(c_offset1 + 1) = ZERO; + *(c_offset1 + 2) = ZERO; + *(c_offset1 + 3) = ZERO; + *(c_offset1 + 4) = ZERO; + *(c_offset1 + 5) = ZERO; + *(c_offset1 + 6) = ZERO; + *(c_offset1 + 7) = ZERO; + c_offset1 += 8; + i--; + } while (i > 0); + } + + i = (m & 3); + if (i > 0){ + do { + *(c_offset1 + 0) = ZERO; + *(c_offset1 + 1) = ZERO; + c_offset1 += 2; + i--; + } while (i > 0); + } + j --; + } while (j > 0); + + } else { + + j = n; + do { + c_offset1 = c_offset; + c_offset += ldc; + + i = (m >> 1); + if (i > 0){ + do { + atemp1 = *(c_offset1 + 0); + atemp2 = *(c_offset1 + 1); + atemp3 = *(c_offset1 + 2); + atemp4 = *(c_offset1 + 3); + + btemp1 = beta_r * atemp1; + btemp2 = beta_i * atemp2; + btemp3 = beta_r * atemp2; + btemp4 = beta_i * atemp1; + + ctemp1 = btemp1 - btemp2; + ctemp2 = btemp3 + btemp4; + + btemp1 = beta_r * atemp3; + btemp2 = beta_i * atemp4; + btemp3 = beta_r * atemp4; + btemp4 = beta_i * atemp3; + + ctemp3 = btemp1 - btemp2; + ctemp4 = btemp3 + btemp4; + + *(c_offset1 + 0) = ctemp1; + *(c_offset1 + 1) = ctemp2; + *(c_offset1 + 2) = ctemp3; + *(c_offset1 + 3) = ctemp4; + c_offset1 += 4; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i > 0){ + do { + atemp1 = *(c_offset1 + 0); + atemp2 = *(c_offset1 + 1); + + btemp1 = beta_r * atemp1; + btemp2 = beta_i * atemp2; + btemp3 = beta_r * atemp2; + btemp4 = beta_i * atemp1; + + ctemp1 = btemp1 - btemp2; + ctemp2 = btemp3 + btemp4; + + *(c_offset1 + 0) = ctemp1; + *(c_offset1 + 1) = ctemp2; + c_offset1 += 2; + i --; + } while (i > 0); + } + j --; + } while (j > 0); + } + return 0; +} diff --git a/kernel/generic/zgemm_ncopy_1.c b/kernel/generic/zgemm_ncopy_1.c new file mode 100644 index 0000000000..6679a33601 --- /dev/null +++ b/kernel/generic/zgemm_ncopy_1.c @@ -0,0 +1,107 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG i, j; + + FLOAT *a_offset; + FLOAT *b_offset; + FLOAT ctemp1, ctemp2, ctemp3, ctemp4; + FLOAT ctemp5, ctemp6, ctemp7, ctemp8; + + a_offset = a; + b_offset = b; + + lda *= 2; + + i = n; + + if (i > 0){ + do { + + j = (m >> 2); + if (j > 0){ + do{ + ctemp1 = *(a_offset + 0); + ctemp2 = *(a_offset + 1); + ctemp3 = *(a_offset + 2); + ctemp4 = *(a_offset + 3); + + ctemp5 = *(a_offset + 4); + ctemp6 = *(a_offset + 5); + ctemp7 = *(a_offset + 6); + ctemp8 = *(a_offset + 7); + + *(b_offset + 0) = ctemp1; + *(b_offset + 1) = ctemp2; + *(b_offset + 2) = ctemp3; + *(b_offset + 3) = ctemp4; + + *(b_offset + 4) = ctemp5; + *(b_offset + 5) = ctemp6; + *(b_offset + 6) = ctemp7; + *(b_offset + 7) = ctemp8; + + a_offset += 8; + b_offset += 8; + j --; + } while(j>0); + } + + j = (m & 3); + if (j > 0){ + do{ + ctemp1 = *(a_offset + 0); + ctemp2 = *(a_offset + 1); + *(b_offset + 0) = ctemp1; + *(b_offset + 1) = ctemp2; + a_offset += 2; + b_offset += 2; + j --; + } while(j>0); + } + a_offset += lda - m * 2; + i--; + } while (i > 0); + } + + return 0; +} diff --git a/kernel/generic/zgemm_ncopy_2.c b/kernel/generic/zgemm_ncopy_2.c new file mode 100644 index 0000000000..2d5f2555da --- /dev/null +++ b/kernel/generic/zgemm_ncopy_2.c @@ -0,0 +1,183 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG i, j; + + FLOAT *a_offset, *a_offset1, *a_offset2; + FLOAT *b_offset; + FLOAT ctemp1, ctemp2, ctemp3, ctemp4; + FLOAT ctemp5, ctemp6, ctemp7, ctemp8; + FLOAT ctemp9, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + + a_offset = a; + b_offset = b; + + lda *= 2; + + i = (n >> 1); + + if (i > 0){ + do{ + a_offset1 = a_offset; + a_offset2 = a_offset + lda; + a_offset += 2 * lda; + + j = (m >> 2); + if (j > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset2 + 0); + ctemp4 = *(a_offset2 + 1); + + ctemp5 = *(a_offset1 + 2); + ctemp6 = *(a_offset1 + 3); + ctemp7 = *(a_offset2 + 2); + ctemp8 = *(a_offset2 + 3); + + ctemp9 = *(a_offset1 + 4); + ctemp10 = *(a_offset1 + 5); + ctemp11 = *(a_offset2 + 4); + ctemp12 = *(a_offset2 + 5); + + ctemp13 = *(a_offset1 + 6); + ctemp14 = *(a_offset1 + 7); + ctemp15 = *(a_offset2 + 6); + ctemp16 = *(a_offset2 + 7); + + *(b_offset + 0) = ctemp1; + *(b_offset + 1) = ctemp2; + *(b_offset + 2) = ctemp3; + *(b_offset + 3) = ctemp4; + + *(b_offset + 4) = ctemp5; + *(b_offset + 5) = ctemp6; + *(b_offset + 6) = ctemp7; + *(b_offset + 7) = ctemp8; + + *(b_offset + 8) = ctemp9; + *(b_offset + 9) = ctemp10; + *(b_offset +10) = ctemp11; + *(b_offset +11) = ctemp12; + + *(b_offset +12) = ctemp13; + *(b_offset +13) = ctemp14; + *(b_offset +14) = ctemp15; + *(b_offset +15) = ctemp16; + + a_offset1 += 8; + a_offset2 += 8; + b_offset += 16; + j --; + } while(j>0); + } + + j = (m & 3); + if (j > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset2 + 0); + ctemp4 = *(a_offset2 + 1); + + *(b_offset + 0) = ctemp1; + *(b_offset + 1) = ctemp2; + *(b_offset + 2) = ctemp3; + *(b_offset + 3) = ctemp4; + + a_offset1 += 2; + a_offset2 += 2; + b_offset += 4; + j --; + } while(j>0); + } + i --; + } while(i>0); + } + + if (n & 1){ + j = (m >> 2); + if (j > 0){ + do{ + ctemp1 = *(a_offset + 0); + ctemp2 = *(a_offset + 1); + ctemp5 = *(a_offset + 2); + ctemp6 = *(a_offset + 3); + + ctemp9 = *(a_offset + 4); + ctemp10 = *(a_offset + 5); + ctemp13 = *(a_offset + 6); + ctemp14 = *(a_offset + 7); + + *(b_offset + 0) = ctemp1; + *(b_offset + 1) = ctemp2; + *(b_offset + 2) = ctemp5; + *(b_offset + 3) = ctemp6; + + *(b_offset + 4) = ctemp9; + *(b_offset + 5) = ctemp10; + *(b_offset + 6) = ctemp13; + *(b_offset + 7) = ctemp14; + + a_offset += 8; + b_offset += 8; + j --; + } while(j>0); + } + + j = (m & 3); + if (j > 0){ + do{ + ctemp1 = *(a_offset + 0); + ctemp2 = *(a_offset + 1); + *(b_offset + 0) = ctemp1; + *(b_offset + 1) = ctemp2; + a_offset += 2; + b_offset += 2; + j --; + } while(j > 0); + } + } + + return 0; +} diff --git a/kernel/generic/zgemm_ncopy_4.c b/kernel/generic/zgemm_ncopy_4.c new file mode 100644 index 0000000000..abd1d57846 --- /dev/null +++ b/kernel/generic/zgemm_ncopy_4.c @@ -0,0 +1,387 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG i, j; + + FLOAT *aoffset; + FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + + FLOAT *boffset; + FLOAT ctemp01, ctemp02, ctemp03, ctemp04; + FLOAT ctemp05, ctemp06, ctemp07, ctemp08; + FLOAT ctemp09, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + FLOAT ctemp17, ctemp18, ctemp19, ctemp20; + FLOAT ctemp21, ctemp22, ctemp23, ctemp24; + FLOAT ctemp25, ctemp26, ctemp27, ctemp28; + FLOAT ctemp29, ctemp30, ctemp31, ctemp32; + + + aoffset = a; + boffset = b; + lda *= 2; + +#if 0 + fprintf(stderr, "m = %d n = %d\n", m,n ); +#endif + + j = (n >> 2); + if (j > 0){ + do{ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset += 4 * lda; + + i = (m >> 2); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + + ctemp09 = *(aoffset2 + 0); + ctemp10 = *(aoffset2 + 1); + ctemp11 = *(aoffset2 + 2); + ctemp12 = *(aoffset2 + 3); + ctemp13 = *(aoffset2 + 4); + ctemp14 = *(aoffset2 + 5); + ctemp15 = *(aoffset2 + 6); + ctemp16 = *(aoffset2 + 7); + + ctemp17 = *(aoffset3 + 0); + ctemp18 = *(aoffset3 + 1); + ctemp19 = *(aoffset3 + 2); + ctemp20 = *(aoffset3 + 3); + ctemp21 = *(aoffset3 + 4); + ctemp22 = *(aoffset3 + 5); + ctemp23 = *(aoffset3 + 6); + ctemp24 = *(aoffset3 + 7); + + ctemp25 = *(aoffset4 + 0); + ctemp26 = *(aoffset4 + 1); + ctemp27 = *(aoffset4 + 2); + ctemp28 = *(aoffset4 + 3); + ctemp29 = *(aoffset4 + 4); + ctemp30 = *(aoffset4 + 5); + ctemp31 = 
*(aoffset4 + 6); + ctemp32 = *(aoffset4 + 7); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp09; + *(boffset + 3) = ctemp10; + *(boffset + 4) = ctemp17; + *(boffset + 5) = ctemp18; + *(boffset + 6) = ctemp25; + *(boffset + 7) = ctemp26; + + *(boffset + 8) = ctemp03; + *(boffset + 9) = ctemp04; + *(boffset + 10) = ctemp11; + *(boffset + 11) = ctemp12; + *(boffset + 12) = ctemp19; + *(boffset + 13) = ctemp20; + *(boffset + 14) = ctemp27; + *(boffset + 15) = ctemp28; + + *(boffset + 16) = ctemp05; + *(boffset + 17) = ctemp06; + *(boffset + 18) = ctemp13; + *(boffset + 19) = ctemp14; + *(boffset + 20) = ctemp21; + *(boffset + 21) = ctemp22; + *(boffset + 22) = ctemp29; + *(boffset + 23) = ctemp30; + + *(boffset + 24) = ctemp07; + *(boffset + 25) = ctemp08; + *(boffset + 26) = ctemp15; + *(boffset + 27) = ctemp16; + *(boffset + 28) = ctemp23; + *(boffset + 29) = ctemp24; + *(boffset + 30) = ctemp31; + *(boffset + 31) = ctemp32; + + aoffset1 += 8; + aoffset2 += 8; + aoffset3 += 8; + aoffset4 += 8; + boffset += 32; + i --; + }while(i > 0); + } + + if (m & 2) { + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + + ctemp09 = *(aoffset3 + 0); + ctemp10 = *(aoffset3 + 1); + ctemp11 = *(aoffset3 + 2); + ctemp12 = *(aoffset3 + 3); + + ctemp13 = *(aoffset4 + 0); + ctemp14 = *(aoffset4 + 1); + ctemp15 = *(aoffset4 + 2); + ctemp16 = *(aoffset4 + 3); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp05; + *(boffset + 3) = ctemp06; + *(boffset + 4) = ctemp09; + *(boffset + 5) = ctemp10; + *(boffset + 6) = ctemp13; + *(boffset + 7) = ctemp14; + + *(boffset + 8) = ctemp03; + *(boffset + 9) = ctemp04; + *(boffset + 10) = ctemp07; + *(boffset + 11) = ctemp08; + *(boffset + 12) = ctemp11; + *(boffset + 13) = ctemp12; + *(boffset + 14) = ctemp15; + *(boffset + 15) = ctemp16; + + aoffset1 += 4; + aoffset2 += 4; + aoffset3 += 4; + aoffset4 += 4; + boffset += 16; + } + + if (m & 1) { + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + ctemp05 = *(aoffset3 + 0); + ctemp06 = *(aoffset3 + 1); + + ctemp07 = *(aoffset4 + 0); + ctemp08 = *(aoffset4 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + *(boffset + 4) = ctemp05; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp07; + *(boffset + 7) = ctemp08; + + aoffset1 += 2; + aoffset2 += 2; + aoffset3 += 2; + aoffset4 += 2; + boffset += 8; + } + j--; + }while(j > 0); + } /* end of if(j > 0) */ + + if (n & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset += 2 * lda; + + i = (m >> 2); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + + ctemp09 = *(aoffset2 + 0); + ctemp10 = *(aoffset2 + 1); + ctemp11 = *(aoffset2 + 2); + ctemp12 = *(aoffset2 + 3); + ctemp13 = *(aoffset2 + 4); + ctemp14 = *(aoffset2 + 5); + ctemp15 = *(aoffset2 + 6); + ctemp16 = *(aoffset2 + 7); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp09; + *(boffset + 3) = ctemp10; + *(boffset + 4) = ctemp03; + *(boffset + 5) = ctemp04; + *(boffset + 6) = ctemp11; + *(boffset + 7) 
= ctemp12; + + *(boffset + 8) = ctemp05; + *(boffset + 9) = ctemp06; + *(boffset + 10) = ctemp13; + *(boffset + 11) = ctemp14; + *(boffset + 12) = ctemp07; + *(boffset + 13) = ctemp08; + *(boffset + 14) = ctemp15; + *(boffset + 15) = ctemp16; + + aoffset1 += 8; + aoffset2 += 8; + boffset += 16; + i --; + }while(i > 0); + } + + if (m & 2) { + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp05; + *(boffset + 3) = ctemp06; + *(boffset + 4) = ctemp03; + *(boffset + 5) = ctemp04; + *(boffset + 6) = ctemp07; + *(boffset + 7) = ctemp08; + + aoffset1 += 4; + aoffset2 += 4; + boffset += 8; + } + + if (m & 1) { + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + + aoffset1 += 2; + aoffset2 += 2; + boffset += 4; + } + } + + if (n & 1){ + aoffset1 = aoffset; + + i = (m >> 2); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + *(boffset + 4) = ctemp05; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp07; + *(boffset + 7) = ctemp08; + + aoffset1 += 8; + boffset += 8; + i --; + }while(i > 0); + } + + if (m & 2) { + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + + aoffset1 += 4; + boffset += 4; + } + + if (m & 1) { + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + } + } + + return 0; +} diff --git a/kernel/generic/zgemm_ncopy_8.c b/kernel/generic/zgemm_ncopy_8.c new file mode 100644 index 0000000000..6490285608 --- /dev/null +++ b/kernel/generic/zgemm_ncopy_8.c @@ -0,0 +1,213 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
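In the unroll-by-4 copy that ends above, the leftover columns (n mod 4) are handled by the n & 2 and n & 1 branches, so the packed buffer is simply a run of 4-column panels followed by an optional 2-column panel and an optional single column, all written back to back. A small sketch of where each region starts, using an illustrative panel shape:

#include <stdio.h>

int main(void) {
  long m = 5, n = 11;                    /* example panel shape      */
  long off4 = 0;                         /* 4-column panels first    */
  long off2 = (n >> 2) * m * 8;          /* then the 2-column panel  */
  long off1 = off2 + ((n & 2) ? m * 4 : 0);   /* then the odd column */

  /* offsets are in FLOATs: one column of m complex elements = 2*m  */
  printf("4-col region starts at %ld\n", off4);   /* 0   */
  printf("2-col region starts at %ld\n", off2);   /* 80  */
  printf("1-col region starts at %ld\n", off1);   /* 100 */
  return 0;
}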
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG i, j; + + FLOAT *aoffset; + FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; + + FLOAT *boffset; + FLOAT ctemp01, ctemp02, ctemp03, ctemp04; + FLOAT ctemp05, ctemp06, ctemp07, ctemp08; + FLOAT ctemp09, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + + aoffset = a; + boffset = b; + lda *= 2; + + j = (n >> 3); + if (j > 0){ + do{ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset5 = aoffset4 + lda; + aoffset6 = aoffset5 + lda; + aoffset7 = aoffset6 + lda; + aoffset8 = aoffset7 + lda; + aoffset += 8 * lda; + + i = m; + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + ctemp05 = *(aoffset3 + 0); + ctemp06 = *(aoffset3 + 1); + ctemp07 = *(aoffset4 + 0); + ctemp08 = *(aoffset4 + 1); + ctemp09 = *(aoffset5 + 0); + ctemp10 = *(aoffset5 + 1); + ctemp11 = *(aoffset6 + 0); + ctemp12 = *(aoffset6 + 1); + ctemp13 = *(aoffset7 + 0); + ctemp14 = *(aoffset7 + 1); + ctemp15 = *(aoffset8 + 0); + ctemp16 = *(aoffset8 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + *(boffset + 4) = ctemp05; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp07; + *(boffset + 7) = ctemp08; + *(boffset + 8) = ctemp09; + *(boffset + 9) = ctemp10; + *(boffset + 10) = ctemp11; + *(boffset + 11) = ctemp12; + *(boffset + 12) = ctemp13; + *(boffset + 13) = ctemp14; + *(boffset + 14) = ctemp15; + *(boffset + 15) = ctemp16; + + aoffset1 += 2; + aoffset2 += 2; + aoffset3 += 2; + aoffset4 += 2; + aoffset5 += 2; + aoffset6 += 2; + aoffset7 += 2; + aoffset8 += 2; + + boffset += 16; + i --; + }while(i > 0); + } + j--; + }while(j > 0); + } /* end of if(j > 0) */ + + if (n & 4){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset += 4 * lda; + + i = m; + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + ctemp05 = *(aoffset3 + 0); + ctemp06 = *(aoffset3 + 1); + ctemp07 = *(aoffset4 + 0); + ctemp08 = *(aoffset4 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + *(boffset + 4) = ctemp05; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp07; + *(boffset + 7) = ctemp08; + + aoffset1 += 2; + aoffset2 
+= 2; + aoffset3 += 2; + aoffset4 += 2; + + boffset += 8; + i --; + }while(i > 0); + } + } /* end of if(j > 0) */ + + if (n & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset += 2 * lda; + + i = m; + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + + aoffset1 += 2; + aoffset2 += 2; + boffset += 4; + i --; + }while(i > 0); + } + + } /* end of if(j > 0) */ + + if (n & 1){ + aoffset1 = aoffset; + + i = m; + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + + aoffset1 += 2; + boffset += 2; + i --; + }while(i > 0); + } + + } /* end of if(j > 0) */ + + return 0; +} diff --git a/kernel/generic/zgemm_tcopy_1.c b/kernel/generic/zgemm_tcopy_1.c new file mode 100644 index 0000000000..03dfcc7d93 --- /dev/null +++ b/kernel/generic/zgemm_tcopy_1.c @@ -0,0 +1,121 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
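Unlike the 2- and 4-column variants, the unroll-by-8 copy above does not unroll over m: each pass of its inner loop gathers exactly one (re, im) pair from each of the eight columns, so the packed panel is interleaved across eight columns one row at a time, with 4-, 2- and 1-column tails for the remaining columns. A sketch of that inner gather, assuming double precision and illustrative names; col[c] stands for a pointer to the start of source column j+c, i.e. a + 2*(j+c)*lda in the convention above:

void pack_ncopy8_panel(long m, const double *col[8], double *b) {
  long i, c;
  for (i = 0; i < m; i++)            /* one source row per pass      */
    for (c = 0; c < 8; c++) {        /* one element from each column */
      *b++ = col[c][2 * i + 0];      /* Re A(i, j+c) */
      *b++ = col[c][2 * i + 1];      /* Im A(i, j+c) */
    }
}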
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + + BLASLONG i, j; + FLOAT *a_offset; + FLOAT *b_offset, *b_offset1; + FLOAT ctemp1, ctemp2, ctemp3, ctemp4; + FLOAT ctemp5, ctemp6, ctemp7, ctemp8; + + a_offset = a; + b_offset = b; + + lda *= 2; + + j = m; + + m *= 2; + + if (j > 0){ + do { + b_offset1 = b_offset; + b_offset += 2; + + i = (n >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset + 0); + ctemp2 = *(a_offset + 1); + ctemp3 = *(a_offset + 2); + ctemp4 = *(a_offset + 3); + + ctemp5 = *(a_offset + 4); + ctemp6 = *(a_offset + 5); + ctemp7 = *(a_offset + 6); + ctemp8 = *(a_offset + 7); + + *(b_offset1 + 0) = ctemp1; + *(b_offset1 + 1) = ctemp2; + + b_offset1 += m; + + *(b_offset1 + 0) = ctemp3; + *(b_offset1 + 1) = ctemp4; + + b_offset1 += m; + + *(b_offset1 + 0) = ctemp5; + *(b_offset1 + 1) = ctemp6; + + b_offset1 += m; + + *(b_offset1 + 0) = ctemp7; + *(b_offset1 + 1) = ctemp8; + + b_offset1 += m; + a_offset += 8; + i --; + } while(i>0); + } + + i = (n & 3); + if (i > 0){ + do { + ctemp1 = *(a_offset + 0); + ctemp2 = *(a_offset + 1); + + *(b_offset1 + 0) = ctemp1; + *(b_offset1 + 1) = ctemp2; + + b_offset1 += m; + a_offset += 2; + i --; + } while(i > 0); + } + a_offset += lda - n * 2; + j --; + } while (j > 0); + } + + return 0; +} diff --git a/kernel/generic/zgemm_tcopy_2.c b/kernel/generic/zgemm_tcopy_2.c new file mode 100644 index 0000000000..75aff7f972 --- /dev/null +++ b/kernel/generic/zgemm_tcopy_2.c @@ -0,0 +1,220 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
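zgemm_tcopy_1 above scatters rather than gathers: it reads the source panel one column at a time (m columns of n complex elements each, lda apart) but writes every element with a stride of m complex slots, so the packed buffer holds the panel transposed relative to the ncopy layouts. A reference mapping with the same m and n arguments as the routine above, assuming double precision and an illustrative name:

void pack_tcopy1_ref(long m, long n, const double *a, long lda,
                     double *b) {
  long r, c;
  for (c = 0; c < m; c++)          /* m source columns              */
    for (r = 0; r < n; r++) {      /* n complex elements per column */
      b[2 * (c + r * m) + 0] = a[2 * (r + c * lda) + 0];   /* Re */
      b[2 * (c + r * m) + 1] = a[2 * (r + c * lda) + 1];   /* Im */
    }
}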
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + + BLASLONG i, j; + FLOAT *a_offset, *a_offset1, *a_offset2; + FLOAT *b_offset, *b_offset1, *b_offset2; + FLOAT ctemp1, ctemp2, ctemp3, ctemp4; + FLOAT ctemp5, ctemp6, ctemp7, ctemp8; + FLOAT ctemp9, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + + a_offset = a; + b_offset = b; + + b_offset2 = b + m * (n & ~1) * 2; + + lda *= 2; + + j = (m >> 1); + if (j > 0){ + do{ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + b_offset1 = b_offset; + b_offset += 8; + + i = (n >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + ctemp5 = *(a_offset1 + 4); + ctemp6 = *(a_offset1 + 5); + ctemp7 = *(a_offset1 + 6); + ctemp8 = *(a_offset1 + 7); + + ctemp9 = *(a_offset2 + 0); + ctemp10 = *(a_offset2 + 1); + ctemp11 = *(a_offset2 + 2); + ctemp12 = *(a_offset2 + 3); + + ctemp13 = *(a_offset2 + 4); + ctemp14 = *(a_offset2 + 5); + ctemp15 = *(a_offset2 + 6); + ctemp16 = *(a_offset2 + 7); + + *(b_offset1 + 0) = ctemp1; + *(b_offset1 + 1) = ctemp2; + *(b_offset1 + 2) = ctemp3; + *(b_offset1 + 3) = ctemp4; + + *(b_offset1 + 4) = ctemp9; + *(b_offset1 + 5) = ctemp10; + *(b_offset1 + 6) = ctemp11; + *(b_offset1 + 7) = ctemp12; + + b_offset1 += m * 4; + + *(b_offset1 + 0) = ctemp5; + *(b_offset1 + 1) = ctemp6; + *(b_offset1 + 2) = ctemp7; + *(b_offset1 + 3) = ctemp8; + + *(b_offset1 + 4) = ctemp13; + *(b_offset1 + 5) = ctemp14; + *(b_offset1 + 6) = ctemp15; + *(b_offset1 + 7) = ctemp16; + + b_offset1 += m * 4; + + a_offset1 += 8; + a_offset2 += 8; + i --; + } while(i>0); + } + + if (n & 2){ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + ctemp9 = *(a_offset2 + 0); + ctemp10 = *(a_offset2 + 1); + ctemp11 = *(a_offset2 + 2); + ctemp12 = *(a_offset2 + 3); + + *(b_offset1 + 0) = ctemp1; + *(b_offset1 + 1) = ctemp2; + *(b_offset1 + 2) = ctemp3; + *(b_offset1 + 3) = ctemp4; + + *(b_offset1 + 4) = ctemp9; + *(b_offset1 + 5) = ctemp10; + *(b_offset1 + 6) = ctemp11; + *(b_offset1 + 7) = ctemp12; + + b_offset1 += m * 4; + a_offset1 += 4; + a_offset2 += 4; + } + + if (n & 1){ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp9 = *(a_offset2 + 0); + ctemp10 = *(a_offset2 + 1); + + *(b_offset2 + 0) = ctemp1; + *(b_offset2 + 1) = ctemp2; + *(b_offset2 + 2) = ctemp9; + *(b_offset2 + 3) = ctemp10; + b_offset2 += 4; + } + j--; + } while(j > 0); + } + + if (m & 1){ + i = (n >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset + 0); + ctemp2 = *(a_offset + 1); + ctemp3 = *(a_offset + 2); + ctemp4 = *(a_offset + 3); + + ctemp5 = *(a_offset + 4); + ctemp6 = *(a_offset + 5); + ctemp7 = *(a_offset + 6); + ctemp8 = *(a_offset + 7); + + *(b_offset + 0) = ctemp1; + *(b_offset + 1) = ctemp2; + *(b_offset + 2) = ctemp3; + *(b_offset + 3) = ctemp4; + + b_offset += m * 4; + + *(b_offset + 0) = ctemp5; + *(b_offset + 1) = ctemp6; + *(b_offset + 2) = ctemp7; + *(b_offset + 3) = ctemp8; + + b_offset += m * 4; + a_offset += 8; + i --; + } while(i > 0); + } + + if (n & 2){ + ctemp1 = *(a_offset + 0); + ctemp2 = *(a_offset + 1); + ctemp3 = *(a_offset + 2); + ctemp4 = *(a_offset + 3); + + *(b_offset + 0) = ctemp1; + *(b_offset + 1) = ctemp2; + *(b_offset + 2) = ctemp3; + *(b_offset + 3) = ctemp4; + + b_offset += m * 4; + a_offset += 4; + } + + 
if (n & 1){ + ctemp1 = *(a_offset + 0); + ctemp2 = *(a_offset + 1); + *(b_offset2 + 0) = ctemp1; + *(b_offset2 + 1) = ctemp2; + } + } + + return 0; +} diff --git a/kernel/generic/zgemm_tcopy_4.c b/kernel/generic/zgemm_tcopy_4.c new file mode 100644 index 0000000000..c61d9d52a0 --- /dev/null +++ b/kernel/generic/zgemm_tcopy_4.c @@ -0,0 +1,403 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
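One detail of the unroll-by-2 transposed copy that ends above is the second write pointer, b_offset2 = b + m * (n & ~1) * 2: all full two-column panels occupy the front of the packed buffer, and the leftover column of an odd-width panel is appended after them rather than interleaved. A small sketch of that split, with illustrative sizes:

#include <stdio.h>

int main(void) {
  long m = 5, n = 7;                       /* example panel shape     */
  long main_floats = m * (n & ~1L) * 2;    /* two-column panel region */
  long tail_floats = (n & 1) ? m * 2 : 0;  /* trailing odd column     */

  printf("main region: %ld FLOATs\n", main_floats);             /* 60 */
  printf("tail region: %ld FLOATs\n", tail_floats);             /* 10 */
  printf("total      : %ld FLOATs\n", main_floats + tail_floats);
  return 0;
}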
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + + BLASLONG i, j; + + FLOAT *aoffset; + FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + FLOAT *boffset, *boffset1, *boffset2, *boffset3; + + FLOAT ctemp01, ctemp02, ctemp03, ctemp04; + FLOAT ctemp05, ctemp06, ctemp07, ctemp08; + FLOAT ctemp09, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + FLOAT ctemp17, ctemp18, ctemp19, ctemp20; + FLOAT ctemp21, ctemp22, ctemp23, ctemp24; + FLOAT ctemp25, ctemp26, ctemp27, ctemp28; + FLOAT ctemp29, ctemp30, ctemp31, ctemp32; + + aoffset = a; + boffset = b; + lda *= 2; + + boffset2 = b + 2 * m * (n & ~3); + boffset3 = b + 2 * m * (n & ~1); + +#if 0 + fprintf(stderr, "m = %d n = %d\n", m,n ); +#endif + + j = (m >> 2); + if (j > 0){ + do{ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset += 4 * lda; + + boffset1 = boffset; + boffset += 32; + + i = (n >> 2); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + + ctemp09 = *(aoffset2 + 0); + ctemp10 = *(aoffset2 + 1); + ctemp11 = *(aoffset2 + 2); + ctemp12 = *(aoffset2 + 3); + ctemp13 = *(aoffset2 + 4); + ctemp14 = *(aoffset2 + 5); + ctemp15 = *(aoffset2 + 6); + ctemp16 = *(aoffset2 + 7); + + ctemp17 = *(aoffset3 + 0); + ctemp18 = *(aoffset3 + 1); + ctemp19 = *(aoffset3 + 2); + ctemp20 = *(aoffset3 + 3); + ctemp21 = *(aoffset3 + 4); + ctemp22 = *(aoffset3 + 5); + ctemp23 = *(aoffset3 + 6); + ctemp24 = *(aoffset3 + 7); + + ctemp25 = *(aoffset4 + 0); + ctemp26 = *(aoffset4 + 1); + ctemp27 = *(aoffset4 + 2); + ctemp28 = *(aoffset4 + 3); + ctemp29 = *(aoffset4 + 4); + ctemp30 = *(aoffset4 + 5); + ctemp31 = *(aoffset4 + 6); + ctemp32 = *(aoffset4 + 7); + + *(boffset1 + 0) = ctemp01; + *(boffset1 + 1) = ctemp02; + *(boffset1 + 2) = ctemp03; + *(boffset1 + 3) = ctemp04; + *(boffset1 + 4) = ctemp05; + *(boffset1 + 5) = ctemp06; + *(boffset1 + 6) = ctemp07; + *(boffset1 + 7) = ctemp08; + + *(boffset1 + 8) = ctemp09; + *(boffset1 + 9) = ctemp10; + *(boffset1 + 10) = ctemp11; + *(boffset1 + 11) = ctemp12; + *(boffset1 + 12) = ctemp13; + *(boffset1 + 13) = ctemp14; + *(boffset1 + 14) = ctemp15; + *(boffset1 + 15) = ctemp16; + + *(boffset1 + 16) = ctemp17; + *(boffset1 + 17) = ctemp18; + *(boffset1 + 18) = ctemp19; + *(boffset1 + 19) = ctemp20; + *(boffset1 + 20) = ctemp21; + *(boffset1 + 21) = ctemp22; + *(boffset1 + 22) = ctemp23; + *(boffset1 + 23) = ctemp24; + + *(boffset1 + 24) = ctemp25; + *(boffset1 + 25) = ctemp26; + *(boffset1 + 26) = ctemp27; + *(boffset1 + 27) = ctemp28; + *(boffset1 + 28) = ctemp29; + *(boffset1 + 29) = ctemp30; + *(boffset1 + 30) = ctemp31; + *(boffset1 + 31) = ctemp32; + + aoffset1 += 8; + aoffset2 += 8; + aoffset3 += 8; + aoffset4 += 8; + + boffset1 += m * 8; + i --; + }while(i > 0); + } + + if (n & 2){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + + ctemp09 = *(aoffset3 + 0); + ctemp10 = *(aoffset3 + 1); + ctemp11 = *(aoffset3 + 2); + ctemp12 = *(aoffset3 + 3); + + ctemp13 = *(aoffset4 + 0); + ctemp14 = *(aoffset4 + 1); + ctemp15 
= *(aoffset4 + 2); + ctemp16 = *(aoffset4 + 3); + + *(boffset2 + 0) = ctemp01; + *(boffset2 + 1) = ctemp02; + *(boffset2 + 2) = ctemp03; + *(boffset2 + 3) = ctemp04; + *(boffset2 + 4) = ctemp05; + *(boffset2 + 5) = ctemp06; + *(boffset2 + 6) = ctemp07; + *(boffset2 + 7) = ctemp08; + + *(boffset2 + 8) = ctemp09; + *(boffset2 + 9) = ctemp10; + *(boffset2 + 10) = ctemp11; + *(boffset2 + 11) = ctemp12; + *(boffset2 + 12) = ctemp13; + *(boffset2 + 13) = ctemp14; + *(boffset2 + 14) = ctemp15; + *(boffset2 + 15) = ctemp16; + + aoffset1 += 4; + aoffset2 += 4; + aoffset3 += 4; + aoffset4 += 4; + + boffset2 += 16; + } + + if (n & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + ctemp05 = *(aoffset3 + 0); + ctemp06 = *(aoffset3 + 1); + + ctemp07 = *(aoffset4 + 0); + ctemp08 = *(aoffset4 + 1); + + *(boffset3 + 0) = ctemp01; + *(boffset3 + 1) = ctemp02; + *(boffset3 + 2) = ctemp03; + *(boffset3 + 3) = ctemp04; + *(boffset3 + 4) = ctemp05; + *(boffset3 + 5) = ctemp06; + *(boffset3 + 6) = ctemp07; + *(boffset3 + 7) = ctemp08; + + aoffset1 += 2; + aoffset2 += 2; + aoffset3 += 2; + aoffset4 += 2; + + boffset3 += 8; + } + j--; + }while(j > 0); + } + + if (m & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset += 2 * lda; + + boffset1 = boffset; + boffset += 16; + + i = (n >> 2); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + + ctemp09 = *(aoffset2 + 0); + ctemp10 = *(aoffset2 + 1); + ctemp11 = *(aoffset2 + 2); + ctemp12 = *(aoffset2 + 3); + ctemp13 = *(aoffset2 + 4); + ctemp14 = *(aoffset2 + 5); + ctemp15 = *(aoffset2 + 6); + ctemp16 = *(aoffset2 + 7); + + *(boffset1 + 0) = ctemp01; + *(boffset1 + 1) = ctemp02; + *(boffset1 + 2) = ctemp03; + *(boffset1 + 3) = ctemp04; + *(boffset1 + 4) = ctemp05; + *(boffset1 + 5) = ctemp06; + *(boffset1 + 6) = ctemp07; + *(boffset1 + 7) = ctemp08; + + *(boffset1 + 8) = ctemp09; + *(boffset1 + 9) = ctemp10; + *(boffset1 + 10) = ctemp11; + *(boffset1 + 11) = ctemp12; + *(boffset1 + 12) = ctemp13; + *(boffset1 + 13) = ctemp14; + *(boffset1 + 14) = ctemp15; + *(boffset1 + 15) = ctemp16; + + aoffset1 += 8; + aoffset2 += 8; + aoffset3 += 8; + aoffset4 += 8; + + boffset1 += m * 8; + i --; + }while(i > 0); + } + + if (n & 2){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + + *(boffset2 + 0) = ctemp01; + *(boffset2 + 1) = ctemp02; + *(boffset2 + 2) = ctemp03; + *(boffset2 + 3) = ctemp04; + *(boffset2 + 4) = ctemp05; + *(boffset2 + 5) = ctemp06; + *(boffset2 + 6) = ctemp07; + *(boffset2 + 7) = ctemp08; + + aoffset1 += 4; + aoffset2 += 4; + + boffset2 += 8; + } + + if (n & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + *(boffset3 + 0) = ctemp01; + *(boffset3 + 1) = ctemp02; + *(boffset3 + 2) = ctemp03; + *(boffset3 + 3) = ctemp04; + + aoffset1 += 2; + aoffset2 += 2; + boffset3 += 4; + } + } + + if (m & 1){ + aoffset1 = aoffset; + boffset1 = boffset; + + i = (n >> 2); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + 
ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + + *(boffset1 + 0) = ctemp01; + *(boffset1 + 1) = ctemp02; + *(boffset1 + 2) = ctemp03; + *(boffset1 + 3) = ctemp04; + *(boffset1 + 4) = ctemp05; + *(boffset1 + 5) = ctemp06; + *(boffset1 + 6) = ctemp07; + *(boffset1 + 7) = ctemp08; + + aoffset1 += 8; + boffset1 += m * 8; + i --; + }while(i > 0); + } + + if (n & 2){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + *(boffset2 + 0) = ctemp01; + *(boffset2 + 1) = ctemp02; + *(boffset2 + 2) = ctemp03; + *(boffset2 + 3) = ctemp04; + + aoffset1 += 4; + boffset2 += 4; + } + + if (n & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + + *(boffset3 + 0) = ctemp01; + *(boffset3 + 1) = ctemp02; + } + } + + return 0; +} diff --git a/kernel/generic/zgemm_tcopy_8.c b/kernel/generic/zgemm_tcopy_8.c new file mode 100644 index 0000000000..b25878567a --- /dev/null +++ b/kernel/generic/zgemm_tcopy_8.c @@ -0,0 +1,361 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + + BLASLONG i, j; + + FLOAT *aoffset; + FLOAT *aoffset1, *aoffset2; + + FLOAT *boffset; + + FLOAT ctemp01, ctemp02, ctemp03, ctemp04; + FLOAT ctemp05, ctemp06, ctemp07, ctemp08; + FLOAT ctemp09, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + FLOAT ctemp17, ctemp18, ctemp19, ctemp20; + FLOAT ctemp21, ctemp22, ctemp23, ctemp24; + FLOAT ctemp25, ctemp26, ctemp27, ctemp28; + FLOAT ctemp29, ctemp30, ctemp31, ctemp32; + + aoffset = a; + boffset = b; + lda *= 2; + +#if 0 + fprintf(stderr, "M = %d N = %d\n", m, n); +#endif + + j = (n >> 3); + if (j > 0){ + do{ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 16; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + ctemp09 = *(aoffset1 + 8); + ctemp10 = *(aoffset1 + 9); + ctemp11 = *(aoffset1 + 10); + ctemp12 = *(aoffset1 + 11); + ctemp13 = *(aoffset1 + 12); + ctemp14 = *(aoffset1 + 13); + ctemp15 = *(aoffset1 + 14); + ctemp16 = *(aoffset1 + 15); + + ctemp17 = *(aoffset2 + 0); + ctemp18 = *(aoffset2 + 1); + ctemp19 = *(aoffset2 + 2); + ctemp20 = *(aoffset2 + 3); + ctemp21 = *(aoffset2 + 4); + ctemp22 = *(aoffset2 + 5); + ctemp23 = *(aoffset2 + 6); + ctemp24 = *(aoffset2 + 7); + ctemp25 = *(aoffset2 + 8); + ctemp26 = *(aoffset2 + 9); + ctemp27 = *(aoffset2 + 10); + ctemp28 = *(aoffset2 + 11); + ctemp29 = *(aoffset2 + 12); + ctemp30 = *(aoffset2 + 13); + ctemp31 = *(aoffset2 + 14); + ctemp32 = *(aoffset2 + 15); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + *(boffset + 4) = ctemp05; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp07; + *(boffset + 7) = ctemp08; + + *(boffset + 8) = ctemp09; + *(boffset + 9) = ctemp10; + *(boffset + 10) = ctemp11; + *(boffset + 11) = ctemp12; + *(boffset + 12) = ctemp13; + *(boffset + 13) = ctemp14; + *(boffset + 14) = ctemp15; + *(boffset + 15) = ctemp16; + + *(boffset + 16) = ctemp17; + *(boffset + 17) = ctemp18; + *(boffset + 18) = ctemp19; + *(boffset + 19) = ctemp20; + *(boffset + 20) = ctemp21; + *(boffset + 21) = ctemp22; + *(boffset + 22) = ctemp23; + *(boffset + 23) = ctemp24; + + *(boffset + 24) = ctemp25; + *(boffset + 25) = ctemp26; + *(boffset + 26) = ctemp27; + *(boffset + 27) = ctemp28; + *(boffset + 28) = ctemp29; + *(boffset + 29) = ctemp30; + *(boffset + 30) = ctemp31; + *(boffset + 31) = ctemp32; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 32; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + ctemp09 = *(aoffset1 + 8); + ctemp10 = *(aoffset1 + 9); + ctemp11 = *(aoffset1 + 10); + ctemp12 = *(aoffset1 + 11); + ctemp13 = *(aoffset1 + 12); + ctemp14 = *(aoffset1 + 13); + ctemp15 = *(aoffset1 + 14); + ctemp16 = *(aoffset1 + 15); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + *(boffset + 4) = ctemp05; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp07; + *(boffset + 7) = ctemp08; + + 
*(boffset + 8) = ctemp09; + *(boffset + 9) = ctemp10; + *(boffset + 10) = ctemp11; + *(boffset + 11) = ctemp12; + *(boffset + 12) = ctemp13; + *(boffset + 13) = ctemp14; + *(boffset + 14) = ctemp15; + *(boffset + 15) = ctemp16; + + boffset += 16; + } + + j--; + }while(j > 0); + } /* end of if(j > 0) */ + + if (n & 4){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 8; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + + ctemp09 = *(aoffset2 + 0); + ctemp10 = *(aoffset2 + 1); + ctemp11 = *(aoffset2 + 2); + ctemp12 = *(aoffset2 + 3); + ctemp13 = *(aoffset2 + 4); + ctemp14 = *(aoffset2 + 5); + ctemp15 = *(aoffset2 + 6); + ctemp16 = *(aoffset2 + 7); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + *(boffset + 4) = ctemp05; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp07; + *(boffset + 7) = ctemp08; + + *(boffset + 8) = ctemp09; + *(boffset + 9) = ctemp10; + *(boffset + 10) = ctemp11; + *(boffset + 11) = ctemp12; + *(boffset + 12) = ctemp13; + *(boffset + 13) = ctemp14; + *(boffset + 14) = ctemp15; + *(boffset + 15) = ctemp16; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 16; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + *(boffset + 4) = ctemp05; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp07; + *(boffset + 7) = ctemp08; + + boffset += 8; + } + } + + if (n & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 4; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + *(boffset + 4) = ctemp05; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp07; + *(boffset + 7) = ctemp08; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 8; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + + boffset += 4; + } + } + + if (n & 1){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 2; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 4; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + boffset += 2; + } + } + + return 0; +} diff --git a/kernel/generic/zger.c 
b/kernel/generic/zger.c new file mode 100644 index 0000000000..134ff5fb97 --- /dev/null +++ b/kernel/generic/zger.c @@ -0,0 +1,84 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *x, BLASLONG incx, + FLOAT *y, BLASLONG incy, + FLOAT *a, BLASLONG lda, FLOAT *buffer){ + + FLOAT *X = x; + + if (incx != 1) { + X = buffer; + COPY_K(m, x, incx, X, 1); + } + + lda *= 2; + incy *= 2; + + while (n > 0) { + FLOAT beta_r = y[0]; + FLOAT beta_i = y[1]; + +#ifndef XCONJ + AXPYU_K +#else + AXPYC_K +#endif + (m, 0, 0, +#ifndef CONJ + alpha_r * beta_r - alpha_i * beta_i, + alpha_r * beta_i + alpha_i * beta_r, +#else + alpha_r * beta_r + alpha_i * beta_i, + -alpha_r * beta_i + alpha_i * beta_r, +#endif + X, 1, a, 1, NULL, 0); + + a += lda; + y += incy; + n --; + } + + return 0; +} + diff --git a/kernel/generic/zhemm3m_lcopy_1.c b/kernel/generic/zhemm3m_lcopy_1.c new file mode 100644 index 0000000000..72f473de36 --- /dev/null +++ b/kernel/generic/zhemm3m_lcopy_1.c @@ -0,0 +1,105 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
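The zger kernel that ends above performs the complex rank-1 update one column at a time: column j of A receives (alpha * y_j) * x through a single AXPY call, with the complex scalar alpha * y_j expanded into the real expressions passed as arguments; the CONJ branch uses the conjugate of y_j instead. A standalone sketch of that expansion with concrete numbers, assuming double precision and illustrative variable names:

#include <stdio.h>

int main(void) {
  double alpha_r = 2.0, alpha_i = -1.0;   /* alpha                   */
  double beta_r  = 3.0, beta_i  =  0.5;   /* y_j                     */

  /* alpha * y_j            (the #ifndef CONJ expressions)           */
  double sr = alpha_r * beta_r - alpha_i * beta_i;
  double si = alpha_r * beta_i + alpha_i * beta_r;

  /* alpha * conj(y_j)      (the #ifdef CONJ expressions)            */
  double cr =  alpha_r * beta_r + alpha_i * beta_i;
  double ci = -alpha_r * beta_i + alpha_i * beta_r;

  printf("alpha*y_j       = %g + %gi\n", sr, si);   /* 6.5 + -2i */
  printf("alpha*conj(y_j) = %g + %gi\n", cr, ci);   /* 5.5 + -4i */
  return 0;
}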
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef USE_ALPHA +#define REAL_PART(a, b) (a) +#define IMAGE_PART(a, b) (b) +#else +#define REAL_PART(a, b) (alpha_r * (a) + alpha_i * (b)) +#define IMAGE_PART(a, b) (alpha_i * (a) - alpha_r * (b)) +#endif + +#if defined(REAL_ONLY) +#define CMULT(a, b) (REAL_PART(a, b)) +#elif defined(IMAGE_ONLY) +#define CMULT(a, b) (IMAGE_PART(a, b)) +#else +#define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, +#ifdef USE_ALPHA + FLOAT alpha_r, FLOAT alpha_i, +#endif + FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01; + FLOAT *ao1; + + lda *= 2; + + js = n; + + while (js > 0){ + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + + i = m; + + while (i > 0) { + if (offset > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + } else + if (offset < 0) { + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + } else { + data01 = CMULT(*(ao1 + 0), ZERO); + } + + if (offset > 0) ao1 += lda; else ao1 += 2; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + + posX ++; + js --; + } + + return 0; +} diff --git a/kernel/generic/zhemm3m_lcopy_2.c b/kernel/generic/zhemm3m_lcopy_2.c new file mode 100644 index 0000000000..f0da12ccad --- /dev/null +++ b/kernel/generic/zhemm3m_lcopy_2.c @@ -0,0 +1,146 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
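The zhemm3m copy routine that ends above does not pack complex values at all: for the 3m variant of HEMM it emits one real FLOAT per matrix element, and the REAL_PART/IMAGE_PART/CMULT macros select the (optionally alpha-scaled) real part, imaginary part, or their sum depending on whether the file is compiled with REAL_ONLY, IMAGE_ONLY, or neither. Elements taken from the far side of the diagonal are conjugated (the negated imaginary argument), and diagonal entries pass ZERO as the imaginary part, since a Hermitian diagonal is real. A sketch of the three variants with USE_ALPHA, using plain functions in place of the macros; names and numbers are illustrative:

#include <stdio.h>

static double real_part (double ar, double ai,
                         double alpha_r, double alpha_i)
{ return alpha_r * ar + alpha_i * ai; }            /* REAL_PART(a,b)  */

static double image_part(double ar, double ai,
                         double alpha_r, double alpha_i)
{ return alpha_i * ar - alpha_r * ai; }            /* IMAGE_PART(a,b) */

int main(void) {
  double ar = 1.0, ai = 2.0, alpha_r = 0.5, alpha_i = 0.25;

  double real_only  = real_part (ar, ai, alpha_r, alpha_i);
  double image_only = image_part(ar, ai, alpha_r, alpha_i);
  double combined   = real_only + image_only;      /* default CMULT   */

  printf("REAL_ONLY  : %g\n", real_only);    /* 1     */
  printf("IMAGE_ONLY : %g\n", image_only);   /* -0.75 */
  printf("combined   : %g\n", combined);     /* 0.25  */
  return 0;
}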
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef USE_ALPHA +#define REAL_PART(a, b) (a) +#define IMAGE_PART(a, b) (b) +#else +#define REAL_PART(a, b) (alpha_r * (a) + alpha_i * (b)) +#define IMAGE_PART(a, b) (alpha_i * (a) - alpha_r * (b)) +#endif + +#if defined(REAL_ONLY) +#define CMULT(a, b) (REAL_PART(a, b)) +#elif defined(IMAGE_ONLY) +#define CMULT(a, b) (IMAGE_PART(a, b)) +#else +#define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, +#ifdef USE_ALPHA + FLOAT alpha_r, FLOAT alpha_i, +#endif + FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02; + FLOAT *ao1, *ao2; + + lda *= 2; + + js = (n >> 1); + while (js > 0){ + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + + i = m; + + while (i > 0) { + if (offset > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + } else + if (offset < -1) { + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + } else { + switch (offset) { + case 0 : + data01 = CMULT(*(ao1 + 0), ZERO); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + break; + case -1 : + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), ZERO); + break; + } + } + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + posX += 2; + js --; + } + + if (n & 1) { + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + + i = m; + + while (i > 0) { + if (offset > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + } else + if (offset < 0) { + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + } else { + data01 = CMULT(*(ao1 + 0), ZERO); + } + + if (offset > 0) ao1 += lda; else ao1 += 2; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + } + + 
return 0; +} diff --git a/kernel/generic/zhemm3m_lcopy_4.c b/kernel/generic/zhemm3m_lcopy_4.c new file mode 100644 index 0000000000..7e958f180e --- /dev/null +++ b/kernel/generic/zhemm3m_lcopy_4.c @@ -0,0 +1,217 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
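In the two-column variant that ends above, offset = posX - posY tracks where each packed row sits relative to the diagonal: while offset is positive both columns take the stored value, once it drops below -1 both take the conjugated value, and the switch handles the two rows where the diagonal passes through the block. A small sketch enumerating those per-column decisions ('s' stored, 'c' conjugated, 'd' real diagonal with ZERO imaginary part):

#include <stdio.h>

int main(void) {
  long offset;
  for (offset = 2; offset >= -3; offset--) {
    char col0, col1;
    if (offset > 0)         { col0 = 's'; col1 = 's'; }
    else if (offset < -1)   { col0 = 'c'; col1 = 'c'; }
    else if (offset == 0)   { col0 = 'd'; col1 = 's'; }
    else /* offset == -1 */ { col0 = 'c'; col1 = 'd'; }
    printf("offset %2ld : col0=%c col1=%c\n", offset, col0, col1);
  }
  return 0;
}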
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef USE_ALPHA +#define REAL_PART(a, b) (a) +#define IMAGE_PART(a, b) (b) +#else +#define REAL_PART(a, b) (alpha_r * (a) + alpha_i * (b)) +#define IMAGE_PART(a, b) (alpha_i * (a) - alpha_r * (b)) +#endif + +#if defined(REAL_ONLY) +#define CMULT(a, b) (REAL_PART(a, b)) +#elif defined(IMAGE_ONLY) +#define CMULT(a, b) (IMAGE_PART(a, b)) +#else +#define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, +#ifdef USE_ALPHA + FLOAT alpha_r, FLOAT alpha_i, +#endif + FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04; + FLOAT *ao1, *ao2, *ao3, *ao4; + + lda *= 2; + + js = (n >> 2); + while (js > 0){ + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; + if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; + + i = m; + + while (i > 0) { + if (offset > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); + } else + if (offset < -3) { + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); + } else { + switch (offset) { + case 0 : + data01 = CMULT(*(ao1 + 0), ZERO); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); + break; + case -1 : + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), ZERO); + data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); + break; + case -2 : + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), ZERO); + data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); + break; + case -3 : + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), ZERO); + break; + } + } + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + if (offset > -2) ao3 += lda; else ao3 += 2; + if (offset > -3) ao4 += lda; else ao4 += 2; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; + i --; + } + + posX += 4; + js --; + } + + if (n & 2) { + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + + i = m; + + while (i > 0) { + if (offset > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + } else + if (offset < -1) { + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + } else { + switch (offset) { + case 0 : + data01 = CMULT(*(ao1 + 0), ZERO); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + break; + case -1 : + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), ZERO); + break; + } + } + + if (offset > 0) ao1 
+= lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + posX += 2; + } + + if (n & 1) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + + i = m; + + while (i > 0) { + if (offset > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + } else + if (offset < 0) { + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + } else { + data01 = CMULT(*(ao1 + 0), ZERO); + } + + if (offset > 0) ao1 += lda; else ao1 += 2; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + } + + return 0; +} diff --git a/kernel/generic/zhemm3m_lcopy_8.c b/kernel/generic/zhemm3m_lcopy_8.c new file mode 100644 index 0000000000..86600b5277 --- /dev/null +++ b/kernel/generic/zhemm3m_lcopy_8.c @@ -0,0 +1,364 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef USE_ALPHA +#define REAL_PART(a, b) (a) +#define IMAGE_PART(a, b) (b) +#else +#define REAL_PART(a, b) (alpha_r * (a) + alpha_i * (b)) +#define IMAGE_PART(a, b) (alpha_i * (a) - alpha_r * (b)) +#endif + +#if defined(REAL_ONLY) +#define CMULT(a, b) (REAL_PART(a, b)) +#elif defined(IMAGE_ONLY) +#define CMULT(a, b) (IMAGE_PART(a, b)) +#else +#define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, +#ifdef USE_ALPHA + FLOAT alpha_r, FLOAT alpha_i, +#endif + FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; + + lda *= 2; + + js = (n >> 3); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; + if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; + if (offset > -4) ao5 = a + (posX + 4) * 2 + posY * lda; else ao5 = a + posY * 2 + (posX + 4) * lda; + if (offset > -5) ao6 = a + (posX + 5) * 2 + posY * lda; else ao6 = a + posY * 2 + (posX + 5) * lda; + if (offset > -6) ao7 = a + (posX + 6) * 2 + posY * lda; else ao7 = a + posY * 2 + (posX + 6) * lda; + if (offset > -7) ao8 = a + (posX + 7) * 2 + posY * lda; else ao8 = a + posY * 2 + (posX + 7) * lda; + + i = m; + + while (i > 0) { + if (offset > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); + data05 = CMULT(*(ao5 + 0), *(ao5 + 1)); + data06 = CMULT(*(ao6 + 0), *(ao6 + 1)); + data07 = CMULT(*(ao7 + 0), *(ao7 + 1)); + data08 = CMULT(*(ao8 + 0), *(ao8 + 1)); + } else + if (offset < -7) { + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); + data05 = CMULT(*(ao5 + 0), -*(ao5 + 1)); + data06 = CMULT(*(ao6 + 0), -*(ao6 + 1)); + data07 = CMULT(*(ao7 + 0), -*(ao7 + 1)); + data08 = CMULT(*(ao8 + 0), -*(ao8 + 1)); + } else { + switch (offset) { + case 0 : + data01 = CMULT(*(ao1 + 0), ZERO); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); + data05 = CMULT(*(ao5 + 0), *(ao5 + 1)); + data06 = CMULT(*(ao6 + 0), *(ao6 + 1)); + data07 = CMULT(*(ao7 + 0), *(ao7 + 1)); + data08 = CMULT(*(ao8 + 0), *(ao8 + 1)); + break; + case -1 : + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), ZERO); + data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); + data05 = CMULT(*(ao5 + 0), *(ao5 + 1)); + data06 = CMULT(*(ao6 + 0), *(ao6 + 1)); + data07 = CMULT(*(ao7 + 0), *(ao7 + 1)); + data08 = CMULT(*(ao8 + 0), *(ao8 + 1)); + break; + case -2 : + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), ZERO); + data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); + data05 = CMULT(*(ao5 + 0), *(ao5 + 1)); + data06 = CMULT(*(ao6 + 0), *(ao6 + 1)); + data07 = CMULT(*(ao7 + 0), *(ao7 + 1)); + data08 = 
CMULT(*(ao8 + 0), *(ao8 + 1)); + break; + case -3 : + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), ZERO); + data05 = CMULT(*(ao5 + 0), *(ao5 + 1)); + data06 = CMULT(*(ao6 + 0), *(ao6 + 1)); + data07 = CMULT(*(ao7 + 0), *(ao7 + 1)); + data08 = CMULT(*(ao8 + 0), *(ao8 + 1)); + break; + case -4 : + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); + data05 = CMULT(*(ao5 + 0), ZERO); + data06 = CMULT(*(ao6 + 0), *(ao6 + 1)); + data07 = CMULT(*(ao7 + 0), *(ao7 + 1)); + data08 = CMULT(*(ao8 + 0), *(ao8 + 1)); + break; + case -5 : + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); + data05 = CMULT(*(ao5 + 0), -*(ao5 + 1)); + data06 = CMULT(*(ao6 + 0), ZERO); + data07 = CMULT(*(ao7 + 0), *(ao7 + 1)); + data08 = CMULT(*(ao8 + 0), *(ao8 + 1)); + break; + case -6 : + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); + data05 = CMULT(*(ao5 + 0), -*(ao5 + 1)); + data06 = CMULT(*(ao6 + 0), -*(ao6 + 1)); + data07 = CMULT(*(ao7 + 0), ZERO); + data08 = CMULT(*(ao8 + 0), *(ao8 + 1)); + break; + case -7 : + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); + data05 = CMULT(*(ao5 + 0), -*(ao5 + 1)); + data06 = CMULT(*(ao6 + 0), -*(ao6 + 1)); + data07 = CMULT(*(ao7 + 0), -*(ao7 + 1)); + data08 = CMULT(*(ao8 + 0), ZERO); + break; + } + } + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + if (offset > -2) ao3 += lda; else ao3 += 2; + if (offset > -3) ao4 += lda; else ao4 += 2; + if (offset > -4) ao5 += lda; else ao5 += 2; + if (offset > -5) ao6 += lda; else ao6 += 2; + if (offset > -6) ao7 += lda; else ao7 += 2; + if (offset > -7) ao8 += lda; else ao8 += 2; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b += 8; + + offset --; + i --; + } + + posX += 8; + js --; + } + + if (n & 4) { + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; + if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; + + i = m; + + while (i > 0) { + if (offset > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); + } else + if (offset < -3) { + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); + } else { + switch (offset) { + case 0 : + data01 = CMULT(*(ao1 + 0), ZERO); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); + break; + case -1 : + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + 
data02 = CMULT(*(ao2 + 0), ZERO); + data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); + break; + case -2 : + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), ZERO); + data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); + break; + case -3 : + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), ZERO); + break; + } + } + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + if (offset > -2) ao3 += lda; else ao3 += 2; + if (offset > -3) ao4 += lda; else ao4 += 2; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; + i --; + } + + posX += 4; + } + + if (n & 2) { + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + + i = m; + + while (i > 0) { + if (offset > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + } else + if (offset < -1) { + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + } else { + switch (offset) { + case 0 : + data01 = CMULT(*(ao1 + 0), ZERO); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + break; + case -1 : + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), ZERO); + break; + } + } + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + posX += 2; + } + + if (n & 1) { + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + + i = m; + + while (i > 0) { + if (offset > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + } else + if (offset < 0) { + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + } else { + data01 = CMULT(*(ao1 + 0), ZERO); + } + + if (offset > 0) ao1 += lda; else ao1 += 2; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + + } + + return 0; +} diff --git a/kernel/generic/zhemm3m_ucopy_1.c b/kernel/generic/zhemm3m_ucopy_1.c new file mode 100644 index 0000000000..a6d4975e24 --- /dev/null +++ b/kernel/generic/zhemm3m_ucopy_1.c @@ -0,0 +1,106 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef USE_ALPHA +#define REAL_PART(a, b) (a) +#define IMAGE_PART(a, b) (b) +#else +#define REAL_PART(a, b) (alpha_r * (a) + alpha_i * (b)) +#define IMAGE_PART(a, b) (alpha_i * (a) - alpha_r * (b)) +#endif + +#if defined(REAL_ONLY) +#define CMULT(a, b) (REAL_PART(a, b)) +#elif defined(IMAGE_ONLY) +#define CMULT(a, b) (IMAGE_PART(a, b)) +#else +#define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, +#ifdef USE_ALPHA + FLOAT alpha_r, FLOAT alpha_i, +#endif + FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01; + FLOAT *ao1; + + lda *= 2; + + js = n; + + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + + i = m; + + while (i > 0) { + if (offset > 0) { + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + } else + if (offset < 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + } else { + data01 = CMULT(*(ao1 + 0), ZERO); + } + + if (offset > 0) ao1 += 2; else ao1 += lda; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + + posX ++; + js --; + } + + return 0; +} diff --git a/kernel/generic/zhemm3m_ucopy_2.c b/kernel/generic/zhemm3m_ucopy_2.c new file mode 100644 index 0000000000..fecbae615e --- /dev/null +++ b/kernel/generic/zhemm3m_ucopy_2.c @@ -0,0 +1,146 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef USE_ALPHA +#define REAL_PART(a, b) (a) +#define IMAGE_PART(a, b) (b) +#else +#define REAL_PART(a, b) (alpha_r * (a) + alpha_i * (b)) +#define IMAGE_PART(a, b) (alpha_i * (a) - alpha_r * (b)) +#endif + +#if defined(REAL_ONLY) +#define CMULT(a, b) (REAL_PART(a, b)) +#elif defined(IMAGE_ONLY) +#define CMULT(a, b) (IMAGE_PART(a, b)) +#else +#define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, +#ifdef USE_ALPHA + FLOAT alpha_r, FLOAT alpha_i, +#endif + FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02; + FLOAT *ao1, *ao2; + + lda *= 2; + + js = (n >> 1); + while (js > 0){ + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + + i = m; + + while (i > 0) { + if (offset > 0) { + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + } else + if (offset < -1) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + } else { + switch (offset) { + case 0 : + data01 = CMULT(*(ao1 + 0), ZERO); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + break; + case -1 : + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), ZERO); + break; + } + } + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + posX += 2; + js --; + } + + if (n & 1) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + + i = m; + + while (i > 0) { + if (offset > 0) { + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + } else + if (offset < 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + } else { + data01 = CMULT(*(ao1 + 0), ZERO); + } + + if (offset > 0) ao1 += 2; else ao1 += lda; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + } + + return 0; +} diff --git a/kernel/generic/zhemm3m_ucopy_4.c b/kernel/generic/zhemm3m_ucopy_4.c new file mode 100644 index 0000000000..6a45c7ed6a --- /dev/null +++ b/kernel/generic/zhemm3m_ucopy_4.c @@ -0,0 +1,217 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef USE_ALPHA +#define REAL_PART(a, b) (a) +#define IMAGE_PART(a, b) (b) +#else +#define REAL_PART(a, b) (alpha_r * (a) + alpha_i * (b)) +#define IMAGE_PART(a, b) (alpha_i * (a) - alpha_r * (b)) +#endif + +#if defined(REAL_ONLY) +#define CMULT(a, b) (REAL_PART(a, b)) +#elif defined(IMAGE_ONLY) +#define CMULT(a, b) (IMAGE_PART(a, b)) +#else +#define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, +#ifdef USE_ALPHA + FLOAT alpha_r, FLOAT alpha_i, +#endif + FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04; + FLOAT *ao1, *ao2, *ao3, *ao4; + + lda *= 2; + + js = (n >> 2); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; + if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; + + i = m; + + while (i > 0) { + if (offset > 0) { + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); + } else + if (offset < -3) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); + } else { + switch (offset) { + case 0 : + data01 = CMULT(*(ao1 + 0), ZERO); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), -*(ao3 + 
1)); + data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); + break; + case -1 : + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), ZERO); + data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); + break; + case -2 : + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), ZERO); + data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); + break; + case -3 : + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), ZERO); + break; + } + } + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + if (offset > -2) ao3 += 2; else ao3 += lda; + if (offset > -3) ao4 += 2; else ao4 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; + i --; + } + + posX += 4; + js --; + } + + if (n & 2) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + + i = m; + + while (i > 0) { + if (offset > 0) { + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + } else + if (offset < -1) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + } else { + switch (offset) { + case 0 : + data01 = CMULT(*(ao1 + 0), ZERO); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + break; + case -1 : + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), ZERO); + break; + } + } + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + posX += 2; + } + + if (n & 1) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + + i = m; + + while (i > 0) { + if (offset > 0) { + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + } else + if (offset < 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + } else { + data01 = CMULT(*(ao1 + 0), ZERO); + } + + if (offset > 0) ao1 += 2; else ao1 += lda; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + } + + return 0; +} diff --git a/kernel/generic/zhemm3m_ucopy_8.c b/kernel/generic/zhemm3m_ucopy_8.c new file mode 100644 index 0000000000..efed390a72 --- /dev/null +++ b/kernel/generic/zhemm3m_ucopy_8.c @@ -0,0 +1,364 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef USE_ALPHA +#define REAL_PART(a, b) (a) +#define IMAGE_PART(a, b) (b) +#else +#define REAL_PART(a, b) (alpha_r * (a) + alpha_i * (b)) +#define IMAGE_PART(a, b) (alpha_i * (a) - alpha_r * (b)) +#endif + +#if defined(REAL_ONLY) +#define CMULT(a, b) (REAL_PART(a, b)) +#elif defined(IMAGE_ONLY) +#define CMULT(a, b) (IMAGE_PART(a, b)) +#else +#define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, +#ifdef USE_ALPHA + FLOAT alpha_r, FLOAT alpha_i, +#endif + FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; + + lda *= 2; + + js = (n >> 3); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; + if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; + if (offset > -4) ao5 = a + posY * 2 + (posX + 4) * lda; else ao5 = a + (posX + 4) * 2 + posY * lda; + if (offset > -5) ao6 = a + posY * 2 + (posX + 5) * lda; else ao6 = a + (posX + 5) * 2 + posY * lda; + if (offset > -6) ao7 = a + posY * 2 + (posX + 6) * lda; else ao7 = a + (posX + 6) * 2 + posY * lda; + if (offset > -7) ao8 = a + posY * 2 + (posX + 7) * lda; else ao8 = a + (posX + 7) * 2 + posY * lda; + + i = m; + + while (i > 0) { + if (offset > 0) { + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); + data05 = CMULT(*(ao5 + 0), -*(ao5 + 1)); + data06 = CMULT(*(ao6 + 0), -*(ao6 + 1)); + data07 = CMULT(*(ao7 + 0), -*(ao7 + 1)); + data08 = CMULT(*(ao8 + 0), -*(ao8 + 1)); + } else + if (offset < -7) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); + data05 = CMULT(*(ao5 + 0), *(ao5 + 1)); + data06 = CMULT(*(ao6 + 0), *(ao6 + 1)); + 
data07 = CMULT(*(ao7 + 0), *(ao7 + 1)); + data08 = CMULT(*(ao8 + 0), *(ao8 + 1)); + } else { + switch (offset) { + case 0 : + data01 = CMULT(*(ao1 + 0), ZERO); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); + data05 = CMULT(*(ao5 + 0), -*(ao5 + 1)); + data06 = CMULT(*(ao6 + 0), -*(ao6 + 1)); + data07 = CMULT(*(ao7 + 0), -*(ao7 + 1)); + data08 = CMULT(*(ao8 + 0), -*(ao8 + 1)); + break; + case -1 : + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), ZERO); + data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); + data05 = CMULT(*(ao5 + 0), -*(ao5 + 1)); + data06 = CMULT(*(ao6 + 0), -*(ao6 + 1)); + data07 = CMULT(*(ao7 + 0), -*(ao7 + 1)); + data08 = CMULT(*(ao8 + 0), -*(ao8 + 1)); + break; + case -2 : + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), ZERO); + data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); + data05 = CMULT(*(ao5 + 0), -*(ao5 + 1)); + data06 = CMULT(*(ao6 + 0), -*(ao6 + 1)); + data07 = CMULT(*(ao7 + 0), -*(ao7 + 1)); + data08 = CMULT(*(ao8 + 0), -*(ao8 + 1)); + break; + case -3 : + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), ZERO); + data05 = CMULT(*(ao5 + 0), -*(ao5 + 1)); + data06 = CMULT(*(ao6 + 0), -*(ao6 + 1)); + data07 = CMULT(*(ao7 + 0), -*(ao7 + 1)); + data08 = CMULT(*(ao8 + 0), -*(ao8 + 1)); + break; + case -4 : + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); + data05 = CMULT(*(ao5 + 0), ZERO); + data06 = CMULT(*(ao6 + 0), -*(ao6 + 1)); + data07 = CMULT(*(ao7 + 0), -*(ao7 + 1)); + data08 = CMULT(*(ao8 + 0), -*(ao8 + 1)); + break; + case -5 : + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); + data05 = CMULT(*(ao5 + 0), *(ao5 + 1)); + data06 = CMULT(*(ao6 + 0), ZERO); + data07 = CMULT(*(ao7 + 0), -*(ao7 + 1)); + data08 = CMULT(*(ao8 + 0), -*(ao8 + 1)); + break; + case -6 : + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); + data05 = CMULT(*(ao5 + 0), *(ao5 + 1)); + data06 = CMULT(*(ao6 + 0), *(ao6 + 1)); + data07 = CMULT(*(ao7 + 0), ZERO); + data08 = CMULT(*(ao8 + 0), -*(ao8 + 1)); + break; + case -7 : + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); + data05 = CMULT(*(ao5 + 0), *(ao5 + 1)); + data06 = CMULT(*(ao6 + 0), *(ao6 + 1)); + data07 = CMULT(*(ao7 + 0), *(ao7 + 1)); + data08 = CMULT(*(ao8 + 0), ZERO); + break; + } + } + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + if (offset > -2) ao3 += 2; else ao3 += lda; + if (offset > -3) ao4 += 2; else ao4 += lda; + if (offset > -4) ao5 += 2; else ao5 += lda; + if (offset > -5) ao6 += 2; else ao6 += lda; + if (offset > -6) ao7 += 2; else ao7 += lda; + if (offset > -7) ao8 += 2; else ao8 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b += 8; + + offset --; + i --; + } + + posX += 8; + js --; + } + + if (n & 4) { 
+ + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; + if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; + + i = m; + + while (i > 0) { + if (offset > 0) { + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); + } else + if (offset < -3) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); + } else { + switch (offset) { + case 0 : + data01 = CMULT(*(ao1 + 0), ZERO); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); + break; + case -1 : + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), ZERO); + data03 = CMULT(*(ao3 + 0), -*(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); + break; + case -2 : + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), ZERO); + data04 = CMULT(*(ao4 + 0), -*(ao4 + 1)); + break; + case -3 : + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), ZERO); + break; + } + } + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + if (offset > -2) ao3 += 2; else ao3 += lda; + if (offset > -3) ao4 += 2; else ao4 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; + i --; + } + + posX += 4; + } + + if (n & 2) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + + i = m; + + while (i > 0) { + if (offset > 0) { + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + } else + if (offset < -1) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + } else { + switch (offset) { + case 0 : + data01 = CMULT(*(ao1 + 0), ZERO); + data02 = CMULT(*(ao2 + 0), -*(ao2 + 1)); + break; + case -1 : + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), ZERO); + break; + } + } + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + posX += 2; + } + + if (n & 1) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + + i = m; + + while (i > 0) { + if (offset > 0) { + data01 = CMULT(*(ao1 + 0), -*(ao1 + 1)); + } else + if (offset < 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + } else { + data01 = CMULT(*(ao1 + 0), ZERO); + } + + if (offset > 0) ao1 += 2; else ao1 += lda; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + } + + return 0; +} diff --git a/kernel/generic/zhemm_ltcopy_1.c b/kernel/generic/zhemm_ltcopy_1.c new file mode 100644 index 0000000000..6f5615b79d --- /dev/null +++ b/kernel/generic/zhemm_ltcopy_1.c @@ -0,0 +1,90 @@ 
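/* The zhemm_ltcopy_N / zhemm_utcopy_N sources that follow (N = 1, 2, 4, 8) appear to be */
/* the generic packing helpers for the complex HEMM drivers: each copies an N-column     */
/* panel of a Hermitian matrix, stored as its lower (lt) or upper (ut) triangle, into    */
/* the contiguous work buffer b. The running offset = posX - posY tracks the panel's     */
/* position relative to the diagonal: entries taken from the stored triangle are copied  */
/* unchanged, entries reconstructed from the opposite triangle have their imaginary part */
/* negated (complex conjugation), and diagonal entries are written with an explicit ZERO */
/* imaginary part, since a Hermitian matrix has a real diagonal.                         */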
+/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02; + FLOAT *ao1; + + lda *= 2; + + js = n; + + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = data02; + } else + if (offset < 0) { + b[ 0] = data01; + b[ 1] = -data02; + } else { + b[ 0] = data01; + b[ 1] = ZERO; + } + + b += 2; + + offset --; + i --; + } + posX ++; + js --; + + } + + return 0; +} diff --git a/kernel/generic/zhemm_ltcopy_2.c b/kernel/generic/zhemm_ltcopy_2.c new file mode 100644 index 0000000000..8547b4d685 --- /dev/null +++ b/kernel/generic/zhemm_ltcopy_2.c @@ -0,0 +1,144 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04; + FLOAT *ao1, *ao2; + + lda *= 2; + + js = (n >> 1); + + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + } else + if (offset < -1) { + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = data04; + break; + case -1 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = ZERO; + break; + } + } + + b += 4; + + offset --; + i --; + } + + posX += 2; + js --; + } + + if (n & 1) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = data02; + } else + if (offset < 0) { + b[ 0] = data01; + b[ 1] = -data02; + } else { + b[ 0] = data01; + b[ 1] = ZERO; + } + + b += 2; + + offset --; + i --; + } + + } + + return 0; +} diff --git a/kernel/generic/zhemm_ltcopy_4.c b/kernel/generic/zhemm_ltcopy_4.c new file mode 100644 index 0000000000..d7afc11743 --- /dev/null +++ b/kernel/generic/zhemm_ltcopy_4.c @@ -0,0 +1,244 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT *ao1, *ao2, *ao3, *ao4; + + lda *= 2; + + js = (n >> 2); + + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; + if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + if (offset > -2) ao3 += lda; else ao3 += 2; + if (offset > -3) ao4 += lda; else ao4 += 2; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + } else + if (offset < -3) { + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + break; + case -1 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = ZERO; 
+ b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + break; + case -2 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = ZERO; + b[ 6] = data07; + b[ 7] = data08; + break; + case -3 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = ZERO; + break; + } + } + + b += 8; + + offset --; + i --; + } + + posX += 4; + js --; + } + + if (n & 2) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + } else + if (offset < -1) { + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = data04; + break; + case -1 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = ZERO; + break; + } + } + + b += 4; + + offset --; + i --; + } + + posX += 2; + + } + + if (n & 1) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = data02; + } else + if (offset < 0) { + b[ 0] = data01; + b[ 1] = -data02; + } else { + b[ 0] = data01; + b[ 1] = ZERO; + } + + b += 2; + + offset --; + i --; + } + + } + + return 0; +} diff --git a/kernel/generic/zhemm_ltcopy_8.c b/kernel/generic/zhemm_ltcopy_8.c new file mode 100644 index 0000000000..d5ebd1c81b --- /dev/null +++ b/kernel/generic/zhemm_ltcopy_8.c @@ -0,0 +1,480 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; + + lda *= 2; + + js = (n >> 3); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; + if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; + if (offset > -4) ao5 = a + (posX + 4) * 2 + posY * lda; else ao5 = a + posY * 2 + (posX + 4) * lda; + if (offset > -5) ao6 = a + (posX + 5) * 2 + posY * lda; else ao6 = a + posY * 2 + (posX + 5) * lda; + if (offset > -6) ao7 = a + (posX + 6) * 2 + posY * lda; else ao7 = a + posY * 2 + (posX + 6) * lda; + if (offset > -7) ao8 = a + (posX + 7) * 2 + posY * lda; else ao8 = a + posY * 2 + (posX + 7) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + data09 = *(ao5 + 0); + data10 = *(ao5 + 1); + data11 = *(ao6 + 0); + data12 = *(ao6 + 1); + data13 = *(ao7 + 0); + data14 = *(ao7 + 1); + data15 = *(ao8 + 0); + data16 = *(ao8 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + if (offset > -2) ao3 += lda; else ao3 += 2; + if (offset > -3) ao4 += lda; else ao4 += 2; + if (offset > -4) ao5 += lda; else ao5 += 2; + if (offset > -5) ao6 += lda; else ao6 += 2; + if (offset > -6) ao7 += lda; else ao7 += 2; + if (offset > -7) ao8 += lda; else ao8 += 2; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + } else + if (offset < -7) { + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; 
+ b[14] = data15; + b[15] = -data16; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + break; + case -1 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = ZERO; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + break; + case -2 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = ZERO; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + break; + case -3 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = ZERO; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + break; + case -4 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = ZERO; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + break; + case -5 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = ZERO; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + break; + case -6 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = ZERO; + b[14] = data15; + b[15] = data16; + break; + case -7 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = ZERO; + break; + } + } + + b += 16; + + offset --; + i --; + } + + posX += 8; + js --; + } + + if (n & 4) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; + if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + if (offset > -2) ao3 += lda; else ao3 += 2; + if (offset > -3) ao4 += lda; else ao4 += 2; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; 
+ b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + } else + if (offset < -3) { + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + break; + case -1 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = ZERO; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + break; + case -2 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = ZERO; + b[ 6] = data07; + b[ 7] = data08; + break; + case -3 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = ZERO; + break; + } + } + + b += 8; + + offset --; + i --; + } + + posX += 4; + } + + if (n & 2) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + } else + if (offset < -1) { + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = data04; + break; + case -1 : + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = ZERO; + break; + } + } + + b += 4; + + offset --; + i --; + } + + posX += 2; + + } + + if (n & 1) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = data02; + } else + if (offset < 0) { + b[ 0] = data01; + b[ 1] = -data02; + } else { + b[ 0] = data01; + b[ 1] = ZERO; + } + + b += 2; + + offset --; + i --; + } + + } + + return 0; +} + diff --git a/kernel/generic/zhemm_utcopy_1.c b/kernel/generic/zhemm_utcopy_1.c new file mode 100644 index 0000000000..961b8497e7 --- /dev/null +++ b/kernel/generic/zhemm_utcopy_1.c @@ -0,0 +1,88 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
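For reference, the element rule implemented by the zhemm_ltcopy kernels above can be written as a plain, unblocked expansion. This is a hedged sketch, not code from the patch: the name zhe_expand_lower is invented, a is assumed column-major with lda counted in complex elements, and the output is a fully expanded n x n block rather than the strip-interleaved packing the kernels actually produce. The point is only the value rule visible above: below-diagonal entries come straight from the stored lower triangle, above-diagonal entries are conjugates of their mirrors, and the imaginary part of the diagonal is forced to zero (a Hermitian diagonal is real).

static void zhe_expand_lower(int n, const double *a, int lda, double *full) {
  /* full: n x n, column-major, complex interleaved (re, im), leading dim n */
  for (int j = 0; j < n; j++) {
    for (int i = 0; i < n; i++) {
      double re, im;
      if (i > j) {                        /* stored in the lower triangle */
        re = a[2 * (i + j * lda) + 0];
        im = a[2 * (i + j * lda) + 1];
      } else if (i < j) {                 /* mirror of a stored element, conjugated */
        re =  a[2 * (j + i * lda) + 0];
        im = -a[2 * (j + i * lda) + 1];
      } else {                            /* diagonal: imaginary part zeroed */
        re = a[2 * (i + j * lda) + 0];
        im = 0.0;
      }
      full[2 * (i + j * n) + 0] = re;
      full[2 * (i + j * n) + 1] = im;
    }
  }
}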
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02; + FLOAT *ao1; + + lda *= 2; + + js = n; + while (js > 0){ + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = -data02; + } else + if (offset < 0) { + b[ 0] = data01; + b[ 1] = data02; + } else { + b[ 0] = data01; + b[ 1] = ZERO; + } + + b += 2; + + offset --; + i --; + } + + posX ++; + js --; + } + + return 0; +} diff --git a/kernel/generic/zhemm_utcopy_2.c b/kernel/generic/zhemm_utcopy_2.c new file mode 100644 index 0000000000..91e7108b4b --- /dev/null +++ b/kernel/generic/zhemm_utcopy_2.c @@ -0,0 +1,142 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
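The unrolled utcopy variants that follow rely on the same bookkeeping as zhemm_utcopy_1 above: offset = posX - posY is decremented once per packed row, so for the j-th column of a strip the sign of offset + j selects the conjugate, diagonal, or plain-copy branch. The standalone demo below (illustration only, not library code; the indices and block size are arbitrary) prints which branch would be taken for each element of a small diagonal block, under the reading that offset + j compares the two global indices of the element being packed.

#include <stdio.h>

int main(void) {
  long posX = 0, posY = 0, m = 4, cols = 4;   /* arbitrary small diagonal block */
  long offset = posX - posY;
  for (long i = 0; i < m; i++) {
    for (long j = 0; j < cols; j++) {
      long d = offset + j;                    /* (posX + j) - (posY + i) */
      if (d > 0)
        printf("i=%ld j=%ld: conjugate the stored element\n", i, j);
      else if (d == 0)
        printf("i=%ld j=%ld: diagonal, imaginary part set to ZERO\n", i, j);
      else
        printf("i=%ld j=%ld: copy the stored element as-is\n", i, j);
    }
    offset--;
  }
  return 0;
}

For the ltcopy direction above, the conjugate and plain-copy branches are swapped, which is visible by comparing the two "offset > 0" fast paths of the two families.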
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04; + FLOAT *ao1, *ao2; + + lda *= 2; + + js = (n >> 1); + while (js > 0){ + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + } else + if (offset < -1) { + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = -data04; + break; + case -1 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = ZERO; + break; + } + } + + b += 4; + + offset --; + i --; + } + + posX += 2; + js --; + } + + if (n & 1) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = -data02; + } else + if (offset < 0) { + b[ 0] = data01; + b[ 1] = data02; + } else { + b[ 0] = data01; + b[ 1] = ZERO; + } + + b += 2; + + offset --; + i --; + } + + } + + return 0; +} diff --git a/kernel/generic/zhemm_utcopy_4.c b/kernel/generic/zhemm_utcopy_4.c new file mode 100644 index 0000000000..15671b44a5 --- /dev/null +++ b/kernel/generic/zhemm_utcopy_4.c @@ -0,0 +1,242 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT *ao1, *ao2, *ao3, *ao4; + + lda *= 2; + + js = (n >> 2); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; + if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + if (offset > -2) ao3 += 2; else ao3 += lda; + if (offset > -3) ao4 += 2; else ao4 += lda; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + } else + if (offset < -3) { + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + break; + case -1 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = ZERO; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + break; + case -2 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = ZERO; + b[ 6] = data07; + b[ 7] = -data08; + break; + case -3 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = 
data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = ZERO; + break; + } + } + + b += 8; + + offset --; + i --; + } + + posX += 4; + js --; + } + + if (n & 2) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + } else + if (offset < -1) { + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = -data04; + break; + case -1 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = ZERO; + break; + } + } + + b += 4; + + offset --; + i --; + } + + posX += 2; + } + + if (n & 1) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = -data02; + } else + if (offset < 0) { + b[ 0] = data01; + b[ 1] = data02; + } else { + b[ 0] = data01; + b[ 1] = ZERO; + } + + b += 2; + + offset --; + i --; + } + + } + + return 0; +} diff --git a/kernel/generic/zhemm_utcopy_8.c b/kernel/generic/zhemm_utcopy_8.c new file mode 100644 index 0000000000..1cfd3bd59d --- /dev/null +++ b/kernel/generic/zhemm_utcopy_8.c @@ -0,0 +1,477 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
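zhemm_utcopy_2 and zhemm_utcopy_4 above, and zhemm_utcopy_8 below, differ from the unroll-1 version only in how many columns are packed per pass; the widest variant handles eight columns in its main loop and picks up the remainder with the n & 4, n & 2 and n & 1 tails. A minimal sketch of that driver shape, with do_cols as a hypothetical callback standing in for the unrolled strip bodies:

static void process_columns(long n, void (*do_cols)(long ncols)) {
  long js = n >> 3;            /* full strips of 8 columns */
  while (js > 0) {
    do_cols(8);
    js--;
  }
  if (n & 4) do_cols(4);       /* remaining 4-, 2- and 1-column tails */
  if (n & 2) do_cols(2);
  if (n & 1) do_cols(1);
}

Any column count is therefore covered by the four block sizes alone, which is why each copy file needs no scalar cleanup loop.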
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; + + lda *= 2; + + js = (n >> 3); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; + if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; + if (offset > -4) ao5 = a + posY * 2 + (posX + 4) * lda; else ao5 = a + (posX + 4) * 2 + posY * lda; + if (offset > -5) ao6 = a + posY * 2 + (posX + 5) * lda; else ao6 = a + (posX + 5) * 2 + posY * lda; + if (offset > -6) ao7 = a + posY * 2 + (posX + 6) * lda; else ao7 = a + (posX + 6) * 2 + posY * lda; + if (offset > -7) ao8 = a + posY * 2 + (posX + 7) * lda; else ao8 = a + (posX + 7) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + data09 = *(ao5 + 0); + data10 = *(ao5 + 1); + data11 = *(ao6 + 0); + data12 = *(ao6 + 1); + data13 = *(ao7 + 0); + data14 = *(ao7 + 1); + data15 = *(ao8 + 0); + data16 = *(ao8 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + if (offset > -2) ao3 += 2; else ao3 += lda; + if (offset > -3) ao4 += 2; else ao4 += lda; + if (offset > -4) ao5 += 2; else ao5 += lda; + if (offset > -5) ao6 += 2; else ao6 += lda; + if (offset > -6) ao7 += 2; else ao7 += lda; + if (offset > -7) ao8 += 2; else ao8 += lda; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + } else + if (offset < -7) { + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + break; + case -1 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = ZERO; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = 
data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + break; + case -2 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = ZERO; + b[ 6] = data07; + b[ 7] = -data08; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + break; + case -3 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = ZERO; + b[ 8] = data09; + b[ 9] = -data10; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + break; + case -4 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = ZERO; + b[10] = data11; + b[11] = -data12; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + break; + case -5 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = ZERO; + b[12] = data13; + b[13] = -data14; + b[14] = data15; + b[15] = -data16; + break; + case -6 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = ZERO; + b[14] = data15; + b[15] = -data16; + break; + case -7 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = ZERO; + break; + } + } + + b += 16; + + offset --; + i --; + } + + posX += 8; + js --; + } + + if (n & 4) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; + if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + if (offset > -2) ao3 += 2; else ao3 += lda; + if (offset > -3) ao4 += 2; else ao4 += lda; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + } else + if (offset < -3) { + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = -data04; + b[ 4] = data05; + b[ 5] = -data06; + b[ 6] = data07; + b[ 7] = -data08; + break; + case -1 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = ZERO; + b[ 4] = data05; + b[ 5] = -data06; + 
b[ 6] = data07; + b[ 7] = -data08; + break; + case -2 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = ZERO; + b[ 6] = data07; + b[ 7] = -data08; + break; + case -3 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = ZERO; + break; + } + } + + b += 8; + + offset --; + i --; + } + + posX += 4; + } + + if (n & 2) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = -data02; + b[ 2] = data03; + b[ 3] = -data04; + } else + if (offset < -1) { + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + } else { + switch (offset) { + case 0 : + b[ 0] = data01; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = -data04; + break; + case -1 : + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = ZERO; + break; + } + } + + b += 4; + + offset --; + i --; + } + + posX += 2; + } + + if (n & 1) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + + if (offset > 0) { + b[ 0] = data01; + b[ 1] = -data02; + } else + if (offset < 0) { + b[ 0] = data01; + b[ 1] = data02; + } else { + b[ 0] = data01; + b[ 1] = ZERO; + } + + b += 2; + + offset --; + i --; + } + + } + + return 0; +} diff --git a/kernel/generic/zhemv_k.c b/kernel/generic/zhemv_k.c new file mode 100644 index 0000000000..3551938daa --- /dev/null +++ b/kernel/generic/zhemv_k.c @@ -0,0 +1,157 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#include "symcopy.h" + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ + + BLASLONG is, min_i; + FLOAT *X = x; + FLOAT *Y = y; + FLOAT *symbuffer = buffer; + FLOAT *gemvbuffer = (FLOAT *)(((BLASLONG)buffer + SYMV_P * SYMV_P * sizeof(FLOAT) * 2 + 4095) & ~4095); + FLOAT *bufferY = gemvbuffer; + FLOAT *bufferX = gemvbuffer; + + if (incy != 1) { + Y = bufferY; + bufferX = (FLOAT *)(((BLASLONG)bufferY + m * sizeof(FLOAT) * 2 + 4095) & ~4095); + gemvbuffer = bufferX; + COPY_K(m, y, incy, Y, 1); + } + + if (incx != 1) { + X = bufferX; + gemvbuffer = (FLOAT *)(((BLASLONG)bufferX + m * sizeof(FLOAT) * 2 + 4095) & ~4095); + COPY_K(m, x, incx, X, 1); + } + +#ifndef LOWER + for(is = m - offset; is < m; is += SYMV_P){ + min_i = MIN(m - is, SYMV_P); +#else + for(is = 0; is < offset; is += SYMV_P){ + min_i = MIN(offset - is, SYMV_P); +#endif + +#ifndef LOWER + if (is > 0){ +#ifndef HEMVREV + GEMV_C(is, min_i, 0, alpha_r, alpha_i, + a + is * lda * 2, lda, + X, 1, + Y + is * 2, 1, gemvbuffer); + + GEMV_N(is, min_i, 0, alpha_r, alpha_i, + a + is * lda * 2, lda, + X + is * 2, 1, + Y, 1, gemvbuffer); +#else + GEMV_T(is, min_i, 0, alpha_r, alpha_i, + a + is * lda * 2, lda, + X, 1, + Y + is * 2, 1, gemvbuffer); + + GEMV_R(is, min_i, 0, alpha_r, alpha_i, + a + is * lda * 2, lda, + X + is * 2, 1, + Y, 1, gemvbuffer); +#endif + } +#endif + +#ifndef HEMVREV +#ifdef LOWER + ZHEMCOPY_L(min_i, a + (is + is * lda) * 2, lda, symbuffer); +#else + ZHEMCOPY_U(min_i, a + (is + is * lda) * 2, lda, symbuffer); +#endif +#else +#ifdef LOWER + ZHEMCOPY_M(min_i, a + (is + is * lda) * 2, lda, symbuffer); +#else + ZHEMCOPY_V(min_i, a + (is + is * lda) * 2, lda, symbuffer); +#endif +#endif + + GEMV_N(min_i, min_i, 0, alpha_r, alpha_i, + symbuffer, min_i, + X + is * 2, 1, + Y + is * 2, 1, gemvbuffer); + +#ifdef LOWER + if (m - is - min_i > 0){ + +#ifndef HEMVREV + GEMV_C(m - is - min_i, min_i, 0, alpha_r, alpha_i, + a + ((is + min_i) + is * lda) * 2, lda, + X + (is + min_i) * 2, 1, + Y + is * 2, 1, gemvbuffer); + + GEMV_N(m - is - min_i, min_i, 0, alpha_r, alpha_i, + a + ((is + min_i) + is * lda) * 2, lda, + X + is * 2, 1, + Y + (is + min_i) * 2, 1, gemvbuffer); +#else + GEMV_T(m - is - min_i, min_i, 0, alpha_r, alpha_i, + a + ((is + min_i) + is * lda) * 2, lda, + X + (is + min_i) * 2, 1, + Y + is * 2, 1, gemvbuffer); + + GEMV_R(m - is - min_i, min_i, 0, alpha_r, alpha_i, + a + ((is + min_i) + is * lda) * 2, lda, + X + is * 2, 1, + Y + (is + min_i) * 2, 1, 
gemvbuffer); +#endif + + } +#endif + + } /* end of is */ + + if (incy != 1) { + COPY_K(m, Y, 1, y, incy); + } + + return 0; +} + diff --git a/kernel/generic/zlaswp_ncopy_1.c b/kernel/generic/zlaswp_ncopy_1.c new file mode 100644 index 0000000000..acbda68fdf --- /dev/null +++ b/kernel/generic/zlaswp_ncopy_1.c @@ -0,0 +1,186 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
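zhemv_k.c above appears to accumulate alpha*A*x into y for a Hermitian A by marching over SYMV_P-wide panels: the square diagonal block is expanded by the ZHEMCOPY_* kernels into symbuffer and multiplied with GEMV_N, while the rectangular off-diagonal strips go through GEMV_C/GEMV_N (or GEMV_T/GEMV_R in the HEMVREV configuration). The workspace carving at the top repeatedly rounds a pointer up to the next 4 KiB boundary with the (addr + 4095) & ~4095 idiom; a hedged restatement of that helper (align_up_4k is an invented name, not part of the library):

#include <stdint.h>
#include <stddef.h>

/* Round the address p + used_bytes up to the next 4096-byte boundary,
   mirroring the (x + 4095) & ~4095 expressions in the driver above. */
static void *align_up_4k(void *p, size_t used_bytes) {
  uintptr_t addr = (uintptr_t)p + used_bytes;
  return (void *)((addr + 4095u) & ~(uintptr_t)4095u);
}

In the driver the packing buffer for the diagonal block comes first, and page-aligned working copies of y and x are carved out after it only when incy or incx is not 1.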
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#define a2 (a1 + 2) + +int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *ipiv, FLOAT *buffer){ + + BLASLONG i, j, ip1, ip2; + blasint *piv; + FLOAT *a1; + FLOAT *b1, *b2; + FLOAT A1, A2, A3, A4; + FLOAT B1, B2, B3, B4; + + a -= 2; + lda *= 2; + k1 --; + + ipiv += k1; + + if (n <= 0) return 0; + + j = n; + do { + piv = ipiv; + + a1 = a + (k1 + 1) * 2; + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + b1 = a + ip1; + b2 = a + ip2; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + if (b1 == a1) { + if (b2 == a2) { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + *(buffer + 2) = A3; + *(buffer + 3) = A4; + } else { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + *(buffer + 2) = B3; + *(buffer + 3) = B4; + + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } else + if (b1 == a2) { + if (b2 == a2) { + *(buffer + 0) = A3; + *(buffer + 1) = A4; + *(buffer + 2) = A1; + *(buffer + 3) = A2; + } else { + *(buffer + 0) = A3; + *(buffer + 1) = A4; + *(buffer + 2) = B3; + *(buffer + 3) = B4; + *(b2 + 0) = A1; + *(b2 + 1) = A2; + } + } else { + if (b2 == a2) { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = A3; + *(buffer + 3) = A4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } else + if (b2 == b1) { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = A1; + *(buffer + 3) = A2; + *(b1 + 0) = A3; + *(b1 + 1) = A4; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = B3; + *(buffer + 3) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } + + buffer += 4; + + b1 = a + ip1; + b2 = a + ip2; + + a1 += 4; + + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + + if (a1 == b1) { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } + buffer += 2; + } + + a += lda; + j --; + } while (j > 0); + + return 0; +} + diff --git a/kernel/generic/zlaswp_ncopy_2.c b/kernel/generic/zlaswp_ncopy_2.c new file mode 100644 index 0000000000..7fa56be21c --- /dev/null +++ b/kernel/generic/zlaswp_ncopy_2.c @@ -0,0 +1,381 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
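zlaswp_ncopy_1 above applies LAPACK-style row interchanges (ipiv holds 1-based indices for rows k1..k2) to a complex panel while packing the interchanged rows of each column into a contiguous buffer, two rows per iteration with explicit handling of the cases where a pivot row coincides with a row in flight. A hedged, unblocked reference for the same operation follows; the name is invented, double stands in for FLOAT, int for blasint, and it swaps in place first and copies afterwards, whereas the kernel fuses the two steps and may leave the packed rows of a stale.

static void zlaswp_ncopy_ref(long n, long k1, long k2,
                             double *a, long lda,
                             const int *ipiv, double *buffer) {
  for (long j = 0; j < n; j++) {
    double *col = a + 2 * j * lda;             /* column j, complex interleaved */
    for (long i = k1 - 1; i <= k2 - 1; i++) {  /* rows k1..k2, converted to 0-based */
      long ip = (long)ipiv[i] - 1;             /* row to exchange with */
      if (ip != i) {
        double tr = col[2 * i],     ti = col[2 * i + 1];
        col[2 * i]      = col[2 * ip];
        col[2 * i + 1]  = col[2 * ip + 1];
        col[2 * ip]     = tr;
        col[2 * ip + 1] = ti;
      }
      *buffer++ = col[2 * i];                  /* pack the post-interchange row */
      *buffer++ = col[2 * i + 1];
    }
  }
}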
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#define a2 (a1 + 2) +#define a4 (a3 + 2) + +int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *ipiv, FLOAT *buffer){ + + BLASLONG i, j, ip1, ip2; + blasint *piv; + FLOAT *a1, *a3; + FLOAT *b1, *b2, *b3, *b4; + FLOAT A1, A2, A3, A4; + FLOAT A5, A6, A7, A8; + FLOAT B1, B2, B3, B4; + FLOAT B5, B6, B7, B8; + + a -= 2; + lda *= 2; + k1 --; + + ipiv += k1; + + if (n <= 0) return 0; + + j = (n >> 1); + if (j > 0) { + do { + piv = ipiv; + + a1 = a + (k1 + 1) * 2; + a3 = a1 + lda; + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + lda; + b4 = b2 + lda; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + A5 = *(a3 + 0); + A6 = *(a3 + 1); + A7 = *(a4 + 0); + A8 = *(a4 + 1); + + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + B5 = *(b3 + 0); + B6 = *(b3 + 1); + B7 = *(b4 + 0); + B8 = *(b4 + 1); + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + if (b1 == a1) { + if (b2 == a2) { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + *(buffer + 2) = A5; + *(buffer + 3) = A6; + *(buffer + 4) = A3; + *(buffer + 5) = A4; + *(buffer + 6) = A7; + *(buffer + 7) = A8; + } else { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + *(buffer + 2) = A5; + *(buffer + 3) = A6; + *(buffer + 4) = B3; + *(buffer + 5) = B4; + *(buffer + 6) = B7; + *(buffer + 7) = B8; + + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(b4 + 0) = A7; + *(b4 + 1) = A8; + } + } else + if (b1 == a2) { + if (b2 == a2) { + *(buffer + 0) = A3; + *(buffer + 1) = A4; + *(buffer + 2) = A7; + *(buffer + 3) = A8; + *(buffer + 4) = A1; + *(buffer + 5) = A2; + *(buffer + 6) = A5; + *(buffer + 7) = A6; + } else { + *(buffer + 0) = A3; + *(buffer + 1) = A4; + *(buffer + 2) = A7; + *(buffer + 3) = A8; + *(buffer + 4) = B3; + *(buffer + 5) = B4; + *(buffer + 6) = B7; + *(buffer + 7) = B8; + + *(b2 + 0) = A1; + *(b2 + 1) = A2; + *(b4 + 0) = A5; + *(b4 + 1) = A6; + } + } else { + if (b2 == a2) { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = B5; + *(buffer + 3) = B6; + *(buffer + 4) = A3; + *(buffer + 5) = A4; + *(buffer + 6) = A7; + *(buffer + 7) = A8; + + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + } else + if (b2 == b1) { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = B5; + *(buffer + 3) = B6; + *(buffer + 4) = A1; + *(buffer + 5) = A2; + *(buffer + 6) = A5; + *(buffer + 7) = A6; + + *(b1 + 0) = A3; + *(b1 + 1) = A4; + *(b3 + 0) = A7; + *(b3 + 1) = A8; + } else { + *(buffer + 0) 
= B1; + *(buffer + 1) = B2; + *(buffer + 2) = B5; + *(buffer + 3) = B6; + *(buffer + 4) = B3; + *(buffer + 5) = B4; + *(buffer + 6) = B7; + *(buffer + 7) = B8; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + *(b4 + 0) = A7; + *(b4 + 1) = A8; + } + } + + buffer += 8; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + lda; + b4 = b2 + lda; + + a1 += 4; + a3 += 4; + + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + A3 = *(a3 + 0); + A4 = *(a3 + 1); + B3 = *(b3 + 0); + B4 = *(b3 + 1); + + if (a1 == b1) { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + *(buffer + 2) = A3; + *(buffer + 3) = A4; + + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = B3; + *(buffer + 3) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b3 + 0) = A3; + *(b3 + 1) = A4; + } + buffer += 4; + } + + a += 2 * lda; + j --; + } while (j > 0); + } + + if (n & 1) { + piv = ipiv; + + a1 = a + (k1 + 1) * 2; + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + b1 = a + ip1; + b2 = a + ip2; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + if (b1 == a1) { + if (b2 == a2) { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + *(buffer + 2) = A3; + *(buffer + 3) = A4; + } else { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + *(buffer + 2) = B3; + *(buffer + 3) = B4; + + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } else + if (b1 == a2) { + if (b2 == a2) { + *(buffer + 0) = A3; + *(buffer + 1) = A4; + *(buffer + 2) = A1; + *(buffer + 3) = A2; + } else { + *(buffer + 0) = A3; + *(buffer + 1) = A4; + *(buffer + 2) = B3; + *(buffer + 3) = B4; + *(b2 + 0) = A1; + *(b2 + 1) = A2; + } + } else { + if (b2 == a2) { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = A3; + *(buffer + 3) = A4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } else + if (b2 == b1) { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = A1; + *(buffer + 3) = A2; + *(b1 + 0) = A3; + *(b1 + 1) = A4; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = B3; + *(buffer + 3) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } + + buffer += 4; + + b1 = a + ip1; + b2 = a + ip2; + + a1 += 4; + + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + + if (a1 == b1) { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } + buffer += 2; + } + } + + return 0; +} + diff --git a/kernel/generic/zlaswp_ncopy_4.c b/kernel/generic/zlaswp_ncopy_4.c new file mode 100644 index 0000000000..c9c44fcab8 --- /dev/null +++ b/kernel/generic/zlaswp_ncopy_4.c @@ -0,0 +1,711 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#define a2 (a1 + 2) +#define a4 (a3 + 2) +#define a6 (a5 + 2) +#define a8 (a7 + 2) + +int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *ipiv, FLOAT *buffer){ + + BLASLONG i, j, ip1, ip2; + blasint *piv; + FLOAT *a1, *a3, *a5, *a7; + FLOAT *b1, *b2, *b3, *b4; + FLOAT *b5, *b6, *b7, *b8; + FLOAT A1, A2, A3, A4, A5, A6, A7, A8; + FLOAT B1, B2, B3, B4, B5, B6, B7, B8; + + FLOAT A9, A10, A11, A12, A13, A14, A15, A16; + FLOAT B9, B10, B11, B12, B13, B14, B15, B16; + + a -= 2; + lda *= 2; + k1 --; + + ipiv += k1; + + if (n <= 0) return 0; + + j = (n >> 2); + if (j > 0) { + do { + piv = ipiv; + + a1 = a + (k1 + 1) * 2; + + a3 = a1 + 1 * lda; + a5 = a1 + 2 * lda; + a7 = a1 + 3 * lda; + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *(a1 + 0); + A9 = *(a1 + 1); + A2 = *(a2 + 0); + A10 = *(a2 + 1); + A3 = *(a3 + 0); + A11 = *(a3 + 1); + A4 = *(a4 + 0); + A12 = *(a4 + 1); + A5 = *(a5 + 0); + A13 = *(a5 + 1); + A6 = *(a6 + 0); + A14 = *(a6 + 1); + A7 = *(a7 + 0); + A15 = *(a7 + 1); + A8 = *(a8 + 0); + A16 = *(a8 + 1); + + B1 = *(b1 + 0); + B9 = *(b1 + 1); + B2 = *(b2 + 0); + B10 = *(b2 + 1); + B3 = *(b3 + 0); + B11 = *(b3 + 1); + B4 = *(b4 + 0); + B12 = *(b4 + 1); + B5 = *(b5 + 0); + B13 = *(b5 + 1); + B6 = *(b6 + 0); + B14 = *(b6 + 1); + B7 = *(b7 + 0); + B15 = *(b7 + 1); + B8 = *(b8 + 0); + B16 = *(b8 + 1); + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + if (b1 == a1) { + if (b2 == a2) { + *(buffer + 0) = A1; + *(buffer + 1) = A9; + *(buffer + 2) = A3; + *(buffer + 3) = A11; + *(buffer + 4) = A5; + *(buffer + 5) = A13; + *(buffer + 6) = A7; + *(buffer + 7) = A15; + + *(buffer + 8) = A2; + *(buffer + 9) = A10; + *(buffer + 10) = A4; + *(buffer + 11) = A12; + *(buffer + 12) = A6; + *(buffer + 13) = A14; + *(buffer + 14) = A8; + *(buffer + 15) = A16; + } else { + *(buffer + 0) = A1; + 
*(buffer + 1) = A9; + *(buffer + 2) = A3; + *(buffer + 3) = A11; + *(buffer + 4) = A5; + *(buffer + 5) = A13; + *(buffer + 6) = A7; + *(buffer + 7) = A15; + + *(buffer + 8) = B2; + *(buffer + 9) = B10; + *(buffer + 10) = B4; + *(buffer + 11) = B12; + *(buffer + 12) = B6; + *(buffer + 13) = B14; + *(buffer + 14) = B8; + *(buffer + 15) = B16; + + *(b2 + 0) = A2; + *(b2 + 1) = A10; + *(b4 + 0) = A4; + *(b4 + 1) = A12; + *(b6 + 0) = A6; + *(b6 + 1) = A14; + *(b8 + 0) = A8; + *(b8 + 1) = A16; + } + } else + if (b1 == a2) { + if (b2 == a2) { + *(buffer + 0) = A2; + *(buffer + 1) = A10; + *(buffer + 2) = A4; + *(buffer + 3) = A12; + *(buffer + 4) = A6; + *(buffer + 5) = A14; + *(buffer + 6) = A8; + *(buffer + 7) = A16; + *(buffer + 8) = A1; + *(buffer + 9) = A9; + *(buffer + 10) = A3; + *(buffer + 11) = A11; + *(buffer + 12) = A5; + *(buffer + 13) = A13; + *(buffer + 14) = A7; + *(buffer + 15) = A15; + + } else { + *(buffer + 0) = A2; + *(buffer + 1) = A10; + *(buffer + 2) = A4; + *(buffer + 3) = A12; + *(buffer + 4) = A6; + *(buffer + 5) = A14; + *(buffer + 6) = A8; + *(buffer + 7) = A16; + *(buffer + 8) = B2; + *(buffer + 9) = B10; + *(buffer + 10) = B4; + *(buffer + 11) = B12; + *(buffer + 12) = B6; + *(buffer + 13) = B14; + *(buffer + 14) = B8; + *(buffer + 15) = B16; + + *(b2 + 0) = A1; + *(b2 + 1) = A9; + *(b4 + 0) = A3; + *(b4 + 1) = A11; + *(b6 + 0) = A5; + *(b6 + 1) = A13; + *(b8 + 0) = A7; + *(b8 + 1) = A15; + } + } else { + if (b2 == a2) { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + *(buffer + 8) = A2; + *(buffer + 9) = A10; + *(buffer + 10) = A4; + *(buffer + 11) = A12; + *(buffer + 12) = A6; + *(buffer + 13) = A14; + *(buffer + 14) = A8; + *(buffer + 15) = A16; + + *(b1 + 0) = A1; + *(b1 + 1) = A9; + *(b3 + 0) = A3; + *(b3 + 1) = A11; + *(b5 + 0) = A5; + *(b5 + 1) = A13; + *(b7 + 0) = A7; + *(b7 + 1) = A15; + } else + if (b2 == b1) { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + *(buffer + 8) = A1; + *(buffer + 9) = A9; + *(buffer + 10) = A3; + *(buffer + 11) = A11; + *(buffer + 12) = A5; + *(buffer + 13) = A13; + *(buffer + 14) = A7; + *(buffer + 15) = A15; + + *(b1 + 0) = A2; + *(b1 + 1) = A10; + *(b3 + 0) = A4; + *(b3 + 1) = A12; + *(b5 + 0) = A6; + *(b5 + 1) = A14; + *(b7 + 0) = A8; + *(b7 + 1) = A16; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + *(buffer + 8) = B2; + *(buffer + 9) = B10; + *(buffer + 10) = B4; + *(buffer + 11) = B12; + *(buffer + 12) = B6; + *(buffer + 13) = B14; + *(buffer + 14) = B8; + *(buffer + 15) = B16; + + *(b1 + 0) = A1; + *(b1 + 1) = A9; + *(b2 + 0) = A2; + *(b2 + 1) = A10; + *(b3 + 0) = A3; + *(b3 + 1) = A11; + *(b4 + 0) = A4; + *(b4 + 1) = A12; + *(b5 + 0) = A5; + *(b5 + 1) = A13; + *(b6 + 0) = A6; + *(b6 + 1) = A14; + *(b7 + 0) = A7; + *(b7 + 1) = A15; + *(b8 + 0) = A8; + *(b8 + 1) = A16; + } + } + + buffer += 16; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; + + a1 += 4; + a3 += 4; + a5 += 4; + a7 += 4; + + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *(a1 + 0); + A9 = *(a1 + 1); + B1 = *(b1 + 
0); + B9 = *(b1 + 1); + A3 = *(a3 + 0); + A11 = *(a3 + 1); + B3 = *(b3 + 0); + B11 = *(b3 + 1); + A5 = *(a5 + 0); + A13 = *(a5 + 1); + B5 = *(b5 + 0); + B13 = *(b5 + 1); + A7 = *(a7 + 0); + A15 = *(a7 + 1); + B7 = *(b7 + 0); + B15 = *(b7 + 1); + + if (a1 == b1) { + *(buffer + 0) = A1; + *(buffer + 1) = A9; + *(buffer + 2) = A3; + *(buffer + 3) = A11; + *(buffer + 4) = A5; + *(buffer + 5) = A13; + *(buffer + 6) = A7; + *(buffer + 7) = A15; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B9; + *(buffer + 2) = B3; + *(buffer + 3) = B11; + *(buffer + 4) = B5; + *(buffer + 5) = B13; + *(buffer + 6) = B7; + *(buffer + 7) = B15; + + *(b1 + 0) = A1; + *(b1 + 1) = A9; + *(b3 + 0) = A3; + *(b3 + 1) = A11; + *(b5 + 0) = A5; + *(b5 + 1) = A13; + *(b7 + 0) = A7; + *(b7 + 1) = A15; + } + buffer += 8; + } + + a += 4 * lda; + + j --; + } while (j > 0); + } + + if (n & 2) { + piv = ipiv; + + a1 = a + (k1 + 1) * 2; + a3 = a1 + lda; + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + lda; + b4 = b2 + lda; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + A5 = *(a3 + 0); + A6 = *(a3 + 1); + A7 = *(a4 + 0); + A8 = *(a4 + 1); + + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + B5 = *(b3 + 0); + B6 = *(b3 + 1); + B7 = *(b4 + 0); + B8 = *(b4 + 1); + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + if (b1 == a1) { + if (b2 == a2) { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + *(buffer + 2) = A5; + *(buffer + 3) = A6; + *(buffer + 4) = A3; + *(buffer + 5) = A4; + *(buffer + 6) = A7; + *(buffer + 7) = A8; + } else { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + *(buffer + 2) = A5; + *(buffer + 3) = A6; + *(buffer + 4) = B3; + *(buffer + 5) = B4; + *(buffer + 6) = B7; + *(buffer + 7) = B8; + + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(b4 + 0) = A7; + *(b4 + 1) = A8; + } + } else + if (b1 == a2) { + if (b2 == a2) { + *(buffer + 0) = A3; + *(buffer + 1) = A4; + *(buffer + 2) = A7; + *(buffer + 3) = A8; + *(buffer + 4) = A1; + *(buffer + 5) = A2; + *(buffer + 6) = A5; + *(buffer + 7) = A6; + } else { + *(buffer + 0) = A3; + *(buffer + 1) = A4; + *(buffer + 2) = A7; + *(buffer + 3) = A8; + *(buffer + 4) = B3; + *(buffer + 5) = B4; + *(buffer + 6) = B7; + *(buffer + 7) = B8; + + *(b2 + 0) = A1; + *(b2 + 1) = A2; + *(b4 + 0) = A5; + *(b4 + 1) = A6; + } + } else { + if (b2 == a2) { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = B5; + *(buffer + 3) = B6; + *(buffer + 4) = A3; + *(buffer + 5) = A4; + *(buffer + 6) = A7; + *(buffer + 7) = A8; + + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + } else + if (b2 == b1) { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = B5; + *(buffer + 3) = B6; + *(buffer + 4) = A1; + *(buffer + 5) = A2; + *(buffer + 6) = A5; + *(buffer + 7) = A6; + + *(b1 + 0) = A3; + *(b1 + 1) = A4; + *(b3 + 0) = A7; + *(b3 + 1) = A8; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = B5; + *(buffer + 3) = B6; + *(buffer + 4) = B3; + *(buffer + 5) = B4; + *(buffer + 6) = B7; + *(buffer + 7) = B8; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + *(b4 + 0) = A7; + *(b4 + 1) = A8; + } + } + + buffer += 8; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + lda; + b4 = b2 + lda; + + a1 += 4; + a3 += 4; + + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + B1 = 
*(b1 + 0); + B2 = *(b1 + 1); + A3 = *(a3 + 0); + A4 = *(a3 + 1); + B3 = *(b3 + 0); + B4 = *(b3 + 1); + + if (a1 == b1) { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + *(buffer + 2) = A3; + *(buffer + 3) = A4; + + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = B3; + *(buffer + 3) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b3 + 0) = A3; + *(b3 + 1) = A4; + } + buffer += 4; + } + + a += 2 * lda; + } + + if (n & 1) { + piv = ipiv; + + a1 = a + (k1 + 1) * 2; + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + b1 = a + ip1; + b2 = a + ip2; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + + ip1 = *(piv + 0) * 2; + ip2 = *(piv + 1) * 2; + piv += 2; + + if (b1 == a1) { + if (b2 == a2) { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + *(buffer + 2) = A3; + *(buffer + 3) = A4; + } else { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + *(buffer + 2) = B3; + *(buffer + 3) = B4; + + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } else + if (b1 == a2) { + if (b2 == a2) { + *(buffer + 0) = A3; + *(buffer + 1) = A4; + *(buffer + 2) = A1; + *(buffer + 3) = A2; + } else { + *(buffer + 0) = A3; + *(buffer + 1) = A4; + *(buffer + 2) = B3; + *(buffer + 3) = B4; + *(b2 + 0) = A1; + *(b2 + 1) = A2; + } + } else { + if (b2 == a2) { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = A3; + *(buffer + 3) = A4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } else + if (b2 == b1) { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = A1; + *(buffer + 3) = A2; + *(b1 + 0) = A3; + *(b1 + 1) = A4; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(buffer + 2) = B3; + *(buffer + 3) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } + + buffer += 4; + + b1 = a + ip1; + b2 = a + ip2; + + a1 += 4; + + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + + if (a1 == b1) { + *(buffer + 0) = A1; + *(buffer + 1) = A2; + } else { + *(buffer + 0) = B1; + *(buffer + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } + buffer += 2; + } + } + + return 0; +} + diff --git a/kernel/generic/zneg_tcopy_1.c b/kernel/generic/zneg_tcopy_1.c new file mode 100644 index 0000000000..3701c9cffd --- /dev/null +++ b/kernel/generic/zneg_tcopy_1.c @@ -0,0 +1,121 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
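The long if/else chains in zlaswp_ncopy_2 and zlaswp_ncopy_4 above exist because two interchanges are applied per iteration and the second pivot row may be the row that the first interchange just modified, or a row of the pair itself; the kernels therefore load every operand into locals up front and enumerate the pointer-equality cases instead of re-reading memory. For comparison, a plain sequential version of one such step (hedged sketch with an invented name, operating on a single complex column; the kernels produce the same packed values without necessarily writing the pair rows back):

static void laswp_pair_seq(double *col, long i, long ip1, long ip2,
                           double *out /* receives 2 packed complex rows */) {
  /* first interchange: row i <-> row ip1 */
  double t0 = col[2 * i],       t1 = col[2 * i + 1];
  col[2 * i]       = col[2 * ip1];
  col[2 * i + 1]   = col[2 * ip1 + 1];
  col[2 * ip1]     = t0;
  col[2 * ip1 + 1] = t1;
  /* second interchange: row i+1 <-> row ip2 (ip2 may equal ip1) */
  t0 = col[2 * (i + 1)];  t1 = col[2 * (i + 1) + 1];
  col[2 * (i + 1)]     = col[2 * ip2];
  col[2 * (i + 1) + 1] = col[2 * ip2 + 1];
  col[2 * ip2]         = t0;
  col[2 * ip2 + 1]     = t1;
  /* pack the two post-interchange rows, as the kernels do into `buffer` */
  out[0] = col[2 * i];        out[1] = col[2 * i + 1];
  out[2] = col[2 * (i + 1)];  out[3] = col[2 * (i + 1) + 1];
}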
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + + BLASLONG i, j; + FLOAT *a_offset; + FLOAT *b_offset, *b_offset1; + FLOAT ctemp1, ctemp2, ctemp3, ctemp4; + FLOAT ctemp5, ctemp6, ctemp7, ctemp8; + + a_offset = a; + b_offset = b; + + lda *= 2; + + j = m; + + m *= 2; + + if (j > 0){ + do { + b_offset1 = b_offset; + b_offset += 2; + + i = (n >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset + 0); + ctemp2 = *(a_offset + 1); + ctemp3 = *(a_offset + 2); + ctemp4 = *(a_offset + 3); + + ctemp5 = *(a_offset + 4); + ctemp6 = *(a_offset + 5); + ctemp7 = *(a_offset + 6); + ctemp8 = *(a_offset + 7); + + *(b_offset1 + 0) = -ctemp1; + *(b_offset1 + 1) = -ctemp2; + + b_offset1 += m; + + *(b_offset1 + 0) = -ctemp3; + *(b_offset1 + 1) = -ctemp4; + + b_offset1 += m; + + *(b_offset1 + 0) = -ctemp5; + *(b_offset1 + 1) = -ctemp6; + + b_offset1 += m; + + *(b_offset1 + 0) = -ctemp7; + *(b_offset1 + 1) = -ctemp8; + + b_offset1 += m; + a_offset += 8; + i --; + } while(i>0); + } + + i = (n & 3); + if (i > 0){ + do { + ctemp1 = *(a_offset + 0); + ctemp2 = *(a_offset + 1); + + *(b_offset1 + 0) = -ctemp1; + *(b_offset1 + 1) = -ctemp2; + + b_offset1 += m; + a_offset += 2; + i --; + } while(i > 0); + } + a_offset += lda - n * 2; + j --; + } while (j > 0); + } + + return 0; +} diff --git a/kernel/generic/zneg_tcopy_2.c b/kernel/generic/zneg_tcopy_2.c new file mode 100644 index 0000000000..40dd1151d5 --- /dev/null +++ b/kernel/generic/zneg_tcopy_2.c @@ -0,0 +1,220 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + + BLASLONG i, j; + FLOAT *a_offset, *a_offset1, *a_offset2; + FLOAT *b_offset, *b_offset1, *b_offset2; + FLOAT ctemp1, ctemp2, ctemp3, ctemp4; + FLOAT ctemp5, ctemp6, ctemp7, ctemp8; + FLOAT ctemp9, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + + a_offset = a; + b_offset = b; + + b_offset2 = b + m * (n & ~1) * 2; + + lda *= 2; + + j = (m >> 1); + if (j > 0){ + do{ + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + b_offset1 = b_offset; + b_offset += 8; + + i = (n >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + ctemp5 = *(a_offset1 + 4); + ctemp6 = *(a_offset1 + 5); + ctemp7 = *(a_offset1 + 6); + ctemp8 = *(a_offset1 + 7); + + ctemp9 = *(a_offset2 + 0); + ctemp10 = *(a_offset2 + 1); + ctemp11 = *(a_offset2 + 2); + ctemp12 = *(a_offset2 + 3); + + ctemp13 = *(a_offset2 + 4); + ctemp14 = *(a_offset2 + 5); + ctemp15 = *(a_offset2 + 6); + ctemp16 = *(a_offset2 + 7); + + *(b_offset1 + 0) = -ctemp1; + *(b_offset1 + 1) = -ctemp2; + *(b_offset1 + 2) = -ctemp3; + *(b_offset1 + 3) = -ctemp4; + + *(b_offset1 + 4) = -ctemp9; + *(b_offset1 + 5) = -ctemp10; + *(b_offset1 + 6) = -ctemp11; + *(b_offset1 + 7) = -ctemp12; + + b_offset1 += m * 4; + + *(b_offset1 + 0) = -ctemp5; + *(b_offset1 + 1) = -ctemp6; + *(b_offset1 + 2) = -ctemp7; + *(b_offset1 + 3) = -ctemp8; + + *(b_offset1 + 4) = -ctemp13; + *(b_offset1 + 5) = -ctemp14; + *(b_offset1 + 6) = -ctemp15; + *(b_offset1 + 7) = -ctemp16; + + b_offset1 += m * 4; + + a_offset1 += 8; + a_offset2 += 8; + i --; + } while(i>0); + } + + if (n & 2){ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp3 = *(a_offset1 + 2); + ctemp4 = *(a_offset1 + 3); + + ctemp9 = *(a_offset2 + 0); + ctemp10 = *(a_offset2 + 1); + ctemp11 = *(a_offset2 + 2); + ctemp12 = *(a_offset2 + 3); + + *(b_offset1 + 0) = -ctemp1; + *(b_offset1 + 1) = -ctemp2; + *(b_offset1 + 2) = -ctemp3; + *(b_offset1 + 3) = -ctemp4; + + *(b_offset1 + 4) = -ctemp9; + *(b_offset1 + 5) = -ctemp10; + *(b_offset1 + 6) = -ctemp11; + *(b_offset1 + 7) = -ctemp12; + + b_offset1 += m * 4; + a_offset1 += 4; + a_offset2 += 4; + } + + if (n & 1){ + ctemp1 = *(a_offset1 + 0); + ctemp2 = *(a_offset1 + 1); + ctemp9 = *(a_offset2 + 0); + ctemp10 = *(a_offset2 + 1); + + *(b_offset2 + 0) = -ctemp1; + *(b_offset2 + 1) = -ctemp2; + *(b_offset2 + 2) = -ctemp9; + *(b_offset2 + 3) = -ctemp10; + b_offset2 += 4; + } + j--; + } while(j > 0); + } + + if 
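/* remaining single row when m is odd, packed with the same negation */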
(m & 1){ + i = (n >> 2); + if (i > 0){ + do{ + ctemp1 = *(a_offset + 0); + ctemp2 = *(a_offset + 1); + ctemp3 = *(a_offset + 2); + ctemp4 = *(a_offset + 3); + + ctemp5 = *(a_offset + 4); + ctemp6 = *(a_offset + 5); + ctemp7 = *(a_offset + 6); + ctemp8 = *(a_offset + 7); + + *(b_offset + 0) = -ctemp1; + *(b_offset + 1) = -ctemp2; + *(b_offset + 2) = -ctemp3; + *(b_offset + 3) = -ctemp4; + + b_offset += m * 4; + + *(b_offset + 0) = -ctemp5; + *(b_offset + 1) = -ctemp6; + *(b_offset + 2) = -ctemp7; + *(b_offset + 3) = -ctemp8; + + b_offset += m * 4; + a_offset += 8; + i --; + } while(i > 0); + } + + if (n & 2){ + ctemp1 = *(a_offset + 0); + ctemp2 = *(a_offset + 1); + ctemp3 = *(a_offset + 2); + ctemp4 = *(a_offset + 3); + + *(b_offset + 0) = -ctemp1; + *(b_offset + 1) = -ctemp2; + *(b_offset + 2) = -ctemp3; + *(b_offset + 3) = -ctemp4; + + b_offset += m * 4; + a_offset += 4; + } + + if (n & 1){ + ctemp1 = *(a_offset + 0); + ctemp2 = *(a_offset + 1); + *(b_offset2 + 0) = -ctemp1; + *(b_offset2 + 1) = -ctemp2; + } + } + + return 0; +} diff --git a/kernel/generic/zneg_tcopy_4.c b/kernel/generic/zneg_tcopy_4.c new file mode 100644 index 0000000000..7cd9887541 --- /dev/null +++ b/kernel/generic/zneg_tcopy_4.c @@ -0,0 +1,403 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + + BLASLONG i, j; + + FLOAT *aoffset; + FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + FLOAT *boffset, *boffset1, *boffset2, *boffset3; + + FLOAT ctemp01, ctemp02, ctemp03, ctemp04; + FLOAT ctemp05, ctemp06, ctemp07, ctemp08; + FLOAT ctemp09, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + FLOAT ctemp17, ctemp18, ctemp19, ctemp20; + FLOAT ctemp21, ctemp22, ctemp23, ctemp24; + FLOAT ctemp25, ctemp26, ctemp27, ctemp28; + FLOAT ctemp29, ctemp30, ctemp31, ctemp32; + + aoffset = a; + boffset = b; + lda *= 2; + + boffset2 = b + 2 * m * (n & ~3); + boffset3 = b + 2 * m * (n & ~1); + +#if 0 + fprintf(stderr, "m = %d n = %d\n", m,n ); +#endif + + j = (m >> 2); + if (j > 0){ + do{ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset += 4 * lda; + + boffset1 = boffset; + boffset += 32; + + i = (n >> 2); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + + ctemp09 = *(aoffset2 + 0); + ctemp10 = *(aoffset2 + 1); + ctemp11 = *(aoffset2 + 2); + ctemp12 = *(aoffset2 + 3); + ctemp13 = *(aoffset2 + 4); + ctemp14 = *(aoffset2 + 5); + ctemp15 = *(aoffset2 + 6); + ctemp16 = *(aoffset2 + 7); + + ctemp17 = *(aoffset3 + 0); + ctemp18 = *(aoffset3 + 1); + ctemp19 = *(aoffset3 + 2); + ctemp20 = *(aoffset3 + 3); + ctemp21 = *(aoffset3 + 4); + ctemp22 = *(aoffset3 + 5); + ctemp23 = *(aoffset3 + 6); + ctemp24 = *(aoffset3 + 7); + + ctemp25 = *(aoffset4 + 0); + ctemp26 = *(aoffset4 + 1); + ctemp27 = *(aoffset4 + 2); + ctemp28 = *(aoffset4 + 3); + ctemp29 = *(aoffset4 + 4); + ctemp30 = *(aoffset4 + 5); + ctemp31 = *(aoffset4 + 6); + ctemp32 = *(aoffset4 + 7); + + *(boffset1 + 0) = -ctemp01; + *(boffset1 + 1) = -ctemp02; + *(boffset1 + 2) = -ctemp03; + *(boffset1 + 3) = -ctemp04; + *(boffset1 + 4) = -ctemp05; + *(boffset1 + 5) = -ctemp06; + *(boffset1 + 6) = -ctemp07; + *(boffset1 + 7) = -ctemp08; + + *(boffset1 + 8) = -ctemp09; + *(boffset1 + 9) = -ctemp10; + *(boffset1 + 10) = -ctemp11; + *(boffset1 + 11) = -ctemp12; + *(boffset1 + 12) = -ctemp13; + *(boffset1 + 13) = -ctemp14; + *(boffset1 + 14) = -ctemp15; + *(boffset1 + 15) = -ctemp16; + + *(boffset1 + 16) = -ctemp17; + *(boffset1 + 17) = -ctemp18; + *(boffset1 + 18) = -ctemp19; + *(boffset1 + 19) = -ctemp20; + *(boffset1 + 20) = -ctemp21; + *(boffset1 + 21) = -ctemp22; + *(boffset1 + 22) = -ctemp23; + *(boffset1 + 23) = -ctemp24; + + *(boffset1 + 24) = -ctemp25; + *(boffset1 + 25) = -ctemp26; + *(boffset1 + 26) = -ctemp27; + *(boffset1 + 27) = -ctemp28; + *(boffset1 + 28) = -ctemp29; + *(boffset1 + 29) = -ctemp30; + *(boffset1 + 30) = -ctemp31; + *(boffset1 + 31) = -ctemp32; + + aoffset1 += 8; + aoffset2 += 8; + aoffset3 += 8; + aoffset4 += 8; + + boffset1 += m * 8; + i --; + }while(i > 0); + } + + if (n & 2){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + + ctemp09 = *(aoffset3 + 0); + ctemp10 = *(aoffset3 + 1); + ctemp11 = *(aoffset3 + 2); + ctemp12 = *(aoffset3 + 3); + + ctemp13 = *(aoffset4 + 0); + 
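/* n & 2 tail of this four-row panel: the last two complex columns of each row are negated into the boffset2 region */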
ctemp14 = *(aoffset4 + 1); + ctemp15 = *(aoffset4 + 2); + ctemp16 = *(aoffset4 + 3); + + *(boffset2 + 0) = -ctemp01; + *(boffset2 + 1) = -ctemp02; + *(boffset2 + 2) = -ctemp03; + *(boffset2 + 3) = -ctemp04; + *(boffset2 + 4) = -ctemp05; + *(boffset2 + 5) = -ctemp06; + *(boffset2 + 6) = -ctemp07; + *(boffset2 + 7) = -ctemp08; + + *(boffset2 + 8) = -ctemp09; + *(boffset2 + 9) = -ctemp10; + *(boffset2 + 10) = -ctemp11; + *(boffset2 + 11) = -ctemp12; + *(boffset2 + 12) = -ctemp13; + *(boffset2 + 13) = -ctemp14; + *(boffset2 + 14) = -ctemp15; + *(boffset2 + 15) = -ctemp16; + + aoffset1 += 4; + aoffset2 += 4; + aoffset3 += 4; + aoffset4 += 4; + + boffset2 += 16; + } + + if (n & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + ctemp05 = *(aoffset3 + 0); + ctemp06 = *(aoffset3 + 1); + + ctemp07 = *(aoffset4 + 0); + ctemp08 = *(aoffset4 + 1); + + *(boffset3 + 0) = -ctemp01; + *(boffset3 + 1) = -ctemp02; + *(boffset3 + 2) = -ctemp03; + *(boffset3 + 3) = -ctemp04; + *(boffset3 + 4) = -ctemp05; + *(boffset3 + 5) = -ctemp06; + *(boffset3 + 6) = -ctemp07; + *(boffset3 + 7) = -ctemp08; + + aoffset1 += 2; + aoffset2 += 2; + aoffset3 += 2; + aoffset4 += 2; + + boffset3 += 8; + } + j--; + }while(j > 0); + } + + if (m & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset += 2 * lda; + + boffset1 = boffset; + boffset += 16; + + i = (n >> 2); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + + ctemp09 = *(aoffset2 + 0); + ctemp10 = *(aoffset2 + 1); + ctemp11 = *(aoffset2 + 2); + ctemp12 = *(aoffset2 + 3); + ctemp13 = *(aoffset2 + 4); + ctemp14 = *(aoffset2 + 5); + ctemp15 = *(aoffset2 + 6); + ctemp16 = *(aoffset2 + 7); + + *(boffset1 + 0) = -ctemp01; + *(boffset1 + 1) = -ctemp02; + *(boffset1 + 2) = -ctemp03; + *(boffset1 + 3) = -ctemp04; + *(boffset1 + 4) = -ctemp05; + *(boffset1 + 5) = -ctemp06; + *(boffset1 + 6) = -ctemp07; + *(boffset1 + 7) = -ctemp08; + + *(boffset1 + 8) = -ctemp09; + *(boffset1 + 9) = -ctemp10; + *(boffset1 + 10) = -ctemp11; + *(boffset1 + 11) = -ctemp12; + *(boffset1 + 12) = -ctemp13; + *(boffset1 + 13) = -ctemp14; + *(boffset1 + 14) = -ctemp15; + *(boffset1 + 15) = -ctemp16; + + aoffset1 += 8; + aoffset2 += 8; + aoffset3 += 8; + aoffset4 += 8; + + boffset1 += m * 8; + i --; + }while(i > 0); + } + + if (n & 2){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + + *(boffset2 + 0) = -ctemp01; + *(boffset2 + 1) = -ctemp02; + *(boffset2 + 2) = -ctemp03; + *(boffset2 + 3) = -ctemp04; + *(boffset2 + 4) = -ctemp05; + *(boffset2 + 5) = -ctemp06; + *(boffset2 + 6) = -ctemp07; + *(boffset2 + 7) = -ctemp08; + + aoffset1 += 4; + aoffset2 += 4; + + boffset2 += 8; + } + + if (n & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + *(boffset3 + 0) = -ctemp01; + *(boffset3 + 1) = -ctemp02; + *(boffset3 + 2) = -ctemp03; + *(boffset3 + 3) = -ctemp04; + + aoffset1 += 2; + aoffset2 += 2; + boffset3 += 4; + } + } + + if (m & 1){ + aoffset1 = aoffset; + boffset1 = boffset; + + i = (n >> 2); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); 
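/* last row when m is odd: four complex elements are read per iteration and stored negated */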
+ ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + + *(boffset1 + 0) = -ctemp01; + *(boffset1 + 1) = -ctemp02; + *(boffset1 + 2) = -ctemp03; + *(boffset1 + 3) = -ctemp04; + *(boffset1 + 4) = -ctemp05; + *(boffset1 + 5) = -ctemp06; + *(boffset1 + 6) = -ctemp07; + *(boffset1 + 7) = -ctemp08; + + aoffset1 += 8; + boffset1 += m * 8; + i --; + }while(i > 0); + } + + if (n & 2){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + *(boffset2 + 0) = -ctemp01; + *(boffset2 + 1) = -ctemp02; + *(boffset2 + 2) = -ctemp03; + *(boffset2 + 3) = -ctemp04; + + aoffset1 += 4; + boffset2 += 4; + } + + if (n & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + + *(boffset3 + 0) = -ctemp01; + *(boffset3 + 1) = -ctemp02; + } + } + + return 0; +} diff --git a/kernel/generic/zneg_tcopy_8.c b/kernel/generic/zneg_tcopy_8.c new file mode 100644 index 0000000000..fe8f25cbac --- /dev/null +++ b/kernel/generic/zneg_tcopy_8.c @@ -0,0 +1,361 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ + + BLASLONG i, j; + + FLOAT *aoffset; + FLOAT *aoffset1, *aoffset2; + + FLOAT *boffset; + + FLOAT ctemp01, ctemp02, ctemp03, ctemp04; + FLOAT ctemp05, ctemp06, ctemp07, ctemp08; + FLOAT ctemp09, ctemp10, ctemp11, ctemp12; + FLOAT ctemp13, ctemp14, ctemp15, ctemp16; + FLOAT ctemp17, ctemp18, ctemp19, ctemp20; + FLOAT ctemp21, ctemp22, ctemp23, ctemp24; + FLOAT ctemp25, ctemp26, ctemp27, ctemp28; + FLOAT ctemp29, ctemp30, ctemp31, ctemp32; + + aoffset = a; + boffset = b; + lda *= 2; + +#if 0 + fprintf(stderr, "M = %d N = %d\n", m, n); +#endif + + j = (n >> 3); + if (j > 0){ + do{ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 16; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + ctemp09 = *(aoffset1 + 8); + ctemp10 = *(aoffset1 + 9); + ctemp11 = *(aoffset1 + 10); + ctemp12 = *(aoffset1 + 11); + ctemp13 = *(aoffset1 + 12); + ctemp14 = *(aoffset1 + 13); + ctemp15 = *(aoffset1 + 14); + ctemp16 = *(aoffset1 + 15); + + ctemp17 = *(aoffset2 + 0); + ctemp18 = *(aoffset2 + 1); + ctemp19 = *(aoffset2 + 2); + ctemp20 = *(aoffset2 + 3); + ctemp21 = *(aoffset2 + 4); + ctemp22 = *(aoffset2 + 5); + ctemp23 = *(aoffset2 + 6); + ctemp24 = *(aoffset2 + 7); + ctemp25 = *(aoffset2 + 8); + ctemp26 = *(aoffset2 + 9); + ctemp27 = *(aoffset2 + 10); + ctemp28 = *(aoffset2 + 11); + ctemp29 = *(aoffset2 + 12); + ctemp30 = *(aoffset2 + 13); + ctemp31 = *(aoffset2 + 14); + ctemp32 = *(aoffset2 + 15); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + *(boffset + 4) = -ctemp05; + *(boffset + 5) = -ctemp06; + *(boffset + 6) = -ctemp07; + *(boffset + 7) = -ctemp08; + + *(boffset + 8) = -ctemp09; + *(boffset + 9) = -ctemp10; + *(boffset + 10) = -ctemp11; + *(boffset + 11) = -ctemp12; + *(boffset + 12) = -ctemp13; + *(boffset + 13) = -ctemp14; + *(boffset + 14) = -ctemp15; + *(boffset + 15) = -ctemp16; + + *(boffset + 16) = -ctemp17; + *(boffset + 17) = -ctemp18; + *(boffset + 18) = -ctemp19; + *(boffset + 19) = -ctemp20; + *(boffset + 20) = -ctemp21; + *(boffset + 21) = -ctemp22; + *(boffset + 22) = -ctemp23; + *(boffset + 23) = -ctemp24; + + *(boffset + 24) = -ctemp25; + *(boffset + 25) = -ctemp26; + *(boffset + 26) = -ctemp27; + *(boffset + 27) = -ctemp28; + *(boffset + 28) = -ctemp29; + *(boffset + 29) = -ctemp30; + *(boffset + 30) = -ctemp31; + *(boffset + 31) = -ctemp32; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 32; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + ctemp09 = *(aoffset1 + 8); + ctemp10 = *(aoffset1 + 9); + ctemp11 = *(aoffset1 + 10); + ctemp12 = *(aoffset1 + 11); + ctemp13 = *(aoffset1 + 12); + ctemp14 = *(aoffset1 + 13); + ctemp15 = *(aoffset1 + 14); + ctemp16 = *(aoffset1 + 15); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + *(boffset + 4) = -ctemp05; + *(boffset + 5) = -ctemp06; + *(boffset + 6) = 
-ctemp07; + *(boffset + 7) = -ctemp08; + + *(boffset + 8) = -ctemp09; + *(boffset + 9) = -ctemp10; + *(boffset + 10) = -ctemp11; + *(boffset + 11) = -ctemp12; + *(boffset + 12) = -ctemp13; + *(boffset + 13) = -ctemp14; + *(boffset + 14) = -ctemp15; + *(boffset + 15) = -ctemp16; + + boffset += 16; + } + + j--; + }while(j > 0); + } /* end of if(j > 0) */ + + if (n & 4){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 8; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + + ctemp09 = *(aoffset2 + 0); + ctemp10 = *(aoffset2 + 1); + ctemp11 = *(aoffset2 + 2); + ctemp12 = *(aoffset2 + 3); + ctemp13 = *(aoffset2 + 4); + ctemp14 = *(aoffset2 + 5); + ctemp15 = *(aoffset2 + 6); + ctemp16 = *(aoffset2 + 7); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + *(boffset + 4) = -ctemp05; + *(boffset + 5) = -ctemp06; + *(boffset + 6) = -ctemp07; + *(boffset + 7) = -ctemp08; + + *(boffset + 8) = -ctemp09; + *(boffset + 9) = -ctemp10; + *(boffset + 10) = -ctemp11; + *(boffset + 11) = -ctemp12; + *(boffset + 12) = -ctemp13; + *(boffset + 13) = -ctemp14; + *(boffset + 14) = -ctemp15; + *(boffset + 15) = -ctemp16; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 16; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + *(boffset + 4) = -ctemp05; + *(boffset + 5) = -ctemp06; + *(boffset + 6) = -ctemp07; + *(boffset + 7) = -ctemp08; + + boffset += 8; + } + } + + if (n & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 4; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + *(boffset + 4) = -ctemp05; + *(boffset + 5) = -ctemp06; + *(boffset + 6) = -ctemp07; + *(boffset + 7) = -ctemp08; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 8; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + + boffset += 4; + } + } + + if (n & 1){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 2; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = -ctemp02; + *(boffset + 2) = -ctemp03; + *(boffset + 3) = -ctemp04; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 4; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + + *(boffset + 0) = -ctemp01; + *(boffset + 1) = 
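/* single remaining complex element when both m and n are odd */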
-ctemp02; + boffset += 2; + } + } + + return 0; +} diff --git a/kernel/generic/zsymm3m_lcopy_1.c b/kernel/generic/zsymm3m_lcopy_1.c new file mode 100644 index 0000000000..0e0d5a3e3b --- /dev/null +++ b/kernel/generic/zsymm3m_lcopy_1.c @@ -0,0 +1,99 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef USE_ALPHA +#define REAL_PART(a, b) (a) +#define IMAGE_PART(a, b) (b) +#else +#define REAL_PART(a, b) (alpha_r * (a) - alpha_i * (b)) +#define IMAGE_PART(a, b) (alpha_i * (a) + alpha_r * (b)) +#endif + +#if defined(REAL_ONLY) +#define CMULT(a, b) (REAL_PART(a, b)) +#elif defined(IMAGE_ONLY) +#define CMULT(a, b) (IMAGE_PART(a, b)) +#else +#define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, +#ifdef USE_ALPHA + FLOAT alpha_r, FLOAT alpha_i, +#endif + FLOAT *b){ + + lda *= 2; + + BLASLONG i, js, offset; + + FLOAT data01; + FLOAT *ao1; + + js = n; + + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + + i = m; + + while (i > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + + if (offset > 0) ao1 += lda; else ao1 += 2; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + + posX ++; + js --; + } + + return 0; +} diff --git a/kernel/generic/zsymm3m_lcopy_2.c b/kernel/generic/zsymm3m_lcopy_2.c new file mode 100644 index 0000000000..96686c1e4a --- /dev/null +++ b/kernel/generic/zsymm3m_lcopy_2.c @@ -0,0 +1,124 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef USE_ALPHA +#define REAL_PART(a, b) (a) +#define IMAGE_PART(a, b) (b) +#else +#define REAL_PART(a, b) (alpha_r * (a) - alpha_i * (b)) +#define IMAGE_PART(a, b) (alpha_i * (a) + alpha_r * (b)) +#endif + +#if defined(REAL_ONLY) +#define CMULT(a, b) (REAL_PART(a, b)) +#elif defined(IMAGE_ONLY) +#define CMULT(a, b) (IMAGE_PART(a, b)) +#else +#define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, +#ifdef USE_ALPHA + FLOAT alpha_r, FLOAT alpha_i, +#endif + FLOAT *b){ + + lda *= 2; + + BLASLONG i, js, offset; + + FLOAT data01, data02; + FLOAT *ao1, *ao2; + + js = (n >> 1); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + + i = m; + + while (i > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + posX += 2; + js --; + } + + if (n & 1) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + + i = m; + + while (i > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + + if (offset > 0) ao1 += lda; else ao1 += 2; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + } + + return 0; +} diff --git a/kernel/generic/zsymm3m_lcopy_4.c b/kernel/generic/zsymm3m_lcopy_4.c new file mode 100644 index 0000000000..38a58cfccb --- /dev/null +++ b/kernel/generic/zsymm3m_lcopy_4.c @@ -0,0 +1,157 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef USE_ALPHA +#define REAL_PART(a, b) (a) +#define IMAGE_PART(a, b) (b) +#else +#define REAL_PART(a, b) (alpha_r * (a) - alpha_i * (b)) +#define IMAGE_PART(a, b) (alpha_i * (a) + alpha_r * (b)) +#endif + +#if defined(REAL_ONLY) +#define CMULT(a, b) (REAL_PART(a, b)) +#elif defined(IMAGE_ONLY) +#define CMULT(a, b) (IMAGE_PART(a, b)) +#else +#define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, +#ifdef USE_ALPHA + FLOAT alpha_r, FLOAT alpha_i, +#endif + FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04; + FLOAT *ao1, *ao2, *ao3, *ao4; + + lda *= 2; + + js = (n >> 2); + while (js > 0){ + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; + if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; + + i = m; + + while (i > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + if (offset > -2) ao3 += lda; else ao3 += 2; + if (offset > -3) ao4 += lda; else ao4 += 2; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; + i --; + } + + posX += 4; + js --; + } + + if (n & 2) { + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + + i = m; + + while (i > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + posX += 2; + } + + if (n & 1) { + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + + i = m; + + while (i > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + + if (offset > 0) ao1 += lda; else ao1 += 2; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + } + + return 0; +} diff --git a/kernel/generic/zsymm3m_lcopy_8.c b/kernel/generic/zsymm3m_lcopy_8.c new file mode 100644 index 0000000000..4e5cddcdab --- /dev/null +++ b/kernel/generic/zsymm3m_lcopy_8.c @@ -0,0 +1,209 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef USE_ALPHA +#define REAL_PART(a, b) (a) +#define IMAGE_PART(a, b) (b) +#else +#define REAL_PART(a, b) (alpha_r * (a) - alpha_i * (b)) +#define IMAGE_PART(a, b) (alpha_i * (a) + alpha_r * (b)) +#endif + +#if defined(REAL_ONLY) +#define CMULT(a, b) (REAL_PART(a, b)) +#elif defined(IMAGE_ONLY) +#define CMULT(a, b) (IMAGE_PART(a, b)) +#else +#define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, +#ifdef USE_ALPHA + FLOAT alpha_r, FLOAT alpha_i, +#endif + FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; + + lda *= 2; + + js = (n >> 3); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; + if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; + if (offset > -4) ao5 = a + (posX + 4) * 2 + posY * lda; else ao5 = a + posY * 2 + (posX + 4) * lda; + if (offset > -5) ao6 = a + (posX + 5) * 2 + posY * lda; else ao6 = a + posY * 2 + (posX + 5) * lda; + if (offset > -6) ao7 = a + (posX + 6) * 2 + posY * lda; else ao7 = a + posY * 2 + (posX + 6) * lda; + if (offset > -7) ao8 = a + (posX + 7) * 2 + posY * lda; else ao8 = a + posY * 2 + (posX + 7) * lda; + + i = m; + + while (i > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); + data05 = CMULT(*(ao5 + 0), *(ao5 + 1)); + data06 = CMULT(*(ao6 + 0), *(ao6 + 1)); + data07 = 
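/* CMULT selects the real part, the imaginary part, or their sum (alpha-scaled when USE_ALPHA is set) for the 3m packing */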
CMULT(*(ao7 + 0), *(ao7 + 1)); + data08 = CMULT(*(ao8 + 0), *(ao8 + 1)); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + if (offset > -2) ao3 += lda; else ao3 += 2; + if (offset > -3) ao4 += lda; else ao4 += 2; + if (offset > -4) ao5 += lda; else ao5 += 2; + if (offset > -5) ao6 += lda; else ao6 += 2; + if (offset > -6) ao7 += lda; else ao7 += 2; + if (offset > -7) ao8 += lda; else ao8 += 2; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b += 8; + + offset --; + i --; + } + + posX += 8; + js --; + } + + if (n & 4) { + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; + if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; + + i = m; + + while (i > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + if (offset > -2) ao3 += lda; else ao3 += 2; + if (offset > -3) ao4 += lda; else ao4 += 2; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; + i --; + } + + posX += 4; + } + + if (n & 2) { + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + + i = m; + + while (i > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + posX += 2; + } + + if (n & 1) { + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + + i = m; + + while (i > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + + if (offset > 0) ao1 += lda; else ao1 += 2; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + } + + return 0; +} diff --git a/kernel/generic/zsymm3m_ucopy_1.c b/kernel/generic/zsymm3m_ucopy_1.c new file mode 100644 index 0000000000..14ca6e76e4 --- /dev/null +++ b/kernel/generic/zsymm3m_ucopy_1.c @@ -0,0 +1,98 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef USE_ALPHA +#define REAL_PART(a, b) (a) +#define IMAGE_PART(a, b) (b) +#else +#define REAL_PART(a, b) (alpha_r * (a) - alpha_i * (b)) +#define IMAGE_PART(a, b) (alpha_i * (a) + alpha_r * (b)) +#endif + +#if defined(REAL_ONLY) +#define CMULT(a, b) (REAL_PART(a, b)) +#elif defined(IMAGE_ONLY) +#define CMULT(a, b) (IMAGE_PART(a, b)) +#else +#define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, +#ifdef USE_ALPHA + FLOAT alpha_r, FLOAT alpha_i, +#endif + FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01; + FLOAT *ao1; + + lda *= 2; + + js = n; + + while (js > 0){ + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + + if (offset > 0) ao1 += 2; else ao1 += lda; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + + posX ++; + js --; + } + + return 0; +} diff --git a/kernel/generic/zsymm3m_ucopy_2.c b/kernel/generic/zsymm3m_ucopy_2.c new file mode 100644 index 0000000000..4ba1e69966 --- /dev/null +++ b/kernel/generic/zsymm3m_ucopy_2.c @@ -0,0 +1,123 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef USE_ALPHA +#define REAL_PART(a, b) (a) +#define IMAGE_PART(a, b) (b) +#else +#define REAL_PART(a, b) (alpha_r * (a) - alpha_i * (b)) +#define IMAGE_PART(a, b) (alpha_i * (a) + alpha_r * (b)) +#endif + +#if defined(REAL_ONLY) +#define CMULT(a, b) (REAL_PART(a, b)) +#elif defined(IMAGE_ONLY) +#define CMULT(a, b) (IMAGE_PART(a, b)) +#else +#define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, +#ifdef USE_ALPHA + FLOAT alpha_r, FLOAT alpha_i, +#endif + FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02; + FLOAT *ao1, *ao2; + + lda *= 2; + + js = (n >> 1); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + posX += 2; + js --; + } + + if (n & 1) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + + if (offset > 0) ao1 += 2; else ao1 += lda; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + } + + return 0; +} diff --git a/kernel/generic/zsymm3m_ucopy_4.c b/kernel/generic/zsymm3m_ucopy_4.c new file mode 100644 index 0000000000..8de026a5c2 --- /dev/null +++ b/kernel/generic/zsymm3m_ucopy_4.c @@ -0,0 +1,158 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef USE_ALPHA +#define REAL_PART(a, b) (a) +#define IMAGE_PART(a, b) (b) +#else +#define REAL_PART(a, b) (alpha_r * (a) - alpha_i * (b)) +#define IMAGE_PART(a, b) (alpha_i * (a) + alpha_r * (b)) +#endif + +#if defined(REAL_ONLY) +#define CMULT(a, b) (REAL_PART(a, b)) +#elif defined(IMAGE_ONLY) +#define CMULT(a, b) (IMAGE_PART(a, b)) +#else +#define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, +#ifdef USE_ALPHA + FLOAT alpha_r, FLOAT alpha_i, +#endif + FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04; + FLOAT *ao1, *ao2, *ao3, *ao4; + + lda *= 2; + + js = (n >> 2); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; + if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + if (offset > -2) ao3 += 2; else ao3 += lda; + if (offset > -3) ao4 += 2; else ao4 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; + i --; + } + + posX += 4; + js --; + } + + if (n & 2) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + posX += 2; + } + + if (n & 1) { + offset = posX - posY; + + if (offset > 0) ao1 = a + 
posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + + if (offset > 0) ao1 += 2; else ao1 += lda; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + } + + return 0; +} diff --git a/kernel/generic/zsymm3m_ucopy_8.c b/kernel/generic/zsymm3m_ucopy_8.c new file mode 100644 index 0000000000..79ef3649c7 --- /dev/null +++ b/kernel/generic/zsymm3m_ucopy_8.c @@ -0,0 +1,210 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef USE_ALPHA +#define REAL_PART(a, b) (a) +#define IMAGE_PART(a, b) (b) +#else +#define REAL_PART(a, b) (alpha_r * (a) - alpha_i * (b)) +#define IMAGE_PART(a, b) (alpha_i * (a) + alpha_r * (b)) +#endif + +#if defined(REAL_ONLY) +#define CMULT(a, b) (REAL_PART(a, b)) +#elif defined(IMAGE_ONLY) +#define CMULT(a, b) (IMAGE_PART(a, b)) +#else +#define CMULT(a, b) (REAL_PART(a, b) + IMAGE_PART(a, b)) +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, +#ifdef USE_ALPHA + FLOAT alpha_r, FLOAT alpha_i, +#endif + FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; + + lda *= 2; + + js = (n >> 3); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; + if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; + if (offset > -4) ao5 = a + posY * 2 + (posX + 4) * lda; else ao5 = a + (posX + 4) * 2 + posY * lda; + if (offset > -5) ao6 = a + posY * 2 + (posX + 5) * lda; else ao6 = a + (posX + 5) * 2 + posY * lda; + if (offset > -6) ao7 = a + posY * 2 + (posX + 6) * lda; else ao7 = a + (posX + 6) * 2 + posY * lda; + if (offset > -7) ao8 = a + posY * 2 + (posX + 7) * lda; else ao8 = a + (posX + 7) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); + data05 = CMULT(*(ao5 + 0), *(ao5 + 1)); + data06 = CMULT(*(ao6 + 0), *(ao6 + 1)); + data07 = CMULT(*(ao7 + 0), *(ao7 + 1)); + data08 = CMULT(*(ao8 + 0), *(ao8 + 1)); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + if (offset > -2) ao3 += 2; else ao3 += lda; + if (offset > -3) ao4 += 2; else ao4 += lda; + if (offset > -4) ao5 += 2; else ao5 += lda; + if (offset > -5) ao6 += 2; else ao6 += lda; + if (offset > -6) ao7 += 2; else ao7 += lda; + if (offset > -7) ao8 += 2; else ao8 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b += 8; + + offset --; + i --; + } + + posX += 8; + js --; + } + + if (n & 4) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; + if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + data03 = CMULT(*(ao3 + 0), *(ao3 + 1)); + data04 = CMULT(*(ao4 + 0), *(ao4 + 1)); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + if (offset > -2) ao3 += 2; else ao3 += lda; + if (offset > -3) ao4 += 2; else ao4 += lda; + + b[ 0] = data01; + b[ 1] = 
data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; + i --; + } + + posX += 4; + } + + if (n & 2) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + data02 = CMULT(*(ao2 + 0), *(ao2 + 1)); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + posX += 2; + } + + if (n & 1) { + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = CMULT(*(ao1 + 0), *(ao1 + 1)); + + if (offset > 0) ao1 += 2; else ao1 += lda; + + b[ 0] = data01; + + b ++; + + offset --; + i --; + } + } + + return 0; +} diff --git a/kernel/generic/zsymm_lcopy_1.c b/kernel/generic/zsymm_lcopy_1.c new file mode 100644 index 0000000000..1b4f58d53b --- /dev/null +++ b/kernel/generic/zsymm_lcopy_1.c @@ -0,0 +1,81 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02; + FLOAT *ao1; + + lda *= 2; + + js = n; + + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + posX ++; + js --; + } + + return 0; +} diff --git a/kernel/generic/zsymm_lcopy_2.c b/kernel/generic/zsymm_lcopy_2.c new file mode 100644 index 0000000000..ce1b16e9cb --- /dev/null +++ b/kernel/generic/zsymm_lcopy_2.c @@ -0,0 +1,112 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04; + FLOAT *ao1, *ao2; + + lda *= 2; + + js = (n >> 1); + + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; + i --; + } + + posX += 2; + js --; + } + + if (n & 1) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + } + + return 0; +} diff --git a/kernel/generic/zsymm_lcopy_4.c b/kernel/generic/zsymm_lcopy_4.c new file mode 100644 index 0000000000..dd2034d441 --- /dev/null +++ b/kernel/generic/zsymm_lcopy_4.c @@ -0,0 +1,157 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT *ao1, *ao2, *ao3, *ao4; + + lda *= 2; + + js = (n >> 2); + + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; + if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + if (offset > -2) ao3 += lda; else ao3 += 2; + if (offset > -3) ao4 += lda; else ao4 += 2; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b += 8; + + offset --; + i --; + } + + posX += 4; + js --; + } + + if (n & 2) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; + i --; + } + + posX += 2; + + } + + if (n & 1) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + } + + return 0; +} diff --git a/kernel/generic/zsymm_lcopy_8.c b/kernel/generic/zsymm_lcopy_8.c new file mode 100644 index 0000000000..33976124ff --- /dev/null +++ b/kernel/generic/zsymm_lcopy_8.c @@ -0,0 +1,224 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; + + lda *= 2; + + js = (n >> 3); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; + if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; + if (offset > -4) ao5 = a + (posX + 4) * 2 + posY * lda; else ao5 = a + posY * 2 + (posX + 4) * lda; + if (offset > -5) ao6 = a + (posX + 5) * 2 + posY * lda; else ao6 = a + posY * 2 + (posX + 5) * lda; + if (offset > -6) ao7 = a + (posX + 6) * 2 + posY * lda; else ao7 = a + posY * 2 + (posX + 6) * lda; + if (offset > -7) ao8 = a + (posX + 7) * 2 + posY * lda; else ao8 = a + posY * 2 + (posX + 7) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + data09 = *(ao5 + 0); + data10 = *(ao5 + 1); + data11 = *(ao6 + 0); + data12 = *(ao6 + 1); + data13 = *(ao7 + 0); + data14 = *(ao7 + 1); + data15 = *(ao8 + 0); + data16 = *(ao8 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + if (offset > -2) ao3 += lda; else ao3 += 2; + if (offset > -3) ao4 += lda; else ao4 += 2; + if (offset > -4) ao5 += lda; else ao5 += 2; + if (offset > -5) ao6 += lda; else ao6 += 2; + if (offset > -6) ao7 += lda; else ao7 += 2; + if (offset > -7) ao8 += lda; else ao8 += 2; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + + b += 16; + + offset --; 
+ i --; + } + + posX += 8; + js --; + } + + if (n & 4) { + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda; + if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + if (offset > -2) ao3 += lda; else ao3 += 2; + if (offset > -3) ao4 += lda; else ao4 += 2; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b += 8; + + offset --; + i --; + } + + posX += 4; + } + + if (n & 2) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + if (offset > -1) ao2 += lda; else ao2 += 2; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; + i --; + } + + posX += 2; + + } + + if (n & 1) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + if (offset > 0) ao1 += lda; else ao1 += 2; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + } + + return 0; +} diff --git a/kernel/generic/zsymm_ucopy_1.c b/kernel/generic/zsymm_ucopy_1.c new file mode 100644 index 0000000000..9943a2dade --- /dev/null +++ b/kernel/generic/zsymm_ucopy_1.c @@ -0,0 +1,80 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02; + FLOAT *ao1; + + lda *= 2; + + js = n; + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + posX ++; + js --; + } + + return 0; +} diff --git a/kernel/generic/zsymm_ucopy_2.c b/kernel/generic/zsymm_ucopy_2.c new file mode 100644 index 0000000000..da64cde154 --- /dev/null +++ b/kernel/generic/zsymm_ucopy_2.c @@ -0,0 +1,111 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04; + FLOAT *ao1, *ao2; + + lda *= 2; + + js = (n >> 1); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; + i --; + } + + posX += 2; + js --; + } + + if (n & 1) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + } + + return 0; +} diff --git a/kernel/generic/zsymm_ucopy_4.c b/kernel/generic/zsymm_ucopy_4.c new file mode 100644 index 0000000000..eed0bcacbc --- /dev/null +++ b/kernel/generic/zsymm_ucopy_4.c @@ -0,0 +1,155 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT *ao1, *ao2, *ao3, *ao4; + + lda *= 2; + + js = (n >> 2); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; + if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + if (offset > -2) ao3 += 2; else ao3 += lda; + if (offset > -3) ao4 += 2; else ao4 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b += 8; + + offset --; + i --; + } + + posX += 4; + js --; + } + + if (n & 2) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; + i --; + } + + posX += 2; + } + + if (n & 1) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + } + + return 0; +} diff --git a/kernel/generic/zsymm_ucopy_8.c b/kernel/generic/zsymm_ucopy_8.c new file mode 100644 index 0000000000..c81a7a8908 --- /dev/null +++ b/kernel/generic/zsymm_ucopy_8.c @@ -0,0 +1,224 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, offset; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; + + lda *= 2; + + js = (n >> 3); + while (js > 0){ + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; + if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; + if (offset > -4) ao5 = a + posY * 2 + (posX + 4) * lda; else ao5 = a + (posX + 4) * 2 + posY * lda; + if (offset > -5) ao6 = a + posY * 2 + (posX + 5) * lda; else ao6 = a + (posX + 5) * 2 + posY * lda; + if (offset > -6) ao7 = a + posY * 2 + (posX + 6) * lda; else ao7 = a + (posX + 6) * 2 + posY * lda; + if (offset > -7) ao8 = a + posY * 2 + (posX + 7) * lda; else ao8 = a + (posX + 7) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + data09 = *(ao5 + 0); + data10 = *(ao5 + 1); + data11 = *(ao6 + 0); + data12 = *(ao6 + 1); + data13 = *(ao7 + 0); + data14 = *(ao7 + 1); + data15 = *(ao8 + 0); + data16 = *(ao8 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + if (offset > -2) ao3 += 2; else ao3 += lda; + if (offset > -3) ao4 += 2; else ao4 += lda; + if (offset > -4) ao5 += 2; else ao5 += lda; + if (offset > -5) ao6 += 2; else ao6 += lda; + if (offset > -6) ao7 += 2; else ao7 += lda; + if (offset > -7) ao8 += 2; else ao8 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + 
b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + + b += 16; + + offset --; + i --; + } + + posX += 8; + js --; + } + + if (n & 4) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda; + if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + data05 = *(ao3 + 0); + data06 = *(ao3 + 1); + data07 = *(ao4 + 0); + data08 = *(ao4 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + if (offset > -2) ao3 += 2; else ao3 += lda; + if (offset > -3) ao4 += 2; else ao4 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b += 8; + + offset --; + i --; + } + + posX += 4; + } + + if (n & 2) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + if (offset > -1) ao2 += 2; else ao2 += lda; + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + + offset --; + i --; + } + + posX += 2; + } + + if (n & 1) { + + offset = posX - posY; + + if (offset > 0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda; + + i = m; + + while (i > 0) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + if (offset > 0) ao1 += 2; else ao1 += lda; + + b[ 0] = data01; + b[ 1] = data02; + + b += 2; + + offset --; + i --; + } + + } + + return 0; +} diff --git a/kernel/generic/zsymv_k.c b/kernel/generic/zsymv_k.c new file mode 100644 index 0000000000..211def30f6 --- /dev/null +++ b/kernel/generic/zsymv_k.c @@ -0,0 +1,123 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +#include "symcopy.h" + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ + + BLASLONG is, min_i; + FLOAT *X = x; + FLOAT *Y = y; + FLOAT *symbuffer = buffer; + FLOAT *gemvbuffer = (FLOAT *)(((BLASLONG)buffer + SYMV_P * SYMV_P * sizeof(FLOAT) * 2 + 4095) & ~4095); + FLOAT *bufferY = gemvbuffer; + FLOAT *bufferX = gemvbuffer; + + if (incy != 1) { + Y = bufferY; + bufferX = (FLOAT *)(((BLASLONG)bufferY + m * sizeof(FLOAT) * 2 + 4095) & ~4095); + gemvbuffer = bufferX; + COPY_K(m, y, incy, Y, 1); + } + + if (incx != 1) { + X = bufferX; + gemvbuffer = (FLOAT *)(((BLASLONG)bufferX + m * sizeof(FLOAT) * 2 + 4095) & ~4095); + COPY_K(m, x, incx, X, 1); + } + +#ifndef LOWER + for(is = m - offset; is < m; is += SYMV_P){ + min_i = MIN(m - is, SYMV_P); +#else + for(is = 0; is < offset; is += SYMV_P){ + min_i = MIN(offset - is, SYMV_P); +#endif + +#ifndef LOWER + if (is >0){ + GEMV_T(is, min_i, 0, alpha_r, alpha_i, + a + is * lda * COMPSIZE, lda, + X, 1, + Y + is * COMPSIZE, 1, gemvbuffer); + + GEMV_N(is, min_i, 0, alpha_r, alpha_i, + a + is * lda * COMPSIZE, lda, + X + is * COMPSIZE, 1, + Y, 1, gemvbuffer); + } +#endif + +#ifdef LOWER + ZSYMCOPY_L(min_i, a + (is + is * lda) * COMPSIZE, lda, symbuffer); +#else + ZSYMCOPY_U(min_i, a + (is + is * lda) * COMPSIZE, lda, symbuffer); +#endif + + GEMV_N(min_i, min_i, 0, alpha_r, alpha_i, + symbuffer, min_i, + X + is * COMPSIZE, 1, + Y + is * COMPSIZE, 1, gemvbuffer); + + +#ifdef LOWER + if (m - is > min_i){ + GEMV_T(m - is - min_i, min_i, 0, alpha_r, alpha_i, + a + ((is + min_i) + is * lda) * COMPSIZE, lda, + X + (is + min_i) * COMPSIZE, 1, + Y + is * COMPSIZE, 1, gemvbuffer); + + GEMV_N(m - is - min_i, min_i, 0, alpha_r, alpha_i, + a + ((is + min_i) + is * lda) * COMPSIZE, lda, + X + is * COMPSIZE, 1, + Y + (is + min_i) * COMPSIZE, 1, gemvbuffer); + } +#endif + + } /* end of is */ + + if (incy != 1) { + COPY_K(m, Y, 1, y, incy); + } + + return 0; +} diff --git a/kernel/generic/ztrmm_lncopy_1.c b/kernel/generic/ztrmm_lncopy_1.c new file mode 100644 index 0000000000..15a05090b7 --- /dev/null +++ b/kernel/generic/ztrmm_lncopy_1.c @@ -0,0 +1,107 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02; + FLOAT *ao1; + + lda += lda; + + js = n; + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posY * 2 + (posX + 0) * lda; + } else { + ao1 = a + posX * 2 + (posY + 0) * lda; + } + + i = m; + if (i > 0) { + do { + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; + + ao1 += 2; + b += 2; + } else + if (X < posY) { + ao1 += lda; + b += 2; + + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; +#endif + ao1 += 2; + b += 2; + } + + X ++; + i --; + } while (i > 0); + } + + + posY ++; + js --; + } while (js > 0); + } /* End of main loop */ + + return 0; +} diff --git a/kernel/generic/ztrmm_lncopy_2.c b/kernel/generic/ztrmm_lncopy_2.c new file mode 100644 index 0000000000..f41ee5b932 --- /dev/null +++ b/kernel/generic/ztrmm_lncopy_2.c @@ -0,0 +1,230 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02, data03, data04; + FLOAT data05, data06, data07, data08; + FLOAT *ao1, *ao2; + + lda += lda; + + js = (n >> 1); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posY * 2 + (posX + 0) * lda; + ao2 = a + posY * 2 + (posX + 1) * lda; + } else { + ao1 = a + posX * 2 + (posY + 0) * lda; + ao2 = a + posX * 2 + (posY + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data05; + b[ 3] = data06; + b[ 4] = data03; + b[ 5] = data04; + b[ 6] = data07; + b[ 7] = data08; + + ao1 += 4; + ao2 += 4; + b += 8; + } else + if (X < posY) { + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 8; + + } else { +#ifdef UNIT + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = data03; + b[ 5] = data04; + b[ 6] = ONE; + b[ 7] = ZERO; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = data03; + b[ 5] = data04; + b[ 6] = data07; + b[ 7] = data08; +#endif + ao1 += 4; + ao2 += 4; + b += 8; + } + + X += 2; + i --; + } while (i > 0); + } + + if (m & 1) { + + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + ao1 += 2; + ao2 += 2; + b += 4; + } else + if (X < posY) { + ao1 += lda; + b += 4; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 0] = ZERO; + b[ 1] = ZERO; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = ZERO; + b[ 3] = ZERO; +#endif + + b += 4; + } + } + + posY += 2; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY * 2 + (posX + 0) * lda; + } else { + ao1 = a + posX * 2 
+ (posY + 0) * lda; + } + + i = m; + if (i > 0) { + do { + + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + b[ 0] = data01; + b[ 1] = data02; + b += 2; + ao1 += 2; + } else + if (X < posY) { + b += 2; + ao1 += lda; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; +#endif + b += 2; + ao1 += 2; + } + + X ++; + i --; + } while (i > 0); + } + + posY += 1; + } + return 0; +} diff --git a/kernel/generic/ztrmm_lncopy_4.c b/kernel/generic/ztrmm_lncopy_4.c new file mode 100644 index 0000000000..76170c7667 --- /dev/null +++ b/kernel/generic/ztrmm_lncopy_4.c @@ -0,0 +1,664 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT data17, data18, data19, data20, data21, data22, data23, data24; + FLOAT data25, data26, data27, data28, data29, data30, data31, data32; + FLOAT *ao1, *ao2, *ao3, *ao4; + + lda += lda; + + js = (n >> 2); + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posY * 2 + (posX + 0) * lda; + ao2 = a + posY * 2 + (posX + 1) * lda; + ao3 = a + posY * 2 + (posX + 2) * lda; + ao4 = a + posY * 2 + (posX + 3) * lda; + } else { + ao1 = a + posX * 2 + (posY + 0) * lda; + ao2 = a + posX * 2 + (posY + 1) * lda; + ao3 = a + posX * 2 + (posY + 2) * lda; + ao4 = a + posX * 2 + (posY + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + data13 = *(ao2 + 4); + data14 = *(ao2 + 5); + data15 = *(ao2 + 6); + data16 = *(ao2 + 7); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + data21 = *(ao3 + 4); + data22 = *(ao3 + 5); + data23 = *(ao3 + 6); + data24 = *(ao3 + 7); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + data28 = *(ao4 + 3); + data29 = *(ao4 + 4); + data30 = *(ao4 + 5); + data31 = *(ao4 + 6); + data32 = *(ao4 + 7); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data09; + b[ 3] = data10; + b[ 4] = data17; + b[ 5] = data18; + b[ 6] = data25; + b[ 7] = data26; + + b[ 8] = data03; + b[ 9] = data04; + b[10] = data11; + b[11] = data12; + b[12] = data19; + b[13] = data20; + b[14] = data27; + b[15] = data28; + + b[16] = data05; + b[17] = data06; + b[18] = data13; + b[19] = data14; + b[20] = data21; + b[21] = data22; + b[22] = data29; + b[23] = data30; + + b[24] = data07; + b[25] = data08; + b[26] = data15; + b[27] = data16; + b[28] = data23; + b[29] = data24; + b[30] = data31; + b[31] = data32; + + ao1 += 8; + ao2 += 8; + ao3 += 8; + ao4 += 8; + b += 32; + + } else + if (X < posY) { + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + b += 32; + + } else { +#ifdef UNIT + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + + data13 = *(ao2 + 4); + data14 = *(ao2 + 5); + data15 = *(ao2 + 6); + data16 = *(ao2 + 7); + + data23 = *(ao3 + 6); + data24 = *(ao3 + 7); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = data03; + b[ 9] = data04; + b[10] = ONE; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + + b[16] = data05; + b[17] = data06; + b[18] = data13; + b[19] = data14; + b[20] = ONE; + b[21] = ZERO; + b[22] = ZERO; + b[23] = ZERO; + + b[24] = data07; + b[25] = data08; + b[26] = data15; + b[27] = data16; + b[28] = data23; + b[29] = data24; + b[30] = ONE; + b[31] = ZERO; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + 
data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + data13 = *(ao2 + 4); + data14 = *(ao2 + 5); + data15 = *(ao2 + 6); + data16 = *(ao2 + 7); + + data21 = *(ao3 + 4); + data22 = *(ao3 + 5); + data23 = *(ao3 + 6); + data24 = *(ao3 + 7); + + data31 = *(ao4 + 6); + data32 = *(ao4 + 7); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = data03; + b[ 9] = data04; + b[10] = data11; + b[11] = data12; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + + b[16] = data05; + b[17] = data06; + b[18] = data13; + b[19] = data14; + b[20] = data21; + b[21] = data22; + b[22] = ZERO; + b[23] = ZERO; + + b[24] = data07; + b[25] = data08; + b[26] = data15; + b[27] = data16; + b[28] = data23; + b[29] = data24; + b[30] = data31; + b[31] = data32; +#endif + ao1 += 8; + ao2 += 8; + ao3 += 8; + ao4 += 8; + b += 32; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X > posY) { + + if (m & 2) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + data28 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data09; + b[ 3] = data10; + b[ 4] = data17; + b[ 5] = data18; + b[ 6] = data25; + b[ 7] = data26; + + b[ 8] = data03; + b[ 9] = data04; + b[10] = data11; + b[11] = data12; + b[12] = data19; + b[13] = data20; + b[14] = data27; + b[15] = data28; + + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + b += 16; + } + + if (m & 1) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data09; + b[ 3] = data10; + b[ 4] = data17; + b[ 5] = data18; + b[ 6] = data25; + b[ 7] = data26; + + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + b += 8; + } + + } else + if (X < posY) { + if (m & 2) { + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 16; + } + + if (m & 1) { + ao1 += lda; + b += 8; + } + + } else { +#ifdef UNIT + + if (i >= 2) { + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + } + + if (i >= 3) { + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data13 = *(ao2 + 4); + data14 = *(ao2 + 5); + } + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + + if (i >= 2) { + b[ 0] = data03; + b[ 1] = data04; + b[ 2] = ONE; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 3) { + b[ 0] = data05; + b[ 1] = data06; + b[ 2] = data13; + b[ 3] = data14; + b[ 4] = ONE; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + if (i >= 2) { + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + } + + if (i >= 3) { + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data13 = *(ao2 + 4); + data14 = *(ao2 + 5); + data21 = *(ao3 + 4); + data22 = *(ao3 + 5); + } + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + + if (i >= 2) { + b[ 0] = data03; + b[ 1] = data04; + 
b[ 2] = data11; + b[ 3] = data12; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 3) { + b[ 0] = data05; + b[ 1] = data06; + b[ 2] = data13; + b[ 3] = data14; + b[ 4] = data21; + b[ 5] = data22; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } +#endif + } + } + + posY += 4; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 2){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY * 2 + (posX + 0) * lda; + ao2 = a + posY * 2 + (posX + 1) * lda; + } else { + ao1 = a + posX * 2 + (posY + 0) * lda; + ao2 = a + posX * 2 + (posY + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data09; + b[ 3] = data10; + b[ 4] = data03; + b[ 5] = data04; + b[ 6] = data11; + b[ 7] = data12; + + ao1 += 4; + ao2 += 4; + b += 8; + + } else + if (X < posY) { + ao1 += 2 * lda; + ao2 += 2 * lda; + + b += 8; + } else { +#ifdef UNIT + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = data03; + b[ 5] = data04; + b[ 6] = ONE; + b[ 7] = ZERO; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = data03; + b[ 5] = data04; + b[ 6] = data11; + b[ 7] = data12; +#endif + ao1 += 4; + ao2 += 4; + + b += 8; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i) { + + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data09; + b[ 3] = data10; + + ao1 += 2; + ao2 += 2; + b += 4; + } else + if (X < posY) { + ao1 += lda; + b += 4; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = ZERO; + b[ 3] = ZERO; +#endif + b += 2; + } + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY * 2 + (posX + 0) * lda; + } else { + ao1 = a + posX * 2 + (posY + 0) * lda; + } + + i = m; + if (i > 0) { + do { + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + b[ 0] = data01; + b[ 1] = data02; + ao1 += 2; + b += 2; + } else + if (X < posY) { + ao1 += lda; + b += 2; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; +#endif + ao1 += lda; + b += 2; + } + + X ++; + i --; + } while (i > 0); + } + + posY += 1; + } + + return 0; +} diff --git a/kernel/generic/ztrmm_lncopy_8.c b/kernel/generic/ztrmm_lncopy_8.c new file mode 100644 index 0000000000..308ddd75fd --- /dev/null +++ b/kernel/generic/ztrmm_lncopy_8.c @@ -0,0 +1,871 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. 
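The unrolled stores in the X > posY branch of ztrmm_lncopy_4.c above all follow one transposed interleave: element r of source column c lands at b[8*r + 2*c] (real part) and b[8*r + 2*c + 1] (imaginary part). A hypothetical helper spelling out that mapping, for illustration only (pack_tile_4x4 and its pointer-array argument are not part of the patch):

    /* Illustration only, not part of the patch: the index mapping produced by
       the unrolled X > posY stores in ztrmm_lncopy_4.c above.  ao[c] stands
       for the four column pointers ao1..ao4 (real/imag interleaved). */
    static void pack_tile_4x4(FLOAT *ao[4], FLOAT *b) {
      BLASLONG r, c;
      for (r = 0; r < 4; r++) {        /* element index within each column */
        for (c = 0; c < 4; c++) {      /* source column                    */
          b[8 * r + 2 * c + 0] = ao[c][2 * r + 0];   /* real part      */
          b[8 * r + 2 * c + 1] = ao[c][2 * r + 1];   /* imaginary part */
        }
      }
    }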
*/ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X, ii; + + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; + + lda += lda; + + js = (n >> 3); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posY * 2 + (posX + 0) * lda; + ao2 = a + posY * 2 + (posX + 1) * lda; + ao3 = a + posY * 2 + (posX + 2) * lda; + ao4 = a + posY * 2 + (posX + 3) * lda; + ao5 = a + posY * 2 + (posX + 4) * lda; + ao6 = a + posY * 2 + (posX + 5) * lda; + ao7 = a + posY * 2 + (posX + 6) * lda; + ao8 = a + posY * 2 + (posX + 7) * lda; + } else { + ao1 = a + posX * 2 + (posY + 0) * lda; + ao2 = a + posX * 2 + (posY + 1) * lda; + ao3 = a + posX * 2 + (posY + 2) * lda; + ao4 = a + posX * 2 + (posY + 3) * lda; + ao5 = a + posX * 2 + (posY + 4) * lda; + ao6 = a + posX * 2 + (posY + 5) * lda; + ao7 = a + posX * 2 + (posY + 6) * lda; + ao8 = a + posX * 2 + (posY + 7) * lda; + } + + i = (m >> 3); + if (i > 0) { + do { + if (X > posY) { + for (ii = 0; ii < 8; ii++){ + + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); + b[ 2] = *(ao2 + 0); + b[ 3] = *(ao2 + 1); + b[ 4] = *(ao3 + 0); + b[ 5] = *(ao3 + 1); + b[ 6] = *(ao4 + 0); + b[ 7] = *(ao4 + 1); + + b[ 8] = *(ao5 + 0); + b[ 9] = *(ao5 + 1); + b[ 10] = *(ao6 + 0); + b[ 11] = *(ao6 + 1); + b[ 12] = *(ao7 + 0); + b[ 13] = *(ao7 + 1); + b[ 14] = *(ao8 + 0); + b[ 15] = *(ao8 + 1); + + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + ao5 += 2; + ao6 += 2; + ao7 += 2; + ao8 += 2; + b += 16; + } + } else + if (X < posY) { + ao1 += 8 * lda; + ao2 += 8 * lda; + ao3 += 8 * lda; + ao4 += 8 * lda; + ao5 += 8 * lda; + ao6 += 8 * lda; + ao7 += 8 * lda; + ao8 += 8 * lda; + + b += 128; + + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + + b[ 16] = 
*(ao1 + 2); + b[ 17] = *(ao1 + 3); +#ifdef UNIT + b[ 18] = ONE; + b[ 19] = ZERO; +#else + b[ 18] = *(ao2 + 2); + b[ 19] = *(ao2 + 3); +#endif + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + + b[ 32] = *(ao1 + 4); + b[ 33] = *(ao1 + 5); + b[ 34] = *(ao2 + 4); + b[ 35] = *(ao2 + 5); +#ifdef UNIT + b[ 36] = ONE; + b[ 37] = ZERO; +#else + b[ 36] = *(ao3 + 4); + b[ 37] = *(ao3 + 5); +#endif + b[ 38] = ZERO; + b[ 39] = ZERO; + b[ 40] = ZERO; + b[ 41] = ZERO; + b[ 42] = ZERO; + b[ 43] = ZERO; + b[ 44] = ZERO; + b[ 45] = ZERO; + b[ 46] = ZERO; + b[ 47] = ZERO; + + b[ 48] = *(ao1 + 6); + b[ 49] = *(ao1 + 7); + b[ 50] = *(ao2 + 6); + b[ 51] = *(ao2 + 7); + b[ 52] = *(ao3 + 6); + b[ 53] = *(ao3 + 7); +#ifdef UNIT + b[ 54] = ONE; + b[ 55] = ZERO; +#else + b[ 54] = *(ao4 + 6); + b[ 55] = *(ao4 + 7); +#endif + b[ 56] = ZERO; + b[ 57] = ZERO; + b[ 58] = ZERO; + b[ 59] = ZERO; + b[ 60] = ZERO; + b[ 61] = ZERO; + b[ 62] = ZERO; + b[ 63] = ZERO; + + b[ 64] = *(ao1 + 8); + b[ 65] = *(ao1 + 9); + b[ 66] = *(ao2 + 8); + b[ 67] = *(ao2 + 9); + b[ 68] = *(ao3 + 8); + b[ 69] = *(ao3 + 9); + b[ 70] = *(ao4 + 8); + b[ 71] = *(ao4 + 9); +#ifdef UNIT + b[ 72] = ONE; + b[ 73] = ZERO; +#else + b[ 72] = *(ao5 + 8); + b[ 73] = *(ao5 + 9); +#endif + b[ 74] = ZERO; + b[ 75] = ZERO; + b[ 76] = ZERO; + b[ 77] = ZERO; + b[ 78] = ZERO; + b[ 79] = ZERO; + + b[ 80] = *(ao1 + 10); + b[ 81] = *(ao1 + 11); + b[ 82] = *(ao2 + 10); + b[ 83] = *(ao2 + 11); + b[ 84] = *(ao3 + 10); + b[ 85] = *(ao3 + 11); + b[ 86] = *(ao4 + 10); + b[ 87] = *(ao4 + 11); + b[ 88] = *(ao5 + 10); + b[ 89] = *(ao5 + 11); +#ifdef UNIT + b[ 90] = ONE; + b[ 91] = ZERO; +#else + b[ 90] = *(ao6 + 10); + b[ 91] = *(ao6 + 11); +#endif + b[ 92] = ZERO; + b[ 93] = ZERO; + b[ 94] = ZERO; + b[ 95] = ZERO; + + b[ 96] = *(ao1 + 12); + b[ 97] = *(ao1 + 13); + b[ 98] = *(ao2 + 12); + b[ 99] = *(ao2 + 13); + b[100] = *(ao3 + 12); + b[101] = *(ao3 + 13); + b[102] = *(ao4 + 12); + b[103] = *(ao4 + 13); + b[104] = *(ao5 + 12); + b[105] = *(ao5 + 13); + b[106] = *(ao6 + 12); + b[107] = *(ao6 + 13); +#ifdef UNIT + b[108] = ONE; + b[109] = ZERO; +#else + b[108] = *(ao7 + 12); + b[109] = *(ao7 + 13); +#endif + b[110] = ZERO; + b[111] = ZERO; + + b[112] = *(ao1 + 14); + b[113] = *(ao1 + 15); + b[114] = *(ao2 + 14); + b[115] = *(ao2 + 15); + b[116] = *(ao3 + 14); + b[117] = *(ao3 + 15); + b[118] = *(ao4 + 14); + b[119] = *(ao4 + 15); + b[120] = *(ao5 + 14); + b[121] = *(ao5 + 15); + b[122] = *(ao6 + 14); + b[123] = *(ao6 + 15); + b[124] = *(ao7 + 14); + b[125] = *(ao7 + 15); +#ifdef UNIT + b[126] = ONE; + b[127] = ZERO; +#else + b[126] = *(ao8 + 14); + b[127] = *(ao8 + 15); +#endif + + ao1 += 16; + ao2 += 16; + ao3 += 16; + ao4 += 16; + ao5 += 16; + ao6 += 16; + ao7 += 16; + ao8 += 16; + b += 128; + } + + X += 8; + i --; + } while (i > 0); + } + + i = (m & 7); + if (i) { + + if (X > posY) { + + for (ii = 0; ii < i; ii++){ + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); + b[ 2] = *(ao2 + 0); + b[ 3] = *(ao2 + 1); + b[ 4] = *(ao3 + 0); + b[ 5] = *(ao3 + 1); + b[ 6] = *(ao4 + 0); + b[ 7] = *(ao4 + 1); + + b[ 8] = *(ao5 + 0); + b[ 9] = *(ao5 + 1); + b[ 10] = *(ao6 + 0); + b[ 11] = *(ao6 + 1); + b[ 12] = *(ao7 + 0); + b[ 13] = *(ao7 + 1); + b[ 14] = *(ao8 + 0); + b[ 15] = *(ao8 + 1); + + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + ao5 += 2; + ao6 += 2; + ao7 += 2; + ao8 += 2; + b += 16; + } + } else + if (X < posY) { + ao1 += i * 
lda; + ao2 += i * lda; + ao3 += i * lda; + ao4 += i * lda; + ao5 += i * lda; + ao6 += i * lda; + ao7 += i * lda; + ao8 += i * lda; + b += 16 * i; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + b += 16; + + if (i >= 2) { + b[ 0] = *(ao1 + 2); + b[ 1] = *(ao1 + 3); +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(ao2 + 2); + b[ 3] = *(ao2 + 3); +#endif + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 3) { + b[ 0] = *(ao1 + 4); + b[ 1] = *(ao1 + 5); + b[ 2] = *(ao2 + 4); + b[ 3] = *(ao2 + 5); +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(ao3 + 4); + b[ 5] = *(ao3 + 5); +#endif + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 4) { + b[ 0] = *(ao1 + 6); + b[ 1] = *(ao1 + 7); + b[ 2] = *(ao2 + 6); + b[ 3] = *(ao2 + 7); + b[ 4] = *(ao3 + 6); + b[ 5] = *(ao3 + 7); +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(ao4 + 6); + b[ 7] = *(ao4 + 7); +#endif + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 5) { + b[ 0] = *(ao1 + 8); + b[ 1] = *(ao1 + 9); + b[ 2] = *(ao2 + 8); + b[ 3] = *(ao2 + 9); + b[ 4] = *(ao3 + 8); + b[ 5] = *(ao3 + 9); + b[ 6] = *(ao4 + 8); + b[ 7] = *(ao4 + 9); +#ifdef UNIT + b[ 8] = ONE; + b[ 9] = ZERO; +#else + b[ 8] = *(ao5 + 8); + b[ 9] = *(ao5 + 9); +#endif + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 6) { + b[ 0] = *(ao1 + 10); + b[ 1] = *(ao1 + 11); + b[ 2] = *(ao2 + 10); + b[ 3] = *(ao2 + 11); + b[ 4] = *(ao3 + 10); + b[ 5] = *(ao3 + 11); + b[ 6] = *(ao4 + 10); + b[ 7] = *(ao4 + 11); + b[ 8] = *(ao5 + 10); + b[ 9] = *(ao5 + 11); +#ifdef UNIT + b[10] = ONE; + b[11] = ZERO; +#else + b[10] = *(ao6 + 10); + b[11] = *(ao6 + 11); +#endif + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 7) { + b[ 0] = *(ao1 + 12); + b[ 1] = *(ao1 + 13); + b[ 2] = *(ao2 + 12); + b[ 3] = *(ao2 + 13); + b[ 4] = *(ao3 + 12); + b[ 5] = *(ao3 + 13); + b[ 6] = *(ao4 + 12); + b[ 7] = *(ao4 + 13); + b[ 8] = *(ao5 + 12); + b[ 9] = *(ao5 + 13); + b[10] = *(ao6 + 12); + b[11] = *(ao6 + 13); +#ifdef UNIT + b[12] = ONE; + b[13] = ZERO; +#else + b[12] = *(ao7 + 12); + b[13] = *(ao7 + 13); +#endif + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + } + } + + posY += 8; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 4){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY * 2 + (posX + 0) * lda; + ao2 = a + posY * 2 + (posX + 1) * lda; + ao3 = a + posY * 2 + (posX + 2) * lda; + ao4 = a + posY * 2 + (posX + 3) * lda; + } else { + ao1 = a + posX * 2 + (posY + 0) * lda; + ao2 = a + posX * 2 + (posY + 1) * lda; + ao3 = a + posX * 2 + (posY + 2) * lda; + ao4 = a + posX * 2 + (posY + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X > posY) { + for (ii = 0; ii < 4; ii++){ + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); + b[ 2] 
= *(ao2 + 0); + b[ 3] = *(ao2 + 1); + b[ 4] = *(ao3 + 0); + b[ 5] = *(ao3 + 1); + b[ 6] = *(ao4 + 0); + b[ 7] = *(ao4 + 1); + + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + b += 8; + } + } else + if (X < posY) { + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + b += 32; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = *(ao1 + 2); + b[ 9] = *(ao1 + 3); +#ifdef UNIT + b[ 10] = ONE; + b[ 11] = ZERO; +#else + b[ 10] = *(ao2 + 2); + b[ 11] = *(ao2 + 3); +#endif + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + + b[ 16] = *(ao1 + 4); + b[ 17] = *(ao1 + 5); + b[ 18] = *(ao2 + 4); + b[ 19] = *(ao2 + 5); +#ifdef UNIT + b[ 20] = ONE; + b[ 21] = ZERO; +#else + b[ 20] = *(ao3 + 4); + b[ 21] = *(ao3 + 5); +#endif + b[ 22] = ZERO; + b[ 23] = ZERO; + + b[ 24] = *(ao1 + 6); + b[ 25] = *(ao1 + 7); + b[ 26] = *(ao2 + 6); + b[ 27] = *(ao2 + 7); + b[ 28] = *(ao3 + 6); + b[ 29] = *(ao3 + 7); +#ifdef UNIT + b[ 30] = ONE; + b[ 31] = ZERO; +#else + b[ 30] = *(ao4 + 6); + b[ 31] = *(ao4 + 7); +#endif + + ao1 += 8; + ao2 += 8; + ao3 += 8; + ao4 += 8; + b += 32; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X > posY) { + + for (ii = 0; ii < i; ii++){ + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); + b[ 2] = *(ao2 + 0); + b[ 3] = *(ao2 + 1); + b[ 4] = *(ao3 + 0); + b[ 5] = *(ao3 + 1); + b[ 6] = *(ao4 + 0); + b[ 7] = *(ao4 + 1); + + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + b += 8; + } + } else + if (X < posY) { + ao1 += i * lda; + ao2 += i * lda; + ao3 += i * lda; + ao4 += i * lda; + b += 8 * i; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + + if (i >= 2) { + b[ 0] = *(ao1 + 2); + b[ 1] = *(ao1 + 3); +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(ao2 + 2); + b[ 3] = *(ao2 + 3); +#endif + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 3) { + b[ 0] = *(ao1 + 4); + b[ 1] = *(ao1 + 5); + b[ 2] = *(ao2 + 4); + b[ 3] = *(ao2 + 5); +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(ao3 + 4); + b[ 5] = *(ao3 + 5); +#endif + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + } + } + + posY += 4; + } + + if (n & 2){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY * 2 + (posX + 0) * lda; + ao2 = a + posY * 2 + (posX + 1) * lda; + } else { + ao1 = a + posX * 2 + (posY + 0) * lda; + ao2 = a + posX * 2 + (posY + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X > posY) { + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); + b[ 2] = *(ao2 + 0); + b[ 3] = *(ao2 + 1); + b[ 4] = *(ao1 + 2); + b[ 5] = *(ao1 + 3); + b[ 6] = *(ao2 + 2); + b[ 7] = *(ao2 + 3); + + ao1 += 4; + ao2 += 4; + b += 8; + } else + if (X < posY) { + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 8; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + + b[ 4] = *(ao1 + 2); + b[ 5] = *(ao1 + 3); +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(ao2 + 2); + b[ 7] = *(ao2 + 3); +#endif + ao1 += 4; + ao2 += 4; + b += 8; + } + + X += 2; + i --; + } while (i > 0); + } + + if (m & 1) { + + if (X > posY) { + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); + b[ 2] = *(ao2 + 0); + b[ 
3] = *(ao2 + 1); + ao1 += 2; + ao2 += 2; + b += 4; + } else + if (X < posY) { + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b += 4; + } + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY * 2 + (posX + 0) * lda; + } else { + ao1 = a + posX * 2 + (posY + 0) * lda; + } + + i = m; + if (m > 0) { + do { + if (X > posY) { + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); + ao1 += 2; + b += 2; + } else + if (X < posY) { + ao1 += lda; + b += 2; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); +#endif + ao1 += 2; + b += 2; + } + + X += 1; + i --; + } while (i > 0); + } + } + + return 0; +} diff --git a/kernel/generic/ztrmm_ltcopy_1.c b/kernel/generic/ztrmm_ltcopy_1.c new file mode 100644 index 0000000000..1229b45876 --- /dev/null +++ b/kernel/generic/ztrmm_ltcopy_1.c @@ -0,0 +1,104 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02; + FLOAT *ao1; + + lda += lda; + + js = n; + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posY * 2 + (posX + 0) * lda; + } else { + ao1 = a + posX * 2 + (posY + 0) * lda; + } + + i = m; + if (i > 0) { + do { + if (X > posY) { + ao1 += 2; + b += 2; + } else + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; + ao1 += lda; + b += 2; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; +#endif + ao1 += 2; + b += 2; + } + + X ++; + i --; + } while (i > 0); + } + + posY ++; + js --; + } while (js > 0); + } /* End of main loop */ + + return 0; +} diff --git a/kernel/generic/ztrmm_ltcopy_2.c b/kernel/generic/ztrmm_ltcopy_2.c new file mode 100644 index 0000000000..7bcadf3f16 --- /dev/null +++ b/kernel/generic/ztrmm_ltcopy_2.c @@ -0,0 +1,240 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
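For reference while reading the wider variants that follow, the walk performed by ztrmm_ltcopy_1.c above can be restated compactly; the 2-, 4- and 8-wide files below unroll the same three cases (X > posY skipped, X < posY copied, X == posY given the diagonal treatment) across several columns at once. The sketch below is illustrative only, uses a hypothetical name, and is not part of the patch:

    /* Illustration only, not part of the patch: compact restatement of the
       ztrmm_ltcopy_1.c kernel above (the original keeps explicit data01/data02
       temporaries and hand-written do/while loops). */
    static int ztrmm_ltcopy_1_sketch(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda,
                                     BLASLONG posX, BLASLONG posY, FLOAT *b) {
      BLASLONG i, js, X;
      FLOAT *ao1;

      lda += lda;                                /* complex: two FLOATs per element */

      for (js = n; js > 0; js--) {               /* one packed column per iteration */
        X = posX;
        ao1 = (posX <= posY) ? a + posY * 2 + posX * lda
                             : a + posX * 2 + posY * lda;
        for (i = m; i > 0; i--) {
          if (X > posY) {                        /* skipped: only the pointers move */
            ao1 += 2;
          } else if (X < posY) {                 /* copied element                  */
            b[0] = ao1[0];
            b[1] = ao1[1];
            ao1 += lda;
          } else {                               /* diagonal element                */
    #ifdef UNIT
            b[0] = ONE;
            b[1] = ZERO;
    #else
            b[0] = ao1[0];
            b[1] = ao1[1];
    #endif
            ao1 += 2;
          }
          b += 2;
          X++;
        }
        posY++;
      }
      return 0;
    }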
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data1, data2, data3, data4, data5, data6, data7, data8; + + FLOAT *ao1, *ao2; + + lda += lda; + + js = (n >> 1); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posY * 2 + (posX + 0) * lda; + ao2 = a + posY * 2 + (posX + 1) * lda; + } else { + ao1 = a + posX * 2 + (posY + 0) * lda; + ao2 = a + posX * 2 + (posY + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X > posY) { + ao1 += 4; + ao2 += 4; + b += 8; + + } else + if (X < posY) { + data1 = *(ao1 + 0); + data2 = *(ao1 + 1); + data3 = *(ao1 + 2); + data4 = *(ao1 + 3); + + data5 = *(ao2 + 0); + data6 = *(ao2 + 1); + data7 = *(ao2 + 2); + data8 = *(ao2 + 3); + + b[ 0] = data1; + b[ 1] = data2; + b[ 2] = data3; + b[ 3] = data4; + b[ 4] = data5; + b[ 5] = data6; + b[ 6] = data7; + b[ 7] = data8; + + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 8; + + } else { +#ifdef UNIT + data3 = *(ao1 + 2); + data4 = *(ao1 + 3); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = data3; + b[ 3] = data4; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ONE; + b[ 7] = ZERO; +#else + data1 = *(ao1 + 0); + data2 = *(ao1 + 1); + data3 = *(ao1 + 2); + data4 = *(ao1 + 3); + + data7 = *(ao2 + 2); + data8 = *(ao2 + 3); + + b[ 0] = data1; + b[ 1] = data2; + b[ 2] = data3; + b[ 3] = data4; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = data7; + b[ 7] = data8; +#endif + ao1 += 4; + ao2 += 4; + b += 8; + } + + X += 2; + i --; + } while (i > 0); + } + + if (m & 1) { + + if (X > posY) { + ao1 += 2; + ao2 += 2; + b += 4; + + } else + if (X < posY) { + data1 = *(ao1 + 0); + data2 = *(ao1 + 1); + data3 = *(ao1 + 2); + data4 = *(ao1 + 3); + + b[ 0] = data1; + b[ 1] = data2; + b[ 2] = data3; + b[ 3] = data4; + + ao1 += lda; + b += 4; + } else { +#ifdef UNIT + data3 = *(ao1 + 2); + data4 = *(ao1 + 3); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = data3; + b[ 3] = data4; +#else + data1 = *(ao1 + 0); + data2 = *(ao1 + 1); + data3 = *(ao1 + 2); + data4 = *(ao1 + 3); + + b[ 0] = data1; + b[ 1] = data2; + b[ 2] = data3; + b[ 3] = data4; +#endif + b += 4; + } + } + + posY += 2; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY * 2 + (posX + 0) * lda; + } else { + ao1 = a + posX * 2 + (posY + 0) * lda; + } + + i = m; + if (i > 0) { + do { + if (X > posY) { + b += 2; + ao1 += 2; + } else + if (X < posY) { + data1 = *(ao1 + 0); + data2 = *(ao1 + 1); + + b[ 0] = data1; + b[ 1] = data2; + b += 2; + ao1 += lda; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + data1 = *(ao1 + 0); + data2 = *(ao1 + 1); + + b[ 0] = data1; + b[ 1] = data2; +#endif + b += 2; + ao1 += 2; + } + + X ++; + i --; + } while (i > 0); + } + + posY += 1; + } + + return 0; +} diff --git a/kernel/generic/ztrmm_ltcopy_4.c b/kernel/generic/ztrmm_ltcopy_4.c new file mode 100644 index 0000000000..e43ed1269c --- /dev/null +++ b/kernel/generic/ztrmm_ltcopy_4.c @@ -0,0 +1,685 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT data17, data18, data19, data20, data21, data22, data23, data24; + FLOAT data25, data26, data27, data28, data29, data30, data31, data32; + FLOAT *ao1, *ao2, *ao3, *ao4; + + lda += lda; + + js = (n >> 2); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posY * 2 + (posX + 0) * lda; + ao2 = a + posY * 2 + (posX + 1) * lda; + ao3 = a + posY * 2 + (posX + 2) * lda; + ao4 = a + posY * 2 + (posX + 3) * lda; + } else { + ao1 = a + posX * 2 + (posY + 0) * lda; + ao2 = a + posX * 2 + (posY + 1) * lda; + ao3 = a + posX * 2 + (posY + 2) * lda; + ao4 = a + posX * 2 + (posY + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X > posY) { + ao1 += 8; + ao2 += 8; + ao3 += 8; + ao4 += 8; + b += 32; + + } else + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + data13 = *(ao2 + 4); + data14 = *(ao2 + 5); + data15 = *(ao2 + 6); + data16 = *(ao2 + 7); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + data21 = *(ao3 + 4); + data22 = *(ao3 + 5); + data23 = *(ao3 + 6); + data24 = *(ao3 + 7); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + data28 = *(ao4 + 3); + data29 = *(ao4 + 4); + data30 = *(ao4 + 5); + data31 = *(ao4 + 6); + data32 = *(ao4 + 7); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + 
+ b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + b += 32; + } else { + +#ifdef UNIT + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + + data13 = *(ao2 + 4); + data14 = *(ao2 + 5); + data15 = *(ao2 + 6); + data16 = *(ao2 + 7); + + data23 = *(ao3 + 6); + data24 = *(ao3 + 7); + + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ONE; + b[11] = ZERO; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + + b[16] = ZERO; + b[17] = ZERO; + b[18] = ZERO; + b[19] = ZERO; + b[20] = ONE; + b[21] = ZERO; + b[22] = data23; + b[23] = data24; + + b[24] = ZERO; + b[25] = ZERO; + b[26] = ZERO; + b[27] = ZERO; + b[28] = ZERO; + b[29] = ZERO; + b[30] = ONE; + b[31] = ZERO; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + data13 = *(ao2 + 4); + data14 = *(ao2 + 5); + data15 = *(ao2 + 6); + data16 = *(ao2 + 7); + + data21 = *(ao3 + 4); + data22 = *(ao3 + 5); + data23 = *(ao3 + 6); + data24 = *(ao3 + 7); + + data31 = *(ao4 + 6); + data32 = *(ao4 + 7); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + + b[16] = ZERO; + b[17] = ZERO; + b[18] = ZERO; + b[19] = ZERO; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + + b[24] = ZERO; + b[25] = ZERO; + b[26] = ZERO; + b[27] = ZERO; + b[28] = ZERO; + b[29] = ZERO; + b[30] = data31; + b[31] = data32; +#endif + ao1 += 8; + ao2 += 8; + ao3 += 8; + ao4 += 8; + b += 32; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X > posY) { + + if (m & 2) { + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + b += 16; + } + + if (m & 1) { + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + b += 8; + } + + } else + if (X < posY) { + if (m & 2) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + data13 = *(ao2 + 4); + data14 = *(ao2 + 5); + data15 = *(ao2 + 6); + data16 = *(ao2 + 7); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + + ao1 += 2 * lda; + ao2 += 2 * lda; + + b += 16; + } + + if (m & 1) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + 
data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + ao1 += lda; + b += 8; + } + + } else { +#ifdef UNIT + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + + if (i >= 2) { + data13 = *(ao2 + 4); + data14 = *(ao2 + 5); + data15 = *(ao2 + 6); + data16 = *(ao2 + 7); + } + + if (i >= 3) { + data23 = *(ao3 + 6); + data24 = *(ao3 + 7); + } + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b += 8; + + if (i >= 2) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ONE; + b[ 3] = ZERO; + b[ 4] = data13; + b[ 5] = data14; + b[ 6] = data15; + b[ 7] = data16; + b += 8; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ONE; + b[ 5] = ZERO; + b[ 6] = data23; + b[ 7] = data24; + b += 8; + } +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + + if (i >= 2) { + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + data13 = *(ao2 + 4); + data14 = *(ao2 + 5); + data15 = *(ao2 + 6); + data16 = *(ao2 + 7); + } + + if (i >= 3) { + data21 = *(ao3 + 4); + data22 = *(ao3 + 5); + data23 = *(ao3 + 6); + data24 = *(ao3 + 7); + } + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + b += 8; + + if (i >= 2) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = data11; + b[ 3] = data12; + b[ 4] = data13; + b[ 5] = data14; + b[ 6] = data15; + b[ 7] = data16; + b += 8; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = data21; + b[ 5] = data22; + b[ 6] = data23; + b[ 7] = data24; + b += 8; + } +#endif + } + } + + posY += 4; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 2){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY * 2 + (posX + 0) * lda; + ao2 = a + posY * 2 + (posX + 1) * lda; + } else { + ao1 = a + posX * 2 + (posY + 0) * lda; + ao2 = a + posX * 2 + (posY + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X > posY) { + ao1 += 4; + ao2 += 4; + b += 8; + + } else + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data09; + b[ 5] = data10; + b[ 6] = data11; + b[ 7] = data12; + + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 8; + + } else { +#ifdef UNIT + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ONE; + b[ 7] = ZERO; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = data11; + b[ 7] = data12; +#endif + ao1 += 4; + ao2 += 4; + + b += 8; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i) { + + if (X > posY) { + 
ao1 += 2; + ao2 += 2; + + b += 4; + } else + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + ao1 += lda; + b += 4; + + } else { +#ifdef UNIT + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = data04; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; +#endif + b += 2; + } + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posY * 2 + (posX + 0) * lda; + } else { + ao1 = a + posX * 2 + (posY + 0) * lda; + } + + i = m; + if (i > 0) { + do { + + if (X > posY) { + b += 2; + ao1 += 2; + } else + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; + ao1 += lda; + b += 2; + } else { + +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; +#endif + b += 2; + } + + X ++; + i --; + } while (i > 0); + } + + posY += 1; + } + + return 0; +} diff --git a/kernel/generic/ztrmm_ltcopy_8.c b/kernel/generic/ztrmm_ltcopy_8.c new file mode 100644 index 0000000000..e25d9221e7 --- /dev/null +++ b/kernel/generic/ztrmm_ltcopy_8.c @@ -0,0 +1,876 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, ii; + BLASLONG X; + + FLOAT *a01, *a02, *a03 ,*a04, *a05, *a06, *a07, *a08; + + lda *= 2; + + js = (n >> 3); + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX + 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + a05 = a + posY * 2 + (posX + 4) * lda; + a06 = a + posY * 2 + (posX + 5) * lda; + a07 = a + posY * 2 + (posX + 6) * lda; + a08 = a + posY * 2 + (posX + 7) * lda; + } else { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + a05 = a + posX * 2 + (posY + 4) * lda; + a06 = a + posX * 2 + (posY + 5) * lda; + a07 = a + posX * 2 + (posY + 6) * lda; + a08 = a + posX * 2 + (posY + 7) * lda; + } + + i = (m >> 3); + if (i > 0) { + do { + if (X > posY) { + a01 += 16; + a02 += 16; + a03 += 16; + a04 += 16; + a05 += 16; + a06 += 16; + a07 += 16; + a08 += 16; + b += 128; + } else + if (X < posY) { + + for (ii = 0; ii < 8; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + a01 += lda; + b += 16; + } + + a02 += 8 * lda; + a03 += 8 * lda; + a04 += 8 * lda; + a05 += 8 * lda; + a06 += 8 * lda; + a07 += 8 * lda; + a08 += 8 * lda; + + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + b[ 16] = ZERO; + b[ 17] = ZERO; +#ifdef UNIT + b[ 18] = ONE; + b[ 19] = ZERO; +#else + b[ 18] = *(a02 + 2); + b[ 19] = *(a02 + 3); +#endif + b[ 20] = *(a02 + 4); + b[ 21] = *(a02 + 5); + b[ 22] = *(a02 + 6); + b[ 23] = *(a02 + 7); + b[ 24] = *(a02 + 8); + b[ 25] = *(a02 + 9); + b[ 26] = *(a02 + 10); + b[ 27] = *(a02 + 11); + b[ 28] = *(a02 + 12); + b[ 29] = *(a02 + 13); + b[ 30] = *(a02 + 14); + b[ 31] = *(a02 + 15); + + b[ 32] = ZERO; + b[ 33] = ZERO; + b[ 34] = ZERO; + b[ 35] = ZERO; +#ifdef UNIT + b[ 36] = ONE; + b[ 37] = ZERO; +#else + b[ 36] = *(a03 + 4); + b[ 37] = *(a03 + 5); +#endif + b[ 38] = *(a03 + 6); + b[ 39] = *(a03 + 7); + b[ 40] = *(a03 + 8); + b[ 41] = *(a03 + 9); + b[ 42] = *(a03 + 10); + b[ 43] = *(a03 + 11); + b[ 44] = *(a03 + 12); + b[ 45] = *(a03 + 13); + b[ 46] = *(a03 + 14); + b[ 47] = *(a03 + 15); + + b[ 48] = ZERO; + b[ 49] = ZERO; + b[ 50] = ZERO; + b[ 51] = ZERO; + b[ 52] = ZERO; + b[ 53] = ZERO; +#ifdef UNIT + b[ 54] = ONE; + b[ 55] = ZERO; +#else + b[ 54] = *(a04 + 6); + b[ 55] = *(a04 + 7); +#endif + b[ 56] = *(a04 + 8); + b[ 57] = *(a04 + 9); + b[ 58] = *(a04 + 10); + b[ 59] = *(a04 + 11); + b[ 60] = *(a04 + 12); + b[ 61] = *(a04 + 13); + b[ 62] = *(a04 + 14); + b[ 63] = *(a04 + 15); + + b[ 64] = ZERO; + b[ 65] = ZERO; + b[ 66] = 
ZERO; + b[ 67] = ZERO; + b[ 68] = ZERO; + b[ 69] = ZERO; + b[ 70] = ZERO; + b[ 71] = ZERO; +#ifdef UNIT + b[ 72] = ONE; + b[ 73] = ZERO; +#else + b[ 72] = *(a05 + 8); + b[ 73] = *(a05 + 9); +#endif + b[ 74] = *(a05 + 10); + b[ 75] = *(a05 + 11); + b[ 76] = *(a05 + 12); + b[ 77] = *(a05 + 13); + b[ 78] = *(a05 + 14); + b[ 79] = *(a05 + 15); + + b[ 80] = ZERO; + b[ 81] = ZERO; + b[ 82] = ZERO; + b[ 83] = ZERO; + b[ 84] = ZERO; + b[ 85] = ZERO; + b[ 86] = ZERO; + b[ 87] = ZERO; + b[ 88] = ZERO; + b[ 89] = ZERO; +#ifdef UNIT + b[ 90] = ONE; + b[ 91] = ZERO; +#else + b[ 90] = *(a06 + 10); + b[ 91] = *(a06 + 11); +#endif + b[ 92] = *(a06 + 12); + b[ 93] = *(a06 + 13); + b[ 94] = *(a06 + 14); + b[ 95] = *(a06 + 15); + + b[ 96] = ZERO; + b[ 97] = ZERO; + b[ 98] = ZERO; + b[ 99] = ZERO; + b[100] = ZERO; + b[101] = ZERO; + b[102] = ZERO; + b[103] = ZERO; + b[104] = ZERO; + b[105] = ZERO; + b[106] = ZERO; + b[107] = ZERO; +#ifdef UNIT + b[108] = ONE; + b[109] = ZERO; +#else + b[108] = *(a07 + 12); + b[109] = *(a07 + 13); +#endif + b[110] = *(a07 + 14); + b[111] = *(a07 + 15); + + b[112] = ZERO; + b[113] = ZERO; + b[114] = ZERO; + b[115] = ZERO; + b[116] = ZERO; + b[117] = ZERO; + b[118] = ZERO; + b[119] = ZERO; + b[120] = ZERO; + b[121] = ZERO; + b[122] = ZERO; + b[123] = ZERO; + b[124] = ZERO; + b[125] = ZERO; +#ifdef UNIT + b[126] = ONE; + b[127] = ZERO; +#else + b[126] = *(a08 + 14); + b[127] = *(a08 + 15); +#endif + + a01 += 16; + a02 += 16; + a03 += 16; + a04 += 16; + a05 += 16; + a06 += 16; + a07 += 16; + a08 += 16; + b += 128; + } + + X += 8; + i --; + } while (i > 0); + } + + i = (m & 7); + if (i > 0) { + if (X > posY) { + a01 += 2 * i; + a02 += 2 * i; + a03 += 2 * i; + a04 += 2 * i; + a05 += 2 * i; + a06 += 2 * i; + a07 += 2 * i; + a08 += 2 * i; + b += 16 * i; + } else + if (X < posY) { + for (ii = 0; ii < i; ii++){ + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + a01 += lda; + a02 += lda; + a03 += lda; + a04 += lda; + a05 += lda; + a06 += lda; + a07 += lda; + a08 += lda; + b += 16; + } + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + b += 16; + + if (i >= 2) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = *(a02 + 4); + b[ 5] = *(a02 + 5); + b[ 6] = *(a02 + 6); + b[ 7] = *(a02 + 7); + + b[ 8] = *(a02 + 8); + b[ 9] = *(a02 + 9); + b[10] = *(a02 + 10); + b[11] = *(a02 + 11); + b[12] = *(a02 + 12); + b[13] = *(a02 + 13); + b[14] = *(a02 + 14); + b[15] = *(a02 + 15); + b += 16; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = *(a03 + 6); + b[ 7] = *(a03 + 7); + + b[ 8] = *(a03 + 8); + b[ 9] = *(a03 + 9); + b[10] = *(a03 + 10); + b[11] = *(a03 + 11); + 
b[12] = *(a03 + 12); + b[13] = *(a03 + 13); + b[14] = *(a03 + 14); + b[15] = *(a03 + 15); + b += 16; + } + + if (i >= 4) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(a04 + 6); + b[ 7] = *(a04 + 7); +#endif + + b[ 8] = *(a04 + 8); + b[ 9] = *(a04 + 9); + b[10] = *(a04 + 10); + b[11] = *(a04 + 11); + b[12] = *(a04 + 12); + b[13] = *(a04 + 13); + b[14] = *(a04 + 14); + b[15] = *(a04 + 15); + b += 16; + } + + if (i >= 5) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + +#ifdef UNIT + b[ 8] = ONE; + b[ 9] = ZERO; +#else + b[ 8] = *(a05 + 8); + b[ 9] = *(a05 + 9); +#endif + b[10] = *(a05 + 10); + b[11] = *(a05 + 11); + b[12] = *(a05 + 12); + b[13] = *(a05 + 13); + b[14] = *(a05 + 14); + b[15] = *(a05 + 15); + b += 16; + } + + if (i >= 6) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = ZERO; + b[ 9] = ZERO; +#ifdef UNIT + b[10] = ONE; + b[11] = ZERO; +#else + b[10] = *(a06 + 10); + b[11] = *(a06 + 11); +#endif + b[12] = *(a06 + 12); + b[13] = *(a06 + 13); + b[14] = *(a06 + 14); + b[15] = *(a06 + 15); + b += 16; + } + + if (i >= 7) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; +#ifdef UNIT + b[12] = ONE; + b[13] = ZERO; +#else + b[12] = *(a07 + 12); + b[13] = *(a07 + 13); +#endif + b[14] = *(a07 + 14); + b[15] = *(a07 + 15); + b += 16; + } + } + } + + posY += 8; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 4){ + X = posX; + + if (posX <= posY) { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX + 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + } else { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X > posY) { + a01 += 8; + a02 += 8; + a03 += 8; + a04 += 8; + b += 32; + } else + if (X < posY) { + for (ii = 0; ii < 4; ii++){ + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + a01 += lda; + b += 8; + } + + a02 += 4 * lda; + a03 += 4 * lda; + a04 += 4 * lda; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + b[ 8] = ZERO; + b[ 9] = ZERO; +#ifdef UNIT + b[ 10] = ONE; + b[ 11] = ZERO; +#else + b[ 10] = *(a02 + 2); + b[ 11] = *(a02 + 3); +#endif + b[ 12] = *(a02 + 4); + b[ 13] = *(a02 + 5); + b[ 14] = *(a02 + 6); + b[ 15] = *(a02 + 7); + + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; +#ifdef UNIT + b[ 20] = ONE; + b[ 21] = ZERO; +#else + b[ 20] = *(a03 + 4); + b[ 21] = *(a03 + 5); +#endif + b[ 22] = *(a03 + 6); + b[ 23] = *(a03 + 7); + + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; +#ifdef UNIT + b[ 30] = ONE; + b[ 31] = ZERO; +#else + b[ 30] = *(a04 + 6); + b[ 31] = *(a04 + 7); +#endif + + a01 += 8; + a02 += 8; + a03 += 
8; + a04 += 8; + b += 32; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i > 0) { + if (X > posY) { + a01 += 2 * i; + a02 += 2 * i; + a03 += 2 * i; + a04 += 2 * i; + b += 8 * i; + } else + if (X < posY) { + for (ii = 0; ii < i; ii++){ + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + a01 += lda; + a02 += lda; + a03 += lda; + a04 += lda; + b += 8; + } + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + b += 8; + + if (i >= 2) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = *(a02 + 4); + b[ 5] = *(a02 + 5); + b[ 6] = *(a02 + 6); + b[ 7] = *(a02 + 7); + b += 8; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = *(a03 + 6); + b[ 7] = *(a03 + 7); + b += 8; + } + } + } + posY += 4; + } + + if (n & 2){ + X = posX; + + if (posX <= posY) { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + } else { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X > posY) { + a01 += 4; + a02 += 4; + b += 8; + } else + if (X < posY) { + b[0] = *(a01 + 0); + b[1] = *(a01 + 1); + b[2] = *(a01 + 2); + b[3] = *(a01 + 3); + b[4] = *(a02 + 0); + b[5] = *(a02 + 1); + b[6] = *(a02 + 2); + b[7] = *(a02 + 3); + a01 += 2 * lda; + a02 += 2 * lda; + b += 8; + } else { +#ifdef UNIT + b[0] = ONE; + b[1] = ZERO; +#else + b[0] = *(a01 + 0); + b[1] = *(a01 + 1); +#endif + b[2] = *(a01 + 2); + b[3] = *(a01 + 3); + + b[4] = ZERO; + b[5] = ZERO; +#ifdef UNIT + b[6] = ONE; + b[7] = ZERO; +#else + b[6] = *(a02 + 2); + b[7] = *(a02 + 3); +#endif + a01 += 4; + a02 += 4; + b += 8; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i > 0) { + if (X > posY) { + a01 += 2; + a02 += 2; + b += 4; + } else + if (X < posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + + a01 += lda; + a02 += lda; + b += 4; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b += 4; + } + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + a01 = a + posY * 2 + (posX + 0) * lda; + } else { + a01 = a + posX * 2 + (posY + 0) * lda; + } + + i = m; + if (i > 0) { + do { + + if (X > posY) { + a01 += 2; + b += 2; + } else + if (X < posY) { + b[0] = *(a01 + 0); + b[1] = *(a01 + 1); + a01 += lda; + b += 2; + } else { +#ifdef UNIT + b[0] = ONE; + b[1] = ZERO; +#else + b[0] = *(a01 + 0); + b[1] = *(a01 + 1); +#endif + a01 += 2; + b += 2; + } + + X += 1; + i --; + } while (i > 0); + } + posY += 1; + } + + return 0; +} diff --git a/kernel/generic/ztrmm_uncopy_1.c b/kernel/generic/ztrmm_uncopy_1.c new file mode 100644 index 0000000000..595f009554 --- /dev/null +++ b/kernel/generic/ztrmm_uncopy_1.c @@ -0,0 +1,109 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02; + FLOAT *ao1; + + lda += lda; + + js = n; + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posX * 2 + (posY + 0) * lda; + } else { + ao1 = a + posY * 2 + (posX + 0) * lda; + } + + i = m; + if (i > 0) { + do { + + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; + + ao1 += 2; + b += 2; + + } else + if (X > posY) { + ao1 += lda; + b += 2; + + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; +#endif + + ao1 += lda; + b += 2; + } + + X ++; + i --; + } while (i > 0); + } + + posY ++; + js --; + } while (js > 0); + } /* End of main loop */ + + return 0; +} diff --git a/kernel/generic/ztrmm_uncopy_2.c b/kernel/generic/ztrmm_uncopy_2.c new file mode 100644 index 0000000000..6beddf5b94 --- /dev/null +++ b/kernel/generic/ztrmm_uncopy_2.c @@ -0,0 +1,239 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02, data03, data04; + FLOAT data05, data06, data07, data08; + FLOAT *ao1, *ao2; + + lda += lda; + + js = (n >> 1); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posX * 2 + (posY + 0) * lda; + ao2 = a + posX * 2 + (posY + 1) * lda; + } else { + ao1 = a + posY * 2 + (posX + 0) * lda; + ao2 = a + posY * 2 + (posX + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data05; + b[ 3] = data06; + b[ 4] = data03; + b[ 5] = data04; + b[ 6] = data07; + b[ 7] = data08; + + ao1 += 4; + ao2 += 4; + b += 8; + + } else + if (X > posY) { + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 8; + + } else { + +#ifdef UNIT + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = data05; + b[ 3] = data06; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ONE; + b[ 7] = ZERO; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao2 + 0); + data06 = *(ao2 + 1); + data07 = *(ao2 + 2); + data08 = *(ao2 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data05; + b[ 3] = data06; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = data07; + b[ 7] = data08; +#endif + + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 8; + } + + X += 2; + i --; + } while (i > 0); + } + + if (m & 1) { + + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + ao1 += 2; + ao2 += 2; + b += 4; + } else + if (X > posY) { + ao1 += lda; + b += 4; + } else { +#ifdef UNIT + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = data03; + b[ 3] = data04; +#else 
+ data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao2 + 0); + data04 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; +#endif + b += 4; + } + } + + posY += 2; + js --; + } while (js > 0); + } /* End of main loop */ + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX * 2 + (posY + 0) * lda; + } else { + ao1 = a + posY * 2 + (posX + 0) * lda; + } + + i = m; + if (m > 0) { + do { + + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; + ao1 += 2; + b += 2; + } else + if (X > posY) { + b += 2; + ao1 += lda; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; +#endif + b += 2; + ao1 += lda; + } + + X += 1; + i --; + } while (i > 0); + } + } + + return 0; +} diff --git a/kernel/generic/ztrmm_uncopy_4.c b/kernel/generic/ztrmm_uncopy_4.c new file mode 100644 index 0000000000..f885b0dc29 --- /dev/null +++ b/kernel/generic/ztrmm_uncopy_4.c @@ -0,0 +1,679 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT data17, data18, data19, data20, data21, data22, data23, data24; + FLOAT data25, data26, data27, data28, data29, data30, data31, data32; + FLOAT *ao1, *ao2, *ao3, *ao4; + + lda += lda; + + js = (n >> 2); + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posX * 2 + (posY + 0) * lda; + ao2 = a + posX * 2 + (posY + 1) * lda; + ao3 = a + posX * 2 + (posY + 2) * lda; + ao4 = a + posX * 2 + (posY + 3) * lda; + } else { + ao1 = a + posY * 2 + (posX + 0) * lda; + ao2 = a + posY * 2 + (posX + 1) * lda; + ao3 = a + posY * 2 + (posX + 2) * lda; + ao4 = a + posY * 2 + (posX + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + data13 = *(ao2 + 4); + data14 = *(ao2 + 5); + data15 = *(ao2 + 6); + data16 = *(ao2 + 7); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + data21 = *(ao3 + 4); + data22 = *(ao3 + 5); + data23 = *(ao3 + 6); + data24 = *(ao3 + 7); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + data28 = *(ao4 + 3); + data29 = *(ao4 + 4); + data30 = *(ao4 + 5); + data31 = *(ao4 + 6); + data32 = *(ao4 + 7); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data09; + b[ 3] = data10; + b[ 4] = data17; + b[ 5] = data18; + b[ 6] = data25; + b[ 7] = data26; + + b[ 8] = data03; + b[ 9] = data04; + b[10] = data11; + b[11] = data12; + b[12] = data19; + b[13] = data20; + b[14] = data27; + b[15] = data28; + + b[16] = data05; + b[17] = data06; + b[18] = data13; + b[19] = data14; + b[20] = data21; + b[21] = data22; + b[22] = data29; + b[23] = data30; + + b[24] = data07; + b[25] = data08; + b[26] = data15; + b[27] = data16; + b[28] = data23; + b[29] = data24; + b[30] = data31; + b[31] = data32; + + ao1 += 8; + ao2 += 8; + ao3 += 8; + ao4 += 8; + b += 32; + } else + if (X > posY) { + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + b += 32; + } else { +#ifdef UNIT + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + data28 = *(ao4 + 3); + data29 = *(ao4 + 4); + data30 = *(ao4 + 5); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = data09; + b[ 3] = data10; + b[ 4] = data17; + b[ 5] = data18; + b[ 6] = data25; + b[ 7] = data26; + + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ONE; + b[11] = ZERO; + b[12] = data19; + b[13] = data20; + b[14] = data27; + b[15] = data28; + + b[16] = ZERO; + b[17] = ZERO; + b[18] = ZERO; + b[19] = ZERO; + b[20] = ONE; + b[21] = ZERO; + b[22] = data29; + b[23] = data30; + + b[24] = ZERO; + b[25] = ZERO; + b[26] = ZERO; + b[27] = ZERO; + b[28] = ZERO; + b[29] = ZERO; + b[30] = ONE; + b[31] = ZERO; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + + 
data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + data21 = *(ao3 + 4); + data22 = *(ao3 + 5); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + data28 = *(ao4 + 3); + data29 = *(ao4 + 4); + data30 = *(ao4 + 5); + data31 = *(ao4 + 6); + data32 = *(ao4 + 7); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data09; + b[ 3] = data10; + b[ 4] = data17; + b[ 5] = data18; + b[ 6] = data25; + b[ 7] = data26; + + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = data11; + b[11] = data12; + b[12] = data19; + b[13] = data20; + b[14] = data27; + b[15] = data28; + + b[16] = ZERO; + b[17] = ZERO; + b[18] = ZERO; + b[19] = ZERO; + b[20] = data21; + b[21] = data22; + b[22] = data29; + b[23] = data30; + + b[24] = ZERO; + b[25] = ZERO; + b[26] = ZERO; + b[27] = ZERO; + b[28] = ZERO; + b[29] = ZERO; + b[30] = data31; + b[31] = data32; +#endif + ao1 += 8; + ao2 += 8; + ao3 += 8; + ao4 += 8; + + b += 32; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X < posY) { + + if (m & 2) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + data28 = *(ao4 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data09; + b[ 3] = data10; + b[ 4] = data17; + b[ 5] = data18; + b[ 6] = data25; + b[ 7] = data26; + + b[ 8] = data03; + b[ 9] = data04; + b[10] = data11; + b[11] = data12; + b[12] = data19; + b[13] = data20; + b[14] = data27; + b[15] = data28; + + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + b += 16; + } + + if (m & 1) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data09; + b[ 3] = data10; + b[ 4] = data17; + b[ 5] = data18; + b[ 6] = data25; + b[ 7] = data26; + + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + b += 8; + } + + } else + if (X > posY) { + if (m & 2) { + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 16; + } + + if (m & 1) { + ao1 += lda; + b += 8; + } + + } else { + +#ifdef UNIT + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + + if (i >= 2) { + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + data27 = *(ao4 + 2); + data28 = *(ao4 + 3); + } + + if (i >= 3) { + data29 = *(ao4 + 4); + data30 = *(ao4 + 5); + } + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = data09; + b[ 3] = data10; + b[ 4] = data17; + b[ 5] = data18; + b[ 6] = data25; + b[ 7] = data26; + b += 8; + + if (i >= 2) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ONE; + b[ 3] = ZERO; + b[ 4] = data19; + b[ 5] = data20; + b[ 6] = data27; + b[ 7] = data28; + b += 8; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ONE; + b[ 5] = ZERO; + b[ 6] = data29; + b[ 7] = data30; + b += 8; + } +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + + if (i >= 2) { + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + data27 = *(ao4 + 2); + data28 = *(ao4 + 3); + } + + if (i >= 3) { + data21 
= *(ao3 + 4); + data22 = *(ao3 + 5); + data29 = *(ao4 + 4); + data30 = *(ao4 + 5); + } + + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data09; + b[ 3] = data10; + b[ 4] = data17; + b[ 5] = data18; + b[ 6] = data25; + b[ 7] = data26; + b += 8; + + if (i >= 2) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = data11; + b[ 3] = data12; + b[ 4] = data19; + b[ 5] = data20; + b[ 6] = data27; + b[ 7] = data28; + b += 8; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = data21; + b[ 5] = data22; + b[ 6] = data29; + b[ 7] = data30; + b += 8; + } +#endif + } + } + + posY += 4; + js --; + } while (js > 0); + } /* End of main loop */ + + if (n & 2){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX * 2 + (posY + 0) * lda; + ao2 = a + posX * 2 + (posY + 1) * lda; + } else { + ao1 = a + posY * 2 + (posX + 0) * lda; + ao2 = a + posY * 2 + (posX + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data09; + b[ 3] = data10; + b[ 4] = data03; + b[ 5] = data04; + b[ 6] = data11; + b[ 7] = data12; + + ao1 += 4; + ao2 += 4; + b += 8; + + } else + if (X > posY) { + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 8; + + } else { +#ifdef UNIT + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = data09; + b[ 3] = data10; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ONE; + b[ 7] = ZERO; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data09; + b[ 3] = data10; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = data11; + b[ 7] = data12; +#endif + ao1 += 4; + ao2 += 4; + b += 8; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i) { + + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data09; + b[ 3] = data10; + ao1 += 2; + ao2 += 2; + b += 4; + } else + if (X > posY) { + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + } else { +#ifdef UNIT + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = data09; + b[ 3] = data10; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data09; + b[ 3] = data10; +#endif + ao1 += 2; + ao2 += 2; + b += 4; + } + } + + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX * 2 + (posY + 0) * lda; + } else { + ao1 = a + posY * 2 + (posX + 0) * lda; + } + + i = m; + if (m > 0) { + do { + if (X < posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + b[ 0] = data01; + b[ 1] = data02; + ao1 += 2; + b += 2; + } else + if (X > posY) { + ao1 += lda; + b += 2; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + b[ 0] = data01; + b[ 1] = data02; +#endif + ao1 += 2; + b += 2; + } + X += 1; + i --; + } while (i > 0); + } + } + + return 0; +} diff --git a/kernel/generic/ztrmm_uncopy_8.c b/kernel/generic/ztrmm_uncopy_8.c new file mode 100644 index 0000000000..c02c1dedf0 --- /dev/null +++ b/kernel/generic/ztrmm_uncopy_8.c @@ -0,0 +1,876 @@ +/*********************************************************************/ 
+/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, ii; + BLASLONG X; + + FLOAT *ao1, *ao2, *ao3, *ao4, *ao5, *ao6, *ao7, *ao8; + + lda += lda; + + js = (n >> 3); + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posX * 2 + (posY + 0) * lda; + ao2 = a + posX * 2 + (posY + 1) * lda; + ao3 = a + posX * 2 + (posY + 2) * lda; + ao4 = a + posX * 2 + (posY + 3) * lda; + ao5 = a + posX * 2 + (posY + 4) * lda; + ao6 = a + posX * 2 + (posY + 5) * lda; + ao7 = a + posX * 2 + (posY + 6) * lda; + ao8 = a + posX * 2 + (posY + 7) * lda; + } else { + ao1 = a + posY * 2 + (posX + 0) * lda; + ao2 = a + posY * 2 + (posX + 1) * lda; + ao3 = a + posY * 2 + (posX + 2) * lda; + ao4 = a + posY * 2 + (posX + 3) * lda; + ao5 = a + posY * 2 + (posX + 4) * lda; + ao6 = a + posY * 2 + (posX + 5) * lda; + ao7 = a + posY * 2 + (posX + 6) * lda; + ao8 = a + posY * 2 + (posX + 7) * lda; + } + + i = (m >> 3); + if (i > 0) { + do { + if (X < posY) { + + for (ii = 0; ii < 8; ii++){ + + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); + b[ 2] = *(ao2 + 0); + b[ 3] = *(ao2 + 1); + b[ 4] = *(ao3 + 0); + b[ 5] = *(ao3 + 1); + b[ 6] = *(ao4 + 0); + b[ 7] = *(ao4 + 1); + + b[ 8] = *(ao5 + 0); + b[ 9] = *(ao5 + 1); + b[ 10] = *(ao6 + 0); + b[ 11] = *(ao6 + 1); + b[ 12] = *(ao7 + 0); + b[ 13] = *(ao7 + 1); + b[ 14] = *(ao8 + 0); + b[ 15] = *(ao8 + 1); + + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + ao5 += 2; + ao6 += 2; + ao7 += 2; + ao8 += 2; + b += 16; + } + } else + if (X > posY) { + ao1 += 8 * lda; + ao2 += 8 * lda; + ao3 += 8 * lda; + ao4 += 8 * lda; + ao5 += 8 * lda; + ao6 += 8 * lda; + ao7 += 8 * lda; + ao8 += 8 * lda; + + b += 128; + + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); +#endif + b[ 2] = *(ao2 + 0); + b[ 3] = *(ao2 + 1); + b[ 4] = *(ao3 + 0); + b[ 5] = *(ao3 + 1); + b[ 6] = *(ao4 + 0); + b[ 7] = *(ao4 + 1); + + b[ 8] = *(ao5 + 0); + b[ 9] = *(ao5 + 1); + b[ 10] = *(ao6 + 0); + b[ 11] = *(ao6 + 1); + b[ 12] = *(ao7 + 0); + b[ 13] = *(ao7 + 1); + b[ 14] = *(ao8 + 0); + b[ 15] = *(ao8 + 1); + + b[ 16] = ZERO; + b[ 17] = ZERO; +#ifdef UNIT + b[ 18] = ONE; + b[ 19] = ZERO; +#else + b[ 18] = *(ao2 + 2); + b[ 19] = *(ao2 + 3); +#endif + b[ 20] = *(ao3 + 2); + b[ 21] = *(ao3 + 3); + b[ 22] = *(ao4 + 2); + b[ 23] = *(ao4 + 3); + b[ 24] = *(ao5 + 2); + b[ 25] = *(ao5 + 3); + b[ 26] = *(ao6 + 2); + b[ 27] = *(ao6 + 3); + b[ 28] = *(ao7 + 2); + b[ 29] = *(ao7 + 3); + b[ 30] = *(ao8 + 2); + b[ 31] = *(ao8 + 3); + + b[ 32] = ZERO; + b[ 33] = ZERO; + b[ 34] = ZERO; + b[ 35] = ZERO; +#ifdef UNIT + b[ 36] = ONE; + b[ 37] = ZERO; +#else + b[ 36] = *(ao3 + 4); + b[ 37] = *(ao3 + 5); +#endif + b[ 38] = *(ao4 + 4); + b[ 39] = *(ao4 + 5); + b[ 40] = *(ao5 + 4); + b[ 41] = *(ao5 + 5); + b[ 42] = *(ao6 + 4); + b[ 43] = *(ao6 + 5); + b[ 44] = *(ao7 + 4); + b[ 45] = *(ao7 + 5); + b[ 46] = *(ao8 + 4); + b[ 47] = *(ao8 + 5); + + b[ 48] = ZERO; + b[ 49] = ZERO; + b[ 50] = ZERO; + b[ 51] = ZERO; + b[ 52] = ZERO; + b[ 53] = ZERO; +#ifdef UNIT + b[ 54] = ONE; + b[ 55] = ZERO; +#else + b[ 54] = *(ao4 + 6); + b[ 55] = *(ao4 + 7); +#endif + b[ 56] = *(ao5 + 6); + b[ 57] = *(ao5 + 7); + b[ 58] = *(ao6 + 6); + b[ 59] = *(ao6 + 7); + b[ 60] = *(ao7 + 6); + b[ 61] = *(ao7 + 7); + b[ 62] = *(ao8 + 6); + b[ 63] = *(ao8 + 7); + + b[ 64] = ZERO; + b[ 65] = ZERO; + b[ 66] = ZERO; + b[ 67] = ZERO; + b[ 68] = 
ZERO; + b[ 69] = ZERO; + b[ 70] = ZERO; + b[ 71] = ZERO; +#ifdef UNIT + b[ 72] = ONE; + b[ 73] = ZERO; +#else + b[ 72] = *(ao5 + 8); + b[ 73] = *(ao5 + 9); +#endif + b[ 74] = *(ao6 + 8); + b[ 75] = *(ao6 + 9); + b[ 76] = *(ao7 + 8); + b[ 77] = *(ao7 + 9); + b[ 78] = *(ao8 + 8); + b[ 79] = *(ao8 + 9); + + b[ 80] = ZERO; + b[ 81] = ZERO; + b[ 82] = ZERO; + b[ 83] = ZERO; + b[ 84] = ZERO; + b[ 85] = ZERO; + b[ 86] = ZERO; + b[ 87] = ZERO; + b[ 88] = ZERO; + b[ 89] = ZERO; +#ifdef UNIT + b[ 90] = ONE; + b[ 91] = ZERO; +#else + b[ 90] = *(ao6 + 10); + b[ 91] = *(ao6 + 11); +#endif + b[ 92] = *(ao7 + 10); + b[ 93] = *(ao7 + 11); + b[ 94] = *(ao8 + 10); + b[ 95] = *(ao8 + 11); + + b[ 96] = ZERO; + b[ 97] = ZERO; + b[ 98] = ZERO; + b[ 99] = ZERO; + b[100] = ZERO; + b[101] = ZERO; + b[102] = ZERO; + b[103] = ZERO; + b[104] = ZERO; + b[105] = ZERO; + b[106] = ZERO; + b[107] = ZERO; +#ifdef UNIT + b[108] = ONE; + b[109] = ZERO; +#else + b[108] = *(ao7 + 12); + b[109] = *(ao7 + 13); +#endif + b[110] = *(ao8 + 12); + b[111] = *(ao8 + 13); + + b[112] = ZERO; + b[113] = ZERO; + b[114] = ZERO; + b[115] = ZERO; + b[116] = ZERO; + b[117] = ZERO; + b[118] = ZERO; + b[119] = ZERO; + b[120] = ZERO; + b[121] = ZERO; + b[122] = ZERO; + b[123] = ZERO; + b[124] = ZERO; + b[125] = ZERO; +#ifdef UNIT + b[126] = ONE; + b[127] = ZERO; +#else + b[126] = *(ao8 + 14); + b[127] = *(ao8 + 15); +#endif + + ao1 += 8 * lda; + ao2 += 8 * lda; + ao3 += 8 * lda; + ao4 += 8 * lda; + ao5 += 8 * lda; + ao6 += 8 * lda; + ao7 += 8 * lda; + ao8 += 8 * lda; + b += 128; + } + + X += 8; + i --; + } while (i > 0); + } + + i = (m & 7); + if (i) { + + if (X < posY) { + for (ii = 0; ii < i; ii++){ + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); + b[ 2] = *(ao2 + 0); + b[ 3] = *(ao2 + 1); + b[ 4] = *(ao3 + 0); + b[ 5] = *(ao3 + 1); + b[ 6] = *(ao4 + 0); + b[ 7] = *(ao4 + 1); + + b[ 8] = *(ao5 + 0); + b[ 9] = *(ao5 + 1); + b[ 10] = *(ao6 + 0); + b[ 11] = *(ao6 + 1); + b[ 12] = *(ao7 + 0); + b[ 13] = *(ao7 + 1); + b[ 14] = *(ao8 + 0); + b[ 15] = *(ao8 + 1); + + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + ao5 += 2; + ao6 += 2; + ao7 += 2; + ao8 += 2; + b += 16; + } + } else + if (X > posY) { + ao1 += i * lda; + ao2 += i * lda; + ao3 += i * lda; + ao4 += i * lda; + ao5 += i * lda; + ao6 += i * lda; + ao7 += i * lda; + ao8 += i * lda; + b += 16 * i; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); +#endif + b[ 2] = *(ao2 + 0); + b[ 3] = *(ao2 + 1); + b[ 4] = *(ao3 + 0); + b[ 5] = *(ao3 + 1); + b[ 6] = *(ao4 + 0); + b[ 7] = *(ao4 + 1); + b[ 8] = *(ao5 + 0); + b[ 9] = *(ao5 + 1); + b[10] = *(ao6 + 0); + b[11] = *(ao6 + 1); + b[12] = *(ao7 + 0); + b[13] = *(ao7 + 1); + b[14] = *(ao8 + 0); + b[15] = *(ao8 + 1); + b += 16; + + if(i >= 2) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(ao2 + 2); + b[ 3] = *(ao2 + 3); +#endif + b[ 4] = *(ao3 + 2); + b[ 5] = *(ao3 + 3); + b[ 6] = *(ao4 + 2); + b[ 7] = *(ao4 + 3); + b[ 8] = *(ao5 + 2); + b[ 9] = *(ao5 + 3); + b[10] = *(ao6 + 2); + b[11] = *(ao6 + 3); + b[12] = *(ao7 + 2); + b[13] = *(ao7 + 3); + b[14] = *(ao8 + 2); + b[15] = *(ao8 + 3); + b += 16; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(ao3 + 4); + b[ 5] = *(ao3 + 5); +#endif + b[ 6] = *(ao4 + 4); + b[ 7] = *(ao4 + 5); + b[ 8] = *(ao5 + 4); + b[ 9] = *(ao5 + 5); + b[10] = *(ao6 + 4); + b[11] = *(ao6 + 5); + b[12] = *(ao7 + 4); + b[13] = *(ao7 + 
5); + b[14] = *(ao8 + 4); + b[15] = *(ao8 + 5); + b += 16; + } + + if (i >= 4) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(ao4 + 6); + b[ 7] = *(ao4 + 7); +#endif + b[ 8] = *(ao5 + 6); + b[ 9] = *(ao5 + 7); + b[10] = *(ao6 + 6); + b[11] = *(ao6 + 7); + b[12] = *(ao7 + 6); + b[13] = *(ao7 + 7); + b[14] = *(ao8 + 6); + b[15] = *(ao8 + 7); + b += 16; + } + + if (i >= 5) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; +#ifdef UNIT + b[ 8] = ONE; + b[ 9] = ZERO; +#else + b[ 8] = *(ao5 + 8); + b[ 9] = *(ao5 + 9); +#endif + b[10] = *(ao6 + 8); + b[11] = *(ao6 + 9); + b[12] = *(ao7 + 8); + b[13] = *(ao7 + 9); + b[14] = *(ao8 + 8); + b[15] = *(ao8 + 9); + b += 16; + } + + if (i >= 6) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; +#ifdef UNIT + b[10] = ONE; + b[11] = ZERO; +#else + b[10] = *(ao6 + 10); + b[11] = *(ao6 + 11); +#endif + b[12] = *(ao7 + 10); + b[13] = *(ao7 + 11); + b[14] = *(ao8 + 10); + b[15] = *(ao8 + 11); + b += 16; + } + + if (i >= 7) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; +#ifdef UNIT + b[12] = ONE; + b[13] = ZERO; +#else + b[12] = *(ao7 + 12); + b[13] = *(ao7 + 13); +#endif + b[14] = *(ao8 + 12); + b[15] = *(ao8 + 13); + b += 16; + } + } + } + + posY += 8; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 4){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX * 2 + (posY + 0) * lda; + ao2 = a + posX * 2 + (posY + 1) * lda; + ao3 = a + posX * 2 + (posY + 2) * lda; + ao4 = a + posX * 2 + (posY + 3) * lda; + } else { + ao1 = a + posY * 2 + (posX + 0) * lda; + ao2 = a + posY * 2 + (posX + 1) * lda; + ao3 = a + posY * 2 + (posX + 2) * lda; + ao4 = a + posY * 2 + (posX + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X < posY) { + for (ii = 0; ii < 4; ii++){ + + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); + b[ 2] = *(ao2 + 0); + b[ 3] = *(ao2 + 1); + b[ 4] = *(ao3 + 0); + b[ 5] = *(ao3 + 1); + b[ 6] = *(ao4 + 0); + b[ 7] = *(ao4 + 1); + + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + b += 8; + } + } else + if (X > posY) { + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + b += 32; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); +#endif + b[ 2] = *(ao2 + 0); + b[ 3] = *(ao2 + 1); + b[ 4] = *(ao3 + 0); + b[ 5] = *(ao3 + 1); + b[ 6] = *(ao4 + 0); + b[ 7] = *(ao4 + 1); + + b[ 8] = ZERO; + b[ 9] = ZERO; +#ifdef UNIT + b[ 10] = ONE; + b[ 11] = ZERO; +#else + b[ 10] = *(ao2 + 2); + b[ 11] = *(ao2 + 3); +#endif + b[ 12] = *(ao3 + 2); + b[ 13] = *(ao3 + 3); + b[ 14] = *(ao4 + 2); + b[ 15] = *(ao4 + 3); + + b[ 16] = ZERO; + b[ 17] = ZERO; + b[ 18] = ZERO; + b[ 19] = ZERO; +#ifdef UNIT + b[ 20] = ONE; + b[ 21] = ZERO; +#else + b[ 20] = *(ao3 + 4); + b[ 21] = *(ao3 + 5); +#endif + b[ 22] = *(ao4 + 4); + b[ 23] = *(ao4 + 5); + + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; +#ifdef UNIT + b[ 30] = ONE; + b[ 31] = ZERO; +#else + b[ 30] = *(ao4 + 6); + b[ 31] = *(ao4 + 7); +#endif + + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + + b += 32; + } + 
+ X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X < posY) { + + for (ii = 0; ii < i; ii++){ + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); + b[ 2] = *(ao2 + 0); + b[ 3] = *(ao2 + 1); + b[ 4] = *(ao3 + 0); + b[ 5] = *(ao3 + 1); + b[ 6] = *(ao4 + 0); + b[ 7] = *(ao4 + 1); + + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + b += 8; + } + } else + if (X > posY) { + ao1 += i * lda; + ao2 += i * lda; + ao3 += i * lda; + ao4 += i * lda; + b += 8 * i; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); +#endif + b[ 2] = *(ao2 + 0); + b[ 3] = *(ao2 + 1); + b[ 4] = *(ao3 + 0); + b[ 5] = *(ao3 + 1); + b[ 6] = *(ao4 + 0); + b[ 7] = *(ao4 + 1); + b += 8; + + if(i >= 2) { + b[ 0] = ZERO; + b[ 1] = ZERO; +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(ao2 + 2); + b[ 3] = *(ao2 + 3); +#endif + b[ 4] = *(ao3 + 2); + b[ 5] = *(ao3 + 3); + b[ 6] = *(ao4 + 2); + b[ 7] = *(ao4 + 3); + b += 8; + } + + if (i >= 3) { + b[ 0] = ZERO; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(ao3 + 4); + b[ 5] = *(ao3 + 5); +#endif + b[ 6] = *(ao4 + 4); + b[ 7] = *(ao4 + 5); + b += 8; + } + } + } + + posY += 4; + } + + if (n & 2){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX * 2 + (posY + 0) * lda; + ao2 = a + posX * 2 + (posY + 1) * lda; + } else { + ao1 = a + posY * 2 + (posX + 0) * lda; + ao2 = a + posY * 2 + (posX + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X < posY) { + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); + b[ 2] = *(ao2 + 0); + b[ 3] = *(ao2 + 1); + b[ 4] = *(ao1 + 2); + b[ 5] = *(ao1 + 3); + b[ 6] = *(ao2 + 2); + b[ 7] = *(ao2 + 3); + + ao1 += 4; + ao2 += 4; + b += 8; + } else + if (X > posY) { + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 8; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); +#endif + b[ 2] = *(ao2 + 0); + b[ 3] = *(ao2 + 1); + + b[ 4] = ZERO; + b[ 5] = ZERO; +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(ao2 + 2); + b[ 7] = *(ao2 + 3); +#endif + + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 8; + } + + X += 2; + i --; + } while (i > 0); + } + + if (m & 1) { + + if (X < posY) { + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); + b[ 2] = *(ao2 + 0); + b[ 3] = *(ao2 + 1); + ao1 += 2; + ao2 += 2; + b += 4; + } else + if (X > posY) { + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 4; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = *(ao2 + 0); + b[ 3] = *(ao2 + 1); +#else + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); + b[ 2] = *(ao2 + 0); + b[ 3] = *(ao2 + 1); +#endif + b += 2; + } + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX * 2 + (posY + 0) * lda; + } else { + ao1 = a + posY * 2 + (posX + 0) * lda; + } + + i = m; + if (m > 0) { + do { + if (X < posY) { + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); + ao1 += 2; + b += 2; + } else + if (X > posY) { + ao1 += lda; + b += 2; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(ao1 + 0); + b[ 1] = *(ao1 + 1); +#endif + ao1 += lda; + b += 2; + } + + X += 1; + i --; + } while (i > 0); + } + } + + return 0; +} diff --git a/kernel/generic/ztrmm_utcopy_1.c b/kernel/generic/ztrmm_utcopy_1.c new file mode 100644 index 0000000000..d4406c980e --- /dev/null +++ b/kernel/generic/ztrmm_utcopy_1.c @@ -0,0 +1,103 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. 
*/ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02; + FLOAT *ao1; + + lda += lda; + + js = n; + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posX * 2 + (posY + 0) * lda; + } else { + ao1 = a + posY * 2 + (posX + 0) * lda; + } + + i = m; + if (i > 0) { + do { + if (X < posY) { + ao1 += 2; + b += 2; + } else + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + b[ 0] = data01; + b[ 1] = data02; + ao1 += lda; + b += 2; + + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + b[ 0] = data01; + b[ 1] = data02; +#endif + ao1 += lda; + b += 2; + } + + X ++; + i --; + } while (i > 0); + } + + posY ++; + js --; + } while (js > 0); + } /* End of main loop */ + + return 0; +} diff --git a/kernel/generic/ztrmm_utcopy_2.c b/kernel/generic/ztrmm_utcopy_2.c new file mode 100644 index 0000000000..c71a55c286 --- /dev/null +++ b/kernel/generic/ztrmm_utcopy_2.c @@ -0,0 +1,239 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data1, data2, data3, data4, data5, data6, data7, data8; + + FLOAT *ao1, *ao2; + + lda += lda; + + js = (n >> 1); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posX * 2 + (posY + 0) * lda; + ao2 = a + posX * 2 + (posY + 1) * lda; + } else { + ao1 = a + posY * 2 + (posX + 0) * lda; + ao2 = a + posY * 2 + (posX + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X < posY) { + ao1 += 4; + ao2 += 4; + b += 8; + + } else + if (X > posY) { + data1 = *(ao1 + 0); + data2 = *(ao1 + 1); + data3 = *(ao1 + 2); + data4 = *(ao1 + 3); + + data5 = *(ao2 + 0); + data6 = *(ao2 + 1); + data7 = *(ao2 + 2); + data8 = *(ao2 + 3); + + b[ 0] = data1; + b[ 1] = data2; + b[ 2] = data3; + b[ 3] = data4; + b[ 4] = data5; + b[ 5] = data6; + b[ 6] = data7; + b[ 7] = data8; + + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 8; + + } else { +#ifdef UNIT + data5 = *(ao2 + 0); + data6 = *(ao2 + 1); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = data5; + b[ 5] = data6; + b[ 6] = ONE; + b[ 7] = ZERO; +#else + data1 = *(ao1 + 0); + data2 = *(ao1 + 1); + + data5 = *(ao2 + 0); + data6 = *(ao2 + 1); + data7 = *(ao2 + 2); + data8 = *(ao2 + 3); + + b[ 0] = data1; + b[ 1] = data2; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = data5; + b[ 5] = data6; + b[ 6] = data7; + b[ 7] = data8; +#endif + + ao1 += 2 * lda; + ao2 += 2 * lda; + + b += 8; + } + + X += 2; + i --; + } while (i > 0); + } + + if (m & 1) { + if (X < posY) { + ao1 += 2; + ao2 += 2; + b += 4; + } else + if (X > posY) { + data1 = *(ao1 + 0); + data2 = *(ao1 + 1); + data3 = *(ao1 + 2); + data4 = *(ao1 + 3); + + b[ 0] = data1; + b[ 1] = data2; + b[ 2] = data3; + b[ 3] = data4; + + ao1 += lda; + b += 4; + + } else { +#ifdef UNIT + data5 = *(ao2 + 0); + data6 = *(ao2 + 1); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = data5; + b[ 3] = data6; +#else + data1 = *(ao1 + 0); + data2 = *(ao1 + 1); + data5 = *(ao2 + 0); + data6 = *(ao2 + 1); + + b[ 0] = 
data1; + b[ 1] = data2; + b[ 2] = data5; + b[ 3] = data6; +#endif + b += 4; + } + } + + posY += 2; + js --; + } while (js > 0); + } /* End of main loop */ + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX * 2 + (posY + 0) * lda; + } else { + ao1 = a + posY * 2 + (posX + 0) * lda; + } + + i = m; + if (m > 0) { + do { + if (X < posY) { + ao1 += 2; + b += 2; + } else + if (X > posY) { + data1 = *(ao1 + 0); + data2 = *(ao1 + 1); + + b[ 0] = data1; + b[ 1] = data2; + + ao1 += lda; + b += 2; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + data1 = *(ao1 + 0); + data2 = *(ao1 + 1); + + b[ 0] = data1; + b[ 1] = data2; +#endif + ao1 += lda; + b += 2; + } + + X += 1; + i --; + } while (i > 0); + } + } + + return 0; +} diff --git a/kernel/generic/ztrmm_utcopy_4.c b/kernel/generic/ztrmm_utcopy_4.c new file mode 100644 index 0000000000..cda62bc3b2 --- /dev/null +++ b/kernel/generic/ztrmm_utcopy_4.c @@ -0,0 +1,663 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js; + BLASLONG X; + + FLOAT data01, data02, data03, data04, data05, data06, data07, data08; + FLOAT data09, data10, data11, data12, data13, data14, data15, data16; + FLOAT data17, data18, data19, data20, data21, data22, data23, data24; + FLOAT data25, data26, data27, data28, data29, data30, data31, data32; + FLOAT *ao1, *ao2, *ao3, *ao4; + + lda += lda; + + js = (n >> 2); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + ao1 = a + posX * 2 + (posY + 0) * lda; + ao2 = a + posX * 2 + (posY + 1) * lda; + ao3 = a + posX * 2 + (posY + 2) * lda; + ao4 = a + posX * 2 + (posY + 3) * lda; + } else { + ao1 = a + posY * 2 + (posX + 0) * lda; + ao2 = a + posY * 2 + (posX + 1) * lda; + ao3 = a + posY * 2 + (posX + 2) * lda; + ao4 = a + posY * 2 + (posX + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X < posY) { + ao1 += 8; + ao2 += 8; + ao3 += 8; + ao4 += 8; + b += 32; + + } else + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + data13 = *(ao2 + 4); + data14 = *(ao2 + 5); + data15 = *(ao2 + 6); + data16 = *(ao2 + 7); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + data21 = *(ao3 + 4); + data22 = *(ao3 + 5); + data23 = *(ao3 + 6); + data24 = *(ao3 + 7); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + data28 = *(ao4 + 3); + data29 = *(ao4 + 4); + data30 = *(ao4 + 5); + data31 = *(ao4 + 6); + data32 = *(ao4 + 7); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = data23; + b[23] = data24; + + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; + + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + b += 32; + + } else { + +#ifdef UNIT + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + data28 = *(ao4 + 3); + data29 = *(ao4 + 4); + data30 = *(ao4 + 5); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = data09; + b[ 9] = data10; + b[10] = ONE; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = ONE; + b[21] = ZERO; + b[22] = ZERO; + b[23] = ZERO; + + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = ONE; + b[31] = ZERO; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 
3); + + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + data21 = *(ao3 + 4); + data22 = *(ao3 + 5); + + data25 = *(ao4 + 0); + data26 = *(ao4 + 1); + data27 = *(ao4 + 2); + data28 = *(ao4 + 3); + data29 = *(ao4 + 4); + data30 = *(ao4 + 5); + data31 = *(ao4 + 6); + data32 = *(ao4 + 7); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + + b[16] = data17; + b[17] = data18; + b[18] = data19; + b[19] = data20; + b[20] = data21; + b[21] = data22; + b[22] = ZERO; + b[23] = ZERO; + + b[24] = data25; + b[25] = data26; + b[26] = data27; + b[27] = data28; + b[28] = data29; + b[29] = data30; + b[30] = data31; + b[31] = data32; +#endif + + ao1 += 4 * lda; + ao2 += 4 * lda; + ao3 += 4 * lda; + ao4 += 4 * lda; + + b += 32; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X < posY) { + + if (m & 2) { + ao1 += 4; + ao2 += 4; + ao3 += 4; + ao4 += 4; + b += 16; + } + + if (m & 1) { + ao1 += 2; + ao2 += 2; + ao3 += 2; + ao4 += 2; + b += 8; + } + + } else + if (X > posY) { + if (m & 2) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + data13 = *(ao2 + 4); + data14 = *(ao2 + 5); + data15 = *(ao2 + 6); + data16 = *(ao2 + 7); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + b[ 8] = data09; + b[ 9] = data10; + b[10] = data11; + b[11] = data12; + b[12] = data13; + b[13] = data14; + b[14] = data15; + b[15] = data16; + + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 16; + } + + if (m & 1) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + data05 = *(ao1 + 4); + data06 = *(ao1 + 5); + data07 = *(ao1 + 6); + data08 = *(ao1 + 7); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data05; + b[ 5] = data06; + b[ 6] = data07; + b[ 7] = data08; + + ao1 += lda; + b += 8; + } + + } else { + +#ifdef UNIT + if (i >= 2) { + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + } + + if (i >= 3) { + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + } + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + + if (i >= 2) { + b[ 0] = data09; + b[ 1] = data10; + b[ 2] = ONE; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 3) { + b[ 0] = data17; + b[ 1] = data18; + b[ 2] = data19; + b[ 3] = data20; + b[ 4] = ONE; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + if (i >= 2) { + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + } + + if (i >= 3) { + data17 = *(ao3 + 0); + data18 = *(ao3 + 1); + data19 = *(ao3 + 2); + data20 = *(ao3 + 3); + data21 = *(ao3 + 4); + data22 = *(ao3 + 5); + } + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + + if (i >= 2) { + b[ 0] = 
data09; + b[ 1] = data10; + b[ 2] = data11; + b[ 3] = data12; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 3) { + b[ 0] = data17; + b[ 1] = data18; + b[ 2] = data19; + b[ 3] = data20; + b[ 4] = data21; + b[ 5] = data22; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } +#endif + } + } + + posY += 4; + js --; + } while (js > 0); + } /* End of main loop */ + + if (n & 2){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX * 2 + (posY + 0) * lda; + ao2 = a + posX * 2 + (posY + 1) * lda; + } else { + ao1 = a + posY * 2 + (posX + 0) * lda; + ao2 = a + posY * 2 + (posX + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X < posY) { + ao1 += 4; + ao2 += 4; + b += 8; + + } else + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + b[ 4] = data09; + b[ 5] = data10; + b[ 6] = data11; + b[ 7] = data12; + + ao1 += 2 * lda; + ao2 += 2 * lda; + b += 8; + + } else { +#ifdef UNIT + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = data09; + b[ 5] = data10; + b[ 6] = ONE; + b[ 7] = ZERO; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + data09 = *(ao2 + 0); + data10 = *(ao2 + 1); + data11 = *(ao2 + 2); + data12 = *(ao2 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = data09; + b[ 5] = data10; + b[ 6] = data11; + b[ 7] = data12; +#endif + ao1 += 2 * lda; + ao2 += 2 * lda; + + b += 8; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i) { + + if (X < posY) { + b += 4; + } else + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + data03 = *(ao1 + 2); + data04 = *(ao1 + 3); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = data03; + b[ 3] = data04; + + b += 4; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; + b[ 2] = ZERO; + b[ 3] = ZERO; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; + b[ 2] = ZERO; + b[ 3] = ZERO; +#endif + b += 4; + } + } + + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + ao1 = a + posX * 2 + (posY + 0) * lda; + } else { + ao1 = a + posY * 2 + (posX + 0) * lda; + } + + i = m; + if (m > 0) { + do { + + if (X < posY) { + b += 2; + ao1 += 2; + } else + if (X > posY) { + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + b[ 0] = data01; + b[ 1] = data02; + + ao1 += lda; + b += 2; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + data01 = *(ao1 + 0); + data02 = *(ao1 + 1); + + b[ 0] = data01; + b[ 1] = data02; +#endif + b += 2; + ao1 += lda; + } + + + X += 1; + i --; + } while (i > 0); + } + } + + return 0; +} diff --git a/kernel/generic/ztrmm_utcopy_8.c b/kernel/generic/ztrmm_utcopy_8.c new file mode 100644 index 0000000000..08dd80ca2a --- /dev/null +++ b/kernel/generic/ztrmm_utcopy_8.c @@ -0,0 +1,880 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. 
*/ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ + + BLASLONG i, js, ii; + BLASLONG X; + + FLOAT *a01, *a02, *a03 ,*a04, *a05, *a06, *a07, *a08; + + lda *= 2; + + js = (n >> 3); + + if (js > 0){ + do { + X = posX; + + if (posX <= posY) { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + a05 = a + posX * 2 + (posY + 4) * lda; + a06 = a + posX * 2 + (posY + 5) * lda; + a07 = a + posX * 2 + (posY + 6) * lda; + a08 = a + posX * 2 + (posY + 7) * lda; + } else { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX + 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + a05 = a + posY * 2 + (posX + 4) * lda; + a06 = a + posY * 2 + (posX + 5) * lda; + a07 = a + posY * 2 + (posX + 6) * lda; + a08 = a + posY * 2 + (posX + 7) * lda; + } + + i = (m >> 3); + if (i > 0) { + do { + if (X < posY) { + a01 += 16; + a02 += 16; + a03 += 16; + a04 += 16; + a05 += 16; + a06 += 16; + a07 += 16; + a08 += 16; + b += 128; + } else + if (X > posY) { + + for (ii = 0; ii < 8; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + a01 += lda; + b += 16; + } + + a02 += 8 * lda; + a03 += 8 * lda; + a04 += 8 * lda; + a05 += 8 * lda; + a06 += 8 * lda; + a07 += 8 * lda; + a08 += 8 * lda; + + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = ZERO; + b[ 9] = ZERO; + b[ 10] = ZERO; + b[ 11] = ZERO; + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + 
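Throughout these complex copy kernels the "lda *= 2" at the top of CNAME and the posX * 2 / posY * 2 terms in the pointer setup reflect interleaved complex storage: every element occupies two consecutive FLOATs (real part, then imaginary part), so a column stride of lda complex elements is 2 * lda FLOATs. A minimal sketch of that addressing, using a hypothetical helper name and the FLOAT/BLASLONG typedefs from common.h:

/* Illustrative only, not part of the import: returns a pointer to the
   real part of element (i, j) of a column-major complex matrix whose
   leading dimension is lda complex elements.  The kernels above fold
   the factor of two into lda once ("lda *= 2") and then index rows as
   2 * i; the imaginary part always follows at +1. */
static FLOAT *zaddr(FLOAT *a, BLASLONG i, BLASLONG j, BLASLONG lda) {
  return a + 2 * i + j * (2 * lda);
}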
+ b[ 16] = *(a02 + 0); + b[ 17] = *(a02 + 1); +#ifdef UNIT + b[ 18] = ONE; + b[ 19] = ZERO; +#else + b[ 18] = *(a02 + 2); + b[ 19] = *(a02 + 3); +#endif + b[ 20] = ZERO; + b[ 21] = ZERO; + b[ 22] = ZERO; + b[ 23] = ZERO; + b[ 24] = ZERO; + b[ 25] = ZERO; + b[ 26] = ZERO; + b[ 27] = ZERO; + b[ 28] = ZERO; + b[ 29] = ZERO; + b[ 30] = ZERO; + b[ 31] = ZERO; + + b[ 32] = *(a03 + 0); + b[ 33] = *(a03 + 1); + b[ 34] = *(a03 + 2); + b[ 35] = *(a03 + 3); +#ifdef UNIT + b[ 36] = ONE; + b[ 37] = ZERO; +#else + b[ 36] = *(a03 + 4); + b[ 37] = *(a03 + 5); +#endif + b[ 38] = ZERO; + b[ 39] = ZERO; + b[ 40] = ZERO; + b[ 41] = ZERO; + b[ 42] = ZERO; + b[ 43] = ZERO; + b[ 44] = ZERO; + b[ 45] = ZERO; + b[ 46] = ZERO; + b[ 47] = ZERO; + + b[ 48] = *(a04 + 0); + b[ 49] = *(a04 + 1); + b[ 50] = *(a04 + 2); + b[ 51] = *(a04 + 3); + b[ 52] = *(a04 + 4); + b[ 53] = *(a04 + 5); +#ifdef UNIT + b[ 54] = ONE; + b[ 55] = ZERO; +#else + b[ 54] = *(a04 + 6); + b[ 55] = *(a04 + 7); +#endif + b[ 56] = ZERO; + b[ 57] = ZERO; + b[ 58] = ZERO; + b[ 59] = ZERO; + b[ 60] = ZERO; + b[ 61] = ZERO; + b[ 62] = ZERO; + b[ 63] = ZERO; + + b[ 64] = *(a05 + 0); + b[ 65] = *(a05 + 1); + b[ 66] = *(a05 + 2); + b[ 67] = *(a05 + 3); + b[ 68] = *(a05 + 4); + b[ 69] = *(a05 + 5); + b[ 70] = *(a05 + 6); + b[ 71] = *(a05 + 7); +#ifdef UNIT + b[ 72] = ONE; + b[ 73] = ZERO; +#else + b[ 72] = *(a05 + 8); + b[ 73] = *(a05 + 9); +#endif + b[ 74] = ZERO; + b[ 75] = ZERO; + b[ 76] = ZERO; + b[ 77] = ZERO; + b[ 78] = ZERO; + b[ 79] = ZERO; + + b[ 80] = *(a06 + 0); + b[ 81] = *(a06 + 1); + b[ 82] = *(a06 + 2); + b[ 83] = *(a06 + 3); + b[ 84] = *(a06 + 4); + b[ 85] = *(a06 + 5); + b[ 86] = *(a06 + 6); + b[ 87] = *(a06 + 7); + b[ 88] = *(a06 + 8); + b[ 89] = *(a06 + 9); +#ifdef UNIT + b[ 90] = ONE; + b[ 91] = ZERO; +#else + b[ 90] = *(a06 + 10); + b[ 91] = *(a06 + 11); +#endif + b[ 92] = ZERO; + b[ 93] = ZERO; + b[ 94] = ZERO; + b[ 95] = ZERO; + + b[ 96] = *(a07 + 0); + b[ 97] = *(a07 + 1); + b[ 98] = *(a07 + 2); + b[ 99] = *(a07 + 3); + b[100] = *(a07 + 4); + b[101] = *(a07 + 5); + b[102] = *(a07 + 6); + b[103] = *(a07 + 7); + b[104] = *(a07 + 8); + b[105] = *(a07 + 9); + b[106] = *(a07 + 10); + b[107] = *(a07 + 11); +#ifdef UNIT + b[108] = ONE; + b[109] = ZERO; +#else + b[108] = *(a07 + 12); + b[109] = *(a07 + 13); +#endif + b[110] = ZERO; + b[111] = ZERO; + + b[112] = *(a08 + 0); + b[113] = *(a08 + 1); + b[114] = *(a08 + 2); + b[115] = *(a08 + 3); + b[116] = *(a08 + 4); + b[117] = *(a08 + 5); + b[118] = *(a08 + 6); + b[119] = *(a08 + 7); + b[120] = *(a08 + 8); + b[121] = *(a08 + 9); + b[122] = *(a08 + 10); + b[123] = *(a08 + 11); + b[124] = *(a08 + 12); + b[125] = *(a08 + 13); +#ifdef UNIT + b[126] = ONE; + b[127] = ZERO; +#else + b[126] = *(a08 + 14); + b[127] = *(a08 + 15); +#endif + a01 += 8 * lda; + a02 += 8 * lda; + a03 += 8 * lda; + a04 += 8 * lda; + a05 += 8 * lda; + a06 += 8 * lda; + a07 += 8 * lda; + a08 += 8 * lda; + b += 128; + } + + X += 8; + i --; + } while (i > 0); + } + + i = (m & 7); + if (i) { + + if (X < posY) { + + a01 += 2 * i; + a02 += 2 * i; + a03 += 2 * i; + a04 += 2 * i; + a05 += 2 * i; + a06 += 2 * i; + a07 += 2 * i; + a08 += 2 * i; + b += 16 * i; + } else + if (X > posY) { + + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + b[ 8] = *(a01 + 8); + b[ 9] = *(a01 + 9); + b[ 10] = *(a01 + 10); + b[ 11] = *(a01 + 11); + b[ 12] = *(a01 + 12); + b[ 13] = *(a01 + 13); + b[ 
14] = *(a01 + 14); + b[ 15] = *(a01 + 15); + + a01 += lda; + a02 += lda; + a03 += lda; + a04 += lda; + a05 += lda; + a06 += lda; + a07 += lda; + a08 += lda; + b += 16; + } + } else { + +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + + if(i >= 2) { + b[ 0] = *(a02 + 0); + b[ 1] = *(a02 + 1); +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 3) { + b[ 0] = *(a03 + 0); + b[ 1] = *(a03 + 1); + b[ 2] = *(a03 + 2); + b[ 3] = *(a03 + 3); +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = ZERO; + b[ 7] = ZERO; + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 4) { + b[ 0] = *(a04 + 0); + b[ 1] = *(a04 + 1); + b[ 2] = *(a04 + 2); + b[ 3] = *(a04 + 3); + b[ 4] = *(a04 + 4); + b[ 5] = *(a04 + 5); +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(a04 + 6); + b[ 7] = *(a04 + 7); +#endif + b[ 8] = ZERO; + b[ 9] = ZERO; + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 5) { + b[ 0] = *(a05 + 0); + b[ 1] = *(a05 + 1); + b[ 2] = *(a05 + 2); + b[ 3] = *(a05 + 3); + b[ 4] = *(a05 + 4); + b[ 5] = *(a05 + 5); + b[ 6] = *(a05 + 6); + b[ 7] = *(a05 + 7); +#ifdef UNIT + b[ 8] = ONE; + b[ 9] = ZERO; +#else + b[ 8] = *(a05 + 8); + b[ 9] = *(a05 + 9); +#endif + b[10] = ZERO; + b[11] = ZERO; + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 6) { + b[ 0] = *(a06 + 0); + b[ 1] = *(a06 + 1); + b[ 2] = *(a06 + 2); + b[ 3] = *(a06 + 3); + b[ 4] = *(a06 + 4); + b[ 5] = *(a06 + 5); + b[ 6] = *(a06 + 6); + b[ 7] = *(a06 + 7); + b[ 8] = *(a06 + 8); + b[ 9] = *(a06 + 9); +#ifdef UNIT + b[10] = ONE; + b[11] = ZERO; +#else + b[10] = *(a06 + 10); + b[11] = *(a06 + 11); +#endif + b[12] = ZERO; + b[13] = ZERO; + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + + if (i >= 7) { + b[ 0] = *(a07 + 0); + b[ 1] = *(a07 + 1); + b[ 2] = *(a07 + 2); + b[ 3] = *(a07 + 3); + b[ 4] = *(a07 + 4); + b[ 5] = *(a07 + 5); + b[ 6] = *(a07 + 6); + b[ 7] = *(a07 + 7); + b[ 8] = *(a07 + 8); + b[ 9] = *(a07 + 9); + b[10] = *(a07 + 10); + b[11] = *(a07 + 11); +#ifdef UNIT + b[12] = ONE; + b[13] = ZERO; +#else + b[12] = *(a07 + 12); + b[13] = *(a07 + 13); +#endif + b[14] = ZERO; + b[15] = ZERO; + b += 16; + } + } + } + + posY += 8; + js --; + } while (js > 0); + } /* End of main loop */ + + + if (n & 4){ + X = posX; + + if (posX <= posY) { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + a03 = a + posX * 2 + (posY + 2) * lda; + a04 = a + posX * 2 + (posY + 3) * lda; + } else { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + a03 = a + posY * 2 + (posX + 2) * lda; + a04 = a + posY * 2 + (posX + 3) * lda; + } + + i = (m >> 2); + if (i > 0) { + do { + if (X < posY) { + a01 += 8; + a02 += 8; + a03 += 8; + a04 += 8; + b += 32; + } else + if (X > 
posY) { + + for (ii = 0; ii < 4; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + a01 += lda; + b += 8; + } + + a02 += 4 * lda; + a03 += 4 * lda; + a04 += 4 * lda; + + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + + b[ 8] = *(a02 + 0); + b[ 9] = *(a02 + 1); +#ifdef UNIT + b[ 10] = ONE; + b[ 11] = ZERO; +#else + b[ 10] = *(a02 + 2); + b[ 11] = *(a02 + 3); +#endif + b[ 12] = ZERO; + b[ 13] = ZERO; + b[ 14] = ZERO; + b[ 15] = ZERO; + + b[ 16] = *(a03 + 0); + b[ 17] = *(a03 + 1); + b[ 18] = *(a03 + 2); + b[ 19] = *(a03 + 3); +#ifdef UNIT + b[ 20] = ONE; + b[ 21] = ZERO; +#else + b[ 20] = *(a03 + 4); + b[ 21] = *(a03 + 5); +#endif + b[ 22] = ZERO; + b[ 23] = ZERO; + + b[ 24] = *(a04 + 0); + b[ 25] = *(a04 + 1); + b[ 26] = *(a04 + 2); + b[ 27] = *(a04 + 3); + b[ 28] = *(a04 + 4); + b[ 29] = *(a04 + 5); +#ifdef UNIT + b[ 30] = ONE; + b[ 31] = ZERO; +#else + b[ 30] = *(a04 + 6); + b[ 31] = *(a04 + 7); +#endif + + a01 += 4 * lda; + a02 += 4 * lda; + a03 += 4 * lda; + a04 += 4 * lda; + b += 32; + } + + X += 4; + i --; + } while (i > 0); + } + + i = (m & 3); + if (i) { + + if (X < posY) { + a01 += 2 * i; + a02 += 2 * i; + a03 += 2 * i; + a04 += 2 * i; + b += 8 * i; + } else + if (X > posY) { + + for (ii = 0; ii < i; ii++){ + + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a01 + 4); + b[ 5] = *(a01 + 5); + b[ 6] = *(a01 + 6); + b[ 7] = *(a01 + 7); + + a01 += lda; + a02 += lda; + a03 += lda; + a04 += lda; + b += 8; + } + } else { + +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + + if(i >= 2) { + b[ 0] = *(a02 + 0); + b[ 1] = *(a02 + 1); +#ifdef UNIT + b[ 2] = ONE; + b[ 3] = ZERO; +#else + b[ 2] = *(a02 + 2); + b[ 3] = *(a02 + 3); +#endif + b[ 4] = ZERO; + b[ 5] = ZERO; + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + + if (i >= 3) { + b[ 0] = *(a03 + 0); + b[ 1] = *(a03 + 1); + b[ 2] = *(a03 + 2); + b[ 3] = *(a03 + 3); +#ifdef UNIT + b[ 4] = ONE; + b[ 5] = ZERO; +#else + b[ 4] = *(a03 + 4); + b[ 5] = *(a03 + 5); +#endif + b[ 6] = ZERO; + b[ 7] = ZERO; + b += 8; + } + } + } + + posY += 4; + } + + + if (n & 2){ + X = posX; + + if (posX <= posY) { + a01 = a + posX * 2 + (posY + 0) * lda; + a02 = a + posX * 2 + (posY + 1) * lda; + } else { + a01 = a + posY * 2 + (posX + 0) * lda; + a02 = a + posY * 2 + (posX + 1) * lda; + } + + i = (m >> 1); + if (i > 0) { + do { + if (X < posY) { + a01 += 4; + a02 += 4; + b += 8; + } else + if (X > posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b[ 4] = *(a02 + 0); + b[ 5] = *(a02 + 1); + b[ 6] = *(a02 + 2); + b[ 7] = *(a02 + 3); + + a01 += 2 * lda; + a02 += 2 * lda; + b += 8; + } else { + +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = ZERO; + b[ 3] = ZERO; + + b[ 4] = *(a02 + 0); + b[ 5] = *(a02 + 1); +#ifdef UNIT + b[ 6] = ONE; + b[ 7] = ZERO; +#else + b[ 6] = *(a02 + 2); + b[ 7] = *(a02 + 3); +#endif + + a01 += 2 * lda; + a02 += 2 * lda; + b += 8; + } + + X += 2; + i --; + } while (i > 0); + } + + i = (m & 1); + if (i) { + + if (X < posY) { + b += 
4; + } else + if (X > posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + b[ 2] = *(a01 + 2); + b[ 3] = *(a01 + 3); + b += 4; + } + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + b[ 2] = *(a02 + 0); + b[ 3] = *(a02 + 1); + b += 4; + } + posY += 2; + } + + if (n & 1){ + X = posX; + + if (posX <= posY) { + a01 = a + posX * 2 + (posY + 0) * lda; + } else { + a01 = a + posY * 2 + (posX + 0) * lda; + } + + i = m; + if (m > 0) { + do { + if (X < posY) { + a01 += 2; + b += 2; + } else + if (X > posY) { + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); + a01 += lda; + b += 2; + } else { +#ifdef UNIT + b[ 0] = ONE; + b[ 1] = ZERO; +#else + b[ 0] = *(a01 + 0); + b[ 1] = *(a01 + 1); +#endif + a01 += lda; + b += 2; + } + X += 1; + i --; + } while (i > 0); + } + } + + return 0; +} diff --git a/kernel/generic/ztrsm_lncopy_1.c b/kernel/generic/ztrsm_lncopy_1.c new file mode 100644 index 0000000000..ec8ffbcc9b --- /dev/null +++ b/kernel/generic/ztrsm_lncopy_1.c @@ -0,0 +1,91 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
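The control flow above is the same in every variant: the column count n is consumed in power-of-two panels (the _8 kernels start at n >> 3 and fall through the n & 4, n & 2 and n & 1 tails; the narrower variants start lower), and within each panel the row count m is consumed the same way, with the leftover rows of the _8 kernels handled in a single m & 7 tail block. A stripped-down sketch of that skeleton, with the per-tile copy bodies elided behind a stub:

/* Control-flow sketch only; the real kernels inline the tile bodies and
   the exact tail handling differs between the _1/_2/_4/_8 files.
   pack_tile() is a stand-in, not a function from this import. */
static void pack_tile(BLASLONG rows, BLASLONG cols) { (void)rows; (void)cols; }

static void blocking_skeleton(BLASLONG m, BLASLONG n) {
  BLASLONG js, i;
  for (js = n >> 3; js > 0; js--) {      /* full 8-column panels        */
    for (i = m >> 3; i > 0; i--)
      pack_tile(8, 8);                   /* full 8x8 tiles              */
    if (m & 7)
      pack_tile(m & 7, 8);               /* leftover rows, handled once */
  }
  if (n & 4) { /* same pattern with 4-wide panels */ }
  if (n & 2) { /* 2-wide */ }
  if (n & 1) { /* single remaining column */ }
}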
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02; + FLOAT *a1; + + lda *= 2; + + jj = offset; + + j = n; + while (j > 0){ + + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + compinv(b + 0, data01, data02); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + + *(b + 0) = data01; + *(b + 1) = data02; + } + + a1 += 2; + b += 2; + + i --; + ii ++; + } + + a += lda; + jj ++; + j --; + } + + return 0; +} diff --git a/kernel/generic/ztrsm_lncopy_2.c b/kernel/generic/ztrsm_lncopy_2.c new file mode 100644 index 0000000000..967b60c1e5 --- /dev/null +++ b/kernel/generic/ztrsm_lncopy_2.c @@ -0,0 +1,171 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
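The trsm copies differ from the trmm ones in how they treat the diagonal: each diagonal element goes through compinv(), which is not defined in this hunk but is supplied by the surrounding build. The evident intent is that the packed buffer holds the reciprocal of every diagonal entry so the solve kernels can multiply instead of divide; the #ifndef UNIT guards around data01/data02 suggest that under UNIT the macro ignores its value arguments entirely. A hedged sketch of the non-unit, non-conjugated case only, not the library's actual macro:

/* Illustrative complex reciprocal, NOT the compinv provided by the build.
   Stores 1 / (ar + i*ai) as (real, imaginary) at b[0], b[1]; no scaling
   is done here to guard against overflow or underflow. */
static void compinv_sketch(FLOAT *b, FLOAT ar, FLOAT ai) {
  FLOAT t = ar * ar + ai * ai;
  b[0] =  ar / t;
  b[1] = -ai / t;
}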
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02, data03, data04; + FLOAT data05, data06, data07, data08; + FLOAT *a1, *a2; + + lda *= 2; + + jj = offset; + + j = (n >> 1); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + + i = (m >> 1); + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + data03 = *(a1 + 2); + data04 = *(a1 + 3); + +#ifndef UNIT + data07 = *(a2 + 2); + data08 = *(a2 + 3); +#endif + + compinv(b + 0, data01, data02); + *(b + 4) = data03; + *(b + 5) = data04; + compinv(b + 6, data07, data08); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a2 + 0); + data06 = *(a2 + 1); + data07 = *(a2 + 2); + data08 = *(a2 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data05; + *(b + 3) = data06; + *(b + 4) = data03; + *(b + 5) = data04; + *(b + 6) = data07; + *(b + 7) = data08; + } + + a1 += 4; + a2 += 4; + b += 8; + + i --; + ii += 2; + } + + if (m & 1) { + + if (ii== jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + compinv(b + 0, data01, data02); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data05 = *(a2 + 0); + data06 = *(a2 + 1); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data05; + *(b + 3) = data06; + } + b += 4; + } + + a += 2 * lda; + jj += 2; + j --; + } + + if (n & 1) { + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + + compinv(b + 0, data01, data02); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + *(b + 0) = data01; + *(b + 1) = data02; + } + + a1+= 2; + b += 2; + i --; + ii += 1; + } + } + + return 0; +} diff --git a/kernel/generic/ztrsm_lncopy_4.c b/kernel/generic/ztrsm_lncopy_4.c new file mode 100644 index 0000000000..e4a3fb93a4 --- /dev/null +++ b/kernel/generic/ztrsm_lncopy_4.c @@ -0,0 +1,459 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
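In all of the trsm copies the offset argument seeds jj, the global column index of the current panel, while ii tracks the global row index, and every tile is classified by comparing the two. For the lower, non-transposed (lncopy) files: tiles strictly below the diagonal (ii > jj) are copied verbatim, diagonal tiles (ii == jj) are copied with their diagonal entries inverted, and tiles strictly above are skipped with only the pointers advanced, since the solve kernel never reads that part of the buffer. Restated compactly, assuming the same orientation:

/* Tile classification used by the lncopy kernels above (sketch). */
enum tile_kind { SKIP_ABOVE, DIAGONAL, COPY_BELOW };

static enum tile_kind classify_ln(BLASLONG ii, BLASLONG jj) {
  if (ii == jj) return DIAGONAL;     /* copy row prefix, invert diagonal */
  if (ii >  jj) return COPY_BELOW;   /* strictly lower: copy as-is       */
  return SKIP_ABOVE;                 /* strictly upper: never referenced */
}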
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02, data03, data04; + FLOAT data05, data06, data07, data08; + FLOAT data09, data10, data11, data12; + FLOAT data13, data14, data15, data16; + FLOAT data17, data18, data19, data20; + FLOAT data21, data22, data23, data24; + FLOAT data25, data26, data27, data28; + FLOAT data29, data30, data31, data32; + + FLOAT *a1, *a2, *a3, *a4; + + lda *= 2; + + jj = offset; + + j = (n >> 2); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + + ii = 0; + + i = (m >> 2); + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + data07 = *(a1 + 6); + data08 = *(a1 + 7); + +#ifndef UNIT + data11 = *(a2 + 2); + data12 = *(a2 + 3); +#endif + data13 = *(a2 + 4); + data14 = *(a2 + 5); + data15 = *(a2 + 6); + data16 = *(a2 + 7); + +#ifndef UNIT + data21 = *(a3 + 4); + data22 = *(a3 + 5); +#endif + data23 = *(a3 + 6); + data24 = *(a3 + 7); + +#ifndef UNIT + data31 = *(a4 + 6); + data32 = *(a4 + 7); +#endif + + compinv(b + 0, data01, data02); + + *(b + 8) = data03; + *(b + 9) = data04; + compinv(b + 10, data11, data12); + + *(b + 16) = data05; + *(b + 17) = data06; + *(b + 18) = data13; + *(b + 19) = data14; + compinv(b + 20, data21, data22); + + *(b + 24) = data07; + *(b + 25) = data08; + *(b + 26) = data15; + *(b + 27) = data16; + *(b + 28) = data23; + *(b + 29) = data24; + compinv(b + 30, data31, data32); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + data07 = *(a1 + 6); + data08 = *(a1 + 7); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); + data15 = *(a2 + 6); + data16 = *(a2 + 7); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); + data21 = *(a3 + 4); + data22 = *(a3 + 5); + data23 = *(a3 + 6); + data24 = *(a3 + 7); + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + data29 = *(a4 + 4); + data30 = *(a4 + 5); + data31 = *(a4 + 6); + data32 = *(a4 + 7); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data09; + *(b + 3) = data10; + *(b + 4) = data17; + *(b + 5) = data18; + *(b + 6) = data25; + *(b + 7) = data26; + + *(b + 8) = data03; + *(b + 9) = data04; + *(b + 10) = 
data11; + *(b + 11) = data12; + *(b + 12) = data19; + *(b + 13) = data20; + *(b + 14) = data27; + *(b + 15) = data28; + + *(b + 16) = data05; + *(b + 17) = data06; + *(b + 18) = data13; + *(b + 19) = data14; + *(b + 20) = data21; + *(b + 21) = data22; + *(b + 22) = data29; + *(b + 23) = data30; + + *(b + 24) = data07; + *(b + 25) = data08; + *(b + 26) = data15; + *(b + 27) = data16; + *(b + 28) = data23; + *(b + 29) = data24; + *(b + 30) = data31; + *(b + 31) = data32; + } + + a1 += 8; + a2 += 8; + a3 += 8; + a4 += 8; + b += 32; + + i --; + ii += 4; + } + + if (m & 2) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + data03 = *(a1 + 2); + data04 = *(a1 + 3); + +#ifndef UNIT + data11 = *(a2 + 2); + data12 = *(a2 + 3); +#endif + + compinv(b + 0, data01, data02); + + *(b + 4) = data03; + *(b + 5) = data04; + compinv(b + 6, data11, data12); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data09; + *(b + 3) = data10; + *(b + 4) = data17; + *(b + 5) = data18; + *(b + 6) = data25; + *(b + 7) = data26; + + *(b + 8) = data03; + *(b + 9) = data04; + *(b + 10) = data11; + *(b + 11) = data12; + *(b + 12) = data19; + *(b + 13) = data20; + *(b + 14) = data27; + *(b + 15) = data28; + } + + a1 += 4; + a2 += 4; + a3 += 4; + a4 += 4; + b += 16; + + ii += 2; + } + + if (m & 1) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + compinv(b + 0, data01, data02); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data09; + *(b + 3) = data10; + *(b + 4) = data17; + *(b + 5) = data18; + *(b + 6) = data25; + *(b + 7) = data26; + } + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + b += 8; + + ii += 1; + } + a += 4 * lda; + jj += 4; + j --; + } + + + if (n & 2) { + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + + ii = 0; + + i = (m >> 1); + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + data03 = *(a1 + 2); + data04 = *(a1 + 3); + +#ifndef UNIT + data11 = *(a2 + 2); + data12 = *(a2 + 3); +#endif + + compinv(b + 0, data01, data02); + + *(b + 4) = data03; + *(b + 5) = data04; + compinv(b + 6, data11, data12); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data09; + *(b + 3) = data10; + + *(b + 4) = data03; + *(b + 5) = data04; + *(b + 6) = data11; + *(b + 7) = data12; + } + + a1 += 4; + a2 += 4; + b += 8; + + i --; + ii += 2; + } + + if (m & 1) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + compinv(b + 0, data01, data02); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data09; + *(b + 3) = data10; + } + + a1 += 2; + a2 += 2; + b += 4; + 
+ ii += 1; + } + a += 2 * lda; + jj += 2; + } + + if (n & 1) { + + a1 = a + 0 * lda; + + ii = 0; + + i = m; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + compinv(b + 0, data01, data02); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + *(b + 0) = data01; + *(b + 1) = data02; + } + + a1 += 2; + b += 2; + + i --; + ii += 1; + } + + a += lda; + jj += 1; + } + + return 0; +} diff --git a/kernel/generic/ztrsm_lncopy_8.c b/kernel/generic/ztrsm_lncopy_8.c new file mode 100644 index 0000000000..0176f91bec --- /dev/null +++ b/kernel/generic/ztrsm_lncopy_8.c @@ -0,0 +1,225 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
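The unrolled ii == jj branch of ztrsm_lncopy_4.c writes the packed 4x4 complex diagonal tile row by row into 32 FLOATs: row r occupies b[8*r] through b[8*r + 7], its strictly-lower entries are copied, its diagonal entry is stored through compinv, and the strictly-upper slots are simply left unwritten because the solve kernel never reads them. A loop-form sketch of the same layout for the non-unit case, with an inline stand-in for compinv:

/* Loop form of the unrolled 4x4 diagonal-tile packing above (sketch).
   a points at the first column of the tile; lda is already doubled, as
   in the kernels.  compinv_sketch is a stand-in, not the real macro. */
static void compinv_sketch(FLOAT *b, FLOAT ar, FLOAT ai) {
  FLOAT t = ar * ar + ai * ai;  b[0] = ar / t;  b[1] = -ai / t;
}

static void pack_diag_tile4(FLOAT *a, BLASLONG lda, FLOAT *b) {
  BLASLONG r, c;
  for (r = 0; r < 4; r++) {
    for (c = 0; c < r; c++) {                /* strictly lower: copy     */
      b[8 * r + 2 * c + 0] = a[c * lda + 2 * r + 0];
      b[8 * r + 2 * c + 1] = a[c * lda + 2 * r + 1];
    }
    compinv_sketch(b + 8 * r + 2 * r,        /* diagonal: store 1/a(r,r) */
                   a[r * lda + 2 * r + 0],
                   a[r * lda + 2 * r + 1]);
  }
}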
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj, k; + + FLOAT *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8; + FLOAT data1, data2; + + lda *= 2; + jj = offset; + + j = (n >> 3); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + a5 = a + 4 * lda; + a6 = a + 5 * lda; + a7 = a + 6 * lda; + a8 = a + 7 * lda; + + a += 8 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 8)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj >= 8) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a2 + 0); + *(b + 3) = *(a2 + 1); + *(b + 4) = *(a3 + 0); + *(b + 5) = *(a3 + 1); + *(b + 6) = *(a4 + 0); + *(b + 7) = *(a4 + 1); + *(b + 8) = *(a5 + 0); + *(b + 9) = *(a5 + 1); + *(b + 10) = *(a6 + 0); + *(b + 11) = *(a6 + 1); + *(b + 12) = *(a7 + 0); + *(b + 13) = *(a7 + 1); + *(b + 14) = *(a8 + 0); + *(b + 15) = *(a8 + 1); + } + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + a5 += 2; + a6 += 2; + a7 += 2; + a8 += 2; + b += 16; + ii ++; + } + + jj += 8; + j --; + } + + if (n & 4) { + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + a += 4 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 4)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj >= 4) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a2 + 0); + *(b + 3) = *(a2 + 1); + *(b + 4) = *(a3 + 0); + *(b + 5) = *(a3 + 1); + *(b + 6) = *(a4 + 0); + *(b + 7) = *(a4 + 1); + } + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + b += 8; + ii ++; + } + + jj += 4; + } + + if (n & 2) { + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a += 2 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 2)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj >= 2) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a2 + 0); + *(b + 3) = *(a2 + 1); + } + + a1 += 2; + a2 += 2; + b += 4; + ii ++; + } + + jj += 2; + } + + if (n & 1) { + + a1 = a + 0 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 1)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj >= 1) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + } + + a1 += 2; + b += 2; + ii ++; + } + } + + return 0; +} diff --git a/kernel/generic/ztrsm_ltcopy_1.c b/kernel/generic/ztrsm_ltcopy_1.c new file mode 100644 index 0000000000..ef495327b0 --- /dev/null +++ b/kernel/generic/ztrsm_ltcopy_1.c @@ -0,0 +1,91 @@ +/*********************************************************************/ 
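ztrsm_ltcopy_*.c, beginning here, packs the same lower triangle but for the transposed solve. The difference from lncopy is purely in traversal: the ln files walk down a column (a1 += 2 per row, a += lda per column) and copy when ii > jj, while the lt files walk across a row (a1 += lda per column, a += 2 per row) and copy when ii < jj, because the element being read is the transposed one. In terms of where a packed entry comes from, under the same doubled-lda convention:

/* Source address of the (i, j) entry of the packed panel (sketch);
   lda here is already doubled, matching the kernels above. */
static FLOAT *src_ln(FLOAT *a, BLASLONG i, BLASLONG j, BLASLONG lda) {
  return a + 2 * i + j * lda;     /* reads a(i, j): column-wise walk   */
}
static FLOAT *src_lt(FLOAT *a, BLASLONG i, BLASLONG j, BLASLONG lda) {
  return a + 2 * j + i * lda;     /* reads a(j, i): transposed access  */
}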
+/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02; + FLOAT *a1; + + lda *= 2; + + jj = offset; + + j = n; + while (j > 0){ + + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + compinv(b + 0, data01, data02); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + + *(b + 0) = data01; + *(b + 1) = data02; + } + + a1 += lda; + b += 2; + + i --; + ii ++; + } + + a += 2; + jj ++; + j --; + } + + return 0; +} diff --git a/kernel/generic/ztrsm_ltcopy_2.c b/kernel/generic/ztrsm_ltcopy_2.c new file mode 100644 index 0000000000..bcc2bbc919 --- /dev/null +++ b/kernel/generic/ztrsm_ltcopy_2.c @@ -0,0 +1,177 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
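Why invert the diagonal during packing at all: in forward or backward substitution the diagonal entry divides every right-hand side, so storing its reciprocal once turns all of those divisions into complex multiplications inside the hot loop. A minimal, self-contained illustration of a 2x1 complex forward substitution against a packed [1/l11, l21, 1/l22] triple; this layout is invented for the example and is not the kernels' packed format:

/* Forward substitution with a pre-inverted diagonal (illustration only).
   packed holds 1/l11, l21, 1/l22 as interleaved (re, im) pairs; x holds
   the two complex right-hand sides on entry and the solution on exit. */
static void fsub2_sketch(const FLOAT *packed, FLOAT *x) {
  FLOAT re, im;
  re = packed[0] * x[0] - packed[1] * x[1];     /* x0 = (1/l11) * b0    */
  im = packed[0] * x[1] + packed[1] * x[0];
  x[0] = re;  x[1] = im;
  x[2] -= packed[2] * x[0] - packed[3] * x[1];  /* b1 -= l21 * x0       */
  x[3] -= packed[2] * x[1] + packed[3] * x[0];
  re = packed[4] * x[2] - packed[5] * x[3];     /* x1 = (1/l22) * b1    */
  im = packed[4] * x[3] + packed[5] * x[2];
  x[2] = re;  x[3] = im;
}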
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02, data03, data04; + FLOAT data05, data06, data07, data08; + FLOAT *a1, *a2; + + lda *= 2; + + jj = offset; + + j = (n >> 1); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + + i = (m >> 1); + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + data03 = *(a1 + 2); + data04 = *(a1 + 3); + +#ifndef UNIT + data07 = *(a2 + 2); + data08 = *(a2 + 3); +#endif + + compinv(b + 0, data01, data02); + *(b + 2) = data03; + *(b + 3) = data04; + compinv(b + 6, data07, data08); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a2 + 0); + data06 = *(a2 + 1); + data07 = *(a2 + 2); + data08 = *(a2 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + } + + a1 += 2 * lda; + a2 += 2 * lda; + b += 8; + + i --; + ii += 2; + } + + if (m & 1) { + + if (ii== jj) { + +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + compinv(b + 0, data01, data02); + *(b + 2) = data03; + *(b + 3) = data04; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + b += 4; + } + + a += 4; + jj += 2; + j --; + } + + if (n & 1) { + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + compinv(b + 0, data01, data02); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + *(b + 0) = data01; + *(b + 1) = data02; + } + + a1 += 1 * lda; + b += 2; + + i --; + ii += 1; + } + } + + return 0; +} diff --git a/kernel/generic/ztrsm_ltcopy_4.c b/kernel/generic/ztrsm_ltcopy_4.c new file mode 100644 index 0000000000..8c4e66b7fb --- /dev/null +++ b/kernel/generic/ztrsm_ltcopy_4.c @@ -0,0 +1,479 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at 
Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02, data03, data04; + FLOAT data05, data06, data07, data08; + FLOAT data09, data10, data11, data12; + FLOAT data13, data14, data15, data16; + FLOAT data17, data18, data19, data20; + FLOAT data21, data22, data23, data24; + FLOAT data25, data26, data27, data28; + FLOAT data29, data30, data31, data32; + + FLOAT *a1, *a2, *a3, *a4; + + lda *= 2; + + jj = offset; + + j = (n >> 2); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + + ii = 0; + + i = (m >> 2); + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + data07 = *(a1 + 6); + data08 = *(a1 + 7); + +#ifndef UNIT + data11 = *(a2 + 2); + data12 = *(a2 + 3); +#endif + data13 = *(a2 + 4); + data14 = *(a2 + 5); + data15 = *(a2 + 6); + data16 = *(a2 + 7); + +#ifndef UNIT + data21 = *(a3 + 4); + data22 = *(a3 + 5); +#endif + data23 = *(a3 + 6); + data24 = *(a3 + 7); + +#ifndef UNIT + data31 = *(a4 + 6); + data32 = *(a4 + 7); +#endif + + compinv(b + 0, data01, data02); + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + + compinv(b + 10, data11, data12); + *(b + 12) = data13; + *(b + 13) = data14; + *(b + 14) = data15; + *(b + 15) = data16; + + compinv(b + 20, data21, data22); + *(b + 22) = data23; + *(b + 23) = data24; + + compinv(b + 30, data31, data32); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 
2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + data07 = *(a1 + 6); + data08 = *(a1 + 7); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); + data15 = *(a2 + 6); + data16 = *(a2 + 7); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); + data21 = *(a3 + 4); + data22 = *(a3 + 5); + data23 = *(a3 + 6); + data24 = *(a3 + 7); + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + data29 = *(a4 + 4); + data30 = *(a4 + 5); + data31 = *(a4 + 6); + data32 = *(a4 + 7); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + + *(b + 8) = data09; + *(b + 9) = data10; + *(b + 10) = data11; + *(b + 11) = data12; + *(b + 12) = data13; + *(b + 13) = data14; + *(b + 14) = data15; + *(b + 15) = data16; + + *(b + 16) = data17; + *(b + 17) = data18; + *(b + 18) = data19; + *(b + 19) = data20; + *(b + 20) = data21; + *(b + 21) = data22; + *(b + 22) = data23; + *(b + 23) = data24; + + *(b + 24) = data25; + *(b + 25) = data26; + *(b + 26) = data27; + *(b + 27) = data28; + *(b + 28) = data29; + *(b + 29) = data30; + *(b + 30) = data31; + *(b + 31) = data32; + } + + a1 += 4 * lda; + a2 += 4 * lda; + a3 += 4 * lda; + a4 += 4 * lda; + b += 32; + + i --; + ii += 4; + } + + if (m & 2) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + data07 = *(a1 + 6); + data08 = *(a1 + 7); + +#ifndef UNIT + data11 = *(a2 + 2); + data12 = *(a2 + 3); +#endif + data13 = *(a2 + 4); + data14 = *(a2 + 5); + data15 = *(a2 + 6); + data16 = *(a2 + 7); + + compinv(b + 0, data01, data02); + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + + compinv(b + 10, data11, data12); + *(b + 12) = data13; + *(b + 13) = data14; + *(b + 14) = data15; + *(b + 15) = data16; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + data07 = *(a1 + 6); + data08 = *(a1 + 7); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); + data15 = *(a2 + 6); + data16 = *(a2 + 7); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + + *(b + 8) = data09; + *(b + 9) = data10; + *(b + 10) = data11; + *(b + 11) = data12; + *(b + 12) = data13; + *(b + 13) = data14; + *(b + 14) = data15; + *(b + 15) = data16; + } + + a1 += 2 * lda; + a2 += 2 * lda; + b += 16; + + ii += 2; + } + + if (m & 1) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + data07 = *(a1 + 6); + data08 = *(a1 + 7); + + compinv(b + 0, data01, data02); + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + data07 = *(a1 + 6); + data08 = *(a1 + 7); + + *(b + 0) = data01; + *(b 
+ 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + } + + a1 += lda; + b += 8; + ii += 1; + } + + a += 8; + jj += 4; + j --; + } + + if (n & 2) { + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + + ii = 0; + + i = (m >> 1); + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + data03 = *(a1 + 2); + data04 = *(a1 + 3); + +#ifndef UNIT + data11 = *(a2 + 2); + data12 = *(a2 + 3); +#endif + + compinv(b + 0, data01, data02); + *(b + 2) = data03; + *(b + 3) = data04; + compinv(b + 6, data11, data12); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data09; + *(b + 5) = data10; + *(b + 6) = data11; + *(b + 7) = data12; + } + + a1 += 2 * lda; + a2 += 2 * lda; + b += 8; + + i --; + ii += 2; + } + + if (m & 1) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + compinv(b + 0, data01, data02); + *(b + 2) = data03; + *(b + 3) = data04; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + + a1 += lda; + b += 4; + ii += 1; + } + + a += 4; + jj += 2; + } + + if (n & 1) { + + a1 = a + 0 * lda; + + ii = 0; + + i = m; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + compinv(b + 0, data01, data02); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + + *(b + 0) = data01; + *(b + 1) = data02; + } + + a1 += lda; + b += 2; + + i --; + ii += 1; + } + + a += 2; + jj += 1; + } + + return 0; +} diff --git a/kernel/generic/ztrsm_ltcopy_8.c b/kernel/generic/ztrsm_ltcopy_8.c new file mode 100644 index 0000000000..899c9ab30c --- /dev/null +++ b/kernel/generic/ztrsm_ltcopy_8.c @@ -0,0 +1,210 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
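ztrsm_ltcopy_8.c, which begins here, takes the same row-at-a-time shape as ztrsm_lncopy_8.c above instead of the unrolled tiles of the narrower variants: the window test (ii >= jj) && (ii - jj < 8) picks out the rows that cross the current diagonal block, the diagonal element inside the window is inverted, and only the entries on the solved side of it are copied; rows with ii < jj are copied whole and rows past the window only advance the pointers. A compact restatement of the lt row handling, with a stand-in for compinv:

/* Per-row packing of ztrsm_ltcopy_8.c, restated as a sketch.  row points
   at 8 interleaved complex entries (stride 2), b at the 16-FLOAT slot. */
static void compinv_sketch(FLOAT *b, FLOAT ar, FLOAT ai) {
  FLOAT t = ar * ar + ai * ai;  b[0] = ar / t;  b[1] = -ai / t;
}

static void pack_row_lt8(const FLOAT *row, BLASLONG ii, BLASLONG jj, FLOAT *b) {
  BLASLONG d = ii - jj, k;
  if (d >= 0 && d < 8) {                    /* row crosses the diagonal   */
    compinv_sketch(b + 2 * d, row[2 * d], row[2 * d + 1]);
    for (k = d + 1; k < 8; k++) {           /* entries after the diagonal */
      b[2 * k + 0] = row[2 * k + 0];
      b[2 * k + 1] = row[2 * k + 1];
    }
  } else if (d < 0) {                       /* above the block: copy all  */
    for (k = 0; k < 8; k++) {
      b[2 * k + 0] = row[2 * k + 0];
      b[2 * k + 1] = row[2 * k + 1];
    }
  }                                         /* d >= 8: nothing is packed  */
}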
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj, k; + + FLOAT *a1; + FLOAT data1, data2; + + lda *= 2; + jj = offset; + + j = (n >> 3); + while (j > 0){ + + a1 = a; + a += 16; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 8)) { + + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + + for (k = ii - jj + 1; k < 8; k ++) { + *(b + k * 2 + 0) = *(a1 + k * 2 + 0); + *(b + k * 2 + 1) = *(a1 + k * 2 + 1); + } + + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + *(b + 4) = *(a1 + 4); + *(b + 5) = *(a1 + 5); + *(b + 6) = *(a1 + 6); + *(b + 7) = *(a1 + 7); + *(b + 8) = *(a1 + 8); + *(b + 9) = *(a1 + 9); + *(b + 10) = *(a1 + 10); + *(b + 11) = *(a1 + 11); + *(b + 12) = *(a1 + 12); + *(b + 13) = *(a1 + 13); + *(b + 14) = *(a1 + 14); + *(b + 15) = *(a1 + 15); + } + + b += 16; + a1 += lda; + ii ++; + } + + jj += 8; + j --; + } + + j = (n & 4); + if (j > 0) { + + a1 = a; + a += 8; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 4)) { + + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + + for (k = ii - jj + 1; k < 4; k ++) { + *(b + k * 2 + 0) = *(a1 + k * 2 + 0); + *(b + k * 2 + 1) = *(a1 + k * 2 + 1); + } + + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + *(b + 4) = *(a1 + 4); + *(b + 5) = *(a1 + 5); + *(b + 6) = *(a1 + 6); + *(b + 7) = *(a1 + 7); + } + + b += 8; + a1 += lda; + ii ++; + } + + jj += 4; + } + + j = (n & 2); + if (j > 0) { + + a1 = a; + a += 4; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 2)) { + + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + + for (k = ii - jj + 1; k < 2; k ++) { + *(b + k * 2 + 0) = *(a1 + k * 2 + 0); + *(b + k * 2 + 1) = *(a1 + k * 2 + 1); + } + + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + } + + b += 4; + a1 += lda; + ii ++; + } + + jj += 2; + } + + j = (n & 1); + if (j > 0) { + + a1 = a; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 1)) { + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 
1) = *(a1 + 1); + } + + b += 2; + a1 += lda; + ii ++; + } + } + + return 0; +} diff --git a/kernel/generic/ztrsm_uncopy_1.c b/kernel/generic/ztrsm_uncopy_1.c new file mode 100644 index 0000000000..0891300d19 --- /dev/null +++ b/kernel/generic/ztrsm_uncopy_1.c @@ -0,0 +1,90 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02; + FLOAT *a1; + + lda *= 2; + + jj = offset; + + j = n; + while (j > 0){ + + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + compinv(b + 0, data01, data02); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + *(b + 0) = data01; + *(b + 1) = data02; + } + + a1 += 2; + b += 2; + + i --; + ii ++; + } + + a += lda; + jj ++; + j --; + } + + return 0; +} diff --git a/kernel/generic/ztrsm_uncopy_2.c b/kernel/generic/ztrsm_uncopy_2.c new file mode 100644 index 0000000000..45c2093630 --- /dev/null +++ b/kernel/generic/ztrsm_uncopy_2.c @@ -0,0 +1,176 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. 
*/ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02, data03, data04; + FLOAT data05, data06, data07, data08; + FLOAT *a1, *a2; + + lda *= 2; + + jj = offset; + + j = (n >> 1); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + + i = (m >> 1); + ii = 0; + while (i > 0) { + + if (ii == jj) { + +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + + data05 = *(a2 + 0); + data06 = *(a2 + 1); +#ifndef UNIT + data07 = *(a2 + 2); + data08 = *(a2 + 3); +#endif + + compinv(b + 0, data01, data02); + *(b + 2) = data05; + *(b + 3) = data06; + compinv(b + 6, data07, data08); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a2 + 0); + data06 = *(a2 + 1); + data07 = *(a2 + 2); + data08 = *(a2 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data05; + *(b + 3) = data06; + *(b + 4) = data03; + *(b + 5) = data04; + *(b + 6) = data07; + *(b + 7) = data08; + } + + a1 += 4; + a2 += 4; + b += 8; + + i --; + ii += 2; + } + + if (m & 1) { + + if (ii== jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + data05 = *(a2 + 0); + data06 = *(a2 + 1); + + compinv(b + 0, data01, data02); + *(b + 2) = data05; + *(b + 3) = data06; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a2 + 0); + data04 = *(a2 + 1); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + b += 4; + } + + a += 2 * lda; + jj += 2; + j --; + } + + if (n & 1) { + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + compinv(b + 0, data01, data02); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + *(b + 0) = data01; + *(b + 1) = data02; + } + + a1+= 2; + b += 2; + i --; + ii += 1; + } + } + + return 0; +} diff --git a/kernel/generic/ztrsm_uncopy_4.c b/kernel/generic/ztrsm_uncopy_4.c new file mode 100644 
index 0000000000..9cbc6c729f --- /dev/null +++ b/kernel/generic/ztrsm_uncopy_4.c @@ -0,0 +1,496 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02, data03, data04; + FLOAT data05, data06, data07, data08; + FLOAT data09, data10, data11, data12; + FLOAT data13, data14, data15, data16; + FLOAT data17, data18, data19, data20; + FLOAT data21, data22, data23, data24; + FLOAT data25, data26, data27, data28; + FLOAT data29, data30, data31, data32; + + FLOAT *a1, *a2, *a3, *a4; + + lda *= 2; + + jj = offset; + + j = (n >> 2); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + + ii = 0; + + i = (m >> 2); + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + + data09 = *(a2 + 0); + data10 = *(a2 + 1); +#ifndef UNIT + data11 = *(a2 + 2); + data12 = *(a2 + 3); +#endif + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); +#ifndef UNIT + data21 = *(a3 + 4); + data22 = *(a3 + 5); +#endif + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + data29 = *(a4 + 4); + data30 = *(a4 + 5); +#ifndef UNIT + data31 = *(a4 + 6); + data32 = *(a4 + 7); +#endif + + compinv(b + 0, data01, data02); + *(b + 2) = data09; + *(b + 3) = data10; + *(b + 4) = data17; + *(b + 5) = data18; + *(b + 6) = data25; + *(b + 7) = data26; + + compinv(b + 10, data11, data12); + *(b + 12) = data19; + *(b + 13) = data20; + *(b + 14) = data27; + *(b + 15) = data28; + + compinv(b + 20, data21, data22); + *(b + 22) = data29; + *(b + 23) = data30; + compinv(b + 30, data31, data32); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + data07 = *(a1 + 6); + data08 = *(a1 + 7); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); + data15 = *(a2 + 6); + data16 = *(a2 + 7); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); + data21 = *(a3 + 4); + data22 = *(a3 + 5); + data23 = *(a3 + 6); + data24 = *(a3 + 7); + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + data29 = *(a4 + 4); + data30 = *(a4 + 5); + data31 = *(a4 + 6); + data32 = *(a4 + 7); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data09; + *(b + 3) = data10; + *(b + 4) = data17; + *(b + 5) = data18; + *(b + 6) = data25; + *(b + 7) = data26; + + *(b + 8) = data03; + *(b + 9) = data04; + *(b + 10) = data11; + *(b + 11) = data12; + *(b + 12) = data19; + *(b + 13) = data20; + *(b + 14) = data27; + *(b + 15) = data28; + + *(b + 16) = data05; + *(b + 17) = data06; + *(b + 18) = data13; + *(b + 19) = data14; + *(b + 20) = data21; + *(b + 21) = data22; + *(b + 22) = data29; + *(b + 23) = data30; + + *(b + 24) = data07; + *(b + 25) = data08; + *(b + 26) = data15; + *(b + 27) = data16; + *(b + 28) = data23; + *(b + 29) = data24; + *(b + 30) = data31; + *(b + 31) = data32; + } + + a1 += 8; + a2 += 8; + a3 += 8; + a4 += 8; + b += 32; + + i --; + ii += 4; + } + + if (m & 2) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + + data09 = *(a2 + 0); + data10 = *(a2 + 1); +#ifndef UNIT + data11 = *(a2 + 2); + data12 = *(a2 + 3); +#endif + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 
3); + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + + compinv(b + 0, data01, data02); + *(b + 2) = data09; + *(b + 3) = data10; + *(b + 4) = data17; + *(b + 5) = data18; + *(b + 6) = data25; + *(b + 7) = data26; + + compinv(b + 10, data11, data12); + *(b + 12) = data19; + *(b + 13) = data20; + *(b + 14) = data27; + *(b + 15) = data28; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data09; + *(b + 3) = data10; + *(b + 4) = data17; + *(b + 5) = data18; + *(b + 6) = data25; + *(b + 7) = data26; + + *(b + 8) = data03; + *(b + 9) = data04; + *(b + 10) = data11; + *(b + 11) = data12; + *(b + 12) = data19; + *(b + 13) = data20; + *(b + 14) = data27; + *(b + 15) = data28; + } + + a1 += 4; + a2 += 4; + a3 += 4; + a4 += 4; + b += 16; + + ii += 2; + } + + if (m & 1) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + + compinv(b + 0, data01, data02); + *(b + 2) = data09; + *(b + 3) = data10; + *(b + 4) = data17; + *(b + 5) = data18; + *(b + 6) = data25; + *(b + 7) = data26; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data09 = *(a2 + 0); + data10 = *(a2 + 1); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data25 = *(a4 + 0); + data26 = *(a4 + 1); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data09; + *(b + 3) = data10; + *(b + 4) = data17; + *(b + 5) = data18; + *(b + 6) = data25; + *(b + 7) = data26; + } + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + b += 8; + + ii += 1; + } + + a += 4 * lda; + jj += 4; + j --; + } + + if (n & 2) { + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + + ii = 0; + + i = (m >> 1); + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + + data09 = *(a2 + 0); + data10 = *(a2 + 1); +#ifndef UNIT + data11 = *(a2 + 2); + data12 = *(a2 + 3); +#endif + + compinv(b + 0, data01, data02); + *(b + 2) = data09; + *(b + 3) = data10; + compinv(b + 6, data11, data12); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data09; + *(b + 3) = data10; + *(b + 4) = data03; + *(b + 5) = data04; + *(b + 6) = data11; + *(b + 7) = data12; + } + + a1 += 4; + a2 += 4; + b += 8; + + i --; + ii += 2; + } + + if (m & 1) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + + compinv(b + 0, data01, data02); + *(b + 2) = data09; + *(b + 3) = data10; + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data09 = *(a2 + 0); + data10 = *(a2 + 1); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data09; + *(b + 3) = data10; + } + + a1 += 2; + a2 += 2; + b += 4; + + ii += 1; + } + + a += 2 *lda; + jj += 2; + } + + if (n & 1) { + + a1 = a + 0 * lda; + + ii = 0; + + i = m; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT 
+ data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + + compinv(b + 0, data01, data02); + } + + if (ii < jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + + *(b + 0) = data01; + *(b + 1) = data02; + } + + a1 += 2; + b += 2; + + i --; + ii += 1; + } + + a += lda; + jj += 1; + } + + return 0; +} diff --git a/kernel/generic/ztrsm_uncopy_8.c b/kernel/generic/ztrsm_uncopy_8.c new file mode 100644 index 0000000000..2ce1c72ca3 --- /dev/null +++ b/kernel/generic/ztrsm_uncopy_8.c @@ -0,0 +1,228 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj, k; + + FLOAT *a1, *a2, *a3, *a4, *a5, *a6, *a7, *a8; + FLOAT data1, data2; + + lda *= 2; + jj = offset; + + j = (n >> 3); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + a5 = a + 4 * lda; + a6 = a + 5 * lda; + a7 = a + 6 * lda; + a8 = a + 7 * lda; + + a += 8 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 8)) { + + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + + for (k = ii - jj + 1; k < 8; k ++) { + *(b + k * 2 + 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a2 + 0); + *(b + 3) = *(a2 + 1); + *(b + 4) = *(a3 + 0); + *(b + 5) = *(a3 + 1); + *(b + 6) = *(a4 + 0); + *(b + 7) = *(a4 + 1); + *(b + 8) = *(a5 + 0); + *(b + 9) = *(a5 + 1); + *(b + 10) = *(a6 + 0); + *(b + 11) = *(a6 + 1); + *(b + 12) = *(a7 + 0); + *(b + 13) = *(a7 + 1); + *(b + 14) = *(a8 + 0); + *(b + 15) = *(a8 + 1); + } + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + a5 += 2; + a6 += 2; + a7 += 2; + a8 += 2; + b += 16; + ii ++; + } + + jj += 8; + j --; + } + + if (n & 4) { + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + a += 4 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 4)) { + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + + for (k = ii - jj + 1; k < 4; k ++) { + *(b + k * 2 + 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a2 + 0); + *(b + 3) = *(a2 + 1); + *(b + 4) = *(a3 + 0); + *(b + 5) = *(a3 + 1); + *(b + 6) = *(a4 + 0); + *(b + 7) = *(a4 + 1); + } + + a1 += 2; + a2 += 2; + a3 += 2; + a4 += 2; + b += 8; + ii ++; + } + + jj += 4; + } + + if (n & 2) { + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a += 2 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 2)) { + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + for (k = ii - jj + 1; k < 2; k ++) { + *(b + k * 2 + 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a2 + 0); + *(b + 3) = *(a2 + 1); + } + + a1 += 2; + a2 += 2; + b += 4; + ii ++; + } + + jj += 2; + } + + if (n & 1) { + + a1 = a + 0 * lda; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 1)) { + data1 = *(a1 + (ii - jj) * lda + 0); + data2 = *(a1 + (ii - jj) * lda + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + for (k = ii - jj + 1; k < 1; k ++) { + *(b + k * 2 + 0) = *(a1 + k * lda + 0); + *(b + k * 2 + 1) = *(a1 + k * lda + 1); + } + } + + if (ii - jj < 0) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + } + + a1 += 2; + b += 2; + ii ++; + } + } + + return 0; +} diff --git a/kernel/generic/ztrsm_utcopy_1.c b/kernel/generic/ztrsm_utcopy_1.c new file mode 100644 index 0000000000..42ecc471b0 --- /dev/null +++ b/kernel/generic/ztrsm_utcopy_1.c @@ -0,0 +1,90 @@ 
+/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02; + FLOAT *a1; + + lda *= 2; + + jj = offset; + + j = (n); + while (j > 0){ + + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + compinv(b + 0, data01, data02); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + *(b + 0) = data01; + *(b + 1) = data02; + } + + a1 += lda; + b += 2; + + i --; + ii ++; + } + + a += 2; + jj ++; + j --; + } + + return 0; +} diff --git a/kernel/generic/ztrsm_utcopy_2.c b/kernel/generic/ztrsm_utcopy_2.c new file mode 100644 index 0000000000..fd7affb3f4 --- /dev/null +++ b/kernel/generic/ztrsm_utcopy_2.c @@ -0,0 +1,171 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02, data03, data04; + FLOAT data05, data06, data07, data08; + FLOAT *a1, *a2; + + lda *= 2; + + jj = offset; + + j = (n >> 1); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + + i = (m >> 1); + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + + data05 = *(a2 + 0); + data06 = *(a2 + 1); +#ifndef UNIT + data07 = *(a2 + 2); + data08 = *(a2 + 3); +#endif + + compinv(b + 0, data01, data02); + *(b + 4) = data05; + *(b + 5) = data06; + compinv(b + 6, data07, data08); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a2 + 0); + data06 = *(a2 + 1); + data07 = *(a2 + 2); + data08 = *(a2 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + } + + a1 += 2 * lda; + a2 += 2 * lda; + b += 8; + + i --; + ii += 2; + } + + if ((m & 1) != 0) { + + if (ii== jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + compinv(b, data01, data02); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + b += 4; + } + + a += 4; + jj += 2; + j --; + } + + if (n & 1) { + a1 = a + 0 * lda; + + i = m; + ii = 0; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + compinv(b, data01, data02); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + *(b + 0) = data01; + *(b + 1) = data02; + } + + a1 += 1 * lda; + b += 2; + + i --; + ii += 1; + } + } + + return 0; +} diff --git a/kernel/generic/ztrsm_utcopy_4.c b/kernel/generic/ztrsm_utcopy_4.c new file mode 100644 index 0000000000..fd3483c103 --- /dev/null +++ b/kernel/generic/ztrsm_utcopy_4.c @@ -0,0 +1,444 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj; + + FLOAT data01, data02, data03, data04; + FLOAT data05, data06, data07, data08; + FLOAT data09, data10, data11, data12; + FLOAT data13, data14, data15, data16; + FLOAT data17, data18, data19, data20; + FLOAT data21, data22, data23, data24; + FLOAT data25, data26, data27, data28; + FLOAT data29, data30, data31, data32; + + FLOAT *a1, *a2, *a3, *a4; + + lda *= 2; + + jj = offset; + + j = (n >> 2); + while (j > 0){ + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + a3 = a + 2 * lda; + a4 = a + 3 * lda; + + ii = 0; + + i = (m >> 2); + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + + data09 = *(a2 + 0); + data10 = *(a2 + 1); +#ifndef UNIT + data11 = *(a2 + 2); + data12 = *(a2 + 3); +#endif + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); +#ifndef UNIT + data21 = *(a3 + 4); + data22 = *(a3 + 5); +#endif + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + data29 = *(a4 + 4); + data30 = *(a4 + 5); +#ifndef UNIT + data31 = *(a4 + 6); + data32 = *(a4 + 7); +#endif + + compinv(b + 0, data01, data02); + *(b + 8) = data09; + *(b + 9) = data10; + compinv(b + 10, data11, data12); + *(b + 16) = data17; + *(b + 17) = data18; + *(b + 18) = data19; + *(b + 19) = data20; + compinv(b + 20, data21, data22); + *(b + 24) = data25; + *(b + 25) = data26; + *(b + 26) = data27; + *(b + 27) = data28; + *(b + 28) = data29; + *(b + 29) = data30; + compinv(b + 30, data31, data32); + + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = 
*(a1 + 4); + data06 = *(a1 + 5); + data07 = *(a1 + 6); + data08 = *(a1 + 7); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); + data15 = *(a2 + 6); + data16 = *(a2 + 7); + + data17 = *(a3 + 0); + data18 = *(a3 + 1); + data19 = *(a3 + 2); + data20 = *(a3 + 3); + data21 = *(a3 + 4); + data22 = *(a3 + 5); + data23 = *(a3 + 6); + data24 = *(a3 + 7); + + data25 = *(a4 + 0); + data26 = *(a4 + 1); + data27 = *(a4 + 2); + data28 = *(a4 + 3); + data29 = *(a4 + 4); + data30 = *(a4 + 5); + data31 = *(a4 + 6); + data32 = *(a4 + 7); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + + *(b + 8) = data09; + *(b + 9) = data10; + *(b + 10) = data11; + *(b + 11) = data12; + *(b + 12) = data13; + *(b + 13) = data14; + *(b + 14) = data15; + *(b + 15) = data16; + + *(b + 16) = data17; + *(b + 17) = data18; + *(b + 18) = data19; + *(b + 19) = data20; + *(b + 20) = data21; + *(b + 21) = data22; + *(b + 22) = data23; + *(b + 23) = data24; + + *(b + 24) = data25; + *(b + 25) = data26; + *(b + 26) = data27; + *(b + 27) = data28; + *(b + 28) = data29; + *(b + 29) = data30; + *(b + 30) = data31; + *(b + 31) = data32; + } + + a1 += 4 * lda; + a2 += 4 * lda; + a3 += 4 * lda; + a4 += 4 * lda; + b += 32; + + i --; + ii += 4; + } + + if (m & 2) { + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + + data09 = *(a2 + 0); + data10 = *(a2 + 1); +#ifndef UNIT + data11 = *(a2 + 2); + data12 = *(a2 + 3); +#endif + + compinv(b + 0, data01, data02); + *(b + 8) = data09; + *(b + 9) = data10; + compinv(b + 10, data11, data12); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + data07 = *(a1 + 6); + data08 = *(a1 + 7); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + data13 = *(a2 + 4); + data14 = *(a2 + 5); + data15 = *(a2 + 6); + data16 = *(a2 + 7); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + + *(b + 8) = data09; + *(b + 9) = data10; + *(b + 10) = data11; + *(b + 11) = data12; + *(b + 12) = data13; + *(b + 13) = data14; + *(b + 14) = data15; + *(b + 15) = data16; + } + + a1 += 2 * lda; + a2 += 2 * lda; + b += 16; + + ii += 2; + } + + if (m & 1) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + + compinv(b + 0, data01, data02); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + data05 = *(a1 + 4); + data06 = *(a1 + 5); + data07 = *(a1 + 6); + data08 = *(a1 + 7); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data05; + *(b + 5) = data06; + *(b + 6) = data07; + *(b + 7) = data08; + } + + a1 += lda; + b += 8; + + ii += 1; + } + + a += 8; + jj += 4; + j --; + } + + if (n & 2) { + + a1 = a + 0 * lda; + a2 = a + 1 * lda; + + ii = 0; + + i = (m >> 1); + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + + data09 = *(a2 + 0); + data10 = *(a2 + 1); +#ifndef UNIT + data11 = *(a2 + 2); + data12 = *(a2 + 3); +#endif + + compinv(b + 0, data01, data02); + *(b + 4) = data09; + *(b + 5) = data10; + compinv(b + 6, data11, data12); + } + + 
if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + data09 = *(a2 + 0); + data10 = *(a2 + 1); + data11 = *(a2 + 2); + data12 = *(a2 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + *(b + 4) = data09; + *(b + 5) = data10; + *(b + 6) = data11; + *(b + 7) = data12; + } + + a1 += 2 * lda; + a2 += 2 * lda; + b += 8; + + i --; + ii += 2; + } + + if (m & 1) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + compinv(b + 0, data01, data02); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + data03 = *(a1 + 2); + data04 = *(a1 + 3); + + *(b + 0) = data01; + *(b + 1) = data02; + *(b + 2) = data03; + *(b + 3) = data04; + } + + a1 += lda; + b += 4; + + ii += 1; + } + + a += 4; + jj += 2; + j --; + } + + if (n & 1) { + + a1 = a + 0 * lda; + + ii = 0; + + i = m; + while (i > 0) { + + if (ii == jj) { +#ifndef UNIT + data01 = *(a1 + 0); + data02 = *(a1 + 1); +#endif + compinv(b + 0, data01, data02); + } + + if (ii > jj) { + data01 = *(a1 + 0); + data02 = *(a1 + 1); + *(b + 0) = data01; + *(b + 1) = data02; + } + + a1 += lda; + b += 2; + + i --; + ii += 1; + } + + a += 2; + jj += 1; + } + + return 0; +} diff --git a/kernel/generic/ztrsm_utcopy_8.c b/kernel/generic/ztrsm_utcopy_8.c new file mode 100644 index 0000000000..52c7ed5a32 --- /dev/null +++ b/kernel/generic/ztrsm_utcopy_8.c @@ -0,0 +1,209 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ + + BLASLONG i, ii, j, jj, k; + + FLOAT *a1, data1, data2; + + lda *= 2; + + jj = offset; + + j = (n >> 3); + while (j > 0){ + + a1 = a; + a += 16; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 8)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * 2 + 0); + *(b + k * 2 + 1) = *(a1 + k * 2 + 1); + } + + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj >= 8) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + *(b + 4) = *(a1 + 4); + *(b + 5) = *(a1 + 5); + *(b + 6) = *(a1 + 6); + *(b + 7) = *(a1 + 7); + *(b + 8) = *(a1 + 8); + *(b + 9) = *(a1 + 9); + *(b + 10) = *(a1 + 10); + *(b + 11) = *(a1 + 11); + *(b + 12) = *(a1 + 12); + *(b + 13) = *(a1 + 13); + *(b + 14) = *(a1 + 14); + *(b + 15) = *(a1 + 15); + } + + b += 16; + a1 += lda; + ii ++; + } + + jj += 8; + j --; + } + + j = (n & 4); + if (j > 0) { + + a1 = a; + a += 8; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 4)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * 2 + 0); + *(b + k * 2 + 1) = *(a1 + k * 2 + 1); + } + + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj >= 4) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + *(b + 4) = *(a1 + 4); + *(b + 5) = *(a1 + 5); + *(b + 6) = *(a1 + 6); + *(b + 7) = *(a1 + 7); + } + + b += 8; + a1 += lda; + ii ++; + } + + jj += 4; + } + + j = (n & 2); + if (j > 0) { + + a1 = a; + a += 4; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 2)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * 2 + 0); + *(b + k * 2 + 1) = *(a1 + k * 2 + 1); + } + + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj >= 2) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + *(b + 2) = *(a1 + 2); + *(b + 3) = *(a1 + 3); + } + + b += 4; + a1 += lda; + ii ++; + } + + jj += 2; + } + + j = (n & 1); + if (j > 0) { + + a1 = a; + ii = 0; + + for (i = 0; i < m; i++) { + + if ((ii >= jj ) && (ii - jj < 1)) { + for (k = 0; k < ii - jj; k ++) { + *(b + k * 2 + 0) = *(a1 + k * 2 + 0); + *(b + k * 2 + 1) = *(a1 + k * 2 + 1); + } + + data1 = *(a1 + (ii - jj) * 2 + 0); + data2 = *(a1 + (ii - jj) * 2 + 1); + + compinv(b + (ii - jj) * 2, data1, data2); + } + + if (ii - jj >= 1) { + *(b + 0) = *(a1 + 0); + *(b + 1) = *(a1 + 1); + } + + b += 2; + a1 += lda; + ii ++; + } + } + + return 0; +} diff --git a/kernel/ia64/KERNEL b/kernel/ia64/KERNEL new file mode 100644 index 0000000000..10a7e61e25 --- /dev/null +++ b/kernel/ia64/KERNEL @@ -0,0 +1,140 @@ +SAXPYKERNEL = saxpy.S +DAXPYKERNEL = daxpy.S +QAXPYKERNEL = qaxpy.S +CAXPYKERNEL = caxpy.S +ZAXPYKERNEL = zaxpy.S +XAXPYKERNEL = zaxpy.S + +SDOTKERNEL = sdot.S +DDOTKERNEL = ddot.S +QDOTKERNEL = qdot.S +CDOTKERNEL = zdot.S +ZDOTKERNEL = zdot.S +XDOTKERNEL = xdot.S + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +QAMAXKERNEL = amax.S +CAMAXKERNEL = izamax.S +ZAMAXKERNEL = izamax.S +XAMAXKERNEL = izamax.S + +SAMINKERNEL = amax.S +DAMINKERNEL = amax.S +QAMINKERNEL = amax.S +CAMINKERNEL = izamax.S +ZAMINKERNEL = 
izamax.S +XAMINKERNEL = izamax.S + +SMAXKERNEL = amax.S +DMAXKERNEL = amax.S +QMAXKERNEL = amax.S + +SMINKERNEL = amax.S +DMINKERNEL = amax.S +QMINKERNEL = amax.S + +ISAMAXKERNEL = iamax.S +IDAMAXKERNEL = iamax.S +IQAMAXKERNEL = iamax.S +ICAMAXKERNEL = izamax.S +IZAMAXKERNEL = izamax.S +IXAMAXKERNEL = izamax.S + +ISAMINKERNEL = iamax.S +IDAMINKERNEL = iamax.S +IQAMINKERNEL = iamax.S +ICAMINKERNEL = izamax.S +IZAMINKERNEL = izamax.S +IXAMINKERNEL = izamax.S + +ISMAXKERNEL = iamax.S +IDMAXKERNEL = iamax.S +IQMAXKERNEL = iamax.S + +ISMINKERNEL = iamax.S +IDMINKERNEL = iamax.S +IQMINKERNEL = iamax.S + +CASUMKERNEL = asum.S +ZASUMKERNEL = asum.S +XASUMKERNEL = asum.S + +CNRM2KERNEL = nrm2.S +ZNRM2KERNEL = nrm2.S +XNRM2KERNEL = nrm2.S + +QCOPYKERNEL = qcopy.S +XCOPYKERNEL = xcopy.S + +QSCALKERNEL = qscal.S + +QGEMVNKERNEL = qgemv_n.S +QGEMVTKERNEL = qgemv_t.S +XGEMVNKERNEL = xgemv_n.S +XGEMVTKERNEL = xgemv_t.S + +SGEMMKERNEL = gemm_kernel.S +SGEMM_BETA = gemm_beta.S +SGEMMONCOPY = gemm_ncopy.S +SGEMMOTCOPY = gemm_tcopy.S +SGEMMONCOPYOBJ = sgemm_oncopy.$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy.$(SUFFIX) + +DGEMMKERNEL = gemm_kernel.S +DGEMM_BETA = gemm_beta.S +DGEMMONCOPY = gemm_ncopy.S +DGEMMOTCOPY = gemm_tcopy.S +DGEMMONCOPYOBJ = dgemm_oncopy.$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy.$(SUFFIX) + +QGEMMKERNEL = qgemm_kernel.S +QGEMM_BETA = ../generic/gemm_beta.c +QGEMMONCOPY = ../generic/gemm_ncopy_8.c +QGEMMOTCOPY = ../generic/gemm_tcopy_8.c +QGEMMONCOPYOBJ = qgemm_oncopy.$(SUFFIX) +QGEMMOTCOPYOBJ = qgemm_otcopy.$(SUFFIX) + +CGEMMKERNEL = zgemm_kernel.S +CGEMM_BETA = zgemm_beta.S +CGEMMONCOPY = zgemm_ncopy.S +CGEMMOTCOPY = zgemm_tcopy.S +CGEMMONCOPYOBJ = cgemm_oncopy.$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy.$(SUFFIX) + +ZGEMMKERNEL = zgemm_kernel.S +ZGEMM_BETA = zgemm_beta.S +ZGEMMONCOPY = zgemm_ncopy.S +ZGEMMOTCOPY = zgemm_tcopy.S +ZGEMMONCOPYOBJ = zgemm_oncopy.$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy.$(SUFFIX) + +XGEMMKERNEL = zgemm_kernel.S +XGEMM_BETA = ../generic/zgemm_beta.c +XGEMMONCOPY = ../generic/zgemm_ncopy_4.c +XGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +XGEMMONCOPYOBJ = xgemm_oncopy.$(SUFFIX) +XGEMMOTCOPYOBJ = xgemm_otcopy.$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN.S +STRSMKERNEL_LT = trsm_kernel_LT.S +STRSMKERNEL_RN = trsm_kernel_LT.S +STRSMKERNEL_RT = trsm_kernel_RT.S + +DTRSMKERNEL_LN = trsm_kernel_LN.S +DTRSMKERNEL_LT = trsm_kernel_LT.S +DTRSMKERNEL_RN = trsm_kernel_LT.S +DTRSMKERNEL_RT = trsm_kernel_RT.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN.S +CTRSMKERNEL_LT = ztrsm_kernel_LT.S +CTRSMKERNEL_RN = ztrsm_kernel_LT.S +CTRSMKERNEL_RT = ztrsm_kernel_RT.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LN.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT.S + +CGEMM3MKERNEL = zgemm3m_kernel.S +ZGEMM3MKERNEL = zgemm3m_kernel.S diff --git a/kernel/ia64/Makefile b/kernel/ia64/Makefile new file mode 100644 index 0000000000..520349bd69 --- /dev/null +++ b/kernel/ia64/Makefile @@ -0,0 +1 @@ +clean :: diff --git a/kernel/ia64/amax.S b/kernel/ia64/amax.S new file mode 100644 index 0000000000..fae96f12b5 --- /dev/null +++ b/kernel/ia64/amax.S @@ -0,0 +1,396 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef XDOUBLE +#define PREFETCH_SIZE ( 8 * 16 + 4) +#elif defined(DOUBLE) +#define PREFETCH_SIZE (16 * 16 + 8) +#else +#define PREFETCH_SIZE (32 * 16 + 16) +#endif + +#if !defined(USE_MIN) && defined(USE_ABS) +#define FMAX famax +#elif !defined(USE_MIN) && !defined(USE_ABS) +#define FMAX fmax +#elif defined(USE_MIN) && defined(USE_ABS) +#define FMAX famin +#else +#define FMAX fmin +#endif + +#define RET r8 + +#define N r32 +#define DX r33 +#define INCX r34 + +#define PRE1 r2 +#define J r14 +#define K r15 +#define X2 r16 +#define X3 r17 +#define INCX5 r18 +#define INCX16 r19 + +#define DMAX1 f8 +#define DMAX2 f9 +#define DMAX3 f10 +#define DMAX4 f11 +#define DMAX5 f12 +#define DMAX6 f13 +#define DMAX7 f14 +#define DMAX8 f15 + +#define PR r30 +#define ARLC r31 + + PROLOGUE + .prologue + PROFCODE + { .mfi + mov RET = 0 + mov DMAX1 = f0 + .save ar.lc, ARLC + mov ARLC = ar.lc + } + ;; + .body + +#ifdef F_INTERFACE + { .mmi + LDINT N = [N] + LDINT INCX = [INCX] + nop.i 0 + } + ;; +#ifndef USE64BITINT + { .mii + nop.m 0 + sxt4 N = N + sxt4 INCX = INCX + } + ;; +#endif +#endif + { .mii + mov PR = pr + cmp.ge p6, p0 = 0, INCX + } + { .mbb + cmp.ge p8, p0 = 0, N + (p8) br.ret.sptk.many b0 + (p6) br.ret.sptk.many b0 + } + ;; + { .mmi + LDFD DMAX1 = [DX] + shladd INCX = INCX, BASE_SHIFT, r0 + mov pr.rot= 0 + } + ;; + { .mmf + add DX = DX, INCX + adds K = -1, N + mov DMAX2 = DMAX1 + } + ;; + { .mfi + shladd X2 = INCX, 2, DX + mov DMAX5 = DMAX1 + shr J = K, 4 + } + { .mmf + cmp.eq p16, p0 = r0, r0 + nop.m 0 + mov DMAX6 = DMAX1 + } + ;; + { .mfi + shladd INCX5 = INCX, 2, INCX + mov DMAX3 = DMAX1 + mov ar.ec= 4 + } + { .mmf +#ifdef XDOUBLE + shladd INCX16= INCX, 3, r0 +#else + shladd INCX16= INCX, 4, r0 +#endif + adds J = -1, J + mov DMAX7 = DMAX1 + } + ;; + { .mfi + adds PRE1 = PREFETCH_SIZE * SIZE, DX + mov DMAX4 = DMAX1 + mov ar.lc = J + } + { .mfb + cmp.eq p7 ,p0 = -1, J + mov DMAX8 = DMAX1 + (p7) 
br.cond.dpnt .L15 + } + .align 32 + ;; +.L10: + { .mmf + (p16) lfetch.nt1 [PRE1], INCX16 + (p16) LDFD f32 = [DX], INCX + (p19) FMAX DMAX1 = f35, DMAX1 + } + { .mmf + (p16) LDFD f48 = [X2], INCX + nop.m 0 + (p19) FMAX DMAX5 = f51, DMAX5 + } + ;; + { .mmf + (p16) LDFD f36 = [DX], INCX + nop.m 0 + (p19) FMAX DMAX2 = f39, DMAX2 + } + { .mmf + (p16) LDFD f52 = [X2], INCX + nop.m 0 + (p19) FMAX DMAX6 = f55, DMAX6 + } + ;; + { .mmf + (p16) LDFD f40 = [DX], INCX + nop.m 0 + (p19) FMAX DMAX3 = f43, DMAX3 + } + { .mmf + (p16) LDFD f56 = [X2], INCX + nop.m 0 + (p19) FMAX DMAX7 = f59, DMAX7 + } + ;; + { .mmf + (p16) LDFD f44 = [DX], INCX5 + nop.m 0 + (p19) FMAX DMAX4 = f47, DMAX4 + } + { .mmf + (p16) LDFD f60 = [X2], INCX5 + nop.m 0 + (p19) FMAX DMAX8 = f63, DMAX8 + } + ;; + { .mmf +#ifdef XDOUBLE + (p16) lfetch.nt1 [PRE1], INCX16 +#endif + (p16) LDFD f64 = [DX], INCX +#ifndef XDOUBLE + nop.m 0 +#endif + (p19) FMAX DMAX1 = f67, DMAX1 + } + { .mmf + (p16) LDFD f80 = [X2], INCX + nop.m 0 + (p19) FMAX DMAX5 = f83, DMAX5 + } + ;; + { .mmf + (p16) LDFD f68 = [DX], INCX + nop.m 0 + (p19) FMAX DMAX2 = f71, DMAX2 + } + { .mmf + (p16) LDFD f84 = [X2], INCX + nop.m 0 + (p19) FMAX DMAX6 = f87, DMAX6 + } + ;; + { .mmf + (p16) LDFD f72 = [DX], INCX + nop.m 0 + (p19) FMAX DMAX3 = f75, DMAX3 + } + { .mmf + (p16) LDFD f88 = [X2], INCX + nop.m 0 + (p19) FMAX DMAX7 = f91, DMAX7 + } + ;; + { .mmf + (p16) LDFD f76 = [DX], INCX5 + nop.m 0 + (p19) FMAX DMAX4 = f79, DMAX4 + } + { .mfb + (p16) LDFD f92 = [X2], INCX5 + (p19) FMAX DMAX8 = f95, DMAX8 + br.ctop.sptk.few .L10 + } + .align 32 + ;; +.L15: + and J = 15, K + tbit.z p0, p12 = K, 3 + mov X3 = DX + ;; + { .mmi + (p12) LDFD f32 = [DX], INCX + (p12) LDFD f36 = [X2], INCX + tbit.z p0, p13 = K, 2 + } + { .mib + cmp.eq p8 ,p0 = r0, J + tbit.z p0, p14 = K, 1 + (p8) br.cond.dpnt .L99 + } + ;; + { .mmi + (p12) LDFD f33 = [DX], INCX + (p12) LDFD f37 = [X2], INCX + tbit.z p0, p15 = K, 0 + } + ;; + { .mmi + (p12) LDFD f34 = [DX], INCX + (p12) LDFD f38 = [X2], INCX + (p12) shladd X3 = INCX, 3, X3 + } + ;; + { .mmi + (p12) LDFD f35 = [DX], INCX5 + (p12) LDFD f39 = [X2], INCX5 + (p13) shladd X3 = INCX, 2, X3 + } + ;; + { .mmi + (p13) LDFD f40 = [DX], INCX + (p14) LDFD f44 = [X3], INCX + nop.i 0 + } + ;; + { .mmi + (p13) LDFD f41 = [DX], INCX + (p14) LDFD f45 = [X3], INCX + nop.i 0 + } + ;; + { .mmf + (p13) LDFD f42 = [DX], INCX + nop.m 0 + (p12) FMAX DMAX1 = f32, DMAX1 + } + { .mmf + (p15) LDFD f46 = [X3], INCX + nop.m 0 + (p12) FMAX DMAX5 = f36, DMAX5 + } + ;; + { .mmf + (p13) LDFD f43 = [DX], INCX + nop.m 0 + (p12) FMAX DMAX2 = f33, DMAX2 + } + (p12) FMAX DMAX6 = f37, DMAX6 + (p12) FMAX DMAX3 = f34, DMAX3 + (p12) FMAX DMAX7 = f38, DMAX7 + (p12) FMAX DMAX4 = f35, DMAX4 + (p12) FMAX DMAX8 = f39, DMAX8 + ;; + (p13) FMAX DMAX1 = f40, DMAX1 + (p14) FMAX DMAX5 = f44, DMAX5 + (p13) FMAX DMAX2 = f41, DMAX2 + (p14) FMAX DMAX6 = f45, DMAX6 + (p13) FMAX DMAX3 = f42, DMAX3 + (p15) FMAX DMAX7 = f46, DMAX7 + (p13) FMAX DMAX4 = f43, DMAX4 + ;; + .align 32 + +.L99: + { .mfi + nop.m 0 + FMAX DMAX1 = DMAX5, DMAX1 + mov ar.lc = ARLC + } + { .mmf + nop.m 0 + nop.m 0 + FMAX DMAX2 = DMAX6, DMAX2 + } + ;; + { .mfi + nop.m 0 + FMAX DMAX3 = DMAX7, DMAX3 + mov pr = PR, -65474 + } + { .mmf + nop.m 0 + nop.m 0 + FMAX DMAX4 = DMAX8, DMAX4 + } + ;; + { .mmf + FMAX DMAX1 = DMAX2, DMAX1 + } + { .mmf + FMAX DMAX3 = DMAX4, DMAX3 + } + ;; +#ifndef USE_ABS + { .mfb + FMAX DMAX1 = DMAX3, DMAX1 + br.ret.sptk.many b0 + } +#else + { .mmf + FMAX DMAX1 = DMAX3, DMAX1 + } + ;; + { .mfb + fabs DMAX1 = DMAX1 + 
br.ret.sptk.many b0 + } +#endif + ;; + EPILOGUE + + + diff --git a/kernel/ia64/asum.S b/kernel/ia64/asum.S new file mode 100644 index 0000000000..6114f57ed2 --- /dev/null +++ b/kernel/ia64/asum.S @@ -0,0 +1,388 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef XDOUBLE +#define PREFETCH_SIZE ( 8 * 16 + 4) +#elif defined(DOUBLE) +#define PREFETCH_SIZE (16 * 16 + 8) +#else +#define PREFETCH_SIZE (32 * 16 + 16) +#endif + +#ifndef COMPLEX +#define COMPADD 0 +#define STRIDE INCX +#else +#define COMPADD 1 +#define STRIDE SIZE +#endif + +#define PRE1 r2 + +#define I r17 +#define J r18 +#define INCX16 r21 + +#define PR r30 +#define ARLC r31 + +#define N r32 +#define X r33 +#define INCX r34 + + + PROLOGUE + .prologue + PROFCODE + { .mfi + adds PRE1 = PREFETCH_SIZE * SIZE, X + mov f8 = f0 + .save ar.lc, ARLC + mov ARLC = ar.lc + } + ;; + .body +#ifdef F_INTERFACE + { .mmi + LDINT N = [N] + LDINT INCX = [INCX] + nop.i 0 + } + ;; +#ifndef USE64BITINT + { .mii + nop.m 0 + sxt4 N = N + sxt4 INCX = INCX + } + ;; +#endif +#endif + { .mmi + cmp.lt p0, p6 = r0, INCX + cmp.lt p0, p7 = r0, N + shr I = N, (4 - COMPADD) + } + { .mbb + and J = ((1 << (4 - COMPADD)) - 1), N + (p6) br.ret.sptk.many b0 + (p7) br.ret.sptk.many b0 + } + ;; + { .mfi + adds I = -1, I + mov f10 = f0 + mov PR = pr + } + { .mfi + cmp.eq p9, p0 = r0, J + mov f9 = f0 + tbit.z p0, p12 = N, 3 - COMPADD + } + ;; + { .mmi + cmp.eq p16, p0 = r0, r0 + cmp.ne p17, p0 = r0, r0 + mov ar.ec= 3 + } + { .mfi + cmp.ne p18, p0 = r0, r0 + mov f11 = f0 + shl INCX = INCX, BASE_SHIFT + COMPADD + } + ;; + { .mmi +#ifdef XDOUBLE + shladd INCX16 = INCX, (3 - COMPADD), r0 +#else + shladd INCX16 = INCX, (4 - COMPADD), r0 +#endif + cmp.ne p19, p0 = r0, r0 + mov ar.lc = I + } + { .mmb + cmp.gt p8 ,p0 = r0, I +#ifdef COMPLEX + adds INCX = - SIZE, INCX +#else + nop.m 0 +#endif + (p8) br.cond.dpnt .L55 + } + ;; + .align 32 + +.L52: + { .mmf + (p16) lfetch.nt1 [PRE1], INCX16 + (p16) LDFD f32 = [X], STRIDE + (p18) fabs f34 = f34 + } + { .mfb + (p19) FADD f8 = f8, f71 + } + ;; + { .mmf + (p16) LDFD f35 = [X], INCX + (p18) fabs f37 = f37 + } + { .mfb + (p19) FADD f9 = f9, f74 + } + ;; + { .mmf + (p16) LDFD f38 = [X], STRIDE + (p18) fabs f40 = f40 + } + { .mfb + (p19) FADD f10 = f10, f77 + } + ;; + { .mmf + (p16) LDFD f41 = [X], INCX + (p18) fabs f43 = f43 + } + { .mfb + (p19) FADD f11 = f11, f80 + } + ;; + { .mmf + (p16) LDFD f44 = [X], STRIDE + (p18) fabs f46 = f46 + } + { .mfb + (p18) FADD f8 = f8, f34 + } + ;; + { .mmf + (p16) LDFD f47 = [X], INCX + (p18) fabs f49 = f49 + } + { .mfb + (p18) FADD f9 = f9, f37 + } + ;; + { .mmf + (p16) LDFD f50 = [X], STRIDE + (p18) fabs f52 = f52 + } + { .mfb + (p18) FADD f10 = f10, f40 + } + ;; + { .mmf + (p16) LDFD f53 = [X], INCX + (p18) fabs f55 = f55 + } + { .mfb + (p18) FADD f11 = f11, f43 + } + ;; + { .mmf +#ifdef XDOUBLE + (p16) lfetch.nt1 [PRE1], INCX16 +#endif + (p16) LDFD f56 = [X], STRIDE + (p18) fabs f58 = f58 + } + { .mfb + (p18) FADD f8 = f8, f46 + } + ;; + { .mmf + (p16) LDFD f59 = [X], INCX + (p18) fabs f61 = f61 + } + { .mfb + (p18) FADD f9 = f9, f49 + } + ;; + { .mmf + (p16) LDFD f62 = [X], STRIDE + (p18) fabs f64 = f64 + } + { .mfb + (p18) FADD f10 = f10, f52 + } + ;; + { .mmf + (p16) LDFD f65 = [X], INCX + (p18) fabs f67 = f67 + } + { .mfb + (p18) FADD f11 = f11, f55 + } + ;; + { .mmf + (p16) LDFD f68 = [X], STRIDE + (p18) fabs f70 = f70 + } + { .mfb + (p18) FADD f8 = f8, f58 + } + ;; + { .mmf + (p16) LDFD f71 = [X], INCX + (p18) fabs f73 = f73 + } + { .mfb + (p18) FADD f9 = f9, f61 + } + ;; + { .mmf + (p16) LDFD f74 = [X], STRIDE + (p18) fabs f76 = f76 + } + { .mfb + (p18) FADD f10 = f10, f64 + } + ;; + { .mmf + (p16) LDFD f77 = [X], INCX + (p18) 
fabs f79 = f79 + } + { .mfb + (p18) FADD f11 = f11, f67 + br.ctop.sptk.few .L52 + } + ;; + FADD f8 = f8, f71 + FADD f9 = f9, f74 + FADD f10 = f10, f77 + FADD f11 = f11, f80 + .align 32 + ;; +.L55: + (p12) LDFD f32 = [X], STRIDE + (p9) br.cond.dptk .L998 + ;; + (p12) LDFD f33 = [X], INCX + ;; + (p12) LDFD f34 = [X], STRIDE + ;; + (p12) LDFD f35 = [X], INCX + tbit.z p0, p13 = N, (2 - COMPADD) + ;; + (p12) LDFD f36 = [X], STRIDE + tbit.z p0, p14 = N, (1 - COMPADD) + ;; + (p12) LDFD f37 = [X], INCX +#ifndef COMPLEX + tbit.z p0, p15 = N, 0 +#endif + ;; + (p12) LDFD f38 = [X], STRIDE + (p12) fabs f32 = f32 + ;; + (p12) LDFD f39 = [X], INCX + (p12) fabs f33 = f33 + ;; + (p13) LDFD f40 = [X], STRIDE + (p12) fabs f34 = f34 + ;; + (p13) LDFD f41 = [X], INCX + (p12) fabs f35 = f35 + ;; + (p13) LDFD f42 = [X], STRIDE + (p12) fabs f36 = f36 + (p12) FADD f8 = f8, f32 + ;; + (p13) LDFD f43 = [X], INCX + (p12) fabs f37 = f37 + (p12) FADD f9 = f9, f33 + ;; + (p14) LDFD f44 = [X], STRIDE + (p12) fabs f38 = f38 + (p12) FADD f10 = f10, f34 + ;; + (p14) LDFD f45 = [X], INCX + (p12) fabs f39 = f39 + (p12) FADD f11 = f11, f35 + ;; +#ifndef COMPLEX + (p15) LDFD f46 = [X] +#endif + (p13) fabs f40 = f40 + (p12) FADD f8 = f8, f36 + ;; + (p13) fabs f41 = f41 + (p12) FADD f9 = f9, f37 + (p13) fabs f42 = f42 + (p12) FADD f10 = f10, f38 + (p13) fabs f43 = f43 + (p12) FADD f11 = f11, f39 + ;; + (p14) fabs f44 = f44 + (p13) FADD f8 = f8, f40 + (p14) fabs f45 = f45 + (p13) FADD f9 = f9, f41 +#ifndef COMPLEX + (p15) fabs f46 = f46 +#endif + (p13) FADD f10 = f10, f42 + ;; + (p13) FADD f11 = f11, f43 + (p14) FADD f8 = f8, f44 + (p14) FADD f9 = f9, f45 +#ifndef COMPLEX + (p15) FADD f10 = f10, f46 +#endif + ;; + .align 32 + +.L998: + { .mfi + FADD f8 = f8, f9 + mov ar.lc = ARLC + } + { .mmf + FADD f10 = f10, f11 + } + ;; + { .mii + mov pr = PR, -65474 + } + ;; + { .mfb + FADD f8 = f8, f10 + br.ret.sptk.many b0 + } + EPILOGUE diff --git a/kernel/ia64/cabs.S b/kernel/ia64/cabs.S new file mode 100644 index 0000000000..834b1bd6db --- /dev/null +++ b/kernel/ia64/cabs.S @@ -0,0 +1,58 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
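
For orientation, the unrolled loop that just ended is an absolute-value reduction of the kind *ASUM performs: it keeps four partial sums (f8..f11) in flight and folds them together at .L998. A minimal C sketch of the same reduction, assuming a positive increment (asum_ref is an illustrative name only):

#include <math.h>

/* Sum of absolute values; in the COMPLEX build the COMPADD/STRIDE macros
   make the same loop walk interleaved (re, im) pairs, so it accumulates
   sum(|re_i| + |im_i|).  The assembly splits this into four partial sums
   to hide the FADD latency. */
static double asum_ref(int n, const double *x, int incx)
{
    double s = 0.0;
    for (int i = 0; i < n; i++)
        s += fabs(x[i * incx]);
    return s;
}
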
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + + PROLOGUE + PROFCODE + + .prologue + .body + LDFD f8 = [r32], SIZE + ;; + LDFD f6 = [r32] + ;; + fabs f8 = f8 + fabs f6 = f6 + ;; + FADD f8 = f6, f8 + br.ret.sptk.many b0 + + EPILOGUE + diff --git a/kernel/ia64/caxpy.S b/kernel/ia64/caxpy.S new file mode 100644 index 0000000000..0a28ebe363 --- /dev/null +++ b/kernel/ia64/caxpy.S @@ -0,0 +1,519 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
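
The short cabs kernel just above loads the real and imaginary parts and returns |re| + |im|. The caxpy kernel that follows is a complex AXPY, y(i) += alpha * x(i); the FMA1/FMA2 macros are swapped in the CONJ build, which changes the sign handling of the imaginary products. A hedged C sketch of the default (non-CONJ) semantics, assuming positive increments given in complex elements and interleaved (re, im) storage (caxpy_ref is an illustrative name only):

/* y(i) += alpha * x(i) over interleaved single-precision complex data. */
static void caxpy_ref(int n, float alpha_r, float alpha_i,
                      const float *x, int incx, float *y, int incy)
{
    for (int i = 0; i < n; i++) {
        const float *xp = x + 2 * i * incx;
        float       *yp = y + 2 * i * incy;
        float xr = xp[0], xi = xp[1];

        yp[0] += alpha_r * xr - alpha_i * xi;   /* real part      */
        yp[1] += alpha_r * xi + alpha_i * xr;   /* imaginary part */
    }
}
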
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define PREFETCH_SIZE (32 * 16) + +#ifndef CONJ +#define FMA1 FNMA +#define FMA2 FMA +#else +#define FMA1 FMA +#define FMA2 FNMA +#endif + +#define SP r12 + +#define N r32 +#define X1 r37 +#define INCX r38 +#define Y1 r39 +#define INCY r36 + +#define PREX1 r2 +#define PREY1 r3 + +#define I r33 +#define J r34 +#define Y2 r35 +#define X2 r14 +#define YY1 r15 +#define YY2 r16 +#define YY3 r17 +#define YY4 r18 + +#define INCXM1 r19 +#define INCYM1 r20 +#define INCX3M1 r21 +#define INCY3M1 r22 +#define INCX7M1 r23 +#define INCY7M1 r24 + +#define X3 r8 +#define Y3 r9 +#define X4 r10 +#define Y4 r11 +#define INCX8 r25 +#define INCY8 r26 + +#define ARLC r29 +#define PR r30 + +#define ALPHA_R f8 +#define ALPHA_I f9 + + PROLOGUE + .prologue + PROFCODE + + { .mmi + adds r14 = 16, SP + and J = 7, N + .save ar.lc, ARLC + mov ARLC = ar.lc + } + { .mib + cmp.gt p15, p0 = r0, N + shr I = N, 3 + (p15) br.ret.sptk.many b0 + } + ;; + { .mmi + ld8 INCY = [r14] + nop __LINE__ + mov PR = pr + } + { .mmi + adds PREX1 = (PREFETCH_SIZE + 0) * SIZE, X1 + adds PREY1 = (PREFETCH_SIZE + 0) * SIZE, Y1 + shl INCX = INCX, ZBASE_SHIFT + } + ;; + { .mii + adds I = -1, I + mov pr.rot= 0 + shl INCY = INCY, ZBASE_SHIFT + } + ;; + { .mmi + adds INCXM1 = -SIZE, INCX + adds INCYM1 = -SIZE, INCY + mov ar.ec = 3 + } + { .mmi + shladd X2 = INCX, 1, X1 + shladd Y2 = INCY, 1, Y1 + cmp.eq p16, p0 = r0, r0 + } + ;; + { .mmi + shladd INCX3M1 = INCX, 1, INCXM1 + shladd INCY3M1 = INCY, 1, INCYM1 + shladd INCX8 = INCX, 3, r0 + } + { .mmi + shladd X3 = INCX, 1, X2 + shladd Y3 = INCY, 1, Y2 + shladd INCY8 = INCY, 3, r0 + } + ;; + { .mmi + shladd X4 = INCX, 1, X3 + shladd Y4 = INCY, 1, Y3 + shladd INCX7M1 = INCX, 2, INCX3M1 + } + { .mmi + mov YY1 = Y1 + mov YY2 = Y2 + shladd INCY7M1 = INCY, 2, INCY3M1 + } + ;; + { .mmi + mov YY3 = Y3 + mov YY4 = Y4 + mov ar.lc = I + } + { .mib + cmp.eq p11 ,p0 = -1, I + tbit.z p0, p13 = N, 2 + (p11) br.cond.dpnt .L25 + } + ;; + .align 32 + +.L22: + { .mmf + (p19) STFD [YY3] = f14 + (p19) STFD [YY4] = f15 + (p18) FMA2 f14 = ALPHA_R, f64, f112 + } + { .mmf + (p16) LDFD f80 = [Y1], 1 * SIZE + (p16) LDFD f92 = [Y2], 1 * SIZE + (p18) FMA2 f15 = ALPHA_R, f76, f124 + } + ;; + { .mmf + (p16) lfetch.excl.nt1 [PREY1], INCY8 + (p16) LDFD f104 = [Y3], 1 * SIZE + (p18) FMA1 f6 = ALPHA_I, f40, f6 + } + { .mmf + (p16) LDFD f116 = [Y4], 1 * SIZE + nop __LINE__ + (p18) FMA1 f7 = ALPHA_I, f52, f7 + } + ;; + { .mmf + (p16) LDFD f86 = [Y1], INCYM1 + (p16) LDFD f98 = [Y2], INCYM1 + (p18) FMA1 f10 = ALPHA_I, f64, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p18) FMA1 f11 = ALPHA_I, f76, f11 + } + ;; + { .mmf + (p16) LDFD f110 = [Y3], INCYM1 + (p16) LDFD f122 = [Y4], INCYM1 + (p18) FMA f12 = ALPHA_I, f34, f12 + } + { .mmf + (p19) add YY1 = YY1, INCY7M1 + (p19) add YY2 = YY2, INCY7M1 + (p18) FMA f13 = ALPHA_I, f46, f13 + } + ;; + { .mmf + (p16) LDFD f32 = [X1], 1 * SIZE + (p16) LDFD f44 = [X2], 1 * SIZE + (p18) FMA f14 = ALPHA_I, f58, f14 + } + { .mmf + (p19) add YY3 = YY3, INCY7M1 + (p19) add YY4 = YY4, INCY7M1 + (p18) FMA f15 = ALPHA_I, f70, f15 + } + ;; + { .mmf + (p18) STFD [YY1] = f6, 1 * SIZE + (p18) STFD [YY2] = f7, 1 * SIZE + (p18) FMA f6 = ALPHA_R, f37, f85 + } + { .mmf + (p16) LDFD f56 = [X3], 1 * SIZE + (p16) LDFD f68 = [X4], 1 * SIZE + (p18) FMA f7 = ALPHA_R, f49, f97 + } + ;; + { .mmf + (p18) STFD [YY3] = f10, 1 * SIZE + (p18) STFD [YY4] = f11, 1 * SIZE + (p18) FMA f10 = ALPHA_R, f61, f109 + } + 
{ .mmf + (p16) LDFD f38 = [X1], INCXM1 + (p16) LDFD f50 = [X2], INCXM1 + (p18) FMA f11 = ALPHA_R, f73, f121 + } + ;; + { .mmf + (p18) STFD [YY1] = f12 + (p18) STFD [YY2] = f13 + (p18) FMA2 f12 = ALPHA_R, f43, f91 + } + { .mmf + (p16) LDFD f62 = [X3], INCXM1 + (p16) LDFD f74 = [X4], INCXM1 + (p18) FMA2 f13 = ALPHA_R, f55, f103 + } + ;; + { .mmf + (p18) STFD [YY3] = f14 + (p18) STFD [YY4] = f15 + (p18) FMA2 f14 = ALPHA_R, f67, f115 + } + { .mmf + (p16) LDFD f83 = [Y1], 1 * SIZE + (p16) LDFD f95 = [Y2], 1 * SIZE + (p18) FMA2 f15 = ALPHA_R, f79, f127 + } + ;; + { .mmf + (p16) LDFD f107 = [Y3], 1 * SIZE + (p16) LDFD f119 = [Y4], 1 * SIZE + (p18) FMA1 f6 = ALPHA_I, f43, f6 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p18) FMA1 f7 = ALPHA_I, f55, f7 + } + ;; + { .mmf + (p16) LDFD f89 = [Y1], INCY7M1 + (p16) LDFD f101 = [Y2], INCY7M1 + (p18) FMA1 f10 = ALPHA_I, f67, f10 + } + { .mmf + (p18) add YY1 = YY1, INCYM1 + (p18) add YY2 = YY2, INCYM1 + (p18) FMA1 f11 = ALPHA_I, f79, f11 + } + ;; + { .mmf + (p16) LDFD f113 = [Y3], INCY7M1 + (p16) LDFD f125 = [Y4], INCY7M1 + (p18) FMA f12 = ALPHA_I, f37, f12 + } + { .mmf + (p18) add YY3 = YY3, INCYM1 + (p18) add YY4 = YY4, INCYM1 + (p18) FMA f13 = ALPHA_I, f49, f13 + } + ;; + { .mmf + (p16) LDFD f35 = [X1], 1 * SIZE + (p16) LDFD f47 = [X2], 1 * SIZE + (p18) FMA f14 = ALPHA_I, f61, f14 + } + { .mmf + (p16) LDFD f59 = [X3], 1 * SIZE + (p16) LDFD f71 = [X4], 1 * SIZE + (p18) FMA f15 = ALPHA_I, f73, f15 + } + ;; + { .mmf + (p18) STFD [YY1] = f6, 1 * SIZE + (p18) STFD [YY2] = f7, 1 * SIZE + (p17) FMA f6 = ALPHA_R, f33, f81 + } + { .mmf + (p16) LDFD f41 = [X1], INCX7M1 + (p16) LDFD f53 = [X2], INCX7M1 + (p17) FMA f7 = ALPHA_R, f45, f93 + } + ;; + { .mmf + (p18) STFD [YY3] = f10, 1 * SIZE + (p18) STFD [YY4] = f11, 1 * SIZE + (p17) FMA f10 = ALPHA_R, f57, f105 + } + { .mmf + (p16) LDFD f65 = [X3], INCX7M1 + (p16) LDFD f77 = [X4], INCX7M1 + (p17) FMA f11 = ALPHA_R, f69, f117 + } + ;; + { .mmf + (p18) STFD [YY1] = f12 + (p18) STFD [YY2] = f13 + (p17) FMA2 f12 = ALPHA_R, f39, f87 + } + { .mfb + (p16) lfetch.nt1 [PREX1], INCX8 + (p17) FMA2 f13 = ALPHA_R, f51, f99 + br.ctop.sptk.few .L22 + } + ;; + (p19) add YY1 = YY1, INCY7M1 + (p19) add YY2 = YY2, INCY7M1 + ;; + { .mmf + (p19) STFD [YY3] = f14 + (p19) STFD [YY4] = f15 + } + { .mmf + (p19) add YY3 = YY3, INCY7M1 + (p19) add YY4 = YY4, INCY7M1 + } + ;; + .align 32 + +.L25: + { .mmi + (p13) LDFD f32 = [X1], 1 * SIZE + (p13) LDFD f36 = [X2], 1 * SIZE + mov ar.lc = ARLC + } + ;; + { .mmi + (p13) LDFD f80 = [Y1], 1 * SIZE + (p13) LDFD f84 = [Y2], 1 * SIZE + mov pr = PR, -65474 + } + ;; + { .mmi + (p13) LDFD f33 = [X1], INCXM1 + (p13) LDFD f37 = [X2], INCXM1 + cmp.eq p12, p0 = r0, J + } + ;; + { .mmb + (p13) LDFD f81 = [Y1], INCYM1 + (p13) LDFD f85 = [Y2], INCYM1 + (p12) br.ret.sptk.many b0 + } + ;; + { .mmi + (p13) LDFD f34 = [X1], 1 * SIZE + (p13) LDFD f38 = [X2], 1 * SIZE + tbit.z p0, p14 = N, 1 + } + ;; + { .mmi + (p13) LDFD f82 = [Y1], 1 * SIZE + (p13) LDFD f86 = [Y2], 1 * SIZE + tbit.z p0, p15 = N, 0 + } + ;; + { .mmf + (p13) LDFD f35 = [X1], INCX3M1 + (p13) LDFD f39 = [X2], INCX3M1 + (p13) FMA f80 = ALPHA_R, f32, f80 + } + ;; + { .mmf + (p13) LDFD f83 = [Y1], INCY3M1 + (p13) LDFD f87 = [Y2], INCY3M1 + (p13) FMA f84 = ALPHA_R, f36, f84 + } + ;; + { .mmf + (p14) LDFD f40 = [X1], 1 * SIZE + (p14) LDFD f88 = [Y1], 1 * SIZE + (p13) FMA2 f81 = ALPHA_R, f33, f81 + } + ;; + { .mmf + (p14) LDFD f41 = [X1], INCXM1 + (p14) LDFD f89 = [Y1], INCYM1 + (p13) FMA2 f85 = ALPHA_R, f37, f85 + } + ;; + { .mmf + (p14) LDFD f42 = [X1], 1 * 
SIZE + (p14) LDFD f90 = [Y1], 1 * SIZE + (p13) FMA f82 = ALPHA_R, f34, f82 + } + ;; + { .mmf + (p14) LDFD f43 = [X1], INCXM1 + (p14) LDFD f91 = [Y1], INCYM1 + (p13) FMA f86 = ALPHA_R, f38, f86 + } + ;; + { .mmf + (p15) LDFD f44 = [X1], 1 * SIZE + (p15) LDFD f92 = [Y1], 1 * SIZE + (p13) FMA2 f83 = ALPHA_R, f35, f83 + } + ;; + { .mmf + (p15) LDFD f45 = [X1] + (p15) LDFD f93 = [Y1] + (p13) FMA2 f87 = ALPHA_R, f39, f87 + } + ;; + (p13) FMA1 f80 = ALPHA_I, f33, f80 + (p13) FMA1 f84 = ALPHA_I, f37, f84 + (p13) FMA f81 = ALPHA_I, f32, f81 + (p13) FMA f85 = ALPHA_I, f36, f85 + (p13) FMA1 f82 = ALPHA_I, f35, f82 + (p13) FMA1 f86 = ALPHA_I, f39, f86 + (p13) FMA f83 = ALPHA_I, f34, f83 + (p13) FMA f87 = ALPHA_I, f38, f87 + ;; + { .mmf + (p13) STFD [YY1] = f80, 1 * SIZE + (p13) STFD [YY2] = f84, 1 * SIZE + (p14) FMA f88 = ALPHA_R, f40, f88 + } + ;; + { .mmf + (p13) STFD [YY1] = f81 + (p13) STFD [YY2] = f85 + (p14) FMA2 f89 = ALPHA_R, f41, f89 + } + { .mmf + (p13) add YY1 = YY1, INCYM1 + (p13) add YY2 = YY2, INCYM1 + (p14) FMA f90 = ALPHA_R, f42, f90 + } + ;; + { .mmf + (p13) STFD [YY1] = f82, 1 * SIZE + (p13) STFD [YY2] = f86, 1 * SIZE + (p14) FMA2 f91 = ALPHA_R, f43, f91 + } + ;; + { .mmf + (p13) STFD [YY1] = f83 + (p13) STFD [YY2] = f87 + (p15) FMA f92 = ALPHA_R, f44, f92 + } + { .mmf + (p13) add YY1 = YY1, INCY3M1 + nop __LINE__ + (p15) FMA2 f93 = ALPHA_R, f45, f93 + } + ;; + (p14) FMA1 f88 = ALPHA_I, f41, f88 + (p14) FMA f89 = ALPHA_I, f40, f89 + (p14) FMA1 f90 = ALPHA_I, f43, f90 + (p14) FMA f91 = ALPHA_I, f42, f91 + ;; + { .mmf + (p14) STFD [YY1] = f88, 1 * SIZE + (p15) FMA1 f92 = ALPHA_I, f45, f92 + } + ;; + { .mmf + (p14) STFD [YY1] = f89 + (p14) add YY1 = YY1, INCYM1 + (p15) FMA f93 = ALPHA_I, f44, f93 + } + ;; + (p14) STFD [YY1] = f90, 1 * SIZE + ;; + (p14) STFD [YY1] = f91 + (p14) add YY1 = YY1, INCYM1 + ;; + (p15) STFD [YY1] = f92, 1 * SIZE + ;; + { .mmb + (p15) STFD [YY1] = f93 + nop __LINE__ + br.ret.sptk.many b0 + } + ;; + EPILOGUE diff --git a/kernel/ia64/copy.S b/kernel/ia64/copy.S new file mode 100644 index 0000000000..b5d7f482ba --- /dev/null +++ b/kernel/ia64/copy.S @@ -0,0 +1,873 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
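
The copy kernel introduced here moves x into y; it takes a fast path with paired LDFPD loads when the increment is one element and falls back to the fully strided loop at .L100 otherwise. The operation itself reduces to the following sketch (copy_ref is an illustrative name; increments assumed positive):

/* y(i) = x(i) with arbitrary positive strides. */
static void copy_ref(int n, const double *x, int incx, double *y, int incy)
{
    for (int i = 0; i < n; i++)
        y[i * incy] = x[i * incx];
}
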
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r32 +#define X1 r33 +#define INCX r34 +#define Y1 r35 +#define INCY r36 + +#define PREA r2 +#define PREB r3 + +#define I r14 +#define J r15 + +#define X2 r16 +#define Y2 r17 +#define INCX3 r18 +#define INCY3 r19 +#define INCX5 r20 +#define INCY5 r21 +#define INCX16 r22 +#define INCY16 r23 +#define XX r24 +#define YY r25 +#define XA r26 +#define YA r27 +#define PR r30 +#define ARLC r31 + +#ifdef DOUBLE +#define PREFETCH_SIZE (4 * 32) +#else +#define PREFETCH_SIZE (4 * 64) +#endif + + PROLOGUE + .prologue + PROFCODE + { .mmi + shladd INCX = INCX, BASE_SHIFT, r0 + shladd INCY = INCY, BASE_SHIFT, r0 + .save ar.lc, ARLC + mov ARLC = ar.lc + } + { .mib + cmp.lt p0, p6 = r0, N + tbit.z p0, p7 = X1, BASE_SHIFT + (p6) br.ret.sptk.many b0 + } + ;; + .body + { .mmi + sub XA = Y1, X1 + (p7) LDFD f32 = [X1], INCX + mov PR = pr + } + { .mmi + mov YY = Y1 + (p7) adds N = -1, N + (p7) add Y1 = Y1, INCY + } + ;; + { .mmi + shladd INCX5 = INCX, 2, INCX + shladd INCY5 = INCY, 2, INCY + mov pr.rot = 0 + } + { .mmi + mov XX = X1 + nop.m 0 + shr.u XA = XA, BASE_SHIFT + } + ;; + { .mmi + and J = 15, N + cmp.eq p16, p0 = r0, r0 + shr I = N, 4 + } + { .mmb + cmp.ne p6, p0 = SIZE, INCX +#ifdef DOUBLE + adds XA = 2, XA +#else + nop.m 0 +#endif + (p6) br.cond.dpnt .L100 + } + ;; +/* INCX == 1 */ + { .mmi + shladd INCX16 = INCX, 4, r0 + shladd INCY16 = INCY, 4, r0 + tbit.z p0, p12 = N, 3 + } + { .mmi +#ifdef DOUBLE + and XA = 31, XA +#else + and XA = 63, XA +#endif + adds I = -1, I + tbit.z p0, p13 = N, 2 + } + ;; + { .mmi + shladd X2 = INCX, 2, X1 + shladd Y2 = INCY, 2, Y1 + mov ar.lc = I + } + { .mib +#ifdef DOUBLE + cmp.gt p8, p0 = 15, XA +#else + cmp.gt p8, p0 = 30, XA +#endif + cmp.eq p9, p0 = r0, J + (p8)br.cond.dpnt .L30 + } + ;; + { .mmi + (p7) STFD [YY] = f32 + cmp.gt p8 ,p0 = r0, I + mov ar.ec = 5 + } + { .mmb + adds PREA = PREFETCH_SIZE * SIZE + 32, X1 +#ifdef DOUBLE + adds PREB = PREFETCH_SIZE * SIZE + 32, Y1 +#else + adds PREB = PREFETCH_SIZE * SIZE - 40, Y1 +#endif + (p8) br.cond.dpnt .L25 + } + ;; + .align 32 + +.L22: + { .mmi + (p20) STFD [Y1] = f36 + (p20) STFD [Y2] = f56 + (p20) add Y1 = INCY, Y1 + } + { .mmi + (p16) lfetch.nt1 [PREA], INCX16 + (p16) LDFPD f32, f37 = [X1], 2 * SIZE + (p20) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p20) STFD [Y1] = f41 + (p20) STFD [Y2] = f61 + (p20) add Y1 = INCY, Y1 + } + { .mmi + (p16) lfetch.excl.nt1 [PREB], INCY16 + (p16) LDFPD f42, f47 = [X1], 2 * SIZE + (p20) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p20) STFD [Y1] = f46 + (p20) STFD [Y2] = f66 + (p20) add Y1 = INCY, Y1 + } + { .mmi + (p16) LDFPD f52, 
f57 = [X1], 2 * SIZE + nop.m 0 + (p20) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p20) STFD [Y1] = f51 + (p20) STFD [Y2] = f71 + (p20) add Y1 = INCY5, Y1 + } + { .mmi + (p16) LDFPD f62, f67 = [X1], 2 * SIZE + nop.m 0 + (p20) add Y2 = INCY5, Y2 + } + ;; + { .mmi + (p20) STFD [Y1] = f76 + (p20) STFD [Y2] = f96 + (p16) adds XX = 8 * SIZE, X1 + } + { .mmi + (p16) LDFPD f72, f77 = [X1], 2 * SIZE + (p20) add Y1 = INCY, Y1 + (p20) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p20) STFD [Y1] = f81 + (p20) STFD [Y2] = f101 + (p20) add Y1 = INCY, Y1 + } + { .mmi + (p16) LDFPD f82, f87 = [X1], 2 * SIZE + nop.m 0 + (p20) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p20) STFD [Y1] = f86 + (p20) STFD [Y2] = f106 + (p16) shladd X2 = INCX, 2, XX + } + { .mmi + (p16) LDFPD f92, f97 = [X1], 2 * SIZE + (p20) add Y1 = INCY, Y1 + (p20) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p20) STFD [Y1] = f91 + (p20) STFD [Y2] = f111 + (p20) add Y1 = INCY5, Y1 + } + { .mmb + (p16) LDFPD f102, f107 = [X1], 2 * SIZE + (p20) add Y2 = INCY5, Y2 + br.ctop.sptk.few .L22 + } + ;; + .align 32 +.L25: + { .mmi + (p12) LDFPD f48, f49 = [X1], 2 * SIZE + (p12) LDFPD f52, f53 = [X2], 2 * SIZE + mov ar.lc = ARLC + } + { .mmi + (p12) adds XX = 8 * SIZE, XX + nop.m 0 + tbit.z p0, p14 = N, 1 + } + ;; + { .mmi + (p12) LDFPD f50, f51 = [X1] + (p12) LDFPD f54, f55 = [X2] + mov pr = PR, -65474 + } + { .mmb + (p12) adds X1 = 6 * SIZE, X1 + (p13) adds XX = 4 * SIZE, XX + (p9) br.ret.sptk.many b0 + } + ;; + { .mmi + (p13) LDFPD f56, f57 = [X1], 2 * SIZE + (p14) LDFPD f60, f61 = [XX], 2 * SIZE + tbit.z p0, p15 = N, 0 + } + ;; + { .mmi + (p13) LDFPD f58, f59 = [X1], 2 * SIZE + (p15) LDFD f62 = [XX] + nop.i 0 + } + ;; + { .mmi + (p12) STFD [Y1] = f48 + (p12) STFD [Y2] = f52 + mov YY = Y1 + } + { .mmi + (p12) add Y1 = INCY, Y1 + (p12) add Y2 = INCY, Y2 + nop.i 0 + } + ;; + { .mmi + (p12) STFD [Y1] = f49 + (p12) STFD [Y2] = f53 + (p12) add Y1 = INCY, Y1 + } + { .mmi + (p12) add Y2 = INCY, Y2 + (p12) shladd YY = INCY, 3, YY + nop.i 0 + } + ;; + { .mmi + (p12) STFD [Y1] = f50 + (p12) STFD [Y2] = f54 + (p12) add Y1 = INCY, Y1 + } + { .mmi + (p12) add Y2 = INCY, Y2 + (p13) shladd YY = INCY, 2, YY + nop.i 0 + } + ;; + { .mmi + (p12) STFD [Y1] = f51 + (p12) STFD [Y2] = f55 + (p12) add Y1 = INCY5, Y1 + } + { .mmi + (p12) add Y2 = INCY5, Y2 + nop.m 0 + nop.i 0 + } + ;; + { .mmi + (p13) STFD [Y1] = f56 + (p14) STFD [YY] = f60 + (p13) add Y1 = INCY, Y1 + } + { .mmi + (p14) add YY = INCY, YY + nop.m 0 + nop.i 0 + } + ;; + { .mmi + (p13) STFD [Y1] = f57 + (p14) STFD [YY] = f61 + (p13) add Y1 = INCY, Y1 + } + { .mmi + (p14) add YY = INCY, YY + nop.m 0 + nop.i 0 + } + ;; + { .mmi + (p13) STFD [Y1] = f58 + (p15) STFD [YY] = f62 + (p13) add Y1 = INCY, Y1 + } + ;; + { .mmb + (p13) STFD [Y1] = f59 + nop.m 0 + br.ret.sptk.many b0 + } + .align 32 + ;; +.L30: + { .mmi + (p7) STFD [YY] = f32 + cmp.gt p8 ,p0 = r0, I + mov ar.ec = 4 + } + { .mmb + adds PREA = PREFETCH_SIZE * SIZE + 24, X1 +#ifdef DOUBLE + adds PREB = PREFETCH_SIZE * SIZE + 64, Y1 +#else + adds PREB = PREFETCH_SIZE * SIZE + 72, Y1 +#endif + (p8) br.cond.dpnt .L35 + } + ;; + .align 32 +.L32: + { .mmi + (p19) STFD [Y1] = f35 + (p19) STFD [Y2] = f55 + (p19) add Y1 = INCY, Y1 + } + { .mmi + (p16) lfetch.nt1 [PREA], INCX16 + (p16) LDFPD f32, f37 = [X1], 2 * SIZE + (p19) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f40 + (p19) STFD [Y2] = f60 + (p19) add Y1 = INCY, Y1 + } + { .mmi + (p16) lfetch.excl.nt1 [PREB], INCY16 + (p16) LDFPD f42, f47 = [X1], 2 * SIZE + (p19) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = 
f45 + (p19) STFD [Y2] = f65 + (p19) add Y1 = INCY, Y1 + } + { .mmi + (p16) LDFPD f52, f57 = [X1], 2 * SIZE + nop.m 0 + (p19) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f50 + (p19) STFD [Y2] = f70 + (p19) add Y1 = INCY5, Y1 + } + { .mmi + (p16) LDFPD f62, f67 = [X1], 2 * SIZE + nop.m 0 + (p19) add Y2 = INCY5, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f75 + (p19) STFD [Y2] = f95 + (p16) adds XX = 8 * SIZE, X1 + } + { .mmi + (p16) LDFPD f72, f77 = [X1], 2 * SIZE + (p19) add Y1 = INCY, Y1 + (p19) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f80 + (p19) STFD [Y2] = f100 + (p19) add Y1 = INCY, Y1 + } + { .mmi + (p16) LDFPD f82, f87 = [X1], 2 * SIZE + nop.m 0 + (p19) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f85 + (p19) STFD [Y2] = f105 + (p16) shladd X2 = INCX, 2, XX + } + { .mmi + (p16) LDFPD f92, f97 = [X1], 2 * SIZE + (p19) add Y1 = INCY, Y1 + (p19) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f90 + (p19) STFD [Y2] = f110 + (p19) add Y1 = INCY5, Y1 + } + { .mmb + (p16) LDFPD f102, f107 = [X1], 2 * SIZE + (p19) add Y2 = INCY5, Y2 + br.ctop.sptk.few .L32 + } + ;; + .align 32 +.L35: + { .mmi + (p12) LDFPD f48, f49 = [X1], 2 * SIZE + (p12) LDFPD f52, f53 = [X2], 2 * SIZE + mov ar.lc = ARLC + } + { .mmi + (p12) adds XX = 8 * SIZE, XX + nop.m 0 + tbit.z p0, p14 = N, 1 + } + ;; + { .mmi + (p12) LDFPD f50, f51 = [X1] + (p12) LDFPD f54, f55 = [X2] + mov pr = PR, -65474 + } + { .mmi + (p12) adds X1 = 6 * SIZE, X1 + (p12) adds X2 = 6 * SIZE, X2 + (p13) adds XX = 4 * SIZE, XX + } + ;; + { .mmi + (p13) LDFPD f56, f57 = [X1], 2 * SIZE + (p14) LDFPD f60, f61 = [XX], 2 * SIZE + tbit.z p0, p15 = N, 0 + } + ;; + { .mmb + (p13) LDFPD f58, f59 = [X1], 2 * SIZE + (p15) LDFD f62 = [XX] + (p9) br.ret.sptk.many b0 + } + ;; + { .mmi + (p12) STFD [Y1] = f48 + (p12) STFD [Y2] = f52 + mov YY = Y1 + } + { .mmi + (p12) add Y1 = INCY, Y1 + (p12) add Y2 = INCY, Y2 + nop.i 0 + } + ;; + { .mmi + (p12) STFD [Y1] = f49 + (p12) STFD [Y2] = f53 + (p12) add Y1 = INCY, Y1 + } + { .mmi + (p12) add Y2 = INCY, Y2 + (p12) shladd YY = INCY, 3, YY + nop.i 0 + } + ;; + { .mmi + (p12) STFD [Y1] = f50 + (p12) STFD [Y2] = f54 + (p12) add Y1 = INCY, Y1 + } + { .mmi + (p12) add Y2 = INCY, Y2 + (p13) shladd YY = INCY, 2, YY + nop.i 0 + } + ;; + { .mmi + (p12) STFD [Y1] = f51 + (p12) STFD [Y2] = f55 + nop.i 0 + } + { .mmi + (p12) add Y1 = INCY5, Y1 + (p12) add Y2 = INCY5, Y2 + nop.i 0 + } + ;; + { .mmi + (p13) STFD [Y1] = f56 + (p14) STFD [YY] = f60 + nop.i 0 + } + { .mmi + (p13) add Y1 = INCY, Y1 + (p14) add YY = INCY, YY + nop.i 0 + } + ;; + { .mmi + (p13) STFD [Y1] = f57 + (p14) STFD [YY] = f61 + nop.i 0 + } + { .mmi + (p13) add Y1 = INCY, Y1 + (p14) add YY = INCY, YY + nop.i 0 + } + ;; + { .mmi + (p13) STFD [Y1] = f58 + (p15) STFD [YY] = f62 + (p13) add Y1 = INCY, Y1 + } + ;; + { .mib + (p13) STFD [Y1] = f59 + nop.i 0 + br.ret.sptk.many b0 + } + .align 32 + ;; + + /* INCX != 1 */ +.L100: + { .mmi + shladd INCX16 = INCX, 4, r0 + shladd INCY16 = INCY, 4, r0 + tbit.z p0, p12 = N, 3 + } + { .mmi + nop.m 0 + nop.m 0 + nop.i 0 + } + ;; + { .mmi + adds PREA = PREFETCH_SIZE * SIZE, X1 + adds PREB = PREFETCH_SIZE * SIZE, Y1 + mov ar.ec = 6 + } + { .mmi + cmp.eq p8 ,p0 = r0, I + cmp.eq p9, p0 = r0, J + adds I = -1, I + } + ;; + { .mmi + (p7) STFD [YY] = f32 + shladd X2 = INCX, 2, X1 + mov ar.lc = I + } + { .mib + shladd Y2 = INCY, 2, Y1 + cmp.eq p16, p0 = r0, r0 + (p8) br.cond.dpnt .L120 + } + ;; + .align 32 + +.L110: + { .mmi + (p21) STFD [Y1] = f37 + (p21) STFD [Y2] = f61 + (p21) add Y1 = INCY, Y1 + } 
+ { .mmi + (p16) lfetch.nt1 [PREA], INCX16 + (p16) lfetch.excl.nt1 [PREB], INCY16 + (p21) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p21) STFD [Y1] = f43 + (p21) STFD [Y2] = f67 + (p21) add Y1 = INCY, Y1 + } + { .mmi + (p16) LDFD f56 = [X2], INCX + (p16) LDFD f32 = [X1], INCX + (p21) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p21) STFD [Y1] = f49 + (p21) STFD [Y2] = f73 + (p21) add Y1 = INCY, Y1 + } + { .mmi + (p16) LDFD f38 = [X1], INCX + (p16) LDFD f62 = [X2], INCX + (p21) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p21) STFD [Y1] = f55 + (p21) STFD [Y2] = f79 + (p21) add Y1 = INCY5, Y1 + } + { .mmi + (p16) LDFD f44 = [X1], INCX + (p16) LDFD f68 = [X2], INCX + (p21) add Y2 = INCY5, Y2 + } + ;; + { .mmi + (p21) STFD [Y1] = f85 + (p21) STFD [Y2] = f109 + (p21) add Y1 = INCY, Y1 + } + { .mmi + (p16) LDFD f50 = [X1], INCX5 + (p16) LDFD f74 = [X2], INCX5 + (p21) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p21) STFD [Y1] = f91 + (p21) STFD [Y2] = f115 + (p21) add Y1 = INCY, Y1 + } + { .mmi + (p16) LDFD f80 = [X1], INCX + (p16) LDFD f104 = [X2], INCX + (p21) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p21) STFD [Y1] = f97 + (p21) STFD [Y2] = f121 + (p21) add Y1 = INCY, Y1 + } + { .mmi + (p16) LDFD f86 = [X1], INCX + (p16) LDFD f110 = [X2], INCX + (p21) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p21) STFD [Y1] = f103 + (p21) STFD [Y2] = f127 + (p21) add Y1 = INCY5, Y1 + } + { .mmi + (p16) LDFD f92 = [X1], INCX + (p16) LDFD f116 = [X2], INCX + (p21) add Y2 = INCY5, Y2 + } + ;; + { .mmi + nop.m 0 + (p16) add XX = INCX5, X1 + nop.i 0 + } + { .mmb + (p16) LDFD f98 = [X1], INCX5 + (p16) LDFD f122 = [X2], INCX5 + br.ctop.sptk.few .L110 + } + ;; + .align 32 + +.L120: + { .mmi + (p12) LDFD f48 = [X1], INCX + (p12) LDFD f52 = [X2], INCX + mov ar.lc = ARLC + } + ;; + { .mmi + (p12) LDFD f49 = [X1], INCX + (p12) LDFD f53 = [X2], INCX + mov pr = PR, -65474 + } + ;; + { .mmi + (p12) LDFD f50 = [X1], INCX + (p12) LDFD f54 = [X2], INCX + tbit.z p0, p13 = N, 2 + } + { .mmb + nop.m 0 + nop.m 0 + (p9) br.ret.sptk.many b0 + } + ;; + { .mmi + (p12) LDFD f51 = [X1], INCX5 + (p12) LDFD f55 = [X2], INCX5 + (p12) shladd XX = INCX, 3, XX + } + ;; + { .mmi + (p13) LDFD f56 = [X1], INCX + (p13) shladd XX = INCX, 2, XX + tbit.z p0, p14 = N, 1 + } + ;; + { .mmi + (p13) LDFD f57 = [X1], INCX + (p14) LDFD f60 = [XX], INCX + } + ;; + { .mmi + (p13) LDFD f58 = [X1], INCX + (p14) LDFD f61 = [XX], INCX + tbit.z p0, p15 = N, 0 + } + ;; + { .mmi + (p13) LDFD f59 = [X1], INCX + (p15) LDFD f62 = [XX] + mov YY = Y1 + } + ;; + { .mmi + (p12) STFD [Y1] = f48 + (p12) STFD [Y2] = f52 + nop.i 0 + } + { .mmi + (p12) add Y1 = INCY, Y1 + (p12) add Y2 = INCY, Y2 + nop.i 0 + } + ;; + { .mmi + (p12) STFD [Y1] = f49 + (p12) STFD [Y2] = f53 + nop.i 0 + } + { .mmi + (p12) add Y1 = INCY, Y1 + (p12) add Y2 = INCY, Y2 + nop.i 0 + } + ;; + { .mmi + (p12) STFD [Y1] = f50 + (p12) STFD [Y2] = f54 + nop.i 0 + } + { .mmi + (p12) add Y1 = INCY, Y1 + (p12) add Y2 = INCY, Y2 + nop.i 0 + } + ;; + { .mmi + (p12) STFD [Y1] = f51 + (p12) STFD [Y2] = f55 + (p12) add Y1 = INCY5, Y1 + } + { .mmi + (p12) add Y2 = INCY5, Y2 + (p12) shladd YY = INCY, 3, YY + nop.i 0 + } + ;; + { .mmi + (p13) STFD [Y1] = f56 + (p13) add Y1 = INCY, Y1 + (p13) shladd YY =INCY, 2, YY + } + ;; + { .mmi + (p13) STFD [Y1] = f57 + (p14) STFD [YY] = f60 + nop.i 0 + } + { .mmi + (p13) add Y1 = INCY, Y1 + (p14) add YY = INCY, YY + nop.i 0 + } + ;; + { .mmi + (p13) STFD [Y1] = f58 + (p14) STFD [YY] = f61 + nop.i 0 + } + { .mmi + (p13) add Y1 = INCY, Y1 + (p14) add YY = INCY, YY + nop.i 0 + } + ;; + { .mmb + (p13) STFD [Y1] = 
f59 + (p15) STFD [YY] = f62 + br.ret.sptk.many b0 + } + ;; + EPILOGUE + diff --git a/kernel/ia64/daxpy.S b/kernel/ia64/daxpy.S new file mode 100644 index 0000000000..b971df6f0d --- /dev/null +++ b/kernel/ia64/daxpy.S @@ -0,0 +1,1504 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
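
The daxpy kernel that follows computes the double-precision AXPY recurrence, y(i) += alpha * x(i); its unit-stride paths unroll by 16 and software-pipeline the loads and stores, while .L100 handles general increments with element-by-element loads. A minimal C sketch of the recurrence itself (daxpy_ref is an illustrative name; increments assumed positive):

/* y(i) += alpha * x(i), the recurrence the pipelined kernel implements. */
static void daxpy_ref(int n, double alpha,
                      const double *x, int incx, double *y, int incy)
{
    for (int i = 0; i < n; i++)
        y[i * incy] += alpha * x[i * incx];
}
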
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define PREFETCHSIZE (16 * 16) + +#define N r32 +#define X1 r36 +#define INCX r37 +#define Y1 r38 +#define INCY r39 + +#define PREX r2 +#define PREY r3 + +#define I r14 +#define J r15 +#define X2 r16 +#define Y2 r17 +#define YY1 r18 +#define YY2 r19 +#define INCX16 r20 +#define INCY16 r21 +#define X3 r26 +#define YY r27 +#define PR r30 +#define ARLC r31 + +#define ALPHA f8 + + PROLOGUE + PROFCODE + .prologue + { .mmi + shladd INCX = INCX, BASE_SHIFT, r0 + shladd INCY = INCY, BASE_SHIFT, r0 + .save ar.lc, ARLC + mov ARLC = ar.lc + } + { .mib + cmp.lt p0, p6 = r0, N + tbit.nz p10, p0 = Y1, BASE_SHIFT + (p6) br.ret.dpnt.many b0 + } + ;; + .body + { .mmi + (p10) LDFD f32 = [X1], INCX + (p10) LDFD f33 = [Y1] + mov PR = pr + } + { .mmi + (p10) adds N = -1, N + mov YY = Y1 + (p10) add Y1 = Y1, INCY + } + ;; + { .mmi + mov YY1 = Y1 + shladd YY2 = INCY, 1, Y1 + mov pr.rot= 0 + } + { .mmi + sub r8 = X1, Y1 + mov r9 = 0xf0 + nop __LINE__ + } + ;; + { .mmi + cmp.ne p6, p0 = SIZE, INCX + cmp.ne p7, p0 = SIZE, INCY + tbit.nz p8, p0 = X1, BASE_SHIFT + } + { .mbb + and J = 15, N + (p6) br.cond.dpnt .L100 + (p7) br.cond.dpnt .L100 + } + ;; + { .mfi + cmp.eq p16, p0 = r0, r0 + (p10) FMA f9 = ALPHA, f32, f33 + shr I = N, 4 + } + { .mmb + add X3 = X1, INCX + and r8 = r9, r8 + (p8) br.cond.dpnt.many .L30 + } + ;; + { .mmi + cmp.eq p11, p0 = r0, J + adds I = -1, I + mov ar.ec = 3 + } + { .mib + cmp.lt p9, p0 = 127, r8 + tbit.nz p12, p0 = N, 3 + (p9) br.cond.dpnt.many .L20 + } + ;; + { .mmi + (p10) STFD [YY] = f9 + cmp.eq p7 ,p0 = -1, I + mov ar.lc = I + } + { .mib + adds PREX = (PREFETCHSIZE + 2) * SIZE, X1 + adds PREY = (PREFETCHSIZE + 2) * SIZE, Y1 + (p7) br.cond.dpnt .L15 + } + ;; + .align 32 + +.L12: + { .mmf + (p18) STFD [YY1] = f6, 1 * SIZE + (p18) STFD [YY2] = f7, 1 * SIZE + (p18) FMA f6 = ALPHA, f58, f106 + } + { .mmf + (p16) lfetch.fault.nt1 [PREX], 16 * SIZE + (p16) LDFPD f32, f35 = [X1], 2 * SIZE + (p18) FMA f7 = ALPHA, f64, f112 + } + ;; + { .mmf + (p18) STFD [YY1] = f10, 3 * SIZE + (p18) STFD [YY2] = f11, 3 * SIZE + (p18) FMA f10 = ALPHA, f61, f109 + } + { .mmf + (p16) LDFPD f38, f41 = [X1], 2 * SIZE + (p16) LDFPD f80, f83 = [Y1], 2 * SIZE + (p18) FMA f11 = ALPHA, f67, f115 + } + ;; + { .mmf + (p18) STFD [YY1] = f12, 1 * SIZE + (p18) STFD [YY2] = f13, 1 * SIZE + (p18) FMA f12 = ALPHA, f70, f118 + } + { .mmf + (p16) LDFPD f44, f47 = [X1], 2 * SIZE + (p16) LDFPD f86, f89 = [Y1], 2 * SIZE + (p18) FMA f13 = ALPHA, f76, f124 + } + ;; + { .mmf + (p18) STFD [YY1] = f14, 3 * SIZE + (p18) STFD [YY2] = f15, 3 * SIZE + (p18) FMA f14 = ALPHA, f73, f121 + } + { .mmf + (p16) LDFPD f50, f53 = [X1], 2 * SIZE + (p16) LDFPD f92, f95 = [Y1], 2 * SIZE + (p18) FMA f15 = ALPHA, f79, f127 + } + ;; + { .mmf + (p18) STFD [YY1] = f6, 1 * SIZE + (p18) STFD [YY2] = f7, 1 * SIZE + (p17) FMA f6 = ALPHA, f33, f81 + } + { .mmf + (p16) LDFPD f56, f59 = [X1], 2 * SIZE + (p16) LDFPD f98, f101 = [Y1], 2 * SIZE + (p17) FMA f7 = ALPHA, f39, f87 + } + ;; + { .mmf + (p18) STFD [YY1] = f10, 3 * SIZE + (p18) STFD [YY2] = f11, 3 * SIZE + (p17) FMA f10 = ALPHA, f36, f84 + } + { .mmf + (p16) LDFPD f62, f65 = [X1], 2 * SIZE + (p16) LDFPD f104, f107 = [Y1], 2 * SIZE + (p17) FMA f11 = ALPHA, f42, f90 + } + ;; + { .mmf + (p18) STFD [YY1] = f12, 1 * SIZE + (p18) STFD [YY2] = f13, 1 * SIZE + (p17) FMA f12 = ALPHA, f45, f93 + } + { .mmf + (p16) LDFPD f68, f71 = [X1], 2 * SIZE + (p16) LDFPD f110, f113 = [Y1], 2 * SIZE + (p17) FMA 
f13 = ALPHA, f51, f99 + } + ;; + { .mmf + (p18) STFD [YY1] = f14, 3 * SIZE + (p18) STFD [YY2] = f15, 3 * SIZE + (p17) FMA f14 = ALPHA, f48, f96 + } + { .mmf + (p16) LDFPD f74, f77 = [X1], 2 * SIZE + (p16) LDFPD f116, f119 = [Y1], 2 * SIZE + (p17) FMA f15 = ALPHA, f54, f102 + } + ;; + { .mmi + (p16) lfetch.fault.excl.nt1 [PREY], 16 * SIZE + (p16) LDFPD f122, f125 = [Y1], 2 * SIZE + nop __LINE__ + } + { .mmb + nop __LINE__ + nop __LINE__ + br.ctop.sptk.few .L12 + } + ;; + .align 32 + +.L15: + { .mmi + (p12) LDFPD f32, f33 = [X1], 2 * SIZE + (p12) LDFPD f34, f35 = [Y1], 2 * SIZE + mov pr = PR, -65474 + } + ;; + { .mmi + (p12) LDFPD f36, f37 = [X1], 2 * SIZE + (p12) LDFPD f38, f39 = [Y1], 2 * SIZE + mov ar.lc = ARLC + } + ;; + { .mmb + (p12) LDFPD f40, f41 = [X1], 2 * SIZE + (p12) LDFPD f42, f43 = [Y1], 2 * SIZE + (p11) br.ret.dpnt.many b0 + } + ;; + { .mmi + (p12) LDFPD f44, f45 = [X1], 2 * SIZE + (p12) LDFPD f46, f47 = [Y1], 2 * SIZE + tbit.nz p13, p0 = N, 2 + } + ;; + { .mmi + (p13) LDFPD f48, f49 = [X1], 2 * SIZE + (p13) LDFPD f50, f51 = [Y1], 2 * SIZE + tbit.nz p14, p0 = N, 1 + } + ;; + { .mmi + (p13) LDFPD f52, f53 = [X1], 2 * SIZE + (p13) LDFPD f54, f55 = [Y1], 2 * SIZE + tbit.nz p15, p0 = N, 0 + } + ;; + { .mmi + (p14) LDFPD f56, f57 = [X1], 2 * SIZE + (p14) LDFPD f58, f59 = [Y1], 2 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p15) LDFD f60 = [X1] + (p15) LDFD f61 = [Y1] + nop __LINE__ + } + ;; + (p12) FMA f6 = ALPHA, f32, f34 + (p12) FMA f7 = ALPHA, f36, f38 + (p12) FMA f10 = ALPHA, f33, f35 + (p12) FMA f11 = ALPHA, f37, f39 + (p12) FMA f12 = ALPHA, f40, f42 + (p12) FMA f13 = ALPHA, f44, f46 + (p12) FMA f14 = ALPHA, f41, f43 + (p12) FMA f15 = ALPHA, f45, f47 + ;; + { .mmf + (p12) STFD [YY1] = f6, 1 * SIZE + (p12) STFD [YY2] = f7, 1 * SIZE + (p13) FMA f6 = ALPHA, f48, f50 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMA f7 = ALPHA, f52, f54 + } + ;; + { .mmf + (p12) STFD [YY1] = f10, 3 * SIZE + (p12) STFD [YY2] = f11, 3 * SIZE + (p13) FMA f10 = ALPHA, f49, f51 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMA f11 = ALPHA, f53, f55 + } + ;; + { .mmf + (p12) STFD [YY1] = f12, 1 * SIZE + (p12) STFD [YY2] = f13, 1 * SIZE + (p14) FMA f12 = ALPHA, f56, f58 + } + ;; + { .mmf + (p12) STFD [YY1] = f14, 3 * SIZE + (p12) STFD [YY2] = f15, 3 * SIZE + (p14) FMA f13 = ALPHA, f57, f59 + } + ;; + { .mmf + (p13) STFD [YY1] = f6, 1 * SIZE + (p13) STFD [YY2] = f7, 1 * SIZE + (p15) FMA f14 = ALPHA, f60, f61 + } + ;; + { .mmi + (p13) STFD [YY1] = f10, 3 * SIZE + (p13) STFD [YY2] = f11, 3 * SIZE + } + ;; + { .mmi + (p14) STFD [YY1] = f12, 1 * SIZE + ;; + (p14) STFD [YY1] = f13, 1 * SIZE + nop __LINE__ + } + ;; + { .mmb + (p15) STFD [YY1] = f14 + nop __LINE__ + br.ret.sptk.many b0 + } + ;; + .align 32 + +.L20: + { .mmi + (p10) STFD [YY] = f9 + cmp.eq p7 ,p0 = -1, I + mov ar.lc = I + } + { .mib + adds PREX = (PREFETCHSIZE - 4) * SIZE, X1 + adds PREY = (PREFETCHSIZE + 2) * SIZE, Y1 + (p7) br.cond.dpnt .L25 + } + ;; + .align 32 + +.L22: + { .mmf + (p18) STFD [YY1] = f6, 1 * SIZE + (p18) STFD [YY2] = f7, 1 * SIZE + (p18) FMA f6 = ALPHA, f58, f106 + } + { .mmf + (p16) lfetch.fault.nt1 [PREX], 16 * SIZE + (p17) LDFPD f57, f60 = [X1], 2 * SIZE + (p18) FMA f7 = ALPHA, f64, f112 + } + ;; + { .mmf + (p18) STFD [YY1] = f10, 3 * SIZE + (p18) STFD [YY2] = f11, 3 * SIZE + (p18) FMA f10 = ALPHA, f61, f109 + } + { .mmf + (p16) lfetch.fault.excl.nt1 [PREY], 16 * SIZE + (p16) LDFPD f80, f83 = [Y1], 2 * SIZE + (p18) FMA f11 = ALPHA, f67, f115 + } + ;; + { .mmf + (p18) STFD [YY1] = f12, 1 * SIZE + (p18) STFD [YY2] = 
f13, 1 * SIZE + (p18) FMA f12 = ALPHA, f70, f118 + } + { .mmf + (p17) LDFPD f63, f66 = [X1], 2 * SIZE + (p16) LDFPD f86, f89 = [Y1], 2 * SIZE + (p18) FMA f13 = ALPHA, f76, f124 + } + ;; + { .mmf + (p18) STFD [YY1] = f14, 3 * SIZE + (p18) STFD [YY2] = f15, 3 * SIZE + (p18) FMA f14 = ALPHA, f73, f121 + } + { .mmf + (p17) LDFPD f69, f72 = [X1], 2 * SIZE + (p16) LDFPD f92, f95 = [Y1], 2 * SIZE + (p18) FMA f15 = ALPHA, f79, f127 + } + ;; + { .mmf + (p18) STFD [YY1] = f6, 1 * SIZE + (p18) STFD [YY2] = f7, 1 * SIZE + (p17) FMA f6 = ALPHA, f33, f81 + } + { .mmf + (p17) LDFPD f75, f78 = [X1], 2 * SIZE + (p16) LDFPD f98, f101 = [Y1], 2 * SIZE + (p17) FMA f7 = ALPHA, f39, f87 + } + ;; + { .mmf + (p18) STFD [YY1] = f10, 3 * SIZE + (p18) STFD [YY2] = f11, 3 * SIZE + (p17) FMA f10 = ALPHA, f36, f84 + } + { .mmf + (p16) LDFPD f32, f35 = [X1], 2 * SIZE + (p16) LDFPD f104, f107 = [Y1], 2 * SIZE + (p17) FMA f11 = ALPHA, f42, f90 + } + ;; + { .mmf + (p18) STFD [YY1] = f12, 1 * SIZE + (p18) STFD [YY2] = f13, 1 * SIZE + (p17) FMA f12 = ALPHA, f45, f93 + } + { .mmf + (p16) LDFPD f38, f41 = [X1], 2 * SIZE + (p16) LDFPD f110, f113 = [Y1], 2 * SIZE + (p17) FMA f13 = ALPHA, f51, f99 + } + ;; + { .mmf + (p18) STFD [YY1] = f14, 3 * SIZE + (p18) STFD [YY2] = f15, 3 * SIZE + (p17) FMA f14 = ALPHA, f48, f96 + } + { .mmf + (p16) LDFPD f44, f47 = [X1], 2 * SIZE + (p16) LDFPD f116, f119 = [Y1], 2 * SIZE + (p17) FMA f15 = ALPHA, f54, f102 + } + ;; + { .mmi + (p16) LDFPD f50, f53 = [X1], 2 * SIZE + (p16) LDFPD f122, f125 = [Y1], 2 * SIZE + nop __LINE__ + } + { .mmb + nop __LINE__ + nop __LINE__ + br.ctop.sptk.few .L22 + } + ;; + .align 32 + +.L25: + { .mmi + (p12) LDFPD f32, f33 = [X1], 2 * SIZE + (p12) LDFPD f34, f35 = [Y1], 2 * SIZE + mov pr = PR, -65474 + } + ;; + { .mmi + (p12) LDFPD f36, f37 = [X1], 2 * SIZE + (p12) LDFPD f38, f39 = [Y1], 2 * SIZE + mov ar.lc = ARLC + } + ;; + { .mmb + (p12) LDFPD f40, f41 = [X1], 2 * SIZE + (p12) LDFPD f42, f43 = [Y1], 2 * SIZE + (p11) br.ret.dpnt.many b0 + } + ;; + { .mmi + (p12) LDFPD f44, f45 = [X1], 2 * SIZE + (p12) LDFPD f46, f47 = [Y1], 2 * SIZE + tbit.nz p13, p0 = N, 2 + } + ;; + { .mmi + (p13) LDFPD f48, f49 = [X1], 2 * SIZE + (p13) LDFPD f50, f51 = [Y1], 2 * SIZE + tbit.nz p14, p0 = N, 1 + } + ;; + { .mmi + (p13) LDFPD f52, f53 = [X1], 2 * SIZE + (p13) LDFPD f54, f55 = [Y1], 2 * SIZE + tbit.nz p15, p0 = N, 0 + } + ;; + { .mmi + (p14) LDFPD f56, f57 = [X1], 2 * SIZE + (p14) LDFPD f58, f59 = [Y1], 2 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p15) LDFD f60 = [X1] + (p15) LDFD f61 = [Y1] + nop __LINE__ + } + ;; + (p12) FMA f6 = ALPHA, f32, f34 + (p12) FMA f7 = ALPHA, f36, f38 + (p12) FMA f10 = ALPHA, f33, f35 + (p12) FMA f11 = ALPHA, f37, f39 + (p12) FMA f12 = ALPHA, f40, f42 + (p12) FMA f13 = ALPHA, f44, f46 + (p12) FMA f14 = ALPHA, f41, f43 + (p12) FMA f15 = ALPHA, f45, f47 + ;; + { .mmf + (p12) STFD [YY1] = f6, 1 * SIZE + (p12) STFD [YY2] = f7, 1 * SIZE + (p13) FMA f6 = ALPHA, f48, f50 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMA f7 = ALPHA, f52, f54 + } + ;; + { .mmf + (p12) STFD [YY1] = f10, 3 * SIZE + (p12) STFD [YY2] = f11, 3 * SIZE + (p13) FMA f10 = ALPHA, f49, f51 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMA f11 = ALPHA, f53, f55 + } + ;; + { .mmf + (p12) STFD [YY1] = f12, 1 * SIZE + (p12) STFD [YY2] = f13, 1 * SIZE + (p14) FMA f12 = ALPHA, f56, f58 + } + ;; + { .mmf + (p12) STFD [YY1] = f14, 3 * SIZE + (p12) STFD [YY2] = f15, 3 * SIZE + (p14) FMA f13 = ALPHA, f57, f59 + } + ;; + { .mmf + (p13) STFD [YY1] = f6, 1 * SIZE + (p13) STFD [YY2] = f7, 1 * 
SIZE + (p15) FMA f14 = ALPHA, f60, f61 + } + ;; + { .mmi + (p13) STFD [YY1] = f10, 3 * SIZE + (p13) STFD [YY2] = f11, 3 * SIZE + } + ;; + { .mmi + (p14) STFD [YY1] = f12, 1 * SIZE + ;; + (p14) STFD [YY1] = f13, 1 * SIZE + nop __LINE__ + } + ;; + { .mmb + (p15) STFD [YY1] = f14 + nop __LINE__ + br.ret.sptk.many b0 + } + ;; + .align 32 + +.L30: + { .mmi + cmp.eq p11, p0 = r0, J + adds I = -1, I + mov ar.ec = 3 + } + { .mib + cmp.lt p9, p0 = 127, r8 + tbit.nz p12, p0 = N, 3 + (p9) br.cond.dptk.many .L40 + } + ;; + { .mmi + (p10) STFD [YY] = f9 + cmp.eq p7 ,p0 = -1, I + mov ar.lc = I + } + { .mib + adds PREX = (PREFETCHSIZE + 2) * SIZE, X1 + adds PREY = (PREFETCHSIZE + 2) * SIZE, Y1 + (p7) br.cond.dpnt .L35 + } + ;; + .align 32 + +.L32: + { .mmf + (p18) STFD [YY1] = f6, 1 * SIZE + (p18) STFD [YY2] = f7, 1 * SIZE + (p18) FMA f6 = ALPHA, f58, f106 + } + { .mmf + (p16) lfetch.fault.nt1 [PREX], 16 * SIZE + (p16) LDFD f32 = [X1], 1 * SIZE + (p18) FMA f7 = ALPHA, f64, f112 + } + ;; + { .mmf + (p18) STFD [YY1] = f10, 3 * SIZE + (p18) STFD [YY2] = f11, 3 * SIZE + (p18) FMA f10 = ALPHA, f61, f109 + } + { .mmf + (p16) LDFPD f35, f38 = [X1], 2 * SIZE + (p16) LDFPD f80, f83 = [Y1], 2 * SIZE + (p18) FMA f11 = ALPHA, f67, f115 + } + ;; + { .mmf + (p18) STFD [YY1] = f12, 1 * SIZE + (p18) STFD [YY2] = f13, 1 * SIZE + (p18) FMA f12 = ALPHA, f70, f118 + } + { .mmf + (p16) LDFPD f41, f44 = [X1], 2 * SIZE + (p16) LDFPD f86, f89 = [Y1], 2 * SIZE + (p18) FMA f13 = ALPHA, f76, f124 + } + ;; + { .mmf + (p18) STFD [YY1] = f14, 3 * SIZE + (p18) STFD [YY2] = f15, 3 * SIZE + (p18) FMA f14 = ALPHA, f73, f121 + } + { .mmf + (p16) LDFPD f47, f50 = [X1], 2 * SIZE + (p16) LDFPD f92, f95 = [Y1], 2 * SIZE + (p18) FMA f15 = ALPHA, f79, f127 + } + ;; + { .mmf + (p18) STFD [YY1] = f6, 1 * SIZE + (p18) STFD [YY2] = f7, 1 * SIZE + (p17) FMA f6 = ALPHA, f33, f81 + } + { .mmf + (p16) LDFPD f53, f56 = [X1], 2 * SIZE + (p16) LDFPD f98, f101 = [Y1], 2 * SIZE + (p17) FMA f7 = ALPHA, f39, f87 + } + ;; + { .mmf + (p18) STFD [YY1] = f10, 3 * SIZE + (p18) STFD [YY2] = f11, 3 * SIZE + (p17) FMA f10 = ALPHA, f36, f84 + } + { .mmf + (p16) LDFPD f59, f62 = [X1], 2 * SIZE + (p16) LDFPD f104, f107 = [Y1], 2 * SIZE + (p17) FMA f11 = ALPHA, f42, f90 + } + ;; + { .mmf + (p18) STFD [YY1] = f12, 1 * SIZE + (p18) STFD [YY2] = f13, 1 * SIZE + (p17) FMA f12 = ALPHA, f45, f93 + } + { .mmf + (p16) LDFPD f65, f68 = [X1], 2 * SIZE + (p16) LDFPD f110, f113 = [Y1], 2 * SIZE + (p17) FMA f13 = ALPHA, f51, f99 + } + ;; + { .mmf + (p18) STFD [YY1] = f14, 3 * SIZE + (p18) STFD [YY2] = f15, 3 * SIZE + (p17) FMA f14 = ALPHA, f48, f96 + } + { .mmf + (p16) LDFPD f71, f74 = [X1], 2 * SIZE + (p16) LDFPD f116, f119 = [Y1], 2 * SIZE + (p17) FMA f15 = ALPHA, f54, f102 + } + ;; + { .mmi + (p16) lfetch.fault.excl.nt1 [PREY], 16 * SIZE + (p16) LDFPD f122, f125 = [Y1], 2 * SIZE + adds X3 = 1 * SIZE, X1 + } + { .mmb + (p16) LDFD f77 = [X1], 1 * SIZE + nop __LINE__ + br.ctop.sptk.few .L32 + } + ;; + .align 32 + +.L35: + { .mmi + (p12) LDFPD f33, f36 = [X3] + (p12) LDFPD f34, f35 = [Y1], 2 * SIZE + mov pr = PR, -65474 + } + { .mmi + (p12) LDFD f32 = [X1], 3 * SIZE + (p12) adds X3 = 8 * SIZE, X3 + nop __LINE__ + } + ;; + { .mmi + (p12) LDFPD f37, f40 = [X1], 2 * SIZE + (p12) LDFPD f38, f39 = [Y1], 2 * SIZE + mov ar.lc = ARLC + } + ;; + { .mmb + (p12) LDFPD f41, f44 = [X1], 2 * SIZE + (p12) LDFPD f42, f43 = [Y1], 2 * SIZE + (p11) br.ret.dpnt.many b0 + } + ;; + { .mmi + (p12) LDFD f45 = [X1], 1 * SIZE + (p12) LDFPD f46, f47 = [Y1], 2 * SIZE + tbit.nz p13, p0 = N, 2 + } + ;; + { .mmi + 
(p13) LDFPD f49, f52 = [X3] + (p13) LDFPD f50, f51 = [Y1], 2 * SIZE + tbit.nz p14, p0 = N, 1 + } + { .mmi + (p13) LDFD f48 = [X1], 3 * SIZE + (p13) adds X3 = 4 * SIZE, X3 + nop __LINE__ + } + ;; + { .mmi + (p13) LDFD f53 = [X1], 1 * SIZE + (p13) LDFPD f54, f55 = [Y1], 2 * SIZE + tbit.nz p15, p0 = N, 0 + } + ;; + { .mmi + (p14) LDFD f56 = [X1], 2 * SIZE + (p14) LDFPD f58, f59 = [Y1], 2 * SIZE + nop __LINE__ + } + { .mmi + (p14) LDFD f57 = [X3] + nop __LINE__ + nop __LINE__ + } + ;; + { .mmi + (p15) LDFD f60 = [X1] + (p15) LDFD f61 = [Y1] + nop __LINE__ + } + ;; + (p12) FMA f6 = ALPHA, f32, f34 + (p12) FMA f7 = ALPHA, f36, f38 + (p12) FMA f10 = ALPHA, f33, f35 + (p12) FMA f11 = ALPHA, f37, f39 + (p12) FMA f12 = ALPHA, f40, f42 + (p12) FMA f13 = ALPHA, f44, f46 + (p12) FMA f14 = ALPHA, f41, f43 + (p12) FMA f15 = ALPHA, f45, f47 + ;; + { .mmf + (p12) STFD [YY1] = f6, 1 * SIZE + (p12) STFD [YY2] = f7, 1 * SIZE + (p13) FMA f6 = ALPHA, f48, f50 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMA f7 = ALPHA, f52, f54 + } + ;; + { .mmf + (p12) STFD [YY1] = f10, 3 * SIZE + (p12) STFD [YY2] = f11, 3 * SIZE + (p13) FMA f10 = ALPHA, f49, f51 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMA f11 = ALPHA, f53, f55 + } + ;; + { .mmf + (p12) STFD [YY1] = f12, 1 * SIZE + (p12) STFD [YY2] = f13, 1 * SIZE + (p14) FMA f12 = ALPHA, f56, f58 + } + ;; + { .mmf + (p12) STFD [YY1] = f14, 3 * SIZE + (p12) STFD [YY2] = f15, 3 * SIZE + (p14) FMA f13 = ALPHA, f57, f59 + } + ;; + { .mmf + (p13) STFD [YY1] = f6, 1 * SIZE + (p13) STFD [YY2] = f7, 1 * SIZE + (p15) FMA f14 = ALPHA, f60, f61 + } + ;; + { .mmi + (p13) STFD [YY1] = f10, 3 * SIZE + (p13) STFD [YY2] = f11, 3 * SIZE + } + ;; + { .mmi + (p14) STFD [YY1] = f12, 1 * SIZE + ;; + (p14) STFD [YY1] = f13, 1 * SIZE + nop __LINE__ + } + ;; + { .mmb + (p15) STFD [YY1] = f14 + nop __LINE__ + br.ret.sptk.many b0 + } + ;; + .align 32 + +.L40: + { .mmi + (p10) STFD [YY] = f9 + cmp.eq p7 ,p0 = -1, I + mov ar.lc = I + } + { .mib + adds PREX = (PREFETCHSIZE + 2) * SIZE, X1 + adds PREY = (PREFETCHSIZE + 8) * SIZE, Y1 + (p7) br.cond.dpnt .L45 + } + ;; + .align 32 + +.L42: + { .mmf + (p18) STFD [YY1] = f6, 1 * SIZE + (p18) STFD [YY2] = f7, 1 * SIZE + (p18) FMA f6 = ALPHA, f58, f106 + } + { .mmf + (p17) LDFPD f54, f57 = [X1], 2 * SIZE + (p16) LDFPD f80, f83 = [Y1], 2 * SIZE + (p18) FMA f7 = ALPHA, f64, f112 + } + ;; + { .mmf + (p18) STFD [YY1] = f10, 3 * SIZE + (p18) STFD [YY2] = f11, 3 * SIZE + (p18) FMA f64 = ALPHA, f61, f109 + } + { .mmf + (p17) LDFPD f60, f63 = [X1], 2 * SIZE + (p16) LDFPD f86, f89 = [Y1], 2 * SIZE + (p18) FMA f11 = ALPHA, f67, f115 + } + ;; + { .mmf + (p18) STFD [YY1] = f12, 1 * SIZE + (p18) STFD [YY2] = f13, 1 * SIZE + (p18) FMA f67 = ALPHA, f70, f118 + } + { .mmf + (p17) LDFPD f66, f69 = [X1], 2 * SIZE + (p16) LDFPD f92, f95 = [Y1], 2 * SIZE + (p18) FMA f13 = ALPHA, f76, f124 + } + ;; + { .mmf + (p18) STFD [YY1] = f14, 3 * SIZE + (p18) STFD [YY2] = f15, 3 * SIZE + (p18) FMA f9 = ALPHA, f73, f121 + } + { .mmf + (p17) LDFPD f72, f75 = [X1], 2 * SIZE + (p16) LDFPD f98, f101 = [Y1], 2 * SIZE + (p18) FMA f15 = ALPHA, f79, f127 + } + ;; + { .mmi + (p18) STFD [YY1] = f6, 1 * SIZE + (p18) STFD [YY2] = f7, 1 * SIZE + (p17) adds X3 = 2 * SIZE, X1 + } + { .mmf + (p16) LDFPD f104, f107 = [Y1], 2 * SIZE + (p17) LDFD f78 = [X1], 1 * SIZE + (p17) FMA f6 = ALPHA, f33, f81 + } + ;; + { .mmf + (p16) LDFPD f110, f113 = [Y1], 2 * SIZE + (p16) lfetch.fault.nt1 [PREX], 16 * SIZE + (p17) FMA f7 = ALPHA, f39, f87 + } + { .mmf + (p16) LDFD f32 = [X1], 1 * SIZE + (p17) FMA f10 
= ALPHA, f36, f84 + } + ;; + { .mmf + (p18) STFD [YY1] = f64, 3 * SIZE + (p18) STFD [YY2] = f11, 3 * SIZE + (p17) FMA f11 = ALPHA, f42, f90 + } + { .mmf + (p16) LDFPD f35, f38 = [X1], 2 * SIZE + (p16) LDFPD f116, f119 = [Y1], 2 * SIZE + (p17) FMA f12 = ALPHA, f45, f93 + } + ;; + { .mmf + (p18) STFD [YY1] = f67, 1 * SIZE + (p18) STFD [YY2] = f13, 1 * SIZE + (p17) FMA f13 = ALPHA, f51, f99 + } + { .mmf + (p16) LDFPD f41, f44 = [X1], 2 * SIZE + (p16) LDFPD f122, f125 = [Y1], 2 * SIZE + (p17) FMA f14 = ALPHA, f48, f96 + } + ;; + { .mmf + (p18) STFD [YY1] = f9, 3 * SIZE + (p18) STFD [YY2] = f15, 3 * SIZE + (p17) FMA f15 = ALPHA, f54, f102 + } + { .mmb + (p16) lfetch.fault.excl.nt1 [PREY], 16 * SIZE + (p16) LDFPD f47, f50 = [X1], 2 * SIZE + br.ctop.sptk.few .L42 + } + ;; + .align 32 + +.L45: + { .mmi + (p12) LDFPD f33, f36 = [X3] + (p12) LDFPD f34, f35 = [Y1], 2 * SIZE + mov pr = PR, -65474 + } + { .mmi + (p12) LDFD f32 = [X1], 3 * SIZE + (p12) adds X3 = 8 * SIZE, X3 + nop __LINE__ + } + ;; + { .mmi + (p12) LDFPD f37, f40 = [X1], 2 * SIZE + (p12) LDFPD f38, f39 = [Y1], 2 * SIZE + mov ar.lc = ARLC + } + ;; + { .mmb + (p12) LDFPD f41, f44 = [X1], 2 * SIZE + (p12) LDFPD f42, f43 = [Y1], 2 * SIZE + (p11) br.ret.dpnt.many b0 + } + ;; + { .mmi + (p12) LDFD f45 = [X1], 1 * SIZE + (p12) LDFPD f46, f47 = [Y1], 2 * SIZE + tbit.nz p13, p0 = N, 2 + } + ;; + { .mmi + (p13) LDFPD f49, f52 = [X3] + (p13) LDFPD f50, f51 = [Y1], 2 * SIZE + tbit.nz p14, p0 = N, 1 + } + { .mmi + (p13) LDFD f48 = [X1], 3 * SIZE + (p13) adds X3 = 4 * SIZE, X3 + nop __LINE__ + } + ;; + { .mmi + (p13) LDFD f53 = [X1], 1 * SIZE + (p13) LDFPD f54, f55 = [Y1], 2 * SIZE + tbit.nz p15, p0 = N, 0 + } + ;; + { .mmi + (p14) LDFD f56 = [X1], 2 * SIZE + (p14) LDFPD f58, f59 = [Y1], 2 * SIZE + nop __LINE__ + } + { .mmi + (p14) LDFD f57 = [X3] + nop __LINE__ + nop __LINE__ + } + ;; + { .mmi + (p15) LDFD f60 = [X1] + (p15) LDFD f61 = [Y1] + nop __LINE__ + } + ;; + (p12) FMA f6 = ALPHA, f32, f34 + (p12) FMA f7 = ALPHA, f36, f38 + (p12) FMA f10 = ALPHA, f33, f35 + (p12) FMA f11 = ALPHA, f37, f39 + (p12) FMA f12 = ALPHA, f40, f42 + (p12) FMA f13 = ALPHA, f44, f46 + (p12) FMA f14 = ALPHA, f41, f43 + (p12) FMA f15 = ALPHA, f45, f47 + ;; + { .mmf + (p12) STFD [YY1] = f6, 1 * SIZE + (p12) STFD [YY2] = f7, 1 * SIZE + (p13) FMA f6 = ALPHA, f48, f50 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMA f7 = ALPHA, f52, f54 + } + ;; + { .mmf + (p12) STFD [YY1] = f10, 3 * SIZE + (p12) STFD [YY2] = f11, 3 * SIZE + (p13) FMA f10 = ALPHA, f49, f51 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMA f11 = ALPHA, f53, f55 + } + ;; + { .mmf + (p12) STFD [YY1] = f12, 1 * SIZE + (p12) STFD [YY2] = f13, 1 * SIZE + (p14) FMA f12 = ALPHA, f56, f58 + } + ;; + { .mmf + (p12) STFD [YY1] = f14, 3 * SIZE + (p12) STFD [YY2] = f15, 3 * SIZE + (p14) FMA f13 = ALPHA, f57, f59 + } + ;; + { .mmf + (p13) STFD [YY1] = f6, 1 * SIZE + (p13) STFD [YY2] = f7, 1 * SIZE + (p15) FMA f14 = ALPHA, f60, f61 + } + ;; + { .mmi + (p13) STFD [YY1] = f10, 3 * SIZE + (p13) STFD [YY2] = f11, 3 * SIZE + } + ;; + { .mmi + (p14) STFD [YY1] = f12, 1 * SIZE + ;; + (p14) STFD [YY1] = f13, 1 * SIZE + nop __LINE__ + } + ;; + { .mmb + (p15) STFD [YY1] = f14 + nop __LINE__ + br.ret.sptk.many b0 + } + ;; + .align 32 + +.L100: + { .mfi + cmp.eq p16, p0 = r0, r0 + (p10) FMA f9 = ALPHA, f32, f33 + shr I = N, 4 + } + ;; + { .mmi + cmp.eq p11, p0 = r0, J + adds I = -1, I + mov ar.ec = 3 + } + { .mmi + shladd INCX16 = INCX, 4, r0 + shladd INCY16 = INCY, 4, r0 + tbit.nz p12, p0 = N, 3 + } + ;; + { .mmi + (p10) 
STFD [YY] = f9 + cmp.eq p7 ,p0 = -1, I + mov ar.lc = I + } + { .mib + adds PREX = (PREFETCHSIZE + 2) * SIZE, X1 + adds PREY = (PREFETCHSIZE + 2) * SIZE, Y1 + (p7) br.cond.dpnt .L115 + } + ;; + .align 32 + +.L112: + { .mmf + (p18) STFD [YY1] = f6 + (p16) lfetch.fault.nt1 [PREX], INCX16 + (p18) FMA f12 = ALPHA, f46, f94 + } + { .mmi + (p16) LDFD f32 = [X1], INCX + (p16) LDFD f80 = [Y1], INCY + (p18) add YY1 = YY1, INCY + } + ;; + { .mmf + (p18) STFD [YY1] = f7 + (p18) add YY1 = YY1, INCY + (p18) FMA f13 = ALPHA, f49, f97 + } + { .mmi + (p16) LDFD f35 = [X1], INCX + (p16) LDFD f83 = [Y1], INCY + nop __LINE__ + } + ;; + { .mmf + (p18) STFD [YY1] = f10 + (p18) add YY1 = YY1, INCY + (p18) FMA f14 = ALPHA, f52, f100 + } + { .mmi + (p16) LDFD f38 = [X1], INCX + (p16) LDFD f86 = [Y1], INCY + nop __LINE__ + } + ;; + { .mmf + (p18) STFD [YY1] = f11 + (p18) add YY1 = YY1, INCY + (p18) FMA f15 = ALPHA, f55, f103 + } + { .mmi + (p16) LDFD f41 = [X1], INCX + (p16) LDFD f89 = [Y1], INCY + nop __LINE__ + } + ;; + { .mmf + (p18) STFD [YY1] = f12 + (p18) add YY1 = YY1, INCY + (p18) FMA f6 = ALPHA, f58, f106 + } + { .mmi + (p16) LDFD f44 = [X1], INCX + (p16) LDFD f92 = [Y1], INCY + nop __LINE__ + } + ;; + { .mmf + (p18) STFD [YY1] = f13 + (p18) add YY1 = YY1, INCY + (p18) FMA f7 = ALPHA, f61, f109 + } + { .mmi + (p16) LDFD f47 = [X1], INCX + (p16) LDFD f95 = [Y1], INCY + nop __LINE__ + } + ;; + { .mmf + (p18) STFD [YY1] = f14 + (p18) add YY1 = YY1, INCY + (p18) FMA f10 = ALPHA, f64, f112 + } + { .mmi + (p16) LDFD f50 = [X1], INCX + (p16) LDFD f98 = [Y1], INCY + nop __LINE__ + } + ;; + { .mmf + (p18) STFD [YY1] = f15 + (p18) add YY1 = YY1, INCY + (p18) FMA f11 = ALPHA, f67, f115 + } + { .mmi + (p16) LDFD f53 = [X1], INCX + (p16) LDFD f101 = [Y1], INCY + nop __LINE__ + } + ;; + { .mmf + (p18) STFD [YY1] = f6 + (p16) lfetch.fault.excl.nt1 [PREY], INCY16 + (p18) FMA f12 = ALPHA, f70, f118 + } + { .mmi + (p16) LDFD f56 = [X1], INCX + (p16) LDFD f104 = [Y1], INCY + (p18) add YY1 = YY1, INCY + } + ;; + { .mmf + (p18) STFD [YY1] = f7 + (p18) add YY1 = YY1, INCY + (p18) FMA f13 = ALPHA, f73, f121 + } + { .mmi + (p16) LDFD f59 = [X1], INCX + (p16) LDFD f107 = [Y1], INCY + nop __LINE__ + } + ;; + { .mmf + (p18) STFD [YY1] = f10 + (p18) add YY1 = YY1, INCY + (p18) FMA f14 = ALPHA, f76, f124 + } + { .mmi + (p16) LDFD f62 = [X1], INCX + (p16) LDFD f110 = [Y1], INCY + nop __LINE__ + } + ;; + { .mmf + (p18) STFD [YY1] = f11 + (p18) add YY1 = YY1, INCY + (p18) FMA f15 = ALPHA, f79, f127 + } + { .mmi + (p16) LDFD f65 = [X1], INCX + (p16) LDFD f113 = [Y1], INCY + nop __LINE__ + } + ;; + { .mmf + (p18) STFD [YY1] = f12 + (p18) add YY1 = YY1, INCY + (p17) FMA f6 = ALPHA, f33, f81 + } + { .mmi + (p16) LDFD f68 = [X1], INCX + (p16) LDFD f116 = [Y1], INCY + nop __LINE__ + } + ;; + { .mmf + (p18) STFD [YY1] = f13 + (p18) add YY1 = YY1, INCY + (p17) FMA f7 = ALPHA, f36, f84 + } + { .mmi + (p16) LDFD f71 = [X1], INCX + (p16) LDFD f119 = [Y1], INCY + nop __LINE__ + } + ;; + { .mmf + (p18) STFD [YY1] = f14 + (p18) add YY1 = YY1, INCY + (p17) FMA f10 = ALPHA, f39, f87 + } + { .mmi + (p16) LDFD f74 = [X1], INCX + (p16) LDFD f122 = [Y1], INCY + nop __LINE__ + } + ;; + { .mmf + (p18) STFD [YY1] = f15 + (p18) add YY1 = YY1, INCY + (p17) FMA f11 = ALPHA, f42, f90 + } + { .mmb + (p16) LDFD f77 = [X1], INCX + (p16) LDFD f125 = [Y1], INCY + br.ctop.sptk.few .L112 + } + ;; + .align 32 + +.L115: + { .mmi + (p12) LDFD f32 = [X1], INCX + (p12) LDFD f34 = [Y1], INCY + mov pr = PR, -65474 + } + ;; + { .mmi + (p12) LDFD f33 = [X1], INCX + (p12) LDFD f35 
= [Y1], INCY + mov ar.lc = ARLC + } + ;; + { .mmb + (p12) LDFD f36 = [X1], INCX + (p12) LDFD f38 = [Y1], INCY + (p11) br.ret.dpnt.many b0 + } + ;; + { .mmi + (p12) LDFD f37 = [X1], INCX + (p12) LDFD f39 = [Y1], INCY + tbit.nz p13, p0 = N, 2 + } + ;; + { .mmi + (p12) LDFD f40 = [X1], INCX + (p12) LDFD f42 = [Y1], INCY + tbit.nz p14, p0 = N, 1 + } + ;; + { .mmi + (p12) LDFD f41 = [X1], INCX + (p12) LDFD f43 = [Y1], INCY + tbit.nz p15, p0 = N, 0 + } + ;; + { .mmf + (p12) LDFD f44 = [X1], INCX + (p12) LDFD f46 = [Y1], INCY + (p12) FMA f6 = ALPHA, f32, f34 + } + ;; + { .mmf + (p12) LDFD f45 = [X1], INCX + (p12) LDFD f47 = [Y1], INCY + (p12) FMA f7 = ALPHA, f33, f35 + } + ;; + { .mmf + (p13) LDFD f48 = [X1], INCX + (p13) LDFD f50 = [Y1], INCY + (p12) FMA f10 = ALPHA, f36, f38 + } + ;; + { .mmf + (p13) LDFD f49 = [X1], INCX + (p13) LDFD f51 = [Y1], INCY + (p12) FMA f11 = ALPHA, f37, f39 + } + ;; + { .mmf + (p12) STFD [YY1] = f6 + (p12) add YY1 = YY1, INCY + (p12) FMA f12 = ALPHA, f40, f42 + } + { .mmi + (p13) LDFD f52 = [X1], INCX + (p13) LDFD f54 = [Y1], INCY + nop __LINE__ + } + ;; + { .mmf + (p12) STFD [YY1] = f7 + (p12) add YY1 = YY1, INCY + (p12) FMA f13 = ALPHA, f41, f43 + } + { .mmi + (p13) LDFD f53 = [X1], INCX + (p13) LDFD f55 = [Y1], INCY + nop __LINE__ + } + ;; + { .mmf + (p12) STFD [YY1] = f10 + (p12) add YY1 = YY1, INCY + (p12) FMA f14 = ALPHA, f44, f46 + } + { .mmi + (p14) LDFD f56 = [X1], INCX + (p14) LDFD f58 = [Y1], INCY + nop __LINE__ + } + ;; + { .mmf + (p12) STFD [YY1] = f11 + (p12) add YY1 = YY1, INCY + (p12) FMA f15 = ALPHA, f45, f47 + } + { .mmi + (p14) LDFD f57 = [X1], INCX + (p14) LDFD f59 = [Y1], INCY + nop __LINE__ + } + ;; + { .mmf + (p12) STFD [YY1] = f12 + (p12) add YY1 = YY1, INCY + (p13) FMA f6 = ALPHA, f48, f50 + } + { .mmi + (p15) LDFD f60 = [X1], INCX + (p15) LDFD f61 = [Y1], INCY + nop __LINE__ + } + ;; + { .mmf + (p12) STFD [YY1] = f13 + (p12) add YY1 = YY1, INCY + (p13) FMA f7 = ALPHA, f49, f51 + } + ;; + { .mmf + (p12) STFD [YY1] = f14 + (p12) add YY1 = YY1, INCY + (p13) FMA f10 = ALPHA, f52, f54 + } + ;; + { .mmf + (p12) STFD [YY1] = f15 + (p12) add YY1 = YY1, INCY + (p13) FMA f11 = ALPHA, f53, f55 + } + ;; + ;; + { .mmf + (p13) STFD [YY1] = f6 + (p13) add YY1 = YY1, INCY + (p14) FMA f12 = ALPHA, f56, f58 + } + ;; + { .mmf + (p13) STFD [YY1] = f7 + (p13) add YY1 = YY1, INCY + (p14) FMA f13 = ALPHA, f57, f59 + } + ;; + { .mmf + (p13) STFD [YY1] = f10 + (p13) add YY1 = YY1, INCY + (p15) FMA f14 = ALPHA, f60, f61 + } + ;; + { .mmi + (p13) STFD [YY1] = f11 + (p13) add YY1 = YY1, INCY + nop __LINE__ + } + ;; + { .mmi + (p14) STFD [YY1] = f12 + (p14) add YY1 = YY1, INCY + nop __LINE__ + } + ;; + { .mmi + (p14) STFD [YY1] = f13 + (p14) add YY1 = YY1, INCY + nop __LINE__ + } + ;; + { .mmb + (p15) STFD [YY1] = f14 + nop __LINE__ + br.ret.sptk.many b0 + } + ;; + + + EPILOGUE + diff --git a/kernel/ia64/ddot.S b/kernel/ia64/ddot.S new file mode 100644 index 0000000000..082c303d85 --- /dev/null +++ b/kernel/ia64/ddot.S @@ -0,0 +1,1184 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define PREFETCH_SIZE (16 * 16 + 2) + +#define N r32 +#define X1 r33 +#define INCX r34 +#define Y1 r35 +#define INCY r36 + +#define PREX r2 +#define PREY r3 + +#define I r14 +#define J r15 +#define Y2 r16 +#define X2 r17 +#define INCX16 r18 +#define INCY16 r19 +#define INCX3 r20 +#define INCY3 r21 +#define YY r22 +#define XA r23 +#define YA r24 +#define XX r25 + +#define PR r30 +#define ARLC r31 + + PROLOGUE + .prologue + PROFCODE + { .mfi + nop.m 0 + mov f8 = f0 + .save ar.lc, ARLC + mov ARLC = ar.lc + } + { .mfi + mov r26 = 1 + mov f9 = f0 + shr XA = X1, 4 + } + ;; + .body +#ifdef F_INTERFACE + LDINT N = [N] + LDINT INCX = [INCX] + LDINT INCY = [INCY] + ;; +#ifndef USE64BITINT + sxt4 N = N + sxt4 INCX = INCX + sxt4 INCY = INCY + ;; +#endif + + cmp.le p0, p6 = r0, INCX + cmp.le p0, p7 = r0, INCY + sub r26 = r26, N + ;; + setf.sig f32 = r26 + setf.sig f33 = INCX + setf.sig f34 = INCY + ;; + xmpy.l f33 = f32, f33 + xmpy.l f34 = f32, f34 + ;; + getf.sig r26 = f33 + getf.sig r27 = f34 + ;; + (p6) shladd X1 = r26, BASE_SHIFT, X1 + (p7) shladd Y1 = r27, BASE_SHIFT, Y1 + ;; +#endif + { .mfi + shladd INCX = INCX, BASE_SHIFT, r0 + mov f32 = f0 + mov PR = pr + } + { .mfb + cmp.lt p0, p6 = r0, N + mov f80 = f0 + (p6) br.ret.sptk.many b0 + } + ;; + { .mfi + shladd INCY = INCY, BASE_SHIFT, r0 + mov f10 = f0 + tbit.nz p15, p0 = X1, BASE_SHIFT + } + { .mfb + cmp.ne p6, p0 = SIZE, INCX + mov f11 = f0 + (p6) br.cond.dptk .L100 + } + ;; + { .mfi + (p15) LDFD f32 = [X1], INCX + mov f12 = f0 + mov pr.rot= 0 + } + { .mfi + (p15) adds N = -1, N + mov f13 = f0 + shr YA = Y1, 4 + } + ;; + { .mfi + (p15) LDFD f80 = [Y1], INCY + mov f14 = f0 + shr I = N, 4 + } + { .mmi + and J = 15, N + and XA = 0xf, XA + and YA = 0xf, YA + } + ;; + { .mmi + shladd INCX3 = INCX, 1, INCX + shladd INCY3 = INCY, 1, INCY + sub XA = YA, XA + } + { .mmi + shladd INCX16 = INCX, 4, r0 + shladd INCY16 = INCY, 4, r0 + tbit.z p0, p12 = N, 3 + } + ;; + { .mmi + shladd Y2 = INCY, 1, Y1 + cmp.eq p7, p0 = r0, J + mov ar.ec= 3 + } + { .mmi + adds I = -1, I + cmp.ge p8, p0 = 2, XA + cmp.eq p16, p0 
= r0, r0 + } + ;; + { .mbb + cmp.le p9, p0 = 12, XA + (p8) br.cond.dpnt .L20 + (p9) br.cond.dpnt .L20 + } + ;; + { .mmi + adds PREX = PREFETCH_SIZE * SIZE, X1 + adds PREY = (PREFETCH_SIZE + 3) * SIZE, Y1 + mov ar.lc = I + } + { .mfb + cmp.eq p6 ,p0 = -1, I + FMA f15 = f32, f80, f0 + (p6) br.cond.dpnt .L15 + } + ;; + .align 32 + +/* INCX == 1 && X is aligned */ +.L12: + { .mmf + (p16) LDFPD f32, f35 = [X1], 2 * SIZE + (p16) lfetch.nt1 [PREX], INCX16 + (p18) FMA f8 = f34, f82, f8 + } + { .mmf + (p16) LDFD f80 = [Y1], INCY + (p16) LDFD f86 = [Y2], INCY + (p18) FMA f9 = f37, f85, f9 + } + ;; + { .mmf + (p16) LDFPD f38, f41 = [X1], 2 * SIZE + (p16) lfetch.nt1 [PREY], INCY16 + (p18) FMA f10 = f40, f88, f10 + } + { .mmf + (p16) LDFD f83 = [Y1], INCY3 + (p16) LDFD f89 = [Y2], INCY3 + (p18) FMA f11 = f43, f91, f11 + } + ;; + { .mmf + (p16) LDFPD f44, f47 = [X1], 2 * SIZE + (p18) FMA f12 = f46, f94, f12 + } + { .mmf + (p16) LDFD f92 = [Y1], INCY + (p16) LDFD f98 = [Y2], INCY + (p18) FMA f13 = f49, f97, f13 + } + ;; + { .mmf + (p16) LDFPD f50, f53 = [X1], 2 * SIZE + (p18) FMA f14 = f52, f100, f14 + } + { .mmf + (p16) LDFD f95 = [Y1], INCY3 + (p16) LDFD f101 = [Y2], INCY3 + (p18) FMA f15 = f55, f103, f15 + } + ;; + { .mmf + (p16) LDFPD f56, f59 = [X1], 2 * SIZE + (p18) FMA f8 = f58, f106, f8 + } + { .mmf + (p16) LDFD f104 = [Y1], INCY + (p16) LDFD f110 = [Y2], INCY + (p18) FMA f9 = f61, f109, f9 + } + ;; + { .mmf + (p16) LDFPD f62, f65 = [X1], 2 * SIZE + (p18) FMA f10 = f64, f112, f10 + } + { .mmf + (p16) LDFD f107 = [Y1], INCY3 + (p16) LDFD f113 = [Y2], INCY3 + (p18) FMA f11 = f67, f115, f11 + } + ;; + { .mmf + (p16) LDFPD f68, f71 = [X1], 2 * SIZE + (p18) FMA f12 = f70, f118, f12 + } + { .mmf + (p16) LDFD f116 = [Y1], INCY + (p16) LDFD f122 = [Y2], INCY + (p18) FMA f13 = f73, f121, f13 + } + ;; + { .mmf + (p16) LDFPD f74, f77 = [X1], 2 * SIZE + (p16) LDFD f119 = [Y1], INCY3 + (p18) FMA f14 = f76, f124, f14 + } + { .mfb + (p16) LDFD f125 = [Y2], INCY3 + (p18) FMA f15 = f79, f127, f15 + br.ctop.sptk.few .L12 + } + ;; + .align 32 + +.L15: + { .mmi + (p12) LDFPD f32, f33 = [X1], 2 * SIZE + mov YY = Y1 + tbit.z p0, p13 = N, 2 + } + { .mmb + (p12) LDFD f34 = [Y1], INCY + (p12) LDFD f38 = [Y2], INCY + (p7) br.cond.dptk .L999 + } + ;; + { .mmi + (p12) LDFPD f36, f37 = [X1], 2 * SIZE + (p12) shladd YY = INCY, 3, YY + tbit.z p0, p14 = N, 1 + } + { .mmi + (p12) LDFD f35 = [Y1], INCY3 + (p12) LDFD f39 = [Y2], INCY3 + tbit.z p0, p15 = N, 0 + } + ;; + { .mmi + (p12) LDFPD f40, f41 = [X1], 2 * SIZE + (p13) shladd YY = INCY, 2, YY + } + { .mmi + (p12) LDFD f42 = [Y1], INCY + (p12) LDFD f46 = [Y2], INCY + } + ;; + (p12) LDFPD f44, f45 = [X1], 2 * SIZE + (p12) LDFD f43 = [Y1], INCY3 + (p12) LDFD f47 = [Y2], INCY3 + (p14) shladd YY = INCY, 1, YY + ;; + (p13) LDFPD f48, f49 = [X1], 2 * SIZE + (p13) LDFD f50 = [Y1], INCY + (p13) LDFD f54 = [Y2], INCY + ;; + (p13) LDFPD f52, f53 = [X1], 2 * SIZE + (p13) LDFD f51 = [Y1], INCY3 + (p13) LDFD f55 = [Y2], INCY3 + ;; + (p14) LDFPD f56, f57 = [X1], 2 * SIZE + (p14) LDFD f58 = [Y1], INCY + (p15) LDFD f61 = [YY] + ;; + (p14) LDFD f59 = [Y1] + (p15) LDFD f60 = [X1] + ;; + (p12) FMA f8 = f32, f34, f8 + (p12) FMA f9 = f33, f35, f9 + (p12) FMA f10 = f36, f38, f10 + (p12) FMA f11 = f37, f39, f11 + (p12) FMA f12 = f40, f42, f12 + (p12) FMA f13 = f41, f43, f13 + (p12) FMA f14 = f44, f46, f14 + (p12) FMA f15 = f45, f47, f15 + ;; + (p13) FMA f8 = f48, f50, f8 + (p13) FMA f9 = f49, f51, f9 + (p13) FMA f10 = f52, f54, f10 + (p13) FMA f11 = f53, f55, f11 + (p14) FMA f12 = f56, f58, f12 + 
(p14) FMA f13 = f57, f59, f13 + (p15) FMA f14 = f60, f61, f14 + br .L999 + ;; + .align 32 + +.L20: + { .mmi + adds PREX = PREFETCH_SIZE * SIZE, X1 + adds PREY = (PREFETCH_SIZE + 18) * SIZE, Y1 + mov ar.lc = I + } + { .mfb + cmp.eq p6 ,p0 = -1, I + FMA f15 = f32, f80, f0 + (p6) br.cond.dpnt .L25 + } + ;; + .align 32 + +.L22: + { .mmf + (p16) LDFPD f32, f35 = [X1], 2 * SIZE + (p16) lfetch.nt1 [PREX], INCX16 + (p18) FMA f8 = f34, f82, f8 + } + { .mmf + (p17) LDFD f105 = [Y1], INCY + (p17) LDFD f111 = [Y2], INCY + (p18) FMA f9 = f37, f85, f9 + } + ;; + { .mmf + (p16) LDFPD f38, f41 = [X1], 2 * SIZE + (p16) lfetch.nt1 [PREY], INCY16 + (p18) FMA f10 = f40, f88, f10 + } + { .mmf + (p17) LDFD f108 = [Y1], INCY3 + (p17) LDFD f114 = [Y2], INCY3 + (p18) FMA f11 = f43, f91, f11 + } + ;; + { .mmf + (p16) LDFPD f44, f47 = [X1], 2 * SIZE + (p18) FMA f12 = f46, f94, f12 + } + { .mmf + (p17) LDFD f117 = [Y1], INCY + (p17) LDFD f123 = [Y2], INCY + (p18) FMA f13 = f49, f97, f13 + } + ;; + { .mmf + (p16) LDFPD f50, f53 = [X1], 2 * SIZE + (p18) FMA f14 = f52, f100, f14 + } + { .mmf + (p17) LDFD f120 = [Y1], INCY3 + (p17) LDFD f126 = [Y2], INCY3 + (p18) FMA f15 = f55, f103, f15 + } + ;; + { .mmf + (p16) LDFPD f56, f59 = [X1], 2 * SIZE + (p18) FMA f8 = f58, f106, f8 + } + { .mmf + (p16) LDFD f80 = [Y1], INCY + (p16) LDFD f86 = [Y2], INCY + (p18) FMA f9 = f61, f109, f9 + } + ;; + { .mmf + (p16) LDFPD f62, f65 = [X1], 2 * SIZE + (p18) FMA f10 = f64, f112, f10 + } + { .mmf + (p16) LDFD f83 = [Y1], INCY3 + (p16) LDFD f89 = [Y2], INCY3 + (p18) FMA f11 = f67, f115, f11 + } + ;; + { .mmf + (p16) LDFPD f68, f71 = [X1], 2 * SIZE + (p18) FMA f12 = f70, f118, f12 + } + { .mmf + (p16) LDFD f92 = [Y1], INCY + (p16) LDFD f98 = [Y2], INCY + (p18) FMA f13 = f73, f121, f13 + } + ;; + { .mmf + (p16) LDFPD f74, f77 = [X1], 2 * SIZE + (p16) LDFD f95 = [Y1], INCY3 + (p18) FMA f14 = f76, f124, f14 + } + { .mfb + (p16) LDFD f101 = [Y2], INCY3 + (p18) FMA f15 = f79, f127, f15 + br.ctop.sptk.few .L22 + } + ;; + .align 32 + +.L25: + { .mmi + (p12) LDFPD f32, f33 = [X1], 2 * SIZE + mov YY = Y1 + tbit.z p0, p13 = N, 2 + } + { .mmb + (p12) LDFD f34 = [Y1], INCY + (p12) LDFD f38 = [Y2], INCY + (p7) br.cond.dptk .L999 + } + ;; + { .mmi + (p12) LDFPD f36, f37 = [X1], 2 * SIZE + (p12) shladd YY = INCY, 3, YY + tbit.z p0, p14 = N, 1 + } + { .mmi + (p12) LDFD f35 = [Y1], INCY3 + (p12) LDFD f39 = [Y2], INCY3 + tbit.z p0, p15 = N, 0 + } + ;; + { .mmi + (p12) LDFPD f40, f41 = [X1], 2 * SIZE + (p13) shladd YY = INCY, 2, YY + } + { .mmi + (p12) LDFD f42 = [Y1], INCY + (p12) LDFD f46 = [Y2], INCY + } + ;; + (p12) LDFPD f44, f45 = [X1], 2 * SIZE + (p12) LDFD f43 = [Y1], INCY3 + (p12) LDFD f47 = [Y2], INCY3 + (p14) shladd YY = INCY, 1, YY + ;; + (p13) LDFPD f48, f49 = [X1], 2 * SIZE + (p13) LDFD f50 = [Y1], INCY + (p13) LDFD f54 = [Y2], INCY + ;; + (p13) LDFPD f52, f53 = [X1], 2 * SIZE + (p13) LDFD f51 = [Y1], INCY3 + (p13) LDFD f55 = [Y2], INCY3 + ;; + (p14) LDFPD f56, f57 = [X1], 2 * SIZE + (p14) LDFD f58 = [Y1], INCY + (p15) LDFD f61 = [YY] + ;; + (p14) LDFD f59 = [Y1] + (p15) LDFD f60 = [X1] + ;; + (p12) FMA f8 = f32, f34, f8 + (p12) FMA f9 = f33, f35, f9 + (p12) FMA f10 = f36, f38, f10 + (p12) FMA f11 = f37, f39, f11 + (p12) FMA f12 = f40, f42, f12 + (p12) FMA f13 = f41, f43, f13 + (p12) FMA f14 = f44, f46, f14 + (p12) FMA f15 = f45, f47, f15 + ;; + (p13) FMA f8 = f48, f50, f8 + (p13) FMA f9 = f49, f51, f9 + (p13) FMA f10 = f52, f54, f10 + (p13) FMA f11 = f53, f55, f11 + (p14) FMA f12 = f56, f58, f12 + (p14) FMA f13 = f57, f59, f13 + (p15) FMA f14 = 
f60, f61, f14 + br .L999 + ;; + .align 32 + +.L100: + { .mmi + shladd X2 = INCX, 1, X1 + } + { .mib + cmp.ne p6, p0 = SIZE, INCY + tbit.nz p15, p0 = Y1, BASE_SHIFT + (p6) br.cond.dptk .L200 + } + ;; + { .mfi + (p15) LDFD f32 = [X1], INCX + mov f12 = f0 + mov pr.rot= 0 + } + { .mfi + (p15) adds N = -1, N + mov f13 = f0 + shr YA = Y1, 4 + } + ;; + { .mfi + (p15) LDFD f80 = [Y1], INCY + mov f14 = f0 + shr I = N, 4 + } + { .mmi + and J = 15, N + and XA = 0xf, XA + and YA = 0xf, YA + } + ;; + { .mmi + shladd INCX3 = INCX, 1, INCX + shladd INCY3 = INCY, 1, INCY + sub XA = YA, XA + } + { .mmi + shladd INCX16 = INCX, 4, r0 + shladd INCY16 = INCY, 4, r0 + tbit.z p0, p12 = N, 3 + } + ;; + { .mmi + shladd X2 = INCX, 1, X1 + cmp.eq p7, p0 = r0, J + mov ar.ec= 3 + } + { .mmi + adds I = -1, I + cmp.ge p8, p0 = 4, XA + cmp.eq p16, p0 = r0, r0 + } + ;; + { .mbb + cmp.le p9, p0 = 14, XA + (p8) br.cond.dpnt .L120 + (p9) br.cond.dpnt .L120 + } + ;; + { .mmi + adds PREX = (PREFETCH_SIZE + 5) * SIZE, X1 + adds PREY = (PREFETCH_SIZE + 3) * SIZE, Y1 + mov ar.lc = I + } + { .mfb + cmp.eq p6 ,p0 = -1, I + FMA f15 = f32, f80, f0 + (p6) br.cond.dpnt .L115 + } + ;; + .align 32 + +/* INCY == 1 */ +.L112: + { .mmf + (p16) LDFPD f32, f35 = [Y1], 2 * SIZE + (p16) lfetch.nt1 [PREX], INCX16 + (p18) FMA f8 = f34, f82, f8 + } + { .mmf + (p16) LDFD f80 = [X1], INCX + (p16) LDFD f86 = [X2], INCX + (p18) FMA f9 = f37, f85, f9 + } + ;; + { .mmf + (p16) LDFPD f38, f41 = [Y1], 2 * SIZE + (p16) lfetch.nt1 [PREY], INCY16 + (p18) FMA f10 = f40, f88, f10 + } + { .mmf + (p16) LDFD f83 = [X1], INCX3 + (p16) LDFD f89 = [X2], INCX3 + (p18) FMA f11 = f43, f91, f11 + } + ;; + { .mmf + (p16) LDFPD f44, f47 = [Y1], 2 * SIZE + (p18) FMA f12 = f46, f94, f12 + } + { .mmf + (p16) LDFD f92 = [X1], INCX + (p16) LDFD f98 = [X2], INCX + (p18) FMA f13 = f49, f97, f13 + } + ;; + { .mmf + (p16) LDFPD f50, f53 = [Y1], 2 * SIZE + (p18) FMA f14 = f52, f100, f14 + } + { .mmf + (p16) LDFD f95 = [X1], INCX3 + (p16) LDFD f101 = [X2], INCX3 + (p18) FMA f15 = f55, f103, f15 + } + ;; + { .mmf + (p16) LDFPD f56, f59 = [Y1], 2 * SIZE + (p18) FMA f8 = f58, f106, f8 + } + { .mmf + (p16) LDFD f104 = [X1], INCX + (p16) LDFD f110 = [X2], INCX + (p18) FMA f9 = f61, f109, f9 + } + ;; + { .mmf + (p16) LDFPD f62, f65 = [Y1], 2 * SIZE + (p18) FMA f10 = f64, f112, f10 + } + { .mmf + (p16) LDFD f107 = [X1], INCX3 + (p16) LDFD f113 = [X2], INCX3 + (p18) FMA f11 = f67, f115, f11 + } + ;; + { .mmf + (p16) LDFPD f68, f71 = [Y1], 2 * SIZE + (p18) FMA f12 = f70, f118, f12 + } + { .mmf + (p16) LDFD f116 = [X1], INCX + (p16) LDFD f122 = [X2], INCX + (p18) FMA f13 = f73, f121, f13 + } + ;; + { .mmf + (p16) LDFPD f74, f77 = [Y1], 2 * SIZE + (p16) LDFD f119 = [X1], INCX3 + (p18) FMA f14 = f76, f124, f14 + } + { .mfb + (p16) LDFD f125 = [X2], INCX3 + (p18) FMA f15 = f79, f127, f15 + br.ctop.sptk.few .L112 + } + ;; + .align 32 + +.L115: + { .mmi + (p12) LDFPD f32, f33 = [Y1], 2 * SIZE + mov XX = X1 + tbit.z p0, p13 = N, 2 + } + { .mmb + (p12) LDFD f34 = [X1], INCX + (p12) LDFD f38 = [X2], INCX + (p7) br.cond.dptk .L999 + } + ;; + { .mmi + (p12) LDFPD f36, f37 = [Y1], 2 * SIZE + (p12) shladd XX = INCX, 3, XX + tbit.z p0, p14 = N, 1 + } + { .mmi + (p12) LDFD f35 = [X1], INCX3 + (p12) LDFD f39 = [X2], INCX3 + tbit.z p0, p15 = N, 0 + } + ;; + { .mmi + (p12) LDFPD f40, f41 = [Y1], 2 * SIZE + (p13) shladd XX = INCX, 2, XX + } + { .mmi + (p12) LDFD f42 = [X1], INCX + (p12) LDFD f46 = [X2], INCX + } + ;; + (p12) LDFPD f44, f45 = [Y1], 2 * SIZE + (p12) LDFD f43 = [X1], INCX3 + (p12) LDFD f47 = 
[X2], INCX3 + (p14) shladd XX = INCX, 1, XX + ;; + (p13) LDFPD f48, f49 = [Y1], 2 * SIZE + (p13) LDFD f50 = [X1], INCX + (p13) LDFD f54 = [X2], INCX + ;; + (p13) LDFPD f52, f53 = [Y1], 2 * SIZE + (p13) LDFD f51 = [X1], INCX3 + (p13) LDFD f55 = [X2], INCX3 + ;; + (p14) LDFPD f56, f57 = [Y1], 2 * SIZE + (p14) LDFD f58 = [X1], INCX + (p15) LDFD f61 = [XX] + ;; + (p14) LDFD f59 = [X1] + (p15) LDFD f60 = [Y1] + ;; + (p12) FMA f8 = f32, f34, f8 + (p12) FMA f9 = f33, f35, f9 + (p12) FMA f10 = f36, f38, f10 + (p12) FMA f11 = f37, f39, f11 + (p12) FMA f12 = f40, f42, f12 + (p12) FMA f13 = f41, f43, f13 + (p12) FMA f14 = f44, f46, f14 + (p12) FMA f15 = f45, f47, f15 + ;; + (p13) FMA f8 = f48, f50, f8 + (p13) FMA f9 = f49, f51, f9 + (p13) FMA f10 = f52, f54, f10 + (p13) FMA f11 = f53, f55, f11 + (p14) FMA f12 = f56, f58, f12 + (p14) FMA f13 = f57, f59, f13 + (p15) FMA f14 = f60, f61, f14 + br .L999 + ;; + .align 32 + +.L120: + { .mmi + adds PREX = (PREFETCH_SIZE + 17) * SIZE, X1 + adds PREY = (PREFETCH_SIZE + 19) * SIZE, X1 + mov ar.lc = I + } + { .mfb + cmp.eq p6 ,p0 = -1, I + FMA f15 = f32, f80, f0 + (p6) br.cond.dpnt .L125 + } + ;; + .align 32 + +.L122: + { .mmf + (p16) LDFPD f32, f35 = [Y1], 2 * SIZE + (p16) lfetch.nt1 [PREX], INCX16 + (p18) FMA f8 = f34, f82, f8 + } + { .mmf + (p17) LDFD f105 = [X1], INCX + (p17) LDFD f111 = [X2], INCX + (p18) FMA f9 = f37, f85, f9 + } + ;; + { .mmf + (p16) LDFPD f38, f41 = [Y1], 2 * SIZE + (p16) lfetch.nt1 [PREY], INCY16 + (p18) FMA f10 = f40, f88, f10 + } + { .mmf + (p17) LDFD f108 = [X1], INCX3 + (p17) LDFD f114 = [X2], INCX3 + (p18) FMA f11 = f43, f91, f11 + } + ;; + { .mmf + (p16) LDFPD f44, f47 = [Y1], 2 * SIZE + (p18) FMA f12 = f46, f94, f12 + } + { .mmf + (p17) LDFD f117 = [X1], INCX + (p17) LDFD f123 = [X2], INCX + (p18) FMA f13 = f49, f97, f13 + } + ;; + { .mmf + (p16) LDFPD f50, f53 = [Y1], 2 * SIZE + (p18) FMA f14 = f52, f100, f14 + } + { .mmf + (p17) LDFD f120 = [X1], INCX3 + (p17) LDFD f126 = [X2], INCX3 + (p18) FMA f15 = f55, f103, f15 + } + ;; + { .mmf + (p16) LDFPD f56, f59 = [Y1], 2 * SIZE + (p18) FMA f8 = f58, f106, f8 + } + { .mmf + (p16) LDFD f80 = [X1], INCX + (p16) LDFD f86 = [X2], INCX + (p18) FMA f9 = f61, f109, f9 + } + ;; + { .mmf + (p16) LDFPD f62, f65 = [Y1], 2 * SIZE + (p18) FMA f10 = f64, f112, f10 + } + { .mmf + (p16) LDFD f83 = [X1], INCX3 + (p16) LDFD f89 = [X2], INCX3 + (p18) FMA f11 = f67, f115, f11 + } + ;; + { .mmf + (p16) LDFPD f68, f71 = [Y1], 2 * SIZE + (p18) FMA f12 = f70, f118, f12 + } + { .mmf + (p16) LDFD f92 = [X1], INCX + (p16) LDFD f98 = [X2], INCX + (p18) FMA f13 = f73, f121, f13 + } + ;; + { .mmf + (p16) LDFPD f74, f77 = [Y1], 2 * SIZE + (p16) LDFD f95 = [X1], INCX3 + (p18) FMA f14 = f76, f124, f14 + } + { .mfb + (p16) LDFD f101 = [X2], INCX3 + (p18) FMA f15 = f79, f127, f15 + br.ctop.sptk.few .L122 + } + ;; + .align 32 + +.L125: + { .mmi + (p12) LDFPD f32, f33 = [Y1], 2 * SIZE + mov XX = X1 + tbit.z p0, p13 = N, 2 + } + { .mmb + (p12) LDFD f34 = [X1], INCX + (p12) LDFD f38 = [X2], INCX + (p7) br.cond.dptk .L999 + } + ;; + { .mmi + (p12) LDFPD f36, f37 = [Y1], 2 * SIZE + (p12) shladd XX = INCX, 3, XX + tbit.z p0, p14 = N, 1 + } + { .mmi + (p12) LDFD f35 = [X1], INCX3 + (p12) LDFD f39 = [X2], INCX3 + tbit.z p0, p15 = N, 0 + } + ;; + { .mmi + (p12) LDFPD f40, f41 = [Y1], 2 * SIZE + (p13) shladd XX = INCX, 2, XX + } + { .mmi + (p12) LDFD f42 = [X1], INCX + (p12) LDFD f46 = [X2], INCX + } + ;; + (p12) LDFPD f44, f45 = [Y1], 2 * SIZE + (p12) LDFD f43 = [X1], INCX3 + (p12) LDFD f47 = [X2], INCX3 + (p14) shladd XX = 
INCX, 1, XX + ;; + (p13) LDFPD f48, f49 = [Y1], 2 * SIZE + (p13) LDFD f50 = [X1], INCX + (p13) LDFD f54 = [X2], INCX + ;; + (p13) LDFPD f52, f53 = [Y1], 2 * SIZE + (p13) LDFD f51 = [X1], INCX3 + (p13) LDFD f55 = [X2], INCX3 + ;; + (p14) LDFPD f56, f57 = [Y1], 2 * SIZE + (p14) LDFD f58 = [X1], INCX + (p15) LDFD f61 = [XX] + ;; + (p14) LDFD f59 = [X1] + (p15) LDFD f60 = [Y1] + ;; + (p12) FMA f8 = f32, f34, f8 + (p12) FMA f9 = f33, f35, f9 + (p12) FMA f10 = f36, f38, f10 + (p12) FMA f11 = f37, f39, f11 + (p12) FMA f12 = f40, f42, f12 + (p12) FMA f13 = f41, f43, f13 + (p12) FMA f14 = f44, f46, f14 + (p12) FMA f15 = f45, f47, f15 + ;; + (p13) FMA f8 = f48, f50, f8 + (p13) FMA f9 = f49, f51, f9 + (p13) FMA f10 = f52, f54, f10 + (p13) FMA f11 = f53, f55, f11 + (p14) FMA f12 = f56, f58, f12 + (p14) FMA f13 = f57, f59, f13 + (p15) FMA f14 = f60, f61, f14 + br .L999 + ;; + .align 32 + +.L200: + { .mfi + shladd INCX3 = INCX, 1, INCX + mov f12 = f0 + mov pr.rot= 0 + } + { .mfi + and J = 15, N + mov f13 = f0 + shr I = N, 4 + } + ;; + { .mmf + cmp.eq p16, p0 = r0, r0 + shladd INCY3 = INCY, 1, INCY + mov f14 = f0 + } + { .mmi + shladd INCX16 = INCX, 4, r0 + shladd INCY16 = INCY, 4, r0 + tbit.z p0, p12 = N, 3 + } + ;; + { .mmi + cmp.eq p7, p0 = r0, J + adds I = -1, I + mov ar.ec= 3 + } + { .mmi + shladd Y2 = INCY, 1, Y1 + mov XX = X1 + mov YY = Y1 + } + ;; + { .mmi + adds PREX = (PREFETCH_SIZE + 5) * SIZE, X1 + adds PREY = (PREFETCH_SIZE + 3) * SIZE, Y1 + mov ar.lc = I + } + { .mfb + cmp.eq p6 ,p0 = -1, I + mov f15 = f0 + (p6) br.cond.dpnt .L215 + } + ;; + .align 32 + +/* INCY == 1 */ +.L212: + { .mmf + (p16) lfetch.nt1 [PREX], INCX16 + (p16) lfetch.nt1 [PREY], INCY16 + (p18) FMA f8 = f34, f82, f8 + } + { .mmf + (p16) LDFD f32 = [Y1], INCY + (p16) LDFD f38 = [Y2], INCY + (p18) FMA f9 = f37, f85, f9 + } + ;; + { .mmf + (p16) LDFD f80 = [X1], INCX + (p16) LDFD f86 = [X2], INCX + (p18) FMA f10 = f40, f88, f10 + } + { .mmf + (p16) LDFD f35 = [Y1], INCY3 + (p16) LDFD f41 = [Y2], INCY3 + (p18) FMA f11 = f43, f91, f11 + } + ;; + { .mmf + (p16) LDFD f83 = [X1], INCX3 + (p16) LDFD f89 = [X2], INCX3 + (p18) FMA f12 = f46, f94, f12 + } + { .mmf + (p16) LDFD f44 = [Y1], INCY + (p16) LDFD f50 = [Y2], INCY + (p18) FMA f13 = f49, f97, f13 + } + ;; + { .mmf + (p16) LDFD f92 = [X1], INCX + (p16) LDFD f98 = [X2], INCX + (p18) FMA f14 = f52, f100, f14 + } + { .mmf + (p16) LDFD f47 = [Y1], INCY3 + (p16) LDFD f53 = [Y2], INCY3 + (p18) FMA f15 = f55, f103, f15 + } + ;; + { .mmf + (p16) LDFD f95 = [X1], INCX3 + (p16) LDFD f101 = [X2], INCX3 + (p18) FMA f8 = f58, f106, f8 + } + { .mmf + (p16) LDFD f56 = [Y1], INCY + (p16) LDFD f62 = [Y2], INCY + (p18) FMA f9 = f61, f109, f9 + } + ;; + { .mmf + (p16) LDFD f104 = [X1], INCX + (p16) LDFD f110 = [X2], INCX + (p18) FMA f10 = f64, f112, f10 + } + { .mmf + (p16) LDFD f59 = [Y1], INCY3 + (p16) LDFD f65 = [Y2], INCY3 + (p18) FMA f11 = f67, f115, f11 + } + ;; + { .mmf + (p16) LDFD f107 = [X1], INCX3 + (p16) LDFD f113 = [X2], INCX3 + (p18) FMA f12 = f70, f118, f12 + } + { .mmf + (p16) LDFD f68 = [Y1], INCY + (p16) LDFD f74 = [Y2], INCY + (p18) FMA f13 = f73, f121, f13 + } + ;; + { .mmf + (p16) LDFD f116 = [X1], INCX + (p16) LDFD f122 = [X2], INCX + (p18) FMA f14 = f76, f124, f14 + } + { .mmf + (p16) LDFD f71 = [Y1], INCY3 + (p16) LDFD f77 = [Y2], INCY3 + (p18) FMA f15 = f79, f127, f15 + } + ;; + { .mmi + (p16) LDFD f119 = [X1], INCX3 + (p16) LDFD f125 = [X2], INCX3 + } + { .mmb + (p16) add XX = INCX16, XX + (p16) add YY = INCY16, YY + br.ctop.sptk.few .L212 + } + ;; + .align 32 + +.L215: + 
{ .mmi + (p12) LDFD f34 = [X1], INCX + (p12) LDFD f38 = [X2], INCX + tbit.z p0, p13 = N, 2 + } + { .mmb + (p12) LDFD f32 = [Y1], INCY + (p12) LDFD f36 = [Y2], INCY + (p7) br.cond.dptk .L999 + } + ;; + { .mmi + (p12) LDFD f35 = [X1], INCX3 + (p12) LDFD f39 = [X2], INCX3 + tbit.z p0, p14 = N, 1 + } + { .mmi + (p12) LDFD f33 = [Y1], INCY3 + (p12) LDFD f37 = [Y2], INCY3 + tbit.z p0, p15 = N, 0 + } + ;; + { .mmi + (p12) LDFD f42 = [X1], INCX + (p12) LDFD f46 = [X2], INCX + (p12) shladd XX = INCX, 3, XX + } + { .mmi + (p12) LDFD f40 = [Y1], INCY + (p12) LDFD f44 = [Y2], INCY + (p12) shladd YY = INCY, 3, YY + } + ;; + { .mmi + (p12) LDFD f43 = [X1], INCX3 + (p12) LDFD f47 = [X2], INCX3 + (p13) shladd XX = INCX, 2, XX + } + { .mmi + (p12) LDFD f41 = [Y1], INCY3 + (p12) LDFD f45 = [Y2], INCY3 + (p13) shladd YY = INCY, 2, YY + } + ;; + (p13) LDFD f50 = [X1], INCX + (p13) LDFD f54 = [X2], INCX + (p14) shladd XX = INCX, 1, XX + (p13) LDFD f48 = [Y1], INCY + (p13) LDFD f52 = [Y2], INCY + (p14) shladd YY = INCY, 1, YY + ;; + (p13) LDFD f51 = [X1], INCX3 + (p13) LDFD f55 = [X2] + (p13) LDFD f49 = [Y1], INCY3 + (p13) LDFD f53 = [Y2] + ;; + (p14) LDFD f58 = [X1], INCX + (p15) LDFD f61 = [XX] + (p14) LDFD f56 = [Y1], INCY + (p15) LDFD f60 = [YY] + ;; + (p14) LDFD f59 = [X1] + (p14) LDFD f57 = [Y1] + ;; + ;; + ;; + (p12) FMA f8 = f32, f34, f8 + (p12) FMA f9 = f33, f35, f9 + (p12) FMA f10 = f36, f38, f10 + (p12) FMA f11 = f37, f39, f11 + (p12) FMA f12 = f40, f42, f12 + (p12) FMA f13 = f41, f43, f13 + (p12) FMA f14 = f44, f46, f14 + (p12) FMA f15 = f45, f47, f15 + ;; + (p13) FMA f8 = f48, f50, f8 + (p13) FMA f9 = f49, f51, f9 + (p13) FMA f10 = f52, f54, f10 + (p13) FMA f11 = f53, f55, f11 + (p14) FMA f12 = f56, f58, f12 + (p14) FMA f13 = f57, f59, f13 + (p15) FMA f14 = f60, f61, f14 + ;; + .align 32 + +.L999: + FADD f8 = f8, f9 + FADD f10 = f10, f11 + FADD f12 = f12, f13 + FADD f14 = f14, f15 + ;; + FADD f8 = f8, f10 + FADD f12 = f12, f14 + mov ar.lc = ARLC + ;; + FADD f8 = f8, f12 + mov pr = PR, -65474 + br.ret.sptk.many b0 + EPILOGUE + diff --git a/kernel/ia64/gemm_beta.S b/kernel/ia64/gemm_beta.S new file mode 100644 index 0000000000..ceeca4acbd --- /dev/null +++ b/kernel/ia64/gemm_beta.S @@ -0,0 +1,512 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define PREFETCHSIZE 140 + +#define CO1 r14 +#define CO2 r15 +#define CO3 r16 +#define DO1 r17 +#define DO2 r18 +#define DO3 r19 + +#define I r22 +#define I_AND_15 r23 +#define PRE1 r24 + +#define PR r30 +#define ARLC r31 + +#define M r32 +#define N r33 +#define C r34 +#define LDC r35 +#define J r36 + +#define BETA f8 + + PROLOGUE + .prologue + PROFCODE + + { .mmi +#ifndef XDOUBLE + adds CO1 = 16, r12 + adds CO2 = 24, r12 +#else + adds CO1 = 32, r12 + adds CO2 = 40, r12 +#endif + .save ar.lc, ARLC + mov ARLC = ar.lc + } + { .mfb + cmp.ge p6, p0 = 0, N + fcmp.eq p0, p15 = BETA, f0 + (p6) br.ret.sptk.many b0 + } + ;; + .body + { .mmi + ld8 C = [CO1], 8 + ld8 LDC = [CO2] + mov PR = pr + } + { .mmi + mov J = N + shr I = M, 4 + } + ;; + { .mmb + shladd LDC = LDC, BASE_SHIFT, r0 + adds I = -1, I + (p15) br.cond.dpnt .L100 // if (beta != 0) goto L100 + } + ;; + .align 32 + +.L60: + { .mmi + mov CO1 = C + mov CO3 = C + add CO2 = 4 * SIZE, C + } + { .mmi + adds PRE1 = PREFETCHSIZE * SIZE, C + add C = C, LDC + tbit.nz p12, p0 = M, 3 + } + ;; + { .mmi + and I_AND_15 = 15, M + mov ar.lc = I + } + { .mib + cmp.gt p8, p0 = 0, I + (p8) br.cond.dpnt .L80 + } + ;; + .align 32 + +.L70: + { .mmi + STFD [CO1] = f0, 1 * SIZE + STFD [CO2] = f0, 1 * SIZE + } + { .mmi + lfetch.excl.nt1 [PRE1] + nop.m 0 + adds PRE1 = 16 * SIZE, PRE1 + } + ;; + { .mmi + STFD [CO1] = f0, 1 * SIZE + STFD [CO2] = f0, 1 * SIZE + adds CO3 = 16 * SIZE, CO3 + } + ;; + { .mmi + STFD [CO1] = f0, 1 * SIZE + STFD [CO2] = f0, 1 * SIZE + } + ;; + { .mmi + STFD [CO1] = f0, 5 * SIZE + STFD [CO2] = f0, 5 * SIZE + } + ;; + { .mmi + STFD [CO1] = f0, 1 * SIZE + STFD [CO2] = f0, 1 * SIZE + } + ;; + { .mmi + STFD [CO1] = f0, 1 * SIZE + STFD [CO2] = f0, 1 * SIZE + } + ;; + { .mmi + STFD [CO1] = f0, 1 * SIZE + STFD [CO2] = f0, 1 * SIZE + } + ;; + { .mmb + STFD [CO1] = f0, 5 * SIZE + STFD [CO2] = f0, 5 * SIZE + br.cloop.sptk.few .L70 + } + ;; + .align 32 + +.L80: + { .mmi + (p12) STFD [CO1] = f0, 1 * SIZE + (p12) STFD [CO2] = f0, 1 * SIZE + tbit.nz p13, p0 = M, 2 + } + { .mmb + cmp.eq p9, p0 = 0, I_AND_15 + adds J = -1, J + (p9) br.cond.dptk .L99 + } + ;; + { .mmi + (p12) STFD [CO1] = f0, 1 * SIZE + (p12) STFD [CO2] = f0, 1 * SIZE + tbit.nz p14, p0 = M, 1 + } + ;; + { .mmi + (p12) STFD [CO1] = f0, 1 * SIZE + (p12) STFD [CO2] = f0, 1 * SIZE + (p12) adds CO3 = 8 * SIZE, CO3 + } + ;; + { .mmi + (p12) STFD [CO1] = f0, 5 * SIZE + (p12) STFD [CO2] = f0 + (p13) adds CO3 = 4 * SIZE, CO3 + } + ;; + { .mmi + (p13) STFD [CO1] = f0, 1 * SIZE + (p14) STFD [CO3] = f0, 1 * SIZE + } + ;; + { .mmi + (p13) STFD [CO1] 
= f0, 1 * SIZE + (p14) STFD [CO3] = f0, 1 * SIZE + tbit.nz p15, p0 = M, 0 + } + ;; + { .mmi + (p13) STFD [CO1] = f0, 1 * SIZE + (p15) STFD [CO3] = f0 + } + ;; + { .mmi + (p13) STFD [CO1] = f0 + } + ;; + .align 32 + +.L99: + { .mib + cmp.lt p6, p0 = 0, J + mov ar.lc = ARLC + } + { .mbb + (p6) br.cond.dptk .L60 + br.ret.sptk.many b0 + } + ;; + .align 32 + +.L100: + { .mmi + mov CO1 = C + mov CO3 = C + mov pr.rot = 0 + } + { .mmi + adds PRE1 = PREFETCHSIZE * SIZE, C + add CO2 = 4 * SIZE, C + mov DO1 = C + } + ;; + { .mmi + mov ar.ec = 6 + } + { .mmi + adds DO2 = 4 * SIZE, C + mov DO3 = C + add C = C, LDC + } + ;; + { .mmi + and I_AND_15 = 15, M + cmp.eq p16, p0 = r0, r0 + mov ar.lc = I + } + { .mib + cmp.gt p8, p0 = 0, I + tbit.nz p12, p0 = M, 3 + (p8) br.cond.dpnt .L180 + } + ;; + .align 32 + +.L170: + { .mmf + (p21) STFD [DO1] = f6, 1 * SIZE + (p21) STFD [DO2] = f7, 1 * SIZE + (p21) FMPY f6 = BETA, f85 + } + { .mmf + (p16) lfetch.excl.nt1 [PRE1] + (p16) adds CO3 = 16 * SIZE, CO3 + (p21) FMPY f7 = BETA, f91 + } + ;; + { .mmf + (p21) STFD [DO1] = f10, 1 * SIZE + (p21) STFD [DO2] = f11, 1 * SIZE + (p21) FMPY f10 = BETA, f97 + } + { .mmf + (p16) LDFD f32 = [CO1], 1 * SIZE + (p16) LDFD f38 = [CO2], 1 * SIZE + (p21) FMPY f11 = BETA, f103 + } + ;; + { .mmf + (p21) STFD [DO1] = f12, 1 * SIZE + (p21) STFD [DO2] = f13, 1 * SIZE + (p21) FMPY f12 = BETA, f109 + } + { .mmf + (p16) LDFD f44 = [CO1], 1 * SIZE + (p16) LDFD f50 = [CO2], 1 * SIZE + (p21) FMPY f13 = BETA, f115 + } + ;; + { .mmf + (p21) STFD [DO1] = f14, 5 * SIZE + (p21) STFD [DO2] = f15, 5 * SIZE + (p21) FMPY f14 = BETA, f121 + } + { .mmf + (p16) LDFD f56 = [CO1], 1 * SIZE + (p16) LDFD f62 = [CO2], 1 * SIZE + (p21) FMPY f15 = BETA, f127 + } + ;; + { .mmf + (p21) STFD [DO1] = f6, 1 * SIZE + (p21) STFD [DO2] = f7, 1 * SIZE + (p20) FMPY f6 = BETA, f36 + } + { .mmf + (p16) LDFD f68 = [CO1], 5 * SIZE + (p16) LDFD f74 = [CO2], 5 * SIZE + (p20) FMPY f7 = BETA, f42 + } + ;; + { .mmf + (p21) STFD [DO1] = f10, 1 * SIZE + (p21) STFD [DO2] = f11, 1 * SIZE + (p20) FMPY f10 = BETA, f48 + } + { .mmf + (p16) LDFD f80 = [CO1], 1 * SIZE + (p16) LDFD f86 = [CO2], 1 * SIZE + (p20) FMPY f11 = BETA, f54 + } + ;; + { .mmf + (p21) STFD [DO1] = f12, 1 * SIZE + (p21) STFD [DO2] = f13, 1 * SIZE + (p20) FMPY f12 = BETA, f60 + } + { .mmf + (p16) LDFD f92 = [CO1], 1 * SIZE + (p16) LDFD f98 = [CO2], 1 * SIZE + (p20) FMPY f13 = BETA, f66 + } + ;; + { .mmf + (p21) STFD [DO1] = f14, 5 * SIZE + (p21) STFD [DO2] = f15, 5 * SIZE + (p20) FMPY f14 = BETA, f72 + } + { .mmf + (p16) LDFD f104 = [CO1], 1 * SIZE + (p16) LDFD f110 = [CO2], 1 * SIZE + (p20) FMPY f15 = BETA, f78 + } + ;; + { .mmi + (p16) LDFD f116 = [CO1], 5 * SIZE + (p16) LDFD f122 = [CO2], 5 * SIZE + adds PRE1 = 16 * SIZE, PRE1 + } + { .mmb + (p16) adds DO3 = 16 * SIZE, DO3 + nop.m 0 + br.ctop.sptk.few .L170 + } + ;; + .align 32 + +.L180: + { .mmi + (p12) LDFD f32 = [CO1], 1 * SIZE + (p12) LDFD f36 = [CO2], 1 * SIZE + tbit.nz p13, p0 = M, 2 + } + { .mmb + cmp.eq p9, p0 = 0, I_AND_15 + adds J = -1, J + (p9) br.cond.dptk .L199 + } + ;; + { .mmi + (p12) LDFD f33 = [CO1], 1 * SIZE + (p12) LDFD f37 = [CO2], 1 * SIZE + tbit.nz p14, p0 = M, 1 + } + ;; + { .mmi + (p12) LDFD f34 = [CO1], 1 * SIZE + (p12) LDFD f38 = [CO2], 1 * SIZE + (p12) adds CO3 = 8 * SIZE, CO3 + } + ;; + { .mmi + (p12) LDFD f35 = [CO1], 5 * SIZE + (p12) LDFD f39 = [CO2] + (p13) adds CO3 = 4 * SIZE, CO3 + } + ;; + { .mmi + (p13) LDFD f40 = [CO1], 1 * SIZE + (p14) LDFD f44 = [CO3], 1 * SIZE + } + ;; + { .mmi + (p13) LDFD f41 = [CO1], 1 * SIZE + (p14) LDFD f45 = 
[CO3], 1 * SIZE + tbit.nz p15, p0 = M, 0 + } + ;; + { .mmf + (p13) LDFD f42 = [CO1], 1 * SIZE + (p15) LDFD f46 = [CO3] + (p12) FMPY f32 = BETA, f32 + } + { .mmf + (p12) FMPY f36 = BETA, f36 + } + ;; + { .mmf + (p13) LDFD f43 = [CO1] + (p12) FMPY f33 = BETA, f33 + } + { .mmf + (p12) FMPY f37 = BETA, f37 + } + ;; + (p12) FMPY f34 = BETA, f34 + (p12) FMPY f38 = BETA, f38 + (p12) FMPY f35 = BETA, f35 + (p12) FMPY f39 = BETA, f39 + + ;; + { .mmf + (p12) STFD [DO1] = f32, 1 * SIZE + (p12) STFD [DO2] = f36, 1 * SIZE + (p13) FMPY f40 = BETA, f40 + } + { .mmf + (p12) adds DO3 = 8 * SIZE, DO3 + (p14) FMPY f44 = BETA, f44 + } + ;; + { .mmf + (p12) STFD [DO1] = f33, 1 * SIZE + (p12) STFD [DO2] = f37, 1 * SIZE + (p13) FMPY f41 = BETA, f41 + } + { .mmf + (p13) adds DO3 = 4 * SIZE, DO3 + (p14) FMPY f45 = BETA, f45 + } + ;; + { .mmf + (p12) STFD [DO1] = f34, 1 * SIZE + (p12) STFD [DO2] = f38, 1 * SIZE + (p13) FMPY f42 = BETA, f42 + } + { .mmf + (p15) FMPY f46 = BETA, f46 + } + ;; + { .mmf + (p12) STFD [DO1] = f35, 5 * SIZE + (p12) STFD [DO2] = f39 + (p13) FMPY f43 = BETA, f43 + } + ;; + { .mmi + (p13) STFD [DO1] = f40, 1 * SIZE + (p14) STFD [DO3] = f44, 1 * SIZE + } + ;; + { .mmi + (p13) STFD [DO1] = f41, 1 * SIZE + (p14) STFD [DO3] = f45, 1 * SIZE + } + ;; + { .mmi + (p13) STFD [DO1] = f42, 1 * SIZE + (p15) STFD [DO3] = f46 + } + ;; + { .mmi + (p13) STFD [DO1] = f43 + } + ;; + .align 32 + +.L199: + { .mib + cmp.lt p6, p0 = 0, J + mov ar.lc = ARLC + (p6) br.cond.dptk .L100 + } + ;; + { .mib + mov pr = PR, -1 + br.ret.sptk.many b0 + } + ;; + EPILOGUE + diff --git a/kernel/ia64/gemm_kernel.S b/kernel/ia64/gemm_kernel.S new file mode 100644 index 0000000000..d1d4731dda --- /dev/null +++ b/kernel/ia64/gemm_kernel.S @@ -0,0 +1,8958 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef DOUBLE +#define PREFETCHSIZE (16 * 8) +#else +#define PREFETCHSIZE (32 * 8) +#endif + +#define CPREFETCHSIZE 7 +#define CPREFETCH lfetch.excl.nt1 + +#define M r32 +#define N r33 +#define K r34 +#define A r36 +#define B r37 +#define C r38 +#define LDC r39 + +#define I r15 +#define J r16 +#define AOFFSET r17 +#define BOFFSET r18 +#define BB r19 +#define L r20 + +#define C1 r21 +#define C2 r22 +#define C3 r23 +#define C4 r24 +#define C5 r25 +#define C6 r26 +#define C7 r27 +#define C8 r28 + +#define C9 loc0 +#define C10 loc1 +#define C11 loc2 +#define C12 loc3 +#define C13 loc4 +#define C14 loc5 +#define C15 loc6 +#define C16 loc7 + +#define PREA r8 +#define PREB r9 +#define PREC r10 +#define SP r12 +#define ARLC r29 +#define PR r30 +#define ARPFS r31 + +#define ALPHA f8 + +#define AORIG loc8 +#define KK loc9 +#define KK8 loc10 +#define OFFSET loc11 + + PROLOGUE + .prologue + PROFCODE + + { .mmi + .save ar.pfs, ARPFS +#ifdef TRMMKERNEL + alloc ARPFS = ar.pfs, 8, 16, 0, 0 +#else + alloc ARPFS = ar.pfs, 8, 8, 0, 0 +#endif + adds r14 = 16, SP + mov ARLC = ar.lc + } + { .mmi + adds r8 = -16 * 16, SP + adds r9 = -15 * 16, SP + adds SP = -16 * 16, SP + } + ;; + stf.spill [r8] = f16, 32 + stf.spill [r9] = f17, 32 + mov PR = pr + ;; + stf.spill [r8] = f18, 32 + stf.spill [r9] = f19, 32 + shladd LDC = LDC, BASE_SHIFT, r0 + ;; + stf.spill [r8] = f20, 32 + stf.spill [r9] = f21, 32 + shr J = N, 3 + ;; + stf.spill [r8] = f22, 32 + stf.spill [r9] = f23, 32 + mov AOFFSET = A + ;; + stf.spill [r8] = f24, 32 + stf.spill [r9] = f25, 32 + cmp.ge p6, p0 = 0, J + ;; + stf.spill [r8] = f26, 32 + stf.spill [r9] = f27, 32 + shr BB = K, 3 + ;; + stf.spill [r8] = f28, 32 + stf.spill [r9] = f29, 32 + ;; + stf.spill [r8] = f30 + stf.spill [r9] = f31 +#ifndef TRMMKERNEL + (p6) br.cond.dpnt .L050 + .body + ;; +#else + .body + ;; + ld8 OFFSET = [r14] +#if defined(TRMMKERNEL) && !defined(LEFT) + ;; + sub KK = r0, OFFSET +#endif + (p6) br.cond.dpnt .L050 + ;; +#endif + .align 32 + +.L010: + { .mfi + adds J = -1, J + mov f64 = f0 + shr I = M, 3 + } + { .mfi + mov C1 = C // coffset1 = c + 0 * ldc + mov f72 = f0 + shladd BB = BB, BASE_SHIFT, B + } + ;; + { .mmf + cmp.eq p6, p7 = 0, I +#if defined(TRMMKERNEL) && defined(LEFT) + mov KK = OFFSET +#else + nop __LINE__ +#endif + mov f80 = f0 + } + { .mmf + add C2 = LDC, C // coffset2 = c + 1 * ldc + shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc + mov f88 = f0 + } + ;; + { .mmf + shladd C5 = LDC, 2, C // coffset5 = c + 4 * ldc + shladd C = LDC, 3, C // coffset += 8 * ldc + mov f96 = f0 + } + { .mmf + shladd C4 = LDC, 1, C2 // coffset4 = c + 3 * ldc + shladd C6 = LDC, 2, C2 // coffset6 = c + 5 * ldc + mov f104 = f0 + } + ;; + { .mfi + shladd C7 = LDC, 2, C3 // coffset7 = c + 6 * ldc + mov f112 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + }{ .mfb + sub C8 = C, LDC // coffset8 = c + 7 * ldc + mov f120 = f0 + (p6) br.cond.dpnt .L020 + } + ;; + .align 16 + +.L011: +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mfb + LDFPD f48, f49 = [B] + mov f65 = f0 + nop __LINE__ + 
} + { .mfb + adds BOFFSET = 2 * SIZE, B + mov f73 = f0 + nop __LINE__ + } + ;; +#else + { .mfi + shladd BOFFSET = KK8, 3, B + mov f65 = f0 + shladd AOFFSET = KK8, 3, AOFFSET + } + ;; + { .mfi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f73 = f0 + nop __LINE__ + } + ;; +#endif + { .mfb + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f81 = f0 + nop __LINE__ + } + { .mfb + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f89 = f0 + nop __LINE__ + } + ;; + { .mmf + LDFPD f52, f53 = [BOFFSET], 2 * SIZE + setf.d f97 = r0 + mov f105 = f0 + } + { .mmf + lfetch.nt1 [BB] + setf.d f113 = r0 + mov f121 = f0 + } + ;; + { .mmf + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + setf.d f66 = r0 + mov f74 = f0 + } + { .mfi + setf.d f82 = r0 + mov f90 = f0 + adds BB = 16 * SIZE, BB + } + ;; + { .mmf + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + setf.d f98 = r0 + mov f106 = f0 + } + { .mfb + setf.d f114 = r0 + mov f122 = f0 + nop __LINE__ + } + ;; + { .mmf + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + setf.d f67 = r0 + mov f75 = f0 + } + { .mfi + setf.d f83 = r0 + mov f91 = f0 +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 8, KK +#else + adds L = 8, KK +#endif +#endif + } + ;; + { .mmf + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + setf.d f99 = r0 + mov f107 = f0 + } + { .mfi + setf.d f115 = r0 + mov f123 = f0 + adds PREC = CPREFETCHSIZE * SIZE, C1 + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f68 = r0 + mov f76 = f0 + } + { .mfi + setf.d f84 = r0 + mov f92 = f0 +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f100 = r0 + mov f108 = f0 + } + { .mfi + setf.d f116 = r0 + mov f124 = f0 + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f69 = r0 + mov f77 = f0 + } + { .mfi + setf.d f85 = r0 + mov f93 = f0 + adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f101 = r0 + mov f109 = f0 + } + { .mfi + setf.d f117 = r0 + mov f125 = f0 + tbit.z p12, p0 = L, 0 + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f70 = r0 + mov f78 = f0 + } + { .mfi + setf.d f86 = r0 + mov f94 = f0 + shr L = L, 1 + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f102 = r0 + mov f110 = f0 + } + { .mfi + setf.d f118 = r0 + mov f126 = f0 + adds L = -1, L + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f71 = r0 + mov f79 = f0 + } + { .mfi + setf.d f87 = r0 + mov f95 = f0 + mov ar.lc = L + } + ;; + { .mmf + CPREFETCH [PREC] + setf.d f103 = r0 + mov f111 = f0 + } + { .mfi + setf.d f119 = r0 + mov f127 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + .align 16 + +.L012: +/* 1 */ + { .mfi + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + (p12) cmp.ne p3, p0 = 0, L + FMA f72 = f32, f49, f72 // A1 * B2 + nop __LINE__ + } + ;; +/* 2 */ + { .mfi + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + nop __LINE__ + } + { .mfi + cmp.ne p4, p5 = 0, L + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; +/* 3 */ + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfi + adds C9 = 4 * SIZE, C1 + FMA f104 = f32, f53, f104 // A1 * B6 + nop __LINE__ + } + ;; +/* 4 */ + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfi + adds C10 = 4 * SIZE, C2 + FMA f120 = f32, f55, f120 // A1 * B8 + nop __LINE__ + } + ;; +/* 5 
*/ + { .mfi + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfi + adds C11 = 4 * SIZE, C3 + FMA f73 = f33, f49, f73 // A2 * B2 + nop __LINE__ + } + ;; +/* 6 */ + { .mfi + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfi + adds C12 = 4 * SIZE, C4 + FMA f89 = f33, f51, f89 // A2 * B4 + nop __LINE__ + } + ;; +/* 7 */ + { .mfi + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfi + adds C13 = 4 * SIZE, C5 + FMA f105 = f33, f53, f105 // A2 * B6 + nop __LINE__ + } + ;; +/* 8 */ + { .mfi + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfi + adds C14 = 4 * SIZE, C6 + FMA f121 = f33, f55, f121 // A2 * B8 + nop __LINE__ + } + ;; +/* 9 */ + { .mfi + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfi + adds C15 = 4 * SIZE, C7 + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; +/* 10 */ + { .mfi + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfi + adds C16 = 4 * SIZE, C8 + FMA f90 = f34, f51, f90 // A3 * B4 + nop __LINE__ + } + ;; +/* 11 */ + { .mfi + FMA f98 = f34, f52, f98 // A3 * B5 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f106 = f34, f53, f106 // A3 * B6 + nop __LINE__ + } + ;; +/* 12 */ + { .mfi + FMA f114 = f34, f54, f114 // A3 * B7 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f122 = f34, f55, f122 // A3 * B8 + nop __LINE__ + } + ;; +/* 13 */ + { .mfi + nop __LINE__ + FMA f67 = f35, f48, f67 // A4 * B1 + } + { .mfi + nop __LINE__ + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + ;; +/* 14 */ + { .mfi + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f91 = f35, f51, f91 // A4 * B4 + nop __LINE__ + } + ;; +/* 15 */ + { .mfi + FMA f99 = f35, f52, f99 // A4 * B5 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f107 = f35, f53, f107 // A4 * B6 + nop __LINE__ + } + ;; +/* 16 */ + { .mfi + FMA f115 = f35, f54, f115 // A4 * B7 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f123 = f35, f55, f123 // A4 * B8 + nop __LINE__ + } + ;; +/* 17 */ + { .mfi + nop __LINE__ + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f76 = f36, f49, f76 // A5 * B2 + nop __LINE__ + } + ;; +/* 18 */ + { .mfi + nop __LINE__ + FMA f84 = f36, f50, f84 // A5 * B3 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f92 = f36, f51, f92 // A5 * B4 + nop __LINE__ + } + ;; +/* 19 */ + { .mfi + nop __LINE__ + FMA f100 = f36, f52, f100 // A5 * B5 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f108 = f36, f53, f108 // A5 * B6 + nop __LINE__ + } + ;; +/* 20 */ + { .mfi + nop __LINE__ + FMA f116 = f36, f54, f116 // A5 * B7 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f124 = f36, f55, f124 // A5 * B8 + nop __LINE__ + } + ;; +/* 21 */ + { .mfi + nop __LINE__ + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f77 = f37, f49, f77 // A6 * B2 + nop __LINE__ + } + ;; +/* 22 */ + { .mfi + nop __LINE__ + FMA f85 = f37, f50, f85 // A6 * B3 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f93 = f37, f51, f93 // A6 * B4 + nop __LINE__ + } + ;; +/* 23 */ + { .mfi + nop __LINE__ + FMA f101 = f37, f52, f101 // A6 * B5 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f109 = f37, f53, f109 // A6 * B6 + nop __LINE__ + } + ;; +/* 24 */ + { .mfi + nop __LINE__ + FMA f117 = f37, f54, 
f117 // A6 * B7 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f125 = f37, f55, f125 // A6 * B8 + nop __LINE__ + } + ;; +/* 25 */ + { .mfi + nop __LINE__ + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f78 = f38, f49, f78 // A7 * B2 + nop __LINE__ + } + ;; +/* 26 */ + { .mfi + nop __LINE__ + FMA f86 = f38, f50, f86 // A7 * B3 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f94 = f38, f51, f94 // A7 * B4 + nop __LINE__ + } + ;; +/* 27 */ + { .mfi + nop __LINE__ + FMA f102 = f38, f52, f102 // A7 * B5 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f110 = f38, f53, f110 // A7 * B6 + nop __LINE__ + } + ;; +/* 28 */ + { .mfi + nop __LINE__ + FMA f118 = f38, f54, f118 // A7 * B7 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f126 = f38, f55, f126 // A7 * B8 + nop __LINE__ + } + ;; +/* 29 */ + { .mfi + nop __LINE__ + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f79 = f39, f49, f79 // A8 * B2 + nop __LINE__ + } + ;; +/* 30 */ + { .mfi + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f87 = f39, f50, f87 // A8 * B3 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f95 = f39, f51, f95 // A8 * B4 + nop __LINE__ + } + ;; +/* 31 */ + { .mfi + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f103 = f39, f52, f103 // A8 * B5 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f111 = f39, f53, f111 // A8 * B6 + nop __LINE__ + } + ;; +/* 32 */ + { .mfi + nop __LINE__ + FMA f119 = f39, f54, f119 // A8 * B7 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f127 = f39, f55, f127 // A8 * B8 + nop __LINE__ + } + ;; +/* 33 */ + { .mfi + nop __LINE__ + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; +/* 34 */ + { .mfi + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; +/* 35 */ + { .mfi + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p3) FMA f104 = f40, f61, f104 // A1 * B6 + nop __LINE__ + } + ;; +/* 36 */ + { .mfi + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p3) FMA f120 = f40, f63, f120 // A1 * B8 + nop __LINE__ + } + ;; +/* 37 */ + { .mfi + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; +/* 38 */ + { .mfi + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; +/* 39 */ + { .mfi + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p3) FMA f105 = f41, f61, f105 // A2 * B6 + nop __LINE__ + } + ;; +/* 40 */ + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f6 = [C1 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + nop __LINE__ + } + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f7 = [C9 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f121 = f41, f63, f121 // A2 * B8 + nop __LINE__ + } + ;; + /* 41 */ + { .mfi +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f10 = [C1 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f11 = [C9 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; +/* 42 */ + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f12 = [C1 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f13 = [C9 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f90 = f42, f59, f90 // A3 * B4 + nop __LINE__ + } + ;; +/* 43 */ + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f14 = [C1 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f98 = f42, f60, f98 // A3 * B5 + nop __LINE__ + } + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f15 = [C9 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f106 = f42, f61, f106 // A3 * B6 + nop __LINE__ + } + ;; +/* 44 */ + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f16 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f114 = f42, f62, f114 // A3 * B7 + nop __LINE__ + } + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f17 = [C10], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f122 = f42, f63, f122 // A3 * B8 + nop __LINE__ + } + ;; +/* 45 */ + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f18 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f19 = [C10], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; +/* 46 */ + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f20 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + nop __LINE__ + } + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f21 = [C10], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f91 = f43, f59, f91 // A4 * B4 + nop __LINE__ + } + ;; +/* 47 */ + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f22 = [C2 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f99 = f43, f60, f99 // A4 * B5 + nop __LINE__ + } + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f23 = [C10], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f107 = f43, f61, f107 // A4 * B6 + nop __LINE__ + } + ;; +/* 48 */ + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f24 = [C3 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f115 = f43, f62, f115 // A4 * B7 + nop __LINE__ + } + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f25 = [C11], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f123 = f43, f63, f123 // A4 * B8 + nop __LINE__ + } + ;; +/* 49 */ + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f26 = [C3 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f27 = [C11], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f76 = f44, f57, f76 // A5 * B2 + nop __LINE__ + } + ;; +/* 50 */ + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f28 = [C3 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f84 = f44, f58, f84 // A5 * B3 + nop __LINE__ + } + { .mfi +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f29 = [C11], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f92 = f44, f59, f92 // A5 * B4 + nop __LINE__ + } + ;; +/* 51 */ + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f30 = [C3 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f100 = f44, f60, f100 // A5 * B5 + nop __LINE__ + } + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f31 = [C11], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f108 = f44, f61, f108 // A5 * B6 + nop __LINE__ + } + ;; +/* 52 */ + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f32 = [C4 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f116 = f44, f62, f116 // A5 * B7 + nop __LINE__ + } + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f33 = [C12], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f124 = f44, f63, f124 // A5 * B8 + nop __LINE__ + } + ;; +/* 53 */ + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f34 = [C4 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f35 = [C12], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f77 = f45, f57, f77 // A6 * B2 + nop __LINE__ + } + ;; +/* 54 */ + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f36 = [C4 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f85 = f45, f58, f85 // A6 * B3 + nop __LINE__ + } + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f37 = [C12], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f93 = f45, f59, f93 // A6 * B4 + nop __LINE__ + } + ;; +/* 55 */ + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f38 = [C4 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f101 = f45, f60, f101 // A6 * B5 + nop __LINE__ + } + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f39 = [C12], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f109 = f45, f61, f109 // A6 * B6 + nop __LINE__ + } + ;; +/* 56 */ + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f48 = [C5 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f117 = f45, f62, f117 // A6 * B7 + nop __LINE__ + } + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f49 = [C13], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f125 = f45, f63, f125 // A6 * B8 + nop __LINE__ + } + ;; +/* 57 */ + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f50 = [C5 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + nop __LINE__ + } + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f51 = [C13], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f78 = f46, f57, f78 // A7 * B2 + nop __LINE__ + } + ;; +/* 58 */ + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f52 = [C5 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f86 = f46, f58, f86 // A7 * B3 + nop __LINE__ + } + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f53 = [C13], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f94 = f46, f59, f94 // A7 * B4 + nop __LINE__ + } + ;; +/* 59 */ + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f54 = [C5 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f102 = f46, f60, f102 // A7 * B5 + nop __LINE__ + } + { .mfi +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f55 = [C13], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f110 = f46, f61, f110 // A7 * B6 + nop __LINE__ + } + ;; +/* 60 */ + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f40 = [C6 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f118 = f46, f62, f118 // A7 * B7 + nop __LINE__ + } + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f41 = [C14], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f126 = f46, f63, f126 // A7 * B8 + nop __LINE__ + } + ;; +/* 61 */ + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f42 = [C6 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + nop __LINE__ + } + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f43 = [C14], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f79 = f47, f57, f79 // A8 * B2 + nop __LINE__ + } + ;; +/* 62 */ + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f44 = [C6 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f87 = f47, f58, f87 // A8 * B3 + nop __LINE__ + } + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f45 = [C14], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f95 = f47, f59, f95 // A8 * B4 + nop __LINE__ + } + ;; +/* 63 */ + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f59 = [C6 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f103 = f47, f60, f103 // A8 * B5 + nop __LINE__ + } + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f60 = [C14], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f111 = f47, f61, f111 // A8 * B6 + nop __LINE__ + } + ;; +/* 64 */ + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f61 = [C7 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f119 = f47, f62, f119 // A8 * B7 + adds L = -1, L + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f62 = [C15], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f127 = f47, f63, f127 // A8 * B8 + br.cloop.sptk.few .L012 + } + ;; +.L013: +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + { .mfi + (p5) LDFD f63 = [C7 ], SIZE + FMA f64 = ALPHA, f64, f6 + cmp.ne p6, p0 = 1, I + } + { .mfb + (p5) LDFD f6 = [C15], SIZE + FMA f68 = ALPHA, f68, f7 + nop __LINE__ + } + ;; + { .mfi + (p5) LDFD f7 = [C7 ], SIZE + FMA f65 = ALPHA, f65, f10 + adds I = -1, I + } + { .mfb + (p5) LDFD f10 = [C15], SIZE + FMA f69 = ALPHA, f69, f11 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f11 = [C7 ], -3 * SIZE + FMA f66 = ALPHA, f66, f12 + nop __LINE__ + } + { .mfb + (p5) LDFD f12 = [C15], -3 * SIZE + FMA f70 = ALPHA, f70, f13 + nop __LINE__ + } + ;; + { .mfb + LDFD f13 = [C8 ], SIZE + FMA f67 = ALPHA, f67, f14 + nop __LINE__ + } + { .mfb + LDFD f14 = [C16], SIZE + FMA f71 = ALPHA, f71, f15 + nop __LINE__ + } + ;; + { .mmf + STFD [C1 ] = f64, SIZE + STFD [C9 ] = f68, SIZE + FMA f72 = ALPHA, f72, f16 + } + { .mmf + LDFD f15 = [C8 ], SIZE + LDFD f16 = [C16], SIZE + FMA f76 = ALPHA, f76, f17 + } + ;; + { .mmf + STFD [C1 ] = f65, SIZE + STFD [C9 ] = f69, SIZE + FMA f73 = ALPHA, f73, f18 + } + { .mmf + LDFD f17 = [C8 ], SIZE + LDFD f18 = [C16], SIZE + FMA f77 = ALPHA, f77, f19 + } + ;; + { .mmf + STFD [C1 ] = f66, SIZE + STFD [C9 ] = f70, SIZE + FMA f74 = ALPHA, f74, f20 + } + { .mmf + LDFD f19 = [C8 ], -3 * SIZE + LDFD f20 = [C16], -3 * SIZE + FMA f78 = ALPHA, f78, f21 + } + ;; + { .mfb + STFD [C1 ] = f67, 5 * SIZE + FMA f75 = ALPHA, f75, f22 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f71, 5 * SIZE + FMA f79 = ALPHA, f79, f23 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f72, SIZE + FMA f80 = ALPHA, f80, f24 + nop __LINE__ + } + { .mfb + STFD [C10] = f76, SIZE + FMA f84 = ALPHA, f84, f25 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f73, SIZE + FMA f81 = ALPHA, f81, f26 + nop __LINE__ + } + { .mfb + STFD [C10] = f77, SIZE + FMA f85 = ALPHA, f85, f27 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f74, SIZE + FMA f82 = ALPHA, f82, f28 + nop __LINE__ + } + { .mfb + STFD [C10] = f78, SIZE + FMA f86 = ALPHA, f86, f29 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f75, 5 * SIZE + FMA f83 = ALPHA, f83, f30 + nop __LINE__ + } + { .mfb + STFD [C10] = f79, 5 * SIZE + FMA f87 = ALPHA, f87, f31 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f80, SIZE + FMA f88 = ALPHA, f88, f32 + nop __LINE__ + } + { .mfb + STFD [C11] = f84, SIZE + FMA f92 = ALPHA, f92, f33 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f81, SIZE + FMA f89 = ALPHA, f89, f34 + nop __LINE__ + } + { .mfb + STFD [C11] = f85, SIZE + FMA f93 = ALPHA, f93, f35 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f82, SIZE + FMA f90 = ALPHA, f90, f36 + nop __LINE__ + } + { .mfb + STFD [C11] = f86, SIZE + FMA f94 = ALPHA, f94, f37 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f83, 5 * SIZE + FMA f91 = ALPHA, f91, f38 + nop __LINE__ + } + { .mfb + STFD [C11] = f87, 5 * SIZE + FMA f95 = ALPHA, f95, f39 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f88, SIZE + FMA f96 = ALPHA, f96, f48 + nop __LINE__ + } + { .mfb + STFD [C12] = f92, SIZE + FMA f100 = ALPHA, f100, f49 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f89, SIZE + FMA f97 = ALPHA, f97, f50 + nop __LINE__ + } + { .mfb + STFD [C12] = f93, SIZE + FMA f101 = ALPHA, f101, f51 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f90, SIZE + FMA f98 = ALPHA, f98, f52 + nop __LINE__ + } + { .mfb + STFD [C12] = f94, SIZE + FMA f102 = ALPHA, f102, f53 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f91, 5 * SIZE + FMA f99 = ALPHA, f99, f54 + nop __LINE__ + } + { .mfb + STFD [C12] = f95, 5 * SIZE + FMA f103 = ALPHA, f103, f55 + nop __LINE__ + } + ;; + { .mfb + STFD 
[C5 ] = f96, SIZE + FMA f104 = ALPHA, f104, f40 + nop __LINE__ + } + { .mfb + STFD [C13] = f100, SIZE + FMA f108 = ALPHA, f108, f41 + nop __LINE__ + } + ;; + { .mfb + STFD [C5 ] = f97, SIZE + FMA f105 = ALPHA, f105, f42 + nop __LINE__ + } + { .mfb + STFD [C13] = f101, SIZE + FMA f109 = ALPHA, f109, f43 + nop __LINE__ + } + ;; + { .mfb + STFD [C5 ] = f98, SIZE + FMA f106 = ALPHA, f106, f44 + nop __LINE__ + } + { .mfb + STFD [C13] = f102, SIZE + FMA f110 = ALPHA, f110, f45 + nop __LINE__ + } + ;; + { .mfb + STFD [C5 ] = f99, 5 * SIZE + FMA f107 = ALPHA, f107, f59 + nop __LINE__ + } + { .mfb + STFD [C13] = f103, 5 * SIZE + FMA f111 = ALPHA, f111, f60 + nop __LINE__ + } + ;; + { .mfb + STFD [C6 ] = f104, SIZE + FMA f112 = ALPHA, f112, f61 + nop __LINE__ + } + { .mfb + STFD [C14] = f108, SIZE + FMA f116 = ALPHA, f116, f62 + nop __LINE__ + } + ;; + { .mfb + STFD [C6 ] = f105, SIZE + FMA f113 = ALPHA, f113, f63 + nop __LINE__ + } + { .mfb + STFD [C14] = f109, SIZE + FMA f117 = ALPHA, f117, f6 + nop __LINE__ + } + ;; + { .mfb + STFD [C6 ] = f106, SIZE + FMA f114 = ALPHA, f114, f7 + nop __LINE__ + } + { .mfb + STFD [C14] = f110, SIZE + FMA f118 = ALPHA, f118, f10 + nop __LINE__ + } + ;; + { .mfb + STFD [C6 ] = f107, 5 * SIZE + FMA f115 = ALPHA, f115, f11 + nop __LINE__ + } + { .mfb + STFD [C14] = f111, 5 * SIZE + FMA f119 = ALPHA, f119, f12 + nop __LINE__ + } + ;; + { .mfb + STFD [C7 ] = f112, SIZE + FMA f120 = ALPHA, f120, f13 + nop __LINE__ + } + { .mfb + STFD [C15] = f116, SIZE + FMA f124 = ALPHA, f124, f14 + nop __LINE__ + } + ;; + { .mfb + STFD [C7 ] = f113, SIZE + FMA f121 = ALPHA, f121, f15 + nop __LINE__ + } + { .mfb + STFD [C15] = f117, SIZE + FMA f125 = ALPHA, f125, f16 + nop __LINE__ + } + ;; + { .mfb + STFD [C7 ] = f114, SIZE + FMA f122 = ALPHA, f122, f17 + nop __LINE__ + } + { .mfb + STFD [C15] = f118, SIZE + FMA f126 = ALPHA, f126, f18 + nop __LINE__ + } + ;; + { .mfb + STFD [C7 ] = f115, 5 * SIZE + FMA f123 = ALPHA, f123, f19 + nop __LINE__ + } + { .mfb + STFD [C15] = f119, 5 * SIZE + FMA f127 = ALPHA, f127, f20 + nop __LINE__ + } + ;; + { .mfb + STFD [C8 ] = f120, SIZE + mov f64 = f0 + nop __LINE__ + } + { .mfb + STFD [C16] = f124, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C8 ] = f121, SIZE + mov f80 = f0 + nop __LINE__ + } + { .mfb + STFD [C16] = f125, SIZE + mov f88 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C8 ] = f122, SIZE + mov f96 = f0 + nop __LINE__ + } + { .mfb + STFD [C16] = f126, SIZE + mov f104 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C8 ] = f123, 5 * SIZE + mov f112 = f0 + nop __LINE__ + } + { .mfb + STFD [C16] = f127, 5 * SIZE + mov f120 = f0 + (p6) br.cond.dptk .L011 + } + ;; +#else + { .mfi + nop __LINE__ + FMPY f64 = ALPHA, f64 + cmp.ne p6, p0 = 1, I + } + { .mfb + nop __LINE__ + FMPY f68 = ALPHA, f68 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f65 = ALPHA, f65 + adds I = -1, I + } + { .mfb + nop __LINE__ + FMPY f69 = ALPHA, f69 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMPY f66 = ALPHA, f66 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMPY f70 = ALPHA, f70 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMPY f67 = ALPHA, f67 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMPY f71 = ALPHA, f71 + nop __LINE__ + } + ;; + { .mmf + STFD [C1 ] = f64, SIZE + STFD [C9 ] = f68, SIZE + FMPY f72 = ALPHA, f72 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMPY f76 = ALPHA, f76 + } + ;; + { .mmf + STFD [C1 ] = f65, SIZE + STFD [C9 ] = f69, SIZE + FMPY f73 = ALPHA, f73 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMPY f77 = ALPHA, f77 
+ } + ;; + { .mmf + STFD [C1 ] = f66, SIZE + STFD [C9 ] = f70, SIZE + FMPY f74 = ALPHA, f74 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMPY f78 = ALPHA, f78 + } + ;; + { .mfb + STFD [C1 ] = f67, 5 * SIZE + FMPY f75 = ALPHA, f75 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f71, 5 * SIZE + FMPY f79 = ALPHA, f79 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f72, SIZE + FMPY f80 = ALPHA, f80 + nop __LINE__ + } + { .mfb + STFD [C10] = f76, SIZE + FMPY f84 = ALPHA, f84 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f73, SIZE + FMPY f81 = ALPHA, f81 + nop __LINE__ + } + { .mfb + STFD [C10] = f77, SIZE + FMPY f85 = ALPHA, f85 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f74, SIZE + FMPY f82 = ALPHA, f82 + nop __LINE__ + } + { .mfb + STFD [C10] = f78, SIZE + FMPY f86 = ALPHA, f86 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f75, 5 * SIZE + FMPY f83 = ALPHA, f83 + nop __LINE__ + } + { .mfb + STFD [C10] = f79, 5 * SIZE + FMPY f87 = ALPHA, f87 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f80, SIZE + FMPY f88 = ALPHA, f88 + nop __LINE__ + } + { .mfb + STFD [C11] = f84, SIZE + FMPY f92 = ALPHA, f92 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f81, SIZE + FMPY f89 = ALPHA, f89 + nop __LINE__ + } + { .mfb + STFD [C11] = f85, SIZE + FMPY f93 = ALPHA, f93 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f82, SIZE + FMPY f90 = ALPHA, f90 + nop __LINE__ + } + { .mfb + STFD [C11] = f86, SIZE + FMPY f94 = ALPHA, f94 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f83, 5 * SIZE + FMPY f91 = ALPHA, f91 + nop __LINE__ + } + { .mfb + STFD [C11] = f87, 5 * SIZE + FMPY f95 = ALPHA, f95 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f88, SIZE + FMPY f96 = ALPHA, f96 + nop __LINE__ + } + { .mfb + STFD [C12] = f92, SIZE + FMPY f100 = ALPHA, f100 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f89, SIZE + FMPY f97 = ALPHA, f97 + nop __LINE__ + } + { .mfb + STFD [C12] = f93, SIZE + FMPY f101 = ALPHA, f101 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f90, SIZE + FMPY f98 = ALPHA, f98 + nop __LINE__ + } + { .mfb + STFD [C12] = f94, SIZE + FMPY f102 = ALPHA, f102 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f91, 5 * SIZE + FMPY f99 = ALPHA, f99 + nop __LINE__ + } + { .mfb + STFD [C12] = f95, 5 * SIZE + FMPY f103 = ALPHA, f103 + nop __LINE__ + } + ;; + { .mfb + STFD [C5 ] = f96, SIZE + FMPY f104 = ALPHA, f104 + nop __LINE__ + } + { .mfb + STFD [C13] = f100, SIZE + FMPY f108 = ALPHA, f108 + nop __LINE__ + } + ;; + { .mfb + STFD [C5 ] = f97, SIZE + FMPY f105 = ALPHA, f105 + nop __LINE__ + } + { .mfb + STFD [C13] = f101, SIZE + FMPY f109 = ALPHA, f109 + nop __LINE__ + } + ;; + { .mfb + STFD [C5 ] = f98, SIZE + FMPY f106 = ALPHA, f106 + nop __LINE__ + } + { .mfb + STFD [C13] = f102, SIZE + FMPY f110 = ALPHA, f110 + nop __LINE__ + } + ;; + { .mfb + STFD [C5 ] = f99, 5 * SIZE + FMPY f107 = ALPHA, f107 + nop __LINE__ + } + { .mfb + STFD [C13] = f103, 5 * SIZE + FMPY f111 = ALPHA, f111 + nop __LINE__ + } + ;; + { .mfb + STFD [C6 ] = f104, SIZE + FMPY f112 = ALPHA, f112 + nop __LINE__ + } + { .mfb + STFD [C14] = f108, SIZE + FMPY f116 = ALPHA, f116 + nop __LINE__ + } + ;; + { .mfb + STFD [C6 ] = f105, SIZE + FMPY f113 = ALPHA, f113 + nop __LINE__ + } + { .mfb + STFD [C14] = f109, SIZE + FMPY f117 = ALPHA, f117 + nop __LINE__ + } + ;; + { .mfb + STFD [C6 ] = f106, SIZE + FMPY f114 = ALPHA, f114 + nop __LINE__ + } + { .mfb + STFD [C14] = f110, SIZE + FMPY f118 = ALPHA, f118 + nop __LINE__ + } + ;; + { .mfb + STFD [C6 ] = f107, 5 * SIZE + FMPY f115 = ALPHA, f115 + nop __LINE__ + } + { .mfb + STFD [C14] = f111, 5 * 
SIZE + FMPY f119 = ALPHA, f119 + nop __LINE__ + } + ;; + { .mfb + STFD [C7 ] = f112, SIZE + FMPY f120 = ALPHA, f120 + nop __LINE__ + } + { .mfb + STFD [C15] = f116, SIZE + FMPY f124 = ALPHA, f124 + nop __LINE__ + } + ;; + { .mfb + STFD [C7 ] = f113, SIZE + FMPY f121 = ALPHA, f121 + nop __LINE__ + } + { .mfb + STFD [C15] = f117, SIZE + FMPY f125 = ALPHA, f125 + nop __LINE__ + } + ;; + { .mfi + STFD [C7 ] = f114, SIZE + FMPY f122 = ALPHA, f122 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C15] = f118, SIZE + FMPY f126 = ALPHA, f126 + nop __LINE__ + } + ;; + { .mfi + STFD [C7 ] = f115, 5 * SIZE + FMPY f123 = ALPHA, f123 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -8, L +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C15] = f119, 5 * SIZE + FMPY f127 = ALPHA, f127 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -8, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C8 ] = f120, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C16] = f124, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C8 ] = f121, SIZE + mov f80 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 3, AOFFSET +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C16] = f125, SIZE + mov f88 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 3, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C8 ] = f122, SIZE + mov f96 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 8, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C16] = f126, SIZE + mov f104 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C8 ] = f123, 5 * SIZE + mov f112 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C16] = f127, 5 * SIZE + mov f120 = f0 + (p6) br.cond.dptk .L011 + } + ;; +#endif + +.L020: + { .mfi + cmp.eq p3, p0 = r0, r0 + mov f89 = f0 + tbit.z p6, p7 = M, 2 + } + { .mfb +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 4, KK +#else + adds L = 8, KK +#endif +#endif + mov f81 = f0 + (p6) br.cond.dptk .L030 + } + ;; +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mfi + LDFPD f48, f49 = [B] + mov f65 = f0 + nop __LINE__ + } + { .mfi + adds BOFFSET = 2 * SIZE, B + mov f73 = f0 + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + } + ;; +#else + { .mfi + shladd BOFFSET = KK8, 3, B + mov f65 = f0 + shladd AOFFSET = KK8, 2, AOFFSET + } + ;; + { .mfi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f73 = f0 + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + } + ;; +#endif + { .mmf + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + setf.d f97 = r0 + mov f105 = f0 + } + { .mfi + setf.d f113 = r0 + mov f121 = f0 +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; + { .mmf + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + setf.d f66 = r0 + mov f74 = f0 + } + { .mfi + setf.d f82 = r0 + mov f90 = f0 + tbit.z p12, p0 = L, 0 + } + ;; + { 
.mmf + LDFPD f52, f53 = [BOFFSET], 2 * SIZE + setf.d f98 = r0 + mov f106 = f0 + } + { .mfi + setf.d f114 = r0 + mov f122 = f0 + shr L = L, 1 + } + ;; + { .mfi + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + mov f75 = f0 + adds L = -1, L + } + { .mmf + setf.d f67 = r0 + setf.d f83 = r0 + mov f91 = f0 + } + ;; + { .mfi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + mov f107 = f0 + mov ar.lc = L + } + { .mmf + setf.d f99 = r0 + setf.d f115 = r0 + mov f123 = f0 + } + ;; + .align 32 + +.L022: + { .mfi + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + (p5) adds C9 = 2 * SIZE, C1 + } + { .mfi + nop __LINE__ + FMA f104 = f32, f53, f104 // A1 * B6 + (p5) adds C10 = 2 * SIZE, C2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + (p5) adds C11 = 2 * SIZE, C3 + } + { .mfi + nop __LINE__ + FMA f120 = f32, f55, f120 // A1 * B8 + (p5) adds C12 = 2 * SIZE, C4 + } + ;; + { .mfi + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + (p5) adds C13 = 2 * SIZE, C5 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + (p5) adds C14 = 2 * SIZE, C6 + } + ;; + { .mfi + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + (p5) adds C15 = 2 * SIZE, C7 + } + { .mfi + nop __LINE__ + FMA f89 = f33, f51, f89 // A2 * B4 + (p5) adds C16 = 2 * SIZE, C8 + } + ;; + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f105 = f33, f53, f105 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f121 = f33, f55, f121 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f90 = f34, f51, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f98 = f34, f52, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f106 = f34, f53, f106 // A3 * B6 + nop __LINE__ + } + + { .mfb + nop __LINE__ + FMA f114 = f34, f54, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f122 = f34, f55, f122 // A3 * B8 + nop __LINE__ + } + + { .mfb + nop __LINE__ + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f91 = f35, f51, f91 // A4 * B4 + nop __LINE__ + } + + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f99 = f35, f52, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f107 = f35, f53, f107 // A4 * B6 + nop __LINE__ + } + + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f115 = f35, f54, f115 // A4 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f123 = f35, f55, f123 // A4 * B8 + 
nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f68 = [C1 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f70 = [C9 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f104 = f40, f61, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f69 = [C1 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f71 = [C9 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f120 = f40, f63, f120 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f76 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f78 = [C10], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f105 = f41, f61, f105 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f77 = [C2 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f79 = [C10], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f121 = f41, f63, f121 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f84 = [C3 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f86 = [C11], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f85 = [C3 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f87 = [C11], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f90 = f42, f59, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f92 = [C4 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f98 = f42, f60, f98 // A3 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f94 = [C12], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f106 = f42, f61, f106 // A3 * B6 + nop __LINE__ + } + ;; + { .mfb +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f93 = [C4 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f114 = f42, f62, f114 // A3 * B7 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f95 = [C12], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f122 = f42, f63, f122 // A3 * B8 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f100 = [C5 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f102 = [C13], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f101 = [C5 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f103 = [C13], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f91 = f43, f59, f91 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f108 = [C6 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f99 = f43, f60, f99 // A4 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f110 = [C14], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f107 = f43, f61, f107 // A4 * B6 + nop __LINE__ + } + ;; + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f109 = [C6 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f115 = f43, f62, f115 // A4 * B7 + adds L = -1, L + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f111 = [C14], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f123 = f43, f63, f123 // A4 * B8 + br.cloop.sptk.few .L022 + } + ;; + +.L028: +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + { .mfb + LDFD f116 = [C7 ], SIZE + FMA f64 = ALPHA, f64, f68 + nop __LINE__ + } + { .mfb + LDFD f118 = [C15], SIZE + FMA f66 = ALPHA, f66, f70 + nop __LINE__ + } + ;; + { .mfb + LDFD f117 = [C7 ], -1 * SIZE + FMA f65 = ALPHA, f65, f69 + nop __LINE__ + } + { .mfb + LDFD f119 = [C15], -1 * SIZE + FMA f67 = ALPHA, f67, f71 + nop __LINE__ + } + ;; + { .mfb + LDFD f124 = [C8], SIZE + FMA f72 = ALPHA, f72, f76 + nop __LINE__ + } + { .mfb + LDFD f126 = [C16], SIZE + FMA f74 = ALPHA, f74, f78 + nop __LINE__ + } + ;; + { .mfb + LDFD f125 = [C8], -1 * SIZE + FMA f73 = ALPHA, f73, f77 + nop __LINE__ + } + { .mfb + LDFD f127 = [C16], -1 * SIZE + FMA f75 = ALPHA, f75, f79 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f64, SIZE + FMA f80 = ALPHA, f80, f84 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f66, SIZE + FMA f82 = ALPHA, f82, f86 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f65, 3 * SIZE + FMA f81 = ALPHA, f81, f85 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f67, 3 * SIZE + FMA f83 = ALPHA, f83, f87 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f72, SIZE + FMA f88 = ALPHA, f88, f92 + nop __LINE__ + } + { .mfb + STFD [C10] = f74, SIZE + FMA f90 = ALPHA, f90, f94 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f73, 3 * SIZE + FMA f89 = ALPHA, f89, f93 + nop __LINE__ + } + { .mfb + STFD [C10] = f75, 3 * SIZE + FMA f91 = ALPHA, f91, f95 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f80, SIZE + FMA f96 = ALPHA, f96, f100 + nop __LINE__ + } + { .mfb + STFD [C11] = f82, SIZE + FMA f98 = ALPHA, f98, f102 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f81, 3 * SIZE + FMA f97 = ALPHA, f97, f101 + nop __LINE__ + } + { .mfb + STFD [C11] = f83, 3 * SIZE + FMA f99 = ALPHA, f99, f103 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f88, SIZE + FMA f104 = ALPHA, f104, f108 + nop __LINE__ + } + { .mfb + STFD [C12] = f90, SIZE + FMA f106 = ALPHA, f106, f110 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f89, 3 * SIZE + FMA f105 = ALPHA, f105, f109 + nop __LINE__ + } + { .mfb + STFD [C12] = f91, 3 * SIZE + FMA f107 = ALPHA, f107, f111 + nop __LINE__ + } + ;; + { .mfb + STFD [C5 ] = f96, SIZE + FMA f112 = ALPHA, f112, f116 + nop __LINE__ + } + { .mfb + STFD [C13] = f98, SIZE + FMA f114 = ALPHA, f114, f118 + nop __LINE__ + } + ;; + { .mfb + STFD [C5 ] = f97, 3 * SIZE + FMA f113 = ALPHA, f113, f117 + nop __LINE__ + } + { .mfb + STFD [C13] = f99, 3 * SIZE + FMA f115 = ALPHA, f115, f119 + nop __LINE__ + } + ;; + { .mfb + STFD [C6 ] = f104, SIZE + FMA f120 = ALPHA, f120, f124 + nop __LINE__ + } + { .mfb + STFD [C14] = f106, SIZE + FMA f122 = ALPHA, f122, f126 + nop __LINE__ + } + ;; + { .mfb + STFD [C6 ] = f105, 3 * SIZE + FMA f121 = ALPHA, f121, f125 + nop __LINE__ + } + { .mfb + STFD [C14] = f107, 3 * SIZE + FMA f123 = ALPHA, f123, f127 + nop __LINE__ + } + ;; + { .mfb + STFD [C7 ] = f112, SIZE + mov f64 = f0 + nop __LINE__ + } + { .mfb + STFD [C15] = f114, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C7 ] = f113, 3 * SIZE + mov f80 = f0 + nop __LINE__ + } + { .mfb + STFD [C15] = f115, 3 * SIZE + mov f88 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C8 ] = f120, SIZE + mov f96 = f0 + nop __LINE__ + } + { .mfb + STFD [C16] = f122, SIZE + mov f104 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C8 ] = f121, 3 * SIZE + mov f112 = f0 + nop __LINE__ + } + { .mfb + STFD [C16] = f123, 3 * SIZE + mov f120 = f0 + nop __LINE__ + } + ;; +#else + { .mfb + FMPY f64 = ALPHA, f64 + nop __LINE__ + } + { .mfb + FMPY f66 = ALPHA, f66 + nop __LINE__ + } + ;; + { .mfb + FMPY 
f65 = ALPHA, f65 + nop __LINE__ + } + { .mfb + FMPY f67 = ALPHA, f67 + nop __LINE__ + } + ;; + { .mfb + FMPY f72 = ALPHA, f72 + nop __LINE__ + } + { .mfb + FMPY f74 = ALPHA, f74 + nop __LINE__ + } + ;; + { .mfb + FMPY f73 = ALPHA, f73 + nop __LINE__ + } + { .mfb + FMPY f75 = ALPHA, f75 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f64, SIZE + FMPY f80 = ALPHA, f80 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f66, SIZE + FMPY f82 = ALPHA, f82 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f65, 3 * SIZE + FMPY f81 = ALPHA, f81 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f67, 3 * SIZE + FMPY f83 = ALPHA, f83 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f72, SIZE + FMPY f88 = ALPHA, f88 + nop __LINE__ + } + { .mfb + STFD [C10] = f74, SIZE + FMPY f90 = ALPHA, f90 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f73, 3 * SIZE + FMPY f89 = ALPHA, f89 + nop __LINE__ + } + { .mfb + STFD [C10] = f75, 3 * SIZE + FMPY f91 = ALPHA, f91 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f80, SIZE + FMPY f96 = ALPHA, f96 + nop __LINE__ + } + { .mfb + STFD [C11] = f82, SIZE + FMPY f98 = ALPHA, f98 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f81, 3 * SIZE + FMPY f97 = ALPHA, f97 + nop __LINE__ + } + { .mfb + STFD [C11] = f83, 3 * SIZE + FMPY f99 = ALPHA, f99 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f88, SIZE + FMPY f104 = ALPHA, f104 + nop __LINE__ + } + { .mfb + STFD [C12] = f90, SIZE + FMPY f106 = ALPHA, f106 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f89, 3 * SIZE + FMPY f105 = ALPHA, f105 + nop __LINE__ + } + { .mfb + STFD [C12] = f91, 3 * SIZE + FMPY f107 = ALPHA, f107 + nop __LINE__ + } + ;; + { .mfb + STFD [C5 ] = f96, SIZE + FMPY f112 = ALPHA, f112 + nop __LINE__ + } + { .mfb + STFD [C13] = f98, SIZE + FMPY f114 = ALPHA, f114 + nop __LINE__ + } + ;; + { .mfb + STFD [C5 ] = f97, 3 * SIZE + FMPY f113 = ALPHA, f113 + nop __LINE__ + } + { .mfb + STFD [C13] = f99, 3 * SIZE + FMPY f115 = ALPHA, f115 + nop __LINE__ + } + ;; + { .mfi + STFD [C6 ] = f104, SIZE + FMPY f120 = ALPHA, f120 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C14] = f106, SIZE + FMPY f122 = ALPHA, f122 + nop __LINE__ + } + ;; + { .mfi + STFD [C6 ] = f105, 3 * SIZE + FMPY f121 = ALPHA, f121 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -4, L +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C14] = f107, 3 * SIZE + FMPY f123 = ALPHA, f123 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -8, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C7 ] = f112, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C15] = f114, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C7 ] = f113, 3 * SIZE + mov f80 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 2, AOFFSET +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C15] = f115, 3 * SIZE + mov f88 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 3, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C8 ] = f120, SIZE + mov f96 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 4, KK +#else + nop __LINE__ +#endif + } + { .mfb + 
STFD [C16] = f122, SIZE + mov f104 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C8 ] = f121, 3 * SIZE + mov f112 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C16] = f123, 3 * SIZE + mov f120 = f0 + nop __LINE__ + } + ;; +#endif + .align 32 + +.L030: + { .mib +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 2, KK +#else + adds L = 8, KK +#endif +#endif + tbit.z p6, p7 = M, 1 + (p6) br.cond.dptk .L040 + } + ;; +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mfi + LDFPD f48, f49 = [B] + mov f65 = f0 + nop __LINE__ + } + { .mfi + adds BOFFSET = 2 * SIZE, B + mov f73 = f0 +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } +#else + { .mmf + shladd BOFFSET = KK8, 3, B + shladd AOFFSET = KK8, 1, AOFFSET + mov f65 = f0 + } + ;; + { .mfi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f73 = f0 +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } +#endif + ;; + { .mfi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f81 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f89 = f0 + shr L = L, 1 + } + ;; + { .mfi + LDFPD f52, f53 = [BOFFSET], 2 * SIZE + mov f97 = f0 + adds L = -1, L + } + { .mfi + nop __LINE__ + mov f105 = f0 + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + } + ;; + { .mfi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + mov f113 = f0 + mov ar.lc = L + } + { .mfi + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + mov f121 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + .align 32 + +.L032: + { .mfb + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f104 = f32, f53, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f120 = f32, f55, f120 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f89 = f33, f51, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f105 = f33, f53, f105 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f121 = f33, f55, f121 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = 
[BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f104 = f40, f61, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f68 = [C1], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f76 = [C2], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f120 = f40, f63, f120 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f69 = [C1], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f77 = [C2], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f84 = [C3], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f92 = [C4], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f105 = f41, f61, f105 // A2 * B6 + nop __LINE__ + } + ;; + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f85 = [C3], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + adds L = -1, L + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f93 = [C4], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f121 = f41, f63, f121 // A2 * B8 + br.cloop.sptk.few .L032 + } + ;; + +.L038: +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + { .mfb + LDFD f100 = [C5], SIZE + FMA f64 = ALPHA, f64, f68 + nop __LINE__ + } + { .mfb + LDFD f108 = [C6], SIZE + FMA f65 = ALPHA, f65, f69 + nop __LINE__ + } + ;; + { .mfb + LDFD f101 = [C5], -1 * SIZE + FMA f72 = ALPHA, f72, f76 + nop __LINE__ + } + { .mfb + LDFD f109 = [C6], -1 * SIZE + FMA f73 = ALPHA, f73, f77 + nop __LINE__ + } + ;; + { .mfb + LDFD f116 = [C7], SIZE + FMA f80 = ALPHA, f80, f84 + nop __LINE__ + } + { .mfb + LDFD f124 = [C8], SIZE + FMA f81 = ALPHA, f81, f85 + nop __LINE__ + } + ;; + { .mfb + LDFD f117 = [C7], -1 * SIZE + FMA f88 = ALPHA, f88, f92 + nop __LINE__ + } + { .mfb + LDFD f125 = [C8], -1 * SIZE + FMA f89 = ALPHA, f89, f93 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f64, SIZE + FMA f96 = ALPHA, f96, f100 + nop __LINE__ + } + { .mfb + STFD [C2 ] = f72, SIZE + FMA f104 = ALPHA, f104, f108 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f65, SIZE + FMA f97 = ALPHA, f97, f101 + nop __LINE__ + } + { .mfb + STFD [C2 ] = f73, SIZE + FMA f105 = ALPHA, f105, f109 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f80, SIZE + FMA f112 = ALPHA, f112, f116 + nop __LINE__ + } + { .mfb + STFD [C4 ] = f88, SIZE + FMA f120 = ALPHA, f120, f124 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f81, SIZE + FMA f113 = ALPHA, f113, f117 + nop __LINE__ + } + { .mfb + STFD [C4 ] = f89, SIZE + FMA f121 = ALPHA, f121, f125 + nop __LINE__ + } + ;; + { .mfb + STFD [C5 ] = f96, SIZE + mov f64 = f0 + nop __LINE__ + } + { .mfb + STFD [C6 ] = f104, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C5 ] = f97, SIZE + mov f80 = f0 + nop __LINE__ + } + { .mfb + STFD [C6 ] = f105, SIZE + mov f88 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C7 ] = f112, SIZE + mov f96 = f0 + nop __LINE__ + } + { .mfb + STFD [C8 ] = f120, SIZE + mov f104 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C7 ] = f113, SIZE + mov f112 = f0 + nop __LINE__ + } + { .mfb + STFD [C8 ] = f121, SIZE + mov f120 = f0 + nop __LINE__ + } + ;; +#else + { .mfb + nop __LINE__ + FMPY f64 = ALPHA, f64 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMPY f65 = ALPHA, f65 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMPY f72 = ALPHA, f72 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMPY f73 = ALPHA, f73 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMPY f80 = ALPHA, f80 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMPY f81 = ALPHA, f81 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMPY f88 = ALPHA, f88 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMPY f89 = ALPHA, f89 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f64, SIZE + FMPY f96 = ALPHA, f96 + nop __LINE__ + } + { .mfb + STFD [C2 ] = f72, SIZE + FMPY f104 = ALPHA, f104 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f65, SIZE + FMPY f97 = ALPHA, f97 + nop __LINE__ + } + { .mfb + STFD [C2 ] = f73, SIZE + FMPY f105 = ALPHA, f105 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f80, SIZE + FMPY f112 = ALPHA, f112 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C4 ] = f88, SIZE + FMPY f120 = ALPHA, f120 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f81, SIZE + FMPY f113 = ALPHA, f113 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -2, L +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C4 ] = f89, SIZE + FMPY f121 = ALPHA, f121 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -8, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C5 
] = f96, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C6 ] = f104, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C5 ] = f97, SIZE + mov f80 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 1, AOFFSET +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C6 ] = f105, SIZE + mov f88 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 3, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C7 ] = f112, SIZE + mov f96 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 2, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C8 ] = f120, SIZE + mov f104 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C7 ] = f113, SIZE + mov f112 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C8 ] = f121, SIZE + mov f120 = f0 + nop __LINE__ + } + ;; +#endif + .align 32 + +.L040: + { .mib +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 1, KK +#else + adds L = 8, KK +#endif +#endif + tbit.z p6, p7 = M, 0 + (p6) br.cond.dptk .L049 + } + ;; + +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mmi + LDFPD f48, f49 = [B] + adds BOFFSET = 2 * SIZE, B +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } +#else + { .mmi + shladd BOFFSET = KK8, 3, B + add AOFFSET = KK8, AOFFSET + nop __LINE__ + } + ;; + { .mmi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + nop __LINE__ +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } +#endif + ;; + { .mii + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + LDFPD f52, f53 = [BOFFSET], 2 * SIZE + LDFD f32 = [AOFFSET], 1 * SIZE + adds L = -1, L + } + ;; + { .mmi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + cmp.eq p3, p0 = r0, r0 + mov ar.lc = L + } + { .mmi + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + nop __LINE__ + } + ;; + .align 32 + +.L042: + { .mfb + lfetch.nt1 [PREB], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p12) cmp.ne p3, p0 = 0, L + FMA f72 = f32, f49, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFD f40 = [AOFFSET], 1 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f68 = [C1] +#else + nop __LINE__ +#endif + FMA f104 = f32, f53, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfb +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f76 = [C2] +#else + nop __LINE__ +#endif + FMA f120 = f32, f55, f120 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFD f32 = [AOFFSET], 1 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f84 = [C3] +#else + nop __LINE__ +#endif + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f92 = [C4] +#else + nop __LINE__ +#endif + (p3) FMA f104 = f40, f61, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + adds L = -1, L + } + { .mmb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f100 = [C5] + (p5) LDFD f108 = [C6] +#else + nop __LINE__ + nop __LINE__ +#endif + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f120 = f40, f63, f120 // A1 * B8 + nop __LINE__ + } + { .mmb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f116 = [C7] + (p5) LDFD f124 = [C8] +#else + nop __LINE__ + nop __LINE__ +#endif + br.cloop.sptk.few .L042 + } + ;; +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + FMA f64 = ALPHA, f64, f68 + FMA f72 = ALPHA, f72, f76 + FMA f80 = ALPHA, f80, f84 + FMA f88 = ALPHA, f88, f92 + + FMA f96 = ALPHA, f96, f100 + FMA f104 = ALPHA, f104, f108 + FMA f112 = ALPHA, f112, f116 + FMA f120 = ALPHA, f120, f124 + ;; + STFD [C1 ] = f64, SIZE + mov f64 = f0 + STFD [C2 ] = f72, SIZE + mov f72 = f0 + ;; + STFD [C3 ] = f80, SIZE + mov f80 = f0 + STFD [C4 ] = f88, SIZE + mov f88 = f0 + ;; + STFD [C5 ] = f96, SIZE + mov f96 = f0 + STFD [C6 ] = f104, SIZE + mov f104 = f0 + ;; + STFD [C7 ] = f112, SIZE + mov f112 = f0 + STFD [C8 ] = f120, SIZE + mov f120 = f0 + ;; +#else + FMPY f64 = ALPHA, f64 + FMPY f72 = ALPHA, f72 + FMPY f80 = ALPHA, f80 + FMPY f88 = ALPHA, f88 + + { .mfi + FMPY f96 = ALPHA, f96 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMPY f104 = ALPHA, f104 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f112 = ALPHA, f112 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -1, L +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMPY f120 = ALPHA, f120 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -8, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C1 ] = f64, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C2 ] = f72, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f80, SIZE + mov f80 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + add AOFFSET = KK8, AOFFSET +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C4 ] = f88, SIZE + mov f88 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || 
(!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 3, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C5 ] = f96, SIZE + mov f96 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 1, KK +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C6 ] = f104, SIZE + mov f104 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C7 ] = f112, SIZE + mov f112 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C8 ] = f120, SIZE + mov f120 = f0 + nop __LINE__ + } + ;; +#endif + .align 32 + +.L049: + { .mmi + mov B = BOFFSET + mov AOFFSET = A +#if defined(TRMMKERNEL) && !defined(LEFT) + adds KK = 8, KK +#else + nop __LINE__ +#endif + } + ;; + { .mib + cmp.lt p6, p0 = 0, J + shr BB = K, 3 + (p6) br.cond.dptk .L010 + } + ;; + .align 32 + +.L050: + { .mfi + mov C1 = C + mov f64 = f0 + tbit.z p6, p0 = N, 2 + } + { .mfi + add C2 = LDC, C + mov f72 = f0 + shr I = M, 3 + } + ;; + { .mfi + shladd C3 = LDC, 1, C + mov f80 = f0 + nop __LINE__ + } + { .mfb + mov AOFFSET = A + mov f88 = f0 + (p6) br.cond.dpnt .L090 + } + ;; + { .mfi + cmp.eq p6, p7 = 0, I + mov f65 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + mov KK = OFFSET +#else + nop __LINE__ +#endif + } + { .mfi + shladd C4 = LDC, 1, C2 + mov f73 = f0 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + mov f81 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + shladd C = LDC, 2, C + mov f89 = f0 + (p6) br.cond.dpnt .L060 + } + ;; + .align 32 + +.L052: +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mfb + LDFPD f48, f49 = [B] + mov f66 = f0 + nop __LINE__ + } + { .mfb + adds BOFFSET = 2 * SIZE, B + mov f74 = f0 + nop __LINE__ + } + ;; +#else + { .mfi + shladd BOFFSET = KK8, 2, B + mov f66 = f0 + shladd AOFFSET = KK8, 3, AOFFSET + } + ;; + { .mfi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f74 = f0 + nop __LINE__ + } + ;; +#endif + ;; + { .mfi + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f82 = f0 +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 8, KK +#else + adds L = 4, KK +#endif +#endif + } + { .mfi + setf.d f84 = r0 + mov f90 = f0 + nop __LINE__ + } + ;; + { .mfi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f67 = f0 + adds PREC = CPREFETCHSIZE * SIZE, C1 + } + { .mfi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + mov f75 = f0 +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; + { .mfi + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + mov f83 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + setf.d f91 = r0 + mov f68 = f0 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + } + ;; + { .mfi + CPREFETCH [PREC], LDC + mov f76 = f0 + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + } + { .mfi + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + mov f92 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + CPREFETCH [PREC], LDC + mov f69 = f0 + shr L = L, 1 + } + { .mmf + setf.d f77 = r0 + setf.d f85 = r0 + mov f93 = f0 + } + ;; + { .mfi + CPREFETCH [PREC], LDC + mov f70 = f0 + adds L = -1, L + } + { .mmf + setf.d f78 = r0 + setf.d f86 = r0 + mov f94 = f0 + } + ;; + { .mfi + CPREFETCH [PREC] + mov f71 = f0 + mov ar.lc = L + } + { .mmf + setf.d f79 = r0 + setf.d f87 = r0 + mov f95 = f0 + } + ;; + .align 32 + +.L053: + { .mfb + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop 
__LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + adds C9 = 4 * SIZE, C1 + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + adds C10 = 4 * SIZE, C2 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + adds C11 = 4 * SIZE, C3 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + adds C12 = 4 * SIZE, C4 + } + { .mfb + nop __LINE__ + FMA f89 = f33, f51, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f90 = f34, f51, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f91 = f35, f51, f91 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f76 = f36, f49, f76 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f84 = f36, f50, f84 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f92 = f36, f51, f92 // A5 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f77 = f37, f49, f77 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f85 = f37, f50, f85 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f93 = f37, f51, f93 // A6 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f78 = f38, f49, f78 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f86 = f38, f50, f86 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f94 = f38, f51, f94 // A7 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f79 = f39, f49, f79 // A8 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f87 = f39, f50, f87 // A8 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f95 = f39, f51, f95 // A8 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 
// A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f96 = [C1 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f97 = [C9 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f98 = [C1 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f99 = [C9 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f90 = f42, f59, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f100 = [C1 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f101 = [C9 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f102 = [C1 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f103 = [C9 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f91 = f43, f59, f91 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f104 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f105 = [C10], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f76 = f44, f57, f76 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f106 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f84 = f44, f58, f84 // A5 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f107 = [C10], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f92 = f44, f59, f92 // A5 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f108 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f109 = [C10], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f77 = f45, f57, f77 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f110 = [C2 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f85 = f45, f58, f85 // A6 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f111 = [C10], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f93 = f45, f59, f93 // A6 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f112 = [C3 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f113 = [C11], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f78 = f46, f57, f78 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f114 = [C3 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f86 = f46, f58, f86 // A7 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f115 = [C11], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f94 = f46, f59, f94 // A7 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f116 = [C3 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f117 = [C11], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f79 = f47, f57, f79 // A8 * B2 + nop __LINE__ + } + ;; + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f118 = [C3 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f87 = f47, f58, f87 // A8 * B3 + adds L = -1, L + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f119 = [C11], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f95 = f47, f59, f95 // A8 * B4 + br.cloop.sptk.few .L053 + } + ;; + .align 32 + +.L058: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + { .mfi + LDFD f120 = [C4 ], SIZE + FMA f64 = ALPHA, f64, f96 + cmp.ne p6, p0 = 1, I + } + { .mfb + LDFD f121 = [C12], SIZE + FMA f68 = ALPHA, f68, f97 + nop __LINE__ + } + ;; + { .mfi + LDFD f122 = [C4 ], SIZE + FMA f65 = ALPHA, f65, f98 + adds I = -1, I + } + { .mfb + LDFD f123 = [C12], SIZE + FMA f69 = ALPHA, f69, f99 + nop __LINE__ + } + ;; + { .mfb + LDFD f124 = [C4 ], SIZE + FMA f66 = ALPHA, f66, f100 + nop __LINE__ + } + { .mfb + LDFD f125 = [C12], SIZE + FMA f70 = ALPHA, f70, f101 + nop __LINE__ + } + ;; + { .mfb + LDFD f126 = [C4 ], -3 * SIZE + FMA f67 = ALPHA, f67, f102 + nop __LINE__ + } + { .mfb + LDFD f127 = [C12], -3 * SIZE + FMA f71 = ALPHA, f71, f103 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f64, SIZE + FMA f72 = ALPHA, f72, f104 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f68, SIZE + FMA f76 = ALPHA, f76, f105 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f65, SIZE + FMA f73 = ALPHA, f73, f106 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f69, SIZE + FMA f77 = ALPHA, f77, f107 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f66, SIZE + FMA f74 = ALPHA, f74, f108 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f70, SIZE + FMA f78 = ALPHA, f78, f109 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f67, 5 * SIZE + FMA f75 = ALPHA, f75, f110 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f71, 5 * SIZE + FMA f79 = ALPHA, f79, f111 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f72, SIZE + FMA f80 = ALPHA, f80, f112 + nop __LINE__ + } + { .mfb + STFD [C10] = f76, SIZE + FMA f84 = ALPHA, f84, f113 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f73, SIZE + FMA f81 = ALPHA, f81, f114 + nop __LINE__ + } + { .mfb + STFD [C10] = f77, SIZE + FMA f85 = ALPHA, f85, f115 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f74, SIZE + FMA f82 = ALPHA, f82, f116 + nop __LINE__ + } + { .mfb + STFD [C10] = f78, SIZE + FMA f86 = ALPHA, f86, f117 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f75, 5 * SIZE + FMA f83 = ALPHA, f83, f118 + nop __LINE__ + } + { .mfb + STFD [C10] = f79, 5 * SIZE + FMA f87 = ALPHA, f87, f119 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f80, SIZE + FMA f88 = ALPHA, f88, f120 + nop __LINE__ + } + { .mfb + STFD [C11] = f84, SIZE + FMA f92 = ALPHA, f92, f121 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f81, SIZE + FMA f89 = ALPHA, f89, f122 + nop __LINE__ + } + { .mfb + STFD [C11] = f85, SIZE + FMA f93 = ALPHA, f93, f123 + nop 
__LINE__ + } + ;; + { .mfb + STFD [C3 ] = f82, SIZE + FMA f90 = ALPHA, f90, f124 + nop __LINE__ + } + { .mfb + STFD [C11] = f86, SIZE + FMA f94 = ALPHA, f94, f125 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f83, 5 * SIZE + FMA f91 = ALPHA, f91, f126 + nop __LINE__ + } + { .mfb + STFD [C11] = f87, 5 * SIZE + FMA f95 = ALPHA, f95, f127 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f88, SIZE + mov f64 = f0 + nop __LINE__ + } + { .mfb + STFD [C12] = f92, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f89, SIZE + mov f80 = f0 + nop __LINE__ + } + { .mfb + STFD [C12] = f93, SIZE + mov f88 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f90, SIZE + mov f65 = f0 + nop __LINE__ + } + { .mfb + STFD [C12] = f94, SIZE + mov f73 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f91, 5 * SIZE + mov f81 = f0 + nop __LINE__ + } + { .mfb + STFD [C12] = f95, 5 * SIZE + mov f89 = f0 + (p6) br.cond.dptk .L052 + } + ;; +#else + { .mfi + nop __LINE__ + FMPY f64 = ALPHA, f64 + cmp.ne p6, p0 = 1, I + } + { .mfb + nop __LINE__ + FMPY f68 = ALPHA, f68 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f65 = ALPHA, f65 + adds I = -1, I + } + { .mfb + nop __LINE__ + FMPY f69 = ALPHA, f69 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMPY f66 = ALPHA, f66 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMPY f70 = ALPHA, f70 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMPY f67 = ALPHA, f67 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMPY f71 = ALPHA, f71 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f64, SIZE + FMPY f72 = ALPHA, f72 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f68, SIZE + FMPY f76 = ALPHA, f76 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f65, SIZE + FMPY f73 = ALPHA, f73 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f69, SIZE + FMPY f77 = ALPHA, f77 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f66, SIZE + FMPY f74 = ALPHA, f74 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f70, SIZE + FMPY f78 = ALPHA, f78 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f67, 5 * SIZE + FMPY f75 = ALPHA, f75 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f71, 5 * SIZE + FMPY f79 = ALPHA, f79 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f72, SIZE + FMPY f80 = ALPHA, f80 + nop __LINE__ + } + { .mfb + STFD [C10] = f76, SIZE + FMPY f84 = ALPHA, f84 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f73, SIZE + FMPY f81 = ALPHA, f81 + nop __LINE__ + } + { .mfb + STFD [C10] = f77, SIZE + FMPY f85 = ALPHA, f85 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f74, SIZE + FMPY f82 = ALPHA, f82 + nop __LINE__ + } + { .mfb + STFD [C10] = f78, SIZE + FMPY f86 = ALPHA, f86 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f75, 5 * SIZE + FMPY f83 = ALPHA, f83 + nop __LINE__ + } + { .mfb + STFD [C10] = f79, 5 * SIZE + FMPY f87 = ALPHA, f87 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f80, SIZE + FMPY f88 = ALPHA, f88 + nop __LINE__ + } + { .mfb + STFD [C11] = f84, SIZE + FMPY f92 = ALPHA, f92 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f81, SIZE + FMPY f89 = ALPHA, f89 + nop __LINE__ + } + { .mfb + STFD [C11] = f85, SIZE + FMPY f93 = ALPHA, f93 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f82, SIZE + FMPY f90 = ALPHA, f90 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C11] = f86, SIZE + FMPY f94 = ALPHA, f94 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f83, 5 * SIZE + FMPY f91 = ALPHA, f91 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + 
adds L = -8, L +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C11] = f87, 5 * SIZE + FMPY f95 = ALPHA, f95 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -4, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C4 ] = f88, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C12] = f92, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f89, SIZE + mov f80 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 3, AOFFSET +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C12] = f93, SIZE + mov f88 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 2, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C4 ] = f90, SIZE + mov f65 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 8, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C12] = f94, SIZE + mov f73 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f91, 5 * SIZE + mov f81 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C12] = f95, 5 * SIZE + mov f89 = f0 + (p6) br.cond.dptk .L052 + } + ;; +#endif + .align 32 + +.L060: + { .mfi + nop __LINE__ + mov f66 = f0 + tbit.z p6, p7 = M, 2 + } + { .mfb +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 4, KK +#else + adds L = 4, KK +#endif +#endif + mov f74 = f0 + (p6) br.cond.dptk .L070 + } + ;; + +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mfb + LDFPD f48, f49 = [B] + mov f82 = f0 + nop __LINE__ + } + { .mfi + adds BOFFSET = 2 * SIZE, B + mov f90 = f0 +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#else + { .mfi + shladd BOFFSET = KK8, 2, B + mov f82 = f0 + shladd AOFFSET = KK8, 2, AOFFSET + } + ;; + { .mfi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f90 = f0 +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#endif + ;; + { .mii + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mfi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + mov f67 = f0 + adds L = -1, L + } + { .mfi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + mov f75 = f0 + nop __LINE__ + } + ;; + { .mfi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f83 = f0 + mov ar.lc = L + } + { .mfi + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + mov f91 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + .align 32 + +.L062: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + (p5) adds C9 = 2 * SIZE, C1 + } + { .mfi + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + (p5) adds C10 = 2 * SIZE, C2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + (p5) adds C11 = 2 * SIZE, C3 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + (p5) adds C12 = 2 * SIZE, C4 + } + ;; + { .mfb + (p3) LDFPD f40, f41 = 
[AOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f89 = f33, f51, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f90 = f34, f51, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f68 = [C1 ], SIZE +#else + nop __LINE__ +#endif + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f70 = [C9 ], SIZE +#else + nop __LINE__ +#endif + FMA f91 = f35, f51, f91 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f69 = [C1 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f71 = [C9 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f76 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f78 = [C10], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f77 = [C2 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f79 = [C10], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f84 = [C3 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f86 = [C11], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f85 = [C3 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f87 = [C11], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f90 = f42, f59, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f92 = [C4 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f94 = [C12], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfi +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f93 = [C4 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + adds L = -1, L + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f95 = [C12], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f91 = f43, f59, f91 // A4 * B4 + br.cloop.sptk.few .L062 + } + ;; +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + FMA f64 = ALPHA, f64, f68 + FMA f66 = ALPHA, f66, f70 + FMA f65 = ALPHA, f65, f69 + FMA f67 = ALPHA, f67, f71 + FMA f72 = ALPHA, f72, f76 + FMA f74 = ALPHA, f74, f78 + FMA f73 = ALPHA, f73, f77 + FMA f75 = ALPHA, f75, f79 + ;; + { .mfb + STFD [C1 ] = f64, SIZE + FMA f80 = ALPHA, f80, f84 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f66, SIZE + FMA f82 = ALPHA, f82, f86 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f65, 3 * SIZE + FMA f81 = ALPHA, f81, f85 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f67, 3 * SIZE + FMA f83 = ALPHA, f83, f87 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f72, SIZE + FMA f88 = ALPHA, f88, f92 + nop __LINE__ + } + { .mfb + STFD [C10] = f74, SIZE + FMA f90 = ALPHA, f90, f94 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f73, 3 * SIZE + FMA f89 = ALPHA, f89, f93 + nop __LINE__ + } + { .mfb + STFD [C10] = f75, 3 * SIZE + FMA f91 = ALPHA, f91, f95 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f80, SIZE + mov f80 = f0 + nop __LINE__ + } + { .mfb + STFD [C11] = f82, SIZE + mov f64 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f81, 3 * SIZE + mov f81 = f0 + nop __LINE__ + } + { .mfb + STFD [C11] = f83, 3 * SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f88, SIZE + mov f88 = f0 + adds L = 1, K + } + { .mfb + STFD [C12] = f90, SIZE + mov f65 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f89, 3 * SIZE + mov f89 = f0 + shr L = L, 1 + } + { .mfb + STFD [C12] = f91, 3 * SIZE + mov f73 = f0 + nop __LINE__ + } + ;; +#else + FMPY f64 = ALPHA, f64 + FMPY f66 = ALPHA, f66 + FMPY f65 = ALPHA, f65 + FMPY f67 = ALPHA, f67 + FMPY f72 = ALPHA, f72 + FMPY f74 = ALPHA, f74 + FMPY f73 = ALPHA, f73 + FMPY f75 = ALPHA, f75 + ;; + { .mfb + STFD [C1 ] = f64, SIZE + FMPY f80 = ALPHA, f80 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f66, SIZE + FMPY f82 = ALPHA, f82 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f65, 3 * SIZE + FMPY f81 = ALPHA, f81 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f67, 3 * SIZE + FMPY f83 = ALPHA, f83 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f72, SIZE + FMPY f88 = ALPHA, f88 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C10] = f74, SIZE + FMPY f90 = ALPHA, f90 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f73, 3 * SIZE + FMPY f89 = ALPHA, f89 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -4, L +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C10] = f75, 3 * SIZE + FMPY f91 = ALPHA, f91 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -4, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C3 ] = f80, SIZE + mov f80 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C11] = f82, SIZE + mov f64 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f81, 3 * SIZE + mov f81 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && 
!defined(TRANSA))) + shladd AOFFSET = KK8, 2, AOFFSET +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C11] = f83, 3 * SIZE + mov f72 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 2, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C4 ] = f88, SIZE + mov f88 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 4, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C12] = f90, SIZE + mov f65 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f89, 3 * SIZE + mov f89 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C12] = f91, 3 * SIZE + mov f73 = f0 + nop __LINE__ + } + ;; +#endif + .align 32 + +.L070: + { .mib +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 2, KK +#else + adds L = 4, KK +#endif +#endif + tbit.z p6,p7 = M, 1 + (p6) br.cond.dptk .L080 + } + ;; + +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mmi + LDFPD f48, f49 = [B] + adds BOFFSET = 2 * SIZE, B +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#else + { .mmi + shladd BOFFSET = KK8, 2, B + shladd AOFFSET = KK8, 1, AOFFSET + nop __LINE__ + } + ;; + { .mmi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + nop __LINE__ + } + ;; +#endif + { .mii + cmp.eq p3, p0 = r0, r0 + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + adds L = -1, L + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + } + ;; + { .mmi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + mov ar.lc = L + } + ;; + .align 32 + +.L072: + { .mfb + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + } + { .mmf +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f68 = [C1 ], SIZE + (p5) LDFD f76 = [C2 ], SIZE +#else + nop __LINE__ + nop __LINE__ +#endif + FMA f89 = f33, f51, f89 // A2 * B4 + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mmf +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f69 = [C1 ], -1 * SIZE + (p5) LDFD f77 = [C2 ], -1 * SIZE +#else + nop __LINE__ + nop __LINE__ +#endif + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mmf +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f84 = [C3 ], SIZE + (p5) LDFD f92 = [C4 ], SIZE +#else + nop __LINE__ + nop __LINE__ +#endif + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f85 = [C3 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + adds L = -1, L + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f93 = [C4 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + br.cloop.sptk.few .L072 + } + ;; +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + FMA f64 = ALPHA, f64, f68 + FMA f65 = ALPHA, f65, f69 + FMA f72 = ALPHA, f72, f76 + FMA f73 = ALPHA, f73, f77 + + FMA f80 = ALPHA, f80, f84 + FMA f81 = ALPHA, f81, f85 + FMA f88 = ALPHA, f88, f92 + FMA f89 = ALPHA, f89, f93 + ;; + { .mfb + STFD [C1 ] = f64, SIZE + mov f64 = f0 + nop __LINE__ + } + { .mfb + STFD [C2 ] = f72, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + STFD [C2 ] = f73, SIZE + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f80, SIZE + mov f80 = f0 + adds L = 1, K + } + { .mfb + STFD [C4 ] = f88, SIZE + mov f88 = f0 + nop __LINE__ + } + ;; + { .mmi + STFD [C3 ] = f81, SIZE + STFD [C4 ] = f89, SIZE + shr L = L, 1 + } + ;; +#else + FMPY f64 = ALPHA, f64 + FMPY f65 = ALPHA, f65 + ;; + { .mfi + nop __LINE__ + FMPY f72 = ALPHA, f72 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMPY f73 = ALPHA, f73 + nop __LINE__ + } + ;; + { .mfi + FMPY f80 = ALPHA, f80 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -2, L +#else + nop __LINE__ +#endif + } + { .mfi + FMPY f81 = ALPHA, f81 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -4, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + nop __LINE__ + FMPY f88 = ALPHA, f88 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMPY f89 = ALPHA, f89 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f64, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 1, AOFFSET +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C2 ] = f72, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + STFD [C2 ] = f73, SIZE +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 2, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C3 ] = f80, SIZE + mov f80 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 2, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C4 ] = f88, SIZE + mov f88 = f0 + nop __LINE__ + } + ;; + { .mmi + STFD [C3 ] = f81, SIZE + STFD [C4 ] = f89, SIZE +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; +#endif + .align 32 + +.L080: + { .mib +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub 
L = K, KK +#elif defined(LEFT) + adds L = 1, KK +#else + adds L = 4, KK +#endif +#endif + tbit.z p6,p7 = M, 0 + (p6) br.cond.dptk .L089 + } + ;; +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mmi + LDFPD f48, f49 = [B] + adds BOFFSET = 2 * SIZE, B +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#else + { .mmi + shladd BOFFSET = KK8, 2, B + add AOFFSET = KK8, AOFFSET + nop __LINE__ + } + ;; + { .mmi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + nop __LINE__ + } + ;; +#endif + + { .mii + LDFD f32 = [AOFFSET], 1 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + nop __LINE__ + nop __LINE__ + adds L = -1, L + } + ;; + { .mmi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + cmp.eq p3, p0 = r0, r0 + mov ar.lc = L + } + ;; + .align 32 + +.L082: + { .mfb + cmp.ne p4, p5 = 0, L + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + (p12) cmp.ne p3, p0 = 0, L + FMA f72 = f32, f49, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + (p3) LDFD f40 = [AOFFSET], 1 * SIZE + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f68 = [C1] +#else + nop __LINE__ +#endif + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mmf + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p4) LDFD f32 = [AOFFSET], 1 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + } + { .mmf +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f76 = [C2] + (p5) LDFD f84 = [C3] +#else + nop __LINE__ + nop __LINE__ +#endif + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + } + ;; + { .mib + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + nop __LINE__ + nop __LINE__ + } + { .mmb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f92 = [C4] +#else + nop __LINE__ +#endif + adds L = -1, L + br.cloop.sptk.few .L082 + } + ;; +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + FMA f64 = ALPHA, f64, f68 + FMA f72 = ALPHA, f72, f76 + FMA f80 = ALPHA, f80, f84 + FMA f88 = ALPHA, f88, f92 + ;; + STFD [C1 ] = f64, SIZE + STFD [C2 ] = f72, SIZE + STFD [C3 ] = f80, SIZE + STFD [C4 ] = f88, SIZE + ;; +#else + { .mfi + nop __LINE__ + FMPY f64 = ALPHA, f64 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMPY f72 = ALPHA, f72 + nop __LINE__ + } + ;; + { .mfi + FMPY f80 = ALPHA, f80 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -1, L +#else + nop __LINE__ +#endif + } + { .mfi + FMPY f88 = ALPHA, f88 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -4, L +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + ;; +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + add AOFFSET = KK8, AOFFSET +#else + nop __LINE__ +#endif +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 2, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi + STFD [C1 ] = f64, SIZE + STFD [C2 ] = f72, SIZE +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 1, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi + STFD [C3 ] = f80, SIZE + STFD [C4 ] = f88, SIZE +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; +#endif + .align 32 + +.L089: + { .mmi + mov B = BOFFSET + mov AOFFSET = A +#if defined(TRMMKERNEL) && !defined(LEFT) + adds KK = 4, KK +#else + nop __LINE__ +#endif + } + ;; + .align 16 + +.L090: + { .mfi + mov C1 = C + mov f64 = f0 + tbit.z p6, p0 = N, 1 + } + { .mfi + add C2 = LDC, C + mov f72 = f0 + shr I = M, 3 + } + ;; + { .mfi + setf.d f66 = r0 + mov f65 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + mov KK = OFFSET +#else + nop __LINE__ +#endif + } + { .mfb + mov AOFFSET = A + mov f73 = f0 + (p6) br.cond.dpnt .L130 + } + ;; + { .mfi +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + mov f67 = f0 + shladd C = LDC, 1, C + } + { .mfb + cmp.eq p6, p7 = 0, I + mov f74 = f0 + (p6) br.cond.dpnt .L100 + } + ;; + .align 32 + +.L092: +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mfb + LDFPD f48, f49 = [B] + mov f68 = f0 + nop __LINE__ + } + { .mfb + adds BOFFSET = 2 * SIZE, B + mov f79 = f0 + nop __LINE__ + } + ;; +#else + { .mfi + shladd BOFFSET = KK8, 1, B + mov f68 = f0 + shladd AOFFSET = KK8, 3, AOFFSET + } + ;; + { .mfi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f79 = f0 + nop __LINE__ + } + ;; +#endif + + { .mfi + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f75 = f0 +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 8, KK +#else + adds L = 2, KK +#endif +#endif + } + ;; + { .mfi + adds PREC = CPREFETCHSIZE * SIZE, C1 + mov f76 = f0 +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; + { .mfi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + mov f69 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + cmp.eq p3, p0 = r0, r0 + mov f77 = f0 + shr L = L, 1 + } + ;; + { 
.mfi + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + adds L = -1, L + } + { .mmf + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + CPREFETCH [PREC], LDC + mov f70 = f0 + } + ;; + { .mfi + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + mov f78 = f0 + mov ar.lc = L + } + { .mfi + CPREFETCH [PREC] + mov f71 = f0 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + } + ;; + .align 32 + +.L093: +/* 1 */ + { .mfi + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 4 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + adds C9 = 4 * SIZE, C1 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + adds C10 = 4 * SIZE, C2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + adds C11 = 4 * SIZE, C3 + } + { .mfi + nop __LINE__ + FMA f74 = f34, f49, f74 // A3 * B2 + adds C12 = 4 * SIZE, C4 + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f96 = [C1 ], SIZE +#else + nop __LINE__ +#endif + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f97 = [C9 ], SIZE +#else + nop __LINE__ +#endif + FMA f76 = f36, f49, f76 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f98 = [C1 ], SIZE +#else + nop __LINE__ +#endif + FMA f77 = f37, f49, f77 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f99 = [C9 ], SIZE +#else + nop __LINE__ +#endif + FMA f78 = f38, f49, f78 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f100 = [C1 ], SIZE +#else + nop __LINE__ +#endif + FMA f79 = f39, f49, f79 // A8 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f101 = [C9 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f102 = [C1 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f103 = [C9 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f104 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f105 = [C10], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f76 = f44, f57, f76 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f106 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f107 = [C10], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f77 = f45, f57, f77 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f108 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f109 = [C10], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f78 = f46, f57, f78 // A7 * B2 + nop __LINE__ + } + ;; + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f110 = [C2 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + adds L = -1, L + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f111 = [C10], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f79 = f47, f57, f79 // A8 * B2 + br.cloop.sptk.few .L093 + } + ;; +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + { .mfi + nop __LINE__ + FMA f64 = ALPHA, f64, f96 + cmp.ne p6, p0 = 1, I + } + { .mfb + nop __LINE__ + FMA f68 = ALPHA, f68, f97 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA f65 = ALPHA, f65, f98 + adds I = -1, I + } + { .mfb + nop __LINE__ + FMA f69 = ALPHA, f69, f99 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA f66 = ALPHA, f66, f100 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f70 = ALPHA, f70, f101 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f67 = ALPHA, f67, f102 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f71 = ALPHA, f71, f103 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f64, SIZE + FMA f72 = ALPHA, f72, f104 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f68, SIZE + FMA f76 = ALPHA, f76, f105 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f65, SIZE + FMA f73 = ALPHA, f73, f106 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f69, SIZE + FMA f77 = ALPHA, f77, f107 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f66, SIZE + FMA f74 = ALPHA, f74, f108 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f70, SIZE + FMA f78 = ALPHA, f78, f109 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f67, 5 * SIZE + FMA f75 = ALPHA, f75, f110 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f71, 5 * SIZE + FMA f79 = ALPHA, f79, f111 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f72, SIZE + mov f64 = f0 + nop __LINE__ + } + { .mfb + STFD [C10] = f76, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f73, SIZE + mov f65 = f0 + nop __LINE__ + } + { .mfb + STFD [C10] = f77, SIZE + mov f73 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f74, SIZE + mov f66 = f0 + nop __LINE__ + } + { .mfb + STFD [C10] = f78, SIZE + mov f74 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f75, 5 * SIZE + mov f67 = f0 + nop __LINE__ + } + { .mfb + STFD [C10] = f79, 5 * SIZE + (p6) br.cond.dptk .L092 + } + ;; +#else + { .mfi + nop __LINE__ + FMPY f64 = ALPHA, f64 + cmp.ne p6, p0 = 1, I + } + { .mfb + nop __LINE__ + FMPY f68 = ALPHA, f68 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f65 = ALPHA, f65 + adds I = -1, I + } + { .mfb + nop __LINE__ + FMPY f69 = ALPHA, f69 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f66 = ALPHA, f66 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMPY f70 = ALPHA, f70 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMPY f67 = ALPHA, f67 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMPY f71 = ALPHA, f71 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f64, SIZE + FMPY f72 = ALPHA, f72 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f68, SIZE + FMPY f76 = ALPHA, f76 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f65, SIZE + FMPY f73 = ALPHA, f73 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f69, SIZE + FMPY f77 = ALPHA, f77 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f66, SIZE + FMPY f74 = ALPHA, f74 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C9 ] = f70, SIZE + FMPY f78 = ALPHA, f78 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f67, 5 * SIZE + FMPY f75 = ALPHA, f75 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -8, L +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C9 ] = f71, 5 * SIZE + FMPY f79 = ALPHA, f79 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -2, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C2 ] = f72, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && 
defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C10] = f76, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f73, SIZE + mov f65 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 3, AOFFSET +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C10] = f77, SIZE + mov f73 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 1, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C2 ] = f74, SIZE + mov f66 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 8, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C10] = f78, SIZE + mov f74 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f75, 5 * SIZE + mov f67 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mib + STFD [C10] = f79, 5 * SIZE + nop __LINE__ + (p6) br.cond.dptk .L092 + } + ;; +#endif + .align 32 + +.L100: + { .mib +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 4, KK +#else + adds L = 2, KK +#endif +#endif + tbit.z p6, p7 = M, 2 + (p6) br.cond.dptk .L110 + } + ;; +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mmf + LDFPD f48, f49 = [B] + adds BOFFSET = 2 * SIZE, B + mov f75 = f0 + } + { .mii + nop __LINE__ +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#else + { .mfi + shladd BOFFSET = KK8, 1, B + mov f75 = f0 + shladd AOFFSET = KK8, 2, AOFFSET + } + ;; + { .mmi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + nop __LINE__ + adds L = 1, L + } + ;; +#endif + ;; + { .mii + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + nop __LINE__ + adds L = -1, L + } + ;; + { .mmi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + cmp.eq p3, p0 = r0, r0 + mov ar.lc = L + } + ;; + .align 32 + +.L102: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 4 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + adds C9 = 2 * SIZE, C1 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + adds C10 = 2 * SIZE, C2 + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f68 = [C1 ], SIZE +#else + nop __LINE__ +#endif + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f70 = [C9 ], SIZE +#else + nop __LINE__ +#endif + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f69 = [C1 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f71 = [C9 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f76 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + adds L = -1, L + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f78 = [C10], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + br.cloop.sptk.few .L102 + } + ;; +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + { .mfb + LDFD f77 = [C2 ], -1 * SIZE + FMA f64 = ALPHA, f64, f68 + nop __LINE__ + } + { .mfb + LDFD f79 = [C10], -1 * SIZE + FMA f66 = ALPHA, f66, f70 + nop __LINE__ + } + ;; + FMA f65 = ALPHA, f65, f69 + adds L = 1, K + FMA f67 = ALPHA, f67, f71 + ;; + FMA f72 = ALPHA, f72, f76 + shr L = L, 1 + FMA f74 = ALPHA, f74, f78 + FMA f73 = ALPHA, f73, f77 + FMA f75 = ALPHA, f75, f79 + ;; + { .mmf + STFD [C1 ] = f64, SIZE + STFD [C9 ] = f66, SIZE + mov f64 = f0 + } + ;; + { .mmf + STFD [C1 ] = f65, 3 * SIZE + STFD [C9 ] = f67, 3 * SIZE + mov f65 = f0 + } + ;; + { .mmf + STFD [C2 ] = f72, SIZE + STFD [C10] = f74, SIZE + mov f72 = f0 + } + ;; + { .mmf + STFD [C2 ] = f73, 3 * SIZE + STFD [C10] = f75, 3 * SIZE + mov f73 = f0 + } + ;; +#else + { .mfb + nop __LINE__ + FMPY f64 = ALPHA, f64 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMPY f66 = ALPHA, f66 + nop __LINE__ + } + ;; + FMPY f65 = ALPHA, f65 + FMPY f67 = ALPHA, f67 + ;; + { .mfi + nop __LINE__ + FMPY f72 = ALPHA, f72 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMPY f74 = ALPHA, f74 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f73 = ALPHA, f73 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -4, L +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMPY f75 = ALPHA, f75 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -2, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C1 ] = f64, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mmi + STFD [C9 ] = f66, SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f65, 3 * SIZE + mov f65 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 2, AOFFSET +#else + nop __LINE__ +#endif + } + { .mmi + STFD [C9 ] = f67, 3 * SIZE + nop __LINE__ +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 1, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C2 ] = f72, SIZE + mov f72 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 4, KK +#else 
+ nop __LINE__ +#endif + } + { .mmi + STFD [C10] = f74, SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f73, 3 * SIZE + mov f73 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mib + STFD [C10] = f75, 3 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; +#endif + + .align 32 + +.L110: + { .mib +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 2, KK +#else + adds L = 2, KK +#endif +#endif + tbit.z p6, p7 = M, 1 + (p6) br.cond.dptk .L120 + } + ;; + +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mmi + LDFPD f48, f49 = [B] + adds BOFFSET = 2 * SIZE, B +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#else + { .mmi + shladd BOFFSET = KK8, 1, B + shladd AOFFSET = KK8, 1, AOFFSET + } + ;; + { .mmi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + nop __LINE__ + adds L = 1, L + } + ;; +#endif + ;; + { .mii + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + nop __LINE__ + adds L = -1, L + } + ;; + { .mmi + cmp.eq p3, p0 = r0, r0 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + mov ar.lc = L + } + ;; + .align 32 + +.L112: + { .mfi + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + lfetch.nt1 [PREB], 4 * SIZE + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mmf + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + } + { .mmf +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f68 = [C1 ], SIZE + (p5) LDFD f76 = [C2 ], SIZE +#else + nop __LINE__ + nop __LINE__ +#endif + FMA f73 = f33, f49, f73 // A2 * B2 + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f69 = [C1 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + adds L = -1, L + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f77 = [C2 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + br.cloop.sptk.few .L112 + } + ;; +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + FMA f64 = ALPHA, f64, f68 + FMA f65 = ALPHA, f65, f69 + FMA f72 = ALPHA, f72, f76 + FMA f73 = ALPHA, f73, f77 + ;; + { .mfi + STFD [C1 ] = f64, SIZE + mov f64 = f0 + nop __LINE__ + } + { .mfb + STFD [C2 ] = f72, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f65, SIZE + mov f65 = f0 + nop __LINE__ + } + { .mfb + STFD [C2 ] = f73, SIZE + mov f73 = f0 + nop __LINE__ + } + ;; +#else + { .mfi + nop __LINE__ + FMPY f64 = ALPHA, f64 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMPY f65 = ALPHA, f65 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f72 = ALPHA, f72 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -2, L +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMPY f73 = ALPHA, f73 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -2, L +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + ;; +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 1, AOFFSET +#else + nop __LINE__ +#endif +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 1, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + + { .mfi + STFD [C1 ] = f64, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 2, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C2 ] = f72, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f65, SIZE + mov f65 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C2 ] = f73, SIZE + mov f73 = f0 + nop __LINE__ + } + ;; +#endif + .align 32 + +.L120: + { .mib +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 1, KK +#else + adds L = 2, KK +#endif +#endif + tbit.z p6, p7 = M, 0 + (p6) br.cond.dptk .L129 + } + ;; +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mmi + LDFPD f48, f49 = [B] + adds BOFFSET = 2 * SIZE, B +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#else + { .mmi + shladd BOFFSET = KK8, 1, B + add AOFFSET = KK8, AOFFSET + } + ;; + { .mmi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + nop __LINE__ + adds L = 1, L + } + ;; +#endif + { .mii + nop __LINE__ + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + LDFD f32 = [AOFFSET], 1 * SIZE + nop __LINE__ + adds L = -1, L + } + ;; + { .mmi + cmp.eq p3, p0 = r0, r0 + nop __LINE__ + mov ar.lc = L + } + ;; + .align 32 + +.L122: + { .mfi + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mmi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + (p3) LDFD f40 = [AOFFSET], 1 * SIZE + nop __LINE__ + } + { .mmi +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f68 = [C1] + (p5) LDFD f76 = [C2] +#else + nop __LINE__ + nop __LINE__ +#endif + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + adds L = -1, L + } + { .mfb + (p4) LDFD f32 = [AOFFSET], 1 * SIZE + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + br.cloop.sptk.few .L122 + } + ;; + +.L128: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + FMA f64 = ALPHA, f64, f68 + FMA f72 = ALPHA, f72, f76 + ;; + { .mfi + STFD [C1 ] = f64 + mov f64 = f0 + } + { .mfb + STFD [C2 ] = f72 + mov f72 = f0 + } + ;; +#else + { .mfi + nop __LINE__ + FMPY f64 = ALPHA, f64 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMPY f72 = ALPHA, f72 + nop __LINE__ + } + ;; + { .mmi + nop __LINE__ +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -1, L +#else + nop __LINE__ +#endif +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -2, L +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + ;; +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + add AOFFSET = KK8, AOFFSET +#else + nop __LINE__ +#endif +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 1, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 1, KK +#else + nop __LINE__ +#endif + ;; + { .mfi + STFD [C1 ] = f64 + mov f64 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C2 ] = f72 + mov f72 = f0 + } + ;; +#endif + .align 32 + +.L129: + { .mmi + mov B = BOFFSET + mov AOFFSET = A +#if defined(TRMMKERNEL) && !defined(LEFT) + adds KK = 2, KK +#else + nop __LINE__ +#endif + } + ;; + .align 16 + +.L130: + { .mfi +#if defined(TRMMKERNEL) && defined(LEFT) + mov KK = OFFSET +#else + nop __LINE__ +#endif + mov f64 = f0 + tbit.z p6, p0 = N, 0 + } + { .mib + mov AOFFSET = A + shr I = M, 3 + (p6) br.cond.dpnt .L999 + } + ;; + { .mfi + mov C1 = C + mov f65 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + nop __LINE__ + mov f66 = f0 + nop __LINE__ + } + { .mfb + cmp.eq p7, p0 = 0, I + mov f67 = f0 + (p7) br.cond.dpnt .L140 + } + ;; + .align 32 + +.L132: +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mfb + LDFD f48 = [B] + mov f68 = f0 + nop __LINE__ + } + { .mfi + adds BOFFSET = 1 * SIZE, B + mov f69 = f0 +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 8, KK +#else + adds L = 1, KK +#endif +#endif + } + ;; +#else + { .mfi + add BOFFSET = KK8, B + mov f68 = f0 + shladd AOFFSET = KK8, 3, AOFFSET + } + ;; + { .mfi + LDFD f48 = [BOFFSET], 1 * SIZE + mov f69 = f0 +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 8, KK +#else + adds L = 1, KK +#endif +#endif + } + ;; +#endif + { .mfi + 
LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f70 = f0 +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; + { .mii + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mfi + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + mov f71 = f0 + adds L = -1, L + } + ;; + { .mmi + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + adds PREC = CPREFETCHSIZE * SIZE, C1 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mmi + CPREFETCH [PREC] + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + mov ar.lc = L + } + ;; + .align 32 + +.L133: + { .mfi + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + FMA f65 = f33, f48, f65 // A2 * B1 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + adds C9 = 4 * SIZE, C1 + } + { .mmf + (p3) LDFD f56 = [BOFFSET], 1 * SIZE +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f6 = [C1 ], SIZE +#else + nop __LINE__ +#endif + FMA f67 = f35, f48, f67 // A4 * B1 + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f7 = [C9 ], SIZE +#else + nop __LINE__ +#endif + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f10 = [C1 ], SIZE +#else + nop __LINE__ +#endif + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f11 = [C9 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mmf + (p4) LDFD f48 = [BOFFSET], 1 * SIZE +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f12 = [C1 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f13 = [C9 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + adds L = -1, L + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f14 = [C1 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + nop __LINE__ + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f15 = [C9 ], -3 * SIZE +#else + nop __LINE__ +#endif + nop __LINE__ + br.cloop.sptk.few .L133 + } + ;; + +.L138: +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + { .mfi + FMA f64 = ALPHA, f64, f6 + cmp.ne p6, p0 = 1, I + } + { .mfb + FMA f68 = ALPHA, f68, f7 + } + ;; + { .mfi + FMA f65 = ALPHA, f65, f10 + adds I = -1, I + } + { .mfb + FMA f69 = ALPHA, f69, f11 + } + ;; + { .mfi + FMA f66 = ALPHA, f66, f12 + } + { .mfb + FMA f70 = ALPHA, f70, f13 + } + ;; + { .mfb + FMA f67 = ALPHA, f67, f14 + } + { .mfb + FMA f71 = ALPHA, f71, f15 + } + ;; + { .mmf + STFD [C1 ] = f64, SIZE + STFD [C9 ] = f68, SIZE + mov f64 = f0 + } + ;; + { .mmf + STFD [C1 ] = f65, SIZE + STFD [C9 ] = f69, SIZE + mov f65 = f0 + } + ;; + { .mmf + STFD [C1 ] = f66, SIZE + STFD [C9 ] = f70, SIZE + mov f66 = f0 + } + ;; + { .mmf + STFD [C1 ] = f67, 5 * SIZE + nop __LINE__ + mov f67 = f0 + } + { .mmb + STFD [C9 ] = f71, 5 * SIZE + nop __LINE__ + (p6) br.cond.dptk .L132 + } + ;; +#else + { .mfi + FMPY f64 = ALPHA, f64 + cmp.ne p6, p0 = 1, I + } + { .mfb + FMPY f68 = ALPHA, f68 + } + ;; + { .mfi + FMPY f65 = ALPHA, f65 + adds I = -1, I + } + { .mfb + FMPY f69 = ALPHA, f69 + } + ;; + { .mfi + FMPY f66 = ALPHA, f66 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfb + FMPY f70 = ALPHA, f70 + } + ;; + { .mfi + FMPY f67 = ALPHA, f67 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -8, L +#else + nop __LINE__ +#endif + } + { .mfi + FMPY f71 = ALPHA, f71 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -1, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C1 ] = f64, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mmi + STFD [C9 ] = f68, SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f65, SIZE + mov f65 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 3, AOFFSET +#else + nop __LINE__ +#endif + } + { .mmi + STFD [C9 ] = f69, SIZE + nop __LINE__ +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + add BOFFSET = KK8, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C1 ] = f66, SIZE + mov f66 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 8, KK +#else + nop __LINE__ +#endif + } + { .mmi + STFD [C9 ] = f70, SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f67, 5 * SIZE + mov f67 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mmb + STFD [C9 ] = f71, 5 * SIZE + nop __LINE__ + (p6) br.cond.dptk .L132 + } + ;; +#endif + .align 32 + +.L140: + { .mib +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 4, KK +#else + adds L = 1, KK +#endif +#endif + tbit.z p6, p7 = M, 2 + (p6) br.cond.dptk .L150 + } + ;; +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mmi + LDFD f48 = [B] + adds BOFFSET = 1 * SIZE, B +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#else + { .mmi + add BOFFSET = KK8, B + shladd AOFFSET = KK8, 2, AOFFSET + nop __LINE__ + } + ;; + { .mmi + LDFD f48 = [BOFFSET], 1 * SIZE + nop __LINE__ +#ifndef TRMMKERNEL + adds L = 1, K +#else + 
adds L = 1, L +#endif + } + ;; +#endif + { .mii + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + adds L = -1, L + nop __LINE__ + } + ;; + { .mmi + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + mov ar.lc = L + } + ;; + .align 32 + +.L142: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f65 = f33, f48, f65 // A2 * B1 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + (p5) adds C9 = 2 * SIZE, C1 + } + { .mmf +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f68 = [C1 ], SIZE +#else + nop __LINE__ +#endif + (p3) LDFD f56 = [BOFFSET], 1 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + } + ;; + { .mfi + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + (p5) adds C10 = 2 * SIZE, C2 + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f70 = [C9 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mmf + (p4) LDFD f48 = [BOFFSET], 1 * SIZE +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f69 = [C1 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + } + ;; + { .mfi + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + nop __LINE__ + adds L = -1, L + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f71 = [C9 ], -1 * SIZE +#else + nop __LINE__ +#endif + nop.f 0 + br.cloop.sptk.few .L142 + } + ;; + +.L148: +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + FMA f64 = ALPHA, f64, f68 + FMA f66 = ALPHA, f66, f70 + FMA f65 = ALPHA, f65, f69 + FMA f67 = ALPHA, f67, f71 + ;; + { .mfi + STFD [C1 ] = f64, SIZE + mov f64 = f0 + adds L = 1, K + } + { .mfb + STFD [C9 ] = f66, SIZE + mov f66 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f65, 3 * SIZE + mov f65 = f0 + shr L = L, 1 + } + { .mfb + STFD [C9 ] = f67, 3 * SIZE + mov f67 = f0 + nop __LINE__ + } + ;; +#else + { .mfi + nop __LINE__ + FMPY f64 = ALPHA, f64 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMPY f66 = ALPHA, f66 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f65 = ALPHA, f65 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -4, L +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMPY f67 = ALPHA, f67 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -1, L +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + ;; +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 2, AOFFSET +#else + nop __LINE__ +#endif +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + add BOFFSET = KK8, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C1 ] = f64, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 4, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C9 ] = f66, SIZE + mov f66 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f65, 3 * SIZE + mov f65 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C9 ] = f67, 3 * SIZE + mov f67 = f0 + nop __LINE__ + } + ;; +#endif + .align 32 + +.L150: + { .mib +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 2, KK +#else + adds L = 1, KK +#endif +#endif + tbit.z p6, p7 = M, 1 + (p6) br.cond.dptk .L160 + } + ;; +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mmi + LDFD f48 = [B] + adds BOFFSET = 1 * SIZE, B +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#else + { .mmi + add BOFFSET = KK8, B + shladd AOFFSET = KK8, 1, AOFFSET + nop __LINE__ + } + ;; + { .mmi + LDFD f48 = [BOFFSET], 1 * SIZE + nop __LINE__ +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#endif + { .mii + cmp.eq p3, p0 = r0, r0 + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mii + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + adds L = -1, L + ;; + mov ar.lc = L + } + ;; + .align 32 + +.L152: + { .mfi + cmp.ne p4, p5 = 0, L + FMA f64 = f32, f48, f64 // A1 * B1 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mmf + (p3) LDFD f56 = [BOFFSET], 1 * SIZE + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + } + ;; + { .mfi + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + adds L = -1, L + } + ;; + { .mfb + (p4) LDFD f48 = [BOFFSET], 1 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + br.cloop.sptk.few .L152 + } + ;; + +.L158: 
+#if! defined(TRMMKERNEL) && !defined(BETAZERO) + LDFD f68 = [C1 ], SIZE + ;; + LDFD f69 = [C1 ], -1 * SIZE + ;; + FMA f64 = ALPHA, f64, f68 + FMA f65 = ALPHA, f65, f69 + ;; + STFD [C1 ] = f64, SIZE + mov f64 = f0 + ;; + { .mfi + STFD [C1 ] = f65, SIZE + mov f65 = f0 + } + ;; +#else + { .mfi + nop __LINE__ + FMPY f64 = ALPHA, f64 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMPY f65 = ALPHA, f65 + nop __LINE__ + } + ;; + { .mii + nop __LINE__ +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -2, L +#else + nop __LINE__ +#endif +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -1, L +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + ;; +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 1, AOFFSET +#else + nop __LINE__ +#endif +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + add BOFFSET = KK8, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C1 ] = f64, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 2, KK +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C1 ] = f65, SIZE + mov f65 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; +#endif + .align 32 + +.L160: + { .mib +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 1, KK +#else + adds L = 1, KK +#endif +#endif + tbit.z p6, p7 = M, 0 + (p6) br.cond.dptk .L169 + } + ;; +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mmi + LDFD f48 = [B] + adds BOFFSET = 1 * SIZE, B +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#else + { .mmi + add BOFFSET = KK8, B + add AOFFSET = KK8, AOFFSET + nop __LINE__ + } + ;; + { .mmi + LDFD f48 = [BOFFSET], 1 * SIZE + nop __LINE__ +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#endif + ;; + { .mii + LDFD f32 = [AOFFSET], 1 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mii + adds L = -1, L + cmp.eq p3, p0 = r0, r0 + ;; + mov ar.lc = L + } + ;; + .align 32 + +.L162: + { .mmf + cmp.ne p4, p5 = 0, L + (p12) cmp.ne p3, p0 = 0, L + FMA f64 = f32, f48, f64 // A1 * B1 + } + ;; + { .mmi + (p3) LDFD f56 = [BOFFSET], 1 * SIZE + (p3) LDFD f40 = [AOFFSET], 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p4) LDFD f32 = [AOFFSET], 1 * SIZE +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f68 = [C1] +#else + nop __LINE__ +#endif + adds L = -1, L + } + { .mfb + (p4) LDFD f48 = [BOFFSET], 1 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + br.cloop.sptk.few .L162 + } + ;; +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + FMA f64 = ALPHA, f64, f68 +#else + FMPY f64 = ALPHA, f64 +#endif + ;; + STFD [C1 ] = f64 + ;; + .align 32 + +.L169: + { .mmi + mov B = BOFFSET + mov AOFFSET = A +#if defined(TRMMKERNEL) && !defined(LEFT) + adds KK = 1, KK +#else + nop __LINE__ +#endif + } + ;; + .align 16 + + +.L999: + mov r8 = r0 + adds r9 = 1 * 16, SP + ;; + ldf.fill f16 = [SP], 32 + ldf.fill f17 = [r9], 32 + ;; + ldf.fill f18 = [SP], 32 + ldf.fill f19 = [r9], 32 + ;; + ldf.fill f20 = [SP], 32 + ldf.fill f21 = [r9], 32 + ;; + ldf.fill f22 = [SP], 32 + ldf.fill f23 = [r9], 32 + mov ar.lc = ARLC + ;; + ldf.fill f24 = [SP], 32 + ldf.fill f25 = [r9], 32 + mov pr = PR, -1 + ;; + ldf.fill f26 = [SP], 32 + ldf.fill f27 = [r9], 32 + mov ar.pfs = ARPFS + ;; + ldf.fill f28 = [SP], 32 + ldf.fill f29 = [r9], 32 + ;; + ldf.fill f30 = [SP], 32 + ldf.fill f31 = [r9] + + br.ret.sptk.many b0 + EPILOGUE + diff --git a/kernel/ia64/gemm_ncopy.S b/kernel/ia64/gemm_ncopy.S new file mode 100644 index 0000000000..ebb80bfaa6 --- /dev/null +++ b/kernel/ia64/gemm_ncopy.S @@ -0,0 +1,493 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define PREFETCHSIZE 64 +#define WPREFETCHSIZE 32 + +#ifndef XDOUBLE +#define LD LDF8 +#define ST STF8_NTA +#else +#define LD LDFD +#define ST STFD_NTA +#endif + +#define J r15 +#define PREB r17 +#define PREA r18 + +#define A1 r19 +#define A2 r20 +#define A3 r21 +#define A4 r22 +#define A5 r23 +#define A6 r24 +#define A7 r25 +#define A8 r26 +#define B1 r27 +#define B2 r28 + +#define COUNT r9 +#define I r10 +#define II r11 + +#define ARLC r29 +#define PR r30 + +#define M r32 +#define N r33 +#define A r34 +#define LDA r35 +#define B r36 + + PROLOGUE + .prologue + PROFCODE + + .body + { .mii + shladd LDA = LDA, BASE_SHIFT, r0 + mov PR = pr + shr J = N, 3 + } + ;; + { .mib + cmp.eq p8, p0 = 0, J + mov ARLC = ar.lc + (p8) br.cond.dpnt .L20 + } + ;; + .align 32 + +.L11: + { .mmi + mov A1 = A + add A2 = A, LDA + mov pr.rot = 0 + } + { .mmi + shladd A3 = LDA, 1, A + shladd A5 = LDA, 2, A + adds I = 1, M + } + ;; + { .mmi + shladd A4 = LDA, 1, A2 + shladd A6 = LDA, 2, A2 + mov ar.ec = 6 + } + { .mmi + cmp.eq p16, p0 = r0, r0 + shladd A7 = LDA, 2, A3 + shr I = I, 1 + } + ;; + { .mmi + adds B1 = 8 * SIZE, B + shladd A8 = LDA, 2, A4 + shladd A = LDA, 3, A + } + { .mmi + adds I = -1, I + mov COUNT = 0 + adds J = -1, J + } + ;; + { .mmi + adds PREA = PREFETCHSIZE * SIZE, A + adds PREB = WPREFETCHSIZE * SIZE, B + mov ar.lc = I + } + { .mmi + mov I = M + mov II = M + cmp.ne p14, p0 = r0, r0 + } + ;; + .align 32 + +.L12: + { .mmi + (p21) ST [B ] = f37, 1 * SIZE + (p14) ST [B1] = f49, 1 * SIZE + (p16) cmp.ne.unc p13, p0 = 1, I + } + { .mmi + lfetch.nt1 [PREA], LDA + lfetch.excl.nt1 [PREB] + adds PREB = 16 * SIZE, PREB + } + ;; + { .mmi + (p21) ST [B ] = f43, 1 * SIZE + (p14) ST [B1] = f55, 1 * SIZE + cmp.eq p9, p0 = 8, COUNT + } + { .mmi + (p16) LD f32 = [A1], SIZE + (p16) LD f38 = [A2], SIZE + (p16) adds I = -2, I + } + ;; + { .mmi + (p21) ST [B ] = f61, 1 * SIZE + (p14) ST [B1] = f73, 1 * SIZE + (p9) mov COUNT = 0 + } + { .mmi + (p13) LD f44 = [A1], SIZE + (p13) LD f50 = [A2], SIZE + (p21) adds II = -2, II + } + ;; + { .mmb + (p21) ST [B ] = f67, 1 * SIZE + (p14) ST [B1] = f79, 1 * SIZE + nop __LINE__ + } + { .mmb + (p16) LD f56 = [A3], SIZE + (p16) LD f62 = [A4], SIZE + nop __LINE__ + } + ;; + { .mmi + (p21) ST [B ] = f85, 1 * SIZE + (p14) ST [B1] = f97, 1 * SIZE + (p9) adds PREA = (PREFETCHSIZE - 2)* SIZE, A1 + } + { .mmb + (p13) LD f68 = [A3], SIZE + (p13) LD f74 = [A4], SIZE + nop __LINE__ + } + ;; + { .mmb + (p21) ST [B ] = f91, 1 * SIZE + (p14) ST [B1] = f103, 1 * SIZE + nop __LINE__ + } + { .mmb + (p16) LD f80 = [A5], SIZE + (p16) LD f86 = [A6], SIZE + nop __LINE__ + } + ;; + { .mmb + (p21) ST [B ] = f109, 1 * SIZE + (p14) ST [B1] = f121, 1 * SIZE + nop __LINE__ + } + { .mmb + (p13) LD f92 = [A5], SIZE + (p13) LD f98 = [A6], SIZE + nop __LINE__ + } + ;; + { .mmi + (p21) ST [B ] = f115, 1 * SIZE + (p14) ST [B1] = f127, 9 * SIZE + (p16) adds COUNT = 1, COUNT + } + { .mmb + (p16) LD f104 = [A7], SIZE + (p16) LD f110 = [A8], SIZE + nop __LINE__ + } + ;; + { .mmi + (p13) LD f116 = [A7], SIZE + (p13) LD f122 = [A8], SIZE + (p14) adds B = 8 * SIZE, B + } + { .mmb + (p20) cmp.ne.unc p14, p0 = 1, II + nop __LINE__ + br.ctop.sptk.few .L12 + } + ;; + { .mmb + cmp.ne p6, p0 = 0, J + nop __LINE__ + (p6) br.cond.dptk .L11 + } + ;; + .align 32 + +.L20: + { .mmi + adds I = 1, M + mov A1 = A + mov pr.rot = 0 + } + { .mmi + add A2 = A, LDA + shladd A3 = LDA, 1, A + tbit.z p6, p0 = N, 2 + } + ;; + 
{ .mmi + shladd A4 = LDA, 1, A2 + adds B1 = 4 * SIZE, B + mov ar.ec = 6 + } + { .mib + cmp.eq p16, p0 = r0, r0 + shr I = I, 1 + (p6) br.cond.dpnt .L30 + } + ;; + { .mmi + shladd A = LDA, 2, A + nop __LINE__ + nop __LINE__ + } + { .mmi + adds I = -1, I + mov COUNT = 0 + adds J = -1, J + } + ;; + { .mmi + adds PREA = PREFETCHSIZE * SIZE, A + adds PREB = WPREFETCHSIZE * SIZE, B + mov ar.lc = I + } + { .mmi + mov I = M + mov II = M + cmp.ne p14, p0 = r0, r0 + } + ;; + .align 32 + +.L22: + { .mmi + (p21) ST [B ] = f37, 1 * SIZE + (p14) ST [B1] = f49, 1 * SIZE + (p16) cmp.ne.unc p13, p0 = 1, I + } + { .mmi + lfetch.nt1 [PREA], LDA + lfetch.excl.nt1 [PREB], 8 * SIZE + cmp.eq p9, p0 = 4, COUNT + } + ;; + { .mmi + (p21) ST [B ] = f43, 1 * SIZE + (p14) ST [B1] = f55, 1 * SIZE + (p16) adds I = -2, I + } + { .mmi + (p16) LD f32 = [A1], SIZE + (p16) LD f38 = [A2], SIZE + (p21) adds II = -2, II + } + ;; + { .mmi + (p21) ST [B ] = f61, 1 * SIZE + (p14) ST [B1] = f73, 1 * SIZE + (p9) mov COUNT = 0 + } + { .mmi + (p13) LD f44 = [A1], SIZE + (p13) LD f50 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmi + (p21) ST [B ] = f67, 1 * SIZE + (p14) ST [B1] = f79, 5 * SIZE + (p9) adds PREA = PREFETCHSIZE * SIZE, A1 + } + { .mmb + (p16) LD f56 = [A3], SIZE + (p16) LD f62 = [A4], SIZE + nop __LINE__ + } + ;; + { .mmi + (p13) LD f68 = [A3], SIZE + (p13) LD f74 = [A4], SIZE + (p16) adds COUNT = 1, COUNT + } + { .mmb + (p14) adds B = 4 * SIZE, B + (p20) cmp.ne.unc p14, p0 = 1, II + br.ctop.sptk.few .L22 + } + ;; + .align 32 + +.L30: + { .mmi + adds I = 1, M + mov A1 = A + mov pr.rot = 0 + } + { .mmi + add A2 = A, LDA + adds B1 = 2 * SIZE, B + tbit.z p6, p0 = N, 1 + } + ;; + { .mmi + nop __LINE__ + nop __LINE__ + mov ar.ec = 6 + } + { .mib + cmp.eq p16, p0 = r0, r0 + shr I = I, 1 + (p6) br.cond.dpnt .L40 + } + ;; + { .mmi + adds I = -1, I + ;; + shladd A = LDA, 1, A + mov ar.lc = I + } + { .mmi + mov I = M + mov II = M + cmp.ne p14, p0 = r0, r0 + } + ;; + .align 32 + +.L32: + { .mmi + (p21) ST [B ] = f37, 1 * SIZE + (p14) ST [B1] = f49, 1 * SIZE + (p16) cmp.ne.unc p13, p0 = 1, I + } + { .mmi + nop __LINE__ + nop __LINE__ + (p21) adds II = -2, II + } + ;; + { .mmi + (p21) ST [B ] = f43, 1 * SIZE + (p14) ST [B1] = f55, 3 * SIZE + nop __LINE__ + } + { .mmi + (p16) LD f32 = [A1], SIZE + (p16) LD f38 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmi + (p13) LD f44 = [A1], SIZE + (p13) LD f50 = [A2], SIZE + (p16) adds I = -2, I + } + { .mmb + (p14) adds B = 2 * SIZE, B + (p20) cmp.ne.unc p14, p0 = 1, II + br.ctop.sptk.few .L32 + } + ;; + .align 32 + +.L40: + { .mmi + adds I = 1, M + mov A1 = A + mov pr.rot = 0 + } + { .mmi + tbit.z p6, p0 = N, 0 + } + ;; + { .mmi + nop __LINE__ + nop __LINE__ + mov ar.ec = 6 + } + { .mib + cmp.eq p16, p0 = r0, r0 + shr I = I, 1 + (p6) br.cond.dpnt .L999 + } + ;; + { .mmi + adds I = -1, I + ;; + mov ar.lc = I + } + { .mmi + mov I = M + mov II = M + cmp.ne p14, p0 = r0, r0 + } + ;; + .align 32 + +.L42: + { .mmi + (p21) ST [B ] = f37, 1 * SIZE + (p16) cmp.ne.unc p13, p0 = 1, I + (p21) adds II = -2, II + } + ;; + { .mmi + (p14) ST [B ] = f49, 1 * SIZE + (p16) LD f32 = [A1], SIZE + (p16) adds I = -2, I + } + ;; + { .mmb + (p13) LD f44 = [A1], SIZE + (p20) cmp.ne.unc p14, p0 = 1, II + br.ctop.sptk.few .L42 + } + ;; + .align 32 + +.L999: + mov pr = PR, -1 + mov ar.lc = ARLC + br.ret.sptk.many b0 + EPILOGUE + diff --git a/kernel/ia64/gemm_tcopy.S b/kernel/ia64/gemm_tcopy.S new file mode 100644 index 0000000000..44555fa6e6 --- /dev/null +++ b/kernel/ia64/gemm_tcopy.S @@ -0,0 +1,1695 @@ 
+/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define PREFETCHSIZE 24 +#define WPREFETCHSIZE 32 + +#ifndef XDOUBLE +#define LD LDFD +#define ST STFD_NTA +#else +#define LD LDFD +#define ST STFD_NTA +#endif + +#define PREA r2 +#define PREB r3 + +#define A1 r14 +#define A2 r15 +#define B1 r16 +#define B2 r17 +#define I r18 +#define J r19 + +#define BO2 r20 +#define BO3 r21 +#define BO4 r22 + +#define LDB r23 +#define II r24 +#define TEMP1 r25 +#define TEMP2 r26 +#define TEMP3 r27 +#define LCOUNT r28 +#define SCOUNT r29 + +#define ARLC r30 +#define PR r31 + +#define MLDA8 r8 + +#define M r32 +#define N r33 +#define A r34 +#define LDA r35 +#define B r36 + + PROLOGUE + .prologue + PROFCODE + + .body + { .mmi + setf.sig f32 = M + and r8 = -8, N + mov ARLC = ar.lc + } + ;; + { .mmi + setf.sig f33 = r8 + and r9 = -4, N + mov PR = pr + } + ;; + { .mmi + setf.sig f34 = r9 + and r10 = -2, N + shladd LDA = LDA, BASE_SHIFT, r0 + } + ;; + { .mmi + setf.sig f35 = r10 + shladd MLDA8 = LDA, 3, r0 + shl LDB = M, BASE_SHIFT + 3 + } + ;; + { .mfi + sub MLDA8 = r0, MLDA8 + xmpy.l f33 = f32, f33 + shr J = M, 3 + } + { .mfi + xmpy.l f34 = f32, f34 + } + ;; + { .mmf + getf.sig BO2 = f33 + adds MLDA8 = 16 * SIZE, MLDA8 + xmpy.l f35 = f32, f35 + } + ;; + { .mmi + getf.sig BO3 = f34 + getf.sig BO4 = f35 + nop __LINE__ + } + ;; + { .mmi + shladd BO2 = BO2, BASE_SHIFT, B + shladd BO3 = BO3, BASE_SHIFT, B + shladd BO4 = BO4, BASE_SHIFT, B + } + { .mib + cmp.eq p6, p0 = 0, J + nop __LINE__ + (p6) br.cond.dpnt .L100 + } + ;; + .align 32 + +.L11: + { .mmi + add I = 8, N + mov A1 = A + mov pr.rot = 0 + } + { .mmi + adds A2 = 4 * SIZE, A + shladd A = LDA, 3, A + shr II = N, 3 + } + ;; + { .mmi + mov B1 = B + cmp.eq p16, p0 = r0, r0 + mov ar.ec = 3 + } + { .mmi + adds B2 = 4 * SIZE, B + adds B = 64 * SIZE, B + shr I = I, 4 + } + ;; + { .mmi + cmp.eq p8, p0 = 0, I + shladd I = I, 2, r0 + nop __LINE__ + } + ;; + { .mmi + mov LCOUNT = 0 + mov SCOUNT = 0 + adds I = -1, I + } + ;; + { .mmi + adds PREA = PREFETCHSIZE * SIZE, A1 + adds PREB = WPREFETCHSIZE * SIZE, B1 + mov ar.lc = I + } + { .mib + adds J = -1, J + mov I = II + (p8) br.cond.dpnt .L20 + } + ;; + .align 32 + +.L12: + { .mmi + (p18) ST [B1] = f34, 1 * SIZE + (p18) ST [B2] = f46, 1 * SIZE + (p18) cmp.ne.unc p13, p0 = 1, II + } + { .mmi + (p16) lfetch.nt1 [PREA], LDA + (p16) lfetch.excl.nt1 [PREB], LDB + (p16) cmp.ne.unc p12, p0 = 1, I + } + ;; + { .mmi + (p18) ST [B1] = f37, 1 * SIZE + (p18) ST [B2] = f49, 1 * SIZE + (p18) adds SCOUNT = 1, SCOUNT + } + { .mmi + (p16) LD f32 = [A1], SIZE + (p16) LD f44 = [A2], SIZE + (p16) adds LCOUNT = 1, LCOUNT + } + ;; + { .mmi + (p18) ST [B1] = f40, 1 * SIZE + (p18) ST [B2] = f52, 1 * SIZE + (p16) cmp.eq.unc p14, p0 = 4, LCOUNT + } + { .mmi + (p16) LD f35 = [A1], SIZE + (p16) LD f47 = [A2], SIZE + adds TEMP1 = -3 * SIZE, LDA + } + ;; + { .mmi + (p18) ST [B1] = f43, 5 * SIZE + (p18) ST [B2] = f55, 5 * SIZE + (p18) cmp.eq.unc p15, p0 = 4, SCOUNT + } + { .mmi + (p16) LD f38 = [A1], SIZE + (p16) LD f50 = [A2], SIZE + (p12) mov TEMP1 = 5 * SIZE + } + ;; + { .mmi + (p18) ST [B1] = f82, 1 * SIZE + (p18) ST [B2] = f94, 1 * SIZE + } + { .mmi + (p16) LD f41 = [A1], TEMP1 + (p16) LD f53 = [A2], TEMP1 + } + ;; + { .mmi + (p18) ST [B1] = f85, 1 * SIZE + (p18) ST [B2] = f97, 1 * SIZE + mov TEMP2 = 5 * SIZE + } + { .mmi + (p12) LD f56 = [A1], SIZE + (p12) LD f68 = [A2], SIZE + shladd TEMP3 = LDA, 3, r0 + } + ;; + { .mmi + (p18) ST [B1] = f88, 1 * SIZE + (p18) ST [B2] = f100, 
1 * SIZE + (p13) adds TEMP2 = - 11 * SIZE, LDB + } + { .mmi + (p12) LD f59 = [A1], SIZE + (p12) LD f71 = [A2], SIZE + (p12) adds TEMP1 = - 11 * SIZE, LDA + } + ;; + { .mmi + (p18) ST [B1] = f91 + (p18) ST [B2] = f103 + (p18) add B1 = B1, TEMP2 + } + { .mmi + (p12) LD f62 = [A1], SIZE + (p12) LD f74 = [A2], SIZE + (p18) add B2 = B2, TEMP2 + } + ;; + { .mmi + (p13) ST [B1] = f58, 1 * SIZE + (p13) ST [B2] = f70, 1 * SIZE + } + { .mmi + (p12) LD f65 = [A1], TEMP1 + (p12) LD f77 = [A2], TEMP1 + sub TEMP3 = LDA, TEMP3 + } + ;; + { .mmi + (p13) ST [B1] = f61, 1 * SIZE + (p13) ST [B2] = f73, 1 * SIZE + } + { .mmi + (p16) lfetch.nt1 [PREA], LDA + (p16) lfetch.excl.nt1 [PREB] + adds TEMP3 = 5 * SIZE, TEMP3 + } + ;; + { .mmi + (p13) ST [B1] = f64, 1 * SIZE + (p13) ST [B2] = f76, 1 * SIZE + } + { .mmi + (p16) LD f80 = [A1], SIZE + (p16) LD f92 = [A2], SIZE + adds TEMP1 = -3 * SIZE, LDA + } + ;; + { .mmi + (p13) ST [B1] = f67, 5 * SIZE + (p13) ST [B2] = f79, 5 * SIZE + } + { .mmi + (p16) LD f83 = [A1], SIZE + (p16) LD f95 = [A2], SIZE + (p14) mov TEMP1 = TEMP3 + } + ;; + { .mmi + (p13) ST [B1] = f106, 1 * SIZE + (p13) ST [B2] = f118, 1 * SIZE + mov TEMP2 = 5 * SIZE + } + { .mmi + (p16) LD f86 = [A1], SIZE + (p16) LD f98 = [A2], SIZE + (p12) mov TEMP1 = 5 * SIZE + } + ;; + { .mmi + (p13) ST [B1] = f109, 1 * SIZE + (p13) ST [B2] = f121, 1 * SIZE + sub TEMP2 = TEMP2, LDB + } + { .mmi + (p16) LD f89 = [A1], TEMP1 + (p16) LD f101 = [A2], TEMP1 + } + ;; + { .mmi + (p13) ST [B1] = f112, 1 * SIZE + (p13) ST [B2] = f124, 1 * SIZE + (p15) adds TEMP2 = -59 * SIZE, LDB + } + { .mmi + (p12) LD f104 = [A1], SIZE + (p12) LD f116 = [A2], SIZE + (p14) add PREA = PREA, MLDA8 + } + ;; + { .mmi + (p13) ST [B1] = f115 + (p13) ST [B2] = f127 + (p13) add B1 = B1, TEMP2 + } + { .mmi + (p12) LD f107 = [A1], SIZE + (p12) LD f119 = [A2], SIZE + adds TEMP1 = -11 * SIZE, LDA + } + ;; + { .mmi + (p12) LD f110 = [A1], SIZE + (p12) LD f122 = [A2], SIZE + (p14) mov TEMP1 = TEMP3 + } + { .mmi + (p14) mov LCOUNT = 0 + (p15) mov SCOUNT = 0 + adds PREB = WPREFETCHSIZE * SIZE, B1 + } + ;; + { .mmi + (p12) LD f113 = [A1], TEMP1 + (p12) LD f125 = [A2], TEMP1 + (p13) add B2 = B2, TEMP2 + } + { .mib + (p14) adds I = -2, I + (p15) adds II = -2, II + br.ctop.sptk .L12 + } + ;; + .align 32 + +.L20: + { .mmi + add A2 = A1, LDA + and TEMP3 = 7, N + tbit.nz p7, p0 = N, 2 + } + ;; + { .mmi + (p7) LD f32 = [A1], SIZE + (p7) LD f36 = [A2], SIZE + cmp.eq p6, p0 = 0, TEMP3 + } + ;; + { .mmi + (p7) LD f33 = [A1], SIZE + (p7) LD f37 = [A2], SIZE + adds TEMP1 = -3 * SIZE, LDA + } + ;; + { .mmi + (p7) LD f34 = [A1], SIZE + (p7) LD f38 = [A2], SIZE + add TEMP1 = TEMP1, LDA + } + ;; + { .mmi + (p7) LD f35 = [A1], TEMP1 + (p7) LD f39 = [A2], TEMP1 + (p6) cmp.ne.unc p10, p0 = 0, J + } + ;; + { .mmb + (p7) LD f40 = [A1], SIZE + (p7) LD f44 = [A2], SIZE + (p10) br.cond.dptk .L11 + } + ;; + { .mmi + (p7) LD f41 = [A1], SIZE + (p7) LD f45 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmi + (p7) LD f42 = [A1], SIZE + (p7) LD f46 = [A2], SIZE + tbit.nz p8, p0 = N, 1 + } + ;; + { .mmi + (p7) LD f43 = [A1], TEMP1 + (p7) LD f47 = [A2], TEMP1 + adds B2 = 4 * SIZE, BO2 + } + ;; + { .mmi + (p7) ST [BO2] = f32, 1 * SIZE + (p7) ST [B2 ] = f36, 1 * SIZE + tbit.nz p9, p0 = N, 0 + } + { .mmi + (p7) LD f48 = [A1], SIZE + (p7) LD f52 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmi + (p7) ST [BO2] = f33, 1 * SIZE + (p7) ST [B2 ] = f37, 1 * SIZE + nop __LINE__ + } + { .mmi + (p7) LD f49 = [A1], SIZE + (p7) LD f53 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmi + (p7) ST [BO2] = f34, 1 * 
SIZE + (p7) ST [B2 ] = f38, 1 * SIZE + nop __LINE__ + } + { .mmi + (p7) LD f50 = [A1], SIZE + (p7) LD f54 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmi + (p7) ST [BO2] = f35, 5 * SIZE + (p7) ST [B2 ] = f39, 5 * SIZE + nop __LINE__ + } + { .mmi + (p7) LD f51 = [A1], TEMP1 + (p7) LD f55 = [A2], TEMP1 + mov TEMP1 = -1 * SIZE + } + ;; + { .mmi + (p7) ST [BO2] = f40, 1 * SIZE + (p7) ST [B2 ] = f44, 1 * SIZE + nop __LINE__ + } + { .mmi + (p7) LD f56 = [A1], SIZE + (p7) LD f60 = [A2], SIZE + shladd TEMP1 = LDA, 3, TEMP1 + } + ;; + { .mmi + (p7) ST [BO2] = f41, 1 * SIZE + (p7) ST [B2 ] = f45, 1 * SIZE + nop __LINE__ + } + { .mmi + (p7) LD f57 = [A1], SIZE + (p7) LD f61 = [A2], SIZE + sub TEMP1 = 0, TEMP1 + } + ;; + { .mmi + (p7) ST [BO2] = f42, 1 * SIZE + (p7) ST [B2 ] = f46, 1 * SIZE + nop __LINE__ + } + { .mmi + (p7) LD f58 = [A1], SIZE + (p7) LD f62 = [A2], SIZE + shladd TEMP1 = LDA, 1, TEMP1 + } + ;; + { .mmi + (p7) ST [BO2] = f43, 5 * SIZE + (p7) ST [B2 ] = f47, 5 * SIZE + nop __LINE__ + } + { .mmi + (p7) LD f59 = [A1], TEMP1 + (p7) LD f63 = [A2], TEMP1 + nop __LINE__ + } + ;; + { .mmi + (p7) ST [BO2] = f48, 1 * SIZE + (p7) ST [B2 ] = f52, 1 * SIZE + nop __LINE__ + } + { .mmi + add A2 = A1, LDA + adds TEMP1 = -1 * SIZE, LDA + nop __LINE__ + } + ;; + { .mmi + (p7) ST [BO2] = f49, 1 * SIZE + (p7) ST [B2 ] = f53, 1 * SIZE + nop __LINE__ + } + { .mmi + (p8) LD f64 = [A1], SIZE + (p8) LD f66 = [A2], SIZE + add TEMP1 = TEMP1, LDA + } + ;; + { .mmi + (p7) ST [BO2] = f50, 1 * SIZE + (p7) ST [B2 ] = f54, 1 * SIZE + nop __LINE__ + } + { .mmi + (p8) LD f65 = [A1], TEMP1 + (p8) LD f67 = [A2], TEMP1 + nop __LINE__ + } + ;; + { .mmi + (p7) ST [BO2] = f51, 5 * SIZE + (p7) ST [B2 ] = f55, 5 * SIZE + nop __LINE__ + } + { .mmi + (p8) LD f68 = [A1], SIZE + (p8) LD f70 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmi + (p7) ST [BO2] = f56, 1 * SIZE + (p7) ST [B2 ] = f60, 1 * SIZE + nop __LINE__ + } + { .mmi + (p8) LD f69 = [A1], TEMP1 + (p8) LD f71 = [A2], TEMP1 + mov TEMP3 = -1 * SIZE + } + ;; + { .mmi + (p7) ST [BO2] = f57, 1 * SIZE + (p7) ST [B2 ] = f61, 1 * SIZE + nop __LINE__ + } + { .mmi + (p8) LD f72 = [A1], SIZE + (p8) LD f74 = [A2], SIZE + shladd TEMP3 = LDA, 3, TEMP3 + } + ;; + { .mmi + (p7) ST [BO2] = f58, 1 * SIZE + (p7) ST [B2 ] = f62, 1 * SIZE + nop __LINE__ + } + { .mmi + (p8) LD f73 = [A1], TEMP1 + (p8) LD f75 = [A2], TEMP1 + sub TEMP3 = 0, TEMP3 + } + ;; + { .mmi + (p7) ST [BO2] = f59, 5 * SIZE + (p7) ST [B2 ] = f63 + adds B2 = 4 * SIZE, BO3 + } + { .mmi + (p8) LD f76 = [A1], SIZE + (p8) LD f78 = [A2], SIZE + shladd TEMP3 = LDA, 1, TEMP3 + } + ;; + { .mmi + (p8) ST [BO3] = f64, 1 * SIZE + (p8) ST [B2 ] = f68, 1 * SIZE + nop __LINE__ + } + { .mmi + (p8) LD f77 = [A1], TEMP3 + (p8) LD f79 = [A2], TEMP3 + nop __LINE__ + } + ;; + { .mmi + (p8) ST [BO3] = f65, 1 * SIZE + (p8) ST [B2 ] = f69, 1 * SIZE + nop __LINE__ + } + { .mmi + add A2 = A1, LDA + shladd TEMP3 = LDA, 1, r0 + nop __LINE__ + } + ;; + { .mmi + (p8) ST [BO3] = f66, 1 * SIZE + (p8) ST [B2 ] = f70, 1 * SIZE + nop __LINE__ + } + { .mmi + (p9) LD f80 = [A1], TEMP3 + (p9) LD f81 = [A2], TEMP3 + nop __LINE__ + } + ;; + { .mmi + (p8) ST [BO3] = f67, 5 * SIZE + (p8) ST [B2 ] = f71, 5 * SIZE + nop __LINE__ + } + { .mmi + (p9) LD f82 = [A1], TEMP3 + (p9) LD f83 = [A2], TEMP3 + nop __LINE__ + } + ;; + { .mmi + (p8) ST [BO3] = f72, 1 * SIZE + (p8) ST [B2 ] = f76, 1 * SIZE + nop __LINE__ + } + { .mmi + (p9) LD f84 = [A1], TEMP3 + (p9) LD f85 = [A2], TEMP3 + nop __LINE__ + } + ;; + { .mmi + (p8) ST [BO3] = f73, 1 * SIZE + (p8) ST [B2 ] = f77, 1 * SIZE + 
nop __LINE__ + } + { .mmi + (p9) LD f86 = [A1] + (p9) LD f87 = [A2] + nop __LINE__ + } + ;; + { .mmi + (p8) ST [BO3] = f74, 1 * SIZE + (p8) ST [B2 ] = f78, 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p8) ST [BO3] = f75, 5 * SIZE + (p8) ST [B2 ] = f79 + adds B2 = 4 * SIZE, BO4 + } + ;; + { .mmi + (p9) ST [BO4] = f80, 1 * SIZE + (p9) ST [B2 ] = f84, 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p9) ST [BO4] = f81, 1 * SIZE + (p9) ST [B2 ] = f85, 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p9) ST [BO4] = f82, 1 * SIZE + (p9) ST [B2 ] = f86, 1 * SIZE + cmp.ne p8, p0 = 0, J + } + ;; + { .mmb + (p9) ST [BO4] = f83, 5 * SIZE + (p9) ST [B2 ] = f87, 5 * SIZE + (p8) br.cond.dptk .L11 + } + ;; + .align 32 + +.L100: + { .mmi + mov A1 = A + add I = 8, N + mov pr.rot = 0 + } + { .mmi + adds A2 = 4 * SIZE, A + tbit.z p6, p0 = M, 2 + } + ;; + { .mmi + mov B1 = B + adds B2 = 4 * SIZE, B + mov ar.ec = 3 + } + { .mib + cmp.eq p16, p0 = r0, r0 + shr I = I, 4 + (p6) br.cond.dpnt .L200 + } + ;; + { .mmi + cmp.eq p8, p0 = 0, I + shladd I = I, 1, r0 + shladd A = LDA, 2, A + } + ;; + { .mmi + adds B = 32 * SIZE, B + adds I = -1, I + shr II = N, 3 + } + ;; + { .mmi + mov LCOUNT = 0 + mov SCOUNT = 0 + mov ar.lc = I + } + { .mib + nop __LINE__ + mov I = II + (p8) br.cond.dpnt .L120 + } + ;; + .align 32 + +.L112: + { .mmi + (p18) ST [B1] = f34, 1 * SIZE + (p18) ST [B2] = f46, 1 * SIZE + (p16) cmp.ne.unc p12, p0 = 1, I + } + { .mmi + (p16) LD f32 = [A1], SIZE + (p16) LD f44 = [A2], SIZE + (p18) cmp.ne.unc p13, p0 = 1, II + } + ;; + { .mmi + (p18) ST [B1] = f37, 1 * SIZE + (p18) ST [B2] = f49, 1 * SIZE + nop __LINE__ + } + { .mmi + (p16) LD f35 = [A1], SIZE + (p16) LD f47 = [A2], SIZE + adds TEMP1 = -3 * SIZE, LDA + } + ;; + { .mmi + (p18) ST [B1] = f40, 1 * SIZE + (p18) ST [B2] = f52, 1 * SIZE + shladd TEMP3 = LDA, 2, r0 + } + { .mmi + (p16) LD f38 = [A1], SIZE + (p16) LD f50 = [A2], SIZE + (p12) mov TEMP1 = 5 * SIZE + } + ;; + { .mmi + (p18) ST [B1] = f43, 5 * SIZE + (p18) ST [B2] = f55, 5 * SIZE + (p16) adds LCOUNT = 1, LCOUNT + } + { .mmi + (p16) LD f41 = [A1], TEMP1 + (p16) LD f53 = [A2], TEMP1 + (p18) adds SCOUNT = 1, SCOUNT + } + ;; + { .mmi + (p18) ST [B1] = f82, 1 * SIZE + (p18) ST [B2] = f94, 1 * SIZE + (p16) cmp.eq.unc p14, p0 = 2, LCOUNT + } + { .mmi + (p12) LD f56 = [A1], SIZE + (p12) LD f68 = [A2], SIZE + (p18) cmp.eq.unc p15, p0 = 2, SCOUNT + } + ;; + { .mmi + (p18) ST [B1] = f85, 1 * SIZE + (p18) ST [B2] = f97, 1 * SIZE + mov TEMP2 = 5 * SIZE + } + { .mmi + (p12) LD f59 = [A1], SIZE + (p12) LD f71 = [A2], SIZE + sub TEMP3 = LDA, TEMP3 + } + ;; + { .mmi + (p18) ST [B1] = f88, 1 * SIZE + (p18) ST [B2] = f100, 1 * SIZE + (p13) adds TEMP2 = - 11 * SIZE, LDB + } + { .mmi + (p12) LD f62 = [A1], SIZE + (p12) LD f74 = [A2], SIZE + (p12) adds TEMP1 = - 11 * SIZE, LDA + } + ;; + { .mmi + (p18) ST [B1] = f91 + (p18) ST [B2] = f103 + (p18) add B1 = B1, TEMP2 + } + { .mmi + (p12) LD f65 = [A1], TEMP1 + (p12) LD f77 = [A2], TEMP1 + (p18) add B2 = B2, TEMP2 + } + ;; + { .mmi + (p13) ST [B1] = f58, 1 * SIZE + (p13) ST [B2] = f70, 1 * SIZE + adds TEMP3 = 5 * SIZE, TEMP3 + } + { .mmi + (p16) LD f80 = [A1], SIZE + (p16) LD f92 = [A2], SIZE + adds TEMP1 = -3 * SIZE, LDA + } + ;; + { .mmi + (p13) ST [B1] = f61, 1 * SIZE + (p13) ST [B2] = f73, 1 * SIZE + nop __LINE__ + } + { .mmi + (p16) LD f83 = [A1], SIZE + (p16) LD f95 = [A2], SIZE + (p14) mov TEMP1 = TEMP3 + } + ;; + { .mmi + (p13) ST [B1] = f64, 1 * SIZE + (p13) ST [B2] = f76, 1 * SIZE + nop __LINE__ + } + { .mmi + (p16) LD f86 = [A1], SIZE + (p16) LD f98 = [A2], SIZE + 
(p12) mov TEMP1 = 5 * SIZE + } + ;; + { .mmi + (p13) ST [B1] = f67, 5 * SIZE + (p13) ST [B2] = f79, 5 * SIZE + (p14) mov LCOUNT = 0 + } + { .mmi + (p16) LD f89 = [A1], TEMP1 + (p16) LD f101 = [A2], TEMP1 + (p15) mov SCOUNT = 0 + } + ;; + { .mmi + (p13) ST [B1] = f106, 1 * SIZE + (p13) ST [B2] = f118, 1 * SIZE + mov TEMP2 = 5 * SIZE + } + { .mmi + (p12) LD f104 = [A1], SIZE + (p12) LD f116 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmi + (p13) ST [B1] = f109, 1 * SIZE + (p13) ST [B2] = f121, 1 * SIZE + sub TEMP2 = TEMP2, LDB + } + { .mmi + (p12) LD f107 = [A1], SIZE + (p12) LD f119 = [A2], SIZE + adds TEMP1 = -11 * SIZE, LDA + } + ;; + { .mmi + (p13) ST [B1] = f112, 1 * SIZE + (p13) ST [B2] = f124, 1 * SIZE + (p15) adds TEMP2 = -27 * SIZE, LDB + } + { .mmi + (p12) LD f110 = [A1], SIZE + (p12) LD f122 = [A2], SIZE + (p14) mov TEMP1 = TEMP3 + } + ;; + { .mmi + (p13) ST [B1] = f115 + (p13) ST [B2] = f127 + (p13) add B1 = B1, TEMP2 + } + { .mmi + (p12) LD f113 = [A1], TEMP1 + (p12) LD f125 = [A2], TEMP1 + (p13) add B2 = B2, TEMP2 + } + ;; + { .mmb + (p14) adds I = -2, I + (p15) adds II = -2, II + br.ctop.sptk .L112 + } + ;; + .align 32 + +.L120: + { .mmi + add A2 = A1, LDA + nop __LINE__ + tbit.nz p7, p0 = N, 2 + } + ;; + { .mmi + (p7) LD f32 = [A1], SIZE + (p7) LD f36 = [A2], SIZE + tbit.nz p8, p0 = N, 1 + } + ;; + { .mmi + (p7) LD f33 = [A1], SIZE + (p7) LD f37 = [A2], SIZE + adds TEMP1 = -3 * SIZE, LDA + } + ;; + { .mmi + (p7) LD f34 = [A1], SIZE + (p7) LD f38 = [A2], SIZE + add TEMP1 = TEMP1, LDA + } + ;; + { .mmi + (p7) LD f35 = [A1], TEMP1 + (p7) LD f39 = [A2], TEMP1 + tbit.nz p9, p0 = N, 0 + } + ;; + { .mmi + (p7) LD f40 = [A1], SIZE + (p7) LD f44 = [A2], SIZE + mov TEMP2 = -1 * SIZE + } + ;; + { .mmi + (p7) LD f41 = [A1], SIZE + (p7) LD f45 = [A2], SIZE + shladd TEMP2 = LDA, 1, TEMP2 + } + ;; + { .mmi + (p7) LD f42 = [A1], SIZE + (p7) LD f46 = [A2], SIZE + sub TEMP2 = 0, TEMP2 + } + ;; + { .mmi + (p7) LD f43 = [A1], TEMP2 + (p7) LD f47 = [A2] + nop __LINE__ + } + ;; + { .mmi + add A2 = A1, LDA + adds TEMP1 = -1 * SIZE, LDA + mov TEMP2 = -1 * SIZE + } + ;; + { .mmi + (p8) LD f48 = [A1], SIZE + (p8) LD f50 = [A2], SIZE + add TEMP1 = TEMP1, LDA + } + ;; + { .mmi + (p8) LD f49 = [A1], TEMP1 + (p8) LD f51 = [A2], TEMP1 + shladd TEMP2 = LDA, 1, TEMP2 + } + ;; + { .mmi + (p8) LD f52 = [A1], SIZE + (p8) LD f54 = [A2], SIZE + sub TEMP2 = r0, TEMP2 + } + ;; + { .mmi + (p8) LD f53 = [A1], TEMP2 + (p8) LD f55 = [A2], TEMP2 + nop __LINE__ + } + ;; + { .mmi + add A2 = A1, LDA + adds B2 = 4 * SIZE, BO2 + nop __LINE__ + } + ;; + { .mmi + (p9) LD f56 = [A1] + nop __LINE__ + (p9) shladd A1 = LDA, 1, A1 + } + { .mmi + (p9) LD f57 = [A2] + nop __LINE__ + (p9) shladd A2 = LDA, 1, A2 + } + ;; + { .mmi + (p7) ST [BO2] = f32, 1 * SIZE + (p7) ST [B2 ] = f36, 1 * SIZE + nop __LINE__ + } + { .mmi + (p9) LD f58 = [A1] + (p9) LD f59 = [A2] + nop __LINE__ + } + ;; + ;; + { .mmi + (p7) ST [BO2] = f33, 1 * SIZE + (p7) ST [B2 ] = f37, 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p7) ST [BO2] = f34, 1 * SIZE + (p7) ST [B2 ] = f38, 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p7) ST [BO2] = f35, 5 * SIZE + (p7) ST [B2 ] = f39, 5 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p7) ST [BO2] = f40, 1 * SIZE + (p7) ST [B2 ] = f44, 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p7) ST [BO2] = f41, 1 * SIZE + (p7) ST [B2 ] = f45, 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p7) ST [BO2] = f42, 1 * SIZE + (p7) ST [B2 ] = f46, 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p7) ST [BO2] = f43, 5 * SIZE + (p7) ST [B2 ] = f47 + adds B2 = 4 * 
SIZE, BO3 + } + ;; + { .mmi + (p8) ST [BO3] = f48, 1 * SIZE + (p8) ST [B2 ] = f52, 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p8) ST [BO3] = f49, 1 * SIZE + (p8) ST [B2 ] = f53, 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p8) ST [BO3] = f50, 1 * SIZE + (p8) ST [B2 ] = f54, 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p8) ST [BO3] = f51, 5 * SIZE + (p8) ST [B2 ] = f55 + adds B2 = 2 * SIZE, BO4 + } + ;; + { .mmi + (p9) ST [BO4] = f56, 1 * SIZE + (p9) ST [B2 ] = f58, 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p9) ST [BO4] = f57, 3 * SIZE + (p9) ST [B2 ] = f59 + nop __LINE__ + } + ;; + .align 32 + +.L200: + { .mmi + add I = 8, N + mov A1 = A + mov pr.rot = 0 + } + { .mmi + adds A2 = 4 * SIZE, A + nop __LINE__ + tbit.z p6, p0 = M, 1 + } + ;; + { .mmi + mov B1 = B + cmp.eq p16, p0 = r0, r0 + mov ar.ec = 3 + } + { .mib + adds B2 = 4 * SIZE, B + shr I = I, 4 + (p6) br.cond.dpnt .L300 + } + ;; + { .mmi + shladd A = LDA, 1, A + adds B = 16 * SIZE, B + shr II = N, 3 + } + { .mmi + cmp.eq p8, p0 = 0, I + adds I = -1, I + nop __LINE__ + } + ;; + { .mmi + nop __LINE__ + nop __LINE__ + mov ar.lc = I + } + { .mib + mov I = II + nop __LINE__ + (p8) br.cond.dpnt .L220 + } + ;; + .align 32 + +.L212: + { .mmi + (p18) ST [B1] = f34, 1 * SIZE + (p18) ST [B2] = f46, 1 * SIZE + (p16) cmp.ne.unc p12, p0 = 1, I + } + { .mmi + (p16) LD f32 = [A1], SIZE + (p16) LD f44 = [A2], SIZE + (p18) cmp.ne.unc p13, p0 = 1, II + } + ;; + { .mmi + (p18) ST [B1] = f37, 1 * SIZE + (p18) ST [B2] = f49, 1 * SIZE + adds TEMP1 = -3 * SIZE, LDA + } + { .mmi + (p16) LD f35 = [A1], SIZE + (p16) LD f47 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B1] = f40, 1 * SIZE + (p18) ST [B2] = f52, 1 * SIZE + (p12) mov TEMP1 = 5 * SIZE + } + { .mmi + (p16) LD f38 = [A1], SIZE + (p16) LD f50 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B1] = f43, 5 * SIZE + (p18) ST [B2] = f55, 5 * SIZE + nop __LINE__ + } + { .mmi + (p16) LD f41 = [A1], TEMP1 + (p16) LD f53 = [A2], TEMP1 + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B1] = f82, 1 * SIZE + (p18) ST [B2] = f94, 1 * SIZE + nop __LINE__ + } + { .mmi + (p12) LD f56 = [A1], SIZE + (p12) LD f68 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B1] = f85, 1 * SIZE + (p18) ST [B2] = f97, 1 * SIZE + mov TEMP2 = 5 * SIZE + } + { .mmi + (p12) LD f59 = [A1], SIZE + (p12) LD f71 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B1] = f88, 1 * SIZE + (p18) ST [B2] = f100, 1 * SIZE + (p13) adds TEMP2 = - 11 * SIZE, LDB + } + { .mmi + (p12) LD f62 = [A1], SIZE + (p12) LD f74 = [A2], SIZE + (p12) adds TEMP1 = - 11 * SIZE, LDA + } + ;; + { .mmi + (p18) ST [B1] = f91 + (p18) ST [B2] = f103 + (p18) add B1 = B1, TEMP2 + } + { .mmi + (p12) LD f65 = [A1], TEMP1 + (p12) LD f77 = [A2], TEMP1 + (p18) add B2 = B2, TEMP2 + } + ;; + { .mmi + (p13) ST [B1] = f58, 1 * SIZE + (p13) ST [B2] = f70, 1 * SIZE + nop __LINE__ + } + { .mmi + (p16) LD f80 = [A1], SIZE + (p16) LD f92 = [A2], SIZE + sub TEMP1 = r0, LDA + } + ;; + { .mmi + (p13) ST [B1] = f61, 1 * SIZE + (p13) ST [B2] = f73, 1 * SIZE + nop __LINE__ + } + { .mmi + (p16) LD f83 = [A1], SIZE + (p16) LD f95 = [A2], SIZE + (p16) adds TEMP1 = 5 * SIZE, TEMP1 + } + ;; + { .mmi + (p13) ST [B1] = f64, 1 * SIZE + (p13) ST [B2] = f76, 1 * SIZE + nop __LINE__ + } + { .mmi + (p16) LD f86 = [A1], SIZE + (p16) LD f98 = [A2], SIZE + (p12) mov TEMP1 = 5 * SIZE + } + ;; + { .mmi + (p13) ST [B1] = f67, 5 * SIZE + (p13) ST [B2] = f79, 5 * SIZE + nop __LINE__ + } + { .mmi + (p16) LD f89 = [A1], TEMP1 + (p16) LD f101 = [A2], TEMP1 + adds TEMP1 = -11 * 
SIZE, LDA + } + ;; + { .mmi + (p13) ST [B1] = f106, 1 * SIZE + (p13) ST [B2] = f118, 1 * SIZE + mov TEMP2 = 5 * SIZE + } + { .mmi + (p12) LD f104 = [A1], SIZE + (p12) LD f116 = [A2], SIZE + (p16) shladd TEMP1 = LDA, 1, r0 + } + ;; + { .mmi + (p13) ST [B1] = f109, 1 * SIZE + (p13) ST [B2] = f121, 1 * SIZE + sub TEMP2 = TEMP2, LDB + } + { .mmi + (p12) LD f107 = [A1], SIZE + (p12) LD f119 = [A2], SIZE + (p16) sub TEMP1 = LDA, TEMP1 + } + ;; + { .mmi + (p13) ST [B1] = f112, 1 * SIZE + (p13) ST [B2] = f124, 1 * SIZE + (p18) adds TEMP2 = -11 * SIZE, LDB + } + { .mmi + (p12) LD f110 = [A1], SIZE + (p12) LD f122 = [A2], SIZE + (p16) adds TEMP1 = 5 * SIZE, TEMP1 + } + ;; + { .mmi + (p13) ST [B1] = f115 + (p13) ST [B2] = f127 + (p13) add B1 = B1, TEMP2 + } + { .mmi + (p12) LD f113 = [A1], TEMP1 + (p12) LD f125 = [A2], TEMP1 + (p13) add B2 = B2, TEMP2 + } + ;; + { .mmb + (p16) adds I = -2, I + (p18) adds II = -2, II + br.ctop.sptk .L212 + } + ;; + .align 32 + +.L220: + { .mmi + add A2 = A1, LDA + nop __LINE__ + tbit.nz p7, p0 = N, 2 + } + ;; + { .mmi + (p7) LD f32 = [A1], SIZE + (p7) LD f36 = [A2], SIZE + tbit.nz p8, p0 = N, 1 + } + ;; + { .mmi + (p7) LD f33 = [A1], SIZE + (p7) LD f37 = [A2], SIZE + tbit.nz p9, p0 = N, 0 + } + ;; + { .mmi + (p7) LD f34 = [A1], SIZE + (p7) LD f38 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmi + (p7) LD f35 = [A1], SIZE + (p7) LD f39 = [A2] + nop __LINE__ + } + ;; + { .mmi + add A2 = A1, LDA + nop __LINE__ + nop __LINE__ + } + ;; + { .mmi + (p8) LD f40 = [A1], SIZE + (p8) LD f42 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmi + (p8) LD f41 = [A1], SIZE + (p8) LD f43 = [A2] + nop __LINE__ + } + ;; + { .mmi + add A2 = A1, LDA + nop __LINE__ + nop __LINE__ + } + ;; + { .mmi + (p9) LD f44 = [A1] + (p9) LD f45 = [A2] + adds B2 = 4 * SIZE, BO2 + } + ;; + { .mmi + (p7) ST [BO2] = f32, 1 * SIZE + (p7) ST [B2 ] = f36, 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p7) ST [BO2] = f33, 1 * SIZE + (p7) ST [B2 ] = f37, 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p7) ST [BO2] = f34, 1 * SIZE + (p7) ST [B2 ] = f38, 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p7) ST [BO2] = f35, 5 * SIZE + (p7) ST [B2 ] = f39 + adds B2 = 2 * SIZE, BO3 + } + ;; + { .mmi + (p8) ST [BO3] = f40, 1 * SIZE + (p8) ST [B2 ] = f42, 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p8) ST [BO3] = f41, 3 * SIZE + (p8) ST [B2 ] = f43 + adds B2 = 1 * SIZE, BO4 + } + ;; + { .mmi + (p9) ST [BO4] = f44, 2 * SIZE + (p9) ST [B2 ] = f45 + nop __LINE__ + } + ;; + .align 32 + +.L300: + { .mmi + add I = 8, N + mov A1 = A + mov pr.rot = 0 + } + { .mmi + mov B1 = B + adds A2 = 4 * SIZE, A + tbit.z p6, p0 = M, 0 + } + ;; + { .mmi + adds B2 = 4 * SIZE, B + cmp.eq p16, p0 = r0, r0 + mov ar.ec = 3 + } + { .mib + nop __LINE__ + shr I = I, 4 + (p6) br.cond.dpnt .L999 + } + ;; + { .mmi + cmp.eq p8, p0 = 0, I + adds I = -1, I + shr II = N, 3 + } + ;; + { .mmi + nop __LINE__ + nop __LINE__ + mov ar.lc = I + } + { .mib + nop __LINE__ + mov I = II + (p8) br.cond.dpnt .L320 + } + ;; + .align 32 + +.L312: + { .mmi + (p18) ST [B1] = f34, 1 * SIZE + (p18) ST [B2] = f46, 1 * SIZE + (p16) cmp.ne.unc p12, p0 = 1, I + } + { .mmi + (p16) LD f32 = [A1], SIZE + (p16) LD f44 = [A2], SIZE + (p18) cmp.ne.unc p13, p0 = 1, II + } + ;; + { .mmi + (p18) ST [B1] = f37, 1 * SIZE + (p18) ST [B2] = f49, 1 * SIZE + adds TEMP2 = - 3 * SIZE, LDB + } + { .mmi + (p16) LD f35 = [A1], SIZE + (p16) LD f47 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B1] = f40, 1 * SIZE + (p18) ST [B2] = f52, 1 * SIZE + nop __LINE__ + } + { .mmi + (p16) LD f38 = [A1], SIZE + 
(p16) LD f50 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B1] = f43 + (p18) ST [B2] = f55 + (p18) add B1 = B1, TEMP2 + } + { .mmi + (p16) LD f41 = [A1], 5 * SIZE + (p16) LD f53 = [A2], 5 * SIZE + (p18) add B2 = B2, TEMP2 + } + ;; + { .mmi + (p13) ST [B1] = f58, 1 * SIZE + (p13) ST [B2] = f70, 1 * SIZE + (p16) adds I = -2, I + } + { .mmi + (p12) LD f56 = [A1], SIZE + (p12) LD f68 = [A2], SIZE + (p18) adds II = -2, II + } + ;; + { .mmi + (p13) ST [B1] = f61, 1 * SIZE + (p13) ST [B2] = f73, 1 * SIZE + nop __LINE__ + } + { .mmi + (p12) LD f59 = [A1], SIZE + (p12) LD f71 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmi + (p13) ST [B1] = f64, 1 * SIZE + (p13) ST [B2] = f76, 1 * SIZE + nop __LINE__ + } + { .mmi + (p12) LD f62 = [A1], SIZE + (p12) LD f74 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmi + (p13) ST [B1] = f67 + (p13) ST [B2] = f79 + (p13) add B1 = B1, TEMP2 + } + { .mmi + (p12) LD f65 = [A1], 5 * SIZE + (p12) LD f77 = [A2], 5 * SIZE + (p13) add B2 = B2, TEMP2 + } + ;; + { .mmb + nop __LINE__ + nop __LINE__ + br.ctop.sptk .L312 + } + ;; + .align 32 + +.L320: + { .mmi + adds A2 = 2 * SIZE, A1 + adds B2 = 2 * SIZE, BO2 + tbit.nz p7, p0 = N, 2 + } + ;; + { .mmi + (p7) LD f32 = [A1], SIZE + (p7) LD f34 = [A2], SIZE + tbit.nz p8, p0 = N, 1 + } + ;; + { .mmi + (p7) LD f33 = [A1], 3 * SIZE + (p7) LD f35 = [A2] + nop __LINE__ + } + ;; + { .mmi + adds A2 = SIZE, A1 + nop __LINE__ + nop __LINE__ + } + ;; + { .mmi + (p8) LD f36 = [A1], 2 * SIZE + (p8) LD f37 = [A2] + tbit.nz p9, p0 = N, 0 + } + ;; + { .mmi + (p9) LD f38 = [A1] + nop __LINE__ + nop __LINE__ + } + ;; + { .mmi + (p7) ST [BO2] = f32, 1 * SIZE + (p7) ST [B2 ] = f34, 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p7) ST [BO2] = f33, 3 * SIZE + (p7) ST [B2 ] = f35 + adds B2 = SIZE, BO3 + } + ;; + { .mmi + (p8) ST [BO3] = f36, 2 * SIZE + (p8) ST [B2 ] = f37 + nop __LINE__ + } + ;; + { .mmi + (p9) ST [BO4] = f38, 1 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; + .align 32 + +.L999: + mov pr = PR, -1 + mov ar.lc = ARLC + br.ret.sptk.many b0 + EPILOGUE diff --git a/kernel/ia64/gemv_n.S b/kernel/ia64/gemv_n.S new file mode 100644 index 0000000000..4826bf5b45 --- /dev/null +++ b/kernel/ia64/gemv_n.S @@ -0,0 +1,3317 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define SP r12 + +#define M r32 +#define N r33 +#define A r36 +#define LDA r37 +#define X r38 +#define INCX r39 +#define Y r34 +#define INCY r35 +#define BUFFER r11 + +#define I r14 +#define J r15 +#define AO1 r16 +#define AO2 r17 +#define AO3 r18 +#define AO4 r19 +#define AO5 r20 +#define AO6 r21 +#define AO7 r22 +#define AO8 r23 +#define YLD1 r24 +#define YST1 r25 +#define YST2 r27 +#define MM r28 +#define YY r9 + +#define RPRE1 loc0 +#define RPRE2 loc1 +#define RPRE3 loc2 +#define RPRE4 loc3 +#define RPRE5 loc4 +#define RPRE6 loc5 +#define RPRE7 loc6 +#define RPRE8 loc7 + +#define AO11 loc8 +#define AO21 loc9 +#define AO31 loc10 +#define AO41 loc11 +#define AO51 loc12 +#define AO61 loc13 +#define AO71 loc14 +#define AO81 loc15 + +#define PREB r8 + +#define ARLC r29 +#define PR r30 +#define ARPFS r31 + +#ifdef DOUBLE +#define RPREFETCH (16 * 3 + 8) +#else +#define RPREFETCH (16 * 3 + 16) +#endif +#define PREFETCH lfetch.nt1 + +#define ALPHA f6 + + PROLOGUE + .prologue + PROFCODE + { .mmi + .save ar.pfs, ARPFS + alloc ARPFS = ar.pfs, 8, 16, 8, 0 + mov ARLC = ar.lc + } + ;; + mov PR = pr + adds r14 = 16, SP + adds r15 = 24, SP + adds r16 = 32, SP + ;; + adds r8 = -8 * 16, SP + adds r9 = -7 * 16, SP + adds SP = -8 * 16, SP + ;; + stf.spill [r8] = f16, 32 + stf.spill [r9] = f17, 32 + ;; + stf.spill [r8] = f18, 32 + stf.spill [r9] = f19, 32 + ;; + stf.spill [r8] = f20, 32 + stf.spill [r9] = f21, 32 + ;; + stf.spill [r8] = f22 + stf.spill [r9] = f23 + .body + ;; + + ld8 Y = [r14] + ld8 INCY = [r15] + ld8 BUFFER = [r16] + + mov ALPHA = f8 + cmp.ge p7, p0 = 0, M + cmp.ge p6, p0 = 0, N + ;; + shladd INCX = INCX, BASE_SHIFT, r0 + shladd LDA = LDA, BASE_SHIFT, r0 + shladd INCY = INCY, BASE_SHIFT, r0 + ;; + tbit.nz p8, p0 = A, BASE_SHIFT + tbit.nz p9, p0 = LDA, BASE_SHIFT + mov MM = M + ;; + (p8) adds MM = -1, M + ;; + (p7) br.cond.dpnt .L999 + (p6) br.cond.dpnt .L999 + ;; + sub I = A, Y + cmp.eq p10, p0 = SIZE, INCY + mov YY = Y + ;; + (p10) tbit.z.unc p10, p0 = I, BASE_SHIFT + ;; + (p10) br.cond.dptk .L10 + ;; + shr J = M, 3 + mov YY = BUFFER + ;; + (p8) adds YY = SIZE, BUFFER + ;; + mov ar.lc = J + mov YST1 = YY + adds YST2 = 4 * SIZE, YY + ;; +.L02: + STFD [YST1] = f0, 1 * SIZE + STFD [YST2] = f0, 1 * SIZE + ;; + STFD [YST1] = f0, 1 * SIZE + STFD [YST2] = f0, 1 * SIZE + ;; + STFD [YST1] = f0, 1 * SIZE + STFD [YST2] = f0, 1 * SIZE + ;; + STFD [YST1] = f0, 5 * SIZE + STFD [YST2] = f0, 5 * SIZE + br.cloop.sptk.few .L02 + ;; + +.L10: + { .mib + nop __LINE__ + shr J = N, 3 + (p9) br.cond.dptk .L100 + } + ;; + { .mib + nop __LINE__ + cmp.eq p6, 
p0 = r0, J + (p6) br.cond.dpnt .L20 + } + ;; + .align 16 + +.L11: + mov YLD1 = YY + mov YST1 = YY + adds YST2 = 4 * SIZE, YY + ;; + LDFD f8 = [X], INCX + ;; + LDFD f9 = [X], INCX + ;; + LDFD f10 = [X], INCX + ;; + LDFD f11 = [X], INCX + ;; + LDFD f12 = [X], INCX + ;; + LDFD f13 = [X], INCX + ;; + LDFD f14 = [X], INCX + ;; + LDFD f15 = [X], INCX + ;; + FMPY f8 = ALPHA, f8 + FMPY f9 = ALPHA, f9 + FMPY f10 = ALPHA, f10 + FMPY f11 = ALPHA, f11 + FMPY f12 = ALPHA, f12 + FMPY f13 = ALPHA, f13 + FMPY f14 = ALPHA, f14 + FMPY f15 = ALPHA, f15 + ;; + mov AO1 = A + add AO2 = LDA, A + ;; + shladd AO3 = LDA, 1, A + shladd AO4 = LDA, 1, AO2 + ;; + shladd AO5 = LDA, 1, AO3 + shladd AO6 = LDA, 1, AO4 + ;; + shladd AO7 = LDA, 1, AO5 + shladd AO8 = LDA, 1, AO6 + shladd A = LDA, 3, A + ;; + ;; + adds PREB = RPREFETCH * SIZE, YLD1 + adds RPRE1 = RPREFETCH * SIZE, AO1 + adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 + adds RPRE3 = RPREFETCH * SIZE, AO3 + adds RPRE4 = (RPREFETCH + 8) * SIZE, AO4 + adds RPRE5 = RPREFETCH * SIZE, AO5 + adds RPRE6 = (RPREFETCH + 8) * SIZE, AO6 + adds RPRE7 = RPREFETCH * SIZE, AO7 + adds RPRE8 = (RPREFETCH + 8) * SIZE, AO8 + + (p8) LDFD f80 = [AO1], 1 * SIZE + (p8) LDFD f81 = [AO2], 1 * SIZE + (p8) LDFD f82 = [AO3], 1 * SIZE + (p8) LDFD f83 = [AO4], 1 * SIZE + (p8) LDFD f84 = [AO5], 1 * SIZE + (p8) LDFD f85 = [AO6], 1 * SIZE + (p8) LDFD f86 = [AO7], 1 * SIZE + (p8) LDFD f87 = [AO8], 1 * SIZE + (p8) LDFD f106 = [YLD1], 1 * SIZE + ;; + (p8) FMPY f32 = f8, f80 + (p8) FMPY f33 = f9, f81 + (p8) FMPY f34 = f10, f82 + (p8) FMA f35 = f11, f83, f106 + ;; + (p8) FMA f32 = f12, f84, f32 + (p8) FMA f33 = f13, f85, f33 + (p8) FMA f34 = f14, f86, f34 + (p8) FMA f35 = f15, f87, f35 + ;; + (p8) FADD f32 = f32, f33 + (p8) FADD f34 = f34, f35 + ;; + (p8) FADD f32 = f32, f34 + ;; + (p8) STFD [YST1] = f32, 1 * SIZE + (p8) adds YST2 = 1 * SIZE, YST2 + ;; + + shr I = MM, 3 + mov pr.rot= 0 + ;; + cmp.eq p6, p0 = 0, I + cmp.eq p16, p0 = r0, r0 + ;; + adds I = -1, I + tbit.nz p13, p0 = MM, 2 + ;; + mov ar.lc = I + mov ar.ec = 2 + (p6) br.cond.dpnt .L15 + ;; + .align 16 + +.L12: + { .mmf + (p18) STFD [YST1] = f16, 1 * SIZE + (p18) STFD [YST2] = f17, 1 * SIZE + (p17) FMA f16 = f8, f33, f101 + } + { .mfi + (p17) LDFPD f93, f94 = [AO8], 2 * SIZE + (p17) FMA f17 = f8, f37, f113 + (p16) tbit.nz.unc p14, p15 = I, 0 + } + ;; + { .mmf + (p18) STFD [YST1] = f18, 1 * SIZE + (p18) STFD [YST2] = f19, 1 * SIZE + (p17) FMA f18 = f8, f34, f104 + } + { .mmf + (p14) lfetch.excl.nt1 [PREB], 16 * SIZE + (p17) LDFPD f95, f96 = [AO8], 2 * SIZE + (p17) FMA f19 = f8, f38, f116 + } + ;; + { .mmf + (p18) STFD [YST1] = f20, 1 * SIZE + (p18) STFD [YST2] = f21, 1 * SIZE + (p17) FMA f20 = f8, f35, f107 + } + { .mfi + (p16) LDFPD f32, f33 = [AO1], 2 * SIZE + (p17) FMA f21 = f8, f39, f119 + nop __LINE__ + } + ;; + { .mmf + (p18) STFD [YST1] = f22, 5 * SIZE + (p18) STFD [YST2] = f23, 5 * SIZE + (p17) FMA f22 = f8, f36, f110 + } + { .mmf + (p16) LDFPD f34, f35 = [AO1], 2 * SIZE + (p16) LDFPD f100, f103 = [YLD1], 2 * SIZE + (p17) FMA f23 = f8, f40, f122 + } + ;; + { .mmf + (p14) PREFETCH [RPRE1], 16 * SIZE + (p16) LDFPD f36, f37 = [AO1], 2 * SIZE + (p17) FMA f16 = f9, f41, f16 + } + { .mfi + (p16) LDFPD f106, f109 = [YLD1], 2 * SIZE + (p17) FMA f17 = f9, f45, f17 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f38, f39 = [AO1], 2 * SIZE + (p17) FMA f18 = f9, f42, f18 + nop __LINE__ + } + { .mfi + (p16) LDFPD f112, f115 = [YLD1], 2 * SIZE + (p17) FMA f19 = f9, f46, f19 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f40, f41 = [AO2], 2 * SIZE + 
(p17) FMA f20 = f9, f43, f20 + nop __LINE__ + } + { .mfi + (p16) LDFPD f118, f121 = [YLD1], 2 * SIZE + (p17) FMA f21 = f9, f47, f21 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f42, f43 = [AO2], 2 * SIZE + (p17) FMA f22 = f9, f44, f22 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f23 = f9, f48, f23 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f44, f45 = [AO2], 2 * SIZE + (p17) FMA f16 = f10, f49, f16 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f17 = f10, f53, f17 + nop __LINE__ + } + ;; + { .mmf + (p15) PREFETCH [RPRE2], 16 * SIZE + (p16) LDFPD f46, f47 = [AO2], 2 * SIZE + (p17) FMA f18 = f10, f50, f18 + } + { .mfi + nop __LINE__ + (p17) FMA f19 = f10, f54, f19 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f48, f49 = [AO3], 2 * SIZE + (p17) FMA f20 = f10, f51, f20 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f21 = f10, f55, f21 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f50, f51 = [AO3], 2 * SIZE + (p17) FMA f22 = f10, f52, f22 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f23 = f10, f56, f23 + nop __LINE__ + } + ;; + { .mmf + (p14) PREFETCH [RPRE3], 16 * SIZE + (p16) LDFPD f52, f53 = [AO3], 2 * SIZE + (p17) FMA f16 = f11, f57, f16 + } + { .mfi + nop __LINE__ + (p17) FMA f17 = f11, f61, f17 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f54, f55 = [AO3], 2 * SIZE + (p17) FMA f18 = f11, f58, f18 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f19 = f11, f62, f19 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f56, f57 = [AO4], 2 * SIZE + (p17) FMA f20 = f11, f59, f20 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f21 = f11, f63, f21 + nop __LINE__ + } + ;; + { .mmf + (p15) PREFETCH [RPRE4], 16 * SIZE + (p16) LDFPD f58, f59 = [AO4], 2 * SIZE + (p17) FMA f22 = f11, f60, f22 + } + { .mfi + nop __LINE__ + (p17) FMA f23 = f11, f64, f23 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f60, f61 = [AO4], 2 * SIZE + (p17) FMA f16 = f12, f65, f16 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f17 = f12, f69, f17 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f62, f63 = [AO4], 2 * SIZE + (p17) FMA f18 = f12, f66, f18 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f19 = f12, f70, f19 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f64, f65 = [AO5], 2 * SIZE + (p17) FMA f20 = f12, f67, f20 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f21 = f12, f71, f21 + nop __LINE__ + } + ;; + { .mmf + (p14) PREFETCH [RPRE5], 16 * SIZE + (p16) LDFPD f66, f67 = [AO5], 2 * SIZE + (p17) FMA f22 = f12, f68, f22 + } + { .mfi + nop __LINE__ + (p17) FMA f23 = f12, f72, f23 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f68, f69 = [AO5], 2 * SIZE + (p17) FMA f16 = f13, f73, f16 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f17 = f13, f77, f17 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f70, f71 = [AO5], 2 * SIZE + (p17) FMA f18 = f13, f74, f18 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f19 = f13, f78, f19 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f72, f73 = [AO6], 2 * SIZE + (p17) FMA f20 = f13, f75, f20 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f21 = f13, f79, f21 + nop __LINE__ + } + ;; + { .mmf + (p15) PREFETCH [RPRE6], 16 * SIZE + (p16) LDFPD f74, f75 = [AO6], 2 * SIZE + (p17) FMA f22 = f13, f76, f22 + } + { .mfi + nop __LINE__ + (p17) FMA f23 = f13, f80, f23 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f76, f77 = [AO6], 2 * SIZE + (p17) FMA f16 = f14, f81, f16 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f17 = f14, f85, f17 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f78, f79 = 
[AO6], 2 * SIZE + (p17) FMA f18 = f14, f82, f18 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f19 = f14, f86, f19 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f80, f81 = [AO7], 2 * SIZE + (p17) FMA f20 = f14, f83, f20 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f21 = f14, f87, f21 + nop __LINE__ + } + ;; + { .mmf + (p14) PREFETCH [RPRE7], 16 * SIZE + (p16) LDFPD f82, f83 = [AO7], 2 * SIZE + (p17) FMA f22 = f14, f84, f22 + } + { .mfi + nop __LINE__ + (p17) FMA f23 = f14, f88, f23 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f84, f85 = [AO7], 2 * SIZE + (p17) FMA f16 = f15, f89, f16 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f17 = f15, f93, f17 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f86, f87 = [AO7], 2 * SIZE + (p17) FMA f18 = f15, f90, f18 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f19 = f15, f94, f19 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f88, f89 = [AO8], 2 * SIZE + (p17) FMA f20 = f15, f91, f20 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f21 = f15, f95, f21 + (p16) adds I = -1, I + } + ;; + { .mmf + (p15) PREFETCH [RPRE8], 16 * SIZE + (p16) LDFPD f90, f91 = [AO8], 2 * SIZE + (p17) FMA f22 = f15, f92, f22 + } + { .mfb + nop __LINE__ + (p17) FMA f23 = f15, f96, f23 + br.ctop.sptk.few .L12 + } + ;; + .align 16 + +.L15: + { .mmi + (p18) STFD [YST1] = f16, 1 * SIZE + (p18) STFD [YST2] = f17, 1 * SIZE + tbit.nz p14, p0 = MM, 1 + } + { .mmi + (p13) LDFPD f32, f33 = [AO1], 2 * SIZE + (p13) LDFPD f100, f101 = [YLD1], 2 * SIZE + cmp.lt p6, p0 = 1, J + } + ;; + { .mmi + (p18) STFD [YST1] = f18, 1 * SIZE + (p18) STFD [YST2] = f19, 1 * SIZE + tbit.nz p15, p0 = MM, 0 + } + { .mmi + (p13) LDFPD f48, f49 = [AO1], 2 * SIZE + (p13) LDFPD f102, f103 = [YLD1], 2 * SIZE + adds J = -1, J + } + ;; + { .mmi + (p18) STFD [YST1] = f20, 1 * SIZE + (p18) STFD [YST2] = f21, 1 * SIZE + nop __LINE__ + } + { .mmi + (p14) LDFPD f64, f65 = [AO1], 2 * SIZE + (p14) LDFPD f104, f105 = [YLD1], 2 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) STFD [YST1] = f22, 5 * SIZE + (p18) STFD [YST2] = f23, 5 * SIZE + nop __LINE__ + } + { .mmi + (p13) LDFPD f34, f35 = [AO2], 2 * SIZE + (p13) LDFPD f36, f37 = [AO3], 2 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p15) LDFD f80 = [AO1] + (p15) LDFD f106 = [YLD1], 1 * SIZE + nop __LINE__ + } + { .mmi + nop __LINE__ + nop __LINE__ + } + ;; + { .mmi + (p13) LDFPD f50, f51 = [AO2], 2 * SIZE + (p13) LDFPD f52, f53 = [AO3], 2 * SIZE + nop __LINE__ + } + { .mmi + nop __LINE__ + nop __LINE__ + } + ;; + { .mmi + (p14) LDFPD f66, f67 = [AO2], 2 * SIZE + (p14) LDFPD f68, f69 = [AO3], 2 * SIZE + nop __LINE__ + } + { .mmi + nop __LINE__ + nop __LINE__ + } + ;; + { .mmi + (p15) LDFD f81 = [AO2] + (p15) LDFD f82 = [AO3] + nop __LINE__ + } + { .mmi + nop __LINE__ + nop __LINE__ + } + ;; + { .mfi + (p13) LDFPD f38, f39 = [AO4], 2 * SIZE + (p13) FMA f100 = f8, f32, f100 + nop __LINE__ + } + { .mfi + (p13) LDFPD f40, f41 = [AO5], 2 * SIZE + (p13) FMA f101 = f8, f33, f101 + nop __LINE__ + } + ;; + { .mfi + (p13) LDFPD f54, f55 = [AO4], 2 * SIZE + (p13) FMA f102 = f8, f48, f102 + nop __LINE__ + } + { .mfi + (p13) LDFPD f56, f57 = [AO5], 2 * SIZE + (p13) FMA f103 = f8, f49, f103 + nop __LINE__ + } + ;; + { .mfi + (p14) LDFPD f70, f71 = [AO4], 2 * SIZE + (p14) FMA f104 = f8, f64, f104 + nop __LINE__ + } + { .mfi + (p14) LDFPD f72, f73 = [AO5], 2 * SIZE + (p14) FMA f105 = f8, f65, f105 + nop __LINE__ + } + ;; + { .mfi + (p15) LDFD f83 = [AO4] + (p15) FMA f106 = f8, f80, f106 + nop __LINE__ + } + { .mfi + (p15) LDFD f84 = [AO5] + nop 
__LINE__ + nop __LINE__ + } + ;; + { .mfi + (p13) LDFPD f42, f43 = [AO6], 2 * SIZE + (p13) FMA f100 = f9, f34, f100 + nop __LINE__ + } + { .mfi + (p13) LDFPD f44, f45 = [AO7], 2 * SIZE + (p13) FMA f101 = f9, f35, f101 + nop __LINE__ + } + ;; + { .mfi + (p13) LDFPD f58, f59 = [AO6], 2 * SIZE + (p13) FMA f102 = f9, f50, f102 + nop __LINE__ + } + { .mfi + (p13) LDFPD f60, f61 = [AO7], 2 * SIZE + (p13) FMA f103 = f9, f51, f103 + nop __LINE__ + } + ;; + { .mfi + (p14) LDFPD f74, f75 = [AO6], 2 * SIZE + (p14) FMA f104 = f9, f66, f104 + nop __LINE__ + } + { .mfi + (p14) LDFPD f76, f77 = [AO7], 2 * SIZE + (p14) FMA f105 = f9, f67, f105 + nop __LINE__ + } + ;; + { .mfi + (p15) LDFD f85 = [AO6] + (p15) FMA f106 = f9, f81, f106 + nop __LINE__ + } + { .mfi + (p15) LDFD f86 = [AO7] + nop __LINE__ + nop __LINE__ + } + ;; + { .mfi + (p13) LDFPD f46, f47 = [AO8], 2 * SIZE + (p13) FMA f100 = f10, f36, f100 + nop __LINE__ + } + { .mfi + (p13) FMA f101 = f10, f37, f101 + nop __LINE__ + } + ;; + { .mfi + (p13) LDFPD f62, f63 = [AO8], 2 * SIZE + (p13) FMA f102 = f10, f52, f102 + nop __LINE__ + } + { .mfi + (p13) FMA f103 = f10, f53, f103 + nop __LINE__ + } + ;; + { .mfi + (p14) LDFPD f78, f79 = [AO8], 2 * SIZE + (p14) FMA f104 = f10, f68, f104 + nop __LINE__ + } + { .mfi + (p14) FMA f105 = f10, f69, f105 + nop __LINE__ + } + ;; + { .mfi + (p15) LDFD f87 = [AO8] + (p15) FMA f106 = f10, f82, f106 + nop __LINE__ + } + ;; + (p13) FMA f100 = f11, f38, f100 + (p13) FMA f101 = f11, f39, f101 + (p13) FMA f102 = f11, f54, f102 + (p13) FMA f103 = f11, f55, f103 + (p14) FMA f104 = f11, f70, f104 + (p14) FMA f105 = f11, f71, f105 + (p15) FMA f106 = f11, f83, f106 + ;; + (p13) FMA f100 = f12, f40, f100 + (p13) FMA f101 = f12, f41, f101 + (p13) FMA f102 = f12, f56, f102 + (p13) FMA f103 = f12, f57, f103 + (p14) FMA f104 = f12, f72, f104 + (p14) FMA f105 = f12, f73, f105 + (p15) FMA f106 = f12, f84, f106 + ;; + (p13) FMA f100 = f13, f42, f100 + (p13) FMA f101 = f13, f43, f101 + (p13) FMA f102 = f13, f58, f102 + (p13) FMA f103 = f13, f59, f103 + (p14) FMA f104 = f13, f74, f104 + (p14) FMA f105 = f13, f75, f105 + (p15) FMA f106 = f13, f85, f106 + ;; + (p13) FMA f100 = f14, f44, f100 + (p13) FMA f101 = f14, f45, f101 + (p13) FMA f102 = f14, f60, f102 + (p13) FMA f103 = f14, f61, f103 + (p14) FMA f104 = f14, f76, f104 + (p14) FMA f105 = f14, f77, f105 + (p15) FMA f106 = f14, f86, f106 + ;; + (p13) FMA f100 = f15, f46, f100 + (p13) FMA f101 = f15, f47, f101 + (p13) FMA f102 = f15, f62, f102 + (p13) FMA f103 = f15, f63, f103 + (p14) FMA f104 = f15, f78, f104 + (p14) FMA f105 = f15, f79, f105 + (p15) FMA f106 = f15, f87, f106 + ;; + (p13) STFD [YST1] = f100, 1 * SIZE + ;; + (p13) STFD [YST1] = f101, 1 * SIZE + ;; + (p13) STFD [YST1] = f102, 1 * SIZE + ;; + (p13) STFD [YST1] = f103, 1 * SIZE + ;; + (p14) STFD [YST1] = f104, 1 * SIZE + ;; + (p14) STFD [YST1] = f105, 1 * SIZE + ;; + (p15) STFD [YST1] = f106, 1 * SIZE + (p6) br.cond.dptk .L11 + ;; + .align 16 + + +.L20: + { .mmi + mov YLD1 = YY + mov YST1 = YY + tbit.z p6, p0 = N, 2 + } + ;; + { .mib + mov AO1 = A + mov pr.rot= 0 + (p6) br.cond.dpnt .L30 + } + ;; + { .mmi + LDFD f8 = [X], INCX + (p8) LDFD f106 = [YLD1], 1 * SIZE + add AO2 = LDA, A + } + ;; + { .mmi + LDFD f9 = [X], INCX + (p8) LDFD f80 = [AO1], 1 * SIZE + shladd AO4 = LDA, 1, AO2 + } + ;; + { .mmi + LDFD f10 = [X], INCX + (p8) LDFD f81 = [AO2], 1 * SIZE + shladd AO3 = LDA, 1, A + } + ;; + { .mmi + LDFD f11 = [X], INCX + (p8) LDFD f82 = [AO3], 1 * SIZE + } + ;; + { .mfi + (p8) LDFD f83 = [AO4], 1 * SIZE + FMPY f8 = 
ALPHA, f8 + adds PREB = RPREFETCH * SIZE, YLD1 + } + { .mfi + adds RPRE1 = RPREFETCH * SIZE, AO1 + FMPY f9 = ALPHA, f9 + adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 + } + ;; + FMPY f10 = ALPHA, f10 + shladd A = LDA, 2, A + FMPY f11 = ALPHA, f11 + ;; + { .mfi + adds RPRE3 = RPREFETCH * SIZE, AO3 + (p8) FMA f106 = f8, f80, f106 + mov ar.ec= 2 + } + ;; + adds RPRE4 = (RPREFETCH + 8) * SIZE, AO4 + (p8) FMA f106 = f9, f81, f106 + shr I = MM, 3 + ;; + { .mmf + cmp.eq p6, p0 = 0, I + cmp.eq p16, p0 = r0, r0 + (p8) FMA f106 = f10, f82, f106 + } + ;; + { .mfi + adds I = -1, I + (p8) FMA f106 = f11, f83, f106 + tbit.nz p13, p0 = MM, 2 + } + ;; + { .mib + (p8) STFD [YST1] = f106, 1 * SIZE + mov ar.lc = I + (p6) br.cond.dpnt .L25 + } + ;; + .align 16 + +.L22: + { .mfi + (p17) LDFPD f63, f64 = [AO4], 2 * SIZE + (p17) FMA f101 = f8, f33, f101 + (p16) tbit.nz.unc p14, p15 = I, 0 + } + { .mfi + (p16) LDFPD f100, f103 = [YLD1], 2 * SIZE + (p17) FMA f104 = f8, f34, f104 + } + ;; + { .mfi + (p16) LDFPD f32, f33 = [AO1], 2 * SIZE + (p17) FMA f107 = f8, f35, f107 + (p16) adds I = -1, I + } + { .mfi + (p14) PREFETCH [RPRE1], 16 * SIZE + (p17) FMA f110 = f8, f36, f110 + } + ;; + { .mfi + (p16) LDFPD f34, f35 = [AO1], 2 * SIZE + (p17) FMA f113 = f8, f37, f113 + } + { .mfi + (p16) LDFPD f106, f109 = [YLD1], 2 * SIZE + (p17) FMA f116 = f8, f38, f116 + } + ;; + { .mfi + (p16) LDFPD f36, f37 = [AO1], 2 * SIZE + (p17) FMA f119 = f8, f39, f119 + } + { .mfi + (p16) LDFPD f112, f115 = [YLD1], 2 * SIZE + (p17) FMA f122 = f8, f40, f122 + } + ;; + { .mfi + (p16) LDFPD f38, f39 = [AO1], 2 * SIZE + (p17) FMA f101 = f9, f41, f101 + } + { .mfi + (p16) LDFPD f118, f121 = [YLD1], 2 * SIZE + (p17) FMA f104 = f9, f42, f104 + } + ;; + { .mmf + (p16) LDFPD f40, f41 = [AO2], 2 * SIZE + (p15) PREFETCH [RPRE2], 16 * SIZE + (p17) FMA f107 = f9, f43, f107 + } + { .mfi + (p18) STFD [YST1] = f16, 1 * SIZE + (p17) FMA f110 = f9, f44, f110 + } + ;; + { .mfi + (p16) LDFPD f42, f43 = [AO2], 2 * SIZE + (p17) FMA f113 = f9, f45, f113 + } + { .mfi + (p18) STFD [YST1] = f17, 1 * SIZE + (p17) FMA f116 = f9, f46, f116 + } + ;; + { .mfi + (p16) LDFPD f44, f45 = [AO2], 2 * SIZE + (p17) FMA f119 = f9, f47, f119 + } + { .mfi + (p18) STFD [YST1] = f18, 1 * SIZE + (p17) FMA f122 = f9, f48, f122 + } + ;; + { .mfi + (p16) LDFPD f46, f47 = [AO2], 2 * SIZE + (p17) FMA f101 = f10, f49, f101 + } + { .mfi + (p14) lfetch.excl.nt2 [PREB], 16 * SIZE + (p17) FMA f104 = f10, f50, f104 + } + ;; + { .mfi + (p16) LDFPD f48, f49 = [AO3], 2 * SIZE + (p17) FMA f107 = f10, f51, f107 + } + { .mfi + (p14) PREFETCH [RPRE3], 16 * SIZE + (p17) FMA f110 = f10, f52, f110 + } + ;; + { .mfi + (p16) LDFPD f50, f51 = [AO3], 2 * SIZE + (p17) FMA f113 = f10, f53, f113 + } + { .mfi + (p18) STFD [YST1] = f19, 1 * SIZE + (p17) FMA f116 = f10, f54, f116 + } + ;; + { .mfi + (p16) LDFPD f52, f53 = [AO3], 2 * SIZE + (p17) FMA f119 = f10, f55, f119 + } + { .mfi + (p18) STFD [YST1] = f20, 1 * SIZE + (p17) FMA f122 = f10, f56, f122 + } + ;; + { .mfi + (p16) LDFPD f54, f55 = [AO3], 2 * SIZE + (p17) FMA f16 = f11, f57, f101 + } + { .mfi + (p15) PREFETCH [RPRE4], 16 * SIZE + (p17) FMA f17 = f11, f58, f104 + } + ;; + { .mfi + (p16) LDFPD f56, f57 = [AO4], 2 * SIZE + (p17) FMA f18 = f11, f59, f107 + } + { .mfi + (p18) STFD [YST1] = f21, 1 * SIZE + (p17) FMA f19 = f11, f60, f110 + } + ;; + { .mfi + (p16) LDFPD f58, f59 = [AO4], 2 * SIZE + (p17) FMA f20 = f11, f61, f113 + } + { .mfi + (p18) STFD [YST1] = f22, 1 * SIZE + (p17) FMA f21 = f11, f62, f116 + } + ;; + { .mfi + (p16) LDFPD f60, f61 = [AO4], 2 * 
SIZE + (p17) FMA f22 = f11, f63, f119 + } + { .mfb + (p18) STFD [YST1] = f23, 1 * SIZE + (p17) FMA f23 = f11, f64, f122 + br.ctop.sptk.few .L22 + } + ;; + .align 16 + +.L25: + { .mmi + (p13) LDFPD f32, f33 = [AO1], 2 * SIZE + (p13) LDFPD f100, f101 = [YLD1], 2 * SIZE + tbit.nz p14, p0 = MM, 1 + } + { .mmi + (p18) STFD [YST1] = f16, 1 * SIZE + } + ;; + { .mmi + (p13) LDFPD f48, f49 = [AO1], 2 * SIZE + (p13) LDFPD f102, f103 = [YLD1], 2 * SIZE + tbit.nz p15, p0 = MM, 0 + } + { .mmi + (p18) STFD [YST1] = f17, 1 * SIZE + } + ;; + { .mmi + (p14) LDFPD f64, f65 = [AO1], 2 * SIZE + (p14) LDFPD f104, f105 = [YLD1], 2 * SIZE + } + { .mmi + (p18) STFD [YST1] = f18, 1 * SIZE + } + ;; + { .mmi + (p15) LDFD f80 = [AO1] + (p15) LDFD f106 = [YLD1], 1 * SIZE + } + { .mmi + (p18) STFD [YST1] = f19, 1 * SIZE + } + ;; + { .mmi + (p13) LDFPD f34, f35 = [AO2], 2 * SIZE + (p13) LDFPD f36, f37 = [AO3], 2 * SIZE + } + { .mmi + (p18) STFD [YST1] = f20, 1 * SIZE + } + ;; + { .mmi + (p13) LDFPD f50, f51 = [AO2], 2 * SIZE + (p13) LDFPD f52, f53 = [AO3], 2 * SIZE + } + { .mmi + (p18) STFD [YST1] = f21, 1 * SIZE + } + ;; + { .mmi + (p14) LDFPD f66, f67 = [AO2], 2 * SIZE + (p14) LDFPD f68, f69 = [AO3], 2 * SIZE + } + { .mmi + (p18) STFD [YST1] = f22, 1 * SIZE + } + ;; + { .mmf + (p15) LDFD f81 = [AO2] + (p15) LDFD f82 = [AO3] + (p13) FMA f100 = f8, f32, f100 + } + { .mfi + (p18) STFD [YST1] = f23, 1 * SIZE + (p13) FMA f101 = f8, f33, f101 + } + ;; + ;; + { .mfi + (p13) LDFPD f38, f39 = [AO4], 2 * SIZE + (p13) FMA f102 = f8, f48, f102 + } + { .mfi + (p13) FMA f103 = f8, f49, f103 + } + ;; + { .mfi + (p13) LDFPD f54, f55 = [AO4], 2 * SIZE + (p14) FMA f104 = f8, f64, f104 + } + { .mfi + (p14) FMA f105 = f8, f65, f105 + } + ;; + { .mfi + (p14) LDFPD f70, f71 = [AO4], 2 * SIZE + (p15) FMA f106 = f8, f80, f106 + } + { .mfi + (p13) FMA f100 = f9, f34, f100 + } + ;; + { .mfi + (p15) LDFD f83 = [AO4] + (p13) FMA f101 = f9, f35, f101 + } + { .mfi + (p13) FMA f102 = f9, f50, f102 + } + ;; + (p13) FMA f103 = f9, f51, f103 + (p14) FMA f104 = f9, f66, f104 + (p14) FMA f105 = f9, f67, f105 + (p15) FMA f106 = f9, f81, f106 + ;; + (p13) FMA f100 = f10, f36, f100 + (p13) FMA f101 = f10, f37, f101 + (p13) FMA f102 = f10, f52, f102 + (p13) FMA f103 = f10, f53, f103 + (p14) FMA f104 = f10, f68, f104 + (p14) FMA f105 = f10, f69, f105 + (p15) FMA f106 = f10, f82, f106 + ;; + (p13) FMA f100 = f11, f38, f100 + (p13) FMA f101 = f11, f39, f101 + ;; + (p13) FMA f102 = f11, f54, f102 + (p13) STFD [YST1] = f100, 1 * SIZE + (p13) FMA f103 = f11, f55, f103 + ;; + (p13) STFD [YST1] = f101, 1 * SIZE + (p14) FMA f104 = f11, f70, f104 + ;; + (p13) STFD [YST1] = f102, 1 * SIZE + (p14) FMA f105 = f11, f71, f105 + ;; + (p13) STFD [YST1] = f103, 1 * SIZE + (p15) FMA f106 = f11, f83, f106 + ;; + (p14) STFD [YST1] = f104, 1 * SIZE + ;; + (p14) STFD [YST1] = f105, 1 * SIZE + ;; + (p15) STFD [YST1] = f106, 1 * SIZE + ;; + .align 16 + +.L30: + { .mmi + mov YLD1 = YY + mov YST1 = YY + tbit.z p6, p0 = N, 1 + } + ;; + { .mib + mov AO1 = A + mov pr.rot= 0 + (p6) br.cond.dpnt .L40 + } + ;; + { .mmi + LDFD f8 = [X], INCX + (p8) LDFD f106 = [YLD1], 1 * SIZE + add AO2 = LDA, A + } + ;; + { .mmi + LDFD f9 = [X], INCX + (p8) LDFD f80 = [AO1], 1 * SIZE + shladd A = LDA, 1, A + } + ;; + adds PREB = RPREFETCH * SIZE, YLD1 + FMPY f8 = ALPHA, f8 + mov ar.ec= 2 + adds RPRE1 = RPREFETCH * SIZE, AO1 + FMPY f9 = ALPHA, f9 + shr I = MM, 3 + ;; + (p8) LDFD f81 = [AO2], 1 * SIZE + cmp.eq p6, p0 = 0, I + ;; + (p8) FMA f106 = f8, f80, f106 + adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 + 
tbit.nz p13, p0 = MM, 2 + ;; + (p8) FMA f106 = f9, f81, f106 + cmp.eq p16, p0 = r0, r0 + adds I = -1, I + ;; + { .mib + (p8) STFD [YST1] = f106, 1 * SIZE + mov ar.lc = I + (p6) br.cond.dpnt .L35 + } + ;; + .align 16 + +.L32: + { .mfi + (p17) LDFPD f47, f48 = [AO2], 2 * SIZE + (p17) FMA f101 = f8, f33, f101 + (p16) tbit.nz.unc p14, p15 = I, 0 + } + { .mmf + (p16) LDFPD f100, f103 = [YLD1], 2 * SIZE + (p18) STFD [YST1] = f16, 1 * SIZE + (p17) FMA f104 = f8, f34, f104 + } + ;; + { .mfi + (p16) LDFPD f32, f33 = [AO1], 2 * SIZE + (p17) FMA f107 = f8, f35, f107 + adds I = -1, I + } + { .mmf + (p14) PREFETCH [RPRE1], 16 * SIZE + (p18) STFD [YST1] = f17, 1 * SIZE + (p17) FMA f110 = f8, f36, f110 + } + ;; + { .mfi + (p16) LDFPD f34, f35 = [AO1], 2 * SIZE + (p17) FMA f113 = f8, f37, f113 + } + { .mmf + (p16) LDFPD f106, f109 = [YLD1], 2 * SIZE + (p18) STFD [YST1] = f18, 1 * SIZE + (p17) FMA f116 = f8, f38, f116 + } + ;; + { .mfi + (p16) LDFPD f36, f37 = [AO1], 2 * SIZE + (p17) FMA f119 = f8, f39, f119 + } + { .mmf + (p16) LDFPD f112, f115 = [YLD1], 2 * SIZE + (p18) STFD [YST1] = f19, 1 * SIZE + (p17) FMA f122 = f8, f40, f122 + } + ;; + { .mfi + (p16) LDFPD f38, f39 = [AO1], 2 * SIZE + (p17) FMA f16 = f9, f41, f101 + } + { .mmf + (p16) LDFPD f118, f121 = [YLD1], 2 * SIZE + (p18) STFD [YST1] = f20, 1 * SIZE + (p17) FMA f17 = f9, f42, f104 + } + ;; + { .mfi + (p16) LDFPD f40, f41 = [AO2], 2 * SIZE + (p17) FMA f18 = f9, f43, f107 + } + { .mmf + (p15) PREFETCH [RPRE2], 16 * SIZE + (p18) STFD [YST1] = f21, 1 * SIZE + (p17) FMA f19 = f9, f44, f110 + } + ;; + { .mfi + (p16) LDFPD f42, f43 = [AO2], 2 * SIZE + (p17) FMA f20 = f9, f45, f113 + } + { .mmf + (p14) PREFETCH [PREB], 16 * SIZE + (p18) STFD [YST1] = f22, 1 * SIZE + (p17) FMA f21 = f9, f46, f116 + } + ;; + { .mfi + (p16) LDFPD f44, f45 = [AO2], 2 * SIZE + (p17) FMA f22 = f9, f47, f119 + } + { .mfb + (p18) STFD [YST1] = f23, 1 * SIZE + (p17) FMA f23 = f9, f48, f122 + br.ctop.sptk.few .L32 + } + ;; + .align 16 + +.L35: + { .mmi + (p13) LDFPD f32, f33 = [AO1], 2 * SIZE + (p13) LDFPD f100, f101 = [YLD1], 2 * SIZE + tbit.nz p14, p0 = MM, 1 + } + { .mmi + (p18) STFD [YST1] = f16, 1 * SIZE + } + ;; + { .mmi + (p13) LDFPD f48, f49 = [AO1], 2 * SIZE + (p13) LDFPD f102, f103 = [YLD1], 2 * SIZE + tbit.nz p15, p0 = MM, 0 + } + { .mmi + (p18) STFD [YST1] = f17, 1 * SIZE + } + ;; + { .mmi + (p14) LDFPD f64, f65 = [AO1], 2 * SIZE + (p14) LDFPD f104, f105 = [YLD1], 2 * SIZE + } + { .mmi + (p18) STFD [YST1] = f18, 1 * SIZE + } + ;; + { .mmi + (p15) LDFD f80 = [AO1] + (p15) LDFD f106 = [YLD1], 1 * SIZE + } + { .mmi + (p18) STFD [YST1] = f19, 1 * SIZE + } + ;; + { .mmi + (p13) LDFPD f34, f35 = [AO2], 2 * SIZE + (p18) STFD [YST1] = f20, 1 * SIZE + } + ;; + { .mmi + (p13) LDFPD f50, f51 = [AO2], 2 * SIZE + (p18) STFD [YST1] = f21, 1 * SIZE + } + ;; + { .mmi + (p14) LDFPD f66, f67 = [AO2], 2 * SIZE + (p18) STFD [YST1] = f22, 1 * SIZE + } + ;; + { .mmi + (p15) LDFD f81 = [AO2] + (p18) STFD [YST1] = f23, 1 * SIZE + } + ;; + (p13) FMA f100 = f8, f32, f100 + (p13) FMA f101 = f8, f33, f101 + (p13) FMA f102 = f8, f48, f102 + (p13) FMA f103 = f8, f49, f103 + (p14) FMA f104 = f8, f64, f104 + (p14) FMA f105 = f8, f65, f105 + (p15) FMA f106 = f8, f80, f106 + ;; + (p13) FMA f100 = f9, f34, f100 + (p13) FMA f101 = f9, f35, f101 + (p13) FMA f102 = f9, f50, f102 + (p13) FMA f103 = f9, f51, f103 + (p14) FMA f104 = f9, f66, f104 + (p14) FMA f105 = f9, f67, f105 + (p15) FMA f106 = f9, f81, f106 + ;; + (p13) STFD [YST1] = f100, 1 * SIZE + ;; + (p13) STFD [YST1] = f101, 1 * SIZE + ;; + (p13) 
STFD [YST1] = f102, 1 * SIZE + ;; + (p13) STFD [YST1] = f103, 1 * SIZE + ;; + (p14) STFD [YST1] = f104, 1 * SIZE + ;; + (p14) STFD [YST1] = f105, 1 * SIZE + ;; + (p15) STFD [YST1] = f106, 1 * SIZE + ;; + .align 16 + +.L40: + { .mmi + mov YLD1 = YY + mov YST1 = YY + tbit.z p6, p0 = N, 0 + } + ;; + { .mib + mov AO1 = A + mov pr.rot= 0 + (p6) br.cond.dpnt .L990 + } + ;; + { .mmi + LDFD f8 = [X], INCX + (p8) LDFD f106 = [YLD1], 1 * SIZE + adds RPRE1 = RPREFETCH * SIZE, AO1 + } + ;; + { .mii + (p8) LDFD f80 = [AO1], 1 * SIZE + adds PREB = RPREFETCH * SIZE, YLD1 + } + ;; + FMPY f8 = ALPHA, f8 + shr I = MM, 3 + ;; + (p8) FMA f106 = f8, f80, f106 + mov ar.ec= 3 + ;; + { .mmi + cmp.eq p6, p0 = 0, I + cmp.eq p16, p0 = r0, r0 + tbit.nz p14, p15 = r0, 0 + } + ;; + { .mmi + adds YST2 = 4 * SIZE, YST1 + adds I = -1, I + tbit.nz p13, p0 = MM, 2 + } + ;; + { .mmi + (p8) STFD [YST1] = f106, 1 * SIZE + (p8) adds YST2 = 1 * SIZE, YST2 + } + { .mib + mov ar.lc = I + (p6) br.cond.dpnt .L145 + } + ;; + .align 16 + +.L42: + { .mmf + (p19) STFD [YST1] = f16, 1 * SIZE + (p19) STFD [YST2] = f20, 1 * SIZE + (p18) FMA f16 = f8, f34, f102 + } + { .mmf + (p16) LDFPD f32, f35 = [AO1], 2 * SIZE + (p16) LDFPD f100, f103 = [YLD1], 2 * SIZE + (p18) FMA f20 = f8, f46, f114 + } + ;; + { .mmf + (p19) STFD [YST1] = f17, 1 * SIZE + (p19) STFD [YST2] = f21, 1 * SIZE + (p18) FMA f17 = f8, f37, f105 + } + { .mmf + (p16) LDFPD f38, f41 = [AO1], 2 * SIZE + (p16) LDFPD f106, f109 = [YLD1], 2 * SIZE + (p18) FMA f21 = f8, f49, f117 + } + ;; + { .mmf + (p19) STFD [YST1] = f18, 1 * SIZE + (p19) STFD [YST2] = f22, 1 * SIZE + (p18) FMA f18 = f8, f40, f108 + } + { .mmf + (p16) LDFPD f44, f47 = [AO1], 2 * SIZE + (p16) LDFPD f112, f115 = [YLD1], 2 * SIZE + (p18) FMA f22 = f8, f52, f120 + } + ;; + { .mmf + (p19) STFD [YST1] = f19, 5 * SIZE + (p19) STFD [YST2] = f23, 5 * SIZE + (p18) FMA f19 = f8, f43, f111 + } + { .mmf + (p16) LDFPD f50, f53 = [AO1], 2 * SIZE + (p16) LDFPD f118, f121 = [YLD1], 2 * SIZE + (p18) FMA f23 = f8, f55, f123 + } + ;; + { .mmi + (p14) PREFETCH [RPRE1], 16 * SIZE + (p14) PREFETCH [PREB], 16 * SIZE + (p16) tbit.nz.unc p14, p15 = I, 0 + } + { .mib + nop __LINE__ + (p16) adds I = -1, I + br.ctop.sptk.few .L42 + } + ;; + .align 16 + +.L45: + { .mmi + (p19) STFD [YST1] = f16, 1 * SIZE + (p19) STFD [YST2] = f20, 1 * SIZE + tbit.nz p14, p0 = MM, 1 + } + { .mmi + (p13) LDFPD f32, f33 = [AO1], 2 * SIZE + (p13) LDFPD f100, f101 = [YLD1], 2 * SIZE + } + ;; + { .mmi + (p19) STFD [YST1] = f17, 1 * SIZE + (p19) STFD [YST2] = f21, 1 * SIZE + tbit.nz p15, p0 = MM, 0 + } + { .mmi + (p13) LDFPD f48, f49 = [AO1], 2 * SIZE + (p13) LDFPD f102, f103 = [YLD1], 2 * SIZE + } + ;; + { .mmi + (p19) STFD [YST1] = f18, 1 * SIZE + (p19) STFD [YST2] = f22, 1 * SIZE + } + { .mmi + (p14) LDFPD f64, f65 = [AO1], 2 * SIZE + (p14) LDFPD f104, f105 = [YLD1], 2 * SIZE + } + ;; + { .mmi + (p19) STFD [YST1] = f19, 5 * SIZE + (p19) STFD [YST2] = f23, 5 * SIZE + } + { .mmi + (p15) LDFD f80 = [AO1] + (p15) LDFD f106 = [YLD1], 1 * SIZE + } + ;; + (p13) FMA f100 = f8, f32, f100 + (p13) FMA f101 = f8, f33, f101 + (p13) FMA f102 = f8, f48, f102 + (p13) FMA f103 = f8, f49, f103 + ;; + (p13) STFD [YST1] = f100, 1 * SIZE + (p14) FMA f104 = f8, f64, f104 + ;; + (p13) STFD [YST1] = f101, 1 * SIZE + (p14) FMA f105 = f8, f65, f105 + ;; + (p13) STFD [YST1] = f102, 1 * SIZE + (p15) FMA f106 = f8, f80, f106 + ;; + (p13) STFD [YST1] = f103, 1 * SIZE + ;; + (p14) STFD [YST1] = f104, 1 * SIZE + ;; + (p14) STFD [YST1] = f105, 1 * SIZE + ;; + (p15) STFD [YST1] = f106, 1 * SIZE + br 
.L990 + ;; + .align 16 + +.L100: + shr J = N, 3 + ;; + cmp.eq p6, p0 = r0, J + (p6) br.cond.dpnt .L120 + ;; + .align 16 + +.L111: + mov YLD1 = YY + mov YST1 = YY + ;; + LDFD f8 = [X], INCX + ;; + LDFD f9 = [X], INCX + ;; + LDFD f10 = [X], INCX + ;; + LDFD f11 = [X], INCX + ;; + LDFD f12 = [X], INCX + ;; + LDFD f13 = [X], INCX + ;; + LDFD f14 = [X], INCX + ;; + LDFD f15 = [X], INCX + ;; + FMPY f8 = ALPHA, f8 + FMPY f9 = ALPHA, f9 + FMPY f10 = ALPHA, f10 + FMPY f11 = ALPHA, f11 + FMPY f12 = ALPHA, f12 + FMPY f13 = ALPHA, f13 + FMPY f14 = ALPHA, f14 + FMPY f15 = ALPHA, f15 + ;; + mov AO1 = A + add AO2 = LDA, A + ;; + shladd AO3 = LDA, 1, A + shladd AO4 = LDA, 1, AO2 + ;; + shladd AO5 = LDA, 1, AO3 + shladd AO6 = LDA, 1, AO4 + ;; + shladd AO7 = LDA, 1, AO5 + shladd AO8 = LDA, 1, AO6 + shladd A = LDA, 3, A + ;; + ;; + adds PREB = RPREFETCH * SIZE, YLD1 + adds RPRE1 = RPREFETCH * SIZE, AO1 + adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 + adds RPRE3 = RPREFETCH * SIZE, AO3 + adds RPRE4 = (RPREFETCH + 8) * SIZE, AO4 + adds RPRE5 = RPREFETCH * SIZE, AO5 + adds RPRE6 = (RPREFETCH + 8) * SIZE, AO6 + adds RPRE7 = RPREFETCH * SIZE, AO7 + adds RPRE8 = (RPREFETCH + 8) * SIZE, AO8 + + (p8) LDFD f80 = [AO1], 1 * SIZE + (p8) LDFD f81 = [AO2], 1 * SIZE + (p8) LDFD f82 = [AO3], 1 * SIZE + (p8) LDFD f83 = [AO4], 1 * SIZE + (p8) LDFD f84 = [AO5], 1 * SIZE + (p8) LDFD f85 = [AO6], 1 * SIZE + (p8) LDFD f86 = [AO7], 1 * SIZE + (p8) LDFD f87 = [AO8], 1 * SIZE + (p8) LDFD f106 = [YLD1], 1 * SIZE + ;; + (p8) FMPY f32 = f8, f80 + (p8) FMPY f33 = f9, f81 + (p8) FMPY f34 = f10, f82 + (p8) FMA f35 = f11, f83, f106 + ;; + (p8) FMA f32 = f12, f84, f32 + (p8) FMA f33 = f13, f85, f33 + (p8) FMA f34 = f14, f86, f34 + (p8) FMA f35 = f15, f87, f35 + ;; + (p8) FADD f32 = f32, f33 + (p8) FADD f34 = f34, f35 + ;; + (p8) FADD f32 = f32, f34 + ;; + (p8) STFD [YST1] = f32, 1 * SIZE + + shr I = MM, 3 + mov pr.rot= 0 + ;; + cmp.eq p6, p0 = 0, I + cmp.eq p16, p0 = r0, r0 + ;; + adds I = -1, I + tbit.nz p13, p0 = MM, 2 + ;; + mov ar.lc = I + mov ar.ec= 2 + (p6) br.cond.dpnt .L115 + ;; + .align 16 + +.L112: + { .mfi + (p17) LDFD f96 = [AO8], 1 * SIZE + (p17) FMA f101 = f8, f33, f101 + (p16) tbit.nz.unc p14, p15 = I, 0 + } + { .mfi + (p17) FMA f104 = f8, f34, f104 + } + ;; + { .mfi + (p16) LDFPD f32, f33 = [AO1], 2 * SIZE + (p17) FMA f107 = f8, f35, f107 + } + { .mfi + (p14) PREFETCH [RPRE1], 16 * SIZE + (p17) FMA f110 = f8, f36, f110 + } + ;; + { .mfi + (p16) LDFPD f34, f35 = [AO1], 2 * SIZE + (p17) FMA f113 = f8, f37, f113 + } + { .mfi + (p16) LDFPD f100, f103 = [YLD1], 2 * SIZE + (p17) FMA f116 = f8, f38, f116 + } + ;; + { .mfi + (p16) LDFPD f36, f37 = [AO1], 2 * SIZE + (p17) FMA f119 = f8, f39, f119 + } + { .mfi + (p16) LDFPD f106, f109 = [YLD1], 2 * SIZE + (p17) FMA f122 = f8, f40, f122 + } + ;; + { .mfi + (p16) LDFPD f38, f39 = [AO1], 2 * SIZE + (p17) FMA f101 = f9, f41, f101 + } + { .mmf + (p16) LDFPD f112, f115 = [YLD1], 2 * SIZE + (p16) LDFD f40 = [AO2], 1 * SIZE + (p17) FMA f104 = f9, f42, f104 + } + ;; + { .mfi + (p16) LDFPD f41, f42 = [AO2], 2 * SIZE + (p17) FMA f107 = f9, f43, f107 + } + { .mfi + (p15) PREFETCH [RPRE2], 16 * SIZE + (p17) FMA f110 = f9, f44, f110 + } + ;; + { .mfi + (p16) LDFPD f43, f44 = [AO2], 2 * SIZE + (p17) FMA f113 = f9, f45, f113 + } + { .mfi + (p16) LDFPD f118, f121 = [YLD1], 2 * SIZE + (p17) FMA f116 = f9, f46, f116 + } + ;; + { .mfi + (p16) LDFPD f45, f46 = [AO2], 2 * SIZE + (p17) FMA f119 = f9, f47, f119 + } + { .mfi + (p18) STFD [YST1] = f16, 1 * SIZE + (p17) FMA f122 = f9, f48, f122 + } + ;; + { .mfi + 
(p16) LDFD f47 = [AO2], 1 * SIZE + (p17) FMA f101 = f10, f49, f101 + } + { .mfi + (p18) STFD [YST1] = f17, 1 * SIZE + (p17) FMA f104 = f10, f50, f104 + } + ;; + { .mfi + (p16) LDFPD f48, f49 = [AO3], 2 * SIZE + (p17) FMA f107 = f10, f51, f107 + } + { .mfi + (p14) PREFETCH [RPRE3], 16 * SIZE + (p17) FMA f110 = f10, f52, f110 + } + ;; + { .mfi + (p16) LDFPD f50, f51 = [AO3], 2 * SIZE + (p17) FMA f113 = f10, f53, f113 + } + { .mfi + (p17) FMA f116 = f10, f54, f116 + } + ;; + { .mfi + (p16) LDFPD f52, f53 = [AO3], 2 * SIZE + (p17) FMA f119 = f10, f55, f119 + } + { .mfi + (p18) STFD [YST1] = f18, 1 * SIZE + (p17) FMA f122 = f10, f56, f122 + } + ;; + { .mfi + (p16) LDFPD f54, f55 = [AO3], 2 * SIZE + (p17) FMA f101 = f11, f57, f101 + } + { .mmf + (p18) STFD [YST1] = f19, 1 * SIZE + (p16) LDFD f56 = [AO4], 1 * SIZE + (p17) FMA f104 = f11, f58, f104 + } + ;; + { .mfi + (p16) LDFPD f57, f58 = [AO4], 2 * SIZE + (p17) FMA f107 = f11, f59, f107 + } + { .mfi + (p15) PREFETCH [RPRE4], 16 * SIZE + (p17) FMA f110 = f11, f60, f110 + } + ;; + { .mfi + (p16) LDFPD f59, f60 = [AO4], 2 * SIZE + (p17) FMA f113 = f11, f61, f113 + } + { .mfi + (p17) FMA f116 = f11, f62, f116 + } + ;; + { .mfi + (p16) LDFPD f61, f62 = [AO4], 2 * SIZE + (p17) FMA f119 = f11, f63, f119 + } + { .mfi + (p17) FMA f122 = f11, f64, f122 + } + ;; + { .mfi + (p16) LDFD f63 = [AO4], 1 * SIZE + (p17) FMA f101 = f12, f65, f101 + } + { .mfi + (p18) STFD [YST1] = f20, 1 * SIZE + (p17) FMA f104 = f12, f66, f104 + } + ;; + { .mfi + (p16) LDFPD f64, f65 = [AO5], 2 * SIZE + (p17) FMA f107 = f12, f67, f107 + } + { .mfi + (p18) STFD [YST1] = f21, 1 * SIZE + (p17) FMA f110 = f12, f68, f110 + } + ;; + { .mfi + (p16) LDFPD f66, f67 = [AO5], 2 * SIZE + (p17) FMA f113 = f12, f69, f113 + } + { .mfi + (p14) PREFETCH [RPRE5], 16 * SIZE + (p17) FMA f116 = f12, f70, f116 + } + ;; + { .mfi + (p16) LDFPD f68, f69 = [AO5], 2 * SIZE + (p17) FMA f119 = f12, f71, f119 + } + { .mfi + (p18) STFD [YST1] = f22, 1 * SIZE + (p17) FMA f122 = f12, f72, f122 + } + ;; + { .mfi + (p16) LDFPD f70, f71 = [AO5], 2 * SIZE + (p17) FMA f101 = f13, f73, f101 + } + { .mmf + (p18) STFD [YST1] = f23, 1 * SIZE + (p16) LDFD f72 = [AO6], 1 * SIZE + (p17) FMA f104 = f13, f74, f104 + } + ;; + { .mfi + (p16) LDFPD f73, f74 = [AO6], 2 * SIZE + (p17) FMA f107 = f13, f75, f107 + } + { .mfi + (p15) PREFETCH [RPRE6], 16 * SIZE + (p17) FMA f110 = f13, f76, f110 + } + ;; + { .mfi + (p16) LDFPD f75, f76 = [AO6], 2 * SIZE + (p17) FMA f113 = f13, f77, f113 + } + { .mfi + (p17) FMA f116 = f13, f78, f116 + } + ;; + { .mfi + (p16) LDFPD f77, f78 = [AO6], 2 * SIZE + (p17) FMA f119 = f13, f79, f119 + } + { .mfi + (p17) FMA f122 = f13, f80, f122 + } + ;; + { .mfi + (p16) LDFD f79 = [AO6], 1 * SIZE + (p17) FMA f101 = f14, f81, f101 + } + { .mfi + (p17) FMA f104 = f14, f82, f104 + } + ;; + { .mfi + (p16) LDFPD f80, f81 = [AO7], 2 * SIZE + (p17) FMA f107 = f14, f83, f107 + } + { .mfi + (p14) PREFETCH [RPRE7], 16 * SIZE + (p17) FMA f110 = f14, f84, f110 + } + ;; + { .mfi + (p16) LDFPD f82, f83 = [AO7], 2 * SIZE + (p17) FMA f113 = f14, f85, f113 + } + { .mfi + (p17) FMA f116 = f14, f86, f116 + } + ;; + { .mfi + (p16) LDFPD f84, f85 = [AO7], 2 * SIZE + (p17) FMA f119 = f14, f87, f119 + } + { .mfi + (p17) FMA f122 = f14, f88, f122 + } + ;; + { .mfi + (p16) LDFPD f86, f87 = [AO7], 2 * SIZE + (p17) FMA f16 = f15, f89, f101 + } + { .mfi + (p16) LDFD f88 = [AO8], 1 * SIZE + (p17) FMA f17 = f15, f90, f104 + } + ;; + { .mfi + (p16) LDFPD f89, f90 = [AO8], 2 * SIZE + (p17) FMA f18 = f15, f91, f107 + } + { .mfi + (p15) 
PREFETCH [RPRE8], 16 * SIZE + (p17) FMA f19 = f15, f92, f110 + } + ;; + { .mfi + (p16) LDFPD f91, f92 = [AO8], 2 * SIZE + (p17) FMA f20 = f15, f93, f113 + } + { .mfi + (p14) lfetch.excl.nt2 [PREB], 16 * SIZE + (p17) FMA f21 = f15, f94, f116 + } + ;; + { .mfi + (p16) LDFPD f93, f94 = [AO8], 2 * SIZE + (p17) FMA f22 = f15, f95, f119 + } + { .mfb + (p16) adds I = -1, I + (p17) FMA f23 = f15, f96, f122 + br.ctop.sptk.few .L112 + } + ;; + .align 16 + +.L115: + { .mmi + (p13) LDFPD f32, f33 = [AO1], 2 * SIZE + (p13) LDFPD f100, f101 = [YLD1], 2 * SIZE + tbit.nz p14, p0 = MM, 1 + } + { .mmi + (p18) STFD [YST1] = f16, 1 * SIZE + cmp.lt p6, p0 = 1, J + adds J = -1, J + } + ;; + { .mmi + (p13) LDFPD f48, f49 = [AO1], 2 * SIZE + (p13) LDFPD f102, f103 = [YLD1], 2 * SIZE + tbit.nz p15, p0 = MM, 0 + } + { .mmi + (p18) STFD [YST1] = f17, 1 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mmi + (p14) LDFPD f64, f65 = [AO1], 2 * SIZE + (p14) LDFPD f104, f105 = [YLD1], 2 * SIZE + nop __LINE__ + } + { .mmi + (p18) STFD [YST1] = f18, 1 * SIZE + (p13) LDFD f34 = [AO2], 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p13) LDFPD f35, f50 = [AO2], 2 * SIZE + (p13) LDFPD f36, f37 = [AO3], 2 * SIZE + nop __LINE__ + } + { .mmi + (p18) STFD [YST1] = f19, 1 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mmi + (p15) LDFD f80 = [AO1] + (p15) LDFD f106 = [YLD1], 1 * SIZE + nop __LINE__ + } + { .mmi + (p18) STFD [YST1] = f20, 1 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mmi + (p13) LDFD f51 = [AO2], 1 * SIZE + (p13) LDFPD f52, f53 = [AO3], 2 * SIZE + nop __LINE__ + } + { .mmi + (p18) STFD [YST1] = f21, 1 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mmi + (p14) LDFD f66 = [AO2], 1 * SIZE + (p14) LDFPD f68, f69 = [AO3], 2 * SIZE + nop __LINE__ + } + { .mmi + (p18) STFD [YST1] = f22, 1 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mmi + (p14) LDFD f67 = [AO2], 1 * SIZE + (p15) LDFD f82 = [AO3] + nop __LINE__ + } + { .mmi + (p18) STFD [YST1] = f23, 1 * SIZE + nop __LINE__ + } + ;; + { .mmf + (p15) LDFD f81 = [AO2] + (p13) LDFD f38 = [AO4], 1 * SIZE + (p13) FMA f100 = f8, f32, f100 + } + { .mfi + (p13) LDFPD f40, f41 = [AO5], 2 * SIZE + (p13) FMA f101 = f8, f33, f101 + nop __LINE__ + } + ;; + { .mfi + (p13) LDFPD f39, f54 = [AO4], 2 * SIZE + (p13) FMA f102 = f8, f48, f102 + nop __LINE__ + } + { .mfi + (p13) LDFPD f56, f57 = [AO5], 2 * SIZE + (p13) FMA f103 = f8, f49, f103 + nop __LINE__ + } + ;; + { .mfi + (p13) LDFD f55 = [AO4], 1 * SIZE + (p14) FMA f104 = f8, f64, f104 + nop __LINE__ + } + { .mfi + (p14) LDFPD f72, f73 = [AO5], 2 * SIZE + (p14) FMA f105 = f8, f65, f105 + nop __LINE__ + } + ;; + { .mfi + (p14) LDFD f70 = [AO4], 1 * SIZE + (p15) FMA f106 = f8, f80, f106 + nop __LINE__ + } + { .mmi + (p15) LDFD f84 = [AO5] + (p13) LDFD f42 = [AO6], 1 * SIZE + nop __LINE__ + } + ;; + { .mmf + (p13) LDFPD f43, f58 = [AO6], 2 * SIZE + (p14) LDFD f71 = [AO4], 1 * SIZE + (p13) FMA f100 = f9, f34, f100 + } + { .mfi + (p13) LDFPD f44, f45 = [AO7], 2 * SIZE + (p13) FMA f101 = f9, f35, f101 + nop __LINE__ + } + ;; + { .mmf + (p13) LDFD f59 = [AO6], 1 * SIZE + (p15) LDFD f83 = [AO4] + (p13) FMA f102 = f9, f50, f102 + } + { .mfi + (p13) LDFPD f60, f61 = [AO7], 2 * SIZE + (p13) FMA f103 = f9, f51, f103 + nop __LINE__ + } + ;; + { .mfi + (p14) LDFD f74 = [AO6], 1 * SIZE + (p14) FMA f104 = f9, f66, f104 + nop __LINE__ + } + { .mfi + (p14) LDFPD f76, f77 = [AO7], 2 * SIZE + (p14) FMA f105 = f9, f67, f105 + nop __LINE__ + } + ;; + { .mfi + (p14) LDFD f75 = [AO6], 1 * SIZE + (p15) FMA f106 = f9, f81, f106 + nop __LINE__ + 
} + { .mmi + (p15) LDFD f86 = [AO7] + (p13) LDFD f46 = [AO8], 1 * SIZE + nop __LINE__ + } + ;; + { .mmf + (p13) LDFPD f47, f62 = [AO8], 2 * SIZE + (p15) LDFD f85 = [AO6] + (p13) FMA f100 = f10, f36, f100 + } + { .mfi + (p13) FMA f101 = f10, f37, f101 + nop __LINE__ + } + ;; + { .mfi + (p13) LDFD f63 = [AO8], 1 * SIZE + (p13) FMA f102 = f10, f52, f102 + nop __LINE__ + } + { .mfi + (p13) FMA f103 = f10, f53, f103 + nop __LINE__ + } + ;; + { .mfi + (p14) LDFD f78 = [AO8], 1 * SIZE + (p14) FMA f104 = f10, f68, f104 + nop __LINE__ + } + { .mfi + (p14) FMA f105 = f10, f69, f105 + nop __LINE__ + } + ;; + { .mfi + (p14) LDFD f79 = [AO8], 1 * SIZE + (p15) FMA f106 = f10, f82, f106 + nop __LINE__ + } + ;; + (p15) LDFD f87 = [AO8] + (p13) FMA f100 = f11, f38, f100 + (p13) FMA f101 = f11, f39, f101 + (p13) FMA f102 = f11, f54, f102 + (p13) FMA f103 = f11, f55, f103 + (p14) FMA f104 = f11, f70, f104 + (p14) FMA f105 = f11, f71, f105 + (p15) FMA f106 = f11, f83, f106 + ;; + (p13) FMA f100 = f12, f40, f100 + (p13) FMA f101 = f12, f41, f101 + (p13) FMA f102 = f12, f56, f102 + (p13) FMA f103 = f12, f57, f103 + (p14) FMA f104 = f12, f72, f104 + (p14) FMA f105 = f12, f73, f105 + (p15) FMA f106 = f12, f84, f106 + ;; + (p13) FMA f100 = f13, f42, f100 + (p13) FMA f101 = f13, f43, f101 + (p13) FMA f102 = f13, f58, f102 + (p13) FMA f103 = f13, f59, f103 + (p14) FMA f104 = f13, f74, f104 + (p14) FMA f105 = f13, f75, f105 + (p15) FMA f106 = f13, f85, f106 + ;; + (p13) FMA f100 = f14, f44, f100 + (p13) FMA f101 = f14, f45, f101 + (p13) FMA f102 = f14, f60, f102 + (p13) FMA f103 = f14, f61, f103 + (p14) FMA f104 = f14, f76, f104 + (p14) FMA f105 = f14, f77, f105 + (p15) FMA f106 = f14, f86, f106 + ;; + (p13) FMA f100 = f15, f46, f100 + (p13) FMA f101 = f15, f47, f101 + (p13) FMA f102 = f15, f62, f102 + (p13) FMA f103 = f15, f63, f103 + (p14) FMA f104 = f15, f78, f104 + (p14) FMA f105 = f15, f79, f105 + (p15) FMA f106 = f15, f87, f106 + ;; + (p13) STFD [YST1] = f100, 1 * SIZE + ;; + (p13) STFD [YST1] = f101, 1 * SIZE + ;; + (p13) STFD [YST1] = f102, 1 * SIZE + ;; + (p13) STFD [YST1] = f103, 1 * SIZE + ;; + (p14) STFD [YST1] = f104, 1 * SIZE + ;; + (p14) STFD [YST1] = f105, 1 * SIZE + ;; + (p15) STFD [YST1] = f106, 1 * SIZE + (p6) br.cond.dptk .L111 + ;; + .align 16 + +.L120: + { .mmi + mov YLD1 = YY + mov YST1 = YY + tbit.z p6, p0 = N, 2 + } + ;; + { .mib + mov AO1 = A + mov pr.rot= 0 + (p6) br.cond.dpnt .L130 + } + ;; + { .mmi + LDFD f8 = [X], INCX + (p8) LDFD f106 = [YLD1], 1 * SIZE + add AO2 = LDA, A + } + ;; + { .mmi + LDFD f9 = [X], INCX + (p8) LDFD f80 = [AO1], 1 * SIZE + shladd AO4 = LDA, 1, AO2 + } + ;; + { .mmi + LDFD f10 = [X], INCX + (p8) LDFD f81 = [AO2], 1 * SIZE + shladd AO3 = LDA, 1, A + } + ;; + { .mmi + LDFD f11 = [X], INCX + (p8) LDFD f82 = [AO3], 1 * SIZE + } + ;; + { .mfi + (p8) LDFD f83 = [AO4], 1 * SIZE + FMPY f8 = ALPHA, f8 + adds PREB = RPREFETCH * SIZE, YLD1 + } + { .mfi + adds RPRE1 = RPREFETCH * SIZE, AO1 + FMPY f9 = ALPHA, f9 + adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 + } + ;; + FMPY f10 = ALPHA, f10 + shladd A = LDA, 2, A + FMPY f11 = ALPHA, f11 + ;; + { .mfi + adds RPRE3 = RPREFETCH * SIZE, AO3 + (p8) FMA f106 = f8, f80, f106 + mov ar.ec= 2 + } + ;; + adds RPRE4 = (RPREFETCH + 8) * SIZE, AO4 + (p8) FMA f106 = f9, f81, f106 + shr I = MM, 3 + ;; + { .mmf + cmp.eq p6, p0 = 0, I + cmp.eq p16, p0 = r0, r0 + (p8) FMA f106 = f10, f82, f106 + } + ;; + { .mfi + adds I = -1, I + (p8) FMA f106 = f11, f83, f106 + tbit.nz p13, p0 = MM, 2 + } + ;; + { .mib + (p8) STFD [YST1] = f106, 1 * SIZE + mov 
ar.lc = I + (p6) br.cond.dpnt .L125 + } + ;; + .align 16 + +.L122: + { .mfi + (p17) LDFD f64 = [AO4], 1 * SIZE + (p17) FMA f101 = f8, f33, f101 + (p16) tbit.nz.unc p14, p15 = I, 0 + } + { .mfi + (p16) LDFPD f100, f103 = [YLD1], 2 * SIZE + (p17) FMA f104 = f8, f34, f104 + } + ;; + { .mfi + (p16) LDFPD f32, f33 = [AO1], 2 * SIZE + (p17) FMA f107 = f8, f35, f107 + (p16) adds I = -1, I + } + { .mfi + (p14) PREFETCH [RPRE1], 16 * SIZE + (p17) FMA f110 = f8, f36, f110 + } + ;; + { .mfi + (p16) LDFPD f34, f35 = [AO1], 2 * SIZE + (p17) FMA f113 = f8, f37, f113 + } + { .mfi + (p16) LDFPD f106, f109 = [YLD1], 2 * SIZE + (p17) FMA f116 = f8, f38, f116 + } + ;; + { .mfi + (p16) LDFPD f36, f37 = [AO1], 2 * SIZE + (p17) FMA f119 = f8, f39, f119 + } + { .mfi + (p16) LDFPD f112, f115 = [YLD1], 2 * SIZE + (p17) FMA f122 = f8, f40, f122 + } + ;; + { .mfi + (p16) LDFPD f38, f39 = [AO1], 2 * SIZE + (p17) FMA f101 = f9, f41, f101 + } + { .mmf + (p16) LDFPD f118, f121 = [YLD1], 2 * SIZE + (p16) LDFD f40 = [AO2], 1 * SIZE + (p17) FMA f104 = f9, f42, f104 + } + ;; + { .mmf + (p16) LDFPD f41, f42 = [AO2], 2 * SIZE + (p15) PREFETCH [RPRE2], 16 * SIZE + (p17) FMA f107 = f9, f43, f107 + } + { .mfi + (p18) STFD [YST1] = f16, 1 * SIZE + (p17) FMA f110 = f9, f44, f110 + } + ;; + { .mfi + (p16) LDFPD f43, f44 = [AO2], 2 * SIZE + (p17) FMA f113 = f9, f45, f113 + } + { .mfi + (p18) STFD [YST1] = f17, 1 * SIZE + (p17) FMA f116 = f9, f46, f116 + } + ;; + { .mfi + (p16) LDFPD f45, f46 = [AO2], 2 * SIZE + (p17) FMA f119 = f9, f47, f119 + } + { .mfi + (p18) STFD [YST1] = f18, 1 * SIZE + (p17) FMA f122 = f9, f48, f122 + } + ;; + { .mfi + (p16) LDFD f47 = [AO2], 1 * SIZE + (p17) FMA f101 = f10, f49, f101 + } + { .mfi + (p14) lfetch.excl.nt2 [PREB], 16 * SIZE + (p17) FMA f104 = f10, f50, f104 + } + ;; + { .mfi + (p16) LDFPD f48, f49 = [AO3], 2 * SIZE + (p17) FMA f107 = f10, f51, f107 + } + { .mfi + (p14) PREFETCH [RPRE3], 16 * SIZE + (p17) FMA f110 = f10, f52, f110 + } + ;; + { .mfi + (p16) LDFPD f50, f51 = [AO3], 2 * SIZE + (p17) FMA f113 = f10, f53, f113 + } + { .mfi + (p18) STFD [YST1] = f19, 1 * SIZE + (p17) FMA f116 = f10, f54, f116 + } + ;; + { .mfi + (p16) LDFPD f52, f53 = [AO3], 2 * SIZE + (p17) FMA f119 = f10, f55, f119 + } + { .mfi + (p18) STFD [YST1] = f20, 1 * SIZE + (p17) FMA f122 = f10, f56, f122 + } + ;; + { .mfi + (p16) LDFPD f54, f55 = [AO3], 2 * SIZE + (p17) FMA f16 = f11, f57, f101 + } + { .mmf + (p15) PREFETCH [RPRE4], 16 * SIZE + (p16) LDFD f56 = [AO4], 1 * SIZE + (p17) FMA f17 = f11, f58, f104 + } + ;; + { .mfi + (p16) LDFPD f57, f58 = [AO4], 2 * SIZE + (p17) FMA f18 = f11, f59, f107 + } + { .mfi + (p18) STFD [YST1] = f21, 1 * SIZE + (p17) FMA f19 = f11, f60, f110 + } + ;; + { .mfi + (p16) LDFPD f59, f60 = [AO4], 2 * SIZE + (p17) FMA f20 = f11, f61, f113 + } + { .mfi + (p18) STFD [YST1] = f22, 1 * SIZE + (p17) FMA f21 = f11, f62, f116 + } + ;; + { .mfi + (p16) LDFPD f61, f62 = [AO4], 2 * SIZE + (p17) FMA f22 = f11, f63, f119 + } + { .mfb + (p18) STFD [YST1] = f23, 1 * SIZE + (p17) FMA f23 = f11, f64, f122 + br.ctop.sptk.few .L122 + } + ;; + .align 16 + +.L125: + { .mmi + (p13) LDFPD f32, f33 = [AO1], 2 * SIZE + (p13) LDFPD f100, f101 = [YLD1], 2 * SIZE + tbit.nz p14, p0 = MM, 1 + } + { .mmi + (p18) STFD [YST1] = f16, 1 * SIZE + } + ;; + { .mmi + (p13) LDFPD f48, f49 = [AO1], 2 * SIZE + (p13) LDFPD f102, f103 = [YLD1], 2 * SIZE + tbit.nz p15, p0 = MM, 0 + } + { .mmi + (p18) STFD [YST1] = f17, 1 * SIZE + } + ;; + { .mmi + (p14) LDFPD f64, f65 = [AO1], 2 * SIZE + (p14) LDFPD f104, f105 = [YLD1], 2 * SIZE + } + { 
.mmi + (p18) STFD [YST1] = f18, 1 * SIZE + } + ;; + { .mmi + (p18) STFD [YST1] = f19, 1 * SIZE + (p15) LDFD f80 = [AO1] + } + { .mmi + (p15) LDFD f106 = [YLD1], 1 * SIZE + (p13) LDFD f34 = [AO2], 1 * SIZE + } + ;; + { .mmi + (p13) LDFPD f35, f50 = [AO2], 2 * SIZE + (p13) LDFPD f36, f37 = [AO3], 2 * SIZE + } + { .mmi + (p18) STFD [YST1] = f20, 1 * SIZE + } + ;; + { .mmi + (p13) LDFD f51 = [AO2], 1 * SIZE + (p13) LDFPD f52, f53 = [AO3], 2 * SIZE + } + { .mmi + (p18) STFD [YST1] = f21, 1 * SIZE + } + ;; + { .mmi + (p14) LDFD f66 = [AO2], 1 * SIZE + (p14) LDFPD f68, f69 = [AO3], 2 * SIZE + } + { .mmi + (p18) STFD [YST1] = f22, 1 * SIZE + } + ;; + { .mmf + (p18) STFD [YST1] = f23, 1 * SIZE + (p14) LDFD f67 = [AO2], 1 * SIZE + (p13) FMA f100 = f8, f32, f100 + } + { .mmf + (p15) LDFD f82 = [AO3] + (p13) LDFD f38 = [AO4], 1 * SIZE + (p13) FMA f101 = f8, f33, f101 + } + ;; + ;; + { .mmf + (p13) LDFPD f39, f54 = [AO4], 2 * SIZE + (p15) LDFD f81 = [AO2] + (p13) FMA f102 = f8, f48, f102 + } + { .mfi + (p13) FMA f103 = f8, f49, f103 + } + ;; + { .mfi + (p13) LDFD f55 = [AO4], 1 * SIZE + (p14) FMA f104 = f8, f64, f104 + } + { .mfi + (p14) FMA f105 = f8, f65, f105 + } + ;; + { .mfi + (p14) LDFD f70 = [AO4], 1 * SIZE + (p15) FMA f106 = f8, f80, f106 + } + { .mfi + (p13) FMA f100 = f9, f34, f100 + } + ;; + { .mfi + (p14) LDFD f71 = [AO4], 1 * SIZE + (p13) FMA f101 = f9, f35, f101 + } + { .mfi + (p13) FMA f102 = f9, f50, f102 + } + ;; + (p15) LDFD f83 = [AO4] + (p13) FMA f103 = f9, f51, f103 + (p14) FMA f104 = f9, f66, f104 + (p14) FMA f105 = f9, f67, f105 + (p15) FMA f106 = f9, f81, f106 + ;; + (p13) FMA f100 = f10, f36, f100 + (p13) FMA f101 = f10, f37, f101 + (p13) FMA f102 = f10, f52, f102 + (p13) FMA f103 = f10, f53, f103 + (p14) FMA f104 = f10, f68, f104 + (p14) FMA f105 = f10, f69, f105 + (p15) FMA f106 = f10, f82, f106 + ;; + (p13) FMA f100 = f11, f38, f100 + (p13) FMA f101 = f11, f39, f101 + ;; + (p13) FMA f102 = f11, f54, f102 + (p13) STFD [YST1] = f100, 1 * SIZE + (p13) FMA f103 = f11, f55, f103 + ;; + (p13) STFD [YST1] = f101, 1 * SIZE + (p14) FMA f104 = f11, f70, f104 + ;; + (p13) STFD [YST1] = f102, 1 * SIZE + (p14) FMA f105 = f11, f71, f105 + ;; + (p13) STFD [YST1] = f103, 1 * SIZE + (p15) FMA f106 = f11, f83, f106 + ;; + (p14) STFD [YST1] = f104, 1 * SIZE + ;; + (p14) STFD [YST1] = f105, 1 * SIZE + ;; + (p15) STFD [YST1] = f106, 1 * SIZE + ;; + .align 16 + +.L130: + { .mmi + mov YLD1 = YY + mov YST1 = YY + tbit.z p6, p0 = N, 1 + } + ;; + { .mib + mov AO1 = A + mov pr.rot= 0 + (p6) br.cond.dpnt .L140 + } + ;; + { .mmi + LDFD f8 = [X], INCX + (p8) LDFD f106 = [YLD1], 1 * SIZE + add AO2 = LDA, A + } + ;; + { .mmi + LDFD f9 = [X], INCX + (p8) LDFD f80 = [AO1], 1 * SIZE + shladd A = LDA, 1, A + } + ;; + adds PREB = RPREFETCH * SIZE, YLD1 + FMPY f8 = ALPHA, f8 + mov ar.ec= 2 + adds RPRE1 = RPREFETCH * SIZE, AO1 + FMPY f9 = ALPHA, f9 + shr I = MM, 3 + ;; + (p8) LDFD f81 = [AO2], 1 * SIZE + cmp.eq p6, p0 = 0, I + ;; + (p8) FMA f106 = f8, f80, f106 + adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 + tbit.nz p13, p0 = MM, 2 + ;; + (p8) FMA f106 = f9, f81, f106 + cmp.eq p16, p0 = r0, r0 + adds I = -1, I + ;; + { .mib + (p8) STFD [YST1] = f106, 1 * SIZE + mov ar.lc = I + (p6) br.cond.dpnt .L135 + } + ;; + .align 16 + +.L132: + { .mfi + (p17) LDFD f48 = [AO2], 1 * SIZE + (p17) FMA f101 = f8, f33, f101 + (p16) tbit.nz.unc p14, p15 = I, 0 + } + { .mmf + (p16) LDFPD f100, f103 = [YLD1], 2 * SIZE + (p18) STFD [YST1] = f16, 1 * SIZE + (p17) FMA f104 = f8, f34, f104 + } + ;; + { .mfi + (p16) LDFPD f32, f33 = [AO1], 2 
* SIZE + (p17) FMA f107 = f8, f35, f107 + adds I = -1, I + } + { .mmf + (p14) PREFETCH [RPRE1], 16 * SIZE + (p18) STFD [YST1] = f17, 1 * SIZE + (p17) FMA f110 = f8, f36, f110 + } + ;; + { .mfi + (p16) LDFPD f34, f35 = [AO1], 2 * SIZE + (p17) FMA f113 = f8, f37, f113 + } + { .mmf + (p16) LDFPD f106, f109 = [YLD1], 2 * SIZE + (p18) STFD [YST1] = f18, 1 * SIZE + (p17) FMA f116 = f8, f38, f116 + } + ;; + { .mfi + (p16) LDFPD f36, f37 = [AO1], 2 * SIZE + (p17) FMA f119 = f8, f39, f119 + } + { .mmf + (p16) LDFPD f112, f115 = [YLD1], 2 * SIZE + (p18) STFD [YST1] = f19, 1 * SIZE + (p17) FMA f122 = f8, f40, f122 + } + ;; + { .mmf + (p16) LDFPD f38, f39 = [AO1], 2 * SIZE + (p16) LDFPD f118, f121 = [YLD1], 2 * SIZE + (p17) FMA f16 = f9, f41, f101 + } + { .mmf + (p18) STFD [YST1] = f20, 1 * SIZE + (p16) LDFD f40 = [AO2], 1 * SIZE + (p17) FMA f17 = f9, f42, f104 + } + ;; + { .mfi + (p16) LDFPD f41, f42 = [AO2], 2 * SIZE + (p17) FMA f18 = f9, f43, f107 + } + { .mmf + (p15) PREFETCH [RPRE2], 16 * SIZE + (p18) STFD [YST1] = f21, 1 * SIZE + (p17) FMA f19 = f9, f44, f110 + } + ;; + { .mfi + (p16) LDFPD f43, f44 = [AO2], 2 * SIZE + (p17) FMA f20 = f9, f45, f113 + } + { .mmf + (p14) PREFETCH [PREB], 16 * SIZE + (p18) STFD [YST1] = f22, 1 * SIZE + (p17) FMA f21 = f9, f46, f116 + } + ;; + { .mfi + (p16) LDFPD f45, f46 = [AO2], 2 * SIZE + (p17) FMA f22 = f9, f47, f119 + } + { .mfb + (p18) STFD [YST1] = f23, 1 * SIZE + (p17) FMA f23 = f9, f48, f122 + br.ctop.sptk.few .L132 + } + ;; + .align 16 + +.L135: + { .mmi + (p13) LDFPD f32, f33 = [AO1], 2 * SIZE + (p13) LDFPD f100, f101 = [YLD1], 2 * SIZE + tbit.nz p14, p0 = MM, 1 + } + { .mmi + (p18) STFD [YST1] = f16, 1 * SIZE + } + ;; + { .mmi + (p13) LDFPD f48, f49 = [AO1], 2 * SIZE + (p13) LDFPD f102, f103 = [YLD1], 2 * SIZE + tbit.nz p15, p0 = MM, 0 + } + { .mmi + (p18) STFD [YST1] = f17, 1 * SIZE + } + ;; + { .mmi + (p14) LDFPD f64, f65 = [AO1], 2 * SIZE + (p14) LDFPD f104, f105 = [YLD1], 2 * SIZE + } + { .mmi + (p18) STFD [YST1] = f18, 1 * SIZE + } + ;; + { .mmi + (p15) LDFD f80 = [AO1] + (p15) LDFD f106 = [YLD1], 1 * SIZE + } + { .mmi + (p18) STFD [YST1] = f19, 1 * SIZE + } + ;; + { .mmi + (p13) LDFD f34 = [AO2], 1 * SIZE + (p18) STFD [YST1] = f20, 1 * SIZE + } + ;; + { .mmi + (p13) LDFD f35 = [AO2], 1 * SIZE + (p18) STFD [YST1] = f21, 1 * SIZE + } + ;; + { .mmi + (p13) LDFD f50 = [AO2], 1 * SIZE + (p18) STFD [YST1] = f22, 1 * SIZE + } + ;; + { .mmi + (p13) LDFD f51 = [AO2], 1 * SIZE + (p18) STFD [YST1] = f23, 1 * SIZE + } + ;; + (p14) LDFD f66 = [AO2], 1 * SIZE + (p13) FMA f100 = f8, f32, f100 + ;; + (p14) LDFD f67 = [AO2], 1 * SIZE + (p13) FMA f101 = f8, f33, f101 + ;; + (p15) LDFD f81 = [AO2] + (p13) FMA f102 = f8, f48, f102 + (p13) FMA f103 = f8, f49, f103 + (p14) FMA f104 = f8, f64, f104 + (p14) FMA f105 = f8, f65, f105 + (p15) FMA f106 = f8, f80, f106 + ;; + (p13) FMA f100 = f9, f34, f100 + (p13) FMA f101 = f9, f35, f101 + (p13) FMA f102 = f9, f50, f102 + (p13) FMA f103 = f9, f51, f103 + (p14) FMA f104 = f9, f66, f104 + (p14) FMA f105 = f9, f67, f105 + (p15) FMA f106 = f9, f81, f106 + ;; + (p13) STFD [YST1] = f100, 1 * SIZE + ;; + (p13) STFD [YST1] = f101, 1 * SIZE + ;; + (p13) STFD [YST1] = f102, 1 * SIZE + ;; + (p13) STFD [YST1] = f103, 1 * SIZE + ;; + (p14) STFD [YST1] = f104, 1 * SIZE + ;; + (p14) STFD [YST1] = f105, 1 * SIZE + ;; + (p15) STFD [YST1] = f106, 1 * SIZE + ;; + .align 16 + +.L140: + { .mmi + mov YLD1 = YY + mov YST1 = YY + tbit.z p6, p0 = N, 0 + } + ;; + { .mib + mov AO1 = A + mov pr.rot= 0 + (p6) br.cond.dpnt .L990 + } + ;; + { .mmi + LDFD 
f8 = [X], INCX + (p8) LDFD f106 = [YLD1], 1 * SIZE + adds RPRE1 = RPREFETCH * SIZE, AO1 + } + ;; + { .mmi + (p8) LDFD f80 = [AO1], 1 * SIZE + adds PREB = RPREFETCH * SIZE, YLD1 + } + ;; + FMPY f8 = ALPHA, f8 + shr I = MM, 3 + ;; + (p8) FMA f106 = f8, f80, f106 + mov ar.ec= 3 + ;; + { .mmi + cmp.eq p6, p0 = 0, I + cmp.eq p16, p0 = r0, r0 + tbit.nz p14, p15 = r0, 0 + } + ;; + { .mmi + adds YST2 = 4 * SIZE, YST1 + adds I = -1, I + tbit.nz p13, p0 = MM, 2 + } + ;; + { .mmi + (p8) STFD [YST1] = f106, 1 * SIZE + (p8) adds YST2 = 1 * SIZE, YST2 + } + { .mib + mov ar.lc = I + (p6) br.cond.dpnt .L145 + } + ;; + .align 16 + +.L142: + { .mmf + (p19) STFD [YST1] = f16, 1 * SIZE + (p19) STFD [YST2] = f20, 1 * SIZE + (p18) FMA f16 = f8, f34, f102 + } + { .mmf + (p16) LDFPD f32, f35 = [AO1], 2 * SIZE + (p16) LDFPD f100, f103 = [YLD1], 2 * SIZE + (p18) FMA f20 = f8, f46, f114 + } + ;; + { .mmf + (p19) STFD [YST1] = f17, 1 * SIZE + (p19) STFD [YST2] = f21, 1 * SIZE + (p18) FMA f17 = f8, f37, f105 + } + { .mmf + (p16) LDFPD f38, f41 = [AO1], 2 * SIZE + (p16) LDFPD f106, f109 = [YLD1], 2 * SIZE + (p18) FMA f21 = f8, f49, f117 + } + ;; + { .mmf + (p19) STFD [YST1] = f18, 1 * SIZE + (p19) STFD [YST2] = f22, 1 * SIZE + (p18) FMA f18 = f8, f40, f108 + } + { .mmf + (p16) LDFPD f44, f47 = [AO1], 2 * SIZE + (p16) LDFPD f112, f115 = [YLD1], 2 * SIZE + (p18) FMA f22 = f8, f52, f120 + } + ;; + { .mmf + (p19) STFD [YST1] = f19, 5 * SIZE + (p19) STFD [YST2] = f23, 5 * SIZE + (p18) FMA f19 = f8, f43, f111 + } + { .mmf + (p16) LDFPD f50, f53 = [AO1], 2 * SIZE + (p16) LDFPD f118, f121 = [YLD1], 2 * SIZE + (p18) FMA f23 = f8, f55, f123 + } + ;; + { .mmi + (p14) PREFETCH [RPRE1], 16 * SIZE + (p14) PREFETCH [PREB], 16 * SIZE + (p16) tbit.nz.unc p14, p15 = I, 0 + } + { .mib + nop __LINE__ + (p16) adds I = -1, I + br.ctop.sptk.few .L142 + } + ;; + .align 16 + +.L145: + { .mmi + (p19) STFD [YST1] = f16, 1 * SIZE + (p19) STFD [YST2] = f20, 1 * SIZE + tbit.nz p14, p0 = MM, 1 + } + { .mmi + (p13) LDFPD f32, f33 = [AO1], 2 * SIZE + (p13) LDFPD f100, f101 = [YLD1], 2 * SIZE + } + ;; + { .mmi + (p19) STFD [YST1] = f17, 1 * SIZE + (p19) STFD [YST2] = f21, 1 * SIZE + tbit.nz p15, p0 = MM, 0 + } + { .mmi + (p13) LDFPD f48, f49 = [AO1], 2 * SIZE + (p13) LDFPD f102, f103 = [YLD1], 2 * SIZE + } + ;; + { .mmi + (p19) STFD [YST1] = f18, 1 * SIZE + (p19) STFD [YST2] = f22, 1 * SIZE + } + { .mmi + (p14) LDFPD f64, f65 = [AO1], 2 * SIZE + (p14) LDFPD f104, f105 = [YLD1], 2 * SIZE + } + ;; + { .mmi + (p19) STFD [YST1] = f19, 5 * SIZE + (p19) STFD [YST2] = f23, 5 * SIZE + } + { .mmi + (p15) LDFD f80 = [AO1] + (p15) LDFD f106 = [YLD1], 1 * SIZE + } + ;; + (p13) FMA f100 = f8, f32, f100 + (p13) FMA f101 = f8, f33, f101 + (p13) FMA f102 = f8, f48, f102 + (p13) FMA f103 = f8, f49, f103 + (p14) FMA f104 = f8, f64, f104 + (p14) FMA f105 = f8, f65, f105 + (p15) FMA f106 = f8, f80, f106 + ;; + (p13) STFD [YST1] = f100, 1 * SIZE + ;; + (p13) STFD [YST1] = f101, 1 * SIZE + ;; + (p13) STFD [YST1] = f102, 1 * SIZE + ;; + (p13) STFD [YST1] = f103, 1 * SIZE + ;; + (p14) STFD [YST1] = f104, 1 * SIZE + ;; + (p14) STFD [YST1] = f105, 1 * SIZE + ;; + (p15) STFD [YST1] = f106, 1 * SIZE + ;; + .align 16 + +.L990: + { .mmi + mov YLD1 = YY + mov YST1 = Y + mov pr.rot= 0 + } + { .mib + mov YST2 = Y + shr J = M, 3 + (p10) br.cond.dptk .L999 + } + ;; + { .mmi + cmp.eq p6, p0 = r0, J + adds J = -1, J + mov ar.ec = 4 + } + { .mmi + cmp.eq p16, p0 = r0, r0 + nop __LINE__ + tbit.nz p13, p0 = M, 2 + } + ;; + { .mib + nop __LINE__ + mov ar.lc = J + (p6) br.cond.dpnt .L995 + } + 
;; +.L992: + { .mfi + (p19) STFD [YST2] = f35 + (p18) FADD f34 = f34, f66 + (p19) add YST2 = YST2, INCY + } + { .mmi + (p16) LDFD f64 = [YLD1], 1 * SIZE + (p16) LDFD f32 = [YST1], INCY + } + ;; + { .mfi + (p19) STFD [YST2] = f39 + (p18) FADD f38 = f38, f70 + (p19) add YST2 = YST2, INCY + } + { .mmi + (p16) LDFD f36 = [YST1], INCY + (p16) LDFD f68 = [YLD1], 1 * SIZE + } + ;; + { .mfi + (p19) STFD [YST2] = f43 + (p18) FADD f42 = f42, f74 + (p19) add YST2 = YST2, INCY + } + { .mmi + (p16) LDFD f72 = [YLD1], 1 * SIZE + (p16) LDFD f40 = [YST1], INCY + } + ;; + { .mfi + (p19) STFD [YST2] = f47 + (p18) FADD f46 = f46, f78 + (p19) add YST2 = YST2, INCY + } + { .mmi + (p16) LDFD f76 = [YLD1], 1 * SIZE + (p16) LDFD f44 = [YST1], INCY + } + ;; + { .mfi + (p19) STFD [YST2] = f51 + (p18) FADD f50 = f50, f82 + (p19) add YST2 = YST2, INCY + } + { .mmi + (p16) LDFD f80 = [YLD1], 1 * SIZE + (p16) LDFD f48 = [YST1], INCY + } + ;; + { .mfi + (p19) STFD [YST2] = f55 + (p18) FADD f54 = f54, f86 + (p19) add YST2 = YST2, INCY + } + { .mmi + (p16) LDFD f84 = [YLD1], 1 * SIZE + (p16) LDFD f52 = [YST1], INCY + } + ;; + { .mfi + (p19) STFD [YST2] = f59 + (p18) FADD f58 = f58, f90 + (p19) add YST2 = YST2, INCY + } + { .mmi + (p16) LDFD f88 = [YLD1], 1 * SIZE + (p16) LDFD f56 = [YST1], INCY + } + ;; + { .mfi + (p19) STFD [YST2] = f63 + (p18) FADD f62 = f62, f94 + (p19) add YST2 = YST2, INCY + } + { .mmb + (p16) LDFD f92 = [YLD1], 1 * SIZE + (p16) LDFD f60 = [YST1], INCY + br.ctop.sptk.few .L992 + } + ;; + +.L995: + (p13) LDFD f32 = [YST1], INCY + (p13) LDFD f40 = [YLD1], 1 * SIZE + tbit.nz p14, p0 = M, 1 + ;; + (p13) LDFD f33 = [YST1], INCY + (p13) LDFD f41 = [YLD1], 1 * SIZE + tbit.nz p15, p0 = M, 0 + ;; + (p13) LDFD f34 = [YST1], INCY + (p13) LDFD f42 = [YLD1], 1 * SIZE + ;; + (p13) LDFD f35 = [YST1], INCY + (p13) LDFD f43 = [YLD1], 1 * SIZE + ;; + (p14) LDFD f36 = [YST1], INCY + (p14) LDFD f44 = [YLD1], 1 * SIZE + ;; + (p14) LDFD f37 = [YST1], INCY + (p14) LDFD f45 = [YLD1], 1 * SIZE + ;; + (p15) LDFD f38 = [YST1], INCY + (p15) LDFD f46 = [YLD1], 1 * SIZE + ;; + (p13) FADD f32 = f32, f40 + (p13) FADD f33 = f33, f41 + (p13) FADD f34 = f34, f42 + (p13) FADD f35 = f35, f43 + (p14) FADD f36 = f36, f44 + (p14) FADD f37 = f37, f45 + (p15) FADD f38 = f38, f46 + ;; + (p13) STFD [YST2] = f32 + (p13) add YST2 = YST2, INCY + ;; + (p13) STFD [YST2] = f33 + (p13) add YST2 = YST2, INCY + ;; + (p13) STFD [YST2] = f34 + (p13) add YST2 = YST2, INCY + ;; + (p13) STFD [YST2] = f35 + (p13) add YST2 = YST2, INCY + ;; + (p14) STFD [YST2] = f36 + (p14) add YST2 = YST2, INCY + ;; + (p14) STFD [YST2] = f37 + (p14) add YST2 = YST2, INCY + ;; + (p15) STFD [YST2] = f38 + ;; + +.L999: + mov r8 = r0 + adds r9 = 1 * 16, SP + ;; + ldf.fill f16 = [SP], 32 + ldf.fill f17 = [r9], 32 + mov ar.lc = ARLC + ;; + ldf.fill f18 = [SP], 32 + ldf.fill f19 = [r9], 32 + mov pr = PR, -1 + ;; + ldf.fill f20 = [SP], 32 + ldf.fill f21 = [r9], 32 + mov ar.pfs = ARPFS + ;; + ldf.fill f22 = [SP], 32 + ldf.fill f23 = [r9] + br.ret.sptk.many b0 + ;; + EPILOGUE diff --git a/kernel/ia64/gemv_t.S b/kernel/ia64/gemv_t.S new file mode 100644 index 0000000000..6bc579ed59 --- /dev/null +++ b/kernel/ia64/gemv_t.S @@ -0,0 +1,3557 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define P 4096 +#define SP r12 + +#define M r32 +#define N r33 +#define A r36 +#define LDA r37 +#define X r38 +#define INCX r39 +#define Y r34 +#define INCY r35 +#define BUFFER r11 + +#define MIN_M r14 +#define I r15 +#define J r16 +#define IS r17 +#define AO1 r18 +#define AO2 r19 +#define AO3 r20 +#define AO4 r21 +#define AO5 r22 +#define AO6 r23 +#define AO7 r24 +#define AO8 r25 +#define BO r26 +#define LDAP r27 + +#define RPRE1 loc0 +#define RPRE2 loc1 +#define RPRE3 loc2 +#define RPRE4 loc3 +#define RPRE5 loc4 +#define RPRE6 loc5 +#define RPRE7 loc6 +#define RPRE8 loc7 + +#define AO21 loc8 +#define AO41 loc9 +#define AO61 loc10 +#define AO81 loc11 + +#define PREB r8 +#define WPRE r9 +#define OFFSET PREB +#define CO r10 + +#define ARLC r29 +#define PR r30 +#define ARPFS r31 + +#ifdef DOUBLE +#define RPREFETCH (16 * 3 + 8) +#else +#define RPREFETCH (16 * 3 + 16) +#endif +#define PREFETCH lfetch.nt1 + +#define ALPHA f6 + + PROLOGUE + .prologue + PROFCODE + { .mmi + .save ar.pfs, ARPFS + alloc ARPFS = ar.pfs, 8, 16, 8, 0 + setf.sig f11 = LDA + mov ARLC = ar.lc + } + { .mmi + adds r15 = 24, SP + adds r16 = 32, SP + adds r14 = 16, SP + } + ;; + { .mmi + setf.sig f10 = N + ld8 Y = [r14] + mov PR = pr + } + { .mmi + ld8 INCY = [r15] + adds r8 = -8 * 16, SP + adds r9 = -7 * 16, SP + } + ;; + { .mmi + stf.spill [r8] = f16, 32 + stf.spill [r9] = f17, 32 + adds SP = -8 * 16, SP + } + ;; + { .mmf + stf.spill [r8] = f18, 32 + stf.spill [r9] = f19, 32 + mov ALPHA = f8 + } + ;; + { .mmi + stf.spill [r8] = f20, 32 + stf.spill [r9] = f21, 32 + mov IS = 0 + } + ;; + { .mmf + stf.spill [r8] = f22 + stf.spill [r9] = f23 + xmpy.l f10 = f10, f11 + } + .body + ;; + ;; + { .mmi + ld8 BUFFER = [r16] + cmp.ge p7, p0 = r0, M + cmp.ge p6, p0 = r0, N + } + ;; + { .mmi + shladd INCX = INCX, BASE_SHIFT, r0 + shladd LDA = LDA, BASE_SHIFT, r0 + shladd INCY = INCY, BASE_SHIFT, r0 + } + ;; + { .mmi + getf.sig LDAP = 
f10 + mov r2 = P + tbit.nz p8, p0 = A, BASE_SHIFT + } + { .mmi + nop __LINE__ + nop __LINE__ + tbit.nz p9, p0 = LDA, BASE_SHIFT + } + ;; + { .mbb + sub LDAP = r2, LDAP + (p7) br.cond.dpnt .L999 + (p6) br.cond.dpnt .L999 + } + .align 16 + ;; + +.LIs_loop: + { .mmi + sub MIN_M = M, IS + (p8) LDFD f32 = [X], INCX + mov pr.rot= 0 + } + { .mmi + mov AO1 = BUFFER + adds AO2 = 4 * SIZE, BUFFER + } + ;; + cmp.le p6, p0 = r2, MIN_M + ;; + (p6) mov MIN_M = P + ;; + (p8) adds MIN_M = -1, MIN_M + ;; + { .mmi + shladd OFFSET = INCX, 2, INCX + shladd BO = INCX, 2, X + shr I = MIN_M, 3 + } + ;; + { .mmi + adds I = -1, I + cmp.eq p16, p0 = r0, r0 + mov ar.ec= 5 + } + ;; + { .mmi + (p8) STFD [AO1] = f32, 2 * SIZE + (p8) adds AO2 = 6 * SIZE, BUFFER + mov ar.lc = I + } + { .mib + cmp.gt p6, p0 = 0, I + tbit.nz p13, p0 = MIN_M, 2 + (p6) br.cond.dpnt .L05 + } + ;; + .align 16 + +.L01: + (p20) STFD [AO1] = f36, SIZE + (p20) STFD [AO2] = f56, SIZE + (p16) LDFD f32 = [X], INCX + (p16) LDFD f52 = [BO], INCX + ;; + (p20) STFD [AO1] = f41, SIZE + (p20) STFD [AO2] = f61, SIZE + (p16) LDFD f37 = [X], INCX + (p16) LDFD f57 = [BO], INCX + ;; + (p20) STFD [AO1] = f46, SIZE + (p20) STFD [AO2] = f66, SIZE + (p16) LDFD f42 = [X], INCX + (p16) LDFD f62 = [BO], INCX + ;; + (p20) STFD [AO1] = f51, 5 * SIZE + (p20) STFD [AO2] = f71, 5 * SIZE + (p16) LDFD f47 = [X], OFFSET + (p16) LDFD f67 = [BO], OFFSET + br.ctop.sptk.few .L01 + ;; + .align 16 + +.L05: + (p13) LDFD f32 = [X], INCX + tbit.nz p14, p0 = MIN_M, 1 + ;; + (p13) LDFD f33 = [X], INCX + tbit.nz p15, p0 = MIN_M, 0 + ;; + (p13) LDFD f34 = [X], INCX + ;; + (p13) LDFD f35 = [X], INCX + ;; + (p14) LDFD f36 = [X], INCX + ;; + (p13) STFD [AO1] = f32, SIZE + (p14) LDFD f37 = [X], INCX + ;; + (p13) STFD [AO1] = f33, SIZE + (p15) LDFD f38 = [X], INCX + ;; + (p13) STFD [AO1] = f34, SIZE + ;; + (p13) STFD [AO1] = f35, SIZE + ;; + (p14) STFD [AO1] = f36, SIZE + ;; + (p14) STFD [AO1] = f37, SIZE + ;; + (p15) STFD [AO1] = f38, SIZE + (p9) br.cond.dpnt .L100 + ;; + .align 16 + +.L10: + { .mmi + mov CO = Y + nop __LINE__ + shr J = N, 3 + } + ;; + { .mib + nop __LINE__ + cmp.eq p6, p0 = r0, J + (p6) br.cond.dpnt .L20 + } + ;; + .align 16 + +.L11: + { .mfi + mov AO1 = A + mov f8 = f0 + mov pr.rot= 0 + } + { .mfi + add AO2 = LDA, A + mov f10 = f0 + shr I = MIN_M, 4 + } + ;; + { .mmf + shladd AO3 = LDA, 1, A + shladd AO4 = LDA, 1, AO2 + mov f12 = f0 + } + { .mmf + (p8) LDFD f32 = [AO1], SIZE + (p8) LDFD f33 = [AO2], SIZE + mov f14 = f0 + } + ;; + { .mmf + shladd AO5 = LDA, 1, AO3 + shladd AO6 = LDA, 1, AO4 + mov f16 = f0 + } + { .mmf + (p8) LDFD f34 = [AO3], SIZE + (p8) LDFD f35 = [AO4], SIZE + mov f18 = f0 + } + ;; + { .mmf + shladd AO7 = LDA, 1, AO5 + shladd AO8 = LDA, 1, AO6 + mov f20 = f0 + } + { .mmf + (p8) LDFD f36 = [AO5], SIZE + (p8) LDFD f37 = [AO6], SIZE + mov f22 = f0 + } + ;; + { .mfi + (p8) LDFD f38 = [AO7], SIZE + mov f9 = f0 + mov ar.ec= 2 + } + { .mmf + (p8) LDFD f39 = [AO8], SIZE + mov BO = BUFFER + mov f11 = f0 + } + ;; + { .mmf + (p8) LDFD f40 = [BO], 2 * SIZE + cmp.eq p6, p0 = 0, I + mov f13 = f0 + } + { .mmf + shladd A = LDA, 3, A + cmp.eq p16, p0 = r0, r0 + mov f15 = f0 + } + ;; + { .mmf + add I = I, I + nop __LINE__ + mov f17 = f0 + } + { .mmf + adds RPRE1 = RPREFETCH * SIZE, AO1 + adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 + mov f19 = f0 + } + ;; + { .mmf + adds I = -1, I + nop __LINE__ + mov f21 = f0 + } + { .mmf + adds RPRE3 = RPREFETCH * SIZE, AO3 + adds RPRE4 = (RPREFETCH + 8) * SIZE, AO4 + mov f23 = f0 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + (p8) FMPY f8 
= f40, f32 + } + { .mmf + adds RPRE5 = RPREFETCH * SIZE, AO5 + adds RPRE6 = (RPREFETCH + 8) * SIZE, AO6 + (p8) FMPY f10 = f40, f33 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + (p8) FMPY f12 = f40, f34 + } + { .mmf + adds RPRE7 = RPREFETCH * SIZE, AO7 + adds RPRE8 = (RPREFETCH + 8) * SIZE, AO8 + (p8) FMPY f14 = f40, f35 + } + ;; + { .mfi + nop __LINE__ + (p8) FMPY f16 = f40, f36 + mov ar.lc = I + } + { .mmf + adds WPRE = 8 * SIZE, CO + adds PREB = RPREFETCH * SIZE, BO + (p8) FMPY f18 = f40, f37 + } + ;; + { .mmf + lfetch.excl.nt1 [WPRE] + nop __LINE__ + (p8) FMPY f20 = f40, f38 + } + { .mfb + nop __LINE__ + (p8) FMPY f22 = f40, f39 + (p6) br.cond.dpnt .L15 + } + ;; + .align 16 + +.L12: + { .mfi + (p17) LDFPD f95, f96 = [AO8], 2 * SIZE + (p17) FMA f8 = f104, f33, f8 + (p16) tbit.nz.unc p14, p15 = I, 0 + } + { .mfi + (p17) LDFPD f110, f111 = [BO], 2 * SIZE + (p17) FMA f9 = f105, f34, f9 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f32, f33 = [AO1], 2 * SIZE + (p17) FMA f10 = f104, f35, f10 + nop __LINE__ + } + { .mfi + (p14) PREFETCH [RPRE1], 16 * SIZE + (p17) FMA f11 = f105, f36, f11 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f34, f35 = [AO2], 2 * SIZE + (p17) FMA f12 = f104, f37, f12 + nop __LINE__ + } + { .mfi + (p15) PREFETCH [RPRE2], 16 * SIZE + (p17) FMA f13 = f105, f38, f13 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f36, f37 = [AO3], 2 * SIZE + (p17) FMA f14 = f104, f39, f14 + nop __LINE__ + } + { .mfi + (p14) PREFETCH [RPRE3], 16 * SIZE + (p17) FMA f15 = f105, f40, f15 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f38, f39 = [AO4], 2 * SIZE + (p17) FMA f16 = f104, f41, f16 + nop __LINE__ + } + { .mfi + (p15) PREFETCH [RPRE4], 16 * SIZE + (p17) FMA f17 = f105, f42, f17 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f40, f41 = [AO5], 2 * SIZE + (p17) FMA f18 = f104, f43, f18 + nop __LINE__ + } + { .mfi + (p14) PREFETCH [RPRE5], 16 * SIZE + (p17) FMA f19 = f105, f44, f19 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f42, f43 = [AO6], 2 * SIZE + (p17) FMA f20 = f104, f45, f20 + nop __LINE__ + } + { .mfi + (p15) PREFETCH [RPRE6], 16 * SIZE + (p17) FMA f21 = f105, f46, f21 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f44, f45 = [AO7], 2 * SIZE + (p17) FMA f22 = f104, f47, f22 + nop __LINE__ + } + { .mfi + (p14) PREFETCH [RPRE7], 16 * SIZE + (p17) FMA f23 = f105, f48, f23 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f46, f47 = [AO8], 2 * SIZE + (p17) FMA f8 = f106, f49, f8 + nop __LINE__ + } + { .mfi + (p15) PREFETCH [RPRE8], 16 * SIZE + (p17) FMA f9 = f107, f50, f9 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f48, f49 = [AO1], 2 * SIZE + (p17) FMA f10 = f106, f51, f10 + nop __LINE__ + } + { .mfi + (p14) PREFETCH [PREB], 16 * SIZE + (p17) FMA f11 = f107, f52, f11 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f50, f51 = [AO2], 2 * SIZE + (p17) FMA f12 = f106, f53, f12 + nop __LINE__ + } + { .mfi + (p16) LDFPD f103, f104 = [BO], 2 * SIZE + (p17) FMA f13 = f107, f54, f13 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f52, f53 = [AO3], 2 * SIZE + (p17) FMA f14 = f106, f55, f14 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f15 = f107, f56, f15 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f54, f55 = [AO4], 2 * SIZE + (p17) FMA f16 = f106, f57, f16 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f17 = f107, f58, f17 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f56, f57 = [AO5], 2 * SIZE + (p17) FMA f18 = f106, f59, f18 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f19 = f107, f60, f19 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f58, f59 = 
[AO6], 2 * SIZE + (p17) FMA f20 = f106, f61, f20 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f21 = f107, f62, f21 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f60, f61 = [AO7], 2 * SIZE + (p17) FMA f22 = f106, f63, f22 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f23 = f107, f64, f23 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f62, f63 = [AO8], 2 * SIZE + (p17) FMA f8 = f108, f65, f8 + nop __LINE__ + } + { .mfi + (p16) LDFPD f105, f106 = [BO], 2 * SIZE + (p17) FMA f9 = f109, f66, f9 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f64, f65 = [AO1], 2 * SIZE + (p17) FMA f10 = f108, f67, f10 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f11 = f109, f68, f11 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f66, f67 = [AO2], 2 * SIZE + (p17) FMA f12 = f108, f69, f12 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f13 = f109, f70, f13 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f68, f69 = [AO3], 2 * SIZE + (p17) FMA f14 = f108, f71, f14 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f15 = f109, f72, f15 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f70, f71 = [AO4], 2 * SIZE + (p17) FMA f16 = f108, f73, f16 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f17 = f109, f74, f17 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f72, f73 = [AO5], 2 * SIZE + (p17) FMA f18 = f108, f75, f18 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f19 = f109, f76, f19 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f74, f75 = [AO6], 2 * SIZE + (p17) FMA f20 = f108, f77, f20 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f21 = f109, f78, f21 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f76, f77 = [AO7], 2 * SIZE + (p17) FMA f22 = f108, f79, f22 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f23 = f109, f80, f23 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f107, f108 = [BO], 2 * SIZE + (p17) FMA f8 = f110, f81, f8 + nop __LINE__ + } + { .mfi + (p16) LDFPD f78, f79 = [AO8], 2 * SIZE + (p17) FMA f9 = f111, f82, f9 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f80, f81 = [AO1], 2 * SIZE + (p17) FMA f10 = f110, f83, f10 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f11 = f111, f84, f11 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f82, f83 = [AO2], 2 * SIZE + (p17) FMA f12 = f110, f85, f12 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f13 = f111, f86, f13 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f84, f85 = [AO3], 2 * SIZE + (p17) FMA f14 = f110, f87, f14 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f15 = f111, f88, f15 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f86, f87 = [AO4], 2 * SIZE + (p17) FMA f16 = f110, f89, f16 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f17 = f111, f90, f17 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f88, f89 = [AO5], 2 * SIZE + (p17) FMA f18 = f110, f91, f18 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f19 = f111, f92, f19 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f90, f91 = [AO6], 2 * SIZE + (p17) FMA f20 = f110, f93, f20 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p17) FMA f21 = f111, f94, f21 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f92, f93 = [AO7], 2 * SIZE + (p17) FMA f22 = f110, f95, f22 + nop __LINE__ + } + { .mfb + adds I = -1, I + (p17) FMA f23 = f111, f96, f23 + br.ctop.sptk.few .L12 + } + ;; + .align 16 + +.L15: + and I = 15, MIN_M + mov pr.rot= 0 + ;; + cmp.eq p6, p0 = 0, I + cmp.eq p16, p15 = r0, r0 + ;; + adds I = 1, I + ;; + shr I = I, 1 + ;; + adds I = -1, I + ;; + mov ar.lc = I + mov ar.ec= 3 + and I = 15, MIN_M + (p6) 
br.cond.dpnt .L18 + ;; + .align 16 + +.L16: + { .mfi + (p16) LDFPD f104, f107 = [BO], 2 * SIZE + (p18) FMA f8 = f106, f34, f8 + nop __LINE__ + } + { .mfi + (p16) LDFPD f32, f35 = [AO1], 2 * SIZE + (p15) FMA f9 = f109, f37, f9 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f38, f41 = [AO2], 2 * SIZE + (p18) FMA f10 = f106, f40, f10 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p15) FMA f11 = f109, f43, f11 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f44, f47 = [AO3], 2 * SIZE + (p18) FMA f12 = f106, f46, f12 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p15) FMA f13 = f109, f49, f13 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f50, f53 = [AO4], 2 * SIZE + (p18) FMA f14 = f106, f52, f14 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p15) FMA f15 = f109, f55, f15 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f56, f59 = [AO5], 2 * SIZE + (p18) FMA f16 = f106, f58, f16 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p15) FMA f17 = f109, f61, f17 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f62, f65 = [AO6], 2 * SIZE + (p18) FMA f18 = f106, f64, f18 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p15) FMA f19 = f109, f67, f19 + (p17) adds I = -2, I + } + ;; + { .mfi + (p16) LDFPD f68, f71 = [AO7], 2 * SIZE + (p18) FMA f20 = f106, f70, f20 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p15) FMA f21 = f109, f73, f21 + nop __LINE__ + } + ;; + { .mfi + (p16) LDFPD f74, f77 = [AO8], 2 * SIZE + (p15) FMA f23 = f109, f79, f23 + (p17) cmp.ne.unc p15, p0 = -1, I + } + { .mfb + nop __LINE__ + (p18) FMA f22 = f106, f76, f22 + br.ctop.sptk.few .L16 + } + ;; + +.L18: + { .mmf + mov AO1 = CO + LDFD f32 = [CO], INCY + FADD f8 = f8, f9 + } + ;; + { .mmf + LDFD f33 = [CO], INCY + nop __LINE__ + FADD f10 = f10, f11 + } + ;; + { .mmf + LDFD f34 = [CO], INCY + nop __LINE__ + FADD f12 = f12, f13 + } + ;; + { .mmf + LDFD f35 = [CO], INCY + nop __LINE__ + FADD f14 = f14, f15 + } + ;; + { .mmf + LDFD f36 = [CO], INCY + nop __LINE__ + FADD f16 = f16, f17 + } + ;; + { .mmf + LDFD f37 = [CO], INCY + nop __LINE__ + FADD f18 = f18, f19 + } + ;; + { .mmf + LDFD f38 = [CO], INCY + nop __LINE__ + FADD f20 = f20, f21 + } + ;; + { .mmf + LDFD f39 = [CO], INCY + nop __LINE__ + FADD f22 = f22, f23 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + FMA f32 = ALPHA, f8, f32 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f33 = ALPHA, f10, f33 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f34 = ALPHA, f12, f34 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f35 = ALPHA, f14, f35 + } + ;; + { .mmf + STFD [AO1] = f32 + add AO1 = AO1, INCY + FMA f36 = ALPHA, f16, f36 + } + ;; + { .mmf + STFD [AO1] = f33 + add AO1 = AO1, INCY + FMA f37 = ALPHA, f18, f37 + } + ;; + { .mmf + STFD [AO1] = f34 + add AO1 = AO1, INCY + FMA f38 = ALPHA, f20, f38 + } + ;; + { .mmf + STFD [AO1] = f35 + add AO1 = AO1, INCY + FMA f39 = ALPHA, f22, f39 + } + ;; + { .mmi + STFD [AO1] = f36 + add AO1 = AO1, INCY + adds J = -1, J + } + ;; + { .mmi + STFD [AO1] = f37 + add AO1 = AO1, INCY + nop __LINE__ + } + ;; + { .mmi + STFD [AO1] = f38 + add AO1 = AO1, INCY + cmp4.lt p6, p0 = 0, J + } + ;; + { .mib + STFD [AO1] = f39 + add AO1 = AO1, INCY + (p6) br.cond.dptk .L11 + } + ;; + .align 16 + +.L20: + { .mfi + mov AO1 = A + mov f8 = f0 + mov pr.rot= 0 + } + { .mfi + add AO2 = LDA, A + mov f10 = f0 + tbit.z p6, p0 = N, 2 + } + ;; + { .mfi + shladd AO3 = LDA, 1, A + mov f12 = f0 + shr I = MIN_M, 4 + } + { .mfb + shladd AO4 = LDA, 1, AO2 + mov f14 = f0 + (p6) br.cond.dpnt .L30 + } + ;; + { .mmf + (p8) LDFD f32 = [AO1], SIZE + (p8) LDFD f33 = [AO2], SIZE + mov f9 = f0 + 
} + { .mmf + mov BO = BUFFER + shladd A = LDA, 2, A + mov f11 = f0 + } + ;; + { .mmf + (p8) LDFD f40 = [BO], 2 * SIZE + cmp.eq p6, p0 = 0, I + mov f13 = f0 + } + { .mmf + (p8) LDFD f34 = [AO3], SIZE + (p8) LDFD f35 = [AO4], SIZE + mov f15 = f0 + } + ;; + { .mmi + adds RPRE1 = RPREFETCH * SIZE, AO1 + adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 + mov ar.ec= 2 + } + { .mmi + cmp.eq p16, p0 = r0, r0 + add I = I, I + } + ;; + { .mmf + adds WPRE = 4 * SIZE, CO + adds PREB = RPREFETCH * SIZE, BO + (p8) FMPY f8 = f40, f32 + } + { .mmf + adds RPRE3 = RPREFETCH * SIZE, AO3 + adds I = -1, I + (p8) FMPY f10 = f40, f33 + } + ;; + { .mfi + lfetch.excl.nt1 [WPRE] + (p8) FMPY f12 = f40, f34 + mov ar.lc = I + } + { .mfb + adds RPRE4 = (RPREFETCH + 8) * SIZE, AO4 + (p8) FMPY f14 = f40, f35 + (p6) br.cond.dpnt .L25 + } + ;; + .align 16 + +.L22: + { .mmf + (p17) LDFPD f87, f88 = [AO4], 2 * SIZE + (p17) LDFPD f110, f111 = [BO], 2 * SIZE + (p17) FMA f8 = f104, f33, f8 + } + { .mfi + nop __LINE__ + (p17) FMA f9 = f105, f34, f9 + (p16) tbit.nz.unc p14, p15 = I, 0 + } + ;; + { .mmf + (p14) PREFETCH [RPRE1], 16 * SIZE + (p16) LDFPD f32, f33 = [AO1], 2 * SIZE + (p17) FMA f10 = f104, f35, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f11 = f105, f36, f11 + } + ;; + { .mmf + (p15) PREFETCH [RPRE2], 16 * SIZE + (p16) LDFPD f34, f35 = [AO2], 2 * SIZE + (p17) FMA f12 = f104, f37, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f13 = f105, f38, f13 + } + ;; + { .mmf + (p14) PREFETCH [RPRE3], 16 * SIZE + (p16) LDFPD f36, f37 = [AO3], 2 * SIZE + (p17) FMA f14 = f104, f39, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f15 = f105, f40, f15 + } + ;; + { .mmf + (p15) PREFETCH [RPRE4], 16 * SIZE + (p16) LDFPD f38, f39 = [AO4], 2 * SIZE + (p17) FMA f8 = f106, f49, f8 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f9 = f107, f50, f9 + } + ;; + { .mmf + (p14) PREFETCH [PREB], 16 * SIZE + (p16) LDFPD f48, f49 = [AO1], 2 * SIZE + (p17) FMA f10 = f106, f51, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f11 = f107, f52, f11 + } + ;; + { .mmf + (p16) LDFPD f50, f51 = [AO2], 2 * SIZE + (p16) LDFPD f103, f104 = [BO], 2 * SIZE + (p17) FMA f12 = f106, f53, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f13 = f107, f54, f13 + } + ;; + { .mmf + (p16) LDFPD f52, f53 = [AO3], 2 * SIZE + nop __LINE__ + (p17) FMA f14 = f106, f55, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f15 = f107, f56, f15 + } + ;; + { .mmf + (p16) LDFPD f54, f55 = [AO4], 2 * SIZE + (p16) LDFPD f105, f106 = [BO], 2 * SIZE + (p17) FMA f8 = f108, f65, f8 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f9 = f109, f66, f9 + } + ;; + { .mmf + (p16) LDFPD f64, f65 = [AO1], 2 * SIZE + nop __LINE__ + (p17) FMA f10 = f108, f67, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f11 = f109, f68, f11 + } + ;; + { .mmf + (p16) LDFPD f66, f67 = [AO2], 2 * SIZE + nop __LINE__ + (p17) FMA f12 = f108, f69, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f13 = f109, f70, f13 + } + ;; + { .mmf + (p16) LDFPD f68, f69 = [AO3], 2 * SIZE + nop __LINE__ + (p17) FMA f14 = f108, f71, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f15 = f109, f72, f15 + } + ;; + { .mmf + (p16) LDFPD f70, f71 = [AO4], 2 * SIZE + (p16) LDFPD f107, f108 = [BO], 2 * SIZE + (p17) FMA f8 = f110, f81, f8 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f9 = f111, f82, f9 + } + ;; + { .mmf + (p16) LDFPD f80, f81 = [AO1], 2 * SIZE + nop __LINE__ + (p17) FMA f10 = f110, f83, f10 + } + { .mmf + nop 
__LINE__ + nop __LINE__ + (p17) FMA f11 = f111, f84, f11 + } + ;; + { .mmf + (p16) LDFPD f82, f83 = [AO2], 2 * SIZE + nop __LINE__ + (p17) FMA f12 = f110, f85, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f13 = f111, f86, f13 + } + ;; + { .mmf + (p16) LDFPD f84, f85 = [AO3], 2 * SIZE + nop __LINE__ + (p17) FMA f14 = f110, f87, f14 + } + { .mfb + adds I = -1, I + (p17) FMA f15 = f111, f88, f15 + br.ctop.sptk.few .L22 + } + ;; + .align 16 + +.L25: + and I = 15, MIN_M + mov pr.rot= 0 + ;; + cmp.eq p6, p0 = 0, I + cmp.eq p16, p15 = r0, r0 + ;; + adds I = 1, I + ;; + shr I = I, 1 + ;; + adds I = -1, I + ;; + mov ar.lc = I + mov ar.ec= 3 + and I = 15, MIN_M + (p6) br.cond.dpnt .L28 + ;; + .align 16 + +.L26: + { .mmf + (p16) LDFPD f104, f107 = [BO], 2 * SIZE + (p16) LDFPD f32, f35 = [AO1], 2 * SIZE + (p18) FMA f8 = f106, f34, f8 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p15) FMA f9 = f109, f37, f9 + } + ;; + { .mmf + (p16) LDFPD f38, f41 = [AO2], 2 * SIZE + nop __LINE__ + (p18) FMA f10 = f106, f40, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p15) FMA f11 = f109, f43, f11 + } + ;; + { .mmf + (p16) LDFPD f44, f47 = [AO3], 2 * SIZE + nop __LINE__ + (p18) FMA f12 = f106, f46, f12 + } + { .mmf + nop __LINE__ + (p17) adds I = -2, I + (p15) FMA f13 = f109, f49, f13 + } + ;; + { .mmf + (p16) LDFPD f50, f53 = [AO4], 2 * SIZE + nop __LINE__ + (p15) FMA f15 = f109, f55, f15 + } + { .mfb + (p17) cmp.ne.unc p15, p0 = -1, I + (p18) FMA f14 = f106, f52, f14 + br.ctop.sptk.few .L26 + } + ;; + +.L28: + { .mmf + mov AO1 = CO + LDFD f32 = [CO], INCY + FADD f8 = f8, f9 + } + ;; + { .mmf + LDFD f33 = [CO], INCY + nop __LINE__ + FADD f10 = f10, f11 + } + ;; + { .mmf + LDFD f34 = [CO], INCY + nop __LINE__ + FADD f12 = f12, f13 + } + ;; + { .mmf + LDFD f35 = [CO], INCY + nop __LINE__ + FADD f14 = f14, f15 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + FMA f32 = ALPHA, f8, f32 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f33 = ALPHA, f10, f33 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f34 = ALPHA, f12, f34 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f35 = ALPHA, f14, f35 + } + ;; + { .mmf + STFD [AO1] = f32 + add AO1 = AO1, INCY + } + ;; + { .mmf + STFD [AO1] = f33 + add AO1 = AO1, INCY + } + ;; + { .mmf + STFD [AO1] = f34 + add AO1 = AO1, INCY + } + ;; + { .mmf + STFD [AO1] = f35 + add AO1 = AO1, INCY + } + ;; + .align 16 + +.L30: + { .mfi + mov AO1 = A + mov f8 = f0 + mov pr.rot= 0 + } + { .mfi + add AO2 = LDA, A + mov f10 = f0 + tbit.z p6, p0 = N, 1 + } + ;; + { .mfi + mov BO = BUFFER + mov f12 = f0 + shr I = MIN_M, 4 + } + { .mfb + adds WPRE = 4 * SIZE, CO + mov f14 = f0 + (p6) br.cond.dpnt .L40 + } + ;; + { .mmf + (p8) LDFD f32 = [AO1], SIZE + (p8) LDFD f33 = [AO2], SIZE + mov f9 = f0 + } + { .mfi + shladd A = LDA, 1, A + mov f11 = f0 + mov ar.ec= 2 + } + ;; + { .mmf + (p8) LDFD f40 = [BO], 2 * SIZE + cmp.eq p6, p0 = 0, I + mov f13 = f0 + } + { .mmf + adds RPRE1 = RPREFETCH * SIZE, AO1 + add I = I, I + mov f15 = f0 + } + ;; + { .mmi + cmp.eq p16, p0 = r0, r0 + adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 + adds I = -1, I + } + ;; + { .mfi + lfetch.excl.nt1 [WPRE] + (p8) FMPY f8 = f40, f32 + mov ar.lc = I + } + { .mfb + adds PREB = RPREFETCH * SIZE, BO + (p8) FMPY f10 = f40, f33 + (p6) br.cond.dpnt .L35 + } + ;; + .align 16 + +.L32: + { .mmf + (p17) LDFPD f83, f84 = [AO2], 2 * SIZE + (p17) LDFPD f110, f111 = [BO], 2 * SIZE + (p17) FMA f8 = f104, f33, f8 + } + { .mfi + nop __LINE__ + (p17) FMA f9 = f105, f34, f9 + (p16) tbit.nz.unc p14, p15 = I, 0 + } + ;; + { .mmf + (p14) 
PREFETCH [RPRE1], 16 * SIZE + (p16) LDFPD f32, f33 = [AO1], 2 * SIZE + (p17) FMA f10 = f104, f35, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f11 = f105, f36, f11 + } + ;; + { .mmf + (p15) PREFETCH [RPRE2], 16 * SIZE + (p16) LDFPD f34, f35 = [AO2], 2 * SIZE + (p17) FMA f8 = f106, f49, f8 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f9 = f107, f50, f9 + } + ;; + { .mmf + (p14) PREFETCH [PREB], 16 * SIZE + (p16) LDFPD f48, f49 = [AO1], 2 * SIZE + (p17) FMA f10 = f106, f51, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f11 = f107, f52, f11 + } + ;; + { .mmf + (p16) LDFPD f50, f51 = [AO2], 2 * SIZE + (p16) LDFPD f103, f104 = [BO], 2 * SIZE + (p17) FMA f8 = f108, f65, f8 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f9 = f109, f66, f9 + } + ;; + { .mmf + (p16) LDFPD f105, f106 = [BO], 2 * SIZE + (p16) LDFPD f64, f65 = [AO1], 2 * SIZE + (p17) FMA f10 = f108, f67, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f11 = f109, f68, f11 + } + ;; + { .mmf + (p16) LDFPD f66, f67 = [AO2], 2 * SIZE + (p16) LDFPD f107, f108 = [BO], 2 * SIZE + (p17) FMA f8 = f110, f81, f8 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f9 = f111, f82, f9 + } + ;; + { .mmf + (p16) LDFPD f80, f81 = [AO1], 2 * SIZE + nop __LINE__ + (p17) FMA f10 = f110, f83, f10 + } + { .mfb + adds I = -1, I + (p17) FMA f11 = f111, f84, f11 + br.ctop.sptk.few .L32 + } + ;; + .align 16 + +.L35: + and I = 15, MIN_M + ;; + cmp.eq p6, p0 = 0, I + (p6) br.cond.dpnt .L38 + ;; + tbit.nz p12, p0 = MIN_M, 3 + tbit.nz p13, p0 = MIN_M, 2 + tbit.nz p14, p0 = MIN_M, 1 + tbit.nz p15, p0 = MIN_M, 0 + ;; + (p12) LDFPD f32, f33 = [AO1], 2 * SIZE + (p12) LDFPD f34, f35 = [AO2], 2 * SIZE + (p12) LDFPD f100, f101 = [BO], 2 * SIZE + ;; + (p12) LDFPD f36, f37 = [AO1], 2 * SIZE + (p12) LDFPD f38, f39 = [AO2], 2 * SIZE + (p12) LDFPD f102, f103 = [BO], 2 * SIZE + ;; + (p12) LDFPD f40, f41 = [AO1], 2 * SIZE + (p12) LDFPD f42, f43 = [AO2], 2 * SIZE + (p12) LDFPD f104, f105 = [BO], 2 * SIZE + ;; + (p12) LDFPD f44, f45 = [AO1], 2 * SIZE + (p12) LDFPD f46, f47 = [AO2], 2 * SIZE + (p12) LDFPD f106, f107 = [BO], 2 * SIZE + ;; + (p13) LDFPD f48, f49 = [AO1], 2 * SIZE + (p13) LDFPD f50, f51 = [AO2], 2 * SIZE + (p13) LDFPD f108, f109 = [BO], 2 * SIZE + ;; + (p13) LDFPD f52, f53 = [AO1], 2 * SIZE + (p13) LDFPD f54, f55 = [AO2], 2 * SIZE + (p13) LDFPD f110, f111 = [BO], 2 * SIZE + ;; + (p14) LDFPD f56, f57 = [AO1], 2 * SIZE + (p14) LDFPD f58, f59 = [AO2], 2 * SIZE + (p14) LDFPD f112, f113 = [BO], 2 * SIZE + ;; + (p15) LDFD f60 = [AO1] + (p15) LDFD f61 = [AO2] + (p15) LDFD f114 = [BO] + ;; + (p12) FMA f8 = f100, f32, f8 + (p12) FMA f9 = f101, f33, f9 + (p12) FMA f10 = f100, f34, f10 + (p12) FMA f11 = f101, f35, f11 + ;; + (p12) FMA f12 = f102, f36, f12 + (p12) FMA f13 = f103, f37, f13 + (p12) FMA f14 = f102, f38, f14 + (p12) FMA f15 = f103, f39, f15 + ;; + (p12) FMA f8 = f104, f40, f8 + (p12) FMA f9 = f105, f41, f9 + (p12) FMA f10 = f104, f42, f10 + (p12) FMA f11 = f105, f43, f11 + ;; + (p12) FMA f12 = f106, f44, f12 + (p12) FMA f13 = f107, f45, f13 + (p12) FMA f14 = f106, f46, f14 + (p12) FMA f15 = f107, f47, f15 + ;; + (p13) FMA f8 = f108, f48, f8 + (p13) FMA f9 = f109, f49, f9 + (p13) FMA f10 = f108, f50, f10 + (p13) FMA f11 = f109, f51, f11 + ;; + (p13) FMA f12 = f110, f52, f12 + (p13) FMA f13 = f111, f53, f13 + (p13) FMA f14 = f110, f54, f14 + (p13) FMA f15 = f111, f55, f15 + ;; + (p14) FMA f8 = f112, f56, f8 + (p14) FMA f9 = f113, f57, f9 + (p14) FMA f10 = f112, f58, f10 + (p14) FMA f11 = f113, f59, f11 + ;; + 
(p15) FMA f12 = f114, f60, f12 + (p15) FMA f14 = f114, f61, f14 + ;; +.L38: + FADD f8 = f8, f9 + FADD f10 = f10, f11 + FADD f12 = f12, f13 + FADD f14 = f14, f15 + ;; + FADD f8 = f8, f12 + FADD f10 = f10, f14 + ;; + { .mmf + mov AO1 = CO + LDFD f32 = [CO], INCY + } + ;; + { .mmf + LDFD f33 = [CO], INCY + nop __LINE__ + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + FMA f32 = ALPHA, f8, f32 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f33 = ALPHA, f10, f33 + } + ;; + { .mmf + STFD [AO1] = f32 + add AO1 = AO1, INCY + } + ;; + { .mmf + STFD [AO1] = f33 + } + ;; + .align 16 + +.L40: + { .mfi + mov AO1 = A + mov f8 = f0 + shr I = MIN_M, 4 + } + { .mfi + mov BO = BUFFER + mov f10 = f0 + tbit.z p7, p0 = N, 0 + } + ;; + { .mfi + cmp.eq p6, p0 = 0, I + mov f12 = f0 + mov pr.rot= 0 + } + { .mfb + add I = I, I + mov f14 = f0 + (p7) br.cond.dpnt .L99 + } + ;; + { .mfi + (p8) LDFD f32 = [AO1], SIZE + mov f9 = f0 + mov ar.ec= 2 + } + { .mmf + (p8) LDFD f40 = [BO], 2 * SIZE + add A = A, LDA + mov f11 = f0 + } + ;; + { .mmf + adds WPRE = 1 * SIZE, CO + adds PREB = RPREFETCH * SIZE, BO + mov f13 = f0 + } + { .mmf + cmp.eq p16, p0 = r0, r0 + adds I = -1, I + mov f15 = f0 + } + ;; + { .mfi + lfetch.excl.nt1 [WPRE] + (p8) FMPY f8 = f40, f32 + mov ar.lc = I + } + { .mmb + nop __LINE__ + nop __LINE__ + (p6) br.cond.dpnt .L45 + } + ;; + .align 16 + +.L42: + { .mmf + (p17) LDFPD f81, f82 = [AO1], 2 * SIZE + (p17) LDFPD f110, f111 = [BO], 2 * SIZE + (p17) FMA f8 = f104, f33, f8 + } + { .mfi + nop __LINE__ + (p17) FMA f9 = f105, f34, f9 + (p16) tbit.nz.unc p14, p15 = I, 0 + } + ;; + { .mmf + (p16) LDFPD f32, f33 = [AO1], 2 * SIZE + (p16) LDFPD f103, f104 = [BO], 2 * SIZE + (p17) FMA f8 = f106, f49, f8 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f9 = f107, f50, f9 + } + ;; + { .mmf + (p16) LDFPD f105, f106 = [BO], 2 * SIZE + (p16) LDFPD f48, f49 = [AO1], 2 * SIZE + (p17) FMA f8 = f108, f65, f8 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f9 = f109, f66, f9 + } + ;; + { .mmf + (p16) LDFPD f64, f65 = [AO1], 2 * SIZE + (p16) LDFPD f107, f108 = [BO], 2 * SIZE + (p17) FMA f8 = f110, f81, f8 + } + { .mfb + adds I = -1, I + (p17) FMA f9 = f111, f82, f9 + br.ctop.sptk.few .L42 + } + ;; + .align 16 + +.L45: + and I = 15, MIN_M + ;; + cmp.eq p6, p0 = 0, I + (p6) br.cond.dpnt .L48 + ;; + tbit.nz p12, p0 = MIN_M, 3 + tbit.nz p13, p0 = MIN_M, 2 + tbit.nz p14, p0 = MIN_M, 1 + tbit.nz p15, p0 = MIN_M, 0 + ;; + (p12) LDFPD f32, f33 = [AO1], 2 * SIZE + (p12) LDFPD f100, f101 = [BO], 2 * SIZE + ;; + (p12) LDFPD f36, f37 = [AO1], 2 * SIZE + (p12) LDFPD f102, f103 = [BO], 2 * SIZE + ;; + (p12) LDFPD f40, f41 = [AO1], 2 * SIZE + (p12) LDFPD f104, f105 = [BO], 2 * SIZE + ;; + (p12) LDFPD f44, f45 = [AO1], 2 * SIZE + (p12) LDFPD f106, f107 = [BO], 2 * SIZE + ;; + (p13) LDFPD f48, f49 = [AO1], 2 * SIZE + (p13) LDFPD f108, f109 = [BO], 2 * SIZE + ;; + (p13) LDFPD f52, f53 = [AO1], 2 * SIZE + (p13) LDFPD f110, f111 = [BO], 2 * SIZE + ;; + (p14) LDFPD f56, f57 = [AO1], 2 * SIZE + (p14) LDFPD f112, f113 = [BO], 2 * SIZE + ;; + (p15) LDFD f60 = [AO1] + (p15) LDFD f114 = [BO] + ;; + (p12) FMA f8 = f100, f32, f8 + (p12) FMA f9 = f101, f33, f9 + (p12) FMA f10 = f102, f36, f10 + (p12) FMA f11 = f103, f37, f11 + (p12) FMA f12 = f104, f40, f12 + (p12) FMA f13 = f105, f41, f13 + (p12) FMA f14 = f106, f44, f14 + (p12) FMA f15 = f107, f45, f15 + ;; + (p13) FMA f8 = f108, f48, f8 + (p13) FMA f9 = f109, f49, f9 + (p13) FMA f10 = f110, f52, f10 + (p13) FMA f11 = f111, f53, f11 + (p14) FMA f12 = f112, f56, f12 + (p14) FMA f13 = 
f113, f57, f13 + (p15) FMA f14 = f114, f60, f14 + ;; +.L48: + { .mmf + LDFD f32 = [CO] + nop __LINE__ + FADD f8 = f8, f9 + } + { .mmf + nop __LINE__ + nop __LINE__ + FADD f10 = f10, f11 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + FADD f12 = f12, f13 + } + { .mmf + nop __LINE__ + nop __LINE__ + FADD f14 = f14, f15 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + FADD f8 = f8, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + FADD f10 = f10, f14 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + FADD f8 = f8, f10 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + FMA f32 = ALPHA, f8, f32 + } + ;; + { .mmf + STFD [CO] = f32 + } + ;; + .align 16 + +.L99: + adds IS = P, IS + shladd A = LDAP, BASE_SHIFT, A + ;; + cmp.gt p6, p0 = M, IS + (p6) br.cond.dptk .LIs_loop + br .L999 + .align 4 + ;; + +.L100: + shr J = N, 3 + mov CO = Y + ;; + cmp.eq p6, p0 = r0, J + (p6) br.cond.dpnt .L120 + ;; + .align 16 + +.L111: + { .mfi + mov AO1 = A + mov f8 = f0 + mov pr.rot= 0 + } + { .mfi + add AO2 = LDA, A + mov f10 = f0 + shr I = MIN_M, 4 + } + ;; + { .mmf + shladd AO3 = LDA, 1, A + shladd AO4 = LDA, 1, AO2 + mov f12 = f0 + } + { .mmf + (p8) LDFD f32 = [AO1], SIZE + (p8) LDFD f33 = [AO2], SIZE + mov f14 = f0 + } + ;; + { .mmf + shladd AO5 = LDA, 1, AO3 + shladd AO6 = LDA, 1, AO4 + mov f16 = f0 + } + { .mmf + (p8) LDFD f34 = [AO3], SIZE + (p8) LDFD f35 = [AO4], SIZE + mov f18 = f0 + } + ;; + { .mmf + shladd AO7 = LDA, 1, AO5 + shladd AO8 = LDA, 1, AO6 + mov f20 = f0 + } + { .mmf + (p8) LDFD f36 = [AO5], SIZE + (p8) LDFD f37 = [AO6], SIZE + mov f22 = f0 + } + ;; + { .mfi + (p8) LDFD f38 = [AO7], SIZE + mov f9 = f0 + mov ar.ec= 2 + } + { .mmf + (p8) LDFD f39 = [AO8], SIZE + mov BO = BUFFER + mov f11 = f0 + } + ;; + { .mmf + (p8) LDFD f40 = [BO], 2 * SIZE + cmp.eq p6, p0 = 0, I + mov f13 = f0 + } + { .mmf + shladd A = LDA, 3, A + cmp.eq p16, p0 = r0, r0 + mov f15 = f0 + } + ;; + { .mmf + add I = I, I + nop __LINE__ + mov f17 = f0 + } + { .mmf + adds RPRE1 = RPREFETCH * SIZE, AO1 + adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 + mov f19 = f0 + } + ;; + { .mmf + adds I = -1, I + nop __LINE__ + mov f21 = f0 + } + { .mmf + adds RPRE3 = RPREFETCH * SIZE, AO3 + adds RPRE4 = (RPREFETCH + 8) * SIZE, AO4 + mov f23 = f0 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + (p8) FMPY f8 = f40, f32 + } + { .mmf + adds RPRE5 = RPREFETCH * SIZE, AO5 + adds RPRE6 = (RPREFETCH + 8) * SIZE, AO6 + (p8) FMPY f10 = f40, f33 + } + ;; + { .mmf + adds AO21 = 7 * SIZE, AO2 + adds AO41 = 7 * SIZE, AO4 + (p8) FMPY f12 = f40, f34 + } + { .mmf + adds RPRE7 = RPREFETCH * SIZE, AO7 + adds RPRE8 = (RPREFETCH + 8) * SIZE, AO8 + (p8) FMPY f14 = f40, f35 + } + ;; + { .mfi + nop __LINE__ + (p8) FMPY f16 = f40, f36 + mov ar.lc = I + } + { .mmf + adds WPRE = 8 * SIZE, CO + adds PREB = RPREFETCH * SIZE, BO + (p8) FMPY f18 = f40, f37 + } + ;; + { .mmf + lfetch.excl.nt1 [WPRE] + adds AO61 = 7 * SIZE, AO6 + (p8) FMPY f20 = f40, f38 + } + { .mfb + adds AO81 = 7 * SIZE, AO8 + (p8) FMPY f22 = f40, f39 + (p6) br.cond.dpnt .L115 + } + ;; + .align 16 + +.L112: + { .mmf + (p17) LDFPD f80, f95 = [AO8] + (p17) LDFPD f110, f111 = [BO], 2 * SIZE + (p17) FMA f8 = f104, f33, f8 + } + { .mfi + (p17) adds AO8 = 3 * SIZE, AO8 + (p17) FMA f9 = f105, f34, f9 + (p16) tbit.nz.unc p14, p15 = I, 0 + } + ;; + { .mmf + (p14) PREFETCH [RPRE1], 16 * SIZE + (p16) LDFPD f32, f33 = [AO1], 2 * SIZE + (p17) FMA f10 = f104, f35, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f11 = f105, f36, f11 + } + ;; + { .mmf + (p15) PREFETCH [RPRE2], 16 * SIZE + (p16) LDFD f34 = [AO2], 1 * 
SIZE + (p17) FMA f12 = f104, f37, f12 + } + { .mmf + (p17) LDFD f84 = [AO21], 8 * SIZE + nop __LINE__ + (p17) FMA f13 = f105, f38, f13 + } + ;; + { .mmf + (p14) PREFETCH [RPRE3], 16 * SIZE + (p16) LDFPD f36, f37 = [AO3], 2 * SIZE + (p17) FMA f14 = f104, f39, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f15 = f105, f40, f15 + } + ;; + { .mmf + (p15) PREFETCH [RPRE4], 16 * SIZE + (p16) LDFD f38 = [AO4], 1 * SIZE + (p17) FMA f16 = f104, f41, f16 + } + { .mmf + (p17) LDFD f88 = [AO41], 8 * SIZE + nop __LINE__ + (p17) FMA f17 = f105, f42, f17 + } + ;; + { .mmf + (p14) PREFETCH [RPRE5], 16 * SIZE + (p16) LDFPD f40, f41 = [AO5], 2 * SIZE + (p17) FMA f18 = f104, f43, f18 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f19 = f105, f44, f19 + } + ;; + { .mmf + (p15) PREFETCH [RPRE6], 16 * SIZE + (p16) LDFD f42 = [AO6], 1 * SIZE + (p17) FMA f20 = f104, f45, f20 + } + { .mmf + (p17) LDFD f92 = [AO61], 8 * SIZE + nop __LINE__ + (p17) FMA f21 = f105, f46, f21 + } + ;; + { .mmf + (p14) PREFETCH [RPRE7], 16 * SIZE + (p16) LDFPD f44, f45 = [AO7], 2 * SIZE + (p17) FMA f22 = f104, f47, f22 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f23 = f105, f48, f23 + } + ;; + { .mmf + (p15) PREFETCH [RPRE8], 16 * SIZE + (p16) LDFD f46 = [AO8], 1 * SIZE + (p17) FMA f8 = f106, f49, f8 + } + { .mmf + (p17) LDFD f96 = [AO81], 8 * SIZE + nop __LINE__ + (p17) FMA f9 = f107, f50, f9 + } + ;; + { .mmf + (p14) PREFETCH [PREB], 16 * SIZE + (p16) LDFPD f48, f49 = [AO1], 2 * SIZE + (p17) FMA f10 = f106, f51, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f11 = f107, f52, f11 + } + ;; + { .mmf + (p16) LDFPD f35, f50 = [AO2], 2 * SIZE + (p16) LDFPD f103, f104 = [BO], 2 * SIZE + (p17) FMA f12 = f106, f53, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f13 = f107, f54, f13 + } + ;; + { .mmf + (p16) LDFPD f52, f53 = [AO3], 2 * SIZE + nop __LINE__ + (p17) FMA f14 = f106, f55, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f15 = f107, f56, f15 + } + ;; + { .mmf + (p16) LDFPD f39, f54 = [AO4], 2 * SIZE + nop __LINE__ + (p17) FMA f16 = f106, f57, f16 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f17 = f107, f58, f17 + } + ;; + { .mmf + (p16) LDFPD f56, f57 = [AO5], 2 * SIZE + nop __LINE__ + (p17) FMA f18 = f106, f59, f18 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f19 = f107, f60, f19 + } + ;; + { .mmf + (p16) LDFPD f43, f58 = [AO6], 2 * SIZE + nop __LINE__ + (p17) FMA f20 = f106, f61, f20 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f21 = f107, f62, f21 + } + ;; + { .mmf + (p16) LDFPD f60, f61 = [AO7], 2 * SIZE + nop __LINE__ + (p17) FMA f22 = f106, f63, f22 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f23 = f107, f64, f23 + } + ;; + { .mmf + (p16) LDFPD f47, f62 = [AO8], 2 * SIZE + (p16) LDFPD f105, f106 = [BO], 2 * SIZE + (p17) FMA f8 = f108, f65, f8 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f9 = f109, f66, f9 + } + ;; + { .mmf + (p16) LDFPD f64, f65 = [AO1], 2 * SIZE + nop __LINE__ + (p17) FMA f10 = f108, f67, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f11 = f109, f68, f11 + } + ;; + { .mmf + (p16) LDFPD f51, f66 = [AO2], 2 * SIZE + nop __LINE__ + (p17) FMA f12 = f108, f69, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f13 = f109, f70, f13 + } + ;; + { .mmf + (p16) LDFPD f68, f69 = [AO3], 2 * SIZE + nop __LINE__ + (p17) FMA f14 = f108, f71, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f15 = f109, f72, f15 + } + ;; + { .mmf + (p16) LDFPD f55, f70 = [AO4], 2 * SIZE + nop 
__LINE__ + (p17) FMA f16 = f108, f73, f16 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f17 = f109, f74, f17 + } + ;; + { .mmf + (p16) LDFPD f72, f73 = [AO5], 2 * SIZE + nop __LINE__ + (p17) FMA f18 = f108, f75, f18 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f19 = f109, f76, f19 + } + ;; + { .mmf + (p16) LDFPD f59, f74 = [AO6], 2 * SIZE + nop __LINE__ + (p17) FMA f20 = f108, f77, f20 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f21 = f109, f78, f21 + } + ;; + { .mmf + (p16) LDFPD f76, f77 = [AO7], 2 * SIZE + nop __LINE__ + (p17) FMA f22 = f108, f79, f22 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f23 = f109, f80, f23 + } + ;; + { .mmf + (p16) LDFPD f63, f78 = [AO8], 2 * SIZE + (p16) LDFPD f107, f108 = [BO], 2 * SIZE + (p17) FMA f8 = f110, f81, f8 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f9 = f111, f82, f9 + } + ;; + { .mmf + (p16) LDFPD f80, f81 = [AO1], 2 * SIZE + nop __LINE__ + (p17) FMA f10 = f110, f83, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f11 = f111, f84, f11 + } + ;; + { .mmf + (p16) LDFPD f67, f82 = [AO2] + nop __LINE__ + (p17) FMA f12 = f110, f85, f12 + } + { .mmf + nop __LINE__ + (p16) adds AO2 = 3 * SIZE, AO2 + (p17) FMA f13 = f111, f86, f13 + } + ;; + { .mmf + (p16) LDFPD f84, f85 = [AO3], 2 * SIZE + nop __LINE__ + (p17) FMA f14 = f110, f87, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f15 = f111, f88, f15 + } + ;; + { .mmf + (p16) LDFPD f71, f86 = [AO4] + nop __LINE__ + (p17) FMA f16 = f110, f89, f16 + } + { .mmf + nop __LINE__ + (p16) adds AO4 = 3 * SIZE, AO4 + (p17) FMA f17 = f111, f90, f17 + } + ;; + { .mmf + (p16) LDFPD f88, f89 = [AO5], 2 * SIZE + nop __LINE__ + (p17) FMA f18 = f110, f91, f18 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f19 = f111, f92, f19 + } + ;; + { .mmf + (p16) LDFPD f75, f90 = [AO6] + nop __LINE__ + (p17) FMA f20 = f110, f93, f20 + } + { .mmf + nop __LINE__ + (p16) adds AO6 = 3 * SIZE, AO6 + (p17) FMA f21 = f111, f94, f21 + } + ;; + { .mmf + (p16) LDFPD f92, f93 = [AO7], 2 * SIZE + nop __LINE__ + (p17) FMA f22 = f110, f95, f22 + } + { .mfb + adds I = -1, I + (p17) FMA f23 = f111, f96, f23 + br.ctop.sptk.few .L112 + } + ;; + .align 16 + +.L115: + and I = 15, MIN_M + mov pr.rot= 0 + ;; + cmp.eq p6, p0 = 0, I + cmp.eq p16, p15 = r0, r0 + ;; + adds I = 1, I + ;; + shr I = I, 1 + ;; + adds I = -1, I + adds AO21 = 1 * SIZE, AO2 + adds AO41 = 1 * SIZE, AO4 + adds AO61 = 1 * SIZE, AO6 + adds AO81 = 1 * SIZE, AO8 + ;; + mov ar.lc = I + mov ar.ec= 3 + and I = 15, MIN_M + (p6) br.cond.dpnt .L118 + ;; + .align 16 + +.L116: + { .mmf + (p16) LDFPD f104, f107 = [BO], 2 * SIZE + (p16) LDFPD f32, f35 = [AO1], 2 * SIZE + (p18) FMA f8 = f106, f34, f8 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p15) FMA f9 = f109, f37, f9 + } + ;; + { .mmf + (p16) LDFD f38 = [AO2], 2 * SIZE + (p17) LDFD f42 = [AO21], 2 * SIZE + (p18) FMA f10 = f106, f40, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p15) FMA f11 = f109, f43, f11 + } + ;; + { .mmf + (p16) LDFPD f44, f47 = [AO3], 2 * SIZE + nop __LINE__ + (p18) FMA f12 = f106, f46, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p15) FMA f13 = f109, f49, f13 + } + ;; + { .mmf + (p16) LDFD f50 = [AO4], 2 * SIZE + (p17) LDFD f54 = [AO41], 2 * SIZE + (p18) FMA f14 = f106, f52, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p15) FMA f15 = f109, f55, f15 + } + ;; + { .mmf + (p16) LDFPD f56, f59 = [AO5], 2 * SIZE + nop __LINE__ + (p18) FMA f16 = f106, f58, f16 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p15) FMA f17 = f109, f61, f17 
+ } + ;; + { .mmf + (p16) LDFD f62 = [AO6], 2 * SIZE + (p17) LDFD f66 = [AO61], 2 * SIZE + (p18) FMA f18 = f106, f64, f18 + } + { .mmf + nop __LINE__ + (p17) adds I = -2, I + (p15) FMA f19 = f109, f67, f19 + } + ;; + { .mmf + (p16) LDFPD f68, f71 = [AO7], 2 * SIZE + nop __LINE__ + (p18) FMA f20 = f106, f70, f20 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p15) FMA f21 = f109, f73, f21 + } + ;; + { .mmf + (p16) LDFD f74 = [AO8], 2 * SIZE + (p17) LDFD f78 = [AO81], 2 * SIZE + (p15) FMA f23 = f109, f79, f23 + } + { .mfb + (p17) cmp.ne.unc p15, p0 = -1, I + (p18) FMA f22 = f106, f76, f22 + br.ctop.sptk.few .L116 + } + ;; + +.L118: + { .mmf + mov AO1 = CO + LDFD f32 = [CO], INCY + FADD f8 = f8, f9 + } + ;; + { .mmf + LDFD f33 = [CO], INCY + nop __LINE__ + FADD f10 = f10, f11 + } + ;; + { .mmf + LDFD f34 = [CO], INCY + nop __LINE__ + FADD f12 = f12, f13 + } + ;; + { .mmf + LDFD f35 = [CO], INCY + nop __LINE__ + FADD f14 = f14, f15 + } + ;; + { .mmf + LDFD f36 = [CO], INCY + nop __LINE__ + FADD f16 = f16, f17 + } + ;; + { .mmf + LDFD f37 = [CO], INCY + nop __LINE__ + FADD f18 = f18, f19 + } + ;; + { .mmf + LDFD f38 = [CO], INCY + nop __LINE__ + FADD f20 = f20, f21 + } + ;; + { .mmf + LDFD f39 = [CO], INCY + nop __LINE__ + FADD f22 = f22, f23 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + FMA f32 = ALPHA, f8, f32 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f33 = ALPHA, f10, f33 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f34 = ALPHA, f12, f34 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f35 = ALPHA, f14, f35 + } + ;; + { .mmf + STFD [AO1] = f32 + add AO1 = AO1, INCY + FMA f36 = ALPHA, f16, f36 + } + ;; + { .mmf + STFD [AO1] = f33 + add AO1 = AO1, INCY + FMA f37 = ALPHA, f18, f37 + } + ;; + { .mmf + STFD [AO1] = f34 + add AO1 = AO1, INCY + FMA f38 = ALPHA, f20, f38 + } + ;; + { .mmf + STFD [AO1] = f35 + add AO1 = AO1, INCY + FMA f39 = ALPHA, f22, f39 + } + ;; + { .mmi + STFD [AO1] = f36 + add AO1 = AO1, INCY + adds J = -1, J + } + ;; + { .mmi + STFD [AO1] = f37 + add AO1 = AO1, INCY + nop __LINE__ + } + ;; + { .mmi + STFD [AO1] = f38 + add AO1 = AO1, INCY + cmp4.lt p6, p0 = 0, J + } + ;; + { .mib + STFD [AO1] = f39 + add AO1 = AO1, INCY + (p6) br.cond.dptk .L111 + } + ;; + .align 16 + +.L120: + { .mfi + mov AO1 = A + mov f8 = f0 + mov pr.rot= 0 + } + { .mfi + add AO2 = LDA, A + mov f10 = f0 + tbit.z p6, p0 = N, 2 + } + ;; + { .mfi + shladd AO3 = LDA, 1, A + mov f12 = f0 + shr I = MIN_M, 4 + } + { .mfb + shladd AO4 = LDA, 1, AO2 + mov f14 = f0 + (p6) br.cond.dpnt .L130 + } + ;; + { .mmf + (p8) LDFD f32 = [AO1], SIZE + (p8) LDFD f33 = [AO2], SIZE + mov f9 = f0 + } + { .mmf + mov BO = BUFFER + shladd A = LDA, 2, A + mov f11 = f0 + } + ;; + { .mmf + (p8) LDFD f40 = [BO], 2 * SIZE + cmp.eq p6, p0 = 0, I + mov f13 = f0 + } + { .mmf + (p8) LDFD f34 = [AO3], SIZE + (p8) LDFD f35 = [AO4], SIZE + mov f15 = f0 + } + ;; + { .mmi + adds RPRE1 = RPREFETCH * SIZE, AO1 + adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 + mov ar.ec= 2 + } + { .mmi + cmp.eq p16, p0 = r0, r0 + add I = I, I + adds AO21 = 7 * SIZE, AO2 + } + ;; + { .mmf + adds WPRE = 4 * SIZE, CO + adds PREB = RPREFETCH * SIZE, BO + (p8) FMPY f8 = f40, f32 + } + { .mmf + adds RPRE3 = RPREFETCH * SIZE, AO3 + adds I = -1, I + (p8) FMPY f10 = f40, f33 + } + ;; + { .mfi + adds AO41 = 7 * SIZE, AO4 + (p8) FMPY f12 = f40, f34 + mov ar.lc = I + } + { .mfb + adds RPRE4 = (RPREFETCH + 8) * SIZE, AO4 + (p8) FMPY f14 = f40, f35 + (p6) br.cond.dpnt .L125 + } + ;; + .align 16 + +.L122: + { .mmf + (p17) LDFPD f72, f87 = [AO4] + (p17) LDFPD f110, f111 = 
[BO], 2 * SIZE + (p17) FMA f8 = f104, f33, f8 + } + { .mfi + (p17) adds AO4 = 3 * SIZE, AO4 + (p17) FMA f9 = f105, f34, f9 + (p16) tbit.nz.unc p14, p15 = I, 0 + } + ;; + { .mmf + (p14) PREFETCH [RPRE1], 16 * SIZE + (p16) LDFPD f32, f33 = [AO1], 2 * SIZE + (p17) FMA f10 = f104, f35, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f11 = f105, f36, f11 + } + ;; + { .mmf + (p15) PREFETCH [RPRE2], 16 * SIZE + (p16) LDFD f34 = [AO2], 1 * SIZE + (p17) FMA f12 = f104, f37, f12 + } + { .mmf + (p17) LDFD f84 = [AO21], 8 * SIZE + nop __LINE__ + (p17) FMA f13 = f105, f38, f13 + } + ;; + { .mmf + (p14) PREFETCH [RPRE3], 16 * SIZE + (p16) LDFPD f36, f37 = [AO3], 2 * SIZE + (p17) FMA f14 = f104, f39, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f15 = f105, f40, f15 + } + ;; + { .mmf + (p15) PREFETCH [RPRE4], 16 * SIZE + (p16) LDFD f38 = [AO4], 1 * SIZE + (p17) FMA f8 = f106, f49, f8 + } + { .mmf + (p17) LDFD f88 = [AO41], 8 * SIZE + nop __LINE__ + (p17) FMA f9 = f107, f50, f9 + } + ;; + { .mmf + (p14) PREFETCH [PREB], 16 * SIZE + (p16) LDFPD f48, f49 = [AO1], 2 * SIZE + (p17) FMA f10 = f106, f51, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f11 = f107, f52, f11 + } + ;; + { .mmf + (p16) LDFPD f35, f50 = [AO2], 2 * SIZE + (p16) LDFPD f103, f104 = [BO], 2 * SIZE + (p17) FMA f12 = f106, f53, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f13 = f107, f54, f13 + } + ;; + { .mmf + (p16) LDFPD f52, f53 = [AO3], 2 * SIZE + nop __LINE__ + (p17) FMA f14 = f106, f55, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f15 = f107, f56, f15 + } + ;; + { .mmf + (p16) LDFPD f39, f54 = [AO4], 2 * SIZE + (p16) LDFPD f105, f106 = [BO], 2 * SIZE + (p17) FMA f8 = f108, f65, f8 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f9 = f109, f66, f9 + } + ;; + { .mmf + (p16) LDFPD f64, f65 = [AO1], 2 * SIZE + nop __LINE__ + (p17) FMA f10 = f108, f67, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f11 = f109, f68, f11 + } + ;; + { .mmf + (p16) LDFPD f51, f66 = [AO2], 2 * SIZE + nop __LINE__ + (p17) FMA f12 = f108, f69, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f13 = f109, f70, f13 + } + ;; + { .mmf + (p16) LDFPD f68, f69 = [AO3], 2 * SIZE + nop __LINE__ + (p17) FMA f14 = f108, f71, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f15 = f109, f72, f15 + } + ;; + { .mmf + (p16) LDFPD f55, f70 = [AO4], 2 * SIZE + (p16) LDFPD f107, f108 = [BO], 2 * SIZE + (p17) FMA f8 = f110, f81, f8 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f9 = f111, f82, f9 + } + ;; + { .mmf + (p16) LDFPD f80, f81 = [AO1], 2 * SIZE + nop __LINE__ + (p17) FMA f10 = f110, f83, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f11 = f111, f84, f11 + } + ;; + { .mmf + (p16) LDFPD f67, f82 = [AO2] + nop __LINE__ + (p17) FMA f12 = f110, f85, f12 + } + { .mmf + nop __LINE__ + (p16) adds AO2 = 3 * SIZE, AO2 + (p17) FMA f13 = f111, f86, f13 + } + ;; + { .mmf + (p16) LDFPD f84, f85 = [AO3], 2 * SIZE + nop __LINE__ + (p17) FMA f14 = f110, f87, f14 + } + { .mfb + adds I = -1, I + (p17) FMA f15 = f111, f88, f15 + br.ctop.sptk.few .L122 + } + ;; + .align 16 + +.L125: + and I = 15, MIN_M + mov pr.rot= 0 + ;; + cmp.eq p6, p0 = 0, I + cmp.eq p16, p15 = r0, r0 + ;; + adds I = 1, I + adds AO21 = 1 * SIZE, AO2 + adds AO41 = 1 * SIZE, AO4 + ;; + shr I = I, 1 + ;; + adds I = -1, I + ;; + mov ar.lc = I + mov ar.ec= 3 + and I = 15, MIN_M + (p6) br.cond.dpnt .L128 + ;; + .align 16 + +.L126: + { .mmf + (p16) LDFPD f104, f107 = [BO], 2 * SIZE + (p16) LDFPD f32, f35 = 
[AO1], 2 * SIZE + (p18) FMA f8 = f106, f34, f8 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p15) FMA f9 = f109, f37, f9 + } + ;; + { .mmf + (p17) LDFD f42 = [AO21], 2 * SIZE + (p16) LDFD f38 = [AO2], 2 * SIZE + (p18) FMA f10 = f106, f40, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p15) FMA f11 = f109, f43, f11 + } + ;; + { .mmf + (p16) LDFPD f44, f47 = [AO3], 2 * SIZE + nop __LINE__ + (p18) FMA f12 = f106, f46, f12 + } + { .mmf + nop __LINE__ + (p17) adds I = -2, I + (p15) FMA f13 = f109, f49, f13 + } + ;; + { .mmf + (p17) LDFD f54 = [AO41], 2 * SIZE + (p16) LDFD f50 = [AO4], 2 * SIZE + (p15) FMA f15 = f109, f55, f15 + } + { .mfb + (p17) cmp.ne.unc p15, p0 = -1, I + (p18) FMA f14 = f106, f52, f14 + br.ctop.sptk.few .L126 + } + ;; + +.L128: + { .mmf + mov AO1 = CO + LDFD f32 = [CO], INCY + FADD f8 = f8, f9 + } + ;; + { .mmf + LDFD f33 = [CO], INCY + nop __LINE__ + FADD f10 = f10, f11 + } + ;; + { .mmf + LDFD f34 = [CO], INCY + nop __LINE__ + FADD f12 = f12, f13 + } + ;; + { .mmf + LDFD f35 = [CO], INCY + nop __LINE__ + FADD f14 = f14, f15 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + FMA f32 = ALPHA, f8, f32 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f33 = ALPHA, f10, f33 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f34 = ALPHA, f12, f34 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f35 = ALPHA, f14, f35 + } + ;; + { .mmf + STFD [AO1] = f32 + add AO1 = AO1, INCY + } + ;; + { .mmf + STFD [AO1] = f33 + add AO1 = AO1, INCY + } + ;; + { .mmf + STFD [AO1] = f34 + add AO1 = AO1, INCY + } + ;; + { .mmf + STFD [AO1] = f35 + add AO1 = AO1, INCY + } + ;; + .align 16 + +.L130: + { .mfi + mov AO1 = A + mov f8 = f0 + mov pr.rot= 0 + } + { .mfi + add AO2 = LDA, A + mov f10 = f0 + tbit.z p6, p0 = N, 1 + } + ;; + { .mfi + mov BO = BUFFER + mov f12 = f0 + shr I = MIN_M, 4 + } + { .mfb + adds WPRE = 4 * SIZE, CO + mov f14 = f0 + (p6) br.cond.dpnt .L140 + } + ;; + { .mmf + (p8) LDFD f32 = [AO1], SIZE + (p8) LDFD f33 = [AO2], SIZE + mov f9 = f0 + } + { .mfi + shladd A = LDA, 1, A + mov f11 = f0 + mov ar.ec= 2 + } + ;; + { .mmf + (p8) LDFD f40 = [BO], 2 * SIZE + cmp.eq p6, p0 = 0, I + mov f13 = f0 + } + { .mmf + adds RPRE1 = RPREFETCH * SIZE, AO1 + add I = I, I + mov f15 = f0 + } + ;; + { .mmi + cmp.eq p16, p0 = r0, r0 + adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 + adds I = -1, I + } + ;; + { .mfi + adds AO21 = 7 * SIZE, AO2 + (p8) FMPY f8 = f40, f32 + mov ar.lc = I + } + { .mfb + adds PREB = RPREFETCH * SIZE, BO + (p8) FMPY f10 = f40, f33 + (p6) br.cond.dpnt .L135 + } + ;; + .align 16 + +.L132: + { .mmf + (p17) LDFPD f68, f83 = [AO2] + (p17) LDFPD f110, f111 = [BO], 2 * SIZE + (p17) FMA f8 = f104, f33, f8 + } + { .mfi + (p17) adds AO2 = 3 * SIZE, AO2 + (p17) FMA f9 = f105, f34, f9 + (p16) tbit.nz.unc p14, p15 = I, 0 + } + ;; + { .mmf + (p14) PREFETCH [RPRE1], 16 * SIZE + (p16) LDFPD f32, f33 = [AO1], 2 * SIZE + (p17) FMA f10 = f104, f35, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f11 = f105, f36, f11 + } + ;; + { .mmf + (p15) PREFETCH [RPRE2], 16 * SIZE + (p16) LDFD f34 = [AO2], 1 * SIZE + (p17) FMA f8 = f106, f49, f8 + } + { .mmf + (p17) LDFD f84 = [AO21], 8 * SIZE + nop __LINE__ + (p17) FMA f9 = f107, f50, f9 + } + ;; + { .mmf + (p14) PREFETCH [PREB], 16 * SIZE + (p16) LDFPD f48, f49 = [AO1], 2 * SIZE + (p17) FMA f10 = f106, f51, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f11 = f107, f52, f11 + } + ;; + { .mmf + (p16) LDFPD f35, f50 = [AO2], 2 * SIZE + (p16) LDFPD f103, f104 = [BO], 2 * SIZE + (p17) FMA f8 = f108, f65, f8 + } + { .mmf + nop __LINE__ + nop 
__LINE__ + (p17) FMA f9 = f109, f66, f9 + } + ;; + { .mmf + (p16) LDFPD f105, f106 = [BO], 2 * SIZE + (p16) LDFPD f64, f65 = [AO1], 2 * SIZE + (p17) FMA f10 = f108, f67, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f11 = f109, f68, f11 + } + ;; + { .mmf + (p16) LDFPD f51, f66 = [AO2], 2 * SIZE + (p16) LDFPD f107, f108 = [BO], 2 * SIZE + (p17) FMA f8 = f110, f81, f8 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f9 = f111, f82, f9 + } + ;; + { .mmf + (p16) LDFPD f80, f81 = [AO1], 2 * SIZE + nop __LINE__ + (p17) FMA f10 = f110, f83, f10 + } + { .mfb + adds I = -1, I + (p17) FMA f11 = f111, f84, f11 + br.ctop.sptk.few .L132 + } + ;; + .align 16 + +.L135: + and I = 15, MIN_M + ;; + cmp.eq p6, p0 = 0, I + (p6) br.cond.dpnt .L138 + ;; + tbit.nz p12, p0 = MIN_M, 3 + tbit.nz p13, p0 = MIN_M, 2 + tbit.nz p14, p0 = MIN_M, 1 + tbit.nz p15, p0 = MIN_M, 0 + ;; + (p12) LDFPD f100, f101 = [BO], 2 * SIZE + (p12) LDFPD f32, f33 = [AO1], 2 * SIZE + (p12) LDFD f34 = [AO2], 1 * SIZE + ;; + (p12) LDFPD f36, f37 = [AO1], 2 * SIZE + (p12) LDFPD f35, f38 = [AO2], 2 * SIZE + ;; + (p12) LDFPD f102, f103 = [BO], 2 * SIZE + (p12) LDFPD f39, f42 = [AO2], 2 * SIZE + ;; + (p12) LDFPD f40, f41 = [AO1], 2 * SIZE + (p12) LDFPD f43, f46 = [AO2], 2 * SIZE + ;; + (p12) LDFPD f104, f105 = [BO], 2 * SIZE + (p12) LDFPD f44, f45 = [AO1], 2 * SIZE + (p12) LDFD f47 = [AO2], 1 * SIZE + ;; + (p12) LDFPD f106, f107 = [BO], 2 * SIZE + (p13) LDFD f50 = [AO2], 1 * SIZE + (p13) LDFPD f48, f49 = [AO1], 2 * SIZE + ;; + (p13) LDFPD f108, f109 = [BO], 2 * SIZE + (p13) LDFPD f51, f54 = [AO2], 2 * SIZE + ;; + (p13) LDFPD f110, f111 = [BO], 2 * SIZE + (p13) LDFPD f52, f53 = [AO1], 2 * SIZE + (p13) LDFD f55 = [AO2], 1 * SIZE + ;; + (p14) LDFPD f56, f57 = [AO1], 2 * SIZE + (p14) LDFD f58 = [AO2], 1 * SIZE + ;; + (p14) LDFPD f112, f113 = [BO], 2 * SIZE + (p15) LDFD f60 = [AO1] + (p14) LDFD f59 = [AO2], 1 * SIZE + ;; + (p15) LDFD f61 = [AO2] + (p15) LDFD f114 = [BO] + ;; + (p12) FMA f8 = f100, f32, f8 + (p12) FMA f9 = f101, f33, f9 + (p12) FMA f10 = f100, f34, f10 + (p12) FMA f11 = f101, f35, f11 + ;; + (p12) FMA f12 = f102, f36, f12 + (p12) FMA f13 = f103, f37, f13 + (p12) FMA f14 = f102, f38, f14 + (p12) FMA f15 = f103, f39, f15 + ;; + (p12) FMA f8 = f104, f40, f8 + (p12) FMA f9 = f105, f41, f9 + (p12) FMA f10 = f104, f42, f10 + (p12) FMA f11 = f105, f43, f11 + ;; + (p12) FMA f12 = f106, f44, f12 + (p12) FMA f13 = f107, f45, f13 + (p12) FMA f14 = f106, f46, f14 + (p12) FMA f15 = f107, f47, f15 + ;; + (p13) FMA f8 = f108, f48, f8 + (p13) FMA f9 = f109, f49, f9 + (p13) FMA f10 = f108, f50, f10 + (p13) FMA f11 = f109, f51, f11 + ;; + (p13) FMA f12 = f110, f52, f12 + (p13) FMA f13 = f111, f53, f13 + (p13) FMA f14 = f110, f54, f14 + (p13) FMA f15 = f111, f55, f15 + ;; + (p14) FMA f8 = f112, f56, f8 + (p14) FMA f9 = f113, f57, f9 + (p14) FMA f10 = f112, f58, f10 + (p14) FMA f11 = f113, f59, f11 + ;; + (p15) FMA f12 = f114, f60, f12 + (p15) FMA f14 = f114, f61, f14 + ;; +.L138: + FADD f8 = f8, f9 + FADD f10 = f10, f11 + FADD f12 = f12, f13 + FADD f14 = f14, f15 + ;; + FADD f8 = f8, f12 + FADD f10 = f10, f14 + ;; + { .mmf + mov AO1 = CO + LDFD f32 = [CO], INCY + } + ;; + { .mmf + LDFD f33 = [CO], INCY + nop __LINE__ + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + FMA f32 = ALPHA, f8, f32 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f33 = ALPHA, f10, f33 + } + ;; + { .mmf + STFD [AO1] = f32 + add AO1 = AO1, INCY + } + ;; + { .mmf + STFD [AO1] = f33 + } + ;; + .align 16 + +.L140: + { .mfi + mov AO1 = A + mov f8 = f0 + shr I = 
MIN_M, 4 + } + { .mfi + mov BO = BUFFER + mov f10 = f0 + tbit.z p7, p0 = N, 0 + } + ;; + { .mfi + cmp.eq p6, p0 = 0, I + mov f12 = f0 + mov pr.rot= 0 + } + { .mfb + add I = I, I + mov f14 = f0 + (p7) br.cond.dpnt .L199 + } + ;; + { .mfi + (p8) LDFD f32 = [AO1], SIZE + mov f9 = f0 + mov ar.ec= 2 + } + { .mmf + (p8) LDFD f40 = [BO], 2 * SIZE + add A = A, LDA + mov f11 = f0 + } + ;; + { .mmf + adds WPRE = 1 * SIZE, CO + adds PREB = RPREFETCH * SIZE, BO + mov f13 = f0 + } + { .mmf + cmp.eq p16, p0 = r0, r0 + adds I = -1, I + mov f15 = f0 + } + ;; + { .mfi + lfetch.excl.nt1 [WPRE] + (p8) FMPY f8 = f40, f32 + mov ar.lc = I + } + { .mmb + nop __LINE__ + nop __LINE__ + (p6) br.cond.dpnt .L145 + } + ;; + .align 16 + +.L142: + { .mmf + (p17) LDFPD f81, f82 = [AO1], 2 * SIZE + (p17) LDFPD f110, f111 = [BO], 2 * SIZE + (p17) FMA f8 = f104, f33, f8 + } + { .mfi + nop __LINE__ + (p17) FMA f9 = f105, f34, f9 + (p16) tbit.nz.unc p14, p15 = I, 0 + } + ;; + { .mmf + (p16) LDFPD f32, f33 = [AO1], 2 * SIZE + (p16) LDFPD f103, f104 = [BO], 2 * SIZE + (p17) FMA f8 = f106, f49, f8 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f9 = f107, f50, f9 + } + ;; + { .mmf + (p16) LDFPD f105, f106 = [BO], 2 * SIZE + (p16) LDFPD f48, f49 = [AO1], 2 * SIZE + (p17) FMA f8 = f108, f65, f8 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f9 = f109, f66, f9 + } + ;; + { .mmf + (p16) LDFPD f64, f65 = [AO1], 2 * SIZE + (p16) LDFPD f107, f108 = [BO], 2 * SIZE + (p17) FMA f8 = f110, f81, f8 + } + { .mfb + adds I = -1, I + (p17) FMA f9 = f111, f82, f9 + br.ctop.sptk.few .L142 + } + ;; + .align 16 + +.L145: + and I = 15, MIN_M + ;; + cmp.eq p6, p0 = 0, I + (p6) br.cond.dpnt .L148 + ;; + tbit.nz p12, p0 = MIN_M, 3 + tbit.nz p13, p0 = MIN_M, 2 + tbit.nz p14, p0 = MIN_M, 1 + tbit.nz p15, p0 = MIN_M, 0 + ;; + (p12) LDFPD f32, f33 = [AO1], 2 * SIZE + (p12) LDFPD f100, f101 = [BO], 2 * SIZE + ;; + (p12) LDFPD f36, f37 = [AO1], 2 * SIZE + (p12) LDFPD f102, f103 = [BO], 2 * SIZE + ;; + (p12) LDFPD f40, f41 = [AO1], 2 * SIZE + (p12) LDFPD f104, f105 = [BO], 2 * SIZE + ;; + (p12) LDFPD f44, f45 = [AO1], 2 * SIZE + (p12) LDFPD f106, f107 = [BO], 2 * SIZE + ;; + (p13) LDFPD f48, f49 = [AO1], 2 * SIZE + (p13) LDFPD f108, f109 = [BO], 2 * SIZE + ;; + (p13) LDFPD f52, f53 = [AO1], 2 * SIZE + (p13) LDFPD f110, f111 = [BO], 2 * SIZE + ;; + (p14) LDFPD f56, f57 = [AO1], 2 * SIZE + (p14) LDFPD f112, f113 = [BO], 2 * SIZE + ;; + (p15) LDFD f60 = [AO1] + (p15) LDFD f114 = [BO] + ;; + (p12) FMA f8 = f100, f32, f8 + (p12) FMA f9 = f101, f33, f9 + (p12) FMA f10 = f102, f36, f10 + (p12) FMA f11 = f103, f37, f11 + (p12) FMA f12 = f104, f40, f12 + (p12) FMA f13 = f105, f41, f13 + (p12) FMA f14 = f106, f44, f14 + (p12) FMA f15 = f107, f45, f15 + ;; + (p13) FMA f8 = f108, f48, f8 + (p13) FMA f9 = f109, f49, f9 + (p13) FMA f10 = f110, f52, f10 + (p13) FMA f11 = f111, f53, f11 + (p14) FMA f12 = f112, f56, f12 + (p14) FMA f13 = f113, f57, f13 + (p15) FMA f14 = f114, f60, f14 + ;; +.L148: + { .mmf + LDFD f32 = [CO] + nop __LINE__ + FADD f8 = f8, f9 + } + { .mmf + nop __LINE__ + nop __LINE__ + FADD f10 = f10, f11 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + FADD f12 = f12, f13 + } + { .mmf + nop __LINE__ + nop __LINE__ + FADD f14 = f14, f15 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + FADD f8 = f8, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + FADD f10 = f10, f14 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + FADD f8 = f8, f10 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + FMA f32 = ALPHA, f8, f32 + } + ;; + { .mmf + STFD [CO] = f32 + 
nop __LINE__ + nop __LINE__ + } + ;; + .align 16 + +.L199: + adds IS = P, IS + shladd A = LDAP, BASE_SHIFT, A + ;; + cmp.gt p6, p0 = M, IS + (p6) br.cond.dptk .LIs_loop + .align 4 + ;; + +.L999: + mov r8 = r0 + adds r9 = 1 * 16, SP + ;; + ldf.fill f16 = [SP], 32 + ldf.fill f17 = [r9], 32 + mov ar.lc = ARLC + ;; + ldf.fill f18 = [SP], 32 + ldf.fill f19 = [r9], 32 + mov pr = PR, -1 + ;; + ldf.fill f20 = [SP], 32 + ldf.fill f21 = [r9], 32 + mov ar.pfs = ARPFS + ;; + ldf.fill f22 = [SP], 32 + ldf.fill f23 = [r9] + br.ret.sptk.many b0 + ;; + EPILOGUE diff --git a/kernel/ia64/iamax.S b/kernel/ia64/iamax.S new file mode 100644 index 0000000000..a091675df8 --- /dev/null +++ b/kernel/ia64/iamax.S @@ -0,0 +1,639 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef XDOUBLE +#define PREFETCH_SIZE ( 8 * 16 + 4) +#elif defined(DOUBLE) +#define PREFETCH_SIZE (16 * 16 + 8) +#else +#define PREFETCH_SIZE (32 * 16 + 16) +#endif + +#if !defined(USE_MIN) && defined(USE_ABS) +#define FMAX famax +#elif !defined(USE_MIN) && !defined(USE_ABS) +#define FMAX fmax +#elif defined(USE_MIN) && defined(USE_ABS) +#define FMAX famin +#else +#define FMAX fmin +#endif + +#define IMAX1 r8 +#define IMAX2 r26 +#define IMAX3 r27 +#define IMAX4 r28 + +#define PRE1 r2 + +#define N r14 +#define X1 r15 +#define INCX r16 + +#define I r17 +#define X2 r18 +#define INCX5 r19 +#define INCX16 r20 +#define CURRENT r21 + +#define DMAX1 f8 +#define DMAX2 f9 +#define DMAX3 f10 +#define DMAX4 f11 +#define DMAX5 f12 +#define DMAX6 f13 +#define DMAX7 f14 +#define DMAX8 f15 + +#define PR r30 +#define ARLC r31 + + PROLOGUE + .prologue + PROFCODE + + { .mmi + mov IMAX1 = 0 + .save ar.lc, ARLC + mov ARLC = ar.lc + } + ;; + .body + +#ifdef F_INTERFACE + { .mmi + LDINT N = [r32] + LDINT INCX = [r34] + mov X1 = r33 + } + ;; +#ifndef USE64BITINT + { .mii + nop.m 0 + sxt4 N = N + sxt4 INCX = INCX + } + ;; +#endif +#else + { .mmi + mov N = r32 + mov X1 = r33 + mov INCX = r34 + } + ;; +#endif + + { .mii + mov PR = pr + cmp.ge p6, p0 = 0, INCX + } + { .mbb + cmp.ge p8, p0 = 0, N + (p8) br.ret.sptk.many b0 + (p6) br.ret.sptk.many b0 + } + ;; + { .mmi + LDFD DMAX1 = [X1] + shladd INCX = INCX, BASE_SHIFT, r0 + mov pr.rot= 0 + } + ;; + mov IMAX1 = 1 + mov IMAX2 = 1 + mov IMAX3 = 1 + mov IMAX4 = 1 + mov CURRENT = 1 + adds N = -1, N + ;; + + { .mmf + add X1 = X1, INCX + mov DMAX2 = DMAX1 + } + ;; + { .mmf + shladd X2 = INCX, 2, X1 + } + { .mfi + cmp.eq p16, p0 = r0, r0 + shr I = N, 4 + } + ;; + { .mfi + shladd INCX5 = INCX, 2, INCX + mov DMAX3 = DMAX1 + mov ar.ec= 4 + } + { .mmf +#ifdef XDOUBLE + shladd INCX16= INCX, 3, r0 +#else + shladd INCX16= INCX, 4, r0 +#endif + adds I = -1, I + } + ;; + tbit.z p0, p7 = N, 3 + ;; + { .mfi + adds PRE1 = PREFETCH_SIZE * SIZE, X1 + mov DMAX4 = DMAX1 + mov ar.lc = I + } + { .mfb + cmp.eq p6 ,p0 = -1, I + (p6) br.cond.dpnt .L15 + } + .align 32 + ;; +.L10: + { .mmf + (p16) lfetch.nt1 [PRE1], INCX16 + (p16) LDFD f32 = [X1], INCX + (p19) fcmp.neq.unc p12, p0 = DMAX1, DMAX5 + } + { .mmf + (p8) adds IMAX1 = 1, CURRENT + nop __LINE__ + (p19) FMAX DMAX5 = f67, DMAX1 + } + ;; + { .mmf + (p16) LDFD f36 = [X1], INCX + nop __LINE__ + (p19) fcmp.neq.unc p13, p0 = DMAX2, DMAX6 + } + { .mmf + (p9) adds IMAX2 = 2, CURRENT + nop __LINE__ + (p19) FMAX DMAX6 = f71, DMAX2 + } + ;; + { .mmf + (p16) LDFD f40 = [X1], INCX + nop __LINE__ + (p19) fcmp.neq.unc p14, p0 = DMAX3, DMAX7 + } + { .mmf + (p10) adds IMAX3 = 3, CURRENT + nop __LINE__ + (p19) FMAX DMAX7 = f75, DMAX3 + } + ;; + { .mmf + (p16) LDFD f44 = [X1], INCX + nop __LINE__ + (p19) fcmp.neq.unc p15, p0 = DMAX4, DMAX8 + } + { .mmf + (p11) adds IMAX4 = 4, CURRENT + nop __LINE__ + (p19) FMAX DMAX8 = f79, DMAX4 + } + ;; + { .mmf + (p16) LDFD f48 = [X1], INCX + nop __LINE__ + (p19) fcmp.neq.unc p8, p0 = DMAX1, DMAX5 + } + { .mmf + (p12) adds IMAX1 = 5, CURRENT + nop __LINE__ + (p19) FMAX DMAX1 = f83, DMAX5 + } + ;; + { .mmf + (p16) LDFD f52 = [X1], INCX + nop __LINE__ + (p19) fcmp.neq.unc p9, p0 = DMAX2, DMAX6 + } + { .mmf + (p13) adds IMAX2 = 6, CURRENT + nop __LINE__ + (p19) FMAX DMAX2 = f87, DMAX6 + } + ;; + { .mmf + (p16) LDFD f56 = [X1], INCX + nop __LINE__ + (p19) fcmp.neq.unc p10, p0 = DMAX3, DMAX7 + } + { .mmf + 
(p14) adds IMAX3 = 7, CURRENT + nop __LINE__ + (p19) FMAX DMAX3 = f91, DMAX7 + } + ;; + { .mmf + (p16) LDFD f60 = [X1], INCX + nop __LINE__ + (p19) fcmp.neq.unc p11, p0 = DMAX4, DMAX8 + } + { .mmf + (p15) adds IMAX4 = 8, CURRENT + nop __LINE__ + (p19) FMAX DMAX4 = f95, DMAX8 + } + ;; + { .mmf +#ifdef XDOUBLE + (p16) lfetch.nt1 [PRE1], INCX16 +#endif + (p16) LDFD f64 = [X1], INCX +#ifndef XDOUBLE + nop __LINE__ +#endif + (p19) fcmp.neq.unc p12, p0 = DMAX1, DMAX5 + } + { .mmf + (p8) adds IMAX1 = 9, CURRENT + nop __LINE__ + (p18) FMAX DMAX5 = f34, DMAX1 + } + ;; + { .mmf + (p16) LDFD f68 = [X1], INCX + nop __LINE__ + (p19) fcmp.neq.unc p13, p0 = DMAX2, DMAX6 + } + { .mmf + (p9) adds IMAX2 = 10, CURRENT + nop __LINE__ + (p18) FMAX DMAX6 = f38, DMAX2 + } + ;; + { .mmf + (p16) LDFD f72 = [X1], INCX + nop __LINE__ + (p19) fcmp.neq.unc p14, p0 = DMAX3, DMAX7 + } + { .mmf + (p10) adds IMAX3 = 11, CURRENT + nop __LINE__ + (p18) FMAX DMAX7 = f42, DMAX3 + } + ;; + { .mmf + (p16) LDFD f76 = [X1], INCX + nop __LINE__ + (p19) fcmp.neq.unc p15, p0 = DMAX4, DMAX8 + } + { .mmf + (p11) adds IMAX4 = 12, CURRENT + nop __LINE__ + (p18) FMAX DMAX8 = f46, DMAX4 + } + ;; + { .mmf + (p16) LDFD f80 = [X1], INCX + nop __LINE__ + (p18) fcmp.neq.unc p8, p0 = DMAX1, DMAX5 + } + { .mmf + (p12) adds IMAX1 = 13, CURRENT + nop __LINE__ + (p18) FMAX DMAX1 = f50, DMAX5 + } + ;; + { .mmf + (p16) LDFD f84 = [X1], INCX + nop __LINE__ + (p18) fcmp.neq.unc p9, p0 = DMAX2, DMAX6 + } + { .mmf + (p13) adds IMAX2 = 14, CURRENT + nop __LINE__ + (p18) FMAX DMAX2 = f54, DMAX6 + } + ;; + { .mmf + (p16) LDFD f88 = [X1], INCX + nop __LINE__ + (p18) fcmp.neq.unc p10, p0 = DMAX3, DMAX7 + } + { .mmf + (p14) adds IMAX3 = 15, CURRENT + nop __LINE__ + (p18) FMAX DMAX3 = f58, DMAX7 + } + ;; + { .mmf + (p16) LDFD f92 = [X1], INCX + (p15) adds IMAX4 = 16, CURRENT + (p18) fcmp.neq.unc p11, p0 = DMAX4, DMAX8 + } + { .mfb + (p19) adds CURRENT = 16, CURRENT + (p18) FMAX DMAX4 = f62, DMAX8 + br.ctop.sptk.few .L10 + } + ;; + .align 32 + +.L15: + { .mmi + (p7) LDFD f32 = [X1], INCX + and I = 15, N + cmp.ne p14, p0 = r0, r0 + } + ;; + { .mmb + (p7) LDFD f33 = [X1], INCX + cmp.eq p6, p0 = 0, I + (p6) br.cond.dptk .L999 + } + ;; + { .mmi + (p7) LDFD f34 = [X1], INCX + ;; + (p7) LDFD f35 = [X1], INCX + tbit.z p0, p13 = N, 2 + } + ;; + { .mmi + (p7) LDFD f36 = [X1], INCX + ;; + (p7) LDFD f37 = [X1], INCX + tbit.z p0, p14 = N, 1 + } + ;; + { .mfi + (p7) LDFD f38 = [X1], INCX + (p7) FMAX DMAX5 = f32, DMAX1 + tbit.z p0, p15 = N, 0 + } + ;; + { .mmf + (p7) LDFD f39 = [X1], INCX + nop __LINE__ + (p7) FMAX DMAX6 = f33, DMAX2 + } + ;; + { .mmf + (p13) LDFD f40 = [X1], INCX + nop __LINE__ + (p7) FMAX DMAX7 = f34, DMAX3 + } + ;; + { .mmf + (p13) LDFD f41 = [X1], INCX + nop __LINE__ + (p7) FMAX DMAX8 = f35, DMAX4 + } + ;; + { .mmf + (p13) LDFD f42 = [X1], INCX + nop __LINE__ + (p7) fcmp.neq.unc p8, p0 = DMAX1, DMAX5 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p7) FMAX DMAX1 = f36, DMAX5 + } + ;; + { .mmf + (p13) LDFD f43 = [X1], INCX + nop __LINE__ + (p7) fcmp.neq.unc p9, p0 = DMAX2, DMAX6 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p7) FMAX DMAX2 = f37, DMAX6 + } + ;; + { .mmf + (p14) LDFD f44 = [X1], INCX + nop __LINE__ + (p7) fcmp.neq.unc p10, p0 = DMAX3, DMAX7 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p7) FMAX DMAX3 = f38, DMAX7 + } + ;; + { .mmf + (p14) LDFD f45 = [X1], INCX + nop __LINE__ + (p7) fcmp.neq.unc p11, p0 = DMAX4, DMAX8 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p7) FMAX DMAX4 = f39, DMAX8 + } + ;; + { .mmf + (p15) LDFD f46 = [X1], INCX + (p8) 
adds IMAX1 = 1, CURRENT + (p7) fcmp.neq.unc p8, p0 = DMAX1, DMAX5 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMAX DMAX5 = f40, DMAX1 + } + { .mmf + (p9) adds IMAX2 = 2, CURRENT + nop __LINE__ + (p7) fcmp.neq.unc p9, p0 = DMAX2, DMAX6 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMAX DMAX6 = f41, DMAX2 + } + { .mmf + (p10) adds IMAX3 = 3, CURRENT + nop __LINE__ + (p7) fcmp.neq.unc p10, p0 = DMAX3, DMAX7 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMAX DMAX7 = f42, DMAX3 + } + { .mmf + (p11) adds IMAX4 = 4, CURRENT + nop __LINE__ + (p7) fcmp.neq.unc p11, p0 = DMAX4, DMAX8 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMAX DMAX8 = f43, DMAX4 + } + ;; + { .mmf + (p8) adds IMAX1 = 5, CURRENT + nop __LINE__ + (p13) fcmp.neq.unc p8, p0 = DMAX1, DMAX5 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) mov DMAX1 = DMAX5 + } + { .mmf + (p9) adds IMAX2 = 6, CURRENT + nop __LINE__ + (p13) fcmp.neq.unc p9, p0 = DMAX2, DMAX6 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) mov DMAX2 = DMAX6 + } + { .mmf + (p10) adds IMAX3 = 7, CURRENT + nop __LINE__ + (p13) fcmp.neq.unc p10, p0 = DMAX3, DMAX7 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) mov DMAX3 = DMAX7 + } + { .mmf + (p11) adds IMAX4 = 8, CURRENT + nop __LINE__ + (p13) fcmp.neq.unc p11, p0 = DMAX4, DMAX8 + } + { .mmf + (p7) adds CURRENT = 8, CURRENT + nop __LINE__ + (p13) mov DMAX4 = DMAX8 + } + ;; + { .mmf + (p8) adds IMAX1 = 1, CURRENT + nop __LINE__ + (p14) FMAX DMAX5 = f44, DMAX1 + } + { .mmf + (p9) adds IMAX2 = 2, CURRENT + (p10) adds IMAX3 = 3, CURRENT + (p14) FMAX DMAX6 = f45, DMAX2 + } + { .mmf + (p11) adds IMAX4 = 4, CURRENT + (p13) adds CURRENT = 4, CURRENT + (p15) FMAX DMAX7 = f46, DMAX3 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + (p14) fcmp.neq.unc p8, p0 = DMAX5, DMAX1 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p14) mov DMAX1 = DMAX5 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p14) fcmp.neq.unc p9, p0 = DMAX6, DMAX2 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p14) mov DMAX2 = DMAX6 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p15) fcmp.neq.unc p10, p0 = DMAX7, DMAX3 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p15) mov DMAX3 = DMAX7 + } + ;; +.L999: + { .mmf + (p8) adds IMAX1 = 1, CURRENT + nop __LINE__ + FMAX DMAX5 = DMAX2, DMAX1 + } + { .mmf + (p9) adds IMAX2 = 2, CURRENT + (p14) adds CURRENT = 2, CURRENT + FMAX DMAX6 = DMAX4, DMAX3 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + fcmp.neq p12, p0 = DMAX5, DMAX1 + } + { .mmf + (p10) adds IMAX3 = 1, CURRENT + nop __LINE__ + fcmp.neq p13, p0 = DMAX6, DMAX3 + } + ;; + { .mmf + (p12) mov IMAX1 = IMAX2 + (p13) mov IMAX3 = IMAX4 + FMAX DMAX1 = DMAX6, DMAX5 + } + ;; + { .mfi + nop __LINE__ + fcmp.neq p12, p0 = DMAX1, DMAX5 + mov ar.lc = ARLC + } + ;; + { .mib + (p12) mov IMAX1 = IMAX3 + mov pr = PR, -65474 + br.ret.sptk.many b0 + } + ;; + EPILOGUE + diff --git a/kernel/ia64/izamax.S b/kernel/ia64/izamax.S new file mode 100644 index 0000000000..c43bccaf67 --- /dev/null +++ b/kernel/ia64/izamax.S @@ -0,0 +1,579 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef XDOUBLE +#define PREFETCH_SIZE ( 8 * 16 + 4) +#elif defined(DOUBLE) +#define PREFETCH_SIZE (16 * 16 + 8) +#else +#define PREFETCH_SIZE (32 * 16 + 16) +#endif + +#ifdef USE_MIN +#define CMPUNC cmp.lt.unc +#define CMP cmp.lt +#else +#define CMPUNC cmp.gt.unc +#define CMP cmp.gt +#endif + +#define RET r8 + +#define N r32 +#define DX r33 +#define INCX r34 + +#define PRE1 r2 + +#define I r14 +#define J r15 +#define K r16 +#define TMP r17 +#define INCXM1 r18 +#define INCX8 r19 +#define MAX1 r20 +#define DMAX1 r21 +#define DATA1 r22 +#define DATA2 r23 +#define DATA3 r24 +#define DATA4 r25 +#define DATA5 r26 +#define DATA6 r27 +#define DATA7 r28 +#define DATA8 r29 + +#define PR r30 +#define ARLC r31 + + PROLOGUE + .prologue + PROFCODE + { .mmi + mov MAX1 = -1 + mov DMAX1 = 0 + .save ar.lc, ARLC + mov ARLC = ar.lc + } + .body + +#ifdef F_INTERFACE + { .mmi + LDINT N = [N] + LDINT INCX = [INCX] + nop.i 0 + } + ;; +#ifndef USE64BITINT + { .mii + nop.m 0 + sxt4 N = N + sxt4 INCX = INCX + } + ;; +#endif +#endif + + { .mii + adds K = -1, N + shl INCX = INCX, ZBASE_SHIFT + mov PR = pr + } + { .mmb + cmp.ge p8, p0 = 0, N + (p8) br.cond.dptk .L999 + } + ;; + { .mib + cmp.ge p6, p0 = 0, INCX + mov pr.rot= 0 + (p6) br.cond.dptk .L999 + } + ;; + { .mmi + LDFD f6 = [DX], SIZE + adds INCXM1 = - SIZE, INCX + mov ar.ec= 5 + } + ;; + { .mmi + LDFD f7 = [DX], INCXM1 + mov MAX1 = 0 + mov I = 1 + } + ;; + { .mfi + cmp.eq p16, p0 = r0, r0 + fabs f6 = f6 + shr J = K, 3 + } + { .mmf + nop.m 0 + nop.m 0 + fabs f7 = f7 + } + ;; + { .mmi + cmp.ne p8, p0 = r0, r0 + adds J = -1, J + shladd INCX8 = INCX, 3, r0 + } + { .mmf + nop.m 0 + nop.m 0 + FADD f6 = f6, f7 + } + ;; + { .mmi + getf.d DMAX1 = f6 + adds PRE1 = PREFETCH_SIZE * SIZE, DX + mov ar.lc = J + } + { .mib + cmp.eq p7 ,p0 = -1, J + tbit.z p0, p13 = K, 2 + (p7) br.cond.dpnt .L15 + } + .align 32 + ;; +.L10: + { .mmf + (p16) lfetch.nt1 [PRE1], INCX8 + (p16) LDFD f32 = [DX], SIZE + (p19) fabs f35 = f35 + } + { .mmf + (p8 ) mov DMAX1 = DATA1 + nop.m 0 + (p19) fabs f40 = f40 + } + ;; + { .mmf + (p20) getf.d DATA5 = 
f12 + (p16) LDFD f37 = [DX], INCXM1 + (p20) FADD f14 = f96, f101 + } + { .mmi + (p8 ) adds MAX1 = 0, I + (p20) CMPUNC p8, p0 = DATA2, DMAX1 + nop.i 0 + } + ;; + { .mmf + (p16) LDFD f42 = [DX], SIZE + (p8 ) mov DMAX1 = DATA2 + (p19) fabs f45 = f45 + } + { .mmf + nop.m 0 + nop.m 0 + (p19) fabs f50 = f50 + } + ;; + { .mmf + (p20) getf.d DATA6 = f13 + (p16) LDFD f47 = [DX], INCXM1 + (p20) FADD f15 = f106, f111 + } + { .mmi + (p8 ) adds MAX1 = 1, I + (p20) CMPUNC p8, p0 = DATA3, DMAX1 + nop.i 0 + } + ;; + { .mmf + (p16) LDFD f52 = [DX], SIZE + (p8 ) mov DMAX1 = DATA3 + (p19) fabs f55 = f55 + } + { .mmf + nop.m 0 + nop.m 0 + (p19) fabs f60 = f60 + } + ;; + { .mmf + (p20) getf.d DATA7 = f14 + (p16) LDFD f57 = [DX], INCXM1 + (p19) FADD f8 = f35, f40 + } + { .mmi + (p8 ) adds MAX1 = 2, I + (p20) CMPUNC p8, p0 = DATA4, DMAX1 + nop.i 0 + } + ;; + { .mmf + (p16) LDFD f62 = [DX], SIZE + (p8 ) mov DMAX1 = DATA4 + (p19) fabs f65 = f65 + } + { .mmf + nop.m 0 + nop.m 0 + (p19) fabs f70 = f70 + } + ;; + { .mmf + (p20) getf.d DATA8 = f15 + (p16) LDFD f67 = [DX], INCXM1 + (p19) FADD f9 = f45, f50 + } + { .mmi + (p8 ) adds MAX1 = 3, I + (p20) CMPUNC p8, p0 = DATA5, DMAX1 + nop.i 0 + } + ;; + { .mmf + (p16) LDFD f72 = [DX], SIZE + (p8 ) mov DMAX1 = DATA5 + (p19) fabs f75 = f75 + } + { .mmf + nop.m 0 + nop.m 0 + (p19) fabs f80 = f80 + } + ;; + { .mmf + (p19) getf.d DATA1 = f8 + (p16) LDFD f77 = [DX], INCXM1 + (p19) FADD f10 = f55, f60 + } + { .mmi + (p8 ) adds MAX1 = 4, I + (p20) CMPUNC p8, p0 = DATA6, DMAX1 + nop.i 0 + } + ;; + { .mmf + (p16) LDFD f82 = [DX], SIZE + (p8 ) mov DMAX1 = DATA6 + (p19) fabs f85 = f85 + } + { .mmf + nop.m 0 + nop.m 0 + (p19) fabs f90 = f90 + } + ;; + { .mmf + (p19) getf.d DATA2 = f9 + (p16) LDFD f87 = [DX], INCXM1 + (p19) FADD f11 = f65, f70 + } + { .mmi + (p8 ) adds MAX1 = 5, I + (p20) CMPUNC p8, p0 = DATA7, DMAX1 + nop.i 0 + } + ;; + { .mmf + (p16) LDFD f92 = [DX], SIZE + (p8 ) mov DMAX1 = DATA7 + (p19) fabs f95 = f95 + } + { .mmf + mov TMP = I + nop.m 0 + (p19) fabs f100 = f100 + } + ;; + { .mmf + (p19) getf.d DATA3 = f10 + (p16) LDFD f97 = [DX], INCXM1 + (p19) FADD f12 = f75, f80 + } + { .mmi + (p8 ) adds MAX1 = 6, I + (p20) CMPUNC p8, p0 = DATA8, DMAX1 + nop.i 0 + } + ;; + { .mmf + (p16) LDFD f102 = [DX], SIZE + (p8 ) mov DMAX1 = DATA8 + (p19) fabs f105 = f105 + } + { .mmf + (p20) adds I = 8, I + nop.m 0 + (p19) fabs f110 = f110 + } + ;; + { .mmi + (p19) getf.d DATA4 = f11 + (p16) LDFD f107 = [DX], INCXM1 + (p8 ) adds MAX1 = 7, TMP + } + { .mfb + (p19) CMPUNC p8, p0 = DATA1, DMAX1 + (p19) FADD f13 = f85, f90 + br.ctop.sptk.few .L10 + } + ;; + .align 32 + +.L15: + { .mmi + (p13) LDFD f32 = [DX], SIZE + and J = 7, K + mov pr = PR, -65474 + } + ;; + { .mmb + (p13) LDFD f33 = [DX], INCXM1 + cmp.eq p8 ,p0 = r0, J + (p8) br.cond.dpnt .L999 + } + ;; + { .mmi + (p13) LDFD f34 = [DX], SIZE + ;; + (p13) LDFD f35 = [DX], INCXM1 + nop.i 0 + } + ;; + { .mmi + (p13) LDFD f36 = [DX], SIZE + ;; + (p13) LDFD f37 = [DX], INCXM1 + nop.i 0 + } + ;; + { .mfi + (p13) LDFD f38 = [DX], SIZE + (p13) fabs f32 = f32 + tbit.z p0, p14 = K, 1 + } + ;; + { .mmf + (p13) LDFD f39 = [DX], INCXM1 + nop.m 0 + (p13) fabs f33 = f33 + } + ;; + { .mmf + (p14) LDFD f40 = [DX], SIZE + nop.m 0 + (p13) fabs f34 = f34 + } + ;; + { .mfi + (p14) LDFD f41 = [DX], INCXM1 + (p13) fabs f35 = f35 + tbit.z p0, p15 = K, 0 + } + ;; + { .mmf + (p14) LDFD f42 = [DX], SIZE + nop.m 0 + (p13) fabs f36 = f36 + } + ;; + { .mmf + (p14) LDFD f43 = [DX], INCXM1 + nop.m 0 + (p13) fabs f37 = f37 + } + { .mmf + nop.m 0 + nop.m 0 + (p13) FADD f32 
= f32, f33 + } + ;; + { .mmf + (p15) LDFD f44 = [DX], SIZE + nop.m 0 + (p13) fabs f38 = f38 + } + ;; + { .mmf + (p15) LDFD f45 = [DX], INCXM1 + nop.m 0 + (p13) fabs f39 = f39 + } + { .mmf + nop.m 0 + nop.m 0 + (p13) FADD f34 = f34, f35 + } + ;; + { .mmf + nop.m 0 + nop.m 0 + (p14) fabs f40 = f40 + } + ;; + { .mmf + (p13) getf.d DATA1 = f32 + nop.m 0 + (p14) fabs f41 = f41 + } + { .mmf + nop.m 0 + nop.m 0 + (p13) FADD f36 = f36, f37 + } + ;; + { .mmf + nop.m 0 + nop.m 0 + (p14) fabs f42 = f42 + } + ;; + { .mmf + (p13) getf.d DATA2 = f34 + nop.m 0 + (p14) fabs f43 = f43 + } + { .mmf + nop.m 0 + nop.m 0 + (p13) FADD f38 = f38, f39 + } + ;; + { .mmf + nop.m 0 + nop.m 0 + (p15) fabs f44 = f44 + } + ;; + { .mmf + (p13) getf.d DATA3 = f36 + nop.m 0 + (p15) fabs f45 = f45 + } + { .mmf + nop.m 0 + nop.m 0 + (p14) FADD f40 = f40, f41 + } + ;; + { .mmf + (p13) getf.d DATA4 = f38 + nop.m 0 + (p14) FADD f42 = f42, f43 + } + ;; + { .mmf + (p14) getf.d DATA5 = f40 + nop.m 0 + (p15) FADD f44 = f44, f45 + } + ;; + { .mmi + (p14) getf.d DATA6 = f42 + nop.m 0 + (p13) CMPUNC p8, p0 = DATA1, DMAX1 + } + ;; + { .mmi + (p15) getf.d DATA7 = f44 + (p8 ) adds MAX1 = 0, I + (p8 ) mov DMAX1 = DATA1 + } + ;; + { .mmi + (p13) CMPUNC p8, p0 = DATA2, DMAX1 + ;; + (p8 ) adds MAX1 = 1, I + (p8 ) mov DMAX1 = DATA2 + } + ;; + { .mmi + (p13) CMPUNC p8, p0 = DATA3, DMAX1 + ;; + (p8 ) adds MAX1 = 2, I + (p8 ) mov DMAX1 = DATA3 + } + ;; + { .mmi + (p13) CMPUNC p8, p0 = DATA4, DMAX1 + ;; + (p8 ) adds MAX1 = 3, I + (p8 ) mov DMAX1 = DATA4 + }{ .mmi + (p13) adds I = 4, I + nop.m 0 + nop.i 0 + } + ;; + { .mmi + (p14) CMPUNC p8, p0 = DATA5, DMAX1 + ;; + (p8 ) adds MAX1 = 0, I + (p8 ) mov DMAX1 = DATA5 + } + ;; + { .mmi + (p14) CMPUNC p8, p0 = DATA6, DMAX1 + ;; + (p8 ) adds MAX1 = 1, I + (p8 ) mov DMAX1 = DATA6 + }{ .mmi + (p14) adds I = 2, I + nop.m 0 + nop.i 0 + } + ;; + { .mmi + (p15) CMPUNC p8, p0 = DATA7, DMAX1 + ;; + (p8) adds MAX1 = 0, I + (p8) mov DMAX1 = DATA7 + } + ;; + .align 32 + +.L999: + { .mmi + setf.d f8 = DMAX1 + adds RET = 1, MAX1 + mov ar.lc = ARLC + } + { .mmb + nop.m 0 + nop.m 0 + br.ret.sptk.many b0 + } + EPILOGUE + diff --git a/kernel/ia64/lsame.S b/kernel/ia64/lsame.S new file mode 100644 index 0000000000..3f2a7dbe03 --- /dev/null +++ b/kernel/ia64/lsame.S @@ -0,0 +1,66 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
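Note on the izamax.S kernel above: as in the reference BLAS, the complex I*AMAX routines rank elements by |Re(x_i)| + |Im(x_i)| rather than by the true modulus, which is why the inner loop performs two fabs and one FADD per element. The running maxima are then compared through their integer bit patterns (getf.d followed by cmp.gt), which orders IEEE doubles correctly because the sums are nonnegative. A minimal C sketch of the semantics (names are illustrative):

    #include <math.h>

    /* 1-based index of the first complex element with the largest
       |Re| + |Im| (the CABS1 measure used by IZAMAX/ICAMAX). */
    long izamax_ref(long n, const double *x, long incx)  /* x holds n (re, im) pairs */
    {
        if (n <= 0 || incx <= 0) return 0;
        long   imax = 1;
        double dmax = fabs(x[0]) + fabs(x[1]);
        for (long i = 1; i < n; i++) {
            double v = fabs(x[2 * i * incx]) + fabs(x[2 * i * incx + 1]);
            if (v > dmax) { dmax = v; imax = i + 1; }
        }
        return imax;
    }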
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + + PROLOGUE + PROFCODE + + .prologue + .body + ld1 r14 = [r32] + ld1 r15 = [r33] + ;; + adds r16 = -32, r14 // a1 = a - 32 + adds r17 = -32, r15 // b1 = b - 32 + ;; + cmp4.ge p6, p7 = 96, r14 // if (a > 96) + cmp4.ge p8, p9 = 96, r15 // if (b > 96) + ;; + (p7) mov r14 = r16 + (p9) mov r15 = r17 + ;; + cmp4.eq p6, p7 = r15, r14 + mov r8 = 1 + ;; + (p7) mov r8 = 0 + br.ret.sptk.many b0 + + EPILOGUE + diff --git a/kernel/ia64/nrm2.S b/kernel/ia64/nrm2.S new file mode 100644 index 0000000000..bb88cfb898 --- /dev/null +++ b/kernel/ia64/nrm2.S @@ -0,0 +1,310 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
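Note on the lsame.S kernel above: LSAME is the case-insensitive single-character comparison used for argument checking throughout BLAS and LAPACK. The assembly folds ASCII lowercase onto uppercase by subtracting 32 whenever the character code is above 96, then compares. Equivalent C (names illustrative):

    /* Case-insensitive comparison of two single characters (Fortran LSAME).
       Only ASCII folding is performed, matching the assembly above. */
    int lsame_ref(const char *a, const char *b)
    {
        int ca = (unsigned char)*a;
        int cb = (unsigned char)*b;
        if (ca > 96) ca -= 32;   /* 'a'..'z' -> 'A'..'Z' */
        if (cb > 96) cb -= 32;
        return ca == cb;
    }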
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef XDOUBLE +#define PREFETCH_SIZE ( 8 * 16) +#elif defined(DOUBLE) +#define PREFETCH_SIZE (16 * 16) +#else +#define PREFETCH_SIZE (32 * 16) +#endif + +#ifndef COMPLEX +#define COMPADD 0 +#define STRIDE INCX +#else +#define COMPADD 1 +#define STRIDE SIZE +#endif + +#define PRE1 r2 + +#define I r17 +#define J r18 +#define X2 r19 +#define INCX5 r20 +#define INCX16 r21 + +#define N r32 +#define X r33 +#define INCX r34 +#define PR r30 +#define ARLC r31 + + + PROLOGUE + .prologue + PROFCODE + { .mfi + adds PRE1 = PREFETCH_SIZE * SIZE, X + mov f8 = f0 + .save ar.lc, ARLC + mov ARLC = ar.lc + } + ;; + .body +#ifdef F_INTERFACE + LDINT N = [N] + LDINT INCX = [INCX] + ;; +#ifndef USE64BITINT + sxt4 N = N + sxt4 INCX = INCX + ;; +#endif +#endif + { .mmi + cmp.ge p6, p0 = r0, N + cmp.ge p7, p0 = r0, INCX + shr I = N, (4 - COMPADD) + } + { .mbb + and J = ((1 << (4 - COMPADD)) - 1), N + (p6) br.ret.sptk.many b0 + (p7) br.ret.sptk.many b0 + } + ;; + { .mfi + mov f9 = f0 + mov PR = pr + } + { .mfi + adds I = -1, I + mov f10 = f0 + shl INCX = INCX, (BASE_SHIFT + COMPADD) + } + ;; + { .mfi + shladd X2 = INCX, (2 - COMPADD), X + mov f11 = f0 + mov pr.rot = 0 + } + { .mfi + shladd INCX5 = INCX, (2 - COMPADD), INCX + mov f12 = f0 + tbit.z p0, p12 = N, (3 - COMPADD) + } + ;; + { .mfi + shladd INCX16 = INCX, (4 - COMPADD), r0 + mov f13 = f0 + mov ar.ec= 3 + } + { .mmf + cmp.gt p8 ,p0 = r0, I + cmp.eq p16, p0 = r0, r0 + mov f14 = f0 + } + ;; + { .mmf +#ifdef COMPLEX + adds INCX = - SIZE, INCX + adds INCX5 = - SIZE, INCX5 +#else + nop.m 0 + nop.m 0 +#endif + mov f15 = f0 + } + { .mib + cmp.eq p9, p0 = r0, J + mov ar.lc = I + (p8) br.cond.dpnt .L52 + } + ;; + .align 32 + +.L51: + (p16) LDFD f32 = [X], STRIDE + (p16) lfetch.nt1 [PRE1], INCX16 + (p18) fma.d.s1 f8 = f34, f34, f8 + + (p16) LDFD f35 = [X2], STRIDE + (p18) fma.d.s1 f9 = f37, f37, f9 + nop.b 0 + ;; + (p16) LDFD f38 = [X], INCX + (p18) fma.d.s1 f10 = f40, f40, f10 + nop.b 0 + (p16) LDFD f41 = [X2], INCX + (p18) fma.d.s1 f11 = f43, f43, f11 + nop.b 0 + ;; + (p16) LDFD f44 = [X], STRIDE + (p18) fma.d.s1 f12 = f46, f46, f12 + nop.b 0 + (p16) LDFD f47 = [X2], STRIDE + (p18) fma.d.s1 f13 = f49, f49, f13 + nop.b 0 + ;; + (p16) LDFD f50 = [X], INCX5 + (p18) fma.d.s1 f14 = f52, f52, f14 + nop.b 0 + (p16) LDFD f53 = [X2], INCX5 + (p18) fma.d.s1 f15 = f55, f55, f15 + nop.b 0 + ;; + (p16) LDFD f56 = [X], STRIDE + (p18) fma.d.s1 f8 = f58, f58, f8 + nop.b 0 + (p16) LDFD f59 = [X2], STRIDE + (p18) fma.d.s1 f9 = f61, f61, f9 + nop.b 0 + ;; + (p16) LDFD f62 = [X], INCX + (p18) fma.d.s1 f10 = f64, f64, f10 + nop.b 0 + (p16) LDFD f65 = [X2], INCX + (p18) fma.d.s1 f11 = f67, f67, f11 + nop.b 0 + ;; + (p16) LDFD f68 = [X], STRIDE + (p18) fma.d.s1 f12 = f70, f70, f12 + nop.b 0 + (p16) LDFD f71 = [X2], STRIDE + (p18) fma.d.s1 f13 = f73, f73, f13 + nop.b 0 + ;; + (p16) LDFD f74 = [X], INCX5 + (p18) fma.d.s1 f14 = f76, f76, f14 + nop.b 0 + (p16) LDFD f77 = [X2], INCX5 + (p18) fma.d.s1 f15 = f79, f79, f15 + br.ctop.sptk.few .L51 + ;; + .align 32 + +.L52: + { .mmb + (p12) LDFD f32 = [X], STRIDE + (p12) LDFD f33 = [X2], STRIDE + (p9) br.cond.dptk .L998 + } + ;; + { .mmi + (p12) LDFD f34 = [X], INCX + (p12) LDFD f35 = [X2], INCX + tbit.z p0, p13 = N, (2 - COMPADD) + } + ;; + { .mmi + (p12) LDFD f36 = [X], STRIDE + (p12) LDFD f37 = [X2], STRIDE + tbit.z p0, p14 = N, (1 - COMPADD) + } + ;; + { .mmi + (p12) LDFD f38 = [X], INCX5 + (p12) LDFD f39 = [X2], INCX5 
+#ifndef COMPLEX + tbit.z p0, p15 = N, 0 +#endif + } + ;; + (p13) LDFD f40 = [X], STRIDE + (p12) fma.d.s1 f8 = f32, f32, f8 + (p12) fma.d.s1 f9 = f33, f33, f9 + ;; + (p13) LDFD f41 = [X], INCX + (p12) fma.d.s1 f10 = f34, f34, f10 + (p12) fma.d.s1 f11 = f35, f35, f11 + ;; + (p13) LDFD f42 = [X], STRIDE + (p12) fma.d.s1 f12 = f36, f36, f12 + (p12) fma.d.s1 f13 = f37, f37, f13 + ;; + (p13) LDFD f43 = [X], INCX + (p12) fma.d.s1 f14 = f38, f38, f14 + (p12) fma.d.s1 f15 = f39, f39, f15 + ;; + (p14) LDFD f44 = [X], STRIDE + (p13) fma.d.s1 f8 = f40, f40, f8 + (p13) fma.d.s1 f9 = f41, f41, f9 + ;; + (p14) LDFD f45 = [X], INCX + (p13) fma.d.s1 f10 = f42, f42, f10 + (p13) fma.d.s1 f11 = f43, f43, f11 + ;; +#ifndef COMPLEX + (p15) LDFD f46 = [X] +#endif + (p14) fma.d.s1 f12 = f44, f44, f12 + (p14) fma.d.s1 f13 = f45, f45, f13 + ;; +#ifndef COMPLEX + (p15) fma.d.s1 f14 = f46, f46, f14 + ;; +#endif + .align 32 + +.L998: + { .mmf + fadd.d.s1 f8 = f8, f9 + } + { .mmf + fadd.d.s1 f10 = f10, f11 + } + { .mmf + fadd.d.s1 f12 = f12, f13 + } + { .mfi + fadd.d.s1 f14 = f14, f15 + mov ar.lc = ARLC + } + ;; + { .mmf + fadd.d.s1 f8 = f8, f10 + } + { .mfi + fadd.d.s1 f12 = f12, f14 + mov pr = PR, -65474 + } + ;; + { .mfb + fadd.d.s1 f8 = f8, f12 + br sqrt + } + ;; + EPILOGUE + + .section .data + .type sqrt, @function + .global sqrt diff --git a/kernel/ia64/qaxpy.S b/kernel/ia64/qaxpy.S new file mode 100644 index 0000000000..2acb86b73c --- /dev/null +++ b/kernel/ia64/qaxpy.S @@ -0,0 +1,509 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
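Note on the nrm2.S kernel above: it computes the Euclidean norm as the square root of a plain sum of squares, accumulating eight interleaved partial sums (f8..f15), reducing them pairwise at .L998 and then branching to the external sqrt routine instead of returning. Unlike the scaled netlib reference it squares the elements directly; the COMPLEX build of the same source runs over both components of each element and so yields the complex 2-norm. A single-accumulator C sketch of the computation (illustrative only):

    #include <math.h>

    /* ||x||_2 of a strided vector, straightforward unscaled form. */
    double nrm2_ref(long n, const double *x, long incx)
    {
        if (n <= 0 || incx <= 0) return 0.0;
        double ssq = 0.0;
        for (long i = 0; i < n; i++) {
            double v = x[i * incx];
            ssq += v * v;        /* the kernel splits this over eight accumulators */
        }
        return sqrt(ssq);
    }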
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define PREFETCHSIZE (8 * 16) + +#define N r32 +#define X1 r38 +#define INCX r39 +#define Y1 r33 +#define INCY r34 + +#define PRE1 r2 +#define PRE2 r3 + +#define I r14 +#define J r15 +#define X2 r16 +#define Y2 r17 +#define X3 r18 +#define Y3 r19 +#define X4 r20 +#define Y4 r21 + +#define YY1 r22 +#define YY2 r23 +#define YY3 r24 +#define YY4 r25 + +#define INCX4 r8 +#define INCY4 r9 +#define INCX2 r10 +#define INCY2 r11 + +#define INCX8 r26 +#define INCY8 r27 + +#define PR r30 +#define ARLC r31 + +#define ALPHA f8 +#define SP r12 + + PROLOGUE + .prologue + PROFCODE + + { .mmi + adds r8 = 16, SP + adds r9 = 24, SP + .save ar.lc, ARLC + mov ARLC = ar.lc + } + { .mmb + adds PRE1 = (PREFETCHSIZE + 2) * SIZE, X1 + cmp.lt p0, p6 = r0, N + (p6) br.ret.sptk.many b0 + } + ;; + { .mmi + ld8 Y1 = [r8] + ld8 INCY = [r9] + mov PR = pr + } + ;; + .body + { .mmi + shladd INCX = INCX, BASE_SHIFT, r0 + shladd INCY = INCY, BASE_SHIFT, r0 + mov pr.rot = 0 + } + ;; + { .mmi + shladd INCX4 = INCX, 2, r0 + shladd INCY4 = INCY, 2, r0 + mov ar.ec = 3 + } + { .mmi + shladd INCX8 = INCX, 3, r0 + shladd INCY8 = INCY, 3, r0 + shr I = N, 4 + } + ;; + { .mmi + add X2 = INCX, X1 + add Y2 = INCY, Y1 + add YY2 = INCY, Y1 + } + ;; + { .mmi + shladd X3 = INCX, 1, X1 + shladd Y3 = INCY, 1, Y1 + shladd YY3 = INCY, 1, Y1 + } + { .mmi + shladd X4 = INCX, 1, X2 + shladd Y4 = INCY, 1, Y2 + shladd YY4 = INCY, 1, Y2 + } + ;; + { .mmi + cmp.eq p7 ,p0 = 0, I + adds I = -1, I + mov YY1 = Y1 + } + { .mmi + and r28 = 127, Y1 + and PRE1 = -128, PRE1 + cmp.eq p16, p0 = r0, r0 + } + ;; + { .mmi + adds PRE2 = (PREFETCHSIZE + 2) * SIZE, Y1 + or PRE1 = PRE1, r28 + mov ar.lc = I + } + { .mib + and J = 15, N + tbit.z p0, p12 = N, 3 + (p7) br.cond.dpnt .L115 + } + ;; + .align 32 + +.L112: + { .mmf + (p18) STFD [YY1] = f6 + (p18) STFD [YY2] = f7 + (p18) FMA f6 = ALPHA, f58, f106 + } + { .mmf + (p16) lfetch.excl.nt1 [PRE2], INCY8 + nop __LINE__ + (p18) FMA f7 = ALPHA, f61, f109 + } + ;; + { .mmf + (p18) STFD [YY3] = f10 + (p18) STFD [YY4] = f11 + (p18) FMA f10 = ALPHA, f64, f112 + } + { .mmf + (p16) lfetch.nt1 [PRE1], INCX8 + nop __LINE__ + (p18) FMA f11 = ALPHA, f67, f115 + } + ;; + { .mmi + (p16) LDFD f32 = [X1], INCX4 + (p16) LDFD f35 = [X2], INCX4 + (p18) add YY1 = INCY4, YY1 + } + { .mmi + (p16) LDFD f38 = [X3], INCX4 + (p16) LDFD f41 = [X4], INCX4 + (p18) add YY2 = INCY4, YY2 + } + ;; + { .mmi + (p17) LDFD f117 = [Y1], INCY4 + (p17) LDFD f120 = [Y2], INCY4 + (p18) add YY3 = INCY4, YY3 + } + { .mmi + (p17) LDFD f123 = [Y3], INCY4 + (p17) LDFD f126 = [Y4], INCY4 + (p18) add YY4 = INCY4, YY4 + } + ;; + { .mmf + (p18) STFD [YY1] = f12 + (p18) STFD [YY2] = f13 + (p18) FMA f12 = ALPHA, f70, f118 + } + { .mmf + (p18) add YY1 = INCY4, YY1 + (p18) add YY2 = INCY4, YY2 + (p18) FMA f13 = ALPHA, f73, f121 + } + ;; + { .mmf + (p18) STFD [YY3] = f14 + (p18) STFD [YY4] = f15 + (p18) FMA f14 = ALPHA, f76, f124 + } + { .mmf + (p18) add YY3 = INCY4, YY3 + (p18) add YY4 = INCY4, YY4 + (p18) FMA f15 = ALPHA, f79, f127 + } + ;; + { .mmi + (p16) LDFD f44 = [X1], INCX4 + (p16) LDFD f47 = [X2], INCX4 + nop __LINE__ + } + { .mmi + (p16) LDFD f50 = [X3], INCX4 + (p16) LDFD f53 = [X4], INCX4 + nop __LINE__ + } + ;; + { .mmi + (p16) LDFD f80 = [Y1], INCY4 + (p16) LDFD f83 = [Y2], INCY4 + nop __LINE__ + } + { .mmi + (p16) LDFD f86 = [Y3], INCY4 + (p16) LDFD f89 = [Y4], INCY4 + nop __LINE__ + } + ;; + { .mmf + (p18) STFD [YY1] = f6 + (p18) STFD [YY2] 
= f7 + (p17) FMA f6 = ALPHA, f33, f81 + } + { .mmf + (p16) lfetch.excl.nt1 [PRE2], INCY8 + nop __LINE__ + (p17) FMA f7 = ALPHA, f36, f84 + } + ;; + { .mmf + (p18) STFD [YY3] = f10 + (p18) STFD [YY4] = f11 + (p17) FMA f10 = ALPHA, f39, f87 + } + { .mmf + (p16) lfetch.nt1 [PRE1], INCX8 + nop __LINE__ + (p17) FMA f11 = ALPHA, f42, f90 + } + ;; + { .mmi + (p16) LDFD f56 = [X1], INCX4 + (p16) LDFD f59 = [X2], INCX4 + (p18) add YY1 = INCY4, YY1 + } + { .mmi + (p16) LDFD f62 = [X3], INCX4 + (p16) LDFD f65 = [X4], INCX4 + (p18) add YY2 = INCY4, YY2 + } + ;; + { .mmi + (p16) LDFD f92 = [Y1], INCY4 + (p16) LDFD f95 = [Y2], INCY4 + (p18) add YY3 = INCY4, YY3 + } + { .mmi + (p16) LDFD f98 = [Y3], INCY4 + (p16) LDFD f101 = [Y4], INCY4 + (p18) add YY4 = INCY4, YY4 + } + ;; + { .mmf + (p18) STFD [YY1] = f12 + (p18) STFD [YY2] = f13 + (p17) FMA f12 = ALPHA, f45, f93 + } + { .mmf + (p18) add YY1 = INCY4, YY1 + (p18) add YY2 = INCY4, YY2 + (p17) FMA f13 = ALPHA, f48, f96 + } + ;; + { .mmf + (p18) STFD [YY3] = f14 + (p18) STFD [YY4] = f15 + (p17) FMA f14 = ALPHA, f51, f99 + } + { .mmf + (p18) add YY3 = INCY4, YY3 + (p18) add YY4 = INCY4, YY4 + (p17) FMA f15 = ALPHA, f54, f102 + } + ;; + { .mmi + (p16) LDFD f68 = [X1], INCX4 + (p16) LDFD f71 = [X2], INCX4 + nop __LINE__ + } + { .mmi + (p16) LDFD f74 = [X3], INCX4 + (p16) LDFD f77 = [X4], INCX4 + nop __LINE__ + } + ;; + { .mmi + (p16) LDFD f104 = [Y1], INCY4 + (p16) LDFD f107 = [Y2], INCY4 + nop __LINE__ + } + { .mmb + (p16) LDFD f110 = [Y3], INCY4 + (p16) LDFD f113 = [Y4], INCY4 + br.ctop.sptk.few .L112 + } + ;; + .align 32 + +.L115: + { .mmi + (p12) LDFD f32 = [X1], INCX4 + (p12) LDFD f33 = [X2], INCX4 + mov pr = PR, -65474 + } + { .mmi + (p12) LDFD f34 = [X3], INCX4 + (p12) LDFD f35 = [X4], INCX4 + cmp.eq p9, p0 = r0, J + } + ;; + { .mmi + (p12) LDFD f64 = [Y1], INCY4 + (p12) LDFD f65 = [Y2], INCY4 + mov ar.lc = ARLC + } + { .mmb + (p12) LDFD f66 = [Y3], INCY4 + (p12) LDFD f67 = [Y4], INCY4 + (p9) br.ret.sptk.many b0 + } + ;; + { .mmi + (p12) LDFD f36 = [X1], INCX4 + (p12) LDFD f37 = [X2], INCX4 + tbit.z p0, p13 = N, 2 + } + { .mmi + (p12) LDFD f38 = [X3], INCX4 + (p12) LDFD f39 = [X4], INCX4 + tbit.z p0, p14 = N, 1 + } + ;; + { .mmi + (p12) LDFD f68 = [Y1], INCY4 + (p12) LDFD f69 = [Y2], INCY4 + tbit.z p0, p15 = N, 0 + } + { .mmi + (p12) LDFD f70 = [Y3], INCY4 + (p12) LDFD f71 = [Y4], INCY4 + nop __LINE__ + } + ;; + { .mmi + (p13) LDFD f40 = [X1], INCX4 + (p13) LDFD f41 = [X2], INCX4 + shladd INCX2 = INCX, 1, r0 + } + { .mmi + (p13) LDFD f42 = [X3], INCX4 + (p13) LDFD f43 = [X4], INCX4 + shladd INCY2 = INCY, 1, r0 + } + ;; + { .mmi + (p13) LDFD f72 = [Y1], INCY4 + (p13) LDFD f73 = [Y2], INCY4 + nop __LINE__ + } + { .mmi + (p13) LDFD f74 = [Y3], INCY4 + (p13) LDFD f75 = [Y4], INCY4 + nop __LINE__ + } + ;; + { .mmi + (p14) LDFD f44 = [X1], INCX2 + (p14) LDFD f45 = [X2], INCX2 + nop __LINE__ + } + ;; + { .mmi + (p14) LDFD f76 = [Y1], INCY2 + (p14) LDFD f77 = [Y2], INCY2 + nop __LINE__ + } + ;; + { .mmi + (p15) LDFD f46 = [X1] + (p15) LDFD f78 = [Y1] + nop __LINE__ + } + ;; + (p12) FMA f32 = ALPHA, f32, f64 + (p12) FMA f33 = ALPHA, f33, f65 + (p12) FMA f34 = ALPHA, f34, f66 + (p12) FMA f35 = ALPHA, f35, f67 + (p12) FMA f36 = ALPHA, f36, f68 + (p12) FMA f37 = ALPHA, f37, f69 + (p12) FMA f38 = ALPHA, f38, f70 + (p12) FMA f39 = ALPHA, f39, f71 + ;; + { .mmf + (p12) STFD [YY1] = f32 + (p12) STFD [YY2] = f33 + (p13) FMA f40 = ALPHA, f40, f72 + } + { .mmf + (p12) add YY1 = INCY4, YY1 + (p12) add YY2 = INCY4, YY2 + (p13) FMA f41 = ALPHA, f41, f73 + } + ;; + { .mmf + 
(p12) STFD [YY3] = f34 + (p12) STFD [YY4] = f35 + (p13) FMA f42 = ALPHA, f42, f74 + } + { .mmf + (p12) add YY3 = INCY4, YY3 + (p12) add YY4 = INCY4, YY4 + (p13) FMA f43 = ALPHA, f43, f75 + } + ;; + { .mmf + (p12) STFD [YY1] = f36 + (p12) STFD [YY2] = f37 + (p14) FMA f44 = ALPHA, f44, f76 + } + { .mmf + (p12) add YY1 = INCY4, YY1 + (p12) add YY2 = INCY4, YY2 + (p14) FMA f45 = ALPHA, f45, f77 + } + ;; + { .mmf + (p12) STFD [YY3] = f38 + (p12) STFD [YY4] = f39 + (p15) FMA f46 = ALPHA, f46, f78 + } + { .mmi + (p12) add YY3 = INCY4, YY3 + (p12) add YY4 = INCY4, YY4 + nop __LINE__ + } + ;; + { .mmi + (p13) STFD [YY1] = f40 + (p13) STFD [YY2] = f41 + nop __LINE__ + } + { .mmi + (p13) add YY1 = INCY4, YY1 + (p13) add YY2 = INCY4, YY2 + nop __LINE__ + } + ;; + { .mmi + (p13) STFD [YY3] = f42 + (p13) STFD [YY4] = f43 + nop __LINE__ + } + ;; + { .mmi + (p14) STFD [YY1] = f44 + (p14) STFD [YY2] = f45 + (p14) add YY1 = INCY2, YY1 + } + ;; + { .mmb + (p15) STFD [YY1] = f46 + nop __LINE__ + br.ret.sptk.many b0 + } + ;; + EPILOGUE + diff --git a/kernel/ia64/qcopy.S b/kernel/ia64/qcopy.S new file mode 100644 index 0000000000..9200470e32 --- /dev/null +++ b/kernel/ia64/qcopy.S @@ -0,0 +1,581 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
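Note on the qaxpy.S kernel above: the q-prefixed files in this import are the extended-precision (XDOUBLE) variants, so this routine computes y := alpha * x + y on 80-bit operands. The prologue fetches Y and INCY from the stack, and the pipelined loop handles 16 elements per iteration through four load streams while a second set of pointers (YY1..YY4) trails behind for the stores. The underlying operation, assuming positive increments (illustrative names):

    /* y := alpha * x + y for extended-precision vectors ("q" = XDOUBLE). */
    void qaxpy_ref(long n, long double alpha,
                   const long double *x, long incx,
                   long double *y, long incy)
    {
        if (n <= 0) return;
        for (long i = 0; i < n; i++)
            y[i * incy] += alpha * x[i * incx];
    }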
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r32 +#define X1 r33 +#define INCX r34 +#define Y1 r35 +#define INCY r36 + +#define PREX r2 +#define PREY r3 + +#define I r14 +#define J r15 + +#define X2 r16 +#define Y2 r17 +#define INCX2 r18 +#define INCY2 r19 +#define INCX8 r20 +#define INCY8 r21 +#define PR r30 +#define ARLC r31 + +#define PREFETCH_SIZE (8 * 16) + + PROLOGUE + .prologue + PROFCODE + { .mmi + shladd INCX = INCX, BASE_SHIFT, r0 + shladd INCY = INCY, BASE_SHIFT, r0 + .save ar.lc, ARLC + mov ARLC = ar.lc + } + { .mib + cmp.lt p0, p6 = r0, N + shr I = N, 4 + (p6) br.ret.sptk.many b0 + } + ;; + .body + { .mmi + sub r8 = X1, Y1 + mov r9 = 0xf0 + mov PR = pr + } + { .mmi + shladd INCX2 = INCX, 1, r0 + shladd INCY2 = INCY, 1, r0 + and J = 15, N + } + ;; + { .mmi + shladd INCX8 = INCX, 3, r0 + shladd INCY8 = INCY, 3, r0 + mov pr.rot = 0 + } + { .mmi + and r8 = r9, r8 + cmp.eq p9, p0 = r0, J + adds I = -1, I + } + ;; + { .mmi + add X2 = X1, INCX + add Y2 = Y1, INCY + mov ar.ec = 4 + } + { .mmb + cmp.gt p6, p0 = 127, r8 + cmp.eq p16, p0 = r0, r0 + (p6) br.cond.dpnt .L20 + } + ;; + { .mmi + adds PREX = (PREFETCH_SIZE + 0) * SIZE, X1 + adds PREY = (PREFETCH_SIZE + 2) * SIZE, Y1 + mov ar.lc = I + } + { .mib + cmp.eq p8 ,p0 = -1, I + tbit.z p0, p12 = N, 3 + (p8) br.cond.dpnt .L15 + } + ;; + .align 16 + +.L12: + { .mmi + (p19) STFD [Y1] = f35 + (p19) STFD [Y2] = f39 + (p19) add Y1 = INCY2, Y1 + } + { .mmi + (p17) LDFD f81 = [X1], INCX2 + (p17) LDFD f85 = [X2], INCX2 + (p19) add Y2 = INCY2, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f43 + (p19) STFD [Y2] = f47 + (p19) add Y1 = INCY2, Y1 + } + { .mmi + (p17) LDFD f89 = [X1], INCX2 + (p17) LDFD f93 = [X2], INCX2 + (p19) add Y2 = INCY2, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f51 + (p19) STFD [Y2] = f55 + (p19) add Y1 = INCY2, Y1 + } + { .mmi + (p16) LDFD f32 = [X1], INCX2 + (p16) LDFD f36 = [X2], INCX2 + (p19) add Y2 = INCY2, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f59 + (p19) STFD [Y2] = f63 + (p19) add Y1 = INCY2, Y1 + } + { .mmi + lfetch.fault.nt1 [PREX], INCX8 + lfetch.fault.excl.nt1 [PREY], INCY8 + (p19) add Y2 = INCY2, Y2 + } + ;; + { .mmi + (p16) LDFD f40 = [X1], INCX2 + (p16) LDFD f44 = [X2], INCX2 + nop __LINE__ + } + ;; + { .mmi + (p19) STFD [Y1] = f67 + (p19) STFD [Y2] = f71 + (p19) add Y1 = INCY2, Y1 + } + { .mmi + (p16) LDFD f48 = [X1], INCX2 + (p16) LDFD f52 = [X2], INCX2 + (p19) add Y2 = INCY2, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f75 + (p19) STFD [Y2] = f79 + (p19) add Y1 = INCY2, Y1 + } + { .mmi + (p16) LDFD f56 = [X1], INCX2 + (p16) LDFD f60 = [X2], INCX2 + (p19) add Y2 = INCY2, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f83 + (p19) STFD [Y2] = f87 + (p19) add Y1 = INCY2, Y1 + } + { .mmi + lfetch.fault.nt1 [PREX], INCX8 + lfetch.fault.excl.nt1 [PREY], INCY8 + (p19) add Y2 = INCY2, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f91 + (p19) STFD [Y2] = f95 + (p19) add Y1 = INCY2, Y1 + } + { .mmi + (p16) LDFD f64 = [X1], INCX2 + (p16) LDFD f68 = [X2], INCX2 + (p19) add Y2 = INCY2, Y2 + } + ;; + { .mmb + (p16) LDFD f72 = [X1], INCX2 + (p16) LDFD f76 = [X2], INCX2 + br.ctop.sptk.few .L12 + } + ;; + .align 32 + +.L15: + { .mmi + (p12) LDFD f48 = [X1], INCX2 + (p12) LDFD f49 = [X2], INCX2 + mov ar.lc = ARLC + } + ;; + { .mmi + (p12) LDFD f50 = [X1], INCX2 + (p12) LDFD f51 = [X2], INCX2 + mov pr = PR, -65474 + } + ;; + { .mmb + (p12) LDFD f52 = [X1], INCX2 + (p12) LDFD f53 = [X2], INCX2 + (p9) br.ret.sptk.many b0 + } + ;; + { .mmi + (p12) LDFD f54 = 
[X1], INCX2 + (p12) LDFD f55 = [X2], INCX2 + tbit.z p0, p13 = N, 2 + } + ;; + { .mmi + (p13) LDFD f56 = [X1], INCX2 + (p13) LDFD f57 = [X2], INCX2 + tbit.z p0, p14 = N, 1 + } + ;; + { .mmi + (p13) LDFD f58 = [X1], INCX2 + (p13) LDFD f59 = [X2], INCX2 + tbit.z p0, p15 = N, 0 + } + ;; + { .mmi + (p12) STFD [Y1] = f48 + (p12) STFD [Y2] = f49 + (p12) add Y1 = INCY2, Y1 + } + { .mmi + (p14) LDFD f60 = [X1], INCX2 + (p14) LDFD f61 = [X2], INCX2 + (p12) add Y2 = INCY2, Y2 + } + ;; + { .mmi + (p12) STFD [Y1] = f50 + (p12) STFD [Y2] = f51 + (p12) add Y1 = INCY2, Y1 + } + { .mmi + (p15) LDFD f62 = [X1] + nop __LINE__ + (p12) add Y2 = INCY2, Y2 + } + ;; + { .mmi + (p12) STFD [Y1] = f52 + (p12) STFD [Y2] = f53 + (p12) add Y1 = INCY2, Y1 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p12) add Y2 = INCY2, Y2 + } + ;; + { .mmi + (p12) STFD [Y1] = f54 + (p12) STFD [Y2] = f55 + (p12) add Y1 = INCY2, Y1 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p12) add Y2 = INCY2, Y2 + } + ;; + { .mmi + (p13) STFD [Y1] = f56 + (p13) STFD [Y2] = f57 + (p13) add Y1 = INCY2, Y1 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p13) add Y2 = INCY2, Y2 + } + ;; + { .mmi + (p13) STFD [Y1] = f58 + (p13) STFD [Y2] = f59 + (p13) add Y1 = INCY2, Y1 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p13) add Y2 = INCY2, Y2 + } + ;; + { .mmi + (p14) STFD [Y1] = f60 + (p14) STFD [Y2] = f61 + (p14) add Y1 = INCY2, Y1 + } + ;; + { .mmb + (p15) STFD [Y1] = f62 + nop __LINE__ + br.ret.sptk.many b0 + } + ;; + .align 16 + +.L20: + { .mmi + adds PREX = (PREFETCH_SIZE + 0) * SIZE, X1 + adds PREY = (PREFETCH_SIZE + 10) * SIZE, Y1 + mov ar.lc = I + } + { .mib + cmp.eq p8 ,p0 = -1, I + tbit.z p0, p12 = N, 3 + (p8) br.cond.dpnt .L25 + } + ;; + .align 16 + +.L22: + { .mmi + (p19) STFD [Y1] = f67 + (p19) STFD [Y2] = f71 + (p19) add Y1 = INCY2, Y1 + } + { .mmi + (p17) LDFD f81 = [X1], INCX2 + (p17) LDFD f85 = [X2], INCX2 + (p19) add Y2 = INCY2, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f75 + (p19) STFD [Y2] = f79 + (p19) add Y1 = INCY2, Y1 + } + { .mmi + (p17) LDFD f89 = [X1], INCX2 + (p17) LDFD f93 = [X2], INCX2 + (p19) add Y2 = INCY2, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f83 + (p19) STFD [Y2] = f87 + (p19) add Y1 = INCY2, Y1 + } + { .mmi + (p16) LDFD f32 = [X1], INCX2 + (p16) LDFD f36 = [X2], INCX2 + (p19) add Y2 = INCY2, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f91 + (p19) STFD [Y2] = f95 + (p19) add Y1 = INCY2, Y1 + } + { .mmi + lfetch.fault.nt1 [PREX], INCX8 + lfetch.fault.excl.nt1 [PREY], INCY8 + (p19) add Y2 = INCY2, Y2 + } + ;; + { .mmi + (p16) LDFD f40 = [X1], INCX2 + (p16) LDFD f44 = [X2], INCX2 + nop __LINE__ + } + ;; + { .mmi + (p18) STFD [Y1] = f34 + (p18) STFD [Y2] = f38 + (p18) add Y1 = INCY2, Y1 + } + { .mmi + (p16) LDFD f48 = [X1], INCX2 + (p16) LDFD f52 = [X2], INCX2 + (p18) add Y2 = INCY2, Y2 + } + ;; + { .mmi + (p18) STFD [Y1] = f42 + (p18) STFD [Y2] = f46 + (p18) add Y1 = INCY2, Y1 + } + { .mmi + (p16) LDFD f56 = [X1], INCX2 + (p16) LDFD f60 = [X2], INCX2 + (p18) add Y2 = INCY2, Y2 + } + ;; + { .mmi + (p18) STFD [Y1] = f50 + (p18) STFD [Y2] = f54 + (p18) add Y1 = INCY2, Y1 + } + { .mmi + lfetch.fault.nt1 [PREX], INCX8 + lfetch.fault.excl.nt1 [PREY], INCY8 + (p18) add Y2 = INCY2, Y2 + } + ;; + { .mmi + (p18) STFD [Y1] = f58 + (p18) STFD [Y2] = f62 + (p18) add Y1 = INCY2, Y1 + } + { .mmi + (p16) LDFD f64 = [X1], INCX2 + (p16) LDFD f68 = [X2], INCX2 + (p18) add Y2 = INCY2, Y2 + } + ;; + { .mmb + (p16) LDFD f72 = [X1], INCX2 + (p16) LDFD f76 = [X2], INCX2 + br.ctop.sptk.few .L22 + } + ;; + .align 32 + +.L25: + { .mmi + (p12) LDFD f48 = 
[X1], INCX2 + (p12) LDFD f49 = [X2], INCX2 + mov ar.lc = ARLC + } + ;; + { .mmi + (p12) LDFD f50 = [X1], INCX2 + (p12) LDFD f51 = [X2], INCX2 + mov pr = PR, -65474 + } + ;; + { .mmb + (p12) LDFD f52 = [X1], INCX2 + (p12) LDFD f53 = [X2], INCX2 + (p9) br.ret.sptk.many b0 + } + ;; + { .mmi + (p12) LDFD f54 = [X1], INCX2 + (p12) LDFD f55 = [X2], INCX2 + tbit.z p0, p13 = N, 2 + } + ;; + { .mmi + (p13) LDFD f56 = [X1], INCX2 + (p13) LDFD f57 = [X2], INCX2 + tbit.z p0, p14 = N, 1 + } + ;; + { .mmi + (p13) LDFD f58 = [X1], INCX2 + (p13) LDFD f59 = [X2], INCX2 + tbit.z p0, p15 = N, 0 + } + ;; + { .mmi + (p12) STFD [Y1] = f48 + (p12) STFD [Y2] = f49 + (p12) add Y1 = INCY2, Y1 + } + { .mmi + (p14) LDFD f60 = [X1], INCX2 + (p14) LDFD f61 = [X2], INCX2 + (p12) add Y2 = INCY2, Y2 + } + ;; + { .mmi + (p12) STFD [Y1] = f50 + (p12) STFD [Y2] = f51 + (p12) add Y1 = INCY2, Y1 + } + { .mmi + (p15) LDFD f62 = [X1] + nop __LINE__ + (p12) add Y2 = INCY2, Y2 + } + ;; + { .mmi + (p12) STFD [Y1] = f52 + (p12) STFD [Y2] = f53 + (p12) add Y1 = INCY2, Y1 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p12) add Y2 = INCY2, Y2 + } + ;; + { .mmi + (p12) STFD [Y1] = f54 + (p12) STFD [Y2] = f55 + (p12) add Y1 = INCY2, Y1 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p12) add Y2 = INCY2, Y2 + } + ;; + { .mmi + (p13) STFD [Y1] = f56 + (p13) STFD [Y2] = f57 + (p13) add Y1 = INCY2, Y1 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p13) add Y2 = INCY2, Y2 + } + ;; + { .mmi + (p13) STFD [Y1] = f58 + (p13) STFD [Y2] = f59 + (p13) add Y1 = INCY2, Y1 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p13) add Y2 = INCY2, Y2 + } + ;; + { .mmi + (p14) STFD [Y1] = f60 + (p14) STFD [Y2] = f61 + (p14) add Y1 = INCY2, Y1 + } + ;; + { .mmb + (p15) STFD [Y1] = f62 + nop __LINE__ + br.ret.sptk.many b0 + } + ;; + + EPILOGUE + diff --git a/kernel/ia64/qdot.S b/kernel/ia64/qdot.S new file mode 100644 index 0000000000..ff3f93bb01 --- /dev/null +++ b/kernel/ia64/qdot.S @@ -0,0 +1,421 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
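Note on the qcopy.S kernel above: it is a plain vector copy, y := x, moving 16 extended-precision elements per pipelined iteration. Two variants of the loop (.L12 and .L22) are selected from the low-order bits of the distance between the source and destination addresses, apparently to keep the lfetch streams for X and Y from colliding. The semantics, for reference (illustrative names, positive increments assumed):

    /* y := x for extended-precision vectors. */
    void qcopy_ref(long n, const long double *x, long incx,
                   long double *y, long incy)
    {
        if (n <= 0) return;
        for (long i = 0; i < n; i++)
            y[i * incy] = x[i * incx];
    }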
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define PREFETCH_SIZE (8 * 24) + +#define N r32 +#define X1 r33 +#define INCX r34 +#define Y1 r35 +#define INCY r36 + +#define PREX1 r2 +#define PREY1 r3 + +#define I r14 +#define J r15 +#define Y2 r16 +#define X2 r17 +#define Y3 r18 +#define X3 r19 +#define Y4 r20 +#define X4 r21 + +#define INCX2 r22 +#define INCY2 r23 + +#define INCX4 r24 +#define INCY4 r25 +#define INCX16 r26 +#define INCY16 r27 + +#define PREX2 r28 +#define PREY2 r29 + +#define PR r30 +#define ARLC r31 + + PROLOGUE + .prologue + PROFCODE + { .mfi + nop __LINE__ + mov f8 = f0 + .save ar.lc, ARLC + mov ARLC = ar.lc + } + { .mfi + mov r26 = 1 + mov f9 = f0 + nop __LINE__ + } + ;; + .body +#ifdef F_INTERFACE + LDINT N = [N] + LDINT INCX = [INCX] + LDINT INCY = [INCY] + ;; +#ifndef USE64BITINT + sxt4 N = N + sxt4 INCX = INCX + sxt4 INCY = INCY + ;; +#endif + cmp.le p0, p6 = r0, INCX + cmp.le p0, p7 = r0, INCY + sub r26 = r26, N + ;; + setf.sig f32 = r26 + setf.sig f33 = INCX + setf.sig f34 = INCY + ;; + xmpy.l f33 = f32, f33 + xmpy.l f34 = f32, f34 + ;; + getf.sig r26 = f33 + getf.sig r27 = f34 + ;; + (p6) shladd X1 = r26, BASE_SHIFT, X1 + (p7) shladd Y1 = r27, BASE_SHIFT, Y1 + ;; +#endif + { .mmi + adds PREX1 = (PREFETCH_SIZE + 2) * SIZE, X1 + adds PREY1 = (PREFETCH_SIZE + 2) * SIZE, Y1 + mov PR = pr + } + { .mib + cmp.lt p0, p6 = r0, N + shl INCX = INCX, BASE_SHIFT + (p6) br.ret.sptk.many b0 + } + ;; + { .mfi + add X2 = INCX, X1 + mov f10 = f0 + shl INCY = INCY, BASE_SHIFT + } + { .mmf + and r8 = 127, X1 + shladd X3 = INCX, 1, X1 + mov f11 = f0 + } + ;; + { .mmi + and PREY1 = -128, PREY1 + shladd X4 = INCX, 1, X2 + add INCX2 = INCX, INCX + } + { .mmi + shladd INCX4 = INCX, 2, r0 + add Y2 = INCY, Y1 + shladd Y3 = INCY, 1, Y1 + } + ;; + { .mmi + shladd Y4 = INCY, 1, Y2 + add INCY2 = INCY, INCY + nop __LINE__ + } + { .mmi + shladd INCY4 = INCY, 2, r0 + shladd INCX16 = INCX, 4, r0 + shladd INCY16 = INCY, 4, r0 + } + ;; + { .mfi + nop __LINE__ + mov f12 = f0 + mov pr.rot= 0 + } + { .mfi + or PREY1 = PREY1, r8 + mov f13 = f0 + shr I = N, 4 + } + ;; + { .mfi + adds I = -1, I + mov f14 = f0 + mov ar.ec= 3 + } + { .mmf + shladd PREX2 = INCX, 3, PREX1 + shladd PREY2 = INCY, 3, PREY1 + mov f15 = f0 + } + ;; + { .mmi + and J = 15, N + cmp.eq p16, p0 = r0, r0 + mov ar.lc = I + } + { .mib + cmp.eq p6 ,p0 = -1, I + tbit.nz p12, p0 = N, 3 + (p6) br.cond.dpnt .L215 + } + ;; + .align 32 + +.L212: + { .mmf + (p16) lfetch.nt1 [PREX1], INCX16 + (p16) lfetch.nt1 [PREX2], INCX16 + (p18) FMA f8 = f34, f82, f8 + } + { .mmf + (p16) LDFD f80 = [X1], INCX4 + (p16) LDFD f83 = [X2], INCX4 + (p18) 
FMA f9 = f37, f85, f9 + } + ;; + { .mmf + (p16) LDFD f86 = [X3], INCX4 + (p16) LDFD f89 = [X4], INCX4 + (p18) FMA f10 = f40, f88, f10 + } + { .mmf + (p16) LDFD f92 = [X1], INCX4 + (p16) LDFD f95 = [X2], INCX4 + (p18) FMA f11 = f43, f91, f11 + } + ;; + { .mmf + (p16) LDFD f32 = [Y1], INCY4 + (p16) LDFD f35 = [Y2], INCY4 + (p18) FMA f12 = f46, f94, f12 + } + { .mmf + (p16) LDFD f38 = [Y3], INCY4 + (p16) LDFD f41 = [Y4], INCY4 + (p18) FMA f13 = f49, f97, f13 + } + ;; + { .mmf + (p16) LDFD f98 = [X3], INCX4 + (p16) LDFD f101 = [X4], INCX4 + (p18) FMA f14 = f52, f100, f14 + } + { .mmf + (p16) LDFD f104 = [X1], INCX4 + (p16) LDFD f107 = [X2], INCX4 + (p18) FMA f15 = f55, f103, f15 + } + ;; + { .mmf + (p16) LDFD f44 = [Y1], INCY4 + (p16) LDFD f47 = [Y2], INCY4 + (p18) FMA f8 = f58, f106, f8 + } + { .mmf + (p16) LDFD f50 = [Y3], INCY4 + (p16) LDFD f53 = [Y4], INCY4 + (p18) FMA f9 = f61, f109, f9 + } + ;; + { .mmf + (p16) lfetch.nt1 [PREY1], INCY16 + (p16) lfetch.nt1 [PREY2], INCY16 + (p18) FMA f10 = f64, f112, f10 + } + { .mmf + (p16) LDFD f110 = [X3], INCX4 + (p16) LDFD f113 = [X4], INCX4 + (p18) FMA f11 = f67, f115, f11 + } + ;; + { .mmf + (p16) LDFD f56 = [Y1], INCY4 + (p16) LDFD f59 = [Y2], INCY4 + (p18) FMA f12 = f70, f118, f12 + } + { .mmf + (p16) LDFD f62 = [Y3], INCY4 + (p16) LDFD f65 = [Y4], INCY4 + (p18) FMA f13 = f73, f121, f13 + } + ;; + { .mmf + (p16) LDFD f116 = [X1], INCX4 + (p16) LDFD f119 = [X2], INCX4 + (p18) FMA f14 = f76, f124, f14 + } + { .mmf + (p16) LDFD f122 = [X3], INCX4 + (p16) LDFD f125 = [X4], INCX4 + (p18) FMA f15 = f79, f127, f15 + } + ;; + { .mmi + (p16) LDFD f68 = [Y1], INCY4 + (p16) LDFD f71 = [Y2], INCY4 + nop __LINE__ + } + { .mmb + (p16) LDFD f74 = [Y3], INCY4 + (p16) LDFD f77 = [Y4], INCY4 + br.ctop.sptk.few .L212 + } + ;; + .align 32 + +.L215: + { .mmi + (p12) LDFD f48 = [X1], INCX4 + (p12) LDFD f49 = [X2], INCX4 + cmp.eq p7, p0 = r0, J + } + { .mmb + (p12) LDFD f50 = [X3], INCX4 + (p12) LDFD f51 = [X4], INCX4 + (p7) br.cond.dptk .L999 + } + ;; + { .mmi + (p12) LDFD f32 = [Y1], INCY4 + (p12) LDFD f33 = [Y2], INCY4 + tbit.nz p13, p0 = N, 2 + } + { .mmi + (p12) LDFD f34 = [Y3], INCY4 + (p12) LDFD f35 = [Y4], INCY4 + nop __LINE__ + } + ;; + { .mmi + (p12) LDFD f52 = [X1], INCX4 + (p12) LDFD f53 = [X2], INCX4 + tbit.nz p14, p0 = N, 1 + } + { .mmi + (p12) LDFD f54 = [X3], INCX4 + (p12) LDFD f55 = [X4], INCX4 + nop __LINE__ + } + ;; + { .mmi + (p12) LDFD f36 = [Y1], INCY4 + (p12) LDFD f37 = [Y2], INCY4 + tbit.nz p15, p0 = N, 0 + } + { .mmi + (p12) LDFD f38 = [Y3], INCY4 + (p12) LDFD f39 = [Y4], INCY4 + nop __LINE__ + } + ;; + { .mmi + (p13) LDFD f56 = [X1], INCX4 + (p13) LDFD f57 = [X2], INCX4 + nop __LINE__ + } + { .mmi + (p13) LDFD f58 = [X3], INCX4 + (p13) LDFD f59 = [X4], INCX4 + nop __LINE__ + } + ;; + { .mmi + (p13) LDFD f40 = [Y1], INCY4 + (p13) LDFD f41 = [Y2], INCY4 + nop __LINE__ + } + { .mmi + (p13) LDFD f42 = [Y3], INCY4 + (p13) LDFD f43 = [Y4], INCY4 + nop __LINE__ + } + ;; + { .mmi + (p14) LDFD f60 = [X1], INCX2 + (p14) LDFD f61 = [X2], INCX2 + nop __LINE__ + } + { .mmi + (p14) LDFD f44 = [Y1], INCY2 + (p14) LDFD f45 = [Y2], INCY2 + nop __LINE__ + } + ;; + { .mmi + (p15) LDFD f62 = [X1] + (p15) LDFD f46 = [Y1] + nop __LINE__ + } + ;; + (p12) FMA f8 = f32, f48, f8 + (p12) FMA f9 = f33, f49, f9 + (p12) FMA f10 = f34, f50, f10 + (p12) FMA f11 = f35, f51, f11 + ;; + (p12) FMA f12 = f36, f52, f12 + (p12) FMA f13 = f37, f53, f13 + (p12) FMA f14 = f38, f54, f14 + (p12) FMA f15 = f39, f55, f15 + ;; + (p13) FMA f8 = f40, f56, f8 + (p13) FMA f9 = f41, f57, f9 + 
(p13) FMA f10 = f42, f58, f10 + (p13) FMA f11 = f43, f59, f11 + ;; + (p14) FMA f8 = f44, f60, f8 + (p14) FMA f9 = f45, f61, f9 + (p15) FMA f10 = f46, f62, f10 + ;; + .align 32 + +.L999: + FADD f8 = f8, f9 + FADD f10 = f10, f11 + FADD f12 = f12, f13 + FADD f14 = f14, f15 + ;; + FADD f8 = f8, f10 + FADD f12 = f12, f14 + mov ar.lc = ARLC + ;; + FADD f8 = f8, f12 + mov pr = PR, -65474 + br.ret.sptk.many b0 + EPILOGUE + diff --git a/kernel/ia64/qgemm_kernel.S b/kernel/ia64/qgemm_kernel.S new file mode 100644 index 0000000000..3c9fb69803 --- /dev/null +++ b/kernel/ia64/qgemm_kernel.S @@ -0,0 +1,8993 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define PREFETCHSIZE (8 * 16) + +#define CPREFETCHSIZE 7 +#define CPREFETCH lfetch.excl.nt2 + +#define M r32 +#define N r33 +#define K r34 +#define A r38 +#define B r39 +#define C r36 +#define LDC r37 + +#define I r15 +#define J r16 +#define AOFFSET r17 +#define BOFFSET r18 +#define TEMP r19 +#define L r20 + +#define C1 r21 +#define C2 r22 +#define C3 r23 +#define C4 r24 +#define C5 r25 +#define C6 r26 +#define C7 r27 +#define C8 r28 + +#define C9 loc0 +#define C10 loc1 +#define C11 loc2 +#define C12 loc3 +#define C13 loc4 +#define C14 loc5 +#define C15 loc6 +#define C16 loc7 + +#define PREA r8 +#define PREB r9 +#define PREC r10 +#define SP r12 +#define ARLC r29 +#define PR r30 +#define ARPFS r31 + +#define ALPHA f8 + +#define AORIG loc8 +#define KK loc9 +#define KK8 loc10 +#define OFFSET loc11 + + PROLOGUE + .prologue + PROFCODE + + { .mmi + .save ar.pfs, ARPFS +#ifdef TRMMKERNEL + alloc ARPFS = ar.pfs, 8, 16, 0, 0 +#else + alloc ARPFS = ar.pfs, 8, 8, 0, 0 +#endif + adds r14 = 16, SP + mov ARLC = ar.lc + } + { .mmi + adds r8 = -16 * 16, SP + adds r9 = -15 * 16, SP + adds SP = -16 * 16, SP + } + ;; + stf.spill [r8] = f16, 32 + stf.spill [r9] = f17, 32 + mov PR = pr + ;; + stf.spill [r8] = f18, 32 + stf.spill [r9] = f19, 32 + ;; + stf.spill [r8] = f20, 32 + stf.spill [r9] = f21, 32 + shr J = N, 3 + ;; + stf.spill [r8] = f22, 32 + stf.spill [r9] = f23, 32 + mov AOFFSET = A + ;; + stf.spill [r8] = f24, 32 + stf.spill [r9] = f25, 32 + cmp.ge p6, p0 = 0, J + ;; + stf.spill [r8] = f26, 32 + stf.spill [r9] = f27, 32 + ;; + stf.spill [r8] = f28, 32 + stf.spill [r9] = f29, 32 + ;; + stf.spill [r8] = f30 + stf.spill [r9] = f31 + ld8 C = [r14], 8 + ;; + ld8 LDC = [r14], 8 + ;; + shladd LDC = LDC, BASE_SHIFT, r0 + ;; +#ifndef TRMMKERNEL + (p6) br.cond.dpnt .L050 + .body + ;; +#else + .body + ;; + ld8 OFFSET = [r14], 8 + ;; + +#if defined(TRMMKERNEL) && !defined(LEFT) + ;; + sub KK = r0, OFFSET +#endif + (p6) br.cond.dpnt .L050 + ;; +#endif + .align 32 + +.L010: + { .mfi + adds J = -1, J + mov f64 = f0 + shr I = M, 3 + } + { .mfi + mov C1 = C // coffset1 = c + 0 * ldc + mov f72 = f0 + } + ;; + { .mmf + cmp.eq p6, p7 = 0, I +#if defined(TRMMKERNEL) && defined(LEFT) + mov KK = OFFSET +#else + nop __LINE__ +#endif + mov f80 = f0 + } + { .mmf + add C2 = LDC, C // coffset2 = c + 1 * ldc + shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc + mov f88 = f0 + } + ;; + { .mmf + shladd C5 = LDC, 2, C // coffset5 = c + 4 * ldc + shladd C = LDC, 3, C // coffset += 8 * ldc + mov f96 = f0 + } + { .mmf + shladd C4 = LDC, 1, C2 // coffset4 = c + 3 * ldc + shladd C6 = LDC, 2, C2 // coffset6 = c + 5 * ldc + mov f104 = f0 + } + ;; + { .mfi + shladd C7 = LDC, 2, C3 // coffset7 = c + 6 * ldc + mov f112 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + }{ .mfb + sub C8 = C, LDC // coffset8 = c + 7 * ldc + mov f120 = f0 + (p6) br.cond.dpnt .L020 + } + ;; + .align 16 + +.L011: +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + mov BOFFSET = B + ;; + + { .mfb + LDFD f48 = [BOFFSET], SIZE + mov f65 = f0 + nop __LINE__ + } + ;; + { .mfb + LDFD f49 = [BOFFSET], SIZE + mov f73 = f0 + nop __LINE__ + } + ;; +#else + { .mfi + shladd BOFFSET = KK8, 3, B + mov f65 = f0 + shladd AOFFSET = KK8, 3, AOFFSET + } + ;; + LDFD f48 = [BOFFSET], SIZE + ;; + { .mfi + LDFD f49 = [BOFFSET], SIZE + mov f73 = 
f0 + nop __LINE__ + } + ;; +#endif + LDFD f32 = [AOFFSET], SIZE + LDFD f50 = [BOFFSET], SIZE + ;; + + { .mfb + LDFD f33 = [AOFFSET], SIZE + mov f81 = f0 + nop __LINE__ + } + { .mfb + LDFD f51 = [BOFFSET], SIZE + mov f89 = f0 + nop __LINE__ + } + ;; + LDFD f52 = [BOFFSET], SIZE + ;; + { .mmf + LDFD f53 = [BOFFSET], SIZE + setf.d f97 = r0 + mov f105 = f0 + } + { .mfb + setf.d f113 = r0 + mov f121 = f0 + nop __LINE__ + } + ;; + LDFD f54 = [BOFFSET], SIZE + ;; + { .mmf + LDFD f55 = [BOFFSET], SIZE + setf.d f66 = r0 + mov f74 = f0 + } + { .mfb + setf.d f82 = r0 + mov f90 = f0 + nop __LINE__ + } + ;; + LDFD f34 = [AOFFSET], SIZE + ;; + { .mmf + LDFD f35 = [AOFFSET], SIZE + setf.d f98 = r0 + mov f106 = f0 + } + { .mfb + setf.d f114 = r0 + mov f122 = f0 + nop __LINE__ + } + ;; + LDFD f36 = [AOFFSET], SIZE + ;; + { .mmf + LDFD f37 = [AOFFSET], SIZE + setf.d f67 = r0 + mov f75 = f0 + } + { .mfi + setf.d f83 = r0 + mov f91 = f0 +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 8, KK +#else + adds L = 8, KK +#endif +#endif + } + ;; + LDFD f38 = [AOFFSET], SIZE + ;; + { .mmf + LDFD f39 = [AOFFSET], SIZE + setf.d f99 = r0 + mov f107 = f0 + } + { .mfi + setf.d f115 = r0 + mov f123 = f0 + adds PREC = CPREFETCHSIZE * SIZE, C1 + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f68 = r0 + mov f76 = f0 + } + { .mfi + setf.d f84 = r0 + mov f92 = f0 +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f100 = r0 + mov f108 = f0 + } + { .mfi + setf.d f116 = r0 + mov f124 = f0 + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f69 = r0 + mov f77 = f0 + } + { .mfi + setf.d f85 = r0 + mov f93 = f0 + adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f101 = r0 + mov f109 = f0 + } + { .mfi + setf.d f117 = r0 + mov f125 = f0 + tbit.z p12, p0 = L, 0 + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f70 = r0 + mov f78 = f0 + } + { .mfi + setf.d f86 = r0 + mov f94 = f0 + shr L = L, 1 + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f102 = r0 + mov f110 = f0 + } + { .mfi + setf.d f118 = r0 + mov f126 = f0 + adds L = -1, L + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f71 = r0 + mov f79 = f0 + } + { .mfi + setf.d f87 = r0 + mov f95 = f0 + mov ar.lc = L + } + ;; + { .mmf + CPREFETCH [PREC] + setf.d f103 = r0 + mov f111 = f0 + } + { .mfi + setf.d f119 = r0 + mov f127 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + .align 16 + +.L012: +/* 1 */ + { .mfi + lfetch.fault.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p12) cmp.ne p3, p0 = 0, L + FMA f72 = f32, f49, f72 // A1 * B2 + nop __LINE__ + } + ;; +/* 2 */ + { .mfi + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + cmp.ne p4, p5 = 0, L + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; +/* 3 */ + { .mfb + (p3) LDFD f40 = [AOFFSET], SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + adds C9 = 4 * SIZE, C1 + FMA f104 = f32, f53, f104 // A1 * B6 + nop __LINE__ + } + ;; +/* 4 */ + { .mfi + (p3) LDFD f56 = [BOFFSET], SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + adds C10 = 4 * SIZE, C2 + } + { .mfb + (p3) LDFD f41 = [AOFFSET], SIZE + FMA f120 = f32, f55, f120 // A1 * B8 + nop __LINE__ + } + ;; +/* 5 */ + { .mfi + (p3) LDFD f57 = [BOFFSET], SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + adds 
C11 = 4 * SIZE, C3 + } + { .mfb + (p3) LDFD f42 = [AOFFSET], SIZE + FMA f73 = f33, f49, f73 // A2 * B2 + nop __LINE__ + } + ;; +/* 6 */ + { .mfi + (p3) LDFD f58 = [BOFFSET], SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + adds C12 = 4 * SIZE, C4 + } + { .mfb + (p3) LDFD f43 = [AOFFSET], SIZE + FMA f89 = f33, f51, f89 // A2 * B4 + nop __LINE__ + } + ;; +/* 7 */ + { .mfi + (p3) LDFD f59 = [BOFFSET], SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + adds C13 = 4 * SIZE, C5 + } + { .mfb + (p3) LDFD f44 = [AOFFSET], SIZE + FMA f105 = f33, f53, f105 // A2 * B6 + nop __LINE__ + } + ;; +/* 8 */ + { .mfi + (p3) LDFD f60 = [BOFFSET], SIZE + FMA f113 = f33, f54, f113 // A2 * B7 + adds C14 = 4 * SIZE, C6 + } + { .mfb + (p3) LDFD f45 = [AOFFSET], SIZE + FMA f121 = f33, f55, f121 // A2 * B8 + nop __LINE__ + } + ;; +/* 9 */ + { .mfi + (p3) LDFD f61 = [BOFFSET], SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + adds C15 = 4 * SIZE, C7 + } + { .mfb + (p3) LDFD f46 = [AOFFSET], SIZE + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; +/* 10 */ + { .mfi + (p3) LDFD f62 = [BOFFSET], SIZE + FMA f82 = f34, f50, f82 // A3 * B3 + adds C16 = 4 * SIZE, C8 + } + { .mfb + (p3) LDFD f47 = [AOFFSET], SIZE + FMA f90 = f34, f51, f90 // A3 * B4 + nop __LINE__ + } + ;; +/* 11 */ + { .mfb + (p3) LDFD f63 = [BOFFSET], SIZE + FMA f98 = f34, f52, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f106 = f34, f53, f106 // A3 * B6 + nop __LINE__ + } + ;; +/* 12 */ + { .mfb + nop __LINE__ + FMA f114 = f34, f54, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f122 = f34, f55, f122 // A3 * B8 + nop __LINE__ + } + ;; +/* 13 */ + { .mfb + nop __LINE__ + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + ;; +/* 14 */ + { .mfb + nop __LINE__ + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f91 = f35, f51, f91 // A4 * B4 + nop __LINE__ + } + ;; +/* 15 */ + { .mfb + nop __LINE__ + FMA f99 = f35, f52, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f107 = f35, f53, f107 // A4 * B6 + nop __LINE__ + } + ;; +/* 16 */ + { .mfb + nop __LINE__ + FMA f115 = f35, f54, f115 // A4 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f123 = f35, f55, f123 // A4 * B8 + nop __LINE__ + } + ;; +/* 17 */ + { .mfb + nop __LINE__ + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f76 = f36, f49, f76 // A5 * B2 + nop __LINE__ + } + ;; +/* 18 */ + { .mfb + nop __LINE__ + FMA f84 = f36, f50, f84 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f92 = f36, f51, f92 // A5 * B4 + nop __LINE__ + } + ;; +/* 19 */ + { .mfb + nop __LINE__ + FMA f100 = f36, f52, f100 // A5 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f108 = f36, f53, f108 // A5 * B6 + nop __LINE__ + } + ;; +/* 20 */ + { .mfb + nop __LINE__ + FMA f116 = f36, f54, f116 // A5 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f124 = f36, f55, f124 // A5 * B8 + nop __LINE__ + } + ;; +/* 21 */ + { .mfb + nop __LINE__ + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f77 = f37, f49, f77 // A6 * B2 + nop __LINE__ + } + ;; +/* 22 */ + { .mfb + nop __LINE__ + FMA f85 = f37, f50, f85 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f93 = f37, f51, f93 // A6 * B4 + nop __LINE__ + } + ;; +/* 23 */ + { .mfb + nop __LINE__ + FMA f101 = f37, f52, f101 // A6 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f109 = f37, f53, f109 // A6 * B6 + nop 
__LINE__ + } + ;; +/* 24 */ + { .mfb + nop __LINE__ + FMA f117 = f37, f54, f117 // A6 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f125 = f37, f55, f125 // A6 * B8 + nop __LINE__ + } + ;; +/* 25 */ + { .mfb + nop __LINE__ + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f78 = f38, f49, f78 // A7 * B2 + nop __LINE__ + } + ;; +/* 26 */ + { .mfb + nop __LINE__ + FMA f86 = f38, f50, f86 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f94 = f38, f51, f94 // A7 * B4 + nop __LINE__ + } + ;; +/* 27 */ + { .mfb + nop __LINE__ + FMA f102 = f38, f52, f102 // A7 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f110 = f38, f53, f110 // A7 * B6 + nop __LINE__ + } + ;; +/* 28 */ + { .mfb + nop __LINE__ + FMA f118 = f38, f54, f118 // A7 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f126 = f38, f55, f126 // A7 * B8 + nop __LINE__ + } + ;; +/* 29 */ + { .mfb + (p4) LDFD f32 = [AOFFSET], SIZE + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f79 = f39, f49, f79 // A8 * B2 + nop __LINE__ + } + ;; +/* 30 */ + { .mfb + (p4) LDFD f33 = [AOFFSET], SIZE + FMA f87 = f39, f50, f87 // A8 * B3 + nop __LINE__ + } + { .mfb + (p4) LDFD f48 = [BOFFSET], SIZE + FMA f95 = f39, f51, f95 // A8 * B4 + nop __LINE__ + } + ;; +/* 31 */ + { .mfb + (p4) LDFD f34 = [AOFFSET], SIZE + FMA f103 = f39, f52, f103 // A8 * B5 + nop __LINE__ + } + { .mfb + (p4) LDFD f49 = [BOFFSET], SIZE + FMA f111 = f39, f53, f111 // A8 * B6 + nop __LINE__ + } + ;; +/* 32 */ + { .mfb + lfetch.fault.nt1 [PREA], 8 * SIZE + FMA f119 = f39, f54, f119 // A8 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f127 = f39, f55, f127 // A8 * B8 + nop __LINE__ + } + ;; +/* 33 */ + { .mfb + lfetch.nt1 [PREB], 8 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; +/* 34 */ + { .mfb + (p4) LDFD f35 = [AOFFSET], SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + (p4) LDFD f50 = [BOFFSET], SIZE + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; +/* 35 */ + { .mfb + (p4) LDFD f36 = [AOFFSET], SIZE + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + (p4) LDFD f51 = [BOFFSET], SIZE + (p3) FMA f104 = f40, f61, f104 // A1 * B6 + nop __LINE__ + } + ;; +/* 36 */ + { .mfb + (p4) LDFD f37 = [AOFFSET], SIZE + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + (p4) LDFD f52 = [BOFFSET], SIZE + (p3) FMA f120 = f40, f63, f120 // A1 * B8 + nop __LINE__ + } + ;; +/* 37 */ + { .mfb + (p4) LDFD f38 = [AOFFSET], SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + (p4) LDFD f53 = [BOFFSET], SIZE + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; +/* 38 */ + { .mfb + (p4) LDFD f39 = [AOFFSET], SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + (p4) LDFD f54 = [BOFFSET], SIZE + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; +/* 39 */ + { .mfb + (p4) LDFD f55 = [BOFFSET], SIZE + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f105 = f41, f61, f105 // A2 * B6 + nop __LINE__ + } + ;; +/* 40 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f6 = [C1 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + nop __LINE__ + } + { .mfb +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f7 = [C9 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f121 = f41, f63, f121 // A2 * B8 + nop __LINE__ + } + ;; + /* 41 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f10 = [C1 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f11 = [C9 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; +/* 42 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f12 = [C1 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f13 = [C9 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f90 = f42, f59, f90 // A3 * B4 + nop __LINE__ + } + ;; +/* 43 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f14 = [C1 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f98 = f42, f60, f98 // A3 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f15 = [C9 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f106 = f42, f61, f106 // A3 * B6 + nop __LINE__ + } + ;; +/* 44 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f16 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f114 = f42, f62, f114 // A3 * B7 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f17 = [C10], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f122 = f42, f63, f122 // A3 * B8 + nop __LINE__ + } + ;; +/* 45 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f18 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f19 = [C10], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; +/* 46 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f20 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f21 = [C10], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f91 = f43, f59, f91 // A4 * B4 + nop __LINE__ + } + ;; +/* 47 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f22 = [C2 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f99 = f43, f60, f99 // A4 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f23 = [C10], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f107 = f43, f61, f107 // A4 * B6 + nop __LINE__ + } + ;; +/* 48 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f24 = [C3 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f115 = f43, f62, f115 // A4 * B7 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f25 = [C11], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f123 = f43, f63, f123 // A4 * B8 + nop __LINE__ + } + ;; +/* 49 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f26 = [C3 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfb +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f27 = [C11], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f76 = f44, f57, f76 // A5 * B2 + nop __LINE__ + } + ;; +/* 50 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f28 = [C3 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f84 = f44, f58, f84 // A5 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f29 = [C11], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f92 = f44, f59, f92 // A5 * B4 + nop __LINE__ + } + ;; +/* 51 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f30 = [C3 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f100 = f44, f60, f100 // A5 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f31 = [C11], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f108 = f44, f61, f108 // A5 * B6 + nop __LINE__ + } + ;; +/* 52 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f32 = [C4 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f116 = f44, f62, f116 // A5 * B7 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f33 = [C12], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f124 = f44, f63, f124 // A5 * B8 + nop __LINE__ + } + ;; +/* 53 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f34 = [C4 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f35 = [C12], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f77 = f45, f57, f77 // A6 * B2 + nop __LINE__ + } + ;; +/* 54 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f36 = [C4 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f85 = f45, f58, f85 // A6 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f37 = [C12], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f93 = f45, f59, f93 // A6 * B4 + nop __LINE__ + } + ;; +/* 55 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f38 = [C4 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f101 = f45, f60, f101 // A6 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f39 = [C12], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f109 = f45, f61, f109 // A6 * B6 + nop __LINE__ + } + ;; +/* 56 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f48 = [C5 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f117 = f45, f62, f117 // A6 * B7 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f49 = [C13], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f125 = f45, f63, f125 // A6 * B8 + nop __LINE__ + } + ;; +/* 57 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f50 = [C5 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f51 = [C13], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f78 = f46, f57, f78 // A7 * B2 + nop __LINE__ + } + ;; +/* 58 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f52 = [C5 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f86 = f46, f58, f86 // A7 * B3 + nop __LINE__ + } + { .mfb +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f53 = [C13], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f94 = f46, f59, f94 // A7 * B4 + nop __LINE__ + } + ;; +/* 59 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f54 = [C5 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f102 = f46, f60, f102 // A7 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f55 = [C13], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f110 = f46, f61, f110 // A7 * B6 + nop __LINE__ + } + ;; +/* 60 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f40 = [C6 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f118 = f46, f62, f118 // A7 * B7 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f41 = [C14], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f126 = f46, f63, f126 // A7 * B8 + nop __LINE__ + } + ;; +/* 61 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f42 = [C6 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f43 = [C14], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f79 = f47, f57, f79 // A8 * B2 + nop __LINE__ + } + ;; +/* 62 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f44 = [C6 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f87 = f47, f58, f87 // A8 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f45 = [C14], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f95 = f47, f59, f95 // A8 * B4 + nop __LINE__ + } + ;; +/* 63 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f59 = [C6 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f103 = f47, f60, f103 // A8 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f60 = [C14], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f111 = f47, f61, f111 // A8 * B6 + nop __LINE__ + } + ;; +/* 64 */ + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f61 = [C7 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f119 = f47, f62, f119 // A8 * B7 + adds L = -1, L + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f62 = [C15], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f127 = f47, f63, f127 // A8 * B8 + br.cloop.sptk.few .L012 + } + ;; +.L013: +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + { .mfi + (p5) LDFD f63 = [C7 ], SIZE + FMA f64 = ALPHA, f64, f6 + cmp.ne p6, p0 = 1, I + } + { .mfb + (p5) LDFD f6 = [C15], SIZE + FMA f68 = ALPHA, f68, f7 + nop __LINE__ + } + ;; + { .mfi + (p5) LDFD f7 = [C7 ], SIZE + FMA f65 = ALPHA, f65, f10 + adds I = -1, I + } + { .mfb + (p5) LDFD f10 = [C15], SIZE + FMA f69 = ALPHA, f69, f11 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f11 = [C7 ], -3 * SIZE + FMA f66 = ALPHA, f66, f12 + nop __LINE__ + } + { .mfb + (p5) LDFD f12 = [C15], -3 * SIZE + FMA f70 = ALPHA, f70, f13 + nop __LINE__ + } + ;; + { .mfb + LDFD f13 = [C8 ], SIZE + FMA f67 = ALPHA, f67, f14 + nop __LINE__ + } + { .mfb + LDFD f14 = [C16], SIZE + FMA f71 = ALPHA, f71, f15 + nop __LINE__ + } + ;; + { .mmf + STFD [C1 ] = f64, SIZE + STFD [C9 ] = f68, SIZE + FMA f72 = ALPHA, f72, f16 + } + { .mmf + LDFD f15 = [C8 ], SIZE + LDFD f16 = [C16], SIZE + FMA f76 = ALPHA, f76, f17 + } + ;; + { .mmf + STFD [C1 ] = f65, SIZE + STFD [C9 ] = f69, SIZE + FMA f73 = ALPHA, f73, f18 + } + { .mmf + LDFD f17 = [C8 ], SIZE + LDFD f18 = [C16], SIZE + FMA f77 = ALPHA, f77, f19 + } + ;; + { .mmf + STFD [C1 ] = f66, SIZE + STFD [C9 ] = f70, SIZE + FMA f74 = ALPHA, f74, f20 + } + { .mmf + LDFD f19 = [C8 ], -3 * SIZE + LDFD f20 = [C16], -3 * SIZE + FMA f78 = ALPHA, f78, f21 + } + ;; + { .mfb + STFD [C1 ] = f67, 5 * SIZE + FMA f75 = ALPHA, f75, f22 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f71, 5 * SIZE + FMA f79 = ALPHA, f79, f23 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f72, SIZE + FMA f80 = ALPHA, f80, f24 + nop __LINE__ + } + { .mfb + STFD [C10] = f76, SIZE + FMA f84 = ALPHA, f84, f25 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f73, SIZE + FMA f81 = ALPHA, f81, f26 + nop __LINE__ + } + { .mfb + STFD [C10] = f77, SIZE + FMA f85 = ALPHA, f85, f27 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f74, SIZE + FMA f82 = ALPHA, f82, f28 + nop __LINE__ + } + { .mfb + STFD [C10] = f78, SIZE + FMA f86 = ALPHA, f86, f29 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f75, 5 * SIZE + FMA f83 = ALPHA, f83, f30 + nop __LINE__ + } + { .mfb + STFD [C10] = f79, 5 * SIZE + FMA f87 = ALPHA, f87, f31 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f80, SIZE + FMA f88 = ALPHA, f88, f32 + nop __LINE__ + } + { .mfb + STFD [C11] = f84, SIZE + FMA f92 = ALPHA, f92, f33 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f81, SIZE + FMA f89 = ALPHA, f89, f34 + nop __LINE__ + } + { .mfb + STFD [C11] = f85, SIZE + FMA f93 = ALPHA, f93, f35 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f82, SIZE + FMA f90 = ALPHA, f90, f36 + nop __LINE__ + } + { .mfb + STFD [C11] = f86, SIZE + FMA f94 = ALPHA, f94, f37 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f83, 5 * SIZE + FMA f91 = ALPHA, f91, f38 + nop __LINE__ + } + { .mfb + STFD [C11] = f87, 5 * SIZE + FMA f95 = ALPHA, f95, f39 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f88, SIZE + FMA f96 = ALPHA, f96, f48 + nop __LINE__ + } + { .mfb + STFD [C12] = f92, SIZE + FMA f100 = ALPHA, f100, f49 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f89, SIZE + FMA f97 = ALPHA, f97, f50 + nop __LINE__ + } + { .mfb + STFD [C12] = f93, SIZE + FMA f101 = ALPHA, f101, f51 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f90, SIZE + FMA f98 = ALPHA, f98, f52 + nop __LINE__ + } + { .mfb + STFD [C12] = f94, SIZE + FMA f102 = ALPHA, f102, f53 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f91, 5 * SIZE + FMA f99 = ALPHA, f99, f54 + nop __LINE__ + } + { .mfb + STFD [C12] = f95, 5 * SIZE + FMA f103 = ALPHA, f103, f55 + nop __LINE__ + } + ;; + { .mfb + STFD 
[C5 ] = f96, SIZE + FMA f104 = ALPHA, f104, f40 + nop __LINE__ + } + { .mfb + STFD [C13] = f100, SIZE + FMA f108 = ALPHA, f108, f41 + nop __LINE__ + } + ;; + { .mfb + STFD [C5 ] = f97, SIZE + FMA f105 = ALPHA, f105, f42 + nop __LINE__ + } + { .mfb + STFD [C13] = f101, SIZE + FMA f109 = ALPHA, f109, f43 + nop __LINE__ + } + ;; + { .mfb + STFD [C5 ] = f98, SIZE + FMA f106 = ALPHA, f106, f44 + nop __LINE__ + } + { .mfb + STFD [C13] = f102, SIZE + FMA f110 = ALPHA, f110, f45 + nop __LINE__ + } + ;; + { .mfb + STFD [C5 ] = f99, 5 * SIZE + FMA f107 = ALPHA, f107, f59 + nop __LINE__ + } + { .mfb + STFD [C13] = f103, 5 * SIZE + FMA f111 = ALPHA, f111, f60 + nop __LINE__ + } + ;; + { .mfb + STFD [C6 ] = f104, SIZE + FMA f112 = ALPHA, f112, f61 + nop __LINE__ + } + { .mfb + STFD [C14] = f108, SIZE + FMA f116 = ALPHA, f116, f62 + nop __LINE__ + } + ;; + { .mfb + STFD [C6 ] = f105, SIZE + FMA f113 = ALPHA, f113, f63 + nop __LINE__ + } + { .mfb + STFD [C14] = f109, SIZE + FMA f117 = ALPHA, f117, f6 + nop __LINE__ + } + ;; + { .mfb + STFD [C6 ] = f106, SIZE + FMA f114 = ALPHA, f114, f7 + nop __LINE__ + } + { .mfb + STFD [C14] = f110, SIZE + FMA f118 = ALPHA, f118, f10 + nop __LINE__ + } + ;; + { .mfb + STFD [C6 ] = f107, 5 * SIZE + FMA f115 = ALPHA, f115, f11 + nop __LINE__ + } + { .mfb + STFD [C14] = f111, 5 * SIZE + FMA f119 = ALPHA, f119, f12 + nop __LINE__ + } + ;; + { .mfb + STFD [C7 ] = f112, SIZE + FMA f120 = ALPHA, f120, f13 + nop __LINE__ + } + { .mfb + STFD [C15] = f116, SIZE + FMA f124 = ALPHA, f124, f14 + nop __LINE__ + } + ;; + { .mfb + STFD [C7 ] = f113, SIZE + FMA f121 = ALPHA, f121, f15 + nop __LINE__ + } + { .mfb + STFD [C15] = f117, SIZE + FMA f125 = ALPHA, f125, f16 + nop __LINE__ + } + ;; + { .mfb + STFD [C7 ] = f114, SIZE + FMA f122 = ALPHA, f122, f17 + nop __LINE__ + } + { .mfb + STFD [C15] = f118, SIZE + FMA f126 = ALPHA, f126, f18 + nop __LINE__ + } + ;; + { .mfb + STFD [C7 ] = f115, 5 * SIZE + FMA f123 = ALPHA, f123, f19 + nop __LINE__ + } + { .mfb + STFD [C15] = f119, 5 * SIZE + FMA f127 = ALPHA, f127, f20 + nop __LINE__ + } + ;; + { .mfb + STFD [C8 ] = f120, SIZE + mov f64 = f0 + nop __LINE__ + } + { .mfb + STFD [C16] = f124, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C8 ] = f121, SIZE + mov f80 = f0 + nop __LINE__ + } + { .mfb + STFD [C16] = f125, SIZE + mov f88 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C8 ] = f122, SIZE + mov f96 = f0 + nop __LINE__ + } + { .mfb + STFD [C16] = f126, SIZE + mov f104 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C8 ] = f123, 5 * SIZE + mov f112 = f0 + nop __LINE__ + } + { .mfb + STFD [C16] = f127, 5 * SIZE + mov f120 = f0 + (p6) br.cond.dptk .L011 + } + ;; +#else + { .mfi + nop __LINE__ + FMPY f64 = ALPHA, f64 + cmp.ne p6, p0 = 1, I + } + { .mfb + nop __LINE__ + FMPY f68 = ALPHA, f68 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f65 = ALPHA, f65 + adds I = -1, I + } + { .mfb + nop __LINE__ + FMPY f69 = ALPHA, f69 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMPY f66 = ALPHA, f66 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMPY f70 = ALPHA, f70 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMPY f67 = ALPHA, f67 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMPY f71 = ALPHA, f71 + nop __LINE__ + } + ;; + { .mmf + STFD [C1 ] = f64, SIZE + STFD [C9 ] = f68, SIZE + FMPY f72 = ALPHA, f72 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMPY f76 = ALPHA, f76 + } + ;; + { .mmf + STFD [C1 ] = f65, SIZE + STFD [C9 ] = f69, SIZE + FMPY f73 = ALPHA, f73 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMPY f77 = ALPHA, f77 
+ } + ;; + { .mmf + STFD [C1 ] = f66, SIZE + STFD [C9 ] = f70, SIZE + FMPY f74 = ALPHA, f74 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMPY f78 = ALPHA, f78 + } + ;; + { .mfb + STFD [C1 ] = f67, 5 * SIZE + FMPY f75 = ALPHA, f75 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f71, 5 * SIZE + FMPY f79 = ALPHA, f79 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f72, SIZE + FMPY f80 = ALPHA, f80 + nop __LINE__ + } + { .mfb + STFD [C10] = f76, SIZE + FMPY f84 = ALPHA, f84 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f73, SIZE + FMPY f81 = ALPHA, f81 + nop __LINE__ + } + { .mfb + STFD [C10] = f77, SIZE + FMPY f85 = ALPHA, f85 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f74, SIZE + FMPY f82 = ALPHA, f82 + nop __LINE__ + } + { .mfb + STFD [C10] = f78, SIZE + FMPY f86 = ALPHA, f86 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f75, 5 * SIZE + FMPY f83 = ALPHA, f83 + nop __LINE__ + } + { .mfb + STFD [C10] = f79, 5 * SIZE + FMPY f87 = ALPHA, f87 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f80, SIZE + FMPY f88 = ALPHA, f88 + nop __LINE__ + } + { .mfb + STFD [C11] = f84, SIZE + FMPY f92 = ALPHA, f92 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f81, SIZE + FMPY f89 = ALPHA, f89 + nop __LINE__ + } + { .mfb + STFD [C11] = f85, SIZE + FMPY f93 = ALPHA, f93 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f82, SIZE + FMPY f90 = ALPHA, f90 + nop __LINE__ + } + { .mfb + STFD [C11] = f86, SIZE + FMPY f94 = ALPHA, f94 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f83, 5 * SIZE + FMPY f91 = ALPHA, f91 + nop __LINE__ + } + { .mfb + STFD [C11] = f87, 5 * SIZE + FMPY f95 = ALPHA, f95 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f88, SIZE + FMPY f96 = ALPHA, f96 + nop __LINE__ + } + { .mfb + STFD [C12] = f92, SIZE + FMPY f100 = ALPHA, f100 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f89, SIZE + FMPY f97 = ALPHA, f97 + nop __LINE__ + } + { .mfb + STFD [C12] = f93, SIZE + FMPY f101 = ALPHA, f101 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f90, SIZE + FMPY f98 = ALPHA, f98 + nop __LINE__ + } + { .mfb + STFD [C12] = f94, SIZE + FMPY f102 = ALPHA, f102 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f91, 5 * SIZE + FMPY f99 = ALPHA, f99 + nop __LINE__ + } + { .mfb + STFD [C12] = f95, 5 * SIZE + FMPY f103 = ALPHA, f103 + nop __LINE__ + } + ;; + { .mfb + STFD [C5 ] = f96, SIZE + FMPY f104 = ALPHA, f104 + nop __LINE__ + } + { .mfb + STFD [C13] = f100, SIZE + FMPY f108 = ALPHA, f108 + nop __LINE__ + } + ;; + { .mfb + STFD [C5 ] = f97, SIZE + FMPY f105 = ALPHA, f105 + nop __LINE__ + } + { .mfb + STFD [C13] = f101, SIZE + FMPY f109 = ALPHA, f109 + nop __LINE__ + } + ;; + { .mfb + STFD [C5 ] = f98, SIZE + FMPY f106 = ALPHA, f106 + nop __LINE__ + } + { .mfb + STFD [C13] = f102, SIZE + FMPY f110 = ALPHA, f110 + nop __LINE__ + } + ;; + { .mfb + STFD [C5 ] = f99, 5 * SIZE + FMPY f107 = ALPHA, f107 + nop __LINE__ + } + { .mfb + STFD [C13] = f103, 5 * SIZE + FMPY f111 = ALPHA, f111 + nop __LINE__ + } + ;; + { .mfb + STFD [C6 ] = f104, SIZE + FMPY f112 = ALPHA, f112 + nop __LINE__ + } + { .mfb + STFD [C14] = f108, SIZE + FMPY f116 = ALPHA, f116 + nop __LINE__ + } + ;; + { .mfb + STFD [C6 ] = f105, SIZE + FMPY f113 = ALPHA, f113 + nop __LINE__ + } + { .mfb + STFD [C14] = f109, SIZE + FMPY f117 = ALPHA, f117 + nop __LINE__ + } + ;; + { .mfb + STFD [C6 ] = f106, SIZE + FMPY f114 = ALPHA, f114 + nop __LINE__ + } + { .mfb + STFD [C14] = f110, SIZE + FMPY f118 = ALPHA, f118 + nop __LINE__ + } + ;; + { .mfb + STFD [C6 ] = f107, 5 * SIZE + FMPY f115 = ALPHA, f115 + nop __LINE__ + } + { .mfb + STFD [C14] = f111, 5 * 
SIZE + FMPY f119 = ALPHA, f119 + nop __LINE__ + } + ;; + { .mfb + STFD [C7 ] = f112, SIZE + FMPY f120 = ALPHA, f120 + nop __LINE__ + } + { .mfb + STFD [C15] = f116, SIZE + FMPY f124 = ALPHA, f124 + nop __LINE__ + } + ;; + { .mfb + STFD [C7 ] = f113, SIZE + FMPY f121 = ALPHA, f121 + nop __LINE__ + } + { .mfb + STFD [C15] = f117, SIZE + FMPY f125 = ALPHA, f125 + nop __LINE__ + } + ;; + { .mfi + STFD [C7 ] = f114, SIZE + FMPY f122 = ALPHA, f122 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C15] = f118, SIZE + FMPY f126 = ALPHA, f126 + nop __LINE__ + } + ;; + { .mfi + STFD [C7 ] = f115, 5 * SIZE + FMPY f123 = ALPHA, f123 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -8, L +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C15] = f119, 5 * SIZE + FMPY f127 = ALPHA, f127 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -8, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C8 ] = f120, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C16] = f124, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C8 ] = f121, SIZE + mov f80 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 3, AOFFSET +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C16] = f125, SIZE + mov f88 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 3, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C8 ] = f122, SIZE + mov f96 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 8, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C16] = f126, SIZE + mov f104 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C8 ] = f123, 5 * SIZE + mov f112 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C16] = f127, 5 * SIZE + mov f120 = f0 + (p6) br.cond.dptk .L011 + } + ;; +#endif + +.L020: +#if 0 + { .mfi + cmp.eq p3, p0 = r0, r0 + mov f89 = f0 + tbit.z p6, p7 = M, 2 + } + { .mfb +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 4, KK +#else + adds L = 8, KK +#endif +#endif + mov f81 = f0 + (p6) br.cond.dptk .L030 + } + ;; + +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mfi + LDFPD f48, f49 = [B] + mov f65 = f0 + nop __LINE__ + } + { .mfi + adds BOFFSET = 2 * SIZE, B + mov f73 = f0 + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + } + ;; +#else + { .mfi + shladd BOFFSET = KK8, 3, B + mov f65 = f0 + shladd AOFFSET = KK8, 2, AOFFSET + } + ;; + { .mfi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f73 = f0 + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + } + ;; +#endif + { .mmf + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + setf.d f97 = r0 + mov f105 = f0 + } + { .mfi + setf.d f113 = r0 + mov f121 = f0 +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; + { .mmf + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + setf.d f66 = r0 + mov f74 = f0 + } + { .mfi + setf.d f82 = r0 + mov f90 = f0 + tbit.z p12, p0 = L, 0 + } + 
;; + { .mmf + LDFPD f52, f53 = [BOFFSET], 2 * SIZE + setf.d f98 = r0 + mov f106 = f0 + } + { .mfi + setf.d f114 = r0 + mov f122 = f0 + shr L = L, 1 + } + ;; + { .mfi + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + mov f75 = f0 + adds L = -1, L + } + { .mmf + setf.d f67 = r0 + setf.d f83 = r0 + mov f91 = f0 + } + ;; + { .mfi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + mov f107 = f0 + mov ar.lc = L + } + { .mmf + setf.d f99 = r0 + setf.d f115 = r0 + mov f123 = f0 + } + ;; + .align 32 + +.L022: + { .mfi + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + (p5) adds C9 = 2 * SIZE, C1 + } + { .mfi + nop __LINE__ + FMA f104 = f32, f53, f104 // A1 * B6 + (p5) adds C10 = 2 * SIZE, C2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + (p5) adds C11 = 2 * SIZE, C3 + } + { .mfi + nop __LINE__ + FMA f120 = f32, f55, f120 // A1 * B8 + (p5) adds C12 = 2 * SIZE, C4 + } + ;; + { .mfi + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + (p5) adds C13 = 2 * SIZE, C5 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + (p5) adds C14 = 2 * SIZE, C6 + } + ;; + { .mfi + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + (p5) adds C15 = 2 * SIZE, C7 + } + { .mfi + nop __LINE__ + FMA f89 = f33, f51, f89 // A2 * B4 + (p5) adds C16 = 2 * SIZE, C8 + } + ;; + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f105 = f33, f53, f105 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f121 = f33, f55, f121 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f90 = f34, f51, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f98 = f34, f52, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f106 = f34, f53, f106 // A3 * B6 + nop __LINE__ + } + + { .mfb + nop __LINE__ + FMA f114 = f34, f54, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f122 = f34, f55, f122 // A3 * B8 + nop __LINE__ + } + + { .mfb + nop __LINE__ + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f91 = f35, f51, f91 // A4 * B4 + nop __LINE__ + } + + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f99 = f35, f52, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f107 = f35, f53, f107 // A4 * B6 + nop __LINE__ + } + + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f115 = f35, f54, f115 // A4 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f123 = f35, f55, f123 // A4 * 
B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f68 = [C1 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f70 = [C9 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f104 = f40, f61, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f69 = [C1 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f71 = [C9 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f120 = f40, f63, f120 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f76 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f78 = [C10], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f105 = f41, f61, f105 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f77 = [C2 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f79 = [C10], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f121 = f41, f63, f121 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f84 = [C3 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f86 = [C11], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f85 = [C3 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f87 = [C11], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f90 = f42, f59, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f92 = [C4 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f98 = f42, f60, f98 // A3 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f94 = [C12], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f106 = f42, f61, f106 // A3 * B6 + nop __LINE__ + } + ;; + { .mfb +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f93 = [C4 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f114 = f42, f62, f114 // A3 * B7 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f95 = [C12], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f122 = f42, f63, f122 // A3 * B8 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f100 = [C5 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f102 = [C13], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f101 = [C5 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f103 = [C13], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f91 = f43, f59, f91 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f108 = [C6 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f99 = f43, f60, f99 // A4 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f110 = [C14], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f107 = f43, f61, f107 // A4 * B6 + nop __LINE__ + } + ;; + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f109 = [C6 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f115 = f43, f62, f115 // A4 * B7 + adds L = -1, L + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f111 = [C14], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f123 = f43, f63, f123 // A4 * B8 + br.cloop.sptk.few .L022 + } + ;; + +.L028: +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + { .mfb + LDFD f116 = [C7 ], SIZE + FMA f64 = ALPHA, f64, f68 + nop __LINE__ + } + { .mfb + LDFD f118 = [C15], SIZE + FMA f66 = ALPHA, f66, f70 + nop __LINE__ + } + ;; + { .mfb + LDFD f117 = [C7 ], -1 * SIZE + FMA f65 = ALPHA, f65, f69 + nop __LINE__ + } + { .mfb + LDFD f119 = [C15], -1 * SIZE + FMA f67 = ALPHA, f67, f71 + nop __LINE__ + } + ;; + { .mfb + LDFD f124 = [C8], SIZE + FMA f72 = ALPHA, f72, f76 + nop __LINE__ + } + { .mfb + LDFD f126 = [C16], SIZE + FMA f74 = ALPHA, f74, f78 + nop __LINE__ + } + ;; + { .mfb + LDFD f125 = [C8], -1 * SIZE + FMA f73 = ALPHA, f73, f77 + nop __LINE__ + } + { .mfb + LDFD f127 = [C16], -1 * SIZE + FMA f75 = ALPHA, f75, f79 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f64, SIZE + FMA f80 = ALPHA, f80, f84 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f66, SIZE + FMA f82 = ALPHA, f82, f86 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f65, 3 * SIZE + FMA f81 = ALPHA, f81, f85 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f67, 3 * SIZE + FMA f83 = ALPHA, f83, f87 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f72, SIZE + FMA f88 = ALPHA, f88, f92 + nop __LINE__ + } + { .mfb + STFD [C10] = f74, SIZE + FMA f90 = ALPHA, f90, f94 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f73, 3 * SIZE + FMA f89 = ALPHA, f89, f93 + nop __LINE__ + } + { .mfb + STFD [C10] = f75, 3 * SIZE + FMA f91 = ALPHA, f91, f95 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f80, SIZE + FMA f96 = ALPHA, f96, f100 + nop __LINE__ + } + { .mfb + STFD [C11] = f82, SIZE + FMA f98 = ALPHA, f98, f102 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f81, 3 * SIZE + FMA f97 = ALPHA, f97, f101 + nop __LINE__ + } + { .mfb + STFD [C11] = f83, 3 * SIZE + FMA f99 = ALPHA, f99, f103 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f88, SIZE + FMA f104 = ALPHA, f104, f108 + nop __LINE__ + } + { .mfb + STFD [C12] = f90, SIZE + FMA f106 = ALPHA, f106, f110 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f89, 3 * SIZE + FMA f105 = ALPHA, f105, f109 + nop __LINE__ + } + { .mfb + STFD [C12] = f91, 3 * SIZE + FMA f107 = ALPHA, f107, f111 + nop __LINE__ + } + ;; + { .mfb + STFD [C5 ] = f96, SIZE + FMA f112 = ALPHA, f112, f116 + nop __LINE__ + } + { .mfb + STFD [C13] = f98, SIZE + FMA f114 = ALPHA, f114, f118 + nop __LINE__ + } + ;; + { .mfb + STFD [C5 ] = f97, 3 * SIZE + FMA f113 = ALPHA, f113, f117 + nop __LINE__ + } + { .mfb + STFD [C13] = f99, 3 * SIZE + FMA f115 = ALPHA, f115, f119 + nop __LINE__ + } + ;; + { .mfb + STFD [C6 ] = f104, SIZE + FMA f120 = ALPHA, f120, f124 + nop __LINE__ + } + { .mfb + STFD [C14] = f106, SIZE + FMA f122 = ALPHA, f122, f126 + nop __LINE__ + } + ;; + { .mfb + STFD [C6 ] = f105, 3 * SIZE + FMA f121 = ALPHA, f121, f125 + nop __LINE__ + } + { .mfb + STFD [C14] = f107, 3 * SIZE + FMA f123 = ALPHA, f123, f127 + nop __LINE__ + } + ;; + { .mfb + STFD [C7 ] = f112, SIZE + mov f64 = f0 + nop __LINE__ + } + { .mfb + STFD [C15] = f114, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C7 ] = f113, 3 * SIZE + mov f80 = f0 + nop __LINE__ + } + { .mfb + STFD [C15] = f115, 3 * SIZE + mov f88 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C8 ] = f120, SIZE + mov f96 = f0 + nop __LINE__ + } + { .mfb + STFD [C16] = f122, SIZE + mov f104 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C8 ] = f121, 3 * SIZE + mov f112 = f0 + nop __LINE__ + } + { .mfb + STFD [C16] = f123, 3 * SIZE + mov f120 = f0 + nop __LINE__ + } + ;; +#else + { .mfb + FMPY f64 = ALPHA, f64 + nop __LINE__ + } + { .mfb + FMPY f66 = ALPHA, f66 + nop __LINE__ + } + ;; + { .mfb + FMPY 
f65 = ALPHA, f65 + nop __LINE__ + } + { .mfb + FMPY f67 = ALPHA, f67 + nop __LINE__ + } + ;; + { .mfb + FMPY f72 = ALPHA, f72 + nop __LINE__ + } + { .mfb + FMPY f74 = ALPHA, f74 + nop __LINE__ + } + ;; + { .mfb + FMPY f73 = ALPHA, f73 + nop __LINE__ + } + { .mfb + FMPY f75 = ALPHA, f75 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f64, SIZE + FMPY f80 = ALPHA, f80 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f66, SIZE + FMPY f82 = ALPHA, f82 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f65, 3 * SIZE + FMPY f81 = ALPHA, f81 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f67, 3 * SIZE + FMPY f83 = ALPHA, f83 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f72, SIZE + FMPY f88 = ALPHA, f88 + nop __LINE__ + } + { .mfb + STFD [C10] = f74, SIZE + FMPY f90 = ALPHA, f90 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f73, 3 * SIZE + FMPY f89 = ALPHA, f89 + nop __LINE__ + } + { .mfb + STFD [C10] = f75, 3 * SIZE + FMPY f91 = ALPHA, f91 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f80, SIZE + FMPY f96 = ALPHA, f96 + nop __LINE__ + } + { .mfb + STFD [C11] = f82, SIZE + FMPY f98 = ALPHA, f98 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f81, 3 * SIZE + FMPY f97 = ALPHA, f97 + nop __LINE__ + } + { .mfb + STFD [C11] = f83, 3 * SIZE + FMPY f99 = ALPHA, f99 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f88, SIZE + FMPY f104 = ALPHA, f104 + nop __LINE__ + } + { .mfb + STFD [C12] = f90, SIZE + FMPY f106 = ALPHA, f106 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f89, 3 * SIZE + FMPY f105 = ALPHA, f105 + nop __LINE__ + } + { .mfb + STFD [C12] = f91, 3 * SIZE + FMPY f107 = ALPHA, f107 + nop __LINE__ + } + ;; + { .mfb + STFD [C5 ] = f96, SIZE + FMPY f112 = ALPHA, f112 + nop __LINE__ + } + { .mfb + STFD [C13] = f98, SIZE + FMPY f114 = ALPHA, f114 + nop __LINE__ + } + ;; + { .mfb + STFD [C5 ] = f97, 3 * SIZE + FMPY f113 = ALPHA, f113 + nop __LINE__ + } + { .mfb + STFD [C13] = f99, 3 * SIZE + FMPY f115 = ALPHA, f115 + nop __LINE__ + } + ;; + { .mfi + STFD [C6 ] = f104, SIZE + FMPY f120 = ALPHA, f120 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C14] = f106, SIZE + FMPY f122 = ALPHA, f122 + nop __LINE__ + } + ;; + { .mfi + STFD [C6 ] = f105, 3 * SIZE + FMPY f121 = ALPHA, f121 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -4, L +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C14] = f107, 3 * SIZE + FMPY f123 = ALPHA, f123 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -8, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C7 ] = f112, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C15] = f114, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C7 ] = f113, 3 * SIZE + mov f80 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 2, AOFFSET +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C15] = f115, 3 * SIZE + mov f88 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 3, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C8 ] = f120, SIZE + mov f96 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 4, KK +#else + nop __LINE__ +#endif + } + { .mfb + 
STFD [C16] = f122, SIZE + mov f104 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C8 ] = f121, 3 * SIZE + mov f112 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C16] = f123, 3 * SIZE + mov f120 = f0 + nop __LINE__ + } + ;; +#endif + .align 32 + +.L030: + { .mib +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 2, KK +#else + adds L = 8, KK +#endif +#endif + tbit.z p6, p7 = M, 1 + (p6) br.cond.dptk .L040 + } + ;; +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mfi + LDFPD f48, f49 = [B] + mov f65 = f0 + nop __LINE__ + } + { .mfi + adds BOFFSET = 2 * SIZE, B + mov f73 = f0 +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } +#else + { .mmf + shladd BOFFSET = KK8, 3, B + shladd AOFFSET = KK8, 1, AOFFSET + mov f65 = f0 + } + ;; + { .mfi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f73 = f0 +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } +#endif + ;; + { .mfi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f81 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f89 = f0 + shr L = L, 1 + } + ;; + { .mfi + LDFPD f52, f53 = [BOFFSET], 2 * SIZE + mov f97 = f0 + adds L = -1, L + } + { .mfi + nop __LINE__ + mov f105 = f0 + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + } + ;; + { .mfi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + mov f113 = f0 + mov ar.lc = L + } + { .mfi + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + mov f121 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + .align 32 + +.L032: + { .mfb + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f104 = f32, f53, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f120 = f32, f55, f120 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f89 = f33, f51, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f105 = f33, f53, f105 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f121 = f33, f55, f121 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = 
[BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f104 = f40, f61, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f68 = [C1], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f76 = [C2], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f120 = f40, f63, f120 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f69 = [C1], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f77 = [C2], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f84 = [C3], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f92 = [C4], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f105 = f41, f61, f105 // A2 * B6 + nop __LINE__ + } + ;; + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f85 = [C3], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + adds L = -1, L + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f93 = [C4], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f121 = f41, f63, f121 // A2 * B8 + br.cloop.sptk.few .L032 + } + ;; + +.L038: +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + { .mfb + LDFD f100 = [C5], SIZE + FMA f64 = ALPHA, f64, f68 + nop __LINE__ + } + { .mfb + LDFD f108 = [C6], SIZE + FMA f65 = ALPHA, f65, f69 + nop __LINE__ + } + ;; + { .mfb + LDFD f101 = [C5], -1 * SIZE + FMA f72 = ALPHA, f72, f76 + nop __LINE__ + } + { .mfb + LDFD f109 = [C6], -1 * SIZE + FMA f73 = ALPHA, f73, f77 + nop __LINE__ + } + ;; + { .mfb + LDFD f116 = [C7], SIZE + FMA f80 = ALPHA, f80, f84 + nop __LINE__ + } + { .mfb + LDFD f124 = [C8], SIZE + FMA f81 = ALPHA, f81, f85 + nop __LINE__ + } + ;; + { .mfb + LDFD f117 = [C7], -1 * SIZE + FMA f88 = ALPHA, f88, f92 + nop __LINE__ + } + { .mfb + LDFD f125 = [C8], -1 * SIZE + FMA f89 = ALPHA, f89, f93 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f64, SIZE + FMA f96 = ALPHA, f96, f100 + nop __LINE__ + } + { .mfb + STFD [C2 ] = f72, SIZE + FMA f104 = ALPHA, f104, f108 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f65, SIZE + FMA f97 = ALPHA, f97, f101 + nop __LINE__ + } + { .mfb + STFD [C2 ] = f73, SIZE + FMA f105 = ALPHA, f105, f109 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f80, SIZE + FMA f112 = ALPHA, f112, f116 + nop __LINE__ + } + { .mfb + STFD [C4 ] = f88, SIZE + FMA f120 = ALPHA, f120, f124 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f81, SIZE + FMA f113 = ALPHA, f113, f117 + nop __LINE__ + } + { .mfb + STFD [C4 ] = f89, SIZE + FMA f121 = ALPHA, f121, f125 + nop __LINE__ + } + ;; + { .mfb + STFD [C5 ] = f96, SIZE + mov f64 = f0 + nop __LINE__ + } + { .mfb + STFD [C6 ] = f104, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C5 ] = f97, SIZE + mov f80 = f0 + nop __LINE__ + } + { .mfb + STFD [C6 ] = f105, SIZE + mov f88 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C7 ] = f112, SIZE + mov f96 = f0 + nop __LINE__ + } + { .mfb + STFD [C8 ] = f120, SIZE + mov f104 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C7 ] = f113, SIZE + mov f112 = f0 + nop __LINE__ + } + { .mfb + STFD [C8 ] = f121, SIZE + mov f120 = f0 + nop __LINE__ + } + ;; +#else + { .mfb + nop __LINE__ + FMPY f64 = ALPHA, f64 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMPY f65 = ALPHA, f65 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMPY f72 = ALPHA, f72 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMPY f73 = ALPHA, f73 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMPY f80 = ALPHA, f80 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMPY f81 = ALPHA, f81 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMPY f88 = ALPHA, f88 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMPY f89 = ALPHA, f89 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f64, SIZE + FMPY f96 = ALPHA, f96 + nop __LINE__ + } + { .mfb + STFD [C2 ] = f72, SIZE + FMPY f104 = ALPHA, f104 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f65, SIZE + FMPY f97 = ALPHA, f97 + nop __LINE__ + } + { .mfb + STFD [C2 ] = f73, SIZE + FMPY f105 = ALPHA, f105 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f80, SIZE + FMPY f112 = ALPHA, f112 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C4 ] = f88, SIZE + FMPY f120 = ALPHA, f120 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f81, SIZE + FMPY f113 = ALPHA, f113 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -2, L +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C4 ] = f89, SIZE + FMPY f121 = ALPHA, f121 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -8, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C5 
] = f96, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C6 ] = f104, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C5 ] = f97, SIZE + mov f80 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 1, AOFFSET +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C6 ] = f105, SIZE + mov f88 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 3, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C7 ] = f112, SIZE + mov f96 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 2, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C8 ] = f120, SIZE + mov f104 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C7 ] = f113, SIZE + mov f112 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C8 ] = f121, SIZE + mov f120 = f0 + nop __LINE__ + } + ;; +#endif + .align 32 + +.L040: + { .mib +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 1, KK +#else + adds L = 8, KK +#endif +#endif + tbit.z p6, p7 = M, 0 + (p6) br.cond.dptk .L049 + } + ;; + +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mmi + LDFPD f48, f49 = [B] + adds BOFFSET = 2 * SIZE, B +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } +#else + { .mmi + shladd BOFFSET = KK8, 3, B + add AOFFSET = KK8, AOFFSET + nop __LINE__ + } + ;; + { .mmi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + nop __LINE__ +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } +#endif + ;; + { .mii + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + LDFPD f52, f53 = [BOFFSET], 2 * SIZE + LDFD f32 = [AOFFSET], 1 * SIZE + adds L = -1, L + } + ;; + { .mmi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + cmp.eq p3, p0 = r0, r0 + mov ar.lc = L + } + { .mmi + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + nop __LINE__ + } + ;; + .align 32 + +.L042: + { .mfb + lfetch.nt1 [PREB], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p12) cmp.ne p3, p0 = 0, L + FMA f72 = f32, f49, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFD f40 = [AOFFSET], 1 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f68 = [C1] +#else + nop __LINE__ +#endif + FMA f104 = f32, f53, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfb +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f76 = [C2] +#else + nop __LINE__ +#endif + FMA f120 = f32, f55, f120 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFD f32 = [AOFFSET], 1 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f84 = [C3] +#else + nop __LINE__ +#endif + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f92 = [C4] +#else + nop __LINE__ +#endif + (p3) FMA f104 = f40, f61, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + adds L = -1, L + } + { .mmb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f100 = [C5] + (p5) LDFD f108 = [C6] +#else + nop __LINE__ + nop __LINE__ +#endif + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f120 = f40, f63, f120 // A1 * B8 + nop __LINE__ + } + { .mmb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f116 = [C7] + (p5) LDFD f124 = [C8] +#else + nop __LINE__ + nop __LINE__ +#endif + br.cloop.sptk.few .L042 + } + ;; +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + FMA f64 = ALPHA, f64, f68 + FMA f72 = ALPHA, f72, f76 + FMA f80 = ALPHA, f80, f84 + FMA f88 = ALPHA, f88, f92 + + FMA f96 = ALPHA, f96, f100 + FMA f104 = ALPHA, f104, f108 + FMA f112 = ALPHA, f112, f116 + FMA f120 = ALPHA, f120, f124 + ;; + STFD [C1 ] = f64, SIZE + mov f64 = f0 + STFD [C2 ] = f72, SIZE + mov f72 = f0 + ;; + STFD [C3 ] = f80, SIZE + mov f80 = f0 + STFD [C4 ] = f88, SIZE + mov f88 = f0 + ;; + STFD [C5 ] = f96, SIZE + mov f96 = f0 + STFD [C6 ] = f104, SIZE + mov f104 = f0 + ;; + STFD [C7 ] = f112, SIZE + mov f112 = f0 + STFD [C8 ] = f120, SIZE + mov f120 = f0 + ;; +#else + FMPY f64 = ALPHA, f64 + FMPY f72 = ALPHA, f72 + FMPY f80 = ALPHA, f80 + FMPY f88 = ALPHA, f88 + + { .mfi + FMPY f96 = ALPHA, f96 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMPY f104 = ALPHA, f104 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f112 = ALPHA, f112 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -1, L +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMPY f120 = ALPHA, f120 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -8, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C1 ] = f64, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C2 ] = f72, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f80, SIZE + mov f80 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + add AOFFSET = KK8, AOFFSET +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C4 ] = f88, SIZE + mov f88 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || 
(!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 3, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C5 ] = f96, SIZE + mov f96 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 1, KK +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C6 ] = f104, SIZE + mov f104 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C7 ] = f112, SIZE + mov f112 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C8 ] = f120, SIZE + mov f120 = f0 + nop __LINE__ + } + ;; +#endif + .align 32 +#endif + +.L049: + { .mmi + mov B = BOFFSET + mov AOFFSET = A +#if defined(TRMMKERNEL) && !defined(LEFT) + adds KK = 8, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmb + nop __LINE__ + cmp.lt p6, p0 = 0, J + (p6) br.cond.dptk .L010 + } + ;; + .align 32 + +.L050: + { .mfi + mov C1 = C + mov f64 = f0 + tbit.z p6, p0 = N, 2 + } + { .mfi + add C2 = LDC, C + mov f72 = f0 + shr I = M, 3 + } + ;; + { .mfi + shladd C3 = LDC, 1, C + mov f80 = f0 + nop __LINE__ + } + { .mfb + mov AOFFSET = A + mov f88 = f0 + (p6) br.cond.dpnt .L090 + } + ;; +#if 0 + { .mfi + cmp.eq p6, p7 = 0, I + mov f65 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + mov KK = OFFSET +#else + nop __LINE__ +#endif + } + { .mfi + shladd C4 = LDC, 1, C2 + mov f73 = f0 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + mov f81 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + shladd C = LDC, 2, C + mov f89 = f0 + (p6) br.cond.dpnt .L060 + } + ;; + .align 32 + +.L052: +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mfb + LDFPD f48, f49 = [B] + mov f66 = f0 + nop __LINE__ + } + { .mfb + adds BOFFSET = 2 * SIZE, B + mov f74 = f0 + nop __LINE__ + } + ;; +#else + { .mfi + shladd BOFFSET = KK8, 2, B + mov f66 = f0 + shladd AOFFSET = KK8, 3, AOFFSET + } + ;; + { .mfi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f74 = f0 + nop __LINE__ + } + ;; +#endif + ;; + { .mfi + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f82 = f0 +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 8, KK +#else + adds L = 4, KK +#endif +#endif + } + { .mfi + setf.d f84 = r0 + mov f90 = f0 + nop __LINE__ + } + ;; + { .mfi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f67 = f0 + adds PREC = CPREFETCHSIZE * SIZE, C1 + } + { .mfi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + mov f75 = f0 +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; + { .mfi + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + mov f83 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + setf.d f91 = r0 + mov f68 = f0 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + } + ;; + { .mfi + CPREFETCH [PREC], LDC + mov f76 = f0 + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + } + { .mfi + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + mov f92 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + CPREFETCH [PREC], LDC + mov f69 = f0 + shr L = L, 1 + } + { .mmf + setf.d f77 = r0 + setf.d f85 = r0 + mov f93 = f0 + } + ;; + { .mfi + CPREFETCH [PREC], LDC + mov f70 = f0 + adds L = -1, L + } + { .mmf + setf.d f78 = r0 + setf.d f86 = r0 + mov f94 = f0 + } + ;; + { .mfi + CPREFETCH [PREC] + mov f71 = f0 + mov ar.lc = L + } + { .mmf + setf.d f79 = r0 + setf.d f87 = r0 + mov f95 = f0 + } + ;; + .align 32 + +.L053: + { .mfb + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi 
+ nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + adds C9 = 4 * SIZE, C1 + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + adds C10 = 4 * SIZE, C2 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + adds C11 = 4 * SIZE, C3 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + adds C12 = 4 * SIZE, C4 + } + { .mfb + nop __LINE__ + FMA f89 = f33, f51, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f90 = f34, f51, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f91 = f35, f51, f91 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f76 = f36, f49, f76 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f84 = f36, f50, f84 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f92 = f36, f51, f92 // A5 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f77 = f37, f49, f77 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f85 = f37, f50, f85 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f93 = f37, f51, f93 // A6 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f78 = f38, f49, f78 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f86 = f38, f50, f86 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f94 = f38, f51, f94 // A7 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f79 = f39, f49, f79 // A8 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f87 = f39, f50, f87 // A8 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f95 = f39, f51, f95 // A8 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, 
f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f96 = [C1 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f97 = [C9 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f98 = [C1 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f99 = [C9 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f90 = f42, f59, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f100 = [C1 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f101 = [C9 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f102 = [C1 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f103 = [C9 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f91 = f43, f59, f91 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f104 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f105 = [C10], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f76 = f44, f57, f76 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f106 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f84 = f44, f58, f84 // A5 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f107 = [C10], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f92 = f44, f59, f92 // A5 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f108 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f109 = [C10], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f77 = f45, f57, f77 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f110 = [C2 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f85 = f45, f58, f85 // A6 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f111 = [C10], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f93 = f45, f59, f93 // A6 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f112 = [C3 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f113 = [C11], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f78 = f46, f57, f78 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f114 = [C3 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f86 = f46, f58, f86 // A7 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f115 = [C11], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f94 = f46, f59, f94 // A7 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f116 = [C3 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f117 = [C11], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f79 = f47, f57, f79 // A8 * B2 + nop __LINE__ + } + ;; + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f118 = [C3 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f87 = f47, f58, f87 // A8 * B3 + adds L = -1, L + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f119 = [C11], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f95 = f47, f59, f95 // A8 * B4 + br.cloop.sptk.few .L053 + } + ;; + .align 32 + +.L058: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + { .mfi + LDFD f120 = [C4 ], SIZE + FMA f64 = ALPHA, f64, f96 + cmp.ne p6, p0 = 1, I + } + { .mfb + LDFD f121 = [C12], SIZE + FMA f68 = ALPHA, f68, f97 + nop __LINE__ + } + ;; + { .mfi + LDFD f122 = [C4 ], SIZE + FMA f65 = ALPHA, f65, f98 + adds I = -1, I + } + { .mfb + LDFD f123 = [C12], SIZE + FMA f69 = ALPHA, f69, f99 + nop __LINE__ + } + ;; + { .mfb + LDFD f124 = [C4 ], SIZE + FMA f66 = ALPHA, f66, f100 + nop __LINE__ + } + { .mfb + LDFD f125 = [C12], SIZE + FMA f70 = ALPHA, f70, f101 + nop __LINE__ + } + ;; + { .mfb + LDFD f126 = [C4 ], -3 * SIZE + FMA f67 = ALPHA, f67, f102 + nop __LINE__ + } + { .mfb + LDFD f127 = [C12], -3 * SIZE + FMA f71 = ALPHA, f71, f103 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f64, SIZE + FMA f72 = ALPHA, f72, f104 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f68, SIZE + FMA f76 = ALPHA, f76, f105 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f65, SIZE + FMA f73 = ALPHA, f73, f106 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f69, SIZE + FMA f77 = ALPHA, f77, f107 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f66, SIZE + FMA f74 = ALPHA, f74, f108 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f70, SIZE + FMA f78 = ALPHA, f78, f109 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f67, 5 * SIZE + FMA f75 = ALPHA, f75, f110 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f71, 5 * SIZE + FMA f79 = ALPHA, f79, f111 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f72, SIZE + FMA f80 = ALPHA, f80, f112 + nop __LINE__ + } + { .mfb + STFD [C10] = f76, SIZE + FMA f84 = ALPHA, f84, f113 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f73, SIZE + FMA f81 = ALPHA, f81, f114 + nop __LINE__ + } + { .mfb + STFD [C10] = f77, SIZE + FMA f85 = ALPHA, f85, f115 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f74, SIZE + FMA f82 = ALPHA, f82, f116 + nop __LINE__ + } + { .mfb + STFD [C10] = f78, SIZE + FMA f86 = ALPHA, f86, f117 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f75, 5 * SIZE + FMA f83 = ALPHA, f83, f118 + nop __LINE__ + } + { .mfb + STFD [C10] = f79, 5 * SIZE + FMA f87 = ALPHA, f87, f119 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f80, SIZE + FMA f88 = ALPHA, f88, f120 + nop __LINE__ + } + { .mfb + STFD [C11] = f84, SIZE + FMA f92 = ALPHA, f92, f121 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f81, SIZE + FMA f89 = ALPHA, f89, f122 + nop __LINE__ + } + { .mfb + STFD [C11] = f85, SIZE + FMA f93 = ALPHA, f93, f123 + nop 
__LINE__ + } + ;; + { .mfb + STFD [C3 ] = f82, SIZE + FMA f90 = ALPHA, f90, f124 + nop __LINE__ + } + { .mfb + STFD [C11] = f86, SIZE + FMA f94 = ALPHA, f94, f125 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f83, 5 * SIZE + FMA f91 = ALPHA, f91, f126 + nop __LINE__ + } + { .mfb + STFD [C11] = f87, 5 * SIZE + FMA f95 = ALPHA, f95, f127 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f88, SIZE + mov f64 = f0 + nop __LINE__ + } + { .mfb + STFD [C12] = f92, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f89, SIZE + mov f80 = f0 + nop __LINE__ + } + { .mfb + STFD [C12] = f93, SIZE + mov f88 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f90, SIZE + mov f65 = f0 + nop __LINE__ + } + { .mfb + STFD [C12] = f94, SIZE + mov f73 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C4 ] = f91, 5 * SIZE + mov f81 = f0 + nop __LINE__ + } + { .mfb + STFD [C12] = f95, 5 * SIZE + mov f89 = f0 + (p6) br.cond.dptk .L052 + } + ;; +#else + { .mfi + nop __LINE__ + FMPY f64 = ALPHA, f64 + cmp.ne p6, p0 = 1, I + } + { .mfb + nop __LINE__ + FMPY f68 = ALPHA, f68 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f65 = ALPHA, f65 + adds I = -1, I + } + { .mfb + nop __LINE__ + FMPY f69 = ALPHA, f69 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMPY f66 = ALPHA, f66 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMPY f70 = ALPHA, f70 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMPY f67 = ALPHA, f67 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMPY f71 = ALPHA, f71 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f64, SIZE + FMPY f72 = ALPHA, f72 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f68, SIZE + FMPY f76 = ALPHA, f76 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f65, SIZE + FMPY f73 = ALPHA, f73 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f69, SIZE + FMPY f77 = ALPHA, f77 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f66, SIZE + FMPY f74 = ALPHA, f74 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f70, SIZE + FMPY f78 = ALPHA, f78 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f67, 5 * SIZE + FMPY f75 = ALPHA, f75 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f71, 5 * SIZE + FMPY f79 = ALPHA, f79 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f72, SIZE + FMPY f80 = ALPHA, f80 + nop __LINE__ + } + { .mfb + STFD [C10] = f76, SIZE + FMPY f84 = ALPHA, f84 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f73, SIZE + FMPY f81 = ALPHA, f81 + nop __LINE__ + } + { .mfb + STFD [C10] = f77, SIZE + FMPY f85 = ALPHA, f85 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f74, SIZE + FMPY f82 = ALPHA, f82 + nop __LINE__ + } + { .mfb + STFD [C10] = f78, SIZE + FMPY f86 = ALPHA, f86 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f75, 5 * SIZE + FMPY f83 = ALPHA, f83 + nop __LINE__ + } + { .mfb + STFD [C10] = f79, 5 * SIZE + FMPY f87 = ALPHA, f87 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f80, SIZE + FMPY f88 = ALPHA, f88 + nop __LINE__ + } + { .mfb + STFD [C11] = f84, SIZE + FMPY f92 = ALPHA, f92 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f81, SIZE + FMPY f89 = ALPHA, f89 + nop __LINE__ + } + { .mfb + STFD [C11] = f85, SIZE + FMPY f93 = ALPHA, f93 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f82, SIZE + FMPY f90 = ALPHA, f90 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C11] = f86, SIZE + FMPY f94 = ALPHA, f94 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f83, 5 * SIZE + FMPY f91 = ALPHA, f91 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + 
adds L = -8, L +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C11] = f87, 5 * SIZE + FMPY f95 = ALPHA, f95 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -4, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C4 ] = f88, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C12] = f92, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f89, SIZE + mov f80 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 3, AOFFSET +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C12] = f93, SIZE + mov f88 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 2, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C4 ] = f90, SIZE + mov f65 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 8, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C12] = f94, SIZE + mov f73 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f91, 5 * SIZE + mov f81 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C12] = f95, 5 * SIZE + mov f89 = f0 + (p6) br.cond.dptk .L052 + } + ;; +#endif + .align 32 + +.L060: + { .mfi + nop __LINE__ + mov f66 = f0 + tbit.z p6, p7 = M, 2 + } + { .mfb +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 4, KK +#else + adds L = 4, KK +#endif +#endif + mov f74 = f0 + (p6) br.cond.dptk .L070 + } + ;; + +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mfb + LDFPD f48, f49 = [B] + mov f82 = f0 + nop __LINE__ + } + { .mfi + adds BOFFSET = 2 * SIZE, B + mov f90 = f0 +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#else + { .mfi + shladd BOFFSET = KK8, 2, B + mov f82 = f0 + shladd AOFFSET = KK8, 2, AOFFSET + } + ;; + { .mfi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f90 = f0 +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#endif + ;; + { .mii + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mfi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + mov f67 = f0 + adds L = -1, L + } + { .mfi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + mov f75 = f0 + nop __LINE__ + } + ;; + { .mfi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f83 = f0 + mov ar.lc = L + } + { .mfi + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + mov f91 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + .align 32 + +.L062: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + (p5) adds C9 = 2 * SIZE, C1 + } + { .mfi + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + (p5) adds C10 = 2 * SIZE, C2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + (p5) adds C11 = 2 * SIZE, C3 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + (p5) adds C12 = 2 * SIZE, C4 + } + ;; + { .mfb + (p3) LDFPD f40, f41 = 
[AOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f89 = f33, f51, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f90 = f34, f51, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f68 = [C1 ], SIZE +#else + nop __LINE__ +#endif + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f70 = [C9 ], SIZE +#else + nop __LINE__ +#endif + FMA f91 = f35, f51, f91 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f69 = [C1 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f71 = [C9 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f76 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f78 = [C10], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f77 = [C2 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f79 = [C10], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f84 = [C3 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f86 = [C11], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f85 = [C3 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f87 = [C11], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f90 = f42, f59, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f92 = [C4 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f94 = [C12], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfi +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f93 = [C4 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + adds L = -1, L + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f95 = [C12], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f91 = f43, f59, f91 // A4 * B4 + br.cloop.sptk.few .L062 + } + ;; +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + FMA f64 = ALPHA, f64, f68 + FMA f66 = ALPHA, f66, f70 + FMA f65 = ALPHA, f65, f69 + FMA f67 = ALPHA, f67, f71 + FMA f72 = ALPHA, f72, f76 + FMA f74 = ALPHA, f74, f78 + FMA f73 = ALPHA, f73, f77 + FMA f75 = ALPHA, f75, f79 + ;; + { .mfb + STFD [C1 ] = f64, SIZE + FMA f80 = ALPHA, f80, f84 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f66, SIZE + FMA f82 = ALPHA, f82, f86 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f65, 3 * SIZE + FMA f81 = ALPHA, f81, f85 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f67, 3 * SIZE + FMA f83 = ALPHA, f83, f87 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f72, SIZE + FMA f88 = ALPHA, f88, f92 + nop __LINE__ + } + { .mfb + STFD [C10] = f74, SIZE + FMA f90 = ALPHA, f90, f94 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f73, 3 * SIZE + FMA f89 = ALPHA, f89, f93 + nop __LINE__ + } + { .mfb + STFD [C10] = f75, 3 * SIZE + FMA f91 = ALPHA, f91, f95 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f80, SIZE + mov f80 = f0 + nop __LINE__ + } + { .mfb + STFD [C11] = f82, SIZE + mov f64 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C3 ] = f81, 3 * SIZE + mov f81 = f0 + nop __LINE__ + } + { .mfb + STFD [C11] = f83, 3 * SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f88, SIZE + mov f88 = f0 + adds L = 1, K + } + { .mfb + STFD [C12] = f90, SIZE + mov f65 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f89, 3 * SIZE + mov f89 = f0 + shr L = L, 1 + } + { .mfb + STFD [C12] = f91, 3 * SIZE + mov f73 = f0 + nop __LINE__ + } + ;; +#else + FMPY f64 = ALPHA, f64 + FMPY f66 = ALPHA, f66 + FMPY f65 = ALPHA, f65 + FMPY f67 = ALPHA, f67 + FMPY f72 = ALPHA, f72 + FMPY f74 = ALPHA, f74 + FMPY f73 = ALPHA, f73 + FMPY f75 = ALPHA, f75 + ;; + { .mfb + STFD [C1 ] = f64, SIZE + FMPY f80 = ALPHA, f80 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f66, SIZE + FMPY f82 = ALPHA, f82 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f65, 3 * SIZE + FMPY f81 = ALPHA, f81 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f67, 3 * SIZE + FMPY f83 = ALPHA, f83 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f72, SIZE + FMPY f88 = ALPHA, f88 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C10] = f74, SIZE + FMPY f90 = ALPHA, f90 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f73, 3 * SIZE + FMPY f89 = ALPHA, f89 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -4, L +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C10] = f75, 3 * SIZE + FMPY f91 = ALPHA, f91 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -4, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C3 ] = f80, SIZE + mov f80 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C11] = f82, SIZE + mov f64 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f81, 3 * SIZE + mov f81 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && 
!defined(TRANSA))) + shladd AOFFSET = KK8, 2, AOFFSET +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C11] = f83, 3 * SIZE + mov f72 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 2, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C4 ] = f88, SIZE + mov f88 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 4, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C12] = f90, SIZE + mov f65 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f89, 3 * SIZE + mov f89 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C12] = f91, 3 * SIZE + mov f73 = f0 + nop __LINE__ + } + ;; +#endif + .align 32 + +.L070: + { .mib +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 2, KK +#else + adds L = 4, KK +#endif +#endif + tbit.z p6,p7 = M, 1 + (p6) br.cond.dptk .L080 + } + ;; + +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mmi + LDFPD f48, f49 = [B] + adds BOFFSET = 2 * SIZE, B +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#else + { .mmi + shladd BOFFSET = KK8, 2, B + shladd AOFFSET = KK8, 1, AOFFSET + nop __LINE__ + } + ;; + { .mmi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + nop __LINE__ + } + ;; +#endif + { .mii + cmp.eq p3, p0 = r0, r0 + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + adds L = -1, L + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + } + ;; + { .mmi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + mov ar.lc = L + } + ;; + .align 32 + +.L072: + { .mfb + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + } + { .mmf +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f68 = [C1 ], SIZE + (p5) LDFD f76 = [C2 ], SIZE +#else + nop __LINE__ + nop __LINE__ +#endif + FMA f89 = f33, f51, f89 // A2 * B4 + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mmf +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f69 = [C1 ], -1 * SIZE + (p5) LDFD f77 = [C2 ], -1 * SIZE +#else + nop __LINE__ + nop __LINE__ +#endif + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mmf +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f84 = [C3 ], SIZE + (p5) LDFD f92 = [C4 ], SIZE +#else + nop __LINE__ + nop __LINE__ +#endif + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f85 = [C3 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + adds L = -1, L + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f93 = [C4 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + br.cloop.sptk.few .L072 + } + ;; +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + FMA f64 = ALPHA, f64, f68 + FMA f65 = ALPHA, f65, f69 + FMA f72 = ALPHA, f72, f76 + FMA f73 = ALPHA, f73, f77 + + FMA f80 = ALPHA, f80, f84 + FMA f81 = ALPHA, f81, f85 + FMA f88 = ALPHA, f88, f92 + FMA f89 = ALPHA, f89, f93 + ;; + { .mfb + STFD [C1 ] = f64, SIZE + mov f64 = f0 + nop __LINE__ + } + { .mfb + STFD [C2 ] = f72, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + STFD [C2 ] = f73, SIZE + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f80, SIZE + mov f80 = f0 + adds L = 1, K + } + { .mfb + STFD [C4 ] = f88, SIZE + mov f88 = f0 + nop __LINE__ + } + ;; + { .mmi + STFD [C3 ] = f81, SIZE + STFD [C4 ] = f89, SIZE + shr L = L, 1 + } + ;; +#else + FMPY f64 = ALPHA, f64 + FMPY f65 = ALPHA, f65 + ;; + { .mfi + nop __LINE__ + FMPY f72 = ALPHA, f72 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMPY f73 = ALPHA, f73 + nop __LINE__ + } + ;; + { .mfi + FMPY f80 = ALPHA, f80 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -2, L +#else + nop __LINE__ +#endif + } + { .mfi + FMPY f81 = ALPHA, f81 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -4, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + nop __LINE__ + FMPY f88 = ALPHA, f88 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMPY f89 = ALPHA, f89 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f64, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 1, AOFFSET +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C2 ] = f72, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + STFD [C2 ] = f73, SIZE +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 2, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C3 ] = f80, SIZE + mov f80 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 2, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C4 ] = f88, SIZE + mov f88 = f0 + nop __LINE__ + } + ;; + { .mmi + STFD [C3 ] = f81, SIZE + STFD [C4 ] = f89, SIZE +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; +#endif + .align 32 + +.L080: + { .mib +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub 
L = K, KK +#elif defined(LEFT) + adds L = 1, KK +#else + adds L = 4, KK +#endif +#endif + tbit.z p6,p7 = M, 0 + (p6) br.cond.dptk .L089 + } + ;; +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mmi + LDFPD f48, f49 = [B] + adds BOFFSET = 2 * SIZE, B +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#else + { .mmi + shladd BOFFSET = KK8, 2, B + add AOFFSET = KK8, AOFFSET + nop __LINE__ + } + ;; + { .mmi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + nop __LINE__ + } + ;; +#endif + + { .mii + LDFD f32 = [AOFFSET], 1 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + nop __LINE__ + nop __LINE__ + adds L = -1, L + } + ;; + { .mmi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + cmp.eq p3, p0 = r0, r0 + mov ar.lc = L + } + ;; + .align 32 + +.L082: + { .mfb + cmp.ne p4, p5 = 0, L + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + (p12) cmp.ne p3, p0 = 0, L + FMA f72 = f32, f49, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + (p3) LDFD f40 = [AOFFSET], 1 * SIZE + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f68 = [C1] +#else + nop __LINE__ +#endif + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mmf + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p4) LDFD f32 = [AOFFSET], 1 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + } + { .mmf +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f76 = [C2] + (p5) LDFD f84 = [C3] +#else + nop __LINE__ + nop __LINE__ +#endif + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + } + ;; + { .mib + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + nop __LINE__ + nop __LINE__ + } + { .mmb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f92 = [C4] +#else + nop __LINE__ +#endif + adds L = -1, L + br.cloop.sptk.few .L082 + } + ;; +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + FMA f64 = ALPHA, f64, f68 + FMA f72 = ALPHA, f72, f76 + FMA f80 = ALPHA, f80, f84 + FMA f88 = ALPHA, f88, f92 + ;; + STFD [C1 ] = f64, SIZE + STFD [C2 ] = f72, SIZE + STFD [C3 ] = f80, SIZE + STFD [C4 ] = f88, SIZE + ;; +#else + { .mfi + nop __LINE__ + FMPY f64 = ALPHA, f64 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMPY f72 = ALPHA, f72 + nop __LINE__ + } + ;; + { .mfi + FMPY f80 = ALPHA, f80 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -1, L +#else + nop __LINE__ +#endif + } + { .mfi + FMPY f88 = ALPHA, f88 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -4, L +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + ;; +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + add AOFFSET = KK8, AOFFSET +#else + nop __LINE__ +#endif +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 2, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi + STFD [C1 ] = f64, SIZE + STFD [C2 ] = f72, SIZE +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 1, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi + STFD [C3 ] = f80, SIZE + STFD [C4 ] = f88, SIZE +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; +#endif + .align 32 + +.L089: + { .mmi + mov B = BOFFSET + mov AOFFSET = A +#if defined(TRMMKERNEL) && !defined(LEFT) + adds KK = 4, KK +#else + nop __LINE__ +#endif + } + ;; + .align 16 +#endif + +.L090: + { .mfi + mov C1 = C + mov f64 = f0 + tbit.z p6, p0 = N, 1 + } + { .mfi + add C2 = LDC, C + mov f72 = f0 + shr I = M, 3 + } + ;; + { .mfi + setf.d f66 = r0 + mov f65 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + mov KK = OFFSET +#else + nop __LINE__ +#endif + } + { .mfb + mov AOFFSET = A + mov f73 = f0 + (p6) br.cond.dpnt .L130 + } + ;; +#if 0 + { .mfi +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + mov f67 = f0 + shladd C = LDC, 1, C + } + { .mfb + cmp.eq p6, p7 = 0, I + mov f74 = f0 + (p6) br.cond.dpnt .L100 + } + ;; + .align 32 + +.L092: +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mfb + LDFPD f48, f49 = [B] + mov f68 = f0 + nop __LINE__ + } + { .mfb + adds BOFFSET = 2 * SIZE, B + mov f79 = f0 + nop __LINE__ + } + ;; +#else + { .mfi + shladd BOFFSET = KK8, 1, B + mov f68 = f0 + shladd AOFFSET = KK8, 3, AOFFSET + } + ;; + { .mfi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f79 = f0 + nop __LINE__ + } + ;; +#endif + + { .mfi + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f75 = f0 +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 8, KK +#else + adds L = 2, KK +#endif +#endif + } + ;; + { .mfi + adds PREC = CPREFETCHSIZE * SIZE, C1 + mov f76 = f0 +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; + { .mfi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + mov f69 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + cmp.eq p3, p0 = r0, r0 + mov f77 = f0 + shr L = L, 1 + 
} + ;; + { .mfi + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + adds L = -1, L + } + { .mmf + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + CPREFETCH [PREC], LDC + mov f70 = f0 + } + ;; + { .mfi + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + mov f78 = f0 + mov ar.lc = L + } + { .mfi + CPREFETCH [PREC] + mov f71 = f0 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + } + ;; + .align 32 + +.L093: +/* 1 */ + { .mfi + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 4 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + adds C9 = 4 * SIZE, C1 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + adds C10 = 4 * SIZE, C2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + adds C11 = 4 * SIZE, C3 + } + { .mfi + nop __LINE__ + FMA f74 = f34, f49, f74 // A3 * B2 + adds C12 = 4 * SIZE, C4 + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f96 = [C1 ], SIZE +#else + nop __LINE__ +#endif + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f97 = [C9 ], SIZE +#else + nop __LINE__ +#endif + FMA f76 = f36, f49, f76 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f98 = [C1 ], SIZE +#else + nop __LINE__ +#endif + FMA f77 = f37, f49, f77 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f99 = [C9 ], SIZE +#else + nop __LINE__ +#endif + FMA f78 = f38, f49, f78 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f100 = [C1 ], SIZE +#else + nop __LINE__ +#endif + FMA f79 = f39, f49, f79 // A8 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f101 = [C9 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f102 = [C1 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f103 = [C9 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f104 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f105 = [C10], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f76 = f44, f57, f76 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f106 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f107 = [C10], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f77 = f45, f57, f77 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f108 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f109 = [C10], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f78 = f46, f57, f78 // A7 * B2 + nop __LINE__ + } + ;; + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f110 = [C2 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + adds L = -1, L + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f111 = [C10], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f79 = f47, f57, f79 // A8 * B2 + br.cloop.sptk.few .L093 + } + ;; +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + { .mfi + nop __LINE__ + FMA f64 = ALPHA, f64, f96 + cmp.ne p6, p0 = 1, I + } + { .mfb + nop __LINE__ + FMA f68 = ALPHA, f68, f97 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA f65 = ALPHA, f65, f98 + adds I = -1, I + } + { .mfb + nop __LINE__ + FMA f69 = ALPHA, f69, f99 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA f66 = ALPHA, f66, f100 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f70 = ALPHA, f70, f101 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f67 = ALPHA, f67, f102 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f71 = ALPHA, f71, f103 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f64, SIZE + FMA f72 = ALPHA, f72, f104 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f68, SIZE + FMA f76 = ALPHA, f76, f105 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f65, SIZE + FMA f73 = ALPHA, f73, f106 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f69, SIZE + FMA f77 = ALPHA, f77, f107 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f66, SIZE + FMA f74 = ALPHA, f74, f108 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f70, SIZE + FMA f78 = ALPHA, f78, f109 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f67, 5 * SIZE + FMA f75 = ALPHA, f75, f110 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f71, 5 * SIZE + FMA f79 = ALPHA, f79, f111 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f72, SIZE + mov f64 = f0 + nop __LINE__ + } + { .mfb + STFD [C10] = f76, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f73, SIZE + mov f65 = f0 + nop __LINE__ + } + { .mfb + STFD [C10] = f77, SIZE + mov f73 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f74, SIZE + mov f66 = f0 + nop __LINE__ + } + { .mfb + STFD [C10] = f78, SIZE + mov f74 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C2 ] = f75, 5 * SIZE + mov f67 = f0 + nop __LINE__ + } + { .mfb + STFD [C10] = f79, 5 * SIZE + (p6) br.cond.dptk .L092 + } + ;; +#else + { .mfi + nop __LINE__ + FMPY f64 = ALPHA, f64 + cmp.ne p6, p0 = 1, I + } + { .mfb + nop __LINE__ + FMPY f68 = ALPHA, f68 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f65 = ALPHA, f65 + adds I = -1, I + } + { .mfb + nop __LINE__ + FMPY f69 = ALPHA, f69 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f66 = ALPHA, f66 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMPY f70 = ALPHA, f70 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMPY f67 = ALPHA, f67 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMPY f71 = ALPHA, f71 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f64, SIZE + FMPY f72 = ALPHA, f72 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f68, SIZE + FMPY f76 = ALPHA, f76 + nop __LINE__ + } + ;; + { .mfb + STFD [C1 ] = f65, SIZE + FMPY f73 = ALPHA, f73 + nop __LINE__ + } + { .mfb + STFD [C9 ] = f69, SIZE + FMPY f77 = ALPHA, f77 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f66, SIZE + FMPY f74 = ALPHA, f74 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C9 ] = f70, SIZE + FMPY f78 = ALPHA, f78 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f67, 5 * SIZE + FMPY f75 = ALPHA, f75 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -8, L +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C9 ] = f71, 5 * SIZE + FMPY f79 = ALPHA, f79 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -2, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C2 ] = f72, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && 
defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C10] = f76, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f73, SIZE + mov f65 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 3, AOFFSET +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C10] = f77, SIZE + mov f73 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 1, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C2 ] = f74, SIZE + mov f66 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 8, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C10] = f78, SIZE + mov f74 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f75, 5 * SIZE + mov f67 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mib + STFD [C10] = f79, 5 * SIZE + nop __LINE__ + (p6) br.cond.dptk .L092 + } + ;; +#endif + .align 32 + +.L100: + { .mib +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 4, KK +#else + adds L = 2, KK +#endif +#endif + tbit.z p6, p7 = M, 2 + (p6) br.cond.dptk .L110 + } + ;; +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mmf + LDFPD f48, f49 = [B] + adds BOFFSET = 2 * SIZE, B + mov f75 = f0 + } + { .mii + nop __LINE__ +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#else + { .mfi + shladd BOFFSET = KK8, 1, B + mov f75 = f0 + shladd AOFFSET = KK8, 2, AOFFSET + } + ;; + { .mmi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + nop __LINE__ + adds L = 1, L + } + ;; +#endif + ;; + { .mii + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + nop __LINE__ + adds L = -1, L + } + ;; + { .mmi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + cmp.eq p3, p0 = r0, r0 + mov ar.lc = L + } + ;; + .align 32 + +.L102: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 4 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + adds C9 = 2 * SIZE, C1 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + adds C10 = 2 * SIZE, C2 + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f68 = [C1 ], SIZE +#else + nop __LINE__ +#endif + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f70 = [C9 ], SIZE +#else + nop __LINE__ +#endif + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f69 = [C1 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f71 = [C9 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f76 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + adds L = -1, L + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f78 = [C10], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + br.cloop.sptk.few .L102 + } + ;; +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + { .mfb + LDFD f77 = [C2 ], -1 * SIZE + FMA f64 = ALPHA, f64, f68 + nop __LINE__ + } + { .mfb + LDFD f79 = [C10], -1 * SIZE + FMA f66 = ALPHA, f66, f70 + nop __LINE__ + } + ;; + FMA f65 = ALPHA, f65, f69 + adds L = 1, K + FMA f67 = ALPHA, f67, f71 + ;; + FMA f72 = ALPHA, f72, f76 + shr L = L, 1 + FMA f74 = ALPHA, f74, f78 + FMA f73 = ALPHA, f73, f77 + FMA f75 = ALPHA, f75, f79 + ;; + { .mmf + STFD [C1 ] = f64, SIZE + STFD [C9 ] = f66, SIZE + mov f64 = f0 + } + ;; + { .mmf + STFD [C1 ] = f65, 3 * SIZE + STFD [C9 ] = f67, 3 * SIZE + mov f65 = f0 + } + ;; + { .mmf + STFD [C2 ] = f72, SIZE + STFD [C10] = f74, SIZE + mov f72 = f0 + } + ;; + { .mmf + STFD [C2 ] = f73, 3 * SIZE + STFD [C10] = f75, 3 * SIZE + mov f73 = f0 + } + ;; +#else + { .mfb + nop __LINE__ + FMPY f64 = ALPHA, f64 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMPY f66 = ALPHA, f66 + nop __LINE__ + } + ;; + FMPY f65 = ALPHA, f65 + FMPY f67 = ALPHA, f67 + ;; + { .mfi + nop __LINE__ + FMPY f72 = ALPHA, f72 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMPY f74 = ALPHA, f74 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f73 = ALPHA, f73 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -4, L +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMPY f75 = ALPHA, f75 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -2, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C1 ] = f64, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mmi + STFD [C9 ] = f66, SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f65, 3 * SIZE + mov f65 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 2, AOFFSET +#else + nop __LINE__ +#endif + } + { .mmi + STFD [C9 ] = f67, 3 * SIZE + nop __LINE__ +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 1, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C2 ] = f72, SIZE + mov f72 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 4, KK +#else 
+ nop __LINE__ +#endif + } + { .mmi + STFD [C10] = f74, SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f73, 3 * SIZE + mov f73 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mib + STFD [C10] = f75, 3 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; +#endif + + .align 32 + +.L110: + { .mib +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 2, KK +#else + adds L = 2, KK +#endif +#endif + tbit.z p6, p7 = M, 1 + (p6) br.cond.dptk .L120 + } + ;; + +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mmi + LDFPD f48, f49 = [B] + adds BOFFSET = 2 * SIZE, B +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#else + { .mmi + shladd BOFFSET = KK8, 1, B + shladd AOFFSET = KK8, 1, AOFFSET + } + ;; + { .mmi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + nop __LINE__ + adds L = 1, L + } + ;; +#endif + ;; + { .mii + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + nop __LINE__ + adds L = -1, L + } + ;; + { .mmi + cmp.eq p3, p0 = r0, r0 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + mov ar.lc = L + } + ;; + .align 32 + +.L112: + { .mfi + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + lfetch.nt1 [PREB], 4 * SIZE + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mmf + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + } + { .mmf +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f68 = [C1 ], SIZE + (p5) LDFD f76 = [C2 ], SIZE +#else + nop __LINE__ + nop __LINE__ +#endif + FMA f73 = f33, f49, f73 // A2 * B2 + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f69 = [C1 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + adds L = -1, L + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f77 = [C2 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + br.cloop.sptk.few .L112 + } + ;; +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + FMA f64 = ALPHA, f64, f68 + FMA f65 = ALPHA, f65, f69 + FMA f72 = ALPHA, f72, f76 + FMA f73 = ALPHA, f73, f77 + ;; + { .mfi + STFD [C1 ] = f64, SIZE + mov f64 = f0 + nop __LINE__ + } + { .mfb + STFD [C2 ] = f72, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f65, SIZE + mov f65 = f0 + nop __LINE__ + } + { .mfb + STFD [C2 ] = f73, SIZE + mov f73 = f0 + nop __LINE__ + } + ;; +#else + { .mfi + nop __LINE__ + FMPY f64 = ALPHA, f64 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMPY f65 = ALPHA, f65 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f72 = ALPHA, f72 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -2, L +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMPY f73 = ALPHA, f73 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -2, L +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + ;; +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 1, AOFFSET +#else + nop __LINE__ +#endif +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 1, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + + { .mfi + STFD [C1 ] = f64, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 2, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C2 ] = f72, SIZE + mov f72 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f65, SIZE + mov f65 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C2 ] = f73, SIZE + mov f73 = f0 + nop __LINE__ + } + ;; +#endif + .align 32 + +.L120: + { .mib +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 1, KK +#else + adds L = 2, KK +#endif +#endif + tbit.z p6, p7 = M, 0 + (p6) br.cond.dptk .L129 + } + ;; +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mmi + LDFPD f48, f49 = [B] + adds BOFFSET = 2 * SIZE, B +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#else + { .mmi + shladd BOFFSET = KK8, 1, B + add AOFFSET = KK8, AOFFSET + } + ;; + { .mmi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + nop __LINE__ + adds L = 1, L + } + ;; +#endif + { .mii + nop __LINE__ + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + LDFD f32 = [AOFFSET], 1 * SIZE + nop __LINE__ + adds L = -1, L + } + ;; + { .mmi + cmp.eq p3, p0 = r0, r0 + nop __LINE__ + mov ar.lc = L + } + ;; + .align 32 + +.L122: + { .mfi + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mmi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + (p3) LDFD f40 = [AOFFSET], 1 * SIZE + nop __LINE__ + } + { .mmi +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f68 = [C1] + (p5) LDFD f76 = [C2] +#else + nop __LINE__ + nop __LINE__ +#endif + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + adds L = -1, L + } + { .mfb + (p4) LDFD f32 = [AOFFSET], 1 * SIZE + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + br.cloop.sptk.few .L122 + } + ;; + +.L128: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + FMA f64 = ALPHA, f64, f68 + FMA f72 = ALPHA, f72, f76 + ;; + { .mfi + STFD [C1 ] = f64 + mov f64 = f0 + } + { .mfb + STFD [C2 ] = f72 + mov f72 = f0 + } + ;; +#else + { .mfi + nop __LINE__ + FMPY f64 = ALPHA, f64 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMPY f72 = ALPHA, f72 + nop __LINE__ + } + ;; + { .mmi + nop __LINE__ +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -1, L +#else + nop __LINE__ +#endif +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -2, L +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + ;; +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + add AOFFSET = KK8, AOFFSET +#else + nop __LINE__ +#endif +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 1, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 1, KK +#else + nop __LINE__ +#endif + ;; + { .mfi + STFD [C1 ] = f64 + mov f64 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C2 ] = f72 + mov f72 = f0 + } + ;; +#endif + .align 32 + +.L129: + { .mmi + mov B = BOFFSET + mov AOFFSET = A +#if defined(TRMMKERNEL) && !defined(LEFT) + adds KK = 2, KK +#else + nop __LINE__ +#endif + } + ;; + .align 16 +#endif + +.L130: + { .mfi +#if defined(TRMMKERNEL) && defined(LEFT) + mov KK = OFFSET +#else + nop __LINE__ +#endif + mov f64 = f0 + tbit.z p6, p0 = N, 0 + } + { .mib + mov AOFFSET = A + shr I = M, 3 + (p6) br.cond.dpnt .L999 + } + ;; +#if 0 + { .mfi + mov C1 = C + mov f65 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + nop __LINE__ + mov f66 = f0 + nop __LINE__ + } + { .mfb + cmp.eq p7, p0 = 0, I + mov f67 = f0 + (p7) br.cond.dpnt .L140 + } + ;; + .align 32 + +.L132: +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mfb + LDFD f48 = [B] + mov f68 = f0 + nop __LINE__ + } + { .mfi + adds BOFFSET = 1 * SIZE, B + mov f69 = f0 +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 8, KK +#else + adds L = 1, KK +#endif +#endif + } + ;; +#else + { .mfi + add BOFFSET = KK8, B + mov f68 = f0 + shladd AOFFSET = KK8, 3, AOFFSET + } + ;; + { .mfi + LDFD f48 = [BOFFSET], 1 * SIZE + mov f69 = f0 +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 8, KK +#else + adds L = 1, KK +#endif +#endif + } + ;; 
+#endif + { .mfi + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f70 = f0 +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; + { .mii + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mfi + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + mov f71 = f0 + adds L = -1, L + } + ;; + { .mmi + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + adds PREC = CPREFETCHSIZE * SIZE, C1 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mmi + CPREFETCH [PREC] + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + mov ar.lc = L + } + ;; + .align 32 + +.L133: + { .mfi + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + FMA f65 = f33, f48, f65 // A2 * B1 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + adds C9 = 4 * SIZE, C1 + } + { .mmf + (p3) LDFD f56 = [BOFFSET], 1 * SIZE +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f6 = [C1 ], SIZE +#else + nop __LINE__ +#endif + FMA f67 = f35, f48, f67 // A4 * B1 + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f7 = [C9 ], SIZE +#else + nop __LINE__ +#endif + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f10 = [C1 ], SIZE +#else + nop __LINE__ +#endif + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f11 = [C9 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mmf + (p4) LDFD f48 = [BOFFSET], 1 * SIZE +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f12 = [C1 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f13 = [C9 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + adds L = -1, L + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f14 = [C1 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + nop __LINE__ + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f15 = [C9 ], -3 * SIZE +#else + nop __LINE__ +#endif + nop __LINE__ + br.cloop.sptk.few .L133 + } + ;; + +.L138: +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + { .mfi + FMA f64 = ALPHA, f64, f6 + cmp.ne p6, p0 = 1, I + } + { .mfb + FMA f68 = ALPHA, f68, f7 + } + ;; + { .mfi + FMA f65 = ALPHA, f65, f10 + adds I = -1, I + } + { .mfb + FMA f69 = ALPHA, f69, f11 + } + ;; + { .mfi + FMA f66 = ALPHA, f66, f12 + } + { .mfb + FMA f70 = ALPHA, f70, f13 + } + ;; + { .mfb + FMA f67 = ALPHA, f67, f14 + } + { .mfb + FMA f71 = ALPHA, f71, f15 + } + ;; + { .mmf + STFD [C1 ] = f64, SIZE + STFD [C9 ] = f68, SIZE + mov f64 = f0 + } + ;; + { .mmf + STFD [C1 ] = f65, SIZE + STFD [C9 ] = f69, SIZE + mov f65 = f0 + } + ;; + { .mmf + STFD [C1 ] = f66, SIZE + STFD [C9 ] = f70, SIZE + mov f66 = f0 + } + ;; + { .mmf + STFD [C1 ] = f67, 5 * SIZE + nop __LINE__ + mov f67 = f0 + } + { .mmb + STFD [C9 ] = f71, 5 * SIZE + nop __LINE__ + (p6) br.cond.dptk .L132 + } + ;; +#else + { .mfi + FMPY f64 = ALPHA, f64 + cmp.ne p6, p0 = 1, I + } + { .mfb + FMPY f68 = ALPHA, f68 + } + ;; + { .mfi + FMPY f65 = ALPHA, f65 + adds I = -1, I + } + { .mfb + FMPY f69 = ALPHA, f69 + } + ;; + { .mfi + FMPY f66 = ALPHA, f66 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfb + FMPY f70 = ALPHA, f70 + } + ;; + { .mfi + FMPY f67 = ALPHA, f67 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -8, L +#else + nop __LINE__ +#endif + } + { .mfi + FMPY f71 = ALPHA, f71 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -1, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C1 ] = f64, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mmi + STFD [C9 ] = f68, SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f65, SIZE + mov f65 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 3, AOFFSET +#else + nop __LINE__ +#endif + } + { .mmi + STFD [C9 ] = f69, SIZE + nop __LINE__ +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + add BOFFSET = KK8, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C1 ] = f66, SIZE + mov f66 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 8, KK +#else + nop __LINE__ +#endif + } + { .mmi + STFD [C9 ] = f70, SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f67, 5 * SIZE + mov f67 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mmb + STFD [C9 ] = f71, 5 * SIZE + nop __LINE__ + (p6) br.cond.dptk .L132 + } + ;; +#endif + .align 32 + +.L140: + { .mib +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 4, KK +#else + adds L = 1, KK +#endif +#endif + tbit.z p6, p7 = M, 2 + (p6) br.cond.dptk .L150 + } + ;; +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mmi + LDFD f48 = [B] + adds BOFFSET = 1 * SIZE, B +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#else + { .mmi + add BOFFSET = KK8, B + shladd AOFFSET = KK8, 2, AOFFSET + nop __LINE__ + } + ;; + { .mmi + LDFD f48 = [BOFFSET], 1 * SIZE + nop __LINE__ +#ifndef TRMMKERNEL + adds L = 1, K +#else + 
adds L = 1, L +#endif + } + ;; +#endif + { .mii + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + adds L = -1, L + nop __LINE__ + } + ;; + { .mmi + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + mov ar.lc = L + } + ;; + .align 32 + +.L142: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f65 = f33, f48, f65 // A2 * B1 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + (p5) adds C9 = 2 * SIZE, C1 + } + { .mmf +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f68 = [C1 ], SIZE +#else + nop __LINE__ +#endif + (p3) LDFD f56 = [BOFFSET], 1 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + } + ;; + { .mfi + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + (p5) adds C10 = 2 * SIZE, C2 + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f70 = [C9 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mmf + (p4) LDFD f48 = [BOFFSET], 1 * SIZE +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f69 = [C1 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + } + ;; + { .mfi + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + nop __LINE__ + adds L = -1, L + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f71 = [C9 ], -1 * SIZE +#else + nop __LINE__ +#endif + nop.f 0 + br.cloop.sptk.few .L142 + } + ;; + +.L148: +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + FMA f64 = ALPHA, f64, f68 + FMA f66 = ALPHA, f66, f70 + FMA f65 = ALPHA, f65, f69 + FMA f67 = ALPHA, f67, f71 + ;; + { .mfi + STFD [C1 ] = f64, SIZE + mov f64 = f0 + adds L = 1, K + } + { .mfb + STFD [C9 ] = f66, SIZE + mov f66 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f65, 3 * SIZE + mov f65 = f0 + shr L = L, 1 + } + { .mfb + STFD [C9 ] = f67, 3 * SIZE + mov f67 = f0 + nop __LINE__ + } + ;; +#else + { .mfi + nop __LINE__ + FMPY f64 = ALPHA, f64 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMPY f66 = ALPHA, f66 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f65 = ALPHA, f65 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -4, L +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMPY f67 = ALPHA, f67 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -1, L +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + ;; +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 2, AOFFSET +#else + nop __LINE__ +#endif +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + add BOFFSET = KK8, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C1 ] = f64, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 4, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C9 ] = f66, SIZE + mov f66 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f65, 3 * SIZE + mov f65 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C9 ] = f67, 3 * SIZE + mov f67 = f0 + nop __LINE__ + } + ;; +#endif + .align 32 + +.L150: + { .mib +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 2, KK +#else + adds L = 1, KK +#endif +#endif + tbit.z p6, p7 = M, 1 + (p6) br.cond.dptk .L160 + } + ;; +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mmi + LDFD f48 = [B] + adds BOFFSET = 1 * SIZE, B +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#else + { .mmi + add BOFFSET = KK8, B + shladd AOFFSET = KK8, 1, AOFFSET + nop __LINE__ + } + ;; + { .mmi + LDFD f48 = [BOFFSET], 1 * SIZE + nop __LINE__ +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#endif + { .mii + cmp.eq p3, p0 = r0, r0 + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mii + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + adds L = -1, L + ;; + mov ar.lc = L + } + ;; + .align 32 + +.L152: + { .mfi + cmp.ne p4, p5 = 0, L + FMA f64 = f32, f48, f64 // A1 * B1 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mmf + (p3) LDFD f56 = [BOFFSET], 1 * SIZE + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + } + ;; + { .mfi + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + adds L = -1, L + } + ;; + { .mfb + (p4) LDFD f48 = [BOFFSET], 1 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + br.cloop.sptk.few .L152 + } + ;; + +.L158: 
+#if! defined(TRMMKERNEL) && !defined(BETAZERO) + LDFD f68 = [C1 ], SIZE + ;; + LDFD f69 = [C1 ], -1 * SIZE + ;; + FMA f64 = ALPHA, f64, f68 + FMA f65 = ALPHA, f65, f69 + ;; + STFD [C1 ] = f64, SIZE + mov f64 = f0 + ;; + { .mfi + STFD [C1 ] = f65, SIZE + mov f65 = f0 + } + ;; +#else + { .mfi + nop __LINE__ + FMPY f64 = ALPHA, f64 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMPY f65 = ALPHA, f65 + nop __LINE__ + } + ;; + { .mii + nop __LINE__ +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -2, L +#else + nop __LINE__ +#endif +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -1, L +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + ;; +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 1, AOFFSET +#else + nop __LINE__ +#endif +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + add BOFFSET = KK8, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C1 ] = f64, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 2, KK +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C1 ] = f65, SIZE + mov f65 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; +#endif + .align 32 + +.L160: + { .mib +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 1, KK +#else + adds L = 1, KK +#endif +#endif + tbit.z p6, p7 = M, 0 + (p6) br.cond.dptk .L169 + } + ;; +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mmi + LDFD f48 = [B] + adds BOFFSET = 1 * SIZE, B +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#else + { .mmi + add BOFFSET = KK8, B + add AOFFSET = KK8, AOFFSET + nop __LINE__ + } + ;; + { .mmi + LDFD f48 = [BOFFSET], 1 * SIZE + nop __LINE__ +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#endif + ;; + { .mii + LDFD f32 = [AOFFSET], 1 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mii + adds L = -1, L + cmp.eq p3, p0 = r0, r0 + ;; + mov ar.lc = L + } + ;; + .align 32 + +.L162: + { .mmf + cmp.ne p4, p5 = 0, L + (p12) cmp.ne p3, p0 = 0, L + FMA f64 = f32, f48, f64 // A1 * B1 + } + ;; + { .mmi + (p3) LDFD f56 = [BOFFSET], 1 * SIZE + (p3) LDFD f40 = [AOFFSET], 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p4) LDFD f32 = [AOFFSET], 1 * SIZE +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f68 = [C1] +#else + nop __LINE__ +#endif + adds L = -1, L + } + { .mfb + (p4) LDFD f48 = [BOFFSET], 1 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + br.cloop.sptk.few .L162 + } + ;; +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + FMA f64 = ALPHA, f64, f68 +#else + FMPY f64 = ALPHA, f64 +#endif + ;; + STFD [C1 ] = f64 + ;; + .align 32 + +.L169: + { .mmi + mov B = BOFFSET + mov AOFFSET = A +#if defined(TRMMKERNEL) && !defined(LEFT) + adds KK = 1, KK +#else + nop __LINE__ +#endif + } + ;; + .align 16 +#endif + +.L999: + mov r8 = r0 + adds r9 = 1 * 16, SP + ;; + ldf.fill f16 = [SP], 32 + ldf.fill f17 = [r9], 32 + ;; + ldf.fill f18 = [SP], 32 + ldf.fill f19 = [r9], 32 + ;; + ldf.fill f20 = [SP], 32 + ldf.fill f21 = [r9], 32 + ;; + ldf.fill f22 = [SP], 32 + ldf.fill f23 = [r9], 32 + mov ar.lc = ARLC + ;; + ldf.fill f24 = [SP], 32 + ldf.fill f25 = [r9], 32 + mov pr = PR, -1 + ;; + ldf.fill f26 = [SP], 32 + ldf.fill f27 = [r9], 32 + mov ar.pfs = ARPFS + ;; + ldf.fill f28 = [SP], 32 + ldf.fill f29 = [r9], 32 + ;; + ldf.fill f30 = [SP], 32 + ldf.fill f31 = [r9] + + br.ret.sptk.many b0 + EPILOGUE + diff --git a/kernel/ia64/qgemv_n.S b/kernel/ia64/qgemv_n.S new file mode 100644 index 0000000000..4eeac126c1 --- /dev/null +++ b/kernel/ia64/qgemv_n.S @@ -0,0 +1,1676 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define SP r12 + +#define M r32 +#define N r33 +#ifndef XDOUBLE +#define A r36 +#define LDA r37 +#define X r38 +#define INCX r39 +#define Y r34 +#define INCY r35 +#else +#define A r38 +#define LDA r39 +#define X r34 +#define INCX r35 +#define Y r36 +#define INCY r37 +#endif + +#define BUFFER r11 + +#define I r14 +#define J r15 +#define AO1 r16 +#define AO2 r17 +#define AO3 r18 +#define AO4 r19 +#define AO5 r20 +#define AO6 r21 +#define AO7 r22 +#define AO8 r23 +#define YLD1 r24 +#define YLD2 r25 +#define YST1 r26 +#define YST2 r27 +#define II r28 +#define YY r29 + +#define ARLC r30 +#define PR r31 + +#define LDA7M8 r8 +#define PREA r9 +#define PREB r10 + +#define ALPHA1 f8 +#define ALPHA2 f9 +#define ALPHA3 f10 +#define ALPHA4 f11 +#define ALPHA5 f12 +#define ALPHA6 f13 +#define ALPHA7 f14 +#define ALPHA8 f15 + +#define RPREFETCHSIZE ( 8 * 1 + 6) +#define WPREFETCHSIZE ( 8 * 1 + 6) + +#define RPREFETCH lfetch.nt1 +#define WPREFETCH lfetch.excl.nt1 + +#define ALPHA f6 + + PROLOGUE + .prologue + PROFCODE + { .mmi + mov ARLC = ar.lc + } + ;; + mov PR = pr + adds r14 = 16, SP + adds r15 = 24, SP + adds r16 = 32, SP + .body + ;; + +#ifdef XDOUBLE + ld8 X = [r14], 16 + ld8 INCX = [r15], 16 + ;; +#endif + ld8 Y = [r14], 16 + ld8 INCY = [r15], 16 + ;; + ld8 BUFFER = [r14] + ;; + + mov ALPHA = f8 + cmp.ge p7, p0 = 0, M + cmp.ge p6, p0 = 0, N + ;; + shladd INCX = INCX, BASE_SHIFT, r0 + shladd LDA = LDA, BASE_SHIFT, r0 + shladd INCY = INCY, BASE_SHIFT, r0 + ;; + (p7) br.cond.dpnt .L999 + (p6) br.cond.dpnt .L999 + ;; + sub I = A, Y + mov YY = Y + ;; + cmp.eq p10, p0 = SIZE, INCY + (p10) br.cond.dptk .L10 + ;; + shr J = M, 3 + mov YY = BUFFER + ;; + (p8) adds YY = SIZE, BUFFER + ;; + mov ar.lc = J + mov YST1 = YY + adds YST2 = 4 * SIZE, YY + ;; +.L02: + STFD [YST1] = f0, 1 * SIZE + STFD [YST2] = f0, 1 * SIZE + ;; + STFD [YST1] = f0, 1 * SIZE + STFD [YST2] = f0, 1 * SIZE + ;; + STFD [YST1] = f0, 1 * SIZE + STFD [YST2] = f0, 1 * SIZE + ;; + STFD [YST1] = f0, 5 * SIZE + STFD [YST2] = f0, 5 * SIZE + br.cloop.sptk.few .L02 + ;; + +.L10: + shr J = N, 3 + ;; + cmp.eq p6, p0 = r0, J + (p6) br.cond.dpnt .L20 + ;; + .align 16 + +.L11: + shladd LDA7M8 = LDA, 3, r0 + ;; + sub LDA7M8 = LDA, LDA7M8 + ;; + adds LDA7M8 = 8 * SIZE, LDA7M8 + ;; + mov YLD1 = YY + mov YST1 = YY + adds YLD2 = 1 * SIZE, YY + adds YST2 = 1 * SIZE, YY + ;; + LDFD ALPHA1 = [X], INCX + ;; + LDFD ALPHA2 = [X], INCX + ;; + LDFD ALPHA3 = [X], INCX + ;; + LDFD ALPHA4 = [X], INCX + ;; + LDFD ALPHA5 = [X], INCX + ;; + LDFD ALPHA6 = [X], INCX + ;; + LDFD ALPHA7 = [X], INCX + ;; + LDFD ALPHA8 = [X], INCX + ;; + FMPY ALPHA1 = ALPHA, ALPHA1 + FMPY ALPHA2 = ALPHA, ALPHA2 + FMPY ALPHA3 = ALPHA, ALPHA3 + FMPY ALPHA4 = ALPHA, ALPHA4 + FMPY ALPHA5 = ALPHA, ALPHA5 + FMPY ALPHA6 = ALPHA, ALPHA6 + ;; + mov AO1 = A + adds AO2 = 1 * SIZE, A + adds AO3 = 2 * SIZE, A + adds AO4 = 3 * SIZE, A + adds AO5 = 4 * SIZE, A + adds AO6 = 5 * SIZE, A + adds AO7 = 6 * SIZE, A + adds AO8 = 7 * SIZE, A + shladd A = LDA, 3, A + ;; + shr I = M, 3 + mov pr.rot= 0 + ;; + cmp.eq p16, p0 = r0, r0 + ;; + adds I = -1, I + adds J = -1, J + ;; + adds PREB = (WPREFETCHSIZE) * SIZE, YY + ;; + cmp.lt p7, p8 = r0, J + tbit.nz p13, p11 = M, 2 + mov ar.ec= 2 + ;; + FMPY ALPHA7 = ALPHA, ALPHA7 + ;; + { .mfi + and II = 7, M + FMPY ALPHA8 = ALPHA, ALPHA8 + mov ar.lc = I + } + { .mib + cmp.eq p6, p0 = -1, I + tbit.nz p14, p12 = M, 1 + (p6) br.cond.dpnt .L15 + } + ;; + .align 16 + 
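The .L12 loop that follows is the software-pipelined core of qgemv_n: ALPHA1..ALPHA8 hold alpha * x[j]..alpha * x[j+7], AO1..AO8 each track one of eight consecutive rows of A and step across those eight columns (LDA7M8 then jumps back seven columns and down eight rows), and eight rotating accumulators collect eight elements of y per pipeline stage. As a rough orientation only, a scalar C model of that blocked update, with hypothetical names, plain double in place of the kernel's extended-precision (XDOUBLE) type, and none of the rotating-register pipelining or lfetch prefetching, might look like this:

    #include <stddef.h>

    /* Sketch of the 8x8 tile update performed by one trip through .L12:
     * y[i] += alpha * x[j] * a[i + j*lda] for i = 0..7, j = 0..7 (column-major A).
     * alpha_x[] stands in for ALPHA1..ALPHA8, i.e. alpha is already folded in. */
    static void qgemv_n_tile_sketch(size_t lda, const double *a,
                                    const double *alpha_x,  /* alpha * x[0..7] */
                                    double *y)              /* y[0..7] */
    {
        for (size_t j = 0; j < 8; j++)        /* one column per ALPHAk        */
            for (size_t i = 0; i < 8; i++)    /* eight rows of y per stage    */
                y[i] += alpha_x[j] * a[i + j * lda];
    }

Blocking eight columns together lets each y element, once loaded through YLD1/YLD2, absorb eight FMAs (one per ALPHAk) before being written back through YST1/YST2, which is what amortizes the y load/store traffic in the loop below.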
+.L12: + { .mmf + (p17) LDFD f93 = [AO5], LDA7M8 + (p17) LDFD f94 = [AO6], LDA7M8 + (p17) FMA f101 = ALPHA1, f33, f101 + } + { .mmf + (p17) LDFD f95 = [AO7], LDA7M8 + (p17) LDFD f96 = [AO8], LDA7M8 + (p17) FMA f104 = ALPHA1, f34, f104 + } + ;; + { .mmf + (p16) LDFD f32 = [AO1] + (p16) LDFD f33 = [AO2], LDA + (p17) FMA f107 = ALPHA1, f35, f107 + } + { .mmf + (p16) LDFD f34 = [AO3], LDA + (p16) LDFD f35 = [AO4], LDA + (p17) FMA f110 = ALPHA1, f36, f110 + } + ;; + { .mmf + (p16) LDFD f100 = [YLD1], 2 * SIZE + (p16) LDFD f103 = [YLD2], 2 * SIZE + (p17) FMA f113 = ALPHA1, f37, f113 + } + { .mmf + (p16) adds PREA = (RPREFETCHSIZE) * SIZE, AO1 + (p16) add AO1 = AO1, LDA + (p17) FMA f116 = ALPHA1, f38, f116 + } + ;; + { .mmf + (p18) STFD [YST1] = f102, 2 * SIZE + (p18) STFD [YST2] = f105, 2 * SIZE + (p17) FMA f119 = ALPHA1, f39, f119 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f122 = ALPHA1, f40, f122 + } + ;; + { .mmf + (p16) LDFD f36 = [AO5], LDA + (p16) LDFD f37 = [AO6], LDA + (p17) FMA f101 = ALPHA2, f41, f101 + } + { .mmf + (p16) LDFD f38 = [AO7], LDA + (p16) LDFD f39 = [AO8], LDA + (p17) FMA f104 = ALPHA2, f42, f104 + } + ;; + { .mmf + (p16) LDFD f40 = [AO1], LDA + (p16) LDFD f41 = [AO2], LDA + (p17) FMA f107 = ALPHA2, f43, f107 + } + { .mmf + (p16) LDFD f42 = [AO3], LDA + (p16) LDFD f43 = [AO4], LDA + (p17) FMA f110 = ALPHA2, f44, f110 + } + ;; + { .mmf + (p16) LDFD f106 = [YLD1], 2 * SIZE + (p16) LDFD f109 = [YLD2], 2 * SIZE + (p17) FMA f113 = ALPHA2, f45, f113 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f116 = ALPHA2, f46, f116 + } + ;; + { .mmf + (p18) STFD [YST1] = f108, 2 * SIZE + (p18) STFD [YST2] = f111, 2 * SIZE + (p17) FMA f119 = ALPHA2, f47, f119 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f122 = ALPHA2, f48, f122 + } + ;; + { .mmf + (p16) LDFD f44 = [AO5], LDA + (p16) LDFD f45 = [AO6], LDA + (p17) FMA f101 = ALPHA3, f49, f101 + } + { .mmf + (p16) LDFD f46 = [AO7], LDA + (p16) LDFD f47 = [AO8], LDA + (p17) FMA f104 = ALPHA3, f50, f104 + } + ;; + { .mmf + (p16) LDFD f48 = [AO1], LDA + (p16) LDFD f49 = [AO2], LDA + (p17) FMA f107 = ALPHA3, f51, f107 + } + { .mmf + (p16) LDFD f50 = [AO3], LDA + (p16) LDFD f51 = [AO4], LDA + (p17) FMA f110 = ALPHA3, f52, f110 + } + ;; + { .mmf + (p16) LDFD f112 = [YLD1], 2 * SIZE + (p16) LDFD f115 = [YLD2], 2 * SIZE + (p17) FMA f113 = ALPHA3, f53, f113 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f116 = ALPHA3, f54, f116 + } + ;; + { .mmf + (p18) STFD [YST1] = f114, 2 * SIZE + (p18) STFD [YST2] = f117, 2 * SIZE + (p17) FMA f119 = ALPHA3, f55, f119 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f122 = ALPHA3, f56, f122 + } + ;; + { .mmf + (p16) LDFD f52 = [AO5], LDA + (p16) LDFD f53 = [AO6], LDA + (p17) FMA f101 = ALPHA4, f57, f101 + } + { .mmf + (p16) LDFD f54 = [AO7], LDA + (p16) LDFD f55 = [AO8], LDA + (p17) FMA f104 = ALPHA4, f58, f104 + } + ;; + { .mmf + (p16) LDFD f56 = [AO1], LDA + (p16) LDFD f57 = [AO2], LDA + (p17) FMA f107 = ALPHA4, f59, f107 + } + { .mmf + (p16) LDFD f58 = [AO3], LDA + (p16) LDFD f59 = [AO4], LDA + (p17) FMA f110 = ALPHA4, f60, f110 + } + ;; + { .mmf + (p16) LDFD f118 = [YLD1], 2 * SIZE + (p16) LDFD f121 = [YLD2], 2 * SIZE + (p17) FMA f113 = ALPHA4, f61, f113 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f116 = ALPHA4, f62, f116 + } + ;; + { .mmf + (p18) STFD [YST1] = f120, 2 * SIZE + (p18) STFD [YST2] = f123, 2 * SIZE + (p17) FMA f119 = ALPHA4, f63, f119 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f122 = ALPHA4, f64, f122 + } + ;; + { .mmf + (p16) LDFD 
f60 = [AO5], LDA + (p16) LDFD f61 = [AO6], LDA + (p17) FMA f101 = ALPHA5, f65, f101 + } + { .mmf + (p16) LDFD f62 = [AO7], LDA + (p16) LDFD f63 = [AO8], LDA + (p17) FMA f104 = ALPHA5, f66, f104 + } + ;; + { .mmf + (p16) LDFD f64 = [AO1], LDA + (p16) LDFD f65 = [AO2], LDA + (p17) FMA f107 = ALPHA5, f67, f107 + } + { .mmf + (p16) LDFD f66 = [AO3], LDA + (p16) LDFD f67 = [AO4], LDA + (p17) FMA f110 = ALPHA5, f68, f110 + } + ;; + { .mmf + (p16) WPREFETCH [PREB], 8 * SIZE + nop __LINE__ + (p17) FMA f113 = ALPHA5, f69, f113 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f116 = ALPHA5, f70, f116 + } + ;; + { .mmf + (p16) RPREFETCH [PREA] + nop __LINE__ + (p17) FMA f119 = ALPHA5, f71, f119 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f122 = ALPHA5, f72, f122 + } + ;; + { .mmf + (p16) LDFD f68 = [AO5], LDA + (p16) LDFD f69 = [AO6], LDA + (p17) FMA f101 = ALPHA6, f73, f101 + } + { .mmf + (p16) LDFD f70 = [AO7], LDA + (p16) LDFD f71 = [AO8], LDA + (p17) FMA f104 = ALPHA6, f74, f104 + } + ;; + { .mmf + (p16) LDFD f72 = [AO1], LDA + (p16) LDFD f73 = [AO2], LDA + (p17) FMA f107 = ALPHA6, f75, f107 + } + { .mmf + (p16) LDFD f74 = [AO3], LDA + (p16) LDFD f75 = [AO4], LDA + (p17) FMA f110 = ALPHA6, f76, f110 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f113 = ALPHA6, f77, f113 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f116 = ALPHA6, f78, f116 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f119 = ALPHA6, f79, f119 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f122 = ALPHA6, f80, f122 + } + ;; + { .mmf + (p16) LDFD f76 = [AO5], LDA + (p16) LDFD f77 = [AO6], LDA + (p17) FMA f101 = ALPHA7, f81, f101 + } + { .mmf + (p16) LDFD f78 = [AO7], LDA + (p16) LDFD f79 = [AO8], LDA + (p17) FMA f104 = ALPHA7, f82, f104 + } + ;; + { .mmf + (p16) LDFD f80 = [AO1], LDA + (p16) LDFD f81 = [AO2], LDA + (p17) FMA f107 = ALPHA7, f83, f107 + } + { .mmf + (p16) LDFD f82 = [AO3], LDA + (p16) LDFD f83 = [AO4], LDA + (p17) FMA f110 = ALPHA7, f84, f110 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f113 = ALPHA7, f85, f113 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f116 = ALPHA7, f86, f116 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f119 = ALPHA7, f87, f119 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f122 = ALPHA7, f88, f122 + } + ;; + { .mmf + (p16) LDFD f84 = [AO5], LDA + (p16) LDFD f85 = [AO6], LDA + (p17) FMA f101 = ALPHA8, f89, f101 + } + { .mmf + (p16) LDFD f86 = [AO7], LDA + (p16) LDFD f87 = [AO8], LDA + (p17) FMA f104 = ALPHA8, f90, f104 + } + ;; + { .mmf + (p16) LDFD f88 = [AO1], LDA7M8 + (p16) LDFD f89 = [AO2], LDA7M8 + (p17) FMA f107 = ALPHA8, f91, f107 + } + { .mmf + (p16) LDFD f90 = [AO3], LDA7M8 + (p16) LDFD f91 = [AO4], LDA7M8 + (p17) FMA f110 = ALPHA8, f92, f110 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f113 = ALPHA8, f93, f113 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f116 = ALPHA8, f94, f116 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f119 = ALPHA8, f95, f119 + } + { .mfb + nop __LINE__ + (p17) FMA f122 = ALPHA8, f96, f122 + br.ctop.sptk.few .L12 + } + ;; + { .mmi + (p18) STFD [YST1] = f102, 2 * SIZE + (p18) STFD [YST2] = f105, 2 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) STFD [YST1] = f108, 2 * SIZE + (p18) STFD [YST2] = f111, 2 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) STFD [YST1] = f114, 2 * SIZE + (p18) STFD [YST2] = f117, 2 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) STFD [YST1] = f120, 2 * SIZE + (p18) STFD [YST2] = f123, 2 * SIZE + 
nop __LINE__ + } + ;; + .align 16 + +.L15: + { .mmi + (p7) cmp.eq.unc p9, p0 = r0, II + (p8) cmp.eq.unc p10, p0 = r0, II + (p11) adds AO5 = - 4 * SIZE, AO5 + } + { .mbb + (p11) adds AO7 = - 4 * SIZE, AO7 + (p9) br.cond.dptk .L11 + (p10) br.cond.dptk .L20 + } + ;; + { .mmi + (p13) LDFD f32 = [AO1], LDA + (p13) LDFD f33 = [AO2], LDA + tbit.nz p15, p0 = M, 0 + } + { .mmi + (p13) LDFD f34 = [AO3], LDA + (p11) adds AO6 = - 4 * SIZE, AO6 + (p12) adds AO7 = - 2 * SIZE, AO7 + } + ;; + (p13) LDFD f35 = [AO4], LDA + (p14) LDFD f36 = [AO5], LDA + (p14) LDFD f37 = [AO6], LDA + (p15) LDFD f38 = [AO7], LDA + ;; + (p13) LDFD f40 = [AO1], LDA + (p13) LDFD f41 = [AO2], LDA + (p13) LDFD f42 = [AO3], LDA + (p13) LDFD f43 = [AO4], LDA + ;; + (p14) LDFD f44 = [AO5], LDA + (p14) LDFD f45 = [AO6], LDA + (p15) LDFD f46 = [AO7], LDA + ;; + (p13) LDFD f48 = [AO1], LDA + (p13) LDFD f49 = [AO2], LDA + (p13) LDFD f50 = [AO3], LDA + (p13) LDFD f51 = [AO4], LDA + ;; + (p14) LDFD f52 = [AO5], LDA + (p14) LDFD f53 = [AO6], LDA + (p15) LDFD f54 = [AO7], LDA + ;; + (p13) LDFD f56 = [AO1], LDA + (p13) LDFD f57 = [AO2], LDA + (p13) LDFD f58 = [AO3], LDA + (p13) LDFD f59 = [AO4], LDA + ;; + (p14) LDFD f60 = [AO5], LDA + (p14) LDFD f61 = [AO6], LDA + (p15) LDFD f62 = [AO7], LDA + ;; + (p13) LDFD f64 = [AO1], LDA + (p13) LDFD f65 = [AO2], LDA + (p13) LDFD f66 = [AO3], LDA + (p13) LDFD f67 = [AO4], LDA + ;; + (p14) LDFD f68 = [AO5], LDA + (p14) LDFD f69 = [AO6], LDA + (p15) LDFD f70 = [AO7], LDA + ;; + (p13) LDFD f72 = [AO1], LDA + (p13) LDFD f73 = [AO2], LDA + (p13) LDFD f74 = [AO3], LDA + (p13) LDFD f75 = [AO4], LDA + ;; + (p14) LDFD f76 = [AO5], LDA + (p14) LDFD f77 = [AO6], LDA + (p15) LDFD f78 = [AO7], LDA + ;; + (p13) LDFD f80 = [AO1], LDA + (p13) LDFD f81 = [AO2], LDA + (p13) LDFD f82 = [AO3], LDA + (p13) LDFD f83 = [AO4], LDA + ;; + (p14) LDFD f84 = [AO5], LDA + (p14) LDFD f85 = [AO6], LDA + (p15) LDFD f86 = [AO7], LDA + ;; + (p13) LDFD f88 = [AO1] + (p13) LDFD f89 = [AO2] + (p13) LDFD f90 = [AO3] + (p13) LDFD f91 = [AO4] + ;; + (p14) LDFD f92 = [AO5] + (p14) LDFD f93 = [AO6] + (p15) LDFD f94 = [AO7] + ;; + (p13) LDFD f96 = [YLD1], 2 * SIZE + (p13) LDFD f97 = [YLD2], 2 * SIZE + ;; + (p13) LDFD f98 = [YLD1], 2 * SIZE + (p13) LDFD f99 = [YLD2], 2 * SIZE + ;; + (p14) LDFD f100 = [YLD1], 1 * SIZE + ;; + (p14) LDFD f101 = [YLD1], 1 * SIZE + ;; + (p15) LDFD f102 = [YLD1], 1 * SIZE + ;; + + (p13) FMA f96 = ALPHA1, f32, f96 + (p13) FMA f97 = ALPHA1, f33, f97 + (p13) FMA f98 = ALPHA1, f34, f98 + (p13) FMA f99 = ALPHA1, f35, f99 + (p14) FMA f100 = ALPHA1, f36, f100 + (p14) FMA f101 = ALPHA1, f37, f101 + (p15) FMA f102 = ALPHA1, f38, f102 + ;; + (p13) FMA f96 = ALPHA2, f40, f96 + (p13) FMA f97 = ALPHA2, f41, f97 + (p13) FMA f98 = ALPHA2, f42, f98 + (p13) FMA f99 = ALPHA2, f43, f99 + (p14) FMA f100 = ALPHA2, f44, f100 + (p14) FMA f101 = ALPHA2, f45, f101 + (p15) FMA f102 = ALPHA2, f46, f102 + ;; + (p13) FMA f96 = ALPHA3, f48, f96 + (p13) FMA f97 = ALPHA3, f49, f97 + (p13) FMA f98 = ALPHA3, f50, f98 + (p13) FMA f99 = ALPHA3, f51, f99 + (p14) FMA f100 = ALPHA3, f52, f100 + (p14) FMA f101 = ALPHA3, f53, f101 + (p15) FMA f102 = ALPHA3, f54, f102 + ;; + (p13) FMA f96 = ALPHA4, f56, f96 + (p13) FMA f97 = ALPHA4, f57, f97 + (p13) FMA f98 = ALPHA4, f58, f98 + (p13) FMA f99 = ALPHA4, f59, f99 + (p14) FMA f100 = ALPHA4, f60, f100 + (p14) FMA f101 = ALPHA4, f61, f101 + (p15) FMA f102 = ALPHA4, f62, f102 + ;; + (p13) FMA f96 = ALPHA5, f64, f96 + (p13) FMA f97 = ALPHA5, f65, f97 + (p13) FMA f98 = ALPHA5, f66, f98 + (p13) FMA f99 = ALPHA5, f67, 
f99 + (p14) FMA f100 = ALPHA5, f68, f100 + (p14) FMA f101 = ALPHA5, f69, f101 + (p15) FMA f102 = ALPHA5, f70, f102 + ;; + (p13) FMA f96 = ALPHA6, f72, f96 + (p13) FMA f97 = ALPHA6, f73, f97 + (p13) FMA f98 = ALPHA6, f74, f98 + (p13) FMA f99 = ALPHA6, f75, f99 + (p14) FMA f100 = ALPHA6, f76, f100 + (p14) FMA f101 = ALPHA6, f77, f101 + (p15) FMA f102 = ALPHA6, f78, f102 + ;; + (p13) FMA f96 = ALPHA7, f80, f96 + (p13) FMA f97 = ALPHA7, f81, f97 + (p13) FMA f98 = ALPHA7, f82, f98 + (p13) FMA f99 = ALPHA7, f83, f99 + (p14) FMA f100 = ALPHA7, f84, f100 + (p14) FMA f101 = ALPHA7, f85, f101 + (p15) FMA f102 = ALPHA7, f86, f102 + ;; + (p13) FMA f16 = ALPHA8, f88, f96 + (p13) FMA f17 = ALPHA8, f89, f97 + (p13) FMA f18 = ALPHA8, f90, f98 + (p13) FMA f19 = ALPHA8, f91, f99 + (p14) FMA f20 = ALPHA8, f92, f100 + (p14) FMA f21 = ALPHA8, f93, f101 + (p15) FMA f22 = ALPHA8, f94, f102 + ;; + { .mmi + (p13) STFD [YST1] = f16, 2 * SIZE + (p13) STFD [YST2] = f17, 2 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p13) STFD [YST1] = f18, 2 * SIZE + (p13) STFD [YST2] = f19 + nop __LINE__ + } + ;; + { .mmi + (p14) STFD [YST1] = f20, 1 * SIZE + ;; + (p14) STFD [YST1] = f21, 1 * SIZE + nop __LINE__ + } + ;; + { .mib + (p15) STFD [YST1] = f22 + cmp.lt p11, p12 = r0, J + (p11) br.cond.dptk .L11 + } + ;; + .align 16 + +.L20: + tbit.z p6, p0 = N, 2 + ;; + (p6) br.cond.dpnt .L30 + ;; + + shladd LDA7M8 = LDA, 2, r0 + ;; + sub LDA7M8 = LDA, LDA7M8 + ;; + adds LDA7M8 = 8 * SIZE, LDA7M8 + ;; + mov YLD1 = YY + mov YST1 = YY + adds YLD2 = 2 * SIZE, YY + adds YST2 = 2 * SIZE, YY + ;; + LDFD ALPHA1 = [X], INCX + ;; + LDFD ALPHA2 = [X], INCX + ;; + LDFD ALPHA3 = [X], INCX + ;; + LDFD ALPHA4 = [X], INCX + ;; + FMPY ALPHA1 = ALPHA, ALPHA1 + FMPY ALPHA2 = ALPHA, ALPHA2 + FMPY ALPHA3 = ALPHA, ALPHA3 + FMPY ALPHA4 = ALPHA, ALPHA4 + ;; + mov AO1 = A + adds AO2 = 1 * SIZE, A + adds AO3 = 2 * SIZE, A + adds AO4 = 3 * SIZE, A + adds AO5 = 4 * SIZE, A + adds AO6 = 5 * SIZE, A + adds AO7 = 6 * SIZE, A + adds AO8 = 7 * SIZE, A + shladd A = LDA, 2, A + ;; + shr I = M, 3 + mov pr.rot= 0 + ;; + cmp.eq p16, p0 = r0, r0 + ;; + adds I = -1, I + adds J = -1, J + ;; + cmp.lt p7, p8 = r0, J + tbit.nz p13, p11 = M, 2 + tbit.nz p14, p12 = M, 1 + mov ar.ec= 1 + ;; + { .mfi + and II = 7, M + mov ar.lc = I + } + { .mfb + cmp.eq p6, p0 = -1, I + (p6) br.cond.dpnt .L25 + } + ;; + .align 16 + +.L22: + (p16) LDFD f32 = [AO1], LDA + (p16) LDFD f34 = [AO3], LDA + (p16) LDFD f36 = [AO5], LDA + (p16) LDFD f38 = [AO7], LDA + ;; + (p16) LDFD f33 = [AO2], LDA + (p16) LDFD f35 = [AO4], LDA + (p16) LDFD f37 = [AO6], LDA + (p16) LDFD f39 = [AO8], LDA + ;; + (p16) LDFD f40 = [AO1], LDA + (p16) LDFD f42 = [AO3], LDA + (p16) LDFD f44 = [AO5], LDA + (p16) LDFD f46 = [AO7], LDA + ;; + (p16) LDFD f41 = [AO2], LDA + (p16) LDFD f43 = [AO4], LDA + (p16) LDFD f45 = [AO6], LDA + (p16) LDFD f47 = [AO8], LDA + ;; + (p16) LDFD f48 = [AO1], LDA + (p16) LDFD f50 = [AO3], LDA + (p16) LDFD f52 = [AO5], LDA + (p16) LDFD f54 = [AO7], LDA + ;; + (p16) LDFD f49 = [AO2], LDA + (p16) LDFD f51 = [AO4], LDA + (p16) LDFD f53 = [AO6], LDA + (p16) LDFD f55 = [AO8], LDA + ;; + (p16) LDFD f56 = [AO1], LDA7M8 + (p16) LDFD f58 = [AO3], LDA7M8 + (p16) LDFD f60 = [AO5], LDA7M8 + (p16) LDFD f62 = [AO7], LDA7M8 + ;; + (p16) LDFD f57 = [AO2], LDA7M8 + (p16) LDFD f59 = [AO4], LDA7M8 + (p16) LDFD f61 = [AO6], LDA7M8 + (p16) LDFD f63 = [AO8], LDA7M8 + ;; + (p16) LDFD f96 = [YLD1], 1 * SIZE + (p16) LDFD f98 = [YLD2], 1 * SIZE + ;; + (p16) LDFD f97 = [YLD1], 3 * SIZE + (p16) LDFD f99 = [YLD2], 3 * SIZE + ;; + (p16) 
LDFD f100 = [YLD1], 1 * SIZE + (p16) LDFD f102 = [YLD2], 1 * SIZE + ;; + (p16) LDFD f101 = [YLD1], 3 * SIZE + (p16) LDFD f103 = [YLD2], 3 * SIZE + ;; + (p16) FMA f96 = ALPHA1, f32, f96 + (p16) FMA f98 = ALPHA1, f34, f98 + (p16) FMA f97 = ALPHA1, f33, f97 + (p16) FMA f99 = ALPHA1, f35, f99 + (p16) FMA f100 = ALPHA1, f36, f100 + (p16) FMA f102 = ALPHA1, f38, f102 + (p16) FMA f101 = ALPHA1, f37, f101 + (p16) FMA f103 = ALPHA1, f39, f103 + ;; + (p16) FMA f96 = ALPHA2, f40, f96 + (p16) FMA f98 = ALPHA2, f42, f98 + (p16) FMA f97 = ALPHA2, f41, f97 + (p16) FMA f99 = ALPHA2, f43, f99 + (p16) FMA f100 = ALPHA2, f44, f100 + (p16) FMA f102 = ALPHA2, f46, f102 + (p16) FMA f101 = ALPHA2, f45, f101 + (p16) FMA f103 = ALPHA2, f47, f103 + ;; + (p16) FMA f96 = ALPHA3, f48, f96 + (p16) FMA f98 = ALPHA3, f50, f98 + (p16) FMA f97 = ALPHA3, f49, f97 + (p16) FMA f99 = ALPHA3, f51, f99 + (p16) FMA f100 = ALPHA3, f52, f100 + (p16) FMA f102 = ALPHA3, f54, f102 + (p16) FMA f101 = ALPHA3, f53, f101 + (p16) FMA f103 = ALPHA3, f55, f103 + ;; + (p16) FMA f16 = ALPHA4, f56, f96 + (p16) FMA f18 = ALPHA4, f58, f98 + (p16) FMA f17 = ALPHA4, f57, f97 + (p16) FMA f19 = ALPHA4, f59, f99 + (p16) FMA f20 = ALPHA4, f60, f100 + (p16) FMA f22 = ALPHA4, f62, f102 + (p16) FMA f21 = ALPHA4, f61, f101 + (p16) FMA f23 = ALPHA4, f63, f103 + ;; + (p16) STFD [YST1] = f16, 1 * SIZE + (p16) STFD [YST2] = f18, 1 * SIZE + ;; + (p16) STFD [YST1] = f17, 3 * SIZE + (p16) STFD [YST2] = f19, 3 * SIZE + ;; + (p16) STFD [YST1] = f20, 1 * SIZE + (p16) STFD [YST2] = f22, 1 * SIZE + ;; + (p16) STFD [YST1] = f21, 3 * SIZE + (p16) STFD [YST2] = f23, 3 * SIZE + br.ctop.sptk.few .L22 + ;; + .align 16 + +.L25: + { .mmi + (p8) cmp.eq.unc p10, p0 = r0, II + (p11) adds AO5 = - 4 * SIZE, AO5 + } + { .mbb + (p11) adds AO7 = - 4 * SIZE, AO7 + (p10) br.cond.dptk .L30 + } + ;; + { .mmi + (p13) LDFD f32 = [AO1], LDA + (p13) LDFD f34 = [AO3], LDA + tbit.nz p15, p0 = M, 0 + } + { .mmi + (p14) LDFD f36 = [AO5], LDA + (p11) adds AO6 = - 4 * SIZE, AO6 + (p12) adds AO7 = - 2 * SIZE, AO7 + } + ;; + (p13) LDFD f33 = [AO2], LDA + (p13) LDFD f35 = [AO4], LDA + (p14) LDFD f37 = [AO6], LDA + (p15) LDFD f38 = [AO7], LDA + ;; + (p13) LDFD f40 = [AO1], LDA + (p13) LDFD f42 = [AO3], LDA + (p14) LDFD f44 = [AO5], LDA + (p15) LDFD f46 = [AO7], LDA + ;; + (p13) LDFD f41 = [AO2], LDA + (p13) LDFD f43 = [AO4], LDA + (p14) LDFD f45 = [AO6], LDA + ;; + (p13) LDFD f48 = [AO1], LDA + (p13) LDFD f50 = [AO3], LDA + (p14) LDFD f52 = [AO5], LDA + (p15) LDFD f54 = [AO7], LDA + ;; + (p13) LDFD f49 = [AO2], LDA + (p13) LDFD f51 = [AO4], LDA + (p14) LDFD f53 = [AO6], LDA + ;; + (p13) LDFD f56 = [AO1] + (p13) LDFD f58 = [AO3] + (p14) LDFD f60 = [AO5] + (p15) LDFD f62 = [AO7] + ;; + (p13) LDFD f57 = [AO2] + (p13) LDFD f59 = [AO4] + (p14) LDFD f61 = [AO6] + ;; + (p13) LDFD f96 = [YLD1], 1 * SIZE + (p13) LDFD f98 = [YLD2], 1 * SIZE + ;; + (p13) LDFD f97 = [YLD1], 3 * SIZE + (p13) LDFD f99 = [YLD2], 3 * SIZE + ;; + (p14) LDFD f100 = [YLD1], 1 * SIZE + ;; + (p14) LDFD f101 = [YLD1], 1 * SIZE + ;; + (p15) LDFD f102 = [YLD1], 1 * SIZE + ;; + + (p13) FMA f96 = ALPHA1, f32, f96 + (p13) FMA f98 = ALPHA1, f34, f98 + (p13) FMA f97 = ALPHA1, f33, f97 + (p13) FMA f99 = ALPHA1, f35, f99 + (p14) FMA f100 = ALPHA1, f36, f100 + (p15) FMA f102 = ALPHA1, f38, f102 + (p14) FMA f101 = ALPHA1, f37, f101 + ;; + (p13) FMA f96 = ALPHA2, f40, f96 + (p13) FMA f98 = ALPHA2, f42, f98 + (p13) FMA f97 = ALPHA2, f41, f97 + (p13) FMA f99 = ALPHA2, f43, f99 + (p14) FMA f100 = ALPHA2, f44, f100 + (p15) FMA f102 = ALPHA2, f46, f102 + 
(p14) FMA f101 = ALPHA2, f45, f101 + ;; + (p13) FMA f96 = ALPHA3, f48, f96 + (p13) FMA f98 = ALPHA3, f50, f98 + (p13) FMA f97 = ALPHA3, f49, f97 + (p13) FMA f99 = ALPHA3, f51, f99 + (p14) FMA f100 = ALPHA3, f52, f100 + (p15) FMA f102 = ALPHA3, f54, f102 + (p14) FMA f101 = ALPHA3, f53, f101 + ;; + (p13) FMA f16 = ALPHA4, f56, f96 + (p13) FMA f18 = ALPHA4, f58, f98 + (p13) FMA f17 = ALPHA4, f57, f97 + (p13) FMA f19 = ALPHA4, f59, f99 + (p14) FMA f20 = ALPHA4, f60, f100 + (p15) FMA f22 = ALPHA4, f62, f102 + (p14) FMA f21 = ALPHA4, f61, f101 + ;; + { .mmi + (p13) STFD [YST1] = f16, 1 * SIZE + (p13) STFD [YST2] = f18, 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p13) STFD [YST1] = f17, 3 * SIZE + (p13) STFD [YST2] = f19 + nop __LINE__ + } + ;; + { .mmi + (p14) STFD [YST1] = f20, 1 * SIZE + ;; + (p14) STFD [YST1] = f21, 1 * SIZE + nop __LINE__ + } + ;; + { .mib + (p15) STFD [YST1] = f22 + } + ;; + .align 16 + +.L30: + tbit.z p6, p0 = N, 1 + ;; + (p6) br.cond.dpnt .L40 + ;; + + shladd LDA7M8 = LDA, 1, r0 + ;; + sub LDA7M8 = LDA, LDA7M8 + ;; + adds LDA7M8 = 8 * SIZE, LDA7M8 + ;; + mov YLD1 = YY + mov YST1 = YY + adds YLD2 = 2 * SIZE, YY + adds YST2 = 2 * SIZE, YY + ;; + LDFD ALPHA1 = [X], INCX + ;; + LDFD ALPHA2 = [X], INCX + ;; + FMPY ALPHA1 = ALPHA, ALPHA1 + FMPY ALPHA2 = ALPHA, ALPHA2 + ;; + mov AO1 = A + adds AO2 = 1 * SIZE, A + adds AO3 = 2 * SIZE, A + adds AO4 = 3 * SIZE, A + adds AO5 = 4 * SIZE, A + adds AO6 = 5 * SIZE, A + adds AO7 = 6 * SIZE, A + adds AO8 = 7 * SIZE, A + shladd A = LDA, 1, A + ;; + shr I = M, 3 + mov pr.rot= 0 + ;; + cmp.eq p16, p0 = r0, r0 + ;; + adds I = -1, I + adds J = -1, J + ;; + cmp.lt p7, p8 = r0, J + tbit.nz p13, p11 = M, 2 + tbit.nz p14, p12 = M, 1 + mov ar.ec= 1 + ;; + { .mfi + and II = 7, M + mov ar.lc = I + } + { .mfb + cmp.eq p6, p0 = -1, I + (p6) br.cond.dpnt .L35 + } + ;; + .align 16 + +.L32: + (p16) LDFD f32 = [AO1], LDA + (p16) LDFD f34 = [AO3], LDA + (p16) LDFD f36 = [AO5], LDA + (p16) LDFD f38 = [AO7], LDA + ;; + (p16) LDFD f33 = [AO2], LDA + (p16) LDFD f35 = [AO4], LDA + (p16) LDFD f37 = [AO6], LDA + (p16) LDFD f39 = [AO8], LDA + ;; + (p16) LDFD f40 = [AO1], LDA7M8 + (p16) LDFD f42 = [AO3], LDA7M8 + (p16) LDFD f44 = [AO5], LDA7M8 + (p16) LDFD f46 = [AO7], LDA7M8 + ;; + (p16) LDFD f41 = [AO2], LDA7M8 + (p16) LDFD f43 = [AO4], LDA7M8 + (p16) LDFD f45 = [AO6], LDA7M8 + (p16) LDFD f47 = [AO8], LDA7M8 + ;; + (p16) LDFD f96 = [YLD1], 1 * SIZE + (p16) LDFD f98 = [YLD2], 1 * SIZE + ;; + (p16) LDFD f97 = [YLD1], 3 * SIZE + (p16) LDFD f99 = [YLD2], 3 * SIZE + ;; + (p16) LDFD f100 = [YLD1], 1 * SIZE + (p16) LDFD f102 = [YLD2], 1 * SIZE + ;; + (p16) LDFD f101 = [YLD1], 3 * SIZE + (p16) LDFD f103 = [YLD2], 3 * SIZE + ;; + (p16) FMA f96 = ALPHA1, f32, f96 + (p16) FMA f98 = ALPHA1, f34, f98 + (p16) FMA f97 = ALPHA1, f33, f97 + (p16) FMA f99 = ALPHA1, f35, f99 + (p16) FMA f100 = ALPHA1, f36, f100 + (p16) FMA f102 = ALPHA1, f38, f102 + (p16) FMA f101 = ALPHA1, f37, f101 + (p16) FMA f103 = ALPHA1, f39, f103 + ;; + (p16) FMA f16 = ALPHA2, f40, f96 + (p16) FMA f18 = ALPHA2, f42, f98 + (p16) FMA f17 = ALPHA2, f41, f97 + (p16) FMA f19 = ALPHA2, f43, f99 + (p16) FMA f20 = ALPHA2, f44, f100 + (p16) FMA f22 = ALPHA2, f46, f102 + (p16) FMA f21 = ALPHA2, f45, f101 + (p16) FMA f23 = ALPHA2, f47, f103 + ;; + (p16) STFD [YST1] = f16, 1 * SIZE + (p16) STFD [YST2] = f18, 1 * SIZE + ;; + (p16) STFD [YST1] = f17, 3 * SIZE + (p16) STFD [YST2] = f19, 3 * SIZE + ;; + (p16) STFD [YST1] = f20, 1 * SIZE + (p16) STFD [YST2] = f22, 1 * SIZE + ;; + (p16) STFD [YST1] = f21, 3 * SIZE + (p16) 
STFD [YST2] = f23, 3 * SIZE + br.ctop.sptk.few .L32 + ;; + .align 16 + +.L35: + { .mmi + (p8) cmp.eq.unc p10, p0 = r0, II + (p11) adds AO5 = - 4 * SIZE, AO5 + } + { .mbb + (p11) adds AO7 = - 4 * SIZE, AO7 + (p10) br.cond.dptk .L40 + } + ;; + { .mmi + (p13) LDFD f32 = [AO1], LDA + (p13) LDFD f34 = [AO3], LDA + tbit.nz p15, p0 = M, 0 + } + { .mmi + (p14) LDFD f36 = [AO5], LDA + (p11) adds AO6 = - 4 * SIZE, AO6 + (p12) adds AO7 = - 2 * SIZE, AO7 + } + ;; + (p13) LDFD f33 = [AO2], LDA + (p13) LDFD f35 = [AO4], LDA + (p14) LDFD f37 = [AO6], LDA + (p15) LDFD f38 = [AO7], LDA + ;; + (p13) LDFD f40 = [AO1], LDA + (p13) LDFD f42 = [AO3], LDA + (p14) LDFD f44 = [AO5], LDA + (p15) LDFD f46 = [AO7], LDA + ;; + (p13) LDFD f41 = [AO2] + (p13) LDFD f43 = [AO4] + (p14) LDFD f45 = [AO6] + ;; + (p13) LDFD f96 = [YLD1], 1 * SIZE + (p13) LDFD f98 = [YLD2], 1 * SIZE + ;; + (p13) LDFD f97 = [YLD1], 3 * SIZE + (p13) LDFD f99 = [YLD2], 3 * SIZE + ;; + (p14) LDFD f100 = [YLD1], 1 * SIZE + ;; + (p14) LDFD f101 = [YLD1], 1 * SIZE + ;; + (p15) LDFD f102 = [YLD1], 1 * SIZE + ;; + + (p13) FMA f96 = ALPHA1, f32, f96 + (p13) FMA f98 = ALPHA1, f34, f98 + (p13) FMA f97 = ALPHA1, f33, f97 + (p13) FMA f99 = ALPHA1, f35, f99 + (p14) FMA f100 = ALPHA1, f36, f100 + (p15) FMA f102 = ALPHA1, f38, f102 + (p14) FMA f101 = ALPHA1, f37, f101 + ;; + (p13) FMA f16 = ALPHA2, f40, f96 + (p13) FMA f18 = ALPHA2, f42, f98 + (p13) FMA f17 = ALPHA2, f41, f97 + (p13) FMA f19 = ALPHA2, f43, f99 + (p14) FMA f20 = ALPHA2, f44, f100 + (p15) FMA f22 = ALPHA2, f46, f102 + (p14) FMA f21 = ALPHA2, f45, f101 + ;; + { .mmi + (p13) STFD [YST1] = f16, 1 * SIZE + (p13) STFD [YST2] = f18, 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p13) STFD [YST1] = f17, 3 * SIZE + (p13) STFD [YST2] = f19 + nop __LINE__ + } + ;; + { .mmi + (p14) STFD [YST1] = f20, 1 * SIZE + ;; + (p14) STFD [YST1] = f21, 1 * SIZE + nop __LINE__ + } + ;; + { .mib + (p15) STFD [YST1] = f22 + } + ;; + .align 16 + +.L40: + tbit.z p6, p0 = N, 0 + ;; + (p6) br.cond.dpnt .L990 + ;; + mov LDA7M8 = 8 * SIZE + ;; + mov YLD1 = YY + mov YST1 = YY + adds YLD2 = 2 * SIZE, YY + adds YST2 = 2 * SIZE, YY + ;; + LDFD ALPHA1 = [X], INCX + ;; + LDFD ALPHA2 = [X], INCX + ;; + FMPY ALPHA1 = ALPHA, ALPHA1 + FMPY ALPHA2 = ALPHA, ALPHA2 + ;; + mov AO1 = A + adds AO2 = 1 * SIZE, A + adds AO3 = 2 * SIZE, A + adds AO4 = 3 * SIZE, A + adds AO5 = 4 * SIZE, A + adds AO6 = 5 * SIZE, A + adds AO7 = 6 * SIZE, A + adds AO8 = 7 * SIZE, A + add A = LDA, A + ;; + shr I = M, 3 + mov pr.rot= 0 + ;; + cmp.eq p16, p0 = r0, r0 + ;; + adds I = -1, I + adds J = -1, J + ;; + cmp.lt p7, p8 = r0, J + tbit.nz p13, p11 = M, 2 + tbit.nz p14, p12 = M, 1 + mov ar.ec= 1 + ;; + { .mfi + and II = 7, M + mov ar.lc = I + } + { .mfb + cmp.eq p6, p0 = -1, I + (p6) br.cond.dpnt .L45 + } + ;; + .align 16 + +.L42: + (p16) LDFD f32 = [AO1], 8 * SIZE + (p16) LDFD f34 = [AO3], 8 * SIZE + (p16) LDFD f36 = [AO5], 8 * SIZE + (p16) LDFD f38 = [AO7], 8 * SIZE + ;; + (p16) LDFD f33 = [AO2], 8 * SIZE + (p16) LDFD f35 = [AO4], 8 * SIZE + (p16) LDFD f37 = [AO6], 8 * SIZE + (p16) LDFD f39 = [AO8], 8 * SIZE + ;; + (p16) LDFD f96 = [YLD1], 1 * SIZE + (p16) LDFD f98 = [YLD2], 1 * SIZE + ;; + (p16) LDFD f97 = [YLD1], 3 * SIZE + (p16) LDFD f99 = [YLD2], 3 * SIZE + ;; + (p16) LDFD f100 = [YLD1], 1 * SIZE + (p16) LDFD f102 = [YLD2], 1 * SIZE + ;; + (p16) LDFD f101 = [YLD1], 3 * SIZE + (p16) LDFD f103 = [YLD2], 3 * SIZE + ;; + (p16) FMA f16 = ALPHA1, f32, f96 + (p16) FMA f18 = ALPHA1, f34, f98 + (p16) FMA f17 = ALPHA1, f33, f97 + (p16) FMA f19 = ALPHA1, f35, f99 + (p16) 
FMA f20 = ALPHA1, f36, f100 + (p16) FMA f22 = ALPHA1, f38, f102 + (p16) FMA f21 = ALPHA1, f37, f101 + (p16) FMA f23 = ALPHA1, f39, f103 + ;; + (p16) STFD [YST1] = f16, 1 * SIZE + (p16) STFD [YST2] = f18, 1 * SIZE + ;; + (p16) STFD [YST1] = f17, 3 * SIZE + (p16) STFD [YST2] = f19, 3 * SIZE + ;; + (p16) STFD [YST1] = f20, 1 * SIZE + (p16) STFD [YST2] = f22, 1 * SIZE + ;; + (p16) STFD [YST1] = f21, 3 * SIZE + (p16) STFD [YST2] = f23, 3 * SIZE + br.ctop.sptk.few .L42 + ;; + .align 16 + +.L45: + { .mmi + (p8) cmp.eq.unc p10, p0 = r0, II + (p11) adds AO5 = - 4 * SIZE, AO5 + } + { .mbb + (p11) adds AO7 = - 4 * SIZE, AO7 + (p10) br.cond.dptk .L990 + } + ;; + { .mmi + (p13) LDFD f32 = [AO1], LDA + (p13) LDFD f34 = [AO3], LDA + tbit.nz p15, p0 = M, 0 + } + { .mmi + (p14) LDFD f36 = [AO5], LDA + (p11) adds AO6 = - 4 * SIZE, AO6 + (p12) adds AO7 = - 2 * SIZE, AO7 + } + ;; + (p13) LDFD f33 = [AO2], LDA + (p13) LDFD f35 = [AO4], LDA + (p14) LDFD f37 = [AO6], LDA + (p15) LDFD f38 = [AO7], LDA + ;; + (p13) LDFD f96 = [YLD1], 1 * SIZE + (p13) LDFD f98 = [YLD2], 1 * SIZE + ;; + (p13) LDFD f97 = [YLD1], 3 * SIZE + (p13) LDFD f99 = [YLD2], 3 * SIZE + ;; + (p14) LDFD f100 = [YLD1], 1 * SIZE + ;; + (p14) LDFD f101 = [YLD1], 1 * SIZE + ;; + (p15) LDFD f102 = [YLD1], 1 * SIZE + ;; + + (p13) FMA f16 = ALPHA1, f32, f96 + (p13) FMA f18 = ALPHA1, f34, f98 + (p13) FMA f17 = ALPHA1, f33, f97 + (p13) FMA f19 = ALPHA1, f35, f99 + (p14) FMA f20 = ALPHA1, f36, f100 + (p15) FMA f22 = ALPHA1, f38, f102 + (p14) FMA f21 = ALPHA1, f37, f101 + ;; + { .mmi + (p13) STFD [YST1] = f16, 1 * SIZE + (p13) STFD [YST2] = f18, 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p13) STFD [YST1] = f17, 3 * SIZE + (p13) STFD [YST2] = f19 + nop __LINE__ + } + ;; + { .mmi + (p14) STFD [YST1] = f20, 1 * SIZE + ;; + (p14) STFD [YST1] = f21, 1 * SIZE + nop __LINE__ + } + ;; + { .mib + (p15) STFD [YST1] = f22 + } + ;; + .align 16 + + +.L990: + cmp.eq p10, p0 = SIZE, INCY + ;; + { .mmi + mov YLD1 = YY + mov YST1 = Y + mov pr.rot= 0 + } + { .mib + mov YST2 = Y + shr J = M, 3 + (p10) br.cond.dptk .L999 + } + ;; + { .mmi + cmp.eq p6, p0 = r0, J + adds J = -1, J + mov ar.ec = 4 + } + { .mmi + cmp.eq p16, p0 = r0, r0 + nop __LINE__ + tbit.nz p13, p0 = M, 2 + } + ;; + { .mib + nop __LINE__ + mov ar.lc = J + (p6) br.cond.dpnt .L995 + } + ;; +.L992: + { .mfi + (p19) STFD [YST2] = f35 + (p18) FADD f34 = f34, f66 + (p19) add YST2 = YST2, INCY + } + { .mmi + (p16) LDFD f64 = [YLD1], 1 * SIZE + (p16) LDFD f32 = [YST1], INCY + } + ;; + { .mfi + (p19) STFD [YST2] = f39 + (p18) FADD f38 = f38, f70 + (p19) add YST2 = YST2, INCY + } + { .mmi + (p16) LDFD f36 = [YST1], INCY + (p16) LDFD f68 = [YLD1], 1 * SIZE + } + ;; + { .mfi + (p19) STFD [YST2] = f43 + (p18) FADD f42 = f42, f74 + (p19) add YST2 = YST2, INCY + } + { .mmi + (p16) LDFD f72 = [YLD1], 1 * SIZE + (p16) LDFD f40 = [YST1], INCY + } + ;; + { .mfi + (p19) STFD [YST2] = f47 + (p18) FADD f46 = f46, f78 + (p19) add YST2 = YST2, INCY + } + { .mmi + (p16) LDFD f76 = [YLD1], 1 * SIZE + (p16) LDFD f44 = [YST1], INCY + } + ;; + { .mfi + (p19) STFD [YST2] = f51 + (p18) FADD f50 = f50, f82 + (p19) add YST2 = YST2, INCY + } + { .mmi + (p16) LDFD f80 = [YLD1], 1 * SIZE + (p16) LDFD f48 = [YST1], INCY + } + ;; + { .mfi + (p19) STFD [YST2] = f55 + (p18) FADD f54 = f54, f86 + (p19) add YST2 = YST2, INCY + } + { .mmi + (p16) LDFD f84 = [YLD1], 1 * SIZE + (p16) LDFD f52 = [YST1], INCY + } + ;; + { .mfi + (p19) STFD [YST2] = f59 + (p18) FADD f58 = f58, f90 + (p19) add YST2 = YST2, INCY + } + { .mmi + (p16) LDFD f88 = [YLD1], 1 * 
SIZE + (p16) LDFD f56 = [YST1], INCY + } + ;; + { .mfi + (p19) STFD [YST2] = f63 + (p18) FADD f62 = f62, f94 + (p19) add YST2 = YST2, INCY + } + { .mmb + (p16) LDFD f92 = [YLD1], 1 * SIZE + (p16) LDFD f60 = [YST1], INCY + br.ctop.sptk.few .L992 + } + ;; + +.L995: + (p13) LDFD f32 = [YST1], INCY + (p13) LDFD f40 = [YLD1], 1 * SIZE + tbit.nz p14, p0 = M, 1 + ;; + (p13) LDFD f33 = [YST1], INCY + (p13) LDFD f41 = [YLD1], 1 * SIZE + tbit.nz p15, p0 = M, 0 + ;; + (p13) LDFD f34 = [YST1], INCY + (p13) LDFD f42 = [YLD1], 1 * SIZE + ;; + (p13) LDFD f35 = [YST1], INCY + (p13) LDFD f43 = [YLD1], 1 * SIZE + ;; + (p14) LDFD f36 = [YST1], INCY + (p14) LDFD f44 = [YLD1], 1 * SIZE + ;; + (p14) LDFD f37 = [YST1], INCY + (p14) LDFD f45 = [YLD1], 1 * SIZE + ;; + (p15) LDFD f38 = [YST1], INCY + (p15) LDFD f46 = [YLD1], 1 * SIZE + ;; + (p13) FADD f32 = f32, f40 + (p13) FADD f33 = f33, f41 + (p13) FADD f34 = f34, f42 + (p13) FADD f35 = f35, f43 + (p14) FADD f36 = f36, f44 + (p14) FADD f37 = f37, f45 + (p15) FADD f38 = f38, f46 + ;; + (p13) STFD [YST2] = f32 + (p13) add YST2 = YST2, INCY + ;; + (p13) STFD [YST2] = f33 + (p13) add YST2 = YST2, INCY + ;; + (p13) STFD [YST2] = f34 + (p13) add YST2 = YST2, INCY + ;; + (p13) STFD [YST2] = f35 + (p13) add YST2 = YST2, INCY + ;; + (p14) STFD [YST2] = f36 + (p14) add YST2 = YST2, INCY + ;; + (p14) STFD [YST2] = f37 + (p14) add YST2 = YST2, INCY + ;; + (p15) STFD [YST2] = f38 + ;; + +.L999: + mov ar.lc = ARLC + mov pr = PR, -1 + br.ret.sptk.many b0 + ;; + EPILOGUE diff --git a/kernel/ia64/qgemv_t.S b/kernel/ia64/qgemv_t.S new file mode 100644 index 0000000000..f3fc693d7c --- /dev/null +++ b/kernel/ia64/qgemv_t.S @@ -0,0 +1,1287 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define SP r12 + +#define M r32 +#define N r33 +#ifndef XDOUBLE +#define A r36 +#define LDA r37 +#define X1 r38 +#define INCX r39 +#define Y1 r34 +#define INCY r35 +#else +#define A r38 +#define LDA r39 +#define X1 r34 +#define INCX r35 +#define Y1 r36 +#define INCY r37 +#endif + +#define BUFFER r11 + +#define I r15 +#define J r16 +#define AO1 r17 +#define AO2 r18 +#define AO3 r19 +#define AO4 r20 +#define AO5 r21 +#define AO6 r22 +#define AO7 r23 +#define AO8 r24 +#define X2 r25 +#define Y2 r26 +#define LDA7M8 r27 +#define INCX5 r28 +#define INCY5 r29 + +#define YY1 r8 +#define YY2 r9 + +#define ARLC r30 +#define PR r31 + +#ifdef DOUBLE +#define RPREFETCH (16 * 3 + 8) +#else +#define RPREFETCH (16 * 3 + 16) +#endif +#define PREFETCH lfetch.nt1 + +#define ALPHA f6 + + PROLOGUE + .prologue + PROFCODE + { .mmi + mov ARLC = ar.lc + } + { .mmi + adds r15 = 24, SP + adds r14 = 16, SP + } + ;; +#ifdef XDOUBLE + ld8 X1 = [r14], 16 + ld8 INCX = [r15], 16 + ;; +#endif + ld8 Y1 = [r14], 16 + ld8 INCY = [r15], 16 + ;; + ld8 BUFFER = [r14] + ;; + mov PR = pr + ;; + mov ALPHA = f8 + .body + ;; + { .mmi + cmp.ge p7, p0 = r0, M + cmp.ge p6, p0 = r0, N + } + ;; + { .mmi + shladd INCX = INCX, BASE_SHIFT, r0 + shladd INCY = INCY, BASE_SHIFT, r0 + shladd LDA = LDA, BASE_SHIFT, r0 + } + ;; + { .mbb + (p7) br.cond.dpnt .L999 + (p6) br.cond.dpnt .L999 + } + .align 16 + ;; + shladd INCY5 = INCY, 2, INCY + shladd INCX5 = INCX, 2, INCX + cmp.eq p10, p0 = SIZE, INCX + ;; + (p10) mov BUFFER = X1 + (p10) br.cond.dptk .L10 + ;; + + + mov pr.rot= 0 + shladd X2 = INCX, 2, X1 + mov YY1 = BUFFER + adds YY2 = 4 * SIZE, BUFFER + ;; + shr I = M, 3 + ;; + { .mmi + adds I = -1, I + cmp.eq p16, p0 = r0, r0 + mov ar.ec= 5 + } + ;; + { .mmi + mov ar.lc = I + } + { .mib + cmp.gt p6, p0 = 0, I + tbit.nz p13, p0 = M, 2 + (p6) br.cond.dpnt .L05 + } + ;; + .align 16 + +.L01: + (p20) STFD [YY1] = f36, SIZE + (p20) STFD [YY2] = f56, SIZE + (p16) LDFD f32 = [X1], INCX + (p16) LDFD f52 = [X2], INCX + ;; + (p20) STFD [YY1] = f41, SIZE + (p20) STFD [YY2] = f61, SIZE + (p16) LDFD f37 = [X1], INCX + (p16) LDFD f57 = [X2], INCX + ;; + (p20) STFD [YY1] = f46, SIZE + (p20) STFD [YY2] = f66, SIZE + (p16) LDFD f42 = [X1], INCX + (p16) LDFD f62 = [X2], INCX + ;; + (p20) STFD [YY1] = f51, 5 * SIZE + (p20) STFD [YY2] = f71, 5 * SIZE + (p16) LDFD f47 = [X1], INCX5 + (p16) LDFD f67 = [X2], INCX5 + br.ctop.sptk.few .L01 + ;; + .align 16 + +.L05: + (p13) LDFD f32 = [X1], INCX + tbit.nz p14, p0 = M, 1 + ;; + (p13) LDFD f33 = [X1], INCX + tbit.nz p15, p0 = M, 0 + ;; + (p13) LDFD f34 = [X1], INCX + ;; + (p13) LDFD f35 = [X1], INCX + ;; + (p14) LDFD f36 = [X1], INCX + ;; + (p13) STFD [YY1] = f32, SIZE + (p14) LDFD f37 = [X1], INCX + ;; + (p13) STFD [YY1] = f33, SIZE + (p15) LDFD f38 = [X1], INCX + ;; + (p13) STFD [YY1] = f34, SIZE + ;; + (p13) STFD [YY1] = f35, SIZE + ;; + (p14) STFD [YY1] = f36, SIZE + ;; + (p14) STFD [YY1] = f37, SIZE + ;; + (p15) STFD [YY1] = f38, SIZE + ;; + .align 16 + +.L10: + mov YY1 = Y1 + shladd Y2 = INCY, 2, Y1 + shladd YY2 = INCY, 2, Y1 + ;; + { .mmi + nop __LINE__ + shr J = N, 3 + } + ;; + { .mib + nop __LINE__ + cmp.eq p6, p0 = r0, J + (p6) br.cond.dpnt .L20 + } + ;; + .align 16 + +.L11: + mov AO1 = A + adds AO2 = 1 * SIZE, A + adds AO3 = 2 * SIZE, A + adds AO4 = 3 * SIZE, A + adds AO5 = 4 * SIZE, A + adds AO6 = 5 * SIZE, A + adds AO7 = 6 * SIZE, A + adds AO8 = 7 * SIZE, A + shladd A = LDA, 3, A + ;; + shladd 
LDA7M8 = LDA, 3, r0 + ;; + sub LDA7M8 = LDA, LDA7M8 + ;; + adds LDA7M8 = 8 * SIZE, LDA7M8 + ;; + mov f8 = f0 + mov f9 = f0 + mov f10 = f0 + mov f11 = f0 + mov f12 = f0 + mov f13 = f0 + mov f14 = f0 + mov f15 = f0 + + mov pr.rot= 0 + shr I = M, 3 + mov ar.ec = 2 + ;; + mov X1 = BUFFER + adds X2 = 2 * SIZE, BUFFER + ;; + cmp.eq p16, p0 = r0, r0 + ;; + adds I = -1, I + ;; + mov ar.lc = I + cmp.eq p6, p0 = -1, I + (p6) br.cond.dpnt .L15 + ;; + .align 16 + +.L12: + (p16) LDFD f32 = [AO1], LDA + (p16) LDFD f34 = [AO3], LDA + (p16) LDFD f36 = [AO5], LDA + (p16) LDFD f38 = [AO7], LDA + ;; + (p16) LDFD f33 = [AO2], LDA + (p16) LDFD f35 = [AO4], LDA + (p16) LDFD f37 = [AO6], LDA + (p16) LDFD f39 = [AO8], LDA + ;; + (p16) LDFD f40 = [AO1], LDA + (p16) LDFD f42 = [AO3], LDA + (p16) LDFD f44 = [AO5], LDA + (p16) LDFD f46 = [AO7], LDA + ;; + (p16) LDFD f41 = [AO2], LDA + (p16) LDFD f43 = [AO4], LDA + (p16) LDFD f45 = [AO6], LDA + (p16) LDFD f47 = [AO8], LDA + ;; + (p16) LDFD f48 = [AO1], LDA + (p16) LDFD f50 = [AO3], LDA + (p16) LDFD f52 = [AO5], LDA + (p16) LDFD f54 = [AO7], LDA + ;; + (p16) LDFD f49 = [AO2], LDA + (p16) LDFD f51 = [AO4], LDA + (p16) LDFD f53 = [AO6], LDA + (p16) LDFD f55 = [AO8], LDA + ;; + (p16) LDFD f56 = [AO1], LDA + (p16) LDFD f58 = [AO3], LDA + (p16) LDFD f60 = [AO5], LDA + (p16) LDFD f62 = [AO7], LDA + ;; + (p16) LDFD f57 = [AO2], LDA + (p16) LDFD f59 = [AO4], LDA + (p16) LDFD f61 = [AO6], LDA + (p16) LDFD f63 = [AO8], LDA + ;; + (p16) LDFD f64 = [AO1], LDA + (p16) LDFD f66 = [AO3], LDA + (p16) LDFD f68 = [AO5], LDA + (p16) LDFD f70 = [AO7], LDA + ;; + (p16) LDFD f65 = [AO2], LDA + (p16) LDFD f67 = [AO4], LDA + (p16) LDFD f69 = [AO6], LDA + (p16) LDFD f71 = [AO8], LDA + ;; + (p16) LDFD f72 = [AO1], LDA + (p16) LDFD f74 = [AO3], LDA + (p16) LDFD f76 = [AO5], LDA + (p16) LDFD f78 = [AO7], LDA + ;; + (p16) LDFD f73 = [AO2], LDA + (p16) LDFD f75 = [AO4], LDA + (p16) LDFD f77 = [AO6], LDA + (p16) LDFD f79 = [AO8], LDA + ;; + (p16) LDFD f80 = [AO1], LDA + (p16) LDFD f82 = [AO3], LDA + (p16) LDFD f84 = [AO5], LDA + (p16) LDFD f86 = [AO7], LDA + ;; + (p16) LDFD f81 = [AO2], LDA + (p16) LDFD f83 = [AO4], LDA + (p16) LDFD f85 = [AO6], LDA + (p16) LDFD f87 = [AO8], LDA + ;; + (p16) LDFD f88 = [AO1], LDA7M8 + (p16) LDFD f90 = [AO3], LDA7M8 + (p16) LDFD f92 = [AO5], LDA7M8 + (p16) LDFD f94 = [AO7], LDA7M8 + ;; + (p16) LDFD f89 = [AO2], LDA7M8 + (p16) LDFD f91 = [AO4], LDA7M8 + (p16) LDFD f93 = [AO6], LDA7M8 + (p16) LDFD f95 = [AO8], LDA7M8 + ;; + (p16) LDFD f96 = [X1], 1 * SIZE + (p16) LDFD f98 = [X2], 1 * SIZE + ;; + (p16) LDFD f97 = [X1], 3 * SIZE + (p16) LDFD f99 = [X2], 3 * SIZE + ;; + (p16) LDFD f100 = [X1], 1 * SIZE + (p16) LDFD f102 = [X2], 1 * SIZE + ;; + (p16) LDFD f101 = [X1], 3 * SIZE + (p16) LDFD f103 = [X2], 3 * SIZE + ;; + (p16) FMA f8 = f96, f32, f8 + (p16) FMA f9 = f96, f40, f9 + (p16) FMA f10 = f96, f48, f10 + (p16) FMA f11 = f96, f56, f11 + (p16) FMA f12 = f96, f64, f12 + (p16) FMA f13 = f96, f72, f13 + (p16) FMA f14 = f96, f80, f14 + (p16) FMA f15 = f96, f88, f15 + ;; + (p16) FMA f8 = f97, f33, f8 + (p16) FMA f9 = f97, f41, f9 + (p16) FMA f10 = f97, f49, f10 + (p16) FMA f11 = f97, f57, f11 + (p16) FMA f12 = f97, f65, f12 + (p16) FMA f13 = f97, f73, f13 + (p16) FMA f14 = f97, f81, f14 + (p16) FMA f15 = f97, f89, f15 + ;; + (p16) FMA f8 = f98, f34, f8 + (p16) FMA f9 = f98, f42, f9 + (p16) FMA f10 = f98, f50, f10 + (p16) FMA f11 = f98, f58, f11 + (p16) FMA f12 = f98, f66, f12 + (p16) FMA f13 = f98, f74, f13 + (p16) FMA f14 = f98, f82, f14 + (p16) FMA f15 = f98, f90, f15 + ;; 
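+ // Eight dot products are kept in flight here: f8..f15 each accumulate x[i] * A[i][j] for one of the eight columns handled per pass; ALPHA scaling and the update of y happen once the pipelined loop drains.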
+ (p16) FMA f8 = f99, f35, f8 + (p16) FMA f9 = f99, f43, f9 + (p16) FMA f10 = f99, f51, f10 + (p16) FMA f11 = f99, f59, f11 + (p16) FMA f12 = f99, f67, f12 + (p16) FMA f13 = f99, f75, f13 + (p16) FMA f14 = f99, f83, f14 + (p16) FMA f15 = f99, f91, f15 + ;; + (p16) FMA f8 = f100, f36, f8 + (p16) FMA f9 = f100, f44, f9 + (p16) FMA f10 = f100, f52, f10 + (p16) FMA f11 = f100, f60, f11 + (p16) FMA f12 = f100, f68, f12 + (p16) FMA f13 = f100, f76, f13 + (p16) FMA f14 = f100, f84, f14 + (p16) FMA f15 = f100, f92, f15 + ;; + (p16) FMA f8 = f101, f37, f8 + (p16) FMA f9 = f101, f45, f9 + (p16) FMA f10 = f101, f53, f10 + (p16) FMA f11 = f101, f61, f11 + (p16) FMA f12 = f101, f69, f12 + (p16) FMA f13 = f101, f77, f13 + (p16) FMA f14 = f101, f85, f14 + (p16) FMA f15 = f101, f93, f15 + ;; + (p16) FMA f8 = f102, f38, f8 + (p16) FMA f9 = f102, f46, f9 + (p16) FMA f10 = f102, f54, f10 + (p16) FMA f11 = f102, f62, f11 + (p16) FMA f12 = f102, f70, f12 + (p16) FMA f13 = f102, f78, f13 + (p16) FMA f14 = f102, f86, f14 + (p16) FMA f15 = f102, f94, f15 + ;; + (p16) FMA f8 = f103, f39, f8 + (p16) FMA f9 = f103, f47, f9 + (p16) FMA f10 = f103, f55, f10 + (p16) FMA f11 = f103, f63, f11 + (p16) FMA f12 = f103, f71, f12 + (p16) FMA f13 = f103, f79, f13 + (p16) FMA f14 = f103, f87, f14 + (p16) FMA f15 = f103, f95, f15 + br.ctop.sptk.few .L12 + ;; + .align 16 + +.L15: + tbit.nz p13, p11 = M, 2 + tbit.nz p14, p12 = M, 1 + ;; + { .mmi + (p11) adds AO5 = - 4 * SIZE, AO5 + } + { .mbb + (p11) adds AO7 = - 4 * SIZE, AO7 + } + ;; + { .mmi + (p13) LDFD f32 = [AO1], LDA + (p13) LDFD f34 = [AO3], LDA + tbit.nz p15, p0 = M, 0 + } + { .mmi + (p14) LDFD f36 = [AO5], LDA + (p11) adds AO6 = - 4 * SIZE, AO6 + (p12) adds AO7 = - 2 * SIZE, AO7 + } + ;; + (p13) LDFD f33 = [AO2], LDA + (p13) LDFD f35 = [AO4], LDA + (p14) LDFD f37 = [AO6], LDA + (p15) LDFD f38 = [AO7], LDA + ;; + (p13) LDFD f40 = [AO1], LDA + (p13) LDFD f42 = [AO3], LDA + (p14) LDFD f44 = [AO5], LDA + (p15) LDFD f46 = [AO7], LDA + ;; + (p13) LDFD f41 = [AO2], LDA + (p13) LDFD f43 = [AO4], LDA + (p14) LDFD f45 = [AO6], LDA + ;; + (p13) LDFD f48 = [AO1], LDA + (p13) LDFD f50 = [AO3], LDA + (p14) LDFD f52 = [AO5], LDA + (p15) LDFD f54 = [AO7], LDA + ;; + (p13) LDFD f49 = [AO2], LDA + (p13) LDFD f51 = [AO4], LDA + (p14) LDFD f53 = [AO6], LDA + ;; + (p13) LDFD f56 = [AO1], LDA + (p13) LDFD f58 = [AO3], LDA + (p14) LDFD f60 = [AO5], LDA + (p15) LDFD f62 = [AO7], LDA + ;; + (p13) LDFD f57 = [AO2], LDA + (p13) LDFD f59 = [AO4], LDA + (p14) LDFD f61 = [AO6], LDA + ;; + (p13) LDFD f64 = [AO1], LDA + (p13) LDFD f66 = [AO3], LDA + (p14) LDFD f68 = [AO5], LDA + (p15) LDFD f70 = [AO7], LDA + ;; + (p13) LDFD f65 = [AO2], LDA + (p13) LDFD f67 = [AO4], LDA + (p14) LDFD f69 = [AO6], LDA + ;; + (p13) LDFD f72 = [AO1], LDA + (p13) LDFD f74 = [AO3], LDA + (p14) LDFD f76 = [AO5], LDA + (p15) LDFD f78 = [AO7], LDA + ;; + (p13) LDFD f73 = [AO2], LDA + (p13) LDFD f75 = [AO4], LDA + (p14) LDFD f77 = [AO6], LDA + ;; + (p13) LDFD f80 = [AO1], LDA + (p13) LDFD f82 = [AO3], LDA + (p14) LDFD f84 = [AO5], LDA + (p15) LDFD f86 = [AO7], LDA + ;; + (p13) LDFD f81 = [AO2], LDA + (p13) LDFD f83 = [AO4], LDA + (p14) LDFD f85 = [AO6], LDA + ;; + (p13) LDFD f88 = [AO1] + (p13) LDFD f90 = [AO3] + (p14) LDFD f92 = [AO5] + (p15) LDFD f94 = [AO7] + ;; + (p13) LDFD f89 = [AO2] + (p13) LDFD f91 = [AO4] + (p14) LDFD f93 = [AO6] + ;; + (p13) LDFD f96 = [X1], 1 * SIZE + (p13) LDFD f98 = [X2], 1 * SIZE + ;; + (p13) LDFD f97 = [X1], 3 * SIZE + (p13) LDFD f99 = [X2], 3 * SIZE + ;; + (p14) LDFD f100 = [X1], 1 * SIZE + ;; + 
(p14) LDFD f101 = [X1], 1 * SIZE + ;; + (p15) LDFD f102 = [X1], 1 * SIZE + ;; + (p13) FMA f8 = f96, f32, f8 + (p13) FMA f9 = f96, f40, f9 + (p13) FMA f10 = f96, f48, f10 + (p13) FMA f11 = f96, f56, f11 + (p13) FMA f12 = f96, f64, f12 + (p13) FMA f13 = f96, f72, f13 + (p13) FMA f14 = f96, f80, f14 + (p13) FMA f15 = f96, f88, f15 + ;; + (p13) FMA f8 = f97, f33, f8 + (p13) FMA f9 = f97, f41, f9 + (p13) FMA f10 = f97, f49, f10 + (p13) FMA f11 = f97, f57, f11 + (p13) FMA f12 = f97, f65, f12 + (p13) FMA f13 = f97, f73, f13 + (p13) FMA f14 = f97, f81, f14 + (p13) FMA f15 = f97, f89, f15 + ;; + (p13) FMA f8 = f98, f34, f8 + (p13) FMA f9 = f98, f42, f9 + (p13) FMA f10 = f98, f50, f10 + (p13) FMA f11 = f98, f58, f11 + (p13) FMA f12 = f98, f66, f12 + (p13) FMA f13 = f98, f74, f13 + (p13) FMA f14 = f98, f82, f14 + (p13) FMA f15 = f98, f90, f15 + ;; + (p13) FMA f8 = f99, f35, f8 + (p13) FMA f9 = f99, f43, f9 + (p13) FMA f10 = f99, f51, f10 + (p13) FMA f11 = f99, f59, f11 + (p13) FMA f12 = f99, f67, f12 + (p13) FMA f13 = f99, f75, f13 + (p13) FMA f14 = f99, f83, f14 + (p13) FMA f15 = f99, f91, f15 + ;; + (p14) FMA f8 = f100, f36, f8 + (p14) FMA f9 = f100, f44, f9 + (p14) FMA f10 = f100, f52, f10 + (p14) FMA f11 = f100, f60, f11 + (p14) FMA f12 = f100, f68, f12 + (p14) FMA f13 = f100, f76, f13 + (p14) FMA f14 = f100, f84, f14 + (p14) FMA f15 = f100, f92, f15 + ;; + (p14) FMA f8 = f101, f37, f8 + (p14) FMA f9 = f101, f45, f9 + (p14) FMA f10 = f101, f53, f10 + (p14) FMA f11 = f101, f61, f11 + (p14) FMA f12 = f101, f69, f12 + (p14) FMA f13 = f101, f77, f13 + (p14) FMA f14 = f101, f85, f14 + (p14) FMA f15 = f101, f93, f15 + ;; + (p15) FMA f8 = f102, f38, f8 + (p15) FMA f9 = f102, f46, f9 + (p15) FMA f10 = f102, f54, f10 + (p15) FMA f11 = f102, f62, f11 + (p15) FMA f12 = f102, f70, f12 + (p15) FMA f13 = f102, f78, f13 + (p15) FMA f14 = f102, f86, f14 + (p15) FMA f15 = f102, f94, f15 + ;; + LDFD f32 = [Y1], INCY + ;; + LDFD f33 = [Y1], INCY + ;; + LDFD f34 = [Y1], INCY + ;; + LDFD f35 = [Y1], INCY5 + ;; + LDFD f36 = [Y2], INCY + ;; + LDFD f37 = [Y2], INCY + ;; + LDFD f38 = [Y2], INCY + ;; + LDFD f39 = [Y2], INCY5 + ;; + FMA f32 = ALPHA, f8, f32 + FMA f33 = ALPHA, f9, f33 + FMA f34 = ALPHA, f10, f34 + FMA f35 = ALPHA, f11, f35 + FMA f36 = ALPHA, f12, f36 + FMA f37 = ALPHA, f13, f37 + FMA f38 = ALPHA, f14, f38 + FMA f39 = ALPHA, f15, f39 + ;; + STFD [YY1] = f32 + add YY1 = YY1, INCY + ;; + STFD [YY1] = f33 + add YY1 = YY1, INCY + ;; + STFD [YY1] = f34 + add YY1 = YY1, INCY + ;; + STFD [YY1] = f35 + add YY1 = YY1, INCY5 + ;; + STFD [YY2] = f36 + add YY2 = YY2, INCY + ;; + STFD [YY2] = f37 + add YY2 = YY2, INCY + ;; + STFD [YY2] = f38 + add YY2 = YY2, INCY + ;; + STFD [YY2] = f39 + add YY2 = YY2, INCY5 + ;; + adds J = -1, J + ;; + cmp.lt p6, p0 = 0, J + (p6) br.cond.dptk .L11 + ;; + .align 16 + +.L20: + tbit.z p6, p0 = N, 2 + ;; + (p6) br.cond.dpnt .L30 + ;; + mov AO1 = A + adds AO2 = 1 * SIZE, A + adds AO3 = 2 * SIZE, A + adds AO4 = 3 * SIZE, A + adds AO5 = 4 * SIZE, A + adds AO6 = 5 * SIZE, A + adds AO7 = 6 * SIZE, A + adds AO8 = 7 * SIZE, A + shladd A = LDA, 2, A + ;; + shladd LDA7M8 = LDA, 2, r0 + ;; + sub LDA7M8 = LDA, LDA7M8 + ;; + adds LDA7M8 = 8 * SIZE, LDA7M8 + ;; + mov f8 = f0 + mov f9 = f0 + mov f10 = f0 + mov f11 = f0 + mov f12 = f0 + mov f13 = f0 + mov f14 = f0 + mov f15 = f0 + + mov pr.rot= 0 + shr I = M, 3 + mov ar.ec = 2 + ;; + mov X1 = BUFFER + adds X2 = 2 * SIZE, BUFFER + ;; + cmp.eq p16, p0 = r0, r0 + ;; + adds I = -1, I + ;; + mov ar.lc = I + cmp.eq p6, p0 = -1, I + (p6) br.cond.dpnt .L25 + ;; 
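+ // .L22 below handles the N & 4 remainder: the same blocked dot-product scheme as .L12, but with four accumulators (f8..f11) over four columns of A.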
+ .align 16 + +.L22: + (p16) LDFD f32 = [AO1], LDA + (p16) LDFD f34 = [AO3], LDA + (p16) LDFD f36 = [AO5], LDA + (p16) LDFD f38 = [AO7], LDA + ;; + (p16) LDFD f33 = [AO2], LDA + (p16) LDFD f35 = [AO4], LDA + (p16) LDFD f37 = [AO6], LDA + (p16) LDFD f39 = [AO8], LDA + ;; + (p16) LDFD f40 = [AO1], LDA + (p16) LDFD f42 = [AO3], LDA + (p16) LDFD f44 = [AO5], LDA + (p16) LDFD f46 = [AO7], LDA + ;; + (p16) LDFD f41 = [AO2], LDA + (p16) LDFD f43 = [AO4], LDA + (p16) LDFD f45 = [AO6], LDA + (p16) LDFD f47 = [AO8], LDA + ;; + (p16) LDFD f48 = [AO1], LDA + (p16) LDFD f50 = [AO3], LDA + (p16) LDFD f52 = [AO5], LDA + (p16) LDFD f54 = [AO7], LDA + ;; + (p16) LDFD f49 = [AO2], LDA + (p16) LDFD f51 = [AO4], LDA + (p16) LDFD f53 = [AO6], LDA + (p16) LDFD f55 = [AO8], LDA + ;; + (p16) LDFD f56 = [AO1], LDA7M8 + (p16) LDFD f58 = [AO3], LDA7M8 + (p16) LDFD f60 = [AO5], LDA7M8 + (p16) LDFD f62 = [AO7], LDA7M8 + ;; + (p16) LDFD f57 = [AO2], LDA7M8 + (p16) LDFD f59 = [AO4], LDA7M8 + (p16) LDFD f61 = [AO6], LDA7M8 + (p16) LDFD f63 = [AO8], LDA7M8 + ;; + (p16) LDFD f96 = [X1], 1 * SIZE + (p16) LDFD f98 = [X2], 1 * SIZE + ;; + (p16) LDFD f97 = [X1], 3 * SIZE + (p16) LDFD f99 = [X2], 3 * SIZE + ;; + (p16) LDFD f100 = [X1], 1 * SIZE + (p16) LDFD f102 = [X2], 1 * SIZE + ;; + (p16) LDFD f101 = [X1], 3 * SIZE + (p16) LDFD f103 = [X2], 3 * SIZE + ;; + (p16) FMA f8 = f96, f32, f8 + (p16) FMA f9 = f96, f40, f9 + (p16) FMA f10 = f96, f48, f10 + (p16) FMA f11 = f96, f56, f11 + ;; + (p16) FMA f8 = f97, f33, f8 + (p16) FMA f9 = f97, f41, f9 + (p16) FMA f10 = f97, f49, f10 + (p16) FMA f11 = f97, f57, f11 + ;; + (p16) FMA f8 = f98, f34, f8 + (p16) FMA f9 = f98, f42, f9 + (p16) FMA f10 = f98, f50, f10 + (p16) FMA f11 = f98, f58, f11 + ;; + (p16) FMA f8 = f99, f35, f8 + (p16) FMA f9 = f99, f43, f9 + (p16) FMA f10 = f99, f51, f10 + (p16) FMA f11 = f99, f59, f11 + ;; + (p16) FMA f8 = f100, f36, f8 + (p16) FMA f9 = f100, f44, f9 + (p16) FMA f10 = f100, f52, f10 + (p16) FMA f11 = f100, f60, f11 + + ;; + (p16) FMA f8 = f101, f37, f8 + (p16) FMA f9 = f101, f45, f9 + (p16) FMA f10 = f101, f53, f10 + (p16) FMA f11 = f101, f61, f11 + ;; + (p16) FMA f8 = f102, f38, f8 + (p16) FMA f9 = f102, f46, f9 + (p16) FMA f10 = f102, f54, f10 + (p16) FMA f11 = f102, f62, f11 + ;; + (p16) FMA f8 = f103, f39, f8 + (p16) FMA f9 = f103, f47, f9 + (p16) FMA f10 = f103, f55, f10 + (p16) FMA f11 = f103, f63, f11 + br.ctop.sptk.few .L22 + ;; + .align 16 + +.L25: + tbit.nz p13, p11 = M, 2 + tbit.nz p14, p12 = M, 1 + ;; + { .mmi + (p11) adds AO5 = - 4 * SIZE, AO5 + } + { .mbb + (p11) adds AO7 = - 4 * SIZE, AO7 + } + ;; + { .mmi + (p13) LDFD f32 = [AO1], LDA + (p13) LDFD f34 = [AO3], LDA + tbit.nz p15, p0 = M, 0 + } + { .mmi + (p14) LDFD f36 = [AO5], LDA + (p11) adds AO6 = - 4 * SIZE, AO6 + (p12) adds AO7 = - 2 * SIZE, AO7 + } + ;; + (p13) LDFD f33 = [AO2], LDA + (p13) LDFD f35 = [AO4], LDA + (p14) LDFD f37 = [AO6], LDA + (p15) LDFD f38 = [AO7], LDA + ;; + (p13) LDFD f40 = [AO1], LDA + (p13) LDFD f42 = [AO3], LDA + (p14) LDFD f44 = [AO5], LDA + (p15) LDFD f46 = [AO7], LDA + ;; + (p13) LDFD f41 = [AO2], LDA + (p13) LDFD f43 = [AO4], LDA + (p14) LDFD f45 = [AO6], LDA + ;; + (p13) LDFD f48 = [AO1], LDA + (p13) LDFD f50 = [AO3], LDA + (p14) LDFD f52 = [AO5], LDA + (p15) LDFD f54 = [AO7], LDA + ;; + (p13) LDFD f49 = [AO2], LDA + (p13) LDFD f51 = [AO4], LDA + (p14) LDFD f53 = [AO6], LDA + ;; + (p13) LDFD f56 = [AO1] + (p13) LDFD f58 = [AO3] + (p14) LDFD f60 = [AO5] + (p15) LDFD f62 = [AO7] + ;; + (p13) LDFD f57 = [AO2] + (p13) LDFD f59 = [AO4] + (p14) LDFD f61 = [AO6] 
+ ;; + (p13) LDFD f96 = [X1], 1 * SIZE + (p13) LDFD f98 = [X2], 1 * SIZE + ;; + (p13) LDFD f97 = [X1], 3 * SIZE + (p13) LDFD f99 = [X2], 3 * SIZE + ;; + (p14) LDFD f100 = [X1], 1 * SIZE + ;; + (p14) LDFD f101 = [X1], 1 * SIZE + ;; + (p15) LDFD f102 = [X1], 1 * SIZE + ;; + (p13) FMA f8 = f96, f32, f8 + (p13) FMA f9 = f96, f40, f9 + (p13) FMA f10 = f96, f48, f10 + (p13) FMA f11 = f96, f56, f11 + ;; + (p13) FMA f8 = f97, f33, f8 + (p13) FMA f9 = f97, f41, f9 + (p13) FMA f10 = f97, f49, f10 + (p13) FMA f11 = f97, f57, f11 + ;; + (p13) FMA f8 = f98, f34, f8 + (p13) FMA f9 = f98, f42, f9 + (p13) FMA f10 = f98, f50, f10 + (p13) FMA f11 = f98, f58, f11 + ;; + (p13) FMA f8 = f99, f35, f8 + (p13) FMA f9 = f99, f43, f9 + (p13) FMA f10 = f99, f51, f10 + (p13) FMA f11 = f99, f59, f11 + ;; + (p14) FMA f8 = f100, f36, f8 + (p14) FMA f9 = f100, f44, f9 + (p14) FMA f10 = f100, f52, f10 + (p14) FMA f11 = f100, f60, f11 + ;; + (p14) FMA f8 = f101, f37, f8 + (p14) FMA f9 = f101, f45, f9 + (p14) FMA f10 = f101, f53, f10 + (p14) FMA f11 = f101, f61, f11 + ;; + (p15) FMA f8 = f102, f38, f8 + (p15) FMA f9 = f102, f46, f9 + (p15) FMA f10 = f102, f54, f10 + (p15) FMA f11 = f102, f62, f11 + ;; + LDFD f32 = [Y1], INCY + ;; + LDFD f33 = [Y1], INCY + ;; + LDFD f34 = [Y1], INCY + ;; + LDFD f35 = [Y1], INCY + ;; + FMA f32 = ALPHA, f8, f32 + FMA f33 = ALPHA, f9, f33 + FMA f34 = ALPHA, f10, f34 + FMA f35 = ALPHA, f11, f35 + ;; + STFD [YY1] = f32 + add YY1 = YY1, INCY + ;; + STFD [YY1] = f33 + add YY1 = YY1, INCY + ;; + STFD [YY1] = f34 + add YY1 = YY1, INCY + ;; + STFD [YY1] = f35 + add YY1 = YY1, INCY + ;; + .align 16 + +.L30: + tbit.z p6, p0 = N, 1 + ;; + (p6) br.cond.dpnt .L40 + ;; + mov AO1 = A + adds AO2 = 1 * SIZE, A + adds AO3 = 2 * SIZE, A + adds AO4 = 3 * SIZE, A + adds AO5 = 4 * SIZE, A + adds AO6 = 5 * SIZE, A + adds AO7 = 6 * SIZE, A + adds AO8 = 7 * SIZE, A + shladd A = LDA, 1, A + ;; + shladd LDA7M8 = LDA, 1, r0 + ;; + sub LDA7M8 = LDA, LDA7M8 + ;; + adds LDA7M8 = 8 * SIZE, LDA7M8 + ;; + mov f8 = f0 + mov f9 = f0 + mov f10 = f0 + mov f11 = f0 + mov f12 = f0 + mov f13 = f0 + mov f14 = f0 + mov f15 = f0 + + mov pr.rot= 0 + shr I = M, 3 + mov ar.ec = 2 + ;; + mov X1 = BUFFER + adds X2 = 2 * SIZE, BUFFER + ;; + cmp.eq p16, p0 = r0, r0 + ;; + adds I = -1, I + ;; + mov ar.lc = I + cmp.eq p6, p0 = -1, I + (p6) br.cond.dpnt .L35 + ;; + .align 16 + +.L32: + (p16) LDFD f32 = [AO1], LDA + (p16) LDFD f34 = [AO3], LDA + (p16) LDFD f36 = [AO5], LDA + (p16) LDFD f38 = [AO7], LDA + ;; + (p16) LDFD f33 = [AO2], LDA + (p16) LDFD f35 = [AO4], LDA + (p16) LDFD f37 = [AO6], LDA + (p16) LDFD f39 = [AO8], LDA + ;; + (p16) LDFD f40 = [AO1], LDA7M8 + (p16) LDFD f42 = [AO3], LDA7M8 + (p16) LDFD f44 = [AO5], LDA7M8 + (p16) LDFD f46 = [AO7], LDA7M8 + ;; + (p16) LDFD f41 = [AO2], LDA7M8 + (p16) LDFD f43 = [AO4], LDA7M8 + (p16) LDFD f45 = [AO6], LDA7M8 + (p16) LDFD f47 = [AO8], LDA7M8 + ;; + (p16) LDFD f96 = [X1], 1 * SIZE + (p16) LDFD f98 = [X2], 1 * SIZE + ;; + (p16) LDFD f97 = [X1], 3 * SIZE + (p16) LDFD f99 = [X2], 3 * SIZE + ;; + (p16) LDFD f100 = [X1], 1 * SIZE + (p16) LDFD f102 = [X2], 1 * SIZE + ;; + (p16) LDFD f101 = [X1], 3 * SIZE + (p16) LDFD f103 = [X2], 3 * SIZE + ;; + (p16) FMA f8 = f96, f32, f8 + (p16) FMA f9 = f96, f40, f9 + ;; + (p16) FMA f8 = f97, f33, f8 + (p16) FMA f9 = f97, f41, f9 + ;; + (p16) FMA f8 = f98, f34, f8 + (p16) FMA f9 = f98, f42, f9 + ;; + (p16) FMA f8 = f99, f35, f8 + (p16) FMA f9 = f99, f43, f9 + ;; + (p16) FMA f8 = f100, f36, f8 + (p16) FMA f9 = f100, f44, f9 + ;; + (p16) FMA f8 = f101, f37, f8 + (p16) 
FMA f9 = f101, f45, f9 + ;; + (p16) FMA f8 = f102, f38, f8 + (p16) FMA f9 = f102, f46, f9 + ;; + (p16) FMA f8 = f103, f39, f8 + (p16) FMA f9 = f103, f47, f9 + br.ctop.sptk.few .L32 + ;; + .align 16 + +.L35: + tbit.nz p13, p11 = M, 2 + tbit.nz p14, p12 = M, 1 + ;; + { .mmi + (p11) adds AO5 = - 4 * SIZE, AO5 + } + { .mbb + (p11) adds AO7 = - 4 * SIZE, AO7 + } + ;; + { .mmi + (p13) LDFD f32 = [AO1], LDA + (p13) LDFD f34 = [AO3], LDA + tbit.nz p15, p0 = M, 0 + } + { .mmi + (p14) LDFD f36 = [AO5], LDA + (p11) adds AO6 = - 4 * SIZE, AO6 + (p12) adds AO7 = - 2 * SIZE, AO7 + } + ;; + (p13) LDFD f33 = [AO2], LDA + (p13) LDFD f35 = [AO4], LDA + (p14) LDFD f37 = [AO6], LDA + (p15) LDFD f38 = [AO7], LDA + ;; + (p13) LDFD f40 = [AO1] + (p13) LDFD f42 = [AO3] + (p14) LDFD f44 = [AO5] + (p15) LDFD f46 = [AO7] + ;; + (p13) LDFD f41 = [AO2] + (p13) LDFD f43 = [AO4] + (p14) LDFD f45 = [AO6] + ;; + (p13) LDFD f96 = [X1], 1 * SIZE + (p13) LDFD f98 = [X2], 1 * SIZE + ;; + (p13) LDFD f97 = [X1], 3 * SIZE + (p13) LDFD f99 = [X2], 3 * SIZE + ;; + (p14) LDFD f100 = [X1], 1 * SIZE + ;; + (p14) LDFD f101 = [X1], 1 * SIZE + ;; + (p15) LDFD f102 = [X1], 1 * SIZE + ;; + (p13) FMA f8 = f96, f32, f8 + (p13) FMA f9 = f96, f40, f9 + ;; + (p13) FMA f8 = f97, f33, f8 + (p13) FMA f9 = f97, f41, f9 + ;; + (p13) FMA f8 = f98, f34, f8 + (p13) FMA f9 = f98, f42, f9 + ;; + (p13) FMA f8 = f99, f35, f8 + (p13) FMA f9 = f99, f43, f9 + ;; + (p14) FMA f8 = f100, f36, f8 + (p14) FMA f9 = f100, f44, f9 + ;; + (p14) FMA f8 = f101, f37, f8 + (p14) FMA f9 = f101, f45, f9 + ;; + (p15) FMA f8 = f102, f38, f8 + (p15) FMA f9 = f102, f46, f9 + ;; + LDFD f32 = [Y1], INCY + ;; + LDFD f33 = [Y1], INCY + ;; + FMA f32 = ALPHA, f8, f32 + FMA f33 = ALPHA, f9, f33 + ;; + STFD [YY1] = f32 + add YY1 = YY1, INCY + ;; + STFD [YY1] = f33 + add YY1 = YY1, INCY + ;; + .align 16 + +.L40: + tbit.z p6, p0 = N, 0 + ;; + (p6) br.cond.dpnt .L999 + ;; + mov AO1 = A + adds AO2 = 1 * SIZE, A + adds AO3 = 2 * SIZE, A + adds AO4 = 3 * SIZE, A + adds AO5 = 4 * SIZE, A + adds AO6 = 5 * SIZE, A + adds AO7 = 6 * SIZE, A + adds AO8 = 7 * SIZE, A + add A = LDA, A + ;; + mov f8 = f0 + mov f9 = f0 + mov f10 = f0 + mov f11 = f0 + mov f12 = f0 + mov f13 = f0 + mov f14 = f0 + mov f15 = f0 + + mov pr.rot= 0 + shr I = M, 3 + mov ar.ec = 2 + ;; + mov X1 = BUFFER + adds X2 = 2 * SIZE, BUFFER + ;; + cmp.eq p16, p0 = r0, r0 + ;; + adds I = -1, I + ;; + mov ar.lc = I + cmp.eq p6, p0 = -1, I + (p6) br.cond.dpnt .L45 + ;; + .align 16 + +.L42: + (p16) LDFD f32 = [AO1], 8 * SIZE + (p16) LDFD f34 = [AO3], 8 * SIZE + (p16) LDFD f36 = [AO5], 8 * SIZE + (p16) LDFD f38 = [AO7], 8 * SIZE + ;; + (p16) LDFD f33 = [AO2], 8 * SIZE + (p16) LDFD f35 = [AO4], 8 * SIZE + (p16) LDFD f37 = [AO6], 8 * SIZE + (p16) LDFD f39 = [AO8], 8 * SIZE + ;; + (p16) LDFD f96 = [X1], 1 * SIZE + (p16) LDFD f98 = [X2], 1 * SIZE + ;; + (p16) LDFD f97 = [X1], 3 * SIZE + (p16) LDFD f99 = [X2], 3 * SIZE + ;; + (p16) LDFD f100 = [X1], 1 * SIZE + (p16) LDFD f102 = [X2], 1 * SIZE + ;; + (p16) LDFD f101 = [X1], 3 * SIZE + (p16) LDFD f103 = [X2], 3 * SIZE + ;; + (p16) FMA f8 = f96, f32, f8 + ;; + (p16) FMA f8 = f97, f33, f8 + ;; + (p16) FMA f8 = f98, f34, f8 + ;; + (p16) FMA f8 = f99, f35, f8 + ;; + (p16) FMA f8 = f100, f36, f8 + ;; + (p16) FMA f8 = f101, f37, f8 + ;; + (p16) FMA f8 = f102, f38, f8 + ;; + (p16) FMA f8 = f103, f39, f8 + br.ctop.sptk.few .L42 + ;; + .align 16 + +.L45: + tbit.nz p13, p11 = M, 2 + tbit.nz p14, p12 = M, 1 + ;; + { .mmi + (p11) adds AO5 = - 4 * SIZE, AO5 + } + { .mbb + (p11) adds AO7 = - 4 * SIZE, AO7 + } 
+ ;; + { .mmi + (p13) LDFD f32 = [AO1] + (p13) LDFD f34 = [AO3] + tbit.nz p15, p0 = M, 0 + } + { .mmi + (p14) LDFD f36 = [AO5] + (p11) adds AO6 = - 4 * SIZE, AO6 + (p12) adds AO7 = - 2 * SIZE, AO7 + } + ;; + (p13) LDFD f33 = [AO2] + (p13) LDFD f35 = [AO4] + (p14) LDFD f37 = [AO6] + (p15) LDFD f38 = [AO7] + ;; + (p13) LDFD f96 = [X1], 1 * SIZE + (p13) LDFD f98 = [X2], 1 * SIZE + ;; + (p13) LDFD f97 = [X1], 3 * SIZE + (p13) LDFD f99 = [X2], 3 * SIZE + ;; + (p14) LDFD f100 = [X1], 1 * SIZE + ;; + (p14) LDFD f101 = [X1], 1 * SIZE + ;; + (p15) LDFD f102 = [X1], 1 * SIZE + ;; + (p13) FMA f8 = f96, f32, f8 + ;; + (p13) FMA f8 = f97, f33, f8 + ;; + (p13) FMA f8 = f98, f34, f8 + ;; + (p13) FMA f8 = f99, f35, f8 + ;; + (p14) FMA f8 = f100, f36, f8 + ;; + (p14) FMA f8 = f101, f37, f8 + ;; + (p15) FMA f8 = f102, f38, f8 + ;; + LDFD f32 = [Y1], INCY + ;; + FMA f32 = ALPHA, f8, f32 + ;; + STFD [YY1] = f32 + .align 16 + +.L999: + mov ar.lc = ARLC + mov pr = PR, -1 + br.ret.sptk.many b0 + ;; + EPILOGUE diff --git a/kernel/ia64/qscal.S b/kernel/ia64/qscal.S new file mode 100644 index 0000000000..3f978afde5 --- /dev/null +++ b/kernel/ia64/qscal.S @@ -0,0 +1,693 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define PREFETCH_SIZE (16 * 16) + +#define ALPHA f8 + +#define N r32 +#define X1 r38 +#define INCX r39 + +#define X2 r14 +#define Y1 r15 +#define Y2 r16 +#define PRE1 r17 +#define I r18 +#define NAND15 r19 +#define INCX5 r20 +#define INCX8 r21 +#define XX r22 +#define PR r30 +#define ARLC r31 + + PROLOGUE + .prologue + PROFCODE + { .mfi + shladd INCX = INCX, BASE_SHIFT, r0 + fcmp.eq p0, p6 = ALPHA, f0 + .save ar.lc, ARLC + mov ARLC = ar.lc + } + .body + { .mib + cmp.ge p7, p0 = 0, N + (p7) br.ret.sptk.many b0 + } + ;; + { .mmi + mov XX = X1 + mov PR = pr + } + { .mmi + shladd INCX5 = INCX, 2, INCX + shladd INCX8 = INCX, 3, r0 + } + ;; + { .mmi + shladd X2 = INCX, 2, X1 + nop.m 0 + mov ar.ec = 5 + } + { .mmi + and NAND15 = 15, N + nop.m 0 + shr I = N, 4 + } + ;; + { .mmi + adds I = -1, I + nop.m 0 + tbit.z p0, p12 = N, 3 + } + { .mmb + cmp.ge p9, p0 = 0, NAND15 + nop.m 0 + (p6) br.cond.dptk .L100 // if (alpha != 0) goto L3 + } + ;; + { .mmi + adds PRE1 = (PREFETCH_SIZE + 4) * SIZE, X1 + mov ar.lc = I + } + { .mmb + cmp.gt p8, p0 = 0, I + (p8) br.cond.dpnt .L30 + } + ;; + .align 32 + +.L20: + {.mmi + STFD [X1] = f0 + STFD [X2] = f0 + nop.i 0 + } + {.mmi + lfetch.excl.nt1 [PRE1], INCX8 + add X1 = INCX, X1 + add X2 = INCX, X2 + } + ;; + {.mmi + STFD [X1] = f0 + STFD [X2] = f0 + nop.i 0 + } + {.mmi + add X1 = INCX, X1 + add X2 = INCX, X2 + nop.i 0 + } + ;; + {.mmi + STFD [X1] = f0 + STFD [X2] = f0 + nop.i 0 + } + {.mmi + add X1 = INCX, X1 + add X2 = INCX, X2 + nop.i 0 + } + ;; + {.mmi + STFD [X1] = f0 + STFD [X2] = f0 + nop.i 0 + } + {.mmi + add X1 = INCX5, X1 + add X2 = INCX5, X2 + nop.i 0 + } + ;; + {.mmi + STFD [X1] = f0 + STFD [X2] = f0 + nop.i 0 + } + {.mmi + lfetch.excl.nt1 [PRE1], INCX8 + add X1 = INCX, X1 + add X2 = INCX, X2 + } + ;; + {.mmi + STFD [X1] = f0 + STFD [X2] = f0 + nop.i 0 + } + {.mmi + add X1 = INCX, X1 + add X2 = INCX, X2 + nop.i 0 + } + ;; + {.mmi + STFD [X1] = f0 + STFD [X2] = f0 + nop.i 0 + } + {.mmi + add X1 = INCX, X1 + add X2 = INCX, X2 + nop.i 0 + } + ;; + {.mmi + STFD [X1] = f0 + STFD [X2] = f0 + nop.i 0 + } + {.mmb + add X1 = INCX5, X1 + add X2 = INCX5, X2 + br.cloop.sptk.few .L20 + } + ;; + .align 16 + +.L30: + { .mmi + (p12) STFD [X1] = f0 + (p12) STFD [X2] = f0 + mov ar.lc = ARLC + } + { .mmb + (p12) add X1 = INCX, X1 + (p12) add X2 = INCX, X2 + (p9) br.ret.sptk.many b0 + } + ;; + { .mmi + (p12) STFD [X1] = f0 + (p12) add X1 = INCX, X1 + tbit.z p0, p13 = N, 2 + } + { .mmi + (p12) STFD [X2] = f0 + (p12) add X2 = INCX, X2 + tbit.z p0, p14 = N, 1 + } + ;; + { .mmi + (p12) STFD [X1] = f0 + (p12) add X1 = INCX, X1 + tbit.z p0, p15 = N, 0 + } + { .mmb + (p12) STFD [X2] = f0 + (p12) add X2 = INCX, X2 + nop __LINE__ + } + ;; + { .mmb + (p12) STFD [X1] = f0 + (p12) add X1 = INCX5, X1 + nop __LINE__ + } + { .mmb + (p12) STFD [X2] = f0 + (p12) add X2 = INCX5, X2 + nop __LINE__ + } + ;; + { .mmb + (p13) STFD [X1] = f0 + (p13) add X1 = INCX, X1 + nop __LINE__ + } + ;; + { .mmb + (p13) STFD [X1] = f0 + (p13) add X1 = INCX, X1 + nop __LINE__ + } + ;; + { .mmb + (p13) STFD [X1] = f0 + (p13) add X1 = INCX, X1 + nop __LINE__ + } + ;; + { .mmb + (p13) STFD [X1] = f0 + (p13) add X1 = INCX, X1 + nop __LINE__ + } + ;; + { .mmb + (p14) STFD [X1] = f0 + (p14) add X1 = INCX, X1 + nop __LINE__ + } + ;; + { .mmb + (p14) STFD [X1] = f0 + (p14) add X1 = INCX, X1 + nop __LINE__ + } + ;; + { .mmb + (p15) STFD [X1] = f0 + nop.m 0 + br.ret.sptk.many b0 + } + ;; + .align 32 + 
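+ // .L100 below is the alpha != 0 path: a software-pipelined x[i] = alpha * x[i], 16 elements per iteration; when alpha == 0 the code above stores zeros without reading x.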
+.L100: + { .mmi + mov Y1 = X1 + shladd Y2 = INCX, 2, X1 + mov pr.rot= 0 + } + ;; + { .mmi + mov ar.lc = I + } + cmp.eq p16, p0 = r0, r0 + ;; + + { .mmi + adds PRE1 = (PREFETCH_SIZE + 4) * SIZE, X1 + nop.m 0 + mov.i ar.ec = 6 + } + { .mmb + cmp.gt p8, p0 = 0, I + nop.m 0 + (p8) br.cond.dpnt .L320 + } + ;; + .align 32 + +.L310: + { .mmf + (p16) lfetch.excl.nt1 [PRE1], INCX8 + (p22) STFD [Y1] = f12 + (p21) FMPY f6 = ALPHA, f37 + } + { .mmi + (p16) LDFD f32 = [X1], INCX + nop __LINE__ + (p22) add Y1 = INCX, Y1 + } + ;; + { .mmf + (p22) STFD [Y1] = f13 + (p16) LDFD f38 = [X1], INCX + (p21) FMPY f7 = ALPHA, f43 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p22) add Y1 = INCX, Y1 + } + ;; + { .mmf + (p22) STFD [Y1] = f14 + (p16) LDFD f44 = [X1], INCX + (p21) FMPY f10 = ALPHA, f49 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p22) add Y1 = INCX, Y1 + } + ;; + { .mmf + (p22) STFD [Y1] = f15 + (p16) LDFD f50 = [X1], INCX + (p21) FMPY f11 = ALPHA, f55 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p22) add Y1 = INCX, Y1 + } + ;; + { .mmf + (p21) STFD [Y1] = f6 + (p16) LDFD f56 = [X1], INCX + (p21) FMPY f12 = ALPHA, f61 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p21) add Y1 = INCX, Y1 + } + ;; + { .mmf + (p16) lfetch.excl.nt1 [PRE1], INCX8 + (p21) STFD [Y1] = f7 + (p21) FMPY f13 = ALPHA, f67 + } + { .mmi + (p16) LDFD f62 = [X1], INCX + nop __LINE__ + (p21) add Y1 = INCX, Y1 + } + ;; + { .mmf + (p21) STFD [Y1] = f10 + (p16) LDFD f68 = [X1], INCX + (p21) FMPY f14 = ALPHA, f73 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p21) add Y1 = INCX, Y1 + } + ;; + { .mmf + (p21) STFD [Y1] = f11 + (p16) LDFD f74 = [X1], INCX + (p21) FMPY f15 = ALPHA, f79 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p21) add Y1 = INCX, Y1 + } + ;; + { .mmf + (p21) STFD [Y1] = f12 + (p16) LDFD f80 = [X1], INCX + (p21) FMPY f6 = ALPHA, f85 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p21) add Y1 = INCX, Y1 + } + ;; + { .mmf + (p21) STFD [Y1] = f13 + (p16) LDFD f86 = [X1], INCX + (p21) FMPY f7 = ALPHA, f91 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p21) add Y1 = INCX, Y1 + } + ;; + { .mmf + (p21) STFD [Y1] = f14 + (p16) LDFD f92 = [X1], INCX + (p21) FMPY f10 = ALPHA, f97 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p21) add Y1 = INCX, Y1 + } + ;; + { .mmf + (p21) STFD [Y1] = f15 + (p16) LDFD f98 = [X1], INCX + (p21) FMPY f11 = ALPHA, f103 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p21) add Y1 = INCX, Y1 + } + ;; + { .mmf + (p21) STFD [Y1] = f6 + (p16) LDFD f104 = [X1], INCX + (p21) FMPY f12 = ALPHA, f109 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p21) add Y1 = INCX, Y1 + } + ;; + { .mmf + (p21) STFD [Y1] = f7 + (p16) LDFD f110 = [X1], INCX + (p21) FMPY f13 = ALPHA, f115 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p21) add Y1 = INCX, Y1 + } + ;; + { .mmf + (p21) STFD [Y1] = f10 + (p16) LDFD f116 = [X1], INCX + (p21) FMPY f14 = ALPHA, f121 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p21) add Y1 = INCX, Y1 + } + ;; + { .mmf + (p21) STFD [Y1] = f11 + (p16) LDFD f122 = [X1], INCX + (p21) FMPY f15 = ALPHA, f127 + } + { .mmb + nop __LINE__ + (p21) add Y1 = INCX, Y1 + br.ctop.sptk.few .L310 + } + ;; + { .mmi + STFD [Y1] = f12 + add Y1 = INCX, Y1 + shladd Y2 = INCX, 2, X1 + } + ;; + { .mmi + STFD [Y1] = f13 + add Y1 = INCX, Y1 + shladd X2 = INCX, 2, X1 + } + ;; + { .mmi + STFD [Y1] = f14 + nop __LINE__ + add Y1 = INCX, Y1 + } + ;; + { .mmi + STFD [Y1] = f15 + nop __LINE__ + add Y1 = INCX, Y1 + } + ;; + .align 16 + +.L320: + { .mmi + (p12) LDFD f48 = [X1], INCX + (p12) LDFD f52 = [X2], INCX + mov ar.lc = ARLC + } + ;; + { 
.mmi + (p12) LDFD f49 = [X1], INCX + (p12) LDFD f53 = [X2], INCX + mov pr = PR, -65474 + } + { .mmb + nop __LINE__ + nop __LINE__ + (p9) br.ret.sptk.many b0 + } + ;; + { .mmi + (p12) LDFD f50 = [X1], INCX + (p12) LDFD f54 = [X2], INCX + tbit.z p0, p13 = N, 2 + } + ;; + { .mmi + (p12) LDFD f51 = [X1], INCX5 + (p12) LDFD f55 = [X2], INCX5 + tbit.z p0, p14 = N, 1 + } + ;; + (p13) LDFD f56 = [X1], INCX + tbit.z p0, p15 = N, 0 + ;; + (p13) LDFD f57 = [X1], INCX + ;; + { .mmf + (p13) LDFD f58 = [X1], INCX + nop __LINE__ + (p12) FMPY f48 = ALPHA, f48 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FMPY f52 = ALPHA, f52 + } + ;; + { .mmf + (p13) LDFD f59 = [X1], INCX + nop __LINE__ + (p12) FMPY f49 = ALPHA, f49 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FMPY f53 = ALPHA, f53 + } + ;; + { .mmf + (p14) LDFD f60 = [X1], INCX + nop __LINE__ + (p12) FMPY f50 = ALPHA, f50 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FMPY f54 = ALPHA, f54 + } + ;; + { .mmf + (p14) LDFD f61 = [X1], INCX + nop __LINE__ + (p12) FMPY f51 = ALPHA, f51 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FMPY f55 = ALPHA, f55 + } + ;; + { .mmf + (p12) STFD [Y1] = f48 + (p12) STFD [Y2] = f52 + (p13) FMPY f56 = ALPHA, f56 + } + { .mmi + (p15) LDFD f62 = [X1] + (p12) add Y1 = INCX, Y1 + (p12) add Y2 = INCX, Y2 + } + ;; + { .mmf + (p12) STFD [Y1] = f49 + (p12) STFD [Y2] = f53 + (p13) FMPY f57 = ALPHA, f57 + } + { .mmi + (p12) add Y1 = INCX, Y1 + (p12) add Y2 = INCX, Y2 + nop __LINE__ + } + ;; + { .mmf + (p12) STFD [Y1] = f50 + (p12) STFD [Y2] = f54 + (p13) FMPY f58 = ALPHA, f58 + } + { .mmi + (p12) add Y1 = INCX, Y1 + (p12) add Y2 = INCX, Y2 + nop __LINE__ + } + ;; + { .mmf + (p12) STFD [Y1] = f51 + (p12) STFD [Y2] = f55 + (p13) FMPY f59 = ALPHA, f59 + } + { .mmi + (p12) add Y1 = INCX5, Y1 + (p12) add Y2 = INCX5, Y2 + nop __LINE__ + } + ;; + { .mfi + (p13) STFD [Y1] = f56 + (p14) FMPY f60 = ALPHA, f60 + (p13) add Y1 = INCX, Y1 + } + ;; + { .mfi + (p13) STFD [Y1] = f57 + (p14) FMPY f61 = ALPHA, f61 + (p13) add Y1 = INCX, Y1 + } + ;; + { .mfi + (p13) STFD [Y1] = f58 + (p15) FMPY f62 = ALPHA, f62 + (p13) add Y1 = INCX, Y1 + } + ;; + { .mmi + (p13) STFD [Y1] = f59 + nop __LINE__ + (p13) add Y1 = INCX, Y1 + } + ;; + { .mmi + (p14) STFD [Y1] = f60 + nop __LINE__ + (p14) add Y1 = INCX, Y1 + } + ;; + { .mmi + (p14) STFD [Y1] = f61 + nop __LINE__ + (p14) add Y1 = INCX, Y1 + } + ;; + { .mib + (p15) STFD [Y1] = f62 + mov pr = PR, -65474 + br.ret.sptk.many b0 + } + EPILOGUE + diff --git a/kernel/ia64/rot.S b/kernel/ia64/rot.S new file mode 100644 index 0000000000..8e349f6538 --- /dev/null +++ b/kernel/ia64/rot.S @@ -0,0 +1,891 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef XDOUBLE +#define PREFETCH_SIZE ( 8 * 8 + 4) +#elif defined(DOUBLE) +#define PREFETCH_SIZE (16 * 8 + 8) +#else +#define PREFETCH_SIZE (32 * 8 + 16) +#endif + +#define N r32 +#define X1 r33 +#define INCX r34 +#define Y1 r35 +#define INCY r36 + +#define PREX r2 +#define PREY r3 + +#define I r14 +#define J r15 +#define Y2 r16 +#define X2 r17 + +#define INCX16 r18 +#define INCY16 r19 + +#define PR r30 +#define ARLC r31 + +#define C f8 +#define S f9 + + PROLOGUE + .prologue + PROFCODE + { .mmi + adds r29 = 16, r12 + shladd INCX = INCX, BASE_SHIFT, r0 + .save ar.lc, ARLC + mov ARLC = ar.lc + } + { .mib + cmp.lt p0, p6 = r0, N + shr I = N, 4 + (p6) br.ret.spnt.many b0 + } + .body + ;; + { .mmi +#ifdef XDOUBLE + LDFD S = [r29] +#else + nop __LINE__ +#endif + shladd INCY = INCY, BASE_SHIFT, r0 + mov PR = pr + } + { .mmi + mov X2 = X1 + mov Y2 = Y1 + mov pr.rot= 0 + } + ;; + { .mmi +#ifndef XDOUBLE + shladd INCX16 = INCX, 4, r0 + shladd INCY16 = INCY, 4, r0 +#else + shladd INCX16 = INCX, 3, r0 + shladd INCY16 = INCY, 3, r0 +#endif + mov ar.ec= 3 + } + { .mmi + adds I = -1, I + cmp.eq p16, p0 = r0, r0 + and J = 15, N + } + ;; + { .mmi + adds PREX = PREFETCH_SIZE * SIZE, X1 + adds PREY = PREFETCH_SIZE * SIZE, Y1 + mov ar.lc = I + } + { .mib + cmp.eq p6 ,p0 = -1, I + tbit.z p0, p12 = N, 3 + (p6) br.cond.dpnt .L15 + } + ;; + .align 32 + +.L12: + { .mmf + (p18) STFD [X2] = f6 + (p16) lfetch.excl.nt1 [PREY], INCY16 + (p18) FMA f12 = C, f40, f12 + } + { .mmf + (p17) LDFD f120 = [Y1], INCY + (p18) add X2 = X2, INCX + (p18) FMPY f6 = S, f94 + } + ;; + { .mmf + (p18) STFD [Y2] = f7 + (p16) lfetch.excl.nt1 [PREX], INCX16 + (p18) FNMA f13 = S, f40, f13 + } + { .mmf + (p16) LDFD f32 = [X1], INCX + (p18) add Y2 = Y2, INCY + (p18) FMPY f7 = C, f94 + } + ;; + { .mmf + (p18) STFD [X2] = f10 + (p17) LDFD f123 = [Y1], INCY + (p18) FMA f14 = C, f43, f14 + } + { .mmf + (p18) add X2 = X2, INCX + nop __LINE__ + (p18) FMPY f10 = S, f97 + } + ;; + { .mmf + (p18) STFD [Y2] = f11 + (p16) LDFD f35 = [X1], INCX + (p18) FNMA f15 = S, f43, f15 + } + { .mmf + (p18) add Y2 = Y2, INCY + nop __LINE__ + (p18) FMPY f11 = C, f97 + } + ;; + { .mmf + (p18) STFD [X2] = f12 + (p17) LDFD f126 = [Y1], INCY + (p18) FMPY f12 = S, f100 + } + { .mmf + (p18) add X2 = X2, INCX + nop __LINE__ + (p18) FMA f6 = C, 
f46, f6 + } + ;; + { .mmf + (p18) STFD [Y2] = f13 + (p16) LDFD f38 = [X1], INCX + (p18) FMPY f13 = C, f100 + } + { .mmf + (p18) add Y2 = Y2, INCY + nop __LINE__ + (p18) FNMA f7 = S, f46, f7 + } + ;; + { .mmf + (p18) STFD [X2] = f14 + (p16) LDFD f80 = [Y1], INCY + (p18) FMPY f14 = S, f103 + } + { .mmf + (p18) add X2 = X2, INCX + nop __LINE__ + (p18) FMA f10 = C, f49, f10 + } + ;; + { .mmf + (p18) STFD [Y2] = f15 + (p16) LDFD f41 = [X1], INCX + (p18) FMPY f15 = C, f103 + } + { .mmf + (p18) add Y2 = Y2, INCY + nop __LINE__ + (p18) FNMA f11 = S, f49, f11 + } + ;; + { .mmf + (p18) STFD [X2] = f6 + (p16) LDFD f83 = [Y1], INCY + (p18) FMA f12 = C, f52, f12 + } + { .mmf + (p18) add X2 = X2, INCX + nop __LINE__ + (p18) FMPY f6 = S, f106 + } + ;; + { .mmf + (p18) STFD [Y2] = f7 + (p16) LDFD f44 = [X1], INCX + (p18) FNMA f13 = S, f52, f13 + } + { .mmf + (p18) add Y2 = Y2, INCY + nop __LINE__ + (p18) FMPY f7 = C, f106 + } + ;; + { .mmf + (p18) STFD [X2] = f10 + (p16) LDFD f86 = [Y1], INCY + (p18) FMA f14 = C, f55, f14 + } + { .mmf + (p18) add X2 = X2, INCX + nop __LINE__ + (p18) FMPY f10 = S, f109 + } + ;; + { .mmf + (p18) STFD [Y2] = f11 + (p16) LDFD f47 = [X1], INCX + (p18) FNMA f15 = S, f55, f15 + } + { .mmf + (p18) add Y2 = Y2, INCY + nop __LINE__ + (p18) FMPY f11 = C, f109 + } + ;; + { .mmf + (p18) STFD [X2] = f12 + (p16) LDFD f89 = [Y1], INCY + (p18) FMPY f12 = S, f112 + } + { .mmf + (p18) add X2 = X2, INCX + nop __LINE__ + (p18) FMA f6 = C, f58, f6 + } + ;; + { .mmf + (p18) STFD [Y2] = f13 + (p16) LDFD f50 = [X1], INCX + (p18) FMPY f13 = C, f112 + } + { .mmf + (p18) add Y2 = Y2, INCY + nop __LINE__ + (p18) FNMA f7 = S, f58, f7 + } + ;; + { .mmf + (p18) STFD [X2] = f14 + (p16) LDFD f92 = [Y1], INCY + (p18) FMPY f14 = S, f115 + } + { .mmf + (p18) add X2 = X2, INCX + nop __LINE__ + (p18) FMA f10 = C, f61, f10 + } + ;; + { .mmf + (p18) STFD [Y2] = f15 + (p16) LDFD f53 = [X1], INCX + (p18) FMPY f15 = C, f115 + } + { .mmf + (p18) add Y2 = Y2, INCY + nop __LINE__ + (p18) FNMA f11 = S, f61, f11 + } + ;; +#ifndef XDOUBLE + { .mmf + (p18) STFD [X2] = f6 + (p16) LDFD f95 = [Y1], INCY + (p18) FMA f12 = C, f64, f12 + } + { .mmf + (p18) add X2 = X2, INCX + nop __LINE__ + (p18) FMPY f6 = S, f118 + } + ;; + { .mmf + (p18) STFD [Y2] = f7 + (p16) LDFD f56 = [X1], INCX + (p18) FNMA f13 = S, f64, f13 + } + { .mmf + (p18) add Y2 = Y2, INCY + nop __LINE__ + (p18) FMPY f7 = C, f118 + } + ;; +#else + { .mmf + (p18) STFD [X2] = f6 + (p16) lfetch.excl.nt1 [PREY], INCY16 + (p18) FMA f12 = C, f64, f12 + } + { .mmf + (p16) LDFD f95 = [Y1], INCY + (p18) add X2 = X2, INCX + (p18) FMPY f6 = S, f118 + } + ;; + { .mmf + (p18) STFD [Y2] = f7 + (p16) lfetch.excl.nt1 [PREX], INCX16 + (p18) FNMA f13 = S, f64, f13 + } + { .mmf + (p16) LDFD f56 = [X1], INCX + (p18) add Y2 = Y2, INCY + (p18) FMPY f7 = C, f118 + } + ;; +#endif + { .mmf + (p18) STFD [X2] = f10 + (p16) LDFD f98 = [Y1], INCY + (p18) FMA f14 = C, f67, f14 + } + { .mmf + (p18) add X2 = X2, INCX + nop __LINE__ + (p18) FMPY f10 = S, f121 + } + ;; + { .mmf + (p18) STFD [Y2] = f11 + (p16) LDFD f59 = [X1], INCX + (p18) FNMA f15 = S, f67, f15 + } + { .mmf + (p18) add Y2 = Y2, INCY + nop __LINE__ + (p18) FMPY f11 = C, f121 + } + ;; + { .mmf + (p18) STFD [X2] = f12 + (p16) LDFD f101 = [Y1], INCY + (p18) FMPY f12 = S, f124 + } + { .mmf + (p18) add X2 = X2, INCX + nop __LINE__ + (p18) FMA f6 = C, f70, f6 + } + ;; + { .mmf + (p18) STFD [Y2] = f13 + (p16) LDFD f62 = [X1], INCX + (p18) FMPY f13 = C, f124 + } + { .mmf + (p18) add Y2 = Y2, INCY + nop __LINE__ + (p18) FNMA f7 = S, f70, f7 
+ } + ;; + { .mmf + (p18) STFD [X2] = f14 + (p16) LDFD f104 = [Y1], INCY + (p18) FMPY f14 = S, f127 + } + { .mmf + (p18) add X2 = X2, INCX + nop __LINE__ + (p18) FMA f10 = C, f73, f10 + } + ;; + { .mmf + (p18) STFD [Y2] = f15 + (p16) LDFD f65 = [X1], INCX + (p18) FMPY f15 = C, f127 + } + { .mmf + (p18) add Y2 = Y2, INCY + nop __LINE__ + (p18) FNMA f11 = S, f73, f11 + } + ;; + { .mmf + (p18) STFD [X2] = f6 + (p16) LDFD f107 = [Y1], INCY + (p18) FMA f12 = C, f76, f12 + } + { .mmf + (p18) add X2 = X2, INCX + nop __LINE__ + (p17) FMPY f6 = S, f81 + } + ;; + { .mmf + (p18) STFD [Y2] = f7 + (p16) LDFD f68 = [X1], INCX + (p18) FNMA f13 = S, f76, f13 + } + { .mmf + (p18) add Y2 = Y2, INCY + nop __LINE__ + (p17) FMPY f7 = C, f81 + } + ;; + { .mmf + (p18) STFD [X2] = f10 + (p16) LDFD f110 = [Y1], INCY + (p18) FMA f14 = C, f79, f14 + } + { .mmf + (p18) add X2 = X2, INCX + nop __LINE__ + (p17) FMPY f10 = S, f84 + } + ;; + { .mmf + (p18) STFD [Y2] = f11 + (p16) LDFD f71 = [X1], INCX + (p18) FNMA f15 = S, f79, f15 + } + { .mmf + (p18) add Y2 = Y2, INCY + nop __LINE__ + (p17) FMPY f11 = C, f84 + } + ;; + { .mmf + (p18) STFD [X2] = f12 + (p16) LDFD f113 = [Y1], INCY + (p17) FMPY f12 = S, f87 + } + { .mmf + (p18) add X2 = X2, INCX + nop __LINE__ + (p17) FMA f6 = C, f33, f6 + } + ;; + { .mmf + (p18) STFD [Y2] = f13 + (p16) LDFD f74 = [X1], INCX + (p17) FMPY f13 = C, f87 + } + { .mmf + (p18) add Y2 = Y2, INCY + nop __LINE__ + (p17) FNMA f7 = S, f33, f7 + } + ;; + { .mmf + (p18) STFD [X2] = f14 + (p16) LDFD f116 = [Y1], INCY + (p17) FMPY f14 = S, f90 + } + { .mmf + (p18) add X2 = X2, INCX + nop __LINE__ + (p17) FMA f10 = C, f36, f10 + } + ;; + { .mmf + (p18) STFD [Y2] = f15 + (p16) LDFD f77 = [X1], INCX + (p17) FMPY f15 = C, f90 + } + { .mfb + (p18) add Y2 = Y2, INCY + (p17) FNMA f11 = S, f36, f11 + br.ctop.sptk.few .L12 + } + ;; + .align 32 + +.L15: + { .mmi + (p12) LDFD f40 = [Y1], INCY + (p12) LDFD f32 = [X1], INCX + mov ar.lc = ARLC + } + ;; + { .mmi + (p12) LDFD f41 = [Y1], INCY + (p12) LDFD f33 = [X1], INCX + mov pr = PR, -65474 + } + ;; + { .mmb + (p12) LDFD f42 = [Y1], INCY + cmp.eq p7, p0 = r0, J + (p7) br.ret.sptk.many b0 + } + ;; + { .mmf + (p12) LDFD f43 = [Y1], INCY + nop __LINE__ + (p12) FMPY f6 = S, f40 + } + ;; + { .mmf + (p12) LDFD f34 = [X1], INCX + nop __LINE__ + (p12) FMPY f7 = C, f40 + } + ;; + { .mmf + (p12) LDFD f44 = [Y1], INCY + nop __LINE__ + (p12) FMPY f10 = S, f41 + } + ;; + { .mmf + (p12) LDFD f35 = [X1], INCX + nop __LINE__ + (p12) FMPY f11 = C, f41 + } + ;; + { .mmf + (p12) LDFD f45 = [Y1], INCY + nop __LINE__ + (p12) FMPY f12 = S, f42 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FMA f6 = C, f32, f6 + } + ;; + { .mmf + (p12) LDFD f36 = [X1], INCX + nop __LINE__ + (p12) FMPY f13 = C, f42 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FNMA f7 = S, f32, f7 + } + ;; + { .mmf + (p12) LDFD f46 = [Y1], INCY + nop __LINE__ + (p12) FMPY f14 = S, f43 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FMA f10 = C, f33, f10 + } + ;; + { .mmf + (p12) LDFD f37 = [X1], INCX + nop __LINE__ + (p12) FMPY f15 = C, f43 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FNMA f11 = S, f33, f11 + } + ;; + { .mmf + (p12) STFD [X2] = f6 + (p12) LDFD f47 = [Y1], INCY + (p12) FMA f12 = C, f34, f12 + } + { .mfi + (p12) add X2 = X2, INCX + (p12) FMPY f6 = S, f44 + tbit.z p0, p13 = N, 2 + } + ;; + { .mmf + (p12) STFD [Y2] = f7 + (p12) LDFD f38 = [X1], INCX + (p12) FNMA f13 = S, f34, f13 + } + { .mmf + (p12) add Y2 = Y2, INCY + nop __LINE__ + (p12) FMPY f7 = C, f44 + } + ;; + { .mmf + (p12) STFD 
[X2] = f10 + (p13) LDFD f52 = [Y1], INCY + (p12) FMA f14 = C, f35, f14 + } + { .mmf + (p12) add X2 = X2, INCX + nop __LINE__ + (p12) FMPY f10 = S, f45 + } + ;; + { .mmf + (p12) STFD [Y2] = f11 + (p12) LDFD f39 = [X1], INCX + (p12) FNMA f15 = S, f35, f15 + } + { .mmf + (p12) add Y2 = Y2, INCY + nop __LINE__ + (p12) FMPY f11 = C, f45 + } + ;; + { .mmf + (p12) STFD [X2] = f12 + (p13) LDFD f53 = [Y1], INCY + (p12) FMPY f12 = S, f46 + } + { .mmf + (p12) add X2 = X2, INCX + nop __LINE__ + (p12) FMA f6 = C, f36, f6 + } + ;; + { .mmf + (p12) STFD [Y2] = f13 + (p13) LDFD f48 = [X1], INCX + (p12) FMPY f13 = C, f46 + } + { .mmf + (p12) add Y2 = Y2, INCY + nop __LINE__ + (p12) FNMA f7 = S, f36, f7 + } + ;; + { .mmf + (p12) STFD [X2] = f14 + (p13) LDFD f54 = [Y1], INCY + (p12) FMPY f14 = S, f47 + } + { .mmf + (p12) add X2 = X2, INCX + nop __LINE__ + (p12) FMA f10 = C, f37, f10 + } + ;; + { .mmf + (p12) STFD [Y2] = f15 + (p13) LDFD f49 = [X1], INCX + (p12) FMPY f15 = C, f47 + } + { .mfi + (p12) add Y2 = Y2, INCY + (p12) FNMA f11 = S, f37, f11 + tbit.z p0, p14 = N, 1 + } + ;; + { .mmf + (p12) STFD [X2] = f6 + (p13) LDFD f55 = [Y1], INCY + (p12) FMA f12 = C, f38, f12 + } + { .mmf + (p12) add X2 = X2, INCX + nop __LINE__ + (p13) FMPY f6 = S, f52 + } + ;; + { .mmf + (p12) STFD [Y2] = f7 + (p13) LDFD f50 = [X1], INCX + (p12) FNMA f13 = S, f38, f13 + } + { .mmf + (p12) add Y2 = Y2, INCY + nop __LINE__ + (p13) FMPY f7 = C, f52 + } + ;; + { .mmf + (p12) STFD [X2] = f10 + (p14) LDFD f58 = [Y1], INCY + (p12) FMA f14 = C, f39, f14 + } + { .mmf + (p12) add X2 = X2, INCX + nop __LINE__ + (p13) FMPY f10 = S, f53 + } + ;; + { .mmf + (p12) STFD [Y2] = f11 + (p13) LDFD f51 = [X1], INCX + (p12) FNMA f15 = S, f39, f15 + } + { .mfi + (p12) add Y2 = Y2, INCY + (p13) FMPY f11 = C, f53 + tbit.z p0, p15 = N, 0 + } + ;; + { .mmf + (p12) STFD [X2] = f12 + (p14) LDFD f59 = [Y1], INCY + (p13) FMPY f12 = S, f54 + } + { .mmf + (p12) add X2 = X2, INCX + nop __LINE__ + (p13) FMA f6 = C, f48, f6 + } + ;; + { .mmf + (p12) STFD [Y2] = f13 + (p14) LDFD f56 = [X1], INCX + (p13) FMPY f13 = C, f54 + } + { .mmf + (p12) add Y2 = Y2, INCY + nop __LINE__ + (p13) FNMA f7 = S, f48, f7 + } + ;; + { .mmf + (p12) STFD [X2] = f14 + (p15) LDFD f61 = [Y1], INCY + (p13) FMPY f14 = S, f55 + } + { .mmf + (p12) add X2 = X2, INCX + nop __LINE__ + (p13) FMA f10 = C, f49, f10 + } + ;; + { .mmf + (p12) STFD [Y2] = f15 + (p14) LDFD f57 = [X1], INCX + (p13) FMPY f15 = C, f55 + } + { .mmf + (p12) add Y2 = Y2, INCY + nop __LINE__ + (p13) FNMA f11 = S, f49, f11 + } + ;; + { .mmf + (p13) STFD [X2] = f6 + nop __LINE__ + (p13) FMA f12 = C, f50, f12 + } + { .mmf + (p13) add X2 = X2, INCX + nop __LINE__ + (p14) FMPY f6 = S, f58 + } + ;; + { .mmf + (p13) STFD [Y2] = f7 + (p15) LDFD f60 = [X1], INCX + (p13) FNMA f13 = S, f50, f13 + } + { .mmf + (p13) add Y2 = Y2, INCY + nop __LINE__ + (p14) FMPY f7 = C, f58 + } + ;; + { .mmf + (p13) STFD [X2] = f10 + nop __LINE__ + (p13) FMA f14 = C, f51, f14 + } + { .mmf + (p13) add X2 = X2, INCX + nop __LINE__ + (p14) FMPY f10 = S, f59 + } + ;; + { .mmf + (p13) STFD [Y2] = f11 + nop __LINE__ + (p13) FNMA f15 = S, f51, f15 + } + { .mmf + (p13) add Y2 = Y2, INCY + nop __LINE__ + (p14) FMPY f11 = C, f59 + } + ;; + { .mmf + (p13) STFD [X2] = f12 + nop __LINE__ + (p14) FMA f6 = C, f56, f6 + } + { .mmf + (p13) add X2 = X2, INCX + nop __LINE__ + (p15) FMPY f12 = S, f61 + } + ;; + { .mmf + (p13) STFD [Y2] = f13 + nop __LINE__ + (p14) FNMA f7 = S, f56, f7 + } + { .mmf + (p13) add Y2 = Y2, INCY + nop __LINE__ + (p15) FMPY f13 = C, f61 + } + ;; + { 
.mmf + (p13) STFD [X2] = f14 + (p13) add X2 = X2, INCX + (p14) FMA f10 = C, f57, f10 + } + ;; + { .mmf + (p13) STFD [Y2] = f15 + (p13) add Y2 = Y2, INCY + (p14) FNMA f11 = S, f57, f11 + } + ;; + { .mmf + (p14) STFD [X2] = f6 + (p14) add X2 = X2, INCX + (p15) FMA f12 = C, f60, f12 + } + ;; + { .mmf + (p14) STFD [Y2] = f7 + (p14) add Y2 = Y2, INCY + (p15) FNMA f13 = S, f60, f13 + } + ;; + { .mmi + (p14) STFD [X2] = f10 + (p14) add X2 = X2, INCX + nop __LINE__ + } + ;; + { .mmi + (p14) STFD [Y2] = f11 + (p14) add Y2 = Y2, INCY + nop __LINE__ + } + ;; + { .mmi + (p15) STFD [X2] = f12 + (p15) add X2 = X2, INCX + nop __LINE__ + } + ;; + { .mmb + (p15) STFD [Y2] = f13 + (p15) add Y2 = Y2, INCY + br.ret.sptk.many b0 + } + ;; + EPILOGUE + diff --git a/kernel/ia64/saxpy.S b/kernel/ia64/saxpy.S new file mode 100644 index 0000000000..c3b2c1b04e --- /dev/null +++ b/kernel/ia64/saxpy.S @@ -0,0 +1,1667 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define PREFETCHSIZE 64 * 8 + +#define N r32 +#define X r36 +#define INCX r37 +#define Y r38 +#define INCY r39 + +#define PRE1 r2 +#define PRE2 r3 + +#define I r14 +#define J r15 +#define Y1 r16 +#define Y2 r17 +#define X1 r18 +#define X2 r19 +#define INCX16 r20 +#define INCY16 r21 +#define YYY r25 +#define YY r27 +#define XA r28 +#define XB r29 +#define PR r30 +#define ARLC r31 + +#define ALPHA f8 +#define ALPHA_P f9 + + PROLOGUE + .prologue + PROFCODE + + { .mii + shladd INCX = INCX, BASE_SHIFT, r0 + .save ar.lc, ARLC + mov ARLC = ar.lc + tbit.nz p10, p0 = X, BASE_SHIFT + } + { .mfb + cmp.lt p0, p6 = r0, N + fcmp.eq p7, p0 = ALPHA, f0 + (p6) br.ret.sptk.many b0 + } + ;; + .body + { .mmi + (p10) LDFD f32 = [X], INCX + shladd INCY = INCY, BASE_SHIFT, r0 + mov PR = pr + } + { .mib + (p10) adds N = -1, N + mov YYY = Y + (p7) br.ret.sptk.many b0 + } + ;; + { .mmi + (p10) LDFD f33 = [Y], INCY + cmp.ne p13, p0 = SIZE, INCX + shr XA = X, 2 + } + { .mmi + shladd INCX16 = INCX, 4, r0 + shladd INCY16 = INCY, 4, r0 + nop.i 0 + } + ;; + { .mii + mov Y1 = Y + tbit.nz p11, p0 = Y, BASE_SHIFT + shr XB = Y, 2 + } + ;; + { .mmf + and XA = 0x3f, XA + and XB = 0x3f, XB + (p10) FMA f32 = ALPHA, f32, f33 + } + ;; + { .mmi + sub XA = XB, XA + shladd Y2 = INCY, 2, Y + mov pr.rot = 0x10000 + } + { .mbb + cmp.ne p14, p0 = SIZE, INCY + (p13) br.cond.dpnt .L100 + (p14) br.cond.dpnt .L100 + } + ;; + { .mmi + cmp.gt p14, p0 = r0, XA + ;; + and J = 15, N + shr I = N, 4 + } + { .mfb + (p14) adds XA = 64, XA + fpack ALPHA_P = f8, f8 + (p11) br.cond.dpnt .L30 + } + ;; + { .mmi + cmp.gt p14, p0 = 32, XA + cmp.lt p15, p0 = 58, XA + mov ar.ec = 3 + } + { .mmi + and J = 31, N + cmp.eq p16, p0 = r0, r0 + shr I = N, 5 + } + ;; + { .mmi + cmp.eq p9, p0 = r0, J + cmp.eq p7 ,p0 = 0, I + adds I = -1, I + } + { .mbb + nop.m 0 + (p14) br.cond.dpnt .L20 + (p15) br.cond.dpnt .L20 + } + ;; + { .mmi + (p10) STFD [YYY] = f32 + adds PRE1 = PREFETCHSIZE * SIZE, X + mov ar.lc = I + } + { .mib + adds PRE2 = (PREFETCHSIZE - 24) * SIZE, Y + tbit.z p0, p11 = N, 4 + (p7) br.cond.dpnt .L15 + } + ;; + .align 32 + +.L12: +/* 0 */ + { .mmf + (p18) stf8 [Y1] = f6, 2 * SIZE + (p16) lfetch.nt1 [PRE1], 32 * SIZE + (p18) fpma f12 = ALPHA_P, f46, f94 + } + { .mmi + (p16) ldf8 f32 = [X], 2 * SIZE + (p16) ldf8 f80 = [Y], 2 * SIZE + } + ;; +/* 1 */ + { .mmf + (p18) stf8 [Y1] = f7, 2 * SIZE + (p16) lfetch.excl.nt1 [PRE2], 32 * SIZE + (p18) fpma f13 = ALPHA_P, f49, f97 + } + { .mmi + (p16) ldf8 f35 = [X], 2 * SIZE + (p16) ldf8 f83 = [Y], 2 * SIZE + } + ;; +/* 2 */ + { .mmf + (p18) stf8 [Y1] = f10, 2 * SIZE + (p18) fpma f14 = ALPHA_P, f52, f100 + } + { .mmi + (p16) ldf8 f38 = [X], 2 * SIZE + (p16) ldf8 f86 = [Y], 2 * SIZE + } + ;; +/* 3 */ + { .mmf + (p18) stf8 [Y1] = f11, 2 * SIZE + (p18) fpma f15 = ALPHA_P, f55, f103 + } + { .mmi + (p16) ldf8 f41 = [X], 2 * SIZE + (p16) ldf8 f89 = [Y], 2 * SIZE + } + ;; +/* 4 */ + { .mmf + (p18) stf8 [Y1] = f12, 2 * SIZE + (p18) fpma f6 = ALPHA_P, f58, f106 + } + { .mmi + (p16) ldf8 f44 = [X], 2 * SIZE + (p16) ldf8 f92 = [Y], 2 * SIZE + } + ;; +/* 5 */ + { .mmf + (p18) stf8 [Y1] = f13, 2 * SIZE + (p18) fpma f7 = ALPHA_P, f61, f109 + } + { .mmi + (p16) ldf8 f47 = [X], 2 * SIZE + (p16) ldf8 f95 = [Y], 2 * SIZE + } + ;; +/* 6 */ + { .mmf + (p18) stf8 [Y1] = f14, 2 * SIZE + (p18) fpma f10 = ALPHA_P, f64, f112 + } + { .mmi + (p16) ldf8 f50 = [X], 2 * SIZE + (p16) ldf8 f98 = [Y], 2 * SIZE + } + ;; +/* 7 */ + { 
.mmf + (p18) stf8 [Y1] = f15, 2 * SIZE + (p18) fpma f11 = ALPHA_P, f67, f115 + } + { .mmi + (p16) ldf8 f53 = [X], 2 * SIZE + (p16) ldf8 f101 = [Y], 2 * SIZE + } + ;; +/* 8 */ + { .mmf + (p18) stf8 [Y1] = f6, 2 * SIZE + (p18) fpma f12 = ALPHA_P, f70, f118 + } + { .mmi + (p16) ldf8 f56 = [X], 2 * SIZE + (p16) ldf8 f104 = [Y], 2 * SIZE + } + ;; +/* 9 */ + { .mmf + (p18) stf8 [Y1] = f7, 2 * SIZE + (p18) fpma f13 = ALPHA_P, f73, f121 + } + { .mmi + (p16) ldf8 f59 = [X], 2 * SIZE + (p16) ldf8 f107 = [Y], 2 * SIZE + } + ;; +/* 10 */ + { .mmf + (p18) stf8 [Y1] = f10, 2 * SIZE + (p18) fpma f14 = ALPHA_P, f76, f124 + } + { .mmi + (p16) ldf8 f62 = [X], 2 * SIZE + (p16) ldf8 f110 = [Y], 2 * SIZE + } + ;; +/* 11 */ + { .mmf + (p18) stf8 [Y1] = f11, 2 * SIZE + (p18) fpma f15 = ALPHA_P, f79, f127 + } + { .mmi + (p16) ldf8 f65 = [X], 2 * SIZE + (p16) ldf8 f113 = [Y], 2 * SIZE + } + ;; +/* 12 */ + { .mmf + (p18) stf8 [Y1] = f12, 2 * SIZE + (p17) fpma f6 = ALPHA_P, f33, f81 + } + { .mmi + (p16) ldf8 f68 = [X], 2 * SIZE + (p16) ldf8 f116 = [Y], 2 * SIZE + } + ;; +/* 13 */ + { .mmf + (p18) stf8 [Y1] = f13, 2 * SIZE + (p17) fpma f7 = ALPHA_P, f36, f84 + } + { .mmi + (p16) ldf8 f71 = [X], 2 * SIZE + (p16) ldf8 f119 = [Y], 2 * SIZE + } + ;; +/* 14 */ + { .mmf + (p18) stf8 [Y1] = f14, 2 * SIZE + (p17) fpma f10 = ALPHA_P, f39, f87 + } + { .mmi + (p16) ldf8 f74 = [X], 2 * SIZE + (p16) ldf8 f122 = [Y], 2 * SIZE + } + ;; +/*15 */ + { .mmf + (p18) stf8 [Y1] = f15, 2 * SIZE + (p17) fpma f11 = ALPHA_P, f42, f90 + } + { .mmb + (p16) ldf8 f77 = [X], 2 * SIZE + (p16) ldf8 f125 = [Y], 2 * SIZE + br.ctop.sptk.few .L12 + } + ;; + .align 32 + +.L15: + { .mmi + (p11) ldf8 f32 = [X], 2 * SIZE + (p11) ldf8 f33 = [Y], 2 * SIZE + mov pr = PR, -65474 + } + ;; + { .mmi + (p11) ldf8 f34 = [X], 2 * SIZE + (p11) ldf8 f35 = [Y], 2 * SIZE + mov ar.lc = ARLC + } + ;; + { .mmb + (p11) ldf8 f36 = [X], 2 * SIZE + (p11) ldf8 f37 = [Y], 2 * SIZE + (p9) br.ret.sptk.many b0 + } + ;; + { .mmi + (p11) ldf8 f38 = [X], 2 * SIZE + (p11) ldf8 f39 = [Y], 2 * SIZE + tbit.z p0, p12 = N, 3 + } + ;; + { .mmi + (p11) ldf8 f40 = [X], 2 * SIZE + (p11) ldf8 f41 = [Y], 2 * SIZE + tbit.z p0, p13 = N, 2 + } + ;; + { .mmi + (p11) ldf8 f42 = [X], 2 * SIZE + (p11) ldf8 f43 = [Y], 2 * SIZE + tbit.z p0, p14 = N, 1 + } + ;; + { .mmf + (p11) ldf8 f44 = [X], 2 * SIZE + (p11) ldf8 f45 = [Y], 2 * SIZE + (p11) fpma f6 = ALPHA_P, f32, f33 + } + ;; + { .mmf + (p11) ldf8 f46 = [X], 2 * SIZE + (p11) ldf8 f47 = [Y], 2 * SIZE + (p11) fpma f7 = ALPHA_P, f34, f35 + } + ;; + { .mmf + (p12) ldf8 f48 = [X], 2 * SIZE + (p12) ldf8 f49 = [Y], 2 * SIZE + (p11) fpma f10 = ALPHA_P, f36, f37 + } + ;; + { .mmi + (p11) stf8 [Y1] = f6, 2 * SIZE + nop.m 0 + tbit.z p0, p15 = N, 0 + } + { .mmf + (p12) ldf8 f50 = [X], 2 * SIZE + (p12) ldf8 f51 = [Y], 2 * SIZE + (p11) fpma f11 = ALPHA_P, f38, f39 + } + ;; + { .mmi + (p11) stf8 [Y1] = f7, 2 * SIZE + nop.m 0 + nop.i 0 + } + { .mmf + (p12) ldf8 f52 = [X], 2 * SIZE + (p12) ldf8 f53 = [Y], 2 * SIZE + } + ;; + { .mmi + (p11) stf8 [Y1] = f10, 2 * SIZE + nop.m 0 + nop.i 0 + } + { .mmf + (p12) ldf8 f54 = [X], 2 * SIZE + (p12) ldf8 f55 = [Y], 2 * SIZE + (p11) fpma f12 = ALPHA_P, f40, f41 + } + ;; + { .mmi + (p11) stf8 [Y1] = f11, 2 * SIZE + nop.m 0 + nop.i 0 + } + { .mmf + (p13) ldf8 f56 = [X], 2 * SIZE + (p13) ldf8 f57 = [Y], 2 * SIZE + (p11) fpma f13 = ALPHA_P, f42, f43 + } + ;; + { .mmi + (p11) stf8 [Y1] = f12, 2 * SIZE + nop.m 0 + nop.i 0 + } + { .mmf + (p13) ldf8 f58 = [X], 2 * SIZE + (p13) ldf8 f59 = [Y], 2 * SIZE + (p11) fpma f14 = ALPHA_P, f44, f45 + } 
+ ;; + { .mmi + (p11) stf8 [Y1] = f13, 2 * SIZE + nop.m 0 + nop.i 0 + } + { .mmf + (p14) ldf8 f60 = [X], 2 * SIZE + (p14) ldf8 f61 = [Y], 2 * SIZE + (p11) fpma f15 = ALPHA_P, f46, f47 + } + ;; + { .mmi + (p11) stf8 [Y1] = f14, 2 * SIZE + nop.m 0 + nop.i 0 + } + { .mmf + (p15) ldfs f62 = [X] + (p15) ldfs f63 = [Y] + (p12) fpma f6 = ALPHA_P, f48, f49 + } + ;; + (p12) fpma f7 = ALPHA_P, f50, f51 + (p12) fpma f10 = ALPHA_P, f52, f53 + ;; + (p11) stf8 [Y1] = f15, 2 * SIZE + (p12) fpma f11 = ALPHA_P, f54, f55 + ;; + (p12) stf8 [Y1] = f6, 2 * SIZE + (p13) fpma f12 = ALPHA_P, f56, f57 + ;; + (p12) stf8 [Y1] = f7, 2 * SIZE + (p13) fpma f13 = ALPHA_P, f58, f59 + ;; + (p12) stf8 [Y1] = f10, 2 * SIZE + (p14) fpma f14 = ALPHA_P, f60, f61 + ;; + (p12) stf8 [Y1] = f11, 2 * SIZE + (p15) FMA f15 = ALPHA, f62, f63 + ;; + (p13) stf8 [Y1] = f12, 2 * SIZE + ;; + (p13) stf8 [Y1] = f13, 2 * SIZE + ;; + (p14) stf8 [Y1] = f14, 2 * SIZE + ;; + (p15) stfs [Y1] = f15 + br.ret.sptk.many b0 + ;; + .align 32 + +/* X is aligned; case 2 */ + +.L20: + { .mmi + (p10) STFD [YYY] = f32 + adds PRE1 = (PREFETCHSIZE - 28) * SIZE, X + mov ar.lc = I + } + { .mib + adds PRE2 = (PREFETCHSIZE + 4) * SIZE, Y + tbit.z p0, p11 = N, 4 + (p7) br.cond.dpnt .L25 + } + ;; + .align 32 + +.L22: +/* 0 */ + { .mmf + (p18) stf8 [Y1] = f6, 2 * SIZE + (p16) lfetch.nt1 [PRE1], 32 * SIZE + (p18) fpma f12 = ALPHA_P, f46, f94 + } + { .mmi + (p17) ldf8 f60 = [X], 2 * SIZE + (p16) ldf8 f80 = [Y], 2 * SIZE + } + ;; +/* 1 */ + { .mmf + (p18) stf8 [Y1] = f7, 2 * SIZE + (p16) lfetch.excl.nt1 [PRE2], 32 * SIZE + (p18) fpma f13 = ALPHA_P, f49, f97 + } + { .mmi + (p17) ldf8 f63 = [X], 2 * SIZE + (p16) ldf8 f83 = [Y], 2 * SIZE + } + ;; +/* 2 */ + { .mmf + (p18) stf8 [Y1] = f10, 2 * SIZE + (p18) fpma f14 = ALPHA_P, f52, f100 + } + { .mmi + (p17) ldf8 f66 = [X], 2 * SIZE + (p16) ldf8 f86 = [Y], 2 * SIZE + } + ;; +/* 3 */ + { .mmf + (p18) stf8 [Y1] = f11, 2 * SIZE + (p18) fpma f15 = ALPHA_P, f55, f103 + } + { .mmi + (p17) ldf8 f69 = [X], 2 * SIZE + (p16) ldf8 f89 = [Y], 2 * SIZE + } + ;; +/* 4 */ + { .mmf + (p18) stf8 [Y1] = f12, 2 * SIZE + (p18) fpma f6 = ALPHA_P, f58, f106 + } + { .mmi + (p17) ldf8 f72 = [X], 2 * SIZE + (p16) ldf8 f92 = [Y], 2 * SIZE + } + ;; +/* 5 */ + { .mmf + (p18) stf8 [Y1] = f13, 2 * SIZE + (p18) fpma f7 = ALPHA_P, f61, f109 + } + { .mmi + (p17) ldf8 f75 = [X], 2 * SIZE + (p16) ldf8 f95 = [Y], 2 * SIZE + } + ;; +/* 6 */ + { .mmf + (p18) stf8 [Y1] = f14, 2 * SIZE + (p18) fpma f10 = ALPHA_P, f64, f112 + } + { .mmi + (p17) ldf8 f78 = [X], 2 * SIZE + (p16) ldf8 f98 = [Y], 2 * SIZE + } + ;; +/* 7 */ + { .mmf + (p18) stf8 [Y1] = f15, 2 * SIZE + (p18) fpma f11 = ALPHA_P, f67, f115 + } + { .mmi + (p16) ldf8 f32 = [X], 2 * SIZE + (p16) ldf8 f101 = [Y], 2 * SIZE + } + ;; +/* 8 */ + { .mmf + (p18) stf8 [Y1] = f6, 2 * SIZE + (p18) fpma f12 = ALPHA_P, f70, f118 + } + { .mmi + (p16) ldf8 f35 = [X], 2 * SIZE + (p16) ldf8 f104 = [Y], 2 * SIZE + } + ;; +/* 9 */ + { .mmf + (p18) stf8 [Y1] = f7, 2 * SIZE + (p18) fpma f13 = ALPHA_P, f73, f121 + } + { .mmi + (p16) ldf8 f38 = [X], 2 * SIZE + (p16) ldf8 f107 = [Y], 2 * SIZE + } + ;; +/* 10 */ + { .mmf + (p18) stf8 [Y1] = f10, 2 * SIZE + (p18) fpma f14 = ALPHA_P, f76, f124 + } + { .mmi + (p16) ldf8 f41 = [X], 2 * SIZE + (p16) ldf8 f110 = [Y], 2 * SIZE + } + ;; +/* 11 */ + { .mmf + (p18) stf8 [Y1] = f11, 2 * SIZE + (p18) fpma f15 = ALPHA_P, f79, f127 + } + { .mmi + (p16) ldf8 f44 = [X], 2 * SIZE + (p16) ldf8 f113 = [Y], 2 * SIZE + } + ;; +/* 12 */ + { .mmf + (p18) stf8 [Y1] = f12, 2 * SIZE + (p17) fpma f6 = ALPHA_P, 
f33, f81 + } + { .mmi + (p16) ldf8 f47 = [X], 2 * SIZE + (p16) ldf8 f116 = [Y], 2 * SIZE + } + ;; +/* 13 */ + { .mmf + (p18) stf8 [Y1] = f13, 2 * SIZE + (p17) fpma f7 = ALPHA_P, f36, f84 + } + { .mmi + (p16) ldf8 f50 = [X], 2 * SIZE + (p16) ldf8 f119 = [Y], 2 * SIZE + } + ;; +/* 14 */ + { .mmf + (p18) stf8 [Y1] = f14, 2 * SIZE + (p17) fpma f10 = ALPHA_P, f39, f87 + } + { .mmi + (p16) ldf8 f53 = [X], 2 * SIZE + (p16) ldf8 f122 = [Y], 2 * SIZE + } + ;; +/*15 */ + { .mmf + (p18) stf8 [Y1] = f15, 2 * SIZE + (p17) fpma f11 = ALPHA_P, f42, f90 + } + { .mmb + (p16) ldf8 f56 = [X], 2 * SIZE + (p16) ldf8 f125 = [Y], 2 * SIZE + br.ctop.sptk.few .L22 + } + ;; + .align 32 + +.L25: + { .mmi + (p11) ldf8 f32 = [X], 2 * SIZE + (p11) ldf8 f33 = [Y], 2 * SIZE + mov pr = PR, -65474 + } + ;; + { .mmi + (p11) ldf8 f34 = [X], 2 * SIZE + (p11) ldf8 f35 = [Y], 2 * SIZE + mov ar.lc = ARLC + } + ;; + { .mmb + (p11) ldf8 f36 = [X], 2 * SIZE + (p11) ldf8 f37 = [Y], 2 * SIZE + (p9) br.ret.sptk.many b0 + } + ;; + { .mmi + (p11) ldf8 f38 = [X], 2 * SIZE + (p11) ldf8 f39 = [Y], 2 * SIZE + tbit.z p0, p12 = N, 3 + } + ;; + { .mmi + (p11) ldf8 f40 = [X], 2 * SIZE + (p11) ldf8 f41 = [Y], 2 * SIZE + tbit.z p0, p13 = N, 2 + } + ;; + { .mmi + (p11) ldf8 f42 = [X], 2 * SIZE + (p11) ldf8 f43 = [Y], 2 * SIZE + tbit.z p0, p14 = N, 1 + } + ;; + { .mmf + (p11) ldf8 f44 = [X], 2 * SIZE + (p11) ldf8 f45 = [Y], 2 * SIZE + (p11) fpma f6 = ALPHA_P, f32, f33 + } + ;; + { .mmf + (p11) ldf8 f46 = [X], 2 * SIZE + (p11) ldf8 f47 = [Y], 2 * SIZE + (p11) fpma f7 = ALPHA_P, f34, f35 + } + ;; + { .mmf + (p12) ldf8 f48 = [X], 2 * SIZE + (p12) ldf8 f49 = [Y], 2 * SIZE + (p11) fpma f10 = ALPHA_P, f36, f37 + } + ;; + { .mmi + (p11) stf8 [Y1] = f6, 2 * SIZE + nop.m 0 + tbit.z p0, p15 = N, 0 + } + { .mmf + (p12) ldf8 f50 = [X], 2 * SIZE + (p12) ldf8 f51 = [Y], 2 * SIZE + (p11) fpma f11 = ALPHA_P, f38, f39 + } + ;; + { .mmi + (p11) stf8 [Y1] = f7, 2 * SIZE + nop.m 0 + nop.i 0 + } + { .mmf + (p12) ldf8 f52 = [X], 2 * SIZE + (p12) ldf8 f53 = [Y], 2 * SIZE + } + ;; + { .mmi + (p11) stf8 [Y1] = f10, 2 * SIZE + nop.m 0 + nop.i 0 + } + { .mmf + (p12) ldf8 f54 = [X], 2 * SIZE + (p12) ldf8 f55 = [Y], 2 * SIZE + (p11) fpma f12 = ALPHA_P, f40, f41 + } + ;; + { .mmi + (p11) stf8 [Y1] = f11, 2 * SIZE + nop.m 0 + nop.i 0 + } + { .mmf + (p13) ldf8 f56 = [X], 2 * SIZE + (p13) ldf8 f57 = [Y], 2 * SIZE + (p11) fpma f13 = ALPHA_P, f42, f43 + } + ;; + { .mmi + (p11) stf8 [Y1] = f12, 2 * SIZE + nop.m 0 + nop.i 0 + } + { .mmf + (p13) ldf8 f58 = [X], 2 * SIZE + (p13) ldf8 f59 = [Y], 2 * SIZE + (p11) fpma f14 = ALPHA_P, f44, f45 + } + ;; + { .mmi + (p11) stf8 [Y1] = f13, 2 * SIZE + nop.m 0 + nop.i 0 + } + { .mmf + (p14) ldf8 f60 = [X], 2 * SIZE + (p14) ldf8 f61 = [Y], 2 * SIZE + (p11) fpma f15 = ALPHA_P, f46, f47 + } + ;; + { .mmi + (p11) stf8 [Y1] = f14, 2 * SIZE + nop.m 0 + nop.i 0 + } + { .mmf + (p15) ldfs f62 = [X] + (p15) ldfs f63 = [Y] + (p12) fpma f6 = ALPHA_P, f48, f49 + } + ;; + (p12) fpma f7 = ALPHA_P, f50, f51 + (p12) fpma f10 = ALPHA_P, f52, f53 + ;; + (p11) stf8 [Y1] = f15, 2 * SIZE + (p12) fpma f11 = ALPHA_P, f54, f55 + ;; + (p12) stf8 [Y1] = f6, 2 * SIZE + (p13) fpma f12 = ALPHA_P, f56, f57 + ;; + (p12) stf8 [Y1] = f7, 2 * SIZE + (p13) fpma f13 = ALPHA_P, f58, f59 + ;; + (p12) stf8 [Y1] = f10, 2 * SIZE + (p14) fpma f14 = ALPHA_P, f60, f61 + ;; + (p12) stf8 [Y1] = f11, 2 * SIZE + (p15) FMA f15 = ALPHA, f62, f63 + ;; + (p13) stf8 [Y1] = f12, 2 * SIZE + ;; + (p13) stf8 [Y1] = f13, 2 * SIZE + ;; + (p14) stf8 [Y1] = f14, 2 * SIZE + ;; + (p15) stfs [Y1] = f15 + 
br.ret.sptk.many b0 + ;; + .align 32 + +.L30: + { .mmi + cmp.eq p9, p0 = r0, J + cmp.eq p7 ,p0 = 0, I + mov ar.ec = 4 + } + { .mmi + cmp.lt p12, p0 = 33, XA + adds I = -1, I + } + ;; + { .mmi + cmp.gt p14, p0 = 15, XA + cmp.lt p15, p0 = 60, XA + (p12) cmp.gt.unc p13, p0 = 53, XA + } + { .bbb + (p13) br.cond.dpnt .L40 + (p14) br.cond.dpnt .L40 + (p15) br.cond.dpnt .L40 + } + ;; + { .mmi + (p10) STFD [YYY] = f32 + adds PRE1 = (PREFETCHSIZE + 6) * SIZE, X + mov ar.lc = I + } + { .mib + adds PRE2 = (PREFETCHSIZE + 0) * SIZE, Y + tbit.z p0, p12 = N, 3 + (p7) br.cond.dpnt .L35 + } + ;; + .align 32 + +.L32: + { .mmf + (p19) STFD [Y1] = f6, 1 * SIZE + (p19) STFD [Y2] = f7, 1 * SIZE + (p18) FMA f6 = ALPHA, f34, f82 + } + { .mmf + (p16) LDFPD f32, f35 = [X], 2 * SIZE + (p16) LDFD f80 = [Y], 1 * SIZE + (p18) FMA f7 = ALPHA, f46, f94 + } + ;; + { .mmf + (p19) STFD [Y1] = f10, 1 * SIZE + (p19) STFD [Y2] = f11, 1 * SIZE + (p18) FMA f10 = ALPHA, f37, f85 + } + { .mmf + (p16) LDFPD f38, f41 = [X], 2 * SIZE + (p16) LDFPD f83, f86 = [Y], 2 * SIZE + (p18) FMA f11 = ALPHA, f49, f97 + } + ;; + { .mmf + (p19) STFD [Y1] = f12, 1 * SIZE + (p19) STFD [Y2] = f13, 1 * SIZE + (p18) FMA f12 = ALPHA, f40, f88 + } + { .mmf + (p16) LDFPD f44, f47 = [X], 2 * SIZE + (p16) LDFPD f89, f92 = [Y], 2 * SIZE + (p18) FMA f13 = ALPHA, f52, f100 + } + ;; + { .mmf + (p19) STFD [Y1] = f14, 5 * SIZE + (p19) STFD [Y2] = f15, 5 * SIZE + (p18) FMA f14 = ALPHA, f43, f91 + } + { .mmf + (p16) LDFPD f50, f53 = [X], 2 * SIZE + (p16) LDFPD f95, f98 = [Y], 2 * SIZE + (p18) FMA f15 = ALPHA, f55, f103 + } + ;; + { .mmf + (p18) STFD [Y1] = f6, 1 * SIZE + (p18) STFD [Y2] = f7, 1 * SIZE + (p18) FMA f6 = ALPHA, f58, f106 + } + { .mmf + (p16) LDFPD f56, f59 = [X], 2 * SIZE + (p16) LDFPD f101, f104 = [Y], 2 * SIZE + (p18) FMA f7 = ALPHA, f70, f118 + } + ;; + { .mmf + (p18) STFD [Y1] = f10, 1 * SIZE + (p18) STFD [Y2] = f11, 1 * SIZE + (p18) FMA f10 = ALPHA, f61, f109 + } + { .mmf + (p16) LDFPD f62, f65 = [X], 2 * SIZE + (p16) LDFPD f107, f110 = [Y], 2 * SIZE + (p18) FMA f11 = ALPHA, f73, f121 + } + ;; + { .mmf + (p18) STFD [Y1] = f12, 1 * SIZE + (p18) STFD [Y2] = f13, 1 * SIZE + (p18) FMA f12 = ALPHA, f64, f112 + } + { .mmf + (p16) LDFPD f68, f71 = [X], 2 * SIZE + (p16) LDFPD f113, f116 = [Y], 2 * SIZE + (p18) FMA f13 = ALPHA, f76, f124 + } + ;; + { .mmf + (p18) STFD [Y1] = f14, 5 * SIZE + (p18) STFD [Y2] = f15, 5 * SIZE + (p18) FMA f14 = ALPHA, f67, f115 + } + { .mmf + (p16) LDFPD f74, f77 = [X], 2 * SIZE + (p16) LDFPD f119, f122 = [Y], 2 * SIZE + (p18) FMA f15 = ALPHA, f79, f127 + } + ;; + { .mmi + (p16) lfetch.nt1 [PRE1], 16 * SIZE + (p16) lfetch.excl.nt1 [PRE2], 16 * SIZE + nop.i 0 + } + { .mmb + (p16) LDFD f125 = [Y], 1 * SIZE + nop.m 0 + br.ctop.sptk.few .L32 + } + ;; + .align 32 + +.L35: + { .mmi + (p12) LDFPD f32, f33 = [X], 2 * SIZE + (p12) LDFD f34 = [Y], 1 * SIZE; + mov pr = PR, -65474 + } + ;; + { .mmi + (p12) LDFPD f36, f37 = [X], 2 * SIZE + (p12) LDFPD f35, f38 = [Y], 2 * SIZE + mov ar.lc = ARLC + } + ;; + { .mmb + (p12) LDFPD f40, f41 = [X], 2 * SIZE + (p12) LDFPD f39, f42 = [Y], 2 * SIZE + (p9) br.ret.sptk.many b0 + } + ;; + { .mmi + (p12) LDFPD f44, f45 = [X], 2 * SIZE + (p12) LDFPD f43, f46 = [Y], 2 * SIZE + tbit.z p0, p13 = N, 2 + } + ;; + { .mmi + (p13) LDFPD f48, f49 = [X], 2 * SIZE + (p12) LDFD f47 = [Y], 1 * SIZE + tbit.z p0, p14 = N, 1 + } + ;; + { .mmi + (p13) LDFPD f52, f53 = [X], 2 * SIZE + (p13) LDFD f50 = [Y], 1 * SIZE + tbit.z p0, p15 = N, 0 + } + ;; + { .mmi + (p14) LDFPD f56, f57 = [X], 2 * SIZE + (p13) LDFPD f51, f54 = 
[Y], 2 * SIZE + mov YY = Y1; + } + ;; + (p15) LDFD f60 = [X] + (p13) LDFD f55 = [Y], 1 * SIZE + ;; + (p14) LDFD f58 = [Y], 1 * SIZE + (p12) FMA f6 = ALPHA, f32, f34 + (p12) FMA f7 = ALPHA, f40, f42 + ;; + (p14) LDFD f59 = [Y], 1 * SIZE + (p12) shladd YY = INCY, 3, YY + (p12) FMA f10 = ALPHA, f33, f35 + (p12) FMA f11 = ALPHA, f41, f43 + ;; + (p15) LDFD f61 = [Y] + (p13) shladd YY = INCY, 2, YY + (p12) FMA f12 = ALPHA, f36, f38 + (p12) FMA f13 = ALPHA, f44, f46 + ;; + (p12) STFD [Y1] = f6, 1 * SIZE + (p12) FMA f14 = ALPHA, f37, f39 + (p12) STFD [Y2] = f7, 1 * SIZE + (p12) FMA f15 = ALPHA, f45, f47 + ;; + (p12) STFD [Y1] = f10, 1 * SIZE + (p13) FMA f6 = ALPHA, f48, f50 + (p12) STFD [Y2] = f11, 1 * SIZE + (p14) FMA f7 = ALPHA, f56, f58 + ;; + (p12) STFD [Y1] = f12, 1 * SIZE + (p13) FMA f10 = ALPHA, f49, f51 + (p12) STFD [Y2] = f13, 1 * SIZE + (p14) FMA f11 = ALPHA, f57, f59 + ;; + (p12) STFD [Y1] = f14, 5 * SIZE + (p13) FMA f12 = ALPHA, f52, f54 + (p12) STFD [Y2] = f15, 5 * SIZE + (p15) FMA f13 = ALPHA, f60, f61 + ;; + (p13) STFD [Y1] = f6, 1 * SIZE + (p14) STFD [YY] = f7, 1 * SIZE + (p13) FMA f14 = ALPHA, f53, f55 + ;; + (p13) STFD [Y1] = f10, 1 * SIZE + (p14) STFD [YY] = f11, 1 * SIZE + ;; + (p13) STFD [Y1] = f12, 1 * SIZE + (p15) STFD [YY] = f13 + ;; + (p13) STFD [Y1] = f14 + br.ret.sptk.many b0 + ;; + .align 32 + +.L40: + { .mmi + (p10) STFD [YYY] = f32 + adds PRE1 = (PREFETCHSIZE + 38) * SIZE, X + mov ar.lc = I + } + { .mib + adds PRE2 = (PREFETCHSIZE + 14) * SIZE, Y + tbit.z p0, p12 = N, 3 + (p7) br.cond.dpnt .L45 + } + ;; + .align 32 + +.L42: + { .mmf + (p19) STFD [Y1] = f6, 1 * SIZE + (p19) STFD [Y2] = f7, 1 * SIZE + (p18) FMA f6 = ALPHA, f34, f82 + } + { .mmf + (p16) lfetch.nt1 [PRE1], 16 * SIZE + (p17) LDFPD f102, f105 = [Y], 2 * SIZE + (p18) FMA f7 = ALPHA, f46, f94 + } + ;; + { .mmf + (p19) STFD [Y1] = f10, 1 * SIZE + (p19) STFD [Y2] = f11, 1 * SIZE + (p18) FMA f10 = ALPHA, f37, f85 + } + { .mmf + (p17) LDFPD f33, f36 = [X], 2 * SIZE + (p17) LDFPD f108, f111 = [Y], 2 * SIZE + (p18) FMA f11 = ALPHA, f49, f97 + } + ;; + { .mmf + (p19) STFD [Y1] = f12, 1 * SIZE + (p19) STFD [Y2] = f13, 1 * SIZE + (p18) FMA f12 = ALPHA, f40, f88 + } + { .mmf + (p17) LDFPD f39, f42 = [X], 2 * SIZE + (p17) LDFPD f114, f117 = [Y], 2 * SIZE + (p18) FMA f13 = ALPHA, f52, f100 + } + ;; + { .mmf + (p19) STFD [Y1] = f14, 5 * SIZE + (p19) STFD [Y2] = f15, 5 * SIZE + (p18) FMA f14 = ALPHA, f43, f91 + } + { .mmf + (p17) LDFPD f45, f48 = [X], 2 * SIZE + (p17) LDFPD f120, f123 = [Y], 2 * SIZE + (p18) FMA f15 = ALPHA, f55, f103 + } + ;; + { .mmf + (p18) STFD [Y1] = f6, 1 * SIZE + (p18) STFD [Y2] = f7, 1 * SIZE + (p18) FMA f6 = ALPHA, f58, f106 + } + { .mmf + (p17) LDFPD f51, f54 = [X], 2 * SIZE + (p17) LDFD f126 = [Y], 1 * SIZE + (p18) FMA f7 = ALPHA, f70, f118 + } + ;; + { .mmf + (p18) STFD [Y1] = f10, 1 * SIZE + (p18) STFD [Y2] = f11, 1 * SIZE + (p18) FMA f10 = ALPHA, f61, f109 + } + { .mmf + (p17) LDFPD f57, f60 = [X], 2 * SIZE + (p16) LDFD f80 = [Y], 1 * SIZE + (p18) FMA f11 = ALPHA, f73, f121 + } + ;; + { .mmf + (p18) STFD [Y1] = f12, 1 * SIZE + (p18) STFD [Y2] = f13, 1 * SIZE + (p18) FMA f12 = ALPHA, f64, f112 + } + { .mmf + (p17) LDFPD f63, f66 = [X], 2 * SIZE + (p16) LDFPD f83, f86 = [Y], 2 * SIZE + (p18) FMA f13 = ALPHA, f76, f124 + } + ;; + { .mmf + (p18) STFD [Y1] = f14, 5 * SIZE + (p18) STFD [Y2] = f15, 5 * SIZE + (p18) FMA f14 = ALPHA, f67, f115 + } + { .mmf + (p17) LDFPD f69, f72 = [X], 2 * SIZE + (p16) LDFPD f89, f92 = [Y], 2 * SIZE + (p18) FMA f15 = ALPHA, f79, f127 + } + ;; +#if 0 + (p16) 
lfetch.excl.nt1 [PRE2], 16 * SIZE +#endif + { .mmb + (p17) LDFPD f75, f78 = [X], 2 * SIZE + (p16) LDFPD f95, f98 = [Y], 2 * SIZE + br.ctop.sptk.few .L42 + } + ;; + { .mmf + (p19) STFD [Y1] = f6, 1 * SIZE + (p19) STFD [Y2] = f7, 1 * SIZE + } + ;; + { .mmf + (p19) STFD [Y1] = f10, 1 * SIZE + (p19) STFD [Y2] = f11, 1 * SIZE + } + ;; + { .mmf + (p19) STFD [Y1] = f12, 1 * SIZE + (p19) STFD [Y2] = f13, 1 * SIZE + } + ;; + { .mmf + (p19) STFD [Y1] = f14, 5 * SIZE + (p19) STFD [Y2] = f15, 5 * SIZE + } + ;; + .align 32 + +.L45: + { .mmi + (p12) LDFPD f32, f33 = [X], 2 * SIZE + (p12) LDFD f34 = [Y], 1 * SIZE; + mov pr = PR, -65474 + } + ;; + { .mmi + (p12) LDFPD f36, f37 = [X], 2 * SIZE + (p12) LDFPD f35, f38 = [Y], 2 * SIZE + mov ar.lc = ARLC + } + ;; + { .mmb + (p12) LDFPD f40, f41 = [X], 2 * SIZE + (p12) LDFPD f39, f42 = [Y], 2 * SIZE + (p9) br.ret.sptk.many b0 + } + ;; + { .mmi + (p12) LDFPD f44, f45 = [X], 2 * SIZE + (p12) LDFPD f43, f46 = [Y], 2 * SIZE + tbit.z p0, p13 = N, 2 + } + ;; + { .mmi + (p13) LDFPD f48, f49 = [X], 2 * SIZE + (p12) LDFD f47 = [Y], 1 * SIZE + tbit.z p0, p14 = N, 1 + } + ;; + { .mmi + (p13) LDFPD f52, f53 = [X], 2 * SIZE + (p13) LDFD f50 = [Y], 1 * SIZE + tbit.z p0, p15 = N, 0 + } + ;; + { .mmi + (p14) LDFPD f56, f57 = [X], 2 * SIZE + (p13) LDFPD f51, f54 = [Y], 2 * SIZE + mov YY = Y1; + } + ;; + (p15) LDFD f60 = [X] + (p13) LDFD f55 = [Y], 1 * SIZE + ;; + (p14) LDFD f58 = [Y], 1 * SIZE + (p12) FMA f6 = ALPHA, f32, f34 + (p12) FMA f7 = ALPHA, f40, f42 + ;; + (p14) LDFD f59 = [Y], 1 * SIZE + (p12) shladd YY = INCY, 3, YY + (p12) FMA f10 = ALPHA, f33, f35 + (p12) FMA f11 = ALPHA, f41, f43 + ;; + (p15) LDFD f61 = [Y] + (p13) shladd YY = INCY, 2, YY + (p12) FMA f12 = ALPHA, f36, f38 + (p12) FMA f13 = ALPHA, f44, f46 + ;; + (p12) STFD [Y1] = f6, 1 * SIZE + (p12) FMA f14 = ALPHA, f37, f39 + (p12) STFD [Y2] = f7, 1 * SIZE + (p12) FMA f15 = ALPHA, f45, f47 + ;; + (p12) STFD [Y1] = f10, 1 * SIZE + (p13) FMA f6 = ALPHA, f48, f50 + (p12) STFD [Y2] = f11, 1 * SIZE + (p14) FMA f7 = ALPHA, f56, f58 + ;; + (p12) STFD [Y1] = f12, 1 * SIZE + (p13) FMA f10 = ALPHA, f49, f51 + (p12) STFD [Y2] = f13, 1 * SIZE + (p14) FMA f11 = ALPHA, f57, f59 + ;; + (p12) STFD [Y1] = f14, 5 * SIZE + (p13) FMA f12 = ALPHA, f52, f54 + (p12) STFD [Y2] = f15, 5 * SIZE + (p15) FMA f13 = ALPHA, f60, f61 + ;; + (p13) STFD [Y1] = f6, 1 * SIZE + (p14) STFD [YY] = f7, 1 * SIZE + (p13) FMA f14 = ALPHA, f53, f55 + ;; + (p13) STFD [Y1] = f10, 1 * SIZE + (p14) STFD [YY] = f11, 1 * SIZE + ;; + (p13) STFD [Y1] = f12, 1 * SIZE + (p15) STFD [YY] = f13 + ;; + (p13) STFD [Y1] = f14 + br.ret.sptk.many b0 + ;; + .align 32 + +.L100: + { .mii + and J = 15, N + shr I = N, 4 + mov ar.ec = 3 + } + ;; + { .mmi + cmp.eq p9, p0 = r0, J + cmp.eq p7 ,p0 = 0, I + adds I = -1, I + } + ;; + { .mmi + (p10) STFD [YYY] = f32 + adds PRE1 = PREFETCHSIZE * SIZE, X + mov ar.lc = I + } + { .mib + adds PRE2 = PREFETCHSIZE * SIZE, Y + tbit.z p0, p12 = N, 3 + (p7) br.cond.dpnt .L115 + } + ;; + .align 32 + +.L112: + { .mmi + (p18) STFD [Y1] = f6 + (p16) lfetch.nt1 [PRE1], INCX16 + (p18) add Y1 = INCY, Y1 + } + {.mmf + (p16) LDFD f32 = [X], INCX + (p16) LDFD f80 = [Y], INCY + (p18) FMA f6 = ALPHA, f58, f106 + } + ;; + { .mmi + (p18) STFD [Y1] = f7 + (p16) lfetch.excl.nt1 [PRE2], INCY16 + (p18) add Y1 = INCY, Y1 + } + { .mmf + (p16) LDFD f35 = [X], INCX + (p16) LDFD f83 = [Y], INCY + (p18) FMA f7 = ALPHA, f61, f109 + } + ;; + { .mmi + (p18) STFD [Y1] = f10 + (p18) add Y1 = INCY, Y1 + nop.i 0 + } + { .mmf + (p16) LDFD f38 = [X], INCX + (p16) LDFD f86 = 
[Y], INCY + (p18) FMA f10 = ALPHA, f64, f112 + } + ;; + { .mmi + (p18) STFD [Y1] = f11 + (p18) add Y1 = INCY, Y1 + nop.i 0 + } + { .mmf + (p16) LDFD f41 = [X], INCX + (p16) LDFD f89 = [Y], INCY + (p18) FMA f11 = ALPHA, f67, f115 + } + ;; + { .mmi + (p18) STFD [Y1] = f12 + (p18) add Y1 = INCY, Y1 + nop.i 0 + } + { .mmf + (p16) LDFD f44 = [X], INCX + (p16) LDFD f92 = [Y], INCY + (p18) FMA f12 = ALPHA, f70, f118 + } + ;; + { .mmi + (p18) STFD [Y1] = f13 + (p18) add Y1 = INCY, Y1 + nop.i 0 + } + { .mmf + (p16) LDFD f47 = [X], INCX + (p16) LDFD f95 = [Y], INCY + (p18) FMA f13 = ALPHA, f73, f121 + } + ;; + { .mmi + (p18) STFD [Y1] = f14 + (p18) add Y1 = INCY, Y1 + nop.i 0 + } + { .mmf + (p16) LDFD f50 = [X], INCX + (p16) LDFD f98 = [Y], INCY + (p18) FMA f14 = ALPHA, f76, f124 + } + ;; + { .mmi + (p18) STFD [Y1] = f15 + (p18) add Y1 = INCY, Y1 + nop.i 0 + } + { .mmf + (p16) LDFD f53 = [X], INCX + (p16) LDFD f101 = [Y], INCY + (p18) FMA f15 = ALPHA, f79, f127 + } + ;; + { .mmi + (p18) STFD [Y1] = f6 + (p18) add Y1 = INCY, Y1 + nop.i 0 + } + { .mmf + (p16) LDFD f56 = [X], INCX + (p16) LDFD f104 = [Y], INCY + (p17) FMA f6 = ALPHA, f33, f81 + } + ;; + { .mmi + (p18) STFD [Y1] = f7 + (p18) add Y1 = INCY, Y1 + nop.i 0 + } + { .mmf + (p16) LDFD f59 = [X], INCX + (p16) LDFD f107 = [Y], INCY + (p17) FMA f7 = ALPHA, f36, f84 + } + ;; + { .mmi + (p18) STFD [Y1] = f10 + (p18) add Y1 = INCY, Y1 + nop.i 0 + } + { .mmf + (p16) LDFD f62 = [X], INCX + (p16) LDFD f110 = [Y], INCY + (p17) FMA f10 = ALPHA, f39, f87 + } + ;; + { .mmi + (p18) STFD [Y1] = f11 + (p18) add Y1 = INCY, Y1 + nop.i 0 + } + { .mmf + (p16) LDFD f65 = [X], INCX + (p16) LDFD f113 = [Y], INCY + (p17) FMA f11 = ALPHA, f42, f90 + } + ;; + { .mmi + (p18) STFD [Y1] = f12 + (p18) add Y1 = INCY, Y1 + nop.i 0 + } + { .mmf + (p16) LDFD f68 = [X], INCX + (p16) LDFD f116 = [Y], INCY + (p17) FMA f12 = ALPHA, f45, f93 + } + ;; + { .mmi + (p18) STFD [Y1] = f13 + (p18) add Y1 = INCY, Y1 + nop.i 0 + } + { .mmf + (p16) LDFD f71 = [X], INCX + (p16) LDFD f119 = [Y], INCY + (p17) FMA f13 = ALPHA, f48, f96 + } + ;; + { .mmi + (p18) STFD [Y1] = f14 + (p18) add Y1 = INCY, Y1 + nop.i 0 + } + { .mmf + (p16) LDFD f74 = [X], INCX + (p16) LDFD f122 = [Y], INCY + (p17) FMA f14 = ALPHA, f51, f99 + } + ;; + { .mmf + (p18) STFD [Y1] = f15 + (p18) add Y1 = INCY, Y1 + (p17) FMA f15 = ALPHA, f54, f102 + } + { .mmb + (p16) LDFD f77 = [X], INCX + (p16) LDFD f125 = [Y], INCY + br.ctop.sptk.few .L112 + } + ;; + .align 32 + +.L115: + (p12) LDFD f32 = [X], INCX + (p12) LDFD f34 = [Y], INCY + mov pr = PR, -65474 + ;; + (p12) LDFD f33 = [X], INCX + (p12) LDFD f35 = [Y], INCY + mov ar.lc = ARLC + ;; + (p12) LDFD f36 = [X], INCX + (p12) LDFD f38 = [Y], INCY + (p9) br.ret.sptk.many b0 + ;; + (p12) LDFD f37 = [X], INCX + (p12) LDFD f39 = [Y], INCY + tbit.z p0, p13 = N, 2 + ;; + (p12) LDFD f40 = [X], INCX + (p12) LDFD f42 = [Y], INCY + tbit.z p0, p14 = N, 1 + ;; + (p12) LDFD f41 = [X], INCX + (p12) LDFD f43 = [Y], INCY + tbit.z p0, p15 = N, 0 + ;; + { .mmf + (p12) LDFD f44 = [X], INCX + (p12) LDFD f46 = [Y], INCY + (p12) FMA f6 = ALPHA, f32, f34 + } + ;; + { .mmf + (p12) LDFD f45 = [X], INCX + (p12) LDFD f47 = [Y], INCY + (p12) FMA f7 = ALPHA, f33, f35 + } + ;; + { .mmf + (p13) LDFD f48 = [X], INCX + (p13) LDFD f50 = [Y], INCY + (p12) FMA f10 = ALPHA, f36, f38 + } + ;; + { .mmf + (p13) LDFD f49 = [X], INCX + (p13) LDFD f51 = [Y], INCY + (p12) FMA f11 = ALPHA, f37, f39 + } + ;; + { .mmi + (p12) STFD [Y1] = f6 + (p12) add Y1 = INCY, Y1 + nop.i 0 + } + { .mmf + (p13) LDFD f52 = [X], INCX + (p13) 
LDFD f54 = [Y], INCY + (p12) FMA f12 = ALPHA, f40, f42 + } + ;; + { .mmi + (p12) STFD [Y1] = f7 + (p12) add Y1 = INCY, Y1 + nop.i 0 + } + { .mmf + (p13) LDFD f53 = [X], INCX + (p13) LDFD f55 = [Y], INCY + (p12) FMA f13 = ALPHA, f41, f43 + } + ;; + { .mmi + (p12) STFD [Y1] = f10 + (p12) add Y1 = INCY, Y1 + nop.i 0 + } + { .mmf + (p14) LDFD f56 = [X], INCX + (p14) LDFD f58 = [Y], INCY + (p12) FMA f14 = ALPHA, f44, f46 + } + ;; + { .mmi + (p12) STFD [Y1] = f11 + (p12) add Y1 = INCY, Y1 + nop.i 0 + } + { .mmf + (p14) LDFD f57 = [X], INCX + (p14) LDFD f59 = [Y], INCY + (p12) FMA f15 = ALPHA, f45, f47 + } + ;; + { .mmi + (p12) STFD [Y1] = f12 + (p12) add Y1 = INCY, Y1 + nop.i 0 + } + { .mmf + (p15) LDFD f60 = [X] + (p15) LDFD f61 = [Y] + (p13) FMA f6 = ALPHA, f48, f50 + } + ;; + { .mmf + (p12) STFD [Y1] = f13 + (p12) add Y1 = INCY, Y1 + (p13) FMA f7 = ALPHA, f49, f51 + } + ;; + { .mmf + (p12) STFD [Y1] = f14 + (p12) add Y1 = INCY, Y1 + (p13) FMA f10 = ALPHA, f52, f54 + } + ;; + { .mmf + (p12) STFD [Y1] = f15 + (p12) add Y1 = INCY, Y1 + (p13) FMA f11 = ALPHA, f53, f55 + } + ;; + { .mmf + (p13) STFD [Y1] = f6 + (p13) add Y1 = INCY, Y1 + (p14) FMA f12 = ALPHA, f56, f58 + } + ;; + { .mmf + (p13) STFD [Y1] = f7 + (p13) add Y1 = INCY, Y1 + (p14) FMA f13 = ALPHA, f57, f59 + } + ;; + { .mmf + (p13) STFD [Y1] = f10 + (p13) add Y1 = INCY, Y1 + (p15) FMA f14 = ALPHA, f60, f61 + } + ;; + (p13) STFD [Y1] = f11 + (p13) add Y1 = INCY, Y1 + ;; + (p14) STFD [Y1] = f12 + (p14) add Y1 = INCY, Y1 + ;; + (p14) STFD [Y1] = f13 + (p14) add Y1 = INCY, Y1 + ;; + (p15) STFD [Y1] = f14 + br.ret.sptk.many b0 + ;; + EPILOGUE + diff --git a/kernel/ia64/scal.S b/kernel/ia64/scal.S new file mode 100644 index 0000000000..e3d93ddc59 --- /dev/null +++ b/kernel/ia64/scal.S @@ -0,0 +1,950 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef DOUBLE +#define PREFETCH_SIZE (8 * 16) +#else +#define PREFETCH_SIZE (1 * 64) +#endif + +#define ALPHA f8 + +#define N r32 +#define X1 r36 +#define INCX r37 + +#define X2 r14 +#define Y1 r15 +#define Y2 r16 +#define PRE1 r17 +#define I r18 +#define NAND15 r19 +#define INCX5 r20 +#define INCX16 r21 +#define XX r22 +#define PR r30 +#define ARLC r31 + + PROLOGUE + .prologue + PROFCODE + + { .mfi + shladd INCX = INCX, BASE_SHIFT, r0 + fcmp.eq p0, p6 = ALPHA, f0 + .save ar.lc, ARLC + mov ARLC = ar.lc + } + { .mib + cmp.ge p7, p0 = 0, N + tbit.z p0, p10 = X1, BASE_SHIFT + (p7) br.ret.sptk.many b0 + } + .body + ;; + { .mmi + mov XX = X1 + (p10) LDFD f32 = [X1], INCX + mov PR = pr + } + { .mmi + shladd INCX5 = INCX, 2, INCX + shladd INCX16 = INCX, 4, r0 + (p10) adds N = -1, N + } + ;; + { .mmi + shladd X2 = INCX, 2, X1 + nop __LINE__ + mov ar.ec = 5 + } + { .mmi + and NAND15 = 15, N + nop __LINE__ + shr I = N, 4 + } + ;; + { .mmi + adds I = -1, I + nop __LINE__ + tbit.z p0, p12 = N, 3 + } + { .mmb + cmp.ge p9, p0 = 0, NAND15 + adds PRE1 = PREFETCH_SIZE * SIZE + 192, XX + (p6) br.cond.dptk .L100 // if (alpha != 0) goto L3 + } + ;; + { .mmi + (p10) STFD [XX] = f0 + nop __LINE__ + mov ar.lc = I + } + { .mmb + cmp.gt p8, p0 = 0, I + (p8) br.cond.dpnt .L30 + } + ;; + .align 32 + +.L20: + {.mmi + STFD [X1] = f0 + STFD [X2] = f0 + nop __LINE__ + } + {.mmi + lfetch.excl.nt1 [PRE1], INCX16 + add X1 = INCX, X1 + add X2 = INCX, X2 + } + ;; + {.mmi + STFD [X1] = f0 + STFD [X2] = f0 + nop __LINE__ + } + {.mmi + add X1 = INCX, X1 + add X2 = INCX, X2 + nop __LINE__ + } + ;; + {.mmi + STFD [X1] = f0 + STFD [X2] = f0 + nop __LINE__ + } + {.mmi + add X1 = INCX, X1 + add X2 = INCX, X2 + nop __LINE__ + } + ;; + {.mmi + STFD [X1] = f0 + STFD [X2] = f0 + nop __LINE__ + } + {.mmi + add X1 = INCX5, X1 + add X2 = INCX5, X2 + nop __LINE__ + } + ;; + {.mmi + STFD [X1] = f0 + STFD [X2] = f0 + nop __LINE__ + } + {.mmi + add X1 = INCX, X1 + add X2 = INCX, X2 + nop __LINE__ + } + ;; + {.mmi + STFD [X1] = f0 + STFD [X2] = f0 + nop __LINE__ + } + {.mmi + add X1 = INCX, X1 + add X2 = INCX, X2 + nop __LINE__ + } + ;; + {.mmi + STFD [X1] = f0 + STFD [X2] = f0 + nop __LINE__ + } + {.mmi + add X1 = INCX, X1 + add X2 = INCX, X2 + nop __LINE__ + } + ;; + {.mmi + STFD [X1] = f0 + STFD [X2] = f0 + nop __LINE__ + } + {.mmb + add X1 = INCX5, X1 + add X2 = INCX5, X2 + br.cloop.sptk.few .L20 + } + ;; + .align 16 + +.L30: + { .mmi + (p12) STFD [X1] = f0 + (p12) STFD [X2] = f0 + mov ar.lc = ARLC + } + { .mmb + (p12) add X1 = INCX, X1 + (p12) add X2 = INCX, X2 + (p9) br.ret.sptk.many b0 + } + ;; + { .mmi + (p12) STFD [X1] = f0 + (p12) add X1 = INCX, X1 + tbit.z p0, p13 = N, 2 + } + { .mmi + (p12) STFD [X2] = f0 + (p12) add X2 = INCX, X2 + tbit.z p0, p14 = N, 1 + } + ;; + { .mmi + (p12) STFD [X1] = f0 + (p12) add X1 = INCX, X1 + tbit.z p0, p15 = N, 0 + } + { .mmb + (p12) STFD [X2] = f0 + (p12) add X2 = INCX, X2 + nop __LINE__ + } + ;; + { .mmb + (p12) STFD [X1] = f0 + (p12) add X1 = INCX5, X1 + nop __LINE__ + } + { .mmb + (p12) STFD [X2] = f0 + (p12) add X2 = INCX5, X2 + nop __LINE__ + } + ;; + { .mmb + (p13) STFD [X1] = f0 + (p13) add X1 = INCX, X1 + nop __LINE__ 
+ } + ;; + { .mmb + (p13) STFD [X1] = f0 + (p13) add X1 = INCX, X1 + nop __LINE__ + } + ;; + { .mmb + (p13) STFD [X1] = f0 + (p13) add X1 = INCX, X1 + nop __LINE__ + } + ;; + { .mmb + (p13) STFD [X1] = f0 + (p13) add X1 = INCX, X1 + nop __LINE__ + } + ;; + { .mmb + (p14) STFD [X1] = f0 + (p14) add X1 = INCX, X1 + nop __LINE__ + } + ;; + { .mmb + (p14) STFD [X1] = f0 + (p14) add X1 = INCX, X1 + nop __LINE__ + } + ;; + { .mmb + (p15) STFD [X1] = f0 + nop __LINE__ + br.ret.sptk.many b0 + } + ;; + .align 32 + +.L100: + { .mmi + mov Y1 = X1 + shladd Y2 = INCX, 2, X1 + mov pr.rot = 0 + } + { .mmf + cmp.gt p8, p0 = 0, I + shladd X2 = INCX, 2, X1 + (p10) FMPY f32 = ALPHA, f32 + } + ;; + { .mmi + (p10) STFD [XX] = f32 + cmp.eq p0, p7 = SIZE, INCX + mov ar.lc = I + } + { .mbb + cmp.eq p16, p0 = r0, r0 + (p7) br.cond.dpnt .L300 + (p8) br.cond.dpnt .L120 + } + ;; + .align 32 + +.L110: + { .mmf + (p21) STFD [Y1] = f6, 1 * SIZE + (p21) STFD [Y2] = f7, 1 * SIZE + (p20) FMPY f112 = ALPHA, f36 + } + { .mmf + (p16) lfetch.excl.nt1 [PRE1], 16 * SIZE + (p16) LDFPD f32, f37 = [X1], 2 * SIZE + (p20) FMPY f113 = ALPHA, f56 + } + ;; + { .mmf + (p21) STFD [Y1] = f10, 1 * SIZE + (p21) STFD [Y2] = f11, 1 * SIZE + (p20) FMPY f114 = ALPHA, f41 + } + { .mfi + (p16) LDFPD f42, f47 = [X1], 2 * SIZE + (p20) FMPY f115 = ALPHA, f61 + nop __LINE__ + } + ;; + { .mmf + (p21) STFD [Y1] = f12, 1 * SIZE + (p21) STFD [Y2] = f13, 1 * SIZE + (p20) FMPY f116 = ALPHA, f46 + } + { .mfi + (p16) LDFPD f52, f57 = [X1], 2 * SIZE + (p20) FMPY f117 = ALPHA, f66 + nop __LINE__ + } + ;; + { .mmf + (p21) STFD [Y1] = f14, 5 * SIZE + (p21) STFD [Y2] = f15, 5 * SIZE + (p20) FMPY f118 = ALPHA, f51 + } + { .mfi + (p16) LDFPD f62, f67 = [X1], 2 * SIZE + (p20) FMPY f119 = ALPHA, f71 + nop __LINE__ + } + ;; + { .mmf + (p20) STFD [Y1] = f112, 1 * SIZE + (p20) STFD [Y2] = f113, 1 * SIZE + (p20) FMPY f6 = ALPHA, f76 + } + { .mfi + (p16) LDFPD f72, f77 = [X1], 2 * SIZE + (p20) FMPY f7 = ALPHA, f96 + nop __LINE__ + } + ;; + { .mmf + (p20) STFD [Y1] = f114, 1 * SIZE + (p20) STFD [Y2] = f115, 1 * SIZE + (p20) FMPY f10 = ALPHA, f81 + } + { .mfi + (p16) LDFPD f82, f87 = [X1], 2 * SIZE + (p20) FMPY f11 = ALPHA, f101 + nop __LINE__ + } + ;; + { .mmf + (p20) STFD [Y1] = f116, 1 * SIZE + (p20) STFD [Y2] = f117, 1 * SIZE + (p20) FMPY f12 = ALPHA, f86 + } + { .mfi + (p16) LDFPD f92, f97 = [X1], 2 * SIZE + (p20) FMPY f13 = ALPHA, f106 + (p20) shladd X2 = INCX, 2, X1 + } + ;; + { .mmf + (p20) STFD [Y1] = f118, 5 * SIZE + (p20) STFD [Y2] = f119, 5 * SIZE + (p20) FMPY f14 = ALPHA, f91 + } + { .mfb + (p16) LDFPD f102, f107 = [X1], 2 * SIZE + (p20) FMPY f15 = ALPHA, f111 + br.ctop.sptk.few .L110 + } + ;; + .align 32 + +.L120: + { .mmi + (p21) STFD [Y1] = f6, 1 * SIZE + (p21) STFD [Y2] = f7, 1 * SIZE + tbit.z p0, p13 = N, 2 + } + { .mmi + (p12) LDFPD f32, f33 = [X1], 2 * SIZE + (p12) LDFPD f36, f37 = [X2], 2 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p21) STFD [Y1] = f10, 1 * SIZE + (p21) STFD [Y2] = f11, 1 * SIZE + mov ar.lc = ARLC + } + { .mmi + (p12) LDFPD f34, f35 = [X1] + (p12) LDFPD f38, f39 = [X2] + (p12) adds X1 = 6 * SIZE,X1 + } + ;; + { .mmi + (p21) STFD [Y1] = f12, 1 * SIZE + (p21) STFD [Y2] = f13, 1 * SIZE + tbit.z p0, p14 = N, 1 + } + { .mmi + (p13) LDFPD f40, f41 = [X1], 2 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mmi + (p21) STFD [Y1] = f14, 5 * SIZE + (p21) STFD [Y2] = f15, 5 * SIZE + mov pr = PR, -65474 + } + { .mib + (p13) LDFPD f42, f43 = [X1], 2 * SIZE + nop __LINE__ + (p9) br.ret.sptk.many b0 + } + ;; + { .mmi + (p14) LDFPD f44, f45 = [X1], 2 * 
SIZE + nop __LINE__ + tbit.z p0, p15 = N, 0 + } + ;; + { .mmi + (p15) LDFD f46 = [X1] + nop __LINE__ + nop __LINE__ + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FMPY f32 = ALPHA, f32 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FMPY f36 = ALPHA, f36 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FMPY f33 = ALPHA, f33 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FMPY f37 = ALPHA, f37 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FMPY f34 = ALPHA, f34 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FMPY f38 = ALPHA, f38 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FMPY f35 = ALPHA, f35 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FMPY f39 = ALPHA, f39 + } + ;; + { .mmf + (p12) STFD [Y1] = f32, 1 * SIZE + nop __LINE__ + (p13) FMPY f40 = ALPHA, f40 + } + { .mmf + (p12) STFD [Y2] = f36, 1 * SIZE + nop __LINE__ + (p13) FMPY f41 = ALPHA, f41 + } + ;; + { .mmf + (p12) STFD [Y1] = f33, 1 * SIZE + nop __LINE__ + (p13) FMPY f42 = ALPHA, f42 + } + { .mmf + (p12) STFD [Y2] = f37, 1 * SIZE + nop __LINE__ + (p13) FMPY f43 = ALPHA, f43 + } + ;; + { .mmf + (p12) STFD [Y1] = f34, 1 * SIZE + nop __LINE__ + (p14) FMPY f44 = ALPHA, f44 + } + { .mmf + (p12) STFD [Y2] = f38, 1 * SIZE + nop __LINE__ + (p14) FMPY f45 = ALPHA, f45 + } + ;; + { .mmf + (p12) STFD [Y1] = f35, 5 * SIZE + (p12) STFD [Y2] = f39, 5 * SIZE + (p15) FMPY f46 = ALPHA, f46 + } + ;; + { .mmi + (p13) STFD [Y1] = f40, 1 * SIZE + ;; + (p13) STFD [Y1] = f41, 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p13) STFD [Y1] = f42, 1 * SIZE + ;; + (p13) STFD [Y1] = f43, 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p14) STFD [Y1] = f44, 1 * SIZE + ;; + (p14) STFD [Y1] = f45, 1 * SIZE + nop __LINE__ + } + ;; + { .mmb + (p15) STFD [Y1] = f46 + nop __LINE__ + br.ret.sptk.many b0 + } + ;; + .align 32 + +.L300: + { .mmi + adds PRE1 = PREFETCH_SIZE * SIZE + 64, X1 + nop __LINE__ + mov.i ar.ec = 6 + } + { .mmb + cmp.gt p8, p0 = 0, I + nop __LINE__ + (p8) br.cond.dpnt .L320 + } + ;; + .align 32 + +.L310: + { .mmf + (p16) lfetch.excl.nt1 [PRE1], INCX16 + (p16) LDFD f32 = [X1], INCX + (p21) FMPY f6 = ALPHA, f37 + } + { .mmb + (p22) STFD [Y1] = f12 + (p22) add Y1 = INCX, Y1 + nop __LINE__ + } + ;; + { .mfb + (p16) LDFD f38 = [X1], INCX + (p21) FMPY f7 = ALPHA, f43 + nop __LINE__ + } + { .mmb + (p22) STFD [Y1] = f13 + (p22) add Y1 = INCX, Y1 + nop __LINE__ + } + ;; + { .mfb + (p16) LDFD f44 = [X1], INCX + (p21) FMPY f10 = ALPHA, f49 + nop __LINE__ + } + { .mmb + (p22) STFD [Y1] = f14 + (p22) add Y1 = INCX, Y1 + nop __LINE__ + } + ;; + { .mfb + (p16) LDFD f50 = [X1], INCX + (p21) FMPY f11 = ALPHA, f55 + nop __LINE__ + } + { .mmb + (p22) STFD [Y1] = f15 + (p22) add Y1 = INCX, Y1 + nop __LINE__ + } + ;; + { .mfb + (p16) LDFD f56 = [X1], INCX + (p21) FMPY f12 = ALPHA, f61 + nop __LINE__ + } + { .mmb + (p21) STFD [Y1] = f6 + (p21) add Y1 = INCX, Y1 + nop __LINE__ + } + ;; + { .mfb + (p16) LDFD f62 = [X1], INCX + (p21) FMPY f13 = ALPHA, f67 + nop __LINE__ + } + { .mmb + (p21) STFD [Y1] = f7 + (p21) add Y1 = INCX, Y1 + nop __LINE__ + } + ;; + { .mfb + (p16) LDFD f68 = [X1], INCX + (p21) FMPY f14 = ALPHA, f73 + nop __LINE__ + } + { .mmb + (p21) STFD [Y1] = f10 + (p21) add Y1 = INCX, Y1 + nop __LINE__ + } + ;; + { .mfb + (p16) LDFD f74 = [X1], INCX + (p21) FMPY f15 = ALPHA, f79 + nop __LINE__ + } + { .mmb + (p21) STFD [Y1] = f11 + (p21) add Y1 = INCX, Y1 + nop __LINE__ + } + ;; + { .mfb + (p16) LDFD f80 = [X1], INCX + (p21) FMPY f6 = ALPHA, f85 + nop __LINE__ + } + { .mmb + (p21) STFD [Y1] = f12 + (p21) add Y1 = 
INCX, Y1 + nop __LINE__ + } + ;; + { .mfb + (p16) LDFD f86 = [X1], INCX + (p21) FMPY f7 = ALPHA, f91 + nop __LINE__ + } + { .mmb + (p21) STFD [Y1] = f13 + (p21) add Y1 = INCX, Y1 + nop __LINE__ + } + ;; + { .mfb + (p16) LDFD f92 = [X1], INCX + (p21) FMPY f10 = ALPHA, f97 + nop __LINE__ + } + { .mmb + (p21) STFD [Y1] = f14 + (p21) add Y1 = INCX, Y1 + nop __LINE__ + } + ;; + { .mfb + (p16) LDFD f98 = [X1], INCX + (p21) FMPY f11 = ALPHA, f103 + nop __LINE__ + } + { .mmb + (p21) STFD [Y1] = f15 + (p21) add Y1 = INCX, Y1 + nop __LINE__ + } + ;; + { .mfb + (p16) LDFD f104 = [X1], INCX + (p21) FMPY f12 = ALPHA, f109 + nop __LINE__ + } + { .mmb + (p21) STFD [Y1] = f6 + (p21) add Y1 = INCX, Y1 + nop __LINE__ + } + ;; + { .mfb + (p16) LDFD f110 = [X1], INCX + (p21) FMPY f13 = ALPHA, f115 + nop __LINE__ + } + { .mmb + (p21) STFD [Y1] = f7 + (p21) add Y1 = INCX, Y1 + nop __LINE__ + } + ;; + { .mfb + (p16) LDFD f116 = [X1], INCX + (p21) FMPY f14 = ALPHA, f121 + nop __LINE__ + } + { .mmb + (p21) STFD [Y1] = f10 + (p21) add Y1 = INCX, Y1 + nop __LINE__ + } + ;; + { .mfb + (p16) LDFD f122 = [X1], INCX + (p21) FMPY f15 = ALPHA, f127 + nop __LINE__ + } + { .mmb + (p21) STFD [Y1] = f11 + (p21) add Y1 = INCX, Y1 + br.ctop.sptk.few .L310 + } + ;; + STFD [Y1] = f12 + add Y1 = INCX, Y1 + shladd Y2 = INCX, 2, X1 + ;; + STFD [Y1] = f13 + add Y1 = INCX, Y1 + shladd X2 = INCX, 2, X1 + ;; + STFD [Y1] = f14 + add Y1 = INCX, Y1 + ;; + STFD [Y1] = f15 + add Y1 = INCX, Y1 + ;; + .align 16 + +.L320: + { .mmi + (p12) LDFD f48 = [X1], INCX + (p12) LDFD f52 = [X2], INCX + mov ar.lc = ARLC + } + ;; + { .mmi + (p12) LDFD f49 = [X1], INCX + (p12) LDFD f53 = [X2], INCX + mov pr = PR, -65474 + } + { .mmb + nop.m 0 + nop.m 0 + (p9) br.ret.sptk.many b0 + } + ;; + { .mmi + (p12) LDFD f50 = [X1], INCX + (p12) LDFD f54 = [X2], INCX + tbit.z p0, p13 = N, 2 + } + ;; + { .mmi + (p12) LDFD f51 = [X1], INCX5 + (p12) LDFD f55 = [X2], INCX5 + tbit.z p0, p14 = N, 1 + } + ;; + (p13) LDFD f56 = [X1], INCX + tbit.z p0, p15 = N, 0 + ;; + (p13) LDFD f57 = [X1], INCX + ;; + { .mfi + (p13) LDFD f58 = [X1], INCX + (p12) FMPY f48 = ALPHA, f48 + } + { .mfi + (p12) FMPY f52 = ALPHA, f52 + } + ;; + { .mfi + (p13) LDFD f59 = [X1], INCX + (p12) FMPY f49 = ALPHA, f49 + } + { .mfi + (p12) FMPY f53 = ALPHA, f53 + } + ;; + { .mfi + (p14) LDFD f60 = [X1], INCX + (p12) FMPY f50 = ALPHA, f50 + } + { .mfi + (p12) FMPY f54 = ALPHA, f54 + } + ;; + { .mfi + (p14) LDFD f61 = [X1], INCX + (p12) FMPY f51 = ALPHA, f51 + } + { .mfi + (p12) FMPY f55 = ALPHA, f55 + } + ;; + { .mmf + (p12) STFD [Y1] = f48 + (p12) STFD [Y2] = f52 + (p13) FMPY f56 = ALPHA, f56 + } + { .mmi + (p15) LDFD f62 = [X1] + (p12) add Y1 = INCX, Y1 + (p12) add Y2 = INCX, Y2 + } + ;; + { .mmf + (p12) STFD [Y1] = f49 + (p12) STFD [Y2] = f53 + (p13) FMPY f57 = ALPHA, f57 + } + { .mmi + (p12) add Y1 = INCX, Y1 + (p12) add Y2 = INCX, Y2 + nop __LINE__ + } + ;; + { .mmf + (p12) STFD [Y1] = f50 + (p12) STFD [Y2] = f54 + (p13) FMPY f58 = ALPHA, f58 + } + { .mmi + (p12) add Y1 = INCX, Y1 + (p12) add Y2 = INCX, Y2 + nop __LINE__ + } + ;; + { .mmf + (p12) STFD [Y1] = f51 + (p12) STFD [Y2] = f55 + (p13) FMPY f59 = ALPHA, f59 + } + { .mmi + (p12) add Y1 = INCX5, Y1 + (p12) add Y2 = INCX5, Y2 + nop __LINE__ + } + ;; + { .mfi + (p13) STFD [Y1] = f56 + (p14) FMPY f60 = ALPHA, f60 + (p13) add Y1 = INCX, Y1 + } + ;; + { .mfi + (p13) STFD [Y1] = f57 + (p14) FMPY f61 = ALPHA, f61 + (p13) add Y1 = INCX, Y1 + } + ;; + { .mfi + (p13) STFD [Y1] = f58 + (p15) FMPY f62 = ALPHA, f62 + (p13) add Y1 = INCX, Y1 + } + ;; + { .mmi + 
(p13) STFD [Y1] = f59 + (p13) add Y1 = INCX, Y1 + } + ;; + { .mmi + (p14) STFD [Y1] = f60 + (p14) add Y1 = INCX, Y1 + } + ;; + { .mmi + (p14) STFD [Y1] = f61 + (p14) add Y1 = INCX, Y1 + } + ;; + { .mib + (p15) STFD [Y1] = f62 + mov pr = PR, -65474 + br.ret.sptk.many b0 + } + EPILOGUE + diff --git a/kernel/ia64/sdot.S b/kernel/ia64/sdot.S new file mode 100644 index 0000000000..5a058e7081 --- /dev/null +++ b/kernel/ia64/sdot.S @@ -0,0 +1,1177 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define PREFETCH_SIZE (8 * 16 + 4) + +#define N r32 +#define X1 r33 +#define INCX r34 +#define Y1 r35 +#define INCY r36 + +#define PREX r2 +#define PREY r3 + +#define I r14 +#define J r15 +#define Y2 r16 +#define X2 r17 +#define INCX16 r18 +#define INCY16 r19 +#define INCX5 r20 +#define INCY5 r21 +#define YY r22 +#define XA r23 +#define YA r24 +#define XX r25 + +#define PR r30 +#define ARLC r31 + + PROLOGUE + .prologue + PROFCODE + { .mfi + nop.m 0 + mov f8 = f0 + .save ar.lc, ARLC + mov ARLC = ar.lc + } + { .mfi + mov r26 = 1 + mov f9 = f0 + shr XA = X1, 3 + } + ;; + .body +#ifdef F_INTERFACE + LDINT N = [N] + LDINT INCX = [INCX] + LDINT INCY = [INCY] + ;; +#ifndef USE64BITINT + sxt4 N = N + sxt4 INCX = INCX + sxt4 INCY = INCY + ;; +#endif + + cmp.le p0, p6 = r0, INCX + cmp.le p0, p7 = r0, INCY + sub r26 = r26, N + ;; + setf.sig f32 = r26 + setf.sig f33 = INCX + setf.sig f34 = INCY + ;; + xmpy.l f33 = f32, f33 + xmpy.l f34 = f32, f34 + ;; + getf.sig r26 = f33 + getf.sig r27 = f34 + ;; + (p6) shladd X1 = r26, BASE_SHIFT, X1 + (p7) shladd Y1 = r27, BASE_SHIFT, Y1 + ;; +#endif + { .mfi + shladd INCX = INCX, BASE_SHIFT, r0 + mov f32 = f0 + mov PR = pr + } + { .mfb + cmp.lt p0, p6 = r0, N + mov f80 = f0 + (p6) br.ret.sptk.many b0 + } + ;; + { .mfi + shladd INCY = INCY, BASE_SHIFT, r0 + mov f10 = f0 + tbit.nz p15, p0 = X1, BASE_SHIFT + } + { .mfb + cmp.ne p6, p0 = SIZE, INCX + mov f11 = f0 + (p6) br.cond.dptk .L100 + } + ;; + { .mfi + (p15) LDFD f32 = [X1], INCX + mov f12 = f0 + mov pr.rot= 0 + } + { .mfi + (p15) adds N = -1, N + mov f13 = f0 + shr YA = Y1, 3 + } + ;; + { .mfi + (p15) LDFD f80 = [Y1], INCY + mov f14 = f0 + shr I = N, 4 + } + { .mmi + and J = 15, N + and XA = 0x1f, XA + and YA = 0x1f, YA + } + ;; + { .mmi + shladd INCX5 = INCX, 2, INCX + shladd INCY5 = INCY, 2, INCY + sub XA = YA, XA + } + { .mmi + shladd INCX16 = INCX, 4, r0 + shladd INCY16 = INCY, 4, r0 + tbit.z p0, p12 = N, 3 + } + ;; + { .mmi + shladd Y2 = INCY, 2, Y1 + cmp.eq p7, p0 = r0, J + mov ar.ec= 3 + } + { .mmi + adds I = -1, I + cmp.ge p8, p0 = 4, XA + cmp.eq p16, p0 = r0, r0 + } + ;; + { .mbb + cmp.le p9, p0 = 24, XA + (p8) br.cond.dpnt .L20 + (p9) br.cond.dpnt .L20 + } + ;; + { .mmi + adds PREX = PREFETCH_SIZE * SIZE, X1 + adds PREY = (PREFETCH_SIZE + 6) * SIZE, Y1 + mov ar.lc = I + } + { .mfb + cmp.eq p6 ,p0 = -1, I + FMA f15 = f32, f80, f0 + (p6) br.cond.dpnt .L15 + } + ;; + .align 32 + +/* INCX == 1 && X is aligned */ +.L12: + { .mmf + (p16) LDFPD f32, f35 = [X1], 2 * SIZE + (p16) lfetch.nt1 [PREX], INCX16 + (p18) FMA f8 = f34, f82, f8 + } + { .mmf + (p16) LDFD f80 = [Y1], INCY + (p16) LDFD f92 = [Y2], INCY + (p18) FMA f9 = f37, f85, f9 + } + ;; + { .mmf + (p16) LDFPD f38, f41 = [X1], 2 * SIZE + (p16) lfetch.nt1 [PREY], INCY16 + (p18) FMA f10 = f40, f88, f10 + } + { .mmf + (p16) LDFD f83 = [Y1], INCY + (p16) LDFD f95 = [Y2], INCY + (p18) FMA f11 = f43, f91, f11 + } + ;; + { .mmf + (p16) LDFPD f44, f47 = [X1], 2 * SIZE + (p18) FMA f12 = f46, f94, f12 + } + { .mmf + (p16) LDFD f86 = [Y1], INCY + (p16) LDFD f98 = [Y2], INCY + (p18) FMA f13 = f49, f97, f13 + } + ;; + { .mmf + (p16) LDFPD f50, f53 = [X1], 2 * SIZE + (p18) FMA f14 = f52, f100, f14 + } + { .mmf + (p16) LDFD f89 = [Y1], INCY5 + (p16) LDFD f101 = [Y2], INCY5 + (p18) FMA f15 = f55, f103, f15 + } + ;; + { .mmf + (p16) LDFPD f56, f59 = [X1], 2 * SIZE + (p18) FMA f8 = f58, f106, f8 + } + { .mmf + (p16) LDFD f104 = [Y1], INCY + (p16) LDFD 
f116 = [Y2], INCY + (p18) FMA f9 = f61, f109, f9 + } + ;; + { .mmf + (p16) LDFPD f62, f65 = [X1], 2 * SIZE + (p18) FMA f10 = f64, f112, f10 + } + { .mmf + (p16) LDFD f107 = [Y1], INCY + (p16) LDFD f119 = [Y2], INCY + (p18) FMA f11 = f67, f115, f11 + } + ;; + { .mmf + (p16) LDFPD f68, f71 = [X1], 2 * SIZE + (p18) FMA f12 = f70, f118, f12 + } + { .mmf + (p16) LDFD f110 = [Y1], INCY + (p16) LDFD f122 = [Y2], INCY + (p18) FMA f13 = f73, f121, f13 + } + ;; + { .mmf + (p16) LDFPD f74, f77 = [X1], 2 * SIZE + (p16) LDFD f113 = [Y1], INCY5 + (p18) FMA f14 = f76, f124, f14 + } + { .mfb + (p16) LDFD f125 = [Y2], INCY5 + (p18) FMA f15 = f79, f127, f15 + br.ctop.sptk.few .L12 + } + ;; + .align 32 + +.L15: + { .mmi + (p12) LDFPD f32, f33 = [X1], 2 * SIZE + mov YY = Y1 + tbit.z p0, p13 = N, 2 + } + { .mmb + (p12) LDFD f34 = [Y1], INCY + (p12) LDFD f42 = [Y2], INCY + (p7) br.cond.dptk .L999 + } + ;; + { .mmi + (p12) LDFPD f36, f37 = [X1], 2 * SIZE + (p12) shladd YY = INCY, 3, YY + tbit.z p0, p14 = N, 1 + } + { .mmi + (p12) LDFD f35 = [Y1], INCY + (p12) LDFD f43 = [Y2], INCY + tbit.z p0, p15 = N, 0 + } + ;; + { .mmi + (p12) LDFPD f40, f41 = [X1], 2 * SIZE + (p13) shladd YY = INCY, 2, YY + } + { .mmi + (p12) LDFD f38 = [Y1], INCY + (p12) LDFD f46 = [Y2], INCY + } + ;; + (p12) LDFPD f44, f45 = [X1], 2 * SIZE + (p12) LDFD f39 = [Y1], INCY5 + (p12) LDFD f47 = [Y2], INCY5 + ;; + (p13) LDFPD f48, f49 = [X1], 2 * SIZE + (p13) LDFD f50 = [Y1], INCY + (p14) LDFD f58 = [YY], INCY + ;; + (p13) LDFPD f52, f53 = [X1], 2 * SIZE + (p13) LDFD f51 = [Y1], INCY + (p14) LDFD f59 = [YY], INCY + ;; + (p14) LDFPD f56, f57 = [X1], 2 * SIZE + (p13) LDFD f54 = [Y1], INCY + (p15) LDFD f61 = [YY] + ;; + (p13) LDFD f55 = [Y1], INCY + (p15) LDFD f60 = [X1] + ;; + (p12) FMA f8 = f32, f34, f8 + (p12) FMA f9 = f33, f35, f9 + (p12) FMA f10 = f36, f38, f10 + (p12) FMA f11 = f37, f39, f11 + (p12) FMA f12 = f40, f42, f12 + (p12) FMA f13 = f41, f43, f13 + (p12) FMA f14 = f44, f46, f14 + (p12) FMA f15 = f45, f47, f15 + ;; + (p13) FMA f8 = f48, f50, f8 + (p13) FMA f9 = f49, f51, f9 + (p13) FMA f10 = f52, f54, f10 + (p13) FMA f11 = f53, f55, f11 + (p14) FMA f12 = f56, f58, f12 + (p14) FMA f13 = f57, f59, f13 + (p15) FMA f14 = f60, f61, f14 + br .L999 + ;; + .align 32 + +.L20: + { .mmi + adds PREX = PREFETCH_SIZE * SIZE, X1 + adds PREY = (PREFETCH_SIZE + 38) * SIZE, Y1 + mov ar.lc = I + } + { .mfb + cmp.eq p6 ,p0 = -1, I + FMA f15 = f32, f80, f0 + (p6) br.cond.dpnt .L25 + } + ;; + .align 32 + +.L22: + { .mmf + (p16) LDFPD f32, f35 = [X1], 2 * SIZE + (p16) lfetch.nt1 [PREX], INCX16 + (p18) FMA f8 = f34, f82, f8 + } + { .mmf + (p17) LDFD f81 = [Y1], INCY + (p17) LDFD f93 = [Y2], INCY + (p18) FMA f9 = f37, f85, f9 + } + ;; + { .mmf + (p16) LDFPD f38, f41 = [X1], 2 * SIZE + (p16) lfetch.nt1 [PREY], INCY16 + (p18) FMA f10 = f40, f88, f10 + } + { .mmf + (p17) LDFD f84 = [Y1], INCY + (p17) LDFD f96 = [Y2], INCY + (p18) FMA f11 = f43, f91, f11 + } + ;; + { .mmf + (p16) LDFPD f44, f47 = [X1], 2 * SIZE + (p18) FMA f12 = f46, f94, f12 + } + { .mmf + (p17) LDFD f87 = [Y1], INCY + (p17) LDFD f99 = [Y2], INCY + (p18) FMA f13 = f49, f97, f13 + } + ;; + { .mmf + (p16) LDFPD f50, f53 = [X1], 2 * SIZE + (p18) FMA f14 = f52, f100, f14 + } + { .mmf + (p17) LDFD f90 = [Y1], INCY5 + (p17) LDFD f102 = [Y2], INCY5 + (p18) FMA f15 = f55, f103, f15 + } + ;; + { .mmf + (p16) LDFPD f56, f59 = [X1], 2 * SIZE + (p18) FMA f8 = f58, f106, f8 + } + { .mmf + (p17) LDFD f105 = [Y1], INCY + (p17) LDFD f117 = [Y2], INCY + (p18) FMA f9 = f61, f109, f9 + } + ;; + { .mmf + (p16) LDFPD 
f62, f65 = [X1], 2 * SIZE + (p18) FMA f10 = f64, f112, f10 + } + { .mmf + (p17) LDFD f108 = [Y1], INCY + (p17) LDFD f120 = [Y2], INCY + (p18) FMA f11 = f67, f115, f11 + } + ;; + { .mmf + (p16) LDFPD f68, f71 = [X1], 2 * SIZE + (p18) FMA f12 = f70, f118, f12 + } + { .mmf + (p17) LDFD f111 = [Y1], INCY + (p17) LDFD f123 = [Y2], INCY + (p18) FMA f13 = f73, f121, f13 + } + ;; + { .mmf + (p16) LDFPD f74, f77 = [X1], 2 * SIZE + (p17) LDFD f114 = [Y1], INCY5 + (p18) FMA f14 = f76, f124, f14 + } + { .mfb + (p17) LDFD f126 = [Y2], INCY5 + (p18) FMA f15 = f79, f127, f15 + br.ctop.sptk.few .L22 + } + ;; + .align 32 + +.L25: + { .mmi + (p12) LDFPD f32, f33 = [X1], 2 * SIZE + mov YY = Y1 + tbit.z p0, p13 = N, 2 + } + { .mmb + (p12) LDFD f34 = [Y1], INCY + (p12) LDFD f42 = [Y2], INCY + (p7) br.cond.dptk .L999 + } + ;; + { .mmi + (p12) LDFPD f36, f37 = [X1], 2 * SIZE + (p12) shladd YY = INCY, 3, YY + tbit.z p0, p14 = N, 1 + } + { .mmi + (p12) LDFD f35 = [Y1], INCY + (p12) LDFD f43 = [Y2], INCY + tbit.z p0, p15 = N, 0 + } + ;; + { .mmi + (p12) LDFPD f40, f41 = [X1], 2 * SIZE + (p13) shladd YY = INCY, 2, YY + } + { .mmi + (p12) LDFD f38 = [Y1], INCY + (p12) LDFD f46 = [Y2], INCY + } + ;; + (p12) LDFPD f44, f45 = [X1], 2 * SIZE + (p12) LDFD f39 = [Y1], INCY5 + (p12) LDFD f47 = [Y2], INCY5 + ;; + (p13) LDFPD f48, f49 = [X1], 2 * SIZE + (p13) LDFD f50 = [Y1], INCY + (p14) LDFD f58 = [YY], INCY + ;; + (p13) LDFPD f52, f53 = [X1], 2 * SIZE + (p13) LDFD f51 = [Y1], INCY + (p14) LDFD f59 = [YY], INCY + ;; + (p14) LDFPD f56, f57 = [X1], 2 * SIZE + (p13) LDFD f54 = [Y1], INCY + (p15) LDFD f61 = [YY] + ;; + (p13) LDFD f55 = [Y1], INCY + (p15) LDFD f60 = [X1] + ;; + (p12) FMA f8 = f32, f34, f8 + (p12) FMA f9 = f33, f35, f9 + (p12) FMA f10 = f36, f38, f10 + (p12) FMA f11 = f37, f39, f11 + (p12) FMA f12 = f40, f42, f12 + (p12) FMA f13 = f41, f43, f13 + (p12) FMA f14 = f44, f46, f14 + (p12) FMA f15 = f45, f47, f15 + ;; + (p13) FMA f8 = f48, f50, f8 + (p13) FMA f9 = f49, f51, f9 + (p13) FMA f10 = f52, f54, f10 + (p13) FMA f11 = f53, f55, f11 + (p14) FMA f12 = f56, f58, f12 + (p14) FMA f13 = f57, f59, f13 + (p15) FMA f14 = f60, f61, f14 + br .L999 + ;; + .align 32 + +.L100: + { .mmi + shladd X2 = INCX, 2, X1 + } + { .mib + cmp.ne p6, p0 = SIZE, INCY + tbit.nz p15, p0 = Y1, BASE_SHIFT + (p6) br.cond.dptk .L200 + } + ;; + { .mfi + (p15) LDFD f32 = [X1], INCX + mov f12 = f0 + mov pr.rot= 0 + } + { .mfi + (p15) adds N = -1, N + mov f13 = f0 + shr YA = Y1, 3 + } + ;; + { .mfi + (p15) LDFD f80 = [Y1], INCY + mov f14 = f0 + shr I = N, 4 + } + { .mmi + and J = 15, N + and XA = 0x1f, XA + and YA = 0x1f, YA + } + ;; + { .mmi + shladd INCX5 = INCX, 2, INCX + shladd INCY5 = INCY, 2, INCY + sub XA = YA, XA + } + { .mmi + shladd INCX16 = INCX, 4, r0 + shladd INCY16 = INCY, 4, r0 + tbit.z p0, p12 = N, 3 + } + ;; + { .mmi + shladd X2 = INCX, 2, X1 + cmp.eq p7, p0 = r0, J + mov ar.ec= 3 + } + { .mmi + adds I = -1, I + cmp.ge p8, p0 = 8, XA + cmp.eq p16, p0 = r0, r0 + } + ;; + { .mbb + cmp.le p9, p0 = 28, XA + (p8) br.cond.dpnt .L120 + (p9) br.cond.dpnt .L120 + } + ;; + { .mmi + adds PREX = (PREFETCH_SIZE + 5) * SIZE, X1 + adds PREY = (PREFETCH_SIZE + 3) * SIZE, Y1 + mov ar.lc = I + } + { .mfb + cmp.eq p6 ,p0 = -1, I + FMA f15 = f32, f80, f0 + (p6) br.cond.dpnt .L115 + } + ;; + .align 32 + +/* INCY == 1 */ +.L112: + { .mmf + (p16) LDFPD f32, f35 = [Y1], 2 * SIZE + (p16) lfetch.nt1 [PREX], INCX16 + (p18) FMA f8 = f34, f82, f8 + } + { .mmf + (p16) LDFD f80 = [X1], INCX + (p16) LDFD f92 = [X2], INCX + (p18) FMA f9 = f37, f85, f9 + } + ;; + { 
.mmf + (p16) LDFPD f38, f41 = [Y1], 2 * SIZE + (p16) lfetch.nt1 [PREY], INCY16 + (p18) FMA f10 = f40, f88, f10 + } + { .mmf + (p16) LDFD f83 = [X1], INCX + (p16) LDFD f95 = [X2], INCX + (p18) FMA f11 = f43, f91, f11 + } + ;; + { .mmf + (p16) LDFPD f44, f47 = [Y1], 2 * SIZE + (p18) FMA f12 = f46, f94, f12 + } + { .mmf + (p16) LDFD f86 = [X1], INCX + (p16) LDFD f98 = [X2], INCX + (p18) FMA f13 = f49, f97, f13 + } + ;; + { .mmf + (p16) LDFPD f50, f53 = [Y1], 2 * SIZE + (p18) FMA f14 = f52, f100, f14 + } + { .mmf + (p16) LDFD f89 = [X1], INCX5 + (p16) LDFD f101 = [X2], INCX5 + (p18) FMA f15 = f55, f103, f15 + } + ;; + { .mmf + (p16) LDFPD f56, f59 = [Y1], 2 * SIZE + (p18) FMA f8 = f58, f106, f8 + } + { .mmf + (p16) LDFD f104 = [X1], INCX + (p16) LDFD f116 = [X2], INCX + (p18) FMA f9 = f61, f109, f9 + } + ;; + { .mmf + (p16) LDFPD f62, f65 = [Y1], 2 * SIZE + (p18) FMA f10 = f64, f112, f10 + } + { .mmf + (p16) LDFD f107 = [X1], INCX + (p16) LDFD f119 = [X2], INCX + (p18) FMA f11 = f67, f115, f11 + } + ;; + { .mmf + (p16) LDFPD f68, f71 = [Y1], 2 * SIZE + (p18) FMA f12 = f70, f118, f12 + } + { .mmf + (p16) LDFD f110 = [X1], INCX + (p16) LDFD f122 = [X2], INCX + (p18) FMA f13 = f73, f121, f13 + } + ;; + { .mmf + (p16) LDFPD f74, f77 = [Y1], 2 * SIZE + (p16) LDFD f113 = [X1], INCX5 + (p18) FMA f14 = f76, f124, f14 + } + { .mfb + (p16) LDFD f125 = [X2], INCX5 + (p18) FMA f15 = f79, f127, f15 + br.ctop.sptk.few .L112 + } + ;; + .align 32 + +.L115: + { .mmi + (p12) LDFPD f32, f33 = [Y1], 2 * SIZE + mov XX = X1 + tbit.z p0, p13 = N, 2 + } + { .mmb + (p12) LDFD f34 = [X1], INCX + (p12) LDFD f42 = [X2], INCX + (p7) br.cond.dptk .L999 + } + ;; + { .mmi + (p12) LDFPD f36, f37 = [Y1], 2 * SIZE + (p12) shladd XX = INCX, 3, XX + tbit.z p0, p14 = N, 1 + } + { .mmi + (p12) LDFD f35 = [X1], INCX + (p12) LDFD f43 = [X2], INCX + tbit.z p0, p15 = N, 0 + } + ;; + { .mmi + (p12) LDFPD f40, f41 = [Y1], 2 * SIZE + (p13) shladd XX = INCX, 2, XX + } + { .mmi + (p12) LDFD f38 = [X1], INCX + (p12) LDFD f46 = [X2], INCX + } + ;; + (p12) LDFPD f44, f45 = [Y1], 2 * SIZE + (p12) LDFD f39 = [X1], INCX5 + (p12) LDFD f47 = [X2], INCX5 + ;; + (p13) LDFPD f48, f49 = [Y1], 2 * SIZE + (p13) LDFD f50 = [X1], INCX + (p14) LDFD f58 = [XX], INCX + ;; + (p13) LDFPD f52, f53 = [Y1], 2 * SIZE + (p13) LDFD f51 = [X1], INCX + (p14) LDFD f59 = [XX], INCX + ;; + (p14) LDFPD f56, f57 = [Y1], 2 * SIZE + (p13) LDFD f54 = [X1], INCX + (p15) LDFD f61 = [XX] + ;; + (p13) LDFD f55 = [X1], INCX + (p15) LDFD f60 = [Y1] + ;; + (p12) FMA f8 = f32, f34, f8 + (p12) FMA f9 = f33, f35, f9 + (p12) FMA f10 = f36, f38, f10 + (p12) FMA f11 = f37, f39, f11 + (p12) FMA f12 = f40, f42, f12 + (p12) FMA f13 = f41, f43, f13 + (p12) FMA f14 = f44, f46, f14 + (p12) FMA f15 = f45, f47, f15 + ;; + (p13) FMA f8 = f48, f50, f8 + (p13) FMA f9 = f49, f51, f9 + (p13) FMA f10 = f52, f54, f10 + (p13) FMA f11 = f53, f55, f11 + (p14) FMA f12 = f56, f58, f12 + (p14) FMA f13 = f57, f59, f13 + (p15) FMA f14 = f60, f61, f14 + br .L999 + ;; + .align 32 + +.L120: + { .mmi + adds PREX = (PREFETCH_SIZE + 17) * SIZE, X1 + adds PREY = (PREFETCH_SIZE + 19) * SIZE, X1 + mov ar.lc = I + } + { .mfb + cmp.eq p6 ,p0 = -1, I + FMA f15 = f32, f80, f0 + (p6) br.cond.dpnt .L125 + } + ;; + .align 32 + +.L122: + { .mmf + (p16) LDFPD f32, f35 = [Y1], 2 * SIZE + (p16) lfetch.nt1 [PREX], INCX16 + (p18) FMA f8 = f34, f82, f8 + } + { .mmf + (p17) LDFD f81 = [X1], INCX + (p17) LDFD f93 = [X2], INCX + (p18) FMA f9 = f37, f85, f9 + } + ;; + { .mmf + (p16) LDFPD f38, f41 = [Y1], 2 * SIZE + (p16) lfetch.nt1 
[PREY], INCX16 + (p18) FMA f10 = f40, f88, f10 + } + { .mmf + (p17) LDFD f84 = [X1], INCX + (p17) LDFD f96 = [X2], INCX + (p18) FMA f11 = f43, f91, f11 + } + ;; + { .mmf + (p16) LDFPD f44, f47 = [Y1], 2 * SIZE + (p18) FMA f12 = f46, f94, f12 + } + { .mmf + (p17) LDFD f87 = [X1], INCX + (p17) LDFD f99 = [X2], INCX + (p18) FMA f13 = f49, f97, f13 + } + ;; + { .mmf + (p16) LDFPD f50, f53 = [Y1], 2 * SIZE + (p18) FMA f14 = f52, f100, f14 + } + { .mmf + (p17) LDFD f90 = [X1], INCX5 + (p17) LDFD f102 = [X2], INCX5 + (p18) FMA f15 = f55, f103, f15 + } + ;; + { .mmf + (p16) LDFPD f56, f59 = [Y1], 2 * SIZE + (p18) FMA f8 = f58, f106, f8 + } + { .mmf + (p17) LDFD f105 = [X1], INCX + (p17) LDFD f117 = [X2], INCX + (p18) FMA f9 = f61, f109, f9 + } + ;; + { .mmf + (p16) LDFPD f62, f65 = [Y1], 2 * SIZE + (p18) FMA f10 = f64, f112, f10 + } + { .mmf + (p17) LDFD f108 = [X1], INCX + (p17) LDFD f120 = [X2], INCX + (p18) FMA f11 = f67, f115, f11 + } + ;; + { .mmf + (p16) LDFPD f68, f71 = [Y1], 2 * SIZE + (p18) FMA f12 = f70, f118, f12 + } + { .mmf + (p17) LDFD f111 = [X1], INCX + (p17) LDFD f123 = [X2], INCX + (p18) FMA f13 = f73, f121, f13 + } + ;; + { .mmf + (p16) LDFPD f74, f77 = [Y1], 2 * SIZE + (p17) LDFD f114 = [X1], INCX5 + (p18) FMA f14 = f76, f124, f14 + } + { .mfb + (p17) LDFD f126 = [X2], INCX5 + (p18) FMA f15 = f79, f127, f15 + br.ctop.sptk.few .L122 + } + ;; + .align 32 + +.L125: + { .mmi + (p12) LDFPD f32, f33 = [Y1], 2 * SIZE + mov XX = X1 + tbit.z p0, p13 = N, 2 + } + { .mmb + (p12) LDFD f34 = [X1], INCX + (p12) LDFD f42 = [X2], INCX + (p7) br.cond.dptk .L999 + } + ;; + { .mmi + (p12) LDFPD f36, f37 = [Y1], 2 * SIZE + (p12) shladd XX = INCX, 3, XX + tbit.z p0, p14 = N, 1 + } + { .mmi + (p12) LDFD f35 = [X1], INCX + (p12) LDFD f43 = [X2], INCX + tbit.z p0, p15 = N, 0 + } + ;; + { .mmi + (p12) LDFPD f40, f41 = [Y1], 2 * SIZE + (p13) shladd XX = INCX, 2, XX + } + { .mmi + (p12) LDFD f38 = [X1], INCX + (p12) LDFD f46 = [X2], INCX + } + ;; + (p12) LDFPD f44, f45 = [Y1], 2 * SIZE + (p12) LDFD f39 = [X1], INCX5 + (p12) LDFD f47 = [X2], INCX5 + ;; + (p13) LDFPD f48, f49 = [Y1], 2 * SIZE + (p13) LDFD f50 = [X1], INCX + (p14) LDFD f58 = [XX], INCX + ;; + (p13) LDFPD f52, f53 = [Y1], 2 * SIZE + (p13) LDFD f51 = [X1], INCX + (p14) LDFD f59 = [XX], INCX + ;; + (p14) LDFPD f56, f57 = [Y1], 2 * SIZE + (p13) LDFD f54 = [X1], INCX + (p15) LDFD f61 = [XX] + ;; + (p13) LDFD f55 = [X1], INCX + (p15) LDFD f60 = [Y1] + ;; + (p12) FMA f8 = f32, f34, f8 + (p12) FMA f9 = f33, f35, f9 + (p12) FMA f10 = f36, f38, f10 + (p12) FMA f11 = f37, f39, f11 + (p12) FMA f12 = f40, f42, f12 + (p12) FMA f13 = f41, f43, f13 + (p12) FMA f14 = f44, f46, f14 + (p12) FMA f15 = f45, f47, f15 + ;; + (p13) FMA f8 = f48, f50, f8 + (p13) FMA f9 = f49, f51, f9 + (p13) FMA f10 = f52, f54, f10 + (p13) FMA f11 = f53, f55, f11 + (p14) FMA f12 = f56, f58, f12 + (p14) FMA f13 = f57, f59, f13 + (p15) FMA f14 = f60, f61, f14 + br .L999 + ;; + .align 32 + +.L200: + { .mfi + shladd INCX5 = INCX, 2, INCX + mov f12 = f0 + mov pr.rot= 0 + } + { .mfi + and J = 15, N + mov f13 = f0 + shr I = N, 4 + } + ;; + { .mmf + cmp.eq p16, p0 = r0, r0 + shladd INCY5 = INCY, 2, INCY + mov f14 = f0 + } + { .mmi + shladd INCX16 = INCX, 4, r0 + shladd INCY16 = INCY, 4, r0 + tbit.z p0, p12 = N, 3 + } + ;; + { .mmi + cmp.eq p7, p0 = r0, J + adds I = -1, I + mov ar.ec= 3 + } + { .mmi + shladd Y2 = INCY, 2, Y1 + mov XX = X1 + mov YY = Y1 + } + ;; + { .mmi + adds PREX = (PREFETCH_SIZE + 5) * SIZE, X1 + adds PREY = (PREFETCH_SIZE + 3) * SIZE, Y1 + mov ar.lc = I + } + { .mfb + 
cmp.eq p6 ,p0 = -1, I + mov f15 = f0 + (p6) br.cond.dpnt .L215 + } + ;; + .align 32 + +/* INCY == 1 */ +.L212: + { .mmf + (p16) lfetch.nt1 [PREX], INCX16 + (p16) lfetch.nt1 [PREY], INCY16 + (p18) FMA f8 = f34, f82, f8 + } + { .mmf + (p16) LDFD f32 = [Y1], INCY + (p16) LDFD f44 = [Y2], INCY + (p18) FMA f9 = f37, f85, f9 + } + ;; + { .mmf + (p16) LDFD f80 = [X1], INCX + (p16) LDFD f92 = [X2], INCX + (p18) FMA f10 = f40, f88, f10 + } + { .mmf + (p16) LDFD f35 = [Y1], INCY + (p16) LDFD f47 = [Y2], INCY + (p18) FMA f11 = f43, f91, f11 + } + ;; + { .mmf + (p16) LDFD f83 = [X1], INCX + (p16) LDFD f95 = [X2], INCX + (p18) FMA f12 = f46, f94, f12 + } + { .mmf + (p16) LDFD f38 = [Y1], INCY + (p16) LDFD f50 = [Y2], INCY + (p18) FMA f13 = f49, f97, f13 + } + ;; + { .mmf + (p16) LDFD f86 = [X1], INCX + (p16) LDFD f98 = [X2], INCX + (p18) FMA f14 = f52, f100, f14 + } + { .mmf + (p16) LDFD f41 = [Y1], INCY5 + (p16) LDFD f53 = [Y2], INCY5 + (p18) FMA f15 = f55, f103, f15 + } + ;; + { .mmf + (p16) LDFD f89 = [X1], INCX5 + (p16) LDFD f101 = [X2], INCX5 + (p18) FMA f8 = f58, f106, f8 + } + { .mmf + (p16) LDFD f56 = [Y1], INCY + (p16) LDFD f68 = [Y2], INCY + (p18) FMA f9 = f61, f109, f9 + } + ;; + { .mmf + (p16) LDFD f104 = [X1], INCX + (p16) LDFD f116 = [X2], INCX + (p18) FMA f10 = f64, f112, f10 + } + { .mmf + (p16) LDFD f59 = [Y1], INCY + (p16) LDFD f71 = [Y2], INCY + (p18) FMA f11 = f67, f115, f11 + } + ;; + { .mmf + (p16) LDFD f107 = [X1], INCX + (p16) LDFD f119 = [X2], INCX + (p18) FMA f12 = f70, f118, f12 + } + { .mmf + (p16) LDFD f62 = [Y1], INCY + (p16) LDFD f74 = [Y2], INCY + (p18) FMA f13 = f73, f121, f13 + } + ;; + { .mmf + (p16) LDFD f110 = [X1], INCX + (p16) LDFD f122 = [X2], INCX + (p18) FMA f14 = f76, f124, f14 + } + { .mmf + (p16) LDFD f65 = [Y1], INCY5 + (p16) LDFD f77 = [Y2], INCY5 + (p18) FMA f15 = f79, f127, f15 + } + ;; + { .mmi + (p16) LDFD f113 = [X1], INCX5 + (p16) LDFD f125 = [X2], INCX5 + } + { .mmb + (p16) add XX = INCX16, XX + (p16) add YY = INCY16, YY + br.ctop.sptk.few .L212 + } + ;; + .align 32 + +.L215: + { .mmi + (p12) LDFD f34 = [X1], INCX + (p12) LDFD f42 = [X2], INCX + tbit.z p0, p13 = N, 2 + } + { .mmb + (p12) LDFD f32 = [Y1], INCY + (p12) LDFD f40 = [Y2], INCY + (p7) br.cond.dptk .L999 + } + ;; + { .mmi + (p12) LDFD f35 = [X1], INCX + (p12) LDFD f43 = [X2], INCX + tbit.z p0, p14 = N, 1 + } + { .mmi + (p12) LDFD f33 = [Y1], INCY + (p12) LDFD f41 = [Y2], INCY + tbit.z p0, p15 = N, 0 + } + ;; + { .mmi + (p12) LDFD f38 = [X1], INCX + (p12) LDFD f46 = [X2], INCX + (p12) shladd XX = INCX, 3, XX + } + { .mmi + (p12) LDFD f36 = [Y1], INCY + (p12) LDFD f44 = [Y2], INCY + (p12) shladd YY = INCY, 3, YY + } + ;; + { .mmi + (p12) LDFD f39 = [X1], INCX5 + (p12) LDFD f47 = [X2], INCX5 + (p13) shladd XX = INCX, 2, XX + } + { .mmi + (p12) LDFD f37 = [Y1], INCY5 + (p12) LDFD f45 = [Y2], INCY5 + (p13) shladd YY = INCY, 2, YY + } + ;; + (p13) LDFD f50 = [X1], INCX + (p13) LDFD f48 = [Y1], INCY + (p14) LDFD f58 = [XX], INCX + (p14) LDFD f56 = [YY], INCY + ;; + (p13) LDFD f51 = [X1], INCX + (p13) LDFD f49 = [Y1], INCY + (p14) LDFD f59 = [XX], INCX + (p14) LDFD f57 = [YY], INCY + ;; + (p13) LDFD f54 = [X1], INCX + (p13) LDFD f52 = [Y1], INCY + (p15) LDFD f61 = [XX] + (p15) LDFD f60 = [YY] + ;; + (p13) LDFD f55 = [X1] + (p13) LDFD f53 = [Y1] + ;; + (p12) FMA f8 = f32, f34, f8 + (p12) FMA f9 = f33, f35, f9 + (p12) FMA f10 = f36, f38, f10 + (p12) FMA f11 = f37, f39, f11 + (p12) FMA f12 = f40, f42, f12 + (p12) FMA f13 = f41, f43, f13 + (p12) FMA f14 = f44, f46, f14 + (p12) FMA f15 = f45, f47, f15 
+ ;; + (p13) FMA f8 = f48, f50, f8 + (p13) FMA f9 = f49, f51, f9 + (p13) FMA f10 = f52, f54, f10 + (p13) FMA f11 = f53, f55, f11 + (p14) FMA f12 = f56, f58, f12 + (p14) FMA f13 = f57, f59, f13 + (p15) FMA f14 = f60, f61, f14 + br .L999 + ;; + .align 32 + +.L999: + FADD f8 = f8, f9 + FADD f10 = f10, f11 + FADD f12 = f12, f13 + FADD f14 = f14, f15 + ;; + FADD f8 = f8, f10 + FADD f12 = f12, f14 + mov ar.lc = ARLC + ;; + FADD f8 = f8, f12 + mov pr = PR, -65474 + br.ret.sptk.many b0 + EPILOGUE + diff --git a/kernel/ia64/sgemv_n.S b/kernel/ia64/sgemv_n.S new file mode 100644 index 0000000000..f5949e6085 --- /dev/null +++ b/kernel/ia64/sgemv_n.S @@ -0,0 +1,3241 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
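The new kernel/ia64/sgemv_n.S below is the non-transposed single-precision matrix-vector product, y := alpha*A*x + y for a column-major M x N matrix A with leading dimension LDA. It pre-scales the x entries by alpha (the FMPY f8..f15 = ALPHA, ... group), then walks eight columns of A per outer pass (AO1..AO8, with 4/2/1-column passes for the remainder of N), accumulating either directly into y when it has unit stride and suitable alignment, or into the zero-initialized BUFFER (the .L02 loop) when y is strided or misaligned, with the accumulated result transferred back into y further down the file; the .L100 branch repeats the same structure for the case where LDA is an odd number of elements, which changes the pairing of the LDFPD loads. A hedged plain-C sketch of the computation, assuming positive increments (sgemv_n_ref is an illustrative name, not part of the imported source):

#include <stdio.h>

/* y := alpha * A * x + y, A column-major m x n with leading dimension lda.
 * The kernel below also pre-scales x by alpha and blocks over eight columns
 * at a time; this sketch folds that into the single inner-loop statement. */
static void sgemv_n_ref(long m, long n, float alpha,
                        const float *a, long lda,
                        const float *x, long incx,
                        float *y, long incy)
{
    for (long j = 0; j < n; j++) {
        float t = alpha * x[j * incx];          /* mirrors FMPY f8..f15 = ALPHA, x[j] */
        for (long i = 0; i < m; i++)
            y[i * incy] += t * a[i + j * lda];  /* mirrors the FMA chains in the kernel */
    }
}

int main(void)
{
    /* A = [1 3; 2 4] stored column-major, x = [1, 1], y starts at 0 */
    float a[4] = {1, 2, 3, 4}, x[2] = {1, 1}, y[2] = {0, 0};
    sgemv_n_ref(2, 2, 1.0f, a, 2, x, 1, y, 1);
    printf("%f %f\n", y[0], y[1]);              /* 4.000000 6.000000 */
    return 0;
}

Blocking over eight columns lets the kernel reuse each loaded stretch of y (or the buffer) for eight FMAs before storing it back, which is what the rotating f16-f23 store registers in the .L12 loop carry between iterations.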
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define SP r12 + +#define M r32 +#define N r33 +#define A r36 +#define LDA r37 +#define X r38 +#define INCX r39 +#define Y r34 +#define INCY r35 +#define BUFFER r11 + +#define I r14 +#define J r15 +#define AO1 r16 +#define AO2 r17 +#define AO3 r18 +#define AO4 r19 +#define AO5 r20 +#define AO6 r21 +#define AO7 r22 +#define AO8 r23 +#define YLD1 r24 +#define YST1 r25 +#define YST2 r27 +#define MM r28 +#define YY r9 + +#define RPRE1 loc0 +#define RPRE2 loc1 +#define RPRE3 loc2 +#define RPRE4 loc3 +#define RPRE5 loc4 +#define RPRE6 loc5 +#define RPRE7 loc6 +#define RPRE8 loc7 + +#define AO11 loc8 +#define AO21 loc9 +#define AO31 loc10 +#define AO41 loc11 +#define AO51 loc12 +#define AO61 loc13 +#define AO71 loc14 +#define AO81 loc15 + +#define PREB r8 + +#define ARLC r29 +#define PR r30 +#define ARPFS r31 + +#ifdef DOUBLE +#define RPREFETCH (16 * 3 + 8) +#else +#define RPREFETCH (16 * 3 + 16) +#endif +#define PREFETCH lfetch.nt1 + +#define ALPHA f6 + + PROLOGUE + .prologue + PROFCODE + { .mmi + .save ar.pfs, ARPFS + alloc ARPFS = ar.pfs, 8, 16, 8, 0 + mov ARLC = ar.lc + } + ;; + mov PR = pr + adds r14 = 16, SP + adds r15 = 24, SP + adds r16 = 32, SP + ;; + adds r8 = -8 * 16, SP + adds r9 = -7 * 16, SP + adds SP = -8 * 16, SP + ;; + stf.spill [r8] = f16, 32 + stf.spill [r9] = f17, 32 + ;; + stf.spill [r8] = f18, 32 + stf.spill [r9] = f19, 32 + ;; + stf.spill [r8] = f20, 32 + stf.spill [r9] = f21, 32 + ;; + stf.spill [r8] = f22 + stf.spill [r9] = f23 + .body + ;; + + ld8 Y = [r14] + ld8 INCY = [r15] + ld8 BUFFER = [r16] + + mov ALPHA = f8 + cmp.ge p7, p0 = 0, M + cmp.ge p6, p0 = 0, N + ;; + shladd INCX = INCX, BASE_SHIFT, r0 + shladd LDA = LDA, BASE_SHIFT, r0 + shladd INCY = INCY, BASE_SHIFT, r0 + ;; + tbit.nz p8, p0 = A, BASE_SHIFT + tbit.nz p9, p0 = LDA, BASE_SHIFT + mov MM = M + ;; + (p8) adds MM = -1, M + ;; + (p7) br.cond.dpnt .L999 + (p6) br.cond.dpnt .L999 + ;; + sub I = A, Y + cmp.eq p10, p0 = SIZE, INCY + mov YY = Y + ;; + (p10) tbit.z.unc p10, p0 = I, BASE_SHIFT + ;; + (p10) br.cond.dptk .L10 + ;; + shr J = M, 3 + mov YY = BUFFER + ;; + (p8) adds YY = SIZE, BUFFER + ;; + mov ar.lc = J + mov YST1 = YY + adds YST2 = 4 * SIZE, YY + ;; +.L02: + STFD [YST1] = f0, 1 * SIZE + STFD [YST2] = f0, 1 * SIZE + ;; + STFD [YST1] = f0, 1 * SIZE + STFD [YST2] = f0, 1 * SIZE + ;; + STFD [YST1] = f0, 1 * SIZE + STFD [YST2] = f0, 1 * SIZE + ;; + STFD [YST1] = f0, 5 * SIZE + STFD [YST2] = f0, 5 * SIZE + br.cloop.sptk.few .L02 + ;; + +.L10: + (p9) br.cond.dptk .L100 + + shr J = N, 3 + ;; + cmp.eq p6, p0 = r0, J + (p6) br.cond.dpnt .L20 + ;; + .align 16 + +.L11: + mov YLD1 = YY + mov YST1 = YY + ;; + LDFD f8 = [X], INCX + ;; + LDFD f9 = [X], INCX + ;; + LDFD f10 = [X], INCX + ;; + LDFD f11 = [X], INCX + ;; + LDFD f12 = [X], INCX + ;; + LDFD f13 = [X], INCX + ;; + LDFD f14 = [X], INCX + ;; + LDFD f15 = [X], INCX + ;; + FMPY f8 = ALPHA, f8 + FMPY f9 = ALPHA, f9 + FMPY f10 = ALPHA, f10 + FMPY f11 = ALPHA, f11 + FMPY f12 = ALPHA, f12 + FMPY f13 = ALPHA, f13 + FMPY f14 = ALPHA, f14 + FMPY f15 = ALPHA, f15 + ;; + mov AO1 = A + add AO2 = LDA, A + ;; + shladd AO3 = LDA, 1, A + shladd AO4 = LDA, 1, AO2 + ;; + shladd AO5 = LDA, 1, AO3 + shladd AO6 = LDA, 1, AO4 + ;; + shladd AO7 = LDA, 1, AO5 + shladd AO8 = LDA, 1, AO6 + shladd A = LDA, 3, A + ;; + ;; + adds PREB = RPREFETCH * SIZE, YLD1 + adds RPRE1 = RPREFETCH * SIZE, AO1 + adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 + adds RPRE3 = RPREFETCH * SIZE, 
AO3 + adds RPRE4 = (RPREFETCH + 8) * SIZE, AO4 + adds RPRE5 = RPREFETCH * SIZE, AO5 + adds RPRE6 = (RPREFETCH + 8) * SIZE, AO6 + adds RPRE7 = RPREFETCH * SIZE, AO7 + adds RPRE8 = (RPREFETCH + 8) * SIZE, AO8 + + (p8) LDFD f80 = [AO1], 1 * SIZE + (p8) LDFD f81 = [AO2], 1 * SIZE + (p8) LDFD f82 = [AO3], 1 * SIZE + (p8) LDFD f83 = [AO4], 1 * SIZE + (p8) LDFD f84 = [AO5], 1 * SIZE + (p8) LDFD f85 = [AO6], 1 * SIZE + (p8) LDFD f86 = [AO7], 1 * SIZE + (p8) LDFD f87 = [AO8], 1 * SIZE + (p8) LDFD f106 = [YLD1], 1 * SIZE + ;; + (p8) FMPY f32 = f8, f80 + (p8) FMPY f33 = f9, f81 + (p8) FMPY f34 = f10, f82 + (p8) FMA f35 = f11, f83, f106 + ;; + (p8) FMA f32 = f12, f84, f32 + (p8) FMA f33 = f13, f85, f33 + (p8) FMA f34 = f14, f86, f34 + (p8) FMA f35 = f15, f87, f35 + ;; + (p8) FADD f32 = f32, f33 + (p8) FADD f34 = f34, f35 + ;; + (p8) FADD f32 = f32, f34 + ;; + (p8) STFD [YST1] = f32, 1 * SIZE + + shr I = MM, 3 + mov pr.rot= 0 + ;; + cmp.eq p6, p0 = 0, I + cmp.eq p16, p0 = r0, r0 + ;; + adds I = -1, I + tbit.nz p13, p0 = MM, 2 + ;; + mov ar.lc = I + mov ar.ec= 2 + (p6) br.cond.dpnt .L15 + ;; + .align 16 + +.L12: + { .mfi + (p17) LDFPD f95, f96 = [AO8], 2 * SIZE + (p17) FMA f101 = f8, f33, f101 + (p16) tbit.nz.unc p14, p15 = I, 0 + } + { .mfi + (p17) FMA f104 = f8, f34, f104 + } + ;; + { .mfi + (p16) LDFPD f32, f33 = [AO1], 2 * SIZE + (p17) FMA f107 = f8, f35, f107 + } + { .mfi + (p14) PREFETCH [RPRE1], 16 * SIZE + (p17) FMA f110 = f8, f36, f110 + } + ;; + { .mfi + (p16) LDFPD f34, f35 = [AO1], 2 * SIZE + (p17) FMA f113 = f8, f37, f113 + } + { .mfi + (p16) LDFPD f100, f103 = [YLD1], 2 * SIZE + (p17) FMA f116 = f8, f38, f116 + } + ;; + { .mfi + (p16) LDFPD f36, f37 = [AO1], 2 * SIZE + (p17) FMA f119 = f8, f39, f119 + } + { .mfi + (p16) LDFPD f106, f109 = [YLD1], 2 * SIZE + (p17) FMA f122 = f8, f40, f122 + } + ;; + { .mfi + (p16) LDFPD f38, f39 = [AO1], 2 * SIZE + (p17) FMA f101 = f9, f41, f101 + } + { .mfi + (p16) LDFPD f112, f115 = [YLD1], 2 * SIZE + (p17) FMA f104 = f9, f42, f104 + } + ;; + { .mfi + (p16) LDFPD f40, f41 = [AO2], 2 * SIZE + (p17) FMA f107 = f9, f43, f107 + } + { .mfi + (p15) PREFETCH [RPRE2], 16 * SIZE + (p17) FMA f110 = f9, f44, f110 + } + ;; + { .mfi + (p16) LDFPD f42, f43 = [AO2], 2 * SIZE + (p17) FMA f113 = f9, f45, f113 + } + { .mfi + (p16) LDFPD f118, f121 = [YLD1], 2 * SIZE + (p17) FMA f116 = f9, f46, f116 + } + ;; + { .mfi + (p16) LDFPD f44, f45 = [AO2], 2 * SIZE + (p17) FMA f119 = f9, f47, f119 + } + { .mfi + (p18) STFD [YST1] = f16, 1 * SIZE + (p17) FMA f122 = f9, f48, f122 + } + ;; + { .mfi + (p16) LDFPD f46, f47 = [AO2], 2 * SIZE + (p17) FMA f101 = f10, f49, f101 + } + { .mfi + (p18) STFD [YST1] = f17, 1 * SIZE + (p17) FMA f104 = f10, f50, f104 + } + ;; + { .mfi + (p16) LDFPD f48, f49 = [AO3], 2 * SIZE + (p17) FMA f107 = f10, f51, f107 + } + { .mfi + (p14) PREFETCH [RPRE3], 16 * SIZE + (p17) FMA f110 = f10, f52, f110 + } + ;; + { .mfi + (p16) LDFPD f50, f51 = [AO3], 2 * SIZE + (p17) FMA f113 = f10, f53, f113 + } + { .mfi + (p17) FMA f116 = f10, f54, f116 + } + ;; + { .mfi + (p16) LDFPD f52, f53 = [AO3], 2 * SIZE + (p17) FMA f119 = f10, f55, f119 + } + { .mfi + (p18) STFD [YST1] = f18, 1 * SIZE + (p17) FMA f122 = f10, f56, f122 + } + ;; + { .mfi + (p16) LDFPD f54, f55 = [AO3], 2 * SIZE + (p17) FMA f101 = f11, f57, f101 + } + { .mfi + (p18) STFD [YST1] = f19, 1 * SIZE + (p17) FMA f104 = f11, f58, f104 + } + ;; + { .mfi + (p16) LDFPD f56, f57 = [AO4], 2 * SIZE + (p17) FMA f107 = f11, f59, f107 + } + { .mfi + (p15) PREFETCH [RPRE4], 16 * SIZE + (p17) FMA f110 = f11, f60, f110 + } 
+ ;; + { .mfi + (p16) LDFPD f58, f59 = [AO4], 2 * SIZE + (p17) FMA f113 = f11, f61, f113 + } + { .mfi + (p17) FMA f116 = f11, f62, f116 + } + ;; + { .mfi + (p16) LDFPD f60, f61 = [AO4], 2 * SIZE + (p17) FMA f119 = f11, f63, f119 + } + { .mfi + (p17) FMA f122 = f11, f64, f122 + } + ;; + { .mfi + (p16) LDFPD f62, f63 = [AO4], 2 * SIZE + (p17) FMA f101 = f12, f65, f101 + } + { .mfi + (p18) STFD [YST1] = f20, 1 * SIZE + (p17) FMA f104 = f12, f66, f104 + } + ;; + { .mfi + (p16) LDFPD f64, f65 = [AO5], 2 * SIZE + (p17) FMA f107 = f12, f67, f107 + } + { .mfi + (p18) STFD [YST1] = f21, 1 * SIZE + (p17) FMA f110 = f12, f68, f110 + } + ;; + { .mfi + (p16) LDFPD f66, f67 = [AO5], 2 * SIZE + (p17) FMA f113 = f12, f69, f113 + } + { .mfi + (p14) PREFETCH [RPRE5], 16 * SIZE + (p17) FMA f116 = f12, f70, f116 + } + ;; + { .mfi + (p16) LDFPD f68, f69 = [AO5], 2 * SIZE + (p17) FMA f119 = f12, f71, f119 + } + { .mfi + (p18) STFD [YST1] = f22, 1 * SIZE + (p17) FMA f122 = f12, f72, f122 + } + ;; + { .mfi + (p16) LDFPD f70, f71 = [AO5], 2 * SIZE + (p17) FMA f101 = f13, f73, f101 + } + { .mfi + (p18) STFD [YST1] = f23, 1 * SIZE + (p17) FMA f104 = f13, f74, f104 + } + ;; + { .mfi + (p16) LDFPD f72, f73 = [AO6], 2 * SIZE + (p17) FMA f107 = f13, f75, f107 + } + { .mfi + (p15) PREFETCH [RPRE6], 16 * SIZE + (p17) FMA f110 = f13, f76, f110 + } + ;; + { .mfi + (p16) LDFPD f74, f75 = [AO6], 2 * SIZE + (p17) FMA f113 = f13, f77, f113 + } + { .mfi + (p17) FMA f116 = f13, f78, f116 + } + ;; + { .mfi + (p16) LDFPD f76, f77 = [AO6], 2 * SIZE + (p17) FMA f119 = f13, f79, f119 + } + { .mfi + (p17) FMA f122 = f13, f80, f122 + } + ;; + { .mfi + (p16) LDFPD f78, f79 = [AO6], 2 * SIZE + (p17) FMA f101 = f14, f81, f101 + } + { .mfi + (p17) FMA f104 = f14, f82, f104 + } + ;; + { .mfi + (p16) LDFPD f80, f81 = [AO7], 2 * SIZE + (p17) FMA f107 = f14, f83, f107 + } + { .mfi + (p14) PREFETCH [RPRE7], 16 * SIZE + (p17) FMA f110 = f14, f84, f110 + } + ;; + { .mfi + (p16) LDFPD f82, f83 = [AO7], 2 * SIZE + (p17) FMA f113 = f14, f85, f113 + } + { .mfi + (p17) FMA f116 = f14, f86, f116 + } + ;; + { .mfi + (p16) LDFPD f84, f85 = [AO7], 2 * SIZE + (p17) FMA f119 = f14, f87, f119 + } + { .mfi + (p17) FMA f122 = f14, f88, f122 + } + ;; + { .mfi + (p16) LDFPD f86, f87 = [AO7], 2 * SIZE + (p17) FMA f16 = f15, f89, f101 + } + { .mfi + (p17) FMA f17 = f15, f90, f104 + } + ;; + { .mfi + (p16) LDFPD f88, f89 = [AO8], 2 * SIZE + (p17) FMA f18 = f15, f91, f107 + } + { .mfi + (p15) PREFETCH [RPRE8], 16 * SIZE + (p17) FMA f19 = f15, f92, f110 + } + ;; + { .mfi + (p16) LDFPD f90, f91 = [AO8], 2 * SIZE + (p17) FMA f20 = f15, f93, f113 + } + { .mfi + (p14) lfetch.excl.nt2 [PREB], 16 * SIZE + (p17) FMA f21 = f15, f94, f116 + } + ;; + { .mfi + (p16) LDFPD f92, f93 = [AO8], 2 * SIZE + (p17) FMA f22 = f15, f95, f119 + } + { .mfb + (p16) adds I = -1, I + (p17) FMA f23 = f15, f96, f122 + br.ctop.sptk.few .L12 + } + ;; + .align 16 + +.L15: + { .mmi + (p13) LDFPD f32, f33 = [AO1], 2 * SIZE + (p13) LDFPD f100, f101 = [YLD1], 2 * SIZE + tbit.nz p14, p0 = MM, 1 + } + { .mmi + (p18) STFD [YST1] = f16, 1 * SIZE + cmp.lt p6, p0 = 1, J + adds J = -1, J + } + ;; + { .mmi + (p13) LDFPD f48, f49 = [AO1], 2 * SIZE + (p13) LDFPD f102, f103 = [YLD1], 2 * SIZE + tbit.nz p15, p0 = MM, 0 + } + { .mmi + (p18) STFD [YST1] = f17, 1 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mmi + (p14) LDFPD f64, f65 = [AO1], 2 * SIZE + (p14) LDFPD f104, f105 = [YLD1], 2 * SIZE + nop __LINE__ + } + { .mmi + (p18) STFD [YST1] = f18, 1 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mmi + (p13) 
LDFPD f34, f35 = [AO2], 2 * SIZE + (p13) LDFPD f36, f37 = [AO3], 2 * SIZE + nop __LINE__ + } + { .mmi + (p18) STFD [YST1] = f19, 1 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mmi + (p15) LDFD f80 = [AO1] + (p15) LDFD f106 = [YLD1], 1 * SIZE + nop __LINE__ + } + { .mmi + (p18) STFD [YST1] = f20, 1 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mmi + (p13) LDFPD f50, f51 = [AO2], 2 * SIZE + (p13) LDFPD f52, f53 = [AO3], 2 * SIZE + nop __LINE__ + } + { .mmi + (p18) STFD [YST1] = f21, 1 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mmi + (p14) LDFPD f66, f67 = [AO2], 2 * SIZE + (p14) LDFPD f68, f69 = [AO3], 2 * SIZE + nop __LINE__ + } + { .mmi + (p18) STFD [YST1] = f22, 1 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mmi + (p15) LDFD f81 = [AO2] + (p15) LDFD f82 = [AO3] + nop __LINE__ + } + { .mmi + (p18) STFD [YST1] = f23, 1 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mfi + (p13) LDFPD f38, f39 = [AO4], 2 * SIZE + (p13) FMA f100 = f8, f32, f100 + nop __LINE__ + } + { .mfi + (p13) LDFPD f40, f41 = [AO5], 2 * SIZE + (p13) FMA f101 = f8, f33, f101 + nop __LINE__ + } + ;; + { .mfi + (p13) LDFPD f54, f55 = [AO4], 2 * SIZE + (p13) FMA f102 = f8, f48, f102 + nop __LINE__ + } + { .mfi + (p13) LDFPD f56, f57 = [AO5], 2 * SIZE + (p13) FMA f103 = f8, f49, f103 + nop __LINE__ + } + ;; + { .mfi + (p14) LDFPD f70, f71 = [AO4], 2 * SIZE + (p14) FMA f104 = f8, f64, f104 + nop __LINE__ + } + { .mfi + (p14) LDFPD f72, f73 = [AO5], 2 * SIZE + (p14) FMA f105 = f8, f65, f105 + nop __LINE__ + } + ;; + { .mfi + (p15) LDFD f83 = [AO4] + (p15) FMA f106 = f8, f80, f106 + nop __LINE__ + } + { .mfi + (p15) LDFD f84 = [AO5] + nop __LINE__ + nop __LINE__ + } + ;; + { .mfi + (p13) LDFPD f42, f43 = [AO6], 2 * SIZE + (p13) FMA f100 = f9, f34, f100 + nop __LINE__ + } + { .mfi + (p13) LDFPD f44, f45 = [AO7], 2 * SIZE + (p13) FMA f101 = f9, f35, f101 + nop __LINE__ + } + ;; + { .mfi + (p13) LDFPD f58, f59 = [AO6], 2 * SIZE + (p13) FMA f102 = f9, f50, f102 + nop __LINE__ + } + { .mfi + (p13) LDFPD f60, f61 = [AO7], 2 * SIZE + (p13) FMA f103 = f9, f51, f103 + nop __LINE__ + } + ;; + { .mfi + (p14) LDFPD f74, f75 = [AO6], 2 * SIZE + (p14) FMA f104 = f9, f66, f104 + nop __LINE__ + } + { .mfi + (p14) LDFPD f76, f77 = [AO7], 2 * SIZE + (p14) FMA f105 = f9, f67, f105 + nop __LINE__ + } + ;; + { .mfi + (p15) LDFD f85 = [AO6] + (p15) FMA f106 = f9, f81, f106 + nop __LINE__ + } + { .mfi + (p15) LDFD f86 = [AO7] + nop __LINE__ + nop __LINE__ + } + ;; + { .mfi + (p13) LDFPD f46, f47 = [AO8], 2 * SIZE + (p13) FMA f100 = f10, f36, f100 + nop __LINE__ + } + { .mfi + (p13) FMA f101 = f10, f37, f101 + nop __LINE__ + } + ;; + { .mfi + (p13) LDFPD f62, f63 = [AO8], 2 * SIZE + (p13) FMA f102 = f10, f52, f102 + nop __LINE__ + } + { .mfi + (p13) FMA f103 = f10, f53, f103 + nop __LINE__ + } + ;; + { .mfi + (p14) LDFPD f78, f79 = [AO8], 2 * SIZE + (p14) FMA f104 = f10, f68, f104 + nop __LINE__ + } + { .mfi + (p14) FMA f105 = f10, f69, f105 + nop __LINE__ + } + ;; + { .mfi + (p15) LDFD f87 = [AO8] + (p15) FMA f106 = f10, f82, f106 + nop __LINE__ + } + ;; + (p13) FMA f100 = f11, f38, f100 + (p13) FMA f101 = f11, f39, f101 + (p13) FMA f102 = f11, f54, f102 + (p13) FMA f103 = f11, f55, f103 + (p14) FMA f104 = f11, f70, f104 + (p14) FMA f105 = f11, f71, f105 + (p15) FMA f106 = f11, f83, f106 + ;; + (p13) FMA f100 = f12, f40, f100 + (p13) FMA f101 = f12, f41, f101 + (p13) FMA f102 = f12, f56, f102 + (p13) FMA f103 = f12, f57, f103 + (p14) FMA f104 = f12, f72, f104 + (p14) FMA f105 = f12, f73, f105 + (p15) FMA f106 = f12, f84, 
f106 + ;; + (p13) FMA f100 = f13, f42, f100 + (p13) FMA f101 = f13, f43, f101 + (p13) FMA f102 = f13, f58, f102 + (p13) FMA f103 = f13, f59, f103 + (p14) FMA f104 = f13, f74, f104 + (p14) FMA f105 = f13, f75, f105 + (p15) FMA f106 = f13, f85, f106 + ;; + (p13) FMA f100 = f14, f44, f100 + (p13) FMA f101 = f14, f45, f101 + (p13) FMA f102 = f14, f60, f102 + (p13) FMA f103 = f14, f61, f103 + (p14) FMA f104 = f14, f76, f104 + (p14) FMA f105 = f14, f77, f105 + (p15) FMA f106 = f14, f86, f106 + ;; + (p13) FMA f100 = f15, f46, f100 + (p13) FMA f101 = f15, f47, f101 + (p13) FMA f102 = f15, f62, f102 + (p13) FMA f103 = f15, f63, f103 + (p14) FMA f104 = f15, f78, f104 + (p14) FMA f105 = f15, f79, f105 + (p15) FMA f106 = f15, f87, f106 + ;; + (p13) STFD [YST1] = f100, 1 * SIZE + ;; + (p13) STFD [YST1] = f101, 1 * SIZE + ;; + (p13) STFD [YST1] = f102, 1 * SIZE + ;; + (p13) STFD [YST1] = f103, 1 * SIZE + ;; + (p14) STFD [YST1] = f104, 1 * SIZE + ;; + (p14) STFD [YST1] = f105, 1 * SIZE + ;; + (p15) STFD [YST1] = f106, 1 * SIZE + (p6) br.cond.dptk .L11 + ;; + .align 16 + + +.L20: + { .mmi + mov YLD1 = YY + mov YST1 = YY + tbit.z p6, p0 = N, 2 + } + ;; + { .mib + mov AO1 = A + mov pr.rot= 0 + (p6) br.cond.dpnt .L30 + } + ;; + { .mmi + LDFD f8 = [X], INCX + (p8) LDFD f106 = [YLD1], 1 * SIZE + add AO2 = LDA, A + } + ;; + { .mmi + LDFD f9 = [X], INCX + (p8) LDFD f80 = [AO1], 1 * SIZE + shladd AO4 = LDA, 1, AO2 + } + ;; + { .mmi + LDFD f10 = [X], INCX + (p8) LDFD f81 = [AO2], 1 * SIZE + shladd AO3 = LDA, 1, A + } + ;; + { .mmi + LDFD f11 = [X], INCX + (p8) LDFD f82 = [AO3], 1 * SIZE + } + ;; + { .mfi + (p8) LDFD f83 = [AO4], 1 * SIZE + FMPY f8 = ALPHA, f8 + adds PREB = RPREFETCH * SIZE, YLD1 + } + { .mfi + adds RPRE1 = RPREFETCH * SIZE, AO1 + FMPY f9 = ALPHA, f9 + adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 + } + ;; + FMPY f10 = ALPHA, f10 + shladd A = LDA, 2, A + FMPY f11 = ALPHA, f11 + ;; + { .mfi + adds RPRE3 = RPREFETCH * SIZE, AO3 + (p8) FMA f106 = f8, f80, f106 + mov ar.ec= 2 + } + ;; + adds RPRE4 = (RPREFETCH + 8) * SIZE, AO4 + (p8) FMA f106 = f9, f81, f106 + shr I = MM, 3 + ;; + { .mmf + cmp.eq p6, p0 = 0, I + cmp.eq p16, p0 = r0, r0 + (p8) FMA f106 = f10, f82, f106 + } + ;; + { .mfi + adds I = -1, I + (p8) FMA f106 = f11, f83, f106 + tbit.nz p13, p0 = MM, 2 + } + ;; + { .mib + (p8) STFD [YST1] = f106, 1 * SIZE + mov ar.lc = I + (p6) br.cond.dpnt .L25 + } + ;; + .align 16 + +.L22: + { .mfi + (p17) LDFPD f63, f64 = [AO4], 2 * SIZE + (p17) FMA f101 = f8, f33, f101 + (p16) tbit.nz.unc p14, p15 = I, 0 + } + { .mfi + (p16) LDFPD f100, f103 = [YLD1], 2 * SIZE + (p17) FMA f104 = f8, f34, f104 + } + ;; + { .mfi + (p16) LDFPD f32, f33 = [AO1], 2 * SIZE + (p17) FMA f107 = f8, f35, f107 + (p16) adds I = -1, I + } + { .mfi + (p14) PREFETCH [RPRE1], 16 * SIZE + (p17) FMA f110 = f8, f36, f110 + } + ;; + { .mfi + (p16) LDFPD f34, f35 = [AO1], 2 * SIZE + (p17) FMA f113 = f8, f37, f113 + } + { .mfi + (p16) LDFPD f106, f109 = [YLD1], 2 * SIZE + (p17) FMA f116 = f8, f38, f116 + } + ;; + { .mfi + (p16) LDFPD f36, f37 = [AO1], 2 * SIZE + (p17) FMA f119 = f8, f39, f119 + } + { .mfi + (p16) LDFPD f112, f115 = [YLD1], 2 * SIZE + (p17) FMA f122 = f8, f40, f122 + } + ;; + { .mfi + (p16) LDFPD f38, f39 = [AO1], 2 * SIZE + (p17) FMA f101 = f9, f41, f101 + } + { .mfi + (p16) LDFPD f118, f121 = [YLD1], 2 * SIZE + (p17) FMA f104 = f9, f42, f104 + } + ;; + { .mmf + (p16) LDFPD f40, f41 = [AO2], 2 * SIZE + (p15) PREFETCH [RPRE2], 16 * SIZE + (p17) FMA f107 = f9, f43, f107 + } + { .mfi + (p18) STFD [YST1] = f16, 1 * SIZE + (p17) FMA f110 
= f9, f44, f110 + } + ;; + { .mfi + (p16) LDFPD f42, f43 = [AO2], 2 * SIZE + (p17) FMA f113 = f9, f45, f113 + } + { .mfi + (p18) STFD [YST1] = f17, 1 * SIZE + (p17) FMA f116 = f9, f46, f116 + } + ;; + { .mfi + (p16) LDFPD f44, f45 = [AO2], 2 * SIZE + (p17) FMA f119 = f9, f47, f119 + } + { .mfi + (p18) STFD [YST1] = f18, 1 * SIZE + (p17) FMA f122 = f9, f48, f122 + } + ;; + { .mfi + (p16) LDFPD f46, f47 = [AO2], 2 * SIZE + (p17) FMA f101 = f10, f49, f101 + } + { .mfi + (p14) lfetch.excl.nt2 [PREB], 16 * SIZE + (p17) FMA f104 = f10, f50, f104 + } + ;; + { .mfi + (p16) LDFPD f48, f49 = [AO3], 2 * SIZE + (p17) FMA f107 = f10, f51, f107 + } + { .mfi + (p14) PREFETCH [RPRE3], 16 * SIZE + (p17) FMA f110 = f10, f52, f110 + } + ;; + { .mfi + (p16) LDFPD f50, f51 = [AO3], 2 * SIZE + (p17) FMA f113 = f10, f53, f113 + } + { .mfi + (p18) STFD [YST1] = f19, 1 * SIZE + (p17) FMA f116 = f10, f54, f116 + } + ;; + { .mfi + (p16) LDFPD f52, f53 = [AO3], 2 * SIZE + (p17) FMA f119 = f10, f55, f119 + } + { .mfi + (p18) STFD [YST1] = f20, 1 * SIZE + (p17) FMA f122 = f10, f56, f122 + } + ;; + { .mfi + (p16) LDFPD f54, f55 = [AO3], 2 * SIZE + (p17) FMA f16 = f11, f57, f101 + } + { .mfi + (p15) PREFETCH [RPRE4], 16 * SIZE + (p17) FMA f17 = f11, f58, f104 + } + ;; + { .mfi + (p16) LDFPD f56, f57 = [AO4], 2 * SIZE + (p17) FMA f18 = f11, f59, f107 + } + { .mfi + (p18) STFD [YST1] = f21, 1 * SIZE + (p17) FMA f19 = f11, f60, f110 + } + ;; + { .mfi + (p16) LDFPD f58, f59 = [AO4], 2 * SIZE + (p17) FMA f20 = f11, f61, f113 + } + { .mfi + (p18) STFD [YST1] = f22, 1 * SIZE + (p17) FMA f21 = f11, f62, f116 + } + ;; + { .mfi + (p16) LDFPD f60, f61 = [AO4], 2 * SIZE + (p17) FMA f22 = f11, f63, f119 + } + { .mfb + (p18) STFD [YST1] = f23, 1 * SIZE + (p17) FMA f23 = f11, f64, f122 + br.ctop.sptk.few .L22 + } + ;; + .align 16 + +.L25: + { .mmi + (p13) LDFPD f32, f33 = [AO1], 2 * SIZE + (p13) LDFPD f100, f101 = [YLD1], 2 * SIZE + tbit.nz p14, p0 = MM, 1 + } + { .mmi + (p18) STFD [YST1] = f16, 1 * SIZE + } + ;; + { .mmi + (p13) LDFPD f48, f49 = [AO1], 2 * SIZE + (p13) LDFPD f102, f103 = [YLD1], 2 * SIZE + tbit.nz p15, p0 = MM, 0 + } + { .mmi + (p18) STFD [YST1] = f17, 1 * SIZE + } + ;; + { .mmi + (p14) LDFPD f64, f65 = [AO1], 2 * SIZE + (p14) LDFPD f104, f105 = [YLD1], 2 * SIZE + } + { .mmi + (p18) STFD [YST1] = f18, 1 * SIZE + } + ;; + { .mmi + (p15) LDFD f80 = [AO1] + (p15) LDFD f106 = [YLD1], 1 * SIZE + } + { .mmi + (p18) STFD [YST1] = f19, 1 * SIZE + } + ;; + { .mmi + (p13) LDFPD f34, f35 = [AO2], 2 * SIZE + (p13) LDFPD f36, f37 = [AO3], 2 * SIZE + } + { .mmi + (p18) STFD [YST1] = f20, 1 * SIZE + } + ;; + { .mmi + (p13) LDFPD f50, f51 = [AO2], 2 * SIZE + (p13) LDFPD f52, f53 = [AO3], 2 * SIZE + } + { .mmi + (p18) STFD [YST1] = f21, 1 * SIZE + } + ;; + { .mmi + (p14) LDFPD f66, f67 = [AO2], 2 * SIZE + (p14) LDFPD f68, f69 = [AO3], 2 * SIZE + } + { .mmi + (p18) STFD [YST1] = f22, 1 * SIZE + } + ;; + { .mmf + (p15) LDFD f81 = [AO2] + (p15) LDFD f82 = [AO3] + (p13) FMA f100 = f8, f32, f100 + } + { .mfi + (p18) STFD [YST1] = f23, 1 * SIZE + (p13) FMA f101 = f8, f33, f101 + } + ;; + ;; + { .mfi + (p13) LDFPD f38, f39 = [AO4], 2 * SIZE + (p13) FMA f102 = f8, f48, f102 + } + { .mfi + (p13) FMA f103 = f8, f49, f103 + } + ;; + { .mfi + (p13) LDFPD f54, f55 = [AO4], 2 * SIZE + (p14) FMA f104 = f8, f64, f104 + } + { .mfi + (p14) FMA f105 = f8, f65, f105 + } + ;; + { .mfi + (p14) LDFPD f70, f71 = [AO4], 2 * SIZE + (p15) FMA f106 = f8, f80, f106 + } + { .mfi + (p13) FMA f100 = f9, f34, f100 + } + ;; + { .mfi + (p15) LDFD f83 = [AO4] + (p13) 
FMA f101 = f9, f35, f101 + } + { .mfi + (p13) FMA f102 = f9, f50, f102 + } + ;; + (p13) FMA f103 = f9, f51, f103 + (p14) FMA f104 = f9, f66, f104 + (p14) FMA f105 = f9, f67, f105 + (p15) FMA f106 = f9, f81, f106 + ;; + (p13) FMA f100 = f10, f36, f100 + (p13) FMA f101 = f10, f37, f101 + (p13) FMA f102 = f10, f52, f102 + (p13) FMA f103 = f10, f53, f103 + (p14) FMA f104 = f10, f68, f104 + (p14) FMA f105 = f10, f69, f105 + (p15) FMA f106 = f10, f82, f106 + ;; + (p13) FMA f100 = f11, f38, f100 + (p13) FMA f101 = f11, f39, f101 + ;; + (p13) FMA f102 = f11, f54, f102 + (p13) STFD [YST1] = f100, 1 * SIZE + (p13) FMA f103 = f11, f55, f103 + ;; + (p13) STFD [YST1] = f101, 1 * SIZE + (p14) FMA f104 = f11, f70, f104 + ;; + (p13) STFD [YST1] = f102, 1 * SIZE + (p14) FMA f105 = f11, f71, f105 + ;; + (p13) STFD [YST1] = f103, 1 * SIZE + (p15) FMA f106 = f11, f83, f106 + ;; + (p14) STFD [YST1] = f104, 1 * SIZE + ;; + (p14) STFD [YST1] = f105, 1 * SIZE + ;; + (p15) STFD [YST1] = f106, 1 * SIZE + ;; + .align 16 + +.L30: + { .mmi + mov YLD1 = YY + mov YST1 = YY + tbit.z p6, p0 = N, 1 + } + ;; + { .mib + mov AO1 = A + mov pr.rot= 0 + (p6) br.cond.dpnt .L40 + } + ;; + { .mmi + LDFD f8 = [X], INCX + (p8) LDFD f106 = [YLD1], 1 * SIZE + add AO2 = LDA, A + } + ;; + { .mmi + LDFD f9 = [X], INCX + (p8) LDFD f80 = [AO1], 1 * SIZE + shladd A = LDA, 1, A + } + ;; + adds PREB = RPREFETCH * SIZE, YLD1 + FMPY f8 = ALPHA, f8 + mov ar.ec= 2 + adds RPRE1 = RPREFETCH * SIZE, AO1 + FMPY f9 = ALPHA, f9 + shr I = MM, 3 + ;; + (p8) LDFD f81 = [AO2], 1 * SIZE + cmp.eq p6, p0 = 0, I + ;; + (p8) FMA f106 = f8, f80, f106 + adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 + tbit.nz p13, p0 = MM, 2 + ;; + (p8) FMA f106 = f9, f81, f106 + cmp.eq p16, p0 = r0, r0 + adds I = -1, I + ;; + { .mib + (p8) STFD [YST1] = f106, 1 * SIZE + mov ar.lc = I + (p6) br.cond.dpnt .L35 + } + ;; + .align 16 + +.L32: + { .mfi + (p17) LDFPD f47, f48 = [AO2], 2 * SIZE + (p17) FMA f101 = f8, f33, f101 + (p16) tbit.nz.unc p14, p15 = I, 0 + } + { .mmf + (p16) LDFPD f100, f103 = [YLD1], 2 * SIZE + (p18) STFD [YST1] = f16, 1 * SIZE + (p17) FMA f104 = f8, f34, f104 + } + ;; + { .mfi + (p16) LDFPD f32, f33 = [AO1], 2 * SIZE + (p17) FMA f107 = f8, f35, f107 + adds I = -1, I + } + { .mmf + (p14) PREFETCH [RPRE1], 16 * SIZE + (p18) STFD [YST1] = f17, 1 * SIZE + (p17) FMA f110 = f8, f36, f110 + } + ;; + { .mfi + (p16) LDFPD f34, f35 = [AO1], 2 * SIZE + (p17) FMA f113 = f8, f37, f113 + } + { .mmf + (p16) LDFPD f106, f109 = [YLD1], 2 * SIZE + (p18) STFD [YST1] = f18, 1 * SIZE + (p17) FMA f116 = f8, f38, f116 + } + ;; + { .mfi + (p16) LDFPD f36, f37 = [AO1], 2 * SIZE + (p17) FMA f119 = f8, f39, f119 + } + { .mmf + (p16) LDFPD f112, f115 = [YLD1], 2 * SIZE + (p18) STFD [YST1] = f19, 1 * SIZE + (p17) FMA f122 = f8, f40, f122 + } + ;; + { .mfi + (p16) LDFPD f38, f39 = [AO1], 2 * SIZE + (p17) FMA f16 = f9, f41, f101 + } + { .mmf + (p16) LDFPD f118, f121 = [YLD1], 2 * SIZE + (p18) STFD [YST1] = f20, 1 * SIZE + (p17) FMA f17 = f9, f42, f104 + } + ;; + { .mfi + (p16) LDFPD f40, f41 = [AO2], 2 * SIZE + (p17) FMA f18 = f9, f43, f107 + } + { .mmf + (p15) PREFETCH [RPRE2], 16 * SIZE + (p18) STFD [YST1] = f21, 1 * SIZE + (p17) FMA f19 = f9, f44, f110 + } + ;; + { .mfi + (p16) LDFPD f42, f43 = [AO2], 2 * SIZE + (p17) FMA f20 = f9, f45, f113 + } + { .mmf + (p14) PREFETCH [PREB], 16 * SIZE + (p18) STFD [YST1] = f22, 1 * SIZE + (p17) FMA f21 = f9, f46, f116 + } + ;; + { .mfi + (p16) LDFPD f44, f45 = [AO2], 2 * SIZE + (p17) FMA f22 = f9, f47, f119 + } + { .mfb + (p18) STFD [YST1] = f23, 1 * SIZE + 
(p17) FMA f23 = f9, f48, f122 + br.ctop.sptk.few .L32 + } + ;; + .align 16 + +.L35: + { .mmi + (p13) LDFPD f32, f33 = [AO1], 2 * SIZE + (p13) LDFPD f100, f101 = [YLD1], 2 * SIZE + tbit.nz p14, p0 = MM, 1 + } + { .mmi + (p18) STFD [YST1] = f16, 1 * SIZE + } + ;; + { .mmi + (p13) LDFPD f48, f49 = [AO1], 2 * SIZE + (p13) LDFPD f102, f103 = [YLD1], 2 * SIZE + tbit.nz p15, p0 = MM, 0 + } + { .mmi + (p18) STFD [YST1] = f17, 1 * SIZE + } + ;; + { .mmi + (p14) LDFPD f64, f65 = [AO1], 2 * SIZE + (p14) LDFPD f104, f105 = [YLD1], 2 * SIZE + } + { .mmi + (p18) STFD [YST1] = f18, 1 * SIZE + } + ;; + { .mmi + (p15) LDFD f80 = [AO1] + (p15) LDFD f106 = [YLD1], 1 * SIZE + } + { .mmi + (p18) STFD [YST1] = f19, 1 * SIZE + } + ;; + { .mmi + (p13) LDFPD f34, f35 = [AO2], 2 * SIZE + (p18) STFD [YST1] = f20, 1 * SIZE + } + ;; + { .mmi + (p13) LDFPD f50, f51 = [AO2], 2 * SIZE + (p18) STFD [YST1] = f21, 1 * SIZE + } + ;; + { .mmi + (p14) LDFPD f66, f67 = [AO2], 2 * SIZE + (p18) STFD [YST1] = f22, 1 * SIZE + } + ;; + { .mmi + (p15) LDFD f81 = [AO2] + (p18) STFD [YST1] = f23, 1 * SIZE + } + ;; + (p13) FMA f100 = f8, f32, f100 + (p13) FMA f101 = f8, f33, f101 + (p13) FMA f102 = f8, f48, f102 + (p13) FMA f103 = f8, f49, f103 + (p14) FMA f104 = f8, f64, f104 + (p14) FMA f105 = f8, f65, f105 + (p15) FMA f106 = f8, f80, f106 + ;; + (p13) FMA f100 = f9, f34, f100 + (p13) FMA f101 = f9, f35, f101 + (p13) FMA f102 = f9, f50, f102 + (p13) FMA f103 = f9, f51, f103 + (p14) FMA f104 = f9, f66, f104 + (p14) FMA f105 = f9, f67, f105 + (p15) FMA f106 = f9, f81, f106 + ;; + (p13) STFD [YST1] = f100, 1 * SIZE + ;; + (p13) STFD [YST1] = f101, 1 * SIZE + ;; + (p13) STFD [YST1] = f102, 1 * SIZE + ;; + (p13) STFD [YST1] = f103, 1 * SIZE + ;; + (p14) STFD [YST1] = f104, 1 * SIZE + ;; + (p14) STFD [YST1] = f105, 1 * SIZE + ;; + (p15) STFD [YST1] = f106, 1 * SIZE + ;; + .align 16 + +.L40: + { .mmi + mov YLD1 = YY + mov YST1 = YY + tbit.z p6, p0 = N, 0 + } + ;; + { .mib + mov AO1 = A + mov pr.rot= 0 + (p6) br.cond.dpnt .L990 + } + ;; + { .mmi + LDFD f8 = [X], INCX + (p8) LDFD f106 = [YLD1], 1 * SIZE + adds RPRE1 = RPREFETCH * SIZE, AO1 + } + ;; + { .mii + (p8) LDFD f80 = [AO1], 1 * SIZE + adds PREB = RPREFETCH * SIZE, YLD1 + } + ;; + FMPY f8 = ALPHA, f8 + shr I = MM, 3 + ;; + (p8) FMA f106 = f8, f80, f106 + mov ar.ec= 3 + ;; + { .mmi + cmp.eq p6, p0 = 0, I + cmp.eq p16, p0 = r0, r0 + tbit.nz p14, p15 = r0, 0 + } + ;; + { .mmi + adds YST2 = 4 * SIZE, YST1 + adds I = -1, I + tbit.nz p13, p0 = MM, 2 + } + ;; + { .mmi + (p8) STFD [YST1] = f106, 1 * SIZE + (p8) adds YST2 = 1 * SIZE, YST2 + } + { .mib + mov ar.lc = I + (p6) br.cond.dpnt .L145 + } + ;; + .align 16 + +.L42: + { .mmf + (p19) STFD [YST1] = f16, 1 * SIZE + (p19) STFD [YST2] = f20, 1 * SIZE + (p18) FMA f16 = f8, f34, f102 + } + { .mmf + (p16) LDFPD f32, f35 = [AO1], 2 * SIZE + (p16) LDFPD f100, f103 = [YLD1], 2 * SIZE + (p18) FMA f20 = f8, f46, f114 + } + ;; + { .mmf + (p19) STFD [YST1] = f17, 1 * SIZE + (p19) STFD [YST2] = f21, 1 * SIZE + (p18) FMA f17 = f8, f37, f105 + } + { .mmf + (p16) LDFPD f38, f41 = [AO1], 2 * SIZE + (p16) LDFPD f106, f109 = [YLD1], 2 * SIZE + (p18) FMA f21 = f8, f49, f117 + } + ;; + { .mmf + (p19) STFD [YST1] = f18, 1 * SIZE + (p19) STFD [YST2] = f22, 1 * SIZE + (p18) FMA f18 = f8, f40, f108 + } + { .mmf + (p16) LDFPD f44, f47 = [AO1], 2 * SIZE + (p16) LDFPD f112, f115 = [YLD1], 2 * SIZE + (p18) FMA f22 = f8, f52, f120 + } + ;; + { .mmf + (p19) STFD [YST1] = f19, 5 * SIZE + (p19) STFD [YST2] = f23, 5 * SIZE + (p18) FMA f19 = f8, f43, f111 + } + { .mmf + (p16) 
LDFPD f50, f53 = [AO1], 2 * SIZE + (p16) LDFPD f118, f121 = [YLD1], 2 * SIZE + (p18) FMA f23 = f8, f55, f123 + } + ;; + { .mmi + (p14) PREFETCH [RPRE1], 16 * SIZE + (p14) PREFETCH [PREB], 16 * SIZE + (p16) tbit.nz.unc p14, p15 = I, 0 + } + { .mib + nop __LINE__ + (p16) adds I = -1, I + br.ctop.sptk.few .L42 + } + ;; + .align 16 + +.L45: + { .mmi + (p19) STFD [YST1] = f16, 1 * SIZE + (p19) STFD [YST2] = f20, 1 * SIZE + tbit.nz p14, p0 = MM, 1 + } + { .mmi + (p13) LDFPD f32, f33 = [AO1], 2 * SIZE + (p13) LDFPD f100, f101 = [YLD1], 2 * SIZE + } + ;; + { .mmi + (p19) STFD [YST1] = f17, 1 * SIZE + (p19) STFD [YST2] = f21, 1 * SIZE + tbit.nz p15, p0 = MM, 0 + } + { .mmi + (p13) LDFPD f48, f49 = [AO1], 2 * SIZE + (p13) LDFPD f102, f103 = [YLD1], 2 * SIZE + } + ;; + { .mmi + (p19) STFD [YST1] = f18, 1 * SIZE + (p19) STFD [YST2] = f22, 1 * SIZE + } + { .mmi + (p14) LDFPD f64, f65 = [AO1], 2 * SIZE + (p14) LDFPD f104, f105 = [YLD1], 2 * SIZE + } + ;; + { .mmi + (p19) STFD [YST1] = f19, 5 * SIZE + (p19) STFD [YST2] = f23, 5 * SIZE + } + { .mmi + (p15) LDFD f80 = [AO1] + (p15) LDFD f106 = [YLD1], 1 * SIZE + } + ;; + (p13) FMA f100 = f8, f32, f100 + (p13) FMA f101 = f8, f33, f101 + (p13) FMA f102 = f8, f48, f102 + (p13) FMA f103 = f8, f49, f103 + ;; + (p13) STFD [YST1] = f100, 1 * SIZE + (p14) FMA f104 = f8, f64, f104 + ;; + (p13) STFD [YST1] = f101, 1 * SIZE + (p14) FMA f105 = f8, f65, f105 + ;; + (p13) STFD [YST1] = f102, 1 * SIZE + (p15) FMA f106 = f8, f80, f106 + ;; + (p13) STFD [YST1] = f103, 1 * SIZE + ;; + (p14) STFD [YST1] = f104, 1 * SIZE + ;; + (p14) STFD [YST1] = f105, 1 * SIZE + ;; + (p15) STFD [YST1] = f106, 1 * SIZE + br .L990 + ;; + .align 16 + +.L100: + shr J = N, 3 + ;; + cmp.eq p6, p0 = r0, J + (p6) br.cond.dpnt .L120 + ;; + .align 16 + +.L111: + mov YLD1 = YY + mov YST1 = YY + ;; + LDFD f8 = [X], INCX + ;; + LDFD f9 = [X], INCX + ;; + LDFD f10 = [X], INCX + ;; + LDFD f11 = [X], INCX + ;; + LDFD f12 = [X], INCX + ;; + LDFD f13 = [X], INCX + ;; + LDFD f14 = [X], INCX + ;; + LDFD f15 = [X], INCX + ;; + FMPY f8 = ALPHA, f8 + FMPY f9 = ALPHA, f9 + FMPY f10 = ALPHA, f10 + FMPY f11 = ALPHA, f11 + FMPY f12 = ALPHA, f12 + FMPY f13 = ALPHA, f13 + FMPY f14 = ALPHA, f14 + FMPY f15 = ALPHA, f15 + ;; + mov AO1 = A + add AO2 = LDA, A + ;; + shladd AO3 = LDA, 1, A + shladd AO4 = LDA, 1, AO2 + ;; + shladd AO5 = LDA, 1, AO3 + shladd AO6 = LDA, 1, AO4 + ;; + shladd AO7 = LDA, 1, AO5 + shladd AO8 = LDA, 1, AO6 + shladd A = LDA, 3, A + ;; + ;; + adds PREB = RPREFETCH * SIZE, YLD1 + adds RPRE1 = RPREFETCH * SIZE, AO1 + adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 + adds RPRE3 = RPREFETCH * SIZE, AO3 + adds RPRE4 = (RPREFETCH + 8) * SIZE, AO4 + adds RPRE5 = RPREFETCH * SIZE, AO5 + adds RPRE6 = (RPREFETCH + 8) * SIZE, AO6 + adds RPRE7 = RPREFETCH * SIZE, AO7 + adds RPRE8 = (RPREFETCH + 8) * SIZE, AO8 + + (p8) LDFD f80 = [AO1], 1 * SIZE + (p8) LDFD f81 = [AO2], 1 * SIZE + (p8) LDFD f82 = [AO3], 1 * SIZE + (p8) LDFD f83 = [AO4], 1 * SIZE + (p8) LDFD f84 = [AO5], 1 * SIZE + (p8) LDFD f85 = [AO6], 1 * SIZE + (p8) LDFD f86 = [AO7], 1 * SIZE + (p8) LDFD f87 = [AO8], 1 * SIZE + (p8) LDFD f106 = [YLD1], 1 * SIZE + ;; + (p8) FMPY f32 = f8, f80 + (p8) FMPY f33 = f9, f81 + (p8) FMPY f34 = f10, f82 + (p8) FMA f35 = f11, f83, f106 + ;; + (p8) FMA f32 = f12, f84, f32 + (p8) FMA f33 = f13, f85, f33 + (p8) FMA f34 = f14, f86, f34 + (p8) FMA f35 = f15, f87, f35 + ;; + (p8) FADD f32 = f32, f33 + (p8) FADD f34 = f34, f35 + ;; + (p8) FADD f32 = f32, f34 + ;; + (p8) STFD [YST1] = f32, 1 * SIZE + + shr I = MM, 3 + mov pr.rot= 0 + ;; 
+ cmp.eq p6, p0 = 0, I + cmp.eq p16, p0 = r0, r0 + ;; + adds I = -1, I + tbit.nz p13, p0 = MM, 2 + ;; + mov ar.lc = I + mov ar.ec= 2 + (p6) br.cond.dpnt .L115 + ;; + .align 16 + +.L112: + { .mfi + (p17) LDFD f96 = [AO8], 1 * SIZE + (p17) FMA f101 = f8, f33, f101 + (p16) tbit.nz.unc p14, p15 = I, 0 + } + { .mfi + (p17) FMA f104 = f8, f34, f104 + } + ;; + { .mfi + (p16) LDFPD f32, f33 = [AO1], 2 * SIZE + (p17) FMA f107 = f8, f35, f107 + } + { .mfi + (p14) PREFETCH [RPRE1], 16 * SIZE + (p17) FMA f110 = f8, f36, f110 + } + ;; + { .mfi + (p16) LDFPD f34, f35 = [AO1], 2 * SIZE + (p17) FMA f113 = f8, f37, f113 + } + { .mfi + (p16) LDFPD f100, f103 = [YLD1], 2 * SIZE + (p17) FMA f116 = f8, f38, f116 + } + ;; + { .mfi + (p16) LDFPD f36, f37 = [AO1], 2 * SIZE + (p17) FMA f119 = f8, f39, f119 + } + { .mfi + (p16) LDFPD f106, f109 = [YLD1], 2 * SIZE + (p17) FMA f122 = f8, f40, f122 + } + ;; + { .mfi + (p16) LDFPD f38, f39 = [AO1], 2 * SIZE + (p17) FMA f101 = f9, f41, f101 + } + { .mmf + (p16) LDFPD f112, f115 = [YLD1], 2 * SIZE + (p16) LDFD f40 = [AO2], 1 * SIZE + (p17) FMA f104 = f9, f42, f104 + } + ;; + { .mfi + (p16) LDFPD f41, f42 = [AO2], 2 * SIZE + (p17) FMA f107 = f9, f43, f107 + } + { .mfi + (p15) PREFETCH [RPRE2], 16 * SIZE + (p17) FMA f110 = f9, f44, f110 + } + ;; + { .mfi + (p16) LDFPD f43, f44 = [AO2], 2 * SIZE + (p17) FMA f113 = f9, f45, f113 + } + { .mfi + (p16) LDFPD f118, f121 = [YLD1], 2 * SIZE + (p17) FMA f116 = f9, f46, f116 + } + ;; + { .mfi + (p16) LDFPD f45, f46 = [AO2], 2 * SIZE + (p17) FMA f119 = f9, f47, f119 + } + { .mfi + (p18) STFD [YST1] = f16, 1 * SIZE + (p17) FMA f122 = f9, f48, f122 + } + ;; + { .mfi + (p16) LDFD f47 = [AO2], 1 * SIZE + (p17) FMA f101 = f10, f49, f101 + } + { .mfi + (p18) STFD [YST1] = f17, 1 * SIZE + (p17) FMA f104 = f10, f50, f104 + } + ;; + { .mfi + (p16) LDFPD f48, f49 = [AO3], 2 * SIZE + (p17) FMA f107 = f10, f51, f107 + } + { .mfi + (p14) PREFETCH [RPRE3], 16 * SIZE + (p17) FMA f110 = f10, f52, f110 + } + ;; + { .mfi + (p16) LDFPD f50, f51 = [AO3], 2 * SIZE + (p17) FMA f113 = f10, f53, f113 + } + { .mfi + (p17) FMA f116 = f10, f54, f116 + } + ;; + { .mfi + (p16) LDFPD f52, f53 = [AO3], 2 * SIZE + (p17) FMA f119 = f10, f55, f119 + } + { .mfi + (p18) STFD [YST1] = f18, 1 * SIZE + (p17) FMA f122 = f10, f56, f122 + } + ;; + { .mfi + (p16) LDFPD f54, f55 = [AO3], 2 * SIZE + (p17) FMA f101 = f11, f57, f101 + } + { .mmf + (p18) STFD [YST1] = f19, 1 * SIZE + (p16) LDFD f56 = [AO4], 1 * SIZE + (p17) FMA f104 = f11, f58, f104 + } + ;; + { .mfi + (p16) LDFPD f57, f58 = [AO4], 2 * SIZE + (p17) FMA f107 = f11, f59, f107 + } + { .mfi + (p15) PREFETCH [RPRE4], 16 * SIZE + (p17) FMA f110 = f11, f60, f110 + } + ;; + { .mfi + (p16) LDFPD f59, f60 = [AO4], 2 * SIZE + (p17) FMA f113 = f11, f61, f113 + } + { .mfi + (p17) FMA f116 = f11, f62, f116 + } + ;; + { .mfi + (p16) LDFPD f61, f62 = [AO4], 2 * SIZE + (p17) FMA f119 = f11, f63, f119 + } + { .mfi + (p17) FMA f122 = f11, f64, f122 + } + ;; + { .mfi + (p16) LDFD f63 = [AO4], 1 * SIZE + (p17) FMA f101 = f12, f65, f101 + } + { .mfi + (p18) STFD [YST1] = f20, 1 * SIZE + (p17) FMA f104 = f12, f66, f104 + } + ;; + { .mfi + (p16) LDFPD f64, f65 = [AO5], 2 * SIZE + (p17) FMA f107 = f12, f67, f107 + } + { .mfi + (p18) STFD [YST1] = f21, 1 * SIZE + (p17) FMA f110 = f12, f68, f110 + } + ;; + { .mfi + (p16) LDFPD f66, f67 = [AO5], 2 * SIZE + (p17) FMA f113 = f12, f69, f113 + } + { .mfi + (p14) PREFETCH [RPRE5], 16 * SIZE + (p17) FMA f116 = f12, f70, f116 + } + ;; + { .mfi + (p16) LDFPD f68, f69 = [AO5], 2 * SIZE + (p17) FMA 
f119 = f12, f71, f119 + } + { .mfi + (p18) STFD [YST1] = f22, 1 * SIZE + (p17) FMA f122 = f12, f72, f122 + } + ;; + { .mfi + (p16) LDFPD f70, f71 = [AO5], 2 * SIZE + (p17) FMA f101 = f13, f73, f101 + } + { .mmf + (p18) STFD [YST1] = f23, 1 * SIZE + (p16) LDFD f72 = [AO6], 1 * SIZE + (p17) FMA f104 = f13, f74, f104 + } + ;; + { .mfi + (p16) LDFPD f73, f74 = [AO6], 2 * SIZE + (p17) FMA f107 = f13, f75, f107 + } + { .mfi + (p15) PREFETCH [RPRE6], 16 * SIZE + (p17) FMA f110 = f13, f76, f110 + } + ;; + { .mfi + (p16) LDFPD f75, f76 = [AO6], 2 * SIZE + (p17) FMA f113 = f13, f77, f113 + } + { .mfi + (p17) FMA f116 = f13, f78, f116 + } + ;; + { .mfi + (p16) LDFPD f77, f78 = [AO6], 2 * SIZE + (p17) FMA f119 = f13, f79, f119 + } + { .mfi + (p17) FMA f122 = f13, f80, f122 + } + ;; + { .mfi + (p16) LDFD f79 = [AO6], 1 * SIZE + (p17) FMA f101 = f14, f81, f101 + } + { .mfi + (p17) FMA f104 = f14, f82, f104 + } + ;; + { .mfi + (p16) LDFPD f80, f81 = [AO7], 2 * SIZE + (p17) FMA f107 = f14, f83, f107 + } + { .mfi + (p14) PREFETCH [RPRE7], 16 * SIZE + (p17) FMA f110 = f14, f84, f110 + } + ;; + { .mfi + (p16) LDFPD f82, f83 = [AO7], 2 * SIZE + (p17) FMA f113 = f14, f85, f113 + } + { .mfi + (p17) FMA f116 = f14, f86, f116 + } + ;; + { .mfi + (p16) LDFPD f84, f85 = [AO7], 2 * SIZE + (p17) FMA f119 = f14, f87, f119 + } + { .mfi + (p17) FMA f122 = f14, f88, f122 + } + ;; + { .mfi + (p16) LDFPD f86, f87 = [AO7], 2 * SIZE + (p17) FMA f16 = f15, f89, f101 + } + { .mfi + (p16) LDFD f88 = [AO8], 1 * SIZE + (p17) FMA f17 = f15, f90, f104 + } + ;; + { .mfi + (p16) LDFPD f89, f90 = [AO8], 2 * SIZE + (p17) FMA f18 = f15, f91, f107 + } + { .mfi + (p15) PREFETCH [RPRE8], 16 * SIZE + (p17) FMA f19 = f15, f92, f110 + } + ;; + { .mfi + (p16) LDFPD f91, f92 = [AO8], 2 * SIZE + (p17) FMA f20 = f15, f93, f113 + } + { .mfi + (p14) lfetch.excl.nt2 [PREB], 16 * SIZE + (p17) FMA f21 = f15, f94, f116 + } + ;; + { .mfi + (p16) LDFPD f93, f94 = [AO8], 2 * SIZE + (p17) FMA f22 = f15, f95, f119 + } + { .mfb + (p16) adds I = -1, I + (p17) FMA f23 = f15, f96, f122 + br.ctop.sptk.few .L112 + } + ;; + .align 16 + +.L115: + { .mmi + (p13) LDFPD f32, f33 = [AO1], 2 * SIZE + (p13) LDFPD f100, f101 = [YLD1], 2 * SIZE + tbit.nz p14, p0 = MM, 1 + } + { .mmi + (p18) STFD [YST1] = f16, 1 * SIZE + cmp.lt p6, p0 = 1, J + adds J = -1, J + } + ;; + { .mmi + (p13) LDFPD f48, f49 = [AO1], 2 * SIZE + (p13) LDFPD f102, f103 = [YLD1], 2 * SIZE + tbit.nz p15, p0 = MM, 0 + } + { .mmi + (p18) STFD [YST1] = f17, 1 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mmi + (p14) LDFPD f64, f65 = [AO1], 2 * SIZE + (p14) LDFPD f104, f105 = [YLD1], 2 * SIZE + nop __LINE__ + } + { .mmi + (p18) STFD [YST1] = f18, 1 * SIZE + (p13) LDFD f34 = [AO2], 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p13) LDFPD f35, f50 = [AO2], 2 * SIZE + (p13) LDFPD f36, f37 = [AO3], 2 * SIZE + nop __LINE__ + } + { .mmi + (p18) STFD [YST1] = f19, 1 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mmi + (p15) LDFD f80 = [AO1] + (p15) LDFD f106 = [YLD1], 1 * SIZE + nop __LINE__ + } + { .mmi + (p18) STFD [YST1] = f20, 1 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mmi + (p13) LDFD f51 = [AO2], 1 * SIZE + (p13) LDFPD f52, f53 = [AO3], 2 * SIZE + nop __LINE__ + } + { .mmi + (p18) STFD [YST1] = f21, 1 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mmi + (p14) LDFD f66 = [AO2], 1 * SIZE + (p14) LDFPD f68, f69 = [AO3], 2 * SIZE + nop __LINE__ + } + { .mmi + (p18) STFD [YST1] = f22, 1 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mmi + (p14) LDFD f67 = [AO2], 1 * SIZE + (p15) LDFD f82 
= [AO3] + nop __LINE__ + } + { .mmi + (p18) STFD [YST1] = f23, 1 * SIZE + nop __LINE__ + } + ;; + { .mmf + (p15) LDFD f81 = [AO2] + (p13) LDFD f38 = [AO4], 1 * SIZE + (p13) FMA f100 = f8, f32, f100 + } + { .mfi + (p13) LDFPD f40, f41 = [AO5], 2 * SIZE + (p13) FMA f101 = f8, f33, f101 + nop __LINE__ + } + ;; + { .mfi + (p13) LDFPD f39, f54 = [AO4], 2 * SIZE + (p13) FMA f102 = f8, f48, f102 + nop __LINE__ + } + { .mfi + (p13) LDFPD f56, f57 = [AO5], 2 * SIZE + (p13) FMA f103 = f8, f49, f103 + nop __LINE__ + } + ;; + { .mfi + (p13) LDFD f55 = [AO4], 1 * SIZE + (p14) FMA f104 = f8, f64, f104 + nop __LINE__ + } + { .mfi + (p14) LDFPD f72, f73 = [AO5], 2 * SIZE + (p14) FMA f105 = f8, f65, f105 + nop __LINE__ + } + ;; + { .mfi + (p14) LDFD f70 = [AO4], 1 * SIZE + (p15) FMA f106 = f8, f80, f106 + nop __LINE__ + } + { .mmi + (p15) LDFD f84 = [AO5] + (p13) LDFD f42 = [AO6], 1 * SIZE + nop __LINE__ + } + ;; + { .mmf + (p13) LDFPD f43, f58 = [AO6], 2 * SIZE + (p14) LDFD f71 = [AO4], 1 * SIZE + (p13) FMA f100 = f9, f34, f100 + } + { .mfi + (p13) LDFPD f44, f45 = [AO7], 2 * SIZE + (p13) FMA f101 = f9, f35, f101 + nop __LINE__ + } + ;; + { .mmf + (p13) LDFD f59 = [AO6], 1 * SIZE + (p15) LDFD f83 = [AO4] + (p13) FMA f102 = f9, f50, f102 + } + { .mfi + (p13) LDFPD f60, f61 = [AO7], 2 * SIZE + (p13) FMA f103 = f9, f51, f103 + nop __LINE__ + } + ;; + { .mfi + (p14) LDFD f74 = [AO6], 1 * SIZE + (p14) FMA f104 = f9, f66, f104 + nop __LINE__ + } + { .mfi + (p14) LDFPD f76, f77 = [AO7], 2 * SIZE + (p14) FMA f105 = f9, f67, f105 + nop __LINE__ + } + ;; + { .mfi + (p14) LDFD f75 = [AO6], 1 * SIZE + (p15) FMA f106 = f9, f81, f106 + nop __LINE__ + } + { .mmi + (p15) LDFD f86 = [AO7] + (p13) LDFD f46 = [AO8], 1 * SIZE + nop __LINE__ + } + ;; + { .mmf + (p13) LDFPD f47, f62 = [AO8], 2 * SIZE + (p15) LDFD f85 = [AO6] + (p13) FMA f100 = f10, f36, f100 + } + { .mfi + (p13) FMA f101 = f10, f37, f101 + nop __LINE__ + } + ;; + { .mfi + (p13) LDFD f63 = [AO8], 1 * SIZE + (p13) FMA f102 = f10, f52, f102 + nop __LINE__ + } + { .mfi + (p13) FMA f103 = f10, f53, f103 + nop __LINE__ + } + ;; + { .mfi + (p14) LDFD f78 = [AO8], 1 * SIZE + (p14) FMA f104 = f10, f68, f104 + nop __LINE__ + } + { .mfi + (p14) FMA f105 = f10, f69, f105 + nop __LINE__ + } + ;; + { .mfi + (p14) LDFD f79 = [AO8], 1 * SIZE + (p15) FMA f106 = f10, f82, f106 + nop __LINE__ + } + ;; + (p15) LDFD f87 = [AO8] + (p13) FMA f100 = f11, f38, f100 + (p13) FMA f101 = f11, f39, f101 + (p13) FMA f102 = f11, f54, f102 + (p13) FMA f103 = f11, f55, f103 + (p14) FMA f104 = f11, f70, f104 + (p14) FMA f105 = f11, f71, f105 + (p15) FMA f106 = f11, f83, f106 + ;; + (p13) FMA f100 = f12, f40, f100 + (p13) FMA f101 = f12, f41, f101 + (p13) FMA f102 = f12, f56, f102 + (p13) FMA f103 = f12, f57, f103 + (p14) FMA f104 = f12, f72, f104 + (p14) FMA f105 = f12, f73, f105 + (p15) FMA f106 = f12, f84, f106 + ;; + (p13) FMA f100 = f13, f42, f100 + (p13) FMA f101 = f13, f43, f101 + (p13) FMA f102 = f13, f58, f102 + (p13) FMA f103 = f13, f59, f103 + (p14) FMA f104 = f13, f74, f104 + (p14) FMA f105 = f13, f75, f105 + (p15) FMA f106 = f13, f85, f106 + ;; + (p13) FMA f100 = f14, f44, f100 + (p13) FMA f101 = f14, f45, f101 + (p13) FMA f102 = f14, f60, f102 + (p13) FMA f103 = f14, f61, f103 + (p14) FMA f104 = f14, f76, f104 + (p14) FMA f105 = f14, f77, f105 + (p15) FMA f106 = f14, f86, f106 + ;; + (p13) FMA f100 = f15, f46, f100 + (p13) FMA f101 = f15, f47, f101 + (p13) FMA f102 = f15, f62, f102 + (p13) FMA f103 = f15, f63, f103 + (p14) FMA f104 = f15, f78, f104 + (p14) FMA f105 = f15, f79, f105 
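(The predicated tail code here does not loop over the leftover rows: each predicate covers one power-of-two slice of the row remainder, with p13 set from bit 2 of MM (four rows), p14 from bit 1 (two rows) and p15 from bit 0 (one row). In outline, and only as a sketch with illustrative names:

    /* Rough C shape of the p13/p14/p15 remainder handling. */
    static void gemv_n_tail(long m_rem, int ncols, const double *xs,
                            const double *a, long lda, double *ybuf)
    {
        long i = 0;
        for (int chunk = 4; chunk >= 1; chunk >>= 1) {   /* p13, p14, p15 */
            if (m_rem & chunk)
                for (int r = 0; r < chunk; r++, i++)
                    for (int k = 0; k < ncols; k++)
                        ybuf[i] += xs[k] * a[i + k * lda];
        }
    }

Unrolling the remainder by predicate bits instead of a scalar loop keeps the cleanup branch-free on IA64.)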
+ (p15) FMA f106 = f15, f87, f106 + ;; + (p13) STFD [YST1] = f100, 1 * SIZE + ;; + (p13) STFD [YST1] = f101, 1 * SIZE + ;; + (p13) STFD [YST1] = f102, 1 * SIZE + ;; + (p13) STFD [YST1] = f103, 1 * SIZE + ;; + (p14) STFD [YST1] = f104, 1 * SIZE + ;; + (p14) STFD [YST1] = f105, 1 * SIZE + ;; + (p15) STFD [YST1] = f106, 1 * SIZE + (p6) br.cond.dptk .L111 + ;; + .align 16 + +.L120: + { .mmi + mov YLD1 = YY + mov YST1 = YY + tbit.z p6, p0 = N, 2 + } + ;; + { .mib + mov AO1 = A + mov pr.rot= 0 + (p6) br.cond.dpnt .L130 + } + ;; + { .mmi + LDFD f8 = [X], INCX + (p8) LDFD f106 = [YLD1], 1 * SIZE + add AO2 = LDA, A + } + ;; + { .mmi + LDFD f9 = [X], INCX + (p8) LDFD f80 = [AO1], 1 * SIZE + shladd AO4 = LDA, 1, AO2 + } + ;; + { .mmi + LDFD f10 = [X], INCX + (p8) LDFD f81 = [AO2], 1 * SIZE + shladd AO3 = LDA, 1, A + } + ;; + { .mmi + LDFD f11 = [X], INCX + (p8) LDFD f82 = [AO3], 1 * SIZE + } + ;; + { .mfi + (p8) LDFD f83 = [AO4], 1 * SIZE + FMPY f8 = ALPHA, f8 + adds PREB = RPREFETCH * SIZE, YLD1 + } + { .mfi + adds RPRE1 = RPREFETCH * SIZE, AO1 + FMPY f9 = ALPHA, f9 + adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 + } + ;; + FMPY f10 = ALPHA, f10 + shladd A = LDA, 2, A + FMPY f11 = ALPHA, f11 + ;; + { .mfi + adds RPRE3 = RPREFETCH * SIZE, AO3 + (p8) FMA f106 = f8, f80, f106 + mov ar.ec= 2 + } + ;; + adds RPRE4 = (RPREFETCH + 8) * SIZE, AO4 + (p8) FMA f106 = f9, f81, f106 + shr I = MM, 3 + ;; + { .mmf + cmp.eq p6, p0 = 0, I + cmp.eq p16, p0 = r0, r0 + (p8) FMA f106 = f10, f82, f106 + } + ;; + { .mfi + adds I = -1, I + (p8) FMA f106 = f11, f83, f106 + tbit.nz p13, p0 = MM, 2 + } + ;; + { .mib + (p8) STFD [YST1] = f106, 1 * SIZE + mov ar.lc = I + (p6) br.cond.dpnt .L125 + } + ;; + .align 16 + +.L122: + { .mfi + (p17) LDFD f64 = [AO4], 1 * SIZE + (p17) FMA f101 = f8, f33, f101 + (p16) tbit.nz.unc p14, p15 = I, 0 + } + { .mfi + (p16) LDFPD f100, f103 = [YLD1], 2 * SIZE + (p17) FMA f104 = f8, f34, f104 + } + ;; + { .mfi + (p16) LDFPD f32, f33 = [AO1], 2 * SIZE + (p17) FMA f107 = f8, f35, f107 + (p16) adds I = -1, I + } + { .mfi + (p14) PREFETCH [RPRE1], 16 * SIZE + (p17) FMA f110 = f8, f36, f110 + } + ;; + { .mfi + (p16) LDFPD f34, f35 = [AO1], 2 * SIZE + (p17) FMA f113 = f8, f37, f113 + } + { .mfi + (p16) LDFPD f106, f109 = [YLD1], 2 * SIZE + (p17) FMA f116 = f8, f38, f116 + } + ;; + { .mfi + (p16) LDFPD f36, f37 = [AO1], 2 * SIZE + (p17) FMA f119 = f8, f39, f119 + } + { .mfi + (p16) LDFPD f112, f115 = [YLD1], 2 * SIZE + (p17) FMA f122 = f8, f40, f122 + } + ;; + { .mfi + (p16) LDFPD f38, f39 = [AO1], 2 * SIZE + (p17) FMA f101 = f9, f41, f101 + } + { .mmf + (p16) LDFPD f118, f121 = [YLD1], 2 * SIZE + (p16) LDFD f40 = [AO2], 1 * SIZE + (p17) FMA f104 = f9, f42, f104 + } + ;; + { .mmf + (p16) LDFPD f41, f42 = [AO2], 2 * SIZE + (p15) PREFETCH [RPRE2], 16 * SIZE + (p17) FMA f107 = f9, f43, f107 + } + { .mfi + (p18) STFD [YST1] = f16, 1 * SIZE + (p17) FMA f110 = f9, f44, f110 + } + ;; + { .mfi + (p16) LDFPD f43, f44 = [AO2], 2 * SIZE + (p17) FMA f113 = f9, f45, f113 + } + { .mfi + (p18) STFD [YST1] = f17, 1 * SIZE + (p17) FMA f116 = f9, f46, f116 + } + ;; + { .mfi + (p16) LDFPD f45, f46 = [AO2], 2 * SIZE + (p17) FMA f119 = f9, f47, f119 + } + { .mfi + (p18) STFD [YST1] = f18, 1 * SIZE + (p17) FMA f122 = f9, f48, f122 + } + ;; + { .mfi + (p16) LDFD f47 = [AO2], 1 * SIZE + (p17) FMA f101 = f10, f49, f101 + } + { .mfi + (p14) lfetch.excl.nt2 [PREB], 16 * SIZE + (p17) FMA f104 = f10, f50, f104 + } + ;; + { .mfi + (p16) LDFPD f48, f49 = [AO3], 2 * SIZE + (p17) FMA f107 = f10, f51, f107 + } + { .mfi + (p14) PREFETCH 
[RPRE3], 16 * SIZE + (p17) FMA f110 = f10, f52, f110 + } + ;; + { .mfi + (p16) LDFPD f50, f51 = [AO3], 2 * SIZE + (p17) FMA f113 = f10, f53, f113 + } + { .mfi + (p18) STFD [YST1] = f19, 1 * SIZE + (p17) FMA f116 = f10, f54, f116 + } + ;; + { .mfi + (p16) LDFPD f52, f53 = [AO3], 2 * SIZE + (p17) FMA f119 = f10, f55, f119 + } + { .mfi + (p18) STFD [YST1] = f20, 1 * SIZE + (p17) FMA f122 = f10, f56, f122 + } + ;; + { .mfi + (p16) LDFPD f54, f55 = [AO3], 2 * SIZE + (p17) FMA f16 = f11, f57, f101 + } + { .mmf + (p15) PREFETCH [RPRE4], 16 * SIZE + (p16) LDFD f56 = [AO4], 1 * SIZE + (p17) FMA f17 = f11, f58, f104 + } + ;; + { .mfi + (p16) LDFPD f57, f58 = [AO4], 2 * SIZE + (p17) FMA f18 = f11, f59, f107 + } + { .mfi + (p18) STFD [YST1] = f21, 1 * SIZE + (p17) FMA f19 = f11, f60, f110 + } + ;; + { .mfi + (p16) LDFPD f59, f60 = [AO4], 2 * SIZE + (p17) FMA f20 = f11, f61, f113 + } + { .mfi + (p18) STFD [YST1] = f22, 1 * SIZE + (p17) FMA f21 = f11, f62, f116 + } + ;; + { .mfi + (p16) LDFPD f61, f62 = [AO4], 2 * SIZE + (p17) FMA f22 = f11, f63, f119 + } + { .mfb + (p18) STFD [YST1] = f23, 1 * SIZE + (p17) FMA f23 = f11, f64, f122 + br.ctop.sptk.few .L122 + } + ;; + .align 16 + +.L125: + { .mmi + (p13) LDFPD f32, f33 = [AO1], 2 * SIZE + (p13) LDFPD f100, f101 = [YLD1], 2 * SIZE + tbit.nz p14, p0 = MM, 1 + } + { .mmi + (p18) STFD [YST1] = f16, 1 * SIZE + } + ;; + { .mmi + (p13) LDFPD f48, f49 = [AO1], 2 * SIZE + (p13) LDFPD f102, f103 = [YLD1], 2 * SIZE + tbit.nz p15, p0 = MM, 0 + } + { .mmi + (p18) STFD [YST1] = f17, 1 * SIZE + } + ;; + { .mmi + (p14) LDFPD f64, f65 = [AO1], 2 * SIZE + (p14) LDFPD f104, f105 = [YLD1], 2 * SIZE + } + { .mmi + (p18) STFD [YST1] = f18, 1 * SIZE + } + ;; + { .mmi + (p18) STFD [YST1] = f19, 1 * SIZE + (p15) LDFD f80 = [AO1] + } + { .mmi + (p15) LDFD f106 = [YLD1], 1 * SIZE + (p13) LDFD f34 = [AO2], 1 * SIZE + } + ;; + { .mmi + (p13) LDFPD f35, f50 = [AO2], 2 * SIZE + (p13) LDFPD f36, f37 = [AO3], 2 * SIZE + } + { .mmi + (p18) STFD [YST1] = f20, 1 * SIZE + } + ;; + { .mmi + (p13) LDFD f51 = [AO2], 1 * SIZE + (p13) LDFPD f52, f53 = [AO3], 2 * SIZE + } + { .mmi + (p18) STFD [YST1] = f21, 1 * SIZE + } + ;; + { .mmi + (p14) LDFD f66 = [AO2], 1 * SIZE + (p14) LDFPD f68, f69 = [AO3], 2 * SIZE + } + { .mmi + (p18) STFD [YST1] = f22, 1 * SIZE + } + ;; + { .mmf + (p18) STFD [YST1] = f23, 1 * SIZE + (p14) LDFD f67 = [AO2], 1 * SIZE + (p13) FMA f100 = f8, f32, f100 + } + { .mmf + (p15) LDFD f82 = [AO3] + (p13) LDFD f38 = [AO4], 1 * SIZE + (p13) FMA f101 = f8, f33, f101 + } + ;; + ;; + { .mmf + (p13) LDFPD f39, f54 = [AO4], 2 * SIZE + (p15) LDFD f81 = [AO2] + (p13) FMA f102 = f8, f48, f102 + } + { .mfi + (p13) FMA f103 = f8, f49, f103 + } + ;; + { .mfi + (p13) LDFD f55 = [AO4], 1 * SIZE + (p14) FMA f104 = f8, f64, f104 + } + { .mfi + (p14) FMA f105 = f8, f65, f105 + } + ;; + { .mfi + (p14) LDFD f70 = [AO4], 1 * SIZE + (p15) FMA f106 = f8, f80, f106 + } + { .mfi + (p13) FMA f100 = f9, f34, f100 + } + ;; + { .mfi + (p14) LDFD f71 = [AO4], 1 * SIZE + (p13) FMA f101 = f9, f35, f101 + } + { .mfi + (p13) FMA f102 = f9, f50, f102 + } + ;; + (p15) LDFD f83 = [AO4] + (p13) FMA f103 = f9, f51, f103 + (p14) FMA f104 = f9, f66, f104 + (p14) FMA f105 = f9, f67, f105 + (p15) FMA f106 = f9, f81, f106 + ;; + (p13) FMA f100 = f10, f36, f100 + (p13) FMA f101 = f10, f37, f101 + (p13) FMA f102 = f10, f52, f102 + (p13) FMA f103 = f10, f53, f103 + (p14) FMA f104 = f10, f68, f104 + (p14) FMA f105 = f10, f69, f105 + (p15) FMA f106 = f10, f82, f106 + ;; + (p13) FMA f100 = f11, f38, f100 + (p13) FMA f101 = f11, 
f39, f101 + ;; + (p13) FMA f102 = f11, f54, f102 + (p13) STFD [YST1] = f100, 1 * SIZE + (p13) FMA f103 = f11, f55, f103 + ;; + (p13) STFD [YST1] = f101, 1 * SIZE + (p14) FMA f104 = f11, f70, f104 + ;; + (p13) STFD [YST1] = f102, 1 * SIZE + (p14) FMA f105 = f11, f71, f105 + ;; + (p13) STFD [YST1] = f103, 1 * SIZE + (p15) FMA f106 = f11, f83, f106 + ;; + (p14) STFD [YST1] = f104, 1 * SIZE + ;; + (p14) STFD [YST1] = f105, 1 * SIZE + ;; + (p15) STFD [YST1] = f106, 1 * SIZE + ;; + .align 16 + +.L130: + { .mmi + mov YLD1 = YY + mov YST1 = YY + tbit.z p6, p0 = N, 1 + } + ;; + { .mib + mov AO1 = A + mov pr.rot= 0 + (p6) br.cond.dpnt .L140 + } + ;; + { .mmi + LDFD f8 = [X], INCX + (p8) LDFD f106 = [YLD1], 1 * SIZE + add AO2 = LDA, A + } + ;; + { .mmi + LDFD f9 = [X], INCX + (p8) LDFD f80 = [AO1], 1 * SIZE + shladd A = LDA, 1, A + } + ;; + adds PREB = RPREFETCH * SIZE, YLD1 + FMPY f8 = ALPHA, f8 + mov ar.ec= 2 + adds RPRE1 = RPREFETCH * SIZE, AO1 + FMPY f9 = ALPHA, f9 + shr I = MM, 3 + ;; + (p8) LDFD f81 = [AO2], 1 * SIZE + cmp.eq p6, p0 = 0, I + ;; + (p8) FMA f106 = f8, f80, f106 + adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 + tbit.nz p13, p0 = MM, 2 + ;; + (p8) FMA f106 = f9, f81, f106 + cmp.eq p16, p0 = r0, r0 + adds I = -1, I + ;; + { .mib + (p8) STFD [YST1] = f106, 1 * SIZE + mov ar.lc = I + (p6) br.cond.dpnt .L135 + } + ;; + .align 16 + +.L132: + { .mfi + (p17) LDFD f48 = [AO2], 1 * SIZE + (p17) FMA f101 = f8, f33, f101 + (p16) tbit.nz.unc p14, p15 = I, 0 + } + { .mmf + (p16) LDFPD f100, f103 = [YLD1], 2 * SIZE + (p18) STFD [YST1] = f16, 1 * SIZE + (p17) FMA f104 = f8, f34, f104 + } + ;; + { .mfi + (p16) LDFPD f32, f33 = [AO1], 2 * SIZE + (p17) FMA f107 = f8, f35, f107 + adds I = -1, I + } + { .mmf + (p14) PREFETCH [RPRE1], 16 * SIZE + (p18) STFD [YST1] = f17, 1 * SIZE + (p17) FMA f110 = f8, f36, f110 + } + ;; + { .mfi + (p16) LDFPD f34, f35 = [AO1], 2 * SIZE + (p17) FMA f113 = f8, f37, f113 + } + { .mmf + (p16) LDFPD f106, f109 = [YLD1], 2 * SIZE + (p18) STFD [YST1] = f18, 1 * SIZE + (p17) FMA f116 = f8, f38, f116 + } + ;; + { .mfi + (p16) LDFPD f36, f37 = [AO1], 2 * SIZE + (p17) FMA f119 = f8, f39, f119 + } + { .mmf + (p16) LDFPD f112, f115 = [YLD1], 2 * SIZE + (p18) STFD [YST1] = f19, 1 * SIZE + (p17) FMA f122 = f8, f40, f122 + } + ;; + { .mmf + (p16) LDFPD f38, f39 = [AO1], 2 * SIZE + (p16) LDFPD f118, f121 = [YLD1], 2 * SIZE + (p17) FMA f16 = f9, f41, f101 + } + { .mmf + (p18) STFD [YST1] = f20, 1 * SIZE + (p16) LDFD f40 = [AO2], 1 * SIZE + (p17) FMA f17 = f9, f42, f104 + } + ;; + { .mfi + (p16) LDFPD f41, f42 = [AO2], 2 * SIZE + (p17) FMA f18 = f9, f43, f107 + } + { .mmf + (p15) PREFETCH [RPRE2], 16 * SIZE + (p18) STFD [YST1] = f21, 1 * SIZE + (p17) FMA f19 = f9, f44, f110 + } + ;; + { .mfi + (p16) LDFPD f43, f44 = [AO2], 2 * SIZE + (p17) FMA f20 = f9, f45, f113 + } + { .mmf + (p14) PREFETCH [PREB], 16 * SIZE + (p18) STFD [YST1] = f22, 1 * SIZE + (p17) FMA f21 = f9, f46, f116 + } + ;; + { .mfi + (p16) LDFPD f45, f46 = [AO2], 2 * SIZE + (p17) FMA f22 = f9, f47, f119 + } + { .mfb + (p18) STFD [YST1] = f23, 1 * SIZE + (p17) FMA f23 = f9, f48, f122 + br.ctop.sptk.few .L132 + } + ;; + .align 16 + +.L135: + { .mmi + (p13) LDFPD f32, f33 = [AO1], 2 * SIZE + (p13) LDFPD f100, f101 = [YLD1], 2 * SIZE + tbit.nz p14, p0 = MM, 1 + } + { .mmi + (p18) STFD [YST1] = f16, 1 * SIZE + } + ;; + { .mmi + (p13) LDFPD f48, f49 = [AO1], 2 * SIZE + (p13) LDFPD f102, f103 = [YLD1], 2 * SIZE + tbit.nz p15, p0 = MM, 0 + } + { .mmi + (p18) STFD [YST1] = f17, 1 * SIZE + } + ;; + { .mmi + (p14) LDFPD f64, f65 = [AO1], 2 
* SIZE + (p14) LDFPD f104, f105 = [YLD1], 2 * SIZE + } + { .mmi + (p18) STFD [YST1] = f18, 1 * SIZE + } + ;; + { .mmi + (p15) LDFD f80 = [AO1] + (p15) LDFD f106 = [YLD1], 1 * SIZE + } + { .mmi + (p18) STFD [YST1] = f19, 1 * SIZE + } + ;; + { .mmi + (p13) LDFD f34 = [AO2], 1 * SIZE + (p18) STFD [YST1] = f20, 1 * SIZE + } + ;; + { .mmi + (p13) LDFD f35 = [AO2], 1 * SIZE + (p18) STFD [YST1] = f21, 1 * SIZE + } + ;; + { .mmi + (p13) LDFD f50 = [AO2], 1 * SIZE + (p18) STFD [YST1] = f22, 1 * SIZE + } + ;; + { .mmi + (p13) LDFD f51 = [AO2], 1 * SIZE + (p18) STFD [YST1] = f23, 1 * SIZE + } + ;; + (p14) LDFD f66 = [AO2], 1 * SIZE + (p13) FMA f100 = f8, f32, f100 + ;; + (p14) LDFD f67 = [AO2], 1 * SIZE + (p13) FMA f101 = f8, f33, f101 + ;; + (p15) LDFD f81 = [AO2] + (p13) FMA f102 = f8, f48, f102 + (p13) FMA f103 = f8, f49, f103 + (p14) FMA f104 = f8, f64, f104 + (p14) FMA f105 = f8, f65, f105 + (p15) FMA f106 = f8, f80, f106 + ;; + (p13) FMA f100 = f9, f34, f100 + (p13) FMA f101 = f9, f35, f101 + (p13) FMA f102 = f9, f50, f102 + (p13) FMA f103 = f9, f51, f103 + (p14) FMA f104 = f9, f66, f104 + (p14) FMA f105 = f9, f67, f105 + (p15) FMA f106 = f9, f81, f106 + ;; + (p13) STFD [YST1] = f100, 1 * SIZE + ;; + (p13) STFD [YST1] = f101, 1 * SIZE + ;; + (p13) STFD [YST1] = f102, 1 * SIZE + ;; + (p13) STFD [YST1] = f103, 1 * SIZE + ;; + (p14) STFD [YST1] = f104, 1 * SIZE + ;; + (p14) STFD [YST1] = f105, 1 * SIZE + ;; + (p15) STFD [YST1] = f106, 1 * SIZE + ;; + .align 16 + +.L140: + { .mmi + mov YLD1 = YY + mov YST1 = YY + tbit.z p6, p0 = N, 0 + } + ;; + { .mib + mov AO1 = A + mov pr.rot= 0 + (p6) br.cond.dpnt .L990 + } + ;; + { .mmi + LDFD f8 = [X], INCX + (p8) LDFD f106 = [YLD1], 1 * SIZE + adds RPRE1 = RPREFETCH * SIZE, AO1 + } + ;; + { .mmi + (p8) LDFD f80 = [AO1], 1 * SIZE + adds PREB = RPREFETCH * SIZE, YLD1 + } + ;; + FMPY f8 = ALPHA, f8 + shr I = MM, 3 + ;; + (p8) FMA f106 = f8, f80, f106 + mov ar.ec= 3 + ;; + { .mmi + cmp.eq p6, p0 = 0, I + cmp.eq p16, p0 = r0, r0 + tbit.nz p14, p15 = r0, 0 + } + ;; + { .mmi + adds YST2 = 4 * SIZE, YST1 + adds I = -1, I + tbit.nz p13, p0 = MM, 2 + } + ;; + { .mmi + (p8) STFD [YST1] = f106, 1 * SIZE + (p8) adds YST2 = 1 * SIZE, YST2 + } + { .mib + mov ar.lc = I + (p6) br.cond.dpnt .L145 + } + ;; + .align 16 + +.L142: + { .mmf + (p19) STFD [YST1] = f16, 1 * SIZE + (p19) STFD [YST2] = f20, 1 * SIZE + (p18) FMA f16 = f8, f34, f102 + } + { .mmf + (p16) LDFPD f32, f35 = [AO1], 2 * SIZE + (p16) LDFPD f100, f103 = [YLD1], 2 * SIZE + (p18) FMA f20 = f8, f46, f114 + } + ;; + { .mmf + (p19) STFD [YST1] = f17, 1 * SIZE + (p19) STFD [YST2] = f21, 1 * SIZE + (p18) FMA f17 = f8, f37, f105 + } + { .mmf + (p16) LDFPD f38, f41 = [AO1], 2 * SIZE + (p16) LDFPD f106, f109 = [YLD1], 2 * SIZE + (p18) FMA f21 = f8, f49, f117 + } + ;; + { .mmf + (p19) STFD [YST1] = f18, 1 * SIZE + (p19) STFD [YST2] = f22, 1 * SIZE + (p18) FMA f18 = f8, f40, f108 + } + { .mmf + (p16) LDFPD f44, f47 = [AO1], 2 * SIZE + (p16) LDFPD f112, f115 = [YLD1], 2 * SIZE + (p18) FMA f22 = f8, f52, f120 + } + ;; + { .mmf + (p19) STFD [YST1] = f19, 5 * SIZE + (p19) STFD [YST2] = f23, 5 * SIZE + (p18) FMA f19 = f8, f43, f111 + } + { .mmf + (p16) LDFPD f50, f53 = [AO1], 2 * SIZE + (p16) LDFPD f118, f121 = [YLD1], 2 * SIZE + (p18) FMA f23 = f8, f55, f123 + } + ;; + { .mmi + (p14) PREFETCH [RPRE1], 16 * SIZE + (p14) PREFETCH [PREB], 16 * SIZE + (p16) tbit.nz.unc p14, p15 = I, 0 + } + { .mib + nop __LINE__ + (p16) adds I = -1, I + br.ctop.sptk.few .L142 + } + ;; + .align 16 + +.L145: + { .mmi + (p19) STFD [YST1] = f16, 1 * 
SIZE + (p19) STFD [YST2] = f20, 1 * SIZE + tbit.nz p14, p0 = MM, 1 + } + { .mmi + (p13) LDFPD f32, f33 = [AO1], 2 * SIZE + (p13) LDFPD f100, f101 = [YLD1], 2 * SIZE + } + ;; + { .mmi + (p19) STFD [YST1] = f17, 1 * SIZE + (p19) STFD [YST2] = f21, 1 * SIZE + tbit.nz p15, p0 = MM, 0 + } + { .mmi + (p13) LDFPD f48, f49 = [AO1], 2 * SIZE + (p13) LDFPD f102, f103 = [YLD1], 2 * SIZE + } + ;; + { .mmi + (p19) STFD [YST1] = f18, 1 * SIZE + (p19) STFD [YST2] = f22, 1 * SIZE + } + { .mmi + (p14) LDFPD f64, f65 = [AO1], 2 * SIZE + (p14) LDFPD f104, f105 = [YLD1], 2 * SIZE + } + ;; + { .mmi + (p19) STFD [YST1] = f19, 5 * SIZE + (p19) STFD [YST2] = f23, 5 * SIZE + } + { .mmi + (p15) LDFD f80 = [AO1] + (p15) LDFD f106 = [YLD1], 1 * SIZE + } + ;; + (p13) FMA f100 = f8, f32, f100 + (p13) FMA f101 = f8, f33, f101 + (p13) FMA f102 = f8, f48, f102 + (p13) FMA f103 = f8, f49, f103 + (p14) FMA f104 = f8, f64, f104 + (p14) FMA f105 = f8, f65, f105 + (p15) FMA f106 = f8, f80, f106 + ;; + (p13) STFD [YST1] = f100, 1 * SIZE + ;; + (p13) STFD [YST1] = f101, 1 * SIZE + ;; + (p13) STFD [YST1] = f102, 1 * SIZE + ;; + (p13) STFD [YST1] = f103, 1 * SIZE + ;; + (p14) STFD [YST1] = f104, 1 * SIZE + ;; + (p14) STFD [YST1] = f105, 1 * SIZE + ;; + (p15) STFD [YST1] = f106, 1 * SIZE + ;; + .align 16 + +.L990: + { .mmi + mov YLD1 = YY + mov YST1 = Y + mov pr.rot= 0 + } + { .mib + mov YST2 = Y + shr J = M, 3 + (p10) br.cond.dptk .L999 + } + ;; + { .mmi + cmp.eq p6, p0 = r0, J + adds J = -1, J + mov ar.ec = 4 + } + { .mmi + cmp.eq p16, p0 = r0, r0 + nop __LINE__ + tbit.nz p13, p0 = M, 2 + } + ;; + { .mib + nop __LINE__ + mov ar.lc = J + (p6) br.cond.dpnt .L995 + } + ;; +.L992: + { .mfi + (p19) STFD [YST2] = f35 + (p18) FADD f34 = f34, f66 + (p19) add YST2 = YST2, INCY + } + { .mmi + (p16) LDFD f64 = [YLD1], 1 * SIZE + (p16) LDFD f32 = [YST1], INCY + } + ;; + { .mfi + (p19) STFD [YST2] = f39 + (p18) FADD f38 = f38, f70 + (p19) add YST2 = YST2, INCY + } + { .mmi + (p16) LDFD f36 = [YST1], INCY + (p16) LDFD f68 = [YLD1], 1 * SIZE + } + ;; + { .mfi + (p19) STFD [YST2] = f43 + (p18) FADD f42 = f42, f74 + (p19) add YST2 = YST2, INCY + } + { .mmi + (p16) LDFD f72 = [YLD1], 1 * SIZE + (p16) LDFD f40 = [YST1], INCY + } + ;; + { .mfi + (p19) STFD [YST2] = f47 + (p18) FADD f46 = f46, f78 + (p19) add YST2 = YST2, INCY + } + { .mmi + (p16) LDFD f76 = [YLD1], 1 * SIZE + (p16) LDFD f44 = [YST1], INCY + } + ;; + { .mfi + (p19) STFD [YST2] = f51 + (p18) FADD f50 = f50, f82 + (p19) add YST2 = YST2, INCY + } + { .mmi + (p16) LDFD f80 = [YLD1], 1 * SIZE + (p16) LDFD f48 = [YST1], INCY + } + ;; + { .mfi + (p19) STFD [YST2] = f55 + (p18) FADD f54 = f54, f86 + (p19) add YST2 = YST2, INCY + } + { .mmi + (p16) LDFD f84 = [YLD1], 1 * SIZE + (p16) LDFD f52 = [YST1], INCY + } + ;; + { .mfi + (p19) STFD [YST2] = f59 + (p18) FADD f58 = f58, f90 + (p19) add YST2 = YST2, INCY + } + { .mmi + (p16) LDFD f88 = [YLD1], 1 * SIZE + (p16) LDFD f56 = [YST1], INCY + } + ;; + { .mfi + (p19) STFD [YST2] = f63 + (p18) FADD f62 = f62, f94 + (p19) add YST2 = YST2, INCY + } + { .mmb + (p16) LDFD f92 = [YLD1], 1 * SIZE + (p16) LDFD f60 = [YST1], INCY + br.ctop.sptk.few .L992 + } + ;; + +.L995: + (p13) LDFD f32 = [YST1], INCY + (p13) LDFD f40 = [YLD1], 1 * SIZE + tbit.nz p14, p0 = M, 1 + ;; + (p13) LDFD f33 = [YST1], INCY + (p13) LDFD f41 = [YLD1], 1 * SIZE + tbit.nz p15, p0 = M, 0 + ;; + (p13) LDFD f34 = [YST1], INCY + (p13) LDFD f42 = [YLD1], 1 * SIZE + ;; + (p13) LDFD f35 = [YST1], INCY + (p13) LDFD f43 = [YLD1], 1 * SIZE + ;; + (p14) LDFD f36 = [YST1], INCY + (p14) LDFD f44 
= [YLD1], 1 * SIZE + ;; + (p14) LDFD f37 = [YST1], INCY + (p14) LDFD f45 = [YLD1], 1 * SIZE + ;; + (p15) LDFD f38 = [YST1], INCY + (p15) LDFD f46 = [YLD1], 1 * SIZE + ;; + (p13) FADD f32 = f32, f40 + (p13) FADD f33 = f33, f41 + (p13) FADD f34 = f34, f42 + (p13) FADD f35 = f35, f43 + (p14) FADD f36 = f36, f44 + (p14) FADD f37 = f37, f45 + (p15) FADD f38 = f38, f46 + ;; + (p13) STFD [YST2] = f32 + (p13) add YST2 = YST2, INCY + ;; + (p13) STFD [YST2] = f33 + (p13) add YST2 = YST2, INCY + ;; + (p13) STFD [YST2] = f34 + (p13) add YST2 = YST2, INCY + ;; + (p13) STFD [YST2] = f35 + (p13) add YST2 = YST2, INCY + ;; + (p14) STFD [YST2] = f36 + (p14) add YST2 = YST2, INCY + ;; + (p14) STFD [YST2] = f37 + (p14) add YST2 = YST2, INCY + ;; + (p15) STFD [YST2] = f38 + ;; + +.L999: + mov r8 = r0 + adds r9 = 1 * 16, SP + ;; + ldf.fill f16 = [SP], 32 + ldf.fill f17 = [r9], 32 + mov ar.lc = ARLC + ;; + ldf.fill f18 = [SP], 32 + ldf.fill f19 = [r9], 32 + mov pr = PR, -1 + ;; + ldf.fill f20 = [SP], 32 + ldf.fill f21 = [r9], 32 + mov ar.pfs = ARPFS + ;; + ldf.fill f22 = [SP], 32 + ldf.fill f23 = [r9] + br.ret.sptk.many b0 + ;; + EPILOGUE diff --git a/kernel/ia64/staticbuffer.S b/kernel/ia64/staticbuffer.S new file mode 100644 index 0000000000..a30bb74c76 --- /dev/null +++ b/kernel/ia64/staticbuffer.S @@ -0,0 +1,45 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef ALLOC_STATIC + .align 1024 + .comm alloc_area, (NUM_BUFFERS * BUFFER_SIZE), 4096 +#endif diff --git a/kernel/ia64/swap.S b/kernel/ia64/swap.S new file mode 100644 index 0000000000..585f418282 --- /dev/null +++ b/kernel/ia64/swap.S @@ -0,0 +1,577 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef XDOUBLE +#define PREFETCH_SIZE ( 8 * 16) +#elif defined(DOUBLE) +#define PREFETCH_SIZE (16 * 16) +#else +#define PREFETCH_SIZE (32 * 16) +#endif + +#define SP r12 + +#ifndef XDOUBLE +#define N r32 +#define X1 r36 +#define INCX r37 +#define Y1 r38 +#define INCY r39 +#else +#define N r32 +#define X1 r38 +#define INCX r39 +#define Y1 r33 +#define INCY r34 +#endif + +#define PRE1 r2 +#define PRE2 r3 + +#define I r14 +#define J r15 + +#define X2 r16 +#define Y2 r17 +#define X3 r18 +#define Y3 r19 +#define X4 r20 +#define Y4 r21 + +#define YY r22 +#define XX r23 +#define INCX5 r24 +#define INCY5 r25 +#define INCX16 r26 +#define INCY16 r27 +#define XYSUB r28 + +#define PR r30 +#define ARLC r31 + + PROLOGUE + .prologue + PROFCODE + +#ifdef XDOUBLE + adds r8 = 16, SP + adds r9 = 24, SP + ;; + ld8 Y1 = [r8] + ld8 INCY = [r9] + ;; +#endif + { .mmi + shladd INCX = INCX, BASE_SHIFT, r0 + shladd INCY = INCY, BASE_SHIFT, r0 + .save ar.lc, ARLC + mov ARLC = ar.lc + } + { .mib + cmp.lt p0, p6 = r0, N + tbit.z p0, p8 = Y1, BASE_SHIFT + (p6) br.ret.sptk.many b0 + } + ;; + .body + { .mmi + shladd INCX16 = INCX, 4, r0 + shladd INCY16 = INCY, 4, r0 + mov PR = pr + } + { .mmi + sub XYSUB = X1, Y1 + mov X3 = X1 + shr I = N, 4 + } + ;; + { .mmi + shladd INCX5 = INCX, 2, INCX + shladd INCY5 = INCY, 2, INCY + mov pr.rot= 0 + } + { .mmi + adds I = -1, I + and J = 15, N + extr XYSUB = XYSUB, BASE_SHIFT, 6 + } + ;; + { .mmi + shladd X2 = INCX, 2, X1 + shladd Y2 = INCY, 2, Y1 + mov ar.lc = I + } + { .mmi + shladd X4 = INCX, 2, X1 + shladd Y4 = INCY, 2, Y1 + cmp.eq p16, p0 = r0, r0 + } + ;; + { .mmi + shladd PRE2 = XYSUB, BASE_SHIFT, Y1 + cmp.lt p8 ,p0 = 28, XYSUB + mov Y3 = Y1 + } + ;; + { .mmi + adds PRE1 = (PREFETCH_SIZE + 4) * SIZE, X1 + adds PRE2 = (PREFETCH_SIZE - 12) * SIZE, PRE2 + mov ar.ec= 2 + } + { .mib + cmp.eq p9 ,p0 = -1, I + tbit.z p0, p12 = N, 3 + (p9) br.cond.dpnt .L15 + } + ;; + .align 16 + +.L12: + { .mmi + (p18) STFD [Y3] = f56 + (p18) STFD [Y4] = f64 + (p18) add Y3 = Y3, INCY5 + } + { .mmi + (p16) LDFD f32 = [X1], INCX + (p16) LDFD f40 = [X2], INCX + (p18) add Y4 = Y4, INCY5 + } + ;; + { .mmi + (p17) STFD [X3] = f65 + (p17) STFD [X4] = f73 + (p17) add X3 = X3, INCX + } + { .mmi + (p16) LDFD f64 = [Y1], INCY + (p16) LDFD f72 = [Y2], INCY + (p17) add X4 = X4, INCX + } + ;; + { .mmi + (p17) STFD [Y3] = f33 + (p17) STFD [Y4] = f41 + (p17) add Y3 = Y3, INCY + } + { .mmi + (p16) LDFD f34 = [X1], INCX + (p16) LDFD f42 = [X2], INCX + (p17) add Y4 = Y4, INCY + } + ;; + { .mmi + (p17) STFD [X3] = f67 + (p17) STFD [X4] = f75 + (p17) add X3 = X3, INCX + } + { .mmi + (p16) LDFD f66 = [Y1], INCY + (p16) LDFD f74 = [Y2], INCY + (p17) add X4 = X4, INCX + } + ;; + { .mmi + (p17) STFD [Y3] = f35 + (p17) STFD [Y4] = f43 + (p17) add Y3 = Y3, INCY + } + { .mmi + (p16) LDFD f36 = [X1], INCX + (p16) LDFD f44 = [X2], INCX + (p17) add Y4 = Y4, INCY + } + ;; + { .mmi + (p17) STFD [X3] = f69 + (p17) STFD [X4] = f77 + (p17) add X3 = X3, INCX + } + { .mmi + (p16) LDFD f68 = [Y1], INCY + (p16) LDFD f76 = [Y2], INCY + (p17) add X4 = X4, INCX + } + ;; + { .mmi + (p17) STFD [Y3] = f37 + (p17) STFD [Y4] = f45 + (p17) add Y3 = Y3, INCY + } + { .mmi + (p16) LDFD f38 = [X1], INCX5 + (p16) LDFD f46 = [X2], INCX5 + (p17) add Y4 = Y4, INCY + } + ;; + { .mmi + (p17) STFD [X3] = f71 + (p17) STFD [X4] = f79 + (p17) add X3 = X3, INCX5 + } + { .mmi + (p16) LDFD f70 = [Y1], INCY5 + (p16) LDFD f78 = [Y2], INCY5 + (p17) 
add X4 = X4, INCX5 + } + ;; + { .mmi + (p17) STFD [Y3] = f39 + (p17) STFD [Y4] = f47 + (p17) add Y3 = Y3, INCY5 + } + { .mmi + (p16) LDFD f48 = [X1], INCX + (p16) LDFD f56 = [X2], INCX + (p17) add Y4 = Y4, INCY5 + } + ;; + { .mmi + (p17) STFD [X3] = f81 + (p17) STFD [X4] = f89 + (p17) add X3 = X3, INCX + } + { .mmi + (p16) LDFD f80 = [Y1], INCY + (p16) LDFD f88 = [Y2], INCY + (p17) add X4 = X4, INCX + } + ;; + { .mmi + (p17) STFD [Y3] = f49 + (p17) STFD [Y4] = f57 + (p17) add Y3 = Y3, INCY + } + { .mmi + (p16) LDFD f50 = [X1], INCX + (p16) LDFD f58 = [X2], INCX + (p17) add Y4 = Y4, INCY + } + ;; + { .mmi + (p17) STFD [X3] = f83 + (p17) STFD [X4] = f91 + (p17) add X3 = X3, INCX + } + { .mmi + (p16) LDFD f82 = [Y1], INCY + (p16) LDFD f90 = [Y2], INCY + (p17) add X4 = X4, INCX + } + ;; + { .mmi + (p17) STFD [Y3] = f51 + (p17) STFD [Y4] = f59 + (p17) add Y3 = Y3, INCY + } + { .mmi + (p16) LDFD f52 = [X1], INCX + (p16) LDFD f60 = [X2], INCX + (p17) add Y4 = Y4, INCY + } + ;; + { .mmi + (p17) STFD [X3] = f85 + (p17) STFD [X4] = f93 + (p17) add X3 = X3, INCX + } + { .mmi + (p16) LDFD f84 = [Y1], INCY + (p16) LDFD f92 = [Y2], INCY + (p17) add X4 = X4, INCX + } + ;; + { .mmi + (p16) lfetch.nt1 [PRE1] + (p16) lfetch.nt1 [PRE2] + (p16) shladd PRE1 = INCX, 4, PRE1 + } + { .mmi + (p16) LDFD f54 = [X1], INCX5 + (p16) LDFD f62 = [X2], INCX5 + (p16) shladd PRE2 = INCX, 4, PRE2 + } + ;; + { .mmi + (p17) STFD [Y3] = f53 + (p17) STFD [Y4] = f61 + (p17) add Y3 = Y3, INCY + } + { .mmi + (p16) LDFD f86 = [Y1], INCY5 + (p16) LDFD f94 = [Y2], INCY5 + (p17) add Y4 = Y4, INCY + } + ;; + { .mmi + (p17) STFD [X3] = f87 + (p17) STFD [X4] = f95 + (p17) add X3 = X3, INCX5 + } + { .mib + nop __LINE__ + (p17) add X4 = X4, INCX5 + br.ctop.sptk.few .L12 + } + ;; +.L15: + { .mmi + (p18) STFD [Y3] = f56 + (p18) STFD [Y4] = f64 + mov ar.lc = ARLC + } + { .mmi + (p12) LDFD f32 = [X1], INCX + (p12) LDFD f36 = [X2], INCX + cmp.eq p10, p0 = r0, J + } + ;; + { .mmi + (p12) LDFD f80 = [Y1], INCY + (p12) LDFD f84 = [Y2], INCY + (p18) add Y3 = Y3, INCY5 + } + { .mmi + (p12) LDFD f33 = [X1], INCX + (p12) LDFD f37 = [X2], INCX + (p18) add Y4 = Y4, INCY5 + } + ;; + { .mmi + (p12) LDFD f81 = [Y1], INCY + (p12) LDFD f85 = [Y2], INCY + mov pr = PR, -65474 + } + { .mmb + (p12) LDFD f34 = [X1], INCX + (p12) LDFD f38 = [X2], INCX + (p10) br.ret.sptk.many b0 + } + ;; + { .mmi + (p12) LDFD f82 = [Y1], INCY + (p12) LDFD f86 = [Y2], INCY + tbit.z p0, p13 = N, 2 + } + { .mmi + (p12) LDFD f35 = [X1], INCX5 + (p12) LDFD f39 = [X2], INCX5 + tbit.z p0, p14 = N, 1 + } + ;; + { .mmi + (p12) LDFD f83 = [Y1], INCY5 + (p12) LDFD f87 = [Y2], INCY5 + tbit.z p0, p15 = N, 0 + } + ;; + { .mmi + (p13) LDFD f40 = [X1], INCX + (p13) LDFD f88 = [Y1], INCY + } + ;; + { .mmi + (p13) LDFD f41 = [X1], INCX + (p13) LDFD f89 = [Y1], INCY + } + ;; + { .mmi + (p12) STFD [Y3] = f32 + (p12) STFD [Y4] = f36 + (p12) add Y3 = Y3, INCY + } + { .mmi + (p13) LDFD f42 = [X1], INCX + (p13) LDFD f90 = [Y1], INCY + (p12) add Y4 = Y4, INCY + } + ;; + { .mmi + (p12) STFD [X3] = f80 + (p12) STFD [X4] = f84 + (p12) add X3 = X3, INCX + } + { .mmi + (p13) LDFD f43 = [X1], INCX + (p13) LDFD f91 = [Y1], INCY + (p12) add X4 = X4, INCX + } + ;; + { .mmi + (p12) STFD [Y3] = f33 + (p12) STFD [Y4] = f37 + (p12) add Y3 = Y3, INCY + } + { .mmi + (p14) LDFD f44 = [X1], INCX + (p14) LDFD f92 = [Y1], INCY + (p12) add Y4 = Y4, INCY + } + ;; + { .mmi + (p12) STFD [X3] = f81 + (p12) STFD [X4] = f85 + (p12) add X3 = X3, INCX + } + { .mmi + (p14) LDFD f45 = [X1], INCX + (p14) LDFD f93 = [Y1], INCY + (p12) 
add X4 = X4, INCX + } + ;; + { .mmi + (p12) STFD [X3] = f82 + (p12) STFD [X4] = f86 + (p12) add X3 = X3, INCX + } + { .mmi + (p15) LDFD f46 = [X1], INCX + (p15) LDFD f94 = [Y1], INCY + (p12) add X4 = X4, INCX + } + ;; + { .mmi + (p12) STFD [Y3] = f34 + (p12) STFD [Y4] = f38 + (p12) add Y3 = Y3, INCY + } + { .mmi + nop __LINE__ + nop __LINE__ + (p12) add Y4 = Y4, INCY + } + ;; + { .mmi + (p12) STFD [X3] = f83 + (p12) STFD [X4] = f87 + (p12) add X3 = X3, INCX5 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p12) add X4 = X4, INCX5 + } + ;; + { .mmi + (p12) STFD [Y3] = f35 + (p12) STFD [Y4] = f39 + (p12) add Y3 = Y3, INCY5 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p12) add Y4 = Y4, INCY5 + } + ;; + { .mmi + (p13) STFD [X3] = f88 + (p13) STFD [Y3] = f40 + (p13) add X3 = X3, INCX + } + { .mmi + nop __LINE__ + nop __LINE__ + (p13) add Y3 = Y3, INCY + } + ;; + { .mmi + (p13) STFD [X3] = f89 + (p13) STFD [Y3] = f41 + (p13) add X3 = X3, INCX + } + { .mmi + nop __LINE__ + nop __LINE__ + (p13) add Y3 = Y3, INCY + } + ;; + { .mmi + (p13) STFD [X3] = f90 + (p13) STFD [Y3] = f42 + (p13) add X3 = X3, INCX + } + { .mmi + nop __LINE__ + nop __LINE__ + (p13) add Y3 = Y3, INCY + } + ;; + { .mmi + (p13) STFD [X3] = f91 + (p13) STFD [Y3] = f43 + (p13) add X3 = X3, INCX + } + { .mmi + nop __LINE__ + nop __LINE__ + (p13) add Y3 = Y3, INCY + } + ;; + { .mmi + (p14) STFD [X3] = f92 + (p14) STFD [Y3] = f44 + (p14) add X3 = X3, INCX + } + { .mmi + nop __LINE__ + nop __LINE__ + (p14) add Y3 = Y3, INCY + } + ;; + { .mmi + (p14) STFD [X3] = f93 + (p14) STFD [Y3] = f45 + (p14) add X3 = X3, INCX + } + { .mmi + nop __LINE__ + nop __LINE__ + (p14) add Y3 = Y3, INCY + } + ;; + { .mmb + (p15) STFD [X3] = f94 + (p15) STFD [Y3] = f46 + br.ret.sptk.many b0 + } + ;; + EPILOGUE + diff --git a/kernel/ia64/symv_U.S b/kernel/ia64/symv_U.S new file mode 100644 index 0000000000..4f6c451433 --- /dev/null +++ b/kernel/ia64/symv_U.S @@ -0,0 +1,463 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define SP r12 + +#define M r32 +#define A r34 +#define LDA r35 +#define X r36 +#define INCX r37 +#define Y r38 +#define INCY r39 +#define BUFFER r33 + +#define I r14 +#define IS r15 +#define A1 r16 +#define A2 r17 +#define A3 r18 +#define A4 r19 + +#define NEW_X r20 +#define NEW_Y r21 +#define XX r22 +#define YY r23 +#define TEMP r24 +#define YYS r25 + +#define PREA1 loc0 +#define PREA2 loc1 +#define PREA3 loc2 +#define PREA4 loc3 + +#define A11 loc4 +#define A21 loc5 +#define A31 loc6 +#define A41 loc7 + +#define PREX r8 +#define PREY r9 + +#define ARLC r29 +#define PR r30 +#define ARPFS r31 + +#ifdef DOUBLE +#define RPREFETCH (16 * 3 + 4) +#else +#define RPREFETCH (16 * 3 + 16) +#endif +#define PREFETCH lfetch.nt1 +#define PREFETCHW lfetch.excl.nt1 + +#define alpha f8 +#define atemp1 f6 +#define atemp2 f7 +#define atemp3 f10 +#define atemp4 f11 + +#define xsum1 f12 +#define xsum2 f13 +#define xsum3 f14 +#define xsum4 f15 + + + PROLOGUE + .prologue + PROFCODE + { .mmi + .save ar.pfs, ARPFS + alloc ARPFS = ar.pfs, 8, 16, 8, 0 + mov ARLC = ar.lc + } + ;; + mov PR = pr + adds r14 = 16, SP + ;; + adds r8 = -8 * 16, SP + adds r9 = -7 * 16, SP + adds SP = -8 * 16, SP + ;; + stf.spill [r8] = f16, 32 + stf.spill [r9] = f17, 32 + ;; + stf.spill [r8] = f18, 32 + stf.spill [r9] = f19, 32 + ;; + stf.spill [r8] = f20, 32 + stf.spill [r9] = f21, 32 + ;; + stf.spill [r8] = f22 + stf.spill [r9] = f23 + .body + ;; + ld8 BUFFER = [r14] + ;; + shladd LDA = LDA, BASE_SHIFT, r0 + shladd INCX = INCX, BASE_SHIFT, r0 + shladd INCY = INCY, BASE_SHIFT, r0 + ;; + cmp.ge p7, p0 = 0, M + ;; + (p7) br.cond.dpnt .L999 + ;; + mov NEW_X = X + cmp.eq p10, p0 = SIZE, INCX + (p10) br.cond.dptk .L10 + ;; +.L10: + mov NEW_Y = Y + cmp.eq p10, p0 = SIZE, INCY + (p10) br.cond.dptk .L20 + ;; + +.L20: + mov IS = 0 + cmp.gt p10, p0 = 4, M + (p10) br.cond.dpnt .L30 + ;; +.L21: + mov A1 = A + add A2 = LDA, A + ;; + shladd A3 = LDA, 1, A + shladd A4 = LDA, 1, A2 + shladd A = LDA, 2, A + ;; + ;; + adds PREX = RPREFETCH * SIZE, NEW_X + adds PREY = RPREFETCH * SIZE, NEW_Y + adds PREA1 = RPREFETCH * SIZE, A1 + adds PREA2 = RPREFETCH * SIZE, A2 + adds PREA3 = RPREFETCH * SIZE, A3 + adds PREA4 = RPREFETCH * SIZE, A4 + ;; + shladd TEMP = IS, BASE_SHIFT, NEW_X + ;; + LDFD atemp1 = [TEMP], 1 * SIZE + ;; + LDFD atemp2 = [TEMP], 1 * SIZE + ;; + LDFD atemp3 = [TEMP], 1 * SIZE + ;; + LDFD atemp4 = [TEMP], 1 * SIZE + ;; + FMPY atemp1 = alpha, atemp1 + FMPY atemp2 = alpha, atemp2 + FMPY atemp3 = alpha, atemp3 + FMPY atemp4 = alpha, atemp4 + ;; + mov xsum1 = f0 + mov xsum2 = f0 + mov xsum3 = f0 + mov xsum4 = f0 + ;; + mov XX = NEW_X + mov YY = NEW_Y + mov YYS = NEW_Y + ;; + shr I = IS, 2 + mov pr.rot = 0 + ;; + mov ar.ec = 3 + cmp.eq p16, p0 = r0, r0 + ;; + cmp.eq p6, p0 = 0, I + adds I = -1, I + ;; + mov ar.lc = I + (p6) br.cond.dpnt .L28 + ;; + .align 16 + +.L22: + { .mmf + (p16) LDFPD f32, f35 = [A1], 2 * SIZE + (p19) STFD [YYS] = f95, 1 * SIZE + (p18) FMA xsum1 = f82, f34, xsum1 + } + { .mmf + (p18) FMA f94 = atemp1, f34, f94 + } + ;; + { .mmf + (p17) LDFD f90 = [XX], 1 * SIZE + (p18) FMA xsum2 = f82, f46, xsum2 + } + { .mmf + (p18) FMA f98 = atemp1, f37, f98 + } + ;; + 
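(Within this .L22 loop every stored element of the upper triangle is used twice, which is the point of the SYMV kernel: a(i, js+k) updates y[i] through the alpha-scaled values atemp1..atemp4, and it also feeds the running sums xsum1..xsum4 that later update y[js..js+3] (those sums are multiplied by alpha once, at .L28). A rough C sketch of one four-column panel; a, lda, y, nrows and js are illustrative assumptions, not kernel symbols:

    /* One 4-column panel of SYMV-U: rows 0..nrows-1 lie strictly above the
       diagonal block, so each stored a(i, js+k) is used in both directions.
       xsum is left unscaled here and multiplied by alpha afterwards,
       matching the FMPY at .L28.                                        */
    static void symv_u_panel4(long nrows, long js, const double *a, long lda,
                              double alpha, const double *x, double *y,
                              double xsum[4])
    {
        double ax[4];
        for (int k = 0; k < 4; k++)
            ax[k] = alpha * x[js + k];              /* atemp1..atemp4 */

        for (long i = 0; i < nrows; i++) {
            for (int k = 0; k < 4; k++) {
                double aik = a[i + (js + k) * lda]; /* stored upper element */
                y[i]    += ax[k] * aik;             /* column-direction update */
                xsum[k] += x[i]  * aik;             /* symmetric row-direction sum */
            }
        }
    })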
{ .mmf + (p16) LDFPD f44, f47 = [A2], 2 * SIZE + (p19) STFD [YYS] = f99, 1 * SIZE + (p18) FMA xsum3 = f82, f58, xsum3 + } + { .mmf + (p18) FMA f102 = atemp1, f40, f102 + } + ;; + { .mmf + (p16) PREFETCHW [PREY], 4 * SIZE + (p16) LDFD f92 = [YY], 1 * SIZE + (p18) FMA xsum4 = f82, f70, xsum4 + } + { .mmf + (p18) FMA f106 = atemp1, f43, f106 + } + ;; + { .mmf + (p16) LDFPD f56, f59 = [A3], 2 * SIZE + (p19) STFD [YYS] = f103, 1 * SIZE + (p18) FMA xsum1 = f85, f37, xsum1 + } + { .mmf + (p18) FMA f94 = atemp2, f46, f94 + } + ;; + { .mmf + (p16) LDFD f96 = [YY], 1 * SIZE + (p18) FMA xsum2 = f85, f49, xsum2 + } + { .mmf + (p18) FMA f98 = atemp2, f49, f98 + } + ;; + { .mmf + (p16) LDFPD f68, f71 = [A4], 2 * SIZE + (p19) STFD [YYS] = f107, 1 * SIZE + (p18) FMA xsum3 = f85, f61, xsum3 + } + { .mmf + (p18) FMA f102 = atemp2, f52, f102 + } + ;; + { .mmf + (p16) LDFD f100 = [YY], 1 * SIZE + (p18) FMA xsum4 = f85, f73, xsum4 + } + { .mmf + (p18) FMA f106 = atemp2, f55, f106 + } + ;; + { .mmf + (p16) PREFETCH [PREA1], 4 * SIZE + (p16) LDFPD f38, f41 = [A1], 2 * SIZE + (p18) FMA xsum1 = f88, f40, xsum1 + } + { .mmf + (p18) FMA f94 = atemp3, f58, f94 + } + ;; + { .mmf + (p16) LDFD f104 = [YY], 1 * SIZE + (p18) FMA xsum2 = f88, f52, xsum2 + } + { .mmf + (p18) FMA f98 = atemp3, f61, f98 + } + ;; + { .mmf + (p16) PREFETCH [PREA2], 4 * SIZE + (p16) LDFPD f50, f53 = [A2], 2 * SIZE + (p18) FMA xsum3 = f88, f64, xsum3 + } + { .mmf + (p18) FMA f102 = atemp3, f64, f102 + } + ;; + { .mmf + (p16) PREFETCH [PREX], 4 * SIZE + (p16) LDFD f80 = [XX], 1 * SIZE + (p18) FMA xsum4 = f88, f76, xsum4 + } + { .mmf + (p18) FMA f106 = atemp3, f67, f106 + } + ;; + { .mmf + (p16) PREFETCH [PREA3], 4 * SIZE + (p16) LDFPD f62, f65 = [A3], 2 * SIZE + (p18) FMA xsum1 = f91, f43, xsum1 + } + { .mmf + (p18) FMA f94 = atemp4, f70, f94 + } + ;; + { .mmf + (p16) LDFD f83 = [XX], 1 * SIZE + (p18) FMA xsum2 = f91, f55, xsum2 + } + { .mmf + (p18) FMA f98 = atemp4, f73, f98 + } + ;; + { .mmf + (p16) PREFETCH [PREA4], 4 * SIZE + (p16) LDFPD f74, f77 = [A4], 2 * SIZE + (p18) FMA xsum3 = f91, f67, xsum3 + } + { .mmf + (p18) FMA f102 = atemp4, f76, f102 + } + ;; + { .mmf + (p16) LDFD f86 = [XX], 1 * SIZE + (p18) FMA xsum4 = f91, f79, xsum4 + } + { .mfb + (p18) FMA f106 = atemp4, f79, f106 + br.ctop.sptk.few .L22 + } + ;; + (p19) STFD [YYS] = f95, 1 * SIZE + ;; + (p19) STFD [YYS] = f99, 1 * SIZE + ;; + (p19) STFD [YYS] = f103, 1 * SIZE + ;; + (p19) STFD [YYS] = f107, 1 * SIZE + ;; + ;; + .align 16 + +.L28: + FMPY xsum1 = alpha, xsum1 + FMPY xsum2 = alpha, xsum2 + FMPY xsum3 = alpha, xsum3 + FMPY xsum4 = alpha, xsum4 + ;; + LDFD f64 = [A1], 1 * SIZE + LDFD f65 = [A2], 1 * SIZE + LDFD f66 = [A3], 1 * SIZE + LDFD f67 = [A4], 1 * SIZE + ;; + LDFD f68 = [A1], 1 * SIZE + LDFD f69 = [A2], 1 * SIZE + LDFD f70 = [A3], 1 * SIZE + LDFD f71 = [A4], 1 * SIZE + ;; + LDFD f72 = [A1], 1 * SIZE + LDFD f73 = [A2], 1 * SIZE + LDFD f74 = [A3], 1 * SIZE + LDFD f75 = [A4], 1 * SIZE + ;; + LDFD f76 = [A1], 1 * SIZE + LDFD f77 = [A2], 1 * SIZE + LDFD f78 = [A3], 1 * SIZE + LDFD f79 = [A4], 1 * SIZE + ;; + FMA xsum1 = atemp1, f64, xsum1 + FMA xsum2 = atemp1, f65, xsum2 + FMA xsum3 = atemp1, f66, xsum3 + FMA xsum4 = atemp1, f67, xsum4 + ;; + FMA xsum1 = atemp2, f65, xsum1 + FMA xsum2 = atemp2, f69, xsum2 + FMA xsum3 = atemp2, f70, xsum3 + FMA xsum4 = atemp2, f71, xsum4 + ;; + FMA xsum1 = atemp3, f66, xsum1 + FMA xsum2 = atemp3, f70, xsum2 + FMA xsum3 = atemp3, f74, xsum3 + FMA xsum4 = atemp3, f75, xsum4 + ;; + FMA xsum1 = atemp4, f67, xsum1 + FMA xsum2 = atemp4, f71, xsum2 + 
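(The FMAs that follow in .L28 fold in the 4x4 diagonal block: it is loaded through the four column pointers A1..A4 exactly as stored (upper part only) and applied symmetrically, after which the four sums are added to y and written out through YYS. A hedged C sketch of that step; d, ax and y4 are illustrative names, not kernel symbols:

    /* 4x4 diagonal block of SYMV-U: d[r][c] is the stored upper block,
       read symmetrically when the unstored lower element is needed.
       ax holds the alpha-scaled x values; xsum already carries the
       alpha-scaled off-diagonal contributions at this point.          */
    static void symv_u_diag4(const double d[4][4], const double ax[4],
                             double xsum[4], double *y4)
    {
        for (int k = 0; k < 4; k++)
            for (int l = 0; l < 4; l++) {
                double e = (l >= k) ? d[k][l] : d[l][k];  /* symmetry */
                xsum[k] += ax[l] * e;
            }
        for (int k = 0; k < 4; k++)
            y4[k] += xsum[k];              /* FADD + STFD at the end of .L28 */
    })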
FMA xsum3 = atemp4, f75, xsum3 + FMA xsum4 = atemp4, f79, xsum4 + ;; + LDFD f36 = [YY], 1 * SIZE + ;; + LDFD f37 = [YY], 1 * SIZE + ;; + LDFD f38 = [YY], 1 * SIZE + ;; + LDFD f39 = [YY], 1 * SIZE + ;; + FADD f36 = f36, xsum1 + FADD f37 = f37, xsum2 + FADD f38 = f38, xsum3 + FADD f39 = f39, xsum4 + ;; + STFD [YYS] = f36, 1 * SIZE + ;; + STFD [YYS] = f37, 1 * SIZE + ;; + STFD [YYS] = f38, 1 * SIZE + ;; + STFD [YYS] = f39, 1 * SIZE + ;; + adds IS = 4, IS + ;; + adds TEMP = 4, IS + ;; + cmp.le p6, p0 = TEMP, M + ;; + (p6) br.cond.dpnt .L21 + ;; +.L30: + + +.L990: + +.L999: + mov r8 = r0 + adds r9 = 1 * 16, SP + ;; + ldf.fill f16 = [SP], 32 + ldf.fill f17 = [r9], 32 + mov ar.lc = ARLC + ;; + ldf.fill f18 = [SP], 32 + ldf.fill f19 = [r9], 32 + mov pr = PR, -1 + ;; + ldf.fill f20 = [SP], 32 + ldf.fill f21 = [r9], 32 + mov ar.pfs = ARPFS + ;; + ldf.fill f22 = [SP], 32 + ldf.fill f23 = [r9] + br.ret.sptk.many b0 + ;; + EPILOGUE diff --git a/kernel/ia64/trsm_kernel_LN.S b/kernel/ia64/trsm_kernel_LN.S new file mode 100644 index 0000000000..9b1f2b2692 --- /dev/null +++ b/kernel/ia64/trsm_kernel_LN.S @@ -0,0 +1,14028 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef DOUBLE +#define PREFETCHSIZE (16 * 8) +#else +#define PREFETCHSIZE (32 * 8) +#endif + +#define CPREFETCHSIZE -7 +#define CPREFETCH lfetch.excl.nt1 + +#define M r32 +#define N r33 +#define K r34 +#define A r36 +#define B r37 +#define C r38 +#define LDC r39 + +#define I r15 +#define J r16 +#define AOFFSET r17 +#define BOFFSET r18 +#define TEMP r19 +#define L r20 + +#define C1 r21 +#define C2 r22 +#define C3 r23 +#define C4 r24 +#define C5 r25 +#define C6 r26 +#define C7 r27 +#define C8 r28 + +#define C9 loc0 +#define C10 loc1 +#define C11 loc2 +#define C12 loc3 +#define C13 loc4 +#define C14 loc5 +#define C15 loc6 +#define C16 loc7 + +#define PREA r8 +#define PREB r9 +#define PREC r10 +#define SP r12 +#define ARLC r29 +#define PR r30 +#define ARPFS r31 + +#define ALPHA f8 + +#define AORIG loc8 +#define KK loc9 +#define KK8 loc10 +#define OFFSET loc11 +#define AOFFSET2 loc12 +#define BOFFSET2 loc13 + + + PROLOGUE + .prologue + PROFCODE + + { .mmi + .save ar.pfs, ARPFS + alloc ARPFS = ar.pfs, 8, 16, 0, 0 + adds r14 = 16, SP + mov ARLC = ar.lc + } + { .mmi + adds r8 = -6 * 16, SP + adds r9 = -5 * 16, SP + adds SP = -6 * 16, SP + } + ;; + { .mmi + setf.sig f32 = M + setf.sig f33 = K + mov PR = pr + } + ;; + { .mmi + stf.spill [r8] = f16, 32 + stf.spill [r9] = f17, 32 + shr J = N, 3 + } + ;; + { .mmi + stf.spill [r8] = f18, 32 + stf.spill [r9] = f19, 32 + shladd LDC = LDC, BASE_SHIFT, r0 + } + ;; + { .mmi + stf.spill [r8] = f20 + stf.spill [r9] = f21 + mov AOFFSET = A + } + ;; + .body + { .mmf + ld8 OFFSET = [r14] + cmp.ge p6, p0 = 0, J + xmpy.l f32 = f32, f33 + } + ;; + { .mmi + getf.sig r2 = f32 + shladd C = M, BASE_SHIFT, C + nop __LINE__ + } + ;; + { .mmb + shladd A = r2, BASE_SHIFT, A + nop __LINE__ + (p6) br.cond.dpnt .L050 + } + ;; + .align 8 + +.L000: + { .mmf + mov C1 = C + add KK = M, OFFSET + } + { .mmi + mov AORIG = A + add C2 = LDC, C + shladd C3 = LDC, 1, C + } + ;; + { .mmf + shladd C5 = LDC, 2, C + shladd C = LDC, 3, C + } + { .mmf + shladd C4 = LDC, 1, C2 + shladd C6 = LDC, 2, C2 + } + ;; + { .mfi + shladd C7 = LDC, 2, C3 + shladd C8 = LDC, 2, C4 + } + ;; + ;; + mov f64 = f0 + mov f72 = f0 + mov f80 = f0 + mov f88 = f0 + mov f96 = f0 + mov f104 = f0 + mov f112 = f0 + mov f120 = f0 + +.L040: + { .mib + sub L = K, KK + tbit.z p6, p0 = M, 0 + (p6) br.cond.dptk .L030 + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 0 + BASE_SHIFT + } + { .mmi + shladd r3 = KK, BASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; + { .mfi + shladd BOFFSET = r3, 3, B + sub AORIG = AORIG, r2 + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + add AOFFSET = r3, AORIG + } + ;; + { .mmi + adds L = 1, L + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mii + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + adds L = -1, L + } + ;; + { .mmi + (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + cmp.eq p6, p0 = -1, L + } + ;; + { .mib + (p7) LDFD f32 = [AOFFSET], 1 * SIZE + mov ar.lc = L + (p6) br.cond.dpnt .L048 + } + ;; + +.L042: + { .mfb + lfetch.nt1 [PREB], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p12) cmp.ne p3, p0 = 0, L + FMA f72 = f32, f49, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFD f40 = [AOFFSET], 1 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne 
p4, p5 = 0, L + } + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f104 = f32, f53, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f120 = f32, f55, f120 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFD f32 = [AOFFSET], 1 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f104 = f40, f61, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + adds L = -1, L + } + { .mmb + nop __LINE__ + nop __LINE__ + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f120 = f40, f63, f120 // A1 * B8 + nop __LINE__ + } + { .mmb + nop __LINE__ + nop __LINE__ + br.cloop.sptk.few .L042 + } + ;; + +.L048: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -1, KK +#else + adds r2 = -8, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + add AOFFSET = r2, AORIG + shladd BOFFSET = r2, 3, B + ;; +#endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET] + adds BOFFSET = -6 * SIZE, BOFFSET + ;; + { .mfi + FSUB f64 = f32, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f72 = f33, f72 + nop __LINE__ + } + ;; + { .mfi + FSUB f80 = f34, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f88 = f35, f88 + nop __LINE__ + } + ;; + { .mfi + FSUB f96 = f36, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f104 = f37, f104 + nop __LINE__ + } + ;; + { .mfi + FSUB f112 = f38, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f120 = f39, f120 + nop __LINE__ + } + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET] + adds AOFFSET = -6 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + FSUB f80 = f34, f80 + FSUB f88 = f35, f88 + FSUB f96 = f36, f96 + FSUB f104 = f37, f104 + FSUB f112 = f38, f112 + FSUB f120 = f39, f120 + ;; +#endif + +#ifdef LN + LDFD f32 = [AOFFSET] + ;; + FMPY f64 = f64, f32 + FMPY f96 = f96, f32 + FMPY f72 = f72, f32 + FMPY f104 = f104, f32 + FMPY f80 = f80, f32 + FMPY f112 = f112, f32 + FMPY f88 = f88, f32 + FMPY f120 = f120, f32 + ;; + { .mmi + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f96, SIZE + adds C1 = -1 * SIZE, C1 + } + ;; + { .mmi + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f104, SIZE + adds C2 = -1 * SIZE, C2 + } + ;; + { .mmi + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f112, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f88, - 3 * SIZE + STFD [BOFFSET2] = f120, - 3 
* SIZE + } + ;; + adds C3 = -1 * SIZE, C3 + adds C4 = -1 * SIZE, C4 + adds C5 = -1 * SIZE, C5 + adds C6 = -1 * SIZE, C6 + adds C7 = -1 * SIZE, C7 + adds C8 = -1 * SIZE, C8 + ;; +#endif + +#ifdef LT + LDFD f32 = [AOFFSET] + ;; + { .mfi + FMPY f64 = f64, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f96 = f96, f32 + nop __LINE__ + } + ;; + { .mfi + FMPY f72 = f72, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f104 = f104, f32 + nop __LINE__ + } + ;; + { .mfi + FMPY f80 = f80, f32 + } + { .mfi + nop __LINE__ + FMPY f112 = f112, f32 + nop __LINE__ + } + ;; + { .mfi + FMPY f88 = f88, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f120 = f120, f32 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f64, SIZE + } + { .mfi + STFD [BOFFSET2] = f96, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f72, SIZE + } + { .mfi + STFD [BOFFSET2] = f104, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f80, SIZE + } + { .mfi + STFD [BOFFSET2] = f112, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f88, -3 * SIZE + } + { .mfi + STFD [BOFFSET2] = f120, -3 * SIZE + } + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f40 = [BOFFSET], 1 * SIZE + ;; + LDFPD f41, f42 = [BOFFSET], 2 * SIZE + ;; + LDFPD f43, f44 = [BOFFSET], 2 * SIZE + ;; + LDFPD f45, f46 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f47, f48 = [BOFFSET], 2 * SIZE + ;; + LDFPD f49, f50 = [BOFFSET], 2 * SIZE + ;; + LDFPD f51, f52 = [BOFFSET] + adds BOFFSET = 5 * SIZE, BOFFSET + ;; + LDFD f53 = [BOFFSET], 1 * SIZE + ;; + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + ;; + LDFPD f56, f57 = [BOFFSET] + adds BOFFSET = 6 * SIZE, BOFFSET + ;; + LDFPD f58, f59 = [BOFFSET], 2 * SIZE + ;; + LDFPD f60, f61 = [BOFFSET] + adds BOFFSET = 7 * SIZE, BOFFSET + ;; + LDFD f16 = [BOFFSET], 1 * SIZE + ;; + LDFPD f17, f18 = [BOFFSET] + adds BOFFSET = 8 * SIZE, BOFFSET + ;; + LDFPD f19, f20 = [BOFFSET] + adds BOFFSET = 9 * SIZE, BOFFSET + ;; + LDFD f21 = [BOFFSET] + adds BOFFSET = -63 * SIZE, BOFFSET + ;; + + FMPY f64 = f64, f32 + ;; + FNMA f72 = f64, f33, f72 + ;; + FNMA f80 = f64, f34, f80 + ;; + FNMA f88 = f64, f35, f88 + ;; + FNMA f96 = f64, f36, f96 + ;; + FNMA f104 = f64, f37, f104 + ;; + FNMA f112 = f64, f38, f112 + ;; + FNMA f120 = f64, f39, f120 + ;; + FMPY f72 = f72, f40 + ;; + FNMA f80 = f72, f41, f80 + ;; + FNMA f88 = f72, f42, f88 + ;; + FNMA f96 = f72, f43, f96 + ;; + FNMA f104 = f72, f44, f104 + ;; + FNMA f112 = f72, f45, f112 + ;; + FNMA f120 = f72, f46, f120 + ;; + FMPY f80 = f80, f47 + ;; + FNMA f88 = f80, f48, f88 + ;; + FNMA f96 = f80, f49, f96 + ;; + FNMA f104 = f80, f50, f104 + ;; + FNMA f112 = f80, f51, f112 + ;; + FNMA f120 = f80, f52, f120 + ;; + FMPY f88 = f88, f53 + ;; + FNMA f96 = f88, f54, f96 + ;; + FNMA f104 = f88, f55, f104 + ;; + FNMA f112 = f88, f56, f112 + ;; + FNMA f120 = f88, f57, f120 + ;; + FMPY f96 = f96, f58 + ;; + FNMA f104 = f96, f59, f104 + ;; + FNMA f112 = f96, f60, f112 + ;; + FNMA f120 = f96, f61, f120 + ;; + FMPY f104 = f104, f16 + ;; + FNMA f112 = f104, f17, f112 + ;; + FNMA f120 = f104, f18, f120 + ;; + FMPY f112 = f112, f19 + ;; + FNMA f120 = f112, f20, f120 + ;; + FMPY f120 = f120, f21 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f96, SIZE + ;; + STFD [AOFFSET] = f72, SIZE + STFD [AOFFSET2] = f104, SIZE + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f112, SIZE + ;; + STFD [AOFFSET] = f88, -3 * SIZE + STFD [AOFFSET2] 
= f120, - 3 * SIZE + ;; +#endif + +#ifdef RT + adds BOFFSET = 62 * SIZE, BOFFSET + ;; + LDFPD f33, f32 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f35, f34 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f37, f36 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f39, f38 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFD f40 = [BOFFSET], -2 * SIZE + ;; + LDFPD f42, f41 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f44, f43 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f46, f45 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f48, f47 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f50, f49 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f52, f51 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFD f53 = [BOFFSET], -2 * SIZE + ;; + LDFPD f55, f54 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f57, f56 = [BOFFSET] + adds BOFFSET = - 6 * SIZE, BOFFSET + ;; + LDFPD f59, f58 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f61, f60 = [BOFFSET] + adds BOFFSET = - 6 * SIZE, BOFFSET + ;; + LDFD f16 = [BOFFSET], -2 * SIZE + ;; + LDFPD f18, f17 = [BOFFSET] + adds BOFFSET = - 8 * SIZE, BOFFSET + ;; + LDFPD f20, f19 = [BOFFSET] + adds BOFFSET = - 8 * SIZE, BOFFSET + ;; + LDFD f21 = [BOFFSET] + ;; + + FMPY f120 = f120, f32 + ;; + FNMA f112 = f120, f33, f112 + ;; + FNMA f104 = f120, f34, f104 + ;; + FNMA f96 = f120, f35, f96 + ;; + FNMA f88 = f120, f36, f88 + ;; + FNMA f80 = f120, f37, f80 + ;; + FNMA f72 = f120, f38, f72 + ;; + FNMA f64 = f120, f39, f64 + ;; + FMPY f112 = f112, f40 + ;; + FNMA f104 = f112, f41, f104 + ;; + FNMA f96 = f112, f42, f96 + ;; + FNMA f88 = f112, f43, f88 + ;; + FNMA f80 = f112, f44, f80 + ;; + FNMA f72 = f112, f45, f72 + ;; + FNMA f64 = f112, f46, f64 + ;; + FMPY f104 = f104, f47 + ;; + FNMA f96 = f104, f48, f96 + ;; + FNMA f88 = f104, f49, f88 + ;; + FNMA f80 = f104, f50, f80 + ;; + FNMA f72 = f104, f51, f72 + ;; + FNMA f64 = f104, f52, f64 + ;; + FMPY f96 = f96, f53 + ;; + FNMA f88 = f96, f54, f88 + ;; + FNMA f80 = f96, f55, f80 + ;; + FNMA f72 = f96, f56, f72 + ;; + FNMA f64 = f96, f57, f64 + ;; + FMPY f88 = f88, f58 + ;; + FNMA f80 = f88, f59, f80 + ;; + FNMA f72 = f88, f60, f72 + ;; + FNMA f64 = f88, f61, f64 + ;; + FMPY f80 = f80, f16 + ;; + FNMA f72 = f80, f17, f72 + ;; + FNMA f64 = f80, f18, f64 + ;; + FMPY f72 = f72, f19 + ;; + FNMA f64 = f72, f20, f64 + ;; + FMPY f64 = f64, f21 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f96, SIZE + ;; + STFD [AOFFSET] = f72, SIZE + STFD [AOFFSET2] = f104, SIZE + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f112, SIZE + ;; + STFD [AOFFSET] = f88, - 3 * SIZE + STFD [AOFFSET2] = f120, - 3 * SIZE + ;; + +#endif + +#ifndef LN + STFD [C1 ] = f64, SIZE +#else + STFD [C1 ] = f64 +#endif +#ifndef LN + STFD [C2 ] = f72, SIZE +#else + STFD [C2 ] = f72 +#endif +#ifndef LN + STFD [C3 ] = f80, SIZE +#else + STFD [C3 ] = f80 +#endif +#ifndef LN + STFD [C4 ] = f88, SIZE +#else + STFD [C4 ] = f88 +#endif +#ifndef LN + STFD [C5 ] = f96, SIZE +#else + STFD [C5 ] = f96 +#endif +#ifndef LN + STFD [C6 ] = f104, SIZE +#else + STFD [C6 ] = f104 +#endif +#ifndef LN + STFD [C7 ] = f112, SIZE +#else + STFD [C7 ] = f112 +#endif +#ifndef LN + STFD [C8 ] = f120, SIZE +#else + STFD [C8 ] = f120 +#endif + ;; + + mov f64 = f0 + mov f72 = f0 + mov f80 = f0 + mov f88 = f0 + mov f96 = f0 + mov f104 = f0 + mov f112 = f0 + mov f120 = f0 + ;; + shladd r2 = K, BASE_SHIFT, r0 + ;; + sub L = K, 
KK + ;; +#ifdef RT + add AORIG = r2, AORIG +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + add AOFFSET = L, AOFFSET +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + shladd BOFFSET = L, 3, BOFFSET +#else + nop __LINE__ +#endif + ;; +#ifdef LT + adds KK = 1, KK +#elif defined LN + adds KK = -1, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + .align 8 + +.L030: + { .mib + sub L = K, KK + tbit.z p6, p0 = M, 1 + (p6) br.cond.dptk .L020 + } + ;; + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 1 + BASE_SHIFT + } + { .mmi + shladd r3 = KK, BASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mmf + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + setf.d f73 = r0 + mov f65 = f0 + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 3, B + mov f65 = f0 +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f73 = f0 + shladd AOFFSET = r3, 1, AORIG + } + ;; +#endif + { .mfi + setf.d f105 = r0 + mov f81 = f0 + adds L = 1, L + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + mov f89 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f113 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + setf.d f97 = r0 + mov f121 = f0 + shr L = L, 1 + } + ;; + { .mmf + (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + adds L = -1, L + } + ;; + { .mmf + (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + cmp.eq p6, p0 = -1, L + } + ;; + { .mib + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov ar.lc = L + (p6) br.cond.dpnt .L038 + } + ;; + +.L032: + { .mfb + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f104 = f32, f53, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f120 = f32, f55, f120 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f89 = f33, f51, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f105 = f33, f53, f105 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f121 = f33, f55, f121 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f72 = f40, f57, f72 
// A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f104 = f40, f61, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f120 = f40, f63, f120 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f105 = f41, f61, f105 // A2 * B6 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f121 = f41, f63, f121 // A2 * B8 + br.cloop.sptk.few .L032 + } + ;; + +.L038: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -2, KK +#else + adds r2 = -8, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 1, AORIG + shladd BOFFSET = r2, 3, B + ;; +#endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [BOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [BOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [BOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [BOFFSET] + adds BOFFSET = -14 * SIZE, BOFFSET + ;; + { .mfi + FSUB f64 = f32, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f72 = f33, f72 + nop __LINE__ + } + ;; + { .mfi + FSUB f80 = f34, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f88 = f35, f88 + nop __LINE__ + } + ;; + { .mfi + FSUB f96 = f36, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f104 = f37, f104 + nop __LINE__ + } + ;; + { .mfi + FSUB f112 = f38, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f120 = f39, f120 + nop __LINE__ + } + ;; + { .mfi + FSUB f65 = f40, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f73 = f41, f73 + nop __LINE__ + } + ;; + { .mfi + FSUB f81 = f42, f81 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f89 = f43, f89 + nop __LINE__ + } + ;; + { .mfi + FSUB f97 = f44, f97 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f105 = f45, f105 + nop __LINE__ + } + ;; + { .mfi + FSUB f113 = f46, f113 + } + { .mfi + nop __LINE__ + FSUB f121 = f47, f121 + nop __LINE__ + } + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [AOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [AOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [AOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [AOFFSET] + adds AOFFSET = -14 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + + FSUB f72 = f34, f72 + FSUB f73 = f35, f73 + + FSUB f80 = f36, f80 + FSUB f81 = f37, f81 + + FSUB 
f88 = f38, f88 + FSUB f89 = f39, f89 + ;; + FSUB f96 = f40, f96 + FSUB f97 = f41, f97 + ;; + FSUB f104 = f42, f104 + FSUB f105 = f43, f105 + ;; + FSUB f112 = f44, f112 + FSUB f113 = f45, f113 + ;; + FSUB f120 = f46, f120 + FSUB f121 = f47, f121 + ;; +#endif + +#ifdef LN + adds AOFFSET = 2 * SIZE, AOFFSET + ;; + LDFPD f33, f32 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFD f34 = [AOFFSET] + ;; + FMPY f65 = f65, f32 + FMPY f97 = f97, f32 + FMPY f73 = f73, f32 + FMPY f105 = f105, f32 + FMPY f81 = f81, f32 + FMPY f113 = f113, f32 + FMPY f89 = f89, f32 + FMPY f121 = f121, f32 + ;; + FNMA f64 = f65, f33, f64 + FNMA f96 = f97, f33, f96 + FNMA f72 = f73, f33, f72 + FNMA f104 = f105, f33, f104 + FNMA f80 = f81, f33, f80 + FNMA f112 = f113, f33, f112 + FNMA f88 = f89, f33, f88 + FNMA f120 = f121, f33, f120 + ;; + FMPY f64 = f64, f34 + FMPY f96 = f96, f34 + FMPY f72 = f72, f34 + FMPY f104 = f104, f34 + FMPY f80 = f80, f34 + FMPY f112 = f112, f34 + FMPY f88 = f88, f34 + FMPY f120 = f120, f34 + ;; + adds BOFFSET = 8 * SIZE, BOFFSET + adds BOFFSET2 = 8 * SIZE, BOFFSET2 + ;; + { .mfi + STFD [BOFFSET] = f65, SIZE + } + { .mfi + STFD [BOFFSET2] = f97, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f73, SIZE + } + { .mfi + STFD [BOFFSET2] = f105, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f81, SIZE + } + { .mfi + STFD [BOFFSET2] = f113, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f89, - 11 * SIZE + } + { .mfi + STFD [BOFFSET2] = f121, - 11 * SIZE + } + ;; + { .mmi + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f96, SIZE + adds C1 = -2 * SIZE, C1 + } + ;; + { .mmi + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f104, SIZE + adds C2 = -2 * SIZE, C2 + } + ;; + { .mmi + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f112, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f88, - 3 * SIZE + STFD [BOFFSET2] = f120, - 3 * SIZE + } + ;; + adds C3 = -2 * SIZE, C3 + adds C4 = -2 * SIZE, C4 + adds C5 = -2 * SIZE, C5 + adds C6 = -2 * SIZE, C6 + adds C7 = -2 * SIZE, C7 + adds C8 = -2 * SIZE, C8 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f34 = [AOFFSET], - 3 * SIZE + ;; + { .mfi + FMPY f64 = f64, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f96 = f96, f32 + nop __LINE__ + } + ;; + { .mfi + FMPY f72 = f72, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f104 = f104, f32 + nop __LINE__ + } + ;; + { .mfi + FMPY f80 = f80, f32 + } + { .mfi + nop __LINE__ + FMPY f112 = f112, f32 + nop __LINE__ + } + ;; + { .mfi + FMPY f88 = f88, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f120 = f120, f32 + nop __LINE__ + } + ;; + { .mfi + FNMA f65 = f64, f33, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f97 = f96, f33, f97 + nop __LINE__ + } + ;; + { .mfi + FNMA f73 = f72, f33, f73 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f105 = f104, f33, f105 + nop __LINE__ + } + ;; + { .mfi + FNMA f81 = f80, f33, f81 + } + { .mfi + nop __LINE__ + FNMA f113 = f112, f33, f113 + nop __LINE__ + } + ;; + { .mfi + FNMA f89 = f88, f33, f89 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f121 = f120, f33, f121 + nop __LINE__ + } + ;; + FMPY f65 = f65, f34 + FMPY f97 = f97, f34 + FMPY f73 = f73, f34 + FMPY f105 = f105, f34 + FMPY f81 = f81, f34 + FMPY f113 = f113, f34 + FMPY f89 = f89, f34 + FMPY f121 = f121, f34 + ;; + { .mfi + STFD [BOFFSET] = f64, SIZE + } + { .mfi + STFD [BOFFSET2] = f96, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f72, SIZE + } + { .mfi + STFD [BOFFSET2] = f104, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f80, SIZE + } + { .mfi + 
STFD [BOFFSET2] = f112, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f88, 5 * SIZE + } + { .mfi + STFD [BOFFSET2] = f120, 5 * SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f65, SIZE + } + { .mfi + STFD [BOFFSET2] = f97, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f73, SIZE + } + { .mfi + STFD [BOFFSET2] = f105, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f81, SIZE + } + { .mfi + STFD [BOFFSET2] = f113, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f89, -11 * SIZE + } + { .mfi + STFD [BOFFSET2] = f121, -11 * SIZE + } +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f40 = [BOFFSET], 1 * SIZE + ;; + LDFPD f41, f42 = [BOFFSET], 2 * SIZE + ;; + LDFPD f43, f44 = [BOFFSET], 2 * SIZE + ;; + LDFPD f45, f46 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f47, f48 = [BOFFSET], 2 * SIZE + ;; + LDFPD f49, f50 = [BOFFSET], 2 * SIZE + ;; + LDFPD f51, f52 = [BOFFSET] + adds BOFFSET = 5 * SIZE, BOFFSET + ;; + LDFD f53 = [BOFFSET], 1 * SIZE + ;; + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + ;; + LDFPD f56, f57 = [BOFFSET] + adds BOFFSET = 6 * SIZE, BOFFSET + ;; + LDFPD f58, f59 = [BOFFSET], 2 * SIZE + ;; + LDFPD f60, f61 = [BOFFSET] + adds BOFFSET = 7 * SIZE, BOFFSET + ;; + LDFD f16 = [BOFFSET], 1 * SIZE + ;; + LDFPD f17, f18 = [BOFFSET] + adds BOFFSET = 8 * SIZE, BOFFSET + ;; + LDFPD f19, f20 = [BOFFSET] + adds BOFFSET = 9 * SIZE, BOFFSET + ;; + LDFD f21 = [BOFFSET] + adds BOFFSET = -63 * SIZE, BOFFSET + ;; + + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + ;; + FNMA f72 = f64, f33, f72 + FNMA f73 = f65, f33, f73 + ;; + FNMA f80 = f64, f34, f80 + FNMA f81 = f65, f34, f81 + ;; + FNMA f88 = f64, f35, f88 + FNMA f89 = f65, f35, f89 + ;; + FNMA f96 = f64, f36, f96 + FNMA f97 = f65, f36, f97 + ;; + FNMA f104 = f64, f37, f104 + FNMA f105 = f65, f37, f105 + ;; + FNMA f112 = f64, f38, f112 + FNMA f113 = f65, f38, f113 + ;; + FNMA f120 = f64, f39, f120 + FNMA f121 = f65, f39, f121 + ;; + FMPY f72 = f72, f40 + FMPY f73 = f73, f40 + ;; + FNMA f80 = f72, f41, f80 + FNMA f81 = f73, f41, f81 + ;; + FNMA f88 = f72, f42, f88 + FNMA f89 = f73, f42, f89 + ;; + FNMA f96 = f72, f43, f96 + FNMA f97 = f73, f43, f97 + ;; + FNMA f104 = f72, f44, f104 + FNMA f105 = f73, f44, f105 + ;; + FNMA f112 = f72, f45, f112 + FNMA f113 = f73, f45, f113 + ;; + FNMA f120 = f72, f46, f120 + FNMA f121 = f73, f46, f121 + ;; + FMPY f80 = f80, f47 + FMPY f81 = f81, f47 + ;; + FNMA f88 = f80, f48, f88 + FNMA f89 = f81, f48, f89 + ;; + FNMA f96 = f80, f49, f96 + FNMA f97 = f81, f49, f97 + ;; + FNMA f104 = f80, f50, f104 + FNMA f105 = f81, f50, f105 + ;; + FNMA f112 = f80, f51, f112 + FNMA f113 = f81, f51, f113 + ;; + FNMA f120 = f80, f52, f120 + FNMA f121 = f81, f52, f121 + ;; + FMPY f88 = f88, f53 + FMPY f89 = f89, f53 + ;; + FNMA f96 = f88, f54, f96 + FNMA f97 = f89, f54, f97 + ;; + FNMA f104 = f88, f55, f104 + FNMA f105 = f89, f55, f105 + ;; + FNMA f112 = f88, f56, f112 + FNMA f113 = f89, f56, f113 + ;; + FNMA f120 = f88, f57, f120 + FNMA f121 = f89, f57, f121 + ;; + FMPY f96 = f96, f58 + FMPY f97 = f97, f58 + ;; + FNMA f104 = f96, f59, f104 + FNMA f105 = f97, f59, f105 + ;; + FNMA f112 = f96, f60, f112 + FNMA f113 = f97, f60, f113 + ;; + FNMA f120 = f96, f61, f120 + FNMA f121 = f97, f61, f121 + ;; + FMPY f104 = f104, f16 + FMPY f105 = f105, f16 + ;; + FNMA f112 = f104, f17, f112 + FNMA f113 = f105, f17, f113 + ;; + FNMA f120 = f104, f18, f120 + FNMA f121 = f105, f18, f121 + ;; + 
FMPY f112 = f112, f19 + FMPY f113 = f113, f19 + ;; + FNMA f120 = f112, f20, f120 + FNMA f121 = f113, f20, f121 + ;; + FMPY f120 = f120, f21 + FMPY f121 = f121, f21 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f80, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f81, SIZE + ;; + STFD [AOFFSET] = f72, SIZE + STFD [AOFFSET2] = f88, SIZE + ;; + STFD [AOFFSET] = f73, 5 * SIZE + STFD [AOFFSET2] = f89, 5 * SIZE + ;; + STFD [AOFFSET] = f96, SIZE + STFD [AOFFSET2] = f112, SIZE + ;; + STFD [AOFFSET] = f97, SIZE + STFD [AOFFSET2] = f113, SIZE + ;; + STFD [AOFFSET] = f104, SIZE + STFD [AOFFSET2] = f120, SIZE + ;; + STFD [AOFFSET] = f105, -11 * SIZE + STFD [AOFFSET2] = f121, - 11 * SIZE + ;; +#endif + +#ifdef RT + adds BOFFSET = 62 * SIZE, BOFFSET + ;; + LDFPD f33, f32 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f35, f34 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f37, f36 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f39, f38 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFD f40 = [BOFFSET], -2 * SIZE + ;; + LDFPD f42, f41 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f44, f43 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f46, f45 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f48, f47 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f50, f49 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f52, f51 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFD f53 = [BOFFSET], -2 * SIZE + ;; + LDFPD f55, f54 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f57, f56 = [BOFFSET] + adds BOFFSET = - 6 * SIZE, BOFFSET + ;; + LDFPD f59, f58 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f61, f60 = [BOFFSET] + adds BOFFSET = - 6 * SIZE, BOFFSET + ;; + LDFD f16 = [BOFFSET], -2 * SIZE + ;; + LDFPD f18, f17 = [BOFFSET] + adds BOFFSET = - 8 * SIZE, BOFFSET + ;; + LDFPD f20, f19 = [BOFFSET] + adds BOFFSET = - 8 * SIZE, BOFFSET + ;; + LDFD f21 = [BOFFSET] + ;; + + FMPY f120 = f120, f32 + FMPY f121 = f121, f32 + ;; + FNMA f112 = f120, f33, f112 + FNMA f113 = f121, f33, f113 + ;; + FNMA f104 = f120, f34, f104 + FNMA f105 = f121, f34, f105 + ;; + FNMA f96 = f120, f35, f96 + FNMA f97 = f121, f35, f97 + ;; + FNMA f88 = f120, f36, f88 + FNMA f89 = f121, f36, f89 + ;; + FNMA f80 = f120, f37, f80 + FNMA f81 = f121, f37, f81 + ;; + FNMA f72 = f120, f38, f72 + FNMA f73 = f121, f38, f73 + ;; + FNMA f64 = f120, f39, f64 + FNMA f65 = f121, f39, f65 + ;; + FMPY f112 = f112, f40 + FMPY f113 = f113, f40 + ;; + FNMA f104 = f112, f41, f104 + FNMA f105 = f113, f41, f105 + ;; + FNMA f96 = f112, f42, f96 + FNMA f97 = f113, f42, f97 + ;; + FNMA f88 = f112, f43, f88 + FNMA f89 = f113, f43, f89 + ;; + FNMA f80 = f112, f44, f80 + FNMA f81 = f113, f44, f81 + ;; + FNMA f72 = f112, f45, f72 + FNMA f73 = f113, f45, f73 + ;; + FNMA f64 = f112, f46, f64 + FNMA f65 = f113, f46, f65 + ;; + FMPY f104 = f104, f47 + FMPY f105 = f105, f47 + ;; + FNMA f96 = f104, f48, f96 + FNMA f97 = f105, f48, f97 + ;; + FNMA f88 = f104, f49, f88 + FNMA f89 = f105, f49, f89 + ;; + FNMA f80 = f104, f50, f80 + FNMA f81 = f105, f50, f81 + ;; + FNMA f72 = f104, f51, f72 + FNMA f73 = f105, f51, f73 + ;; + FNMA f64 = f104, f52, f64 + FNMA f65 = f105, f52, f65 + ;; + FMPY f96 = f96, f53 + FMPY f97 = f97, f53 + ;; + FNMA f88 = f96, f54, f88 + FNMA f89 = f97, f54, f89 + ;; + FNMA f80 = f96, f55, f80 + FNMA f81 = f97, f55, f81 + ;; + FNMA f72 = f96, f56, f72 + FNMA f73 = f97, f56, f73 + ;; + FNMA 
f64 = f96, f57, f64 + FNMA f65 = f97, f57, f65 + ;; + FMPY f88 = f88, f58 + FMPY f89 = f89, f58 + ;; + FNMA f80 = f88, f59, f80 + FNMA f81 = f89, f59, f81 + ;; + FNMA f72 = f88, f60, f72 + FNMA f73 = f89, f60, f73 + ;; + FNMA f64 = f88, f61, f64 + FNMA f65 = f89, f61, f65 + ;; + FMPY f80 = f80, f16 + FMPY f81 = f81, f16 + ;; + FNMA f72 = f80, f17, f72 + FNMA f73 = f81, f17, f73 + ;; + FNMA f64 = f80, f18, f64 + FNMA f65 = f81, f18, f65 + ;; + FMPY f72 = f72, f19 + FMPY f73 = f73, f19 + ;; + FNMA f64 = f72, f20, f64 + FNMA f65 = f73, f20, f65 + ;; + FMPY f64 = f64, f21 + FMPY f65 = f65, f21 + ;; + adds AOFFSET = 8 * SIZE, AOFFSET + adds AOFFSET2 = 8 * SIZE, AOFFSET2 + ;; + STFD [AOFFSET] = f96, SIZE + STFD [AOFFSET2] = f112, SIZE + ;; + STFD [AOFFSET] = f97, SIZE + STFD [AOFFSET2] = f113, SIZE + ;; + STFD [AOFFSET] = f104, SIZE + STFD [AOFFSET2] = f120, SIZE + ;; + STFD [AOFFSET] = f105, - 11 * SIZE + STFD [AOFFSET2] = f121, - 11 * SIZE + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f80, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f81, SIZE + ;; + STFD [AOFFSET] = f72, SIZE + STFD [AOFFSET2] = f88, SIZE + ;; + STFD [AOFFSET] = f73, - 3 * SIZE + STFD [AOFFSET2] = f89, - 3 * SIZE + ;; + +#endif + STFD [C1 ] = f64, SIZE + mov f64 = f0 + ;; +#ifndef LN + STFD [C1 ] = f65, SIZE +#else + STFD [C1 ] = f65, -SIZE +#endif + ;; + STFD [C2 ] = f72, SIZE + mov f72 = f0 + ;; +#ifndef LN + STFD [C2 ] = f73, SIZE +#else + STFD [C2 ] = f73, -SIZE +#endif + ;; + STFD [C3 ] = f80, SIZE + mov f80 = f0 + ;; +#ifndef LN + STFD [C3 ] = f81, SIZE +#else + STFD [C3 ] = f81, - SIZE +#endif + ;; + STFD [C4 ] = f88, SIZE + mov f88 = f0 + ;; +#ifndef LN + STFD [C4 ] = f89, SIZE +#else + STFD [C4 ] = f89, -SIZE +#endif + ;; + STFD [C5 ] = f96, SIZE + mov f96 = f0 + ;; +#ifndef LN + STFD [C5 ] = f97, SIZE +#else + STFD [C5 ] = f97, -SIZE +#endif + ;; + STFD [C6 ] = f104, SIZE + mov f104 = f0 + ;; +#ifndef LN + STFD [C6 ] = f105, SIZE +#else + STFD [C6 ] = f105, -SIZE +#endif + ;; + shladd r2 = K, BASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + shladd AORIG = r2, 1, AORIG +#else + nop __LINE__ +#endif + ;; + STFD [C7 ] = f112, SIZE + mov f112 = f0 + ;; + { .mmi +#ifndef LN + STFD [C7 ] = f113, SIZE +#else + STFD [C7 ] = f113, -SIZE +#endif + +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd AOFFSET = L, 1, AOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd BOFFSET = L, 3, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmf + STFD [C8 ] = f120, SIZE + mov f120 = f0 + } + ;; + { .mmi +#ifndef LN + STFD [C8 ] = f121, SIZE +#else + STFD [C8 ] = f121, -SIZE +#endif + +#ifdef LT + adds KK = 2, KK +#elif defined LN + adds KK = -2, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + .align 8 + +.L020: + { .mib + sub L = K, KK + tbit.z p6, p0 = M, 2 + (p6) br.cond.dptk .L010 + } + ;; + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 2 + BASE_SHIFT + } + { .mmi + shladd r3 = KK, BASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; + #if defined(LT) || defined(RN) + { .mmf + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + setf.d f73 = r0 + mov f65 = f0 + } + ;; + #else + { .mfi + shladd BOFFSET = r3, 3, B + mov f65 = f0 + #ifdef LN + sub AORIG = AORIG, r2 + #else + nop __LINE__ + #endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * 
SIZE + mov f73 = f0 + shladd AOFFSET = r3, 2, AORIG + } + ;; + #endif + { .mfi + setf.d f105 = r0 + mov f81 = f0 + adds L = 1, L + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + mov f89 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f113 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + setf.d f97 = r0 + mov f121 = f0 + shr L = L, 1 + } + ;; + { .mmf + (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + setf.d f66 = r0 + mov f67 = f0 + } + { .mfi + setf.d f74 = r0 + mov f75 = f0 + adds L = -1, L + } + ;; + { .mmf + (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + setf.d f82 = r0 + mov f83 = f0 + } + { .mfi + setf.d f90 = r0 + mov f91 = f0 + cmp.eq p6, p0 = -1, L + } + ;; + { .mmf + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + setf.d f98 = r0 + mov f99 = f0 + } + { .mfi + setf.d f106 = r0 + mov f107 = f0 + mov ar.lc = L + } + ;; + { .mmf + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + setf.d f114 = r0 + mov f115 = f0 + } + { .mfb + setf.d f122 = r0 + mov f123 = f0 + (p6) br.cond.dpnt .L028 + } + ;; + + .L022: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + (p5) adds C9 = 2 * SIZE, C1 + } + { .mfi + nop __LINE__ + FMA f104 = f32, f53, f104 // A1 * B6 + (p5) adds C10 = 2 * SIZE, C2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + (p5) adds C11 = 2 * SIZE, C3 + } + { .mfi + nop __LINE__ + FMA f120 = f32, f55, f120 // A1 * B8 + (p5) adds C12 = 2 * SIZE, C4 + } + ;; + { .mfi + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + (p5) adds C13 = 2 * SIZE, C5 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + (p5) adds C14 = 2 * SIZE, C6 + } + ;; + { .mfi + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + (p5) adds C15 = 2 * SIZE, C7 + } + { .mfi + nop __LINE__ + FMA f89 = f33, f51, f89 // A2 * B4 + (p5) adds C16 = 2 * SIZE, C8 + } + ;; + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f105 = f33, f53, f105 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f121 = f33, f55, f121 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f90 = f34, f51, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f98 = f34, f52, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f106 = f34, f53, f106 // A3 * B6 + nop __LINE__ + } + + { .mfb + nop __LINE__ + FMA f114 = f34, f54, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f122 = f34, f55, f122 // A3 * B8 + nop __LINE__ + } + + { .mfb + nop __LINE__ + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + 
FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f91 = f35, f51, f91 // A4 * B4 + nop __LINE__ + } + + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f99 = f35, f52, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f107 = f35, f53, f107 // A4 * B6 + nop __LINE__ + } + + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f115 = f35, f54, f115 // A4 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f123 = f35, f55, f123 // A4 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f104 = f40, f61, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f120 = f40, f63, f120 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f105 = f41, f61, f105 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f121 = f41, f63, f121 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f90 = f42, f59, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f98 = f42, f60, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f106 = f42, f61, f106 // A3 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f114 = f42, f62, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f122 = f42, f63, f122 // A3 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f91 = f43, f59, f91 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f99 = f43, f60, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f107 = f43, f61, f107 // A4 * B6 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f115 = f43, f62, f115 // A4 * B7 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f123 = f43, f63, f123 // A4 * B8 + br.cloop.sptk.few .L022 + } + ;; + + .L028: + #if defined(LN) 
|| defined(RT) + #ifdef LN + adds r2 = -4, KK + #else + adds r2 = -8, KK + #endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 2, AORIG + shladd BOFFSET = r2, 3, B + ;; + #endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + + #if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [BOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [BOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [BOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [BOFFSET], 2 * SIZE + ;; + { .mfi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FSUB f64 = f32, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f72 = f33, f72 + nop __LINE__ + } + ;; + { .mfi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + FSUB f80 = f34, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f88 = f35, f88 + nop __LINE__ + } + ;; + { .mfi + LDFPD f52, f53 = [BOFFSET], 2 * SIZE + FSUB f96 = f36, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f104 = f37, f104 + nop __LINE__ + } + ;; + { .mfi + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + FSUB f112 = f38, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f120 = f39, f120 + nop __LINE__ + } + ;; + { .mfi + LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FSUB f65 = f40, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f73 = f41, f73 + nop __LINE__ + } + ;; + { .mfi + LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FSUB f81 = f42, f81 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f89 = f43, f89 + nop __LINE__ + } + ;; + { .mfi + LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FSUB f97 = f44, f97 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f105 = f45, f105 + nop __LINE__ + } + ;; + { .mfi + LDFPD f62, f63 = [BOFFSET] + FSUB f113 = f46, f113 + adds BOFFSET = -30 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FSUB f121 = f47, f121 + nop __LINE__ + } + ;; + FSUB f66 = f48, f66 + FSUB f74 = f49, f74 + FSUB f82 = f50, f82 + FSUB f90 = f51, f90 + FSUB f98 = f52, f98 + FSUB f106 = f53, f106 + FSUB f114 = f54, f114 + FSUB f122 = f55, f122 + ;; + FSUB f67 = f56, f67 + FSUB f75 = f57, f75 + FSUB f83 = f58, f83 + FSUB f91 = f59, f91 + FSUB f99 = f60, f99 + FSUB f107 = f61, f107 + FSUB f115 = f62, f115 + FSUB f123 = f63, f123 + ;; + #else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [AOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [AOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [AOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [AOFFSET], 2 * SIZE + ;; + LDFPD f48, f49 = [AOFFSET], 2 * SIZE + ;; + LDFPD f50, f51 = [AOFFSET], 2 * SIZE + ;; + LDFPD f52, f53 = [AOFFSET], 2 * SIZE + ;; + LDFPD f54, f55 = [AOFFSET], 2 * SIZE + ;; + LDFPD f56, f57 = [AOFFSET], 2 * SIZE + ;; + LDFPD f58, f59 = [AOFFSET], 2 * SIZE + ;; + LDFPD f60, f61 = [AOFFSET], 2 * SIZE + ;; + LDFPD f62, f63 = [AOFFSET] + adds AOFFSET = -30 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f66 = f34, f66 + FSUB f67 = f35, f67 + + FSUB f72 = f36, f72 + FSUB f73 = f37, f73 + FSUB f74 = f38, f74 + FSUB f75 = f39, f75 + + FSUB f80 = f40, f80 + FSUB f81 = f41, f81 + FSUB f82 = f42, f82 + FSUB f83 = f43, f83 + + FSUB f88 = f44, f88 + FSUB f89 = f45, f89 + FSUB f90 = f46, f90 + FSUB f91 = f47, f91 + ;; + FSUB f96 = f48, f96 + FSUB f97 = f49, f97 + FSUB f98 = f50, f98 + FSUB f99 = f51, f99 + ;; + FSUB f104 = f52, f104 + 
FSUB f105 = f53, f105 + FSUB f106 = f54, f106 + FSUB f107 = f55, f107 + ;; + FSUB f112 = f56, f112 + FSUB f113 = f57, f113 + FSUB f114 = f58, f114 + FSUB f115 = f59, f115 + ;; + FSUB f120 = f60, f120 + FSUB f121 = f61, f121 + FSUB f122 = f62, f122 + FSUB f123 = f63, f123 + ;; + #endif + + #ifdef LN + adds AOFFSET = 14 * SIZE, AOFFSET + ;; + LDFPD f33, f32 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f35, f34 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFD f36 = [AOFFSET], - 2 * SIZE + ;; + LDFPD f38, f37 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f40, f39 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFD f41 = [AOFFSET] + ;; + FMPY f67 = f67, f32 + FMPY f99 = f99, f32 + FMPY f75 = f75, f32 + FMPY f107 = f107, f32 + FMPY f83 = f83, f32 + FMPY f115 = f115, f32 + FMPY f91 = f91, f32 + FMPY f123 = f123, f32 + ;; + FNMA f66 = f67, f33, f66 + FNMA f98 = f99, f33, f98 + FNMA f74 = f75, f33, f74 + FNMA f106 = f107, f33, f106 + FNMA f82 = f83, f33, f82 + FNMA f114 = f115, f33, f114 + FNMA f90 = f91, f33, f90 + FNMA f122 = f123, f33, f122 + ;; + FNMA f65 = f67, f34, f65 + FNMA f97 = f99, f34, f97 + FNMA f73 = f75, f34, f73 + FNMA f105 = f107, f34, f105 + FNMA f81 = f83, f34, f81 + FNMA f113 = f115, f34, f113 + FNMA f89 = f91, f34, f89 + FNMA f121 = f123, f34, f121 + ;; + FNMA f64 = f67, f35, f64 + FNMA f96 = f99, f35, f96 + FNMA f72 = f75, f35, f72 + FNMA f104 = f107, f35, f104 + FNMA f80 = f83, f35, f80 + FNMA f112 = f115, f35, f112 + FNMA f88 = f91, f35, f88 + FNMA f120 = f123, f35, f120 + ;; + FMPY f66 = f66, f36 + FMPY f98 = f98, f36 + FMPY f74 = f74, f36 + FMPY f106 = f106, f36 + FMPY f82 = f82, f36 + FMPY f114 = f114, f36 + FMPY f90 = f90, f36 + FMPY f122 = f122, f36 + ;; + FNMA f65 = f66, f37, f65 + FNMA f97 = f98, f37, f97 + FNMA f73 = f74, f37, f73 + FNMA f105 = f106, f37, f105 + FNMA f81 = f82, f37, f81 + FNMA f113 = f114, f37, f113 + FNMA f89 = f90, f37, f89 + FNMA f121 = f122, f37, f121 + ;; + FNMA f64 = f66, f38, f64 + FNMA f96 = f98, f38, f96 + FNMA f72 = f74, f38, f72 + FNMA f104 = f106, f38, f104 + FNMA f80 = f82, f38, f80 + FNMA f112 = f114, f38, f112 + FNMA f88 = f90, f38, f88 + FNMA f120 = f122, f38, f120 + ;; + adds BOFFSET = 24 * SIZE, BOFFSET + adds BOFFSET2 = 24 * SIZE, BOFFSET2 + ;; + { .mfi + STFD [BOFFSET] = f67, SIZE + FMPY f65 = f65, f39 + } + { .mfi + STFD [BOFFSET2] = f99, SIZE + FMPY f97 = f97, f39 + } + ;; + { .mfi + STFD [BOFFSET] = f75, SIZE + FMPY f73 = f73, f39 + } + { .mfi + STFD [BOFFSET2] = f107, SIZE + FMPY f105 = f105, f39 + } + ;; + { .mfi + STFD [BOFFSET] = f83, SIZE + FMPY f81 = f81, f39 + } + { .mfi + STFD [BOFFSET2] = f115, SIZE + FMPY f113 = f113, f39 + } + ;; + { .mfi + STFD [BOFFSET] = f91, - 11 * SIZE + FMPY f89 = f89, f39 + } + { .mfi + STFD [BOFFSET2] = f123, - 11 * SIZE + FMPY f121 = f121, f39 + } + ;; + { .mfi + STFD [BOFFSET] = f66, SIZE + FNMA f64 = f65, f40, f64 + } + { .mfi + STFD [BOFFSET2] = f98, SIZE + FNMA f96 = f97, f40, f96 + } + ;; + { .mfi + STFD [BOFFSET] = f74, SIZE + FNMA f72 = f73, f40, f72 + } + { .mfi + STFD [BOFFSET2] = f106, SIZE + FNMA f104 = f105, f40, f104 + } + ;; + { .mfi + STFD [BOFFSET] = f82, SIZE + FNMA f80 = f81, f40, f80 + } + { .mfi + STFD [BOFFSET2] = f114, SIZE + FNMA f112 = f113, f40, f112 + } + ;; + { .mfi + STFD [BOFFSET] = f90, -11 * SIZE + FNMA f88 = f89, f40, f88 + } + { .mfi + STFD [BOFFSET2] = f122, -11 * SIZE + FNMA f120 = f121, f40, f120 + } + ;; + { .mfi + STFD [BOFFSET] = f65, SIZE + FMPY f64 = f64, f41 + } + { .mfi + STFD [BOFFSET2] = f97, 
SIZE + FMPY f96 = f96, f41 + } + ;; + { .mfi + STFD [BOFFSET] = f73, SIZE + FMPY f72 = f72, f41 + } + { .mfi + STFD [BOFFSET2] = f105, SIZE + FMPY f104 = f104, f41 + } + ;; + { .mfi + STFD [BOFFSET] = f81, SIZE + FMPY f80 = f80, f41 + } + { .mfi + STFD [BOFFSET2] = f113, SIZE + FMPY f112 = f112, f41 + } + ;; + { .mfi + STFD [BOFFSET] = f89, - 11 * SIZE + FMPY f88 = f88, f41 + } + { .mfi + STFD [BOFFSET2] = f121, - 11 * SIZE + FMPY f120 = f120, f41 + } + ;; + { .mmi + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f96, SIZE + adds C1 = -4 * SIZE, C1 + } + ;; + { .mmi + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f104, SIZE + adds C2 = -4 * SIZE, C2 + } + ;; + { .mmi + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f112, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f88, - 3 * SIZE + STFD [BOFFSET2] = f120, - 3 * SIZE + } + ;; + #endif + + #ifdef LT + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f36 = [AOFFSET], 1 * SIZE + ;; + LDFPD f37, f38 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f39, f40 = [AOFFSET] + adds AOFFSET = 5 * SIZE, AOFFSET + ;; + LDFD f41 = [AOFFSET], -15 * SIZE + ;; + { .mfi + FMPY f64 = f64, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f96 = f96, f32 + nop __LINE__ + } + ;; + { .mfi + FMPY f72 = f72, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f104 = f104, f32 + nop __LINE__ + } + ;; + { .mfi + FMPY f80 = f80, f32 + } + { .mfi + nop __LINE__ + FMPY f112 = f112, f32 + nop __LINE__ + } + ;; + { .mfi + FMPY f88 = f88, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f120 = f120, f32 + nop __LINE__ + } + ;; + { .mfi + FNMA f65 = f64, f33, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f97 = f96, f33, f97 + nop __LINE__ + } + ;; + { .mfi + FNMA f73 = f72, f33, f73 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f105 = f104, f33, f105 + nop __LINE__ + } + ;; + { .mfi + FNMA f81 = f80, f33, f81 + } + { .mfi + nop __LINE__ + FNMA f113 = f112, f33, f113 + nop __LINE__ + } + ;; + { .mfi + FNMA f89 = f88, f33, f89 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f121 = f120, f33, f121 + nop __LINE__ + } + ;; + { .mfi + FNMA f66 = f64, f34, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f98 = f96, f34, f98 + nop __LINE__ + } + ;; + { .mfi + FNMA f74 = f72, f34, f74 + } + { .mfi + nop __LINE__ + FNMA f106 = f104, f34, f106 + nop __LINE__ + } + ;; + { .mfi + FNMA f82 = f80, f34, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f114 = f112, f34, f114 + nop __LINE__ + } + ;; + { .mfi + FNMA f90 = f88, f34, f90 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f122 = f120, f34, f122 + nop __LINE__ + } + ;; + { .mfi + FNMA f67 = f64, f35, f67 + } + { .mfi + nop __LINE__ + FNMA f99 = f96, f35, f99 + nop __LINE__ + } + ;; + { .mfi + FNMA f75 = f72, f35, f75 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f107 = f104, f35, f107 + nop __LINE__ + } + ;; + { .mfi + FNMA f83 = f80, f35, f83 + } + { .mfi + nop __LINE__ + FNMA f115 = f112, f35, f115 + nop __LINE__ + } + ;; + { .mfi + FNMA f91 = f88, f35, f91 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f123 = f120, f35, f123 + nop __LINE__ + } + ;; + FMPY f65 = f65, f36 + FMPY f97 = f97, f36 + FMPY f73 = f73, f36 + FMPY f105 = f105, f36 + FMPY f81 = f81, f36 + FMPY f113 = f113, f36 + FMPY f89 = f89, f36 + FMPY f121 = f121, f36 + ;; + FNMA f66 = f65, f37, f66 + FNMA f98 = f97, f37, f98 + FNMA f74 = f73, f37, f74 + FNMA f106 = f105, f37, f106 + FNMA f82 = f81, f37, f82 + FNMA f114 = f113, f37, 
f114 + FNMA f90 = f89, f37, f90 + FNMA f122 = f121, f37, f122 + ;; + FNMA f67 = f65, f38, f67 + FNMA f99 = f97, f38, f99 + FNMA f75 = f73, f38, f75 + FNMA f107 = f105, f38, f107 + FNMA f83 = f81, f38, f83 + FNMA f115 = f113, f38, f115 + FNMA f91 = f89, f38, f91 + FNMA f123 = f121, f38, f123 + ;; + FMPY f66 = f66, f39 + FMPY f98 = f98, f39 + FMPY f74 = f74, f39 + FMPY f106 = f106, f39 + FMPY f82 = f82, f39 + FMPY f114 = f114, f39 + FMPY f90 = f90, f39 + FMPY f122 = f122, f39 + ;; + FNMA f67 = f66, f40, f67 + FNMA f99 = f98, f40, f99 + FNMA f75 = f74, f40, f75 + FNMA f107 = f106, f40, f107 + FNMA f83 = f82, f40, f83 + FNMA f115 = f114, f40, f115 + FNMA f91 = f90, f40, f91 + FNMA f123 = f122, f40, f123 + ;; + FMPY f67 = f67, f41 + FMPY f99 = f99, f41 + FMPY f75 = f75, f41 + FMPY f107 = f107, f41 + FMPY f83 = f83, f41 + FMPY f115 = f115, f41 + FMPY f91 = f91, f41 + FMPY f123 = f123, f41 + ;; + { .mfi + STFD [BOFFSET] = f64, SIZE + } + { .mfi + STFD [BOFFSET2] = f96, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f72, SIZE + } + { .mfi + STFD [BOFFSET2] = f104, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f80, SIZE + } + { .mfi + STFD [BOFFSET2] = f112, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f88, 5 * SIZE + } + { .mfi + STFD [BOFFSET2] = f120, 5 * SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f65, SIZE + } + { .mfi + STFD [BOFFSET2] = f97, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f73, SIZE + } + { .mfi + STFD [BOFFSET2] = f105, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f81, SIZE + } + { .mfi + STFD [BOFFSET2] = f113, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f89, 5 * SIZE + } + { .mfi + STFD [BOFFSET2] = f121, 5 * SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f66, SIZE + } + { .mfi + STFD [BOFFSET2] = f98, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f74, SIZE + } + { .mfi + STFD [BOFFSET2] = f106, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f82, SIZE + } + { .mfi + STFD [BOFFSET2] = f114, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f90, 5 * SIZE + } + { .mfi + STFD [BOFFSET2] = f122, 5 * SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f67, SIZE + } + { .mfi + STFD [BOFFSET2] = f99, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f75, SIZE + } + { .mfi + STFD [BOFFSET2] = f107, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f83, SIZE + } + { .mfi + STFD [BOFFSET2] = f115, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f91, -27 * SIZE + } + { .mfi + STFD [BOFFSET2] = f123, -27 * SIZE + } + ;; + #endif + + #ifdef RN + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f40 = [BOFFSET], 1 * SIZE + ;; + LDFPD f41, f42 = [BOFFSET], 2 * SIZE + ;; + LDFPD f43, f44 = [BOFFSET], 2 * SIZE + ;; + LDFPD f45, f46 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f47, f48 = [BOFFSET], 2 * SIZE + ;; + LDFPD f49, f50 = [BOFFSET], 2 * SIZE + ;; + LDFPD f51, f52 = [BOFFSET] + adds BOFFSET = 5 * SIZE, BOFFSET + ;; + LDFD f53 = [BOFFSET], 1 * SIZE + ;; + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + ;; + LDFPD f56, f57 = [BOFFSET] + adds BOFFSET = 6 * SIZE, BOFFSET + ;; + LDFPD f58, f59 = [BOFFSET], 2 * SIZE + ;; + LDFPD f60, f61 = [BOFFSET] + adds BOFFSET = 7 * SIZE, BOFFSET + ;; + LDFD f16 = [BOFFSET], 1 * SIZE + ;; + LDFPD f17, f18 = [BOFFSET] + adds BOFFSET = 8 * SIZE, BOFFSET + ;; + LDFPD f19, f20 = [BOFFSET] + adds BOFFSET = 9 * SIZE, BOFFSET + ;; + LDFD f21 = [BOFFSET] + adds BOFFSET = -63 * SIZE, BOFFSET + ;; + + + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + FMPY f66 = f66, f32 + FMPY f67 = 
f67, f32 + ;; + FNMA f72 = f64, f33, f72 + FNMA f73 = f65, f33, f73 + FNMA f74 = f66, f33, f74 + FNMA f75 = f67, f33, f75 + ;; + FNMA f80 = f64, f34, f80 + FNMA f81 = f65, f34, f81 + FNMA f82 = f66, f34, f82 + FNMA f83 = f67, f34, f83 + ;; + FNMA f88 = f64, f35, f88 + FNMA f89 = f65, f35, f89 + FNMA f90 = f66, f35, f90 + FNMA f91 = f67, f35, f91 + ;; + FNMA f96 = f64, f36, f96 + FNMA f97 = f65, f36, f97 + FNMA f98 = f66, f36, f98 + FNMA f99 = f67, f36, f99 + ;; + FNMA f104 = f64, f37, f104 + FNMA f105 = f65, f37, f105 + FNMA f106 = f66, f37, f106 + FNMA f107 = f67, f37, f107 + ;; + FNMA f112 = f64, f38, f112 + FNMA f113 = f65, f38, f113 + FNMA f114 = f66, f38, f114 + FNMA f115 = f67, f38, f115 + ;; + FNMA f120 = f64, f39, f120 + FNMA f121 = f65, f39, f121 + FNMA f122 = f66, f39, f122 + FNMA f123 = f67, f39, f123 + ;; + FMPY f72 = f72, f40 + FMPY f73 = f73, f40 + FMPY f74 = f74, f40 + FMPY f75 = f75, f40 + ;; + FNMA f80 = f72, f41, f80 + FNMA f81 = f73, f41, f81 + FNMA f82 = f74, f41, f82 + FNMA f83 = f75, f41, f83 + ;; + FNMA f88 = f72, f42, f88 + FNMA f89 = f73, f42, f89 + FNMA f90 = f74, f42, f90 + FNMA f91 = f75, f42, f91 + ;; + FNMA f96 = f72, f43, f96 + FNMA f97 = f73, f43, f97 + FNMA f98 = f74, f43, f98 + FNMA f99 = f75, f43, f99 + ;; + FNMA f104 = f72, f44, f104 + FNMA f105 = f73, f44, f105 + FNMA f106 = f74, f44, f106 + FNMA f107 = f75, f44, f107 + ;; + FNMA f112 = f72, f45, f112 + FNMA f113 = f73, f45, f113 + FNMA f114 = f74, f45, f114 + FNMA f115 = f75, f45, f115 + ;; + FNMA f120 = f72, f46, f120 + FNMA f121 = f73, f46, f121 + FNMA f122 = f74, f46, f122 + FNMA f123 = f75, f46, f123 + ;; + FMPY f80 = f80, f47 + FMPY f81 = f81, f47 + FMPY f82 = f82, f47 + FMPY f83 = f83, f47 + ;; + FNMA f88 = f80, f48, f88 + FNMA f89 = f81, f48, f89 + FNMA f90 = f82, f48, f90 + FNMA f91 = f83, f48, f91 + ;; + FNMA f96 = f80, f49, f96 + FNMA f97 = f81, f49, f97 + FNMA f98 = f82, f49, f98 + FNMA f99 = f83, f49, f99 + ;; + FNMA f104 = f80, f50, f104 + FNMA f105 = f81, f50, f105 + FNMA f106 = f82, f50, f106 + FNMA f107 = f83, f50, f107 + ;; + FNMA f112 = f80, f51, f112 + FNMA f113 = f81, f51, f113 + FNMA f114 = f82, f51, f114 + FNMA f115 = f83, f51, f115 + ;; + FNMA f120 = f80, f52, f120 + FNMA f121 = f81, f52, f121 + FNMA f122 = f82, f52, f122 + FNMA f123 = f83, f52, f123 + ;; + FMPY f88 = f88, f53 + FMPY f89 = f89, f53 + FMPY f90 = f90, f53 + FMPY f91 = f91, f53 + ;; + FNMA f96 = f88, f54, f96 + FNMA f97 = f89, f54, f97 + FNMA f98 = f90, f54, f98 + FNMA f99 = f91, f54, f99 + ;; + FNMA f104 = f88, f55, f104 + FNMA f105 = f89, f55, f105 + FNMA f106 = f90, f55, f106 + FNMA f107 = f91, f55, f107 + ;; + FNMA f112 = f88, f56, f112 + FNMA f113 = f89, f56, f113 + FNMA f114 = f90, f56, f114 + FNMA f115 = f91, f56, f115 + ;; + FNMA f120 = f88, f57, f120 + FNMA f121 = f89, f57, f121 + FNMA f122 = f90, f57, f122 + FNMA f123 = f91, f57, f123 + ;; + FMPY f96 = f96, f58 + FMPY f97 = f97, f58 + FMPY f98 = f98, f58 + FMPY f99 = f99, f58 + ;; + FNMA f104 = f96, f59, f104 + FNMA f105 = f97, f59, f105 + FNMA f106 = f98, f59, f106 + FNMA f107 = f99, f59, f107 + ;; + FNMA f112 = f96, f60, f112 + FNMA f113 = f97, f60, f113 + FNMA f114 = f98, f60, f114 + FNMA f115 = f99, f60, f115 + ;; + FNMA f120 = f96, f61, f120 + FNMA f121 = f97, f61, f121 + FNMA f122 = f98, f61, f122 + FNMA f123 = f99, f61, f123 + ;; + FMPY f104 = f104, f16 + FMPY f105 = f105, f16 + FMPY f106 = f106, f16 + FMPY f107 = f107, f16 + ;; + FNMA f112 = f104, f17, f112 + FNMA f113 = f105, f17, f113 + FNMA f114 = f106, f17, f114 + FNMA f115 = f107, f17, f115 + 
;; + FNMA f120 = f104, f18, f120 + FNMA f121 = f105, f18, f121 + FNMA f122 = f106, f18, f122 + FNMA f123 = f107, f18, f123 + ;; + FMPY f112 = f112, f19 + FMPY f113 = f113, f19 + FMPY f114 = f114, f19 + FMPY f115 = f115, f19 + ;; + FNMA f120 = f112, f20, f120 + FNMA f121 = f113, f20, f121 + FNMA f122 = f114, f20, f122 + FNMA f123 = f115, f20, f123 + ;; + FMPY f120 = f120, f21 + FMPY f121 = f121, f21 + FMPY f122 = f122, f21 + FMPY f123 = f123, f21 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f72, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f73, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f74, SIZE + ;; + STFD [AOFFSET] = f67, 5 * SIZE + STFD [AOFFSET2] = f75, 5 * SIZE + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f88, SIZE + ;; + STFD [AOFFSET] = f81, SIZE + STFD [AOFFSET2] = f89, SIZE + ;; + STFD [AOFFSET] = f82, SIZE + STFD [AOFFSET2] = f90, SIZE + ;; + STFD [AOFFSET] = f83, 5 * SIZE + STFD [AOFFSET2] = f91, 5 * SIZE + ;; + STFD [AOFFSET] = f96, SIZE + STFD [AOFFSET2] = f104, SIZE + ;; + STFD [AOFFSET] = f97, SIZE + STFD [AOFFSET2] = f105, SIZE + ;; + STFD [AOFFSET] = f98, SIZE + STFD [AOFFSET2] = f106, SIZE + ;; + STFD [AOFFSET] = f99, 5 * SIZE + STFD [AOFFSET2] = f107, 5 * SIZE + ;; + STFD [AOFFSET] = f112, SIZE + STFD [AOFFSET2] = f120, SIZE + ;; + STFD [AOFFSET] = f113, SIZE + STFD [AOFFSET2] = f121, SIZE + ;; + STFD [AOFFSET] = f114, SIZE + STFD [AOFFSET2] = f122, SIZE + ;; + STFD [AOFFSET] = f115, -27 * SIZE + STFD [AOFFSET2] = f123, - 27 * SIZE + ;; + #endif + + #ifdef RT + adds BOFFSET = 62 * SIZE, BOFFSET + ;; + LDFPD f33, f32 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f35, f34 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f37, f36 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f39, f38 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFD f40 = [BOFFSET], -2 * SIZE + ;; + LDFPD f42, f41 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f44, f43 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f46, f45 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f48, f47 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f50, f49 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f52, f51 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFD f53 = [BOFFSET], -2 * SIZE + ;; + LDFPD f55, f54 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f57, f56 = [BOFFSET] + adds BOFFSET = - 6 * SIZE, BOFFSET + ;; + LDFPD f59, f58 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f61, f60 = [BOFFSET] + adds BOFFSET = - 6 * SIZE, BOFFSET + ;; + LDFD f16 = [BOFFSET], -2 * SIZE + ;; + LDFPD f18, f17 = [BOFFSET] + adds BOFFSET = - 8 * SIZE, BOFFSET + ;; + LDFPD f20, f19 = [BOFFSET] + adds BOFFSET = - 8 * SIZE, BOFFSET + ;; + LDFD f21 = [BOFFSET] + ;; + + FMPY f120 = f120, f32 + FMPY f121 = f121, f32 + FMPY f122 = f122, f32 + FMPY f123 = f123, f32 + ;; + FNMA f112 = f120, f33, f112 + FNMA f113 = f121, f33, f113 + FNMA f114 = f122, f33, f114 + FNMA f115 = f123, f33, f115 + ;; + FNMA f104 = f120, f34, f104 + FNMA f105 = f121, f34, f105 + FNMA f106 = f122, f34, f106 + FNMA f107 = f123, f34, f107 + ;; + FNMA f96 = f120, f35, f96 + FNMA f97 = f121, f35, f97 + FNMA f98 = f122, f35, f98 + FNMA f99 = f123, f35, f99 + ;; + FNMA f88 = f120, f36, f88 + FNMA f89 = f121, f36, f89 + FNMA f90 = f122, f36, f90 + FNMA f91 = f123, f36, f91 + ;; + FNMA f80 = f120, f37, f80 + FNMA f81 = f121, f37, f81 + FNMA f82 = f122, f37, f82 + FNMA f83 
= f123, f37, f83 + ;; + FNMA f72 = f120, f38, f72 + FNMA f73 = f121, f38, f73 + FNMA f74 = f122, f38, f74 + FNMA f75 = f123, f38, f75 + ;; + FNMA f64 = f120, f39, f64 + FNMA f65 = f121, f39, f65 + FNMA f66 = f122, f39, f66 + FNMA f67 = f123, f39, f67 + ;; + FMPY f112 = f112, f40 + FMPY f113 = f113, f40 + FMPY f114 = f114, f40 + FMPY f115 = f115, f40 + ;; + FNMA f104 = f112, f41, f104 + FNMA f105 = f113, f41, f105 + FNMA f106 = f114, f41, f106 + FNMA f107 = f115, f41, f107 + ;; + FNMA f96 = f112, f42, f96 + FNMA f97 = f113, f42, f97 + FNMA f98 = f114, f42, f98 + FNMA f99 = f115, f42, f99 + ;; + FNMA f88 = f112, f43, f88 + FNMA f89 = f113, f43, f89 + FNMA f90 = f114, f43, f90 + FNMA f91 = f115, f43, f91 + ;; + FNMA f80 = f112, f44, f80 + FNMA f81 = f113, f44, f81 + FNMA f82 = f114, f44, f82 + FNMA f83 = f115, f44, f83 + ;; + FNMA f72 = f112, f45, f72 + FNMA f73 = f113, f45, f73 + FNMA f74 = f114, f45, f74 + FNMA f75 = f115, f45, f75 + ;; + FNMA f64 = f112, f46, f64 + FNMA f65 = f113, f46, f65 + FNMA f66 = f114, f46, f66 + FNMA f67 = f115, f46, f67 + ;; + FMPY f104 = f104, f47 + FMPY f105 = f105, f47 + FMPY f106 = f106, f47 + FMPY f107 = f107, f47 + ;; + FNMA f96 = f104, f48, f96 + FNMA f97 = f105, f48, f97 + FNMA f98 = f106, f48, f98 + FNMA f99 = f107, f48, f99 + ;; + FNMA f88 = f104, f49, f88 + FNMA f89 = f105, f49, f89 + FNMA f90 = f106, f49, f90 + FNMA f91 = f107, f49, f91 + ;; + FNMA f80 = f104, f50, f80 + FNMA f81 = f105, f50, f81 + FNMA f82 = f106, f50, f82 + FNMA f83 = f107, f50, f83 + ;; + FNMA f72 = f104, f51, f72 + FNMA f73 = f105, f51, f73 + FNMA f74 = f106, f51, f74 + FNMA f75 = f107, f51, f75 + ;; + FNMA f64 = f104, f52, f64 + FNMA f65 = f105, f52, f65 + FNMA f66 = f106, f52, f66 + FNMA f67 = f107, f52, f67 + ;; + FMPY f96 = f96, f53 + FMPY f97 = f97, f53 + FMPY f98 = f98, f53 + FMPY f99 = f99, f53 + ;; + FNMA f88 = f96, f54, f88 + FNMA f89 = f97, f54, f89 + FNMA f90 = f98, f54, f90 + FNMA f91 = f99, f54, f91 + ;; + FNMA f80 = f96, f55, f80 + FNMA f81 = f97, f55, f81 + FNMA f82 = f98, f55, f82 + FNMA f83 = f99, f55, f83 + ;; + FNMA f72 = f96, f56, f72 + FNMA f73 = f97, f56, f73 + FNMA f74 = f98, f56, f74 + FNMA f75 = f99, f56, f75 + ;; + FNMA f64 = f96, f57, f64 + FNMA f65 = f97, f57, f65 + FNMA f66 = f98, f57, f66 + FNMA f67 = f99, f57, f67 + ;; + FMPY f88 = f88, f58 + FMPY f89 = f89, f58 + FMPY f90 = f90, f58 + FMPY f91 = f91, f58 + ;; + FNMA f80 = f88, f59, f80 + FNMA f81 = f89, f59, f81 + FNMA f82 = f90, f59, f82 + FNMA f83 = f91, f59, f83 + ;; + FNMA f72 = f88, f60, f72 + FNMA f73 = f89, f60, f73 + FNMA f74 = f90, f60, f74 + FNMA f75 = f91, f60, f75 + ;; + FNMA f64 = f88, f61, f64 + FNMA f65 = f89, f61, f65 + FNMA f66 = f90, f61, f66 + FNMA f67 = f91, f61, f67 + ;; + FMPY f80 = f80, f16 + FMPY f81 = f81, f16 + FMPY f82 = f82, f16 + FMPY f83 = f83, f16 + ;; + FNMA f72 = f80, f17, f72 + FNMA f73 = f81, f17, f73 + FNMA f74 = f82, f17, f74 + FNMA f75 = f83, f17, f75 + ;; + FNMA f64 = f80, f18, f64 + FNMA f65 = f81, f18, f65 + FNMA f66 = f82, f18, f66 + FNMA f67 = f83, f18, f67 + ;; + FMPY f72 = f72, f19 + FMPY f73 = f73, f19 + FMPY f74 = f74, f19 + FMPY f75 = f75, f19 + ;; + FNMA f64 = f72, f20, f64 + FNMA f65 = f73, f20, f65 + FNMA f66 = f74, f20, f66 + FNMA f67 = f75, f20, f67 + ;; + FMPY f64 = f64, f21 + FMPY f65 = f65, f21 + FMPY f66 = f66, f21 + FMPY f67 = f67, f21 + ;; + adds AOFFSET = 24 * SIZE, AOFFSET + adds AOFFSET2 = 24 * SIZE, AOFFSET2 + ;; + STFD [AOFFSET] = f112, SIZE + STFD [AOFFSET2] = f120, SIZE + ;; + STFD [AOFFSET] = f113, SIZE + STFD [AOFFSET2] = f121, SIZE 
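+ // remaining stores of the solved 4x8 tile back to the A buffer via AOFFSET/AOFFSET2 (RT path); the C column pointers are updated after the #endif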
+ ;; + STFD [AOFFSET] = f114, SIZE + STFD [AOFFSET2] = f122, SIZE + ;; + STFD [AOFFSET] = f115, - 11 * SIZE + STFD [AOFFSET2] = f123, - 11 * SIZE + ;; + STFD [AOFFSET] = f96, SIZE + STFD [AOFFSET2] = f104, SIZE + ;; + STFD [AOFFSET] = f97, SIZE + STFD [AOFFSET2] = f105, SIZE + ;; + STFD [AOFFSET] = f98, SIZE + STFD [AOFFSET2] = f106, SIZE + ;; + STFD [AOFFSET] = f99, - 11 * SIZE + STFD [AOFFSET2] = f107, - 11 * SIZE + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f88, SIZE + ;; + STFD [AOFFSET] = f81, SIZE + STFD [AOFFSET2] = f89, SIZE + ;; + STFD [AOFFSET] = f82, SIZE + STFD [AOFFSET2] = f90, SIZE + ;; + STFD [AOFFSET] = f83, - 11 * SIZE + STFD [AOFFSET2] = f91, - 11 * SIZE + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f72, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f73, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f74, SIZE + ;; + STFD [AOFFSET] = f67, - 3 * SIZE + STFD [AOFFSET2] = f75, - 3 * SIZE + ;; + + #endif + { .mmf + STFD [C1 ] = f64, SIZE + mov f64 = f0 + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + } + ;; + { .mmi + STFD [C1 ] = f66, SIZE + #ifdef LN + adds C3 = -4 * SIZE, C3 + #else + nop __LINE__ + #endif + } + ;; + { .mmi + #ifndef LN + STFD [C1 ] = f67, SIZE + #else + STFD [C1 ] = f67, - 3 * SIZE + #endif + } + ;; + { .mmf + STFD [C2 ] = f72, SIZE + mov f72 = f0 + } + ;; + { .mmi + STFD [C2 ] = f73, SIZE + #ifdef LN + adds C4 = -4 * SIZE, C4 + #else + nop __LINE__ + #endif + } + ;; + { .mmi + STFD [C2 ] = f74, SIZE + } + ;; + { .mmi + #ifndef LN + STFD [C2 ] = f75, SIZE + #else + STFD [C2 ] = f75, - 3 * SIZE + #endif + #ifdef LN + adds C5 = -4 * SIZE, C5 + #else + nop __LINE__ + #endif + } + ;; + { .mmf + STFD [C3 ] = f80, SIZE + mov f80 = f0 + } + ;; + { .mmi + STFD [C3 ] = f81, SIZE + } + ;; + { .mmi + STFD [C3 ] = f82, SIZE + #ifdef LN + adds C6 = -4 * SIZE, C6 + #else + nop __LINE__ + #endif + } + ;; + { .mmi + #ifndef LN + STFD [C3 ] = f83, SIZE + #else + STFD [C3 ] = f83, - 3 * SIZE + #endif + } + ;; + { .mmf + STFD [C4 ] = f88, SIZE + mov f88 = f0 + } + ;; + { .mmi + STFD [C4 ] = f89, SIZE + #ifdef LN + adds C8 = -4 * SIZE, C8 + #else + nop __LINE__ + #endif + } + ;; + { .mmi + STFD [C4 ] = f90, SIZE + } + ;; + { .mmi + #ifndef LN + STFD [C4 ] = f91, SIZE + #else + STFD [C4 ] = f91, - 3 * SIZE + #endif + nop __LINE__ + } + ;; + { .mmf + STFD [C5 ] = f96, SIZE + mov f96 = f0 + } + ;; + { .mmi + STFD [C5 ] = f97, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [C5 ] = f98, SIZE + #ifdef LN + adds C7 = -4 * SIZE, C7 + #else + nop __LINE__ + #endif + } + ;; + { .mmi + #ifndef LN + STFD [C5 ] = f99, SIZE + #else + STFD [C5 ] = f99, - 3 * SIZE + #endif + } + ;; + { .mmf + STFD [C6 ] = f104, SIZE + mov f104 = f0 + } + ;; + { .mmi + STFD [C6 ] = f105, SIZE + shladd r2 = K, BASE_SHIFT, r0 + } + ;; + { .mmi + STFD [C6 ] = f106, SIZE + sub L = K, KK + } + ;; + { .mmi + #ifndef LN + STFD [C6 ] = f107, SIZE + #else + STFD [C6 ] = f107, - 3 * SIZE + #endif + #ifdef RT + shladd AORIG = r2, 2, AORIG + #else + nop __LINE__ + #endif + } + ;; + { .mmf + STFD [C7 ] = f112, SIZE + mov f112 = f0 + } + ;; + { .mmi + STFD [C7 ] = f113, SIZE + #if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 + #else + nop __LINE__ + #endif + } + ;; + { .mmi + STFD [C7 ] = f114, SIZE + #if defined(LT) || defined(RN) + shladd AOFFSET = L, 2, AOFFSET + #else + nop __LINE__ + #endif + } + ;; + { .mmi + #ifndef LN + STFD [C7 ] = f115, SIZE + #else + STFD [C7 ] = f115, - 3 * SIZE + #endif + #if defined(LT) || defined(RN) + shladd BOFFSET = L, 3, BOFFSET + #else + 
nop __LINE__ + #endif + } + ;; + { .mmf + STFD [C8 ] = f120, SIZE + mov f120 = f0 + } + ;; + { .mmi + STFD [C8 ] = f121, SIZE + #ifdef LT + adds KK = 4, KK + #elif defined LN + adds KK = -4, KK + #else + nop __LINE__ + #endif + } + ;; + { .mmi + STFD [C8 ] = f122, SIZE + #if defined(LT) || defined(RN) + mov L = KK + #else + sub L = K, KK + #endif + } + ;; + { .mmb + #ifndef LN + STFD [C8 ] = f123, SIZE + #else + STFD [C8 ] = f123, - 3 * SIZE + #endif + } + ;; + .align 8 + +.L010: + { .mib + cmp.gt p6, p0 = 8, M + shr I = M, 3 + (p6) br.cond.dpnt .L049 + } + ;; + .align 8 + +.L011: + { .mmi + cmp.ne p7, p0 = r0, L + shladd r3 = KK, BASE_SHIFT, r0 + shl r2 = K, 3 + BASE_SHIFT + } + ;; + { .mmi + shladd BOFFSET = r3, 3, B + sub AORIG = AORIG, r2 + nop __LINE__ + } + ;; + { .mmf + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + setf.d f64 = r0 + mov f72 = f0 + } + { .mfi + setf.d f80 = r0 + mov f88 = f0 + shladd AOFFSET = r3, 3, AORIG + } + ;; + { .mmf + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + setf.d f96 = r0 + mov f104 = f0 + } + { .mfb + setf.d f112 = r0 + mov f120 = f0 + nop __LINE__ + } + ;; + { .mmf + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + setf.d f65 = r0 + mov f73 = f0 + } + { .mfb + setf.d f89 = r0 + mov f81 = f0 + nop __LINE__ + } + ;; + { .mmf + (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + setf.d f97 = r0 + mov f105 = f0 + } + { .mfb + setf.d f113 = r0 + mov f121 = f0 + nop __LINE__ + } + ;; + { .mmf + (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + setf.d f66 = r0 + mov f74 = f0 + } + { .mfb + setf.d f82 = r0 + mov f90 = f0 + nop __LINE__ + } + ;; + { .mmf + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + setf.d f98 = r0 + mov f106 = f0 + } + { .mfi + setf.d f114 = r0 + mov f122 = f0 + adds PREC = CPREFETCHSIZE * SIZE, C1 + } + ;; + { .mmf + (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + setf.d f67 = r0 + mov f75 = f0 + } + { .mfi + setf.d f83 = r0 + mov f91 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mmf + (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + setf.d f99 = r0 + mov f107 = f0 + } + { .mfi + setf.d f115 = r0 + mov f123 = f0 + adds L = 1, L + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f68 = r0 + mov f76 = f0 + } + { .mfi + setf.d f84 = r0 + mov f92 = f0 + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f100 = r0 + mov f108 = f0 + } + { .mfi + setf.d f116 = r0 + mov f124 = f0 + adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f69 = r0 + mov f77 = f0 + } + { .mfi + setf.d f85 = r0 + mov f93 = f0 + tbit.z p12, p0 = L, 0 + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f101 = r0 + mov f109 = f0 + } + { .mfi + setf.d f117 = r0 + mov f125 = f0 + shr L = L, 1 + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f70 = r0 + mov f78 = f0 + } + { .mfi + setf.d f86 = r0 + mov f94 = f0 + adds L = -1, L + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f102 = r0 + mov f110 = f0 + } + { .mfi + setf.d f118 = r0 + mov f126 = f0 + mov ar.lc = L + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f71 = r0 + mov f79 = f0 + } + { .mfi + setf.d f87 = r0 + mov f95 = f0 + cmp.eq p6, p0 = -1, L + } + ;; + { .mmf + CPREFETCH [PREC] + setf.d f103 = r0 + mov f111 = f0 + } + { .mfb + setf.d f119 = r0 + mov f127 = f0 + (p6) br.cond.dpnt .L018 + } + ;; + .align 8 + +.L012: +/* 1 */ + { .mfi + lfetch.fault.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + (p12) cmp.ne p3, p0 = 0, L + FMA f72 = f32, f49, f72 // A1 * B2 + nop __LINE__ + } + ;; +/* 2 */ + { .mfb + lfetch.nt1 
[PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + cmp.ne p4, p5 = 0, L + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; +/* 3 */ + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + adds C9 = 4 * SIZE, C1 + FMA f104 = f32, f53, f104 // A1 * B6 + nop __LINE__ + } + ;; +/* 4 */ + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + adds C10 = 4 * SIZE, C2 + FMA f120 = f32, f55, f120 // A1 * B8 + nop __LINE__ + } + ;; +/* 5 */ + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + adds C11 = 4 * SIZE, C3 + FMA f73 = f33, f49, f73 // A2 * B2 + nop __LINE__ + } + ;; +/* 6 */ + { .mfb + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + adds C12 = 4 * SIZE, C4 + FMA f89 = f33, f51, f89 // A2 * B4 + nop __LINE__ + } + ;; +/* 7 */ + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + adds C13 = 4 * SIZE, C5 + FMA f105 = f33, f53, f105 // A2 * B6 + nop __LINE__ + } + ;; +/* 8 */ + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + adds C14 = 4 * SIZE, C6 + FMA f121 = f33, f55, f121 // A2 * B8 + nop __LINE__ + } + ;; +/* 9 */ + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + adds C15 = 4 * SIZE, C7 + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; +/* 10 */ + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + adds C16 = 4 * SIZE, C8 + FMA f90 = f34, f51, f90 // A3 * B4 + nop __LINE__ + } + ;; +/* 11 */ + { .mfb + FMA f98 = f34, f52, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f106 = f34, f53, f106 // A3 * B6 + nop __LINE__ + } + ;; +/* 12 */ + { .mfb + FMA f114 = f34, f54, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f122 = f34, f55, f122 // A3 * B8 + nop __LINE__ + } + ;; +/* 13 */ + { .mfb + nop __LINE__ + FMA f67 = f35, f48, f67 // A4 * B1 + } + { .mfb + nop __LINE__ + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + ;; +/* 14 */ + { .mfb + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f91 = f35, f51, f91 // A4 * B4 + nop __LINE__ + } + ;; +/* 15 */ + { .mfb + FMA f99 = f35, f52, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f107 = f35, f53, f107 // A4 * B6 + nop __LINE__ + } + ;; +/* 16 */ + { .mfb + FMA f115 = f35, f54, f115 // A4 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f123 = f35, f55, f123 // A4 * B8 + nop __LINE__ + } + ;; +/* 17 */ + { .mfb + nop __LINE__ + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f76 = f36, f49, f76 // A5 * B2 + nop __LINE__ + } + ;; +/* 18 */ + { .mfb + nop __LINE__ + FMA f84 = f36, f50, f84 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f92 = f36, f51, f92 // A5 * B4 + nop __LINE__ + } + ;; +/* 19 */ + { .mfb + nop __LINE__ + FMA f100 = f36, f52, f100 // A5 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f108 = f36, f53, f108 // A5 * B6 + nop __LINE__ + } + ;; +/* 20 */ + { .mfb + nop __LINE__ + FMA f116 = f36, f54, f116 // A5 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f124 = f36, f55, f124 // A5 * B8 + nop 
__LINE__ + } + ;; +/* 21 */ + { .mfb + nop __LINE__ + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f77 = f37, f49, f77 // A6 * B2 + nop __LINE__ + } + ;; +/* 22 */ + { .mfb + nop __LINE__ + FMA f85 = f37, f50, f85 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f93 = f37, f51, f93 // A6 * B4 + nop __LINE__ + } + ;; +/* 23 */ + { .mfb + nop __LINE__ + FMA f101 = f37, f52, f101 // A6 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f109 = f37, f53, f109 // A6 * B6 + nop __LINE__ + } + ;; +/* 24 */ + { .mfb + nop __LINE__ + FMA f117 = f37, f54, f117 // A6 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f125 = f37, f55, f125 // A6 * B8 + nop __LINE__ + } + ;; +/* 25 */ + { .mfb + nop __LINE__ + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f78 = f38, f49, f78 // A7 * B2 + nop __LINE__ + } + ;; +/* 26 */ + { .mfb + nop __LINE__ + FMA f86 = f38, f50, f86 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f94 = f38, f51, f94 // A7 * B4 + nop __LINE__ + } + ;; +/* 27 */ + { .mfb + nop __LINE__ + FMA f102 = f38, f52, f102 // A7 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f110 = f38, f53, f110 // A7 * B6 + nop __LINE__ + } + ;; +/* 28 */ + { .mfb + nop __LINE__ + FMA f118 = f38, f54, f118 // A7 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f126 = f38, f55, f126 // A7 * B8 + nop __LINE__ + } + ;; +/* 29 */ + { .mfb + nop __LINE__ + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f79 = f39, f49, f79 // A8 * B2 + nop __LINE__ + } + ;; +/* 30 */ + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f87 = f39, f50, f87 // A8 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f95 = f39, f51, f95 // A8 * B4 + nop __LINE__ + } + ;; +/* 31 */ + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f103 = f39, f52, f103 // A8 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f111 = f39, f53, f111 // A8 * B6 + nop __LINE__ + } + ;; +/* 32 */ + { .mfb + nop __LINE__ + FMA f119 = f39, f54, f119 // A8 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f127 = f39, f55, f127 // A8 * B8 + nop __LINE__ + } + ;; +/* 33 */ + { .mfb + nop __LINE__ + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; +/* 34 */ + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; +/* 35 */ + { .mfb + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f104 = f40, f61, f104 // A1 * B6 + nop __LINE__ + } + ;; +/* 36 */ + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f120 = f40, f63, f120 // A1 * B8 + nop __LINE__ + } + ;; +/* 37 */ + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; +/* 38 */ + { .mfb + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; +/* 39 */ + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f97 = f41, f60, f97 
// A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f105 = f41, f61, f105 // A2 * B6 + nop __LINE__ + } + ;; +/* 40 */ + { .mfb + nop __LINE__ + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f121 = f41, f63, f121 // A2 * B8 + nop __LINE__ + } + ;; + /* 41 */ + { .mfb + nop __LINE__ + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; +/* 42 */ + { .mfb + nop __LINE__ + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f90 = f42, f59, f90 // A3 * B4 + nop __LINE__ + } + ;; +/* 43 */ + { .mfb + nop __LINE__ + (p3) FMA f98 = f42, f60, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f106 = f42, f61, f106 // A3 * B6 + nop __LINE__ + } + ;; +/* 44 */ + { .mfb + nop __LINE__ + (p3) FMA f114 = f42, f62, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f122 = f42, f63, f122 // A3 * B8 + nop __LINE__ + } + ;; +/* 45 */ + { .mfb + nop __LINE__ + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; +/* 46 */ + { .mfb + nop __LINE__ + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f91 = f43, f59, f91 // A4 * B4 + nop __LINE__ + } + ;; +/* 47 */ + { .mfb + nop __LINE__ + (p3) FMA f99 = f43, f60, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f107 = f43, f61, f107 // A4 * B6 + nop __LINE__ + } + ;; +/* 48 */ + { .mfb + nop __LINE__ + (p3) FMA f115 = f43, f62, f115 // A4 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f123 = f43, f63, f123 // A4 * B8 + nop __LINE__ + } + ;; +/* 49 */ + { .mfb + nop __LINE__ + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f76 = f44, f57, f76 // A5 * B2 + nop __LINE__ + } + ;; +/* 50 */ + { .mfb + nop __LINE__ + (p3) FMA f84 = f44, f58, f84 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f92 = f44, f59, f92 // A5 * B4 + nop __LINE__ + } + ;; +/* 51 */ + { .mfb + nop __LINE__ + (p3) FMA f100 = f44, f60, f100 // A5 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f108 = f44, f61, f108 // A5 * B6 + nop __LINE__ + } + ;; +/* 52 */ + { .mfb + nop __LINE__ + (p3) FMA f116 = f44, f62, f116 // A5 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f124 = f44, f63, f124 // A5 * B8 + nop __LINE__ + } + ;; +/* 53 */ + { .mfb + nop __LINE__ + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f77 = f45, f57, f77 // A6 * B2 + nop __LINE__ + } + ;; +/* 54 */ + { .mfb + nop __LINE__ + (p3) FMA f85 = f45, f58, f85 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f93 = f45, f59, f93 // A6 * B4 + nop __LINE__ + } + ;; +/* 55 */ + { .mfb + nop __LINE__ + (p3) FMA f101 = f45, f60, f101 // A6 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f109 = f45, f61, f109 // A6 * B6 + nop __LINE__ + } + ;; +/* 56 */ + { .mfb + nop __LINE__ + (p3) FMA f117 = f45, f62, f117 // A6 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f125 = f45, f63, f125 // A6 * B8 + nop __LINE__ + } + ;; +/* 57 */ + { .mfb + nop __LINE__ + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f78 = f46, f57, f78 // A7 * B2 + nop __LINE__ + } + ;; +/* 58 */ + { .mfb + nop __LINE__ + (p3) FMA f86 = f46, f58, f86 // 
A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f94 = f46, f59, f94 // A7 * B4 + nop __LINE__ + } + ;; +/* 59 */ + { .mfb + nop __LINE__ + (p3) FMA f102 = f46, f60, f102 // A7 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f110 = f46, f61, f110 // A7 * B6 + nop __LINE__ + } + ;; +/* 60 */ + { .mfb + nop __LINE__ + (p3) FMA f118 = f46, f62, f118 // A7 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f126 = f46, f63, f126 // A7 * B8 + nop __LINE__ + } + ;; +/* 61 */ + { .mfb + nop __LINE__ + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f79 = f47, f57, f79 // A8 * B2 + nop __LINE__ + } + ;; +/* 62 */ + { .mfb + nop __LINE__ + (p3) FMA f87 = f47, f58, f87 // A8 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f95 = f47, f59, f95 // A8 * B4 + nop __LINE__ + } + ;; +/* 63 */ + { .mfb + nop __LINE__ + (p3) FMA f103 = f47, f60, f103 // A8 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f111 = f47, f61, f111 // A8 * B6 + nop __LINE__ + } + ;; +/* 64 */ + { .mfi + nop __LINE__ + (p3) FMA f119 = f47, f62, f119 // A8 * B7 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f127 = f47, f63, f127 // A8 * B8 + br.cloop.sptk.few .L012 + } + ;; + +.L018: + adds r2 = -8, KK + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 3, AORIG + shladd BOFFSET = r2, 3, B + ;; + + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [BOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [BOFFSET], 2 * SIZE + ;; + + LDFPD f44, f45 = [BOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [BOFFSET], 2 * SIZE + ;; + { .mfi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FSUB f64 = f32, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f72 = f33, f72 + nop __LINE__ + } + ;; + { .mfi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + FSUB f80 = f34, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f88 = f35, f88 + nop __LINE__ + } + ;; + { .mfi + LDFPD f52, f53 = [BOFFSET], 2 * SIZE + FSUB f96 = f36, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f104 = f37, f104 + nop __LINE__ + } + ;; + { .mfi + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + FSUB f112 = f38, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f120 = f39, f120 + nop __LINE__ + } + ;; + { .mfi + LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FSUB f65 = f40, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f73 = f41, f73 + nop __LINE__ + } + ;; + { .mfi + LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FSUB f81 = f42, f81 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f89 = f43, f89 + nop __LINE__ + } + ;; + { .mfi + LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FSUB f97 = f44, f97 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f105 = f45, f105 + nop __LINE__ + } + ;; + { .mfi + LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FSUB f113 = f46, f113 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f121 = f47, f121 + nop __LINE__ + } + ;; + { .mfi + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + FSUB f66 = f48, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f74 = f49, f74 + nop __LINE__ + } + ;; + { .mfi + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + FSUB f82 = f50, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f90 = f51, f90 + nop __LINE__ + } + ;; + { .mfi + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + FSUB f98 = f52, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f106 = f53, f106 + nop __LINE__ + } + ;; + { .mfi + LDFPD f38, f39 = [BOFFSET], 2 * 
SIZE + FSUB f114 = f54, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f122 = f55, f122 + nop __LINE__ + } + ;; + { .mfi + LDFPD f40, f41 = [BOFFSET], 2 * SIZE + FSUB f67 = f56, f67 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f75 = f57, f75 + nop __LINE__ + } + ;; + { .mfi + LDFPD f42, f43 = [BOFFSET], 2 * SIZE + FSUB f83 = f58, f83 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f91 = f59, f91 + nop __LINE__ + } + ;; + { .mfi + LDFPD f44, f45 = [BOFFSET], 2 * SIZE + FSUB f99 = f60, f99 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f107 = f61, f107 + nop __LINE__ + } + ;; + { .mfi + LDFPD f46, f47 = [BOFFSET], 2 * SIZE + FSUB f115 = f62, f115 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f123 = f63, f123 + nop __LINE__ + } + ;; + { .mfi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FSUB f68 = f32, f68 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f76 = f33, f76 + nop __LINE__ + } + ;; + { .mfi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + FSUB f84 = f34, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f92 = f35, f92 + nop __LINE__ + } + ;; + { .mfi + LDFPD f52, f53 = [BOFFSET], 2 * SIZE + FSUB f100 = f36, f100 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f108 = f37, f108 + nop __LINE__ + } + ;; + { .mfi + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + FSUB f116 = f38, f116 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f124 = f39, f124 + nop __LINE__ + } + ;; + { .mfi + LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FSUB f69 = f40, f69 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f77 = f41, f77 + nop __LINE__ + } + ;; + { .mfi + LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FSUB f85 = f42, f85 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f93 = f43, f93 + nop __LINE__ + } + ;; + { .mfi + LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FSUB f101 = f44, f101 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f109 = f45, f109 + nop __LINE__ + } + ;; + { .mfi + LDFPD f62, f63 = [BOFFSET] + FSUB f117 = f46, f117 + adds BOFFSET = -62 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FSUB f125 = f47, f125 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f70 = f48, f70 +#ifdef LN + adds AOFFSET = 62 * SIZE, AOFFSET +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FSUB f78 = f49, f78 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f86 = f50, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f94 = f51, f94 + nop __LINE__ + } + ;; + { .mfi +#ifdef LN + LDFPD f33, f32 = [AOFFSET] +#else + LDFPD f32, f33 = [AOFFSET] +#endif + FSUB f102 = f52, f102 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f110 = f53, f110 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f118 = f54, f118 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f126 = f55, f126 +#ifdef LN + adds AOFFSET = - 2 * SIZE, AOFFSET +#else + adds AOFFSET = 2 * SIZE, AOFFSET +#endif + } + ;; + { .mfi + nop __LINE__ + FSUB f71 = f56, f71 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f79 = f57, f79 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f87 = f58, f87 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f95 = f59, f95 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f103 = f60, f103 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f111 = f61, f111 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f119 = f62, f119 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f127 = f63, f127 + nop __LINE__ + } + ;; + + { .mfi + LDFPD f35, f34 = [AOFFSET] + FMPY f71 = f71, f32 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMPY f103 = f103, f32 + adds BOFFSET2 = 4 * SIZE, BOFFSET + } + ;; + { .mfi + 
LDFPD f37, f36 = [AOFFSET] + FMPY f79 = f79, f32 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMPY f111 = f111, f32 + nop __LINE__ + } + ;; + { .mfi + LDFPD f39, f38 = [AOFFSET] + FMPY f87 = f87, f32 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMPY f119 = f119, f32 + nop __LINE__ + } + ;; + { .mfi + LDFD f40 = [AOFFSET], -2 * SIZE + FMPY f95 = f95, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f127 = f127, f32 + nop __LINE__ + } + ;; + { .mfi + LDFPD f42, f41 = [AOFFSET] + FNMA f70 = f71, f33, f70 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f102 = f103, f33, f102 + nop __LINE__ + } + ;; + { .mfi + LDFPD f44, f43 = [AOFFSET] + FNMA f78 = f79, f33, f78 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f110 = f111, f33, f110 + nop __LINE__ + } + ;; + { .mfi + LDFPD f46, f45 = [AOFFSET] + FNMA f86 = f87, f33, f86 + adds AOFFSET = - 4 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f118 = f119, f33, f118 + nop __LINE__ + } + ;; + { .mfi + LDFPD f48, f47 = [AOFFSET] + FNMA f94 = f95, f33, f94 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f126 = f127, f33, f126 + nop __LINE__ + } + ;; + { .mfi + LDFPD f50, f49 = [AOFFSET] + FNMA f69 = f71, f34, f69 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f101 = f103, f34, f101 + nop __LINE__ + } + ;; + { .mfi + LDFPD f52, f51 = [AOFFSET] + FNMA f77 = f79, f34, f77 + adds AOFFSET = - 4 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f109 = f111, f34, f109 + nop __LINE__ + } + ;; + { .mfi + LDFD f53 = [AOFFSET], -2 * SIZE + FNMA f85 = f87, f34, f85 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f117 = f119, f34, f117 + nop __LINE__ + } + ;; + { .mfi + LDFPD f55, f54 = [AOFFSET] + FNMA f93 = f95, f34, f93 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f125 = f127, f34, f125 + nop __LINE__ + } + ;; + { .mfi + LDFPD f57, f56 = [AOFFSET] + FNMA f68 = f71, f35, f68 + adds AOFFSET = - 6 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f100 = f103, f35, f100 + nop __LINE__ + } + ;; + { .mfi + LDFPD f59, f58 = [AOFFSET] + FNMA f76 = f79, f35, f76 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f108 = f111, f35, f108 + nop __LINE__ + } + ;; + { .mfi + LDFPD f61, f60 = [AOFFSET] + FNMA f84 = f87, f35, f84 + adds AOFFSET = - 6 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f116 = f119, f35, f116 + nop __LINE__ + } + ;; + { .mfi + LDFD f16 = [AOFFSET], -2 * SIZE + FNMA f92 = f95, f35, f92 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f124 = f127, f35, f124 + nop __LINE__ + } + ;; + { .mfi + LDFPD f18, f17 = [AOFFSET] + FNMA f67 = f71, f36, f67 + adds AOFFSET = - 8 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f99 = f103, f36, f99 + nop __LINE__ + } + ;; + { .mfi + LDFPD f20, f19 = [AOFFSET] + FNMA f75 = f79, f36, f75 + adds AOFFSET = - 8 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f107 = f111, f36, f107 + nop __LINE__ + } + ;; + { .mfi + LDFD f21 = [AOFFSET] + FNMA f83 = f87, f36, f83 + adds BOFFSET = 56 * SIZE, BOFFSET + } + { .mfi + FNMA f115 = f119, f36, f115 + adds BOFFSET2 = 56 * SIZE, BOFFSET2 + } + ;; + FNMA f91 = f95, f36, f91 + FNMA f123 = f127, f36, f123 + ;; + FNMA f66 = f71, f37, f66 + FNMA f98 = f103, f37, f98 + FNMA f74 = f79, f37, f74 + FNMA f106 = f111, f37, f106 + FNMA f82 = f87, f37, f82 + FNMA f114 = f119, f37, f114 + FNMA f90 = f95, f37, f90 + FNMA f122 = f127, f37, f122 + ;; + FNMA f65 = f71, f38, f65 + FNMA f97 = f103, 
f38, f97 + FNMA f73 = f79, f38, f73 + FNMA f105 = f111, f38, f105 + FNMA f81 = f87, f38, f81 + FNMA f113 = f119, f38, f113 + FNMA f89 = f95, f38, f89 + FNMA f121 = f127, f38, f121 + ;; + FNMA f64 = f71, f39, f64 + FNMA f96 = f103, f39, f96 + FNMA f72 = f79, f39, f72 + FNMA f104 = f111, f39, f104 + FNMA f80 = f87, f39, f80 + FNMA f112 = f119, f39, f112 + FNMA f88 = f95, f39, f88 + FNMA f120 = f127, f39, f120 + ;; + FMPY f70 = f70, f40 + FMPY f102 = f102, f40 + FMPY f78 = f78, f40 + FMPY f110 = f110, f40 + FMPY f86 = f86, f40 + FMPY f118 = f118, f40 + FMPY f94 = f94, f40 + FMPY f126 = f126, f40 + ;; + FNMA f69 = f70, f41, f69 + FNMA f101 = f102, f41, f101 + FNMA f77 = f78, f41, f77 + FNMA f109 = f110, f41, f109 + FNMA f85 = f86, f41, f85 + FNMA f117 = f118, f41, f117 + FNMA f93 = f94, f41, f93 + FNMA f125 = f126, f41, f125 + ;; + FNMA f68 = f70, f42, f68 + FNMA f100 = f102, f42, f100 + FNMA f76 = f78, f42, f76 + FNMA f108 = f110, f42, f108 + FNMA f84 = f86, f42, f84 + FNMA f116 = f118, f42, f116 + FNMA f92 = f94, f42, f92 + FNMA f124 = f126, f42, f124 + ;; + FNMA f67 = f70, f43, f67 + FNMA f99 = f102, f43, f99 + FNMA f75 = f78, f43, f75 + FNMA f107 = f110, f43, f107 + FNMA f83 = f86, f43, f83 + FNMA f115 = f118, f43, f115 + FNMA f91 = f94, f43, f91 + FNMA f123 = f126, f43, f123 + ;; + FNMA f66 = f70, f44, f66 + FNMA f98 = f102, f44, f98 + FNMA f74 = f78, f44, f74 + FNMA f106 = f110, f44, f106 + FNMA f82 = f86, f44, f82 + FNMA f114 = f118, f44, f114 + FNMA f90 = f94, f44, f90 + FNMA f122 = f126, f44, f122 + ;; + FNMA f65 = f70, f45, f65 + FNMA f97 = f102, f45, f97 + FNMA f73 = f78, f45, f73 + FNMA f105 = f110, f45, f105 + FNMA f81 = f86, f45, f81 + FNMA f113 = f118, f45, f113 + FNMA f89 = f94, f45, f89 + FNMA f121 = f126, f45, f121 + ;; + FNMA f64 = f70, f46, f64 + FNMA f96 = f102, f46, f96 + FNMA f72 = f78, f46, f72 + FNMA f104 = f110, f46, f104 + FNMA f80 = f86, f46, f80 + FNMA f112 = f118, f46, f112 + FNMA f88 = f94, f46, f88 + FNMA f120 = f126, f46, f120 + ;; + FMPY f69 = f69, f47 + FMPY f101 = f101, f47 + FMPY f77 = f77, f47 + FMPY f109 = f109, f47 + FMPY f85 = f85, f47 + FMPY f117 = f117, f47 + FMPY f93 = f93, f47 + FMPY f125 = f125, f47 + ;; + FNMA f68 = f69, f48, f68 + FNMA f100 = f101, f48, f100 + FNMA f76 = f77, f48, f76 + FNMA f108 = f109, f48, f108 + FNMA f84 = f85, f48, f84 + FNMA f116 = f117, f48, f116 + FNMA f92 = f93, f48, f92 + FNMA f124 = f125, f48, f124 + ;; + FNMA f67 = f69, f49, f67 + FNMA f99 = f101, f49, f99 + FNMA f75 = f77, f49, f75 + FNMA f107 = f109, f49, f107 + FNMA f83 = f85, f49, f83 + FNMA f115 = f117, f49, f115 + FNMA f91 = f93, f49, f91 + FNMA f123 = f125, f49, f123 + ;; + FNMA f66 = f69, f50, f66 + FNMA f98 = f101, f50, f98 + FNMA f74 = f77, f50, f74 + FNMA f106 = f109, f50, f106 + FNMA f82 = f85, f50, f82 + FNMA f114 = f117, f50, f114 + FNMA f90 = f93, f50, f90 + FNMA f122 = f125, f50, f122 + ;; + FNMA f65 = f69, f51, f65 + FNMA f97 = f101, f51, f97 + FNMA f73 = f77, f51, f73 + FNMA f105 = f109, f51, f105 + FNMA f81 = f85, f51, f81 + FNMA f113 = f117, f51, f113 + FNMA f89 = f93, f51, f89 + FNMA f121 = f125, f51, f121 + ;; + FNMA f64 = f69, f52, f64 + FNMA f96 = f101, f52, f96 + FNMA f72 = f77, f52, f72 + FNMA f104 = f109, f52, f104 + FNMA f80 = f85, f52, f80 + FNMA f112 = f117, f52, f112 + FNMA f88 = f93, f52, f88 + FNMA f120 = f125, f52, f120 + ;; + FMPY f68 = f68, f53 + FMPY f100 = f100, f53 + FMPY f76 = f76, f53 + FMPY f108 = f108, f53 + FMPY f84 = f84, f53 + FMPY f116 = f116, f53 + FMPY f92 = f92, f53 + FMPY f124 = f124, f53 + ;; + FNMA f67 = f68, f54, 
f67 + FNMA f99 = f100, f54, f99 + FNMA f75 = f76, f54, f75 + FNMA f107 = f108, f54, f107 + FNMA f83 = f84, f54, f83 + FNMA f115 = f116, f54, f115 + FNMA f91 = f92, f54, f91 + FNMA f123 = f124, f54, f123 + ;; + FNMA f66 = f68, f55, f66 + FNMA f98 = f100, f55, f98 + FNMA f74 = f76, f55, f74 + FNMA f106 = f108, f55, f106 + FNMA f82 = f84, f55, f82 + FNMA f114 = f116, f55, f114 + FNMA f90 = f92, f55, f90 + FNMA f122 = f124, f55, f122 + ;; + FNMA f65 = f68, f56, f65 + FNMA f97 = f100, f56, f97 + FNMA f73 = f76, f56, f73 + FNMA f105 = f108, f56, f105 + FNMA f81 = f84, f56, f81 + FNMA f113 = f116, f56, f113 + FNMA f89 = f92, f56, f89 + FNMA f121 = f124, f56, f121 + ;; + FNMA f64 = f68, f57, f64 + FNMA f96 = f100, f57, f96 + FNMA f72 = f76, f57, f72 + FNMA f104 = f108, f57, f104 + FNMA f80 = f84, f57, f80 + FNMA f112 = f116, f57, f112 + FNMA f88 = f92, f57, f88 + FNMA f120 = f124, f57, f120 + ;; + FMPY f67 = f67, f58 + FMPY f99 = f99, f58 + FMPY f75 = f75, f58 + FMPY f107 = f107, f58 + FMPY f83 = f83, f58 + FMPY f115 = f115, f58 + FMPY f91 = f91, f58 + FMPY f123 = f123, f58 + ;; + FNMA f66 = f67, f59, f66 + FNMA f98 = f99, f59, f98 + FNMA f74 = f75, f59, f74 + FNMA f106 = f107, f59, f106 + FNMA f82 = f83, f59, f82 + FNMA f114 = f115, f59, f114 + FNMA f90 = f91, f59, f90 + FNMA f122 = f123, f59, f122 + ;; + FNMA f65 = f67, f60, f65 + FNMA f97 = f99, f60, f97 + FNMA f73 = f75, f60, f73 + FNMA f105 = f107, f60, f105 + FNMA f81 = f83, f60, f81 + FNMA f113 = f115, f60, f113 + FNMA f89 = f91, f60, f89 + FNMA f121 = f123, f60, f121 + ;; + { .mfi + STFD [BOFFSET] = f71, SIZE + FNMA f64 = f67, f61, f64 + } + { .mfi + STFD [BOFFSET2] = f103, SIZE + FNMA f96 = f99, f61, f96 + } + ;; + { .mfi + STFD [BOFFSET] = f79, SIZE + FNMA f72 = f75, f61, f72 + } + { .mfi + STFD [BOFFSET2] = f111, SIZE + FNMA f104 = f107, f61, f104 + } + ;; + { .mfi + STFD [BOFFSET] = f87, SIZE + FNMA f80 = f83, f61, f80 + } + { .mfi + STFD [BOFFSET2] = f119, SIZE + FNMA f112 = f115, f61, f112 + } + ;; + { .mfi + STFD [BOFFSET] = f95, - 11 * SIZE + FNMA f88 = f91, f61, f88 + } + { .mfi + STFD [BOFFSET2] = f127, - 11 * SIZE + FNMA f120 = f123, f61, f120 + } + ;; + { .mfi + STFD [BOFFSET] = f70, SIZE + FMPY f66 = f66, f16 + } + { .mfi + STFD [BOFFSET2] = f102, SIZE + FMPY f98 = f98, f16 + } + ;; + { .mfi + STFD [BOFFSET] = f78, SIZE + FMPY f74 = f74, f16 + } + { .mfi + STFD [BOFFSET2] = f110, SIZE + FMPY f106 = f106, f16 + } + ;; + { .mfi + STFD [BOFFSET] = f86, SIZE + FMPY f82 = f82, f16 + } + { .mfi + STFD [BOFFSET2] = f118, SIZE + FMPY f114 = f114, f16 + } + ;; + { .mfi + STFD [BOFFSET] = f94, - 11 * SIZE + FMPY f90 = f90, f16 + } + { .mfi + STFD [BOFFSET2] = f126, - 11 * SIZE + FMPY f122 = f122, f16 + } + ;; + { .mfi + STFD [BOFFSET] = f69, SIZE + FNMA f65 = f66, f17, f65 + } + { .mfi + STFD [BOFFSET2] = f101, SIZE + FNMA f97 = f98, f17, f97 + } + ;; + { .mfi + STFD [BOFFSET] = f77, SIZE + FNMA f73 = f74, f17, f73 + } + { .mfi + STFD [BOFFSET2] = f109, SIZE + FNMA f105 = f106, f17, f105 + } + ;; + { .mfi + STFD [BOFFSET] = f85, SIZE + FNMA f81 = f82, f17, f81 + } + { .mfi + STFD [BOFFSET2] = f117, SIZE + FNMA f113 = f114, f17, f113 + } + ;; + { .mfi + STFD [BOFFSET] = f93, - 11 * SIZE + FNMA f89 = f90, f17, f89 + } + { .mfi + STFD [BOFFSET2] = f125, - 11 * SIZE + FNMA f121 = f122, f17, f121 + } + ;; + { .mfi + STFD [BOFFSET] = f68, SIZE + FNMA f64 = f66, f18, f64 + } + { .mfi + STFD [BOFFSET2] = f100, SIZE + FNMA f96 = f98, f18, f96 + } + ;; + { .mfi + STFD [BOFFSET] = f76, SIZE + FNMA f72 = f74, f18, f72 + } + { .mfi + STFD [BOFFSET2] 
= f108, SIZE + FNMA f104 = f106, f18, f104 + } + ;; + { .mfi + STFD [BOFFSET] = f84, SIZE + FNMA f80 = f82, f18, f80 + } + { .mfi + STFD [BOFFSET2] = f116, SIZE + FNMA f112 = f114, f18, f112 + } + ;; + { .mfi + STFD [BOFFSET] = f92, - 11 * SIZE + FNMA f88 = f90, f18, f88 + } + { .mfi + STFD [BOFFSET2] = f124, - 11 * SIZE + FNMA f120 = f122, f18, f120 + } + ;; + { .mfi + STFD [BOFFSET] = f67, SIZE + FMPY f65 = f65, f19 + } + { .mfi + STFD [BOFFSET2] = f99, SIZE + FMPY f97 = f97, f19 + } + ;; + { .mfi + STFD [BOFFSET] = f75, SIZE + FMPY f73 = f73, f19 + } + { .mfi + STFD [BOFFSET2] = f107, SIZE + FMPY f105 = f105, f19 + } + ;; + { .mfi + STFD [BOFFSET] = f83, SIZE + FMPY f81 = f81, f19 + } + { .mfi + STFD [BOFFSET2] = f115, SIZE + FMPY f113 = f113, f19 + } + ;; + { .mfi + STFD [BOFFSET] = f91, - 11 * SIZE + FMPY f89 = f89, f19 + } + { .mfi + STFD [BOFFSET2] = f123, - 11 * SIZE + FMPY f121 = f121, f19 + } + ;; + { .mfi + STFD [BOFFSET] = f66, SIZE + FNMA f64 = f65, f20, f64 + } + { .mfi + STFD [BOFFSET2] = f98, SIZE + FNMA f96 = f97, f20, f96 + } + ;; + { .mfi + STFD [BOFFSET] = f74, SIZE + FNMA f72 = f73, f20, f72 + } + { .mfi + STFD [BOFFSET2] = f106, SIZE + FNMA f104 = f105, f20, f104 + } + ;; + { .mfi + STFD [BOFFSET] = f82, SIZE + FNMA f80 = f81, f20, f80 + } + { .mfi + STFD [BOFFSET2] = f114, SIZE + FNMA f112 = f113, f20, f112 + } + ;; + { .mfi + STFD [BOFFSET] = f90, -11 * SIZE + FNMA f88 = f89, f20, f88 + } + { .mfi + STFD [BOFFSET2] = f122, -11 * SIZE + FNMA f120 = f121, f20, f120 + } + ;; + { .mfi + STFD [BOFFSET] = f65, SIZE + FMPY f64 = f64, f21 + } + { .mfi + STFD [BOFFSET2] = f97, SIZE + FMPY f96 = f96, f21 + } + ;; + { .mfi + STFD [BOFFSET] = f73, SIZE + FMPY f72 = f72, f21 + } + { .mfi + STFD [BOFFSET2] = f105, SIZE + FMPY f104 = f104, f21 + } + ;; + { .mfi + STFD [BOFFSET] = f81, SIZE + FMPY f80 = f80, f21 + } + { .mfi + STFD [BOFFSET2] = f113, SIZE + FMPY f112 = f112, f21 + } + ;; + { .mfi + STFD [BOFFSET] = f89, - 11 * SIZE + FMPY f88 = f88, f21 + } + { .mfi + STFD [BOFFSET2] = f121, - 11 * SIZE + FMPY f120 = f120, f21 + } + ;; + { .mmi + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f96, SIZE + adds C1 = -8 * SIZE, C1 + } + ;; + { .mmi + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f104, SIZE + adds C2 = -8 * SIZE, C2 + } + ;; + { .mmi + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f112, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f88, - 3 * SIZE + STFD [BOFFSET2] = f120, - 3 * SIZE + adds C9 = 4 * SIZE, C1 + } + ;; + + { .mmf + STFD [C1 ] = f64, SIZE + STFD [C9 ] = f68, SIZE + mov f64 = f0 + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + STFD [C9 ] = f69, SIZE + adds C10 = 4 * SIZE, C2 + } + ;; + { .mmi + STFD [C1 ] = f66, SIZE + STFD [C9 ] = f70, SIZE + adds C3 = -8 * SIZE, C3 + } + ;; + { .mmi + STFD [C1 ] = f67, - 3 * SIZE + STFD [C9 ] = f71 + adds C11 = 4 * SIZE, C3 + } + ;; + { .mmf + STFD [C2 ] = f72, SIZE + STFD [C10] = f76, SIZE + mov f72 = f0 + } + ;; + { .mmi + STFD [C2 ] = f73, SIZE + STFD [C10] = f77, SIZE + adds C4 = -8 * SIZE, C4 + } + ;; + { .mmi + STFD [C2 ] = f74, SIZE + STFD [C10] = f78, SIZE + adds C12 = 4 * SIZE, C4 + } + ;; + { .mmi + STFD [C2 ] = f75, - 3 * SIZE + STFD [C10] = f79 + adds C5 = -8 * SIZE, C5 + } + ;; + { .mmf + STFD [C3 ] = f80, SIZE + STFD [C11] = f84, SIZE + mov f80 = f0 + } + ;; + { .mmi + STFD [C3 ] = f81, SIZE + STFD [C11] = f85, SIZE + adds C13 = 4 * SIZE, C5 + } + ;; + { .mmi + STFD [C3 ] = f82, SIZE + STFD [C11] = f86, SIZE + adds C6 = -8 * SIZE, C6 + } + ;; + { .mmi + STFD [C3 ] = f83, - 3 * SIZE + STFD [C11] = 
f87 + adds C14 = 4 * SIZE, C6 + } + ;; + { .mmf + STFD [C4 ] = f88, SIZE + STFD [C12] = f92, SIZE + mov f88 = f0 + } + ;; + { .mmi + STFD [C4 ] = f89, SIZE + STFD [C12] = f93, SIZE + adds C8 = -8 * SIZE, C8 + } + ;; + { .mmi + STFD [C4 ] = f90, SIZE + STFD [C12] = f94, SIZE + adds C16 = 4 * SIZE, C8 + } + ;; + { .mmi + STFD [C4 ] = f91, - 3 * SIZE + STFD [C12] = f95 + cmp.ne p6, p0 = 1, I + } + ;; + { .mmf + STFD [C5 ] = f96, SIZE + STFD [C13] = f100, SIZE + mov f96 = f0 + } + ;; + { .mmi + STFD [C5 ] = f97, SIZE + STFD [C13] = f101, SIZE + adds I = -1, I + } + ;; + { .mmi + STFD [C5 ] = f98, SIZE + STFD [C13] = f102, SIZE + adds C7 = -8 * SIZE, C7 + } + ;; + { .mmi + STFD [C5 ] = f99, - 3 * SIZE + STFD [C13] = f103 + adds C15 = 4 * SIZE, C7 + } + ;; + { .mmf + STFD [C6 ] = f104, SIZE + STFD [C14] = f108, SIZE + mov f104 = f0 + } + ;; + { .mmi + STFD [C6 ] = f105, SIZE + STFD [C14] = f109, SIZE + shladd r2 = K, BASE_SHIFT, r0 + } + ;; + { .mmi + STFD [C6 ] = f106, SIZE + STFD [C14] = f110, SIZE + sub L = K, KK + } + ;; + { .mmi + STFD [C6 ] = f107, - 3 * SIZE + STFD [C14] = f111 + nop __LINE__ + } + ;; + { .mmf + STFD [C7 ] = f112, SIZE + STFD [C15] = f116, SIZE + mov f112 = f0 + } + ;; + { .mmi + STFD [C7 ] = f113, SIZE + STFD [C15] = f117, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [C7 ] = f114, SIZE + STFD [C15] = f118, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [C7 ] = f115, - 3 * SIZE + STFD [C15] = f119 + nop __LINE__ + } + ;; + { .mmf + STFD [C8 ] = f120, SIZE + STFD [C16] = f124, SIZE + mov f120 = f0 + } + ;; + { .mmi + STFD [C8 ] = f121, SIZE + STFD [C16] = f125, SIZE + adds KK = -8, KK + } + ;; + { .mmi + STFD [C8 ] = f122, SIZE + STFD [C16] = f126, SIZE + sub L = K, KK + } + ;; + { .mmb + STFD [C8 ] = f123, - 3 * SIZE + STFD [C16] = f127 + (p6) br.cond.dptk .L011 + } + ;; + +.L049: + { .mmi + adds J = -1, J + mov AOFFSET = A + shladd KK8 = K, BASE_SHIFT, r0 + } + ;; + { .mmb + shladd B = KK8, 3, B + cmp.lt p6, p0 = 0, J + (p6) br.cond.dptk .L000 + } + ;; + .align 8 + +.L050: + { .mib + setf.d f64 = r0 + tbit.z p6, p0 = N, 2 + (p6) br.cond.dpnt .L090 + } + ;; + +#ifdef RT + { .mmi + shladd r3 = LDC, 2, r0 + nop __LINE__ + shl r2 = K, 2 + BASE_SHIFT + } + ;; + { .mmi + sub B = B, r2 + sub C = C, r3 + nop __LINE__ + } +#endif + ;; + { .mfi + mov C1 = C // coffset1 = c + 0 * ldc +#ifdef LN + add KK = M, OFFSET +#elif defined LT + mov KK = OFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmf +#if defined(LN) || defined(RT) + mov AORIG = A +#else + mov AOFFSET = A +#endif + } + { .mmf + add C2 = LDC, C // coffset2 = c + 1 * ldc + shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc + } + ;; + { .mfi +#ifndef RT + shladd C = LDC, 2, C // coffset += 8 * ldc +#else + nop __LINE__ +#endif +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + }{ .mfb + shladd C4 = LDC, 1, C2 + } + ;; + + mov f72 = f0 + mov f80 = f0 + mov f88 = f0 + mov f65 = f0 + mov f73 = f0 + mov f81 = f0 + mov f89 = f0 + + + + tbit.z p6,p7 = M, 0 + (p6) br.cond.dptk .L070 + + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 0 + BASE_SHIFT + } + { .mmi + shladd r3 = KK, BASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mmf + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 2, B +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * 
SIZE + add AOFFSET = r3, AORIG + } + ;; +#endif + { .mmi + adds L = 1, L + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mii + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + adds L = -1, L + } + ;; + { .mmi + cmp.eq p6, p0 = -1, L + } + ;; + { .mib + (p7) LDFD f32 = [AOFFSET], 1 * SIZE + mov ar.lc = L + (p6) br.cond.dpnt .L088 + } + ;; + +.L082: + { .mfb + cmp.ne p4, p5 = 0, L + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + (p12) cmp.ne p3, p0 = 0, L + FMA f72 = f32, f49, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + (p3) LDFD f40 = [AOFFSET], 1 * SIZE + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mmf + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p4) LDFD f32 = [AOFFSET], 1 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + } + ;; + { .mib + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + nop __LINE__ + nop __LINE__ + } + { .mmb + nop __LINE__ + adds L = -1, L + br.cloop.sptk.few .L082 + } + ;; + +.L088: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -1, KK +#else + adds r2 = -4, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + add AOFFSET = r2, AORIG + shladd BOFFSET = r2, 2, B + ;; +#endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET] + adds BOFFSET = -2 * SIZE, BOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + FSUB f80 = f34, f80 + FSUB f88 = f35, f88 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET] + adds AOFFSET = -2 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + FSUB f80 = f34, f80 + FSUB f88 = f35, f88 + ;; +#endif + +#ifdef LN + LDFD f32 = [AOFFSET] + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + FMPY f80 = f80, f32 + FMPY f88 = f88, f32 + ;; + { .mmi + STFD [BOFFSET] = f64, SIZE + adds C1 = -1 * SIZE, C1 + } + ;; + { .mmi + STFD [BOFFSET] = f72, SIZE + adds C2 = -1 * SIZE, C2 + } + ;; + { .mmi + STFD [BOFFSET] = f80, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f88, - 3 * SIZE + } + ;; + adds C3 = -1 * SIZE, C3 + adds C4 = -1 * SIZE, C4 + ;; +#endif + +#ifdef LT + LDFD f32 = [AOFFSET] + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + FMPY f80 = f80, f32 + FMPY f88 = f88, f32 + ;; + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + ;; + STFD [BOFFSET] = f88, -3 * SIZE + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f36 = [BOFFSET], 1 * SIZE + ;; + LDFPD f37, f38 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f39, f40 = [BOFFSET] + adds BOFFSET = 5 * SIZE, BOFFSET + ;; + LDFD f41 = [BOFFSET], -15 * SIZE + + FMPY f64 = f64, f32 + ;; + FNMA f72 = f64, f33, f72 + ;; + FNMA f80 = f64, f34, f80 + ;; + FNMA f88 = f64, f35, f88 + ;; + FMPY f72 = f72, f36 + ;; + FNMA f80 = f72, f37, f80 + ;; + FNMA f88 = f72, f38, f88 + ;; + FMPY f80 = f80, f39 + ;; + FNMA f88 = f80, f40, f88 + ;; + 
FMPY f88 = f88, f41 + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f72, SIZE + ;; + STFD [AOFFSET] = f80, SIZE + ;; + STFD [AOFFSET] = f88, -3 * SIZE + ;; +#endif + +#ifdef RT + adds BOFFSET = 14 * SIZE, BOFFSET + ;; + LDFPD f33, f32 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f35, f34 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFD f36 = [BOFFSET], - 2 * SIZE + ;; + LDFPD f38, f37 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f40, f39 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFD f41 = [BOFFSET] + ;; + FMPY f88 = f88, f32 + ;; + FNMA f80 = f88, f33, f80 + ;; + FNMA f72 = f88, f34, f72 + ;; + FNMA f64 = f88, f35, f64 + ;; + FMPY f80 = f80, f36 + ;; + FNMA f72 = f80, f37, f72 + ;; + FNMA f64 = f80, f38, f64 + ;; + FMPY f72 = f72, f39 + ;; + FNMA f64 = f72, f40, f64 + ;; + FMPY f64 = f64, f41 + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f72, SIZE + ;; + STFD [AOFFSET] = f80, SIZE + ;; + STFD [AOFFSET] = f88, - 3 * SIZE + ;; +#endif + +#ifndef LN + STFD [C1 ] = f64, SIZE +#else + STFD [C1 ] = f64 +#endif +#ifndef LN + STFD [C2 ] = f72, SIZE +#else + STFD [C2 ] = f72 +#endif +#ifndef LN + STFD [C3 ] = f80, SIZE +#else + STFD [C3 ] = f80 +#endif +#ifndef LN + STFD [C4 ] = f88, SIZE +#else + STFD [C4 ] = f88 +#endif + ;; + + mov f64 = f0 + mov f72 = f0 + mov f80 = f0 + mov f88 = f0 + ;; + shladd r2 = K, BASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + add AORIG = r2, AORIG +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + add AOFFSET = L, AOFFSET +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + shladd BOFFSET = L, 2, BOFFSET +#else + nop __LINE__ +#endif + ;; +#ifdef LT + adds KK = 1, KK +#elif defined LN + adds KK = -1, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + .align 8 + +.L070: + tbit.z p6,p7 = M, 1 + (p6) br.cond.dptk .L060 + ;; + + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 1 + BASE_SHIFT + } + { .mmi + shladd r3 = KK, BASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mmf + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + setf.d f73 = r0 + mov f65 = f0 + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 2, B + mov f65 = f0 +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f73 = f0 + shladd AOFFSET = r3, 1, AORIG + } + ;; +#endif + { .mfi + mov f81 = f0 + adds L = 1, L + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + mov f89 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + } + { .mfi + shr L = L, 1 + } + ;; + { .mmf + adds L = -1, L + } + ;; + { .mmf + cmp.eq p6, p0 = -1, L + } + ;; + { .mib + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov ar.lc = L + (p6) br.cond.dpnt .L078 + } + ;; + .align 8 + +.L072: + { .mfb + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + 
} + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f89 = f33, f51, f89 // A2 * B4 + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + br.cloop.sptk.few .L072 + } + ;; +.L078: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -2, KK +#else + adds r2 = -4, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 1, AORIG + shladd BOFFSET = r2, 2, B + ;; +#endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET] + adds BOFFSET = -6 * SIZE, BOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + FSUB f80 = f34, f80 + FSUB f88 = f35, f88 + FSUB f65 = f36, f65 + FSUB f73 = f37, f73 + FSUB f81 = f38, f81 + FSUB f89 = f39, f89 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET] + adds AOFFSET = -6 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + + FSUB f72 = f34, f72 + FSUB f73 = f35, f73 + + FSUB f80 = f36, f80 + FSUB f81 = f37, f81 + + FSUB f88 = f38, f88 + FSUB f89 = f39, f89 + ;; +#endif + +#ifdef LN + adds AOFFSET = 2 * SIZE, AOFFSET + ;; + LDFPD f33, f32 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFD f34 = [AOFFSET] + ;; + FMPY f65 = f65, f32 + FMPY f73 = f73, f32 + FMPY f81 = f81, f32 + FMPY f89 = f89, f32 + ;; + FNMA f64 = f65, f33, f64 + FNMA f72 = f73, f33, f72 + FNMA f80 = f81, f33, f80 + FNMA f88 = f89, f33, f88 + ;; + FMPY f64 = f64, f34 + FMPY f72 = f72, f34 + FMPY f80 = f80, f34 + FMPY f88 = f88, f34 + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f65, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f73, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f81, SIZE + ;; + STFD [BOFFSET] = f88, - 3 * SIZE + STFD [BOFFSET2] = f89, - 3 * SIZE + ;; + adds C1 = -2 * SIZE, C1 + adds C2 = -2 * SIZE, C2 + adds C3 = -2 * SIZE, C3 + adds C4 = -2 * SIZE, C4 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f34 = [AOFFSET], - 3 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + FMPY f80 = f80, f32 + FMPY f88 = f88, f32 + ;; + FNMA f65 = f64, f33, f65 + FNMA f73 = f72, f33, f73 + FNMA f81 = f80, f33, f81 + FNMA f89 = f88, f33, f89 + ;; + FMPY f65 = f65, f34 + FMPY f73 = f73, f34 + FMPY f81 = f81, f34 + FMPY f89 = f89, f34 + ;; 
+ STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f65, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f73, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f81, SIZE + ;; + STFD [BOFFSET] = f88, -3 * SIZE + STFD [BOFFSET2] = f89, -3 * SIZE + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f36 = [BOFFSET], 1 * SIZE + ;; + LDFPD f37, f38 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f39, f40 = [BOFFSET] + adds BOFFSET = 5 * SIZE, BOFFSET + ;; + LDFD f41 = [BOFFSET], -15 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + ;; + FNMA f72 = f64, f33, f72 + FNMA f73 = f65, f33, f73 + ;; + FNMA f80 = f64, f34, f80 + FNMA f81 = f65, f34, f81 + ;; + FNMA f88 = f64, f35, f88 + FNMA f89 = f65, f35, f89 + ;; + FMPY f72 = f72, f36 + FMPY f73 = f73, f36 + ;; + FNMA f80 = f72, f37, f80 + FNMA f81 = f73, f37, f81 + ;; + FNMA f88 = f72, f38, f88 + FNMA f89 = f73, f38, f89 + ;; + FMPY f80 = f80, f39 + FMPY f81 = f81, f39 + ;; + FNMA f88 = f80, f40, f88 + FNMA f89 = f81, f40, f89 + ;; + FMPY f88 = f88, f41 + FMPY f89 = f89, f41 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f80, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f81, SIZE + ;; + STFD [AOFFSET] = f72, SIZE + STFD [AOFFSET2] = f88, SIZE + ;; + STFD [AOFFSET] = f73, -3 * SIZE + STFD [AOFFSET2] = f89, -3 * SIZE + ;; +#endif + +#ifdef RT + adds BOFFSET = 14 * SIZE, BOFFSET + ;; + LDFPD f33, f32 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f35, f34 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFD f36 = [BOFFSET], - 2 * SIZE + ;; + LDFPD f38, f37 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f40, f39 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFD f41 = [BOFFSET] + ;; + FMPY f88 = f88, f32 + FMPY f89 = f89, f32 + ;; + FNMA f80 = f88, f33, f80 + FNMA f81 = f89, f33, f81 + ;; + FNMA f72 = f88, f34, f72 + FNMA f73 = f89, f34, f73 + ;; + FNMA f64 = f88, f35, f64 + FNMA f65 = f89, f35, f65 + ;; + FMPY f80 = f80, f36 + FMPY f81 = f81, f36 + ;; + FNMA f72 = f80, f37, f72 + FNMA f73 = f81, f37, f73 + ;; + FNMA f64 = f80, f38, f64 + FNMA f65 = f81, f38, f65 + ;; + FMPY f72 = f72, f39 + FMPY f73 = f73, f39 + ;; + FNMA f64 = f72, f40, f64 + FNMA f65 = f73, f40, f65 + ;; + FMPY f64 = f64, f41 + FMPY f65 = f65, f41 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f65, SIZE + ;; + STFD [AOFFSET] = f72, SIZE + STFD [AOFFSET2] = f73, SIZE + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f81, SIZE + ;; + STFD [AOFFSET] = f88, -3 * SIZE + STFD [AOFFSET2] = f89, -3 * SIZE + ;; +#endif + STFD [C1 ] = f64, SIZE + mov f64 = f0 + ;; +#ifndef LN + STFD [C1 ] = f65, SIZE +#else + STFD [C1 ] = f65, -SIZE +#endif + ;; + STFD [C2 ] = f72, SIZE + mov f72 = f0 + ;; +#ifndef LN + STFD [C2 ] = f73, SIZE +#else + STFD [C2 ] = f73, -SIZE +#endif + ;; + STFD [C3 ] = f80, SIZE + mov f80 = f0 + ;; +#ifndef LN + STFD [C3 ] = f81, SIZE +#else + STFD [C3 ] = f81, - SIZE +#endif + ;; + STFD [C4 ] = f88, SIZE + mov f88 = f0 + ;; +#ifndef LN + STFD [C4 ] = f89, SIZE +#else + STFD [C4 ] = f89, -SIZE +#endif + ;; + mov f96 = f0 + ;; + mov f104 = f0 + ;; + shladd r2 = K, BASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + shladd AORIG = r2, 1, AORIG +#else + nop __LINE__ +#endif + ;; + mov f112 = f0 + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd AOFFSET = L, 
1, AOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd BOFFSET = L, 2, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmf + mov f120 = f0 + } + ;; + { .mmi +#ifdef LT + adds KK = 2, KK +#elif defined LN + adds KK = -2, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + .align 8 + +.L060: + + + + + + tbit.z p6, p7 = M, 2 + (p6) br.cond.dptk .L051 + ;; + + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 2 + BASE_SHIFT + } + { .mmi + shladd r3 = KK, BASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mmf + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 2, B +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + shladd AOFFSET = r3, 2, AORIG + } + ;; +#endif + { .mfi + adds L = 1, L + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + } + { .mfi + shr L = L, 1 + } + ;; + { .mfi + adds L = -1, L + } + ;; + { .mfi + cmp.eq p6, p0 = -1, L + } + ;; + { .mmf + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + } + { .mfi + mov ar.lc = L + } + ;; + mov f64 = f0 + mov f65 = f0 + mov f66 = f0 + mov f67 = f0 + mov f72 = f0 + mov f73 = f0 + mov f74 = f0 + mov f75 = f0 + mov f80 = f0 + mov f81 = f0 + mov f82 = f0 + mov f83 = f0 + mov f88 = f0 + mov f89 = f0 + mov f90 = f0 + mov f91 = f0 + ;; + { .mmf + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + } + { .mfb + (p6) br.cond.dpnt .L068 + } + ;; + .align 8 + +.L062: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + (p5) adds C9 = 2 * SIZE, C1 + } + { .mfi + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + (p5) adds C10 = 2 * SIZE, C2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + (p5) adds C11 = 2 * SIZE, C3 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + (p5) adds C12 = 2 * SIZE, C4 + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f89 = f33, f51, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f90 = f34, f51, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + } + { .mfb + nop __LINE__ + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f91 = f35, f51, f91 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { 
.mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f90 = f42, f59, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f91 = f43, f59, f91 // A4 * B4 + br.cloop.sptk.few .L062 + } + ;; + .align 8 + +.L068: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -4, KK +#else + adds r2 = -4, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 2, AORIG + shladd BOFFSET = r2, 2, B + ;; +#endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [BOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [BOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [BOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [BOFFSET] + adds BOFFSET = -14 * SIZE, BOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + FSUB f80 = f34, f80 + FSUB f88 = f35, f88 + ;; + FSUB f65 = f36, f65 + FSUB f73 = f37, f73 + FSUB f81 = f38, f81 + FSUB f89 = f39, f89 + ;; + FSUB f66 = f40, f66 + FSUB f74 = f41, f74 + FSUB f82 = f42, f82 + FSUB f90 = f43, f90 + ;; + FSUB f67 = f44, f67 + FSUB f75 = f45, f75 + FSUB f83 = f46, f83 + FSUB f91 = f47, f91 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [AOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [AOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [AOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [AOFFSET] + adds AOFFSET = -14 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f66 = f34, f66 + FSUB f67 = f35, f67 + + FSUB f72 = f36, f72 + FSUB f73 = f37, f73 + FSUB f74 = f38, f74 + FSUB f75 = f39, f75 + + FSUB f80 = f40, f80 + FSUB f81 = f41, f81 + FSUB f82 = f42, f82 + FSUB f83 = f43, f83 + + FSUB f88 = f44, f88 + FSUB f89 = f45, f89 + FSUB f90 = f46, f90 + FSUB f91 = f47, f91 + ;; +#endif + +#ifdef LN + adds AOFFSET = 14 * SIZE, AOFFSET + ;; + LDFPD f33, f32 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f35, f34 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFD f36 = [AOFFSET], - 2 * SIZE + ;; + LDFPD f38, f37 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f40, f39 = [AOFFSET] + adds 
AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFD f41 = [AOFFSET] + ;; + FMPY f67 = f67, f32 + FMPY f75 = f75, f32 + FMPY f83 = f83, f32 + FMPY f91 = f91, f32 + ;; + FNMA f66 = f67, f33, f66 + FNMA f74 = f75, f33, f74 + FNMA f82 = f83, f33, f82 + FNMA f90 = f91, f33, f90 + ;; + FNMA f65 = f67, f34, f65 + FNMA f73 = f75, f34, f73 + FNMA f81 = f83, f34, f81 + FNMA f89 = f91, f34, f89 + ;; + FNMA f64 = f67, f35, f64 + FNMA f72 = f75, f35, f72 + FNMA f80 = f83, f35, f80 + FNMA f88 = f91, f35, f88 + ;; + FMPY f66 = f66, f36 + FMPY f74 = f74, f36 + FMPY f82 = f82, f36 + FMPY f90 = f90, f36 + ;; + FNMA f65 = f66, f37, f65 + FNMA f73 = f74, f37, f73 + FNMA f81 = f82, f37, f81 + FNMA f89 = f90, f37, f89 + ;; + FNMA f64 = f66, f38, f64 + FNMA f72 = f74, f38, f72 + FNMA f80 = f82, f38, f80 + FNMA f88 = f90, f38, f88 + ;; + FMPY f65 = f65, f39 + FMPY f73 = f73, f39 + FMPY f81 = f81, f39 + FMPY f89 = f89, f39 + ;; + FNMA f64 = f65, f40, f64 + FNMA f72 = f73, f40, f72 + FNMA f80 = f81, f40, f80 + FNMA f88 = f89, f40, f88 + ;; + FMPY f64 = f64, f41 + FMPY f72 = f72, f41 + FMPY f80 = f80, f41 + FMPY f88 = f88, f41 + ;; + adds BOFFSET = 8 * SIZE, BOFFSET + adds BOFFSET2 = 8 * SIZE, BOFFSET2 + ;; + STFD [BOFFSET] = f66, SIZE + STFD [BOFFSET2] = f67, SIZE + ;; + STFD [BOFFSET] = f74, SIZE + STFD [BOFFSET2] = f75, SIZE + ;; + STFD [BOFFSET] = f82, SIZE + STFD [BOFFSET2] = f83, SIZE + ;; + STFD [BOFFSET] = f90, - 11 * SIZE + STFD [BOFFSET2] = f91, - 11 * SIZE + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f65, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f73, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f81, SIZE + ;; + STFD [BOFFSET] = f88, -3 * SIZE + STFD [BOFFSET2] = f89, -3 * SIZE + ;; + adds C1 = -4 * SIZE, C1 + adds C2 = -4 * SIZE, C2 + adds C3 = -4 * SIZE, C3 + adds C4 = -4 * SIZE, C4 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f36 = [AOFFSET], 1 * SIZE + ;; + LDFPD f37, f38 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f39, f40 = [AOFFSET] + adds AOFFSET = 5 * SIZE, AOFFSET + ;; + LDFD f41 = [AOFFSET], -15 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + FMPY f80 = f80, f32 + FMPY f88 = f88, f32 + ;; + FNMA f65 = f64, f33, f65 + FNMA f73 = f72, f33, f73 + FNMA f81 = f80, f33, f81 + FNMA f89 = f88, f33, f89 + ;; + FNMA f66 = f64, f34, f66 + FNMA f74 = f72, f34, f74 + FNMA f82 = f80, f34, f82 + FNMA f90 = f88, f34, f90 + ;; + FNMA f67 = f64, f35, f67 + FNMA f75 = f72, f35, f75 + FNMA f83 = f80, f35, f83 + FNMA f91 = f88, f35, f91 + ;; + FMPY f65 = f65, f36 + FMPY f73 = f73, f36 + FMPY f81 = f81, f36 + FMPY f89 = f89, f36 + ;; + FNMA f66 = f65, f37, f66 + FNMA f74 = f73, f37, f74 + FNMA f82 = f81, f37, f82 + FNMA f90 = f89, f37, f90 + ;; + FNMA f67 = f65, f38, f67 + FNMA f75 = f73, f38, f75 + FNMA f83 = f81, f38, f83 + FNMA f91 = f89, f38, f91 + ;; + FMPY f66 = f66, f39 + FMPY f74 = f74, f39 + FMPY f82 = f82, f39 + FMPY f90 = f90, f39 + ;; + FNMA f67 = f66, f40, f67 + FNMA f75 = f74, f40, f75 + FNMA f83 = f82, f40, f83 + FNMA f91 = f90, f40, f91 + ;; + FMPY f67 = f67, f41 + FMPY f75 = f75, f41 + FMPY f83 = f83, f41 + FMPY f91 = f91, f41 + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f65, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f73, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f81, SIZE + ;; + STFD [BOFFSET] = f88, 5 * SIZE + STFD [BOFFSET2] = f89, 5 * SIZE + ;; + STFD [BOFFSET] = f66, SIZE + STFD [BOFFSET2] = f67, 
SIZE + ;; + STFD [BOFFSET] = f74, SIZE + STFD [BOFFSET2] = f75, SIZE + ;; + STFD [BOFFSET] = f82, SIZE + STFD [BOFFSET2] = f83, SIZE + ;; + STFD [BOFFSET] = f90, -11 * SIZE + STFD [BOFFSET2] = f91, -11 * SIZE + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f36 = [BOFFSET], 1 * SIZE + ;; + LDFPD f37, f38 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f39, f40 = [BOFFSET] + adds BOFFSET = 5 * SIZE, BOFFSET + ;; + LDFD f41 = [BOFFSET], -15 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + FMPY f66 = f66, f32 + FMPY f67 = f67, f32 + ;; + FNMA f72 = f64, f33, f72 + FNMA f73 = f65, f33, f73 + FNMA f74 = f66, f33, f74 + FNMA f75 = f67, f33, f75 + ;; + FNMA f80 = f64, f34, f80 + FNMA f81 = f65, f34, f81 + FNMA f82 = f66, f34, f82 + FNMA f83 = f67, f34, f83 + ;; + FNMA f88 = f64, f35, f88 + FNMA f89 = f65, f35, f89 + FNMA f90 = f66, f35, f90 + FNMA f91 = f67, f35, f91 + ;; + FMPY f72 = f72, f36 + FMPY f73 = f73, f36 + FMPY f74 = f74, f36 + FMPY f75 = f75, f36 + ;; + FNMA f80 = f72, f37, f80 + FNMA f81 = f73, f37, f81 + FNMA f82 = f74, f37, f82 + FNMA f83 = f75, f37, f83 + ;; + FNMA f88 = f72, f38, f88 + FNMA f89 = f73, f38, f89 + FNMA f90 = f74, f38, f90 + FNMA f91 = f75, f38, f91 + ;; + FMPY f80 = f80, f39 + FMPY f81 = f81, f39 + FMPY f82 = f82, f39 + FMPY f83 = f83, f39 + ;; + FNMA f88 = f80, f40, f88 + FNMA f89 = f81, f40, f89 + FNMA f90 = f82, f40, f90 + FNMA f91 = f83, f40, f91 + ;; + FMPY f88 = f88, f41 + FMPY f89 = f89, f41 + FMPY f90 = f90, f41 + FMPY f91 = f91, f41 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f72, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f73, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f74, SIZE + ;; + STFD [AOFFSET] = f67, 5 * SIZE + STFD [AOFFSET2] = f75, 5 * SIZE + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f88, SIZE + ;; + STFD [AOFFSET] = f81, SIZE + STFD [AOFFSET2] = f89, SIZE + ;; + STFD [AOFFSET] = f82, SIZE + STFD [AOFFSET2] = f90, SIZE + ;; + STFD [AOFFSET] = f83, -11 * SIZE + STFD [AOFFSET2] = f91, -11 * SIZE + ;; +#endif + +#ifdef RT + adds BOFFSET = 14 * SIZE, BOFFSET + ;; + LDFPD f33, f32 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f35, f34 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFD f36 = [BOFFSET], - 2 * SIZE + ;; + LDFPD f38, f37 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f40, f39 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFD f41 = [BOFFSET] + ;; + FMPY f88 = f88, f32 + FMPY f89 = f89, f32 + FMPY f90 = f90, f32 + FMPY f91 = f91, f32 + ;; + FNMA f80 = f88, f33, f80 + FNMA f81 = f89, f33, f81 + FNMA f82 = f90, f33, f82 + FNMA f83 = f91, f33, f83 + ;; + FNMA f72 = f88, f34, f72 + FNMA f73 = f89, f34, f73 + FNMA f74 = f90, f34, f74 + FNMA f75 = f91, f34, f75 + ;; + FNMA f64 = f88, f35, f64 + FNMA f65 = f89, f35, f65 + FNMA f66 = f90, f35, f66 + FNMA f67 = f91, f35, f67 + ;; + FMPY f80 = f80, f36 + FMPY f81 = f81, f36 + FMPY f82 = f82, f36 + FMPY f83 = f83, f36 + ;; + FNMA f72 = f80, f37, f72 + FNMA f73 = f81, f37, f73 + FNMA f74 = f82, f37, f74 + FNMA f75 = f83, f37, f75 + ;; + FNMA f64 = f80, f38, f64 + FNMA f65 = f81, f38, f65 + FNMA f66 = f82, f38, f66 + FNMA f67 = f83, f38, f67 + ;; + FMPY f72 = f72, f39 + FMPY f73 = f73, f39 + FMPY f74 = f74, f39 + FMPY f75 = f75, f39 + ;; + FNMA f64 = f72, f40, f64 + FNMA f65 = f73, f40, f65 + FNMA f66 = f74, f40, f66 + FNMA f67 = f75, f40, f67 + ;; + FMPY f64 = f64, f41 + FMPY f65 = 
f65, f41 + FMPY f66 = f66, f41 + FMPY f67 = f67, f41 + ;; + adds AOFFSET = 8 * SIZE, AOFFSET + adds AOFFSET2 = 8 * SIZE, AOFFSET2 + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f88, SIZE + ;; + STFD [AOFFSET] = f81, SIZE + STFD [AOFFSET2] = f89, SIZE + ;; + STFD [AOFFSET] = f82, SIZE + STFD [AOFFSET2] = f90, SIZE + ;; + STFD [AOFFSET] = f83, - 11 * SIZE + STFD [AOFFSET2] = f91, - 11 * SIZE + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f72, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f73, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f74, SIZE + ;; + STFD [AOFFSET] = f67, - 3 * SIZE + STFD [AOFFSET2] = f75, - 3 * SIZE + ;; +#endif + { .mmf + STFD [C1 ] = f64, SIZE + mov f64 = f0 + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + } + ;; + { .mmi + STFD [C1 ] = f66, SIZE + } + ;; + { .mmi +#ifndef LN + STFD [C1 ] = f67, SIZE +#else + STFD [C1 ] = f67, - 3 * SIZE +#endif + } + ;; + { .mmf + STFD [C2 ] = f72, SIZE + mov f72 = f0 + } + ;; + { .mmi + STFD [C2 ] = f73, SIZE + } + ;; + { .mmi + STFD [C2 ] = f74, SIZE + } + ;; + { .mmi +#ifndef LN + STFD [C2 ] = f75, SIZE +#else + STFD [C2 ] = f75, - 3 * SIZE +#endif + } + ;; + { .mmf + STFD [C3 ] = f80, SIZE + mov f80 = f0 + } + ;; + { .mmi + STFD [C3 ] = f81, SIZE + } + ;; + { .mmi + STFD [C3 ] = f82, SIZE + } + ;; + { .mmi +#ifndef LN + STFD [C3 ] = f83, SIZE +#else + STFD [C3 ] = f83, - 3 * SIZE +#endif + } + ;; + { .mmf + STFD [C4 ] = f88, SIZE + mov f88 = f0 + } + ;; + { .mmi + STFD [C4 ] = f89, SIZE + } + ;; + { .mmi + STFD [C4 ] = f90, SIZE + } + ;; + { .mmi +#ifndef LN + STFD [C4 ] = f91, SIZE +#else + STFD [C4 ] = f91, - 3 * SIZE +#endif + nop __LINE__ + } + ;; + mov f65 = f0 + ;; + mov f73 = f0 + ;; + shladd r2 = K, BASE_SHIFT, r0 + ;; + { .mmi + sub L = K, KK + } + ;; + { .mmi +#ifdef RT + shladd AORIG = r2, 2, AORIG +#else + nop __LINE__ +#endif + } + ;; + { .mmf + mov f81 = f0 + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd AOFFSET = L, 2, AOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd BOFFSET = L, 2, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmf + mov f89 = f0 + } + ;; + { .mmi +#ifdef LT + adds KK = 4, KK +#elif defined LN + adds KK = -4, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + .align 8 + +.L051: + mov f72 = f0 + mov f80 = f0 + mov f88 = f0 + mov f65 = f0 + mov f73 = f0 + mov f81 = f0 + mov f89 = f0 + + shr I = M, 3 + ;; + cmp.eq p6, p7 = 0, I + (p6) br.cond.dpnt .L089 + ;; + .align 16 + +.L052: + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 3 + BASE_SHIFT + } + { .mmi + shladd r3 = KK, BASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mmi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 2, B +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + shladd AOFFSET = r3, 3, AORIG + } + ;; +#endif + { .mfi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f66 = f0 + nop __LINE__ + } + { .mfi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f74 = f0 + nop __LINE__ + } + ;; + { .mmf + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + setf.d f82 = r0 + mov f90 = f0 + } + ;; + { .mmf + (p7) LDFPD f36, f37 = [AOFFSET], 2 * 
SIZE + setf.d f67 = r0 + mov f75 = f0 + } + { .mfi + setf.d f83 = r0 + mov f91 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mmf + (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + } + { .mfi + adds PREC = CPREFETCHSIZE * SIZE, C1 + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f68 = r0 + mov f76 = f0 + } + { .mfi + setf.d f84 = r0 + mov f92 = f0 + adds L = 1, L + } + ;; + { .mmf + CPREFETCH [PREC], LDC + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f69 = r0 + mov f77 = f0 + } + { .mfi + setf.d f85 = r0 + mov f93 = f0 + adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET + } + ;; + { .mmf + CPREFETCH [PREC] + } + ;; + { .mfi + setf.d f70 = r0 + mov f78 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + setf.d f86 = r0 + mov f94 = f0 + shr L = L, 1 + } + ;; + { .mfi + setf.d f71 = r0 + adds L = -1, L + } + ;; + { .mfi + setf.d f87 = r0 + mov f79 = f0 + mov ar.lc = L + } + { .mfb + cmp.eq p6, p0 = -1, L + mov f95 = f0 + (p6) br.cond.dpnt .L058 + } + ;; + .align 8 + +.L053: + { .mfb + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + adds C9 = 4 * SIZE, C1 + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + adds C10 = 4 * SIZE, C2 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + adds C11 = 4 * SIZE, C3 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + adds C12 = 4 * SIZE, C4 + } + { .mfb + nop __LINE__ + FMA f89 = f33, f51, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f90 = f34, f51, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f91 = f35, f51, f91 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f76 = f36, f49, f76 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f84 = f36, f50, f84 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f92 = f36, f51, f92 // A5 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f77 = f37, f49, f77 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f85 = f37, f50, f85 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f93 = f37, f51, f93 // A6 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f78 = f38, f49, f78 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f86 = f38, f50, f86 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ 
+ FMA f94 = f38, f51, f94 // A7 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f79 = f39, f49, f79 // A8 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f87 = f39, f50, f87 // A8 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f95 = f39, f51, f95 // A8 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f90 = f42, f59, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f91 = f43, f59, f91 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f76 = f44, f57, f76 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f84 = f44, f58, f84 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f92 = f44, f59, f92 // A5 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f77 = f45, f57, f77 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f85 = f45, f58, f85 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f93 = f45, f59, f93 // A6 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f78 = f46, f57, f78 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f86 = f46, f58, f86 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f94 = f46, f59, f94 // A7 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f79 = f47, f57, f79 // A8 * B2 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f87 = f47, f58, f87 // A8 * B3 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f95 = f47, f59, f95 // A8 * B4 + br.cloop.sptk.few .L053 + } + ;; + .align 8 + +.L058: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -8, KK +#else + adds r2 = -4, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + shladd AOFFSET 
= r2, 3, AORIG + shladd BOFFSET = r2, 2, B + ;; +#endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [BOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [BOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [BOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [BOFFSET], 2 * SIZE + ;; + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + ;; + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + ;; + LDFPD f52, f53 = [BOFFSET], 2 * SIZE + ;; + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + ;; + LDFPD f56, f57 = [BOFFSET], 2 * SIZE + ;; + LDFPD f58, f59 = [BOFFSET], 2 * SIZE + ;; + LDFPD f60, f61 = [BOFFSET], 2 * SIZE + ;; + LDFPD f62, f63 = [BOFFSET] + adds BOFFSET = -30 * SIZE, BOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + FSUB f80 = f34, f80 + FSUB f88 = f35, f88 + + FSUB f65 = f36, f65 + FSUB f73 = f37, f73 + FSUB f81 = f38, f81 + FSUB f89 = f39, f89 + + FSUB f66 = f40, f66 + FSUB f74 = f41, f74 + FSUB f82 = f42, f82 + FSUB f90 = f43, f90 + + FSUB f67 = f44, f67 + FSUB f75 = f45, f75 + FSUB f83 = f46, f83 + FSUB f91 = f47, f91 + + FSUB f68 = f48, f68 + FSUB f76 = f49, f76 + FSUB f84 = f50, f84 + FSUB f92 = f51, f92 + + FSUB f69 = f52, f69 + FSUB f77 = f53, f77 + FSUB f85 = f54, f85 + FSUB f93 = f55, f93 + + FSUB f70 = f56, f70 + FSUB f78 = f57, f78 + FSUB f86 = f58, f86 + FSUB f94 = f59, f94 + + FSUB f71 = f60, f71 + FSUB f79 = f61, f79 + FSUB f87 = f62, f87 + FSUB f95 = f63, f95 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [AOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [AOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [AOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [AOFFSET], 2 * SIZE + ;; + LDFPD f48, f49 = [AOFFSET], 2 * SIZE + ;; + LDFPD f50, f51 = [AOFFSET], 2 * SIZE + ;; + LDFPD f52, f53 = [AOFFSET], 2 * SIZE + ;; + LDFPD f54, f55 = [AOFFSET], 2 * SIZE + ;; + LDFPD f56, f57 = [AOFFSET], 2 * SIZE + ;; + LDFPD f58, f59 = [AOFFSET], 2 * SIZE + ;; + LDFPD f60, f61 = [AOFFSET], 2 * SIZE + ;; + LDFPD f62, f63 = [AOFFSET] + adds AOFFSET = -30 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f66 = f34, f66 + FSUB f67 = f35, f67 + FSUB f68 = f36, f68 + FSUB f69 = f37, f69 + FSUB f70 = f38, f70 + FSUB f71 = f39, f71 + ;; + FSUB f72 = f40, f72 + FSUB f73 = f41, f73 + FSUB f74 = f42, f74 + FSUB f75 = f43, f75 + FSUB f76 = f44, f76 + FSUB f77 = f45, f77 + FSUB f78 = f46, f78 + FSUB f79 = f47, f79 + ;; + FSUB f80 = f48, f80 + FSUB f81 = f49, f81 + FSUB f82 = f50, f82 + FSUB f83 = f51, f83 + FSUB f84 = f52, f84 + FSUB f85 = f53, f85 + FSUB f86 = f54, f86 + FSUB f87 = f55, f87 + + FSUB f88 = f56, f88 + FSUB f89 = f57, f89 + FSUB f90 = f58, f90 + FSUB f91 = f59, f91 + FSUB f92 = f60, f92 + FSUB f93 = f61, f93 + FSUB f94 = f62, f94 + FSUB f95 = f63, f95 + ;; +#endif + +#ifdef LN + adds AOFFSET = 62 * SIZE, AOFFSET + ;; + LDFPD f33, f32 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f35, f34 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f37, f36 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f39, f38 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFD f40 = [AOFFSET], -2 * SIZE + ;; + LDFPD f42, f41 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f44, f43 = 
[AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f46, f45 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f48, f47 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f50, f49 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f52, f51 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFD f53 = [AOFFSET], -2 * SIZE + ;; + LDFPD f55, f54 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f57, f56 = [AOFFSET] + adds AOFFSET = - 6 * SIZE, AOFFSET + ;; + LDFPD f59, f58 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f61, f60 = [AOFFSET] + adds AOFFSET = - 6 * SIZE, AOFFSET + ;; + LDFD f16 = [AOFFSET], -2 * SIZE + ;; + LDFPD f18, f17 = [AOFFSET] + adds AOFFSET = - 8 * SIZE, AOFFSET + ;; + LDFPD f20, f19 = [AOFFSET] + adds AOFFSET = - 8 * SIZE, AOFFSET + ;; + LDFD f21 = [AOFFSET] + ;; + FMPY f71 = f71, f32 + FMPY f79 = f79, f32 + FMPY f87 = f87, f32 + FMPY f95 = f95, f32 + ;; + FNMA f70 = f71, f33, f70 + FNMA f78 = f79, f33, f78 + FNMA f86 = f87, f33, f86 + FNMA f94 = f95, f33, f94 + ;; + FNMA f69 = f71, f34, f69 + FNMA f77 = f79, f34, f77 + FNMA f85 = f87, f34, f85 + FNMA f93 = f95, f34, f93 + ;; + FNMA f68 = f71, f35, f68 + FNMA f76 = f79, f35, f76 + FNMA f84 = f87, f35, f84 + FNMA f92 = f95, f35, f92 + ;; + FNMA f67 = f71, f36, f67 + FNMA f75 = f79, f36, f75 + FNMA f83 = f87, f36, f83 + FNMA f91 = f95, f36, f91 + ;; + FNMA f66 = f71, f37, f66 + FNMA f74 = f79, f37, f74 + FNMA f82 = f87, f37, f82 + FNMA f90 = f95, f37, f90 + ;; + FNMA f65 = f71, f38, f65 + FNMA f73 = f79, f38, f73 + FNMA f81 = f87, f38, f81 + FNMA f89 = f95, f38, f89 + ;; + FNMA f64 = f71, f39, f64 + FNMA f72 = f79, f39, f72 + FNMA f80 = f87, f39, f80 + FNMA f88 = f95, f39, f88 + ;; + FMPY f70 = f70, f40 + FMPY f78 = f78, f40 + FMPY f86 = f86, f40 + FMPY f94 = f94, f40 + ;; + FNMA f69 = f70, f41, f69 + FNMA f77 = f78, f41, f77 + FNMA f85 = f86, f41, f85 + FNMA f93 = f94, f41, f93 + ;; + FNMA f68 = f70, f42, f68 + FNMA f76 = f78, f42, f76 + FNMA f84 = f86, f42, f84 + FNMA f92 = f94, f42, f92 + ;; + FNMA f67 = f70, f43, f67 + FNMA f75 = f78, f43, f75 + FNMA f83 = f86, f43, f83 + FNMA f91 = f94, f43, f91 + ;; + FNMA f66 = f70, f44, f66 + FNMA f74 = f78, f44, f74 + FNMA f82 = f86, f44, f82 + FNMA f90 = f94, f44, f90 + ;; + FNMA f65 = f70, f45, f65 + FNMA f73 = f78, f45, f73 + FNMA f81 = f86, f45, f81 + FNMA f89 = f94, f45, f89 + ;; + FNMA f64 = f70, f46, f64 + FNMA f72 = f78, f46, f72 + FNMA f80 = f86, f46, f80 + FNMA f88 = f94, f46, f88 + ;; + FMPY f69 = f69, f47 + FMPY f77 = f77, f47 + FMPY f85 = f85, f47 + FMPY f93 = f93, f47 + ;; + FNMA f68 = f69, f48, f68 + FNMA f76 = f77, f48, f76 + FNMA f84 = f85, f48, f84 + FNMA f92 = f93, f48, f92 + ;; + FNMA f67 = f69, f49, f67 + FNMA f75 = f77, f49, f75 + FNMA f83 = f85, f49, f83 + FNMA f91 = f93, f49, f91 + ;; + FNMA f66 = f69, f50, f66 + FNMA f74 = f77, f50, f74 + FNMA f82 = f85, f50, f82 + FNMA f90 = f93, f50, f90 + ;; + FNMA f65 = f69, f51, f65 + FNMA f73 = f77, f51, f73 + FNMA f81 = f85, f51, f81 + FNMA f89 = f93, f51, f89 + ;; + FNMA f64 = f69, f52, f64 + FNMA f72 = f77, f52, f72 + FNMA f80 = f85, f52, f80 + FNMA f88 = f93, f52, f88 + ;; + FMPY f68 = f68, f53 + FMPY f76 = f76, f53 + FMPY f84 = f84, f53 + FMPY f92 = f92, f53 + ;; + FNMA f67 = f68, f54, f67 + FNMA f75 = f76, f54, f75 + FNMA f83 = f84, f54, f83 + FNMA f91 = f92, f54, f91 + ;; + FNMA f66 = f68, f55, f66 + FNMA f74 = f76, f55, f74 + FNMA f82 = f84, f55, f82 + FNMA f90 = f92, f55, f90 + ;; + FNMA f65 = f68, f56, f65 + FNMA f73 = f76, f56, f73 
+ FNMA f81 = f84, f56, f81 + FNMA f89 = f92, f56, f89 + ;; + FNMA f64 = f68, f57, f64 + FNMA f72 = f76, f57, f72 + FNMA f80 = f84, f57, f80 + FNMA f88 = f92, f57, f88 + ;; + FMPY f67 = f67, f58 + FMPY f75 = f75, f58 + FMPY f83 = f83, f58 + FMPY f91 = f91, f58 + ;; + FNMA f66 = f67, f59, f66 + FNMA f74 = f75, f59, f74 + FNMA f82 = f83, f59, f82 + FNMA f90 = f91, f59, f90 + ;; + FNMA f65 = f67, f60, f65 + FNMA f73 = f75, f60, f73 + FNMA f81 = f83, f60, f81 + FNMA f89 = f91, f60, f89 + ;; + FNMA f64 = f67, f61, f64 + FNMA f72 = f75, f61, f72 + FNMA f80 = f83, f61, f80 + FNMA f88 = f91, f61, f88 + ;; + FMPY f66 = f66, f16 + FMPY f74 = f74, f16 + FMPY f82 = f82, f16 + FMPY f90 = f90, f16 + ;; + FNMA f65 = f66, f17, f65 + FNMA f73 = f74, f17, f73 + FNMA f81 = f82, f17, f81 + FNMA f89 = f90, f17, f89 + ;; + FNMA f64 = f66, f18, f64 + FNMA f72 = f74, f18, f72 + FNMA f80 = f82, f18, f80 + FNMA f88 = f90, f18, f88 + ;; + FMPY f65 = f65, f19 + FMPY f73 = f73, f19 + FMPY f81 = f81, f19 + FMPY f89 = f89, f19 + ;; + FNMA f64 = f65, f20, f64 + FNMA f72 = f73, f20, f72 + FNMA f80 = f81, f20, f80 + FNMA f88 = f89, f20, f88 + ;; + FMPY f64 = f64, f21 + FMPY f72 = f72, f21 + FMPY f80 = f80, f21 + FMPY f88 = f88, f21 + ;; + + adds BOFFSET = 24 * SIZE, BOFFSET + adds BOFFSET2 = 24 * SIZE, BOFFSET2 + ;; + STFD [BOFFSET] = f70, SIZE + STFD [BOFFSET2] = f71, SIZE + ;; + STFD [BOFFSET] = f78, SIZE + STFD [BOFFSET2] = f79, SIZE + ;; + STFD [BOFFSET] = f86, SIZE + STFD [BOFFSET2] = f87, SIZE + ;; + STFD [BOFFSET] = f94, - 11 * SIZE + STFD [BOFFSET2] = f95, - 11 * SIZE + ;; + STFD [BOFFSET] = f68, SIZE + STFD [BOFFSET2] = f69, SIZE + ;; + STFD [BOFFSET] = f76, SIZE + STFD [BOFFSET2] = f77, SIZE + ;; + STFD [BOFFSET] = f84, SIZE + STFD [BOFFSET2] = f85, SIZE + ;; + STFD [BOFFSET] = f92, - 11 * SIZE + STFD [BOFFSET2] = f93, - 11 * SIZE + ;; + STFD [BOFFSET] = f66, SIZE + STFD [BOFFSET2] = f67, SIZE + ;; + STFD [BOFFSET] = f74, SIZE + STFD [BOFFSET2] = f75, SIZE + ;; + STFD [BOFFSET] = f82, SIZE + STFD [BOFFSET2] = f83, SIZE + ;; + STFD [BOFFSET] = f90, - 11 * SIZE + STFD [BOFFSET2] = f91, - 11 * SIZE + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f65, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f73, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f81, SIZE + ;; + STFD [BOFFSET] = f88, - 3 * SIZE + STFD [BOFFSET2] = f89, - 3 * SIZE + ;; + adds C1 = -8 * SIZE, C1 + adds C2 = -8 * SIZE, C2 + adds C3 = -8 * SIZE, C3 + adds C4 = -8 * SIZE, C4 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f40 = [AOFFSET], 1 * SIZE + ;; + LDFPD f41, f42 = [AOFFSET], 2 * SIZE + ;; + LDFPD f43, f44 = [AOFFSET], 2 * SIZE + ;; + LDFPD f45, f46 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f47, f48 = [AOFFSET], 2 * SIZE + ;; + LDFPD f49, f50 = [AOFFSET], 2 * SIZE + ;; + LDFPD f51, f52 = [AOFFSET] + adds AOFFSET = 5 * SIZE, AOFFSET + ;; + LDFD f53 = [AOFFSET], 1 * SIZE + ;; + LDFPD f54, f55 = [AOFFSET], 2 * SIZE + ;; + LDFPD f56, f57 = [AOFFSET] + adds AOFFSET = 6 * SIZE, AOFFSET + ;; + LDFPD f58, f59 = [AOFFSET], 2 * SIZE + ;; + LDFPD f60, f61 = [AOFFSET] + adds AOFFSET = 7 * SIZE, AOFFSET + ;; + LDFD f16 = [AOFFSET], 1 * SIZE + ;; + LDFPD f17, f18 = [AOFFSET] + adds AOFFSET = 8 * SIZE, AOFFSET + ;; + LDFPD f19, f20 = [AOFFSET] + adds AOFFSET = 9 * SIZE, AOFFSET + ;; + LDFD f21 = [AOFFSET] + adds AOFFSET = -63 * 
SIZE, AOFFSET + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + FMPY f80 = f80, f32 + FMPY f88 = f88, f32 + ;; + FNMA f65 = f64, f33, f65 + FNMA f73 = f72, f33, f73 + FNMA f81 = f80, f33, f81 + FNMA f89 = f88, f33, f89 + ;; + FNMA f66 = f64, f34, f66 + FNMA f74 = f72, f34, f74 + FNMA f82 = f80, f34, f82 + FNMA f90 = f88, f34, f90 + ;; + FNMA f67 = f64, f35, f67 + FNMA f75 = f72, f35, f75 + FNMA f83 = f80, f35, f83 + FNMA f91 = f88, f35, f91 + ;; + FNMA f68 = f64, f36, f68 + FNMA f76 = f72, f36, f76 + FNMA f84 = f80, f36, f84 + FNMA f92 = f88, f36, f92 + ;; + FNMA f69 = f64, f37, f69 + FNMA f77 = f72, f37, f77 + FNMA f85 = f80, f37, f85 + FNMA f93 = f88, f37, f93 + ;; + FNMA f70 = f64, f38, f70 + FNMA f78 = f72, f38, f78 + FNMA f86 = f80, f38, f86 + FNMA f94 = f88, f38, f94 + ;; + FNMA f71 = f64, f39, f71 + FNMA f79 = f72, f39, f79 + FNMA f87 = f80, f39, f87 + FNMA f95 = f88, f39, f95 + ;; + FMPY f65 = f65, f40 + FMPY f73 = f73, f40 + FMPY f81 = f81, f40 + FMPY f89 = f89, f40 + ;; + FNMA f66 = f65, f41, f66 + FNMA f74 = f73, f41, f74 + FNMA f82 = f81, f41, f82 + FNMA f90 = f89, f41, f90 + ;; + FNMA f67 = f65, f42, f67 + FNMA f75 = f73, f42, f75 + FNMA f83 = f81, f42, f83 + FNMA f91 = f89, f42, f91 + ;; + FNMA f68 = f65, f43, f68 + FNMA f76 = f73, f43, f76 + FNMA f84 = f81, f43, f84 + FNMA f92 = f89, f43, f92 + ;; + FNMA f69 = f65, f44, f69 + FNMA f77 = f73, f44, f77 + FNMA f85 = f81, f44, f85 + FNMA f93 = f89, f44, f93 + ;; + FNMA f70 = f65, f45, f70 + FNMA f78 = f73, f45, f78 + FNMA f86 = f81, f45, f86 + FNMA f94 = f89, f45, f94 + ;; + FNMA f71 = f65, f46, f71 + FNMA f79 = f73, f46, f79 + FNMA f87 = f81, f46, f87 + FNMA f95 = f89, f46, f95 + ;; + FMPY f66 = f66, f47 + FMPY f74 = f74, f47 + FMPY f82 = f82, f47 + FMPY f90 = f90, f47 + ;; + FNMA f67 = f66, f48, f67 + FNMA f75 = f74, f48, f75 + FNMA f83 = f82, f48, f83 + FNMA f91 = f90, f48, f91 + ;; + FNMA f68 = f66, f49, f68 + FNMA f76 = f74, f49, f76 + FNMA f84 = f82, f49, f84 + FNMA f92 = f90, f49, f92 + ;; + FNMA f69 = f66, f50, f69 + FNMA f77 = f74, f50, f77 + FNMA f85 = f82, f50, f85 + FNMA f93 = f90, f50, f93 + ;; + FNMA f70 = f66, f51, f70 + FNMA f78 = f74, f51, f78 + FNMA f86 = f82, f51, f86 + FNMA f94 = f90, f51, f94 + ;; + FNMA f71 = f66, f52, f71 + FNMA f79 = f74, f52, f79 + FNMA f87 = f82, f52, f87 + FNMA f95 = f90, f52, f95 + ;; + FMPY f67 = f67, f53 + FMPY f75 = f75, f53 + FMPY f83 = f83, f53 + FMPY f91 = f91, f53 + ;; + FNMA f68 = f67, f54, f68 + FNMA f76 = f75, f54, f76 + FNMA f84 = f83, f54, f84 + FNMA f92 = f91, f54, f92 + ;; + FNMA f69 = f67, f55, f69 + FNMA f77 = f75, f55, f77 + FNMA f85 = f83, f55, f85 + FNMA f93 = f91, f55, f93 + ;; + FNMA f70 = f67, f56, f70 + FNMA f78 = f75, f56, f78 + FNMA f86 = f83, f56, f86 + FNMA f94 = f91, f56, f94 + ;; + FNMA f71 = f67, f57, f71 + FNMA f79 = f75, f57, f79 + FNMA f87 = f83, f57, f87 + FNMA f95 = f91, f57, f95 + ;; + FMPY f68 = f68, f58 + FMPY f76 = f76, f58 + FMPY f84 = f84, f58 + FMPY f92 = f92, f58 + ;; + FNMA f69 = f68, f59, f69 + FNMA f77 = f76, f59, f77 + FNMA f85 = f84, f59, f85 + FNMA f93 = f92, f59, f93 + ;; + FNMA f70 = f68, f60, f70 + FNMA f78 = f76, f60, f78 + FNMA f86 = f84, f60, f86 + FNMA f94 = f92, f60, f94 + ;; + FNMA f71 = f68, f61, f71 + FNMA f79 = f76, f61, f79 + FNMA f87 = f84, f61, f87 + FNMA f95 = f92, f61, f95 + ;; + FMPY f69 = f69, f16 + FMPY f77 = f77, f16 + FMPY f85 = f85, f16 + FMPY f93 = f93, f16 + ;; + FNMA f70 = f69, f17, f70 + FNMA f78 = f77, f17, f78 + FNMA f86 = f85, f17, f86 + FNMA f94 = f93, f17, f94 + ;; + FNMA f71 = f69, f18, f71 + FNMA f79 = 
f77, f18, f79 + FNMA f87 = f85, f18, f87 + FNMA f95 = f93, f18, f95 + ;; + FMPY f70 = f70, f19 + FMPY f78 = f78, f19 + FMPY f86 = f86, f19 + FMPY f94 = f94, f19 + ;; + FNMA f71 = f70, f20, f71 + FNMA f79 = f78, f20, f79 + FNMA f87 = f86, f20, f87 + FNMA f95 = f94, f20, f95 + ;; + FMPY f71 = f71, f21 + FMPY f79 = f79, f21 + FMPY f87 = f87, f21 + FMPY f95 = f95, f21 + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f65, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f73, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f81, SIZE + ;; + STFD [BOFFSET] = f88, 5 * SIZE + STFD [BOFFSET2] = f89, 5 * SIZE + ;; + STFD [BOFFSET] = f66, SIZE + STFD [BOFFSET2] = f67, SIZE + ;; + STFD [BOFFSET] = f74, SIZE + STFD [BOFFSET2] = f75, SIZE + ;; + STFD [BOFFSET] = f82, SIZE + STFD [BOFFSET2] = f83, SIZE + ;; + STFD [BOFFSET] = f90, 5 * SIZE + STFD [BOFFSET2] = f91, 5 * SIZE + ;; + STFD [BOFFSET] = f68, SIZE + STFD [BOFFSET2] = f69, SIZE + ;; + STFD [BOFFSET] = f76, SIZE + STFD [BOFFSET2] = f77, SIZE + ;; + STFD [BOFFSET] = f84, SIZE + STFD [BOFFSET2] = f85, SIZE + ;; + STFD [BOFFSET] = f92, 5 * SIZE + STFD [BOFFSET2] = f93, 5 * SIZE + ;; + STFD [BOFFSET] = f70, SIZE + STFD [BOFFSET2] = f71, SIZE + ;; + STFD [BOFFSET] = f78, SIZE + STFD [BOFFSET2] = f79, SIZE + ;; + STFD [BOFFSET] = f86, SIZE + STFD [BOFFSET2] = f87, SIZE + ;; + STFD [BOFFSET] = f94 + STFD [BOFFSET2] = f95 + adds C9 = 4 * SIZE, C1 + adds BOFFSET = - 27 * SIZE, BOFFSET + adds BOFFSET2 = - 27 * SIZE, BOFFSET2 + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f36 = [BOFFSET], 1 * SIZE + ;; + LDFPD f37, f38 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f39, f40 = [BOFFSET] + adds BOFFSET = 5 * SIZE, BOFFSET + ;; + LDFD f41 = [BOFFSET], -15 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f68 = f68, f32 + FMPY f65 = f65, f32 + FMPY f69 = f69, f32 + FMPY f66 = f66, f32 + FMPY f70 = f70, f32 + FMPY f67 = f67, f32 + FMPY f71 = f71, f32 + ;; + FNMA f72 = f64, f33, f72 + FNMA f76 = f68, f33, f76 + FNMA f73 = f65, f33, f73 + FNMA f77 = f69, f33, f77 + FNMA f74 = f66, f33, f74 + FNMA f78 = f70, f33, f78 + FNMA f75 = f67, f33, f75 + FNMA f79 = f71, f33, f79 + ;; + FNMA f80 = f64, f34, f80 + FNMA f84 = f68, f34, f84 + FNMA f81 = f65, f34, f81 + FNMA f85 = f69, f34, f85 + FNMA f82 = f66, f34, f82 + FNMA f86 = f70, f34, f86 + FNMA f83 = f67, f34, f83 + FNMA f87 = f71, f34, f87 + ;; + FNMA f88 = f64, f35, f88 + FNMA f92 = f68, f35, f92 + FNMA f89 = f65, f35, f89 + FNMA f93 = f69, f35, f93 + FNMA f90 = f66, f35, f90 + FNMA f94 = f70, f35, f94 + FNMA f91 = f67, f35, f91 + FNMA f95 = f71, f35, f95 + ;; + FMPY f72 = f72, f36 + FMPY f76 = f76, f36 + FMPY f73 = f73, f36 + FMPY f77 = f77, f36 + FMPY f74 = f74, f36 + FMPY f78 = f78, f36 + FMPY f75 = f75, f36 + FMPY f79 = f79, f36 + ;; + FNMA f80 = f72, f37, f80 + FNMA f84 = f76, f37, f84 + FNMA f81 = f73, f37, f81 + FNMA f85 = f77, f37, f85 + FNMA f82 = f74, f37, f82 + FNMA f86 = f78, f37, f86 + FNMA f83 = f75, f37, f83 + FNMA f87 = f79, f37, f87 + ;; + FNMA f88 = f72, f38, f88 + FNMA f92 = f76, f38, f92 + FNMA f89 = f73, f38, f89 + FNMA f93 = f77, f38, f93 + FNMA f90 = f74, f38, f90 + FNMA f94 = f78, f38, f94 + FNMA f91 = f75, f38, f91 + FNMA f95 = f79, f38, f95 + ;; + FMPY f80 = f80, f39 + FMPY f84 = f84, f39 + FMPY f81 = f81, f39 + FMPY f85 = f85, f39 + FMPY f82 = f82, f39 + FMPY f86 = f86, f39 + FMPY f83 = f83, f39 + FMPY f87 = f87, f39 + ;; + FNMA f88 = f80, f40, f88 + FNMA f92 = f84, 
f40, f92 + FNMA f89 = f81, f40, f89 + FNMA f93 = f85, f40, f93 + FNMA f90 = f82, f40, f90 + FNMA f94 = f86, f40, f94 + FNMA f91 = f83, f40, f91 + FNMA f95 = f87, f40, f95 + ;; + FMPY f88 = f88, f41 + FMPY f92 = f92, f41 + FMPY f89 = f89, f41 + FMPY f93 = f93, f41 + FMPY f90 = f90, f41 + FMPY f94 = f94, f41 + FMPY f91 = f91, f41 + FMPY f95 = f95, f41 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f68, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f69, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f70, SIZE + ;; + STFD [AOFFSET] = f67, 5 * SIZE + STFD [AOFFSET2] = f71, 5 * SIZE + ;; + STFD [AOFFSET] = f72, SIZE + STFD [AOFFSET2] = f76, SIZE + ;; + STFD [AOFFSET] = f73, SIZE + STFD [AOFFSET2] = f77, SIZE + ;; + STFD [AOFFSET] = f74, SIZE + STFD [AOFFSET2] = f78, SIZE + ;; + STFD [AOFFSET] = f75, 5 * SIZE + STFD [AOFFSET2] = f79, 5 * SIZE + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f84, SIZE + ;; + STFD [AOFFSET] = f81, SIZE + STFD [AOFFSET2] = f85, SIZE + ;; + STFD [AOFFSET] = f82, SIZE + STFD [AOFFSET2] = f86, SIZE + ;; + STFD [AOFFSET] = f83, 5 * SIZE + STFD [AOFFSET2] = f87, 5 * SIZE + ;; + STFD [AOFFSET] = f88, SIZE + STFD [AOFFSET2] = f92, SIZE + ;; + STFD [AOFFSET] = f89, SIZE + STFD [AOFFSET2] = f93, SIZE + ;; + STFD [AOFFSET] = f90, SIZE + STFD [AOFFSET2] = f94, SIZE + ;; + STFD [AOFFSET] = f91, -27 * SIZE + STFD [AOFFSET2] = f95, -27 * SIZE + ;; +#endif + +#ifdef RT + adds BOFFSET = 14 * SIZE, BOFFSET + ;; + LDFPD f33, f32 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f35, f34 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFD f36 = [BOFFSET], -2 * SIZE + ;; + LDFPD f38, f37 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f40, f39 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFD f41 = [BOFFSET] + ;; + + FMPY f88 = f88, f32 + FMPY f92 = f92, f32 + FMPY f89 = f89, f32 + FMPY f93 = f93, f32 + FMPY f90 = f90, f32 + FMPY f94 = f94, f32 + FMPY f91 = f91, f32 + FMPY f95 = f95, f32 + ;; + FNMA f80 = f88, f33, f80 + FNMA f84 = f92, f33, f84 + FNMA f81 = f89, f33, f81 + FNMA f85 = f93, f33, f85 + FNMA f82 = f90, f33, f82 + FNMA f86 = f94, f33, f86 + FNMA f83 = f91, f33, f83 + FNMA f87 = f95, f33, f87 + ;; + FNMA f72 = f88, f34, f72 + FNMA f76 = f92, f34, f76 + FNMA f73 = f89, f34, f73 + FNMA f77 = f93, f34, f77 + FNMA f74 = f90, f34, f74 + FNMA f78 = f94, f34, f78 + FNMA f75 = f91, f34, f75 + FNMA f79 = f95, f34, f79 + ;; + FNMA f64 = f88, f35, f64 + FNMA f68 = f92, f35, f68 + FNMA f65 = f89, f35, f65 + FNMA f69 = f93, f35, f69 + FNMA f66 = f90, f35, f66 + FNMA f70 = f94, f35, f70 + FNMA f67 = f91, f35, f67 + FNMA f71 = f95, f35, f71 + ;; + FMPY f80 = f80, f36 + FMPY f84 = f84, f36 + FMPY f81 = f81, f36 + FMPY f85 = f85, f36 + FMPY f82 = f82, f36 + FMPY f86 = f86, f36 + FMPY f83 = f83, f36 + FMPY f87 = f87, f36 + ;; + FNMA f72 = f80, f37, f72 + FNMA f76 = f84, f37, f76 + FNMA f73 = f81, f37, f73 + FNMA f77 = f85, f37, f77 + FNMA f74 = f82, f37, f74 + FNMA f78 = f86, f37, f78 + FNMA f75 = f83, f37, f75 + FNMA f79 = f87, f37, f79 + ;; + FNMA f64 = f80, f38, f64 + FNMA f68 = f84, f38, f68 + FNMA f65 = f81, f38, f65 + FNMA f69 = f85, f38, f69 + FNMA f66 = f82, f38, f66 + FNMA f70 = f86, f38, f70 + FNMA f67 = f83, f38, f67 + FNMA f71 = f87, f38, f71 + ;; + FMPY f72 = f72, f39 + FMPY f76 = f76, f39 + FMPY f73 = f73, f39 + FMPY f77 = f77, f39 + FMPY f74 = f74, f39 + FMPY f78 = f78, f39 + FMPY f75 = f75, f39 + FMPY f79 = f79, f39 + ;; + FNMA f64 = f72, f40, f64 + FNMA f68 = f76, f40, f68 + FNMA f65 = f73, 
f40, f65 + FNMA f69 = f77, f40, f69 + FNMA f66 = f74, f40, f66 + FNMA f70 = f78, f40, f70 + FNMA f67 = f75, f40, f67 + FNMA f71 = f79, f40, f71 + ;; + FMPY f64 = f64, f41 + FMPY f68 = f68, f41 + FMPY f65 = f65, f41 + FMPY f69 = f69, f41 + FMPY f66 = f66, f41 + FMPY f70 = f70, f41 + FMPY f67 = f67, f41 + FMPY f71 = f71, f41 + ;; + adds AOFFSET = 24 * SIZE, AOFFSET + adds AOFFSET2 = 24 * SIZE, AOFFSET2 + ;; + STFD [AOFFSET] = f88, SIZE + STFD [AOFFSET2] = f92, SIZE + ;; + STFD [AOFFSET] = f89, SIZE + STFD [AOFFSET2] = f93, SIZE + ;; + STFD [AOFFSET] = f90, SIZE + STFD [AOFFSET2] = f94, SIZE + ;; + STFD [AOFFSET] = f91, - 11 * SIZE + STFD [AOFFSET2] = f95, - 11 * SIZE + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f84, SIZE + ;; + STFD [AOFFSET] = f81, SIZE + STFD [AOFFSET2] = f85, SIZE + ;; + STFD [AOFFSET] = f82, SIZE + STFD [AOFFSET2] = f86, SIZE + ;; + STFD [AOFFSET] = f83, - 11 * SIZE + STFD [AOFFSET2] = f87, - 11 * SIZE + ;; + STFD [AOFFSET] = f72, SIZE + STFD [AOFFSET2] = f76, SIZE + ;; + STFD [AOFFSET] = f73, SIZE + STFD [AOFFSET2] = f77, SIZE + ;; + STFD [AOFFSET] = f74, SIZE + STFD [AOFFSET2] = f78, SIZE + ;; + STFD [AOFFSET] = f75, - 11 * SIZE + STFD [AOFFSET2] = f79, - 11 * SIZE + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f68, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f69, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f70, SIZE + ;; + STFD [AOFFSET] = f67, - 3 * SIZE + STFD [AOFFSET2] = f71, - 3 * SIZE + ;; + +#endif + adds C9 = 4 * SIZE, C1 + ;; + + { .mmf + STFD [C1 ] = f64, SIZE + STFD [C9 ] = f68, SIZE + mov f64 = f0 + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + STFD [C9 ] = f69, SIZE + adds C10 = 4 * SIZE, C2 + } + ;; + { .mmi + STFD [C1 ] = f66, SIZE + STFD [C9 ] = f70, SIZE + } + ;; + { .mmi +#ifndef LN + STFD [C1 ] = f67, 5 * SIZE +#else + STFD [C1 ] = f67, - 3 * SIZE +#endif + STFD [C9 ] = f71 + adds C11 = 4 * SIZE, C3 + } + ;; + { .mmf + STFD [C2 ] = f72, SIZE + STFD [C10] = f76, SIZE + mov f72 = f0 + } + ;; + { .mmi + STFD [C2 ] = f73, SIZE + STFD [C10] = f77, SIZE + } + ;; + { .mmi + STFD [C2 ] = f74, SIZE + STFD [C10] = f78, SIZE + adds C12 = 4 * SIZE, C4 + } + ;; + { .mmi +#ifndef LN + STFD [C2 ] = f75, 5 * SIZE +#else + STFD [C2 ] = f75, - 3 * SIZE +#endif + STFD [C10] = f79 + } + ;; + { .mmf + STFD [C3 ] = f80, SIZE + STFD [C11] = f84, SIZE + } + ;; + { .mmi + STFD [C3 ] = f81, SIZE + STFD [C11] = f85, SIZE + } + ;; + { .mmi + STFD [C3 ] = f82, SIZE + STFD [C11] = f86, SIZE + } + ;; + { .mmi +#ifndef LN + STFD [C3 ] = f83, 5 * SIZE +#else + STFD [C3 ] = f83, - 3 * SIZE +#endif + STFD [C11] = f87 + } + ;; + { .mmf + STFD [C4 ] = f88, SIZE + STFD [C12] = f92, SIZE + } + ;; + { .mmi + STFD [C4 ] = f89, SIZE + STFD [C12] = f93, SIZE + } + ;; + { .mmi + STFD [C4 ] = f90, SIZE + STFD [C12] = f94, SIZE + + } + ;; + { .mmi +#ifndef LN + STFD [C4 ] = f91, 5 * SIZE +#else + STFD [C4 ] = f91, - 3 * SIZE +#endif + STFD [C12] = f95 + cmp.ne p6, p0 = 1, I + } + ;; + adds I = -1, I + ;; + { .mmi + shladd r2 = K, BASE_SHIFT, r0 + } + ;; + { .mmi + sub L = K, KK + } + ;; + { .mmi +#ifdef RT + shladd AORIG = r2, 3, AORIG +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd AOFFSET = L, 3, AOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd BOFFSET = L, 2, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#ifdef LT + adds KK = 8, KK 
+#elif defined LN + adds KK = -8, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + mov f64 = f0 + mov f72 = f0 + mov f80 = f0 + mov f88 = f0 + mov f65 = f0 + mov f73 = f0 + mov f81 = f0 + mov f89 = f0 + + { .mmb + (p6) br.cond.dptk .L052 + } + ;; + .align 8 + +.L089: +#ifdef LN + shladd KK8 = K, BASE_SHIFT, r0 + ;; + shladd B = KK8, 2, B +#endif + +#if defined(LT) || defined(RN) + mov B = BOFFSET +#endif + +#ifdef RN + adds KK = 4, KK +#endif + +#ifdef RT + adds KK = -4, KK +#endif + ;; + mov AOFFSET = A + ;; + .align 16 + +.L090: + tbit.z p6, p0 = N, 1 + (p6) br.cond.dpnt .L130 + ;; + +#ifdef RT + { .mmi + shladd r3 = LDC, 1, r0 + nop __LINE__ + shl r2 = K, 1 + BASE_SHIFT + } + ;; + { .mmi + sub B = B, r2 + sub C = C, r3 + nop __LINE__ + } +#endif + ;; + mov f64 = f0 + mov f65 = f0 + mov f66 = f0 + mov f67 = f0 + + mov f72 = f0 + mov f73 = f0 + mov f74 = f0 + mov f75 = f0 + ;; + { .mfi + mov C1 = C // coffset1 = c + 0 * ldc +#ifdef LN + add KK = M, OFFSET +#elif defined LT + mov KK = OFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmf +#if defined(LN) || defined(RT) + mov AORIG = A +#else + mov AOFFSET = A +#endif + } + { .mmf + add C2 = LDC, C // coffset2 = c + 1 * ldc + } + ;; + { .mfi +#ifndef RT + shladd C = LDC, 1, C // coffset += 8 * ldc +#else + nop __LINE__ +#endif + mov f81 = f0 +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + + tbit.z p6, p7 = M, 0 + (p6) br.cond.dptk .L110 + ;; + + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 0 + BASE_SHIFT + } + { .mmi + shladd r3 = KK, BASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mmf + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 1, B +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + add AOFFSET = r3, AORIG + } + ;; +#endif + { .mmi + adds L = 1, L + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mii + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + adds L = -1, L + } + ;; + { .mmi + cmp.eq p6, p0 = -1, L + } + ;; + { .mib + (p7) LDFD f32 = [AOFFSET], 1 * SIZE + mov ar.lc = L + (p6) br.cond.dpnt .L128 + } + ;; + .align 8 + +.L122: + { .mfi + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mmi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + (p3) LDFD f40 = [AOFFSET], 1 * SIZE + nop __LINE__ + } + { .mmi + nop __LINE__ + nop __LINE__ + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + adds L = -1, L + } + { .mfb + (p4) LDFD f32 = [AOFFSET], 1 * SIZE + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + br.cloop.sptk.few .L122 + } + ;; + +.L128: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -1, KK +#else + adds r2 = -2, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + add AOFFSET = r2, AORIG + shladd BOFFSET = r2, 1, B + ;; +#endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET] + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + ;; +#else + LDFPD f32, f33 = [AOFFSET] + ;; + FSUB f64 = f32, f64 + FSUB f72 = 
f33, f72 + ;; +#endif + +#ifdef LN + LDFD f32 = [AOFFSET] + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + ;; + { .mmi + STFD [BOFFSET] = f64, SIZE + adds C1 = -1 * SIZE, C1 + } + ;; + { .mmi + STFD [BOFFSET] = f72, -SIZE + adds C2 = -1 * SIZE, C2 + } + ;; +#endif + +#ifdef LT + LDFD f32 = [AOFFSET] + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + ;; + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f72, -SIZE + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f34 = [BOFFSET], -3 * SIZE + ;; + FMPY f64 = f64, f32 + ;; + FNMA f72 = f64, f33, f72 + ;; + FMPY f72 = f72, f34 + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f72, -SIZE + ;; +#endif + +#ifdef RT + adds BOFFSET = 2 * SIZE, BOFFSET + ;; + LDFPD f33, f32 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFD f34 = [BOFFSET] + ;; + FMPY f72 = f72, f32 + ;; + FNMA f64 = f72, f33, f64 + ;; + FMPY f64 = f64, f34 + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f72, -SIZE + ;; +#endif + +#ifndef LN + STFD [C1 ] = f64, SIZE +#else + STFD [C1 ] = f64 +#endif +#ifndef LN + STFD [C2 ] = f72, SIZE +#else + STFD [C2 ] = f72 +#endif + + mov f64 = f0 + mov f72 = f0 + ;; + shladd r2 = K, BASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + add AORIG = r2, AORIG +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + add AOFFSET = L, AOFFSET +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + shladd BOFFSET = L, 1, BOFFSET +#else + nop __LINE__ +#endif + ;; +#ifdef LT + adds KK = 1, KK +#elif defined LN + adds KK = -1, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + .align 8 + +.L110: + tbit.z p6, p7 = M, 1 + (p6) br.cond.dptk .L100 + ;; + + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 1 + BASE_SHIFT + } + { .mmi + shladd r3 = KK, BASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mmf + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 1, B +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + shladd AOFFSET = r3, 1, AORIG + } + ;; +#endif + { .mfi + adds L = 1, L + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + tbit.z p12, p0 = L, 0 + } + { .mfi + shr L = L, 1 + } + ;; + { .mmf + adds L = -1, L + } + ;; + { .mmf + cmp.eq p6, p0 = -1, L + } + ;; + { .mib + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov ar.lc = L + (p6) br.cond.dpnt .L118 + } + ;; + +.L112: + { .mfi + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + lfetch.nt1 [PREB], 4 * SIZE + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mmf + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f48, f49 = [BOFFSET], 2 * 
SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + br.cloop.sptk.few .L112 + } + ;; + .align 8 + +.L118: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -2, KK +#else + adds r2 = -2, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 1, AORIG + shladd BOFFSET = r2, 1, B + ;; +#endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET] + adds BOFFSET = -2 * SIZE, BOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + FSUB f65 = f34, f65 + FSUB f73 = f35, f73 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET] + adds AOFFSET = -2 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f72 = f34, f72 + FSUB f73 = f35, f73 + ;; +#endif + +#ifdef LN + adds AOFFSET = 2 * SIZE, AOFFSET + ;; + LDFPD f33, f32 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFD f34 = [AOFFSET] + ;; + FMPY f65 = f65, f32 + FMPY f73 = f73, f32 + ;; + FNMA f64 = f65, f33, f64 + FNMA f72 = f73, f33, f72 + ;; + FMPY f64 = f64, f34 + FMPY f72 = f72, f34 + ;; + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + ;; + STFD [BOFFSET] = f73, - 3 * SIZE + ;; + adds C1 = -2 * SIZE, C1 + adds C2 = -2 * SIZE, C2 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f34 = [AOFFSET], - 3 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + ;; + FNMA f65 = f64, f33, f65 + FNMA f73 = f72, f33, f73 + ;; + FMPY f65 = f65, f34 + FMPY f73 = f73, f34 + ;; + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + ;; + STFD [BOFFSET] = f73, -3 * SIZE + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f34 = [BOFFSET], -3 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + ;; + FNMA f72 = f64, f33, f72 + FNMA f73 = f65, f33, f73 + ;; + FMPY f72 = f72, f34 + FMPY f73 = f73, f34 + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + ;; + STFD [AOFFSET] = f72, SIZE + ;; + STFD [AOFFSET] = f73, -3 * SIZE + ;; +#endif + +#ifdef RT + adds BOFFSET = 2 * SIZE, BOFFSET + ;; + LDFPD f33, f32 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFD f34 = [BOFFSET] + ;; + FMPY f72 = f72, f32 + FMPY f73 = f73, f32 + ;; + FNMA f64 = f72, f33, f64 + FNMA f65 = f73, f33, f65 + ;; + FMPY f64 = f64, f34 + FMPY f65 = f65, f34 + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + ;; + STFD [AOFFSET] = f72, SIZE + ;; + STFD [AOFFSET] = f73, -3 * SIZE + ;; +#endif + STFD [C1 ] = f64, SIZE + mov f64 = f0 + ;; +#ifndef LN + STFD [C1 ] = f65, SIZE +#else + STFD [C1 ] = f65, -SIZE +#endif + ;; + STFD [C2 ] = f72, SIZE + mov f72 = f0 + ;; +#ifndef LN + STFD [C2 ] = f73, SIZE +#else + STFD [C2 ] = f73, -SIZE +#endif + ;; + mov f65 = f0 + mov f73 = f0 + ;; + shladd r2 = K, BASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + shladd AORIG = r2, 1, AORIG +#else + nop __LINE__ +#endif + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd AOFFSET = L, 1, AOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd BOFFSET = L, 1, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#ifdef LT + 
adds KK = 2, KK +#elif defined LN + adds KK = -2, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + .align 8 + +.L100: + tbit.z p6, p7 = M, 2 + (p6) br.cond.dptk .L091 + ;; + + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 2 + BASE_SHIFT + } + { .mmi + shladd r3 = KK, BASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mmf + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f65 = f0 + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 1, B +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + shladd AOFFSET = r3, 2, AORIG + } + ;; +#endif + { .mfi + adds L = 1, L + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + tbit.z p12, p0 = L, 0 + } + { .mfi + shr L = L, 1 + } + ;; + { .mfi + adds L = -1, L + } + ;; + { .mfi + cmp.eq p6, p0 = -1, L + } + ;; + { .mmf + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + } + { .mfi + mov ar.lc = L + } + ;; + { .mmf + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + } + { .mfb + (p6) br.cond.dpnt .L108 + } + ;; + +.L102: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 4 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + adds C9 = 2 * SIZE, C1 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + adds C10 = 2 * SIZE, C2 + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + br.cloop.sptk.few .L102 + } + ;; + .align 8 + +.L108: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -4, KK +#else + adds r2 = -2, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 2, AORIG + shladd BOFFSET = r2, 1, B + ;; +#endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET] + adds BOFFSET = -6 * SIZE, BOFFSET + ;; + FSUB f64 = f32, f64 
+ FSUB f72 = f33, f72 + ;; + FSUB f65 = f34, f65 + FSUB f73 = f35, f73 + ;; + FSUB f66 = f36, f66 + FSUB f74 = f37, f74 + ;; + FSUB f67 = f38, f67 + FSUB f75 = f39, f75 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET] + adds AOFFSET = -6 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f66 = f34, f66 + FSUB f67 = f35, f67 + + FSUB f72 = f36, f72 + FSUB f73 = f37, f73 + FSUB f74 = f38, f74 + FSUB f75 = f39, f75 + ;; +#endif + +#ifdef LN + adds AOFFSET = 14 * SIZE, AOFFSET + ;; + LDFPD f33, f32 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f35, f34 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFD f36 = [AOFFSET], - 2 * SIZE + ;; + LDFPD f38, f37 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f40, f39 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFD f41 = [AOFFSET] + ;; + FMPY f67 = f67, f32 + FMPY f75 = f75, f32 + ;; + FNMA f66 = f67, f33, f66 + FNMA f74 = f75, f33, f74 + ;; + FNMA f65 = f67, f34, f65 + FNMA f73 = f75, f34, f73 + ;; + FNMA f64 = f67, f35, f64 + FNMA f72 = f75, f35, f72 + ;; + FMPY f66 = f66, f36 + FMPY f74 = f74, f36 + ;; + FNMA f65 = f66, f37, f65 + FNMA f73 = f74, f37, f73 + ;; + FNMA f64 = f66, f38, f64 + FNMA f72 = f74, f38, f72 + ;; + FMPY f65 = f65, f39 + FMPY f73 = f73, f39 + ;; + FNMA f64 = f65, f40, f64 + FNMA f72 = f73, f40, f72 + ;; + FMPY f64 = f64, f41 + FMPY f72 = f72, f41 + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f66, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f74, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f67, SIZE + ;; + STFD [BOFFSET] = f73, -3 * SIZE + STFD [BOFFSET2] = f75, -3 * SIZE + ;; + adds C1 = -4 * SIZE, C1 + adds C2 = -4 * SIZE, C2 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f36 = [AOFFSET], 1 * SIZE + ;; + LDFPD f37, f38 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f39, f40 = [AOFFSET] + adds AOFFSET = 5 * SIZE, AOFFSET + ;; + LDFD f41 = [AOFFSET], -15 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + ;; + FNMA f65 = f64, f33, f65 + FNMA f73 = f72, f33, f73 + ;; + FNMA f66 = f64, f34, f66 + FNMA f74 = f72, f34, f74 + ;; + FNMA f67 = f64, f35, f67 + FNMA f75 = f72, f35, f75 + ;; + FMPY f65 = f65, f36 + FMPY f73 = f73, f36 + ;; + FNMA f66 = f65, f37, f66 + FNMA f74 = f73, f37, f74 + ;; + FNMA f67 = f65, f38, f67 + FNMA f75 = f73, f38, f75 + ;; + FMPY f66 = f66, f39 + FMPY f74 = f74, f39 + ;; + FNMA f67 = f66, f40, f67 + FNMA f75 = f74, f40, f75 + ;; + FMPY f67 = f67, f41 + FMPY f75 = f75, f41 + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f66, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f74, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f67, SIZE + ;; + STFD [BOFFSET] = f73, -3 * SIZE + STFD [BOFFSET2] = f75, -3 * SIZE + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f34 = [BOFFSET], -3 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + FMPY f66 = f66, f32 + FMPY f67 = f67, f32 + ;; + FNMA f72 = f64, f33, f72 + FNMA f73 = f65, f33, f73 + FNMA f74 = f66, f33, f74 + FNMA f75 = f67, f33, f75 + ;; + FMPY f72 = f72, f34 + FMPY f73 = f73, f34 + FMPY f74 = f74, f34 + FMPY f75 = f75, f34 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f72, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD 
[AOFFSET2] = f73, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f74, SIZE + ;; + STFD [AOFFSET] = f67, -3 * SIZE + STFD [AOFFSET2] = f75, -3 * SIZE + ;; +#endif + +#ifdef RT + adds BOFFSET = 2 * SIZE, BOFFSET + ;; + LDFPD f33, f32 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFD f34 = [BOFFSET] + ;; + FMPY f72 = f72, f32 + FMPY f73 = f73, f32 + FMPY f74 = f74, f32 + FMPY f75 = f75, f32 + ;; + FNMA f64 = f72, f33, f64 + FNMA f65 = f73, f33, f65 + FNMA f66 = f74, f33, f66 + FNMA f67 = f75, f33, f67 + ;; + FMPY f64 = f64, f34 + FMPY f65 = f65, f34 + FMPY f66 = f66, f34 + FMPY f67 = f67, f34 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f72, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f73, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f74, SIZE + ;; + STFD [AOFFSET] = f67, - 3 * SIZE + STFD [AOFFSET2] = f75, - 3 * SIZE + ;; +#endif + { .mmf + STFD [C1 ] = f64, SIZE + mov f64 = f0 + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + } + ;; + { .mmi + STFD [C1 ] = f66, SIZE + } + ;; + { .mmi +#ifndef LN + STFD [C1 ] = f67, SIZE +#else + STFD [C1 ] = f67, - 3 * SIZE +#endif + } + ;; + { .mmf + STFD [C2 ] = f72, SIZE + mov f72 = f0 + } + ;; + { .mmi + STFD [C2 ] = f73, SIZE + } + ;; + { .mmi + STFD [C2 ] = f74, SIZE + } + ;; + { .mmi +#ifndef LN + STFD [C2 ] = f75, SIZE +#else + STFD [C2 ] = f75, - 3 * SIZE +#endif + } + ;; + mov f65 = f0 + mov f73 = f0 + mov f66 = f0 + mov f74 = f0 + mov f67 = f0 + mov f75 = f0 + ;; + shladd r2 = K, BASE_SHIFT, r0 + ;; + { .mmi + sub L = K, KK + } + ;; + { .mmi +#ifdef RT + shladd AORIG = r2, 2, AORIG +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd AOFFSET = L, 2, AOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd BOFFSET = L, 1, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#ifdef LT + adds KK = 4, KK +#elif defined LN + adds KK = -4, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + .align 8 + +.L091: + shr I = M, 3 + ;; + cmp.eq p6, p7 = 0, I + (p6) br.cond.dpnt .L129 + ;; + .align 16 + +.L092: + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 3 + BASE_SHIFT + } + { .mmi + shladd r3 = KK, BASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mmi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 1, B +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + shladd AOFFSET = r3, 3, AORIG + } + ;; +#endif + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + { .mmf + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + } + ;; + { .mmf + (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + } + { .mfi + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mmf + (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + } + { .mfi + adds PREC = CPREFETCHSIZE * SIZE, C1 + } + ;; + { .mmf + CPREFETCH [PREC], LDC + } + { .mfi + adds L = 1, L + } + ;; + { .mmf + CPREFETCH [PREC] + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + } + ;; + { .mfi + adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET + } + ;; + { .mfi + tbit.z p12, p0 = L, 0 + } + { .mfi + shr L = L, 1 + } + ;; + { .mfi + adds L = -1, L + } + ;; + { .mfi + mov ar.lc = L + } + ;; + mov f68 = f0 + mov f69 
= f0 + mov f70 = f0 + mov f71 = f0 + mov f76 = f0 + mov f77 = f0 + mov f78 = f0 + mov f79 = f0 + ;; + { .mfb + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L098 + } + ;; + .align 8 + +.L093: +/* 1 */ + { .mfi + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 4 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + adds C9 = 4 * SIZE, C1 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + adds C10 = 4 * SIZE, C2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + adds C11 = 4 * SIZE, C3 + } + { .mfi + nop __LINE__ + FMA f74 = f34, f49, f74 // A3 * B2 + adds C12 = 4 * SIZE, C4 + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f76 = f36, f49, f76 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f77 = f37, f49, f77 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f78 = f38, f49, f78 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f79 = f39, f49, f79 // A8 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f76 = f44, f57, f76 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f77 = f45, f57, f77 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f78 = f46, f57, f78 // A7 * B2 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f79 = f47, f57, f79 // A8 * B2 + br.cloop.sptk.few .L093 + } + ;; + .align 8 + +.L098: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -8, KK +#else + adds r2 = -2, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 3, AORIG + shladd BOFFSET = r2, 1, B + ;; +#endif + 
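+/* -------------------------------------------------------------------- */
+/* NOTE (added explanatory comment; an assumption about the imported    */
+/* GotoBLAS2 code, not part of the original source): the tail below     */
+/* appears to perform the triangular solve on the accumulated 8x2       */
+/* register block.  The FSUB pass subtracts the accumulated products    */
+/* from the packed right-hand side, and the FMPY/FNMA chains then do    */
+/* the substitution, roughly                                            */
+/*     x_i = (b_i - sum over already-solved j of a_ij * x_j) * d_i,     */
+/* where d_i is assumed to be the reciprocal of the diagonal entry as   */
+/* stored by the TRSM packing routine (hence FMPY rather than a         */
+/* divide).  LN and RT sweep the block backwards, LT and RN forwards;   */
+/* the solved values are written back to the packed buffer and to the   */
+/* C columns.                                                           */
+/* -------------------------------------------------------------------- */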
adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [BOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [BOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [BOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [BOFFSET] + adds BOFFSET = -14 * SIZE, BOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + FSUB f65 = f34, f65 + FSUB f73 = f35, f73 + + FSUB f66 = f36, f66 + FSUB f74 = f37, f74 + FSUB f67 = f38, f67 + FSUB f75 = f39, f75 + + FSUB f68 = f40, f68 + FSUB f76 = f41, f76 + FSUB f69 = f42, f69 + FSUB f77 = f43, f77 + + FSUB f70 = f44, f70 + FSUB f78 = f45, f78 + FSUB f71 = f46, f71 + FSUB f79 = f47, f79 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [AOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [AOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [AOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [AOFFSET] + adds AOFFSET = -14 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f66 = f34, f66 + FSUB f67 = f35, f67 + FSUB f68 = f36, f68 + FSUB f69 = f37, f69 + FSUB f70 = f38, f70 + FSUB f71 = f39, f71 + ;; + FSUB f72 = f40, f72 + FSUB f73 = f41, f73 + FSUB f74 = f42, f74 + FSUB f75 = f43, f75 + FSUB f76 = f44, f76 + FSUB f77 = f45, f77 + FSUB f78 = f46, f78 + FSUB f79 = f47, f79 + ;; +#endif + +#ifdef LN + adds AOFFSET = 62 * SIZE, AOFFSET + ;; + LDFPD f33, f32 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f35, f34 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f37, f36 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f39, f38 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFD f40 = [AOFFSET], -2 * SIZE + ;; + LDFPD f42, f41 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f44, f43 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f46, f45 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f48, f47 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f50, f49 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f52, f51 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFD f53 = [AOFFSET], -2 * SIZE + ;; + LDFPD f55, f54 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f57, f56 = [AOFFSET] + adds AOFFSET = - 6 * SIZE, AOFFSET + ;; + LDFPD f59, f58 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f61, f60 = [AOFFSET] + adds AOFFSET = - 6 * SIZE, AOFFSET + ;; + LDFD f16 = [AOFFSET], -2 * SIZE + ;; + LDFPD f18, f17 = [AOFFSET] + adds AOFFSET = - 8 * SIZE, AOFFSET + ;; + LDFPD f20, f19 = [AOFFSET] + adds AOFFSET = - 8 * SIZE, AOFFSET + ;; + LDFD f21 = [AOFFSET] + ;; + FMPY f71 = f71, f32 + FMPY f79 = f79, f32 + ;; + FNMA f70 = f71, f33, f70 + FNMA f78 = f79, f33, f78 + ;; + FNMA f69 = f71, f34, f69 + FNMA f77 = f79, f34, f77 + ;; + FNMA f68 = f71, f35, f68 + FNMA f76 = f79, f35, f76 + ;; + FNMA f67 = f71, f36, f67 + FNMA f75 = f79, f36, f75 + ;; + FNMA f66 = f71, f37, f66 + FNMA f74 = f79, f37, f74 + ;; + FNMA f65 = f71, f38, f65 + FNMA f73 = f79, f38, f73 + ;; + FNMA f64 = f71, f39, f64 + FNMA f72 = f79, f39, f72 + ;; + FMPY f70 = f70, f40 + FMPY f78 = f78, f40 + ;; + FNMA f69 = f70, f41, f69 + FNMA f77 = f78, f41, f77 + ;; + FNMA f68 = f70, f42, f68 + FNMA f76 = 
f78, f42, f76 + ;; + FNMA f67 = f70, f43, f67 + FNMA f75 = f78, f43, f75 + ;; + FNMA f66 = f70, f44, f66 + FNMA f74 = f78, f44, f74 + ;; + FNMA f65 = f70, f45, f65 + FNMA f73 = f78, f45, f73 + ;; + FNMA f64 = f70, f46, f64 + FNMA f72 = f78, f46, f72 + ;; + FMPY f69 = f69, f47 + FMPY f77 = f77, f47 + ;; + FNMA f68 = f69, f48, f68 + FNMA f76 = f77, f48, f76 + ;; + FNMA f67 = f69, f49, f67 + FNMA f75 = f77, f49, f75 + ;; + FNMA f66 = f69, f50, f66 + FNMA f74 = f77, f50, f74 + ;; + FNMA f65 = f69, f51, f65 + FNMA f73 = f77, f51, f73 + ;; + FNMA f64 = f69, f52, f64 + FNMA f72 = f77, f52, f72 + ;; + FMPY f68 = f68, f53 + FMPY f76 = f76, f53 + ;; + FNMA f67 = f68, f54, f67 + FNMA f75 = f76, f54, f75 + ;; + FNMA f66 = f68, f55, f66 + FNMA f74 = f76, f55, f74 + ;; + FNMA f65 = f68, f56, f65 + FNMA f73 = f76, f56, f73 + ;; + FNMA f64 = f68, f57, f64 + FNMA f72 = f76, f57, f72 + ;; + FMPY f67 = f67, f58 + FMPY f75 = f75, f58 + ;; + FNMA f66 = f67, f59, f66 + FNMA f74 = f75, f59, f74 + ;; + FNMA f65 = f67, f60, f65 + FNMA f73 = f75, f60, f73 + ;; + FNMA f64 = f67, f61, f64 + FNMA f72 = f75, f61, f72 + ;; + FMPY f66 = f66, f16 + FMPY f74 = f74, f16 + ;; + FNMA f65 = f66, f17, f65 + FNMA f73 = f74, f17, f73 + ;; + FNMA f64 = f66, f18, f64 + FNMA f72 = f74, f18, f72 + ;; + FMPY f65 = f65, f19 + FMPY f73 = f73, f19 + ;; + FNMA f64 = f65, f20, f64 + FNMA f72 = f73, f20, f72 + ;; + FMPY f64 = f64, f21 + FMPY f72 = f72, f21 + ;; + + adds BOFFSET = 8 * SIZE, BOFFSET + adds BOFFSET2 = 8 * SIZE, BOFFSET2 + ;; + STFD [BOFFSET] = f68, SIZE + STFD [BOFFSET2] = f70, SIZE + ;; + STFD [BOFFSET] = f76, SIZE + STFD [BOFFSET2] = f78, SIZE + ;; + STFD [BOFFSET] = f69, SIZE + STFD [BOFFSET2] = f71, SIZE + ;; + STFD [BOFFSET] = f77, - 11 * SIZE + STFD [BOFFSET2] = f79, - 11 * SIZE + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f66, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f74, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f67, SIZE + ;; + STFD [BOFFSET] = f73, - 3 * SIZE + STFD [BOFFSET2] = f75, - 3 * SIZE + ;; + adds C1 = -8 * SIZE, C1 + adds C2 = -8 * SIZE, C2 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f40 = [AOFFSET], 1 * SIZE + ;; + LDFPD f41, f42 = [AOFFSET], 2 * SIZE + ;; + LDFPD f43, f44 = [AOFFSET], 2 * SIZE + ;; + LDFPD f45, f46 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f47, f48 = [AOFFSET], 2 * SIZE + ;; + LDFPD f49, f50 = [AOFFSET], 2 * SIZE + ;; + LDFPD f51, f52 = [AOFFSET] + adds AOFFSET = 5 * SIZE, AOFFSET + ;; + LDFD f53 = [AOFFSET], 1 * SIZE + ;; + LDFPD f54, f55 = [AOFFSET], 2 * SIZE + ;; + LDFPD f56, f57 = [AOFFSET] + adds AOFFSET = 6 * SIZE, AOFFSET + ;; + LDFPD f58, f59 = [AOFFSET], 2 * SIZE + ;; + LDFPD f60, f61 = [AOFFSET] + adds AOFFSET = 7 * SIZE, AOFFSET + ;; + LDFD f16 = [AOFFSET], 1 * SIZE + ;; + LDFPD f17, f18 = [AOFFSET] + adds AOFFSET = 8 * SIZE, AOFFSET + ;; + LDFPD f19, f20 = [AOFFSET] + adds AOFFSET = 9 * SIZE, AOFFSET + ;; + LDFD f21 = [AOFFSET] + adds AOFFSET = -63 * SIZE, AOFFSET + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + ;; + FNMA f65 = f64, f33, f65 + FNMA f73 = f72, f33, f73 + ;; + FNMA f66 = f64, f34, f66 + FNMA f74 = f72, f34, f74 + ;; + FNMA f67 = f64, f35, f67 + FNMA f75 = f72, f35, f75 + ;; + FNMA f68 = f64, f36, f68 + FNMA f76 = f72, f36, f76 + ;; + FNMA f69 = f64, f37, f69 + FNMA f77 = f72, f37, f77 + ;; + FNMA f70 = f64, f38, 
f70 + FNMA f78 = f72, f38, f78 + ;; + FNMA f71 = f64, f39, f71 + FNMA f79 = f72, f39, f79 + ;; + FMPY f65 = f65, f40 + FMPY f73 = f73, f40 + ;; + FNMA f66 = f65, f41, f66 + FNMA f74 = f73, f41, f74 + ;; + FNMA f67 = f65, f42, f67 + FNMA f75 = f73, f42, f75 + ;; + FNMA f68 = f65, f43, f68 + FNMA f76 = f73, f43, f76 + ;; + FNMA f69 = f65, f44, f69 + FNMA f77 = f73, f44, f77 + ;; + FNMA f70 = f65, f45, f70 + FNMA f78 = f73, f45, f78 + ;; + FNMA f71 = f65, f46, f71 + FNMA f79 = f73, f46, f79 + ;; + FMPY f66 = f66, f47 + FMPY f74 = f74, f47 + ;; + FNMA f67 = f66, f48, f67 + FNMA f75 = f74, f48, f75 + ;; + FNMA f68 = f66, f49, f68 + FNMA f76 = f74, f49, f76 + ;; + FNMA f69 = f66, f50, f69 + FNMA f77 = f74, f50, f77 + ;; + FNMA f70 = f66, f51, f70 + FNMA f78 = f74, f51, f78 + ;; + FNMA f71 = f66, f52, f71 + FNMA f79 = f74, f52, f79 + ;; + FMPY f67 = f67, f53 + FMPY f75 = f75, f53 + ;; + FNMA f68 = f67, f54, f68 + FNMA f76 = f75, f54, f76 + ;; + FNMA f69 = f67, f55, f69 + FNMA f77 = f75, f55, f77 + ;; + FNMA f70 = f67, f56, f70 + FNMA f78 = f75, f56, f78 + ;; + FNMA f71 = f67, f57, f71 + FNMA f79 = f75, f57, f79 + ;; + FMPY f68 = f68, f58 + FMPY f76 = f76, f58 + ;; + FNMA f69 = f68, f59, f69 + FNMA f77 = f76, f59, f77 + ;; + FNMA f70 = f68, f60, f70 + FNMA f78 = f76, f60, f78 + ;; + FNMA f71 = f68, f61, f71 + FNMA f79 = f76, f61, f79 + ;; + FMPY f69 = f69, f16 + FMPY f77 = f77, f16 + ;; + FNMA f70 = f69, f17, f70 + FNMA f78 = f77, f17, f78 + ;; + FNMA f71 = f69, f18, f71 + FNMA f79 = f77, f18, f79 + ;; + FMPY f70 = f70, f19 + FMPY f78 = f78, f19 + ;; + FNMA f71 = f70, f20, f71 + FNMA f79 = f78, f20, f79 + ;; + FMPY f71 = f71, f21 + FMPY f79 = f79, f21 + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f66, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f74, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f67, SIZE + ;; + STFD [BOFFSET] = f73, 5 * SIZE + STFD [BOFFSET2] = f75, 5 * SIZE + ;; + STFD [BOFFSET] = f68, SIZE + STFD [BOFFSET2] = f70, SIZE + ;; + STFD [BOFFSET] = f76, SIZE + STFD [BOFFSET2] = f78, SIZE + ;; + STFD [BOFFSET] = f69, SIZE + STFD [BOFFSET2] = f71, SIZE + ;; + STFD [BOFFSET] = f77, -11 * SIZE + STFD [BOFFSET2] = f79, -11 * SIZE + ;; + adds C9 = 4 * SIZE, C1 + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f34 = [BOFFSET], -3 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f68 = f68, f32 + FMPY f65 = f65, f32 + FMPY f69 = f69, f32 + FMPY f66 = f66, f32 + FMPY f70 = f70, f32 + FMPY f67 = f67, f32 + FMPY f71 = f71, f32 + ;; + FNMA f72 = f64, f33, f72 + FNMA f76 = f68, f33, f76 + FNMA f73 = f65, f33, f73 + FNMA f77 = f69, f33, f77 + FNMA f74 = f66, f33, f74 + FNMA f78 = f70, f33, f78 + FNMA f75 = f67, f33, f75 + FNMA f79 = f71, f33, f79 + ;; + FMPY f72 = f72, f34 + FMPY f76 = f76, f34 + FMPY f73 = f73, f34 + FMPY f77 = f77, f34 + FMPY f74 = f74, f34 + FMPY f78 = f78, f34 + FMPY f75 = f75, f34 + FMPY f79 = f79, f34 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f68, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f69, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f70, SIZE + ;; + STFD [AOFFSET] = f67, 5 * SIZE + STFD [AOFFSET2] = f71, 5 * SIZE + ;; + STFD [AOFFSET] = f72, SIZE + STFD [AOFFSET2] = f76, SIZE + ;; + STFD [AOFFSET] = f73, SIZE + STFD [AOFFSET2] = f77, SIZE + ;; + STFD [AOFFSET] = f74, SIZE + STFD [AOFFSET2] = f78, SIZE + ;; + STFD [AOFFSET] = f75, -11 * SIZE + STFD [AOFFSET2] = f79, -11 * SIZE + ;; +#endif + +#ifdef RT + adds BOFFSET = 2 * SIZE, BOFFSET + ;; + LDFPD 
f33, f32 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFD f34 = [BOFFSET] + ;; + + FMPY f72 = f72, f32 + FMPY f76 = f76, f32 + FMPY f73 = f73, f32 + FMPY f77 = f77, f32 + FMPY f74 = f74, f32 + FMPY f78 = f78, f32 + FMPY f75 = f75, f32 + FMPY f79 = f79, f32 + ;; + FNMA f64 = f72, f33, f64 + FNMA f68 = f76, f33, f68 + FNMA f65 = f73, f33, f65 + FNMA f69 = f77, f33, f69 + FNMA f66 = f74, f33, f66 + FNMA f70 = f78, f33, f70 + FNMA f67 = f75, f33, f67 + FNMA f71 = f79, f33, f71 + ;; + FMPY f64 = f64, f34 + FMPY f68 = f68, f34 + FMPY f65 = f65, f34 + FMPY f69 = f69, f34 + FMPY f66 = f66, f34 + FMPY f70 = f70, f34 + FMPY f67 = f67, f34 + FMPY f71 = f71, f34 + ;; + adds AOFFSET = 8 * SIZE, AOFFSET + adds AOFFSET2 = 8 * SIZE, AOFFSET2 + ;; + STFD [AOFFSET] = f72, SIZE + STFD [AOFFSET2] = f76, SIZE + ;; + STFD [AOFFSET] = f73, SIZE + STFD [AOFFSET2] = f77, SIZE + ;; + STFD [AOFFSET] = f74, SIZE + STFD [AOFFSET2] = f78, SIZE + ;; + STFD [AOFFSET] = f75, - 11 * SIZE + STFD [AOFFSET2] = f79, - 11 * SIZE + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f68, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f69, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f70, SIZE + ;; + STFD [AOFFSET] = f67, - 3 * SIZE + STFD [AOFFSET2] = f71, - 3 * SIZE + ;; + +#endif + adds C9 = 4 * SIZE, C1 + ;; + + { .mmf + STFD [C1 ] = f64, SIZE + STFD [C9 ] = f68, SIZE + mov f64 = f0 + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + STFD [C9 ] = f69, SIZE + adds C10 = 4 * SIZE, C2 + } + ;; + { .mmi + STFD [C1 ] = f66, SIZE + STFD [C9 ] = f70, SIZE + } + ;; + { .mmi +#ifndef LN + STFD [C1 ] = f67, 5 * SIZE +#else + STFD [C1 ] = f67, - 3 * SIZE +#endif + STFD [C9 ] = f71 + adds C11 = 4 * SIZE, C3 + } + ;; + { .mmf + STFD [C2 ] = f72, SIZE + STFD [C10] = f76, SIZE + mov f72 = f0 + } + ;; + { .mmi + STFD [C2 ] = f73, SIZE + STFD [C10] = f77, SIZE + } + ;; + { .mmi + STFD [C2 ] = f74, SIZE + STFD [C10] = f78, SIZE + adds C12 = 4 * SIZE, C4 + } + ;; + { .mmi +#ifndef LN + STFD [C2 ] = f75, 5 * SIZE +#else + STFD [C2 ] = f75, - 3 * SIZE +#endif + STFD [C10] = f79 + } + ;; + { .mmf + cmp.ne p6, p0 = 1, I + } + ;; + adds I = -1, I + ;; + { .mmi + shladd r2 = K, BASE_SHIFT, r0 + } + ;; + { .mmi + sub L = K, KK + } + ;; + { .mmi +#ifdef RT + shladd AORIG = r2, 3, AORIG +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd AOFFSET = L, 3, AOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd BOFFSET = L, 1, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#ifdef LT + adds KK = 8, KK +#elif defined LN + adds KK = -8, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + + mov f64 = f0 + mov f65 = f0 + mov f66 = f0 + mov f67 = f0 + mov f72 = f0 + mov f73 = f0 + mov f74 = f0 + mov f75 = f0 + + (p6) br.cond.dptk .L092 + ;; + .align 8 + +.L129: +#ifdef LN + shladd KK8 = K, BASE_SHIFT, r0 + ;; + shladd B = KK8, 1, B +#endif + +#if defined(LT) || defined(RN) + mov B = BOFFSET +#endif + +#ifdef RN + adds KK = 2, KK +#endif + +#ifdef RT + adds KK = -2, KK +#endif + ;; + mov AOFFSET = A + ;; + .align 16 + +.L130: + tbit.z p6, p0 = N, 0 + (p6) br.cond.dpnt .L999 + ;; + +#ifdef RT + { .mmi + nop __LINE__ + shl r2 = K, BASE_SHIFT + } + ;; + { .mmi + sub B = B, r2 + sub C = C, LDC + nop __LINE__ + } +#endif + ;; + mov f64 = f0 + mov f65 = 
f0 + mov f66 = f0 + mov f67 = f0 + + mov f68 = f0 + mov f69 = f0 + mov f70 = f0 + mov f71 = f0 + ;; + + { .mfi + mov C1 = C // coffset1 = c + 0 * ldc +#ifdef LN + add KK = M, OFFSET +#elif defined LT + mov KK = OFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmf +#if defined(LN) || defined(RT) + mov AORIG = A +#else + mov AOFFSET = A +#endif + } + ;; + { .mfi +#ifndef RT + add C = C, LDC // coffset += 8 * ldc +#else + nop __LINE__ +#endif +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + +.L160: + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + tbit.z p6, p7 = M, 0 + (p6) br.cond.dptk .L150 + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 0 + BASE_SHIFT + } + ;; + shladd r3 = KK, BASE_SHIFT, r0 + ;; +#if defined(LT) || defined(RN) + { .mmi + (p7) LDFD f48 = [BOFFSET], 1 * SIZE + nop __LINE__ + adds L = 1, L + } + ;; +#else + { .mmi + shladd BOFFSET = KK, BASE_SHIFT, B + nop __LINE__ +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mmi + (p7) LDFD f48 = [BOFFSET], 1 * SIZE + adds L = 1, L + add AOFFSET = r3, AORIG + } + ;; +#endif + ;; + { .mii + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + cmp.eq p6, p0 = 0, L + adds L = -1, L + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mib + (p7) LDFD f32 = [AOFFSET], 1 * SIZE + mov ar.lc = L + (p6) br.cond.dpnt .L168 + } + ;; + .align 8 + +.L162: + { .mmf + cmp.ne p4, p5 = 0, L + (p12) cmp.ne p3, p0 = 0, L + FMA f64 = f32, f48, f64 // A1 * B1 + } + ;; + { .mmi + (p3) LDFD f56 = [BOFFSET], 1 * SIZE + (p3) LDFD f40 = [AOFFSET], 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p4) LDFD f32 = [AOFFSET], 1 * SIZE + nop __LINE__ + adds L = -1, L + } + { .mfb + (p4) LDFD f48 = [BOFFSET], 1 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + br.cloop.sptk.few .L162 + } + ;; + .align 8 + +.L168: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -1, KK +#else + adds r2 = -1, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + add AOFFSET = r2, AORIG + add BOFFSET = r2, B + ;; +#endif + +#if defined(LN) || defined(LT) + { .mmi + LDFD f32 = [BOFFSET] + LDFD f33 = [AOFFSET] +#ifdef LN + adds C1 = -1 * SIZE, C1 +#else + nop __LINE__ +#endif + } + ;; +#else + { .mmi + LDFD f32 = [AOFFSET] + LDFD f33 = [BOFFSET] + nop __LINE__ + } + ;; +#endif + + { .mmf + sub L = K, KK +#ifdef RT + shladd AORIG = K, BASE_SHIFT, AORIG +#else + nop __LINE__ +#endif + FSUB f64 = f32, f64 + } + ;; +#ifdef LT + adds KK = 1, KK +#elif defined LN + adds KK = -1, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + FMPY f64 = f64, f33 + ;; +#if defined(LN) || defined(LT) + { .mmf + STFD [BOFFSET] = f64 +#ifndef LN + STFD [C1 ] = f64, SIZE +#else + STFD [C1 ] = f64 +#endif + mov f64 = f0 + } + ;; +#else + { .mmf + STFD [AOFFSET] = f64 + STFD [C1 ] = f64, SIZE + mov f64 = f0 + } + ;; +#endif + +#if defined(LT) || defined(RN) + shladd AOFFSET = L, BASE_SHIFT, AOFFSET +#else + nop __LINE__ +#endif +#if defined(LT) || defined(RN) + shladd BOFFSET = L, BASE_SHIFT, BOFFSET +#else + nop __LINE__ +#endif + ;; + .align 8 + +.L150: + tbit.z p6, p7 = M, 1 + (p6) br.cond.dptk .L140 + ;; + + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 1 + BASE_SHIFT + } + ;; + shladd r3 = KK, BASE_SHIFT, r0 + ;; +#if defined(LT) || defined(RN) + { .mmf + (p7) LDFD f48 = [BOFFSET], 1 * 
SIZE + } + ;; +#else + { .mfi + shladd BOFFSET = KK, BASE_SHIFT, B +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFD f48 = [BOFFSET], 1 * SIZE + shladd AOFFSET = r3, 1, AORIG + } + ;; +#endif + { .mfi + adds L = 1, L + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + tbit.z p12, p0 = L, 0 + } + { .mfi + shr L = L, 1 + } + ;; + { .mmf + adds L = -1, L + } + ;; + { .mmf + cmp.eq p6, p0 = -1, L + } + ;; + (p7) LDFD f32 = [AOFFSET], SIZE + ;; + (p7) LDFD f33 = [AOFFSET], SIZE + ;; + ;; + { .mib + mov ar.lc = L + (p6) br.cond.dpnt .L158 + } + ;; + +.L152: + { .mfi + cmp.ne p4, p5 = 0, L + FMA f64 = f32, f48, f64 // A1 * B1 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mmf + (p3) LDFD f56 = [BOFFSET], 1 * SIZE + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + } + ;; + { .mfi + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + adds L = -1, L + } + ;; + { .mfb + (p4) LDFD f48 = [BOFFSET], 1 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + br.cloop.sptk.few .L152 + } + ;; + +.L158: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -2, KK +#else + adds r2 = -1, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 1, AORIG + add BOFFSET = r2, B + ;; +#endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET] + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + ;; +#else + LDFPD f32, f33 = [AOFFSET] + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + ;; +#endif + +#ifdef LN + adds AOFFSET = 2 * SIZE, AOFFSET + ;; + LDFPD f33, f32 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFD f34 = [AOFFSET] + ;; + FMPY f65 = f65, f32 + ;; + FNMA f64 = f65, f33, f64 + ;; + FMPY f64 = f64, f34 + ;; + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f65, - SIZE + ;; + adds C1 = -2 * SIZE, C1 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f34 = [AOFFSET], - 3 * SIZE + ;; + FMPY f64 = f64, f32 + ;; + FNMA f65 = f64, f33, f65 + ;; + FMPY f65 = f65, f34 + ;; + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f65, -SIZE + ;; +#endif + +#ifdef RN + LDFD f32 = [BOFFSET] + ;; + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f65, - SIZE + ;; +#endif + +#ifdef RT + LDFD f32 = [BOFFSET] + ;; + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f65, - SIZE + ;; +#endif + STFD [C1 ] = f64, SIZE + ;; +#ifndef LN + STFD [C1 ] = f65, SIZE +#else + STFD [C1 ] = f65, -SIZE +#endif + ;; + mov f64 = f0 + mov f65 = f0 + ;; + shladd r2 = K, BASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + shladd AORIG = r2, 1, AORIG +#else + nop __LINE__ +#endif + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd AOFFSET = L, 1, AOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + add BOFFSET = L, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#ifdef LT + adds KK = 2, KK +#elif defined LN + adds KK = -2, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + .align 8 + +.L140: + tbit.z p6, p7 = M, 2 + (p6) br.cond.dptk .L131 + ;; + + { .mib +#if defined(LT) || 
defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 2 + BASE_SHIFT + } + ;; + shladd r3 = KK, BASE_SHIFT, r0 + ;; +#if defined(LT) || defined(RN) + { .mmf + (p7) LDFD f48 = [BOFFSET], 1 * SIZE + mov f65 = f0 + } + ;; +#else + { .mfi + shladd BOFFSET = KK, BASE_SHIFT, B +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFD f48 = [BOFFSET], 1 * SIZE + shladd AOFFSET = r3, 2, AORIG + } + ;; +#endif + { .mfi + adds L = 1, L + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + tbit.z p12, p0 = L, 0 + } + { .mfi + shr L = L, 1 + } + ;; + { .mfi + adds L = -1, L + } + ;; + { .mfi + cmp.eq p6, p0 = -1, L + } + ;; + { .mmf + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + } + { .mfi + mov ar.lc = L + } + ;; + { .mmf + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + } + { .mfb + (p6) br.cond.dpnt .L148 + } + ;; + +.L142: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f65 = f33, f48, f65 // A2 * B1 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + (p5) adds C9 = 2 * SIZE, C1 + } + { .mmf + nop __LINE__ + (p3) LDFD f56 = [BOFFSET], 1 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + } + ;; + { .mfi + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + (p5) adds C10 = 2 * SIZE, C2 + } + { .mfb + nop __LINE__ + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mmf + (p4) LDFD f48 = [BOFFSET], 1 * SIZE + nop __LINE__ + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + } + ;; + { .mfi + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + nop __LINE__ + adds L = -1, L + } + { .mfb + nop __LINE__ + nop.f 0 + br.cloop.sptk.few .L142 + } + ;; + +.L148: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -4, KK +#else + adds r2 = -1, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 2, AORIG + add BOFFSET = r2, B + ;; +#endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET] + adds BOFFSET = -2 * SIZE, BOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f66 = f34, f66 + FSUB f67 = f35, f67 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET] + adds AOFFSET = -2 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f66 = f34, f66 + FSUB f67 = f35, f67 + ;; +#endif + +#ifdef LN + adds AOFFSET = 14 * SIZE, AOFFSET + ;; + LDFPD f33, f32 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f35, f34 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFD f36 = [AOFFSET], - 2 * SIZE + ;; + LDFPD f38, f37 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f40, f39 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFD f41 = [AOFFSET] + ;; + FMPY f67 = f67, f32 + ;; + FNMA f66 = f67, f33, f66 + ;; + FNMA f65 = f67, f34, f65 + ;; + FNMA f64 = f67, f35, f64 + ;; + FMPY f66 = f66, f36 + ;; + FNMA f65 = f66, f37, f65 + ;; + FNMA f64 = f66, f38, f64 + ;; + FMPY f65 = f65, f39 + ;; + FNMA f64 = f65, f40, f64 + ;; + FMPY f64 = f64, f41 + ;; + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = 
f65, SIZE + ;; + STFD [BOFFSET] = f66, SIZE + ;; + STFD [BOFFSET] = f67, -3 * SIZE + ;; + adds C1 = -4 * SIZE, C1 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f36 = [AOFFSET], 1 * SIZE + ;; + LDFPD f37, f38 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f39, f40 = [AOFFSET] + adds AOFFSET = 5 * SIZE, AOFFSET + ;; + LDFD f41 = [AOFFSET], -15 * SIZE + ;; + FMPY f64 = f64, f32 + ;; + FNMA f65 = f64, f33, f65 + ;; + FNMA f66 = f64, f34, f66 + ;; + FNMA f67 = f64, f35, f67 + ;; + FMPY f65 = f65, f36 + ;; + FNMA f66 = f65, f37, f66 + ;; + FNMA f67 = f65, f38, f67 + ;; + FMPY f66 = f66, f39 + ;; + FNMA f67 = f66, f40, f67 + ;; + FMPY f67 = f67, f41 + ;; + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + ;; + STFD [BOFFSET] = f66, SIZE + ;; + STFD [BOFFSET] = f67, -3 * SIZE + ;; +#endif + +#ifdef RN + LDFD f32 = [BOFFSET] + ;; + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + FMPY f66 = f66, f32 + FMPY f67 = f67, f32 + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + ;; + STFD [AOFFSET] = f67, -3 * SIZE + ;; +#endif + +#ifdef RT + LDFD f32 = [BOFFSET] + ;; + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + FMPY f66 = f66, f32 + FMPY f67 = f67, f32 + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + ;; + STFD [AOFFSET] = f67, - 3 * SIZE + ;; +#endif + { .mmf + STFD [C1 ] = f64, SIZE + mov f64 = f0 + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + } + ;; + { .mmi + STFD [C1 ] = f66, SIZE + } + ;; + { .mmi +#ifndef LN + STFD [C1 ] = f67, SIZE +#else + STFD [C1 ] = f67, - 3 * SIZE +#endif + } + ;; + { .mmf + mov f72 = f0 + } + ;; + mov f65 = f0 + mov f73 = f0 + mov f66 = f0 + mov f74 = f0 + mov f67 = f0 + mov f75 = f0 + ;; + shladd r2 = K, BASE_SHIFT, r0 + ;; + { .mmi + sub L = K, KK + } + ;; + { .mmi +#ifdef RT + shladd AORIG = r2, 2, AORIG +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd AOFFSET = L, 2, AOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + add BOFFSET = L, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#ifdef LT + adds KK = 4, KK +#elif defined LN + adds KK = -4, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + .align 8 + +.L131: +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + shr I = M, 3 + ;; + cmp.eq p6, p7 = 0, I + (p6) br.cond.dpnt .L169 + ;; + .align 16 + +.L132: + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 3 + BASE_SHIFT + } + ;; + shladd r3 = KK, BASE_SHIFT, r0 + ;; +#if defined(LT) || defined(RN) + { .mmi + (p7) LDFD f48 = [BOFFSET], 1 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; +#else + { .mfi + shladd BOFFSET = KK, BASE_SHIFT, B +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFD f48 = [BOFFSET], 1 * SIZE + shladd AOFFSET = r3, 3, AORIG + } + ;; +#endif + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + { .mmf + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + } + ;; + { .mmf + (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + } + { .mfi + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mmf + (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + } + { .mfi + adds PREC = CPREFETCHSIZE * SIZE, C1 + } 
+ ;; + { .mmf + CPREFETCH [PREC] + } + { .mfi + adds L = 1, L + } + ;; + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + } + ;; + { .mfi + adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET + } + ;; + { .mfi + tbit.z p12, p0 = L, 0 + } + { .mfi + shr L = L, 1 + } + ;; + { .mfi + adds L = -1, L + } + ;; + { .mfi + mov ar.lc = L + } + ;; + mov f64 = f0 + mov f65 = f0 + mov f66 = f0 + mov f67 = f0 + + mov f68 = f0 + mov f69 = f0 + mov f70 = f0 + mov f71 = f0 + ;; + + { .mfb + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L138 + } + ;; + .align 16 + +.L133: + { .mfi + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + FMA f65 = f33, f48, f65 // A2 * B1 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + adds C9 = 4 * SIZE, C1 + } + { .mmf + (p3) LDFD f56 = [BOFFSET], 1 * SIZE + nop __LINE__ + FMA f67 = f35, f48, f67 // A4 * B1 + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mmf + (p4) LDFD f48 = [BOFFSET], 1 * SIZE + nop __LINE__ + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + nop __LINE__ + br.cloop.sptk.few .L133 + } + ;; + +.L138: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -8, KK +#else + adds r2 = -1, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 3, AORIG + add BOFFSET = r2, B + ;; +#endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET] + adds BOFFSET = -6 * SIZE, BOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f66 = f34, f66 + FSUB f67 = f35, f67 + + FSUB f68 = f36, f68 + FSUB f69 = f37, f69 + FSUB f70 = f38, f70 + FSUB f71 = f39, f71 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET] + adds AOFFSET = -6 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f66 = f34, f66 + FSUB f67 = f35, f67 + FSUB f68 = f36, f68 + FSUB f69 = f37, f69 + FSUB f70 = f38, f70 + FSUB f71 = f39, f71 + ;; +#endif + +#ifdef LN + adds AOFFSET = 62 * SIZE, AOFFSET + ;; + LDFPD f33, f32 = [AOFFSET] + 
adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f35, f34 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f37, f36 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f39, f38 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFD f40 = [AOFFSET], -2 * SIZE + ;; + LDFPD f42, f41 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f44, f43 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f46, f45 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f48, f47 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f50, f49 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f52, f51 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFD f53 = [AOFFSET], -2 * SIZE + ;; + LDFPD f55, f54 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f57, f56 = [AOFFSET] + adds AOFFSET = - 6 * SIZE, AOFFSET + ;; + LDFPD f59, f58 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f61, f60 = [AOFFSET] + adds AOFFSET = - 6 * SIZE, AOFFSET + ;; + LDFD f16 = [AOFFSET], -2 * SIZE + ;; + LDFPD f18, f17 = [AOFFSET] + adds AOFFSET = - 8 * SIZE, AOFFSET + ;; + LDFPD f20, f19 = [AOFFSET] + adds AOFFSET = - 8 * SIZE, AOFFSET + ;; + LDFD f21 = [AOFFSET] + ;; + FMPY f71 = f71, f32 + ;; + FNMA f70 = f71, f33, f70 + ;; + FNMA f69 = f71, f34, f69 + ;; + FNMA f68 = f71, f35, f68 + ;; + FNMA f67 = f71, f36, f67 + ;; + FNMA f66 = f71, f37, f66 + ;; + FNMA f65 = f71, f38, f65 + ;; + FNMA f64 = f71, f39, f64 + ;; + FMPY f70 = f70, f40 + ;; + FNMA f69 = f70, f41, f69 + ;; + FNMA f68 = f70, f42, f68 + ;; + FNMA f67 = f70, f43, f67 + ;; + FNMA f66 = f70, f44, f66 + ;; + FNMA f65 = f70, f45, f65 + ;; + FNMA f64 = f70, f46, f64 + ;; + FMPY f69 = f69, f47 + ;; + FNMA f68 = f69, f48, f68 + ;; + FNMA f67 = f69, f49, f67 + ;; + FNMA f66 = f69, f50, f66 + ;; + FNMA f65 = f69, f51, f65 + ;; + FNMA f64 = f69, f52, f64 + ;; + FMPY f68 = f68, f53 + ;; + FNMA f67 = f68, f54, f67 + ;; + FNMA f66 = f68, f55, f66 + ;; + FNMA f65 = f68, f56, f65 + ;; + FNMA f64 = f68, f57, f64 + ;; + FMPY f67 = f67, f58 + ;; + FNMA f66 = f67, f59, f66 + ;; + FNMA f65 = f67, f60, f65 + ;; + FNMA f64 = f67, f61, f64 + ;; + FMPY f66 = f66, f16 + ;; + FNMA f65 = f66, f17, f65 + ;; + FNMA f64 = f66, f18, f64 + ;; + FMPY f65 = f65, f19 + ;; + FNMA f64 = f65, f20, f64 + ;; + FMPY f64 = f64, f21 + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f68, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f69, SIZE + ;; + STFD [BOFFSET] = f66, SIZE + STFD [BOFFSET2] = f70, SIZE + ;; + STFD [BOFFSET] = f67, - 3 * SIZE + STFD [BOFFSET2] = f71, - 3 * SIZE + ;; + adds C1 = -8 * SIZE, C1 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f40 = [AOFFSET], 1 * SIZE + ;; + LDFPD f41, f42 = [AOFFSET], 2 * SIZE + ;; + LDFPD f43, f44 = [AOFFSET], 2 * SIZE + ;; + LDFPD f45, f46 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f47, f48 = [AOFFSET], 2 * SIZE + ;; + LDFPD f49, f50 = [AOFFSET], 2 * SIZE + ;; + LDFPD f51, f52 = [AOFFSET] + adds AOFFSET = 5 * SIZE, AOFFSET + ;; + LDFD f53 = [AOFFSET], 1 * SIZE + ;; + LDFPD f54, f55 = [AOFFSET], 2 * SIZE + ;; + LDFPD f56, f57 = [AOFFSET] + adds AOFFSET = 6 * SIZE, AOFFSET + ;; + LDFPD f58, f59 = [AOFFSET], 2 * SIZE + ;; + LDFPD f60, f61 = [AOFFSET] + adds AOFFSET = 7 * SIZE, AOFFSET + ;; + LDFD f16 = [AOFFSET], 1 * SIZE + ;; + LDFPD f17, f18 
= [AOFFSET] + adds AOFFSET = 8 * SIZE, AOFFSET + ;; + LDFPD f19, f20 = [AOFFSET] + adds AOFFSET = 9 * SIZE, AOFFSET + ;; + LDFD f21 = [AOFFSET] + adds AOFFSET = -63 * SIZE, AOFFSET + ;; + FMPY f64 = f64, f32 + ;; + FNMA f65 = f64, f33, f65 + ;; + FNMA f66 = f64, f34, f66 + ;; + FNMA f67 = f64, f35, f67 + ;; + FNMA f68 = f64, f36, f68 + ;; + FNMA f69 = f64, f37, f69 + ;; + FNMA f70 = f64, f38, f70 + ;; + FNMA f71 = f64, f39, f71 + ;; + FMPY f65 = f65, f40 + ;; + FNMA f66 = f65, f41, f66 + ;; + FNMA f67 = f65, f42, f67 + ;; + FNMA f68 = f65, f43, f68 + ;; + FNMA f69 = f65, f44, f69 + ;; + FNMA f70 = f65, f45, f70 + ;; + FNMA f71 = f65, f46, f71 + ;; + FMPY f66 = f66, f47 + ;; + FNMA f67 = f66, f48, f67 + ;; + FNMA f68 = f66, f49, f68 + ;; + FNMA f69 = f66, f50, f69 + ;; + FNMA f70 = f66, f51, f70 + ;; + FNMA f71 = f66, f52, f71 + ;; + FMPY f67 = f67, f53 + ;; + FNMA f68 = f67, f54, f68 + ;; + FNMA f69 = f67, f55, f69 + ;; + FNMA f70 = f67, f56, f70 + ;; + FNMA f71 = f67, f57, f71 + ;; + FMPY f68 = f68, f58 + ;; + FNMA f69 = f68, f59, f69 + ;; + FNMA f70 = f68, f60, f70 + ;; + FNMA f71 = f68, f61, f71 + ;; + FMPY f69 = f69, f16 + ;; + FNMA f70 = f69, f17, f70 + ;; + FNMA f71 = f69, f18, f71 + ;; + FMPY f70 = f70, f19 + ;; + FNMA f71 = f70, f20, f71 + ;; + FMPY f71 = f71, f21 + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f68, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f69, SIZE + ;; + STFD [BOFFSET] = f66, SIZE + STFD [BOFFSET2] = f70, SIZE + ;; + STFD [BOFFSET] = f67, -3 * SIZE + STFD [BOFFSET2] = f71, -3 * SIZE + ;; + adds C9 = 4 * SIZE, C1 + ;; +#endif + +#ifdef RN + LDFD f32 = [BOFFSET] + ;; + FMPY f64 = f64, f32 + FMPY f68 = f68, f32 + FMPY f65 = f65, f32 + FMPY f69 = f69, f32 + FMPY f66 = f66, f32 + FMPY f70 = f70, f32 + FMPY f67 = f67, f32 + FMPY f71 = f71, f32 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f68, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f69, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f70, SIZE + ;; + STFD [AOFFSET] = f67, -3 * SIZE + STFD [AOFFSET2] = f71, -3 * SIZE + ;; +#endif + +#ifdef RT + LDFD f32 = [BOFFSET] + ;; + FMPY f64 = f64, f32 + FMPY f68 = f68, f32 + FMPY f65 = f65, f32 + FMPY f69 = f69, f32 + FMPY f66 = f66, f32 + FMPY f70 = f70, f32 + FMPY f67 = f67, f32 + FMPY f71 = f71, f32 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f68, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f69, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f70, SIZE + ;; + STFD [AOFFSET] = f67, -3 * SIZE + STFD [AOFFSET2] = f71, -3 * SIZE + ;; +#endif + adds C9 = 4 * SIZE, C1 + ;; + + { .mmf + STFD [C1 ] = f64, SIZE + STFD [C9 ] = f68, SIZE + mov f64 = f0 + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + STFD [C9 ] = f69, SIZE + } + ;; + { .mmi + STFD [C1 ] = f66, SIZE + STFD [C9 ] = f70, SIZE + } + ;; + { .mmi +#ifndef LN + STFD [C1 ] = f67, 5 * SIZE +#else + STFD [C1 ] = f67, - 3 * SIZE +#endif + STFD [C9 ] = f71 + } + ;; + { .mmf + cmp.ne p6, p0 = 1, I + } + ;; + adds I = -1, I + ;; + { .mmi + shladd r2 = K, BASE_SHIFT, r0 + } + ;; + { .mmi + sub L = K, KK + } + ;; + { .mmi +#ifdef RT + shladd AORIG = r2, 3, AORIG +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd AOFFSET = L, 3, AOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + add BOFFSET = L, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi 
+#ifdef LT + adds KK = 8, KK +#elif defined LN + adds KK = -8, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + + mov f64 = f0 + mov f65 = f0 + mov f66 = f0 + mov f67 = f0 + mov f68 = f0 + mov f69 = f0 + mov f70 = f0 + mov f71 = f0 + + (p6) br.cond.dptk .L132 + .align 8 + + +.L169: + { .mii +#ifdef LN + shladd B = K, BASE_SHIFT, B +#elif defined(LT) || defined(RN) + mov B = BOFFSET +#else + nop __LINE__ +#endif + +#ifdef RN + adds KK = 1, KK +#elif defined RT + adds KK = -1, KK +#else + nop __LINE__ +#endif + mov AOFFSET = A + } + ;; + .align 16 + + +.L999: + mov r8 = r0 + adds r9 = 1 * 16, SP + ;; + ldf.fill f16 = [SP], 32 + ldf.fill f17 = [r9], 32 + ;; + ldf.fill f18 = [SP], 32 + ldf.fill f19 = [r9], 32 + ;; + ldf.fill f20 = [SP], 32 + ldf.fill f21 = [r9], 32 + ;; + mov ar.lc = ARLC + ;; + mov pr = PR, -1 + ;; + mov ar.pfs = ARPFS + ;; + br.ret.sptk.many b0 + EPILOGUE diff --git a/kernel/ia64/trsm_kernel_LT.S b/kernel/ia64/trsm_kernel_LT.S new file mode 100644 index 0000000000..eef4e000c3 --- /dev/null +++ b/kernel/ia64/trsm_kernel_LT.S @@ -0,0 +1,11027 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef DOUBLE +#define PREFETCHSIZE (16 * 8) +#else +#define PREFETCHSIZE (32 * 8) +#endif + +#define CPREFETCHSIZE 7 +#define CPREFETCH lfetch.excl.nt1 + +#define M r32 +#define N r33 +#define K r34 +#define A r36 +#define B r37 +#define C r38 +#define LDC r39 + +#define I r15 +#define J r16 +#define AOFFSET r17 +#define BOFFSET r18 +#define TEMP r19 +#define L r20 + +#define C1 r21 +#define C2 r22 +#define C3 r23 +#define C4 r24 +#define C5 r25 +#define C6 r26 +#define C7 r27 +#define C8 r28 + +#define C9 loc0 +#define C10 loc1 +#define C11 loc2 +#define C12 loc3 +#define C13 loc4 +#define C14 loc5 +#define C15 loc6 +#define C16 loc7 + +#define PREA r8 +#define PREB r9 +#define PREC r10 +#define SP r12 +#define ARLC r29 +#define PR r30 +#define ARPFS r31 + +#define ALPHA f8 + +#define AORIG loc8 +#define KK loc9 +#define KK8 loc10 +#define OFFSET loc11 +#define AOFFSET2 loc12 +#define BOFFSET2 loc13 + + + PROLOGUE + .prologue + PROFCODE + + { .mmi + .save ar.pfs, ARPFS + alloc ARPFS = ar.pfs, 8, 16, 0, 0 + adds r14 = 16, SP + mov ARLC = ar.lc + } + { .mmi + adds r8 = -6 * 16, SP + adds r9 = -5 * 16, SP + adds SP = -6 * 16, SP + } + ;; + { .mmi + ld8 OFFSET = [r14] + mov AOFFSET = A + mov PR = pr + } + ;; + { .mmi + stf.spill [r8] = f16, 32 + stf.spill [r9] = f17, 32 + shr J = N, 3 + } + ;; + { .mmi + stf.spill [r8] = f18, 32 + stf.spill [r9] = f19, 32 + shladd LDC = LDC, BASE_SHIFT, r0 + } + ;; + .body + { .mmi + stf.spill [r8] = f20 + stf.spill [r9] = f21 + cmp.ge p6, p0 = 0, J + } + { .mib + nop __LINE__ +#ifdef RN + sub KK = r0, OFFSET +#else + nop __LINE__ +#endif + (p6) br.cond.dpnt .L050 + } + ;; + .align 8 + +.L010: + { .mfi + adds J = -1, J + mov f64 = f0 + shr I = M, 3 + } + { .mfi + mov C1 = C // coffset1 = c + 0 * ldc + mov f72 = f0 +#ifdef LT + mov KK = OFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmf + cmp.eq p6, p7 = 0, I + mov AOFFSET = A + mov f80 = f0 + } + { .mmf + add C2 = LDC, C // coffset2 = c + 1 * ldc + shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc + mov f88 = f0 + } + ;; + { .mmf + shladd C5 = LDC, 2, C // coffset5 = c + 4 * ldc + shladd C = LDC, 3, C // coffset += 8 * ldc + mov f96 = f0 + } + { .mmf + shladd C4 = LDC, 1, C2 + shladd C6 = LDC, 2, C2 + mov f104 = f0 + } + ;; + { .mfi + shladd C7 = LDC, 2, C3 + mov f112 = f0 + mov L = KK + }{ .mfb + shladd C8 = LDC, 2, C4 + mov f120 = f0 + (p6) br.cond.dpnt .L020 + } + ;; + .align 16 + +.L011: + { .mmf + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + mov f65 = f0 + } + ;; + { .mmf + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + setf.d f73 = r0 + mov f81 = f0 + } + ;; + { .mmf + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + setf.d f119 = r0 + mov f89 = f0 + } + { .mmf + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + setf.d f97 = r0 + mov f105 = f0 + } + ;; + { .mmf + (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + setf.d f113 = r0 + mov f121 = f0 + } + ;; + { .mmf + (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + setf.d f66 = r0 + mov f74 = f0 + } + { .mfi + setf.d f82 = r0 + mov f90 = f0 + nop __LINE__ + } + ;; + { .mmf + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + setf.d f98 = r0 + mov f106 = f0 + } + { .mfi + setf.d f114 = r0 + mov f122 = f0 + adds L = 1, L + } + ;; + { .mmf + (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + setf.d f67 = r0 + mov f75 = f0 + } + { .mfi + setf.d f83 = r0 + mov f91 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mmf + (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + setf.d f99 = 
r0 + mov f107 = f0 + } + { .mfi + setf.d f115 = r0 + mov f123 = f0 + adds PREC = CPREFETCHSIZE * SIZE, C1 + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f68 = r0 + mov f76 = f0 + } + { .mfi + setf.d f84 = r0 + mov f92 = f0 + adds AOFFSET2 = 4 * SIZE, AOFFSET + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f100 = r0 + mov f108 = f0 + } + { .mfi + setf.d f116 = r0 + mov f124 = f0 + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f69 = r0 + mov f77 = f0 + } + { .mfi + setf.d f85 = r0 + mov f93 = f0 + adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f101 = r0 + mov f109 = f0 + } + { .mfi + setf.d f117 = r0 + mov f125 = f0 + tbit.z p12, p0 = L, 0 + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f70 = r0 + mov f78 = f0 + } + { .mfi + setf.d f86 = r0 + mov f94 = f0 + shr L = L, 1 + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f102 = r0 + mov f110 = f0 + } + { .mfi + setf.d f118 = r0 + mov f126 = f0 + adds L = -1, L + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f71 = r0 + mov f79 = f0 + } + { .mfi + setf.d f87 = r0 + mov f95 = f0 + mov ar.lc = L + } + ;; + { .mmf + CPREFETCH [PREC] + setf.d f103 = r0 + mov f111 = f0 + } + { .mfb + cmp.eq p6, p0 = -1, L + mov f127 = f0 + (p6) br.cond.dpnt .L018 + } + ;; + .align 16 + +.L012: +/* 1 */ + { .mfi + lfetch.fault.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + (p12) cmp.ne p3, p0 = 0, L + FMA f72 = f32, f49, f72 // A1 * B2 + nop __LINE__ + } + ;; +/* 2 */ + { .mfb + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + cmp.ne p4, p5 = 0, L + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; +/* 3 */ + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + adds C9 = 4 * SIZE, C1 + FMA f104 = f32, f53, f104 // A1 * B6 + nop __LINE__ + } + ;; +/* 4 */ + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + adds C10 = 4 * SIZE, C2 + FMA f120 = f32, f55, f120 // A1 * B8 + nop __LINE__ + } + ;; +/* 5 */ + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + adds C11 = 4 * SIZE, C3 + FMA f73 = f33, f49, f73 // A2 * B2 + nop __LINE__ + } + ;; +/* 6 */ + { .mfb + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + adds C12 = 4 * SIZE, C4 + FMA f89 = f33, f51, f89 // A2 * B4 + nop __LINE__ + } + ;; +/* 7 */ + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + adds C13 = 4 * SIZE, C5 + FMA f105 = f33, f53, f105 // A2 * B6 + nop __LINE__ + } + ;; +/* 8 */ + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + adds C14 = 4 * SIZE, C6 + FMA f121 = f33, f55, f121 // A2 * B8 + nop __LINE__ + } + ;; +/* 9 */ + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + adds C15 = 4 * SIZE, C7 + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; +/* 10 */ + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + adds C16 = 4 * SIZE, C8 + FMA f90 = f34, f51, f90 // A3 * B4 + nop __LINE__ + } + ;; +/* 11 */ + { .mfb + FMA f98 = f34, f52, f98 // A3 * B5 + nop 
__LINE__ + } + { .mfb + nop __LINE__ + FMA f106 = f34, f53, f106 // A3 * B6 + nop __LINE__ + } + ;; +/* 12 */ + { .mfb + FMA f114 = f34, f54, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f122 = f34, f55, f122 // A3 * B8 + nop __LINE__ + } + ;; +/* 13 */ + { .mfb + nop __LINE__ + FMA f67 = f35, f48, f67 // A4 * B1 + } + { .mfb + nop __LINE__ + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + ;; +/* 14 */ + { .mfb + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f91 = f35, f51, f91 // A4 * B4 + nop __LINE__ + } + ;; +/* 15 */ + { .mfb + FMA f99 = f35, f52, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f107 = f35, f53, f107 // A4 * B6 + nop __LINE__ + } + ;; +/* 16 */ + { .mfb + FMA f115 = f35, f54, f115 // A4 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f123 = f35, f55, f123 // A4 * B8 + nop __LINE__ + } + ;; +/* 17 */ + { .mfb + nop __LINE__ + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f76 = f36, f49, f76 // A5 * B2 + nop __LINE__ + } + ;; +/* 18 */ + { .mfb + nop __LINE__ + FMA f84 = f36, f50, f84 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f92 = f36, f51, f92 // A5 * B4 + nop __LINE__ + } + ;; +/* 19 */ + { .mfb + nop __LINE__ + FMA f100 = f36, f52, f100 // A5 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f108 = f36, f53, f108 // A5 * B6 + nop __LINE__ + } + ;; +/* 20 */ + { .mfb + nop __LINE__ + FMA f116 = f36, f54, f116 // A5 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f124 = f36, f55, f124 // A5 * B8 + nop __LINE__ + } + ;; +/* 21 */ + { .mfb + nop __LINE__ + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f77 = f37, f49, f77 // A6 * B2 + nop __LINE__ + } + ;; +/* 22 */ + { .mfb + nop __LINE__ + FMA f85 = f37, f50, f85 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f93 = f37, f51, f93 // A6 * B4 + nop __LINE__ + } + ;; +/* 23 */ + { .mfb + nop __LINE__ + FMA f101 = f37, f52, f101 // A6 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f109 = f37, f53, f109 // A6 * B6 + nop __LINE__ + } + ;; +/* 24 */ + { .mfb + nop __LINE__ + FMA f117 = f37, f54, f117 // A6 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f125 = f37, f55, f125 // A6 * B8 + nop __LINE__ + } + ;; +/* 25 */ + { .mfb + nop __LINE__ + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f78 = f38, f49, f78 // A7 * B2 + nop __LINE__ + } + ;; +/* 26 */ + { .mfb + nop __LINE__ + FMA f86 = f38, f50, f86 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f94 = f38, f51, f94 // A7 * B4 + nop __LINE__ + } + ;; +/* 27 */ + { .mfb + nop __LINE__ + FMA f102 = f38, f52, f102 // A7 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f110 = f38, f53, f110 // A7 * B6 + nop __LINE__ + } + ;; +/* 28 */ + { .mfb + nop __LINE__ + FMA f118 = f38, f54, f118 // A7 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f126 = f38, f55, f126 // A7 * B8 + nop __LINE__ + } + ;; +/* 29 */ + { .mfb + nop __LINE__ + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f79 = f39, f49, f79 // A8 * B2 + nop __LINE__ + } + ;; +/* 30 */ + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f87 = f39, f50, f87 // A8 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f95 = f39, f51, f95 // A8 * B4 + nop __LINE__ + } + ;; +/* 31 */ + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f103 = f39, f52, f103 // A8 * B5 + nop __LINE__ + } + { .mfb + nop 
__LINE__ + FMA f111 = f39, f53, f111 // A8 * B6 + nop __LINE__ + } + ;; +/* 32 */ + { .mfb + nop __LINE__ + FMA f119 = f39, f54, f119 // A8 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f127 = f39, f55, f127 // A8 * B8 + nop __LINE__ + } + ;; +/* 33 */ + { .mfb + nop __LINE__ + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; +/* 34 */ + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; +/* 35 */ + { .mfb + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f104 = f40, f61, f104 // A1 * B6 + nop __LINE__ + } + ;; +/* 36 */ + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f120 = f40, f63, f120 // A1 * B8 + nop __LINE__ + } + ;; +/* 37 */ + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; +/* 38 */ + { .mfb + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; +/* 39 */ + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f105 = f41, f61, f105 // A2 * B6 + nop __LINE__ + } + ;; +/* 40 */ + { .mfb + nop __LINE__ + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f121 = f41, f63, f121 // A2 * B8 + nop __LINE__ + } + ;; + /* 41 */ + { .mfb + nop __LINE__ + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; +/* 42 */ + { .mfb + nop __LINE__ + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f90 = f42, f59, f90 // A3 * B4 + nop __LINE__ + } + ;; +/* 43 */ + { .mfb + nop __LINE__ + (p3) FMA f98 = f42, f60, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f106 = f42, f61, f106 // A3 * B6 + nop __LINE__ + } + ;; +/* 44 */ + { .mfb + nop __LINE__ + (p3) FMA f114 = f42, f62, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f122 = f42, f63, f122 // A3 * B8 + nop __LINE__ + } + ;; +/* 45 */ + { .mfb + nop __LINE__ + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; +/* 46 */ + { .mfb + nop __LINE__ + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f91 = f43, f59, f91 // A4 * B4 + nop __LINE__ + } + ;; +/* 47 */ + { .mfb + nop __LINE__ + (p3) FMA f99 = f43, f60, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f107 = f43, f61, f107 // A4 * B6 + nop __LINE__ + } + ;; +/* 48 */ + { .mfb + nop __LINE__ + (p3) FMA f115 = f43, f62, f115 // A4 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f123 = f43, f63, f123 // A4 * B8 + nop __LINE__ + } + ;; +/* 49 */ + { .mfb + nop __LINE__ + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f76 = f44, 
f57, f76 // A5 * B2 + nop __LINE__ + } + ;; +/* 50 */ + { .mfb + nop __LINE__ + (p3) FMA f84 = f44, f58, f84 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f92 = f44, f59, f92 // A5 * B4 + nop __LINE__ + } + ;; +/* 51 */ + { .mfb + nop __LINE__ + (p3) FMA f100 = f44, f60, f100 // A5 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f108 = f44, f61, f108 // A5 * B6 + nop __LINE__ + } + ;; +/* 52 */ + { .mfb + nop __LINE__ + (p3) FMA f116 = f44, f62, f116 // A5 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f124 = f44, f63, f124 // A5 * B8 + nop __LINE__ + } + ;; +/* 53 */ + { .mfb + nop __LINE__ + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f77 = f45, f57, f77 // A6 * B2 + nop __LINE__ + } + ;; +/* 54 */ + { .mfb + nop __LINE__ + (p3) FMA f85 = f45, f58, f85 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f93 = f45, f59, f93 // A6 * B4 + nop __LINE__ + } + ;; +/* 55 */ + { .mfb + nop __LINE__ + (p3) FMA f101 = f45, f60, f101 // A6 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f109 = f45, f61, f109 // A6 * B6 + nop __LINE__ + } + ;; +/* 56 */ + { .mfb + nop __LINE__ + (p3) FMA f117 = f45, f62, f117 // A6 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f125 = f45, f63, f125 // A6 * B8 + nop __LINE__ + } + ;; +/* 57 */ + { .mfb + nop __LINE__ + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f78 = f46, f57, f78 // A7 * B2 + nop __LINE__ + } + ;; +/* 58 */ + { .mfb + nop __LINE__ + (p3) FMA f86 = f46, f58, f86 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f94 = f46, f59, f94 // A7 * B4 + nop __LINE__ + } + ;; +/* 59 */ + { .mfb + nop __LINE__ + (p3) FMA f102 = f46, f60, f102 // A7 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f110 = f46, f61, f110 // A7 * B6 + nop __LINE__ + } + ;; +/* 60 */ + { .mfb + nop __LINE__ + (p3) FMA f118 = f46, f62, f118 // A7 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f126 = f46, f63, f126 // A7 * B8 + nop __LINE__ + } + ;; +/* 61 */ + { .mfb + nop __LINE__ + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f79 = f47, f57, f79 // A8 * B2 + nop __LINE__ + } + ;; +/* 62 */ + { .mfb + nop __LINE__ + (p3) FMA f87 = f47, f58, f87 // A8 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f95 = f47, f59, f95 // A8 * B4 + nop __LINE__ + } + ;; +/* 63 */ + { .mfb + nop __LINE__ + (p3) FMA f103 = f47, f60, f103 // A8 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f111 = f47, f61, f111 // A8 * B6 + nop __LINE__ + } + ;; +/* 64 */ + { .mfi + nop __LINE__ + (p3) FMA f119 = f47, f62, f119 // A8 * B7 + adds L = -1, L + } + { .mfb + adds AOFFSET2 = 4 * SIZE, AOFFSET + (p3) FMA f127 = f47, f63, f127 // A8 * B8 + br.cloop.sptk.few .L012 + } + ;; + +.L018: +#ifdef LT + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [BOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [BOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [BOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [BOFFSET], 2 * SIZE + ;; + { .mfi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FSUB f64 = f32, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f72 = f33, f72 + nop __LINE__ + } + ;; + { .mfi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + FSUB f80 = f34, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f88 = f35, f88 + 
nop __LINE__ + } + ;; + { .mfi + LDFPD f52, f53 = [BOFFSET], 2 * SIZE + FSUB f96 = f36, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f104 = f37, f104 + nop __LINE__ + } + ;; + { .mfi + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + FSUB f112 = f38, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f120 = f39, f120 + nop __LINE__ + } + ;; + { .mfi + LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FSUB f65 = f40, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f73 = f41, f73 + nop __LINE__ + } + ;; + { .mfi + LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FSUB f81 = f42, f81 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f89 = f43, f89 + nop __LINE__ + } + ;; + { .mfi + LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FSUB f97 = f44, f97 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f105 = f45, f105 + nop __LINE__ + } + ;; + { .mfi + LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FSUB f113 = f46, f113 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f121 = f47, f121 + nop __LINE__ + } + ;; + { .mfi + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + FSUB f66 = f48, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f74 = f49, f74 + nop __LINE__ + } + ;; + { .mfi + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + FSUB f82 = f50, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f90 = f51, f90 + nop __LINE__ + } + ;; + { .mfi + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + FSUB f98 = f52, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f106 = f53, f106 + nop __LINE__ + } + ;; + { .mfi + LDFPD f38, f39 = [BOFFSET], 2 * SIZE + FSUB f114 = f54, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f122 = f55, f122 + nop __LINE__ + } + ;; + { .mfi + LDFPD f40, f41 = [BOFFSET], 2 * SIZE + FSUB f67 = f56, f67 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f75 = f57, f75 + nop __LINE__ + } + ;; + { .mfi + LDFPD f42, f43 = [BOFFSET], 2 * SIZE + FSUB f83 = f58, f83 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f91 = f59, f91 + nop __LINE__ + } + ;; + { .mfi + LDFPD f44, f45 = [BOFFSET], 2 * SIZE + FSUB f99 = f60, f99 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f107 = f61, f107 + nop __LINE__ + } + ;; + { .mfi + LDFPD f46, f47 = [BOFFSET], 2 * SIZE + FSUB f115 = f62, f115 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f123 = f63, f123 + nop __LINE__ + } + ;; + { .mfi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FSUB f68 = f32, f68 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f76 = f33, f76 + nop __LINE__ + } + ;; + { .mfi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + FSUB f84 = f34, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f92 = f35, f92 + nop __LINE__ + } + ;; + { .mfi + LDFPD f52, f53 = [BOFFSET], 2 * SIZE + FSUB f100 = f36, f100 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f108 = f37, f108 + nop __LINE__ + } + ;; + { .mfi + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + FSUB f116 = f38, f116 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f124 = f39, f124 + nop __LINE__ + } + ;; + { .mfi + LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FSUB f69 = f40, f69 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f77 = f41, f77 + nop __LINE__ + } + ;; + { .mfi + LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FSUB f85 = f42, f85 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f93 = f43, f93 + nop __LINE__ + } + ;; + { .mfi + LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FSUB f101 = f44, f101 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f109 = f45, f109 + nop __LINE__ + } + ;; + { .mfi + LDFPD f62, f63 = [BOFFSET] + FSUB f117 = f46, f117 + adds BOFFSET = -62 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FSUB f125 = f47, f125 + nop 
__LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f70 = f48, f70 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f78 = f49, f78 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f86 = f50, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f94 = f51, f94 + nop __LINE__ + } + ;; + { .mfi + LDFPD f32, f33 = [AOFFSET] + FSUB f102 = f52, f102 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f110 = f53, f110 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f118 = f54, f118 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f126 = f55, f126 + adds AOFFSET = 2 * SIZE, AOFFSET + } + ;; + { .mfi + nop __LINE__ + FSUB f71 = f56, f71 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f79 = f57, f79 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f87 = f58, f87 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f95 = f59, f95 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f103 = f60, f103 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f111 = f61, f111 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f119 = f62, f119 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f127 = f63, f127 + nop __LINE__ + } + ;; + { .mfi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + FMPY f64 = f64, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f96 = f96, f32 + adds BOFFSET2 = 4 * SIZE, BOFFSET + } + ;; + { .mfi + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + FMPY f72 = f72, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f104 = f104, f32 + nop __LINE__ + } + ;; + { .mfi + LDFPD f38, f39 = [AOFFSET] + FMPY f80 = f80, f32 + adds AOFFSET = 3 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMPY f112 = f112, f32 + nop __LINE__ + } + ;; + { .mfi + LDFD f40 = [AOFFSET], 1 * SIZE + FMPY f88 = f88, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f120 = f120, f32 + nop __LINE__ + } + ;; + { .mfi + LDFPD f41, f42 = [AOFFSET], 2 * SIZE + FNMA f65 = f64, f33, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f97 = f96, f33, f97 + nop __LINE__ + } + ;; + { .mfi + LDFPD f43, f44 = [AOFFSET], 2 * SIZE + FNMA f73 = f72, f33, f73 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f105 = f104, f33, f105 + nop __LINE__ + } + ;; + { .mfi + LDFPD f45, f46 = [AOFFSET] + FNMA f81 = f80, f33, f81 + adds AOFFSET = 4 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f113 = f112, f33, f113 + nop __LINE__ + } + ;; + { .mfi + LDFPD f47, f48 = [AOFFSET], 2 * SIZE + FNMA f89 = f88, f33, f89 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f121 = f120, f33, f121 + nop __LINE__ + } + ;; + { .mfi + LDFPD f49, f50 = [AOFFSET], 2 * SIZE + FNMA f66 = f64, f34, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f98 = f96, f34, f98 + nop __LINE__ + } + ;; + { .mfi + LDFPD f51, f52 = [AOFFSET] + FNMA f74 = f72, f34, f74 + adds AOFFSET = 5 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f106 = f104, f34, f106 + nop __LINE__ + } + ;; + { .mfi + LDFD f53 = [AOFFSET], 1 * SIZE + FNMA f82 = f80, f34, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f114 = f112, f34, f114 + nop __LINE__ + } + ;; + { .mfi + LDFPD f54, f55 = [AOFFSET], 2 * SIZE + FNMA f90 = f88, f34, f90 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f122 = f120, f34, f122 + nop __LINE__ + } + ;; + { .mfi + LDFPD f56, f57 = [AOFFSET] + FNMA f67 = f64, f35, f67 + adds AOFFSET = 6 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f99 = f96, f35, f99 + nop __LINE__ + } + ;; + { .mfi + LDFPD f58, f59 = [AOFFSET], 2 * SIZE + FNMA f75 = f72, f35, f75 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f107 = f104, f35, f107 + nop __LINE__ + } + ;; + { .mfi + LDFPD f60, f61 = 
[AOFFSET] + FNMA f83 = f80, f35, f83 + adds AOFFSET = 7 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f115 = f112, f35, f115 + nop __LINE__ + } + ;; + { .mfi + LDFD f16 = [AOFFSET], 1 * SIZE + FNMA f91 = f88, f35, f91 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f123 = f120, f35, f123 + nop __LINE__ + } + ;; + { .mfi + LDFPD f17, f18 = [AOFFSET] + FNMA f68 = f64, f36, f68 + adds AOFFSET = 8 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f100 = f96, f36, f100 + nop __LINE__ + } + ;; + { .mfi + LDFPD f19, f20 = [AOFFSET] + FNMA f76 = f72, f36, f76 + adds AOFFSET = 9 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f108 = f104, f36, f108 + nop __LINE__ + } + ;; + { .mfi + LDFD f21 = [AOFFSET] + FNMA f84 = f80, f36, f84 + adds AOFFSET = -63 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f116 = f112, f36, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f92 = f88, f36, f92 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f124 = f120, f36, f124 + nop __LINE__ + } + ;; + FNMA f69 = f64, f37, f69 + FNMA f101 = f96, f37, f101 + FNMA f77 = f72, f37, f77 + FNMA f109 = f104, f37, f109 + FNMA f85 = f80, f37, f85 + FNMA f117 = f112, f37, f117 + FNMA f93 = f88, f37, f93 + FNMA f125 = f120, f37, f125 + ;; + FNMA f70 = f64, f38, f70 + FNMA f102 = f96, f38, f102 + FNMA f78 = f72, f38, f78 + FNMA f110 = f104, f38, f110 + FNMA f86 = f80, f38, f86 + FNMA f118 = f112, f38, f118 + FNMA f94 = f88, f38, f94 + FNMA f126 = f120, f38, f126 + ;; + FNMA f71 = f64, f39, f71 + FNMA f103 = f96, f39, f103 + FNMA f79 = f72, f39, f79 + FNMA f111 = f104, f39, f111 + FNMA f87 = f80, f39, f87 + FNMA f119 = f112, f39, f119 + FNMA f95 = f88, f39, f95 + FNMA f127 = f120, f39, f127 + ;; + FMPY f65 = f65, f40 + FMPY f97 = f97, f40 + FMPY f73 = f73, f40 + FMPY f105 = f105, f40 + FMPY f81 = f81, f40 + FMPY f113 = f113, f40 + FMPY f89 = f89, f40 + FMPY f121 = f121, f40 + ;; + FNMA f66 = f65, f41, f66 + FNMA f98 = f97, f41, f98 + FNMA f74 = f73, f41, f74 + FNMA f106 = f105, f41, f106 + FNMA f82 = f81, f41, f82 + FNMA f114 = f113, f41, f114 + FNMA f90 = f89, f41, f90 + FNMA f122 = f121, f41, f122 + FNMA f67 = f65, f42, f67 + FNMA f99 = f97, f42, f99 + FNMA f75 = f73, f42, f75 + FNMA f107 = f105, f42, f107 + FNMA f83 = f81, f42, f83 + FNMA f115 = f113, f42, f115 + FNMA f91 = f89, f42, f91 + FNMA f123 = f121, f42, f123 + ;; + FNMA f68 = f65, f43, f68 + FNMA f100 = f97, f43, f100 + FNMA f76 = f73, f43, f76 + FNMA f108 = f105, f43, f108 + FNMA f84 = f81, f43, f84 + FNMA f116 = f113, f43, f116 + FNMA f92 = f89, f43, f92 + FNMA f124 = f121, f43, f124 + ;; + FNMA f69 = f65, f44, f69 + FNMA f101 = f97, f44, f101 + FNMA f77 = f73, f44, f77 + FNMA f109 = f105, f44, f109 + FNMA f85 = f81, f44, f85 + FNMA f117 = f113, f44, f117 + FNMA f93 = f89, f44, f93 + FNMA f125 = f121, f44, f125 + ;; + FNMA f70 = f65, f45, f70 + FNMA f102 = f97, f45, f102 + FNMA f78 = f73, f45, f78 + FNMA f110 = f105, f45, f110 + FNMA f86 = f81, f45, f86 + FNMA f118 = f113, f45, f118 + FNMA f94 = f89, f45, f94 + FNMA f126 = f121, f45, f126 + ;; + FNMA f71 = f65, f46, f71 + FNMA f103 = f97, f46, f103 + FNMA f79 = f73, f46, f79 + FNMA f111 = f105, f46, f111 + FNMA f87 = f81, f46, f87 + FNMA f119 = f113, f46, f119 + FNMA f95 = f89, f46, f95 + FNMA f127 = f121, f46, f127 + ;; + FMPY f66 = f66, f47 + FMPY f98 = f98, f47 + FMPY f74 = f74, f47 + FMPY f106 = f106, f47 + FMPY f82 = f82, f47 + FMPY f114 = f114, f47 + FMPY f90 = f90, f47 + FMPY f122 = f122, f47 + ;; + FNMA f67 = f66, f48, f67 + FNMA f99 = f98, f48, f99 + FNMA f75 = f74, f48, f75 + FNMA f107 
= f106, f48, f107 + FNMA f83 = f82, f48, f83 + FNMA f115 = f114, f48, f115 + FNMA f91 = f90, f48, f91 + FNMA f123 = f122, f48, f123 + FNMA f68 = f66, f49, f68 + FNMA f100 = f98, f49, f100 + FNMA f76 = f74, f49, f76 + FNMA f108 = f106, f49, f108 + FNMA f84 = f82, f49, f84 + FNMA f116 = f114, f49, f116 + FNMA f92 = f90, f49, f92 + FNMA f124 = f122, f49, f124 + ;; + FNMA f69 = f66, f50, f69 + FNMA f101 = f98, f50, f101 + FNMA f77 = f74, f50, f77 + FNMA f109 = f106, f50, f109 + FNMA f85 = f82, f50, f85 + FNMA f117 = f114, f50, f117 + FNMA f93 = f90, f50, f93 + FNMA f125 = f122, f50, f125 + ;; + FNMA f70 = f66, f51, f70 + FNMA f102 = f98, f51, f102 + FNMA f78 = f74, f51, f78 + FNMA f110 = f106, f51, f110 + FNMA f86 = f82, f51, f86 + FNMA f118 = f114, f51, f118 + FNMA f94 = f90, f51, f94 + FNMA f126 = f122, f51, f126 + ;; + FNMA f71 = f66, f52, f71 + FNMA f103 = f98, f52, f103 + FNMA f79 = f74, f52, f79 + FNMA f111 = f106, f52, f111 + FNMA f87 = f82, f52, f87 + FNMA f119 = f114, f52, f119 + FNMA f95 = f90, f52, f95 + FNMA f127 = f122, f52, f127 + ;; + FMPY f67 = f67, f53 + FMPY f99 = f99, f53 + FMPY f75 = f75, f53 + FMPY f107 = f107, f53 + FMPY f83 = f83, f53 + FMPY f115 = f115, f53 + FMPY f91 = f91, f53 + FMPY f123 = f123, f53 + ;; + FNMA f68 = f67, f54, f68 + FNMA f100 = f99, f54, f100 + FNMA f76 = f75, f54, f76 + FNMA f108 = f107, f54, f108 + FNMA f84 = f83, f54, f84 + FNMA f116 = f115, f54, f116 + FNMA f92 = f91, f54, f92 + FNMA f124 = f123, f54, f124 + ;; + FNMA f69 = f67, f55, f69 + FNMA f101 = f99, f55, f101 + FNMA f77 = f75, f55, f77 + FNMA f109 = f107, f55, f109 + FNMA f85 = f83, f55, f85 + FNMA f117 = f115, f55, f117 + FNMA f93 = f91, f55, f93 + FNMA f125 = f123, f55, f125 + ;; + FNMA f70 = f67, f56, f70 + FNMA f102 = f99, f56, f102 + FNMA f78 = f75, f56, f78 + FNMA f110 = f107, f56, f110 + FNMA f86 = f83, f56, f86 + FNMA f118 = f115, f56, f118 + FNMA f94 = f91, f56, f94 + FNMA f126 = f123, f56, f126 + ;; + FNMA f71 = f67, f57, f71 + FNMA f103 = f99, f57, f103 + FNMA f79 = f75, f57, f79 + FNMA f111 = f107, f57, f111 + FNMA f87 = f83, f57, f87 + FNMA f119 = f115, f57, f119 + FNMA f95 = f91, f57, f95 + FNMA f127 = f123, f57, f127 + ;; + FMPY f68 = f68, f58 + FMPY f100 = f100, f58 + FMPY f76 = f76, f58 + FMPY f108 = f108, f58 + FMPY f84 = f84, f58 + FMPY f116 = f116, f58 + FMPY f92 = f92, f58 + FMPY f124 = f124, f58 + ;; + FNMA f69 = f68, f59, f69 + FNMA f101 = f100, f59, f101 + FNMA f77 = f76, f59, f77 + FNMA f109 = f108, f59, f109 + FNMA f85 = f84, f59, f85 + FNMA f117 = f116, f59, f117 + FNMA f93 = f92, f59, f93 + FNMA f125 = f124, f59, f125 + ;; + FNMA f70 = f68, f60, f70 + FNMA f102 = f100, f60, f102 + FNMA f78 = f76, f60, f78 + FNMA f110 = f108, f60, f110 + FNMA f86 = f84, f60, f86 + FNMA f118 = f116, f60, f118 + FNMA f94 = f92, f60, f94 + FNMA f126 = f124, f60, f126 + ;; + { .mfi + STFD [BOFFSET] = f64, SIZE + FNMA f71 = f68, f61, f71 + } + { .mfi + STFD [BOFFSET2] = f96, SIZE + FNMA f103 = f100, f61, f103 + } + ;; + { .mfi + STFD [BOFFSET] = f72, SIZE + FNMA f79 = f76, f61, f79 + } + { .mfi + STFD [BOFFSET2] = f104, SIZE + FNMA f111 = f108, f61, f111 + } + ;; + { .mfi + STFD [BOFFSET] = f80, SIZE + FNMA f87 = f84, f61, f87 + } + { .mfi + STFD [BOFFSET2] = f112, SIZE + FNMA f119 = f116, f61, f119 + } + ;; + { .mfi + STFD [BOFFSET] = f88, 5 * SIZE + FNMA f95 = f92, f61, f95 + } + { .mfi + STFD [BOFFSET2] = f120, 5 * SIZE + FNMA f127 = f124, f61, f127 + } + ;; + { .mfi + STFD [BOFFSET] = f65, SIZE + FMPY f69 = f69, f16 + } + { .mfi + STFD [BOFFSET2] = f97, SIZE + FMPY f101 = f101, 
f16 + } + ;; + { .mfi + STFD [BOFFSET] = f73, SIZE + FMPY f77 = f77, f16 + } + { .mfi + STFD [BOFFSET2] = f105, SIZE + FMPY f109 = f109, f16 + } + ;; + { .mfi + STFD [BOFFSET] = f81, SIZE + FMPY f85 = f85, f16 + } + { .mfi + STFD [BOFFSET2] = f113, SIZE + FMPY f117 = f117, f16 + } + ;; + { .mfi + STFD [BOFFSET] = f89, 5 * SIZE + FMPY f93 = f93, f16 + } + { .mfi + STFD [BOFFSET2] = f121, 5 * SIZE + FMPY f125 = f125, f16 + } + ;; + { .mfi + STFD [BOFFSET] = f66, SIZE + FNMA f70 = f69, f17, f70 + } + { .mfi + STFD [BOFFSET2] = f98, SIZE + FNMA f102 = f101, f17, f102 + } + ;; + { .mfi + STFD [BOFFSET] = f74, SIZE + FNMA f78 = f77, f17, f78 + } + { .mfi + STFD [BOFFSET2] = f106, SIZE + FNMA f110 = f109, f17, f110 + } + ;; + { .mfi + STFD [BOFFSET] = f82, SIZE + FNMA f86 = f85, f17, f86 + } + { .mfi + STFD [BOFFSET2] = f114, SIZE + FNMA f118 = f117, f17, f118 + } + ;; + { .mfi + STFD [BOFFSET] = f90, 5 * SIZE + FNMA f94 = f93, f17, f94 + } + { .mfi + STFD [BOFFSET2] = f122, 5 * SIZE + FNMA f126 = f125, f17, f126 + } + ;; + { .mfi + STFD [BOFFSET] = f67, SIZE + FNMA f71 = f69, f18, f71 + } + { .mfi + STFD [BOFFSET2] = f99, SIZE + FNMA f103 = f101, f18, f103 + } + ;; + { .mfi + STFD [BOFFSET] = f75, SIZE + FNMA f79 = f77, f18, f79 + } + { .mfi + STFD [BOFFSET2] = f107, SIZE + FNMA f111 = f109, f18, f111 + } + ;; + { .mfi + STFD [BOFFSET] = f83, SIZE + FNMA f87 = f85, f18, f87 + } + { .mfi + STFD [BOFFSET2] = f115, SIZE + FNMA f119 = f117, f18, f119 + } + ;; + { .mfi + STFD [BOFFSET] = f91, 5 * SIZE + FNMA f95 = f93, f18, f95 + } + { .mfi + STFD [BOFFSET2] = f123, 5 * SIZE + FNMA f127 = f125, f18, f127 + } + ;; + { .mfi + STFD [BOFFSET] = f68, SIZE + FMPY f70 = f70, f19 + } + { .mfi + STFD [BOFFSET2] = f100, SIZE + FMPY f102 = f102, f19 + } + ;; + { .mfi + STFD [BOFFSET] = f76, SIZE + FMPY f78 = f78, f19 + } + { .mfi + STFD [BOFFSET2] = f108, SIZE + FMPY f110 = f110, f19 + } + ;; + { .mfi + STFD [BOFFSET] = f84, SIZE + FMPY f86 = f86, f19 + } + { .mfi + STFD [BOFFSET2] = f116, SIZE + FMPY f118 = f118, f19 + } + ;; + { .mfi + STFD [BOFFSET] = f92, 5 * SIZE + FMPY f94 = f94, f19 + } + { .mfi + STFD [BOFFSET2] = f124, 5 * SIZE + FMPY f126 = f126, f19 + } + ;; + { .mfi + STFD [BOFFSET] = f69, SIZE + FNMA f71 = f70, f20, f71 + } + { .mfi + STFD [BOFFSET2] = f101, SIZE + FNMA f103 = f102, f20, f103 + } + ;; + { .mfi + STFD [BOFFSET] = f77, SIZE + FNMA f79 = f78, f20, f79 + } + { .mfi + STFD [BOFFSET2] = f109, SIZE + FNMA f111 = f110, f20, f111 + } + ;; + { .mfi + STFD [BOFFSET] = f85, SIZE + FNMA f87 = f86, f20, f87 + } + { .mfi + STFD [BOFFSET2] = f117, SIZE + FNMA f119 = f118, f20, f119 + } + ;; + { .mfi + STFD [BOFFSET] = f93, 5 * SIZE + FNMA f95 = f94, f20, f95 + } + { .mfi + STFD [BOFFSET2] = f125, 5 * SIZE + FNMA f127 = f126, f20, f127 + } + ;; + { .mfi + STFD [BOFFSET] = f70, SIZE + FMPY f71 = f71, f21 + } + { .mfi + STFD [BOFFSET2] = f102, SIZE + FMPY f103 = f103, f21 + } + ;; + { .mfi + STFD [BOFFSET] = f78, SIZE + FMPY f79 = f79, f21 + } + { .mfi + STFD [BOFFSET2] = f110, SIZE + FMPY f111 = f111, f21 + } + ;; + { .mfi + STFD [BOFFSET] = f86, SIZE + FMPY f87 = f87, f21 + } + { .mfi + STFD [BOFFSET2] = f118, SIZE + FMPY f119 = f119, f21 + } + ;; + { .mfi + STFD [BOFFSET] = f94, 5 * SIZE + FMPY f95 = f95, f21 + } + { .mfi + STFD [BOFFSET2] = f126, 5 * SIZE + FMPY f127 = f127, f21 + } + ;; + { .mmi + STFD [BOFFSET] = f71, SIZE + STFD [BOFFSET2] = f103, SIZE + } + ;; + { .mmi + STFD [BOFFSET] = f79, SIZE + STFD [BOFFSET2] = f111, SIZE + } + ;; + { .mmi + STFD [BOFFSET] = f87, SIZE + STFD [BOFFSET2] 
= f119, SIZE + adds C9 = 4 * SIZE, C1 + } + ;; + { .mfi + STFD [BOFFSET] = f95 + adds BOFFSET = - 59 * SIZE, BOFFSET + } + { .mfi + STFD [BOFFSET2] = f127 + adds BOFFSET2 = - 59 * SIZE, BOFFSET2 + } + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [AOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [AOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [AOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [AOFFSET], 2 * SIZE + ;; + { .mfi + LDFPD f48, f49 = [AOFFSET], 2 * SIZE + FSUB f64 = f32, f64 + } + { .mfi + FSUB f65 = f33, f65 + } + ;; + { .mfi + LDFPD f50, f51 = [AOFFSET], 2 * SIZE + FSUB f66 = f34, f66 + } + { .mfi + FSUB f67 = f35, f67 + } + ;; + { .mfi + LDFPD f52, f53 = [AOFFSET], 2 * SIZE + FSUB f68 = f36, f68 + } + { .mfi + FSUB f69 = f37, f69 + } + ;; + { .mfi + LDFPD f54, f55 = [AOFFSET], 2 * SIZE + FSUB f70 = f38, f70 + } + { .mfi + FSUB f71 = f39, f71 + } + ;; + { .mfi + LDFPD f56, f57 = [AOFFSET], 2 * SIZE + FSUB f72 = f40, f72 + } + { .mfi + FSUB f73 = f41, f73 + } + ;; + { .mfi + LDFPD f58, f59 = [AOFFSET], 2 * SIZE + FSUB f74 = f42, f74 + } + { .mfi + FSUB f75 = f43, f75 + } + ;; + { .mfi + LDFPD f60, f61 = [AOFFSET], 2 * SIZE + FSUB f76 = f44, f76 + } + { .mfi + FSUB f77 = f45, f77 + } + ;; + { .mfi + LDFPD f62, f63 = [AOFFSET], 2 * SIZE + FSUB f78 = f46, f78 + } + { .mfi + FSUB f79 = f47, f79 + } + ;; + { .mfi + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FSUB f80 = f48, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f81 = f49, f81 + nop __LINE__ + } + ;; + { .mfi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + FSUB f82 = f50, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f83 = f51, f83 + nop __LINE__ + } + ;; + { .mfi + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + FSUB f84 = f52, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f85 = f53, f85 + nop __LINE__ + } + ;; + { .mfi + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + FSUB f86 = f54, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f87 = f55, f87 + nop __LINE__ + } + ;; + { .mfi + LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FSUB f88 = f56, f88 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f89 = f57, f89 + nop __LINE__ + } + ;; + { .mfi + LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FSUB f90 = f58, f90 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f91 = f59, f91 + nop __LINE__ + } + ;; + { .mfi + LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FSUB f92 = f60, f92 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f93 = f61, f93 + nop __LINE__ + } + ;; + { .mfi + LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FSUB f94 = f62, f94 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f95 = f63, f95 + nop __LINE__ + } + ;; + { .mfi + LDFPD f48, f49 = [AOFFSET], 2 * SIZE + FSUB f96 = f32, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f97 = f33, f97 + nop __LINE__ + } + ;; + { .mfi + LDFPD f50, f51 = [AOFFSET], 2 * SIZE + FSUB f98 = f34, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f99 = f35, f99 + nop __LINE__ + } + ;; + { .mfi + LDFPD f52, f53 = [AOFFSET], 2 * SIZE + FSUB f100 = f36, f100 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f101 = f37, f101 + nop __LINE__ + } + ;; + { .mfi + LDFPD f54, f55 = [AOFFSET], 2 * SIZE + FSUB f102 = f38, f102 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f103 = f39, f103 + nop __LINE__ + } + ;; + { .mfi + LDFPD f56, f57 = [AOFFSET], 2 * SIZE + FSUB f104 = f40, f104 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f105 = f41, f105 + nop 
__LINE__ + } + ;; + { .mfi + LDFPD f58, f59 = [AOFFSET], 2 * SIZE + FSUB f106 = f42, f106 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f107 = f43, f107 + nop __LINE__ + } + ;; + { .mfi + LDFPD f60, f61 = [AOFFSET], 2 * SIZE + FSUB f108 = f44, f108 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f109 = f45, f109 + nop __LINE__ + } + ;; + { .mfi + LDFPD f62, f63 = [AOFFSET] + FSUB f110 = f46, f110 + adds AOFFSET = -62 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FSUB f111 = f47, f111 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f112 = f48, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f113 = f49, f113 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f114 = f50, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f115 = f51, f115 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f116 = f52, f116 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f117 = f53, f117 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f118 = f54, f118 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f119 = f55, f119 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f120 = f56, f120 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f121 = f57, f121 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f122 = f58, f122 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f123 = f59, f123 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f124 = f60, f124 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f125 = f61, f125 + nop __LINE__ + } + ;; + { .mfi + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + FSUB f126 = f62, f126 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f127 = f63, f127 + nop __LINE__ + } + ;; + { .mfi + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + FMPY f64 = f64, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f68 = f68, f32 + nop __LINE__ + } + ;; + { .mfi + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + FMPY f65 = f65, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f69 = f69, f32 + nop __LINE__ + } + ;; + { .mfi + LDFPD f38, f39 = [BOFFSET] + FMPY f66 = f66, f32 + adds BOFFSET = 3 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMPY f70 = f70, f32 + nop __LINE__ + } + ;; + { .mfi + LDFD f40 = [BOFFSET], 1 * SIZE + FMPY f67 = f67, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f71 = f71, f32 + nop __LINE__ + } + ;; + { .mfi + LDFPD f41, f42 = [BOFFSET], 2 * SIZE + FNMA f72 = f64, f33, f72 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f76 = f68, f33, f76 + nop __LINE__ + } + ;; + { .mfi + LDFPD f43, f44 = [BOFFSET], 2 * SIZE + FNMA f73 = f65, f33, f73 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f77 = f69, f33, f77 + nop __LINE__ + } + ;; + { .mfi + LDFPD f45, f46 = [BOFFSET] + FNMA f74 = f66, f33, f74 + adds BOFFSET = 4 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FNMA f78 = f70, f33, f78 + nop __LINE__ + } + ;; + { .mfi + LDFPD f47, f48 = [BOFFSET], 2 * SIZE + FNMA f75 = f67, f33, f75 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f79 = f71, f33, f79 + nop __LINE__ + } + ;; + { .mfi + LDFPD f49, f50 = [BOFFSET], 2 * SIZE + FNMA f80 = f64, f34, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f84 = f68, f34, f84 + nop __LINE__ + } + ;; + { .mfi + LDFPD f51, f52 = [BOFFSET] + FNMA f81 = f65, f34, f81 + adds BOFFSET = 5 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FNMA f85 = f69, f34, f85 + nop __LINE__ + } + ;; + { .mfi + LDFD f53 = [BOFFSET], 1 * SIZE + FNMA f82 = f66, f34, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f86 = f70, f34, f86 + nop __LINE__ + } + ;; + { .mfi + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + FNMA f83 = f67, f34, f83 + nop __LINE__ + 
} + { .mfi + nop __LINE__ + FNMA f87 = f71, f34, f87 + nop __LINE__ + } + ;; + { .mfi + LDFPD f56, f57 = [BOFFSET] + FNMA f88 = f64, f35, f88 + adds BOFFSET = 6 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FNMA f92 = f68, f35, f92 + nop __LINE__ + } + ;; + { .mfi + LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FNMA f89 = f65, f35, f89 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f93 = f69, f35, f93 + nop __LINE__ + } + ;; + { .mfi + LDFPD f60, f61 = [BOFFSET] + FNMA f90 = f66, f35, f90 + adds BOFFSET = 7 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FNMA f94 = f70, f35, f94 + nop __LINE__ + } + ;; + { .mfi + LDFD f16 = [BOFFSET], 1 * SIZE + FNMA f91 = f67, f35, f91 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f95 = f71, f35, f95 + nop __LINE__ + } + ;; + { .mfi + LDFPD f17, f18 = [BOFFSET] + FNMA f96 = f64, f36, f96 + adds BOFFSET = 8 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FNMA f100 = f68, f36, f100 + nop __LINE__ + } + ;; + { .mfi + LDFPD f19, f20 = [BOFFSET] + FNMA f97 = f65, f36, f97 + adds BOFFSET = 9 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FNMA f101 = f69, f36, f101 + nop __LINE__ + } + ;; + { .mfi + LDFD f21 = [BOFFSET] + FNMA f98 = f66, f36, f98 + adds BOFFSET = -63 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FNMA f102 = f70, f36, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f99 = f67, f36, f99 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f103 = f71, f36, f103 + nop __LINE__ + } + ;; + FNMA f104 = f64, f37, f104 + FNMA f108 = f68, f37, f108 + FNMA f105 = f65, f37, f105 + FNMA f109 = f69, f37, f109 + FNMA f106 = f66, f37, f106 + FNMA f110 = f70, f37, f110 + FNMA f107 = f67, f37, f107 + FNMA f111 = f71, f37, f111 + ;; + FNMA f112 = f64, f38, f112 + FNMA f116 = f68, f38, f116 + FNMA f113 = f65, f38, f113 + FNMA f117 = f69, f38, f117 + FNMA f114 = f66, f38, f114 + FNMA f118 = f70, f38, f118 + FNMA f115 = f67, f38, f115 + FNMA f119 = f71, f38, f119 + ;; + FNMA f120 = f64, f39, f120 + FNMA f124 = f68, f39, f124 + FNMA f121 = f65, f39, f121 + FNMA f125 = f69, f39, f125 + FNMA f122 = f66, f39, f122 + FNMA f126 = f70, f39, f126 + FNMA f123 = f67, f39, f123 + FNMA f127 = f71, f39, f127 + ;; + FMPY f72 = f72, f40 + FMPY f76 = f76, f40 + FMPY f73 = f73, f40 + FMPY f77 = f77, f40 + FMPY f74 = f74, f40 + FMPY f78 = f78, f40 + FMPY f75 = f75, f40 + FMPY f79 = f79, f40 + ;; + FNMA f80 = f72, f41, f80 + FNMA f84 = f76, f41, f84 + FNMA f81 = f73, f41, f81 + FNMA f85 = f77, f41, f85 + FNMA f82 = f74, f41, f82 + FNMA f86 = f78, f41, f86 + FNMA f83 = f75, f41, f83 + FNMA f87 = f79, f41, f87 + ;; + FNMA f88 = f72, f42, f88 + FNMA f92 = f76, f42, f92 + FNMA f89 = f73, f42, f89 + FNMA f93 = f77, f42, f93 + FNMA f90 = f74, f42, f90 + FNMA f94 = f78, f42, f94 + FNMA f91 = f75, f42, f91 + FNMA f95 = f79, f42, f95 + ;; + FNMA f96 = f72, f43, f96 + FNMA f100 = f76, f43, f100 + FNMA f97 = f73, f43, f97 + FNMA f101 = f77, f43, f101 + FNMA f98 = f74, f43, f98 + FNMA f102 = f78, f43, f102 + FNMA f99 = f75, f43, f99 + FNMA f103 = f79, f43, f103 + ;; + FNMA f104 = f72, f44, f104 + FNMA f108 = f76, f44, f108 + FNMA f105 = f73, f44, f105 + FNMA f109 = f77, f44, f109 + FNMA f106 = f74, f44, f106 + FNMA f110 = f78, f44, f110 + FNMA f107 = f75, f44, f107 + FNMA f111 = f79, f44, f111 + ;; + FNMA f112 = f72, f45, f112 + FNMA f116 = f76, f45, f116 + FNMA f113 = f73, f45, f113 + FNMA f117 = f77, f45, f117 + FNMA f114 = f74, f45, f114 + FNMA f118 = f78, f45, f118 + FNMA f115 = f75, f45, f115 + FNMA f119 = f79, f45, f119 + ;; + FNMA f120 = f72, f46, f120 + FNMA f124 = f76, f46, f124 + FNMA 
f121 = f73, f46, f121 + FNMA f125 = f77, f46, f125 + FNMA f122 = f74, f46, f122 + FNMA f126 = f78, f46, f126 + FNMA f123 = f75, f46, f123 + FNMA f127 = f79, f46, f127 + ;; + FMPY f80 = f80, f47 + FMPY f84 = f84, f47 + FMPY f81 = f81, f47 + FMPY f85 = f85, f47 + FMPY f82 = f82, f47 + FMPY f86 = f86, f47 + FMPY f83 = f83, f47 + FMPY f87 = f87, f47 + ;; + FNMA f88 = f80, f48, f88 + FNMA f92 = f84, f48, f92 + FNMA f89 = f81, f48, f89 + FNMA f93 = f85, f48, f93 + FNMA f90 = f82, f48, f90 + FNMA f94 = f86, f48, f94 + FNMA f91 = f83, f48, f91 + FNMA f95 = f87, f48, f95 + ;; + FNMA f96 = f80, f49, f96 + FNMA f100 = f84, f49, f100 + FNMA f97 = f81, f49, f97 + FNMA f101 = f85, f49, f101 + FNMA f98 = f82, f49, f98 + FNMA f102 = f86, f49, f102 + FNMA f99 = f83, f49, f99 + FNMA f103 = f87, f49, f103 + ;; + FNMA f104 = f80, f50, f104 + FNMA f108 = f84, f50, f108 + FNMA f105 = f81, f50, f105 + FNMA f109 = f85, f50, f109 + FNMA f106 = f82, f50, f106 + FNMA f110 = f86, f50, f110 + FNMA f107 = f83, f50, f107 + FNMA f111 = f87, f50, f111 + ;; + FNMA f112 = f80, f51, f112 + FNMA f116 = f84, f51, f116 + FNMA f113 = f81, f51, f113 + FNMA f117 = f85, f51, f117 + FNMA f114 = f82, f51, f114 + FNMA f118 = f86, f51, f118 + FNMA f115 = f83, f51, f115 + FNMA f119 = f87, f51, f119 + ;; + FNMA f120 = f80, f52, f120 + FNMA f124 = f84, f52, f124 + FNMA f121 = f81, f52, f121 + FNMA f125 = f85, f52, f125 + FNMA f122 = f82, f52, f122 + FNMA f126 = f86, f52, f126 + FNMA f123 = f83, f52, f123 + FNMA f127 = f87, f52, f127 + ;; + FMPY f88 = f88, f53 + FMPY f92 = f92, f53 + FMPY f89 = f89, f53 + FMPY f93 = f93, f53 + FMPY f90 = f90, f53 + FMPY f94 = f94, f53 + FMPY f91 = f91, f53 + FMPY f95 = f95, f53 + ;; + FNMA f96 = f88, f54, f96 + FNMA f100 = f92, f54, f100 + FNMA f97 = f89, f54, f97 + FNMA f101 = f93, f54, f101 + FNMA f98 = f90, f54, f98 + FNMA f102 = f94, f54, f102 + FNMA f99 = f91, f54, f99 + FNMA f103 = f95, f54, f103 + ;; + FNMA f104 = f88, f55, f104 + FNMA f108 = f92, f55, f108 + FNMA f105 = f89, f55, f105 + FNMA f109 = f93, f55, f109 + FNMA f106 = f90, f55, f106 + FNMA f110 = f94, f55, f110 + FNMA f107 = f91, f55, f107 + FNMA f111 = f95, f55, f111 + ;; + FNMA f112 = f88, f56, f112 + FNMA f116 = f92, f56, f116 + FNMA f113 = f89, f56, f113 + FNMA f117 = f93, f56, f117 + FNMA f114 = f90, f56, f114 + FNMA f118 = f94, f56, f118 + FNMA f115 = f91, f56, f115 + FNMA f119 = f95, f56, f119 + ;; + FNMA f120 = f88, f57, f120 + FNMA f124 = f92, f57, f124 + FNMA f121 = f89, f57, f121 + FNMA f125 = f93, f57, f125 + FNMA f122 = f90, f57, f122 + FNMA f126 = f94, f57, f126 + FNMA f123 = f91, f57, f123 + FNMA f127 = f95, f57, f127 + ;; + FMPY f96 = f96, f58 + FMPY f100 = f100, f58 + FMPY f97 = f97, f58 + FMPY f101 = f101, f58 + FMPY f98 = f98, f58 + FMPY f102 = f102, f58 + FMPY f99 = f99, f58 + FMPY f103 = f103, f58 + ;; + FNMA f104 = f96, f59, f104 + FNMA f108 = f100, f59, f108 + FNMA f105 = f97, f59, f105 + FNMA f109 = f101, f59, f109 + FNMA f106 = f98, f59, f106 + FNMA f110 = f102, f59, f110 + FNMA f107 = f99, f59, f107 + FNMA f111 = f103, f59, f111 + ;; + FNMA f112 = f96, f60, f112 + FNMA f116 = f100, f60, f116 + FNMA f113 = f97, f60, f113 + FNMA f117 = f101, f60, f117 + FNMA f114 = f98, f60, f114 + FNMA f118 = f102, f60, f118 + FNMA f115 = f99, f60, f115 + FNMA f119 = f103, f60, f119 + ;; + { .mfi + STFD [AOFFSET] = f64, SIZE + FNMA f120 = f96, f61, f120 + } + { .mfi + STFD [AOFFSET2] = f68, SIZE + FNMA f124 = f100, f61, f124 + } + ;; + { .mfi + STFD [AOFFSET] = f65, SIZE + FNMA f121 = f97, f61, f121 + } + { .mfi + STFD [AOFFSET2] = 
f69, SIZE + FNMA f125 = f101, f61, f125 + } + ;; + { .mfi + STFD [AOFFSET] = f66, SIZE + FNMA f122 = f98, f61, f122 + } + { .mfi + STFD [AOFFSET2] = f70, SIZE + FNMA f126 = f102, f61, f126 + } + ;; + { .mfi + STFD [AOFFSET] = f67, 5 * SIZE + FNMA f123 = f99, f61, f123 + } + { .mfi + STFD [AOFFSET2] = f71, 5 * SIZE + FNMA f127 = f103, f61, f127 + } + ;; + { .mfi + STFD [AOFFSET] = f72, SIZE + FMPY f104 = f104, f16 + } + { .mfi + STFD [AOFFSET2] = f76, SIZE + FMPY f108 = f108, f16 + } + ;; + { .mfi + STFD [AOFFSET] = f73, SIZE + FMPY f105 = f105, f16 + } + { .mfi + STFD [AOFFSET2] = f77, SIZE + FMPY f109 = f109, f16 + } + ;; + { .mfi + STFD [AOFFSET] = f74, SIZE + FMPY f106 = f106, f16 + } + { .mfi + STFD [AOFFSET2] = f78, SIZE + FMPY f110 = f110, f16 + } + ;; + { .mfi + STFD [AOFFSET] = f75, 5 * SIZE + FMPY f107 = f107, f16 + } + { .mfi + STFD [AOFFSET2] = f79, 5 * SIZE + FMPY f111 = f111, f16 + } + ;; + { .mfi + STFD [AOFFSET] = f80, SIZE + FNMA f112 = f104, f17, f112 + } + { .mfi + STFD [AOFFSET2] = f84, SIZE + FNMA f116 = f108, f17, f116 + } + ;; + { .mfi + STFD [AOFFSET] = f81, SIZE + FNMA f113 = f105, f17, f113 + } + { .mfi + STFD [AOFFSET2] = f85, SIZE + FNMA f117 = f109, f17, f117 + } + ;; + { .mfi + STFD [AOFFSET] = f82, SIZE + FNMA f114 = f106, f17, f114 + } + { .mfi + STFD [AOFFSET2] = f86, SIZE + FNMA f118 = f110, f17, f118 + } + ;; + { .mfi + STFD [AOFFSET] = f83, 5 * SIZE + FNMA f115 = f107, f17, f115 + } + { .mfi + STFD [AOFFSET2] = f87, 5 * SIZE + FNMA f119 = f111, f17, f119 + } + ;; + { .mfi + STFD [AOFFSET] = f88, SIZE + FNMA f120 = f104, f18, f120 + } + { .mfi + STFD [AOFFSET2] = f92, SIZE + FNMA f124 = f108, f18, f124 + } + ;; + { .mfi + STFD [AOFFSET] = f89, SIZE + FNMA f121 = f105, f18, f121 + } + { .mfi + STFD [AOFFSET2] = f93, SIZE + FNMA f125 = f109, f18, f125 + } + ;; + { .mfi + STFD [AOFFSET] = f90, SIZE + FNMA f122 = f106, f18, f122 + } + { .mfi + STFD [AOFFSET2] = f94, SIZE + FNMA f126 = f110, f18, f126 + } + ;; + { .mfi + STFD [AOFFSET] = f91, 5 * SIZE + FNMA f123 = f107, f18, f123 + } + { .mfi + STFD [AOFFSET2] = f95, 5 * SIZE + FNMA f127 = f111, f18, f127 + } + ;; + { .mfi + STFD [AOFFSET] = f96, SIZE + FMPY f112 = f112, f19 + } + { .mfi + STFD [AOFFSET2] = f100, SIZE + FMPY f116 = f116, f19 + } + ;; + { .mfi + STFD [AOFFSET] = f97, SIZE + FMPY f113 = f113, f19 + } + { .mfi + STFD [AOFFSET2] = f101, SIZE + FMPY f117 = f117, f19 + } + ;; + { .mfi + STFD [AOFFSET] = f98, SIZE + FMPY f114 = f114, f19 + } + { .mfi + STFD [AOFFSET2] = f102, SIZE + FMPY f118 = f118, f19 + } + ;; + { .mfi + STFD [AOFFSET] = f99, 5 * SIZE + FMPY f115 = f115, f19 + } + { .mfi + STFD [AOFFSET2] = f103, 5 * SIZE + FMPY f119 = f119, f19 + } + ;; + { .mfi + STFD [AOFFSET] = f104, SIZE + FNMA f120 = f112, f20, f120 + } + { .mfi + STFD [AOFFSET2] = f108, SIZE + FNMA f124 = f116, f20, f124 + } + ;; + { .mfi + STFD [AOFFSET] = f105, SIZE + FNMA f121 = f113, f20, f121 + } + { .mfi + STFD [AOFFSET2] = f109, SIZE + FNMA f125 = f117, f20, f125 + } + ;; + { .mfi + STFD [AOFFSET] = f106, SIZE + FNMA f122 = f114, f20, f122 + } + { .mfi + STFD [AOFFSET2] = f110, SIZE + FNMA f126 = f118, f20, f126 + } + ;; + { .mfi + STFD [AOFFSET] = f107, 5 * SIZE + FNMA f123 = f115, f20, f123 + } + { .mfi + STFD [AOFFSET2] = f111, 5 * SIZE + FNMA f127 = f119, f20, f127 + } + ;; + { .mfi + STFD [AOFFSET] = f112, SIZE + FMPY f120 = f120, f21 + } + { .mfi + STFD [AOFFSET2] = f116, SIZE + FMPY f124 = f124, f21 + } + ;; + { .mfi + STFD [AOFFSET] = f113, SIZE + FMPY f121 = f121, f21 + } + { .mfi + STFD [AOFFSET2] = f117, 
SIZE + FMPY f125 = f125, f21 + } + ;; + { .mfi + STFD [AOFFSET] = f114, SIZE + FMPY f122 = f122, f21 + } + { .mfi + STFD [AOFFSET2] = f118, SIZE + FMPY f126 = f126, f21 + } + ;; + { .mfi + STFD [AOFFSET] = f115, 5 * SIZE + FMPY f123 = f123, f21 + } + { .mfi + STFD [AOFFSET2] = f119, 5 * SIZE + FMPY f127 = f127, f21 + } + ;; + { .mmi + STFD [AOFFSET] = f120, SIZE + STFD [AOFFSET2] = f124, SIZE + } + ;; + { .mmi + STFD [AOFFSET] = f121, SIZE + STFD [AOFFSET2] = f125, SIZE + } + ;; + { .mmi + STFD [AOFFSET] = f122, SIZE + STFD [AOFFSET2] = f126, SIZE + adds C9 = 4 * SIZE, C1 + } + ;; + { .mfi + STFD [AOFFSET] = f123 + adds AOFFSET = - 59 * SIZE, AOFFSET + } + { .mfi + STFD [AOFFSET2] = f127 + adds AOFFSET2 = - 59 * SIZE, AOFFSET2 + } + ;; +#endif + + { .mmf + STFD [C1 ] = f64, SIZE + STFD [C9 ] = f68, SIZE + mov f64 = f0 + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + STFD [C9 ] = f69, SIZE + adds C10 = 4 * SIZE, C2 + } + ;; + { .mmi + STFD [C1 ] = f66, SIZE + STFD [C9 ] = f70, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [C1 ] = f67, 5 * SIZE + STFD [C9 ] = f71 + adds C11 = 4 * SIZE, C3 + } + ;; + { .mmf + STFD [C2 ] = f72, SIZE + STFD [C10] = f76, SIZE + mov f72 = f0 + } + ;; + { .mmi + STFD [C2 ] = f73, SIZE + STFD [C10] = f77, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [C2 ] = f74, SIZE + STFD [C10] = f78, SIZE + adds C12 = 4 * SIZE, C4 + } + ;; + { .mmi + STFD [C2 ] = f75, 5 * SIZE + STFD [C10] = f79 + nop __LINE__ + } + ;; + { .mmf + STFD [C3 ] = f80, SIZE + STFD [C11] = f84, SIZE + mov f80 = f0 + } + ;; + { .mmi + STFD [C3 ] = f81, SIZE + STFD [C11] = f85, SIZE + adds C13 = 4 * SIZE, C5 + } + ;; + { .mmi + STFD [C3 ] = f82, SIZE + STFD [C11] = f86, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [C3 ] = f83, 5 * SIZE + STFD [C11] = f87 + adds C14 = 4 * SIZE, C6 + } + ;; + { .mmf + STFD [C4 ] = f88, SIZE + STFD [C12] = f92, SIZE + mov f88 = f0 + } + ;; + { .mmi + STFD [C4 ] = f89, SIZE + STFD [C12] = f93, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [C4 ] = f90, SIZE + STFD [C12] = f94, SIZE + adds C16 = 4 * SIZE, C8 + } + ;; + { .mmi + STFD [C4 ] = f91, 5 * SIZE + STFD [C12] = f95 + cmp.ne p6, p0 = 1, I + } + ;; + { .mmf + STFD [C5 ] = f96, SIZE + STFD [C13] = f100, SIZE + mov f96 = f0 + } + ;; + { .mmi + STFD [C5 ] = f97, SIZE + STFD [C13] = f101, SIZE + adds I = -1, I + } + ;; + { .mmi + STFD [C5 ] = f98, SIZE + STFD [C13] = f102, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [C5 ] = f99, 5 * SIZE + STFD [C13] = f103 + adds C15 = 4 * SIZE, C7 + } + ;; + { .mmf + STFD [C6 ] = f104, SIZE + STFD [C14] = f108, SIZE + mov f104 = f0 + } + ;; + { .mmi + STFD [C6 ] = f105, SIZE + STFD [C14] = f109, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [C6 ] = f106, SIZE + STFD [C14] = f110, SIZE + sub L = K, KK + } + ;; + { .mmi + STFD [C6 ] = f107, 5 * SIZE + STFD [C14] = f111 + nop __LINE__ + } + ;; + { .mmf + STFD [C7 ] = f112, SIZE + STFD [C15] = f116, SIZE + mov f112 = f0 + } + ;; + { .mmi + STFD [C7 ] = f113, SIZE + STFD [C15] = f117, SIZE + shladd L = L, BASE_SHIFT, r0 + } + ;; + { .mmi + STFD [C7 ] = f114, SIZE + STFD [C15] = f118, SIZE + shladd AOFFSET = L, 3, AOFFSET + } + ;; + { .mmi + STFD [C7 ] = f115, 5 * SIZE + STFD [C15] = f119 + shladd BOFFSET = L, 3, BOFFSET + } + ;; + { .mmf + STFD [C8 ] = f120, SIZE + STFD [C16] = f124, SIZE + mov f120 = f0 + } + ;; + { .mmi + STFD [C8 ] = f121, SIZE + STFD [C16] = f125, SIZE +#ifdef LT + adds KK = 8, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi + STFD [C8 ] = f122, SIZE + STFD [C16] = f126, SIZE + mov L = KK + } + ;; + { .mmb + STFD [C8 ] = 
f123, 5 * SIZE + STFD [C16] = f127 + (p6) br.cond.dptk .L011 + } + ;; + +.L020: + { .mib + mov L = KK + tbit.z p6, p0 = M, 2 + (p6) br.cond.dptk .L030 + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + nop __LINE__ + } + ;; + { .mmf + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + setf.d f73 = r0 + mov f65 = f0 + } + ;; + { .mfi + setf.d f105 = r0 + mov f81 = f0 + adds L = 1, L + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + mov f89 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f113 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + setf.d f97 = r0 + mov f121 = f0 + shr L = L, 1 + } + ;; + { .mmf + (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + setf.d f66 = r0 + mov f67 = f0 + } + { .mfi + setf.d f74 = r0 + mov f75 = f0 + adds L = -1, L + } + ;; + { .mmf + (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + setf.d f82 = r0 + mov f83 = f0 + } + { .mfi + setf.d f90 = r0 + mov f91 = f0 + cmp.eq p6, p0 = -1, L + } + ;; + { .mmf + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + setf.d f98 = r0 + mov f99 = f0 + } + { .mfi + setf.d f106 = r0 + mov f107 = f0 + mov ar.lc = L + } + ;; + { .mmf + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + setf.d f114 = r0 + mov f115 = f0 + } + { .mfb + setf.d f122 = r0 + mov f123 = f0 + (p6) br.cond.dpnt .L028 + } + ;; + +.L022: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + (p5) adds C9 = 2 * SIZE, C1 + } + { .mfi + nop __LINE__ + FMA f104 = f32, f53, f104 // A1 * B6 + (p5) adds C10 = 2 * SIZE, C2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + (p5) adds C11 = 2 * SIZE, C3 + } + { .mfi + nop __LINE__ + FMA f120 = f32, f55, f120 // A1 * B8 + (p5) adds C12 = 2 * SIZE, C4 + } + ;; + { .mfi + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + (p5) adds C13 = 2 * SIZE, C5 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + (p5) adds C14 = 2 * SIZE, C6 + } + ;; + { .mfi + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + (p5) adds C15 = 2 * SIZE, C7 + } + { .mfi + nop __LINE__ + FMA f89 = f33, f51, f89 // A2 * B4 + (p5) adds C16 = 2 * SIZE, C8 + } + ;; + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f105 = f33, f53, f105 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f121 = f33, f55, f121 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f90 = f34, f51, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f98 = f34, f52, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f106 = f34, f53, f106 // A3 * B6 + nop __LINE__ + } + + 
{ .mfb + nop __LINE__ + FMA f114 = f34, f54, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f122 = f34, f55, f122 // A3 * B8 + nop __LINE__ + } + + { .mfb + nop __LINE__ + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f91 = f35, f51, f91 // A4 * B4 + nop __LINE__ + } + + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f99 = f35, f52, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f107 = f35, f53, f107 // A4 * B6 + nop __LINE__ + } + + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f115 = f35, f54, f115 // A4 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f123 = f35, f55, f123 // A4 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f104 = f40, f61, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f120 = f40, f63, f120 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f105 = f41, f61, f105 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f121 = f41, f63, f121 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f90 = f42, f59, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f98 = f42, f60, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f106 = f42, f61, f106 // A3 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f114 = f42, f62, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f122 = f42, f63, f122 // A3 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f91 = f43, f59, f91 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f99 = f43, f60, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f107 = 
f43, f61, f107 // A4 * B6 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f115 = f43, f62, f115 // A4 * B7 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f123 = f43, f63, f123 // A4 * B8 + br.cloop.sptk.few .L022 + } + ;; + +.L028: +#ifdef LT + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [BOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [BOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [BOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [BOFFSET], 2 * SIZE + ;; + { .mfi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FSUB f64 = f32, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f72 = f33, f72 + nop __LINE__ + } + ;; + { .mfi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + FSUB f80 = f34, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f88 = f35, f88 + nop __LINE__ + } + ;; + { .mfi + LDFPD f52, f53 = [BOFFSET], 2 * SIZE + FSUB f96 = f36, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f104 = f37, f104 + nop __LINE__ + } + ;; + { .mfi + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + FSUB f112 = f38, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f120 = f39, f120 + nop __LINE__ + } + ;; + { .mfi + LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FSUB f65 = f40, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f73 = f41, f73 + nop __LINE__ + } + ;; + { .mfi + LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FSUB f81 = f42, f81 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f89 = f43, f89 + nop __LINE__ + } + ;; + { .mfi + LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FSUB f97 = f44, f97 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f105 = f45, f105 + nop __LINE__ + } + ;; + { .mfi + LDFPD f62, f63 = [BOFFSET] + FSUB f113 = f46, f113 + adds BOFFSET = -30 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FSUB f121 = f47, f121 + nop __LINE__ + } + ;; + FSUB f66 = f48, f66 + FSUB f74 = f49, f74 + FSUB f82 = f50, f82 + FSUB f90 = f51, f90 + FSUB f98 = f52, f98 + FSUB f106 = f53, f106 + FSUB f114 = f54, f114 + FSUB f122 = f55, f122 + ;; + FSUB f67 = f56, f67 + FSUB f75 = f57, f75 + FSUB f83 = f58, f83 + FSUB f91 = f59, f91 + FSUB f99 = f60, f99 + FSUB f107 = f61, f107 + FSUB f115 = f62, f115 + FSUB f123 = f63, f123 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [AOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [AOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [AOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [AOFFSET], 2 * SIZE + ;; + LDFPD f48, f49 = [AOFFSET], 2 * SIZE + ;; + LDFPD f50, f51 = [AOFFSET], 2 * SIZE + ;; + LDFPD f52, f53 = [AOFFSET], 2 * SIZE + ;; + LDFPD f54, f55 = [AOFFSET], 2 * SIZE + ;; + LDFPD f56, f57 = [AOFFSET], 2 * SIZE + ;; + LDFPD f58, f59 = [AOFFSET], 2 * SIZE + ;; + LDFPD f60, f61 = [AOFFSET], 2 * SIZE + ;; + LDFPD f62, f63 = [AOFFSET] + adds AOFFSET = -30 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f66 = f34, f66 + FSUB f67 = f35, f67 + + FSUB f72 = f36, f72 + FSUB f73 = f37, f73 + FSUB f74 = f38, f74 + FSUB f75 = f39, f75 + + FSUB f80 = f40, f80 + FSUB f81 = f41, f81 + FSUB f82 = f42, f82 + FSUB f83 = f43, f83 + + FSUB f88 = f44, f88 + FSUB f89 = f45, f89 + FSUB f90 = f46, f90 + FSUB f91 = f47, f91 + ;; + FSUB f96 = f48, f96 + FSUB f97 = f49, f97 + FSUB f98 = f50, f98 + FSUB f99 = f51, f99 + ;; + FSUB f104 = f52, f104 + FSUB f105 = f53, f105 + FSUB f106 = f54, f106 + 
FSUB f107 = f55, f107 + ;; + FSUB f112 = f56, f112 + FSUB f113 = f57, f113 + FSUB f114 = f58, f114 + FSUB f115 = f59, f115 + ;; + FSUB f120 = f60, f120 + FSUB f121 = f61, f121 + FSUB f122 = f62, f122 + FSUB f123 = f63, f123 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f36 = [AOFFSET], 1 * SIZE + ;; + LDFPD f37, f38 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f39, f40 = [AOFFSET] + adds AOFFSET = 5 * SIZE, AOFFSET + ;; + LDFD f41 = [AOFFSET], -15 * SIZE + ;; + { .mfi + FMPY f64 = f64, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f96 = f96, f32 + nop __LINE__ + } + ;; + { .mfi + FMPY f72 = f72, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f104 = f104, f32 + nop __LINE__ + } + ;; + { .mfi + FMPY f80 = f80, f32 + } + { .mfi + nop __LINE__ + FMPY f112 = f112, f32 + nop __LINE__ + } + ;; + { .mfi + FMPY f88 = f88, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f120 = f120, f32 + nop __LINE__ + } + ;; + { .mfi + FNMA f65 = f64, f33, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f97 = f96, f33, f97 + nop __LINE__ + } + ;; + { .mfi + FNMA f73 = f72, f33, f73 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f105 = f104, f33, f105 + nop __LINE__ + } + ;; + { .mfi + FNMA f81 = f80, f33, f81 + } + { .mfi + nop __LINE__ + FNMA f113 = f112, f33, f113 + nop __LINE__ + } + ;; + { .mfi + FNMA f89 = f88, f33, f89 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f121 = f120, f33, f121 + nop __LINE__ + } + ;; + { .mfi + FNMA f66 = f64, f34, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f98 = f96, f34, f98 + nop __LINE__ + } + ;; + { .mfi + FNMA f74 = f72, f34, f74 + } + { .mfi + nop __LINE__ + FNMA f106 = f104, f34, f106 + nop __LINE__ + } + ;; + { .mfi + FNMA f82 = f80, f34, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f114 = f112, f34, f114 + nop __LINE__ + } + ;; + { .mfi + FNMA f90 = f88, f34, f90 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f122 = f120, f34, f122 + nop __LINE__ + } + ;; + { .mfi + FNMA f67 = f64, f35, f67 + } + { .mfi + nop __LINE__ + FNMA f99 = f96, f35, f99 + nop __LINE__ + } + ;; + { .mfi + FNMA f75 = f72, f35, f75 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f107 = f104, f35, f107 + nop __LINE__ + } + ;; + { .mfi + FNMA f83 = f80, f35, f83 + } + { .mfi + nop __LINE__ + FNMA f115 = f112, f35, f115 + nop __LINE__ + } + ;; + { .mfi + FNMA f91 = f88, f35, f91 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f123 = f120, f35, f123 + adds BOFFSET2 = 4 * SIZE, BOFFSET + } + ;; + FMPY f65 = f65, f36 + FMPY f97 = f97, f36 + FMPY f73 = f73, f36 + FMPY f105 = f105, f36 + FMPY f81 = f81, f36 + FMPY f113 = f113, f36 + FMPY f89 = f89, f36 + FMPY f121 = f121, f36 + ;; + FNMA f66 = f65, f37, f66 + FNMA f98 = f97, f37, f98 + FNMA f74 = f73, f37, f74 + FNMA f106 = f105, f37, f106 + FNMA f82 = f81, f37, f82 + FNMA f114 = f113, f37, f114 + FNMA f90 = f89, f37, f90 + FNMA f122 = f121, f37, f122 + ;; + FNMA f67 = f65, f38, f67 + FNMA f99 = f97, f38, f99 + FNMA f75 = f73, f38, f75 + FNMA f107 = f105, f38, f107 + FNMA f83 = f81, f38, f83 + FNMA f115 = f113, f38, f115 + FNMA f91 = f89, f38, f91 + FNMA f123 = f121, f38, f123 + ;; + { .mfi + STFD [BOFFSET] = f64, SIZE + FMPY f66 = f66, f39 + } + { .mfi + STFD [BOFFSET2] = f96, SIZE + FMPY f98 = f98, f39 + } + ;; + { .mfi + STFD [BOFFSET] = f72, SIZE + FMPY f74 = f74, f39 + } + { .mfi + STFD [BOFFSET2] = f104, SIZE + FMPY f106 = f106, f39 + } + ;; + { .mfi + STFD [BOFFSET] = f80, SIZE 
+ FMPY f82 = f82, f39 + } + { .mfi + STFD [BOFFSET2] = f112, SIZE + FMPY f114 = f114, f39 + } + ;; + { .mfi + STFD [BOFFSET] = f88, 5 * SIZE + FMPY f90 = f90, f39 + } + { .mfi + STFD [BOFFSET2] = f120, 5 * SIZE + FMPY f122 = f122, f39 + } + ;; + { .mfi + STFD [BOFFSET] = f65, SIZE + FNMA f67 = f66, f40, f67 + } + { .mfi + STFD [BOFFSET2] = f97, SIZE + FNMA f99 = f98, f40, f99 + } + ;; + { .mfi + STFD [BOFFSET] = f73, SIZE + FNMA f75 = f74, f40, f75 + } + { .mfi + STFD [BOFFSET2] = f105, SIZE + FNMA f107 = f106, f40, f107 + } + ;; + { .mfi + STFD [BOFFSET] = f81, SIZE + FNMA f83 = f82, f40, f83 + } + { .mfi + STFD [BOFFSET2] = f113, SIZE + FNMA f115 = f114, f40, f115 + } + ;; + { .mfi + STFD [BOFFSET] = f89, 5 * SIZE + FNMA f91 = f90, f40, f91 + } + { .mfi + STFD [BOFFSET2] = f121, 5 * SIZE + FNMA f123 = f122, f40, f123 + } + ;; + { .mfi + STFD [BOFFSET] = f66, SIZE + FMPY f67 = f67, f41 + } + { .mfi + STFD [BOFFSET2] = f98, SIZE + FMPY f99 = f99, f41 + } + ;; + { .mfi + STFD [BOFFSET] = f74, SIZE + FMPY f75 = f75, f41 + } + { .mfi + STFD [BOFFSET2] = f106, SIZE + FMPY f107 = f107, f41 + } + ;; + { .mfi + STFD [BOFFSET] = f82, SIZE + FMPY f83 = f83, f41 + } + { .mfi + STFD [BOFFSET2] = f114, SIZE + FMPY f115 = f115, f41 + } + ;; + { .mfi + STFD [BOFFSET] = f90, 5 * SIZE + FMPY f91 = f91, f41 + } + { .mfi + STFD [BOFFSET2] = f122, 5 * SIZE + FMPY f123 = f123, f41 + } + ;; + { .mmf + STFD [BOFFSET] = f67, SIZE + STFD [BOFFSET2] = f99, SIZE + } + ;; + { .mmf + STFD [BOFFSET] = f75, SIZE + STFD [BOFFSET2] = f107, SIZE + } + ;; + { .mmf + STFD [BOFFSET] = f83, SIZE + STFD [BOFFSET2] = f115, SIZE + } + ;; + { .mmf + STFD [BOFFSET] = f91, -27 * SIZE + STFD [BOFFSET2] = f123, -27 * SIZE + } + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f40 = [BOFFSET], 1 * SIZE + ;; + LDFPD f41, f42 = [BOFFSET], 2 * SIZE + ;; + LDFPD f43, f44 = [BOFFSET], 2 * SIZE + ;; + LDFPD f45, f46 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f47, f48 = [BOFFSET], 2 * SIZE + ;; + LDFPD f49, f50 = [BOFFSET], 2 * SIZE + ;; + LDFPD f51, f52 = [BOFFSET] + adds BOFFSET = 5 * SIZE, BOFFSET + ;; + LDFD f53 = [BOFFSET], 1 * SIZE + ;; + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + ;; + LDFPD f56, f57 = [BOFFSET] + adds BOFFSET = 6 * SIZE, BOFFSET + ;; + LDFPD f58, f59 = [BOFFSET], 2 * SIZE + adds AOFFSET2 = 4 * SIZE, AOFFSET + ;; + LDFPD f60, f61 = [BOFFSET] + adds BOFFSET = 7 * SIZE, BOFFSET + ;; + LDFD f16 = [BOFFSET], 1 * SIZE + ;; + LDFPD f17, f18 = [BOFFSET] + adds BOFFSET = 8 * SIZE, BOFFSET + ;; + LDFPD f19, f20 = [BOFFSET] + adds BOFFSET = 9 * SIZE, BOFFSET + ;; + LDFD f21 = [BOFFSET] + adds BOFFSET = -63 * SIZE, BOFFSET + ;; + + + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + FMPY f66 = f66, f32 + FMPY f67 = f67, f32 + ;; + FNMA f72 = f64, f33, f72 + FNMA f73 = f65, f33, f73 + FNMA f74 = f66, f33, f74 + FNMA f75 = f67, f33, f75 + ;; + FNMA f80 = f64, f34, f80 + FNMA f81 = f65, f34, f81 + FNMA f82 = f66, f34, f82 + FNMA f83 = f67, f34, f83 + ;; + FNMA f88 = f64, f35, f88 + FNMA f89 = f65, f35, f89 + FNMA f90 = f66, f35, f90 + FNMA f91 = f67, f35, f91 + ;; + FNMA f96 = f64, f36, f96 + FNMA f97 = f65, f36, f97 + FNMA f98 = f66, f36, f98 + FNMA f99 = f67, f36, f99 + ;; + FNMA f104 = f64, f37, f104 + FNMA f105 = f65, f37, f105 + FNMA f106 = f66, f37, f106 + FNMA f107 = f67, f37, f107 + ;; + FNMA f112 = f64, f38, f112 + FNMA f113 = f65, f38, 
f113 + FNMA f114 = f66, f38, f114 + FNMA f115 = f67, f38, f115 + ;; + FNMA f120 = f64, f39, f120 + FNMA f121 = f65, f39, f121 + FNMA f122 = f66, f39, f122 + FNMA f123 = f67, f39, f123 + ;; + FMPY f72 = f72, f40 + FMPY f73 = f73, f40 + FMPY f74 = f74, f40 + FMPY f75 = f75, f40 + ;; + FNMA f80 = f72, f41, f80 + FNMA f81 = f73, f41, f81 + FNMA f82 = f74, f41, f82 + FNMA f83 = f75, f41, f83 + ;; + FNMA f88 = f72, f42, f88 + FNMA f89 = f73, f42, f89 + FNMA f90 = f74, f42, f90 + FNMA f91 = f75, f42, f91 + ;; + FNMA f96 = f72, f43, f96 + FNMA f97 = f73, f43, f97 + FNMA f98 = f74, f43, f98 + FNMA f99 = f75, f43, f99 + ;; + FNMA f104 = f72, f44, f104 + FNMA f105 = f73, f44, f105 + FNMA f106 = f74, f44, f106 + FNMA f107 = f75, f44, f107 + ;; + FNMA f112 = f72, f45, f112 + FNMA f113 = f73, f45, f113 + FNMA f114 = f74, f45, f114 + FNMA f115 = f75, f45, f115 + ;; + FNMA f120 = f72, f46, f120 + FNMA f121 = f73, f46, f121 + FNMA f122 = f74, f46, f122 + FNMA f123 = f75, f46, f123 + ;; + FMPY f80 = f80, f47 + FMPY f81 = f81, f47 + FMPY f82 = f82, f47 + FMPY f83 = f83, f47 + ;; + FNMA f88 = f80, f48, f88 + FNMA f89 = f81, f48, f89 + FNMA f90 = f82, f48, f90 + FNMA f91 = f83, f48, f91 + ;; + FNMA f96 = f80, f49, f96 + FNMA f97 = f81, f49, f97 + FNMA f98 = f82, f49, f98 + FNMA f99 = f83, f49, f99 + ;; + FNMA f104 = f80, f50, f104 + FNMA f105 = f81, f50, f105 + FNMA f106 = f82, f50, f106 + FNMA f107 = f83, f50, f107 + ;; + FNMA f112 = f80, f51, f112 + FNMA f113 = f81, f51, f113 + FNMA f114 = f82, f51, f114 + FNMA f115 = f83, f51, f115 + ;; + FNMA f120 = f80, f52, f120 + FNMA f121 = f81, f52, f121 + FNMA f122 = f82, f52, f122 + FNMA f123 = f83, f52, f123 + ;; + FMPY f88 = f88, f53 + FMPY f89 = f89, f53 + FMPY f90 = f90, f53 + FMPY f91 = f91, f53 + ;; + FNMA f96 = f88, f54, f96 + FNMA f97 = f89, f54, f97 + FNMA f98 = f90, f54, f98 + FNMA f99 = f91, f54, f99 + ;; + FNMA f104 = f88, f55, f104 + FNMA f105 = f89, f55, f105 + FNMA f106 = f90, f55, f106 + FNMA f107 = f91, f55, f107 + ;; + FNMA f112 = f88, f56, f112 + FNMA f113 = f89, f56, f113 + FNMA f114 = f90, f56, f114 + FNMA f115 = f91, f56, f115 + ;; + FNMA f120 = f88, f57, f120 + FNMA f121 = f89, f57, f121 + FNMA f122 = f90, f57, f122 + FNMA f123 = f91, f57, f123 + ;; + FMPY f96 = f96, f58 + FMPY f97 = f97, f58 + FMPY f98 = f98, f58 + FMPY f99 = f99, f58 + ;; + FNMA f104 = f96, f59, f104 + FNMA f105 = f97, f59, f105 + FNMA f106 = f98, f59, f106 + FNMA f107 = f99, f59, f107 + ;; + FNMA f112 = f96, f60, f112 + FNMA f113 = f97, f60, f113 + FNMA f114 = f98, f60, f114 + FNMA f115 = f99, f60, f115 + ;; + FNMA f120 = f96, f61, f120 + FNMA f121 = f97, f61, f121 + FNMA f122 = f98, f61, f122 + FNMA f123 = f99, f61, f123 + ;; + { .mfi + STFD [AOFFSET] = f64, SIZE + FMPY f104 = f104, f16 + } + { .mfi + STFD [AOFFSET2] = f72, SIZE + FMPY f105 = f105, f16 + } + ;; + { .mfi + STFD [AOFFSET] = f65, SIZE + FMPY f106 = f106, f16 + } + { .mfi + STFD [AOFFSET2] = f73, SIZE + FMPY f107 = f107, f16 + } + ;; + { .mfi + STFD [AOFFSET] = f66, SIZE + FNMA f112 = f104, f17, f112 + } + { .mfi + STFD [AOFFSET2] = f74, SIZE + FNMA f113 = f105, f17, f113 + } + ;; + { .mfi + STFD [AOFFSET] = f67, 5 * SIZE + FNMA f114 = f106, f17, f114 + } + { .mfi + STFD [AOFFSET2] = f75, 5 * SIZE + FNMA f115 = f107, f17, f115 + } + ;; + { .mfi + STFD [AOFFSET] = f80, SIZE + FNMA f120 = f104, f18, f120 + } + { .mfi + STFD [AOFFSET2] = f88, SIZE + FNMA f121 = f105, f18, f121 + } + ;; + { .mfi + STFD [AOFFSET] = f81, SIZE + FNMA f122 = f106, f18, f122 + } + { .mfi + STFD [AOFFSET2] = f89, SIZE + FNMA f123 = 
f107, f18, f123 + } + ;; + { .mfi + STFD [AOFFSET] = f82, SIZE + FMPY f112 = f112, f19 + } + { .mfi + STFD [AOFFSET2] = f90, SIZE + FMPY f113 = f113, f19 + } + ;; + { .mfi + STFD [AOFFSET] = f83, 5 * SIZE + FMPY f114 = f114, f19 + } + { .mfi + STFD [AOFFSET2] = f91, 5 * SIZE + FMPY f115 = f115, f19 + } + ;; + { .mfi + STFD [AOFFSET] = f96, SIZE + FNMA f120 = f112, f20, f120 + } + { .mfi + STFD [AOFFSET2] = f104, SIZE + FNMA f121 = f113, f20, f121 + } + ;; + { .mfi + STFD [AOFFSET] = f97, SIZE + FNMA f122 = f114, f20, f122 + } + { .mfi + STFD [AOFFSET2] = f105, SIZE + FNMA f123 = f115, f20, f123 + } + ;; + { .mfi + STFD [AOFFSET] = f98, SIZE + FMPY f120 = f120, f21 + } + { .mfi + STFD [AOFFSET2] = f106, SIZE + FMPY f121 = f121, f21 + } + ;; + { .mfi + STFD [AOFFSET] = f99, 5 * SIZE + FMPY f122 = f122, f21 + } + { .mfi + STFD [AOFFSET2] = f107, 5 * SIZE + FMPY f123 = f123, f21 + } + ;; + { .mmf + STFD [AOFFSET] = f112, SIZE + STFD [AOFFSET2] = f120, SIZE + } + ;; + { .mmf + STFD [AOFFSET] = f113, SIZE + STFD [AOFFSET2] = f121, SIZE + } + ;; + { .mmf + STFD [AOFFSET] = f114, SIZE + STFD [AOFFSET2] = f122, SIZE + } + ;; + { .mmf + STFD [AOFFSET] = f115, -27 * SIZE + STFD [AOFFSET2] = f123, - 27 * SIZE + } + ;; +#endif + + { .mmf + STFD [C1 ] = f64, SIZE + STFD [C2 ] = f72, SIZE + mov f64 = f0 + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + STFD [C2 ] = f73, SIZE + nop __LINE__ + } + ;; + { .mmf + STFD [C1 ] = f66, SIZE + STFD [C2 ] = f74, SIZE + mov f72 = f0 + } + ;; + { .mmi + STFD [C1 ] = f67, SIZE + STFD [C2 ] = f75, SIZE + sub L = K, KK + } + ;; + { .mmf + STFD [C3 ] = f80, SIZE + STFD [C4 ] = f88, SIZE + mov f80 = f0 + } + ;; + { .mmi + STFD [C3 ] = f81, SIZE + STFD [C4 ] = f89, SIZE + shladd L = L, BASE_SHIFT, r0 + } + ;; + { .mmf + STFD [C3 ] = f82, SIZE + STFD [C4 ] = f90, SIZE + mov f88 = f0 + } + ;; + { .mmi + STFD [C3 ] = f83, SIZE + STFD [C4 ] = f91, SIZE + shladd AOFFSET = L, 2, AOFFSET + } + ;; + { .mmf + STFD [C5 ] = f96, SIZE + STFD [C6 ] = f104, SIZE + mov f96 = f0 + } + ;; + { .mmi + STFD [C5 ] = f97, SIZE + STFD [C6 ] = f105, SIZE + shladd BOFFSET = L, 3, BOFFSET + } + ;; + { .mmf + STFD [C5 ] = f98, SIZE + STFD [C6 ] = f106, SIZE + mov f104 = f0 + } + ;; + { .mmi + STFD [C5 ] = f99, SIZE + STFD [C6 ] = f107, SIZE +#ifdef LT + adds KK = 4, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmf + STFD [C7 ] = f112, SIZE + STFD [C8 ] = f120, SIZE + mov f112 = f0 + } + ;; + { .mmi + STFD [C7 ] = f113, SIZE + STFD [C8 ] = f121, SIZE + mov L = KK + } + ;; + { .mmf + STFD [C7 ] = f114, SIZE + STFD [C8 ] = f122, SIZE + mov f120 = f0 + } + ;; + { .mmi + STFD [C7 ] = f115, SIZE + STFD [C8 ] = f123, SIZE + nop __LINE__ + } + ;; + .align 8 + +.L030: + { .mib + mov L = KK + tbit.z p6, p0 = M, 1 + (p6) br.cond.dptk .L040 + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + nop __LINE__ + } + ;; + { .mmi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + setf.d f73 = r0 + adds L = 1, L + } + ;; + { .mfi + setf.d f105 = r0 + mov f81 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + mov f89 = f0 + nop __LINE__ + } + ;; + { .mmf + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + nop __LINE__ + mov f65 = f0 + } + ;; + { .mfi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f113 = f0 + cmp.eq p3, p0 = r0, r0 + } + { .mfi + setf.d f97 = r0 + mov f121 = f0 + shr L = L, 1 + } + ;; + { .mmi + (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + cmp.eq p6, p0 = 0, L + adds L = -1, L + } + ;; + { .mib + (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + mov ar.lc = L + 
(p6) br.cond.dpnt .L038 + } + ;; + +.L032: + { .mfb + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f104 = f32, f53, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f120 = f32, f55, f120 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f89 = f33, f51, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f105 = f33, f53, f105 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f121 = f33, f55, f121 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f104 = f40, f61, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f120 = f40, f63, f120 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f105 = f41, f61, f105 // A2 * B6 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f121 = f41, f63, f121 // A2 * B8 + br.cloop.sptk.few .L032 + } + ;; + +.L038: + +#ifdef LT + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [BOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [BOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [BOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [BOFFSET] + adds BOFFSET = -14 * SIZE, BOFFSET + ;; + { .mfi + FSUB f64 = f32, 
f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f72 = f33, f72 + nop __LINE__ + } + ;; + { .mfi + FSUB f80 = f34, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f88 = f35, f88 + nop __LINE__ + } + ;; + { .mfi + FSUB f96 = f36, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f104 = f37, f104 + nop __LINE__ + } + ;; + { .mfi + FSUB f112 = f38, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f120 = f39, f120 + nop __LINE__ + } + ;; + { .mfi + FSUB f65 = f40, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f73 = f41, f73 + nop __LINE__ + } + ;; + { .mfi + FSUB f81 = f42, f81 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f89 = f43, f89 + nop __LINE__ + } + ;; + { .mfi + FSUB f97 = f44, f97 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f105 = f45, f105 + nop __LINE__ + } + ;; + { .mfi + FSUB f113 = f46, f113 + } + { .mfi + nop __LINE__ + FSUB f121 = f47, f121 + nop __LINE__ + } + ;; + + { .mmi + LDFPD f32, f33 = [AOFFSET] + nop __LINE__ + adds AOFFSET = 3 * SIZE, AOFFSET + } + ;; + { .mfi + LDFD f34 = [AOFFSET], - 3 * SIZE + FMPY f64 = f64, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f96 = f96, f32 + nop __LINE__ + } + ;; + { .mfi + FMPY f72 = f72, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f104 = f104, f32 + nop __LINE__ + } + ;; + { .mfi + FMPY f80 = f80, f32 + } + { .mfi + nop __LINE__ + FMPY f112 = f112, f32 + nop __LINE__ + } + ;; + { .mfi + FMPY f88 = f88, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f120 = f120, f32 + nop __LINE__ + } + ;; + { .mfi + FNMA f65 = f64, f33, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f97 = f96, f33, f97 + nop __LINE__ + } + ;; + { .mfi + FNMA f73 = f72, f33, f73 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f105 = f104, f33, f105 + nop __LINE__ + } + ;; + { .mfi + FNMA f81 = f80, f33, f81 + } + { .mfi + nop __LINE__ + FNMA f113 = f112, f33, f113 + nop __LINE__ + } + ;; + { .mfi + FNMA f89 = f88, f33, f89 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f121 = f120, f33, f121 + adds BOFFSET2 = 4 * SIZE, BOFFSET + } + ;; + { .mfi + STFD [BOFFSET] = f64, SIZE + FMPY f65 = f65, f34 + } + { .mfi + STFD [BOFFSET2] = f96, SIZE + FMPY f97 = f97, f34 + } + ;; + { .mfi + STFD [BOFFSET] = f72, SIZE + FMPY f73 = f73, f34 + } + { .mfi + STFD [BOFFSET2] = f104, SIZE + FMPY f105 = f105, f34 + } + ;; + { .mfi + STFD [BOFFSET] = f80, SIZE + FMPY f81 = f81, f34 + sub L = K, KK + } + { .mfi + STFD [BOFFSET2] = f112, SIZE + FMPY f113 = f113, f34 + } + ;; + { .mfi + STFD [BOFFSET] = f88, 5 * SIZE + FMPY f89 = f89, f34 + shladd L = L, BASE_SHIFT, r0 + } + { .mfi + STFD [BOFFSET2] = f120, 5 * SIZE + FMPY f121 = f121, f34 + } + ;; + { .mmi + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f97, SIZE + } + ;; + { .mmi + STFD [BOFFSET] = f73, SIZE + STFD [BOFFSET2] = f105, SIZE + } + ;; + { .mmi + STFD [BOFFSET] = f81, SIZE + STFD [BOFFSET2] = f113, SIZE + } + ;; + { .mmi + STFD [BOFFSET] = f89, -11 * SIZE + STFD [BOFFSET2] = f121, -11 * SIZE + } +#endif + +#ifdef RN + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [AOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [AOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [AOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [AOFFSET] + adds AOFFSET = -14 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f72 = f34, f72 + FSUB f73 = f35, f73 + FSUB f80 = f36, f80 + FSUB f81 = f37, f81 + FSUB f88 = f38, f88 + FSUB f89 = 
f39, f89 + FSUB f96 = f40, f96 + FSUB f97 = f41, f97 + FSUB f104 = f42, f104 + FSUB f105 = f43, f105 + FSUB f112 = f44, f112 + FSUB f113 = f45, f113 + FSUB f120 = f46, f120 + FSUB f121 = f47, f121 + ;; + + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f40 = [BOFFSET], 1 * SIZE + ;; + LDFPD f41, f42 = [BOFFSET], 2 * SIZE + ;; + LDFPD f43, f44 = [BOFFSET], 2 * SIZE + ;; + LDFPD f45, f46 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f47, f48 = [BOFFSET], 2 * SIZE + ;; + LDFPD f49, f50 = [BOFFSET], 2 * SIZE + ;; + LDFPD f51, f52 = [BOFFSET] + adds BOFFSET = 5 * SIZE, BOFFSET + ;; + LDFD f53 = [BOFFSET], 1 * SIZE + ;; + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + ;; + LDFPD f56, f57 = [BOFFSET] + adds BOFFSET = 6 * SIZE, BOFFSET + ;; + LDFPD f58, f59 = [BOFFSET], 2 * SIZE + ;; + LDFPD f60, f61 = [BOFFSET] + adds BOFFSET = 7 * SIZE, BOFFSET + ;; + LDFD f16 = [BOFFSET], 1 * SIZE + ;; + LDFPD f17, f18 = [BOFFSET] + adds BOFFSET = 8 * SIZE, BOFFSET + ;; + LDFPD f19, f20 = [BOFFSET] + adds BOFFSET = 9 * SIZE, BOFFSET + ;; + LDFD f21 = [BOFFSET] + adds BOFFSET = -63 * SIZE, BOFFSET + adds AOFFSET2 = 4 * SIZE, AOFFSET + ;; + + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + ;; + FNMA f72 = f64, f33, f72 + FNMA f73 = f65, f33, f73 + FNMA f80 = f64, f34, f80 + FNMA f81 = f65, f34, f81 + ;; + FNMA f88 = f64, f35, f88 + FNMA f89 = f65, f35, f89 + FNMA f96 = f64, f36, f96 + FNMA f97 = f65, f36, f97 + FMPY f72 = f72, f40 + FMPY f73 = f73, f40 + FNMA f104 = f64, f37, f104 + FNMA f105 = f65, f37, f105 + FNMA f112 = f64, f38, f112 + FNMA f113 = f65, f38, f113 + FNMA f120 = f64, f39, f120 + FNMA f121 = f65, f39, f121 + ;; + FNMA f80 = f72, f41, f80 + FNMA f81 = f73, f41, f81 + FNMA f88 = f72, f42, f88 + FNMA f89 = f73, f42, f89 + ;; + FNMA f96 = f72, f43, f96 + FNMA f97 = f73, f43, f97 + FNMA f104 = f72, f44, f104 + FNMA f105 = f73, f44, f105 + FMPY f80 = f80, f47 + FMPY f81 = f81, f47 + FNMA f112 = f72, f45, f112 + FNMA f113 = f73, f45, f113 + FNMA f120 = f72, f46, f120 + FNMA f121 = f73, f46, f121 + ;; + FNMA f88 = f80, f48, f88 + FNMA f89 = f81, f48, f89 + FNMA f96 = f80, f49, f96 + FNMA f97 = f81, f49, f97 + FNMA f104 = f80, f50, f104 + FNMA f105 = f81, f50, f105 + FNMA f112 = f80, f51, f112 + FNMA f113 = f81, f51, f113 + ;; + FMPY f88 = f88, f53 + FMPY f89 = f89, f53 + FNMA f120 = f80, f52, f120 + FNMA f121 = f81, f52, f121 + ;; + FNMA f96 = f88, f54, f96 + FNMA f97 = f89, f54, f97 + FNMA f104 = f88, f55, f104 + FNMA f105 = f89, f55, f105 + FNMA f112 = f88, f56, f112 + FNMA f113 = f89, f56, f113 + FNMA f120 = f88, f57, f120 + FNMA f121 = f89, f57, f121 + ;; + FMPY f96 = f96, f58 + FMPY f97 = f97, f58 + ;; + FNMA f104 = f96, f59, f104 + FNMA f105 = f97, f59, f105 + FNMA f112 = f96, f60, f112 + FNMA f113 = f97, f60, f113 + FNMA f120 = f96, f61, f120 + FNMA f121 = f97, f61, f121 + ;; + FMPY f104 = f104, f16 + FMPY f105 = f105, f16 + ;; + FNMA f112 = f104, f17, f112 + FNMA f113 = f105, f17, f113 + ;; + { .mfi + STFD [AOFFSET] = f64, SIZE + FNMA f120 = f104, f18, f120 + } + { .mfi + STFD [AOFFSET2] = f80, SIZE + FNMA f121 = f105, f18, f121 + } + ;; + { .mfi + STFD [AOFFSET] = f65, SIZE + FMPY f112 = f112, f19 + } + { .mfi + STFD [AOFFSET2] = f81, SIZE + FMPY f113 = f113, f19 + } + ;; + { .mfi + STFD [AOFFSET] = f72, SIZE + FNMA f120 = f112, f20, f120 + sub L = K, KK + } + { .mfi + STFD [AOFFSET2] = f88, SIZE + FNMA f121 = f113, f20, f121 + } + ;; + 
{ .mfi + STFD [AOFFSET] = f73, 5 * SIZE + FMPY f120 = f120, f21 + shladd L = L, BASE_SHIFT, r0 + } + { .mfi + STFD [AOFFSET2] = f89, 5 * SIZE + FMPY f121 = f121, f21 + } + ;; + { .mmi + STFD [AOFFSET] = f96, SIZE + STFD [AOFFSET2] = f112, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [AOFFSET] = f97, SIZE + STFD [AOFFSET2] = f113, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [AOFFSET] = f104, SIZE + STFD [AOFFSET2] = f120, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [AOFFSET] = f105, -11 * SIZE + STFD [AOFFSET2] = f121, - 11 * SIZE + nop __LINE__ + } + ;; +#endif + { .mmf + STFD [C1 ] = f64, SIZE + STFD [C2 ] = f72, SIZE + mov f72 = f0 + } + ;; + { .mmf + STFD [C1 ] = f65, SIZE + STFD [C2 ] = f73, SIZE + mov f64 = f0 + } + ;; + { .mmf + STFD [C3 ] = f80, SIZE + STFD [C4 ] = f88, SIZE + mov f88 = f0 + } + ;; + { .mmf + STFD [C3 ] = f81, SIZE + STFD [C4 ] = f89, SIZE + mov f80 = f0 + } + ;; + { .mmf + STFD [C5 ] = f96, SIZE + STFD [C6 ] = f104, SIZE + mov f96 = f0 + } + ;; + { .mmf + STFD [C5 ] = f97, SIZE + STFD [C6 ] = f105, SIZE + mov f104 = f0 + } + ;; + { .mmf + STFD [C7 ] = f112, SIZE + STFD [C8 ] = f120, SIZE + mov f112 = f0 + } + ;; + { .mmf + STFD [C7 ] = f113, SIZE + STFD [C8 ] = f121, SIZE + mov f120 = f0 + } + { .mmi + shladd AOFFSET = L, 1, AOFFSET + shladd BOFFSET = L, 3, BOFFSET +#ifdef LT + adds KK = 2, KK +#else + nop __LINE__ +#endif + } + ;; + .align 8 + +.L040: + { .mib + mov L = KK + tbit.z p6, p0 = M, 0 + (p6) br.cond.dptk .L049 + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + } + ;; + { .mmf + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + } + ;; + { .mmi + adds L = 1, L + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mii + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + adds L = -1, L + } + ;; + { .mmi + (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + cmp.eq p6, p0 = -1, L + } + ;; + { .mib + (p7) LDFD f32 = [AOFFSET], 1 * SIZE + mov ar.lc = L + (p6) br.cond.dpnt .L048 + } + ;; + +.L042: + { .mfb + lfetch.nt1 [PREB], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p12) cmp.ne p3, p0 = 0, L + FMA f72 = f32, f49, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFD f40 = [AOFFSET], 1 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f104 = f32, f53, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f120 = f32, f55, f120 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFD f32 = [AOFFSET], 1 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f104 = f40, f61, f104 // A1 * B6 + nop __LINE__ + } + ;; + { 
.mfi + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + adds L = -1, L + } + { .mmb + nop __LINE__ + nop __LINE__ + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f120 = f40, f63, f120 // A1 * B8 + nop __LINE__ + } + { .mmb + nop __LINE__ + nop __LINE__ + br.cloop.sptk.few .L042 + } + ;; + +.L048: + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#ifdef LT + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET] + adds BOFFSET = -6 * SIZE, BOFFSET + ;; + { .mfi + FSUB f64 = f32, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f72 = f33, f72 + nop __LINE__ + } + ;; + { .mfi + FSUB f80 = f34, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f88 = f35, f88 + nop __LINE__ + } + ;; + { .mfi + FSUB f96 = f36, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f104 = f37, f104 + nop __LINE__ + } + ;; + { .mfi + FSUB f112 = f38, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f120 = f39, f120 + nop __LINE__ + } + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET] + adds AOFFSET = -6 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + FSUB f80 = f34, f80 + FSUB f88 = f35, f88 + FSUB f96 = f36, f96 + FSUB f104 = f37, f104 + FSUB f112 = f38, f112 + FSUB f120 = f39, f120 + ;; +#endif + +#ifdef LT + LDFD f32 = [AOFFSET] + ;; + { .mfi + FMPY f64 = f64, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f96 = f96, f32 + nop __LINE__ + } + ;; + { .mfi + FMPY f72 = f72, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f104 = f104, f32 + nop __LINE__ + } + ;; + { .mfi + FMPY f80 = f80, f32 + } + { .mfi + nop __LINE__ + FMPY f112 = f112, f32 + nop __LINE__ + } + ;; + { .mfi + FMPY f88 = f88, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f120 = f120, f32 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f64, SIZE + } + { .mfi + STFD [BOFFSET2] = f96, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f72, SIZE + } + { .mfi + STFD [BOFFSET2] = f104, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f80, SIZE + } + { .mfi + STFD [BOFFSET2] = f112, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f88, -3 * SIZE + } + { .mfi + STFD [BOFFSET2] = f120, -3 * SIZE + } + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f40 = [BOFFSET], 1 * SIZE + ;; + LDFPD f41, f42 = [BOFFSET], 2 * SIZE + ;; + LDFPD f43, f44 = [BOFFSET], 2 * SIZE + ;; + LDFPD f45, f46 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f47, f48 = [BOFFSET], 2 * SIZE + ;; + LDFPD f49, f50 = [BOFFSET], 2 * SIZE + ;; + LDFPD f51, f52 = [BOFFSET] + adds BOFFSET = 5 * SIZE, BOFFSET + ;; + LDFD f53 = [BOFFSET], 1 * SIZE + ;; + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + ;; + LDFPD f56, f57 = [BOFFSET] + adds BOFFSET = 6 * SIZE, BOFFSET + ;; + LDFPD f58, f59 = [BOFFSET], 2 * SIZE + ;; + LDFPD f60, f61 = [BOFFSET] + adds BOFFSET = 7 * SIZE, BOFFSET + ;; + LDFD f16 = [BOFFSET], 1 * SIZE + ;; + LDFPD f17, f18 = [BOFFSET] + adds BOFFSET = 8 * SIZE, BOFFSET + ;; + LDFPD f19, f20 = [BOFFSET] + adds BOFFSET = 9 * SIZE, BOFFSET + ;; + LDFD f21 = [BOFFSET] + adds BOFFSET = -63 * SIZE, BOFFSET + ;; + + FMPY f64 = f64, f32 + ;; + 
FNMA f72 = f64, f33, f72 + ;; + FNMA f80 = f64, f34, f80 + ;; + FNMA f88 = f64, f35, f88 + ;; + FNMA f96 = f64, f36, f96 + ;; + FNMA f104 = f64, f37, f104 + ;; + FNMA f112 = f64, f38, f112 + ;; + FNMA f120 = f64, f39, f120 + ;; + FMPY f72 = f72, f40 + ;; + FNMA f80 = f72, f41, f80 + ;; + FNMA f88 = f72, f42, f88 + ;; + FNMA f96 = f72, f43, f96 + ;; + FNMA f104 = f72, f44, f104 + ;; + FNMA f112 = f72, f45, f112 + ;; + FNMA f120 = f72, f46, f120 + ;; + FMPY f80 = f80, f47 + ;; + FNMA f88 = f80, f48, f88 + ;; + FNMA f96 = f80, f49, f96 + ;; + FNMA f104 = f80, f50, f104 + ;; + FNMA f112 = f80, f51, f112 + ;; + FNMA f120 = f80, f52, f120 + ;; + FMPY f88 = f88, f53 + ;; + FNMA f96 = f88, f54, f96 + ;; + FNMA f104 = f88, f55, f104 + ;; + FNMA f112 = f88, f56, f112 + ;; + FNMA f120 = f88, f57, f120 + ;; + FMPY f96 = f96, f58 + ;; + FNMA f104 = f96, f59, f104 + ;; + FNMA f112 = f96, f60, f112 + ;; + FNMA f120 = f96, f61, f120 + ;; + FMPY f104 = f104, f16 + ;; + FNMA f112 = f104, f17, f112 + ;; + FNMA f120 = f104, f18, f120 + ;; + FMPY f112 = f112, f19 + ;; + FNMA f120 = f112, f20, f120 + ;; + FMPY f120 = f120, f21 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f96, SIZE + ;; + STFD [AOFFSET] = f72, SIZE + STFD [AOFFSET2] = f104, SIZE + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f112, SIZE + ;; + STFD [AOFFSET] = f88, -3 * SIZE + STFD [AOFFSET2] = f120, - 3 * SIZE + ;; +#endif + + STFD [C1 ] = f64, SIZE + STFD [C2 ] = f72, SIZE + STFD [C3 ] = f80, SIZE + STFD [C4 ] = f88, SIZE + STFD [C5 ] = f96, SIZE + STFD [C6 ] = f104, SIZE + STFD [C7 ] = f112, SIZE + STFD [C8 ] = f120, SIZE + ;; + + mov f64 = f0 + mov f72 = f0 + mov f80 = f0 + mov f88 = f0 + mov f96 = f0 + mov f104 = f0 + mov f112 = f0 + mov f120 = f0 + ;; + sub L = K, KK + ;; + shladd L = L, BASE_SHIFT, r0 + ;; + add AOFFSET = L, AOFFSET + ;; + shladd BOFFSET = L, 3, BOFFSET + ;; +#ifdef LT + adds KK = 1, KK +#else + nop __LINE__ +#endif + ;; + mov L = KK + ;; + .align 8 + +.L049: + mov B = BOFFSET + +#ifdef RN + adds KK = 8, KK +#endif + ;; + + { .mmi + mov AOFFSET = A + } + ;; + { .mmb + nop __LINE__ + cmp.lt p6, p0 = 0, J + (p6) br.cond.dptk .L010 + } + ;; + .align 8 + +.L050: + { .mib + setf.d f64 = r0 + tbit.z p6, p0 = N, 2 + (p6) br.cond.dpnt .L090 + } + ;; + { .mfi + setf.d f72 = r0 + mov f80 = f0 + shr I = M, 3 + } + { .mfi + mov C1 = C // coffset1 = c + 0 * ldc + mov f88 = f0 +#ifdef LT + mov KK = OFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmf + cmp.eq p6, p7 = 0, I + mov AORIG = A + mov f65 = f0 + } + { .mmf + add C2 = LDC, C // coffset2 = c + 1 * ldc + shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc + mov f73 = f0 + } + ;; + { .mfi + shladd C = LDC, 2, C // coffset += 8 * ldc + mov f81 = f0 + mov L = KK + }{ .mfb + shladd C4 = LDC, 1, C2 + mov f89 = f0 + (p6) br.cond.dpnt .L060 + } + ;; + .align 16 + +.L052: + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + } + ;; + { .mmi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mfi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f66 = f0 + nop __LINE__ + } + { .mfi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f74 = f0 + nop __LINE__ + } + ;; + { .mmf + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + setf.d f82 = r0 + mov f90 = f0 + } + ;; + { .mmf + (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + setf.d f67 = r0 + mov f75 = f0 + } + { .mfi + setf.d f83 = r0 + mov f91 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mmf + (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + } + { .mfi + adds PREC = CPREFETCHSIZE * SIZE, C1 + } + 
;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f68 = r0 + mov f76 = f0 + } + { .mfi + setf.d f84 = r0 + mov f92 = f0 + adds L = 1, L + } + ;; + { .mmf + CPREFETCH [PREC], LDC + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f69 = r0 + mov f77 = f0 + } + { .mfi + setf.d f85 = r0 + mov f93 = f0 + adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET + } + ;; + { .mmf + CPREFETCH [PREC] + } + ;; + { .mfi + setf.d f70 = r0 + mov f78 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + setf.d f86 = r0 + mov f94 = f0 + shr L = L, 1 + } + ;; + { .mfi + setf.d f71 = r0 + adds L = -1, L + } + ;; + { .mfi + setf.d f87 = r0 + mov f79 = f0 + mov ar.lc = L + } + { .mfb + cmp.eq p6, p0 = -1, L + mov f95 = f0 + (p6) br.cond.dpnt .L058 + } + ;; + .align 8 + +.L053: + { .mfb + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + adds C9 = 4 * SIZE, C1 + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + adds C10 = 4 * SIZE, C2 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + adds C11 = 4 * SIZE, C3 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + adds C12 = 4 * SIZE, C4 + } + { .mfb + nop __LINE__ + FMA f89 = f33, f51, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f90 = f34, f51, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f91 = f35, f51, f91 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f76 = f36, f49, f76 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f84 = f36, f50, f84 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f92 = f36, f51, f92 // A5 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f77 = f37, f49, f77 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f85 = f37, f50, f85 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f93 = f37, f51, f93 // A6 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f78 = f38, f49, f78 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f86 = f38, f50, f86 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f94 = f38, f51, f94 // A7 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f79 = f39, f49, f79 // A8 * 
B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f87 = f39, f50, f87 // A8 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f95 = f39, f51, f95 // A8 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f90 = f42, f59, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f91 = f43, f59, f91 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f76 = f44, f57, f76 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f84 = f44, f58, f84 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f92 = f44, f59, f92 // A5 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f77 = f45, f57, f77 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f85 = f45, f58, f85 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f93 = f45, f59, f93 // A6 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f78 = f46, f57, f78 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f86 = f46, f58, f86 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f94 = f46, f59, f94 // A7 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f79 = f47, f57, f79 // A8 * B2 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f87 = f47, f58, f87 // A8 * B3 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f95 = f47, f59, f95 // A8 * B4 + br.cloop.sptk.few .L053 + } + ;; + .align 8 + +.L058: + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#ifdef LT + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [BOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [BOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [BOFFSET], 
2 * SIZE + ;; + LDFPD f46, f47 = [BOFFSET], 2 * SIZE + ;; + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + ;; + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + ;; + LDFPD f52, f53 = [BOFFSET], 2 * SIZE + ;; + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + ;; + LDFPD f56, f57 = [BOFFSET], 2 * SIZE + ;; + LDFPD f58, f59 = [BOFFSET], 2 * SIZE + ;; + LDFPD f60, f61 = [BOFFSET], 2 * SIZE + ;; + LDFPD f62, f63 = [BOFFSET] + adds BOFFSET = -30 * SIZE, BOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + FSUB f80 = f34, f80 + FSUB f88 = f35, f88 + + FSUB f65 = f36, f65 + FSUB f73 = f37, f73 + FSUB f81 = f38, f81 + FSUB f89 = f39, f89 + + FSUB f66 = f40, f66 + FSUB f74 = f41, f74 + FSUB f82 = f42, f82 + FSUB f90 = f43, f90 + + FSUB f67 = f44, f67 + FSUB f75 = f45, f75 + FSUB f83 = f46, f83 + FSUB f91 = f47, f91 + + FSUB f68 = f48, f68 + FSUB f76 = f49, f76 + FSUB f84 = f50, f84 + FSUB f92 = f51, f92 + + FSUB f69 = f52, f69 + FSUB f77 = f53, f77 + FSUB f85 = f54, f85 + FSUB f93 = f55, f93 + + FSUB f70 = f56, f70 + FSUB f78 = f57, f78 + FSUB f86 = f58, f86 + FSUB f94 = f59, f94 + + FSUB f71 = f60, f71 + FSUB f79 = f61, f79 + FSUB f87 = f62, f87 + FSUB f95 = f63, f95 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [AOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [AOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [AOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [AOFFSET], 2 * SIZE + ;; + LDFPD f48, f49 = [AOFFSET], 2 * SIZE + ;; + LDFPD f50, f51 = [AOFFSET], 2 * SIZE + ;; + LDFPD f52, f53 = [AOFFSET], 2 * SIZE + ;; + LDFPD f54, f55 = [AOFFSET], 2 * SIZE + ;; + LDFPD f56, f57 = [AOFFSET], 2 * SIZE + ;; + LDFPD f58, f59 = [AOFFSET], 2 * SIZE + ;; + LDFPD f60, f61 = [AOFFSET], 2 * SIZE + ;; + LDFPD f62, f63 = [AOFFSET] + adds AOFFSET = -30 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f66 = f34, f66 + FSUB f67 = f35, f67 + FSUB f68 = f36, f68 + FSUB f69 = f37, f69 + FSUB f70 = f38, f70 + FSUB f71 = f39, f71 + ;; + FSUB f72 = f40, f72 + FSUB f73 = f41, f73 + FSUB f74 = f42, f74 + FSUB f75 = f43, f75 + FSUB f76 = f44, f76 + FSUB f77 = f45, f77 + FSUB f78 = f46, f78 + FSUB f79 = f47, f79 + ;; + FSUB f80 = f48, f80 + FSUB f81 = f49, f81 + FSUB f82 = f50, f82 + FSUB f83 = f51, f83 + FSUB f84 = f52, f84 + FSUB f85 = f53, f85 + FSUB f86 = f54, f86 + FSUB f87 = f55, f87 + + FSUB f88 = f56, f88 + FSUB f89 = f57, f89 + FSUB f90 = f58, f90 + FSUB f91 = f59, f91 + FSUB f92 = f60, f92 + FSUB f93 = f61, f93 + FSUB f94 = f62, f94 + FSUB f95 = f63, f95 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f40 = [AOFFSET], 1 * SIZE + ;; + LDFPD f41, f42 = [AOFFSET], 2 * SIZE + ;; + LDFPD f43, f44 = [AOFFSET], 2 * SIZE + ;; + LDFPD f45, f46 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f47, f48 = [AOFFSET], 2 * SIZE + ;; + LDFPD f49, f50 = [AOFFSET], 2 * SIZE + ;; + LDFPD f51, f52 = [AOFFSET] + adds AOFFSET = 5 * SIZE, AOFFSET + ;; + LDFD f53 = [AOFFSET], 1 * SIZE + ;; + LDFPD f54, f55 = [AOFFSET], 2 * SIZE + ;; + LDFPD f56, f57 = [AOFFSET] + adds AOFFSET = 6 * SIZE, AOFFSET + ;; + LDFPD f58, f59 = [AOFFSET], 2 * SIZE + ;; + LDFPD f60, f61 = [AOFFSET] + adds AOFFSET = 7 * SIZE, AOFFSET + ;; + LDFD f16 = [AOFFSET], 1 * SIZE + ;; + LDFPD f17, f18 = [AOFFSET] + adds AOFFSET = 8 * SIZE, 
AOFFSET + ;; + LDFPD f19, f20 = [AOFFSET] + adds AOFFSET = 9 * SIZE, AOFFSET + ;; + LDFD f21 = [AOFFSET] + adds AOFFSET = -63 * SIZE, AOFFSET + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + FMPY f80 = f80, f32 + FMPY f88 = f88, f32 + ;; + FNMA f65 = f64, f33, f65 + FNMA f73 = f72, f33, f73 + FNMA f81 = f80, f33, f81 + FNMA f89 = f88, f33, f89 + ;; + FNMA f66 = f64, f34, f66 + FNMA f74 = f72, f34, f74 + FNMA f82 = f80, f34, f82 + FNMA f90 = f88, f34, f90 + ;; + FNMA f67 = f64, f35, f67 + FNMA f75 = f72, f35, f75 + FNMA f83 = f80, f35, f83 + FNMA f91 = f88, f35, f91 + ;; + FNMA f68 = f64, f36, f68 + FNMA f76 = f72, f36, f76 + FNMA f84 = f80, f36, f84 + FNMA f92 = f88, f36, f92 + ;; + FNMA f69 = f64, f37, f69 + FNMA f77 = f72, f37, f77 + FNMA f85 = f80, f37, f85 + FNMA f93 = f88, f37, f93 + ;; + FNMA f70 = f64, f38, f70 + FNMA f78 = f72, f38, f78 + FNMA f86 = f80, f38, f86 + FNMA f94 = f88, f38, f94 + ;; + FNMA f71 = f64, f39, f71 + FNMA f79 = f72, f39, f79 + FNMA f87 = f80, f39, f87 + FNMA f95 = f88, f39, f95 + ;; + FMPY f65 = f65, f40 + FMPY f73 = f73, f40 + FMPY f81 = f81, f40 + FMPY f89 = f89, f40 + ;; + FNMA f66 = f65, f41, f66 + FNMA f74 = f73, f41, f74 + FNMA f82 = f81, f41, f82 + FNMA f90 = f89, f41, f90 + ;; + FNMA f67 = f65, f42, f67 + FNMA f75 = f73, f42, f75 + FNMA f83 = f81, f42, f83 + FNMA f91 = f89, f42, f91 + ;; + FNMA f68 = f65, f43, f68 + FNMA f76 = f73, f43, f76 + FNMA f84 = f81, f43, f84 + FNMA f92 = f89, f43, f92 + ;; + FNMA f69 = f65, f44, f69 + FNMA f77 = f73, f44, f77 + FNMA f85 = f81, f44, f85 + FNMA f93 = f89, f44, f93 + ;; + FNMA f70 = f65, f45, f70 + FNMA f78 = f73, f45, f78 + FNMA f86 = f81, f45, f86 + FNMA f94 = f89, f45, f94 + ;; + FNMA f71 = f65, f46, f71 + FNMA f79 = f73, f46, f79 + FNMA f87 = f81, f46, f87 + FNMA f95 = f89, f46, f95 + ;; + FMPY f66 = f66, f47 + FMPY f74 = f74, f47 + FMPY f82 = f82, f47 + FMPY f90 = f90, f47 + ;; + FNMA f67 = f66, f48, f67 + FNMA f75 = f74, f48, f75 + FNMA f83 = f82, f48, f83 + FNMA f91 = f90, f48, f91 + ;; + FNMA f68 = f66, f49, f68 + FNMA f76 = f74, f49, f76 + FNMA f84 = f82, f49, f84 + FNMA f92 = f90, f49, f92 + ;; + FNMA f69 = f66, f50, f69 + FNMA f77 = f74, f50, f77 + FNMA f85 = f82, f50, f85 + FNMA f93 = f90, f50, f93 + ;; + FNMA f70 = f66, f51, f70 + FNMA f78 = f74, f51, f78 + FNMA f86 = f82, f51, f86 + FNMA f94 = f90, f51, f94 + ;; + FNMA f71 = f66, f52, f71 + FNMA f79 = f74, f52, f79 + FNMA f87 = f82, f52, f87 + FNMA f95 = f90, f52, f95 + ;; + FMPY f67 = f67, f53 + FMPY f75 = f75, f53 + FMPY f83 = f83, f53 + FMPY f91 = f91, f53 + ;; + FNMA f68 = f67, f54, f68 + FNMA f76 = f75, f54, f76 + FNMA f84 = f83, f54, f84 + FNMA f92 = f91, f54, f92 + ;; + FNMA f69 = f67, f55, f69 + FNMA f77 = f75, f55, f77 + FNMA f85 = f83, f55, f85 + FNMA f93 = f91, f55, f93 + ;; + FNMA f70 = f67, f56, f70 + FNMA f78 = f75, f56, f78 + FNMA f86 = f83, f56, f86 + FNMA f94 = f91, f56, f94 + ;; + FNMA f71 = f67, f57, f71 + FNMA f79 = f75, f57, f79 + FNMA f87 = f83, f57, f87 + FNMA f95 = f91, f57, f95 + ;; + FMPY f68 = f68, f58 + FMPY f76 = f76, f58 + FMPY f84 = f84, f58 + FMPY f92 = f92, f58 + ;; + FNMA f69 = f68, f59, f69 + FNMA f77 = f76, f59, f77 + FNMA f85 = f84, f59, f85 + FNMA f93 = f92, f59, f93 + ;; + FNMA f70 = f68, f60, f70 + FNMA f78 = f76, f60, f78 + FNMA f86 = f84, f60, f86 + FNMA f94 = f92, f60, f94 + ;; + FNMA f71 = f68, f61, f71 + FNMA f79 = f76, f61, f79 + FNMA f87 = f84, f61, f87 + FNMA f95 = f92, f61, f95 + ;; + FMPY f69 = f69, f16 + FMPY f77 = f77, f16 + FMPY f85 = f85, f16 + FMPY f93 = f93, f16 + ;; + FNMA f70 = f69, f17, 
f70 + FNMA f78 = f77, f17, f78 + FNMA f86 = f85, f17, f86 + FNMA f94 = f93, f17, f94 + ;; + FNMA f71 = f69, f18, f71 + FNMA f79 = f77, f18, f79 + FNMA f87 = f85, f18, f87 + FNMA f95 = f93, f18, f95 + ;; + FMPY f70 = f70, f19 + FMPY f78 = f78, f19 + FMPY f86 = f86, f19 + FMPY f94 = f94, f19 + ;; + FNMA f71 = f70, f20, f71 + FNMA f79 = f78, f20, f79 + FNMA f87 = f86, f20, f87 + FNMA f95 = f94, f20, f95 + ;; + FMPY f71 = f71, f21 + FMPY f79 = f79, f21 + FMPY f87 = f87, f21 + FMPY f95 = f95, f21 + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f65, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f73, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f81, SIZE + ;; + STFD [BOFFSET] = f88, 5 * SIZE + STFD [BOFFSET2] = f89, 5 * SIZE + ;; + STFD [BOFFSET] = f66, SIZE + STFD [BOFFSET2] = f67, SIZE + ;; + STFD [BOFFSET] = f74, SIZE + STFD [BOFFSET2] = f75, SIZE + ;; + STFD [BOFFSET] = f82, SIZE + STFD [BOFFSET2] = f83, SIZE + ;; + STFD [BOFFSET] = f90, 5 * SIZE + STFD [BOFFSET2] = f91, 5 * SIZE + ;; + STFD [BOFFSET] = f68, SIZE + STFD [BOFFSET2] = f69, SIZE + ;; + STFD [BOFFSET] = f76, SIZE + STFD [BOFFSET2] = f77, SIZE + ;; + STFD [BOFFSET] = f84, SIZE + STFD [BOFFSET2] = f85, SIZE + ;; + STFD [BOFFSET] = f92, 5 * SIZE + STFD [BOFFSET2] = f93, 5 * SIZE + ;; + STFD [BOFFSET] = f70, SIZE + STFD [BOFFSET2] = f71, SIZE + ;; + STFD [BOFFSET] = f78, SIZE + STFD [BOFFSET2] = f79, SIZE + ;; + STFD [BOFFSET] = f86, SIZE + STFD [BOFFSET2] = f87, SIZE + ;; + STFD [BOFFSET] = f94 + STFD [BOFFSET2] = f95 + adds C9 = 4 * SIZE, C1 + adds BOFFSET = - 27 * SIZE, BOFFSET + adds BOFFSET2 = - 27 * SIZE, BOFFSET2 + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f36 = [BOFFSET], 1 * SIZE + ;; + LDFPD f37, f38 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f39, f40 = [BOFFSET] + adds BOFFSET = 5 * SIZE, BOFFSET + ;; + LDFD f41 = [BOFFSET], -15 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f68 = f68, f32 + FMPY f65 = f65, f32 + FMPY f69 = f69, f32 + FMPY f66 = f66, f32 + FMPY f70 = f70, f32 + FMPY f67 = f67, f32 + FMPY f71 = f71, f32 + ;; + FNMA f72 = f64, f33, f72 + FNMA f76 = f68, f33, f76 + FNMA f73 = f65, f33, f73 + FNMA f77 = f69, f33, f77 + FNMA f74 = f66, f33, f74 + FNMA f78 = f70, f33, f78 + FNMA f75 = f67, f33, f75 + FNMA f79 = f71, f33, f79 + ;; + FNMA f80 = f64, f34, f80 + FNMA f84 = f68, f34, f84 + FNMA f81 = f65, f34, f81 + FNMA f85 = f69, f34, f85 + FNMA f82 = f66, f34, f82 + FNMA f86 = f70, f34, f86 + FNMA f83 = f67, f34, f83 + FNMA f87 = f71, f34, f87 + ;; + FNMA f88 = f64, f35, f88 + FNMA f92 = f68, f35, f92 + FNMA f89 = f65, f35, f89 + FNMA f93 = f69, f35, f93 + FNMA f90 = f66, f35, f90 + FNMA f94 = f70, f35, f94 + FNMA f91 = f67, f35, f91 + FNMA f95 = f71, f35, f95 + ;; + FMPY f72 = f72, f36 + FMPY f76 = f76, f36 + FMPY f73 = f73, f36 + FMPY f77 = f77, f36 + FMPY f74 = f74, f36 + FMPY f78 = f78, f36 + FMPY f75 = f75, f36 + FMPY f79 = f79, f36 + ;; + FNMA f80 = f72, f37, f80 + FNMA f84 = f76, f37, f84 + FNMA f81 = f73, f37, f81 + FNMA f85 = f77, f37, f85 + FNMA f82 = f74, f37, f82 + FNMA f86 = f78, f37, f86 + FNMA f83 = f75, f37, f83 + FNMA f87 = f79, f37, f87 + ;; + FNMA f88 = f72, f38, f88 + FNMA f92 = f76, f38, f92 + FNMA f89 = f73, f38, f89 + FNMA f93 = f77, f38, f93 + FNMA f90 = f74, f38, f90 + FNMA f94 = f78, f38, f94 + FNMA f91 = f75, f38, f91 + FNMA f95 = f79, f38, f95 + ;; + FMPY f80 = f80, f39 + FMPY f84 = f84, f39 + FMPY f81 = f81, f39 + FMPY f85 = f85, f39 + FMPY 
f82 = f82, f39 + FMPY f86 = f86, f39 + FMPY f83 = f83, f39 + FMPY f87 = f87, f39 + ;; + FNMA f88 = f80, f40, f88 + FNMA f92 = f84, f40, f92 + FNMA f89 = f81, f40, f89 + FNMA f93 = f85, f40, f93 + FNMA f90 = f82, f40, f90 + FNMA f94 = f86, f40, f94 + FNMA f91 = f83, f40, f91 + FNMA f95 = f87, f40, f95 + ;; + FMPY f88 = f88, f41 + FMPY f92 = f92, f41 + FMPY f89 = f89, f41 + FMPY f93 = f93, f41 + FMPY f90 = f90, f41 + FMPY f94 = f94, f41 + FMPY f91 = f91, f41 + FMPY f95 = f95, f41 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f68, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f69, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f70, SIZE + ;; + STFD [AOFFSET] = f67, 5 * SIZE + STFD [AOFFSET2] = f71, 5 * SIZE + ;; + STFD [AOFFSET] = f72, SIZE + STFD [AOFFSET2] = f76, SIZE + ;; + STFD [AOFFSET] = f73, SIZE + STFD [AOFFSET2] = f77, SIZE + ;; + STFD [AOFFSET] = f74, SIZE + STFD [AOFFSET2] = f78, SIZE + ;; + STFD [AOFFSET] = f75, 5 * SIZE + STFD [AOFFSET2] = f79, 5 * SIZE + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f84, SIZE + ;; + STFD [AOFFSET] = f81, SIZE + STFD [AOFFSET2] = f85, SIZE + ;; + STFD [AOFFSET] = f82, SIZE + STFD [AOFFSET2] = f86, SIZE + ;; + STFD [AOFFSET] = f83, 5 * SIZE + STFD [AOFFSET2] = f87, 5 * SIZE + ;; + STFD [AOFFSET] = f88, SIZE + STFD [AOFFSET2] = f92, SIZE + ;; + STFD [AOFFSET] = f89, SIZE + STFD [AOFFSET2] = f93, SIZE + ;; + STFD [AOFFSET] = f90, SIZE + STFD [AOFFSET2] = f94, SIZE + ;; + STFD [AOFFSET] = f91, -27 * SIZE + STFD [AOFFSET2] = f95, -27 * SIZE + ;; +#endif + + adds C9 = 4 * SIZE, C1 + ;; + + { .mmf + STFD [C1 ] = f64, SIZE + STFD [C9 ] = f68, SIZE + mov f64 = f0 + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + STFD [C9 ] = f69, SIZE + adds C10 = 4 * SIZE, C2 + } + ;; + { .mmi + STFD [C1 ] = f66, SIZE + STFD [C9 ] = f70, SIZE + } + ;; + { .mmi + STFD [C1 ] = f67, 5 * SIZE + STFD [C9 ] = f71 + adds C11 = 4 * SIZE, C3 + } + ;; + { .mmf + STFD [C2 ] = f72, SIZE + STFD [C10] = f76, SIZE + mov f72 = f0 + } + ;; + { .mmi + STFD [C2 ] = f73, SIZE + STFD [C10] = f77, SIZE + } + ;; + { .mmi + STFD [C2 ] = f74, SIZE + STFD [C10] = f78, SIZE + adds C12 = 4 * SIZE, C4 + } + ;; + { .mmi + STFD [C2 ] = f75, 5 * SIZE + STFD [C10] = f79 + } + ;; + { .mmf + STFD [C3 ] = f80, SIZE + STFD [C11] = f84, SIZE + } + ;; + { .mmi + STFD [C3 ] = f81, SIZE + STFD [C11] = f85, SIZE + } + ;; + { .mmi + STFD [C3 ] = f82, SIZE + STFD [C11] = f86, SIZE + } + ;; + { .mmi + STFD [C3 ] = f83, 5 * SIZE + STFD [C11] = f87 + } + ;; + { .mmf + STFD [C4 ] = f88, SIZE + STFD [C12] = f92, SIZE + } + ;; + { .mmi + STFD [C4 ] = f89, SIZE + STFD [C12] = f93, SIZE + } + ;; + { .mmi + STFD [C4 ] = f90, SIZE + STFD [C12] = f94, SIZE + + } + ;; + { .mmi + STFD [C4 ] = f91, 5 * SIZE + STFD [C12] = f95 + cmp.ne p6, p0 = 1, I + } + ;; + adds I = -1, I + ;; + { .mmi + sub L = K, KK + } + ;; + { .mmi + shladd L = L, BASE_SHIFT, r0 + } + ;; + ;; + { .mmi + shladd AOFFSET = L, 3, AOFFSET + } + ;; + { .mmi + shladd BOFFSET = L, 2, BOFFSET + } + ;; + { .mmi +#ifdef LT + adds KK = 8, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi + mov L = KK + } + ;; + mov f64 = f0 + mov f72 = f0 + mov f80 = f0 + mov f88 = f0 + mov f65 = f0 + mov f73 = f0 + mov f81 = f0 + mov f89 = f0 + + { .mmb + (p6) br.cond.dptk .L052 + } + ;; + + .align 8 + +.L060: + tbit.z p6, p7 = M, 2 + (p6) br.cond.dptk .L070 + ;; + + { .mib + mov L = KK + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + } + ;; + { .mmf + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f65 = f0 + } + ;; + { 
.mfi + adds L = 1, L + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + } + { .mfi + shr L = L, 1 + } + ;; + { .mfi + adds L = -1, L + } + ;; + { .mfi + cmp.eq p6, p0 = -1, L + } + ;; + { .mmf + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + } + { .mfi + mov ar.lc = L + } + ;; + + mov f66 = f0 + mov f67 = f0 + mov f74 = f0 + mov f75 = f0 + mov f82 = f0 + mov f83 = f0 + mov f90 = f0 + mov f91 = f0 + ;; + { .mmf + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + } + { .mfb + (p6) br.cond.dpnt .L068 + } + ;; + .align 8 + +.L062: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + (p5) adds C9 = 2 * SIZE, C1 + } + { .mfi + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + (p5) adds C10 = 2 * SIZE, C2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + (p5) adds C11 = 2 * SIZE, C3 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + (p5) adds C12 = 2 * SIZE, C4 + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f89 = f33, f51, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f90 = f34, f51, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + } + { .mfb + nop __LINE__ + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f91 = f35, f51, f91 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f90 = f42, f59, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA 
f83 = f43, f58, f83 // A4 * B3 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f91 = f43, f59, f91 // A4 * B4 + br.cloop.sptk.few .L062 + } + ;; + .align 8 + +.L068: + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#ifdef LT + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [BOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [BOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [BOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [BOFFSET] + adds BOFFSET = -14 * SIZE, BOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + FSUB f80 = f34, f80 + FSUB f88 = f35, f88 + ;; + FSUB f65 = f36, f65 + FSUB f73 = f37, f73 + FSUB f81 = f38, f81 + FSUB f89 = f39, f89 + ;; + FSUB f66 = f40, f66 + FSUB f74 = f41, f74 + FSUB f82 = f42, f82 + FSUB f90 = f43, f90 + ;; + FSUB f67 = f44, f67 + FSUB f75 = f45, f75 + FSUB f83 = f46, f83 + FSUB f91 = f47, f91 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [AOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [AOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [AOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [AOFFSET] + adds AOFFSET = -14 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f66 = f34, f66 + FSUB f67 = f35, f67 + + FSUB f72 = f36, f72 + FSUB f73 = f37, f73 + FSUB f74 = f38, f74 + FSUB f75 = f39, f75 + + FSUB f80 = f40, f80 + FSUB f81 = f41, f81 + FSUB f82 = f42, f82 + FSUB f83 = f43, f83 + + FSUB f88 = f44, f88 + FSUB f89 = f45, f89 + FSUB f90 = f46, f90 + FSUB f91 = f47, f91 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f36 = [AOFFSET], 1 * SIZE + ;; + LDFPD f37, f38 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f39, f40 = [AOFFSET] + adds AOFFSET = 5 * SIZE, AOFFSET + ;; + LDFD f41 = [AOFFSET], -15 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + FMPY f80 = f80, f32 + FMPY f88 = f88, f32 + ;; + FNMA f65 = f64, f33, f65 + FNMA f73 = f72, f33, f73 + FNMA f81 = f80, f33, f81 + FNMA f89 = f88, f33, f89 + ;; + FNMA f66 = f64, f34, f66 + FNMA f74 = f72, f34, f74 + FNMA f82 = f80, f34, f82 + FNMA f90 = f88, f34, f90 + ;; + FNMA f67 = f64, f35, f67 + FNMA f75 = f72, f35, f75 + FNMA f83 = f80, f35, f83 + FNMA f91 = f88, f35, f91 + ;; + FMPY f65 = f65, f36 + FMPY f73 = f73, f36 + FMPY f81 = f81, f36 + FMPY f89 = f89, f36 + ;; + FNMA f66 = f65, f37, f66 + FNMA f74 = f73, f37, f74 + FNMA f82 = f81, f37, f82 + FNMA f90 = f89, f37, f90 + ;; + FNMA f67 = f65, f38, f67 + FNMA f75 = f73, f38, f75 + FNMA f83 = f81, f38, f83 + FNMA f91 = f89, f38, f91 + ;; + FMPY f66 = f66, f39 + FMPY f74 = f74, f39 + FMPY f82 = f82, f39 + FMPY f90 = f90, f39 + ;; + FNMA f67 = f66, f40, f67 + FNMA f75 = f74, f40, f75 + FNMA f83 = f82, f40, f83 + FNMA f91 = f90, f40, f91 + ;; + FMPY f67 = f67, f41 + FMPY f75 = f75, f41 + FMPY f83 = f83, f41 + FMPY f91 = f91, f41 + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f65, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f73, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f81, SIZE + ;; + STFD [BOFFSET] = f88, 5 * SIZE + STFD [BOFFSET2] = f89, 5 * SIZE + ;; + STFD [BOFFSET] = f66, SIZE + STFD [BOFFSET2] = f67, SIZE + ;; + STFD [BOFFSET] = f74, SIZE + STFD [BOFFSET2] = f75, SIZE + 
;; + STFD [BOFFSET] = f82, SIZE + STFD [BOFFSET2] = f83, SIZE + ;; + STFD [BOFFSET] = f90, -11 * SIZE + STFD [BOFFSET2] = f91, -11 * SIZE + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f36 = [BOFFSET], 1 * SIZE + ;; + LDFPD f37, f38 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f39, f40 = [BOFFSET] + adds BOFFSET = 5 * SIZE, BOFFSET + ;; + LDFD f41 = [BOFFSET], -15 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + FMPY f66 = f66, f32 + FMPY f67 = f67, f32 + ;; + FNMA f72 = f64, f33, f72 + FNMA f73 = f65, f33, f73 + FNMA f74 = f66, f33, f74 + FNMA f75 = f67, f33, f75 + ;; + FNMA f80 = f64, f34, f80 + FNMA f81 = f65, f34, f81 + FNMA f82 = f66, f34, f82 + FNMA f83 = f67, f34, f83 + ;; + FNMA f88 = f64, f35, f88 + FNMA f89 = f65, f35, f89 + FNMA f90 = f66, f35, f90 + FNMA f91 = f67, f35, f91 + ;; + FMPY f72 = f72, f36 + FMPY f73 = f73, f36 + FMPY f74 = f74, f36 + FMPY f75 = f75, f36 + ;; + FNMA f80 = f72, f37, f80 + FNMA f81 = f73, f37, f81 + FNMA f82 = f74, f37, f82 + FNMA f83 = f75, f37, f83 + ;; + FNMA f88 = f72, f38, f88 + FNMA f89 = f73, f38, f89 + FNMA f90 = f74, f38, f90 + FNMA f91 = f75, f38, f91 + ;; + FMPY f80 = f80, f39 + FMPY f81 = f81, f39 + FMPY f82 = f82, f39 + FMPY f83 = f83, f39 + ;; + FNMA f88 = f80, f40, f88 + FNMA f89 = f81, f40, f89 + FNMA f90 = f82, f40, f90 + FNMA f91 = f83, f40, f91 + ;; + FMPY f88 = f88, f41 + FMPY f89 = f89, f41 + FMPY f90 = f90, f41 + FMPY f91 = f91, f41 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f72, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f73, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f74, SIZE + ;; + STFD [AOFFSET] = f67, 5 * SIZE + STFD [AOFFSET2] = f75, 5 * SIZE + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f88, SIZE + ;; + STFD [AOFFSET] = f81, SIZE + STFD [AOFFSET2] = f89, SIZE + ;; + STFD [AOFFSET] = f82, SIZE + STFD [AOFFSET2] = f90, SIZE + ;; + STFD [AOFFSET] = f83, -11 * SIZE + STFD [AOFFSET2] = f91, -11 * SIZE + ;; +#endif + + { .mmf + STFD [C1 ] = f64, SIZE + mov f64 = f0 + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + } + ;; + { .mmi + STFD [C1 ] = f66, SIZE + } + ;; + { .mmi + STFD [C1 ] = f67, SIZE + } + ;; + { .mmf + STFD [C2 ] = f72, SIZE + mov f72 = f0 + } + ;; + { .mmi + STFD [C2 ] = f73, SIZE + } + ;; + { .mmi + STFD [C2 ] = f74, SIZE + } + ;; + { .mmi + STFD [C2 ] = f75, SIZE + } + ;; + { .mmf + STFD [C3 ] = f80, SIZE + mov f80 = f0 + } + ;; + { .mmi + STFD [C3 ] = f81, SIZE + } + ;; + { .mmi + STFD [C3 ] = f82, SIZE + } + ;; + { .mmi + STFD [C3 ] = f83, SIZE + } + ;; + { .mmf + STFD [C4 ] = f88, SIZE + mov f88 = f0 + } + ;; + { .mmi + STFD [C4 ] = f89, SIZE + } + ;; + { .mmi + STFD [C4 ] = f90, SIZE + } + ;; + { .mmi + STFD [C4 ] = f91, SIZE + nop __LINE__ + } + ;; + mov f65 = f0 + ;; + mov f73 = f0 + ;; + { .mmi + sub L = K, KK + } + ;; + { .mmf + mov f81 = f0 + } + ;; + { .mmi + shladd L = L, BASE_SHIFT, r0 + } + ;; + { .mmi + shladd AOFFSET = L, 2, AOFFSET + } + ;; + { .mmi + shladd BOFFSET = L, 2, BOFFSET + } + ;; + { .mmf + mov f89 = f0 + } + ;; + { .mmi +#ifdef LT + adds KK = 4, KK +#else + nop __LINE__ +#endif + } + ;; + .align 8 + +.L070: + tbit.z p6,p7 = M, 1 + (p6) br.cond.dptk .L080 + ;; + + { .mib + mov L = KK + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + } + ;; + { .mmf + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + setf.d f73 = r0 + mov f65 = f0 + } + ;; + { .mfi + mov f81 = f0 + adds L = 1, L + } + { .mfi + adds PREA 
= (PREFETCHSIZE + 8) * SIZE, AOFFSET + mov f89 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + } + { .mfi + shr L = L, 1 + } + ;; + { .mmf + adds L = -1, L + } + ;; + { .mmf + cmp.eq p6, p0 = -1, L + } + ;; + { .mib + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov ar.lc = L + (p6) br.cond.dpnt .L078 + } + ;; + .align 8 + +.L072: + { .mfb + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f89 = f33, f51, f89 // A2 * B4 + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + br.cloop.sptk.few .L072 + } + ;; +.L078: + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#ifdef LT + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET] + adds BOFFSET = -6 * SIZE, BOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + FSUB f80 = f34, f80 + FSUB f88 = f35, f88 + FSUB f65 = f36, f65 + FSUB f73 = f37, f73 + FSUB f81 = f38, f81 + FSUB f89 = f39, f89 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET] + adds AOFFSET = -6 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + + FSUB f72 = f34, f72 + FSUB f73 = f35, f73 + + FSUB f80 = f36, f80 + FSUB f81 = f37, f81 + + FSUB f88 = f38, f88 + FSUB f89 = f39, f89 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f34 = [AOFFSET], - 3 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + FMPY f80 = f80, f32 + FMPY f88 = f88, f32 + ;; + FNMA f65 = f64, f33, f65 + FNMA f73 = f72, f33, f73 + FNMA f81 = f80, f33, f81 + FNMA f89 = f88, f33, f89 + ;; + FMPY f65 = f65, f34 + FMPY f73 = f73, f34 + FMPY f81 = f81, f34 + FMPY f89 = f89, f34 + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f65, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f73, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f81, SIZE + ;; + STFD [BOFFSET] = f88, -3 * SIZE + STFD [BOFFSET2] = f89, -3 * SIZE + ;; +#endif + +#ifdef RN + 
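+// (annotation, not in the original GotoBLAS source) RN path: the loads below pick up the ten entries of the 4x4 upper-triangular diagonal block from the packed B panel, and the FMPY/FNMA chain forward-substitutes the accumulated 2x4 tile through it; the diagonal entries appear to be stored pre-inverted, so FMPY stands in for a divide.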
LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f36 = [BOFFSET], 1 * SIZE + ;; + LDFPD f37, f38 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f39, f40 = [BOFFSET] + adds BOFFSET = 5 * SIZE, BOFFSET + ;; + LDFD f41 = [BOFFSET], -15 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + ;; + FNMA f72 = f64, f33, f72 + FNMA f73 = f65, f33, f73 + ;; + FNMA f80 = f64, f34, f80 + FNMA f81 = f65, f34, f81 + ;; + FNMA f88 = f64, f35, f88 + FNMA f89 = f65, f35, f89 + ;; + FMPY f72 = f72, f36 + FMPY f73 = f73, f36 + ;; + FNMA f80 = f72, f37, f80 + FNMA f81 = f73, f37, f81 + ;; + FNMA f88 = f72, f38, f88 + FNMA f89 = f73, f38, f89 + ;; + FMPY f80 = f80, f39 + FMPY f81 = f81, f39 + ;; + FNMA f88 = f80, f40, f88 + FNMA f89 = f81, f40, f89 + ;; + FMPY f88 = f88, f41 + FMPY f89 = f89, f41 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f80, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f81, SIZE + ;; + STFD [AOFFSET] = f72, SIZE + STFD [AOFFSET2] = f88, SIZE + ;; + STFD [AOFFSET] = f73, -3 * SIZE + STFD [AOFFSET2] = f89, -3 * SIZE + ;; +#endif + + STFD [C1 ] = f64, SIZE + mov f64 = f0 + ;; + STFD [C1 ] = f65, SIZE + ;; + STFD [C2 ] = f72, SIZE + mov f72 = f0 + ;; + STFD [C2 ] = f73, SIZE + ;; + STFD [C3 ] = f80, SIZE + mov f80 = f0 + ;; + STFD [C3 ] = f81, SIZE + ;; + STFD [C4 ] = f88, SIZE + mov f88 = f0 + ;; + STFD [C4 ] = f89, SIZE + ;; + mov f96 = f0 + ;; + mov f104 = f0 + ;; + sub L = K, KK + ;; + mov f112 = f0 + ;; + { .mmi + shladd L = L, BASE_SHIFT, r0 + } + ;; + { .mmi + shladd AOFFSET = L, 1, AOFFSET + } + ;; + { .mmi + shladd BOFFSET = L, 2, BOFFSET + } + ;; + { .mmf + mov f120 = f0 + } + ;; + { .mmi +#ifdef LT + adds KK = 2, KK +#else + nop __LINE__ +#endif + } + ;; + .align 8 + +.L080: + tbit.z p6,p7 = M, 0 + (p6) br.cond.dptk .L089 + + { .mib + mov L = KK + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + } + ;; + { .mmf + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + } + ;; + { .mmi + adds L = 1, L + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mii + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + adds L = -1, L + } + ;; + { .mmi + cmp.eq p6, p0 = -1, L + } + ;; + { .mib + (p7) LDFD f32 = [AOFFSET], 1 * SIZE + mov ar.lc = L + (p6) br.cond.dpnt .L088 + } + ;; + +.L082: + { .mfb + cmp.ne p4, p5 = 0, L + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + (p12) cmp.ne p3, p0 = 0, L + FMA f72 = f32, f49, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + (p3) LDFD f40 = [AOFFSET], 1 * SIZE + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mmf + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p4) LDFD f32 = [AOFFSET], 1 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + } + ;; + { .mib + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + nop __LINE__ + nop __LINE__ + } + { .mmb + nop __LINE__ + adds L = -1, L + br.cloop.sptk.few .L082 + } + ;; + +.L088: + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#ifdef LT + LDFPD f32, f33 = [BOFFSET], 
2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET] + adds BOFFSET = -2 * SIZE, BOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + FSUB f80 = f34, f80 + FSUB f88 = f35, f88 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET] + adds AOFFSET = -2 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + FSUB f80 = f34, f80 + FSUB f88 = f35, f88 + ;; +#endif + +#ifdef LT + LDFD f32 = [AOFFSET] + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + FMPY f80 = f80, f32 + FMPY f88 = f88, f32 + ;; + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + ;; + STFD [BOFFSET] = f88, -3 * SIZE + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f36 = [BOFFSET], 1 * SIZE + ;; + LDFPD f37, f38 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f39, f40 = [BOFFSET] + adds BOFFSET = 5 * SIZE, BOFFSET + ;; + LDFD f41 = [BOFFSET], -15 * SIZE + + FMPY f64 = f64, f32 + ;; + FNMA f72 = f64, f33, f72 + ;; + FNMA f80 = f64, f34, f80 + ;; + FNMA f88 = f64, f35, f88 + ;; + FMPY f72 = f72, f36 + ;; + FNMA f80 = f72, f37, f80 + ;; + FNMA f88 = f72, f38, f88 + ;; + FMPY f80 = f80, f39 + ;; + FNMA f88 = f80, f40, f88 + ;; + FMPY f88 = f88, f41 + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f72, SIZE + ;; + STFD [AOFFSET] = f80, SIZE + ;; + STFD [AOFFSET] = f88, -3 * SIZE + ;; +#endif + + STFD [C1 ] = f64, SIZE + STFD [C2 ] = f72, SIZE + STFD [C3 ] = f80, SIZE + STFD [C4 ] = f88, SIZE + ;; + mov f64 = f0 + mov f72 = f0 + mov f80 = f0 + mov f88 = f0 + ;; + sub L = K, KK + ;; + shladd L = L, BASE_SHIFT, r0 + ;; + add AOFFSET = L, AOFFSET + ;; + shladd BOFFSET = L, 2, BOFFSET + ;; +#ifdef LT + adds KK = 1, KK +#else + nop __LINE__ +#endif + ;; + mov L = KK + ;; + .align 8 + +.L089: + mov B = BOFFSET + +#ifdef RN + adds KK = 4, KK +#endif + ;; + mov AOFFSET = A + ;; + .align 16 + +.L090: + tbit.z p6, p0 = N, 1 + (p6) br.cond.dpnt .L130 + ;; + mov f64 = f0 + mov f65 = f0 + mov f66 = f0 + mov f67 = f0 + + mov f72 = f0 + mov f73 = f0 + mov f74 = f0 + mov f75 = f0 + ;; + { .mfi + shr I = M, 3 + } + { .mfi + mov C1 = C // coffset1 = c + 0 * ldc +#ifdef LT + mov KK = OFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmf + cmp.eq p6, p7 = 0, I + mov AORIG = A + } + { .mmf + add C2 = LDC, C // coffset2 = c + 1 * ldc + } + ;; + { .mfi + shladd C = LDC, 1, C // coffset += 8 * ldc + mov f81 = f0 + mov L = KK + }{ .mfb + (p6) br.cond.dpnt .L100 + } + ;; + .align 16 + +.L092: + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + } + ;; + { .mmi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + { .mmf + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + } + ;; + { .mmf + (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + } + { .mfi + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mmf + (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + } + { .mfi + adds PREC = CPREFETCHSIZE * SIZE, C1 + } + ;; + { .mmf + CPREFETCH [PREC], LDC + } + { .mfi + adds L = 1, L + } + ;; + { .mmf + CPREFETCH [PREC] + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + } + ;; + { .mfi + adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET + } + ;; + { .mfi + tbit.z p12, p0 = L, 0 + } + { .mfi + shr L = L, 1 + } + ;; + { .mfi + adds L = -1, L + } + ;; + { .mfi + mov ar.lc = L + } + ;; + mov f68 = f0 + mov f69 = f0 + mov f70 = f0 + mov f71 = f0 + mov f76 = f0 + mov f77 = f0 + mov f78 = f0 + mov f79 = f0 + 
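+// (annotation, not in the original GotoBLAS source) the remaining accumulators for the 8x2 tile (f68-f71, f76-f79) are now cleared; the compare/branch below skips the .L093 FMA loop when the pipelined trip count placed in ar.lc is -1, i.e. there are no k iterations left to accumulate.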
;; + { .mfb + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L098 + } + ;; + .align 8 + +.L093: +/* 1 */ + { .mfi + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 4 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + adds C9 = 4 * SIZE, C1 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + adds C10 = 4 * SIZE, C2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + adds C11 = 4 * SIZE, C3 + } + { .mfi + nop __LINE__ + FMA f74 = f34, f49, f74 // A3 * B2 + adds C12 = 4 * SIZE, C4 + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f76 = f36, f49, f76 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f77 = f37, f49, f77 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f78 = f38, f49, f78 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f79 = f39, f49, f79 // A8 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f76 = f44, f57, f76 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f77 = f45, f57, f77 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f78 = f46, f57, f78 // A7 * B2 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f79 = f47, f57, f79 // A8 * B2 + br.cloop.sptk.few .L093 + } + ;; + .align 8 + +.L098: + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#ifdef LT + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [BOFFSET], 2 * SIZE + ;; + 
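+// (annotation, not in the original GotoBLAS source) LT path: the sixteen staged values of the 8x2 tile are being read back from the packed B panel (the loads continue below), and the FSUB sequence that follows forms (staged value - accumulated product) ahead of the triangular substitution.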
LDFPD f42, f43 = [BOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [BOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [BOFFSET] + adds BOFFSET = -14 * SIZE, BOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + FSUB f65 = f34, f65 + FSUB f73 = f35, f73 + + FSUB f66 = f36, f66 + FSUB f74 = f37, f74 + FSUB f67 = f38, f67 + FSUB f75 = f39, f75 + + FSUB f68 = f40, f68 + FSUB f76 = f41, f76 + FSUB f69 = f42, f69 + FSUB f77 = f43, f77 + + FSUB f70 = f44, f70 + FSUB f78 = f45, f78 + FSUB f71 = f46, f71 + FSUB f79 = f47, f79 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [AOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [AOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [AOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [AOFFSET] + adds AOFFSET = -14 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f66 = f34, f66 + FSUB f67 = f35, f67 + FSUB f68 = f36, f68 + FSUB f69 = f37, f69 + FSUB f70 = f38, f70 + FSUB f71 = f39, f71 + ;; + FSUB f72 = f40, f72 + FSUB f73 = f41, f73 + FSUB f74 = f42, f74 + FSUB f75 = f43, f75 + FSUB f76 = f44, f76 + FSUB f77 = f45, f77 + FSUB f78 = f46, f78 + FSUB f79 = f47, f79 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f40 = [AOFFSET], 1 * SIZE + ;; + LDFPD f41, f42 = [AOFFSET], 2 * SIZE + ;; + LDFPD f43, f44 = [AOFFSET], 2 * SIZE + ;; + LDFPD f45, f46 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f47, f48 = [AOFFSET], 2 * SIZE + ;; + LDFPD f49, f50 = [AOFFSET], 2 * SIZE + ;; + LDFPD f51, f52 = [AOFFSET] + adds AOFFSET = 5 * SIZE, AOFFSET + ;; + LDFD f53 = [AOFFSET], 1 * SIZE + ;; + LDFPD f54, f55 = [AOFFSET], 2 * SIZE + ;; + LDFPD f56, f57 = [AOFFSET] + adds AOFFSET = 6 * SIZE, AOFFSET + ;; + LDFPD f58, f59 = [AOFFSET], 2 * SIZE + ;; + LDFPD f60, f61 = [AOFFSET] + adds AOFFSET = 7 * SIZE, AOFFSET + ;; + LDFD f16 = [AOFFSET], 1 * SIZE + ;; + LDFPD f17, f18 = [AOFFSET] + adds AOFFSET = 8 * SIZE, AOFFSET + ;; + LDFPD f19, f20 = [AOFFSET] + adds AOFFSET = 9 * SIZE, AOFFSET + ;; + LDFD f21 = [AOFFSET] + adds AOFFSET = -63 * SIZE, AOFFSET + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + ;; + FNMA f65 = f64, f33, f65 + FNMA f73 = f72, f33, f73 + ;; + FNMA f66 = f64, f34, f66 + FNMA f74 = f72, f34, f74 + ;; + FNMA f67 = f64, f35, f67 + FNMA f75 = f72, f35, f75 + ;; + FNMA f68 = f64, f36, f68 + FNMA f76 = f72, f36, f76 + ;; + FNMA f69 = f64, f37, f69 + FNMA f77 = f72, f37, f77 + ;; + FNMA f70 = f64, f38, f70 + FNMA f78 = f72, f38, f78 + ;; + FNMA f71 = f64, f39, f71 + FNMA f79 = f72, f39, f79 + ;; + FMPY f65 = f65, f40 + FMPY f73 = f73, f40 + ;; + FNMA f66 = f65, f41, f66 + FNMA f74 = f73, f41, f74 + ;; + FNMA f67 = f65, f42, f67 + FNMA f75 = f73, f42, f75 + ;; + FNMA f68 = f65, f43, f68 + FNMA f76 = f73, f43, f76 + ;; + FNMA f69 = f65, f44, f69 + FNMA f77 = f73, f44, f77 + ;; + FNMA f70 = f65, f45, f70 + FNMA f78 = f73, f45, f78 + ;; + FNMA f71 = f65, f46, f71 + FNMA f79 = f73, f46, f79 + ;; + FMPY f66 = f66, f47 + FMPY f74 = f74, f47 + ;; + FNMA f67 = f66, f48, f67 + FNMA f75 = f74, f48, f75 + ;; + FNMA f68 = f66, f49, f68 + FNMA f76 = f74, f49, f76 + ;; + FNMA f69 = f66, f50, f69 + FNMA f77 = f74, f50, f77 + ;; + FNMA f70 = f66, f51, f70 + FNMA f78 = f74, f51, f78 + ;; + FNMA f71 = f66, f52, f71 + FNMA f79 = f74, f52, f79 + 
;; + FMPY f67 = f67, f53 + FMPY f75 = f75, f53 + ;; + FNMA f68 = f67, f54, f68 + FNMA f76 = f75, f54, f76 + ;; + FNMA f69 = f67, f55, f69 + FNMA f77 = f75, f55, f77 + ;; + FNMA f70 = f67, f56, f70 + FNMA f78 = f75, f56, f78 + ;; + FNMA f71 = f67, f57, f71 + FNMA f79 = f75, f57, f79 + ;; + FMPY f68 = f68, f58 + FMPY f76 = f76, f58 + ;; + FNMA f69 = f68, f59, f69 + FNMA f77 = f76, f59, f77 + ;; + FNMA f70 = f68, f60, f70 + FNMA f78 = f76, f60, f78 + ;; + FNMA f71 = f68, f61, f71 + FNMA f79 = f76, f61, f79 + ;; + FMPY f69 = f69, f16 + FMPY f77 = f77, f16 + ;; + FNMA f70 = f69, f17, f70 + FNMA f78 = f77, f17, f78 + ;; + FNMA f71 = f69, f18, f71 + FNMA f79 = f77, f18, f79 + ;; + FMPY f70 = f70, f19 + FMPY f78 = f78, f19 + ;; + FNMA f71 = f70, f20, f71 + FNMA f79 = f78, f20, f79 + ;; + FMPY f71 = f71, f21 + FMPY f79 = f79, f21 + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f66, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f74, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f67, SIZE + ;; + STFD [BOFFSET] = f73, 5 * SIZE + STFD [BOFFSET2] = f75, 5 * SIZE + ;; + STFD [BOFFSET] = f68, SIZE + STFD [BOFFSET2] = f70, SIZE + ;; + STFD [BOFFSET] = f76, SIZE + STFD [BOFFSET2] = f78, SIZE + ;; + STFD [BOFFSET] = f69, SIZE + STFD [BOFFSET2] = f71, SIZE + ;; + STFD [BOFFSET] = f77, -11 * SIZE + STFD [BOFFSET2] = f79, -11 * SIZE + ;; + adds C9 = 4 * SIZE, C1 + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f34 = [BOFFSET], -3 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f68 = f68, f32 + FMPY f65 = f65, f32 + FMPY f69 = f69, f32 + FMPY f66 = f66, f32 + FMPY f70 = f70, f32 + FMPY f67 = f67, f32 + FMPY f71 = f71, f32 + ;; + FNMA f72 = f64, f33, f72 + FNMA f76 = f68, f33, f76 + FNMA f73 = f65, f33, f73 + FNMA f77 = f69, f33, f77 + FNMA f74 = f66, f33, f74 + FNMA f78 = f70, f33, f78 + FNMA f75 = f67, f33, f75 + FNMA f79 = f71, f33, f79 + ;; + FMPY f72 = f72, f34 + FMPY f76 = f76, f34 + FMPY f73 = f73, f34 + FMPY f77 = f77, f34 + FMPY f74 = f74, f34 + FMPY f78 = f78, f34 + FMPY f75 = f75, f34 + FMPY f79 = f79, f34 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f68, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f69, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f70, SIZE + ;; + STFD [AOFFSET] = f67, 5 * SIZE + STFD [AOFFSET2] = f71, 5 * SIZE + ;; + STFD [AOFFSET] = f72, SIZE + STFD [AOFFSET2] = f76, SIZE + ;; + STFD [AOFFSET] = f73, SIZE + STFD [AOFFSET2] = f77, SIZE + ;; + STFD [AOFFSET] = f74, SIZE + STFD [AOFFSET2] = f78, SIZE + ;; + STFD [AOFFSET] = f75, -11 * SIZE + STFD [AOFFSET2] = f79, -11 * SIZE + ;; +#endif + + adds C9 = 4 * SIZE, C1 + ;; + + { .mmf + STFD [C1 ] = f64, SIZE + STFD [C9 ] = f68, SIZE + mov f64 = f0 + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + STFD [C9 ] = f69, SIZE + adds C10 = 4 * SIZE, C2 + } + ;; + { .mmi + STFD [C1 ] = f66, SIZE + STFD [C9 ] = f70, SIZE + } + ;; + { .mmi + STFD [C1 ] = f67, 5 * SIZE + STFD [C9 ] = f71 + adds C11 = 4 * SIZE, C3 + } + ;; + { .mmf + STFD [C2 ] = f72, SIZE + STFD [C10] = f76, SIZE + mov f72 = f0 + } + ;; + { .mmi + STFD [C2 ] = f73, SIZE + STFD [C10] = f77, SIZE + } + ;; + { .mmi + STFD [C2 ] = f74, SIZE + STFD [C10] = f78, SIZE + adds C12 = 4 * SIZE, C4 + } + ;; + { .mmi + STFD [C2 ] = f75, 5 * SIZE + STFD [C10] = f79 + } + ;; + { .mmf + cmp.ne p6, p0 = 1, I + } + ;; + adds I = -1, I + ;; + { .mmi + sub L = K, KK + } + ;; + { .mmi + shladd L = L, BASE_SHIFT, r0 + } + ;; + ;; + shladd AOFFSET = L, 3, AOFFSET + shladd BOFFSET = L, 1, BOFFSET + 
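+// (annotation, not in the original GotoBLAS source) the two shladd instructions above step AOFFSET and BOFFSET over the remaining K-KK columns of the packed panels: eight elements per column for A and two for B (L already holds K-KK scaled by the element size).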
;; + { .mmi +#ifdef LT + adds KK = 8, KK +#else + nop __LINE__ +#endif + } + ;; + mov L = KK + mov f64 = f0 + mov f65 = f0 + mov f66 = f0 + mov f67 = f0 + mov f72 = f0 + mov f73 = f0 + mov f74 = f0 + mov f75 = f0 + (p6) br.cond.dptk .L092 + ;; + .align 8 + +.L100: + { .mib + mov L = KK + tbit.z p6, p7 = M, 2 + (p6) br.cond.dptk .L110 + } + ;; + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + ;; + { .mmf + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f65 = f0 + } + ;; + adds L = 1, L + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + ;; + { .mfi + tbit.z p12, p0 = L, 0 + } + { .mfi + shr L = L, 1 + } + ;; + { .mfi + adds L = -1, L + } + ;; + { .mfi + cmp.eq p6, p0 = -1, L + } + ;; + { .mmf + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + } + { .mfi + mov ar.lc = L + } + ;; + { .mmf + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + } + { .mfb + (p6) br.cond.dpnt .L108 + } + ;; + +.L102: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 4 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + adds C9 = 2 * SIZE, C1 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + adds C10 = 2 * SIZE, C2 + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + br.cloop.sptk.few .L102 + } + ;; + .align 8 + +.L108: + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#ifdef LT + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET] + adds BOFFSET = -6 * SIZE, BOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + ;; + FSUB f65 = f34, f65 + FSUB f73 = f35, f73 + ;; + FSUB f66 = f36, f66 + FSUB f74 = f37, f74 + ;; + FSUB f67 = f38, f67 + FSUB f75 = f39, f75 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET] + adds AOFFSET = -6 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f66 = f34, f66 + FSUB f67 = f35, f67 + + FSUB f72 = f36, f72 + FSUB f73 = f37, f73 + FSUB f74 = f38, f74 + FSUB f75 = f39, f75 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD 
f34, f35 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f36 = [AOFFSET], 1 * SIZE + ;; + LDFPD f37, f38 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f39, f40 = [AOFFSET] + adds AOFFSET = 5 * SIZE, AOFFSET + ;; + LDFD f41 = [AOFFSET], -15 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + ;; + FNMA f65 = f64, f33, f65 + FNMA f73 = f72, f33, f73 + ;; + FNMA f66 = f64, f34, f66 + FNMA f74 = f72, f34, f74 + ;; + FNMA f67 = f64, f35, f67 + FNMA f75 = f72, f35, f75 + ;; + FMPY f65 = f65, f36 + FMPY f73 = f73, f36 + ;; + FNMA f66 = f65, f37, f66 + FNMA f74 = f73, f37, f74 + ;; + FNMA f67 = f65, f38, f67 + FNMA f75 = f73, f38, f75 + ;; + FMPY f66 = f66, f39 + FMPY f74 = f74, f39 + ;; + FNMA f67 = f66, f40, f67 + FNMA f75 = f74, f40, f75 + ;; + FMPY f67 = f67, f41 + FMPY f75 = f75, f41 + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f66, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f74, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f67, SIZE + ;; + STFD [BOFFSET] = f73, -3 * SIZE + STFD [BOFFSET2] = f75, -3 * SIZE + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f34 = [BOFFSET], -3 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + FMPY f66 = f66, f32 + FMPY f67 = f67, f32 + ;; + FNMA f72 = f64, f33, f72 + FNMA f73 = f65, f33, f73 + FNMA f74 = f66, f33, f74 + FNMA f75 = f67, f33, f75 + ;; + FMPY f72 = f72, f34 + FMPY f73 = f73, f34 + FMPY f74 = f74, f34 + FMPY f75 = f75, f34 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f72, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f73, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f74, SIZE + ;; + STFD [AOFFSET] = f67, -3 * SIZE + STFD [AOFFSET2] = f75, -3 * SIZE + ;; +#endif + + { .mmf + STFD [C1 ] = f64, SIZE + mov f64 = f0 + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + } + ;; + { .mmi + STFD [C1 ] = f66, SIZE + } + ;; + { .mmi + STFD [C1 ] = f67, SIZE + } + ;; + { .mmf + STFD [C2 ] = f72, SIZE + mov f72 = f0 + } + ;; + { .mmi + STFD [C2 ] = f73, SIZE + } + ;; + { .mmi + STFD [C2 ] = f74, SIZE + } + ;; + { .mmi + STFD [C2 ] = f75, SIZE + } + ;; + mov f65 = f0 + mov f73 = f0 + mov f66 = f0 + mov f74 = f0 + mov f67 = f0 + mov f75 = f0 + ;; + { .mmi + sub L = K, KK + } + ;; + { .mmi + shladd L = L, BASE_SHIFT, r0 + } + ;; + shladd AOFFSET = L, 2, AOFFSET + ;; + shladd BOFFSET = L, 1, BOFFSET + ;; +#ifdef LT + adds KK = 4, KK + nop __LINE__ +#endif + ;; + .align 8 + +.L110: + { .mib + tbit.z p6, p7 = M, 1 + (p6) br.cond.dptk .L120 + } + ;; + + { .mib + mov L = KK + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + } + ;; + { .mmf + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + } + ;; + adds L = 1, L + ;; + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + tbit.z p12, p0 = L, 0 + } + { .mfi + shr L = L, 1 + } + ;; + { .mmf + adds L = -1, L + } + ;; + { .mmf + cmp.eq p6, p0 = -1, L + } + ;; + { .mib + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov ar.lc = L + (p6) br.cond.dpnt .L118 + } + ;; + +.L112: + { .mfi + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + lfetch.nt1 [PREB], 4 * SIZE + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mmf + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + } + ;; + { 
.mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + br.cloop.sptk.few .L112 + } + ;; + .align 8 + +.L118: + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#ifdef LT + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET] + adds BOFFSET = -2 * SIZE, BOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + FSUB f65 = f34, f65 + FSUB f73 = f35, f73 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET] + adds AOFFSET = -2 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f72 = f34, f72 + FSUB f73 = f35, f73 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f34 = [AOFFSET], - 3 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + ;; + FNMA f65 = f64, f33, f65 + FNMA f73 = f72, f33, f73 + ;; + FMPY f65 = f65, f34 + FMPY f73 = f73, f34 + ;; + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + ;; + STFD [BOFFSET] = f73, -3 * SIZE + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f34 = [BOFFSET], -3 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + ;; + FNMA f72 = f64, f33, f72 + FNMA f73 = f65, f33, f73 + ;; + FMPY f72 = f72, f34 + FMPY f73 = f73, f34 + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + ;; + STFD [AOFFSET] = f72, SIZE + ;; + STFD [AOFFSET] = f73, -3 * SIZE + ;; +#endif + + STFD [C1 ] = f64, SIZE + mov f64 = f0 + ;; + STFD [C1 ] = f65, SIZE + ;; + STFD [C2 ] = f72, SIZE + mov f72 = f0 + ;; + STFD [C2 ] = f73, SIZE + ;; + mov f65 = f0 + mov f73 = f0 + ;; + sub L = K, KK + ;; + { .mmi + shladd L = L, BASE_SHIFT, r0 + } + ;; + { .mmi + shladd AOFFSET = L, 1, AOFFSET + } + ;; + { .mmi + shladd BOFFSET = L, 1, BOFFSET + } + ;; + { .mmi +#ifdef LT + adds KK = 2, KK +#else + nop __LINE__ +#endif + } + ;; + .align 8 + +.L120: + tbit.z p6, p7 = M, 0 + (p6) br.cond.dptk .L129 + ;; + + { .mib + mov L = KK + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + } + ;; + { .mmf + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + } + ;; + { .mmi + adds L = 1, L + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mii + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + adds L = -1, L + } + ;; + { .mmi + cmp.eq p6, p0 = -1, L + } + ;; + { .mib + (p7) LDFD f32 = [AOFFSET], 1 * SIZE + mov ar.lc = L + (p6) br.cond.dpnt .L128 + } + ;; + .align 8 + +.L122: + { .mfi + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mmi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + (p3) LDFD f40 = [AOFFSET], 1 * SIZE + nop __LINE__ + } + { .mmi + nop __LINE__ + nop __LINE__ + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + adds L = -1, L + } + { .mfb + (p4) LDFD f32 = [AOFFSET], 1 * SIZE + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + br.cloop.sptk.few .L122 + } + ;; + +.L128: + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#ifdef LT + LDFPD f32, f33 = 
[BOFFSET] + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + ;; +#else + LDFPD f32, f33 = [AOFFSET] + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + ;; +#endif + +#ifdef LT + LDFD f32 = [AOFFSET] + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + ;; + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f72, -SIZE + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f34 = [BOFFSET], -3 * SIZE + ;; + FMPY f64 = f64, f32 + ;; + FNMA f72 = f64, f33, f72 + ;; + FMPY f72 = f72, f34 + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f72, -SIZE + ;; +#endif + + + STFD [C1 ] = f64, SIZE + STFD [C2 ] = f72, SIZE + + mov f64 = f0 + mov f72 = f0 + ;; + sub L = K, KK + ;; + shladd L = L, BASE_SHIFT, r0 + ;; + add AOFFSET = L, AOFFSET + ;; + shladd BOFFSET = L, 1, BOFFSET + ;; +#ifdef LT + adds KK = 1, KK +#else + nop __LINE__ +#endif + ;; + mov L = KK + ;; + .align 8 + +.L129: + mov B = BOFFSET + +#ifdef RN + adds KK = 2, KK +#endif + + ;; + mov AOFFSET = A + ;; + .align 16 + +.L130: + tbit.z p6, p0 = N, 0 + (p6) br.cond.dpnt .L999 + ;; + mov f64 = f0 + mov f65 = f0 + mov f66 = f0 + mov f67 = f0 + + mov f68 = f0 + mov f69 = f0 + mov f70 = f0 + mov f71 = f0 + ;; + + { .mfi + shr I = M, 3 + } + { .mfi + mov C1 = C // coffset1 = c + 0 * ldc +#ifdef LT + mov KK = OFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmf + cmp.eq p6, p7 = 0, I + mov AORIG = A + } + ;; + { .mfi + add C = C, LDC // coffset += 8 * ldc + mov L = KK + }{ .mfb + (p6) br.cond.dpnt .L140 + } + ;; + .align 16 + +.L132: + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + } + ;; + { .mmi + (p7) LDFD f48 = [BOFFSET], 1 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + { .mmf + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + } + ;; + { .mmf + (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + } + { .mfi + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mmf + (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + } + { .mfi + adds PREC = CPREFETCHSIZE * SIZE, C1 + } + ;; + { .mmf + CPREFETCH [PREC] + } + { .mfi + adds L = 1, L + } + ;; + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + } + ;; + { .mfi + adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET + } + ;; + { .mfi + tbit.z p12, p0 = L, 0 + } + { .mfi + shr L = L, 1 + } + ;; + { .mfi + adds L = -1, L + } + ;; + { .mfi + mov ar.lc = L + } + ;; + { .mfb + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L138 + } + ;; + .align 16 + +.L133: + { .mfi + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + FMA f65 = f33, f48, f65 // A2 * B1 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + adds C9 = 4 * SIZE, C1 + } + { .mmf + (p3) LDFD f56 = [BOFFSET], 1 * SIZE + nop __LINE__ + FMA f67 = f35, f48, f67 // A4 * B1 + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = 
[AOFFSET], 2 * SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mmf + (p4) LDFD f48 = [BOFFSET], 1 * SIZE + nop __LINE__ + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + nop __LINE__ + br.cloop.sptk.few .L133 + } + ;; + +.L138: + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#ifdef LT + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET] + adds BOFFSET = -6 * SIZE, BOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f66 = f34, f66 + FSUB f67 = f35, f67 + + FSUB f68 = f36, f68 + FSUB f69 = f37, f69 + FSUB f70 = f38, f70 + FSUB f71 = f39, f71 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET] + adds AOFFSET = -6 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f66 = f34, f66 + FSUB f67 = f35, f67 + FSUB f68 = f36, f68 + FSUB f69 = f37, f69 + FSUB f70 = f38, f70 + FSUB f71 = f39, f71 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f40 = [AOFFSET], 1 * SIZE + ;; + LDFPD f41, f42 = [AOFFSET], 2 * SIZE + ;; + LDFPD f43, f44 = [AOFFSET], 2 * SIZE + ;; + LDFPD f45, f46 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f47, f48 = [AOFFSET], 2 * SIZE + ;; + LDFPD f49, f50 = [AOFFSET], 2 * SIZE + ;; + LDFPD f51, f52 = [AOFFSET] + adds AOFFSET = 5 * SIZE, AOFFSET + ;; + LDFD f53 = [AOFFSET], 1 * SIZE + ;; + LDFPD f54, f55 = [AOFFSET], 2 * SIZE + ;; + LDFPD f56, f57 = [AOFFSET] + adds AOFFSET = 6 * SIZE, AOFFSET + ;; + LDFPD f58, f59 = [AOFFSET], 2 * SIZE + ;; + LDFPD f60, f61 = [AOFFSET] + adds AOFFSET = 7 * SIZE, AOFFSET + ;; + LDFD f16 = [AOFFSET], 1 * SIZE + ;; + LDFPD f17, f18 = [AOFFSET] + adds AOFFSET = 8 * SIZE, AOFFSET + ;; + LDFPD f19, f20 = [AOFFSET] + adds AOFFSET = 9 * SIZE, AOFFSET + ;; + LDFD f21 = [AOFFSET] + adds AOFFSET = -63 * SIZE, AOFFSET + ;; + FMPY f64 = f64, f32 + ;; + FNMA f65 = f64, f33, f65 + ;; + FNMA f66 = f64, f34, f66 + ;; + FNMA f67 = f64, f35, f67 + ;; + FNMA f68 = f64, f36, f68 + ;; + FNMA f69 = f64, f37, f69 + ;; + FNMA f70 = f64, f38, f70 + ;; + FNMA f71 = f64, f39, f71 + ;; + FMPY f65 = f65, f40 + ;; + FNMA f66 = f65, f41, f66 + ;; + FNMA f67 = f65, f42, f67 + ;; + FNMA f68 = f65, f43, f68 + ;; + FNMA f69 = f65, f44, f69 + ;; + FNMA f70 = f65, f45, f70 + ;; + FNMA f71 = f65, f46, f71 + ;; + FMPY f66 = f66, f47 + ;; + FNMA f67 = f66, f48, f67 + ;; + FNMA f68 = f66, f49, f68 + ;; + FNMA f69 = f66, f50, f69 + ;; + FNMA f70 = f66, f51, f70 + ;; + FNMA f71 = f66, f52, f71 + ;; + FMPY f67 = f67, f53 + ;; + FNMA f68 = f67, f54, f68 + ;; + FNMA f69 = f67, f55, f69 + ;; + FNMA f70 = f67, f56, f70 + ;; + FNMA f71 = f67, f57, f71 + ;; + FMPY f68 = f68, f58 + ;; + FNMA f69 = f68, f59, f69 + ;; + FNMA f70 = f68, f60, 
f70 + ;; + FNMA f71 = f68, f61, f71 + ;; + FMPY f69 = f69, f16 + ;; + FNMA f70 = f69, f17, f70 + ;; + FNMA f71 = f69, f18, f71 + ;; + FMPY f70 = f70, f19 + ;; + FNMA f71 = f70, f20, f71 + ;; + FMPY f71 = f71, f21 + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f68, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f69, SIZE + ;; + STFD [BOFFSET] = f66, SIZE + STFD [BOFFSET2] = f70, SIZE + ;; + STFD [BOFFSET] = f67, -3 * SIZE + STFD [BOFFSET2] = f71, -3 * SIZE + ;; + adds C9 = 4 * SIZE, C1 + ;; +#endif + +#ifdef RN + LDFD f32 = [BOFFSET] + ;; + FMPY f64 = f64, f32 + FMPY f68 = f68, f32 + FMPY f65 = f65, f32 + FMPY f69 = f69, f32 + FMPY f66 = f66, f32 + FMPY f70 = f70, f32 + FMPY f67 = f67, f32 + FMPY f71 = f71, f32 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f68, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f69, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f70, SIZE + ;; + STFD [AOFFSET] = f67, -3 * SIZE + STFD [AOFFSET2] = f71, -3 * SIZE + ;; +#endif + + adds C9 = 4 * SIZE, C1 + ;; + + { .mmf + STFD [C1 ] = f64, SIZE + STFD [C9 ] = f68, SIZE + mov f64 = f0 + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + STFD [C9 ] = f69, SIZE + } + ;; + { .mmi + STFD [C1 ] = f66, SIZE + STFD [C9 ] = f70, SIZE + } + ;; + { .mmi + STFD [C1 ] = f67, 5 * SIZE + STFD [C9 ] = f71 + } + ;; + { .mmf + cmp.ne p6, p0 = 1, I + } + ;; + adds I = -1, I + ;; + { .mmi + sub L = K, KK + } + ;; + { .mmi + shladd L = L, BASE_SHIFT, r0 + } + ;; + ;; + { .mmi + shladd AOFFSET = L, 3, AOFFSET + } + ;; + { .mmi + add BOFFSET = L, BOFFSET + } + ;; + { .mmi +#ifdef LT + adds KK = 8, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi + mov L = KK + } + ;; + + mov f64 = f0 + mov f65 = f0 + mov f66 = f0 + mov f67 = f0 + mov f68 = f0 + mov f69 = f0 + mov f70 = f0 + mov f71 = f0 + + (p6) br.cond.dptk .L132 + .align 8 + +.L140: + tbit.z p6, p7 = M, 2 + (p6) br.cond.dptk .L150 + ;; + + { .mib + mov L = KK + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + } + ;; + { .mmf + (p7) LDFD f48 = [BOFFSET], 1 * SIZE + mov f65 = f0 + } + ;; + { .mfi + adds L = 1, L + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + tbit.z p12, p0 = L, 0 + } + { .mfi + shr L = L, 1 + } + ;; + { .mfi + adds L = -1, L + } + ;; + { .mfi + cmp.eq p6, p0 = -1, L + } + ;; + { .mmf + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + } + { .mfi + mov ar.lc = L + } + ;; + { .mmf + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + } + { .mfb + (p6) br.cond.dpnt .L148 + } + ;; + +.L142: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f65 = f33, f48, f65 // A2 * B1 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + (p5) adds C9 = 2 * SIZE, C1 + } + { .mmf + nop __LINE__ + (p3) LDFD f56 = [BOFFSET], 1 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + } + ;; + { .mfi + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + (p5) adds C10 = 2 * SIZE, C2 + } + { .mfb + nop __LINE__ + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mmf + (p4) LDFD f48 = [BOFFSET], 1 * SIZE + nop __LINE__ + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + } + ;; + { .mfi + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + nop __LINE__ + adds L = -1, L + } + { .mfb + nop 
__LINE__ + nop.f 0 + br.cloop.sptk.few .L142 + } + ;; + +.L148: + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#ifdef LT + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET] + adds BOFFSET = -2 * SIZE, BOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f66 = f34, f66 + FSUB f67 = f35, f67 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET] + adds AOFFSET = -2 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f66 = f34, f66 + FSUB f67 = f35, f67 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f36 = [AOFFSET], 1 * SIZE + ;; + LDFPD f37, f38 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f39, f40 = [AOFFSET] + adds AOFFSET = 5 * SIZE, AOFFSET + ;; + LDFD f41 = [AOFFSET], -15 * SIZE + ;; + FMPY f64 = f64, f32 + ;; + FNMA f65 = f64, f33, f65 + ;; + FNMA f66 = f64, f34, f66 + ;; + FNMA f67 = f64, f35, f67 + ;; + FMPY f65 = f65, f36 + ;; + FNMA f66 = f65, f37, f66 + ;; + FNMA f67 = f65, f38, f67 + ;; + FMPY f66 = f66, f39 + ;; + FNMA f67 = f66, f40, f67 + ;; + FMPY f67 = f67, f41 + ;; + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + ;; + STFD [BOFFSET] = f66, SIZE + ;; + STFD [BOFFSET] = f67, -3 * SIZE + ;; +#endif + +#ifdef RN + LDFD f32 = [BOFFSET] + ;; + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + FMPY f66 = f66, f32 + FMPY f67 = f67, f32 + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + ;; + STFD [AOFFSET] = f67, -3 * SIZE + ;; +#endif + + { .mmf + STFD [C1 ] = f64, SIZE + mov f64 = f0 + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + } + ;; + { .mmi + STFD [C1 ] = f66, SIZE + } + ;; + { .mmi + STFD [C1 ] = f67, SIZE + } + ;; + { .mmf + mov f72 = f0 + } + ;; + mov f65 = f0 + mov f73 = f0 + mov f66 = f0 + mov f74 = f0 + mov f67 = f0 + mov f75 = f0 + ;; + { .mmi + sub L = K, KK + } + ;; + { .mmi + shladd L = L, BASE_SHIFT, r0 + } + ;; + { .mmi + shladd AOFFSET = L, 2, AOFFSET + } + ;; + { .mmi + add BOFFSET = L, BOFFSET + } + ;; + { .mmi +#ifdef LT + adds KK = 4, KK +#else + nop __LINE__ +#endif + } + ;; + .align 8 + +.L150: + tbit.z p6, p7 = M, 1 + (p6) br.cond.dptk .L160 + ;; + + { .mib + mov L = KK + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + } + ;; + { .mmf + (p7) LDFD f48 = [BOFFSET], 1 * SIZE + } + ;; + { .mfi + adds L = 1, L + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + tbit.z p12, p0 = L, 0 + } + { .mfi + shr L = L, 1 + } + ;; + { .mmf + adds L = -1, L + } + ;; + { .mmf + cmp.eq p6, p0 = -1, L + } + ;; + { .mib + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov ar.lc = L + (p6) br.cond.dpnt .L158 + } + ;; + +.L152: + { .mfi + cmp.ne p4, p5 = 0, L + FMA f64 = f32, f48, f64 // A1 * B1 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mmf + (p3) LDFD f56 = [BOFFSET], 1 * SIZE + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + } + ;; + { .mfi + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + adds L = -1, L + } + ;; + { .mfb + (p4) LDFD f48 = [BOFFSET], 1 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + br.cloop.sptk.few .L152 + } + ;; + +.L158: + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#ifdef LT + LDFPD f32, f33 = [BOFFSET] + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + ;; +#else + LDFPD f32, f33 = [AOFFSET] 
+ ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f34 = [AOFFSET], - 3 * SIZE + ;; + FMPY f64 = f64, f32 + ;; + FNMA f65 = f64, f33, f65 + ;; + FMPY f65 = f65, f34 + ;; + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f65, -SIZE + ;; +#endif + +#ifdef RN + LDFD f32 = [BOFFSET] + ;; + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f65, - SIZE + ;; +#endif + + STFD [C1 ] = f64, SIZE + ;; + STFD [C1 ] = f65, SIZE + ;; + mov f64 = f0 + mov f65 = f0 + ;; + sub L = K, KK + ;; + { .mmi + shladd L = L, BASE_SHIFT, r0 + } + ;; + { .mmi + shladd AOFFSET = L, 1, AOFFSET + } + ;; + { .mmi + add BOFFSET = L, BOFFSET + } + ;; + { .mmi +#ifdef LT + adds KK = 2, KK +#else + nop __LINE__ +#endif + } + ;; + .align 8 + +.L160: + { .mib + mov L = KK + tbit.z p6, p7 = M, 0 + (p6) br.cond.dptk .L169 + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + } + ;; + { .mmi + (p7) LDFD f48 = [BOFFSET], 1 * SIZE + nop __LINE__ + adds L = 1, L + } + ;; + { .mii + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + cmp.eq p6, p0 = 0, L + adds L = -1, L + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mib + (p7) LDFD f32 = [AOFFSET], 1 * SIZE + mov ar.lc = L + (p6) br.cond.dpnt .L168 + } + ;; + .align 8 + +.L162: + { .mmf + cmp.ne p4, p5 = 0, L + (p12) cmp.ne p3, p0 = 0, L + FMA f64 = f32, f48, f64 // A1 * B1 + } + ;; + { .mmi + (p3) LDFD f56 = [BOFFSET], 1 * SIZE + (p3) LDFD f40 = [AOFFSET], 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p4) LDFD f32 = [AOFFSET], 1 * SIZE + nop __LINE__ + adds L = -1, L + } + { .mfb + (p4) LDFD f48 = [BOFFSET], 1 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + br.cloop.sptk.few .L162 + } + ;; + .align 8 + +.L168: +#ifdef LT + { .mmi + LDFD f32 = [BOFFSET] + LDFD f33 = [AOFFSET] + nop __LINE__ + } + ;; +#else + { .mmi + LDFD f32 = [AOFFSET] + LDFD f33 = [BOFFSET] + nop __LINE__ + } + ;; +#endif + + { .mmf + sub L = K, KK + nop __LINE__ + FSUB f64 = f32, f64 + } + ;; +#ifdef LT + adds KK = 1, KK +#else + nop __LINE__ +#endif + ;; + mov L = KK + ;; + FMPY f64 = f64, f33 + ;; +#ifdef LT + { .mmf + STFD [BOFFSET] = f64 + STFD [C1 ] = f64, SIZE + mov f64 = f0 + } + ;; +#else + { .mmf + STFD [AOFFSET] = f64 + STFD [C1 ] = f64, SIZE + mov f64 = f0 + } + ;; +#endif + + shladd AOFFSET = L, BASE_SHIFT, AOFFSET + shladd BOFFSET = L, BASE_SHIFT, BOFFSET + ;; + .align 8 + +.L169: + { .mii + mov B = BOFFSET + +#ifdef RN + adds KK = 1, KK +#else + nop __LINE__ +#endif + mov AOFFSET = A + } + ;; + .align 16 + + +.L999: + mov r8 = r0 + adds r9 = 1 * 16, SP + ;; + ldf.fill f16 = [SP], 32 + ldf.fill f17 = [r9], 32 + ;; + ldf.fill f18 = [SP], 32 + ldf.fill f19 = [r9], 32 + ;; + ldf.fill f20 = [SP], 32 + ldf.fill f21 = [r9], 32 + ;; + mov ar.lc = ARLC + ;; + mov pr = PR, -1 + ;; + mov ar.pfs = ARPFS + ;; + br.ret.sptk.many b0 + EPILOGUE diff --git a/kernel/ia64/trsm_kernel_RT.S b/kernel/ia64/trsm_kernel_RT.S new file mode 100644 index 0000000000..f3482aecdc --- /dev/null +++ b/kernel/ia64/trsm_kernel_RT.S @@ -0,0 +1,16688 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef DOUBLE +#define PREFETCHSIZE (16 * 8) +#else +#define PREFETCHSIZE (32 * 4) +#endif + +#ifndef LN +#define CPREFETCHSIZE 8 +#else +#define CPREFETCHSIZE -8 +#endif +#define CPREFETCH lfetch.excl.nt1 + +#define M r32 +#define N r33 +#define K r34 +#define A r36 +#define B r37 +#define C r38 +#define LDC r39 + +#define I r15 +#define J r16 +#define AOFFSET r17 +#define BOFFSET r18 +#define TEMP r19 +#define L r20 + +#define C1 r21 +#define C2 r22 +#define C3 r23 +#define C4 r24 +#define C5 r25 +#define C6 r26 +#define C7 r27 +#define C8 r28 + +#define C9 loc0 +#define C10 loc1 +#define C11 loc2 +#define C12 loc3 +#define C13 loc4 +#define C14 loc5 +#define C15 loc6 +#define C16 loc7 + +#define PREA r8 +#define PREB r9 +#define PREC r10 +#define SP r12 +#define ARLC r29 +#define PR r30 +#define ARPFS r31 + +#define ALPHA f8 + +#define AORIG loc8 +#define KK loc9 +#define KK8 loc10 +#define OFFSET loc11 +#define AOFFSET2 loc12 +#define BOFFSET2 loc13 + + + PROLOGUE + .prologue + PROFCODE + + { .mmi + .save ar.pfs, ARPFS + alloc ARPFS = ar.pfs, 8, 16, 0, 0 + adds r14 = 16, SP + mov ARLC = ar.lc + } + { .mmi + adds r8 = -6 * 16, SP + adds r9 = -5 * 16, SP + adds SP = -6 * 16, SP + } + ;; + { .mmi + stf.spill [r8] = f16, 32 + stf.spill [r9] = f17, 32 + mov PR = pr + } + ;; + { .mmi + stf.spill [r8] = f18, 32 + stf.spill [r9] = f19, 32 + nop __LINE__ + } + ;; + { .mmi + stf.spill [r8] = f20 + stf.spill [r9] = f21 + shladd LDC = LDC, BASE_SHIFT, r0 + } + ;; + .body + { .mmi + ld8 OFFSET = [r14] + mov AOFFSET = A + } + ;; +#ifdef LN + { .mmi + setf.sig f32 = M + setf.sig f33 = K + shladd C = M, BASE_SHIFT, C + } + ;; + {.mmf + nop __LINE__ + nop __LINE__ + xmpy.l f32 = f32, f33 + } + ;; + { .mmi + getf.sig r2 = f32 + ;; + nop __LINE__ + shladd A = r2, BASE_SHIFT, A + } + ;; +#endif + +#ifdef RN + sub KK = r0, OFFSET +#endif + +#ifdef RT + { .mmi + setf.sig f32 = N + setf.sig f33 = K + nop 
__LINE__ + } + ;; + { .mmi + setf.sig f34 = LDC + nop __LINE__ + nop __LINE__ + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + xmpy.l f33 = f32, f33 + } + { .mmf + nop __LINE__ + sub KK = N, OFFSET + xmpy.l f34 = f32, f34 + } + ;; + { .mmi + getf.sig r2 = f33 + getf.sig r3 = f34 + } + ;; + shladd B = r2, BASE_SHIFT, B + add C = r3, C +#endif + ;; + +.L130: + tbit.z p6, p0 = N, 0 + (p6) br.cond.dpnt .L090 + ;; + +#ifdef RT + { .mmi + nop __LINE__ + shl r2 = K, BASE_SHIFT + } + ;; + { .mmi + sub B = B, r2 + sub C = C, LDC + nop __LINE__ + } +#endif + ;; + mov f64 = f0 + mov f65 = f0 + mov f66 = f0 + mov f67 = f0 + + mov f68 = f0 + mov f69 = f0 + mov f70 = f0 + mov f71 = f0 + ;; + + { .mfi + shr I = M, 3 + } + { .mfi + mov C1 = C // coffset1 = c + 0 * ldc +#ifdef LN + add KK = M, OFFSET +#elif defined LT + mov KK = OFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmf + cmp.eq p6, p7 = 0, I +#if defined(LN) || defined(RT) + mov AORIG = A +#else + mov AOFFSET = A +#endif + } + ;; + { .mfi +#ifndef RT + add C = C, LDC // coffset += 8 * ldc +#else + nop __LINE__ +#endif +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + }{ .mfb + (p6) br.cond.dpnt .L140 + } + ;; + .align 16 + +.L132: + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 3 + BASE_SHIFT + } + ;; +#if defined(LT) || defined(RN) + { .mmi + (p7) LDFD f48 = [BOFFSET], 1 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; +#else + { .mfi + shladd BOFFSET = KK, BASE_SHIFT, B +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFD f48 = [BOFFSET], 1 * SIZE + shladd AOFFSET = r3, 3, AORIG + } + ;; +#endif + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + { .mmf + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + } + ;; + { .mmf + (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + } + { .mfi + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mmf + (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + } + { .mfi + adds PREC = CPREFETCHSIZE * SIZE, C1 + } + ;; + { .mmf + CPREFETCH [PREC] + } + { .mfi + adds L = 1, L + } + ;; + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + } + ;; + { .mfi + adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET + } + ;; + { .mfi + tbit.z p12, p0 = L, 0 + } + { .mfi + shr L = L, 1 + } + ;; + { .mfi + adds L = -1, L + } + ;; + { .mfi + mov ar.lc = L + } + ;; + { .mfb + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L138 + } + ;; + .align 16 + +.L133: + { .mfi + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + FMA f65 = f33, f48, f65 // A2 * B1 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + adds C9 = 4 * SIZE, C1 + } + { .mmf + (p3) LDFD f56 = [BOFFSET], 1 * SIZE + nop __LINE__ + FMA f67 = f35, f48, f67 // A4 * B1 + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f66 = f42, 
f56, f66 // A3 * B1 + nop __LINE__ + } + { .mmf + (p4) LDFD f48 = [BOFFSET], 1 * SIZE + nop __LINE__ + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + nop __LINE__ + br.cloop.sptk.few .L133 + } + ;; + +.L138: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -8, KK +#else + adds r2 = -1, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 3, AORIG + add BOFFSET = r2, B + ;; +#endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET] + adds BOFFSET = -6 * SIZE, BOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f66 = f34, f66 + FSUB f67 = f35, f67 + + FSUB f68 = f36, f68 + FSUB f69 = f37, f69 + FSUB f70 = f38, f70 + FSUB f71 = f39, f71 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET] + adds AOFFSET = -6 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f66 = f34, f66 + FSUB f67 = f35, f67 + FSUB f68 = f36, f68 + FSUB f69 = f37, f69 + FSUB f70 = f38, f70 + FSUB f71 = f39, f71 + ;; +#endif + +#ifdef LN + adds AOFFSET = 62 * SIZE, AOFFSET + ;; + LDFPD f33, f32 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f35, f34 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f37, f36 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f39, f38 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFD f40 = [AOFFSET], -2 * SIZE + ;; + LDFPD f42, f41 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f44, f43 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f46, f45 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f48, f47 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f50, f49 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f52, f51 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFD f53 = [AOFFSET], -2 * SIZE + ;; + LDFPD f55, f54 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f57, f56 = [AOFFSET] + adds AOFFSET = - 6 * SIZE, AOFFSET + ;; + LDFPD f59, f58 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f61, f60 = [AOFFSET] + adds AOFFSET = - 6 * SIZE, AOFFSET + ;; + LDFD f16 = [AOFFSET], -2 * SIZE + ;; + LDFPD f18, f17 = [AOFFSET] + adds AOFFSET = - 8 * SIZE, AOFFSET + ;; + LDFPD f20, f19 = [AOFFSET] + adds AOFFSET = - 8 * SIZE, AOFFSET + ;; + LDFD f21 = [AOFFSET] + ;; + FMPY f71 = f71, f32 + ;; + FNMA f70 = f71, f33, f70 + ;; + FNMA f69 = f71, f34, f69 + ;; + FNMA f68 = f71, f35, f68 + ;; + FNMA f67 = f71, f36, f67 + ;; + FNMA f66 = f71, f37, f66 + ;; + FNMA f65 = f71, f38, f65 + ;; + FNMA f64 = f71, f39, f64 + ;; + FMPY f70 = f70, f40 + ;; + FNMA f69 = f70, f41, f69 + ;; + FNMA f68 = f70, f42, f68 + ;; + FNMA f67 = f70, f43, f67 + ;; + FNMA f66 = f70, f44, f66 + ;; + FNMA f65 = f70, f45, f65 + ;; + FNMA 
f64 = f70, f46, f64 + ;; + FMPY f69 = f69, f47 + ;; + FNMA f68 = f69, f48, f68 + ;; + FNMA f67 = f69, f49, f67 + ;; + FNMA f66 = f69, f50, f66 + ;; + FNMA f65 = f69, f51, f65 + ;; + FNMA f64 = f69, f52, f64 + ;; + FMPY f68 = f68, f53 + ;; + FNMA f67 = f68, f54, f67 + ;; + FNMA f66 = f68, f55, f66 + ;; + FNMA f65 = f68, f56, f65 + ;; + FNMA f64 = f68, f57, f64 + ;; + FMPY f67 = f67, f58 + ;; + FNMA f66 = f67, f59, f66 + ;; + FNMA f65 = f67, f60, f65 + ;; + FNMA f64 = f67, f61, f64 + ;; + FMPY f66 = f66, f16 + ;; + FNMA f65 = f66, f17, f65 + ;; + FNMA f64 = f66, f18, f64 + ;; + FMPY f65 = f65, f19 + ;; + FNMA f64 = f65, f20, f64 + ;; + FMPY f64 = f64, f21 + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f68, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f69, SIZE + ;; + STFD [BOFFSET] = f66, SIZE + STFD [BOFFSET2] = f70, SIZE + ;; + STFD [BOFFSET] = f67, - 3 * SIZE + STFD [BOFFSET2] = f71, - 3 * SIZE + ;; + adds C1 = -8 * SIZE, C1 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f40 = [AOFFSET], 1 * SIZE + ;; + LDFPD f41, f42 = [AOFFSET], 2 * SIZE + ;; + LDFPD f43, f44 = [AOFFSET], 2 * SIZE + ;; + LDFPD f45, f46 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f47, f48 = [AOFFSET], 2 * SIZE + ;; + LDFPD f49, f50 = [AOFFSET], 2 * SIZE + ;; + LDFPD f51, f52 = [AOFFSET] + adds AOFFSET = 5 * SIZE, AOFFSET + ;; + LDFD f53 = [AOFFSET], 1 * SIZE + ;; + LDFPD f54, f55 = [AOFFSET], 2 * SIZE + ;; + LDFPD f56, f57 = [AOFFSET] + adds AOFFSET = 6 * SIZE, AOFFSET + ;; + LDFPD f58, f59 = [AOFFSET], 2 * SIZE + ;; + LDFPD f60, f61 = [AOFFSET] + adds AOFFSET = 7 * SIZE, AOFFSET + ;; + LDFD f16 = [AOFFSET], 1 * SIZE + ;; + LDFPD f17, f18 = [AOFFSET] + adds AOFFSET = 8 * SIZE, AOFFSET + ;; + LDFPD f19, f20 = [AOFFSET] + adds AOFFSET = 9 * SIZE, AOFFSET + ;; + LDFD f21 = [AOFFSET] + adds AOFFSET = -63 * SIZE, AOFFSET + ;; + FMPY f64 = f64, f32 + ;; + FNMA f65 = f64, f33, f65 + ;; + FNMA f66 = f64, f34, f66 + ;; + FNMA f67 = f64, f35, f67 + ;; + FNMA f68 = f64, f36, f68 + ;; + FNMA f69 = f64, f37, f69 + ;; + FNMA f70 = f64, f38, f70 + ;; + FNMA f71 = f64, f39, f71 + ;; + FMPY f65 = f65, f40 + ;; + FNMA f66 = f65, f41, f66 + ;; + FNMA f67 = f65, f42, f67 + ;; + FNMA f68 = f65, f43, f68 + ;; + FNMA f69 = f65, f44, f69 + ;; + FNMA f70 = f65, f45, f70 + ;; + FNMA f71 = f65, f46, f71 + ;; + FMPY f66 = f66, f47 + ;; + FNMA f67 = f66, f48, f67 + ;; + FNMA f68 = f66, f49, f68 + ;; + FNMA f69 = f66, f50, f69 + ;; + FNMA f70 = f66, f51, f70 + ;; + FNMA f71 = f66, f52, f71 + ;; + FMPY f67 = f67, f53 + ;; + FNMA f68 = f67, f54, f68 + ;; + FNMA f69 = f67, f55, f69 + ;; + FNMA f70 = f67, f56, f70 + ;; + FNMA f71 = f67, f57, f71 + ;; + FMPY f68 = f68, f58 + ;; + FNMA f69 = f68, f59, f69 + ;; + FNMA f70 = f68, f60, f70 + ;; + FNMA f71 = f68, f61, f71 + ;; + FMPY f69 = f69, f16 + ;; + FNMA f70 = f69, f17, f70 + ;; + FNMA f71 = f69, f18, f71 + ;; + FMPY f70 = f70, f19 + ;; + FNMA f71 = f70, f20, f71 + ;; + FMPY f71 = f71, f21 + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f68, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f69, SIZE + ;; + STFD [BOFFSET] = f66, SIZE + STFD [BOFFSET2] = f70, SIZE + ;; + STFD [BOFFSET] = f67, -3 * SIZE + STFD [BOFFSET2] = f71, -3 * SIZE + ;; + adds C9 = 4 * SIZE, C1 + ;; +#endif + +#ifdef RN + LDFD f32 = [BOFFSET] + ;; + FMPY f64 = f64, f32 + FMPY f68 = f68, f32 + FMPY 
f65 = f65, f32 + FMPY f69 = f69, f32 + FMPY f66 = f66, f32 + FMPY f70 = f70, f32 + FMPY f67 = f67, f32 + FMPY f71 = f71, f32 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f68, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f69, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f70, SIZE + ;; + STFD [AOFFSET] = f67, -3 * SIZE + STFD [AOFFSET2] = f71, -3 * SIZE + ;; +#endif + +#ifdef RT + LDFD f32 = [BOFFSET] + ;; + FMPY f64 = f64, f32 + FMPY f68 = f68, f32 + FMPY f65 = f65, f32 + FMPY f69 = f69, f32 + FMPY f66 = f66, f32 + FMPY f70 = f70, f32 + FMPY f67 = f67, f32 + FMPY f71 = f71, f32 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f68, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f69, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f70, SIZE + ;; + STFD [AOFFSET] = f67, -3 * SIZE + STFD [AOFFSET2] = f71, -3 * SIZE + ;; +#endif + adds C9 = 4 * SIZE, C1 + ;; + + { .mmf + STFD [C1 ] = f64, SIZE + STFD [C9 ] = f68, SIZE + mov f64 = f0 + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + STFD [C9 ] = f69, SIZE + } + ;; + { .mmi + STFD [C1 ] = f66, SIZE + STFD [C9 ] = f70, SIZE + } + ;; + { .mmi +#ifndef LN + STFD [C1 ] = f67, 5 * SIZE +#else + STFD [C1 ] = f67, - 3 * SIZE +#endif + STFD [C9 ] = f71 + } + ;; + { .mmf + cmp.ne p6, p0 = 1, I + } + ;; + adds I = -1, I + ;; + { .mmi + shladd r2 = K, BASE_SHIFT, r0 + } + ;; + { .mmi + sub L = K, KK + } + ;; + { .mmi +#ifdef RT + shladd AORIG = r2, 3, AORIG +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd AOFFSET = L, 3, AOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + add BOFFSET = L, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#ifdef LT + adds KK = 8, KK +#elif defined LN + adds KK = -8, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + + mov f64 = f0 + mov f65 = f0 + mov f66 = f0 + mov f67 = f0 + mov f68 = f0 + mov f69 = f0 + mov f70 = f0 + mov f71 = f0 + + (p6) br.cond.dptk .L132 + .align 8 + +.L140: + tbit.z p6, p7 = M, 2 + (p6) br.cond.dptk .L150 + ;; + + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 2 + BASE_SHIFT + } + ;; +#if defined(LT) || defined(RN) + { .mmf + (p7) LDFD f48 = [BOFFSET], 1 * SIZE + mov f65 = f0 + } + ;; +#else + { .mfi + shladd BOFFSET = KK, BASE_SHIFT, B +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFD f48 = [BOFFSET], 1 * SIZE + shladd AOFFSET = r3, 2, AORIG + } + ;; +#endif + { .mfi + adds L = 1, L + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + tbit.z p12, p0 = L, 0 + } + { .mfi + shr L = L, 1 + } + ;; + { .mfi + adds L = -1, L + } + ;; + { .mfi + cmp.eq p6, p0 = -1, L + } + ;; + { .mmf + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + } + { .mfi + mov ar.lc = L + } + ;; + { .mmf + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + } + { .mfb + (p6) br.cond.dpnt .L148 + } + ;; + +.L142: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f65 = f33, f48, f65 // A2 * B1 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 
// A3 * B1 + (p5) adds C9 = 2 * SIZE, C1 + } + { .mmf + nop __LINE__ + (p3) LDFD f56 = [BOFFSET], 1 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + } + ;; + { .mfi + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + (p5) adds C10 = 2 * SIZE, C2 + } + { .mfb + nop __LINE__ + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mmf + (p4) LDFD f48 = [BOFFSET], 1 * SIZE + nop __LINE__ + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + } + ;; + { .mfi + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + nop __LINE__ + adds L = -1, L + } + { .mfb + nop __LINE__ + nop.f 0 + br.cloop.sptk.few .L142 + } + ;; + +.L148: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -4, KK +#else + adds r2 = -1, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 2, AORIG + add BOFFSET = r2, B + ;; +#endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET] + adds BOFFSET = -2 * SIZE, BOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f66 = f34, f66 + FSUB f67 = f35, f67 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET] + adds AOFFSET = -2 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f66 = f34, f66 + FSUB f67 = f35, f67 + ;; +#endif + +#ifdef LN + adds AOFFSET = 14 * SIZE, AOFFSET + ;; + LDFPD f33, f32 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f35, f34 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFD f36 = [AOFFSET], - 2 * SIZE + ;; + LDFPD f38, f37 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f40, f39 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFD f41 = [AOFFSET] + ;; + FMPY f67 = f67, f32 + ;; + FNMA f66 = f67, f33, f66 + ;; + FNMA f65 = f67, f34, f65 + ;; + FNMA f64 = f67, f35, f64 + ;; + FMPY f66 = f66, f36 + ;; + FNMA f65 = f66, f37, f65 + ;; + FNMA f64 = f66, f38, f64 + ;; + FMPY f65 = f65, f39 + ;; + FNMA f64 = f65, f40, f64 + ;; + FMPY f64 = f64, f41 + ;; + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + ;; + STFD [BOFFSET] = f66, SIZE + ;; + STFD [BOFFSET] = f67, -3 * SIZE + ;; + adds C1 = -4 * SIZE, C1 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f36 = [AOFFSET], 1 * SIZE + ;; + LDFPD f37, f38 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f39, f40 = [AOFFSET] + adds AOFFSET = 5 * SIZE, AOFFSET + ;; + LDFD f41 = [AOFFSET], -15 * SIZE + ;; + FMPY f64 = f64, f32 + ;; + FNMA f65 = f64, f33, f65 + ;; + FNMA f66 = f64, f34, f66 + ;; + FNMA f67 = f64, f35, f67 + ;; + FMPY f65 = f65, f36 + ;; + FNMA f66 = f65, f37, f66 + ;; + FNMA f67 = f65, f38, f67 + ;; + FMPY f66 = f66, f39 + ;; + FNMA f67 = f66, f40, f67 + ;; + FMPY f67 = f67, f41 + ;; + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + ;; + STFD [BOFFSET] = f66, SIZE + ;; + STFD [BOFFSET] = f67, -3 * SIZE + ;; +#endif + +#ifdef RN + LDFD f32 = [BOFFSET] + ;; + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + FMPY f66 = f66, f32 + FMPY f67 = f67, f32 + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + ;; + STFD [AOFFSET] = f67, -3 * SIZE + ;; +#endif + +#ifdef RT + LDFD f32 = [BOFFSET] + ;; + FMPY f64 = f64, f32 + FMPY f65 = f65, 
f32 + FMPY f66 = f66, f32 + FMPY f67 = f67, f32 + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + ;; + STFD [AOFFSET] = f67, - 3 * SIZE + ;; +#endif + { .mmf + STFD [C1 ] = f64, SIZE + mov f64 = f0 + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + } + ;; + { .mmi + STFD [C1 ] = f66, SIZE + } + ;; + { .mmi +#ifndef LN + STFD [C1 ] = f67, SIZE +#else + STFD [C1 ] = f67, - 3 * SIZE +#endif + } + ;; + { .mmf + mov f72 = f0 + } + ;; + mov f65 = f0 + mov f73 = f0 + mov f66 = f0 + mov f74 = f0 + mov f67 = f0 + mov f75 = f0 + ;; + shladd r2 = K, BASE_SHIFT, r0 + ;; + { .mmi + sub L = K, KK + } + ;; + { .mmi +#ifdef RT + shladd AORIG = r2, 2, AORIG +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd AOFFSET = L, 2, AOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + add BOFFSET = L, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#ifdef LT + adds KK = 4, KK +#elif defined LN + adds KK = -4, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + .align 8 + +.L150: + tbit.z p6, p7 = M, 1 + (p6) br.cond.dptk .L160 + ;; + + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 1 + BASE_SHIFT + } + ;; +#if defined(LT) || defined(RN) + { .mmf + (p7) LDFD f48 = [BOFFSET], 1 * SIZE + } + ;; +#else + { .mfi + shladd BOFFSET = KK, BASE_SHIFT, B +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFD f48 = [BOFFSET], 1 * SIZE + shladd AOFFSET = r3, 1, AORIG + } + ;; +#endif + { .mfi + adds L = 1, L + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + tbit.z p12, p0 = L, 0 + } + { .mfi + shr L = L, 1 + } + ;; + { .mmf + adds L = -1, L + } + ;; + { .mmf + cmp.eq p6, p0 = -1, L + } + ;; + { .mib + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov ar.lc = L + (p6) br.cond.dpnt .L158 + } + ;; + +.L152: + { .mfi + cmp.ne p4, p5 = 0, L + FMA f64 = f32, f48, f64 // A1 * B1 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mmf + (p3) LDFD f56 = [BOFFSET], 1 * SIZE + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + } + ;; + { .mfi + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + adds L = -1, L + } + ;; + { .mfb + (p4) LDFD f48 = [BOFFSET], 1 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + br.cloop.sptk.few .L152 + } + ;; + +.L158: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -2, KK +#else + adds r2 = -1, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 1, AORIG + add BOFFSET = r2, B + ;; +#endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET] + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + ;; +#else + LDFPD f32, f33 = [AOFFSET] + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + ;; +#endif + +#ifdef LN + adds AOFFSET = 2 * SIZE, AOFFSET + ;; + LDFPD f33, f32 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFD f34 = [AOFFSET] + ;; + FMPY f65 = f65, f32 + ;; + FNMA f64 = f65, f33, f64 + ;; + FMPY f64 = f64, f34 + ;; + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f65, - SIZE + ;; + adds C1 
= -2 * SIZE, C1 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f34 = [AOFFSET], - 3 * SIZE + ;; + FMPY f64 = f64, f32 + ;; + FNMA f65 = f64, f33, f65 + ;; + FMPY f65 = f65, f34 + ;; + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f65, -SIZE + ;; +#endif + +#ifdef RN + LDFD f32 = [BOFFSET] + ;; + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f65, - SIZE + ;; +#endif + +#ifdef RT + LDFD f32 = [BOFFSET] + ;; + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f65, - SIZE + ;; +#endif + STFD [C1 ] = f64, SIZE + ;; +#ifndef LN + STFD [C1 ] = f65, SIZE +#else + STFD [C1 ] = f65, -SIZE +#endif + ;; + mov f64 = f0 + mov f65 = f0 + ;; + shladd r2 = K, BASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + shladd AORIG = r2, 1, AORIG +#else + nop __LINE__ +#endif + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd AOFFSET = L, 1, AOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + add BOFFSET = L, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#ifdef LT + adds KK = 2, KK +#elif defined LN + adds KK = -2, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + .align 8 + +.L160: + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + tbit.z p6, p7 = M, 0 + (p6) br.cond.dptk .L169 + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 0 + BASE_SHIFT + } + ;; +#if defined(LT) || defined(RN) + { .mmi + (p7) LDFD f48 = [BOFFSET], 1 * SIZE + nop __LINE__ + adds L = 1, L + } + ;; +#else + { .mmi + shladd BOFFSET = KK, BASE_SHIFT, B + nop __LINE__ +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mmi + (p7) LDFD f48 = [BOFFSET], 1 * SIZE + adds L = 1, L + add AOFFSET = r3, AORIG + } + ;; +#endif + ;; + { .mii + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + cmp.eq p6, p0 = 0, L + adds L = -1, L + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mib + (p7) LDFD f32 = [AOFFSET], 1 * SIZE + mov ar.lc = L + (p6) br.cond.dpnt .L168 + } + ;; + .align 8 + +.L162: + { .mmf + cmp.ne p4, p5 = 0, L + (p12) cmp.ne p3, p0 = 0, L + FMA f64 = f32, f48, f64 // A1 * B1 + } + ;; + { .mmi + (p3) LDFD f56 = [BOFFSET], 1 * SIZE + (p3) LDFD f40 = [AOFFSET], 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p4) LDFD f32 = [AOFFSET], 1 * SIZE + nop __LINE__ + adds L = -1, L + } + { .mfb + (p4) LDFD f48 = [BOFFSET], 1 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + br.cloop.sptk.few .L162 + } + ;; + .align 8 + +.L168: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -1, KK +#else + adds r2 = -1, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + add AOFFSET = r2, AORIG + add BOFFSET = r2, B + ;; +#endif + +#if defined(LN) || defined(LT) + { .mmi + LDFD f32 = [BOFFSET] + LDFD f33 = [AOFFSET] +#ifdef LN + adds C1 = -1 * SIZE, C1 +#else + nop __LINE__ +#endif + } + ;; +#else + { .mmi + LDFD f32 = [AOFFSET] + LDFD f33 = [BOFFSET] + nop __LINE__ + } + ;; +#endif + + { .mmf + sub L = K, KK +#ifdef RT + shladd AORIG = K, BASE_SHIFT, AORIG +#else + nop __LINE__ +#endif + FSUB f64 = f32, f64 + } + ;; +#ifdef LT + adds KK = 1, KK +#elif defined LN + adds KK = -1, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK 
+#else + sub L = K, KK +#endif + ;; + FMPY f64 = f64, f33 + ;; +#if defined(LN) || defined(LT) + { .mmf + STFD [BOFFSET] = f64 +#ifndef LN + STFD [C1 ] = f64, SIZE +#else + STFD [C1 ] = f64 +#endif + mov f64 = f0 + } + ;; +#else + { .mmf + STFD [AOFFSET] = f64 + STFD [C1 ] = f64, SIZE + mov f64 = f0 + } + ;; +#endif + +#if defined(LT) || defined(RN) + shladd AOFFSET = L, BASE_SHIFT, AOFFSET +#else + nop __LINE__ +#endif +#if defined(LT) || defined(RN) + shladd BOFFSET = L, BASE_SHIFT, BOFFSET +#else + nop __LINE__ +#endif + ;; + .align 8 + +.L169: + { .mii +#ifdef LN + shladd B = K, BASE_SHIFT, B +#elif defined(LT) || defined(RN) + mov B = BOFFSET +#else + nop __LINE__ +#endif + +#ifdef RN + adds KK = 1, KK +#elif defined RT + adds KK = -1, KK +#else + nop __LINE__ +#endif + mov AOFFSET = A + } + ;; + .align 16 + +.L090: + tbit.z p6, p0 = N, 1 + (p6) br.cond.dpnt .L050 + ;; + +#ifdef RT + { .mmi + shladd r3 = LDC, 1, r0 + nop __LINE__ + shl r2 = K, 1 + BASE_SHIFT + } + ;; + { .mmi + sub B = B, r2 + sub C = C, r3 + nop __LINE__ + } +#endif + ;; + mov f64 = f0 + mov f65 = f0 + mov f66 = f0 + mov f67 = f0 + + mov f72 = f0 + mov f73 = f0 + mov f74 = f0 + mov f75 = f0 + ;; + { .mfi + shr I = M, 3 + } + { .mfi + mov C1 = C // coffset1 = c + 0 * ldc +#ifdef LN + add KK = M, OFFSET +#elif defined LT + mov KK = OFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmf + cmp.eq p6, p7 = 0, I +#if defined(LN) || defined(RT) + mov AORIG = A +#else + mov AOFFSET = A +#endif + } + { .mmf + add C2 = LDC, C // coffset2 = c + 1 * ldc + } + ;; + { .mfi +#ifndef RT + shladd C = LDC, 1, C // coffset += 8 * ldc +#else + nop __LINE__ +#endif + mov f81 = f0 +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + }{ .mfb + (p6) br.cond.dpnt .L100 + } + ;; + .align 16 + +.L092: + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 3 + BASE_SHIFT + } + { .mmi + shladd r3 = KK, BASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mmi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 1, B +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + shladd AOFFSET = r3, 3, AORIG + } + ;; +#endif + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + { .mmf + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + } + ;; + { .mmf + (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + } + { .mfi + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mmf + (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + } + { .mfi + adds PREC = CPREFETCHSIZE * SIZE, C1 + } + ;; + { .mmf + CPREFETCH [PREC], LDC + } + { .mfi + adds L = 1, L + } + ;; + { .mmf + CPREFETCH [PREC] + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + } + ;; + { .mfi + adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET + } + ;; + { .mfi + tbit.z p12, p0 = L, 0 + } + { .mfi + shr L = L, 1 + } + ;; + { .mfi + adds L = -1, L + } + ;; + { .mfi + mov ar.lc = L + } + ;; + mov f68 = f0 + mov f69 = f0 + mov f70 = f0 + mov f71 = f0 + mov f76 = f0 + mov f77 = f0 + mov f78 = f0 + mov f79 = f0 + ;; + { .mfb + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L098 + } + ;; + .align 8 + +.L093: +/* 1 */ + { .mfi + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 4 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + adds C9 = 4 * SIZE, C1 + } 
+ { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + adds C10 = 4 * SIZE, C2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + adds C11 = 4 * SIZE, C3 + } + { .mfi + nop __LINE__ + FMA f74 = f34, f49, f74 // A3 * B2 + adds C12 = 4 * SIZE, C4 + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f76 = f36, f49, f76 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f77 = f37, f49, f77 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f78 = f38, f49, f78 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f79 = f39, f49, f79 // A8 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f76 = f44, f57, f76 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f77 = f45, f57, f77 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f78 = f46, f57, f78 // A7 * B2 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f79 = f47, f57, f79 // A8 * B2 + br.cloop.sptk.few .L093 + } + ;; + .align 8 + +.L098: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -8, KK +#else + adds r2 = -2, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 3, AORIG + shladd BOFFSET = r2, 1, B + ;; +#endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [BOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [BOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [BOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [BOFFSET] + adds BOFFSET = -14 * SIZE, BOFFSET + ;; + FSUB f64 = f32, 
f64 + FSUB f72 = f33, f72 + FSUB f65 = f34, f65 + FSUB f73 = f35, f73 + + FSUB f66 = f36, f66 + FSUB f74 = f37, f74 + FSUB f67 = f38, f67 + FSUB f75 = f39, f75 + + FSUB f68 = f40, f68 + FSUB f76 = f41, f76 + FSUB f69 = f42, f69 + FSUB f77 = f43, f77 + + FSUB f70 = f44, f70 + FSUB f78 = f45, f78 + FSUB f71 = f46, f71 + FSUB f79 = f47, f79 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [AOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [AOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [AOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [AOFFSET] + adds AOFFSET = -14 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f66 = f34, f66 + FSUB f67 = f35, f67 + FSUB f68 = f36, f68 + FSUB f69 = f37, f69 + FSUB f70 = f38, f70 + FSUB f71 = f39, f71 + ;; + FSUB f72 = f40, f72 + FSUB f73 = f41, f73 + FSUB f74 = f42, f74 + FSUB f75 = f43, f75 + FSUB f76 = f44, f76 + FSUB f77 = f45, f77 + FSUB f78 = f46, f78 + FSUB f79 = f47, f79 + ;; +#endif + +#ifdef LN + adds AOFFSET = 62 * SIZE, AOFFSET + ;; + LDFPD f33, f32 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f35, f34 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f37, f36 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f39, f38 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFD f40 = [AOFFSET], -2 * SIZE + ;; + LDFPD f42, f41 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f44, f43 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f46, f45 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f48, f47 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f50, f49 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f52, f51 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFD f53 = [AOFFSET], -2 * SIZE + ;; + LDFPD f55, f54 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f57, f56 = [AOFFSET] + adds AOFFSET = - 6 * SIZE, AOFFSET + ;; + LDFPD f59, f58 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f61, f60 = [AOFFSET] + adds AOFFSET = - 6 * SIZE, AOFFSET + ;; + LDFD f16 = [AOFFSET], -2 * SIZE + ;; + LDFPD f18, f17 = [AOFFSET] + adds AOFFSET = - 8 * SIZE, AOFFSET + ;; + LDFPD f20, f19 = [AOFFSET] + adds AOFFSET = - 8 * SIZE, AOFFSET + ;; + LDFD f21 = [AOFFSET] + ;; + FMPY f71 = f71, f32 + FMPY f79 = f79, f32 + ;; + FNMA f70 = f71, f33, f70 + FNMA f78 = f79, f33, f78 + ;; + FNMA f69 = f71, f34, f69 + FNMA f77 = f79, f34, f77 + ;; + FNMA f68 = f71, f35, f68 + FNMA f76 = f79, f35, f76 + ;; + FNMA f67 = f71, f36, f67 + FNMA f75 = f79, f36, f75 + ;; + FNMA f66 = f71, f37, f66 + FNMA f74 = f79, f37, f74 + ;; + FNMA f65 = f71, f38, f65 + FNMA f73 = f79, f38, f73 + ;; + FNMA f64 = f71, f39, f64 + FNMA f72 = f79, f39, f72 + ;; + FMPY f70 = f70, f40 + FMPY f78 = f78, f40 + ;; + FNMA f69 = f70, f41, f69 + FNMA f77 = f78, f41, f77 + ;; + FNMA f68 = f70, f42, f68 + FNMA f76 = f78, f42, f76 + ;; + FNMA f67 = f70, f43, f67 + FNMA f75 = f78, f43, f75 + ;; + FNMA f66 = f70, f44, f66 + FNMA f74 = f78, f44, f74 + ;; + FNMA f65 = f70, f45, f65 + FNMA f73 = f78, f45, f73 + ;; + FNMA f64 = f70, f46, f64 + FNMA f72 = f78, f46, f72 + ;; + FMPY f69 = f69, f47 + FMPY f77 = f77, f47 + ;; + FNMA f68 = f69, f48, f68 + FNMA f76 = f77, f48, f76 + ;; + FNMA f67 = f69, f49, f67 + FNMA f75 = f77, f49, f75 + ;; + FNMA f66 = f69, f50, f66 + FNMA f74 = f77, f50, f74 + ;; + FNMA f65 = f69, f51, f65 
+ FNMA f73 = f77, f51, f73 + ;; + FNMA f64 = f69, f52, f64 + FNMA f72 = f77, f52, f72 + ;; + FMPY f68 = f68, f53 + FMPY f76 = f76, f53 + ;; + FNMA f67 = f68, f54, f67 + FNMA f75 = f76, f54, f75 + ;; + FNMA f66 = f68, f55, f66 + FNMA f74 = f76, f55, f74 + ;; + FNMA f65 = f68, f56, f65 + FNMA f73 = f76, f56, f73 + ;; + FNMA f64 = f68, f57, f64 + FNMA f72 = f76, f57, f72 + ;; + FMPY f67 = f67, f58 + FMPY f75 = f75, f58 + ;; + FNMA f66 = f67, f59, f66 + FNMA f74 = f75, f59, f74 + ;; + FNMA f65 = f67, f60, f65 + FNMA f73 = f75, f60, f73 + ;; + FNMA f64 = f67, f61, f64 + FNMA f72 = f75, f61, f72 + ;; + FMPY f66 = f66, f16 + FMPY f74 = f74, f16 + ;; + FNMA f65 = f66, f17, f65 + FNMA f73 = f74, f17, f73 + ;; + FNMA f64 = f66, f18, f64 + FNMA f72 = f74, f18, f72 + ;; + FMPY f65 = f65, f19 + FMPY f73 = f73, f19 + ;; + FNMA f64 = f65, f20, f64 + FNMA f72 = f73, f20, f72 + ;; + FMPY f64 = f64, f21 + FMPY f72 = f72, f21 + ;; + + adds BOFFSET = 8 * SIZE, BOFFSET + adds BOFFSET2 = 8 * SIZE, BOFFSET2 + ;; + STFD [BOFFSET] = f68, SIZE + STFD [BOFFSET2] = f70, SIZE + ;; + STFD [BOFFSET] = f76, SIZE + STFD [BOFFSET2] = f78, SIZE + ;; + STFD [BOFFSET] = f69, SIZE + STFD [BOFFSET2] = f71, SIZE + ;; + STFD [BOFFSET] = f77, - 11 * SIZE + STFD [BOFFSET2] = f79, - 11 * SIZE + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f66, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f74, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f67, SIZE + ;; + STFD [BOFFSET] = f73, - 3 * SIZE + STFD [BOFFSET2] = f75, - 3 * SIZE + ;; + adds C1 = -8 * SIZE, C1 + adds C2 = -8 * SIZE, C2 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f40 = [AOFFSET], 1 * SIZE + ;; + LDFPD f41, f42 = [AOFFSET], 2 * SIZE + ;; + LDFPD f43, f44 = [AOFFSET], 2 * SIZE + ;; + LDFPD f45, f46 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f47, f48 = [AOFFSET], 2 * SIZE + ;; + LDFPD f49, f50 = [AOFFSET], 2 * SIZE + ;; + LDFPD f51, f52 = [AOFFSET] + adds AOFFSET = 5 * SIZE, AOFFSET + ;; + LDFD f53 = [AOFFSET], 1 * SIZE + ;; + LDFPD f54, f55 = [AOFFSET], 2 * SIZE + ;; + LDFPD f56, f57 = [AOFFSET] + adds AOFFSET = 6 * SIZE, AOFFSET + ;; + LDFPD f58, f59 = [AOFFSET], 2 * SIZE + ;; + LDFPD f60, f61 = [AOFFSET] + adds AOFFSET = 7 * SIZE, AOFFSET + ;; + LDFD f16 = [AOFFSET], 1 * SIZE + ;; + LDFPD f17, f18 = [AOFFSET] + adds AOFFSET = 8 * SIZE, AOFFSET + ;; + LDFPD f19, f20 = [AOFFSET] + adds AOFFSET = 9 * SIZE, AOFFSET + ;; + LDFD f21 = [AOFFSET] + adds AOFFSET = -63 * SIZE, AOFFSET + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + ;; + FNMA f65 = f64, f33, f65 + FNMA f73 = f72, f33, f73 + ;; + FNMA f66 = f64, f34, f66 + FNMA f74 = f72, f34, f74 + ;; + FNMA f67 = f64, f35, f67 + FNMA f75 = f72, f35, f75 + ;; + FNMA f68 = f64, f36, f68 + FNMA f76 = f72, f36, f76 + ;; + FNMA f69 = f64, f37, f69 + FNMA f77 = f72, f37, f77 + ;; + FNMA f70 = f64, f38, f70 + FNMA f78 = f72, f38, f78 + ;; + FNMA f71 = f64, f39, f71 + FNMA f79 = f72, f39, f79 + ;; + FMPY f65 = f65, f40 + FMPY f73 = f73, f40 + ;; + FNMA f66 = f65, f41, f66 + FNMA f74 = f73, f41, f74 + ;; + FNMA f67 = f65, f42, f67 + FNMA f75 = f73, f42, f75 + ;; + FNMA f68 = f65, f43, f68 + FNMA f76 = f73, f43, f76 + ;; + FNMA f69 = f65, f44, f69 + FNMA f77 = f73, f44, f77 + ;; + FNMA f70 = f65, f45, f70 + FNMA f78 = f73, f45, f78 + ;; + FNMA f71 = f65, f46, f71 + FNMA f79 = f73, f46, f79 + ;; + FMPY f66 
= f66, f47 + FMPY f74 = f74, f47 + ;; + FNMA f67 = f66, f48, f67 + FNMA f75 = f74, f48, f75 + ;; + FNMA f68 = f66, f49, f68 + FNMA f76 = f74, f49, f76 + ;; + FNMA f69 = f66, f50, f69 + FNMA f77 = f74, f50, f77 + ;; + FNMA f70 = f66, f51, f70 + FNMA f78 = f74, f51, f78 + ;; + FNMA f71 = f66, f52, f71 + FNMA f79 = f74, f52, f79 + ;; + FMPY f67 = f67, f53 + FMPY f75 = f75, f53 + ;; + FNMA f68 = f67, f54, f68 + FNMA f76 = f75, f54, f76 + ;; + FNMA f69 = f67, f55, f69 + FNMA f77 = f75, f55, f77 + ;; + FNMA f70 = f67, f56, f70 + FNMA f78 = f75, f56, f78 + ;; + FNMA f71 = f67, f57, f71 + FNMA f79 = f75, f57, f79 + ;; + FMPY f68 = f68, f58 + FMPY f76 = f76, f58 + ;; + FNMA f69 = f68, f59, f69 + FNMA f77 = f76, f59, f77 + ;; + FNMA f70 = f68, f60, f70 + FNMA f78 = f76, f60, f78 + ;; + FNMA f71 = f68, f61, f71 + FNMA f79 = f76, f61, f79 + ;; + FMPY f69 = f69, f16 + FMPY f77 = f77, f16 + ;; + FNMA f70 = f69, f17, f70 + FNMA f78 = f77, f17, f78 + ;; + FNMA f71 = f69, f18, f71 + FNMA f79 = f77, f18, f79 + ;; + FMPY f70 = f70, f19 + FMPY f78 = f78, f19 + ;; + FNMA f71 = f70, f20, f71 + FNMA f79 = f78, f20, f79 + ;; + FMPY f71 = f71, f21 + FMPY f79 = f79, f21 + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f66, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f74, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f67, SIZE + ;; + STFD [BOFFSET] = f73, 5 * SIZE + STFD [BOFFSET2] = f75, 5 * SIZE + ;; + STFD [BOFFSET] = f68, SIZE + STFD [BOFFSET2] = f70, SIZE + ;; + STFD [BOFFSET] = f76, SIZE + STFD [BOFFSET2] = f78, SIZE + ;; + STFD [BOFFSET] = f69, SIZE + STFD [BOFFSET2] = f71, SIZE + ;; + STFD [BOFFSET] = f77, -11 * SIZE + STFD [BOFFSET2] = f79, -11 * SIZE + ;; + adds C9 = 4 * SIZE, C1 + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f34 = [BOFFSET], -3 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f68 = f68, f32 + FMPY f65 = f65, f32 + FMPY f69 = f69, f32 + FMPY f66 = f66, f32 + FMPY f70 = f70, f32 + FMPY f67 = f67, f32 + FMPY f71 = f71, f32 + ;; + FNMA f72 = f64, f33, f72 + FNMA f76 = f68, f33, f76 + FNMA f73 = f65, f33, f73 + FNMA f77 = f69, f33, f77 + FNMA f74 = f66, f33, f74 + FNMA f78 = f70, f33, f78 + FNMA f75 = f67, f33, f75 + FNMA f79 = f71, f33, f79 + ;; + FMPY f72 = f72, f34 + FMPY f76 = f76, f34 + FMPY f73 = f73, f34 + FMPY f77 = f77, f34 + FMPY f74 = f74, f34 + FMPY f78 = f78, f34 + FMPY f75 = f75, f34 + FMPY f79 = f79, f34 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f68, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f69, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f70, SIZE + ;; + STFD [AOFFSET] = f67, 5 * SIZE + STFD [AOFFSET2] = f71, 5 * SIZE + ;; + STFD [AOFFSET] = f72, SIZE + STFD [AOFFSET2] = f76, SIZE + ;; + STFD [AOFFSET] = f73, SIZE + STFD [AOFFSET2] = f77, SIZE + ;; + STFD [AOFFSET] = f74, SIZE + STFD [AOFFSET2] = f78, SIZE + ;; + STFD [AOFFSET] = f75, -11 * SIZE + STFD [AOFFSET2] = f79, -11 * SIZE + ;; +#endif + +#ifdef RT + adds BOFFSET = 2 * SIZE, BOFFSET + ;; + LDFPD f33, f32 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFD f34 = [BOFFSET] + ;; + + FMPY f72 = f72, f32 + FMPY f76 = f76, f32 + FMPY f73 = f73, f32 + FMPY f77 = f77, f32 + FMPY f74 = f74, f32 + FMPY f78 = f78, f32 + FMPY f75 = f75, f32 + FMPY f79 = f79, f32 + ;; + FNMA f64 = f72, f33, f64 + FNMA f68 = f76, f33, f68 + FNMA f65 = f73, f33, f65 + FNMA f69 = f77, f33, f69 + FNMA f66 = f74, f33, f66 + FNMA f70 = f78, f33, f70 + FNMA f67 = f75, f33, f67 + FNMA f71 = f79, f33, f71 + ;; + FMPY f64 = 
f64, f34 + FMPY f68 = f68, f34 + FMPY f65 = f65, f34 + FMPY f69 = f69, f34 + FMPY f66 = f66, f34 + FMPY f70 = f70, f34 + FMPY f67 = f67, f34 + FMPY f71 = f71, f34 + ;; + adds AOFFSET = 8 * SIZE, AOFFSET + adds AOFFSET2 = 8 * SIZE, AOFFSET2 + ;; + STFD [AOFFSET] = f72, SIZE + STFD [AOFFSET2] = f76, SIZE + ;; + STFD [AOFFSET] = f73, SIZE + STFD [AOFFSET2] = f77, SIZE + ;; + STFD [AOFFSET] = f74, SIZE + STFD [AOFFSET2] = f78, SIZE + ;; + STFD [AOFFSET] = f75, - 11 * SIZE + STFD [AOFFSET2] = f79, - 11 * SIZE + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f68, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f69, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f70, SIZE + ;; + STFD [AOFFSET] = f67, - 3 * SIZE + STFD [AOFFSET2] = f71, - 3 * SIZE + ;; + +#endif + adds C9 = 4 * SIZE, C1 + ;; + + { .mmf + STFD [C1 ] = f64, SIZE + STFD [C9 ] = f68, SIZE + mov f64 = f0 + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + STFD [C9 ] = f69, SIZE + adds C10 = 4 * SIZE, C2 + } + ;; + { .mmi + STFD [C1 ] = f66, SIZE + STFD [C9 ] = f70, SIZE + } + ;; + { .mmi +#ifndef LN + STFD [C1 ] = f67, 5 * SIZE +#else + STFD [C1 ] = f67, - 3 * SIZE +#endif + STFD [C9 ] = f71 + adds C11 = 4 * SIZE, C3 + } + ;; + { .mmf + STFD [C2 ] = f72, SIZE + STFD [C10] = f76, SIZE + mov f72 = f0 + } + ;; + { .mmi + STFD [C2 ] = f73, SIZE + STFD [C10] = f77, SIZE + } + ;; + { .mmi + STFD [C2 ] = f74, SIZE + STFD [C10] = f78, SIZE + adds C12 = 4 * SIZE, C4 + } + ;; + { .mmi +#ifndef LN + STFD [C2 ] = f75, 5 * SIZE +#else + STFD [C2 ] = f75, - 3 * SIZE +#endif + STFD [C10] = f79 + } + ;; + { .mmf + cmp.ne p6, p0 = 1, I + } + ;; + adds I = -1, I + ;; + { .mmi + shladd r2 = K, BASE_SHIFT, r0 + } + ;; + { .mmi + sub L = K, KK + } + ;; + { .mmi +#ifdef RT + shladd AORIG = r2, 3, AORIG +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd AOFFSET = L, 3, AOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd BOFFSET = L, 1, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#ifdef LT + adds KK = 8, KK +#elif defined LN + adds KK = -8, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + + mov f64 = f0 + mov f65 = f0 + mov f66 = f0 + mov f67 = f0 + mov f72 = f0 + mov f73 = f0 + mov f74 = f0 + mov f75 = f0 + + (p6) br.cond.dptk .L092 + ;; + .align 8 + +.L100: + tbit.z p6, p7 = M, 2 + (p6) br.cond.dptk .L110 + ;; + + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 2 + BASE_SHIFT + } + { .mmi + shladd r3 = KK, BASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mmf + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f65 = f0 + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 1, B +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + shladd AOFFSET = r3, 2, AORIG + } + ;; +#endif + { .mfi + adds L = 1, L + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + tbit.z p12, p0 = L, 0 + } + { .mfi + shr L = L, 1 + } + ;; + { .mfi + adds L = -1, L + } + ;; + { .mfi + cmp.eq p6, p0 = -1, L + } + ;; + { .mmf + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + } + { .mfi + mov ar.lc = 
L + } + ;; + { .mmf + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + } + { .mfb + (p6) br.cond.dpnt .L108 + } + ;; + +.L102: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 4 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + adds C9 = 2 * SIZE, C1 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + adds C10 = 2 * SIZE, C2 + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + br.cloop.sptk.few .L102 + } + ;; + .align 8 + +.L108: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -4, KK +#else + adds r2 = -2, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 2, AORIG + shladd BOFFSET = r2, 1, B + ;; +#endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET] + adds BOFFSET = -6 * SIZE, BOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + ;; + FSUB f65 = f34, f65 + FSUB f73 = f35, f73 + ;; + FSUB f66 = f36, f66 + FSUB f74 = f37, f74 + ;; + FSUB f67 = f38, f67 + FSUB f75 = f39, f75 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET] + adds AOFFSET = -6 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f66 = f34, f66 + FSUB f67 = f35, f67 + + FSUB f72 = f36, f72 + FSUB f73 = f37, f73 + FSUB f74 = f38, f74 + FSUB f75 = f39, f75 + ;; +#endif + +#ifdef LN + adds AOFFSET = 14 * SIZE, AOFFSET + ;; + LDFPD f33, f32 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f35, f34 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFD f36 = [AOFFSET], - 2 * SIZE + ;; + LDFPD f38, f37 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f40, f39 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFD f41 = [AOFFSET] + ;; + FMPY f67 = f67, f32 + FMPY f75 = f75, f32 + ;; + FNMA f66 = f67, f33, f66 + FNMA f74 = f75, f33, f74 + ;; + FNMA f65 = f67, f34, f65 + FNMA f73 = f75, f34, f73 + ;; + FNMA f64 = f67, f35, f64 + FNMA f72 = f75, f35, f72 + ;; + 
FMPY f66 = f66, f36 + FMPY f74 = f74, f36 + ;; + FNMA f65 = f66, f37, f65 + FNMA f73 = f74, f37, f73 + ;; + FNMA f64 = f66, f38, f64 + FNMA f72 = f74, f38, f72 + ;; + FMPY f65 = f65, f39 + FMPY f73 = f73, f39 + ;; + FNMA f64 = f65, f40, f64 + FNMA f72 = f73, f40, f72 + ;; + FMPY f64 = f64, f41 + FMPY f72 = f72, f41 + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f66, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f74, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f67, SIZE + ;; + STFD [BOFFSET] = f73, -3 * SIZE + STFD [BOFFSET2] = f75, -3 * SIZE + ;; + adds C1 = -4 * SIZE, C1 + adds C2 = -4 * SIZE, C2 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f36 = [AOFFSET], 1 * SIZE + ;; + LDFPD f37, f38 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f39, f40 = [AOFFSET] + adds AOFFSET = 5 * SIZE, AOFFSET + ;; + LDFD f41 = [AOFFSET], -15 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + ;; + FNMA f65 = f64, f33, f65 + FNMA f73 = f72, f33, f73 + ;; + FNMA f66 = f64, f34, f66 + FNMA f74 = f72, f34, f74 + ;; + FNMA f67 = f64, f35, f67 + FNMA f75 = f72, f35, f75 + ;; + FMPY f65 = f65, f36 + FMPY f73 = f73, f36 + ;; + FNMA f66 = f65, f37, f66 + FNMA f74 = f73, f37, f74 + ;; + FNMA f67 = f65, f38, f67 + FNMA f75 = f73, f38, f75 + ;; + FMPY f66 = f66, f39 + FMPY f74 = f74, f39 + ;; + FNMA f67 = f66, f40, f67 + FNMA f75 = f74, f40, f75 + ;; + FMPY f67 = f67, f41 + FMPY f75 = f75, f41 + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f66, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f74, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f67, SIZE + ;; + STFD [BOFFSET] = f73, -3 * SIZE + STFD [BOFFSET2] = f75, -3 * SIZE + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f34 = [BOFFSET], -3 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + FMPY f66 = f66, f32 + FMPY f67 = f67, f32 + ;; + FNMA f72 = f64, f33, f72 + FNMA f73 = f65, f33, f73 + FNMA f74 = f66, f33, f74 + FNMA f75 = f67, f33, f75 + ;; + FMPY f72 = f72, f34 + FMPY f73 = f73, f34 + FMPY f74 = f74, f34 + FMPY f75 = f75, f34 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f72, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f73, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f74, SIZE + ;; + STFD [AOFFSET] = f67, -3 * SIZE + STFD [AOFFSET2] = f75, -3 * SIZE + ;; +#endif + +#ifdef RT + adds BOFFSET = 2 * SIZE, BOFFSET + ;; + LDFPD f33, f32 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFD f34 = [BOFFSET] + ;; + FMPY f72 = f72, f32 + FMPY f73 = f73, f32 + FMPY f74 = f74, f32 + FMPY f75 = f75, f32 + ;; + FNMA f64 = f72, f33, f64 + FNMA f65 = f73, f33, f65 + FNMA f66 = f74, f33, f66 + FNMA f67 = f75, f33, f67 + ;; + FMPY f64 = f64, f34 + FMPY f65 = f65, f34 + FMPY f66 = f66, f34 + FMPY f67 = f67, f34 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f72, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f73, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f74, SIZE + ;; + STFD [AOFFSET] = f67, - 3 * SIZE + STFD [AOFFSET2] = f75, - 3 * SIZE + ;; +#endif + { .mmf + STFD [C1 ] = f64, SIZE + mov f64 = f0 + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + } + ;; + { .mmi + STFD [C1 ] = f66, SIZE + } + ;; + { .mmi +#ifndef LN + STFD [C1 ] = f67, SIZE +#else + STFD [C1 ] = f67, - 3 * SIZE +#endif + } + ;; + { .mmf + STFD [C2 ] = f72, SIZE + mov f72 = f0 + } + ;; + { .mmi + 
STFD [C2 ] = f73, SIZE + } + ;; + { .mmi + STFD [C2 ] = f74, SIZE + } + ;; + { .mmi +#ifndef LN + STFD [C2 ] = f75, SIZE +#else + STFD [C2 ] = f75, - 3 * SIZE +#endif + } + ;; + mov f65 = f0 + mov f73 = f0 + mov f66 = f0 + mov f74 = f0 + mov f67 = f0 + mov f75 = f0 + ;; + shladd r2 = K, BASE_SHIFT, r0 + ;; + { .mmi + sub L = K, KK + } + ;; + { .mmi +#ifdef RT + shladd AORIG = r2, 2, AORIG +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd AOFFSET = L, 2, AOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd BOFFSET = L, 1, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#ifdef LT + adds KK = 4, KK +#elif defined LN + adds KK = -4, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + .align 8 + +.L110: + tbit.z p6, p7 = M, 1 + (p6) br.cond.dptk .L120 + ;; + + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 1 + BASE_SHIFT + } + { .mmi + shladd r3 = KK, BASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mmf + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 1, B +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + shladd AOFFSET = r3, 1, AORIG + } + ;; +#endif + { .mfi + adds L = 1, L + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + tbit.z p12, p0 = L, 0 + } + { .mfi + shr L = L, 1 + } + ;; + { .mmf + adds L = -1, L + } + ;; + { .mmf + cmp.eq p6, p0 = -1, L + } + ;; + { .mib + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov ar.lc = L + (p6) br.cond.dpnt .L118 + } + ;; + +.L112: + { .mfi + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + lfetch.nt1 [PREB], 4 * SIZE + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mmf + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + br.cloop.sptk.few .L112 + } + ;; + .align 8 + +.L118: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -2, KK +#else + adds r2 = -2, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 1, AORIG + shladd BOFFSET = r2, 1, B + ;; +#endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET] + adds BOFFSET = -2 * SIZE, BOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + FSUB f65 = f34, f65 + FSUB f73 = f35, f73 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET] + adds AOFFSET = -2 * SIZE, AOFFSET + ;; + 
FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f72 = f34, f72 + FSUB f73 = f35, f73 + ;; +#endif + +#ifdef LN + adds AOFFSET = 2 * SIZE, AOFFSET + ;; + LDFPD f33, f32 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFD f34 = [AOFFSET] + ;; + FMPY f65 = f65, f32 + FMPY f73 = f73, f32 + ;; + FNMA f64 = f65, f33, f64 + FNMA f72 = f73, f33, f72 + ;; + FMPY f64 = f64, f34 + FMPY f72 = f72, f34 + ;; + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + ;; + STFD [BOFFSET] = f73, - 3 * SIZE + ;; + adds C1 = -2 * SIZE, C1 + adds C2 = -2 * SIZE, C2 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f34 = [AOFFSET], - 3 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + ;; + FNMA f65 = f64, f33, f65 + FNMA f73 = f72, f33, f73 + ;; + FMPY f65 = f65, f34 + FMPY f73 = f73, f34 + ;; + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + ;; + STFD [BOFFSET] = f73, -3 * SIZE + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f34 = [BOFFSET], -3 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + ;; + FNMA f72 = f64, f33, f72 + FNMA f73 = f65, f33, f73 + ;; + FMPY f72 = f72, f34 + FMPY f73 = f73, f34 + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + ;; + STFD [AOFFSET] = f72, SIZE + ;; + STFD [AOFFSET] = f73, -3 * SIZE + ;; +#endif + +#ifdef RT + adds BOFFSET = 2 * SIZE, BOFFSET + ;; + LDFPD f33, f32 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFD f34 = [BOFFSET] + ;; + FMPY f72 = f72, f32 + FMPY f73 = f73, f32 + ;; + FNMA f64 = f72, f33, f64 + FNMA f65 = f73, f33, f65 + ;; + FMPY f64 = f64, f34 + FMPY f65 = f65, f34 + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + ;; + STFD [AOFFSET] = f72, SIZE + ;; + STFD [AOFFSET] = f73, -3 * SIZE + ;; +#endif + STFD [C1 ] = f64, SIZE + mov f64 = f0 + ;; +#ifndef LN + STFD [C1 ] = f65, SIZE +#else + STFD [C1 ] = f65, -SIZE +#endif + ;; + STFD [C2 ] = f72, SIZE + mov f72 = f0 + ;; +#ifndef LN + STFD [C2 ] = f73, SIZE +#else + STFD [C2 ] = f73, -SIZE +#endif + ;; + mov f65 = f0 + mov f73 = f0 + ;; + shladd r2 = K, BASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + shladd AORIG = r2, 1, AORIG +#else + nop __LINE__ +#endif + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd AOFFSET = L, 1, AOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd BOFFSET = L, 1, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#ifdef LT + adds KK = 2, KK +#elif defined LN + adds KK = -2, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + .align 8 + +.L120: + tbit.z p6, p7 = M, 0 + (p6) br.cond.dptk .L129 + ;; + + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 0 + BASE_SHIFT + } + { .mmi + shladd r3 = KK, BASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mmf + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 1, B +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + add AOFFSET = r3, AORIG + } + ;; +#endif + 
{ .mmi + adds L = 1, L + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mii + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + adds L = -1, L + } + ;; + { .mmi + cmp.eq p6, p0 = -1, L + } + ;; + { .mib + (p7) LDFD f32 = [AOFFSET], 1 * SIZE + mov ar.lc = L + (p6) br.cond.dpnt .L128 + } + ;; + .align 8 + +.L122: + { .mfi + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mmi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + (p3) LDFD f40 = [AOFFSET], 1 * SIZE + nop __LINE__ + } + { .mmi + nop __LINE__ + nop __LINE__ + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + adds L = -1, L + } + { .mfb + (p4) LDFD f32 = [AOFFSET], 1 * SIZE + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + br.cloop.sptk.few .L122 + } + ;; + +.L128: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -1, KK +#else + adds r2 = -2, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + add AOFFSET = r2, AORIG + shladd BOFFSET = r2, 1, B + ;; +#endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET] + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + ;; +#else + LDFPD f32, f33 = [AOFFSET] + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + ;; +#endif + +#ifdef LN + LDFD f32 = [AOFFSET] + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + ;; + { .mmi + STFD [BOFFSET] = f64, SIZE + adds C1 = -1 * SIZE, C1 + } + ;; + { .mmi + STFD [BOFFSET] = f72, -SIZE + adds C2 = -1 * SIZE, C2 + } + ;; +#endif + +#ifdef LT + LDFD f32 = [AOFFSET] + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + ;; + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f72, -SIZE + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f34 = [BOFFSET], -3 * SIZE + ;; + FMPY f64 = f64, f32 + ;; + FNMA f72 = f64, f33, f72 + ;; + FMPY f72 = f72, f34 + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f72, -SIZE + ;; +#endif + +#ifdef RT + adds BOFFSET = 2 * SIZE, BOFFSET + ;; + LDFPD f33, f32 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFD f34 = [BOFFSET] + ;; + FMPY f72 = f72, f32 + ;; + FNMA f64 = f72, f33, f64 + ;; + FMPY f64 = f64, f34 + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f72, -SIZE + ;; +#endif + +#ifndef LN + STFD [C1 ] = f64, SIZE +#else + STFD [C1 ] = f64 +#endif +#ifndef LN + STFD [C2 ] = f72, SIZE +#else + STFD [C2 ] = f72 +#endif + + mov f64 = f0 + mov f72 = f0 + ;; + shladd r2 = K, BASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + add AORIG = r2, AORIG +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + add AOFFSET = L, AOFFSET +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + shladd BOFFSET = L, 1, BOFFSET +#else + nop __LINE__ +#endif + ;; +#ifdef LT + adds KK = 1, KK +#elif defined LN + adds KK = -1, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + .align 8 + +.L129: +#ifdef LN + shladd KK8 = K, BASE_SHIFT, r0 + ;; + shladd B = KK8, 1, B +#endif + +#if defined(LT) || defined(RN) + mov B = BOFFSET +#endif + +#ifdef RN + adds KK = 2, KK +#endif + +#ifdef RT + adds KK = -2, KK +#endif + ;; + mov AOFFSET = A + ;; + .align 16 + +.L050: + { .mib + setf.d 
f64 = r0 + tbit.z p6, p0 = N, 2 + (p6) br.cond.dpnt .L000 + } + ;; + +#ifdef RT + { .mmi + shladd r3 = LDC, 2, r0 + nop __LINE__ + shl r2 = K, 2 + BASE_SHIFT + } + ;; + { .mmi + sub B = B, r2 + sub C = C, r3 + nop __LINE__ + } +#endif + ;; + { .mfi + setf.d f72 = r0 + mov f80 = f0 + shr I = M, 3 + } + { .mfi + mov C1 = C // coffset1 = c + 0 * ldc + mov f88 = f0 +#ifdef LN + add KK = M, OFFSET +#elif defined LT + mov KK = OFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmf + cmp.eq p6, p7 = 0, I +#if defined(LN) || defined(RT) + mov AORIG = A +#else + mov AOFFSET = A +#endif + mov f65 = f0 + } + { .mmf + add C2 = LDC, C // coffset2 = c + 1 * ldc + shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc + mov f73 = f0 + } + ;; + { .mfi +#ifndef RT + shladd C = LDC, 2, C // coffset += 8 * ldc +#else + nop __LINE__ +#endif + mov f81 = f0 +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + }{ .mfb + shladd C4 = LDC, 1, C2 + mov f89 = f0 + (p6) br.cond.dpnt .L060 + } + ;; + .align 16 + +.L052: + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 3 + BASE_SHIFT + } + { .mmi + shladd r3 = KK, BASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mmi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 2, B +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + shladd AOFFSET = r3, 3, AORIG + } + ;; +#endif + { .mfi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f66 = f0 + nop __LINE__ + } + { .mfi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f74 = f0 + nop __LINE__ + } + ;; + { .mmf + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + setf.d f82 = r0 + mov f90 = f0 + } + ;; + { .mmf + (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + setf.d f67 = r0 + mov f75 = f0 + } + { .mfi + setf.d f83 = r0 + mov f91 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mmf + (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + } + { .mfi + adds PREC = CPREFETCHSIZE * SIZE, C1 + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f68 = r0 + mov f76 = f0 + } + { .mfi + setf.d f84 = r0 + mov f92 = f0 + adds L = 1, L + } + ;; + { .mmf + CPREFETCH [PREC], LDC + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f69 = r0 + mov f77 = f0 + } + { .mfi + setf.d f85 = r0 + mov f93 = f0 + adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET + } + ;; + { .mmf + CPREFETCH [PREC] + } + ;; + { .mfi + setf.d f70 = r0 + mov f78 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + setf.d f86 = r0 + mov f94 = f0 + shr L = L, 1 + } + ;; + { .mfi + setf.d f71 = r0 + adds L = -1, L + } + ;; + { .mfi + setf.d f87 = r0 + mov f79 = f0 + mov ar.lc = L + } + { .mfb + cmp.eq p6, p0 = -1, L + mov f95 = f0 + (p6) br.cond.dpnt .L058 + } + ;; + .align 8 + +.L053: + { .mfb + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + adds C9 = 4 * SIZE, C1 + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + adds C10 = 4 * SIZE, C2 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + adds C11 = 4 * SIZE, C3 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 
2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + adds C12 = 4 * SIZE, C4 + } + { .mfb + nop __LINE__ + FMA f89 = f33, f51, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f90 = f34, f51, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f91 = f35, f51, f91 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f76 = f36, f49, f76 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f84 = f36, f50, f84 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f92 = f36, f51, f92 // A5 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f77 = f37, f49, f77 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f85 = f37, f50, f85 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f93 = f37, f51, f93 // A6 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f78 = f38, f49, f78 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f86 = f38, f50, f86 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f94 = f38, f51, f94 // A7 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f79 = f39, f49, f79 // A8 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f87 = f39, f50, f87 // A8 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f95 = f39, f51, f95 // A8 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f90 = f42, f59, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop 
__LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f91 = f43, f59, f91 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f76 = f44, f57, f76 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f84 = f44, f58, f84 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f92 = f44, f59, f92 // A5 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f77 = f45, f57, f77 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f85 = f45, f58, f85 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f93 = f45, f59, f93 // A6 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f78 = f46, f57, f78 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f86 = f46, f58, f86 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f94 = f46, f59, f94 // A7 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f79 = f47, f57, f79 // A8 * B2 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f87 = f47, f58, f87 // A8 * B3 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f95 = f47, f59, f95 // A8 * B4 + br.cloop.sptk.few .L053 + } + ;; + .align 8 + +.L058: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -8, KK +#else + adds r2 = -4, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 3, AORIG + shladd BOFFSET = r2, 2, B + ;; +#endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [BOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [BOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [BOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [BOFFSET], 2 * SIZE + ;; + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + ;; + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + ;; + LDFPD f52, f53 = [BOFFSET], 2 * SIZE + ;; + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + ;; + LDFPD f56, f57 = [BOFFSET], 2 * SIZE + ;; + LDFPD f58, f59 = [BOFFSET], 2 * SIZE + ;; + LDFPD f60, f61 = [BOFFSET], 2 * SIZE + ;; + LDFPD f62, f63 = [BOFFSET] + adds BOFFSET = -30 * SIZE, BOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + FSUB f80 = f34, f80 + FSUB f88 = f35, f88 + + FSUB f65 = f36, f65 + FSUB f73 = f37, f73 + FSUB f81 = f38, f81 + FSUB f89 = f39, f89 + + FSUB f66 = f40, f66 + FSUB f74 = f41, f74 + FSUB f82 = f42, f82 + FSUB f90 = f43, f90 + + FSUB f67 = f44, f67 + FSUB f75 = f45, f75 + FSUB f83 = f46, f83 + FSUB f91 = f47, f91 + + FSUB f68 = f48, f68 + FSUB f76 = f49, f76 + FSUB f84 = f50, f84 + FSUB f92 = f51, f92 + + FSUB f69 = f52, f69 + FSUB f77 = f53, f77 + FSUB f85 = f54, f85 + FSUB f93 = f55, f93 + + FSUB f70 = f56, f70 + FSUB f78 = f57, f78 + FSUB f86 = f58, f86 + FSUB f94 = f59, f94 + + FSUB f71 = f60, f71 + FSUB f79 = f61, f79 + FSUB f87 = f62, f87 + FSUB f95 = f63, f95 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, 
f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [AOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [AOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [AOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [AOFFSET], 2 * SIZE + ;; + LDFPD f48, f49 = [AOFFSET], 2 * SIZE + ;; + LDFPD f50, f51 = [AOFFSET], 2 * SIZE + ;; + LDFPD f52, f53 = [AOFFSET], 2 * SIZE + ;; + LDFPD f54, f55 = [AOFFSET], 2 * SIZE + ;; + LDFPD f56, f57 = [AOFFSET], 2 * SIZE + ;; + LDFPD f58, f59 = [AOFFSET], 2 * SIZE + ;; + LDFPD f60, f61 = [AOFFSET], 2 * SIZE + ;; + LDFPD f62, f63 = [AOFFSET] + adds AOFFSET = -30 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f66 = f34, f66 + FSUB f67 = f35, f67 + FSUB f68 = f36, f68 + FSUB f69 = f37, f69 + FSUB f70 = f38, f70 + FSUB f71 = f39, f71 + ;; + FSUB f72 = f40, f72 + FSUB f73 = f41, f73 + FSUB f74 = f42, f74 + FSUB f75 = f43, f75 + FSUB f76 = f44, f76 + FSUB f77 = f45, f77 + FSUB f78 = f46, f78 + FSUB f79 = f47, f79 + ;; + FSUB f80 = f48, f80 + FSUB f81 = f49, f81 + FSUB f82 = f50, f82 + FSUB f83 = f51, f83 + FSUB f84 = f52, f84 + FSUB f85 = f53, f85 + FSUB f86 = f54, f86 + FSUB f87 = f55, f87 + + FSUB f88 = f56, f88 + FSUB f89 = f57, f89 + FSUB f90 = f58, f90 + FSUB f91 = f59, f91 + FSUB f92 = f60, f92 + FSUB f93 = f61, f93 + FSUB f94 = f62, f94 + FSUB f95 = f63, f95 + ;; +#endif + +#ifdef LN + adds AOFFSET = 62 * SIZE, AOFFSET + ;; + LDFPD f33, f32 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f35, f34 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f37, f36 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f39, f38 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFD f40 = [AOFFSET], -2 * SIZE + ;; + LDFPD f42, f41 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f44, f43 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f46, f45 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f48, f47 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f50, f49 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f52, f51 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFD f53 = [AOFFSET], -2 * SIZE + ;; + LDFPD f55, f54 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f57, f56 = [AOFFSET] + adds AOFFSET = - 6 * SIZE, AOFFSET + ;; + LDFPD f59, f58 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f61, f60 = [AOFFSET] + adds AOFFSET = - 6 * SIZE, AOFFSET + ;; + LDFD f16 = [AOFFSET], -2 * SIZE + ;; + LDFPD f18, f17 = [AOFFSET] + adds AOFFSET = - 8 * SIZE, AOFFSET + ;; + LDFPD f20, f19 = [AOFFSET] + adds AOFFSET = - 8 * SIZE, AOFFSET + ;; + LDFD f21 = [AOFFSET] + ;; + FMPY f71 = f71, f32 + FMPY f79 = f79, f32 + FMPY f87 = f87, f32 + FMPY f95 = f95, f32 + ;; + FNMA f70 = f71, f33, f70 + FNMA f78 = f79, f33, f78 + FNMA f86 = f87, f33, f86 + FNMA f94 = f95, f33, f94 + ;; + FNMA f69 = f71, f34, f69 + FNMA f77 = f79, f34, f77 + FNMA f85 = f87, f34, f85 + FNMA f93 = f95, f34, f93 + ;; + FNMA f68 = f71, f35, f68 + FNMA f76 = f79, f35, f76 + FNMA f84 = f87, f35, f84 + FNMA f92 = f95, f35, f92 + ;; + FNMA f67 = f71, f36, f67 + FNMA f75 = f79, f36, f75 + FNMA f83 = f87, f36, f83 + FNMA f91 = f95, f36, f91 + ;; + FNMA f66 = f71, f37, f66 + FNMA f74 = f79, f37, f74 + FNMA f82 = f87, f37, f82 + FNMA f90 = f95, f37, f90 + ;; + FNMA f65 = f71, f38, f65 + FNMA f73 = f79, f38, f73 + FNMA f81 = f87, f38, f81 + FNMA f89 = f95, f38, f89 + ;; + FNMA f64 = f71, f39, f64 + FNMA f72 = f79, 
f39, f72 + FNMA f80 = f87, f39, f80 + FNMA f88 = f95, f39, f88 + ;; + FMPY f70 = f70, f40 + FMPY f78 = f78, f40 + FMPY f86 = f86, f40 + FMPY f94 = f94, f40 + ;; + FNMA f69 = f70, f41, f69 + FNMA f77 = f78, f41, f77 + FNMA f85 = f86, f41, f85 + FNMA f93 = f94, f41, f93 + ;; + FNMA f68 = f70, f42, f68 + FNMA f76 = f78, f42, f76 + FNMA f84 = f86, f42, f84 + FNMA f92 = f94, f42, f92 + ;; + FNMA f67 = f70, f43, f67 + FNMA f75 = f78, f43, f75 + FNMA f83 = f86, f43, f83 + FNMA f91 = f94, f43, f91 + ;; + FNMA f66 = f70, f44, f66 + FNMA f74 = f78, f44, f74 + FNMA f82 = f86, f44, f82 + FNMA f90 = f94, f44, f90 + ;; + FNMA f65 = f70, f45, f65 + FNMA f73 = f78, f45, f73 + FNMA f81 = f86, f45, f81 + FNMA f89 = f94, f45, f89 + ;; + FNMA f64 = f70, f46, f64 + FNMA f72 = f78, f46, f72 + FNMA f80 = f86, f46, f80 + FNMA f88 = f94, f46, f88 + ;; + FMPY f69 = f69, f47 + FMPY f77 = f77, f47 + FMPY f85 = f85, f47 + FMPY f93 = f93, f47 + ;; + FNMA f68 = f69, f48, f68 + FNMA f76 = f77, f48, f76 + FNMA f84 = f85, f48, f84 + FNMA f92 = f93, f48, f92 + ;; + FNMA f67 = f69, f49, f67 + FNMA f75 = f77, f49, f75 + FNMA f83 = f85, f49, f83 + FNMA f91 = f93, f49, f91 + ;; + FNMA f66 = f69, f50, f66 + FNMA f74 = f77, f50, f74 + FNMA f82 = f85, f50, f82 + FNMA f90 = f93, f50, f90 + ;; + FNMA f65 = f69, f51, f65 + FNMA f73 = f77, f51, f73 + FNMA f81 = f85, f51, f81 + FNMA f89 = f93, f51, f89 + ;; + FNMA f64 = f69, f52, f64 + FNMA f72 = f77, f52, f72 + FNMA f80 = f85, f52, f80 + FNMA f88 = f93, f52, f88 + ;; + FMPY f68 = f68, f53 + FMPY f76 = f76, f53 + FMPY f84 = f84, f53 + FMPY f92 = f92, f53 + ;; + FNMA f67 = f68, f54, f67 + FNMA f75 = f76, f54, f75 + FNMA f83 = f84, f54, f83 + FNMA f91 = f92, f54, f91 + ;; + FNMA f66 = f68, f55, f66 + FNMA f74 = f76, f55, f74 + FNMA f82 = f84, f55, f82 + FNMA f90 = f92, f55, f90 + ;; + FNMA f65 = f68, f56, f65 + FNMA f73 = f76, f56, f73 + FNMA f81 = f84, f56, f81 + FNMA f89 = f92, f56, f89 + ;; + FNMA f64 = f68, f57, f64 + FNMA f72 = f76, f57, f72 + FNMA f80 = f84, f57, f80 + FNMA f88 = f92, f57, f88 + ;; + FMPY f67 = f67, f58 + FMPY f75 = f75, f58 + FMPY f83 = f83, f58 + FMPY f91 = f91, f58 + ;; + FNMA f66 = f67, f59, f66 + FNMA f74 = f75, f59, f74 + FNMA f82 = f83, f59, f82 + FNMA f90 = f91, f59, f90 + ;; + FNMA f65 = f67, f60, f65 + FNMA f73 = f75, f60, f73 + FNMA f81 = f83, f60, f81 + FNMA f89 = f91, f60, f89 + ;; + FNMA f64 = f67, f61, f64 + FNMA f72 = f75, f61, f72 + FNMA f80 = f83, f61, f80 + FNMA f88 = f91, f61, f88 + ;; + FMPY f66 = f66, f16 + FMPY f74 = f74, f16 + FMPY f82 = f82, f16 + FMPY f90 = f90, f16 + ;; + FNMA f65 = f66, f17, f65 + FNMA f73 = f74, f17, f73 + FNMA f81 = f82, f17, f81 + FNMA f89 = f90, f17, f89 + ;; + FNMA f64 = f66, f18, f64 + FNMA f72 = f74, f18, f72 + FNMA f80 = f82, f18, f80 + FNMA f88 = f90, f18, f88 + ;; + FMPY f65 = f65, f19 + FMPY f73 = f73, f19 + FMPY f81 = f81, f19 + FMPY f89 = f89, f19 + ;; + FNMA f64 = f65, f20, f64 + FNMA f72 = f73, f20, f72 + FNMA f80 = f81, f20, f80 + FNMA f88 = f89, f20, f88 + ;; + FMPY f64 = f64, f21 + FMPY f72 = f72, f21 + FMPY f80 = f80, f21 + FMPY f88 = f88, f21 + ;; + + adds BOFFSET = 24 * SIZE, BOFFSET + adds BOFFSET2 = 24 * SIZE, BOFFSET2 + ;; + STFD [BOFFSET] = f70, SIZE + STFD [BOFFSET2] = f71, SIZE + ;; + STFD [BOFFSET] = f78, SIZE + STFD [BOFFSET2] = f79, SIZE + ;; + STFD [BOFFSET] = f86, SIZE + STFD [BOFFSET2] = f87, SIZE + ;; + STFD [BOFFSET] = f94, - 11 * SIZE + STFD [BOFFSET2] = f95, - 11 * SIZE + ;; + STFD [BOFFSET] = f68, SIZE + STFD [BOFFSET2] = f69, SIZE + ;; + STFD [BOFFSET] = f76, SIZE + STFD [BOFFSET2] 
= f77, SIZE + ;; + STFD [BOFFSET] = f84, SIZE + STFD [BOFFSET2] = f85, SIZE + ;; + STFD [BOFFSET] = f92, - 11 * SIZE + STFD [BOFFSET2] = f93, - 11 * SIZE + ;; + STFD [BOFFSET] = f66, SIZE + STFD [BOFFSET2] = f67, SIZE + ;; + STFD [BOFFSET] = f74, SIZE + STFD [BOFFSET2] = f75, SIZE + ;; + STFD [BOFFSET] = f82, SIZE + STFD [BOFFSET2] = f83, SIZE + ;; + STFD [BOFFSET] = f90, - 11 * SIZE + STFD [BOFFSET2] = f91, - 11 * SIZE + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f65, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f73, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f81, SIZE + ;; + STFD [BOFFSET] = f88, - 3 * SIZE + STFD [BOFFSET2] = f89, - 3 * SIZE + ;; + adds C1 = -8 * SIZE, C1 + adds C2 = -8 * SIZE, C2 + adds C3 = -8 * SIZE, C3 + adds C4 = -8 * SIZE, C4 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f40 = [AOFFSET], 1 * SIZE + ;; + LDFPD f41, f42 = [AOFFSET], 2 * SIZE + ;; + LDFPD f43, f44 = [AOFFSET], 2 * SIZE + ;; + LDFPD f45, f46 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f47, f48 = [AOFFSET], 2 * SIZE + ;; + LDFPD f49, f50 = [AOFFSET], 2 * SIZE + ;; + LDFPD f51, f52 = [AOFFSET] + adds AOFFSET = 5 * SIZE, AOFFSET + ;; + LDFD f53 = [AOFFSET], 1 * SIZE + ;; + LDFPD f54, f55 = [AOFFSET], 2 * SIZE + ;; + LDFPD f56, f57 = [AOFFSET] + adds AOFFSET = 6 * SIZE, AOFFSET + ;; + LDFPD f58, f59 = [AOFFSET], 2 * SIZE + ;; + LDFPD f60, f61 = [AOFFSET] + adds AOFFSET = 7 * SIZE, AOFFSET + ;; + LDFD f16 = [AOFFSET], 1 * SIZE + ;; + LDFPD f17, f18 = [AOFFSET] + adds AOFFSET = 8 * SIZE, AOFFSET + ;; + LDFPD f19, f20 = [AOFFSET] + adds AOFFSET = 9 * SIZE, AOFFSET + ;; + LDFD f21 = [AOFFSET] + adds AOFFSET = -63 * SIZE, AOFFSET + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + FMPY f80 = f80, f32 + FMPY f88 = f88, f32 + ;; + FNMA f65 = f64, f33, f65 + FNMA f73 = f72, f33, f73 + FNMA f81 = f80, f33, f81 + FNMA f89 = f88, f33, f89 + ;; + FNMA f66 = f64, f34, f66 + FNMA f74 = f72, f34, f74 + FNMA f82 = f80, f34, f82 + FNMA f90 = f88, f34, f90 + ;; + FNMA f67 = f64, f35, f67 + FNMA f75 = f72, f35, f75 + FNMA f83 = f80, f35, f83 + FNMA f91 = f88, f35, f91 + ;; + FNMA f68 = f64, f36, f68 + FNMA f76 = f72, f36, f76 + FNMA f84 = f80, f36, f84 + FNMA f92 = f88, f36, f92 + ;; + FNMA f69 = f64, f37, f69 + FNMA f77 = f72, f37, f77 + FNMA f85 = f80, f37, f85 + FNMA f93 = f88, f37, f93 + ;; + FNMA f70 = f64, f38, f70 + FNMA f78 = f72, f38, f78 + FNMA f86 = f80, f38, f86 + FNMA f94 = f88, f38, f94 + ;; + FNMA f71 = f64, f39, f71 + FNMA f79 = f72, f39, f79 + FNMA f87 = f80, f39, f87 + FNMA f95 = f88, f39, f95 + ;; + FMPY f65 = f65, f40 + FMPY f73 = f73, f40 + FMPY f81 = f81, f40 + FMPY f89 = f89, f40 + ;; + FNMA f66 = f65, f41, f66 + FNMA f74 = f73, f41, f74 + FNMA f82 = f81, f41, f82 + FNMA f90 = f89, f41, f90 + ;; + FNMA f67 = f65, f42, f67 + FNMA f75 = f73, f42, f75 + FNMA f83 = f81, f42, f83 + FNMA f91 = f89, f42, f91 + ;; + FNMA f68 = f65, f43, f68 + FNMA f76 = f73, f43, f76 + FNMA f84 = f81, f43, f84 + FNMA f92 = f89, f43, f92 + ;; + FNMA f69 = f65, f44, f69 + FNMA f77 = f73, f44, f77 + FNMA f85 = f81, f44, f85 + FNMA f93 = f89, f44, f93 + ;; + FNMA f70 = f65, f45, f70 + FNMA f78 = f73, f45, f78 + FNMA f86 = f81, f45, f86 + FNMA f94 = f89, f45, f94 + ;; + FNMA f71 = f65, f46, f71 + FNMA f79 = f73, f46, f79 + FNMA f87 = f81, f46, f87 + FNMA f95 = f89, f46, f95 + ;; + FMPY f66 = 
f66, f47 + FMPY f74 = f74, f47 + FMPY f82 = f82, f47 + FMPY f90 = f90, f47 + ;; + FNMA f67 = f66, f48, f67 + FNMA f75 = f74, f48, f75 + FNMA f83 = f82, f48, f83 + FNMA f91 = f90, f48, f91 + ;; + FNMA f68 = f66, f49, f68 + FNMA f76 = f74, f49, f76 + FNMA f84 = f82, f49, f84 + FNMA f92 = f90, f49, f92 + ;; + FNMA f69 = f66, f50, f69 + FNMA f77 = f74, f50, f77 + FNMA f85 = f82, f50, f85 + FNMA f93 = f90, f50, f93 + ;; + FNMA f70 = f66, f51, f70 + FNMA f78 = f74, f51, f78 + FNMA f86 = f82, f51, f86 + FNMA f94 = f90, f51, f94 + ;; + FNMA f71 = f66, f52, f71 + FNMA f79 = f74, f52, f79 + FNMA f87 = f82, f52, f87 + FNMA f95 = f90, f52, f95 + ;; + FMPY f67 = f67, f53 + FMPY f75 = f75, f53 + FMPY f83 = f83, f53 + FMPY f91 = f91, f53 + ;; + FNMA f68 = f67, f54, f68 + FNMA f76 = f75, f54, f76 + FNMA f84 = f83, f54, f84 + FNMA f92 = f91, f54, f92 + ;; + FNMA f69 = f67, f55, f69 + FNMA f77 = f75, f55, f77 + FNMA f85 = f83, f55, f85 + FNMA f93 = f91, f55, f93 + ;; + FNMA f70 = f67, f56, f70 + FNMA f78 = f75, f56, f78 + FNMA f86 = f83, f56, f86 + FNMA f94 = f91, f56, f94 + ;; + FNMA f71 = f67, f57, f71 + FNMA f79 = f75, f57, f79 + FNMA f87 = f83, f57, f87 + FNMA f95 = f91, f57, f95 + ;; + FMPY f68 = f68, f58 + FMPY f76 = f76, f58 + FMPY f84 = f84, f58 + FMPY f92 = f92, f58 + ;; + FNMA f69 = f68, f59, f69 + FNMA f77 = f76, f59, f77 + FNMA f85 = f84, f59, f85 + FNMA f93 = f92, f59, f93 + ;; + FNMA f70 = f68, f60, f70 + FNMA f78 = f76, f60, f78 + FNMA f86 = f84, f60, f86 + FNMA f94 = f92, f60, f94 + ;; + FNMA f71 = f68, f61, f71 + FNMA f79 = f76, f61, f79 + FNMA f87 = f84, f61, f87 + FNMA f95 = f92, f61, f95 + ;; + FMPY f69 = f69, f16 + FMPY f77 = f77, f16 + FMPY f85 = f85, f16 + FMPY f93 = f93, f16 + ;; + FNMA f70 = f69, f17, f70 + FNMA f78 = f77, f17, f78 + FNMA f86 = f85, f17, f86 + FNMA f94 = f93, f17, f94 + ;; + FNMA f71 = f69, f18, f71 + FNMA f79 = f77, f18, f79 + FNMA f87 = f85, f18, f87 + FNMA f95 = f93, f18, f95 + ;; + FMPY f70 = f70, f19 + FMPY f78 = f78, f19 + FMPY f86 = f86, f19 + FMPY f94 = f94, f19 + ;; + FNMA f71 = f70, f20, f71 + FNMA f79 = f78, f20, f79 + FNMA f87 = f86, f20, f87 + FNMA f95 = f94, f20, f95 + ;; + FMPY f71 = f71, f21 + FMPY f79 = f79, f21 + FMPY f87 = f87, f21 + FMPY f95 = f95, f21 + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f65, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f73, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f81, SIZE + ;; + STFD [BOFFSET] = f88, 5 * SIZE + STFD [BOFFSET2] = f89, 5 * SIZE + ;; + STFD [BOFFSET] = f66, SIZE + STFD [BOFFSET2] = f67, SIZE + ;; + STFD [BOFFSET] = f74, SIZE + STFD [BOFFSET2] = f75, SIZE + ;; + STFD [BOFFSET] = f82, SIZE + STFD [BOFFSET2] = f83, SIZE + ;; + STFD [BOFFSET] = f90, 5 * SIZE + STFD [BOFFSET2] = f91, 5 * SIZE + ;; + STFD [BOFFSET] = f68, SIZE + STFD [BOFFSET2] = f69, SIZE + ;; + STFD [BOFFSET] = f76, SIZE + STFD [BOFFSET2] = f77, SIZE + ;; + STFD [BOFFSET] = f84, SIZE + STFD [BOFFSET2] = f85, SIZE + ;; + STFD [BOFFSET] = f92, 5 * SIZE + STFD [BOFFSET2] = f93, 5 * SIZE + ;; + STFD [BOFFSET] = f70, SIZE + STFD [BOFFSET2] = f71, SIZE + ;; + STFD [BOFFSET] = f78, SIZE + STFD [BOFFSET2] = f79, SIZE + ;; + STFD [BOFFSET] = f86, SIZE + STFD [BOFFSET2] = f87, SIZE + ;; + STFD [BOFFSET] = f94 + STFD [BOFFSET2] = f95 + adds C9 = 4 * SIZE, C1 + adds BOFFSET = - 27 * SIZE, BOFFSET + adds BOFFSET2 = - 27 * SIZE, BOFFSET2 + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f36 = [BOFFSET], 1 * SIZE + ;; + 
LDFPD f37, f38 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f39, f40 = [BOFFSET] + adds BOFFSET = 5 * SIZE, BOFFSET + ;; + LDFD f41 = [BOFFSET], -15 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f68 = f68, f32 + FMPY f65 = f65, f32 + FMPY f69 = f69, f32 + FMPY f66 = f66, f32 + FMPY f70 = f70, f32 + FMPY f67 = f67, f32 + FMPY f71 = f71, f32 + ;; + FNMA f72 = f64, f33, f72 + FNMA f76 = f68, f33, f76 + FNMA f73 = f65, f33, f73 + FNMA f77 = f69, f33, f77 + FNMA f74 = f66, f33, f74 + FNMA f78 = f70, f33, f78 + FNMA f75 = f67, f33, f75 + FNMA f79 = f71, f33, f79 + ;; + FNMA f80 = f64, f34, f80 + FNMA f84 = f68, f34, f84 + FNMA f81 = f65, f34, f81 + FNMA f85 = f69, f34, f85 + FNMA f82 = f66, f34, f82 + FNMA f86 = f70, f34, f86 + FNMA f83 = f67, f34, f83 + FNMA f87 = f71, f34, f87 + ;; + FNMA f88 = f64, f35, f88 + FNMA f92 = f68, f35, f92 + FNMA f89 = f65, f35, f89 + FNMA f93 = f69, f35, f93 + FNMA f90 = f66, f35, f90 + FNMA f94 = f70, f35, f94 + FNMA f91 = f67, f35, f91 + FNMA f95 = f71, f35, f95 + ;; + FMPY f72 = f72, f36 + FMPY f76 = f76, f36 + FMPY f73 = f73, f36 + FMPY f77 = f77, f36 + FMPY f74 = f74, f36 + FMPY f78 = f78, f36 + FMPY f75 = f75, f36 + FMPY f79 = f79, f36 + ;; + FNMA f80 = f72, f37, f80 + FNMA f84 = f76, f37, f84 + FNMA f81 = f73, f37, f81 + FNMA f85 = f77, f37, f85 + FNMA f82 = f74, f37, f82 + FNMA f86 = f78, f37, f86 + FNMA f83 = f75, f37, f83 + FNMA f87 = f79, f37, f87 + ;; + FNMA f88 = f72, f38, f88 + FNMA f92 = f76, f38, f92 + FNMA f89 = f73, f38, f89 + FNMA f93 = f77, f38, f93 + FNMA f90 = f74, f38, f90 + FNMA f94 = f78, f38, f94 + FNMA f91 = f75, f38, f91 + FNMA f95 = f79, f38, f95 + ;; + FMPY f80 = f80, f39 + FMPY f84 = f84, f39 + FMPY f81 = f81, f39 + FMPY f85 = f85, f39 + FMPY f82 = f82, f39 + FMPY f86 = f86, f39 + FMPY f83 = f83, f39 + FMPY f87 = f87, f39 + ;; + FNMA f88 = f80, f40, f88 + FNMA f92 = f84, f40, f92 + FNMA f89 = f81, f40, f89 + FNMA f93 = f85, f40, f93 + FNMA f90 = f82, f40, f90 + FNMA f94 = f86, f40, f94 + FNMA f91 = f83, f40, f91 + FNMA f95 = f87, f40, f95 + ;; + FMPY f88 = f88, f41 + FMPY f92 = f92, f41 + FMPY f89 = f89, f41 + FMPY f93 = f93, f41 + FMPY f90 = f90, f41 + FMPY f94 = f94, f41 + FMPY f91 = f91, f41 + FMPY f95 = f95, f41 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f68, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f69, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f70, SIZE + ;; + STFD [AOFFSET] = f67, 5 * SIZE + STFD [AOFFSET2] = f71, 5 * SIZE + ;; + STFD [AOFFSET] = f72, SIZE + STFD [AOFFSET2] = f76, SIZE + ;; + STFD [AOFFSET] = f73, SIZE + STFD [AOFFSET2] = f77, SIZE + ;; + STFD [AOFFSET] = f74, SIZE + STFD [AOFFSET2] = f78, SIZE + ;; + STFD [AOFFSET] = f75, 5 * SIZE + STFD [AOFFSET2] = f79, 5 * SIZE + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f84, SIZE + ;; + STFD [AOFFSET] = f81, SIZE + STFD [AOFFSET2] = f85, SIZE + ;; + STFD [AOFFSET] = f82, SIZE + STFD [AOFFSET2] = f86, SIZE + ;; + STFD [AOFFSET] = f83, 5 * SIZE + STFD [AOFFSET2] = f87, 5 * SIZE + ;; + STFD [AOFFSET] = f88, SIZE + STFD [AOFFSET2] = f92, SIZE + ;; + STFD [AOFFSET] = f89, SIZE + STFD [AOFFSET2] = f93, SIZE + ;; + STFD [AOFFSET] = f90, SIZE + STFD [AOFFSET2] = f94, SIZE + ;; + STFD [AOFFSET] = f91, -27 * SIZE + STFD [AOFFSET2] = f95, -27 * SIZE + ;; +#endif + +#ifdef RT + adds BOFFSET = 14 * SIZE, BOFFSET + ;; + LDFPD f33, f32 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f35, f34 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFD f36 = [BOFFSET], -2 * SIZE + ;; + LDFPD f38, f37 = 
[BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f40, f39 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFD f41 = [BOFFSET] + ;; + + FMPY f88 = f88, f32 + FMPY f92 = f92, f32 + FMPY f89 = f89, f32 + FMPY f93 = f93, f32 + FMPY f90 = f90, f32 + FMPY f94 = f94, f32 + FMPY f91 = f91, f32 + FMPY f95 = f95, f32 + ;; + FNMA f80 = f88, f33, f80 + FNMA f84 = f92, f33, f84 + FNMA f81 = f89, f33, f81 + FNMA f85 = f93, f33, f85 + FNMA f82 = f90, f33, f82 + FNMA f86 = f94, f33, f86 + FNMA f83 = f91, f33, f83 + FNMA f87 = f95, f33, f87 + ;; + FNMA f72 = f88, f34, f72 + FNMA f76 = f92, f34, f76 + FNMA f73 = f89, f34, f73 + FNMA f77 = f93, f34, f77 + FNMA f74 = f90, f34, f74 + FNMA f78 = f94, f34, f78 + FNMA f75 = f91, f34, f75 + FNMA f79 = f95, f34, f79 + ;; + FNMA f64 = f88, f35, f64 + FNMA f68 = f92, f35, f68 + FNMA f65 = f89, f35, f65 + FNMA f69 = f93, f35, f69 + FNMA f66 = f90, f35, f66 + FNMA f70 = f94, f35, f70 + FNMA f67 = f91, f35, f67 + FNMA f71 = f95, f35, f71 + ;; + FMPY f80 = f80, f36 + FMPY f84 = f84, f36 + FMPY f81 = f81, f36 + FMPY f85 = f85, f36 + FMPY f82 = f82, f36 + FMPY f86 = f86, f36 + FMPY f83 = f83, f36 + FMPY f87 = f87, f36 + ;; + FNMA f72 = f80, f37, f72 + FNMA f76 = f84, f37, f76 + FNMA f73 = f81, f37, f73 + FNMA f77 = f85, f37, f77 + FNMA f74 = f82, f37, f74 + FNMA f78 = f86, f37, f78 + FNMA f75 = f83, f37, f75 + FNMA f79 = f87, f37, f79 + ;; + FNMA f64 = f80, f38, f64 + FNMA f68 = f84, f38, f68 + FNMA f65 = f81, f38, f65 + FNMA f69 = f85, f38, f69 + FNMA f66 = f82, f38, f66 + FNMA f70 = f86, f38, f70 + FNMA f67 = f83, f38, f67 + FNMA f71 = f87, f38, f71 + ;; + FMPY f72 = f72, f39 + FMPY f76 = f76, f39 + FMPY f73 = f73, f39 + FMPY f77 = f77, f39 + FMPY f74 = f74, f39 + FMPY f78 = f78, f39 + FMPY f75 = f75, f39 + FMPY f79 = f79, f39 + ;; + FNMA f64 = f72, f40, f64 + FNMA f68 = f76, f40, f68 + FNMA f65 = f73, f40, f65 + FNMA f69 = f77, f40, f69 + FNMA f66 = f74, f40, f66 + FNMA f70 = f78, f40, f70 + FNMA f67 = f75, f40, f67 + FNMA f71 = f79, f40, f71 + ;; + FMPY f64 = f64, f41 + FMPY f68 = f68, f41 + FMPY f65 = f65, f41 + FMPY f69 = f69, f41 + FMPY f66 = f66, f41 + FMPY f70 = f70, f41 + FMPY f67 = f67, f41 + FMPY f71 = f71, f41 + ;; + adds AOFFSET = 24 * SIZE, AOFFSET + adds AOFFSET2 = 24 * SIZE, AOFFSET2 + ;; + STFD [AOFFSET] = f88, SIZE + STFD [AOFFSET2] = f92, SIZE + ;; + STFD [AOFFSET] = f89, SIZE + STFD [AOFFSET2] = f93, SIZE + ;; + STFD [AOFFSET] = f90, SIZE + STFD [AOFFSET2] = f94, SIZE + ;; + STFD [AOFFSET] = f91, - 11 * SIZE + STFD [AOFFSET2] = f95, - 11 * SIZE + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f84, SIZE + ;; + STFD [AOFFSET] = f81, SIZE + STFD [AOFFSET2] = f85, SIZE + ;; + STFD [AOFFSET] = f82, SIZE + STFD [AOFFSET2] = f86, SIZE + ;; + STFD [AOFFSET] = f83, - 11 * SIZE + STFD [AOFFSET2] = f87, - 11 * SIZE + ;; + STFD [AOFFSET] = f72, SIZE + STFD [AOFFSET2] = f76, SIZE + ;; + STFD [AOFFSET] = f73, SIZE + STFD [AOFFSET2] = f77, SIZE + ;; + STFD [AOFFSET] = f74, SIZE + STFD [AOFFSET2] = f78, SIZE + ;; + STFD [AOFFSET] = f75, - 11 * SIZE + STFD [AOFFSET2] = f79, - 11 * SIZE + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f68, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f69, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f70, SIZE + ;; + STFD [AOFFSET] = f67, - 3 * SIZE + STFD [AOFFSET2] = f71, - 3 * SIZE + ;; + +#endif + adds C9 = 4 * SIZE, C1 + ;; + + { .mmf + STFD [C1 ] = f64, SIZE + STFD [C9 ] = f68, SIZE + mov f64 = f0 + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + STFD [C9 ] = f69, SIZE + adds C10 
= 4 * SIZE, C2 + } + ;; + { .mmi + STFD [C1 ] = f66, SIZE + STFD [C9 ] = f70, SIZE + } + ;; + { .mmi +#ifndef LN + STFD [C1 ] = f67, 5 * SIZE +#else + STFD [C1 ] = f67, - 3 * SIZE +#endif + STFD [C9 ] = f71 + adds C11 = 4 * SIZE, C3 + } + ;; + { .mmf + STFD [C2 ] = f72, SIZE + STFD [C10] = f76, SIZE + mov f72 = f0 + } + ;; + { .mmi + STFD [C2 ] = f73, SIZE + STFD [C10] = f77, SIZE + } + ;; + { .mmi + STFD [C2 ] = f74, SIZE + STFD [C10] = f78, SIZE + adds C12 = 4 * SIZE, C4 + } + ;; + { .mmi +#ifndef LN + STFD [C2 ] = f75, 5 * SIZE +#else + STFD [C2 ] = f75, - 3 * SIZE +#endif + STFD [C10] = f79 + } + ;; + { .mmf + STFD [C3 ] = f80, SIZE + STFD [C11] = f84, SIZE + } + ;; + { .mmi + STFD [C3 ] = f81, SIZE + STFD [C11] = f85, SIZE + } + ;; + { .mmi + STFD [C3 ] = f82, SIZE + STFD [C11] = f86, SIZE + } + ;; + { .mmi +#ifndef LN + STFD [C3 ] = f83, 5 * SIZE +#else + STFD [C3 ] = f83, - 3 * SIZE +#endif + STFD [C11] = f87 + } + ;; + { .mmf + STFD [C4 ] = f88, SIZE + STFD [C12] = f92, SIZE + } + ;; + { .mmi + STFD [C4 ] = f89, SIZE + STFD [C12] = f93, SIZE + } + ;; + { .mmi + STFD [C4 ] = f90, SIZE + STFD [C12] = f94, SIZE + + } + ;; + { .mmi +#ifndef LN + STFD [C4 ] = f91, 5 * SIZE +#else + STFD [C4 ] = f91, - 3 * SIZE +#endif + STFD [C12] = f95 + cmp.ne p6, p0 = 1, I + } + ;; + adds I = -1, I + ;; + { .mmi + shladd r2 = K, BASE_SHIFT, r0 + } + ;; + { .mmi + sub L = K, KK + } + ;; + { .mmi +#ifdef RT + shladd AORIG = r2, 3, AORIG +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd AOFFSET = L, 3, AOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd BOFFSET = L, 2, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#ifdef LT + adds KK = 8, KK +#elif defined LN + adds KK = -8, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + mov f64 = f0 + mov f72 = f0 + mov f80 = f0 + mov f88 = f0 + mov f65 = f0 + mov f73 = f0 + mov f81 = f0 + mov f89 = f0 + + { .mmb + (p6) br.cond.dptk .L052 + } + ;; + + .align 8 + +.L060: + tbit.z p6, p7 = M, 2 + (p6) br.cond.dptk .L070 + ;; + + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 2 + BASE_SHIFT + } + { .mmi + shladd r3 = KK, BASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mmf + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f65 = f0 + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 2, B +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + shladd AOFFSET = r3, 2, AORIG + } + ;; +#endif + { .mfi + adds L = 1, L + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + } + { .mfi + shr L = L, 1 + } + ;; + { .mfi + adds L = -1, L + } + ;; + { .mfi + cmp.eq p6, p0 = -1, L + } + ;; + { .mmf + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + } + { .mfi + mov ar.lc = L + } + ;; + + mov f66 = f0 + mov f67 = f0 + mov f74 = f0 + mov f75 = f0 + mov f82 = f0 + mov f83 = f0 + mov f90 = f0 + mov f91 = f0 + ;; + { .mmf + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + } + { .mfb + (p6) br.cond.dpnt .L068 + } + ;; + .align 8 + +.L062: + { .mfi + lfetch.nt1 
[PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + (p5) adds C9 = 2 * SIZE, C1 + } + { .mfi + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + (p5) adds C10 = 2 * SIZE, C2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + (p5) adds C11 = 2 * SIZE, C3 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + (p5) adds C12 = 2 * SIZE, C4 + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f89 = f33, f51, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f90 = f34, f51, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + } + { .mfb + nop __LINE__ + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f91 = f35, f51, f91 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f90 = f42, f59, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f91 = f43, f59, f91 // A4 * B4 + br.cloop.sptk.few .L062 + } + ;; + .align 8 + +.L068: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -4, KK +#else + adds r2 = -4, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 2, AORIG + shladd BOFFSET = r2, 2, B + ;; +#endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET], 2 * SIZE + 
;; + LDFPD f40, f41 = [BOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [BOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [BOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [BOFFSET] + adds BOFFSET = -14 * SIZE, BOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + FSUB f80 = f34, f80 + FSUB f88 = f35, f88 + ;; + FSUB f65 = f36, f65 + FSUB f73 = f37, f73 + FSUB f81 = f38, f81 + FSUB f89 = f39, f89 + ;; + FSUB f66 = f40, f66 + FSUB f74 = f41, f74 + FSUB f82 = f42, f82 + FSUB f90 = f43, f90 + ;; + FSUB f67 = f44, f67 + FSUB f75 = f45, f75 + FSUB f83 = f46, f83 + FSUB f91 = f47, f91 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [AOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [AOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [AOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [AOFFSET] + adds AOFFSET = -14 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f66 = f34, f66 + FSUB f67 = f35, f67 + + FSUB f72 = f36, f72 + FSUB f73 = f37, f73 + FSUB f74 = f38, f74 + FSUB f75 = f39, f75 + + FSUB f80 = f40, f80 + FSUB f81 = f41, f81 + FSUB f82 = f42, f82 + FSUB f83 = f43, f83 + + FSUB f88 = f44, f88 + FSUB f89 = f45, f89 + FSUB f90 = f46, f90 + FSUB f91 = f47, f91 + ;; +#endif + +#ifdef LN + adds AOFFSET = 14 * SIZE, AOFFSET + ;; + LDFPD f33, f32 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f35, f34 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFD f36 = [AOFFSET], - 2 * SIZE + ;; + LDFPD f38, f37 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f40, f39 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFD f41 = [AOFFSET] + ;; + FMPY f67 = f67, f32 + FMPY f75 = f75, f32 + FMPY f83 = f83, f32 + FMPY f91 = f91, f32 + ;; + FNMA f66 = f67, f33, f66 + FNMA f74 = f75, f33, f74 + FNMA f82 = f83, f33, f82 + FNMA f90 = f91, f33, f90 + ;; + FNMA f65 = f67, f34, f65 + FNMA f73 = f75, f34, f73 + FNMA f81 = f83, f34, f81 + FNMA f89 = f91, f34, f89 + ;; + FNMA f64 = f67, f35, f64 + FNMA f72 = f75, f35, f72 + FNMA f80 = f83, f35, f80 + FNMA f88 = f91, f35, f88 + ;; + FMPY f66 = f66, f36 + FMPY f74 = f74, f36 + FMPY f82 = f82, f36 + FMPY f90 = f90, f36 + ;; + FNMA f65 = f66, f37, f65 + FNMA f73 = f74, f37, f73 + FNMA f81 = f82, f37, f81 + FNMA f89 = f90, f37, f89 + ;; + FNMA f64 = f66, f38, f64 + FNMA f72 = f74, f38, f72 + FNMA f80 = f82, f38, f80 + FNMA f88 = f90, f38, f88 + ;; + FMPY f65 = f65, f39 + FMPY f73 = f73, f39 + FMPY f81 = f81, f39 + FMPY f89 = f89, f39 + ;; + FNMA f64 = f65, f40, f64 + FNMA f72 = f73, f40, f72 + FNMA f80 = f81, f40, f80 + FNMA f88 = f89, f40, f88 + ;; + FMPY f64 = f64, f41 + FMPY f72 = f72, f41 + FMPY f80 = f80, f41 + FMPY f88 = f88, f41 + ;; + adds BOFFSET = 8 * SIZE, BOFFSET + adds BOFFSET2 = 8 * SIZE, BOFFSET2 + ;; + STFD [BOFFSET] = f66, SIZE + STFD [BOFFSET2] = f67, SIZE + ;; + STFD [BOFFSET] = f74, SIZE + STFD [BOFFSET2] = f75, SIZE + ;; + STFD [BOFFSET] = f82, SIZE + STFD [BOFFSET2] = f83, SIZE + ;; + STFD [BOFFSET] = f90, - 11 * SIZE + STFD [BOFFSET2] = f91, - 11 * SIZE + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f65, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f73, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f81, SIZE + ;; + STFD [BOFFSET] = f88, -3 * SIZE + STFD [BOFFSET2] = f89, -3 * SIZE + ;; + adds C1 = -4 * SIZE, C1 + adds C2 = -4 * SIZE, C2 + adds C3 = -4 * SIZE, C3 + adds C4 = -4 * SIZE, C4 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = 
[AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f36 = [AOFFSET], 1 * SIZE + ;; + LDFPD f37, f38 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f39, f40 = [AOFFSET] + adds AOFFSET = 5 * SIZE, AOFFSET + ;; + LDFD f41 = [AOFFSET], -15 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + FMPY f80 = f80, f32 + FMPY f88 = f88, f32 + ;; + FNMA f65 = f64, f33, f65 + FNMA f73 = f72, f33, f73 + FNMA f81 = f80, f33, f81 + FNMA f89 = f88, f33, f89 + ;; + FNMA f66 = f64, f34, f66 + FNMA f74 = f72, f34, f74 + FNMA f82 = f80, f34, f82 + FNMA f90 = f88, f34, f90 + ;; + FNMA f67 = f64, f35, f67 + FNMA f75 = f72, f35, f75 + FNMA f83 = f80, f35, f83 + FNMA f91 = f88, f35, f91 + ;; + FMPY f65 = f65, f36 + FMPY f73 = f73, f36 + FMPY f81 = f81, f36 + FMPY f89 = f89, f36 + ;; + FNMA f66 = f65, f37, f66 + FNMA f74 = f73, f37, f74 + FNMA f82 = f81, f37, f82 + FNMA f90 = f89, f37, f90 + ;; + FNMA f67 = f65, f38, f67 + FNMA f75 = f73, f38, f75 + FNMA f83 = f81, f38, f83 + FNMA f91 = f89, f38, f91 + ;; + FMPY f66 = f66, f39 + FMPY f74 = f74, f39 + FMPY f82 = f82, f39 + FMPY f90 = f90, f39 + ;; + FNMA f67 = f66, f40, f67 + FNMA f75 = f74, f40, f75 + FNMA f83 = f82, f40, f83 + FNMA f91 = f90, f40, f91 + ;; + FMPY f67 = f67, f41 + FMPY f75 = f75, f41 + FMPY f83 = f83, f41 + FMPY f91 = f91, f41 + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f65, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f73, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f81, SIZE + ;; + STFD [BOFFSET] = f88, 5 * SIZE + STFD [BOFFSET2] = f89, 5 * SIZE + ;; + STFD [BOFFSET] = f66, SIZE + STFD [BOFFSET2] = f67, SIZE + ;; + STFD [BOFFSET] = f74, SIZE + STFD [BOFFSET2] = f75, SIZE + ;; + STFD [BOFFSET] = f82, SIZE + STFD [BOFFSET2] = f83, SIZE + ;; + STFD [BOFFSET] = f90, -11 * SIZE + STFD [BOFFSET2] = f91, -11 * SIZE + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f36 = [BOFFSET], 1 * SIZE + ;; + LDFPD f37, f38 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f39, f40 = [BOFFSET] + adds BOFFSET = 5 * SIZE, BOFFSET + ;; + LDFD f41 = [BOFFSET], -15 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + FMPY f66 = f66, f32 + FMPY f67 = f67, f32 + ;; + FNMA f72 = f64, f33, f72 + FNMA f73 = f65, f33, f73 + FNMA f74 = f66, f33, f74 + FNMA f75 = f67, f33, f75 + ;; + FNMA f80 = f64, f34, f80 + FNMA f81 = f65, f34, f81 + FNMA f82 = f66, f34, f82 + FNMA f83 = f67, f34, f83 + ;; + FNMA f88 = f64, f35, f88 + FNMA f89 = f65, f35, f89 + FNMA f90 = f66, f35, f90 + FNMA f91 = f67, f35, f91 + ;; + FMPY f72 = f72, f36 + FMPY f73 = f73, f36 + FMPY f74 = f74, f36 + FMPY f75 = f75, f36 + ;; + FNMA f80 = f72, f37, f80 + FNMA f81 = f73, f37, f81 + FNMA f82 = f74, f37, f82 + FNMA f83 = f75, f37, f83 + ;; + FNMA f88 = f72, f38, f88 + FNMA f89 = f73, f38, f89 + FNMA f90 = f74, f38, f90 + FNMA f91 = f75, f38, f91 + ;; + FMPY f80 = f80, f39 + FMPY f81 = f81, f39 + FMPY f82 = f82, f39 + FMPY f83 = f83, f39 + ;; + FNMA f88 = f80, f40, f88 + FNMA f89 = f81, f40, f89 + FNMA f90 = f82, f40, f90 + FNMA f91 = f83, f40, f91 + ;; + FMPY f88 = f88, f41 + FMPY f89 = f89, f41 + FMPY f90 = f90, f41 + FMPY f91 = f91, f41 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f72, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f73, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f74, SIZE + ;; + STFD [AOFFSET] = f67, 5 * SIZE + STFD [AOFFSET2] = f75, 5 * SIZE + 
;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f88, SIZE + ;; + STFD [AOFFSET] = f81, SIZE + STFD [AOFFSET2] = f89, SIZE + ;; + STFD [AOFFSET] = f82, SIZE + STFD [AOFFSET2] = f90, SIZE + ;; + STFD [AOFFSET] = f83, -11 * SIZE + STFD [AOFFSET2] = f91, -11 * SIZE + ;; +#endif + +#ifdef RT + adds BOFFSET = 14 * SIZE, BOFFSET + ;; + LDFPD f33, f32 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f35, f34 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFD f36 = [BOFFSET], - 2 * SIZE + ;; + LDFPD f38, f37 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f40, f39 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFD f41 = [BOFFSET] + ;; + FMPY f88 = f88, f32 + FMPY f89 = f89, f32 + FMPY f90 = f90, f32 + FMPY f91 = f91, f32 + ;; + FNMA f80 = f88, f33, f80 + FNMA f81 = f89, f33, f81 + FNMA f82 = f90, f33, f82 + FNMA f83 = f91, f33, f83 + ;; + FNMA f72 = f88, f34, f72 + FNMA f73 = f89, f34, f73 + FNMA f74 = f90, f34, f74 + FNMA f75 = f91, f34, f75 + ;; + FNMA f64 = f88, f35, f64 + FNMA f65 = f89, f35, f65 + FNMA f66 = f90, f35, f66 + FNMA f67 = f91, f35, f67 + ;; + FMPY f80 = f80, f36 + FMPY f81 = f81, f36 + FMPY f82 = f82, f36 + FMPY f83 = f83, f36 + ;; + FNMA f72 = f80, f37, f72 + FNMA f73 = f81, f37, f73 + FNMA f74 = f82, f37, f74 + FNMA f75 = f83, f37, f75 + ;; + FNMA f64 = f80, f38, f64 + FNMA f65 = f81, f38, f65 + FNMA f66 = f82, f38, f66 + FNMA f67 = f83, f38, f67 + ;; + FMPY f72 = f72, f39 + FMPY f73 = f73, f39 + FMPY f74 = f74, f39 + FMPY f75 = f75, f39 + ;; + FNMA f64 = f72, f40, f64 + FNMA f65 = f73, f40, f65 + FNMA f66 = f74, f40, f66 + FNMA f67 = f75, f40, f67 + ;; + FMPY f64 = f64, f41 + FMPY f65 = f65, f41 + FMPY f66 = f66, f41 + FMPY f67 = f67, f41 + ;; + adds AOFFSET = 8 * SIZE, AOFFSET + adds AOFFSET2 = 8 * SIZE, AOFFSET2 + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f88, SIZE + ;; + STFD [AOFFSET] = f81, SIZE + STFD [AOFFSET2] = f89, SIZE + ;; + STFD [AOFFSET] = f82, SIZE + STFD [AOFFSET2] = f90, SIZE + ;; + STFD [AOFFSET] = f83, - 11 * SIZE + STFD [AOFFSET2] = f91, - 11 * SIZE + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f72, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f73, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f74, SIZE + ;; + STFD [AOFFSET] = f67, - 3 * SIZE + STFD [AOFFSET2] = f75, - 3 * SIZE + ;; +#endif + { .mmf + STFD [C1 ] = f64, SIZE + mov f64 = f0 + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + } + ;; + { .mmi + STFD [C1 ] = f66, SIZE + } + ;; + { .mmi +#ifndef LN + STFD [C1 ] = f67, SIZE +#else + STFD [C1 ] = f67, - 3 * SIZE +#endif + } + ;; + { .mmf + STFD [C2 ] = f72, SIZE + mov f72 = f0 + } + ;; + { .mmi + STFD [C2 ] = f73, SIZE + } + ;; + { .mmi + STFD [C2 ] = f74, SIZE + } + ;; + { .mmi +#ifndef LN + STFD [C2 ] = f75, SIZE +#else + STFD [C2 ] = f75, - 3 * SIZE +#endif + } + ;; + { .mmf + STFD [C3 ] = f80, SIZE + mov f80 = f0 + } + ;; + { .mmi + STFD [C3 ] = f81, SIZE + } + ;; + { .mmi + STFD [C3 ] = f82, SIZE + } + ;; + { .mmi +#ifndef LN + STFD [C3 ] = f83, SIZE +#else + STFD [C3 ] = f83, - 3 * SIZE +#endif + } + ;; + { .mmf + STFD [C4 ] = f88, SIZE + mov f88 = f0 + } + ;; + { .mmi + STFD [C4 ] = f89, SIZE + } + ;; + { .mmi + STFD [C4 ] = f90, SIZE + } + ;; + { .mmi +#ifndef LN + STFD [C4 ] = f91, SIZE +#else + STFD [C4 ] = f91, - 3 * SIZE +#endif + nop __LINE__ + } + ;; + mov f65 = f0 + ;; + mov f73 = f0 + ;; + shladd r2 = K, BASE_SHIFT, r0 + ;; + { .mmi + sub L = K, KK + } + ;; + { .mmi +#ifdef RT + shladd AORIG = r2, 2, AORIG +#else + nop __LINE__ +#endif + } + ;; + 
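+// Per-tile bookkeeping, repeated in the same form after every M tile in this
+// file: for LT/RN, L = K - KK is converted to a byte offset (BASE_SHIFT) and
+// AOFFSET/BOFFSET are each stepped past 4*L further elements, presumably the
+// part of the packed panels beyond the solved depth KK; for RT, AORIG was
+// instead stepped by 4*K elements just above.  KK is then advanced by 4 (LT)
+// or reduced by 4 (LN), and L is reinitialized before falling through to the
+// next tile size at .L070.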
{ .mmf + mov f81 = f0 + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd AOFFSET = L, 2, AOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd BOFFSET = L, 2, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmf + mov f89 = f0 + } + ;; + { .mmi +#ifdef LT + adds KK = 4, KK +#elif defined LN + adds KK = -4, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + .align 8 + +.L070: + tbit.z p6,p0 = M, 1 + (p6) br.cond.dptk .L080 + ;; + + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 1 + BASE_SHIFT + } + { .mmi + shladd r3 = KK, BASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mmf + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + nop __LINE__ + mov f65 = f0 + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 2, B + mov f65 = f0 +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + shladd AOFFSET = r3, 1, AORIG + } + ;; +#endif + ;; + mov f73 = f0 + ;; + { .mfi + mov f81 = f0 + adds L = 1, L + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + mov f89 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + } + { .mfi + shr L = L, 1 + } + ;; + { .mmf + adds L = -1, L + } + ;; + { .mmf + cmp.eq p6, p0 = -1, L + } + ;; + { .mib + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov ar.lc = L + (p6) br.cond.dpnt .L078 + } + ;; + .align 8 + +.L072: + { .mfb + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f89 = f33, f51, f89 // A2 * B4 + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + br.cloop.sptk.few .L072 + } + ;; +.L078: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -2, KK +#else + adds r2 = -4, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 1, AORIG + shladd 
BOFFSET = r2, 2, B + ;; +#endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET] + adds BOFFSET = -6 * SIZE, BOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + FSUB f80 = f34, f80 + FSUB f88 = f35, f88 + FSUB f65 = f36, f65 + FSUB f73 = f37, f73 + FSUB f81 = f38, f81 + FSUB f89 = f39, f89 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET] + adds AOFFSET = -6 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + + FSUB f72 = f34, f72 + FSUB f73 = f35, f73 + + FSUB f80 = f36, f80 + FSUB f81 = f37, f81 + + FSUB f88 = f38, f88 + FSUB f89 = f39, f89 + ;; +#endif + +#ifdef LN + adds AOFFSET = 2 * SIZE, AOFFSET + ;; + LDFPD f33, f32 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFD f34 = [AOFFSET] + ;; + FMPY f65 = f65, f32 + FMPY f73 = f73, f32 + FMPY f81 = f81, f32 + FMPY f89 = f89, f32 + ;; + FNMA f64 = f65, f33, f64 + FNMA f72 = f73, f33, f72 + FNMA f80 = f81, f33, f80 + FNMA f88 = f89, f33, f88 + ;; + FMPY f64 = f64, f34 + FMPY f72 = f72, f34 + FMPY f80 = f80, f34 + FMPY f88 = f88, f34 + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f65, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f73, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f81, SIZE + ;; + STFD [BOFFSET] = f88, - 3 * SIZE + STFD [BOFFSET2] = f89, - 3 * SIZE + ;; + adds C1 = -2 * SIZE, C1 + adds C2 = -2 * SIZE, C2 + adds C3 = -2 * SIZE, C3 + adds C4 = -2 * SIZE, C4 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f34 = [AOFFSET], - 3 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + FMPY f80 = f80, f32 + FMPY f88 = f88, f32 + ;; + FNMA f65 = f64, f33, f65 + FNMA f73 = f72, f33, f73 + FNMA f81 = f80, f33, f81 + FNMA f89 = f88, f33, f89 + ;; + FMPY f65 = f65, f34 + FMPY f73 = f73, f34 + FMPY f81 = f81, f34 + FMPY f89 = f89, f34 + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f65, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f73, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f81, SIZE + ;; + STFD [BOFFSET] = f88, -3 * SIZE + STFD [BOFFSET2] = f89, -3 * SIZE + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f36 = [BOFFSET], 1 * SIZE + ;; + LDFPD f37, f38 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f39, f40 = [BOFFSET] + adds BOFFSET = 5 * SIZE, BOFFSET + ;; + LDFD f41 = [BOFFSET], -15 * SIZE + ;; + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + ;; + FNMA f72 = f64, f33, f72 + FNMA f73 = f65, f33, f73 + ;; + FNMA f80 = f64, f34, f80 + FNMA f81 = f65, f34, f81 + ;; + FNMA f88 = f64, f35, f88 + FNMA f89 = f65, f35, f89 + ;; + FMPY f72 = f72, f36 + FMPY f73 = f73, f36 + ;; + FNMA f80 = f72, f37, f80 + FNMA f81 = f73, f37, f81 + ;; + FNMA f88 = f72, f38, f88 + FNMA f89 = f73, f38, f89 + ;; + FMPY f80 = f80, f39 + FMPY f81 = f81, f39 + ;; + FNMA f88 = f80, f40, f88 + FNMA f89 = f81, f40, f89 + ;; + FMPY f88 = f88, f41 + FMPY f89 = f89, f41 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f80, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f81, SIZE + ;; + STFD [AOFFSET] = f72, SIZE + STFD [AOFFSET2] = f88, SIZE + ;; + STFD 
[AOFFSET] = f73, -3 * SIZE + STFD [AOFFSET2] = f89, -3 * SIZE + ;; +#endif + +#ifdef RT + adds BOFFSET = 14 * SIZE, BOFFSET + ;; + LDFPD f33, f32 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f35, f34 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFD f36 = [BOFFSET], - 2 * SIZE + ;; + LDFPD f38, f37 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f40, f39 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFD f41 = [BOFFSET] + ;; + FMPY f88 = f88, f32 + FMPY f89 = f89, f32 + ;; + FNMA f80 = f88, f33, f80 + FNMA f81 = f89, f33, f81 + ;; + FNMA f72 = f88, f34, f72 + FNMA f73 = f89, f34, f73 + ;; + FNMA f64 = f88, f35, f64 + FNMA f65 = f89, f35, f65 + ;; + FMPY f80 = f80, f36 + FMPY f81 = f81, f36 + ;; + FNMA f72 = f80, f37, f72 + FNMA f73 = f81, f37, f73 + ;; + FNMA f64 = f80, f38, f64 + FNMA f65 = f81, f38, f65 + ;; + FMPY f72 = f72, f39 + FMPY f73 = f73, f39 + ;; + FNMA f64 = f72, f40, f64 + FNMA f65 = f73, f40, f65 + ;; + FMPY f64 = f64, f41 + FMPY f65 = f65, f41 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f80, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f81, SIZE + ;; + STFD [AOFFSET] = f72, SIZE + STFD [AOFFSET2] = f88, SIZE + ;; + STFD [AOFFSET] = f73, -3 * SIZE + STFD [AOFFSET2] = f89, -3 * SIZE + ;; +#endif + STFD [C1 ] = f64, SIZE + mov f64 = f0 + ;; +#ifndef LN + STFD [C1 ] = f65, SIZE +#else + STFD [C1 ] = f65, -SIZE +#endif + ;; + STFD [C2 ] = f72, SIZE + mov f72 = f0 + ;; +#ifndef LN + STFD [C2 ] = f73, SIZE +#else + STFD [C2 ] = f73, -SIZE +#endif + ;; + STFD [C3 ] = f80, SIZE + mov f80 = f0 + ;; +#ifndef LN + STFD [C3 ] = f81, SIZE +#else + STFD [C3 ] = f81, - SIZE +#endif + ;; + STFD [C4 ] = f88, SIZE + mov f88 = f0 + ;; +#ifndef LN + STFD [C4 ] = f89, SIZE +#else + STFD [C4 ] = f89, -SIZE +#endif + ;; + mov f96 = f0 + ;; + mov f104 = f0 + ;; + shladd r2 = K, BASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + shladd AORIG = r2, 1, AORIG +#else + nop __LINE__ +#endif + ;; + mov f112 = f0 + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd AOFFSET = L, 1, AOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd BOFFSET = L, 2, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmf + mov f120 = f0 + } + ;; + { .mmi +#ifdef LT + adds KK = 2, KK +#elif defined LN + adds KK = -2, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + .align 8 + +.L080: + tbit.z p6,p7 = M, 0 + (p6) br.cond.dptk .L089 + + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 0 + BASE_SHIFT + } + { .mmi + shladd r3 = KK, BASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mmf + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 2, B +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + add AOFFSET = r3, AORIG + } + ;; +#endif + { .mmi + adds L = 1, L + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mii + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + adds L = -1, L + } + ;; + { .mmi + cmp.eq p6, p0 = -1, L + } + ;; + { .mib + (p7) LDFD f32 = 
[AOFFSET], 1 * SIZE + mov ar.lc = L + (p6) br.cond.dpnt .L088 + } + ;; + +.L082: + { .mfb + cmp.ne p4, p5 = 0, L + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + (p12) cmp.ne p3, p0 = 0, L + FMA f72 = f32, f49, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + (p3) LDFD f40 = [AOFFSET], 1 * SIZE + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mmf + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p4) LDFD f32 = [AOFFSET], 1 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + } + ;; + { .mib + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + nop __LINE__ + nop __LINE__ + } + { .mmb + nop __LINE__ + adds L = -1, L + br.cloop.sptk.few .L082 + } + ;; + +.L088: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -1, KK +#else + adds r2 = -4, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + add AOFFSET = r2, AORIG + shladd BOFFSET = r2, 2, B + ;; +#endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET] + adds BOFFSET = -2 * SIZE, BOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + FSUB f80 = f34, f80 + FSUB f88 = f35, f88 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET] + adds AOFFSET = -2 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + FSUB f80 = f34, f80 + FSUB f88 = f35, f88 + ;; +#endif + +#ifdef LN + LDFD f32 = [AOFFSET] + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + FMPY f80 = f80, f32 + FMPY f88 = f88, f32 + ;; + { .mmi + STFD [BOFFSET] = f64, SIZE + adds C1 = -1 * SIZE, C1 + } + ;; + { .mmi + STFD [BOFFSET] = f72, SIZE + adds C2 = -1 * SIZE, C2 + } + ;; + { .mmi + STFD [BOFFSET] = f80, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f88, - 3 * SIZE + } + ;; + adds C3 = -1 * SIZE, C3 + adds C4 = -1 * SIZE, C4 + ;; +#endif + +#ifdef LT + LDFD f32 = [AOFFSET] + ;; + FMPY f64 = f64, f32 + FMPY f72 = f72, f32 + FMPY f80 = f80, f32 + FMPY f88 = f88, f32 + ;; + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f72, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + ;; + STFD [BOFFSET] = f88, -3 * SIZE + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f36 = [BOFFSET], 1 * SIZE + ;; + LDFPD f37, f38 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f39, f40 = [BOFFSET] + adds BOFFSET = 5 * SIZE, BOFFSET + ;; + LDFD f41 = [BOFFSET], -15 * SIZE + + FMPY f64 = f64, f32 + ;; + FNMA f72 = f64, f33, f72 + ;; + FNMA f80 = f64, f34, f80 + ;; + FNMA f88 = f64, f35, f88 + ;; + FMPY f72 = f72, f36 + ;; + FNMA f80 = f72, f37, f80 + ;; + FNMA f88 = f72, f38, f88 + ;; + FMPY f80 = f80, f39 + ;; + FNMA f88 = f80, f40, f88 + ;; + FMPY f88 = f88, f41 + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f72, SIZE + ;; + STFD [AOFFSET] = f80, SIZE + ;; + STFD [AOFFSET] = f88, -3 * SIZE + ;; +#endif + +#ifdef RT + adds BOFFSET = 14 * SIZE, BOFFSET + ;; + LDFPD f33, f32 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f35, f34 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, 
BOFFSET + ;; + LDFD f36 = [BOFFSET], - 2 * SIZE + ;; + LDFPD f38, f37 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f40, f39 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFD f41 = [BOFFSET] + ;; + FMPY f88 = f88, f32 + ;; + FNMA f80 = f88, f33, f80 + ;; + FNMA f72 = f88, f34, f72 + ;; + FNMA f64 = f88, f35, f64 + ;; + FMPY f80 = f80, f36 + ;; + FNMA f72 = f80, f37, f72 + ;; + FNMA f64 = f80, f38, f64 + ;; + FMPY f72 = f72, f39 + ;; + FNMA f64 = f72, f40, f64 + ;; + FMPY f64 = f64, f41 + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f72, SIZE + ;; + STFD [AOFFSET] = f80, SIZE + ;; + STFD [AOFFSET] = f88, - 3 * SIZE + ;; +#endif + +#ifndef LN + STFD [C1 ] = f64, SIZE +#else + STFD [C1 ] = f64 +#endif +#ifndef LN + STFD [C2 ] = f72, SIZE +#else + STFD [C2 ] = f72 +#endif +#ifndef LN + STFD [C3 ] = f80, SIZE +#else + STFD [C3 ] = f80 +#endif +#ifndef LN + STFD [C4 ] = f88, SIZE +#else + STFD [C4 ] = f88 +#endif + ;; + + mov f64 = f0 + mov f72 = f0 + mov f80 = f0 + mov f88 = f0 + ;; + shladd r2 = K, BASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + add AORIG = r2, AORIG +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + add AOFFSET = L, AOFFSET +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + shladd BOFFSET = L, 2, BOFFSET +#else + nop __LINE__ +#endif + ;; +#ifdef LT + adds KK = 1, KK +#elif defined LN + adds KK = -1, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + .align 8 + +.L089: +#ifdef LN + shladd KK8 = K, BASE_SHIFT, r0 + ;; + shladd B = KK8, 2, B +#endif + +#if defined(LT) || defined(RN) + mov B = BOFFSET +#endif + +#ifdef RN + adds KK = 4, KK +#endif + +#ifdef RT + adds KK = -4, KK +#endif + ;; + mov AOFFSET = A + ;; + .align 16 + +.L000: + shr J = N, 3 + ;; + cmp.ge p6, p0 = 0, J + (p6) br.cond.dpnt .L999 + ;; + .align 8 + +.L010: +#ifdef RT + { .mmi + shladd r3 = LDC, 3, r0 + nop __LINE__ + shl r2 = K, 3 + BASE_SHIFT + } + ;; + { .mmi + sub B = B, r2 + sub C = C, r3 + nop __LINE__ + } +#endif + ;; + { .mfi + adds J = -1, J + mov f64 = f0 + shr I = M, 3 + } + { .mfi + mov C1 = C // coffset1 = c + 0 * ldc + mov f72 = f0 +#ifdef LN + add KK = M, OFFSET +#elif defined LT + mov KK = OFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmf + cmp.eq p6, p7 = 0, I +#if defined(LN) || defined(RT) + mov AORIG = A +#else + mov AOFFSET = A +#endif + mov f80 = f0 + } + { .mmf + add C2 = LDC, C // coffset2 = c + 1 * ldc + shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc + mov f88 = f0 + } + ;; + { .mmf + shladd C5 = LDC, 2, C // coffset5 = c + 4 * ldc +#ifndef RT + shladd C = LDC, 3, C // coffset += 8 * ldc +#else + nop __LINE__ +#endif + mov f96 = f0 + } + { .mmf + shladd C4 = LDC, 1, C2 + shladd C6 = LDC, 2, C2 + mov f104 = f0 + } + ;; + { .mfi + shladd C7 = LDC, 2, C3 + mov f112 = f0 +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + }{ .mfb + shladd C8 = LDC, 2, C4 + mov f120 = f0 + (p6) br.cond.dpnt .L020 + } + ;; + .align 16 + +.L011: + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 3 + BASE_SHIFT + } + { .mmi + shladd r3 = KK, BASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f65 = f0 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f73 = f0 + } + ;; +#else + { .mfi + shladd BOFFSET = 
r3, 3, B + mov f65 = f0 +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f73 = f0 + shladd AOFFSET = r3, 3, AORIG + } + ;; +#endif + { .mfb + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f81 = f0 + nop __LINE__ + } + { .mmf + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + setf.d f119 = r0 + mov f89 = f0 + } + ;; + { .mmf + (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + setf.d f97 = r0 + mov f105 = f0 + } + { .mfb + setf.d f113 = r0 + mov f121 = f0 + nop __LINE__ + } + ;; + { .mmf + (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + setf.d f66 = r0 + mov f74 = f0 + } + { .mfb + setf.d f82 = r0 + mov f90 = f0 + nop __LINE__ + } + ;; + { .mmf + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + setf.d f98 = r0 + mov f106 = f0 + } + { .mfb + setf.d f114 = r0 + mov f122 = f0 + nop __LINE__ + } + ;; + { .mmf + (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + setf.d f67 = r0 + mov f75 = f0 + } + { .mfi + setf.d f83 = r0 + mov f91 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mmf + (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + setf.d f99 = r0 + mov f107 = f0 + } + { .mfi + setf.d f115 = r0 + mov f123 = f0 + adds PREC = CPREFETCHSIZE * SIZE, C1 + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f68 = r0 + mov f76 = f0 + } + { .mfi + setf.d f84 = r0 + mov f92 = f0 + adds L = 1, L + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f100 = r0 + mov f108 = f0 + } + { .mfi + setf.d f116 = r0 + mov f124 = f0 + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f69 = r0 + mov f77 = f0 + } + { .mfi + setf.d f85 = r0 + mov f93 = f0 + adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f101 = r0 + mov f109 = f0 + } + { .mfi + setf.d f117 = r0 + mov f125 = f0 + tbit.z p12, p0 = L, 0 + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f70 = r0 + mov f78 = f0 + } + { .mfi + setf.d f86 = r0 + mov f94 = f0 + shr L = L, 1 + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f102 = r0 + mov f110 = f0 + } + { .mfi + setf.d f118 = r0 + mov f126 = f0 + adds L = -1, L + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f71 = r0 + mov f79 = f0 + } + { .mfi + setf.d f87 = r0 + mov f95 = f0 + mov ar.lc = L + } + ;; + { .mmf + CPREFETCH [PREC] + setf.d f103 = r0 + mov f111 = f0 + } + { .mfb + cmp.eq p6, p0 = -1, L + mov f127 = f0 + (p6) br.cond.dpnt .L018 + } + ;; + .align 16 + +.L012: +/* 1 */ + { .mfi + lfetch.fault.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + (p12) cmp.ne p3, p0 = 0, L + FMA f72 = f32, f49, f72 // A1 * B2 + nop __LINE__ + } + ;; +/* 2 */ + { .mfb + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + cmp.ne p4, p5 = 0, L + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; +/* 3 */ + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + adds C9 = 4 * SIZE, C1 + FMA f104 = f32, f53, f104 // A1 * B6 + nop __LINE__ + } + ;; +/* 4 */ + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + adds C10 = 4 * SIZE, C2 + FMA f120 = f32, f55, f120 // A1 * B8 + nop __LINE__ + } + ;; +/* 5 */ + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + adds C11 = 4 * SIZE, C3 + FMA f73 = f33, f49, f73 // A2 * B2 + nop __LINE__ + } + ;; +/* 6 */ + { .mfb + (p3) LDFPD f60, f61 = [BOFFSET], 2 * 
SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + adds C12 = 4 * SIZE, C4 + FMA f89 = f33, f51, f89 // A2 * B4 + nop __LINE__ + } + ;; +/* 7 */ + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + adds C13 = 4 * SIZE, C5 + FMA f105 = f33, f53, f105 // A2 * B6 + nop __LINE__ + } + ;; +/* 8 */ + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + adds C14 = 4 * SIZE, C6 + FMA f121 = f33, f55, f121 // A2 * B8 + nop __LINE__ + } + ;; +/* 9 */ + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + adds C15 = 4 * SIZE, C7 + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; +/* 10 */ + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + adds C16 = 4 * SIZE, C8 + FMA f90 = f34, f51, f90 // A3 * B4 + nop __LINE__ + } + ;; +/* 11 */ + { .mfb + FMA f98 = f34, f52, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f106 = f34, f53, f106 // A3 * B6 + nop __LINE__ + } + ;; +/* 12 */ + { .mfb + FMA f114 = f34, f54, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f122 = f34, f55, f122 // A3 * B8 + nop __LINE__ + } + ;; +/* 13 */ + { .mfb + nop __LINE__ + FMA f67 = f35, f48, f67 // A4 * B1 + } + { .mfb + nop __LINE__ + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + ;; +/* 14 */ + { .mfb + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f91 = f35, f51, f91 // A4 * B4 + nop __LINE__ + } + ;; +/* 15 */ + { .mfb + FMA f99 = f35, f52, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f107 = f35, f53, f107 // A4 * B6 + nop __LINE__ + } + ;; +/* 16 */ + { .mfb + FMA f115 = f35, f54, f115 // A4 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f123 = f35, f55, f123 // A4 * B8 + nop __LINE__ + } + ;; +/* 17 */ + { .mfb + nop __LINE__ + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f76 = f36, f49, f76 // A5 * B2 + nop __LINE__ + } + ;; +/* 18 */ + { .mfb + nop __LINE__ + FMA f84 = f36, f50, f84 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f92 = f36, f51, f92 // A5 * B4 + nop __LINE__ + } + ;; +/* 19 */ + { .mfb + nop __LINE__ + FMA f100 = f36, f52, f100 // A5 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f108 = f36, f53, f108 // A5 * B6 + nop __LINE__ + } + ;; +/* 20 */ + { .mfb + nop __LINE__ + FMA f116 = f36, f54, f116 // A5 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f124 = f36, f55, f124 // A5 * B8 + nop __LINE__ + } + ;; +/* 21 */ + { .mfb + nop __LINE__ + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f77 = f37, f49, f77 // A6 * B2 + nop __LINE__ + } + ;; +/* 22 */ + { .mfb + nop __LINE__ + FMA f85 = f37, f50, f85 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f93 = f37, f51, f93 // A6 * B4 + nop __LINE__ + } + ;; +/* 23 */ + { .mfb + nop __LINE__ + FMA f101 = f37, f52, f101 // A6 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f109 = f37, f53, f109 // A6 * B6 + nop __LINE__ + } + ;; +/* 24 */ + { .mfb + nop __LINE__ + FMA f117 = f37, f54, f117 // A6 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f125 = f37, f55, f125 // A6 * B8 + nop __LINE__ + } + ;; +/* 25 */ + { .mfb + nop __LINE__ + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f78 = f38, f49, f78 // A7 
* B2 + nop __LINE__ + } + ;; +/* 26 */ + { .mfb + nop __LINE__ + FMA f86 = f38, f50, f86 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f94 = f38, f51, f94 // A7 * B4 + nop __LINE__ + } + ;; +/* 27 */ + { .mfb + nop __LINE__ + FMA f102 = f38, f52, f102 // A7 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f110 = f38, f53, f110 // A7 * B6 + nop __LINE__ + } + ;; +/* 28 */ + { .mfb + nop __LINE__ + FMA f118 = f38, f54, f118 // A7 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f126 = f38, f55, f126 // A7 * B8 + nop __LINE__ + } + ;; +/* 29 */ + { .mfb + nop __LINE__ + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f79 = f39, f49, f79 // A8 * B2 + nop __LINE__ + } + ;; +/* 30 */ + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f87 = f39, f50, f87 // A8 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f95 = f39, f51, f95 // A8 * B4 + nop __LINE__ + } + ;; +/* 31 */ + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f103 = f39, f52, f103 // A8 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f111 = f39, f53, f111 // A8 * B6 + nop __LINE__ + } + ;; +/* 32 */ + { .mfb + nop __LINE__ + FMA f119 = f39, f54, f119 // A8 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f127 = f39, f55, f127 // A8 * B8 + nop __LINE__ + } + ;; +/* 33 */ + { .mfb + nop __LINE__ + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; +/* 34 */ + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; +/* 35 */ + { .mfb + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f104 = f40, f61, f104 // A1 * B6 + nop __LINE__ + } + ;; +/* 36 */ + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f120 = f40, f63, f120 // A1 * B8 + nop __LINE__ + } + ;; +/* 37 */ + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; +/* 38 */ + { .mfb + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; +/* 39 */ + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f105 = f41, f61, f105 // A2 * B6 + nop __LINE__ + } + ;; +/* 40 */ + { .mfb + nop __LINE__ + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f121 = f41, f63, f121 // A2 * B8 + nop __LINE__ + } + ;; + /* 41 */ + { .mfb + nop __LINE__ + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; +/* 42 */ + { .mfb + nop __LINE__ + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f90 = f42, f59, f90 // A3 * B4 + nop __LINE__ + } + ;; +/* 43 */ + { .mfb + nop __LINE__ + (p3) FMA f98 = f42, f60, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f106 = f42, f61, f106 // A3 * B6 + nop __LINE__ + } + ;; 
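+// Rough reading of the .L012 body: this is the software-pipelined inner loop
+// of the 8x8 micro-kernel.  The 64 numbered "cycles" each issue two FMA
+// bundles, so every a(i)*b(j) product lands in its own accumulator
+// (f64..f127 hold the 8x8 tile of C), while the predicated (p3/p4) LDFPD
+// loads and the lfetch prefetches stream the next A/B values underneath the
+// arithmetic; br.cloop with ar.lc closes the loop.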
+/* 44 */ + { .mfb + nop __LINE__ + (p3) FMA f114 = f42, f62, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f122 = f42, f63, f122 // A3 * B8 + nop __LINE__ + } + ;; +/* 45 */ + { .mfb + nop __LINE__ + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; +/* 46 */ + { .mfb + nop __LINE__ + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f91 = f43, f59, f91 // A4 * B4 + nop __LINE__ + } + ;; +/* 47 */ + { .mfb + nop __LINE__ + (p3) FMA f99 = f43, f60, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f107 = f43, f61, f107 // A4 * B6 + nop __LINE__ + } + ;; +/* 48 */ + { .mfb + nop __LINE__ + (p3) FMA f115 = f43, f62, f115 // A4 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f123 = f43, f63, f123 // A4 * B8 + nop __LINE__ + } + ;; +/* 49 */ + { .mfb + nop __LINE__ + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f76 = f44, f57, f76 // A5 * B2 + nop __LINE__ + } + ;; +/* 50 */ + { .mfb + nop __LINE__ + (p3) FMA f84 = f44, f58, f84 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f92 = f44, f59, f92 // A5 * B4 + nop __LINE__ + } + ;; +/* 51 */ + { .mfb + nop __LINE__ + (p3) FMA f100 = f44, f60, f100 // A5 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f108 = f44, f61, f108 // A5 * B6 + nop __LINE__ + } + ;; +/* 52 */ + { .mfb + nop __LINE__ + (p3) FMA f116 = f44, f62, f116 // A5 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f124 = f44, f63, f124 // A5 * B8 + nop __LINE__ + } + ;; +/* 53 */ + { .mfb + nop __LINE__ + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f77 = f45, f57, f77 // A6 * B2 + nop __LINE__ + } + ;; +/* 54 */ + { .mfb + nop __LINE__ + (p3) FMA f85 = f45, f58, f85 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f93 = f45, f59, f93 // A6 * B4 + nop __LINE__ + } + ;; +/* 55 */ + { .mfb + nop __LINE__ + (p3) FMA f101 = f45, f60, f101 // A6 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f109 = f45, f61, f109 // A6 * B6 + nop __LINE__ + } + ;; +/* 56 */ + { .mfb + nop __LINE__ + (p3) FMA f117 = f45, f62, f117 // A6 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f125 = f45, f63, f125 // A6 * B8 + nop __LINE__ + } + ;; +/* 57 */ + { .mfb + nop __LINE__ + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f78 = f46, f57, f78 // A7 * B2 + nop __LINE__ + } + ;; +/* 58 */ + { .mfb + nop __LINE__ + (p3) FMA f86 = f46, f58, f86 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f94 = f46, f59, f94 // A7 * B4 + nop __LINE__ + } + ;; +/* 59 */ + { .mfb + nop __LINE__ + (p3) FMA f102 = f46, f60, f102 // A7 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f110 = f46, f61, f110 // A7 * B6 + nop __LINE__ + } + ;; +/* 60 */ + { .mfb + nop __LINE__ + (p3) FMA f118 = f46, f62, f118 // A7 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f126 = f46, f63, f126 // A7 * B8 + nop __LINE__ + } + ;; +/* 61 */ + { .mfb + nop __LINE__ + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f79 = f47, f57, f79 // A8 * B2 + nop __LINE__ + } + ;; +/* 62 */ + { .mfb + nop __LINE__ + (p3) FMA f87 = f47, f58, f87 // A8 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f95 = f47, f59, f95 // A8 * B4 + nop __LINE__ + } + ;; +/* 
63 */ + { .mfb + nop __LINE__ + (p3) FMA f103 = f47, f60, f103 // A8 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f111 = f47, f61, f111 // A8 * B6 + nop __LINE__ + } + ;; +/* 64 */ + { .mfi + nop __LINE__ + (p3) FMA f119 = f47, f62, f119 // A8 * B7 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f127 = f47, f63, f127 // A8 * B8 + br.cloop.sptk.few .L012 + } + ;; + +.L018: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -8, KK +#else + adds r2 = -8, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 3, AORIG + shladd BOFFSET = r2, 3, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [BOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [BOFFSET], 2 * SIZE + ;; + + LDFPD f44, f45 = [BOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [BOFFSET], 2 * SIZE + ;; + { .mfi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FSUB f64 = f32, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f72 = f33, f72 + nop __LINE__ + } + ;; + { .mfi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + FSUB f80 = f34, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f88 = f35, f88 + nop __LINE__ + } + ;; + { .mfi + LDFPD f52, f53 = [BOFFSET], 2 * SIZE + FSUB f96 = f36, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f104 = f37, f104 + nop __LINE__ + } + ;; + { .mfi + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + FSUB f112 = f38, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f120 = f39, f120 + nop __LINE__ + } + ;; + { .mfi + LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FSUB f65 = f40, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f73 = f41, f73 + nop __LINE__ + } + ;; + { .mfi + LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FSUB f81 = f42, f81 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f89 = f43, f89 + nop __LINE__ + } + ;; + { .mfi + LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FSUB f97 = f44, f97 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f105 = f45, f105 + nop __LINE__ + } + ;; + { .mfi + LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FSUB f113 = f46, f113 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f121 = f47, f121 + nop __LINE__ + } + ;; + { .mfi + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + FSUB f66 = f48, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f74 = f49, f74 + nop __LINE__ + } + ;; + { .mfi + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + FSUB f82 = f50, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f90 = f51, f90 + nop __LINE__ + } + ;; + { .mfi + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + FSUB f98 = f52, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f106 = f53, f106 + nop __LINE__ + } + ;; + { .mfi + LDFPD f38, f39 = [BOFFSET], 2 * SIZE + FSUB f114 = f54, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f122 = f55, f122 + nop __LINE__ + } + ;; + { .mfi + LDFPD f40, f41 = [BOFFSET], 2 * SIZE + FSUB f67 = f56, f67 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f75 = f57, f75 + nop __LINE__ + } + ;; + { .mfi + LDFPD f42, f43 = [BOFFSET], 2 * SIZE + FSUB f83 = f58, f83 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f91 = f59, f91 + nop __LINE__ + } + ;; + { .mfi + LDFPD f44, f45 = [BOFFSET], 2 * SIZE + FSUB f99 = f60, f99 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f107 = f61, f107 + nop __LINE__ + } + ;; + { .mfi + LDFPD f46, f47 = [BOFFSET], 2 * SIZE + FSUB f115 = f62, f115 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f123 = f63, f123 + nop 
__LINE__ + } + ;; + { .mfi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FSUB f68 = f32, f68 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f76 = f33, f76 + nop __LINE__ + } + ;; + { .mfi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + FSUB f84 = f34, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f92 = f35, f92 + nop __LINE__ + } + ;; + { .mfi + LDFPD f52, f53 = [BOFFSET], 2 * SIZE + FSUB f100 = f36, f100 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f108 = f37, f108 + nop __LINE__ + } + ;; + { .mfi + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + FSUB f116 = f38, f116 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f124 = f39, f124 + nop __LINE__ + } + ;; + { .mfi + LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FSUB f69 = f40, f69 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f77 = f41, f77 + nop __LINE__ + } + ;; + { .mfi + LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FSUB f85 = f42, f85 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f93 = f43, f93 + nop __LINE__ + } + ;; + { .mfi + LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FSUB f101 = f44, f101 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f109 = f45, f109 + nop __LINE__ + } + ;; + { .mfi + LDFPD f62, f63 = [BOFFSET] + FSUB f117 = f46, f117 + adds BOFFSET = -62 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FSUB f125 = f47, f125 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f70 = f48, f70 +#ifdef LN + adds AOFFSET = 62 * SIZE, AOFFSET +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FSUB f78 = f49, f78 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f86 = f50, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f94 = f51, f94 + nop __LINE__ + } + ;; + { .mfi +#ifdef LN + LDFPD f33, f32 = [AOFFSET] +#else + LDFPD f32, f33 = [AOFFSET] +#endif + FSUB f102 = f52, f102 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f110 = f53, f110 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f118 = f54, f118 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f126 = f55, f126 +#ifdef LN + adds AOFFSET = - 2 * SIZE, AOFFSET +#else + adds AOFFSET = 2 * SIZE, AOFFSET +#endif + } + ;; + { .mfi + nop __LINE__ + FSUB f71 = f56, f71 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f79 = f57, f79 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f87 = f58, f87 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f95 = f59, f95 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f103 = f60, f103 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f111 = f61, f111 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f119 = f62, f119 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f127 = f63, f127 + nop __LINE__ + } + ;; +#else + adds AOFFSET2 = 4 * SIZE, AOFFSET + ;; + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [AOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [AOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [AOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [AOFFSET], 2 * SIZE + ;; + { .mfi + LDFPD f48, f49 = [AOFFSET], 2 * SIZE + FSUB f64 = f32, f64 + } + { .mfi + FSUB f65 = f33, f65 + } + ;; + { .mfi + LDFPD f50, f51 = [AOFFSET], 2 * SIZE + FSUB f66 = f34, f66 + } + { .mfi + FSUB f67 = f35, f67 + } + ;; + { .mfi + LDFPD f52, f53 = [AOFFSET], 2 * SIZE + FSUB f68 = f36, f68 + } + { .mfi + FSUB f69 = f37, f69 + } + ;; + { .mfi + LDFPD f54, f55 = [AOFFSET], 2 * SIZE + FSUB f70 = f38, f70 + } + { .mfi + FSUB f71 = f39, f71 + } + ;; + { .mfi + LDFPD f56, f57 = [AOFFSET], 2 * SIZE + FSUB f72 = f40, f72 + } + { .mfi + FSUB f73 = 
f41, f73 + } + ;; + { .mfi + LDFPD f58, f59 = [AOFFSET], 2 * SIZE + FSUB f74 = f42, f74 + } + { .mfi + FSUB f75 = f43, f75 + } + ;; + { .mfi + LDFPD f60, f61 = [AOFFSET], 2 * SIZE + FSUB f76 = f44, f76 + } + { .mfi + FSUB f77 = f45, f77 + } + ;; + { .mfi + LDFPD f62, f63 = [AOFFSET], 2 * SIZE + FSUB f78 = f46, f78 + } + { .mfi + FSUB f79 = f47, f79 + } + ;; + { .mfi + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FSUB f80 = f48, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f81 = f49, f81 + nop __LINE__ + } + ;; + { .mfi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + FSUB f82 = f50, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f83 = f51, f83 + nop __LINE__ + } + ;; + { .mfi + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + FSUB f84 = f52, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f85 = f53, f85 + nop __LINE__ + } + ;; + { .mfi + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + FSUB f86 = f54, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f87 = f55, f87 + nop __LINE__ + } + ;; + { .mfi + LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FSUB f88 = f56, f88 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f89 = f57, f89 + nop __LINE__ + } + ;; + { .mfi + LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FSUB f90 = f58, f90 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f91 = f59, f91 + nop __LINE__ + } + ;; + { .mfi + LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FSUB f92 = f60, f92 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f93 = f61, f93 + nop __LINE__ + } + ;; + { .mfi + LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FSUB f94 = f62, f94 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f95 = f63, f95 + nop __LINE__ + } + ;; + { .mfi + LDFPD f48, f49 = [AOFFSET], 2 * SIZE + FSUB f96 = f32, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f97 = f33, f97 + nop __LINE__ + } + ;; + { .mfi + LDFPD f50, f51 = [AOFFSET], 2 * SIZE + FSUB f98 = f34, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f99 = f35, f99 + nop __LINE__ + } + ;; + { .mfi + LDFPD f52, f53 = [AOFFSET], 2 * SIZE + FSUB f100 = f36, f100 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f101 = f37, f101 + nop __LINE__ + } + ;; + { .mfi + LDFPD f54, f55 = [AOFFSET], 2 * SIZE + FSUB f102 = f38, f102 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f103 = f39, f103 + nop __LINE__ + } + ;; + { .mfi + LDFPD f56, f57 = [AOFFSET], 2 * SIZE + FSUB f104 = f40, f104 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f105 = f41, f105 + nop __LINE__ + } + ;; + { .mfi + LDFPD f58, f59 = [AOFFSET], 2 * SIZE + FSUB f106 = f42, f106 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f107 = f43, f107 + nop __LINE__ + } + ;; + { .mfi + LDFPD f60, f61 = [AOFFSET], 2 * SIZE + FSUB f108 = f44, f108 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f109 = f45, f109 + nop __LINE__ + } + ;; + { .mfi + LDFPD f62, f63 = [AOFFSET] + FSUB f110 = f46, f110 + adds AOFFSET = -62 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FSUB f111 = f47, f111 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f112 = f48, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f113 = f49, f113 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f114 = f50, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f115 = f51, f115 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f116 = f52, f116 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f117 = f53, f117 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f118 = f54, f118 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f119 = f55, f119 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f120 = f56, f120 + nop __LINE__ + } + { .mfi + 
nop __LINE__ + FSUB f121 = f57, f121 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f122 = f58, f122 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f123 = f59, f123 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f124 = f60, f124 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f125 = f61, f125 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f126 = f62, f126 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f127 = f63, f127 + nop __LINE__ + } + ;; +#endif + +#ifdef LN + { .mfi + LDFPD f35, f34 = [AOFFSET] + FMPY f71 = f71, f32 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMPY f103 = f103, f32 + adds BOFFSET2 = 4 * SIZE, BOFFSET + } + ;; + { .mfi + LDFPD f37, f36 = [AOFFSET] + FMPY f79 = f79, f32 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMPY f111 = f111, f32 + nop __LINE__ + } + ;; + { .mfi + LDFPD f39, f38 = [AOFFSET] + FMPY f87 = f87, f32 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMPY f119 = f119, f32 + nop __LINE__ + } + ;; + { .mfi + LDFD f40 = [AOFFSET], -2 * SIZE + FMPY f95 = f95, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f127 = f127, f32 + nop __LINE__ + } + ;; + { .mfi + LDFPD f42, f41 = [AOFFSET] + FNMA f70 = f71, f33, f70 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f102 = f103, f33, f102 + nop __LINE__ + } + ;; + { .mfi + LDFPD f44, f43 = [AOFFSET] + FNMA f78 = f79, f33, f78 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f110 = f111, f33, f110 + nop __LINE__ + } + ;; + { .mfi + LDFPD f46, f45 = [AOFFSET] + FNMA f86 = f87, f33, f86 + adds AOFFSET = - 4 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f118 = f119, f33, f118 + nop __LINE__ + } + ;; + { .mfi + LDFPD f48, f47 = [AOFFSET] + FNMA f94 = f95, f33, f94 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f126 = f127, f33, f126 + nop __LINE__ + } + ;; + { .mfi + LDFPD f50, f49 = [AOFFSET] + FNMA f69 = f71, f34, f69 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f101 = f103, f34, f101 + nop __LINE__ + } + ;; + { .mfi + LDFPD f52, f51 = [AOFFSET] + FNMA f77 = f79, f34, f77 + adds AOFFSET = - 4 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f109 = f111, f34, f109 + nop __LINE__ + } + ;; + { .mfi + LDFD f53 = [AOFFSET], -2 * SIZE + FNMA f85 = f87, f34, f85 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f117 = f119, f34, f117 + nop __LINE__ + } + ;; + { .mfi + LDFPD f55, f54 = [AOFFSET] + FNMA f93 = f95, f34, f93 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f125 = f127, f34, f125 + nop __LINE__ + } + ;; + { .mfi + LDFPD f57, f56 = [AOFFSET] + FNMA f68 = f71, f35, f68 + adds AOFFSET = - 6 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f100 = f103, f35, f100 + nop __LINE__ + } + ;; + { .mfi + LDFPD f59, f58 = [AOFFSET] + FNMA f76 = f79, f35, f76 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f108 = f111, f35, f108 + nop __LINE__ + } + ;; + { .mfi + LDFPD f61, f60 = [AOFFSET] + FNMA f84 = f87, f35, f84 + adds AOFFSET = - 6 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f116 = f119, f35, f116 + nop __LINE__ + } + ;; + { .mfi + LDFD f16 = [AOFFSET], -2 * SIZE + FNMA f92 = f95, f35, f92 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f124 = f127, f35, f124 + nop __LINE__ + } + ;; + { .mfi + LDFPD f18, f17 = [AOFFSET] + FNMA f67 = f71, f36, f67 + adds AOFFSET = - 8 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f99 = f103, f36, f99 + nop __LINE__ + } + ;; + { .mfi + LDFPD 
f20, f19 = [AOFFSET] + FNMA f75 = f79, f36, f75 + adds AOFFSET = - 8 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f107 = f111, f36, f107 + nop __LINE__ + } + ;; + { .mfi + LDFD f21 = [AOFFSET] + FNMA f83 = f87, f36, f83 + adds BOFFSET = 56 * SIZE, BOFFSET + } + { .mfi + FNMA f115 = f119, f36, f115 + adds BOFFSET2 = 56 * SIZE, BOFFSET2 + } + ;; + FNMA f91 = f95, f36, f91 + FNMA f123 = f127, f36, f123 + ;; + FNMA f66 = f71, f37, f66 + FNMA f98 = f103, f37, f98 + FNMA f74 = f79, f37, f74 + FNMA f106 = f111, f37, f106 + FNMA f82 = f87, f37, f82 + FNMA f114 = f119, f37, f114 + FNMA f90 = f95, f37, f90 + FNMA f122 = f127, f37, f122 + ;; + FNMA f65 = f71, f38, f65 + FNMA f97 = f103, f38, f97 + FNMA f73 = f79, f38, f73 + FNMA f105 = f111, f38, f105 + FNMA f81 = f87, f38, f81 + FNMA f113 = f119, f38, f113 + FNMA f89 = f95, f38, f89 + FNMA f121 = f127, f38, f121 + ;; + FNMA f64 = f71, f39, f64 + FNMA f96 = f103, f39, f96 + FNMA f72 = f79, f39, f72 + FNMA f104 = f111, f39, f104 + FNMA f80 = f87, f39, f80 + FNMA f112 = f119, f39, f112 + FNMA f88 = f95, f39, f88 + FNMA f120 = f127, f39, f120 + ;; + FMPY f70 = f70, f40 + FMPY f102 = f102, f40 + FMPY f78 = f78, f40 + FMPY f110 = f110, f40 + FMPY f86 = f86, f40 + FMPY f118 = f118, f40 + FMPY f94 = f94, f40 + FMPY f126 = f126, f40 + ;; + FNMA f69 = f70, f41, f69 + FNMA f101 = f102, f41, f101 + FNMA f77 = f78, f41, f77 + FNMA f109 = f110, f41, f109 + FNMA f85 = f86, f41, f85 + FNMA f117 = f118, f41, f117 + FNMA f93 = f94, f41, f93 + FNMA f125 = f126, f41, f125 + ;; + FNMA f68 = f70, f42, f68 + FNMA f100 = f102, f42, f100 + FNMA f76 = f78, f42, f76 + FNMA f108 = f110, f42, f108 + FNMA f84 = f86, f42, f84 + FNMA f116 = f118, f42, f116 + FNMA f92 = f94, f42, f92 + FNMA f124 = f126, f42, f124 + ;; + FNMA f67 = f70, f43, f67 + FNMA f99 = f102, f43, f99 + FNMA f75 = f78, f43, f75 + FNMA f107 = f110, f43, f107 + FNMA f83 = f86, f43, f83 + FNMA f115 = f118, f43, f115 + FNMA f91 = f94, f43, f91 + FNMA f123 = f126, f43, f123 + ;; + FNMA f66 = f70, f44, f66 + FNMA f98 = f102, f44, f98 + FNMA f74 = f78, f44, f74 + FNMA f106 = f110, f44, f106 + FNMA f82 = f86, f44, f82 + FNMA f114 = f118, f44, f114 + FNMA f90 = f94, f44, f90 + FNMA f122 = f126, f44, f122 + ;; + FNMA f65 = f70, f45, f65 + FNMA f97 = f102, f45, f97 + FNMA f73 = f78, f45, f73 + FNMA f105 = f110, f45, f105 + FNMA f81 = f86, f45, f81 + FNMA f113 = f118, f45, f113 + FNMA f89 = f94, f45, f89 + FNMA f121 = f126, f45, f121 + ;; + FNMA f64 = f70, f46, f64 + FNMA f96 = f102, f46, f96 + FNMA f72 = f78, f46, f72 + FNMA f104 = f110, f46, f104 + FNMA f80 = f86, f46, f80 + FNMA f112 = f118, f46, f112 + FNMA f88 = f94, f46, f88 + FNMA f120 = f126, f46, f120 + ;; + FMPY f69 = f69, f47 + FMPY f101 = f101, f47 + FMPY f77 = f77, f47 + FMPY f109 = f109, f47 + FMPY f85 = f85, f47 + FMPY f117 = f117, f47 + FMPY f93 = f93, f47 + FMPY f125 = f125, f47 + ;; + FNMA f68 = f69, f48, f68 + FNMA f100 = f101, f48, f100 + FNMA f76 = f77, f48, f76 + FNMA f108 = f109, f48, f108 + FNMA f84 = f85, f48, f84 + FNMA f116 = f117, f48, f116 + FNMA f92 = f93, f48, f92 + FNMA f124 = f125, f48, f124 + ;; + FNMA f67 = f69, f49, f67 + FNMA f99 = f101, f49, f99 + FNMA f75 = f77, f49, f75 + FNMA f107 = f109, f49, f107 + FNMA f83 = f85, f49, f83 + FNMA f115 = f117, f49, f115 + FNMA f91 = f93, f49, f91 + FNMA f123 = f125, f49, f123 + ;; + FNMA f66 = f69, f50, f66 + FNMA f98 = f101, f50, f98 + FNMA f74 = f77, f50, f74 + FNMA f106 = f109, f50, f106 + FNMA f82 = f85, f50, f82 + FNMA f114 = f117, f50, f114 + FNMA f90 = f93, f50, f90 + FNMA f122 = f125, 
f50, f122 + ;; + FNMA f65 = f69, f51, f65 + FNMA f97 = f101, f51, f97 + FNMA f73 = f77, f51, f73 + FNMA f105 = f109, f51, f105 + FNMA f81 = f85, f51, f81 + FNMA f113 = f117, f51, f113 + FNMA f89 = f93, f51, f89 + FNMA f121 = f125, f51, f121 + ;; + FNMA f64 = f69, f52, f64 + FNMA f96 = f101, f52, f96 + FNMA f72 = f77, f52, f72 + FNMA f104 = f109, f52, f104 + FNMA f80 = f85, f52, f80 + FNMA f112 = f117, f52, f112 + FNMA f88 = f93, f52, f88 + FNMA f120 = f125, f52, f120 + ;; + FMPY f68 = f68, f53 + FMPY f100 = f100, f53 + FMPY f76 = f76, f53 + FMPY f108 = f108, f53 + FMPY f84 = f84, f53 + FMPY f116 = f116, f53 + FMPY f92 = f92, f53 + FMPY f124 = f124, f53 + ;; + FNMA f67 = f68, f54, f67 + FNMA f99 = f100, f54, f99 + FNMA f75 = f76, f54, f75 + FNMA f107 = f108, f54, f107 + FNMA f83 = f84, f54, f83 + FNMA f115 = f116, f54, f115 + FNMA f91 = f92, f54, f91 + FNMA f123 = f124, f54, f123 + ;; + FNMA f66 = f68, f55, f66 + FNMA f98 = f100, f55, f98 + FNMA f74 = f76, f55, f74 + FNMA f106 = f108, f55, f106 + FNMA f82 = f84, f55, f82 + FNMA f114 = f116, f55, f114 + FNMA f90 = f92, f55, f90 + FNMA f122 = f124, f55, f122 + ;; + FNMA f65 = f68, f56, f65 + FNMA f97 = f100, f56, f97 + FNMA f73 = f76, f56, f73 + FNMA f105 = f108, f56, f105 + FNMA f81 = f84, f56, f81 + FNMA f113 = f116, f56, f113 + FNMA f89 = f92, f56, f89 + FNMA f121 = f124, f56, f121 + ;; + FNMA f64 = f68, f57, f64 + FNMA f96 = f100, f57, f96 + FNMA f72 = f76, f57, f72 + FNMA f104 = f108, f57, f104 + FNMA f80 = f84, f57, f80 + FNMA f112 = f116, f57, f112 + FNMA f88 = f92, f57, f88 + FNMA f120 = f124, f57, f120 + ;; + FMPY f67 = f67, f58 + FMPY f99 = f99, f58 + FMPY f75 = f75, f58 + FMPY f107 = f107, f58 + FMPY f83 = f83, f58 + FMPY f115 = f115, f58 + FMPY f91 = f91, f58 + FMPY f123 = f123, f58 + ;; + FNMA f66 = f67, f59, f66 + FNMA f98 = f99, f59, f98 + FNMA f74 = f75, f59, f74 + FNMA f106 = f107, f59, f106 + FNMA f82 = f83, f59, f82 + FNMA f114 = f115, f59, f114 + FNMA f90 = f91, f59, f90 + FNMA f122 = f123, f59, f122 + ;; + FNMA f65 = f67, f60, f65 + FNMA f97 = f99, f60, f97 + FNMA f73 = f75, f60, f73 + FNMA f105 = f107, f60, f105 + FNMA f81 = f83, f60, f81 + FNMA f113 = f115, f60, f113 + FNMA f89 = f91, f60, f89 + FNMA f121 = f123, f60, f121 + ;; + { .mfi + STFD [BOFFSET] = f71, SIZE + FNMA f64 = f67, f61, f64 + } + { .mfi + STFD [BOFFSET2] = f103, SIZE + FNMA f96 = f99, f61, f96 + } + ;; + { .mfi + STFD [BOFFSET] = f79, SIZE + FNMA f72 = f75, f61, f72 + } + { .mfi + STFD [BOFFSET2] = f111, SIZE + FNMA f104 = f107, f61, f104 + } + ;; + { .mfi + STFD [BOFFSET] = f87, SIZE + FNMA f80 = f83, f61, f80 + } + { .mfi + STFD [BOFFSET2] = f119, SIZE + FNMA f112 = f115, f61, f112 + } + ;; + { .mfi + STFD [BOFFSET] = f95, - 11 * SIZE + FNMA f88 = f91, f61, f88 + } + { .mfi + STFD [BOFFSET2] = f127, - 11 * SIZE + FNMA f120 = f123, f61, f120 + } + ;; + { .mfi + STFD [BOFFSET] = f70, SIZE + FMPY f66 = f66, f16 + } + { .mfi + STFD [BOFFSET2] = f102, SIZE + FMPY f98 = f98, f16 + } + ;; + { .mfi + STFD [BOFFSET] = f78, SIZE + FMPY f74 = f74, f16 + } + { .mfi + STFD [BOFFSET2] = f110, SIZE + FMPY f106 = f106, f16 + } + ;; + { .mfi + STFD [BOFFSET] = f86, SIZE + FMPY f82 = f82, f16 + } + { .mfi + STFD [BOFFSET2] = f118, SIZE + FMPY f114 = f114, f16 + } + ;; + { .mfi + STFD [BOFFSET] = f94, - 11 * SIZE + FMPY f90 = f90, f16 + } + { .mfi + STFD [BOFFSET2] = f126, - 11 * SIZE + FMPY f122 = f122, f16 + } + ;; + { .mfi + STFD [BOFFSET] = f69, SIZE + FNMA f65 = f66, f17, f65 + } + { .mfi + STFD [BOFFSET2] = f101, SIZE + FNMA f97 = f98, f17, f97 + } + ;; + { .mfi 
+ STFD [BOFFSET] = f77, SIZE + FNMA f73 = f74, f17, f73 + } + { .mfi + STFD [BOFFSET2] = f109, SIZE + FNMA f105 = f106, f17, f105 + } + ;; + { .mfi + STFD [BOFFSET] = f85, SIZE + FNMA f81 = f82, f17, f81 + } + { .mfi + STFD [BOFFSET2] = f117, SIZE + FNMA f113 = f114, f17, f113 + } + ;; + { .mfi + STFD [BOFFSET] = f93, - 11 * SIZE + FNMA f89 = f90, f17, f89 + } + { .mfi + STFD [BOFFSET2] = f125, - 11 * SIZE + FNMA f121 = f122, f17, f121 + } + ;; + { .mfi + STFD [BOFFSET] = f68, SIZE + FNMA f64 = f66, f18, f64 + } + { .mfi + STFD [BOFFSET2] = f100, SIZE + FNMA f96 = f98, f18, f96 + } + ;; + { .mfi + STFD [BOFFSET] = f76, SIZE + FNMA f72 = f74, f18, f72 + } + { .mfi + STFD [BOFFSET2] = f108, SIZE + FNMA f104 = f106, f18, f104 + } + ;; + { .mfi + STFD [BOFFSET] = f84, SIZE + FNMA f80 = f82, f18, f80 + } + { .mfi + STFD [BOFFSET2] = f116, SIZE + FNMA f112 = f114, f18, f112 + } + ;; + { .mfi + STFD [BOFFSET] = f92, - 11 * SIZE + FNMA f88 = f90, f18, f88 + } + { .mfi + STFD [BOFFSET2] = f124, - 11 * SIZE + FNMA f120 = f122, f18, f120 + } + ;; + { .mfi + STFD [BOFFSET] = f67, SIZE + FMPY f65 = f65, f19 + } + { .mfi + STFD [BOFFSET2] = f99, SIZE + FMPY f97 = f97, f19 + } + ;; + { .mfi + STFD [BOFFSET] = f75, SIZE + FMPY f73 = f73, f19 + } + { .mfi + STFD [BOFFSET2] = f107, SIZE + FMPY f105 = f105, f19 + } + ;; + { .mfi + STFD [BOFFSET] = f83, SIZE + FMPY f81 = f81, f19 + } + { .mfi + STFD [BOFFSET2] = f115, SIZE + FMPY f113 = f113, f19 + } + ;; + { .mfi + STFD [BOFFSET] = f91, - 11 * SIZE + FMPY f89 = f89, f19 + } + { .mfi + STFD [BOFFSET2] = f123, - 11 * SIZE + FMPY f121 = f121, f19 + } + ;; + { .mfi + STFD [BOFFSET] = f66, SIZE + FNMA f64 = f65, f20, f64 + } + { .mfi + STFD [BOFFSET2] = f98, SIZE + FNMA f96 = f97, f20, f96 + } + ;; + { .mfi + STFD [BOFFSET] = f74, SIZE + FNMA f72 = f73, f20, f72 + } + { .mfi + STFD [BOFFSET2] = f106, SIZE + FNMA f104 = f105, f20, f104 + } + ;; + { .mfi + STFD [BOFFSET] = f82, SIZE + FNMA f80 = f81, f20, f80 + } + { .mfi + STFD [BOFFSET2] = f114, SIZE + FNMA f112 = f113, f20, f112 + } + ;; + { .mfi + STFD [BOFFSET] = f90, -11 * SIZE + FNMA f88 = f89, f20, f88 + } + { .mfi + STFD [BOFFSET2] = f122, -11 * SIZE + FNMA f120 = f121, f20, f120 + } + ;; + { .mfi + STFD [BOFFSET] = f65, SIZE + FMPY f64 = f64, f21 + } + { .mfi + STFD [BOFFSET2] = f97, SIZE + FMPY f96 = f96, f21 + } + ;; + { .mfi + STFD [BOFFSET] = f73, SIZE + FMPY f72 = f72, f21 + } + { .mfi + STFD [BOFFSET2] = f105, SIZE + FMPY f104 = f104, f21 + } + ;; + { .mfi + STFD [BOFFSET] = f81, SIZE + FMPY f80 = f80, f21 + } + { .mfi + STFD [BOFFSET2] = f113, SIZE + FMPY f112 = f112, f21 + } + ;; + { .mfi + STFD [BOFFSET] = f89, - 11 * SIZE + FMPY f88 = f88, f21 + } + { .mfi + STFD [BOFFSET2] = f121, - 11 * SIZE + FMPY f120 = f120, f21 + } + ;; + { .mmi + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f96, SIZE + adds C1 = -8 * SIZE, C1 + } + ;; + { .mmi + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f104, SIZE + adds C2 = -8 * SIZE, C2 + } + ;; + { .mmi + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f112, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f88, - 3 * SIZE + STFD [BOFFSET2] = f120, - 3 * SIZE + adds C9 = 4 * SIZE, C1 + } + ;; +#endif + +#ifdef LT + { .mfi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + FMPY f64 = f64, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f96 = f96, f32 + adds BOFFSET2 = 4 * SIZE, BOFFSET + } + ;; + { .mfi + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + FMPY f72 = f72, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f104 = f104, f32 + nop __LINE__ + } + ;; + { 
.mfi + LDFPD f38, f39 = [AOFFSET] + FMPY f80 = f80, f32 + adds AOFFSET = 3 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMPY f112 = f112, f32 + nop __LINE__ + } + ;; + { .mfi + LDFD f40 = [AOFFSET], 1 * SIZE + FMPY f88 = f88, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f120 = f120, f32 + nop __LINE__ + } + ;; + { .mfi + LDFPD f41, f42 = [AOFFSET], 2 * SIZE + FNMA f65 = f64, f33, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f97 = f96, f33, f97 + nop __LINE__ + } + ;; + { .mfi + LDFPD f43, f44 = [AOFFSET], 2 * SIZE + FNMA f73 = f72, f33, f73 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f105 = f104, f33, f105 + nop __LINE__ + } + ;; + { .mfi + LDFPD f45, f46 = [AOFFSET] + FNMA f81 = f80, f33, f81 + adds AOFFSET = 4 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f113 = f112, f33, f113 + nop __LINE__ + } + ;; + { .mfi + LDFPD f47, f48 = [AOFFSET], 2 * SIZE + FNMA f89 = f88, f33, f89 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f121 = f120, f33, f121 + nop __LINE__ + } + ;; + { .mfi + LDFPD f49, f50 = [AOFFSET], 2 * SIZE + FNMA f66 = f64, f34, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f98 = f96, f34, f98 + nop __LINE__ + } + ;; + { .mfi + LDFPD f51, f52 = [AOFFSET] + FNMA f74 = f72, f34, f74 + adds AOFFSET = 5 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f106 = f104, f34, f106 + nop __LINE__ + } + ;; + { .mfi + LDFD f53 = [AOFFSET], 1 * SIZE + FNMA f82 = f80, f34, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f114 = f112, f34, f114 + nop __LINE__ + } + ;; + { .mfi + LDFPD f54, f55 = [AOFFSET], 2 * SIZE + FNMA f90 = f88, f34, f90 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f122 = f120, f34, f122 + nop __LINE__ + } + ;; + { .mfi + LDFPD f56, f57 = [AOFFSET] + FNMA f67 = f64, f35, f67 + adds AOFFSET = 6 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f99 = f96, f35, f99 + nop __LINE__ + } + ;; + { .mfi + LDFPD f58, f59 = [AOFFSET], 2 * SIZE + FNMA f75 = f72, f35, f75 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f107 = f104, f35, f107 + nop __LINE__ + } + ;; + { .mfi + LDFPD f60, f61 = [AOFFSET] + FNMA f83 = f80, f35, f83 + adds AOFFSET = 7 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f115 = f112, f35, f115 + nop __LINE__ + } + ;; + { .mfi + LDFD f16 = [AOFFSET], 1 * SIZE + FNMA f91 = f88, f35, f91 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f123 = f120, f35, f123 + nop __LINE__ + } + ;; + { .mfi + LDFPD f17, f18 = [AOFFSET] + FNMA f68 = f64, f36, f68 + adds AOFFSET = 8 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f100 = f96, f36, f100 + nop __LINE__ + } + ;; + { .mfi + LDFPD f19, f20 = [AOFFSET] + FNMA f76 = f72, f36, f76 + adds AOFFSET = 9 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f108 = f104, f36, f108 + nop __LINE__ + } + ;; + { .mfi + LDFD f21 = [AOFFSET] + FNMA f84 = f80, f36, f84 + adds AOFFSET = -63 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f116 = f112, f36, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f92 = f88, f36, f92 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f124 = f120, f36, f124 + nop __LINE__ + } + ;; + FNMA f69 = f64, f37, f69 + FNMA f101 = f96, f37, f101 + FNMA f77 = f72, f37, f77 + FNMA f109 = f104, f37, f109 + FNMA f85 = f80, f37, f85 + FNMA f117 = f112, f37, f117 + FNMA f93 = f88, f37, f93 + FNMA f125 = f120, f37, f125 + ;; + FNMA f70 = f64, f38, f70 + FNMA f102 = f96, f38, f102 + FNMA f78 = f72, f38, f78 + FNMA f110 = f104, f38, f110 + FNMA f86 = f80, f38, f86 + FNMA f118 = f112, f38, f118 + FNMA f94 = f88, f38, f94 + FNMA f126 = f120, f38, f126 + 
;; + FNMA f71 = f64, f39, f71 + FNMA f103 = f96, f39, f103 + FNMA f79 = f72, f39, f79 + FNMA f111 = f104, f39, f111 + FNMA f87 = f80, f39, f87 + FNMA f119 = f112, f39, f119 + FNMA f95 = f88, f39, f95 + FNMA f127 = f120, f39, f127 + ;; + FMPY f65 = f65, f40 + FMPY f97 = f97, f40 + FMPY f73 = f73, f40 + FMPY f105 = f105, f40 + FMPY f81 = f81, f40 + FMPY f113 = f113, f40 + FMPY f89 = f89, f40 + FMPY f121 = f121, f40 + ;; + FNMA f66 = f65, f41, f66 + FNMA f98 = f97, f41, f98 + FNMA f74 = f73, f41, f74 + FNMA f106 = f105, f41, f106 + FNMA f82 = f81, f41, f82 + FNMA f114 = f113, f41, f114 + FNMA f90 = f89, f41, f90 + FNMA f122 = f121, f41, f122 + FNMA f67 = f65, f42, f67 + FNMA f99 = f97, f42, f99 + FNMA f75 = f73, f42, f75 + FNMA f107 = f105, f42, f107 + FNMA f83 = f81, f42, f83 + FNMA f115 = f113, f42, f115 + FNMA f91 = f89, f42, f91 + FNMA f123 = f121, f42, f123 + ;; + FNMA f68 = f65, f43, f68 + FNMA f100 = f97, f43, f100 + FNMA f76 = f73, f43, f76 + FNMA f108 = f105, f43, f108 + FNMA f84 = f81, f43, f84 + FNMA f116 = f113, f43, f116 + FNMA f92 = f89, f43, f92 + FNMA f124 = f121, f43, f124 + ;; + FNMA f69 = f65, f44, f69 + FNMA f101 = f97, f44, f101 + FNMA f77 = f73, f44, f77 + FNMA f109 = f105, f44, f109 + FNMA f85 = f81, f44, f85 + FNMA f117 = f113, f44, f117 + FNMA f93 = f89, f44, f93 + FNMA f125 = f121, f44, f125 + ;; + FNMA f70 = f65, f45, f70 + FNMA f102 = f97, f45, f102 + FNMA f78 = f73, f45, f78 + FNMA f110 = f105, f45, f110 + FNMA f86 = f81, f45, f86 + FNMA f118 = f113, f45, f118 + FNMA f94 = f89, f45, f94 + FNMA f126 = f121, f45, f126 + ;; + FNMA f71 = f65, f46, f71 + FNMA f103 = f97, f46, f103 + FNMA f79 = f73, f46, f79 + FNMA f111 = f105, f46, f111 + FNMA f87 = f81, f46, f87 + FNMA f119 = f113, f46, f119 + FNMA f95 = f89, f46, f95 + FNMA f127 = f121, f46, f127 + ;; + FMPY f66 = f66, f47 + FMPY f98 = f98, f47 + FMPY f74 = f74, f47 + FMPY f106 = f106, f47 + FMPY f82 = f82, f47 + FMPY f114 = f114, f47 + FMPY f90 = f90, f47 + FMPY f122 = f122, f47 + ;; + FNMA f67 = f66, f48, f67 + FNMA f99 = f98, f48, f99 + FNMA f75 = f74, f48, f75 + FNMA f107 = f106, f48, f107 + FNMA f83 = f82, f48, f83 + FNMA f115 = f114, f48, f115 + FNMA f91 = f90, f48, f91 + FNMA f123 = f122, f48, f123 + FNMA f68 = f66, f49, f68 + FNMA f100 = f98, f49, f100 + FNMA f76 = f74, f49, f76 + FNMA f108 = f106, f49, f108 + FNMA f84 = f82, f49, f84 + FNMA f116 = f114, f49, f116 + FNMA f92 = f90, f49, f92 + FNMA f124 = f122, f49, f124 + ;; + FNMA f69 = f66, f50, f69 + FNMA f101 = f98, f50, f101 + FNMA f77 = f74, f50, f77 + FNMA f109 = f106, f50, f109 + FNMA f85 = f82, f50, f85 + FNMA f117 = f114, f50, f117 + FNMA f93 = f90, f50, f93 + FNMA f125 = f122, f50, f125 + ;; + FNMA f70 = f66, f51, f70 + FNMA f102 = f98, f51, f102 + FNMA f78 = f74, f51, f78 + FNMA f110 = f106, f51, f110 + FNMA f86 = f82, f51, f86 + FNMA f118 = f114, f51, f118 + FNMA f94 = f90, f51, f94 + FNMA f126 = f122, f51, f126 + ;; + FNMA f71 = f66, f52, f71 + FNMA f103 = f98, f52, f103 + FNMA f79 = f74, f52, f79 + FNMA f111 = f106, f52, f111 + FNMA f87 = f82, f52, f87 + FNMA f119 = f114, f52, f119 + FNMA f95 = f90, f52, f95 + FNMA f127 = f122, f52, f127 + ;; + FMPY f67 = f67, f53 + FMPY f99 = f99, f53 + FMPY f75 = f75, f53 + FMPY f107 = f107, f53 + FMPY f83 = f83, f53 + FMPY f115 = f115, f53 + FMPY f91 = f91, f53 + FMPY f123 = f123, f53 + ;; + FNMA f68 = f67, f54, f68 + FNMA f100 = f99, f54, f100 + FNMA f76 = f75, f54, f76 + FNMA f108 = f107, f54, f108 + FNMA f84 = f83, f54, f84 + FNMA f116 = f115, f54, f116 + FNMA f92 = f91, f54, f92 + FNMA f124 = f123, f54, 
f124 + ;; + FNMA f69 = f67, f55, f69 + FNMA f101 = f99, f55, f101 + FNMA f77 = f75, f55, f77 + FNMA f109 = f107, f55, f109 + FNMA f85 = f83, f55, f85 + FNMA f117 = f115, f55, f117 + FNMA f93 = f91, f55, f93 + FNMA f125 = f123, f55, f125 + ;; + FNMA f70 = f67, f56, f70 + FNMA f102 = f99, f56, f102 + FNMA f78 = f75, f56, f78 + FNMA f110 = f107, f56, f110 + FNMA f86 = f83, f56, f86 + FNMA f118 = f115, f56, f118 + FNMA f94 = f91, f56, f94 + FNMA f126 = f123, f56, f126 + ;; + FNMA f71 = f67, f57, f71 + FNMA f103 = f99, f57, f103 + FNMA f79 = f75, f57, f79 + FNMA f111 = f107, f57, f111 + FNMA f87 = f83, f57, f87 + FNMA f119 = f115, f57, f119 + FNMA f95 = f91, f57, f95 + FNMA f127 = f123, f57, f127 + ;; + FMPY f68 = f68, f58 + FMPY f100 = f100, f58 + FMPY f76 = f76, f58 + FMPY f108 = f108, f58 + FMPY f84 = f84, f58 + FMPY f116 = f116, f58 + FMPY f92 = f92, f58 + FMPY f124 = f124, f58 + ;; + FNMA f69 = f68, f59, f69 + FNMA f101 = f100, f59, f101 + FNMA f77 = f76, f59, f77 + FNMA f109 = f108, f59, f109 + FNMA f85 = f84, f59, f85 + FNMA f117 = f116, f59, f117 + FNMA f93 = f92, f59, f93 + FNMA f125 = f124, f59, f125 + ;; + FNMA f70 = f68, f60, f70 + FNMA f102 = f100, f60, f102 + FNMA f78 = f76, f60, f78 + FNMA f110 = f108, f60, f110 + FNMA f86 = f84, f60, f86 + FNMA f118 = f116, f60, f118 + FNMA f94 = f92, f60, f94 + FNMA f126 = f124, f60, f126 + ;; + { .mfi + STFD [BOFFSET] = f64, SIZE + FNMA f71 = f68, f61, f71 + } + { .mfi + STFD [BOFFSET2] = f96, SIZE + FNMA f103 = f100, f61, f103 + } + ;; + { .mfi + STFD [BOFFSET] = f72, SIZE + FNMA f79 = f76, f61, f79 + } + { .mfi + STFD [BOFFSET2] = f104, SIZE + FNMA f111 = f108, f61, f111 + } + ;; + { .mfi + STFD [BOFFSET] = f80, SIZE + FNMA f87 = f84, f61, f87 + } + { .mfi + STFD [BOFFSET2] = f112, SIZE + FNMA f119 = f116, f61, f119 + } + ;; + { .mfi + STFD [BOFFSET] = f88, 5 * SIZE + FNMA f95 = f92, f61, f95 + } + { .mfi + STFD [BOFFSET2] = f120, 5 * SIZE + FNMA f127 = f124, f61, f127 + } + ;; + { .mfi + STFD [BOFFSET] = f65, SIZE + FMPY f69 = f69, f16 + } + { .mfi + STFD [BOFFSET2] = f97, SIZE + FMPY f101 = f101, f16 + } + ;; + { .mfi + STFD [BOFFSET] = f73, SIZE + FMPY f77 = f77, f16 + } + { .mfi + STFD [BOFFSET2] = f105, SIZE + FMPY f109 = f109, f16 + } + ;; + { .mfi + STFD [BOFFSET] = f81, SIZE + FMPY f85 = f85, f16 + } + { .mfi + STFD [BOFFSET2] = f113, SIZE + FMPY f117 = f117, f16 + } + ;; + { .mfi + STFD [BOFFSET] = f89, 5 * SIZE + FMPY f93 = f93, f16 + } + { .mfi + STFD [BOFFSET2] = f121, 5 * SIZE + FMPY f125 = f125, f16 + } + ;; + { .mfi + STFD [BOFFSET] = f66, SIZE + FNMA f70 = f69, f17, f70 + } + { .mfi + STFD [BOFFSET2] = f98, SIZE + FNMA f102 = f101, f17, f102 + } + ;; + { .mfi + STFD [BOFFSET] = f74, SIZE + FNMA f78 = f77, f17, f78 + } + { .mfi + STFD [BOFFSET2] = f106, SIZE + FNMA f110 = f109, f17, f110 + } + ;; + { .mfi + STFD [BOFFSET] = f82, SIZE + FNMA f86 = f85, f17, f86 + } + { .mfi + STFD [BOFFSET2] = f114, SIZE + FNMA f118 = f117, f17, f118 + } + ;; + { .mfi + STFD [BOFFSET] = f90, 5 * SIZE + FNMA f94 = f93, f17, f94 + } + { .mfi + STFD [BOFFSET2] = f122, 5 * SIZE + FNMA f126 = f125, f17, f126 + } + ;; + { .mfi + STFD [BOFFSET] = f67, SIZE + FNMA f71 = f69, f18, f71 + } + { .mfi + STFD [BOFFSET2] = f99, SIZE + FNMA f103 = f101, f18, f103 + } + ;; + { .mfi + STFD [BOFFSET] = f75, SIZE + FNMA f79 = f77, f18, f79 + } + { .mfi + STFD [BOFFSET2] = f107, SIZE + FNMA f111 = f109, f18, f111 + } + ;; + { .mfi + STFD [BOFFSET] = f83, SIZE + FNMA f87 = f85, f18, f87 + } + { .mfi + STFD [BOFFSET2] = f115, SIZE + FNMA f119 = f117, f18, f119 + } + 
;; + { .mfi + STFD [BOFFSET] = f91, 5 * SIZE + FNMA f95 = f93, f18, f95 + } + { .mfi + STFD [BOFFSET2] = f123, 5 * SIZE + FNMA f127 = f125, f18, f127 + } + ;; + { .mfi + STFD [BOFFSET] = f68, SIZE + FMPY f70 = f70, f19 + } + { .mfi + STFD [BOFFSET2] = f100, SIZE + FMPY f102 = f102, f19 + } + ;; + { .mfi + STFD [BOFFSET] = f76, SIZE + FMPY f78 = f78, f19 + } + { .mfi + STFD [BOFFSET2] = f108, SIZE + FMPY f110 = f110, f19 + } + ;; + { .mfi + STFD [BOFFSET] = f84, SIZE + FMPY f86 = f86, f19 + } + { .mfi + STFD [BOFFSET2] = f116, SIZE + FMPY f118 = f118, f19 + } + ;; + { .mfi + STFD [BOFFSET] = f92, 5 * SIZE + FMPY f94 = f94, f19 + } + { .mfi + STFD [BOFFSET2] = f124, 5 * SIZE + FMPY f126 = f126, f19 + } + ;; + { .mfi + STFD [BOFFSET] = f69, SIZE + FNMA f71 = f70, f20, f71 + } + { .mfi + STFD [BOFFSET2] = f101, SIZE + FNMA f103 = f102, f20, f103 + } + ;; + { .mfi + STFD [BOFFSET] = f77, SIZE + FNMA f79 = f78, f20, f79 + } + { .mfi + STFD [BOFFSET2] = f109, SIZE + FNMA f111 = f110, f20, f111 + } + ;; + { .mfi + STFD [BOFFSET] = f85, SIZE + FNMA f87 = f86, f20, f87 + } + { .mfi + STFD [BOFFSET2] = f117, SIZE + FNMA f119 = f118, f20, f119 + } + ;; + { .mfi + STFD [BOFFSET] = f93, 5 * SIZE + FNMA f95 = f94, f20, f95 + } + { .mfi + STFD [BOFFSET2] = f125, 5 * SIZE + FNMA f127 = f126, f20, f127 + } + ;; + { .mfi + STFD [BOFFSET] = f70, SIZE + FMPY f71 = f71, f21 + } + { .mfi + STFD [BOFFSET2] = f102, SIZE + FMPY f103 = f103, f21 + } + ;; + { .mfi + STFD [BOFFSET] = f78, SIZE + FMPY f79 = f79, f21 + } + { .mfi + STFD [BOFFSET2] = f110, SIZE + FMPY f111 = f111, f21 + } + ;; + { .mfi + STFD [BOFFSET] = f86, SIZE + FMPY f87 = f87, f21 + } + { .mfi + STFD [BOFFSET2] = f118, SIZE + FMPY f119 = f119, f21 + } + ;; + { .mfi + STFD [BOFFSET] = f94, 5 * SIZE + FMPY f95 = f95, f21 + } + { .mfi + STFD [BOFFSET2] = f126, 5 * SIZE + FMPY f127 = f127, f21 + } + ;; + { .mmi + STFD [BOFFSET] = f71, SIZE + STFD [BOFFSET2] = f103, SIZE + } + ;; + { .mmi + STFD [BOFFSET] = f79, SIZE + STFD [BOFFSET2] = f111, SIZE + } + ;; + { .mmi + STFD [BOFFSET] = f87, SIZE + STFD [BOFFSET2] = f119, SIZE + adds C9 = 4 * SIZE, C1 + } + ;; + { .mfi + STFD [BOFFSET] = f95 + adds BOFFSET = - 59 * SIZE, BOFFSET + } + { .mfi + STFD [BOFFSET2] = f127 + adds BOFFSET2 = - 59 * SIZE, BOFFSET2 + } + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + { .mfi + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + FMPY f64 = f64, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f68 = f68, f32 + nop __LINE__ + } + ;; + { .mfi + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + FMPY f65 = f65, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f69 = f69, f32 + nop __LINE__ + } + ;; + { .mfi + LDFPD f38, f39 = [BOFFSET] + FMPY f66 = f66, f32 + adds BOFFSET = 3 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMPY f70 = f70, f32 + nop __LINE__ + } + ;; + { .mfi + LDFD f40 = [BOFFSET], 1 * SIZE + FMPY f67 = f67, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f71 = f71, f32 + nop __LINE__ + } + ;; + { .mfi + LDFPD f41, f42 = [BOFFSET], 2 * SIZE + FNMA f72 = f64, f33, f72 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f76 = f68, f33, f76 + nop __LINE__ + } + ;; + { .mfi + LDFPD f43, f44 = [BOFFSET], 2 * SIZE + FNMA f73 = f65, f33, f73 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f77 = f69, f33, f77 + nop __LINE__ + } + ;; + { .mfi + LDFPD f45, f46 = [BOFFSET] + FNMA f74 = f66, f33, f74 + adds BOFFSET = 4 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FNMA f78 = f70, f33, f78 + nop __LINE__ + } + ;; + { .mfi + LDFPD f47, f48 = [BOFFSET], 2 * 
SIZE + FNMA f75 = f67, f33, f75 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f79 = f71, f33, f79 + nop __LINE__ + } + ;; + { .mfi + LDFPD f49, f50 = [BOFFSET], 2 * SIZE + FNMA f80 = f64, f34, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f84 = f68, f34, f84 + nop __LINE__ + } + ;; + { .mfi + LDFPD f51, f52 = [BOFFSET] + FNMA f81 = f65, f34, f81 + adds BOFFSET = 5 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FNMA f85 = f69, f34, f85 + nop __LINE__ + } + ;; + { .mfi + LDFD f53 = [BOFFSET], 1 * SIZE + FNMA f82 = f66, f34, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f86 = f70, f34, f86 + nop __LINE__ + } + ;; + { .mfi + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + FNMA f83 = f67, f34, f83 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f87 = f71, f34, f87 + nop __LINE__ + } + ;; + { .mfi + LDFPD f56, f57 = [BOFFSET] + FNMA f88 = f64, f35, f88 + adds BOFFSET = 6 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FNMA f92 = f68, f35, f92 + nop __LINE__ + } + ;; + { .mfi + LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FNMA f89 = f65, f35, f89 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f93 = f69, f35, f93 + nop __LINE__ + } + ;; + { .mfi + LDFPD f60, f61 = [BOFFSET] + FNMA f90 = f66, f35, f90 + adds BOFFSET = 7 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FNMA f94 = f70, f35, f94 + nop __LINE__ + } + ;; + { .mfi + LDFD f16 = [BOFFSET], 1 * SIZE + FNMA f91 = f67, f35, f91 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f95 = f71, f35, f95 + nop __LINE__ + } + ;; + { .mfi + LDFPD f17, f18 = [BOFFSET] + FNMA f96 = f64, f36, f96 + adds BOFFSET = 8 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FNMA f100 = f68, f36, f100 + nop __LINE__ + } + ;; + { .mfi + LDFPD f19, f20 = [BOFFSET] + FNMA f97 = f65, f36, f97 + adds BOFFSET = 9 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FNMA f101 = f69, f36, f101 + nop __LINE__ + } + ;; + { .mfi + LDFD f21 = [BOFFSET] + FNMA f98 = f66, f36, f98 + adds BOFFSET = -63 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FNMA f102 = f70, f36, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f99 = f67, f36, f99 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f103 = f71, f36, f103 + nop __LINE__ + } + ;; + FNMA f104 = f64, f37, f104 + FNMA f108 = f68, f37, f108 + FNMA f105 = f65, f37, f105 + FNMA f109 = f69, f37, f109 + FNMA f106 = f66, f37, f106 + FNMA f110 = f70, f37, f110 + FNMA f107 = f67, f37, f107 + FNMA f111 = f71, f37, f111 + ;; + FNMA f112 = f64, f38, f112 + FNMA f116 = f68, f38, f116 + FNMA f113 = f65, f38, f113 + FNMA f117 = f69, f38, f117 + FNMA f114 = f66, f38, f114 + FNMA f118 = f70, f38, f118 + FNMA f115 = f67, f38, f115 + FNMA f119 = f71, f38, f119 + ;; + FNMA f120 = f64, f39, f120 + FNMA f124 = f68, f39, f124 + FNMA f121 = f65, f39, f121 + FNMA f125 = f69, f39, f125 + FNMA f122 = f66, f39, f122 + FNMA f126 = f70, f39, f126 + FNMA f123 = f67, f39, f123 + FNMA f127 = f71, f39, f127 + ;; + FMPY f72 = f72, f40 + FMPY f76 = f76, f40 + FMPY f73 = f73, f40 + FMPY f77 = f77, f40 + FMPY f74 = f74, f40 + FMPY f78 = f78, f40 + FMPY f75 = f75, f40 + FMPY f79 = f79, f40 + ;; + FNMA f80 = f72, f41, f80 + FNMA f84 = f76, f41, f84 + FNMA f81 = f73, f41, f81 + FNMA f85 = f77, f41, f85 + FNMA f82 = f74, f41, f82 + FNMA f86 = f78, f41, f86 + FNMA f83 = f75, f41, f83 + FNMA f87 = f79, f41, f87 + ;; + FNMA f88 = f72, f42, f88 + FNMA f92 = f76, f42, f92 + FNMA f89 = f73, f42, f89 + FNMA f93 = f77, f42, f93 + FNMA f90 = f74, f42, f90 + FNMA f94 = f78, f42, f94 + FNMA f91 = f75, f42, f91 + FNMA f95 = f79, f42, f95 + ;; + FNMA f96 = f72, f43, f96 + FNMA f100 
= f76, f43, f100 + FNMA f97 = f73, f43, f97 + FNMA f101 = f77, f43, f101 + FNMA f98 = f74, f43, f98 + FNMA f102 = f78, f43, f102 + FNMA f99 = f75, f43, f99 + FNMA f103 = f79, f43, f103 + ;; + FNMA f104 = f72, f44, f104 + FNMA f108 = f76, f44, f108 + FNMA f105 = f73, f44, f105 + FNMA f109 = f77, f44, f109 + FNMA f106 = f74, f44, f106 + FNMA f110 = f78, f44, f110 + FNMA f107 = f75, f44, f107 + FNMA f111 = f79, f44, f111 + ;; + FNMA f112 = f72, f45, f112 + FNMA f116 = f76, f45, f116 + FNMA f113 = f73, f45, f113 + FNMA f117 = f77, f45, f117 + FNMA f114 = f74, f45, f114 + FNMA f118 = f78, f45, f118 + FNMA f115 = f75, f45, f115 + FNMA f119 = f79, f45, f119 + ;; + FNMA f120 = f72, f46, f120 + FNMA f124 = f76, f46, f124 + FNMA f121 = f73, f46, f121 + FNMA f125 = f77, f46, f125 + FNMA f122 = f74, f46, f122 + FNMA f126 = f78, f46, f126 + FNMA f123 = f75, f46, f123 + FNMA f127 = f79, f46, f127 + ;; + FMPY f80 = f80, f47 + FMPY f84 = f84, f47 + FMPY f81 = f81, f47 + FMPY f85 = f85, f47 + FMPY f82 = f82, f47 + FMPY f86 = f86, f47 + FMPY f83 = f83, f47 + FMPY f87 = f87, f47 + ;; + FNMA f88 = f80, f48, f88 + FNMA f92 = f84, f48, f92 + FNMA f89 = f81, f48, f89 + FNMA f93 = f85, f48, f93 + FNMA f90 = f82, f48, f90 + FNMA f94 = f86, f48, f94 + FNMA f91 = f83, f48, f91 + FNMA f95 = f87, f48, f95 + ;; + FNMA f96 = f80, f49, f96 + FNMA f100 = f84, f49, f100 + FNMA f97 = f81, f49, f97 + FNMA f101 = f85, f49, f101 + FNMA f98 = f82, f49, f98 + FNMA f102 = f86, f49, f102 + FNMA f99 = f83, f49, f99 + FNMA f103 = f87, f49, f103 + ;; + FNMA f104 = f80, f50, f104 + FNMA f108 = f84, f50, f108 + FNMA f105 = f81, f50, f105 + FNMA f109 = f85, f50, f109 + FNMA f106 = f82, f50, f106 + FNMA f110 = f86, f50, f110 + FNMA f107 = f83, f50, f107 + FNMA f111 = f87, f50, f111 + ;; + FNMA f112 = f80, f51, f112 + FNMA f116 = f84, f51, f116 + FNMA f113 = f81, f51, f113 + FNMA f117 = f85, f51, f117 + FNMA f114 = f82, f51, f114 + FNMA f118 = f86, f51, f118 + FNMA f115 = f83, f51, f115 + FNMA f119 = f87, f51, f119 + ;; + FNMA f120 = f80, f52, f120 + FNMA f124 = f84, f52, f124 + FNMA f121 = f81, f52, f121 + FNMA f125 = f85, f52, f125 + FNMA f122 = f82, f52, f122 + FNMA f126 = f86, f52, f126 + FNMA f123 = f83, f52, f123 + FNMA f127 = f87, f52, f127 + ;; + FMPY f88 = f88, f53 + FMPY f92 = f92, f53 + FMPY f89 = f89, f53 + FMPY f93 = f93, f53 + FMPY f90 = f90, f53 + FMPY f94 = f94, f53 + FMPY f91 = f91, f53 + FMPY f95 = f95, f53 + ;; + FNMA f96 = f88, f54, f96 + FNMA f100 = f92, f54, f100 + FNMA f97 = f89, f54, f97 + FNMA f101 = f93, f54, f101 + FNMA f98 = f90, f54, f98 + FNMA f102 = f94, f54, f102 + FNMA f99 = f91, f54, f99 + FNMA f103 = f95, f54, f103 + ;; + FNMA f104 = f88, f55, f104 + FNMA f108 = f92, f55, f108 + FNMA f105 = f89, f55, f105 + FNMA f109 = f93, f55, f109 + FNMA f106 = f90, f55, f106 + FNMA f110 = f94, f55, f110 + FNMA f107 = f91, f55, f107 + FNMA f111 = f95, f55, f111 + ;; + FNMA f112 = f88, f56, f112 + FNMA f116 = f92, f56, f116 + FNMA f113 = f89, f56, f113 + FNMA f117 = f93, f56, f117 + FNMA f114 = f90, f56, f114 + FNMA f118 = f94, f56, f118 + FNMA f115 = f91, f56, f115 + FNMA f119 = f95, f56, f119 + ;; + FNMA f120 = f88, f57, f120 + FNMA f124 = f92, f57, f124 + FNMA f121 = f89, f57, f121 + FNMA f125 = f93, f57, f125 + FNMA f122 = f90, f57, f122 + FNMA f126 = f94, f57, f126 + FNMA f123 = f91, f57, f123 + FNMA f127 = f95, f57, f127 + ;; + FMPY f96 = f96, f58 + FMPY f100 = f100, f58 + FMPY f97 = f97, f58 + FMPY f101 = f101, f58 + FMPY f98 = f98, f58 + FMPY f102 = f102, f58 + FMPY f99 = f99, f58 + FMPY f103 = f103, f58 + ;; + 
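+// Annotation: this RN substitution sequence operates on the packed panel held at
+// [BOFFSET]/[AOFFSET]. No reciprocal is computed anywhere in this path, so the
+// diagonal entries of the triangular factor are presumably stored pre-inverted by
+// the packing routine (an assumption; it is not visible in this hunk): each FMPY
+// scales a finished row by that inverse diagonal, and each FNMA (dest = dest - a*b)
+// subtracts the solved row's contribution from the rows still to be processed,
+// with STFD stores interleaved between the updates to hide floating-point latency.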
FNMA f104 = f96, f59, f104 + FNMA f108 = f100, f59, f108 + FNMA f105 = f97, f59, f105 + FNMA f109 = f101, f59, f109 + FNMA f106 = f98, f59, f106 + FNMA f110 = f102, f59, f110 + FNMA f107 = f99, f59, f107 + FNMA f111 = f103, f59, f111 + ;; + FNMA f112 = f96, f60, f112 + FNMA f116 = f100, f60, f116 + FNMA f113 = f97, f60, f113 + FNMA f117 = f101, f60, f117 + FNMA f114 = f98, f60, f114 + FNMA f118 = f102, f60, f118 + FNMA f115 = f99, f60, f115 + FNMA f119 = f103, f60, f119 + ;; + { .mfi + STFD [AOFFSET] = f64, SIZE + FNMA f120 = f96, f61, f120 + } + { .mfi + STFD [AOFFSET2] = f68, SIZE + FNMA f124 = f100, f61, f124 + } + ;; + { .mfi + STFD [AOFFSET] = f65, SIZE + FNMA f121 = f97, f61, f121 + } + { .mfi + STFD [AOFFSET2] = f69, SIZE + FNMA f125 = f101, f61, f125 + } + ;; + { .mfi + STFD [AOFFSET] = f66, SIZE + FNMA f122 = f98, f61, f122 + } + { .mfi + STFD [AOFFSET2] = f70, SIZE + FNMA f126 = f102, f61, f126 + } + ;; + { .mfi + STFD [AOFFSET] = f67, 5 * SIZE + FNMA f123 = f99, f61, f123 + } + { .mfi + STFD [AOFFSET2] = f71, 5 * SIZE + FNMA f127 = f103, f61, f127 + } + ;; + { .mfi + STFD [AOFFSET] = f72, SIZE + FMPY f104 = f104, f16 + } + { .mfi + STFD [AOFFSET2] = f76, SIZE + FMPY f108 = f108, f16 + } + ;; + { .mfi + STFD [AOFFSET] = f73, SIZE + FMPY f105 = f105, f16 + } + { .mfi + STFD [AOFFSET2] = f77, SIZE + FMPY f109 = f109, f16 + } + ;; + { .mfi + STFD [AOFFSET] = f74, SIZE + FMPY f106 = f106, f16 + } + { .mfi + STFD [AOFFSET2] = f78, SIZE + FMPY f110 = f110, f16 + } + ;; + { .mfi + STFD [AOFFSET] = f75, 5 * SIZE + FMPY f107 = f107, f16 + } + { .mfi + STFD [AOFFSET2] = f79, 5 * SIZE + FMPY f111 = f111, f16 + } + ;; + { .mfi + STFD [AOFFSET] = f80, SIZE + FNMA f112 = f104, f17, f112 + } + { .mfi + STFD [AOFFSET2] = f84, SIZE + FNMA f116 = f108, f17, f116 + } + ;; + { .mfi + STFD [AOFFSET] = f81, SIZE + FNMA f113 = f105, f17, f113 + } + { .mfi + STFD [AOFFSET2] = f85, SIZE + FNMA f117 = f109, f17, f117 + } + ;; + { .mfi + STFD [AOFFSET] = f82, SIZE + FNMA f114 = f106, f17, f114 + } + { .mfi + STFD [AOFFSET2] = f86, SIZE + FNMA f118 = f110, f17, f118 + } + ;; + { .mfi + STFD [AOFFSET] = f83, 5 * SIZE + FNMA f115 = f107, f17, f115 + } + { .mfi + STFD [AOFFSET2] = f87, 5 * SIZE + FNMA f119 = f111, f17, f119 + } + ;; + { .mfi + STFD [AOFFSET] = f88, SIZE + FNMA f120 = f104, f18, f120 + } + { .mfi + STFD [AOFFSET2] = f92, SIZE + FNMA f124 = f108, f18, f124 + } + ;; + { .mfi + STFD [AOFFSET] = f89, SIZE + FNMA f121 = f105, f18, f121 + } + { .mfi + STFD [AOFFSET2] = f93, SIZE + FNMA f125 = f109, f18, f125 + } + ;; + { .mfi + STFD [AOFFSET] = f90, SIZE + FNMA f122 = f106, f18, f122 + } + { .mfi + STFD [AOFFSET2] = f94, SIZE + FNMA f126 = f110, f18, f126 + } + ;; + { .mfi + STFD [AOFFSET] = f91, 5 * SIZE + FNMA f123 = f107, f18, f123 + } + { .mfi + STFD [AOFFSET2] = f95, 5 * SIZE + FNMA f127 = f111, f18, f127 + } + ;; + { .mfi + STFD [AOFFSET] = f96, SIZE + FMPY f112 = f112, f19 + } + { .mfi + STFD [AOFFSET2] = f100, SIZE + FMPY f116 = f116, f19 + } + ;; + { .mfi + STFD [AOFFSET] = f97, SIZE + FMPY f113 = f113, f19 + } + { .mfi + STFD [AOFFSET2] = f101, SIZE + FMPY f117 = f117, f19 + } + ;; + { .mfi + STFD [AOFFSET] = f98, SIZE + FMPY f114 = f114, f19 + } + { .mfi + STFD [AOFFSET2] = f102, SIZE + FMPY f118 = f118, f19 + } + ;; + { .mfi + STFD [AOFFSET] = f99, 5 * SIZE + FMPY f115 = f115, f19 + } + { .mfi + STFD [AOFFSET2] = f103, 5 * SIZE + FMPY f119 = f119, f19 + } + ;; + { .mfi + STFD [AOFFSET] = f104, SIZE + FNMA f120 = f112, f20, f120 + } + { .mfi + STFD [AOFFSET2] = f108, SIZE + FNMA f124 = 
f116, f20, f124 + } + ;; + { .mfi + STFD [AOFFSET] = f105, SIZE + FNMA f121 = f113, f20, f121 + } + { .mfi + STFD [AOFFSET2] = f109, SIZE + FNMA f125 = f117, f20, f125 + } + ;; + { .mfi + STFD [AOFFSET] = f106, SIZE + FNMA f122 = f114, f20, f122 + } + { .mfi + STFD [AOFFSET2] = f110, SIZE + FNMA f126 = f118, f20, f126 + } + ;; + { .mfi + STFD [AOFFSET] = f107, 5 * SIZE + FNMA f123 = f115, f20, f123 + } + { .mfi + STFD [AOFFSET2] = f111, 5 * SIZE + FNMA f127 = f119, f20, f127 + } + ;; + { .mfi + STFD [AOFFSET] = f112, SIZE + FMPY f120 = f120, f21 + } + { .mfi + STFD [AOFFSET2] = f116, SIZE + FMPY f124 = f124, f21 + } + ;; + { .mfi + STFD [AOFFSET] = f113, SIZE + FMPY f121 = f121, f21 + } + { .mfi + STFD [AOFFSET2] = f117, SIZE + FMPY f125 = f125, f21 + } + ;; + { .mfi + STFD [AOFFSET] = f114, SIZE + FMPY f122 = f122, f21 + } + { .mfi + STFD [AOFFSET2] = f118, SIZE + FMPY f126 = f126, f21 + } + ;; + { .mfi + STFD [AOFFSET] = f115, 5 * SIZE + FMPY f123 = f123, f21 + } + { .mfi + STFD [AOFFSET2] = f119, 5 * SIZE + FMPY f127 = f127, f21 + } + ;; + { .mmi + STFD [AOFFSET] = f120, SIZE + STFD [AOFFSET2] = f124, SIZE + } + ;; + { .mmi + STFD [AOFFSET] = f121, SIZE + STFD [AOFFSET2] = f125, SIZE + } + ;; + { .mmi + STFD [AOFFSET] = f122, SIZE + STFD [AOFFSET2] = f126, SIZE + adds C9 = 4 * SIZE, C1 + } + ;; + { .mfi + STFD [AOFFSET] = f123 + adds AOFFSET = - 59 * SIZE, AOFFSET + } + { .mfi + STFD [AOFFSET2] = f127 + adds AOFFSET2 = - 59 * SIZE, AOFFSET2 + } + ;; +#endif + +#ifdef RT + adds BOFFSET = 62 * SIZE, BOFFSET + ;; + LDFPD f33, f32 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + { .mfi + LDFPD f35, f34 = [BOFFSET] + FMPY f120 = f120, f32 + adds BOFFSET = - 2 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMPY f124 = f124, f32 + nop __LINE__ + } + ;; + { .mfi + LDFPD f37, f36 = [BOFFSET] + FMPY f121 = f121, f32 + adds BOFFSET = - 2 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMPY f125 = f125, f32 + nop __LINE__ + } + ;; + { .mfi + LDFPD f39, f38 = [BOFFSET] + FMPY f122 = f122, f32 + adds BOFFSET = - 2 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMPY f126 = f126, f32 + nop __LINE__ + } + ;; + { .mfi + LDFD f40 = [BOFFSET], -2 * SIZE + FMPY f123 = f123, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f127 = f127, f32 + nop __LINE__ + } + ;; + { .mfi + LDFPD f42, f41 = [BOFFSET] + FNMA f112 = f120, f33, f112 + adds BOFFSET = - 2 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FNMA f116 = f124, f33, f116 + nop __LINE__ + } + ;; + { .mfi + LDFPD f44, f43 = [BOFFSET] + FNMA f113 = f121, f33, f113 + adds BOFFSET = - 2 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FNMA f117 = f125, f33, f117 + nop __LINE__ + } + ;; + { .mfi + LDFPD f46, f45 = [BOFFSET] + FNMA f114 = f122, f33, f114 + adds BOFFSET = - 4 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FNMA f118 = f126, f33, f118 + nop __LINE__ + } + ;; + { .mfi + LDFPD f48, f47 = [BOFFSET] + FNMA f115 = f123, f33, f115 + adds BOFFSET = - 2 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FNMA f119 = f127, f33, f119 + nop __LINE__ + } + ;; + { .mfi + LDFPD f50, f49 = [BOFFSET] + FNMA f104 = f120, f34, f104 + adds BOFFSET = - 2 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FNMA f108 = f124, f34, f108 + nop __LINE__ + } + ;; + { .mfi + LDFPD f52, f51 = [BOFFSET] + FNMA f105 = f121, f34, f105 + adds BOFFSET = - 4 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FNMA f109 = f125, f34, f109 + nop __LINE__ + } + ;; + { .mfi + LDFD f53 = [BOFFSET], -2 * SIZE + FNMA f106 = f122, f34, f106 + } + { .mfi + nop __LINE__ + FNMA f110 = f126, f34, f110 + nop 
__LINE__ + } + ;; + { .mfi + LDFPD f55, f54 = [BOFFSET] + FNMA f107 = f123, f34, f107 + adds BOFFSET = - 2 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FNMA f111 = f127, f34, f111 + nop __LINE__ + } + ;; + { .mfi + LDFPD f57, f56 = [BOFFSET] + FNMA f96 = f120, f35, f96 + adds BOFFSET = - 6 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FNMA f100 = f124, f35, f100 + nop __LINE__ + } + ;; + { .mfi + LDFPD f59, f58 = [BOFFSET] + FNMA f97 = f121, f35, f97 + adds BOFFSET = - 2 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FNMA f101 = f125, f35, f101 + nop __LINE__ + } + ;; + { .mfi + LDFPD f61, f60 = [BOFFSET] + FNMA f98 = f122, f35, f98 + adds BOFFSET = - 6 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FNMA f102 = f126, f35, f102 + nop __LINE__ + } + ;; + { .mfi + LDFD f16 = [BOFFSET], -2 * SIZE + FNMA f99 = f123, f35, f99 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f103 = f127, f35, f103 + nop __LINE__ + } + ;; + { .mfi + LDFPD f18, f17 = [BOFFSET] + FNMA f88 = f120, f36, f88 + adds BOFFSET = - 8 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FNMA f92 = f124, f36, f92 + nop __LINE__ + } + ;; + { .mfi + LDFPD f20, f19 = [BOFFSET] + FNMA f89 = f121, f36, f89 + adds BOFFSET = - 8 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FNMA f93 = f125, f36, f93 + nop __LINE__ + } + ;; + { .mfi + LDFD f21 = [BOFFSET] + FNMA f90 = f122, f36, f90 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f94 = f126, f36, f94 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f91 = f123, f36, f91 + adds AOFFSET = 56 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FNMA f95 = f127, f36, f95 + adds AOFFSET2 = 56 * SIZE, AOFFSET2 + } + ;; + FNMA f80 = f120, f37, f80 + FNMA f84 = f124, f37, f84 + FNMA f81 = f121, f37, f81 + FNMA f85 = f125, f37, f85 + FNMA f82 = f122, f37, f82 + FNMA f86 = f126, f37, f86 + FNMA f83 = f123, f37, f83 + FNMA f87 = f127, f37, f87 + ;; + FNMA f72 = f120, f38, f72 + FNMA f76 = f124, f38, f76 + FNMA f73 = f121, f38, f73 + FNMA f77 = f125, f38, f77 + FNMA f74 = f122, f38, f74 + FNMA f78 = f126, f38, f78 + FNMA f75 = f123, f38, f75 + FNMA f79 = f127, f38, f79 + ;; + FNMA f64 = f120, f39, f64 + FNMA f68 = f124, f39, f68 + FNMA f65 = f121, f39, f65 + FNMA f69 = f125, f39, f69 + FNMA f66 = f122, f39, f66 + FNMA f70 = f126, f39, f70 + FNMA f67 = f123, f39, f67 + FNMA f71 = f127, f39, f71 + ;; + FMPY f112 = f112, f40 + FMPY f116 = f116, f40 + FMPY f113 = f113, f40 + FMPY f117 = f117, f40 + FMPY f114 = f114, f40 + FMPY f118 = f118, f40 + FMPY f115 = f115, f40 + FMPY f119 = f119, f40 + ;; + FNMA f104 = f112, f41, f104 + FNMA f108 = f116, f41, f108 + FNMA f105 = f113, f41, f105 + FNMA f109 = f117, f41, f109 + FNMA f106 = f114, f41, f106 + FNMA f110 = f118, f41, f110 + FNMA f107 = f115, f41, f107 + FNMA f111 = f119, f41, f111 + ;; + FNMA f96 = f112, f42, f96 + FNMA f100 = f116, f42, f100 + FNMA f97 = f113, f42, f97 + FNMA f101 = f117, f42, f101 + FNMA f98 = f114, f42, f98 + FNMA f102 = f118, f42, f102 + FNMA f99 = f115, f42, f99 + FNMA f103 = f119, f42, f103 + ;; + FNMA f88 = f112, f43, f88 + FNMA f92 = f116, f43, f92 + FNMA f89 = f113, f43, f89 + FNMA f93 = f117, f43, f93 + FNMA f90 = f114, f43, f90 + FNMA f94 = f118, f43, f94 + FNMA f91 = f115, f43, f91 + FNMA f95 = f119, f43, f95 + ;; + FNMA f80 = f112, f44, f80 + FNMA f84 = f116, f44, f84 + FNMA f81 = f113, f44, f81 + FNMA f85 = f117, f44, f85 + FNMA f82 = f114, f44, f82 + FNMA f86 = f118, f44, f86 + FNMA f83 = f115, f44, f83 + FNMA f87 = f119, f44, f87 + ;; + FNMA f72 = f112, f45, f72 + FNMA f76 = f116, f45, f76 + FNMA f73 = f113, f45, f73 + FNMA 
f77 = f117, f45, f77 + FNMA f74 = f114, f45, f74 + FNMA f78 = f118, f45, f78 + FNMA f75 = f115, f45, f75 + FNMA f79 = f119, f45, f79 + ;; + FNMA f64 = f112, f46, f64 + FNMA f68 = f116, f46, f68 + FNMA f65 = f113, f46, f65 + FNMA f69 = f117, f46, f69 + FNMA f66 = f114, f46, f66 + FNMA f70 = f118, f46, f70 + FNMA f67 = f115, f46, f67 + FNMA f71 = f119, f46, f71 + ;; + FMPY f104 = f104, f47 + FMPY f108 = f108, f47 + FMPY f105 = f105, f47 + FMPY f109 = f109, f47 + FMPY f106 = f106, f47 + FMPY f110 = f110, f47 + FMPY f107 = f107, f47 + FMPY f111 = f111, f47 + ;; + FNMA f96 = f104, f48, f96 + FNMA f100 = f108, f48, f100 + FNMA f97 = f105, f48, f97 + FNMA f101 = f109, f48, f101 + FNMA f98 = f106, f48, f98 + FNMA f102 = f110, f48, f102 + FNMA f99 = f107, f48, f99 + FNMA f103 = f111, f48, f103 + ;; + FNMA f88 = f104, f49, f88 + FNMA f92 = f108, f49, f92 + FNMA f89 = f105, f49, f89 + FNMA f93 = f109, f49, f93 + FNMA f90 = f106, f49, f90 + FNMA f94 = f110, f49, f94 + FNMA f91 = f107, f49, f91 + FNMA f95 = f111, f49, f95 + ;; + FNMA f80 = f104, f50, f80 + FNMA f84 = f108, f50, f84 + FNMA f81 = f105, f50, f81 + FNMA f85 = f109, f50, f85 + FNMA f82 = f106, f50, f82 + FNMA f86 = f110, f50, f86 + FNMA f83 = f107, f50, f83 + FNMA f87 = f111, f50, f87 + ;; + FNMA f72 = f104, f51, f72 + FNMA f76 = f108, f51, f76 + FNMA f73 = f105, f51, f73 + FNMA f77 = f109, f51, f77 + FNMA f74 = f106, f51, f74 + FNMA f78 = f110, f51, f78 + FNMA f75 = f107, f51, f75 + FNMA f79 = f111, f51, f79 + ;; + FNMA f64 = f104, f52, f64 + FNMA f68 = f108, f52, f68 + FNMA f65 = f105, f52, f65 + FNMA f69 = f109, f52, f69 + FNMA f66 = f106, f52, f66 + FNMA f70 = f110, f52, f70 + FNMA f67 = f107, f52, f67 + FNMA f71 = f111, f52, f71 + ;; + FMPY f96 = f96, f53 + FMPY f100 = f100, f53 + FMPY f97 = f97, f53 + FMPY f101 = f101, f53 + FMPY f98 = f98, f53 + FMPY f102 = f102, f53 + FMPY f99 = f99, f53 + FMPY f103 = f103, f53 + ;; + FNMA f88 = f96, f54, f88 + FNMA f92 = f100, f54, f92 + FNMA f89 = f97, f54, f89 + FNMA f93 = f101, f54, f93 + FNMA f90 = f98, f54, f90 + FNMA f94 = f102, f54, f94 + FNMA f91 = f99, f54, f91 + FNMA f95 = f103, f54, f95 + ;; + FNMA f80 = f96, f55, f80 + FNMA f84 = f100, f55, f84 + FNMA f81 = f97, f55, f81 + FNMA f85 = f101, f55, f85 + FNMA f82 = f98, f55, f82 + FNMA f86 = f102, f55, f86 + FNMA f83 = f99, f55, f83 + FNMA f87 = f103, f55, f87 + ;; + FNMA f72 = f96, f56, f72 + FNMA f76 = f100, f56, f76 + FNMA f73 = f97, f56, f73 + FNMA f77 = f101, f56, f77 + FNMA f74 = f98, f56, f74 + FNMA f78 = f102, f56, f78 + FNMA f75 = f99, f56, f75 + FNMA f79 = f103, f56, f79 + ;; + FNMA f64 = f96, f57, f64 + FNMA f68 = f100, f57, f68 + FNMA f65 = f97, f57, f65 + FNMA f69 = f101, f57, f69 + FNMA f66 = f98, f57, f66 + FNMA f70 = f102, f57, f70 + FNMA f67 = f99, f57, f67 + FNMA f71 = f103, f57, f71 + ;; + FMPY f88 = f88, f58 + FMPY f92 = f92, f58 + FMPY f89 = f89, f58 + FMPY f93 = f93, f58 + FMPY f90 = f90, f58 + FMPY f94 = f94, f58 + FMPY f91 = f91, f58 + FMPY f95 = f95, f58 + ;; + FNMA f80 = f88, f59, f80 + FNMA f84 = f92, f59, f84 + FNMA f81 = f89, f59, f81 + FNMA f85 = f93, f59, f85 + FNMA f82 = f90, f59, f82 + FNMA f86 = f94, f59, f86 + FNMA f83 = f91, f59, f83 + FNMA f87 = f95, f59, f87 + ;; + FNMA f72 = f88, f60, f72 + FNMA f76 = f92, f60, f76 + FNMA f73 = f89, f60, f73 + FNMA f77 = f93, f60, f77 + FNMA f74 = f90, f60, f74 + FNMA f78 = f94, f60, f78 + FNMA f75 = f91, f60, f75 + FNMA f79 = f95, f60, f79 + ;; + + { .mfi + STFD [AOFFSET] = f120, SIZE + FNMA f64 = f88, f61, f64 + } + { .mfi + STFD [AOFFSET2] = f124, SIZE + FNMA f68 = 
f92, f61, f68 + } + ;; + { .mfi + STFD [AOFFSET] = f121, SIZE + FNMA f65 = f89, f61, f65 + } + { .mfi + STFD [AOFFSET2] = f125, SIZE + FNMA f69 = f93, f61, f69 + } + ;; + { .mfi + STFD [AOFFSET] = f122, SIZE + FNMA f66 = f90, f61, f66 + } + { .mfi + STFD [AOFFSET2] = f126, SIZE + FNMA f70 = f94, f61, f70 + } + ;; + { .mfi + STFD [AOFFSET] = f123, - 11 * SIZE + FNMA f67 = f91, f61, f67 + } + { .mfi + STFD [AOFFSET2] = f127, - 11 * SIZE + FNMA f71 = f95, f61, f71 + } + ;; + { .mfi + STFD [AOFFSET] = f112, SIZE + FMPY f80 = f80, f16 + } + { .mfi + STFD [AOFFSET2] = f116, SIZE + FMPY f84 = f84, f16 + } + ;; + { .mfi + STFD [AOFFSET] = f113, SIZE + FMPY f81 = f81, f16 + } + { .mfi + STFD [AOFFSET2] = f117, SIZE + FMPY f85 = f85, f16 + } + ;; + { .mfi + STFD [AOFFSET] = f114, SIZE + FMPY f82 = f82, f16 + } + { .mfi + STFD [AOFFSET2] = f118, SIZE + FMPY f86 = f86, f16 + } + ;; + { .mfi + STFD [AOFFSET] = f115, - 11 * SIZE + FMPY f83 = f83, f16 + } + { .mfi + STFD [AOFFSET2] = f119, - 11 * SIZE + FMPY f87 = f87, f16 + } + ;; + { .mfi + STFD [AOFFSET] = f104, SIZE + FNMA f72 = f80, f17, f72 + } + { .mfi + STFD [AOFFSET2] = f108, SIZE + FNMA f76 = f84, f17, f76 + } + ;; + { .mfi + STFD [AOFFSET] = f105, SIZE + FNMA f73 = f81, f17, f73 + } + { .mfi + STFD [AOFFSET2] = f109, SIZE + FNMA f77 = f85, f17, f77 + } + ;; + { .mfi + STFD [AOFFSET] = f106, SIZE + FNMA f74 = f82, f17, f74 + } + { .mfi + STFD [AOFFSET2] = f110, SIZE + FNMA f78 = f86, f17, f78 + } + ;; + { .mfi + STFD [AOFFSET] = f107, - 11 * SIZE + FNMA f75 = f83, f17, f75 + } + { .mfi + STFD [AOFFSET2] = f111, - 11 * SIZE + FNMA f79 = f87, f17, f79 + } + ;; + { .mfi + STFD [AOFFSET] = f96, SIZE + FNMA f64 = f80, f18, f64 + } + { .mfi + STFD [AOFFSET2] = f100, SIZE + FNMA f68 = f84, f18, f68 + } + ;; + { .mfi + STFD [AOFFSET] = f97, SIZE + FNMA f65 = f81, f18, f65 + } + { .mfi + STFD [AOFFSET2] = f101, SIZE + FNMA f69 = f85, f18, f69 + } + ;; + { .mfi + STFD [AOFFSET] = f98, SIZE + FNMA f66 = f82, f18, f66 + } + { .mfi + STFD [AOFFSET2] = f102, SIZE + FNMA f70 = f86, f18, f70 + } + ;; + { .mfi + STFD [AOFFSET] = f99, - 11 * SIZE + FNMA f67 = f83, f18, f67 + } + { .mfi + STFD [AOFFSET2] = f103, - 11 * SIZE + FNMA f71 = f87, f18, f71 + } + ;; + { .mfi + STFD [AOFFSET] = f88, SIZE + FMPY f72 = f72, f19 + } + { .mfi + STFD [AOFFSET2] = f92, SIZE + FMPY f76 = f76, f19 + } + ;; + { .mfi + STFD [AOFFSET] = f89, SIZE + FMPY f73 = f73, f19 + } + { .mfi + STFD [AOFFSET2] = f93, SIZE + FMPY f77 = f77, f19 + } + ;; + { .mfi + STFD [AOFFSET] = f90, SIZE + FMPY f74 = f74, f19 + } + { .mfi + STFD [AOFFSET2] = f94, SIZE + FMPY f78 = f78, f19 + } + ;; + { .mfi + STFD [AOFFSET] = f91, - 11 * SIZE + FMPY f75 = f75, f19 + } + { .mfi + STFD [AOFFSET2] = f95, - 11 * SIZE + FMPY f79 = f79, f19 + } + ;; + { .mfi + STFD [AOFFSET] = f80, SIZE + FNMA f64 = f72, f20, f64 + } + { .mfi + STFD [AOFFSET2] = f84, SIZE + FNMA f68 = f76, f20, f68 + } + ;; + { .mfi + STFD [AOFFSET] = f81, SIZE + FNMA f65 = f73, f20, f65 + } + { .mfi + STFD [AOFFSET2] = f85, SIZE + FNMA f69 = f77, f20, f69 + } + ;; + { .mfi + STFD [AOFFSET] = f82, SIZE + FNMA f66 = f74, f20, f66 + } + { .mfi + STFD [AOFFSET2] = f86, SIZE + FNMA f70 = f78, f20, f70 + } + ;; + { .mfi + STFD [AOFFSET] = f83, - 11 * SIZE + FNMA f67 = f75, f20, f67 + } + { .mfi + STFD [AOFFSET2] = f87, - 11 * SIZE + FNMA f71 = f79, f20, f71 + } + ;; + { .mfi + STFD [AOFFSET] = f72, SIZE + FMPY f64 = f64, f21 + } + { .mfi + STFD [AOFFSET2] = f76, SIZE + FMPY f68 = f68, f21 + } + ;; + { .mfi + STFD [AOFFSET] = f73, SIZE + FMPY f65 = f65, 
f21 + } + { .mfi + STFD [AOFFSET2] = f77, SIZE + FMPY f69 = f69, f21 + } + ;; + { .mfi + STFD [AOFFSET] = f74, SIZE + FMPY f66 = f66, f21 + } + { .mfi + STFD [AOFFSET2] = f78, SIZE + FMPY f70 = f70, f21 + } + ;; + { .mfi + STFD [AOFFSET] = f75, - 11 * SIZE + FMPY f67 = f67, f21 + } + { .mfi + STFD [AOFFSET2] = f79, - 11 * SIZE + FMPY f71 = f71, f21 + } + ;; + { .mmi + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f68, SIZE + } + ;; + { .mmi + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f69, SIZE + } + ;; + { .mmi + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f70, SIZE + } + ;; + { .mmi + STFD [AOFFSET] = f67, - 3 * SIZE + STFD [AOFFSET2] = f71, - 3 * SIZE + adds C9 = 4 * SIZE, C1 + } + ;; + +#endif + { .mmf + STFD [C1 ] = f64, SIZE + STFD [C9 ] = f68, SIZE + mov f64 = f0 + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + STFD [C9 ] = f69, SIZE + adds C10 = 4 * SIZE, C2 + } + ;; + { .mmi + STFD [C1 ] = f66, SIZE + STFD [C9 ] = f70, SIZE +#ifdef LN + adds C3 = -8 * SIZE, C3 +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#ifndef LN + STFD [C1 ] = f67, 5 * SIZE +#else + STFD [C1 ] = f67, - 3 * SIZE +#endif + STFD [C9 ] = f71 + adds C11 = 4 * SIZE, C3 + } + ;; + { .mmf + STFD [C2 ] = f72, SIZE + STFD [C10] = f76, SIZE + mov f72 = f0 + } + ;; + { .mmi + STFD [C2 ] = f73, SIZE + STFD [C10] = f77, SIZE +#ifdef LN + adds C4 = -8 * SIZE, C4 +#else + nop __LINE__ +#endif + } + ;; + { .mmi + STFD [C2 ] = f74, SIZE + STFD [C10] = f78, SIZE + adds C12 = 4 * SIZE, C4 + } + ;; + { .mmi +#ifndef LN + STFD [C2 ] = f75, 5 * SIZE +#else + STFD [C2 ] = f75, - 3 * SIZE +#endif + STFD [C10] = f79 +#ifdef LN + adds C5 = -8 * SIZE, C5 +#else + nop __LINE__ +#endif + } + ;; + { .mmf + STFD [C3 ] = f80, SIZE + STFD [C11] = f84, SIZE + mov f80 = f0 + } + ;; + { .mmi + STFD [C3 ] = f81, SIZE + STFD [C11] = f85, SIZE + adds C13 = 4 * SIZE, C5 + } + ;; + { .mmi + STFD [C3 ] = f82, SIZE + STFD [C11] = f86, SIZE +#ifdef LN + adds C6 = -8 * SIZE, C6 +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#ifndef LN + STFD [C3 ] = f83, 5 * SIZE +#else + STFD [C3 ] = f83, - 3 * SIZE +#endif + STFD [C11] = f87 + adds C14 = 4 * SIZE, C6 + } + ;; + { .mmf + STFD [C4 ] = f88, SIZE + STFD [C12] = f92, SIZE + mov f88 = f0 + } + ;; + { .mmi + STFD [C4 ] = f89, SIZE + STFD [C12] = f93, SIZE +#ifdef LN + adds C8 = -8 * SIZE, C8 +#else + nop __LINE__ +#endif + } + ;; + { .mmi + STFD [C4 ] = f90, SIZE + STFD [C12] = f94, SIZE + adds C16 = 4 * SIZE, C8 + } + ;; + { .mmi +#ifndef LN + STFD [C4 ] = f91, 5 * SIZE +#else + STFD [C4 ] = f91, - 3 * SIZE +#endif + STFD [C12] = f95 + cmp.ne p6, p0 = 1, I + } + ;; + { .mmf + STFD [C5 ] = f96, SIZE + STFD [C13] = f100, SIZE + mov f96 = f0 + } + ;; + { .mmi + STFD [C5 ] = f97, SIZE + STFD [C13] = f101, SIZE + adds I = -1, I + } + ;; + { .mmi + STFD [C5 ] = f98, SIZE + STFD [C13] = f102, SIZE +#ifdef LN + adds C7 = -8 * SIZE, C7 +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#ifndef LN + STFD [C5 ] = f99, 5 * SIZE +#else + STFD [C5 ] = f99, - 3 * SIZE +#endif + STFD [C13] = f103 + adds C15 = 4 * SIZE, C7 + } + ;; + { .mmf + STFD [C6 ] = f104, SIZE + STFD [C14] = f108, SIZE + mov f104 = f0 + } + ;; + { .mmi + STFD [C6 ] = f105, SIZE + STFD [C14] = f109, SIZE + shladd r2 = K, BASE_SHIFT, r0 + } + ;; + { .mmi + STFD [C6 ] = f106, SIZE + STFD [C14] = f110, SIZE + sub L = K, KK + } + ;; + { .mmi +#ifndef LN + STFD [C6 ] = f107, 5 * SIZE +#else + STFD [C6 ] = f107, - 3 * SIZE +#endif + STFD [C14] = f111 +#ifdef RT + shladd AORIG = r2, 3, AORIG +#else + nop __LINE__ +#endif + } + ;; + { .mmf + STFD [C7 ] = 
f112, SIZE + STFD [C15] = f116, SIZE + mov f112 = f0 + } + ;; + { .mmi + STFD [C7 ] = f113, SIZE + STFD [C15] = f117, SIZE +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; + { .mmi + STFD [C7 ] = f114, SIZE + STFD [C15] = f118, SIZE +#if defined(LT) || defined(RN) + shladd AOFFSET = L, 3, AOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#ifndef LN + STFD [C7 ] = f115, 5 * SIZE +#else + STFD [C7 ] = f115, - 3 * SIZE +#endif + STFD [C15] = f119 +#if defined(LT) || defined(RN) + shladd BOFFSET = L, 3, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmf + STFD [C8 ] = f120, SIZE + STFD [C16] = f124, SIZE + mov f120 = f0 + } + ;; + { .mmi + STFD [C8 ] = f121, SIZE + STFD [C16] = f125, SIZE +#ifdef LT + adds KK = 8, KK +#elif defined LN + adds KK = -8, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi + STFD [C8 ] = f122, SIZE + STFD [C16] = f126, SIZE +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + { .mmb +#ifndef LN + STFD [C8 ] = f123, 5 * SIZE +#else + STFD [C8 ] = f123, - 3 * SIZE +#endif + STFD [C16] = f127 + (p6) br.cond.dptk .L011 + } + ;; + +.L020: + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + tbit.z p6, p0 = M, 2 + (p6) br.cond.dptk .L030 + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 2 + BASE_SHIFT + } + { .mmi + shladd r3 = KK, BASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mmf + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + setf.d f73 = r0 + mov f65 = f0 + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 3, B + mov f65 = f0 +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f73 = f0 + shladd AOFFSET = r3, 2, AORIG + } + ;; +#endif + { .mfi + setf.d f105 = r0 + mov f81 = f0 + adds L = 1, L + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + mov f89 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f113 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + setf.d f97 = r0 + mov f121 = f0 + shr L = L, 1 + } + ;; + { .mmf + (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + setf.d f66 = r0 + mov f67 = f0 + } + { .mfi + setf.d f74 = r0 + mov f75 = f0 + adds L = -1, L + } + ;; + { .mmf + (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + setf.d f82 = r0 + mov f83 = f0 + } + { .mfi + setf.d f90 = r0 + mov f91 = f0 + cmp.eq p6, p0 = -1, L + } + ;; + { .mmf + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + setf.d f98 = r0 + mov f99 = f0 + } + { .mfi + setf.d f106 = r0 + mov f107 = f0 + mov ar.lc = L + } + ;; + { .mmf + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + setf.d f114 = r0 + mov f115 = f0 + } + { .mfb + setf.d f122 = r0 + mov f123 = f0 + (p6) br.cond.dpnt .L028 + } + ;; + +.L022: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + (p5) adds C9 = 2 * SIZE, C1 + } + { .mfi + nop __LINE__ + FMA f104 = f32, f53, f104 // A1 * B6 + (p5) adds C10 = 2 * SIZE, C2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 
2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + (p5) adds C11 = 2 * SIZE, C3 + } + { .mfi + nop __LINE__ + FMA f120 = f32, f55, f120 // A1 * B8 + (p5) adds C12 = 2 * SIZE, C4 + } + ;; + { .mfi + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + (p5) adds C13 = 2 * SIZE, C5 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + (p5) adds C14 = 2 * SIZE, C6 + } + ;; + { .mfi + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + (p5) adds C15 = 2 * SIZE, C7 + } + { .mfi + nop __LINE__ + FMA f89 = f33, f51, f89 // A2 * B4 + (p5) adds C16 = 2 * SIZE, C8 + } + ;; + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f105 = f33, f53, f105 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f121 = f33, f55, f121 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f90 = f34, f51, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f98 = f34, f52, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f106 = f34, f53, f106 // A3 * B6 + nop __LINE__ + } + + { .mfb + nop __LINE__ + FMA f114 = f34, f54, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f122 = f34, f55, f122 // A3 * B8 + nop __LINE__ + } + + { .mfb + nop __LINE__ + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f91 = f35, f51, f91 // A4 * B4 + nop __LINE__ + } + + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f99 = f35, f52, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f107 = f35, f53, f107 // A4 * B6 + nop __LINE__ + } + + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f115 = f35, f54, f115 // A4 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f123 = f35, f55, f123 // A4 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f104 = f40, f61, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f120 = f40, f63, f120 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + 
nop __LINE__ + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f105 = f41, f61, f105 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f121 = f41, f63, f121 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f90 = f42, f59, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f98 = f42, f60, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f106 = f42, f61, f106 // A3 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f114 = f42, f62, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f122 = f42, f63, f122 // A3 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f91 = f43, f59, f91 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f99 = f43, f60, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f107 = f43, f61, f107 // A4 * B6 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f115 = f43, f62, f115 // A4 * B7 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f123 = f43, f63, f123 // A4 * B8 + br.cloop.sptk.few .L022 + } + ;; + +.L028: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -4, KK +#else + adds r2 = -8, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 2, AORIG + shladd BOFFSET = r2, 3, B + ;; +#endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [BOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [BOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [BOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [BOFFSET], 2 * SIZE + ;; + { .mfi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FSUB f64 = f32, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f72 = f33, f72 + nop __LINE__ + } + ;; + { .mfi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + FSUB f80 = f34, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f88 = f35, f88 + nop __LINE__ + } + ;; + { .mfi + LDFPD f52, f53 = [BOFFSET], 2 * SIZE + FSUB f96 = f36, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f104 = f37, f104 + nop __LINE__ + } + ;; + { .mfi + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + FSUB f112 = f38, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f120 = f39, f120 + nop __LINE__ + } + ;; + { .mfi + LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FSUB f65 = f40, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f73 = f41, f73 + nop __LINE__ + } + ;; + { .mfi + LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FSUB f81 = f42, f81 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f89 = f43, f89 + nop __LINE__ + } + ;; + { .mfi + LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FSUB f97 = f44, f97 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f105 = f45, f105 + 
nop __LINE__ + } + ;; + { .mfi + LDFPD f62, f63 = [BOFFSET] + FSUB f113 = f46, f113 + adds BOFFSET = -30 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FSUB f121 = f47, f121 + nop __LINE__ + } + ;; + FSUB f66 = f48, f66 + FSUB f74 = f49, f74 + FSUB f82 = f50, f82 + FSUB f90 = f51, f90 + FSUB f98 = f52, f98 + FSUB f106 = f53, f106 + FSUB f114 = f54, f114 + FSUB f122 = f55, f122 + ;; + FSUB f67 = f56, f67 + FSUB f75 = f57, f75 + FSUB f83 = f58, f83 + FSUB f91 = f59, f91 + FSUB f99 = f60, f99 + FSUB f107 = f61, f107 + FSUB f115 = f62, f115 + FSUB f123 = f63, f123 + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [AOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [AOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [AOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [AOFFSET], 2 * SIZE + ;; + LDFPD f48, f49 = [AOFFSET], 2 * SIZE + ;; + LDFPD f50, f51 = [AOFFSET], 2 * SIZE + ;; + LDFPD f52, f53 = [AOFFSET], 2 * SIZE + ;; + LDFPD f54, f55 = [AOFFSET], 2 * SIZE + ;; + LDFPD f56, f57 = [AOFFSET], 2 * SIZE + ;; + LDFPD f58, f59 = [AOFFSET], 2 * SIZE + ;; + LDFPD f60, f61 = [AOFFSET], 2 * SIZE + ;; + LDFPD f62, f63 = [AOFFSET] + adds AOFFSET = -30 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + FSUB f66 = f34, f66 + FSUB f67 = f35, f67 + + FSUB f72 = f36, f72 + FSUB f73 = f37, f73 + FSUB f74 = f38, f74 + FSUB f75 = f39, f75 + + FSUB f80 = f40, f80 + FSUB f81 = f41, f81 + FSUB f82 = f42, f82 + FSUB f83 = f43, f83 + + FSUB f88 = f44, f88 + FSUB f89 = f45, f89 + FSUB f90 = f46, f90 + FSUB f91 = f47, f91 + ;; + FSUB f96 = f48, f96 + FSUB f97 = f49, f97 + FSUB f98 = f50, f98 + FSUB f99 = f51, f99 + ;; + FSUB f104 = f52, f104 + FSUB f105 = f53, f105 + FSUB f106 = f54, f106 + FSUB f107 = f55, f107 + ;; + FSUB f112 = f56, f112 + FSUB f113 = f57, f113 + FSUB f114 = f58, f114 + FSUB f115 = f59, f115 + ;; + FSUB f120 = f60, f120 + FSUB f121 = f61, f121 + FSUB f122 = f62, f122 + FSUB f123 = f63, f123 + ;; +#endif + +#ifdef LN + adds AOFFSET = 14 * SIZE, AOFFSET + ;; + LDFPD f33, f32 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f35, f34 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFD f36 = [AOFFSET], - 2 * SIZE + ;; + LDFPD f38, f37 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f40, f39 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFD f41 = [AOFFSET] + ;; + FMPY f67 = f67, f32 + FMPY f99 = f99, f32 + FMPY f75 = f75, f32 + FMPY f107 = f107, f32 + FMPY f83 = f83, f32 + FMPY f115 = f115, f32 + FMPY f91 = f91, f32 + FMPY f123 = f123, f32 + ;; + FNMA f66 = f67, f33, f66 + FNMA f98 = f99, f33, f98 + FNMA f74 = f75, f33, f74 + FNMA f106 = f107, f33, f106 + FNMA f82 = f83, f33, f82 + FNMA f114 = f115, f33, f114 + FNMA f90 = f91, f33, f90 + FNMA f122 = f123, f33, f122 + ;; + FNMA f65 = f67, f34, f65 + FNMA f97 = f99, f34, f97 + FNMA f73 = f75, f34, f73 + FNMA f105 = f107, f34, f105 + FNMA f81 = f83, f34, f81 + FNMA f113 = f115, f34, f113 + FNMA f89 = f91, f34, f89 + FNMA f121 = f123, f34, f121 + ;; + FNMA f64 = f67, f35, f64 + FNMA f96 = f99, f35, f96 + FNMA f72 = f75, f35, f72 + FNMA f104 = f107, f35, f104 + FNMA f80 = f83, f35, f80 + FNMA f112 = f115, f35, f112 + FNMA f88 = f91, f35, f88 + FNMA f120 = f123, f35, f120 + ;; + FMPY f66 = f66, f36 + FMPY f98 = f98, f36 + FMPY f74 = f74, f36 + FMPY f106 = f106, f36 + FMPY f82 = f82, f36 + FMPY f114 = f114, f36 + FMPY f90 = f90, f36 + FMPY f122 = f122, f36 + ;; + FNMA 
f65 = f66, f37, f65 + FNMA f97 = f98, f37, f97 + FNMA f73 = f74, f37, f73 + FNMA f105 = f106, f37, f105 + FNMA f81 = f82, f37, f81 + FNMA f113 = f114, f37, f113 + FNMA f89 = f90, f37, f89 + FNMA f121 = f122, f37, f121 + ;; + FNMA f64 = f66, f38, f64 + FNMA f96 = f98, f38, f96 + FNMA f72 = f74, f38, f72 + FNMA f104 = f106, f38, f104 + FNMA f80 = f82, f38, f80 + FNMA f112 = f114, f38, f112 + FNMA f88 = f90, f38, f88 + FNMA f120 = f122, f38, f120 + ;; + adds BOFFSET = 24 * SIZE, BOFFSET + adds BOFFSET2 = 24 * SIZE, BOFFSET2 + ;; + { .mfi + STFD [BOFFSET] = f67, SIZE + FMPY f65 = f65, f39 + } + { .mfi + STFD [BOFFSET2] = f99, SIZE + FMPY f97 = f97, f39 + } + ;; + { .mfi + STFD [BOFFSET] = f75, SIZE + FMPY f73 = f73, f39 + } + { .mfi + STFD [BOFFSET2] = f107, SIZE + FMPY f105 = f105, f39 + } + ;; + { .mfi + STFD [BOFFSET] = f83, SIZE + FMPY f81 = f81, f39 + } + { .mfi + STFD [BOFFSET2] = f115, SIZE + FMPY f113 = f113, f39 + } + ;; + { .mfi + STFD [BOFFSET] = f91, - 11 * SIZE + FMPY f89 = f89, f39 + } + { .mfi + STFD [BOFFSET2] = f123, - 11 * SIZE + FMPY f121 = f121, f39 + } + ;; + { .mfi + STFD [BOFFSET] = f66, SIZE + FNMA f64 = f65, f40, f64 + } + { .mfi + STFD [BOFFSET2] = f98, SIZE + FNMA f96 = f97, f40, f96 + } + ;; + { .mfi + STFD [BOFFSET] = f74, SIZE + FNMA f72 = f73, f40, f72 + } + { .mfi + STFD [BOFFSET2] = f106, SIZE + FNMA f104 = f105, f40, f104 + } + ;; + { .mfi + STFD [BOFFSET] = f82, SIZE + FNMA f80 = f81, f40, f80 + } + { .mfi + STFD [BOFFSET2] = f114, SIZE + FNMA f112 = f113, f40, f112 + } + ;; + { .mfi + STFD [BOFFSET] = f90, -11 * SIZE + FNMA f88 = f89, f40, f88 + } + { .mfi + STFD [BOFFSET2] = f122, -11 * SIZE + FNMA f120 = f121, f40, f120 + } + ;; + { .mfi + STFD [BOFFSET] = f65, SIZE + FMPY f64 = f64, f41 + } + { .mfi + STFD [BOFFSET2] = f97, SIZE + FMPY f96 = f96, f41 + } + ;; + { .mfi + STFD [BOFFSET] = f73, SIZE + FMPY f72 = f72, f41 + } + { .mfi + STFD [BOFFSET2] = f105, SIZE + FMPY f104 = f104, f41 + } + ;; + { .mfi + STFD [BOFFSET] = f81, SIZE + FMPY f80 = f80, f41 + } + { .mfi + STFD [BOFFSET2] = f113, SIZE + FMPY f112 = f112, f41 + } + ;; + { .mfi + STFD [BOFFSET] = f89, - 11 * SIZE + FMPY f88 = f88, f41 + } + { .mfi + STFD [BOFFSET2] = f121, - 11 * SIZE + FMPY f120 = f120, f41 + } + ;; + { .mmi + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f96, SIZE + adds C1 = -4 * SIZE, C1 + } + ;; + { .mmi + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f104, SIZE + adds C2 = -4 * SIZE, C2 + } + ;; + { .mmi + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f112, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f88, - 3 * SIZE + STFD [BOFFSET2] = f120, - 3 * SIZE + } + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f36 = [AOFFSET], 1 * SIZE + ;; + LDFPD f37, f38 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f39, f40 = [AOFFSET] + adds AOFFSET = 5 * SIZE, AOFFSET + ;; + LDFD f41 = [AOFFSET], -15 * SIZE + ;; + { .mfi + FMPY f64 = f64, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f96 = f96, f32 + nop __LINE__ + } + ;; + { .mfi + FMPY f72 = f72, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f104 = f104, f32 + nop __LINE__ + } + ;; + { .mfi + FMPY f80 = f80, f32 + } + { .mfi + nop __LINE__ + FMPY f112 = f112, f32 + nop __LINE__ + } + ;; + { .mfi + FMPY f88 = f88, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f120 = f120, f32 + nop __LINE__ + } + ;; + { .mfi + FNMA f65 = f64, f33, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f97 = 
f96, f33, f97 + nop __LINE__ + } + ;; + { .mfi + FNMA f73 = f72, f33, f73 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f105 = f104, f33, f105 + nop __LINE__ + } + ;; + { .mfi + FNMA f81 = f80, f33, f81 + } + { .mfi + nop __LINE__ + FNMA f113 = f112, f33, f113 + nop __LINE__ + } + ;; + { .mfi + FNMA f89 = f88, f33, f89 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f121 = f120, f33, f121 + nop __LINE__ + } + ;; + { .mfi + FNMA f66 = f64, f34, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f98 = f96, f34, f98 + nop __LINE__ + } + ;; + { .mfi + FNMA f74 = f72, f34, f74 + } + { .mfi + nop __LINE__ + FNMA f106 = f104, f34, f106 + nop __LINE__ + } + ;; + { .mfi + FNMA f82 = f80, f34, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f114 = f112, f34, f114 + nop __LINE__ + } + ;; + { .mfi + FNMA f90 = f88, f34, f90 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f122 = f120, f34, f122 + nop __LINE__ + } + ;; + { .mfi + FNMA f67 = f64, f35, f67 + } + { .mfi + nop __LINE__ + FNMA f99 = f96, f35, f99 + nop __LINE__ + } + ;; + { .mfi + FNMA f75 = f72, f35, f75 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f107 = f104, f35, f107 + nop __LINE__ + } + ;; + { .mfi + FNMA f83 = f80, f35, f83 + } + { .mfi + nop __LINE__ + FNMA f115 = f112, f35, f115 + nop __LINE__ + } + ;; + { .mfi + FNMA f91 = f88, f35, f91 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f123 = f120, f35, f123 + nop __LINE__ + } + ;; + FMPY f65 = f65, f36 + FMPY f97 = f97, f36 + FMPY f73 = f73, f36 + FMPY f105 = f105, f36 + FMPY f81 = f81, f36 + FMPY f113 = f113, f36 + FMPY f89 = f89, f36 + FMPY f121 = f121, f36 + ;; + FNMA f66 = f65, f37, f66 + FNMA f98 = f97, f37, f98 + FNMA f74 = f73, f37, f74 + FNMA f106 = f105, f37, f106 + FNMA f82 = f81, f37, f82 + FNMA f114 = f113, f37, f114 + FNMA f90 = f89, f37, f90 + FNMA f122 = f121, f37, f122 + ;; + FNMA f67 = f65, f38, f67 + FNMA f99 = f97, f38, f99 + FNMA f75 = f73, f38, f75 + FNMA f107 = f105, f38, f107 + FNMA f83 = f81, f38, f83 + FNMA f115 = f113, f38, f115 + FNMA f91 = f89, f38, f91 + FNMA f123 = f121, f38, f123 + ;; + FMPY f66 = f66, f39 + FMPY f98 = f98, f39 + FMPY f74 = f74, f39 + FMPY f106 = f106, f39 + FMPY f82 = f82, f39 + FMPY f114 = f114, f39 + FMPY f90 = f90, f39 + FMPY f122 = f122, f39 + ;; + FNMA f67 = f66, f40, f67 + FNMA f99 = f98, f40, f99 + FNMA f75 = f74, f40, f75 + FNMA f107 = f106, f40, f107 + FNMA f83 = f82, f40, f83 + FNMA f115 = f114, f40, f115 + FNMA f91 = f90, f40, f91 + FNMA f123 = f122, f40, f123 + ;; + FMPY f67 = f67, f41 + FMPY f99 = f99, f41 + FMPY f75 = f75, f41 + FMPY f107 = f107, f41 + FMPY f83 = f83, f41 + FMPY f115 = f115, f41 + FMPY f91 = f91, f41 + FMPY f123 = f123, f41 + ;; + { .mfi + STFD [BOFFSET] = f64, SIZE + } + { .mfi + STFD [BOFFSET2] = f96, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f72, SIZE + } + { .mfi + STFD [BOFFSET2] = f104, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f80, SIZE + } + { .mfi + STFD [BOFFSET2] = f112, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f88, 5 * SIZE + } + { .mfi + STFD [BOFFSET2] = f120, 5 * SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f65, SIZE + } + { .mfi + STFD [BOFFSET2] = f97, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f73, SIZE + } + { .mfi + STFD [BOFFSET2] = f105, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f81, SIZE + } + { .mfi + STFD [BOFFSET2] = f113, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f89, 5 * SIZE + } + { .mfi + STFD [BOFFSET2] = f121, 5 * SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f66, SIZE + } + { .mfi + STFD [BOFFSET2] = f98, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f74, 
SIZE + } + { .mfi + STFD [BOFFSET2] = f106, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f82, SIZE + } + { .mfi + STFD [BOFFSET2] = f114, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f90, 5 * SIZE + } + { .mfi + STFD [BOFFSET2] = f122, 5 * SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f67, SIZE + } + { .mfi + STFD [BOFFSET2] = f99, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f75, SIZE + } + { .mfi + STFD [BOFFSET2] = f107, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f83, SIZE + } + { .mfi + STFD [BOFFSET2] = f115, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f91, -27 * SIZE + } + { .mfi + STFD [BOFFSET2] = f123, -27 * SIZE + } + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f40 = [BOFFSET], 1 * SIZE + ;; + LDFPD f41, f42 = [BOFFSET], 2 * SIZE + ;; + LDFPD f43, f44 = [BOFFSET], 2 * SIZE + ;; + LDFPD f45, f46 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f47, f48 = [BOFFSET], 2 * SIZE + ;; + LDFPD f49, f50 = [BOFFSET], 2 * SIZE + ;; + LDFPD f51, f52 = [BOFFSET] + adds BOFFSET = 5 * SIZE, BOFFSET + ;; + LDFD f53 = [BOFFSET], 1 * SIZE + ;; + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + ;; + LDFPD f56, f57 = [BOFFSET] + adds BOFFSET = 6 * SIZE, BOFFSET + ;; + LDFPD f58, f59 = [BOFFSET], 2 * SIZE + ;; + LDFPD f60, f61 = [BOFFSET] + adds BOFFSET = 7 * SIZE, BOFFSET + ;; + LDFD f16 = [BOFFSET], 1 * SIZE + ;; + LDFPD f17, f18 = [BOFFSET] + adds BOFFSET = 8 * SIZE, BOFFSET + ;; + LDFPD f19, f20 = [BOFFSET] + adds BOFFSET = 9 * SIZE, BOFFSET + ;; + LDFD f21 = [BOFFSET] + adds BOFFSET = -63 * SIZE, BOFFSET + ;; + + + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + FMPY f66 = f66, f32 + FMPY f67 = f67, f32 + ;; + FNMA f72 = f64, f33, f72 + FNMA f73 = f65, f33, f73 + FNMA f74 = f66, f33, f74 + FNMA f75 = f67, f33, f75 + ;; + FNMA f80 = f64, f34, f80 + FNMA f81 = f65, f34, f81 + FNMA f82 = f66, f34, f82 + FNMA f83 = f67, f34, f83 + ;; + FNMA f88 = f64, f35, f88 + FNMA f89 = f65, f35, f89 + FNMA f90 = f66, f35, f90 + FNMA f91 = f67, f35, f91 + ;; + FNMA f96 = f64, f36, f96 + FNMA f97 = f65, f36, f97 + FNMA f98 = f66, f36, f98 + FNMA f99 = f67, f36, f99 + ;; + FNMA f104 = f64, f37, f104 + FNMA f105 = f65, f37, f105 + FNMA f106 = f66, f37, f106 + FNMA f107 = f67, f37, f107 + ;; + FNMA f112 = f64, f38, f112 + FNMA f113 = f65, f38, f113 + FNMA f114 = f66, f38, f114 + FNMA f115 = f67, f38, f115 + ;; + FNMA f120 = f64, f39, f120 + FNMA f121 = f65, f39, f121 + FNMA f122 = f66, f39, f122 + FNMA f123 = f67, f39, f123 + ;; + FMPY f72 = f72, f40 + FMPY f73 = f73, f40 + FMPY f74 = f74, f40 + FMPY f75 = f75, f40 + ;; + FNMA f80 = f72, f41, f80 + FNMA f81 = f73, f41, f81 + FNMA f82 = f74, f41, f82 + FNMA f83 = f75, f41, f83 + ;; + FNMA f88 = f72, f42, f88 + FNMA f89 = f73, f42, f89 + FNMA f90 = f74, f42, f90 + FNMA f91 = f75, f42, f91 + ;; + FNMA f96 = f72, f43, f96 + FNMA f97 = f73, f43, f97 + FNMA f98 = f74, f43, f98 + FNMA f99 = f75, f43, f99 + ;; + FNMA f104 = f72, f44, f104 + FNMA f105 = f73, f44, f105 + FNMA f106 = f74, f44, f106 + FNMA f107 = f75, f44, f107 + ;; + FNMA f112 = f72, f45, f112 + FNMA f113 = f73, f45, f113 + FNMA f114 = f74, f45, f114 + FNMA f115 = f75, f45, f115 + ;; + FNMA f120 = f72, f46, f120 + FNMA f121 = f73, f46, f121 + FNMA f122 = f74, f46, f122 + FNMA f123 = f75, f46, f123 + ;; + FMPY f80 = f80, f47 + FMPY f81 = f81, f47 + FMPY f82 = f82, f47 + FMPY f83 = f83, f47 + ;; + FNMA f88 = f80, f48, f88 + FNMA f89 = f81, f48, 
f89 + FNMA f90 = f82, f48, f90 + FNMA f91 = f83, f48, f91 + ;; + FNMA f96 = f80, f49, f96 + FNMA f97 = f81, f49, f97 + FNMA f98 = f82, f49, f98 + FNMA f99 = f83, f49, f99 + ;; + FNMA f104 = f80, f50, f104 + FNMA f105 = f81, f50, f105 + FNMA f106 = f82, f50, f106 + FNMA f107 = f83, f50, f107 + ;; + FNMA f112 = f80, f51, f112 + FNMA f113 = f81, f51, f113 + FNMA f114 = f82, f51, f114 + FNMA f115 = f83, f51, f115 + ;; + FNMA f120 = f80, f52, f120 + FNMA f121 = f81, f52, f121 + FNMA f122 = f82, f52, f122 + FNMA f123 = f83, f52, f123 + ;; + FMPY f88 = f88, f53 + FMPY f89 = f89, f53 + FMPY f90 = f90, f53 + FMPY f91 = f91, f53 + ;; + FNMA f96 = f88, f54, f96 + FNMA f97 = f89, f54, f97 + FNMA f98 = f90, f54, f98 + FNMA f99 = f91, f54, f99 + ;; + FNMA f104 = f88, f55, f104 + FNMA f105 = f89, f55, f105 + FNMA f106 = f90, f55, f106 + FNMA f107 = f91, f55, f107 + ;; + FNMA f112 = f88, f56, f112 + FNMA f113 = f89, f56, f113 + FNMA f114 = f90, f56, f114 + FNMA f115 = f91, f56, f115 + ;; + FNMA f120 = f88, f57, f120 + FNMA f121 = f89, f57, f121 + FNMA f122 = f90, f57, f122 + FNMA f123 = f91, f57, f123 + ;; + FMPY f96 = f96, f58 + FMPY f97 = f97, f58 + FMPY f98 = f98, f58 + FMPY f99 = f99, f58 + ;; + FNMA f104 = f96, f59, f104 + FNMA f105 = f97, f59, f105 + FNMA f106 = f98, f59, f106 + FNMA f107 = f99, f59, f107 + ;; + FNMA f112 = f96, f60, f112 + FNMA f113 = f97, f60, f113 + FNMA f114 = f98, f60, f114 + FNMA f115 = f99, f60, f115 + ;; + FNMA f120 = f96, f61, f120 + FNMA f121 = f97, f61, f121 + FNMA f122 = f98, f61, f122 + FNMA f123 = f99, f61, f123 + ;; + FMPY f104 = f104, f16 + FMPY f105 = f105, f16 + FMPY f106 = f106, f16 + FMPY f107 = f107, f16 + ;; + FNMA f112 = f104, f17, f112 + FNMA f113 = f105, f17, f113 + FNMA f114 = f106, f17, f114 + FNMA f115 = f107, f17, f115 + ;; + FNMA f120 = f104, f18, f120 + FNMA f121 = f105, f18, f121 + FNMA f122 = f106, f18, f122 + FNMA f123 = f107, f18, f123 + ;; + FMPY f112 = f112, f19 + FMPY f113 = f113, f19 + FMPY f114 = f114, f19 + FMPY f115 = f115, f19 + ;; + FNMA f120 = f112, f20, f120 + FNMA f121 = f113, f20, f121 + FNMA f122 = f114, f20, f122 + FNMA f123 = f115, f20, f123 + ;; + FMPY f120 = f120, f21 + FMPY f121 = f121, f21 + FMPY f122 = f122, f21 + FMPY f123 = f123, f21 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f72, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f73, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f74, SIZE + ;; + STFD [AOFFSET] = f67, 5 * SIZE + STFD [AOFFSET2] = f75, 5 * SIZE + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f88, SIZE + ;; + STFD [AOFFSET] = f81, SIZE + STFD [AOFFSET2] = f89, SIZE + ;; + STFD [AOFFSET] = f82, SIZE + STFD [AOFFSET2] = f90, SIZE + ;; + STFD [AOFFSET] = f83, 5 * SIZE + STFD [AOFFSET2] = f91, 5 * SIZE + ;; + STFD [AOFFSET] = f96, SIZE + STFD [AOFFSET2] = f104, SIZE + ;; + STFD [AOFFSET] = f97, SIZE + STFD [AOFFSET2] = f105, SIZE + ;; + STFD [AOFFSET] = f98, SIZE + STFD [AOFFSET2] = f106, SIZE + ;; + STFD [AOFFSET] = f99, 5 * SIZE + STFD [AOFFSET2] = f107, 5 * SIZE + ;; + STFD [AOFFSET] = f112, SIZE + STFD [AOFFSET2] = f120, SIZE + ;; + STFD [AOFFSET] = f113, SIZE + STFD [AOFFSET2] = f121, SIZE + ;; + STFD [AOFFSET] = f114, SIZE + STFD [AOFFSET2] = f122, SIZE + ;; + STFD [AOFFSET] = f115, -27 * SIZE + STFD [AOFFSET2] = f123, - 27 * SIZE + ;; +#endif + +#ifdef RT + adds BOFFSET = 62 * SIZE, BOFFSET + ;; + LDFPD f33, f32 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f35, f34 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f37, f36 = [BOFFSET] + adds 
BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f39, f38 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFD f40 = [BOFFSET], -2 * SIZE + ;; + LDFPD f42, f41 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f44, f43 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f46, f45 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f48, f47 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f50, f49 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f52, f51 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFD f53 = [BOFFSET], -2 * SIZE + ;; + LDFPD f55, f54 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f57, f56 = [BOFFSET] + adds BOFFSET = - 6 * SIZE, BOFFSET + ;; + LDFPD f59, f58 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f61, f60 = [BOFFSET] + adds BOFFSET = - 6 * SIZE, BOFFSET + ;; + LDFD f16 = [BOFFSET], -2 * SIZE + ;; + LDFPD f18, f17 = [BOFFSET] + adds BOFFSET = - 8 * SIZE, BOFFSET + ;; + LDFPD f20, f19 = [BOFFSET] + adds BOFFSET = - 8 * SIZE, BOFFSET + ;; + LDFD f21 = [BOFFSET] + ;; + + FMPY f120 = f120, f32 + FMPY f121 = f121, f32 + FMPY f122 = f122, f32 + FMPY f123 = f123, f32 + ;; + FNMA f112 = f120, f33, f112 + FNMA f113 = f121, f33, f113 + FNMA f114 = f122, f33, f114 + FNMA f115 = f123, f33, f115 + ;; + FNMA f104 = f120, f34, f104 + FNMA f105 = f121, f34, f105 + FNMA f106 = f122, f34, f106 + FNMA f107 = f123, f34, f107 + ;; + FNMA f96 = f120, f35, f96 + FNMA f97 = f121, f35, f97 + FNMA f98 = f122, f35, f98 + FNMA f99 = f123, f35, f99 + ;; + FNMA f88 = f120, f36, f88 + FNMA f89 = f121, f36, f89 + FNMA f90 = f122, f36, f90 + FNMA f91 = f123, f36, f91 + ;; + FNMA f80 = f120, f37, f80 + FNMA f81 = f121, f37, f81 + FNMA f82 = f122, f37, f82 + FNMA f83 = f123, f37, f83 + ;; + FNMA f72 = f120, f38, f72 + FNMA f73 = f121, f38, f73 + FNMA f74 = f122, f38, f74 + FNMA f75 = f123, f38, f75 + ;; + FNMA f64 = f120, f39, f64 + FNMA f65 = f121, f39, f65 + FNMA f66 = f122, f39, f66 + FNMA f67 = f123, f39, f67 + ;; + FMPY f112 = f112, f40 + FMPY f113 = f113, f40 + FMPY f114 = f114, f40 + FMPY f115 = f115, f40 + ;; + FNMA f104 = f112, f41, f104 + FNMA f105 = f113, f41, f105 + FNMA f106 = f114, f41, f106 + FNMA f107 = f115, f41, f107 + ;; + FNMA f96 = f112, f42, f96 + FNMA f97 = f113, f42, f97 + FNMA f98 = f114, f42, f98 + FNMA f99 = f115, f42, f99 + ;; + FNMA f88 = f112, f43, f88 + FNMA f89 = f113, f43, f89 + FNMA f90 = f114, f43, f90 + FNMA f91 = f115, f43, f91 + ;; + FNMA f80 = f112, f44, f80 + FNMA f81 = f113, f44, f81 + FNMA f82 = f114, f44, f82 + FNMA f83 = f115, f44, f83 + ;; + FNMA f72 = f112, f45, f72 + FNMA f73 = f113, f45, f73 + FNMA f74 = f114, f45, f74 + FNMA f75 = f115, f45, f75 + ;; + FNMA f64 = f112, f46, f64 + FNMA f65 = f113, f46, f65 + FNMA f66 = f114, f46, f66 + FNMA f67 = f115, f46, f67 + ;; + FMPY f104 = f104, f47 + FMPY f105 = f105, f47 + FMPY f106 = f106, f47 + FMPY f107 = f107, f47 + ;; + FNMA f96 = f104, f48, f96 + FNMA f97 = f105, f48, f97 + FNMA f98 = f106, f48, f98 + FNMA f99 = f107, f48, f99 + ;; + FNMA f88 = f104, f49, f88 + FNMA f89 = f105, f49, f89 + FNMA f90 = f106, f49, f90 + FNMA f91 = f107, f49, f91 + ;; + FNMA f80 = f104, f50, f80 + FNMA f81 = f105, f50, f81 + FNMA f82 = f106, f50, f82 + FNMA f83 = f107, f50, f83 + ;; + FNMA f72 = f104, f51, f72 + FNMA f73 = f105, f51, f73 + FNMA f74 = f106, f51, f74 + FNMA f75 = f107, f51, f75 + ;; + FNMA f64 = f104, f52, f64 + FNMA f65 = f105, f52, f65 + FNMA f66 = f106, f52, f66 + FNMA f67 = f107, f52, f67 + ;; + FMPY f96 = 
f96, f53 + FMPY f97 = f97, f53 + FMPY f98 = f98, f53 + FMPY f99 = f99, f53 + ;; + FNMA f88 = f96, f54, f88 + FNMA f89 = f97, f54, f89 + FNMA f90 = f98, f54, f90 + FNMA f91 = f99, f54, f91 + ;; + FNMA f80 = f96, f55, f80 + FNMA f81 = f97, f55, f81 + FNMA f82 = f98, f55, f82 + FNMA f83 = f99, f55, f83 + ;; + FNMA f72 = f96, f56, f72 + FNMA f73 = f97, f56, f73 + FNMA f74 = f98, f56, f74 + FNMA f75 = f99, f56, f75 + ;; + FNMA f64 = f96, f57, f64 + FNMA f65 = f97, f57, f65 + FNMA f66 = f98, f57, f66 + FNMA f67 = f99, f57, f67 + ;; + FMPY f88 = f88, f58 + FMPY f89 = f89, f58 + FMPY f90 = f90, f58 + FMPY f91 = f91, f58 + ;; + FNMA f80 = f88, f59, f80 + FNMA f81 = f89, f59, f81 + FNMA f82 = f90, f59, f82 + FNMA f83 = f91, f59, f83 + ;; + FNMA f72 = f88, f60, f72 + FNMA f73 = f89, f60, f73 + FNMA f74 = f90, f60, f74 + FNMA f75 = f91, f60, f75 + ;; + FNMA f64 = f88, f61, f64 + FNMA f65 = f89, f61, f65 + FNMA f66 = f90, f61, f66 + FNMA f67 = f91, f61, f67 + ;; + FMPY f80 = f80, f16 + FMPY f81 = f81, f16 + FMPY f82 = f82, f16 + FMPY f83 = f83, f16 + ;; + FNMA f72 = f80, f17, f72 + FNMA f73 = f81, f17, f73 + FNMA f74 = f82, f17, f74 + FNMA f75 = f83, f17, f75 + ;; + FNMA f64 = f80, f18, f64 + FNMA f65 = f81, f18, f65 + FNMA f66 = f82, f18, f66 + FNMA f67 = f83, f18, f67 + ;; + FMPY f72 = f72, f19 + FMPY f73 = f73, f19 + FMPY f74 = f74, f19 + FMPY f75 = f75, f19 + ;; + FNMA f64 = f72, f20, f64 + FNMA f65 = f73, f20, f65 + FNMA f66 = f74, f20, f66 + FNMA f67 = f75, f20, f67 + ;; + FMPY f64 = f64, f21 + FMPY f65 = f65, f21 + FMPY f66 = f66, f21 + FMPY f67 = f67, f21 + ;; + adds AOFFSET = 24 * SIZE, AOFFSET + adds AOFFSET2 = 24 * SIZE, AOFFSET2 + ;; + STFD [AOFFSET] = f112, SIZE + STFD [AOFFSET2] = f120, SIZE + ;; + STFD [AOFFSET] = f113, SIZE + STFD [AOFFSET2] = f121, SIZE + ;; + STFD [AOFFSET] = f114, SIZE + STFD [AOFFSET2] = f122, SIZE + ;; + STFD [AOFFSET] = f115, - 11 * SIZE + STFD [AOFFSET2] = f123, - 11 * SIZE + ;; + STFD [AOFFSET] = f96, SIZE + STFD [AOFFSET2] = f104, SIZE + ;; + STFD [AOFFSET] = f97, SIZE + STFD [AOFFSET2] = f105, SIZE + ;; + STFD [AOFFSET] = f98, SIZE + STFD [AOFFSET2] = f106, SIZE + ;; + STFD [AOFFSET] = f99, - 11 * SIZE + STFD [AOFFSET2] = f107, - 11 * SIZE + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f88, SIZE + ;; + STFD [AOFFSET] = f81, SIZE + STFD [AOFFSET2] = f89, SIZE + ;; + STFD [AOFFSET] = f82, SIZE + STFD [AOFFSET2] = f90, SIZE + ;; + STFD [AOFFSET] = f83, - 11 * SIZE + STFD [AOFFSET2] = f91, - 11 * SIZE + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f72, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f73, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f74, SIZE + ;; + STFD [AOFFSET] = f67, - 3 * SIZE + STFD [AOFFSET2] = f75, - 3 * SIZE + ;; + +#endif + { .mmf + STFD [C1 ] = f64, SIZE + mov f64 = f0 + } + ;; + { .mmi + STFD [C1 ] = f65, SIZE + } + ;; + { .mmi + STFD [C1 ] = f66, SIZE +#ifdef LN + adds C3 = -4 * SIZE, C3 +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#ifndef LN + STFD [C1 ] = f67, SIZE +#else + STFD [C1 ] = f67, - 3 * SIZE +#endif + } + ;; + { .mmf + STFD [C2 ] = f72, SIZE + mov f72 = f0 + } + ;; + { .mmi + STFD [C2 ] = f73, SIZE +#ifdef LN + adds C4 = -4 * SIZE, C4 +#else + nop __LINE__ +#endif + } + ;; + { .mmi + STFD [C2 ] = f74, SIZE + } + ;; + { .mmi +#ifndef LN + STFD [C2 ] = f75, SIZE +#else + STFD [C2 ] = f75, - 3 * SIZE +#endif +#ifdef LN + adds C5 = -4 * SIZE, C5 +#else + nop __LINE__ +#endif + } + ;; + { .mmf + STFD [C3 ] = f80, SIZE + mov f80 = f0 + } + ;; + { .mmi + STFD [C3 ] = f81, SIZE + } + ;; + 
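+// Editorial annotation (not part of the imported GotoBLAS2 source): this
+// store sequence streams the solved 4x8 tile out through the C column
+// pointers C1..C8, clearing each accumulator to f0 as it is written; under
+// LN the C pointers are stepped back by 4*SIZE so the next tile to the left
+// is addressed, and the KK/L/AOFFSET/BOFFSET bookkeeping for the following
+// block is interleaved with the stores.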
{ .mmi + STFD [C3 ] = f82, SIZE +#ifdef LN + adds C6 = -4 * SIZE, C6 +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#ifndef LN + STFD [C3 ] = f83, SIZE +#else + STFD [C3 ] = f83, - 3 * SIZE +#endif + } + ;; + { .mmf + STFD [C4 ] = f88, SIZE + mov f88 = f0 + } + ;; + { .mmi + STFD [C4 ] = f89, SIZE +#ifdef LN + adds C8 = -4 * SIZE, C8 +#else + nop __LINE__ +#endif + } + ;; + { .mmi + STFD [C4 ] = f90, SIZE + } + ;; + { .mmi +#ifndef LN + STFD [C4 ] = f91, SIZE +#else + STFD [C4 ] = f91, - 3 * SIZE +#endif + nop __LINE__ + } + ;; + { .mmf + STFD [C5 ] = f96, SIZE + mov f96 = f0 + } + ;; + { .mmi + STFD [C5 ] = f97, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [C5 ] = f98, SIZE +#ifdef LN + adds C7 = -4 * SIZE, C7 +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#ifndef LN + STFD [C5 ] = f99, SIZE +#else + STFD [C5 ] = f99, - 3 * SIZE +#endif + } + ;; + { .mmf + STFD [C6 ] = f104, SIZE + mov f104 = f0 + } + ;; + { .mmi + STFD [C6 ] = f105, SIZE + shladd r2 = K, BASE_SHIFT, r0 + } + ;; + { .mmi + STFD [C6 ] = f106, SIZE + sub L = K, KK + } + ;; + { .mmi +#ifndef LN + STFD [C6 ] = f107, SIZE +#else + STFD [C6 ] = f107, - 3 * SIZE +#endif +#ifdef RT + shladd AORIG = r2, 2, AORIG +#else + nop __LINE__ +#endif + } + ;; + { .mmf + STFD [C7 ] = f112, SIZE + mov f112 = f0 + } + ;; + { .mmi + STFD [C7 ] = f113, SIZE +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; + { .mmi + STFD [C7 ] = f114, SIZE +#if defined(LT) || defined(RN) + shladd AOFFSET = L, 2, AOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#ifndef LN + STFD [C7 ] = f115, SIZE +#else + STFD [C7 ] = f115, - 3 * SIZE +#endif +#if defined(LT) || defined(RN) + shladd BOFFSET = L, 3, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmf + STFD [C8 ] = f120, SIZE + mov f120 = f0 + } + ;; + { .mmi + STFD [C8 ] = f121, SIZE +#ifdef LT + adds KK = 4, KK +#elif defined LN + adds KK = -4, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi + STFD [C8 ] = f122, SIZE +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + { .mmb +#ifndef LN + STFD [C8 ] = f123, SIZE +#else + STFD [C8 ] = f123, - 3 * SIZE +#endif + } + ;; + .align 8 + +.L030: + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + tbit.z p6, p0 = M, 1 + (p6) br.cond.dptk .L040 + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 1 + BASE_SHIFT + } + { .mmi + shladd r3 = KK, BASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mmf + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + setf.d f73 = r0 + mov f65 = f0 + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 3, B + mov f65 = f0 +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f73 = f0 + shladd AOFFSET = r3, 1, AORIG + } + ;; +#endif + { .mfi + setf.d f105 = r0 + mov f81 = f0 + adds L = 1, L + } + { .mfi + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + mov f89 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f113 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + setf.d f97 = r0 + mov f121 = f0 + shr L = L, 1 + } + ;; + { .mmf + (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + adds L = -1, L + } + ;; + { .mmf + (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + cmp.eq p6, p0 = -1, L + } + ;; + { .mib + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov ar.lc = L + (p6) br.cond.dpnt .L038 + } + ;; + +.L032: + { .mfb + lfetch.nt1 [PREA], 4 * 
SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f104 = f32, f53, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f120 = f32, f55, f120 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f89 = f33, f51, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f105 = f33, f53, f105 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f121 = f33, f55, f121 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f104 = f40, f61, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f120 = f40, f63, f120 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f105 = f41, f61, f105 // A2 * B6 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f121 = f41, f63, f121 // A2 * B8 + br.cloop.sptk.few .L032 + } + ;; + +.L038: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -2, KK +#else + adds r2 = -8, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 1, AORIG + shladd BOFFSET = r2, 3, B + ;; +#endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET], 2 * SIZE + ;; 
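+// Editorial annotation (not part of the imported source): in the LN/LT path
+// the sixteen right-hand-side values for this 2x8 tile are reloaded from the
+// packed B panel into f32..f47; the FSUBs that follow subtract the
+// accumulated products from them before the triangular solve.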
+ LDFPD f40, f41 = [BOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [BOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [BOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [BOFFSET] + adds BOFFSET = -14 * SIZE, BOFFSET + ;; + { .mfi + FSUB f64 = f32, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f72 = f33, f72 + nop __LINE__ + } + ;; + { .mfi + FSUB f80 = f34, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f88 = f35, f88 + nop __LINE__ + } + ;; + { .mfi + FSUB f96 = f36, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f104 = f37, f104 + nop __LINE__ + } + ;; + { .mfi + FSUB f112 = f38, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f120 = f39, f120 + nop __LINE__ + } + ;; + { .mfi + FSUB f65 = f40, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f73 = f41, f73 + nop __LINE__ + } + ;; + { .mfi + FSUB f81 = f42, f81 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f89 = f43, f89 + nop __LINE__ + } + ;; + { .mfi + FSUB f97 = f44, f97 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f105 = f45, f105 + nop __LINE__ + } + ;; + { .mfi + FSUB f113 = f46, f113 + } + { .mfi + nop __LINE__ + FSUB f121 = f47, f121 + nop __LINE__ + } + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + ;; + LDFPD f40, f41 = [AOFFSET], 2 * SIZE + ;; + LDFPD f42, f43 = [AOFFSET], 2 * SIZE + ;; + LDFPD f44, f45 = [AOFFSET], 2 * SIZE + ;; + LDFPD f46, f47 = [AOFFSET] + adds AOFFSET = -14 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f65 = f33, f65 + + FSUB f72 = f34, f72 + FSUB f73 = f35, f73 + + FSUB f80 = f36, f80 + FSUB f81 = f37, f81 + + FSUB f88 = f38, f88 + FSUB f89 = f39, f89 + ;; + FSUB f96 = f40, f96 + FSUB f97 = f41, f97 + ;; + FSUB f104 = f42, f104 + FSUB f105 = f43, f105 + ;; + FSUB f112 = f44, f112 + FSUB f113 = f45, f113 + ;; + FSUB f120 = f46, f120 + FSUB f121 = f47, f121 + ;; +#endif + +#ifdef LN + adds AOFFSET = 2 * SIZE, AOFFSET + ;; + LDFPD f33, f32 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFD f34 = [AOFFSET] + ;; + FMPY f65 = f65, f32 + FMPY f97 = f97, f32 + FMPY f73 = f73, f32 + FMPY f105 = f105, f32 + FMPY f81 = f81, f32 + FMPY f113 = f113, f32 + FMPY f89 = f89, f32 + FMPY f121 = f121, f32 + ;; + FNMA f64 = f65, f33, f64 + FNMA f96 = f97, f33, f96 + FNMA f72 = f73, f33, f72 + FNMA f104 = f105, f33, f104 + FNMA f80 = f81, f33, f80 + FNMA f112 = f113, f33, f112 + FNMA f88 = f89, f33, f88 + FNMA f120 = f121, f33, f120 + ;; + FMPY f64 = f64, f34 + FMPY f96 = f96, f34 + FMPY f72 = f72, f34 + FMPY f104 = f104, f34 + FMPY f80 = f80, f34 + FMPY f112 = f112, f34 + FMPY f88 = f88, f34 + FMPY f120 = f120, f34 + ;; + adds BOFFSET = 8 * SIZE, BOFFSET + adds BOFFSET2 = 8 * SIZE, BOFFSET2 + ;; + { .mfi + STFD [BOFFSET] = f65, SIZE + } + { .mfi + STFD [BOFFSET2] = f97, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f73, SIZE + } + { .mfi + STFD [BOFFSET2] = f105, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f81, SIZE + } + { .mfi + STFD [BOFFSET2] = f113, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f89, - 11 * SIZE + } + { .mfi + STFD [BOFFSET2] = f121, - 11 * SIZE + } + ;; + { .mmi + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f96, SIZE + adds C1 = -2 * SIZE, C1 + } + ;; + { .mmi + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f104, SIZE + adds C2 = -2 * SIZE, C2 + } + ;; + { .mmi + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f112, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f88, - 3 * SIZE + STFD [BOFFSET2] = f120, - 3 * SIZE + } + ;; + 
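+// Editorial annotation (not part of the imported source): the LN solve for
+// this 2x8 tile is complete and has been written back to the packed B panel;
+// the remaining C pointers C3..C8 are rewound by 2*SIZE below (C1 and C2
+// were already adjusted inside the store bundles) so the write-back lands on
+// the tile just solved.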
adds C3 = -2 * SIZE, C3 + adds C4 = -2 * SIZE, C4 + adds C5 = -2 * SIZE, C5 + adds C6 = -2 * SIZE, C6 + adds C7 = -2 * SIZE, C7 + adds C8 = -2 * SIZE, C8 + ;; +#endif + +#ifdef LT + LDFPD f32, f33 = [AOFFSET] + adds AOFFSET = 3 * SIZE, AOFFSET + ;; + LDFD f34 = [AOFFSET], - 3 * SIZE + ;; + { .mfi + FMPY f64 = f64, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f96 = f96, f32 + nop __LINE__ + } + ;; + { .mfi + FMPY f72 = f72, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f104 = f104, f32 + nop __LINE__ + } + ;; + { .mfi + FMPY f80 = f80, f32 + } + { .mfi + nop __LINE__ + FMPY f112 = f112, f32 + nop __LINE__ + } + ;; + { .mfi + FMPY f88 = f88, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f120 = f120, f32 + nop __LINE__ + } + ;; + { .mfi + FNMA f65 = f64, f33, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f97 = f96, f33, f97 + nop __LINE__ + } + ;; + { .mfi + FNMA f73 = f72, f33, f73 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f105 = f104, f33, f105 + nop __LINE__ + } + ;; + { .mfi + FNMA f81 = f80, f33, f81 + } + { .mfi + nop __LINE__ + FNMA f113 = f112, f33, f113 + nop __LINE__ + } + ;; + { .mfi + FNMA f89 = f88, f33, f89 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f121 = f120, f33, f121 + nop __LINE__ + } + ;; + FMPY f65 = f65, f34 + FMPY f97 = f97, f34 + FMPY f73 = f73, f34 + FMPY f105 = f105, f34 + FMPY f81 = f81, f34 + FMPY f113 = f113, f34 + FMPY f89 = f89, f34 + FMPY f121 = f121, f34 + ;; + { .mfi + STFD [BOFFSET] = f64, SIZE + } + { .mfi + STFD [BOFFSET2] = f96, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f72, SIZE + } + { .mfi + STFD [BOFFSET2] = f104, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f80, SIZE + } + { .mfi + STFD [BOFFSET2] = f112, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f88, 5 * SIZE + } + { .mfi + STFD [BOFFSET2] = f120, 5 * SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f65, SIZE + } + { .mfi + STFD [BOFFSET2] = f97, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f73, SIZE + } + { .mfi + STFD [BOFFSET2] = f105, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f81, SIZE + } + { .mfi + STFD [BOFFSET2] = f113, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f89, -11 * SIZE + } + { .mfi + STFD [BOFFSET2] = f121, -11 * SIZE + } +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f40 = [BOFFSET], 1 * SIZE + ;; + LDFPD f41, f42 = [BOFFSET], 2 * SIZE + ;; + LDFPD f43, f44 = [BOFFSET], 2 * SIZE + ;; + LDFPD f45, f46 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f47, f48 = [BOFFSET], 2 * SIZE + ;; + LDFPD f49, f50 = [BOFFSET], 2 * SIZE + ;; + LDFPD f51, f52 = [BOFFSET] + adds BOFFSET = 5 * SIZE, BOFFSET + ;; + LDFD f53 = [BOFFSET], 1 * SIZE + ;; + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + ;; + LDFPD f56, f57 = [BOFFSET] + adds BOFFSET = 6 * SIZE, BOFFSET + ;; + LDFPD f58, f59 = [BOFFSET], 2 * SIZE + ;; + LDFPD f60, f61 = [BOFFSET] + adds BOFFSET = 7 * SIZE, BOFFSET + ;; + LDFD f16 = [BOFFSET], 1 * SIZE + ;; + LDFPD f17, f18 = [BOFFSET] + adds BOFFSET = 8 * SIZE, BOFFSET + ;; + LDFPD f19, f20 = [BOFFSET] + adds BOFFSET = 9 * SIZE, BOFFSET + ;; + LDFD f21 = [BOFFSET] + adds BOFFSET = -63 * SIZE, BOFFSET + ;; + + FMPY f64 = f64, f32 + FMPY f65 = f65, f32 + ;; + FNMA f72 = f64, f33, f72 + FNMA f73 = f65, f33, f73 + ;; + FNMA f80 = f64, f34, f80 + FNMA f81 = f65, f34, f81 + ;; + FNMA f88 = f64, f35, f88 + FNMA f89 = f65, f35, f89 + ;; + FNMA f96 = f64, f36, f96 + FNMA f97 = f65, 
f36, f97 + ;; + FNMA f104 = f64, f37, f104 + FNMA f105 = f65, f37, f105 + ;; + FNMA f112 = f64, f38, f112 + FNMA f113 = f65, f38, f113 + ;; + FNMA f120 = f64, f39, f120 + FNMA f121 = f65, f39, f121 + ;; + FMPY f72 = f72, f40 + FMPY f73 = f73, f40 + ;; + FNMA f80 = f72, f41, f80 + FNMA f81 = f73, f41, f81 + ;; + FNMA f88 = f72, f42, f88 + FNMA f89 = f73, f42, f89 + ;; + FNMA f96 = f72, f43, f96 + FNMA f97 = f73, f43, f97 + ;; + FNMA f104 = f72, f44, f104 + FNMA f105 = f73, f44, f105 + ;; + FNMA f112 = f72, f45, f112 + FNMA f113 = f73, f45, f113 + ;; + FNMA f120 = f72, f46, f120 + FNMA f121 = f73, f46, f121 + ;; + FMPY f80 = f80, f47 + FMPY f81 = f81, f47 + ;; + FNMA f88 = f80, f48, f88 + FNMA f89 = f81, f48, f89 + ;; + FNMA f96 = f80, f49, f96 + FNMA f97 = f81, f49, f97 + ;; + FNMA f104 = f80, f50, f104 + FNMA f105 = f81, f50, f105 + ;; + FNMA f112 = f80, f51, f112 + FNMA f113 = f81, f51, f113 + ;; + FNMA f120 = f80, f52, f120 + FNMA f121 = f81, f52, f121 + ;; + FMPY f88 = f88, f53 + FMPY f89 = f89, f53 + ;; + FNMA f96 = f88, f54, f96 + FNMA f97 = f89, f54, f97 + ;; + FNMA f104 = f88, f55, f104 + FNMA f105 = f89, f55, f105 + ;; + FNMA f112 = f88, f56, f112 + FNMA f113 = f89, f56, f113 + ;; + FNMA f120 = f88, f57, f120 + FNMA f121 = f89, f57, f121 + ;; + FMPY f96 = f96, f58 + FMPY f97 = f97, f58 + ;; + FNMA f104 = f96, f59, f104 + FNMA f105 = f97, f59, f105 + ;; + FNMA f112 = f96, f60, f112 + FNMA f113 = f97, f60, f113 + ;; + FNMA f120 = f96, f61, f120 + FNMA f121 = f97, f61, f121 + ;; + FMPY f104 = f104, f16 + FMPY f105 = f105, f16 + ;; + FNMA f112 = f104, f17, f112 + FNMA f113 = f105, f17, f113 + ;; + FNMA f120 = f104, f18, f120 + FNMA f121 = f105, f18, f121 + ;; + FMPY f112 = f112, f19 + FMPY f113 = f113, f19 + ;; + FNMA f120 = f112, f20, f120 + FNMA f121 = f113, f20, f121 + ;; + FMPY f120 = f120, f21 + FMPY f121 = f121, f21 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f80, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f81, SIZE + ;; + STFD [AOFFSET] = f72, SIZE + STFD [AOFFSET2] = f88, SIZE + ;; + STFD [AOFFSET] = f73, 5 * SIZE + STFD [AOFFSET2] = f89, 5 * SIZE + ;; + STFD [AOFFSET] = f96, SIZE + STFD [AOFFSET2] = f112, SIZE + ;; + STFD [AOFFSET] = f97, SIZE + STFD [AOFFSET2] = f113, SIZE + ;; + STFD [AOFFSET] = f104, SIZE + STFD [AOFFSET2] = f120, SIZE + ;; + STFD [AOFFSET] = f105, -11 * SIZE + STFD [AOFFSET2] = f121, - 11 * SIZE + ;; +#endif + +#ifdef RT + adds BOFFSET = 62 * SIZE, BOFFSET + ;; + LDFPD f33, f32 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f35, f34 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f37, f36 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f39, f38 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFD f40 = [BOFFSET], -2 * SIZE + ;; + LDFPD f42, f41 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f44, f43 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f46, f45 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f48, f47 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f50, f49 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f52, f51 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFD f53 = [BOFFSET], -2 * SIZE + ;; + LDFPD f55, f54 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f57, f56 = [BOFFSET] + adds BOFFSET = - 6 * SIZE, BOFFSET + ;; + LDFPD f59, f58 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f61, f60 = [BOFFSET] + adds BOFFSET = - 6 * SIZE, BOFFSET + ;; + LDFD f16 = 
[BOFFSET], -2 * SIZE + ;; + LDFPD f18, f17 = [BOFFSET] + adds BOFFSET = - 8 * SIZE, BOFFSET + ;; + LDFPD f20, f19 = [BOFFSET] + adds BOFFSET = - 8 * SIZE, BOFFSET + ;; + LDFD f21 = [BOFFSET] + ;; + + FMPY f120 = f120, f32 + FMPY f121 = f121, f32 + ;; + FNMA f112 = f120, f33, f112 + FNMA f113 = f121, f33, f113 + ;; + FNMA f104 = f120, f34, f104 + FNMA f105 = f121, f34, f105 + ;; + FNMA f96 = f120, f35, f96 + FNMA f97 = f121, f35, f97 + ;; + FNMA f88 = f120, f36, f88 + FNMA f89 = f121, f36, f89 + ;; + FNMA f80 = f120, f37, f80 + FNMA f81 = f121, f37, f81 + ;; + FNMA f72 = f120, f38, f72 + FNMA f73 = f121, f38, f73 + ;; + FNMA f64 = f120, f39, f64 + FNMA f65 = f121, f39, f65 + ;; + FMPY f112 = f112, f40 + FMPY f113 = f113, f40 + ;; + FNMA f104 = f112, f41, f104 + FNMA f105 = f113, f41, f105 + ;; + FNMA f96 = f112, f42, f96 + FNMA f97 = f113, f42, f97 + ;; + FNMA f88 = f112, f43, f88 + FNMA f89 = f113, f43, f89 + ;; + FNMA f80 = f112, f44, f80 + FNMA f81 = f113, f44, f81 + ;; + FNMA f72 = f112, f45, f72 + FNMA f73 = f113, f45, f73 + ;; + FNMA f64 = f112, f46, f64 + FNMA f65 = f113, f46, f65 + ;; + FMPY f104 = f104, f47 + FMPY f105 = f105, f47 + ;; + FNMA f96 = f104, f48, f96 + FNMA f97 = f105, f48, f97 + ;; + FNMA f88 = f104, f49, f88 + FNMA f89 = f105, f49, f89 + ;; + FNMA f80 = f104, f50, f80 + FNMA f81 = f105, f50, f81 + ;; + FNMA f72 = f104, f51, f72 + FNMA f73 = f105, f51, f73 + ;; + FNMA f64 = f104, f52, f64 + FNMA f65 = f105, f52, f65 + ;; + FMPY f96 = f96, f53 + FMPY f97 = f97, f53 + ;; + FNMA f88 = f96, f54, f88 + FNMA f89 = f97, f54, f89 + ;; + FNMA f80 = f96, f55, f80 + FNMA f81 = f97, f55, f81 + ;; + FNMA f72 = f96, f56, f72 + FNMA f73 = f97, f56, f73 + ;; + FNMA f64 = f96, f57, f64 + FNMA f65 = f97, f57, f65 + ;; + FMPY f88 = f88, f58 + FMPY f89 = f89, f58 + ;; + FNMA f80 = f88, f59, f80 + FNMA f81 = f89, f59, f81 + ;; + FNMA f72 = f88, f60, f72 + FNMA f73 = f89, f60, f73 + ;; + FNMA f64 = f88, f61, f64 + FNMA f65 = f89, f61, f65 + ;; + FMPY f80 = f80, f16 + FMPY f81 = f81, f16 + ;; + FNMA f72 = f80, f17, f72 + FNMA f73 = f81, f17, f73 + ;; + FNMA f64 = f80, f18, f64 + FNMA f65 = f81, f18, f65 + ;; + FMPY f72 = f72, f19 + FMPY f73 = f73, f19 + ;; + FNMA f64 = f72, f20, f64 + FNMA f65 = f73, f20, f65 + ;; + FMPY f64 = f64, f21 + FMPY f65 = f65, f21 + ;; + adds AOFFSET = 8 * SIZE, AOFFSET + adds AOFFSET2 = 8 * SIZE, AOFFSET2 + ;; + STFD [AOFFSET] = f96, SIZE + STFD [AOFFSET2] = f112, SIZE + ;; + STFD [AOFFSET] = f97, SIZE + STFD [AOFFSET2] = f113, SIZE + ;; + STFD [AOFFSET] = f104, SIZE + STFD [AOFFSET2] = f120, SIZE + ;; + STFD [AOFFSET] = f105, - 11 * SIZE + STFD [AOFFSET2] = f121, - 11 * SIZE + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f80, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f81, SIZE + ;; + STFD [AOFFSET] = f72, SIZE + STFD [AOFFSET2] = f88, SIZE + ;; + STFD [AOFFSET] = f73, - 3 * SIZE + STFD [AOFFSET2] = f89, - 3 * SIZE + ;; + +#endif + STFD [C1 ] = f64, SIZE + mov f64 = f0 + ;; +#ifndef LN + STFD [C1 ] = f65, SIZE +#else + STFD [C1 ] = f65, -SIZE +#endif + ;; + STFD [C2 ] = f72, SIZE + mov f72 = f0 + ;; +#ifndef LN + STFD [C2 ] = f73, SIZE +#else + STFD [C2 ] = f73, -SIZE +#endif + ;; + STFD [C3 ] = f80, SIZE + mov f80 = f0 + ;; +#ifndef LN + STFD [C3 ] = f81, SIZE +#else + STFD [C3 ] = f81, - SIZE +#endif + ;; + STFD [C4 ] = f88, SIZE + mov f88 = f0 + ;; +#ifndef LN + STFD [C4 ] = f89, SIZE +#else + STFD [C4 ] = f89, -SIZE +#endif + ;; + STFD [C5 ] = f96, SIZE + mov f96 = f0 + ;; +#ifndef LN + STFD [C5 ] = f97, SIZE +#else + STFD [C5 ] = f97, 
-SIZE +#endif + ;; + STFD [C6 ] = f104, SIZE + mov f104 = f0 + ;; +#ifndef LN + STFD [C6 ] = f105, SIZE +#else + STFD [C6 ] = f105, -SIZE +#endif + ;; + shladd r2 = K, BASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + shladd AORIG = r2, 1, AORIG +#else + nop __LINE__ +#endif + ;; + STFD [C7 ] = f112, SIZE + mov f112 = f0 + ;; + { .mmi +#ifndef LN + STFD [C7 ] = f113, SIZE +#else + STFD [C7 ] = f113, -SIZE +#endif + +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd AOFFSET = L, 1, AOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + shladd BOFFSET = L, 3, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmf + STFD [C8 ] = f120, SIZE + mov f120 = f0 + } + ;; + { .mmi +#ifndef LN + STFD [C8 ] = f121, SIZE +#else + STFD [C8 ] = f121, -SIZE +#endif + +#ifdef LT + adds KK = 2, KK +#elif defined LN + adds KK = -2, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + ;; + .align 8 + +.L040: + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + tbit.z p6, p0 = M, 0 + (p6) br.cond.dptk .L049 + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 0 + BASE_SHIFT + } + { .mmi + shladd r3 = KK, BASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mmf + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 3, B +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + add AOFFSET = r3, AORIG + } + ;; +#endif + { .mmi + adds L = 1, L + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mii + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + adds L = -1, L + } + ;; + { .mmi + (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + cmp.eq p6, p0 = -1, L + } + ;; + { .mib + (p7) LDFD f32 = [AOFFSET], 1 * SIZE + mov ar.lc = L + (p6) br.cond.dpnt .L048 + } + ;; + +.L042: + { .mfb + lfetch.nt1 [PREB], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p12) cmp.ne p3, p0 = 0, L + FMA f72 = f32, f49, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFD f40 = [AOFFSET], 1 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f104 = f32, f53, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f120 = f32, f55, f120 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFD f32 = [AOFFSET], 1 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f40, 
f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f104 = f40, f61, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + adds L = -1, L + } + { .mmb + nop __LINE__ + nop __LINE__ + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f120 = f40, f63, f120 // A1 * B8 + nop __LINE__ + } + { .mmb + nop __LINE__ + nop __LINE__ + br.cloop.sptk.few .L042 + } + ;; + +.L048: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -1, KK +#else + adds r2 = -8, KK +#endif + ;; + shladd r2 = r2, BASE_SHIFT, r0 + ;; + add AOFFSET = r2, AORIG + shladd BOFFSET = r2, 3, B + ;; +#endif + adds AOFFSET2 = 4 * SIZE, AOFFSET + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + +#if defined(LN) || defined(LT) + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET] + adds BOFFSET = -6 * SIZE, BOFFSET + ;; + { .mfi + FSUB f64 = f32, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f72 = f33, f72 + nop __LINE__ + } + ;; + { .mfi + FSUB f80 = f34, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f88 = f35, f88 + nop __LINE__ + } + ;; + { .mfi + FSUB f96 = f36, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f104 = f37, f104 + nop __LINE__ + } + ;; + { .mfi + FSUB f112 = f38, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f120 = f39, f120 + nop __LINE__ + } + ;; +#else + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [AOFFSET] + adds AOFFSET = -6 * SIZE, AOFFSET + ;; + FSUB f64 = f32, f64 + FSUB f72 = f33, f72 + FSUB f80 = f34, f80 + FSUB f88 = f35, f88 + FSUB f96 = f36, f96 + FSUB f104 = f37, f104 + FSUB f112 = f38, f112 + FSUB f120 = f39, f120 + ;; +#endif + +#ifdef LN + LDFD f32 = [AOFFSET] + ;; + FMPY f64 = f64, f32 + FMPY f96 = f96, f32 + FMPY f72 = f72, f32 + FMPY f104 = f104, f32 + FMPY f80 = f80, f32 + FMPY f112 = f112, f32 + FMPY f88 = f88, f32 + FMPY f120 = f120, f32 + ;; + { .mmi + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f96, SIZE + adds C1 = -1 * SIZE, C1 + } + ;; + { .mmi + STFD [BOFFSET] = f72, SIZE + STFD [BOFFSET2] = f104, SIZE + adds C2 = -1 * SIZE, C2 + } + ;; + { .mmi + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f112, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f88, - 3 * SIZE + STFD [BOFFSET2] = f120, - 3 * SIZE + } + ;; + adds C3 = -1 * SIZE, C3 + adds C4 = -1 * SIZE, C4 + adds C5 = -1 * SIZE, C5 + adds C6 = -1 * SIZE, C6 + adds C7 = -1 * SIZE, C7 + adds C8 = -1 * SIZE, C8 + ;; +#endif + +#ifdef LT + LDFD f32 = [AOFFSET] + ;; + { .mfi + FMPY f64 = f64, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f96 = f96, f32 + nop __LINE__ + } + ;; + { .mfi + FMPY f72 = f72, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f104 = f104, f32 + nop __LINE__ + } + ;; + { .mfi + FMPY f80 = f80, f32 + } + { .mfi + nop __LINE__ + FMPY f112 = f112, f32 + nop __LINE__ + } + ;; + { .mfi + FMPY f88 = f88, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f120 = f120, f32 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f64, SIZE + } + { .mfi + STFD [BOFFSET2] = f96, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f72, SIZE + } + { .mfi + STFD [BOFFSET2] = f104, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f80, SIZE + } + { .mfi + STFD [BOFFSET2] = f112, SIZE + } + ;; + { .mfi + STFD [BOFFSET] = f88, -3 * SIZE + } + { .mfi + STFD 
[BOFFSET2] = f120, -3 * SIZE + } + ;; +#endif + +#ifdef RN + LDFPD f32, f33 = [BOFFSET], 2 * SIZE + ;; + LDFPD f34, f35 = [BOFFSET], 2 * SIZE + ;; + LDFPD f36, f37 = [BOFFSET], 2 * SIZE + ;; + LDFPD f38, f39 = [BOFFSET] + adds BOFFSET = 3 * SIZE, BOFFSET + ;; + LDFD f40 = [BOFFSET], 1 * SIZE + ;; + LDFPD f41, f42 = [BOFFSET], 2 * SIZE + ;; + LDFPD f43, f44 = [BOFFSET], 2 * SIZE + ;; + LDFPD f45, f46 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f47, f48 = [BOFFSET], 2 * SIZE + ;; + LDFPD f49, f50 = [BOFFSET], 2 * SIZE + ;; + LDFPD f51, f52 = [BOFFSET] + adds BOFFSET = 5 * SIZE, BOFFSET + ;; + LDFD f53 = [BOFFSET], 1 * SIZE + ;; + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + ;; + LDFPD f56, f57 = [BOFFSET] + adds BOFFSET = 6 * SIZE, BOFFSET + ;; + LDFPD f58, f59 = [BOFFSET], 2 * SIZE + ;; + LDFPD f60, f61 = [BOFFSET] + adds BOFFSET = 7 * SIZE, BOFFSET + ;; + LDFD f16 = [BOFFSET], 1 * SIZE + ;; + LDFPD f17, f18 = [BOFFSET] + adds BOFFSET = 8 * SIZE, BOFFSET + ;; + LDFPD f19, f20 = [BOFFSET] + adds BOFFSET = 9 * SIZE, BOFFSET + ;; + LDFD f21 = [BOFFSET] + adds BOFFSET = -63 * SIZE, BOFFSET + ;; + + FMPY f64 = f64, f32 + ;; + FNMA f72 = f64, f33, f72 + ;; + FNMA f80 = f64, f34, f80 + ;; + FNMA f88 = f64, f35, f88 + ;; + FNMA f96 = f64, f36, f96 + ;; + FNMA f104 = f64, f37, f104 + ;; + FNMA f112 = f64, f38, f112 + ;; + FNMA f120 = f64, f39, f120 + ;; + FMPY f72 = f72, f40 + ;; + FNMA f80 = f72, f41, f80 + ;; + FNMA f88 = f72, f42, f88 + ;; + FNMA f96 = f72, f43, f96 + ;; + FNMA f104 = f72, f44, f104 + ;; + FNMA f112 = f72, f45, f112 + ;; + FNMA f120 = f72, f46, f120 + ;; + FMPY f80 = f80, f47 + ;; + FNMA f88 = f80, f48, f88 + ;; + FNMA f96 = f80, f49, f96 + ;; + FNMA f104 = f80, f50, f104 + ;; + FNMA f112 = f80, f51, f112 + ;; + FNMA f120 = f80, f52, f120 + ;; + FMPY f88 = f88, f53 + ;; + FNMA f96 = f88, f54, f96 + ;; + FNMA f104 = f88, f55, f104 + ;; + FNMA f112 = f88, f56, f112 + ;; + FNMA f120 = f88, f57, f120 + ;; + FMPY f96 = f96, f58 + ;; + FNMA f104 = f96, f59, f104 + ;; + FNMA f112 = f96, f60, f112 + ;; + FNMA f120 = f96, f61, f120 + ;; + FMPY f104 = f104, f16 + ;; + FNMA f112 = f104, f17, f112 + ;; + FNMA f120 = f104, f18, f120 + ;; + FMPY f112 = f112, f19 + ;; + FNMA f120 = f112, f20, f120 + ;; + FMPY f120 = f120, f21 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f96, SIZE + ;; + STFD [AOFFSET] = f72, SIZE + STFD [AOFFSET2] = f104, SIZE + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f112, SIZE + ;; + STFD [AOFFSET] = f88, -3 * SIZE + STFD [AOFFSET2] = f120, - 3 * SIZE + ;; +#endif + +#ifdef RT + adds BOFFSET = 62 * SIZE, BOFFSET + ;; + LDFPD f33, f32 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f35, f34 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f37, f36 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f39, f38 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFD f40 = [BOFFSET], -2 * SIZE + ;; + LDFPD f42, f41 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f44, f43 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f46, f45 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f48, f47 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f50, f49 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f52, f51 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFD f53 = [BOFFSET], -2 * SIZE + ;; + LDFPD f55, f54 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f57, f56 = [BOFFSET] + adds BOFFSET = - 6 * SIZE, BOFFSET + ;; + LDFPD f59, f58 
= [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f61, f60 = [BOFFSET] + adds BOFFSET = - 6 * SIZE, BOFFSET + ;; + LDFD f16 = [BOFFSET], -2 * SIZE + ;; + LDFPD f18, f17 = [BOFFSET] + adds BOFFSET = - 8 * SIZE, BOFFSET + ;; + LDFPD f20, f19 = [BOFFSET] + adds BOFFSET = - 8 * SIZE, BOFFSET + ;; + LDFD f21 = [BOFFSET] + ;; + + FMPY f120 = f120, f32 + ;; + FNMA f112 = f120, f33, f112 + ;; + FNMA f104 = f120, f34, f104 + ;; + FNMA f96 = f120, f35, f96 + ;; + FNMA f88 = f120, f36, f88 + ;; + FNMA f80 = f120, f37, f80 + ;; + FNMA f72 = f120, f38, f72 + ;; + FNMA f64 = f120, f39, f64 + ;; + FMPY f112 = f112, f40 + ;; + FNMA f104 = f112, f41, f104 + ;; + FNMA f96 = f112, f42, f96 + ;; + FNMA f88 = f112, f43, f88 + ;; + FNMA f80 = f112, f44, f80 + ;; + FNMA f72 = f112, f45, f72 + ;; + FNMA f64 = f112, f46, f64 + ;; + FMPY f104 = f104, f47 + ;; + FNMA f96 = f104, f48, f96 + ;; + FNMA f88 = f104, f49, f88 + ;; + FNMA f80 = f104, f50, f80 + ;; + FNMA f72 = f104, f51, f72 + ;; + FNMA f64 = f104, f52, f64 + ;; + FMPY f96 = f96, f53 + ;; + FNMA f88 = f96, f54, f88 + ;; + FNMA f80 = f96, f55, f80 + ;; + FNMA f72 = f96, f56, f72 + ;; + FNMA f64 = f96, f57, f64 + ;; + FMPY f88 = f88, f58 + ;; + FNMA f80 = f88, f59, f80 + ;; + FNMA f72 = f88, f60, f72 + ;; + FNMA f64 = f88, f61, f64 + ;; + FMPY f80 = f80, f16 + ;; + FNMA f72 = f80, f17, f72 + ;; + FNMA f64 = f80, f18, f64 + ;; + FMPY f72 = f72, f19 + ;; + FNMA f64 = f72, f20, f64 + ;; + FMPY f64 = f64, f21 + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f96, SIZE + ;; + STFD [AOFFSET] = f72, SIZE + STFD [AOFFSET2] = f104, SIZE + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f112, SIZE + ;; + STFD [AOFFSET] = f88, - 3 * SIZE + STFD [AOFFSET2] = f120, - 3 * SIZE + ;; + +#endif + +#ifndef LN + STFD [C1 ] = f64, SIZE +#else + STFD [C1 ] = f64 +#endif +#ifndef LN + STFD [C2 ] = f72, SIZE +#else + STFD [C2 ] = f72 +#endif +#ifndef LN + STFD [C3 ] = f80, SIZE +#else + STFD [C3 ] = f80 +#endif +#ifndef LN + STFD [C4 ] = f88, SIZE +#else + STFD [C4 ] = f88 +#endif +#ifndef LN + STFD [C5 ] = f96, SIZE +#else + STFD [C5 ] = f96 +#endif +#ifndef LN + STFD [C6 ] = f104, SIZE +#else + STFD [C6 ] = f104 +#endif +#ifndef LN + STFD [C7 ] = f112, SIZE +#else + STFD [C7 ] = f112 +#endif +#ifndef LN + STFD [C8 ] = f120, SIZE +#else + STFD [C8 ] = f120 +#endif + ;; + + mov f64 = f0 + mov f72 = f0 + mov f80 = f0 + mov f88 = f0 + mov f96 = f0 + mov f104 = f0 + mov f112 = f0 + mov f120 = f0 + ;; + shladd r2 = K, BASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + add AORIG = r2, AORIG +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, BASE_SHIFT, r0 +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + add AOFFSET = L, AOFFSET +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + shladd BOFFSET = L, 3, BOFFSET +#else + nop __LINE__ +#endif + ;; +#ifdef LT + adds KK = 1, KK +#elif defined LN + adds KK = -1, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + .align 8 + +.L049: +#ifdef LN + shladd KK8 = K, BASE_SHIFT, r0 + ;; + shladd B = KK8, 3, B +#endif + +#if defined(LT) || defined(RN) + mov B = BOFFSET +#endif + +#ifdef RN + adds KK = 8, KK +#endif + +#ifdef RT + adds KK = -8, KK +#endif + ;; + + { .mmi + mov AOFFSET = A + } + ;; + { .mmb + nop __LINE__ + cmp.lt p6, p0 = 0, J + (p6) br.cond.dptk .L010 + } + ;; + .align 8 + + +.L999: + mov r8 = r0 + adds r9 = 1 * 16, SP + ;; + ldf.fill f16 = [SP], 32 + ldf.fill f17 = [r9], 32 
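+// Editorial annotation (not part of the imported source): the epilogue
+// restores the callee-saved registers f16..f21 spilled in the prologue,
+// then ar.lc, pr and ar.pfs, before returning.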
+ ;; + ldf.fill f18 = [SP], 32 + ldf.fill f19 = [r9], 32 + ;; + ldf.fill f20 = [SP], 32 + ldf.fill f21 = [r9], 32 + ;; + mov ar.lc = ARLC + ;; + mov pr = PR, -1 + ;; + mov ar.pfs = ARPFS + ;; + br.ret.sptk.many b0 + EPILOGUE diff --git a/kernel/ia64/xcopy.S b/kernel/ia64/xcopy.S new file mode 100644 index 0000000000..e58f5eff06 --- /dev/null +++ b/kernel/ia64/xcopy.S @@ -0,0 +1,565 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r32 +#define X1 r33 +#define INCX r34 +#define Y1 r35 +#define INCY r36 + +#define PREX r2 +#define PREY r3 + +#define I r14 +#define J r15 + +#define X2 r16 +#define Y2 r17 +#define INCX2 r18 +#define INCY2 r19 +#define INCX8 r20 +#define INCY8 r21 +#define PR r30 +#define ARLC r31 + +#define PREFETCH_SIZE (8 * 16) + + PROLOGUE + .prologue + PROFCODE + { .mmi + .save ar.lc, ARLC + mov ARLC = ar.lc + } + { .mib + cmp.lt p0, p6 = r0, N + shr I = N, 3 + (p6) br.ret.sptk.many b0 + } + ;; + shl INCX = INCX, ZBASE_SHIFT + shl INCY = INCY, ZBASE_SHIFT + ;; + .body + { .mmi + sub r8 = X1, Y1 + mov r9 = 0xf0 + mov PR = pr + } + { .mmi + shladd INCX2 = INCX, 1, r0 + shladd INCY2 = INCY, 1, r0 + and J = 15, N + } + ;; + { .mmi + shladd INCX8 = INCX, 2, r0 + shladd INCY8 = INCY, 2, r0 + mov pr.rot = 0 + } + { .mmi + and r8 = r9, r8 + cmp.eq p9, p0 = r0, J + adds I = -1, I + } + ;; + { .mmi + adds X2 = 1 * SIZE, X1 + adds Y2 = 1 * SIZE, Y1 + mov ar.ec = 4 + } + { .mmb + cmp.gt p6, p0 = 127, r8 + cmp.eq p16, p0 = r0, r0 + (p6) br.cond.dpnt .L20 + } + ;; + { .mmi + adds PREX = (PREFETCH_SIZE + 0) * SIZE, X1 + adds PREY = (PREFETCH_SIZE + 2) * SIZE, Y1 + mov ar.lc = I + } + { .mib + cmp.eq p8 ,p0 = -1, I + tbit.z p0, p12 = N, 2 + (p8) br.cond.dpnt .L15 + } + ;; + .align 16 + +.L12: + { .mmi + (p19) STFD [Y1] = f35 + (p19) STFD [Y2] = f39 + (p19) add Y1 = INCY, Y1 + } + { .mmi + (p17) LDFD f81 = [X1], INCX + (p17) LDFD f85 = [X2], INCX + (p19) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f43 + (p19) STFD [Y2] = f47 + (p19) add Y1 = INCY, Y1 + } + { .mmi + (p17) LDFD f89 = [X1], INCX + (p17) LDFD f93 = [X2], INCX + (p19) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f51 + (p19) STFD [Y2] = f55 + (p19) add Y1 = INCY, Y1 + } + { .mmi + (p16) LDFD f32 = [X1], INCX + (p16) LDFD f36 = [X2], INCX + (p19) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f59 + (p19) STFD [Y2] = f63 + (p19) add Y1 = INCY, Y1 + } + { .mmi + lfetch.fault.nt1 [PREX], INCX8 + lfetch.fault.excl.nt1 [PREY], INCY8 + (p19) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p16) LDFD f40 = [X1], INCX + (p16) LDFD f44 = [X2], INCX + nop __LINE__ + } + ;; + { .mmi + (p19) STFD [Y1] = f67 + (p19) STFD [Y2] = f71 + (p19) add Y1 = INCY, Y1 + } + { .mmi + (p16) LDFD f48 = [X1], INCX + (p16) LDFD f52 = [X2], INCX + (p19) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f75 + (p19) STFD [Y2] = f79 + (p19) add Y1 = INCY, Y1 + } + { .mmi + (p16) LDFD f56 = [X1], INCX + (p16) LDFD f60 = [X2], INCX + (p19) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f83 + (p19) STFD [Y2] = f87 + (p19) add Y1 = INCY, Y1 + } + { .mmi + lfetch.fault.nt1 [PREX], INCX8 + lfetch.fault.excl.nt1 [PREY], INCY8 + (p19) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f91 + (p19) STFD [Y2] = f95 + (p19) add Y1 = INCY, Y1 + } + { .mmi + (p16) LDFD f64 = [X1], INCX + (p16) LDFD f68 = [X2], INCX + (p19) add Y2 = INCY, Y2 + } + ;; + { .mmb + (p16) LDFD f72 = [X1], INCX + (p16) LDFD f76 = [X2], INCX + br.ctop.sptk.few .L12 + } + ;; + .align 32 + +.L15: + { .mmi + (p12) LDFD f48 = [X1], INCX + (p12) LDFD f49 = [X2], INCX + mov ar.lc = ARLC + } + ;; + { .mmi + (p12) LDFD f50 = [X1], INCX + (p12) LDFD f51 = [X2], INCX + mov pr = PR, -65474 + } + ;; + { .mmb + (p12) LDFD f52 = [X1], INCX + (p12) LDFD f53 = [X2], INCX + (p9) br.ret.sptk.many b0 + } + ;; + { .mmi + (p12) LDFD f54 = [X1], INCX + (p12) LDFD f55 = [X2], 
INCX + tbit.z p0, p13 = N, 1 + } + ;; + { .mmi + (p13) LDFD f56 = [X1], INCX + (p13) LDFD f57 = [X2], INCX + tbit.z p0, p14 = N, 0 + } + ;; + { .mmi + (p13) LDFD f58 = [X1], INCX + (p13) LDFD f59 = [X2], INCX + } + ;; + { .mmi + (p12) STFD [Y1] = f48 + (p12) STFD [Y2] = f49 + (p12) add Y1 = INCY, Y1 + } + { .mmi + (p14) LDFD f60 = [X1], INCX + (p14) LDFD f61 = [X2], INCX + (p12) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p12) STFD [Y1] = f50 + (p12) STFD [Y2] = f51 + (p12) add Y1 = INCY, Y1 + } + { .mmi + nop __LINE__ + (p12) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p12) STFD [Y1] = f52 + (p12) STFD [Y2] = f53 + (p12) add Y1 = INCY, Y1 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p12) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p12) STFD [Y1] = f54 + (p12) STFD [Y2] = f55 + (p12) add Y1 = INCY, Y1 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p12) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p13) STFD [Y1] = f56 + (p13) STFD [Y2] = f57 + (p13) add Y1 = INCY, Y1 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p13) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p13) STFD [Y1] = f58 + (p13) STFD [Y2] = f59 + (p13) add Y1 = INCY, Y1 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p13) add Y2 = INCY, Y2 + } + ;; + { .mmb + (p14) STFD [Y1] = f60 + (p14) STFD [Y2] = f61 + br.ret.sptk.many b0 + } + ;; + .align 16 + +.L20: + { .mmi + adds PREX = (PREFETCH_SIZE + 0) * SIZE, X1 + adds PREY = (PREFETCH_SIZE + 10) * SIZE, Y1 + mov ar.lc = I + } + { .mib + cmp.eq p8 ,p0 = -1, I + tbit.z p0, p12 = N, 2 + (p8) br.cond.dpnt .L25 + } + ;; + .align 16 + +.L22: + { .mmi + (p19) STFD [Y1] = f67 + (p19) STFD [Y2] = f71 + (p19) add Y1 = INCY, Y1 + } + { .mmi + (p17) LDFD f81 = [X1], INCX + (p17) LDFD f85 = [X2], INCX + (p19) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f75 + (p19) STFD [Y2] = f79 + (p19) add Y1 = INCY, Y1 + } + { .mmi + (p17) LDFD f89 = [X1], INCX + (p17) LDFD f93 = [X2], INCX + (p19) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f83 + (p19) STFD [Y2] = f87 + (p19) add Y1 = INCY, Y1 + } + { .mmi + (p16) LDFD f32 = [X1], INCX + (p16) LDFD f36 = [X2], INCX + (p19) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f91 + (p19) STFD [Y2] = f95 + (p19) add Y1 = INCY, Y1 + } + { .mmi + lfetch.fault.nt1 [PREX], INCX8 + lfetch.fault.excl.nt1 [PREY], INCY8 + (p19) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p16) LDFD f40 = [X1], INCX + (p16) LDFD f44 = [X2], INCX + nop __LINE__ + } + ;; + { .mmi + (p18) STFD [Y1] = f34 + (p18) STFD [Y2] = f38 + (p18) add Y1 = INCY, Y1 + } + { .mmi + (p16) LDFD f48 = [X1], INCX + (p16) LDFD f52 = [X2], INCX + (p18) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p18) STFD [Y1] = f42 + (p18) STFD [Y2] = f46 + (p18) add Y1 = INCY, Y1 + } + { .mmi + (p16) LDFD f56 = [X1], INCX + (p16) LDFD f60 = [X2], INCX + (p18) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p18) STFD [Y1] = f50 + (p18) STFD [Y2] = f54 + (p18) add Y1 = INCY, Y1 + } + { .mmi + lfetch.fault.nt1 [PREX], INCX8 + lfetch.fault.excl.nt1 [PREY], INCY8 + (p18) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p18) STFD [Y1] = f58 + (p18) STFD [Y2] = f62 + (p18) add Y1 = INCY, Y1 + } + { .mmi + (p16) LDFD f64 = [X1], INCX + (p16) LDFD f68 = [X2], INCX + (p18) add Y2 = INCY, Y2 + } + ;; + { .mmb + (p16) LDFD f72 = [X1], INCX + (p16) LDFD f76 = [X2], INCX + br.ctop.sptk.few .L22 + } + ;; + .align 32 + +.L25: + { .mmi + (p12) LDFD f48 = [X1], INCX + (p12) LDFD f49 = [X2], INCX + mov ar.lc = ARLC + } + ;; + { .mmi + (p12) LDFD f50 = [X1], INCX + (p12) LDFD f51 = [X2], INCX + mov pr = PR, -65474 + } + ;; + { .mmb + (p12) LDFD f52 = [X1], INCX + (p12) LDFD 
f53 = [X2], INCX + (p9) br.ret.sptk.many b0 + } + ;; + { .mmi + (p12) LDFD f54 = [X1], INCX + (p12) LDFD f55 = [X2], INCX + tbit.z p0, p13 = N, 1 + } + ;; + { .mmi + (p13) LDFD f56 = [X1], INCX + (p13) LDFD f57 = [X2], INCX + tbit.z p0, p14 = N, 0 + } + ;; + { .mmi + (p13) LDFD f58 = [X1], INCX + (p13) LDFD f59 = [X2], INCX + } + ;; + { .mmi + (p12) STFD [Y1] = f48 + (p12) STFD [Y2] = f49 + (p12) add Y1 = INCY, Y1 + } + { .mmi + (p14) LDFD f60 = [X1], INCX + (p14) LDFD f61 = [X2], INCX + (p12) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p12) STFD [Y1] = f50 + (p12) STFD [Y2] = f51 + (p12) add Y1 = INCY, Y1 + } + { .mmi + nop __LINE__ + (p12) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p12) STFD [Y1] = f52 + (p12) STFD [Y2] = f53 + (p12) add Y1 = INCY, Y1 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p12) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p12) STFD [Y1] = f54 + (p12) STFD [Y2] = f55 + (p12) add Y1 = INCY, Y1 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p12) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p13) STFD [Y1] = f56 + (p13) STFD [Y2] = f57 + (p13) add Y1 = INCY, Y1 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p13) add Y2 = INCY, Y2 + } + ;; + { .mmi + (p13) STFD [Y1] = f58 + (p13) STFD [Y2] = f59 + (p13) add Y1 = INCY, Y1 + } + { .mmi + nop __LINE__ + nop __LINE__ + (p13) add Y2 = INCY, Y2 + } + ;; + { .mmb + (p14) STFD [Y1] = f60 + (p14) STFD [Y2] = f61 + br.ret.sptk.many b0 + } + ;; + EPILOGUE + diff --git a/kernel/ia64/xdot.S b/kernel/ia64/xdot.S new file mode 100644 index 0000000000..9322b4bc05 --- /dev/null +++ b/kernel/ia64/xdot.S @@ -0,0 +1,518 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define PREFETCH_SIZE (4 * 24) + +#ifdef F_INTERFACE +#define N r33 +#define X1 r34 +#define INCX r35 +#define Y1 r36 +#define INCY r37 +#else +#define N r32 +#define X1 r33 +#define INCX r34 +#define Y1 r35 +#define INCY r36 +#endif + +#define PREX1 r2 +#define PREY1 r3 + +#define I r14 +#define J r15 +#define Y2 r16 +#define X2 r17 + +#define INCX4 r24 +#define INCY4 r25 + +#define PR r30 +#define ARLC r31 + + PROLOGUE + .prologue + PROFCODE + { .mfi + nop __LINE__ + mov f8 = f0 + .save ar.lc, ARLC + mov ARLC = ar.lc + } + { .mfi + mov r26 = 1 + mov f9 = f0 + nop __LINE__ + } + ;; + .body +#ifdef F_INTERFACE + LDINT N = [N] + LDINT INCX = [INCX] + LDINT INCY = [INCY] + ;; +#ifndef USE64BITINT + sxt4 N = N + sxt4 INCX = INCX + sxt4 INCY = INCY + ;; +#endif + cmp.le p0, p6 = r0, INCX + cmp.le p0, p7 = r0, INCY + sub r26 = r26, N + ;; + setf.sig f32 = r26 + setf.sig f33 = INCX + setf.sig f34 = INCY + ;; + xmpy.l f33 = f32, f33 + xmpy.l f34 = f32, f34 + ;; + getf.sig r26 = f33 + getf.sig r27 = f34 + ;; + shl r26 = r26, ZBASE_SHIFT + shl r27 = r27, ZBASE_SHIFT + ;; + (p6) add X1 = r26, X1 + (p7) add Y1 = r27, Y1 + ;; +#endif + { .mfi + adds PREX1 = (PREFETCH_SIZE + 2) * SIZE, X1 + mov f10 = f0 + mov PR = pr + } + { .mfb + cmp.lt p0, p6 = r0, N + mov f11 = f0 + (p6) br.cond.spnt .L1000 + } + ;; + { .mii + adds PREY1 = (PREFETCH_SIZE + 2) * SIZE, Y1 + shl INCX = INCX, ZBASE_SHIFT + shl INCY = INCY, ZBASE_SHIFT + } + ;; + { .mfi + add X2 = SIZE, X1 + mov f12 = f0 + mov pr.rot= 0 + } + { .mfi + add Y2 = SIZE, Y1 + mov f13 = f0 + shr I = N, 3 + } + ;; + { .mfi + adds I = -1, I + mov f14 = f0 + mov ar.ec= 3 + } + { .mmf + shladd INCX4 = INCX, 2, r0 + shladd INCY4 = INCY, 2, r0 + mov f15 = f0 + } + ;; + { .mmi + and J = 7, N + cmp.eq p16, p0 = r0, r0 + mov ar.lc = I + } + { .mib + cmp.eq p6 ,p0 = -1, I + tbit.nz p12, p0 = N, 2 + (p6) br.cond.dpnt .L215 + } + ;; + .align 32 + +.L212: + { .mmf + (p16) lfetch.nt1 [PREX1], INCX4 + (p16) LDFD f80 = [X1], INCX + (p18) FMA f8 = f34, f82, f8 + } + { .mmf + (p16) LDFD f83 = [X2], INCX + nop __LINE__ + (p18) FMA f9 = f37, f82, f9 + } + ;; + { .mmf + (p16) LDFD f32 = [Y1], INCY + (p16) LDFD f35 = [Y2], INCY + (p18) FMA f10 = f34, f85, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p18) FMA f11 = f37, f85, f11 + } + ;; + { .mmf + (p16) LDFD f86 = [X1], INCX + (p16) LDFD f89 = [X2], INCX + (p18) FMA f12 = f40, f88, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p18) FMA f13 = f43, f88, f13 + } + ;; + { .mmf + (p16) LDFD f38 = [Y1], INCY + (p16) LDFD f41 = [Y2], INCY + (p18) FMA f14 = f40, f91, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p18) FMA f15 = f43, f91, f15 + } + ;; + { .mmf + (p16) LDFD f92 = [X1], INCX + (p16) LDFD f95 = [X2], INCX + (p18) FMA f8 = f46, f94, f8 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p18) FMA f9 = f49, f94, f9 + } + ;; + { .mmf + (p16) lfetch.nt1 [PREY1], INCY4 + (p16) LDFD f44 = [Y1], INCY + (p18) FMA f10 = f46, f97, f10 + } + { .mmf + (p16) LDFD f47 = [Y2], INCY + nop __LINE__ + (p18) FMA f11 = f49, f97, f11 + } + ;; + { .mmf + (p16) LDFD f98 = [X1], INCX + (p16) LDFD f101 = [X2], INCX + (p18) FMA f12 = f52, f100, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p18) FMA f13 = f55, f100, f13 + } + ;; + { .mmf + (p16) LDFD f50 = [Y1], INCY + (p16) LDFD f53 = [Y2], INCY + (p18) FMA f14 = f52, f103, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p18) FMA f15 = f55, f103, f15 + } + ;; + { .mmf + 
(p16) lfetch.nt1 [PREX1], INCX4 + (p16) LDFD f104 = [X1], INCX + (p18) FMA f8 = f58, f106, f8 + } + { .mmf + (p16) LDFD f107 = [X2], INCX + nop __LINE__ + (p18) FMA f9 = f61, f106, f9 + } + ;; + { .mmf + (p16) LDFD f56 = [Y1], INCY + (p16) LDFD f59 = [Y2], INCY + (p18) FMA f10 = f58, f109, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p18) FMA f11 = f61, f109, f11 + } + ;; + { .mmf + (p16) LDFD f110 = [X1], INCX + (p16) LDFD f113 = [X2], INCX + (p18) FMA f12 = f64, f112, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p18) FMA f13 = f67, f112, f13 + } + ;; + { .mmf + (p16) LDFD f62 = [Y1], INCY + (p16) LDFD f65 = [Y2], INCY + (p18) FMA f14 = f64, f115, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p18) FMA f15 = f67, f115, f15 + } + ;; + { .mmf + (p16) lfetch.nt1 [PREY1], INCY4 + (p16) LDFD f116 = [X1], INCX + (p18) FMA f8 = f70, f118, f8 + } + { .mmf + (p16) LDFD f119 = [X2], INCX + nop __LINE__ + (p18) FMA f9 = f73, f118, f9 + } + ;; + { .mmf + (p16) LDFD f68 = [Y1], INCY + (p16) LDFD f71 = [Y2], INCY + (p18) FMA f10 = f70, f121, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p18) FMA f11 = f73, f121, f11 + } + ;; + { .mmf + (p16) LDFD f122 = [X1], INCX + (p16) LDFD f125 = [X2], INCX + (p18) FMA f12 = f76, f124, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p18) FMA f13 = f79, f124, f13 + } + ;; + { .mmf + (p16) LDFD f74 = [Y1], INCY + (p16) LDFD f77 = [Y2], INCY + (p18) FMA f14 = f76, f127, f14 + } + { .mfb + nop __LINE__ + (p18) FMA f15 = f79, f127, f15 + br.ctop.sptk.few .L212 + } + ;; + .align 32 + +.L215: + { .mmi + (p12) LDFD f48 = [X1], INCX + (p12) LDFD f49 = [X2], INCX + cmp.eq p7, p0 = r0, J + } + ;; + { .mmb + (p12) LDFD f32 = [Y1], INCY + (p12) LDFD f33 = [Y2], INCY + (p7) br.cond.dptk .L999 + } + ;; + { .mmi + (p12) LDFD f50 = [X1], INCX + (p12) LDFD f51 = [X2], INCX + tbit.nz p13, p0 = N, 1 + } + ;; + { .mmi + (p12) LDFD f34 = [Y1], INCY + (p12) LDFD f35 = [Y2], INCY + nop __LINE__ + } + ;; + { .mmi + (p12) LDFD f52 = [X1], INCX + (p12) LDFD f53 = [X2], INCX + tbit.nz p14, p0 = N, 0 + } + ;; + { .mmi + (p12) LDFD f36 = [Y1], INCY + (p12) LDFD f37 = [Y2], INCY + nop __LINE__ + } + ;; + { .mmf + (p12) LDFD f54 = [X1], INCX + (p12) LDFD f55 = [X2], INCX + (p12) FMA f8 = f32, f48, f8 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FMA f9 = f33, f48, f9 + } + ;; + { .mmf + (p12) LDFD f38 = [Y1], INCY + (p12) LDFD f39 = [Y2], INCY + (p12) FMA f10 = f32, f49, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FMA f11 = f33, f49, f11 + } + ;; + { .mmf + (p13) LDFD f56 = [X1], INCX + (p13) LDFD f57 = [X2], INCX + (p12) FMA f12 = f34, f50, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FMA f13 = f35, f50, f13 + } + ;; + { .mmf + (p13) LDFD f40 = [Y1], INCY + (p13) LDFD f41 = [Y2], INCY + (p12) FMA f14 = f34, f51, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FMA f15 = f35, f51, f15 + } + ;; + { .mmf + (p13) LDFD f58 = [X1], INCX + (p13) LDFD f59 = [X2], INCX + (p12) FMA f8 = f36, f52, f8 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FMA f9 = f37, f52, f9 + } + ;; + { .mmf + (p13) LDFD f42 = [Y1], INCY + (p13) LDFD f43 = [Y2], INCY + (p12) FMA f10 = f36, f53, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FMA f11 = f37, f53, f11 + } + ;; + { .mmf + (p14) LDFD f60 = [X1] + (p14) LDFD f61 = [X2] + (p12) FMA f12 = f38, f54, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FMA f13 = f39, f54, f13 + } + ;; + { .mmf + (p14) LDFD f44 = [Y1] + (p14) LDFD f45 = [Y2] + (p12) FMA f14 = f38, f55, f14 + } + { .mmf + nop __LINE__ + nop 
__LINE__ + (p12) FMA f15 = f39, f55, f15 + } + ;; + (p13) FMA f8 = f40, f56, f8 + (p13) FMA f9 = f41, f56, f9 + (p13) FMA f10 = f40, f57, f10 + (p13) FMA f11 = f41, f57, f11 + (p13) FMA f12 = f42, f58, f12 + (p13) FMA f13 = f43, f58, f13 + (p13) FMA f14 = f42, f59, f14 + (p13) FMA f15 = f43, f59, f15 + ;; + (p14) FMA f8 = f44, f60, f8 + (p14) FMA f9 = f45, f60, f9 + (p14) FMA f10 = f44, f61, f10 + (p14) FMA f11 = f45, f61, f11 + ;; + .align 32 + +.L999: + FADD f8 = f8, f12 + FADD f9 = f9, f13 + FADD f10 = f10, f14 + FADD f11 = f11, f15 + mov ar.lc = ARLC + ;; +#ifndef CONJ + FSUB f8 = f8, f11 + FADD f9 = f9, f10 +#else + FADD f8 = f8, f11 + FSUB f9 = f9, f10 +#endif + ;; + .align 32 + +.L1000: +#ifdef F_INTERFACE + STFD [r32] = f8, SIZE + ;; + STFD [r32] = f9, SIZE +#endif + mov pr = PR, -65474 + br.ret.sptk.many b0 + EPILOGUE + diff --git a/kernel/ia64/zaxpy.S b/kernel/ia64/zaxpy.S new file mode 100644 index 0000000000..c0f14fe690 --- /dev/null +++ b/kernel/ia64/zaxpy.S @@ -0,0 +1,822 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef XDOUBLE +#define PREFETCH_SIZE ( 8 * 16) +#elif defined(DOUBLE) +#define PREFETCH_SIZE (16 * 16) +#else +#define PREFETCH_SIZE (32 * 16) +#endif + +#ifndef CONJ +#define FMA1 FNMA +#define FMA2 FMA +#else +#define FMA1 FMA +#define FMA2 FNMA +#endif + +#define SP r12 + +#ifdef XDOUBLE +#define N r32 +#define X1 r14 +#define INCX r15 +#define Y1 r16 +#define INCY r17 +#else +#define N r32 +#define X1 r37 +#define INCX r38 +#define Y1 r39 +#define INCY r36 +#endif + +#define PREX1 r2 +#define PREY1 r3 + +#define I r18 +#define J r19 +#define Y2 r20 +#define X2 r21 +#define INCX8 r22 +#define INCY8 r23 +#define YY1 r24 +#define YY2 r25 +#define YY3 r26 +#define YY4 r27 + +#define INCX2M1 loc0 +#define INCY2M1 loc1 +#define INCX4M1 loc2 +#define INCY4M1 loc3 +#define X3 loc4 +#define Y3 loc5 +#define X4 loc6 +#define Y4 loc7 +#define PREX2 loc8 +#define PREY2 loc9 + +#define ARLC r29 +#define PR r30 + +#define ALPHA_R f8 +#define ALPHA_I f9 + + PROLOGUE + .prologue + PROFCODE + + { .mmi + adds r14 = 16, SP + adds r15 = 24, SP + adds r16 = 32, SP + } + { .mmb + adds r17 = 40, SP + cmp.gt p15, p0 = r0, N + (p15) br.ret.sptk.many b0 + } + ;; +#ifdef XDOUBLE + { .mmi + ld8 X1 = [r14] + ld8 INCX = [r15] + nop __LINE__ + } + { .mmi + ld8 Y1 = [r16] + ld8 INCY = [r17] + nop __LINE__ + } + ;; +#else + { .mmi + ld8 INCY = [r14] + nop __LINE__ + nop __LINE__ + } + ;; +#endif + { .mmi + .save ar.pfs, r10 + alloc r10 = ar.pfs, 8, 16, 0, 0 + and J = 7, N + shl INCX = INCX, ZBASE_SHIFT + } + { .mmi + adds PREX1 = (PREFETCH_SIZE + 2) * SIZE, X1 + adds PREY1 = (PREFETCH_SIZE + 2) * SIZE, Y1 + shl INCY = INCY, ZBASE_SHIFT + } + ;; + { .mmi + shladd INCX8 = INCX, 3, r0 + shladd INCY8 = INCY, 3, r0 + .save ar.lc, ARLC + mov ARLC = ar.lc + } + { .mmi + adds INCX2M1 = -SIZE, INCX + adds INCY2M1 = -SIZE, INCY + shr I = N, 3 + } + ;; + { .mmi + add INCX2M1 = INCX2M1, INCX + add INCY2M1 = INCY2M1, INCY + mov PR = pr + } + { .mmi + add X2 = X1, INCX + add Y2 = Y1, INCY + nop __LINE__ + } + ;; + { .mmi + shladd INCX4M1 = INCX, 1, INCX2M1 + shladd INCY4M1 = INCY, 1, INCY2M1 + mov pr.rot= 0 + } + { .mmi + shladd X3 = INCX, 1, X1 + shladd Y3 = INCY, 1, Y1 + } + ;; + { .mmi + shladd X4 = INCX, 1, X2 + shladd Y4 = INCY, 1, Y2 + adds I = -1, I + } + { .mmi + cmp.eq p16, p0 = r0, r0 + and r8 = 127, Y1 + and PREX1 = -128, PREX1 + } + ;; + { .mmi + mov YY1 = Y1 + mov YY2 = Y2 + mov ar.ec = 3 + } + { .mmi + mov YY3 = Y3 + mov YY4 = Y4 + or PREX1 = PREX1, r8 + } + ;; + { .mmi + shladd PREX2 = INCX, 2, PREX1 + shladd PREY2 = INCY, 2, PREY1 + mov ar.lc = I + } + { .mib + cmp.eq p11 ,p0 = -1, I + tbit.z p0, p13 = N, 2 + (p11) br.cond.dpnt .L25 + } + ;; + .align 32 + +.L22: +#ifdef XDOUBLE + { .mmf + (p16) LDFD f80 = [Y1], 1 * SIZE + (p16) LDFD f83 = [Y2], 1 * SIZE + (p18) FMA1 f82 = ALPHA_I, f40, f82 + } + { .mmf + (p16) LDFD f92 = [Y3], 1 * SIZE + (p16) LDFD f95 = [Y4], 1 * SIZE + (p18) FMA1 f85 = ALPHA_I, f43, f85 + } + ;; + { .mmf + (p16) LDFD f86 = [Y1], INCY4M1 + (p16) LDFD f89 = [Y2], INCY4M1 + (p18) FMA1 f94 = ALPHA_I, f52, f94 + } + { .mmf + (p16) LDFD f98 = [Y3], INCY4M1 + (p16) LDFD f101 = [Y4], INCY4M1 + (p18) FMA1 f97 = ALPHA_I, f55, f97 + } + ;; + { .mmf + (p16) LDFD f32 = [X1], 1 * SIZE + (p16) LDFD f35 = [X2], 1 * SIZE + (p18) FMA f88 = ALPHA_I, f34, f88 + } + { .mmf + (p16) LDFD f44 = [X3], 1 * SIZE + (p16) LDFD f47 = [X4], 1 * SIZE + (p18) FMA f91 = ALPHA_I, f37, f91 + } + ;; + { .mmf + 
(p16) LDFD f38 = [X1], INCX4M1 + (p16) LDFD f41 = [X2], INCX4M1 + (p18) FMA f100 = ALPHA_I, f46, f100 + } + { .mmf + (p16) LDFD f50 = [X3], INCX4M1 + (p16) LDFD f53 = [X4], INCX4M1 + (p18) FMA f103 = ALPHA_I, f49, f103 + } + ;; + { .mmf + (p18) STFD [YY1] = f82, 1 * SIZE + (p18) STFD [YY2] = f85, 1 * SIZE + (p18) FMA f106 = ALPHA_R, f58, f106 + } + { .mmf + (p19) add YY3 = YY3, INCY4M1 + (p19) add YY4 = YY4, INCY4M1 + (p18) FMA f109 = ALPHA_R, f61, f109 + } + ;; + { .mmf + (p18) STFD [YY3] = f94, 1 * SIZE + (p18) STFD [YY4] = f97, 1 * SIZE + (p18) FMA f118 = ALPHA_R, f70, f118 + } + { .mmf + (p16) lfetch.excl.nt1 [PREY1], INCY8 + (p16) lfetch.excl.nt1 [PREY2], INCY8 + (p18) FMA f121 = ALPHA_R, f73, f121 + } + ;; + { .mmf + (p18) STFD [YY1] = f88 + (p18) STFD [YY2] = f91 + (p18) FMA2 f112 = ALPHA_R, f64, f112 + } + { .mmf + (p18) add YY1 = YY1, INCY4M1 + (p18) add YY2 = YY2, INCY4M1 + (p18) FMA2 f115 = ALPHA_R, f67, f115 + } + ;; + { .mmf + (p18) STFD [YY3] = f100 + (p18) STFD [YY4] = f103 + (p18) FMA2 f124 = ALPHA_R, f76, f124 + } + { .mmf + (p18) add YY3 = YY3, INCY4M1 + (p18) add YY4 = YY4, INCY4M1 + (p18) FMA2 f127 = ALPHA_R, f79, f127 + } + ;; + { .mmf + (p16) LDFD f104 = [Y1], 1 * SIZE + (p16) LDFD f107 = [Y2], 1 * SIZE + (p18) FMA1 f106 = ALPHA_I, f64, f106 + } + { .mmf + (p16) LDFD f116 = [Y3], 1 * SIZE + (p16) LDFD f119 = [Y4], 1 * SIZE + (p18) FMA1 f109 = ALPHA_I, f67, f109 + } + ;; + { .mmf + (p16) LDFD f110 = [Y1], INCY4M1 + (p16) LDFD f113 = [Y2], INCY4M1 + (p18) FMA1 f118 = ALPHA_I, f76, f118 + } + { .mmf + (p16) LDFD f122 = [Y3], INCY4M1 + (p16) LDFD f125 = [Y4], INCY4M1 + (p18) FMA1 f121 = ALPHA_I, f79, f121 + } + ;; + { .mmf + (p16) LDFD f56 = [X1], 1 * SIZE + (p16) LDFD f59 = [X2], 1 * SIZE + (p18) FMA f112 = ALPHA_I, f58, f112 + } + { .mmf + (p16) LDFD f68 = [X3], 1 * SIZE + (p16) LDFD f71 = [X4], 1 * SIZE + (p18) FMA f115 = ALPHA_I, f61, f115 + } + ;; + { .mmf + (p16) LDFD f62 = [X1], INCX4M1 + (p16) LDFD f65 = [X2], INCX4M1 + (p18) FMA f124 = ALPHA_I, f70, f124 + } + { .mmf + (p16) LDFD f74 = [X3], INCX4M1 + (p16) LDFD f77 = [X4], INCX4M1 + (p18) FMA f127 = ALPHA_I, f73, f127 + } + ;; + { .mmf + (p18) STFD [YY1] = f106, 1 * SIZE + (p18) STFD [YY2] = f109, 1 * SIZE + (p17) FMA f81 = ALPHA_R, f33, f81 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p17) FMA f84 = ALPHA_R, f36, f84 + } + ;; + { .mmf + (p18) STFD [YY3] = f118, 1 * SIZE + (p18) STFD [YY4] = f121, 1 * SIZE + (p17) FMA f93 = ALPHA_R, f45, f93 + } + { .mmf + (p16) lfetch.nt1 [PREX1], INCX8 + (p16) lfetch.nt1 [PREX2], INCX8 + (p17) FMA f96 = ALPHA_R, f48, f96 + } + ;; + { .mmf + (p18) STFD [YY1] = f112 + (p18) STFD [YY2] = f115 + (p17) FMA2 f87 = ALPHA_R, f39, f87 + } + { .mmf + (p18) add YY1 = YY1, INCY4M1 + (p18) add YY2 = YY2, INCY4M1 + (p17) FMA2 f90 = ALPHA_R, f42, f90 + } + ;; + { .mmf + (p18) STFD [YY3] = f124 + (p18) STFD [YY4] = f127 + (p17) FMA2 f99 = ALPHA_R, f51, f99 + } + { .mfb + nop __LINE__ + (p17) FMA2 f102 = ALPHA_R, f54, f102 + br.ctop.sptk.few .L22 + } + ;; + ;; + (p19) add YY3 = YY3, INCY4M1 + (p19) add YY4 = YY4, INCY4M1 + ;; +#else + { .mmf + (p19) STFD [YY3] = f125 + (p19) STFD [YY4] = f32 + (p18) FMA2 f100 = ALPHA_R, f52, f100 + } + { .mmf + (p16) lfetch.excl.nt1 [PREY1], INCY8 + nop __LINE__ + (p18) FMA2 f103 = ALPHA_R, f55, f103 + } + ;; + { .mmf + (p16) LDFD f80 = [Y1], 1 * SIZE + (p16) LDFD f83 = [Y2], 1 * SIZE + (p18) FMA1 f82 = ALPHA_I, f40, f82 + } + { .mmf + (p16) LDFD f92 = [Y3], 1 * SIZE + (p16) LDFD f95 = [Y4], 1 * SIZE + (p18) FMA1 f85 = ALPHA_I, f43, f85 + } + ;; + { .mmf + 
(p16) LDFD f86 = [Y1], INCY4M1 + (p16) LDFD f89 = [Y2], INCY4M1 + (p18) FMA1 f94 = ALPHA_I, f52, f94 + } + { .mmf + (p19) add YY3 = YY3, INCY4M1 + (p19) add YY4 = YY4, INCY4M1 + (p18) FMA1 f97 = ALPHA_I, f55, f97 + } + ;; + { .mmf + (p16) LDFD f98 = [Y3], INCY4M1 + (p16) LDFD f101 = [Y4], INCY4M1 + (p18) FMA f88 = ALPHA_I, f34, f88 + } + { .mmf + (p19) add YY1 = YY1, INCY4M1 + (p19) add YY2 = YY2, INCY4M1 + (p18) FMA f91 = ALPHA_I, f37, f91 + } + ;; + { .mmf + (p16) LDFD f32 = [X1], 1 * SIZE + (p16) LDFD f35 = [X2], 1 * SIZE + (p18) FMA f100 = ALPHA_I, f46, f100 + } + { .mmf + (p16) LDFD f44 = [X3], 1 * SIZE + (p16) LDFD f47 = [X4], 1 * SIZE + (p18) FMA f103 = ALPHA_I, f49, f103 + } + ;; + { .mmf + (p18) STFD [YY1] = f82, 1 * SIZE + (p18) STFD [YY2] = f85, 1 * SIZE + (p18) FMA f106 = ALPHA_R, f58, f106 + } + { .mmf + (p16) LDFD f38 = [X1], INCX4M1 + (p16) LDFD f41 = [X2], INCX4M1 + (p18) FMA f109 = ALPHA_R, f61, f109 + } + ;; + { .mmf + (p18) STFD [YY3] = f94, 1 * SIZE + (p18) STFD [YY4] = f97, 1 * SIZE + (p18) FMA f118 = ALPHA_R, f70, f118 + } + { .mmf + (p16) LDFD f50 = [X3], INCX4M1 + (p16) LDFD f53 = [X4], INCX4M1 + (p18) FMA f121 = ALPHA_R, f73, f121 + } + ;; + { .mmf + (p18) STFD [YY1] = f88 + (p18) STFD [YY2] = f91 + (p18) FMA2 f112 = ALPHA_R, f64, f112 + } + { .mmf + (p16) lfetch.nt1 [PREX1], INCX8 + nop __LINE__ + (p18) FMA2 f115 = ALPHA_R, f67, f115 + } + ;; + { .mmf + (p18) STFD [YY3] = f100 + (p18) STFD [YY4] = f103 + (p18) FMA2 f124 = ALPHA_R, f76, f124 + } + { .mmf + (p16) LDFD f104 = [Y1], 1 * SIZE + (p16) LDFD f107 = [Y2], 1 * SIZE + (p18) FMA2 f127 = ALPHA_R, f79, f127 + } + ;; + { .mmf + (p16) LDFD f116 = [Y3], 1 * SIZE + (p16) LDFD f119 = [Y4], 1 * SIZE + (p18) FMA1 f106 = ALPHA_I, f64, f106 + } + { .mmf + (p18) add YY1 = YY1, INCY4M1 + (p18) add YY2 = YY2, INCY4M1 + (p18) FMA1 f109 = ALPHA_I, f67, f109 + } + ;; + { .mmf + (p16) LDFD f110 = [Y1], INCY4M1 + (p16) LDFD f113 = [Y2], INCY4M1 + (p18) FMA1 f118 = ALPHA_I, f76, f118 + } + { .mmf + (p18) add YY3 = YY3, INCY4M1 + (p18) add YY4 = YY4, INCY4M1 + (p18) FMA1 f121 = ALPHA_I, f79, f121 + } + ;; + { .mmf + (p16) LDFD f122 = [Y3], INCY4M1 + (p16) LDFD f125 = [Y4], INCY4M1 + (p18) FMA f112 = ALPHA_I, f58, f112 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p18) FMA f115 = ALPHA_I, f61, f115 + } + ;; + { .mmf + (p16) LDFD f56 = [X1], 1 * SIZE + (p16) LDFD f59 = [X2], 1 * SIZE + (p18) FMA f124 = ALPHA_I, f70, f124 + } + { .mmf + (p16) LDFD f68 = [X3], 1 * SIZE + (p16) LDFD f71 = [X4], 1 * SIZE + (p18) FMA f127 = ALPHA_I, f73, f127 + } + ;; + { .mmf + (p18) STFD [YY1] = f106, 1 * SIZE + (p18) STFD [YY2] = f109, 1 * SIZE + (p17) FMA f81 = ALPHA_R, f33, f81 + } + { .mmf + (p16) LDFD f62 = [X1], INCX4M1 + (p16) LDFD f65 = [X2], INCX4M1 + (p17) FMA f84 = ALPHA_R, f36, f84 + } + ;; + { .mmf + (p18) STFD [YY3] = f118, 1 * SIZE + (p18) STFD [YY4] = f121, 1 * SIZE + (p17) FMA f93 = ALPHA_R, f45, f93 + } + { .mmf + (p16) LDFD f74 = [X3], INCX4M1 + (p16) LDFD f77 = [X4], INCX4M1 + (p17) FMA f96 = ALPHA_R, f48, f96 + } + ;; + { .mmf + (p18) STFD [YY1] = f112 + (p18) STFD [YY2] = f115 + (p17) FMA2 f87 = ALPHA_R, f39, f87 + } + { .mfb + nop __LINE__ + (p17) FMA2 f90 = ALPHA_R, f42, f90 + br.ctop.sptk.few .L22 + } + ;; + { .mmi + (p19) STFD [YY3] = f125 + (p19) STFD [YY4] = f32 + (p19) add YY1 = YY1, INCY4M1 + } + { .mmi + (p19) add YY2 = YY2, INCY4M1 + (p19) add YY3 = YY3, INCY4M1 + (p19) add YY4 = YY4, INCY4M1 + } + ;; +#endif + .align 32 + +.L25: + { .mmi + (p13) LDFD f32 = [X1], 1 * SIZE + (p13) LDFD f34 = [X2], 1 * SIZE + mov ar.lc = 
ARLC + } + { .mmi + (p13) LDFD f36 = [X3], 1 * SIZE + (p13) LDFD f38 = [X4], 1 * SIZE + cmp.eq p12, p0 = r0, J + } + ;; + { .mmi + (p13) LDFD f80 = [Y1], 1 * SIZE + (p13) LDFD f82 = [Y2], 1 * SIZE + mov pr = PR, -65474 + } + { .mmb + (p13) LDFD f84 = [Y3], 1 * SIZE + (p13) LDFD f86 = [Y4], 1 * SIZE + (p12) br.ret.sptk.many b0 + } + ;; + { .mmi + (p13) LDFD f33 = [X1], INCX4M1 + (p13) LDFD f35 = [X2], INCX4M1 + tbit.z p0, p14 = N, 1 + } + { .mmi + (p13) LDFD f81 = [Y1], INCY4M1 + (p13) LDFD f83 = [Y2], INCY4M1 + nop __LINE__ + } + ;; + { .mmi + (p13) LDFD f37 = [X3], INCX4M1 + (p13) LDFD f39 = [X4], INCX4M1 + tbit.z p0, p15 = N, 0 + } + { .mmi + (p13) LDFD f85 = [Y3], INCY4M1 + (p13) LDFD f87 = [Y4], INCY4M1 + nop __LINE__ + } + ;; + { .mmf + (p14) LDFD f40 = [X1], 1 * SIZE + (p14) LDFD f42 = [X2], 1 * SIZE + } + ;; + { .mmf + (p14) LDFD f88 = [Y1], 1 * SIZE + (p14) LDFD f90 = [Y2], 1 * SIZE + } + ;; + { .mmf + (p14) LDFD f41 = [X1], INCX2M1 + (p14) LDFD f43 = [X2], INCX2M1 + (p13) FMA f80 = ALPHA_R, f32, f80 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMA f82 = ALPHA_R, f34, f82 + } + ;; + { .mmf + (p14) LDFD f89 = [Y1], INCY2M1 + (p14) LDFD f91 = [Y2], INCY2M1 + (p13) FMA f84 = ALPHA_R, f36, f84 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMA f86 = ALPHA_R, f38, f86 + } + ;; + { .mmf + (p15) LDFD f44 = [X1], 1 * SIZE + (p15) LDFD f92 = [Y1], 1 * SIZE + (p13) FMA2 f81 = ALPHA_R, f33, f81 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMA2 f83 = ALPHA_R, f35, f83 + } + ;; + { .mmf + (p15) LDFD f45 = [X1] + (p15) LDFD f93 = [Y1] + (p13) FMA2 f85 = ALPHA_R, f37, f85 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMA2 f87 = ALPHA_R, f39, f87 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMA1 f80 = ALPHA_I, f33, f80 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMA1 f82 = ALPHA_I, f35, f82 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMA1 f84 = ALPHA_I, f37, f84 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMA1 f86 = ALPHA_I, f39, f86 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMA f81 = ALPHA_I, f32, f81 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMA f83 = ALPHA_I, f34, f83 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMA f85 = ALPHA_I, f36, f85 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMA f87 = ALPHA_I, f38, f87 + } + ;; + { .mmf + (p13) STFD [YY1] = f80, 1 * SIZE + (p13) STFD [YY2] = f82, 1 * SIZE + (p14) FMA f88 = ALPHA_R, f40, f88 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p14) FMA f90 = ALPHA_R, f42, f90 + } + ;; + { .mmf + (p13) STFD [YY3] = f84, 1 * SIZE + (p13) STFD [YY4] = f86, 1 * SIZE + (p14) FMA2 f89 = ALPHA_R, f41, f89 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p14) FMA2 f91 = ALPHA_R, f43, f91 + } + ;; + { .mmf + (p13) STFD [YY1] = f81 + (p13) STFD [YY2] = f83 + (p15) FMA f92 = ALPHA_R, f44, f92 + } + { .mmf + (p13) add YY1 = YY1, INCY4M1 + (p13) add YY2 = YY2, INCY4M1 + (p15) FMA2 f93 = ALPHA_R, f45, f93 + } + ;; + { .mmf + (p13) STFD [YY3] = f85 + (p13) STFD [YY4] = f87 + (p14) FMA1 f88 = ALPHA_I, f41, f88 + } + { .mmf + (p13) add YY3 = YY3, INCY4M1 + (p13) add YY4 = YY4, INCY4M1 + (p14) FMA1 f90 = ALPHA_I, f43, f90 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + (p14) FMA f89 = ALPHA_I, f40, f89 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p14) FMA f91 = ALPHA_I, f42, f91 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p15) FMA1 f92 = ALPHA_I, f45, f92 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p15) FMA f93 = ALPHA_I, f44, f93 + } + ;; + { .mmi + (p14) STFD [YY1] = f88, 1 * SIZE + (p14) 
STFD [YY2] = f90, 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p14) STFD [YY1] = f89 + (p14) STFD [YY2] = f91 + (p14) add YY1 = YY1, INCY2M1 + } + ;; + { .mmi + (p15) STFD [YY1] = f92, 1 * SIZE + nop __LINE__ + nop __LINE__ + } + ;; + { .mmb + (p15) STFD [YY1] = f93 + nop __LINE__ + br.ret.sptk.many b0 + } + ;; + EPILOGUE diff --git a/kernel/ia64/zcopy.S b/kernel/ia64/zcopy.S new file mode 100644 index 0000000000..91d90e0a84 --- /dev/null +++ b/kernel/ia64/zcopy.S @@ -0,0 +1,1378 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r32 +#define X1 r33 +#define INCX r34 +#define Y1 r35 +#define INCY r36 + +#define PREA r2 +#define PREB r3 + +#define I r14 +#define J r15 + +#define X2 r16 +#define Y2 r17 +#define INCXM1 r20 +#define INCYM1 r21 +#define INCX3M1 r22 +#define INCY3M1 r23 +#define INCX8 r24 +#define INCY8 r25 +#define XX r26 +#define YY r27 +#define XA r28 +#define YA r29 +#define PR r30 +#define ARLC r31 + +#ifdef DOUBLE +#define PREFETCH_SIZE (6 * 32) +#else +#define PREFETCH_SIZE (8 * 64) +#endif + + PROLOGUE + .prologue + PROFCODE + + { .mmi + shladd INCX = INCX, ZBASE_SHIFT, r0 + shladd INCY = INCY, ZBASE_SHIFT, r0 + .save ar.lc, ARLC + mov ARLC = ar.lc + } + { .mib + cmp.lt p0, p6 = r0, N + sub XA = Y1, X1 + (p6) br.ret.sptk.many b0 + } + ;; + .body + { .mmi + shladd INCX3M1 = INCX, 1, INCX + shladd INCY3M1 = INCY, 1, INCY + mov PR = pr + } + { .mmi + adds INCXM1 = - SIZE, INCX + adds INCYM1 = - SIZE, INCY + shr.u XA = XA, BASE_SHIFT + } + ;; + { .mmi +#ifdef DOUBLE + adds XA = 4, XA +#else + adds XA = -2, XA +#endif + and J = 7, N + mov pr.rot = 0 + } + { .mmi + adds INCX3M1 = - SIZE, INCX3M1 + adds INCY3M1 = - SIZE, INCY3M1 + shr I = N, 3 + } + ;; + { .mmi +#ifdef DOUBLE + and XA = 31, XA +#else + and XA = 63, XA +#endif + cmp.eq p9, p0 = r0, J + tbit.z p0, p7 = X1, BASE_SHIFT + } + { .mmi + shladd X2 = INCX, 1, X1 + shladd Y2 = INCY, 1, Y1 + tbit.z p0, p12 = N, 2 + } + ;; + { .mmi + cmp.eq p8 ,p0 = r0, I + adds I = -1, I +#ifdef DOUBLE + cmp.le p11, p0 = 15, XA +#else + cmp.ge p11, p0 = 31, XA +#endif + } + { .mmb + shladd INCX8 = INCX, 3, r0 + shladd INCY8 = INCY, 3, r0 + (p8) br.cond.dpnt .L25 + } + ;; + { .mmi + nop.m 0 + nop.m 0 + mov ar.lc = I + } + { .mbb + (p7) br.cond.dpnt .L100 + (p11) br.cond.dpnt .L30 + } + ;; + { .mmi + cmp.eq p16, p0 = r0, r0 + nop.m 0 + mov ar.ec = 5 + } + { .mmi + adds PREA = PREFETCH_SIZE * SIZE + 32, X1 +#ifndef DOUBLE + adds PREB = PREFETCH_SIZE * SIZE + 0, Y1 +#else + adds PREB = PREFETCH_SIZE * SIZE - 40, Y1 +#endif + nop.i 0 + } + ;; + .align 32 + +.L21: + { .mmi + (p21) STFD [Y1] = f42 + (p21) STFD [Y2] = f62 + (p21) add Y1 = INCYM1, Y1 + } + { .mmi + (p16) LDFPD f32, f37 = [X1] + (p16) add X1 = X1, INCX + (p21) add Y2 = INCYM1, Y2 + } + ;; + { .mmi + (p21) STFD [Y1] = f47, 1 * SIZE + (p21) STFD [Y2] = f67, 1 * SIZE + } + { .mmi + (p16) lfetch.nt1 [PREA], INCX8 + (p16) LDFPD f42, f47 = [X1] + (p16) add X1 = X1, INCX + } + ;; + { .mmi + (p21) STFD [Y1] = f52 + (p21) STFD [Y2] = f72 + (p21) add Y1 = INCY3M1, Y1 + } + { .mmi + (p16) LDFPD f52, f57 = [X1] + (p16) add X1 = X1, INCX + (p21) add Y2 = INCY3M1, Y2 + } + ;; + { .mmi + (p21) STFD [Y1] = f77, 1 * SIZE + (p21) STFD [Y2] = f97, 1 * SIZE + } + { .mmi + (p16) lfetch.excl.nt1 [PREB], INCY8 + (p16) LDFPD f62, f67 = [X1] + (p16) add X1 = X1, INCX + } + ;; + { .mmi + (p21) STFD [Y1] = f82 + (p21) STFD [Y2] = f102 + (p21) add Y1 = INCYM1, Y1 + } + { .mmi + (p16) LDFPD f72, f77 = [X1] + (p16) add X1 = X1, INCX + (p21) add Y2 = INCYM1, Y2 + } + ;; + { .mmi + (p21) STFD [Y1] = f87, 1 * SIZE + (p21) STFD [Y2] = f107, 1 * SIZE + } + { .mmi + (p16) LDFPD f82, f87 = [X1] + (p16) add X1 = X1, INCX + } + ;; + { .mmi + (p21) STFD [Y1] = f92 + (p21) STFD [Y2] = f112 + (p21) add Y1 = INCY3M1, Y1 + } + { .mmi + (p16) LDFPD f92, f97 = [X1] + (p16) add X1 = X1, INCX + (p21) add Y2 = INCY3M1, Y2 + } + ;; + { .mmi + (p20) STFD [Y1] = f36, 1 * SIZE + (p20) STFD [Y2] = f56, 1 * SIZE + (p16) shladd X2 = 
INCX, 3, X2 + } + { .mmb + (p16) LDFPD f102, f107 = [X1] + (p16) add X1 = X1, INCX + br.ctop.sptk.few .L21 + } + ;; + + { .mmi + (p21) STFD [Y1] = f42 + (p21) STFD [Y2] = f62 + (p21) add Y1 = INCYM1, Y1 + } + { .mmi + (p21) add Y2 = INCYM1, Y2 + } + ;; + { .mmi + (p21) STFD [Y1] = f47, 1 * SIZE + (p21) STFD [Y2] = f67, 1 * SIZE + } + ;; + { .mmi + (p21) STFD [Y1] = f52 + (p21) STFD [Y2] = f72 + (p21) add Y1 = INCY3M1, Y1 + } + { .mmi + (p21) add Y2 = INCY3M1, Y2 + } + ;; + { .mmi + (p21) STFD [Y1] = f77, 1 * SIZE + (p21) STFD [Y2] = f97, 1 * SIZE + } + ;; + { .mmi + (p21) STFD [Y1] = f82 + (p21) STFD [Y2] = f102 + (p21) add Y1 = INCYM1, Y1 + } + { .mmi + (p21) add Y2 = INCYM1, Y2 + } + ;; + { .mmi + (p21) STFD [Y1] = f87, 1 * SIZE + (p21) STFD [Y2] = f107, 1 * SIZE + } + ;; + { .mmi + (p21) STFD [Y1] = f92 + (p21) STFD [Y2] = f112 + (p21) add Y1 = INCY3M1, Y1 + } + { .mmi + (p21) add Y2 = INCY3M1, Y2 + } + ;; + .align 32 + +.L25: + { .mmi + mov XX = X1 + nop.m 0 + mov ar.lc = ARLC + } + { .mmi + (p12) LDFD f48 = [X1], 1 * SIZE + (p12) LDFD f52 = [X2], 1 * SIZE + tbit.z p0, p13 = N, 1 + } + ;; + { .mmi + (p12) LDFD f49 = [X1], INCXM1 + (p12) LDFD f53 = [X2], INCXM1 + mov pr = PR, -65474 + } + { .mib + nop.m 0 + tbit.z p0, p14 = N, 0 + (p9) br.ret.sptk.many b0 + } + ;; + { .mmi + (p12) LDFD f50 = [X1], 1 * SIZE + (p12) LDFD f54 = [X2], 1 * SIZE + (p12) shladd XX = INCX, 2, XX;; + } + ;; + { .mmi + (p12) LDFD f51 = [X1], INCX3M1 + (p12) LDFD f55 = [X2], INCX3M1 + (p13) shladd XX = INCX, 1, XX;; + } + ;; + { .mmi + (p13) LDFD f56 = [X1], 1 * SIZE + (p14) LDFD f60 = [XX], 1 * SIZE + } + ;; + { .mmi + (p13) LDFD f57 = [X1], INCXM1 + (p14) LDFD f61 = [XX] + mov YY = Y1 + } + ;; + { .mmi + (p12) STFD [Y1] = f48, 1 * SIZE + (p12) STFD [Y2] = f52, 1 * SIZE + } + { .mmi + (p13) LDFD f58 = [X1], 1 * SIZE + } + ;; + { .mmi + (p12) STFD [Y1] = f49 + (p12) STFD [Y2] = f53 + (p12) add Y1 = INCYM1, Y1 + } + { .mmi + (p13) LDFD f59 = [X1] + (p12) add Y2 = INCYM1, Y2 + } + ;; + { .mmi + (p12) STFD [Y1] = f50, 1 * SIZE + (p12) STFD [Y2] = f54, 1 * SIZE + (p12) shladd YY = INCY, 2, YY;; + } + ;; + { .mmi + (p12) STFD [Y1] = f51 + (p12) STFD [Y2] = f55 + (p13) shladd YY = INCY, 1, YY + } + { .mmi + (p12) add Y1 = INCY3M1, Y1 + (p12) add Y2 = INCY3M1, Y2 + nop.i 0 + } + ;; + { .mmi + (p13) STFD [Y1] = f56, 1 * SIZE + (p14) STFD [YY] = f60, 1 * SIZE + } + ;; + { .mmi + (p13) STFD [Y1] = f57 + (p14) STFD [YY] = f61 + (p13) add Y1 = INCYM1, Y1 + } + ;; + { .mmi + (p13) STFD [Y1] = f58, 1 * SIZE + nop.m 0 + nop.i 0 + } + ;; + { .mib + (p13) STFD [Y1] = f59 + nop.i 0 + br.ret.sptk.many b0 + } + ;; + .align 32 + +.L30: + { .mmi + cmp.eq p16, p0 = r0, r0 + nop.m 0 + mov ar.ec = 5 + } + { .mmi +#ifndef DOUBLE + adds PREA = PREFETCH_SIZE * SIZE + 24, X1 + adds PREB = PREFETCH_SIZE * SIZE + 40, Y1 +#else + adds PREA = PREFETCH_SIZE * SIZE - 56, X1 + adds PREB = PREFETCH_SIZE * SIZE - 24, Y1 +#endif + nop.i 0 + } + ;; + .align 32 + +#ifndef DOUBLE +.L31: + { .mmi + (p20) STFD [Y1] = f91 + (p20) STFD [Y2] = f111 + (p20) add Y1 = INCY3M1, Y1 + } + { .mmi + (p16) LDFPD f32, f37 = [X1] + (p16) add X1 = X1, INCX + (p20) add Y2 = INCY3M1, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f35, 1 * SIZE + (p19) STFD [Y2] = f55, 1 * SIZE + } + { .mmi + (p16) lfetch.nt1 [PREA], INCX8 + (p16) LDFPD f42, f47 = [X1] + (p16) add X1 = X1, INCX + } + ;; + { .mmi + (p19) STFD [Y1] = f40 + (p19) STFD [Y2] = f60 + (p19) add Y1 = INCYM1, Y1 + } + { .mmi + (p16) LDFPD f52, f57 = [X1] + (p16) add X1 = X1, INCX + (p19) add Y2 = INCYM1, Y2 + } + ;; + { 
.mmi + (p19) STFD [Y1] = f45, 1 * SIZE + (p19) STFD [Y2] = f65, 1 * SIZE + } + { .mmi + (p16) lfetch.excl.nt1 [PREB], INCY8 + (p16) LDFPD f62, f67 = [X1] + (p16) add X1 = X1, INCX + } + ;; + { .mmi + (p19) STFD [Y1] = f50 + (p19) STFD [Y2] = f70 + (p19) add Y1 = INCY3M1, Y1 + } + { .mmi + (p16) LDFPD f72, f77 = [X1] + (p16) add X1 = X1, INCX + (p19) add Y2 = INCY3M1, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f75, 1 * SIZE + (p19) STFD [Y2] = f95, 1 * SIZE + } + { .mmi + (p16) LDFPD f82, f87 = [X1] + (p16) add X1 = X1, INCX + } + ;; + { .mmi + (p19) STFD [Y1] = f80 + (p19) STFD [Y2] = f100 + (p19) add Y1 = INCYM1, Y1 + } + { .mmi + (p16) LDFPD f92, f97 = [X1] + (p16) add X1 = X1, INCX + (p19) add Y2 = INCYM1, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f85, 1 * SIZE + (p19) STFD [Y2] = f105, 1 * SIZE + (p16) shladd X2 = INCX, 3, X2 + } + { .mmb + (p16) LDFPD f102, f107 = [X1] + (p16) add X1 = X1, INCX + br.ctop.sptk.few .L31 + } + ;; + br .L25 + .align 32 + +#else +.L31: + { .mmi + (p20) STFD [Y1] = f41 + (p20) STFD [Y2] = f61 + (p20) add Y1 = INCYM1, Y1 + } + { .mmi + (p16) LDFPD f32, f37 = [X1] + (p16) add X1 = X1, INCX + (p20) add Y2 = INCYM1, Y2 + } + ;; + { .mmi + (p20) STFD [Y1] = f46, 1 * SIZE + (p20) STFD [Y2] = f66, 1 * SIZE + } + { .mmi + (p16) lfetch.nt1 [PREA], INCX8 + (p16) LDFPD f42, f47 = [X1] + (p16) add X1 = X1, INCX + } + ;; + { .mmi + (p20) STFD [Y1] = f51 + (p20) STFD [Y2] = f71 + (p20) add Y1 = INCY3M1, Y1 + } + { .mmi + (p16) LDFPD f52, f57 = [X1] + (p16) add X1 = X1, INCX + (p20) add Y2 = INCY3M1, Y2 + } + ;; + { .mmi + (p20) STFD [Y1] = f76, 1 * SIZE + (p20) STFD [Y2] = f96, 1 * SIZE + } + { .mmi + (p16) lfetch.excl.nt1 [PREB], INCY8 + (p16) LDFPD f62, f67 = [X1] + (p16) add X1 = X1, INCX + } + ;; + { .mmi + (p20) STFD [Y1] = f81 + (p20) STFD [Y2] = f101 + (p20) add Y1 = INCYM1, Y1 + } + { .mmi + (p16) LDFPD f72, f77 = [X1] + (p16) add X1 = X1, INCX + (p20) add Y2 = INCYM1, Y2 + } + ;; + { .mmi + (p20) STFD [Y1] = f86, 1 * SIZE + (p20) STFD [Y2] = f106, 1 * SIZE + } + { .mmi + (p16) LDFPD f82, f87 = [X1] + (p16) add X1 = X1, INCX + } + ;; + { .mmi + (p20) STFD [Y1] = f91 + (p20) STFD [Y2] = f111 + (p20) add Y1 = INCY3M1, Y1 + } + { .mmi + (p16) LDFPD f92, f97 = [X1] + (p16) add X1 = X1, INCX + (p20) add Y2 = INCY3M1, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f35, 1 * SIZE + (p19) STFD [Y2] = f55, 1 * SIZE + (p16) shladd X2 = INCX, 3, X2 + } + { .mmb + (p16) LDFPD f102, f107 = [X1] + (p16) add X1 = X1, INCX + br.ctop.sptk.few .L31 + } + ;; + br .L25 + .align 32 +#endif + +.L100: + { .mmi + mov ar.lc = I + } + { .mbb + cmp.ne p6, p0 = 2 * SIZE, INCX + (p6) br.cond.dpnt .L200 + (p11) br.cond.dpnt .L130 + } + ;; + { .mmi + adds PREA = PREFETCH_SIZE * SIZE + 32, X1 +#ifndef DOUBLE + adds PREB = PREFETCH_SIZE * SIZE - 32, Y1 +#else + adds PREB = PREFETCH_SIZE * SIZE + 72, Y1 +#endif + mov ar.ec = 5 + } + { .mmi + LDFD f32 = [X1], 1 * SIZE + cmp.eq p16, p0 = r0, r0 + nop.i 0 + } + ;; + .align 32 + +.L121: + { .mmi + (p21) STFD [Y1] = f47, 1 * SIZE + (p21) STFD [Y2] = f67, 1 * SIZE + } + { .mmi + (p16) lfetch.nt1 [PREA], INCX8 + (p16) LDFPD f37, f42 = [X1], 2 * SIZE + } + ;; + { .mmi + (p21) STFD [Y1] = f52 + (p21) STFD [Y2] = f72 + (p21) add Y1 = INCY3M1, Y1 + } + { .mmi + (p16) lfetch.excl.nt1 [PREB], INCY8 + (p16) LDFPD f47, f52 = [X1], 2 * SIZE + (p21) add Y2 = INCY3M1, Y2 + } + ;; + { .mmi + (p21) STFD [Y1] = f77, 1 * SIZE + (p21) STFD [Y2] = f97, 1 * SIZE + } + { .mmi + (p16) LDFPD f57, f62 = [X1], 2 * SIZE + } + ;; + { .mmi + (p21) STFD [Y1] = f82 + (p21) STFD [Y2] = f102 
+ (p21) add Y1 = INCYM1, Y1 + } + { .mmi + (p16) LDFPD f67, f72 = [X1], 2 * SIZE + (p21) add Y2 = INCYM1, Y2 + } + ;; + { .mmi + (p21) STFD [Y1] = f87, 1 * SIZE + (p21) STFD [Y2] = f107, 1 * SIZE + } + { .mmi + (p16) LDFPD f77, f82 = [X1], 2 * SIZE + } + ;; + { .mmi + (p21) STFD [Y1] = f92 + (p21) STFD [Y2] = f113 + (p21) add Y1 = INCY3M1, Y1 + } + { .mmi + (p16) LDFPD f87, f92 = [X1], 2 * SIZE + (p21) add Y2 = INCY3M1, Y2 + } + ;; + { .mmi + (p20) STFD [Y1] = f36, 1 * SIZE + (p20) STFD [Y2] = f56, 1 * SIZE + } + { .mmi + (p16) LDFPD f97, f102 = [X1], 2 * SIZE + (p16) shladd X2 = INCX, 3, X2 + } + ;; + { .mmi + (p20) STFD [Y1] = f41 + (p20) STFD [Y2] = f61 + (p20) add Y1 = INCYM1, Y1 + } + { .mmb + (p16) LDFPD f108, f127 = [X1], 2 * SIZE + (p20) add Y2 = INCYM1, Y2 + br.ctop.sptk.few .L121 + } + ;; + { .mmi + (p21) STFD [Y1] = f47, 1 * SIZE + (p21) STFD [Y2] = f67, 1 * SIZE + } + ;; + { .mmi + (p21) STFD [Y1] = f52 + (p21) STFD [Y2] = f72 + (p21) add Y1 = INCY3M1, Y1 + } + (p21) add Y2 = INCY3M1, Y2 + ;; + { .mmi + (p21) STFD [Y1] = f77, 1 * SIZE + (p21) STFD [Y2] = f97, 1 * SIZE + } + ;; + { .mmi + (p21) STFD [Y1] = f82 + (p21) STFD [Y2] = f102 + (p21) add Y1 = INCYM1, Y1 + } + (p21) add Y2 = INCYM1, Y2 + ;; + { .mmi + (p21) STFD [Y1] = f87, 1 * SIZE + (p21) STFD [Y2] = f107, 1 * SIZE + } + ;; + { .mmi + (p21) STFD [Y1] = f92 + (p21) STFD [Y2] = f113 + (p21) add Y1 = INCY3M1, Y1 + } + (p21) add Y2 = INCY3M1, Y2 + + adds X1 = -SIZE, X1 + ;; + .align 32 + +.L125: + { .mmi + mov XX = X1 + nop.m 0 + mov ar.lc = ARLC + } + { .mmi + (p12) LDFD f48 = [X1], 1 * SIZE + (p12) LDFD f52 = [X2], 1 * SIZE + tbit.z p0, p13 = N, 1 + } + ;; + { .mmi + (p12) LDFD f49 = [X1], INCXM1 + (p12) LDFD f53 = [X2], INCXM1 + mov pr = PR, -65474 + } + { .mib + nop.m 0 + tbit.z p0, p14 = N, 0 + (p9) br.ret.sptk.many b0 + } + ;; + { .mmi + (p12) LDFD f50 = [X1], 1 * SIZE + (p12) LDFD f54 = [X2], 1 * SIZE + (p12) shladd XX = INCX, 2, XX;; + } + ;; + { .mmi + (p12) LDFD f51 = [X1], INCX3M1 + (p12) LDFD f55 = [X2], INCX3M1 + (p13) shladd XX = INCX, 1, XX;; + } + ;; + { .mmi + (p13) LDFD f56 = [X1], 1 * SIZE + (p14) LDFD f60 = [XX], 1 * SIZE + } + ;; + { .mmi + (p13) LDFD f57 = [X1], INCXM1 + (p14) LDFD f61 = [XX] + mov YY = Y1 + } + ;; + { .mmi + (p12) STFD [Y1] = f48, 1 * SIZE + (p12) STFD [Y2] = f52, 1 * SIZE + } + { .mmi + (p13) LDFD f58 = [X1], 1 * SIZE + } + ;; + { .mmi + (p12) STFD [Y1] = f49 + (p12) STFD [Y2] = f53 + (p12) add Y1 = INCYM1, Y1 + } + { .mmi + (p13) LDFD f59 = [X1] + (p12) add Y2 = INCYM1, Y2 + } + ;; + { .mmi + (p12) STFD [Y1] = f50, 1 * SIZE + (p12) STFD [Y2] = f54, 1 * SIZE + (p12) shladd YY = INCY, 2, YY;; + } + ;; + { .mmi + (p12) STFD [Y1] = f51 + (p12) STFD [Y2] = f55 + (p13) shladd YY = INCY, 1, YY + } + { .mmi + (p12) add Y1 = INCY3M1, Y1 + (p12) add Y2 = INCY3M1, Y2 + nop.i 0 + } + ;; + { .mmi + (p13) STFD [Y1] = f56, 1 * SIZE + (p14) STFD [YY] = f60, 1 * SIZE + } + ;; + { .mmi + (p13) STFD [Y1] = f57 + (p14) STFD [YY] = f61 + (p13) add Y1 = INCYM1, Y1 + } + ;; + { .mmi + (p13) STFD [Y1] = f58, 1 * SIZE + nop.m 0 + nop.i 0 + } + ;; + { .mib + (p13) STFD [Y1] = f59 + nop.i 0 + br.ret.sptk.many b0 + } + ;; + .align 32 + +.L130: + { .mmi + adds PREA = PREFETCH_SIZE * SIZE + 32, X1 +#ifndef DOUBLE + adds PREB = PREFETCH_SIZE * SIZE + 72, Y1 +#else + adds PREB = PREFETCH_SIZE * SIZE + 56, Y1 +#endif + mov ar.ec = 5 + } + { .mmi + LDFD f32 = [X1], 1 * SIZE + cmp.eq p16, p0 = r0, r0 + nop.i 0 + } + ;; +#ifndef DOUBLE +.L131: + { .mmi + (p19) STFD [Y1] = f35, 1 * SIZE + (p19) STFD [Y2] = f55, 1 * 
SIZE + nop.i 0 + } + { .mmi + (p16) lfetch.nt1 [PREA], INCX8 + (p16) LDFPD f37, f42 = [X1], 2 * SIZE + nop.i 0 + } + ;; + { .mmi + (p19) STFD [Y1] = f40 + (p19) STFD [Y2] = f60 + (p19) add Y1 = INCYM1, Y1 + } + { .mmi + (p16) lfetch.excl.nt1 [PREB], INCY8 + (p16) LDFPD f47, f52 = [X1], 2 * SIZE + (p19) add Y2 = INCYM1, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f45, 1 * SIZE + (p19) STFD [Y2] = f65, 1 * SIZE + nop.i 0 + } + { .mmi + (p16) LDFPD f57, f62 = [X1], 2 * SIZE + nop.m 0 + nop.i 0 + } + ;; + { .mmi + (p19) STFD [Y1] = f50 + (p19) STFD [Y2] = f70 + (p19) add Y1 = INCY3M1, Y1 + } + { .mmi + (p16) LDFPD f67, f72 = [X1], 2 * SIZE + (p19) add Y2 = INCY3M1, Y2 + nop.i 0 + } + ;; + { .mmi + (p19) STFD [Y1] = f75, 1 * SIZE + (p19) STFD [Y2] = f95, 1 * SIZE + nop.i 0 + } + { .mmi + (p16) LDFPD f77, f82 = [X1], 2 * SIZE + nop.m 0 + nop.i 0 + } + ;; + { .mmi + (p19) STFD [Y1] = f80 + (p19) STFD [Y2] = f100 + (p19) add Y1 = INCYM1, Y1 + } + { .mmi + (p16) LDFPD f87, f92 = [X1], 2 * SIZE + (p19) add Y2 = INCYM1, Y2 + nop.i 0 + } + ;; + { .mmi + (p19) STFD [Y1] = f85, 1 * SIZE + (p19) STFD [Y2] = f105, 1 * SIZE + nop.i 0 + } + { .mmi + (p16) LDFPD f97, f102 = [X1], 2 * SIZE + (p16) shladd X2 = INCX, 3, X2 + nop.i 0 + } + ;; + { .mmi + (p19) STFD [Y1] = f90 + (p19) STFD [Y2] = f111 + (p19) add Y1 = INCY3M1, Y1 + } + { .mmb + (p16) LDFPD f108, f127 = [X1], 2 * SIZE + (p19) add Y2 = INCY3M1, Y2 + br.ctop.sptk.few .L131 + } + ;; + { .mmi + adds X1 = -SIZE, X1 + nop.m 0 + nop.i 0 + } + ;; + .align 32 +#else +.L131: + { .mmi + (p20) STFD [Y1] = f46, 1 * SIZE + (p20) STFD [Y2] = f66, 1 * SIZE + } + { .mmi + (p16) lfetch.nt1 [PREA], INCX8 + (p16) LDFPD f37, f42 = [X1], 2 * SIZE + } + ;; + { .mmi + (p20) STFD [Y1] = f51 + (p20) STFD [Y2] = f71 + (p20) add Y1 = INCY3M1, Y1 + } + { .mmi + (p16) lfetch.excl.nt1 [PREB], INCY8 + (p16) LDFPD f47, f52 = [X1], 2 * SIZE + (p20) add Y2 = INCY3M1, Y2 + } + ;; + { .mmi + (p20) STFD [Y1] = f76, 1 * SIZE + (p20) STFD [Y2] = f96, 1 * SIZE + } + { .mmi + (p16) LDFPD f57, f62 = [X1], 2 * SIZE + } + ;; + { .mmi + (p20) STFD [Y1] = f81 + (p20) STFD [Y2] = f101 + (p20) add Y1 = INCYM1, Y1 + } + { .mmi + (p16) LDFPD f67, f72 = [X1], 2 * SIZE + (p20) add Y2 = INCYM1, Y2 + } + ;; + { .mmi + (p20) STFD [Y1] = f86, 1 * SIZE + (p20) STFD [Y2] = f106, 1 * SIZE + } + { .mmi + (p16) LDFPD f77, f82 = [X1], 2 * SIZE + } + ;; + { .mmi + (p20) STFD [Y1] = f91 + (p20) STFD [Y2] = f112 + (p20) add Y1 = INCY3M1, Y1 + } + { .mmi + (p16) LDFPD f87, f92 = [X1], 2 * SIZE + (p20) add Y2 = INCY3M1, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f35, 1 * SIZE + (p19) STFD [Y2] = f55, 1 * SIZE + } + { .mmi + (p16) LDFPD f97, f102 = [X1], 2 * SIZE + (p16) shladd X2 = INCX, 3, X2 + } + ;; + { .mmi + (p19) STFD [Y1] = f40 + (p19) STFD [Y2] = f60 + (p19) add Y1 = INCYM1, Y1 + } + { .mmb + (p16) LDFPD f108, f127 = [X1], 2 * SIZE + (p19) add Y2 = INCYM1, Y2 + br.ctop.sptk.few .L131 + } + ;; + { .mmi + adds X1 = -SIZE, X1 + nop.m 0 + nop.i 0 + } + ;; + .align 32 +#endif + +.L135: + { .mmi + mov XX = X1 + nop.m 0 + mov ar.lc = ARLC + } + { .mmi + (p12) LDFD f48 = [X1], 1 * SIZE + (p12) LDFD f52 = [X2], 1 * SIZE + tbit.z p0, p13 = N, 1 + } + ;; + { .mmi + (p12) LDFD f49 = [X1], INCXM1 + (p12) LDFD f53 = [X2], INCXM1 + mov pr = PR, -65474 + } + { .mib + nop.m 0 + tbit.z p0, p14 = N, 0 + (p9) br.ret.sptk.many b0 + } + ;; + { .mmi + (p12) LDFD f50 = [X1], 1 * SIZE + (p12) LDFD f54 = [X2], 1 * SIZE + (p12) shladd XX = INCX, 2, XX;; + } + ;; + { .mmi + (p12) LDFD f51 = [X1], INCX3M1 + (p12) LDFD f55 = [X2], INCX3M1 + 
(p13) shladd XX = INCX, 1, XX;; + } + ;; + { .mmi + (p13) LDFD f56 = [X1], 1 * SIZE + (p14) LDFD f60 = [XX], 1 * SIZE + } + ;; + { .mmi + (p13) LDFD f57 = [X1], INCXM1 + (p14) LDFD f61 = [XX] + mov YY = Y1 + } + ;; + { .mmi + (p12) STFD [Y1] = f48, 1 * SIZE + (p12) STFD [Y2] = f52, 1 * SIZE + } + { .mmi + (p13) LDFD f58 = [X1], 1 * SIZE + } + ;; + { .mmi + (p12) STFD [Y1] = f49 + (p12) STFD [Y2] = f53 + (p12) add Y1 = INCYM1, Y1 + } + { .mmi + (p13) LDFD f59 = [X1] + (p12) add Y2 = INCYM1, Y2 + } + ;; + { .mmi + (p12) STFD [Y1] = f50, 1 * SIZE + (p12) STFD [Y2] = f54, 1 * SIZE + (p12) shladd YY = INCY, 2, YY;; + } + ;; + { .mmi + (p12) STFD [Y1] = f51 + (p12) STFD [Y2] = f55 + (p13) shladd YY = INCY, 1, YY + } + { .mmi + (p12) add Y1 = INCY3M1, Y1 + (p12) add Y2 = INCY3M1, Y2 + nop.i 0 + } + ;; + { .mmi + (p13) STFD [Y1] = f56, 1 * SIZE + (p14) STFD [YY] = f60, 1 * SIZE + } + ;; + { .mmi + (p13) STFD [Y1] = f57 + (p14) STFD [YY] = f61 + (p13) add Y1 = INCYM1, Y1 + } + ;; + { .mmi + (p13) STFD [Y1] = f58, 1 * SIZE + nop.m 0 + nop.i 0 + } + ;; + { .mib + (p13) STFD [Y1] = f59 + nop.i 0 + br.ret.sptk.many b0 + } + +/* Unaligned Copy INCX != 1 */ +.L200: + ;; + { .mmi + adds PREA = PREFETCH_SIZE * SIZE + 32, X1 + adds PREB = PREFETCH_SIZE * SIZE + 32, Y1 + mov ar.ec = 5 + } + { .mmi + cmp.eq p16, p0 = r0, r0 + nop.m 0 + nop.i 0 + } + ;; + .align 32 + +.L221: + { .mmi + (p20) STFD [Y1] = f91 + (p20) STFD [Y2] = f111 + (p20) add Y1 = INCY3M1, Y1 + } + { .mmi + (p16) LDFD f32 = [X1], 1 * SIZE + (p16) LDFD f52 = [X2], 1 * SIZE + (p20) add Y2 = INCY3M1, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f35, 1 * SIZE + (p19) STFD [Y2] = f55, 1 * SIZE + } + { .mmi + (p16) LDFD f37 = [X1], INCXM1 + (p16) LDFD f57 = [X2], INCXM1 + } + ;; + { .mmi + (p19) STFD [Y1] = f40 + (p19) STFD [Y2] = f60 + (p19) add Y1 = INCYM1, Y1 + } + { .mmi + (p16) LDFD f42 = [X1], 1 * SIZE + (p16) LDFD f62 = [X2], 1 * SIZE + (p19) add Y2 = INCYM1, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f45, 1 * SIZE + (p19) STFD [Y2] = f65, 1 * SIZE + } + { .mmi + (p16) LDFD f47 = [X1], INCX3M1 + (p16) LDFD f67 = [X2], INCX3M1 + } + ;; + { .mmi + (p19) STFD [Y1] = f50 + (p19) STFD [Y2] = f70 + (p19) add Y1 = INCY3M1, Y1 + } + { .mmi + (p16) LDFD f72 = [X1], 1 * SIZE + (p16) LDFD f92 = [X2], 1 * SIZE + (p19) add Y2 = INCY3M1, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f75, 1 * SIZE + (p19) STFD [Y2] = f95, 1 * SIZE + } + { .mmi + (p16) LDFD f77 = [X1], INCXM1 + (p16) LDFD f97 = [X2], INCXM1 + } + ;; + { .mmi + (p19) STFD [Y1] = f80 + (p19) STFD [Y2] = f100 + (p19) add Y1 = INCYM1, Y1 + } + { .mmi + (p16) LDFD f82 = [X1], 1 * SIZE + (p16) LDFD f102 = [X2], 1 * SIZE + (p19) add Y2 = INCYM1, Y2 + } + ;; + { .mmi + (p19) STFD [Y1] = f85, 1 * SIZE + (p19) STFD [Y2] = f105, 1 * SIZE + } + { .mmb + (p16) LDFD f87 = [X1], INCX3M1 + (p16) LDFD f107 = [X2], INCX3M1 + br.ctop.sptk.few .L221 + } + ;; + .align 32 + +.L225: + { .mmi + mov XX = X1 + nop.m 0 + mov ar.lc = ARLC + } + { .mmi + (p12) LDFD f48 = [X1], 1 * SIZE + (p12) LDFD f52 = [X2], 1 * SIZE + tbit.z p0, p13 = N, 1 + } + ;; + { .mmi + (p12) LDFD f49 = [X1], INCXM1 + (p12) LDFD f53 = [X2], INCXM1 + mov pr = PR, -65474 + } + { .mib + nop.m 0 + tbit.z p0, p14 = N, 0 + (p9) br.ret.sptk.many b0 + } + ;; + { .mmi + (p12) LDFD f50 = [X1], 1 * SIZE + (p12) LDFD f54 = [X2], 1 * SIZE + (p12) shladd XX = INCX, 2, XX;; + } + ;; + { .mmi + (p12) LDFD f51 = [X1], INCX3M1 + (p12) LDFD f55 = [X2], INCX3M1 + (p13) shladd XX = INCX, 1, XX;; + } + ;; + { .mmi + (p13) LDFD f56 = [X1], 1 * SIZE + (p14) LDFD f60 =
[XX], 1 * SIZE + } + ;; + { .mmi + (p13) LDFD f57 = [X1], INCXM1 + (p14) LDFD f61 = [XX] + mov YY = Y1 + } + ;; + { .mmi + (p12) STFD [Y1] = f48, 1 * SIZE + (p12) STFD [Y2] = f52, 1 * SIZE + } + { .mmi + (p13) LDFD f58 = [X1], 1 * SIZE + } + ;; + { .mmi + (p12) STFD [Y1] = f49 + (p12) STFD [Y2] = f53 + (p12) add Y1 = INCYM1, Y1 + } + { .mmi + (p13) LDFD f59 = [X1] + (p12) add Y2 = INCYM1, Y2 + } + ;; + { .mmi + (p12) STFD [Y1] = f50, 1 * SIZE + (p12) STFD [Y2] = f54, 1 * SIZE + (p12) shladd YY = INCY, 2, YY;; + } + ;; + { .mmi + (p12) STFD [Y1] = f51 + (p12) STFD [Y2] = f55 + (p13) shladd YY = INCY, 1, YY + } + { .mmi + (p12) add Y1 = INCY3M1, Y1 + (p12) add Y2 = INCY3M1, Y2 + nop.i 0 + } + ;; + { .mmi + (p13) STFD [Y1] = f56, 1 * SIZE + (p14) STFD [YY] = f60, 1 * SIZE + } + ;; + { .mmi + (p13) STFD [Y1] = f57 + (p14) STFD [YY] = f61 + (p13) add Y1 = INCYM1, Y1 + } + ;; + { .mmi + (p13) STFD [Y1] = f58, 1 * SIZE + nop.m 0 + nop.i 0 + } + ;; + { .mib + (p13) STFD [Y1] = f59 + nop.i 0 + br.ret.sptk.many b0 + } + + EPILOGUE + diff --git a/kernel/ia64/zdot.S b/kernel/ia64/zdot.S new file mode 100644 index 0000000000..5c77ce6efb --- /dev/null +++ b/kernel/ia64/zdot.S @@ -0,0 +1,487 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef DOUBLE +#define PREFETCH_SIZE (13 * 16) +#else +#define PREFETCH_SIZE ( 9 * 32) +#endif + +#if defined(F_INTERFACE) && defined(RETURN_BY_STACK) +#define N r33 +#define X1 r34 +#define INCX r35 +#define Y1 r36 +#define INCY r37 +#else +#define N r32 +#define X1 r33 +#define INCX r34 +#define Y1 r35 +#define INCY r36 +#endif + +#define PRE1 r2 +#define PRE2 r3 + +#define I r14 +#define J r15 +#define Y2 r16 +#define X2 r17 +#define INCXM1 r18 +#define INCYM1 r19 +#define INCX16 r20 +#define INCY16 r21 +#define INCX3M1 r22 +#define INCY3M1 r23 +#define XX r24 +#define YY r25 + +#define PR r30 +#define ARLC r31 + +#define ALPHA f8 + + PROLOGUE + .prologue + PROFCODE + + { .mfi + mov f8 = f0 + .save ar.lc, ARLC + mov ARLC = ar.lc + } + { .mfi + mov f9 = f0 + } + ;; + .body +#ifdef F_INTERFACE + LDINT N = [N] + LDINT INCX = [INCX] + LDINT INCY = [INCY] + ;; +#ifndef USE64BITINT + sxt4 N = N + sxt4 INCX = INCX + sxt4 INCY = INCY + ;; +#endif +#endif + { .mmi + shladd INCX = INCX, ZBASE_SHIFT, r0 + shladd INCY = INCY, ZBASE_SHIFT, r0 + mov PR = pr + } + { .mib + cmp.lt p0, p7 = r0, N + mov r26 = 1 + (p7) br.cond.spnt .L1000 + } + ;; +#ifdef F_INTERFACE + cmp.le p0, p6 = r0, INCX + cmp.le p0, p7 = r0, INCY + sub r26 = r26, N + ;; + setf.sig f32 = r26 + setf.sig f33 = INCX + setf.sig f34 = INCY + ;; + xmpy.l f33 = f32, f33 + xmpy.l f34 = f32, f34 + ;; + getf.sig r26 = f33 + getf.sig r27 = f34 + ;; + (p6) add X1 = X1, r26 + (p7) add Y1 = Y1, r27 + ;; +#endif + { .mfi +#ifdef DOUBLE + adds PRE1 = (PREFETCH_SIZE + 4) * SIZE, X1 +#else + adds PRE1 = (PREFETCH_SIZE + 8) * SIZE, X1 +#endif + mov f10 = f0 + mov pr.rot= 0 + } + { .mfi + and J = 7, N + mov f11 = f0 + shr I = N, 3 + } + ;; + { .mfi +#ifdef DOUBLE + adds PRE2 = (PREFETCH_SIZE + 6) * SIZE, Y1 +#else + adds PRE2 = (PREFETCH_SIZE + 12) * SIZE, Y1 +#endif + mov f12 = f0 + mov ar.ec = 3 + } + { .mmf + shladd INCX16 = INCX, 3, r0 + shladd INCY16 = INCY, 3, r0 + mov f13 = f0 + } + ;; + { .mmf + shladd INCX3M1 = INCX, 1, INCX + shladd INCY3M1 = INCY, 1, INCY + mov f14 = f0 + } + { .mmf + adds INCXM1 = -SIZE, INCX + adds INCYM1 = -SIZE, INCY + mov f15 = f0 + } + ;; + { .mmi + adds INCX3M1 = -SIZE, INCX3M1 + adds INCY3M1 = -SIZE, INCY3M1 + tbit.z p0, p12 = N, 2 + } + { .mmi + cmp.eq p8 ,p0 = r0, I + cmp.eq p16, p0 = r0, r0 + adds I = -1, I + } + ;; + { .mmi + shladd X2 = INCX, 1, X1 + shladd Y2 = INCY, 1, Y1 + mov ar.lc = I + } + { .mmb + mov XX = X1 + mov YY = Y1 + (p8) br.cond.dpnt .L55 + } + ;; + .align 32 + +.L52: + { .mmf + (p16) lfetch.nt1 [PRE1], INCX16 + (p16) LDFD f32 = [X1], SIZE + (p18) FMA f8 = f34, f82, f8 + } + { .mmf + (p16) LDFD f44 = [X2], SIZE + nop.m 0 + (p18) FMA f9 = f34, f85, f9 + } + ;; + { .mmf + (p16) LDFD f80 = [Y1], SIZE + (p16) LDFD f92 = [Y2], SIZE + (p18) FMA f10 = f37, f82, f10 + } + { .mmf + nop.m 0 + nop.m 0 + (p18) FMA f11 = f37, f85, f11 + } + ;; + { .mmf + (p16) lfetch.nt1 [PRE2], INCY16 + (p16) LDFD f35 = [X1], INCXM1 + (p18) FMA f12 = f40, f88, f12 + } + { .mmf + (p16) LDFD f47 = [X2], INCXM1 + nop.m 0 + (p18) FMA f13 = f40, f91, f13 + } + ;; + { .mmf + (p16) LDFD f83 = [Y1], INCYM1 + (p16) LDFD f95 = [Y2], INCYM1 + (p18) FMA f14 = f43, f88, f14 + } + { .mmf + nop.m 0 + nop.m 0 + (p18) FMA f15 = f43, f91, f15 + } + ;; + { .mmf + (p16) LDFD f38 = [X1], SIZE + (p16) LDFD f50 = [X2], SIZE + (p18) FMA f8 = f46, f94, f8 + } + { .mmf + nop.m 0 + nop.m 0 + (p18) FMA f9 = f46, f97, f9 + } + ;; + { 
.mmf + (p16) LDFD f86 = [Y1], SIZE + (p16) LDFD f98 = [Y2], SIZE + (p18) FMA f10 = f49, f94, f10 + } + { .mmf + nop.m 0 + nop.m 0 + (p18) FMA f11 = f49, f97, f11 + } + ;; + { .mmf + (p16) LDFD f41 = [X1], INCX3M1 + (p16) LDFD f53 = [X2], INCX3M1 + (p18) FMA f12 = f52, f100, f12 + } + { .mmf + nop.m 0 + nop.m 0 + (p18) FMA f13 = f52, f103, f13 + } + ;; + { .mmf + (p16) LDFD f89 = [Y1], INCY3M1 + (p16) LDFD f101 = [Y2], INCY3M1 + (p18) FMA f14 = f55, f100, f14 + } + { .mmf + nop.m 0 + nop.m 0 + (p18) FMA f15 = f55, f103, f15 + } + ;; + { .mmf + (p16) LDFD f56 = [X1], SIZE + (p16) LDFD f68 = [X2], SIZE + (p18) FMA f8 = f58, f106, f8 + } + { .mmf + nop.m 0 + nop.m 0 + (p18) FMA f9 = f58, f109, f9 + } + ;; + { .mmf + (p16) LDFD f104 = [Y1], SIZE + (p16) LDFD f116 = [Y2], SIZE + (p18) FMA f10 = f61, f106, f10 + } + { .mmf + nop.m 0 + nop.m 0 + (p18) FMA f11 = f61, f109, f11 + } + ;; + { .mmf + (p16) LDFD f59 = [X1], INCXM1 + (p16) LDFD f71 = [X2], INCXM1 + (p18) FMA f12 = f64, f112, f12 + } + { .mmf + nop.m 0 + nop.m 0 + (p18) FMA f13 = f64, f115, f13 + } + ;; + { .mmf + (p16) LDFD f107 = [Y1], INCYM1 + (p16) LDFD f119 = [Y2], INCYM1 + (p18) FMA f14 = f67, f112, f14 + } + { .mmf + nop.m 0 + nop.m 0 + (p18) FMA f15 = f67, f115, f15 + } + ;; + { .mmf + (p16) LDFD f62 = [X1], SIZE + (p16) LDFD f74 = [X2], SIZE + (p18) FMA f8 = f70, f118, f8 + } + { .mmf + nop.m 0 + nop.m 0 + (p18) FMA f9 = f70, f121, f9 + } + ;; + { .mmf + (p16) LDFD f110 = [Y1], SIZE + (p16) LDFD f122 = [Y2], SIZE + (p18) FMA f10 = f73, f118, f10 + } + { .mmf + nop.m 0 + nop.m 0 + (p18) FMA f11 = f73, f121, f11 + } + ;; + { .mmf + (p16) LDFD f65 = [X1], INCX3M1 + (p16) LDFD f77 = [X2], INCX3M1 + (p18) FMA f12 = f76, f124, f12 + } + { .mmf + (p16) add XX = INCX16, XX + (p16) add YY = INCY16, YY + (p18) FMA f13 = f76, f127, f13 + } + ;; + { .mmf + (p16) LDFD f113 = [Y1], INCY3M1 + (p16) LDFD f125 = [Y2], INCY3M1 + (p18) FMA f14 = f79, f124, f14 + } + { .mfb + nop.m 0 + (p18) FMA f15 = f79, f127, f15 + br.ctop.sptk.few .L52 + } + ;; + .align 32 + +.L55: + (p12) LDFD f32 = [X1], SIZE + (p12) LDFD f40 = [X2], SIZE + tbit.z p0, p13 = N, 1 + (p12) LDFD f34 = [Y1], SIZE + (p12) LDFD f42 = [Y2], SIZE + tbit.z p0, p14 = N, 0 + ;; + (p12) LDFD f33 = [X1], INCXM1 + (p12) LDFD f41 = [X2], INCXM1 + cmp.eq p9, p0 = r0, J + (p12) LDFD f35 = [Y1], INCYM1 + (p12) LDFD f43 = [Y2], INCYM1 + (p9) br.cond.dptk .L999 + ;; + (p12) LDFD f36 = [X1], SIZE + (p12) LDFD f44 = [X2], SIZE + (p12) shladd XX = INCX, 2, XX + (p12) LDFD f38 = [Y1], SIZE + (p12) LDFD f46 = [Y2], SIZE + (p12) shladd YY = INCY, 2, YY + ;; + (p12) LDFD f37 = [X1], INCX3M1 + (p12) LDFD f45 = [X2], INCX3M1 + (p13) shladd XX = INCX, 1, XX + (p12) LDFD f39 = [Y1], INCY3M1 + (p12) LDFD f47 = [Y2], INCY3M1 + (p13) shladd YY = INCY, 1, YY + ;; + (p13) LDFD f48 = [X1], SIZE + (p13) LDFD f50 = [Y1], SIZE + (p14) LDFD f56 = [XX], SIZE + (p14) LDFD f58 = [YY], SIZE + ;; + (p13) LDFD f49 = [X1], INCXM1 + (p13) LDFD f51 = [Y1], INCYM1 + (p14) LDFD f57 = [XX] + (p14) LDFD f59 = [YY] + ;; + (p13) LDFD f52 = [X1], SIZE + (p13) LDFD f54 = [Y1], SIZE + ;; + (p13) LDFD f53 = [X1] + (p13) LDFD f55 = [Y1] + ;; + (p12) FMA f8 = f32, f34, f8 + (p12) FMA f9 = f32, f35, f9 + (p12) FMA f10 = f33, f34, f10 + (p12) FMA f11 = f33, f35, f11 + (p12) FMA f12 = f36, f38, f12 + (p12) FMA f13 = f36, f39, f13 + (p12) FMA f14 = f37, f38, f14 + (p12) FMA f15 = f37, f39, f15 + ;; + (p12) FMA f8 = f40, f42, f8 + (p12) FMA f9 = f40, f43, f9 + (p12) FMA f10 = f41, f42, f10 + (p12) FMA f11 = f41, f43, f11 + (p12) FMA f12 = f44, 
f46, f12 + (p12) FMA f13 = f44, f47, f13 + (p12) FMA f14 = f45, f46, f14 + (p12) FMA f15 = f45, f47, f15 + ;; + (p13) FMA f8 = f48, f50, f8 + (p13) FMA f9 = f48, f51, f9 + (p13) FMA f10 = f49, f50, f10 + (p13) FMA f11 = f49, f51, f11 + (p13) FMA f12 = f52, f54, f12 + (p13) FMA f13 = f52, f55, f13 + (p13) FMA f14 = f53, f54, f14 + (p13) FMA f15 = f53, f55, f15 + ;; + (p14) FMA f8 = f56, f58, f8 + (p14) FMA f9 = f56, f59, f9 + (p14) FMA f10 = f57, f58, f10 + (p14) FMA f11 = f57, f59, f11 + .align 32 + ;; +.L999: + FADD f8 = f8, f12 + FADD f9 = f9, f13 + FADD f10 = f10, f14 + FADD f11 = f11, f15 + mov ar.lc = ARLC + ;; +#ifndef CONJ + FSUB f8 = f8, f11 + FADD f9 = f9, f10 +#else + FADD f8 = f8, f11 + FSUB f9 = f9, f10 +#endif + ;; + .align 32 + +.L1000: +#if defined(F_INTERFACE) && defined(RETURN_BY_STACK) + STFD [r32] = f8, SIZE + ;; + STFD [r32] = f9, SIZE +#endif + mov pr = PR, -65474 + br.ret.sptk.many b0 + EPILOGUE + diff --git a/kernel/ia64/zgemm3m_kernel.S b/kernel/ia64/zgemm3m_kernel.S new file mode 100644 index 0000000000..5adb66a3c7 --- /dev/null +++ b/kernel/ia64/zgemm3m_kernel.S @@ -0,0 +1,6803 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef DOUBLE +#define PREFETCHSIZE (16 * 8) +#else +#define PREFETCHSIZE (32 * 8) +#endif + +#define CPREFETCHSIZE 15 +#define CPREFETCH lfetch.excl.nt1 + +#define M r32 +#define N r33 +#define K r34 +#define A r37 +#define B r38 +#define C r39 +#define LDC r35 + +#define I r15 +#define J r16 +#define AOFFSET r17 +#define BOFFSET r18 +#define L r20 + +#define C1 r21 +#define C2 r22 +#define C3 r23 +#define C4 r24 +#define C5 r25 +#define C6 r26 +#define C7 r27 +#define C8 r28 + +#define C9 loc0 +#define C10 loc1 +#define C11 loc2 +#define C12 loc3 +#define C13 loc4 +#define C14 loc5 +#define C15 loc6 +#define C16 loc7 + +#define PREA r8 +#define PREB r9 +#define PREC r10 +#define SP r12 +#define ARLC r29 +#define PR r30 +#define ARPFS r31 + +#define ALPHA_R f8 +#define ALPHA_I f9 + + PROLOGUE + .prologue + PROFCODE + + { .mmi + .save ar.pfs, ARPFS + alloc ARPFS = ar.pfs, 8, 16, 0, 0 + adds r14 = 16, SP + mov ARLC = ar.lc + } + { .mmi + adds r8 = -16 * 16, SP + adds r9 = -15 * 16, SP + adds SP = -16 * 16, SP + } + ;; + { .mmi + stf.spill [r8] = f16, 32 + stf.spill [r9] = f17, 32 + mov PR = pr + } + { .mmi + ld8 LDC = [r14], 8 + nop __LINE__ + nop __LINE__ + } + ;; + stf.spill [r8] = f18, 32 + stf.spill [r9] = f19, 32 + shr J = N, 3 + ;; + stf.spill [r8] = f20, 32 + stf.spill [r9] = f21, 32 + shladd LDC = LDC, ZBASE_SHIFT, r0 + ;; + stf.spill [r8] = f22, 32 + stf.spill [r9] = f23, 32 + mov AOFFSET = A + ;; + stf.spill [r8] = f24, 32 + stf.spill [r9] = f25, 32 + cmp.ge p6, p0 = 0, J + ;; + stf.spill [r8] = f26, 32 + stf.spill [r9] = f27, 32 + ;; + stf.spill [r8] = f28, 32 + stf.spill [r9] = f29, 32 + ;; + stf.spill [r8] = f30 + stf.spill [r9] = f31 + (p6) br.cond.dpnt .L050 + .body + ;; + .align 32 + +.L010: + { .mfi + adds J = -1, J + mov f64 = f0 + shr I = M, 3 + } + { .mfi + mov C1 = C // coffset1 = c + 0 * ldc + mov f72 = f0 + } + ;; + { .mmf + cmp.eq p6, p7 = 0, I + nop __LINE__ + mov f80 = f0 + } + { .mmf + add C2 = LDC, C // coffset2 = c + 1 * ldc + shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc + mov f88 = f0 + } + ;; + { .mmf + shladd C5 = LDC, 2, C // coffset5 = c + 4 * ldc + shladd C = LDC, 3, C // coffset += 8 * ldc + mov f96 = f0 + } + { .mmf + shladd C4 = LDC, 1, C2 // coffset4 = c + 3 * ldc + shladd C6 = LDC, 2, C2 // coffset6 = c + 5 * ldc + mov f104 = f0 + } + ;; + { .mfi + shladd C7 = LDC, 2, C3 // coffset7 = c + 6 * ldc + mov f112 = f0 + nop __LINE__ + } + { .mfb + sub C8 = C, LDC // coffset8 = c + 7 * ldc + mov f120 = f0 + (p6) br.cond.dpnt .L020 + } + ;; + .align 16 + +.L011: + { .mfb + LDFPD f48, f49 = [B] + mov f65 = f0 + nop __LINE__ + } + { .mfb + adds BOFFSET = 2 * SIZE, B + mov f73 = f0 + nop __LINE__ + } + ;; + { .mfb + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f81 = f0 + nop __LINE__ + } + { .mfb + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f89 = f0 + nop __LINE__ + } + ;; + { .mmf + LDFPD f52, f53 = [BOFFSET], 2 * SIZE + setf.d f97 = r0 + mov f105 = f0 + } + { .mfb + setf.d f113 = r0 + mov f121 = f0 + nop __LINE__ + } + ;; + { .mmf + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + setf.d f66 = r0 + mov f74 = f0 + } + { .mfb + setf.d f82 = r0 + mov f90 = f0 + nop __LINE__ + } + ;; + { .mmf + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + setf.d f98 = r0 + mov f106 = f0 + } + { .mfb + setf.d f114 = r0 + mov f122 = f0 + nop __LINE__ + } + ;; + { .mmf + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + setf.d f67 = r0 + mov f75 = f0 + } + { .mfi + setf.d f83 = r0 + mov f91 
= f0 + nop __LINE__ + } + ;; + { .mmf + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + setf.d f99 = r0 + mov f107 = f0 + } + { .mfi + setf.d f115 = r0 + mov f123 = f0 + adds PREC = CPREFETCHSIZE * SIZE, C1 + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f68 = r0 + mov f76 = f0 + } + { .mfi + setf.d f84 = r0 + mov f92 = f0 + adds L = 1, K + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f100 = r0 + mov f108 = f0 + } + { .mfi + setf.d f116 = r0 + mov f124 = f0 + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f69 = r0 + mov f77 = f0 + } + { .mfi + setf.d f85 = r0 + mov f93 = f0 + adds PREB = (PREFETCHSIZE - 8) * SIZE, BOFFSET + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f101 = r0 + mov f109 = f0 + } + { .mfi + setf.d f117 = r0 + mov f125 = f0 + tbit.z p12, p0 = L, 0 + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f70 = r0 + mov f78 = f0 + } + { .mfi + setf.d f86 = r0 + mov f94 = f0 + shr L = L, 1 + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f102 = r0 + mov f110 = f0 + } + { .mfi + setf.d f118 = r0 + mov f126 = f0 + adds L = -1, L + } + ;; + { .mmf + CPREFETCH [PREC], LDC + setf.d f71 = r0 + mov f79 = f0 + } + { .mfi + setf.d f87 = r0 + mov f95 = f0 + mov ar.lc = L + } + ;; + { .mmf + CPREFETCH [PREC] + setf.d f103 = r0 + mov f111 = f0 + } + { .mfi + setf.d f119 = r0 + mov f127 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + .align 16 + +.L012: +/* 1 */ + { .mfi + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + (p12) cmp.ne p3, p0 = 0, L + FMA f72 = f32, f49, f72 // A1 * B2 + nop __LINE__ + } + ;; +/* 2 */ + { .mfi + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + nop __LINE__ + } + { .mfi + cmp.ne p4, p5 = 0, L + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; +/* 3 */ + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfi + adds C9 = 4 * SIZE, C1 + FMA f104 = f32, f53, f104 // A1 * B6 + nop __LINE__ + } + ;; +/* 4 */ + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfi + adds C10 = 4 * SIZE, C2 + FMA f120 = f32, f55, f120 // A1 * B8 + nop __LINE__ + } + ;; +/* 5 */ + { .mfi + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfi + adds C11 = 4 * SIZE, C3 + FMA f73 = f33, f49, f73 // A2 * B2 + nop __LINE__ + } + ;; +/* 6 */ + { .mfi + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfi + adds C12 = 4 * SIZE, C4 + FMA f89 = f33, f51, f89 // A2 * B4 + nop __LINE__ + } + ;; +/* 7 */ + { .mfi + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfi + adds C13 = 4 * SIZE, C5 + FMA f105 = f33, f53, f105 // A2 * B6 + nop __LINE__ + } + ;; +/* 8 */ + { .mfi + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfi + adds C14 = 4 * SIZE, C6 + FMA f121 = f33, f55, f121 // A2 * B8 + nop __LINE__ + } + ;; +/* 9 */ + { .mfi + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfi + adds C15 = 4 * SIZE, C7 + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; +/* 10 */ + { .mfi + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfi + adds C16 = 4 * SIZE, C8 + FMA f90 = f34, f51, f90 // A3 * B4 + nop __LINE__ + } + ;; +/* 11 */ 
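+/*
+   Structure of the .L012 inner loop: the 8x8 block of running sums lives in
+   f64-f127 (one register per C entry), f32-f47 carry the current A panel
+   values and f48-f63 the current B panel values.  The loop is unrolled by
+   two in K; the (p3)/(p4) predicated LDFPDs stage the operands for the next
+   half-step while the FMAs for the current one retire.
+*/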
+ { .mfi + FMA f98 = f34, f52, f98 // A3 * B5 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f106 = f34, f53, f106 // A3 * B6 + nop __LINE__ + } + ;; +/* 12 */ + { .mfi + FMA f114 = f34, f54, f114 // A3 * B7 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f122 = f34, f55, f122 // A3 * B8 + nop __LINE__ + } + ;; +/* 13 */ + { .mfi + nop __LINE__ + FMA f67 = f35, f48, f67 // A4 * B1 + } + { .mfi + nop __LINE__ + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + ;; +/* 14 */ + { .mfi + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f91 = f35, f51, f91 // A4 * B4 + nop __LINE__ + } + ;; +/* 15 */ + { .mfi + FMA f99 = f35, f52, f99 // A4 * B5 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f107 = f35, f53, f107 // A4 * B6 + nop __LINE__ + } + ;; +/* 16 */ + { .mfi + FMA f115 = f35, f54, f115 // A4 * B7 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f123 = f35, f55, f123 // A4 * B8 + nop __LINE__ + } + ;; +/* 17 */ + { .mfi + nop __LINE__ + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f76 = f36, f49, f76 // A5 * B2 + nop __LINE__ + } + ;; +/* 18 */ + { .mfi + nop __LINE__ + FMA f84 = f36, f50, f84 // A5 * B3 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f92 = f36, f51, f92 // A5 * B4 + nop __LINE__ + } + ;; +/* 19 */ + { .mfi + nop __LINE__ + FMA f100 = f36, f52, f100 // A5 * B5 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f108 = f36, f53, f108 // A5 * B6 + nop __LINE__ + } + ;; +/* 20 */ + { .mfi + nop __LINE__ + FMA f116 = f36, f54, f116 // A5 * B7 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f124 = f36, f55, f124 // A5 * B8 + nop __LINE__ + } + ;; +/* 21 */ + { .mfi + nop __LINE__ + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f77 = f37, f49, f77 // A6 * B2 + nop __LINE__ + } + ;; +/* 22 */ + { .mfi + nop __LINE__ + FMA f85 = f37, f50, f85 // A6 * B3 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f93 = f37, f51, f93 // A6 * B4 + nop __LINE__ + } + ;; +/* 23 */ + { .mfi + nop __LINE__ + FMA f101 = f37, f52, f101 // A6 * B5 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f109 = f37, f53, f109 // A6 * B6 + nop __LINE__ + } + ;; +/* 24 */ + { .mfi + nop __LINE__ + FMA f117 = f37, f54, f117 // A6 * B7 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f125 = f37, f55, f125 // A6 * B8 + nop __LINE__ + } + ;; +/* 25 */ + { .mfi + nop __LINE__ + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f78 = f38, f49, f78 // A7 * B2 + nop __LINE__ + } + ;; +/* 26 */ + { .mfi + nop __LINE__ + FMA f86 = f38, f50, f86 // A7 * B3 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f94 = f38, f51, f94 // A7 * B4 + nop __LINE__ + } + ;; +/* 27 */ + { .mfi + nop __LINE__ + FMA f102 = f38, f52, f102 // A7 * B5 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f110 = f38, f53, f110 // A7 * B6 + nop __LINE__ + } + ;; +/* 28 */ + { .mfi + nop __LINE__ + FMA f118 = f38, f54, f118 // A7 * B7 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f126 = f38, f55, f126 // A7 * B8 + nop __LINE__ + } + ;; +/* 29 */ + { .mfi + nop __LINE__ + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f79 = f39, f49, f79 // A8 * B2 + nop __LINE__ + } + ;; +/* 30 */ + { .mfi + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f87 = f39, f50, f87 // A8 * B3 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f95 = f39, f51, f95 // A8 * B4 + nop __LINE__ + } + ;; +/* 31 */ + { .mfi + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f103 = f39, f52, 
f103 // A8 * B5 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f111 = f39, f53, f111 // A8 * B6 + nop __LINE__ + } + ;; +/* 32 */ + { .mfi + nop __LINE__ + FMA f119 = f39, f54, f119 // A8 * B7 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f127 = f39, f55, f127 // A8 * B8 + nop __LINE__ + } + ;; +/* 33 */ + { .mfi + nop __LINE__ + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; +/* 34 */ + { .mfi + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; +/* 35 */ + { .mfi + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p3) FMA f104 = f40, f61, f104 // A1 * B6 + nop __LINE__ + } + ;; +/* 36 */ + { .mfi + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p3) FMA f120 = f40, f63, f120 // A1 * B8 + nop __LINE__ + } + ;; +/* 37 */ + { .mfi + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; +/* 38 */ + { .mfi + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; +/* 39 */ + { .mfi + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfi + nop __LINE__ + (p3) FMA f105 = f41, f61, f105 // A2 * B6 + nop __LINE__ + } + ;; +/* 40 */ + { .mfi + (p5) LDFD f6 = [C1 ], SIZE + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + nop __LINE__ + } + { .mfi + (p5) LDFD f7 = [C9 ], SIZE + (p3) FMA f121 = f41, f63, f121 // A2 * B8 + nop __LINE__ + } + ;; + /* 41 */ + { .mfi + (p5) LDFD f10 = [C1 ], SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfi + (p5) LDFD f11 = [C9 ], SIZE + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; +/* 42 */ + { .mfi + (p5) LDFD f12 = [C1 ], SIZE + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfi + (p5) LDFD f13 = [C9 ], SIZE + (p3) FMA f90 = f42, f59, f90 // A3 * B4 + nop __LINE__ + } + ;; +/* 43 */ + { .mfi + (p5) LDFD f14 = [C1 ], 5 * SIZE + (p3) FMA f98 = f42, f60, f98 // A3 * B5 + nop __LINE__ + } + { .mfi + (p5) LDFD f15 = [C9 ], 5 * SIZE + (p3) FMA f106 = f42, f61, f106 // A3 * B6 + nop __LINE__ + } + ;; +/* 44 */ + { .mfi + (p5) LDFD f16 = [C1 ], SIZE + (p3) FMA f114 = f42, f62, f114 // A3 * B7 + nop __LINE__ + } + { .mfi + (p5) LDFD f17 = [C9 ], SIZE + (p3) FMA f122 = f42, f63, f122 // A3 * B8 + nop __LINE__ + } + ;; +/* 45 */ + { .mfi + (p5) LDFD f18 = [C1 ], SIZE + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfi + (p5) LDFD f19 = [C9 ], SIZE + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; +/* 46 */ + { .mfi + (p5) LDFD f20 = [C1 ], SIZE + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + nop __LINE__ + } + { .mfi + (p5) LDFD f21 = [C9 ], SIZE + (p3) FMA f91 = f43, f59, f91 // A4 * B4 + nop __LINE__ + } + ;; +/* 47 */ + { .mfi + (p5) LDFD f22 = [C1 ], - 11 * SIZE + (p3) FMA f99 = f43, f60, f99 // A4 * B5 + nop __LINE__ + } + { .mfi + (p5) LDFD f23 = [C9 ], - 11 * SIZE + (p3) FMA f107 = f43, f61, f107 // A4 * B6 + nop __LINE__ + } + ;; +/* 48 */ + { .mfi + 
(p5) LDFD f24 = [C2 ], SIZE + (p3) FMA f115 = f43, f62, f115 // A4 * B7 + nop __LINE__ + } + { .mfi + (p5) LDFD f25 = [C10], SIZE + (p3) FMA f123 = f43, f63, f123 // A4 * B8 + nop __LINE__ + } + ;; +/* 49 */ + { .mfi + (p5) LDFD f26 = [C2 ], SIZE + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfi + (p5) LDFD f27 = [C10], SIZE + (p3) FMA f76 = f44, f57, f76 // A5 * B2 + nop __LINE__ + } + ;; +/* 50 */ + { .mfi + (p5) LDFD f28 = [C2 ], SIZE + (p3) FMA f84 = f44, f58, f84 // A5 * B3 + nop __LINE__ + } + { .mfi + (p5) LDFD f29 = [C10], SIZE + (p3) FMA f92 = f44, f59, f92 // A5 * B4 + nop __LINE__ + } + ;; +/* 51 */ + { .mfi + (p5) LDFD f30 = [C2 ], 5 * SIZE + (p3) FMA f100 = f44, f60, f100 // A5 * B5 + nop __LINE__ + } + { .mfi + (p5) LDFD f31 = [C10], 5 * SIZE + (p3) FMA f108 = f44, f61, f108 // A5 * B6 + nop __LINE__ + } + ;; +/* 52 */ + { .mfi + (p5) LDFD f32 = [C2 ], SIZE + (p3) FMA f116 = f44, f62, f116 // A5 * B7 + nop __LINE__ + } + { .mfi + (p5) LDFD f33 = [C10], SIZE + (p3) FMA f124 = f44, f63, f124 // A5 * B8 + nop __LINE__ + } + ;; +/* 53 */ + { .mfi + (p5) LDFD f34 = [C2 ], SIZE + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + { .mfi + (p5) LDFD f35 = [C10], SIZE + (p3) FMA f77 = f45, f57, f77 // A6 * B2 + nop __LINE__ + } + ;; +/* 54 */ + { .mfi + (p5) LDFD f36 = [C2 ], SIZE + (p3) FMA f85 = f45, f58, f85 // A6 * B3 + nop __LINE__ + } + { .mfi + (p5) LDFD f37 = [C10], SIZE + (p3) FMA f93 = f45, f59, f93 // A6 * B4 + nop __LINE__ + } + ;; +/* 55 */ + { .mfi + (p5) LDFD f38 = [C2 ], - 11 * SIZE + (p3) FMA f101 = f45, f60, f101 // A6 * B5 + nop __LINE__ + } + { .mfi + (p5) LDFD f39 = [C10], - 11 * SIZE + (p3) FMA f109 = f45, f61, f109 // A6 * B6 + nop __LINE__ + } + ;; +/* 56 */ + { .mfi + (p5) LDFD f48 = [C3 ], SIZE + (p3) FMA f117 = f45, f62, f117 // A6 * B7 + nop __LINE__ + } + { .mfi + (p5) LDFD f49 = [C11], SIZE + (p3) FMA f125 = f45, f63, f125 // A6 * B8 + nop __LINE__ + } + ;; +/* 57 */ + { .mfi + (p5) LDFD f50 = [C3 ], SIZE + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + nop __LINE__ + } + { .mfi + (p5) LDFD f51 = [C11], SIZE + (p3) FMA f78 = f46, f57, f78 // A7 * B2 + nop __LINE__ + } + ;; +/* 58 */ + { .mfi + (p5) LDFD f52 = [C3 ], SIZE + (p3) FMA f86 = f46, f58, f86 // A7 * B3 + nop __LINE__ + } + { .mfi + (p5) LDFD f53 = [C11], SIZE + (p3) FMA f94 = f46, f59, f94 // A7 * B4 + nop __LINE__ + } + ;; +/* 59 */ + { .mfi + (p5) LDFD f54 = [C3 ], 5 * SIZE + (p3) FMA f102 = f46, f60, f102 // A7 * B5 + nop __LINE__ + } + { .mfi + (p5) LDFD f55 = [C11], 5 * SIZE + (p3) FMA f110 = f46, f61, f110 // A7 * B6 + nop __LINE__ + } + ;; +/* 60 */ + { .mfi + (p5) LDFD f40 = [C3 ], SIZE + (p3) FMA f118 = f46, f62, f118 // A7 * B7 + nop __LINE__ + } + { .mfi + (p5) LDFD f41 = [C11], SIZE + (p3) FMA f126 = f46, f63, f126 // A7 * B8 + nop __LINE__ + } + ;; +/* 61 */ + { .mfi + (p5) LDFD f42 = [C3 ], SIZE + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + nop __LINE__ + } + { .mfi + (p5) LDFD f43 = [C11], SIZE + (p3) FMA f79 = f47, f57, f79 // A8 * B2 + nop __LINE__ + } + ;; +/* 62 */ + { .mfi + (p5) LDFD f44 = [C3 ], SIZE + (p3) FMA f87 = f47, f58, f87 // A8 * B3 + nop __LINE__ + } + { .mfi + (p5) LDFD f45 = [C11], SIZE + (p3) FMA f95 = f47, f59, f95 // A8 * B4 + nop __LINE__ + } + ;; +/* 63 */ + { .mfi + (p5) LDFD f46 = [C3 ], - 11 * SIZE + (p3) FMA f103 = f47, f60, f103 // A8 * B5 + nop __LINE__ + } + { .mfi + (p5) LDFD f56 = [C11], - 11 * SIZE + (p3) FMA f111 = f47, f61, f111 // A8 * B6 + nop __LINE__ + } + ;; +/* 64 */ + { .mfi + (p5) LDFD f57 = [C4 ], SIZE + (p3) FMA 
f119 = f47, f62, f119 // A8 * B7 + adds L = -1, L + } + { .mfb + (p5) LDFD f58 = [C12], SIZE + (p3) FMA f127 = f47, f63, f127 // A8 * B8 + br.cloop.sptk.few .L012 + } + ;; +.L013: + { .mmf + (p5) LDFD f59 = [C4 ], SIZE + (p5) LDFD f60 = [C12], SIZE + FMA f6 = ALPHA_R, f64, f6 + } + { .mmf + cmp.ne p6, p0 = 1, I + nop __LINE__ + FMA f7 = ALPHA_R, f66, f7 + } + ;; + { .mmf + (p5) LDFD f61 = [C4 ], SIZE + (p5) LDFD f62 = [C12], SIZE + FMA f10 = ALPHA_I, f64, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f11 = ALPHA_I, f66, f11 + } + ;; + { .mmf + (p5) LDFD f63 = [C4 ], 5 * SIZE + (p5) LDFD f47 = [C12], 5 * SIZE + FMA f12 = ALPHA_R, f65, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f13 = ALPHA_R, f67, f13 + } + ;; + { .mfi + (p5) LDFD f64 = [C4 ], SIZE + FMA f14 = ALPHA_I, f65, f14 + nop __LINE__ + } + { .mfi + (p5) LDFD f65 = [C12], SIZE + FMA f15 = ALPHA_I, f67, f15 + nop __LINE__ + } + ;; + { .mmf + STFD [C1 ] = f6, SIZE + STFD [C9 ] = f7, SIZE + FMA f16 = ALPHA_R, f68, f16 + } + { .mmf + (p5) LDFD f6 = [C4 ], SIZE + (p5) LDFD f7 = [C12], SIZE + FMA f17 = ALPHA_R, f70, f17 + } + ;; + { .mmf + STFD [C1 ] = f10, SIZE + STFD [C9 ] = f11, SIZE + FMA f18 = ALPHA_I, f68, f18 + } + { .mmf + (p5) LDFD f10 = [C4 ], SIZE + (p5) LDFD f11 = [C12], SIZE + FMA f19 = ALPHA_I, f70, f19 + } + ;; + { .mmf + STFD [C1 ] = f12, SIZE + STFD [C9 ] = f13, SIZE + FMA f20 = ALPHA_R, f69, f20 + } + { .mmf + (p5) LDFD f12 = [C4 ], - 11 * SIZE + (p5) LDFD f13 = [C12], - 11 * SIZE + FMA f21 = ALPHA_R, f71, f21 + } + ;; + { .mmf + STFD [C1 ] = f14, 5 * SIZE + STFD [C9 ] = f15, 5 * SIZE + FMA f22 = ALPHA_I, f69, f22 + } + { .mmf + (p5) LDFD f14 = [C5 ], SIZE + (p5) LDFD f15 = [C13], SIZE + FMA f23 = ALPHA_I, f71, f23 + } + ;; + { .mmf + STFD [C1 ] = f16, SIZE + STFD [C9 ] = f17, SIZE + FMA f24 = ALPHA_R, f72, f24 + } + { .mmf + (p5) LDFD f16 = [C5 ], SIZE + (p5) LDFD f17 = [C13], SIZE + FMA f25 = ALPHA_R, f74, f25 + } + ;; + { .mmf + STFD [C1 ] = f18, SIZE + STFD [C9 ] = f19, SIZE + FMA f26 = ALPHA_I, f72, f26 + } + { .mmf + (p5) LDFD f18 = [C5 ], SIZE + (p5) LDFD f19 = [C13], SIZE + FMA f27 = ALPHA_I, f74, f27 + } + ;; + { .mmf + STFD [C1 ] = f20, SIZE + STFD [C9 ] = f21, SIZE + FMA f28 = ALPHA_R, f73, f28 + } + { .mmf + (p5) LDFD f20 = [C5 ], 5 * SIZE + (p5) LDFD f21 = [C13], 5 * SIZE + FMA f29 = ALPHA_R, f75, f29 + } + ;; + { .mmf + STFD [C1 ] = f22, 5 * SIZE + STFD [C9 ] = f23, 5 * SIZE + FMA f30 = ALPHA_I, f73, f30 + } + { .mmf + (p5) LDFD f22 = [C5 ], SIZE + (p5) LDFD f23 = [C13], SIZE + FMA f31 = ALPHA_I, f75, f31 + } + ;; + { .mmf + STFD [C2 ] = f24, SIZE + STFD [C10] = f25, SIZE + FMA f32 = ALPHA_R, f76, f32 + } + { .mmf + (p5) LDFD f24 = [C5 ], SIZE + (p5) LDFD f25 = [C13], SIZE + FMA f33 = ALPHA_R, f78, f33 + } + ;; + { .mmf + STFD [C2 ] = f26, SIZE + STFD [C10] = f27, SIZE + FMA f34 = ALPHA_I, f76, f34 + } + { .mmf + (p5) LDFD f26 = [C5 ], SIZE + (p5) LDFD f27 = [C13], SIZE + FMA f35 = ALPHA_I, f78, f35 + } + ;; + { .mmf + STFD [C2 ] = f28, SIZE + STFD [C10] = f29, SIZE + FMA f36 = ALPHA_R, f77, f36 + } + { .mmf + (p5) LDFD f28 = [C5 ], - 11 * SIZE + (p5) LDFD f29 = [C13], - 11 * SIZE + FMA f37 = ALPHA_R, f79, f37 + } + ;; + { .mmf + STFD [C2 ] = f30, 5 * SIZE + STFD [C10] = f31, 5 * SIZE + FMA f38 = ALPHA_I, f77, f38 + } + { .mmf + (p5) LDFD f30 = [C6 ], SIZE + (p5) LDFD f31 = [C14], SIZE + FMA f39 = ALPHA_I, f79, f39 + } + ;; + { .mmf + STFD [C2 ] = f32, SIZE + STFD [C10] = f33, SIZE + FMA f48 = ALPHA_R, f80, f48 + } + { .mmf + (p5) LDFD f32 = [C6 ], SIZE + (p5) LDFD f33 = [C14], SIZE + FMA 
f49 = ALPHA_R, f82, f49 + } + ;; + { .mmf + STFD [C2 ] = f34, SIZE + STFD [C10] = f35, SIZE + FMA f50 = ALPHA_I, f80, f50 + } + { .mmf + (p5) LDFD f34 = [C6 ], SIZE + (p5) LDFD f35 = [C14], SIZE + FMA f51 = ALPHA_I, f82, f51 + } + ;; + { .mmf + STFD [C2 ] = f36, SIZE + STFD [C10] = f37, SIZE + FMA f52 = ALPHA_R, f81, f52 + } + { .mmf + (p5) LDFD f36 = [C6 ], 5 * SIZE + (p5) LDFD f37 = [C14], 5 * SIZE + FMA f53 = ALPHA_R, f83, f53 + } + ;; + { .mmf + STFD [C2 ] = f38, 5 * SIZE + STFD [C10] = f39, 5 * SIZE + FMA f54 = ALPHA_I, f81, f54 + } + { .mmf + (p5) LDFD f38 = [C6 ], SIZE + (p5) LDFD f39 = [C14], SIZE + FMA f55 = ALPHA_I, f83, f55 + } + ;; + { .mmf + STFD [C3 ] = f48, SIZE + STFD [C11] = f49, SIZE + FMA f40 = ALPHA_R, f84, f40 + } + { .mmf + (p5) LDFD f48 = [C6 ], SIZE + (p5) LDFD f49 = [C14], SIZE + FMA f41 = ALPHA_R, f86, f41 + } + ;; + { .mmf + STFD [C3 ] = f50, SIZE + STFD [C11] = f51, SIZE + FMA f42 = ALPHA_I, f84, f42 + } + { .mmf + (p5) LDFD f50 = [C6 ], SIZE + (p5) LDFD f51 = [C14], SIZE + FMA f43 = ALPHA_I, f86, f43 + } + ;; + { .mmf + STFD [C3 ] = f52, SIZE + STFD [C11] = f53, SIZE + FMA f44 = ALPHA_R, f85, f44 + } + { .mmf + (p5) LDFD f52 = [C6 ], - 11 * SIZE + (p5) LDFD f53 = [C14], - 11 * SIZE + FMA f45 = ALPHA_R, f87, f45 + } + ;; + { .mmf + STFD [C3 ] = f54, 5 * SIZE + STFD [C11] = f55, 5 * SIZE + FMA f46 = ALPHA_I, f85, f46 + } + { .mmf + (p5) LDFD f54 = [C7 ], SIZE + (p5) LDFD f55 = [C15], SIZE + FMA f56 = ALPHA_I, f87, f56 + } + ;; + { .mmf + STFD [C3 ] = f40, SIZE + STFD [C11] = f41, SIZE + FMA f57 = ALPHA_R, f88, f57 + } + { .mmf + (p5) LDFD f40 = [C7 ], SIZE + (p5) LDFD f41 = [C15], SIZE + FMA f58 = ALPHA_R, f90, f58 + } + ;; + { .mmf + STFD [C3 ] = f42, SIZE + STFD [C11] = f43, SIZE + FMA f59 = ALPHA_I, f88, f59 + } + { .mmf + (p5) LDFD f42 = [C7 ], SIZE + (p5) LDFD f43 = [C15], SIZE + FMA f60 = ALPHA_I, f90, f60 + } + ;; + { .mmf + STFD [C3 ] = f44, SIZE + STFD [C11] = f45, SIZE + FMA f61 = ALPHA_R, f89, f61 + } + { .mmf + (p5) LDFD f44 = [C7 ], 5 * SIZE + (p5) LDFD f45 = [C15], 5 * SIZE + FMA f62 = ALPHA_R, f91, f62 + } + ;; + { .mmf + STFD [C3 ] = f46, 5 * SIZE + STFD [C11] = f56, 5 * SIZE + FMA f63 = ALPHA_I, f89, f63 + } + { .mmf + (p5) LDFD f46 = [C7 ], SIZE + (p5) LDFD f56 = [C15], SIZE + FMA f47 = ALPHA_I, f91, f47 + } + ;; + { .mmf + STFD [C4 ] = f57, SIZE + STFD [C12] = f58, SIZE + FMA f64 = ALPHA_R, f92, f64 + } + { .mmf + (p5) LDFD f57 = [C7 ], SIZE + (p5) LDFD f58 = [C15], SIZE + FMA f65 = ALPHA_R, f94, f65 + } + ;; + { .mmf + STFD [C4 ] = f59, SIZE + STFD [C12] = f60, SIZE + FMA f6 = ALPHA_I, f92, f6 + } + { .mmf + (p5) LDFD f59 = [C7 ], SIZE + (p5) LDFD f60 = [C15], SIZE + FMA f7 = ALPHA_I, f94, f7 + } + ;; + { .mmf + STFD [C4 ] = f61, SIZE + STFD [C12] = f62, SIZE + FMA f10 = ALPHA_R, f93, f10 + } + { .mmf + (p5) LDFD f61 = [C7 ], - 11 * SIZE + (p5) LDFD f62 = [C15], - 11 * SIZE + FMA f11 = ALPHA_R, f95, f11 + } + ;; + { .mmf + STFD [C4 ] = f63, 5 * SIZE + STFD [C12] = f47, 5 * SIZE + FMA f12 = ALPHA_I, f93, f12 + } + { .mmf + (p5) LDFD f63 = [C8 ], SIZE + (p5) LDFD f47 = [C16], SIZE + FMA f13 = ALPHA_I, f95, f13 + } + ;; + { .mmf + STFD [C4 ] = f64, SIZE + STFD [C12] = f65, SIZE + FMA f14 = ALPHA_R, f96, f14 + } + { .mmf + (p5) LDFD f64 = [C8 ], SIZE + (p5) LDFD f65 = [C16], SIZE + FMA f15 = ALPHA_R, f98, f15 + } + ;; + { .mmf + STFD [C4 ] = f6, SIZE + STFD [C12] = f7, SIZE + FMA f16 = ALPHA_I, f96, f16 + } + { .mmf + (p5) LDFD f6 = [C8 ], SIZE + (p5) LDFD f7 = [C16], SIZE + FMA f17 = ALPHA_I, f98, f17 + } + ;; + { .mmf + STFD [C4 ] = f10, SIZE + 
STFD [C12] = f11, SIZE + FMA f18 = ALPHA_R, f97, f18 + } + { .mmf + (p5) LDFD f10 = [C8 ], 5 * SIZE + (p5) LDFD f11 = [C16], 5 * SIZE + FMA f19 = ALPHA_R, f99, f19 + } + ;; + { .mmf + STFD [C4 ] = f12, 5 * SIZE + STFD [C12] = f13, 5 * SIZE + FMA f20 = ALPHA_I, f97, f20 + } + { .mmf + (p5) LDFD f12 = [C8 ], SIZE + (p5) LDFD f13 = [C16], SIZE + FMA f21 = ALPHA_I, f99, f21 + } + ;; + { .mmf + STFD [C5 ] = f14, SIZE + STFD [C13] = f15, SIZE + FMA f22 = ALPHA_R, f100, f22 + } + { .mmf + (p5) LDFD f14 = [C8 ], SIZE + (p5) LDFD f15 = [C16], SIZE + FMA f23 = ALPHA_R, f102, f23 + } + ;; + { .mmf + STFD [C5 ] = f16, SIZE + STFD [C13] = f17, SIZE + FMA f24 = ALPHA_I, f100, f24 + } + { .mmf + (p5) LDFD f16 = [C8 ], SIZE + (p5) LDFD f17 = [C16], SIZE + FMA f25 = ALPHA_I, f102, f25 + } + ;; + { .mmf + STFD [C5 ] = f18, SIZE + STFD [C13] = f19, SIZE + FMA f26 = ALPHA_R, f101, f26 + } + { .mmf + (p5) LDFD f18 = [C8 ], - 11 * SIZE + (p5) LDFD f19 = [C16], - 11 * SIZE + FMA f27 = ALPHA_R, f103, f27 + } + ;; + { .mmf + STFD [C5 ] = f20, 5 * SIZE + STFD [C13] = f21, 5 * SIZE + FMA f28 = ALPHA_I, f101, f28 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f29 = ALPHA_I, f103, f29 + } + ;; + { .mmf + STFD [C5 ] = f22, SIZE + STFD [C13] = f23, SIZE + FMA f30 = ALPHA_R, f104, f30 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f31 = ALPHA_R, f106, f31 + } + ;; + { .mmf + STFD [C5 ] = f24, SIZE + STFD [C13] = f25, SIZE + FMA f32 = ALPHA_I, f104, f32 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f33 = ALPHA_I, f106, f33 + } + ;; + { .mmf + STFD [C5 ] = f26, SIZE + STFD [C13] = f27, SIZE + FMA f34 = ALPHA_R, f105, f34 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f35 = ALPHA_R, f107, f35 + } + ;; + { .mmf + STFD [C5 ] = f28, 5 * SIZE + STFD [C13] = f29, 5 * SIZE + FMA f36 = ALPHA_I, f105, f36 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f37 = ALPHA_I, f107, f37 + } + ;; + { .mmf + STFD [C6 ] = f30, SIZE + STFD [C14] = f31, SIZE + FMA f38 = ALPHA_R, f108, f38 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f39 = ALPHA_R, f110, f39 + } + ;; + { .mmf + STFD [C6 ] = f32, SIZE + STFD [C14] = f33, SIZE + FMA f48 = ALPHA_I, f108, f48 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f49 = ALPHA_I, f110, f49 + } + ;; + { .mmf + STFD [C6 ] = f34, SIZE + STFD [C14] = f35, SIZE + FMA f50 = ALPHA_R, f109, f50 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f51 = ALPHA_R, f111, f51 + } + ;; + { .mmf + STFD [C6 ] = f36, 5 * SIZE + STFD [C14] = f37, 5 * SIZE + FMA f52 = ALPHA_I, f109, f52 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f53 = ALPHA_I, f111, f53 + } + ;; + { .mmf + STFD [C6 ] = f38, SIZE + STFD [C14] = f39, SIZE + FMA f54 = ALPHA_R, f112, f54 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f55 = ALPHA_R, f114, f55 + } + ;; + { .mmf + STFD [C6 ] = f48, SIZE + STFD [C14] = f49, SIZE + FMA f40 = ALPHA_I, f112, f40 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f41 = ALPHA_I, f114, f41 + } + ;; + { .mmf + STFD [C6 ] = f50, SIZE + STFD [C14] = f51, SIZE + FMA f42 = ALPHA_R, f113, f42 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f43 = ALPHA_R, f115, f43 + } + ;; + { .mmf + STFD [C6 ] = f52, 5 * SIZE + STFD [C14] = f53, 5 * SIZE + FMA f44 = ALPHA_I, f113, f44 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f45 = ALPHA_I, f115, f45 + } + ;; + { .mmf + STFD [C7 ] = f54, SIZE + STFD [C15] = f55, SIZE + FMA f46 = ALPHA_R, f116, f46 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f56 = ALPHA_R, f118, f56 + } + ;; + { .mmf + STFD [C7 ] = f40, SIZE + STFD [C15] = f41, SIZE + FMA f57 = ALPHA_I, f116, f57 + } + 
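+/*
+   Store phase of the 8x8 block: each real accumulator t produced by the
+   3M kernel updates one complex element of C, the real part scaled by
+   ALPHA_R and the imaginary part by ALPHA_I.  As a rough C sketch of the
+   per-element update (illustrative only; c, t, alpha_r, alpha_i and i are
+   not names from this source):
+
+       for (int i = 0; i < 8; i++) {
+           c[2 * i + 0] += alpha_r * t[i];   /* real part      */
+           c[2 * i + 1] += alpha_i * t[i];   /* imaginary part */
+       }
+
+   which is the interleaved LDFD/FMA/STFD pattern applied through C1..C16.
+*/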
{ .mmf + nop __LINE__ + nop __LINE__ + FMA f58 = ALPHA_I, f118, f58 + } + ;; + { .mmf + STFD [C7 ] = f42, SIZE + STFD [C15] = f43, SIZE + FMA f59 = ALPHA_R, f117, f59 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f60 = ALPHA_R, f119, f60 + } + ;; + { .mmf + STFD [C7 ] = f44, 5 * SIZE + STFD [C15] = f45, 5 * SIZE + FMA f61 = ALPHA_I, f117, f61 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f62 = ALPHA_I, f119, f62 + } + ;; + { .mmf + STFD [C7 ] = f46, SIZE + STFD [C15] = f56, SIZE + FMA f63 = ALPHA_R, f120, f63 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f47 = ALPHA_R, f122, f47 + } + ;; + { .mmf + STFD [C7 ] = f57, SIZE + STFD [C15] = f58, SIZE + FMA f64 = ALPHA_I, f120, f64 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f65 = ALPHA_I, f122, f65 + } + ;; + { .mmf + STFD [C7 ] = f59, SIZE + STFD [C15] = f60, SIZE + FMA f6 = ALPHA_R, f121, f6 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f7 = ALPHA_R, f123, f7 + } + ;; + { .mmf + STFD [C7 ] = f61, 5 * SIZE + STFD [C15] = f62, 5 * SIZE + FMA f10 = ALPHA_I, f121, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f11 = ALPHA_I, f123, f11 + } + ;; + { .mmf + STFD [C8 ] = f63, SIZE + STFD [C16] = f47, SIZE + FMA f12 = ALPHA_R, f124, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f13 = ALPHA_R, f126, f13 + } + ;; + { .mmf + STFD [C8 ] = f64, SIZE + STFD [C16] = f65, SIZE + FMA f14 = ALPHA_I, f124, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f15 = ALPHA_I, f126, f15 + } + ;; + { .mmf + STFD [C8 ] = f6, SIZE + STFD [C16] = f7, SIZE + FMA f16 = ALPHA_R, f125, f16 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f17 = ALPHA_R, f127, f17 + } + ;; + { .mmf + STFD [C8 ] = f10, 5 * SIZE + STFD [C16] = f11, 5 * SIZE + FMA f18 = ALPHA_I, f125, f18 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f19 = ALPHA_I, f127, f19 + } + ;; + { .mmf + STFD [C8 ] = f12, SIZE + STFD [C16] = f13, SIZE + mov f64 = f0 + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f72 = f0 + } + ;; + { .mmf + STFD [C8 ] = f14, SIZE + STFD [C16] = f15, SIZE + mov f80 = f0 + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f88 = f0 + } + ;; + { .mmf + STFD [C8 ] = f16, SIZE + STFD [C16] = f17, SIZE + mov f96 = f0 + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f104 = f0 + } + ;; + { .mmf + STFD [C8 ] = f18, 5 * SIZE + STFD [C16] = f19, 5 * SIZE + mov f112 = f0 + } + { .mfb + adds I = -1, I + mov f120 = f0 + (p6) br.cond.dptk .L011 + } + ;; + +.L020: + { .mfi + cmp.eq p3, p0 = r0, r0 + mov f89 = f0 + tbit.z p6, p7 = M, 2 + } + { .mfb + nop __LINE__ + mov f81 = f0 + (p6) br.cond.dptk .L030 + } + ;; + { .mfi + LDFPD f48, f49 = [B] + mov f65 = f0 + nop __LINE__ + } + { .mfi + adds BOFFSET = 2 * SIZE, B + mov f73 = f0 + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + } + ;; + { .mmf + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + setf.d f97 = r0 + mov f105 = f0 + } + { .mfi + setf.d f113 = r0 + mov f121 = f0 + adds L = 1, K + } + ;; + { .mmf + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + setf.d f66 = r0 + mov f74 = f0 + } + { .mfi + setf.d f82 = r0 + mov f90 = f0 + tbit.z p12, p0 = L, 0 + } + ;; + { .mmf + LDFPD f52, f53 = [BOFFSET], 2 * SIZE + setf.d f98 = r0 + mov f106 = f0 + } + { .mfi + setf.d f114 = r0 + mov f122 = f0 + shr L = L, 1 + } + ;; + { .mfi + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + mov f75 = f0 + adds L = -1, L + } + { .mmf + setf.d f67 = r0 + setf.d f83 = r0 + mov f91 = f0 + } + ;; + { .mfi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + mov f107 = f0 + mov ar.lc = L + } + { .mmf + setf.d f99 = r0 + setf.d f115 = r0 + mov f123 = f0 + } + ;; + .align 32 + +.L022: + { 
.mfi + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + (p5) adds C9 = 4 * SIZE, C1 + } + { .mfi + nop __LINE__ + FMA f104 = f32, f53, f104 // A1 * B6 + (p5) adds C10 = 4 * SIZE, C2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + (p5) adds C11 = 4 * SIZE, C3 + } + { .mfi + nop __LINE__ + FMA f120 = f32, f55, f120 // A1 * B8 + (p5) adds C12 = 4 * SIZE, C4 + } + ;; + { .mfi + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + (p5) adds C13 = 4 * SIZE, C5 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + (p5) adds C14 = 4 * SIZE, C6 + } + ;; + { .mfi + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + (p5) adds C15 = 4 * SIZE, C7 + } + { .mfi + nop __LINE__ + FMA f89 = f33, f51, f89 // A2 * B4 + (p5) adds C16 = 4 * SIZE, C8 + } + ;; + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f105 = f33, f53, f105 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f121 = f33, f55, f121 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f90 = f34, f51, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f98 = f34, f52, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f106 = f34, f53, f106 // A3 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f114 = f34, f54, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f122 = f34, f55, f122 // A3 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f91 = f35, f51, f91 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f99 = f35, f52, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f107 = f35, f53, f107 // A4 * B6 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f115 = f35, f54, f115 // A4 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f123 = f35, f55, f123 // A4 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f6 = [C1 ], SIZE + 
(p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + (p5) LDFD f7 = [C9 ], SIZE + (p3) FMA f104 = f40, f61, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f10 = [C1 ], SIZE + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + (p5) LDFD f11 = [C9 ], SIZE + (p3) FMA f120 = f40, f63, f120 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f12 = [C1 ], SIZE + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + (p5) LDFD f13 = [C9 ], SIZE + (p3) FMA f105 = f41, f61, f105 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f14 = [C1 ], - 3 * SIZE + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + (p5) LDFD f15 = [C9 ], - 3 * SIZE + (p3) FMA f121 = f41, f63, f121 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f16 = [C2 ], SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f17 = [C10], SIZE + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f18 = [C2 ], SIZE + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + (p5) LDFD f19 = [C10], SIZE + (p3) FMA f90 = f42, f59, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f20 = [C2 ], SIZE + (p3) FMA f98 = f42, f60, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + (p5) LDFD f21 = [C10], SIZE + (p3) FMA f106 = f42, f61, f106 // A3 * B6 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f22 = [C2 ], - 3 * SIZE + (p3) FMA f114 = f42, f62, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + (p5) LDFD f23 = [C10], - 3 * SIZE + (p3) FMA f122 = f42, f63, f122 // A3 * B8 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f24 = [C3 ], SIZE + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f25 = [C11], SIZE + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f26 = [C3 ], SIZE + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + (p5) LDFD f27 = [C11], SIZE + (p3) FMA f91 = f43, f59, f91 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f28 = [C3 ], SIZE + (p3) FMA f99 = f43, f60, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + (p5) LDFD f29 = [C11], SIZE + (p3) FMA f107 = f43, f61, f107 // A4 * B6 + nop __LINE__ + } + ;; + { .mfi + (p5) LDFD f30 = [C3 ], - 3 * SIZE + (p3) FMA f115 = f43, f62, f115 // A4 * B7 + adds L = -1, L + } + { .mfb + (p5) LDFD f31 = [C11], - 3 * SIZE + (p3) FMA f123 = f43, f63, f123 // A4 * B8 + br.cloop.sptk.few .L022 + } + ;; + +.L028: + { .mmf + LDFD f68 = [C4 ], SIZE + LDFD f69 = [C12], SIZE + FMA f6 = ALPHA_R, f64, f6 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f7 = ALPHA_R, f66, f7 + } + ;; + { .mmf + LDFD f70 = [C4 ], SIZE + LDFD f71 = [C12], SIZE + FMA f10 = ALPHA_I, f64, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f11 = ALPHA_I, f66, f11 + } + ;; + { .mmf + LDFD f76 = [C4 ], SIZE + LDFD f77 = [C12], SIZE + FMA f12 = ALPHA_R, f65, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f13 = ALPHA_R, f67, f13 + } + ;; + { .mmf + LDFD f78 = [C4 ], -3 * SIZE + LDFD f79 = [C12], -3 * SIZE + FMA f14 = ALPHA_I, f65, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f15 = ALPHA_I, f67, f15 
+ } + ;; + { .mmf + STFD [C1 ] = f6, SIZE + STFD [C9 ] = f7, SIZE + FMA f16 = ALPHA_R, f72, f16 + } + { .mmf + LDFD f84 = [C5 ], SIZE + LDFD f85 = [C13], SIZE + FMA f17 = ALPHA_R, f74, f17 + } + ;; + { .mmf + STFD [C1 ] = f10, SIZE + STFD [C9 ] = f11, SIZE + FMA f18 = ALPHA_I, f72, f18 + } + { .mmf + LDFD f86 = [C5 ], SIZE + LDFD f87 = [C13], SIZE + FMA f19 = ALPHA_I, f74, f19 + } + ;; + { .mmf + STFD [C1 ] = f12, SIZE + STFD [C9 ] = f13, SIZE + FMA f20 = ALPHA_R, f73, f20 + } + { .mmf + LDFD f92 = [C5 ], SIZE + LDFD f93 = [C13], SIZE + FMA f21 = ALPHA_R, f75, f21 + } + ;; + { .mmf + STFD [C1 ] = f14, 5 * SIZE + STFD [C9 ] = f15, 5 * SIZE + FMA f22 = ALPHA_I, f73, f22 + } + { .mmf + LDFD f94 = [C5 ], -3 * SIZE + LDFD f95 = [C13], -3 * SIZE + FMA f23 = ALPHA_I, f75, f23 + } + ;; + { .mmf + STFD [C2 ] = f16, SIZE + STFD [C10] = f17, SIZE + FMA f24 = ALPHA_R, f80, f24 + } + { .mmf + LDFD f100 = [C6 ], SIZE + LDFD f101 = [C14], SIZE + FMA f25 = ALPHA_R, f82, f25 + } + ;; + { .mmf + STFD [C2 ] = f18, SIZE + STFD [C10] = f19, SIZE + FMA f26 = ALPHA_I, f80, f26 + } + { .mmf + LDFD f102 = [C6 ], SIZE + LDFD f103 = [C14], SIZE + FMA f27 = ALPHA_I, f82, f27 + } + ;; + { .mmf + STFD [C2 ] = f20, SIZE + STFD [C10] = f21, SIZE + FMA f28 = ALPHA_R, f81, f28 + } + { .mmf + LDFD f108 = [C6 ], SIZE + LDFD f109 = [C14], SIZE + FMA f29 = ALPHA_R, f83, f29 + } + ;; + { .mmf + STFD [C2 ] = f22, 5 * SIZE + STFD [C10] = f23, 5 * SIZE + FMA f30 = ALPHA_I, f81, f30 + } + { .mmf + LDFD f110 = [C6 ], -3 * SIZE + LDFD f111 = [C14], -3 * SIZE + FMA f31 = ALPHA_I, f83, f31 + } + ;; + { .mmf + STFD [C3 ] = f24, SIZE + STFD [C11] = f25, SIZE + FMA f68 = ALPHA_R, f88, f68 + } + { .mmf + LDFD f116 = [C7 ], SIZE + LDFD f117 = [C15], SIZE + FMA f69 = ALPHA_R, f90, f69 + } + ;; + { .mmf + STFD [C3 ] = f26, SIZE + STFD [C11] = f27, SIZE + FMA f70 = ALPHA_I, f88, f70 + } + { .mmf + LDFD f118 = [C7 ], SIZE + LDFD f119 = [C15], SIZE + FMA f71 = ALPHA_I, f90, f71 + } + ;; + { .mmf + STFD [C3 ] = f28, SIZE + STFD [C11] = f29, SIZE + FMA f76 = ALPHA_R, f89, f76 + } + { .mmf + LDFD f124 = [C7 ], SIZE + LDFD f125 = [C15], SIZE + FMA f77 = ALPHA_R, f91, f77 + } + ;; + { .mmf + STFD [C3 ] = f30, 5 * SIZE + STFD [C11] = f31, 5 * SIZE + FMA f78 = ALPHA_I, f89, f78 + } + { .mmf + LDFD f126 = [C7 ], -3 * SIZE + LDFD f127 = [C15], -3 * SIZE + FMA f79 = ALPHA_I, f91, f79 + } + ;; + { .mmf + STFD [C4 ] = f68, SIZE + STFD [C12] = f69, SIZE + FMA f84 = ALPHA_R, f96, f84 + } + { .mmf + LDFD f32 = [C8 ], SIZE + LDFD f33 = [C16], SIZE + FMA f85 = ALPHA_R, f98, f85 + } + ;; + { .mmf + STFD [C4 ] = f70, SIZE + STFD [C12] = f71, SIZE + FMA f86 = ALPHA_I, f96, f86 + } + { .mmf + LDFD f34 = [C8 ], SIZE + LDFD f35 = [C16], SIZE + FMA f87 = ALPHA_I, f98, f87 + } + ;; + { .mmf + STFD [C4 ] = f76, SIZE + STFD [C12] = f77, SIZE + FMA f92 = ALPHA_R, f97, f92 + } + { .mmf + LDFD f36 = [C8 ], SIZE + LDFD f37 = [C16], SIZE + FMA f93 = ALPHA_R, f99, f93 + } + ;; + { .mmf + STFD [C4 ] = f78, 5 * SIZE + STFD [C12] = f79, 5 * SIZE + FMA f94 = ALPHA_I, f97, f94 + } + { .mmf + LDFD f38 = [C8 ], -3 * SIZE + LDFD f39 = [C16], -3 * SIZE + FMA f95 = ALPHA_I, f99, f95 + } + ;; + { .mmf + STFD [C5 ] = f84, SIZE + STFD [C13] = f85, SIZE + FMA f100 = ALPHA_R, f104, f100 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f101 = ALPHA_R, f106, f101 + } + ;; + { .mmf + STFD [C5 ] = f86, SIZE + STFD [C13] = f87, SIZE + FMA f102 = ALPHA_I, f104, f102 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f103 = ALPHA_I, f106, f103 + } + ;; + { .mmf + STFD [C5 ] = f92, SIZE + STFD [C13] = 
f93, SIZE + FMA f108 = ALPHA_R, f105, f108 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f109 = ALPHA_R, f107, f109 + } + ;; + { .mmf + STFD [C5 ] = f94, 5 * SIZE + STFD [C13] = f95, 5 * SIZE + FMA f110 = ALPHA_I, f105, f110 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f111 = ALPHA_I, f107, f111 + } + ;; + { .mmf + STFD [C6 ] = f100, SIZE + STFD [C14] = f101, SIZE + FMA f116 = ALPHA_R, f112, f116 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f117 = ALPHA_R, f114, f117 + } + ;; + { .mmf + STFD [C6 ] = f102, SIZE + STFD [C14] = f103, SIZE + FMA f118 = ALPHA_I, f112, f118 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f119 = ALPHA_I, f114, f119 + } + ;; + { .mmf + STFD [C6 ] = f108, SIZE + STFD [C14] = f109, SIZE + FMA f124 = ALPHA_R, f113, f124 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f125 = ALPHA_R, f115, f125 + } + ;; + { .mmf + STFD [C6 ] = f110, 5 * SIZE + STFD [C14] = f111, 5 * SIZE + FMA f126 = ALPHA_I, f113, f126 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f127 = ALPHA_I, f115, f127 + } + ;; + { .mmf + STFD [C7 ] = f116, SIZE + STFD [C15] = f117, SIZE + FMA f32 = ALPHA_R, f120, f32 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f33 = ALPHA_R, f122, f33 + } + ;; + { .mmf + STFD [C7 ] = f118, SIZE + STFD [C15] = f119, SIZE + FMA f34 = ALPHA_I, f120, f34 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f35 = ALPHA_I, f122, f35 + } + ;; + { .mmf + STFD [C7 ] = f124, SIZE + STFD [C15] = f125, SIZE + FMA f36 = ALPHA_R, f121, f36 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f37 = ALPHA_R, f123, f37 + } + ;; + { .mmf + STFD [C7 ] = f126, 5 * SIZE + STFD [C15] = f127, 5 * SIZE + FMA f38 = ALPHA_I, f121, f38 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f39 = ALPHA_I, f123, f39 + } + ;; + { .mmf + STFD [C8 ] = f32, SIZE + STFD [C16] = f33, SIZE + mov f64 = f0 + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f72 = f0 + } + ;; + { .mmf + STFD [C8 ] = f34, SIZE + STFD [C16] = f35, SIZE + mov f80 = f0 + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f88 = f0 + } + ;; + { .mmf + STFD [C8 ] = f36, SIZE + STFD [C16] = f37, SIZE + mov f96 = f0 + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f104 = f0 + } + ;; + { .mmf + STFD [C8 ] = f38, 5 * SIZE + STFD [C16] = f39, 5 * SIZE + mov f112 = f0 + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f120 = f0 + } + ;; + .align 32 + +.L030: + { .mib + nop __LINE__ + tbit.z p6, p7 = M, 1 + (p6) br.cond.dptk .L040 + } + ;; + { .mfi + LDFPD f48, f49 = [B] + mov f65 = f0 + nop __LINE__ + } + { .mfi + adds BOFFSET = 2 * SIZE, B + mov f73 = f0 + adds L = 1, K + } + ;; + { .mfi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f81 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f89 = f0 + shr L = L, 1 + } + ;; + { .mfi + LDFPD f52, f53 = [BOFFSET], 2 * SIZE + mov f97 = f0 + adds L = -1, L + } + { .mfi + nop __LINE__ + mov f105 = f0 + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + } + ;; + { .mfi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + mov f113 = f0 + mov ar.lc = L + } + { .mfi + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + mov f121 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + .align 32 + +.L032: + { .mfb + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD 
f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f104 = f32, f53, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f120 = f32, f55, f120 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f89 = f33, f51, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f105 = f33, f53, f105 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f121 = f33, f55, f121 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f104 = f40, f61, f104 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f6 = [C1], SIZE + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + (p5) LDFD f12 = [C2], SIZE + (p3) FMA f120 = f40, f63, f120 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + { .mfb + (p5) LDFD f7 = [C1], SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + (p5) LDFD f13 = [C2], SIZE + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f10 = [C1], SIZE + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + (p5) LDFD f14 = [C2], SIZE + (p3) FMA f105 = f41, f61, f105 // A2 * B6 + nop __LINE__ + } + ;; + { .mfi + (p5) LDFD f11 = [C1], -3 * SIZE + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + adds L = -1, L + } + { .mfb + (p5) LDFD f15 = [C2], -3 * SIZE + (p3) FMA f121 = f41, f63, f121 // A2 * B8 + br.cloop.sptk.few .L032 + } + ;; + +.L038: + { .mmf + LDFD f16 = [C3], SIZE + LDFD f20 = [C4], SIZE + FMA f6 = ALPHA_R, f64, f6 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f12 = ALPHA_R, f72, f12 + } + ;; + { .mmf + LDFD f17 = [C3], SIZE + LDFD f21 = [C4], SIZE + FMA f7 = ALPHA_I, f64, f7 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f13 = ALPHA_I, f72, f13 + } + ;; + { .mmf + LDFD f18 = [C3], SIZE + LDFD f22 = [C4], SIZE + FMA f10 = ALPHA_R, f65, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f14 = ALPHA_R, f73, f14 + } + ;; + { .mmf + LDFD f19 = [C3], - 3 * SIZE + LDFD f23 = [C4], - 3 * SIZE + FMA f11 = ALPHA_I, f65, f11 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f15 = ALPHA_I, f73, f15 + } + ;; + { .mmf + STFD [C1] = f6, SIZE + STFD [C2] = f12, SIZE + FMA f16 
= ALPHA_R, f80, f16 + } + { .mmf + LDFD f24 = [C5], SIZE + LDFD f28 = [C6], SIZE + FMA f20 = ALPHA_R, f88, f20 + } + ;; + { .mmf + STFD [C1] = f7, SIZE + STFD [C2] = f13, SIZE + FMA f17 = ALPHA_I, f80, f17 + } + { .mmf + LDFD f25 = [C5], SIZE + LDFD f29 = [C6], SIZE + FMA f21 = ALPHA_I, f88, f21 + } + ;; + { .mmf + STFD [C1] = f10, SIZE + STFD [C2] = f14, SIZE + FMA f18 = ALPHA_R, f81, f18 + } + { .mmf + LDFD f26 = [C5], SIZE + LDFD f30 = [C6], SIZE + FMA f22 = ALPHA_R, f89, f22 + } + ;; + { .mmf + STFD [C1] = f11, SIZE + STFD [C2] = f15, SIZE + FMA f19 = ALPHA_I, f81, f19 + } + { .mmf + LDFD f27 = [C5], - 3 * SIZE + LDFD f31 = [C6], - 3 * SIZE + FMA f23 = ALPHA_I, f89, f23 + } + ;; + { .mmf + STFD [C3] = f16, SIZE + STFD [C4] = f20, SIZE + FMA f24 = ALPHA_R, f96, f24 + } + { .mmf + LDFD f32 = [C7], SIZE + LDFD f36 = [C8], SIZE + FMA f28 = ALPHA_R, f104, f28 + } + ;; + { .mmf + STFD [C3] = f17, SIZE + STFD [C4] = f21, SIZE + FMA f25 = ALPHA_I, f96, f25 + } + { .mmf + LDFD f33 = [C7], SIZE + LDFD f37 = [C8], SIZE + FMA f29 = ALPHA_I, f104, f29 + } + ;; + { .mmf + STFD [C3] = f18, SIZE + STFD [C4] = f22, SIZE + FMA f26 = ALPHA_R, f97, f26 + } + { .mmf + LDFD f34 = [C7], SIZE + LDFD f38 = [C8], SIZE + FMA f30 = ALPHA_R, f105, f30 + } + ;; + { .mmf + STFD [C3] = f19, SIZE + STFD [C4] = f23, SIZE + FMA f27 = ALPHA_I, f97, f27 + } + { .mmf + LDFD f35 = [C7], - 3 * SIZE + LDFD f39 = [C8], - 3 * SIZE + FMA f31 = ALPHA_I, f105, f31 + } + ;; + { .mmf + STFD [C5] = f24, SIZE + STFD [C6] = f28, SIZE + FMA f32 = ALPHA_R, f112, f32 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f36 = ALPHA_R, f120, f36 + } + ;; + { .mmf + STFD [C5] = f25, SIZE + STFD [C6] = f29, SIZE + FMA f33 = ALPHA_I, f112, f33 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f37 = ALPHA_I, f120, f37 + } + ;; + { .mmf + STFD [C5] = f26, SIZE + STFD [C6] = f30, SIZE + FMA f34 = ALPHA_R, f113, f34 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f38 = ALPHA_R, f121, f38 + } + ;; + { .mmf + STFD [C5] = f27, SIZE + STFD [C6] = f31, SIZE + FMA f35 = ALPHA_I, f113, f35 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f39 = ALPHA_I, f121, f39 + } + ;; + { .mmf + STFD [C7] = f32, SIZE + STFD [C8] = f36, SIZE + mov f64 = f0 + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f72 = f0 + } + ;; + { .mmf + STFD [C7] = f33, SIZE + STFD [C8] = f37, SIZE + mov f80 = f0 + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f88 = f0 + } + ;; + { .mmf + STFD [C7] = f34, SIZE + STFD [C8] = f38, SIZE + mov f96 = f0 + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f104 = f0 + } + ;; + { .mmf + STFD [C7] = f35, SIZE + STFD [C8] = f39, SIZE + mov f112 = f0 + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f120 = f0 + } + ;; + .align 32 + +.L040: + { .mib + nop __LINE__ + tbit.z p6, p7 = M, 0 + (p6) br.cond.dptk .L049 + } + ;; + { .mmi + LDFPD f48, f49 = [B] + adds BOFFSET = 2 * SIZE, B + adds L = 1, K + } + ;; + { .mii + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + LDFPD f52, f53 = [BOFFSET], 2 * SIZE + LDFD f32 = [AOFFSET], 1 * SIZE + adds L = -1, L + } + ;; + { .mmi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + cmp.eq p3, p0 = r0, r0 + mov ar.lc = L + } + { .mmi + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + nop __LINE__ + } + ;; + .align 32 + +.L042: + { .mfb + lfetch.nt1 [PREB], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p12) cmp.ne p3, p0 = 0, L + FMA f72 = f32, f49, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFD 
f40 = [AOFFSET], 1 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mmf + (p5) LDFD f6 = [C1], SIZE + (p5) LDFD f10 = [C2], SIZE + FMA f104 = f32, f53, f104 // A1 * B6 + } + ;; + { .mfi + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mmf + (p5) LDFD f7 = [C1], -SIZE + (p5) LDFD f11 = [C2], -SIZE + FMA f120 = f32, f55, f120 // A1 * B8 + } + ;; + { .mmf + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + (p4) LDFD f32 = [AOFFSET], 1 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + } + { .mmf + (p5) LDFD f12 = [C3], SIZE + (p5) LDFD f14 = [C4], SIZE + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + } + ;; + { .mfi + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mmf + (p5) LDFD f13 = [C3], -SIZE + (p5) LDFD f15 = [C4], -SIZE + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + } + ;; + { .mfi + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mmf + (p5) LDFD f16 = [C5], SIZE + (p5) LDFD f18 = [C6], SIZE + (p3) FMA f104 = f40, f61, f104 // A1 * B6 + } + ;; + { .mfi + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + adds L = -1, L + } + { .mmb + (p5) LDFD f17 = [C5], -SIZE + (p5) LDFD f19 = [C6], -SIZE + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f120 = f40, f63, f120 // A1 * B8 + nop __LINE__ + } + { .mmb + (p5) LDFD f20 = [C7], SIZE + (p5) LDFD f22 = [C8], SIZE + br.cloop.sptk.few .L042 + } + ;; + { .mmf + LDFD f21 = [C7], -SIZE + LDFD f23 = [C8], -SIZE + FMA f6 = ALPHA_R, f64, f6 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f10 = ALPHA_R, f72, f10 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + FMA f7 = ALPHA_I, f64, f7 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f11 = ALPHA_I, f72, f11 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + FMA f12 = ALPHA_R, f80, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f14 = ALPHA_R, f88, f14 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + FMA f13 = ALPHA_I, f80, f13 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f15 = ALPHA_I, f88, f15 + } + ;; + { .mmf + STFD [C1 ] = f6, SIZE + STFD [C2 ] = f10, SIZE + FMA f16 = ALPHA_R, f96, f16 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f18 = ALPHA_R, f104, f18 + } + ;; + { .mmf + STFD [C1 ] = f7, SIZE + STFD [C2 ] = f11, SIZE + FMA f17 = ALPHA_I, f96, f17 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f19 = ALPHA_I, f104, f19 + } + ;; + { .mmf + STFD [C3 ] = f12, SIZE + STFD [C4 ] = f14, SIZE + FMA f20 = ALPHA_R, f112, f20 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f22 = ALPHA_R, f120, f22 + } + ;; + { .mmf + STFD [C3 ] = f13, SIZE + STFD [C4 ] = f15, SIZE + FMA f21 = ALPHA_I, f112, f21 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f23 = ALPHA_I, f120, f23 + } + ;; + { .mmi + STFD [C5 ] = f16, SIZE + STFD [C6 ] = f18, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [C5 ] = f17, SIZE + STFD [C6 ] = f19, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [C7 ] = f20, SIZE + STFD [C8 ] = f22, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [C7 ] = f21, SIZE + STFD [C8 ] = f23, SIZE + nop __LINE__ + } + ;; + .align 32 + +.L049: + { .mmi + mov B = BOFFSET + mov AOFFSET = A + nop __LINE__ + } + ;; + { .mmb + nop 
__LINE__ + cmp.lt p6, p0 = 0, J + (p6) br.cond.dptk .L010 + } + ;; + .align 32 + +.L050: + { .mfi + mov C1 = C + mov f64 = f0 + tbit.z p6, p0 = N, 2 + } + { .mfi + add C2 = LDC, C + mov f72 = f0 + shr I = M, 3 + } + ;; + { .mfi + shladd C3 = LDC, 1, C + mov f80 = f0 + nop __LINE__ + } + { .mfb + mov AOFFSET = A + mov f88 = f0 + (p6) br.cond.dpnt .L090 + } + ;; + { .mfi + cmp.eq p6, p7 = 0, I + mov f65 = f0 + nop __LINE__ + } + { .mfi + shladd C4 = LDC, 1, C2 + mov f73 = f0 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + mov f81 = f0 + nop __LINE__ + } + { .mfb + shladd C = LDC, 2, C + mov f89 = f0 + (p6) br.cond.dpnt .L060 + } + ;; + .align 32 + +.L052: + { .mfb + LDFPD f48, f49 = [B] + mov f66 = f0 + nop __LINE__ + } + { .mfb + adds BOFFSET = 2 * SIZE, B + mov f74 = f0 + nop __LINE__ + } + ;; + { .mfi + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f82 = f0 + nop __LINE__ + } + { .mfi + setf.d f84 = r0 + mov f90 = f0 + nop __LINE__ + } + ;; + { .mfi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f67 = f0 + adds PREC = CPREFETCHSIZE * SIZE, C1 + } + { .mfi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + mov f75 = f0 + adds L = 1, K + } + ;; + { .mfi + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + mov f83 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + setf.d f91 = r0 + mov f68 = f0 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + } + ;; + { .mfi + CPREFETCH [PREC], LDC + mov f76 = f0 + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + } + { .mfi + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + mov f92 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + CPREFETCH [PREC], LDC + mov f69 = f0 + shr L = L, 1 + } + { .mmf + setf.d f77 = r0 + setf.d f85 = r0 + mov f93 = f0 + } + ;; + { .mfi + CPREFETCH [PREC], LDC + mov f70 = f0 + adds L = -1, L + } + { .mmf + setf.d f78 = r0 + setf.d f86 = r0 + mov f94 = f0 + } + ;; + { .mfi + CPREFETCH [PREC] + mov f71 = f0 + mov ar.lc = L + } + { .mmf + setf.d f79 = r0 + setf.d f87 = r0 + mov f95 = f0 + } + ;; + .align 32 + +.L053: + { .mfb + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + adds C9 = 4 * SIZE, C1 + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + adds C10 = 4 * SIZE, C2 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + adds C11 = 4 * SIZE, C3 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + adds C12 = 4 * SIZE, C4 + } + { .mfb + nop __LINE__ + FMA f89 = f33, f51, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f90 = f34, f51, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f91 = f35, f51, f91 // A4 * B4 + nop __LINE__ + } + 
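+/* Rows 5-8 of the 8x4 remainder block: A5-A8 (f36-f39) against B1-B4     */
+/* (f48-f51), accumulating into f68-f71, f76-f79, f84-f87 and f92-f95.    */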
;; + { .mfb + nop __LINE__ + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f76 = f36, f49, f76 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f84 = f36, f50, f84 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f92 = f36, f51, f92 // A5 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f77 = f37, f49, f77 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f85 = f37, f50, f85 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f93 = f37, f51, f93 // A6 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f78 = f38, f49, f78 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f86 = f38, f50, f86 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f94 = f38, f51, f94 // A7 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f79 = f39, f49, f79 // A8 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f87 = f39, f50, f87 // A8 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f95 = f39, f51, f95 // A8 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f6 = [C1 ], SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f7 = [C9 ], SIZE + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f10 = [C1 ], SIZE + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + (p5) LDFD f11 = [C9 ], SIZE + (p3) FMA f90 = f42, f59, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f12 = [C1 ], SIZE + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f13 = [C9 ], SIZE + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f14 = [C1 ], 5 * SIZE + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + (p5) LDFD f15 = [C9 ], 5 * SIZE + (p3) FMA f91 = f43, f59, f91 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f16 = [C1 ], SIZE + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f17 = [C9], SIZE + (p3) FMA f76 = f44, f57, f76 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f18 = [C1 ], SIZE + (p3) FMA f84 = f44, f58, f84 // A5 * B3 + nop __LINE__ + } + { .mfb + (p5) LDFD f19 = [C9], SIZE + (p3) FMA f92 = f44, f59, f92 // A5 * B4 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f20 = [C1 ], SIZE + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD 
f21 = [C9], SIZE + (p3) FMA f77 = f45, f57, f77 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f22 = [C1 ], -11 * SIZE + (p3) FMA f85 = f45, f58, f85 // A6 * B3 + nop __LINE__ + } + { .mfb + (p5) LDFD f23 = [C9 ], -11 * SIZE + (p3) FMA f93 = f45, f59, f93 // A6 * B4 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f24 = [C2 ], SIZE + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f25 = [C10], SIZE + (p3) FMA f78 = f46, f57, f78 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f26 = [C2 ], SIZE + (p3) FMA f86 = f46, f58, f86 // A7 * B3 + nop __LINE__ + } + { .mfb + (p5) LDFD f27 = [C10], SIZE + (p3) FMA f94 = f46, f59, f94 // A7 * B4 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f28 = [C2 ], SIZE + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f29 = [C10], SIZE + (p3) FMA f79 = f47, f57, f79 // A8 * B2 + nop __LINE__ + } + ;; + { .mfi + (p5) LDFD f30 = [C2 ], 5 * SIZE + (p3) FMA f87 = f47, f58, f87 // A8 * B3 + adds L = -1, L + } + { .mfb + (p5) LDFD f31 = [C10], 5 * SIZE + (p3) FMA f95 = f47, f59, f95 // A8 * B4 + br.cloop.sptk.few .L053 + } + ;; + .align 32 + +.L058: + { .mmf + LDFD f32 = [C2 ], SIZE + LDFD f33 = [C10], SIZE + FMA f6 = ALPHA_R, f64, f6 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f7 = ALPHA_R, f66, f7 + } + ;; + { .mmf + LDFD f34 = [C2 ], SIZE + LDFD f35 = [C10], SIZE + FMA f10 = ALPHA_I, f64, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f11 = ALPHA_I, f66, f11 + } + ;; + { .mmf + LDFD f36 = [C2 ], SIZE + LDFD f37 = [C10], SIZE + FMA f12 = ALPHA_R, f65, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f13 = ALPHA_R, f67, f13 + } + ;; + { .mmf + LDFD f38 = [C2 ], - 11 * SIZE + LDFD f39 = [C10], - 11 * SIZE + FMA f14 = ALPHA_I, f65, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f15 = ALPHA_I, f67, f15 + } + ;; + { .mmf + STFD [C1 ] = f6, SIZE + STFD [C9 ] = f7, SIZE + FMA f16 = ALPHA_R, f68, f16 + } + { .mmf + LDFD f48 = [C3 ], SIZE + LDFD f49 = [C11], SIZE + FMA f17 = ALPHA_R, f70, f17 + } + ;; + { .mmf + STFD [C1 ] = f10, SIZE + STFD [C9 ] = f11, SIZE + FMA f18 = ALPHA_I, f68, f18 + } + { .mmf + LDFD f50 = [C3 ], SIZE + LDFD f51 = [C11], SIZE + FMA f19 = ALPHA_I, f70, f19 + } + ;; + { .mmf + STFD [C1 ] = f12, SIZE + STFD [C9 ] = f13, SIZE + FMA f20 = ALPHA_R, f69, f20 + } + { .mmf + LDFD f52 = [C3 ], SIZE + LDFD f53 = [C11], SIZE + FMA f21 = ALPHA_R, f71, f21 + } + ;; + { .mmf + STFD [C1 ] = f14, 5 * SIZE + STFD [C9 ] = f15, 5 * SIZE + FMA f22 = ALPHA_I, f69, f22 + } + { .mmf + LDFD f54 = [C3 ], 5 * SIZE + LDFD f55 = [C11], 5 * SIZE + FMA f23 = ALPHA_I, f71, f23 + } + ;; + { .mmf + STFD [C1 ] = f16, SIZE + STFD [C9 ] = f17, SIZE + FMA f24 = ALPHA_R, f72, f24 + } + { .mmf + LDFD f40 = [C3 ], SIZE + LDFD f41 = [C11], SIZE + FMA f25 = ALPHA_R, f74, f25 + } + ;; + { .mmf + STFD [C1 ] = f18, SIZE + STFD [C9 ] = f19, SIZE + FMA f26 = ALPHA_I, f72, f26 + } + { .mmf + LDFD f42 = [C3 ], SIZE + LDFD f43 = [C11], SIZE + FMA f27 = ALPHA_I, f74, f27 + } + ;; + { .mmf + STFD [C1 ] = f20, SIZE + STFD [C9 ] = f21, SIZE + FMA f28 = ALPHA_R, f73, f28 + } + { .mmf + LDFD f44 = [C3 ], SIZE + LDFD f45 = [C11], SIZE + FMA f29 = ALPHA_R, f75, f29 + } + ;; + { .mmf + STFD [C1 ] = f22, 5 * SIZE + STFD [C9 ] = f23, 5 * SIZE + FMA f30 = ALPHA_I, f73, f30 + } + { .mmf + LDFD f46 = [C3 ], - 11 * SIZE + LDFD f56 = [C11], - 11 * SIZE + FMA f31 = ALPHA_I, f75, f31 + } + ;; + { .mmf + STFD [C2 ] = f24, SIZE + STFD [C10] = f25, SIZE + FMA f32 = ALPHA_R, f76, f32 + } + { .mmf + LDFD f57 = [C4 ], SIZE + LDFD 
f58 = [C12], SIZE + FMA f33 = ALPHA_R, f78, f33 + } + ;; + { .mmf + STFD [C2 ] = f26, SIZE + STFD [C10] = f27, SIZE + FMA f34 = ALPHA_I, f76, f34 + } + { .mmf + LDFD f59 = [C4 ], SIZE + LDFD f60 = [C12], SIZE + FMA f35 = ALPHA_I, f78, f35 + } + ;; + { .mmf + STFD [C2 ] = f28, SIZE + STFD [C10] = f29, SIZE + FMA f36 = ALPHA_R, f77, f36 + } + { .mmf + LDFD f61 = [C4 ], SIZE + LDFD f62 = [C12], SIZE + FMA f37 = ALPHA_R, f79, f37 + } + ;; + { .mmf + STFD [C2 ] = f30, 5 * SIZE + STFD [C10] = f31, 5 * SIZE + FMA f38 = ALPHA_I, f77, f38 + } + { .mmf + LDFD f63 = [C4 ], 5 * SIZE + LDFD f47 = [C12], 5 * SIZE + FMA f39 = ALPHA_I, f79, f39 + } + ;; + { .mmf + STFD [C2 ] = f32, SIZE + STFD [C10] = f33, SIZE + FMA f48 = ALPHA_R, f80, f48 + } + { .mmf + LDFD f64 = [C4 ], SIZE + LDFD f65 = [C12], SIZE + FMA f49 = ALPHA_R, f82, f49 + } + ;; + { .mmf + STFD [C2 ] = f34, SIZE + STFD [C10] = f35, SIZE + FMA f50 = ALPHA_I, f80, f50 + } + { .mmf + LDFD f6 = [C4 ], SIZE + LDFD f7 = [C12], SIZE + FMA f51 = ALPHA_I, f82, f51 + } + ;; + { .mmf + STFD [C2 ] = f36, SIZE + STFD [C10] = f37, SIZE + FMA f52 = ALPHA_R, f81, f52 + } + { .mmf + LDFD f10 = [C4 ], SIZE + LDFD f11 = [C12], SIZE + FMA f53 = ALPHA_R, f83, f53 + } + ;; + { .mmf + STFD [C2 ] = f38, 5 * SIZE + STFD [C10] = f39, 5 * SIZE + FMA f54 = ALPHA_I, f81, f54 + } + { .mmf + LDFD f12 = [C4 ], - 11 * SIZE + LDFD f13 = [C12], - 11 * SIZE + FMA f55 = ALPHA_I, f83, f55 + } + ;; + { .mmf + STFD [C3 ] = f48, SIZE + STFD [C11] = f49, SIZE + FMA f40 = ALPHA_R, f84, f40 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f41 = ALPHA_R, f86, f41 + } + ;; + { .mmf + STFD [C3 ] = f50, SIZE + STFD [C11] = f51, SIZE + FMA f42 = ALPHA_I, f84, f42 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f43 = ALPHA_I, f86, f43 + } + ;; + { .mmf + STFD [C3 ] = f52, SIZE + STFD [C11] = f53, SIZE + FMA f44 = ALPHA_R, f85, f44 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f45 = ALPHA_R, f87, f45 + } + ;; + { .mmf + STFD [C3 ] = f54, 5 * SIZE + STFD [C11] = f55, 5 * SIZE + FMA f46 = ALPHA_I, f85, f46 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f56 = ALPHA_I, f87, f56 + } + ;; + { .mmf + STFD [C3 ] = f40, SIZE + STFD [C11] = f41, SIZE + FMA f57 = ALPHA_R, f88, f57 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f58 = ALPHA_R, f90, f58 + } + ;; + { .mmf + STFD [C3 ] = f42, SIZE + STFD [C11] = f43, SIZE + FMA f59 = ALPHA_I, f88, f59 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f60 = ALPHA_I, f90, f60 + } + ;; + { .mmf + STFD [C3 ] = f44, SIZE + STFD [C11] = f45, SIZE + FMA f61 = ALPHA_R, f89, f61 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f62 = ALPHA_R, f91, f62 + } + ;; + { .mmf + STFD [C3 ] = f46, 5 * SIZE + STFD [C11] = f56, 5 * SIZE + FMA f63 = ALPHA_I, f89, f63 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f47 = ALPHA_I, f91, f47 + } + ;; + { .mmf + STFD [C4 ] = f57, SIZE + STFD [C12] = f58, SIZE + FMA f64 = ALPHA_R, f92, f64 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f65 = ALPHA_R, f94, f65 + } + ;; + { .mmf + STFD [C4 ] = f59, SIZE + STFD [C12] = f60, SIZE + FMA f6 = ALPHA_I, f92, f6 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f7 = ALPHA_I, f94, f7 + } + ;; + { .mmf + STFD [C4 ] = f61, SIZE + STFD [C12] = f62, SIZE + FMA f10 = ALPHA_R, f93, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f11 = ALPHA_R, f95, f11 + } + ;; + { .mmf + STFD [C4 ] = f63, 5 * SIZE + STFD [C12] = f47, 5 * SIZE + FMA f12 = ALPHA_I, f93, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f13 = ALPHA_I, f95, f13 + } + ;; + { .mmf + STFD [C4 ] = f64, SIZE + STFD [C12] = 
f65, SIZE + mov f64 = f0 + } + { .mmf + cmp.ne p6, p0 = 1, I + nop __LINE__ + mov f72 = f0 + } + ;; + { .mmf + STFD [C4 ] = f6, SIZE + STFD [C12] = f7, SIZE + mov f80 = f0 + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f88 = f0 + } + ;; + { .mmf + STFD [C4 ] = f10, SIZE + STFD [C12] = f11, SIZE + mov f65 = f0 + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f73 = f0 + } + ;; + { .mmf + STFD [C4 ] = f12, 5 * SIZE + STFD [C12] = f13, 5 * SIZE + mov f81 = f0 + } + { .mfb + adds I = -1, I + mov f89 = f0 + (p6) br.cond.dptk .L052 + } + ;; + .align 32 + +.L060: + { .mfi + nop __LINE__ + mov f66 = f0 + tbit.z p6, p7 = M, 2 + } + { .mfb + nop __LINE__ + mov f74 = f0 + (p6) br.cond.dptk .L070 + } + ;; + { .mfb + LDFPD f48, f49 = [B] + mov f82 = f0 + nop __LINE__ + } + { .mfi + adds BOFFSET = 2 * SIZE, B + mov f90 = f0 + adds L = 1, K + } + ;; + { .mii + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mfi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + mov f67 = f0 + adds L = -1, L + } + { .mfi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + mov f75 = f0 + nop __LINE__ + } + ;; + { .mfi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f83 = f0 + mov ar.lc = L + } + { .mfi + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + mov f91 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + .align 32 + +.L062: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + (p5) adds C9 = 4 * SIZE, C1 + } + { .mfi + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + (p5) adds C10 = 4 * SIZE, C2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + (p5) adds C11 = 4 * SIZE, C3 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + (p5) adds C12 = 4 * SIZE, C4 + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f89 = f33, f51, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f90 = f34, f51, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + } + { .mfb + (p5) LDFD f6 = [C1 ], SIZE + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + (p5) LDFD f7 = [C9 ], SIZE + FMA f91 = f35, f51, f91 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f10 = [C1 ], SIZE + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + (p5) LDFD f11 = [C9 ], SIZE + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f12 = [C1 ], SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f13 = [C9], SIZE + (p3) FMA f73 = f41, 
f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f14 = [C1 ], - 3 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + (p5) LDFD f15 = [C9], - 3 * SIZE + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f16 = [C2 ], SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f17 = [C10], SIZE + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f18 = [C2 ], SIZE + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + (p5) LDFD f19 = [C10], SIZE + (p3) FMA f90 = f42, f59, f90 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f20 = [C2 ], SIZE + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f21 = [C10], SIZE + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfi + (p5) LDFD f22 = [C2 ], -3 * SIZE + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + adds L = -1, L + } + { .mfb + (p5) LDFD f23 = [C10], -3 * SIZE + (p3) FMA f91 = f43, f59, f91 // A4 * B4 + br.cloop.sptk.few .L062 + } + ;; + { .mmf + LDFD f24 = [C3 ], SIZE + LDFD f25 = [C11], SIZE + FMA f6 = ALPHA_R, f64, f6 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f7 = ALPHA_R, f66, f7 + } + ;; + { .mmf + LDFD f26 = [C3 ], SIZE + LDFD f27 = [C11], SIZE + FMA f10 = ALPHA_I, f64, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f11 = ALPHA_I, f66, f11 + } + ;; + { .mmf + LDFD f28 = [C3 ], SIZE + LDFD f29 = [C11], SIZE + FMA f12 = ALPHA_R, f65, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f13 = ALPHA_R, f67, f13 + } + ;; + { .mmf + LDFD f30 = [C3 ], - 3 * SIZE + LDFD f31 = [C11], - 3 * SIZE + FMA f14 = ALPHA_I, f65, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f15 = ALPHA_I, f67, f15 + } + ;; + { .mmf + STFD [C1 ] = f6, SIZE + STFD [C9 ] = f7, SIZE + FMA f16 = ALPHA_R, f72, f16 + } + { .mmf + LDFD f32 = [C4 ], SIZE + LDFD f33 = [C12], SIZE + FMA f17 = ALPHA_R, f74, f17 + } + ;; + { .mmf + STFD [C1 ] = f10, SIZE + STFD [C9 ] = f11, SIZE + FMA f18 = ALPHA_I, f72, f18 + } + { .mmf + LDFD f34 = [C4 ], SIZE + LDFD f35 = [C12], SIZE + FMA f19 = ALPHA_I, f74, f19 + } + ;; + { .mmf + STFD [C1 ] = f12, SIZE + STFD [C9 ] = f13, SIZE + FMA f20 = ALPHA_R, f73, f20 + } + { .mmf + LDFD f36 = [C4 ], SIZE + LDFD f37 = [C12], SIZE + FMA f21 = ALPHA_R, f75, f21 + } + ;; + { .mmf + STFD [C1 ] = f14, 5 * SIZE + STFD [C9 ] = f15, 5 * SIZE + FMA f22 = ALPHA_I, f73, f22 + } + { .mmf + LDFD f38 = [C4 ], - 3 * SIZE + LDFD f39 = [C12], - 3 * SIZE + FMA f23 = ALPHA_I, f75, f23 + } + ;; + { .mmf + STFD [C2 ] = f16, SIZE + STFD [C10] = f17, SIZE + FMA f24 = ALPHA_R, f80, f24 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f25 = ALPHA_R, f82, f25 + } + ;; + { .mmf + STFD [C2 ] = f18, SIZE + STFD [C10] = f19, SIZE + FMA f26 = ALPHA_I, f80, f26 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f27 = ALPHA_I, f82, f27 + } + ;; + { .mmf + STFD [C2 ] = f20, SIZE + STFD [C10] = f21, SIZE + FMA f28 = ALPHA_R, f81, f28 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f29 = ALPHA_R, f83, f29 + } + ;; + { .mmf + STFD [C2 ] = f22, 5 * SIZE + STFD [C10] = f23, 5 * SIZE + FMA f30 = ALPHA_I, f81, f30 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f31 = ALPHA_I, f83, f31 + } + ;; + { .mmf + STFD [C3 ] = f24, SIZE + STFD [C11] = f25, SIZE + FMA f32 = ALPHA_R, f88, f32 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f33 = ALPHA_R, f90, f33 + } + ;; + { .mmf + STFD [C3 ] = f26, SIZE + STFD [C11] = f27, SIZE + FMA f34 = ALPHA_I, f88, f34 + } + { .mmf + nop 
__LINE__ + nop __LINE__ + FMA f35 = ALPHA_I, f90, f35 + } + ;; + { .mmf + STFD [C3 ] = f28, SIZE + STFD [C11] = f29, SIZE + FMA f36 = ALPHA_R, f89, f36 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f37 = ALPHA_R, f91, f37 + } + ;; + { .mmf + STFD [C3 ] = f30, 5 * SIZE + STFD [C11] = f31, 5 * SIZE + FMA f38 = ALPHA_I, f89, f38 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f39 = ALPHA_I, f91, f39 + } + ;; + { .mmf + STFD [C4 ] = f32, SIZE + STFD [C12] = f33, SIZE + mov f64 = f0 + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f72 = f0 + } + ;; + { .mmf + STFD [C4 ] = f34, SIZE + STFD [C12] = f35, SIZE + mov f80 = f0 + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f88 = f0 + } + ;; + { .mmf + STFD [C4 ] = f36, SIZE + STFD [C12] = f37, SIZE + mov f81 = f0 + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f65 = f0 + } + ;; + { .mmf + STFD [C4 ] = f38, 5 * SIZE + STFD [C12] = f39, 5 * SIZE + mov f89 = f0 + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f73 = f0 + } + ;; + .align 32 + +.L070: + { .mib + nop __LINE__ + tbit.z p6,p7 = M, 1 + (p6) br.cond.dptk .L080 + } + ;; + { .mmi + LDFPD f48, f49 = [B] + adds BOFFSET = 2 * SIZE, B + adds L = 1, K + } + ;; + { .mii + cmp.eq p3, p0 = r0, r0 + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + adds L = -1, L + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + } + ;; + { .mmi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + mov ar.lc = L + } + ;; + .align 32 + +.L072: + { .mfb + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + } + { .mmf + (p5) LDFD f6 = [C1 ], SIZE + (p5) LDFD f12 = [C2 ], SIZE + FMA f89 = f33, f51, f89 // A2 * B4 + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mmf + (p5) LDFD f7 = [C1 ], SIZE + (p5) LDFD f13 = [C2 ], SIZE + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mmf + (p5) LDFD f10 = [C1 ], SIZE + (p5) LDFD f14 = [C2 ], SIZE + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f11 = [C1 ], - 3 * SIZE + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + adds L = -1, L + } + { .mfb + (p5) LDFD f15 = [C2 ], - 3 * SIZE + (p3) FMA f89 = f41, f59, f89 // A2 * B4 + br.cloop.sptk.few .L072 + } + ;; + { .mmf + LDFD f16 = [C3], SIZE + LDFD f20 = [C4], SIZE + FMA f6 = ALPHA_R, f64, f6 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f12 = ALPHA_R, f72, f12 + } + ;; + { .mmf + LDFD f17 = [C3], SIZE + LDFD f21 = [C4], SIZE + FMA f7 = ALPHA_I, f64, f7 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f13 = ALPHA_I, f72, f13 + } + ;; + { .mmf 
+ LDFD f18 = [C3], SIZE + LDFD f22 = [C4], SIZE + FMA f10 = ALPHA_R, f65, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f14 = ALPHA_R, f73, f14 + } + ;; + { .mmf + LDFD f19 = [C3], - 3 * SIZE + LDFD f23 = [C4], - 3 * SIZE + FMA f11 = ALPHA_I, f65, f11 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f15 = ALPHA_I, f73, f15 + } + ;; + { .mmf + STFD [C1] = f6, SIZE + STFD [C2] = f12, SIZE + FMA f16 = ALPHA_R, f80, f16 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f20 = ALPHA_R, f88, f20 + } + ;; + { .mmf + STFD [C1] = f7, SIZE + STFD [C2] = f13, SIZE + FMA f17 = ALPHA_I, f80, f17 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f21 = ALPHA_I, f88, f21 + } + ;; + { .mmf + STFD [C1] = f10, SIZE + STFD [C2] = f14, SIZE + FMA f18 = ALPHA_R, f81, f18 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f22 = ALPHA_R, f89, f22 + } + ;; + { .mmf + STFD [C1] = f11, SIZE + STFD [C2] = f15, SIZE + FMA f19 = ALPHA_I, f81, f19 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f23 = ALPHA_I, f89, f23 + } + ;; + { .mmf + STFD [C3] = f16, SIZE + STFD [C4] = f20, SIZE + mov f64 = f0 + } + ;; + { .mmf + STFD [C3] = f17, SIZE + STFD [C4] = f21, SIZE + mov f72 = f0 + } + ;; + { .mmf + STFD [C3] = f18, SIZE + STFD [C4] = f22, SIZE + mov f80 = f0 + } + ;; + { .mmf + STFD [C3] = f19, SIZE + STFD [C4] = f23, SIZE + mov f88 = f0 + } + ;; + .align 32 + +.L080: + { .mib + nop __LINE__ + tbit.z p6,p7 = M, 0 + (p6) br.cond.dptk .L089 + } + ;; + { .mmi + LDFPD f48, f49 = [B] + adds BOFFSET = 2 * SIZE, B + adds L = 1, K + } + ;; + { .mii + LDFD f32 = [AOFFSET], 1 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + nop __LINE__ + nop __LINE__ + adds L = -1, L + } + ;; + { .mmi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + cmp.eq p3, p0 = r0, r0 + mov ar.lc = L + } + ;; + .align 32 + +.L082: + { .mfb + cmp.ne p4, p5 = 0, L + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + (p12) cmp.ne p3, p0 = 0, L + FMA f72 = f32, f49, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + (p3) LDFD f40 = [AOFFSET], 1 * SIZE + FMA f88 = f32, f51, f88 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mmf + (p5) LDFD f6 = [C1], SIZE + (p5) LDFD f10 = [C2], SIZE + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + } + ;; + { .mmf + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p4) LDFD f32 = [AOFFSET], 1 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + } + { .mmf + (p5) LDFD f7 = [C1], -SIZE + (p5) LDFD f11 = [C2], -SIZE + (p3) FMA f88 = f40, f59, f88 // A1 * B4 + } + ;; + { .mib + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + adds L = -1, L + br.cloop.sptk.few .L082 + } + ;; + { .mmf + LDFD f12 = [C3], SIZE + LDFD f14 = [C4], SIZE + FMA f6 = ALPHA_R, f64, f6 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f10 = ALPHA_R, f72, f10 + } + ;; + { .mmf + LDFD f13 = [C3], -SIZE + LDFD f15 = [C4], -SIZE + FMA f7 = ALPHA_I, f64, f7 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f11 = ALPHA_I, f72, f11 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + FMA f12 = ALPHA_R, f80, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f14 = ALPHA_R, f88, f14 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + FMA f13 = ALPHA_I, f80, f13 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f15 = ALPHA_I, f88, f15 + } + ;; + { .mmi + STFD [C1] = f6, SIZE + STFD [C2] = f10, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [C1] = f7, SIZE + 
STFD [C2] = f11, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [C3] = f12, SIZE + STFD [C4] = f14, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [C3] = f13, SIZE + STFD [C4] = f15, SIZE + nop __LINE__ + } + ;; + .align 32 + +.L089: + { .mmi + mov B = BOFFSET + mov AOFFSET = A + nop __LINE__ + } + ;; + .align 16 + +.L090: + { .mfi + mov C1 = C + mov f64 = f0 + tbit.z p6, p0 = N, 1 + } + { .mfi + add C2 = LDC, C + mov f72 = f0 + shr I = M, 3 + } + ;; + { .mfi + setf.d f66 = r0 + mov f65 = f0 + nop __LINE__ + } + { .mfb + mov AOFFSET = A + mov f73 = f0 + (p6) br.cond.dpnt .L130 + } + ;; + { .mfi + nop __LINE__ + mov f67 = f0 + shladd C = LDC, 1, C + } + { .mfb + cmp.eq p6, p7 = 0, I + mov f74 = f0 + (p6) br.cond.dpnt .L100 + } + ;; + .align 32 + +.L092: + { .mfb + LDFPD f48, f49 = [B] + mov f68 = f0 + nop __LINE__ + } + { .mfb + adds BOFFSET = 2 * SIZE, B + mov f79 = f0 + nop __LINE__ + } + ;; + { .mfi + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f75 = f0 + nop __LINE__ + } + ;; + { .mfi + adds PREC = CPREFETCHSIZE * SIZE, C1 + mov f76 = f0 + adds L = 1, K + } + ;; + { .mfi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + mov f69 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + cmp.eq p3, p0 = r0, r0 + mov f77 = f0 + shr L = L, 1 + } + ;; + { .mfi + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + adds L = -1, L + } + { .mmf + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + CPREFETCH [PREC], LDC + mov f70 = f0 + } + ;; + { .mfi + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + mov f78 = f0 + mov ar.lc = L + } + { .mfi + CPREFETCH [PREC] + mov f71 = f0 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + } + ;; + .align 32 + +.L093: +/* 1 */ + { .mfi + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 4 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + adds C9 = 4 * SIZE, C1 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + adds C10 = 4 * SIZE, C2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + adds C11 = 4 * SIZE, C3 + } + { .mfi + nop __LINE__ + FMA f74 = f34, f49, f74 // A3 * B2 + adds C12 = 4 * SIZE, C4 + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f6 = [C1 ], SIZE + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f7 = [C9 ], SIZE + FMA f76 = f36, f49, f76 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f10 = [C1 ], SIZE + FMA f77 = f37, f49, f77 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f11 = [C9 ], SIZE + FMA f78 = f38, f49, f78 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f12 = [C1 ], SIZE + FMA f79 = f39, f49, f79 // A8 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f13 = [C9 ], SIZE + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = 
[AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f14 = [C1 ], 5 * SIZE + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f15 = [C9 ], 5 * SIZE + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f16 = [C1 ], SIZE + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f17 = [C9 ], SIZE + (p3) FMA f76 = f44, f57, f76 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f18 = [C1 ], SIZE + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f19 = [C9 ], SIZE + (p3) FMA f77 = f45, f57, f77 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + (p5) LDFD f20 = [C1 ], SIZE + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f21 = [C9 ], SIZE + (p3) FMA f78 = f46, f57, f78 // A7 * B2 + nop __LINE__ + } + ;; + { .mfi + (p5) LDFD f22 = [C1 ], -11 * SIZE + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + adds L = -1, L + } + { .mfb + (p5) LDFD f23 = [C9 ], -11 * SIZE + (p3) FMA f79 = f47, f57, f79 // A8 * B2 + br.cloop.sptk.few .L093 + } + ;; + { .mmf + LDFD f24 = [C2 ], SIZE + LDFD f25 = [C10], SIZE + FMA f6 = ALPHA_R, f64, f6 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f7 = ALPHA_R, f66, f7 + } + ;; + { .mmf + LDFD f26 = [C2 ], SIZE + LDFD f27 = [C10], SIZE + FMA f10 = ALPHA_I, f64, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f11 = ALPHA_I, f66, f11 + } + ;; + { .mmf + LDFD f28 = [C2 ], SIZE + LDFD f29 = [C10], SIZE + FMA f12 = ALPHA_R, f65, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f13 = ALPHA_R, f67, f13 + } + ;; + { .mmf + LDFD f30 = [C2 ], 5 * SIZE + LDFD f31 = [C10], 5 * SIZE + FMA f14 = ALPHA_I, f65, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f15 = ALPHA_I, f67, f15 + } + ;; + { .mmf + STFD [C1 ] = f6, SIZE + STFD [C9 ] = f7, SIZE + FMA f16 = ALPHA_R, f68, f16 + } + { .mmf + LDFD f32 = [C2 ], SIZE + LDFD f33 = [C10], SIZE + FMA f17 = ALPHA_R, f70, f17 + } + ;; + { .mmf + STFD [C1 ] = f10, SIZE + STFD [C9 ] = f11, SIZE + FMA f18 = ALPHA_I, f68, f18 + } + { .mmf + LDFD f34 = [C2 ], SIZE + LDFD f35 = [C10], SIZE + FMA f19 = ALPHA_I, f70, f19 + } + ;; + { .mmf + STFD [C1 ] = f12, SIZE + STFD [C9 ] = f13, SIZE + FMA f20 = ALPHA_R, f69, f20 + } + { .mmf + LDFD f36 = [C2 ], SIZE + LDFD f37 = [C10], SIZE + FMA f21 = ALPHA_R, f71, f21 + } + ;; + { .mmf + STFD [C1 ] = f14, 5 * SIZE + STFD [C9 ] = f15, 5 * SIZE + FMA f22 = ALPHA_I, f69, f22 + } + { .mmf + LDFD f38 = [C2 ], - 11 * SIZE + LDFD f39 = [C10], - 11 * SIZE + FMA f23 = ALPHA_I, f71, f23 + } + ;; + { .mmf + STFD [C1 ] = f16, SIZE + STFD [C9 ] = f17, SIZE + FMA f24 = ALPHA_R, f72, f24 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f25 = ALPHA_R, f74, f25 + } + ;; + { .mmf + STFD [C1 ] = f18, SIZE + STFD [C9 ] = f19, SIZE + FMA f26 = ALPHA_I, f72, f26 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f27 = ALPHA_I, f74, f27 + } + ;; + { .mmf + STFD [C1 ] = f20, SIZE + STFD [C9 ] = f21, SIZE + FMA f28 = ALPHA_R, f73, f28 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f29 = ALPHA_R, f75, f29 + } + ;; + { .mmf + STFD [C1 ] = f22, 5 * SIZE + STFD [C9 ] = f23, 5 * SIZE + FMA f30 = ALPHA_I, f73, f30 + } + { 
.mmf + nop __LINE__ + nop __LINE__ + FMA f31 = ALPHA_I, f75, f31 + } + ;; + { .mmf + STFD [C2 ] = f24, SIZE + STFD [C10] = f25, SIZE + FMA f32 = ALPHA_R, f76, f32 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f33 = ALPHA_R, f78, f33 + } + ;; + { .mmf + STFD [C2 ] = f26, SIZE + STFD [C10] = f27, SIZE + FMA f34 = ALPHA_I, f76, f34 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f35 = ALPHA_I, f78, f35 + } + ;; + { .mmf + STFD [C2 ] = f28, SIZE + STFD [C10] = f29, SIZE + FMA f36 = ALPHA_R, f77, f36 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f37 = ALPHA_R, f79, f37 + } + ;; + { .mmf + STFD [C2 ] = f30, 5 * SIZE + STFD [C10] = f31, 5 * SIZE + FMA f38 = ALPHA_I, f77, f38 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f39 = ALPHA_I, f79, f39 + } + ;; + { .mmf + STFD [C2 ] = f32, SIZE + STFD [C10] = f33, SIZE + mov f64 = f0 + } + { .mmf + cmp.ne p6, p0 = 1, I + nop __LINE__ + mov f72 = f0 + } + ;; + { .mmf + STFD [C2 ] = f34, SIZE + STFD [C10] = f35, SIZE + mov f65 = f0 + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f73 = f0 + } + ;; + { .mmf + STFD [C2 ] = f36, SIZE + STFD [C10] = f37, SIZE + mov f66 = f0 + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f74 = f0 + } + ;; + { .mmf + STFD [C2 ] = f38, 5 * SIZE + STFD [C10] = f39, 5 * SIZE + mov f67 = f0 + } + { .mfb + adds I = -1, I + mov f75 = f0 + (p6) br.cond.dptk .L092 + } + ;; + .align 32 + +.L100: + { .mib + nop __LINE__ + tbit.z p6, p7 = M, 2 + (p6) br.cond.dptk .L110 + } + ;; + { .mmf + LDFPD f48, f49 = [B] + adds BOFFSET = 2 * SIZE, B + mov f75 = f0 + } + { .mii + nop __LINE__ + adds L = 1, K + } + ;; + { .mii + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + nop __LINE__ + adds L = -1, L + } + ;; + { .mmi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + cmp.eq p3, p0 = r0, r0 + mov ar.lc = L + } + ;; + .align 32 + +.L102: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 4 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + adds C9 = 4 * SIZE, C1 + } + { .mfi + nop __LINE__ + FMA f73 = f33, f49, f73 // A2 * B2 + adds C10 = 4 * SIZE, C2 + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f6 = [C1 ], SIZE + FMA f74 = f34, f49, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f7 = [C9 ], SIZE + FMA f75 = f35, f49, f75 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f10 = [C1 ], SIZE + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f11 = [C9 ], SIZE + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f12 = [C1], SIZE + (p3) FMA f74 = f42, f57, f74 // A3 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + adds L = -1, L + } + { .mfb + (p5) LDFD 
f13 = [C9], SIZE + (p3) FMA f75 = f43, f57, f75 // A4 * B2 + br.cloop.sptk.few .L102 + } + ;; + { .mmf + LDFD f14 = [C1], - 3 * SIZE + LDFD f15 = [C9], - 3 * SIZE + FMA f6 = ALPHA_R, f64, f6 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f7 = ALPHA_R, f66, f7 + } + ;; + { .mmf + LDFD f16 = [C2 ], SIZE + LDFD f17 = [C10], SIZE + FMA f10 = ALPHA_I, f64, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f11 = ALPHA_I, f66, f11 + } + ;; + { .mmf + LDFD f18 = [C2 ], SIZE + LDFD f19 = [C10], SIZE + FMA f12 = ALPHA_R, f65, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f13 = ALPHA_R, f67, f13 + } + ;; + { .mmf + LDFD f20 = [C2 ], SIZE + LDFD f21 = [C10], SIZE + FMA f14 = ALPHA_I, f65, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f15 = ALPHA_I, f67, f15 + } + ;; + { .mmf + STFD [C1 ] = f6, SIZE + STFD [C9 ] = f7, SIZE + FMA f16 = ALPHA_R, f72, f16 + } + { .mmf + LDFD f22 = [C2 ], - 3 * SIZE + LDFD f23 = [C10], - 3 * SIZE + FMA f17 = ALPHA_R, f74, f17 + } + ;; + { .mmf + STFD [C1 ] = f10, SIZE + STFD [C9 ] = f11, SIZE + FMA f18 = ALPHA_I, f72, f18 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f19 = ALPHA_I, f74, f19 + } + ;; + { .mmf + STFD [C1 ] = f12, SIZE + STFD [C9 ] = f13, SIZE + FMA f20 = ALPHA_R, f73, f20 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f21 = ALPHA_R, f75, f21 + } + ;; + { .mmf + STFD [C1 ] = f14, 5 * SIZE + STFD [C9 ] = f15, 5 * SIZE + FMA f22 = ALPHA_I, f73, f22 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f23 = ALPHA_I, f75, f23 + } + ;; + { .mmf + STFD [C2 ] = f16, SIZE + STFD [C10] = f17, SIZE + mov f64 = f0 + } + ;; + { .mmf + STFD [C2 ] = f18, SIZE + STFD [C10] = f19, SIZE + mov f65 = f0 + } + ;; + { .mmf + STFD [C2 ] = f20, SIZE + STFD [C10] = f21, SIZE + mov f72 = f0 + } + ;; + { .mmf + STFD [C2 ] = f22, 5 * SIZE + STFD [C10] = f23, 5 * SIZE + mov f73 = f0 + } + ;; + .align 32 + +.L110: + { .mib + nop __LINE__ + tbit.z p6, p7 = M, 1 + (p6) br.cond.dptk .L120 + } + ;; + { .mmi + LDFPD f48, f49 = [B] + adds BOFFSET = 2 * SIZE, B + adds L = 1, K + } + ;; + { .mii + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + nop __LINE__ + adds L = -1, L + } + ;; + { .mmi + cmp.eq p3, p0 = r0, r0 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + mov ar.lc = L + } + ;; + .align 32 + +.L112: + { .mfi + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + lfetch.nt1 [PREB], 4 * SIZE + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mmf + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + } + { .mmf + (p5) LDFD f6 = [C1 ], SIZE + (p5) LDFD f7 = [C2 ], SIZE + FMA f73 = f33, f49, f73 // A2 * B2 + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f10 = [C1 ], SIZE + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + adds L = -1, L + } + { .mfb + (p5) LDFD f11 = [C2 ], SIZE + (p3) FMA f73 = f41, f57, f73 // A2 * B2 + br.cloop.sptk.few .L112 + } + ;; + { .mmf + LDFD f12 = [C1], SIZE + LDFD f13 = [C2], SIZE + FMA f6 = ALPHA_R, f64, f6 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f7 = ALPHA_R, f72, f7 + } + ;; + { .mmf + LDFD f14 = [C1], - 3 * SIZE + LDFD f15 = [C2], - 3 * SIZE + FMA f10 = ALPHA_I, f64, f10 + } + { 
.mmf + nop __LINE__ + nop __LINE__ + FMA f11 = ALPHA_I, f72, f11 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + FMA f12 = ALPHA_R, f65, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f13 = ALPHA_R, f73, f13 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + FMA f14 = ALPHA_I, f65, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f15 = ALPHA_I, f73, f15 + } + ;; + { .mmf + STFD [C1] = f6, SIZE + STFD [C2] = f7, SIZE + mov f64 = f0 + } + ;; + { .mmf + STFD [C1] = f10, SIZE + STFD [C2] = f11, SIZE + mov f72 = f0 + } + ;; + { .mmf + STFD [C1] = f12, SIZE + STFD [C2] = f13, SIZE + mov f65 = f0 + } + ;; + { .mmf + STFD [C1] = f14, SIZE + STFD [C2] = f15, SIZE + mov f73 = f0 + } + ;; + .align 32 + +.L120: + { .mib + nop __LINE__ + tbit.z p6, p7 = M, 0 + (p6) br.cond.dptk .L129 + } + ;; + { .mmi + LDFPD f48, f49 = [B] + adds BOFFSET = 2 * SIZE, B + adds L = 1, K + } + ;; + { .mii + nop __LINE__ + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + LDFD f32 = [AOFFSET], 1 * SIZE + nop __LINE__ + adds L = -1, L + } + ;; + { .mmi + cmp.eq p3, p0 = r0, r0 + nop __LINE__ + mov ar.lc = L + } + ;; + .align 32 + +.L122: + { .mfi + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f72 = f32, f49, f72 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mmi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + (p3) LDFD f40 = [AOFFSET], 1 * SIZE + nop __LINE__ + } + { .mmi + (p5) LDFD f6 = [C1], SIZE + (p5) LDFD f7 = [C2], SIZE + } + ;; + { .mfi + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + adds L = -1, L + } + { .mfb + (p4) LDFD f32 = [AOFFSET], 1 * SIZE + (p3) FMA f72 = f40, f57, f72 // A1 * B2 + br.cloop.sptk.few .L122 + } + ;; + +.L128: + { .mmf + (p5) LDFD f10 = [C1], -SIZE + (p5) LDFD f11 = [C2], -SIZE + FMA f6 = ALPHA_R, f64, f6 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f7 = ALPHA_R, f72, f7 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + FMA f10 = ALPHA_I, f64, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f11 = ALPHA_I, f72, f11 + } + ;; + { .mmi + STFD [C1 ] = f6, SIZE + STFD [C2 ] = f7, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [C1 ] = f10, SIZE + STFD [C2 ] = f11, SIZE + nop __LINE__ + } + ;; + .align 32 + +.L129: + { .mmi + mov B = BOFFSET + mov AOFFSET = A + nop __LINE__ + } + ;; + .align 16 + +.L130: + { .mfi + nop __LINE__ + mov f64 = f0 + tbit.z p6, p0 = N, 0 + } + { .mib + mov AOFFSET = A + shr I = M, 3 + (p6) br.cond.dpnt .L999 + } + ;; + { .mfi + mov C1 = C + mov f65 = f0 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + mov f66 = f0 + nop __LINE__ + } + { .mfb + cmp.eq p7, p0 = 0, I + mov f67 = f0 + (p7) br.cond.dpnt .L140 + } + ;; + .align 32 + +.L132: + { .mfb + LDFD f48 = [B] + mov f68 = f0 + nop __LINE__ + } + { .mfi + adds BOFFSET = 1 * SIZE, B + mov f69 = f0 + nop __LINE__ + } + ;; + { .mfi + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f70 = f0 + adds L = 1, K + } + ;; + { .mii + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mfi + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + mov f71 = f0 + adds L = -1, L + } + ;; + { .mmi + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + adds PREC = CPREFETCHSIZE * SIZE, C1 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mmi + CPREFETCH [PREC] + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + mov ar.lc = L + } + ;; + .align 32 + +.L133: + { .mfi + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + FMA f65 = f33, f48, 
f65 // A2 * B1 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + adds C9 = 4 * SIZE, C1 + } + { .mmf + (p3) LDFD f56 = [BOFFSET], 1 * SIZE + (p5) LDFD f6 = [C1 ], SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f7 = [C9 ], SIZE + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f10 = [C1 ], SIZE + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f11 = [C9 ], SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mmf + (p4) LDFD f48 = [BOFFSET], 1 * SIZE + (p5) LDFD f12 = [C1 ], SIZE + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + (p5) LDFD f13 = [C9 ], SIZE + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + adds L = -1, L + } + { .mfb + (p5) LDFD f14 = [C1 ], 5 * SIZE + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + nop __LINE__ + nop __LINE__ + } + { .mfb + (p5) LDFD f15 = [C9 ], 5 * SIZE + nop __LINE__ + br.cloop.sptk.few .L133 + } + ;; + +.L138: + { .mmf + LDFD f16 = [C1 ], SIZE + LDFD f17 = [C9 ], SIZE + FMA f6 = ALPHA_R, f64, f6 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f7 = ALPHA_R, f66, f7 + } + ;; + { .mmf + LDFD f18 = [C1 ], SIZE + LDFD f19 = [C9 ], SIZE + FMA f10 = ALPHA_I, f64, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f11 = ALPHA_I, f66, f11 + } + ;; + { .mmf + LDFD f20 = [C1 ], SIZE + LDFD f21 = [C9 ], SIZE + FMA f12 = ALPHA_R, f65, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f13 = ALPHA_R, f67, f13 + } + ;; + { .mmf + LDFD f22 = [C1 ], - 11 * SIZE + LDFD f23 = [C9 ], - 11 * SIZE + FMA f14 = ALPHA_I, f65, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f15 = ALPHA_I, f67, f15 + } + ;; + { .mmf + STFD [C1 ] = f6, SIZE + STFD [C9 ] = f7, SIZE + FMA f16 = ALPHA_R, f68, f16 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f17 = ALPHA_R, f70, f17 + } + ;; + { .mmf + STFD [C1 ] = f10, SIZE + STFD [C9 ] = f11, SIZE + FMA f18 = ALPHA_I, f68, f18 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f19 = ALPHA_I, f70, f19 + } + ;; + { .mmf + STFD [C1 ] = f12, SIZE + STFD [C9 ] = f13, SIZE + FMA f20 = ALPHA_R, f69, f20 + } + { .mmf + cmp.ne p6, p0 = 1, I + adds I = -1, I + FMA f21 = ALPHA_R, f71, f21 + } + ;; + { .mmf + STFD [C1 ] = f14, 5 * SIZE + STFD [C9 ] = f15, 5 * SIZE + FMA f22 = ALPHA_I, f69, f22 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f23 = ALPHA_I, f71, f23 + } + ;; + { .mmf + STFD [C1 ] = f16, SIZE + STFD [C9 ] = f17, SIZE + mov f64 = f0 + } + ;; + { .mmf + STFD [C1 ] = f18, SIZE + STFD [C9 ] = f19, SIZE + mov f65 = f0 + } + ;; + { .mmf + STFD [C1 ] = f20, SIZE + STFD [C9 ] = f21, SIZE + mov f66 = f0 + } + ;; + { .mmf + STFD [C1 ] = f22, 5 * SIZE + STFD [C9 ] = f23, 5 * SIZE + mov f67 = f0 + } + { .mmb + nop 
__LINE__ + nop __LINE__ + (p6) br.cond.dptk .L132 + } + ;; + .align 32 + +.L140: + { .mib + nop __LINE__ + tbit.z p6, p7 = M, 2 + (p6) br.cond.dptk .L150 + } + ;; + { .mmi + LDFD f48 = [B] + adds BOFFSET = 1 * SIZE, B + adds L = 1, K + } + ;; + { .mii + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + adds L = -1, L + nop __LINE__ + } + ;; + { .mmi + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + mov ar.lc = L + } + ;; + .align 32 + +.L142: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA f65 = f33, f48, f65 // A2 * B1 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + (p5) adds C9 = 4 * SIZE, C1 + } + { .mmf + (p3) LDFD f56 = [BOFFSET], 1 * SIZE + FMA f67 = f35, f48, f67 // A4 * B1 + } + ;; + { .mfi + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + (p5) adds C10 = 2 * SIZE, C2 + } + { .mmf + (p5) LDFD f6 = [C1 ], SIZE + (p5) LDFD f7 = [C9 ], SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + } + ;; + { .mmf + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p4) LDFD f48 = [BOFFSET], 1 * SIZE + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + } + { .mmf + (p5) LDFD f10 = [C1 ], SIZE + (p5) LDFD f11 = [C9 ], SIZE + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + } + ;; + { .mfi + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + nop __LINE__ + adds L = -1, L + } + { .mmb + (p5) LDFD f12 = [C1 ], SIZE + (p5) LDFD f13 = [C9 ], SIZE + br.cloop.sptk.few .L142 + } + ;; + +.L148: + { .mmf + LDFD f14 = [C1 ], - 3 * SIZE + LDFD f15 = [C9 ], - 3 * SIZE + FMA f6 = ALPHA_R, f64, f6 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f7 = ALPHA_R, f66, f7 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + FMA f10 = ALPHA_I, f64, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f11 = ALPHA_I, f66, f11 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + FMA f12 = ALPHA_R, f65, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f13 = ALPHA_R, f67, f13 + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + FMA f14 = ALPHA_I, f65, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f15 = ALPHA_I, f67, f15 + } + ;; + { .mmf + STFD [C1 ] = f6, SIZE + STFD [C9 ] = f7, SIZE + mov f64 = f0 + } + ;; + { .mmf + STFD [C1 ] = f10, SIZE + STFD [C9 ] = f11, SIZE + mov f65 = f0 + } + ;; + { .mmf + STFD [C1 ] = f12, SIZE + STFD [C9 ] = f13, SIZE + mov f66 = f0 + } + ;; + { .mmf + STFD [C1 ] = f14, 5 * SIZE + STFD [C9 ] = f15, 5 * SIZE + mov f67 = f0 + } + ;; + .align 32 + +.L150: + { .mib + nop __LINE__ + tbit.z p6, p7 = M, 1 + (p6) br.cond.dptk .L160 + } + ;; + { .mmi + LDFD f48 = [B] + adds BOFFSET = 1 * SIZE, B + adds L = 1, K + } + ;; + { .mii + cmp.eq p3, p0 = r0, r0 + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mii + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + adds L = -1, L + ;; + mov ar.lc = L + } + ;; + .align 32 + +.L152: + { .mfi + cmp.ne p4, p5 = 0, L + FMA f64 = f32, f48, f64 // A1 * B1 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mmf + (p3) LDFD f56 = [BOFFSET], 1 * SIZE + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + } + ;; + { .mfi + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + adds L = -1, L + } + ;; + { .mfb + (p4) LDFD f48 = [BOFFSET], 1 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + br.cloop.sptk.few .L152 + } + ;; + +.L158: + LDFD f68 = 
[C1 ], 1 * SIZE + ;; + LDFD f69 = [C1 ], 1 * SIZE + ;; + LDFD f70 = [C1 ], 1 * SIZE + ;; + LDFD f71 = [C1 ], - 3 * SIZE + ;; + FMA f68 = ALPHA_R, f64, f68 + FMA f69 = ALPHA_I, f64, f69 + FMA f70 = ALPHA_R, f65, f70 + FMA f71 = ALPHA_I, f65, f71 + ;; + STFD [C1 ] = f68, SIZE + ;; + STFD [C1 ] = f69, SIZE + ;; + STFD [C1 ] = f70, SIZE + mov f64 = f0 + ;; + STFD [C1 ] = f71, SIZE + mov f65 = f0 + ;; + .align 32 + +.L160: + { .mib + nop __LINE__ + tbit.z p6, p7 = M, 0 + (p6) br.cond.dptk .L169 + } + ;; + { .mmi + LDFD f48 = [B] + adds BOFFSET = 1 * SIZE, B + adds L = 1, K + } + ;; + { .mii + LDFD f32 = [AOFFSET], 1 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mii + adds L = -1, L + cmp.eq p3, p0 = r0, r0 + ;; + mov ar.lc = L + } + ;; + .align 32 + +.L162: + { .mmf + cmp.ne p4, p5 = 0, L + (p12) cmp.ne p3, p0 = 0, L + FMA f64 = f32, f48, f64 // A1 * B1 + } + ;; + { .mmi + (p3) LDFD f56 = [BOFFSET], 1 * SIZE + (p3) LDFD f40 = [AOFFSET], 1 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p4) LDFD f32 = [AOFFSET], 1 * SIZE + (p5) LDFD f68 = [C1], 1 * SIZE + adds L = -1, L + } + ;; + { .mmf + (p4) LDFD f48 = [BOFFSET], 1 * SIZE + (p5) LDFD f69 = [C1], - 1 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + } + { .mib + nop __LINE__ + nop __LINE__ + br.cloop.sptk.few .L162 + } + ;; + FMA f68 = ALPHA_R, f64, f68 + FMA f69 = ALPHA_I, f64, f69 + ;; + STFD [C1 ] = f68, SIZE + ;; + STFD [C1 ] = f69, SIZE + ;; + .align 32 + +.L169: + { .mmi + mov B = BOFFSET + mov AOFFSET = A + nop __LINE__ + } + ;; + .align 16 + +.L999: + mov r8 = r0 + adds r9 = 1 * 16, SP + ;; + ldf.fill f16 = [SP], 32 + ldf.fill f17 = [r9], 32 + ;; + ldf.fill f18 = [SP], 32 + ldf.fill f19 = [r9], 32 + ;; + ldf.fill f20 = [SP], 32 + ldf.fill f21 = [r9], 32 + ;; + ldf.fill f22 = [SP], 32 + ldf.fill f23 = [r9], 32 + mov ar.lc = ARLC + ;; + ldf.fill f24 = [SP], 32 + ldf.fill f25 = [r9], 32 + mov pr = PR, -1 + ;; + ldf.fill f26 = [SP], 32 + ldf.fill f27 = [r9], 32 + mov ar.pfs = ARPFS + ;; + ldf.fill f28 = [SP], 32 + ldf.fill f29 = [r9], 32 + ;; + ldf.fill f30 = [SP], 32 + ldf.fill f31 = [r9] + br.ret.sptk.many b0 + EPILOGUE + diff --git a/kernel/ia64/zgemm_beta.S b/kernel/ia64/zgemm_beta.S new file mode 100644 index 0000000000..00cf3e95fd --- /dev/null +++ b/kernel/ia64/zgemm_beta.S @@ -0,0 +1,517 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define PREFETCHSIZE 74 + +#define CO1 r14 +#define CO2 r15 +#define CO3 r16 +#define DO1 r17 +#define DO2 r18 +#define DO3 r19 + +#define I r22 +#define I_AND_15 r23 +#define PRE1 r24 + +#define PR r30 +#define ARLC r31 + +#define M r32 +#define N r33 +#define C r34 +#define LDC r35 +#define J r36 + +#define BETA_R f8 +#define BETA_I f9 + + PROLOGUE + .prologue + PROFCODE + + { .mmi + adds CO1 = 24, r12 + adds CO2 = 32, r12 + .save ar.lc, ARLC + mov ARLC = ar.lc + } + { .mfb + cmp.ge p6, p0 = 0, N + fcmp.eq p0, p14 = BETA_R, f0 + (p6) br.ret.sptk.many b0 + } + ;; + .body + { .mmi + ld8 C = [CO1], 8 + ld8 LDC = [CO2] + mov PR = pr + } + { .mfi + mov J = N + fcmp.eq p0, p15 = BETA_I, f0 + shr I = M, 3 + } + ;; + { .mmb + cmp.ge p6, p0 = 0, M + adds I = -1, I + (p6) br.ret.sptk.many b0 + } + ;; + { .mbb + shladd LDC = LDC, ZBASE_SHIFT, r0 + (p14) br.cond.dpnt .L100 + (p15) br.cond.dpnt .L100 + } + ;; + .align 32 + +.L60: + { .mmi + mov CO1 = C + mov CO3 = C + add CO2 = 4 * SIZE, C + } + { .mmi + adds PRE1 = PREFETCHSIZE * SIZE, C + add C = C, LDC + tbit.nz p12, p0 = M, 2 + } + ;; + { .mmi + and I_AND_15 = 15, M + mov ar.lc = I + } + { .mib + cmp.gt p8, p0 = 0, I + (p8) br.cond.dpnt .L80 + } + ;; + .align 32 + +.L70: + { .mmi + STFD [CO1] = f0, 1 * SIZE + STFD [CO2] = f0, 1 * SIZE + } + { .mmi + lfetch.excl.nt1 [PRE1], 16 * SIZE + nop.m 0 + } + ;; + { .mmi + STFD [CO1] = f0, 1 * SIZE + STFD [CO2] = f0, 1 * SIZE + adds CO3 = 16 * SIZE, CO3 + } + ;; + { .mmi + STFD [CO1] = f0, 1 * SIZE + STFD [CO2] = f0, 1 * SIZE + } + ;; + { .mmi + STFD [CO1] = f0, 5 * SIZE + STFD [CO2] = f0, 5 * SIZE + } + ;; + { .mmi + STFD [CO1] = f0, 1 * SIZE + STFD [CO2] = f0, 1 * SIZE + } + ;; + { .mmi + STFD [CO1] = f0, 1 * SIZE + STFD [CO2] = f0, 1 * SIZE + } + ;; + { .mmi + STFD [CO1] = f0, 1 * SIZE + STFD [CO2] = f0, 1 * SIZE + } + ;; + { .mmb + STFD [CO1] = f0, 5 * SIZE + STFD [CO2] = f0, 5 * SIZE + br.cloop.sptk.few .L70 + } + ;; + .align 32 + +.L80: + { .mmi + (p12) STFD [CO1] = f0, 1 * SIZE + (p12) STFD [CO2] = f0, 1 * SIZE + tbit.nz p13, p0 = M, 1 + } + { .mmb + cmp.eq p9, p0 = 0, I_AND_15 + adds J = -1, J + (p9) br.cond.dptk .L99 + } + ;; + { .mmi + (p12) STFD [CO1] = f0, 1 * SIZE + (p12) STFD [CO2] = f0, 1 * SIZE + tbit.nz p14, p0 = M, 0 + } + ;; + { .mmi + (p12) STFD [CO1] = f0, 1 * SIZE + (p12) STFD [CO2] = f0, 1 * SIZE + (p12) adds CO3 = 8 * SIZE, CO3 + } + ;; + { .mmi + (p12) STFD [CO1] = f0, 5 * SIZE + (p12) STFD [CO2] = f0 + (p13) adds CO3 = 4 * SIZE, CO3 + } + ;; + { .mmi + (p13) STFD [CO1] = f0, 1 * SIZE + (p14) STFD [CO3] = f0, 1 * SIZE + } + ;; 
+ { .mmi + (p13) STFD [CO1] = f0, 1 * SIZE + (p14) STFD [CO3] = f0, 1 * SIZE + } + ;; + { .mmi + (p13) STFD [CO1] = f0, 1 * SIZE + } + ;; + { .mmi + (p13) STFD [CO1] = f0 + } + ;; + .align 32 + +.L99: + { .mib + cmp.lt p6, p0 = 0, J + mov ar.lc = ARLC + } + { .mbb + (p6) br.cond.dptk .L60 + br.ret.sptk.many b0 + } + ;; + .align 32 + +.L100: + { .mmi + mov CO1 = C + mov CO3 = C + mov pr.rot = 0 + } + { .mmi + adds PRE1 = PREFETCHSIZE * SIZE, C + add CO2 = 4 * SIZE, C + mov DO1 = C + } + ;; + { .mmi + mov ar.ec = 6 + } + { .mmi + adds DO2 = 4 * SIZE, C + mov DO3 = C + add C = C, LDC + } + ;; + { .mmi + and I_AND_15 = 15, M + cmp.eq p16, p0 = r0, r0 + mov ar.lc = I + } + { .mib + cmp.gt p8, p0 = 0, I + tbit.nz p12, p0 = M, 2 + (p8) br.cond.dpnt .L180 + } + ;; + .align 32 + +.L170: + { .mmf + (p21) STFD [DO1] = f37, 1 * SIZE + (p16) lfetch.excl.nt1 [PRE1], 16 * SIZE + (p21) FNMA f61 = BETA_I, f67, f61 + } + { .mmf + (p16) LDFD f32 = [CO1], 1 * SIZE + (p16) adds CO2 = 16 * SIZE, CO2 + (p21) FMPY f12 = BETA_I, f85 + } + ;; + { .mfi + (p21) STFD [DO1] = f43, 1 * SIZE + (p21) FMA f67 = BETA_R, f67, f10 + (p16) adds CO3 = 16 * SIZE, CO3 + } + { .mfi + (p16) LDFD f38 = [CO1], 1 * SIZE + (p21) FMPY f85 = BETA_R, f85 + (p16) adds DO2 = 16 * SIZE, DO2 + } + ;; + { .mfi + (p21) STFD [DO1] = f49, 1 * SIZE + (p21) FNMA f73 = BETA_I, f79, f73 + (p16) adds DO3 = 16 * SIZE, DO3 + } + { .mfi + (p16) LDFD f44 = [CO1], 1 * SIZE + (p21) FMPY f13 = BETA_I, f97 + nop.i 0 + } + ;; + (p21) STFD [DO1] = f55, 1 * SIZE + (p21) FMA f79 = BETA_R, f79, f11 + (p16) LDFD f50 = [CO1], 1 * SIZE + (p21) FMPY f97 = BETA_R, f97 + ;; + (p21) STFD [DO1] = f61, 1 * SIZE + (p21) FNMA f85 = BETA_I, f91, f85 + (p16) LDFD f56 = [CO1], 1 * SIZE + (p21) FMPY f14 = BETA_I, f109 + ;; + (p21) STFD [DO1] = f67, 1 * SIZE + (p21) FMA f91 = BETA_R, f91, f12 + (p16) LDFD f62 = [CO1], 1 * SIZE + (p21) FMPY f109 = BETA_R, f109 + ;; + (p21) STFD [DO1] = f73, 1 * SIZE + (p21) FNMA f97 = BETA_I, f103, f97 + (p16) LDFD f68 = [CO1], 1 * SIZE + (p21) FMPY f15 = BETA_I, f121 + ;; + (p21) STFD [DO1] = f79, 1 * SIZE + (p21) FMA f103 = BETA_R, f103, f13 + (p16) LDFD f74 = [CO1], 1 * SIZE + (p21) FMPY f121 = BETA_R, f121 + ;; + (p21) STFD [DO1] = f85, 1 * SIZE + (p21) FNMA f109 = BETA_I, f115, f109 + (p16) LDFD f80 = [CO1], 1 * SIZE + (p20) FMPY f6 = BETA_I, f36 + ;; + (p21) STFD [DO1] = f91, 1 * SIZE + (p21) FMA f115 = BETA_R, f115, f14 + (p16) LDFD f86 = [CO1], 1 * SIZE + (p20) FMPY f36 = BETA_R, f36 + ;; + (p21) STFD [DO1] = f97, 1 * SIZE + (p21) FNMA f121 = BETA_I, f127, f121 + (p16) LDFD f92 = [CO1], 1 * SIZE + (p20) FMPY f7 = BETA_I, f48 + ;; + (p21) STFD [DO1] = f103, 1 * SIZE + (p21) FMA f127 = BETA_R, f127, f15 + (p16) LDFD f98 = [CO1], 1 * SIZE + (p20) FMPY f48 = BETA_R, f48 + ;; + (p21) STFD [DO1] = f109, 1 * SIZE + (p20) FNMA f36 = BETA_I, f42, f36 + (p16) LDFD f104 = [CO1], 1 * SIZE + (p20) FMPY f10 = BETA_I, f60 + ;; + (p21) STFD [DO1] = f115, 1 * SIZE + (p20) FMA f42 = BETA_R, f42, f6 + (p16) LDFD f110 = [CO1], 1 * SIZE + (p20) FMPY f60 = BETA_R, f60 + ;; + (p21) STFD [DO1] = f121, 1 * SIZE + (p20) FNMA f48 = BETA_I, f54, f48 + (p16) LDFD f116 = [CO1], 1 * SIZE + (p20) FMPY f11 = BETA_I, f72 + ;; + (p21) STFD [DO1] = f127, 1 * SIZE + (p20) FMA f54 = BETA_R, f54, f7 + (p16) LDFD f122 = [CO1], 1 * SIZE + (p20) FMPY f72 = BETA_R, f72 + br.ctop.sptk.few .L170 + ;; + .align 32 + +.L180: + { .mmi + (p12) LDFD f32 = [CO1], 1 * SIZE + (p12) LDFD f36 = [CO2], 1 * SIZE + tbit.nz p13, p0 = M, 1 + } + { .mmb + cmp.eq p9, p0 = 0, I_AND_15 + adds J = -1, J 
+ (p9) br.cond.dptk .L199 + } + ;; + { .mmi + (p12) LDFD f33 = [CO1], 1 * SIZE + (p12) LDFD f37 = [CO2], 1 * SIZE + tbit.nz p14, p0 = M, 0 + } + ;; + { .mmi + (p12) LDFD f34 = [CO1], 1 * SIZE + (p12) LDFD f38 = [CO2], 1 * SIZE + (p12) adds CO3 = 8 * SIZE, CO3 + } + ;; + { .mmi + (p12) LDFD f35 = [CO1], 5 * SIZE + (p12) LDFD f39 = [CO2] + (p13) adds CO3 = 4 * SIZE, CO3 + } + ;; + { .mmi + (p13) LDFD f40 = [CO1], 1 * SIZE + (p14) LDFD f44 = [CO3], 1 * SIZE + } + ;; + { .mmi + (p13) LDFD f41 = [CO1], 1 * SIZE + (p14) LDFD f45 = [CO3], 1 * SIZE + } + ;; + { .mmf + (p13) LDFD f42 = [CO1], 1 * SIZE + } + ;; + { .mmf + (p13) LDFD f43 = [CO1] + } + ;; + (p12) FMPY f80 = BETA_I, f32 + (p12) FMPY f32 = BETA_R, f32 + (p12) FMPY f81 = BETA_I, f34 + (p12) FMPY f34 = BETA_R, f34 + (p12) FMPY f82 = BETA_I, f36 + (p12) FMPY f36 = BETA_R, f36 + (p12) FMPY f83 = BETA_I, f38 + (p12) FMPY f38 = BETA_R, f38 + ;; + (p12) FNMA f32 = BETA_I, f33, f32 + (p12) FMA f33 = BETA_R, f33, f80 + (p12) FNMA f34 = BETA_I, f35, f34 + (p12) FMA f35 = BETA_R, f35, f81 + (p12) FNMA f36 = BETA_I, f37, f36 + (p12) FMA f37 = BETA_R, f37, f82 + (p12) FNMA f38 = BETA_I, f39, f38 + (p12) FMA f39 = BETA_R, f39, f83 + ;; + (p13) FMPY f84 = BETA_I, f40 + (p13) FMPY f40 = BETA_R, f40 + (p13) FMPY f85 = BETA_I, f42 + (p13) FMPY f42 = BETA_R, f42 + (p14) FMPY f86 = BETA_I, f44 + (p14) FMPY f44 = BETA_R, f44 + ;; + (p13) FNMA f40 = BETA_I, f41, f40 + (p13) FMA f41 = BETA_R, f41, f84 + (p13) FNMA f42 = BETA_I, f43, f42 + (p13) FMA f43 = BETA_R, f43, f85 + (p14) FNMA f44 = BETA_I, f45, f44 + (p14) FMA f45 = BETA_R, f45, f86 + ;; + + { .mmf + (p12) STFD [DO1] = f32, 1 * SIZE + (p12) STFD [DO2] = f36, 1 * SIZE + } + { .mmf + (p12) adds DO3 = 8 * SIZE, DO3 + } + ;; + { .mmf + (p12) STFD [DO1] = f33, 1 * SIZE + (p12) STFD [DO2] = f37, 1 * SIZE + } + { .mmf + (p13) adds DO3 = 4 * SIZE, DO3 + } + ;; + { .mmf + (p12) STFD [DO1] = f34, 1 * SIZE + (p12) STFD [DO2] = f38, 1 * SIZE + } + ;; + { .mmf + (p12) STFD [DO1] = f35, 5 * SIZE + (p12) STFD [DO2] = f39 + } + ;; + { .mmi + (p13) STFD [DO1] = f40, 1 * SIZE + (p14) STFD [DO3] = f44, 1 * SIZE + } + ;; + { .mmi + (p13) STFD [DO1] = f41, 1 * SIZE + (p14) STFD [DO3] = f45, 1 * SIZE + } + ;; + { .mmi + (p13) STFD [DO1] = f42, 1 * SIZE + ;; + (p13) STFD [DO1] = f43 + } + ;; + .align 32 + +.L199: + { .mib + cmp.lt p6, p0 = 0, J + mov ar.lc = ARLC + (p6) br.cond.dptk .L100 + } + ;; + { .mib + mov pr = PR, -1 + br.ret.sptk.many b0 + } + ;; + EPILOGUE + diff --git a/kernel/ia64/zgemm_kernel.S b/kernel/ia64/zgemm_kernel.S new file mode 100644 index 0000000000..bfdb92cb87 --- /dev/null +++ b/kernel/ia64/zgemm_kernel.S @@ -0,0 +1,6849 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef DOUBLE +#define PREFETCHSIZE (16 * 8) +#else +#define PREFETCHSIZE (32 * 8) +#endif + +#define CPREFETCHSIZE 7 +#define CPREFETCH lfetch.excl.nt1 + +#define M r32 +#define N r33 +#define K r34 +#define A r37 +#define B r38 +#define C r39 +#define LDC r35 + +#define I r15 +#define J r16 +#define AOFFSET r17 +#define BOFFSET r18 +#define TEMP r19 +#define L r20 + +#define C1 r21 +#define C2 r22 +#define C3 r23 +#define C4 r24 +#define C5 r25 +#define C6 r26 +#define C7 r27 +#define C8 r28 + +#define PREA r8 +#define PREB r9 +#define PREC r10 +#define SP r12 +#define ARLC r29 +#define PR r30 +#define ARPFS r31 + +#define ALPHA_R f8 +#define ALPHA_I f9 + +#define AORIG loc0 +#define KK loc1 +#define KK8 loc2 +#define OFFSET loc3 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(CC) || defined(CR) || defined(RC) || defined(RR) +#define FCALC_A FSUB +#define FCALC_B FADD +#define FMA_A FNMA +#define FMA_B FMA + +#else +#define FCALC_A FADD +#define FCALC_B FSUB +#define FMA_A FMA +#define FMA_B FNMA +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NC) || defined(TC) || defined(NR) || defined(TR) +#define FCALC_C FMA +#define FCALC_D FNMA +#else +#define FCALC_C FNMA +#define FCALC_D FMA +#endif + + PROLOGUE + .prologue + PROFCODE + + { .mfi +#ifdef TRMMKERNEL + .save ar.pfs, ARPFS + alloc ARPFS = ar.pfs, 8, 8, 0, 0 +#else + nop __LINE__ +#endif + mov f64 = f0 + adds r14 = 16, SP + } + { .mfi + nop __LINE__ + mov f65 = f0 + adds r15 = 24, SP + } + ;; + { .mfi + ld8 LDC = [r14] + mov f81 = f0 + mov PR = pr + } + { .mfi +#ifdef TRMMKERNEL + ld8 OFFSET = [r15] +#else + nop __LINE__ +#endif + mov f96 = f0 + shr J = N, 2 + } + ;; + { .mfi + shladd LDC = LDC, ZBASE_SHIFT, r0 + mov f97 = f0 + mov AOFFSET = A + } + { .mfi + nop __LINE__ + mov f113 = f0 +#if defined(TRMMKERNEL) && !defined(LEFT) + sub KK = r0, OFFSET +#endif + } + ;; + .body + { .mfi + nop __LINE__ + mov f80 = f0 + mov ARLC = ar.lc + } + { .mfb + cmp.ge p6, p0 = 0, J + mov f112 = f0 + (p6) br.cond.dpnt .L050 + } + ;; + .align 16 + +.L010: + { .mmi + mov C1 = C // coffset1 = c + 0 * ldc + add C2 = LDC, C // coffset2 = c + 1 * ldc + shr I = M, 2 + } + { .mmi + adds J = -1, J +#if defined(TRMMKERNEL) && defined(LEFT) + mov KK = OFFSET 
+#else + nop __LINE__ +#endif + nop __LINE__ + } + ;; + { .mmi + shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc + shladd C4 = LDC, 1, C2 // coffset4 = c + 3 * ldc +#ifdef TRMMKERNEL + shladd KK8 = KK, ZBASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mib + cmp.eq p6, p7 = 0, I + shladd C = LDC, 2, C // coffset += 8 * ldc + (p6) br.cond.dpnt .L020 + } + ;; + .align 16 + +.L011: +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mfb + LDFPD f48, f49 = [B] + mov f66 = f0 + nop __LINE__ + } + { .mfb + adds BOFFSET = 2 * SIZE, B + mov f67 = f0 + nop __LINE__ + } + ;; +#else + { .mfi + shladd BOFFSET = KK8, 2, B + mov f66 = f0 + shladd AOFFSET = KK8, 2, AOFFSET + } + ;; + { .mfi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f67 = f0 + nop __LINE__ + } + ;; +#endif + { .mfi + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f82 = f0 +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 4, KK +#else + adds L = 4, KK +#endif +#endif + } + { .mfi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f83 = f0 + adds PREC = CPREFETCHSIZE * SIZE, C1 + } + ;; + { .mfi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + mov f98 = f0 +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + { .mfi + LDFPD f52, f53 = [BOFFSET], 2 * SIZE + mov f99 = f0 + adds C5 = 4 * SIZE, C1 + } + ;; + { .mfi + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + mov f114 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + mov f115 = f0 + adds C6 = 4 * SIZE, C2 + } + ;; + { .mfi + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + mov f68 = f0 + shr L = L, 1 + } + { .mfi + setf.d f86 = r0 + mov f69 = f0 + adds C7 = 4 * SIZE, C3 + } + ;; + { .mfi + CPREFETCH [PREC], LDC + mov f84 = f0 + adds L = -1, L + } + { .mfi + setf.d f87 = r0 + mov f85 = f0 + adds C8 = 4 * SIZE, C4 + } + ;; + { .mfi + CPREFETCH [PREC], LDC + mov f100 = f0 + mov ar.lc = L + } + { .mfi + setf.d f102 = r0 + mov f101 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + CPREFETCH [PREC], LDC + mov f116 = f0 + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + } + { .mfi + setf.d f103 = r0 + mov f117 = f0 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + } + ;; + { .mfi + CPREFETCH [PREC] + mov f70 = f0 + nop __LINE__ + } + { .mmf + setf.d f118 = r0 + setf.d f119 = r0 + mov f71 = f0 + } + ;; + .align 16 + +.L012: +/* 1 */ + { .mfi + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p12) cmp.ne p3, p0 = 0, L + FMA_B f65 = f32, f49, f65 // A1 * B2 + nop __LINE__ + } + ;; +/* 2 */ + { .mfi + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + cmp.ne p4, p5 = 0, L + FMA_B f81 = f32, f51, f81 // A1 * B4 + nop __LINE__ + } + ;; +/* 3 */ + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + FMA_B f97 = f32, f53, f97 // A1 * B6 + nop __LINE__ + } + ;; +/* 4 */ + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + FMA_B f113 = f32, f55, f113 // A1 * B8 + nop __LINE__ + } + ;; +/* 5 */ + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + FMA_A f64 = f33, f49, f64 // A2 * B2 + nop __LINE__ + } + ;; +/* 6 */ + { .mfb + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, 
f81 // A2 * B3 + nop __LINE__ + } + { .mfb + FMA_A f80 = f33, f51, f80 // A2 * B4 + nop __LINE__ + } + ;; +/* 7 */ + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + FMA_A f96 = f33, f53, f96 // A2 * B6 + nop __LINE__ + } + ;; +/* 8 */ + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + FMA_A f112 = f33, f55, f112 // A2 * B8 + nop __LINE__ + } + ;; +/* 9 */ + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + FMA_B f67 = f34, f49, f67 // A3 * B2 + nop __LINE__ + } + ;; +/* 10 */ + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + FMA_B f83 = f34, f51, f83 // A3 * B4 + nop __LINE__ + } + ;; +/* 11 */ + { .mfb + FMA f98 = f34, f52, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f99 = f34, f53, f99 // A3 * B6 + nop __LINE__ + } + ;; +/* 12 */ + { .mfb + FMA f114 = f34, f54, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f115 = f34, f55, f115 // A3 * B8 + nop __LINE__ + } + ;; +/* 13 */ + { .mfb + nop __LINE__ + FMA f67 = f35, f48, f67 // A4 * B1 + } + { .mfb + nop __LINE__ + FMA_A f66 = f35, f49, f66 // A4 * B2 + nop __LINE__ + } + ;; +/* 14 */ + { .mfb + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f82 = f35, f51, f82 // A4 * B4 + nop __LINE__ + } + ;; +/* 15 */ + { .mfb + FMA f99 = f35, f52, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f98 = f35, f53, f98 // A4 * B6 + nop __LINE__ + } + ;; +/* 16 */ + { .mfb + FMA f115 = f35, f54, f115 // A4 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f114 = f35, f55, f114 // A4 * B8 + nop __LINE__ + } + ;; +/* 17 */ + { .mfb + nop __LINE__ + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f69 = f36, f49, f69 // A5 * B2 + nop __LINE__ + } + ;; +/* 18 */ + { .mfb + nop __LINE__ + FMA f84 = f36, f50, f84 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f85 = f36, f51, f85 // A5 * B4 + nop __LINE__ + } + ;; +/* 19 */ + { .mfb + nop __LINE__ + FMA f100 = f36, f52, f100 // A5 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f101 = f36, f53, f101 // A5 * B6 + nop __LINE__ + } + ;; +/* 20 */ + { .mfb + nop __LINE__ + FMA f116 = f36, f54, f116 // A5 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f117 = f36, f55, f117 // A5 * B8 + nop __LINE__ + } + ;; +/* 21 */ + { .mfb + nop __LINE__ + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f68 = f37, f49, f68 // A6 * B2 + nop __LINE__ + } + ;; +/* 22 */ + { .mfb + nop __LINE__ + FMA f85 = f37, f50, f85 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f84 = f37, f51, f84 // A6 * B4 + nop __LINE__ + } + ;; +/* 23 */ + { .mfb + nop __LINE__ + FMA f101 = f37, f52, f101 // A6 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f100 = f37, f53, f100 // A6 * B6 + nop __LINE__ + } + ;; +/* 24 */ + { .mfb + nop __LINE__ + FMA f117 = f37, f54, f117 // A6 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f116 = f37, f55, f116 // A6 * B8 + nop __LINE__ + } + ;; +/* 25 */ + { .mfb + nop __LINE__ + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f71 = f38, f49, f71 // A7 * B2 + nop __LINE__ + } + ;; +/* 26 */ + { .mfb + nop __LINE__ + FMA f86 = f38, f50, f86 // A7 * B3 + nop __LINE__ + } + { 
.mfb + nop __LINE__ + FMA_B f87 = f38, f51, f87 // A7 * B4 + nop __LINE__ + } + ;; +/* 27 */ + { .mfb + nop __LINE__ + FMA f102 = f38, f52, f102 // A7 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f103 = f38, f53, f103 // A7 * B6 + nop __LINE__ + } + ;; +/* 28 */ + { .mfb + nop __LINE__ + FMA f118 = f38, f54, f118 // A7 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f119 = f38, f55, f119 // A7 * B8 + nop __LINE__ + } + ;; +/* 29 */ + { .mfb + nop __LINE__ + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f70 = f39, f49, f70 // A8 * B2 + nop __LINE__ + } + ;; +/* 30 */ + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f87 = f39, f50, f87 // A8 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f86 = f39, f51, f86 // A8 * B4 + nop __LINE__ + } + ;; +/* 31 */ + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f103 = f39, f52, f103 // A8 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f102 = f39, f53, f102 // A8 * B6 + nop __LINE__ + } + ;; +/* 32 */ + { .mfb + nop __LINE__ + FMA f119 = f39, f54, f119 // A8 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f118 = f39, f55, f118 // A8 * B8 + nop __LINE__ + } + ;; +/* 33 */ + { .mfb + nop __LINE__ + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 + nop __LINE__ + } + ;; +/* 34 */ + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 + nop __LINE__ + } + ;; +/* 35 */ + { .mfb + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f97 = f40, f61, f97 // A1 * B6 + nop __LINE__ + } + ;; +/* 36 */ + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f113 = f40, f63, f113 // A1 * B8 + nop __LINE__ + } + ;; +/* 37 */ + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 + nop __LINE__ + } + ;; +/* 38 */ + { .mfb + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 + nop __LINE__ + } + ;; +/* 39 */ + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f96 = f41, f61, f96 // A2 * B6 + nop __LINE__ + } + ;; +/* 40 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f72 = [C1], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f76 = [C5], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_A f112 = f41, f63, f112 // A2 * B8 + nop __LINE__ + } + ;; + /* 41 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f73 = [C1], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f77 = [C5], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_B f67 = f42, f57, f67 // A3 * B2 + nop __LINE__ + } + ;; +/* 42 */ + { .mfb +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f74 = [C1], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f78 = [C5], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_B f83 = f42, f59, f83 // A3 * B4 + nop __LINE__ + } + ;; +/* 43 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f75 = [C1], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f98 = f42, f60, f98 // A3 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f79 = [C5], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA_B f99 = f42, f61, f99 // A3 * B6 + nop __LINE__ + } + ;; +/* 44 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f88 = [C2], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f114 = f42, f62, f114 // A3 * B7 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f92 = [C6], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_B f115 = f42, f63, f115 // A3 * B8 + nop __LINE__ + } + ;; +/* 45 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f89 = [C2], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f93 = [C6], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_A f66 = f43, f57, f66 // A4 * B2 + nop __LINE__ + } + ;; +/* 46 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f90 = [C2], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f94 = [C6], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_A f82 = f43, f59, f82 // A4 * B4 + nop __LINE__ + } + ;; +/* 47 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f91 = [C2], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f99 = f43, f60, f99 // A4 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f95 = [C6], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA_A f98 = f43, f61, f98 // A4 * B6 + nop __LINE__ + } + ;; +/* 48 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f104 = [C3], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f115 = f43, f62, f115 // A4 * B7 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f108 = [C7], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_A f114 = f43, f63, f114 // A4 * B8 + nop __LINE__ + } + ;; +/* 49 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f105 = [C3], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f109 = [C7], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_B f69 = f44, f57, f69 // A5 * B2 + nop __LINE__ + } + ;; +/* 50 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f106 = [C3], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f84 = f44, f58, f84 // A5 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f110 = [C7], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_B f85 = f44, f59, f85 // A5 * B4 + nop __LINE__ + } + ;; +/* 51 */ + { .mfb +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f107 = [C3], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f100 = f44, f60, f100 // A5 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f111 = [C7], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA_B f101 = f44, f61, f101 // A5 * B6 + nop __LINE__ + } + ;; +/* 52 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f120 = [C4], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f116 = f44, f62, f116 // A5 * B7 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f124 = [C8], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_B f117 = f44, f63, f117 // A5 * B8 + nop __LINE__ + } + ;; +/* 53 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f121 = [C4], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f125 = [C8], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_A f68 = f45, f57, f68 // A6 * B2 + nop __LINE__ + } + ;; +/* 54 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f122 = [C4], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f85 = f45, f58, f85 // A6 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f126 = [C8], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_A f84 = f45, f59, f84 // A6 * B4 + nop __LINE__ + } + ;; +/* 55 */ + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f123 = [C4], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f101 = f45, f60, f101 // A6 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f127 = [C8], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA_A f100 = f45, f61, f100 // A6 * B6 + nop __LINE__ + } + ;; +/* 56 */ + { .mfb + nop __LINE__ + (p3) FMA f117 = f45, f62, f117 // A6 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f116 = f45, f63, f116 // A6 * B8 + nop __LINE__ + } + ;; +/* 57 */ + { .mfb + nop __LINE__ + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f71 = f46, f57, f71 // A7 * B2 + nop __LINE__ + } + ;; +/* 58 */ + { .mfb + nop __LINE__ + (p3) FMA f86 = f46, f58, f86 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f87 = f46, f59, f87 // A7 * B4 + nop __LINE__ + } + ;; +/* 59 */ + { .mfb + nop __LINE__ + (p3) FMA f102 = f46, f60, f102 // A7 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f103 = f46, f61, f103 // A7 * B6 + nop __LINE__ + } + ;; +/* 60 */ + { .mfb + nop __LINE__ + (p3) FMA f118 = f46, f62, f118 // A7 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f119 = f46, f63, f119 // A7 * B8 + nop __LINE__ + } + ;; +/* 61 */ + { .mfb + nop __LINE__ + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f70 = f47, f57, f70 // A8 * B2 + nop __LINE__ + } + ;; +/* 62 */ + { .mfb + nop __LINE__ + (p3) FMA f87 = f47, f58, f87 // A8 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f86 = f47, f59, f86 // A8 * B4 + nop __LINE__ + } + ;; +/* 63 */ + { .mfb + nop __LINE__ + (p3) FMA f103 = f47, f60, f103 // A8 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f102 = f47, f61, f102 // A8 * B6 + nop __LINE__ + } + ;; +/* 64 */ + { .mfi + nop __LINE__ + (p3) FMA f119 = f47, f62, f119 // A8 * B7 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA_A f118 = f47, f63, f118 // A8 * B8 + 
br.cloop.sptk.few .L012 + } + ;; +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + { .mfb + nop __LINE__ + FMA f72 = ALPHA_R, f64, f72 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f76 = ALPHA_R, f68, f76 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_C f73 = ALPHA_R, f65, f73 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f77 = ALPHA_R, f69, f77 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f74 = ALPHA_R, f66, f74 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f78 = ALPHA_R, f70, f78 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_C f75 = ALPHA_R, f67, f75 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f79 = ALPHA_R, f71, f79 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f72 = ALPHA_I, f65, f72 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f76 = ALPHA_I, f69, f76 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f73 = ALPHA_I, f64, f73 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f77 = ALPHA_I, f68, f77 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f74 = ALPHA_I, f67, f74 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f78 = ALPHA_I, f71, f78 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f75 = ALPHA_I, f66, f75 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f79 = ALPHA_I, f70, f79 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f72, SIZE + FMA f88 = ALPHA_R, f80, f88 + nop __LINE__ + } + { .mfb + STFD [C5] = f76, SIZE + FMA f92 = ALPHA_R, f84, f92 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f73, SIZE + FCALC_C f89 = ALPHA_R, f81, f89 + nop __LINE__ + } + { .mfb + STFD [C5] = f77, SIZE + FCALC_C f93 = ALPHA_R, f85, f93 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f74, SIZE + FMA f90 = ALPHA_R, f82, f90 + nop __LINE__ + } + { .mfb + STFD [C5] = f78, SIZE + FMA f94 = ALPHA_R, f86, f94 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f75, 5 * SIZE + FCALC_C f91 = ALPHA_R, f83, f91 + nop __LINE__ + } + { .mfb + STFD [C5] = f79, 5 * SIZE + FCALC_C f95 = ALPHA_R, f87, f95 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f88 = ALPHA_I, f81, f88 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f92 = ALPHA_I, f85, f92 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f89 = ALPHA_I, f80, f89 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f93 = ALPHA_I, f84, f93 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f90 = ALPHA_I, f83, f90 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f94 = ALPHA_I, f87, f94 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f91 = ALPHA_I, f82, f91 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f95 = ALPHA_I, f86, f95 + nop __LINE__ + } + ;; + { .mfb + STFD [C2] = f88, SIZE + FMA f104 = ALPHA_R, f96, f104 + nop __LINE__ + } + { .mfb + STFD [C6] = f92, SIZE + FMA f108 = ALPHA_R, f100, f108 + nop __LINE__ + } + ;; + { .mfb + STFD [C2] = f89, SIZE + FCALC_C f105 = ALPHA_R, f97, f105 + nop __LINE__ + } + { .mfb + STFD [C6] = f93, SIZE + FCALC_C f109 = ALPHA_R, f101, f109 + nop __LINE__ + } + ;; + { .mfb + STFD [C2] = f90, SIZE + FMA f106 = ALPHA_R, f98, f106 + nop __LINE__ + } + { .mfb + STFD [C6] = f94, SIZE + FMA f110 = ALPHA_R, f102, f110 + nop __LINE__ + } + ;; + { .mfb + STFD [C2] = f91, 5 * SIZE + FCALC_C f107 = ALPHA_R, f99, f107 + nop __LINE__ + } + { .mfb + STFD [C6] = f95, 5 * SIZE + FCALC_C f111 = ALPHA_R, f103, f111 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f104 = ALPHA_I, f97, f104 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f108 = ALPHA_I, f101, f108 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA 
f105 = ALPHA_I, f96, f105 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f109 = ALPHA_I, f100, f109 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f106 = ALPHA_I, f99, f106 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f110 = ALPHA_I, f103, f110 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f107 = ALPHA_I, f98, f107 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f111 = ALPHA_I, f102, f111 + nop __LINE__ + } + ;; + { .mfb + STFD [C3] = f104, SIZE + FMA f120 = ALPHA_R, f112, f120 + nop __LINE__ + } + { .mfb + STFD [C7] = f108, SIZE + FMA f124 = ALPHA_R, f116, f124 + nop __LINE__ + } + ;; + { .mfb + STFD [C3] = f105, SIZE + FCALC_C f121 = ALPHA_R, f113, f121 + nop __LINE__ + } + { .mfb + STFD [C7] = f109, SIZE + FCALC_C f125 = ALPHA_R, f117, f125 + nop __LINE__ + } + ;; + { .mfb + STFD [C3] = f106, SIZE + FMA f122 = ALPHA_R, f114, f122 + nop __LINE__ + } + { .mfb + STFD [C7] = f110, SIZE + FMA f126 = ALPHA_R, f118, f126 + nop __LINE__ + } + ;; + { .mfb + STFD [C3] = f107, 5 * SIZE + FCALC_C f123 = ALPHA_R, f115, f123 + nop __LINE__ + } + { .mfb + STFD [C7] = f111, 5 * SIZE + FCALC_C f127 = ALPHA_R, f119, f127 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f120 = ALPHA_I, f113, f120 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f124 = ALPHA_I, f117, f124 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f121 = ALPHA_I, f112, f121 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f125 = ALPHA_I, f116, f125 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f122 = ALPHA_I, f115, f122 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f126 = ALPHA_I, f119, f126 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA f123 = ALPHA_I, f114, f123 + cmp.ne p6, p0 = 1, I + } + { .mfb + nop __LINE__ + FMA f127 = ALPHA_I, f118, f127 + nop __LINE__ + } + ;; + { .mfi + STFD [C4] = f120, SIZE + mov f64 = f0 + adds I = -1, I + } + { .mfb + STFD [C8] = f124, SIZE + mov f65 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C4] = f121, SIZE + mov f80 = f0 + and TEMP = 3, M + } + { .mfb + STFD [C8] = f125, SIZE + mov f81 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C4] = f122, SIZE + mov f96 = f0 + cmp.ne p8, p9 = r0, TEMP + } + { .mfb + STFD [C8] = f126, SIZE + mov f97 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C4] = f123, 5 * SIZE + mov f112 = f0 + nop __LINE__ + } + { .mfb + STFD [C8] = f127, 5 * SIZE + mov f113 = f0 + (p6) br.cond.dptk .L011 + } + ;; +#else + { .mfb + nop __LINE__ + FMPY f72 = ALPHA_R, f64 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMPY f76 = ALPHA_R, f68 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_C f73 = ALPHA_R, f65, f0 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f77 = ALPHA_R, f69, f0 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMPY f74 = ALPHA_R, f66 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMPY f78 = ALPHA_R, f70 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_C f75 = ALPHA_R, f67, f0 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f79 = ALPHA_R, f71, f0 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f72 = ALPHA_I, f65, f72 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f76 = ALPHA_I, f69, f76 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f73 = ALPHA_I, f64, f73 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f77 = ALPHA_I, f68, f77 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f74 = ALPHA_I, f67, f74 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f78 = ALPHA_I, f71, f78 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f75 = ALPHA_I, 
f66, f75 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f79 = ALPHA_I, f70, f79 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f72, SIZE + FMPY f88 = ALPHA_R, f80 + nop __LINE__ + } + { .mfb + STFD [C5] = f76, SIZE + FMPY f92 = ALPHA_R, f84 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f73, SIZE + FCALC_C f89 = ALPHA_R, f81, f0 + nop __LINE__ + } + { .mfb + STFD [C5] = f77, SIZE + FCALC_C f93 = ALPHA_R, f85, f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f74, SIZE + FMPY f90 = ALPHA_R, f82 + nop __LINE__ + } + { .mfb + STFD [C5] = f78, SIZE + FMPY f94 = ALPHA_R, f86 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f75, 5 * SIZE + FCALC_C f91 = ALPHA_R, f83, f0 + nop __LINE__ + } + { .mfb + STFD [C5] = f79, 5 * SIZE + FCALC_C f95 = ALPHA_R, f87, f0 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f88 = ALPHA_I, f81, f88 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f92 = ALPHA_I, f85, f92 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f89 = ALPHA_I, f80, f89 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f93 = ALPHA_I, f84, f93 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f90 = ALPHA_I, f83, f90 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f94 = ALPHA_I, f87, f94 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f91 = ALPHA_I, f82, f91 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f95 = ALPHA_I, f86, f95 + nop __LINE__ + } + ;; + { .mfb + STFD [C2] = f88, SIZE + FMPY f104 = ALPHA_R, f96 + nop __LINE__ + } + { .mfb + STFD [C6] = f92, SIZE + FMPY f108 = ALPHA_R, f100 + nop __LINE__ + } + ;; + { .mfb + STFD [C2] = f89, SIZE + FCALC_C f105 = ALPHA_R, f97, f0 + nop __LINE__ + } + { .mfb + STFD [C6] = f93, SIZE + FCALC_C f109 = ALPHA_R, f101, f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C2] = f90, SIZE + FMPY f106 = ALPHA_R, f98 + nop __LINE__ + } + { .mfb + STFD [C6] = f94, SIZE + FMPY f110 = ALPHA_R, f102 + nop __LINE__ + } + ;; + { .mfb + STFD [C2] = f91, 5 * SIZE + FCALC_C f107 = ALPHA_R, f99, f0 + nop __LINE__ + } + { .mfb + STFD [C6] = f95, 5 * SIZE + FCALC_C f111 = ALPHA_R, f103, f0 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f104 = ALPHA_I, f97, f104 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f108 = ALPHA_I, f101, f108 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f105 = ALPHA_I, f96, f105 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f109 = ALPHA_I, f100, f109 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f106 = ALPHA_I, f99, f106 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f110 = ALPHA_I, f103, f110 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f107 = ALPHA_I, f98, f107 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f111 = ALPHA_I, f102, f111 + nop __LINE__ + } + ;; + { .mfb + STFD [C3] = f104, SIZE + FMPY f120 = ALPHA_R, f112 + nop __LINE__ + } + { .mfb + STFD [C7] = f108, SIZE + FMPY f124 = ALPHA_R, f116 + nop __LINE__ + } + ;; + { .mfb + STFD [C3] = f105, SIZE + FCALC_C f121 = ALPHA_R, f113, f0 + nop __LINE__ + } + { .mfb + STFD [C7] = f109, SIZE + FCALC_C f125 = ALPHA_R, f117, f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C3] = f106, SIZE + FMPY f122 = ALPHA_R, f114 + nop __LINE__ + } + { .mfb + STFD [C7] = f110, SIZE + FMPY f126 = ALPHA_R, f118 + nop __LINE__ + } + ;; + { .mfb + STFD [C3] = f107, 5 * SIZE + FCALC_C f123 = ALPHA_R, f115, f0 + nop __LINE__ + } + { .mfb + STFD [C7] = f111, 5 * SIZE + FCALC_C f127 = ALPHA_R, f119, f0 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f120 = ALPHA_I, f113, f120 + nop __LINE__ + } + { .mfb + nop __LINE__ + 
FCALC_D f124 = ALPHA_I, f117, f124 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA f121 = ALPHA_I, f112, f121 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfb + nop __LINE__ + FMA f125 = ALPHA_I, f116, f125 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FCALC_D f122 = ALPHA_I, f115, f122 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -4, L +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FCALC_D f126 = ALPHA_I, f119, f126 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -4, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + nop __LINE__ + FMA f123 = ALPHA_I, f114, f123 + cmp.ne p6, p0 = 1, I + } + { .mfi + nop __LINE__ + FMA f127 = ALPHA_I, f118, f127 + adds I = -1, I + } + ;; + { .mfi + STFD [C4] = f120, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, ZBASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C8] = f124, SIZE + mov f65 = f0 + and TEMP = 3, M + } + ;; + { .mfi + STFD [C4] = f121, SIZE + mov f80 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 2, AOFFSET +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C8] = f125, SIZE + mov f81 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 2, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C4] = f122, SIZE + mov f96 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 4, KK +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C8] = f126, SIZE + mov f97 = f0 + cmp.ne p8, p9 = r0, TEMP + } + ;; + { .mfi + STFD [C4] = f123, 5 * SIZE + mov f112 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, ZBASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C8] = f127, 5 * SIZE + mov f113 = f0 + (p6) br.cond.dptk .L011 + } + ;; +#endif + +.L020: + { .mib +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 2, KK +#else + adds L = 4, KK +#endif +#endif + tbit.z p6, p7 = M, 1 + (p6) br.cond.dptk .L030 + } + ;; +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mfb + LDFPD f48, f49 = [B] + mov f66 = f0 + nop __LINE__ + } + { .mfi + adds BOFFSET = 2 * SIZE, B + mov f67 = f0 +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#else + { .mfi + shladd BOFFSET = KK8, 2, B + mov f66 = f0 + shladd AOFFSET = KK8, 1, AOFFSET + } + ;; + { .mfi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f67 = f0 +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#endif + { .mfi + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f82 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f83 = f0 + shr L = L, 1 + } + ;; + { .mfi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + mov f98 = f0 + adds L = -1, L + } + { .mfi + LDFPD f52, f53 = [BOFFSET], 2 * SIZE + mov f99 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + mov f114 = f0 + mov ar.lc = L + } + { .mfi + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + mov f115 = f0 + nop __LINE__ + } + ;; + .align 16 + +.L022: + { 
.mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA_B f65 = f32, f49, f65 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA_B f81 = f32, f51, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f97 = f32, f53, f97 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f113 = f32, f55, f113 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f64 = f33, f49, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f80 = f33, f51, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f96 = f33, f53, f96 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f112 = f33, f55, f112 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f67 = f34, f49, f67 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f83 = f34, f51, f83 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f98 = f34, f52, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f99 = f34, f53, f99 // A3 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f114 = f34, f54, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f115 = f34, f55, f115 // A3 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f66 = f35, f49, f66 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f82 = f35, f51, f82 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f99 = f35, f52, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f98 = f35, f53, f98 // A4 * B6 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f115 = f35, f54, f115 // A4 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f114 = f35, f55, f114 // A4 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f72 = [C1], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f88 = [C2], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_B f97 = f40, f61, f97 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f73 = [C1], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f89 = [C2], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_B f113 = f40, f63, f113 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 + nop __LINE__ + } + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f74 = [C1], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f90 = [C2], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_A f96 = f41, f61, f96 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f75 = [C1], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f91 = [C2], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA_A f112 = f41, f63, f112 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f104 = [C3], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f120 = [C4], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_B f67 = f42, f57, f67 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f105 = [C3], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f121 = [C4], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_B f83 = f42, f59, f83 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f106 = [C3], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f98 = f42, f60, f98 // A3 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f122 = [C4], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_B f99 = f42, f61, f99 // A3 * B6 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f107 = [C3], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f114 = f42, f62, f114 // A3 * B7 + nop __LINE__ + } + { .mfb +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f123 = [C4], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA_B f115 = f42, f63, f115 // A3 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f66 = f43, f57, f66 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f82 = f43, f59, f82 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f99 = f43, f60, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f98 = f43, f61, f98 // A4 * B6 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f115 = f43, f62, f115 // A4 * B7 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA_A f114 = f43, f63, f114 // A4 * B8 + br.cloop.sptk.few .L022 + } + ;; +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + { .mfb + nop __LINE__ + FMA f72 = ALPHA_R, f64, f72 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f88 = ALPHA_R, f80, f88 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_C f73 = ALPHA_R, f65, f73 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f89 = ALPHA_R, f81, f89 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f74 = ALPHA_R, f66, f74 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f90 = ALPHA_R, f82, f90 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_C f75 = ALPHA_R, f67, f75 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f91 = ALPHA_R, f83, f91 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f72 = ALPHA_I, f65, f72 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f88 = ALPHA_I, f81, f88 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f73 = ALPHA_I, f64, f73 + nop __LINE__ + } + { .mfb + FMA f89 = ALPHA_I, f80, f89 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f74 = ALPHA_I, f67, f74 + nop __LINE__ + } + { .mfb + FCALC_D f90 = ALPHA_I, f83, f90 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f75 = ALPHA_I, f66, f75 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f91 = ALPHA_I, f82, f91 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f72, SIZE + FMA f104 = ALPHA_R, f96, f104 + nop __LINE__ + } + { .mfb + STFD [C2] = f88, SIZE + FMA f120 = ALPHA_R, f112, f120 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f73, SIZE + FCALC_C f105 = ALPHA_R, f97, f105 + nop __LINE__ + } + { .mfb + STFD [C2] = f89, SIZE + FCALC_C f121 = ALPHA_R, f113, f121 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f74, SIZE + FMA f106 = ALPHA_R, f98, f106 + nop __LINE__ + } + { .mfb + STFD [C2] = f90, SIZE + FMA f122 = ALPHA_R, f114, f122 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f75, SIZE + FCALC_C f107 = ALPHA_R, f99, f107 + nop __LINE__ + } + { .mfb + STFD [C2] = f91, SIZE + FCALC_C f123 = ALPHA_R, f115, f123 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f104 = ALPHA_I, f97, f104 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f120 = ALPHA_I, f113, f120 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f105 = ALPHA_I, f96, f105 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f121 = ALPHA_I, f112, f121 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f106 = ALPHA_I, f99, f106 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f122 = ALPHA_I, f115, f122 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f107 = ALPHA_I, f98, f107 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f123 = ALPHA_I, f114, f123 + nop __LINE__ + } + ;; + { .mfb + STFD [C3] = f104, SIZE + mov f64 = f0 + nop 
__LINE__ + } + { .mfi + STFD [C4] = f120, SIZE + mov f65 = f0 + } + ;; + { .mfb + STFD [C3] = f105, SIZE + mov f80 = f0 + nop __LINE__ + } + { .mfi + STFD [C4] = f121, SIZE + mov f81 = f0 + } + ;; + { .mfb + STFD [C3] = f106, SIZE + mov f96 = f0 + nop __LINE__ + } + { .mfi + STFD [C4] = f122, SIZE + mov f97 = f0 + } + ;; + { .mfi + STFD [C3] = f107, SIZE + mov f112 = f0 + } + { .mfb + STFD [C4] = f123, SIZE + mov f113 = f0 + nop __LINE__ + } + ;; +#else + { .mfb + nop __LINE__ + FMPY f72 = ALPHA_R, f64 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMPY f88 = ALPHA_R, f80 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_C f73 = ALPHA_R, f65, f0 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f89 = ALPHA_R, f81, f0 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMPY f74 = ALPHA_R, f66 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMPY f90 = ALPHA_R, f82 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_C f75 = ALPHA_R, f67, f0 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f91 = ALPHA_R, f83, f0 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f72 = ALPHA_I, f65, f72 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f88 = ALPHA_I, f81, f88 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f73 = ALPHA_I, f64, f73 + nop __LINE__ + } + { .mfb + FMA f89 = ALPHA_I, f80, f89 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f74 = ALPHA_I, f67, f74 + nop __LINE__ + } + { .mfb + FCALC_D f90 = ALPHA_I, f83, f90 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f75 = ALPHA_I, f66, f75 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f91 = ALPHA_I, f82, f91 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f72, SIZE + FMPY f104 = ALPHA_R, f96 + nop __LINE__ + } + { .mfb + STFD [C2] = f88, SIZE + FMPY f120 = ALPHA_R, f112 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f73, SIZE + FCALC_C f105 = ALPHA_R, f97, f0 + nop __LINE__ + } + { .mfb + STFD [C2] = f89, SIZE + FCALC_C f121 = ALPHA_R, f113, f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f74, SIZE + FMPY f106 = ALPHA_R, f98 + nop __LINE__ + } + { .mfb + STFD [C2] = f90, SIZE + FMPY f122 = ALPHA_R, f114 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f75, SIZE + FCALC_C f107 = ALPHA_R, f99, f0 + nop __LINE__ + } + { .mfb + STFD [C2] = f91, SIZE + FCALC_C f123 = ALPHA_R, f115, f0 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f104 = ALPHA_I, f97, f104 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f120 = ALPHA_I, f113, f120 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f105 = ALPHA_I, f96, f105 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f121 = ALPHA_I, f112, f121 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FCALC_D f106 = ALPHA_I, f99, f106 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfb + nop __LINE__ + FCALC_D f122 = ALPHA_I, f115, f122 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA f107 = ALPHA_I, f98, f107 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -2, L +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMA f123 = ALPHA_I, f114, f123 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -4, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C3] = f104, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, ZBASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C4] = 
f120, SIZE + mov f65 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C3] = f105, SIZE + mov f80 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 1, AOFFSET +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C4] = f121, SIZE + mov f81 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 2, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C3] = f106, SIZE + mov f96 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 2, KK +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C4] = f122, SIZE + mov f97 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C3] = f107, SIZE + mov f112 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, ZBASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C4] = f123, SIZE + mov f113 = f0 + nop __LINE__ + } + ;; +#endif + .align 16 + +.L030: + { .mib +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 1, KK +#else + adds L = 4, KK +#endif +#endif + tbit.z p6, p7 = M, 0 + (p6) br.cond.dptk .L049 + } + ;; +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mfb + LDFPD f48, f49 = [B] + mov f72 = f0 + nop __LINE__ + } + { .mfi + adds BOFFSET = 2 * SIZE, B + mov f73 = f0 +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#else + { .mfi + shladd BOFFSET = KK8, 2, B + mov f72 = f0 + add AOFFSET = KK8, AOFFSET + } + ;; + { .mfi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f73 = f0 +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#endif + { .mmi + nop __LINE__ + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + tbit.z p12, p0 = L, 0 + } + ;; + { .mfi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f88 = f0 + shr L = L, 1 + } + { .mfi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f89 = f0 + nop __LINE__ + } + ;; + { .mfi + LDFPD f52, f53 = [BOFFSET], 2 * SIZE + mov f104 = f0 + adds L = -1, L + } + { .mfb + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + mov f105 = f0 + nop __LINE__ + } + ;; + { .mfi + LDFPD f54, f55 = [BOFFSET], 2 * SIZE + mov f120 = f0 + mov ar.lc = L + } + { .mfi + cmp.eq p3, p0 = r0, r0 + mov f121 = f0 + nop __LINE__ + } + ;; + .align 16 + +.L032: + { .mfb + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f65 = f32, f49, f65 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA_B f81 = f32, f51, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f97 = f32, f53, f97 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f113 = f32, f55, f113 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f64 = f33, f49, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ 
+ } + { .mfb + nop __LINE__ + FMA_A f80 = f33, f51, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f96 = f33, f53, f96 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f112 = f33, f55, f112 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f97 = f40, f61, f97 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f72 = [C1], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f88 = [C2], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_B f113 = f40, f63, f113 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f73 = [C1], - SIZE +#else + nop __LINE__ +#endif + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f89 = [C2], - SIZE +#else + nop __LINE__ +#endif + (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f104 = [C3], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f120 = [C4], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_A f96 = f41, f61, f96 // A2 * B6 + nop __LINE__ + } + ;; + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f105 = [C3], - SIZE +#else + nop __LINE__ +#endif + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + adds L = -1, L + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f121 = [C4], - SIZE +#else + nop __LINE__ +#endif + (p3) FMA_A f112 = f41, f63, f112 // A2 * B8 + br.cloop.sptk.few .L032 + } + ;; +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + { .mfb + nop __LINE__ + FMA f72 = ALPHA_R, f64, f72 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f88 = ALPHA_R, f80, f88 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_C f73 = ALPHA_R, f65, f73 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f89 = ALPHA_R, f81, f89 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f104 = ALPHA_R, f96, f104 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f120 = ALPHA_R, f112, f120 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f105 = ALPHA_R, f97, f105 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f121 = ALPHA_R, f113, f121 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f72 = ALPHA_I, f65, f72 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f88 = ALPHA_I, f81, f88 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f73 = ALPHA_I, f64, f73 + nop __LINE__ + } + { .mfb + FMA f89 = ALPHA_I, f80, f89 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f104 = ALPHA_I, f97, f104 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f120 = ALPHA_I, f113, f120 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f105 = ALPHA_I, f96, f105 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f121 = ALPHA_I, f112, f121 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f72, SIZE + mov f64 = f0 + nop __LINE__ + } + { .mfb + STFD [C2] = f88, SIZE + mov f65 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f73, SIZE + mov f80 = f0 + nop __LINE__ + } + { .mfb + STFD [C2] = f89, SIZE + mov f81 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C3] = f104, SIZE + mov f96 = f0 + nop __LINE__ + } + { .mfi + STFD [C4] = f120, SIZE + mov f97 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C3] = f105, SIZE + mov f112 = f0 + nop __LINE__ + } + { .mfi + STFD [C4] = f121, SIZE + mov f113 = f0 + nop __LINE__ + } + ;; +#else + { .mfb + nop __LINE__ + FMA f72 = ALPHA_R, f64, f0 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f88 = ALPHA_R, f80, f0 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_C f73 = ALPHA_R, f65, f0 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f89 = ALPHA_R, f81, f0 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f104 = ALPHA_R, f96, f0 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f120 = ALPHA_R, f112, f0 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_C f105 = ALPHA_R, f97, f0 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f121 = ALPHA_R, f113, f0 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f72 = ALPHA_I, f65, f72 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f88 = ALPHA_I, f81, f88 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f73 = ALPHA_I, f64, f73 + nop __LINE__ + } + { .mfb + FMA f89 = ALPHA_I, f80, f89 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FCALC_D f104 = ALPHA_I, f97, f104 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfb + nop __LINE__ + FCALC_D f120 = ALPHA_I, f113, f120 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA f105 = ALPHA_I, f96, f105 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -1, L +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMA f121 = ALPHA_I, f112, f121 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -4, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C1] = f72, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && 
!defined(TRANSA))) + shladd KK8 = L, ZBASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C2] = f88, SIZE + mov f65 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C1] = f73, SIZE + mov f80 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + add AOFFSET = KK8, AOFFSET +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C2] = f89, SIZE + mov f81 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 2, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C3] = f104, SIZE + mov f96 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 1, KK +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C4] = f120, SIZE + mov f97 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C3] = f105, SIZE + mov f112 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, ZBASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C4] = f121, SIZE + mov f113 = f0 + nop __LINE__ + } + ;; +#endif + .align 16 + +.L049: + { .mmi + mov B = BOFFSET + mov AOFFSET = A +#if defined(TRMMKERNEL) && !defined(LEFT) + adds KK = 4, KK +#else + nop __LINE__ +#endif + } + { .mmb + nop __LINE__ + cmp.lt p6, p0 = 0, J + (p6) br.cond.dptk .L010 + } + ;; + .align 16 + +.L050: + { .mmi +#if defined(TRMMKERNEL) && defined(LEFT) + mov KK = OFFSET +#else + nop __LINE__ +#endif + shr I = M, 2 + } + { .mib + mov C1 = C + tbit.z p6, p0 = N, 1 + (p6) br.cond.dpnt .L090 + } + ;; + { .mmi + add C2 = LDC, C +#ifdef TRMMKERNEL + shladd KK8 = KK, ZBASE_SHIFT, r0 +#else + nop __LINE__ +#endif + nop __LINE__ + } + { .mib + cmp.eq p6, p7 = 0, I + shladd C = LDC, 1, C + (p6) br.cond.dpnt .L060 + } + ;; + .align 16 + +.L052: +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mfi + LDFPD f48, f49 = [B] + mov f66 = f0 + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + } + { .mfb + adds BOFFSET = 2 * SIZE, B + mov f67 = f0 + nop __LINE__ + } + ;; +#else + { .mfi + shladd BOFFSET = KK8, 1, B + mov f66 = f0 + shladd AOFFSET = KK8, 2, AOFFSET + } + ;; + { .mfi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f67 = f0 + adds PREA = (PREFETCHSIZE + 8) * SIZE, AOFFSET + } + ;; +#endif + { .mfi + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f82 = f0 + adds PREC = CPREFETCHSIZE * SIZE, C1 + } + { .mfi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f83 = f0 +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 4, KK +#else + adds L = 2, KK +#endif +#endif + } + ;; + { .mfi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + mov f98 = f0 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + } + { .mfi + cmp.eq p3, p0 = r0, r0 + mov f99 = f0 +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; + { .mfi + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + mov f114 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + CPREFETCH [PREC], LDC + mov f115 = f0 + shr L = L, 1 + } + ;; + { .mmi + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + adds C5 = 4 * SIZE, C1 + adds L = -1, L + } + ;; + { .mmi + CPREFETCH [PREC], LDC + adds C6 = 4 * SIZE, C2 + mov ar.lc = L + } + ;; + .align 16 + +.L053: + { .mfb + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f65 = f32, f49, f65 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = 
f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA_B f81 = f32, f51, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f96 = f34, f48, f96 // A3 * B1 + nop __LINE__ + } + { .mfi + FMA_B f97 = f34, f49, f97 // A3 * B2 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f112 = f34, f50, f112 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f113 = f34, f51, f113 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f64 = f33, f49, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f80 = f33, f51, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f97 = f35, f48, f97 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f96 = f35, f49, f96 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f113 = f35, f50, f113 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f112 = f35, f51, f112 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f66 = f36, f48, f66 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f67 = f36, f49, f67 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f82 = f36, f50, f82 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f83 = f36, f51, f83 // A5 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f98 = f38, f48, f98 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f99 = f38, f49, f99 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f114 = f38, f50, f114 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f115 = f38, f51, f115 // A7 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f67 = f37, f48, f67 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f66 = f37, f49, f66 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f83 = f37, f50, f83 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f82 = f37, f51, f82 // A6 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f99 = f39, f48, f99 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f98 = f39, f49, f98 // A8 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f115 = f39, f50, f115 // A8 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f114 = f39, f51, f114 // A8 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f96 = f42, f56, f96 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f97 = f42, f57, f97 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f112 = f42, f58, f112 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f113 = f42, f59, f113 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f72 = [C1 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f76 = [C5 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f73 = [C1 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f77 = [C5 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f74 = [C1 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f97 = f43, f56, f97 // A4 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f78 = [C5 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_A f96 = f43, f57, f96 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f75 = [C1 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f113 = f43, f58, f113 // A4 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f79 = [C5 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA_A f112 = f43, f59, f112 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f88 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f66 = f44, f56, f66 // A5 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f92 = [C6 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_B f67 = f44, f57, f67 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f89 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f82 = f44, f58, f82 // A5 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f93 = [C6 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_B f83 = f44, f59, f83 // A5 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f90 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f98 = f46, f56, f98 // A7 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f94 = [C6 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_B f99 = f46, f57, f99 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f91 = [C2 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f114 = f46, f58, f114 // A7 * B3 + nop __LINE__ + } + { .mfb +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f95 = [C6 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA_B f115 = f46, f59, f115 // A7 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f67 = f45, f56, f67 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f66 = f45, f57, f66 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f83 = f45, f58, f83 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f82 = f45, f59, f82 // A6 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f99 = f47, f56, f99 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f98 = f47, f57, f98 // A8 * B2 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f115 = f47, f58, f115 // A8 * B3 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA_A f114 = f47, f59, f114 // A8 * B4 + br.cloop.sptk.few .L053 + } + ;; +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + { .mfb + nop __LINE__ + FMA f72 = ALPHA_R, f64, f72 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f76 = ALPHA_R, f66, f76 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_C f73 = ALPHA_R, f65, f73 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f77 = ALPHA_R, f67, f77 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f74 = ALPHA_R, f96, f74 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f78 = ALPHA_R, f98, f78 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_C f75 = ALPHA_R, f97, f75 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f79 = ALPHA_R, f99, f79 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f72 = ALPHA_I, f65, f72 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f76 = ALPHA_I, f67, f76 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f73 = ALPHA_I, f64, f73 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f77 = ALPHA_I, f66, f77 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f74 = ALPHA_I, f97, f74 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f78 = ALPHA_I, f99, f78 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f75 = ALPHA_I, f96, f75 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f79 = ALPHA_I, f98, f79 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f72, SIZE + FMA f88 = ALPHA_R, f80, f88 + nop __LINE__ + } + { .mfb + STFD [C5] = f76, SIZE + FMA f92 = ALPHA_R, f82, f92 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f73, SIZE + FCALC_C f89 = ALPHA_R, f81, f89 + nop __LINE__ + } + { .mfb + STFD [C5] = f77, SIZE + FCALC_C f93 = ALPHA_R, f83, f93 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f74, SIZE + FMA f90 = ALPHA_R, f112, f90 + nop __LINE__ + } + { .mfb + STFD [C5] = f78, SIZE + FMA f94 = ALPHA_R, f114, f94 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f75, 5 * SIZE + FCALC_C f91 = ALPHA_R, f113, f91 + nop __LINE__ + } + { .mfb + STFD [C5] = f79, 5 * SIZE + FCALC_C f95 = ALPHA_R, f115, f95 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f88 = ALPHA_I, f81, f88 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f92 = ALPHA_I, f83, f92 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f89 = ALPHA_I, f80, f89 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f93 = ALPHA_I, f82, f93 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f90 = ALPHA_I, f113, f90 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f94 = ALPHA_I, f115, f94 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA f91 = ALPHA_I, f112, f91 + cmp.ne p6, p0 = 1, I + } + { .mfb + nop __LINE__ + FMA f95 = ALPHA_I, f114, f95 + nop __LINE__ + } + ;; + { .mfb + STFD [C2] = f88, SIZE + mov f64 
= f0 + nop __LINE__ + } + { .mfb + STFD [C6] = f92, SIZE + mov f65 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C2] = f89, SIZE + mov f80 = f0 + adds I = -1, I + } + { .mfb + STFD [C6] = f93, SIZE + mov f81 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C2] = f90, SIZE + mov f96 = f0 + nop __LINE__ + } + { .mfb + STFD [C6] = f94, SIZE + mov f97 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C2] = f91, 5 * SIZE + mov f112 = f0 + nop __LINE__ + } + { .mfb + STFD [C6] = f95, 5 * SIZE + mov f113 = f0 + (p6) br.cond.dptk .L052 + } + ;; +#else + { .mfb + nop __LINE__ + FMA f72 = ALPHA_R, f64, f0 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f76 = ALPHA_R, f66, f0 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_C f73 = ALPHA_R, f65, f0 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f77 = ALPHA_R, f67, f0 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f74 = ALPHA_R, f96, f0 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f78 = ALPHA_R, f98, f0 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_C f75 = ALPHA_R, f97, f0 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f79 = ALPHA_R, f99, f0 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f72 = ALPHA_I, f65, f72 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f76 = ALPHA_I, f67, f76 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f73 = ALPHA_I, f64, f73 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f77 = ALPHA_I, f66, f77 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f74 = ALPHA_I, f97, f74 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f78 = ALPHA_I, f99, f78 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f75 = ALPHA_I, f96, f75 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f79 = ALPHA_I, f98, f79 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f72, SIZE + FMA f88 = ALPHA_R, f80, f0 + nop __LINE__ + } + { .mfb + STFD [C5] = f76, SIZE + FMA f92 = ALPHA_R, f82, f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f73, SIZE + FCALC_C f89 = ALPHA_R, f81, f0 + nop __LINE__ + } + { .mfb + STFD [C5] = f77, SIZE + FCALC_C f93 = ALPHA_R, f83, f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f74, SIZE + FMA f90 = ALPHA_R, f112, f0 + nop __LINE__ + } + { .mfb + STFD [C5] = f78, SIZE + FMA f94 = ALPHA_R, f114, f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f75, 5 * SIZE + FCALC_C f91 = ALPHA_R, f113, f0 + nop __LINE__ + } + { .mfb + STFD [C5] = f79, 5 * SIZE + FCALC_C f95 = ALPHA_R, f115, f0 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f88 = ALPHA_I, f81, f88 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f92 = ALPHA_I, f83, f92 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f89 = ALPHA_I, f80, f89 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f93 = ALPHA_I, f82, f93 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FCALC_D f90 = ALPHA_I, f113, f90 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FCALC_D f94 = ALPHA_I, f115, f94 + cmp.ne p6, p0 = 1, I + } + ;; + { .mfi + nop __LINE__ + FMA f91 = ALPHA_I, f112, f91 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -4, L +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMA f95 = ALPHA_I, f114, f95 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -2, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C2] = f88, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || 
(!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, ZBASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C6] = f92, SIZE + mov f65 = f0 + adds I = -1, I + } + ;; + { .mfi + STFD [C2] = f89, SIZE + mov f80 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 2, AOFFSET +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C6] = f93, SIZE + mov f81 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 1, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C2] = f90, SIZE + mov f96 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 4, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C6] = f94, SIZE + mov f97 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C2] = f91, 5 * SIZE + mov f112 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, ZBASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C6] = f95, 5 * SIZE + mov f113 = f0 + (p6) br.cond.dptk .L052 + } + ;; +#endif + .align 16 + +.L060: + { .mib +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 2, KK +#else + adds L = 2, KK +#endif +#endif + tbit.z p6, p7 = M, 1 + (p6) br.cond.dptk .L070 + } + ;; +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mmi + LDFPD f48, f49 = [B] + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + nop __LINE__ + } + { .mmi + adds BOFFSET = 2 * SIZE, B + cmp.eq p3, p0 = r0, r0 +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#else + { .mmi + shladd BOFFSET = KK8, 1, B + shladd AOFFSET = KK8, 1, AOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mmi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#endif + { .mmi + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + tbit.z p12, p0 = L, 0 + } + { .mmi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + shr L = L, 1 + } + ;; + { .mmi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + nop __LINE__ + adds L = -1, L + } + ;; + { .mmi + nop __LINE__ + nop __LINE__ + mov ar.lc = L + } + ;; + .align 16 + +.L062: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA_B f65 = f32, f49, f65 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfb + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f81 = f32, f51, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f34, f48, f96 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f97 = f34, f49, f97 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f112 = f34, f50, f112 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f113 = f34, f51, f113 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f64 = f33, f49, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A 
f80 = f33, f51, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f97 = f35, f48, f97 // A4 * B1 + } + { .mfb + FMA_A f96 = f35, f49, f96 // A4 * B2 + nop __LINE__ + } + + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f113 = f35, f50, f113 // A4 * B3 + nop __LINE__ + } + { .mfb + FMA_A f112 = f35, f51, f112 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f72 = [C1 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f96 = f42, f56, f96 // A3 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f88 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_B f97 = f42, f57, f97 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f73 = [C1 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f112 = f42, f58, f112 // A3 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f89 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_B f113 = f42, f59, f113 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f74 = [C1 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f90 = [C2 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f75 = [C1 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f91 = [C2 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f97 = f43, f56, f97 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f96 = f43, f57, f96 // A4 * B2 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f113 = f43, f58, f113 // A4 * B3 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA_A f112 = f43, f59, f112 // A4 * B4 + br.cloop.sptk.few .L062 + } + ;; +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + { .mfb + nop __LINE__ + FMA f72 = ALPHA_R, f64, f72 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f88 = ALPHA_R, f80, f88 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_C f73 = ALPHA_R, f65, f73 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f89 = ALPHA_R, f81, f89 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f74 = ALPHA_R, f96, f74 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f90 = ALPHA_R, f112, f90 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_C f75 = ALPHA_R, f97, f75 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f91 = ALPHA_R, f113, f91 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f72 = ALPHA_I, f65, f72 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f88 = ALPHA_I, f81, f88 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f73 = ALPHA_I, f64, f73 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f89 = ALPHA_I, f80, f89 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f74 = ALPHA_I, f97, f74 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f90 = ALPHA_I, f113, f90 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f75 = ALPHA_I, f96, f75 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f91 = ALPHA_I, f112, f91 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f72, SIZE + mov f64 = f0 + nop __LINE__ + } + { .mfb + STFD [C2] = f88, SIZE + mov f65 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f73, SIZE + mov f80 = f0 + nop __LINE__ + } + { .mfb + STFD [C2] = f89, SIZE + mov f81 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C1] = f74, SIZE + mov f96 = f0 + adds L = 1, K + } + { .mfb + STFD [C2] = f90, SIZE + mov f97 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C1] = f75, SIZE + mov f112 = f0 + shr L = L, 1 + } + { .mfb + STFD [C2] = f91, SIZE + mov f113 = f0 + nop __LINE__ + } + ;; +#else + { .mfb + nop __LINE__ + FMA f72 = ALPHA_R, f64, f0 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f88 = ALPHA_R, f80, f0 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_C f73 = ALPHA_R, f65, f0 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f89 = ALPHA_R, f81, f0 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f74 = ALPHA_R, f96, f0 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f90 = ALPHA_R, f112, f0 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_C f75 = ALPHA_R, f97, f0 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f91 = ALPHA_R, f113, f0 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f72 = ALPHA_I, f65, f72 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f88 = ALPHA_I, f81, f88 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f73 = ALPHA_I, f64, f73 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f89 = ALPHA_I, f80, f89 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FCALC_D f74 = ALPHA_I, f97, f74 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfb + nop __LINE__ + FCALC_D f90 = ALPHA_I, f113, f90 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA f75 = ALPHA_I, f96, f75 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -2, L +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMA f91 = ALPHA_I, f112, f91 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -2, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C1] = f72, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) 
&& !defined(TRANSA))) + shladd KK8 = L, ZBASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C2] = f88, SIZE + mov f65 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C1] = f73, SIZE + mov f80 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 1, AOFFSET +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C2] = f89, SIZE + mov f81 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 1, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C1] = f74, SIZE + mov f96 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 2, KK +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C2] = f90, SIZE + mov f97 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C1] = f75, SIZE + mov f112 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, ZBASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C2] = f91, SIZE + mov f113 = f0 + nop __LINE__ + } + ;; +#endif + .align 16 + +.L070: + { .mib +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 1, KK +#else + adds L = 2, KK +#endif +#endif + tbit.z p6, p7 = M, 0 + (p6) br.cond.dptk .L089 + } + ;; +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mmi + LDFPD f48, f49 = [B] + adds BOFFSET = 2 * SIZE, B +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#else + { .mmi + shladd BOFFSET = KK8, 1, B + add AOFFSET = KK8, AOFFSET + nop __LINE__ + } + ;; + { .mmi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE + nop __LINE__ +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#endif + ;; + { .mii + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + LDFPD f50, f51 = [BOFFSET], 2 * SIZE + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + adds L = -1, L + } + ;; + { .mmi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + cmp.eq p3, p0 = r0, r0 + mov ar.lc = L + } + ;; + .align 16 + +.L072: + { .mfb + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f96 = f32, f49, f96 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA f112 = f32, f51, f112 // A1 * B4 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + } + { .mfi + nop __LINE__ + FMA f97 = f33, f49, f97 // A2 * B2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f113 = f33, f51, f113 // A2 * B4 + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + (p3) FMA f96 = f40, f57, f96 // A1 * B2 + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mmf +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f72 = [C1 ], SIZE + (p5) LDFD f88 = [C2 ], SIZE +#else + nop __LINE__ + nop __LINE__ +#endif + (p3) FMA f112 = f40, f59, f112 // A1 * B4 + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f73 = [C1 ], - SIZE +#else + nop __LINE__ +#endif + (p3) FMA f97 = f41, f57, f97 // A2 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + adds L = -1, L + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f89 = [C2 ], - SIZE +#else + nop __LINE__ +#endif + (p3) FMA f113 = f41, f59, f113 // A2 * B4 + br.cloop.sptk.few .L072 + } + ;; + { .mfb + nop __LINE__ + FCALC_A f64 = f64, f97 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_A f80 = f80, f113 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_B f65 = f65, f96 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_B f81 = f81, f112 + nop __LINE__ + } + ;; +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + { .mfb + setf.d f96 = r0 + FMA f72 = ALPHA_R, f64, f72 + nop __LINE__ + } + { .mfb + setf.d f97 = r0 + FMA f88 = ALPHA_R, f80, f88 + nop __LINE__ + } + ;; + { .mfb + setf.d f112 = r0 + FCALC_C f73 = ALPHA_R, f65, f73 + nop __LINE__ + } + { .mfb + setf.d f113 = r0 + FCALC_C f89 = ALPHA_R, f81, f89 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f72 = ALPHA_I, f65, f72 + nop __LINE__ + } + { .mfb + setf.d f65 = r0 + FCALC_D f88 = ALPHA_I, f81, f88 + nop __LINE__ + } + ;; + { .mfb + setf.d f81 = r0 + FMA f73 = ALPHA_I, f64, f73 + nop __LINE__ + } + { .mfb + setf.d f64 = r0 + FMA f89 = ALPHA_I, f80, f89 + nop __LINE__ + } + ;; + { .mmf + STFD [C1] = f72, SIZE + STFD [C2] = f88, SIZE + mov f80 = f0 + } + ;; + { .mmi + STFD [C1] = f73, SIZE + STFD [C2] = f89, SIZE + mov B = BOFFSET + } + ;; +#else + { .mfi + setf.d f96 = r0 + FMA f72 = ALPHA_R, f64, f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfb + setf.d f97 = r0 + FMA f88 = ALPHA_R, f80, f0 + nop __LINE__ + } + ;; + { .mfi + setf.d f112 = r0 + FCALC_C f73 = ALPHA_R, f65, f0 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -1, L +#else + nop __LINE__ +#endif + } + { .mfi + setf.d f113 = r0 + FCALC_C f89 = ALPHA_R, f81, f0 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -2, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + nop __LINE__ + FCALC_D f72 = ALPHA_I, f65, f72 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, ZBASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + setf.d f65 = r0 + FCALC_D f88 = ALPHA_I, f81, f88 + nop __LINE__ + } + ;; + { .mfi + setf.d f81 = r0 + FMA f73 = ALPHA_I, f64, f73 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + add AOFFSET = KK8, AOFFSET +#else + nop __LINE__ +#endif + } + { .mfi + setf.d f64 = r0 + FMA f89 = ALPHA_I, f80, f89 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd BOFFSET = KK8, 1, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + mov f80 = f0 + } + ;; + { .mmi + STFD [C1] = f72, SIZE + STFD [C2] = f88, SIZE +#if defined(TRMMKERNEL) && 
defined(LEFT) + adds KK = 1, KK +#else + nop __LINE__ +#endif + } + ;; + { .mmi + STFD [C1] = f73, SIZE + STFD [C2] = f89, SIZE +#ifdef TRMMKERNEL + shladd KK8 = KK, ZBASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } +#endif + ;; + .align 16 + +.L089: + { .mmi + mov B = BOFFSET + mov AOFFSET = A +#if defined(TRMMKERNEL) && !defined(LEFT) + adds KK = 2, KK +#else + nop __LINE__ +#endif + } + ;; + .align 16 + +.L090: + { .mfi + mov C1 = C + mov f64 = f0 + tbit.z p6, p0 = N, 0 + } + { .mfi +#if defined(TRMMKERNEL) && defined(LEFT) + mov KK = OFFSET +#else + nop __LINE__ +#endif + mov f72 = f0 + shr I = M, 2 + } + ;; + { .mfi + setf.d f66 = r0 + mov f65 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, ZBASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + mov AOFFSET = A + mov f73 = f0 + (p6) br.cond.dpnt .L999 + } + ;; + { .mfi + setf.d f74 = r0 + mov f67 = f0 + nop __LINE__ + } + { .mfb + cmp.eq p6, p7 = 0, I + mov f75 = f0 + (p6) br.cond.dpnt .L100 + } + ;; + .align 16 + +.L092: +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mfb + LDFPD f48, f49 = [B] + nop __LINE__ + } + { .mfi + adds BOFFSET = 2 * SIZE, B +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 4, KK +#else + adds L = 1, KK +#endif +#endif + } + ;; +#else + { .mfi + add BOFFSET = KK8, B + shladd AOFFSET = KK8, 2, AOFFSET + } + ;; + { .mfi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 4, KK +#else + adds L = 1, KK +#endif +#endif + } + ;; +#endif + { .mfi + LDFPD f32, f33 = [AOFFSET], 2 * SIZE +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; + { .mfi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + } + { .mfi + adds PREC = CPREFETCHSIZE * SIZE, C1 + shr L = L, 1 + } + ;; + { .mfi + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + adds L = -1, L + } + { .mmf + LDFPD f36, f37 = [AOFFSET], 2 * SIZE + CPREFETCH [PREC] + } + ;; + { .mfi + LDFPD f38, f39 = [AOFFSET], 2 * SIZE + mov ar.lc = L + } + { .mmi + adds C5 = 4 * SIZE, C1 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + .align 16 + +.L093: +/* 1 */ + { .mfi + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA_B f65 = f32, f49, f65 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 4 * SIZE + FMA f80 = f34, f48, f80 // A3 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f81 = f34, f49, f81 // A3 * B2 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f36, f48, f96 // A5 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f97 = f36, f49, f97 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f112 = f38, f48, f112 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f113 = f38, f49, f113 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f64 = f33, f49, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f81 = f35, f48, f81 // A4 * B1 + nop __LINE__ + } + { .mfb + nop 
__LINE__ + FMA_A f80 = f35, f49, f80 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f97 = f37, f48, f97 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f96 = f37, f49, f96 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f113 = f39, f48, f113 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f112 = f39, f49, f112 // A8 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f42, f56, f80 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f81 = f42, f57, f81 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f96 = f44, f56, f96 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f97 = f44, f57, f97 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f112 = f46, f56, f112 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f113 = f46, f57, f113 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f72 = [C1 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f76 = [C5 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f73 = [C1 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f81 = f43, f56, f81 // A4 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f77 = [C5 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_A f80 = f43, f57, f80 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f74 = [C1 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA f97 = f45, f56, f97 // A6 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f78 = [C5 ], SIZE +#else + nop __LINE__ +#endif + (p3) FMA_A f96 = f45, f57, f96 // A6 * B2 + nop __LINE__ + } + ;; + { .mfi +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f75 = [C1 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f113 = f47, f56, f113 // A8 * B1 + adds L = -1, L + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f79 = [C5 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA_A f112 = f47, f57, f112 // A8 * B2 + br.cloop.sptk.few .L093 + } + ;; +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + { .mfb + nop __LINE__ + FMA f72 = ALPHA_R, f64, f72 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f76 = ALPHA_R, f96, f76 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f73 = ALPHA_R, f65, f73 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f77 = ALPHA_R, f97, f77 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f74 = ALPHA_R, f80, f74 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f78 = ALPHA_R, f112, f78 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f75 = ALPHA_R, f81, f75 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f79 = ALPHA_R, f113, f79 + nop __LINE__ + } + ;; + + { .mfb + nop __LINE__ + FCALC_D f72 = ALPHA_I, f65, f72 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f76 = ALPHA_I, f97, f76 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f73 = ALPHA_I, f64, f73 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f77 = ALPHA_I, f96, f77 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f74 = ALPHA_I, f81, f74 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f78 = ALPHA_I, f113, f78 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f75 = ALPHA_I, f80, f75 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f79 = ALPHA_I, f112, f79 + nop __LINE__ + } + ;; + { .mfi + STFD [C1] = f72, SIZE + mov f64 = f0 + cmp.ne p6, p0 = 1, I + } + { .mfb + STFD [C5] = f76, SIZE + mov f65 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C1] = f73, SIZE + mov f80 = f0 + adds I = -1, I + } + { .mfb + STFD [C5] = f77, SIZE + mov f81 = f0 + nop __LINE__ + } + ;; + { .mfb + STFD [C1] = f74, SIZE + mov f96 = f0 + nop __LINE__ + } + { .mfb + STFD [C5] = f78, SIZE + mov f97 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C1] = f75, 5 * SIZE + mov f112 = f0 + } + { .mfb + STFD [C5] = f79, 5 * SIZE + mov f113 = f0 + (p6) br.cond.dptk .L092 + } + ;; +#else + { .mfb + nop __LINE__ + FMA f6 = ALPHA_R, f64, f0 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f76 = ALPHA_R, f96, f0 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f73 = ALPHA_R, f65, f0 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f77 = ALPHA_R, f97, f0 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f74 = ALPHA_R, f80, f0 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f78 = ALPHA_R, f112, f0 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f75 = ALPHA_R, f81, f0 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f79 = ALPHA_R, f113, f0 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f6 = ALPHA_I, f65, f6 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_D f76 = ALPHA_I, f97, f76 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f73 = ALPHA_I, f64, f73 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f77 = ALPHA_I, f96, f77 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FCALC_D f74 = ALPHA_I, f81, f74 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfb + nop __LINE__ + FCALC_D f78 = ALPHA_I, f113, f78 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA f75 = ALPHA_I, f80, f75 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -4, L +#else + nop __LINE__ +#endif + } + { .mfi + nop __LINE__ + FMA f79 = ALPHA_I, f112, f79 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -1, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C1] = f6, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = 
L, ZBASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C5] = f76, SIZE + mov f65 = f0 + cmp.ne p6, p0 = 1, I + } + ;; + { .mfi + STFD [C1] = f73, SIZE + mov f80 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 2, AOFFSET +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C5] = f77, SIZE + mov f81 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + add BOFFSET = KK8, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C1] = f74, SIZE + mov f96 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 4, KK +#else + nop __LINE__ +#endif + } + { .mfi + STFD [C5] = f78, SIZE + mov f97 = f0 + adds I = -1, I + } + ;; + { .mfi + STFD [C1] = f75, 5 * SIZE + mov f112 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, ZBASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + STFD [C5] = f79, 5 * SIZE + mov f113 = f0 + (p6) br.cond.dptk .L092 + } + ;; +#endif + .align 16 + +.L100: + { .mib +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 2, KK +#else + adds L = 1, KK +#endif +#endif + tbit.z p6, p7 = M, 1 + (p6) br.cond.dptk .L110 + } + ;; +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mmi + LDFPD f48, f49 = [B] + adds BOFFSET = 2 * SIZE, B +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#else + { .mii + add BOFFSET = KK8, B + shladd AOFFSET = KK8, 1, AOFFSET + nop __LINE__ + } + ;; + { .mfi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#endif + { .mii + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + LDFPD f34, f35 = [AOFFSET], 2 * SIZE + nop __LINE__ + adds L = -1, L + } + ;; + { .mmi + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + mov ar.lc = L + } + ;; + .align 16 + +.L102: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + FMA f80 = f32, f49, f80 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfb + lfetch.nt1 [PREB], 4 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f72 = [C1 ], SIZE +#else + nop __LINE__ +#endif + FMA f81 = f33, f49, f81 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f34, f48, f96 // A3 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f73 = [C1 ], SIZE +#else + nop __LINE__ +#endif + FMA f112 = f34, f49, f112 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f97 = f35, f48, f97 // A4 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f74 = [C1 ], SIZE +#else + nop __LINE__ +#endif + FMA f113 = f35, f49, f113 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f75 = [C1 ], -3 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f80 = f40, f57, f80 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f81 = f41, f57, f81 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f42, f56, f96 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f112 = f42, f57, f112 // A3 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f97 = f43, f56, f97 // A4 * B1 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f113 = f43, f57, f113 // A4 * B2 + br.cloop.sptk.few .L102 + } + ;; + { .mfb + nop __LINE__ + FCALC_A f64 = f64, f81 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_B f65 = f65, f80 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_A f96 = f96, f113 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_B f97 = f97, f112 + nop __LINE__ + } + ;; +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + { .mfb + nop __LINE__ + FMA f72 = ALPHA_R, f64, f72 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f73 = ALPHA_R, f65, f73 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f74 = ALPHA_R, f96, f74 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f75 = ALPHA_R, f97, f75 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f72 = ALPHA_I, f65, f72 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f73 = ALPHA_I, f64, f73 + nop __LINE__ + } + { .mfb + setf.d f112 = r0 + FCALC_D f74 = ALPHA_I, f97, f74 + nop __LINE__ + } + { .mfb + setf.d f113 = r0 + FMA f75 = ALPHA_I, f96, f75 + nop __LINE__ + } + ;; + { .mmf + STFD [C1] = f72, SIZE + setf.d f97 = r0 + mov f64 = f0 + } + ;; + { .mmf + STFD [C1] = f73, SIZE + setf.d f96 = r0 + mov f80 = f0 + } + ;; + { .mfi + STFD [C1] = f74, SIZE + mov f65 = f0 + adds L = 1, K + } + ;; + { .mfi + STFD [C1] = f75, SIZE + mov f81 = f0 + shr L = L, 1 + } + ;; +#else + { .mfb + nop __LINE__ + FMA f72 = ALPHA_R, f64, f0 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f73 = ALPHA_R, f65, f0 + nop __LINE__ + } + ;; + { .mfi + setf.d f112 = r0 + FMA f74 = ALPHA_R, f96, f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + sub L = K, KK +#else + nop __LINE__ +#endif + } + { .mfb + setf.d f113 = r0 + FCALC_C f75 = ALPHA_R, f97, f0 + nop __LINE__ + } + ;; + { .mfi + setf.d f97 = r0 + FCALC_D f72 = ALPHA_I, f65, f72 +#if defined(TRMMKERNEL) && (defined(LEFT) && defined(TRANSA)) + adds L = -2, L +#else + nop __LINE__ +#endif + } + { .mfi + setf.d f96 = r0 + FMA f73 = ALPHA_I, f64, f73 +#if defined(TRMMKERNEL) && (!defined(LEFT) && !defined(TRANSA)) + adds L = -1, L +#else + nop __LINE__ +#endif + } + ;; + { .mfi + nop __LINE__ + FCALC_D f74 = ALPHA_I, f97, f74 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd KK8 = L, ZBASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + { .mfb + nop __LINE__ + FMA f75 = ALPHA_I, f96, f75 + nop __LINE__ + } + ;; + { .mfi + STFD [C1] = f72, SIZE + mov f64 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + shladd AOFFSET = KK8, 1, AOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C1] = f73, SIZE + mov f80 = f0 +#if defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && 
!defined(TRANSA))) + add BOFFSET = KK8, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C1] = f74, SIZE + mov f65 = f0 +#if defined(TRMMKERNEL) && defined(LEFT) + adds KK = 2, KK +#else + nop __LINE__ +#endif + } + ;; + { .mfi + STFD [C1] = f75, SIZE + mov f81 = f0 +#ifdef TRMMKERNEL + shladd KK8 = KK, ZBASE_SHIFT, r0 +#else + nop __LINE__ +#endif + } + ;; +#endif + .align 16 + +.L110: + { .mib +#ifndef TRMMKERNEL + nop __LINE__ +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub L = K, KK +#elif defined(LEFT) + adds L = 1, KK +#else + adds L = 1, KK +#endif +#endif + tbit.z p6, p7 = M, 0 + (p6) br.cond.dptk .L119 + } + ;; +#if !defined(TRMMKERNEL) || \ + defined(TRMMKERNEL) && \ + ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))) + { .mmi + LDFPD f48, f49 = [B] + adds BOFFSET = 2 * SIZE, B +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#else + { .mii + add BOFFSET = KK8, B + add AOFFSET = KK8, AOFFSET + nop __LINE__ + } + ;; + { .mfi + LDFPD f48, f49 = [BOFFSET], 2 * SIZE +#ifndef TRMMKERNEL + adds L = 1, K +#else + adds L = 1, L +#endif + } + ;; +#endif + ;; + { .mii + nop __LINE__ + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + LDFPD f32, f33 = [AOFFSET], 2 * SIZE + cmp.eq p3, p0 = r0, r0 + adds L = -1, L + } + ;; + { .mmi + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + mov ar.lc = L + } + ;; + .align 16 + +.L112: + { .mfi + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + lfetch.nt1 [PREB], 4 * SIZE + FMA f80 = f32, f49, f80 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mmf + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + } + { .mmf +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f72 = [C1 ], SIZE +#else + nop __LINE__ +#endif + nop __LINE__ + FMA f81 = f33, f49, f81 // A2 * B2 + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + (p5) LDFD f73 = [C1 ], -1 * SIZE +#else + nop __LINE__ +#endif + (p3) FMA f80 = f40, f57, f80 // A1 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + adds L = -1, L + } + { .mfb + (p3) FMA f81 = f41, f57, f81 // A2 * B2 + br.cloop.sptk.few .L112 + } + ;; + { .mfb + nop __LINE__ + FCALC_A f64 = f64, f81 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_B f65 = f65, f80 + nop __LINE__ + } + ;; +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + { .mfb + nop __LINE__ + FMA f72 = ALPHA_R, f64, f72 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f73 = ALPHA_R, f65, f73 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f72 = ALPHA_I, f65, f72 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f73 = ALPHA_I, f64, f73 + nop __LINE__ + } + ;; + { .mmf + STFD [C1] = f72, SIZE + setf.d f64 = r0 + mov f80 = f0 + } + ;; + { .mmf + STFD [C1] = f73, SIZE + setf.d f65 = r0 + mov f81 = f0 + } + ;; +#else + { .mfb + nop __LINE__ + FMA f72 = ALPHA_R, f64, f0 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_C f73 = ALPHA_R, f65, f0 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FCALC_D f72 = ALPHA_I, f65, f72 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f73 = ALPHA_I, f64, f73 + nop __LINE__ + } + ;; + { .mmf + STFD [C1] = f72, SIZE + setf.d f64 = r0 + mov f80 = f0 + } + ;; + { .mmf + STFD [C1] = f73, SIZE + setf.d f65 = r0 + mov f81 = f0 + } + ;; +#endif + .align 16 + +.L119: + { .mmi + mov B = BOFFSET + mov AOFFSET = A +#if defined(TRMMKERNEL) && !defined(LEFT) + adds KK = 1, KK +#else + nop __LINE__ +#endif + } + ;; + .align 16 + +.L999: + { .mii + nop __LINE__ + mov ar.lc = ARLC + mov pr = PR, -1 + } + { .mib + nop __LINE__ +#ifdef TRMMKERNEL + mov ar.pfs = ARPFS +#else + nop __LINE__ +#endif + br.ret.sptk.many b0 + } + EPILOGUE + diff --git a/kernel/ia64/zgemm_ncopy.S b/kernel/ia64/zgemm_ncopy.S new file mode 100644 index 0000000000..e7950e9909 --- /dev/null +++ b/kernel/ia64/zgemm_ncopy.S @@ -0,0 +1,854 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define PREFETCHSIZE 64 +#define WPREFETCHSIZE 32 + +#define LD LDF8 +#define ST STF8_NTA + +#define TEMP r2 + +#define I r14 +#define J r15 +#define PREB r16 +#define PREA r17 + +#define A1 r18 +#define A2 r19 +#define A3 r20 +#define A4 r21 +#define A5 r22 +#define A6 r23 +#define A7 r24 +#define A8 r25 +#define B1 r26 + +#define COUNT r28 + +#define ARLC r30 +#define PR r31 + +#define M r32 +#define N r33 +#define A r34 +#define LDA r35 +#define B r36 + + PROLOGUE + .prologue + PROFCODE + + .body + { .mii + shladd LDA= LDA, ZBASE_SHIFT, r0 + mov PR = pr + shr J = N, 2 + } + ;; + { .mii + mov COUNT=r0 + tbit.nz p10, p0 =M, 1 + tbit.nz p11, p0 =M, 0 + } + ;; + { .mmb + nop __LINE__ + nop __LINE__ + nop __LINE__ + } + { .mib + cmp.eq p8,p0 = 0, J + mov ARLC = ar.lc + (p8) br.cond.dpnt .L20 + } + ;; + .align 32 + +.L11: + { .mmi + mov A1 = A + add A2 = A, LDA + mov pr.rot = 0 + } + { .mmi + shladd A3 = LDA, 1, A + adds B1 = 4 * SIZE, B + shr I = M, 2 + } + ;; + { .mmi + shladd A4 = LDA, 1, A2 + cmp.eq p16,p0 = r0, r0 + mov ar.ec = 3 + } + { .mmi + cmp.eq p6,p0 = 0,I + adds I =-1, I + adds J =-1, J + } + ;; + { .mmi + shladd A = LDA, 2, A + adds A5 = 4 * SIZE, A1 + adds A6 = 4 * SIZE, A2 + } + { .mmi + adds A7 = 4 * SIZE, A3 + adds A8 = 4 * SIZE, A4 + adds PREA = PREFETCHSIZE * SIZE,A1 + } + ;; + { .mmb + nop __LINE__ + nop __LINE__ + nop __LINE__ + } + { .mib + adds PREB = WPREFETCHSIZE * SIZE, B + mov ar.lc = I + (p6) br.cond.dpnt.few .L15 + } + ;; + .align 32 + +.L12: + { .mmb + (p16) lfetch.nt1 [PREA], LDA + (p16) lfetch.excl.nt1 [PREB], 16 * SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B ] = f34, SIZE + (p18) ST [B1] = f82, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f32 = [A1], SIZE + (p16) LD f35 = [A5], SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B ] = f40, SIZE + (p18) ST [B1] = f88, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f38 = [A1], SIZE + (p16) LD f41 = [A5], SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B ] = f58, SIZE + (p18) ST [B1] = f106, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f44 = [A1], SIZE + (p16) LD f47 = [A5], SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B ] = f64, 5 * SIZE + (p18) ST [B1] = f112, 5 * SIZE + tbit.z p0,p7 = COUNT,0 + } + { .mmb + (p16) LD f50 = [A1], 5 * SIZE + (p16) LD f53 = [A5], 5 * SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B ] = f46, SIZE + (p18) ST [B1] = f94, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f56 = [A2], SIZE + (p16) LD f59 = [A6], SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B ] = f52, SIZE + (p18) ST [B1] = f100, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f62 = [A2], SIZE + (p16) LD f65 = [A6], SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B ] = f70, SIZE + (p18) ST [B1] = f118, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f68 = [A2], SIZE + (p16) LD f71 = [A6], SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B ] = f76, 5 * SIZE + (p18) ST [B1] = f124, 5 * SIZE + shladd TEMP = LDA, 2, r0 + } + { .mmb + (p16) LD f74 = [A2], 5 * SIZE + (p16) LD f77 = [A6], 5 * SIZE + nop __LINE__ + } + ;; + { .mmb + (p16) lfetch.nt1 [PREA], LDA + (p16) lfetch.excl.nt1 [PREB], 16 * SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B ] = f37, SIZE + (p18) ST [B1] = f85, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f80 = [A3], SIZE + (p16) LD f83 = [A7], SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B ] = f43, SIZE + (p18) ST [B1] = f91, SIZE + adds TEMP = -16 * SIZE, TEMP + } + { .mmb + 
(p16) LD f86 = [A3], SIZE + (p16) LD f89 = [A7], SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B ] = f61, SIZE + (p18) ST [B1] = f109, SIZE + (p7) sub PREA = PREA, TEMP + } + { .mmb + (p16) LD f92 = [A3], SIZE + (p16) LD f95 = [A7], SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B ] = f67, 5 * SIZE + (p18) ST [B1] = f115, 5 * SIZE + nop __LINE__ + } + { .mmb + (p16) LD f98 = [A3], 5 * SIZE + (p16) LD f101 = [A7], 5 * SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B ] = f49, SIZE + (p18) ST [B1] = f97, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f104 = [A4], SIZE + (p16) LD f107 = [A8], SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B ] = f55, SIZE + (p18) ST [B1] = f103, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f110 = [A4], SIZE + (p16) LD f113 = [A8], SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B ] = f73, SIZE + (p18) ST [B1] = f121, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f116 = [A4], SIZE + (p16) LD f119 = [A8], SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B ] = f79, 5 * SIZE + (p18) ST [B1] = f127, 5 * SIZE + (p16) adds COUNT = 1, COUNT + } + { .mmb + (p16) LD f122 = [A4], 5 * SIZE + (p16) LD f125 = [A8], 5 * SIZE + br.ctop.sptk.few .L12 + } + ;; + .align 32 + +.L15: + { .mmb + (p10) LD f32 = [A1], SIZE + (p10) LD f40 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) LD f33 = [A1], SIZE + (p10) LD f41 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) LD f34 = [A1], SIZE + (p10) LD f42 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) LD f35 = [A1], SIZE + (p10) LD f43 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) LD f50 = [A3], SIZE + (p10) LD f60 = [A4], SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) LD f51 = [A3], SIZE + (p10) LD f61 = [A4], SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) LD f52 = [A3], SIZE + (p10) LD f62 = [A4], SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) LD f53 = [A3], SIZE + (p10) LD f63 = [A4], SIZE + nop __LINE__ + } + ;; + { .mmb + (p11) LD f36 = [A1], SIZE + (p11) LD f44 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmb + (p11) LD f37 = [A1] + (p11) LD f45 = [A2] + nop __LINE__ + } + ;; + { .mmb + (p11) LD f54 = [A3], SIZE + (p11) LD f64 = [A4], SIZE + nop __LINE__ + } + ;; + { .mmb + (p11) LD f55 = [A3] + (p11) LD f65 = [A4] + nop __LINE__ + } + ;; + { .mmb + (p10) ST [B ] = f32, SIZE + (p10) ST [B1] = f50, SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) ST [B ] = f33, SIZE + (p10) ST [B1] = f51, SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) ST [B ] = f40, SIZE + (p10) ST [B1] = f60, SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) ST [B ] = f41, 5 * SIZE + (p10) ST [B1] = f61, 5 * SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) ST [B ] = f34, SIZE + (p10) ST [B1] = f52, SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) ST [B ] = f35, SIZE + (p10) ST [B1] = f53, SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) ST [B ] = f42, SIZE + (p10) ST [B1] = f62, SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) ST [B ] = f43, 5 * SIZE + (p10) ST [B1] = f63, 5 * SIZE + nop __LINE__ + } + ;; + { .mmb + (p11) ST [B ] = f36, SIZE + (p11) ST [B1] = f54, SIZE + nop __LINE__ + } + ;; + { .mmi + (p11) ST [B ] = f37, SIZE + (p11) ST [B1] = f55, SIZE + mov COUNT = r0 + } + ;; + { .mmi + (p11) ST [B ] = f44, SIZE + (p11) ST [B1] = f64, SIZE + cmp.eq p0,p6 = 0,J + } + ;; + { .mmb + (p11) ST [B ] = f45, 5 * SIZE + (p11) ST [B1] = f65, 5 * SIZE + (p6) br.cond.dptk.few .L11 + } + ;; + .align 32 + +.L20: + { .mmi + mov A1 = A + add A2 = A,LDA + mov pr.rot = 0 + } + { .mmi + adds A5 = 4 * SIZE, A + adds B1 = 4 * SIZE, B + tbit.z p8, p0 = N, 1 + } + ;; + { 
.mmi + cmp.eq p16,p0 = r0,r0 + adds PREA = PREFETCHSIZE * SIZE, A + mov ar.ec = 3 + } + ;; + { .mib + adds PREB = WPREFETCHSIZE * SIZE,B + shr I = M, 2 + (p8) br.cond.dpnt.few .L30 + } + ;; + { .mmi + shladd A = LDA, 1, A + cmp.eq p6, p0 = 0, I + adds I = -1, I + } + ;; + { .mib + adds A6 = 4 * SIZE, A2 + mov ar.lc = I + (p6) br.cond.dpnt.few .L25 + } + ;; + .align 32 + +.L21: + { .mmb + (p16) lfetch.nt1 [PREA],LDA + (p16) lfetch.excl.nt1 [PREB ],16 * SIZE + nop __LINE__ + } + { .mmb + nop __LINE__ + nop __LINE__ + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B ] = f34, SIZE + (p18) ST [B1] = f46, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f32 = [A1], SIZE + (p16) LD f35 = [A5], SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B ] = f40, SIZE + (p18) ST [B1] = f52, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f38 = [A1], SIZE + (p16) LD f41 = [A5], SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B ] = f58, SIZE + (p18) ST [B1] = f70, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f44 = [A1], SIZE + (p16) LD f47 = [A5], SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B ] = f64, 5 * SIZE + (p18) ST [B1] = f76, 5 * SIZE + tbit.z p0,p7 = COUNT,0 + } + { .mmb + (p16) LD f50 = [A1], 5 * SIZE + (p16) LD f53 = [A5], 5 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B ] = f37, SIZE + (p18) ST [B1] = f49, SIZE + adds TEMP = -16 * SIZE,TEMP + } + { .mmb + (p16) LD f56 = [A2], SIZE + (p16) LD f59 = [A6], SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B ] = f43, SIZE + (p18) ST [B1] = f55, SIZE + (p7) sub PREA = PREA,TEMP + } + { .mmb + (p16) LD f62 = [A2], SIZE + (p16) LD f65 = [A6], SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B ] = f61, SIZE + (p18) ST [B1] = f73, SIZE + (p16) adds COUNT = 1,COUNT + } + { .mmb + (p16) LD f68 = [A2], SIZE + (p16) LD f71 = [A6], SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B ] = f67, 5 * SIZE + (p18) ST [B1] = f79, 5 * SIZE + shladd TEMP = LDA,2,r0 + } + { .mmb + (p16) LD f74 = [A2], 5 * SIZE + (p16) LD f77 = [A6], 5 * SIZE + br.ctop.sptk.few .L21 + } + ;; + .align 32 + +.L25: + { .mmb + (p10) LD f32 = [A1], SIZE + (p10) LD f40 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) LD f33 = [A1], SIZE + (p10) LD f41 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) LD f34 = [A1], SIZE + (p10) LD f42 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) LD f35 = [A1], SIZE + (p10) LD f43 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmb + (p11) LD f36 = [A1], SIZE + (p11) LD f44 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmb + (p11) LD f37 = [A1] + (p11) LD f45 = [A2] + nop __LINE__ + } + ;; + { .mmb + (p10) ST [B ] = f32, SIZE + (p10) ST [B1] = f34, SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) ST [B ] = f33, SIZE + (p10) ST [B1] = f35, SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) ST [B ] = f40, SIZE + (p10) ST [B1] = f42, SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) ST [B ] = f41, 5 * SIZE + (p10) ST [B1] = f43, 5 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p11) ST [B ] = f36, SIZE + ;; + (p11) ST [B ] = f37, SIZE + nop __LINE__ + } + ;; + { .mmi + (p11) ST [B ] = f44, SIZE + ;; + (p11) ST [B ] = f45, SIZE + nop __LINE__ + } + ;; + .align 32 + +.L30: + { .mmi + mov A1 = A + mov COUNT = r0 + mov pr.rot = 0 + } + { .mmi + adds A5 = 4 * SIZE,A + adds B1 = 4 * SIZE,B + tbit.z p8,p0 = N,0 + } + ;; + { .mmi + cmp.eq p16,p0 = r0,r0 + nop __LINE__ + mov ar.ec = 3 + } + { .mib + nop __LINE__ + shr I = M,2 + (p8) br.cond.dptk.few .L999 + } + ;; + { .mmi + cmp.eq p6 ,p0 = 0, I + adds PREA = PREFETCHSIZE * SIZE, A + adds I = -1, I + } + ;; + { .mib + adds PREB = 
WPREFETCHSIZE * SIZE, B + mov ar.lc = I + (p6) br.cond.dpnt.few .L35 + } + ;; + .align 32 + +.L31: + { .mmi + (p16) lfetch.nt1 [PREA], LDA + (p16) lfetch.excl.nt1 [PREB ], 16 * SIZE + tbit.z p0, p7 = COUNT, 0 + } + { .mmb + nop __LINE__ + nop __LINE__ + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B ] = f34, SIZE + (p18) ST [B1] = f37, SIZE + shladd TEMP = LDA,2,r0 + } + { .mmb + (p16) LD f32 = [A1], SIZE + (p16) LD f35 = [A5], SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B ] = f40, SIZE + (p18) ST [B1] = f43, SIZE + adds TEMP = -16 * SIZE,TEMP + } + { .mmb + (p16) LD f38 = [A1], SIZE + (p16) LD f41 = [A5], SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B ] = f46, SIZE + (p18) ST [B1] = f49, SIZE + nop __LINE__ + } + { .mmi + (p16) LD f44 = [A1], SIZE + (p16) LD f47 = [A5], SIZE + (p7) sub PREA = PREA,TEMP + } + ;; + { .mmi + (p18) ST [B ] = f52, 5 * SIZE + (p18) ST [B1] = f55, 5 * SIZE + (p16) adds COUNT = 1,COUNT + } + { .mmb + (p16) LD f50 = [A1], 5 * SIZE + (p16) LD f53 = [A5], 5 * SIZE + br.ctop.sptk.few .L31 + } + ;; + .align 32 + +.L35: + { .mmi + (p10) LD f32 = [A1], SIZE + ;; + (p10) LD f33 = [A1], SIZE + nop __LINE__ + } + ;; + { .mmi + (p10) LD f34 = [A1], SIZE + ;; + (p10) LD f35 = [A1], SIZE + nop __LINE__ + } + ;; + { .mmi + (p11) LD f36 = [A1], SIZE + ;; + (p11) LD f37 = [A1] + nop __LINE__ + } + ;; + { .mmi + (p10) ST [B ] = f32, SIZE + ;; + (p10) ST [B ] = f33, SIZE + nop __LINE__ + } + ;; + { .mmi + (p10) ST [B ] = f34, SIZE + ;; + (p10) ST [B ] = f35, SIZE + nop __LINE__ + } + ;; + { .mmi + (p11) ST [B ] = f36, SIZE + ;; + (p11) ST [B ] = f37, SIZE + nop __LINE__ + } + ;; + .align 32 + +.L999: + mov pr = PR,-1 + mov ar.lc = ARLC + br.ret.sptk.many b0 + ;; + EPILOGUE + diff --git a/kernel/ia64/zgemm_tcopy.S b/kernel/ia64/zgemm_tcopy.S new file mode 100644 index 0000000000..9af5380a40 --- /dev/null +++ b/kernel/ia64/zgemm_tcopy.S @@ -0,0 +1,898 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define PREFETCHSIZE 24 +#define WPREFETCHSIZE 48 + +#define LD LDF8 +#define ST STF8_NTA + +#define PREA r2 +#define PREB r3 + +#define I r14 +#define J r15 + +#define A1 r16 +#define A2 r17 +#define A3 r18 +#define A4 r19 +#define A5 r20 +#define A6 r21 +#define A7 r22 +#define A8 r23 +#define B1 r24 +#define B2 r25 + +#define COUNT r26 +#define TEMP r27 + +#define BO2 r28 +#define BO3 r29 +#define LDB r8 + +#define ARLC r30 +#define PR r31 + +#define M r32 +#define N r33 +#define A r34 +#define LDA r35 +#define B r36 + + PROLOGUE + .prologue + PROFCODE + + .body + { .mmi + setf.sig f32 = M + and r8 = -4, N + mov ARLC = ar.lc + } + ;; + { .mmi + setf.sig f33 = r8 + and r9 = -2, N + mov PR = pr + } + ;; + { .mmi + setf.sig f34 = r9 + shladd LDA = LDA, ZBASE_SHIFT, r0 + shl LDB = M, BASE_SHIFT + 3 + } + ;; + { .mfi + nop __LINE__ + xmpy.l f33 = f32, f33 + shr J = M, 2 + } + { .mfi + nop __LINE__ + xmpy.l f34 = f32, f34 + nop __LINE__ + } + ;; + { .mmb + getf.sig BO2 = f33 + getf.sig BO3 = f34 + nop __LINE__ + } + ;; + { .mmi + shladd BO2 = BO2, ZBASE_SHIFT, B + shladd BO3 = BO3, ZBASE_SHIFT, B + tbit.nz p10, p0 =N, 1 + } + { .mib + cmp.eq p6, p0 = 0, J + tbit.nz p11, p0 =N, 0 + (p6) br.cond.dpnt .L20 + } + ;; + .align 32 + +.L11: + { .mmi + mov A1 = A + add A2 = A, LDA + mov pr.rot = 0 + } + { .mmi + shladd A3 = LDA, 1, A + mov B1 = B + shr I = N, 2 + } + ;; + { .mmi + shladd A4 = LDA, 1, A2 + cmp.eq p16,p0 = r0, r0 + mov ar.ec = 3 + } + { .mmi + cmp.eq p6,p0 = 0,I + adds I =-1, I + adds J =-1, J + } + ;; + { .mmi + shladd A = LDA, 2, A + adds A5 = 4 * SIZE, A1 + adds A6 = 4 * SIZE, A2 + } + { .mmi + adds A7 = 4 * SIZE, A3 + adds A8 = 4 * SIZE, A4 + adds PREA = PREFETCHSIZE * SIZE,A1 + } + ;; + { .mmb + adds B2 = 4 * SIZE, B + adds PREB = WPREFETCHSIZE * SIZE, B + nop __LINE__ + } + { .mib + adds B = 32 * SIZE, B + mov ar.lc = I + (p6) br.cond.dpnt.few .L15 + } + ;; + +.L12: + { .mmb + (p16) lfetch.nt1 [PREA], LDA + (p16) lfetch.excl.nt1 [PREB], LDB + nop __LINE__ + } + { .mmb + nop __LINE__ + nop __LINE__ + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B1] = f34, SIZE + (p18) ST [B2] = f37, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f32 = [A1], SIZE + (p16) LD f35 = [A5], SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B1] = f40, SIZE + (p18) ST [B2] = f43, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f38 = [A1], SIZE + (p16) LD f41 = [A5], SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B1] = f46, SIZE + (p18) ST [B2] = f49, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f44 = [A1], SIZE + (p16) LD f47 = [A5], SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B1] = f52, 5 * SIZE + (p18) ST [B2] = f55, 5 * SIZE + tbit.z p0,p7 = COUNT,0 + } + { .mmb + (p16) LD f50 = [A1], 5 * SIZE + (p16) LD f53 = [A5], 5 * SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B1] = f58, SIZE + (p18) ST [B2] = f61, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f56 = [A2], SIZE + (p16) LD f59 = [A6], SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B1] = f64, SIZE + (p18) ST [B2] = f67, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f62 = [A2], SIZE + (p16) LD f65 = [A6], SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B1] = f70, SIZE + 
(p18) ST [B2] = f73, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f68 = [A2], SIZE + (p16) LD f71 = [A6], SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B1] = f76, 5 * SIZE + (p18) ST [B2] = f79, 5 * SIZE + shladd TEMP = LDA, 2, r0 + } + { .mmb + (p16) LD f74 = [A2], 5 * SIZE + (p16) LD f77 = [A6], 5 * SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B1] = f82, SIZE + (p18) ST [B2] = f85, SIZE + nop __LINE__ + } + { .mmb + (p16) lfetch.nt1 [PREA], LDA + (p16) lfetch.excl.nt1 [PREB], LDB + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B1] = f88, SIZE + (p18) ST [B2] = f91, SIZE + adds TEMP = -16 * SIZE, TEMP + } + { .mmb + (p16) LD f80 = [A3], SIZE + (p16) LD f83 = [A7], SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B1] = f94, SIZE + (p18) ST [B2] = f97, SIZE + (p7) sub PREA = PREA, TEMP + } + { .mmb + (p16) LD f86 = [A3], SIZE + (p16) LD f89 = [A7], SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B1] = f100, 5 * SIZE + (p18) ST [B2] = f103, 5 * SIZE + nop __LINE__ + } + { .mmb + (p16) LD f92 = [A3], SIZE + (p16) LD f95 = [A7], SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B1] = f106, SIZE + (p18) ST [B2] = f109, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f98 = [A3], 5 * SIZE + (p16) LD f101 = [A7], 5 * SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B1] = f112, SIZE + (p18) ST [B2] = f115, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f104 = [A4], SIZE + (p16) LD f107 = [A8], SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B1] = f118, SIZE + (p18) ST [B2] = f121, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f110 = [A4], SIZE + (p16) LD f113 = [A8], SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B1] = f124, -27 * SIZE + (p18) ST [B2] = f127, -27 * SIZE + (p16) adds COUNT = 1, COUNT + } + { .mmb + (p16) LD f116 = [A4], SIZE + (p16) LD f119 = [A8], SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) add B1 = B1, LDB + (p18) add B2 = B2, LDB + nop __LINE__ + } + { .mmb + (p16) LD f122 = [A4], 5 * SIZE + (p16) LD f125 = [A8], 5 * SIZE + br.ctop.sptk.few .L12 + } + ;; + .align 32 + +.L15: + { .mmb + (p10) LD f32 = [A1], SIZE + (p10) LD f40 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) LD f33 = [A1], SIZE + (p10) LD f41 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) LD f34 = [A1], SIZE + (p10) LD f42 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) LD f35 = [A1], SIZE + (p10) LD f43 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) LD f50 = [A3], SIZE + (p10) LD f60 = [A4], SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) LD f51 = [A3], SIZE + (p10) LD f61 = [A4], SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) LD f52 = [A3], SIZE + (p10) LD f62 = [A4], SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) LD f53 = [A3], SIZE + (p10) LD f63 = [A4], SIZE + nop __LINE__ + } + ;; + { .mmb + (p11) LD f36 = [A1], SIZE + (p11) LD f44 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmb + (p11) LD f37 = [A1] + (p11) LD f45 = [A2] + nop __LINE__ + } + ;; + { .mmb + (p11) LD f54 = [A3], SIZE + (p11) LD f64 = [A4], SIZE + nop __LINE__ + } + ;; + { .mmi + (p11) LD f55 = [A3] + (p11) LD f65 = [A4] + adds B2 = 4 * SIZE, BO2 + } + ;; + { .mmb + (p10) ST [BO2] = f32, SIZE + (p10) ST [B2] = f40, SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) ST [BO2] = f33, SIZE + (p10) ST [B2] = f41, SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) ST [BO2] = f34, SIZE + (p10) ST [B2] = f42, SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) ST [BO2] = f35, 5 * SIZE + (p10) ST [B2] = f43, 5 * SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) ST [BO2] = f50, SIZE + (p10) ST [B2] = f60, SIZE + nop __LINE__ + } + ;; + { .mmb + 
(p10) ST [BO2] = f51, SIZE + (p10) ST [B2] = f61, SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) ST [BO2] = f52, SIZE + (p10) ST [B2] = f62, SIZE + nop __LINE__ + } + ;; + { .mmi + (p10) ST [BO2] = f53, 5 * SIZE + (p10) ST [B2] = f63 + adds B2 = 4 * SIZE, BO3 + } + ;; + { .mmb + (p11) ST [BO3] = f36, SIZE + (p11) ST [B2] = f54, SIZE + nop __LINE__ + } + ;; + { .mmi + (p11) ST [BO3] = f37, SIZE + (p11) ST [B2] = f55, SIZE + mov COUNT = r0 + } + ;; + { .mmi + (p11) ST [BO3] = f44, SIZE + (p11) ST [B2] = f64, SIZE + cmp.eq p0,p6 = 0,J + } + ;; + { .mmb + (p11) ST [BO3] = f45, 5 * SIZE + (p11) ST [B2] = f65, 5 * SIZE + (p6) br.cond.dptk.few .L11 + } + ;; + .align 32 + +.L20: + { .mmi + mov A1 = A + add A2 = A, LDA + mov pr.rot = 0 + } + { .mmi + mov B1 = B + adds PREA = PREFETCHSIZE * SIZE,A + tbit.z p6, p0 = M, 1 + } + ;; + { .mmi + cmp.eq p16,p0 = r0, r0 + adds B2 = 4 * SIZE, B + mov ar.ec = 3 + } + { .mib + adds PREB = WPREFETCHSIZE * SIZE, B + shr I = N, 2 + (p6) br.cond.dpnt .L30 + } + ;; + { .mmi + cmp.eq p6, p0 = 0, I + adds I =-1, I + nop __LINE__ + } + { .mmi + shladd A = LDA, 1, A + adds A5 = 4 * SIZE, A1 + adds A6 = 4 * SIZE, A2 + } + ;; + { .mmb + nop __LINE__ + nop __LINE__ + nop __LINE__ + } + { .mib + adds B = 16 * SIZE, B + mov ar.lc = I + (p6) br.cond.dpnt.few .L25 + } + ;; + +.L22: + { .mmi + (p16) lfetch.nt1 [PREA], LDA + (p16) lfetch.excl.nt1 [PREB], LDB + shladd TEMP = LDA, 1, r0 + } + ;; + { .mmb + (p18) ST [B1] = f34, SIZE + (p18) ST [B2] = f37, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f32 = [A1], SIZE + (p16) LD f35 = [A5], SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B1] = f40, SIZE + (p18) ST [B2] = f43, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f38 = [A1], SIZE + (p16) LD f41 = [A5], SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B1] = f46, SIZE + (p18) ST [B2] = f49, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f44 = [A1], SIZE + (p16) LD f47 = [A5], SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B1] = f52, 5 * SIZE + (p18) ST [B2] = f55, 5 * SIZE + tbit.z p0,p7 = COUNT,0 + } + { .mmb + (p16) LD f50 = [A1], 5 * SIZE + (p16) LD f53 = [A5], 5 * SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B1] = f58, SIZE + (p18) ST [B2] = f61, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f56 = [A2], SIZE + (p16) LD f59 = [A6], SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B1] = f64, SIZE + (p18) ST [B2] = f67, SIZE + adds TEMP = -16 * SIZE, TEMP + } + { .mmb + (p16) LD f62 = [A2], SIZE + (p16) LD f65 = [A6], SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B1] = f70, SIZE + (p18) ST [B2] = f73, SIZE + (p7) sub PREA = PREA, TEMP + } + { .mmb + (p16) LD f68 = [A2], SIZE + (p16) LD f71 = [A6], SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B1] = f76, -11 * SIZE + (p18) ST [B2] = f79, -11 * SIZE + (p16) adds COUNT = 1, COUNT + } + { .mmb + (p16) LD f74 = [A2], 5 * SIZE + (p16) LD f77 = [A6], 5 * SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) add B1 = B1, LDB + (p18) add B2 = B2, LDB + br.ctop.sptk.few .L22 + } + ;; + .align 32 + +.L25: + { .mmb + (p10) LD f32 = [A1], SIZE + (p10) LD f40 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) LD f33 = [A1], SIZE + (p10) LD f41 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) LD f34 = [A1], SIZE + (p10) LD f42 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) LD f35 = [A1], SIZE + (p10) LD f43 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmb + (p11) LD f36 = [A1], SIZE + (p11) LD f44 = [A2], SIZE + nop __LINE__ + } + ;; + { .mmi + (p11) LD f37 = [A1] + (p11) LD f45 = [A2] + adds B2 = 4 * SIZE, BO2 + } + ;; + { 
.mmb + (p10) ST [BO2] = f32, SIZE + (p10) ST [B2] = f40, SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) ST [BO2] = f33, SIZE + (p10) ST [B2] = f41, SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) ST [BO2] = f34, SIZE + (p10) ST [B2] = f42, SIZE + nop __LINE__ + } + ;; + { .mmb + (p10) ST [BO2] = f35, 5 * SIZE + (p10) ST [B2] = f43, 5 * SIZE + nop __LINE__ + } + ;; + { .mmi + (p11) ST [BO3] = f36, SIZE + ;; + (p11) ST [BO3] = f37, SIZE + mov COUNT = r0 + } + ;; + { .mmi + (p11) ST [BO3] = f44, SIZE + ;; + (p11) ST [BO3] = f45, SIZE + nop __LINE__ + } + ;; + .align 32 + +.L30: + { .mmi + mov A1 = A + adds A5 = 4 * SIZE, A + mov pr.rot = 0 + } + { .mmi + mov B1 = B + adds B2 = 4 * SIZE, B + tbit.z p6, p0 = M, 0 + } + ;; + { .mmb + nop __LINE__ + nop __LINE__ + nop __LINE__ + } + { .mib + cmp.eq p16,p0 = r0, r0 + shr I = N, 2 + (p6) br.cond.dpnt .L999 + } + ;; + { .mmi + cmp.eq p6, p0 = 0, I + adds I =-1, I + mov ar.ec = 3 + } + ;; + { .mib + nop __LINE__ + mov ar.lc = I + (p6) br.cond.dpnt.few .L35 + } + ;; + .align 32 + +.L32: + { .mmb + (p18) ST [B1] = f34, SIZE + (p18) ST [B2] = f37, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f32 = [A1], SIZE + (p16) LD f35 = [A5], SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B1] = f40, SIZE + (p18) ST [B2] = f43, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f38 = [A1], SIZE + (p16) LD f41 = [A5], SIZE + nop __LINE__ + } + ;; + { .mmb + (p18) ST [B1] = f46, SIZE + (p18) ST [B2] = f49, SIZE + nop __LINE__ + } + { .mmb + (p16) LD f44 = [A1], SIZE + (p16) LD f47 = [A5], SIZE + nop __LINE__ + } + ;; + { .mmi + (p18) ST [B1] = f52, -3 * SIZE + (p18) ST [B2] = f55, -3 * SIZE + nop __LINE__ + } + { .mmb + (p16) LD f50 = [A1], 5 * SIZE + (p16) LD f53 = [A5], 5 * SIZE + nop __LINE__ + } + ;; + { .mmb + nop __LINE__ + nop __LINE__ + nop __LINE__ + } + { .mmb + (p18) add B1 = B1, LDB + (p18) add B2 = B2, LDB + br.ctop.sptk.few .L32 + } + ;; + .align 32 + +.L35: + { .mmi + (p10) LD f32 = [A1], SIZE + ;; + (p10) LD f33 = [A1], SIZE + nop __LINE__ + } + ;; + { .mmi + (p10) LD f34 = [A1], SIZE + ;; + (p10) LD f35 = [A1], SIZE + nop __LINE__ + } + ;; + { .mmi + (p11) LD f36 = [A1], SIZE + ;; + (p11) LD f37 = [A1] + nop __LINE__ + } + ;; + { .mmi + (p10) ST [BO2] = f32, SIZE + ;; + (p10) ST [BO2] = f33, SIZE + nop __LINE__ + } + ;; + { .mmi + (p10) ST [BO2] = f34, SIZE + ;; + (p10) ST [BO2] = f35, SIZE + nop __LINE__ + } + ;; + { .mmi + (p11) ST [BO3] = f36, SIZE + ;; + (p11) ST [BO3] = f37, SIZE + nop __LINE__ + } + ;; + .align 32 + +.L999: + mov pr = PR, -1 + mov ar.lc = ARLC + br.ret.sptk.many b0 + EPILOGUE diff --git a/kernel/ia64/zgemv_n.S b/kernel/ia64/zgemv_n.S new file mode 100644 index 0000000000..b3027a68d0 --- /dev/null +++ b/kernel/ia64/zgemv_n.S @@ -0,0 +1,2293 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define SP r12 + +#define M r32 +#define N r33 +#define A r37 +#define LDA r38 +#define X r39 +#define INCX r34 +#define Y r35 +#define INCY r36 +#define BUFFER r11 + +#define I r14 +#define J r15 +#define AO1 r16 +#define AO2 r17 +#define AO3 r18 +#define AO4 r19 +#define AO5 r20 +#define AO6 r21 +#define AO7 r22 +#define AO8 r23 +#define YLD1 r24 +#define YLD2 r25 +#define YST1 r26 +#define YST2 r27 +#define YY r28 +#define XX r9 + +#define RPRE1 loc0 +#define RPRE2 loc1 +#define RPRE3 loc2 +#define RPRE4 loc3 +#define RPRE5 loc4 +#define RPRE6 loc5 +#define RPRE7 loc6 +#define RPRE8 loc7 + +#define INCXM1 r2 +#define INCX3M1 r3 + +#define AO9 loc8 +#define AO10 loc9 +#define AO11 loc10 +#define AO12 loc11 +#define AO13 loc12 +#define AO14 loc13 +#define AO15 loc14 +#define AO16 loc15 + +#define PREB r8 + +#define ARLC r29 +#define PR r30 +#define ARPFS r31 + +#ifdef DOUBLE +#define RPREFETCH (16 * 2 + 8) +#else +#define RPREFETCH (16 * 2 + 16) +#endif +#define PREFETCH lfetch.nt1 + +#define ALPHA_R f6 +#define ALPHA_I f7 + +#if !defined(CONJ) && !defined(XCONJ) +#define ADD1 FNMA +#define ADD2 FMA +#define ADD3 FNMA +#define ADD4 FMA +#elif defined(CONJ) && !defined(XCONJ) +#define ADD1 FNMA +#define ADD2 FMA +#define ADD3 FMA +#define ADD4 FNMA +#elif !defined(CONJ) && defined(XCONJ) +#define ADD1 FMA +#define ADD2 FNMA +#define ADD3 FNMA +#define ADD4 FMA +#else +#define ADD1 FMA +#define ADD2 FNMA +#define ADD3 FMA +#define ADD4 FNMA +#endif + + PROLOGUE + .prologue + PROFCODE + { .mmi + .save ar.pfs, ARPFS + alloc ARPFS = ar.pfs, 8, 16, 0, 0 + mov ARLC = ar.lc + } + ;; + mov PR = pr + adds r14 = 16, SP + adds r15 = 24, SP + adds r16 = 32, SP + adds r17 = 40, SP + ;; + adds r8 = -8 * 16, SP + adds r9 = -7 * 16, SP + adds SP = -8 * 16, SP + ;; + stf.spill [r8] = f16, 32 + stf.spill [r9] = f17, 32 + ;; + stf.spill [r8] = f18, 32 + stf.spill [r9] = f19, 32 + ;; + stf.spill [r8] = f20, 32 + stf.spill [r9] = f21, 32 + ;; + stf.spill [r8] = f22 + stf.spill [r9] = f23 + ;; + ld8 INCX = [r14] + ld8 Y = [r15] + ld8 INCY = [r16] + ld8 BUFFER = [r17] + .body + ;; + cmp.ge p7, p0 = 0, M + cmp.ge p6, p0 = 0, N + mov ALPHA_R = f8 + shladd INCX = INCX, ZBASE_SHIFT, r0 + shladd LDA = LDA, ZBASE_SHIFT, r0 + mov ALPHA_I = f9 + ;; + shladd INCY = INCY, ZBASE_SHIFT, r0 + 
tbit.nz p8, p0 = A, BASE_SHIFT + (p7) br.cond.dpnt .L999 + ;; + shladd XX = INCX, 1, X + adds INCXM1 = -SIZE, INCX + (p6) br.cond.dpnt .L999 + ;; + shladd INCX3M1 = INCX, 1, INCXM1 + cmp.eq p10, p11 = 2 * SIZE, INCY + mov YY = Y + ;; + (p11) mov YY = BUFFER + mov YST1 = BUFFER + shr J = M, 2 + ;; + { .mib + adds YST2 = 4 * SIZE, BUFFER + mov ar.lc = J + (p10) br.cond.dptk .L10 + } + ;; +.L02: + STFD [YST1] = f0, 1 * SIZE + STFD [YST2] = f0, 1 * SIZE + ;; + STFD [YST1] = f0, 1 * SIZE + STFD [YST2] = f0, 1 * SIZE + ;; + STFD [YST1] = f0, 1 * SIZE + STFD [YST2] = f0, 1 * SIZE + ;; + STFD [YST1] = f0, 5 * SIZE + STFD [YST2] = f0, 5 * SIZE + br.cloop.sptk.few .L02 + ;; + +.L10: + { .mmi + mov AO1 = A + nop __LINE__ + shr J = N, 3 + } + ;; + { .mmb + add AO2 = LDA, A + cmp.eq p6, p0 = r0, J + (p6) br.cond.dpnt .L20 + } + ;; + .align 16 + +.L11: + LDFD f32 = [X], SIZE + LDFD f36 = [XX], SIZE + mov pr.rot= 0 + ;; + LDFD f33 = [X], INCXM1 + LDFD f37 = [XX], INCXM1 + mov YLD1 = YY + ;; + LDFD f34 = [X], SIZE + LDFD f38 = [XX], SIZE + adds YLD2 = 4 * SIZE, YY + ;; + LDFD f35 = [X], INCX3M1 + LDFD f39 = [XX], INCX3M1 + mov YST1 = YY + ;; + LDFD f40 = [X], SIZE + LDFD f44 = [XX], SIZE + adds YST2 = 4 * SIZE, YY + ;; + LDFD f41 = [X], INCXM1 + LDFD f45 = [XX], INCXM1 + shr I = M, 2 + ;; + LDFD f42 = [X], SIZE + LDFD f46 = [XX], SIZE + mov AO1 = A + ;; + LDFD f43 = [X], INCX3M1 + LDFD f47 = [XX], INCX3M1 + add AO2 = LDA, A + ;; + shladd AO3 = LDA, 1, A + FMPY f8 = ALPHA_R, f32 + mov ar.ec= 2 + shladd AO4 = LDA, 1, AO2 + FMPY f9 = ALPHA_I, f32 + ;; + shladd AO5 = LDA, 1, AO3 + FMPY f10 = ALPHA_R, f34 + shladd AO6 = LDA, 1, AO4 + FMPY f11 = ALPHA_I, f34 + ;; + FMPY f12 = ALPHA_R, f36 + shladd AO7 = LDA, 1, AO5 + FMPY f13 = ALPHA_I, f36 + shladd AO8 = LDA, 1, AO6 + FMPY f14 = ALPHA_R, f38 + ;; + adds PREB = RPREFETCH * SIZE, YLD1 + FMPY f15 = ALPHA_I, f38 + adds RPRE1 = RPREFETCH * SIZE, AO1 + FMPY f16 = ALPHA_R, f40 + adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 + FMPY f17 = ALPHA_I, f40 + adds RPRE3 = RPREFETCH * SIZE, AO3 + FMPY f18 = ALPHA_R, f42 + adds RPRE4 = (RPREFETCH + 8) * SIZE, AO4 + FMPY f19 = ALPHA_I, f42 + adds RPRE5 = RPREFETCH * SIZE, AO5 + FMPY f20 = ALPHA_R, f44 + adds RPRE6 = (RPREFETCH + 8) * SIZE, AO6 + FMPY f21 = ALPHA_I, f44 + adds RPRE7 = RPREFETCH * SIZE, AO7 + FMPY f22 = ALPHA_R, f46 + adds RPRE8 = (RPREFETCH + 8) * SIZE, AO8 + FMPY f23 = ALPHA_I, f46 + ;; + ADD1 f8 = ALPHA_I, f33, f8 + tbit.nz p14, p0 = M, 1 + ADD2 f9 = ALPHA_R, f33, f9 + shladd A = LDA, 3, A + ADD1 f10 = ALPHA_I, f35, f10 + adds AO9 = 4 * SIZE, AO1 + ADD2 f11 = ALPHA_R, f35, f11 + adds AO10 = 4 * SIZE, AO2 + ADD1 f12 = ALPHA_I, f37, f12 + adds AO11 = 4 * SIZE, AO3 + ADD2 f13 = ALPHA_R, f37, f13 + adds AO12 = 4 * SIZE, AO4 + ADD1 f14 = ALPHA_I, f39, f14 + adds AO13 = 4 * SIZE, AO5 + ADD2 f15 = ALPHA_R, f39, f15 + adds AO14 = 4 * SIZE, AO6 + ADD1 f16 = ALPHA_I, f41, f16 + adds AO15 = 4 * SIZE, AO7 + ADD2 f17 = ALPHA_R, f41, f17 + adds AO16 = 4 * SIZE, AO8 + ADD1 f18 = ALPHA_I, f43, f18 + cmp.eq p6, p0 = 0, I + ADD2 f19 = ALPHA_R, f43, f19 + cmp.eq p16, p0 = r0, r0 + ADD1 f20 = ALPHA_I, f45, f20 + adds I = -1, I + ADD2 f21 = ALPHA_R, f45, f21 + ;; + { .mfi + nop __LINE__ + ADD1 f22 = ALPHA_I, f47, f22 + mov ar.lc = I + } + { .mfb + nop __LINE__ + ADD2 f23 = ALPHA_R, f47, f23 + (p6) br.cond.dpnt .L15 + } + ;; + .align 16 + +.L12: + { .mfi + (p17) LDFD f89 = [AO8], 1 * SIZE + (p17) FMA f101 = f8, f33, f101 + (p16) tbit.nz.unc p12, p13 = I, 0 + } + { .mfi + (p17) LDFD f93 = [AO16], 1 * SIZE + (p17) FMA f113 = f8, f37, f113 
+ } + ;; + { .mfi + (p17) LDFD f90 = [AO8], 1 * SIZE + (p17) FMA f104 = f9, f33, f104 + (p16) adds I = -1, I + } + { .mfi + (p17) LDFD f94 = [AO16], 1 * SIZE + (p17) FMA f116 = f9, f37, f116 + } + ;; + { .mfi + (p17) LDFD f91 = [AO8], 1 * SIZE + (p17) FMA f107 = f8, f35, f107 + } + { .mfi + (p17) LDFD f95 = [AO16], 1 * SIZE + (p17) FMA f119 = f8, f39, f119 + } + ;; + { .mfi + (p17) LDFD f92 = [AO8], 5 * SIZE + (p17) FMA f110 = f9, f35, f110 + } + { .mfi + (p17) LDFD f96 = [AO16], 5 * SIZE + (p17) FMA f122 = f9, f39, f122 + } + ;; + { .mfi + (p12) lfetch.excl.nt2 [PREB], 16 * SIZE + (p17) ADD3 f101 = f9, f34, f101 + } + { .mfi + (p17) ADD3 f113 = f9, f38, f113 + } + ;; + { .mfi + (p16) LDFD f100 = [YLD1], 1 * SIZE + (p17) ADD4 f104 = f8, f34, f104 + } + { .mfi + (p16) LDFD f112 = [YLD2], 1 * SIZE + (p17) ADD4 f116 = f8, f38, f116 + } + ;; + { .mfi + (p16) LDFD f103 = [YLD1], 1 * SIZE + (p17) ADD3 f107 = f9, f36, f107 + } + { .mfi + (p16) LDFD f115 = [YLD2], 1 * SIZE + (p17) ADD3 f119 = f9, f40, f119 + } + ;; + { .mfi + (p12) PREFETCH [RPRE1], 16 * SIZE + (p17) ADD4 f110 = f8, f36, f110 + } + { .mfi + (p17) ADD4 f122 = f8, f40, f122 + } + ;; + { .mfi + (p16) LDFD f32 = [AO1], 1 * SIZE + (p17) FMA f101 = f10, f41, f101 + } + { .mfi + (p16) LDFD f36 = [AO9], 1 * SIZE + (p17) FMA f113 = f10, f45, f113 + } + ;; + { .mfi + (p16) LDFD f33 = [AO1], 1 * SIZE + (p17) FMA f104 = f11, f41, f104 + } + { .mfi + (p16) LDFD f37 = [AO9], 1 * SIZE + (p17) FMA f116 = f11, f45, f116 + } + ;; + { .mfi + (p16) LDFD f34 = [AO1], 1 * SIZE + (p17) FMA f107 = f10, f43, f107 + } + { .mfi + (p16) LDFD f38 = [AO9], 1 * SIZE + (p17) FMA f119 = f10, f47, f119 + } + ;; + { .mfi + (p16) LDFD f35 = [AO1], 5 * SIZE + (p17) FMA f110 = f11, f43, f110 + } + { .mfi + (p16) LDFD f39 = [AO9], 5 * SIZE + (p17) FMA f122 = f11, f47, f122 + } + ;; + { .mfi + (p17) ADD3 f101 = f11, f42, f101 + } + { .mfi + (p17) ADD3 f113 = f11, f46, f113 + } + ;; + { .mfi + (p16) LDFD f106 = [YLD1], 1 * SIZE + (p17) ADD4 f104 = f10, f42, f104 + } + { .mfi + (p16) LDFD f118 = [YLD2], 1 * SIZE + (p17) ADD4 f116 = f10, f46, f116 + } + ;; + { .mfi + (p16) LDFD f109 = [YLD1], 5 * SIZE + (p17) ADD3 f107 = f11, f44, f107 + } + { .mfi + (p16) LDFD f121 = [YLD2], 5 * SIZE + (p17) ADD3 f119 = f11, f48, f119 + } + ;; + { .mfi + (p13) PREFETCH [RPRE2], 16 * SIZE + (p17) ADD4 f110 = f10, f44, f110 + } + { .mfi + (p17) ADD4 f122 = f10, f48, f122 + } + ;; + { .mfi + (p16) LDFD f40 = [AO2], 1 * SIZE + (p17) FMA f101 = f12, f49, f101 + } + { .mfi + (p16) LDFD f44 = [AO10], 1 * SIZE + (p17) FMA f113 = f12, f53, f113 + } + ;; + { .mfi + (p16) LDFD f41 = [AO2], 1 * SIZE + (p17) FMA f104 = f13, f49, f104 + } + { .mfi + (p16) LDFD f45 = [AO10], 1 * SIZE + (p17) FMA f116 = f13, f53, f116 + } + ;; + { .mfi + (p16) LDFD f42 = [AO2], 1 * SIZE + (p17) FMA f107 = f12, f51, f107 + } + { .mfi + (p16) LDFD f46 = [AO10], 1 * SIZE + (p17) FMA f119 = f12, f55, f119 + } + ;; + { .mfi + (p16) LDFD f43 = [AO2], 5 * SIZE + (p17) FMA f110 = f13, f51, f110 + } + { .mfi + (p16) LDFD f47 = [AO10], 5 * SIZE + (p17) FMA f122 = f13, f55, f122 + } + ;; + { .mfi + (p17) ADD3 f101 = f13, f50, f101 + } + { .mfi + (p17) ADD3 f113 = f13, f54, f113 + } + ;; + { .mfi + (p17) ADD4 f104 = f12, f50, f104 + } + { .mfi + (p17) ADD4 f116 = f12, f54, f116 + } + ;; + { .mfi + (p17) ADD3 f107 = f13, f52, f107 + } + { .mfi + (p17) ADD3 f119 = f13, f56, f119 + } + ;; + { .mfi + (p12) PREFETCH [RPRE3], 16 * SIZE + (p17) ADD4 f110 = f12, f52, f110 + } + { .mfi + (p17) ADD4 f122 = f12, f56, f122 + } + ;; + { .mfi + 
(p16) LDFD f48 = [AO3], 1 * SIZE + (p17) FMA f101 = f14, f57, f101 + } + { .mfi + (p16) LDFD f52 = [AO11], 1 * SIZE + (p17) FMA f113 = f14, f61, f113 + } + ;; + { .mfi + (p16) LDFD f49 = [AO3], 1 * SIZE + (p17) FMA f104 = f15, f57, f104 + } + { .mfi + (p16) LDFD f53 = [AO11], 1 * SIZE + (p17) FMA f116 = f15, f61, f116 + } + ;; + { .mfi + (p16) LDFD f50 = [AO3], 1 * SIZE + (p17) FMA f107 = f14, f59, f107 + } + { .mfi + (p16) LDFD f54 = [AO11], 1 * SIZE + (p17) FMA f119 = f14, f63, f119 + } + ;; + { .mfi + (p16) LDFD f51 = [AO3], 5 * SIZE + (p17) FMA f110 = f15, f59, f110 + } + { .mfi + (p16) LDFD f55 = [AO11], 5 * SIZE + (p17) FMA f122 = f15, f63, f122 + } + ;; + { .mfi + (p17) ADD3 f101 = f15, f58, f101 + } + { .mfi + (p17) ADD3 f113 = f15, f62, f113 + } + ;; + { .mfi + (p17) ADD4 f104 = f14, f58, f104 + } + { .mfi + (p17) ADD4 f116 = f14, f62, f116 + } + ;; + { .mfi + (p17) ADD3 f107 = f15, f60, f107 + } + { .mfi + (p17) ADD3 f119 = f15, f64, f119 + } + ;; + { .mfi + (p13) PREFETCH [RPRE4], 16 * SIZE + (p17) ADD4 f110 = f14, f60, f110 + } + { .mfi + (p17) ADD4 f122 = f14, f64, f122 + } + ;; + { .mfi + (p16) LDFD f56 = [AO4], 1 * SIZE + (p17) FMA f101 = f16, f65, f101 + } + { .mfi + (p16) LDFD f60 = [AO12], 1 * SIZE + (p17) FMA f113 = f16, f69, f113 + } + ;; + { .mfi + (p16) LDFD f57 = [AO4], 1 * SIZE + (p17) FMA f104 = f17, f65, f104 + } + { .mfi + (p16) LDFD f61 = [AO12], 1 * SIZE + (p17) FMA f116 = f17, f69, f116 + } + ;; + { .mmf + (p18) STFD [YST1] = f102, 1 * SIZE + (p18) STFD [YST2] = f114, 1 * SIZE + (p17) FMA f107 = f16, f67, f107 + } + { .mmf + (p16) LDFD f58 = [AO4], 1 * SIZE + (p16) LDFD f62 = [AO12], 1 * SIZE + (p17) FMA f119 = f16, f71, f119 + } + ;; + { .mmf + (p18) STFD [YST1] = f105, 1 * SIZE + (p18) STFD [YST2] = f117, 1 * SIZE + (p17) FMA f110 = f17, f67, f110 + } + { .mmf + (p16) LDFD f59 = [AO4], 5 * SIZE + (p16) LDFD f63 = [AO12], 5 * SIZE + (p17) FMA f122 = f17, f71, f122 + } + ;; + { .mfi + (p17) ADD3 f101 = f17, f66, f101 + } + { .mfi + (p17) ADD3 f113 = f17, f70, f113 + } + ;; + { .mfi + (p17) ADD4 f104 = f16, f66, f104 + } + { .mfi + (p17) ADD4 f116 = f16, f70, f116 + } + ;; + { .mfi + (p17) ADD3 f107 = f17, f68, f107 + } + { .mfi + (p17) ADD3 f119 = f17, f72, f119 + } + ;; + { .mfi + (p12) PREFETCH [RPRE5], 16 * SIZE + (p17) ADD4 f110 = f16, f68, f110 + } + { .mfi + (p17) ADD4 f122 = f16, f72, f122 + } + ;; + { .mfi + (p16) LDFD f64 = [AO5], 1 * SIZE + (p17) FMA f101 = f18, f73, f101 + } + { .mfi + (p16) LDFD f68 = [AO13], 1 * SIZE + (p17) FMA f113 = f18, f77, f113 + } + ;; + { .mfi + (p16) LDFD f65 = [AO5], 1 * SIZE + (p17) FMA f104 = f19, f73, f104 + } + { .mfi + (p16) LDFD f69 = [AO13], 1 * SIZE + (p17) FMA f116 = f19, f77, f116 + } + ;; + { .mmf + (p18) STFD [YST1] = f108, 1 * SIZE + (p18) STFD [YST2] = f120, 1 * SIZE + (p17) FMA f107 = f18, f75, f107 + } + { .mmf + (p16) LDFD f66 = [AO5], 1 * SIZE + (p16) LDFD f70 = [AO13], 1 * SIZE + (p17) FMA f119 = f18, f79, f119 + } + ;; + { .mmf + (p18) STFD [YST1] = f111, 5 * SIZE + (p18) STFD [YST2] = f123, 5 * SIZE + (p17) FMA f110 = f19, f75, f110 + } + { .mmf + (p16) LDFD f67 = [AO5], 5 * SIZE + (p16) LDFD f71 = [AO13], 5 * SIZE + (p17) FMA f122 = f19, f79, f122 + } + ;; + { .mfi + (p17) ADD3 f101 = f19, f74, f101 + } + { .mfi + (p17) ADD3 f113 = f19, f78, f113 + } + ;; + { .mfi + (p17) ADD4 f104 = f18, f74, f104 + } + { .mfi + (p17) ADD4 f116 = f18, f78, f116 + } + ;; + { .mfi + (p17) ADD3 f107 = f19, f76, f107 + } + { .mfi + (p17) ADD3 f119 = f19, f80, f119 + } + ;; + { .mfi + (p13) PREFETCH [RPRE6], 16 * SIZE + 
(p17) ADD4 f110 = f18, f76, f110 + } + { .mfi + (p17) ADD4 f122 = f18, f80, f122 + } + ;; + { .mfi + (p16) LDFD f72 = [AO6], 1 * SIZE + (p17) FMA f101 = f20, f81, f101 + } + { .mfi + (p16) LDFD f76 = [AO14], 1 * SIZE + (p17) FMA f113 = f20, f85, f113 + } + ;; + { .mfi + (p16) LDFD f73 = [AO6], 1 * SIZE + (p17) FMA f104 = f21, f81, f104 + } + { .mfi + (p16) LDFD f77 = [AO14], 1 * SIZE + (p17) FMA f116 = f21, f85, f116 + } + ;; + { .mfi + (p16) LDFD f74 = [AO6], 1 * SIZE + (p17) FMA f107 = f20, f83, f107 + } + { .mfi + (p16) LDFD f78 = [AO14], 1 * SIZE + (p17) FMA f119 = f20, f87, f119 + } + ;; + { .mfi + (p16) LDFD f75 = [AO6], 5 * SIZE + (p17) FMA f110 = f21, f83, f110 + } + { .mfi + (p16) LDFD f79 = [AO14], 5 * SIZE + (p17) FMA f122 = f21, f87, f122 + } + ;; + { .mfi + (p17) ADD3 f101 = f21, f82, f101 + } + { .mfi + (p17) ADD3 f113 = f21, f86, f113 + } + ;; + { .mfi + (p17) ADD4 f104 = f20, f82, f104 + } + { .mfi + (p17) ADD4 f116 = f20, f86, f116 + } + ;; + { .mfi + (p17) ADD3 f107 = f21, f84, f107 + } + { .mfi + (p17) ADD3 f119 = f21, f88, f119 + } + ;; + { .mfi + (p12) PREFETCH [RPRE7], 16 * SIZE + (p17) ADD4 f110 = f20, f84, f110 + } + { .mfi + (p17) ADD4 f122 = f20, f88, f122 + } + ;; + { .mfi + (p16) LDFD f80 = [AO7], 1 * SIZE + (p17) FMA f101 = f22, f89, f101 + } + { .mfi + (p16) LDFD f84 = [AO15], 1 * SIZE + (p17) FMA f113 = f22, f93, f113 + } + ;; + { .mfi + (p16) LDFD f81 = [AO7], 1 * SIZE + (p17) FMA f104 = f23, f89, f104 + } + { .mfi + (p16) LDFD f85 = [AO15], 1 * SIZE + (p17) FMA f116 = f23, f93, f116 + } + ;; + { .mfi + (p16) LDFD f82 = [AO7], 1 * SIZE + (p17) FMA f107 = f22, f91, f107 + } + { .mfi + (p16) LDFD f86 = [AO15], 1 * SIZE + (p17) FMA f119 = f22, f95, f119 + } + ;; + { .mfi + (p16) LDFD f83 = [AO7], 5 * SIZE + (p17) FMA f110 = f23, f91, f110 + } + { .mfi + (p16) LDFD f87 = [AO15], 5 * SIZE + (p17) FMA f122 = f23, f95, f122 + } + ;; + { .mfi + (p17) ADD3 f101 = f23, f90, f101 + } + { .mfi + (p17) ADD3 f113 = f23, f94, f113 + } + ;; + { .mfi + (p17) ADD4 f104 = f22, f90, f104 + } + { .mfi + (p17) ADD4 f116 = f22, f94, f116 + } + ;; + { .mfi + (p17) ADD3 f107 = f23, f92, f107 + } + { .mfi + (p17) ADD3 f119 = f23, f96, f119 + } + ;; + { .mfi + (p13) PREFETCH [RPRE8], 16 * SIZE + (p17) ADD4 f110 = f22, f92, f110 + } + { .mfb + (p17) ADD4 f122 = f22, f96, f122 + br.ctop.sptk.few .L12 + } + ;; + .align 16 + +.L15: + { .mmi + (p18) STFD [YST1] = f102, 1 * SIZE + (p18) STFD [YST2] = f114, 1 * SIZE + tbit.nz p15, p0 = M, 0 + } + { .mmi + (p14) LDFD f32 = [AO1], 1 * SIZE + (p14) LDFD f80 = [YLD1], 1 * SIZE + cmp.lt p6, p0 = 1, J + } + ;; + { .mmi + (p18) STFD [YST1] = f105, 1 * SIZE + (p18) STFD [YST2] = f117, 1 * SIZE + adds J = -1, J + } + { + (p14) LDFD f33 = [AO1], 1 * SIZE + (p14) LDFD f81 = [YLD1], 1 * SIZE + and I = 3, M + } + ;; + { .mmi + (p18) STFD [YST1] = f108, 1 * SIZE + (p18) STFD [YST2] = f120, 1 * SIZE + (p6) cmp.eq.unc p7, p0 = I, r0 + } + { .mmi + (p14) LDFD f34 = [AO1], 1 * SIZE + (p14) LDFD f82 = [YLD1], 1 * SIZE + } + ;; + { .mmi + (p18) STFD [YST1] = f111, 5 * SIZE + (p18) STFD [YST2] = f123, 5 * SIZE + } + { .mmb + (p14) LDFD f35 = [AO1], 1 * SIZE + (p14) LDFD f83 = [YLD1], 1 * SIZE + (p7) br.cond.dptk .L11 + } + ;; + (p15) LDFD f36 = [AO1], 1 * SIZE + (p15) LDFD f84 = [YLD1], 1 * SIZE + ;; + (p15) LDFD f37 = [AO1], 1 * SIZE + (p15) LDFD f85 = [YLD1], 1 * SIZE + ;; + (p14) LDFD f38 = [AO2], 1 * SIZE + (p14) LDFD f44 = [AO3], 1 * SIZE + ;; + (p14) LDFD f39 = [AO2], 1 * SIZE + (p14) LDFD f45 = [AO3], 1 * SIZE + ;; + (p14) LDFD f40 = [AO2], 1 * SIZE + 
(p14) LDFD f46 = [AO3], 1 * SIZE + ;; + (p14) LDFD f41 = [AO2], 1 * SIZE + (p14) LDFD f47 = [AO3], 1 * SIZE + (p14) FMA f80 = f8, f32, f80 + ;; + (p15) LDFD f42 = [AO2], 1 * SIZE + (p15) LDFD f48 = [AO3], 1 * SIZE + (p14) FMA f81 = f9, f32, f81 + ;; + (p15) LDFD f43 = [AO2], 1 * SIZE + (p15) LDFD f49 = [AO3], 1 * SIZE + (p14) FMA f82 = f8, f34, f82 + ;; + (p14) LDFD f50 = [AO4], 1 * SIZE + (p14) LDFD f56 = [AO5], 1 * SIZE + (p14) FMA f83 = f9, f34, f83 + ;; + (p14) LDFD f51 = [AO4], 1 * SIZE + (p14) LDFD f57 = [AO5], 1 * SIZE + (p15) FMA f84 = f8, f36, f84 + ;; + (p14) LDFD f52 = [AO4], 1 * SIZE + (p14) LDFD f58 = [AO5], 1 * SIZE + (p15) FMA f85 = f9, f36, f85 + ;; + (p14) LDFD f53 = [AO4], 1 * SIZE + (p14) LDFD f59 = [AO5], 1 * SIZE + (p14) ADD3 f80 = f9, f33, f80 + ;; + (p15) LDFD f54 = [AO4], 1 * SIZE + (p15) LDFD f60 = [AO5], 1 * SIZE + (p14) ADD4 f81 = f8, f33, f81 + ;; + (p15) LDFD f55 = [AO4], 1 * SIZE + (p15) LDFD f61 = [AO5], 1 * SIZE + (p14) ADD3 f82 = f9, f35, f82 + ;; + (p14) LDFD f62 = [AO6], 1 * SIZE + (p14) LDFD f68 = [AO7], 1 * SIZE + (p14) ADD4 f83 = f8, f35, f83 + ;; + (p14) LDFD f63 = [AO6], 1 * SIZE + (p14) LDFD f69 = [AO7], 1 * SIZE + (p15) ADD3 f84 = f9, f37, f84 + ;; + (p14) LDFD f64 = [AO6], 1 * SIZE + (p14) LDFD f70 = [AO7], 1 * SIZE + (p15) ADD4 f85 = f8, f37, f85 + ;; + (p14) LDFD f65 = [AO6], 1 * SIZE + (p14) LDFD f71 = [AO7], 1 * SIZE + (p14) FMA f80 = f10, f38, f80 + ;; + (p15) LDFD f66 = [AO6], 1 * SIZE + (p15) LDFD f72 = [AO7], 1 * SIZE + (p14) FMA f81 = f11, f38, f81 + ;; + (p15) LDFD f67 = [AO6], 1 * SIZE + (p15) LDFD f73 = [AO7], 1 * SIZE + (p14) FMA f82 = f10, f40, f82 + ;; + (p14) LDFD f74 = [AO8], 1 * SIZE + (p14) FMA f83 = f11, f40, f83 + ;; + (p14) LDFD f75 = [AO8], 1 * SIZE + (p15) FMA f84 = f10, f42, f84 + ;; + (p14) LDFD f76 = [AO8], 1 * SIZE + (p15) FMA f85 = f11, f42, f85 + ;; + (p14) LDFD f77 = [AO8], 1 * SIZE + (p14) ADD3 f80 = f11, f39, f80 + ;; + (p15) LDFD f78 = [AO8], 1 * SIZE + (p14) ADD4 f81 = f10, f39, f81 + ;; + (p15) LDFD f79 = [AO8], 1 * SIZE + (p14) ADD3 f82 = f11, f41, f82 + (p14) ADD4 f83 = f10, f41, f83 + (p15) ADD3 f84 = f11, f43, f84 + (p15) ADD4 f85 = f10, f43, f85 + ;; + (p14) FMA f80 = f12, f44, f80 + (p14) FMA f81 = f13, f44, f81 + (p14) FMA f82 = f12, f46, f82 + (p14) FMA f83 = f13, f46, f83 + (p15) FMA f84 = f12, f48, f84 + (p15) FMA f85 = f13, f48, f85 + ;; + (p14) ADD3 f80 = f13, f45, f80 + (p14) ADD4 f81 = f12, f45, f81 + (p14) ADD3 f82 = f13, f47, f82 + (p14) ADD4 f83 = f12, f47, f83 + (p15) ADD3 f84 = f13, f49, f84 + (p15) ADD4 f85 = f12, f49, f85 + ;; + (p14) FMA f80 = f14, f50, f80 + (p14) FMA f81 = f15, f50, f81 + (p14) FMA f82 = f14, f52, f82 + (p14) FMA f83 = f15, f52, f83 + (p15) FMA f84 = f14, f54, f84 + (p15) FMA f85 = f15, f54, f85 + ;; + (p14) ADD3 f80 = f15, f51, f80 + (p14) ADD4 f81 = f14, f51, f81 + (p14) ADD3 f82 = f15, f53, f82 + (p14) ADD4 f83 = f14, f53, f83 + (p15) ADD3 f84 = f15, f55, f84 + (p15) ADD4 f85 = f14, f55, f85 + ;; + (p14) FMA f80 = f16, f56, f80 + (p14) FMA f81 = f17, f56, f81 + (p14) FMA f82 = f16, f58, f82 + (p14) FMA f83 = f17, f58, f83 + (p15) FMA f84 = f16, f60, f84 + (p15) FMA f85 = f17, f60, f85 + ;; + (p14) ADD3 f80 = f17, f57, f80 + (p14) ADD4 f81 = f16, f57, f81 + (p14) ADD3 f82 = f17, f59, f82 + (p14) ADD4 f83 = f16, f59, f83 + (p15) ADD3 f84 = f17, f61, f84 + (p15) ADD4 f85 = f16, f61, f85 + ;; + (p14) FMA f80 = f18, f62, f80 + (p14) FMA f81 = f19, f62, f81 + (p14) FMA f82 = f18, f64, f82 + (p14) FMA f83 = f19, f64, f83 + (p15) FMA f84 = f18, f66, f84 + (p15) FMA f85 = f19, 
f66, f85 + ;; + (p14) ADD3 f80 = f19, f63, f80 + (p14) ADD4 f81 = f18, f63, f81 + (p14) ADD3 f82 = f19, f65, f82 + (p14) ADD4 f83 = f18, f65, f83 + (p15) ADD3 f84 = f19, f67, f84 + (p15) ADD4 f85 = f18, f67, f85 + ;; + (p14) FMA f80 = f20, f68, f80 + (p14) FMA f81 = f21, f68, f81 + (p14) FMA f82 = f20, f70, f82 + (p14) FMA f83 = f21, f70, f83 + (p15) FMA f84 = f20, f72, f84 + (p15) FMA f85 = f21, f72, f85 + ;; + (p14) ADD3 f80 = f21, f69, f80 + (p14) ADD4 f81 = f20, f69, f81 + (p14) ADD3 f82 = f21, f71, f82 + (p14) ADD4 f83 = f20, f71, f83 + (p15) ADD3 f84 = f21, f73, f84 + (p15) ADD4 f85 = f20, f73, f85 + ;; + (p14) FMA f80 = f22, f74, f80 + (p14) FMA f81 = f23, f74, f81 + (p14) FMA f82 = f22, f76, f82 + (p14) FMA f83 = f23, f76, f83 + (p15) FMA f84 = f22, f78, f84 + (p15) FMA f85 = f23, f78, f85 + ;; + (p14) ADD3 f80 = f23, f75, f80 + (p14) ADD4 f81 = f22, f75, f81 + (p14) ADD3 f82 = f23, f77, f82 + (p14) ADD4 f83 = f22, f77, f83 + (p15) ADD3 f84 = f23, f79, f84 + (p15) ADD4 f85 = f22, f79, f85 + ;; + (p14) STFD [YST1] = f80, 1 * SIZE + ;; + (p14) STFD [YST1] = f81, 1 * SIZE + ;; + (p14) STFD [YST1] = f82, 1 * SIZE + ;; + (p14) STFD [YST1] = f83, 1 * SIZE + ;; + (p15) STFD [YST1] = f84, 1 * SIZE + ;; + (p15) STFD [YST1] = f85, 1 * SIZE + (p6) br.cond.dptk .L11 + ;; + +.L20: + { .mmi + mov YLD1 = YY + adds YLD2 = 4 * SIZE, YY + tbit.z p6, p0 = N, 2 + } + ;; + { .mmb + mov YST1 = YY + adds YST2 = 4 * SIZE, YY + (p6) br.cond.dpnt .L30 + } + ;; + LDFD f32 = [X], SIZE + LDFD f36 = [XX], SIZE + mov AO1 = A + ;; + LDFD f33 = [X], INCXM1 + LDFD f37 = [XX], INCXM1 + add AO2 = LDA, A + ;; + LDFD f34 = [X], SIZE + LDFD f38 = [XX], SIZE + shladd AO3 = LDA, 1, A + ;; + LDFD f35 = [X], INCX3M1 + LDFD f39 = [XX], INCX3M1 + shladd AO4 = LDA, 1, AO2 + ;; + shladd A = LDA, 2, A + FMPY f8 = ALPHA_R, f32 + adds AO9 = 4 * SIZE, AO1 + FMPY f9 = ALPHA_I, f32 + adds AO10 = 4 * SIZE, AO2 + FMPY f10 = ALPHA_R, f34 + adds AO11 = 4 * SIZE, AO3 + FMPY f11 = ALPHA_I, f34 + adds AO12 = 4 * SIZE, AO4 + FMPY f12 = ALPHA_R, f36 + mov pr.rot= 0 + FMPY f13 = ALPHA_I, f36 + shr I = M, 2 + FMPY f14 = ALPHA_R, f38 + tbit.nz p14, p0 = M, 1 + FMPY f15 = ALPHA_I, f38 + ;; + { .mfi + cmp.eq p6, p0 = 0, I + ADD1 f8 = ALPHA_I, f33, f8 + mov ar.ec= 2 + } + ADD2 f9 = ALPHA_R, f33, f9 + adds I = -1, I + ADD1 f10 = ALPHA_I, f35, f10 + adds PREB = RPREFETCH * SIZE, YLD1 + ADD2 f11 = ALPHA_R, f35, f11 + adds RPRE1 = RPREFETCH * SIZE, AO1 + ADD1 f12 = ALPHA_I, f37, f12 + adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 + ADD2 f13 = ALPHA_R, f37, f13 + adds RPRE3 = RPREFETCH * SIZE, AO3 + ADD1 f14 = ALPHA_I, f39, f14 + ADD2 f15 = ALPHA_R, f39, f15 + ;; + { .mib + cmp.eq p16, p0 = r0, r0 + mov ar.lc = I + (p6) br.cond.dpnt .L25 + } + ;; + .align 16 + +.L22: + { .mfi + (p17) LDFD f57 = [AO4], 1 * SIZE + (p17) FMA f101 = f8, f33, f101 + (p16) tbit.nz.unc p12, p13 = I, 0 + } + { .mfi + (p17) LDFD f61 = [AO12], 1 * SIZE + (p17) FMA f113 = f8, f37, f113 + } + ;; + { .mfi + (p17) LDFD f58 = [AO4], 1 * SIZE + (p17) FMA f104 = f9, f33, f104 + (p16) adds I = -1, I + } + { .mfi + (p17) LDFD f62 = [AO12], 1 * SIZE + (p17) FMA f116 = f9, f37, f116 + } + ;; + { .mfi + (p17) LDFD f59 = [AO4], 1 * SIZE + (p17) FMA f107 = f8, f35, f107 + } + { .mfi + (p17) LDFD f63 = [AO12], 1 * SIZE + (p17) FMA f119 = f8, f39, f119 + } + ;; + { .mfi + (p17) LDFD f60 = [AO4], 5 * SIZE + (p17) FMA f110 = f9, f35, f110 + } + { .mfi + (p17) LDFD f64 = [AO12], 5 * SIZE + (p17) FMA f122 = f9, f39, f122 + } + ;; + { .mfi + (p12) lfetch.excl.nt2 [PREB], 16 * SIZE + (p17) ADD3 f101 = f9, 
f34, f101 + } + { .mfi + (p17) ADD3 f113 = f9, f38, f113 + } + ;; + { .mfi + (p16) LDFD f100 = [YLD1], 1 * SIZE + (p17) ADD4 f104 = f8, f34, f104 + } + { .mfi + (p16) LDFD f112 = [YLD2], 1 * SIZE + (p17) ADD4 f116 = f8, f38, f116 + } + ;; + { .mfi + (p16) LDFD f103 = [YLD1], 1 * SIZE + (p17) ADD3 f107 = f9, f36, f107 + } + { .mfi + (p16) LDFD f115 = [YLD2], 1 * SIZE + (p17) ADD3 f119 = f9, f40, f119 + } + ;; + { .mfi + (p12) PREFETCH [RPRE1], 16 * SIZE + (p17) ADD4 f110 = f8, f36, f110 + } + { .mfi + (p17) ADD4 f122 = f8, f40, f122 + } + ;; + { .mfi + (p16) LDFD f32 = [AO1], 1 * SIZE + (p17) FMA f101 = f10, f41, f101 + } + { .mfi + (p16) LDFD f36 = [AO9], 1 * SIZE + (p17) FMA f113 = f10, f45, f113 + } + ;; + { .mfi + (p16) LDFD f33 = [AO1], 1 * SIZE + (p17) FMA f104 = f11, f41, f104 + } + { .mfi + (p16) LDFD f37 = [AO9], 1 * SIZE + (p17) FMA f116 = f11, f45, f116 + } + ;; + { .mfi + (p16) LDFD f34 = [AO1], 1 * SIZE + (p17) FMA f107 = f10, f43, f107 + } + { .mfi + (p16) LDFD f38 = [AO9], 1 * SIZE + (p17) FMA f119 = f10, f47, f119 + } + ;; + { .mfi + (p16) LDFD f35 = [AO1], 5 * SIZE + (p17) FMA f110 = f11, f43, f110 + } + { .mfi + (p16) LDFD f39 = [AO9], 5 * SIZE + (p17) FMA f122 = f11, f47, f122 + } + ;; + { .mfi + (p16) LDFD f106 = [YLD1], 1 * SIZE + (p17) ADD3 f101 = f11, f42, f101 + } + { .mfi + (p16) LDFD f118 = [YLD2], 1 * SIZE + (p17) ADD3 f113 = f11, f46, f113 + } + ;; + { .mfi + (p16) LDFD f109 = [YLD1], 5 * SIZE + (p17) ADD4 f104 = f10, f42, f104 + } + { .mfi + (p16) LDFD f121 = [YLD2], 5 * SIZE + (p17) ADD4 f116 = f10, f46, f116 + } + ;; + { .mfi + (p17) ADD3 f107 = f11, f44, f107 + } + { .mfi + (p17) ADD3 f119 = f11, f48, f119 + } + ;; + { .mfi + (p13) PREFETCH [RPRE2], 16 * SIZE + (p17) ADD4 f110 = f10, f44, f110 + } + { .mfi + (p17) ADD4 f122 = f10, f48, f122 + } + ;; + { .mfi + (p16) LDFD f40 = [AO2], 1 * SIZE + (p17) FMA f101 = f12, f49, f101 + } + { .mfi + (p16) LDFD f44 = [AO10], 1 * SIZE + (p17) FMA f113 = f12, f53, f113 + } + ;; + { .mfi + (p16) LDFD f41 = [AO2], 1 * SIZE + (p17) FMA f104 = f13, f49, f104 + } + { .mfi + (p16) LDFD f45 = [AO10], 1 * SIZE + (p17) FMA f116 = f13, f53, f116 + } + ;; + { .mmf + (p18) STFD [YST1] = f102, 1 * SIZE + (p18) STFD [YST2] = f114, 1 * SIZE + (p17) FMA f107 = f12, f51, f107 + } + { .mmf + (p16) LDFD f42 = [AO2], 1 * SIZE + (p16) LDFD f46 = [AO10], 1 * SIZE + (p17) FMA f119 = f12, f55, f119 + } + ;; + { .mmf + (p18) STFD [YST1] = f105, 1 * SIZE + (p18) STFD [YST2] = f117, 1 * SIZE + (p17) FMA f110 = f13, f51, f110 + } + { .mmf + (p16) LDFD f43 = [AO2], 5 * SIZE + (p16) LDFD f47 = [AO10], 5 * SIZE + (p17) FMA f122 = f13, f55, f122 + } + ;; + { .mfi + (p17) ADD3 f101 = f13, f50, f101 + } + { .mfi + (p17) ADD3 f113 = f13, f54, f113 + } + ;; + { .mfi + (p17) ADD4 f104 = f12, f50, f104 + } + { .mfi + (p17) ADD4 f116 = f12, f54, f116 + } + ;; + { .mfi + (p17) ADD3 f107 = f13, f52, f107 + } + { .mfi + (p17) ADD3 f119 = f13, f56, f119 + } + ;; + { .mfi + (p12) PREFETCH [RPRE3], 16 * SIZE + (p17) ADD4 f110 = f12, f52, f110 + } + { .mfi + (p17) ADD4 f122 = f12, f56, f122 + } + ;; + { .mfi + (p16) LDFD f48 = [AO3], 1 * SIZE + (p17) FMA f101 = f14, f57, f101 + } + { .mfi + (p16) LDFD f52 = [AO11], 1 * SIZE + (p17) FMA f113 = f14, f61, f113 + } + ;; + { .mfi + (p16) LDFD f49 = [AO3], 1 * SIZE + (p17) FMA f104 = f15, f57, f104 + } + { .mfi + (p16) LDFD f53 = [AO11], 1 * SIZE + (p17) FMA f116 = f15, f61, f116 + } + ;; + { .mmf + (p18) STFD [YST1] = f108, 1 * SIZE + (p18) STFD [YST2] = f120, 1 * SIZE + (p17) FMA f107 = f14, f59, f107 + } + { .mmf + 
(p16) LDFD f50 = [AO3], 1 * SIZE + (p16) LDFD f54 = [AO11], 1 * SIZE + (p17) FMA f119 = f14, f63, f119 + } + ;; + { .mmf + (p18) STFD [YST1] = f111, 5 * SIZE + (p18) STFD [YST2] = f123, 5 * SIZE + (p17) FMA f110 = f15, f59, f110 + } + { .mmf + (p16) LDFD f51 = [AO3], 5 * SIZE + (p16) LDFD f55 = [AO11], 5 * SIZE + (p17) FMA f122 = f15, f63, f122 + } + ;; + { .mfi + (p17) ADD3 f101 = f15, f58, f101 + } + { .mfi + (p17) ADD3 f113 = f15, f62, f113 + } + ;; + { .mfi + (p17) ADD4 f104 = f14, f58, f104 + } + { .mfi + (p17) ADD4 f116 = f14, f62, f116 + } + ;; + { .mfi + (p17) ADD3 f107 = f15, f60, f107 + } + { .mfi + (p17) ADD3 f119 = f15, f64, f119 + } + ;; + { .mfi + (p13) PREFETCH [RPRE4], 16 * SIZE + (p17) ADD4 f110 = f14, f60, f110 + } + { .mfb + (p17) ADD4 f122 = f14, f64, f122 + br.ctop.sptk.few .L22 + } + ;; + .align 16 + +.L25: + { .mmi + (p18) STFD [YST1] = f102, 1 * SIZE + (p18) STFD [YST2] = f114, 1 * SIZE + tbit.nz p15, p0 = M, 0 + } + { .mmi + (p14) LDFD f32 = [AO1], 1 * SIZE + (p14) LDFD f80 = [YLD1], 1 * SIZE + } + ;; + { .mmi + (p18) STFD [YST1] = f105, 1 * SIZE + (p18) STFD [YST2] = f117, 1 * SIZE + } + { .mmi + (p14) LDFD f33 = [AO1], 1 * SIZE + (p14) LDFD f81 = [YLD1], 1 * SIZE + } + ;; + { .mmi + (p18) STFD [YST1] = f108, 1 * SIZE + (p18) STFD [YST2] = f120, 1 * SIZE + } + { .mmi + (p14) LDFD f34 = [AO1], 1 * SIZE + (p14) LDFD f82 = [YLD1], 1 * SIZE + } + ;; + { .mmi + (p18) STFD [YST1] = f111, 5 * SIZE + (p18) STFD [YST2] = f123, 5 * SIZE + } + { .mmi + (p14) LDFD f35 = [AO1], 1 * SIZE + (p14) LDFD f83 = [YLD1], 1 * SIZE + } + ;; + (p15) LDFD f36 = [AO1], 1 * SIZE + (p15) LDFD f84 = [YLD1], 1 * SIZE + ;; + (p15) LDFD f37 = [AO1], 1 * SIZE + (p15) LDFD f85 = [YLD1], 1 * SIZE + ;; + (p14) LDFD f38 = [AO2], 1 * SIZE + (p14) LDFD f44 = [AO3], 1 * SIZE + ;; + + (p14) LDFD f39 = [AO2], 1 * SIZE + (p14) LDFD f45 = [AO3], 1 * SIZE + (p14) FMA f80 = f8, f32, f80 + ;; + (p14) LDFD f40 = [AO2], 1 * SIZE + (p14) LDFD f46 = [AO3], 1 * SIZE + (p14) FMA f81 = f9, f32, f81 + ;; + (p14) LDFD f41 = [AO2], 1 * SIZE + (p14) LDFD f47 = [AO3], 1 * SIZE + (p14) FMA f82 = f8, f34, f82 + ;; + (p15) LDFD f42 = [AO2], 1 * SIZE + (p15) LDFD f48 = [AO3], 1 * SIZE + (p14) FMA f83 = f9, f34, f83 + ;; + (p15) LDFD f43 = [AO2], 1 * SIZE + (p15) LDFD f49 = [AO3], 1 * SIZE + (p15) FMA f84 = f8, f36, f84 + ;; + (p14) LDFD f50 = [AO4], 1 * SIZE + (p15) FMA f85 = f9, f36, f85 + ;; + (p14) LDFD f51 = [AO4], 1 * SIZE + (p14) ADD3 f80 = f9, f33, f80 + ;; + (p14) LDFD f52 = [AO4], 1 * SIZE + (p14) ADD4 f81 = f8, f33, f81 + ;; + (p14) LDFD f53 = [AO4], 1 * SIZE + (p14) ADD3 f82 = f9, f35, f82 + ;; + (p15) LDFD f54 = [AO4], 1 * SIZE + (p14) ADD4 f83 = f8, f35, f83 + ;; + (p15) LDFD f55 = [AO4], 1 * SIZE + (p15) ADD3 f84 = f9, f37, f84 + (p15) ADD4 f85 = f8, f37, f85 + ;; + (p14) FMA f80 = f10, f38, f80 + (p14) FMA f81 = f11, f38, f81 + (p14) FMA f82 = f10, f40, f82 + (p14) FMA f83 = f11, f40, f83 + (p15) FMA f84 = f10, f42, f84 + (p15) FMA f85 = f11, f42, f85 + ;; + (p14) ADD3 f80 = f11, f39, f80 + (p14) ADD4 f81 = f10, f39, f81 + (p14) ADD3 f82 = f11, f41, f82 + (p14) ADD4 f83 = f10, f41, f83 + (p15) ADD3 f84 = f11, f43, f84 + (p15) ADD4 f85 = f10, f43, f85 + ;; + (p14) FMA f80 = f12, f44, f80 + (p14) FMA f81 = f13, f44, f81 + (p14) FMA f82 = f12, f46, f82 + (p14) FMA f83 = f13, f46, f83 + (p15) FMA f84 = f12, f48, f84 + (p15) FMA f85 = f13, f48, f85 + ;; + (p14) ADD3 f80 = f13, f45, f80 + (p14) ADD4 f81 = f12, f45, f81 + (p14) ADD3 f82 = f13, f47, f82 + (p14) ADD4 f83 = f12, f47, f83 + (p15) ADD3 f84 = f13, f49, f84 
+ (p15) ADD4 f85 = f12, f49, f85 + ;; + (p14) FMA f80 = f14, f50, f80 + (p14) FMA f81 = f15, f50, f81 + (p14) FMA f82 = f14, f52, f82 + (p14) FMA f83 = f15, f52, f83 + (p15) FMA f84 = f14, f54, f84 + (p15) FMA f85 = f15, f54, f85 + ;; + (p14) ADD3 f80 = f15, f51, f80 + (p14) ADD4 f81 = f14, f51, f81 + (p14) ADD3 f82 = f15, f53, f82 + (p14) ADD4 f83 = f14, f53, f83 + (p15) ADD3 f84 = f15, f55, f84 + (p15) ADD4 f85 = f14, f55, f85 + ;; + (p14) STFD [YST1] = f80, 1 * SIZE + ;; + (p14) STFD [YST1] = f81, 1 * SIZE + ;; + (p14) STFD [YST1] = f82, 1 * SIZE + ;; + (p14) STFD [YST1] = f83, 1 * SIZE + ;; + (p15) STFD [YST1] = f84, 1 * SIZE + ;; + (p15) STFD [YST1] = f85, 1 * SIZE + ;; + +.L30: + { .mmi + mov YLD1 = YY + adds YLD2 = 4 * SIZE, YY + tbit.z p6, p0 = N, 1 + } + ;; + { .mmb + mov YST1 = YY + adds YST2 = 4 * SIZE, YY + (p6) br.cond.dpnt .L40 + } + ;; + LDFD f32 = [X], SIZE + mov AO1 = A + mov pr.rot= 0 + ;; + LDFD f33 = [X], INCXM1 + add AO2 = A, LDA + shr I = M, 2 + ;; + LDFD f34 = [X], SIZE + shladd A = LDA, 1, A + tbit.nz p14, p0 = M, 1 + ;; + LDFD f35 = [X], INCXM1 + cmp.eq p6, p0 = 0, I + ;; + FMPY f8 = ALPHA_R, f32 + adds AO9 = 4 * SIZE, AO1 + FMPY f9 = ALPHA_I, f32 + adds AO10 = 4 * SIZE, AO2 + FMPY f10 = ALPHA_R, f34 + mov ar.ec= 2 + FMPY f11 = ALPHA_I, f34 + ;; + adds PREB = RPREFETCH * SIZE, YLD1 + adds I = -1, I + ADD1 f8 = ALPHA_I, f33, f8 + adds RPRE1 = RPREFETCH * SIZE, AO1 + ADD2 f9 = ALPHA_R, f33, f9 + adds RPRE2 = (RPREFETCH + 8) * SIZE, AO2 + ADD1 f10 = ALPHA_I, f35, f10 + ADD2 f11 = ALPHA_R, f35, f11 + ;; + { .mib + cmp.eq p16, p0 = r0, r0 + mov ar.lc = I + (p6) br.cond.dpnt .L35 + } + ;; + .align 16 + +.L32: + { .mfi + (p17) LDFD f41 = [AO2], 1 * SIZE + (p17) FMA f101 = f8, f33, f101 + (p16) tbit.nz.unc p12, p13 = I, 0 + } + { .mfi + (p17) LDFD f45 = [AO10], 1 * SIZE + (p17) FMA f113 = f8, f37, f113 + } + ;; + { .mfi + (p17) LDFD f42 = [AO2], 1 * SIZE + (p17) FMA f104 = f9, f33, f104 + (p16) adds I = -1, I + } + { .mfi + (p17) LDFD f46 = [AO10], 1 * SIZE + (p17) FMA f116 = f9, f37, f116 + } + ;; + { .mmf + (p18) STFD [YST1] = f102, 1 * SIZE + (p18) STFD [YST2] = f114, 1 * SIZE + (p17) FMA f107 = f8, f35, f107 + } + { .mmf + (p17) LDFD f43 = [AO2], 1 * SIZE + (p17) LDFD f47 = [AO10], 1 * SIZE + (p17) FMA f119 = f8, f39, f119 + } + ;; + { .mmf + (p18) STFD [YST1] = f105, 1 * SIZE + (p18) STFD [YST2] = f117, 1 * SIZE + (p17) FMA f110 = f9, f35, f110 + } + { .mmf + (p17) LDFD f44 = [AO2], 5 * SIZE + (p17) LDFD f48 = [AO10], 5 * SIZE + (p17) FMA f122 = f9, f39, f122 + } + ;; + { .mfi + (p12) lfetch.excl.nt2 [PREB], 16 * SIZE + (p17) ADD3 f101 = f9, f34, f101 + } + { .mfi + (p17) ADD3 f113 = f9, f38, f113 + } + ;; + { .mfi + (p16) LDFD f100 = [YLD1], 1 * SIZE + (p17) ADD4 f104 = f8, f34, f104 + } + { .mfi + (p16) LDFD f112 = [YLD2], 1 * SIZE + (p17) ADD4 f116 = f8, f38, f116 + } + ;; + { .mfi + (p16) LDFD f103 = [YLD1], 1 * SIZE + (p17) ADD3 f107 = f9, f36, f107 + } + { .mfi + (p16) LDFD f115 = [YLD2], 1 * SIZE + (p17) ADD3 f119 = f9, f40, f119 + } + ;; + { .mfi + (p12) PREFETCH [RPRE1], 16 * SIZE + (p17) ADD4 f110 = f8, f36, f110 + } + { .mfi + (p17) ADD4 f122 = f8, f40, f122 + } + ;; + { .mfi + (p16) LDFD f32 = [AO1], 1 * SIZE + (p17) FMA f101 = f10, f41, f101 + } + { .mfi + (p16) LDFD f36 = [AO9], 1 * SIZE + (p17) FMA f113 = f10, f45, f113 + } + ;; + { .mmf + (p18) STFD [YST1] = f108, 1 * SIZE + (p18) STFD [YST2] = f120, 1 * SIZE + (p17) FMA f104 = f11, f41, f104 + } + { .mmf + (p16) LDFD f33 = [AO1], 1 * SIZE + (p16) LDFD f37 = [AO9], 1 * SIZE + (p17) FMA f116 = f11, f45, 
f116 + } + ;; + { .mmf + (p18) STFD [YST1] = f111, 5 * SIZE + (p18) STFD [YST2] = f123, 5 * SIZE + (p17) FMA f107 = f10, f43, f107 + } + { .mmf + (p16) LDFD f34 = [AO1], 1 * SIZE + (p16) LDFD f38 = [AO9], 1 * SIZE + (p17) FMA f119 = f10, f47, f119 + } + ;; + { .mfi + (p16) LDFD f35 = [AO1], 5 * SIZE + (p17) FMA f110 = f11, f43, f110 + } + { .mfi + (p16) LDFD f39 = [AO9], 5 * SIZE + (p17) FMA f122 = f11, f47, f122 + } + ;; + { .mfi + (p16) LDFD f106 = [YLD1], 1 * SIZE + (p17) ADD3 f101 = f11, f42, f101 + } + { .mfi + (p16) LDFD f118 = [YLD2], 1 * SIZE + (p17) ADD3 f113 = f11, f46, f113 + } + ;; + { .mfi + (p16) LDFD f109 = [YLD1], 5 * SIZE + (p17) ADD4 f104 = f10, f42, f104 + } + { .mfi + (p16) LDFD f121 = [YLD2], 5 * SIZE + (p17) ADD4 f116 = f10, f46, f116 + } + ;; + { .mfi + (p17) ADD3 f107 = f11, f44, f107 + } + { .mfi + (p17) ADD3 f119 = f11, f48, f119 + } + ;; + { .mfi + (p13) PREFETCH [RPRE2], 16 * SIZE + (p17) ADD4 f110 = f10, f44, f110 + } + { .mfb + (p17) ADD4 f122 = f10, f48, f122 + br.ctop.sptk.few .L32 + } + ;; + .align 16 + +.L35: + { .mmi + (p18) STFD [YST1] = f102, 1 * SIZE + (p18) STFD [YST2] = f114, 1 * SIZE + tbit.nz p15, p0 = M, 0 + } + { .mmi + (p14) LDFD f32 = [AO1], 1 * SIZE + (p14) LDFD f80 = [YLD1], 1 * SIZE + } + ;; + { .mmi + (p18) STFD [YST1] = f105, 1 * SIZE + (p18) STFD [YST2] = f117, 1 * SIZE + } + { .mmi + (p14) LDFD f33 = [AO1], 1 * SIZE + (p14) LDFD f81 = [YLD1], 1 * SIZE + } + ;; + { .mmi + (p18) STFD [YST1] = f108, 1 * SIZE + (p18) STFD [YST2] = f120, 1 * SIZE + } + { .mmi + (p14) LDFD f34 = [AO1], 1 * SIZE + (p14) LDFD f82 = [YLD1], 1 * SIZE + } + ;; + { .mmi + (p18) STFD [YST1] = f111, 5 * SIZE + (p18) STFD [YST2] = f123, 5 * SIZE + } + { .mmi + (p14) LDFD f35 = [AO1], 1 * SIZE + (p14) LDFD f83 = [YLD1], 1 * SIZE + } + ;; + (p15) LDFD f36 = [AO1], 1 * SIZE + (p15) LDFD f84 = [YLD1], 1 * SIZE + ;; + (p15) LDFD f37 = [AO1], 1 * SIZE + (p15) LDFD f85 = [YLD1], 1 * SIZE + ;; + (p14) LDFD f38 = [AO2], 1 * SIZE + (p14) FMA f80 = f8, f32, f80 + ;; + (p14) LDFD f39 = [AO2], 1 * SIZE + (p14) FMA f81 = f9, f32, f81 + ;; + (p14) LDFD f40 = [AO2], 1 * SIZE + (p14) FMA f82 = f8, f34, f82 + ;; + (p14) LDFD f41 = [AO2], 1 * SIZE + (p14) FMA f83 = f9, f34, f83 + ;; + (p15) LDFD f42 = [AO2], 1 * SIZE + (p15) FMA f84 = f8, f36, f84 + ;; + (p15) LDFD f43 = [AO2], 1 * SIZE + (p15) FMA f85 = f9, f36, f85 + ;; + (p14) ADD3 f80 = f9, f33, f80 + (p14) ADD4 f81 = f8, f33, f81 + (p14) ADD3 f82 = f9, f35, f82 + (p14) ADD4 f83 = f8, f35, f83 + (p15) ADD3 f84 = f9, f37, f84 + (p15) ADD4 f85 = f8, f37, f85 + ;; + (p14) FMA f80 = f10, f38, f80 + (p14) FMA f81 = f11, f38, f81 + (p14) FMA f82 = f10, f40, f82 + (p14) FMA f83 = f11, f40, f83 + (p15) FMA f84 = f10, f42, f84 + (p15) FMA f85 = f11, f42, f85 + ;; + (p14) ADD3 f80 = f11, f39, f80 + (p14) ADD4 f81 = f10, f39, f81 + (p14) ADD3 f82 = f11, f41, f82 + (p14) ADD4 f83 = f10, f41, f83 + (p15) ADD3 f84 = f11, f43, f84 + (p15) ADD4 f85 = f10, f43, f85 + ;; + (p14) STFD [YST1] = f80, 1 * SIZE + ;; + (p14) STFD [YST1] = f81, 1 * SIZE + ;; + (p14) STFD [YST1] = f82, 1 * SIZE + ;; + (p14) STFD [YST1] = f83, 1 * SIZE + ;; + (p15) STFD [YST1] = f84, 1 * SIZE + ;; + (p15) STFD [YST1] = f85, 1 * SIZE + ;; + +.L40: + { .mmi + mov YLD1 = YY + adds YLD2 = 4 * SIZE, YY + tbit.z p6, p0 = N, 0 + } + { .mmb + mov YST1 = YY + adds YST2 = 4 * SIZE, YY + (p6) br.cond.dpnt .L990 + } + ;; + LDFD f32 = [X], SIZE + mov AO1 = A + adds AO9 = 4 * SIZE, A + ;; + LDFD f33 = [X], INCXM1 + add A = A, LDA + mov pr.rot= 0 + ;; + { .mfi + adds PREB = RPREFETCH * SIZE, 
YLD1 + FMPY f8 = ALPHA_R, f32 + mov ar.ec= 2 + } + { .mfi + adds RPRE1 = RPREFETCH * SIZE, AO1 + FMPY f9 = ALPHA_I, f32 + shr I = M, 2 + } + ;; + { .mmf + cmp.eq p6, p0 = 0, I + cmp.eq p16, p0 = r0, r0 + ADD1 f8 = ALPHA_I, f33, f8 + } + { .mfi + adds I = -1, I + ADD2 f9 = ALPHA_R, f33, f9 + tbit.nz p14, p0 = M, 1 + } + ;; + { .mib + nop __LINE__ + mov ar.lc = I + (p6) br.cond.dpnt .L45 + } + ;; + .align 16 + +.L42: + { .mmf + (p16) LDFD f100 = [YLD1], 1 * SIZE + (p16) LDFD f112 = [YLD2], 1 * SIZE + (p17) FMA f101 = f8, f33, f101 + } + { .mmf + (p16) LDFD f32 = [AO1], 1 * SIZE + (p16) LDFD f44 = [AO9], 1 * SIZE + (p17) FMA f113 = f8, f45, f113 + } + ;; + { .mmf + (p16) LDFD f103 = [YLD1], 1 * SIZE + (p16) LDFD f115 = [YLD2], 1 * SIZE + (p17) FMA f104 = f9, f33, f104 + } + { .mmf + (p16) LDFD f35 = [AO1], 1 * SIZE + (p16) LDFD f47 = [AO9], 1 * SIZE + (p17) FMA f116 = f9, f45, f116 + } + ;; + { .mmf + (p16) LDFD f106 = [YLD1], 1 * SIZE + (p16) LDFD f118 = [YLD2], 1 * SIZE + (p17) FMA f107 = f8, f39, f107 + } + { .mmf + (p16) LDFD f38 = [AO1], 1 * SIZE + (p16) LDFD f50 = [AO9], 1 * SIZE + (p17) FMA f119 = f8, f51, f119 + } + ;; + { .mmf + (p16) LDFD f109 = [YLD1], 5 * SIZE + (p16) LDFD f121 = [YLD2], 5 * SIZE + (p17) FMA f110 = f9, f39, f110 + } + { .mmf + (p16) LDFD f41 = [AO1], 5 * SIZE + (p16) LDFD f53 = [AO9], 5 * SIZE + (p17) FMA f122 = f9, f51, f122 + } + ;; + { .mmf + (p18) STFD [YST1] = f102, 1 * SIZE + (p18) STFD [YST2] = f114, 1 * SIZE + (p17) ADD3 f101 = f9, f36, f101 + } + { .mfi + (p17) ADD3 f113 = f9, f48, f113 + (p16) tbit.nz.unc p12, p13 = I, 0 + } + ;; + { .mmf + (p18) STFD [YST1] = f105, 1 * SIZE + (p18) STFD [YST2] = f117, 1 * SIZE + (p17) ADD4 f104 = f8, f36, f104 + } + { .mfi + (p12) PREFETCH [RPRE1], 16 * SIZE + (p17) ADD4 f116 = f8, f48, f116 + } + ;; + { .mmf + (p18) STFD [YST1] = f108, 1 * SIZE + (p18) STFD [YST2] = f120, 1 * SIZE + (p17) ADD3 f107 = f9, f42, f107 + } + { .mfi + (p13) lfetch.excl.nt2 [PREB], 16 * SIZE + (p17) ADD3 f119 = f9, f54, f119 + } + ;; + { .mmf + (p18) STFD [YST1] = f111, 5 * SIZE + (p18) STFD [YST2] = f123, 5 * SIZE + (p17) ADD4 f110 = f8, f42, f110 + } + { .mfb + (p17) ADD4 f122 = f8, f54, f122 + br.ctop.sptk.few .L42 + } + ;; + .align 16 + +.L45: + { .mmi + (p18) STFD [YST1] = f102, 1 * SIZE + (p18) STFD [YST2] = f114, 1 * SIZE + tbit.nz p15, p0 = M, 0 + } + { .mmi + (p14) LDFD f32 = [AO1], 1 * SIZE + (p14) LDFD f80 = [YLD1], 1 * SIZE + } + ;; + { .mmi + (p18) STFD [YST1] = f105, 1 * SIZE + (p18) STFD [YST2] = f117, 1 * SIZE + } + { .mmi + (p14) LDFD f33 = [AO1], 1 * SIZE + (p14) LDFD f81 = [YLD1], 1 * SIZE + } + ;; + { .mmi + (p18) STFD [YST1] = f108, 1 * SIZE + (p18) STFD [YST2] = f120, 1 * SIZE + } + { .mmi + (p14) LDFD f34 = [AO1], 1 * SIZE + (p14) LDFD f82 = [YLD1], 1 * SIZE + } + ;; + { .mmi + (p18) STFD [YST1] = f111, 5 * SIZE + (p18) STFD [YST2] = f123, 5 * SIZE + } + { .mmi + (p14) LDFD f35 = [AO1], 1 * SIZE + (p14) LDFD f83 = [YLD1], 1 * SIZE + } + ;; + (p15) LDFD f36 = [AO1], 1 * SIZE + (p15) LDFD f84 = [YLD1], 1 * SIZE + ;; + (p15) LDFD f37 = [AO1], 1 * SIZE + (p15) LDFD f85 = [YLD1], 1 * SIZE + ;; + (p14) FMA f80 = f8, f32, f80 + (p14) FMA f81 = f9, f32, f81 + (p14) FMA f82 = f8, f34, f82 + (p14) FMA f83 = f9, f34, f83 + (p15) FMA f84 = f8, f36, f84 + (p15) FMA f85 = f9, f36, f85 + ;; + (p14) ADD3 f80 = f9, f33, f80 + (p14) ADD4 f81 = f8, f33, f81 + (p14) ADD3 f82 = f9, f35, f82 + (p14) ADD4 f83 = f8, f35, f83 + (p15) ADD3 f84 = f9, f37, f84 + (p15) ADD4 f85 = f8, f37, f85 + ;; + (p14) STFD [YST1] = f80, 1 * SIZE + ;; + (p14) 
STFD [YST1] = f81, 1 * SIZE + ;; + (p14) STFD [YST1] = f82, 1 * SIZE + ;; + (p14) STFD [YST1] = f83, 1 * SIZE + ;; + (p15) STFD [YST1] = f84, 1 * SIZE + ;; + (p15) STFD [YST1] = f85, 1 * SIZE + ;; + +.L990: + { .mmi + mov YST1 = Y + mov YST2 = Y + mov pr.rot= 0 + } + { .mib + mov YLD1 = YY + shr J = M, 2 + (p10) br.cond.dptk .L999 + } + ;; + { .mmi + cmp.eq p6, p0 = r0, J + adds INCY = - SIZE, INCY + mov ar.ec = 4 + } + { .mmi + cmp.eq p16, p0 = r0, r0 + adds J = -1, J + tbit.nz p13, p0 = M, 1 + } + ;; + { .mib + nop __LINE__ + mov ar.lc = J + (p6) br.cond.dpnt .L995 + } + ;; +.L992: + { .mfi + (p19) STFD [YST2] = f35, 1 * SIZE + (p18) FADD f34 = f34, f66 + } + { .mmi + (p16) LDFD f64 = [YLD1], 1 * SIZE + (p16) LDFD f32 = [YST1], 1 * SIZE + } + ;; + { .mfi + (p19) STFD [YST2] = f39 + (p18) FADD f38 = f38, f70 + (p19) add YST2 = YST2, INCY + } + { .mmi + (p16) LDFD f68 = [YLD1], 1 * SIZE + (p16) LDFD f36 = [YST1], INCY + } + ;; + { .mfi + (p19) STFD [YST2] = f43, 1 * SIZE + (p18) FADD f42 = f42, f74 + } + { .mmi + (p16) LDFD f72 = [YLD1], 1 * SIZE + (p16) LDFD f40 = [YST1], 1 * SIZE + } + ;; + { .mfi + (p19) STFD [YST2] = f47 + (p18) FADD f50 = f50, f82 + (p19) add YST2 = YST2, INCY + } + { .mmi + (p16) LDFD f76 = [YLD1], 1 * SIZE + (p16) LDFD f44 = [YST1], INCY + } + ;; + { .mfi + (p19) STFD [YST2] = f51, 1 * SIZE + (p18) FADD f54 = f54, f86 + } + { .mmi + (p16) LDFD f80 = [YLD1], 1 * SIZE + (p16) LDFD f48 = [YST1], 1 * SIZE + } + ;; + { .mfi + (p19) STFD [YST2] = f55 + (p18) FADD f58 = f58, f90 + (p19) add YST2 = YST2, INCY + } + { .mmi + (p16) LDFD f84 = [YLD1], 1 * SIZE + (p16) LDFD f52 = [YST1], INCY + } + ;; + { .mfi + (p19) STFD [YST2] = f59, 1 * SIZE + (p18) FADD f46 = f46, f78 + } + { .mmi + (p16) LDFD f88 = [YLD1], 1 * SIZE + (p16) LDFD f56 = [YST1], 1 * SIZE + } + ;; + { .mfi + (p19) STFD [YST2] = f63 + (p18) FADD f62 = f62, f94 + (p19) add YST2 = YST2, INCY + } + { .mmb + (p16) LDFD f92 = [YLD1], 1 * SIZE + (p16) LDFD f60 = [YST1], INCY + br.ctop.sptk.few .L992 + } + ;; + +.L995: + (p13) LDFD f32 = [YST1], 1 * SIZE + (p13) LDFD f40 = [YLD1], 1 * SIZE + tbit.nz p14, p0 = M, 0 + ;; + (p13) LDFD f33 = [YST1], INCY + (p13) LDFD f41 = [YLD1], 1 * SIZE + ;; + (p13) LDFD f34 = [YST1], 1 * SIZE + (p13) LDFD f42 = [YLD1], 1 * SIZE + ;; + (p13) LDFD f35 = [YST1], INCY + (p13) LDFD f43 = [YLD1], 1 * SIZE + ;; + (p14) LDFD f36 = [YST1], 1 * SIZE + (p14) LDFD f44 = [YLD1], 1 * SIZE + ;; + (p14) LDFD f37 = [YST1], INCY + (p14) LDFD f45 = [YLD1], 1 * SIZE + ;; + (p13) FADD f32 = f32, f40 + (p13) FADD f33 = f33, f41 + (p13) FADD f34 = f34, f42 + (p13) FADD f35 = f35, f43 + (p14) FADD f36 = f36, f44 + (p14) FADD f37 = f37, f45 + ;; + (p13) STFD [YST2] = f32, 1 * SIZE + ;; + (p13) STFD [YST2] = f33 + (p13) add YST2 = YST2, INCY + ;; + (p13) STFD [YST2] = f34, 1 * SIZE + ;; + (p13) STFD [YST2] = f35 + (p13) add YST2 = YST2, INCY + ;; + (p14) STFD [YST2] = f36, 1 * SIZE + ;; + (p14) STFD [YST2] = f37 + ;; + +.L999: + mov r8 = r0 + adds r9 = 1 * 16, SP + ;; + ldf.fill f16 = [SP], 32 + ldf.fill f17 = [r9], 32 + mov ar.lc = ARLC + ;; + ldf.fill f18 = [SP], 32 + ldf.fill f19 = [r9], 32 + mov pr = PR, -1 + ;; + ldf.fill f20 = [SP], 32 + ldf.fill f21 = [r9], 32 + mov ar.pfs = ARPFS + ;; + ldf.fill f22 = [SP], 32 + ldf.fill f23 = [r9] + br.ret.sptk.many b0 + ;; + EPILOGUE diff --git a/kernel/ia64/zgemv_t.S b/kernel/ia64/zgemv_t.S new file mode 100644 index 0000000000..73e6df04bd --- /dev/null +++ b/kernel/ia64/zgemv_t.S @@ -0,0 +1,2017 @@ 
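+/* Overview (informal, inferred from the code below): X is first packed   */
+/* into BUFFER; the main loops then accumulate the transposed complex     */
+/* GEMV update y := y + alpha * A^T * x, eight columns of A per pass,     */
+/* in the f8..f23 accumulators.  The CONJ/XCONJ combinations select the   */
+/* ADD1..ADD4 macros (FMA vs. FNMA) to obtain the conjugated variants,    */
+/* and the .L100 path repeats the loops with single LDFD loads when A is  */
+/* not aligned for paired LDFPD loads.                                    */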
+/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define SP r12 + +#define M r32 +#define N r33 +#define A r37 +#define LDA r38 +#define X r39 +#define INCX r34 +#define Y r35 +#define INCY r36 +#define BUFFER r11 + +#define I r15 +#define J r16 +#define AO1 r18 +#define AO2 r19 +#define AO3 r20 +#define AO4 r21 +#define AO5 r22 +#define AO6 r23 +#define AO7 r24 +#define AO8 r25 +#define BO r26 +#define INCYM1 r28 + +#define RPRE1 loc0 +#define RPRE2 loc1 +#define RPRE3 loc2 +#define RPRE4 loc3 +#define RPRE5 loc4 +#define RPRE6 loc5 +#define RPRE7 loc6 +#define RPRE8 loc7 + +#define AO21 loc8 +#define AO41 loc9 +#define AO61 loc10 +#define AO81 loc11 +#define CLD1 loc12 +#define CLD2 loc13 +#define CST1 loc14 +#define CST2 loc15 + +#define PREB r8 +#define WPRE r9 +#define OFFSET PREB +#define INCX3M1 WPRE +#define INCY3M1 r10 + +#define ARLC r29 +#define PR r30 +#define ARPFS r31 + +#ifdef DOUBLE +#define RPREFETCH (16 * 2 + 8) +#else +#define RPREFETCH (16 * 2 + 16) +#endif +#define PREFETCH lfetch.nt1 + +#define ALPHA_R f6 +#define ALPHA_I f7 + +#if !defined(CONJ) && !defined(XCONJ) +#define ADD1 FMA +#define ADD2 FMA +#define ADD3 FNMA +#define ADD4 FMA +#elif defined(CONJ) && !defined(XCONJ) +#define ADD1 FMA +#define ADD2 FMA +#define ADD3 FMA +#define ADD4 FNMA +#elif !defined(CONJ) && defined(XCONJ) +#define ADD1 FMA +#define ADD2 FNMA +#define ADD3 FMA +#define ADD4 FMA +#else +#define ADD1 FMA +#define ADD2 FNMA +#define ADD3 FNMA +#define ADD4 FNMA +#endif + + PROLOGUE + PROFCODE + .prologue + + { .mmi + .save ar.pfs, ARPFS + alloc ARPFS = ar.pfs, 8, 16, 0, 0 + adds r14 = 16, SP + mov ARLC = ar.lc + } + { .mmi + adds r8 = -8 * 16, SP + adds r9 = -7 * 16, SP + adds SP = -8 * 16, SP + } + ;; + { .mmi + stf.spill [r8] = f16, 32 + stf.spill [r9] = f17, 32 + mov PR = pr + } + ;; + { .mmi + stf.spill [r8] = f18, 32 + stf.spill [r9] = f19, 32 + adds r15 = 152, SP + } + ;; + { .mmi + stf.spill [r8] = f20, 32 + stf.spill [r9] = f21, 32 + adds r16 = 160, SP + } + ;; + { .mmi + stf.spill [r8] = f22 + stf.spill [r9] = f23 + adds r17 = 168, SP + } + .body + ;; + { .mmf + ld8 INCX = [r14] + ld8 Y = [r15] + mov ALPHA_R = f8 + } + { .mmf + ld8 INCY = [r16] + ld8 BUFFER = [r17] + mov ALPHA_I = f9 + } + ;; + { .mmi + shladd INCX = INCX, ZBASE_SHIFT, r0 + shladd LDA = LDA, ZBASE_SHIFT, r0 + mov pr.rot= 0 + } + { .mmi + cmp.ge p7, p0 = 0, M + cmp.ge p6, p0 = 0, N + shladd INCY = INCY, ZBASE_SHIFT, r0 + } + ;; + { .mmi + mov AO1 = BUFFER + adds OFFSET = -SIZE, INCX + shr I = M, 3 + } + { .mib + adds INCYM1 = - SIZE, INCY + shladd INCX3M1 = INCX, 1, INCX + (p7) br.cond.dpnt .L999 + } + ;; + { .mmi + shladd BO = INCX, 1, X + adds AO2 = 4 * SIZE, BUFFER + mov ar.ec= 5 + } + { .mmb + shladd INCY3M1 = INCY, 1, INCYM1 + adds I = -1, I + (p6) br.cond.dpnt .L999 + } + ;; + { .mmi + adds INCX3M1 = -SIZE, INCX3M1 + cmp.eq p16, p0 = r0, r0 + tbit.nz p13, p0 = M, 2 + } + { .mib + cmp.gt p6, p0 = 0, I + mov ar.lc = I + (p6) br.cond.dpnt .L05 + } + ;; + .align 16 + +.L01: + (p20) STFD [AO1] = f36, SIZE + (p20) STFD [AO2] = f56, SIZE + (p16) LDFD f32 = [X], SIZE + (p16) LDFD f52 = [BO], SIZE + ;; + (p20) STFD [AO1] = f41, SIZE + (p20) STFD [AO2] = f61, SIZE + (p16) LDFD f37 = [X], OFFSET + (p16) LDFD f57 = [BO], OFFSET + ;; + (p20) STFD [AO1] = f46, SIZE + (p20) STFD [AO2] = f66, SIZE + (p16) LDFD f42 = [X], SIZE + (p16) LDFD f62 = [BO], SIZE + ;; + (p20) STFD [AO1] = f51, 5 * SIZE + (p20) STFD [AO2] = f71, 5 * SIZE + (p16) LDFD f47 = [X], 
INCX3M1 + (p16) LDFD f67 = [BO], INCX3M1 + ;; + (p20) STFD [AO1] = f76, SIZE + (p20) STFD [AO2] = f96, SIZE + (p16) LDFD f72 = [X], SIZE + (p16) LDFD f92 = [BO], SIZE + ;; + (p20) STFD [AO1] = f81, SIZE + (p20) STFD [AO2] = f101, SIZE + (p16) LDFD f77 = [X], OFFSET + (p16) LDFD f97 = [BO], OFFSET + ;; + (p20) STFD [AO1] = f86, SIZE + (p20) STFD [AO2] = f106, SIZE + (p16) LDFD f82 = [X], SIZE + (p16) LDFD f102 = [BO], SIZE + ;; + (p20) STFD [AO1] = f91, 5 * SIZE + (p20) STFD [AO2] = f111, 5 * SIZE + (p16) LDFD f87 = [X], INCX3M1 + (p16) LDFD f107 = [BO], INCX3M1 + br.ctop.sptk.few .L01 + ;; + .align 16 + +.L05: + { .mmi + (p13) LDFD f32 = [X], SIZE + (p13) LDFD f36 = [BO], SIZE + tbit.nz p14, p0 = M, 1 + } + ;; + { .mmi + (p13) LDFD f33 = [X], OFFSET + (p13) LDFD f37 = [BO], OFFSET + tbit.nz p15, p0 = M, 0 + } + ;; + { .mmb + (p13) LDFD f34 = [X], SIZE + (p13) LDFD f38 = [BO], SIZE + } + ;; + { .mmi + (p13) LDFD f35 = [X], INCX3M1 + (p13) LDFD f39 = [BO], INCX3M1 + } + ;; + { .mmi + (p14) LDFD f40 = [X], SIZE + } + ;; + (p14) LDFD f41 = [X], OFFSET + (p13) STFD [AO1] = f32, SIZE + tbit.nz p8, p0 = A, BASE_SHIFT + ;; + (p14) LDFD f42 = [X], SIZE + (p13) STFD [AO2] = f36, SIZE + ;; + (p14) LDFD f43 = [X], OFFSET + (p13) STFD [AO1] = f33, SIZE + ;; + (p15) LDFD f44 = [X], SIZE + (p13) STFD [AO2] = f37, SIZE + ;; + (p15) LDFD f45 = [X], OFFSET + (p13) STFD [AO1] = f34, SIZE + (p13) STFD [AO2] = f38, SIZE + ;; + (p13) STFD [AO1] = f35, 5 * SIZE + (p13) STFD [AO2] = f39, 5 * SIZE + ;; + (p14) STFD [AO1] = f40, SIZE + ;; + (p14) STFD [AO1] = f41, SIZE + ;; + (p14) STFD [AO1] = f42, SIZE + ;; + (p14) STFD [AO1] = f43, SIZE + ;; + (p15) STFD [AO1] = f44, SIZE + ;; + (p15) STFD [AO1] = f45, SIZE + (p8) br.cond.dpnt .L100 + ;; + .align 16 + +.L10: + { .mmi + mov CLD1 = Y + shladd CLD2 = INCY, 1, Y + shr J = N, 3 + } + ;; + { .mmb + mov CST1 = Y + cmp.eq p6, p0 = r0, J + (p6) br.cond.dpnt .L20 + } + ;; + .align 16 + +.L11: + { .mfi + mov AO1 = A + mov f8 = f0 + mov pr.rot= 0 + } + { .mfi + add AO2 = LDA, A + mov f10 = f0 + mov BO = BUFFER + } + ;; + { .mmf + shladd AO3 = LDA, 1, A + shladd AO4 = LDA, 1, AO2 + mov f12 = f0 + } + { .mmf + adds RPRE1 = (RPREFETCH + 0) * SIZE, AO1 + adds RPRE2 = (RPREFETCH + 2) * SIZE, AO2 + mov f14 = f0 + } + ;; + { .mmf + shladd AO5 = LDA, 1, AO3 + shladd AO6 = LDA, 1, AO4 + mov f16 = f0 + } + { .mmf + adds RPRE3 = (RPREFETCH + 4) * SIZE, AO3 + adds RPRE4 = (RPREFETCH + 6) * SIZE, AO4 + mov f18 = f0 + } + ;; + { .mmf + shladd AO7 = LDA, 1, AO5 + shladd AO8 = LDA, 1, AO6 + mov f20 = f0 + } + { .mmf + adds RPRE5 = (RPREFETCH + 8) * SIZE, AO5 + adds RPRE6 = (RPREFETCH + 10) * SIZE, AO6 + mov f22 = f0 + } + ;; + { .mfi + shladd A = LDA, 3, A + mov f9 = f0 + mov ar.ec= 5 + } + { .mmf + adds RPRE7 = (RPREFETCH + 12) * SIZE, AO7 + adds RPRE8 = (RPREFETCH + 14) * SIZE, AO8 + mov f11 = f0 + } + ;; + { .mmf + adds WPRE = 16 * SIZE, CLD1 + adds PREB = RPREFETCH * SIZE, BO + mov f13 = f0 + } + { .mmf + adds I = -1, M + cmp.eq p16, p0 = r0, r0 + mov f15 = f0 + } + ;; + { .mfi + cmp.eq p12, p0 = r0, r0 + mov f17 = f0 + mov ar.lc = I + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f19 = f0 + } + ;; + { .mmf + lfetch.excl.nt1 [WPRE] + nop __LINE__ + mov f21 = f0 + } + { .mmf + mov I = 0 + nop __LINE__ + mov f23 = f0 + } + ;; + .align 16 + +.L16: + { .mmf + (p12) PREFETCH [RPRE1], 16 * SIZE + (p16) LDFPD f32, f37 = [AO1], 2 * SIZE + (p20) ADD1 f8 = f116, f36, f8 + } + { .mmf + (p16) cmp.eq.unc p13, p0 = 1, I + nop __LINE__ + (p20) ADD2 f9 = f121, f36, f9 + } + ;; + { .mmf + (p13) 
PREFETCH [PREB], 16 * SIZE + (p16) LDFPD f112, f117 = [BO], 2 * SIZE + (p20) ADD1 f10 = f116, f46, f10 + } + { .mmf + (p16) cmp.eq.unc p14, p0 = 2, I + (p16) cmp.eq.unc p15, p0 = 3, I + (p20) ADD2 f11 = f121, f46, f11 + } + ;; + { .mmf + (p16) LDFPD f42, f47 = [AO2], 2 * SIZE + nop __LINE__ + (p20) ADD1 f12 = f116, f56, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p20) ADD2 f13 = f121, f56, f13 + } + ;; + { .mmf + (p13) PREFETCH [RPRE2], 16 * SIZE + nop __LINE__ + (p20) ADD1 f14 = f116, f66, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p20) ADD2 f15 = f121, f66, f15 + } + ;; + { .mmf + (p16) LDFPD f52, f57 = [AO3], 2 * SIZE + nop __LINE__ + (p20) ADD3 f8 = f121, f41, f8 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p20) ADD4 f9 = f116, f41, f9 + } + ;; + { .mmf + (p14) PREFETCH [RPRE3], 16 * SIZE + nop __LINE__ + (p20) ADD3 f10 = f121, f51, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p20) ADD4 f11 = f116, f51, f11 + } + ;; + { .mmf + (p16) LDFPD f62, f67 = [AO4], 2 * SIZE + nop __LINE__ + (p20) ADD3 f12 = f121, f61, f12 + } + { .mmf + (p16) cmp.eq.unc p12, p0 = 4, I + (p16) cmp.eq.unc p13, p0 = 5, I + (p20) ADD4 f13 = f116, f61, f13 + } + ;; + { .mmf + (p15) PREFETCH [RPRE4], 16 * SIZE + nop __LINE__ + (p20) ADD3 f14 = f121, f71, f14 + } + { .mmf + (p16) cmp.eq.unc p14, p0 = 6, I + (p16) cmp.eq.unc p15, p0 = 7, I + (p20) ADD4 f15 = f116, f71, f15 + } + ;; + { .mmf + (p16) LDFPD f72, f77 = [AO5], 2 * SIZE + nop __LINE__ + (p20) ADD1 f16 = f116, f76, f16 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p20) ADD2 f17 = f121, f76, f17 + } + ;; + { .mmf + (p12) PREFETCH [RPRE5], 16 * SIZE + nop __LINE__ + (p20) ADD1 f18 = f116, f86, f18 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p20) ADD2 f19 = f121, f86, f19 + } + ;; + { .mmf + (p16) LDFPD f82, f87 = [AO6], 2 * SIZE + nop __LINE__ + (p20) ADD1 f20 = f116, f96, f20 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p20) ADD2 f21 = f121, f96, f21 + } + ;; + { .mmf + (p13) PREFETCH [RPRE6], 16 * SIZE + nop __LINE__ + (p20) ADD1 f22 = f116, f106, f22 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p20) ADD2 f23 = f121, f106, f23 + } + ;; + { .mmf + (p16) LDFPD f92, f97 = [AO7], 2 * SIZE + nop __LINE__ + (p20) ADD3 f16 = f121, f81, f16 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p20) ADD4 f17 = f116, f81, f17 + } + ;; + { .mmf + (p14) PREFETCH [RPRE7], 16 * SIZE + nop __LINE__ + (p20) ADD3 f18 = f121, f91, f18 + } + { .mmf + nop __LINE__ + (p16) adds I = 1, I + (p20) ADD4 f19 = f116, f91, f19 + } + ;; + { .mmf + (p16) LDFPD f102, f107 = [AO8], 2 * SIZE + nop __LINE__ + (p20) ADD3 f20 = f121, f101, f20 + } + { .mmf + (p15) mov I = 0 + nop __LINE__ + (p20) ADD4 f21 = f116, f101, f21 + } + ;; + { .mmf + (p15) PREFETCH [RPRE8], 16 * SIZE + nop __LINE__ + (p20) ADD3 f22 = f121, f111, f22 + } + { .mfb + (p16) cmp.eq.unc p12, p0 = 0, I + (p20) ADD4 f23 = f116, f111, f23 + br.ctop.sptk.few .L16 + } + ;; + +.L18: + LDFD f32 = [CLD1], SIZE + LDFD f36 = [CLD2], SIZE + shladd CST2 = INCY, 1, CST1 + ;; + LDFD f33 = [CLD1], INCYM1 + LDFD f37 = [CLD2], INCYM1 + ;; + LDFD f34 = [CLD1], SIZE + LDFD f38 = [CLD2], SIZE + ;; + LDFD f35 = [CLD1], INCY3M1 + LDFD f39 = [CLD2], INCY3M1 + ;; + LDFD f40 = [CLD1], SIZE + LDFD f44 = [CLD2], SIZE + ;; + LDFD f41 = [CLD1], INCYM1 + LDFD f45 = [CLD2], INCYM1 + ;; + LDFD f42 = [CLD1], SIZE + LDFD f46 = [CLD2], SIZE + ;; + LDFD f43 = [CLD1], INCY3M1 + LDFD f47 = [CLD2], INCY3M1 + ;; + FMA f32 = ALPHA_R, f8, f32 + FMA f36 = ALPHA_R, f12, f36 + FMA f33 = ALPHA_I, f8, f33 + FMA f37 = ALPHA_I, f12, f37 + FMA f34 = ALPHA_R, 
f10, f34 + FMA f38 = ALPHA_R, f14, f38 + FMA f35 = ALPHA_I, f10, f35 + FMA f39 = ALPHA_I, f14, f39 + ;; + FNMA f32 = ALPHA_I, f9, f32 + FNMA f36 = ALPHA_I, f13, f36 + FMA f33 = ALPHA_R, f9, f33 + FMA f37 = ALPHA_R, f13, f37 + FNMA f34 = ALPHA_I, f11, f34 + FNMA f38 = ALPHA_I, f15, f38 + FMA f35 = ALPHA_R, f11, f35 + FMA f39 = ALPHA_R, f15, f39 + ;; + FMA f40 = ALPHA_R, f16, f40 + FMA f44 = ALPHA_R, f20, f44 + FMA f41 = ALPHA_I, f16, f41 + FMA f45 = ALPHA_I, f20, f45 + FMA f42 = ALPHA_R, f18, f42 + FMA f46 = ALPHA_R, f22, f46 + FMA f43 = ALPHA_I, f18, f43 + FMA f47 = ALPHA_I, f22, f47 + ;; + { .mmf + STFD [CST1] = f32, SIZE + STFD [CST2] = f36, SIZE + FNMA f40 = ALPHA_I, f17, f40 + } + { .mmf + nop __LINE__ + nop __LINE__ + FNMA f44 = ALPHA_I, f21, f44 + } + ;; + { .mmf + STFD [CST1] = f33 + STFD [CST2] = f37 + FMA f41 = ALPHA_R, f17, f41 + } + { .mmf + add CST1 = CST1, INCYM1 + add CST2 = CST2, INCYM1 + FMA f45 = ALPHA_R, f21, f45 + } + ;; + { .mmf + STFD [CST1] = f34, SIZE + STFD [CST2] = f38, SIZE + FNMA f42 = ALPHA_I, f19, f42 + } + { .mmf + nop __LINE__ + nop __LINE__ + FNMA f46 = ALPHA_I, f23, f46 + } + ;; + { .mmf + STFD [CST1] = f35 + STFD [CST2] = f39 + FMA f43 = ALPHA_R, f19, f43 + } + { .mmf + add CST1 = CST1, INCY3M1 + add CST2 = CST2, INCY3M1 + FMA f47 = ALPHA_R, f23, f47 + } + ;; + { .mmi + STFD [CST1] = f40, SIZE + STFD [CST2] = f44, SIZE + adds J = -1, J + } + ;; + { .mmi + STFD [CST1] = f41 + STFD [CST2] = f45 + add CST1 = CST1, INCYM1 + } + { .mmi + nop __LINE__ + nop __LINE__ + add CST2 = CST2, INCYM1 + } + ;; + { .mmi + STFD [CST1] = f42, SIZE + STFD [CST2] = f46, SIZE + cmp.lt p6, p0 = 0, J + } + ;; + { .mmi + STFD [CST1] = f43 + STFD [CST2] = f47 + add CST1 = CST1, INCY3M1 + } + { .mmb + add CST2 = CST2, INCY3M1 + (p6) br.cond.dptk .L11 + } + ;; + .align 16 + +.L20: + { .mfi + mov AO1 = A + mov f8 = f0 + mov pr.rot= 0 + } + { .mfi + add AO2 = LDA, A + mov f10 = f0 + tbit.z p6, p0 = N, 2 + } + ;; + { .mmf + shladd AO3 = LDA, 1, A + shladd AO4 = LDA, 1, AO2 + mov f12 = f0 + } + { .mfb + mov BO = BUFFER + mov f14 = f0 + (p6) br.cond.dpnt .L30 + } + ;; + { .mfi + adds RPRE1 = (RPREFETCH + 0) * SIZE, AO1 + mov f9 = f0 + mov ar.ec= 5 + } + { .mmf + adds RPRE2 = (RPREFETCH + 2) * SIZE, AO2 + adds I = -1, M + mov f11 = f0 + } + ;; + { .mmf + adds RPRE3 = (RPREFETCH + 4) * SIZE, AO3 + adds RPRE4 = (RPREFETCH + 6) * SIZE, AO4 + mov f13 = f0 + } + { .mmf + cmp.eq p16, p0 = r0, r0 + shladd A = LDA, 2, A + mov f15 = f0 + } + ;; + { .mmi + lfetch.excl.nt1 [WPRE] + adds PREB = RPREFETCH * SIZE, BO + mov ar.lc = I + } + { .mmi + adds WPRE = 16 * SIZE, CLD1 + cmp.eq p12, p0 = r0, r0 + mov I = 0 + } + ;; + .align 16 + +.L26: + { .mmf + (p12) PREFETCH [RPRE1], 16 * SIZE + (p16) LDFPD f32, f37 = [AO1], 2 * SIZE + (p20) ADD1 f8 = f116, f36, f8 + } + { .mmf + (p16) cmp.eq.unc p13, p0 = 2, I + nop __LINE__ + (p20) ADD2 f9 = f121, f36, f9 + } + ;; + { .mmf + (p12) PREFETCH [PREB], 16 * SIZE + (p16) LDFPD f112, f117 = [BO], 2 * SIZE + (p20) ADD1 f10 = f116, f46, f10 + } + { .mmf + (p16) cmp.eq.unc p14, p0 = 4, I + (p16) cmp.eq.unc p15, p0 = 6, I + (p20) ADD2 f11 = f121, f46, f11 + } + ;; + { .mmf + (p16) LDFPD f42, f47 = [AO2], 2 * SIZE + nop __LINE__ + (p20) ADD1 f12 = f116, f56, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p20) ADD2 f13 = f121, f56, f13 + } + ;; + { .mmf + (p13) PREFETCH [RPRE2], 16 * SIZE + nop __LINE__ + (p20) ADD1 f14 = f116, f66, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p20) ADD2 f15 = f121, f66, f15 + } + ;; + { .mmf + (p16) LDFPD f52, f57 = [AO3], 2 * 
SIZE + nop __LINE__ + (p20) ADD3 f8 = f121, f41, f8 + } + { .mmf + (p16) adds I = 1, I + nop __LINE__ + (p20) ADD4 f9 = f116, f41, f9 + } + ;; + { .mmf + (p14) PREFETCH [RPRE3], 16 * SIZE + nop __LINE__ + (p20) ADD3 f10 = f121, f51, f10 + } + { .mmf + (p16) cmp.eq.unc p15, p0 = 8, I + nop __LINE__ + (p20) ADD4 f11 = f116, f51, f11 + } + ;; + { .mmf + (p16) LDFPD f62, f67 = [AO4], 2 * SIZE + nop __LINE__ + (p20) ADD3 f12 = f121, f61, f12 + } + { .mmf + (p15) mov I = 0 + nop __LINE__ + (p20) ADD4 f13 = f116, f61, f13 + } + ;; + { .mmf + (p15) PREFETCH [RPRE4], 16 * SIZE + nop __LINE__ + (p20) ADD3 f14 = f121, f71, f14 + } + { .mfb + (p16) cmp.eq.unc p12, p0 = 0, I + (p20) ADD4 f15 = f116, f71, f15 + br.ctop.sptk.few .L26 + } + ;; +.L28: + LDFD f32 = [CLD1], SIZE + LDFD f36 = [CLD2], SIZE + shladd CST2 = INCY, 1, CST1 + ;; + LDFD f33 = [CLD1], INCYM1 + LDFD f37 = [CLD2], INCYM1 + ;; + LDFD f34 = [CLD1], SIZE + LDFD f38 = [CLD2], SIZE + ;; + LDFD f35 = [CLD1], INCY3M1 + LDFD f39 = [CLD2], INCY3M1 + ;; + FMA f32 = ALPHA_R, f8, f32 + FMA f36 = ALPHA_R, f12, f36 + FMA f33 = ALPHA_I, f8, f33 + FMA f37 = ALPHA_I, f12, f37 + FMA f34 = ALPHA_R, f10, f34 + FMA f38 = ALPHA_R, f14, f38 + FMA f35 = ALPHA_I, f10, f35 + FMA f39 = ALPHA_I, f14, f39 + ;; + FNMA f32 = ALPHA_I, f9, f32 + FNMA f36 = ALPHA_I, f13, f36 + FMA f33 = ALPHA_R, f9, f33 + FMA f37 = ALPHA_R, f13, f37 + FNMA f34 = ALPHA_I, f11, f34 + FNMA f38 = ALPHA_I, f15, f38 + FMA f35 = ALPHA_R, f11, f35 + FMA f39 = ALPHA_R, f15, f39 + ;; + STFD [CST1] = f32, SIZE + STFD [CST2] = f36, SIZE + ;; + STFD [CST1] = f33 + STFD [CST2] = f37 + add CST1 = CST1, INCYM1 + add CST2 = CST2, INCYM1 + ;; + STFD [CST1] = f34, SIZE + STFD [CST2] = f38, SIZE + ;; + STFD [CST1] = f35 + STFD [CST2] = f39 + add CST1 = CST1, INCY3M1 + add CST2 = CST2, INCY3M1 + ;; + .align 16 + +.L30: + { .mfi + mov AO1 = A + mov f8 = f0 + mov pr.rot= 0 + } + { .mfi + add AO2 = LDA, A + mov f10 = f0 + tbit.z p6, p0 = N, 1 + } + ;; + { .mmf + adds RPRE1 = (RPREFETCH + 0) * SIZE, AO1 + adds RPRE2 = (RPREFETCH + 2) * SIZE, AO2 + mov f12 = f0 + } + { .mfb + adds I = -1, M + mov f14 = f0 + (p6) br.cond.dpnt .L40 + } + ;; + { .mfi + mov BO = BUFFER + mov f9 = f0 + mov ar.ec= 5 + } + { .mmf + cmp.eq p16, p0 = r0, r0 + shladd A = LDA, 1, A + mov f11 = f0 + } + ;; + { .mfi + adds WPRE = 16 * SIZE, CLD1 + mov f13 = f0 + mov ar.lc = I + } + { .mmf + adds PREB = RPREFETCH * SIZE, BO + nop __LINE__ + mov f15 = f0 + } + ;; + { .mmi + lfetch.excl.nt1 [WPRE] + cmp.eq p12, p0 = r0, r0 + mov I = 0 + } + ;; + .align 16 + +.L36: + { .mmf + (p12) PREFETCH [RPRE1], 16 * SIZE + (p16) LDFPD f32, f37 = [AO1], 2 * SIZE + (p20) ADD1 f8 = f116, f36, f8 + } + { .mmf + (p16) cmp.eq.unc p13, p0 = 4, I + (p16) adds I = 1, I + (p20) ADD2 f9 = f121, f36, f9 + } + ;; + { .mmf + (p12) PREFETCH [PREB], 16 * SIZE + (p16) LDFPD f112, f117 = [BO], 2 * SIZE + (p20) ADD1 f10 = f116, f46, f10 + } + { .mmf + (p16) cmp.eq.unc p12, p0 = 8, I + (p20) ADD2 f11 = f121, f46, f11 + } + ;; + { .mmf + (p13) PREFETCH [RPRE2], 16 * SIZE + (p16) LDFPD f42, f47 = [AO2], 2 * SIZE + (p20) ADD3 f12 = f121, f41, f12 + } + { .mmf + (p12) mov I = 0 + (p20) ADD4 f13 = f116, f41, f13 + } + ;; + { .mmf + (p20) ADD3 f14 = f121, f51, f14 + } + { .mfb + nop __LINE__ + (p20) ADD4 f15 = f116, f51, f15 + br.ctop.sptk.few .L36 + } + ;; + +.L38: + LDFD f32 = [CLD1], SIZE + FADD f8 = f8, f12 + shladd CST2 = INCY, 1, CST1 + ;; + LDFD f33 = [CLD1], INCYM1 + FADD f10 = f10, f14 + ;; + LDFD f34 = [CLD1], SIZE + FADD f9 = f9, f13 + ;; + LDFD f35 = [CLD1], INCYM1 + 
FADD f11 = f11, f15 + ;; + FMA f32 = ALPHA_R, f8, f32 + FMA f33 = ALPHA_I, f8, f33 + FMA f34 = ALPHA_R, f10, f34 + FMA f35 = ALPHA_I, f10, f35 + ;; + FNMA f32 = ALPHA_I, f9, f32 + FMA f33 = ALPHA_R, f9, f33 + FNMA f34 = ALPHA_I, f11, f34 + FMA f35 = ALPHA_R, f11, f35 + ;; + STFD [CST1] = f32, SIZE + ;; + STFD [CST1] = f33 + add CST1 = CST1, INCYM1 + ;; + STFD [CST1] = f34, SIZE + ;; + STFD [CST1] = f35 + add CST1 = CST1, INCYM1 + ;; + .align 16 + + +.L40: + { .mfi + mov AO1 = A + mov f8 = f0 + mov pr.rot= 0 + } + { .mfi + mov f9 = f0 + tbit.z p6, p0 = N, 0 + } + ;; + { .mfi + adds RPRE1 = (RPREFETCH + 0) * SIZE, AO1 + mov f10 = f0 + mov ar.ec= 5 + } + { .mfb + adds I = -1, M + mov f11 = f0 + (p6) br.cond.dpnt .L999 + } + ;; + { .mmi + cmp.eq p16, p0 = r0, r0 + add A = LDA, A + mov ar.lc = I + } + { .mmi + adds WPRE = 16 * SIZE, CLD1 + adds PREB = RPREFETCH * SIZE, BO + mov BO = BUFFER + } + ;; + { .mmi + lfetch.excl.nt1 [WPRE] + cmp.eq p12, p0 = r0, r0 + mov I = 0 + } + ;; + .align 16 + +.L46: + { .mmf + (p12) PREFETCH [RPRE1], 16 * SIZE + (p16) LDFPD f32, f37 = [AO1], 2 * SIZE + (p20) ADD1 f8 = f116, f36, f8 + } + { .mmf + (p16) cmp.eq.unc p12, p0 = 7, I + (p16) adds I = 1, I + (p20) ADD2 f9 = f121, f36, f9 + } + ;; + { .mmf + (p16) LDFPD f112, f117 = [BO], 2 * SIZE + (p20) ADD3 f10 = f121, f41, f10 + } + { .mfb + (p12) mov I = 0 + (p20) ADD4 f11 = f116, f41, f11 + br.ctop.sptk.few .L46 + } + ;; + +.L48: + LDFD f32 = [CLD1], SIZE + FADD f8 = f8, f10 + shladd CST2 = INCY, 1, CST1 + ;; + LDFD f33 = [CLD1], INCYM1 + FADD f9 = f9, f11 + ;; + FMA f32 = ALPHA_R, f8, f32 + FMA f33 = ALPHA_I, f8, f33 + ;; + FNMA f32 = ALPHA_I, f9, f32 + FMA f33 = ALPHA_R, f9, f33 + ;; + STFD [CST1] = f32, SIZE + ;; + STFD [CST1] = f33 + add CST1 = CST1, INCYM1 + br .L999 + .align 16 + ;; + +.L100: + { .mmi + mov CLD1 = Y + shladd CLD2 = INCY, 1, Y + shr J = N, 3 + } + ;; + { .mmb + mov CST1 = Y + cmp.eq p6, p0 = r0, J + (p6) br.cond.dpnt .L120 + } + ;; + .align 16 + +.L111: + { .mfi + mov AO1 = A + mov f8 = f0 + mov pr.rot= 0 + } + { .mfi + add AO2 = LDA, A + mov f10 = f0 + mov BO = BUFFER + } + ;; + { .mmf + shladd AO3 = LDA, 1, A + shladd AO4 = LDA, 1, AO2 + mov f12 = f0 + } + { .mmf + adds RPRE1 = (RPREFETCH + 0) * SIZE, AO1 + adds RPRE2 = (RPREFETCH + 2) * SIZE, AO2 + mov f14 = f0 + } + ;; + { .mmf + shladd AO5 = LDA, 1, AO3 + shladd AO6 = LDA, 1, AO4 + mov f16 = f0 + } + { .mmf + adds RPRE3 = (RPREFETCH + 4) * SIZE, AO3 + adds RPRE4 = (RPREFETCH + 6) * SIZE, AO4 + mov f18 = f0 + } + ;; + { .mmf + shladd AO7 = LDA, 1, AO5 + shladd AO8 = LDA, 1, AO6 + mov f20 = f0 + } + { .mmf + adds RPRE5 = (RPREFETCH + 8) * SIZE, AO5 + adds RPRE6 = (RPREFETCH + 10) * SIZE, AO6 + mov f22 = f0 + } + ;; + { .mfi + shladd A = LDA, 3, A + mov f9 = f0 + mov ar.ec= 5 + } + { .mmf + adds RPRE7 = (RPREFETCH + 12) * SIZE, AO7 + adds RPRE8 = (RPREFETCH + 14) * SIZE, AO8 + mov f11 = f0 + } + ;; + { .mmf + adds WPRE = 16 * SIZE, CLD1 + adds PREB = RPREFETCH * SIZE, BO + mov f13 = f0 + } + { .mmf + adds I = -1, M + cmp.eq p16, p0 = r0, r0 + mov f15 = f0 + } + ;; + { .mfi + cmp.eq p12, p0 = r0, r0 + mov f17 = f0 + mov ar.lc = I + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f19 = f0 + } + ;; + { .mmf + lfetch.excl.nt1 [WPRE] + nop __LINE__ + mov f21 = f0 + } + { .mmf + mov I = 0 + nop __LINE__ + mov f23 = f0 + } + ;; + .align 16 + +.L116: + { .mmf + (p12) PREFETCH [RPRE1], 16 * SIZE + (p16) LDFD f32 = [AO1], 1 * SIZE + (p20) ADD1 f8 = f116, f36, f8 + } + { .mmf + (p16) cmp.eq.unc p13, p0 = 1, I + (p16) cmp.eq.unc p14, p0 = 2, I + (p20) 
ADD2 f9 = f121, f36, f9 + } + ;; + { .mmf + (p13) PREFETCH [PREB], 16 * SIZE + (p16) LDFPD f112, f117 = [BO], 2 * SIZE + (p20) ADD1 f10 = f116, f46, f10 + } + { .mmf + (p16) LDFD f37 = [AO1], 1 * SIZE + (p16) cmp.eq.unc p15, p0 = 3, I + (p20) ADD2 f11 = f121, f46, f11 + } + ;; + { .mmf + (p13) PREFETCH [RPRE2], 16 * SIZE + (p16) LDFD f42 = [AO2], 1 * SIZE + (p20) ADD1 f12 = f116, f56, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p20) ADD2 f13 = f121, f56, f13 + } + ;; + { .mmf + (p16) LDFD f47 = [AO2], 1 * SIZE + nop __LINE__ + (p20) ADD1 f14 = f116, f66, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p20) ADD2 f15 = f121, f66, f15 + } + ;; + { .mmf + (p14) PREFETCH [RPRE3], 16 * SIZE + (p16) LDFD f52 = [AO3], 1 * SIZE + (p20) ADD3 f8 = f121, f41, f8 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p20) ADD4 f9 = f116, f41, f9 + } + ;; + { .mmf + (p16) LDFD f57 = [AO3], 1 * SIZE + nop __LINE__ + (p20) ADD3 f10 = f121, f51, f10 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p20) ADD4 f11 = f116, f51, f11 + } + ;; + { .mmf + (p15) PREFETCH [RPRE4], 16 * SIZE + (p16) LDFD f62 = [AO4], 1 * SIZE + (p20) ADD3 f12 = f121, f61, f12 + } + { .mmf + (p16) cmp.eq.unc p12, p0 = 4, I + (p16) cmp.eq.unc p13, p0 = 5, I + (p20) ADD4 f13 = f116, f61, f13 + } + ;; + { .mmf + (p16) LDFD f67 = [AO4], 1 * SIZE + nop __LINE__ + (p20) ADD3 f14 = f121, f71, f14 + } + { .mmf + (p16) cmp.eq.unc p14, p0 = 6, I + (p16) cmp.eq.unc p15, p0 = 7, I + (p20) ADD4 f15 = f116, f71, f15 + } + ;; + { .mmf + (p12) PREFETCH [RPRE5], 16 * SIZE + (p16) LDFD f72 = [AO5], 1 * SIZE + (p20) ADD1 f16 = f116, f76, f16 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p20) ADD2 f17 = f121, f76, f17 + } + ;; + { .mmf + (p16) LDFD f77 = [AO5], 1 * SIZE + nop __LINE__ + (p20) ADD1 f18 = f116, f86, f18 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p20) ADD2 f19 = f121, f86, f19 + } + ;; + { .mmf + (p13) PREFETCH [RPRE6], 16 * SIZE + (p16) LDFD f82 = [AO6], 1 * SIZE + (p20) ADD1 f20 = f116, f96, f20 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p20) ADD2 f21 = f121, f96, f21 + } + ;; + { .mmf + (p16) LDFD f87 = [AO6], 1 * SIZE + nop __LINE__ + (p20) ADD1 f22 = f116, f106, f22 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p20) ADD2 f23 = f121, f106, f23 + } + ;; + { .mmf + (p14) PREFETCH [RPRE7], 16 * SIZE + (p16) LDFD f92 = [AO7], 1 * SIZE + (p20) ADD3 f16 = f121, f81, f16 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p20) ADD4 f17 = f116, f81, f17 + } + ;; + { .mmf + (p16) LDFD f97 = [AO7], 1 * SIZE + nop __LINE__ + (p20) ADD3 f18 = f121, f91, f18 + } + { .mmf + nop __LINE__ + (p16) adds I = 1, I + (p20) ADD4 f19 = f116, f91, f19 + } + ;; + { .mmf + (p15) PREFETCH [RPRE8], 16 * SIZE + (p16) LDFD f102 = [AO8], 1 * SIZE + (p20) ADD3 f20 = f121, f101, f20 + } + { .mmf + (p15) mov I = 0 + nop __LINE__ + (p20) ADD4 f21 = f116, f101, f21 + } + ;; + { .mmf + (p16) LDFD f107 = [AO8], 1 * SIZE + nop __LINE__ + (p20) ADD3 f22 = f121, f111, f22 + } + { .mfb + (p16) cmp.eq.unc p12, p0 = 0, I + (p20) ADD4 f23 = f116, f111, f23 + br.ctop.sptk.few .L116 + } + ;; + +.L118: + LDFD f32 = [CLD1], SIZE + LDFD f36 = [CLD2], SIZE + shladd CST2 = INCY, 1, CST1 + ;; + LDFD f33 = [CLD1], INCYM1 + LDFD f37 = [CLD2], INCYM1 + ;; + LDFD f34 = [CLD1], SIZE + LDFD f38 = [CLD2], SIZE + ;; + LDFD f35 = [CLD1], INCY3M1 + LDFD f39 = [CLD2], INCY3M1 + ;; + LDFD f40 = [CLD1], SIZE + LDFD f44 = [CLD2], SIZE + ;; + LDFD f41 = [CLD1], INCYM1 + LDFD f45 = [CLD2], INCYM1 + ;; + LDFD f42 = [CLD1], SIZE + LDFD f46 = [CLD2], SIZE + ;; + LDFD f43 = [CLD1], INCY3M1 + LDFD f47 = [CLD2], 
INCY3M1 + ;; + FMA f32 = ALPHA_R, f8, f32 + FMA f36 = ALPHA_R, f12, f36 + FMA f33 = ALPHA_I, f8, f33 + FMA f37 = ALPHA_I, f12, f37 + FMA f34 = ALPHA_R, f10, f34 + FMA f38 = ALPHA_R, f14, f38 + FMA f35 = ALPHA_I, f10, f35 + FMA f39 = ALPHA_I, f14, f39 + ;; + FNMA f32 = ALPHA_I, f9, f32 + FNMA f36 = ALPHA_I, f13, f36 + FMA f33 = ALPHA_R, f9, f33 + FMA f37 = ALPHA_R, f13, f37 + FNMA f34 = ALPHA_I, f11, f34 + FNMA f38 = ALPHA_I, f15, f38 + FMA f35 = ALPHA_R, f11, f35 + FMA f39 = ALPHA_R, f15, f39 + ;; + FMA f40 = ALPHA_R, f16, f40 + FMA f44 = ALPHA_R, f20, f44 + FMA f41 = ALPHA_I, f16, f41 + FMA f45 = ALPHA_I, f20, f45 + FMA f42 = ALPHA_R, f18, f42 + FMA f46 = ALPHA_R, f22, f46 + FMA f43 = ALPHA_I, f18, f43 + FMA f47 = ALPHA_I, f22, f47 + ;; + { .mmf + STFD [CST1] = f32, SIZE + STFD [CST2] = f36, SIZE + FNMA f40 = ALPHA_I, f17, f40 + } + { .mmf + nop __LINE__ + nop __LINE__ + FNMA f44 = ALPHA_I, f21, f44 + } + ;; + { .mmf + STFD [CST1] = f33 + STFD [CST2] = f37 + FMA f41 = ALPHA_R, f17, f41 + } + { .mmf + add CST1 = CST1, INCYM1 + add CST2 = CST2, INCYM1 + FMA f45 = ALPHA_R, f21, f45 + } + ;; + { .mmf + STFD [CST1] = f34, SIZE + STFD [CST2] = f38, SIZE + FNMA f42 = ALPHA_I, f19, f42 + } + { .mmf + nop __LINE__ + nop __LINE__ + FNMA f46 = ALPHA_I, f23, f46 + } + ;; + { .mmf + STFD [CST1] = f35 + STFD [CST2] = f39 + FMA f43 = ALPHA_R, f19, f43 + } + { .mmf + add CST1 = CST1, INCY3M1 + add CST2 = CST2, INCY3M1 + FMA f47 = ALPHA_R, f23, f47 + } + ;; + { .mmi + STFD [CST1] = f40, SIZE + STFD [CST2] = f44, SIZE + adds J = -1, J + } + ;; + { .mmi + STFD [CST1] = f41 + STFD [CST2] = f45 + add CST1 = CST1, INCYM1 + } + { .mmi + nop __LINE__ + nop __LINE__ + add CST2 = CST2, INCYM1 + } + ;; + { .mmi + STFD [CST1] = f42, SIZE + STFD [CST2] = f46, SIZE + cmp.lt p6, p0 = 0, J + } + ;; + { .mmi + STFD [CST1] = f43 + STFD [CST2] = f47 + add CST1 = CST1, INCY3M1 + } + { .mmb + add CST2 = CST2, INCY3M1 + (p6) br.cond.dptk .L111 + } + ;; + .align 16 + +.L120: + { .mfi + mov AO1 = A + mov f8 = f0 + mov pr.rot= 0 + } + { .mfi + add AO2 = LDA, A + mov f10 = f0 + tbit.z p6, p0 = N, 2 + } + ;; + { .mmf + shladd AO3 = LDA, 1, A + shladd AO4 = LDA, 1, AO2 + mov f12 = f0 + } + { .mfb + mov BO = BUFFER + mov f14 = f0 + (p6) br.cond.dpnt .L130 + } + ;; + { .mfi + adds RPRE1 = (RPREFETCH + 0) * SIZE, AO1 + mov f9 = f0 + mov ar.ec= 5 + } + { .mmf + adds RPRE2 = (RPREFETCH + 2) * SIZE, AO2 + adds I = -1, M + mov f11 = f0 + } + ;; + { .mmf + adds RPRE3 = (RPREFETCH + 4) * SIZE, AO3 + adds RPRE4 = (RPREFETCH + 6) * SIZE, AO4 + mov f13 = f0 + } + { .mmf + cmp.eq p16, p0 = r0, r0 + shladd A = LDA, 2, A + mov f15 = f0 + } + ;; + { .mmi + lfetch.excl.nt1 [WPRE] + adds PREB = RPREFETCH * SIZE, BO + mov ar.lc = I + } + { .mmi + adds WPRE = 16 * SIZE, CLD1 + cmp.eq p12, p0 = r0, r0 + mov I = 0 + } + ;; + .align 16 + +.L126: + { .mmf + (p12) PREFETCH [RPRE1], 16 * SIZE + (p16) LDFD f32 = [AO1], 1 * SIZE + (p20) ADD1 f8 = f116, f36, f8 + } + { .mmf + (p16) cmp.eq.unc p13, p0 = 2, I + (p16) cmp.eq.unc p14, p0 = 4, I + (p20) ADD2 f9 = f121, f36, f9 + } + ;; + { .mmf + (p12) PREFETCH [PREB], 16 * SIZE + (p16) LDFPD f112, f117 = [BO], 2 * SIZE + (p20) ADD1 f10 = f116, f46, f10 + } + { .mmf + (p16) LDFD f37 = [AO1], 1 * SIZE + (p16) cmp.eq.unc p15, p0 = 6, I + (p20) ADD2 f11 = f121, f46, f11 + } + ;; + { .mmf + (p16) LDFD f42 = [AO2], 1 * SIZE + nop __LINE__ + (p20) ADD1 f12 = f116, f56, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p20) ADD2 f13 = f121, f56, f13 + } + ;; + { .mmf + (p13) PREFETCH [RPRE2], 16 * SIZE + (p16) LDFD f47 = 
[AO2], 1 * SIZE + (p20) ADD1 f14 = f116, f66, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p20) ADD2 f15 = f121, f66, f15 + } + ;; + { .mmf + (p16) LDFD f52 = [AO3], 1 * SIZE + nop __LINE__ + (p20) ADD3 f8 = f121, f41, f8 + } + { .mmf + nop __LINE__ + (p16) adds I = 1, I + (p20) ADD4 f9 = f116, f41, f9 + } + ;; + { .mmf + (p14) PREFETCH [RPRE3], 16 * SIZE + (p16) LDFD f57 = [AO3], 1 * SIZE + (p20) ADD3 f10 = f121, f51, f10 + } + { .mmf + nop __LINE__ + (p16) cmp.eq.unc p15, p0 = 8, I + (p20) ADD4 f11 = f116, f51, f11 + } + ;; + { .mmf + (p16) LDFD f62 = [AO4], 1 * SIZE + nop __LINE__ + (p20) ADD3 f12 = f121, f61, f12 + } + { .mmf + (p15) mov I = 0 + nop __LINE__ + (p20) ADD4 f13 = f116, f61, f13 + } + ;; + { .mmf + (p15) PREFETCH [RPRE4], 16 * SIZE + (p16) LDFD f67 = [AO4], 1 * SIZE + (p20) ADD3 f14 = f121, f71, f14 + } + { .mfb + (p16) cmp.eq.unc p12, p0 = 0, I + (p20) ADD4 f15 = f116, f71, f15 + br.ctop.sptk.few .L126 + } + ;; +.L128: + LDFD f32 = [CLD1], SIZE + LDFD f36 = [CLD2], SIZE + shladd CST2 = INCY, 1, CST1 + ;; + LDFD f33 = [CLD1], INCYM1 + LDFD f37 = [CLD2], INCYM1 + ;; + LDFD f34 = [CLD1], SIZE + LDFD f38 = [CLD2], SIZE + ;; + LDFD f35 = [CLD1], INCY3M1 + LDFD f39 = [CLD2], INCY3M1 + ;; + FMA f32 = ALPHA_R, f8, f32 + FMA f36 = ALPHA_R, f12, f36 + FMA f33 = ALPHA_I, f8, f33 + FMA f37 = ALPHA_I, f12, f37 + FMA f34 = ALPHA_R, f10, f34 + FMA f38 = ALPHA_R, f14, f38 + FMA f35 = ALPHA_I, f10, f35 + FMA f39 = ALPHA_I, f14, f39 + ;; + FNMA f32 = ALPHA_I, f9, f32 + FNMA f36 = ALPHA_I, f13, f36 + FMA f33 = ALPHA_R, f9, f33 + FMA f37 = ALPHA_R, f13, f37 + FNMA f34 = ALPHA_I, f11, f34 + FNMA f38 = ALPHA_I, f15, f38 + FMA f35 = ALPHA_R, f11, f35 + FMA f39 = ALPHA_R, f15, f39 + ;; + STFD [CST1] = f32, SIZE + STFD [CST2] = f36, SIZE + ;; + STFD [CST1] = f33 + STFD [CST2] = f37 + add CST1 = CST1, INCYM1 + add CST2 = CST2, INCYM1 + ;; + STFD [CST1] = f34, SIZE + STFD [CST2] = f38, SIZE + ;; + STFD [CST1] = f35 + STFD [CST2] = f39 + add CST1 = CST1, INCY3M1 + add CST2 = CST2, INCY3M1 + ;; + .align 16 + +.L130: + { .mfi + mov AO1 = A + mov f8 = f0 + mov pr.rot= 0 + } + { .mfi + add AO2 = LDA, A + mov f10 = f0 + tbit.z p6, p0 = N, 1 + } + ;; + { .mmf + adds RPRE1 = (RPREFETCH + 0) * SIZE, AO1 + adds RPRE2 = (RPREFETCH + 2) * SIZE, AO2 + mov f12 = f0 + } + { .mfb + adds I = -1, M + mov f14 = f0 + (p6) br.cond.dpnt .L140 + } + ;; + { .mfi + mov BO = BUFFER + mov f9 = f0 + mov ar.ec= 5 + } + { .mmf + cmp.eq p16, p0 = r0, r0 + shladd A = LDA, 1, A + mov f11 = f0 + } + ;; + { .mfi + adds WPRE = 16 * SIZE, CLD1 + mov f13 = f0 + mov ar.lc = I + } + { .mmf + adds PREB = RPREFETCH * SIZE, BO + nop __LINE__ + mov f15 = f0 + } + ;; + { .mmi + lfetch.excl.nt1 [WPRE] + cmp.eq p12, p0 = r0, r0 + mov I = 0 + } + ;; + .align 16 + +.L136: + { .mmf + (p12) PREFETCH [RPRE1], 16 * SIZE + (p16) LDFD f32 = [AO1], 1 * SIZE + (p20) ADD1 f8 = f116, f36, f8 + } + { .mmf + (p16) cmp.eq.unc p13, p0 = 4, I + (p16) adds I = 1, I + (p20) ADD2 f9 = f121, f36, f9 + } + ;; + { .mmf + (p12) PREFETCH [PREB], 16 * SIZE + (p16) LDFPD f112, f117 = [BO], 2 * SIZE + (p20) ADD1 f10 = f116, f46, f10 + } + { .mmf + (p16) LDFD f37 = [AO1], 1 * SIZE + (p16) cmp.eq.unc p12, p0 = 8, I + (p20) ADD2 f11 = f121, f46, f11 + } + ;; + { .mmf + (p13) PREFETCH [RPRE2], 16 * SIZE + (p16) LDFD f42 = [AO2], 1 * SIZE + (p20) ADD3 f12 = f121, f41, f12 + } + { .mmf + (p12) mov I = 0 + nop __LINE__ + (p20) ADD4 f13 = f116, f41, f13 + } + ;; + { .mmf + (p16) LDFD f47 = [AO2], 1 * SIZE + nop __LINE__ + (p20) ADD3 f14 = f121, f51, f14 + } + { .mfb + nop 
__LINE__ + (p20) ADD4 f15 = f116, f51, f15 + br.ctop.sptk.few .L136 + } + ;; + +.L138: + LDFD f32 = [CLD1], SIZE + FADD f8 = f8, f12 + shladd CST2 = INCY, 1, CST1 + ;; + LDFD f33 = [CLD1], INCYM1 + FADD f10 = f10, f14 + ;; + LDFD f34 = [CLD1], SIZE + FADD f9 = f9, f13 + ;; + LDFD f35 = [CLD1], INCYM1 + FADD f11 = f11, f15 + ;; + FMA f32 = ALPHA_R, f8, f32 + FMA f33 = ALPHA_I, f8, f33 + FMA f34 = ALPHA_R, f10, f34 + FMA f35 = ALPHA_I, f10, f35 + ;; + FNMA f32 = ALPHA_I, f9, f32 + FMA f33 = ALPHA_R, f9, f33 + FNMA f34 = ALPHA_I, f11, f34 + FMA f35 = ALPHA_R, f11, f35 + ;; + STFD [CST1] = f32, SIZE + ;; + STFD [CST1] = f33 + add CST1 = CST1, INCYM1 + ;; + STFD [CST1] = f34, SIZE + ;; + STFD [CST1] = f35 + add CST1 = CST1, INCYM1 + ;; + .align 16 + + +.L140: + { .mfi + mov AO1 = A + mov f8 = f0 + mov pr.rot= 0 + } + { .mfi + mov f9 = f0 + tbit.z p6, p0 = N, 0 + } + ;; + { .mfi + adds RPRE1 = (RPREFETCH + 0) * SIZE, AO1 + mov f10 = f0 + mov ar.ec= 5 + } + { .mfb + adds I = -1, M + mov f11 = f0 + (p6) br.cond.dpnt .L999 + } + ;; + { .mmi + cmp.eq p16, p0 = r0, r0 + shladd A = LDA, 1, A + mov ar.lc = I + } + { .mmi + adds WPRE = 16 * SIZE, CLD1 + adds PREB = RPREFETCH * SIZE, BO + mov BO = BUFFER + } + ;; + { .mmi + lfetch.excl.nt1 [WPRE] + cmp.eq p12, p0 = r0, r0 + mov I = 0 + } + ;; + .align 16 + +.L146: + { .mmf + (p12) PREFETCH [RPRE1], 16 * SIZE + (p16) LDFD f32 = [AO1], 1 * SIZE + (p20) ADD1 f8 = f116, f36, f8 + } + { .mmf + (p16) cmp.eq.unc p12, p0 = 7, I + (p16) adds I = 1, I + (p20) ADD2 f9 = f121, f36, f9 + } + ;; + { .mmf + (p16) LDFPD f112, f117 = [BO], 2 * SIZE + (p16) LDFD f37 = [AO1], 1 * SIZE + (p20) ADD3 f10 = f121, f41, f10 + } + { .mfb + (p12) mov I = 0 + (p20) ADD4 f11 = f116, f41, f11 + br.ctop.sptk.few .L146 + } + ;; + +.L148: + LDFD f32 = [CLD1], SIZE + FADD f8 = f8, f10 + shladd CST2 = INCY, 1, CST1 + ;; + LDFD f33 = [CLD1], INCYM1 + FADD f9 = f9, f11 + ;; + FMA f32 = ALPHA_R, f8, f32 + FMA f33 = ALPHA_I, f8, f33 + ;; + FNMA f32 = ALPHA_I, f9, f32 + FMA f33 = ALPHA_R, f9, f33 + ;; + STFD [CST1] = f32, SIZE + ;; + STFD [CST1] = f33 + add CST1 = CST1, INCYM1 + ;; + .align 16 + +.L999: + mov r8 = r0 + adds r9 = 1 * 16, SP + ;; + ldf.fill f16 = [SP], 32 + ldf.fill f17 = [r9], 32 + mov ar.lc = ARLC + ;; + ldf.fill f18 = [SP], 32 + ldf.fill f19 = [r9], 32 + mov pr = PR, -1 + ;; + ldf.fill f20 = [SP], 32 + ldf.fill f21 = [r9], 32 + mov ar.pfs = ARPFS + ;; + ldf.fill f22 = [SP], 32 + ldf.fill f23 = [r9] + br.ret.sptk.many b0 + ;; + EPILOGUE diff --git a/kernel/ia64/zrot.S b/kernel/ia64/zrot.S new file mode 100644 index 0000000000..f133a74895 --- /dev/null +++ b/kernel/ia64/zrot.S @@ -0,0 +1,879 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef XDOUBLE +#define PREFETCH_SIZE ( 8 * 8 + 4) +#elif defined(DOUBLE) +#define PREFETCH_SIZE (16 * 8 + 8) +#else +#define PREFETCH_SIZE (32 * 8 + 16) +#endif + +#define N r32 +#define X1 r33 +#define INCX r34 +#define Y1 r35 +#define INCY r36 + +#define PREX r2 +#define PREY r3 + +#define I r14 +#define J r15 +#define Y2 r16 +#define X2 r17 + +#define INCX16 r18 +#define INCY16 r19 + +#define PR r30 +#define ARLC r31 + +#define C f8 +#define S f9 + + PROLOGUE + .prologue + PROFCODE + { .mmi + adds r29 = 16, r12 + add INCX = INCX, INCX + .save ar.lc, ARLC + mov ARLC = ar.lc + } + { .mib + cmp.lt p0, p6 = r0, N + shr I = N, 3 + (p6) br.ret.spnt.many b0 + } + ;; + .body + { .mmi +#ifdef XDOUBLE + LDFD S = [r29] +#else + nop __LINE__ +#endif + add INCY = INCY, INCY + mov PR = pr + } + { .mmi + mov X2 = X1 + mov Y2 = Y1 + mov pr.rot= 0 + } + ;; + { .mmi + shladd INCX = INCX, BASE_SHIFT, r0 + shladd INCY = INCY, BASE_SHIFT, r0 + mov ar.ec= 3 + } + { .mmi + adds I = -1, I + cmp.eq p16, p0 = r0, r0 + and J = 7, N + } + ;; + { .mmi +#ifndef XDOUBLE + shladd INCX16 = INCX, 3, r0 + shladd INCY16 = INCY, 3, r0 +#else + shladd INCX16 = INCX, 2, r0 + shladd INCY16 = INCY, 2, r0 +#endif + nop __LINE__ + } + { .mmi + adds INCX = -SIZE, INCX + adds INCY = -SIZE, INCY + nop __LINE__ + } + ;; + { .mmi + adds PREX = PREFETCH_SIZE * SIZE, X1 + adds PREY = PREFETCH_SIZE * SIZE, Y1 + mov ar.lc = I + } + { .mib + cmp.eq p6 ,p0 = -1, I + tbit.z p0, p12 = N, 2 + (p6) br.cond.dpnt .L15 + } + ;; + .align 32 + +.L12: + { .mmf + (p19) STFD [Y2] = f15 + (p16) lfetch.excl.nt1 [PREX], INCX16 + (p18) FMPY f15 = C, f91 + } + { .mmf + (p16) LDFD f32 = [X1], SIZE + (p19) add Y2 = Y2, INCY + (p18) FNMA f11 = S, f37, f11 + } + ;; + { .mmf + (p18) STFD [X2] = f6 + (p16) lfetch.excl.nt1 [PREY], INCY16 + (p18) FMA f12 = C, f40, f12 + } + { .mmf + (p17) LDFD f114 = [Y1], INCY + (p18) adds X2 = SIZE, X2 + (p18) FMPY f6 = S, f94 + } + ;; + { .mmf + (p18) STFD [Y2] = f7 + (p16) LDFD f35 = [X1], INCX + (p18) FNMA f13 = S, f40, f13 + } + { .mmf + nop __LINE__ + (p18) adds Y2 = SIZE, Y2 + (p18) FMPY f7 = C, f94 + } + ;; + { .mmf + (p18) STFD [X2] = f10 + (p17) LDFD f117 = [Y1], SIZE + (p18) FMA f14 = C, f43, f14 + } + { .mmf + (p18) add X2 = X2, INCX + nop __LINE__ + (p18) FMPY f10 = S, f97 + } + ;; + { 
.mmf + (p18) STFD [Y2] = f11 + (p16) LDFD f38 = [X1], SIZE + (p18) FNMA f15 = S, f43, f15 + } + { .mmf + (p18) add Y2 = Y2, INCY + nop __LINE__ + (p18) FMPY f11 = C, f97 + } + ;; + { .mmf + (p18) STFD [X2] = f12 + (p17) LDFD f120 = [Y1], INCY + (p18) FMPY f12 = S, f100 + } + { .mmf + (p18) adds X2 = SIZE, X2 + nop __LINE__ + (p18) FMA f6 = C, f46, f6 + } + ;; + { .mmf + (p18) STFD [Y2] = f13 + (p16) LDFD f41 = [X1], INCX + (p18) FMPY f13 = C, f100 + } + { .mmf + (p18) adds Y2 = SIZE, Y2 + nop __LINE__ + (p18) FNMA f7 = S, f46, f7 + } + ;; + { .mmf + (p18) STFD [X2] = f14 + (p17) LDFD f123 = [Y1], SIZE + (p18) FMPY f14 = S, f103 + } + { .mmf + (p18) add X2 = X2, INCX + nop __LINE__ + (p18) FMA f10 = C, f49, f10 + } + ;; + { .mmf + (p18) STFD [Y2] = f15 + (p16) LDFD f44 = [X1], SIZE + (p18) FMPY f15 = C, f103 + } + { .mmf + (p18) add Y2 = Y2, INCY + nop __LINE__ + (p18) FNMA f11 = S, f49, f11 + } + ;; + { .mmf + (p18) STFD [X2] = f6 + (p17) LDFD f126 = [Y1], INCY + (p18) FMA f12 = C, f52, f12 + } + { .mmf + (p18) adds X2 = SIZE, X2 + nop __LINE__ + (p18) FMPY f6 = S, f106 + } + ;; + { .mmf + (p18) STFD [Y2] = f7 + (p16) LDFD f47 = [X1], INCX + (p18) FNMA f13 = S, f52, f13 + } + { .mmf + (p18) adds Y2 = SIZE, Y2 + nop __LINE__ + (p18) FMPY f7 = C, f106 + } + ;; + { .mmf + (p18) STFD [X2] = f10 + (p16) LDFD f80 = [Y1], SIZE + (p18) FMA f14 = C, f55, f14 + } + { .mmf + (p18) add X2 = X2, INCX + nop __LINE__ + (p18) FMPY f10 = S, f109 + } + ;; + { .mmf + (p18) STFD [Y2] = f11 + (p16) LDFD f50 = [X1], SIZE + (p18) FNMA f15 = S, f55, f15 + } + { .mmf + (p18) add Y2 = Y2, INCY + nop __LINE__ + (p18) FMPY f11 = C, f109 + } + ;; + { .mmf + (p18) STFD [X2] = f12 + (p16) LDFD f83 = [Y1], INCY + (p18) FMPY f12 = S, f112 + } + { .mmf + (p18) adds X2 = SIZE, X2 + nop __LINE__ + (p18) FMA f6 = C, f58, f6 + } + ;; + { .mmf + (p18) STFD [Y2] = f13 + (p16) LDFD f53 = [X1], INCX + (p18) FMPY f13 = C, f112 + } + { .mmf + (p18) adds Y2 = SIZE, Y2 + nop __LINE__ + (p18) FNMA f7 = S, f58, f7 + } + ;; + { .mmf + (p18) STFD [X2] = f14 + (p16) LDFD f86 = [Y1], SIZE + (p18) FMPY f14 = S, f115 + } + { .mmf + (p18) add X2 = X2, INCX + nop __LINE__ + (p18) FMA f10 = C, f61, f10 + } + ;; + { .mmf + (p18) STFD [Y2] = f15 + (p16) LDFD f56 = [X1], SIZE + (p18) FMPY f15 = C, f115 + } + { .mmf + (p18) add Y2 = Y2, INCY + nop __LINE__ + (p18) FNMA f11 = S, f61, f11 + } + ;; +#ifndef XDOUBLE + { .mmf + (p18) STFD [X2] = f6 + (p16) LDFD f89 = [Y1], INCY + (p18) FMA f12 = C, f64, f12 + } + { .mmf + (p18) adds X2 = SIZE, X2 + nop __LINE__ + (p18) FMPY f6 = S, f118 + } + ;; + { .mmf + (p18) STFD [Y2] = f7 + (p16) LDFD f59 = [X1], INCX + (p18) FNMA f13 = S, f64, f13 + } + { .mmf + (p18) adds Y2 = SIZE, Y2 + nop __LINE__ + (p18) FMPY f7 = C, f118 + } + ;; +#else + { .mmf + (p18) STFD [X2] = f6 + (p16) lfetch.excl.nt1 [PREY], INCY16 + (p18) FMA f12 = C, f64, f12 + } + { .mmf + (p16) LDFD f89 = [Y1], INCY + (p18) adds X2 = SIZE, X2 + (p18) FMPY f6 = S, f118 + } + ;; + { .mmf + (p18) STFD [Y2] = f7 + (p16) lfetch.excl.nt1 [PREX], INCX16 + (p18) FNMA f13 = S, f64, f13 + } + { .mmf + (p16) LDFD f59 = [X1], INCX + (p18) adds Y2 = SIZE, Y2 + (p18) FMPY f7 = C, f118 + } + ;; +#endif + { .mmf + (p18) STFD [X2] = f10 + (p16) LDFD f92 = [Y1], SIZE + (p18) FMA f14 = C, f67, f14 + } + { .mmf + (p18) add X2 = X2, INCX + nop __LINE__ + (p18) FMPY f10 = S, f121 + } + ;; + { .mmf + (p18) STFD [Y2] = f11 + (p16) LDFD f62 = [X1], SIZE + (p18) FNMA f15 = S, f67, f15 + } + { .mmf + (p18) add Y2 = Y2, INCY + nop __LINE__ + (p18) FMPY f11 = C, f121 + } + ;; 
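+/* (informal note) This software-pipelined loop applies the plane         */
+/* rotation to each real/imaginary component of X and Y, i.e.             */
+/* x' = C*x + S*y and y' = C*y - S*x, through the FMPY/FMA/FNMA pairs     */
+/* on C (f8) and S (f9), eight complex elements per iteration; the        */
+/* N mod 8 remainder is handled at .L15.                                  */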
+ { .mmf + (p18) STFD [X2] = f12 + (p16) LDFD f95 = [Y1], INCY + (p18) FMPY f12 = S, f124 + } + { .mmf + (p18) adds X2 = SIZE, X2 + nop __LINE__ + (p18) FMA f6 = C, f70, f6 + } + ;; + { .mmf + (p18) STFD [Y2] = f13 + (p16) LDFD f65 = [X1], INCX + (p18) FMPY f13 = C, f124 + } + { .mmf + (p18) adds Y2 = SIZE, Y2 + nop __LINE__ + (p18) FNMA f7 = S, f70, f7 + } + ;; + { .mmf + (p18) STFD [X2] = f14 + (p16) LDFD f98 = [Y1], SIZE + (p18) FMPY f14 = S, f127 + } + { .mmf + (p18) add X2 = X2, INCX + nop __LINE__ + (p18) FMA f10 = C, f73, f10 + } + ;; + { .mmf + (p18) STFD [Y2] = f15 + (p16) LDFD f68 = [X1], SIZE + (p18) FMPY f15 = C, f127 + } + { .mmf + (p18) add Y2 = Y2, INCY + nop __LINE__ + (p18) FNMA f11 = S, f73, f11 + } + ;; + { .mmf + (p18) STFD [X2] = f6 + (p16) LDFD f101 = [Y1], INCY + (p18) FMA f12 = C, f76, f12 + } + { .mmf + (p18) adds X2 = SIZE, X2 + nop __LINE__ + (p17) FMPY f6 = S, f81 + } + ;; + { .mmf + (p18) STFD [Y2] = f7 + (p16) LDFD f71 = [X1], INCX + (p18) FNMA f13 = S, f76, f13 + } + { .mmf + (p18) adds Y2 = SIZE, Y2 + nop __LINE__ + (p17) FMPY f7 = C, f81 + } + ;; + { .mmf + (p18) STFD [X2] = f10 + (p16) LDFD f104 = [Y1], SIZE + (p18) FMA f14 = C, f79, f14 + } + { .mmf + (p18) add X2 = X2, INCX + nop __LINE__ + (p17) FMPY f10 = S, f84 + } + ;; + { .mmf + (p18) STFD [Y2] = f11 + (p16) LDFD f74 = [X1], SIZE + (p18) FNMA f15 = S, f79, f15 + } + { .mmf + (p18) add Y2 = Y2, INCY + nop __LINE__ + (p17) FMPY f11 = C, f84 + } + ;; + { .mmf + (p18) STFD [X2] = f12 + (p16) LDFD f107 = [Y1], INCY + (p17) FMPY f12 = S, f87 + } + { .mmf + (p18) adds X2 = SIZE, X2 + nop __LINE__ + (p17) FMA f6 = C, f33, f6 + } + ;; + { .mmf + (p18) STFD [Y2] = f13 + (p16) LDFD f77 = [X1], INCX + (p17) FMPY f13 = C, f87 + } + { .mmf + (p18) adds Y2 = SIZE, Y2 + nop __LINE__ + (p17) FNMA f7 = S, f33, f7 + } + ;; + { .mmf + (p18) STFD [X2] = f14 + (p16) LDFD f110 = [Y1], SIZE + (p17) FMPY f14 = S, f90 + } + { .mfb + (p18) add X2 = X2, INCX + (p17) FMA f10 = C, f36, f10 + br.ctop.sptk.few .L12 + } + ;; + { .mmi + (p19) STFD [Y2] = f15 + (p19) add Y2 = Y2, INCY + nop __LINE__ + } + { .mmi + nop __LINE__ + nop __LINE__ + nop __LINE__ + } + ;; + .align 32 + +.L15: + { .mmi + (p12) LDFD f40 = [Y1], SIZE + (p12) LDFD f32 = [X1], SIZE + mov ar.lc = ARLC + } + ;; + { .mmi + (p12) LDFD f41 = [Y1], INCY + (p12) LDFD f33 = [X1], INCX + mov pr = PR, -65474 + } + ;; + { .mmb + (p12) LDFD f42 = [Y1], SIZE + cmp.eq p7, p0 = r0, J + (p7) br.ret.sptk.many b0 + } + ;; + { .mmf + (p12) LDFD f43 = [Y1], INCY + nop __LINE__ + (p12) FMPY f6 = S, f40 + } + ;; + { .mmf + (p12) LDFD f34 = [X1], SIZE + nop __LINE__ + (p12) FMPY f7 = C, f40 + } + ;; + { .mmf + (p12) LDFD f44 = [Y1], SIZE + nop __LINE__ + (p12) FMPY f10 = S, f41 + } + ;; + { .mmf + (p12) LDFD f35 = [X1], INCX + nop __LINE__ + (p12) FMPY f11 = C, f41 + } + ;; + { .mmf + (p12) LDFD f45 = [Y1], INCY + nop __LINE__ + (p12) FMPY f12 = S, f42 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FMA f6 = C, f32, f6 + } + ;; + { .mmf + (p12) LDFD f36 = [X1], SIZE + nop __LINE__ + (p12) FMPY f13 = C, f42 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FNMA f7 = S, f32, f7 + } + ;; + { .mmf + (p12) LDFD f46 = [Y1], SIZE + nop __LINE__ + (p12) FMPY f14 = S, f43 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FMA f10 = C, f33, f10 + } + ;; + { .mmf + (p12) LDFD f37 = [X1], INCX + nop __LINE__ + (p12) FMPY f15 = C, f43 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FNMA f11 = S, f33, f11 + } + ;; + { .mmf + (p12) STFD [X2] = f6, SIZE + (p12) LDFD f47 = [Y1], INCY + (p12) 
FMA f12 = C, f34, f12 + } + { .mfi + nop __LINE__ + (p12) FMPY f6 = S, f44 + tbit.z p0, p13 = N, 1 + } + ;; + { .mmf + (p12) STFD [Y2] = f7, SIZE + (p12) LDFD f38 = [X1], SIZE + (p12) FNMA f13 = S, f34, f13 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FMPY f7 = C, f44 + } + ;; + { .mmf + (p12) STFD [X2] = f10 + (p13) LDFD f52 = [Y1], SIZE + (p12) FMA f14 = C, f35, f14 + } + { .mmf + (p12) add X2 = X2, INCX + nop __LINE__ + (p12) FMPY f10 = S, f45 + } + ;; + { .mmf + (p12) STFD [Y2] = f11 + (p12) LDFD f39 = [X1], INCX + (p12) FNMA f15 = S, f35, f15 + } + { .mmf + (p12) add Y2 = Y2, INCY + nop __LINE__ + (p12) FMPY f11 = C, f45 + } + ;; + { .mmf + (p12) STFD [X2] = f12, SIZE + (p13) LDFD f53 = [Y1], INCY + (p12) FMPY f12 = S, f46 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FMA f6 = C, f36, f6 + } + ;; + { .mmf + (p12) STFD [Y2] = f13, SIZE + (p13) LDFD f48 = [X1], SIZE + (p12) FMPY f13 = C, f46 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p12) FNMA f7 = S, f36, f7 + } + ;; + { .mmf + (p12) STFD [X2] = f14 + (p13) LDFD f54 = [Y1], SIZE + (p12) FMPY f14 = S, f47 + } + { .mmf + (p12) add X2 = X2, INCX + nop __LINE__ + (p12) FMA f10 = C, f37, f10 + } + ;; + { .mmf + (p12) STFD [Y2] = f15 + (p13) LDFD f49 = [X1], INCX + (p12) FMPY f15 = C, f47 + } + { .mfi + (p12) add Y2 = Y2, INCY + (p12) FNMA f11 = S, f37, f11 + tbit.z p0, p14 = N, 0 + } + ;; + { .mmf + (p12) STFD [X2] = f6, SIZE + (p13) LDFD f55 = [Y1], INCY + (p12) FMA f12 = C, f38, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMPY f6 = S, f52 + } + ;; + { .mmf + (p12) STFD [Y2] = f7, SIZE + (p13) LDFD f50 = [X1], SIZE + (p12) FNMA f13 = S, f38, f13 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMPY f7 = C, f52 + } + ;; + { .mmf + (p12) STFD [X2] = f10 + (p14) LDFD f58 = [Y1], SIZE + (p12) FMA f14 = C, f39, f14 + } + { .mmf + (p12) add X2 = X2, INCX + nop __LINE__ + (p13) FMPY f10 = S, f53 + } + ;; + { .mmf + (p12) STFD [Y2] = f11 + (p13) LDFD f51 = [X1], INCX + (p12) FNMA f15 = S, f39, f15 + } + { .mmf + (p12) add Y2 = Y2, INCY + nop __LINE__ + (p13) FMPY f11 = C, f53 + } + ;; + { .mmf + (p12) STFD [X2] = f12, SIZE + (p14) LDFD f59 = [Y1], INCY + (p13) FMPY f12 = S, f54 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMA f6 = C, f48, f6 + } + ;; + { .mmf + (p12) STFD [Y2] = f13, SIZE + (p14) LDFD f56 = [X1], SIZE + (p13) FMPY f13 = C, f54 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FNMA f7 = S, f48, f7 + } + ;; + { .mmf + (p12) STFD [X2] = f14 + (p12) add X2 = X2, INCX + (p13) FMPY f14 = S, f55 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p13) FMA f10 = C, f49, f10 + } + ;; + { .mmf + (p12) STFD [Y2] = f15 + (p14) LDFD f57 = [X1], INCX + (p13) FMPY f15 = C, f55 + } + { .mmf + (p12) add Y2 = Y2, INCY + nop __LINE__ + (p13) FNMA f11 = S, f49, f11 + } + ;; + { .mmf + (p13) STFD [X2] = f6, SIZE + nop __LINE__ + (p13) FMA f12 = C, f50, f12 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p14) FMPY f6 = S, f58 + } + ;; + { .mmf + (p13) STFD [Y2] = f7, SIZE + nop __LINE__ + (p13) FNMA f13 = S, f50, f13 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p14) FMPY f7 = C, f58 + } + ;; + { .mmf + (p13) STFD [X2] = f10 + (p13) add X2 = X2, INCX + (p13) FMA f14 = C, f51, f14 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p14) FMPY f10 = S, f59 + } + ;; + { .mmf + (p13) STFD [Y2] = f11 + (p13) add Y2 = Y2, INCY + (p13) FNMA f15 = S, f51, f15 + } + { .mmf + nop __LINE__ + nop __LINE__ + (p14) FMPY f11 = C, f59 + } + ;; + { .mmf + (p13) STFD [X2] = f12, SIZE + nop __LINE__ + (p14) FMA f6 = C, f56, f6 + } + ;; + { .mmf + (p13) 
STFD [Y2] = f13, SIZE + nop __LINE__ + (p14) FNMA f7 = S, f56, f7 + } + ;; + { .mmf + (p13) STFD [X2] = f14 + (p13) add X2 = X2, INCX + (p14) FMA f10 = C, f57, f10 + } + ;; + { .mmf + (p13) STFD [Y2] = f15 + (p13) add Y2 = Y2, INCY + (p14) FNMA f11 = S, f57, f11 + } + ;; + { .mmi + (p14) STFD [X2] = f6, SIZE + (p14) STFD [Y2] = f7, SIZE + nop __LINE__ + } + ;; + { .mmb + (p14) STFD [X2] = f10 + (p14) STFD [Y2] = f11 + br.ret.sptk.many b0 + } + ;; + EPILOGUE + diff --git a/kernel/ia64/zscal.S b/kernel/ia64/zscal.S new file mode 100644 index 0000000000..e97fedaee3 --- /dev/null +++ b/kernel/ia64/zscal.S @@ -0,0 +1,540 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef XDOUBLE +#define PREFETCH_SIZE ( 8 * 16) +#elif defined(DOUBLE) +#define PREFETCH_SIZE (16 * 16) +#else +#define PREFETCH_SIZE (32 * 16) +#endif + +#define SP r12 + +#ifdef XDOUBLE +#define N r32 +#define X1 r14 +#define INCX r15 +#else +#define N r32 +#define X1 r37 +#define INCX r38 +#endif + +#define X2 r16 +#define Y1 r17 +#define INCX3 r18 +#define PRE r19 +#define INCX8 r20 +#define I r29 +#define J r28 + +#define PR r30 +#define ARLC r31 + +#define ALPHA_R f8 +#define ALPHA_I f9 + + PROLOGUE + .prologue + PROFCODE + {.mmi + adds r22 = 16, SP + adds r23 = 24, SP + mov PR = pr + } + { .mib + cmp.ge p7, p0 = 0, N + shr I = N, 3 + (p7) br.ret.sptk.many b0 + } + ;; +#ifdef XDOUBLE + { .mmi + ld8 X1 = [r22] + ld8 INCX = [r23] + nop __LINE__ + } + ;; +#endif + { .mfi + and J = 7, N + fcmp.eq p0, p11 = ALPHA_I, f0 + .save ar.lc, ARLC + mov ARLC = ar.lc + } + { .mfi + adds I = -1, I + fcmp.eq p0, p10 = ALPHA_R, f0 + shl INCX = INCX, ZBASE_SHIFT + } + ;; + .body + { .mmi + shladd INCX8 = INCX, 3, r0 + shladd X2 = INCX, 1, X1 + mov pr.rot= 0 + } + { .mmi + shladd INCX3 = INCX, 1, INCX + adds PRE = PREFETCH_SIZE * SIZE, X1 + mov Y1 = X1 + } + ;; + { .mmi + cmp.gt p8, p0 = 0, I + cmp.ge p9, p0 = 0, J + mov ar.lc = I + } + { .mmi + adds INCX = -1 * SIZE, INCX + adds INCX3 = -1 * SIZE, INCX3 + tbit.z p0, p13 = N, 2 + } + ;; + { .bbb + (p10) br.cond.dptk .L100 + (p11) br.cond.dptk .L100 + (p8) br.cond.dpnt .L20 + } + ;; + .align 32 + +.L10: + { .mmb + STFD [X1] = f0, 1 * SIZE + STFD [X2] = f0, 1 * SIZE + nop.b 0 + } + { .mmb + lfetch.excl.nt1 [PRE], INCX8 + nop.m 0 + } + ;; + { .mmb + STFD [X1] = f0 + add X1 = INCX, X1 + } + { .mmb + STFD [X2] = f0 + add X2 = INCX, X2 + } + ;; + { .mmb + STFD [X1] = f0, 1 * SIZE + STFD [X2] = f0, 1 * SIZE + nop.b 0 + } + ;; + { .mmb + STFD [X1] = f0 + add X1 = INCX3, X1 + } + { .mmb + STFD [X2] = f0 + add X2 = INCX3, X2 + } + ;; + { .mmb + STFD [X1] = f0, 1 * SIZE + STFD [X2] = f0, 1 * SIZE + nop.b 0 + } + ;; + { .mmb + STFD [X1] = f0 + add X1 = INCX, X1 + } + { .mmb + STFD [X2] = f0 + add X2 = INCX, X2 + } + ;; + { .mmb + STFD [X1] = f0, 1 * SIZE + STFD [X2] = f0, 1 * SIZE + nop.b 0 + } + ;; + { .mmb + STFD [X1] = f0 + add X1 = INCX3, X1 + } + { .mmb + STFD [X2] = f0 + add X2 = INCX3, X2 + br.cloop.sptk.few .L10 + } + ;; + .align 32 + +.L20: + { .mmi + (p13) STFD [X1] = f0, 1 * SIZE + (p13) STFD [X2] = f0, 1 * SIZE + mov ar.lc = ARLC + } + ;; + { .mmi + (p13) STFD [X1] = f0 + (p13) add X1 = INCX, X1 + tbit.z p0, p14 = N, 1 + } + { .mmi + (p13) STFD [X2] = f0 + (p13) add X2 = INCX, X2 + tbit.z p0, p15 = N, 0 + } + ;; + { .mmb + (p13) STFD [X1] = f0, 1 * SIZE + (p13) STFD [X2] = f0, 1 * SIZE + nop.b 0 + } + { .mib + nop.m 0 + mov pr = PR, -65474 + (p9) br.ret.sptk.many b0 + } + ;; + { .mmb + (p13) STFD [X1] = f0 + (p13) add X1 = INCX3, X1 + } + { .mmb + (p13) STFD [X2] = f0 + (p13) add X2 = INCX3, X2 + } + ;; + (p14) STFD [X1] = f0, 1 * SIZE + ;; + { .mmb + (p14) STFD [X1] = f0 + (p14) add X1 = INCX, X1 + } + ;; + (p14) STFD [X1] = f0, 1 * SIZE + ;; + { .mmb + (p14) STFD [X1] = f0 + (p14) add X1 = INCX, X1 + } + ;; + (p15) STFD [X1] = f0, 1 * SIZE + ;; + { .mib + (p15) STFD [X1] = f0 + mov pr = PR, -65474 + br.ret.sptk.many b0 + } + ;; + .align 32 + +.L100: + cmp.eq p16, p0 = r0, r0 + mov.i ar.ec = 6 + (p8) br.cond.dpnt .L170 + ;; + .align 32 + +.L160: + { .mmf + (p21) STFD [X1] = f6, 1 * SIZE + (p16) lfetch.excl.nt1 [PRE], INCX8 + (p21) 
FMS f12 = ALPHA_R, f85, f12 + } + { .mfb + (p16) LDFD f32 = [Y1], 1 * SIZE + (p20) FMPY f6 = ALPHA_I, f42 + } + ;; + { .mmf + (p21) STFD [X1] = f43 + (p21) add X1 = INCX, X1 + (p21) FMA f91 = ALPHA_I, f85, f91 + } + { .mfb + (p16) LDFD f38 = [Y1], INCX + (p20) FMPY f42 = ALPHA_R, f42 + } + ;; + { .mmf + (p21) STFD [X1] = f7, 1 * SIZE + (p21) FMS f13 = ALPHA_R, f97, f13 + } + { .mfb + (p16) LDFD f44 = [Y1], 1 * SIZE + (p20) FMPY f7 = ALPHA_I, f54 + } + ;; + { .mmf + (p21) STFD [X1] = f55 + (p21) add X1 = INCX, X1 + (p21) FMA f103 = ALPHA_I, f97, f103 + } + { .mfb + (p16) LDFD f50 = [Y1], INCX + (p20) FMPY f54 = ALPHA_R, f54 + } + ;; + { .mmf + (p21) STFD [X1] = f10, 1 * SIZE + (p21) FMS f14 = ALPHA_R, f109, f14 + } + { .mfb + (p16) LDFD f56 = [Y1], 1 * SIZE + (p20) FMPY f10 = ALPHA_I, f66 + } + ;; + { .mmf + (p21) STFD [X1] = f67 + (p21) add X1 = INCX, X1 + (p21) FMA f115 = ALPHA_I, f109, f115 + } + { .mfb + (p16) LDFD f62 = [Y1], INCX + (p20) FMPY f66 = ALPHA_R, f66 + } + ;; + { .mmf + (p21) STFD [X1] = f11, 1 * SIZE + (p21) FMS f15 = ALPHA_R, f121, f15 + } + { .mfb + (p16) LDFD f68 = [Y1], 1 * SIZE + (p20) FMPY f11 = ALPHA_I, f78 + } + ;; + { .mmf + (p21) STFD [X1] = f79 + (p21) add X1 = INCX, X1 + (p21) FMA f127 = ALPHA_I, f121, f127 + } + { .mfb + (p16) LDFD f74 = [Y1], INCX + (p20) FMPY f78 = ALPHA_R, f78 + } + ;; + { .mmf + (p21) STFD [X1] = f12, 1 * SIZE + (p20) FMS f6 = ALPHA_R, f36, f6 + } + { .mfb + (p16) LDFD f80 = [Y1], 1 * SIZE + (p20) FMPY f12 = ALPHA_I, f90 + } + ;; + { .mmf + (p21) STFD [X1] = f91 + (p21) add X1 = INCX, X1 + (p20) FMA f42 = ALPHA_I, f36, f42 + } + { .mfb + (p16) LDFD f86 = [Y1], INCX + (p20) FMPY f90 = ALPHA_R, f90 + } + ;; + { .mmf + (p21) STFD [X1] = f13, 1 * SIZE + (p20) FMS f7 = ALPHA_R, f48, f7 + } + { .mfb + (p16) LDFD f92 = [Y1], 1 * SIZE + (p20) FMPY f13 = ALPHA_I, f102 + } + ;; + { .mmf + (p21) STFD [X1] = f103 + (p21) add X1 = INCX, X1 + (p20) FMA f54 = ALPHA_I, f48, f54 + } + { .mfb + (p16) LDFD f98 = [Y1], INCX + (p20) FMPY f102 = ALPHA_R, f102 + } + ;; + { .mmf + (p21) STFD [X1] = f14, 1 * SIZE + (p20) FMS f10 = ALPHA_R, f60, f10 + } + { .mfb + (p16) LDFD f104 = [Y1], 1 * SIZE + (p20) FMPY f14 = ALPHA_I, f114 + } + ;; + { .mmf + (p21) STFD [X1] = f115 + (p21) add X1 = INCX, X1 + (p20) FMA f66 = ALPHA_I, f60, f66 + } + { .mfb + (p16) LDFD f110 = [Y1], INCX + (p20) FMPY f114 = ALPHA_R, f114 + } + ;; + { .mmf + (p21) STFD [X1] = f15, 1 * SIZE + (p20) FMS f11 = ALPHA_R, f72, f11 + } + { .mfb + (p16) LDFD f116 = [Y1], 1 * SIZE + (p20) FMPY f15 = ALPHA_I, f126 + } + ;; + { .mmf + (p21) STFD [X1] = f127 + (p21) add X1 = INCX, X1 + (p20) FMA f78 = ALPHA_I, f72, f78 + } + { .mfb + (p16) LDFD f122 = [Y1], INCX + (p20) FMPY f126 = ALPHA_R, f126 + br.ctop.sptk.few .L160 + } + ;; + .align 16 + +.L170: + { .mmi + (p13) LDFD f48 = [Y1], 1 * SIZE + mov ar.lc = ARLC + } + ;; + { .mib + (p13) LDFD f49 = [Y1], INCX + mov pr = PR, -65474 + (p9) br.ret.sptk.many b0 + } + ;; + (p13) LDFD f50 = [Y1], 1 * SIZE + tbit.z p0, p14 = N, 1 + ;; + (p13) LDFD f51 = [Y1], INCX + tbit.z p0, p15 = N, 0 + ;; + (p13) LDFD f52 = [Y1], 1 * SIZE + ;; + (p13) LDFD f53 = [Y1], INCX + ;; + (p13) LDFD f54 = [Y1], 1 * SIZE + (p13) FMPY f112 = ALPHA_I, f48 + ;; + (p13) LDFD f55 = [Y1], INCX + (p13) FMPY f111 = ALPHA_I, f49 + ;; + (p14) LDFD f56 = [Y1], 1 * SIZE + (p13) FMPY f114 = ALPHA_I, f50 + ;; + (p14) LDFD f57 = [Y1], INCX + (p13) FMPY f113 = ALPHA_I, f51 + ;; + (p14) LDFD f58 = [Y1], 1 * SIZE + (p13) FMPY f116 = ALPHA_I, f52 + ;; + (p14) LDFD f59 = [Y1], INCX + (p13) FMPY f115 = 
ALPHA_I, f53 + ;; + (p15) LDFD f60 = [Y1], 1 * SIZE + (p13) FMPY f118 = ALPHA_I, f54 + ;; + (p15) LDFD f61 = [Y1], INCX + (p13) FMPY f117 = ALPHA_I, f55 + ;; + (p14) FMPY f120 = ALPHA_I, f56 + (p14) FMPY f119 = ALPHA_I, f57 + (p14) FMPY f122 = ALPHA_I, f58 + (p14) FMPY f121 = ALPHA_I, f59 + (p15) FMPY f124 = ALPHA_I, f60 + (p15) FMPY f123 = ALPHA_I, f61 + ;; + (p13) FMS f48 = ALPHA_R, f48, f111 + (p13) FMA f49 = ALPHA_R, f49, f112 + (p13) FMS f50 = ALPHA_R, f50, f113 + (p13) FMA f51 = ALPHA_R, f51, f114 + + ;; + (p13) STFD [X1] = f48, 1 * SIZE + (p13) FMS f52 = ALPHA_R, f52, f115 + ;; + (p13) STFD [X1] = f49 + (p13) add X1 = INCX, X1 + (p13) FMA f53 = ALPHA_R, f53, f116 + ;; + (p13) STFD [X1] = f50, 1 * SIZE + (p13) FMS f54 = ALPHA_R, f54, f117 + ;; + (p13) STFD [X1] = f51 + (p13) add X1 = INCX, X1 + (p13) FMA f55 = ALPHA_R, f55, f118 + ;; + (p13) STFD [X1] = f52, 1 * SIZE + (p14) FMS f56 = ALPHA_R, f56, f119 + ;; + (p13) STFD [X1] = f53 + (p13) add X1 = INCX, X1 + (p14) FMA f57 = ALPHA_R, f57, f120 + ;; + (p13) STFD [X1] = f54, 1 * SIZE + (p14) FMS f58 = ALPHA_R, f58, f121 + ;; + (p13) STFD [X1] = f55 + (p13) add X1 = INCX, X1 + (p14) FMA f59 = ALPHA_R, f59, f122 + ;; + (p14) STFD [X1] = f56, 1 * SIZE + (p15) FMS f60 = ALPHA_R, f60, f123 + ;; + (p14) STFD [X1] = f57 + (p14) add X1 = INCX, X1 + (p15) FMA f61 = ALPHA_R, f61, f124 + ;; + (p14) STFD [X1] = f58, 1 * SIZE + ;; + (p14) STFD [X1] = f59 + (p14) add X1 = INCX, X1 + ;; + (p15) STFD [X1] = f60, 1 * SIZE + ;; + (p15) STFD [X1] = f61 + mov pr = PR, -65474 + br.ret.sptk.many b0 + + EPILOGUE diff --git a/kernel/ia64/zswap.S b/kernel/ia64/zswap.S new file mode 100644 index 0000000000..8251b14a87 --- /dev/null +++ b/kernel/ia64/zswap.S @@ -0,0 +1,476 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef XDOUBLE +#define PREFETCH_SIZE ( 8 * 16) +#elif defined(DOUBLE) +#define PREFETCH_SIZE (16 * 16) +#else +#define PREFETCH_SIZE (32 * 16) +#endif + +#define SP r12 + +#ifdef XDOUBLE +#define N r32 +#define X r14 +#define INCX r15 +#define Y r16 +#define INCY r17 +#else +#define N r32 +#define X r37 +#define INCX r38 +#define Y r39 +#define INCY r36 +#endif + +#define PRE1 r2 +#define PRE2 r3 + +#define I r18 +#define J r19 +#define YY r20 +#define XX r21 +#define INCXM1 r22 +#define INCYM1 r23 +#define INCX8 r24 +#define INCY8 r25 + +#define PR r30 +#define ARLC r31 + + + PROLOGUE + .prologue + PROFCODE + + { .mmi + adds r14 = 16, SP + adds r15 = 24, SP + adds r16 = 32, SP + } + { .mmb + adds r17 = 40, SP + cmp.gt p15, p0 = r0, N + (p15) br.ret.sptk.many b0 + } + ;; +#ifdef XDOUBLE + { .mmi + ld8 X = [r14] + ld8 INCX = [r15] + nop __LINE__ + } + { .mmi + ld8 Y = [r16] + ld8 INCY = [r17] + nop __LINE__ + } + ;; +#else + { .mmi + ld8 INCY = [r14] + nop __LINE__ + nop __LINE__ + } + ;; +#endif + { .mii + .save ar.lc, ARLC + mov ARLC = ar.lc + shl INCX = INCX, ZBASE_SHIFT + } + ;; + .body + { .mii + and J = 7, N + mov PR = pr + shl INCY = INCY, ZBASE_SHIFT + } + ;; + { .mmi + mov XX = X + mov YY = Y + shr I = N, 3 + } + ;; + { .mmi + adds I = -1, I + cmp.eq p9, p0 = r0, J + mov pr.rot = 0 + } + ;; + { .mmi + shladd INCX8 = INCX, 3, r0 + shladd INCY8 = INCY, 3, r0 + mov ar.ec= 3 + } + { .mmi + adds INCXM1 = -SIZE, INCX + adds INCYM1 = -SIZE, INCY + cmp.eq p16, p0 = r0, r0 + } + ;; + { .mmi + adds PRE1 = PREFETCH_SIZE * SIZE, X + adds PRE2 = PREFETCH_SIZE * SIZE, Y + mov ar.lc = I + } + { .mib + cmp.eq p8 ,p0 = -1, I + tbit.z p0, p12 = J, 2 + (p8) br.cond.dpnt .L55 + } + ;; + .align 32 + +.L52: + { .mmi + (p18) STFD [XX] = f37, 1 * SIZE + (p18) STFD [YY] = f34, 1 * SIZE + } + { .mmi + (p16) LDFD f32 = [X], 1 * SIZE + (p16) LDFD f35 = [Y], 1 * SIZE + } + ;; + { .mmi + (p18) STFD [XX] = f43 + (p18) STFD [YY] = f40 + (p18) add XX = XX, INCXM1 + } + { .mmi + (p16) LDFD f38 = [X], INCXM1 + (p16) LDFD f41 = [Y], INCYM1 + (p18) add YY = YY, INCYM1 + } + ;; + { .mmi + (p18) STFD [XX] = f49, 1 * SIZE + (p18) STFD [YY] = f46, 1 * SIZE + } + { .mmi + (p16) LDFD f44 = [X], 1 * SIZE + (p16) LDFD f47 = [Y], 1 * SIZE + } + ;; + { .mmi + (p18) STFD [XX] = f55 + (p18) STFD [YY] = f52 + (p18) add XX = XX, INCXM1 + } + { .mmi + (p16) LDFD f50 = [X], INCXM1 + (p16) LDFD f53 = [Y], INCYM1 + (p18) add YY = YY, INCYM1 + } + ;; + { .mmi + (p18) STFD [XX] = f61, 1 * SIZE + (p18) STFD [YY] = f58, 1 * SIZE + } + { .mmi + (p16) LDFD f56 = [X], 1 * SIZE + (p16) LDFD f59 = [Y], 1 * SIZE + } + ;; + { .mmi + (p18) STFD [XX] = f67 + (p18) STFD [YY] = f64 + (p18) add XX = XX, INCXM1 + } + { .mmi + (p16) LDFD f62 = [X], INCXM1 + (p16) LDFD f65 = [Y], INCYM1 + (p18) add YY = YY, INCYM1 + } + ;; + { .mmi + (p18) STFD [XX] = f73, 1 * SIZE + (p18) STFD [YY] = f70, 1 * SIZE + } + { .mmi + (p16) LDFD f68 = [X], 1 * SIZE + (p16) LDFD f71 = [Y], 1 * SIZE + } + ;; + { .mmi + (p18) STFD [XX] = f79 + (p18) STFD [YY] = f76 + (p18) add XX = XX, INCXM1 + } + { .mmi + (p16) LDFD f74 = [X], INCXM1 + (p16) LDFD f77 = [Y], INCYM1 + (p18) add YY = YY, INCYM1 + } + ;; 
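+// .L52 pipelined body: eight complex elements are exchanged per iteration; words loaded from X (f32, f38, ...) are stored back through YY, and the matching words loaded from Y (f35, f41, ...) through XX.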
+ { .mmi + (p18) STFD [XX] = f85, 1 * SIZE + (p18) STFD [YY] = f82, 1 * SIZE + } + { .mmi + (p16) LDFD f80 = [X], 1 * SIZE + (p16) LDFD f83 = [Y], 1 * SIZE + } + ;; + { .mmi + (p18) STFD [XX] = f91 + (p18) STFD [YY] = f88 + (p18) add XX = XX, INCXM1 + } + { .mmi + (p16) LDFD f86 = [X], INCXM1 + (p16) LDFD f89 = [Y], INCYM1 + (p18) add YY = YY, INCYM1 + } + ;; + { .mmi + (p18) STFD [XX] = f97, 1 * SIZE + (p18) STFD [YY] = f94, 1 * SIZE + } + { .mmi + (p16) LDFD f92 = [X], 1 * SIZE + (p16) LDFD f95 = [Y], 1 * SIZE + } + ;; + { .mmi + (p18) STFD [XX] = f103 + (p18) STFD [YY] = f100 + (p18) add XX = XX, INCXM1 + } + { .mmi + (p16) LDFD f98 = [X], INCXM1 + (p16) LDFD f101 = [Y], INCYM1 + (p18) add YY = YY, INCYM1 + } + ;; + { .mmi + (p18) STFD [XX] = f109, 1 * SIZE + (p18) STFD [YY] = f106, 1 * SIZE + } + { .mmi + (p16) LDFD f104 = [X], 1 * SIZE + (p16) LDFD f107 = [Y], 1 * SIZE + } + ;; + { .mmi + (p18) STFD [XX] = f115 + (p18) STFD [YY] = f112 + (p18) add XX = XX, INCXM1 + } + { .mmi + (p16) LDFD f110 = [X], INCXM1 + (p16) LDFD f113 = [Y], INCYM1 + (p18) add YY = YY, INCYM1 + } + ;; + { .mmi + (p18) STFD [XX] = f121, 1 * SIZE + (p18) STFD [YY] = f118, 1 * SIZE + } + { .mmi + (p16) LDFD f116 = [X], 1 * SIZE + (p16) LDFD f119 = [Y], 1 * SIZE + } + ;; + { .mmi + (p18) STFD [XX] = f127 + (p18) STFD [YY] = f124 + (p18) add XX = XX, INCXM1 + } + { .mmi + (p16) LDFD f122 = [X], INCXM1 + (p16) LDFD f125 = [Y], INCYM1 + (p18) add YY = YY, INCYM1 + } + { .mmb + (p16) lfetch.excl.nt1 [PRE1], INCX8 + (p16) lfetch.excl.nt1 [PRE2], INCY8 + br.ctop.sptk.few .L52 + } + ;; + .align 32 + +.L55: + { .mmi + (p12) LDFD f32 = [X], 1 * SIZE + (p12) LDFD f80 = [Y], 1 * SIZE + mov ar.lc = ARLC + } + ;; + { .mmi + (p12) LDFD f33 = [X], INCXM1 + (p12) LDFD f81 = [Y], INCYM1 + mov pr = PR, -65474 + } + ;; + { .mmb + (p12) LDFD f34 = [X], 1 * SIZE + (p12) LDFD f82 = [Y], 1 * SIZE + (p9) br.ret.sptk.many b0 + } + ;; + { .mmi + (p12) LDFD f35 = [X], INCXM1 + (p12) LDFD f83 = [Y], INCYM1 + tbit.z p0, p13 = N, 1 + } + ;; + { .mmi + (p12) LDFD f36 = [X], 1 * SIZE + (p12) LDFD f84 = [Y], 1 * SIZE + tbit.z p0, p14 = N, 0 + } + ;; + { .mmi + (p12) LDFD f37 = [X], INCXM1 + (p12) LDFD f85 = [Y], INCYM1 + } + ;; + { .mmi + (p12) STFD [XX] = f80, 1 * SIZE + (p12) STFD [YY] = f32, 1 * SIZE + } + { .mmi + (p12) LDFD f38 = [X], 1 * SIZE + (p12) LDFD f86 = [Y], 1 * SIZE + } + ;; + { .mmi + (p12) STFD [XX] = f81 + (p12) STFD [YY] = f33 + (p12) add XX = XX, INCXM1 + } + { .mmi + (p12) LDFD f39 = [X], INCXM1 + (p12) LDFD f87 = [Y], INCYM1 + (p12) add YY = YY, INCYM1 + } + ;; + { .mmi + (p12) STFD [XX] = f82, 1 * SIZE + (p12) STFD [YY] = f34, 1 * SIZE + } + { .mmi + (p13) LDFD f40 = [X], 1 * SIZE + (p13) LDFD f88 = [Y], 1 * SIZE + } + ;; + { .mmi + (p12) STFD [XX] = f83 + (p12) STFD [YY] = f35 + (p12) add XX = XX, INCXM1 + } + { .mmi + (p13) LDFD f41 = [X], INCXM1 + (p13) LDFD f89 = [Y], INCYM1 + (p12) add YY = YY, INCYM1 + } + ;; + { .mmi + (p12) STFD [XX] = f84, 1 * SIZE + (p12) STFD [YY] = f36, 1 * SIZE + } + { .mmi + (p13) LDFD f42 = [X], 1 * SIZE + (p13) LDFD f90 = [Y], 1 * SIZE + } + ;; + { .mmi + (p12) STFD [XX] = f85 + (p12) STFD [YY] = f37 + (p12) add XX = XX, INCXM1 + } + { .mmi + (p13) LDFD f43 = [X], INCXM1 + (p13) LDFD f91 = [Y], INCYM1 + (p12) add YY = YY, INCYM1 + } + ;; + { .mmi + (p12) STFD [XX] = f86, 1 * SIZE + (p12) STFD [YY] = f38, 1 * SIZE + } + { .mmi + (p14) LDFD f44 = [X], 1 * SIZE + (p14) LDFD f92 = [Y], 1 * SIZE + } + ;; + { .mmi + (p12) STFD [XX] = f87 + (p12) STFD [YY] = f39 + (p12) add XX = XX, INCXM1 + } + { 
.mmi + (p14) LDFD f45 = [X] + (p14) LDFD f93 = [Y] + (p12) add YY = YY, INCYM1 + } + ;; + { .mmi + (p13) STFD [XX] = f88, 1 * SIZE + (p13) STFD [YY] = f40, 1 * SIZE + } + ;; + (p13) STFD [XX] = f89 + (p13) add XX = XX, INCXM1 + (p13) STFD [YY] = f41 + (p13) add YY = YY, INCYM1 + ;; + (p13) STFD [XX] = f90, 1 * SIZE + (p13) STFD [YY] = f42, 1 * SIZE + ;; + (p13) STFD [XX] = f91 + (p13) add XX = XX, INCXM1 + (p13) STFD [YY] = f43 + (p13) add YY = YY, INCYM1 + ;; + (p14) STFD [XX] = f92, 1 * SIZE + (p14) STFD [YY] = f44, 1 * SIZE + ;; + (p14) STFD [XX] = f93 + (p14) STFD [YY] = f45 + br.ret.sptk.many b0 + ;; + EPILOGUE + diff --git a/kernel/ia64/ztrsm_kernel_LN.S b/kernel/ia64/ztrsm_kernel_LN.S new file mode 100644 index 0000000000..ef903e35a2 --- /dev/null +++ b/kernel/ia64/ztrsm_kernel_LN.S @@ -0,0 +1,10839 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef DOUBLE +#define PREFETCHSIZE (16 * 8) +#else +#define PREFETCHSIZE (32 * 8) +#endif + +#ifndef LN +#define CPREFETCHSIZE 7 +#else +#define CPREFETCHSIZE -8 +#endif +#define CPREFETCH lfetch.excl.nt1 + +#define M r32 +#define N r33 +#define K r34 +#define A r37 +#define B r38 +#define C r39 +#define LDC r35 + +#define I r15 +#define J r16 +#define AOFFSET r17 +#define BOFFSET r18 +#define TEMP r19 +#define L r20 + +#define C1 r21 +#define C2 r22 +#define C3 r23 +#define C4 r24 +#define C5 r25 +#define C6 r26 +#define C7 r27 +#define C8 r28 + +#define PREA r8 +#define PREB r9 +#define PREC r10 +#define SP r12 +#define ARLC r29 +#define PR r30 +#define ARPFS r31 + +#define ALPHA_R f8 +#define ALPHA_I f9 + +#define AORIG loc0 +#define KK loc1 +#define KK8 loc2 +#define OFFSET loc3 +#define AOFFSET2 loc4 +#define BOFFSET2 loc5 + +#ifndef CONJ +#define FCALC_A FSUB +#define FCALC_B FADD +#define FMA_A FNMA +#define FMA_B FMA +#else +#define FCALC_A FADD +#define FCALC_B FSUB +#define FMA_A FMA +#define FMA_B FNMA +#endif + +#ifndef CONJ +#define FCALC_C FMA +#define FCALC_D FNMA +#else +#define FCALC_C FNMA +#define FCALC_D FMA +#endif + +#ifndef CONJ +#define FMA_C FNMA +#define FMA_D FMA +#define FSUB_A FSUB +#else +#define FMA_C FMA +#define FMA_D FMS +#define FSUB_A FADD +#endif + + + PROLOGUE + .prologue + PROFCODE + + { .mfi + .save ar.pfs, ARPFS + alloc ARPFS = ar.pfs, 8, 8, 0, 0 + mov f64 = f0 + adds r14 = 16, SP + } + { .mfi + nop __LINE__ + mov f65 = f0 + adds r15 = 24, SP + } + ;; + { .mfi + ld8 LDC = [r14] + mov f81 = f0 + mov PR = pr + } + { .mfi + ld8 OFFSET = [r15] + mov f96 = f0 + shr J = N, 2 + } + ;; + { .mfi + shladd LDC = LDC, ZBASE_SHIFT, r0 + mov f97 = f0 + } + { .mfi + nop __LINE__ + mov f113 = f0 + } + ;; +#ifdef LN + { .mmi + setf.sig f32 = M + setf.sig f33 = K + shladd C = M, ZBASE_SHIFT, C + } + ;; + {.mmf + nop __LINE__ + nop __LINE__ + xmpy.l f32 = f32, f33 + } + ;; + { .mmi + getf.sig r2 = f32 + ;; + nop __LINE__ + shladd A = r2, ZBASE_SHIFT, A + } + ;; +#endif + +#ifdef RN + sub KK = r0, OFFSET +#endif + +#ifdef RT + { .mmi + setf.sig f32 = N + setf.sig f33 = K + nop __LINE__ + } + ;; + { .mmi + setf.sig f34 = LDC + nop __LINE__ + nop __LINE__ + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + xmpy.l f33 = f32, f33 + } + { .mmf + nop __LINE__ + sub KK = N, OFFSET + xmpy.l f34 = f32, f34 + } + ;; + { .mmi + getf.sig r2 = f33 + getf.sig r3 = f34 + } + ;; + shladd B = r2, ZBASE_SHIFT, B + add C = r3, C +#endif + ;; + .body + { .mfi + nop __LINE__ + mov f80 = f0 + mov ARLC = ar.lc + } + { .mfb + cmp.ge p6, p0 = 0, J + mov f112 = f0 + (p6) br.cond.dpnt .L050 + } + ;; + .align 16 + +.L010: +#ifdef RT + { .mmi + shladd r3 = LDC, 2, r0 + nop __LINE__ + shl r2 = K, 2 + ZBASE_SHIFT + } + ;; + { .mmi + sub B = B, r2 + sub C = C, r3 + nop __LINE__ + } + ;; +#endif + { .mmi + mov C1 = C // coffset1 = c + 0 * ldc + add C2 = LDC, C // coffset2 = c + 1 * ldc + } + { .mmi + adds J = -1, J +#ifdef LN + add KK = M, OFFSET +#elif defined LT + mov KK = OFFSET +#else + nop __LINE__ +#endif +#if defined(LN) || defined(RT) + mov AORIG = A +#else + mov AOFFSET = A +#endif + } + ;; + { .mmi + shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc + shladd C4 = LDC, 1, C2 // coffset4 = c + 3 * ldc +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + { .mib +#ifndef RT + shladd C = LDC, 2, C // coffset += 8 * ldc +#else + nop __LINE__ +#endif + 
tbit.z p6, p7 = M, 0 + (p6) br.cond.dptk .L020 + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, ZBASE_SHIFT + } + { .mmi + shladd r3 = KK, ZBASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f72 = f0 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f73 = f0 + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 2, B + mov f72 = f0 +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f73 = f0 + add AOFFSET = r3, AORIG + } + ;; +#endif + ;; + adds L = 1, L + ;; + + { .mmi + nop __LINE__ + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + tbit.z p12, p0 = L, 0 + } + ;; + { .mfi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f88 = f0 + shr L = L, 1 + } + { .mfi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f89 = f0 + nop __LINE__ + } + ;; + { .mfi + (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + mov f104 = f0 + adds L = -1, L + } + { .mfb + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + mov f105 = f0 + nop __LINE__ + } + ;; + { .mfi + (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + mov f120 = f0 + mov ar.lc = L + } + { .mfi + cmp.eq p3, p0 = r0, r0 + mov f121 = f0 + nop __LINE__ + } + ;; + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L038 + ;; + .align 16 + +.L032: + { .mfb + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f65 = f32, f49, f65 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA_B f81 = f32, f51, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f97 = f32, f53, f97 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f113 = f32, f55, f113 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f64 = f33, f49, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f80 = f33, f51, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f96 = f33, f53, f96 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f112 = f33, f55, f112 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f97 = f40, f61, f97 // 
A1 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f113 = f40, f63, f113 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f96 = f41, f61, f96 // A2 * B6 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA_A f112 = f41, f63, f112 // A2 * B8 + br.cloop.sptk.few .L032 + } + ;; +.L038: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -1, KK +#else + adds r2 = -4, KK +#endif + ;; + shladd r2 = r2, ZBASE_SHIFT, r0 + ;; + add AOFFSET = r2, AORIG + shladd BOFFSET = r2, 2, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [BOFFSET], 2 * SIZE + ;; + LDFPD f104, f105 = [BOFFSET], 2 * SIZE + ;; + LDFPD f120, f121 = [BOFFSET] + adds BOFFSET = -6 * SIZE, BOFFSET + ;; + FSUB f64 = f72, f64 + FSUB_A f65 = f73, f65 + FSUB f80 = f88, f80 + FSUB_A f81 = f89, f81 + FSUB f96 = f104, f96 + FSUB_A f97 = f105, f97 + FSUB f112 = f120, f112 + FSUB_A f113 = f121, f113 + ;; +#else + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [AOFFSET], 2 * SIZE + ;; + LDFPD f104, f105 = [AOFFSET], 2 * SIZE + ;; + LDFPD f120, f121 = [AOFFSET] + adds AOFFSET = -6 * SIZE, AOFFSET + ;; + FSUB f64 = f72, f64 + FSUB f65 = f73, f65 + FSUB f80 = f88, f80 + FSUB f81 = f89, f81 + FSUB f96 = f104, f96 + FSUB f97 = f105, f97 + FSUB f112 = f120, f112 + FSUB f113 = f121, f113 + ;; +#endif + +#ifdef LN + LDFPD f120, f121 = [AOFFSET] + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + FMPY f34 = f120, f80 + FMPY f35 = f121, f80 + FMPY f36 = f120, f96 + FMPY f37 = f121, f96 + FMPY f38 = f120, f112 + FMPY f39 = f121, f112 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + FMA_C f80 = f121, f81, f34 + FMA_D f81 = f120, f81, f35 + FMA_C f96 = f121, f97, f36 + FMA_D f97 = f120, f97, f37 + FMA_C f112 = f121, f113, f38 + FMA_D f113 = f120, f113, f39 + ;; +#endif + +#ifdef LT + LDFPD f90, f91 = [AOFFSET] + ;; + FMPY f32 = f90, f64 + FMPY f33 = f91, f64 + FMPY f34 = f90, f80 + FMPY f35 = f91, f80 + FMPY f36 = f90, f96 + FMPY f37 = f91, f96 + FMPY f38 = f90, f112 + FMPY f39 = f91, f112 + ;; + FMA_C f64 = f91, f65, f32 + FMA_D f65 = f90, f65, f33 + FMA_C f80 = f91, f81, f34 + FMA_D f81 = f90, f81, f35 + FMA_C f96 = f91, f97, f36 + FMA_D f97 = f90, f97, f37 + FMA_C f112 = f91, f113, f38 + FMA_D f113 = f90, f113, f39 + ;; +#endif + +#ifdef RN + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET], 2 * SIZE + ;; + LDFPD f76, f77 = [BOFFSET], 2 * SIZE + ;; + LDFPD f78, f79 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f90, f91 = [BOFFSET], 2 * SIZE + ;; + LDFPD f92, f93 = [BOFFSET], 2 * SIZE + ;; + LDFPD f94, f95 = [BOFFSET] + adds BOFFSET = 6 * SIZE, BOFFSET + ;; + LDFPD f108, f109 = [BOFFSET], 2 * SIZE + ;; + LDFPD f110, f111 = [BOFFSET] + adds BOFFSET = 8 * SIZE, BOFFSET + ;; + LDFPD f126, f127 = [BOFFSET] + adds BOFFSET = - 30 * SIZE, BOFFSET + ;; + + 
FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + ;; + FNMA f80 = f74, f64, f80 + FMA_A f81 = f75, f64, f81 + ;; + FMA_B f80 = f75, f65, f80 + FNMA f81 = f74, f65, f81 + + ;; + FNMA f96 = f76, f64, f96 + FMA_A f97 = f77, f64, f97 + ;; + FMA_B f96 = f77, f65, f96 + FNMA f97 = f76, f65, f97 + ;; + FNMA f112 = f78, f64, f112 + FMA_A f113 = f79, f64, f113 + ;; + FMA_B f112 = f79, f65, f112 + FNMA f113 = f78, f65, f113 + ;; + FMPY f32 = f90, f80 + FMPY f33 = f91, f80 + ;; + FMA_C f80 = f91, f81, f32 + FMA_D f81 = f90, f81, f33 + ;; + + FNMA f96 = f92, f80, f96 + FMA_A f97 = f93, f80, f97 + ;; + FMA_B f96 = f93, f81, f96 + FNMA f97 = f92, f81, f97 + ;; + FNMA f112 = f94, f80, f112 + FMA_A f113 = f95, f80, f113 + ;; + FMA_B f112 = f95, f81, f112 + FNMA f113 = f94, f81, f113 + ;; + FMPY f32 = f108, f96 + FMPY f33 = f109, f96 + ;; + FMA_C f96 = f109, f97, f32 + FMA_D f97 = f108, f97, f33 + ;; + FNMA f112 = f110, f96, f112 + FMA_A f113 = f111, f96, f113 + ;; + FMA_B f112 = f111, f97, f112 + FNMA f113 = f110, f97, f113 + ;; + FMPY f32 = f126, f112 + FMPY f33 = f127, f112 + ;; + FMA_C f112 = f127, f113, f32 + FMA_D f113 = f126, f113, f33 + ;; +#endif + +#ifdef RT + adds BOFFSET = 30 * SIZE, BOFFSET + ;; + LDFPD f72, f73 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f74, f75 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f76, f77 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f78, f79 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f88, f89 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f90, f91 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f92, f93 = [BOFFSET] + adds BOFFSET = - 6 * SIZE, BOFFSET + ;; + LDFPD f104, f105 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f106, f107 = [BOFFSET] + adds BOFFSET = - 8 * SIZE, BOFFSET + ;; + LDFPD f120, f121 = [BOFFSET] + ;; + FMPY f32 = f72, f112 + FMPY f33 = f73, f112 + ;; + FMA_C f112 = f73, f113, f32 + FMA_D f113 = f72, f113, f33 + ;; + FNMA f96 = f74, f112, f96 + FMA_A f97 = f75, f112, f97 + ;; + FMA_B f96 = f75, f113, f96 + FNMA f97 = f74, f113, f97 + ;; + FNMA f80 = f76, f112, f80 + FMA_A f81 = f77, f112, f81 + ;; + FMA_B f80 = f77, f113, f80 + FNMA f81 = f76, f113, f81 + ;; + FNMA f64 = f78, f112, f64 + FMA_A f65 = f79, f112, f65 + ;; + FMA_B f64 = f79, f113, f64 + FNMA f65 = f78, f113, f65 + ;; + FMPY f32 = f88, f96 + FMPY f33 = f89, f96 + ;; + FMA_C f96 = f89, f97, f32 + FMA_D f97 = f88, f97, f33 + ;; + FNMA f80 = f90, f96, f80 + FMA_A f81 = f91, f96, f81 + ;; + FMA_B f80 = f91, f97, f80 + FNMA f81 = f90, f97, f81 + ;; + FNMA f64 = f92, f96, f64 + FMA_A f65 = f93, f96, f65 + ;; + FMA_B f64 = f93, f97, f64 + FNMA f65 = f92, f97, f65 + ;; + FMPY f32 = f104, f80 + FMPY f33 = f105, f80 + ;; + FMA_C f80 = f105, f81, f32 + FMA_D f81 = f104, f81, f33 + ;; + FNMA f64 = f106, f80, f64 + FMA_A f65 = f107, f80, f65 + ;; + FMA_B f64 = f107, f81, f64 + FNMA f65 = f106, f81, f65 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + ;; +#endif + +#if defined(LN) || defined(LT) + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f96, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f97, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f112, SIZE + ;; + STFD [BOFFSET] = f81, 5 * SIZE + STFD [BOFFSET2] = f113, 5 * SIZE + ;; + adds BOFFSET = - 8 * SIZE, BOFFSET + ;; +#else + adds AOFFSET2 = 4 * 
SIZE, AOFFSET + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f96, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f97, SIZE + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f112, SIZE + ;; + STFD [AOFFSET] = f81, 5 * SIZE + STFD [AOFFSET2] = f113, 5 * SIZE + ;; + adds AOFFSET = - 8 * SIZE, AOFFSET + ;; +#endif + +#ifdef LN + adds C1 = -2 * SIZE, C1 + adds C2 = -2 * SIZE, C2 + adds C3 = -2 * SIZE, C3 + adds C4 = -2 * SIZE, C4 +#endif + ;; + STFD [C1 ] = f64, SIZE + ;; + STFD [C1 ] = f65, SIZE + ;; + STFD [C2 ] = f80, SIZE + ;; + STFD [C2 ] = f81, SIZE + ;; + STFD [C3 ] = f96, SIZE + ;; + STFD [C3 ] = f97, SIZE + ;; + STFD [C4 ] = f112, SIZE + ;; + STFD [C4 ] = f113, SIZE + ;; + mov f64 = f0 + mov f65 = f0 + mov f80 = f0 + mov f81 = f0 + mov f96 = f0 + mov f97 = f0 + mov f112 = f0 + mov f113 = f0 + ;; +#ifdef LN + adds C1 = -2 * SIZE, C1 + adds C2 = -2 * SIZE, C2 + adds C3 = -2 * SIZE, C3 + adds C4 = -2 * SIZE, C4 +#endif + ;; + cmp.ne p6, p0 = 1, I + ;; + adds I = -1, I + ;; + shladd r2 = K, ZBASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + add AORIG = r2, AORIG +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, ZBASE_SHIFT, r0 + ;; + add AOFFSET = L, AOFFSET + shladd BOFFSET = L, 2, BOFFSET +#endif + ;; +#ifdef LT + adds KK = 1, KK +#elif defined LN + adds KK = -1, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + .align 16 + +.L020: + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + tbit.z p6, p7 = M, 1 + (p6) br.cond.dptk .L010x + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 1 + ZBASE_SHIFT + } + { .mmi + shladd r3 = KK, ZBASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f66 = f0 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f67 = f0 + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 2, B + mov f66 = f0 +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f67 = f0 + shladd AOFFSET = r3, 1, AORIG + } + ;; +#endif + ;; + adds L = 1, L + ;; + { .mfi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f82 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f83 = f0 + shr L = L, 1 + } + ;; + { .mfi + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + mov f98 = f0 + adds L = -1, L + } + { .mfi + (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + mov f99 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + mov f114 = f0 + mov ar.lc = L + } + { .mfi + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + mov f115 = f0 + nop __LINE__ + } + ;; + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L028 + ;; + .align 16 + +.L022: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA_B f65 = f32, f49, f65 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA_B f81 = f32, f51, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f97 = f32, f53, f97 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * 
SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f113 = f32, f55, f113 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f64 = f33, f49, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f80 = f33, f51, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f96 = f33, f53, f96 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f112 = f33, f55, f112 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f67 = f34, f49, f67 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f83 = f34, f51, f83 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f98 = f34, f52, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f99 = f34, f53, f99 // A3 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f114 = f34, f54, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f115 = f34, f55, f115 // A3 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f66 = f35, f49, f66 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f82 = f35, f51, f82 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f99 = f35, f52, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f98 = f35, f53, f98 // A4 * B6 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f115 = f35, f54, f115 // A4 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f114 = f35, f55, f114 // A4 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f97 = f40, f61, f97 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f113 = f40, f63, f113 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 + nop __LINE__ + } + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f97 = f41, f60, f97 // A2 * B5 
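+// (p3) second half of the 2x-unrolled step: the imaginary word of the A element (f41) is multiplied into the B words f56..f63 and accumulated; FMA_A/FMA_B are FNMA/FMA (swapped when CONJ is defined), so the accumulators collect the real and imaginary parts of the complex products.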
+ nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f96 = f41, f61, f96 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f112 = f41, f63, f112 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f67 = f42, f57, f67 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f83 = f42, f59, f83 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f98 = f42, f60, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f99 = f42, f61, f99 // A3 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f114 = f42, f62, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f115 = f42, f63, f115 // A3 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f66 = f43, f57, f66 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f82 = f43, f59, f82 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f99 = f43, f60, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f98 = f43, f61, f98 // A4 * B6 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f115 = f43, f62, f115 // A4 * B7 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA_A f114 = f43, f63, f114 // A4 * B8 + br.cloop.sptk.few .L022 + } + ;; +.L028: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -2, KK +#else + adds r2 = -4, KK +#endif + ;; + shladd r2 = r2, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 1, AORIG + shladd BOFFSET = r2, 2, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [BOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [BOFFSET], 2 * SIZE + ;; + LDFPD f104, f105 = [BOFFSET], 2 * SIZE + ;; + LDFPD f106, f107 = [BOFFSET], 2 * SIZE + ;; + { .mfi + LDFPD f120, f121 = [BOFFSET], 2 * SIZE + FSUB f64 = f72, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f65 = f73, f65 + nop __LINE__ + } + ;; + { .mfi + LDFPD f122, f123 = [BOFFSET] + FSUB f80 = f74, f80 + adds BOFFSET = -14 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FSUB_A f81 = f75, f81 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f96 = f88, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f97 = f89, f97 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f112 = f90, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f113 = f91, f113 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f66 = f104, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f67 = f105, f67 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f82 = f106, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f83 = f107, f83 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f98 = f120, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f99 = f121, f99 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f114 = f122, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f115 = f123, f115 + nop __LINE__ + } + ;; +#else + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [AOFFSET], 2 
* SIZE + ;; + LDFPD f90, f91 = [AOFFSET], 2 * SIZE + ;; + LDFPD f104, f105 = [AOFFSET], 2 * SIZE + ;; + LDFPD f106, f107 = [AOFFSET], 2 * SIZE + ;; + { .mfi + LDFPD f120, f121 = [AOFFSET], 2 * SIZE + FSUB f64 = f72, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f65 = f73, f65 + nop __LINE__ + } + ;; + { .mfi + LDFPD f122, f123 = [AOFFSET] + FSUB f66 = f74, f66 + adds AOFFSET = -14 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FSUB f67 = f75, f67 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f80 = f88, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f81 = f89, f81 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f82 = f90, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f83 = f91, f83 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f96 = f104, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f97 = f105, f97 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f98 = f106, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f99 = f107, f99 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f112 = f120, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f113 = f121, f113 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f114 = f122, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f115 = f123, f115 + nop __LINE__ + } + ;; +#endif + +#ifdef LN + adds AOFFSET = 6 * SIZE, AOFFSET + ;; + LDFPD f104, f105 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f106, f107 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f120, f121 = [AOFFSET] + ;; + FMPY f32 = f104, f66 + FMPY f33 = f105, f66 + FMPY f34 = f104, f82 + FMPY f35 = f105, f82 + FMPY f36 = f104, f98 + FMPY f37 = f105, f98 + FMPY f38 = f104, f114 + FMPY f39 = f105, f114 + ;; + FMA_C f66 = f105, f67, f32 + FMA_D f67 = f104, f67, f33 + FMA_C f82 = f105, f83, f34 + FMA_D f83 = f104, f83, f35 + FMA_C f98 = f105, f99, f36 + FMA_D f99 = f104, f99, f37 + FMA_C f114 = f105, f115, f38 + FMA_D f115 = f104, f115, f39 + ;; + FNMA f64 = f106, f66, f64 + FMA_A f65 = f107, f66, f65 + FNMA f80 = f106, f82, f80 + FMA_A f81 = f107, f82, f81 + FNMA f96 = f106, f98, f96 + FMA_A f97 = f107, f98, f97 + FNMA f112 = f106, f114, f112 + FMA_A f113 = f107, f114, f113 + ;; + FMA_B f64 = f107, f67, f64 + FNMA f65 = f106, f67, f65 + FMA_B f80 = f107, f83, f80 + FNMA f81 = f106, f83, f81 + FMA_B f96 = f107, f99, f96 + FNMA f97 = f106, f99, f97 + FMA_B f112 = f107, f115, f112 + FNMA f113 = f106, f115, f113 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + FMPY f34 = f120, f80 + FMPY f35 = f121, f80 + FMPY f36 = f120, f96 + FMPY f37 = f121, f96 + FMPY f38 = f120, f112 + FMPY f39 = f121, f112 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + FMA_C f80 = f121, f81, f34 + FMA_D f81 = f120, f81, f35 + FMA_C f96 = f121, f97, f36 + FMA_D f97 = f120, f97, f37 + FMA_C f112 = f121, f113, f38 + FMA_D f113 = f120, f113, f39 + ;; +#endif + +#ifdef LT + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f90, f91 = [AOFFSET] + adds AOFFSET = - 6 * SIZE, AOFFSET + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f34 = f72, f80 + FMPY f35 = f73, f80 + FMPY f36 = f72, f96 + FMPY f37 = f73, f96 + FMPY f38 = f72, f112 + FMPY f39 = f73, f112 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f80 = f73, f81, f34 + FMA_D f81 = f72, f81, f35 + FMA_C f96 = f73, f97, f36 + FMA_D f97 = f72, f97, f37 + FMA_C f112 = f73, f113, f38 + FMA_D f113 = f72, f113, f39 + ;; + FNMA f66 = f74, f64, f66 + FMA_A f67 = 
f75, f64, f67 + FNMA f82 = f74, f80, f82 + FMA_A f83 = f75, f80, f83 + FNMA f98 = f74, f96, f98 + FMA_A f99 = f75, f96, f99 + FNMA f114 = f74, f112, f114 + FMA_A f115 = f75, f112, f115 + ;; + FMA_B f66 = f75, f65, f66 + FNMA f67 = f74, f65, f67 + FMA_B f82 = f75, f81, f82 + FNMA f83 = f74, f81, f83 + FMA_B f98 = f75, f97, f98 + FNMA f99 = f74, f97, f99 + FMA_B f114 = f75, f113, f114 + FNMA f115 = f74, f113, f115 + ;; + FMPY f32 = f90, f66 + FMPY f33 = f91, f66 + FMPY f34 = f90, f82 + FMPY f35 = f91, f82 + FMPY f36 = f90, f98 + FMPY f37 = f91, f98 + FMPY f38 = f90, f114 + FMPY f39 = f91, f114 + ;; + FMA_C f66 = f91, f67, f32 + FMA_D f67 = f90, f67, f33 + FMA_C f82 = f91, f83, f34 + FMA_D f83 = f90, f83, f35 + FMA_C f98 = f91, f99, f36 + FMA_D f99 = f90, f99, f37 + FMA_C f114 = f91, f115, f38 + FMA_D f115 = f90, f115, f39 + ;; +#endif + +#ifdef RN + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET], 2 * SIZE + ;; + LDFPD f76, f77 = [BOFFSET], 2 * SIZE + ;; + LDFPD f78, f79 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f90, f91 = [BOFFSET], 2 * SIZE + ;; + LDFPD f92, f93 = [BOFFSET], 2 * SIZE + ;; + LDFPD f94, f95 = [BOFFSET] + adds BOFFSET = 6 * SIZE, BOFFSET + ;; + LDFPD f108, f109 = [BOFFSET], 2 * SIZE + ;; + LDFPD f110, f111 = [BOFFSET] + adds BOFFSET = 8 * SIZE, BOFFSET + ;; + LDFPD f126, f127 = [BOFFSET] + adds BOFFSET = - 30 * SIZE, BOFFSET + ;; + + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f34 = f72, f66 + FMPY f35 = f73, f66 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f66 = f73, f67, f34 + FMA_D f67 = f72, f67, f35 + ;; + FNMA f80 = f74, f64, f80 + FMA_A f81 = f75, f64, f81 + FNMA f82 = f74, f66, f82 + FMA_A f83 = f75, f66, f83 + ;; + FMA_B f80 = f75, f65, f80 + FNMA f81 = f74, f65, f81 + FMA_B f82 = f75, f67, f82 + FNMA f83 = f74, f67, f83 + ;; + FNMA f96 = f76, f64, f96 + FMA_A f97 = f77, f64, f97 + FNMA f98 = f76, f66, f98 + FMA_A f99 = f77, f66, f99 + ;; + FMA_B f96 = f77, f65, f96 + FNMA f97 = f76, f65, f97 + FMA_B f98 = f77, f67, f98 + FNMA f99 = f76, f67, f99 + ;; + FNMA f112 = f78, f64, f112 + FMA_A f113 = f79, f64, f113 + FNMA f114 = f78, f66, f114 + FMA_A f115 = f79, f66, f115 + ;; + FMA_B f112 = f79, f65, f112 + FNMA f113 = f78, f65, f113 + FMA_B f114 = f79, f67, f114 + FNMA f115 = f78, f67, f115 + ;; + FMPY f32 = f90, f80 + FMPY f33 = f91, f80 + FMPY f34 = f90, f82 + FMPY f35 = f91, f82 + ;; + FMA_C f80 = f91, f81, f32 + FMA_D f81 = f90, f81, f33 + FMA_C f82 = f91, f83, f34 + FMA_D f83 = f90, f83, f35 + ;; + + FNMA f96 = f92, f80, f96 + FMA_A f97 = f93, f80, f97 + FNMA f98 = f92, f82, f98 + FMA_A f99 = f93, f82, f99 + ;; + FMA_B f96 = f93, f81, f96 + FNMA f97 = f92, f81, f97 + FMA_B f98 = f93, f83, f98 + FNMA f99 = f92, f83, f99 + ;; + FNMA f112 = f94, f80, f112 + FMA_A f113 = f95, f80, f113 + FNMA f114 = f94, f82, f114 + FMA_A f115 = f95, f82, f115 + ;; + FMA_B f112 = f95, f81, f112 + FNMA f113 = f94, f81, f113 + FMA_B f114 = f95, f83, f114 + FNMA f115 = f94, f83, f115 + ;; + FMPY f32 = f108, f96 + FMPY f33 = f109, f96 + FMPY f34 = f108, f98 + FMPY f35 = f109, f98 + ;; + FMA_C f96 = f109, f97, f32 + FMA_D f97 = f108, f97, f33 + FMA_C f98 = f109, f99, f34 + FMA_D f99 = f108, f99, f35 + ;; + FNMA f112 = f110, f96, f112 + FMA_A f113 = f111, f96, f113 + FNMA f114 = f110, f98, f114 + FMA_A f115 = f111, f98, f115 + ;; + FMA_B f112 = f111, f97, f112 + FNMA f113 = f110, f97, f113 + FMA_B f114 = f111, f99, f114 + FNMA f115 = f110, f99, f115 + ;; + FMPY f32 = f126, f112 + FMPY f33 = f127, f112 + FMPY f34 = f126, 
f114 + FMPY f35 = f127, f114 + ;; + FMA_C f112 = f127, f113, f32 + FMA_D f113 = f126, f113, f33 + FMA_C f114 = f127, f115, f34 + FMA_D f115 = f126, f115, f35 + ;; +#endif + +#ifdef RT + adds BOFFSET = 30 * SIZE, BOFFSET + ;; + LDFPD f72, f73 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f74, f75 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f76, f77 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f78, f79 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f88, f89 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f90, f91 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f92, f93 = [BOFFSET] + adds BOFFSET = - 6 * SIZE, BOFFSET + ;; + LDFPD f104, f105 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f106, f107 = [BOFFSET] + adds BOFFSET = - 8 * SIZE, BOFFSET + ;; + LDFPD f120, f121 = [BOFFSET] + ;; + FMPY f32 = f72, f112 + FMPY f33 = f73, f112 + FMPY f34 = f72, f114 + FMPY f35 = f73, f114 + ;; + FMA_C f112 = f73, f113, f32 + FMA_D f113 = f72, f113, f33 + FMA_C f114 = f73, f115, f34 + FMA_D f115 = f72, f115, f35 + ;; + FNMA f96 = f74, f112, f96 + FMA_A f97 = f75, f112, f97 + FNMA f98 = f74, f114, f98 + FMA_A f99 = f75, f114, f99 + ;; + FMA_B f96 = f75, f113, f96 + FNMA f97 = f74, f113, f97 + FMA_B f98 = f75, f115, f98 + FNMA f99 = f74, f115, f99 + ;; + FNMA f80 = f76, f112, f80 + FMA_A f81 = f77, f112, f81 + FNMA f82 = f76, f114, f82 + FMA_A f83 = f77, f114, f83 + ;; + FMA_B f80 = f77, f113, f80 + FNMA f81 = f76, f113, f81 + FMA_B f82 = f77, f115, f82 + FNMA f83 = f76, f115, f83 + ;; + FNMA f64 = f78, f112, f64 + FMA_A f65 = f79, f112, f65 + FNMA f66 = f78, f114, f66 + FMA_A f67 = f79, f114, f67 + ;; + FMA_B f64 = f79, f113, f64 + FNMA f65 = f78, f113, f65 + FMA_B f66 = f79, f115, f66 + FNMA f67 = f78, f115, f67 + ;; + FMPY f32 = f88, f96 + FMPY f33 = f89, f96 + FMPY f34 = f88, f98 + FMPY f35 = f89, f98 + ;; + FMA_C f96 = f89, f97, f32 + FMA_D f97 = f88, f97, f33 + FMA_C f98 = f89, f99, f34 + FMA_D f99 = f88, f99, f35 + ;; + FNMA f80 = f90, f96, f80 + FMA_A f81 = f91, f96, f81 + FNMA f82 = f90, f98, f82 + FMA_A f83 = f91, f98, f83 + ;; + FMA_B f80 = f91, f97, f80 + FNMA f81 = f90, f97, f81 + FMA_B f82 = f91, f99, f82 + FNMA f83 = f90, f99, f83 + ;; + FNMA f64 = f92, f96, f64 + FMA_A f65 = f93, f96, f65 + FNMA f66 = f92, f98, f66 + FMA_A f67 = f93, f98, f67 + ;; + FMA_B f64 = f93, f97, f64 + FNMA f65 = f92, f97, f65 + FMA_B f66 = f93, f99, f66 + FNMA f67 = f92, f99, f67 + ;; + FMPY f32 = f104, f80 + FMPY f33 = f105, f80 + FMPY f34 = f104, f82 + FMPY f35 = f105, f82 + ;; + FMA_C f80 = f105, f81, f32 + FMA_D f81 = f104, f81, f33 + FMA_C f82 = f105, f83, f34 + FMA_D f83 = f104, f83, f35 + ;; + FNMA f64 = f106, f80, f64 + FMA_A f65 = f107, f80, f65 + FNMA f66 = f106, f82, f66 + FMA_A f67 = f107, f82, f67 + ;; + FMA_B f64 = f107, f81, f64 + FNMA f65 = f106, f81, f65 + FMA_B f66 = f107, f83, f66 + FNMA f67 = f106, f83, f67 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + FMPY f34 = f120, f66 + FMPY f35 = f121, f66 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + FMA_C f66 = f121, f67, f34 + FMA_D f67 = f120, f67, f35 + ;; +#endif + +#if defined(LN) || defined(LT) + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f96, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f97, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f112, SIZE + ;; + STFD [BOFFSET] = f81, 5 * SIZE + STFD [BOFFSET2] = f113, 5 * SIZE + ;; + STFD [BOFFSET] = 
f66, SIZE + STFD [BOFFSET2] = f98, SIZE + ;; + STFD [BOFFSET] = f67, SIZE + STFD [BOFFSET2] = f99, SIZE + ;; + STFD [BOFFSET] = f82, SIZE + STFD [BOFFSET2] = f114, SIZE + ;; + STFD [BOFFSET] = f83, 5 * SIZE + STFD [BOFFSET2] = f115, 5 * SIZE + ;; + adds BOFFSET = - 16 * SIZE, BOFFSET + ;; +#else + adds AOFFSET2 = 4 * SIZE, AOFFSET + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f80, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f81, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f82, SIZE + ;; + STFD [AOFFSET] = f67, 5 * SIZE + STFD [AOFFSET2] = f83, 5 * SIZE + ;; + STFD [AOFFSET] = f96, SIZE + STFD [AOFFSET2] = f112, SIZE + ;; + STFD [AOFFSET] = f97, SIZE + STFD [AOFFSET2] = f113, SIZE + ;; + STFD [AOFFSET] = f98, SIZE + STFD [AOFFSET2] = f114, SIZE + ;; + STFD [AOFFSET] = f99, 5 * SIZE + STFD [AOFFSET2] = f115, 5 * SIZE + ;; + adds AOFFSET = - 16 * SIZE, AOFFSET + ;; +#endif + +#ifdef LN + adds C1 = -4 * SIZE, C1 + adds C2 = -4 * SIZE, C2 + adds C3 = -4 * SIZE, C3 + adds C4 = -4 * SIZE, C4 +#endif + ;; + STFD [C1 ] = f64, SIZE + ;; + STFD [C1 ] = f65, SIZE + ;; + STFD [C1 ] = f66, SIZE + ;; + STFD [C1 ] = f67, SIZE + ;; + STFD [C2 ] = f80, SIZE + ;; + STFD [C2 ] = f81, SIZE + ;; + STFD [C2 ] = f82, SIZE + ;; + STFD [C2 ] = f83, SIZE + ;; + + STFD [C3 ] = f96, SIZE + ;; + STFD [C3 ] = f97, SIZE + ;; + STFD [C3 ] = f98, SIZE + ;; + STFD [C3 ] = f99, SIZE + ;; + + STFD [C4 ] = f112, SIZE + ;; + STFD [C4 ] = f113, SIZE + ;; + STFD [C4 ] = f114, SIZE + ;; + STFD [C4 ] = f115, SIZE + ;; + mov f64 = f0 + mov f65 = f0 + mov f80 = f0 + mov f81 = f0 + mov f96 = f0 + mov f97 = f0 + mov f112 = f0 + mov f113 = f0 + ;; +#ifdef LN + adds C1 = -4 * SIZE, C1 + adds C2 = -4 * SIZE, C2 + adds C3 = -4 * SIZE, C3 + adds C4 = -4 * SIZE, C4 +#endif + ;; + cmp.ne p6, p0 = 1, I + ;; + adds I = -1, I + ;; + shladd r2 = K, ZBASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + shladd AORIG = r2, 1, AORIG +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = L, 1, AOFFSET + shladd BOFFSET = L, 2, BOFFSET +#endif + ;; +#ifdef LT + adds KK = 2, KK +#elif defined LN + adds KK = -2, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + .align 16 + +.L010x: +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + shr I = M, 2 + ;; + cmp.eq p6, p7 = 0, I + (p6) br.cond.dpnt .L049 + ;; + .align 16 + +.L011: + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 2 + ZBASE_SHIFT + } + { .mfi + shladd r3 = KK, ZBASE_SHIFT, r0 + mov f118 = f0 + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f66 = f0 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f67 = f0 + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 2, B + mov f66 = f0 +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f67 = f0 + shladd AOFFSET = r3, 2, AORIG + } + ;; +#endif + ;; + { .mfi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f82 = f0 + nop __LINE__ + } + { .mfi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f83 = f0 + adds PREC = CPREFETCHSIZE * SIZE, C1 + } + ;; + { .mfi + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + mov f98 = f0 + adds L = 1, L + } + { .mfi + (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + mov f99 = f0 + adds C5 = 4 * SIZE, C1 + } + ;; + { .mfi + (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + mov f114 
= f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + mov f115 = f0 + adds C6 = 4 * SIZE, C2 + } + ;; + { .mfi + (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + mov f68 = f0 + shr L = L, 1 + } + { .mfi + setf.d f86 = r0 + mov f69 = f0 + adds C7 = 4 * SIZE, C3 + } + ;; + { .mfi + CPREFETCH [PREC], LDC + mov f84 = f0 + adds L = -1, L + } + { .mfi + setf.d f87 = r0 + mov f85 = f0 + adds C8 = 4 * SIZE, C4 + } + ;; + { .mfi + CPREFETCH [PREC], LDC + mov f100 = f0 + mov ar.lc = L + } + { .mfi + setf.d f102 = r0 + mov f101 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + CPREFETCH [PREC], LDC + mov f116 = f0 + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + } + { .mfi + setf.d f103 = r0 + mov f117 = f0 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + } + ;; + { .mfi + CPREFETCH [PREC] + mov f70 = f0 + cmp.eq p6, p0 = -1, L + } + { .mfb + setf.d f119 = r0 + mov f71 = f0 + (p6) br.cond.dpnt .L018 + } + ;; + .align 16 + +.L012: +/* 1 */ + { .mfi + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p12) cmp.ne p3, p0 = 0, L + FMA_B f65 = f32, f49, f65 // A1 * B2 + nop __LINE__ + } + ;; +/* 2 */ + { .mfi + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + cmp.ne p4, p5 = 0, L + FMA_B f81 = f32, f51, f81 // A1 * B4 + nop __LINE__ + } + ;; +/* 3 */ + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + FMA_B f97 = f32, f53, f97 // A1 * B6 + nop __LINE__ + } + ;; +/* 4 */ + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + FMA_B f113 = f32, f55, f113 // A1 * B8 + nop __LINE__ + } + ;; +/* 5 */ + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + FMA_A f64 = f33, f49, f64 // A2 * B2 + nop __LINE__ + } + ;; +/* 6 */ + { .mfb + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + FMA_A f80 = f33, f51, f80 // A2 * B4 + nop __LINE__ + } + ;; +/* 7 */ + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + FMA_A f96 = f33, f53, f96 // A2 * B6 + nop __LINE__ + } + ;; +/* 8 */ + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + FMA_A f112 = f33, f55, f112 // A2 * B8 + nop __LINE__ + } + ;; +/* 9 */ + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + FMA_B f67 = f34, f49, f67 // A3 * B2 + nop __LINE__ + } + ;; +/* 10 */ + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + FMA_B f83 = f34, f51, f83 // A3 * B4 + nop __LINE__ + } + ;; +/* 11 */ + { .mfb + FMA f98 = f34, f52, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f99 = f34, f53, f99 // A3 * B6 + nop __LINE__ + } + ;; +/* 12 */ + { .mfb + FMA f114 = f34, f54, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f115 = f34, f55, f115 // A3 * B8 + nop __LINE__ + } + ;; +/* 13 */ + { .mfb + nop __LINE__ + FMA f67 = f35, f48, f67 // A4 * B1 + } + { .mfb + nop __LINE__ + FMA_A f66 = f35, f49, f66 // A4 * B2 + nop __LINE__ + } + ;; +/* 14 */ + { .mfb + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f82 = f35, f51, f82 // A4 * B4 + nop __LINE__ + } + ;; 
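// A hedged reading of the FMA / FMA_A / FMA_B pattern used throughout this
// inner loop (bundles 1-64), assuming the non-conjugated case: with a packed
// A element A = (f32 + i*f33), a packed B element B = (f48 + i*f49) and the
// complex accumulator C = (f64 + i*f65), the "A1*B1 .. A2*B2" bundles appear
// to compute the usual complex multiply-add
//   Re(C) += Re(A)*Re(B) - Im(A)*Im(B)   (FMA, then FMA_A in the A2*B2 slot)
//   Im(C) += Re(A)*Im(B) + Im(A)*Re(B)   (FMA_B in the A1*B2 slot, then FMA)
// FMA_A and FMA_B are macros whose FMA/FNMA choice depends on the LN/LT/RN/RT
// and conjugation settings, so the same instruction schedule is reused for the
// conjugated variants; the register pairing is inferred from the A1..A8 /
// B1..B8 comments above, not stated explicitly in the imported source.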
+/* 15 */ + { .mfb + FMA f99 = f35, f52, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f98 = f35, f53, f98 // A4 * B6 + nop __LINE__ + } + ;; +/* 16 */ + { .mfb + FMA f115 = f35, f54, f115 // A4 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f114 = f35, f55, f114 // A4 * B8 + nop __LINE__ + } + ;; +/* 17 */ + { .mfb + nop __LINE__ + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f69 = f36, f49, f69 // A5 * B2 + nop __LINE__ + } + ;; +/* 18 */ + { .mfb + nop __LINE__ + FMA f84 = f36, f50, f84 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f85 = f36, f51, f85 // A5 * B4 + nop __LINE__ + } + ;; +/* 19 */ + { .mfb + nop __LINE__ + FMA f100 = f36, f52, f100 // A5 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f101 = f36, f53, f101 // A5 * B6 + nop __LINE__ + } + ;; +/* 20 */ + { .mfb + nop __LINE__ + FMA f116 = f36, f54, f116 // A5 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f117 = f36, f55, f117 // A5 * B8 + nop __LINE__ + } + ;; +/* 21 */ + { .mfb + nop __LINE__ + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f68 = f37, f49, f68 // A6 * B2 + nop __LINE__ + } + ;; +/* 22 */ + { .mfb + nop __LINE__ + FMA f85 = f37, f50, f85 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f84 = f37, f51, f84 // A6 * B4 + nop __LINE__ + } + ;; +/* 23 */ + { .mfb + nop __LINE__ + FMA f101 = f37, f52, f101 // A6 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f100 = f37, f53, f100 // A6 * B6 + nop __LINE__ + } + ;; +/* 24 */ + { .mfb + nop __LINE__ + FMA f117 = f37, f54, f117 // A6 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f116 = f37, f55, f116 // A6 * B8 + nop __LINE__ + } + ;; +/* 25 */ + { .mfb + nop __LINE__ + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f71 = f38, f49, f71 // A7 * B2 + nop __LINE__ + } + ;; +/* 26 */ + { .mfb + nop __LINE__ + FMA f86 = f38, f50, f86 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f87 = f38, f51, f87 // A7 * B4 + nop __LINE__ + } + ;; +/* 27 */ + { .mfb + nop __LINE__ + FMA f102 = f38, f52, f102 // A7 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f103 = f38, f53, f103 // A7 * B6 + nop __LINE__ + } + ;; +/* 28 */ + { .mfb + nop __LINE__ + FMA f118 = f38, f54, f118 // A7 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f119 = f38, f55, f119 // A7 * B8 + nop __LINE__ + } + ;; +/* 29 */ + { .mfb + nop __LINE__ + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f70 = f39, f49, f70 // A8 * B2 + nop __LINE__ + } + ;; +/* 30 */ + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f87 = f39, f50, f87 // A8 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f86 = f39, f51, f86 // A8 * B4 + nop __LINE__ + } + ;; +/* 31 */ + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f103 = f39, f52, f103 // A8 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f102 = f39, f53, f102 // A8 * B6 + nop __LINE__ + } + ;; +/* 32 */ + { .mfb + nop __LINE__ + FMA f119 = f39, f54, f119 // A8 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f118 = f39, f55, f118 // A8 * B8 + nop __LINE__ + } + ;; +/* 33 */ + { .mfb + nop __LINE__ + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 + nop __LINE__ + } + ;; +/* 34 */ + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop 
__LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 + nop __LINE__ + } + ;; +/* 35 */ + { .mfb + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f97 = f40, f61, f97 // A1 * B6 + nop __LINE__ + } + ;; +/* 36 */ + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f113 = f40, f63, f113 // A1 * B8 + nop __LINE__ + } + ;; +/* 37 */ + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 + nop __LINE__ + } + ;; +/* 38 */ + { .mfb + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 + nop __LINE__ + } + ;; +/* 39 */ + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f96 = f41, f61, f96 // A2 * B6 + nop __LINE__ + } + ;; +/* 40 */ + { .mfb + nop __LINE__ + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f112 = f41, f63, f112 // A2 * B8 + nop __LINE__ + } + ;; + /* 41 */ + { .mfb + nop __LINE__ + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f67 = f42, f57, f67 // A3 * B2 + nop __LINE__ + } + ;; +/* 42 */ + { .mfb + nop __LINE__ + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f83 = f42, f59, f83 // A3 * B4 + nop __LINE__ + } + ;; +/* 43 */ + { .mfb + nop __LINE__ + (p3) FMA f98 = f42, f60, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f99 = f42, f61, f99 // A3 * B6 + nop __LINE__ + } + ;; +/* 44 */ + { .mfb + nop __LINE__ + (p3) FMA f114 = f42, f62, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f115 = f42, f63, f115 // A3 * B8 + nop __LINE__ + } + ;; +/* 45 */ + { .mfb + nop __LINE__ + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f66 = f43, f57, f66 // A4 * B2 + nop __LINE__ + } + ;; +/* 46 */ + { .mfb + nop __LINE__ + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f82 = f43, f59, f82 // A4 * B4 + nop __LINE__ + } + ;; +/* 47 */ + { .mfb + nop __LINE__ + (p3) FMA f99 = f43, f60, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f98 = f43, f61, f98 // A4 * B6 + nop __LINE__ + } + ;; +/* 48 */ + { .mfb + nop __LINE__ + (p3) FMA f115 = f43, f62, f115 // A4 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f114 = f43, f63, f114 // A4 * B8 + nop __LINE__ + } + ;; +/* 49 */ + { .mfb + nop __LINE__ + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f69 = f44, f57, f69 // A5 * B2 + nop __LINE__ + } + ;; +/* 50 */ + { .mfb + nop __LINE__ + (p3) FMA f84 = f44, f58, f84 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f85 = f44, f59, f85 // A5 * B4 + nop __LINE__ + } + ;; +/* 51 */ + { .mfb + nop __LINE__ + (p3) FMA f100 = f44, f60, f100 // A5 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f101 = f44, f61, f101 // A5 * B6 + nop __LINE__ + } + ;; +/* 52 */ + { .mfb + nop __LINE__ + (p3) FMA f116 = f44, f62, f116 // A5 * B7 + nop __LINE__ + } + { 
.mfb + nop __LINE__ + (p3) FMA_B f117 = f44, f63, f117 // A5 * B8 + nop __LINE__ + } + ;; +/* 53 */ + { .mfb + nop __LINE__ + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f68 = f45, f57, f68 // A6 * B2 + nop __LINE__ + } + ;; +/* 54 */ + { .mfb + nop __LINE__ + (p3) FMA f85 = f45, f58, f85 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f84 = f45, f59, f84 // A6 * B4 + nop __LINE__ + } + ;; +/* 55 */ + { .mfb + nop __LINE__ + (p3) FMA f101 = f45, f60, f101 // A6 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f100 = f45, f61, f100 // A6 * B6 + nop __LINE__ + } + ;; +/* 56 */ + { .mfb + nop __LINE__ + (p3) FMA f117 = f45, f62, f117 // A6 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f116 = f45, f63, f116 // A6 * B8 + nop __LINE__ + } + ;; +/* 57 */ + { .mfb + nop __LINE__ + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f71 = f46, f57, f71 // A7 * B2 + nop __LINE__ + } + ;; +/* 58 */ + { .mfb + nop __LINE__ + (p3) FMA f86 = f46, f58, f86 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f87 = f46, f59, f87 // A7 * B4 + nop __LINE__ + } + ;; +/* 59 */ + { .mfb + nop __LINE__ + (p3) FMA f102 = f46, f60, f102 // A7 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f103 = f46, f61, f103 // A7 * B6 + nop __LINE__ + } + ;; +/* 60 */ + { .mfb + nop __LINE__ + (p3) FMA f118 = f46, f62, f118 // A7 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f119 = f46, f63, f119 // A7 * B8 + nop __LINE__ + } + ;; +/* 61 */ + { .mfb + nop __LINE__ + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f70 = f47, f57, f70 // A8 * B2 + nop __LINE__ + } + ;; +/* 62 */ + { .mfb + nop __LINE__ + (p3) FMA f87 = f47, f58, f87 // A8 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f86 = f47, f59, f86 // A8 * B4 + nop __LINE__ + } + ;; +/* 63 */ + { .mfb + nop __LINE__ + (p3) FMA f103 = f47, f60, f103 // A8 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f102 = f47, f61, f102 // A8 * B6 + nop __LINE__ + } + ;; +/* 64 */ + { .mfi + nop __LINE__ + (p3) FMA f119 = f47, f62, f119 // A8 * B7 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA_A f118 = f47, f63, f118 // A8 * B8 + br.cloop.sptk.few .L012 + } + ;; + +.L018: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -4, KK +#else + adds r2 = -4, KK +#endif + ;; + shladd r2 = r2, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 2, AORIG + shladd BOFFSET = r2, 2, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET], 2 * SIZE + ;; + LDFPD f76, f77 = [BOFFSET], 2 * SIZE + ;; + LDFPD f78, f79 = [BOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [BOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [BOFFSET], 2 * SIZE + ;; + LDFPD f92, f93 = [BOFFSET], 2 * SIZE + ;; + { .mfi + LDFPD f94, f95 = [BOFFSET], 2 * SIZE + FSUB f64 = f72, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f65 = f73, f65 + nop __LINE__ + } + ;; + { .mfi + LDFPD f104, f105 = [BOFFSET], 2 * SIZE + FSUB f80 = f74, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f81 = f75, f81 + nop __LINE__ + } + ;; + { .mfi + LDFPD f106, f107 = [BOFFSET], 2 * SIZE + FSUB f96 = f76, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f97 = f77, f97 + nop __LINE__ + } + ;; + { .mfi + LDFPD f108, f109 = [BOFFSET], 2 * SIZE + FSUB f112 = f78, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f113 
= f79, f113 + nop __LINE__ + } + ;; + { .mfi + LDFPD f110, f111 = [BOFFSET], 2 * SIZE + FSUB f66 = f88, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f67 = f89, f67 + nop __LINE__ + } + ;; + { .mfi + LDFPD f120, f121 = [BOFFSET], 2 * SIZE + FSUB f82 = f90, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f83 = f91, f83 + nop __LINE__ + } + ;; + { .mfi + LDFPD f122, f123 = [BOFFSET], 2 * SIZE + FSUB f98 = f92, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f99 = f93, f99 + nop __LINE__ + } + ;; + { .mfi + LDFPD f124, f125 = [BOFFSET], 2 * SIZE + FSUB f114 = f94, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f115 = f95, f115 + nop __LINE__ + } + ;; + { .mfi + LDFPD f126, f127 = [BOFFSET] + FSUB f68 = f104, f68 + adds BOFFSET = -30 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FSUB_A f69 = f105, f69 +#ifdef LN + adds AOFFSET = 30 * SIZE, AOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + LDFPD f72, f73 = [AOFFSET] + FSUB f84 = f106, f84 +#ifdef LN + adds AOFFSET = - 2 * SIZE, AOFFSET +#else + adds AOFFSET = 2 * SIZE, AOFFSET +#endif + } + { .mfi + nop __LINE__ + FSUB_A f85 = f107, f85 + nop __LINE__ + } + ;; + { .mfi + LDFPD f74, f75 = [AOFFSET] + FSUB f100 = f108, f100 +#ifdef LN + adds AOFFSET = - 2 * SIZE, AOFFSET +#else + adds AOFFSET = 2 * SIZE, AOFFSET +#endif + } + { .mfi + nop __LINE__ + FSUB_A f101 = f109, f101 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f116 = f110, f116 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f117 = f111, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f70 = f120, f70 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f71 = f121, f71 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f86 = f122, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f87 = f123, f87 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f102 = f124, f102 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f103 = f125, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f118 = f126, f118 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f119 = f127, f119 + nop __LINE__ + } + ;; +#else + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET], 2 * SIZE + ;; + LDFPD f76, f77 = [AOFFSET], 2 * SIZE + ;; + LDFPD f78, f79 = [AOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [AOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [AOFFSET], 2 * SIZE + ;; + { .mfi + LDFPD f92, f93 = [AOFFSET], 2 * SIZE + FSUB f64 = f72, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f65 = f73, f65 + nop __LINE__ + } + ;; + { .mfi + LDFPD f94, f95 = [AOFFSET], 2 * SIZE + FSUB f66 = f74, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f67 = f75, f67 + nop __LINE__ + } + ;; + { .mfi + LDFPD f104, f105 = [AOFFSET], 2 * SIZE + FSUB f68 = f76, f68 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f69 = f77, f69 + nop __LINE__ + } + ;; + { .mfi + LDFPD f106, f107 = [AOFFSET], 2 * SIZE + FSUB f70 = f78, f70 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f71 = f79, f71 + nop __LINE__ + } + ;; + { .mfi + LDFPD f108, f109 = [AOFFSET], 2 * SIZE + FSUB f80 = f88, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f81 = f89, f81 + nop __LINE__ + } + ;; + { .mfi + LDFPD f110, f111 = [AOFFSET], 2 * SIZE + FSUB f82 = f90, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f83 = f91, f83 + nop __LINE__ + } + ;; + { .mfi + LDFPD f120, f121 = [AOFFSET], 2 * SIZE + FSUB f84 = f92, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f85 = f93, f85 + nop __LINE__ + } + ;; + { .mfi + LDFPD f122, f123 = [AOFFSET], 2 * 
SIZE + FSUB f86 = f94, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f87 = f95, f87 + nop __LINE__ + } + ;; + { .mfi + LDFPD f124, f125 = [AOFFSET], 2 * SIZE + FSUB f96 = f104, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f97 = f105, f97 + nop __LINE__ + } + ;; + { .mfi + LDFPD f126, f127 = [AOFFSET] + FSUB f98 = f106, f98 + adds AOFFSET = -30 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FSUB f99 = f107, f99 +#ifdef RT + adds BOFFSET = 30 * SIZE, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + LDFPD f72, f73 = [BOFFSET] + FSUB f100 = f108, f100 +#ifdef RN + adds BOFFSET = 2 * SIZE, BOFFSET +#else + adds BOFFSET = - 2 * SIZE, BOFFSET +#endif + } + { .mfi + nop __LINE__ + FSUB f101 = f109, f101 + nop __LINE__ + } + ;; + { .mfi + LDFPD f74, f75 = [BOFFSET] + FSUB f102 = f110, f102 +#ifdef RN + adds BOFFSET = 2 * SIZE, BOFFSET +#else + adds BOFFSET = - 2 * SIZE, BOFFSET +#endif + } + { .mfi + nop __LINE__ + FSUB f103 = f111, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f112 = f120, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f113 = f121, f113 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f114 = f122, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f115 = f123, f115 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f116 = f124, f116 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f117 = f125, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f118 = f126, f118 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f119 = f127, f119 + nop __LINE__ + } + ;; +#endif + +#ifdef LN + { .mfi + LDFPD f76, f77 = [AOFFSET] + FMPY f32 = f72, f70 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMPY f36 = f72, f102 + nop __LINE__ + } + ;; + { .mfi + LDFPD f78, f79 = [AOFFSET] + FMPY f33 = f73, f70 + adds AOFFSET = - 4 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMPY f37 = f73, f102 + nop __LINE__ + } + ;; + { .mfi + LDFPD f88, f89 = [AOFFSET] + FMPY f34 = f72, f86 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMPY f38 = f72, f118 + nop __LINE__ + } + ;; + { .mfi + LDFPD f90, f91 = [AOFFSET] + FMPY f35 = f73, f86 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMPY f39 = f73, f118 + nop __LINE__ + } + ;; + { .mfi + LDFPD f92, f93 = [AOFFSET] + FMA_C f70 = f73, f71, f32 + adds AOFFSET = - 6 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMA_C f102 = f73, f103, f36 + adds C1 = -2 * SIZE, C1 + } + ;; + { .mfi + LDFPD f104, f105 = [AOFFSET] + FMA_D f71 = f72, f71, f33 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMA_D f103 = f72, f103, f37 + adds C2 = -2 * SIZE, C2 + } + ;; + { .mfi + LDFPD f106, f107 = [AOFFSET] + FMA_C f86 = f73, f87, f34 + adds AOFFSET = - 8 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMA_C f118 = f73, f119, f38 + adds C3 = -2 * SIZE, C3 + } + ;; + { .mfi + LDFPD f120, f121 = [AOFFSET] + FMA_D f87 = f72, f87, f35 + adds BOFFSET2 = 28 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA_D f119 = f72, f119, f39 + adds BOFFSET = 24 * SIZE, BOFFSET + } + ;; + { .mfi + STFD [BOFFSET] = f70, SIZE + FNMA f68 = f74, f70, f68 + adds C4 = -2 * SIZE, C4 + } + { .mfi + STFD [BOFFSET2] = f102, SIZE + FNMA f100 = f74, f102, f100 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f71, SIZE + FMA_A f69 = f75, f70, f69 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f103, SIZE + FMA_A f101 = f75, f102, f101 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f86, SIZE + FNMA f84 = f74, f86, f84 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f118, 
SIZE + FNMA f116 = f74, f118, f116 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f87, -11 * SIZE + FMA_A f85 = f75, f86, f85 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f119, -11 * SIZE + FMA_A f117 = f75, f118, f117 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f70, SIZE + FMA_B f68 = f75, f71, f68 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f102, SIZE + FMA_B f100 = f75, f103, f100 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f71, -3 * SIZE + FNMA f69 = f74, f71, f69 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f103, -3 * SIZE + FNMA f101 = f74, f103, f101 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f86, SIZE + FMA_B f84 = f75, f87, f84 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f118, SIZE + FMA_B f116 = f75, f119, f116 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f87, -3 * SIZE + FNMA f85 = f74, f87, f85 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f119, -3 * SIZE + FNMA f117 = f74, f119, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f66 = f76, f70, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f98 = f76, f102, f98 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f67 = f77, f70, f67 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f99 = f77, f102, f99 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f82 = f76, f86, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f114 = f76, f118, f114 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f83 = f77, f86, f83 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f115 = f77, f118, f115 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f66 = f77, f71, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f98 = f77, f103, f98 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f67 = f76, f71, f67 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f99 = f76, f103, f99 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f82 = f77, f87, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f114 = f77, f119, f114 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f83 = f76, f87, f83 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f115 = f76, f119, f115 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f64 = f78, f70, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f96 = f78, f102, f96 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f65 = f79, f70, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f97 = f79, f102, f97 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f80 = f78, f86, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f112 = f78, f118, f112 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f81 = f79, f86, f81 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f113 = f79, f118, f113 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f64 = f79, f71, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f96 = f79, f103, f96 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f65 = f78, f71, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f97 = f78, f103, f97 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f80 = f79, f87, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f112 = f79, f119, f112 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f81 = f78, f87, f81 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f113 = f78, f119, f113 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f88, f68 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f88, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f89, f68 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 
= f89, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f88, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f88, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f89, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f89, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f68 = f89, f69, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f100 = f89, f101, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f69 = f88, f69, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f101 = f88, f101, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f84 = f89, f85, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f116 = f89, f117, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f85 = f88, f85, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f117 = f88, f117, f39 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f68, SIZE + FNMA f66 = f90, f68, f66 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f100, SIZE + FNMA f98 = f90, f100, f98 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f69, SIZE + FMA_A f67 = f91, f68, f67 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f101, SIZE + FMA_A f99 = f91, f100, f99 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f84, SIZE + FNMA f82 = f90, f84, f82 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f116, SIZE + FNMA f114 = f90, f116, f114 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f85, -11 * SIZE + FMA_A f83 = f91, f84, f83 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f117, -11 * SIZE + FMA_A f115 = f91, f116, f115 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f68, SIZE + FMA_B f66 = f91, f69, f66 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f100, SIZE + FMA_B f98 = f91, f101, f98 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f69, -3 * SIZE + FNMA f67 = f90, f69, f67 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f101, -3 * SIZE + FNMA f99 = f90, f101, f99 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f84, SIZE + FMA_B f82 = f91, f85, f82 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f116, SIZE + FMA_B f114 = f91, f117, f114 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f85, -3 * SIZE + FNMA f83 = f90, f85, f83 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f117, -3 * SIZE + FNMA f115 = f90, f117, f115 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f64 = f92, f68, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f96 = f92, f100, f96 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f65 = f93, f68, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f97 = f93, f100, f97 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f80 = f92, f84, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f112 = f92, f116, f112 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f81 = f93, f84, f81 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f113 = f93, f116, f113 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f64 = f93, f69, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f96 = f93, f101, f96 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f65 = f92, f69, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f97 = f92, f101, f97 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f80 = f93, f85, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f112 = f93, f117, f112 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f81 = f92, f85, f81 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f113 = f92, f117, f113 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f104, f66 
+ nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f104, f98 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f105, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f105, f98 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f104, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f104, f114 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f105, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f105, f114 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f66 = f105, f67, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f98 = f105, f99, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f67 = f104, f67, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f99 = f104, f99, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f82 = f105, f83, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f114 = f105, f115, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f83 = f104, f83, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f115 = f104, f115, f39 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f66, SIZE + FNMA f64 = f106, f66, f64 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f98, SIZE + FNMA f96 = f106, f98, f96 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f67, SIZE + FMA_A f65 = f107, f66, f65 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f99, SIZE + FMA_A f97 = f107, f98, f97 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f82, SIZE + FNMA f80 = f106, f82, f80 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f114, SIZE + FNMA f112 = f106, f114, f112 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f83, -11 * SIZE + FMA_A f81 = f107, f82, f81 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f115, -11 * SIZE + FMA_A f113 = f107, f114, f113 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f66, SIZE + FMA_B f64 = f107, f67, f64 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f98, SIZE + FMA_B f96 = f107, f99, f96 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f67, -3 * SIZE + FNMA f65 = f106, f67, f65 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f99, -3 * SIZE + FNMA f97 = f106, f99, f97 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f82, SIZE + FMA_B f80 = f107, f83, f80 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f114, SIZE + FMA_B f112 = f107, f115, f112 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f83, -3 * SIZE + FNMA f81 = f106, f83, f81 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f115, -3 * SIZE + FNMA f113 = f106, f115, f113 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f120, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f120, f96 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f121, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f121, f96 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f120, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f120, f112 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f121, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f121, f112 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f64 = f121, f65, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f96 = f121, f97, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f65 = f120, f65, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f97 = f120, f97, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f80 = f121, f81, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f112 = f121, f113, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ 
+ FMA_D f81 = f120, f81, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f113 = f120, f113, f39 + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f96, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f97, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f112, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f81, -3 * SIZE + STFD [BOFFSET2] = f113, -3 * SIZE + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f64, SIZE + mov f64 = f0 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f96, SIZE + mov f96 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f65, -1 * SIZE + mov f65 = f0 + adds KK = -4, KK + } + { .mfi + STFD [C3 ] = f97, -1 * SIZE + mov f97 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f80, SIZE + mov f80 = f0 + cmp.ne p6, p0 = 1, I + } + { .mfi + STFD [C4 ] = f112, SIZE + mov f112 = f0 + sub L = K, KK + } + ;; + { .mfi + STFD [C2 ] = f81, -1 * SIZE + mov f81 = f0 + adds I = -1, I + } + { .mfb + STFD [C4 ] = f113, -1 * SIZE + mov f113 = f0 + (p6) br.cond.dptk .L011 + } + ;; +#endif + +#ifdef LT + { .mfi + LDFPD f76, f77 = [AOFFSET], 2 * SIZE + FMPY f32 = f72, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f72, f96 + nop __LINE__ + } + ;; + { .mfi + LDFPD f78, f79 = [AOFFSET] + FMPY f33 = f73, f64 + adds AOFFSET = 4 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMPY f37 = f73, f96 + nop __LINE__ + } + ;; + { .mfi + LDFPD f90, f91 = [AOFFSET], 2 * SIZE + FMPY f34 = f72, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f72, f112 + nop __LINE__ + } + ;; + { .mfi + LDFPD f92, f93 = [AOFFSET], 2 * SIZE + FMPY f35 = f73, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f73, f112 + nop __LINE__ + } + ;; + { .mfi + LDFPD f94, f95 = [AOFFSET] + FMA_C f64 = f73, f65, f32 + adds AOFFSET = 6 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMA_C f96 = f73, f97, f36 + nop __LINE__ + } + ;; + { .mfi + LDFPD f108, f109 = [AOFFSET], 2 * SIZE + FMA_D f65 = f72, f65, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f97 = f72, f97, f37 + nop __LINE__ + } + ;; + { .mfi + LDFPD f110, f111 = [AOFFSET] + FMA_C f80 = f73, f81, f34 + adds AOFFSET = 8 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMA_C f112 = f73, f113, f38 + nop __LINE__ + } + ;; + { .mfi + LDFPD f126, f127 = [AOFFSET] + FMA_D f81 = f72, f81, f35 + adds AOFFSET = - 30 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMA_D f113 = f72, f113, f39 + adds BOFFSET2 = 4 * SIZE, BOFFSET + } + ;; + { .mfi + STFD [BOFFSET] = f64, SIZE + FNMA f66 = f74, f64, f66 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f96, SIZE + FNMA f98 = f74, f96, f98 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f65, SIZE + FMA_A f67 = f75, f64, f67 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f97, SIZE + FMA_A f99 = f75, f96, f99 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f80, SIZE + FNMA f82 = f74, f80, f82 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f112, SIZE + FNMA f114 = f74, f112, f114 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f81, 5 * SIZE + FMA_A f83 = f75, f80, f83 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f113, 5 * SIZE + FMA_A f115 = f75, f112, f115 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f64, SIZE + FMA_B f66 = f75, f65, f66 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f96, SIZE + FMA_B f98 = f75, f97, f98 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f65, SIZE + FNMA f67 = f74, f65, f67 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f97, SIZE + FNMA f99 = f74, 
f97, f99 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f80, SIZE + FMA_B f82 = f75, f81, f82 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f112, SIZE + FMA_B f114 = f75, f113, f114 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f81, SIZE + FNMA f83 = f74, f81, f83 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f113, SIZE + FNMA f115 = f74, f113, f115 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f68 = f76, f64, f68 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f100 = f76, f96, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f69 = f77, f64, f69 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f101 = f77, f96, f101 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f84 = f76, f80, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f116 = f76, f112, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f85 = f77, f80, f85 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f117 = f77, f112, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f68 = f77, f65, f68 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f100 = f77, f97, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f69 = f76, f65, f69 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f101 = f76, f97, f101 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f84 = f77, f81, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f116 = f77, f113, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f85 = f76, f81, f85 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f117 = f76, f113, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f70 = f78, f64, f70 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f102 = f78, f96, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f71 = f79, f64, f71 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f103 = f79, f96, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f86 = f78, f80, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f118 = f78, f112, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f87 = f79, f80, f87 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f119 = f79, f112, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f70 = f79, f65, f70 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f102 = f79, f97, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f71 = f78, f65, f71 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f103 = f78, f97, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f86 = f79, f81, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f118 = f79, f113, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f87 = f78, f81, f87 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f119 = f78, f113, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f90, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f90, f98 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f91, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f91, f98 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f90, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f90, f114 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f91, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f91, f114 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f66 = f91, f67, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f98 = f91, f99, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f67 = f90, f67, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f99 = f90, 
f99, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f82 = f91, f83, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f114 = f91, f115, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f83 = f90, f83, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f115 = f90, f115, f39 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f66, SIZE + FNMA f68 = f92, f66, f68 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f98, SIZE + FNMA f100 = f92, f98, f100 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f67, SIZE + FMA_A f69 = f93, f66, f69 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f99, SIZE + FMA_A f101 = f93, f98, f101 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f82, SIZE + FNMA f84 = f92, f82, f84 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f114, SIZE + FNMA f116 = f92, f114, f116 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f83, 5 * SIZE + FMA_A f85 = f93, f82, f85 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f115, 5 * SIZE + FMA_A f117 = f93, f114, f117 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f66, SIZE + FMA_B f68 = f93, f67, f68 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f98, SIZE + FMA_B f100 = f93, f99, f100 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f67, SIZE + FNMA f69 = f92, f67, f69 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f99, SIZE + FNMA f101 = f92, f99, f101 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f82, SIZE + FMA_B f84 = f93, f83, f84 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f114, SIZE + FMA_B f116 = f93, f115, f116 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f83, SIZE + FNMA f85 = f92, f83, f85 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f115, SIZE + FNMA f117 = f92, f115, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f70 = f94, f66, f70 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f102 = f94, f98, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f71 = f95, f66, f71 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f103 = f95, f98, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f86 = f94, f82, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f118 = f94, f114, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f87 = f95, f82, f87 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f119 = f95, f114, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f70 = f95, f67, f70 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f102 = f95, f99, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f71 = f94, f67, f71 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f103 = f94, f99, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f86 = f95, f83, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f118 = f95, f115, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f87 = f94, f83, f87 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f119 = f94, f115, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f108, f68 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f108, f100 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f33 = f109, f68 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f109, f100 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f34 = f108, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f108, f116 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f35 = f109, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f109, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f68 = f109, f69, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + 
FMA_C f100 = f109, f101, f36 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f69 = f108, f69, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f101 = f108, f101, f37 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f84 = f109, f85, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f116 = f109, f117, f38 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f85 = f108, f85, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f117 = f108, f117, f39 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f68, SIZE + FNMA f70 = f110, f68, f70 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f100, SIZE + FNMA f102 = f110, f100, f102 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f69, SIZE + FMA_A f71 = f111, f68, f71 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f101, SIZE + FMA_A f103 = f111, f100, f103 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f84, SIZE + FNMA f86 = f110, f84, f86 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f116, SIZE + FNMA f118 = f110, f116, f118 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f85, 5 * SIZE + FMA_A f87 = f111, f84, f87 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f117, 5 * SIZE + FMA_A f119 = f111, f116, f119 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f68, SIZE + FMA_B f70 = f111, f69, f70 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f100, SIZE + FMA_B f102 = f111, f101, f102 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f69, SIZE + FNMA f71 = f110, f69, f71 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f101, SIZE + FNMA f103 = f110, f101, f103 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f84, SIZE + FMA_B f86 = f111, f85, f86 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f116, SIZE + FMA_B f118 = f111, f117, f118 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f85, SIZE + FNMA f87 = f110, f85, f87 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f117, SIZE + FNMA f119 = f110, f117, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f126, f70 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f126, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f127, f70 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f127, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f126, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f126, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f127, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f127, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f70 = f127, f71, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f102 = f127, f103, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f71 = f126, f71, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f103 = f126, f103, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f86 = f127, f87, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f118 = f127, f119, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f87 = f126, f87, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f119 = f126, f119, f39 + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f70, SIZE + STFD [BOFFSET2] = f102, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f71, SIZE + STFD [BOFFSET2] = f103, SIZE + sub r2 = K, KK + } + ;; + { .mmi + STFD [BOFFSET] = f86, SIZE + STFD [BOFFSET2] = f118, SIZE + adds KK = 4, KK + } + ;; + { .mmi + STFD [BOFFSET] = f87, -27 * SIZE + STFD [BOFFSET2] = f119 + shladd r2 = r2, ZBASE_SHIFT, r0 + } + ;; + { .mfi + STFD [C1 ] = f70, SIZE + mov f64 = f0 + shladd AOFFSET = r2, 2, AOFFSET + } + { 
.mfi + STFD [C3 ] = f102, SIZE + mov f65 = f0 + shladd BOFFSET = r2, 2, BOFFSET + } + ;; + { .mfi + STFD [C1 ] = f71, SIZE + mov f80 = f0 + mov L = KK + } + { .mfi + STFD [C3 ] = f103, SIZE + mov f81 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f86, SIZE + mov f96 = f0 + cmp.ne p6, p0 = 1, I + } + { .mfi + STFD [C4 ] = f118, SIZE + mov f97 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f87, SIZE + mov f112 = f0 + adds I = -1, I + } + { .mfb + STFD [C4 ] = f119, SIZE + mov f113 = f0 + (p6) br.cond.dptk .L011 + } + ;; +#endif + +#ifdef RN + { .mfi + LDFPD f76, f77 = [BOFFSET], 2 * SIZE + FMPY f32 = f72, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f72, f68 + nop __LINE__ + } + ;; + { .mfi + LDFPD f78, f79 = [BOFFSET] + FMPY f33 = f73, f64 + adds BOFFSET = 4 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMPY f37 = f73, f68 + nop __LINE__ + } + ;; + { .mfi + LDFPD f90, f91 = [BOFFSET], 2 * SIZE + FMPY f34 = f72, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f72, f70 + nop __LINE__ + } + ;; + { .mfi + LDFPD f92, f93 = [BOFFSET], 2 * SIZE + FMPY f35 = f73, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f73, f70 + nop __LINE__ + } + ;; + { .mfi + LDFPD f94, f95 = [BOFFSET] + FMA_C f64 = f73, f65, f32 + adds BOFFSET = 6 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA_C f68 = f73, f69, f36 + nop __LINE__ + } + ;; + { .mfi + LDFPD f108, f109 = [BOFFSET], 2 * SIZE + FMA_D f65 = f72, f65, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f69 = f72, f69, f37 + nop __LINE__ + } + ;; + { .mfi + LDFPD f110, f111 = [BOFFSET] + FMA_C f66 = f73, f67, f34 + adds BOFFSET = 8 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA_C f70 = f73, f71, f38 + nop __LINE__ + } + ;; + { .mfi + LDFPD f126, f127 = [BOFFSET] + FMA_D f67 = f72, f67, f35 + adds BOFFSET = - 30 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA_D f71 = f72, f71, f39 + adds AOFFSET2 = 4 * SIZE, AOFFSET + } + ;; + { .mfi + STFD [AOFFSET] = f64, SIZE + FNMA f80 = f74, f64, f80 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f68, SIZE + FNMA f84 = f74, f68, f84 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f65, SIZE + FMA_A f81 = f75, f64, f81 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f69, SIZE + FMA_A f85 = f75, f68, f85 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f66, SIZE + FNMA f82 = f74, f66, f82 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f70, SIZE + FNMA f86 = f74, f70, f86 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f67, 5 * SIZE + FMA_A f83 = f75, f66, f83 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f71, 5 * SIZE + FMA_A f87 = f75, f70, f87 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f64, SIZE + FMA_B f80 = f75, f65, f80 + nop __LINE__ + } + { .mfi + STFD [C5 ] = f68, SIZE + FMA_B f84 = f75, f69, f84 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f65, SIZE + FNMA f81 = f74, f65, f81 + nop __LINE__ + } + { .mfi + STFD [C5 ] = f69, SIZE + FNMA f85 = f74, f69, f85 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f66, SIZE + FMA_B f82 = f75, f67, f82 + nop __LINE__ + } + { .mfi + STFD [C5 ] = f70, SIZE + FMA_B f86 = f75, f71, f86 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f67, 5 * SIZE + FNMA f83 = f74, f67, f83 + nop __LINE__ + } + { .mfi + STFD [C5 ] = f71, 5 * SIZE + FNMA f87 = f74, f71, f87 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f96 = f76, f64, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f100 = f76, f68, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f97 = f77, f64, f97 + nop __LINE__ + } + { .mfi 
+ nop __LINE__ + FMA_A f101 = f77, f68, f101 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f98 = f76, f66, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f102 = f76, f70, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f99 = f77, f66, f99 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f103 = f77, f70, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f96 = f77, f65, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f100 = f77, f69, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f97 = f76, f65, f97 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f101 = f76, f69, f101 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f98 = f77, f67, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f102 = f77, f71, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f99 = f76, f67, f99 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f103 = f76, f71, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f112 = f78, f64, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f116 = f78, f68, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f113 = f79, f64, f113 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f117 = f79, f68, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f114 = f78, f66, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f118 = f78, f70, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f115 = f79, f66, f115 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f119 = f79, f70, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f112 = f79, f65, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f116 = f79, f69, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f113 = f78, f65, f113 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f117 = f78, f69, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f114 = f79, f67, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f118 = f79, f71, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f115 = f78, f67, f115 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f119 = f78, f71, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f90, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f90, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f33 = f91, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f91, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f34 = f90, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f90, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f35 = f91, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f91, f86 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f80 = f91, f81, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f84 = f91, f85, f36 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f81 = f90, f81, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f85 = f90, f85, f37 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f82 = f91, f83, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f86 = f91, f87, f38 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f83 = f90, f83, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f87 = f90, f87, f39 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f80, SIZE + FNMA f96 = f92, f80, f96 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f84, SIZE + FNMA f100 = f92, f84, f100 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f81, SIZE + FMA_A f97 = f93, f80, f97 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f85, SIZE 
+ FMA_A f101 = f93, f84, f101 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f82, SIZE + FNMA f98 = f92, f82, f98 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f86, SIZE + FNMA f102 = f92, f86, f102 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f83, 5 * SIZE + FMA_A f99 = f93, f82, f99 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f87, 5 * SIZE + FMA_A f103 = f93, f86, f103 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f80, SIZE + FMA_B f96 = f93, f81, f96 + nop __LINE__ + } + { .mfi + STFD [C6 ] = f84, SIZE + FMA_B f100 = f93, f85, f100 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f81, SIZE + FNMA f97 = f92, f81, f97 + nop __LINE__ + } + { .mfi + STFD [C6 ] = f85, SIZE + FNMA f101 = f92, f85, f101 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f82, SIZE + FMA_B f98 = f93, f83, f98 + nop __LINE__ + } + { .mfi + STFD [C6 ] = f86, SIZE + FMA_B f102 = f93, f87, f102 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f83, 5 * SIZE + FNMA f99 = f92, f83, f99 + nop __LINE__ + } + { .mfi + STFD [C6 ] = f87, 5 * SIZE + FNMA f103 = f92, f87, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f112 = f94, f80, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f116 = f94, f84, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f113 = f95, f80, f113 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f117 = f95, f84, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f114 = f94, f82, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f118 = f94, f86, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f115 = f95, f82, f115 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f119 = f95, f86, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f112 = f95, f81, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f116 = f95, f85, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f113 = f94, f81, f113 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f117 = f94, f85, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f114 = f95, f83, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f118 = f95, f87, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f115 = f94, f83, f115 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f119 = f94, f87, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f108, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f108, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f109, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f109, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f108, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f108, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f109, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f109, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f96 = f109, f97, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f100 = f109, f101, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f97 = f108, f97, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f101 = f108, f101, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f98 = f109, f99, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f102 = f109, f103, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f99 = f108, f99, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f103 = f108, f103, f39 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f96, SIZE + FNMA f112 = f110, f96, f112 + nop __LINE__ + } + { .mfi + STFD 
[AOFFSET2] = f100, SIZE + FNMA f116 = f110, f100, f116 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f97, SIZE + FMA_A f113 = f111, f96, f113 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f101, SIZE + FMA_A f117 = f111, f100, f117 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f98, SIZE + FNMA f114 = f110, f98, f114 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f102, SIZE + FNMA f118 = f110, f102, f118 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f99, 5 * SIZE + FMA_A f115 = f111, f98, f115 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f103, 5 * SIZE + FMA_A f119 = f111, f102, f119 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f96, SIZE + FMA_B f112 = f111, f97, f112 + nop __LINE__ + } + { .mfi + STFD [C7 ] = f100, SIZE + FMA_B f116 = f111, f101, f116 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f97, SIZE + FNMA f113 = f110, f97, f113 + nop __LINE__ + } + { .mfi + STFD [C7 ] = f101, SIZE + FNMA f117 = f110, f101, f117 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f98, SIZE + FMA_B f114 = f111, f99, f114 + nop __LINE__ + } + { .mfi + STFD [C7 ] = f102, SIZE + FMA_B f118 = f111, f103, f118 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f99, 5 * SIZE + FNMA f115 = f110, f99, f115 + nop __LINE__ + } + { .mfi + STFD [C7 ] = f103, 5 * SIZE + FNMA f119 = f110, f103, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f126, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f126, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f127, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f127, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f126, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f126, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f127, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f127, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f112 = f127, f113, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f116 = f127, f117, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f113 = f126, f113, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f117 = f126, f117, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f114 = f127, f115, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f118 = f127, f119, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f115 = f126, f115, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f119 = f126, f119, f39 + nop __LINE__ + } + ;; + { .mmi + STFD [AOFFSET] = f112, SIZE + STFD [AOFFSET2] = f116, SIZE + sub r2 = K, KK + } + ;; + { .mmi + STFD [AOFFSET] = f113, SIZE + STFD [AOFFSET2] = f117, SIZE + mov L = KK + } + ;; + { .mmi + STFD [AOFFSET] = f114, SIZE + STFD [AOFFSET2] = f118, SIZE + shladd r2 = r2, ZBASE_SHIFT, r0 + } + ;; + { .mmi + STFD [AOFFSET] = f115, -27 * SIZE + STFD [AOFFSET2] = f119 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f112, SIZE + mov f64 = f0 + shladd BOFFSET = r2, 2, BOFFSET + } + { .mfi + STFD [C8 ] = f116, SIZE + mov f65 = f0 + shladd AOFFSET = r2, 2, AOFFSET + } + ;; + { .mfi + STFD [C4 ] = f113, SIZE + mov f80 = f0 + cmp.ne p6, p0 = 1, I + } + { .mfi + STFD [C8 ] = f117, SIZE + mov f81 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f114, SIZE + mov f96 = f0 + adds I = -1, I + } + { .mfi + STFD [C8 ] = f118, SIZE + mov f97 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f115, 5 * SIZE + mov f112 = f0 + nop __LINE__ + } + { .mfb + STFD [C8 ] = f119, 5 * SIZE + mov f113 = f0 + (p6) br.cond.dptk .L011 + } +#endif + 
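(For reference, the RN block above unrolls the scalar recurrence sketched below: each FMPY plus FMA_C/FMA_D group is a complex multiply by the pre-inverted diagonal entry, and each FNMA plus FMA_A/FMA_B group is a complex multiply-subtract of an off-diagonal term. This is only an illustrative sketch, not part of the patch; it assumes the non-conjugated definitions of the FMA_* macros selected elsewhere in this import, and the GotoBLAS convention that the reciprocal of each diagonal element is stored when the triangular panel is packed. The helper name is hypothetical.

    #include <complex.h>

    /* Solve x * U = c in place for one row of the register block, where U is
       the 4x4 upper-triangular complex block taken from B and U[j][j] holds
       1/u_jj (already inverted during packing).  On entry x[] holds the
       accumulated GEMM result; on exit it holds the solved row. */
    void ztrsm_rn_row4(const double complex U[4][4], double complex x[4])
    {
        for (int j = 0; j < 4; j++) {
            x[j] *= U[j][j];                 /* FMPY + FMA_C/FMA_D pair   */
            for (int k = j + 1; k < 4; k++)
                x[k] -= U[j][k] * x[j];      /* FNMA + FMA_A/FMA_B pairs  */
        }
    }

The assembly applies this recurrence to all four rows of the register block at once, pairing them across the two store streams, and writes each solved element both back to the packed buffer via AOFFSET/AOFFSET2 and out to C as soon as it is available.)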
+#ifdef RT + { .mfi + LDFPD f76, f77 = [BOFFSET] + FMPY f32 = f72, f112 + adds BOFFSET = - 2 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMPY f36 = f72, f116 + nop __LINE__ + } + ;; + { .mfi + LDFPD f78, f79 = [BOFFSET] + FMPY f33 = f73, f112 + adds BOFFSET = - 4 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMPY f37 = f73, f116 + nop __LINE__ + } + ;; + { .mfi + LDFPD f88, f89 = [BOFFSET] + FMPY f34 = f72, f114 + adds BOFFSET = - 2 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMPY f38 = f72, f118 + nop __LINE__ + } + ;; + { .mfi + LDFPD f90, f91 = [BOFFSET] + FMPY f35 = f73, f114 + adds BOFFSET = - 2 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMPY f39 = f73, f118 + nop __LINE__ + } + ;; + { .mfi + LDFPD f92, f93 = [BOFFSET] + FMA_C f112 = f73, f113, f32 + adds BOFFSET = - 6 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA_C f116 = f73, f117, f36 + nop __LINE__ + } + ;; + { .mfi + LDFPD f104, f105 = [BOFFSET] + FMA_D f113 = f72, f113, f33 + adds BOFFSET = - 2 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA_D f117 = f72, f117, f37 + nop __LINE__ + } + ;; + { .mfi + LDFPD f106, f107 = [BOFFSET] + FMA_C f114 = f73, f115, f34 + adds BOFFSET = - 8 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA_C f118 = f73, f119, f38 + nop __LINE__ + } + ;; + { .mfi + LDFPD f120, f121 = [BOFFSET] + FMA_D f115 = f72, f115, f35 + adds AOFFSET2 = 28 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMA_D f119 = f72, f119, f39 + adds AOFFSET = 24 * SIZE, AOFFSET + } + ;; + { .mfi + STFD [AOFFSET] = f112, SIZE + FNMA f96 = f74, f112, f96 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f116, SIZE + FNMA f100 = f74, f116, f100 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f113, SIZE + FMA_A f97 = f75, f112, f97 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f117, SIZE + FMA_A f101 = f75, f116, f101 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f114, SIZE + FNMA f98 = f74, f114, f98 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f118, SIZE + FNMA f102 = f74, f118, f102 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f115, -11 * SIZE + FMA_A f99 = f75, f114, f99 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f119, -11 * SIZE + FMA_A f103 = f75, f118, f103 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f112, SIZE + FMA_B f96 = f75, f113, f96 + nop __LINE__ + } + { .mfi + STFD [C8 ] = f116, SIZE + FMA_B f100 = f75, f117, f100 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f113, SIZE + FNMA f97 = f74, f113, f97 + nop __LINE__ + } + { .mfi + STFD [C8 ] = f117, SIZE + FNMA f101 = f74, f117, f101 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f114, SIZE + FMA_B f98 = f75, f115, f98 + nop __LINE__ + } + { .mfi + STFD [C8 ] = f118, SIZE + FMA_B f102 = f75, f119, f102 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f115, 5 * SIZE + FNMA f99 = f74, f115, f99 + nop __LINE__ + } + { .mfi + STFD [C8 ] = f119, 5 * SIZE + FNMA f103 = f74, f119, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f80 = f76, f112, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f84 = f76, f116, f84 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f81 = f77, f112, f81 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f85 = f77, f116, f85 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f82 = f76, f114, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f86 = f76, f118, f86 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f83 = f77, f114, f83 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f87 = f77, f118, f87 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f80 = f77, f113, f80 
+ nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f84 = f77, f117, f84 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f81 = f76, f113, f81 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f85 = f76, f117, f85 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f82 = f77, f115, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f86 = f77, f119, f86 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f83 = f76, f115, f83 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f87 = f76, f119, f87 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f64 = f78, f112, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f68 = f78, f116, f68 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f65 = f79, f112, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f69 = f79, f116, f69 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f66 = f78, f114, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f70 = f78, f118, f70 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f67 = f79, f114, f67 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f71 = f79, f118, f71 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f64 = f79, f113, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f68 = f79, f117, f68 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f65 = f78, f113, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f69 = f78, f117, f69 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f66 = f79, f115, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f70 = f79, f119, f70 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f67 = f78, f115, f67 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f71 = f78, f119, f71 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f88, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f88, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f89, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f89, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f88, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f88, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f89, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f89, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f96 = f89, f97, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f100 = f89, f101, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f97 = f88, f97, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f101 = f88, f101, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f98 = f89, f99, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f102 = f89, f103, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f99 = f88, f99, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f103 = f88, f103, f39 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f96, SIZE + FNMA f80 = f90, f96, f80 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f100, SIZE + FNMA f84 = f90, f100, f84 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f97, SIZE + FMA_A f81 = f91, f96, f81 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f101, SIZE + FMA_A f85 = f91, f100, f85 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f98, SIZE + FNMA f82 = f90, f98, f82 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f102, SIZE + FNMA f86 = f90, f102, f86 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f99, -11 * SIZE + FMA_A f83 = f91, f98, f83 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f103, -11 * SIZE + FMA_A f87 = f91, f102, 
f87 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f96, SIZE + FMA_B f80 = f91, f97, f80 + nop __LINE__ + } + { .mfi + STFD [C7 ] = f100, SIZE + FMA_B f84 = f91, f101, f84 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f97, SIZE + FNMA f81 = f90, f97, f81 + nop __LINE__ + } + { .mfi + STFD [C7 ] = f101, SIZE + FNMA f85 = f90, f101, f85 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f98, SIZE + FMA_B f82 = f91, f99, f82 + nop __LINE__ + } + { .mfi + STFD [C7 ] = f102, SIZE + FMA_B f86 = f91, f103, f86 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f99, 5 * SIZE + FNMA f83 = f90, f99, f83 + nop __LINE__ + } + { .mfi + STFD [C7 ] = f103, 5 * SIZE + FNMA f87 = f90, f103, f87 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f64 = f92, f96, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f68 = f92, f100, f68 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f65 = f93, f96, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f69 = f93, f100, f69 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f66 = f92, f98, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f70 = f92, f102, f70 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f67 = f93, f98, f67 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f71 = f93, f102, f71 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f64 = f93, f97, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f68 = f93, f101, f68 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f65 = f92, f97, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f69 = f92, f101, f69 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f66 = f93, f99, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f70 = f93, f103, f70 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f67 = f92, f99, f67 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f71 = f92, f103, f71 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f104, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f104, f84 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f105, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f105, f84 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f104, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f104, f86 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f105, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f105, f86 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f80 = f105, f81, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f84 = f105, f85, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f81 = f104, f81, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f85 = f104, f85, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f82 = f105, f83, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f86 = f105, f87, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f83 = f104, f83, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f87 = f104, f87, f39 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f80, SIZE + FNMA f64 = f106, f80, f64 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f84, SIZE + FNMA f68 = f106, f84, f68 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f81, SIZE + FMA_A f65 = f107, f80, f65 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f85, SIZE + FMA_A f69 = f107, f84, f69 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f82, SIZE + FNMA f66 = f106, f82, f66 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f86, SIZE + FNMA f70 = f106, f86, f70 + nop __LINE__ + } + 
;; + { .mfi + STFD [AOFFSET] = f83, -11 * SIZE + FMA_A f67 = f107, f82, f67 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f87, -11 * SIZE + FMA_A f71 = f107, f86, f71 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f80, SIZE + FMA_B f64 = f107, f81, f64 + nop __LINE__ + } + { .mfi + STFD [C6 ] = f84, SIZE + FMA_B f68 = f107, f85, f68 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f81, SIZE + FNMA f65 = f106, f81, f65 + nop __LINE__ + } + { .mfi + STFD [C6 ] = f85, SIZE + FNMA f69 = f106, f85, f69 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f82, SIZE + FMA_B f66 = f107, f83, f66 + nop __LINE__ + } + { .mfi + STFD [C6 ] = f86, SIZE + FMA_B f70 = f107, f87, f70 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f83, 5 * SIZE + FNMA f67 = f106, f83, f67 + nop __LINE__ + } + { .mfi + STFD [C6 ] = f87, 5 * SIZE + FNMA f71 = f106, f87, f71 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f120, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f120, f68 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f121, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f121, f68 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f120, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f120, f70 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f121, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f121, f70 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f64 = f121, f65, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f68 = f121, f69, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f65 = f120, f65, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f69 = f120, f69, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f66 = f121, f67, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f70 = f121, f71, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f67 = f120, f67, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f71 = f120, f71, f39 + nop __LINE__ + } + ;; + { .mmi + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f68, SIZE + shladd r2 = K, ZBASE_SHIFT, r0 + } + ;; + { .mmi + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f69, SIZE + shladd AORIG = r2, 2, AORIG + } + ;; + { .mmi + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f70, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [AOFFSET] = f67, -3 * SIZE + STFD [AOFFSET2] = f71 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f64, SIZE + mov f64 = f0 + cmp.ne p6, p0 = 1, I + } + { .mfi + STFD [C5 ] = f68, SIZE + mov f81 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f65, SIZE + mov f65 = f0 + nop __LINE__ + } + { .mfi + STFD [C5 ] = f69, SIZE + mov f96 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f66, SIZE + mov f80 = f0 + sub L = K, KK + } + { .mfi + STFD [C5 ] = f70, SIZE + mov f97 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f67, 5 * SIZE + mov f112 = f0 + adds I = -1, I + } + { .mfb + STFD [C5 ] = f71, 5 * SIZE + mov f113 = f0 + (p6) br.cond.dptk .L011 + } + ;; +#endif + +.L049: +#ifdef LN + shladd KK8 = K, ZBASE_SHIFT, r0 + ;; + shladd B = KK8, 2, B +#endif + +#if defined(LT) || defined(RN) + mov B = BOFFSET +#endif + +#ifdef RN + adds KK = 4, KK +#endif + +#ifdef RT + adds KK = -4, KK +#endif + ;; + + { .mmb + mov AOFFSET = A + cmp.lt p6, p0 = 0, J + (p6) br.cond.dptk .L010 + } + ;; + .align 16 + +.L050: + { .mib + tbit.z p6, p0 = N, 1 + (p6) br.cond.dpnt .L090 + } + ;; + +#ifdef RT + { .mmi + shladd r3 = LDC, 1, r0 + nop __LINE__ + shl r2 = K, 1 + ZBASE_SHIFT + } + ;; + { .mmi + sub B = B, 
r2 + sub C = C, r3 + nop __LINE__ + } + ;; +#endif + + mov C1 = C + add C2 = LDC, C + ;; +#ifdef LN + add KK = M, OFFSET +#elif defined LT + mov KK = OFFSET +#else + nop __LINE__ +#endif + ;; +#if defined(LN) || defined(RT) + mov AORIG = A +#else + mov AOFFSET = A +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + { .mib +#ifndef RT + shladd C = LDC, 1, C +#else + nop __LINE__ +#endif + } + ;; + +.L070: + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + tbit.z p6, p7 = M, 0 + (p6) br.cond.dptk .L060 + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, ZBASE_SHIFT + } + { .mmi + shladd r3 = KK, ZBASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 1, B +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + add AOFFSET = r3, AORIG + } + ;; +#endif + ;; + adds L = 1, L + ;; + { .mii + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + adds L = -1, L + } + ;; + { .mmi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + cmp.eq p3, p0 = r0, r0 + mov ar.lc = L + } + ;; + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L078 + ;; + .align 16 + +.L072: + { .mfb + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f96 = f32, f49, f96 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA f112 = f32, f51, f112 // A1 * B4 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + } + { .mfi + nop __LINE__ + FMA f97 = f33, f49, f97 // A2 * B2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f113 = f33, f51, f113 // A2 * B4 + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + (p3) FMA f96 = f40, f57, f96 // A1 * B2 + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + (p3) FMA f112 = f40, f59, f112 // A1 * B4 + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f97 = f41, f57, f97 // A2 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f113 = f41, f59, f113 // A2 * B4 + br.cloop.sptk.few .L072 + } + ;; + { .mfb + nop __LINE__ + FCALC_A f64 = f64, f97 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_A f80 = f80, f113 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_B f65 = f65, f96 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_B f81 = f81, f112 + nop __LINE__ + } + ;; +.L078: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -1, KK +#else + adds r2 = -2, KK +#endif + ;; + shladd r2 = r2, ZBASE_SHIFT, r0 + ;; + add AOFFSET = r2, AORIG + shladd BOFFSET = r2, 
1, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET] + adds BOFFSET = -2 * SIZE, BOFFSET + ;; + FSUB f64 = f72, f64 + FSUB_A f65 = f73, f65 + FSUB f80 = f74, f80 + FSUB_A f81 = f75, f81 + ;; +#else + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [AOFFSET] + adds AOFFSET = -2 * SIZE, AOFFSET + ;; + FSUB f64 = f72, f64 + FSUB f65 = f73, f65 + FSUB f80 = f88, f80 + FSUB f81 = f89, f81 + ;; +#endif + +#ifdef LN + LDFPD f120, f121 = [AOFFSET] + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + FMPY f34 = f120, f80 + FMPY f35 = f121, f80 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + FMA_C f80 = f121, f81, f34 + FMA_D f81 = f120, f81, f35 + ;; +#endif + +#ifdef LT + LDFPD f72, f73 = [AOFFSET] + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f34 = f72, f80 + FMPY f35 = f73, f80 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f80 = f73, f81, f34 + FMA_D f81 = f72, f81, f35 + ;; +#endif + +#ifdef RN + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f90, f91 = [BOFFSET] + adds BOFFSET = - 6 * SIZE, BOFFSET + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + ;; + FNMA f80 = f74, f64, f80 + FMA_A f81 = f75, f64, f81 + ;; + FMA_B f80 = f75, f65, f80 + FNMA f81 = f74, f65, f81 + + ;; + FMPY f32 = f90, f80 + FMPY f33 = f91, f80 + ;; + FMA_C f80 = f91, f81, f32 + FMA_D f81 = f90, f81, f33 + ;; +#endif + +#ifdef RT + adds BOFFSET = 6 * SIZE, BOFFSET + ;; + LDFPD f104, f105 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f106, f107 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f120, f121 = [BOFFSET] + ;; + FMPY f32 = f104, f80 + FMPY f33 = f105, f80 + ;; + FMA_C f80 = f105, f81, f32 + FMA_D f81 = f104, f81, f33 + ;; + FNMA f64 = f106, f80, f64 + FMA_A f65 = f107, f80, f65 + ;; + FMA_B f64 = f107, f81, f64 + FNMA f65 = f106, f81, f65 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + ;; +#endif + +#if defined(LN) || defined(LT) + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + ;; + STFD [BOFFSET] = f81, SIZE + ;; + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; +#else + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + ;; + STFD [AOFFSET] = f80, SIZE + ;; + STFD [AOFFSET] = f81, SIZE + ;; + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; +#endif + +#ifdef LN + adds C1 = -2 * SIZE, C1 + adds C2 = -2 * SIZE, C2 +#endif + ;; + STFD [C1 ] = f64, SIZE + ;; + STFD [C1 ] = f65, SIZE + ;; + STFD [C2 ] = f80, SIZE + ;; + STFD [C2 ] = f81, SIZE + ;; + mov f64 = f0 + mov f65 = f0 + mov f80 = f0 + mov f81 = f0 + mov f96 = f0 + mov f97 = f0 + mov f112 = f0 + mov f113 = f0 + ;; +#ifdef LN + adds C1 = -2 * SIZE, C1 + adds C2 = -2 * SIZE, C2 +#endif + ;; + cmp.ne p6, p0 = 1, I + ;; + adds I = -1, I + ;; + shladd r2 = K, ZBASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + add AORIG = r2, AORIG +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, ZBASE_SHIFT, r0 + ;; + add AOFFSET = L, AOFFSET + shladd BOFFSET = L, 1, BOFFSET +#endif + ;; +#ifdef LT + adds KK = 1, KK +#elif defined LN + adds KK = -1, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + .align 16 + +.L060: + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK 
+#endif + tbit.z p6, p7 = M, 1 + (p6) br.cond.dptk .L051 + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 1 + ZBASE_SHIFT + } + { .mmi + shladd r3 = KK, ZBASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 1, B +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + shladd AOFFSET = r3, 1, AORIG + } + ;; +#endif + ;; + adds L = 1, L + ;; + { .mmi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + tbit.z p12, p0 = L, 0 + } + { .mmi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + shr L = L, 1 + } + ;; + { .mmi + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + nop __LINE__ + adds L = -1, L + } + ;; + { .mmi + nop __LINE__ + nop __LINE__ + mov ar.lc = L + } + ;; + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L068 + ;; + .align 16 + +.L062: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA_B f65 = f32, f49, f65 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfb + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f81 = f32, f51, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f34, f48, f96 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f97 = f34, f49, f97 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f112 = f34, f50, f112 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f113 = f34, f51, f113 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f64 = f33, f49, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f80 = f33, f51, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f97 = f35, f48, f97 // A4 * B1 + } + { .mfb + FMA_A f96 = f35, f49, f96 // A4 * B2 + nop __LINE__ + } + + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f113 = f35, f50, f113 // A4 * B3 + nop __LINE__ + } + { .mfb + FMA_A f112 = f35, f51, f112 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f96 = f42, f56, f96 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f97 = f42, f57, f97 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f112 = f42, f58, f112 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f113 = f42, f59, f113 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { 
.mfb + nop __LINE__ + (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f97 = f43, f56, f97 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f96 = f43, f57, f96 // A4 * B2 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f113 = f43, f58, f113 // A4 * B3 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA_A f112 = f43, f59, f112 // A4 * B4 + br.cloop.sptk.few .L062 + } + ;; +.L068: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -2, KK +#else + adds r2 = -2, KK +#endif + ;; + shladd r2 = r2, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 1, AORIG + shladd BOFFSET = r2, 1, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [BOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [BOFFSET] + adds BOFFSET = -6 * SIZE, BOFFSET + ;; + FSUB f64 = f72, f64 + FSUB_A f65 = f73, f65 + FSUB f80 = f74, f80 + FSUB_A f81 = f75, f81 + FSUB f96 = f88, f96 + FSUB_A f97 = f89, f97 + FSUB f112 = f90, f112 + FSUB_A f113 = f91, f113 + ;; +#else + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [AOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [AOFFSET] + adds AOFFSET = -6 * SIZE, AOFFSET + ;; + FSUB f64 = f72, f64 + FSUB f65 = f73, f65 + FSUB f96 = f74, f96 + FSUB f97 = f75, f97 + + FSUB f80 = f88, f80 + FSUB f81 = f89, f81 + FSUB f112 = f90, f112 + FSUB f113 = f91, f113 + ;; +#endif + +#ifdef LN + adds AOFFSET = 6 * SIZE, AOFFSET + ;; + LDFPD f104, f105 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f106, f107 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f120, f121 = [AOFFSET] + ;; + FMPY f32 = f104, f96 + FMPY f33 = f105, f96 + FMPY f34 = f104, f112 + FMPY f35 = f105, f112 + ;; + FMA_C f96 = f105, f97, f32 + FMA_D f97 = f104, f97, f33 + FMA_C f112 = f105, f113, f34 + FMA_D f113 = f104, f113, f35 + ;; + FNMA f64 = f106, f96, f64 + FMA_A f65 = f107, f96, f65 + FNMA f80 = f106, f112, f80 + FMA_A f81 = f107, f112, f81 + ;; + FMA_B f64 = f107, f97, f64 + FNMA f65 = f106, f97, f65 + FMA_B f80 = f107, f113, f80 + FNMA f81 = f106, f113, f81 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + FMPY f34 = f120, f80 + FMPY f35 = f121, f80 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + FMA_C f80 = f121, f81, f34 + FMA_D f81 = f120, f81, f35 + ;; +#endif + +#ifdef LT + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f90, f91 = [AOFFSET] + adds AOFFSET = - 6 * SIZE, AOFFSET + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f34 = f72, f80 + FMPY f35 = f73, f80 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f80 = f73, f81, f34 + FMA_D f81 = f72, f81, f35 + ;; + FNMA f96 = f74, f64, f96 + FMA_A f97 = f75, f64, f97 + FNMA f112 = f74, f80, f112 + FMA_A f113 = f75, f80, f113 + ;; + FMA_B f96 = f75, f65, f96 + FNMA f97 = f74, f65, f97 + FMA_B f112 = f75, f81, f112 + FNMA f113 = f74, f81, f113 + ;; + FMPY f32 = f90, f96 + FMPY f33 = f91, f96 + FMPY f34 = f90, f112 + FMPY f35 = f91, f112 + ;; + FMA_C f96 = f91, f97, f32 + FMA_D f97 = f90, f97, f33 + FMA_C f112 = f91, f113, f34 + FMA_D f113 = f90, f113, f35 + ;; +#endif + +#ifdef RN + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f90, f91 = [BOFFSET] + adds BOFFSET = - 6 * SIZE, BOFFSET + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + 
FMPY f34 = f72, f96 + FMPY f35 = f73, f96 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f96 = f73, f97, f34 + FMA_D f97 = f72, f97, f35 + ;; + FNMA f80 = f74, f64, f80 + FMA_A f81 = f75, f64, f81 + FNMA f112 = f74, f96, f112 + FMA_A f113 = f75, f96, f113 + ;; + FMA_B f80 = f75, f65, f80 + FNMA f81 = f74, f65, f81 + FMA_B f112 = f75, f97, f112 + FNMA f113 = f74, f97, f113 + + ;; + FMPY f32 = f90, f80 + FMPY f33 = f91, f80 + FMPY f34 = f90, f112 + FMPY f35 = f91, f112 + ;; + FMA_C f80 = f91, f81, f32 + FMA_D f81 = f90, f81, f33 + FMA_C f112 = f91, f113, f34 + FMA_D f113 = f90, f113, f35 + ;; +#endif + +#ifdef RT + adds BOFFSET = 6 * SIZE, BOFFSET + ;; + LDFPD f104, f105 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f106, f107 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f120, f121 = [BOFFSET] + ;; + FMPY f32 = f104, f80 + FMPY f33 = f105, f80 + FMPY f34 = f104, f112 + FMPY f35 = f105, f112 + ;; + FMA_C f80 = f105, f81, f32 + FMA_D f81 = f104, f81, f33 + FMA_C f112 = f105, f113, f34 + FMA_D f113 = f104, f113, f35 + ;; + FNMA f64 = f106, f80, f64 + FMA_A f65 = f107, f80, f65 + FNMA f96 = f106, f112, f96 + FMA_A f97 = f107, f112, f97 + ;; + FMA_B f64 = f107, f81, f64 + FNMA f65 = f106, f81, f65 + FMA_B f96 = f107, f113, f96 + FNMA f97 = f106, f113, f97 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + FMPY f34 = f120, f96 + FMPY f35 = f121, f96 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + FMA_C f96 = f121, f97, f34 + FMA_D f97 = f120, f97, f35 + ;; +#endif + +#if defined(LN) || defined(LT) + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f96, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f97, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f112, SIZE + ;; + STFD [BOFFSET] = f81, 5 * SIZE + STFD [BOFFSET2] = f113, 5 * SIZE + ;; + adds BOFFSET = - 8 * SIZE, BOFFSET + ;; +#else + adds AOFFSET2 = 4 * SIZE, AOFFSET + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f80, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f81, SIZE + ;; + STFD [AOFFSET] = f96, SIZE + STFD [AOFFSET2] = f112, SIZE + ;; + STFD [AOFFSET] = f97, 5 * SIZE + STFD [AOFFSET2] = f113, 5 * SIZE + ;; + adds AOFFSET = - 8 * SIZE, AOFFSET + ;; +#endif + +#ifdef LN + adds C1 = -4 * SIZE, C1 + adds C2 = -4 * SIZE, C2 +#endif + ;; + STFD [C1 ] = f64, SIZE + ;; + STFD [C1 ] = f65, SIZE + ;; + STFD [C1 ] = f96, SIZE + ;; + STFD [C1 ] = f97, SIZE + ;; + STFD [C2 ] = f80, SIZE + ;; + STFD [C2 ] = f81, SIZE + ;; + STFD [C2 ] = f112, SIZE + ;; + STFD [C2 ] = f113, SIZE + ;; + mov f64 = f0 + mov f65 = f0 + mov f80 = f0 + mov f81 = f0 + mov f96 = f0 + mov f97 = f0 + mov f112 = f0 + mov f113 = f0 + ;; +#ifdef LN + adds C1 = -4 * SIZE, C1 + adds C2 = -4 * SIZE, C2 +#endif + ;; + cmp.ne p6, p0 = 1, I + ;; + adds I = -1, I + ;; + shladd r2 = K, ZBASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + shladd AORIG = r2, 1, AORIG +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = L, 1, AOFFSET + shladd BOFFSET = L, 1, BOFFSET +#endif + ;; +#ifdef LT + adds KK = 2, KK +#elif defined LN + adds KK = -2, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + .align 16 + +.L051: + shr I = M, 2 + ;; + cmp.eq p6, p7 = 0, I + (p6) br.cond.dpnt .L089 + ;; + .align 16 + +.L052: + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 2 + ZBASE_SHIFT + } + { .mmi + shladd r3 = 
KK, ZBASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f66 = f0 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f67 = f0 + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 1, B + mov f66 = f0 +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f67 = f0 + shladd AOFFSET = r3, 2, AORIG + } + ;; +#endif + + { .mfi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f82 = f0 + adds PREC = CPREFETCHSIZE * SIZE, C1 + } + { .mfi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f83 = f0 + nop __LINE__ + } + ;; + { .mfi + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + mov f98 = f0 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + } + { .mfi + cmp.eq p3, p0 = r0, r0 + mov f99 = f0 + adds L = 1, L + } + ;; + { .mfi + (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + mov f114 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + CPREFETCH [PREC], LDC + mov f115 = f0 + shr L = L, 1 + } + ;; + { .mmi + (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + adds C5 = 4 * SIZE, C1 + adds L = -1, L + } + ;; + { .mmi + CPREFETCH [PREC], LDC + adds C6 = 4 * SIZE, C2 + mov ar.lc = L + } + ;; + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L058 + ;; + .align 16 + +.L053: + { .mfb + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f65 = f32, f49, f65 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA_B f81 = f32, f51, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f96 = f34, f48, f96 // A3 * B1 + nop __LINE__ + } + { .mfi + FMA_B f97 = f34, f49, f97 // A3 * B2 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f112 = f34, f50, f112 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f113 = f34, f51, f113 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f64 = f33, f49, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f80 = f33, f51, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f97 = f35, f48, f97 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f96 = f35, f49, f96 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f113 = f35, f50, f113 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f112 = f35, f51, f112 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f66 = f36, f48, f66 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f67 = f36, f49, f67 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f82 = f36, f50, f82 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f83 = f36, f51, f83 // A5 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f98 = f38, f48, f98 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f99 = f38, f49, f99 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f114 = f38, f50, f114 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f115 = f38, f51, f115 // A7 * B4 + nop __LINE__ + } + ;; + 
{ .mfb + nop __LINE__ + FMA f67 = f37, f48, f67 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f66 = f37, f49, f66 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f83 = f37, f50, f83 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f82 = f37, f51, f82 // A6 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f99 = f39, f48, f99 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f98 = f39, f49, f98 // A8 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f115 = f39, f50, f115 // A8 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f114 = f39, f51, f114 // A8 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f96 = f42, f56, f96 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f97 = f42, f57, f97 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f112 = f42, f58, f112 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f113 = f42, f59, f113 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f97 = f43, f56, f97 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f96 = f43, f57, f96 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f113 = f43, f58, f113 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f112 = f43, f59, f112 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f66 = f44, f56, f66 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f67 = f44, f57, f67 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f82 = f44, f58, f82 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f83 = f44, f59, f83 // A5 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f98 = f46, f56, f98 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f99 = f46, f57, f99 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f114 = f46, f58, f114 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f115 = f46, f59, f115 // A7 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f67 = f45, f56, f67 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f66 = f45, f57, f66 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f83 = f45, f58, f83 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f82 = f45, f59, f82 // A6 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f99 = f47, f56, f99 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f98 = f47, f57, f98 // A8 * B2 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f115 = 
f47, f58, f115 // A8 * B3 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA_A f114 = f47, f59, f114 // A8 * B4 + br.cloop.sptk.few .L053 + } + ;; +.L058: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -4, KK +#else + adds r2 = -2, KK +#endif + ;; + shladd r2 = r2, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 2, AORIG + shladd BOFFSET = r2, 1, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [BOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [BOFFSET], 2 * SIZE + ;; + LDFPD f104, f105 = [BOFFSET], 2 * SIZE + ;; + LDFPD f106, f107 = [BOFFSET], 2 * SIZE + ;; + LDFPD f120, f121 = [BOFFSET], 2 * SIZE + ;; + LDFPD f122, f123 = [BOFFSET] + adds BOFFSET = -14 * SIZE, BOFFSET + ;; + FSUB f64 = f72, f64 + FSUB_A f65 = f73, f65 + FSUB f80 = f74, f80 + FSUB_A f81 = f75, f81 + FSUB f96 = f88, f96 + FSUB_A f97 = f89, f97 + FSUB f112 = f90, f112 + FSUB_A f113 = f91, f113 + + FSUB f66 = f104, f66 + FSUB_A f67 = f105, f67 + FSUB f82 = f106, f82 + FSUB_A f83 = f107, f83 + FSUB f98 = f120, f98 + FSUB_A f99 = f121, f99 + FSUB f114 = f122, f114 + FSUB_A f115 = f123, f115 + ;; +#else + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET], 2 * SIZE + ;; + LDFPD f76, f77 = [AOFFSET], 2 * SIZE + ;; + LDFPD f78, f79 = [AOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [AOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [AOFFSET], 2 * SIZE + ;; + LDFPD f92, f93 = [AOFFSET], 2 * SIZE + ;; + LDFPD f94, f95 = [AOFFSET] + adds AOFFSET = -14 * SIZE, AOFFSET + ;; + FSUB f64 = f72, f64 + FSUB f65 = f73, f65 + FSUB f96 = f74, f96 + FSUB f97 = f75, f97 + + FSUB f66 = f76, f66 + FSUB f67 = f77, f67 + FSUB f98 = f78, f98 + FSUB f99 = f79, f99 + + FSUB f80 = f88, f80 + FSUB f81 = f89, f81 + FSUB f112 = f90, f112 + FSUB f113 = f91, f113 + + FSUB f82 = f92, f82 + FSUB f83 = f93, f83 + FSUB f114 = f94, f114 + FSUB f115 = f95, f115 + ;; +#endif + +#ifdef LN + adds AOFFSET = 30 * SIZE, AOFFSET + ;; + LDFPD f72, f73 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f74, f75 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f76, f77 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f78, f79 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f88, f89 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f90, f91 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f92, f93 = [AOFFSET] + adds AOFFSET = - 6 * SIZE, AOFFSET + ;; + LDFPD f104, f105 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f106, f107 = [AOFFSET] + adds AOFFSET = - 8 * SIZE, AOFFSET + ;; + LDFPD f120, f121 = [AOFFSET] + ;; + FMPY f32 = f72, f98 + FMPY f33 = f73, f98 + FMPY f34 = f72, f114 + FMPY f35 = f73, f114 + ;; + FMA_C f98 = f73, f99, f32 + FMA_D f99 = f72, f99, f33 + FMA_C f114 = f73, f115, f34 + FMA_D f115 = f72, f115, f35 + ;; + FNMA f66 = f74, f98, f66 + FMA_A f67 = f75, f98, f67 + FNMA f82 = f74, f114, f82 + FMA_A f83 = f75, f114, f83 + ;; + FMA_B f66 = f75, f99, f66 + FNMA f67 = f74, f99, f67 + FMA_B f82 = f75, f115, f82 + FNMA f83 = f74, f115, f83 + ;; + FNMA f96 = f76, f98, f96 + FMA_A f97 = f77, f98, f97 + FNMA f112 = f76, f114, f112 + FMA_A f113 = f77, f114, f113 + ;; + FMA_B f96 = f77, f99, f96 + FNMA f97 = f76, f99, f97 + FMA_B f112 = f77, f115, f112 + FNMA f113 = f76, f115, f113 + ;; + FNMA f64 = f78, f98, f64 + FMA_A f65 = f79, f98, f65 + FNMA f80 = f78, f114, f80 + FMA_A f81 = f79, f114, f81 + ;; + FMA_B f64 = f79, f99, f64 + FNMA f65 = f78, f99, f65 + FMA_B 
f80 = f79, f115, f80 + FNMA f81 = f78, f115, f81 + ;; + FMPY f32 = f88, f66 + FMPY f33 = f89, f66 + FMPY f34 = f88, f82 + FMPY f35 = f89, f82 + ;; + FMA_C f66 = f89, f67, f32 + FMA_D f67 = f88, f67, f33 + FMA_C f82 = f89, f83, f34 + FMA_D f83 = f88, f83, f35 + ;; + FNMA f96 = f90, f66, f96 + FMA_A f97 = f91, f66, f97 + FNMA f112 = f90, f82, f112 + FMA_A f113 = f91, f82, f113 + ;; + FMA_B f96 = f91, f67, f96 + FNMA f97 = f90, f67, f97 + FMA_B f112 = f91, f83, f112 + FNMA f113 = f90, f83, f113 + ;; + FNMA f64 = f92, f66, f64 + FMA_A f65 = f93, f66, f65 + FNMA f80 = f92, f82, f80 + FMA_A f81 = f93, f82, f81 + ;; + FMA_B f64 = f93, f67, f64 + FNMA f65 = f92, f67, f65 + FMA_B f80 = f93, f83, f80 + FNMA f81 = f92, f83, f81 + ;; + FMPY f32 = f104, f96 + FMPY f33 = f105, f96 + FMPY f34 = f104, f112 + FMPY f35 = f105, f112 + ;; + FMA_C f96 = f105, f97, f32 + FMA_D f97 = f104, f97, f33 + FMA_C f112 = f105, f113, f34 + FMA_D f113 = f104, f113, f35 + ;; + FNMA f64 = f106, f96, f64 + FMA_A f65 = f107, f96, f65 + FNMA f80 = f106, f112, f80 + FMA_A f81 = f107, f112, f81 + ;; + FMA_B f64 = f107, f97, f64 + FNMA f65 = f106, f97, f65 + FMA_B f80 = f107, f113, f80 + FNMA f81 = f106, f113, f81 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + FMPY f34 = f120, f80 + FMPY f35 = f121, f80 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + FMA_C f80 = f121, f81, f34 + FMA_D f81 = f120, f81, f35 + ;; +#endif + +#ifdef LT + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET], 2 * SIZE + ;; + LDFPD f76, f77 = [AOFFSET], 2 * SIZE + ;; + LDFPD f78, f79 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f90, f91 = [AOFFSET], 2 * SIZE + ;; + LDFPD f92, f93 = [AOFFSET], 2 * SIZE + ;; + LDFPD f94, f95 = [AOFFSET] + adds AOFFSET = 6 * SIZE, AOFFSET + ;; + LDFPD f108, f109 = [AOFFSET], 2 * SIZE + ;; + LDFPD f110, f111 = [AOFFSET] + adds AOFFSET = 8 * SIZE, AOFFSET + ;; + LDFPD f126, f127 = [AOFFSET] + adds AOFFSET = - 30 * SIZE, AOFFSET + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f34 = f72, f80 + FMPY f35 = f73, f80 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f80 = f73, f81, f34 + FMA_D f81 = f72, f81, f35 + ;; + FNMA f96 = f74, f64, f96 + FMA_A f97 = f75, f64, f97 + FNMA f112 = f74, f80, f112 + FMA_A f113 = f75, f80, f113 + ;; + FMA_B f96 = f75, f65, f96 + FNMA f97 = f74, f65, f97 + FMA_B f112 = f75, f81, f112 + FNMA f113 = f74, f81, f113 + ;; + FNMA f66 = f76, f64, f66 + FMA_A f67 = f77, f64, f67 + FNMA f82 = f76, f80, f82 + FMA_A f83 = f77, f80, f83 + ;; + FMA_B f66 = f77, f65, f66 + FNMA f67 = f76, f65, f67 + FMA_B f82 = f77, f81, f82 + FNMA f83 = f76, f81, f83 + ;; + FNMA f98 = f78, f64, f98 + FMA_A f99 = f79, f64, f99 + FNMA f114 = f78, f80, f114 + FMA_A f115 = f79, f80, f115 + ;; + FMA_B f98 = f79, f65, f98 + FNMA f99 = f78, f65, f99 + FMA_B f114 = f79, f81, f114 + FNMA f115 = f78, f81, f115 + ;; + FMPY f32 = f90, f96 + FMPY f33 = f91, f96 + FMPY f34 = f90, f112 + FMPY f35 = f91, f112 + ;; + FMA_C f96 = f91, f97, f32 + FMA_D f97 = f90, f97, f33 + FMA_C f112 = f91, f113, f34 + FMA_D f113 = f90, f113, f35 + ;; + FNMA f66 = f92, f96, f66 + FMA_A f67 = f93, f96, f67 + FNMA f82 = f92, f112, f82 + FMA_A f83 = f93, f112, f83 + ;; + FMA_B f66 = f93, f97, f66 + FNMA f67 = f92, f97, f67 + FMA_B f82 = f93, f113, f82 + FNMA f83 = f92, f113, f83 + ;; + FNMA f98 = f94, f96, f98 + FMA_A f99 = f95, f96, f99 + FNMA f114 = f94, f112, f114 + FMA_A f115 = f95, f112, f115 + ;; + FMA_B f98 = f95, f97, f98 + FNMA f99 = f94, f97, f99 + FMA_B f114 = f95, f113, 
f114 + FNMA f115 = f94, f113, f115 + ;; + FMPY f32 = f108, f66 + FMPY f33 = f109, f66 + FMPY f34 = f108, f82 + FMPY f35 = f109, f82 + ;; + FMA_C f66 = f109, f67, f32 + FMA_D f67 = f108, f67, f33 + FMA_C f82 = f109, f83, f34 + FMA_D f83 = f108, f83, f35 + ;; + FNMA f98 = f110, f66, f98 + FMA_A f99 = f111, f66, f99 + FNMA f114 = f110, f82, f114 + FMA_A f115 = f111, f82, f115 + ;; + FMA_B f98 = f111, f67, f98 + FNMA f99 = f110, f67, f99 + FMA_B f114 = f111, f83, f114 + FNMA f115 = f110, f83, f115 + ;; + FMPY f32 = f126, f98 + FMPY f33 = f127, f98 + FMPY f34 = f126, f114 + FMPY f35 = f127, f114 + ;; + FMA_C f98 = f127, f99, f32 + FMA_D f99 = f126, f99, f33 + FMA_C f114 = f127, f115, f34 + FMA_D f115 = f126, f115, f35 + ;; +#endif + +#ifdef RN + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f90, f91 = [BOFFSET] + adds BOFFSET = - 6 * SIZE, BOFFSET + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f34 = f72, f96 + FMPY f35 = f73, f96 + FMPY f36 = f72, f66 + FMPY f37 = f73, f66 + FMPY f38 = f72, f98 + FMPY f39 = f73, f98 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f96 = f73, f97, f34 + FMA_D f97 = f72, f97, f35 + FMA_C f66 = f73, f67, f36 + FMA_D f67 = f72, f67, f37 + FMA_C f98 = f73, f99, f38 + FMA_D f99 = f72, f99, f39 + ;; + FNMA f80 = f74, f64, f80 + FMA_A f81 = f75, f64, f81 + FNMA f112 = f74, f96, f112 + FMA_A f113 = f75, f96, f113 + FNMA f82 = f74, f66, f82 + FMA_A f83 = f75, f66, f83 + FNMA f114 = f74, f98, f114 + FMA_A f115 = f75, f98, f115 + ;; + FMA_B f80 = f75, f65, f80 + FNMA f81 = f74, f65, f81 + FMA_B f112 = f75, f97, f112 + FNMA f113 = f74, f97, f113 + FMA_B f82 = f75, f67, f82 + FNMA f83 = f74, f67, f83 + FMA_B f114 = f75, f99, f114 + FNMA f115 = f74, f99, f115 + ;; + FMPY f32 = f90, f80 + FMPY f33 = f91, f80 + FMPY f34 = f90, f112 + FMPY f35 = f91, f112 + FMPY f36 = f90, f82 + FMPY f37 = f91, f82 + FMPY f38 = f90, f114 + FMPY f39 = f91, f114 + ;; + FMA_C f80 = f91, f81, f32 + FMA_D f81 = f90, f81, f33 + FMA_C f112 = f91, f113, f34 + FMA_D f113 = f90, f113, f35 + FMA_C f82 = f91, f83, f36 + FMA_D f83 = f90, f83, f37 + FMA_C f114 = f91, f115, f38 + FMA_D f115 = f90, f115, f39 + ;; +#endif + +#ifdef RT + adds BOFFSET = 6 * SIZE, BOFFSET + ;; + LDFPD f104, f105 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f106, f107 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f120, f121 = [BOFFSET] + ;; + FMPY f32 = f104, f80 + FMPY f33 = f105, f80 + FMPY f34 = f104, f112 + FMPY f35 = f105, f112 + FMPY f36 = f104, f82 + FMPY f37 = f105, f82 + FMPY f38 = f104, f114 + FMPY f39 = f105, f114 + ;; + FMA_C f80 = f105, f81, f32 + FMA_D f81 = f104, f81, f33 + FMA_C f112 = f105, f113, f34 + FMA_D f113 = f104, f113, f35 + FMA_C f82 = f105, f83, f36 + FMA_D f83 = f104, f83, f37 + FMA_C f114 = f105, f115, f38 + FMA_D f115 = f104, f115, f39 + ;; + FNMA f64 = f106, f80, f64 + FMA_A f65 = f107, f80, f65 + FNMA f96 = f106, f112, f96 + FMA_A f97 = f107, f112, f97 + FNMA f66 = f106, f82, f66 + FMA_A f67 = f107, f82, f67 + FNMA f98 = f106, f114, f98 + FMA_A f99 = f107, f114, f99 + ;; + FMA_B f64 = f107, f81, f64 + FNMA f65 = f106, f81, f65 + FMA_B f96 = f107, f113, f96 + FNMA f97 = f106, f113, f97 + FMA_B f66 = f107, f83, f66 + FNMA f67 = f106, f83, f67 + FMA_B f98 = f107, f115, f98 + FNMA f99 = f106, f115, f99 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + FMPY f34 = f120, f96 + FMPY f35 = f121, f96 + FMPY f36 = f120, f66 + FMPY f37 = f121, f66 + FMPY f38 = f120, f98 + FMPY 
f39 = f121, f98 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + FMA_C f96 = f121, f97, f34 + FMA_D f97 = f120, f97, f35 + FMA_C f66 = f121, f67, f36 + FMA_D f67 = f120, f67, f37 + FMA_C f98 = f121, f99, f38 + FMA_D f99 = f120, f99, f39 + ;; +#endif + +#if defined(LN) || defined(LT) + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f96, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f97, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f112, SIZE + ;; + STFD [BOFFSET] = f81, 5 * SIZE + STFD [BOFFSET2] = f113, 5 * SIZE + ;; + STFD [BOFFSET] = f66, SIZE + STFD [BOFFSET2] = f98, SIZE + ;; + STFD [BOFFSET] = f67, SIZE + STFD [BOFFSET2] = f99, SIZE + ;; + STFD [BOFFSET] = f82, SIZE + STFD [BOFFSET2] = f114, SIZE + ;; + STFD [BOFFSET] = f83, 5 * SIZE + STFD [BOFFSET2] = f115, 5 * SIZE + ;; + adds BOFFSET = - 16 * SIZE, BOFFSET + ;; +#else + adds AOFFSET2 = 4 * SIZE, AOFFSET + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f66, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f67, SIZE + ;; + STFD [AOFFSET] = f96, SIZE + STFD [AOFFSET2] = f98, SIZE + ;; + STFD [AOFFSET] = f97, 5 * SIZE + STFD [AOFFSET2] = f99, 5 * SIZE + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f82, SIZE + ;; + STFD [AOFFSET] = f81, SIZE + STFD [AOFFSET2] = f83, SIZE + ;; + STFD [AOFFSET] = f112, SIZE + STFD [AOFFSET2] = f114, SIZE + ;; + STFD [AOFFSET] = f113, 5 * SIZE + STFD [AOFFSET2] = f115, 5 * SIZE + ;; + adds AOFFSET = - 16 * SIZE, AOFFSET + ;; +#endif + +#ifdef LN + adds C1 = -8 * SIZE, C1 + adds C2 = -8 * SIZE, C2 + adds C5 = -8 * SIZE, C5 + adds C6 = -8 * SIZE, C6 +#endif + ;; + STFD [C1 ] = f64, SIZE + STFD [C5 ] = f66, SIZE + ;; + STFD [C1 ] = f65, SIZE + STFD [C5 ] = f67, SIZE + ;; + STFD [C1 ] = f96, SIZE + STFD [C5 ] = f98, SIZE + ;; + STFD [C1 ] = f97, 5 * SIZE + STFD [C5 ] = f99, 5 * SIZE + ;; + STFD [C2 ] = f80, SIZE + STFD [C6 ] = f82, SIZE + ;; + STFD [C2 ] = f81, SIZE + STFD [C6 ] = f83, SIZE + ;; + STFD [C2 ] = f112, SIZE + STFD [C6 ] = f114, SIZE + ;; + STFD [C2 ] = f113, 5 * SIZE + STFD [C6 ] = f115, 5 * SIZE + ;; + mov f64 = f0 + mov f65 = f0 + mov f80 = f0 + mov f81 = f0 + mov f96 = f0 + mov f97 = f0 + mov f112 = f0 + mov f113 = f0 + ;; +#ifdef LN + adds C1 = -8 * SIZE, C1 + adds C2 = -8 * SIZE, C2 + adds C5 = -8 * SIZE, C5 + adds C6 = -8 * SIZE, C6 +#endif + ;; + cmp.ne p6, p0 = 1, I + ;; + adds I = -1, I + ;; + shladd r2 = K, ZBASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + shladd AORIG = r2, 2, AORIG +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = L, 2, AOFFSET + shladd BOFFSET = L, 1, BOFFSET +#endif + ;; +#ifdef LT + adds KK = 4, KK +#elif defined LN + adds KK = -4, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + (p6) br.cond.dptk .L052 + ;; + .align 16 + + +.L089: +#ifdef LN + shladd KK8 = K, ZBASE_SHIFT, r0 + ;; + shladd B = KK8, 1, B +#endif + +#if defined(LT) || defined(RN) + mov B = BOFFSET +#endif + +#ifdef RN + adds KK = 2, KK +#endif + +#ifdef RT + adds KK = -2, KK +#endif + ;; + { .mmi + mov AOFFSET = A + nop __LINE__ + } + ;; + .align 16 + +.L090: + tbit.z p6, p0 = N, 0 + (p6) br.cond.dpnt .L999 + ;; +#ifdef RT + { .mmi + shl r2 = K, ZBASE_SHIFT + } + ;; + { .mmi + sub B = B, r2 + sub C = C, LDC + nop __LINE__ + } + ;; +#endif + mov C1 = C + +#ifdef LN + add KK = M, OFFSET +#elif defined LT + mov KK = OFFSET +#else + nop __LINE__ +#endif + ;; +#if 
defined(LN) || defined(RT) + mov AORIG = A +#else + mov AOFFSET = A +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + { .mib +#ifndef RT + add C = LDC, C +#else + nop __LINE__ +#endif + } + ;; + +.L110: + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + tbit.z p6, p7 = M, 0 + (p6) br.cond.dptk .L100 + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, ZBASE_SHIFT + } + { .mmi + shladd r3 = KK, ZBASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f66 = f0 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f67 = f0 + } + ;; +#else + { .mfi + add BOFFSET = r3, B + mov f66 = f0 +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f67 = f0 + add AOFFSET = r3, AORIG + } + ;; +#endif + ;; + adds L = 1, L + ;; + { .mii + nop __LINE__ + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + cmp.eq p3, p0 = r0, r0 + adds L = -1, L + } + ;; + { .mmi + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + mov ar.lc = L + } + ;; + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L118 + ;; + .align 16 + +.L112: + { .mfi + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + lfetch.nt1 [PREB], 4 * SIZE + FMA f80 = f32, f49, f80 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mmf + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f81 = f33, f49, f81 // A2 * B2 + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f80 = f40, f57, f80 // A1 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + adds L = -1, L + } + { .mfb + (p3) FMA f81 = f41, f57, f81 // A2 * B2 + br.cloop.sptk.few .L112 + } + ;; + { .mfb + nop __LINE__ + FCALC_A f64 = f64, f81 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_B f65 = f65, f80 + nop __LINE__ + } + ;; +.L118: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -1, KK +#else + adds r2 = -1, KK +#endif + ;; + shladd r2 = r2, ZBASE_SHIFT, r0 + ;; + add AOFFSET = r2, AORIG + add BOFFSET = r2, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f72, f73 = [BOFFSET] + ;; + FSUB f64 = f72, f64 + FSUB_A f65 = f73, f65 + ;; +#else + LDFPD f72, f73 = [AOFFSET] + ;; + FSUB f64 = f72, f64 + FSUB f65 = f73, f65 + ;; +#endif + +#ifdef LN + LDFPD f120, f121 = [AOFFSET] + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + ;; +#endif + +#ifdef LT + LDFPD f72, f73 = [AOFFSET] + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + ;; +#endif + +#ifdef RN + LDFPD f72, f73 = [BOFFSET] + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + ;; +#endif + +#ifdef RT + LDFPD f72, f73 = [BOFFSET] + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + ;; +#endif + +#if defined(LN) || defined(LT) + STFD [BOFFSET] = f64, SIZE + ;; + STFD 
[BOFFSET] = f65, SIZE + ;; + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; +#else + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + ;; + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; +#endif + +#ifdef LN + adds C1 = -2 * SIZE, C1 +#endif + ;; + STFD [C1 ] = f64, SIZE + ;; + STFD [C1 ] = f65, SIZE + ;; + mov f64 = f0 + mov f65 = f0 + mov f80 = f0 + mov f81 = f0 + ;; +#ifdef LN + adds C1 = -2 * SIZE, C1 +#endif + ;; + cmp.ne p6, p0 = 1, I + ;; + adds I = -1, I + ;; + shladd r2 = K, ZBASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + add AORIG = r2, AORIG +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, ZBASE_SHIFT, r0 + ;; + add AOFFSET = L, AOFFSET + add BOFFSET = L, BOFFSET +#endif + ;; +#ifdef LT + adds KK = 1, KK +#elif defined LN + adds KK = -1, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + .align 16 + +.L100: + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + tbit.z p6, p7 = M, 1 + (p6) br.cond.dptk .L091 + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 1 + ZBASE_SHIFT + } + { .mmi + shladd r3 = KK, ZBASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f66 = f0 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f67 = f0 + } + ;; +#else + { .mfi + add BOFFSET = r3, B + mov f66 = f0 +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f67 = f0 + shladd AOFFSET = r3, 1, AORIG + } + ;; +#endif + ;; + adds L = 1, L + ;; + { .mii + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + nop __LINE__ + adds L = -1, L + } + ;; + { .mmi + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + mov ar.lc = L + } + ;; + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L108 + ;; + .align 16 + +.L102: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + FMA f80 = f32, f49, f80 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfb + lfetch.nt1 [PREB], 4 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f81 = f33, f49, f81 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f34, f48, f96 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f112 = f34, f49, f112 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f97 = f35, f48, f97 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f113 = f35, f49, f113 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f80 = f40, f57, f80 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f81 = f41, f57, f81 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f42, f56, f96 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f112 = f42, f57, f112 // A3 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) 
FMA f97 = f43, f56, f97 // A4 * B1 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f113 = f43, f57, f113 // A4 * B2 + br.cloop.sptk.few .L102 + } + ;; + { .mfb + nop __LINE__ + FCALC_A f64 = f64, f81 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_B f65 = f65, f80 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_A f96 = f96, f113 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_B f97 = f97, f112 + nop __LINE__ + } + ;; +.L108: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -2, KK +#else + adds r2 = -1, KK +#endif + ;; + shladd r2 = r2, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 1, AORIG + add BOFFSET = r2, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [BOFFSET] + adds BOFFSET = -2 * SIZE, BOFFSET + ;; + FSUB f64 = f72, f64 + FSUB_A f65 = f73, f65 + FSUB f96 = f88, f96 + FSUB_A f97 = f89, f97 + ;; +#else + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [AOFFSET] + adds AOFFSET = -2 * SIZE, AOFFSET + ;; + FSUB f64 = f72, f64 + FSUB f65 = f73, f65 + FSUB f96 = f88, f96 + FSUB f97 = f89, f97 + ;; +#endif + +#ifdef LN + adds AOFFSET = 6 * SIZE, AOFFSET + ;; + LDFPD f104, f105 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f106, f107 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f120, f121 = [AOFFSET] + ;; + FMPY f32 = f104, f96 + FMPY f33 = f105, f96 + ;; + FMA_C f96 = f105, f97, f32 + FMA_D f97 = f104, f97, f33 + ;; + FNMA f64 = f106, f96, f64 + FMA_A f65 = f107, f96, f65 + ;; + FMA_B f64 = f107, f97, f64 + FNMA f65 = f106, f97, f65 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + ;; +#endif + +#ifdef LT + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f90, f91 = [AOFFSET] + adds AOFFSET = - 6 * SIZE, AOFFSET + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + ;; + FNMA f96 = f74, f64, f96 + FMA_A f97 = f75, f64, f97 + ;; + FMA_B f96 = f75, f65, f96 + FNMA f97 = f74, f65, f97 + ;; + FMPY f32 = f90, f96 + FMPY f33 = f91, f96 + ;; + FMA_C f96 = f91, f97, f32 + FMA_D f97 = f90, f97, f33 + ;; +#endif + +#ifdef RN + LDFPD f72, f73 = [BOFFSET] + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f36 = f72, f96 + FMPY f37 = f73, f96 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f96 = f73, f97, f36 + FMA_D f97 = f72, f97, f37 + ;; +#endif + +#ifdef RT + LDFPD f72, f73 = [BOFFSET] + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f36 = f72, f96 + FMPY f37 = f73, f96 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f96 = f73, f97, f36 + FMA_D f97 = f72, f97, f37 + ;; +#endif + +#if defined(LN) || defined(LT) + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + ;; + STFD [BOFFSET] = f96, SIZE + ;; + STFD [BOFFSET] = f97, SIZE + ;; + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; +#else + adds AOFFSET2 = 4 * SIZE, AOFFSET + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + ;; + STFD [AOFFSET] = f96, SIZE + ;; + STFD [AOFFSET] = f97, SIZE + ;; + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; +#endif + +#ifdef LN + adds C1 = -4 * SIZE, C1 + adds C5 = -4 * SIZE, C5 +#endif + ;; + STFD [C1 ] = f64, SIZE + ;; + STFD [C1 ] = f65, SIZE + ;; + STFD [C1 ] = f96, SIZE + ;; + STFD [C1 ] = f97, SIZE + ;; + mov f64 = f0 + mov f65 = f0 + mov f80 = f0 + mov f81 = f0 + mov f96 = f0 + mov f97 = f0 + mov f112 = f0 + mov f113 = 
f0 + ;; +#ifdef LN + adds C1 = -4 * SIZE, C1 + adds C5 = -4 * SIZE, C5 +#endif + ;; + cmp.ne p6, p0 = 1, I + ;; + adds I = -1, I + ;; + shladd r2 = K, ZBASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + shladd AORIG = r2, 1, AORIG +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = L, 1, AOFFSET + add BOFFSET = L, BOFFSET +#endif + ;; +#ifdef LT + adds KK = 2, KK +#elif defined LN + adds KK = -2, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + .align 16 + +.L091: + shr I = M, 2 + ;; + cmp.eq p6, p7 = 0, I + (p6) br.cond.dpnt .L119 + ;; + .align 16 + +.L092: + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 2 + ZBASE_SHIFT + } + { .mmi + shladd r3 = KK, ZBASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f66 = f0 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f67 = f0 + } + ;; +#else + { .mfi + add BOFFSET = r3, B + mov f66 = f0 +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f67 = f0 + shladd AOFFSET = r3, 2, AORIG + } + ;; +#endif + ;; + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + adds L = 1, L + ;; + { .mfi + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + } + { .mfi + adds PREC = CPREFETCHSIZE * SIZE, C1 + shr L = L, 1 + } + ;; + { .mfi + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + adds L = -1, L + } + { .mmf + (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + CPREFETCH [PREC] + } + ;; + { .mfi + (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + mov ar.lc = L + } + { .mmi + adds C5 = 4 * SIZE, C1 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L098 + ;; + .align 16 + +.L093: +/* 1 */ + { .mfi + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA_B f65 = f32, f49, f65 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 4 * SIZE + FMA f80 = f34, f48, f80 // A3 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f81 = f34, f49, f81 // A3 * B2 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f36, f48, f96 // A5 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f97 = f36, f49, f97 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f112 = f38, f48, f112 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f113 = f38, f49, f113 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f64 = f33, f49, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f81 = f35, f48, f81 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f80 = f35, f49, f80 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f97 = f37, f48, f97 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f96 = f37, f49, f96 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f113 = f39, f48, f113 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f112 = f39, f49, f112 // A8 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = 
[BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f42, f56, f80 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f81 = f42, f57, f81 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f96 = f44, f56, f96 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f97 = f44, f57, f97 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f112 = f46, f56, f112 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f113 = f46, f57, f113 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f81 = f43, f56, f81 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f80 = f43, f57, f80 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f97 = f45, f56, f97 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f96 = f45, f57, f96 // A6 * B2 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f113 = f47, f56, f113 // A8 * B1 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA_A f112 = f47, f57, f112 // A8 * B2 + br.cloop.sptk.few .L093 + } + ;; +.L098: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -4, KK +#else + adds r2 = -1, KK +#endif + ;; + shladd r2 = r2, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 2, AORIG + add BOFFSET = r2, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [BOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [BOFFSET] + adds BOFFSET = -6 * SIZE, BOFFSET + ;; + FSUB f64 = f72, f64 + FSUB_A f65 = f73, f65 + FSUB f80 = f74, f80 + FSUB_A f81 = f75, f81 + FSUB f96 = f88, f96 + FSUB_A f97 = f89, f97 + FSUB f112 = f90, f112 + FSUB_A f113 = f91, f113 + ;; +#else + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [AOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [AOFFSET] + adds AOFFSET = -6 * SIZE, AOFFSET + ;; + FSUB f64 = f72, f64 + FSUB f65 = f73, f65 + FSUB f80 = f74, f80 + FSUB f81 = f75, f81 + FSUB f96 = f88, f96 + FSUB f97 = f89, f97 + FSUB f112 = f90, f112 + FSUB f113 = f91, f113 + ;; +#endif + +#ifdef LN + adds AOFFSET = 30 * SIZE, AOFFSET + ;; + LDFPD f72, f73 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f74, f75 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f76, f77 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f78, f79 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f88, f89 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f90, f91 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f92, f93 = [AOFFSET] + adds AOFFSET = - 6 * SIZE, AOFFSET + ;; + LDFPD f104, f105 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f106, f107 = [AOFFSET] + adds AOFFSET = - 8 * SIZE, AOFFSET + ;; + LDFPD f120, f121 = [AOFFSET] + ;; + FMPY f32 = f72, f112 + FMPY f33 = f73, f112 + ;; + FMA_C f112 = f73, f113, f32 + FMA_D f113 = f72, f113, f33 + ;; + FNMA f96 = f74, f112, f96 + FMA_A f97 = f75, f112, f97 + FNMA f80 = f76, f112, f80 + FMA_A f81 = f77, f112, f81 + FNMA f64 = f78, f112, 
f64 + FMA_A f65 = f79, f112, f65 + ;; + FMA_B f96 = f75, f113, f96 + FNMA f97 = f74, f113, f97 + FMA_B f80 = f77, f113, f80 + FNMA f81 = f76, f113, f81 + FMA_B f64 = f79, f113, f64 + FNMA f65 = f78, f113, f65 + ;; + FMPY f32 = f88, f96 + FMPY f33 = f89, f96 + ;; + FMA_C f96 = f89, f97, f32 + FMA_D f97 = f88, f97, f33 + ;; + FNMA f80 = f90, f96, f80 + FMA_A f81 = f91, f96, f81 + FNMA f64 = f92, f96, f64 + FMA_A f65 = f93, f96, f65 + ;; + FMA_B f80 = f91, f97, f80 + FNMA f81 = f90, f97, f81 + FMA_B f64 = f93, f97, f64 + FNMA f65 = f92, f97, f65 + ;; + FMPY f32 = f104, f80 + FMPY f33 = f105, f80 + ;; + FMA_C f80 = f105, f81, f32 + FMA_D f81 = f104, f81, f33 + ;; + FNMA f64 = f106, f80, f64 + FMA_A f65 = f107, f80, f65 + ;; + FMA_B f64 = f107, f81, f64 + FNMA f65 = f106, f81, f65 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + ;; +#endif + +#ifdef LT + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET], 2 * SIZE + ;; + LDFPD f76, f77 = [AOFFSET], 2 * SIZE + ;; + LDFPD f78, f79 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f90, f91 = [AOFFSET], 2 * SIZE + ;; + LDFPD f92, f93 = [AOFFSET], 2 * SIZE + ;; + LDFPD f94, f95 = [AOFFSET] + adds AOFFSET = 6 * SIZE, AOFFSET + ;; + LDFPD f108, f109 = [AOFFSET], 2 * SIZE + ;; + LDFPD f110, f111 = [AOFFSET] + adds AOFFSET = 8 * SIZE, AOFFSET + ;; + LDFPD f126, f127 = [AOFFSET] + adds AOFFSET = - 30 * SIZE, AOFFSET + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + ;; + FNMA f80 = f74, f64, f80 + FMA_A f81 = f75, f64, f81 + FNMA f96 = f76, f64, f96 + FMA_A f97 = f77, f64, f97 + FNMA f112 = f78, f64, f112 + FMA_A f113 = f79, f64, f113 + ;; + FMA_B f80 = f75, f65, f80 + FNMA f81 = f74, f65, f81 + FMA_B f96 = f77, f65, f96 + FNMA f97 = f76, f65, f97 + FMA_B f112 = f79, f65, f112 + FNMA f113 = f78, f65, f113 + ;; + FMPY f32 = f90, f80 + FMPY f33 = f91, f80 + ;; + FMA_C f80 = f91, f81, f32 + FMA_D f81 = f90, f81, f33 + ;; + FNMA f96 = f92, f80, f96 + FMA_A f97 = f93, f80, f97 + FNMA f112 = f94, f80, f112 + FMA_A f113 = f95, f80, f113 + ;; + FMA_B f96 = f93, f81, f96 + FNMA f97 = f92, f81, f97 + FMA_B f112 = f95, f81, f112 + FNMA f113 = f94, f81, f113 + ;; + FMPY f32 = f108, f96 + FMPY f33 = f109, f96 + ;; + FMA_C f96 = f109, f97, f32 + FMA_D f97 = f108, f97, f33 + ;; + FNMA f112 = f110, f96, f112 + FMA_A f113 = f111, f96, f113 + ;; + FMA_B f112 = f111, f97, f112 + FNMA f113 = f110, f97, f113 + ;; + FMPY f32 = f126, f112 + FMPY f33 = f127, f112 + ;; + FMA_C f112 = f127, f113, f32 + FMA_D f113 = f126, f113, f33 + ;; +#endif + +#ifdef RN + LDFPD f72, f73 = [BOFFSET] + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f34 = f72, f80 + FMPY f35 = f73, f80 + FMPY f36 = f72, f96 + FMPY f37 = f73, f96 + FMPY f38 = f72, f112 + FMPY f39 = f73, f112 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f80 = f73, f81, f34 + FMA_D f81 = f72, f81, f35 + FMA_C f96 = f73, f97, f36 + FMA_D f97 = f72, f97, f37 + FMA_C f112 = f73, f113, f38 + FMA_D f113 = f72, f113, f39 + ;; +#endif + +#ifdef RT + LDFPD f72, f73 = [BOFFSET] + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f34 = f72, f80 + FMPY f35 = f73, f80 + FMPY f36 = f72, f96 + FMPY f37 = f73, f96 + FMPY f38 = f72, f112 + FMPY f39 = f73, f112 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f80 = f73, f81, f34 + FMA_D f81 = f72, f81, f35 + FMA_C f96 = f73, f97, f36 + FMA_D f97 = f72, f97, f37 + FMA_C f112 = f73, f113, f38 
+ FMA_D f113 = f72, f113, f39 + ;; +#endif + +#if defined(LN) || defined(LT) + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f96, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f97, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f112, SIZE + ;; + STFD [BOFFSET] = f81, 5 * SIZE + STFD [BOFFSET2] = f113, 5 * SIZE + ;; + adds BOFFSET = - 8 * SIZE, BOFFSET + ;; +#else + adds AOFFSET2 = 4 * SIZE, AOFFSET + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f96, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f97, SIZE + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f112, SIZE + ;; + STFD [AOFFSET] = f81, 5 * SIZE + STFD [AOFFSET2] = f113, 5 * SIZE + ;; + adds AOFFSET = - 8 * SIZE, AOFFSET + ;; +#endif + +#ifdef LN + adds C1 = -8 * SIZE, C1 + adds C5 = -8 * SIZE, C5 +#endif + ;; + STFD [C1 ] = f64, SIZE + STFD [C5 ] = f96, SIZE + ;; + STFD [C1 ] = f65, SIZE + STFD [C5 ] = f97, SIZE + ;; + STFD [C1 ] = f80, SIZE + STFD [C5 ] = f112, SIZE + ;; + STFD [C1 ] = f81, 5 * SIZE + STFD [C5 ] = f113, 5 * SIZE + ;; + mov f64 = f0 + mov f65 = f0 + mov f80 = f0 + mov f81 = f0 + mov f96 = f0 + mov f97 = f0 + mov f112 = f0 + mov f113 = f0 + ;; +#ifdef LN + adds C1 = -8 * SIZE, C1 + adds C5 = -8 * SIZE, C5 +#endif + ;; + cmp.ne p6, p0 = 1, I + ;; + adds I = -1, I + ;; + shladd r2 = K, ZBASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + shladd AORIG = r2, 2, AORIG +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = L, 2, AOFFSET + add BOFFSET = L, BOFFSET +#endif + ;; +#ifdef LT + adds KK = 4, KK +#elif defined LN + adds KK = -4, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + (p6) br.cond.dptk .L092 + ;; + .align 16 + +.L119: +#ifdef LN + shladd KK8 = K, ZBASE_SHIFT, r0 + ;; + add B = KK8, B +#endif + +#if defined(LT) || defined(RN) + mov B = BOFFSET +#endif + +#ifdef RN + adds KK = 1, KK +#endif + +#ifdef RT + adds KK = -1, KK +#endif + ;; + { .mmi + mov AOFFSET = A + nop __LINE__ + } + ;; + .align 16 + +.L999: + { .mii + nop __LINE__ + mov ar.lc = ARLC + mov pr = PR, -1 + } + { .mib + nop __LINE__ +#ifdef TRMMKERNEL + mov ar.pfs = ARPFS +#else + nop __LINE__ +#endif + br.ret.sptk.many b0 + } + EPILOGUE + diff --git a/kernel/ia64/ztrsm_kernel_LT.S b/kernel/ia64/ztrsm_kernel_LT.S new file mode 100644 index 0000000000..6c7a8ca5ba --- /dev/null +++ b/kernel/ia64/ztrsm_kernel_LT.S @@ -0,0 +1,10835 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef DOUBLE +#define PREFETCHSIZE (16 * 8) +#else +#define PREFETCHSIZE (32 * 8) +#endif + +#ifndef LN +#define CPREFETCHSIZE 7 +#else +#define CPREFETCHSIZE -8 +#endif +#define CPREFETCH lfetch.excl.nt1 + +#define M r32 +#define N r33 +#define K r34 +#define A r37 +#define B r38 +#define C r39 +#define LDC r35 + +#define I r15 +#define J r16 +#define AOFFSET r17 +#define BOFFSET r18 +#define TEMP r19 +#define L r20 + +#define C1 r21 +#define C2 r22 +#define C3 r23 +#define C4 r24 +#define C5 r25 +#define C6 r26 +#define C7 r27 +#define C8 r28 + +#define PREA r8 +#define PREB r9 +#define PREC r10 +#define SP r12 +#define ARLC r29 +#define PR r30 +#define ARPFS r31 + +#define ALPHA_R f8 +#define ALPHA_I f9 + +#define AORIG loc0 +#define KK loc1 +#define KK8 loc2 +#define OFFSET loc3 +#define AOFFSET2 loc4 +#define BOFFSET2 loc5 + +#ifndef CONJ +#define FCALC_A FSUB +#define FCALC_B FADD +#define FMA_A FNMA +#define FMA_B FMA +#else +#define FCALC_A FADD +#define FCALC_B FSUB +#define FMA_A FMA +#define FMA_B FNMA +#endif + +#ifndef CONJ +#define FCALC_C FMA +#define FCALC_D FNMA +#else +#define FCALC_C FNMA +#define FCALC_D FMA +#endif + +#ifndef CONJ +#define FMA_C FNMA +#define FMA_D FMA +#define FSUB_A FSUB +#else +#define FMA_C FMA +#define FMA_D FMS +#define FSUB_A FADD +#endif + + + PROLOGUE + .prologue + PROFCODE + + { .mfi + .save ar.pfs, ARPFS + alloc ARPFS = ar.pfs, 8, 8, 0, 0 + mov f64 = f0 + adds r14 = 16, SP + } + { .mfi + nop __LINE__ + mov f65 = f0 + adds r15 = 24, SP + } + ;; + { .mfi + ld8 LDC = [r14] + mov f81 = f0 + mov PR = pr + } + { .mfi + ld8 OFFSET = [r15] + mov f96 = f0 + shr J = N, 2 + } + ;; + { .mfi + shladd LDC = LDC, ZBASE_SHIFT, r0 + mov f97 = f0 + } + { .mfi + nop __LINE__ + mov f113 = f0 + } + ;; +#ifdef LN + { .mmi + setf.sig f32 = M + setf.sig f33 = K + shladd C = M, ZBASE_SHIFT, C + } + ;; + {.mmf + nop __LINE__ + nop __LINE__ + xmpy.l f32 = f32, f33 + } + ;; + { .mmi + getf.sig r2 = f32 + ;; + nop __LINE__ + shladd A = r2, ZBASE_SHIFT, A + } + ;; +#endif + +#ifdef RN + sub KK = r0, OFFSET +#endif + +#ifdef RT + { .mmi + setf.sig f32 = N + setf.sig f33 = K + nop __LINE__ + } + ;; + { .mmi + setf.sig f34 = LDC + nop __LINE__ + nop __LINE__ + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + xmpy.l f33 = f32, f33 + } + { .mmf + nop __LINE__ + sub KK = N, OFFSET + xmpy.l f34 = f32, f34 + } + ;; + { .mmi + getf.sig r2 = f33 + getf.sig r3 = f34 + } + ;; + shladd B = r2, ZBASE_SHIFT, B + add C = r3, C +#endif + ;; + .body + { .mfi + nop __LINE__ + mov f80 = f0 + mov ARLC = ar.lc + } + { .mfb + cmp.ge p6, 
p0 = 0, J + mov f112 = f0 + (p6) br.cond.dpnt .L050 + } + ;; + .align 16 + +.L010: +#ifdef RT + { .mmi + shladd r3 = LDC, 2, r0 + nop __LINE__ + shl r2 = K, 2 + ZBASE_SHIFT + } + ;; + { .mmi + sub B = B, r2 + sub C = C, r3 + nop __LINE__ + } + ;; +#endif + { .mmi + mov C1 = C // coffset1 = c + 0 * ldc + add C2 = LDC, C // coffset2 = c + 1 * ldc + shr I = M, 2 + } + { .mmi + adds J = -1, J +#ifdef LN + add KK = M, OFFSET +#elif defined LT + mov KK = OFFSET +#else + nop __LINE__ +#endif +#if defined(LN) || defined(RT) + mov AORIG = A +#else + mov AOFFSET = A +#endif + } + ;; + ;; + { .mmi + shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc + shladd C4 = LDC, 1, C2 // coffset4 = c + 3 * ldc +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + { .mib + cmp.eq p6, p7 = 0, I +#ifndef RT + shladd C = LDC, 2, C // coffset += 8 * ldc +#else + nop __LINE__ +#endif + (p6) br.cond.dpnt .L020 + } + ;; + .align 16 + +.L011: + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 2 + ZBASE_SHIFT + } + { .mfi + shladd r3 = KK, ZBASE_SHIFT, r0 + mov f118 = f0 + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f66 = f0 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f67 = f0 + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 2, B + mov f66 = f0 +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f67 = f0 + shladd AOFFSET = r3, 2, AORIG + } + ;; +#endif + ;; + { .mfi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f82 = f0 + nop __LINE__ + } + { .mfi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f83 = f0 + adds PREC = CPREFETCHSIZE * SIZE, C1 + } + ;; + { .mfi + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + mov f98 = f0 + adds L = 1, L + } + { .mfi + (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + mov f99 = f0 + adds C5 = 4 * SIZE, C1 + } + ;; + { .mfi + (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + mov f114 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + mov f115 = f0 + adds C6 = 4 * SIZE, C2 + } + ;; + { .mfi + (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + mov f68 = f0 + shr L = L, 1 + } + { .mfi + setf.d f86 = r0 + mov f69 = f0 + adds C7 = 4 * SIZE, C3 + } + ;; + { .mfi + CPREFETCH [PREC], LDC + mov f84 = f0 + adds L = -1, L + } + { .mfi + setf.d f87 = r0 + mov f85 = f0 + adds C8 = 4 * SIZE, C4 + } + ;; + { .mfi + CPREFETCH [PREC], LDC + mov f100 = f0 + mov ar.lc = L + } + { .mfi + setf.d f102 = r0 + mov f101 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + CPREFETCH [PREC], LDC + mov f116 = f0 + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + } + { .mfi + setf.d f103 = r0 + mov f117 = f0 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + } + ;; + { .mfi + CPREFETCH [PREC] + mov f70 = f0 + cmp.eq p6, p0 = -1, L + } + { .mfb + setf.d f119 = r0 + mov f71 = f0 + (p6) br.cond.dpnt .L018 + } + ;; + .align 16 + +.L012: +/* 1 */ + { .mfi + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p12) cmp.ne p3, p0 = 0, L + FMA_B f65 = f32, f49, f65 // A1 * B2 + nop __LINE__ + } + ;; +/* 2 */ + { .mfi + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + cmp.ne p4, p5 = 0, L + FMA_B f81 = f32, f51, f81 // A1 * B4 + nop __LINE__ + } + ;; +/* 3 */ + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + FMA_B f97 = f32, f53, f97 // 
A1 * B6 + nop __LINE__ + } + ;; +/* 4 */ + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + FMA_B f113 = f32, f55, f113 // A1 * B8 + nop __LINE__ + } + ;; +/* 5 */ + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + FMA_A f64 = f33, f49, f64 // A2 * B2 + nop __LINE__ + } + ;; +/* 6 */ + { .mfb + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + FMA_A f80 = f33, f51, f80 // A2 * B4 + nop __LINE__ + } + ;; +/* 7 */ + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + FMA_A f96 = f33, f53, f96 // A2 * B6 + nop __LINE__ + } + ;; +/* 8 */ + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + FMA_A f112 = f33, f55, f112 // A2 * B8 + nop __LINE__ + } + ;; +/* 9 */ + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + FMA_B f67 = f34, f49, f67 // A3 * B2 + nop __LINE__ + } + ;; +/* 10 */ + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + FMA_B f83 = f34, f51, f83 // A3 * B4 + nop __LINE__ + } + ;; +/* 11 */ + { .mfb + FMA f98 = f34, f52, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f99 = f34, f53, f99 // A3 * B6 + nop __LINE__ + } + ;; +/* 12 */ + { .mfb + FMA f114 = f34, f54, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f115 = f34, f55, f115 // A3 * B8 + nop __LINE__ + } + ;; +/* 13 */ + { .mfb + nop __LINE__ + FMA f67 = f35, f48, f67 // A4 * B1 + } + { .mfb + nop __LINE__ + FMA_A f66 = f35, f49, f66 // A4 * B2 + nop __LINE__ + } + ;; +/* 14 */ + { .mfb + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f82 = f35, f51, f82 // A4 * B4 + nop __LINE__ + } + ;; +/* 15 */ + { .mfb + FMA f99 = f35, f52, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f98 = f35, f53, f98 // A4 * B6 + nop __LINE__ + } + ;; +/* 16 */ + { .mfb + FMA f115 = f35, f54, f115 // A4 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f114 = f35, f55, f114 // A4 * B8 + nop __LINE__ + } + ;; +/* 17 */ + { .mfb + nop __LINE__ + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f69 = f36, f49, f69 // A5 * B2 + nop __LINE__ + } + ;; +/* 18 */ + { .mfb + nop __LINE__ + FMA f84 = f36, f50, f84 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f85 = f36, f51, f85 // A5 * B4 + nop __LINE__ + } + ;; +/* 19 */ + { .mfb + nop __LINE__ + FMA f100 = f36, f52, f100 // A5 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f101 = f36, f53, f101 // A5 * B6 + nop __LINE__ + } + ;; +/* 20 */ + { .mfb + nop __LINE__ + FMA f116 = f36, f54, f116 // A5 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f117 = f36, f55, f117 // A5 * B8 + nop __LINE__ + } + ;; +/* 21 */ + { .mfb + nop __LINE__ + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f68 = f37, f49, f68 // A6 * B2 + nop __LINE__ + } + ;; +/* 22 */ + { .mfb + nop __LINE__ + FMA f85 = f37, f50, f85 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f84 = f37, f51, f84 // A6 * B4 + nop __LINE__ + } + ;; +/* 23 */ + { .mfb + nop __LINE__ + FMA f101 = f37, f52, f101 // A6 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f100 = 
f37, f53, f100 // A6 * B6 + nop __LINE__ + } + ;; +/* 24 */ + { .mfb + nop __LINE__ + FMA f117 = f37, f54, f117 // A6 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f116 = f37, f55, f116 // A6 * B8 + nop __LINE__ + } + ;; +/* 25 */ + { .mfb + nop __LINE__ + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f71 = f38, f49, f71 // A7 * B2 + nop __LINE__ + } + ;; +/* 26 */ + { .mfb + nop __LINE__ + FMA f86 = f38, f50, f86 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f87 = f38, f51, f87 // A7 * B4 + nop __LINE__ + } + ;; +/* 27 */ + { .mfb + nop __LINE__ + FMA f102 = f38, f52, f102 // A7 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f103 = f38, f53, f103 // A7 * B6 + nop __LINE__ + } + ;; +/* 28 */ + { .mfb + nop __LINE__ + FMA f118 = f38, f54, f118 // A7 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f119 = f38, f55, f119 // A7 * B8 + nop __LINE__ + } + ;; +/* 29 */ + { .mfb + nop __LINE__ + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f70 = f39, f49, f70 // A8 * B2 + nop __LINE__ + } + ;; +/* 30 */ + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f87 = f39, f50, f87 // A8 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f86 = f39, f51, f86 // A8 * B4 + nop __LINE__ + } + ;; +/* 31 */ + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f103 = f39, f52, f103 // A8 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f102 = f39, f53, f102 // A8 * B6 + nop __LINE__ + } + ;; +/* 32 */ + { .mfb + nop __LINE__ + FMA f119 = f39, f54, f119 // A8 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f118 = f39, f55, f118 // A8 * B8 + nop __LINE__ + } + ;; +/* 33 */ + { .mfb + nop __LINE__ + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 + nop __LINE__ + } + ;; +/* 34 */ + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 + nop __LINE__ + } + ;; +/* 35 */ + { .mfb + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f97 = f40, f61, f97 // A1 * B6 + nop __LINE__ + } + ;; +/* 36 */ + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f113 = f40, f63, f113 // A1 * B8 + nop __LINE__ + } + ;; +/* 37 */ + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 + nop __LINE__ + } + ;; +/* 38 */ + { .mfb + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 + nop __LINE__ + } + ;; +/* 39 */ + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f96 = f41, f61, f96 // A2 * B6 + nop __LINE__ + } + ;; +/* 40 */ + { .mfb + nop __LINE__ + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f112 = f41, f63, f112 // A2 * B8 + nop __LINE__ + } + ;; + /* 41 */ + { .mfb + nop __LINE__ + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f67 = f42, f57, f67 // 
A3 * B2 + nop __LINE__ + } + ;; +/* 42 */ + { .mfb + nop __LINE__ + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f83 = f42, f59, f83 // A3 * B4 + nop __LINE__ + } + ;; +/* 43 */ + { .mfb + nop __LINE__ + (p3) FMA f98 = f42, f60, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f99 = f42, f61, f99 // A3 * B6 + nop __LINE__ + } + ;; +/* 44 */ + { .mfb + nop __LINE__ + (p3) FMA f114 = f42, f62, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f115 = f42, f63, f115 // A3 * B8 + nop __LINE__ + } + ;; +/* 45 */ + { .mfb + nop __LINE__ + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f66 = f43, f57, f66 // A4 * B2 + nop __LINE__ + } + ;; +/* 46 */ + { .mfb + nop __LINE__ + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f82 = f43, f59, f82 // A4 * B4 + nop __LINE__ + } + ;; +/* 47 */ + { .mfb + nop __LINE__ + (p3) FMA f99 = f43, f60, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f98 = f43, f61, f98 // A4 * B6 + nop __LINE__ + } + ;; +/* 48 */ + { .mfb + nop __LINE__ + (p3) FMA f115 = f43, f62, f115 // A4 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f114 = f43, f63, f114 // A4 * B8 + nop __LINE__ + } + ;; +/* 49 */ + { .mfb + nop __LINE__ + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f69 = f44, f57, f69 // A5 * B2 + nop __LINE__ + } + ;; +/* 50 */ + { .mfb + nop __LINE__ + (p3) FMA f84 = f44, f58, f84 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f85 = f44, f59, f85 // A5 * B4 + nop __LINE__ + } + ;; +/* 51 */ + { .mfb + nop __LINE__ + (p3) FMA f100 = f44, f60, f100 // A5 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f101 = f44, f61, f101 // A5 * B6 + nop __LINE__ + } + ;; +/* 52 */ + { .mfb + nop __LINE__ + (p3) FMA f116 = f44, f62, f116 // A5 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f117 = f44, f63, f117 // A5 * B8 + nop __LINE__ + } + ;; +/* 53 */ + { .mfb + nop __LINE__ + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f68 = f45, f57, f68 // A6 * B2 + nop __LINE__ + } + ;; +/* 54 */ + { .mfb + nop __LINE__ + (p3) FMA f85 = f45, f58, f85 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f84 = f45, f59, f84 // A6 * B4 + nop __LINE__ + } + ;; +/* 55 */ + { .mfb + nop __LINE__ + (p3) FMA f101 = f45, f60, f101 // A6 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f100 = f45, f61, f100 // A6 * B6 + nop __LINE__ + } + ;; +/* 56 */ + { .mfb + nop __LINE__ + (p3) FMA f117 = f45, f62, f117 // A6 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f116 = f45, f63, f116 // A6 * B8 + nop __LINE__ + } + ;; +/* 57 */ + { .mfb + nop __LINE__ + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f71 = f46, f57, f71 // A7 * B2 + nop __LINE__ + } + ;; +/* 58 */ + { .mfb + nop __LINE__ + (p3) FMA f86 = f46, f58, f86 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f87 = f46, f59, f87 // A7 * B4 + nop __LINE__ + } + ;; +/* 59 */ + { .mfb + nop __LINE__ + (p3) FMA f102 = f46, f60, f102 // A7 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f103 = f46, f61, f103 // A7 * B6 + nop __LINE__ + } + ;; +/* 60 */ + { .mfb + nop __LINE__ + (p3) FMA f118 = f46, f62, f118 // A7 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) 
FMA_B f119 = f46, f63, f119 // A7 * B8 + nop __LINE__ + } + ;; +/* 61 */ + { .mfb + nop __LINE__ + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f70 = f47, f57, f70 // A8 * B2 + nop __LINE__ + } + ;; +/* 62 */ + { .mfb + nop __LINE__ + (p3) FMA f87 = f47, f58, f87 // A8 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f86 = f47, f59, f86 // A8 * B4 + nop __LINE__ + } + ;; +/* 63 */ + { .mfb + nop __LINE__ + (p3) FMA f103 = f47, f60, f103 // A8 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f102 = f47, f61, f102 // A8 * B6 + nop __LINE__ + } + ;; +/* 64 */ + { .mfi + nop __LINE__ + (p3) FMA f119 = f47, f62, f119 // A8 * B7 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA_A f118 = f47, f63, f118 // A8 * B8 + br.cloop.sptk.few .L012 + } + ;; + +.L018: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -4, KK +#else + adds r2 = -4, KK +#endif + ;; + shladd r2 = r2, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 2, AORIG + shladd BOFFSET = r2, 2, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET], 2 * SIZE + ;; + LDFPD f76, f77 = [BOFFSET], 2 * SIZE + ;; + LDFPD f78, f79 = [BOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [BOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [BOFFSET], 2 * SIZE + ;; + LDFPD f92, f93 = [BOFFSET], 2 * SIZE + ;; + { .mfi + LDFPD f94, f95 = [BOFFSET], 2 * SIZE + FSUB f64 = f72, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f65 = f73, f65 + nop __LINE__ + } + ;; + { .mfi + LDFPD f104, f105 = [BOFFSET], 2 * SIZE + FSUB f80 = f74, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f81 = f75, f81 + nop __LINE__ + } + ;; + { .mfi + LDFPD f106, f107 = [BOFFSET], 2 * SIZE + FSUB f96 = f76, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f97 = f77, f97 + nop __LINE__ + } + ;; + { .mfi + LDFPD f108, f109 = [BOFFSET], 2 * SIZE + FSUB f112 = f78, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f113 = f79, f113 + nop __LINE__ + } + ;; + { .mfi + LDFPD f110, f111 = [BOFFSET], 2 * SIZE + FSUB f66 = f88, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f67 = f89, f67 + nop __LINE__ + } + ;; + { .mfi + LDFPD f120, f121 = [BOFFSET], 2 * SIZE + FSUB f82 = f90, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f83 = f91, f83 + nop __LINE__ + } + ;; + { .mfi + LDFPD f122, f123 = [BOFFSET], 2 * SIZE + FSUB f98 = f92, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f99 = f93, f99 + nop __LINE__ + } + ;; + { .mfi + LDFPD f124, f125 = [BOFFSET], 2 * SIZE + FSUB f114 = f94, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f115 = f95, f115 + nop __LINE__ + } + ;; + { .mfi + LDFPD f126, f127 = [BOFFSET] + FSUB f68 = f104, f68 + adds BOFFSET = -30 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FSUB_A f69 = f105, f69 +#ifdef LN + adds AOFFSET = 30 * SIZE, AOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + LDFPD f72, f73 = [AOFFSET] + FSUB f84 = f106, f84 +#ifdef LN + adds AOFFSET = - 2 * SIZE, AOFFSET +#else + adds AOFFSET = 2 * SIZE, AOFFSET +#endif + } + { .mfi + nop __LINE__ + FSUB_A f85 = f107, f85 + nop __LINE__ + } + ;; + { .mfi + LDFPD f74, f75 = [AOFFSET] + FSUB f100 = f108, f100 +#ifdef LN + adds AOFFSET = - 2 * SIZE, AOFFSET +#else + adds AOFFSET = 2 * SIZE, AOFFSET +#endif + } + { .mfi + nop __LINE__ + FSUB_A f101 = f109, f101 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f116 = f110, f116 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f117 = f111, f117 + nop 
__LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f70 = f120, f70 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f71 = f121, f71 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f86 = f122, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f87 = f123, f87 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f102 = f124, f102 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f103 = f125, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f118 = f126, f118 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f119 = f127, f119 + nop __LINE__ + } + ;; +#else + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET], 2 * SIZE + ;; + LDFPD f76, f77 = [AOFFSET], 2 * SIZE + ;; + LDFPD f78, f79 = [AOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [AOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [AOFFSET], 2 * SIZE + ;; + { .mfi + LDFPD f92, f93 = [AOFFSET], 2 * SIZE + FSUB f64 = f72, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f65 = f73, f65 + nop __LINE__ + } + ;; + { .mfi + LDFPD f94, f95 = [AOFFSET], 2 * SIZE + FSUB f66 = f74, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f67 = f75, f67 + nop __LINE__ + } + ;; + { .mfi + LDFPD f104, f105 = [AOFFSET], 2 * SIZE + FSUB f68 = f76, f68 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f69 = f77, f69 + nop __LINE__ + } + ;; + { .mfi + LDFPD f106, f107 = [AOFFSET], 2 * SIZE + FSUB f70 = f78, f70 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f71 = f79, f71 + nop __LINE__ + } + ;; + { .mfi + LDFPD f108, f109 = [AOFFSET], 2 * SIZE + FSUB f80 = f88, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f81 = f89, f81 + nop __LINE__ + } + ;; + { .mfi + LDFPD f110, f111 = [AOFFSET], 2 * SIZE + FSUB f82 = f90, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f83 = f91, f83 + nop __LINE__ + } + ;; + { .mfi + LDFPD f120, f121 = [AOFFSET], 2 * SIZE + FSUB f84 = f92, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f85 = f93, f85 + nop __LINE__ + } + ;; + { .mfi + LDFPD f122, f123 = [AOFFSET], 2 * SIZE + FSUB f86 = f94, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f87 = f95, f87 + nop __LINE__ + } + ;; + { .mfi + LDFPD f124, f125 = [AOFFSET], 2 * SIZE + FSUB f96 = f104, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f97 = f105, f97 + nop __LINE__ + } + ;; + { .mfi + LDFPD f126, f127 = [AOFFSET] + FSUB f98 = f106, f98 + adds AOFFSET = -30 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FSUB f99 = f107, f99 +#ifdef RT + adds BOFFSET = 30 * SIZE, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + LDFPD f72, f73 = [BOFFSET] + FSUB f100 = f108, f100 +#ifdef RN + adds BOFFSET = 2 * SIZE, BOFFSET +#else + adds BOFFSET = - 2 * SIZE, BOFFSET +#endif + } + { .mfi + nop __LINE__ + FSUB f101 = f109, f101 + nop __LINE__ + } + ;; + { .mfi + LDFPD f74, f75 = [BOFFSET] + FSUB f102 = f110, f102 +#ifdef RN + adds BOFFSET = 2 * SIZE, BOFFSET +#else + adds BOFFSET = - 2 * SIZE, BOFFSET +#endif + } + { .mfi + nop __LINE__ + FSUB f103 = f111, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f112 = f120, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f113 = f121, f113 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f114 = f122, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f115 = f123, f115 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f116 = f124, f116 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f117 = f125, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f118 = f126, f118 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f119 = f127, f119 + 
nop __LINE__ + } + ;; +#endif + +#ifdef LN + { .mfi + LDFPD f76, f77 = [AOFFSET] + FMPY f32 = f72, f70 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMPY f36 = f72, f102 + nop __LINE__ + } + ;; + { .mfi + LDFPD f78, f79 = [AOFFSET] + FMPY f33 = f73, f70 + adds AOFFSET = - 4 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMPY f37 = f73, f102 + nop __LINE__ + } + ;; + { .mfi + LDFPD f88, f89 = [AOFFSET] + FMPY f34 = f72, f86 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMPY f38 = f72, f118 + nop __LINE__ + } + ;; + { .mfi + LDFPD f90, f91 = [AOFFSET] + FMPY f35 = f73, f86 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMPY f39 = f73, f118 + nop __LINE__ + } + ;; + { .mfi + LDFPD f92, f93 = [AOFFSET] + FMA_C f70 = f73, f71, f32 + adds AOFFSET = - 6 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMA_C f102 = f73, f103, f36 + adds C1 = -2 * SIZE, C1 + } + ;; + { .mfi + LDFPD f104, f105 = [AOFFSET] + FMA_D f71 = f72, f71, f33 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMA_D f103 = f72, f103, f37 + adds C2 = -2 * SIZE, C2 + } + ;; + { .mfi + LDFPD f106, f107 = [AOFFSET] + FMA_C f86 = f73, f87, f34 + adds AOFFSET = - 8 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMA_C f118 = f73, f119, f38 + adds C3 = -2 * SIZE, C3 + } + ;; + { .mfi + LDFPD f120, f121 = [AOFFSET] + FMA_D f87 = f72, f87, f35 + adds BOFFSET2 = 28 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA_D f119 = f72, f119, f39 + adds BOFFSET = 24 * SIZE, BOFFSET + } + ;; + { .mfi + STFD [BOFFSET] = f70, SIZE + FNMA f68 = f74, f70, f68 + adds C4 = -2 * SIZE, C4 + } + { .mfi + STFD [BOFFSET2] = f102, SIZE + FNMA f100 = f74, f102, f100 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f71, SIZE + FMA_A f69 = f75, f70, f69 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f103, SIZE + FMA_A f101 = f75, f102, f101 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f86, SIZE + FNMA f84 = f74, f86, f84 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f118, SIZE + FNMA f116 = f74, f118, f116 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f87, -11 * SIZE + FMA_A f85 = f75, f86, f85 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f119, -11 * SIZE + FMA_A f117 = f75, f118, f117 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f70, SIZE + FMA_B f68 = f75, f71, f68 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f102, SIZE + FMA_B f100 = f75, f103, f100 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f71, -3 * SIZE + FNMA f69 = f74, f71, f69 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f103, -3 * SIZE + FNMA f101 = f74, f103, f101 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f86, SIZE + FMA_B f84 = f75, f87, f84 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f118, SIZE + FMA_B f116 = f75, f119, f116 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f87, -3 * SIZE + FNMA f85 = f74, f87, f85 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f119, -3 * SIZE + FNMA f117 = f74, f119, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f66 = f76, f70, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f98 = f76, f102, f98 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f67 = f77, f70, f67 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f99 = f77, f102, f99 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f82 = f76, f86, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f114 = f76, f118, f114 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f83 = f77, f86, f83 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f115 = f77, f118, f115 + nop __LINE__ + 
} + ;; + { .mfi + nop __LINE__ + FMA_B f66 = f77, f71, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f98 = f77, f103, f98 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f67 = f76, f71, f67 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f99 = f76, f103, f99 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f82 = f77, f87, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f114 = f77, f119, f114 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f83 = f76, f87, f83 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f115 = f76, f119, f115 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f64 = f78, f70, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f96 = f78, f102, f96 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f65 = f79, f70, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f97 = f79, f102, f97 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f80 = f78, f86, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f112 = f78, f118, f112 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f81 = f79, f86, f81 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f113 = f79, f118, f113 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f64 = f79, f71, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f96 = f79, f103, f96 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f65 = f78, f71, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f97 = f78, f103, f97 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f80 = f79, f87, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f112 = f79, f119, f112 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f81 = f78, f87, f81 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f113 = f78, f119, f113 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f88, f68 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f88, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f89, f68 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f89, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f88, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f88, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f89, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f89, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f68 = f89, f69, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f100 = f89, f101, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f69 = f88, f69, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f101 = f88, f101, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f84 = f89, f85, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f116 = f89, f117, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f85 = f88, f85, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f117 = f88, f117, f39 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f68, SIZE + FNMA f66 = f90, f68, f66 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f100, SIZE + FNMA f98 = f90, f100, f98 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f69, SIZE + FMA_A f67 = f91, f68, f67 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f101, SIZE + FMA_A f99 = f91, f100, f99 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f84, SIZE + FNMA f82 = f90, f84, f82 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f116, SIZE + FNMA f114 = f90, f116, f114 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f85, -11 * SIZE + FMA_A f83 = f91, f84, f83 + nop __LINE__ + } + { .mfi 
+ STFD [BOFFSET2] = f117, -11 * SIZE + FMA_A f115 = f91, f116, f115 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f68, SIZE + FMA_B f66 = f91, f69, f66 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f100, SIZE + FMA_B f98 = f91, f101, f98 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f69, -3 * SIZE + FNMA f67 = f90, f69, f67 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f101, -3 * SIZE + FNMA f99 = f90, f101, f99 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f84, SIZE + FMA_B f82 = f91, f85, f82 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f116, SIZE + FMA_B f114 = f91, f117, f114 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f85, -3 * SIZE + FNMA f83 = f90, f85, f83 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f117, -3 * SIZE + FNMA f115 = f90, f117, f115 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f64 = f92, f68, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f96 = f92, f100, f96 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f65 = f93, f68, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f97 = f93, f100, f97 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f80 = f92, f84, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f112 = f92, f116, f112 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f81 = f93, f84, f81 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f113 = f93, f116, f113 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f64 = f93, f69, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f96 = f93, f101, f96 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f65 = f92, f69, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f97 = f92, f101, f97 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f80 = f93, f85, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f112 = f93, f117, f112 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f81 = f92, f85, f81 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f113 = f92, f117, f113 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f104, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f104, f98 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f105, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f105, f98 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f104, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f104, f114 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f105, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f105, f114 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f66 = f105, f67, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f98 = f105, f99, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f67 = f104, f67, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f99 = f104, f99, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f82 = f105, f83, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f114 = f105, f115, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f83 = f104, f83, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f115 = f104, f115, f39 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f66, SIZE + FNMA f64 = f106, f66, f64 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f98, SIZE + FNMA f96 = f106, f98, f96 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f67, SIZE + FMA_A f65 = f107, f66, f65 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f99, SIZE + FMA_A f97 = f107, f98, f97 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f82, SIZE + FNMA f80 = f106, f82, f80 + nop 
__LINE__ + } + { .mfi + STFD [BOFFSET2] = f114, SIZE + FNMA f112 = f106, f114, f112 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f83, -11 * SIZE + FMA_A f81 = f107, f82, f81 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f115, -11 * SIZE + FMA_A f113 = f107, f114, f113 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f66, SIZE + FMA_B f64 = f107, f67, f64 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f98, SIZE + FMA_B f96 = f107, f99, f96 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f67, -3 * SIZE + FNMA f65 = f106, f67, f65 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f99, -3 * SIZE + FNMA f97 = f106, f99, f97 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f82, SIZE + FMA_B f80 = f107, f83, f80 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f114, SIZE + FMA_B f112 = f107, f115, f112 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f83, -3 * SIZE + FNMA f81 = f106, f83, f81 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f115, -3 * SIZE + FNMA f113 = f106, f115, f113 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f120, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f120, f96 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f121, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f121, f96 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f120, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f120, f112 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f121, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f121, f112 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f64 = f121, f65, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f96 = f121, f97, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f65 = f120, f65, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f97 = f120, f97, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f80 = f121, f81, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f112 = f121, f113, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f81 = f120, f81, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f113 = f120, f113, f39 + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f96, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f97, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f112, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f81, -3 * SIZE + STFD [BOFFSET2] = f113, -3 * SIZE + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f64, SIZE + mov f64 = f0 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f96, SIZE + mov f96 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f65, -1 * SIZE + mov f65 = f0 + adds KK = -4, KK + } + { .mfi + STFD [C3 ] = f97, -1 * SIZE + mov f97 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f80, SIZE + mov f80 = f0 + cmp.ne p6, p0 = 1, I + } + { .mfi + STFD [C4 ] = f112, SIZE + mov f112 = f0 + sub L = K, KK + } + ;; + { .mfi + STFD [C2 ] = f81, -1 * SIZE + mov f81 = f0 + adds I = -1, I + } + { .mfb + STFD [C4 ] = f113, -1 * SIZE + mov f113 = f0 + (p6) br.cond.dptk .L011 + } + ;; +#endif + +#ifdef LT + { .mfi + LDFPD f76, f77 = [AOFFSET], 2 * SIZE + FMPY f32 = f72, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f72, f96 + nop __LINE__ + } + ;; + { .mfi + LDFPD f78, f79 = [AOFFSET] + FMPY f33 = f73, f64 + adds AOFFSET = 4 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMPY f37 = f73, f96 + nop __LINE__ + } + ;; + { .mfi + LDFPD f90, f91 = [AOFFSET], 2 * SIZE + FMPY f34 
= f72, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f72, f112 + nop __LINE__ + } + ;; + { .mfi + LDFPD f92, f93 = [AOFFSET], 2 * SIZE + FMPY f35 = f73, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f73, f112 + nop __LINE__ + } + ;; + { .mfi + LDFPD f94, f95 = [AOFFSET] + FMA_C f64 = f73, f65, f32 + adds AOFFSET = 6 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMA_C f96 = f73, f97, f36 + nop __LINE__ + } + ;; + { .mfi + LDFPD f108, f109 = [AOFFSET], 2 * SIZE + FMA_D f65 = f72, f65, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f97 = f72, f97, f37 + nop __LINE__ + } + ;; + { .mfi + LDFPD f110, f111 = [AOFFSET] + FMA_C f80 = f73, f81, f34 + adds AOFFSET = 8 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMA_C f112 = f73, f113, f38 + nop __LINE__ + } + ;; + { .mfi + LDFPD f126, f127 = [AOFFSET] + FMA_D f81 = f72, f81, f35 + adds AOFFSET = - 30 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMA_D f113 = f72, f113, f39 + adds BOFFSET2 = 4 * SIZE, BOFFSET + } + ;; + { .mfi + STFD [BOFFSET] = f64, SIZE + FNMA f66 = f74, f64, f66 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f96, SIZE + FNMA f98 = f74, f96, f98 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f65, SIZE + FMA_A f67 = f75, f64, f67 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f97, SIZE + FMA_A f99 = f75, f96, f99 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f80, SIZE + FNMA f82 = f74, f80, f82 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f112, SIZE + FNMA f114 = f74, f112, f114 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f81, 5 * SIZE + FMA_A f83 = f75, f80, f83 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f113, 5 * SIZE + FMA_A f115 = f75, f112, f115 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f64, SIZE + FMA_B f66 = f75, f65, f66 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f96, SIZE + FMA_B f98 = f75, f97, f98 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f65, SIZE + FNMA f67 = f74, f65, f67 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f97, SIZE + FNMA f99 = f74, f97, f99 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f80, SIZE + FMA_B f82 = f75, f81, f82 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f112, SIZE + FMA_B f114 = f75, f113, f114 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f81, SIZE + FNMA f83 = f74, f81, f83 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f113, SIZE + FNMA f115 = f74, f113, f115 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f68 = f76, f64, f68 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f100 = f76, f96, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f69 = f77, f64, f69 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f101 = f77, f96, f101 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f84 = f76, f80, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f116 = f76, f112, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f85 = f77, f80, f85 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f117 = f77, f112, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f68 = f77, f65, f68 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f100 = f77, f97, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f69 = f76, f65, f69 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f101 = f76, f97, f101 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f84 = f77, f81, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f116 = f77, f113, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f85 = f76, f81, f85 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f117 = f76, 
f113, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f70 = f78, f64, f70 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f102 = f78, f96, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f71 = f79, f64, f71 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f103 = f79, f96, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f86 = f78, f80, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f118 = f78, f112, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f87 = f79, f80, f87 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f119 = f79, f112, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f70 = f79, f65, f70 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f102 = f79, f97, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f71 = f78, f65, f71 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f103 = f78, f97, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f86 = f79, f81, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f118 = f79, f113, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f87 = f78, f81, f87 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f119 = f78, f113, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f90, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f90, f98 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f91, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f91, f98 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f90, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f90, f114 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f91, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f91, f114 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f66 = f91, f67, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f98 = f91, f99, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f67 = f90, f67, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f99 = f90, f99, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f82 = f91, f83, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f114 = f91, f115, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f83 = f90, f83, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f115 = f90, f115, f39 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f66, SIZE + FNMA f68 = f92, f66, f68 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f98, SIZE + FNMA f100 = f92, f98, f100 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f67, SIZE + FMA_A f69 = f93, f66, f69 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f99, SIZE + FMA_A f101 = f93, f98, f101 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f82, SIZE + FNMA f84 = f92, f82, f84 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f114, SIZE + FNMA f116 = f92, f114, f116 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f83, 5 * SIZE + FMA_A f85 = f93, f82, f85 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f115, 5 * SIZE + FMA_A f117 = f93, f114, f117 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f66, SIZE + FMA_B f68 = f93, f67, f68 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f98, SIZE + FMA_B f100 = f93, f99, f100 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f67, SIZE + FNMA f69 = f92, f67, f69 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f99, SIZE + FNMA f101 = f92, f99, f101 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f82, SIZE + FMA_B f84 = f93, f83, f84 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f114, SIZE + FMA_B f116 = f93, f115, 
f116 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f83, SIZE + FNMA f85 = f92, f83, f85 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f115, SIZE + FNMA f117 = f92, f115, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f70 = f94, f66, f70 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f102 = f94, f98, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f71 = f95, f66, f71 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f103 = f95, f98, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f86 = f94, f82, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f118 = f94, f114, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f87 = f95, f82, f87 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f119 = f95, f114, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f70 = f95, f67, f70 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f102 = f95, f99, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f71 = f94, f67, f71 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f103 = f94, f99, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f86 = f95, f83, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f118 = f95, f115, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f87 = f94, f83, f87 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f119 = f94, f115, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f108, f68 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f108, f100 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f33 = f109, f68 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f109, f100 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f34 = f108, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f108, f116 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f35 = f109, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f109, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f68 = f109, f69, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f100 = f109, f101, f36 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f69 = f108, f69, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f101 = f108, f101, f37 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f84 = f109, f85, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f116 = f109, f117, f38 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f85 = f108, f85, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f117 = f108, f117, f39 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f68, SIZE + FNMA f70 = f110, f68, f70 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f100, SIZE + FNMA f102 = f110, f100, f102 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f69, SIZE + FMA_A f71 = f111, f68, f71 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f101, SIZE + FMA_A f103 = f111, f100, f103 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f84, SIZE + FNMA f86 = f110, f84, f86 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f116, SIZE + FNMA f118 = f110, f116, f118 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f85, 5 * SIZE + FMA_A f87 = f111, f84, f87 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f117, 5 * SIZE + FMA_A f119 = f111, f116, f119 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f68, SIZE + FMA_B f70 = f111, f69, f70 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f100, SIZE + FMA_B f102 = f111, f101, f102 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f69, SIZE + FNMA f71 = f110, f69, f71 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f101, SIZE + FNMA f103 = f110, 
f101, f103 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f84, SIZE + FMA_B f86 = f111, f85, f86 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f116, SIZE + FMA_B f118 = f111, f117, f118 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f85, SIZE + FNMA f87 = f110, f85, f87 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f117, SIZE + FNMA f119 = f110, f117, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f126, f70 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f126, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f127, f70 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f127, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f126, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f126, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f127, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f127, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f70 = f127, f71, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f102 = f127, f103, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f71 = f126, f71, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f103 = f126, f103, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f86 = f127, f87, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f118 = f127, f119, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f87 = f126, f87, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f119 = f126, f119, f39 + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f70, SIZE + STFD [BOFFSET2] = f102, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f71, SIZE + STFD [BOFFSET2] = f103, SIZE + sub r2 = K, KK + } + ;; + { .mmi + STFD [BOFFSET] = f86, SIZE + STFD [BOFFSET2] = f118, SIZE + adds KK = 4, KK + } + ;; + { .mmi + STFD [BOFFSET] = f87, -27 * SIZE + STFD [BOFFSET2] = f119 + shladd r2 = r2, ZBASE_SHIFT, r0 + } + ;; + { .mfi + STFD [C1 ] = f70, SIZE + mov f64 = f0 + shladd AOFFSET = r2, 2, AOFFSET + } + { .mfi + STFD [C3 ] = f102, SIZE + mov f65 = f0 + shladd BOFFSET = r2, 2, BOFFSET + } + ;; + { .mfi + STFD [C1 ] = f71, SIZE + mov f80 = f0 + mov L = KK + } + { .mfi + STFD [C3 ] = f103, SIZE + mov f81 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f86, SIZE + mov f96 = f0 + cmp.ne p6, p0 = 1, I + } + { .mfi + STFD [C4 ] = f118, SIZE + mov f97 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f87, SIZE + mov f112 = f0 + adds I = -1, I + } + { .mfb + STFD [C4 ] = f119, SIZE + mov f113 = f0 + (p6) br.cond.dptk .L011 + } + ;; +#endif + +#ifdef RN + { .mfi + LDFPD f76, f77 = [BOFFSET], 2 * SIZE + FMPY f32 = f72, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f72, f68 + nop __LINE__ + } + ;; + { .mfi + LDFPD f78, f79 = [BOFFSET] + FMPY f33 = f73, f64 + adds BOFFSET = 4 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMPY f37 = f73, f68 + nop __LINE__ + } + ;; + { .mfi + LDFPD f90, f91 = [BOFFSET], 2 * SIZE + FMPY f34 = f72, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f72, f70 + nop __LINE__ + } + ;; + { .mfi + LDFPD f92, f93 = [BOFFSET], 2 * SIZE + FMPY f35 = f73, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f73, f70 + nop __LINE__ + } + ;; + { .mfi + LDFPD f94, f95 = [BOFFSET] + FMA_C f64 = f73, f65, f32 + adds BOFFSET = 6 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA_C f68 = f73, f69, f36 + nop __LINE__ + } + ;; + { .mfi + LDFPD f108, f109 = [BOFFSET], 2 * SIZE + FMA_D f65 = f72, f65, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f69 = f72, f69, f37 
+ nop __LINE__ + } + ;; + { .mfi + LDFPD f110, f111 = [BOFFSET] + FMA_C f66 = f73, f67, f34 + adds BOFFSET = 8 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA_C f70 = f73, f71, f38 + nop __LINE__ + } + ;; + { .mfi + LDFPD f126, f127 = [BOFFSET] + FMA_D f67 = f72, f67, f35 + adds BOFFSET = - 30 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA_D f71 = f72, f71, f39 + adds AOFFSET2 = 4 * SIZE, AOFFSET + } + ;; + { .mfi + STFD [AOFFSET] = f64, SIZE + FNMA f80 = f74, f64, f80 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f68, SIZE + FNMA f84 = f74, f68, f84 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f65, SIZE + FMA_A f81 = f75, f64, f81 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f69, SIZE + FMA_A f85 = f75, f68, f85 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f66, SIZE + FNMA f82 = f74, f66, f82 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f70, SIZE + FNMA f86 = f74, f70, f86 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f67, 5 * SIZE + FMA_A f83 = f75, f66, f83 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f71, 5 * SIZE + FMA_A f87 = f75, f70, f87 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f64, SIZE + FMA_B f80 = f75, f65, f80 + nop __LINE__ + } + { .mfi + STFD [C5 ] = f68, SIZE + FMA_B f84 = f75, f69, f84 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f65, SIZE + FNMA f81 = f74, f65, f81 + nop __LINE__ + } + { .mfi + STFD [C5 ] = f69, SIZE + FNMA f85 = f74, f69, f85 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f66, SIZE + FMA_B f82 = f75, f67, f82 + nop __LINE__ + } + { .mfi + STFD [C5 ] = f70, SIZE + FMA_B f86 = f75, f71, f86 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f67, 5 * SIZE + FNMA f83 = f74, f67, f83 + nop __LINE__ + } + { .mfi + STFD [C5 ] = f71, 5 * SIZE + FNMA f87 = f74, f71, f87 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f96 = f76, f64, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f100 = f76, f68, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f97 = f77, f64, f97 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f101 = f77, f68, f101 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f98 = f76, f66, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f102 = f76, f70, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f99 = f77, f66, f99 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f103 = f77, f70, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f96 = f77, f65, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f100 = f77, f69, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f97 = f76, f65, f97 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f101 = f76, f69, f101 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f98 = f77, f67, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f102 = f77, f71, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f99 = f76, f67, f99 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f103 = f76, f71, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f112 = f78, f64, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f116 = f78, f68, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f113 = f79, f64, f113 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f117 = f79, f68, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f114 = f78, f66, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f118 = f78, f70, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f115 = f79, f66, f115 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f119 = f79, f70, f119 
+ nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f112 = f79, f65, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f116 = f79, f69, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f113 = f78, f65, f113 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f117 = f78, f69, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f114 = f79, f67, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f118 = f79, f71, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f115 = f78, f67, f115 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f119 = f78, f71, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f90, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f90, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f33 = f91, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f91, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f34 = f90, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f90, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f35 = f91, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f91, f86 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f80 = f91, f81, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f84 = f91, f85, f36 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f81 = f90, f81, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f85 = f90, f85, f37 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f82 = f91, f83, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f86 = f91, f87, f38 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f83 = f90, f83, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f87 = f90, f87, f39 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f80, SIZE + FNMA f96 = f92, f80, f96 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f84, SIZE + FNMA f100 = f92, f84, f100 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f81, SIZE + FMA_A f97 = f93, f80, f97 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f85, SIZE + FMA_A f101 = f93, f84, f101 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f82, SIZE + FNMA f98 = f92, f82, f98 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f86, SIZE + FNMA f102 = f92, f86, f102 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f83, 5 * SIZE + FMA_A f99 = f93, f82, f99 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f87, 5 * SIZE + FMA_A f103 = f93, f86, f103 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f80, SIZE + FMA_B f96 = f93, f81, f96 + nop __LINE__ + } + { .mfi + STFD [C6 ] = f84, SIZE + FMA_B f100 = f93, f85, f100 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f81, SIZE + FNMA f97 = f92, f81, f97 + nop __LINE__ + } + { .mfi + STFD [C6 ] = f85, SIZE + FNMA f101 = f92, f85, f101 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f82, SIZE + FMA_B f98 = f93, f83, f98 + nop __LINE__ + } + { .mfi + STFD [C6 ] = f86, SIZE + FMA_B f102 = f93, f87, f102 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f83, 5 * SIZE + FNMA f99 = f92, f83, f99 + nop __LINE__ + } + { .mfi + STFD [C6 ] = f87, 5 * SIZE + FNMA f103 = f92, f87, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f112 = f94, f80, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f116 = f94, f84, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f113 = f95, f80, f113 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f117 = f95, f84, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f114 = f94, f82, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f118 = f94, f86, f118 + nop 
__LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f115 = f95, f82, f115 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f119 = f95, f86, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f112 = f95, f81, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f116 = f95, f85, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f113 = f94, f81, f113 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f117 = f94, f85, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f114 = f95, f83, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f118 = f95, f87, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f115 = f94, f83, f115 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f119 = f94, f87, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f108, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f108, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f109, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f109, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f108, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f108, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f109, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f109, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f96 = f109, f97, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f100 = f109, f101, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f97 = f108, f97, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f101 = f108, f101, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f98 = f109, f99, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f102 = f109, f103, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f99 = f108, f99, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f103 = f108, f103, f39 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f96, SIZE + FNMA f112 = f110, f96, f112 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f100, SIZE + FNMA f116 = f110, f100, f116 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f97, SIZE + FMA_A f113 = f111, f96, f113 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f101, SIZE + FMA_A f117 = f111, f100, f117 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f98, SIZE + FNMA f114 = f110, f98, f114 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f102, SIZE + FNMA f118 = f110, f102, f118 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f99, 5 * SIZE + FMA_A f115 = f111, f98, f115 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f103, 5 * SIZE + FMA_A f119 = f111, f102, f119 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f96, SIZE + FMA_B f112 = f111, f97, f112 + nop __LINE__ + } + { .mfi + STFD [C7 ] = f100, SIZE + FMA_B f116 = f111, f101, f116 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f97, SIZE + FNMA f113 = f110, f97, f113 + nop __LINE__ + } + { .mfi + STFD [C7 ] = f101, SIZE + FNMA f117 = f110, f101, f117 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f98, SIZE + FMA_B f114 = f111, f99, f114 + nop __LINE__ + } + { .mfi + STFD [C7 ] = f102, SIZE + FMA_B f118 = f111, f103, f118 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f99, 5 * SIZE + FNMA f115 = f110, f99, f115 + nop __LINE__ + } + { .mfi + STFD [C7 ] = f103, 5 * SIZE + FNMA f119 = f110, f103, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f126, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f126, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f127, 
f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f127, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f126, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f126, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f127, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f127, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f112 = f127, f113, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f116 = f127, f117, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f113 = f126, f113, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f117 = f126, f117, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f114 = f127, f115, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f118 = f127, f119, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f115 = f126, f115, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f119 = f126, f119, f39 + nop __LINE__ + } + ;; + { .mmi + STFD [AOFFSET] = f112, SIZE + STFD [AOFFSET2] = f116, SIZE + sub r2 = K, KK + } + ;; + { .mmi + STFD [AOFFSET] = f113, SIZE + STFD [AOFFSET2] = f117, SIZE + mov L = KK + } + ;; + { .mmi + STFD [AOFFSET] = f114, SIZE + STFD [AOFFSET2] = f118, SIZE + shladd r2 = r2, ZBASE_SHIFT, r0 + } + ;; + { .mmi + STFD [AOFFSET] = f115, -27 * SIZE + STFD [AOFFSET2] = f119 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f112, SIZE + mov f64 = f0 + shladd BOFFSET = r2, 2, BOFFSET + } + { .mfi + STFD [C8 ] = f116, SIZE + mov f65 = f0 + shladd AOFFSET = r2, 2, AOFFSET + } + ;; + { .mfi + STFD [C4 ] = f113, SIZE + mov f80 = f0 + cmp.ne p6, p0 = 1, I + } + { .mfi + STFD [C8 ] = f117, SIZE + mov f81 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f114, SIZE + mov f96 = f0 + adds I = -1, I + } + { .mfi + STFD [C8 ] = f118, SIZE + mov f97 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f115, 5 * SIZE + mov f112 = f0 + nop __LINE__ + } + { .mfb + STFD [C8 ] = f119, 5 * SIZE + mov f113 = f0 + (p6) br.cond.dptk .L011 + } +#endif + +#ifdef RT + { .mfi + LDFPD f76, f77 = [BOFFSET] + FMPY f32 = f72, f112 + adds BOFFSET = - 2 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMPY f36 = f72, f116 + nop __LINE__ + } + ;; + { .mfi + LDFPD f78, f79 = [BOFFSET] + FMPY f33 = f73, f112 + adds BOFFSET = - 4 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMPY f37 = f73, f116 + nop __LINE__ + } + ;; + { .mfi + LDFPD f88, f89 = [BOFFSET] + FMPY f34 = f72, f114 + adds BOFFSET = - 2 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMPY f38 = f72, f118 + nop __LINE__ + } + ;; + { .mfi + LDFPD f90, f91 = [BOFFSET] + FMPY f35 = f73, f114 + adds BOFFSET = - 2 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMPY f39 = f73, f118 + nop __LINE__ + } + ;; + { .mfi + LDFPD f92, f93 = [BOFFSET] + FMA_C f112 = f73, f113, f32 + adds BOFFSET = - 6 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA_C f116 = f73, f117, f36 + nop __LINE__ + } + ;; + { .mfi + LDFPD f104, f105 = [BOFFSET] + FMA_D f113 = f72, f113, f33 + adds BOFFSET = - 2 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA_D f117 = f72, f117, f37 + nop __LINE__ + } + ;; + { .mfi + LDFPD f106, f107 = [BOFFSET] + FMA_C f114 = f73, f115, f34 + adds BOFFSET = - 8 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA_C f118 = f73, f119, f38 + nop __LINE__ + } + ;; + { .mfi + LDFPD f120, f121 = [BOFFSET] + FMA_D f115 = f72, f115, f35 + adds AOFFSET2 = 28 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMA_D f119 = f72, f119, f39 + adds AOFFSET = 24 * SIZE, AOFFSET + } + ;; + { .mfi + STFD [AOFFSET] = f112, SIZE + FNMA f96 
= f74, f112, f96 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f116, SIZE + FNMA f100 = f74, f116, f100 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f113, SIZE + FMA_A f97 = f75, f112, f97 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f117, SIZE + FMA_A f101 = f75, f116, f101 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f114, SIZE + FNMA f98 = f74, f114, f98 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f118, SIZE + FNMA f102 = f74, f118, f102 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f115, -11 * SIZE + FMA_A f99 = f75, f114, f99 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f119, -11 * SIZE + FMA_A f103 = f75, f118, f103 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f112, SIZE + FMA_B f96 = f75, f113, f96 + nop __LINE__ + } + { .mfi + STFD [C8 ] = f116, SIZE + FMA_B f100 = f75, f117, f100 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f113, SIZE + FNMA f97 = f74, f113, f97 + nop __LINE__ + } + { .mfi + STFD [C8 ] = f117, SIZE + FNMA f101 = f74, f117, f101 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f114, SIZE + FMA_B f98 = f75, f115, f98 + nop __LINE__ + } + { .mfi + STFD [C8 ] = f118, SIZE + FMA_B f102 = f75, f119, f102 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f115, 5 * SIZE + FNMA f99 = f74, f115, f99 + nop __LINE__ + } + { .mfi + STFD [C8 ] = f119, 5 * SIZE + FNMA f103 = f74, f119, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f80 = f76, f112, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f84 = f76, f116, f84 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f81 = f77, f112, f81 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f85 = f77, f116, f85 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f82 = f76, f114, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f86 = f76, f118, f86 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f83 = f77, f114, f83 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f87 = f77, f118, f87 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f80 = f77, f113, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f84 = f77, f117, f84 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f81 = f76, f113, f81 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f85 = f76, f117, f85 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f82 = f77, f115, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f86 = f77, f119, f86 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f83 = f76, f115, f83 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f87 = f76, f119, f87 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f64 = f78, f112, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f68 = f78, f116, f68 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f65 = f79, f112, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f69 = f79, f116, f69 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f66 = f78, f114, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f70 = f78, f118, f70 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f67 = f79, f114, f67 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f71 = f79, f118, f71 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f64 = f79, f113, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f68 = f79, f117, f68 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f65 = f78, f113, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f69 = f78, f117, f69 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f66 = f79, f115, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + 
FMA_B f70 = f79, f119, f70 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f67 = f78, f115, f67 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f71 = f78, f119, f71 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f88, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f88, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f89, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f89, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f88, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f88, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f89, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f89, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f96 = f89, f97, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f100 = f89, f101, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f97 = f88, f97, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f101 = f88, f101, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f98 = f89, f99, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f102 = f89, f103, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f99 = f88, f99, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f103 = f88, f103, f39 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f96, SIZE + FNMA f80 = f90, f96, f80 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f100, SIZE + FNMA f84 = f90, f100, f84 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f97, SIZE + FMA_A f81 = f91, f96, f81 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f101, SIZE + FMA_A f85 = f91, f100, f85 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f98, SIZE + FNMA f82 = f90, f98, f82 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f102, SIZE + FNMA f86 = f90, f102, f86 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f99, -11 * SIZE + FMA_A f83 = f91, f98, f83 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f103, -11 * SIZE + FMA_A f87 = f91, f102, f87 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f96, SIZE + FMA_B f80 = f91, f97, f80 + nop __LINE__ + } + { .mfi + STFD [C7 ] = f100, SIZE + FMA_B f84 = f91, f101, f84 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f97, SIZE + FNMA f81 = f90, f97, f81 + nop __LINE__ + } + { .mfi + STFD [C7 ] = f101, SIZE + FNMA f85 = f90, f101, f85 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f98, SIZE + FMA_B f82 = f91, f99, f82 + nop __LINE__ + } + { .mfi + STFD [C7 ] = f102, SIZE + FMA_B f86 = f91, f103, f86 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f99, 5 * SIZE + FNMA f83 = f90, f99, f83 + nop __LINE__ + } + { .mfi + STFD [C7 ] = f103, 5 * SIZE + FNMA f87 = f90, f103, f87 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f64 = f92, f96, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f68 = f92, f100, f68 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f65 = f93, f96, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f69 = f93, f100, f69 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f66 = f92, f98, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f70 = f92, f102, f70 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f67 = f93, f98, f67 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f71 = f93, f102, f71 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f64 = f93, f97, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f68 = f93, f101, f68 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f65 = f92, f97, f65 + nop __LINE__ + } + { .mfi + 
nop __LINE__ + FNMA f69 = f92, f101, f69 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f66 = f93, f99, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f70 = f93, f103, f70 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f67 = f92, f99, f67 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f71 = f92, f103, f71 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f104, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f104, f84 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f105, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f105, f84 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f104, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f104, f86 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f105, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f105, f86 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f80 = f105, f81, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f84 = f105, f85, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f81 = f104, f81, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f85 = f104, f85, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f82 = f105, f83, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f86 = f105, f87, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f83 = f104, f83, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f87 = f104, f87, f39 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f80, SIZE + FNMA f64 = f106, f80, f64 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f84, SIZE + FNMA f68 = f106, f84, f68 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f81, SIZE + FMA_A f65 = f107, f80, f65 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f85, SIZE + FMA_A f69 = f107, f84, f69 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f82, SIZE + FNMA f66 = f106, f82, f66 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f86, SIZE + FNMA f70 = f106, f86, f70 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f83, -11 * SIZE + FMA_A f67 = f107, f82, f67 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f87, -11 * SIZE + FMA_A f71 = f107, f86, f71 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f80, SIZE + FMA_B f64 = f107, f81, f64 + nop __LINE__ + } + { .mfi + STFD [C6 ] = f84, SIZE + FMA_B f68 = f107, f85, f68 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f81, SIZE + FNMA f65 = f106, f81, f65 + nop __LINE__ + } + { .mfi + STFD [C6 ] = f85, SIZE + FNMA f69 = f106, f85, f69 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f82, SIZE + FMA_B f66 = f107, f83, f66 + nop __LINE__ + } + { .mfi + STFD [C6 ] = f86, SIZE + FMA_B f70 = f107, f87, f70 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f83, 5 * SIZE + FNMA f67 = f106, f83, f67 + nop __LINE__ + } + { .mfi + STFD [C6 ] = f87, 5 * SIZE + FNMA f71 = f106, f87, f71 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f120, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f120, f68 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f121, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f121, f68 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f120, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f120, f70 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f121, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f121, f70 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f64 = f121, f65, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C 
f68 = f121, f69, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f65 = f120, f65, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f69 = f120, f69, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f66 = f121, f67, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f70 = f121, f71, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f67 = f120, f67, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f71 = f120, f71, f39 + nop __LINE__ + } + ;; + { .mmi + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f68, SIZE + shladd r2 = K, ZBASE_SHIFT, r0 + } + ;; + { .mmi + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f69, SIZE + shladd AORIG = r2, 2, AORIG + } + ;; + { .mmi + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f70, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [AOFFSET] = f67, -3 * SIZE + STFD [AOFFSET2] = f71 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f64, SIZE + mov f64 = f0 + cmp.ne p6, p0 = 1, I + } + { .mfi + STFD [C5 ] = f68, SIZE + mov f81 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f65, SIZE + mov f65 = f0 + nop __LINE__ + } + { .mfi + STFD [C5 ] = f69, SIZE + mov f96 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f66, SIZE + mov f80 = f0 + sub L = K, KK + } + { .mfi + STFD [C5 ] = f70, SIZE + mov f97 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f67, 5 * SIZE + mov f112 = f0 + adds I = -1, I + } + { .mfb + STFD [C5 ] = f71, 5 * SIZE + mov f113 = f0 + (p6) br.cond.dptk .L011 + } + ;; +#endif + +.L020: + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + tbit.z p6, p7 = M, 1 + (p6) br.cond.dptk .L030 + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 1 + ZBASE_SHIFT + } + { .mmi + shladd r3 = KK, ZBASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f66 = f0 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f67 = f0 + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 2, B + mov f66 = f0 +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f67 = f0 + shladd AOFFSET = r3, 1, AORIG + } + ;; +#endif + ;; + adds L = 1, L + ;; + { .mfi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f82 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f83 = f0 + shr L = L, 1 + } + ;; + { .mfi + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + mov f98 = f0 + adds L = -1, L + } + { .mfi + (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + mov f99 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + mov f114 = f0 + mov ar.lc = L + } + { .mfi + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + mov f115 = f0 + nop __LINE__ + } + ;; + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L028 + ;; + .align 16 + +.L022: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA_B f65 = f32, f49, f65 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA_B f81 = f32, f51, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f97 = f32, f53, f97 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb 
+ (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f113 = f32, f55, f113 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f64 = f33, f49, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f80 = f33, f51, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f96 = f33, f53, f96 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f112 = f33, f55, f112 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f67 = f34, f49, f67 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f83 = f34, f51, f83 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f98 = f34, f52, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f99 = f34, f53, f99 // A3 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f114 = f34, f54, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f115 = f34, f55, f115 // A3 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f66 = f35, f49, f66 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f82 = f35, f51, f82 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f99 = f35, f52, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f98 = f35, f53, f98 // A4 * B6 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f115 = f35, f54, f115 // A4 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f114 = f35, f55, f114 // A4 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f97 = f40, f61, f97 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f113 = f40, f63, f113 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 + nop __LINE__ + } + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + 
(p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f96 = f41, f61, f96 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f112 = f41, f63, f112 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f67 = f42, f57, f67 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f83 = f42, f59, f83 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f98 = f42, f60, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f99 = f42, f61, f99 // A3 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f114 = f42, f62, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f115 = f42, f63, f115 // A3 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f66 = f43, f57, f66 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f82 = f43, f59, f82 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f99 = f43, f60, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f98 = f43, f61, f98 // A4 * B6 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f115 = f43, f62, f115 // A4 * B7 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA_A f114 = f43, f63, f114 // A4 * B8 + br.cloop.sptk.few .L022 + } + ;; +.L028: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -2, KK +#else + adds r2 = -4, KK +#endif + ;; + shladd r2 = r2, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 1, AORIG + shladd BOFFSET = r2, 2, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [BOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [BOFFSET], 2 * SIZE + ;; + LDFPD f104, f105 = [BOFFSET], 2 * SIZE + ;; + LDFPD f106, f107 = [BOFFSET], 2 * SIZE + ;; + { .mfi + LDFPD f120, f121 = [BOFFSET], 2 * SIZE + FSUB f64 = f72, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f65 = f73, f65 + nop __LINE__ + } + ;; + { .mfi + LDFPD f122, f123 = [BOFFSET] + FSUB f80 = f74, f80 + adds BOFFSET = -14 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FSUB_A f81 = f75, f81 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f96 = f88, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f97 = f89, f97 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f112 = f90, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f113 = f91, f113 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f66 = f104, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f67 = f105, f67 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f82 = f106, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f83 = f107, f83 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f98 = f120, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f99 = f121, f99 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f114 = f122, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f115 = f123, f115 + nop __LINE__ + } + ;; +#else + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET], 2 * 
SIZE + ;; + LDFPD f88, f89 = [AOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [AOFFSET], 2 * SIZE + ;; + LDFPD f104, f105 = [AOFFSET], 2 * SIZE + ;; + LDFPD f106, f107 = [AOFFSET], 2 * SIZE + ;; + { .mfi + LDFPD f120, f121 = [AOFFSET], 2 * SIZE + FSUB f64 = f72, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f65 = f73, f65 + nop __LINE__ + } + ;; + { .mfi + LDFPD f122, f123 = [AOFFSET] + FSUB f66 = f74, f66 + adds AOFFSET = -14 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FSUB f67 = f75, f67 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f80 = f88, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f81 = f89, f81 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f82 = f90, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f83 = f91, f83 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f96 = f104, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f97 = f105, f97 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f98 = f106, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f99 = f107, f99 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f112 = f120, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f113 = f121, f113 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f114 = f122, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f115 = f123, f115 + nop __LINE__ + } + ;; +#endif + +#ifdef LN + adds AOFFSET = 6 * SIZE, AOFFSET + ;; + LDFPD f104, f105 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f106, f107 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f120, f121 = [AOFFSET] + ;; + FMPY f32 = f104, f66 + FMPY f33 = f105, f66 + FMPY f34 = f104, f82 + FMPY f35 = f105, f82 + FMPY f36 = f104, f98 + FMPY f37 = f105, f98 + FMPY f38 = f104, f114 + FMPY f39 = f105, f114 + ;; + FMA_C f66 = f105, f67, f32 + FMA_D f67 = f104, f67, f33 + FMA_C f82 = f105, f83, f34 + FMA_D f83 = f104, f83, f35 + FMA_C f98 = f105, f99, f36 + FMA_D f99 = f104, f99, f37 + FMA_C f114 = f105, f115, f38 + FMA_D f115 = f104, f115, f39 + ;; + FNMA f64 = f106, f66, f64 + FMA_A f65 = f107, f66, f65 + FNMA f80 = f106, f82, f80 + FMA_A f81 = f107, f82, f81 + FNMA f96 = f106, f98, f96 + FMA_A f97 = f107, f98, f97 + FNMA f112 = f106, f114, f112 + FMA_A f113 = f107, f114, f113 + ;; + FMA_B f64 = f107, f67, f64 + FNMA f65 = f106, f67, f65 + FMA_B f80 = f107, f83, f80 + FNMA f81 = f106, f83, f81 + FMA_B f96 = f107, f99, f96 + FNMA f97 = f106, f99, f97 + FMA_B f112 = f107, f115, f112 + FNMA f113 = f106, f115, f113 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + FMPY f34 = f120, f80 + FMPY f35 = f121, f80 + FMPY f36 = f120, f96 + FMPY f37 = f121, f96 + FMPY f38 = f120, f112 + FMPY f39 = f121, f112 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + FMA_C f80 = f121, f81, f34 + FMA_D f81 = f120, f81, f35 + FMA_C f96 = f121, f97, f36 + FMA_D f97 = f120, f97, f37 + FMA_C f112 = f121, f113, f38 + FMA_D f113 = f120, f113, f39 + ;; +#endif + +#ifdef LT + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f90, f91 = [AOFFSET] + adds AOFFSET = - 6 * SIZE, AOFFSET + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f34 = f72, f80 + FMPY f35 = f73, f80 + FMPY f36 = f72, f96 + FMPY f37 = f73, f96 + FMPY f38 = f72, f112 + FMPY f39 = f73, f112 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f80 = f73, f81, f34 + FMA_D f81 = f72, f81, f35 + FMA_C f96 = f73, f97, f36 + FMA_D f97 = f72, f97, f37 + FMA_C f112 = f73, f113, f38 + FMA_D f113 = f72, f113, f39 + 
;; + FNMA f66 = f74, f64, f66 + FMA_A f67 = f75, f64, f67 + FNMA f82 = f74, f80, f82 + FMA_A f83 = f75, f80, f83 + FNMA f98 = f74, f96, f98 + FMA_A f99 = f75, f96, f99 + FNMA f114 = f74, f112, f114 + FMA_A f115 = f75, f112, f115 + ;; + FMA_B f66 = f75, f65, f66 + FNMA f67 = f74, f65, f67 + FMA_B f82 = f75, f81, f82 + FNMA f83 = f74, f81, f83 + FMA_B f98 = f75, f97, f98 + FNMA f99 = f74, f97, f99 + FMA_B f114 = f75, f113, f114 + FNMA f115 = f74, f113, f115 + ;; + FMPY f32 = f90, f66 + FMPY f33 = f91, f66 + FMPY f34 = f90, f82 + FMPY f35 = f91, f82 + FMPY f36 = f90, f98 + FMPY f37 = f91, f98 + FMPY f38 = f90, f114 + FMPY f39 = f91, f114 + ;; + FMA_C f66 = f91, f67, f32 + FMA_D f67 = f90, f67, f33 + FMA_C f82 = f91, f83, f34 + FMA_D f83 = f90, f83, f35 + FMA_C f98 = f91, f99, f36 + FMA_D f99 = f90, f99, f37 + FMA_C f114 = f91, f115, f38 + FMA_D f115 = f90, f115, f39 + ;; +#endif + +#ifdef RN + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET], 2 * SIZE + ;; + LDFPD f76, f77 = [BOFFSET], 2 * SIZE + ;; + LDFPD f78, f79 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f90, f91 = [BOFFSET], 2 * SIZE + ;; + LDFPD f92, f93 = [BOFFSET], 2 * SIZE + ;; + LDFPD f94, f95 = [BOFFSET] + adds BOFFSET = 6 * SIZE, BOFFSET + ;; + LDFPD f108, f109 = [BOFFSET], 2 * SIZE + ;; + LDFPD f110, f111 = [BOFFSET] + adds BOFFSET = 8 * SIZE, BOFFSET + ;; + LDFPD f126, f127 = [BOFFSET] + adds BOFFSET = - 30 * SIZE, BOFFSET + ;; + + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f34 = f72, f66 + FMPY f35 = f73, f66 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f66 = f73, f67, f34 + FMA_D f67 = f72, f67, f35 + ;; + FNMA f80 = f74, f64, f80 + FMA_A f81 = f75, f64, f81 + FNMA f82 = f74, f66, f82 + FMA_A f83 = f75, f66, f83 + ;; + FMA_B f80 = f75, f65, f80 + FNMA f81 = f74, f65, f81 + FMA_B f82 = f75, f67, f82 + FNMA f83 = f74, f67, f83 + ;; + FNMA f96 = f76, f64, f96 + FMA_A f97 = f77, f64, f97 + FNMA f98 = f76, f66, f98 + FMA_A f99 = f77, f66, f99 + ;; + FMA_B f96 = f77, f65, f96 + FNMA f97 = f76, f65, f97 + FMA_B f98 = f77, f67, f98 + FNMA f99 = f76, f67, f99 + ;; + FNMA f112 = f78, f64, f112 + FMA_A f113 = f79, f64, f113 + FNMA f114 = f78, f66, f114 + FMA_A f115 = f79, f66, f115 + ;; + FMA_B f112 = f79, f65, f112 + FNMA f113 = f78, f65, f113 + FMA_B f114 = f79, f67, f114 + FNMA f115 = f78, f67, f115 + ;; + FMPY f32 = f90, f80 + FMPY f33 = f91, f80 + FMPY f34 = f90, f82 + FMPY f35 = f91, f82 + ;; + FMA_C f80 = f91, f81, f32 + FMA_D f81 = f90, f81, f33 + FMA_C f82 = f91, f83, f34 + FMA_D f83 = f90, f83, f35 + ;; + + FNMA f96 = f92, f80, f96 + FMA_A f97 = f93, f80, f97 + FNMA f98 = f92, f82, f98 + FMA_A f99 = f93, f82, f99 + ;; + FMA_B f96 = f93, f81, f96 + FNMA f97 = f92, f81, f97 + FMA_B f98 = f93, f83, f98 + FNMA f99 = f92, f83, f99 + ;; + FNMA f112 = f94, f80, f112 + FMA_A f113 = f95, f80, f113 + FNMA f114 = f94, f82, f114 + FMA_A f115 = f95, f82, f115 + ;; + FMA_B f112 = f95, f81, f112 + FNMA f113 = f94, f81, f113 + FMA_B f114 = f95, f83, f114 + FNMA f115 = f94, f83, f115 + ;; + FMPY f32 = f108, f96 + FMPY f33 = f109, f96 + FMPY f34 = f108, f98 + FMPY f35 = f109, f98 + ;; + FMA_C f96 = f109, f97, f32 + FMA_D f97 = f108, f97, f33 + FMA_C f98 = f109, f99, f34 + FMA_D f99 = f108, f99, f35 + ;; + FNMA f112 = f110, f96, f112 + FMA_A f113 = f111, f96, f113 + FNMA f114 = f110, f98, f114 + FMA_A f115 = f111, f98, f115 + ;; + FMA_B f112 = f111, f97, f112 + FNMA f113 = f110, f97, f113 + FMA_B f114 = f111, f99, f114 + FNMA f115 = f110, f99, f115 + ;; + FMPY f32 = f126, f112 + 
FMPY f33 = f127, f112 + FMPY f34 = f126, f114 + FMPY f35 = f127, f114 + ;; + FMA_C f112 = f127, f113, f32 + FMA_D f113 = f126, f113, f33 + FMA_C f114 = f127, f115, f34 + FMA_D f115 = f126, f115, f35 + ;; +#endif + +#ifdef RT + adds BOFFSET = 30 * SIZE, BOFFSET + ;; + LDFPD f72, f73 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f74, f75 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f76, f77 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f78, f79 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f88, f89 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f90, f91 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f92, f93 = [BOFFSET] + adds BOFFSET = - 6 * SIZE, BOFFSET + ;; + LDFPD f104, f105 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f106, f107 = [BOFFSET] + adds BOFFSET = - 8 * SIZE, BOFFSET + ;; + LDFPD f120, f121 = [BOFFSET] + ;; + FMPY f32 = f72, f112 + FMPY f33 = f73, f112 + FMPY f34 = f72, f114 + FMPY f35 = f73, f114 + ;; + FMA_C f112 = f73, f113, f32 + FMA_D f113 = f72, f113, f33 + FMA_C f114 = f73, f115, f34 + FMA_D f115 = f72, f115, f35 + ;; + FNMA f96 = f74, f112, f96 + FMA_A f97 = f75, f112, f97 + FNMA f98 = f74, f114, f98 + FMA_A f99 = f75, f114, f99 + ;; + FMA_B f96 = f75, f113, f96 + FNMA f97 = f74, f113, f97 + FMA_B f98 = f75, f115, f98 + FNMA f99 = f74, f115, f99 + ;; + FNMA f80 = f76, f112, f80 + FMA_A f81 = f77, f112, f81 + FNMA f82 = f76, f114, f82 + FMA_A f83 = f77, f114, f83 + ;; + FMA_B f80 = f77, f113, f80 + FNMA f81 = f76, f113, f81 + FMA_B f82 = f77, f115, f82 + FNMA f83 = f76, f115, f83 + ;; + FNMA f64 = f78, f112, f64 + FMA_A f65 = f79, f112, f65 + FNMA f66 = f78, f114, f66 + FMA_A f67 = f79, f114, f67 + ;; + FMA_B f64 = f79, f113, f64 + FNMA f65 = f78, f113, f65 + FMA_B f66 = f79, f115, f66 + FNMA f67 = f78, f115, f67 + ;; + FMPY f32 = f88, f96 + FMPY f33 = f89, f96 + FMPY f34 = f88, f98 + FMPY f35 = f89, f98 + ;; + FMA_C f96 = f89, f97, f32 + FMA_D f97 = f88, f97, f33 + FMA_C f98 = f89, f99, f34 + FMA_D f99 = f88, f99, f35 + ;; + FNMA f80 = f90, f96, f80 + FMA_A f81 = f91, f96, f81 + FNMA f82 = f90, f98, f82 + FMA_A f83 = f91, f98, f83 + ;; + FMA_B f80 = f91, f97, f80 + FNMA f81 = f90, f97, f81 + FMA_B f82 = f91, f99, f82 + FNMA f83 = f90, f99, f83 + ;; + FNMA f64 = f92, f96, f64 + FMA_A f65 = f93, f96, f65 + FNMA f66 = f92, f98, f66 + FMA_A f67 = f93, f98, f67 + ;; + FMA_B f64 = f93, f97, f64 + FNMA f65 = f92, f97, f65 + FMA_B f66 = f93, f99, f66 + FNMA f67 = f92, f99, f67 + ;; + FMPY f32 = f104, f80 + FMPY f33 = f105, f80 + FMPY f34 = f104, f82 + FMPY f35 = f105, f82 + ;; + FMA_C f80 = f105, f81, f32 + FMA_D f81 = f104, f81, f33 + FMA_C f82 = f105, f83, f34 + FMA_D f83 = f104, f83, f35 + ;; + FNMA f64 = f106, f80, f64 + FMA_A f65 = f107, f80, f65 + FNMA f66 = f106, f82, f66 + FMA_A f67 = f107, f82, f67 + ;; + FMA_B f64 = f107, f81, f64 + FNMA f65 = f106, f81, f65 + FMA_B f66 = f107, f83, f66 + FNMA f67 = f106, f83, f67 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + FMPY f34 = f120, f66 + FMPY f35 = f121, f66 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + FMA_C f66 = f121, f67, f34 + FMA_D f67 = f120, f67, f35 + ;; +#endif + +#if defined(LN) || defined(LT) + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f96, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f97, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f112, SIZE + ;; + STFD [BOFFSET] = f81, 5 * SIZE + STFD [BOFFSET2] = 
f113, 5 * SIZE + ;; + STFD [BOFFSET] = f66, SIZE + STFD [BOFFSET2] = f98, SIZE + ;; + STFD [BOFFSET] = f67, SIZE + STFD [BOFFSET2] = f99, SIZE + ;; + STFD [BOFFSET] = f82, SIZE + STFD [BOFFSET2] = f114, SIZE + ;; + STFD [BOFFSET] = f83, 5 * SIZE + STFD [BOFFSET2] = f115, 5 * SIZE + ;; + adds BOFFSET = - 16 * SIZE, BOFFSET + ;; +#else + adds AOFFSET2 = 4 * SIZE, AOFFSET + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f80, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f81, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f82, SIZE + ;; + STFD [AOFFSET] = f67, 5 * SIZE + STFD [AOFFSET2] = f83, 5 * SIZE + ;; + STFD [AOFFSET] = f96, SIZE + STFD [AOFFSET2] = f112, SIZE + ;; + STFD [AOFFSET] = f97, SIZE + STFD [AOFFSET2] = f113, SIZE + ;; + STFD [AOFFSET] = f98, SIZE + STFD [AOFFSET2] = f114, SIZE + ;; + STFD [AOFFSET] = f99, 5 * SIZE + STFD [AOFFSET2] = f115, 5 * SIZE + ;; + adds AOFFSET = - 16 * SIZE, AOFFSET + ;; +#endif + +#ifdef LN + adds C1 = -4 * SIZE, C1 + adds C2 = -4 * SIZE, C2 + adds C3 = -4 * SIZE, C3 + adds C4 = -4 * SIZE, C4 +#endif + ;; + STFD [C1 ] = f64, SIZE + ;; + STFD [C1 ] = f65, SIZE + ;; + STFD [C1 ] = f66, SIZE + ;; + STFD [C1 ] = f67, SIZE + ;; + STFD [C2 ] = f80, SIZE + ;; + STFD [C2 ] = f81, SIZE + ;; + STFD [C2 ] = f82, SIZE + ;; + STFD [C2 ] = f83, SIZE + ;; + + STFD [C3 ] = f96, SIZE + ;; + STFD [C3 ] = f97, SIZE + ;; + STFD [C3 ] = f98, SIZE + ;; + STFD [C3 ] = f99, SIZE + ;; + + STFD [C4 ] = f112, SIZE + ;; + STFD [C4 ] = f113, SIZE + ;; + STFD [C4 ] = f114, SIZE + ;; + STFD [C4 ] = f115, SIZE + ;; + mov f64 = f0 + mov f65 = f0 + mov f80 = f0 + mov f81 = f0 + mov f96 = f0 + mov f97 = f0 + mov f112 = f0 + mov f113 = f0 + ;; +#ifdef LN + adds C1 = -4 * SIZE, C1 + adds C2 = -4 * SIZE, C2 + adds C3 = -4 * SIZE, C3 + adds C4 = -4 * SIZE, C4 +#endif + ;; + cmp.ne p6, p0 = 1, I + ;; + adds I = -1, I + ;; + shladd r2 = K, ZBASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + shladd AORIG = r2, 1, AORIG +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = L, 1, AOFFSET + shladd BOFFSET = L, 2, BOFFSET +#endif + ;; +#ifdef LT + adds KK = 2, KK +#elif defined LN + adds KK = -2, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + .align 16 + +.L030: + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + tbit.z p6, p7 = M, 0 + (p6) br.cond.dptk .L049 + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, ZBASE_SHIFT + } + { .mmi + shladd r3 = KK, ZBASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f72 = f0 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f73 = f0 + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 2, B + mov f72 = f0 +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f73 = f0 + add AOFFSET = r3, AORIG + } + ;; +#endif + ;; + adds L = 1, L + ;; + + { .mmi + nop __LINE__ + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + tbit.z p12, p0 = L, 0 + } + ;; + { .mfi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f88 = f0 + shr L = L, 1 + } + { .mfi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f89 = f0 + nop __LINE__ + } + ;; + { .mfi + (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + mov f104 = f0 + adds L = -1, L + } + { .mfb + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + 
mov f105 = f0 + nop __LINE__ + } + ;; + { .mfi + (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + mov f120 = f0 + mov ar.lc = L + } + { .mfi + cmp.eq p3, p0 = r0, r0 + mov f121 = f0 + nop __LINE__ + } + ;; + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L038 + ;; + .align 16 + +.L032: + { .mfb + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f65 = f32, f49, f65 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA_B f81 = f32, f51, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f97 = f32, f53, f97 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f113 = f32, f55, f113 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f64 = f33, f49, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f80 = f33, f51, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f96 = f33, f53, f96 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f112 = f33, f55, f112 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f97 = f40, f61, f97 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f113 = f40, f63, f113 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f96 = f41, f61, f96 // A2 * B6 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA_A f112 = f41, f63, f112 // A2 * B8 + br.cloop.sptk.few .L032 + } + ;; +.L038: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -1, KK +#else + adds r2 = -4, KK +#endif + ;; + shladd r2 = r2, ZBASE_SHIFT, r0 + ;; + 
add AOFFSET = r2, AORIG + shladd BOFFSET = r2, 2, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [BOFFSET], 2 * SIZE + ;; + LDFPD f104, f105 = [BOFFSET], 2 * SIZE + ;; + LDFPD f120, f121 = [BOFFSET] + adds BOFFSET = -6 * SIZE, BOFFSET + ;; + FSUB f64 = f72, f64 + FSUB_A f65 = f73, f65 + FSUB f80 = f88, f80 + FSUB_A f81 = f89, f81 + FSUB f96 = f104, f96 + FSUB_A f97 = f105, f97 + FSUB f112 = f120, f112 + FSUB_A f113 = f121, f113 + ;; +#else + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [AOFFSET], 2 * SIZE + ;; + LDFPD f104, f105 = [AOFFSET], 2 * SIZE + ;; + LDFPD f120, f121 = [AOFFSET] + adds AOFFSET = -6 * SIZE, AOFFSET + ;; + FSUB f64 = f72, f64 + FSUB f65 = f73, f65 + FSUB f80 = f88, f80 + FSUB f81 = f89, f81 + FSUB f96 = f104, f96 + FSUB f97 = f105, f97 + FSUB f112 = f120, f112 + FSUB f113 = f121, f113 + ;; +#endif + +#ifdef LN + LDFPD f120, f121 = [AOFFSET] + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + FMPY f34 = f120, f80 + FMPY f35 = f121, f80 + FMPY f36 = f120, f96 + FMPY f37 = f121, f96 + FMPY f38 = f120, f112 + FMPY f39 = f121, f112 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + FMA_C f80 = f121, f81, f34 + FMA_D f81 = f120, f81, f35 + FMA_C f96 = f121, f97, f36 + FMA_D f97 = f120, f97, f37 + FMA_C f112 = f121, f113, f38 + FMA_D f113 = f120, f113, f39 + ;; +#endif + +#ifdef LT + LDFPD f90, f91 = [AOFFSET] + ;; + FMPY f32 = f90, f64 + FMPY f33 = f91, f64 + FMPY f34 = f90, f80 + FMPY f35 = f91, f80 + FMPY f36 = f90, f96 + FMPY f37 = f91, f96 + FMPY f38 = f90, f112 + FMPY f39 = f91, f112 + ;; + FMA_C f64 = f91, f65, f32 + FMA_D f65 = f90, f65, f33 + FMA_C f80 = f91, f81, f34 + FMA_D f81 = f90, f81, f35 + FMA_C f96 = f91, f97, f36 + FMA_D f97 = f90, f97, f37 + FMA_C f112 = f91, f113, f38 + FMA_D f113 = f90, f113, f39 + ;; +#endif + +#ifdef RN + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET], 2 * SIZE + ;; + LDFPD f76, f77 = [BOFFSET], 2 * SIZE + ;; + LDFPD f78, f79 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f90, f91 = [BOFFSET], 2 * SIZE + ;; + LDFPD f92, f93 = [BOFFSET], 2 * SIZE + ;; + LDFPD f94, f95 = [BOFFSET] + adds BOFFSET = 6 * SIZE, BOFFSET + ;; + LDFPD f108, f109 = [BOFFSET], 2 * SIZE + ;; + LDFPD f110, f111 = [BOFFSET] + adds BOFFSET = 8 * SIZE, BOFFSET + ;; + LDFPD f126, f127 = [BOFFSET] + adds BOFFSET = - 30 * SIZE, BOFFSET + ;; + + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + ;; + FNMA f80 = f74, f64, f80 + FMA_A f81 = f75, f64, f81 + ;; + FMA_B f80 = f75, f65, f80 + FNMA f81 = f74, f65, f81 + + ;; + FNMA f96 = f76, f64, f96 + FMA_A f97 = f77, f64, f97 + ;; + FMA_B f96 = f77, f65, f96 + FNMA f97 = f76, f65, f97 + ;; + FNMA f112 = f78, f64, f112 + FMA_A f113 = f79, f64, f113 + ;; + FMA_B f112 = f79, f65, f112 + FNMA f113 = f78, f65, f113 + ;; + FMPY f32 = f90, f80 + FMPY f33 = f91, f80 + ;; + FMA_C f80 = f91, f81, f32 + FMA_D f81 = f90, f81, f33 + ;; + + FNMA f96 = f92, f80, f96 + FMA_A f97 = f93, f80, f97 + ;; + FMA_B f96 = f93, f81, f96 + FNMA f97 = f92, f81, f97 + ;; + FNMA f112 = f94, f80, f112 + FMA_A f113 = f95, f80, f113 + ;; + FMA_B f112 = f95, f81, f112 + FNMA f113 = f94, f81, f113 + ;; + FMPY f32 = f108, f96 + FMPY f33 = f109, f96 + ;; + FMA_C f96 = f109, f97, f32 + FMA_D f97 = f108, f97, f33 + ;; + FNMA f112 = f110, f96, f112 + FMA_A f113 = f111, f96, f113 + ;; + FMA_B f112 = f111, f97, f112 + FNMA f113 = f110, f97, f113 + ;; + FMPY f32 = f126, f112 + FMPY 
f33 = f127, f112 + ;; + FMA_C f112 = f127, f113, f32 + FMA_D f113 = f126, f113, f33 + ;; +#endif + +#ifdef RT + adds BOFFSET = 30 * SIZE, BOFFSET + ;; + LDFPD f72, f73 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f74, f75 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f76, f77 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f78, f79 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f88, f89 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f90, f91 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f92, f93 = [BOFFSET] + adds BOFFSET = - 6 * SIZE, BOFFSET + ;; + LDFPD f104, f105 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f106, f107 = [BOFFSET] + adds BOFFSET = - 8 * SIZE, BOFFSET + ;; + LDFPD f120, f121 = [BOFFSET] + ;; + FMPY f32 = f72, f112 + FMPY f33 = f73, f112 + ;; + FMA_C f112 = f73, f113, f32 + FMA_D f113 = f72, f113, f33 + ;; + FNMA f96 = f74, f112, f96 + FMA_A f97 = f75, f112, f97 + ;; + FMA_B f96 = f75, f113, f96 + FNMA f97 = f74, f113, f97 + ;; + FNMA f80 = f76, f112, f80 + FMA_A f81 = f77, f112, f81 + ;; + FMA_B f80 = f77, f113, f80 + FNMA f81 = f76, f113, f81 + ;; + FNMA f64 = f78, f112, f64 + FMA_A f65 = f79, f112, f65 + ;; + FMA_B f64 = f79, f113, f64 + FNMA f65 = f78, f113, f65 + ;; + FMPY f32 = f88, f96 + FMPY f33 = f89, f96 + ;; + FMA_C f96 = f89, f97, f32 + FMA_D f97 = f88, f97, f33 + ;; + FNMA f80 = f90, f96, f80 + FMA_A f81 = f91, f96, f81 + ;; + FMA_B f80 = f91, f97, f80 + FNMA f81 = f90, f97, f81 + ;; + FNMA f64 = f92, f96, f64 + FMA_A f65 = f93, f96, f65 + ;; + FMA_B f64 = f93, f97, f64 + FNMA f65 = f92, f97, f65 + ;; + FMPY f32 = f104, f80 + FMPY f33 = f105, f80 + ;; + FMA_C f80 = f105, f81, f32 + FMA_D f81 = f104, f81, f33 + ;; + FNMA f64 = f106, f80, f64 + FMA_A f65 = f107, f80, f65 + ;; + FMA_B f64 = f107, f81, f64 + FNMA f65 = f106, f81, f65 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + ;; +#endif + +#if defined(LN) || defined(LT) + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f96, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f97, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f112, SIZE + ;; + STFD [BOFFSET] = f81, 5 * SIZE + STFD [BOFFSET2] = f113, 5 * SIZE + ;; + adds BOFFSET = - 8 * SIZE, BOFFSET + ;; +#else + adds AOFFSET2 = 4 * SIZE, AOFFSET + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f96, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f97, SIZE + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f112, SIZE + ;; + STFD [AOFFSET] = f81, 5 * SIZE + STFD [AOFFSET2] = f113, 5 * SIZE + ;; + adds AOFFSET = - 8 * SIZE, AOFFSET + ;; +#endif + +#ifdef LN + adds C1 = -2 * SIZE, C1 + adds C2 = -2 * SIZE, C2 + adds C3 = -2 * SIZE, C3 + adds C4 = -2 * SIZE, C4 +#endif + ;; + STFD [C1 ] = f64, SIZE + ;; + STFD [C1 ] = f65, SIZE + ;; + STFD [C2 ] = f80, SIZE + ;; + STFD [C2 ] = f81, SIZE + ;; + STFD [C3 ] = f96, SIZE + ;; + STFD [C3 ] = f97, SIZE + ;; + STFD [C4 ] = f112, SIZE + ;; + STFD [C4 ] = f113, SIZE + ;; + mov f64 = f0 + mov f65 = f0 + mov f80 = f0 + mov f81 = f0 + mov f96 = f0 + mov f97 = f0 + mov f112 = f0 + mov f113 = f0 + ;; +#ifdef LN + adds C1 = -2 * SIZE, C1 + adds C2 = -2 * SIZE, C2 + adds C3 = -2 * SIZE, C3 + adds C4 = -2 * SIZE, C4 +#endif + ;; + cmp.ne p6, p0 = 1, I + ;; + adds I = -1, I + ;; + shladd r2 = K, ZBASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + add AORIG = r2, AORIG +#endif 
+ ;; +#if defined(LT) || defined(RN) + shladd L = L, ZBASE_SHIFT, r0 + ;; + add AOFFSET = L, AOFFSET + shladd BOFFSET = L, 2, BOFFSET +#endif + ;; +#ifdef LT + adds KK = 1, KK +#elif defined LN + adds KK = -1, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + .align 16 + +.L049: +#ifdef LN + shladd KK8 = K, ZBASE_SHIFT, r0 + ;; + shladd B = KK8, 2, B +#endif + +#if defined(LT) || defined(RN) + mov B = BOFFSET +#endif + +#ifdef RN + adds KK = 4, KK +#endif + +#ifdef RT + adds KK = -4, KK +#endif + ;; + + { .mmb + mov AOFFSET = A + cmp.lt p6, p0 = 0, J + (p6) br.cond.dptk .L010 + } + ;; + .align 16 + +.L050: + { .mmi + shr I = M, 2 + } + { .mib + tbit.z p6, p0 = N, 1 + (p6) br.cond.dpnt .L090 + } + ;; + +#ifdef RT + { .mmi + shladd r3 = LDC, 1, r0 + nop __LINE__ + shl r2 = K, 1 + ZBASE_SHIFT + } + ;; + { .mmi + sub B = B, r2 + sub C = C, r3 + nop __LINE__ + } + ;; +#endif + + mov C1 = C + add C2 = LDC, C + ;; +#ifdef LN + add KK = M, OFFSET +#elif defined LT + mov KK = OFFSET +#else + nop __LINE__ +#endif + ;; +#if defined(LN) || defined(RT) + mov AORIG = A +#else + mov AOFFSET = A +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + { .mib + cmp.eq p6, p7 = 0, I +#ifndef RT + shladd C = LDC, 1, C +#else + nop __LINE__ +#endif + (p6) br.cond.dpnt .L060 + } + ;; + .align 16 + +.L052: + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 2 + ZBASE_SHIFT + } + { .mmi + shladd r3 = KK, ZBASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f66 = f0 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f67 = f0 + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 1, B + mov f66 = f0 +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f67 = f0 + shladd AOFFSET = r3, 2, AORIG + } + ;; +#endif + + { .mfi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f82 = f0 + adds PREC = CPREFETCHSIZE * SIZE, C1 + } + { .mfi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f83 = f0 + nop __LINE__ + } + ;; + { .mfi + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + mov f98 = f0 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + } + { .mfi + cmp.eq p3, p0 = r0, r0 + mov f99 = f0 + adds L = 1, L + } + ;; + { .mfi + (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + mov f114 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + CPREFETCH [PREC], LDC + mov f115 = f0 + shr L = L, 1 + } + ;; + { .mmi + (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + adds C5 = 4 * SIZE, C1 + adds L = -1, L + } + ;; + { .mmi + CPREFETCH [PREC], LDC + adds C6 = 4 * SIZE, C2 + mov ar.lc = L + } + ;; + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L058 + ;; + .align 16 + +.L053: + { .mfb + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f65 = f32, f49, f65 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA_B f81 = f32, f51, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f96 = f34, f48, f96 // A3 * B1 + nop __LINE__ + } + { .mfi + FMA_B f97 = f34, f49, f97 // A3 * B2 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f112 = f34, f50, f112 // A3 * B3 + nop __LINE__ + } + { .mfb + nop 
__LINE__ + FMA_B f113 = f34, f51, f113 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f64 = f33, f49, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f80 = f33, f51, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f97 = f35, f48, f97 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f96 = f35, f49, f96 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f113 = f35, f50, f113 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f112 = f35, f51, f112 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f66 = f36, f48, f66 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f67 = f36, f49, f67 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f82 = f36, f50, f82 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f83 = f36, f51, f83 // A5 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f98 = f38, f48, f98 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f99 = f38, f49, f99 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f114 = f38, f50, f114 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f115 = f38, f51, f115 // A7 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f67 = f37, f48, f67 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f66 = f37, f49, f66 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f83 = f37, f50, f83 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f82 = f37, f51, f82 // A6 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f99 = f39, f48, f99 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f98 = f39, f49, f98 // A8 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f115 = f39, f50, f115 // A8 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f114 = f39, f51, f114 // A8 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f96 = f42, f56, f96 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f97 = f42, f57, f97 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f112 = f42, f58, f112 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f113 = f42, f59, f113 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f97 = f43, f56, f97 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ 
+ (p3) FMA_A f96 = f43, f57, f96 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f113 = f43, f58, f113 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f112 = f43, f59, f112 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f66 = f44, f56, f66 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f67 = f44, f57, f67 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f82 = f44, f58, f82 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f83 = f44, f59, f83 // A5 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f98 = f46, f56, f98 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f99 = f46, f57, f99 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f114 = f46, f58, f114 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f115 = f46, f59, f115 // A7 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f67 = f45, f56, f67 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f66 = f45, f57, f66 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f83 = f45, f58, f83 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f82 = f45, f59, f82 // A6 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f99 = f47, f56, f99 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f98 = f47, f57, f98 // A8 * B2 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f115 = f47, f58, f115 // A8 * B3 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA_A f114 = f47, f59, f114 // A8 * B4 + br.cloop.sptk.few .L053 + } + ;; +.L058: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -4, KK +#else + adds r2 = -2, KK +#endif + ;; + shladd r2 = r2, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 2, AORIG + shladd BOFFSET = r2, 1, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [BOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [BOFFSET], 2 * SIZE + ;; + LDFPD f104, f105 = [BOFFSET], 2 * SIZE + ;; + LDFPD f106, f107 = [BOFFSET], 2 * SIZE + ;; + LDFPD f120, f121 = [BOFFSET], 2 * SIZE + ;; + LDFPD f122, f123 = [BOFFSET] + adds BOFFSET = -14 * SIZE, BOFFSET + ;; + FSUB f64 = f72, f64 + FSUB_A f65 = f73, f65 + FSUB f80 = f74, f80 + FSUB_A f81 = f75, f81 + FSUB f96 = f88, f96 + FSUB_A f97 = f89, f97 + FSUB f112 = f90, f112 + FSUB_A f113 = f91, f113 + + FSUB f66 = f104, f66 + FSUB_A f67 = f105, f67 + FSUB f82 = f106, f82 + FSUB_A f83 = f107, f83 + FSUB f98 = f120, f98 + FSUB_A f99 = f121, f99 + FSUB f114 = f122, f114 + FSUB_A f115 = f123, f115 + ;; +#else + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET], 2 * SIZE + ;; + LDFPD f76, f77 = [AOFFSET], 2 * SIZE + ;; + LDFPD f78, f79 = [AOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [AOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [AOFFSET], 2 * SIZE + ;; + LDFPD f92, f93 = [AOFFSET], 2 * SIZE + ;; + LDFPD f94, f95 = [AOFFSET] + adds AOFFSET = -14 * SIZE, AOFFSET + ;; + FSUB f64 = f72, f64 + FSUB f65 = f73, f65 + FSUB f96 = f74, f96 + FSUB f97 = f75, f97 + + FSUB f66 = f76, f66 + FSUB f67 = f77, f67 + FSUB f98 = f78, f98 + FSUB f99 = f79, f99 + + FSUB f80 = f88, f80 + FSUB f81 = f89, f81 + FSUB f112 = f90, f112 + FSUB f113 = f91, f113 + + FSUB f82 = f92, f82 + FSUB f83 = f93, f83 + FSUB f114 = f94, f114 + FSUB f115 = f95, f115 + ;; +#endif + +#ifdef LN + adds AOFFSET = 30 * SIZE, AOFFSET + ;; + LDFPD f72, f73 
= [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f74, f75 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f76, f77 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f78, f79 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f88, f89 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f90, f91 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f92, f93 = [AOFFSET] + adds AOFFSET = - 6 * SIZE, AOFFSET + ;; + LDFPD f104, f105 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f106, f107 = [AOFFSET] + adds AOFFSET = - 8 * SIZE, AOFFSET + ;; + LDFPD f120, f121 = [AOFFSET] + ;; + FMPY f32 = f72, f98 + FMPY f33 = f73, f98 + FMPY f34 = f72, f114 + FMPY f35 = f73, f114 + ;; + FMA_C f98 = f73, f99, f32 + FMA_D f99 = f72, f99, f33 + FMA_C f114 = f73, f115, f34 + FMA_D f115 = f72, f115, f35 + ;; + FNMA f66 = f74, f98, f66 + FMA_A f67 = f75, f98, f67 + FNMA f82 = f74, f114, f82 + FMA_A f83 = f75, f114, f83 + ;; + FMA_B f66 = f75, f99, f66 + FNMA f67 = f74, f99, f67 + FMA_B f82 = f75, f115, f82 + FNMA f83 = f74, f115, f83 + ;; + FNMA f96 = f76, f98, f96 + FMA_A f97 = f77, f98, f97 + FNMA f112 = f76, f114, f112 + FMA_A f113 = f77, f114, f113 + ;; + FMA_B f96 = f77, f99, f96 + FNMA f97 = f76, f99, f97 + FMA_B f112 = f77, f115, f112 + FNMA f113 = f76, f115, f113 + ;; + FNMA f64 = f78, f98, f64 + FMA_A f65 = f79, f98, f65 + FNMA f80 = f78, f114, f80 + FMA_A f81 = f79, f114, f81 + ;; + FMA_B f64 = f79, f99, f64 + FNMA f65 = f78, f99, f65 + FMA_B f80 = f79, f115, f80 + FNMA f81 = f78, f115, f81 + ;; + FMPY f32 = f88, f66 + FMPY f33 = f89, f66 + FMPY f34 = f88, f82 + FMPY f35 = f89, f82 + ;; + FMA_C f66 = f89, f67, f32 + FMA_D f67 = f88, f67, f33 + FMA_C f82 = f89, f83, f34 + FMA_D f83 = f88, f83, f35 + ;; + FNMA f96 = f90, f66, f96 + FMA_A f97 = f91, f66, f97 + FNMA f112 = f90, f82, f112 + FMA_A f113 = f91, f82, f113 + ;; + FMA_B f96 = f91, f67, f96 + FNMA f97 = f90, f67, f97 + FMA_B f112 = f91, f83, f112 + FNMA f113 = f90, f83, f113 + ;; + FNMA f64 = f92, f66, f64 + FMA_A f65 = f93, f66, f65 + FNMA f80 = f92, f82, f80 + FMA_A f81 = f93, f82, f81 + ;; + FMA_B f64 = f93, f67, f64 + FNMA f65 = f92, f67, f65 + FMA_B f80 = f93, f83, f80 + FNMA f81 = f92, f83, f81 + ;; + FMPY f32 = f104, f96 + FMPY f33 = f105, f96 + FMPY f34 = f104, f112 + FMPY f35 = f105, f112 + ;; + FMA_C f96 = f105, f97, f32 + FMA_D f97 = f104, f97, f33 + FMA_C f112 = f105, f113, f34 + FMA_D f113 = f104, f113, f35 + ;; + FNMA f64 = f106, f96, f64 + FMA_A f65 = f107, f96, f65 + FNMA f80 = f106, f112, f80 + FMA_A f81 = f107, f112, f81 + ;; + FMA_B f64 = f107, f97, f64 + FNMA f65 = f106, f97, f65 + FMA_B f80 = f107, f113, f80 + FNMA f81 = f106, f113, f81 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + FMPY f34 = f120, f80 + FMPY f35 = f121, f80 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + FMA_C f80 = f121, f81, f34 + FMA_D f81 = f120, f81, f35 + ;; +#endif + +#ifdef LT + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET], 2 * SIZE + ;; + LDFPD f76, f77 = [AOFFSET], 2 * SIZE + ;; + LDFPD f78, f79 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f90, f91 = [AOFFSET], 2 * SIZE + ;; + LDFPD f92, f93 = [AOFFSET], 2 * SIZE + ;; + LDFPD f94, f95 = [AOFFSET] + adds AOFFSET = 6 * SIZE, AOFFSET + ;; + LDFPD f108, f109 = [AOFFSET], 2 * SIZE + ;; + LDFPD f110, f111 = [AOFFSET] + adds AOFFSET = 8 * SIZE, AOFFSET + ;; + LDFPD f126, f127 = [AOFFSET] + adds AOFFSET = - 30 * SIZE, AOFFSET + ;; + FMPY f32 = f72, f64 + FMPY f33 = 
f73, f64 + FMPY f34 = f72, f80 + FMPY f35 = f73, f80 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f80 = f73, f81, f34 + FMA_D f81 = f72, f81, f35 + ;; + FNMA f96 = f74, f64, f96 + FMA_A f97 = f75, f64, f97 + FNMA f112 = f74, f80, f112 + FMA_A f113 = f75, f80, f113 + ;; + FMA_B f96 = f75, f65, f96 + FNMA f97 = f74, f65, f97 + FMA_B f112 = f75, f81, f112 + FNMA f113 = f74, f81, f113 + ;; + FNMA f66 = f76, f64, f66 + FMA_A f67 = f77, f64, f67 + FNMA f82 = f76, f80, f82 + FMA_A f83 = f77, f80, f83 + ;; + FMA_B f66 = f77, f65, f66 + FNMA f67 = f76, f65, f67 + FMA_B f82 = f77, f81, f82 + FNMA f83 = f76, f81, f83 + ;; + FNMA f98 = f78, f64, f98 + FMA_A f99 = f79, f64, f99 + FNMA f114 = f78, f80, f114 + FMA_A f115 = f79, f80, f115 + ;; + FMA_B f98 = f79, f65, f98 + FNMA f99 = f78, f65, f99 + FMA_B f114 = f79, f81, f114 + FNMA f115 = f78, f81, f115 + ;; + FMPY f32 = f90, f96 + FMPY f33 = f91, f96 + FMPY f34 = f90, f112 + FMPY f35 = f91, f112 + ;; + FMA_C f96 = f91, f97, f32 + FMA_D f97 = f90, f97, f33 + FMA_C f112 = f91, f113, f34 + FMA_D f113 = f90, f113, f35 + ;; + FNMA f66 = f92, f96, f66 + FMA_A f67 = f93, f96, f67 + FNMA f82 = f92, f112, f82 + FMA_A f83 = f93, f112, f83 + ;; + FMA_B f66 = f93, f97, f66 + FNMA f67 = f92, f97, f67 + FMA_B f82 = f93, f113, f82 + FNMA f83 = f92, f113, f83 + ;; + FNMA f98 = f94, f96, f98 + FMA_A f99 = f95, f96, f99 + FNMA f114 = f94, f112, f114 + FMA_A f115 = f95, f112, f115 + ;; + FMA_B f98 = f95, f97, f98 + FNMA f99 = f94, f97, f99 + FMA_B f114 = f95, f113, f114 + FNMA f115 = f94, f113, f115 + ;; + FMPY f32 = f108, f66 + FMPY f33 = f109, f66 + FMPY f34 = f108, f82 + FMPY f35 = f109, f82 + ;; + FMA_C f66 = f109, f67, f32 + FMA_D f67 = f108, f67, f33 + FMA_C f82 = f109, f83, f34 + FMA_D f83 = f108, f83, f35 + ;; + FNMA f98 = f110, f66, f98 + FMA_A f99 = f111, f66, f99 + FNMA f114 = f110, f82, f114 + FMA_A f115 = f111, f82, f115 + ;; + FMA_B f98 = f111, f67, f98 + FNMA f99 = f110, f67, f99 + FMA_B f114 = f111, f83, f114 + FNMA f115 = f110, f83, f115 + ;; + FMPY f32 = f126, f98 + FMPY f33 = f127, f98 + FMPY f34 = f126, f114 + FMPY f35 = f127, f114 + ;; + FMA_C f98 = f127, f99, f32 + FMA_D f99 = f126, f99, f33 + FMA_C f114 = f127, f115, f34 + FMA_D f115 = f126, f115, f35 + ;; +#endif + +#ifdef RN + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f90, f91 = [BOFFSET] + adds BOFFSET = - 6 * SIZE, BOFFSET + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f34 = f72, f96 + FMPY f35 = f73, f96 + FMPY f36 = f72, f66 + FMPY f37 = f73, f66 + FMPY f38 = f72, f98 + FMPY f39 = f73, f98 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f96 = f73, f97, f34 + FMA_D f97 = f72, f97, f35 + FMA_C f66 = f73, f67, f36 + FMA_D f67 = f72, f67, f37 + FMA_C f98 = f73, f99, f38 + FMA_D f99 = f72, f99, f39 + ;; + FNMA f80 = f74, f64, f80 + FMA_A f81 = f75, f64, f81 + FNMA f112 = f74, f96, f112 + FMA_A f113 = f75, f96, f113 + FNMA f82 = f74, f66, f82 + FMA_A f83 = f75, f66, f83 + FNMA f114 = f74, f98, f114 + FMA_A f115 = f75, f98, f115 + ;; + FMA_B f80 = f75, f65, f80 + FNMA f81 = f74, f65, f81 + FMA_B f112 = f75, f97, f112 + FNMA f113 = f74, f97, f113 + FMA_B f82 = f75, f67, f82 + FNMA f83 = f74, f67, f83 + FMA_B f114 = f75, f99, f114 + FNMA f115 = f74, f99, f115 + ;; + FMPY f32 = f90, f80 + FMPY f33 = f91, f80 + FMPY f34 = f90, f112 + FMPY f35 = f91, f112 + FMPY f36 = f90, f82 + FMPY f37 = f91, f82 + FMPY f38 = f90, f114 + FMPY f39 = f91, f114 + ;; + FMA_C f80 = f91, f81, f32 
+ FMA_D f81 = f90, f81, f33 + FMA_C f112 = f91, f113, f34 + FMA_D f113 = f90, f113, f35 + FMA_C f82 = f91, f83, f36 + FMA_D f83 = f90, f83, f37 + FMA_C f114 = f91, f115, f38 + FMA_D f115 = f90, f115, f39 + ;; +#endif + +#ifdef RT + adds BOFFSET = 6 * SIZE, BOFFSET + ;; + LDFPD f104, f105 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f106, f107 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f120, f121 = [BOFFSET] + ;; + FMPY f32 = f104, f80 + FMPY f33 = f105, f80 + FMPY f34 = f104, f112 + FMPY f35 = f105, f112 + FMPY f36 = f104, f82 + FMPY f37 = f105, f82 + FMPY f38 = f104, f114 + FMPY f39 = f105, f114 + ;; + FMA_C f80 = f105, f81, f32 + FMA_D f81 = f104, f81, f33 + FMA_C f112 = f105, f113, f34 + FMA_D f113 = f104, f113, f35 + FMA_C f82 = f105, f83, f36 + FMA_D f83 = f104, f83, f37 + FMA_C f114 = f105, f115, f38 + FMA_D f115 = f104, f115, f39 + ;; + FNMA f64 = f106, f80, f64 + FMA_A f65 = f107, f80, f65 + FNMA f96 = f106, f112, f96 + FMA_A f97 = f107, f112, f97 + FNMA f66 = f106, f82, f66 + FMA_A f67 = f107, f82, f67 + FNMA f98 = f106, f114, f98 + FMA_A f99 = f107, f114, f99 + ;; + FMA_B f64 = f107, f81, f64 + FNMA f65 = f106, f81, f65 + FMA_B f96 = f107, f113, f96 + FNMA f97 = f106, f113, f97 + FMA_B f66 = f107, f83, f66 + FNMA f67 = f106, f83, f67 + FMA_B f98 = f107, f115, f98 + FNMA f99 = f106, f115, f99 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + FMPY f34 = f120, f96 + FMPY f35 = f121, f96 + FMPY f36 = f120, f66 + FMPY f37 = f121, f66 + FMPY f38 = f120, f98 + FMPY f39 = f121, f98 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + FMA_C f96 = f121, f97, f34 + FMA_D f97 = f120, f97, f35 + FMA_C f66 = f121, f67, f36 + FMA_D f67 = f120, f67, f37 + FMA_C f98 = f121, f99, f38 + FMA_D f99 = f120, f99, f39 + ;; +#endif + +#if defined(LN) || defined(LT) + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f96, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f97, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f112, SIZE + ;; + STFD [BOFFSET] = f81, 5 * SIZE + STFD [BOFFSET2] = f113, 5 * SIZE + ;; + STFD [BOFFSET] = f66, SIZE + STFD [BOFFSET2] = f98, SIZE + ;; + STFD [BOFFSET] = f67, SIZE + STFD [BOFFSET2] = f99, SIZE + ;; + STFD [BOFFSET] = f82, SIZE + STFD [BOFFSET2] = f114, SIZE + ;; + STFD [BOFFSET] = f83, 5 * SIZE + STFD [BOFFSET2] = f115, 5 * SIZE + ;; + adds BOFFSET = - 16 * SIZE, BOFFSET + ;; +#else + adds AOFFSET2 = 4 * SIZE, AOFFSET + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f66, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f67, SIZE + ;; + STFD [AOFFSET] = f96, SIZE + STFD [AOFFSET2] = f98, SIZE + ;; + STFD [AOFFSET] = f97, 5 * SIZE + STFD [AOFFSET2] = f99, 5 * SIZE + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f82, SIZE + ;; + STFD [AOFFSET] = f81, SIZE + STFD [AOFFSET2] = f83, SIZE + ;; + STFD [AOFFSET] = f112, SIZE + STFD [AOFFSET2] = f114, SIZE + ;; + STFD [AOFFSET] = f113, 5 * SIZE + STFD [AOFFSET2] = f115, 5 * SIZE + ;; + adds AOFFSET = - 16 * SIZE, AOFFSET + ;; +#endif + +#ifdef LN + adds C1 = -8 * SIZE, C1 + adds C2 = -8 * SIZE, C2 + adds C5 = -8 * SIZE, C5 + adds C6 = -8 * SIZE, C6 +#endif + ;; + STFD [C1 ] = f64, SIZE + STFD [C5 ] = f66, SIZE + ;; + STFD [C1 ] = f65, SIZE + STFD [C5 ] = f67, SIZE + ;; + STFD [C1 ] = f96, SIZE + STFD [C5 ] = f98, SIZE + ;; + STFD [C1 ] = f97, 5 * SIZE + STFD [C5 ] = f99, 5 * SIZE + ;; + STFD [C2 ] = f80, SIZE + STFD [C6 ] = f82, SIZE + ;; + STFD [C2 ] = f81, SIZE + STFD [C6 ] = f83, SIZE + ;; + STFD 
[C2 ] = f112, SIZE + STFD [C6 ] = f114, SIZE + ;; + STFD [C2 ] = f113, 5 * SIZE + STFD [C6 ] = f115, 5 * SIZE + ;; + mov f64 = f0 + mov f65 = f0 + mov f80 = f0 + mov f81 = f0 + mov f96 = f0 + mov f97 = f0 + mov f112 = f0 + mov f113 = f0 + ;; +#ifdef LN + adds C1 = -8 * SIZE, C1 + adds C2 = -8 * SIZE, C2 + adds C5 = -8 * SIZE, C5 + adds C6 = -8 * SIZE, C6 +#endif + ;; + cmp.ne p6, p0 = 1, I + ;; + adds I = -1, I + ;; + shladd r2 = K, ZBASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + shladd AORIG = r2, 2, AORIG +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = L, 2, AOFFSET + shladd BOFFSET = L, 1, BOFFSET +#endif + ;; +#ifdef LT + adds KK = 4, KK +#elif defined LN + adds KK = -4, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + (p6) br.cond.dptk .L052 + ;; + .align 16 + +.L060: + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + tbit.z p6, p7 = M, 1 + (p6) br.cond.dptk .L070 + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 1 + ZBASE_SHIFT + } + { .mmi + shladd r3 = KK, ZBASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 1, B +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + shladd AOFFSET = r3, 1, AORIG + } + ;; +#endif + ;; + adds L = 1, L + ;; + { .mmi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + tbit.z p12, p0 = L, 0 + } + { .mmi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + shr L = L, 1 + } + ;; + { .mmi + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + nop __LINE__ + adds L = -1, L + } + ;; + { .mmi + nop __LINE__ + nop __LINE__ + mov ar.lc = L + } + ;; + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L068 + ;; + .align 16 + +.L062: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA_B f65 = f32, f49, f65 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfb + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f81 = f32, f51, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f34, f48, f96 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f97 = f34, f49, f97 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f112 = f34, f50, f112 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f113 = f34, f51, f113 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f64 = f33, f49, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f80 = f33, f51, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f97 = f35, f48, f97 // A4 * B1 + } + { .mfb + FMA_A f96 = f35, f49, f96 // A4 * B2 + nop __LINE__ + } + + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f113 = f35, f50, f113 // A4 * B3 + nop __LINE__ + } + { .mfb + FMA_A f112 = f35, f51, f112 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + 
(p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f96 = f42, f56, f96 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f97 = f42, f57, f97 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f112 = f42, f58, f112 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f113 = f42, f59, f113 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f97 = f43, f56, f97 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f96 = f43, f57, f96 // A4 * B2 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f113 = f43, f58, f113 // A4 * B3 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA_A f112 = f43, f59, f112 // A4 * B4 + br.cloop.sptk.few .L062 + } + ;; +.L068: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -2, KK +#else + adds r2 = -2, KK +#endif + ;; + shladd r2 = r2, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 1, AORIG + shladd BOFFSET = r2, 1, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [BOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [BOFFSET] + adds BOFFSET = -6 * SIZE, BOFFSET + ;; + FSUB f64 = f72, f64 + FSUB_A f65 = f73, f65 + FSUB f80 = f74, f80 + FSUB_A f81 = f75, f81 + FSUB f96 = f88, f96 + FSUB_A f97 = f89, f97 + FSUB f112 = f90, f112 + FSUB_A f113 = f91, f113 + ;; +#else + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [AOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [AOFFSET] + adds AOFFSET = -6 * SIZE, AOFFSET + ;; + FSUB f64 = f72, f64 + FSUB f65 = f73, f65 + FSUB f96 = f74, f96 + FSUB f97 = f75, f97 + + FSUB f80 = f88, f80 + FSUB f81 = f89, f81 + FSUB f112 = f90, f112 + FSUB f113 = f91, f113 + ;; +#endif + +#ifdef LN + adds AOFFSET = 6 * SIZE, AOFFSET + ;; + LDFPD f104, f105 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f106, f107 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f120, f121 = [AOFFSET] + ;; + FMPY f32 = f104, f96 + FMPY f33 = f105, f96 + FMPY f34 = f104, f112 + FMPY f35 = f105, f112 + ;; + FMA_C f96 = f105, f97, f32 + FMA_D f97 = f104, f97, f33 + FMA_C f112 = f105, f113, f34 + FMA_D f113 = f104, f113, f35 + ;; + FNMA f64 = f106, f96, f64 + FMA_A f65 = f107, f96, f65 + FNMA f80 = f106, f112, f80 + FMA_A f81 = f107, f112, f81 + ;; + FMA_B f64 = f107, f97, f64 + FNMA f65 = f106, f97, f65 + FMA_B f80 = f107, f113, f80 + FNMA f81 = f106, f113, f81 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + FMPY f34 = f120, f80 + FMPY f35 = f121, f80 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + FMA_C f80 = f121, f81, f34 + FMA_D f81 = f120, f81, f35 + ;; +#endif + +#ifdef LT + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET] + adds AOFFSET = 4 * SIZE, 
AOFFSET + ;; + LDFPD f90, f91 = [AOFFSET] + adds AOFFSET = - 6 * SIZE, AOFFSET + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f34 = f72, f80 + FMPY f35 = f73, f80 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f80 = f73, f81, f34 + FMA_D f81 = f72, f81, f35 + ;; + FNMA f96 = f74, f64, f96 + FMA_A f97 = f75, f64, f97 + FNMA f112 = f74, f80, f112 + FMA_A f113 = f75, f80, f113 + ;; + FMA_B f96 = f75, f65, f96 + FNMA f97 = f74, f65, f97 + FMA_B f112 = f75, f81, f112 + FNMA f113 = f74, f81, f113 + ;; + FMPY f32 = f90, f96 + FMPY f33 = f91, f96 + FMPY f34 = f90, f112 + FMPY f35 = f91, f112 + ;; + FMA_C f96 = f91, f97, f32 + FMA_D f97 = f90, f97, f33 + FMA_C f112 = f91, f113, f34 + FMA_D f113 = f90, f113, f35 + ;; +#endif + +#ifdef RN + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f90, f91 = [BOFFSET] + adds BOFFSET = - 6 * SIZE, BOFFSET + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f34 = f72, f96 + FMPY f35 = f73, f96 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f96 = f73, f97, f34 + FMA_D f97 = f72, f97, f35 + ;; + FNMA f80 = f74, f64, f80 + FMA_A f81 = f75, f64, f81 + FNMA f112 = f74, f96, f112 + FMA_A f113 = f75, f96, f113 + ;; + FMA_B f80 = f75, f65, f80 + FNMA f81 = f74, f65, f81 + FMA_B f112 = f75, f97, f112 + FNMA f113 = f74, f97, f113 + + ;; + FMPY f32 = f90, f80 + FMPY f33 = f91, f80 + FMPY f34 = f90, f112 + FMPY f35 = f91, f112 + ;; + FMA_C f80 = f91, f81, f32 + FMA_D f81 = f90, f81, f33 + FMA_C f112 = f91, f113, f34 + FMA_D f113 = f90, f113, f35 + ;; +#endif + +#ifdef RT + adds BOFFSET = 6 * SIZE, BOFFSET + ;; + LDFPD f104, f105 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f106, f107 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f120, f121 = [BOFFSET] + ;; + FMPY f32 = f104, f80 + FMPY f33 = f105, f80 + FMPY f34 = f104, f112 + FMPY f35 = f105, f112 + ;; + FMA_C f80 = f105, f81, f32 + FMA_D f81 = f104, f81, f33 + FMA_C f112 = f105, f113, f34 + FMA_D f113 = f104, f113, f35 + ;; + FNMA f64 = f106, f80, f64 + FMA_A f65 = f107, f80, f65 + FNMA f96 = f106, f112, f96 + FMA_A f97 = f107, f112, f97 + ;; + FMA_B f64 = f107, f81, f64 + FNMA f65 = f106, f81, f65 + FMA_B f96 = f107, f113, f96 + FNMA f97 = f106, f113, f97 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + FMPY f34 = f120, f96 + FMPY f35 = f121, f96 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + FMA_C f96 = f121, f97, f34 + FMA_D f97 = f120, f97, f35 + ;; +#endif + +#if defined(LN) || defined(LT) + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f96, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f97, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f112, SIZE + ;; + STFD [BOFFSET] = f81, 5 * SIZE + STFD [BOFFSET2] = f113, 5 * SIZE + ;; + adds BOFFSET = - 8 * SIZE, BOFFSET + ;; +#else + adds AOFFSET2 = 4 * SIZE, AOFFSET + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f80, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f81, SIZE + ;; + STFD [AOFFSET] = f96, SIZE + STFD [AOFFSET2] = f112, SIZE + ;; + STFD [AOFFSET] = f97, 5 * SIZE + STFD [AOFFSET2] = f113, 5 * SIZE + ;; + adds AOFFSET = - 8 * SIZE, AOFFSET + ;; +#endif + +#ifdef LN + adds C1 = -4 * SIZE, C1 + adds C2 = -4 * SIZE, C2 +#endif + ;; + STFD [C1 ] = f64, SIZE + ;; + STFD [C1 ] = f65, SIZE + ;; + STFD [C1 ] = f96, SIZE + ;; + STFD [C1 ] = f97, SIZE + ;; + STFD [C2 ] = f80, SIZE + ;; + STFD [C2 ] = f81, SIZE 
+ ;; + STFD [C2 ] = f112, SIZE + ;; + STFD [C2 ] = f113, SIZE + ;; + mov f64 = f0 + mov f65 = f0 + mov f80 = f0 + mov f81 = f0 + mov f96 = f0 + mov f97 = f0 + mov f112 = f0 + mov f113 = f0 + ;; +#ifdef LN + adds C1 = -4 * SIZE, C1 + adds C2 = -4 * SIZE, C2 +#endif + ;; + cmp.ne p6, p0 = 1, I + ;; + adds I = -1, I + ;; + shladd r2 = K, ZBASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + shladd AORIG = r2, 1, AORIG +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = L, 1, AOFFSET + shladd BOFFSET = L, 1, BOFFSET +#endif + ;; +#ifdef LT + adds KK = 2, KK +#elif defined LN + adds KK = -2, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + .align 16 + +.L070: + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + tbit.z p6, p7 = M, 0 + (p6) br.cond.dptk .L089 + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, ZBASE_SHIFT + } + { .mmi + shladd r3 = KK, ZBASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 1, B +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + add AOFFSET = r3, AORIG + } + ;; +#endif + ;; + adds L = 1, L + ;; + { .mii + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + adds L = -1, L + } + ;; + { .mmi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + cmp.eq p3, p0 = r0, r0 + mov ar.lc = L + } + ;; + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L078 + ;; + .align 16 + +.L072: + { .mfb + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f96 = f32, f49, f96 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA f112 = f32, f51, f112 // A1 * B4 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + } + { .mfi + nop __LINE__ + FMA f97 = f33, f49, f97 // A2 * B2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f113 = f33, f51, f113 // A2 * B4 + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + (p3) FMA f96 = f40, f57, f96 // A1 * B2 + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + (p3) FMA f112 = f40, f59, f112 // A1 * B4 + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f97 = f41, f57, f97 // A2 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f113 = f41, f59, f113 // A2 * B4 + br.cloop.sptk.few .L072 + } + ;; + { .mfb + nop __LINE__ + FCALC_A f64 = f64, f97 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_A f80 = f80, f113 + nop __LINE__ + } + { 
.mfb + nop __LINE__ + FCALC_B f65 = f65, f96 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_B f81 = f81, f112 + nop __LINE__ + } + ;; +.L078: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -1, KK +#else + adds r2 = -2, KK +#endif + ;; + shladd r2 = r2, ZBASE_SHIFT, r0 + ;; + add AOFFSET = r2, AORIG + shladd BOFFSET = r2, 1, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET] + adds BOFFSET = -2 * SIZE, BOFFSET + ;; + FSUB f64 = f72, f64 + FSUB_A f65 = f73, f65 + FSUB f80 = f74, f80 + FSUB_A f81 = f75, f81 + ;; +#else + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [AOFFSET] + adds AOFFSET = -2 * SIZE, AOFFSET + ;; + FSUB f64 = f72, f64 + FSUB f65 = f73, f65 + FSUB f80 = f88, f80 + FSUB f81 = f89, f81 + ;; +#endif + +#ifdef LN + LDFPD f120, f121 = [AOFFSET] + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + FMPY f34 = f120, f80 + FMPY f35 = f121, f80 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + FMA_C f80 = f121, f81, f34 + FMA_D f81 = f120, f81, f35 + ;; +#endif + +#ifdef LT + LDFPD f72, f73 = [AOFFSET] + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f34 = f72, f80 + FMPY f35 = f73, f80 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f80 = f73, f81, f34 + FMA_D f81 = f72, f81, f35 + ;; +#endif + +#ifdef RN + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f90, f91 = [BOFFSET] + adds BOFFSET = - 6 * SIZE, BOFFSET + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + ;; + FNMA f80 = f74, f64, f80 + FMA_A f81 = f75, f64, f81 + ;; + FMA_B f80 = f75, f65, f80 + FNMA f81 = f74, f65, f81 + + ;; + FMPY f32 = f90, f80 + FMPY f33 = f91, f80 + ;; + FMA_C f80 = f91, f81, f32 + FMA_D f81 = f90, f81, f33 + ;; +#endif + +#ifdef RT + adds BOFFSET = 6 * SIZE, BOFFSET + ;; + LDFPD f104, f105 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f106, f107 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f120, f121 = [BOFFSET] + ;; + FMPY f32 = f104, f80 + FMPY f33 = f105, f80 + ;; + FMA_C f80 = f105, f81, f32 + FMA_D f81 = f104, f81, f33 + ;; + FNMA f64 = f106, f80, f64 + FMA_A f65 = f107, f80, f65 + ;; + FMA_B f64 = f107, f81, f64 + FNMA f65 = f106, f81, f65 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + ;; +#endif + +#if defined(LN) || defined(LT) + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + ;; + STFD [BOFFSET] = f81, SIZE + ;; + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; +#else + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + ;; + STFD [AOFFSET] = f80, SIZE + ;; + STFD [AOFFSET] = f81, SIZE + ;; + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; +#endif + +#ifdef LN + adds C1 = -2 * SIZE, C1 + adds C2 = -2 * SIZE, C2 +#endif + ;; + STFD [C1 ] = f64, SIZE + ;; + STFD [C1 ] = f65, SIZE + ;; + STFD [C2 ] = f80, SIZE + ;; + STFD [C2 ] = f81, SIZE + ;; + mov f64 = f0 + mov f65 = f0 + mov f80 = f0 + mov f81 = f0 + mov f96 = f0 + mov f97 = f0 + mov f112 = f0 + mov f113 = f0 + ;; +#ifdef LN + adds C1 = -2 * SIZE, C1 + adds C2 = -2 * SIZE, C2 +#endif + ;; + cmp.ne p6, p0 = 1, I + ;; + adds I = -1, I + ;; + shladd r2 = K, ZBASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + add AORIG = r2, AORIG +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, ZBASE_SHIFT, r0 + ;; + add AOFFSET = L, 
AOFFSET + shladd BOFFSET = L, 1, BOFFSET +#endif + ;; +#ifdef LT + adds KK = 1, KK +#elif defined LN + adds KK = -1, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + .align 16 + +.L089: +#ifdef LN + shladd KK8 = K, ZBASE_SHIFT, r0 + ;; + shladd B = KK8, 1, B +#endif + +#if defined(LT) || defined(RN) + mov B = BOFFSET +#endif + +#ifdef RN + adds KK = 2, KK +#endif + +#ifdef RT + adds KK = -2, KK +#endif + ;; + { .mmi + mov AOFFSET = A + nop __LINE__ + } + ;; + .align 16 + +.L090: + shr I = M, 2 + tbit.z p6, p0 = N, 0 + (p6) br.cond.dpnt .L999 + ;; + +#ifdef RT + { .mmi + shl r2 = K, ZBASE_SHIFT + } + ;; + { .mmi + sub B = B, r2 + sub C = C, LDC + nop __LINE__ + } + ;; +#endif + mov C1 = C + +#ifdef LN + add KK = M, OFFSET +#elif defined LT + mov KK = OFFSET +#else + nop __LINE__ +#endif + ;; +#if defined(LN) || defined(RT) + mov AORIG = A +#else + mov AOFFSET = A +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + { .mib + cmp.eq p6, p7 = 0, I +#ifndef RT + add C = LDC, C +#else + nop __LINE__ +#endif + (p6) br.cond.dpnt .L100 + } + ;; + .align 16 + +.L092: + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 2 + ZBASE_SHIFT + } + { .mmi + shladd r3 = KK, ZBASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f66 = f0 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f67 = f0 + } + ;; +#else + { .mfi + add BOFFSET = r3, B + mov f66 = f0 +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f67 = f0 + shladd AOFFSET = r3, 2, AORIG + } + ;; +#endif + ;; + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + adds L = 1, L + ;; + { .mfi + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + } + { .mfi + adds PREC = CPREFETCHSIZE * SIZE, C1 + shr L = L, 1 + } + ;; + { .mfi + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + adds L = -1, L + } + { .mmf + (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + CPREFETCH [PREC] + } + ;; + { .mfi + (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + mov ar.lc = L + } + { .mmi + adds C5 = 4 * SIZE, C1 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L098 + ;; + .align 16 + +.L093: +/* 1 */ + { .mfi + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA_B f65 = f32, f49, f65 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 4 * SIZE + FMA f80 = f34, f48, f80 // A3 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f81 = f34, f49, f81 // A3 * B2 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f36, f48, f96 // A5 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f97 = f36, f49, f97 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f112 = f38, f48, f112 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f113 = f38, f49, f113 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f64 = f33, f49, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f81 = f35, f48, f81 // A4 * B1 + nop __LINE__ + } + { .mfb + nop 
__LINE__ + FMA_A f80 = f35, f49, f80 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f97 = f37, f48, f97 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f96 = f37, f49, f96 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f113 = f39, f48, f113 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f112 = f39, f49, f112 // A8 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f42, f56, f80 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f81 = f42, f57, f81 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f96 = f44, f56, f96 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f97 = f44, f57, f97 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f112 = f46, f56, f112 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f113 = f46, f57, f113 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f81 = f43, f56, f81 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f80 = f43, f57, f80 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f97 = f45, f56, f97 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f96 = f45, f57, f96 // A6 * B2 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f113 = f47, f56, f113 // A8 * B1 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA_A f112 = f47, f57, f112 // A8 * B2 + br.cloop.sptk.few .L093 + } + ;; +.L098: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -4, KK +#else + adds r2 = -1, KK +#endif + ;; + shladd r2 = r2, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 2, AORIG + add BOFFSET = r2, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [BOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [BOFFSET] + adds BOFFSET = -6 * SIZE, BOFFSET + ;; + FSUB f64 = f72, f64 + FSUB_A f65 = f73, f65 + FSUB f80 = f74, f80 + FSUB_A f81 = f75, f81 + FSUB f96 = f88, f96 + FSUB_A f97 = f89, f97 + FSUB f112 = f90, f112 + FSUB_A f113 = f91, f113 + ;; +#else + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [AOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [AOFFSET] + adds AOFFSET = -6 * SIZE, AOFFSET + ;; + FSUB f64 = f72, f64 + FSUB f65 = f73, f65 + FSUB f80 = f74, f80 + FSUB f81 = f75, f81 + FSUB f96 = f88, f96 + FSUB f97 = f89, f97 + FSUB f112 = f90, f112 + FSUB f113 = f91, f113 + ;; +#endif + +#ifdef LN + adds AOFFSET = 30 * SIZE, AOFFSET + ;; + LDFPD f72, f73 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f74, f75 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f76, f77 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f78, f79 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f88, f89 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f90, f91 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD 
f92, f93 = [AOFFSET] + adds AOFFSET = - 6 * SIZE, AOFFSET + ;; + LDFPD f104, f105 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f106, f107 = [AOFFSET] + adds AOFFSET = - 8 * SIZE, AOFFSET + ;; + LDFPD f120, f121 = [AOFFSET] + ;; + FMPY f32 = f72, f112 + FMPY f33 = f73, f112 + ;; + FMA_C f112 = f73, f113, f32 + FMA_D f113 = f72, f113, f33 + ;; + FNMA f96 = f74, f112, f96 + FMA_A f97 = f75, f112, f97 + FNMA f80 = f76, f112, f80 + FMA_A f81 = f77, f112, f81 + FNMA f64 = f78, f112, f64 + FMA_A f65 = f79, f112, f65 + ;; + FMA_B f96 = f75, f113, f96 + FNMA f97 = f74, f113, f97 + FMA_B f80 = f77, f113, f80 + FNMA f81 = f76, f113, f81 + FMA_B f64 = f79, f113, f64 + FNMA f65 = f78, f113, f65 + ;; + FMPY f32 = f88, f96 + FMPY f33 = f89, f96 + ;; + FMA_C f96 = f89, f97, f32 + FMA_D f97 = f88, f97, f33 + ;; + FNMA f80 = f90, f96, f80 + FMA_A f81 = f91, f96, f81 + FNMA f64 = f92, f96, f64 + FMA_A f65 = f93, f96, f65 + ;; + FMA_B f80 = f91, f97, f80 + FNMA f81 = f90, f97, f81 + FMA_B f64 = f93, f97, f64 + FNMA f65 = f92, f97, f65 + ;; + FMPY f32 = f104, f80 + FMPY f33 = f105, f80 + ;; + FMA_C f80 = f105, f81, f32 + FMA_D f81 = f104, f81, f33 + ;; + FNMA f64 = f106, f80, f64 + FMA_A f65 = f107, f80, f65 + ;; + FMA_B f64 = f107, f81, f64 + FNMA f65 = f106, f81, f65 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + ;; +#endif + +#ifdef LT + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET], 2 * SIZE + ;; + LDFPD f76, f77 = [AOFFSET], 2 * SIZE + ;; + LDFPD f78, f79 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f90, f91 = [AOFFSET], 2 * SIZE + ;; + LDFPD f92, f93 = [AOFFSET], 2 * SIZE + ;; + LDFPD f94, f95 = [AOFFSET] + adds AOFFSET = 6 * SIZE, AOFFSET + ;; + LDFPD f108, f109 = [AOFFSET], 2 * SIZE + ;; + LDFPD f110, f111 = [AOFFSET] + adds AOFFSET = 8 * SIZE, AOFFSET + ;; + LDFPD f126, f127 = [AOFFSET] + adds AOFFSET = - 30 * SIZE, AOFFSET + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + ;; + FNMA f80 = f74, f64, f80 + FMA_A f81 = f75, f64, f81 + FNMA f96 = f76, f64, f96 + FMA_A f97 = f77, f64, f97 + FNMA f112 = f78, f64, f112 + FMA_A f113 = f79, f64, f113 + ;; + FMA_B f80 = f75, f65, f80 + FNMA f81 = f74, f65, f81 + FMA_B f96 = f77, f65, f96 + FNMA f97 = f76, f65, f97 + FMA_B f112 = f79, f65, f112 + FNMA f113 = f78, f65, f113 + ;; + FMPY f32 = f90, f80 + FMPY f33 = f91, f80 + ;; + FMA_C f80 = f91, f81, f32 + FMA_D f81 = f90, f81, f33 + ;; + FNMA f96 = f92, f80, f96 + FMA_A f97 = f93, f80, f97 + FNMA f112 = f94, f80, f112 + FMA_A f113 = f95, f80, f113 + ;; + FMA_B f96 = f93, f81, f96 + FNMA f97 = f92, f81, f97 + FMA_B f112 = f95, f81, f112 + FNMA f113 = f94, f81, f113 + ;; + FMPY f32 = f108, f96 + FMPY f33 = f109, f96 + ;; + FMA_C f96 = f109, f97, f32 + FMA_D f97 = f108, f97, f33 + ;; + FNMA f112 = f110, f96, f112 + FMA_A f113 = f111, f96, f113 + ;; + FMA_B f112 = f111, f97, f112 + FNMA f113 = f110, f97, f113 + ;; + FMPY f32 = f126, f112 + FMPY f33 = f127, f112 + ;; + FMA_C f112 = f127, f113, f32 + FMA_D f113 = f126, f113, f33 + ;; +#endif + +#ifdef RN + LDFPD f72, f73 = [BOFFSET] + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f34 = f72, f80 + FMPY f35 = f73, f80 + FMPY f36 = f72, f96 + FMPY f37 = f73, f96 + FMPY f38 = f72, f112 + FMPY f39 = f73, f112 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f80 = f73, f81, f34 + FMA_D f81 = f72, f81, f35 + FMA_C f96 = f73, f97, f36 + FMA_D f97 = f72, f97, f37 + 
FMA_C f112 = f73, f113, f38 + FMA_D f113 = f72, f113, f39 + ;; +#endif + +#ifdef RT + LDFPD f72, f73 = [BOFFSET] + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f34 = f72, f80 + FMPY f35 = f73, f80 + FMPY f36 = f72, f96 + FMPY f37 = f73, f96 + FMPY f38 = f72, f112 + FMPY f39 = f73, f112 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f80 = f73, f81, f34 + FMA_D f81 = f72, f81, f35 + FMA_C f96 = f73, f97, f36 + FMA_D f97 = f72, f97, f37 + FMA_C f112 = f73, f113, f38 + FMA_D f113 = f72, f113, f39 + ;; +#endif + +#if defined(LN) || defined(LT) + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f96, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f97, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f112, SIZE + ;; + STFD [BOFFSET] = f81, 5 * SIZE + STFD [BOFFSET2] = f113, 5 * SIZE + ;; + adds BOFFSET = - 8 * SIZE, BOFFSET + ;; +#else + adds AOFFSET2 = 4 * SIZE, AOFFSET + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f96, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f97, SIZE + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f112, SIZE + ;; + STFD [AOFFSET] = f81, 5 * SIZE + STFD [AOFFSET2] = f113, 5 * SIZE + ;; + adds AOFFSET = - 8 * SIZE, AOFFSET + ;; +#endif + +#ifdef LN + adds C1 = -8 * SIZE, C1 + adds C5 = -8 * SIZE, C5 +#endif + ;; + STFD [C1 ] = f64, SIZE + STFD [C5 ] = f96, SIZE + ;; + STFD [C1 ] = f65, SIZE + STFD [C5 ] = f97, SIZE + ;; + STFD [C1 ] = f80, SIZE + STFD [C5 ] = f112, SIZE + ;; + STFD [C1 ] = f81, 5 * SIZE + STFD [C5 ] = f113, 5 * SIZE + ;; + mov f64 = f0 + mov f65 = f0 + mov f80 = f0 + mov f81 = f0 + mov f96 = f0 + mov f97 = f0 + mov f112 = f0 + mov f113 = f0 + ;; +#ifdef LN + adds C1 = -8 * SIZE, C1 + adds C5 = -8 * SIZE, C5 +#endif + ;; + cmp.ne p6, p0 = 1, I + ;; + adds I = -1, I + ;; + shladd r2 = K, ZBASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + shladd AORIG = r2, 2, AORIG +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = L, 2, AOFFSET + add BOFFSET = L, BOFFSET +#endif + ;; +#ifdef LT + adds KK = 4, KK +#elif defined LN + adds KK = -4, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + (p6) br.cond.dptk .L092 + ;; + .align 16 + +.L100: + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + tbit.z p6, p7 = M, 1 + (p6) br.cond.dptk .L110 + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 1 + ZBASE_SHIFT + } + { .mmi + shladd r3 = KK, ZBASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f66 = f0 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f67 = f0 + } + ;; +#else + { .mfi + add BOFFSET = r3, B + mov f66 = f0 +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f67 = f0 + shladd AOFFSET = r3, 1, AORIG + } + ;; +#endif + ;; + adds L = 1, L + ;; + { .mii + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + nop __LINE__ + adds L = -1, L + } + ;; + { .mmi + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + mov ar.lc = L + } + ;; + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L108 + ;; + .align 16 + +.L102: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, 
f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + FMA f80 = f32, f49, f80 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfb + lfetch.nt1 [PREB], 4 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f81 = f33, f49, f81 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f34, f48, f96 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f112 = f34, f49, f112 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f97 = f35, f48, f97 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f113 = f35, f49, f113 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f80 = f40, f57, f80 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f81 = f41, f57, f81 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f42, f56, f96 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f112 = f42, f57, f112 // A3 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f97 = f43, f56, f97 // A4 * B1 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f113 = f43, f57, f113 // A4 * B2 + br.cloop.sptk.few .L102 + } + ;; + { .mfb + nop __LINE__ + FCALC_A f64 = f64, f81 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_B f65 = f65, f80 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_A f96 = f96, f113 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_B f97 = f97, f112 + nop __LINE__ + } + ;; +.L108: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -2, KK +#else + adds r2 = -1, KK +#endif + ;; + shladd r2 = r2, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 1, AORIG + add BOFFSET = r2, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [BOFFSET] + adds BOFFSET = -2 * SIZE, BOFFSET + ;; + FSUB f64 = f72, f64 + FSUB_A f65 = f73, f65 + FSUB f96 = f88, f96 + FSUB_A f97 = f89, f97 + ;; +#else + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [AOFFSET] + adds AOFFSET = -2 * SIZE, AOFFSET + ;; + FSUB f64 = f72, f64 + FSUB f65 = f73, f65 + FSUB f96 = f88, f96 + FSUB f97 = f89, f97 + ;; +#endif + +#ifdef LN + adds AOFFSET = 6 * SIZE, AOFFSET + ;; + LDFPD f104, f105 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f106, f107 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f120, f121 = [AOFFSET] + ;; + FMPY f32 = f104, f96 + FMPY f33 = f105, f96 + ;; + FMA_C f96 = f105, f97, f32 + FMA_D f97 = f104, f97, f33 + ;; + FNMA f64 = f106, f96, f64 + FMA_A f65 = f107, f96, f65 + ;; + FMA_B f64 = f107, f97, f64 + FNMA f65 = f106, f97, f65 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + ;; +#endif + +#ifdef LT + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f90, f91 = [AOFFSET] + adds AOFFSET = - 6 * SIZE, AOFFSET + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + ;; + FNMA f96 = f74, f64, f96 + FMA_A f97 = f75, f64, f97 + ;; + FMA_B f96 = f75, f65, f96 + FNMA f97 = f74, f65, 
f97 + ;; + FMPY f32 = f90, f96 + FMPY f33 = f91, f96 + ;; + FMA_C f96 = f91, f97, f32 + FMA_D f97 = f90, f97, f33 + ;; +#endif + +#ifdef RN + LDFPD f72, f73 = [BOFFSET] + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f36 = f72, f96 + FMPY f37 = f73, f96 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f96 = f73, f97, f36 + FMA_D f97 = f72, f97, f37 + ;; +#endif + +#ifdef RT + LDFPD f72, f73 = [BOFFSET] + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f36 = f72, f96 + FMPY f37 = f73, f96 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f96 = f73, f97, f36 + FMA_D f97 = f72, f97, f37 + ;; +#endif + +#if defined(LN) || defined(LT) + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + ;; + STFD [BOFFSET] = f96, SIZE + ;; + STFD [BOFFSET] = f97, SIZE + ;; + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; +#else + adds AOFFSET2 = 4 * SIZE, AOFFSET + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + ;; + STFD [AOFFSET] = f96, SIZE + ;; + STFD [AOFFSET] = f97, SIZE + ;; + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; +#endif + +#ifdef LN + adds C1 = -4 * SIZE, C1 + adds C5 = -4 * SIZE, C5 +#endif + ;; + STFD [C1 ] = f64, SIZE + ;; + STFD [C1 ] = f65, SIZE + ;; + STFD [C1 ] = f96, SIZE + ;; + STFD [C1 ] = f97, SIZE + ;; + mov f64 = f0 + mov f65 = f0 + mov f80 = f0 + mov f81 = f0 + mov f96 = f0 + mov f97 = f0 + mov f112 = f0 + mov f113 = f0 + ;; +#ifdef LN + adds C1 = -4 * SIZE, C1 + adds C5 = -4 * SIZE, C5 +#endif + ;; + cmp.ne p6, p0 = 1, I + ;; + adds I = -1, I + ;; + shladd r2 = K, ZBASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + shladd AORIG = r2, 1, AORIG +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = L, 1, AOFFSET + add BOFFSET = L, BOFFSET +#endif + ;; +#ifdef LT + adds KK = 2, KK +#elif defined LN + adds KK = -2, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + .align 16 + +.L110: + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + tbit.z p6, p7 = M, 0 + (p6) br.cond.dptk .L119 + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, ZBASE_SHIFT + } + { .mmi + shladd r3 = KK, ZBASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f66 = f0 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f67 = f0 + } + ;; +#else + { .mfi + add BOFFSET = r3, B + mov f66 = f0 +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f67 = f0 + add AOFFSET = r3, AORIG + } + ;; +#endif + ;; + adds L = 1, L + ;; + { .mii + nop __LINE__ + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + cmp.eq p3, p0 = r0, r0 + adds L = -1, L + } + ;; + { .mmi + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + mov ar.lc = L + } + ;; + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L118 + ;; + .align 16 + +.L112: + { .mfi + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + lfetch.nt1 [PREB], 4 * SIZE + FMA f80 = f32, f49, f80 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mmf + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + } + { .mmf + nop __LINE__ + nop __LINE__ 
+ FMA f81 = f33, f49, f81 // A2 * B2 + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f80 = f40, f57, f80 // A1 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + adds L = -1, L + } + { .mfb + (p3) FMA f81 = f41, f57, f81 // A2 * B2 + br.cloop.sptk.few .L112 + } + ;; + { .mfb + nop __LINE__ + FCALC_A f64 = f64, f81 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_B f65 = f65, f80 + nop __LINE__ + } + ;; +.L118: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -1, KK +#else + adds r2 = -1, KK +#endif + ;; + shladd r2 = r2, ZBASE_SHIFT, r0 + ;; + add AOFFSET = r2, AORIG + add BOFFSET = r2, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f72, f73 = [BOFFSET] + ;; + FSUB f64 = f72, f64 + FSUB_A f65 = f73, f65 + ;; +#else + LDFPD f72, f73 = [AOFFSET] + ;; + FSUB f64 = f72, f64 + FSUB f65 = f73, f65 + ;; +#endif + +#ifdef LN + LDFPD f120, f121 = [AOFFSET] + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + ;; +#endif + +#ifdef LT + LDFPD f72, f73 = [AOFFSET] + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + ;; +#endif + +#ifdef RN + LDFPD f72, f73 = [BOFFSET] + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + ;; +#endif + +#ifdef RT + LDFPD f72, f73 = [BOFFSET] + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + ;; +#endif + +#if defined(LN) || defined(LT) + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + ;; + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; +#else + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + ;; + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; +#endif + +#ifdef LN + adds C1 = -2 * SIZE, C1 +#endif + ;; + STFD [C1 ] = f64, SIZE + ;; + STFD [C1 ] = f65, SIZE + ;; + mov f64 = f0 + mov f65 = f0 + mov f80 = f0 + mov f81 = f0 + ;; +#ifdef LN + adds C1 = -2 * SIZE, C1 +#endif + ;; + cmp.ne p6, p0 = 1, I + ;; + adds I = -1, I + ;; + shladd r2 = K, ZBASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + add AORIG = r2, AORIG +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, ZBASE_SHIFT, r0 + ;; + add AOFFSET = L, AOFFSET + add BOFFSET = L, BOFFSET +#endif + ;; +#ifdef LT + adds KK = 1, KK +#elif defined LN + adds KK = -1, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + .align 16 + +.L119: +#ifdef LN + shladd KK8 = K, ZBASE_SHIFT, r0 + ;; + add B = KK8, B +#endif + +#if defined(LT) || defined(RN) + mov B = BOFFSET +#endif + +#ifdef RN + adds KK = 1, KK +#endif + +#ifdef RT + adds KK = -1, KK +#endif + ;; + { .mmi + mov AOFFSET = A + nop __LINE__ + } + ;; + .align 16 + +.L999: + { .mii + nop __LINE__ + mov ar.lc = ARLC + mov pr = PR, -1 + } + { .mib + nop __LINE__ +#ifdef TRMMKERNEL + mov ar.pfs = ARPFS +#else + nop __LINE__ +#endif + br.ret.sptk.many b0 + } + EPILOGUE + diff --git a/kernel/ia64/ztrsm_kernel_RT.S b/kernel/ia64/ztrsm_kernel_RT.S new file mode 100644 index 0000000000..582e2e5bfc --- /dev/null +++ b/kernel/ia64/ztrsm_kernel_RT.S @@ -0,0 +1,10837 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef DOUBLE +#define PREFETCHSIZE (16 * 8) +#else +#define PREFETCHSIZE (32 * 8) +#endif + +#ifndef LN +#define CPREFETCHSIZE 7 +#else +#define CPREFETCHSIZE -8 +#endif +#define CPREFETCH lfetch.excl.nt1 + +#define M r32 +#define N r33 +#define K r34 +#define A r37 +#define B r38 +#define C r39 +#define LDC r35 + +#define I r15 +#define J r16 +#define AOFFSET r17 +#define BOFFSET r18 +#define TEMP r19 +#define L r20 + +#define C1 r21 +#define C2 r22 +#define C3 r23 +#define C4 r24 +#define C5 r25 +#define C6 r26 +#define C7 r27 +#define C8 r28 + +#define PREA r8 +#define PREB r9 +#define PREC r10 +#define SP r12 +#define ARLC r29 +#define PR r30 +#define ARPFS r31 + +#define ALPHA_R f8 +#define ALPHA_I f9 + +#define AORIG loc0 +#define KK loc1 +#define KK8 loc2 +#define OFFSET loc3 +#define AOFFSET2 loc4 +#define BOFFSET2 loc5 + +#ifndef CONJ +#define FCALC_A FSUB +#define FCALC_B FADD +#define FMA_A FNMA +#define FMA_B FMA +#else +#define FCALC_A FADD +#define FCALC_B FSUB +#define FMA_A FMA +#define FMA_B FNMA +#endif + +#ifndef CONJ +#define FCALC_C FMA +#define FCALC_D FNMA +#else +#define FCALC_C FNMA +#define FCALC_D FMA +#endif + +#ifndef CONJ +#define FMA_C FNMA +#define FMA_D FMA +#define FSUB_A FSUB +#else +#define FMA_C FMA +#define FMA_D FMS +#define FSUB_A FADD +#endif + + + PROLOGUE + .prologue + PROFCODE + + { .mfi + .save ar.pfs, ARPFS + alloc ARPFS = ar.pfs, 8, 8, 0, 0 + mov f64 = f0 + adds r14 = 16, SP + } + { .mfi + nop __LINE__ + mov f65 = f0 + adds r15 = 24, SP + } + ;; + { .mfi + ld8 LDC = [r14] + mov f81 = f0 + mov PR = pr + } + { .mfi + ld8 OFFSET = [r15] + mov f96 = f0 + } + ;; + { .mfi + shladd LDC = LDC, ZBASE_SHIFT, r0 + mov f97 = f0 + } + { .mfi + nop __LINE__ + mov f113 = 
f0 + } + ;; +#ifdef LN + { .mmi + setf.sig f32 = M + setf.sig f33 = K + shladd C = M, ZBASE_SHIFT, C + } + ;; + {.mmf + nop __LINE__ + nop __LINE__ + xmpy.l f32 = f32, f33 + } + ;; + { .mmi + getf.sig r2 = f32 + ;; + nop __LINE__ + shladd A = r2, ZBASE_SHIFT, A + } + ;; +#endif + +#ifdef RN + sub KK = r0, OFFSET +#endif + +#ifdef RT + { .mmi + setf.sig f32 = N + setf.sig f33 = K + nop __LINE__ + } + ;; + { .mmi + setf.sig f34 = LDC + nop __LINE__ + nop __LINE__ + } + ;; + { .mmf + nop __LINE__ + nop __LINE__ + xmpy.l f33 = f32, f33 + } + { .mmf + nop __LINE__ + sub KK = N, OFFSET + xmpy.l f34 = f32, f34 + } + ;; + { .mmi + getf.sig r2 = f33 + getf.sig r3 = f34 + } + ;; + shladd B = r2, ZBASE_SHIFT, B + add C = r3, C +#endif + ;; + .body + { .mfi + nop __LINE__ + mov f80 = f0 + mov ARLC = ar.lc + } + { .mfb + mov f112 = f0 + } + ;; + ;; + shr I = M, 2 + tbit.z p6, p0 = N, 0 + (p6) br.cond.dpnt .L050 + ;; + +#ifdef RT + { .mmi + shl r2 = K, ZBASE_SHIFT + } + ;; + { .mmi + sub B = B, r2 + sub C = C, LDC + nop __LINE__ + } + ;; +#endif + mov C1 = C + +#ifdef LN + add KK = M, OFFSET +#elif defined LT + mov KK = OFFSET +#else + nop __LINE__ +#endif + ;; +#if defined(LN) || defined(RT) + mov AORIG = A +#else + mov AOFFSET = A +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + { .mib + cmp.eq p6, p7 = 0, I +#ifndef RT + add C = LDC, C +#else + nop __LINE__ +#endif + (p6) br.cond.dpnt .L100 + } + ;; + .align 16 + +.L092: + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 2 + ZBASE_SHIFT + } + { .mmi + shladd r3 = KK, ZBASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f66 = f0 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f67 = f0 + } + ;; +#else + { .mfi + add BOFFSET = r3, B + mov f66 = f0 +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f67 = f0 + shladd AOFFSET = r3, 2, AORIG + } + ;; +#endif + ;; + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + adds L = 1, L + ;; + { .mfi + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + } + { .mfi + adds PREC = CPREFETCHSIZE * SIZE, C1 + shr L = L, 1 + } + ;; + { .mfi + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + adds L = -1, L + } + { .mmf + (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + CPREFETCH [PREC] + } + ;; + { .mfi + (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + mov ar.lc = L + } + { .mmi + adds C5 = 4 * SIZE, C1 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + cmp.eq p3, p0 = r0, r0 + } + ;; + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L098 + ;; + .align 16 + +.L093: +/* 1 */ + { .mfi + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA_B f65 = f32, f49, f65 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 4 * SIZE + FMA f80 = f34, f48, f80 // A3 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f81 = f34, f49, f81 // A3 * B2 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f36, f48, f96 // A5 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f97 = f36, f49, f97 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f112 = f38, f48, f112 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f113 = f38, f49, f113 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, 
f43 = [AOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f64 = f33, f49, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f81 = f35, f48, f81 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f80 = f35, f49, f80 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f97 = f37, f48, f97 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f96 = f37, f49, f96 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f113 = f39, f48, f113 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f112 = f39, f49, f112 // A8 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f42, f56, f80 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f81 = f42, f57, f81 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f96 = f44, f56, f96 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f97 = f44, f57, f97 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f112 = f46, f56, f112 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f113 = f46, f57, f113 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f81 = f43, f56, f81 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f80 = f43, f57, f80 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f97 = f45, f56, f97 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f96 = f45, f57, f96 // A6 * B2 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f113 = f47, f56, f113 // A8 * B1 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA_A f112 = f47, f57, f112 // A8 * B2 + br.cloop.sptk.few .L093 + } + ;; +.L098: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -4, KK +#else + adds r2 = -1, KK +#endif + ;; + shladd r2 = r2, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 2, AORIG + add BOFFSET = r2, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [BOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [BOFFSET] + adds BOFFSET = -6 * SIZE, BOFFSET + ;; + FSUB f64 = f72, f64 + FSUB_A f65 = f73, f65 + FSUB f80 = f74, f80 + FSUB_A f81 = f75, f81 + FSUB f96 = f88, f96 + FSUB_A f97 = f89, f97 + FSUB f112 = f90, f112 + FSUB_A f113 = f91, f113 + ;; +#else + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [AOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [AOFFSET] + adds AOFFSET = -6 * SIZE, AOFFSET + ;; + FSUB f64 = f72, f64 + FSUB f65 = f73, f65 + FSUB f80 = f74, f80 + FSUB f81 = f75, f81 + FSUB f96 = f88, f96 + FSUB f97 = f89, f97 + FSUB f112 = f90, f112 + FSUB f113 = f91, f113 + ;; +#endif + +#ifdef LN + adds AOFFSET = 30 * SIZE, AOFFSET + ;; + LDFPD f72, f73 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f74, f75 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + 
;; + LDFPD f76, f77 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f78, f79 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f88, f89 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f90, f91 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f92, f93 = [AOFFSET] + adds AOFFSET = - 6 * SIZE, AOFFSET + ;; + LDFPD f104, f105 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f106, f107 = [AOFFSET] + adds AOFFSET = - 8 * SIZE, AOFFSET + ;; + LDFPD f120, f121 = [AOFFSET] + ;; + FMPY f32 = f72, f112 + FMPY f33 = f73, f112 + ;; + FMA_C f112 = f73, f113, f32 + FMA_D f113 = f72, f113, f33 + ;; + FNMA f96 = f74, f112, f96 + FMA_A f97 = f75, f112, f97 + FNMA f80 = f76, f112, f80 + FMA_A f81 = f77, f112, f81 + FNMA f64 = f78, f112, f64 + FMA_A f65 = f79, f112, f65 + ;; + FMA_B f96 = f75, f113, f96 + FNMA f97 = f74, f113, f97 + FMA_B f80 = f77, f113, f80 + FNMA f81 = f76, f113, f81 + FMA_B f64 = f79, f113, f64 + FNMA f65 = f78, f113, f65 + ;; + FMPY f32 = f88, f96 + FMPY f33 = f89, f96 + ;; + FMA_C f96 = f89, f97, f32 + FMA_D f97 = f88, f97, f33 + ;; + FNMA f80 = f90, f96, f80 + FMA_A f81 = f91, f96, f81 + FNMA f64 = f92, f96, f64 + FMA_A f65 = f93, f96, f65 + ;; + FMA_B f80 = f91, f97, f80 + FNMA f81 = f90, f97, f81 + FMA_B f64 = f93, f97, f64 + FNMA f65 = f92, f97, f65 + ;; + FMPY f32 = f104, f80 + FMPY f33 = f105, f80 + ;; + FMA_C f80 = f105, f81, f32 + FMA_D f81 = f104, f81, f33 + ;; + FNMA f64 = f106, f80, f64 + FMA_A f65 = f107, f80, f65 + ;; + FMA_B f64 = f107, f81, f64 + FNMA f65 = f106, f81, f65 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + ;; +#endif + +#ifdef LT + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET], 2 * SIZE + ;; + LDFPD f76, f77 = [AOFFSET], 2 * SIZE + ;; + LDFPD f78, f79 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f90, f91 = [AOFFSET], 2 * SIZE + ;; + LDFPD f92, f93 = [AOFFSET], 2 * SIZE + ;; + LDFPD f94, f95 = [AOFFSET] + adds AOFFSET = 6 * SIZE, AOFFSET + ;; + LDFPD f108, f109 = [AOFFSET], 2 * SIZE + ;; + LDFPD f110, f111 = [AOFFSET] + adds AOFFSET = 8 * SIZE, AOFFSET + ;; + LDFPD f126, f127 = [AOFFSET] + adds AOFFSET = - 30 * SIZE, AOFFSET + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + ;; + FNMA f80 = f74, f64, f80 + FMA_A f81 = f75, f64, f81 + FNMA f96 = f76, f64, f96 + FMA_A f97 = f77, f64, f97 + FNMA f112 = f78, f64, f112 + FMA_A f113 = f79, f64, f113 + ;; + FMA_B f80 = f75, f65, f80 + FNMA f81 = f74, f65, f81 + FMA_B f96 = f77, f65, f96 + FNMA f97 = f76, f65, f97 + FMA_B f112 = f79, f65, f112 + FNMA f113 = f78, f65, f113 + ;; + FMPY f32 = f90, f80 + FMPY f33 = f91, f80 + ;; + FMA_C f80 = f91, f81, f32 + FMA_D f81 = f90, f81, f33 + ;; + FNMA f96 = f92, f80, f96 + FMA_A f97 = f93, f80, f97 + FNMA f112 = f94, f80, f112 + FMA_A f113 = f95, f80, f113 + ;; + FMA_B f96 = f93, f81, f96 + FNMA f97 = f92, f81, f97 + FMA_B f112 = f95, f81, f112 + FNMA f113 = f94, f81, f113 + ;; + FMPY f32 = f108, f96 + FMPY f33 = f109, f96 + ;; + FMA_C f96 = f109, f97, f32 + FMA_D f97 = f108, f97, f33 + ;; + FNMA f112 = f110, f96, f112 + FMA_A f113 = f111, f96, f113 + ;; + FMA_B f112 = f111, f97, f112 + FNMA f113 = f110, f97, f113 + ;; + FMPY f32 = f126, f112 + FMPY f33 = f127, f112 + ;; + FMA_C f112 = f127, f113, f32 + FMA_D f113 = f126, f113, f33 + ;; +#endif + +#ifdef RN + LDFPD f72, f73 = [BOFFSET] + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f34 = 
f72, f80 + FMPY f35 = f73, f80 + FMPY f36 = f72, f96 + FMPY f37 = f73, f96 + FMPY f38 = f72, f112 + FMPY f39 = f73, f112 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f80 = f73, f81, f34 + FMA_D f81 = f72, f81, f35 + FMA_C f96 = f73, f97, f36 + FMA_D f97 = f72, f97, f37 + FMA_C f112 = f73, f113, f38 + FMA_D f113 = f72, f113, f39 + ;; +#endif + +#ifdef RT + LDFPD f72, f73 = [BOFFSET] + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f34 = f72, f80 + FMPY f35 = f73, f80 + FMPY f36 = f72, f96 + FMPY f37 = f73, f96 + FMPY f38 = f72, f112 + FMPY f39 = f73, f112 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f80 = f73, f81, f34 + FMA_D f81 = f72, f81, f35 + FMA_C f96 = f73, f97, f36 + FMA_D f97 = f72, f97, f37 + FMA_C f112 = f73, f113, f38 + FMA_D f113 = f72, f113, f39 + ;; +#endif + +#if defined(LN) || defined(LT) + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f96, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f97, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f112, SIZE + ;; + STFD [BOFFSET] = f81, 5 * SIZE + STFD [BOFFSET2] = f113, 5 * SIZE + ;; + adds BOFFSET = - 8 * SIZE, BOFFSET + ;; +#else + adds AOFFSET2 = 4 * SIZE, AOFFSET + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f96, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f97, SIZE + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f112, SIZE + ;; + STFD [AOFFSET] = f81, 5 * SIZE + STFD [AOFFSET2] = f113, 5 * SIZE + ;; + adds AOFFSET = - 8 * SIZE, AOFFSET + ;; +#endif + +#ifdef LN + adds C1 = -8 * SIZE, C1 + adds C5 = -8 * SIZE, C5 +#endif + ;; + STFD [C1 ] = f64, SIZE + STFD [C5 ] = f96, SIZE + ;; + STFD [C1 ] = f65, SIZE + STFD [C5 ] = f97, SIZE + ;; + STFD [C1 ] = f80, SIZE + STFD [C5 ] = f112, SIZE + ;; + STFD [C1 ] = f81, 5 * SIZE + STFD [C5 ] = f113, 5 * SIZE + ;; + mov f64 = f0 + mov f65 = f0 + mov f80 = f0 + mov f81 = f0 + mov f96 = f0 + mov f97 = f0 + mov f112 = f0 + mov f113 = f0 + ;; +#ifdef LN + adds C1 = -8 * SIZE, C1 + adds C5 = -8 * SIZE, C5 +#endif + ;; + cmp.ne p6, p0 = 1, I + ;; + adds I = -1, I + ;; + shladd r2 = K, ZBASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + shladd AORIG = r2, 2, AORIG +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = L, 2, AOFFSET + add BOFFSET = L, BOFFSET +#endif + ;; +#ifdef LT + adds KK = 4, KK +#elif defined LN + adds KK = -4, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + (p6) br.cond.dptk .L092 + ;; + .align 16 + +.L100: + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + tbit.z p6, p7 = M, 1 + (p6) br.cond.dptk .L110 + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 1 + ZBASE_SHIFT + } + { .mmi + shladd r3 = KK, ZBASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f66 = f0 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f67 = f0 + } + ;; +#else + { .mfi + add BOFFSET = r3, B + mov f66 = f0 +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f67 = f0 + shladd AOFFSET = r3, 1, AORIG + } + ;; +#endif + ;; + adds L = 1, L + ;; + { .mii + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + (p7) LDFPD f34, f35 = [AOFFSET], 
2 * SIZE + nop __LINE__ + adds L = -1, L + } + ;; + { .mmi + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + cmp.eq p3, p0 = r0, r0 + mov ar.lc = L + } + ;; + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L108 + ;; + .align 16 + +.L102: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + FMA f80 = f32, f49, f80 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfb + lfetch.nt1 [PREB], 4 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f81 = f33, f49, f81 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f34, f48, f96 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f112 = f34, f49, f112 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f97 = f35, f48, f97 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA f113 = f35, f49, f113 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f80 = f40, f57, f80 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f81 = f41, f57, f81 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f42, f56, f96 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f112 = f42, f57, f112 // A3 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f97 = f43, f56, f97 // A4 * B1 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f113 = f43, f57, f113 // A4 * B2 + br.cloop.sptk.few .L102 + } + ;; + { .mfb + nop __LINE__ + FCALC_A f64 = f64, f81 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_B f65 = f65, f80 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_A f96 = f96, f113 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_B f97 = f97, f112 + nop __LINE__ + } + ;; +.L108: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -2, KK +#else + adds r2 = -1, KK +#endif + ;; + shladd r2 = r2, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 1, AORIG + add BOFFSET = r2, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [BOFFSET] + adds BOFFSET = -2 * SIZE, BOFFSET + ;; + FSUB f64 = f72, f64 + FSUB_A f65 = f73, f65 + FSUB f96 = f88, f96 + FSUB_A f97 = f89, f97 + ;; +#else + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [AOFFSET] + adds AOFFSET = -2 * SIZE, AOFFSET + ;; + FSUB f64 = f72, f64 + FSUB f65 = f73, f65 + FSUB f96 = f88, f96 + FSUB f97 = f89, f97 + ;; +#endif + +#ifdef LN + adds AOFFSET = 6 * SIZE, AOFFSET + ;; + LDFPD f104, f105 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f106, f107 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f120, f121 = [AOFFSET] + ;; + FMPY f32 = f104, f96 + FMPY f33 = f105, f96 + ;; + FMA_C f96 = f105, f97, f32 + FMA_D f97 = f104, f97, f33 + ;; + FNMA f64 = f106, f96, f64 + FMA_A f65 = f107, f96, f65 + ;; + FMA_B f64 = f107, f97, f64 + FNMA f65 = f106, f97, f65 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + ;; +#endif + +#ifdef LT + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + 
;; + LDFPD f90, f91 = [AOFFSET] + adds AOFFSET = - 6 * SIZE, AOFFSET + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + ;; + FNMA f96 = f74, f64, f96 + FMA_A f97 = f75, f64, f97 + ;; + FMA_B f96 = f75, f65, f96 + FNMA f97 = f74, f65, f97 + ;; + FMPY f32 = f90, f96 + FMPY f33 = f91, f96 + ;; + FMA_C f96 = f91, f97, f32 + FMA_D f97 = f90, f97, f33 + ;; +#endif + +#ifdef RN + LDFPD f72, f73 = [BOFFSET] + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f36 = f72, f96 + FMPY f37 = f73, f96 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f96 = f73, f97, f36 + FMA_D f97 = f72, f97, f37 + ;; +#endif + +#ifdef RT + LDFPD f72, f73 = [BOFFSET] + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f36 = f72, f96 + FMPY f37 = f73, f96 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f96 = f73, f97, f36 + FMA_D f97 = f72, f97, f37 + ;; +#endif + +#if defined(LN) || defined(LT) + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + ;; + STFD [BOFFSET] = f96, SIZE + ;; + STFD [BOFFSET] = f97, SIZE + ;; + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; +#else + adds AOFFSET2 = 4 * SIZE, AOFFSET + ;; + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + ;; + STFD [AOFFSET] = f96, SIZE + ;; + STFD [AOFFSET] = f97, SIZE + ;; + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; +#endif + +#ifdef LN + adds C1 = -4 * SIZE, C1 + adds C5 = -4 * SIZE, C5 +#endif + ;; + STFD [C1 ] = f64, SIZE + ;; + STFD [C1 ] = f65, SIZE + ;; + STFD [C1 ] = f96, SIZE + ;; + STFD [C1 ] = f97, SIZE + ;; + mov f64 = f0 + mov f65 = f0 + mov f80 = f0 + mov f81 = f0 + mov f96 = f0 + mov f97 = f0 + mov f112 = f0 + mov f113 = f0 + ;; +#ifdef LN + adds C1 = -4 * SIZE, C1 + adds C5 = -4 * SIZE, C5 +#endif + ;; + cmp.ne p6, p0 = 1, I + ;; + adds I = -1, I + ;; + shladd r2 = K, ZBASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + shladd AORIG = r2, 1, AORIG +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = L, 1, AOFFSET + add BOFFSET = L, BOFFSET +#endif + ;; +#ifdef LT + adds KK = 2, KK +#elif defined LN + adds KK = -2, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + .align 16 + +.L110: + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + tbit.z p6, p7 = M, 0 + (p6) br.cond.dptk .L119 + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, ZBASE_SHIFT + } + { .mmi + shladd r3 = KK, ZBASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f66 = f0 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f67 = f0 + } + ;; +#else + { .mfi + add BOFFSET = r3, B + mov f66 = f0 +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f67 = f0 + add AOFFSET = r3, AORIG + } + ;; +#endif + ;; + adds L = 1, L + ;; + { .mii + nop __LINE__ + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + cmp.eq p3, p0 = r0, r0 + adds L = -1, L + } + ;; + { .mmi + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + mov ar.lc = L + } + ;; + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L118 + ;; + .align 16 + +.L112: + { .mfi + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L 
+ } + { .mfi + lfetch.nt1 [PREB], 4 * SIZE + FMA f80 = f32, f49, f80 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mmf + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f81 = f33, f49, f81 // A2 * B2 + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f80 = f40, f57, f80 // A1 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + adds L = -1, L + } + { .mfb + (p3) FMA f81 = f41, f57, f81 // A2 * B2 + br.cloop.sptk.few .L112 + } + ;; + { .mfb + nop __LINE__ + FCALC_A f64 = f64, f81 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_B f65 = f65, f80 + nop __LINE__ + } + ;; +.L118: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -1, KK +#else + adds r2 = -1, KK +#endif + ;; + shladd r2 = r2, ZBASE_SHIFT, r0 + ;; + add AOFFSET = r2, AORIG + add BOFFSET = r2, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f72, f73 = [BOFFSET] + ;; + FSUB f64 = f72, f64 + FSUB_A f65 = f73, f65 + ;; +#else + LDFPD f72, f73 = [AOFFSET] + ;; + FSUB f64 = f72, f64 + FSUB f65 = f73, f65 + ;; +#endif + +#ifdef LN + LDFPD f120, f121 = [AOFFSET] + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + ;; +#endif + +#ifdef LT + LDFPD f72, f73 = [AOFFSET] + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + ;; +#endif + +#ifdef RN + LDFPD f72, f73 = [BOFFSET] + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + ;; +#endif + +#ifdef RT + LDFPD f72, f73 = [BOFFSET] + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + ;; +#endif + +#if defined(LN) || defined(LT) + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + ;; + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; +#else + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + ;; + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; +#endif + +#ifdef LN + adds C1 = -2 * SIZE, C1 +#endif + ;; + STFD [C1 ] = f64, SIZE + ;; + STFD [C1 ] = f65, SIZE + ;; + mov f64 = f0 + mov f65 = f0 + mov f80 = f0 + mov f81 = f0 + ;; +#ifdef LN + adds C1 = -2 * SIZE, C1 +#endif + ;; + cmp.ne p6, p0 = 1, I + ;; + adds I = -1, I + ;; + shladd r2 = K, ZBASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + add AORIG = r2, AORIG +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, ZBASE_SHIFT, r0 + ;; + add AOFFSET = L, AOFFSET + add BOFFSET = L, BOFFSET +#endif + ;; +#ifdef LT + adds KK = 1, KK +#elif defined LN + adds KK = -1, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + .align 16 + +.L119: +#ifdef LN + shladd KK8 = K, ZBASE_SHIFT, r0 + ;; + add B = KK8, B +#endif + +#if defined(LT) || defined(RN) + mov B = BOFFSET +#endif + +#ifdef RN + adds KK = 1, KK +#endif + +#ifdef RT + adds KK = -1, KK +#endif + ;; + { .mmi + mov AOFFSET = A + nop __LINE__ + } + ;; + .align 16 + +.L050: + { .mmi + shr I = M, 2 + } + { .mib + tbit.z p6, p0 = N, 1 + (p6) br.cond.dpnt .L010 + } + ;; + +#ifdef RT + { .mmi + shladd r3 = LDC, 1, r0 + nop __LINE__ + shl r2 = K, 1 + ZBASE_SHIFT + } + ;; + { .mmi + sub B = B, r2 + sub C = C, r3 + nop __LINE__ + } + ;; +#endif + + mov C1 = C + add C2 = 
LDC, C + ;; +#ifdef LN + add KK = M, OFFSET +#elif defined LT + mov KK = OFFSET +#else + nop __LINE__ +#endif + ;; +#if defined(LN) || defined(RT) + mov AORIG = A +#else + mov AOFFSET = A +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + { .mib + cmp.eq p6, p7 = 0, I +#ifndef RT + shladd C = LDC, 1, C +#else + nop __LINE__ +#endif + (p6) br.cond.dpnt .L060 + } + ;; + .align 16 + +.L052: + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 2 + ZBASE_SHIFT + } + { .mmi + shladd r3 = KK, ZBASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f66 = f0 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f67 = f0 + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 1, B + mov f66 = f0 +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f67 = f0 + shladd AOFFSET = r3, 2, AORIG + } + ;; +#endif + + { .mfi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f82 = f0 + adds PREC = CPREFETCHSIZE * SIZE, C1 + } + { .mfi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f83 = f0 + nop __LINE__ + } + ;; + { .mfi + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + mov f98 = f0 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + } + { .mfi + cmp.eq p3, p0 = r0, r0 + mov f99 = f0 + adds L = 1, L + } + ;; + { .mfi + (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + mov f114 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + CPREFETCH [PREC], LDC + mov f115 = f0 + shr L = L, 1 + } + ;; + { .mmi + (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + adds C5 = 4 * SIZE, C1 + adds L = -1, L + } + ;; + { .mmi + CPREFETCH [PREC], LDC + adds C6 = 4 * SIZE, C2 + mov ar.lc = L + } + ;; + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L058 + ;; + .align 16 + +.L053: + { .mfb + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f65 = f32, f49, f65 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA_B f81 = f32, f51, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f96 = f34, f48, f96 // A3 * B1 + nop __LINE__ + } + { .mfi + FMA_B f97 = f34, f49, f97 // A3 * B2 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f112 = f34, f50, f112 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f113 = f34, f51, f113 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f64 = f33, f49, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f80 = f33, f51, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f97 = f35, f48, f97 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f96 = f35, f49, f96 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f113 = f35, f50, f113 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f112 = f35, f51, f112 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f66 = f36, f48, f66 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f67 = 
f36, f49, f67 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f82 = f36, f50, f82 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f83 = f36, f51, f83 // A5 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f98 = f38, f48, f98 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f99 = f38, f49, f99 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f114 = f38, f50, f114 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f115 = f38, f51, f115 // A7 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f67 = f37, f48, f67 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f66 = f37, f49, f66 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f83 = f37, f50, f83 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f82 = f37, f51, f82 // A6 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f99 = f39, f48, f99 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f98 = f39, f49, f98 // A8 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f115 = f39, f50, f115 // A8 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f114 = f39, f51, f114 // A8 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f96 = f42, f56, f96 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f97 = f42, f57, f97 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f112 = f42, f58, f112 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f113 = f42, f59, f113 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f97 = f43, f56, f97 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f96 = f43, f57, f96 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f113 = f43, f58, f113 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f112 = f43, f59, f112 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f66 = f44, f56, f66 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f67 = f44, f57, f67 // A5 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f82 = f44, f58, f82 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f83 = f44, f59, f83 // A5 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f98 = f46, f56, f98 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f99 = f46, f57, f99 // A7 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f114 = f46, f58, f114 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f115 = f46, f59, f115 // A7 * B4 + nop __LINE__ + } + ;; + { .mfb + nop 
__LINE__ + (p3) FMA f67 = f45, f56, f67 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f66 = f45, f57, f66 // A6 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f83 = f45, f58, f83 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f82 = f45, f59, f82 // A6 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f99 = f47, f56, f99 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f98 = f47, f57, f98 // A8 * B2 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f115 = f47, f58, f115 // A8 * B3 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA_A f114 = f47, f59, f114 // A8 * B4 + br.cloop.sptk.few .L053 + } + ;; +.L058: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -4, KK +#else + adds r2 = -2, KK +#endif + ;; + shladd r2 = r2, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 2, AORIG + shladd BOFFSET = r2, 1, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [BOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [BOFFSET], 2 * SIZE + ;; + LDFPD f104, f105 = [BOFFSET], 2 * SIZE + ;; + LDFPD f106, f107 = [BOFFSET], 2 * SIZE + ;; + LDFPD f120, f121 = [BOFFSET], 2 * SIZE + ;; + LDFPD f122, f123 = [BOFFSET] + adds BOFFSET = -14 * SIZE, BOFFSET + ;; + FSUB f64 = f72, f64 + FSUB_A f65 = f73, f65 + FSUB f80 = f74, f80 + FSUB_A f81 = f75, f81 + FSUB f96 = f88, f96 + FSUB_A f97 = f89, f97 + FSUB f112 = f90, f112 + FSUB_A f113 = f91, f113 + + FSUB f66 = f104, f66 + FSUB_A f67 = f105, f67 + FSUB f82 = f106, f82 + FSUB_A f83 = f107, f83 + FSUB f98 = f120, f98 + FSUB_A f99 = f121, f99 + FSUB f114 = f122, f114 + FSUB_A f115 = f123, f115 + ;; +#else + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET], 2 * SIZE + ;; + LDFPD f76, f77 = [AOFFSET], 2 * SIZE + ;; + LDFPD f78, f79 = [AOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [AOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [AOFFSET], 2 * SIZE + ;; + LDFPD f92, f93 = [AOFFSET], 2 * SIZE + ;; + LDFPD f94, f95 = [AOFFSET] + adds AOFFSET = -14 * SIZE, AOFFSET + ;; + FSUB f64 = f72, f64 + FSUB f65 = f73, f65 + FSUB f96 = f74, f96 + FSUB f97 = f75, f97 + + FSUB f66 = f76, f66 + FSUB f67 = f77, f67 + FSUB f98 = f78, f98 + FSUB f99 = f79, f99 + + FSUB f80 = f88, f80 + FSUB f81 = f89, f81 + FSUB f112 = f90, f112 + FSUB f113 = f91, f113 + + FSUB f82 = f92, f82 + FSUB f83 = f93, f83 + FSUB f114 = f94, f114 + FSUB f115 = f95, f115 + ;; +#endif + +#ifdef LN + adds AOFFSET = 30 * SIZE, AOFFSET + ;; + LDFPD f72, f73 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f74, f75 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f76, f77 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f78, f79 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f88, f89 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f90, f91 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f92, f93 = [AOFFSET] + adds AOFFSET = - 6 * SIZE, AOFFSET + ;; + LDFPD f104, f105 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f106, f107 = [AOFFSET] + adds AOFFSET = - 8 * SIZE, AOFFSET + ;; + LDFPD f120, f121 = [AOFFSET] + ;; + FMPY f32 = f72, f98 + FMPY f33 = f73, f98 + FMPY f34 = f72, f114 + FMPY f35 = f73, f114 + ;; + FMA_C f98 = f73, f99, f32 + FMA_D f99 = f72, f99, f33 + FMA_C f114 = f73, f115, f34 + FMA_D f115 = f72, f115, f35 + ;; + FNMA f66 = f74, f98, f66 + FMA_A f67 = f75, f98, f67 + FNMA f82 = f74, f114, f82 + FMA_A f83 = 
f75, f114, f83 + ;; + FMA_B f66 = f75, f99, f66 + FNMA f67 = f74, f99, f67 + FMA_B f82 = f75, f115, f82 + FNMA f83 = f74, f115, f83 + ;; + FNMA f96 = f76, f98, f96 + FMA_A f97 = f77, f98, f97 + FNMA f112 = f76, f114, f112 + FMA_A f113 = f77, f114, f113 + ;; + FMA_B f96 = f77, f99, f96 + FNMA f97 = f76, f99, f97 + FMA_B f112 = f77, f115, f112 + FNMA f113 = f76, f115, f113 + ;; + FNMA f64 = f78, f98, f64 + FMA_A f65 = f79, f98, f65 + FNMA f80 = f78, f114, f80 + FMA_A f81 = f79, f114, f81 + ;; + FMA_B f64 = f79, f99, f64 + FNMA f65 = f78, f99, f65 + FMA_B f80 = f79, f115, f80 + FNMA f81 = f78, f115, f81 + ;; + FMPY f32 = f88, f66 + FMPY f33 = f89, f66 + FMPY f34 = f88, f82 + FMPY f35 = f89, f82 + ;; + FMA_C f66 = f89, f67, f32 + FMA_D f67 = f88, f67, f33 + FMA_C f82 = f89, f83, f34 + FMA_D f83 = f88, f83, f35 + ;; + FNMA f96 = f90, f66, f96 + FMA_A f97 = f91, f66, f97 + FNMA f112 = f90, f82, f112 + FMA_A f113 = f91, f82, f113 + ;; + FMA_B f96 = f91, f67, f96 + FNMA f97 = f90, f67, f97 + FMA_B f112 = f91, f83, f112 + FNMA f113 = f90, f83, f113 + ;; + FNMA f64 = f92, f66, f64 + FMA_A f65 = f93, f66, f65 + FNMA f80 = f92, f82, f80 + FMA_A f81 = f93, f82, f81 + ;; + FMA_B f64 = f93, f67, f64 + FNMA f65 = f92, f67, f65 + FMA_B f80 = f93, f83, f80 + FNMA f81 = f92, f83, f81 + ;; + FMPY f32 = f104, f96 + FMPY f33 = f105, f96 + FMPY f34 = f104, f112 + FMPY f35 = f105, f112 + ;; + FMA_C f96 = f105, f97, f32 + FMA_D f97 = f104, f97, f33 + FMA_C f112 = f105, f113, f34 + FMA_D f113 = f104, f113, f35 + ;; + FNMA f64 = f106, f96, f64 + FMA_A f65 = f107, f96, f65 + FNMA f80 = f106, f112, f80 + FMA_A f81 = f107, f112, f81 + ;; + FMA_B f64 = f107, f97, f64 + FNMA f65 = f106, f97, f65 + FMA_B f80 = f107, f113, f80 + FNMA f81 = f106, f113, f81 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + FMPY f34 = f120, f80 + FMPY f35 = f121, f80 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + FMA_C f80 = f121, f81, f34 + FMA_D f81 = f120, f81, f35 + ;; +#endif + +#ifdef LT + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET], 2 * SIZE + ;; + LDFPD f76, f77 = [AOFFSET], 2 * SIZE + ;; + LDFPD f78, f79 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f90, f91 = [AOFFSET], 2 * SIZE + ;; + LDFPD f92, f93 = [AOFFSET], 2 * SIZE + ;; + LDFPD f94, f95 = [AOFFSET] + adds AOFFSET = 6 * SIZE, AOFFSET + ;; + LDFPD f108, f109 = [AOFFSET], 2 * SIZE + ;; + LDFPD f110, f111 = [AOFFSET] + adds AOFFSET = 8 * SIZE, AOFFSET + ;; + LDFPD f126, f127 = [AOFFSET] + adds AOFFSET = - 30 * SIZE, AOFFSET + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f34 = f72, f80 + FMPY f35 = f73, f80 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f80 = f73, f81, f34 + FMA_D f81 = f72, f81, f35 + ;; + FNMA f96 = f74, f64, f96 + FMA_A f97 = f75, f64, f97 + FNMA f112 = f74, f80, f112 + FMA_A f113 = f75, f80, f113 + ;; + FMA_B f96 = f75, f65, f96 + FNMA f97 = f74, f65, f97 + FMA_B f112 = f75, f81, f112 + FNMA f113 = f74, f81, f113 + ;; + FNMA f66 = f76, f64, f66 + FMA_A f67 = f77, f64, f67 + FNMA f82 = f76, f80, f82 + FMA_A f83 = f77, f80, f83 + ;; + FMA_B f66 = f77, f65, f66 + FNMA f67 = f76, f65, f67 + FMA_B f82 = f77, f81, f82 + FNMA f83 = f76, f81, f83 + ;; + FNMA f98 = f78, f64, f98 + FMA_A f99 = f79, f64, f99 + FNMA f114 = f78, f80, f114 + FMA_A f115 = f79, f80, f115 + ;; + FMA_B f98 = f79, f65, f98 + FNMA f99 = f78, f65, f99 + FMA_B f114 = f79, f81, f114 + FNMA f115 = f78, f81, f115 + ;; + FMPY f32 = f90, f96 + FMPY f33 = f91, f96 + FMPY f34 = f90, f112 + FMPY f35 = f91, f112 + 
;; + FMA_C f96 = f91, f97, f32 + FMA_D f97 = f90, f97, f33 + FMA_C f112 = f91, f113, f34 + FMA_D f113 = f90, f113, f35 + ;; + FNMA f66 = f92, f96, f66 + FMA_A f67 = f93, f96, f67 + FNMA f82 = f92, f112, f82 + FMA_A f83 = f93, f112, f83 + ;; + FMA_B f66 = f93, f97, f66 + FNMA f67 = f92, f97, f67 + FMA_B f82 = f93, f113, f82 + FNMA f83 = f92, f113, f83 + ;; + FNMA f98 = f94, f96, f98 + FMA_A f99 = f95, f96, f99 + FNMA f114 = f94, f112, f114 + FMA_A f115 = f95, f112, f115 + ;; + FMA_B f98 = f95, f97, f98 + FNMA f99 = f94, f97, f99 + FMA_B f114 = f95, f113, f114 + FNMA f115 = f94, f113, f115 + ;; + FMPY f32 = f108, f66 + FMPY f33 = f109, f66 + FMPY f34 = f108, f82 + FMPY f35 = f109, f82 + ;; + FMA_C f66 = f109, f67, f32 + FMA_D f67 = f108, f67, f33 + FMA_C f82 = f109, f83, f34 + FMA_D f83 = f108, f83, f35 + ;; + FNMA f98 = f110, f66, f98 + FMA_A f99 = f111, f66, f99 + FNMA f114 = f110, f82, f114 + FMA_A f115 = f111, f82, f115 + ;; + FMA_B f98 = f111, f67, f98 + FNMA f99 = f110, f67, f99 + FMA_B f114 = f111, f83, f114 + FNMA f115 = f110, f83, f115 + ;; + FMPY f32 = f126, f98 + FMPY f33 = f127, f98 + FMPY f34 = f126, f114 + FMPY f35 = f127, f114 + ;; + FMA_C f98 = f127, f99, f32 + FMA_D f99 = f126, f99, f33 + FMA_C f114 = f127, f115, f34 + FMA_D f115 = f126, f115, f35 + ;; +#endif + +#ifdef RN + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f90, f91 = [BOFFSET] + adds BOFFSET = - 6 * SIZE, BOFFSET + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f34 = f72, f96 + FMPY f35 = f73, f96 + FMPY f36 = f72, f66 + FMPY f37 = f73, f66 + FMPY f38 = f72, f98 + FMPY f39 = f73, f98 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f96 = f73, f97, f34 + FMA_D f97 = f72, f97, f35 + FMA_C f66 = f73, f67, f36 + FMA_D f67 = f72, f67, f37 + FMA_C f98 = f73, f99, f38 + FMA_D f99 = f72, f99, f39 + ;; + FNMA f80 = f74, f64, f80 + FMA_A f81 = f75, f64, f81 + FNMA f112 = f74, f96, f112 + FMA_A f113 = f75, f96, f113 + FNMA f82 = f74, f66, f82 + FMA_A f83 = f75, f66, f83 + FNMA f114 = f74, f98, f114 + FMA_A f115 = f75, f98, f115 + ;; + FMA_B f80 = f75, f65, f80 + FNMA f81 = f74, f65, f81 + FMA_B f112 = f75, f97, f112 + FNMA f113 = f74, f97, f113 + FMA_B f82 = f75, f67, f82 + FNMA f83 = f74, f67, f83 + FMA_B f114 = f75, f99, f114 + FNMA f115 = f74, f99, f115 + ;; + FMPY f32 = f90, f80 + FMPY f33 = f91, f80 + FMPY f34 = f90, f112 + FMPY f35 = f91, f112 + FMPY f36 = f90, f82 + FMPY f37 = f91, f82 + FMPY f38 = f90, f114 + FMPY f39 = f91, f114 + ;; + FMA_C f80 = f91, f81, f32 + FMA_D f81 = f90, f81, f33 + FMA_C f112 = f91, f113, f34 + FMA_D f113 = f90, f113, f35 + FMA_C f82 = f91, f83, f36 + FMA_D f83 = f90, f83, f37 + FMA_C f114 = f91, f115, f38 + FMA_D f115 = f90, f115, f39 + ;; +#endif + +#ifdef RT + adds BOFFSET = 6 * SIZE, BOFFSET + ;; + LDFPD f104, f105 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f106, f107 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f120, f121 = [BOFFSET] + ;; + FMPY f32 = f104, f80 + FMPY f33 = f105, f80 + FMPY f34 = f104, f112 + FMPY f35 = f105, f112 + FMPY f36 = f104, f82 + FMPY f37 = f105, f82 + FMPY f38 = f104, f114 + FMPY f39 = f105, f114 + ;; + FMA_C f80 = f105, f81, f32 + FMA_D f81 = f104, f81, f33 + FMA_C f112 = f105, f113, f34 + FMA_D f113 = f104, f113, f35 + FMA_C f82 = f105, f83, f36 + FMA_D f83 = f104, f83, f37 + FMA_C f114 = f105, f115, f38 + FMA_D f115 = f104, f115, f39 + ;; + FNMA f64 = f106, f80, f64 + FMA_A f65 = f107, f80, f65 + FNMA f96 = f106, f112, 
f96 + FMA_A f97 = f107, f112, f97 + FNMA f66 = f106, f82, f66 + FMA_A f67 = f107, f82, f67 + FNMA f98 = f106, f114, f98 + FMA_A f99 = f107, f114, f99 + ;; + FMA_B f64 = f107, f81, f64 + FNMA f65 = f106, f81, f65 + FMA_B f96 = f107, f113, f96 + FNMA f97 = f106, f113, f97 + FMA_B f66 = f107, f83, f66 + FNMA f67 = f106, f83, f67 + FMA_B f98 = f107, f115, f98 + FNMA f99 = f106, f115, f99 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + FMPY f34 = f120, f96 + FMPY f35 = f121, f96 + FMPY f36 = f120, f66 + FMPY f37 = f121, f66 + FMPY f38 = f120, f98 + FMPY f39 = f121, f98 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + FMA_C f96 = f121, f97, f34 + FMA_D f97 = f120, f97, f35 + FMA_C f66 = f121, f67, f36 + FMA_D f67 = f120, f67, f37 + FMA_C f98 = f121, f99, f38 + FMA_D f99 = f120, f99, f39 + ;; +#endif + +#if defined(LN) || defined(LT) + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f96, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f97, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f112, SIZE + ;; + STFD [BOFFSET] = f81, 5 * SIZE + STFD [BOFFSET2] = f113, 5 * SIZE + ;; + STFD [BOFFSET] = f66, SIZE + STFD [BOFFSET2] = f98, SIZE + ;; + STFD [BOFFSET] = f67, SIZE + STFD [BOFFSET2] = f99, SIZE + ;; + STFD [BOFFSET] = f82, SIZE + STFD [BOFFSET2] = f114, SIZE + ;; + STFD [BOFFSET] = f83, 5 * SIZE + STFD [BOFFSET2] = f115, 5 * SIZE + ;; + adds BOFFSET = - 16 * SIZE, BOFFSET + ;; +#else + adds AOFFSET2 = 4 * SIZE, AOFFSET + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f66, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f67, SIZE + ;; + STFD [AOFFSET] = f96, SIZE + STFD [AOFFSET2] = f98, SIZE + ;; + STFD [AOFFSET] = f97, 5 * SIZE + STFD [AOFFSET2] = f99, 5 * SIZE + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f82, SIZE + ;; + STFD [AOFFSET] = f81, SIZE + STFD [AOFFSET2] = f83, SIZE + ;; + STFD [AOFFSET] = f112, SIZE + STFD [AOFFSET2] = f114, SIZE + ;; + STFD [AOFFSET] = f113, 5 * SIZE + STFD [AOFFSET2] = f115, 5 * SIZE + ;; + adds AOFFSET = - 16 * SIZE, AOFFSET + ;; +#endif + +#ifdef LN + adds C1 = -8 * SIZE, C1 + adds C2 = -8 * SIZE, C2 + adds C5 = -8 * SIZE, C5 + adds C6 = -8 * SIZE, C6 +#endif + ;; + STFD [C1 ] = f64, SIZE + STFD [C5 ] = f66, SIZE + ;; + STFD [C1 ] = f65, SIZE + STFD [C5 ] = f67, SIZE + ;; + STFD [C1 ] = f96, SIZE + STFD [C5 ] = f98, SIZE + ;; + STFD [C1 ] = f97, 5 * SIZE + STFD [C5 ] = f99, 5 * SIZE + ;; + STFD [C2 ] = f80, SIZE + STFD [C6 ] = f82, SIZE + ;; + STFD [C2 ] = f81, SIZE + STFD [C6 ] = f83, SIZE + ;; + STFD [C2 ] = f112, SIZE + STFD [C6 ] = f114, SIZE + ;; + STFD [C2 ] = f113, 5 * SIZE + STFD [C6 ] = f115, 5 * SIZE + ;; + mov f64 = f0 + mov f65 = f0 + mov f80 = f0 + mov f81 = f0 + mov f96 = f0 + mov f97 = f0 + mov f112 = f0 + mov f113 = f0 + ;; +#ifdef LN + adds C1 = -8 * SIZE, C1 + adds C2 = -8 * SIZE, C2 + adds C5 = -8 * SIZE, C5 + adds C6 = -8 * SIZE, C6 +#endif + ;; + cmp.ne p6, p0 = 1, I + ;; + adds I = -1, I + ;; + shladd r2 = K, ZBASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + shladd AORIG = r2, 2, AORIG +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = L, 2, AOFFSET + shladd BOFFSET = L, 1, BOFFSET +#endif + ;; +#ifdef LT + adds KK = 4, KK +#elif defined LN + adds KK = -4, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + (p6) br.cond.dptk .L052 + ;; + .align 16 + +.L060: + { .mib +#if defined(LT) || defined(RN) + mov L = KK 
+#else + sub L = K, KK +#endif + tbit.z p6, p7 = M, 1 + (p6) br.cond.dptk .L070 + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 1 + ZBASE_SHIFT + } + { .mmi + shladd r3 = KK, ZBASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 1, B +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + shladd AOFFSET = r3, 1, AORIG + } + ;; +#endif + ;; + adds L = 1, L + ;; + { .mmi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + tbit.z p12, p0 = L, 0 + } + { .mmi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + shr L = L, 1 + } + ;; + { .mmi + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + nop __LINE__ + adds L = -1, L + } + ;; + { .mmi + nop __LINE__ + nop __LINE__ + mov ar.lc = L + } + ;; + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L068 + ;; + .align 16 + +.L062: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + cmp.ne p4, p5 = 0, L + } + { .mfi + nop __LINE__ + FMA_B f65 = f32, f49, f65 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfb + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f81 = f32, f51, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f34, f48, f96 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f97 = f34, f49, f97 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f112 = f34, f50, f112 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f113 = f34, f51, f113 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f64 = f33, f49, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f80 = f33, f51, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f97 = f35, f48, f97 // A4 * B1 + } + { .mfb + FMA_A f96 = f35, f49, f96 // A4 * B2 + nop __LINE__ + } + + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f113 = f35, f50, f113 // A4 * B3 + nop __LINE__ + } + { .mfb + FMA_A f112 = f35, f51, f112 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f96 = f42, f56, f96 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f97 = f42, f57, f97 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f112 = f42, f58, f112 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f113 = f42, f59, f113 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f81 = f41, f58, f81 // A2 * B3 
+ nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f97 = f43, f56, f97 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f96 = f43, f57, f96 // A4 * B2 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f113 = f43, f58, f113 // A4 * B3 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA_A f112 = f43, f59, f112 // A4 * B4 + br.cloop.sptk.few .L062 + } + ;; +.L068: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -2, KK +#else + adds r2 = -2, KK +#endif + ;; + shladd r2 = r2, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 1, AORIG + shladd BOFFSET = r2, 1, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [BOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [BOFFSET] + adds BOFFSET = -6 * SIZE, BOFFSET + ;; + FSUB f64 = f72, f64 + FSUB_A f65 = f73, f65 + FSUB f80 = f74, f80 + FSUB_A f81 = f75, f81 + FSUB f96 = f88, f96 + FSUB_A f97 = f89, f97 + FSUB f112 = f90, f112 + FSUB_A f113 = f91, f113 + ;; +#else + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [AOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [AOFFSET] + adds AOFFSET = -6 * SIZE, AOFFSET + ;; + FSUB f64 = f72, f64 + FSUB f65 = f73, f65 + FSUB f96 = f74, f96 + FSUB f97 = f75, f97 + + FSUB f80 = f88, f80 + FSUB f81 = f89, f81 + FSUB f112 = f90, f112 + FSUB f113 = f91, f113 + ;; +#endif + +#ifdef LN + adds AOFFSET = 6 * SIZE, AOFFSET + ;; + LDFPD f104, f105 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f106, f107 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f120, f121 = [AOFFSET] + ;; + FMPY f32 = f104, f96 + FMPY f33 = f105, f96 + FMPY f34 = f104, f112 + FMPY f35 = f105, f112 + ;; + FMA_C f96 = f105, f97, f32 + FMA_D f97 = f104, f97, f33 + FMA_C f112 = f105, f113, f34 + FMA_D f113 = f104, f113, f35 + ;; + FNMA f64 = f106, f96, f64 + FMA_A f65 = f107, f96, f65 + FNMA f80 = f106, f112, f80 + FMA_A f81 = f107, f112, f81 + ;; + FMA_B f64 = f107, f97, f64 + FNMA f65 = f106, f97, f65 + FMA_B f80 = f107, f113, f80 + FNMA f81 = f106, f113, f81 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + FMPY f34 = f120, f80 + FMPY f35 = f121, f80 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + FMA_C f80 = f121, f81, f34 + FMA_D f81 = f120, f81, f35 + ;; +#endif + +#ifdef LT + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f90, f91 = [AOFFSET] + adds AOFFSET = - 6 * SIZE, AOFFSET + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f34 = f72, f80 + FMPY f35 = f73, f80 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f80 = f73, f81, f34 + FMA_D f81 = f72, f81, f35 + ;; + FNMA f96 = f74, f64, f96 + FMA_A f97 = f75, f64, f97 + FNMA f112 = f74, f80, f112 + FMA_A f113 = f75, f80, f113 + ;; + FMA_B f96 = f75, f65, f96 + FNMA f97 = f74, f65, f97 + FMA_B f112 = f75, f81, f112 + FNMA f113 = f74, f81, f113 + ;; + FMPY f32 = f90, f96 + FMPY f33 = f91, f96 + FMPY f34 = f90, f112 + FMPY f35 = f91, f112 + ;; + FMA_C f96 = f91, f97, f32 + FMA_D f97 = f90, f97, f33 + FMA_C f112 = f91, f113, f34 + FMA_D f113 = f90, f113, f35 + ;; +#endif + +#ifdef RN + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f90, f91 = [BOFFSET] + adds BOFFSET = - 6 * SIZE, BOFFSET + ;; + FMPY f32 = f72, f64 + 
FMPY f33 = f73, f64 + FMPY f34 = f72, f96 + FMPY f35 = f73, f96 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f96 = f73, f97, f34 + FMA_D f97 = f72, f97, f35 + ;; + FNMA f80 = f74, f64, f80 + FMA_A f81 = f75, f64, f81 + FNMA f112 = f74, f96, f112 + FMA_A f113 = f75, f96, f113 + ;; + FMA_B f80 = f75, f65, f80 + FNMA f81 = f74, f65, f81 + FMA_B f112 = f75, f97, f112 + FNMA f113 = f74, f97, f113 + + ;; + FMPY f32 = f90, f80 + FMPY f33 = f91, f80 + FMPY f34 = f90, f112 + FMPY f35 = f91, f112 + ;; + FMA_C f80 = f91, f81, f32 + FMA_D f81 = f90, f81, f33 + FMA_C f112 = f91, f113, f34 + FMA_D f113 = f90, f113, f35 + ;; +#endif + +#ifdef RT + adds BOFFSET = 6 * SIZE, BOFFSET + ;; + LDFPD f104, f105 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f106, f107 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f120, f121 = [BOFFSET] + ;; + FMPY f32 = f104, f80 + FMPY f33 = f105, f80 + FMPY f34 = f104, f112 + FMPY f35 = f105, f112 + ;; + FMA_C f80 = f105, f81, f32 + FMA_D f81 = f104, f81, f33 + FMA_C f112 = f105, f113, f34 + FMA_D f113 = f104, f113, f35 + ;; + FNMA f64 = f106, f80, f64 + FMA_A f65 = f107, f80, f65 + FNMA f96 = f106, f112, f96 + FMA_A f97 = f107, f112, f97 + ;; + FMA_B f64 = f107, f81, f64 + FNMA f65 = f106, f81, f65 + FMA_B f96 = f107, f113, f96 + FNMA f97 = f106, f113, f97 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + FMPY f34 = f120, f96 + FMPY f35 = f121, f96 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + FMA_C f96 = f121, f97, f34 + FMA_D f97 = f120, f97, f35 + ;; +#endif + +#if defined(LN) || defined(LT) + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f96, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f97, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f112, SIZE + ;; + STFD [BOFFSET] = f81, 5 * SIZE + STFD [BOFFSET2] = f113, 5 * SIZE + ;; + adds BOFFSET = - 8 * SIZE, BOFFSET + ;; +#else + adds AOFFSET2 = 4 * SIZE, AOFFSET + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f80, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f81, SIZE + ;; + STFD [AOFFSET] = f96, SIZE + STFD [AOFFSET2] = f112, SIZE + ;; + STFD [AOFFSET] = f97, 5 * SIZE + STFD [AOFFSET2] = f113, 5 * SIZE + ;; + adds AOFFSET = - 8 * SIZE, AOFFSET + ;; +#endif + +#ifdef LN + adds C1 = -4 * SIZE, C1 + adds C2 = -4 * SIZE, C2 +#endif + ;; + STFD [C1 ] = f64, SIZE + ;; + STFD [C1 ] = f65, SIZE + ;; + STFD [C1 ] = f96, SIZE + ;; + STFD [C1 ] = f97, SIZE + ;; + STFD [C2 ] = f80, SIZE + ;; + STFD [C2 ] = f81, SIZE + ;; + STFD [C2 ] = f112, SIZE + ;; + STFD [C2 ] = f113, SIZE + ;; + mov f64 = f0 + mov f65 = f0 + mov f80 = f0 + mov f81 = f0 + mov f96 = f0 + mov f97 = f0 + mov f112 = f0 + mov f113 = f0 + ;; +#ifdef LN + adds C1 = -4 * SIZE, C1 + adds C2 = -4 * SIZE, C2 +#endif + ;; + cmp.ne p6, p0 = 1, I + ;; + adds I = -1, I + ;; + shladd r2 = K, ZBASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + shladd AORIG = r2, 1, AORIG +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = L, 1, AOFFSET + shladd BOFFSET = L, 1, BOFFSET +#endif + ;; +#ifdef LT + adds KK = 2, KK +#elif defined LN + adds KK = -2, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + .align 16 + +.L070: + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + tbit.z p6, p7 = M, 0 + (p6) br.cond.dptk .L089 + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 
0 * SIZE, B + shl r2 = K, ZBASE_SHIFT + } + { .mmi + shladd r3 = KK, ZBASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 1, B +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + add AOFFSET = r3, AORIG + } + ;; +#endif + ;; + adds L = 1, L + ;; + { .mii + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + tbit.z p12, p0 = L, 0 + shr L = L, 1 + } + ;; + { .mmi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + adds L = -1, L + } + ;; + { .mmi + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + cmp.eq p3, p0 = r0, r0 + mov ar.lc = L + } + ;; + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L078 + ;; + .align 16 + +.L072: + { .mfb + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA f96 = f32, f49, f96 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 8 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA f112 = f32, f51, f112 // A1 * B4 + nop __LINE__ + } + ;; + { .mfi + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + } + { .mfi + nop __LINE__ + FMA f97 = f33, f49, f97 // A2 * B2 + } + ;; + { .mfi + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + } + { .mmf + nop __LINE__ + nop __LINE__ + FMA f113 = f33, f51, f113 // A2 * B4 + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + (p3) FMA f96 = f40, f57, f96 // A1 * B2 + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + (p3) FMA f112 = f40, f59, f112 // A1 * B4 + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA f97 = f41, f57, f97 // A2 * B2 + nop __LINE__ + } + ;; + { .mfi + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA f113 = f41, f59, f113 // A2 * B4 + br.cloop.sptk.few .L072 + } + ;; + { .mfb + nop __LINE__ + FCALC_A f64 = f64, f97 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_A f80 = f80, f113 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_B f65 = f65, f96 + nop __LINE__ + } + { .mfb + nop __LINE__ + FCALC_B f81 = f81, f112 + nop __LINE__ + } + ;; +.L078: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -1, KK +#else + adds r2 = -2, KK +#endif + ;; + shladd r2 = r2, ZBASE_SHIFT, r0 + ;; + add AOFFSET = r2, AORIG + shladd BOFFSET = r2, 1, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET] + adds BOFFSET = -2 * SIZE, BOFFSET + ;; + FSUB f64 = f72, f64 + FSUB_A f65 = f73, f65 + FSUB f80 = f74, f80 + FSUB_A f81 = f75, f81 + ;; +#else + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [AOFFSET] + adds AOFFSET = -2 * SIZE, AOFFSET + ;; + FSUB f64 = f72, f64 + FSUB f65 = f73, f65 + FSUB f80 = f88, f80 + FSUB f81 = f89, f81 + ;; +#endif + +#ifdef LN + LDFPD f120, f121 = [AOFFSET] + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + FMPY f34 = f120, f80 + FMPY f35 = f121, f80 + ;; + FMA_C f64 = 
f121, f65, f32 + FMA_D f65 = f120, f65, f33 + FMA_C f80 = f121, f81, f34 + FMA_D f81 = f120, f81, f35 + ;; +#endif + +#ifdef LT + LDFPD f72, f73 = [AOFFSET] + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f34 = f72, f80 + FMPY f35 = f73, f80 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f80 = f73, f81, f34 + FMA_D f81 = f72, f81, f35 + ;; +#endif + +#ifdef RN + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f90, f91 = [BOFFSET] + adds BOFFSET = - 6 * SIZE, BOFFSET + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + ;; + FNMA f80 = f74, f64, f80 + FMA_A f81 = f75, f64, f81 + ;; + FMA_B f80 = f75, f65, f80 + FNMA f81 = f74, f65, f81 + + ;; + FMPY f32 = f90, f80 + FMPY f33 = f91, f80 + ;; + FMA_C f80 = f91, f81, f32 + FMA_D f81 = f90, f81, f33 + ;; +#endif + +#ifdef RT + adds BOFFSET = 6 * SIZE, BOFFSET + ;; + LDFPD f104, f105 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f106, f107 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f120, f121 = [BOFFSET] + ;; + FMPY f32 = f104, f80 + FMPY f33 = f105, f80 + ;; + FMA_C f80 = f105, f81, f32 + FMA_D f81 = f104, f81, f33 + ;; + FNMA f64 = f106, f80, f64 + FMA_A f65 = f107, f80, f65 + ;; + FMA_B f64 = f107, f81, f64 + FNMA f65 = f106, f81, f65 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + ;; +#endif + +#if defined(LN) || defined(LT) + STFD [BOFFSET] = f64, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + ;; + STFD [BOFFSET] = f81, SIZE + ;; + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; +#else + STFD [AOFFSET] = f64, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + ;; + STFD [AOFFSET] = f80, SIZE + ;; + STFD [AOFFSET] = f81, SIZE + ;; + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; +#endif + +#ifdef LN + adds C1 = -2 * SIZE, C1 + adds C2 = -2 * SIZE, C2 +#endif + ;; + STFD [C1 ] = f64, SIZE + ;; + STFD [C1 ] = f65, SIZE + ;; + STFD [C2 ] = f80, SIZE + ;; + STFD [C2 ] = f81, SIZE + ;; + mov f64 = f0 + mov f65 = f0 + mov f80 = f0 + mov f81 = f0 + mov f96 = f0 + mov f97 = f0 + mov f112 = f0 + mov f113 = f0 + ;; +#ifdef LN + adds C1 = -2 * SIZE, C1 + adds C2 = -2 * SIZE, C2 +#endif + ;; + cmp.ne p6, p0 = 1, I + ;; + adds I = -1, I + ;; + shladd r2 = K, ZBASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + add AORIG = r2, AORIG +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, ZBASE_SHIFT, r0 + ;; + add AOFFSET = L, AOFFSET + shladd BOFFSET = L, 1, BOFFSET +#endif + ;; +#ifdef LT + adds KK = 1, KK +#elif defined LN + adds KK = -1, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + .align 16 + +.L089: +#ifdef LN + shladd KK8 = K, ZBASE_SHIFT, r0 + ;; + shladd B = KK8, 1, B +#endif + +#if defined(LT) || defined(RN) + mov B = BOFFSET +#endif + +#ifdef RN + adds KK = 2, KK +#endif + +#ifdef RT + adds KK = -2, KK +#endif + ;; + { .mmi + mov AOFFSET = A + nop __LINE__ + } + ;; + .align 16 + +.L010: + shr J = N, 2 + ;; + cmp.ge p6, p0 = 0, J + (p6) br.cond.dpnt .L999 + ;; + +.L010x: +#ifdef RT + { .mmi + shladd r3 = LDC, 2, r0 + nop __LINE__ + shl r2 = K, 2 + ZBASE_SHIFT + } + ;; + { .mmi + sub B = B, r2 + sub C = C, r3 + nop __LINE__ + } + ;; +#endif + { .mmi + mov C1 = C // coffset1 = c + 0 * ldc + add C2 = LDC, C // coffset2 = c + 1 * ldc + shr I = M, 2 + } + { .mmi + adds J = -1, J +#ifdef LN + add KK = M, OFFSET 
+#elif defined LT + mov KK = OFFSET +#else + nop __LINE__ +#endif +#if defined(LN) || defined(RT) + mov AORIG = A +#else + mov AOFFSET = A +#endif + } + ;; + ;; + { .mmi + shladd C3 = LDC, 1, C // coffset3 = c + 2 * ldc + shladd C4 = LDC, 1, C2 // coffset4 = c + 3 * ldc +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + } + { .mib + cmp.eq p6, p7 = 0, I +#ifndef RT + shladd C = LDC, 2, C // coffset += 8 * ldc +#else + nop __LINE__ +#endif + (p6) br.cond.dpnt .L020 + } + ;; + .align 16 + +.L011: + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 2 + ZBASE_SHIFT + } + { .mfi + shladd r3 = KK, ZBASE_SHIFT, r0 + mov f118 = f0 + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f66 = f0 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f67 = f0 + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 2, B + mov f66 = f0 +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f67 = f0 + shladd AOFFSET = r3, 2, AORIG + } + ;; +#endif + ;; + { .mfi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f82 = f0 + nop __LINE__ + } + { .mfi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f83 = f0 + adds PREC = CPREFETCHSIZE * SIZE, C1 + } + ;; + { .mfi + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + mov f98 = f0 + adds L = 1, L + } + { .mfi + (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + mov f99 = f0 + adds C5 = 4 * SIZE, C1 + } + ;; + { .mfi + (p7) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + mov f114 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + mov f115 = f0 + adds C6 = 4 * SIZE, C2 + } + ;; + { .mfi + (p7) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + mov f68 = f0 + shr L = L, 1 + } + { .mfi + setf.d f86 = r0 + mov f69 = f0 + adds C7 = 4 * SIZE, C3 + } + ;; + { .mfi + CPREFETCH [PREC], LDC + mov f84 = f0 + adds L = -1, L + } + { .mfi + setf.d f87 = r0 + mov f85 = f0 + adds C8 = 4 * SIZE, C4 + } + ;; + { .mfi + CPREFETCH [PREC], LDC + mov f100 = f0 + mov ar.lc = L + } + { .mfi + setf.d f102 = r0 + mov f101 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + CPREFETCH [PREC], LDC + mov f116 = f0 + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + } + { .mfi + setf.d f103 = r0 + mov f117 = f0 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + } + ;; + { .mfi + CPREFETCH [PREC] + mov f70 = f0 + cmp.eq p6, p0 = -1, L + } + { .mfb + setf.d f119 = r0 + mov f71 = f0 + (p6) br.cond.dpnt .L018 + } + ;; + .align 16 + +.L012: +/* 1 */ + { .mfi + lfetch.nt1 [PREA], 16 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + (p12) cmp.ne p3, p0 = 0, L + FMA_B f65 = f32, f49, f65 // A1 * B2 + nop __LINE__ + } + ;; +/* 2 */ + { .mfi + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + cmp.ne p4, p5 = 0, L + FMA_B f81 = f32, f51, f81 // A1 * B4 + nop __LINE__ + } + ;; +/* 3 */ + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + FMA_B f97 = f32, f53, f97 // A1 * B6 + nop __LINE__ + } + ;; +/* 4 */ + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + FMA_B f113 = f32, f55, f113 // A1 * B8 + nop __LINE__ + } + ;; +/* 5 */ + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + FMA_A f64 = f33, f49, f64 // A2 * B2 + nop __LINE__ + } + ;; +/* 6 */ 
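+// Note: this .L012 loop appears to be the software-pipelined update of one 4x4 complex
+// sub-block, unrolled twice in K; the /* 1 */ .. /* 64 */ markers seem to number the
+// multiply-accumulate groups of a single unrolled iteration.  FMA_A/FMA_B (and FMA_C/FMA_D
+// in the solve code) are presumably defined earlier in this file as FMA or FNMA so that the
+// one loop body covers every conjugation variant of the complex product.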
+ { .mfb + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + FMA_A f80 = f33, f51, f80 // A2 * B4 + nop __LINE__ + } + ;; +/* 7 */ + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + FMA_A f96 = f33, f53, f96 // A2 * B6 + nop __LINE__ + } + ;; +/* 8 */ + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + FMA_A f112 = f33, f55, f112 // A2 * B8 + nop __LINE__ + } + ;; +/* 9 */ + { .mfb + (p3) LDFPD f44, f45 = [AOFFSET], 2 * SIZE + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + FMA_B f67 = f34, f49, f67 // A3 * B2 + nop __LINE__ + } + ;; +/* 10 */ + { .mfb + (p3) LDFPD f46, f47 = [AOFFSET], 2 * SIZE + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + FMA_B f83 = f34, f51, f83 // A3 * B4 + nop __LINE__ + } + ;; +/* 11 */ + { .mfb + FMA f98 = f34, f52, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f99 = f34, f53, f99 // A3 * B6 + nop __LINE__ + } + ;; +/* 12 */ + { .mfb + FMA f114 = f34, f54, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f115 = f34, f55, f115 // A3 * B8 + nop __LINE__ + } + ;; +/* 13 */ + { .mfb + nop __LINE__ + FMA f67 = f35, f48, f67 // A4 * B1 + } + { .mfb + nop __LINE__ + FMA_A f66 = f35, f49, f66 // A4 * B2 + nop __LINE__ + } + ;; +/* 14 */ + { .mfb + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f82 = f35, f51, f82 // A4 * B4 + nop __LINE__ + } + ;; +/* 15 */ + { .mfb + FMA f99 = f35, f52, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f98 = f35, f53, f98 // A4 * B6 + nop __LINE__ + } + ;; +/* 16 */ + { .mfb + FMA f115 = f35, f54, f115 // A4 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f114 = f35, f55, f114 // A4 * B8 + nop __LINE__ + } + ;; +/* 17 */ + { .mfb + nop __LINE__ + FMA f68 = f36, f48, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f69 = f36, f49, f69 // A5 * B2 + nop __LINE__ + } + ;; +/* 18 */ + { .mfb + nop __LINE__ + FMA f84 = f36, f50, f84 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f85 = f36, f51, f85 // A5 * B4 + nop __LINE__ + } + ;; +/* 19 */ + { .mfb + nop __LINE__ + FMA f100 = f36, f52, f100 // A5 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f101 = f36, f53, f101 // A5 * B6 + nop __LINE__ + } + ;; +/* 20 */ + { .mfb + nop __LINE__ + FMA f116 = f36, f54, f116 // A5 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f117 = f36, f55, f117 // A5 * B8 + nop __LINE__ + } + ;; +/* 21 */ + { .mfb + nop __LINE__ + FMA f69 = f37, f48, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f68 = f37, f49, f68 // A6 * B2 + nop __LINE__ + } + ;; +/* 22 */ + { .mfb + nop __LINE__ + FMA f85 = f37, f50, f85 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f84 = f37, f51, f84 // A6 * B4 + nop __LINE__ + } + ;; +/* 23 */ + { .mfb + nop __LINE__ + FMA f101 = f37, f52, f101 // A6 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f100 = f37, f53, f100 // A6 * B6 + nop __LINE__ + } + ;; +/* 24 */ + { .mfb + nop __LINE__ + FMA f117 = f37, f54, f117 // A6 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f116 = f37, f55, f116 // A6 * B8 + nop __LINE__ + } + ;; +/* 25 */ + { .mfb + nop __LINE__ + FMA f70 = f38, f48, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f71 = f38, f49, f71 // A7 * B2 + nop __LINE__ + } + ;; +/* 26 */ + { .mfb 
+ nop __LINE__ + FMA f86 = f38, f50, f86 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f87 = f38, f51, f87 // A7 * B4 + nop __LINE__ + } + ;; +/* 27 */ + { .mfb + nop __LINE__ + FMA f102 = f38, f52, f102 // A7 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f103 = f38, f53, f103 // A7 * B6 + nop __LINE__ + } + ;; +/* 28 */ + { .mfb + nop __LINE__ + FMA f118 = f38, f54, f118 // A7 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f119 = f38, f55, f119 // A7 * B8 + nop __LINE__ + } + ;; +/* 29 */ + { .mfb + nop __LINE__ + FMA f71 = f39, f48, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f70 = f39, f49, f70 // A8 * B2 + nop __LINE__ + } + ;; +/* 30 */ + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f87 = f39, f50, f87 // A8 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f86 = f39, f51, f86 // A8 * B4 + nop __LINE__ + } + ;; +/* 31 */ + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f103 = f39, f52, f103 // A8 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f102 = f39, f53, f102 // A8 * B6 + nop __LINE__ + } + ;; +/* 32 */ + { .mfb + nop __LINE__ + FMA f119 = f39, f54, f119 // A8 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f118 = f39, f55, f118 // A8 * B8 + nop __LINE__ + } + ;; +/* 33 */ + { .mfb + nop __LINE__ + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 + nop __LINE__ + } + ;; +/* 34 */ + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 + nop __LINE__ + } + ;; +/* 35 */ + { .mfb + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f97 = f40, f61, f97 // A1 * B6 + nop __LINE__ + } + ;; +/* 36 */ + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f113 = f40, f63, f113 // A1 * B8 + nop __LINE__ + } + ;; +/* 37 */ + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 + nop __LINE__ + } + ;; +/* 38 */ + { .mfb + (p4) LDFPD f36, f37 = [AOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 + nop __LINE__ + } + ;; +/* 39 */ + { .mfb + (p4) LDFPD f38, f39 = [AOFFSET], 2 * SIZE + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f96 = f41, f61, f96 // A2 * B6 + nop __LINE__ + } + ;; +/* 40 */ + { .mfb + nop __LINE__ + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f112 = f41, f63, f112 // A2 * B8 + nop __LINE__ + } + ;; + /* 41 */ + { .mfb + nop __LINE__ + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f67 = f42, f57, f67 // A3 * B2 + nop __LINE__ + } + ;; +/* 42 */ + { .mfb + nop __LINE__ + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f83 = f42, f59, f83 // A3 * B4 + nop __LINE__ + } + ;; +/* 43 */ + { .mfb + nop __LINE__ + (p3) FMA f98 = f42, f60, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f99 = f42, f61, f99 // A3 * B6 + nop __LINE__ + } + ;; +/* 44 */ + { .mfb + 
nop __LINE__ + (p3) FMA f114 = f42, f62, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f115 = f42, f63, f115 // A3 * B8 + nop __LINE__ + } + ;; +/* 45 */ + { .mfb + nop __LINE__ + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f66 = f43, f57, f66 // A4 * B2 + nop __LINE__ + } + ;; +/* 46 */ + { .mfb + nop __LINE__ + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f82 = f43, f59, f82 // A4 * B4 + nop __LINE__ + } + ;; +/* 47 */ + { .mfb + nop __LINE__ + (p3) FMA f99 = f43, f60, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f98 = f43, f61, f98 // A4 * B6 + nop __LINE__ + } + ;; +/* 48 */ + { .mfb + nop __LINE__ + (p3) FMA f115 = f43, f62, f115 // A4 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f114 = f43, f63, f114 // A4 * B8 + nop __LINE__ + } + ;; +/* 49 */ + { .mfb + nop __LINE__ + (p3) FMA f68 = f44, f56, f68 // A5 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f69 = f44, f57, f69 // A5 * B2 + nop __LINE__ + } + ;; +/* 50 */ + { .mfb + nop __LINE__ + (p3) FMA f84 = f44, f58, f84 // A5 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f85 = f44, f59, f85 // A5 * B4 + nop __LINE__ + } + ;; +/* 51 */ + { .mfb + nop __LINE__ + (p3) FMA f100 = f44, f60, f100 // A5 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f101 = f44, f61, f101 // A5 * B6 + nop __LINE__ + } + ;; +/* 52 */ + { .mfb + nop __LINE__ + (p3) FMA f116 = f44, f62, f116 // A5 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f117 = f44, f63, f117 // A5 * B8 + nop __LINE__ + } + ;; +/* 53 */ + { .mfb + nop __LINE__ + (p3) FMA f69 = f45, f56, f69 // A6 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f68 = f45, f57, f68 // A6 * B2 + nop __LINE__ + } + ;; +/* 54 */ + { .mfb + nop __LINE__ + (p3) FMA f85 = f45, f58, f85 // A6 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f84 = f45, f59, f84 // A6 * B4 + nop __LINE__ + } + ;; +/* 55 */ + { .mfb + nop __LINE__ + (p3) FMA f101 = f45, f60, f101 // A6 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f100 = f45, f61, f100 // A6 * B6 + nop __LINE__ + } + ;; +/* 56 */ + { .mfb + nop __LINE__ + (p3) FMA f117 = f45, f62, f117 // A6 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f116 = f45, f63, f116 // A6 * B8 + nop __LINE__ + } + ;; +/* 57 */ + { .mfb + nop __LINE__ + (p3) FMA f70 = f46, f56, f70 // A7 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f71 = f46, f57, f71 // A7 * B2 + nop __LINE__ + } + ;; +/* 58 */ + { .mfb + nop __LINE__ + (p3) FMA f86 = f46, f58, f86 // A7 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f87 = f46, f59, f87 // A7 * B4 + nop __LINE__ + } + ;; +/* 59 */ + { .mfb + nop __LINE__ + (p3) FMA f102 = f46, f60, f102 // A7 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f103 = f46, f61, f103 // A7 * B6 + nop __LINE__ + } + ;; +/* 60 */ + { .mfb + nop __LINE__ + (p3) FMA f118 = f46, f62, f118 // A7 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f119 = f46, f63, f119 // A7 * B8 + nop __LINE__ + } + ;; +/* 61 */ + { .mfb + nop __LINE__ + (p3) FMA f71 = f47, f56, f71 // A8 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f70 = f47, f57, f70 // A8 * B2 + nop __LINE__ + } + ;; +/* 62 */ + { .mfb + nop __LINE__ + (p3) FMA f87 = f47, f58, f87 // A8 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f86 = f47, f59, f86 // A8 * B4 + nop __LINE__ 
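+// Note: the (p3)-guarded instructions seem to form the second K step of the unrolled body,
+// and the (p4)-guarded LDFPDs the reload of the next A/B operands; both predicates are
+// recomputed from the remaining count L each pass, so the final, possibly partial,
+// iteration appears to fall through this same code path.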
+ } + ;; +/* 63 */ + { .mfb + nop __LINE__ + (p3) FMA f103 = f47, f60, f103 // A8 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f102 = f47, f61, f102 // A8 * B6 + nop __LINE__ + } + ;; +/* 64 */ + { .mfi + nop __LINE__ + (p3) FMA f119 = f47, f62, f119 // A8 * B7 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA_A f118 = f47, f63, f118 // A8 * B8 + br.cloop.sptk.few .L012 + } + ;; + +.L018: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -4, KK +#else + adds r2 = -4, KK +#endif + ;; + shladd r2 = r2, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 2, AORIG + shladd BOFFSET = r2, 2, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET], 2 * SIZE + ;; + LDFPD f76, f77 = [BOFFSET], 2 * SIZE + ;; + LDFPD f78, f79 = [BOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [BOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [BOFFSET], 2 * SIZE + ;; + LDFPD f92, f93 = [BOFFSET], 2 * SIZE + ;; + { .mfi + LDFPD f94, f95 = [BOFFSET], 2 * SIZE + FSUB f64 = f72, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f65 = f73, f65 + nop __LINE__ + } + ;; + { .mfi + LDFPD f104, f105 = [BOFFSET], 2 * SIZE + FSUB f80 = f74, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f81 = f75, f81 + nop __LINE__ + } + ;; + { .mfi + LDFPD f106, f107 = [BOFFSET], 2 * SIZE + FSUB f96 = f76, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f97 = f77, f97 + nop __LINE__ + } + ;; + { .mfi + LDFPD f108, f109 = [BOFFSET], 2 * SIZE + FSUB f112 = f78, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f113 = f79, f113 + nop __LINE__ + } + ;; + { .mfi + LDFPD f110, f111 = [BOFFSET], 2 * SIZE + FSUB f66 = f88, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f67 = f89, f67 + nop __LINE__ + } + ;; + { .mfi + LDFPD f120, f121 = [BOFFSET], 2 * SIZE + FSUB f82 = f90, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f83 = f91, f83 + nop __LINE__ + } + ;; + { .mfi + LDFPD f122, f123 = [BOFFSET], 2 * SIZE + FSUB f98 = f92, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f99 = f93, f99 + nop __LINE__ + } + ;; + { .mfi + LDFPD f124, f125 = [BOFFSET], 2 * SIZE + FSUB f114 = f94, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f115 = f95, f115 + nop __LINE__ + } + ;; + { .mfi + LDFPD f126, f127 = [BOFFSET] + FSUB f68 = f104, f68 + adds BOFFSET = -30 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FSUB_A f69 = f105, f69 +#ifdef LN + adds AOFFSET = 30 * SIZE, AOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + LDFPD f72, f73 = [AOFFSET] + FSUB f84 = f106, f84 +#ifdef LN + adds AOFFSET = - 2 * SIZE, AOFFSET +#else + adds AOFFSET = 2 * SIZE, AOFFSET +#endif + } + { .mfi + nop __LINE__ + FSUB_A f85 = f107, f85 + nop __LINE__ + } + ;; + { .mfi + LDFPD f74, f75 = [AOFFSET] + FSUB f100 = f108, f100 +#ifdef LN + adds AOFFSET = - 2 * SIZE, AOFFSET +#else + adds AOFFSET = 2 * SIZE, AOFFSET +#endif + } + { .mfi + nop __LINE__ + FSUB_A f101 = f109, f101 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f116 = f110, f116 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f117 = f111, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f70 = f120, f70 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f71 = f121, f71 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f86 = f122, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f87 = f123, f87 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f102 = f124, f102 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f103 = f125, f103 + nop __LINE__ + } + 
;; + { .mfi + nop __LINE__ + FSUB f118 = f126, f118 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f119 = f127, f119 + nop __LINE__ + } + ;; +#else + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET], 2 * SIZE + ;; + LDFPD f76, f77 = [AOFFSET], 2 * SIZE + ;; + LDFPD f78, f79 = [AOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [AOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [AOFFSET], 2 * SIZE + ;; + { .mfi + LDFPD f92, f93 = [AOFFSET], 2 * SIZE + FSUB f64 = f72, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f65 = f73, f65 + nop __LINE__ + } + ;; + { .mfi + LDFPD f94, f95 = [AOFFSET], 2 * SIZE + FSUB f66 = f74, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f67 = f75, f67 + nop __LINE__ + } + ;; + { .mfi + LDFPD f104, f105 = [AOFFSET], 2 * SIZE + FSUB f68 = f76, f68 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f69 = f77, f69 + nop __LINE__ + } + ;; + { .mfi + LDFPD f106, f107 = [AOFFSET], 2 * SIZE + FSUB f70 = f78, f70 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f71 = f79, f71 + nop __LINE__ + } + ;; + { .mfi + LDFPD f108, f109 = [AOFFSET], 2 * SIZE + FSUB f80 = f88, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f81 = f89, f81 + nop __LINE__ + } + ;; + { .mfi + LDFPD f110, f111 = [AOFFSET], 2 * SIZE + FSUB f82 = f90, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f83 = f91, f83 + nop __LINE__ + } + ;; + { .mfi + LDFPD f120, f121 = [AOFFSET], 2 * SIZE + FSUB f84 = f92, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f85 = f93, f85 + nop __LINE__ + } + ;; + { .mfi + LDFPD f122, f123 = [AOFFSET], 2 * SIZE + FSUB f86 = f94, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f87 = f95, f87 + nop __LINE__ + } + ;; + { .mfi + LDFPD f124, f125 = [AOFFSET], 2 * SIZE + FSUB f96 = f104, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f97 = f105, f97 + nop __LINE__ + } + ;; + { .mfi + LDFPD f126, f127 = [AOFFSET] + FSUB f98 = f106, f98 + adds AOFFSET = -30 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FSUB f99 = f107, f99 +#ifdef RT + adds BOFFSET = 30 * SIZE, BOFFSET +#else + nop __LINE__ +#endif + } + ;; + { .mfi + LDFPD f72, f73 = [BOFFSET] + FSUB f100 = f108, f100 +#ifdef RN + adds BOFFSET = 2 * SIZE, BOFFSET +#else + adds BOFFSET = - 2 * SIZE, BOFFSET +#endif + } + { .mfi + nop __LINE__ + FSUB f101 = f109, f101 + nop __LINE__ + } + ;; + { .mfi + LDFPD f74, f75 = [BOFFSET] + FSUB f102 = f110, f102 +#ifdef RN + adds BOFFSET = 2 * SIZE, BOFFSET +#else + adds BOFFSET = - 2 * SIZE, BOFFSET +#endif + } + { .mfi + nop __LINE__ + FSUB f103 = f111, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f112 = f120, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f113 = f121, f113 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f114 = f122, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f115 = f123, f115 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f116 = f124, f116 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f117 = f125, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f118 = f126, f118 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f119 = f127, f119 + nop __LINE__ + } + ;; +#endif + +#ifdef LN + { .mfi + LDFPD f76, f77 = [AOFFSET] + FMPY f32 = f72, f70 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMPY f36 = f72, f102 + nop __LINE__ + } + ;; + { .mfi + LDFPD f78, f79 = [AOFFSET] + FMPY f33 = f73, f70 + adds AOFFSET = - 4 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMPY f37 = f73, f102 + nop __LINE__ + } + ;; + { .mfi + LDFPD f88, f89 = [AOFFSET] + FMPY f34 
= f72, f86 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMPY f38 = f72, f118 + nop __LINE__ + } + ;; + { .mfi + LDFPD f90, f91 = [AOFFSET] + FMPY f35 = f73, f86 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMPY f39 = f73, f118 + nop __LINE__ + } + ;; + { .mfi + LDFPD f92, f93 = [AOFFSET] + FMA_C f70 = f73, f71, f32 + adds AOFFSET = - 6 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMA_C f102 = f73, f103, f36 + adds C1 = -2 * SIZE, C1 + } + ;; + { .mfi + LDFPD f104, f105 = [AOFFSET] + FMA_D f71 = f72, f71, f33 + adds AOFFSET = - 2 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMA_D f103 = f72, f103, f37 + adds C2 = -2 * SIZE, C2 + } + ;; + { .mfi + LDFPD f106, f107 = [AOFFSET] + FMA_C f86 = f73, f87, f34 + adds AOFFSET = - 8 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMA_C f118 = f73, f119, f38 + adds C3 = -2 * SIZE, C3 + } + ;; + { .mfi + LDFPD f120, f121 = [AOFFSET] + FMA_D f87 = f72, f87, f35 + adds BOFFSET2 = 28 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA_D f119 = f72, f119, f39 + adds BOFFSET = 24 * SIZE, BOFFSET + } + ;; + { .mfi + STFD [BOFFSET] = f70, SIZE + FNMA f68 = f74, f70, f68 + adds C4 = -2 * SIZE, C4 + } + { .mfi + STFD [BOFFSET2] = f102, SIZE + FNMA f100 = f74, f102, f100 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f71, SIZE + FMA_A f69 = f75, f70, f69 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f103, SIZE + FMA_A f101 = f75, f102, f101 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f86, SIZE + FNMA f84 = f74, f86, f84 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f118, SIZE + FNMA f116 = f74, f118, f116 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f87, -11 * SIZE + FMA_A f85 = f75, f86, f85 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f119, -11 * SIZE + FMA_A f117 = f75, f118, f117 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f70, SIZE + FMA_B f68 = f75, f71, f68 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f102, SIZE + FMA_B f100 = f75, f103, f100 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f71, -3 * SIZE + FNMA f69 = f74, f71, f69 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f103, -3 * SIZE + FNMA f101 = f74, f103, f101 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f86, SIZE + FMA_B f84 = f75, f87, f84 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f118, SIZE + FMA_B f116 = f75, f119, f116 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f87, -3 * SIZE + FNMA f85 = f74, f87, f85 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f119, -3 * SIZE + FNMA f117 = f74, f119, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f66 = f76, f70, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f98 = f76, f102, f98 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f67 = f77, f70, f67 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f99 = f77, f102, f99 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f82 = f76, f86, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f114 = f76, f118, f114 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f83 = f77, f86, f83 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f115 = f77, f118, f115 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f66 = f77, f71, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f98 = f77, f103, f98 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f67 = f76, f71, f67 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f99 = f76, f103, f99 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f82 = f77, f87, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f114 = f77, f119, f114 + nop 
__LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f83 = f76, f87, f83 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f115 = f76, f119, f115 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f64 = f78, f70, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f96 = f78, f102, f96 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f65 = f79, f70, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f97 = f79, f102, f97 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f80 = f78, f86, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f112 = f78, f118, f112 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f81 = f79, f86, f81 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f113 = f79, f118, f113 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f64 = f79, f71, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f96 = f79, f103, f96 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f65 = f78, f71, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f97 = f78, f103, f97 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f80 = f79, f87, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f112 = f79, f119, f112 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f81 = f78, f87, f81 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f113 = f78, f119, f113 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f88, f68 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f88, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f89, f68 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f89, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f88, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f88, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f89, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f89, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f68 = f89, f69, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f100 = f89, f101, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f69 = f88, f69, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f101 = f88, f101, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f84 = f89, f85, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f116 = f89, f117, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f85 = f88, f85, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f117 = f88, f117, f39 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f68, SIZE + FNMA f66 = f90, f68, f66 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f100, SIZE + FNMA f98 = f90, f100, f98 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f69, SIZE + FMA_A f67 = f91, f68, f67 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f101, SIZE + FMA_A f99 = f91, f100, f99 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f84, SIZE + FNMA f82 = f90, f84, f82 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f116, SIZE + FNMA f114 = f90, f116, f114 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f85, -11 * SIZE + FMA_A f83 = f91, f84, f83 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f117, -11 * SIZE + FMA_A f115 = f91, f116, f115 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f68, SIZE + FMA_B f66 = f91, f69, f66 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f100, SIZE + FMA_B f98 = f91, f101, f98 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f69, -3 * SIZE + FNMA f67 = f90, f69, f67 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f101, -3 * SIZE + FNMA f99 = f90, f101, f99 + nop __LINE__ + } 
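+// Note: this LN branch appears to be the backward substitution for the 4x4 block: each
+// solved complex row is scaled by its diagonal entry (apparently stored pre-inverted by the
+// packing routine) via FMPY/FMA_C/FMA_D, written back to both the packed buffer at BOFFSET
+// and to C, and then eliminated from the not-yet-solved rows with the FNMA/FMA_A/FMA_B
+// updates.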
+ ;; + { .mfi + STFD [C2 ] = f84, SIZE + FMA_B f82 = f91, f85, f82 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f116, SIZE + FMA_B f114 = f91, f117, f114 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f85, -3 * SIZE + FNMA f83 = f90, f85, f83 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f117, -3 * SIZE + FNMA f115 = f90, f117, f115 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f64 = f92, f68, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f96 = f92, f100, f96 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f65 = f93, f68, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f97 = f93, f100, f97 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f80 = f92, f84, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f112 = f92, f116, f112 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f81 = f93, f84, f81 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f113 = f93, f116, f113 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f64 = f93, f69, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f96 = f93, f101, f96 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f65 = f92, f69, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f97 = f92, f101, f97 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f80 = f93, f85, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f112 = f93, f117, f112 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f81 = f92, f85, f81 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f113 = f92, f117, f113 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f104, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f104, f98 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f105, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f105, f98 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f104, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f104, f114 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f105, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f105, f114 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f66 = f105, f67, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f98 = f105, f99, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f67 = f104, f67, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f99 = f104, f99, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f82 = f105, f83, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f114 = f105, f115, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f83 = f104, f83, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f115 = f104, f115, f39 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f66, SIZE + FNMA f64 = f106, f66, f64 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f98, SIZE + FNMA f96 = f106, f98, f96 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f67, SIZE + FMA_A f65 = f107, f66, f65 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f99, SIZE + FMA_A f97 = f107, f98, f97 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f82, SIZE + FNMA f80 = f106, f82, f80 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f114, SIZE + FNMA f112 = f106, f114, f112 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f83, -11 * SIZE + FMA_A f81 = f107, f82, f81 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f115, -11 * SIZE + FMA_A f113 = f107, f114, f113 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f66, SIZE + FMA_B f64 = f107, f67, f64 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f98, SIZE + FMA_B f96 = f107, 
f99, f96 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f67, -3 * SIZE + FNMA f65 = f106, f67, f65 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f99, -3 * SIZE + FNMA f97 = f106, f99, f97 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f82, SIZE + FMA_B f80 = f107, f83, f80 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f114, SIZE + FMA_B f112 = f107, f115, f112 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f83, -3 * SIZE + FNMA f81 = f106, f83, f81 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f115, -3 * SIZE + FNMA f113 = f106, f115, f113 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f120, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f120, f96 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f121, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f121, f96 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f120, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f120, f112 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f121, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f121, f112 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f64 = f121, f65, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f96 = f121, f97, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f65 = f120, f65, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f97 = f120, f97, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f80 = f121, f81, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f112 = f121, f113, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f81 = f120, f81, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f113 = f120, f113, f39 + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f96, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f97, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f112, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f81, -3 * SIZE + STFD [BOFFSET2] = f113, -3 * SIZE + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f64, SIZE + mov f64 = f0 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f96, SIZE + mov f96 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f65, -1 * SIZE + mov f65 = f0 + adds KK = -4, KK + } + { .mfi + STFD [C3 ] = f97, -1 * SIZE + mov f97 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f80, SIZE + mov f80 = f0 + cmp.ne p6, p0 = 1, I + } + { .mfi + STFD [C4 ] = f112, SIZE + mov f112 = f0 + sub L = K, KK + } + ;; + { .mfi + STFD [C2 ] = f81, -1 * SIZE + mov f81 = f0 + adds I = -1, I + } + { .mfb + STFD [C4 ] = f113, -1 * SIZE + mov f113 = f0 + (p6) br.cond.dptk .L011 + } + ;; +#endif + +#ifdef LT + { .mfi + LDFPD f76, f77 = [AOFFSET], 2 * SIZE + FMPY f32 = f72, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f72, f96 + nop __LINE__ + } + ;; + { .mfi + LDFPD f78, f79 = [AOFFSET] + FMPY f33 = f73, f64 + adds AOFFSET = 4 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMPY f37 = f73, f96 + nop __LINE__ + } + ;; + { .mfi + LDFPD f90, f91 = [AOFFSET], 2 * SIZE + FMPY f34 = f72, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f72, f112 + nop __LINE__ + } + ;; + { .mfi + LDFPD f92, f93 = [AOFFSET], 2 * SIZE + FMPY f35 = f73, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f73, f112 + nop __LINE__ + } + ;; + { .mfi + LDFPD f94, f95 = [AOFFSET] + FMA_C f64 = f73, f65, f32 + adds AOFFSET = 6 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMA_C f96 = f73, f97, f36 + nop __LINE__ + } + 
;; + { .mfi + LDFPD f108, f109 = [AOFFSET], 2 * SIZE + FMA_D f65 = f72, f65, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f97 = f72, f97, f37 + nop __LINE__ + } + ;; + { .mfi + LDFPD f110, f111 = [AOFFSET] + FMA_C f80 = f73, f81, f34 + adds AOFFSET = 8 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMA_C f112 = f73, f113, f38 + nop __LINE__ + } + ;; + { .mfi + LDFPD f126, f127 = [AOFFSET] + FMA_D f81 = f72, f81, f35 + adds AOFFSET = - 30 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMA_D f113 = f72, f113, f39 + adds BOFFSET2 = 4 * SIZE, BOFFSET + } + ;; + { .mfi + STFD [BOFFSET] = f64, SIZE + FNMA f66 = f74, f64, f66 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f96, SIZE + FNMA f98 = f74, f96, f98 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f65, SIZE + FMA_A f67 = f75, f64, f67 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f97, SIZE + FMA_A f99 = f75, f96, f99 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f80, SIZE + FNMA f82 = f74, f80, f82 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f112, SIZE + FNMA f114 = f74, f112, f114 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f81, 5 * SIZE + FMA_A f83 = f75, f80, f83 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f113, 5 * SIZE + FMA_A f115 = f75, f112, f115 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f64, SIZE + FMA_B f66 = f75, f65, f66 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f96, SIZE + FMA_B f98 = f75, f97, f98 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f65, SIZE + FNMA f67 = f74, f65, f67 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f97, SIZE + FNMA f99 = f74, f97, f99 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f80, SIZE + FMA_B f82 = f75, f81, f82 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f112, SIZE + FMA_B f114 = f75, f113, f114 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f81, SIZE + FNMA f83 = f74, f81, f83 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f113, SIZE + FNMA f115 = f74, f113, f115 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f68 = f76, f64, f68 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f100 = f76, f96, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f69 = f77, f64, f69 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f101 = f77, f96, f101 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f84 = f76, f80, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f116 = f76, f112, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f85 = f77, f80, f85 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f117 = f77, f112, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f68 = f77, f65, f68 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f100 = f77, f97, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f69 = f76, f65, f69 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f101 = f76, f97, f101 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f84 = f77, f81, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f116 = f77, f113, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f85 = f76, f81, f85 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f117 = f76, f113, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f70 = f78, f64, f70 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f102 = f78, f96, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f71 = f79, f64, f71 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f103 = f79, f96, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f86 = f78, f80, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f118 = f78, 
f112, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f87 = f79, f80, f87 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f119 = f79, f112, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f70 = f79, f65, f70 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f102 = f79, f97, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f71 = f78, f65, f71 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f103 = f78, f97, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f86 = f79, f81, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f118 = f79, f113, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f87 = f78, f81, f87 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f119 = f78, f113, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f90, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f90, f98 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f91, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f91, f98 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f90, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f90, f114 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f91, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f91, f114 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f66 = f91, f67, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f98 = f91, f99, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f67 = f90, f67, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f99 = f90, f99, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f82 = f91, f83, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f114 = f91, f115, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f83 = f90, f83, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f115 = f90, f115, f39 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f66, SIZE + FNMA f68 = f92, f66, f68 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f98, SIZE + FNMA f100 = f92, f98, f100 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f67, SIZE + FMA_A f69 = f93, f66, f69 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f99, SIZE + FMA_A f101 = f93, f98, f101 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f82, SIZE + FNMA f84 = f92, f82, f84 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f114, SIZE + FNMA f116 = f92, f114, f116 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f83, 5 * SIZE + FMA_A f85 = f93, f82, f85 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f115, 5 * SIZE + FMA_A f117 = f93, f114, f117 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f66, SIZE + FMA_B f68 = f93, f67, f68 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f98, SIZE + FMA_B f100 = f93, f99, f100 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f67, SIZE + FNMA f69 = f92, f67, f69 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f99, SIZE + FNMA f101 = f92, f99, f101 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f82, SIZE + FMA_B f84 = f93, f83, f84 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f114, SIZE + FMA_B f116 = f93, f115, f116 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f83, SIZE + FNMA f85 = f92, f83, f85 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f115, SIZE + FNMA f117 = f92, f115, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f70 = f94, f66, f70 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f102 = f94, f98, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f71 = f95, f66, f71 + nop __LINE__ + } + { .mfi + nop __LINE__ + 
FMA_A f103 = f95, f98, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f86 = f94, f82, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f118 = f94, f114, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f87 = f95, f82, f87 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f119 = f95, f114, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f70 = f95, f67, f70 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f102 = f95, f99, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f71 = f94, f67, f71 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f103 = f94, f99, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f86 = f95, f83, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f118 = f95, f115, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f87 = f94, f83, f87 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f119 = f94, f115, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f108, f68 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f108, f100 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f33 = f109, f68 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f109, f100 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f34 = f108, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f108, f116 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f35 = f109, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f109, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f68 = f109, f69, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f100 = f109, f101, f36 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f69 = f108, f69, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f101 = f108, f101, f37 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f84 = f109, f85, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f116 = f109, f117, f38 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f85 = f108, f85, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f117 = f108, f117, f39 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f68, SIZE + FNMA f70 = f110, f68, f70 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f100, SIZE + FNMA f102 = f110, f100, f102 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f69, SIZE + FMA_A f71 = f111, f68, f71 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f101, SIZE + FMA_A f103 = f111, f100, f103 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f84, SIZE + FNMA f86 = f110, f84, f86 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f116, SIZE + FNMA f118 = f110, f116, f118 + nop __LINE__ + } + ;; + { .mfi + STFD [BOFFSET] = f85, 5 * SIZE + FMA_A f87 = f111, f84, f87 + nop __LINE__ + } + { .mfi + STFD [BOFFSET2] = f117, 5 * SIZE + FMA_A f119 = f111, f116, f119 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f68, SIZE + FMA_B f70 = f111, f69, f70 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f100, SIZE + FMA_B f102 = f111, f101, f102 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f69, SIZE + FNMA f71 = f110, f69, f71 + nop __LINE__ + } + { .mfi + STFD [C3 ] = f101, SIZE + FNMA f103 = f110, f101, f103 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f84, SIZE + FMA_B f86 = f111, f85, f86 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f116, SIZE + FMA_B f118 = f111, f117, f118 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f85, SIZE + FNMA f87 = f110, f85, f87 + nop __LINE__ + } + { .mfi + STFD [C4 ] = f117, SIZE + FNMA f119 = f110, f117, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f126, f70 + nop __LINE__ + 
} + { .mfi + nop __LINE__ + FMPY f36 = f126, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f127, f70 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f127, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f126, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f126, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f127, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f127, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f70 = f127, f71, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f102 = f127, f103, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f71 = f126, f71, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f103 = f126, f103, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f86 = f127, f87, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f118 = f127, f119, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f87 = f126, f87, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f119 = f126, f119, f39 + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f70, SIZE + STFD [BOFFSET2] = f102, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [BOFFSET] = f71, SIZE + STFD [BOFFSET2] = f103, SIZE + sub r2 = K, KK + } + ;; + { .mmi + STFD [BOFFSET] = f86, SIZE + STFD [BOFFSET2] = f118, SIZE + adds KK = 4, KK + } + ;; + { .mmi + STFD [BOFFSET] = f87, -27 * SIZE + STFD [BOFFSET2] = f119 + shladd r2 = r2, ZBASE_SHIFT, r0 + } + ;; + { .mfi + STFD [C1 ] = f70, SIZE + mov f64 = f0 + shladd AOFFSET = r2, 2, AOFFSET + } + { .mfi + STFD [C3 ] = f102, SIZE + mov f65 = f0 + shladd BOFFSET = r2, 2, BOFFSET + } + ;; + { .mfi + STFD [C1 ] = f71, SIZE + mov f80 = f0 + mov L = KK + } + { .mfi + STFD [C3 ] = f103, SIZE + mov f81 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f86, SIZE + mov f96 = f0 + cmp.ne p6, p0 = 1, I + } + { .mfi + STFD [C4 ] = f118, SIZE + mov f97 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f87, SIZE + mov f112 = f0 + adds I = -1, I + } + { .mfb + STFD [C4 ] = f119, SIZE + mov f113 = f0 + (p6) br.cond.dptk .L011 + } + ;; +#endif + +#ifdef RN + { .mfi + LDFPD f76, f77 = [BOFFSET], 2 * SIZE + FMPY f32 = f72, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f72, f68 + nop __LINE__ + } + ;; + { .mfi + LDFPD f78, f79 = [BOFFSET] + FMPY f33 = f73, f64 + adds BOFFSET = 4 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMPY f37 = f73, f68 + nop __LINE__ + } + ;; + { .mfi + LDFPD f90, f91 = [BOFFSET], 2 * SIZE + FMPY f34 = f72, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f72, f70 + nop __LINE__ + } + ;; + { .mfi + LDFPD f92, f93 = [BOFFSET], 2 * SIZE + FMPY f35 = f73, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f73, f70 + nop __LINE__ + } + ;; + { .mfi + LDFPD f94, f95 = [BOFFSET] + FMA_C f64 = f73, f65, f32 + adds BOFFSET = 6 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA_C f68 = f73, f69, f36 + nop __LINE__ + } + ;; + { .mfi + LDFPD f108, f109 = [BOFFSET], 2 * SIZE + FMA_D f65 = f72, f65, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f69 = f72, f69, f37 + nop __LINE__ + } + ;; + { .mfi + LDFPD f110, f111 = [BOFFSET] + FMA_C f66 = f73, f67, f34 + adds BOFFSET = 8 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA_C f70 = f73, f71, f38 + nop __LINE__ + } + ;; + { .mfi + LDFPD f126, f127 = [BOFFSET] + FMA_D f67 = f72, f67, f35 + adds BOFFSET = - 30 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA_D f71 = f72, f71, f39 + adds AOFFSET2 = 4 * SIZE, AOFFSET + } + ;; + { .mfi + STFD [AOFFSET] = 
f64, SIZE + FNMA f80 = f74, f64, f80 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f68, SIZE + FNMA f84 = f74, f68, f84 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f65, SIZE + FMA_A f81 = f75, f64, f81 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f69, SIZE + FMA_A f85 = f75, f68, f85 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f66, SIZE + FNMA f82 = f74, f66, f82 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f70, SIZE + FNMA f86 = f74, f70, f86 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f67, 5 * SIZE + FMA_A f83 = f75, f66, f83 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f71, 5 * SIZE + FMA_A f87 = f75, f70, f87 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f64, SIZE + FMA_B f80 = f75, f65, f80 + nop __LINE__ + } + { .mfi + STFD [C5 ] = f68, SIZE + FMA_B f84 = f75, f69, f84 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f65, SIZE + FNMA f81 = f74, f65, f81 + nop __LINE__ + } + { .mfi + STFD [C5 ] = f69, SIZE + FNMA f85 = f74, f69, f85 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f66, SIZE + FMA_B f82 = f75, f67, f82 + nop __LINE__ + } + { .mfi + STFD [C5 ] = f70, SIZE + FMA_B f86 = f75, f71, f86 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f67, 5 * SIZE + FNMA f83 = f74, f67, f83 + nop __LINE__ + } + { .mfi + STFD [C5 ] = f71, 5 * SIZE + FNMA f87 = f74, f71, f87 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f96 = f76, f64, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f100 = f76, f68, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f97 = f77, f64, f97 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f101 = f77, f68, f101 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f98 = f76, f66, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f102 = f76, f70, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f99 = f77, f66, f99 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f103 = f77, f70, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f96 = f77, f65, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f100 = f77, f69, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f97 = f76, f65, f97 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f101 = f76, f69, f101 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f98 = f77, f67, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f102 = f77, f71, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f99 = f76, f67, f99 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f103 = f76, f71, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f112 = f78, f64, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f116 = f78, f68, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f113 = f79, f64, f113 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f117 = f79, f68, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f114 = f78, f66, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f118 = f78, f70, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f115 = f79, f66, f115 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f119 = f79, f70, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f112 = f79, f65, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f116 = f79, f69, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f113 = f78, f65, f113 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f117 = f78, f69, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f114 = f79, f67, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f118 = f79, 
f71, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f115 = f78, f67, f115 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f119 = f78, f71, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f90, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f90, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f33 = f91, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f91, f84 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f34 = f90, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f90, f86 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f35 = f91, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f91, f86 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f80 = f91, f81, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f84 = f91, f85, f36 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f81 = f90, f81, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f85 = f90, f85, f37 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f82 = f91, f83, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f86 = f91, f87, f38 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f83 = f90, f83, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f87 = f90, f87, f39 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f80, SIZE + FNMA f96 = f92, f80, f96 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f84, SIZE + FNMA f100 = f92, f84, f100 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f81, SIZE + FMA_A f97 = f93, f80, f97 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f85, SIZE + FMA_A f101 = f93, f84, f101 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f82, SIZE + FNMA f98 = f92, f82, f98 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f86, SIZE + FNMA f102 = f92, f86, f102 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f83, 5 * SIZE + FMA_A f99 = f93, f82, f99 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f87, 5 * SIZE + FMA_A f103 = f93, f86, f103 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f80, SIZE + FMA_B f96 = f93, f81, f96 + nop __LINE__ + } + { .mfi + STFD [C6 ] = f84, SIZE + FMA_B f100 = f93, f85, f100 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f81, SIZE + FNMA f97 = f92, f81, f97 + nop __LINE__ + } + { .mfi + STFD [C6 ] = f85, SIZE + FNMA f101 = f92, f85, f101 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f82, SIZE + FMA_B f98 = f93, f83, f98 + nop __LINE__ + } + { .mfi + STFD [C6 ] = f86, SIZE + FMA_B f102 = f93, f87, f102 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f83, 5 * SIZE + FNMA f99 = f92, f83, f99 + nop __LINE__ + } + { .mfi + STFD [C6 ] = f87, 5 * SIZE + FNMA f103 = f92, f87, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f112 = f94, f80, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f116 = f94, f84, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f113 = f95, f80, f113 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f117 = f95, f84, f117 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f114 = f94, f82, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f118 = f94, f86, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f115 = f95, f82, f115 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f119 = f95, f86, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f112 = f95, f81, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f116 = f95, f85, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f113 = f94, f81, f113 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f117 = f94, f85, f117 + 
nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f114 = f95, f83, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f118 = f95, f87, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f115 = f94, f83, f115 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f119 = f94, f87, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f108, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f108, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f109, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f109, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f108, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f108, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f109, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f109, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f96 = f109, f97, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f100 = f109, f101, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f97 = f108, f97, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f101 = f108, f101, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f98 = f109, f99, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f102 = f109, f103, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f99 = f108, f99, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f103 = f108, f103, f39 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f96, SIZE + FNMA f112 = f110, f96, f112 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f100, SIZE + FNMA f116 = f110, f100, f116 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f97, SIZE + FMA_A f113 = f111, f96, f113 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f101, SIZE + FMA_A f117 = f111, f100, f117 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f98, SIZE + FNMA f114 = f110, f98, f114 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f102, SIZE + FNMA f118 = f110, f102, f118 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f99, 5 * SIZE + FMA_A f115 = f111, f98, f115 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f103, 5 * SIZE + FMA_A f119 = f111, f102, f119 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f96, SIZE + FMA_B f112 = f111, f97, f112 + nop __LINE__ + } + { .mfi + STFD [C7 ] = f100, SIZE + FMA_B f116 = f111, f101, f116 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f97, SIZE + FNMA f113 = f110, f97, f113 + nop __LINE__ + } + { .mfi + STFD [C7 ] = f101, SIZE + FNMA f117 = f110, f101, f117 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f98, SIZE + FMA_B f114 = f111, f99, f114 + nop __LINE__ + } + { .mfi + STFD [C7 ] = f102, SIZE + FMA_B f118 = f111, f103, f118 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f99, 5 * SIZE + FNMA f115 = f110, f99, f115 + nop __LINE__ + } + { .mfi + STFD [C7 ] = f103, 5 * SIZE + FNMA f119 = f110, f103, f119 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f126, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f126, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f127, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f127, f116 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f126, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f126, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f127, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f127, f118 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f112 = f127, f113, f32 + nop __LINE__ + } + 
{ .mfi + nop __LINE__ + FMA_C f116 = f127, f117, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f113 = f126, f113, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f117 = f126, f117, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f114 = f127, f115, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f118 = f127, f119, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f115 = f126, f115, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f119 = f126, f119, f39 + nop __LINE__ + } + ;; + { .mmi + STFD [AOFFSET] = f112, SIZE + STFD [AOFFSET2] = f116, SIZE + sub r2 = K, KK + } + ;; + { .mmi + STFD [AOFFSET] = f113, SIZE + STFD [AOFFSET2] = f117, SIZE + mov L = KK + } + ;; + { .mmi + STFD [AOFFSET] = f114, SIZE + STFD [AOFFSET2] = f118, SIZE + shladd r2 = r2, ZBASE_SHIFT, r0 + } + ;; + { .mmi + STFD [AOFFSET] = f115, -27 * SIZE + STFD [AOFFSET2] = f119 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f112, SIZE + mov f64 = f0 + shladd BOFFSET = r2, 2, BOFFSET + } + { .mfi + STFD [C8 ] = f116, SIZE + mov f65 = f0 + shladd AOFFSET = r2, 2, AOFFSET + } + ;; + { .mfi + STFD [C4 ] = f113, SIZE + mov f80 = f0 + cmp.ne p6, p0 = 1, I + } + { .mfi + STFD [C8 ] = f117, SIZE + mov f81 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f114, SIZE + mov f96 = f0 + adds I = -1, I + } + { .mfi + STFD [C8 ] = f118, SIZE + mov f97 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f115, 5 * SIZE + mov f112 = f0 + nop __LINE__ + } + { .mfb + STFD [C8 ] = f119, 5 * SIZE + mov f113 = f0 + (p6) br.cond.dptk .L011 + } +#endif + +#ifdef RT + { .mfi + LDFPD f76, f77 = [BOFFSET] + FMPY f32 = f72, f112 + adds BOFFSET = - 2 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMPY f36 = f72, f116 + nop __LINE__ + } + ;; + { .mfi + LDFPD f78, f79 = [BOFFSET] + FMPY f33 = f73, f112 + adds BOFFSET = - 4 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMPY f37 = f73, f116 + nop __LINE__ + } + ;; + { .mfi + LDFPD f88, f89 = [BOFFSET] + FMPY f34 = f72, f114 + adds BOFFSET = - 2 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMPY f38 = f72, f118 + nop __LINE__ + } + ;; + { .mfi + LDFPD f90, f91 = [BOFFSET] + FMPY f35 = f73, f114 + adds BOFFSET = - 2 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMPY f39 = f73, f118 + nop __LINE__ + } + ;; + { .mfi + LDFPD f92, f93 = [BOFFSET] + FMA_C f112 = f73, f113, f32 + adds BOFFSET = - 6 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA_C f116 = f73, f117, f36 + nop __LINE__ + } + ;; + { .mfi + LDFPD f104, f105 = [BOFFSET] + FMA_D f113 = f72, f113, f33 + adds BOFFSET = - 2 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA_D f117 = f72, f117, f37 + nop __LINE__ + } + ;; + { .mfi + LDFPD f106, f107 = [BOFFSET] + FMA_C f114 = f73, f115, f34 + adds BOFFSET = - 8 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA_C f118 = f73, f119, f38 + nop __LINE__ + } + ;; + { .mfi + LDFPD f120, f121 = [BOFFSET] + FMA_D f115 = f72, f115, f35 + adds AOFFSET2 = 28 * SIZE, AOFFSET + } + { .mfi + nop __LINE__ + FMA_D f119 = f72, f119, f39 + adds AOFFSET = 24 * SIZE, AOFFSET + } + ;; + { .mfi + STFD [AOFFSET] = f112, SIZE + FNMA f96 = f74, f112, f96 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f116, SIZE + FNMA f100 = f74, f116, f100 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f113, SIZE + FMA_A f97 = f75, f112, f97 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f117, SIZE + FMA_A f101 = f75, f116, f101 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f114, SIZE + FNMA f98 = f74, f114, f98 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f118, SIZE + FNMA f102 
= f74, f118, f102 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f115, -11 * SIZE + FMA_A f99 = f75, f114, f99 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f119, -11 * SIZE + FMA_A f103 = f75, f118, f103 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f112, SIZE + FMA_B f96 = f75, f113, f96 + nop __LINE__ + } + { .mfi + STFD [C8 ] = f116, SIZE + FMA_B f100 = f75, f117, f100 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f113, SIZE + FNMA f97 = f74, f113, f97 + nop __LINE__ + } + { .mfi + STFD [C8 ] = f117, SIZE + FNMA f101 = f74, f117, f101 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f114, SIZE + FMA_B f98 = f75, f115, f98 + nop __LINE__ + } + { .mfi + STFD [C8 ] = f118, SIZE + FMA_B f102 = f75, f119, f102 + nop __LINE__ + } + ;; + { .mfi + STFD [C4 ] = f115, 5 * SIZE + FNMA f99 = f74, f115, f99 + nop __LINE__ + } + { .mfi + STFD [C8 ] = f119, 5 * SIZE + FNMA f103 = f74, f119, f103 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f80 = f76, f112, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f84 = f76, f116, f84 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f81 = f77, f112, f81 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f85 = f77, f116, f85 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f82 = f76, f114, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f86 = f76, f118, f86 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f83 = f77, f114, f83 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f87 = f77, f118, f87 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f80 = f77, f113, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f84 = f77, f117, f84 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f81 = f76, f113, f81 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f85 = f76, f117, f85 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f82 = f77, f115, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f86 = f77, f119, f86 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f83 = f76, f115, f83 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f87 = f76, f119, f87 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f64 = f78, f112, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f68 = f78, f116, f68 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f65 = f79, f112, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f69 = f79, f116, f69 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f66 = f78, f114, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f70 = f78, f118, f70 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f67 = f79, f114, f67 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f71 = f79, f118, f71 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f64 = f79, f113, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f68 = f79, f117, f68 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f65 = f78, f113, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f69 = f78, f117, f69 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f66 = f79, f115, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f70 = f79, f119, f70 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f67 = f78, f115, f67 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f71 = f78, f119, f71 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f88, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f88, f100 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f89, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f89, f100 + nop 
__LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f88, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f88, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f89, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f89, f102 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f96 = f89, f97, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f100 = f89, f101, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f97 = f88, f97, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f101 = f88, f101, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f98 = f89, f99, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f102 = f89, f103, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f99 = f88, f99, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f103 = f88, f103, f39 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f96, SIZE + FNMA f80 = f90, f96, f80 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f100, SIZE + FNMA f84 = f90, f100, f84 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f97, SIZE + FMA_A f81 = f91, f96, f81 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f101, SIZE + FMA_A f85 = f91, f100, f85 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f98, SIZE + FNMA f82 = f90, f98, f82 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f102, SIZE + FNMA f86 = f90, f102, f86 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f99, -11 * SIZE + FMA_A f83 = f91, f98, f83 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f103, -11 * SIZE + FMA_A f87 = f91, f102, f87 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f96, SIZE + FMA_B f80 = f91, f97, f80 + nop __LINE__ + } + { .mfi + STFD [C7 ] = f100, SIZE + FMA_B f84 = f91, f101, f84 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f97, SIZE + FNMA f81 = f90, f97, f81 + nop __LINE__ + } + { .mfi + STFD [C7 ] = f101, SIZE + FNMA f85 = f90, f101, f85 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f98, SIZE + FMA_B f82 = f91, f99, f82 + nop __LINE__ + } + { .mfi + STFD [C7 ] = f102, SIZE + FMA_B f86 = f91, f103, f86 + nop __LINE__ + } + ;; + { .mfi + STFD [C3 ] = f99, 5 * SIZE + FNMA f83 = f90, f99, f83 + nop __LINE__ + } + { .mfi + STFD [C7 ] = f103, 5 * SIZE + FNMA f87 = f90, f103, f87 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f64 = f92, f96, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f68 = f92, f100, f68 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f65 = f93, f96, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f69 = f93, f100, f69 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f66 = f92, f98, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f70 = f92, f102, f70 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_A f67 = f93, f98, f67 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_A f71 = f93, f102, f71 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f64 = f93, f97, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f68 = f93, f101, f68 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f65 = f92, f97, f65 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f69 = f92, f101, f69 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_B f66 = f93, f99, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f70 = f93, f103, f70 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FNMA f67 = f92, f99, f67 + nop __LINE__ + } + { .mfi + nop __LINE__ + FNMA f71 = f92, f103, f71 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f104, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + 
FMPY f36 = f104, f84 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f105, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f105, f84 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f104, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f104, f86 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f105, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f105, f86 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f80 = f105, f81, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f84 = f105, f85, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f81 = f104, f81, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f85 = f104, f85, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f82 = f105, f83, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f86 = f105, f87, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f83 = f104, f83, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f87 = f104, f87, f39 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f80, SIZE + FNMA f64 = f106, f80, f64 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f84, SIZE + FNMA f68 = f106, f84, f68 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f81, SIZE + FMA_A f65 = f107, f80, f65 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f85, SIZE + FMA_A f69 = f107, f84, f69 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f82, SIZE + FNMA f66 = f106, f82, f66 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f86, SIZE + FNMA f70 = f106, f86, f70 + nop __LINE__ + } + ;; + { .mfi + STFD [AOFFSET] = f83, -11 * SIZE + FMA_A f67 = f107, f82, f67 + nop __LINE__ + } + { .mfi + STFD [AOFFSET2] = f87, -11 * SIZE + FMA_A f71 = f107, f86, f71 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f80, SIZE + FMA_B f64 = f107, f81, f64 + nop __LINE__ + } + { .mfi + STFD [C6 ] = f84, SIZE + FMA_B f68 = f107, f85, f68 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f81, SIZE + FNMA f65 = f106, f81, f65 + nop __LINE__ + } + { .mfi + STFD [C6 ] = f85, SIZE + FNMA f69 = f106, f85, f69 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f82, SIZE + FMA_B f66 = f107, f83, f66 + nop __LINE__ + } + { .mfi + STFD [C6 ] = f86, SIZE + FMA_B f70 = f107, f87, f70 + nop __LINE__ + } + ;; + { .mfi + STFD [C2 ] = f83, 5 * SIZE + FNMA f67 = f106, f83, f67 + nop __LINE__ + } + { .mfi + STFD [C6 ] = f87, 5 * SIZE + FNMA f71 = f106, f87, f71 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f32 = f120, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f36 = f120, f68 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f33 = f121, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f37 = f121, f68 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f34 = f120, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f38 = f120, f70 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMPY f35 = f121, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMPY f39 = f121, f70 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f64 = f121, f65, f32 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f68 = f121, f69, f36 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f65 = f120, f65, f33 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f69 = f120, f69, f37 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_C f66 = f121, f67, f34 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_C f70 = f121, f71, f38 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FMA_D f67 = f120, f67, f35 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_D f71 
= f120, f71, f39 + nop __LINE__ + } + ;; + { .mmi + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f68, SIZE + shladd r2 = K, ZBASE_SHIFT, r0 + } + ;; + { .mmi + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f69, SIZE + shladd AORIG = r2, 2, AORIG + } + ;; + { .mmi + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f70, SIZE + nop __LINE__ + } + ;; + { .mmi + STFD [AOFFSET] = f67, -3 * SIZE + STFD [AOFFSET2] = f71 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f64, SIZE + mov f64 = f0 + cmp.ne p6, p0 = 1, I + } + { .mfi + STFD [C5 ] = f68, SIZE + mov f81 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f65, SIZE + mov f65 = f0 + nop __LINE__ + } + { .mfi + STFD [C5 ] = f69, SIZE + mov f96 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f66, SIZE + mov f80 = f0 + sub L = K, KK + } + { .mfi + STFD [C5 ] = f70, SIZE + mov f97 = f0 + nop __LINE__ + } + ;; + { .mfi + STFD [C1 ] = f67, 5 * SIZE + mov f112 = f0 + adds I = -1, I + } + { .mfb + STFD [C5 ] = f71, 5 * SIZE + mov f113 = f0 + (p6) br.cond.dptk .L011 + } + ;; +#endif + +.L020: + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + tbit.z p6, p7 = M, 1 + (p6) br.cond.dptk .L030 + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, 1 + ZBASE_SHIFT + } + { .mmi + shladd r3 = KK, ZBASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f66 = f0 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f67 = f0 + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 2, B + mov f66 = f0 +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f67 = f0 + shladd AOFFSET = r3, 1, AORIG + } + ;; +#endif + ;; + adds L = 1, L + ;; + { .mfi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f82 = f0 + tbit.z p12, p0 = L, 0 + } + { .mfi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f83 = f0 + shr L = L, 1 + } + ;; + { .mfi + (p7) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + mov f98 = f0 + adds L = -1, L + } + { .mfi + (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + mov f99 = f0 + cmp.eq p3, p0 = r0, r0 + } + ;; + { .mfi + (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + mov f114 = f0 + mov ar.lc = L + } + { .mfi + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + mov f115 = f0 + nop __LINE__ + } + ;; + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L028 + ;; + .align 16 + +.L022: + { .mfi + lfetch.nt1 [PREA], 8 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FMA_B f65 = f32, f49, f65 // A1 * B2 + (p12) cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA_B f81 = f32, f51, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f97 = f32, f53, f97 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f113 = f32, f55, f113 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f64 = f33, f49, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = 
f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f80 = f33, f51, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f96 = f33, f53, f96 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f42, f43 = [AOFFSET], 2 * SIZE + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f112 = f33, f55, f112 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f66 = f34, f48, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f67 = f34, f49, f67 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f82 = f34, f50, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f83 = f34, f51, f83 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f98 = f34, f52, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f99 = f34, f53, f99 // A3 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f114 = f34, f54, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f115 = f34, f55, f115 // A3 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f67 = f35, f48, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f66 = f35, f49, f66 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + FMA f83 = f35, f50, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f82 = f35, f51, f82 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + FMA f99 = f35, f52, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f98 = f35, f53, f98 // A4 * B6 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f115 = f35, f54, f115 // A4 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f114 = f35, f55, f114 // A4 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f97 = f40, f61, f97 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f113 = f40, f63, f113 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 + nop __LINE__ + } + { .mfb + (p4) LDFPD f34, f35 = [AOFFSET], 2 * SIZE + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f96 = f41, f61, f96 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f112 = f41, f63, f112 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f66 = f42, f56, f66 // A3 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) 
FMA_B f67 = f42, f57, f67 // A3 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f82 = f42, f58, f82 // A3 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f83 = f42, f59, f83 // A3 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f98 = f42, f60, f98 // A3 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f99 = f42, f61, f99 // A3 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f114 = f42, f62, f114 // A3 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f115 = f42, f63, f115 // A3 * B8 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f67 = f43, f56, f67 // A4 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f66 = f43, f57, f66 // A4 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f83 = f43, f58, f83 // A4 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f82 = f43, f59, f82 // A4 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f99 = f43, f60, f99 // A4 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f98 = f43, f61, f98 // A4 * B6 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f115 = f43, f62, f115 // A4 * B7 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA_A f114 = f43, f63, f114 // A4 * B8 + br.cloop.sptk.few .L022 + } + ;; +.L028: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -2, KK +#else + adds r2 = -4, KK +#endif + ;; + shladd r2 = r2, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = r2, 1, AORIG + shladd BOFFSET = r2, 2, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [BOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [BOFFSET], 2 * SIZE + ;; + LDFPD f104, f105 = [BOFFSET], 2 * SIZE + ;; + LDFPD f106, f107 = [BOFFSET], 2 * SIZE + ;; + { .mfi + LDFPD f120, f121 = [BOFFSET], 2 * SIZE + FSUB f64 = f72, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f65 = f73, f65 + nop __LINE__ + } + ;; + { .mfi + LDFPD f122, f123 = [BOFFSET] + FSUB f80 = f74, f80 + adds BOFFSET = -14 * SIZE, BOFFSET + } + { .mfi + nop __LINE__ + FSUB_A f81 = f75, f81 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f96 = f88, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f97 = f89, f97 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f112 = f90, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f113 = f91, f113 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f66 = f104, f66 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f67 = f105, f67 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f82 = f106, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f83 = f107, f83 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f98 = f120, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f99 = f121, f99 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f114 = f122, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB_A f115 = f123, f115 + nop __LINE__ + } + ;; +#else + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [AOFFSET], 2 * SIZE + ;; + LDFPD f90, f91 = [AOFFSET], 2 * SIZE + ;; + LDFPD f104, f105 = [AOFFSET], 2 * SIZE + ;; + LDFPD f106, f107 = [AOFFSET], 2 * SIZE + ;; + { .mfi + LDFPD f120, f121 = [AOFFSET], 2 * SIZE + FSUB f64 = f72, f64 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f65 = f73, f65 + nop __LINE__ + } + ;; + { .mfi + LDFPD f122, f123 = [AOFFSET] + FSUB f66 = f74, f66 + adds AOFFSET = -14 * SIZE, AOFFSET 
+ } + { .mfi + nop __LINE__ + FSUB f67 = f75, f67 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f80 = f88, f80 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f81 = f89, f81 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f82 = f90, f82 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f83 = f91, f83 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f96 = f104, f96 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f97 = f105, f97 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f98 = f106, f98 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f99 = f107, f99 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f112 = f120, f112 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f113 = f121, f113 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + FSUB f114 = f122, f114 + nop __LINE__ + } + { .mfi + nop __LINE__ + FSUB f115 = f123, f115 + nop __LINE__ + } + ;; +#endif + +#ifdef LN + adds AOFFSET = 6 * SIZE, AOFFSET + ;; + LDFPD f104, f105 = [AOFFSET] + adds AOFFSET = - 2 * SIZE, AOFFSET + ;; + LDFPD f106, f107 = [AOFFSET] + adds AOFFSET = - 4 * SIZE, AOFFSET + ;; + LDFPD f120, f121 = [AOFFSET] + ;; + FMPY f32 = f104, f66 + FMPY f33 = f105, f66 + FMPY f34 = f104, f82 + FMPY f35 = f105, f82 + FMPY f36 = f104, f98 + FMPY f37 = f105, f98 + FMPY f38 = f104, f114 + FMPY f39 = f105, f114 + ;; + FMA_C f66 = f105, f67, f32 + FMA_D f67 = f104, f67, f33 + FMA_C f82 = f105, f83, f34 + FMA_D f83 = f104, f83, f35 + FMA_C f98 = f105, f99, f36 + FMA_D f99 = f104, f99, f37 + FMA_C f114 = f105, f115, f38 + FMA_D f115 = f104, f115, f39 + ;; + FNMA f64 = f106, f66, f64 + FMA_A f65 = f107, f66, f65 + FNMA f80 = f106, f82, f80 + FMA_A f81 = f107, f82, f81 + FNMA f96 = f106, f98, f96 + FMA_A f97 = f107, f98, f97 + FNMA f112 = f106, f114, f112 + FMA_A f113 = f107, f114, f113 + ;; + FMA_B f64 = f107, f67, f64 + FNMA f65 = f106, f67, f65 + FMA_B f80 = f107, f83, f80 + FNMA f81 = f106, f83, f81 + FMA_B f96 = f107, f99, f96 + FNMA f97 = f106, f99, f97 + FMA_B f112 = f107, f115, f112 + FNMA f113 = f106, f115, f113 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + FMPY f34 = f120, f80 + FMPY f35 = f121, f80 + FMPY f36 = f120, f96 + FMPY f37 = f121, f96 + FMPY f38 = f120, f112 + FMPY f39 = f121, f112 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + FMA_C f80 = f121, f81, f34 + FMA_D f81 = f120, f81, f35 + FMA_C f96 = f121, f97, f36 + FMA_D f97 = f120, f97, f37 + FMA_C f112 = f121, f113, f38 + FMA_D f113 = f120, f113, f39 + ;; +#endif + +#ifdef LT + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [AOFFSET] + adds AOFFSET = 4 * SIZE, AOFFSET + ;; + LDFPD f90, f91 = [AOFFSET] + adds AOFFSET = - 6 * SIZE, AOFFSET + ;; + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f34 = f72, f80 + FMPY f35 = f73, f80 + FMPY f36 = f72, f96 + FMPY f37 = f73, f96 + FMPY f38 = f72, f112 + FMPY f39 = f73, f112 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f80 = f73, f81, f34 + FMA_D f81 = f72, f81, f35 + FMA_C f96 = f73, f97, f36 + FMA_D f97 = f72, f97, f37 + FMA_C f112 = f73, f113, f38 + FMA_D f113 = f72, f113, f39 + ;; + FNMA f66 = f74, f64, f66 + FMA_A f67 = f75, f64, f67 + FNMA f82 = f74, f80, f82 + FMA_A f83 = f75, f80, f83 + FNMA f98 = f74, f96, f98 + FMA_A f99 = f75, f96, f99 + FNMA f114 = f74, f112, f114 + FMA_A f115 = f75, f112, f115 + ;; + FMA_B f66 = f75, f65, f66 + FNMA f67 = f74, f65, f67 + FMA_B f82 = f75, f81, f82 + FNMA f83 = f74, f81, f83 + FMA_B f98 = f75, f97, f98 + FNMA f99 = f74, f97, f99 + FMA_B f114 = f75, f113, f114 + FNMA f115 = f74, 
f113, f115 + ;; + FMPY f32 = f90, f66 + FMPY f33 = f91, f66 + FMPY f34 = f90, f82 + FMPY f35 = f91, f82 + FMPY f36 = f90, f98 + FMPY f37 = f91, f98 + FMPY f38 = f90, f114 + FMPY f39 = f91, f114 + ;; + FMA_C f66 = f91, f67, f32 + FMA_D f67 = f90, f67, f33 + FMA_C f82 = f91, f83, f34 + FMA_D f83 = f90, f83, f35 + FMA_C f98 = f91, f99, f36 + FMA_D f99 = f90, f99, f37 + FMA_C f114 = f91, f115, f38 + FMA_D f115 = f90, f115, f39 + ;; +#endif + +#ifdef RN + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET], 2 * SIZE + ;; + LDFPD f76, f77 = [BOFFSET], 2 * SIZE + ;; + LDFPD f78, f79 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f90, f91 = [BOFFSET], 2 * SIZE + ;; + LDFPD f92, f93 = [BOFFSET], 2 * SIZE + ;; + LDFPD f94, f95 = [BOFFSET] + adds BOFFSET = 6 * SIZE, BOFFSET + ;; + LDFPD f108, f109 = [BOFFSET], 2 * SIZE + ;; + LDFPD f110, f111 = [BOFFSET] + adds BOFFSET = 8 * SIZE, BOFFSET + ;; + LDFPD f126, f127 = [BOFFSET] + adds BOFFSET = - 30 * SIZE, BOFFSET + ;; + + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + FMPY f34 = f72, f66 + FMPY f35 = f73, f66 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + FMA_C f66 = f73, f67, f34 + FMA_D f67 = f72, f67, f35 + ;; + FNMA f80 = f74, f64, f80 + FMA_A f81 = f75, f64, f81 + FNMA f82 = f74, f66, f82 + FMA_A f83 = f75, f66, f83 + ;; + FMA_B f80 = f75, f65, f80 + FNMA f81 = f74, f65, f81 + FMA_B f82 = f75, f67, f82 + FNMA f83 = f74, f67, f83 + ;; + FNMA f96 = f76, f64, f96 + FMA_A f97 = f77, f64, f97 + FNMA f98 = f76, f66, f98 + FMA_A f99 = f77, f66, f99 + ;; + FMA_B f96 = f77, f65, f96 + FNMA f97 = f76, f65, f97 + FMA_B f98 = f77, f67, f98 + FNMA f99 = f76, f67, f99 + ;; + FNMA f112 = f78, f64, f112 + FMA_A f113 = f79, f64, f113 + FNMA f114 = f78, f66, f114 + FMA_A f115 = f79, f66, f115 + ;; + FMA_B f112 = f79, f65, f112 + FNMA f113 = f78, f65, f113 + FMA_B f114 = f79, f67, f114 + FNMA f115 = f78, f67, f115 + ;; + FMPY f32 = f90, f80 + FMPY f33 = f91, f80 + FMPY f34 = f90, f82 + FMPY f35 = f91, f82 + ;; + FMA_C f80 = f91, f81, f32 + FMA_D f81 = f90, f81, f33 + FMA_C f82 = f91, f83, f34 + FMA_D f83 = f90, f83, f35 + ;; + + FNMA f96 = f92, f80, f96 + FMA_A f97 = f93, f80, f97 + FNMA f98 = f92, f82, f98 + FMA_A f99 = f93, f82, f99 + ;; + FMA_B f96 = f93, f81, f96 + FNMA f97 = f92, f81, f97 + FMA_B f98 = f93, f83, f98 + FNMA f99 = f92, f83, f99 + ;; + FNMA f112 = f94, f80, f112 + FMA_A f113 = f95, f80, f113 + FNMA f114 = f94, f82, f114 + FMA_A f115 = f95, f82, f115 + ;; + FMA_B f112 = f95, f81, f112 + FNMA f113 = f94, f81, f113 + FMA_B f114 = f95, f83, f114 + FNMA f115 = f94, f83, f115 + ;; + FMPY f32 = f108, f96 + FMPY f33 = f109, f96 + FMPY f34 = f108, f98 + FMPY f35 = f109, f98 + ;; + FMA_C f96 = f109, f97, f32 + FMA_D f97 = f108, f97, f33 + FMA_C f98 = f109, f99, f34 + FMA_D f99 = f108, f99, f35 + ;; + FNMA f112 = f110, f96, f112 + FMA_A f113 = f111, f96, f113 + FNMA f114 = f110, f98, f114 + FMA_A f115 = f111, f98, f115 + ;; + FMA_B f112 = f111, f97, f112 + FNMA f113 = f110, f97, f113 + FMA_B f114 = f111, f99, f114 + FNMA f115 = f110, f99, f115 + ;; + FMPY f32 = f126, f112 + FMPY f33 = f127, f112 + FMPY f34 = f126, f114 + FMPY f35 = f127, f114 + ;; + FMA_C f112 = f127, f113, f32 + FMA_D f113 = f126, f113, f33 + FMA_C f114 = f127, f115, f34 + FMA_D f115 = f126, f115, f35 + ;; +#endif + +#ifdef RT + adds BOFFSET = 30 * SIZE, BOFFSET + ;; + LDFPD f72, f73 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f74, f75 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f76, f77 = [BOFFSET] + adds 
BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f78, f79 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f88, f89 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f90, f91 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f92, f93 = [BOFFSET] + adds BOFFSET = - 6 * SIZE, BOFFSET + ;; + LDFPD f104, f105 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f106, f107 = [BOFFSET] + adds BOFFSET = - 8 * SIZE, BOFFSET + ;; + LDFPD f120, f121 = [BOFFSET] + ;; + FMPY f32 = f72, f112 + FMPY f33 = f73, f112 + FMPY f34 = f72, f114 + FMPY f35 = f73, f114 + ;; + FMA_C f112 = f73, f113, f32 + FMA_D f113 = f72, f113, f33 + FMA_C f114 = f73, f115, f34 + FMA_D f115 = f72, f115, f35 + ;; + FNMA f96 = f74, f112, f96 + FMA_A f97 = f75, f112, f97 + FNMA f98 = f74, f114, f98 + FMA_A f99 = f75, f114, f99 + ;; + FMA_B f96 = f75, f113, f96 + FNMA f97 = f74, f113, f97 + FMA_B f98 = f75, f115, f98 + FNMA f99 = f74, f115, f99 + ;; + FNMA f80 = f76, f112, f80 + FMA_A f81 = f77, f112, f81 + FNMA f82 = f76, f114, f82 + FMA_A f83 = f77, f114, f83 + ;; + FMA_B f80 = f77, f113, f80 + FNMA f81 = f76, f113, f81 + FMA_B f82 = f77, f115, f82 + FNMA f83 = f76, f115, f83 + ;; + FNMA f64 = f78, f112, f64 + FMA_A f65 = f79, f112, f65 + FNMA f66 = f78, f114, f66 + FMA_A f67 = f79, f114, f67 + ;; + FMA_B f64 = f79, f113, f64 + FNMA f65 = f78, f113, f65 + FMA_B f66 = f79, f115, f66 + FNMA f67 = f78, f115, f67 + ;; + FMPY f32 = f88, f96 + FMPY f33 = f89, f96 + FMPY f34 = f88, f98 + FMPY f35 = f89, f98 + ;; + FMA_C f96 = f89, f97, f32 + FMA_D f97 = f88, f97, f33 + FMA_C f98 = f89, f99, f34 + FMA_D f99 = f88, f99, f35 + ;; + FNMA f80 = f90, f96, f80 + FMA_A f81 = f91, f96, f81 + FNMA f82 = f90, f98, f82 + FMA_A f83 = f91, f98, f83 + ;; + FMA_B f80 = f91, f97, f80 + FNMA f81 = f90, f97, f81 + FMA_B f82 = f91, f99, f82 + FNMA f83 = f90, f99, f83 + ;; + FNMA f64 = f92, f96, f64 + FMA_A f65 = f93, f96, f65 + FNMA f66 = f92, f98, f66 + FMA_A f67 = f93, f98, f67 + ;; + FMA_B f64 = f93, f97, f64 + FNMA f65 = f92, f97, f65 + FMA_B f66 = f93, f99, f66 + FNMA f67 = f92, f99, f67 + ;; + FMPY f32 = f104, f80 + FMPY f33 = f105, f80 + FMPY f34 = f104, f82 + FMPY f35 = f105, f82 + ;; + FMA_C f80 = f105, f81, f32 + FMA_D f81 = f104, f81, f33 + FMA_C f82 = f105, f83, f34 + FMA_D f83 = f104, f83, f35 + ;; + FNMA f64 = f106, f80, f64 + FMA_A f65 = f107, f80, f65 + FNMA f66 = f106, f82, f66 + FMA_A f67 = f107, f82, f67 + ;; + FMA_B f64 = f107, f81, f64 + FNMA f65 = f106, f81, f65 + FMA_B f66 = f107, f83, f66 + FNMA f67 = f106, f83, f67 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + FMPY f34 = f120, f66 + FMPY f35 = f121, f66 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + FMA_C f66 = f121, f67, f34 + FMA_D f67 = f120, f67, f35 + ;; +#endif + +#if defined(LN) || defined(LT) + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f96, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f97, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f112, SIZE + ;; + STFD [BOFFSET] = f81, 5 * SIZE + STFD [BOFFSET2] = f113, 5 * SIZE + ;; + STFD [BOFFSET] = f66, SIZE + STFD [BOFFSET2] = f98, SIZE + ;; + STFD [BOFFSET] = f67, SIZE + STFD [BOFFSET2] = f99, SIZE + ;; + STFD [BOFFSET] = f82, SIZE + STFD [BOFFSET2] = f114, SIZE + ;; + STFD [BOFFSET] = f83, 5 * SIZE + STFD [BOFFSET2] = f115, 5 * SIZE + ;; + adds BOFFSET = - 16 * SIZE, BOFFSET + ;; +#else + adds AOFFSET2 = 4 * SIZE, AOFFSET + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f80, SIZE + ;; + 
STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f81, SIZE + ;; + STFD [AOFFSET] = f66, SIZE + STFD [AOFFSET2] = f82, SIZE + ;; + STFD [AOFFSET] = f67, 5 * SIZE + STFD [AOFFSET2] = f83, 5 * SIZE + ;; + STFD [AOFFSET] = f96, SIZE + STFD [AOFFSET2] = f112, SIZE + ;; + STFD [AOFFSET] = f97, SIZE + STFD [AOFFSET2] = f113, SIZE + ;; + STFD [AOFFSET] = f98, SIZE + STFD [AOFFSET2] = f114, SIZE + ;; + STFD [AOFFSET] = f99, 5 * SIZE + STFD [AOFFSET2] = f115, 5 * SIZE + ;; + adds AOFFSET = - 16 * SIZE, AOFFSET + ;; +#endif + +#ifdef LN + adds C1 = -4 * SIZE, C1 + adds C2 = -4 * SIZE, C2 + adds C3 = -4 * SIZE, C3 + adds C4 = -4 * SIZE, C4 +#endif + ;; + STFD [C1 ] = f64, SIZE + ;; + STFD [C1 ] = f65, SIZE + ;; + STFD [C1 ] = f66, SIZE + ;; + STFD [C1 ] = f67, SIZE + ;; + STFD [C2 ] = f80, SIZE + ;; + STFD [C2 ] = f81, SIZE + ;; + STFD [C2 ] = f82, SIZE + ;; + STFD [C2 ] = f83, SIZE + ;; + + STFD [C3 ] = f96, SIZE + ;; + STFD [C3 ] = f97, SIZE + ;; + STFD [C3 ] = f98, SIZE + ;; + STFD [C3 ] = f99, SIZE + ;; + + STFD [C4 ] = f112, SIZE + ;; + STFD [C4 ] = f113, SIZE + ;; + STFD [C4 ] = f114, SIZE + ;; + STFD [C4 ] = f115, SIZE + ;; + mov f64 = f0 + mov f65 = f0 + mov f80 = f0 + mov f81 = f0 + mov f96 = f0 + mov f97 = f0 + mov f112 = f0 + mov f113 = f0 + ;; +#ifdef LN + adds C1 = -4 * SIZE, C1 + adds C2 = -4 * SIZE, C2 + adds C3 = -4 * SIZE, C3 + adds C4 = -4 * SIZE, C4 +#endif + ;; + cmp.ne p6, p0 = 1, I + ;; + adds I = -1, I + ;; + shladd r2 = K, ZBASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + shladd AORIG = r2, 1, AORIG +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, ZBASE_SHIFT, r0 + ;; + shladd AOFFSET = L, 1, AOFFSET + shladd BOFFSET = L, 2, BOFFSET +#endif + ;; +#ifdef LT + adds KK = 2, KK +#elif defined LN + adds KK = -2, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + .align 16 + +.L030: + { .mib +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + tbit.z p6, p7 = M, 0 + (p6) br.cond.dptk .L049 + } + ;; + { .mmi + cmp.ne p7, p0 = r0, L + adds BOFFSET = 0 * SIZE, B + shl r2 = K, ZBASE_SHIFT + } + { .mmi + shladd r3 = KK, ZBASE_SHIFT, r0 + nop __LINE__ + nop __LINE__ + } + ;; +#if defined(LT) || defined(RN) + { .mfb + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f72 = f0 + nop __LINE__ + } + { .mmf + nop __LINE__ + nop __LINE__ + mov f73 = f0 + } + ;; +#else + { .mfi + shladd BOFFSET = r3, 2, B + mov f72 = f0 +#ifdef LN + sub AORIG = AORIG, r2 +#else + nop __LINE__ +#endif + } + ;; + { .mfi + (p7) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + mov f73 = f0 + add AOFFSET = r3, AORIG + } + ;; +#endif + ;; + adds L = 1, L + ;; + + { .mmi + nop __LINE__ + adds PREB = (PREFETCHSIZE + 0) * SIZE, BOFFSET + tbit.z p12, p0 = L, 0 + } + ;; + { .mfi + (p7) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + mov f88 = f0 + shr L = L, 1 + } + { .mfi + (p7) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + mov f89 = f0 + nop __LINE__ + } + ;; + { .mfi + (p7) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + mov f104 = f0 + adds L = -1, L + } + { .mfb + adds PREA = (PREFETCHSIZE + 0) * SIZE, AOFFSET + mov f105 = f0 + nop __LINE__ + } + ;; + { .mfi + (p7) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + mov f120 = f0 + mov ar.lc = L + } + { .mfi + cmp.eq p3, p0 = r0, r0 + mov f121 = f0 + nop __LINE__ + } + ;; + cmp.eq p6, p0 = -1, L + (p6) br.cond.dpnt .L038 + ;; + .align 16 + +.L032: + { .mfb + lfetch.nt1 [PREA], 4 * SIZE + FMA f64 = f32, f48, f64 // A1 * B1 + nop __LINE__ + } + { .mfi + nop __LINE__ + FMA_B f65 = f32, f49, f65 // A1 * B2 + (p12) 
cmp.ne p3, p0 = 0, L + } + ;; + { .mfi + lfetch.nt1 [PREB], 16 * SIZE + FMA f80 = f32, f50, f80 // A1 * B3 + cmp.ne p4, p5 = 0, L + } + { .mfb + nop __LINE__ + FMA_B f81 = f32, f51, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f56, f57 = [BOFFSET], 2 * SIZE + FMA f96 = f32, f52, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f97 = f32, f53, f97 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f40, f41 = [AOFFSET], 2 * SIZE + FMA f112 = f32, f54, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_B f113 = f32, f55, f113 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f58, f59 = [BOFFSET], 2 * SIZE + FMA f65 = f33, f48, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f64 = f33, f49, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f60, f61 = [BOFFSET], 2 * SIZE + FMA f81 = f33, f50, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f80 = f33, f51, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + (p3) LDFPD f62, f63 = [BOFFSET], 2 * SIZE + FMA f97 = f33, f52, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f96 = f33, f53, f96 // A2 * B6 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f48, f49 = [BOFFSET], 2 * SIZE + FMA f113 = f33, f54, f113 // A2 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + FMA_A f112 = f33, f55, f112 // A2 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f32, f33 = [AOFFSET], 2 * SIZE + (p3) FMA f64 = f40, f56, f64 // A1 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f65 = f40, f57, f65 // A1 * B2 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f50, f51 = [BOFFSET], 2 * SIZE + (p3) FMA f80 = f40, f58, f80 // A1 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f81 = f40, f59, f81 // A1 * B4 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f52, f53 = [BOFFSET], 2 * SIZE + (p3) FMA f96 = f40, f60, f96 // A1 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f97 = f40, f61, f97 // A1 * B6 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f112 = f40, f62, f112 // A1 * B7 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_B f113 = f40, f63, f113 // A1 * B8 + nop __LINE__ + } + ;; + { .mfb + (p4) LDFPD f54, f55 = [BOFFSET], 2 * SIZE + (p3) FMA f65 = f41, f56, f65 // A2 * B1 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f64 = f41, f57, f64 // A2 * B2 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f81 = f41, f58, f81 // A2 * B3 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f80 = f41, f59, f80 // A2 * B4 + nop __LINE__ + } + ;; + { .mfb + nop __LINE__ + (p3) FMA f97 = f41, f60, f97 // A2 * B5 + nop __LINE__ + } + { .mfb + nop __LINE__ + (p3) FMA_A f96 = f41, f61, f96 // A2 * B6 + nop __LINE__ + } + ;; + { .mfi + nop __LINE__ + (p3) FMA f113 = f41, f62, f113 // A2 * B7 + adds L = -1, L + } + { .mfb + nop __LINE__ + (p3) FMA_A f112 = f41, f63, f112 // A2 * B8 + br.cloop.sptk.few .L032 + } + ;; +.L038: +#if defined(LN) || defined(RT) +#ifdef LN + adds r2 = -1, KK +#else + adds r2 = -4, KK +#endif + ;; + shladd r2 = r2, ZBASE_SHIFT, r0 + ;; + add AOFFSET = r2, AORIG + shladd BOFFSET = r2, 2, B + ;; +#endif + +#if defined(LN) || defined(LT) + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [BOFFSET], 2 * SIZE + ;; + LDFPD f104, f105 = [BOFFSET], 2 * SIZE + ;; + LDFPD f120, f121 = [BOFFSET] + adds BOFFSET = -6 * SIZE, BOFFSET + ;; + FSUB f64 = f72, f64 + FSUB_A f65 = f73, f65 + FSUB f80 = f88, f80 + FSUB_A f81 = f89, f81 + FSUB f96 = f104, f96 + FSUB_A f97 = f105, f97 + 
FSUB f112 = f120, f112 + FSUB_A f113 = f121, f113 + ;; +#else + LDFPD f72, f73 = [AOFFSET], 2 * SIZE + ;; + LDFPD f88, f89 = [AOFFSET], 2 * SIZE + ;; + LDFPD f104, f105 = [AOFFSET], 2 * SIZE + ;; + LDFPD f120, f121 = [AOFFSET] + adds AOFFSET = -6 * SIZE, AOFFSET + ;; + FSUB f64 = f72, f64 + FSUB f65 = f73, f65 + FSUB f80 = f88, f80 + FSUB f81 = f89, f81 + FSUB f96 = f104, f96 + FSUB f97 = f105, f97 + FSUB f112 = f120, f112 + FSUB f113 = f121, f113 + ;; +#endif + +#ifdef LN + LDFPD f120, f121 = [AOFFSET] + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + FMPY f34 = f120, f80 + FMPY f35 = f121, f80 + FMPY f36 = f120, f96 + FMPY f37 = f121, f96 + FMPY f38 = f120, f112 + FMPY f39 = f121, f112 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + FMA_C f80 = f121, f81, f34 + FMA_D f81 = f120, f81, f35 + FMA_C f96 = f121, f97, f36 + FMA_D f97 = f120, f97, f37 + FMA_C f112 = f121, f113, f38 + FMA_D f113 = f120, f113, f39 + ;; +#endif + +#ifdef LT + LDFPD f90, f91 = [AOFFSET] + ;; + FMPY f32 = f90, f64 + FMPY f33 = f91, f64 + FMPY f34 = f90, f80 + FMPY f35 = f91, f80 + FMPY f36 = f90, f96 + FMPY f37 = f91, f96 + FMPY f38 = f90, f112 + FMPY f39 = f91, f112 + ;; + FMA_C f64 = f91, f65, f32 + FMA_D f65 = f90, f65, f33 + FMA_C f80 = f91, f81, f34 + FMA_D f81 = f90, f81, f35 + FMA_C f96 = f91, f97, f36 + FMA_D f97 = f90, f97, f37 + FMA_C f112 = f91, f113, f38 + FMA_D f113 = f90, f113, f39 + ;; +#endif + +#ifdef RN + LDFPD f72, f73 = [BOFFSET], 2 * SIZE + ;; + LDFPD f74, f75 = [BOFFSET], 2 * SIZE + ;; + LDFPD f76, f77 = [BOFFSET], 2 * SIZE + ;; + LDFPD f78, f79 = [BOFFSET] + adds BOFFSET = 4 * SIZE, BOFFSET + ;; + LDFPD f90, f91 = [BOFFSET], 2 * SIZE + ;; + LDFPD f92, f93 = [BOFFSET], 2 * SIZE + ;; + LDFPD f94, f95 = [BOFFSET] + adds BOFFSET = 6 * SIZE, BOFFSET + ;; + LDFPD f108, f109 = [BOFFSET], 2 * SIZE + ;; + LDFPD f110, f111 = [BOFFSET] + adds BOFFSET = 8 * SIZE, BOFFSET + ;; + LDFPD f126, f127 = [BOFFSET] + adds BOFFSET = - 30 * SIZE, BOFFSET + ;; + + FMPY f32 = f72, f64 + FMPY f33 = f73, f64 + ;; + FMA_C f64 = f73, f65, f32 + FMA_D f65 = f72, f65, f33 + ;; + FNMA f80 = f74, f64, f80 + FMA_A f81 = f75, f64, f81 + ;; + FMA_B f80 = f75, f65, f80 + FNMA f81 = f74, f65, f81 + + ;; + FNMA f96 = f76, f64, f96 + FMA_A f97 = f77, f64, f97 + ;; + FMA_B f96 = f77, f65, f96 + FNMA f97 = f76, f65, f97 + ;; + FNMA f112 = f78, f64, f112 + FMA_A f113 = f79, f64, f113 + ;; + FMA_B f112 = f79, f65, f112 + FNMA f113 = f78, f65, f113 + ;; + FMPY f32 = f90, f80 + FMPY f33 = f91, f80 + ;; + FMA_C f80 = f91, f81, f32 + FMA_D f81 = f90, f81, f33 + ;; + + FNMA f96 = f92, f80, f96 + FMA_A f97 = f93, f80, f97 + ;; + FMA_B f96 = f93, f81, f96 + FNMA f97 = f92, f81, f97 + ;; + FNMA f112 = f94, f80, f112 + FMA_A f113 = f95, f80, f113 + ;; + FMA_B f112 = f95, f81, f112 + FNMA f113 = f94, f81, f113 + ;; + FMPY f32 = f108, f96 + FMPY f33 = f109, f96 + ;; + FMA_C f96 = f109, f97, f32 + FMA_D f97 = f108, f97, f33 + ;; + FNMA f112 = f110, f96, f112 + FMA_A f113 = f111, f96, f113 + ;; + FMA_B f112 = f111, f97, f112 + FNMA f113 = f110, f97, f113 + ;; + FMPY f32 = f126, f112 + FMPY f33 = f127, f112 + ;; + FMA_C f112 = f127, f113, f32 + FMA_D f113 = f126, f113, f33 + ;; +#endif + +#ifdef RT + adds BOFFSET = 30 * SIZE, BOFFSET + ;; + LDFPD f72, f73 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f74, f75 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f76, f77 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f78, f79 = [BOFFSET] + adds BOFFSET = - 4 * SIZE, BOFFSET + ;; + LDFPD f88, 
f89 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f90, f91 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f92, f93 = [BOFFSET] + adds BOFFSET = - 6 * SIZE, BOFFSET + ;; + LDFPD f104, f105 = [BOFFSET] + adds BOFFSET = - 2 * SIZE, BOFFSET + ;; + LDFPD f106, f107 = [BOFFSET] + adds BOFFSET = - 8 * SIZE, BOFFSET + ;; + LDFPD f120, f121 = [BOFFSET] + ;; + FMPY f32 = f72, f112 + FMPY f33 = f73, f112 + ;; + FMA_C f112 = f73, f113, f32 + FMA_D f113 = f72, f113, f33 + ;; + FNMA f96 = f74, f112, f96 + FMA_A f97 = f75, f112, f97 + ;; + FMA_B f96 = f75, f113, f96 + FNMA f97 = f74, f113, f97 + ;; + FNMA f80 = f76, f112, f80 + FMA_A f81 = f77, f112, f81 + ;; + FMA_B f80 = f77, f113, f80 + FNMA f81 = f76, f113, f81 + ;; + FNMA f64 = f78, f112, f64 + FMA_A f65 = f79, f112, f65 + ;; + FMA_B f64 = f79, f113, f64 + FNMA f65 = f78, f113, f65 + ;; + FMPY f32 = f88, f96 + FMPY f33 = f89, f96 + ;; + FMA_C f96 = f89, f97, f32 + FMA_D f97 = f88, f97, f33 + ;; + FNMA f80 = f90, f96, f80 + FMA_A f81 = f91, f96, f81 + ;; + FMA_B f80 = f91, f97, f80 + FNMA f81 = f90, f97, f81 + ;; + FNMA f64 = f92, f96, f64 + FMA_A f65 = f93, f96, f65 + ;; + FMA_B f64 = f93, f97, f64 + FNMA f65 = f92, f97, f65 + ;; + FMPY f32 = f104, f80 + FMPY f33 = f105, f80 + ;; + FMA_C f80 = f105, f81, f32 + FMA_D f81 = f104, f81, f33 + ;; + FNMA f64 = f106, f80, f64 + FMA_A f65 = f107, f80, f65 + ;; + FMA_B f64 = f107, f81, f64 + FNMA f65 = f106, f81, f65 + ;; + FMPY f32 = f120, f64 + FMPY f33 = f121, f64 + ;; + FMA_C f64 = f121, f65, f32 + FMA_D f65 = f120, f65, f33 + ;; +#endif + +#if defined(LN) || defined(LT) + adds BOFFSET2 = 4 * SIZE, BOFFSET + ;; + STFD [BOFFSET] = f64, SIZE + STFD [BOFFSET2] = f96, SIZE + ;; + STFD [BOFFSET] = f65, SIZE + STFD [BOFFSET2] = f97, SIZE + ;; + STFD [BOFFSET] = f80, SIZE + STFD [BOFFSET2] = f112, SIZE + ;; + STFD [BOFFSET] = f81, 5 * SIZE + STFD [BOFFSET2] = f113, 5 * SIZE + ;; + adds BOFFSET = - 8 * SIZE, BOFFSET + ;; +#else + adds AOFFSET2 = 4 * SIZE, AOFFSET + ;; + STFD [AOFFSET] = f64, SIZE + STFD [AOFFSET2] = f96, SIZE + ;; + STFD [AOFFSET] = f65, SIZE + STFD [AOFFSET2] = f97, SIZE + ;; + STFD [AOFFSET] = f80, SIZE + STFD [AOFFSET2] = f112, SIZE + ;; + STFD [AOFFSET] = f81, 5 * SIZE + STFD [AOFFSET2] = f113, 5 * SIZE + ;; + adds AOFFSET = - 8 * SIZE, AOFFSET + ;; +#endif + +#ifdef LN + adds C1 = -2 * SIZE, C1 + adds C2 = -2 * SIZE, C2 + adds C3 = -2 * SIZE, C3 + adds C4 = -2 * SIZE, C4 +#endif + ;; + STFD [C1 ] = f64, SIZE + ;; + STFD [C1 ] = f65, SIZE + ;; + STFD [C2 ] = f80, SIZE + ;; + STFD [C2 ] = f81, SIZE + ;; + STFD [C3 ] = f96, SIZE + ;; + STFD [C3 ] = f97, SIZE + ;; + STFD [C4 ] = f112, SIZE + ;; + STFD [C4 ] = f113, SIZE + ;; + mov f64 = f0 + mov f65 = f0 + mov f80 = f0 + mov f81 = f0 + mov f96 = f0 + mov f97 = f0 + mov f112 = f0 + mov f113 = f0 + ;; +#ifdef LN + adds C1 = -2 * SIZE, C1 + adds C2 = -2 * SIZE, C2 + adds C3 = -2 * SIZE, C3 + adds C4 = -2 * SIZE, C4 +#endif + ;; + cmp.ne p6, p0 = 1, I + ;; + adds I = -1, I + ;; + shladd r2 = K, ZBASE_SHIFT, r0 + ;; + sub L = K, KK + ;; +#ifdef RT + add AORIG = r2, AORIG +#endif + ;; +#if defined(LT) || defined(RN) + shladd L = L, ZBASE_SHIFT, r0 + ;; + add AOFFSET = L, AOFFSET + shladd BOFFSET = L, 2, BOFFSET +#endif + ;; +#ifdef LT + adds KK = 1, KK +#elif defined LN + adds KK = -1, KK +#else + nop __LINE__ +#endif + ;; +#if defined(LT) || defined(RN) + mov L = KK +#else + sub L = K, KK +#endif + ;; + .align 16 + +.L049: +#ifdef LN + shladd KK8 = K, ZBASE_SHIFT, r0 + ;; + shladd B = KK8, 2, B +#endif + +#if 
defined(LT) || defined(RN) + mov B = BOFFSET +#endif + +#ifdef RN + adds KK = 4, KK +#endif + +#ifdef RT + adds KK = -4, KK +#endif + ;; + + { .mmb + mov AOFFSET = A + cmp.lt p6, p0 = 0, J + (p6) br.cond.dptk .L010x + } + ;; + .align 16 + +.L999: + { .mii + nop __LINE__ + mov ar.lc = ARLC + mov pr = PR, -1 + } + { .mib + nop __LINE__ +#ifdef TRMMKERNEL + mov ar.pfs = ARPFS +#else + nop __LINE__ +#endif + br.ret.sptk.many b0 + } + EPILOGUE + diff --git a/kernel/mips64/KERNEL b/kernel/mips64/KERNEL new file mode 100644 index 0000000000..3dd7f8e208 --- /dev/null +++ b/kernel/mips64/KERNEL @@ -0,0 +1,96 @@ +ifndef SNRM2KERNEL +SNRM2KERNEL = snrm2.S +endif + +ifndef DNRM2KERNEL +DNRM2KERNEL = dnrm2.S +endif + +ifndef CNRM2KERNEL +CNRM2KERNEL = cnrm2.S +endif + +ifndef ZNRM2KERNEL +ZNRM2KERNEL = znrm2.S +endif + +ifndef SCABS_KERNEL +SCABS_KERNEL = ../generic/cabs.c +endif + +ifndef DCABS_KERNEL +DCABS_KERNEL = ../generic/cabs.c +endif + +ifndef QCABS_KERNEL +QCABS_KERNEL = ../generic/cabs.c +endif + +ifndef LSAME_KERNEL +LSAME_KERNEL = ../generic/lsame.c +endif + +SGEMMKERNEL = gemm_kernel.S +SGEMMINCOPY = ../generic/gemm_ncopy_2.c +SGEMMITCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPY = ../generic/gemm_ncopy_8.c +SGEMMOTCOPY = ../generic/gemm_tcopy_8.c +SGEMMINCOPYOBJ = sgemm_incopy.o +SGEMMITCOPYOBJ = sgemm_itcopy.o +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o +DGEMMKERNEL = gemm_kernel.S +DGEMMINCOPY = ../generic/gemm_ncopy_2.c +DGEMMITCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPY = ../generic/gemm_ncopy_8.c +DGEMMOTCOPY = ../generic/gemm_tcopy_8.c +DGEMMINCOPYOBJ = dgemm_incopy.o +DGEMMITCOPYOBJ = dgemm_itcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o +CGEMMKERNEL = zgemm_kernel.S +CGEMMINCOPY = ../generic/zgemm_ncopy_1.c +CGEMMITCOPY = ../generic/zgemm_tcopy_1.c +CGEMMONCOPY = ../generic/zgemm_ncopy_4.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +CGEMMINCOPYOBJ = cgemm_incopy.o +CGEMMITCOPYOBJ = cgemm_itcopy.o +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o +ZGEMMKERNEL = zgemm_kernel.S +ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c +ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +ZGEMMINCOPYOBJ = zgemm_incopy.o +ZGEMMITCOPYOBJ = zgemm_itcopy.o +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +SGEMM_BETA = ../generic/gemm_beta.c +DGEMM_BETA = ../generic/gemm_beta.c +CGEMM_BETA = ../generic/zgemm_beta.c +ZGEMM_BETA = ../generic/zgemm_beta.c + +STRSMKERNEL_LN = trsm_kernel_LN.S +STRSMKERNEL_LT = trsm_kernel_LT.S +STRSMKERNEL_RN = trsm_kernel_LT.S +STRSMKERNEL_RT = trsm_kernel_RT.S + +DTRSMKERNEL_LN = trsm_kernel_LN.S +DTRSMKERNEL_LT = trsm_kernel_LT.S +DTRSMKERNEL_RN = trsm_kernel_LT.S +DTRSMKERNEL_RT = trsm_kernel_RT.S + +CTRSMKERNEL_LN = ztrsm_kernel_LT.S +CTRSMKERNEL_LT = ztrsm_kernel_LT.S +CTRSMKERNEL_RN = ztrsm_kernel_LT.S +CTRSMKERNEL_RT = ztrsm_kernel_RT.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LT.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT.S + +CGEMM3MKERNEL = zgemm3m_kernel.S +ZGEMM3MKERNEL = zgemm3m_kernel.S diff --git a/kernel/mips64/Makefile b/kernel/mips64/Makefile new file mode 100644 index 0000000000..efae70d7b7 --- /dev/null +++ b/kernel/mips64/Makefile @@ -0,0 +1,2 @@ +clean :: + diff --git a/kernel/mips64/amax.S b/kernel/mips64/amax.S new file mode 100644 index 0000000000..30c35ba476 --- /dev/null +++ b/kernel/mips64/amax.S @@ -0,0 +1,241 @@ 
+/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 +#define X $5 +#define INCX $6 + +#define I $2 +#define TEMP $3 + +#define a1 $f4 +#define a2 $f5 +#define a3 $f6 +#define a4 $f7 +#define a5 $f8 +#define a6 $f9 +#define a7 $f10 +#define a8 $f11 + +#define t1 $f12 +#define t2 $f13 +#define t3 $f14 +#define t4 $f15 + +#define s1 $f0 +#define s2 $f1 +#define s3 $f2 +#define s4 $f3 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + blez N, .L999 + MTC $0, s1 + + blez INCX, .L999 + dsll INCX, INCX, BASE_SHIFT + + LD a1, 0 * SIZE(X) + daddiu N, N, -1 + + daddu X, X, INCX + FABS s1, a1 + + blez N, .L999 + FABS s2, a1 + + FABS s3, a1 + dsra I, N, 3 + + blez I, .L15 + FABS s4, a1 + + LD a1, 0 * SIZE(X) + daddu X, X, INCX + LD a2, 0 * SIZE(X) + daddu X, X, INCX + LD a3, 0 * SIZE(X) + daddu X, X, INCX + LD a4, 0 * SIZE(X) + daddu X, X, INCX + LD a5, 0 * SIZE(X) + daddu X, X, INCX + LD a6, 0 * SIZE(X) + daddu X, X, INCX + LD a7, 0 * SIZE(X) + daddu X, X, INCX + LD a8, 0 * SIZE(X) + daddiu I, I, -1 + + blez I, .L13 + daddu X, X, INCX + .align 3 + +.L12: + FABS t1, a1 + LD a1, 0 * SIZE(X) + FABS t2, a2 + daddu X, X, INCX + + FABS t3, a3 + LD a2, 0 * SIZE(X) + FABS t4, a4 + daddu X, X, INCX + + CMPLT $fcc0, s1, t1 + LD a3, 0 * SIZE(X) + CMPLT $fcc1, s2, t2 + daddu X, X, INCX + + CMPLT $fcc2, s3, t3 + LD a4, 0 * SIZE(X) + CMPLT $fcc3, s4, t4 + daddu X, X, INCX + + CMOVT s1, t1, $fcc0 + CMOVT s2, t2, $fcc1 + CMOVT s3, t3, $fcc2 + CMOVT s4, t4, $fcc3 + + FABS t1, a5 + LD a5, 0 * SIZE(X) + FABS t2, a6 + daddu X, X, INCX + + FABS t3, a7 + LD a6, 0 * SIZE(X) + FABS t4, a8 + daddu X, X, INCX + + CMPLT $fcc0, s1, t1 + LD a7, 0 * SIZE(X) + CMPLT $fcc1, s2, t2 + daddu X, X, INCX + + CMPLT $fcc2, s3, t3 + LD a8, 0 * SIZE(X) + CMPLT $fcc3, s4, t4 + daddu X, X, INCX + + CMOVT s1, t1, $fcc0 + daddiu I, I, -1 + + CMOVT s2, t2, $fcc1 + CMOVT s3, t3, $fcc2 + + bgtz I, .L12 + CMOVT s4, t4, $fcc3 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + + CMOVT s1, t1, $fcc0 + CMOVT s2, t2, $fcc1 + CMOVT s3, t3, $fcc2 + CMOVT s4, t4, $fcc3 + + FABS t1, a5 + FABS t2, a6 + FABS t3, a7 + FABS t4, a8 + + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + + CMOVT s1, t1, $fcc0 + CMOVT s2, t2, $fcc1 + CMOVT s3, t3, $fcc2 + CMOVT s4, t4, $fcc3 + .align 3 + +.L15: + andi I, N, 7 + + blez I, .L998 + NOP + .align 3 + +.L16: + LD a1, 0 * SIZE(X) + daddiu I, I, -1 + + FABS t1, a1 + + CMPLT $fcc0, s1, t1 + + CMOVT s1, t1, $fcc0 + + bgtz I, .L16 + daddu X, X, INCX + .align 3 + +.L998: + CMPLT $fcc0, s1, s2 + CMPLT $fcc1, s3, s4 + + CMOVT s1, s2, $fcc0 + CMOVT s3, s4, $fcc1 + + CMPLT $fcc0, s1, s3 + CMOVT s1, s3, $fcc0 + .align 3 + +.L999: + j $31 + NOP + + EPILOGUE diff --git a/kernel/mips64/amin.S b/kernel/mips64/amin.S new file mode 100644 index 0000000000..47108b1e43 --- /dev/null +++ b/kernel/mips64/amin.S @@ -0,0 +1,241 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 +#define X $5 +#define INCX $6 + +#define I $2 +#define TEMP $3 + +#define a1 $f4 +#define a2 $f5 +#define a3 $f6 +#define a4 $f7 +#define a5 $f8 +#define a6 $f9 +#define a7 $f10 +#define a8 $f11 + +#define t1 $f12 +#define t2 $f13 +#define t3 $f14 +#define t4 $f15 + +#define s1 $f0 +#define s2 $f1 +#define s3 $f2 +#define s4 $f3 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + blez N, .L999 + MTC $0, s1 + + blez INCX, .L999 + dsll INCX, INCX, BASE_SHIFT + + LD a1, 0 * SIZE(X) + daddiu N, N, -1 + + daddu X, X, INCX + FABS s1, a1 + + blez N, .L999 + FABS s2, a1 + + FABS s3, a1 + dsra I, N, 3 + + blez I, .L15 + FABS s4, a1 + + LD a1, 0 * SIZE(X) + daddu X, X, INCX + LD a2, 0 * SIZE(X) + daddu X, X, INCX + LD a3, 0 * SIZE(X) + daddu X, X, INCX + LD a4, 0 * SIZE(X) + daddu X, X, INCX + LD a5, 0 * SIZE(X) + daddu X, X, INCX + LD a6, 0 * SIZE(X) + daddu X, X, INCX + LD a7, 0 * SIZE(X) + daddu X, X, INCX + LD a8, 0 * SIZE(X) + daddiu I, I, -1 + + blez I, .L13 + daddu X, X, INCX + .align 3 + +.L12: + FABS t1, a1 + LD a1, 0 * SIZE(X) + FABS t2, a2 + daddu X, X, INCX + + FABS t3, a3 + LD a2, 0 * SIZE(X) + FABS t4, a4 + daddu X, X, INCX + + CMPLT $fcc0, t1, s1 + LD a3, 0 * SIZE(X) + CMPLT $fcc1, t2, s2 + daddu X, X, INCX + + CMPLT $fcc2, t3, s3 + LD a4, 0 * SIZE(X) + CMPLT $fcc3, t4, s4 + daddu X, X, INCX + + CMOVT s1, t1, $fcc0 + CMOVT s2, t2, $fcc1 + CMOVT s3, t3, $fcc2 + CMOVT s4, t4, $fcc3 + + FABS t1, a5 + LD a5, 0 * SIZE(X) + FABS t2, a6 + daddu X, X, INCX + + FABS t3, a7 + LD a6, 0 * SIZE(X) + FABS t4, a8 + daddu X, X, INCX + + CMPLT $fcc0, t1, s1 + LD a7, 0 * SIZE(X) + CMPLT $fcc1, t2, s2 + daddu X, X, INCX + + CMPLT $fcc2, t3, s3 + LD a8, 0 * SIZE(X) + CMPLT $fcc3, t4, s4 + daddu X, X, INCX + + CMOVT s1, t1, $fcc0 + daddiu I, I, -1 + + CMOVT s2, t2, $fcc1 + CMOVT s3, t3, $fcc2 + + bgtz I, .L12 + CMOVT s4, t4, $fcc3 + .align 3 + +.L13: + FABS 
t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + + CMPLT $fcc0, t1, s1 + CMPLT $fcc1, t2, s2 + CMPLT $fcc2, t3, s3 + CMPLT $fcc3, t4, s4 + + CMOVT s1, t1, $fcc0 + CMOVT s2, t2, $fcc1 + CMOVT s3, t3, $fcc2 + CMOVT s4, t4, $fcc3 + + FABS t1, a5 + FABS t2, a6 + FABS t3, a7 + FABS t4, a8 + + CMPLT $fcc0, t1, s1 + CMPLT $fcc1, t2, s2 + CMPLT $fcc2, t3, s3 + CMPLT $fcc3, t4, s4 + + CMOVT s1, t1, $fcc0 + CMOVT s2, t2, $fcc1 + CMOVT s3, t3, $fcc2 + CMOVT s4, t4, $fcc3 + .align 3 + +.L15: + andi I, N, 7 + + blez I, .L998 + NOP + .align 3 + +.L16: + LD a1, 0 * SIZE(X) + daddiu I, I, -1 + + FABS t1, a1 + + CMPLT $fcc0, t1, s1 + + CMOVT s1, t1, $fcc0 + + bgtz I, .L16 + daddu X, X, INCX + .align 3 + +.L998: + CMPLT $fcc0, s2, s1 + CMPLT $fcc1, s4, s3 + + CMOVT s1, s2, $fcc0 + CMOVT s3, s4, $fcc1 + + CMPLT $fcc0, s3, s1 + CMOVT s1, s3, $fcc0 + .align 3 + +.L999: + j $31 + NOP + + EPILOGUE diff --git a/kernel/mips64/asum.S b/kernel/mips64/asum.S new file mode 100644 index 0000000000..447c2f73d0 --- /dev/null +++ b/kernel/mips64/asum.S @@ -0,0 +1,332 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 +#define X $5 +#define INCX $6 + +#define I $2 +#define TEMP $3 + +#define a1 $f2 +#define a2 $f3 +#define a3 $f4 +#define a4 $f5 +#define a5 $f6 +#define a6 $f7 +#define a7 $f8 +#define a8 $f9 + +#define t1 $f10 +#define t2 $f11 +#define t3 $f12 +#define t4 $f13 + +#define s1 $f0 +#define s2 $f1 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + MTC $0, s1 + + MTC $0, s2 + dsll INCX, INCX, BASE_SHIFT + + blez N, .L999 + li TEMP, SIZE + + bne INCX, TEMP, .L20 + dsra I, N, 3 + + blez I, .L15 + NOP + + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + LD a3, 2 * SIZE(X) + LD a4, 3 * SIZE(X) + + LD a5, 4 * SIZE(X) + FABS t1, a1 + LD a6, 5 * SIZE(X) + FABS t2, a2 + LD a7, 6 * SIZE(X) + FABS t3, a3 + + FABS t4, a4 + daddiu I, I, -1 + + blez I, .L13 + LD a8, 7 * SIZE(X) + .align 3 + +.L12: + ADD s1, s1, t1 + LD a1, 8 * SIZE(X) + + FABS t1, a5 + daddiu I, I, -1 + + ADD s2, s2, t2 + LD a2, 9 * SIZE(X) + + FABS t2, a6 + NOP + + ADD s1, s1, t3 + LD a3, 10 * SIZE(X) + + FABS t3, a7 + NOP + + ADD s2, s2, t4 + LD a4, 11 * SIZE(X) + + FABS t4, a8 + daddiu X, X, 8 * SIZE + + ADD s1, s1, t1 + LD a5, 4 * SIZE(X) + + FABS t1, a1 + NOP + + ADD s2, s2, t2 + LD a6, 5 * SIZE(X) + + FABS t2, a2 + NOP + + ADD s1, s1, t3 + LD a7, 6 * SIZE(X) + + FABS t3, a3 + NOP + + ADD s2, s2, t4 + LD a8, 7 * SIZE(X) + + bgtz I, .L12 + FABS t4, a4 + .align 3 + +.L13: + ADD s1, s1, t1 + daddiu X, X, 8 * SIZE + + FABS t1, a5 + NOP + + ADD s2, s2, t2 + FABS t2, a6 + + ADD s1, s1, t3 + FABS t3, a7 + + ADD s2, s2, t4 + FABS t4, a8 + + ADD s1, s1, t1 + ADD s2, s2, t2 + ADD s1, s1, t3 + ADD s2, s2, t4 + .align 3 + +.L15: + andi I, N, 7 + + blez I, .L999 + NOP + .align 3 + +.L16: + LD a1, 0 * SIZE(X) + daddiu I, I, -1 + + FABS t1, a1 + + ADD s1, s1, t1 + + bgtz I, .L16 + daddiu X, X, SIZE + + j .L999 + NOP + .align 3 + +.L20: + blez I, .L25 + NOP + + LD a1, 0 * SIZE(X) + daddu X, X, INCX + + LD a2, 0 * SIZE(X) + daddu X, X, INCX + + LD a3, 0 * SIZE(X) + daddu X, X, INCX + + LD a4, 0 * SIZE(X) + daddu X, X, INCX + + LD a5, 0 * SIZE(X) + daddu X, X, INCX + + LD a6, 0 * SIZE(X) + daddu X, X, INCX + + FABS t1, a1 + LD a7, 0 * SIZE(X) + + FABS t2, a2 + daddu X, X, INCX + + FABS t3, a3 + LD a8, 0 * SIZE(X) + + FABS t4, a4 + daddiu I, I, -1 + + blez I, .L24 + daddu X, X, INCX + .align 3 + +.L23: + ADD s1, s1, t1 + LD a1, 0 * SIZE(X) + + FABS t1, a5 + daddu X, X, INCX + + ADD s2, s2, t2 + LD a2, 0 * SIZE(X) + + FABS t2, a6 + daddu X, X, INCX + + ADD s1, s1, t3 + LD a3, 0 * SIZE(X) + + FABS t3, a7 + daddu X, X, INCX + + ADD s2, s2, t4 + LD a4, 0 * SIZE(X) + + FABS t4, a8 + daddu X, X, INCX + + ADD s1, s1, t1 + LD a5, 0 * SIZE(X) + + FABS t1, a1 + daddu X, X, INCX + + ADD s2, s2, t2 + LD a6, 0 * SIZE(X) + + FABS t2, a2 + daddu X, X, INCX + + ADD s1, s1, t3 + LD a7, 0 * SIZE(X) + + FABS t3, a3 + daddu X, X, INCX + + ADD s2, s2, t4 + LD a8, 0 * SIZE(X) + + FABS t4, a4 + daddiu I, I, -1 + + bgtz I, .L23 + daddu X, X, INCX + .align 3 + +.L24: + ADD s1, s1, t1 + FABS t1, a5 + + ADD s2, s2, t2 + FABS t2, a6 + + ADD s1, s1, t3 + FABS t3, a7 + + ADD s2, s2, t4 + FABS t4, a8 + + ADD s1, s1, t1 + ADD s2, s2, t2 + ADD s1, s1, t3 + ADD s2, s2, t4 + .align 3 + +.L25: + andi I, N, 7 + + blez I, .L999 + NOP + .align 3 + +.L26: + LD a1, 0 * SIZE(X) + daddiu I, I, -1 + + FABS t1, a1 + daddu X, X, INCX + + bgtz I, .L26 + ADD s1, s1, t1 + .align 3 + +.L999: + j $31 + ADD s1, s1, s2 + + EPILOGUE diff --git 
a/kernel/mips64/axpy.S b/kernel/mips64/axpy.S new file mode 100644 index 0000000000..f7d888743e --- /dev/null +++ b/kernel/mips64/axpy.S @@ -0,0 +1,409 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 + +#define X $8 +#define INCX $9 + +#define Y $10 +#define INCY $11 + +#define I $2 +#define TEMP $3 + +#define YY $5 + +#define ALPHA $f15 + +#define a1 $f0 +#define a2 $f1 +#define a3 $f2 +#define a4 $f3 +#define a5 $f4 +#define a6 $f5 +#define a7 $f6 +#define a8 $f7 + +#define b1 $f8 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f17 + +#define t1 $f18 +#define t2 $f19 +#define t3 $f20 +#define t4 $f21 + + PROLOGUE + +#ifndef __64BIT__ + daddiu $sp, $sp, -16 + sdc1 $f20, 0($sp) + sdc1 $f21, 8($sp) +#endif + + li TEMP, SIZE + + blez N, .L999 + dsll INCX, INCX, BASE_SHIFT + + bne INCX, TEMP, .L20 + dsll INCY, INCY, BASE_SHIFT + + bne INCY, TEMP, .L20 + dsra I, N, 3 + + blez I, .L15 + daddiu I, I, -1 + + LD a1, 0 * SIZE(X) + LD b1, 0 * SIZE(Y) + LD a2, 1 * SIZE(X) + LD b2, 1 * SIZE(Y) + LD a3, 2 * SIZE(X) + LD b3, 2 * SIZE(Y) + LD a4, 3 * SIZE(X) + LD b4, 3 * SIZE(Y) + LD a5, 4 * SIZE(X) + LD b5, 4 * SIZE(Y) + LD a6, 5 * SIZE(X) + LD b6, 5 * SIZE(Y) + LD a7, 6 * SIZE(X) + LD b7, 6 * SIZE(Y) + LD a8, 7 * SIZE(X) + LD b8, 7 * SIZE(Y) + + blez I, .L13 + NOP + .align 3 + +.L12: + MADD t1, b1, ALPHA, a1 + LD a1, 8 * SIZE(X) + LD b1, 8 * SIZE(Y) + + MADD t2, b2, ALPHA, a2 + LD a2, 9 * SIZE(X) + LD b2, 9 * SIZE(Y) + + MADD t3, b3, ALPHA, a3 + LD a3, 10 * SIZE(X) + LD b3, 10 * SIZE(Y) + + MADD t4, b4, ALPHA, a4 + LD a4, 11 * SIZE(X) + LD b4, 11 * SIZE(Y) + + ST t1, 0 * SIZE(Y) + ST t2, 1 * SIZE(Y) + ST t3, 2 * SIZE(Y) + ST t4, 3 * SIZE(Y) + + MADD t1, b5, ALPHA, a5 + LD a5, 12 * SIZE(X) + LD b5, 12 * SIZE(Y) + + MADD t2, b6, ALPHA, a6 + LD a6, 13 * SIZE(X) + LD b6, 13 * SIZE(Y) + + MADD t3, b7, ALPHA, a7 + LD a7, 14 * SIZE(X) + LD b7, 14 * SIZE(Y) + + MADD t4, b8, ALPHA, a8 + LD a8, 15 * SIZE(X) + LD b8, 15 * SIZE(Y) + + ST t1, 4 * SIZE(Y) + ST t2, 5 * SIZE(Y) + ST t3, 6 * SIZE(Y) + ST t4, 7 * SIZE(Y) + + daddiu I, I, -1 + daddiu Y, Y, 8 * SIZE + + bgtz I, .L12 + daddiu X, X, 8 * SIZE + .align 3 + +.L13: + MADD t1, b1, ALPHA, a1 + MADD t2, b2, ALPHA, a2 + MADD t3, b3, ALPHA, a3 + MADD t4, b4, ALPHA, a4 + + ST t1, 0 * SIZE(Y) + MADD t1, b5, ALPHA, a5 + ST t2, 1 * SIZE(Y) + MADD t2, b6, ALPHA, a6 + ST t3, 2 * SIZE(Y) + MADD t3, b7, ALPHA, a7 + ST t4, 3 * SIZE(Y) + MADD t4, b8, ALPHA, a8 + + ST t1, 4 * SIZE(Y) + ST t2, 5 * SIZE(Y) + ST t3, 6 * SIZE(Y) + ST t4, 7 * SIZE(Y) + + daddiu X, X, 8 * SIZE + daddiu Y, Y, 8 * SIZE + .align 3 + +.L15: + andi I, N, 7 + + blez I, .L999 + NOP + .align 3 + +.L16: + LD a1, 0 * SIZE(X) + LD b1, 0 * SIZE(Y) + + daddiu X, X, SIZE + daddiu Y, Y, SIZE + + MADD t1, b1, ALPHA, a1 + daddiu I, I, -1 + + bgtz I, .L16 + ST t1, -1 * SIZE(Y) + +#ifndef __64BIT__ + ldc1 $f20, 0($sp) + ldc1 $f21, 8($sp) + daddiu $sp, $sp, 16 +#endif + + j $31 + NOP + .align 3 + +.L20: + dsra I, N, 3 + move YY, Y + + blez I, .L25 + daddiu I, I, -1 + + LD a1, 0 * SIZE(X) + daddu X, X, INCX + LD b1, 0 * SIZE(Y) + daddu Y, Y, INCY + LD a2, 0 * SIZE(X) + daddu X, X, INCX + LD b2, 0 * SIZE(Y) + daddu Y, Y, INCY + LD a3, 0 * SIZE(X) + daddu X, X, INCX + LD b3, 0 * SIZE(Y) + daddu Y, Y, INCY + LD a4, 0 * SIZE(X) + daddu X, X, INCX + LD b4, 0 * SIZE(Y) + daddu Y, Y, INCY + LD a5, 0 * SIZE(X) + daddu X, X, INCX + LD b5, 0 * SIZE(Y) + daddu Y, Y, INCY + LD a6, 0 * SIZE(X) + daddu X, X, INCX + LD b6, 0 * SIZE(Y) + daddu Y, Y, INCY + LD a7, 0 * SIZE(X) + daddu X, X, INCX + LD b7, 0 * SIZE(Y) + daddu Y, Y, INCY + 
LD a8, 0 * SIZE(X) + daddu X, X, INCX + LD b8, 0 * SIZE(Y) + daddu Y, Y, INCY + + blez I, .L23 + NOP + .align 3 + +.L22: + MADD t1, b1, ALPHA, a1 + LD a1, 0 * SIZE(X) + LD b1, 0 * SIZE(Y) + daddu X, X, INCX + daddu Y, Y, INCY + + MADD t2, b2, ALPHA, a2 + LD a2, 0 * SIZE(X) + LD b2, 0 * SIZE(Y) + daddu X, X, INCX + daddu Y, Y, INCY + + MADD t3, b3, ALPHA, a3 + LD a3, 0 * SIZE(X) + LD b3, 0 * SIZE(Y) + daddu X, X, INCX + daddu Y, Y, INCY + + MADD t4, b4, ALPHA, a4 + LD a4, 0 * SIZE(X) + LD b4, 0 * SIZE(Y) + daddu X, X, INCX + daddu Y, Y, INCY + + ST t1, 0 * SIZE(YY) + daddu YY, YY, INCY + MADD t1, b5, ALPHA, a5 + + LD a5, 0 * SIZE(X) + LD b5, 0 * SIZE(Y) + daddu X, X, INCX + daddu Y, Y, INCY + + ST t2, 0 * SIZE(YY) + daddu YY, YY, INCY + MADD t2, b6, ALPHA, a6 + + LD a6, 0 * SIZE(X) + LD b6, 0 * SIZE(Y) + daddu X, X, INCX + daddu Y, Y, INCY + + ST t3, 0 * SIZE(YY) + daddu YY, YY, INCY + MADD t3, b7, ALPHA, a7 + + LD a7, 0 * SIZE(X) + LD b7, 0 * SIZE(Y) + daddu X, X, INCX + daddu Y, Y, INCY + + ST t4, 0 * SIZE(YY) + daddu YY, YY, INCY + MADD t4, b8, ALPHA, a8 + + LD a8, 0 * SIZE(X) + daddu X, X, INCX + + LD b8, 0 * SIZE(Y) + daddu Y, Y, INCY + + ST t1, 0 * SIZE(YY) + daddu YY, YY, INCY + ST t2, 0 * SIZE(YY) + daddu YY, YY, INCY + ST t3, 0 * SIZE(YY) + daddu YY, YY, INCY + ST t4, 0 * SIZE(YY) + daddiu I, I, -1 + + bgtz I, .L22 + daddu YY, YY, INCY + .align 3 + +.L23: + MADD t1, b1, ALPHA, a1 + MADD t2, b2, ALPHA, a2 + MADD t3, b3, ALPHA, a3 + MADD t4, b4, ALPHA, a4 + + ST t1, 0 * SIZE(YY) + daddu YY, YY, INCY + MADD t1, b5, ALPHA, a5 + + ST t2, 0 * SIZE(YY) + daddu YY, YY, INCY + MADD t2, b6, ALPHA, a6 + + ST t3, 0 * SIZE(YY) + daddu YY, YY, INCY + MADD t3, b7, ALPHA, a7 + + ST t4, 0 * SIZE(YY) + daddu YY, YY, INCY + MADD t4, b8, ALPHA, a8 + + ST t1, 0 * SIZE(YY) + daddu YY, YY, INCY + ST t2, 0 * SIZE(YY) + daddu YY, YY, INCY + ST t3, 0 * SIZE(YY) + daddu YY, YY, INCY + ST t4, 0 * SIZE(YY) + daddu YY, YY, INCY + .align 3 + +.L25: + andi I, N, 7 + + blez I, .L999 + NOP + .align 3 + +.L26: + LD a1, 0 * SIZE(X) + LD b1, 0 * SIZE(Y) + + MADD t1, b1, ALPHA, a1 + daddu X, X, INCX + + ST t1, 0 * SIZE(Y) + daddiu I, I, -1 + + bgtz I, .L26 + daddu Y, Y, INCY + .align 3 + +.L999: +#ifndef __64BIT__ + ldc1 $f20, 0($sp) + ldc1 $f21, 8($sp) + daddiu $sp, $sp, 16 +#endif + + j $31 + NOP + + EPILOGUE diff --git a/kernel/mips64/cnrm2.S b/kernel/mips64/cnrm2.S new file mode 100644 index 0000000000..dd8c210909 --- /dev/null +++ b/kernel/mips64/cnrm2.S @@ -0,0 +1,214 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 +#define X $5 +#define INCX $6 + +#define I $2 +#define TEMP $3 + +#define a1 $f6 +#define a2 $f7 +#define a3 $f8 +#define a4 $f9 +#define a5 $f10 +#define a6 $f11 +#define a7 $f12 +#define a8 $f13 + +#define s1 $f0 +#define s2 $f1 + +#define t1 $f2 +#define t2 $f3 +#define t3 $f4 +#define t4 $f5 + + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + dmtc1 $0, s1 + li TEMP, 2 * SIZE + + blez N, .L999 + mov.d s2, s1 + + blez INCX, .L999 + dsll INCX, INCX, ZBASE_SHIFT + + dsra I, N, 2 + + blez I, .L25 + NOP + + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + daddu X, X, INCX + + LD a3, 0 * SIZE(X) + LD a4, 1 * SIZE(X) + daddu X, X, INCX + + LD a5, 0 * SIZE(X) + LD a6, 1 * SIZE(X) + + daddu X, X, INCX + cvt.d.s t1, a1 + + LD a7, 0 * SIZE(X) + cvt.d.s t2, a2 + + LD a8, 1 * SIZE(X) + cvt.d.s t3, a3 + + daddiu I, I, -1 + cvt.d.s t4, a4 + + blez I, .L24 + daddu X, X, INCX + .align 3 + +.L23: + madd.d s1, s1, t1, t1 + LD a1, 0 * SIZE(X) + + cvt.d.s t1, a5 + NOP + + madd.d s2, s2, t2, t2 + LD a2, 1 * SIZE(X) + + cvt.d.s t2, a6 + daddu X, X, INCX + + madd.d s1, s1, t3, t3 + LD a3, 0 * SIZE(X) + + cvt.d.s t3, a7 + NOP + + madd.d s2, s2, t4, t4 + LD a4, 1 * SIZE(X) + + cvt.d.s t4, a8 + daddu X, X, INCX + + madd.d s1, s1, t1, t1 + LD a5, 0 * SIZE(X) + + cvt.d.s t1, a1 + daddiu I, I, -1 + + madd.d s2, s2, t2, t2 + LD a6, 1 * SIZE(X) + + cvt.d.s t2, a2 + daddu X, X, INCX + + madd.d s1, s1, t3, t3 + LD a7, 0 * SIZE(X) + + cvt.d.s t3, a3 + LD a8, 1 * SIZE(X) + + madd.d s2, s2, t4, t4 + daddu X, X, INCX + + bgtz I, .L23 + cvt.d.s t4, a4 + .align 3 + +.L24: + madd.d s1, s1, t1, t1 + cvt.d.s t1, a5 + + madd.d s2, s2, t2, t2 + cvt.d.s t2, a6 + + madd.d s1, s1, t3, t3 + cvt.d.s t3, a7 + + madd.d s2, s2, t4, t4 + cvt.d.s t4, a8 + + madd.d s1, s1, t1, t1 + madd.d s2, s2, t2, t2 + madd.d s1, s1, t3, t3 + madd.d s2, s2, t4, t4 + .align 3 + +.L25: + andi I, N, 3 + + blez I, .L999 + NOP + .align 3 + +.L26: + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + daddiu I, I, -1 + + cvt.d.s t1, a1 + cvt.d.s t2, a2 + + madd.d s1, s1, t1, t1 + daddu X, X, INCX + + bgtz I, .L26 + madd.d s2, s2, t2, t2 + .align 3 + +.L999: + add.d s1, s1, s2 + + sqrt.d s1, s1 + + j $31 + cvt.s.d s1, s1 + + EPILOGUE diff --git a/kernel/mips64/copy.S b/kernel/mips64/copy.S new file mode 100644 index 0000000000..7942b1890c --- /dev/null +++ b/kernel/mips64/copy.S @@ -0,0 +1,277 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 +#define X $5 +#define INCX $6 +#define Y $7 +#define INCY $8 + +#define I $2 +#define TEMP $3 + +#define a1 $f0 +#define a2 $f1 +#define a3 $f2 +#define a4 $f3 +#define a5 $f4 +#define a6 $f5 +#define a7 $f6 +#define a8 $f7 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) + LDINT INCY, 0(INCY) +#endif + + li TEMP, SIZE + NOP + + blez N, .L999 + dsll INCX, INCX, BASE_SHIFT + + bne INCX, TEMP, .L20 + dsll INCY, INCY, BASE_SHIFT + + bne INCY, TEMP, .L20 + dsra I, N, 3 + + blez I, .L15 + daddiu I, I, -1 + + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + LD a3, 2 * SIZE(X) + LD a4, 3 * SIZE(X) + LD a5, 4 * SIZE(X) + LD a6, 5 * SIZE(X) + LD a7, 6 * SIZE(X) + LD a8, 7 * SIZE(X) + + blez I, .L13 + NOP + .align 3 + +.L12: + ST a1, 0 * SIZE(Y) + LD a1, 8 * SIZE(X) + + ST a2, 1 * SIZE(Y) + LD a2, 9 * SIZE(X) + + ST a3, 2 * SIZE(Y) + LD a3, 10 * SIZE(X) + + ST a4, 3 * SIZE(Y) + LD a4, 11 * SIZE(X) + + ST a5, 4 * SIZE(Y) + LD a5, 12 * SIZE(X) + + ST a6, 5 * SIZE(Y) + LD a6, 13 * SIZE(X) + + ST a7, 6 * SIZE(Y) + LD a7, 14 * SIZE(X) + + ST a8, 7 * SIZE(Y) + LD a8, 15 * SIZE(X) + + daddiu I, I, -1 + daddiu X, X, 8 * SIZE + + bgtz I, .L12 + daddiu Y, Y, 8 * SIZE + .align 3 + +.L13: + ST a1, 0 * SIZE(Y) + ST a2, 1 * SIZE(Y) + ST a3, 2 * SIZE(Y) + ST a4, 3 * SIZE(Y) + ST a5, 4 * SIZE(Y) + ST a6, 5 * SIZE(Y) + ST a7, 6 * SIZE(Y) + ST a8, 7 * SIZE(Y) + + daddiu X, X, 8 * SIZE + daddiu Y, Y, 8 * SIZE + .align 3 + +.L15: + andi I, N, 7 + + blez I, .L999 + NOP + .align 3 + +.L16: + LD a1, 0 * SIZE(X) + daddiu X, X, SIZE + + daddiu I, I, -1 + daddiu Y, Y, SIZE + + bgtz I, .L16 + ST a1, -1 * SIZE(Y) + + j .L999 + NOP + .align 3 + +.L20: + dsra I, N, 3 + + blez I, .L25 + daddiu I, I, -1 + + LD a1, 
0 * SIZE(X) + daddu X, X, INCX + LD a2, 0 * SIZE(X) + daddu X, X, INCX + LD a3, 0 * SIZE(X) + daddu X, X, INCX + LD a4, 0 * SIZE(X) + daddu X, X, INCX + LD a5, 0 * SIZE(X) + daddu X, X, INCX + LD a6, 0 * SIZE(X) + daddu X, X, INCX + LD a7, 0 * SIZE(X) + daddu X, X, INCX + LD a8, 0 * SIZE(X) + daddu X, X, INCX + + blez I, .L23 + NOP + .align 3 + +.L22: + ST a1, 0 * SIZE(Y) + daddu Y, Y, INCY + LD a1, 0 * SIZE(X) + daddu X, X, INCX + + ST a2, 0 * SIZE(Y) + daddu Y, Y, INCY + LD a2, 0 * SIZE(X) + daddu X, X, INCX + + ST a3, 0 * SIZE(Y) + daddu Y, Y, INCY + LD a3, 0 * SIZE(X) + daddu X, X, INCX + + ST a4, 0 * SIZE(Y) + daddu Y, Y, INCY + LD a4, 0 * SIZE(X) + daddu X, X, INCX + + ST a5, 0 * SIZE(Y) + daddu Y, Y, INCY + LD a5, 0 * SIZE(X) + daddu X, X, INCX + + ST a6, 0 * SIZE(Y) + daddu Y, Y, INCY + LD a6, 0 * SIZE(X) + daddu X, X, INCX + + ST a7, 0 * SIZE(Y) + daddu Y, Y, INCY + LD a7, 0 * SIZE(X) + daddu X, X, INCX + + ST a8, 0 * SIZE(Y) + daddu Y, Y, INCY + LD a8, 0 * SIZE(X) + + daddiu I, I, -1 + + bgtz I, .L22 + daddu X, X, INCX + .align 3 + +.L23: + ST a1, 0 * SIZE(Y) + daddu Y, Y, INCY + ST a2, 0 * SIZE(Y) + daddu Y, Y, INCY + ST a3, 0 * SIZE(Y) + daddu Y, Y, INCY + ST a4, 0 * SIZE(Y) + daddu Y, Y, INCY + ST a5, 0 * SIZE(Y) + daddu Y, Y, INCY + ST a6, 0 * SIZE(Y) + daddu Y, Y, INCY + ST a7, 0 * SIZE(Y) + daddu Y, Y, INCY + ST a8, 0 * SIZE(Y) + daddu Y, Y, INCY + .align 3 + +.L25: + andi I, N, 7 + + blez I, .L999 + NOP + .align 3 + +.L26: + LD a1, 0 * SIZE(X) + daddu X, X, INCX + + daddiu I, I, -1 + ST a1, 0 * SIZE(Y) + + bgtz I, .L26 + daddu Y, Y, INCY + .align 3 + +.L999: + j $31 + NOP + + EPILOGUE diff --git a/kernel/mips64/dnrm2.S b/kernel/mips64/dnrm2.S new file mode 100644 index 0000000000..595eb9620f --- /dev/null +++ b/kernel/mips64/dnrm2.S @@ -0,0 +1,397 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 +#define X $5 +#define INCX $6 +#define XX $7 + +#define I $2 +#define TEMP $3 + +#define a1 $f4 +#define a2 $f5 +#define a3 $f6 +#define a4 $f7 +#define a5 $f8 +#define a6 $f9 +#define a7 $f10 +#define a8 $f11 + +#define t1 $f12 +#define t2 $f13 +#define t3 $f14 +#define t4 $f15 + +#define s1 $f0 +#define s2 $f1 +#define s3 $f2 +#define s4 $f3 + +#define ALPHA $f16 +#define max $f17 + + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + blez N, .L999 + MTC $0, s1 + + blez INCX, .L999 + dsll INCX, INCX, BASE_SHIFT + + move XX, X + NOP + + LD a1, 0 * SIZE(X) + daddiu N, N, -1 + + daddu X, X, INCX + FABS s1, a1 + + blez N, .L999 + FABS s2, a1 + + FABS s3, a1 + dsra I, N, 3 + + blez I, .L15 + FABS s4, a1 + + LD a1, 0 * SIZE(X) + daddu X, X, INCX + LD a2, 0 * SIZE(X) + daddu X, X, INCX + LD a3, 0 * SIZE(X) + daddu X, X, INCX + LD a4, 0 * SIZE(X) + daddu X, X, INCX + LD a5, 0 * SIZE(X) + daddu X, X, INCX + LD a6, 0 * SIZE(X) + daddu X, X, INCX + LD a7, 0 * SIZE(X) + daddu X, X, INCX + LD a8, 0 * SIZE(X) + daddiu I, I, -1 + + blez I, .L13 + daddu X, X, INCX + .align 3 + +.L12: + FABS t1, a1 + LD a1, 0 * SIZE(X) + FABS t2, a2 + daddu X, X, INCX + + FABS t3, a3 + LD a2, 0 * SIZE(X) + FABS t4, a4 + daddu X, X, INCX + + CMPLT $fcc0, s1, t1 + LD a3, 0 * SIZE(X) + CMPLT $fcc1, s2, t2 + daddu X, X, INCX + + CMPLT $fcc2, s3, t3 + LD a4, 0 * SIZE(X) + CMPLT $fcc3, s4, t4 + daddu X, X, INCX + + CMOVT s1, t1, $fcc0 + CMOVT s2, t2, $fcc1 + CMOVT s3, t3, $fcc2 + CMOVT s4, t4, $fcc3 + + FABS t1, a5 + LD a5, 0 * SIZE(X) + FABS t2, a6 + daddu X, X, INCX + + FABS t3, a7 + LD a6, 0 * SIZE(X) + FABS t4, a8 + daddu X, X, INCX + + CMPLT $fcc0, s1, t1 + LD a7, 0 * SIZE(X) + CMPLT $fcc1, s2, t2 + daddu X, X, INCX + + CMPLT $fcc2, s3, t3 + LD a8, 0 * SIZE(X) + CMPLT $fcc3, s4, t4 + daddu X, X, INCX + + CMOVT s1, t1, $fcc0 + daddiu I, I, -1 + + CMOVT s2, t2, $fcc1 + CMOVT s3, t3, $fcc2 + + bgtz I, .L12 + CMOVT s4, t4, $fcc3 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + + CMOVT s1, t1, $fcc0 + CMOVT s2, t2, $fcc1 + CMOVT s3, t3, $fcc2 + CMOVT s4, t4, $fcc3 + + FABS t1, a5 + FABS t2, a6 + FABS t3, a7 + FABS t4, a8 + + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + + CMOVT s1, t1, $fcc0 + CMOVT s2, t2, $fcc1 + CMOVT s3, t3, $fcc2 + CMOVT s4, t4, $fcc3 + .align 3 + +.L15: + andi I, N, 7 + + blez I, .L100 + NOP + .align 3 + +.L16: + LD a1, 0 * SIZE(X) + daddiu I, I, -1 + + FABS t1, a1 + + CMPLT $fcc0, s1, t1 + + CMOVT s1, t1, $fcc0 + + bgtz I, .L16 + daddu X, X, INCX + .align 3 + +.L100: + CMPLT $fcc0, s1, s2 + CMPLT $fcc1, s3, s4 + + CMOVT s1, s2, $fcc0 + CMOVT s3, s4, $fcc1 + + CMPLT $fcc0, s1, s3 + CMOVT s1, s3, $fcc0 + + daddiu N, N, 1 + + lui TEMP, 0x3f80 + dmtc1 $0, a1 + + mtc1 TEMP, ALPHA + CMPEQ $fcc0, s1, a1 + + bc1t $fcc0, .L999 + cvt.d.s ALPHA, ALPHA + + div.d ALPHA, ALPHA, s1 + MOV max, s1 + + MOV s1, a1 + MOV s2, a1 + MOV s3, a1 + MOV s4, a1 + + dsra I, N, 3 + blez I, .L105 + NOP + + LD a1, 0 * SIZE(XX) + daddu XX, XX, INCX + + LD 
a2, 0 * SIZE(XX) + daddu XX, XX, INCX + + LD a3, 0 * SIZE(XX) + daddu XX, XX, INCX + + LD a4, 0 * SIZE(XX) + daddu XX, XX, INCX + + LD a5, 0 * SIZE(XX) + daddu XX, XX, INCX + + LD a6, 0 * SIZE(XX) + daddu XX, XX, INCX + + LD a7, 0 * SIZE(XX) + daddu XX, XX, INCX + + LD a8, 0 * SIZE(XX) + daddiu I, I, -1 + + blez I, .L104 + daddu XX, XX, INCX + .align 3 + +.L103: + MUL t1, ALPHA, a1 + LD a1, 0 * SIZE(XX) + MUL t2, ALPHA, a2 + daddu XX, XX, INCX + + MUL t3, ALPHA, a3 + LD a2, 0 * SIZE(XX) + MUL t4, ALPHA, a4 + daddu XX, XX, INCX + + MADD s1, s1, t1, t1 + LD a3, 0 * SIZE(XX) + MADD s2, s2, t2, t2 + daddu XX, XX, INCX + + MADD s3, s3, t3, t3 + LD a4, 0 * SIZE(XX) + MADD s4, s4, t4, t4 + daddu XX, XX, INCX + + MUL t1, ALPHA, a5 + LD a5, 0 * SIZE(XX) + MUL t2, ALPHA, a6 + daddu XX, XX, INCX + + MUL t3, ALPHA, a7 + LD a6, 0 * SIZE(XX) + MUL t4, ALPHA, a8 + daddu XX, XX, INCX + + MADD s1, s1, t1, t1 + LD a7, 0 * SIZE(XX) + MADD s2, s2, t2, t2 + daddu XX, XX, INCX + + MADD s3, s3, t3, t3 + LD a8, 0 * SIZE(XX) + MADD s4, s4, t4, t4 + daddiu I, I, -1 + + bgtz I, .L103 + daddu XX, XX, INCX + .align 3 + +.L104: + MUL t1, ALPHA, a1 + MUL t2, ALPHA, a2 + MUL t3, ALPHA, a3 + MUL t4, ALPHA, a4 + + MADD s1, s1, t1, t1 + MADD s2, s2, t2, t2 + MADD s3, s3, t3, t3 + MADD s4, s4, t4, t4 + + MUL t1, ALPHA, a5 + MUL t2, ALPHA, a6 + MUL t3, ALPHA, a7 + MUL t4, ALPHA, a8 + + MADD s1, s1, t1, t1 + MADD s2, s2, t2, t2 + MADD s3, s3, t3, t3 + MADD s4, s4, t4, t4 + .align 3 + +.L105: + andi I, N, 7 + + blez I, .L998 + NOP + .align 3 + +.L106: + LD a1, 0 * SIZE(XX) + daddiu I, I, -1 + + MUL t1, ALPHA, a1 + + daddu XX, XX, INCX + + bgtz I, .L106 + MADD s1, s1, t1, t1 + .align 3 + +.L998: + ADD s1, s1, s2 + ADD s3, s3, s4 + + ADD s1, s1, s3 + + sqrt.d s1, s1 + + j $31 + MUL s1, max, s1 + .align 3 + +.L999: + j $31 + NOP + + EPILOGUE diff --git a/kernel/mips64/dot.S b/kernel/mips64/dot.S new file mode 100644 index 0000000000..b1f599172d --- /dev/null +++ b/kernel/mips64/dot.S @@ -0,0 +1,306 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 +#define X $5 +#define INCX $6 +#define Y $7 +#define INCY $8 + +#define I $2 +#define TEMP $3 + +#define a1 $f2 +#define a2 $f3 +#define a3 $f4 +#define a4 $f5 +#define b1 $f6 +#define b2 $f7 +#define b3 $f8 +#define b4 $f9 + +#define s1 $f0 +#define s2 $f1 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) + LDINT INCY, 0(INCY) +#endif + + MTC $0, s1 + MTC $0, s2 + + dsll INCX, INCX, BASE_SHIFT + li TEMP, SIZE + + blez N, .L999 + dsll INCY, INCY, BASE_SHIFT + + bne INCX, TEMP, .L20 + dsra I, N, 3 + + bne INCY, TEMP, .L20 + NOP + + blez I, .L15 + NOP + + LD a1, 0 * SIZE(X) + LD b1, 0 * SIZE(Y) + + LD a2, 1 * SIZE(X) + LD b2, 1 * SIZE(Y) + + LD a3, 2 * SIZE(X) + LD b3, 2 * SIZE(Y) + + LD a4, 3 * SIZE(X) + daddiu I, I, -1 + + blez I, .L13 + LD b4, 3 * SIZE(Y) + .align 3 + +.L12: + MADD s1, s1, a1, b1 + LD a1, 4 * SIZE(X) + LD b1, 4 * SIZE(Y) + + MADD s2, s2, a2, b2 + LD a2, 5 * SIZE(X) + LD b2, 5 * SIZE(Y) + + MADD s1, s1, a3, b3 + LD a3, 6 * SIZE(X) + LD b3, 6 * SIZE(Y) + + MADD s2, s2, a4, b4 + LD a4, 7 * SIZE(X) + LD b4, 7 * SIZE(Y) + + MADD s1, s1, a1, b1 + LD a1, 8 * SIZE(X) + LD b1, 8 * SIZE(Y) + + MADD s2, s2, a2, b2 + LD a2, 9 * SIZE(X) + LD b2, 9 * SIZE(Y) + + MADD s1, s1, a3, b3 + LD a3, 10 * SIZE(X) + LD b3, 10 * SIZE(Y) + + MADD s2, s2, a4, b4 + LD a4, 11 * SIZE(X) + LD b4, 11 * SIZE(Y) + + daddiu I, I, -1 + daddiu X, X, 8 * SIZE + + bgtz I, .L12 + daddiu Y, Y, 8 * SIZE + .align 3 + +.L13: + MADD s1, s1, a1, b1 + LD a1, 4 * SIZE(X) + LD b1, 4 * SIZE(Y) + + MADD s2, s2, a2, b2 + LD a2, 5 * SIZE(X) + LD b2, 5 * SIZE(Y) + + MADD s1, s1, a3, b3 + LD a3, 6 * SIZE(X) + LD b3, 6 * SIZE(Y) + + MADD s2, s2, a4, b4 + LD a4, 7 * SIZE(X) + LD b4, 7 * SIZE(Y) + + MADD s1, s1, a1, b1 + daddiu X, X, 8 * SIZE + MADD s2, s2, a2, b2 + daddiu Y, Y, 8 * SIZE + + MADD s1, s1, a3, b3 + MADD s2, s2, a4, b4 + .align 3 + +.L15: + andi I, N, 7 + + blez I, .L999 + NOP + .align 3 + +.L16: + LD a1, 0 * SIZE(X) + LD b1, 0 * SIZE(Y) + + MADD s1, s1, a1, b1 + + daddiu I, I, -1 + + daddiu X, X, SIZE + daddiu Y, Y, SIZE + + bgtz I, .L16 + NOP + j .L999 + NOP + .align 3 + +.L20: +#ifdef F_INTERFACE + bgez INCX, .L21 + daddiu TEMP, N, -1 + + mult TEMP, INCX + + mflo TEMP + dsub X, X, TEMP + .align 3 + +.L21: + bgez INCY, .L22 + daddiu TEMP, N, -1 + + mult TEMP, INCY + + mflo TEMP + dsub Y, Y, TEMP + .align 3 + +.L22: +#endif + blez I, .L25 + NOP + .align 3 + +.L23: + LD a1, 0 * SIZE(X) + dadd X, X, INCX + LD b1, 0 * SIZE(Y) + dadd Y, Y, INCY + + MADD s1, s1, a1, b1 + + LD a1, 0 * SIZE(X) + dadd X, X, INCX + LD b1, 0 * 
SIZE(Y) + dadd Y, Y, INCY + + MADD s2, s2, a1, b1 + + LD a1, 0 * SIZE(X) + dadd X, X, INCX + LD b1, 0 * SIZE(Y) + dadd Y, Y, INCY + + MADD s1, s1, a1, b1 + + LD a1, 0 * SIZE(X) + dadd X, X, INCX + LD b1, 0 * SIZE(Y) + dadd Y, Y, INCY + + MADD s2, s2, a1, b1 + + LD a1, 0 * SIZE(X) + dadd X, X, INCX + LD b1, 0 * SIZE(Y) + dadd Y, Y, INCY + + MADD s1, s1, a1, b1 + + LD a1, 0 * SIZE(X) + dadd X, X, INCX + LD b1, 0 * SIZE(Y) + dadd Y, Y, INCY + + MADD s2, s2, a1, b1 + + LD a1, 0 * SIZE(X) + dadd X, X, INCX + LD b1, 0 * SIZE(Y) + dadd Y, Y, INCY + + MADD s1, s1, a1, b1 + + LD a1, 0 * SIZE(X) + dadd X, X, INCX + LD b1, 0 * SIZE(Y) + dadd Y, Y, INCY + + daddiu I, I, -1 + + bgtz I, .L23 + MADD s2, s2, a1, b1 + .align 3 + +.L25: + andi I, N, 7 + + blez I, .L999 + NOP + .align 3 + +.L26: + LD a1, 0 * SIZE(X) + dadd X, X, INCX + LD b1, 0 * SIZE(Y) + dadd Y, Y, INCY + + daddiu I, I, -1 + + bgtz I, .L26 + MADD s1, s1, a1, b1 + .align 3 + +.L999: + j $31 + ADD s1, s1, s2 + + EPILOGUE diff --git a/kernel/mips64/gemm_beta.S b/kernel/mips64/gemm_beta.S new file mode 100644 index 0000000000..2e0b24171f --- /dev/null +++ b/kernel/mips64/gemm_beta.S @@ -0,0 +1,205 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M $4 +#define N $5 +#define C $6 +#define LDC $7 + +#define I $2 +#define J $3 + +#define CO1 $8 + +#define a1 $f0 +#define a2 $f1 +#define a3 $f2 +#define a4 $f3 +#define b1 $f4 +#define b2 $f5 +#define b3 $f6 +#define b4 $f7 + +#define FZERO $f8 +#define ALPHA $f15 + + PROLOGUE + + LDARG C, 0($sp) + MTC $0, FZERO + LDARG LDC, 8($sp) + + dsll LDC, LDC, BASE_SHIFT + + move J, N + blez J, .L999 + nop + .align 3 + +.L10: + move CO1, C + dsra I, M, 3 + + blez I, .L15 + daddu C, C, LDC + + LD a1, 0 * SIZE(CO1) + LD a2, 1 * SIZE(CO1) + LD a3, 2 * SIZE(CO1) + LD a4, 3 * SIZE(CO1) + + MUL b1, ALPHA, a1 + LD a1, 4 * SIZE(CO1) + + MUL b2, ALPHA, a2 + daddiu I, I, -1 + + blez I, .L13 + LD a2, 5 * SIZE(CO1) + .align 3 + +.L12: + MUL b3, ALPHA, a3 + LD a3, 6 * SIZE(CO1) + + ST b1, 0 * SIZE(CO1) + + MUL b4, ALPHA, a4 + LD a4, 7 * SIZE(CO1) + + ST b2, 1 * SIZE(CO1) + + MUL b1, ALPHA, a1 + LD a1, 8 * SIZE(CO1) + + ST b3, 2 * SIZE(CO1) + + MUL b2, ALPHA, a2 + LD a2, 9 * SIZE(CO1) + + ST b4, 3 * SIZE(CO1) + + MUL b3, ALPHA, a3 + LD a3, 10 * SIZE(CO1) + + ST b1, 4 * SIZE(CO1) + + MUL b4, ALPHA, a4 + LD a4, 11 * SIZE(CO1) + + ST b2, 5 * SIZE(CO1) + + MUL b1, ALPHA, a1 + LD a1, 12 * SIZE(CO1) + + ST b3, 6 * SIZE(CO1) + + MUL b2, ALPHA, a2 + LD a2, 13 * SIZE(CO1) + + ST b4, 7 * SIZE(CO1) + daddiu I, I, -1 + + bgtz I, .L12 + daddiu CO1, CO1, 8 * SIZE + .align 3 + +.L13: + MUL b3, ALPHA, a3 + LD a3, 6 * SIZE(CO1) + + ST b1, 0 * SIZE(CO1) + + MUL b4, ALPHA, a4 + LD a4, 7 * SIZE(CO1) + + ST b2, 1 * SIZE(CO1) + + MUL b1, ALPHA, a1 + + ST b3, 2 * SIZE(CO1) + + MUL b2, ALPHA, a2 + + ST b4, 3 * SIZE(CO1) + + MUL b3, ALPHA, a3 + + ST b1, 4 * SIZE(CO1) + + MUL b4, ALPHA, a4 + + ST b2, 5 * SIZE(CO1) + ST b3, 6 * SIZE(CO1) + ST b4, 7 * SIZE(CO1) + + daddiu CO1, CO1, 8 * SIZE + .align 3 + +.L15: + andi I, M, 7 + daddiu J, J, -1 + + blez I, .L18 + NOP + .align 3 + +.L16: + LD a1, 0 * SIZE(CO1) + daddiu I, I, -1 + + MUL b1, ALPHA, a1 + daddiu CO1, CO1, 1 * SIZE + + bgtz I, .L16 + ST b1, -1 * SIZE(CO1) + .align 3 + +.L18: + bgtz J, .L10 + NOP + .align 3 + +.L999: + j $31 + NOP + + EPILOGUE diff --git a/kernel/mips64/gemm_kernel.S b/kernel/mips64/gemm_kernel.S new file mode 100644 index 0000000000..8ee32d5291 --- /dev/null +++ b/kernel/mips64/gemm_kernel.S @@ -0,0 +1,2250 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M $4 +#define N $5 +#define K $6 +#define A $8 +#define B $9 +#define C $10 +#define LDC $11 + +#define AO $12 +#define BO $13 + +#define I $2 +#define J $3 +#define L $7 + +#define PREFETCHSIZE (4 * 10) + +#define CO1 $14 +#define CO2 $15 +#define CO3 $16 +#define CO4 $17 +#define CO5 $18 +#define CO6 $19 +#define CO7 $20 +#define CO8 $21 + +#define BB $22 + +#if defined(TRMMKERNEL) +#define OFFSET $23 +#define KK $24 +#define TEMP $25 +#endif + +#define a1 $f0 +#define a2 $f1 +#define a3 $f27 +#define a4 $f28 + +#define b1 $f2 +#define b2 $f3 +#define b3 $f4 +#define b4 $f5 +#define b5 $f6 +#define b6 $f7 +#define b7 $f8 +#define b8 $f9 + +#define a5 b8 + +#define c11 $f10 +#define c12 $f11 +#define c21 $f12 +#define c22 $f13 +#define c31 $f14 +#define c32 $f16 +#define c41 $f17 +#define c42 $f18 +#define c51 $f19 +#define c52 $f20 +#define c61 $f21 +#define c62 $f22 +#define c71 $f23 +#define c72 $f24 +#define c81 $f25 +#define c82 $f26 + +#define ALPHA $f15 + + PROLOGUE + + daddiu $sp, $sp, -160 + + SDARG $16, 0($sp) + SDARG $17, 8($sp) + SDARG $18, 16($sp) + SDARG $19, 24($sp) + SDARG $20, 32($sp) + SDARG $21, 40($sp) + SDARG $22, 48($sp) + + sdc1 $f24, 56($sp) + sdc1 $f25, 64($sp) + sdc1 $f26, 72($sp) + sdc1 $f27, 80($sp) + sdc1 $f28, 88($sp) + +#if defined(TRMMKERNEL) + SDARG $23, 96($sp) + SDARG $24, 104($sp) + SDARG $25, 112($sp) + + LDARG OFFSET, 160($sp) +#endif + +#ifndef __64BIT__ + sdc1 $f20,120($sp) + sdc1 $f21,128($sp) + sdc1 $f22,136($sp) + sdc1 $f23,144($sp) +#endif + + dsll LDC, LDC, BASE_SHIFT + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif + + dsra J, N, 3 + blez J, .L30 + nop + +.L10: + move CO1, C + MTC $0, c11 + daddu CO2, C, LDC + move AO, A + daddu CO3, CO2, LDC + daddiu J, J, -1 + daddu CO4, CO3, LDC + MOV c21, c11 + daddu CO5, CO4, LDC + MOV c31, c11 + daddu CO6, CO5, LDC + MOV c41, c11 + daddu CO7, CO6, LDC + MOV c51, c11 + daddu CO8, CO7, LDC + dsra I, M, 1 + daddu C, CO8, LDC + + dsll BB, K, 2 + BASE_SHIFT + daddu BB, B, BB + +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + + blez I, .L20 + MOV c61, c11 + +.L11: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, 1 + BASE_SHIFT + dsll TEMP, KK, 3 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + + LD a1, 0 * SIZE(AO) + MOV c71, c11 + LD b1, 0 * SIZE(BO) + MOV c81, c11 + + LD a3, 4 * SIZE(AO) + MOV c12, c11 + LD b2, 1 * SIZE(BO) + MOV c22, c11 + + MOV c32, c11 + LD b3, 2 * SIZE(BO) + MOV c42, c11 + + LD b4, 
3 * SIZE(BO) + MOV c52, c11 + LD b5, 4 * SIZE(BO) + MOV c62, c11 + + LD b6, 8 * SIZE(BO) + MOV c72, c11 + LD b7, 12 * SIZE(BO) + MOV c82, c11 + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 +#else + daddiu TEMP, KK, 8 +#endif + dsra L, TEMP, 2 + + blez L, .L15 + NOP +#else + LD a1, 0 * SIZE(AO) + MOV c71, c11 + LD b1, 0 * SIZE(B) + MOV c81, c11 + + pref 1, 3 * SIZE(CO1) + pref 1, 3 * SIZE(CO2) + + LD a3, 4 * SIZE(AO) + MOV c12, c11 + LD b2, 1 * SIZE(B) + MOV c22, c11 + + dsra L, K, 2 + MOV c32, c11 + LD b3, 2 * SIZE(B) + MOV c42, c11 + + LD b4, 3 * SIZE(B) + MOV c52, c11 + LD b5, 4 * SIZE(B) + MOV c62, c11 + + LD b6, 8 * SIZE(B) + MOV c72, c11 + LD b7, 12 * SIZE(B) + MOV c82, c11 + + blez L, .L15 + move BO, B +#endif + + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + daddiu L, L, -1 + MADD c31, c31, a1, b3 + blez L, .L13 + MADD c41, c41, a1, b4 + pref 1, 2 * SIZE(CO3) + .align 3 + +.L12: + MADD c12, c12, a2, b1 + LD b1, 16 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD c51, c51, a1, b5 + LD a4, 2 * SIZE(AO) + MADD c61, c61, a1, b2 + NOP + MADD c71, c71, a1, b3 + NOP + MADD c81, c81, a1, b4 + LD a1, 8 * SIZE(AO) + + MADD c52, c52, a2, b5 + LD b5, 20 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 9 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 10 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 11 * SIZE(BO) + + MADD c11, c11, a4, b6 + LD a2, 3 * SIZE(AO) + MADD c21, c21, a4, b2 + NOP + MADD c31, c31, a4, b3 + NOP + MADD c41, c41, a4, b4 + NOP + + MADD c12, c12, a2, b6 + LD b6, 24 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 13 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 14 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD c51, c51, a4, b7 + NOP + MADD c61, c61, a4, b2 + NOP + MADD c71, c71, a4, b3 + NOP + MADD c81, c81, a4, b4 + NOP + + MADD c52, c52, a2, b7 + LD b7, 28 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 17 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 18 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 19 * SIZE(BO) + + MADD c11, c11, a3, b1 + LD a2, 5 * SIZE(AO) + MADD c21, c21, a3, b2 + NOP + MADD c31, c31, a3, b3 + NOP + MADD c41, c41, a3, b4 + NOP + + MADD c12, c12, a2, b1 + LD b1, 32 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 21 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 22 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 23 * SIZE(BO) + + MADD c51, c51, a3, b5 + LD a4, 6 * SIZE(AO) + MADD c61, c61, a3, b2 + NOP + MADD c71, c71, a3, b3 + NOP + MADD c81, c81, a3, b4 + LD a3, 12 * SIZE(AO) + + MADD c52, c52, a2, b5 + LD b5, 36 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 25 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 26 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 27 * SIZE(BO) + + MADD c11, c11, a4, b6 + LD a2, 7 * SIZE(AO) + MADD c21, c21, a4, b2 + NOP + MADD c31, c31, a4, b3 + NOP + MADD c41, c41, a4, b4 + daddiu L, L, -1 + + MADD c12, c12, a2, b6 + LD b6, 40 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 29 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 30 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 31 * SIZE(BO) + + MADD c51, c51, a4, b7 + daddiu BO, BO, 32 * SIZE + MADD c61, c61, a4, b2 + daddiu AO, AO, 8 * SIZE + MADD c71, c71, a4, b3 + NOP + MADD c81, c81, a4, b4 + NOP + + MADD c52, c52, a2, b7 + LD b7, 12 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 1 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 2 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 3 * SIZE(BO) + + 
MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + NOP + MADD c31, c31, a1, b3 + bgtz L, .L12 + MADD c41, c41, a1, b4 + NOP + .align 3 + +.L13: + MADD c12, c12, a2, b1 + LD b1, 16 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD c51, c51, a1, b5 + NOP + MADD c61, c61, a1, b2 + LD a4, 2 * SIZE(AO) + MADD c71, c71, a1, b3 + NOP + MADD c81, c81, a1, b4 + LD a1, 8 * SIZE(AO) + + MADD c52, c52, a2, b5 + LD b5, 20 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 9 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 10 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 11 * SIZE(BO) + + MADD c11, c11, a4, b6 + LD a2, 3 * SIZE(AO) + MADD c21, c21, a4, b2 + NOP + MADD c31, c31, a4, b3 + pref 1, 3 * SIZE(CO4) + MADD c41, c41, a4, b4 + NOP + + MADD c12, c12, a2, b6 + LD b6, 24 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 13 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 14 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD c51, c51, a4, b7 + pref 1, 3 * SIZE(CO5) + MADD c61, c61, a4, b2 + NOP + MADD c71, c71, a4, b3 + pref 1, 3 * SIZE(CO6) + MADD c81, c81, a4, b4 + NOP + + MADD c52, c52, a2, b7 + LD b7, 28 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 17 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 18 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 19 * SIZE(BO) + + MADD c11, c11, a3, b1 + LD a2, 5 * SIZE(AO) + MADD c21, c21, a3, b2 + NOP + MADD c31, c31, a3, b3 + pref 1, 3 * SIZE(CO7) + MADD c41, c41, a3, b4 + NOP + + MADD c12, c12, a2, b1 + LD b1, 32 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 21 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 22 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 23 * SIZE(BO) + + MADD c51, c51, a3, b5 + NOP + MADD c61, c61, a3, b2 + LD a4, 6 * SIZE(AO) + MADD c71, c71, a3, b3 + NOP + MADD c81, c81, a3, b4 + NOP + + MADD c52, c52, a2, b5 + LD b5, 36 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 25 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 26 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 27 * SIZE(BO) + + MADD c11, c11, a4, b6 + LD a2, 7 * SIZE(AO) + MADD c21, c21, a4, b2 + NOP + MADD c31, c31, a4, b3 + NOP + MADD c41, c41, a4, b4 + NOP + + MADD c12, c12, a2, b6 + LD b6, 40 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 29 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 30 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 31 * SIZE(BO) + + MADD c51, c51, a4, b7 + daddiu BO, BO, 32 * SIZE + MADD c61, c61, a4, b2 + daddiu AO, AO, 8 * SIZE + MADD c71, c71, a4, b3 + NOP + MADD c81, c81, a4, b4 + NOP + + MADD c52, c52, a2, b7 + LD b7, 12 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 1 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 2 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 3 * SIZE(BO) + .align 3 + +.L15: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L18 + pref 1, 3 * SIZE(CO8) + .align 3 + +.L16: + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + NOP + MADD c31, c31, a1, b3 + NOP + MADD c41, c41, a1, b4 + NOP + + MADD c12, c12, a2, b1 + LD b1, 8 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD c51, c51, a1, b5 + daddiu L, L, -1 + MADD c61, c61, a1, b2 + daddiu AO, AO, 2 * SIZE + MADD c71, c71, a1, b3 + daddiu BO, BO, 8 * SIZE + MADD c81, c81, a1, b4 + LD a1, 0 * SIZE(AO) + + MADD c52, c52, a2, b5 + LD b5, 4 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 1 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 2 * SIZE(BO) + MADD c82, 
c82, a2, b4 + bgtz L, .L16 + LD b4, 3 * SIZE(BO) + +.L18: +#ifndef TRMMKERNEL + LD $f0, 0 * SIZE(CO1) + daddiu CO3,CO3, 2 * SIZE + LD $f1, 1 * SIZE(CO1) + daddiu CO1,CO1, 2 * SIZE + LD $f2, 0 * SIZE(CO2) + daddiu CO4,CO4, 2 * SIZE + LD $f3, 1 * SIZE(CO2) + daddiu CO2,CO2, 2 * SIZE + + LD $f4, -2 * SIZE(CO3) + daddiu CO5,CO5, 2 * SIZE + LD $f5, -1 * SIZE(CO3) + daddiu CO6,CO6, 2 * SIZE + LD $f6, -2 * SIZE(CO4) + daddiu CO7,CO7, 2 * SIZE + LD $f7, -1 * SIZE(CO4) + daddiu I, I, -1 + + MADD c11, $f0, ALPHA, c11 + LD $f0,-2 * SIZE(CO5) + MADD c12, $f1, ALPHA, c12 + LD $f1,-1 * SIZE(CO5) + MADD c21, $f2, ALPHA, c21 + LD $f2,-2 * SIZE(CO6) + MADD c22, $f3, ALPHA, c22 + LD $f3,-1 * SIZE(CO6) + + MADD c31, $f4, ALPHA, c31 + LD $f4,-2 * SIZE(CO7) + MADD c32, $f5, ALPHA, c32 + LD $f5,-1 * SIZE(CO7) + MADD c41, $f6, ALPHA, c41 + LD $f6, 0 * SIZE(CO8) + MADD c42, $f7, ALPHA, c42 + LD $f7, 1 * SIZE(CO8) + + pref 0, 0 * SIZE(BB) + pref 0, 8 * SIZE(BB) + + ST c11, -2 * SIZE(CO1) + MTC $0, c11 + ST c12, -1 * SIZE(CO1) + daddiu CO8,CO8, 2 * SIZE + ST c21, -2 * SIZE(CO2) + MOV c21, c11 + ST c22, -1 * SIZE(CO2) + daddiu BB, BB, 16 * SIZE + + MADD c51, $f0, ALPHA, c51 + ST c31, -2 * SIZE(CO3) + MADD c52, $f1, ALPHA, c52 + ST c32, -1 * SIZE(CO3) + MADD c61, $f2, ALPHA, c61 + ST c41, -2 * SIZE(CO4) + MADD c62, $f3, ALPHA, c62 + ST c42, -1 * SIZE(CO4) + + MADD c71, $f4, ALPHA, c71 + ST c51, -2 * SIZE(CO5) + MADD c72, $f5, ALPHA, c72 + ST c52, -1 * SIZE(CO5) + MADD c81, $f6, ALPHA, c81 + ST c61, -2 * SIZE(CO6) + MADD c82, $f7, ALPHA, c82 + ST c62, -1 * SIZE(CO6) + + ST c71, -2 * SIZE(CO7) + MOV c31, c11 + ST c72, -1 * SIZE(CO7) + MOV c41, c11 + + ST c81, -2 * SIZE(CO8) + MOV c51, c11 + ST c82, -1 * SIZE(CO8) + bgtz I, .L11 + MOV c61, c11 +#else + daddiu CO4,CO4, 2 * SIZE + daddiu CO5,CO5, 2 * SIZE + daddiu CO6,CO6, 2 * SIZE + daddiu CO7,CO7, 2 * SIZE + + pref 0, 0 * SIZE(BB) + pref 0, 8 * SIZE(BB) + + MUL c11, ALPHA, c11 + daddiu CO1,CO1, 2 * SIZE + MUL c12, ALPHA, c12 + MTC $0, a1 + MUL c21, ALPHA, c21 + daddiu CO2,CO2, 2 * SIZE + MUL c22, ALPHA, c22 + daddiu CO3,CO3, 2 * SIZE + + ST c11, -2 * SIZE(CO1) + MUL c31, ALPHA, c31 + ST c12, -1 * SIZE(CO1) + MUL c32, ALPHA, c32 + ST c21, -2 * SIZE(CO2) + MUL c41, ALPHA, c41 + ST c22, -1 * SIZE(CO2) + MUL c42, ALPHA, c42 + + ST c31, -2 * SIZE(CO3) + MUL c51, ALPHA, c51 + ST c32, -1 * SIZE(CO3) + MUL c52, ALPHA, c52 + ST c41, -2 * SIZE(CO4) + MUL c61, ALPHA, c61 + ST c42, -1 * SIZE(CO4) + MUL c62, ALPHA, c62 + + ST c51, -2 * SIZE(CO5) + MUL c71, ALPHA, c71 + ST c52, -1 * SIZE(CO5) + MUL c72, ALPHA, c72 + ST c61, -2 * SIZE(CO6) + MUL c81, ALPHA, c81 + ST c62, -1 * SIZE(CO6) + MUL c82, ALPHA, c82 + + ST c71, -2 * SIZE(CO7) + MOV c11, a1 + ST c72, -1 * SIZE(CO7) + MOV c21, a1 + + daddiu CO8,CO8, 2 * SIZE + daddiu BB, BB, 16 * SIZE + + ST c81, -2 * SIZE(CO8) + MOV c31, a1 + ST c82, -1 * SIZE(CO8) + MOV c41, a1 + + daddiu I, I, -1 + MOV c51, a1 + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -8 +#endif + + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 3 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif + + bgtz I, .L11 + MOV c61, a1 +#endif + .align 3 + +.L20: + andi I, M, 1 + MOV c61, c11 + blez I, .L29 + MOV c71, c11 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, 0 + BASE_SHIFT + dsll TEMP, KK, 3 + 
BASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 8 +#endif + dsra L, TEMP, 2 + + blez L, .L25 + MOV c81, c11 +#else + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(B) + LD b2, 1 * SIZE(B) + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + LD b5, 4 * SIZE(B) + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + dsra L, K, 2 + MOV c81, c11 + + blez L, .L25 + move BO, B +#endif + .align 3 + +.L22: + MADD c11, c11, a1, b1 + LD b1, 16 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 5 * SIZE(BO) + MADD c31, c31, a1, b3 + LD b3, 6 * SIZE(BO) + MADD c41, c41, a1, b4 + LD b4, 7 * SIZE(BO) + + MADD c51, c51, a1, b5 + LD b5, 20 * SIZE(BO) + MADD c61, c61, a1, b2 + LD b2, 9 * SIZE(BO) + MADD c71, c71, a1, b3 + LD b3, 10 * SIZE(BO) + MADD c81, c81, a1, b4 + LD b4, 11 * SIZE(BO) + + LD a1, 4 * SIZE(AO) + daddiu L, L, -1 + + MADD c11, c11, a2, b6 + LD b6, 24 * SIZE(BO) + MADD c21, c21, a2, b2 + LD b2, 13 * SIZE(BO) + MADD c31, c31, a2, b3 + LD b3, 14 * SIZE(BO) + MADD c41, c41, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD c51, c51, a2, b7 + LD b7, 28 * SIZE(BO) + MADD c61, c61, a2, b2 + LD b2, 17 * SIZE(BO) + MADD c71, c71, a2, b3 + LD b3, 18 * SIZE(BO) + MADD c81, c81, a2, b4 + LD b4, 19 * SIZE(BO) + + LD a2, 5 * SIZE(AO) + daddiu AO, AO, 4 * SIZE + + MADD c11, c11, a3, b1 + LD b1, 32 * SIZE(BO) + MADD c21, c21, a3, b2 + LD b2, 21 * SIZE(BO) + MADD c31, c31, a3, b3 + LD b3, 22 * SIZE(BO) + MADD c41, c41, a3, b4 + LD b4, 23 * SIZE(BO) + + MADD c51, c51, a3, b5 + LD b5, 36 * SIZE(BO) + MADD c61, c61, a3, b2 + LD b2, 25 * SIZE(BO) + MADD c71, c71, a3, b3 + LD b3, 26 * SIZE(BO) + MADD c81, c81, a3, b4 + LD b4, 27 * SIZE(BO) + + LD a3, 2 * SIZE(AO) + daddiu BO, BO, 32 * SIZE + + MADD c11, c11, a4, b6 + LD b6, 8 * SIZE(BO) + MADD c21, c21, a4, b2 + LD b2, -3 * SIZE(BO) + MADD c31, c31, a4, b3 + LD b3, -2 * SIZE(BO) + MADD c41, c41, a4, b4 + LD b4, -1 * SIZE(BO) + + MADD c51, c51, a4, b7 + LD b7, 12 * SIZE(BO) + MADD c61, c61, a4, b2 + LD b2, 1 * SIZE(BO) + MADD c71, c71, a4, b3 + LD b3, 2 * SIZE(BO) + MADD c81, c81, a4, b4 + LD b4, 3 * SIZE(BO) + bgtz L, .L22 + LD a4, 3 * SIZE(AO) + .align 3 + +.L25: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L28 + NOP + .align 3 + +.L26: + MADD c11, c11, a1, b1 + LD b1, 8 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 5 * SIZE(BO) + MADD c31, c31, a1, b3 + LD b3, 6 * SIZE(BO) + MADD c41, c41, a1, b4 + LD b4, 7 * SIZE(BO) + + daddiu L, L, -1 + MOV a2, a2 + daddiu AO, AO, 1 * SIZE + daddiu BO, BO, 8 * SIZE + + MADD c51, c51, a1, b5 + LD b5, 4 * SIZE(BO) + MADD c61, c61, a1, b2 + LD b2, 1 * SIZE(BO) + MADD c71, c71, a1, b3 + LD b3, 2 * SIZE(BO) + MADD c81, c81, a1, b4 + LD a1, 0 * SIZE(AO) + + bgtz L, .L26 + LD b4, 3 * SIZE(BO) + +.L28: +#ifndef TRMMKERNEL + LD $f0, 0 * SIZE(CO1) + LD $f1, 0 * SIZE(CO2) + LD $f2, 0 * SIZE(CO3) + LD $f3, 0 * SIZE(CO4) + MADD c11, $f0, ALPHA, c11 + LD $f4, 0 * SIZE(CO5) + MADD c21, $f1, ALPHA, c21 + LD $f5, 0 * SIZE(CO6) + MADD c31, $f2, ALPHA, c31 + LD $f6, 0 * SIZE(CO7) + MADD c41, $f3, ALPHA, c41 + LD $f7, 0 * SIZE(CO8) + MADD c51, $f4, ALPHA, c51 + 
ST c11, 0 * SIZE(CO1) + MADD c61, $f5, ALPHA, c61 + ST c21, 0 * SIZE(CO2) + MADD c71, $f6, ALPHA, c71 + ST c31, 0 * SIZE(CO3) + MADD c81, $f7, ALPHA, c81 + ST c41, 0 * SIZE(CO4) + ST c51, 0 * SIZE(CO5) + ST c61, 0 * SIZE(CO6) + ST c71, 0 * SIZE(CO7) + ST c81, 0 * SIZE(CO8) +#else + MUL c11, ALPHA, c11 + MUL c21, ALPHA, c21 + MUL c31, ALPHA, c31 + MUL c41, ALPHA, c41 + + ST c11, 0 * SIZE(CO1) + MUL c51, ALPHA, c51 + ST c21, 0 * SIZE(CO2) + MUL c61, ALPHA, c61 + ST c31, 0 * SIZE(CO3) + MUL c71, ALPHA, c71 + ST c41, 0 * SIZE(CO4) + MUL c81, ALPHA, c81 + + ST c51, 0 * SIZE(CO5) + ST c61, 0 * SIZE(CO6) + ST c71, 0 * SIZE(CO7) + ST c81, 0 * SIZE(CO8) + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -8 +#endif + + dsll L, TEMP, 0 + BASE_SHIFT + dsll TEMP, TEMP, 3 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 1 +#endif +#endif + .align 3 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + daddiu KK, KK, 8 +#endif + + bgtz J, .L10 + move B, BO + .align 3 + +.L30: + andi J, N, 4 + blez J, .L50 + move AO, A + + move CO1, C + MTC $0, c11 + daddu CO2, C, LDC + daddu CO3, CO2, LDC + daddu CO4, CO3, LDC + MOV c21, c11 + daddu C, CO4, LDC + MOV c31, c11 + +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + + dsra I, M, 1 + blez I, .L40 + MOV c41, c11 + +.L31: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, 1 + BASE_SHIFT + dsll TEMP, KK, 2 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + + LD a1, 0 * SIZE(AO) + LD a3, 4 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + MOV c12, c11 + LD b2, 1 * SIZE(BO) + MOV c22, c11 + LD b3, 2 * SIZE(BO) + MOV c32, c11 + LD b4, 3 * SIZE(BO) + MOV c42, c11 + + LD b5, 4 * SIZE(BO) + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 +#else + daddiu TEMP, KK, 4 +#endif + dsra L, TEMP, 2 + blez L, .L35 + NOP +#else + LD a1, 0 * SIZE(AO) + LD a3, 4 * SIZE(AO) + + LD b1, 0 * SIZE(B) + MOV c12, c11 + LD b2, 1 * SIZE(B) + MOV c22, c11 + LD b3, 2 * SIZE(B) + MOV c32, c11 + LD b4, 3 * SIZE(B) + MOV c42, c11 + + LD b5, 4 * SIZE(B) + dsra L, K, 2 + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + blez L, .L35 + move BO, B +#endif + .align 3 + +.L32: + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + daddiu L, L, -1 + MADD c31, c31, a1, b3 + NOP + MADD c41, c41, a1, b4 + LD a1, 2 * SIZE(AO) + + MADD c12, c12, a2, b1 + LD b1, 16 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD c11, c11, a1, b5 + LD a2, 3 * SIZE(AO) + MADD c21, c21, a1, b2 + NOP + MADD c31, c31, a1, b3 + NOP + MADD c41, c41, a1, b4 + LD a1, 8 * SIZE(AO) + + MADD c12, c12, a2, b5 + LD b5, 20 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 9 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 10 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 11 * SIZE(BO) + + MADD c11, c11, a3, b6 + LD a2, 5 * SIZE(AO) + MADD c21, c21, a3, b2 + NOP + MADD c31, c31, a3, b3 + NOP + MADD c41, c41, a3, b4 + LD a3, 6 * SIZE(AO) + + MADD c12, c12, a2, b6 + LD b6, 24 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 13 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 14 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 15 * SIZE(BO) + + 
MADD c11, c11, a3, b7 + LD a2, 7 * SIZE(AO) + MADD c21, c21, a3, b2 + daddiu AO, AO, 8 * SIZE + MADD c31, c31, a3, b3 + daddiu BO, BO, 16 * SIZE + MADD c41, c41, a3, b4 + LD a3, 4 * SIZE(AO) + + MADD c12, c12, a2, b7 + LD b7, 12 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 1 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 2 * SIZE(BO) + MADD c42, c42, a2, b4 + NOP + + bgtz L, .L32 + LD b4, 3 * SIZE(BO) + .align 3 + +.L35: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L38 + NOP + .align 3 + +.L36: + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + daddiu L, L, -1 + MADD c31, c31, a1, b3 + daddiu AO, AO, 2 * SIZE + MADD c41, c41, a1, b4 + LD a1, 0 * SIZE(AO) + + MADD c12, c12, a2, b1 + LD b1, 4 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + bgtz L, .L36 + daddiu BO, BO, 4 * SIZE + +.L38: +#ifndef TRMMKERNEL + LD $f0, 0 * SIZE(CO1) + daddiu CO3,CO3, 2 * SIZE + LD $f1, 1 * SIZE(CO1) + daddiu CO1,CO1, 2 * SIZE + LD $f2, 0 * SIZE(CO2) + daddiu CO4,CO4, 2 * SIZE + LD $f3, 1 * SIZE(CO2) + daddiu CO2,CO2, 2 * SIZE + + LD $f4, -2 * SIZE(CO3) + MADD c11, $f0, ALPHA, c11 + LD $f5, -1 * SIZE(CO3) + MADD c12, $f1, ALPHA, c12 + LD $f6, -2 * SIZE(CO4) + MADD c21, $f2, ALPHA, c21 + LD $f7, -1 * SIZE(CO4) + MADD c22, $f3, ALPHA, c22 + + MADD c31, $f4, ALPHA, c31 + ST c11, -2 * SIZE(CO1) + MADD c32, $f5, ALPHA, c32 + ST c12, -1 * SIZE(CO1) + MADD c41, $f6, ALPHA, c41 + ST c21, -2 * SIZE(CO2) + MADD c42, $f7, ALPHA, c42 + ST c22, -1 * SIZE(CO2) + + ST c31, -2 * SIZE(CO3) + MTC $0, c11 + ST c32, -1 * SIZE(CO3) + daddiu I, I, -1 + ST c41, -2 * SIZE(CO4) + MOV c21, c11 + ST c42, -1 * SIZE(CO4) + MOV c31, c11 +#else + MUL c11, ALPHA, c11 + daddiu CO3,CO3, 2 * SIZE + MUL c12, ALPHA, c12 + daddiu CO1,CO1, 2 * SIZE + MUL c21, ALPHA, c21 + daddiu CO4,CO4, 2 * SIZE + MUL c22, ALPHA, c22 + daddiu CO2,CO2, 2 * SIZE + + ST c11, -2 * SIZE(CO1) + MUL c31, ALPHA, c31 + ST c12, -1 * SIZE(CO1) + MUL c32, ALPHA, c32 + ST c21, -2 * SIZE(CO2) + MUL c41, ALPHA, c41 + ST c22, -1 * SIZE(CO2) + MUL c42, ALPHA, c42 + + ST c31, -2 * SIZE(CO3) + MTC $0, c11 + ST c32, -1 * SIZE(CO3) + daddiu I, I, -1 + ST c41, -2 * SIZE(CO4) + MOV c21, c11 + ST c42, -1 * SIZE(CO4) + MOV c31, c11 + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -4 +#endif + + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 2 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif +#endif + + bgtz I, .L31 + MOV c41, c11 + .align 3 + +.L40: + andi I, M, 1 + blez I, .L49 + MOV c61, c11 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, 0 + BASE_SHIFT + dsll TEMP, KK, 2 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + + LD a1, 0 * SIZE(AO) + MOV c71, c11 + LD a2, 1 * SIZE(AO) + MOV c81, c11 + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 4 +#endif + dsra L, TEMP, 2 + + blez L, .L45 + NOP +#else + LD a1, 0 * SIZE(AO) + MOV c71, c11 + LD a2, 1 * SIZE(AO) + MOV c81, c11 + + LD 
b1, 0 * SIZE(B) + LD b2, 1 * SIZE(B) + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + LD b5, 4 * SIZE(B) + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + dsra L, K, 2 + + blez L, .L45 + move BO, B +#endif + .align 3 + +.L42: + MADD c11, c11, a1, b1 + LD b1, 16 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 5 * SIZE(BO) + MADD c31, c31, a1, b3 + LD b3, 6 * SIZE(BO) + MADD c41, c41, a1, b4 + LD b4, 7 * SIZE(BO) + + LD a1, 4 * SIZE(AO) + daddiu L, L, -1 + + MADD c11, c11, a2, b5 + LD b5, 20 * SIZE(BO) + MADD c21, c21, a2, b2 + LD b2, 9 * SIZE(BO) + MADD c31, c31, a2, b3 + LD b3, 10 * SIZE(BO) + MADD c41, c41, a2, b4 + LD b4, 11 * SIZE(BO) + + LD a2, 2 * SIZE(AO) + daddiu AO, AO, 4 * SIZE + + MADD c11, c11, a2, b6 + LD b6, 24 * SIZE(BO) + MADD c21, c21, a2, b2 + LD b2, 13 * SIZE(BO) + MADD c31, c31, a2, b3 + LD b3, 14 * SIZE(BO) + MADD c41, c41, a2, b4 + LD b4, 15 * SIZE(BO) + + LD a2, -1 * SIZE(AO) + daddiu BO, BO, 16 * SIZE + + MADD c11, c11, a2, b7 + LD b7, 12 * SIZE(BO) + MADD c21, c21, a2, b2 + LD b2, 1 * SIZE(BO) + MADD c31, c31, a2, b3 + LD b3, 2 * SIZE(BO) + MADD c41, c41, a2, b4 + LD b4, 3 * SIZE(BO) + + bgtz L, .L42 + LD a2, 1 * SIZE(AO) + .align 3 + +.L45: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L48 + NOP + .align 3 + +.L46: + MADD c11, c11, a1, b1 + LD b1, 4 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 5 * SIZE(BO) + MADD c31, c31, a1, b3 + LD b3, 6 * SIZE(BO) + MADD c41, c41, a1, b4 + LD a1, 1 * SIZE(AO) + + LD b4, 7 * SIZE(BO) + daddiu L, L, -1 + + daddiu AO, AO, 1 * SIZE + MOV a2, a2 + bgtz L, .L46 + daddiu BO, BO, 4 * SIZE + + +.L48: +#ifndef TRMMKERNEL + LD $f0, 0 * SIZE(CO1) + LD $f1, 0 * SIZE(CO2) + LD $f2, 0 * SIZE(CO3) + LD $f3, 0 * SIZE(CO4) + + MADD c11, $f0, ALPHA, c11 + MADD c21, $f1, ALPHA, c21 + MADD c31, $f2, ALPHA, c31 + MADD c41, $f3, ALPHA, c41 + + ST c11, 0 * SIZE(CO1) + ST c21, 0 * SIZE(CO2) + ST c31, 0 * SIZE(CO3) + ST c41, 0 * SIZE(CO4) +#else + MUL c11, ALPHA, c11 + MUL c21, ALPHA, c21 + MUL c31, ALPHA, c31 + MUL c41, ALPHA, c41 + + ST c11, 0 * SIZE(CO1) + ST c21, 0 * SIZE(CO2) + ST c31, 0 * SIZE(CO3) + ST c41, 0 * SIZE(CO4) + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -4 +#endif + + dsll L, TEMP, 0 + BASE_SHIFT + dsll TEMP, TEMP, 2 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 1 +#endif +#endif + .align 3 + +.L49: +#if defined(TRMMKERNEL) && !defined(LEFT) + daddiu KK, KK, 4 +#endif + move B, BO + .align 3 + +.L50: + andi J, N, 2 + blez J, .L70 + + move AO, A + move CO1, C + daddu CO2, C, LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + + dsra I, M, 1 + blez I, .L60 + daddu C, CO2, LDC + +.L51: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, 1 + BASE_SHIFT + dsll TEMP, KK, 1 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a5, 4 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + MOV c12, c11 + LD b2, 1 * SIZE(BO) + MOV c22, c11 + LD b3, 2 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 +#else + daddiu TEMP, KK, 2 +#endif + dsra L, TEMP, 2 + blez L, .L55 + NOP +#else + LD a1, 0 * SIZE(AO) + 
MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a5, 4 * SIZE(AO) + + LD b1, 0 * SIZE(B) + MOV c12, c11 + LD b2, 1 * SIZE(B) + MOV c22, c11 + LD b3, 2 * SIZE(B) + LD b5, 4 * SIZE(B) + dsra L, K, 2 + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + blez L, .L55 + move BO, B +#endif + .align 3 + +.L52: + MADD c11, c11, a1, b1 + LD a3, 2 * SIZE(AO) + MADD c21, c21, a1, b2 + LD b4, 3 * SIZE(BO) + MADD c12, c12, a2, b1 + LD a4, 3 * SIZE(AO) + MADD c22, c22, a2, b2 + LD b1, 8 * SIZE(BO) + + MADD c11, c11, a3, b3 + LD a1, 8 * SIZE(AO) + MADD c21, c21, a3, b4 + LD b2, 5 * SIZE(BO) + MADD c12, c12, a4, b3 + LD a2, 5 * SIZE(AO) + MADD c22, c22, a4, b4 + LD b3, 6 * SIZE(BO) + + MADD c11, c11, a5, b5 + LD a3, 6 * SIZE(AO) + MADD c21, c21, a5, b2 + LD b4, 7 * SIZE(BO) + MADD c12, c12, a2, b5 + LD a4, 7 * SIZE(AO) + MADD c22, c22, a2, b2 + LD b5, 12 * SIZE(BO) + + MADD c11, c11, a3, b3 + LD a5, 12 * SIZE(AO) + MADD c21, c21, a3, b4 + LD b2, 9 * SIZE(BO) + MADD c12, c12, a4, b3 + LD a2, 9 * SIZE(AO) + MADD c22, c22, a4, b4 + LD b3, 10 * SIZE(BO) + + daddiu AO, AO, 8 * SIZE + daddiu L, L, -1 + bgtz L, .L52 + daddiu BO, BO, 8 * SIZE + .align 3 + +.L55: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L58 + NOP + .align 3 + +.L56: + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + LD a1, 2 * SIZE(AO) + + MADD c12, c12, a2, b1 + LD b1, 2 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 3 * SIZE(BO) + + daddiu L, L, -1 + daddiu AO, AO, 2 * SIZE + bgtz L, .L56 + daddiu BO, BO, 2 * SIZE + +.L58: +#ifndef TRMMKERNEL + LD $f0, 0 * SIZE(CO1) + daddiu I, I, -1 + LD $f1, 1 * SIZE(CO1) + daddiu CO1,CO1, 2 * SIZE + LD $f2, 0 * SIZE(CO2) + NOP + LD $f3, 1 * SIZE(CO2) + daddiu CO2,CO2, 2 * SIZE + + MADD c11, $f0, ALPHA, c11 + MADD c12, $f1, ALPHA, c12 + MADD c21, $f2, ALPHA, c21 + MADD c22, $f3, ALPHA, c22 + + ST c11, -2 * SIZE(CO1) + ST c12, -1 * SIZE(CO1) + ST c21, -2 * SIZE(CO2) + NOP + bgtz I, .L51 + ST c22, -1 * SIZE(CO2) +#else + daddiu I, I, -1 + + daddiu CO1,CO1, 2 * SIZE + daddiu CO2,CO2, 2 * SIZE + + MUL c11, ALPHA, c11 + MUL c12, ALPHA, c12 + MUL c21, ALPHA, c21 + MUL c22, ALPHA, c22 + + ST c11, -2 * SIZE(CO1) + ST c12, -1 * SIZE(CO1) + ST c21, -2 * SIZE(CO2) + ST c22, -1 * SIZE(CO2) + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -2 +#endif + + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif + + bgtz I, .L51 + NOP +#endif + .align 3 + +.L60: + andi I, M, 1 + blez I, .L69 + NOP + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, 0 + BASE_SHIFT + dsll TEMP, KK, 1 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a3, 2 * SIZE(AO) + MOV c31, c11 + LD a4, 3 * SIZE(AO) + MOV c41, c11 + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 2 +#endif + dsra L, TEMP, 2 + blez L, .L65 + NOP +#else + dsra L, K, 2 + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a3, 2 * 
SIZE(AO) + MOV c31, c11 + LD a4, 3 * SIZE(AO) + MOV c41, c11 + + LD b1, 0 * SIZE(B) + LD b2, 1 * SIZE(B) + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + LD b5, 4 * SIZE(B) + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + blez L, .L65 + move BO, B +#endif + .align 3 + +.L62: + MADD c11, c11, a1, b1 + LD b1, 4 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 5 * SIZE(BO) + MADD c31, c31, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c41, c41, a2, b4 + LD b4, 7 * SIZE(BO) + + LD a1, 4 * SIZE(AO) + LD a2, 5 * SIZE(AO) + + MADD c11, c11, a3, b1 + LD b1, 8 * SIZE(BO) + MADD c21, c21, a3, b2 + LD b2, 9 * SIZE(BO) + MADD c31, c31, a4, b3 + LD b3, 10 * SIZE(BO) + MADD c41, c41, a4, b4 + LD b4, 11 * SIZE(BO) + + LD a3, 6 * SIZE(AO) + LD a4, 7 * SIZE(AO) + + daddiu L, L, -1 + daddiu AO, AO, 4 * SIZE + + bgtz L, .L62 + daddiu BO, BO, 8 * SIZE + .align 3 + +.L65: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L68 + NOP + .align 3 + +.L66: + MADD c11, c11, a1, b1 + LD b1, 2 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 3 * SIZE(BO) + + LD a1, 1 * SIZE(AO) + daddiu L, L, -1 + + daddiu AO, AO, 1 * SIZE + bgtz L, .L66 + daddiu BO, BO, 2 * SIZE + + +.L68: +#ifndef TRMMKERNEL + LD $f0, 0 * SIZE(CO1) + LD $f1, 0 * SIZE(CO2) + + ADD c11, c11, c31 + ADD c21, c21, c41 + + MADD c11, $f0, ALPHA, c11 + MADD c21, $f1, ALPHA, c21 + + ST c11, 0 * SIZE(CO1) + ST c21, 0 * SIZE(CO2) +#else + ADD c11, c11, c31 + ADD c21, c21, c41 + + MUL c11, ALPHA, c11 + MUL c21, ALPHA, c21 + + ST c11, 0 * SIZE(CO1) + ST c21, 0 * SIZE(CO2) + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -2 +#endif + + dsll L, TEMP, 0 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 1 +#endif +#endif + .align 3 + +.L69: +#if defined(TRMMKERNEL) && !defined(LEFT) + daddiu KK, KK, 2 +#endif + move B, BO + .align 3 + +.L70: + andi J, N, 1 + blez J, .L999 + + move AO, A + move CO1, C + +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + + dsra I, M, 1 + blez I, .L80 + daddu C, CO1, LDC + +.L71: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, 1 + BASE_SHIFT + dsll TEMP, KK, 0 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a5, 4 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + MOV c12, c11 + LD b2, 1 * SIZE(BO) + MOV c22, c11 + LD b3, 2 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 2 +#else + daddiu TEMP, KK, 1 +#endif + dsra L, TEMP, 2 + blez L, .L75 + NOP +#else + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a5, 4 * SIZE(AO) + + LD b1, 0 * SIZE(B) + MOV c12, c11 + LD b2, 1 * SIZE(B) + MOV c22, c11 + LD b3, 2 * SIZE(B) + LD b5, 4 * SIZE(B) + dsra L, K, 2 + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + blez L, .L75 + move BO, B +#endif + .align 3 + +.L72: + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + MADD c11, c11, a1, b1 + MADD c12, c12, a2, b1 + + LD a1, 2 * SIZE(AO) + LD a2, 3 * SIZE(AO) + LD b1, 1 * SIZE(BO) + + MADD c11, c11, a1, b1 + MADD c12, c12, a2, b1 + + LD a1, 4 * SIZE(AO) + LD a2, 5 * SIZE(AO) + LD b1, 2 * SIZE(BO) + + MADD 
c11, c11, a1, b1 + MADD c12, c12, a2, b1 + + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + LD b1, 3 * SIZE(BO) + + MADD c11, c11, a1, b1 + MADD c12, c12, a2, b1 + + daddiu L, L, -1 + daddiu AO, AO, 8 * SIZE + bgtz L, .L72 + daddiu BO, BO, 4 * SIZE + .align 3 + +.L75: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L78 + NOP + .align 3 + +.L76: + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + MADD c11, c11, a1, b1 + MADD c12, c12, a2, b1 + + daddiu L, L, -1 + daddiu AO, AO, 2 * SIZE + bgtz L, .L76 + daddiu BO, BO, 1 * SIZE + +.L78: +#ifndef TRMMKERNEL + LD $f0, 0 * SIZE(CO1) + daddiu I, I, -1 + LD $f1, 1 * SIZE(CO1) + daddiu CO1,CO1, 2 * SIZE + + ADD c11, c11, c21 + ADD c12, c12, c22 + + MADD c11, $f0, ALPHA, c11 + MADD c12, $f1, ALPHA, c12 + + ST c11, -2 * SIZE(CO1) + bgtz I, .L71 + ST c12, -1 * SIZE(CO1) +#else + ADD c11, c11, c21 + daddiu I, I, -1 + ADD c12, c12, c22 + daddiu CO1,CO1, 2 * SIZE + + MUL c11, ALPHA, c11 + MUL c12, ALPHA, c12 + + ST c11, -2 * SIZE(CO1) + ST c12, -1 * SIZE(CO1) + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -2 +#else + daddiu TEMP, TEMP, -1 +#endif + + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 0 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 2 +#endif + + bgtz I, .L71 + NOP +#endif + .align 3 + +.L80: + andi I, M, 1 + blez I, .L89 + NOP + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, 0 + BASE_SHIFT + dsll TEMP, KK, 0 + BASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 1 +#endif + dsra L, TEMP, 2 + blez L, .L85 + NOP +#else + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(B) + LD b2, 1 * SIZE(B) + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + LD b5, 4 * SIZE(B) + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + dsra L, K, 2 + blez L, .L85 + move BO, B +#endif + .align 3 + +.L82: + LD a1, 0 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + MADD c11, c11, a1, b1 + + LD a1, 1 * SIZE(AO) + LD b1, 1 * SIZE(BO) + + MADD c21, c21, a1, b1 + + LD a1, 2 * SIZE(AO) + LD b1, 2 * SIZE(BO) + + MADD c11, c11, a1, b1 + + LD a1, 3 * SIZE(AO) + LD b1, 3 * SIZE(BO) + + MADD c21, c21, a1, b1 + + daddiu L, L, -1 + daddiu AO, AO, 4 * SIZE + bgtz L, .L82 + daddiu BO, BO, 4 * SIZE + .align 3 + +.L85: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L88 + NOP + .align 3 + +.L86: + LD a1, 0 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + MADD c11, c11, a1, b1 + + daddiu L, L, -1 + daddiu AO, AO, 1 * SIZE + bgtz L, .L86 + daddiu BO, BO, 1 * SIZE + + +.L88: +#ifndef TRMMKERNEL + LD $f0, 0 * SIZE(CO1) + + ADD c11, c11, c21 + MADD c11, $f0, ALPHA, c11 + + ST c11, 0 * SIZE(CO1) +#else + ADD c11, c11, c21 + MUL c11, ALPHA, c11 + + ST c11, 0 * SIZE(CO1) +#endif + .align 3 + +.L89: +#if defined(TRMMKERNEL) && !defined(LEFT) + daddiu KK, KK, 1 +#endif + move B, BO + 
.align 3 + + +.L999: + LDARG $16, 0($sp) + LDARG $17, 8($sp) + LDARG $18, 16($sp) + LDARG $19, 24($sp) + LDARG $20, 32($sp) + LDARG $21, 40($sp) + LDARG $22, 48($sp) + + ldc1 $f24, 56($sp) + ldc1 $f25, 64($sp) + ldc1 $f26, 72($sp) + ldc1 $f27, 80($sp) + ldc1 $f28, 88($sp) + +#if defined(TRMMKERNEL) + LDARG $23, 96($sp) + LDARG $24, 104($sp) + LDARG $25, 112($sp) +#endif + +#ifndef __64BIT__ + ldc1 $f20,120($sp) + ldc1 $f21,128($sp) + ldc1 $f22,136($sp) + ldc1 $f23,144($sp) +#endif + + j $31 + daddiu $sp, $sp, 160 + + EPILOGUE diff --git a/kernel/mips64/gemv_n.S b/kernel/mips64/gemv_n.S new file mode 100644 index 0000000000..908f97347c --- /dev/null +++ b/kernel/mips64/gemv_n.S @@ -0,0 +1,665 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M $4 +#define N $5 +#define A $8 +#define LDA $9 +#define X $10 +#define INCX $11 +#define Y $2 +#define INCY $6 +#define BUFFER $7 + +#define YORIG $3 +#define XX $12 +#define YY $13 + +#define I $14 +#define J $15 + +#define AO1 $16 +#define AO2 $17 + +#define ALPHA $f15 + +#define a1 $f0 +#define a2 $f1 +#define a3 $f2 +#define a4 $f3 +#define a5 $f4 +#define a6 $f5 +#define a7 $f6 +#define a8 $f7 + +#define x1 $f8 +#define x2 $f9 + +#define y1 $f10 +#define y2 $f11 +#define y3 $f12 +#define y4 $f13 +#define y5 $f14 +#define y6 $f16 +#define y7 $f17 +#define y8 $f18 + +#define t1 $f19 +#define t2 $f20 +#define t3 $f21 +#define t4 $f22 + + + PROLOGUE + + LDARG Y, 0($sp) + LDARG INCY, 8($sp) + LDARG BUFFER, 16($sp) +#ifdef __64BIT__ + daddiu $sp, $sp, -16 +#else + daddiu $sp, $sp, -48 +#endif + + SDARG $16, 0($sp) + + SDARG $17, 8($sp) + dsll LDA, LDA, BASE_SHIFT + +#ifndef __64BIT__ + sdc1 $f20, 16($sp) + sdc1 $f21, 24($sp) + sdc1 $f22, 32($sp) +#endif + + blez M, .L999 + dsll INCX, INCX, BASE_SHIFT + + blez N, .L999 + dsll INCY, INCY, BASE_SHIFT + + li YORIG, SIZE + + beq INCY, YORIG, .L10 + move YORIG, Y + + dsra I, M, 2 + move YORIG, BUFFER + + move XX, Y + + blez I, .L05 + move YY, BUFFER + .align 3 + +.L02: + LD a1, 0 * SIZE(XX) + daddu XX, XX, INCY + LD a2, 0 * SIZE(XX) + daddu XX, XX, INCY + LD a3, 0 * SIZE(XX) + daddu XX, XX, INCY + LD a4, 0 * SIZE(XX) + daddu XX, XX, INCY + + ST a1, 0 * SIZE(YY) + ST a2, 1 * SIZE(YY) + ST a3, 2 * SIZE(YY) + ST a4, 3 * SIZE(YY) + daddiu I, I, -1 + + bgtz I, .L02 + daddiu YY, YY, 4 * SIZE + .align 3 + +.L05: + andi I, M, 3 + blez I, .L10 + NOP + .align 3 + +.L06: + LD a1, 0 * SIZE(XX) + daddu XX, XX, INCY + + ST a1, 0 * SIZE(YY) + daddiu I, I, -1 + + bgtz I, .L06 + daddiu YY, YY, 1 * SIZE + .align 3 + +.L10: + dsra J, N, 1 + blez J, .L20 + NOP + .align 3 + +.L11: + LD x1, 0 * SIZE(X) + daddu X, X, INCX + LD x2, 0 * SIZE(X) + daddu X, X, INCX + + move AO1, A + daddu AO2, A, LDA + daddu A, AO2, LDA + + move YY, YORIG + MUL x1, ALPHA, x1 + + dsra I, M, 3 + blez I, .L15 + MUL x2, ALPHA, x2 + + LD a1, 0 * SIZE(AO1) + LD y1, 0 * SIZE(YY) + LD a2, 1 * SIZE(AO1) + LD y2, 1 * SIZE(YY) + + LD a3, 2 * SIZE(AO1) + LD y3, 2 * SIZE(YY) + LD a4, 3 * SIZE(AO1) + LD y4, 3 * SIZE(YY) + + LD a5, 0 * SIZE(AO2) + LD y5, 4 * SIZE(YY) + LD a6, 1 * SIZE(AO2) + LD y6, 5 * SIZE(YY) + + LD a7, 2 * SIZE(AO2) + LD y7, 6 * SIZE(YY) + LD a8, 3 * SIZE(AO2) + daddiu I, I, -1 + + blez I, .L13 + LD y8, 7 * SIZE(YY) + .align 3 + +.L12: + MADD t1, y1, x1, a1 + LD a1, 4 * SIZE(AO1) + MADD t2, y2, x1, a2 + LD a2, 5 * SIZE(AO1) + + LD y1, 8 * SIZE(YY) + LD y2, 9 * SIZE(YY) + + MADD t3, y3, x1, a3 + LD a3, 6 * SIZE(AO1) + MADD t4, y4, x1, a4 + LD a4, 7 * SIZE(AO1) + + LD y3, 10 * SIZE(YY) + LD y4, 11 * SIZE(YY) + + MADD t1, t1, x2, a5 + LD a5, 4 * SIZE(AO2) + MADD t2, t2, x2, a6 + LD a6, 5 * SIZE(AO2) + MADD t3, t3, x2, a7 + LD a7, 6 * SIZE(AO2) + MADD t4, t4, x2, a8 + LD a8, 7 * SIZE(AO2) + + ST t1, 0 * SIZE(YY) + ST t2, 1 * SIZE(YY) + ST t3, 2 * SIZE(YY) + ST t4, 3 * SIZE(YY) + + MADD t1, y5, x1, a1 + LD a1, 8 * SIZE(AO1) + MADD t2, y6, x1, a2 + LD a2, 9 * SIZE(AO1) + + LD y5, 12 * SIZE(YY) + LD y6, 13 * SIZE(YY) + + MADD t3, y7, x1, a3 + LD a3, 10 * SIZE(AO1) + MADD t4, y8, x1, a4 + LD a4, 11 * SIZE(AO1) + + LD y7, 14 * SIZE(YY) + LD y8, 15 * SIZE(YY) + + MADD t1, t1, x2, a5 + LD a5, 8 * SIZE(AO2) + MADD t2, t2, x2, a6 + LD a6, 9 * SIZE(AO2) + MADD t3, t3, 
x2, a7 + LD a7, 10 * SIZE(AO2) + MADD t4, t4, x2, a8 + LD a8, 11 * SIZE(AO2) + + ST t1, 4 * SIZE(YY) + ST t2, 5 * SIZE(YY) + ST t3, 6 * SIZE(YY) + ST t4, 7 * SIZE(YY) + + daddiu I, I, -1 + daddiu YY, YY, 8 * SIZE + + daddiu AO1, AO1, 8 * SIZE + bgtz I, .L12 + daddiu AO2, AO2, 8 * SIZE + .align 3 + +.L13: + MADD t1, y1, x1, a1 + LD a1, 4 * SIZE(AO1) + MADD t2, y2, x1, a2 + LD a2, 5 * SIZE(AO1) + MADD t3, y3, x1, a3 + LD a3, 6 * SIZE(AO1) + MADD t4, y4, x1, a4 + LD a4, 7 * SIZE(AO1) + + MADD t1, t1, x2, a5 + LD a5, 4 * SIZE(AO2) + MADD t2, t2, x2, a6 + LD a6, 5 * SIZE(AO2) + MADD t3, t3, x2, a7 + LD a7, 6 * SIZE(AO2) + MADD t4, t4, x2, a8 + LD a8, 7 * SIZE(AO2) + + ST t1, 0 * SIZE(YY) + MADD t1, y5, x1, a1 + ST t2, 1 * SIZE(YY) + MADD t2, y6, x1, a2 + ST t3, 2 * SIZE(YY) + MADD t3, y7, x1, a3 + ST t4, 3 * SIZE(YY) + MADD t4, y8, x1, a4 + + MADD t1, t1, x2, a5 + daddiu AO1, AO1, 8 * SIZE + MADD t2, t2, x2, a6 + daddiu AO2, AO2, 8 * SIZE + MADD t3, t3, x2, a7 + daddiu YY, YY, 8 * SIZE + MADD t4, t4, x2, a8 + NOP + + ST t1, -4 * SIZE(YY) + ST t2, -3 * SIZE(YY) + ST t3, -2 * SIZE(YY) + ST t4, -1 * SIZE(YY) + .align 3 + +.L15: + andi I, M, 4 + NOP + blez I, .L16 + NOP + + LD a1, 0 * SIZE(AO1) + LD y1, 0 * SIZE(YY) + LD a2, 1 * SIZE(AO1) + LD y2, 1 * SIZE(YY) + + LD a3, 2 * SIZE(AO1) + LD y3, 2 * SIZE(YY) + LD a4, 3 * SIZE(AO1) + LD y4, 3 * SIZE(YY) + + LD a5, 0 * SIZE(AO2) + MADD y1, y1, x1, a1 + LD a6, 1 * SIZE(AO2) + MADD y2, y2, x1, a2 + LD a7, 2 * SIZE(AO2) + MADD y3, y3, x1, a3 + LD a8, 3 * SIZE(AO2) + MADD y4, y4, x1, a4 + + MADD y1, y1, x2, a5 + daddiu YY, YY, 4 * SIZE + MADD y2, y2, x2, a6 + daddiu AO1, AO1, 4 * SIZE + MADD y3, y3, x2, a7 + daddiu AO2, AO2, 4 * SIZE + MADD y4, y4, x2, a8 + + ST y1, -4 * SIZE(YY) + ST y2, -3 * SIZE(YY) + ST y3, -2 * SIZE(YY) + ST y4, -1 * SIZE(YY) + .align 3 + +.L16: + andi I, M, 2 + NOP + blez I, .L17 + NOP + + LD a1, 0 * SIZE(AO1) + LD y1, 0 * SIZE(YY) + LD a2, 1 * SIZE(AO1) + LD y2, 1 * SIZE(YY) + + LD a5, 0 * SIZE(AO2) + LD a6, 1 * SIZE(AO2) + + MADD y1, y1, x1, a1 + NOP + MADD y2, y2, x1, a2 + daddiu YY, YY, 2 * SIZE + MADD y1, y1, x2, a5 + daddiu AO1, AO1, 2 * SIZE + MADD y2, y2, x2, a6 + daddiu AO2, AO2, 2 * SIZE + + ST y1, -2 * SIZE(YY) + ST y2, -1 * SIZE(YY) + .align 3 + +.L17: + andi I, M, 1 + NOP + blez I, .L19 + NOP + + LD y1, 0 * SIZE(YY) + LD a1, 0 * SIZE(AO1) + LD a5, 0 * SIZE(AO2) + + MADD y1, y1, x1, a1 + MADD y1, y1, x2, a5 + + ST y1, 0 * SIZE(YY) + .align 3 + + +.L19: + daddiu J, J, -1 + + bgtz J, .L11 + NOP + .align 3 + +.L20: + andi J, N, 1 + blez J, .L900 + NOP + .align 3 + +.L21: + LD x1, 0 * SIZE(X) + daddu X, X, INCX + + move YY, YORIG + move AO1, A + + dsra I, M, 3 + blez I, .L25 + MUL x1, ALPHA, x1 + + + LD a1, 0 * SIZE(AO1) + LD y1, 0 * SIZE(YY) + LD a2, 1 * SIZE(AO1) + LD y2, 1 * SIZE(YY) + + LD a3, 2 * SIZE(AO1) + LD y3, 2 * SIZE(YY) + LD a4, 3 * SIZE(AO1) + LD y4, 3 * SIZE(YY) + + LD y5, 4 * SIZE(YY) + LD y6, 5 * SIZE(YY) + + LD y7, 6 * SIZE(YY) + daddiu I, I, -1 + + blez I, .L23 + LD y8, 7 * SIZE(YY) + .align 3 + +.L22: + MADD t1, y1, x1, a1 + LD a1, 4 * SIZE(AO1) + MADD t2, y2, x1, a2 + LD a2, 5 * SIZE(AO1) + + LD y1, 8 * SIZE(YY) + LD y2, 9 * SIZE(YY) + + MADD t3, y3, x1, a3 + LD a3, 6 * SIZE(AO1) + MADD t4, y4, x1, a4 + LD a4, 7 * SIZE(AO1) + + LD y3, 10 * SIZE(YY) + LD y4, 11 * SIZE(YY) + + ST t1, 0 * SIZE(YY) + ST t2, 1 * SIZE(YY) + ST t3, 2 * SIZE(YY) + ST t4, 3 * SIZE(YY) + + MADD t1, y5, x1, a1 + LD a1, 8 * SIZE(AO1) + MADD t2, y6, x1, a2 + LD a2, 9 * SIZE(AO1) + + LD y5, 12 * SIZE(YY) + LD y6, 13 * SIZE(YY) + + MADD 
t3, y7, x1, a3 + LD a3, 10 * SIZE(AO1) + MADD t4, y8, x1, a4 + LD a4, 11 * SIZE(AO1) + + LD y7, 14 * SIZE(YY) + LD y8, 15 * SIZE(YY) + + ST t1, 4 * SIZE(YY) + ST t2, 5 * SIZE(YY) + ST t3, 6 * SIZE(YY) + ST t4, 7 * SIZE(YY) + + daddiu I, I, -1 + daddiu YY, YY, 8 * SIZE + + bgtz I, .L22 + daddiu AO1, AO1, 8 * SIZE + .align 3 + +.L23: + MADD t1, y1, x1, a1 + LD a1, 4 * SIZE(AO1) + MADD t2, y2, x1, a2 + LD a2, 5 * SIZE(AO1) + MADD t3, y3, x1, a3 + LD a3, 6 * SIZE(AO1) + MADD t4, y4, x1, a4 + LD a4, 7 * SIZE(AO1) + + ST t1, 0 * SIZE(YY) + MADD t1, y5, x1, a1 + ST t2, 1 * SIZE(YY) + MADD t2, y6, x1, a2 + ST t3, 2 * SIZE(YY) + MADD t3, y7, x1, a3 + ST t4, 3 * SIZE(YY) + MADD t4, y8, x1, a4 + + ST t1, 4 * SIZE(YY) + ST t2, 5 * SIZE(YY) + ST t3, 6 * SIZE(YY) + ST t4, 7 * SIZE(YY) + + daddiu AO1, AO1, 8 * SIZE + daddiu YY, YY, 8 * SIZE + .align 3 + +.L25: + andi I, M, 4 + NOP + blez I, .L26 + NOP + + LD a1, 0 * SIZE(AO1) + LD y1, 0 * SIZE(YY) + LD a2, 1 * SIZE(AO1) + LD y2, 1 * SIZE(YY) + + LD a3, 2 * SIZE(AO1) + LD y3, 2 * SIZE(YY) + LD a4, 3 * SIZE(AO1) + LD y4, 3 * SIZE(YY) + + MADD y1, y1, x1, a1 + MADD y2, y2, x1, a2 + + MADD y3, y3, x1, a3 + daddiu YY, YY, 4 * SIZE + MADD y4, y4, x1, a4 + daddiu AO1, AO1, 4 * SIZE + + ST y1, -4 * SIZE(YY) + ST y2, -3 * SIZE(YY) + ST y3, -2 * SIZE(YY) + ST y4, -1 * SIZE(YY) + .align 3 + +.L26: + andi I, M, 2 + NOP + blez I, .L27 + NOP + + LD a1, 0 * SIZE(AO1) + LD y1, 0 * SIZE(YY) + LD a2, 1 * SIZE(AO1) + LD y2, 1 * SIZE(YY) + + MADD y1, y1, x1, a1 + daddiu YY, YY, 2 * SIZE + MADD y2, y2, x1, a2 + daddiu AO1, AO1, 2 * SIZE + + ST y1, -2 * SIZE(YY) + ST y2, -1 * SIZE(YY) + .align 3 + +.L27: + andi I, M, 1 + NOP + blez I, .L900 + NOP + + LD y1, 0 * SIZE(YY) + LD a1, 0 * SIZE(AO1) + + MADD y1, y1, x1, a1 + + ST y1, 0 * SIZE(YY) + .align 3 + + +.L900: + li YORIG, SIZE + + beq INCY, YORIG, .L999 + dsra I, M, 2 + + blez I, .L905 + move XX, BUFFER + .align 3 + +.L902: + LD a1, 0 * SIZE(XX) + LD a2, 1 * SIZE(XX) + LD a3, 2 * SIZE(XX) + LD a4, 3 * SIZE(XX) + + ST a1, 0 * SIZE(Y) + daddu Y, Y, INCY + ST a2, 0 * SIZE(Y) + daddu Y, Y, INCY + ST a3, 0 * SIZE(Y) + daddu Y, Y, INCY + ST a4, 0 * SIZE(Y) + daddu Y, Y, INCY + + daddiu I, I, -1 + + bgtz I, .L902 + daddiu XX, XX, 4 * SIZE + .align 3 + +.L905: + andi I, M, 3 + blez I, .L999 + NOP + .align 3 + +.L906: + LD a1, 0 * SIZE(XX) + daddiu XX, XX, 1 * SIZE + + ST a1, 0 * SIZE(Y) + daddiu I, I, -1 + + bgtz I, .L906 + daddu Y, Y, INCY + .align 3 + +.L999: + LDARG $16, 0($sp) + LDARG $17, 8($sp) + +#ifndef __64BIT__ + ldc1 $f20, 16($sp) + ldc1 $f21, 24($sp) + ldc1 $f22, 32($sp) +#endif + + j $31 +#ifdef __64BIT__ + daddiu $sp, $sp, 16 +#else + daddiu $sp, $sp, 48 +#endif + + EPILOGUE diff --git a/kernel/mips64/gemv_t.S b/kernel/mips64/gemv_t.S new file mode 100644 index 0000000000..2808756d4e --- /dev/null +++ b/kernel/mips64/gemv_t.S @@ -0,0 +1,531 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M $4 +#define N $5 +#define A $8 +#define LDA $9 +#define X $10 +#define INCX $11 +#define Y $2 +#define INCY $6 +#define BUFFER $7 + +#define XORIG $3 +#define XX $12 +#define YY $13 + +#define I $14 +#define J $15 + +#define AO1 $16 +#define AO2 $17 + +#define ALPHA $f15 + +#define a1 $f0 +#define a2 $f1 +#define a3 $f2 +#define a4 $f3 +#define a5 $f4 +#define a6 $f5 +#define a7 $f6 +#define a8 $f7 + +#define y1 $f8 +#define y2 $f9 +#define y3 $f10 +#define y4 $f11 + +#define x1 $f12 +#define x2 $f13 +#define x3 $f14 +#define x4 $f16 +#define x5 $f17 +#define x6 $f18 +#define x7 $f19 +#define x8 $f20 + + PROLOGUE + + LDARG Y, 0($sp) + LDARG INCY, 8($sp) + LDARG BUFFER, 16($sp) +#ifdef __64BIT__ + daddiu $sp, $sp, -16 +#else + daddiu $sp, $sp, -32 +#endif + + MTC $0, y1 + SDARG $16, 0($sp) + + SDARG $17, 8($sp) + dsll LDA, LDA, BASE_SHIFT + +#ifndef __64BIT__ + sdc1 $f20, 16($sp) +#endif + + blez M, .L999 + dsll INCX, INCX, BASE_SHIFT + + blez N, .L999 + dsll INCY, INCY, BASE_SHIFT + + li XORIG, SIZE + + beq INCX, XORIG, .L10 + move XORIG, X + + dsra I, M, 2 + move XORIG, BUFFER + + blez I, .L05 + move YY, BUFFER + .align 3 + +.L02: + LD a1, 0 * SIZE(X) + daddu X, X, INCX + LD a2, 0 * SIZE(X) + daddu X, X, INCX + LD a3, 0 * SIZE(X) + daddu X, X, INCX + LD a4, 0 * SIZE(X) + daddu X, X, INCX + + ST a1, 0 * SIZE(YY) + ST a2, 1 * SIZE(YY) + ST a3, 2 * SIZE(YY) + ST a4, 3 * SIZE(YY) + daddiu I, I, -1 + + bgtz I, .L02 + daddiu YY, YY, 4 * SIZE + .align 3 + +.L05: + andi I, M, 3 + blez I, .L10 + NOP + .align 3 + +.L06: + LD a1, 0 * SIZE(X) + daddu X, X, INCX + + ST a1, 0 * SIZE(YY) + daddiu I, I, -1 + + bgtz I, .L06 + daddiu YY, YY, 1 * SIZE + .align 3 + +.L10: + dsra J, N, 1 + blez J, .L20 + move YY, Y + .align 3 + +.L11: + move AO1, A + MOV y2, y1 + daddu AO2, A, LDA + MOV y3, y1 + daddu A, AO2, LDA + MOV y4, y1 + + dsra I, M, 3 + blez I, .L15 + move XX, XORIG + + LD a1, 0 * SIZE(AO1) + LD x1, 0 * SIZE(XX) + LD a2, 0 * SIZE(AO2) + LD x2, 1 * SIZE(XX) + + LD a3, 1 * SIZE(AO1) + LD x3, 2 * SIZE(XX) + LD a4, 1 * SIZE(AO2) + LD x4, 3 * 
SIZE(XX) + + LD a5, 2 * SIZE(AO1) + LD x5, 4 * SIZE(XX) + LD a6, 2 * SIZE(AO2) + LD x6, 5 * SIZE(XX) + + LD a7, 3 * SIZE(AO1) + LD x7, 6 * SIZE(XX) + LD a8, 3 * SIZE(AO2) + daddiu I, I, -1 + + blez I, .L13 + LD x8, 7 * SIZE(XX) + .align 3 + +.L12: + MADD y1, y1, x1, a1 + LD a1, 4 * SIZE(AO1) + MADD y2, y2, x1, a2 + LD a2, 4 * SIZE(AO2) + MADD y3, y3, x2, a3 + LD a3, 5 * SIZE(AO1) + MADD y4, y4, x2, a4 + LD a4, 5 * SIZE(AO2) + + LD x1, 8 * SIZE(XX) + LD x2, 9 * SIZE(XX) + + MADD y1, y1, x3, a5 + LD a5, 6 * SIZE(AO1) + MADD y2, y2, x3, a6 + LD a6, 6 * SIZE(AO2) + MADD y3, y3, x4, a7 + LD a7, 7 * SIZE(AO1) + MADD y4, y4, x4, a8 + LD a8, 7 * SIZE(AO2) + + LD x3, 10 * SIZE(XX) + LD x4, 11 * SIZE(XX) + + MADD y1, y1, x5, a1 + LD a1, 8 * SIZE(AO1) + MADD y2, y2, x5, a2 + LD a2, 8 * SIZE(AO2) + MADD y3, y3, x6, a3 + LD a3, 9 * SIZE(AO1) + MADD y4, y4, x6, a4 + LD a4, 9 * SIZE(AO2) + + LD x5, 12 * SIZE(XX) + LD x6, 13 * SIZE(XX) + + MADD y1, y1, x7, a5 + LD a5,10 * SIZE(AO1) + MADD y2, y2, x7, a6 + LD a6,10 * SIZE(AO2) + MADD y3, y3, x8, a7 + LD a7,11 * SIZE(AO1) + MADD y4, y4, x8, a8 + LD a8,11 * SIZE(AO2) + + LD x7, 14 * SIZE(XX) + LD x8, 15 * SIZE(XX) + + daddiu I, I, -1 + daddiu XX, XX, 8 * SIZE + + daddiu AO1, AO1, 8 * SIZE + bgtz I, .L12 + daddiu AO2, AO2, 8 * SIZE + .align 3 + +.L13: + MADD y1, y1, x1, a1 + LD a1, 4 * SIZE(AO1) + MADD y2, y2, x1, a2 + LD a2, 4 * SIZE(AO2) + MADD y3, y3, x2, a3 + LD a3, 5 * SIZE(AO1) + MADD y4, y4, x2, a4 + LD a4, 5 * SIZE(AO2) + MADD y1, y1, x3, a5 + LD a5, 6 * SIZE(AO1) + MADD y2, y2, x3, a6 + LD a6, 6 * SIZE(AO2) + MADD y3, y3, x4, a7 + LD a7, 7 * SIZE(AO1) + MADD y4, y4, x4, a8 + LD a8, 7 * SIZE(AO2) + + MADD y1, y1, x5, a1 + MADD y2, y2, x5, a2 + MADD y3, y3, x6, a3 + MADD y4, y4, x6, a4 + + MADD y1, y1, x7, a5 + daddiu XX, XX, 8 * SIZE + MADD y2, y2, x7, a6 + daddiu AO1, AO1, 8 * SIZE + MADD y3, y3, x8, a7 + daddiu AO2, AO2, 8 * SIZE + MADD y4, y4, x8, a8 + NOP + .align 3 + +.L15: + andi I, M, 4 + NOP + blez I, .L17 + NOP + + LD a1, 0 * SIZE(AO1) + LD x1, 0 * SIZE(XX) + LD a2, 0 * SIZE(AO2) + + LD a3, 1 * SIZE(AO1) + LD x2, 1 * SIZE(XX) + + LD a4, 1 * SIZE(AO2) + + LD a5, 2 * SIZE(AO1) + LD x3, 2 * SIZE(XX) + MADD y1, y1, x1, a1 + LD a6, 2 * SIZE(AO2) + MADD y2, y2, x1, a2 + + LD a7, 3 * SIZE(AO1) + MADD y3, y3, x2, a3 + LD x4, 3 * SIZE(XX) + MADD y4, y4, x2, a4 + LD a8, 3 * SIZE(AO2) + MADD y1, y1, x3, a5 + + MADD y2, y2, x3, a6 + daddiu XX, XX, 4 * SIZE + MADD y3, y3, x4, a7 + daddiu AO1, AO1, 4 * SIZE + MADD y4, y4, x4, a8 + daddiu AO2, AO2, 4 * SIZE + .align 3 + +.L17: + andi I, M, 3 + ADD y1, y1, y3 + blez I, .L19 + ADD y2, y2, y4 + .align 3 + +.L18: + LD x1, 0 * SIZE(XX) + LD a1, 0 * SIZE(AO1) + LD a2, 0 * SIZE(AO2) + + daddiu I, I, -1 + daddiu XX, XX, 1 * SIZE + daddiu AO1, AO1, 1 * SIZE + daddiu AO2, AO2, 1 * SIZE + + MADD y1, y1, x1, a1 + + bgtz I, .L18 + MADD y2, y2, x1, a2 + .align 3 + +.L19: + LD a1, 0 * SIZE(Y) + daddu Y, Y, INCY + LD a2, 0 * SIZE(Y) + daddu Y, Y, INCY + + MADD a1, a1, ALPHA, y1 + daddiu J, J, -1 + MADD a2, a2, ALPHA, y2 + MTC $0, y1 + + ST a1, 0 * SIZE(YY) + daddu YY, YY, INCY + ST a2, 0 * SIZE(YY) + + bgtz J, .L11 + daddu YY, YY, INCY + .align 3 + +.L20: + andi J, N, 1 + MOV y3, y1 + blez J, .L999 + move AO1, A + + dsra I, M, 3 + NOP + blez I, .L25 + move XX, XORIG + + LD a1, 0 * SIZE(AO1) + LD x1, 0 * SIZE(XX) + LD a3, 1 * SIZE(AO1) + LD x2, 1 * SIZE(XX) + LD a5, 2 * SIZE(AO1) + LD x3, 2 * SIZE(XX) + LD a7, 3 * SIZE(AO1) + + LD x4, 3 * SIZE(XX) + LD x5, 4 * SIZE(XX) + LD x6, 5 * SIZE(XX) + LD x7, 6 * SIZE(XX) + daddiu I, 
I, -1 + + blez I, .L23 + LD x8, 7 * SIZE(XX) + .align 3 + +.L22: + MADD y1, y1, x1, a1 + LD a1, 4 * SIZE(AO1) + MADD y3, y3, x2, a3 + LD a3, 5 * SIZE(AO1) + + LD x1, 8 * SIZE(XX) + LD x2, 9 * SIZE(XX) + + MADD y1, y1, x3, a5 + LD a5, 6 * SIZE(AO1) + MADD y3, y3, x4, a7 + LD a7, 7 * SIZE(AO1) + + LD x3, 10 * SIZE(XX) + LD x4, 11 * SIZE(XX) + + MADD y1, y1, x5, a1 + LD a1, 8 * SIZE(AO1) + MADD y3, y3, x6, a3 + LD a3, 9 * SIZE(AO1) + + LD x5, 12 * SIZE(XX) + LD x6, 13 * SIZE(XX) + + MADD y1, y1, x7, a5 + LD a5, 10 * SIZE(AO1) + MADD y3, y3, x8, a7 + LD a7, 11 * SIZE(AO1) + + LD x7, 14 * SIZE(XX) + LD x8, 15 * SIZE(XX) + + daddiu I, I, -1 + daddiu XX, XX, 8 * SIZE + bgtz I, .L22 + daddiu AO1, AO1, 8 * SIZE + .align 3 + +.L23: + MADD y1, y1, x1, a1 + LD a1, 4 * SIZE(AO1) + MADD y3, y3, x2, a3 + LD a3, 5 * SIZE(AO1) + MADD y1, y1, x3, a5 + LD a5, 6 * SIZE(AO1) + MADD y3, y3, x4, a7 + LD a7, 7 * SIZE(AO1) + + MADD y1, y1, x5, a1 + MADD y3, y3, x6, a3 + MADD y1, y1, x7, a5 + MADD y3, y3, x8, a7 + + daddiu XX, XX, 8 * SIZE + daddiu AO1, AO1, 8 * SIZE + .align 3 + +.L25: + andi I, M, 4 + NOP + blez I, .L27 + NOP + + LD a1, 0 * SIZE(AO1) + LD x1, 0 * SIZE(XX) + LD a3, 1 * SIZE(AO1) + LD x2, 1 * SIZE(XX) + + LD a5, 2 * SIZE(AO1) + LD x3, 2 * SIZE(XX) + + MADD y1, y1, x1, a1 + LD a7, 3 * SIZE(AO1) + + MADD y3, y3, x2, a3 + LD x4, 3 * SIZE(XX) + + MADD y1, y1, x3, a5 + daddiu XX, XX, 4 * SIZE + MADD y3, y3, x4, a7 + daddiu AO1, AO1, 4 * SIZE + .align 3 + +.L27: + andi I, M, 3 + ADD y1, y1, y3 + blez I, .L29 + NOP + .align 3 + +.L28: + LD x1, 0 * SIZE(XX) + LD a1, 0 * SIZE(AO1) + + daddiu I, I, -1 + daddiu XX, XX, 1 * SIZE + daddiu AO1, AO1, 1 * SIZE + + bgtz I, .L28 + MADD y1, y1, x1, a1 + .align 3 + +.L29: + LD a1, 0 * SIZE(Y) + daddu Y, Y, INCY + + MADD a1, a1, ALPHA, y1 + NOP + + ST a1, 0 * SIZE(YY) + daddu YY, YY, INCY + .align 3 + +.L999: + LDARG $16, 0($sp) + LDARG $17, 8($sp) + +#ifndef __64BIT__ + ldc1 $f20, 16($sp) +#endif + + j $31 +#ifdef __64BIT__ + daddiu $sp, $sp, 16 +#else + daddiu $sp, $sp, 32 +#endif + + EPILOGUE diff --git a/kernel/mips64/iamax.S b/kernel/mips64/iamax.S new file mode 100644 index 0000000000..ff6c2157ee --- /dev/null +++ b/kernel/mips64/iamax.S @@ -0,0 +1,288 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 +#define X $5 +#define INCX $6 + +#define I $3 +#define TEMP $7 + +#define a1 $f4 +#define a2 $f5 +#define a3 $f6 +#define a4 $f7 +#define a5 $f8 +#define a6 $f9 +#define a7 $f10 +#define a8 $f11 + +#define t1 $f12 +#define t2 $f13 +#define t3 $f14 +#define t4 $f15 + +#define s1 $f0 +#define s2 $f1 +#define s3 $f2 +#define s4 $f3 + +#define x1 $2 +#define x2 $8 +#define x3 $9 +#define x4 $10 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + blez N, .L999 + li x1, 0 + + blez INCX, .L999 + dsll INCX, INCX, BASE_SHIFT + + LD a1, 0 * SIZE(X) + daddiu N, N, -1 + + blez N, .L999 + li x1, 1 + + FABS s1, a1 + daddu X, X, INCX + FABS s2, a1 + li x2, 1 + + FABS s3, a1 + dsra I, N, 3 + FABS s4, a1 + li x3, 1 + + li TEMP, 2 + + blez I, .L15 + li x4, 1 + + LD a1, 0 * SIZE(X) + daddu X, X, INCX + LD a2, 0 * SIZE(X) + daddu X, X, INCX + LD a3, 0 * SIZE(X) + daddu X, X, INCX + LD a4, 0 * SIZE(X) + daddu X, X, INCX + LD a5, 0 * SIZE(X) + daddu X, X, INCX + LD a6, 0 * SIZE(X) + daddu X, X, INCX + LD a7, 0 * SIZE(X) + daddu X, X, INCX + LD a8, 0 * SIZE(X) + daddiu I, I, -1 + + blez I, .L13 + daddu X, X, INCX + .align 3 + +.L12: + FABS t1, a1 + LD a1, 0 * SIZE(X) + FABS t2, a2 + daddu X, X, INCX + + FABS t3, a3 + LD a2, 0 * SIZE(X) + FABS t4, a4 + daddu X, X, INCX + + CMPLT $fcc0, s1, t1 + LD a3, 0 * SIZE(X) + CMPLT $fcc1, s2, t2 + daddu X, X, INCX + + CMPLT $fcc2, s3, t3 + LD a4, 0 * SIZE(X) + CMPLT $fcc3, s4, t4 + daddu X, X, INCX + + CMOVT s1, t1, $fcc0 + movt x1, TEMP, $fcc0 + + CMOVT s2, t2, $fcc1 + movt x2, TEMP, $fcc1 + + CMOVT s3, t3, $fcc2 + movt x3, TEMP, $fcc2 + + CMOVT s4, t4, $fcc3 + movt x4, TEMP, $fcc3 + + daddiu TEMP, TEMP, 4 + daddiu I, I, -1 + + FABS t1, a5 + LD a5, 0 * SIZE(X) + FABS t2, a6 + daddu X, X, INCX + + FABS t3, a7 + LD a6, 0 * SIZE(X) + FABS t4, a8 + daddu X, X, INCX + + CMPLT $fcc0, s1, t1 + LD a7, 0 * SIZE(X) + CMPLT $fcc1, s2, t2 + daddu X, X, INCX + + CMPLT $fcc2, s3, t3 + LD a8, 0 * SIZE(X) + CMPLT $fcc3, s4, t4 + daddu X, X, INCX + + CMOVT s1, t1, $fcc0 + movt x1, TEMP, $fcc0 + + CMOVT s2, t2, $fcc1 + movt x2, TEMP, $fcc1 + + CMOVT s3, t3, $fcc2 + movt x3, TEMP, $fcc2 + + CMOVT s4, t4, $fcc3 + movt x4, TEMP, $fcc3 + + bgtz I, .L12 + daddiu TEMP, TEMP, 4 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + + CMOVT s1, t1, $fcc0 + movt x1, TEMP, $fcc0 + CMOVT s2, t2, $fcc1 + movt x2, TEMP, $fcc1 + + CMOVT s3, t3, $fcc2 + movt x3, TEMP, $fcc2 + CMOVT s4, t4, 
$fcc3 + movt x4, TEMP, $fcc3 + + FABS t1, a5 + daddiu TEMP, TEMP, 4 + FABS t2, a6 + NOP + FABS t3, a7 + FABS t4, a8 + + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + + CMOVT s1, t1, $fcc0 + movt x1, TEMP, $fcc0 + CMOVT s2, t2, $fcc1 + movt x2, TEMP, $fcc1 + + CMOVT s3, t3, $fcc2 + movt x3, TEMP, $fcc2 + CMOVT s4, t4, $fcc3 + movt x4, TEMP, $fcc3 + + daddiu TEMP, TEMP, 4 + daddiu x2, x2, 1 + daddiu x3, x3, 2 + daddiu x4, x4, 3 + .align 3 + +.L15: + andi I, N, 7 + blez I, .L998 + NOP + .align 3 + +.L16: + LD a1, 0 * SIZE(X) + daddu X, X, INCX + FABS t1, a1 + daddiu I, I, -1 + + CMPLT $fcc0, s1, t1 + NOP + CMOVT s1, t1, $fcc0 + movt x1, TEMP, $fcc0 + + bgtz I, .L16 + daddiu TEMP, TEMP, 1 + .align 3 + +.L998: + CMPLT $fcc0, s1, s2 + CMPLT $fcc1, s3, s4 + + CMOVT s1, s2, $fcc0 + movt x1, x2, $fcc0 + CMOVT s3, s4, $fcc1 + movt x3, x4, $fcc1 + + CMPLT $fcc0, s1, s3 + CMOVT s1, s3, $fcc0 + movt x1, x3, $fcc0 + .align 3 + +.L999: + j $31 + NOP + + EPILOGUE diff --git a/kernel/mips64/iamin.S b/kernel/mips64/iamin.S new file mode 100644 index 0000000000..131aa881b6 --- /dev/null +++ b/kernel/mips64/iamin.S @@ -0,0 +1,288 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 +#define X $5 +#define INCX $6 + +#define I $3 +#define TEMP $7 + +#define a1 $f4 +#define a2 $f5 +#define a3 $f6 +#define a4 $f7 +#define a5 $f8 +#define a6 $f9 +#define a7 $f10 +#define a8 $f11 + +#define t1 $f12 +#define t2 $f13 +#define t3 $f14 +#define t4 $f15 + +#define s1 $f0 +#define s2 $f1 +#define s3 $f2 +#define s4 $f3 + +#define x1 $2 +#define x2 $8 +#define x3 $9 +#define x4 $10 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + blez N, .L999 + li x1, 0 + + blez INCX, .L999 + dsll INCX, INCX, BASE_SHIFT + + LD a1, 0 * SIZE(X) + daddiu N, N, -1 + + blez N, .L999 + li x1, 1 + + FABS s1, a1 + daddu X, X, INCX + FABS s2, a1 + li x2, 1 + + FABS s3, a1 + dsra I, N, 3 + FABS s4, a1 + li x3, 1 + + li TEMP, 2 + + blez I, .L15 + li x4, 1 + + LD a1, 0 * SIZE(X) + daddu X, X, INCX + LD a2, 0 * SIZE(X) + daddu X, X, INCX + LD a3, 0 * SIZE(X) + daddu X, X, INCX + LD a4, 0 * SIZE(X) + daddu X, X, INCX + LD a5, 0 * SIZE(X) + daddu X, X, INCX + LD a6, 0 * SIZE(X) + daddu X, X, INCX + LD a7, 0 * SIZE(X) + daddu X, X, INCX + LD a8, 0 * SIZE(X) + daddiu I, I, -1 + + blez I, .L13 + daddu X, X, INCX + .align 3 + +.L12: + FABS t1, a1 + LD a1, 0 * SIZE(X) + FABS t2, a2 + daddu X, X, INCX + + FABS t3, a3 + LD a2, 0 * SIZE(X) + FABS t4, a4 + daddu X, X, INCX + + CMPLT $fcc0, t1, s1 + LD a3, 0 * SIZE(X) + CMPLT $fcc1, t2, s2 + daddu X, X, INCX + + CMPLT $fcc2, t3, s3 + LD a4, 0 * SIZE(X) + CMPLT $fcc3, t4, s4 + daddu X, X, INCX + + CMOVT s1, t1, $fcc0 + movt x1, TEMP, $fcc0 + + CMOVT s2, t2, $fcc1 + movt x2, TEMP, $fcc1 + + CMOVT s3, t3, $fcc2 + movt x3, TEMP, $fcc2 + + CMOVT s4, t4, $fcc3 + movt x4, TEMP, $fcc3 + + daddiu TEMP, TEMP, 4 + daddiu I, I, -1 + + FABS t1, a5 + LD a5, 0 * SIZE(X) + FABS t2, a6 + daddu X, X, INCX + + FABS t3, a7 + LD a6, 0 * SIZE(X) + FABS t4, a8 + daddu X, X, INCX + + CMPLT $fcc0, t1, s1 + LD a7, 0 * SIZE(X) + CMPLT $fcc1, t2, s2 + daddu X, X, INCX + + CMPLT $fcc2, t3, s3 + LD a8, 0 * SIZE(X) + CMPLT $fcc3, t4, s4 + daddu X, X, INCX + + CMOVT s1, t1, $fcc0 + movt x1, TEMP, $fcc0 + + CMOVT s2, t2, $fcc1 + movt x2, TEMP, $fcc1 + + CMOVT s3, t3, $fcc2 + movt x3, TEMP, $fcc2 + + CMOVT s4, t4, $fcc3 + movt x4, TEMP, $fcc3 + + bgtz I, .L12 + daddiu TEMP, TEMP, 4 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + + CMPLT $fcc0, t1, s1 + CMPLT $fcc1, t2, s2 + CMPLT $fcc2, t3, s3 + CMPLT $fcc3, t4, s4 + + CMOVT s1, t1, $fcc0 + movt x1, TEMP, $fcc0 + CMOVT s2, t2, $fcc1 + movt x2, TEMP, $fcc1 + + CMOVT s3, t3, $fcc2 + movt x3, TEMP, $fcc2 + CMOVT s4, t4, $fcc3 + movt x4, TEMP, $fcc3 + + FABS t1, a5 + daddiu TEMP, TEMP, 4 + FABS t2, a6 + NOP + FABS t3, a7 + FABS t4, a8 + + CMPLT $fcc0, t1, s1 + CMPLT $fcc1, t2, s2 + CMPLT $fcc2, t3, s3 + CMPLT $fcc3, t4, s4 + + CMOVT s1, t1, $fcc0 + movt x1, TEMP, $fcc0 + CMOVT s2, t2, $fcc1 + movt x2, TEMP, $fcc1 + + CMOVT s3, t3, $fcc2 + movt x3, TEMP, $fcc2 + CMOVT s4, t4, $fcc3 + movt x4, TEMP, $fcc3 + + daddiu TEMP, TEMP, 4 + daddiu x2, x2, 1 + daddiu x3, x3, 2 + daddiu x4, x4, 3 + .align 3 + +.L15: + andi I, N, 7 + blez I, .L998 + NOP + .align 3 + +.L16: + LD a1, 0 * SIZE(X) + daddu X, X, INCX + FABS t1, a1 + daddiu I, I, -1 + + CMPLT $fcc0, t1, s1 + NOP + CMOVT s1, t1, $fcc0 + movt x1, TEMP, $fcc0 + + bgtz I, .L16 + daddiu TEMP, TEMP, 1 + .align 3 + +.L998: + CMPLT $fcc0, s2, s1 + CMPLT $fcc1, s4, s3 + + CMOVT s1, s2, $fcc0 + movt x1, x2, $fcc0 + CMOVT 
s3, s4, $fcc1 + movt x3, x4, $fcc1 + + CMPLT $fcc0, s3, s1 + CMOVT s1, s3, $fcc0 + movt x1, x3, $fcc0 + .align 3 + +.L999: + j $31 + NOP + + EPILOGUE diff --git a/kernel/mips64/imax.S b/kernel/mips64/imax.S new file mode 100644 index 0000000000..ec9d3fcdf3 --- /dev/null +++ b/kernel/mips64/imax.S @@ -0,0 +1,262 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 +#define X $5 +#define INCX $6 + +#define I $3 +#define TEMP $7 + +#define a1 $f4 +#define a2 $f5 +#define a3 $f6 +#define a4 $f7 +#define a5 $f8 +#define a6 $f9 +#define a7 $f10 +#define a8 $f11 + +#define t1 $f12 +#define t2 $f13 +#define t3 $f14 +#define t4 $f15 + +#define s1 $f0 +#define s2 $f1 +#define s3 $f2 +#define s4 $f3 + +#define x1 $2 +#define x2 $8 +#define x3 $9 +#define x4 $10 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + blez N, .L999 + li x1, 0 + + blez INCX, .L999 + dsll INCX, INCX, BASE_SHIFT + + LD s1, 0 * SIZE(X) + daddiu N, N, -1 + + blez N, .L999 + li x1, 1 + + daddu X, X, INCX + MOV s2, s1 + li x2, 1 + + MOV s3, s1 + dsra I, N, 3 + MOV s4, s1 + li x3, 1 + + li TEMP, 2 + + blez I, .L15 + li x4, 1 + + LD a1, 0 * SIZE(X) + daddu X, X, INCX + LD a2, 0 * SIZE(X) + daddu X, X, INCX + LD a3, 0 * SIZE(X) + daddu X, X, INCX + LD a4, 0 * SIZE(X) + daddu X, X, INCX + LD a5, 0 * SIZE(X) + daddu X, X, INCX + LD a6, 0 * SIZE(X) + daddiu I, I, -1 + + blez I, .L13 + daddu X, X, INCX + .align 3 + +.L12: + CMPLT $fcc0, s1, a1 + LD a7, 0 * SIZE(X) + CMPLT $fcc1, s2, a2 + daddu X, X, INCX + CMPLT $fcc2, s3, a3 + LD a8, 0 * SIZE(X) + CMPLT $fcc3, s4, a4 + daddu X, X, INCX + + CMOVT s1, a1, $fcc0 + movt x1, TEMP, $fcc0 + CMOVT s2, a2, $fcc1 + movt x2, TEMP, $fcc1 + + LD a1, 0 * SIZE(X) + daddu X, X, INCX + LD a2, 0 * SIZE(X) + daddu X, X, INCX + + CMOVT s3, a3, $fcc2 + movt x3, TEMP, $fcc2 + CMOVT s4, a4, $fcc3 + movt x4, TEMP, $fcc3 + + daddiu TEMP, TEMP, 4 + daddiu I, I, -1 + + CMPLT $fcc0, s1, a5 + LD a3, 0 * SIZE(X) + CMPLT $fcc1, s2, a6 + daddu X, X, INCX + + CMPLT $fcc2, s3, a7 + LD a4, 0 * SIZE(X) + CMPLT $fcc3, s4, a8 + daddu X, X, INCX + + CMOVT s1, a5, $fcc0 + movt x1, TEMP, $fcc0 + CMOVT s2, a6, $fcc1 + movt x2, TEMP, $fcc1 + + CMOVT s3, a7, $fcc2 + movt x3, TEMP, $fcc2 + CMOVT s4, a8, $fcc3 + movt x4, TEMP, $fcc3 + + LD a5, 0 * SIZE(X) + daddu X, X, INCX + LD a6, 0 * SIZE(X) + daddu X, X, INCX + + bgtz I, .L12 + daddiu TEMP, TEMP, 4 + .align 3 + +.L13: + CMPLT $fcc0, s1, a1 + LD a7, 0 * SIZE(X) + CMPLT $fcc1, s2, a2 + daddu X, X, INCX + CMPLT $fcc2, s3, a3 + LD a8, 0 * SIZE(X) + CMPLT $fcc3, s4, a4 + daddu X, X, INCX + + CMOVT s1, a1, $fcc0 + movt x1, TEMP, $fcc0 + CMOVT s2, a2, $fcc1 + movt x2, TEMP, $fcc1 + + CMOVT s3, a3, $fcc2 + movt x3, TEMP, $fcc2 + CMOVT s4, a4, $fcc3 + movt x4, TEMP, $fcc3 + + CMPLT $fcc0, s1, a5 + daddiu TEMP, TEMP, 4 + CMPLT $fcc1, s2, a6 + NOP + + CMPLT $fcc2, s3, a7 + CMPLT $fcc3, s4, a8 + + CMOVT s1, a5, $fcc0 + movt x1, TEMP, $fcc0 + CMOVT s2, a6, $fcc1 + movt x2, TEMP, $fcc1 + + CMOVT s3, a7, $fcc2 + movt x3, TEMP, $fcc2 + CMOVT s4, a8, $fcc3 + movt x4, TEMP, $fcc3 + + daddiu TEMP, TEMP, 4 + daddiu x2, x2, 1 + daddiu x3, x3, 2 + daddiu x4, x4, 3 + .align 3 + +.L15: + andi I, N, 7 + blez I, .L998 + NOP + .align 3 + +.L16: + LD a1, 0 * SIZE(X) + daddu X, X, INCX + daddiu I, I, -1 + + CMPLT $fcc0, s1, a1 + NOP + CMOVT s1, a1, $fcc0 + movt x1, TEMP, $fcc0 + + bgtz I, .L16 + daddiu TEMP, TEMP, 1 + .align 3 + +.L998: + CMPLT $fcc0, s1, s2 + CMPLT $fcc1, s3, s4 + + CMOVT s1, s2, $fcc0 + movt x1, x2, $fcc0 + CMOVT s3, s4, $fcc1 + movt x3, x4, $fcc1 + + CMPLT $fcc0, s1, s3 + CMOVT s1, s3, $fcc0 + movt x1, x3, $fcc0 + .align 3 + +.L999: + j $31 + NOP + + EPILOGUE diff --git a/kernel/mips64/imin.S b/kernel/mips64/imin.S new file mode 100644 index 0000000000..a247c833c2 --- /dev/null 
+++ b/kernel/mips64/imin.S @@ -0,0 +1,262 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 +#define X $5 +#define INCX $6 + +#define I $3 +#define TEMP $7 + +#define a1 $f4 +#define a2 $f5 +#define a3 $f6 +#define a4 $f7 +#define a5 $f8 +#define a6 $f9 +#define a7 $f10 +#define a8 $f11 + +#define t1 $f12 +#define t2 $f13 +#define t3 $f14 +#define t4 $f15 + +#define s1 $f0 +#define s2 $f1 +#define s3 $f2 +#define s4 $f3 + +#define x1 $2 +#define x2 $8 +#define x3 $9 +#define x4 $10 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + blez N, .L999 + li x1, 0 + + blez INCX, .L999 + dsll INCX, INCX, BASE_SHIFT + + LD s1, 0 * SIZE(X) + daddiu N, N, -1 + + blez N, .L999 + li x1, 1 + + daddu X, X, INCX + MOV s2, s1 + li x2, 1 + + MOV s3, s1 + dsra I, N, 3 + MOV s4, s1 + li x3, 1 + + li TEMP, 2 + + blez I, .L15 + li x4, 1 + + LD a1, 0 * SIZE(X) + daddu X, X, INCX + LD a2, 0 * SIZE(X) + daddu X, X, INCX + LD a3, 0 * SIZE(X) + daddu X, X, INCX + LD a4, 0 * SIZE(X) + daddu X, X, INCX + LD a5, 0 * SIZE(X) + daddu X, X, INCX + LD a6, 0 * SIZE(X) + daddiu I, I, -1 + + blez I, .L13 + daddu X, X, INCX + .align 3 + +.L12: + CMPLT $fcc0, a1, s1 + LD a7, 0 * SIZE(X) + CMPLT $fcc1, a2, s2 + daddu X, X, INCX + CMPLT $fcc2, a3, s3 + LD a8, 0 * SIZE(X) + CMPLT $fcc3, a4, s4 + daddu X, X, INCX + + CMOVT s1, a1, $fcc0 + movt x1, TEMP, $fcc0 + CMOVT s2, a2, $fcc1 + movt x2, TEMP, $fcc1 + + LD a1, 0 * SIZE(X) + daddu X, X, INCX + LD a2, 0 * SIZE(X) + daddu X, X, INCX + + CMOVT s3, a3, $fcc2 + movt x3, TEMP, $fcc2 + CMOVT s4, a4, $fcc3 + movt x4, TEMP, $fcc3 + + daddiu TEMP, TEMP, 4 + daddiu I, I, -1 + + CMPLT $fcc0, a5, s1 + LD a3, 0 * SIZE(X) + CMPLT $fcc1, a6, s2 + daddu X, X, INCX + + CMPLT $fcc2, a7, s3 + LD a4, 0 * SIZE(X) + CMPLT $fcc3, a8, s4 + daddu X, X, INCX + + CMOVT s1, a5, $fcc0 + movt x1, TEMP, $fcc0 + CMOVT s2, a6, $fcc1 + movt x2, TEMP, $fcc1 + + CMOVT s3, a7, $fcc2 + movt x3, TEMP, $fcc2 + CMOVT s4, a8, $fcc3 + movt x4, TEMP, $fcc3 + + LD a5, 0 * SIZE(X) + daddu X, X, INCX + LD a6, 0 * SIZE(X) + daddu X, X, INCX + + bgtz I, .L12 + daddiu TEMP, TEMP, 4 + .align 3 + +.L13: + CMPLT $fcc0, a1, s1 + LD a7, 0 * SIZE(X) + CMPLT $fcc1, a2, s2 + daddu X, X, INCX + CMPLT $fcc2, a3, s3 + LD a8, 0 * SIZE(X) + CMPLT $fcc3, a4, s4 + daddu X, X, INCX + + CMOVT s1, a1, $fcc0 + movt x1, TEMP, $fcc0 + CMOVT s2, a2, $fcc1 + movt x2, TEMP, $fcc1 + + CMOVT s3, a3, $fcc2 + movt x3, TEMP, $fcc2 + CMOVT s4, a4, $fcc3 + movt x4, TEMP, $fcc3 + + CMPLT $fcc0, a5, s1 + daddiu TEMP, TEMP, 4 + CMPLT $fcc1, a6, s2 + NOP + + CMPLT $fcc2, a7, s3 + CMPLT $fcc3, a8, s4 + + CMOVT s1, a5, $fcc0 + movt x1, TEMP, $fcc0 + CMOVT s2, a6, $fcc1 + movt x2, TEMP, $fcc1 + + CMOVT s3, a7, $fcc2 + movt x3, TEMP, $fcc2 + CMOVT s4, a8, $fcc3 + movt x4, TEMP, $fcc3 + + daddiu TEMP, TEMP, 4 + daddiu x2, x2, 1 + daddiu x3, x3, 2 + daddiu x4, x4, 3 + .align 3 + +.L15: + andi I, N, 7 + blez I, .L998 + NOP + .align 3 + +.L16: + LD a1, 0 * SIZE(X) + daddu X, X, INCX + daddiu I, I, -1 + + CMPLT $fcc0, a1, s1 + NOP + CMOVT s1, a1, $fcc0 + movt x1, TEMP, $fcc0 + + bgtz I, .L16 + daddiu TEMP, TEMP, 1 + .align 3 + +.L998: + CMPLT $fcc0, s2, s1 + CMPLT $fcc1, s4, s3 + + CMOVT s1, s2, $fcc0 + movt x1, x2, $fcc0 + CMOVT s3, s4, $fcc1 + movt x3, x4, $fcc1 + + CMPLT $fcc0, s3, s1 + CMOVT s1, s3, $fcc0 + movt x1, x3, $fcc0 + .align 3 + +.L999: + j $31 + NOP + + EPILOGUE diff --git a/kernel/mips64/izamax.S b/kernel/mips64/izamax.S new file mode 100644 index 0000000000..12e26c9e13 --- 
/dev/null +++ b/kernel/mips64/izamax.S @@ -0,0 +1,268 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 +#define X $5 +#define INCX $6 + +#define I $3 +#define TEMP $7 + +#define a1 $f4 +#define a2 $f5 +#define a3 $f6 +#define a4 $f7 +#define a5 $f8 +#define a6 $f9 +#define a7 $f10 +#define a8 $f11 + +#define t1 $f12 +#define t2 $f13 +#define t3 $f14 +#define t4 $f15 +#define t5 $f16 +#define t6 $f17 +#define t7 $f18 +#define t8 $f19 + +#define s1 $f0 +#define s2 $f1 +#define s3 $f2 +#define s4 $f3 + +#define x1 $2 +#define x2 $8 +#define x3 $9 +#define x4 $10 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + blez N, .L999 + li x1, 0 + + blez INCX, .L999 + dsll INCX, INCX, ZBASE_SHIFT + + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + + FABS t1, a1 + FABS t2, a2 + + ADD s1, t1, t2 + ADD s2, t1, t2 + ADD s3, t1, t2 + ADD s4, t1, t2 + + daddiu N, N, -1 + + blez N, .L999 + li x1, 1 + + daddu X, X, INCX + li x2, 1 + + dsra I, N, 2 + li x3, 1 + + li TEMP, 2 + + blez I, .L15 + li x4, 1 + + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + daddu X, X, INCX + LD a3, 0 * SIZE(X) + LD a4, 1 * SIZE(X) + daddu X, X, INCX + LD a5, 0 * SIZE(X) + LD a6, 1 * SIZE(X) + daddu X, X, INCX + LD a7, 0 * SIZE(X) + LD a8, 1 * SIZE(X) + daddiu I, I, -1 + + blez I, .L13 + daddu X, X, INCX + .align 3 + +.L12: + FABS t1, a1 + LD a1, 0 * SIZE(X) + FABS t2, a2 + LD a2, 1 * SIZE(X) + + FABS t3, a3 + daddu X, X, INCX + FABS t4, a4 + NOP + + FABS t5, a5 + LD a3, 0 * SIZE(X) + FABS t6, a6 + LD a4, 1 * SIZE(X) + FABS t7, a7 + daddu X, X, INCX + FABS t8, a8 + NOP + + ADD t1, t1, t2 + LD a5, 0 * SIZE(X) + ADD t3, t3, t4 + LD a6, 1 * SIZE(X) + ADD t5, t5, t6 + daddu X, X, INCX + ADD t7, t7, t8 + NOP + + CMPLT $fcc0, s1, t1 + LD a7, 0 * SIZE(X) + CMPLT $fcc1, s2, t3 + LD a8, 1 * SIZE(X) + CMPLT $fcc2, s3, t5 + daddu X, X, INCX + CMPLT $fcc3, s4, t7 + daddiu I, I, -1 + + CMOVT s1, t1, $fcc0 + movt x1, TEMP, $fcc0 + CMOVT s2, t3, $fcc1 + movt x2, TEMP, $fcc1 + + CMOVT s3, t5, $fcc2 + movt x3, TEMP, $fcc2 + CMOVT s4, t7, $fcc3 + movt x4, TEMP, $fcc3 + + bgtz I, .L12 + daddiu TEMP, TEMP, 4 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + + FABS t5, a5 + FABS t6, a6 + FABS t7, a7 + FABS t8, a8 + + ADD t1, t1, t2 + ADD t3, t3, t4 + ADD t5, t5, t6 + ADD t7, t7, t8 + + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t3 + CMPLT $fcc2, s3, t5 + CMPLT $fcc3, s4, t7 + + CMOVT s1, t1, $fcc0 + movt x1, TEMP, $fcc0 + CMOVT s2, t3, $fcc1 + movt x2, TEMP, $fcc1 + + CMOVT s3, t5, $fcc2 + movt x3, TEMP, $fcc2 + CMOVT s4, t7, $fcc3 + movt x4, TEMP, $fcc3 + + daddiu TEMP, TEMP, 4 + daddiu x2, x2, 1 + daddiu x3, x3, 2 + daddiu x4, x4, 3 + .align 3 + +.L15: + andi I, N, 3 + blez I, .L998 + NOP + .align 3 + +.L16: + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + daddu X, X, INCX + + FABS t1, a1 + FABS t2, a2 + + ADD t1, t1, t2 + + daddiu I, I, -1 + + CMPLT $fcc0, s1, t1 + NOP + CMOVT s1, t1, $fcc0 + movt x1, TEMP, $fcc0 + + bgtz I, .L16 + daddiu TEMP, TEMP, 1 + .align 3 + +.L998: + CMPLT $fcc0, s1, s2 + CMPLT $fcc1, s3, s4 + + CMOVT s1, s2, $fcc0 + movt x1, x2, $fcc0 + CMOVT s3, s4, $fcc1 + movt x3, x4, $fcc1 + + CMPLT $fcc0, s1, s3 + CMOVT s1, s3, $fcc0 + movt x1, x3, $fcc0 + .align 3 + +.L999: + j $31 + NOP + + EPILOGUE diff --git a/kernel/mips64/izamin.S b/kernel/mips64/izamin.S new file mode 100644 index 0000000000..af3d75056b --- /dev/null +++ b/kernel/mips64/izamin.S @@ -0,0 +1,268 @@ +/*********************************************************************/ +/* Copyright 2009, 
2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 +#define X $5 +#define INCX $6 + +#define I $3 +#define TEMP $7 + +#define a1 $f4 +#define a2 $f5 +#define a3 $f6 +#define a4 $f7 +#define a5 $f8 +#define a6 $f9 +#define a7 $f10 +#define a8 $f11 + +#define t1 $f12 +#define t2 $f13 +#define t3 $f14 +#define t4 $f15 +#define t5 $f16 +#define t6 $f17 +#define t7 $f18 +#define t8 $f19 + +#define s1 $f0 +#define s2 $f1 +#define s3 $f2 +#define s4 $f3 + +#define x1 $2 +#define x2 $8 +#define x3 $9 +#define x4 $10 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + blez N, .L999 + li x1, 0 + + blez INCX, .L999 + dsll INCX, INCX, ZBASE_SHIFT + + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + + FABS t1, a1 + FABS t2, a2 + + ADD s1, t1, t2 + ADD s2, t1, t2 + ADD s3, t1, t2 + ADD s4, t1, t2 + + daddiu N, N, -1 + + blez N, .L999 + li x1, 1 + + daddu X, X, INCX + li x2, 1 + + dsra I, N, 2 + li x3, 1 + + li TEMP, 2 + + blez I, .L15 + li x4, 1 + + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + daddu X, X, INCX + LD a3, 0 * SIZE(X) + LD a4, 1 * SIZE(X) + daddu X, X, INCX + LD a5, 0 * SIZE(X) + LD a6, 1 * SIZE(X) + daddu X, X, INCX + LD a7, 0 * SIZE(X) + LD a8, 1 * SIZE(X) + daddiu I, I, -1 + + blez I, .L13 + daddu X, X, INCX + .align 3 + +.L12: + FABS t1, a1 + LD a1, 0 * SIZE(X) + FABS t2, a2 + LD a2, 1 * SIZE(X) + + FABS t3, a3 + daddu X, X, INCX + FABS t4, a4 + NOP + + FABS t5, a5 + LD a3, 0 * SIZE(X) + FABS t6, a6 + LD a4, 1 * SIZE(X) + FABS t7, a7 + daddu X, X, INCX + FABS t8, a8 + NOP + + ADD t1, t1, t2 + LD a5, 0 * SIZE(X) + ADD t3, t3, t4 + LD a6, 1 * SIZE(X) + ADD t5, t5, t6 + daddu X, X, INCX + ADD t7, t7, t8 + NOP + + CMPLT $fcc0, t1, s1 + LD a7, 0 * 
SIZE(X) + CMPLT $fcc1, t3, s2 + LD a8, 1 * SIZE(X) + CMPLT $fcc2, t5, s3 + daddu X, X, INCX + CMPLT $fcc3, t7, s4 + daddiu I, I, -1 + + CMOVT s1, t1, $fcc0 + movt x1, TEMP, $fcc0 + CMOVT s2, t3, $fcc1 + movt x2, TEMP, $fcc1 + + CMOVT s3, t5, $fcc2 + movt x3, TEMP, $fcc2 + CMOVT s4, t7, $fcc3 + movt x4, TEMP, $fcc3 + + bgtz I, .L12 + daddiu TEMP, TEMP, 4 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + + FABS t5, a5 + FABS t6, a6 + FABS t7, a7 + FABS t8, a8 + + ADD t1, t1, t2 + ADD t3, t3, t4 + ADD t5, t5, t6 + ADD t7, t7, t8 + + CMPLT $fcc0, t1, s1 + CMPLT $fcc1, t3, s2 + CMPLT $fcc2, t5, s3 + CMPLT $fcc3, t7, s4 + + CMOVT s1, t1, $fcc0 + movt x1, TEMP, $fcc0 + CMOVT s2, t3, $fcc1 + movt x2, TEMP, $fcc1 + + CMOVT s3, t5, $fcc2 + movt x3, TEMP, $fcc2 + CMOVT s4, t7, $fcc3 + movt x4, TEMP, $fcc3 + + daddiu TEMP, TEMP, 4 + daddiu x2, x2, 1 + daddiu x3, x3, 2 + daddiu x4, x4, 3 + .align 3 + +.L15: + andi I, N, 3 + blez I, .L998 + NOP + .align 3 + +.L16: + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + daddu X, X, INCX + + FABS t1, a1 + FABS t2, a2 + + ADD t1, t1, t2 + + daddiu I, I, -1 + + CMPLT $fcc0, t1, s1 + NOP + CMOVT s1, t1, $fcc0 + movt x1, TEMP, $fcc0 + + bgtz I, .L16 + daddiu TEMP, TEMP, 1 + .align 3 + +.L998: + CMPLT $fcc0, s2, s1 + CMPLT $fcc1, s4, s3 + + CMOVT s1, s2, $fcc0 + movt x1, x2, $fcc0 + CMOVT s3, s4, $fcc1 + movt x3, x4, $fcc1 + + CMPLT $fcc0, s3, s1 + CMOVT s1, s3, $fcc0 + movt x1, x3, $fcc0 + .align 3 + +.L999: + j $31 + NOP + + EPILOGUE diff --git a/kernel/mips64/max.S b/kernel/mips64/max.S new file mode 100644 index 0000000000..a432f12255 --- /dev/null +++ b/kernel/mips64/max.S @@ -0,0 +1,213 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 +#define X $5 +#define INCX $6 + +#define I $2 +#define TEMP $3 + +#define a1 $f4 +#define a2 $f5 +#define a3 $f6 +#define a4 $f7 +#define a5 $f8 +#define a6 $f9 +#define a7 $f10 +#define a8 $f11 + +#define s1 $f0 +#define s2 $f1 +#define s3 $f2 +#define s4 $f3 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + blez N, .L999 + MTC $0, s1 + + blez INCX, .L999 + dsll INCX, INCX, BASE_SHIFT + + LD s1, 0 * SIZE(X) + daddiu N, N, -1 + + daddu X, X, INCX + NOP + + blez N, .L999 + MOV s2, s1 + + MOV s3, s1 + dsra I, N, 3 + + blez I, .L15 + MOV s4, s1 + + LD a1, 0 * SIZE(X) + daddu X, X, INCX + LD a2, 0 * SIZE(X) + daddu X, X, INCX + LD a3, 0 * SIZE(X) + daddu X, X, INCX + LD a4, 0 * SIZE(X) + daddu X, X, INCX + LD a5, 0 * SIZE(X) + daddu X, X, INCX + LD a6, 0 * SIZE(X) + daddiu I, I, -1 + + blez I, .L13 + daddu X, X, INCX + .align 3 + +.L12: + CMPLT $fcc0, s1, a1 + LD a7, 0 * SIZE(X) + CMPLT $fcc1, s2, a2 + daddu X, X, INCX + + CMPLT $fcc2, s3, a3 + LD a8, 0 * SIZE(X) + CMPLT $fcc3, s4, a4 + daddu X, X, INCX + + CMOVT s1, a1, $fcc0 + LD a1, 0 * SIZE(X) + CMOVT s2, a2, $fcc1 + daddu X, X, INCX + + CMOVT s3, a3, $fcc2 + LD a2, 0 * SIZE(X) + CMOVT s4, a4, $fcc3 + daddu X, X, INCX + + CMPLT $fcc0, s1, a5 + LD a3, 0 * SIZE(X) + CMPLT $fcc1, s2, a6 + daddu X, X, INCX + CMPLT $fcc2, s3, a7 + LD a4, 0 * SIZE(X) + CMPLT $fcc3, s4, a8 + daddu X, X, INCX + + CMOVT s1, a5, $fcc0 + LD a5, 0 * SIZE(X) + CMOVT s2, a6, $fcc1 + daddu X, X, INCX + + CMOVT s3, a7, $fcc2 + LD a6, 0 * SIZE(X) + CMOVT s4, a8, $fcc3 + daddiu I, I, -1 + + bgtz I, .L12 + daddu X, X, INCX + .align 3 + +.L13: + CMPLT $fcc0, s1, a1 + LD a7, 0 * SIZE(X) + CMPLT $fcc1, s2, a2 + daddu X, X, INCX + + CMPLT $fcc2, s3, a3 + LD a8, 0 * SIZE(X) + CMPLT $fcc3, s4, a4 + daddu X, X, INCX + + CMOVT s1, a1, $fcc0 + CMOVT s2, a2, $fcc1 + CMOVT s3, a3, $fcc2 + CMOVT s4, a4, $fcc3 + + CMPLT $fcc0, s1, a5 + CMPLT $fcc1, s2, a6 + CMPLT $fcc2, s3, a7 + CMPLT $fcc3, s4, a8 + + CMOVT s1, a5, $fcc0 + CMOVT s2, a6, $fcc1 + CMOVT s3, a7, $fcc2 + CMOVT s4, a8, $fcc3 + .align 3 + +.L15: + andi I, N, 7 + + blez I, .L998 + NOP + .align 3 + +.L16: + LD a1, 0 * SIZE(X) + daddiu I, I, -1 + + CMPLT $fcc0, s1, a1 + + CMOVT s1, a1, $fcc0 + + bgtz I, .L16 + daddu X, X, INCX + .align 3 + +.L998: + CMPLT $fcc0, s1, s2 + CMPLT $fcc1, s3, s4 + + CMOVT s1, s2, $fcc0 + CMOVT s3, s4, $fcc1 + + CMPLT $fcc0, s1, s3 + CMOVT s1, s3, $fcc0 + .align 3 + +.L999: + j $31 + NOP + + EPILOGUE diff --git a/kernel/mips64/min.S b/kernel/mips64/min.S new file mode 100644 index 0000000000..33cfc81f3d --- /dev/null +++ b/kernel/mips64/min.S @@ -0,0 +1,213 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 +#define X $5 +#define INCX $6 + +#define I $2 +#define TEMP $3 + +#define a1 $f4 +#define a2 $f5 +#define a3 $f6 +#define a4 $f7 +#define a5 $f8 +#define a6 $f9 +#define a7 $f10 +#define a8 $f11 + +#define s1 $f0 +#define s2 $f1 +#define s3 $f2 +#define s4 $f3 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + blez N, .L999 + MTC $0, s1 + + blez INCX, .L999 + dsll INCX, INCX, BASE_SHIFT + + LD s1, 0 * SIZE(X) + daddiu N, N, -1 + + daddu X, X, INCX + NOP + + blez N, .L999 + MOV s2, s1 + + MOV s3, s1 + dsra I, N, 3 + + blez I, .L15 + MOV s4, s1 + + LD a1, 0 * SIZE(X) + daddu X, X, INCX + LD a2, 0 * SIZE(X) + daddu X, X, INCX + LD a3, 0 * SIZE(X) + daddu X, X, INCX + LD a4, 0 * SIZE(X) + daddu X, X, INCX + LD a5, 0 * SIZE(X) + daddu X, X, INCX + LD a6, 0 * SIZE(X) + daddiu I, I, -1 + + blez I, .L13 + daddu X, X, INCX + .align 3 + +.L12: + CMPLT $fcc0, a1, s1 + LD a7, 0 * SIZE(X) + CMPLT $fcc1, a2, s2 + daddu X, X, INCX + + CMPLT $fcc2, a3, s3 + LD a8, 0 * SIZE(X) + CMPLT $fcc3, a4, s4 + daddu X, X, INCX + + CMOVT s1, a1, $fcc0 + LD a1, 0 * SIZE(X) + CMOVT s2, a2, $fcc1 + daddu X, X, INCX + + CMOVT s3, a3, $fcc2 + LD a2, 0 * SIZE(X) + CMOVT s4, a4, $fcc3 + daddu X, X, INCX + + CMPLT $fcc0, a5, s1 + LD a3, 0 * SIZE(X) + CMPLT $fcc1, a6, s2 + daddu X, X, INCX + CMPLT $fcc2, a7, s3 + LD a4, 0 * SIZE(X) + CMPLT $fcc3, a8, s4 + daddu X, X, INCX + + CMOVT s1, a5, $fcc0 + LD a5, 0 * SIZE(X) + CMOVT s2, a6, $fcc1 + daddu X, X, INCX + + CMOVT s3, a7, $fcc2 + LD a6, 0 * SIZE(X) + CMOVT s4, a8, $fcc3 + daddiu I, I, -1 + + bgtz I, .L12 + daddu X, X, INCX + .align 3 + +.L13: + CMPLT $fcc0, a1, s1 + LD a7, 0 * SIZE(X) + CMPLT $fcc1, a2, s2 + daddu X, X, INCX + + CMPLT $fcc2, a3, s3 + LD a8, 0 * SIZE(X) + CMPLT $fcc3, a4, s4 + daddu X, X, INCX + + CMOVT s1, a1, $fcc0 + CMOVT s2, a2, $fcc1 + CMOVT s3, a3, $fcc2 + CMOVT s4, a4, $fcc3 + + CMPLT $fcc0, a5, s1 + CMPLT $fcc1, a6, s2 + CMPLT $fcc2, a7, s3 + CMPLT $fcc3, a8, s4 + + CMOVT s1, a5, $fcc0 + CMOVT s2, a6, $fcc1 + CMOVT s3, a7, $fcc2 + CMOVT s4, a8, $fcc3 + .align 3 + +.L15: + andi I, N, 7 + + blez I, .L998 + NOP + .align 3 + +.L16: + LD a1, 0 * SIZE(X) + daddiu I, I, -1 + + CMPLT $fcc0, a1, s1 + + CMOVT s1, a1, $fcc0 + + bgtz I, .L16 + daddu X, X, INCX + .align 3 + +.L998: + 
CMPLT $fcc0, s2, s1 + CMPLT $fcc1, s4, s3 + + CMOVT s1, s2, $fcc0 + CMOVT s3, s4, $fcc1 + + CMPLT $fcc0, s3, s1 + CMOVT s1, s3, $fcc0 + .align 3 + +.L999: + j $31 + NOP + + EPILOGUE diff --git a/kernel/mips64/rot.S b/kernel/mips64/rot.S new file mode 100644 index 0000000000..b94a59c983 --- /dev/null +++ b/kernel/mips64/rot.S @@ -0,0 +1,367 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 +#define X $5 +#define INCX $6 +#define Y $7 +#define INCY $8 + +#define XX $9 +#define YY $10 + +#define C $f17 +#define S $f18 + +#define I $2 +#define TEMP $3 + +#define a1 $f4 +#define a2 $f5 +#define a3 $f6 +#define a4 $f7 + +#define b1 $f8 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 + +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 + + PROLOGUE + + dsll INCX, INCX, BASE_SHIFT + li TEMP, SIZE + + blez N, .L999 + dsll INCY, INCY, BASE_SHIFT + + bne INCX, TEMP, .L20 + dsra I, N, 2 + + bne INCY, TEMP, .L20 + NOP + + blez I, .L15 + daddiu I, I, -1 + + LD a1, 0 * SIZE(X) + LD b1, 0 * SIZE(Y) + LD a2, 1 * SIZE(X) + LD b2, 1 * SIZE(Y) + + LD a3, 2 * SIZE(X) + LD b3, 2 * SIZE(Y) + MUL t1, S, b1 + + LD a4, 3 * SIZE(X) + MUL t2, C, b1 + LD b4, 3 * SIZE(Y) + MUL t3, S, b2 + + blez I, .L13 + MUL t4, C, b2 + .align 3 + +.L12: + MADD t1, t1, C, a1 + LD b1, 4 * SIZE(Y) + NMSUB t2, t2, S, a1 + LD a1, 4 * SIZE(X) + MADD t3, t3, C, a2 + LD b2, 5 * SIZE(Y) + NMSUB t4, t4, S, a2 + LD a2, 5 * SIZE(X) + + ST t1, 0 * SIZE(X) + MUL t1, S, b3 + ST t2, 0 * SIZE(Y) + MUL t2, C, b3 + ST t3, 1 * SIZE(X) + MUL t3, S, b4 + ST t4, 1 * SIZE(Y) + MUL t4, C, b4 + + + MADD t1, t1, C, a3 + LD b3, 6 * SIZE(Y) + NMSUB t2, t2, S, a3 + LD a3, 6 * SIZE(X) + MADD t3, t3, C, a4 + LD b4, 7 * SIZE(Y) + NMSUB t4, t4, S, a4 + LD a4, 7 * SIZE(X) + + ST t1, 2 * SIZE(X) + MUL t1, S, b1 + ST t2, 2 * SIZE(Y) + MUL t2, C, b1 + ST t3, 3 * SIZE(X) + MUL t3, S, b2 + ST t4, 3 * SIZE(Y) + MUL t4, C, b2 + + daddiu I, I, -1 + daddiu X, X, 4 * SIZE + + bgtz I, .L12 + daddiu Y, Y, 4 * SIZE + .align 3 + +.L13: + MADD t1, t1, C, a1 + NMSUB t2, t2, S, a1 + MADD t3, t3, C, a2 + NMSUB t4, t4, S, a2 + + ST t1, 0 * SIZE(X) + MUL t1, S, b3 + ST t2, 0 * SIZE(Y) + MUL t2, C, b3 + ST t3, 1 * SIZE(X) + MUL t3, S, b4 + ST t4, 1 * SIZE(Y) + MUL t4, C, b4 + + MADD t1, t1, C, a3 + NMSUB t2, t2, S, a3 + MADD t3, t3, C, a4 + daddiu X, X, 4 * SIZE + NMSUB t4, t4, S, a4 + daddiu Y, Y, 4 * SIZE + + ST t1, -2 * SIZE(X) + ST t2, -2 * SIZE(Y) + ST t3, -1 * SIZE(X) + ST t4, -1 * SIZE(Y) + .align 3 + +.L15: + andi I, N, 3 + + blez I, .L999 + NOP + .align 3 + +.L16: + LD a1, 0 * SIZE(X) + LD b1, 0 * SIZE(Y) + + MUL t1, S, b1 + MUL t2, C, b1 + + MADD t1, t1, C, a1 + NMSUB t2, t2, S, a1 + + ST t1, 0 * SIZE(X) + ST t2, 0 * SIZE(Y) + + daddiu I, I, -1 + + daddiu X, X, SIZE + daddiu Y, Y, SIZE + + bgtz I, .L16 + NOP + j .L999 + NOP + .align 3 + +.L20: + move XX, X + move YY, Y + + blez I, .L25 + daddiu I, I, -1 + + LD a1, 0 * SIZE(X) + dadd X, X, INCX + LD b1, 0 * SIZE(Y) + dadd Y, Y, INCY + + LD a2, 0 * SIZE(X) + dadd X, X, INCX + LD b2, 0 * SIZE(Y) + dadd Y, Y, INCY + + LD a3, 0 * SIZE(X) + dadd X, X, INCX + LD b3, 0 * SIZE(Y) + dadd Y, Y, INCY + + MUL t1, S, b1 + + LD a4, 0 * SIZE(X) + dadd X, X, INCX + MUL t2, C, b1 + LD b4, 0 * SIZE(Y) + dadd Y, Y, INCY + + MUL t3, S, b2 + blez I, .L23 + MUL t4, C, b2 + .align 3 + +.L22: + MADD t1, t1, C, a1 + LD b1, 0 * SIZE(Y) + dadd Y, Y, INCY + NMSUB t2, t2, S, a1 + LD a1, 0 * SIZE(X) + dadd X, X, INCX + MADD t3, t3, C, a2 + LD b2, 0 * SIZE(Y) + dadd Y, Y, INCY + NMSUB t4, t4, S, a2 + LD a2, 0 * SIZE(X) + dadd X, X, INCX + + ST t1, 0 * SIZE(XX) + dadd XX, XX, INCX + MUL t1, S, b3 + ST t2, 0 * SIZE(YY) + dadd YY, YY, INCY + MUL t2, C, b3 + ST t3, 0 * SIZE(XX) + dadd XX, XX, INCX + MUL t3, S, b4 + ST t4, 0 * SIZE(YY) + dadd YY, YY, INCY + MUL t4, C, b4 + + MADD t1, t1, C, a3 + LD b3, 
0 * SIZE(Y) + dadd Y, Y, INCY + NMSUB t2, t2, S, a3 + LD a3, 0 * SIZE(X) + dadd X, X, INCX + MADD t3, t3, C, a4 + LD b4, 0 * SIZE(Y) + dadd Y, Y, INCY + NMSUB t4, t4, S, a4 + LD a4, 0 * SIZE(X) + dadd X, X, INCX + + ST t1, 0 * SIZE(XX) + dadd XX, XX, INCX + MUL t1, S, b1 + ST t2, 0 * SIZE(YY) + dadd YY, YY, INCY + MUL t2, C, b1 + ST t3, 0 * SIZE(XX) + dadd XX, XX, INCX + MUL t3, S, b2 + ST t4, 0 * SIZE(YY) + MUL t4, C, b2 + daddiu I, I, -1 + + bgtz I, .L22 + dadd YY, YY, INCY + .align 3 + +.L23: + MADD t1, t1, C, a1 + NMSUB t2, t2, S, a1 + MADD t3, t3, C, a2 + NMSUB t4, t4, S, a2 + + ST t1, 0 * SIZE(XX) + dadd XX, XX, INCX + MUL t1, S, b3 + ST t2, 0 * SIZE(YY) + dadd YY, YY, INCY + MUL t2, C, b3 + ST t3, 0 * SIZE(XX) + dadd XX, XX, INCX + MUL t3, S, b4 + ST t4, 0 * SIZE(YY) + dadd YY, YY, INCY + MUL t4, C, b4 + + MADD t1, t1, C, a3 + NMSUB t2, t2, S, a3 + MADD t3, t3, C, a4 + NMSUB t4, t4, S, a4 + + ST t1, 0 * SIZE(XX) + dadd XX, XX, INCX + ST t2, 0 * SIZE(YY) + dadd YY, YY, INCY + ST t3, 0 * SIZE(XX) + dadd XX, XX, INCX + ST t4, 0 * SIZE(YY) + dadd YY, YY, INCY + .align 3 + +.L25: + andi I, N, 3 + + blez I, .L999 + NOP + .align 3 + +.L26: + LD a1, 0 * SIZE(X) + LD b1, 0 * SIZE(Y) + + MUL t1, S, b1 + MUL t2, C, b1 + + MADD t1, t1, C, a1 + daddiu I, I, -1 + NMSUB t2, t2, S, a1 + + ST t1, 0 * SIZE(X) + ST t2, 0 * SIZE(Y) + + dadd X, X, INCX + bgtz I, .L26 + dadd Y, Y, INCY + .align 3 + +.L999: + j $31 + NOP + + EPILOGUE diff --git a/kernel/mips64/scal.S b/kernel/mips64/scal.S new file mode 100644 index 0000000000..f544914d5a --- /dev/null +++ b/kernel/mips64/scal.S @@ -0,0 +1,412 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 + +#define X $8 +#define INCX $9 + +#define I $2 +#define TEMP $3 + +#define XX $5 + +#define ALPHA $f15 + +#define a1 $f0 +#define a2 $f1 +#define a3 $f2 +#define a4 $f3 +#define a5 $f4 +#define a6 $f5 +#define a7 $f6 +#define a8 $f7 + +#define t1 $f8 +#define t2 $f9 +#define t3 $f10 +#define t4 $f11 + + PROLOGUE + + li TEMP, SIZE + MTC $0, a1 + + blez N, .L999 + dsll INCX, INCX, BASE_SHIFT + + CMPEQ $fcc0, ALPHA, a1 + NOP + + bc1f $fcc0, .L50 + NOP + + bne INCX, TEMP, .L20 + dsra I, N, 3 + + blez I, .L15 + NOP + .align 3 + +.L12: + ST a1, 0 * SIZE(X) + ST a1, 1 * SIZE(X) + ST a1, 2 * SIZE(X) + ST a1, 3 * SIZE(X) + ST a1, 4 * SIZE(X) + ST a1, 5 * SIZE(X) + ST a1, 6 * SIZE(X) + ST a1, 7 * SIZE(X) + addiu I, I, -1 + + bgtz I, .L12 + daddiu X, X, 8 * SIZE + .align 3 + +.L15: + andi I, N, 7 + + blez I, .L999 + NOP + .align 3 + +.L16: + ST a1, 0 * SIZE(X) + daddiu I, I, -1 + + bgtz I, .L16 + daddiu X, X, SIZE + + j $31 + NOP + .align 3 + +.L20: + dsra I, N, 3 + blez I, .L25 + NOP + .align 3 + +.L22: + ST a1, 0 * SIZE(X) + daddu X, X, INCX + ST a1, 0 * SIZE(X) + daddu X, X, INCX + ST a1, 0 * SIZE(X) + daddu X, X, INCX + ST a1, 0 * SIZE(X) + daddu X, X, INCX + + ST a1, 0 * SIZE(X) + daddu X, X, INCX + ST a1, 0 * SIZE(X) + daddu X, X, INCX + ST a1, 0 * SIZE(X) + daddu X, X, INCX + ST a1, 0 * SIZE(X) + daddiu I, I, -1 + + bgtz I, .L22 + daddu X, X, INCX + .align 3 + +.L25: + andi I, N, 7 + + blez I, .L999 + NOP + .align 3 + +.L26: + daddiu I, I, -1 + ST a1, 0 * SIZE(X) + + bgtz I, .L26 + daddu X, X, INCX + + j $31 + NOP + .align 3 + +.L50: + bne INCX, TEMP, .L60 + dsra I, N, 3 + + blez I, .L55 + daddiu I, I, -1 + + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + LD a3, 2 * SIZE(X) + LD a4, 3 * SIZE(X) + LD a5, 4 * SIZE(X) + LD a6, 5 * SIZE(X) + LD a7, 6 * SIZE(X) + LD a8, 7 * SIZE(X) + + blez I, .L53 + NOP + .align 3 + +.L52: + MUL t1, ALPHA, a1 + LD a1, 8 * SIZE(X) + MUL t2, ALPHA, a2 + LD a2, 9 * SIZE(X) + + MUL t3, ALPHA, a3 + LD a3, 10 * SIZE(X) + MUL t4, ALPHA, a4 + LD a4, 11 * SIZE(X) + + ST t1, 0 * SIZE(X) + MUL t1, ALPHA, a5 + + LD a5, 12 * SIZE(X) + + ST t2, 1 * SIZE(X) + MUL t2, ALPHA, a6 + + LD a6, 13 * SIZE(X) + + ST t3, 2 * SIZE(X) + MUL t3, ALPHA, a7 + + LD a7, 14 * SIZE(X) + + ST t4, 3 * SIZE(X) + MUL t4, ALPHA, a8 + + LD a8, 15 * SIZE(X) + daddiu I, I, -1 + + ST t1, 4 * SIZE(X) + ST t2, 5 * SIZE(X) + ST t3, 6 * SIZE(X) + ST t4, 7 * SIZE(X) + + bgtz I, .L52 + daddiu X, X, 8 * SIZE + .align 3 + +.L53: + MUL t1, ALPHA, a1 + MUL t2, ALPHA, a2 + MUL t3, ALPHA, a3 + MUL t4, ALPHA, a4 + + ST t1, 0 * SIZE(X) + MUL t1, ALPHA, a5 + ST t2, 1 * SIZE(X) + MUL t2, ALPHA, a6 + + ST t3, 2 * SIZE(X) + MUL t3, ALPHA, a7 + ST t4, 3 * SIZE(X) + MUL t4, ALPHA, a8 + + ST t1, 4 * SIZE(X) + ST t2, 5 * SIZE(X) + ST t3, 6 * SIZE(X) + ST t4, 7 * SIZE(X) + + daddiu X, X, 8 * SIZE + .align 3 + +.L55: + andi I, N, 7 + + blez I, .L999 + NOP + .align 3 + +.L56: + LD a1, 0 * SIZE(X) + + MUL t1, ALPHA, a1 + + daddiu X, X, SIZE + daddiu I, I, -1 + + bgtz I, .L56 + ST t1, -1 * SIZE(X) + + j $31 + NOP + .align 3 + +.L60: + dsra I, N, 3 + move XX, X + + blez I, .L65 + daddiu I, I, -1 + + LD a1, 0 * SIZE(X) + daddu X, X, INCX + LD a2, 0 * SIZE(X) + daddu X, X, INCX + LD a3, 0 * SIZE(X) + daddu X, X, INCX + LD a4, 0 * SIZE(X) + daddu X, X, INCX + LD a5, 0 * SIZE(X) + daddu X, X, INCX + LD a6, 0 * SIZE(X) + daddu X, X, INCX + LD a7, 0 * SIZE(X) + daddu X, X, INCX + LD a8, 0 * SIZE(X) + daddu X, 
X, INCX + + blez I, .L63 + NOP + .align 3 + +.L62: + MUL t1, ALPHA, a1 + LD a1, 0 * SIZE(X) + daddu X, X, INCX + + MUL t2, ALPHA, a2 + LD a2, 0 * SIZE(X) + daddu X, X, INCX + + MUL t3, ALPHA, a3 + LD a3, 0 * SIZE(X) + daddu X, X, INCX + + MUL t4, ALPHA, a4 + LD a4, 0 * SIZE(X) + daddu X, X, INCX + + ST t1, 0 * SIZE(XX) + daddu XX, XX, INCX + ST t2, 0 * SIZE(XX) + daddu XX, XX, INCX + ST t3, 0 * SIZE(XX) + daddu XX, XX, INCX + ST t4, 0 * SIZE(XX) + daddu XX, XX, INCX + + MUL t1, ALPHA, a5 + LD a5, 0 * SIZE(X) + daddu X, X, INCX + + MUL t2, ALPHA, a6 + LD a6, 0 * SIZE(X) + daddu X, X, INCX + + MUL t3, ALPHA, a7 + LD a7, 0 * SIZE(X) + daddu X, X, INCX + + MUL t4, ALPHA, a8 + LD a8, 0 * SIZE(X) + daddu X, X, INCX + + ST t1, 0 * SIZE(XX) + daddu XX, XX, INCX + ST t2, 0 * SIZE(XX) + daddu XX, XX, INCX + ST t3, 0 * SIZE(XX) + daddu XX, XX, INCX + ST t4, 0 * SIZE(XX) + daddiu I, I, -1 + + bgtz I, .L62 + daddu XX, XX, INCX + .align 3 + +.L63: + MUL t1, ALPHA, a1 + MUL t2, ALPHA, a2 + MUL t3, ALPHA, a3 + MUL t4, ALPHA, a4 + + ST t1, 0 * SIZE(XX) + daddu XX, XX, INCX + ST t2, 0 * SIZE(XX) + daddu XX, XX, INCX + ST t3, 0 * SIZE(XX) + daddu XX, XX, INCX + ST t4, 0 * SIZE(XX) + daddu XX, XX, INCX + + MUL t1, ALPHA, a5 + MUL t2, ALPHA, a6 + MUL t3, ALPHA, a7 + MUL t4, ALPHA, a8 + + ST t1, 0 * SIZE(XX) + daddu XX, XX, INCX + ST t2, 0 * SIZE(XX) + daddu XX, XX, INCX + ST t3, 0 * SIZE(XX) + daddu XX, XX, INCX + ST t4, 0 * SIZE(XX) + daddu XX, XX, INCX + .align 3 + +.L65: + andi I, N, 7 + + blez I, .L999 + NOP + .align 3 + +.L66: + LD a1, 0 * SIZE(X) + + MUL t1, ALPHA, a1 + + daddiu I, I, -1 + ST t1, 0 * SIZE(X) + + bgtz I, .L66 + daddu X, X, INCX + .align 3 + +.L999: + j $31 + NOP + + EPILOGUE diff --git a/kernel/mips64/snrm2.S b/kernel/mips64/snrm2.S new file mode 100644 index 0000000000..04a48bdaeb --- /dev/null +++ b/kernel/mips64/snrm2.S @@ -0,0 +1,337 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 +#define X $5 +#define INCX $6 + +#define I $2 +#define TEMP $3 + +#define a1 $f6 +#define a2 $f7 +#define a3 $f8 +#define a4 $f9 +#define a5 $f10 +#define a6 $f11 +#define a7 $f12 +#define a8 $f13 + +#define s1 $f0 +#define s2 $f1 + +#define t1 $f2 +#define t2 $f3 +#define t3 $f4 +#define t4 $f5 + + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + dmtc1 $0, s1 + li TEMP, SIZE + + blez N, .L999 + mov.d s2, s1 + + blez INCX, .L999 + dsll INCX, INCX, BASE_SHIFT + + bne INCX, TEMP, .L20 + dsra I, N, 3 + + blez I, .L15 + NOP + + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + LD a3, 2 * SIZE(X) + LD a4, 3 * SIZE(X) + + LD a5, 4 * SIZE(X) + daddiu I, I, -1 + + cvt.d.s t1, a1 + LD a6, 5 * SIZE(X) + cvt.d.s t2, a2 + LD a7, 6 * SIZE(X) + cvt.d.s t3, a3 + LD a8, 7 * SIZE(X) + + blez I, .L13 + cvt.d.s t4, a4 + .align 3 + +.L12: + madd.d s1, s1, t1, t1 + LD a1, 8 * SIZE(X) + + cvt.d.s t1, a5 + NOP + + madd.d s2, s2, t2, t2 + LD a2, 9 * SIZE(X) + + cvt.d.s t2, a6 + NOP + + madd.d s1, s1, t3, t3 + LD a3, 10 * SIZE(X) + + cvt.d.s t3, a7 + NOP + + madd.d s2, s2, t4, t4 + LD a4, 11 * SIZE(X) + + cvt.d.s t4, a8 + NOP + + madd.d s1, s1, t1, t1 + LD a5, 12 * SIZE(X) + + cvt.d.s t1, a1 + NOP + + madd.d s2, s2, t2, t2 + LD a6, 13 * SIZE(X) + + cvt.d.s t2, a2 + daddiu I, I, -1 + + madd.d s1, s1, t3, t3 + LD a7, 14 * SIZE(X) + + cvt.d.s t3, a3 + daddiu X, X, 8 * SIZE + + madd.d s2, s2, t4, t4 + LD a8, 7 * SIZE(X) + + bgtz I, .L12 + cvt.d.s t4, a4 + .align 3 + +.L13: + madd.d s1, s1, t1, t1 + cvt.d.s t1, a5 + + madd.d s2, s2, t2, t2 + cvt.d.s t2, a6 + + madd.d s1, s1, t3, t3 + cvt.d.s t3, a7 + + madd.d s2, s2, t4, t4 + cvt.d.s t4, a8 + + madd.d s1, s1, t1, t1 + madd.d s2, s2, t2, t2 + madd.d s1, s1, t3, t3 + madd.d s2, s2, t4, t4 + + daddiu X, X, 8 * SIZE + .align 3 + +.L15: + andi I, N, 7 + + blez I, .L999 + NOP + .align 3 + +.L16: + LD a1, 0 * SIZE(X) + daddiu I, I, -1 + + cvt.d.s t1, a1 + + madd.d s1, s1, t1, t1 + + bgtz I, .L16 + daddiu X, X, SIZE + + j .L999 + NOP + .align 3 + +.L20: + blez I, .L25 + NOP + + LD a1, 0 * SIZE(X) + daddu X, X, INCX + + LD a2, 0 * SIZE(X) + daddu X, X, INCX + + LD a3, 0 * SIZE(X) + daddu X, X, INCX + + LD a4, 0 * SIZE(X) + daddu X, X, INCX + + LD a5, 0 * SIZE(X) + daddu X, X, INCX + + LD a6, 0 * SIZE(X) + daddu X, X, INCX + + LD a7, 0 * SIZE(X) + daddu X, X, INCX + + LD a8, 0 * SIZE(X) + daddiu I, I, -1 + + cvt.d.s t1, a1 + cvt.d.s t2, a2 + cvt.d.s t3, a3 + cvt.d.s t4, a4 + + blez I, .L24 + daddu X, X, INCX + .align 3 + +.L23: + madd.d s1, s1, t1, t1 + LD a1, 0 * SIZE(X) + + cvt.d.s t1, a5 + daddu X, X, INCX + + madd.d s2, s2, t2, t2 + LD a2, 0 * SIZE(X) + + cvt.d.s t2, a6 + daddu X, X, INCX + + madd.d s1, s1, t3, t3 + LD a3, 0 * SIZE(X) + + cvt.d.s t3, a7 + daddu X, X, INCX + + madd.d s2, s2, t4, t4 + LD a4, 0 * SIZE(X) + + cvt.d.s t4, a8 + daddu X, X, INCX + + madd.d s1, s1, t1, t1 + LD a5, 0 * SIZE(X) + + cvt.d.s t1, a1 + daddu X, X, INCX + + madd.d s2, s2, t2, t2 + LD a6, 0 * SIZE(X) + + cvt.d.s t2, a2 + daddu X, X, INCX + + madd.d s1, s1, t3, t3 + LD a7, 0 * SIZE(X) + + cvt.d.s t3, a3 + daddu X, X, INCX + + madd.d s2, s2, t4, t4 + LD a8, 0 * SIZE(X) + + 
cvt.d.s t4, a4 + daddiu I, I, -1 + + bgtz I, .L23 + daddu X, X, INCX + .align 3 + +.L24: + madd.d s1, s1, t1, t1 + cvt.d.s t1, a5 + + madd.d s2, s2, t2, t2 + cvt.d.s t2, a6 + + madd.d s1, s1, t3, t3 + cvt.d.s t3, a7 + + madd.d s2, s2, t4, t4 + cvt.d.s t4, a8 + + madd.d s1, s1, t1, t1 + madd.d s2, s2, t2, t2 + madd.d s1, s1, t3, t3 + madd.d s2, s2, t4, t4 + .align 3 + +.L25: + andi I, N, 7 + + blez I, .L999 + NOP + .align 3 + +.L26: + LD a1, 0 * SIZE(X) + daddiu I, I, -1 + + cvt.d.s t1, a1 + + daddu X, X, INCX + + bgtz I, .L26 + madd.d s1, s1, t1, t1 + .align 3 + +.L999: + add.d s1, s1, s2 + + sqrt.d s1, s1 + + j $31 + cvt.s.d s1, s1 + + EPILOGUE diff --git a/kernel/mips64/swap.S b/kernel/mips64/swap.S new file mode 100644 index 0000000000..d54abd7dfd --- /dev/null +++ b/kernel/mips64/swap.S @@ -0,0 +1,392 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 + +#define X $8 +#define INCX $9 +#define Y $10 +#define INCY $11 + +#define I $2 +#define TEMP $3 + +#define XX $5 +#define YY $6 + +#define a1 $f0 +#define a2 $f1 +#define a3 $f2 +#define a4 $f3 +#define a5 $f4 +#define a6 $f5 +#define a7 $f6 +#define a8 $f7 +#define b1 $f8 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f15 + + PROLOGUE + + li TEMP, SIZE + NOP + + blez N, .L999 + dsll INCX, INCX, BASE_SHIFT + + bne INCX, TEMP, .L20 + dsll INCY, INCY, BASE_SHIFT + + bne INCY, TEMP, .L20 + dsra I, N, 3 + + blez I, .L15 + daddiu I, I, -1 + + LD a1, 0 * SIZE(X) + LD b1, 0 * SIZE(Y) + LD a2, 1 * SIZE(X) + LD b2, 1 * SIZE(Y) + LD a3, 2 * SIZE(X) + LD b3, 2 * SIZE(Y) + LD a4, 3 * SIZE(X) + LD b4, 3 * SIZE(Y) + LD a5, 4 * SIZE(X) + LD b5, 4 * SIZE(Y) + LD a6, 5 * SIZE(X) + LD b6, 5 * SIZE(Y) + LD a7, 6 * SIZE(X) + LD b7, 6 * SIZE(Y) + LD a8, 7 * SIZE(X) + LD b8, 7 * SIZE(Y) + + blez I, .L13 + NOP + .align 3 + +.L12: + ST a1, 0 * SIZE(Y) + LD a1, 8 * SIZE(X) + ST b1, 0 * SIZE(X) + LD b1, 8 * SIZE(Y) + + ST a2, 1 * SIZE(Y) + LD a2, 9 * SIZE(X) + ST b2, 1 * SIZE(X) + LD b2, 9 * SIZE(Y) + + ST a3, 2 * SIZE(Y) + LD a3, 10 * SIZE(X) + ST b3, 2 * SIZE(X) + LD b3, 10 * SIZE(Y) + + ST a4, 3 * SIZE(Y) + LD a4, 11 * SIZE(X) + ST b4, 3 * SIZE(X) + LD b4, 11 * SIZE(Y) + + ST a5, 4 * SIZE(Y) + LD a5, 12 * SIZE(X) + ST b5, 4 * SIZE(X) + LD b5, 12 * SIZE(Y) + + ST a6, 5 * SIZE(Y) + LD a6, 13 * SIZE(X) + ST b6, 5 * SIZE(X) + LD b6, 13 * SIZE(Y) + + ST a7, 6 * SIZE(Y) + LD a7, 14 * SIZE(X) + ST b7, 6 * SIZE(X) + LD b7, 14 * SIZE(Y) + + ST a8, 7 * SIZE(Y) + LD a8, 15 * SIZE(X) + ST b8, 7 * SIZE(X) + LD b8, 15 * SIZE(Y) + + daddiu I, I, -1 + daddiu X, X, 8 * SIZE + + bgtz I, .L12 + daddiu Y, Y, 8 * SIZE + .align 3 + +.L13: + ST a1, 0 * SIZE(Y) + ST b1, 0 * SIZE(X) + ST a2, 1 * SIZE(Y) + ST b2, 1 * SIZE(X) + ST a3, 2 * SIZE(Y) + ST b3, 2 * SIZE(X) + ST a4, 3 * SIZE(Y) + ST b4, 3 * SIZE(X) + ST a5, 4 * SIZE(Y) + ST b5, 4 * SIZE(X) + ST a6, 5 * SIZE(Y) + ST b6, 5 * SIZE(X) + ST a7, 6 * SIZE(Y) + ST b7, 6 * SIZE(X) + ST a8, 7 * SIZE(Y) + ST b8, 7 * SIZE(X) + + daddiu X, X, 8 * SIZE + daddiu Y, Y, 8 * SIZE + .align 3 + +.L15: + andi I, N, 7 + + blez I, .L999 + NOP + .align 3 + +.L16: + LD a1, 0 * SIZE(X) + LD b1, 0 * SIZE(Y) + + daddiu X, X, SIZE + daddiu I, I, -1 + daddiu Y, Y, SIZE + + ST b1, -1 * SIZE(X) + bgtz I, .L16 + ST a1, -1 * SIZE(Y) + + j .L999 + NOP + .align 3 + +.L20: + dsra I, N, 3 + move XX, X + move YY, Y + + blez I, .L25 + daddiu I, I, -1 + + LD a1, 0 * SIZE(X) + daddu X, X, INCX + LD b1, 0 * SIZE(Y) + daddu Y, Y, INCY + LD a2, 0 * SIZE(X) + daddu X, X, INCX + LD b2, 0 * SIZE(Y) + daddu Y, Y, INCY + LD a3, 0 * SIZE(X) + daddu X, X, INCX + LD b3, 0 * SIZE(Y) + daddu Y, Y, INCY + LD a4, 0 * SIZE(X) + daddu X, X, INCX + LD b4, 0 * SIZE(Y) + daddu Y, Y, INCY + LD a5, 0 * SIZE(X) + daddu X, X, INCX + LD b5, 0 * SIZE(Y) + daddu Y, Y, INCY + LD a6, 0 * SIZE(X) + daddu X, X, INCX + LD b6, 0 * SIZE(Y) + daddu Y, Y, INCY + LD a7, 0 * SIZE(X) + daddu X, X, INCX + LD b7, 0 * SIZE(Y) + daddu Y, Y, INCY + LD a8, 0 * SIZE(X) + daddu X, X, INCX + LD b8, 0 * SIZE(Y) + daddu Y, Y, INCY + + blez I, .L23 + NOP + .align 3 + +.L22: + ST a1, 0 * SIZE(YY) + daddu YY, YY, INCY + LD a1, 0 * SIZE(X) + daddu X, X, INCX + + ST b1, 0 * SIZE(XX) + daddu XX, XX, INCX + LD b1, 0 * SIZE(Y) + daddu Y, Y, INCY + + ST a2, 0 * SIZE(YY) + daddu 
YY, YY, INCY + LD a2, 0 * SIZE(X) + daddu X, X, INCX + + ST b2, 0 * SIZE(XX) + daddu XX, XX, INCX + LD b2, 0 * SIZE(Y) + daddu Y, Y, INCY + + ST a3, 0 * SIZE(YY) + daddu YY, YY, INCY + LD a3, 0 * SIZE(X) + daddu X, X, INCX + + ST b3, 0 * SIZE(XX) + daddu XX, XX, INCX + LD b3, 0 * SIZE(Y) + daddu Y, Y, INCY + + ST a4, 0 * SIZE(YY) + daddu YY, YY, INCY + LD a4, 0 * SIZE(X) + daddu X, X, INCX + + ST b4, 0 * SIZE(XX) + daddu XX, XX, INCX + LD b4, 0 * SIZE(Y) + daddu Y, Y, INCY + + ST a5, 0 * SIZE(YY) + daddu YY, YY, INCY + LD a5, 0 * SIZE(X) + daddu X, X, INCX + + ST b5, 0 * SIZE(XX) + daddu XX, XX, INCX + LD b5, 0 * SIZE(Y) + daddu Y, Y, INCY + + ST a6, 0 * SIZE(YY) + daddu YY, YY, INCY + LD a6, 0 * SIZE(X) + daddu X, X, INCX + + ST b6, 0 * SIZE(XX) + daddu XX, XX, INCX + LD b6, 0 * SIZE(Y) + daddu Y, Y, INCY + + ST a7, 0 * SIZE(YY) + daddu YY, YY, INCY + LD a7, 0 * SIZE(X) + daddu X, X, INCX + + ST b7, 0 * SIZE(XX) + daddu XX, XX, INCX + LD b7, 0 * SIZE(Y) + daddu Y, Y, INCY + + ST a8, 0 * SIZE(YY) + daddu YY, YY, INCY + LD a8, 0 * SIZE(X) + daddu X, X, INCX + + ST b8, 0 * SIZE(XX) + daddu XX, XX, INCX + LD b8, 0 * SIZE(Y) + daddiu I, I, -1 + + bgtz I, .L22 + daddu Y, Y, INCY + .align 3 + +.L23: + ST a1, 0 * SIZE(YY) + daddu YY, YY, INCY + ST b1, 0 * SIZE(XX) + daddu XX, XX, INCX + ST a2, 0 * SIZE(YY) + daddu YY, YY, INCY + ST b2, 0 * SIZE(XX) + daddu XX, XX, INCX + ST a3, 0 * SIZE(YY) + daddu YY, YY, INCY + ST b3, 0 * SIZE(XX) + daddu XX, XX, INCX + ST a4, 0 * SIZE(YY) + daddu YY, YY, INCY + ST b4, 0 * SIZE(XX) + daddu XX, XX, INCX + ST a5, 0 * SIZE(YY) + daddu YY, YY, INCY + ST b5, 0 * SIZE(XX) + daddu XX, XX, INCX + ST a6, 0 * SIZE(YY) + daddu YY, YY, INCY + ST b6, 0 * SIZE(XX) + daddu XX, XX, INCX + ST a7, 0 * SIZE(YY) + daddu YY, YY, INCY + ST b7, 0 * SIZE(XX) + daddu XX, XX, INCX + ST a8, 0 * SIZE(YY) + daddu YY, YY, INCY + ST b8, 0 * SIZE(XX) + daddu XX, XX, INCX + .align 3 + +.L25: + andi I, N, 7 + + blez I, .L999 + NOP + .align 3 + +.L26: + LD a1, 0 * SIZE(X) + LD b1, 0 * SIZE(Y) + + daddiu I, I, -1 + ST a1, 0 * SIZE(Y) + ST b1, 0 * SIZE(X) + + daddu X, X, INCX + bgtz I, .L26 + daddu Y, Y, INCY + .align 3 + +.L999: + j $31 + NOP + + EPILOGUE diff --git a/kernel/mips64/symv_L.S b/kernel/mips64/symv_L.S new file mode 100644 index 0000000000..9a54eb7899 --- /dev/null +++ b/kernel/mips64/symv_L.S @@ -0,0 +1,658 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M $4 +#define A $6 +#define LDA $7 +#define X $8 +#define INCX $9 +#define Y $10 +#define INCY $11 +#define BUFFER $5 + +#define XX $12 +#define YY $13 + +#define I $14 +#define IS $15 + +#define AO1 $16 +#define AO2 $17 + +#define Y1 $18 +#define TEMP $19 + +#define II INCX + +#define ALPHA $f13 + +#define a1 $f0 +#define a2 $f1 +#define a3 $f2 +#define a4 $f3 +#define a5 $f4 +#define a6 $f5 +#define a7 $f6 +#define a8 $f7 + +#define alpha1 $f8 +#define alpha2 $f9 + +#define x1 $f10 +#define x2 $f11 +#define x3 $f12 +#define x4 $f14 + +#define xsum1 $f15 +#define xsum2 $f16 + +#define ysum1 $f17 +#define ysum2 $f18 +#define ysum3 $f19 +#define ysum4 $f20 + + + PROLOGUE + + LDARG BUFFER, 0($sp) + daddiu $sp, $sp, -32 + + SDARG $16, 0($sp) + dsll LDA, LDA, BASE_SHIFT + SDARG $17, 8($sp) + dsll INCX, INCX, BASE_SHIFT + SDARG $18, 16($sp) + dsll INCY, INCY, BASE_SHIFT + SDARG $19, 24($sp) + nop + + blez M, .L999 + li IS, SIZE + + beq IS, INCX, .L05 + move Y1, Y + + dsra I, M, 2 + move XX, X + + blez I, .L02 + move X, BUFFER + .align 3 + +.L01: + LD a1, 0 * SIZE(XX) + daddu XX, XX, INCX + LD a2, 0 * SIZE(XX) + daddu XX, XX, INCX + LD a3, 0 * SIZE(XX) + daddu XX, XX, INCX + LD a4, 0 * SIZE(XX) + daddu XX, XX, INCX + + ST a1, 0 * SIZE(BUFFER) + ST a2, 1 * SIZE(BUFFER) + ST a3, 2 * SIZE(BUFFER) + ST a4, 3 * SIZE(BUFFER) + daddiu I, I, -1 + + bgtz I, .L01 + daddiu BUFFER, BUFFER, 4 * SIZE + .align 3 + +.L02: + andi I, M, 3 + blez I, .L05 + NOP + .align 3 + +.L03: + LD a1, 0 * SIZE(XX) + daddu XX, XX, INCX + + ST a1, 0 * SIZE(BUFFER) + daddiu I, I, -1 + + bgtz I, .L03 + daddiu BUFFER, BUFFER, 1 * SIZE + .align 3 + +.L05: + beq IS, INCY, .L10 + daddiu BUFFER, BUFFER, 255 + + li TEMP, -256 + and BUFFER, BUFFER, TEMP + + dsra I, M, 2 + move Y1, BUFFER + + blez I, .L07 + move YY, Y + .align 3 + +.L06: + LD a1, 0 * SIZE(YY) + daddu YY, YY, INCY + LD a2, 0 * SIZE(YY) + daddu YY, YY, INCY + LD a3, 0 * SIZE(YY) + daddu YY, YY, INCY + LD a4, 0 * SIZE(YY) + daddu YY, YY, INCY + + ST a1, 0 * SIZE(BUFFER) + ST a2, 1 * SIZE(BUFFER) + ST a3, 2 * SIZE(BUFFER) + ST a4, 3 * SIZE(BUFFER) + daddiu I, I, -1 + + bgtz I, .L06 + daddiu BUFFER, BUFFER, 4 * SIZE + .align 3 + +.L07: + andi I, M, 3 + blez I, .L10 + NOP + .align 3 + +.L08: + LD a1, 0 * SIZE(YY) + daddu YY, YY, INCY + + ST a1, 0 * SIZE(BUFFER) + daddiu I, I, -1 + + bgtz I, .L08 + daddiu BUFFER, BUFFER, 1 * SIZE + .align 3 + +.L10: + slti TEMP, M, 2 + nop + + bgtz TEMP, .L20 + li IS, 0 + .align 3 + +.L11: + dsll TEMP, IS, BASE_SHIFT + nop + + daddu XX, X, TEMP + daddu YY, Y1, TEMP + + LD alpha1, 0 
* SIZE(XX) + move AO1, A + LD alpha2, 1 * SIZE(XX) + daddiu XX, XX, 2 * SIZE + + LD a1, 0 * SIZE(AO1) + daddu AO2, A, LDA + LD a2, 1 * SIZE(AO1) + daddiu AO1, AO1, 2 * SIZE + + LD a3, 0 * SIZE(AO2) + daddu A, AO2, LDA + LD a4, 1 * SIZE(AO2) + daddiu AO2, AO2, 2 * SIZE + + MUL xsum1, alpha1, a1 + daddiu A, A, 2 * SIZE + MUL xsum2, alpha1, a2 + dsubu II, M, IS + + MADD xsum1, xsum1, alpha2, a2 + MADD xsum2, xsum2, alpha2, a4 + daddiu II, II, - 2 + + MUL alpha1, ALPHA, alpha1 + daddiu YY, YY, 2 * SIZE + MUL alpha2, ALPHA, alpha2 + dsra I, II, 3 + + blez I, .L15 + daddiu I, I, -1 + + LD x1, 0 * SIZE(XX) + LD x2, 1 * SIZE(XX) + LD x3, 2 * SIZE(XX) + + LD a1, 0 * SIZE(AO1) + LD a2, 1 * SIZE(AO1) + LD a5, 2 * SIZE(AO1) + LD a6, 3 * SIZE(AO1) + + LD a3, 0 * SIZE(AO2) + LD a4, 1 * SIZE(AO2) + LD a7, 2 * SIZE(AO2) + LD a8, 3 * SIZE(AO2) + + LD ysum1, 0 * SIZE(YY) + LD ysum2, 1 * SIZE(YY) + + blez I, .L13 + LD ysum3, 2 * SIZE(YY) + .align 3 + +.L12: + MADD ysum1, ysum1, alpha1, a1 + LD ysum4, 3 * SIZE(YY) + MADD ysum2, ysum2, alpha1, a2 + LD x4, 3 * SIZE(XX) + MADD xsum1, xsum1, x1, a1 + LD a1, 4 * SIZE(AO1) + MADD xsum2, xsum2, x1, a3 + LD x1, 4 * SIZE(XX) + + MADD ysum1, ysum1, alpha2, a3 + LD a3, 4 * SIZE(AO2) + MADD ysum2, ysum2, alpha2, a4 + daddiu I, I, -1 + MADD xsum1, xsum1, x2, a2 + LD a2, 5 * SIZE(AO1) + MADD xsum2, xsum2, x2, a4 + LD a4, 5 * SIZE(AO2) + + ST ysum1, 0 * SIZE(YY) + LD ysum1, 4 * SIZE(YY) + ST ysum2, 1 * SIZE(YY) + LD ysum2, 5 * SIZE(YY) + + MADD ysum3, ysum3, alpha1, a5 + nop + MADD ysum4, ysum4, alpha1, a6 + LD x2, 5 * SIZE(XX) + MADD xsum1, xsum1, x3, a5 + LD a5, 6 * SIZE(AO1) + MADD xsum2, xsum2, x3, a7 + LD x3, 6 * SIZE(XX) + + MADD ysum3, ysum3, alpha2, a7 + LD a7, 6 * SIZE(AO2) + MADD ysum4, ysum4, alpha2, a8 + daddiu XX, XX, 8 * SIZE + MADD xsum1, xsum1, x4, a6 + LD a6, 7 * SIZE(AO1) + MADD xsum2, xsum2, x4, a8 + LD a8, 7 * SIZE(AO2) + + ST ysum3, 2 * SIZE(YY) + LD ysum3, 6 * SIZE(YY) + ST ysum4, 3 * SIZE(YY) + LD ysum4, 7 * SIZE(YY) + + MADD ysum1, ysum1, alpha1, a1 + daddiu AO2, AO2, 8 * SIZE + MADD ysum2, ysum2, alpha1, a2 + LD x4,-1 * SIZE(XX) + MADD xsum1, xsum1, x1, a1 + LD a1, 8 * SIZE(AO1) + MADD xsum2, xsum2, x1, a3 + LD x1, 0 * SIZE(XX) + + MADD ysum1, ysum1, alpha2, a3 + LD a3, 0 * SIZE(AO2) + MADD ysum2, ysum2, alpha2, a4 + nop + MADD xsum1, xsum1, x2, a2 + LD a2, 9 * SIZE(AO1) + MADD xsum2, xsum2, x2, a4 + LD a4, 1 * SIZE(AO2) + + ST ysum1, 4 * SIZE(YY) + LD ysum1, 8 * SIZE(YY) + ST ysum2, 5 * SIZE(YY) + LD ysum2, 9 * SIZE(YY) + + MADD ysum3, ysum3, alpha1, a5 + daddiu AO1, AO1, 8 * SIZE + MADD ysum4, ysum4, alpha1, a6 + LD x2, 1 * SIZE(XX) + MADD xsum1, xsum1, x3, a5 + LD a5, 2 * SIZE(AO1) + MADD xsum2, xsum2, x3, a7 + LD x3, 2 * SIZE(XX) + + MADD ysum3, ysum3, alpha2, a7 + LD a7, 2 * SIZE(AO2) + MADD ysum4, ysum4, alpha2, a8 + daddiu YY, YY, 8 * SIZE + MADD xsum1, xsum1, x4, a6 + LD a6, 3 * SIZE(AO1) + MADD xsum2, xsum2, x4, a8 + LD a8, 3 * SIZE(AO2) + + ST ysum3,-2 * SIZE(YY) + LD ysum3, 2 * SIZE(YY) + bgtz I, .L12 + ST ysum4,-1 * SIZE(YY) + .align 3 + +.L13: + MADD ysum1, ysum1, alpha1, a1 + LD ysum4, 3 * SIZE(YY) + MADD ysum2, ysum2, alpha1, a2 + LD x4, 3 * SIZE(XX) + MADD xsum1, xsum1, x1, a1 + LD a1, 4 * SIZE(AO1) + MADD xsum2, xsum2, x1, a3 + LD x1, 4 * SIZE(XX) + + MADD ysum1, ysum1, alpha2, a3 + LD a3, 4 * SIZE(AO2) + MADD ysum2, ysum2, alpha2, a4 + MADD xsum1, xsum1, x2, a2 + LD a2, 5 * SIZE(AO1) + MADD xsum2, xsum2, x2, a4 + LD a4, 5 * SIZE(AO2) + LD x2, 5 * SIZE(XX) + + ST ysum1, 0 * SIZE(YY) + ST ysum2, 1 * SIZE(YY) + LD ysum1, 4 * SIZE(YY) 
+ LD ysum2, 5 * SIZE(YY) + + MADD ysum3, ysum3, alpha1, a5 + MADD ysum4, ysum4, alpha1, a6 + MADD xsum1, xsum1, x3, a5 + LD a5, 6 * SIZE(AO1) + MADD xsum2, xsum2, x3, a7 + LD x3, 6 * SIZE(XX) + + MADD ysum3, ysum3, alpha2, a7 + LD a7, 6 * SIZE(AO2) + MADD ysum4, ysum4, alpha2, a8 + MADD xsum1, xsum1, x4, a6 + LD a6, 7 * SIZE(AO1) + MADD xsum2, xsum2, x4, a8 + LD a8, 7 * SIZE(AO2) + LD x4, 7 * SIZE(XX) + + ST ysum3, 2 * SIZE(YY) + ST ysum4, 3 * SIZE(YY) + LD ysum3, 6 * SIZE(YY) + LD ysum4, 7 * SIZE(YY) + + MADD ysum1, ysum1, alpha1, a1 + MADD ysum2, ysum2, alpha1, a2 + MADD xsum1, xsum1, x1, a1 + MADD xsum2, xsum2, x1, a3 + + MADD ysum1, ysum1, alpha2, a3 + MADD ysum2, ysum2, alpha2, a4 + MADD xsum1, xsum1, x2, a2 + MADD xsum2, xsum2, x2, a4 + + MADD ysum3, ysum3, alpha1, a5 + MADD ysum4, ysum4, alpha1, a6 + MADD xsum1, xsum1, x3, a5 + MADD xsum2, xsum2, x3, a7 + + MADD ysum3, ysum3, alpha2, a7 + daddiu XX, XX, 8 * SIZE + MADD ysum4, ysum4, alpha2, a8 + daddiu AO1, AO1, 8 * SIZE + MADD xsum1, xsum1, x4, a6 + daddiu AO2, AO2, 8 * SIZE + MADD xsum2, xsum2, x4, a8 + + ST ysum1, 4 * SIZE(YY) + ST ysum2, 5 * SIZE(YY) + ST ysum3, 6 * SIZE(YY) + ST ysum4, 7 * SIZE(YY) + daddiu YY, YY, 8 * SIZE + .align 3 + +.L15: + andi I, II, 4 + NOP + blez I, .L16 + NOP + + LD x1, 0 * SIZE(XX) + LD x2, 1 * SIZE(XX) + LD x3, 2 * SIZE(XX) + LD x4, 3 * SIZE(XX) + daddiu XX, XX, 4 * SIZE + + LD a1, 0 * SIZE(AO1) + LD a2, 1 * SIZE(AO1) + LD a5, 2 * SIZE(AO1) + LD a6, 3 * SIZE(AO1) + daddiu AO1, AO1, 4 * SIZE + + LD a3, 0 * SIZE(AO2) + LD a4, 1 * SIZE(AO2) + LD a7, 2 * SIZE(AO2) + LD a8, 3 * SIZE(AO2) + daddiu AO2, AO2, 4 * SIZE + + LD ysum1, 0 * SIZE(YY) + LD ysum2, 1 * SIZE(YY) + LD ysum3, 2 * SIZE(YY) + LD ysum4, 3 * SIZE(YY) + + MADD ysum1, ysum1, alpha1, a1 + MADD ysum2, ysum2, alpha1, a2 + MADD xsum1, xsum1, x1, a1 + MADD xsum2, xsum2, x1, a3 + + MADD ysum1, ysum1, alpha2, a3 + MADD ysum2, ysum2, alpha2, a4 + MADD xsum1, xsum1, x2, a2 + MADD xsum2, xsum2, x2, a4 + + MADD ysum3, ysum3, alpha1, a5 + MADD ysum4, ysum4, alpha1, a6 + MADD xsum1, xsum1, x3, a5 + MADD xsum2, xsum2, x3, a7 + + MADD ysum3, ysum3, alpha2, a7 + MADD ysum4, ysum4, alpha2, a8 + MADD xsum1, xsum1, x4, a6 + MADD xsum2, xsum2, x4, a8 + + ST ysum1, 0 * SIZE(YY) + ST ysum2, 1 * SIZE(YY) + ST ysum3, 2 * SIZE(YY) + ST ysum4, 3 * SIZE(YY) + + daddiu YY, YY, 4 * SIZE + .align 3 + +.L16: + andi I, II, 2 + NOP + blez I, .L17 + NOP + + LD x1, 0 * SIZE(XX) + LD x2, 1 * SIZE(XX) + daddiu XX, XX, 2 * SIZE + + LD a1, 0 * SIZE(AO1) + LD a2, 1 * SIZE(AO1) + daddiu AO1, AO1, 2 * SIZE + + LD a3, 0 * SIZE(AO2) + LD a4, 1 * SIZE(AO2) + daddiu AO2, AO2, 2 * SIZE + + LD ysum1, 0 * SIZE(YY) + LD ysum2, 1 * SIZE(YY) + + MADD ysum1, ysum1, alpha1, a1 + MADD ysum2, ysum2, alpha1, a2 + MADD xsum1, xsum1, x1, a1 + MADD xsum2, xsum2, x1, a3 + + MADD ysum1, ysum1, alpha2, a3 + MADD ysum2, ysum2, alpha2, a4 + MADD xsum1, xsum1, x2, a2 + MADD xsum2, xsum2, x2, a4 + + ST ysum1, 0 * SIZE(YY) + ST ysum2, 1 * SIZE(YY) + daddiu YY, YY, 2 * SIZE + .align 3 + +.L17: + andi I, M, 1 + NOP + blez I, .L19 + NOP + + LD x1, 0 * SIZE(XX) + daddiu XX, XX, 1 * SIZE + LD a1, 0 * SIZE(AO1) + daddiu AO1, AO1, 1 * SIZE + + LD a3, 0 * SIZE(AO2) + daddiu AO2, AO2, 1 * SIZE + LD ysum1, 0 * SIZE(YY) + + MADD ysum1, ysum1, alpha1, a1 + MADD xsum1, xsum1, x1, a1 + MADD ysum1, ysum1, alpha2, a3 + MADD xsum2, xsum2, x1, a3 + + ST ysum1, 0 * SIZE(YY) + .align 3 + +.L19: + dsll TEMP, IS, BASE_SHIFT + daddu TEMP, Y1, TEMP + + LD ysum1, 0 * SIZE(TEMP) + LD ysum2, 1 * SIZE(TEMP) + + MADD ysum1, ysum1, 
ALPHA, xsum1 + MADD ysum2, ysum2, ALPHA, xsum2 + + ST ysum1, 0 * SIZE(TEMP) + ST ysum2, 1 * SIZE(TEMP) + + daddiu TEMP, IS, 4 + slt TEMP, M, TEMP + + beqz TEMP, .L11 + daddiu IS, IS, 2 + .align 3 + +.L20: + andi I, M, 1 + dsll TEMP, IS, BASE_SHIFT + blez I, .L900 + daddu XX, X, TEMP + + daddu YY, Y1, TEMP + + LD x1, 0 * SIZE(XX) + LD ysum1, 0 * SIZE(YY) + LD a1, 0 * SIZE(A) + + MUL xsum1, a1, x1 + + MADD ysum1, ysum1, ALPHA, xsum1 + + ST ysum1, 0 * SIZE(YY) + .align 3 + +.L900: + li IS, SIZE + + beq INCY, IS, .L999 + NOP + + dsra I, M, 2 + blez I, .L905 + NOP + .align 3 + +.L902: + LD a1, 0 * SIZE(Y1) + LD a2, 1 * SIZE(Y1) + LD a3, 2 * SIZE(Y1) + LD a4, 3 * SIZE(Y1) + + ST a1, 0 * SIZE(Y) + daddu Y, Y, INCY + ST a2, 0 * SIZE(Y) + daddu Y, Y, INCY + ST a3, 0 * SIZE(Y) + daddu Y, Y, INCY + ST a4, 0 * SIZE(Y) + daddu Y, Y, INCY + + daddiu I, I, -1 + + bgtz I, .L902 + daddiu Y1, Y1, 4 * SIZE + .align 3 + +.L905: + andi I, M, 3 + blez I, .L999 + NOP + .align 3 + +.L906: + LD a1, 0 * SIZE(Y1) + daddiu Y1, Y1, 1 * SIZE + + ST a1, 0 * SIZE(Y) + daddiu I, I, -1 + + bgtz I, .L906 + daddu Y, Y, INCY + .align 3 + +.L999: + LDARG $16, 0($sp) + LDARG $17, 8($sp) + LDARG $18, 16($sp) + LDARG $19, 24($sp) + + j $31 + daddiu $sp, $sp, 32 + + EPILOGUE diff --git a/kernel/mips64/symv_U.S b/kernel/mips64/symv_U.S new file mode 100644 index 0000000000..285e591eff --- /dev/null +++ b/kernel/mips64/symv_U.S @@ -0,0 +1,782 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M $4 +#define A $6 +#define LDA $7 +#define X $8 +#define INCX $9 +#define Y $10 +#define INCY $11 +#define BUFFER $5 + +#define XX $12 +#define YY $13 + +#define I $14 +#define IS $15 + +#define AO1 $16 +#define AO2 $17 + +#define Y1 $18 +#define TEMP $19 + +#define ALPHA $f13 + +#define a1 $f0 +#define a2 $f1 +#define a3 $f2 +#define a4 $f3 +#define a5 $f4 +#define a6 $f5 +#define a7 $f6 +#define a8 $f7 + +#define alpha1 $f8 +#define alpha2 $f9 + +#define x1 $f10 +#define x2 $f11 +#define x3 $f12 +#define x4 $f14 + +#define xsum1 $f15 +#define xsum2 $f16 + +#define ysum1 $f17 +#define ysum2 $f18 +#define ysum3 $f19 +#define ysum4 $f20 + + + PROLOGUE + + LDARG BUFFER, 0($sp) + daddiu $sp, $sp, -32 + + SDARG $16, 0($sp) + dsll LDA, LDA, BASE_SHIFT + SDARG $17, 8($sp) + dsll INCX, INCX, BASE_SHIFT + SDARG $18, 16($sp) + dsll INCY, INCY, BASE_SHIFT + SDARG $19, 24($sp) + nop + + blez M, .L999 + li IS, SIZE + + beq IS, INCX, .L05 + move Y1, Y + + dsra I, M, 2 + move XX, X + + blez I, .L02 + move X, BUFFER + .align 3 + +.L01: + LD a1, 0 * SIZE(XX) + daddu XX, XX, INCX + LD a2, 0 * SIZE(XX) + daddu XX, XX, INCX + LD a3, 0 * SIZE(XX) + daddu XX, XX, INCX + LD a4, 0 * SIZE(XX) + daddu XX, XX, INCX + + ST a1, 0 * SIZE(BUFFER) + ST a2, 1 * SIZE(BUFFER) + ST a3, 2 * SIZE(BUFFER) + ST a4, 3 * SIZE(BUFFER) + daddiu I, I, -1 + + bgtz I, .L01 + daddiu BUFFER, BUFFER, 4 * SIZE + .align 3 + +.L02: + andi I, M, 3 + blez I, .L05 + NOP + .align 3 + +.L03: + LD a1, 0 * SIZE(XX) + daddu XX, XX, INCX + + ST a1, 0 * SIZE(BUFFER) + daddiu I, I, -1 + + bgtz I, .L03 + daddiu BUFFER, BUFFER, 1 * SIZE + .align 3 + +.L05: + beq IS, INCY, .L10 + daddiu BUFFER, BUFFER, 255 + + li TEMP, -256 + and BUFFER, BUFFER, TEMP + + dsra I, M, 2 + move Y1, BUFFER + + blez I, .L07 + move YY, Y + .align 3 + +.L06: + LD a1, 0 * SIZE(YY) + daddu YY, YY, INCY + LD a2, 0 * SIZE(YY) + daddu YY, YY, INCY + LD a3, 0 * SIZE(YY) + daddu YY, YY, INCY + LD a4, 0 * SIZE(YY) + daddu YY, YY, INCY + + ST a1, 0 * SIZE(BUFFER) + ST a2, 1 * SIZE(BUFFER) + ST a3, 2 * SIZE(BUFFER) + ST a4, 3 * SIZE(BUFFER) + daddiu I, I, -1 + + bgtz I, .L06 + daddiu BUFFER, BUFFER, 4 * SIZE + .align 3 + +.L07: + andi I, M, 3 + blez I, .L10 + NOP + .align 3 + +.L08: + LD a1, 0 * SIZE(YY) + daddu YY, YY, INCY + + ST a1, 0 * SIZE(BUFFER) + daddiu I, I, -1 + + bgtz I, .L08 + daddiu BUFFER, BUFFER, 1 * SIZE + .align 3 + +.L10: + slti TEMP, M, 2 + nop + + bgtz TEMP, .L20 + li IS, 0 + .align 3 + +.L11: + dsll TEMP, IS, BASE_SHIFT + daddu TEMP, X, TEMP + + LD alpha1, 0 * SIZE(TEMP) + LD alpha2, 1 * SIZE(TEMP) + + move AO1, A + dsra I, IS, 3 + daddu AO2, A, LDA + daddu A, AO2, LDA + + MTC $0, xsum1 + MTC $0, xsum2 + + move XX, X + MUL alpha1, ALPHA, alpha1 + move YY, Y1 + MUL alpha2, ALPHA, alpha2 + + blez I, .L15 + daddiu I, I, -1 + + LD x1, 0 * SIZE(XX) + LD x2, 1 * SIZE(XX) + LD x3, 2 * SIZE(XX) + + LD a1, 0 * SIZE(AO1) + LD a2, 1 * SIZE(AO1) + LD a5, 2 * SIZE(AO1) + LD a6, 3 * SIZE(AO1) + + LD a3, 0 * SIZE(AO2) + LD a4, 1 * SIZE(AO2) + LD a7, 2 * SIZE(AO2) + LD a8, 3 * SIZE(AO2) + + LD ysum1, 0 * SIZE(YY) + LD ysum2, 1 * SIZE(YY) + + blez I, .L13 + LD ysum3, 2 * SIZE(YY) + .align 3 + +.L12: + MADD ysum1, ysum1, alpha1, a1 + LD ysum4, 3 * SIZE(YY) + MADD ysum2, ysum2, alpha1, a2 + LD x4, 3 * SIZE(XX) + MADD xsum1, xsum1, x1, a1 + LD a1, 4 * SIZE(AO1) + MADD xsum2, xsum2, x1, a3 + LD x1, 4 * SIZE(XX) + + MADD ysum1, ysum1, alpha2, a3 + LD a3, 4 * 
SIZE(AO2) + MADD ysum2, ysum2, alpha2, a4 + daddiu I, I, -1 + MADD xsum1, xsum1, x2, a2 + LD a2, 5 * SIZE(AO1) + MADD xsum2, xsum2, x2, a4 + LD a4, 5 * SIZE(AO2) + + ST ysum1, 0 * SIZE(YY) + LD ysum1, 4 * SIZE(YY) + ST ysum2, 1 * SIZE(YY) + LD ysum2, 5 * SIZE(YY) + + MADD ysum3, ysum3, alpha1, a5 + nop + MADD ysum4, ysum4, alpha1, a6 + LD x2, 5 * SIZE(XX) + MADD xsum1, xsum1, x3, a5 + LD a5, 6 * SIZE(AO1) + MADD xsum2, xsum2, x3, a7 + LD x3, 6 * SIZE(XX) + + MADD ysum3, ysum3, alpha2, a7 + LD a7, 6 * SIZE(AO2) + MADD ysum4, ysum4, alpha2, a8 + daddiu XX, XX, 8 * SIZE + MADD xsum1, xsum1, x4, a6 + LD a6, 7 * SIZE(AO1) + MADD xsum2, xsum2, x4, a8 + LD a8, 7 * SIZE(AO2) + + ST ysum3, 2 * SIZE(YY) + LD ysum3, 6 * SIZE(YY) + ST ysum4, 3 * SIZE(YY) + LD ysum4, 7 * SIZE(YY) + + MADD ysum1, ysum1, alpha1, a1 + daddiu AO2, AO2, 8 * SIZE + MADD ysum2, ysum2, alpha1, a2 + LD x4,-1 * SIZE(XX) + MADD xsum1, xsum1, x1, a1 + LD a1, 8 * SIZE(AO1) + MADD xsum2, xsum2, x1, a3 + LD x1, 0 * SIZE(XX) + + MADD ysum1, ysum1, alpha2, a3 + LD a3, 0 * SIZE(AO2) + MADD ysum2, ysum2, alpha2, a4 + nop + MADD xsum1, xsum1, x2, a2 + LD a2, 9 * SIZE(AO1) + MADD xsum2, xsum2, x2, a4 + LD a4, 1 * SIZE(AO2) + + ST ysum1, 4 * SIZE(YY) + LD ysum1, 8 * SIZE(YY) + ST ysum2, 5 * SIZE(YY) + LD ysum2, 9 * SIZE(YY) + + MADD ysum3, ysum3, alpha1, a5 + daddiu AO1, AO1, 8 * SIZE + MADD ysum4, ysum4, alpha1, a6 + LD x2, 1 * SIZE(XX) + MADD xsum1, xsum1, x3, a5 + LD a5, 2 * SIZE(AO1) + MADD xsum2, xsum2, x3, a7 + LD x3, 2 * SIZE(XX) + + MADD ysum3, ysum3, alpha2, a7 + LD a7, 2 * SIZE(AO2) + MADD ysum4, ysum4, alpha2, a8 + daddiu YY, YY, 8 * SIZE + MADD xsum1, xsum1, x4, a6 + LD a6, 3 * SIZE(AO1) + MADD xsum2, xsum2, x4, a8 + LD a8, 3 * SIZE(AO2) + + ST ysum3,-2 * SIZE(YY) + LD ysum3, 2 * SIZE(YY) + bgtz I, .L12 + ST ysum4,-1 * SIZE(YY) + .align 3 + +.L13: + MADD ysum1, ysum1, alpha1, a1 + LD ysum4, 3 * SIZE(YY) + MADD ysum2, ysum2, alpha1, a2 + LD x4, 3 * SIZE(XX) + MADD xsum1, xsum1, x1, a1 + LD a1, 4 * SIZE(AO1) + MADD xsum2, xsum2, x1, a3 + LD x1, 4 * SIZE(XX) + + MADD ysum1, ysum1, alpha2, a3 + LD a3, 4 * SIZE(AO2) + MADD ysum2, ysum2, alpha2, a4 + MADD xsum1, xsum1, x2, a2 + LD a2, 5 * SIZE(AO1) + MADD xsum2, xsum2, x2, a4 + LD a4, 5 * SIZE(AO2) + LD x2, 5 * SIZE(XX) + + ST ysum1, 0 * SIZE(YY) + ST ysum2, 1 * SIZE(YY) + LD ysum1, 4 * SIZE(YY) + LD ysum2, 5 * SIZE(YY) + + MADD ysum3, ysum3, alpha1, a5 + MADD ysum4, ysum4, alpha1, a6 + MADD xsum1, xsum1, x3, a5 + LD a5, 6 * SIZE(AO1) + MADD xsum2, xsum2, x3, a7 + LD x3, 6 * SIZE(XX) + + MADD ysum3, ysum3, alpha2, a7 + LD a7, 6 * SIZE(AO2) + MADD ysum4, ysum4, alpha2, a8 + MADD xsum1, xsum1, x4, a6 + LD a6, 7 * SIZE(AO1) + MADD xsum2, xsum2, x4, a8 + LD a8, 7 * SIZE(AO2) + LD x4, 7 * SIZE(XX) + + ST ysum3, 2 * SIZE(YY) + ST ysum4, 3 * SIZE(YY) + LD ysum3, 6 * SIZE(YY) + LD ysum4, 7 * SIZE(YY) + + MADD ysum1, ysum1, alpha1, a1 + MADD ysum2, ysum2, alpha1, a2 + MADD xsum1, xsum1, x1, a1 + MADD xsum2, xsum2, x1, a3 + + MADD ysum1, ysum1, alpha2, a3 + MADD ysum2, ysum2, alpha2, a4 + MADD xsum1, xsum1, x2, a2 + MADD xsum2, xsum2, x2, a4 + + MADD ysum3, ysum3, alpha1, a5 + MADD ysum4, ysum4, alpha1, a6 + MADD xsum1, xsum1, x3, a5 + MADD xsum2, xsum2, x3, a7 + + MADD ysum3, ysum3, alpha2, a7 + daddiu XX, XX, 8 * SIZE + MADD ysum4, ysum4, alpha2, a8 + daddiu AO1, AO1, 8 * SIZE + MADD xsum1, xsum1, x4, a6 + daddiu AO2, AO2, 8 * SIZE + MADD xsum2, xsum2, x4, a8 + + ST ysum1, 4 * SIZE(YY) + ST ysum2, 5 * SIZE(YY) + ST ysum3, 6 * SIZE(YY) + ST ysum4, 7 * SIZE(YY) + daddiu YY, YY, 8 * SIZE + 
.align 3 + +.L15: + andi I, IS, 4 + NOP + blez I, .L16 + NOP + + LD x1, 0 * SIZE(XX) + LD x2, 1 * SIZE(XX) + LD x3, 2 * SIZE(XX) + LD x4, 3 * SIZE(XX) + daddiu XX, XX, 4 * SIZE + + LD a1, 0 * SIZE(AO1) + LD a2, 1 * SIZE(AO1) + LD a5, 2 * SIZE(AO1) + LD a6, 3 * SIZE(AO1) + daddiu AO1, AO1, 4 * SIZE + + LD a3, 0 * SIZE(AO2) + LD a4, 1 * SIZE(AO2) + LD a7, 2 * SIZE(AO2) + LD a8, 3 * SIZE(AO2) + daddiu AO2, AO2, 4 * SIZE + + LD ysum1, 0 * SIZE(YY) + LD ysum2, 1 * SIZE(YY) + LD ysum3, 2 * SIZE(YY) + LD ysum4, 3 * SIZE(YY) + + MADD ysum1, ysum1, alpha1, a1 + MADD ysum2, ysum2, alpha1, a2 + MADD xsum1, xsum1, x1, a1 + MADD xsum2, xsum2, x1, a3 + + MADD ysum1, ysum1, alpha2, a3 + MADD ysum2, ysum2, alpha2, a4 + MADD xsum1, xsum1, x2, a2 + MADD xsum2, xsum2, x2, a4 + + MADD ysum3, ysum3, alpha1, a5 + MADD ysum4, ysum4, alpha1, a6 + MADD xsum1, xsum1, x3, a5 + MADD xsum2, xsum2, x3, a7 + + MADD ysum3, ysum3, alpha2, a7 + MADD ysum4, ysum4, alpha2, a8 + MADD xsum1, xsum1, x4, a6 + MADD xsum2, xsum2, x4, a8 + + ST ysum1, 0 * SIZE(YY) + ST ysum2, 1 * SIZE(YY) + ST ysum3, 2 * SIZE(YY) + ST ysum4, 3 * SIZE(YY) + + daddiu YY, YY, 4 * SIZE + .align 3 + +.L16: + andi I, IS, 2 + NOP + blez I, .L19 + NOP + + LD x1, 0 * SIZE(XX) + LD x2, 1 * SIZE(XX) + daddiu XX, XX, 2 * SIZE + + LD a1, 0 * SIZE(AO1) + LD a2, 1 * SIZE(AO1) + daddiu AO1, AO1, 2 * SIZE + + LD a3, 0 * SIZE(AO2) + LD a4, 1 * SIZE(AO2) + daddiu AO2, AO2, 2 * SIZE + + LD ysum1, 0 * SIZE(YY) + LD ysum2, 1 * SIZE(YY) + + MADD ysum1, ysum1, alpha1, a1 + MADD ysum2, ysum2, alpha1, a2 + MADD xsum1, xsum1, x1, a1 + MADD xsum2, xsum2, x1, a3 + + MADD ysum1, ysum1, alpha2, a3 + MADD ysum2, ysum2, alpha2, a4 + MADD xsum1, xsum1, x2, a2 + MADD xsum2, xsum2, x2, a4 + + ST ysum1, 0 * SIZE(YY) + ST ysum2, 1 * SIZE(YY) + .align 3 + +.L19: + dsll TEMP, IS, BASE_SHIFT + daddu TEMP, Y1, TEMP + + LD ysum1, 0 * SIZE(TEMP) + LD ysum2, 1 * SIZE(TEMP) + + LD a1, 0 * SIZE(AO1) + LD a2, 1 * SIZE(AO1) + + LD a3, 0 * SIZE(AO2) + LD a4, 1 * SIZE(AO2) + + MUL xsum1, ALPHA, xsum1 + MUL xsum2, ALPHA, xsum2 + + MADD xsum1, xsum1, alpha1, a1 + MADD xsum2, xsum2, alpha1, a3 + MADD xsum1, xsum1, alpha2, a3 + MADD xsum2, xsum2, alpha2, a4 + + ADD ysum1, ysum1, xsum1 + ADD ysum2, ysum2, xsum2 + + ST ysum1, 0 * SIZE(TEMP) + ST ysum2, 1 * SIZE(TEMP) + + daddiu TEMP, IS, 4 + slt TEMP, M, TEMP + + beqz TEMP, .L11 + daddiu IS, IS, 2 + .align 3 + +.L20: + andi TEMP, M, 1 + nop + blez TEMP, .L900 + nop + .align 3 + + dsll TEMP, IS, BASE_SHIFT + daddu TEMP, X, TEMP + + LD alpha1, 0 * SIZE(TEMP) + + move AO1, A + dsra I, IS, 2 + daddu A, AO1, LDA + + MTC $0, xsum1 + MTC $0, xsum2 + + move XX, X + MUL alpha1, ALPHA, alpha1 + move YY, Y1 + + blez I, .L25 + daddiu I, I, -1 + + LD x1, 0 * SIZE(XX) + LD x2, 1 * SIZE(XX) + LD x3, 2 * SIZE(XX) + LD x4, 3 * SIZE(XX) + + LD a1, 0 * SIZE(AO1) + LD a2, 1 * SIZE(AO1) + LD a3, 2 * SIZE(AO1) + LD a4, 3 * SIZE(AO1) + + LD ysum1, 0 * SIZE(YY) + LD ysum2, 1 * SIZE(YY) + LD ysum3, 2 * SIZE(YY) + + blez I, .L23 + LD ysum4, 3 * SIZE(YY) + .align 3 + +.L22: + MADD ysum1, ysum1, alpha1, a1 + daddiu I, I, -1 + MADD xsum1, xsum1, x1, a1 + LD a1, 4 * SIZE(AO1) + MADD ysum2, ysum2, alpha1, a2 + LD x1, 4 * SIZE(XX) + MADD xsum2, xsum2, x2, a2 + LD a2, 5 * SIZE(AO1) + + ST ysum1, 0 * SIZE(YY) + LD ysum1, 4 * SIZE(YY) + + ST ysum2, 1 * SIZE(YY) + LD ysum2, 5 * SIZE(YY) + + daddiu AO1, AO1, 4 * SIZE + nop + + MADD ysum3, ysum3, alpha1, a3 + LD x2, 5 * SIZE(XX) + MADD xsum1, xsum1, x3, a3 + LD a3, 2 * SIZE(AO1) + MADD ysum4, ysum4, alpha1, a4 + LD x3, 6 * SIZE(XX) + MADD 
xsum2, xsum2, x4, a4 + LD a4, 3 * SIZE(AO1) + + ST ysum3, 2 * SIZE(YY) + LD ysum3, 6 * SIZE(YY) + ST ysum4, 3 * SIZE(YY) + LD ysum4, 7 * SIZE(YY) + + daddiu XX, XX, 4 * SIZE + daddiu YY, YY, 4 * SIZE + + bgtz I, .L22 + LD x4, 3 * SIZE(XX) + .align 3 + +.L23: + MADD ysum1, ysum1, alpha1, a1 + daddiu AO1, AO1, 4 * SIZE + MADD xsum1, xsum1, x1, a1 + daddiu XX, XX, 4 * SIZE + MADD ysum2, ysum2, alpha1, a2 + daddiu YY, YY, 4 * SIZE + MADD xsum2, xsum2, x2, a2 + nop + + MADD ysum3, ysum3, alpha1, a3 + ST ysum1,-4 * SIZE(YY) + MADD xsum1, xsum1, x3, a3 + ST ysum2,-3 * SIZE(YY) + MADD ysum4, ysum4, alpha1, a4 + ST ysum3,-2 * SIZE(YY) + MADD xsum2, xsum2, x4, a4 + ST ysum4,-1 * SIZE(YY) + .align 3 + +.L25: + andi I, IS, 2 + NOP + blez I, .L26 + NOP + + LD x1, 0 * SIZE(XX) + LD x2, 1 * SIZE(XX) + daddiu XX, XX, 2 * SIZE + + LD a1, 0 * SIZE(AO1) + LD a2, 1 * SIZE(AO1) + daddiu AO1, AO1, 2 * SIZE + + LD ysum1, 0 * SIZE(YY) + LD ysum2, 1 * SIZE(YY) + + MADD ysum1, ysum1, alpha1, a1 + MADD xsum1, xsum1, x1, a1 + + MADD ysum2, ysum2, alpha1, a2 + MADD xsum2, xsum2, x2, a2 + + ST ysum1, 0 * SIZE(YY) + ST ysum2, 1 * SIZE(YY) + + daddiu YY, YY, 2 * SIZE + .align 3 + +.L26: + andi I, IS, 1 + NOP + blez I, .L29 + NOP + + LD x1, 0 * SIZE(XX) + daddiu XX, XX, 1 * SIZE + LD a1, 0 * SIZE(AO1) + daddiu AO1, AO1, 1* SIZE + + LD ysum1, 0 * SIZE(YY) + + MADD ysum1, ysum1, alpha1, a1 + MADD xsum1, xsum1, x1, a1 + + ST ysum1, 0 * SIZE(YY) + .align 3 + +.L29: + dsll TEMP, IS, BASE_SHIFT + daddu TEMP, Y1, TEMP + + LD ysum1, 0 * SIZE(TEMP) + + LD a1, 0 * SIZE(AO1) + + ADD xsum1, xsum1, xsum2 + + MUL xsum1, ALPHA, xsum1 + + MADD xsum1, xsum1, alpha1, a1 + + ADD ysum1, ysum1, xsum1 + + ST ysum1, 0 * SIZE(TEMP) + .align 3 + +.L900: + li IS, SIZE + + beq INCY, IS, .L999 + NOP + + dsra I, M, 2 + blez I, .L905 + NOP + .align 3 + +.L902: + LD a1, 0 * SIZE(Y1) + LD a2, 1 * SIZE(Y1) + LD a3, 2 * SIZE(Y1) + LD a4, 3 * SIZE(Y1) + + ST a1, 0 * SIZE(Y) + daddu Y, Y, INCY + ST a2, 0 * SIZE(Y) + daddu Y, Y, INCY + ST a3, 0 * SIZE(Y) + daddu Y, Y, INCY + ST a4, 0 * SIZE(Y) + daddu Y, Y, INCY + + daddiu I, I, -1 + + bgtz I, .L902 + daddiu Y1, Y1, 4 * SIZE + .align 3 + +.L905: + andi I, M, 3 + blez I, .L999 + NOP + .align 3 + +.L906: + LD a1, 0 * SIZE(Y1) + daddiu Y1, Y1, 1 * SIZE + + ST a1, 0 * SIZE(Y) + daddiu I, I, -1 + + bgtz I, .L906 + daddu Y, Y, INCY + .align 3 + +.L999: + LDARG $16, 0($sp) + LDARG $17, 8($sp) + LDARG $18, 16($sp) + LDARG $19, 24($sp) + + j $31 + daddiu $sp, $sp, 32 + + EPILOGUE diff --git a/kernel/mips64/trsm_kernel_LN.S b/kernel/mips64/trsm_kernel_LN.S new file mode 100644 index 0000000000..28e1794b58 --- /dev/null +++ b/kernel/mips64/trsm_kernel_LN.S @@ -0,0 +1,3544 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M $4 +#define N $5 +#define K $6 +#define A $8 +#define B $9 +#define C $10 +#define LDC $11 + +#define AO $12 +#define BO $13 + +#define I $2 +#define J $3 +#define L $7 + +#define CO1 $14 +#define CO2 $15 +#define CO3 $16 +#define CO4 $17 +#define CO5 $18 +#define CO6 $19 +#define CO7 $20 +#define CO8 $21 + +#define OFFSET $22 +#define KK $23 +#define TEMP $24 +#define AORIG $25 + +#define a1 $f0 +#define a2 $f1 +#define a3 $f27 +#define a4 $f28 + +#define b1 $f2 +#define b2 $f3 +#define b3 $f4 +#define b4 $f5 +#define b5 $f6 +#define b6 $f7 +#define b7 $f8 +#define b8 $f9 + +#define a5 b8 + +#define c11 $f10 +#define c12 $f11 +#define c21 $f12 +#define c22 $f13 +#define c31 $f14 +#define c32 $f16 +#define c41 $f17 +#define c42 $f18 +#define c51 $f19 +#define c52 $f20 +#define c61 $f21 +#define c62 $f22 +#define c71 $f23 +#define c72 $f24 +#define c81 $f25 +#define c82 $f26 + +#define ALPHA $f15 + + PROLOGUE + + daddiu $sp, $sp, -144 + + SDARG $16, 0($sp) + SDARG $17, 8($sp) + SDARG $18, 16($sp) + SDARG $19, 24($sp) + SDARG $20, 32($sp) + SDARG $21, 40($sp) + sdc1 $f24, 48($sp) + sdc1 $f25, 56($sp) + sdc1 $f26, 64($sp) + sdc1 $f27, 72($sp) + sdc1 $f28, 80($sp) + + SDARG $22, 88($sp) + SDARG $23, 96($sp) + SDARG $24, 104($sp) + SDARG $25, 112($sp) + +#ifndef __64BIT__ + sdc1 $f20,112($sp) + sdc1 $f21,120($sp) + sdc1 $f22,128($sp) + sdc1 $f23,136($sp) +#endif + + LDARG OFFSET, 144($sp) + + dsll LDC, LDC, BASE_SHIFT + +#ifdef LN + mult M, K + mflo TEMP + + dsll TEMP, TEMP, BASE_SHIFT + daddu A, A, TEMP + + dsll TEMP, M, BASE_SHIFT + daddu C, C, TEMP +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mult N, K + mflo TEMP + + dsll TEMP, TEMP, BASE_SHIFT + daddu B, B, TEMP + + mult N, LDC + mflo TEMP + daddu C, C, TEMP + + dsubu KK, N, OFFSET +#endif + + dsra J, N, 3 + blez J, .L30 + nop + +.L10: +#ifdef RT + dsll TEMP, K, 3 + BASE_SHIFT + dsubu B, B, TEMP + + dsll TEMP, LDC, 3 + dsubu C, C, TEMP +#endif + + move CO1, C + MTC $0, c11 + daddu CO2, C, LDC + daddu CO3, CO2, LDC + daddiu J, J, -1 + daddu CO4, CO3, LDC + MOV c21, c11 + daddu CO5, CO4, LDC + MOV c31, c11 + daddu CO6, CO5, LDC + MOV c41, c11 + daddu CO7, CO6, LDC + MOV c51, c11 + daddu CO8, CO7, LDC + +#ifdef LN + daddu KK, M, OFFSET +#endif + +#ifdef LT + move KK, OFFSET +#endif + 
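+/* Editorial annotation (assumptions flagged, not from the original source):
+   J counts 8-column panels of B/C, CO1..CO8 point at the eight output
+   columns, and KK/OFFSET track how far into the triangular factor this
+   panel reaches.  For the M-odd leftover row handled next, the
+   accumulation loop at .L22 is roughly the following C, where a, b, rhs
+   and kk are illustrative names and kk is the trip count chosen by the
+   KK/TEMP bookkeeping above:
+
+       double c[8] = {0.0};
+       for (k = 0; k < kk; k++)
+           for (j = 0; j < 8; j++)
+               c[j] += a[k] * b[8 * k + j];
+       for (j = 0; j < 8; j++)
+           c[j] = rhs[j] - c[j];
+
+   The second loop corresponds to the SUB block at .L28; after it, c[] is
+   solved against the triangular diagonal block and stored back to both
+   the packed buffer and the C columns. */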
+#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + daddu C, CO8, LDC +#endif + + andi I, M, 1 + MOV c61, c11 + blez I, .L20 + MOV c71, c11 + +#if defined(LT) || defined(RN) + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(B) + LD b2, 1 * SIZE(B) + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + LD b5, 4 * SIZE(B) + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + dsra L, KK, 2 + MOV c81, c11 + + blez L, .L25 + move BO, B +#else + +#ifdef LN + dsll TEMP, K, 0 + BASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll L, KK, 0 + BASE_SHIFT + dsll TEMP, KK, 3 + BASE_SHIFT + + daddu AO, AORIG, L + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + + dsra L, TEMP, 2 + MOV c81, c11 + + blez L, .L25 + NOP +#endif + .align 3 + +.L22: + MADD c11, c11, a1, b1 + LD b1, 16 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 5 * SIZE(BO) + MADD c31, c31, a1, b3 + LD b3, 6 * SIZE(BO) + MADD c41, c41, a1, b4 + LD b4, 7 * SIZE(BO) + + MADD c51, c51, a1, b5 + LD b5, 20 * SIZE(BO) + MADD c61, c61, a1, b2 + LD b2, 9 * SIZE(BO) + MADD c71, c71, a1, b3 + LD b3, 10 * SIZE(BO) + MADD c81, c81, a1, b4 + LD b4, 11 * SIZE(BO) + + LD a1, 4 * SIZE(AO) + daddiu L, L, -1 + + MADD c11, c11, a2, b6 + LD b6, 24 * SIZE(BO) + MADD c21, c21, a2, b2 + LD b2, 13 * SIZE(BO) + MADD c31, c31, a2, b3 + LD b3, 14 * SIZE(BO) + MADD c41, c41, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD c51, c51, a2, b7 + LD b7, 28 * SIZE(BO) + MADD c61, c61, a2, b2 + LD b2, 17 * SIZE(BO) + MADD c71, c71, a2, b3 + LD b3, 18 * SIZE(BO) + MADD c81, c81, a2, b4 + LD b4, 19 * SIZE(BO) + + LD a2, 5 * SIZE(AO) + daddiu AO, AO, 4 * SIZE + + MADD c11, c11, a3, b1 + LD b1, 32 * SIZE(BO) + MADD c21, c21, a3, b2 + LD b2, 21 * SIZE(BO) + MADD c31, c31, a3, b3 + LD b3, 22 * SIZE(BO) + MADD c41, c41, a3, b4 + LD b4, 23 * SIZE(BO) + + MADD c51, c51, a3, b5 + LD b5, 36 * SIZE(BO) + MADD c61, c61, a3, b2 + LD b2, 25 * SIZE(BO) + MADD c71, c71, a3, b3 + LD b3, 26 * SIZE(BO) + MADD c81, c81, a3, b4 + LD b4, 27 * SIZE(BO) + + LD a3, 2 * SIZE(AO) + daddiu BO, BO, 32 * SIZE + + MADD c11, c11, a4, b6 + LD b6, 8 * SIZE(BO) + MADD c21, c21, a4, b2 + LD b2, -3 * SIZE(BO) + MADD c31, c31, a4, b3 + LD b3, -2 * SIZE(BO) + MADD c41, c41, a4, b4 + LD b4, -1 * SIZE(BO) + + MADD c51, c51, a4, b7 + LD b7, 12 * SIZE(BO) + MADD c61, c61, a4, b2 + LD b2, 1 * SIZE(BO) + MADD c71, c71, a4, b3 + LD b3, 2 * SIZE(BO) + MADD c81, c81, a4, b4 + LD b4, 3 * SIZE(BO) + bgtz L, .L22 + LD a4, 3 * SIZE(AO) + .align 3 + +.L25: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L28 + NOP + .align 3 + +.L26: + MADD c11, c11, a1, b1 + LD b1, 8 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 5 * SIZE(BO) + MADD c31, c31, a1, b3 + LD b3, 6 * SIZE(BO) + MADD c41, c41, a1, b4 + LD b4, 7 * SIZE(BO) + + daddiu L, L, -1 + MOV a2, a2 + daddiu AO, AO, 1 * SIZE + daddiu BO, BO, 8 * SIZE + + MADD c51, c51, a1, b5 + LD b5, 4 * SIZE(BO) + MADD c61, c61, a1, b2 + LD b2, 1 * SIZE(BO) + MADD c71, c71, a1, b3 + LD b3, 2 * SIZE(BO) + MADD c81, c81, a1, b4 + LD a1, 0 * SIZE(AO) + + bgtz L, .L26 + LD b4, 3 * SIZE(BO) + +.L28: +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -1 +#else + daddiu TEMP, KK, -8 +#endif + + dsll L, TEMP, 0 + BASE_SHIFT + dsll TEMP, TEMP, 
3 + BASE_SHIFT + daddu AO, AORIG, L + daddu BO, B, TEMP +#endif + + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c51, b5, c51 + SUB c61, b6, c61 + SUB c71, b7, c71 + SUB c81, b8, c81 +#else + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 2 * SIZE(AO) + LD b4, 3 * SIZE(AO) + LD b5, 4 * SIZE(AO) + LD b6, 5 * SIZE(AO) + LD b7, 6 * SIZE(AO) + LD b8, 7 * SIZE(AO) + + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c51, b5, c51 + SUB c61, b6, c61 + SUB c71, b7, c71 + SUB c81, b8, c81 +#endif + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(AO) + + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + MUL c51, b1, c51 + MUL c61, b1, c61 + MUL c71, b1, c71 + MUL c81, b1, c81 +#endif + +#ifdef RN + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + MUL c11, b1, c11 + + NMSUB c21, c21, b2, c11 + NMSUB c31, c31, b3, c11 + NMSUB c41, c41, b4, c11 + NMSUB c51, c51, b5, c11 + NMSUB c61, c61, b6, c11 + NMSUB c71, c71, b7, c11 + NMSUB c81, c81, b8, c11 + + LD b2, 9 * SIZE(BO) + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + LD b5, 12 * SIZE(BO) + LD b6, 13 * SIZE(BO) + LD b7, 14 * SIZE(BO) + LD b8, 15 * SIZE(BO) + + MUL c21, b2, c21 + + NMSUB c31, c31, b3, c21 + NMSUB c41, c41, b4, c21 + NMSUB c51, c51, b5, c21 + NMSUB c61, c61, b6, c21 + NMSUB c71, c71, b7, c21 + NMSUB c81, c81, b8, c21 + + LD b3, 18 * SIZE(BO) + LD b4, 19 * SIZE(BO) + LD b5, 20 * SIZE(BO) + LD b6, 21 * SIZE(BO) + LD b7, 22 * SIZE(BO) + LD b8, 23 * SIZE(BO) + + MUL c31, b3, c31 + + NMSUB c41, c41, b4, c31 + NMSUB c51, c51, b5, c31 + NMSUB c61, c61, b6, c31 + NMSUB c71, c71, b7, c31 + NMSUB c81, c81, b8, c31 + + LD b4, 27 * SIZE(BO) + LD b5, 28 * SIZE(BO) + LD b6, 29 * SIZE(BO) + LD b7, 30 * SIZE(BO) + LD b8, 31 * SIZE(BO) + + MUL c41, b4, c41 + + NMSUB c51, c51, b5, c41 + NMSUB c61, c61, b6, c41 + NMSUB c71, c71, b7, c41 + NMSUB c81, c81, b8, c41 + + LD b5, 36 * SIZE(BO) + LD b6, 37 * SIZE(BO) + LD b7, 38 * SIZE(BO) + LD b8, 39 * SIZE(BO) + + MUL c51, b5, c51 + + NMSUB c61, c61, b6, c51 + NMSUB c71, c71, b7, c51 + NMSUB c81, c81, b8, c51 + + LD b6, 45 * SIZE(BO) + LD b7, 46 * SIZE(BO) + LD b8, 47 * SIZE(BO) + + MUL c61, b6, c61 + + NMSUB c71, c71, b7, c61 + NMSUB c81, c81, b8, c61 + + LD b7, 54 * SIZE(BO) + LD b8, 55 * SIZE(BO) + + MUL c71, b7, c71 + + NMSUB c81, c81, b8, c71 + + LD b8, 63 * SIZE(BO) + + MUL c81, b8, c81 +#endif + +#ifdef RT + LD b1, 63 * SIZE(BO) + LD b2, 62 * SIZE(BO) + LD b3, 61 * SIZE(BO) + LD b4, 60 * SIZE(BO) + LD b5, 59 * SIZE(BO) + LD b6, 58 * SIZE(BO) + LD b7, 57 * SIZE(BO) + LD b8, 56 * SIZE(BO) + + MUL c81, b1, c81 + + NMSUB c71, c71, b2, c81 + NMSUB c61, c61, b3, c81 + NMSUB c51, c51, b4, c81 + NMSUB c41, c41, b5, c81 + NMSUB c31, c31, b6, c81 + NMSUB c21, c21, b7, c81 + NMSUB c11, c11, b8, c81 + + LD b2, 54 * SIZE(BO) + LD b3, 53 * SIZE(BO) + LD b4, 52 * SIZE(BO) + LD b5, 51 * SIZE(BO) + LD b6, 50 * SIZE(BO) + LD b7, 49 * SIZE(BO) + LD b8, 48 * SIZE(BO) + + MUL c71, b2, c71 + + NMSUB c61, c61, b3, c71 + NMSUB c51, c51, b4, c71 + NMSUB c41, c41, b5, c71 + NMSUB c31, c31, b6, c71 + NMSUB c21, c21, b7, c71 + NMSUB c11, c11, b8, c71 + + LD b3, 45 * SIZE(BO) + LD b4, 44 * SIZE(BO) + LD 
b5, 43 * SIZE(BO) + LD b6, 42 * SIZE(BO) + LD b7, 41 * SIZE(BO) + LD b8, 40 * SIZE(BO) + + MUL c61, b3, c61 + + NMSUB c51, c51, b4, c61 + NMSUB c41, c41, b5, c61 + NMSUB c31, c31, b6, c61 + NMSUB c21, c21, b7, c61 + NMSUB c11, c11, b8, c61 + + LD b4, 36 * SIZE(BO) + LD b5, 35 * SIZE(BO) + LD b6, 34 * SIZE(BO) + LD b7, 33 * SIZE(BO) + LD b8, 32 * SIZE(BO) + + MUL c51, b4, c51 + + NMSUB c41, c41, b5, c51 + NMSUB c31, c31, b6, c51 + NMSUB c21, c21, b7, c51 + NMSUB c11, c11, b8, c51 + + LD b5, 27 * SIZE(BO) + LD b6, 26 * SIZE(BO) + LD b7, 25 * SIZE(BO) + LD b8, 24 * SIZE(BO) + + MUL c41, b5, c41 + + NMSUB c31, c31, b6, c41 + NMSUB c21, c21, b7, c41 + NMSUB c11, c11, b8, c41 + + LD b6, 18 * SIZE(BO) + LD b7, 17 * SIZE(BO) + LD b8, 16 * SIZE(BO) + + MUL c31, b6, c31 + + NMSUB c21, c21, b7, c31 + NMSUB c11, c11, b8, c31 + + LD b7, 9 * SIZE(BO) + LD b8, 8 * SIZE(BO) + + MUL c21, b7, c21 + + NMSUB c11, c11, b8, c21 + + LD b8, 0 * SIZE(BO) + + MUL c11, b8, c11 +#endif + +#ifdef LN + daddiu CO1, CO1, -1 * SIZE + daddiu CO2, CO2, -1 * SIZE + daddiu CO3, CO3, -1 * SIZE + daddiu CO4, CO4, -1 * SIZE + daddiu CO5, CO5, -1 * SIZE + daddiu CO6, CO6, -1 * SIZE + daddiu CO7, CO7, -1 * SIZE + daddiu CO8, CO8, -1 * SIZE +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) + ST c21, 1 * SIZE(BO) + ST c31, 2 * SIZE(BO) + ST c41, 3 * SIZE(BO) + ST c51, 4 * SIZE(BO) + ST c61, 5 * SIZE(BO) + ST c71, 6 * SIZE(BO) + ST c81, 7 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) + ST c21, 1 * SIZE(AO) + ST c31, 2 * SIZE(AO) + ST c41, 3 * SIZE(AO) + ST c51, 4 * SIZE(AO) + ST c61, 5 * SIZE(AO) + ST c71, 6 * SIZE(AO) + ST c81, 7 * SIZE(AO) +#endif + + ST c11, 0 * SIZE(CO1) + ST c21, 0 * SIZE(CO2) + ST c31, 0 * SIZE(CO3) + ST c41, 0 * SIZE(CO4) + ST c51, 0 * SIZE(CO5) + ST c61, 0 * SIZE(CO6) + ST c71, 0 * SIZE(CO7) + ST c81, 0 * SIZE(CO8) + + MTC $0, c11 + +#ifndef LN + daddiu CO1, CO1, 1 * SIZE + daddiu CO2, CO2, 1 * SIZE + daddiu CO3, CO3, 1 * SIZE + daddiu CO4, CO4, 1 * SIZE + daddiu CO5, CO5, 1 * SIZE + daddiu CO6, CO6, 1 * SIZE + daddiu CO7, CO7, 1 * SIZE + daddiu CO8, CO8, 1 * SIZE +#endif + + MOV c21, c11 + +#ifdef RT + dsll TEMP, K, BASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + + MOV c31, c11 + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll L, TEMP, 0 + BASE_SHIFT + dsll TEMP, TEMP, 3 + BASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + + MOV c41, c11 + +#ifdef LT + daddiu KK, KK, 1 +#endif + +#ifdef LN + daddiu KK, KK, -1 +#endif + .align 3 + +.L20: + dsra I, M, 1 + MOV c51, c11 + blez I, .L29 + MOV c61, c11 + +.L11: +#if defined(LT) || defined(RN) + LD a1, 0 * SIZE(AO) + MOV c71, c11 + LD b1, 0 * SIZE(B) + MOV c81, c11 + + LD a3, 4 * SIZE(AO) + MOV c12, c11 + LD b2, 1 * SIZE(B) + MOV c22, c11 + + dsra L, KK, 2 + MOV c32, c11 + LD b3, 2 * SIZE(B) + MOV c42, c11 + + LD b4, 3 * SIZE(B) + MOV c52, c11 + LD b5, 4 * SIZE(B) + MOV c62, c11 + + LD b6, 8 * SIZE(B) + MOV c72, c11 + LD b7, 12 * SIZE(B) + MOV c82, c11 + + blez L, .L15 + move BO, B +#else + +#ifdef LN + dsll TEMP, K, 1 + BASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll L, KK, 1 + BASE_SHIFT + dsll TEMP, KK, 3 + BASE_SHIFT + + daddu AO, AORIG, L + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) + MOV c71, c11 + LD b1, 0 * SIZE(BO) + MOV c81, c11 + + LD a3, 4 * SIZE(AO) + MOV c12, c11 + LD b2, 1 * SIZE(BO) + MOV c22, c11 + + MOV c32, c11 + LD b3, 2 * SIZE(BO) + MOV c42, c11 + + LD b4, 3 * SIZE(BO) + MOV c52, c11 + LD b5, 4 * SIZE(BO) + MOV c62, c11 + + LD b6, 8 * SIZE(BO) + MOV c72, c11 + LD b7, 12 * SIZE(BO) + MOV 
c82, c11 + + dsra L, TEMP, 2 + blez L, .L15 + NOP +#endif + + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + daddiu L, L, -1 + MADD c31, c31, a1, b3 + blez L, .L13 + MADD c41, c41, a1, b4 + NOP + .align 3 + +.L12: + MADD c12, c12, a2, b1 + LD b1, 16 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD c51, c51, a1, b5 + NOP + MADD c61, c61, a1, b2 + LD a4, 2 * SIZE(AO) + MADD c71, c71, a1, b3 + NOP + MADD c81, c81, a1, b4 + LD a1, 8 * SIZE(AO) + + MADD c52, c52, a2, b5 + LD b5, 20 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 9 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 10 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 11 * SIZE(BO) + + MADD c11, c11, a4, b6 + LD a2, 3 * SIZE(AO) + MADD c21, c21, a4, b2 + NOP + MADD c31, c31, a4, b3 + NOP + MADD c41, c41, a4, b4 + NOP + + MADD c12, c12, a2, b6 + LD b6, 24 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 13 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 14 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD c51, c51, a4, b7 + NOP + MADD c61, c61, a4, b2 + NOP + MADD c71, c71, a4, b3 + NOP + MADD c81, c81, a4, b4 + NOP + + MADD c52, c52, a2, b7 + LD b7, 28 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 17 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 18 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 19 * SIZE(BO) + + MADD c11, c11, a3, b1 + LD a2, 5 * SIZE(AO) + MADD c21, c21, a3, b2 + NOP + MADD c31, c31, a3, b3 + NOP + MADD c41, c41, a3, b4 + NOP + + MADD c12, c12, a2, b1 + LD b1, 32 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 21 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 22 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 23 * SIZE(BO) + + MADD c51, c51, a3, b5 + NOP + MADD c61, c61, a3, b2 + LD a4, 6 * SIZE(AO) + MADD c71, c71, a3, b3 + NOP + MADD c81, c81, a3, b4 + LD a3, 12 * SIZE(AO) + + MADD c52, c52, a2, b5 + LD b5, 36 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 25 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 26 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 27 * SIZE(BO) + + MADD c11, c11, a4, b6 + LD a2, 7 * SIZE(AO) + MADD c21, c21, a4, b2 + NOP + MADD c31, c31, a4, b3 + NOP + MADD c41, c41, a4, b4 + daddiu L, L, -1 + + MADD c12, c12, a2, b6 + LD b6, 40 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 29 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 30 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 31 * SIZE(BO) + + MADD c51, c51, a4, b7 + daddiu BO, BO, 32 * SIZE + MADD c61, c61, a4, b2 + daddiu AO, AO, 8 * SIZE + MADD c71, c71, a4, b3 + NOP + MADD c81, c81, a4, b4 + NOP + + MADD c52, c52, a2, b7 + LD b7, 12 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 1 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 2 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 3 * SIZE(BO) + + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + NOP + MADD c31, c31, a1, b3 + bgtz L, .L12 + MADD c41, c41, a1, b4 + NOP + .align 3 + +.L13: + MADD c12, c12, a2, b1 + LD b1, 16 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD c51, c51, a1, b5 + NOP + MADD c61, c61, a1, b2 + LD a4, 2 * SIZE(AO) + MADD c71, c71, a1, b3 + NOP + MADD c81, c81, a1, b4 + LD a1, 8 * SIZE(AO) + + MADD c52, c52, a2, b5 + LD b5, 20 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 9 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 10 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 11 * SIZE(BO) + + MADD c11, c11, a4, b6 + LD a2, 3 * SIZE(AO) + MADD c21, c21, a4, b2 + NOP + MADD c31, 
c31, a4, b3 + NOP + MADD c41, c41, a4, b4 + NOP + + MADD c12, c12, a2, b6 + LD b6, 24 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 13 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 14 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD c51, c51, a4, b7 + NOP + MADD c61, c61, a4, b2 + NOP + MADD c71, c71, a4, b3 + NOP + MADD c81, c81, a4, b4 + NOP + + MADD c52, c52, a2, b7 + LD b7, 28 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 17 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 18 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 19 * SIZE(BO) + + MADD c11, c11, a3, b1 + LD a2, 5 * SIZE(AO) + MADD c21, c21, a3, b2 + NOP + MADD c31, c31, a3, b3 + NOP + MADD c41, c41, a3, b4 + NOP + + MADD c12, c12, a2, b1 + LD b1, 32 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 21 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 22 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 23 * SIZE(BO) + + MADD c51, c51, a3, b5 + NOP + MADD c61, c61, a3, b2 + LD a4, 6 * SIZE(AO) + MADD c71, c71, a3, b3 + NOP + MADD c81, c81, a3, b4 + LD a3, 12 * SIZE(AO) + + MADD c52, c52, a2, b5 + LD b5, 36 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 25 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 26 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 27 * SIZE(BO) + + MADD c11, c11, a4, b6 + LD a2, 7 * SIZE(AO) + MADD c21, c21, a4, b2 + NOP + MADD c31, c31, a4, b3 + NOP + MADD c41, c41, a4, b4 + NOP + + MADD c12, c12, a2, b6 + LD b6, 40 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 29 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 30 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 31 * SIZE(BO) + + MADD c51, c51, a4, b7 + daddiu BO, BO, 32 * SIZE + MADD c61, c61, a4, b2 + daddiu AO, AO, 8 * SIZE + MADD c71, c71, a4, b3 + NOP + MADD c81, c81, a4, b4 + NOP + + MADD c52, c52, a2, b7 + LD b7, 12 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 1 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 2 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 3 * SIZE(BO) + .align 3 + +.L15: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + blez L, .L18 + NOP + .align 3 + +.L16: + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + NOP + MADD c31, c31, a1, b3 + NOP + MADD c41, c41, a1, b4 + NOP + + MADD c12, c12, a2, b1 + LD b1, 8 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD c51, c51, a1, b5 + daddiu L, L, -1 + MADD c61, c61, a1, b2 + daddiu AO, AO, 2 * SIZE + MADD c71, c71, a1, b3 + daddiu BO, BO, 8 * SIZE + MADD c81, c81, a1, b4 + LD a1, 0 * SIZE(AO) + + MADD c52, c52, a2, b5 + LD b5, 4 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 1 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 2 * SIZE(BO) + MADD c82, c82, a2, b4 + bgtz L, .L16 + LD b4, 3 * SIZE(BO) + +.L18: +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -2 +#else + daddiu TEMP, KK, -8 +#endif + + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 3 + BASE_SHIFT + daddu AO, AORIG, L + daddu BO, B, TEMP +#endif + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + SUB c11, b1, c11 + LD b5, 4 * SIZE(BO) + SUB c21, b2, c21 + LD b6, 5 * SIZE(BO) + SUB c31, b3, c31 + LD b7, 6 * SIZE(BO) + SUB c41, b4, c41 + LD b8, 7 * SIZE(BO) + + SUB c51, b5, c51 + LD b1, 8 * SIZE(BO) + SUB c61, b6, c61 + LD b2, 9 * SIZE(BO) + SUB c71, b7, c71 + LD b3, 10 * SIZE(BO) + SUB c81, b8, c81 + LD b4, 11 * SIZE(BO) + + SUB c12, b1, c12 + LD b5, 12 * SIZE(BO) + SUB c22, b2, c22 + LD b6, 13 * SIZE(BO) + SUB c32, b3, c32 + LD b7, 
14 * SIZE(BO) + SUB c42, b4, c42 + LD b8, 15 * SIZE(BO) + + SUB c52, b5, c52 +#ifdef LN + LD b1, 3 * SIZE(AO) +#else + LD b1, 0 * SIZE(AO) +#endif + SUB c62, b6, c62 + SUB c72, b7, c72 + SUB c82, b8, c82 +#else + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 2 * SIZE(AO) + LD b4, 3 * SIZE(AO) + + SUB c11, b1, c11 + LD b5, 4 * SIZE(AO) + SUB c12, b2, c12 + LD b6, 5 * SIZE(AO) + SUB c21, b3, c21 + LD b7, 6 * SIZE(AO) + SUB c22, b4, c22 + LD b8, 7 * SIZE(AO) + + SUB c31, b5, c31 + LD b1, 8 * SIZE(AO) + SUB c32, b6, c32 + LD b2, 9 * SIZE(AO) + SUB c41, b7, c41 + LD b3, 10 * SIZE(AO) + SUB c42, b8, c42 + LD b4, 11 * SIZE(AO) + + LD b5, 12 * SIZE(AO) + SUB c51, b1, c51 + LD b6, 13 * SIZE(AO) + SUB c52, b2, c52 + LD b7, 14 * SIZE(AO) + SUB c61, b3, c61 + LD b8, 15 * SIZE(AO) + SUB c62, b4, c62 + + SUB c71, b5, c71 + SUB c72, b6, c72 + SUB c81, b7, c81 + SUB c82, b8, c82 +#endif + +#ifdef LN + MUL c12, b1, c12 + LD b2, 2 * SIZE(AO) + MUL c22, b1, c22 + MUL c32, b1, c32 + MUL c42, b1, c42 + MUL c52, b1, c52 + MUL c62, b1, c62 + MUL c72, b1, c72 + MUL c82, b1, c82 + + NMSUB c11, c11, b2, c12 + LD b3, 0 * SIZE(AO) + NMSUB c21, c21, b2, c22 + NMSUB c31, c31, b2, c32 + NMSUB c41, c41, b2, c42 + NMSUB c51, c51, b2, c52 + NMSUB c61, c61, b2, c62 + NMSUB c71, c71, b2, c72 + NMSUB c81, c81, b2, c82 + + MUL c11, b3, c11 + daddiu CO1, CO1, -2 * SIZE + MUL c21, b3, c21 + daddiu CO2, CO2, -2 * SIZE + MUL c31, b3, c31 + daddiu CO3, CO3, -2 * SIZE + MUL c41, b3, c41 + daddiu CO4, CO4, -2 * SIZE + MUL c51, b3, c51 + daddiu CO5, CO5, -2 * SIZE + MUL c61, b3, c61 + daddiu CO6, CO6, -2 * SIZE + MUL c71, b3, c71 + daddiu CO7, CO7, -2 * SIZE + MUL c81, b3, c81 + daddiu CO8, CO8, -2 * SIZE +#endif + +#ifdef LT + MUL c11, b1, c11 + LD b2, 1 * SIZE(AO) + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + MUL c51, b1, c51 + MUL c61, b1, c61 + MUL c71, b1, c71 + MUL c81, b1, c81 + + NMSUB c12, c12, b2, c11 + LD b3, 3 * SIZE(AO) + NMSUB c22, c22, b2, c21 + NMSUB c32, c32, b2, c31 + NMSUB c42, c42, b2, c41 + NMSUB c52, c52, b2, c51 + NMSUB c62, c62, b2, c61 + NMSUB c72, c72, b2, c71 + NMSUB c82, c82, b2, c81 + + MUL c12, b3, c12 + MUL c22, b3, c22 + MUL c32, b3, c32 + MUL c42, b3, c42 + MUL c52, b3, c52 + MUL c62, b3, c62 + MUL c72, b3, c72 + MUL c82, b3, c82 +#endif + +#ifdef RN + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + MUL c11, b1, c11 + MUL c12, b1, c12 + LD b5, 4 * SIZE(BO) + + NMSUB c21, c21, b2, c11 + NMSUB c22, c22, b2, c12 + LD b6, 5 * SIZE(BO) + NMSUB c31, c31, b3, c11 + NMSUB c32, c32, b3, c12 + LD b7, 6 * SIZE(BO) + NMSUB c41, c41, b4, c11 + NMSUB c42, c42, b4, c12 + LD b8, 7 * SIZE(BO) + + NMSUB c51, c51, b5, c11 + NMSUB c52, c52, b5, c12 + LD b2, 9 * SIZE(BO) + NMSUB c61, c61, b6, c11 + NMSUB c62, c62, b6, c12 + LD b3, 10 * SIZE(BO) + NMSUB c71, c71, b7, c11 + NMSUB c72, c72, b7, c12 + LD b4, 11 * SIZE(BO) + NMSUB c81, c81, b8, c11 + NMSUB c82, c82, b8, c12 + LD b5, 12 * SIZE(BO) + + MUL c21, b2, c21 + MUL c22, b2, c22 + LD b6, 13 * SIZE(BO) + + NMSUB c31, c31, b3, c21 + NMSUB c32, c32, b3, c22 + LD b7, 14 * SIZE(BO) + NMSUB c41, c41, b4, c21 + NMSUB c42, c42, b4, c22 + LD b8, 15 * SIZE(BO) + NMSUB c51, c51, b5, c21 + NMSUB c52, c52, b5, c22 + LD b3, 18 * SIZE(BO) + NMSUB c61, c61, b6, c21 + NMSUB c62, c62, b6, c22 + LD b4, 19 * SIZE(BO) + NMSUB c71, c71, b7, c21 + NMSUB c72, c72, b7, c22 + LD b5, 20 * SIZE(BO) + NMSUB c81, c81, b8, c21 + NMSUB c82, c82, b8, c22 + LD b6, 21 * SIZE(BO) + + MUL c31, b3, c31 + MUL c32, b3, c32 + LD b7, 22 * SIZE(BO) + + 
NMSUB c41, c41, b4, c31 + NMSUB c42, c42, b4, c32 + LD b8, 23 * SIZE(BO) + NMSUB c51, c51, b5, c31 + NMSUB c52, c52, b5, c32 + LD b4, 27 * SIZE(BO) + NMSUB c61, c61, b6, c31 + NMSUB c62, c62, b6, c32 + LD b5, 28 * SIZE(BO) + NMSUB c71, c71, b7, c31 + NMSUB c72, c72, b7, c32 + LD b6, 29 * SIZE(BO) + NMSUB c81, c81, b8, c31 + NMSUB c82, c82, b8, c32 + LD b7, 30 * SIZE(BO) + + MUL c41, b4, c41 + MUL c42, b4, c42 + LD b8, 31 * SIZE(BO) + + NMSUB c51, c51, b5, c41 + NMSUB c52, c52, b5, c42 + LD b5, 36 * SIZE(BO) + NMSUB c61, c61, b6, c41 + NMSUB c62, c62, b6, c42 + LD b6, 37 * SIZE(BO) + NMSUB c71, c71, b7, c41 + NMSUB c72, c72, b7, c42 + LD b7, 38 * SIZE(BO) + NMSUB c81, c81, b8, c41 + NMSUB c82, c82, b8, c42 + LD b8, 39 * SIZE(BO) + + MUL c51, b5, c51 + MUL c52, b5, c52 + + NMSUB c61, c61, b6, c51 + NMSUB c62, c62, b6, c52 + LD b6, 45 * SIZE(BO) + NMSUB c71, c71, b7, c51 + NMSUB c72, c72, b7, c52 + LD b7, 46 * SIZE(BO) + NMSUB c81, c81, b8, c51 + NMSUB c82, c82, b8, c52 + LD b8, 47 * SIZE(BO) + + MUL c61, b6, c61 + MUL c62, b6, c62 + + NMSUB c71, c71, b7, c61 + NMSUB c72, c72, b7, c62 + LD b7, 54 * SIZE(BO) + NMSUB c81, c81, b8, c61 + NMSUB c82, c82, b8, c62 + LD b8, 55 * SIZE(BO) + + MUL c71, b7, c71 + MUL c72, b7, c72 + + NMSUB c81, c81, b8, c71 + NMSUB c82, c82, b8, c72 + LD b8, 63 * SIZE(BO) + + MUL c81, b8, c81 + MUL c82, b8, c82 +#endif + +#ifdef RT + LD b1, 63 * SIZE(BO) + LD b2, 62 * SIZE(BO) + LD b3, 61 * SIZE(BO) + LD b4, 60 * SIZE(BO) + + MUL c81, b1, c81 + MUL c82, b1, c82 + LD b5, 59 * SIZE(BO) + + NMSUB c71, c71, b2, c81 + NMSUB c72, c72, b2, c82 + LD b6, 58 * SIZE(BO) + NMSUB c61, c61, b3, c81 + NMSUB c62, c62, b3, c82 + LD b7, 57 * SIZE(BO) + NMSUB c51, c51, b4, c81 + NMSUB c52, c52, b4, c82 + LD b8, 56 * SIZE(BO) + + NMSUB c41, c41, b5, c81 + NMSUB c42, c42, b5, c82 + LD b2, 54 * SIZE(BO) + NMSUB c31, c31, b6, c81 + NMSUB c32, c32, b6, c82 + LD b3, 53 * SIZE(BO) + NMSUB c21, c21, b7, c81 + NMSUB c22, c22, b7, c82 + LD b4, 52 * SIZE(BO) + NMSUB c11, c11, b8, c81 + NMSUB c12, c12, b8, c82 + LD b5, 51 * SIZE(BO) + + MUL c71, b2, c71 + MUL c72, b2, c72 + LD b6, 50 * SIZE(BO) + + NMSUB c61, c61, b3, c71 + NMSUB c62, c62, b3, c72 + LD b7, 49 * SIZE(BO) + NMSUB c51, c51, b4, c71 + NMSUB c52, c52, b4, c72 + LD b8, 48 * SIZE(BO) + NMSUB c41, c41, b5, c71 + NMSUB c42, c42, b5, c72 + LD b3, 45 * SIZE(BO) + NMSUB c31, c31, b6, c71 + NMSUB c32, c32, b6, c72 + LD b4, 44 * SIZE(BO) + NMSUB c21, c21, b7, c71 + NMSUB c22, c22, b7, c72 + LD b5, 43 * SIZE(BO) + NMSUB c11, c11, b8, c71 + NMSUB c12, c12, b8, c72 + LD b6, 42 * SIZE(BO) + + MUL c61, b3, c61 + MUL c62, b3, c62 + LD b7, 41 * SIZE(BO) + + NMSUB c51, c51, b4, c61 + NMSUB c52, c52, b4, c62 + LD b8, 40 * SIZE(BO) + NMSUB c41, c41, b5, c61 + NMSUB c42, c42, b5, c62 + LD b4, 36 * SIZE(BO) + NMSUB c31, c31, b6, c61 + NMSUB c32, c32, b6, c62 + LD b5, 35 * SIZE(BO) + NMSUB c21, c21, b7, c61 + NMSUB c22, c22, b7, c62 + LD b6, 34 * SIZE(BO) + NMSUB c11, c11, b8, c61 + NMSUB c12, c12, b8, c62 + LD b7, 33 * SIZE(BO) + + MUL c51, b4, c51 + MUL c52, b4, c52 + LD b8, 32 * SIZE(BO) + + NMSUB c41, c41, b5, c51 + NMSUB c42, c42, b5, c52 + LD b5, 27 * SIZE(BO) + NMSUB c31, c31, b6, c51 + NMSUB c32, c32, b6, c52 + LD b6, 26 * SIZE(BO) + NMSUB c21, c21, b7, c51 + NMSUB c22, c22, b7, c52 + LD b7, 25 * SIZE(BO) + NMSUB c11, c11, b8, c51 + NMSUB c12, c12, b8, c52 + LD b8, 24 * SIZE(BO) + + MUL c41, b5, c41 + MUL c42, b5, c42 + + NMSUB c31, c31, b6, c41 + NMSUB c32, c32, b6, c42 + LD b6, 18 * SIZE(BO) + NMSUB c21, c21, b7, c41 + NMSUB c22, c22, b7, c42 + LD 
b7, 17 * SIZE(BO) + NMSUB c11, c11, b8, c41 + NMSUB c12, c12, b8, c42 + LD b8, 16 * SIZE(BO) + + MUL c31, b6, c31 + MUL c32, b6, c32 + + NMSUB c21, c21, b7, c31 + NMSUB c22, c22, b7, c32 + LD b7, 9 * SIZE(BO) + NMSUB c11, c11, b8, c31 + NMSUB c12, c12, b8, c32 + LD b8, 8 * SIZE(BO) + + MUL c21, b7, c21 + MUL c22, b7, c22 + + NMSUB c11, c11, b8, c21 + NMSUB c12, c12, b8, c22 + LD b8, 0 * SIZE(BO) + + MUL c11, b8, c11 + MUL c12, b8, c12 +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) + ST c21, 1 * SIZE(BO) + ST c31, 2 * SIZE(BO) + ST c41, 3 * SIZE(BO) + ST c51, 4 * SIZE(BO) + ST c61, 5 * SIZE(BO) + ST c71, 6 * SIZE(BO) + ST c81, 7 * SIZE(BO) + + ST c12, 8 * SIZE(BO) + ST c22, 9 * SIZE(BO) + ST c32, 10 * SIZE(BO) + ST c42, 11 * SIZE(BO) + ST c52, 12 * SIZE(BO) + ST c62, 13 * SIZE(BO) + ST c72, 14 * SIZE(BO) + ST c82, 15 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) + ST c12, 1 * SIZE(AO) + ST c21, 2 * SIZE(AO) + ST c22, 3 * SIZE(AO) + ST c31, 4 * SIZE(AO) + ST c32, 5 * SIZE(AO) + ST c41, 6 * SIZE(AO) + ST c42, 7 * SIZE(AO) + + ST c51, 8 * SIZE(AO) + ST c52, 9 * SIZE(AO) + ST c61, 10 * SIZE(AO) + ST c62, 11 * SIZE(AO) + ST c71, 12 * SIZE(AO) + ST c72, 13 * SIZE(AO) + ST c81, 14 * SIZE(AO) + ST c82, 15 * SIZE(AO) +#endif + + ST c11, 0 * SIZE(CO1) + ST c12, 1 * SIZE(CO1) + ST c21, 0 * SIZE(CO2) + ST c22, 1 * SIZE(CO2) + ST c31, 0 * SIZE(CO3) + ST c32, 1 * SIZE(CO3) + ST c41, 0 * SIZE(CO4) + ST c42, 1 * SIZE(CO4) + ST c51, 0 * SIZE(CO5) + ST c52, 1 * SIZE(CO5) + ST c61, 0 * SIZE(CO6) + ST c62, 1 * SIZE(CO6) + ST c71, 0 * SIZE(CO7) + ST c72, 1 * SIZE(CO7) + ST c81, 0 * SIZE(CO8) + ST c82, 1 * SIZE(CO8) + + MTC $0, a1 + +#ifndef LN + daddiu CO1, CO1, 2 * SIZE + daddiu CO2, CO2, 2 * SIZE + daddiu CO3, CO3, 2 * SIZE + daddiu CO4, CO4, 2 * SIZE + daddiu CO5, CO5, 2 * SIZE + daddiu CO6, CO6, 2 * SIZE + daddiu CO7, CO7, 2 * SIZE + daddiu CO8, CO8, 2 * SIZE +#endif + + MOV c11, a1 + MOV c21, a1 + +#ifdef RT + dsll TEMP, K, 1 + BASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + + MOV c31, a1 + MOV c41, a1 + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 3 + BASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 2 +#endif + +#ifdef LN + daddiu KK, KK, -2 +#endif + + daddiu I, I, -1 + MOV c51, a1 + + bgtz I, .L11 + MOV c61, a1 + .align 3 + +.L29: +#ifdef LN + dsll TEMP, K, 3 + BASE_SHIFT + daddu B, B, TEMP +#endif + +#if defined(LT) || defined(RN) + move B, BO +#endif + +#ifdef RN + daddiu KK, KK, 8 +#endif + +#ifdef RT + daddiu KK, KK, -8 +#endif + + bgtz J, .L10 + NOP + .align 3 + +.L30: + andi J, N, 4 + blez J, .L50 + move AO, A + +#ifdef RT + dsll TEMP, K, 2 + BASE_SHIFT + dsubu B, B, TEMP + + dsll TEMP, LDC, 2 + dsubu C, C, TEMP +#endif + + move CO1, C + MTC $0, c11 + daddu CO2, C, LDC + daddu CO3, CO2, LDC + MOV c21, c11 + daddu CO4, CO3, LDC + MOV c31, c11 + +#ifdef LN + daddu KK, M, OFFSET +#endif + +#ifdef LT + move KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + daddu C, CO4, LDC +#endif + + andi I, M, 1 + blez I, .L40 + MOV c41, c11 + +#if defined(LT) || defined(RN) + LD a1, 0 * SIZE(AO) + MOV c71, c11 + LD a2, 1 * SIZE(AO) + MOV c81, c11 + + LD b1, 0 * SIZE(B) + LD b2, 1 * SIZE(B) + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + LD b5, 4 * SIZE(B) + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + dsra L, KK, 2 + + blez L, .L45 + move BO, B +#else +#ifdef LN + dsll TEMP, K, BASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll L, KK, 0 + 
BASE_SHIFT + dsll TEMP, KK, 2 + BASE_SHIFT + + daddu AO, AORIG, L + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) + MOV c71, c11 + LD a2, 1 * SIZE(AO) + MOV c81, c11 + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + + dsra L, TEMP, 2 + + blez L, .L45 + NOP +#endif + .align 3 + +.L42: + MADD c11, c11, a1, b1 + LD b1, 16 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 5 * SIZE(BO) + MADD c31, c31, a1, b3 + LD b3, 6 * SIZE(BO) + MADD c41, c41, a1, b4 + LD b4, 7 * SIZE(BO) + + LD a1, 4 * SIZE(AO) + daddiu L, L, -1 + + MADD c11, c11, a2, b5 + LD b5, 20 * SIZE(BO) + MADD c21, c21, a2, b2 + LD b2, 9 * SIZE(BO) + MADD c31, c31, a2, b3 + LD b3, 10 * SIZE(BO) + MADD c41, c41, a2, b4 + LD b4, 11 * SIZE(BO) + + LD a2, 2 * SIZE(AO) + daddiu AO, AO, 4 * SIZE + + MADD c11, c11, a2, b6 + LD b6, 24 * SIZE(BO) + MADD c21, c21, a2, b2 + LD b2, 13 * SIZE(BO) + MADD c31, c31, a2, b3 + LD b3, 14 * SIZE(BO) + MADD c41, c41, a2, b4 + LD b4, 15 * SIZE(BO) + + LD a2, -1 * SIZE(AO) + daddiu BO, BO, 16 * SIZE + + MADD c11, c11, a2, b7 + LD b7, 12 * SIZE(BO) + MADD c21, c21, a2, b2 + LD b2, 1 * SIZE(BO) + MADD c31, c31, a2, b3 + LD b3, 2 * SIZE(BO) + MADD c41, c41, a2, b4 + LD b4, 3 * SIZE(BO) + + bgtz L, .L42 + LD a2, 1 * SIZE(AO) + .align 3 + +.L45: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L48 + NOP + .align 3 + +.L46: + MADD c11, c11, a1, b1 + LD b1, 4 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 5 * SIZE(BO) + MADD c31, c31, a1, b3 + LD b3, 6 * SIZE(BO) + MADD c41, c41, a1, b4 + LD a1, 1 * SIZE(AO) + + LD b4, 7 * SIZE(BO) + daddiu L, L, -1 + + daddiu AO, AO, 1 * SIZE + MOV a2, a2 + bgtz L, .L46 + daddiu BO, BO, 4 * SIZE + + +.L48: +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -1 +#else + daddiu TEMP, KK, -4 +#endif + + dsll L, TEMP, 0 + BASE_SHIFT + dsll TEMP, TEMP, 2 + BASE_SHIFT + daddu AO, AORIG, L + daddu BO, B, TEMP +#endif + + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 +#else + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 2 * SIZE(AO) + LD b4, 3 * SIZE(AO) + + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 +#endif + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(AO) + + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 +#endif + +#ifdef RN + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + MUL c11, b1, c11 + + NMSUB c21, c21, b2, c11 + NMSUB c31, c31, b3, c11 + NMSUB c41, c41, b4, c11 + + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + MUL c21, b2, c21 + + NMSUB c31, c31, b3, c21 + NMSUB c41, c41, b4, c21 + + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + + MUL c31, b3, c31 + + NMSUB c41, c41, b4, c31 + + LD b4, 15 * SIZE(BO) + + MUL c41, b4, c41 +#endif + +#ifdef RT + LD b5, 15 * SIZE(BO) + LD b6, 14 * SIZE(BO) + LD b7, 13 * SIZE(BO) + LD b8, 12 * SIZE(BO) + + MUL c41, b5, c41 + + NMSUB c31, c31, b6, c41 + NMSUB c21, c21, b7, c41 + NMSUB c11, c11, b8, c41 + + LD b6, 10 * SIZE(BO) + LD b7, 9 * SIZE(BO) + LD b8, 8 * SIZE(BO) + + MUL c31, b6, c31 + + NMSUB c21, c21, b7, c31 + NMSUB c11, c11, b8, c31 + + LD b7, 5 * SIZE(BO) + LD b8, 4 * SIZE(BO) + + MUL c21, b7, c21 + + NMSUB c11, c11, b8, c21 + + LD b8, 0 * SIZE(BO) + + MUL c11, b8, c11 +#endif + 
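+/* Editorial annotation (an assumption, not stated in the original source):
+   the RN/RT paths above perform a forward/backward substitution of the
+   single result row against the 4x4 triangular block of B.  The solve
+   multiplies by the diagonal entries (MUL) rather than dividing, which
+   suggests the packed panel stores those entries already inverted.  The
+   RN case in rough C, with c, b and inv_diag as illustrative names:
+
+       for (j = 0; j < 4; j++) {
+           c[j] *= inv_diag[j];
+           for (jj = j + 1; jj < 4; jj++)
+               c[jj] -= b[4 * j + jj] * c[j];
+       }
+
+   The MUL/NMSUB pairs above correspond to these two statements. */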
+#ifdef LN + daddiu CO1, CO1, -1 * SIZE + daddiu CO2, CO2, -1 * SIZE + daddiu CO3, CO3, -1 * SIZE + daddiu CO4, CO4, -1 * SIZE +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) + ST c21, 1 * SIZE(BO) + ST c31, 2 * SIZE(BO) + ST c41, 3 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) + ST c21, 1 * SIZE(AO) + ST c31, 2 * SIZE(AO) + ST c41, 3 * SIZE(AO) +#endif + + ST c11, 0 * SIZE(CO1) + ST c21, 0 * SIZE(CO2) + ST c31, 0 * SIZE(CO3) + ST c41, 0 * SIZE(CO4) + + MTC $0, c11 + +#ifndef LN + daddiu CO1, CO1, 1 * SIZE + daddiu CO2, CO2, 1 * SIZE + daddiu CO3, CO3, 1 * SIZE + daddiu CO4, CO4, 1 * SIZE +#endif + + MOV c21, c11 + +#ifdef RT + dsll TEMP, K, BASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll L, TEMP, 0 + BASE_SHIFT + dsll TEMP, TEMP, 2 + BASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + + MOV c31, c11 + +#ifdef LT + daddiu KK, KK, 1 +#endif + +#ifdef LN + daddiu KK, KK, -1 +#endif + .align 3 + +.L40: + dsra I, M, 1 + MOV c61, c11 + blez I, .L49 + MOV c41, c11 + +.L31: +#if defined(LT) || defined(RN) + LD a1, 0 * SIZE(AO) + LD a3, 4 * SIZE(AO) + + LD b1, 0 * SIZE(B) + MOV c12, c11 + LD b2, 1 * SIZE(B) + MOV c22, c11 + LD b3, 2 * SIZE(B) + MOV c32, c11 + LD b4, 3 * SIZE(B) + MOV c42, c11 + + LD b5, 4 * SIZE(B) + dsra L, KK, 2 + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + blez L, .L35 + move BO, B +#else +#ifdef LN + dsll TEMP, K, 1 + BASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll L, KK, 1 + BASE_SHIFT + dsll TEMP, KK, 2 + BASE_SHIFT + + daddu AO, AORIG, L + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) + LD a3, 4 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + MOV c12, c11 + LD b2, 1 * SIZE(BO) + MOV c22, c11 + LD b3, 2 * SIZE(BO) + MOV c32, c11 + LD b4, 3 * SIZE(BO) + MOV c42, c11 + + LD b5, 4 * SIZE(BO) + dsra L, TEMP, 2 + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + + blez L, .L35 + NOP +#endif + .align 3 + +.L32: + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + daddiu L, L, -1 + MADD c31, c31, a1, b3 + NOP + MADD c41, c41, a1, b4 + LD a1, 2 * SIZE(AO) + + MADD c12, c12, a2, b1 + LD b1, 16 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD c11, c11, a1, b5 + LD a2, 3 * SIZE(AO) + MADD c21, c21, a1, b2 + NOP + MADD c31, c31, a1, b3 + NOP + MADD c41, c41, a1, b4 + LD a1, 8 * SIZE(AO) + + MADD c12, c12, a2, b5 + LD b5, 20 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 9 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 10 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 11 * SIZE(BO) + + MADD c11, c11, a3, b6 + LD a2, 5 * SIZE(AO) + MADD c21, c21, a3, b2 + NOP + MADD c31, c31, a3, b3 + NOP + MADD c41, c41, a3, b4 + LD a3, 6 * SIZE(AO) + + MADD c12, c12, a2, b6 + LD b6, 24 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 13 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 14 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD c11, c11, a3, b7 + LD a2, 7 * SIZE(AO) + MADD c21, c21, a3, b2 + daddiu AO, AO, 8 * SIZE + MADD c31, c31, a3, b3 + daddiu BO, BO, 16 * SIZE + MADD c41, c41, a3, b4 + LD a3, 4 * SIZE(AO) + + MADD c12, c12, a2, b7 + LD b7, 12 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 1 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 2 * SIZE(BO) + MADD c42, c42, a2, b4 + NOP + + bgtz L, .L32 + LD b4, 3 * SIZE(BO) + .align 3 + +.L35: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L38 + NOP + .align 3 + +.L36: + MADD c11, c11, a1, 
b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + daddiu L, L, -1 + MADD c31, c31, a1, b3 + daddiu AO, AO, 2 * SIZE + MADD c41, c41, a1, b4 + LD a1, 0 * SIZE(AO) + + MADD c12, c12, a2, b1 + LD b1, 4 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + bgtz L, .L36 + daddiu BO, BO, 4 * SIZE + +.L38: +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -2 +#else + daddiu TEMP, KK, -4 +#endif + + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 2 + BASE_SHIFT + daddu AO, AORIG, L + daddu BO, B, TEMP +#endif + + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c12, b5, c12 + SUB c22, b6, c22 + SUB c32, b7, c32 + SUB c42, b8, c42 +#else + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 2 * SIZE(AO) + LD b4, 3 * SIZE(AO) + LD b5, 4 * SIZE(AO) + LD b6, 5 * SIZE(AO) + LD b7, 6 * SIZE(AO) + LD b8, 7 * SIZE(AO) + + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c21, b3, c21 + SUB c22, b4, c22 + SUB c31, b5, c31 + SUB c32, b6, c32 + SUB c41, b7, c41 + SUB c42, b8, c42 +#endif + +#ifdef LN + LD b1, 3 * SIZE(AO) + LD b2, 2 * SIZE(AO) + LD b3, 0 * SIZE(AO) + + MUL c12, b1, c12 + MUL c22, b1, c22 + MUL c32, b1, c32 + MUL c42, b1, c42 + + NMSUB c11, c11, b2, c12 + NMSUB c21, c21, b2, c22 + NMSUB c31, c31, b2, c32 + NMSUB c41, c41, b2, c42 + + MUL c11, b3, c11 + MUL c21, b3, c21 + MUL c31, b3, c31 + MUL c41, b3, c41 +#endif + +#ifdef LT + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 3 * SIZE(AO) + + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + + NMSUB c12, c12, b2, c11 + NMSUB c22, c22, b2, c21 + NMSUB c32, c32, b2, c31 + NMSUB c42, c42, b2, c41 + + MUL c12, b3, c12 + MUL c22, b3, c22 + MUL c32, b3, c32 + MUL c42, b3, c42 +#endif + +#ifdef RN + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + MUL c11, b1, c11 + MUL c12, b1, c12 + + NMSUB c21, c21, b2, c11 + NMSUB c22, c22, b2, c12 + NMSUB c31, c31, b3, c11 + NMSUB c32, c32, b3, c12 + NMSUB c41, c41, b4, c11 + NMSUB c42, c42, b4, c12 + + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + MUL c21, b2, c21 + MUL c22, b2, c22 + + NMSUB c31, c31, b3, c21 + NMSUB c32, c32, b3, c22 + NMSUB c41, c41, b4, c21 + NMSUB c42, c42, b4, c22 + + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + + MUL c31, b3, c31 + MUL c32, b3, c32 + + NMSUB c41, c41, b4, c31 + NMSUB c42, c42, b4, c32 + + LD b4, 15 * SIZE(BO) + + MUL c41, b4, c41 + MUL c42, b4, c42 +#endif + +#ifdef RT + LD b5, 15 * SIZE(BO) + LD b6, 14 * SIZE(BO) + LD b7, 13 * SIZE(BO) + LD b8, 12 * SIZE(BO) + + MUL c41, b5, c41 + MUL c42, b5, c42 + + NMSUB c31, c31, b6, c41 + NMSUB c32, c32, b6, c42 + NMSUB c21, c21, b7, c41 + NMSUB c22, c22, b7, c42 + NMSUB c11, c11, b8, c41 + NMSUB c12, c12, b8, c42 + + LD b6, 10 * SIZE(BO) + LD b7, 9 * SIZE(BO) + LD b8, 8 * SIZE(BO) + + MUL c31, b6, c31 + MUL c32, b6, c32 + + NMSUB c21, c21, b7, c31 + NMSUB c22, c22, b7, c32 + NMSUB c11, c11, b8, c31 + NMSUB c12, c12, b8, c32 + + LD b7, 5 * SIZE(BO) + LD b8, 4 * SIZE(BO) + + MUL c21, b7, c21 + MUL c22, b7, c22 + + NMSUB c11, c11, b8, c21 + NMSUB c12, c12, b8, c22 + + LD b8, 0 * SIZE(BO) + + MUL c11, b8, c11 + MUL c12, b8, c12 +#endif + +#ifdef LN + daddiu CO1, CO1, -2 * SIZE + daddiu CO2, CO2, -2 * 
SIZE + daddiu CO3, CO3, -2 * SIZE + daddiu CO4, CO4, -2 * SIZE +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) + ST c21, 1 * SIZE(BO) + ST c31, 2 * SIZE(BO) + ST c41, 3 * SIZE(BO) + ST c12, 4 * SIZE(BO) + ST c22, 5 * SIZE(BO) + ST c32, 6 * SIZE(BO) + ST c42, 7 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) + ST c12, 1 * SIZE(AO) + ST c21, 2 * SIZE(AO) + ST c22, 3 * SIZE(AO) + ST c31, 4 * SIZE(AO) + ST c32, 5 * SIZE(AO) + ST c41, 6 * SIZE(AO) + ST c42, 7 * SIZE(AO) +#endif + + ST c11, 0 * SIZE(CO1) + ST c12, 1 * SIZE(CO1) + ST c21, 0 * SIZE(CO2) + ST c22, 1 * SIZE(CO2) + ST c31, 0 * SIZE(CO3) + ST c32, 1 * SIZE(CO3) + ST c41, 0 * SIZE(CO4) + ST c42, 1 * SIZE(CO4) + +#ifndef LN + daddiu CO1, CO1, 2 * SIZE + daddiu CO2, CO2, 2 * SIZE + daddiu CO3, CO3, 2 * SIZE + daddiu CO4, CO4, 2 * SIZE +#endif + +#ifdef RT + dsll TEMP, K, 1 + BASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 2 + BASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 2 +#endif + +#ifdef LN + daddiu KK, KK, -2 +#endif + + MTC $0, a1 + + MOV c11, a1 + MOV c21, a1 + MOV c31, a1 + + daddiu I, I, -1 + + bgtz I, .L31 + MOV c41, c11 + .align 3 + +.L49: +#ifdef LN + dsll TEMP, K, 2 + BASE_SHIFT + daddu B, B, TEMP +#endif + +#if defined(LT) || defined(RN) + move B, BO +#endif + +#ifdef RN + daddiu KK, KK, 4 +#endif + +#ifdef RT + daddiu KK, KK, -4 +#endif + .align 3 + +.L50: + andi J, N, 2 + blez J, .L70 + +#ifdef RT + dsll TEMP, K, 1 + BASE_SHIFT + dsubu B, B, TEMP + + dsll TEMP, LDC, 1 + dsubu C, C, TEMP +#endif + + move AO, A + move CO1, C + daddu CO2, C, LDC + +#ifdef LN + daddu KK, M, OFFSET +#endif + +#ifdef LT + move KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + daddu C, CO2, LDC +#endif + + andi I, M, 1 + blez I, .L60 + NOP + +#if defined(LT) || defined(RN) + dsra L, KK, 2 + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a3, 2 * SIZE(AO) + MOV c31, c11 + LD a4, 3 * SIZE(AO) + MOV c41, c11 + + LD b1, 0 * SIZE(B) + LD b2, 1 * SIZE(B) + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + LD b5, 4 * SIZE(B) + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + blez L, .L65 + move BO, B +#else +#ifdef LN + dsll TEMP, K, BASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll L, KK, 0 + BASE_SHIFT + dsll TEMP, KK, 1 + BASE_SHIFT + + daddu AO, AORIG, L + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + dsra L, TEMP, 2 + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a3, 2 * SIZE(AO) + MOV c31, c11 + LD a4, 3 * SIZE(AO) + MOV c41, c11 + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + + blez L, .L65 + NOP +#endif + .align 3 + +.L62: + MADD c11, c11, a1, b1 + LD b1, 4 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 5 * SIZE(BO) + MADD c31, c31, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c41, c41, a2, b4 + LD b4, 7 * SIZE(BO) + + LD a1, 4 * SIZE(AO) + LD a2, 5 * SIZE(AO) + + MADD c11, c11, a3, b1 + LD b1, 8 * SIZE(BO) + MADD c21, c21, a3, b2 + LD b2, 9 * SIZE(BO) + MADD c31, c31, a4, b3 + LD b3, 10 * SIZE(BO) + MADD c41, c41, a4, b4 + LD b4, 11 * SIZE(BO) + + LD a3, 6 * SIZE(AO) + LD a4, 7 * SIZE(AO) + + daddiu L, L, -1 + daddiu AO, AO, 4 * SIZE + + bgtz L, .L62 + daddiu BO, BO, 8 * SIZE + .align 3 + +.L65: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez 
L, .L68 + NOP + .align 3 + +.L66: + MADD c11, c11, a1, b1 + LD b1, 2 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 3 * SIZE(BO) + + LD a1, 1 * SIZE(AO) + daddiu L, L, -1 + + daddiu AO, AO, 1 * SIZE + bgtz L, .L66 + daddiu BO, BO, 2 * SIZE + + +.L68: + ADD c11, c11, c31 + ADD c21, c21, c41 + +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -1 +#else + daddiu TEMP, KK, -2 +#endif + + dsll L, TEMP, 0 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + daddu AO, AORIG, L + daddu BO, B, TEMP +#endif + + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + SUB c11, b1, c11 + SUB c21, b2, c21 +#else + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + + SUB c11, b1, c11 + SUB c21, b2, c21 +#endif + +#if defined(LN) || defined(LT) + LD b3, 0 * SIZE(AO) + + MUL c11, b3, c11 + MUL c21, b3, c21 +#endif + +#ifdef RN + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 3 * SIZE(BO) + + MUL c11, b1, c11 + + NMSUB c21, c21, b2, c11 + + MUL c21, b3, c21 +#endif + +#ifdef RT + LD b1, 3 * SIZE(BO) + LD b2, 2 * SIZE(BO) + LD b3, 0 * SIZE(BO) + + MUL c21, b1, c21 + + NMSUB c11, c11, b2, c21 + + MUL c11, b3, c11 +#endif + +#ifdef LN + daddiu CO1, CO1, -1 * SIZE + daddiu CO2, CO2, -1 * SIZE +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) + ST c21, 1 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) + ST c21, 1 * SIZE(AO) +#endif + + ST c11, 0 * SIZE(CO1) + ST c21, 0 * SIZE(CO2) + +#ifndef LN + daddiu CO1, CO1, 1 * SIZE + daddiu CO2, CO2, 1 * SIZE +#endif + +#ifdef RT + dsll TEMP, K, 0 + BASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll L, TEMP, 0 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 1 +#endif + +#ifdef LN + daddiu KK, KK, -1 +#endif + .align 3 + +.L60: + dsra I, M, 1 + blez I, .L69 + NOP + +.L51: +#if defined(LT) || defined(RN) + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a5, 4 * SIZE(AO) + + LD b1, 0 * SIZE(B) + MOV c12, c11 + LD b2, 1 * SIZE(B) + MOV c22, c11 + LD b3, 2 * SIZE(B) + LD b5, 4 * SIZE(B) + dsra L, KK, 2 + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + blez L, .L55 + move BO, B + +#else +#ifdef LN + dsll TEMP, K, 1 + BASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll L, KK, 1 + BASE_SHIFT + dsll TEMP, KK, 1 + BASE_SHIFT + + daddu AO, AORIG, L + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a5, 4 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + MOV c12, c11 + LD b2, 1 * SIZE(BO) + MOV c22, c11 + LD b3, 2 * SIZE(BO) + LD b5, 4 * SIZE(BO) + dsra L, TEMP, 2 + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + + blez L, .L55 + NOP +#endif + .align 3 + +.L52: + MADD c11, c11, a1, b1 + LD a3, 2 * SIZE(AO) + MADD c21, c21, a1, b2 + LD b4, 3 * SIZE(BO) + MADD c12, c12, a2, b1 + LD a4, 3 * SIZE(AO) + MADD c22, c22, a2, b2 + LD b1, 8 * SIZE(BO) + + MADD c11, c11, a3, b3 + LD a1, 8 * SIZE(AO) + MADD c21, c21, a3, b4 + LD b2, 5 * SIZE(BO) + MADD c12, c12, a4, b3 + LD a2, 5 * SIZE(AO) + MADD c22, c22, a4, b4 + LD b3, 6 * SIZE(BO) + + MADD c11, c11, a5, b5 + LD a3, 6 * SIZE(AO) + MADD c21, c21, a5, b2 + LD b4, 7 * SIZE(BO) + MADD c12, c12, a2, b5 + LD a4, 7 * SIZE(AO) + MADD c22, c22, a2, b2 + LD b5, 12 * SIZE(BO) + + MADD c11, c11, a3, b3 + LD a5, 12 * SIZE(AO) + MADD c21, c21, a3, b4 + LD b2, 9 * SIZE(BO) + MADD c12, c12, a4, b3 + LD a2, 9 * SIZE(AO) + MADD c22, c22, a4, b4 + LD b3, 10 * SIZE(BO) + + daddiu AO, AO, 8 * SIZE + daddiu L, L, -1 + 
bgtz L, .L52 + daddiu BO, BO, 8 * SIZE + .align 3 + +.L55: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L58 + NOP + .align 3 + +.L56: + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + LD a1, 2 * SIZE(AO) + + MADD c12, c12, a2, b1 + LD b1, 2 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 3 * SIZE(BO) + + daddiu L, L, -1 + daddiu AO, AO, 2 * SIZE + bgtz L, .L56 + daddiu BO, BO, 2 * SIZE + +.L58: +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -2 +#else + daddiu TEMP, KK, -2 +#endif + + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + daddu AO, AORIG, L + daddu BO, B, TEMP +#endif + + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c12, b3, c12 + SUB c22, b4, c22 +#else + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 2 * SIZE(AO) + LD b4, 3 * SIZE(AO) + + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c21, b3, c21 + SUB c22, b4, c22 +#endif + +#ifdef LN + LD b1, 3 * SIZE(AO) + LD b2, 2 * SIZE(AO) + LD b3, 0 * SIZE(AO) + + MUL c12, b1, c12 + MUL c22, b1, c22 + + NMSUB c11, c11, b2, c12 + NMSUB c21, c21, b2, c22 + + MUL c11, b3, c11 + MUL c21, b3, c21 +#endif + +#ifdef LT + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 3 * SIZE(AO) + + MUL c11, b1, c11 + MUL c21, b1, c21 + + NMSUB c12, c12, b2, c11 + NMSUB c22, c22, b2, c21 + + MUL c12, b3, c12 + MUL c22, b3, c22 +#endif + +#ifdef RN + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 3 * SIZE(BO) + + MUL c11, b1, c11 + MUL c12, b1, c12 + + NMSUB c21, c21, b2, c11 + NMSUB c22, c22, b2, c12 + + MUL c21, b3, c21 + MUL c22, b3, c22 +#endif + +#ifdef RT + LD b1, 3 * SIZE(BO) + LD b2, 2 * SIZE(BO) + LD b3, 0 * SIZE(BO) + + MUL c21, b1, c21 + MUL c22, b1, c22 + + NMSUB c11, c11, b2, c21 + NMSUB c12, c12, b2, c22 + + MUL c11, b3, c11 + MUL c12, b3, c12 +#endif + +#ifdef LN + daddiu CO1, CO1, -2 * SIZE + daddiu CO2, CO2, -2 * SIZE +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) + ST c21, 1 * SIZE(BO) + ST c12, 2 * SIZE(BO) + ST c22, 3 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) + ST c12, 1 * SIZE(AO) + ST c21, 2 * SIZE(AO) + ST c22, 3 * SIZE(AO) +#endif + + ST c11, 0 * SIZE(CO1) + ST c12, 1 * SIZE(CO1) + ST c21, 0 * SIZE(CO2) + ST c22, 1 * SIZE(CO2) + +#ifndef LN + daddiu CO1, CO1, 2 * SIZE + daddiu CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + dsll TEMP, K, 1 + BASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll TEMP, TEMP, 1 + BASE_SHIFT + daddu AO, AO, TEMP + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 2 +#endif + +#ifdef LN + daddiu KK, KK, -2 +#endif + + MTC $0, a1 + + MOV c11, a1 + MOV c21, a1 + MOV c31, a1 + + daddiu I, I, -1 + + bgtz I, .L51 + MOV c41, c11 + .align 3 + +.L69: +#ifdef LN + dsll TEMP, K, 1 + BASE_SHIFT + daddu B, B, TEMP +#endif + +#if defined(LT) || defined(RN) + move B, BO +#endif + +#ifdef RN + daddiu KK, KK, 2 +#endif + +#ifdef RT + daddiu KK, KK, -2 +#endif + .align 3 + +.L70: + andi J, N, 1 + blez J, .L999 + NOP + +#ifdef RT + dsll TEMP, K, BASE_SHIFT + dsubu B, B, TEMP + + dsubu C, C, LDC +#endif + + move AO, A + move CO1, C + +#ifdef LN + daddu KK, M, OFFSET +#endif + +#ifdef LT + move KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + daddu C, CO1, LDC +#endif + + andi I, M, 1 + blez I, .L80 + NOP + +#if defined(LT) || defined(RN) + LD a1, 0 * SIZE(AO) + MTC $0, 
c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(B) + LD b2, 1 * SIZE(B) + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + LD b5, 4 * SIZE(B) + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + dsra L, KK, 2 + blez L, .L85 + move BO, B +#else +#ifdef LN + dsll TEMP, K, BASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll TEMP, KK, BASE_SHIFT + + daddu AO, AORIG, TEMP + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + + dsra L, TEMP, 2 + blez L, .L85 + NOP +#endif + .align 3 + +.L82: + LD a1, 0 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + MADD c11, c11, a1, b1 + + LD a1, 1 * SIZE(AO) + LD b1, 1 * SIZE(BO) + + MADD c21, c21, a1, b1 + + LD a1, 2 * SIZE(AO) + LD b1, 2 * SIZE(BO) + + MADD c11, c11, a1, b1 + + LD a1, 3 * SIZE(AO) + LD b1, 3 * SIZE(BO) + + MADD c21, c21, a1, b1 + + daddiu L, L, -1 + daddiu AO, AO, 4 * SIZE + bgtz L, .L82 + daddiu BO, BO, 4 * SIZE + .align 3 + +.L85: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L88 + NOP + .align 3 + +.L86: + LD a1, 0 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + MADD c11, c11, a1, b1 + + daddiu L, L, -1 + daddiu AO, AO, 1 * SIZE + bgtz L, .L86 + daddiu BO, BO, 1 * SIZE + + +.L88: + ADD c11, c11, c21 + +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -1 +#else + daddiu TEMP, KK, -1 +#endif + + dsll TEMP, TEMP, 0 + BASE_SHIFT + daddu AO, AORIG, TEMP + daddu BO, B, TEMP +#endif + + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + + SUB c11, b1, c11 +#else + LD b1, 0 * SIZE(AO) + + SUB c11, b1, c11 +#endif + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(AO) + + MUL c11, b1, c11 +#endif + +#if defined(RN) || defined(RT) + LD b1, 0 * SIZE(BO) + + MUL c11, b1, c11 +#endif + +#ifdef LN + daddiu CO1, CO1, -1 * SIZE +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) +#endif + + ST c11, 0 * SIZE(CO1) + +#ifndef LN + daddiu CO1, CO1, 1 * SIZE +#endif + +#ifdef RT + dsll TEMP, K, BASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll TEMP, TEMP, 0 + BASE_SHIFT + daddu AO, AO, TEMP + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 1 +#endif + +#ifdef LN + daddiu KK, KK, -1 +#endif + .align 3 + +.L80: + dsra I, M, 1 + blez I, .L89 + NOP + +.L71: +#if defined(LT) || defined(RN) + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a5, 4 * SIZE(AO) + + LD b1, 0 * SIZE(B) + MOV c12, c11 + LD b2, 1 * SIZE(B) + MOV c22, c11 + LD b3, 2 * SIZE(B) + LD b5, 4 * SIZE(B) + dsra L, KK, 2 + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + blez L, .L75 + move BO, B +#else +#ifdef LN + dsll TEMP, K, 1 + BASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll L, KK, 1 + BASE_SHIFT + dsll TEMP, KK, 0 + BASE_SHIFT + + daddu AO, AORIG, L + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a5, 4 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + MOV c12, c11 + LD b2, 1 * SIZE(BO) + MOV c22, c11 + LD b3, 2 * SIZE(BO) + LD b5, 4 * SIZE(BO) + dsra L, TEMP, 2 + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + + blez L, .L75 + NOP +#endif + .align 3 + +.L72: + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + MADD c11, c11, 
a1, b1 + MADD c12, c12, a2, b1 + + LD a1, 2 * SIZE(AO) + LD a2, 3 * SIZE(AO) + LD b1, 1 * SIZE(BO) + + MADD c11, c11, a1, b1 + MADD c12, c12, a2, b1 + + LD a1, 4 * SIZE(AO) + LD a2, 5 * SIZE(AO) + LD b1, 2 * SIZE(BO) + + MADD c11, c11, a1, b1 + MADD c12, c12, a2, b1 + + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + LD b1, 3 * SIZE(BO) + + MADD c11, c11, a1, b1 + MADD c12, c12, a2, b1 + + daddiu L, L, -1 + daddiu AO, AO, 8 * SIZE + bgtz L, .L72 + daddiu BO, BO, 4 * SIZE + .align 3 + +.L75: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L78 + NOP + .align 3 + +.L76: + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + MADD c11, c11, a1, b1 + MADD c12, c12, a2, b1 + + daddiu L, L, -1 + daddiu AO, AO, 2 * SIZE + bgtz L, .L76 + daddiu BO, BO, 1 * SIZE + +.L78: + ADD c11, c11, c21 + ADD c12, c12, c22 + +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -2 +#else + daddiu TEMP, KK, -1 +#endif + + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 0 + BASE_SHIFT + daddu AO, AORIG, L + daddu BO, B, TEMP +#endif + + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + SUB c11, b1, c11 + SUB c12, b2, c12 +#else + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + + SUB c11, b1, c11 + SUB c12, b2, c12 +#endif + +#ifdef LN + LD b1, 3 * SIZE(AO) + LD b2, 2 * SIZE(AO) + LD b3, 0 * SIZE(AO) + + MUL c12, b1, c12 + NMSUB c11, c11, b2, c12 + MUL c11, b3, c11 +#endif + +#ifdef LT + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 3 * SIZE(AO) + + MUL c11, b1, c11 + NMSUB c12, c12, b2, c11 + MUL c12, b3, c12 +#endif + +#if defined(RN) || defined(RT) + LD b1, 0 * SIZE(BO) + + MUL c11, b1, c11 + MUL c12, b1, c12 +#endif + +#ifdef LN + daddiu CO1, CO1, -2 * SIZE +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) + ST c12, 1 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) + ST c12, 1 * SIZE(AO) +#endif + + ST c11, 0 * SIZE(CO1) + ST c12, 1 * SIZE(CO1) + +#ifndef LN + daddiu CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + dsll TEMP, K, 1 + BASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 0 + BASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 2 +#endif + +#ifdef LN + daddiu KK, KK, -2 +#endif + + daddiu I, I, -1 + + bgtz I, .L71 + NOP + .align 3 + + +.L89: +#ifdef LN + dsll TEMP, K, BASE_SHIFT + daddu B, B, TEMP +#endif + +#if defined(LT) || defined(RN) + move B, BO +#endif + +#ifdef RN + daddiu KK, KK, 1 +#endif + +#ifdef RT + daddiu KK, KK, -1 +#endif + .align 3 + + +.L999: + LDARG $16, 0($sp) + LDARG $17, 8($sp) + LDARG $18, 16($sp) + LDARG $19, 24($sp) + LDARG $20, 32($sp) + LDARG $21, 40($sp) + ldc1 $f24, 48($sp) + ldc1 $f25, 56($sp) + ldc1 $f26, 64($sp) + ldc1 $f27, 72($sp) + ldc1 $f28, 80($sp) + + LDARG $22, 88($sp) + LDARG $23, 96($sp) + LDARG $24, 104($sp) + LDARG $25, 112($sp) + +#ifndef __64BIT__ + ldc1 $f20,112($sp) + ldc1 $f21,120($sp) + ldc1 $f22,128($sp) + ldc1 $f23,136($sp) +#endif + + j $31 + daddiu $sp, $sp, 144 + + EPILOGUE diff --git a/kernel/mips64/trsm_kernel_LT.S b/kernel/mips64/trsm_kernel_LT.S new file mode 100644 index 0000000000..824e0457ba --- /dev/null +++ b/kernel/mips64/trsm_kernel_LT.S @@ -0,0 +1,3527 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M $4 +#define N $5 +#define K $6 +#define A $8 +#define B $9 +#define C $10 +#define LDC $11 + +#define AO $12 +#define BO $13 + +#define I $2 +#define J $3 +#define L $7 + +#define CO1 $14 +#define CO2 $15 +#define CO3 $16 +#define CO4 $17 +#define CO5 $18 +#define CO6 $19 +#define CO7 $20 +#define CO8 $21 + +#define OFFSET $22 +#define KK $23 +#define TEMP $24 +#define AORIG $25 + +#define a1 $f0 +#define a2 $f1 +#define a3 $f27 +#define a4 $f28 + +#define b1 $f2 +#define b2 $f3 +#define b3 $f4 +#define b4 $f5 +#define b5 $f6 +#define b6 $f7 +#define b7 $f8 +#define b8 $f9 + +#define a5 b8 + +#define c11 $f10 +#define c12 $f11 +#define c21 $f12 +#define c22 $f13 +#define c31 $f14 +#define c32 $f16 +#define c41 $f17 +#define c42 $f18 +#define c51 $f19 +#define c52 $f20 +#define c61 $f21 +#define c62 $f22 +#define c71 $f23 +#define c72 $f24 +#define c81 $f25 +#define c82 $f26 + +#define ALPHA $f15 + + PROLOGUE + + daddiu $sp, $sp, -144 + + SDARG $16, 0($sp) + SDARG $17, 8($sp) + SDARG $18, 16($sp) + SDARG $19, 24($sp) + SDARG $20, 32($sp) + SDARG $21, 40($sp) + sdc1 $f24, 48($sp) + sdc1 $f25, 56($sp) + sdc1 $f26, 64($sp) + sdc1 $f27, 72($sp) + sdc1 $f28, 80($sp) + + SDARG $22, 88($sp) + SDARG $23, 96($sp) + SDARG $24, 104($sp) + SDARG $25, 112($sp) + +#ifndef __64BIT__ + sdc1 $f20,112($sp) + sdc1 $f21,120($sp) + sdc1 $f22,128($sp) + sdc1 $f23,136($sp) +#endif + + LDARG OFFSET, 144($sp) + + dsll LDC, LDC, BASE_SHIFT + +#ifdef LN + mult M, K + mflo TEMP + + dsll TEMP, TEMP, BASE_SHIFT + daddu A, A, TEMP + + dsll TEMP, M, BASE_SHIFT + daddu C, C, TEMP +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mult N, K + mflo TEMP + + dsll TEMP, TEMP, BASE_SHIFT + daddu B, B, TEMP + + mult N, LDC 
+ mflo TEMP + daddu C, C, TEMP + + dsubu KK, N, OFFSET +#endif + + dsra J, N, 3 + blez J, .L30 + nop + +.L10: +#ifdef RT + dsll TEMP, K, 3 + BASE_SHIFT + dsubu B, B, TEMP + + dsll TEMP, LDC, 3 + dsubu C, C, TEMP +#endif + + move CO1, C + MTC $0, c11 + daddu CO2, C, LDC + daddu CO3, CO2, LDC + daddiu J, J, -1 + daddu CO4, CO3, LDC + MOV c21, c11 + daddu CO5, CO4, LDC + MOV c31, c11 + daddu CO6, CO5, LDC + MOV c41, c11 + daddu CO7, CO6, LDC + MOV c51, c11 + daddu CO8, CO7, LDC + dsra I, M, 1 + +#ifdef LN + daddu KK, M, OFFSET +#endif + +#ifdef LT + move KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + daddu C, CO8, LDC +#endif + + blez I, .L20 + MOV c61, c11 + +.L11: +#if defined(LT) || defined(RN) + LD a1, 0 * SIZE(AO) + MOV c71, c11 + LD b1, 0 * SIZE(B) + MOV c81, c11 + + LD a3, 4 * SIZE(AO) + MOV c12, c11 + LD b2, 1 * SIZE(B) + MOV c22, c11 + + dsra L, KK, 2 + MOV c32, c11 + LD b3, 2 * SIZE(B) + MOV c42, c11 + + LD b4, 3 * SIZE(B) + MOV c52, c11 + LD b5, 4 * SIZE(B) + MOV c62, c11 + + LD b6, 8 * SIZE(B) + MOV c72, c11 + LD b7, 12 * SIZE(B) + MOV c82, c11 + + blez L, .L15 + move BO, B +#else + +#ifdef LN + dsll TEMP, K, 1 + BASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll L, KK, 1 + BASE_SHIFT + dsll TEMP, KK, 3 + BASE_SHIFT + + daddu AO, AORIG, L + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) + MOV c71, c11 + LD b1, 0 * SIZE(BO) + MOV c81, c11 + + LD a3, 4 * SIZE(AO) + MOV c12, c11 + LD b2, 1 * SIZE(BO) + MOV c22, c11 + + dsra L, TEMP, 2 + MOV c32, c11 + LD b3, 2 * SIZE(BO) + MOV c42, c11 + + LD b4, 3 * SIZE(BO) + MOV c52, c11 + LD b5, 4 * SIZE(BO) + MOV c62, c11 + + LD b6, 8 * SIZE(BO) + MOV c72, c11 + LD b7, 12 * SIZE(BO) + MOV c82, c11 + + blez L, .L15 + NOP +#endif + + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + daddiu L, L, -1 + MADD c31, c31, a1, b3 + blez L, .L13 + MADD c41, c41, a1, b4 + NOP + .align 3 + +.L12: + MADD c12, c12, a2, b1 + LD b1, 16 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD c51, c51, a1, b5 + NOP + MADD c61, c61, a1, b2 + LD a4, 2 * SIZE(AO) + MADD c71, c71, a1, b3 + NOP + MADD c81, c81, a1, b4 + LD a1, 8 * SIZE(AO) + + MADD c52, c52, a2, b5 + LD b5, 20 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 9 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 10 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 11 * SIZE(BO) + + MADD c11, c11, a4, b6 + LD a2, 3 * SIZE(AO) + MADD c21, c21, a4, b2 + NOP + MADD c31, c31, a4, b3 + NOP + MADD c41, c41, a4, b4 + NOP + + MADD c12, c12, a2, b6 + LD b6, 24 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 13 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 14 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD c51, c51, a4, b7 + NOP + MADD c61, c61, a4, b2 + NOP + MADD c71, c71, a4, b3 + NOP + MADD c81, c81, a4, b4 + NOP + + MADD c52, c52, a2, b7 + LD b7, 28 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 17 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 18 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 19 * SIZE(BO) + + MADD c11, c11, a3, b1 + LD a2, 5 * SIZE(AO) + MADD c21, c21, a3, b2 + NOP + MADD c31, c31, a3, b3 + NOP + MADD c41, c41, a3, b4 + NOP + + MADD c12, c12, a2, b1 + LD b1, 32 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 21 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 22 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 23 * SIZE(BO) + + MADD c51, c51, a3, b5 + NOP + MADD c61, c61, a3, b2 + LD a4, 6 * SIZE(AO) + MADD c71, 
c71, a3, b3 + NOP + MADD c81, c81, a3, b4 + LD a3, 12 * SIZE(AO) + + MADD c52, c52, a2, b5 + LD b5, 36 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 25 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 26 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 27 * SIZE(BO) + + MADD c11, c11, a4, b6 + LD a2, 7 * SIZE(AO) + MADD c21, c21, a4, b2 + NOP + MADD c31, c31, a4, b3 + NOP + MADD c41, c41, a4, b4 + daddiu L, L, -1 + + MADD c12, c12, a2, b6 + LD b6, 40 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 29 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 30 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 31 * SIZE(BO) + + MADD c51, c51, a4, b7 + daddiu BO, BO, 32 * SIZE + MADD c61, c61, a4, b2 + daddiu AO, AO, 8 * SIZE + MADD c71, c71, a4, b3 + NOP + MADD c81, c81, a4, b4 + NOP + + MADD c52, c52, a2, b7 + LD b7, 12 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 1 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 2 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 3 * SIZE(BO) + + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + NOP + MADD c31, c31, a1, b3 + bgtz L, .L12 + MADD c41, c41, a1, b4 + NOP + .align 3 + +.L13: + MADD c12, c12, a2, b1 + LD b1, 16 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD c51, c51, a1, b5 + NOP + MADD c61, c61, a1, b2 + LD a4, 2 * SIZE(AO) + MADD c71, c71, a1, b3 + NOP + MADD c81, c81, a1, b4 + LD a1, 8 * SIZE(AO) + + MADD c52, c52, a2, b5 + LD b5, 20 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 9 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 10 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 11 * SIZE(BO) + + MADD c11, c11, a4, b6 + LD a2, 3 * SIZE(AO) + MADD c21, c21, a4, b2 + NOP + MADD c31, c31, a4, b3 + NOP + MADD c41, c41, a4, b4 + NOP + + MADD c12, c12, a2, b6 + LD b6, 24 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 13 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 14 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD c51, c51, a4, b7 + NOP + MADD c61, c61, a4, b2 + NOP + MADD c71, c71, a4, b3 + NOP + MADD c81, c81, a4, b4 + NOP + + MADD c52, c52, a2, b7 + LD b7, 28 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 17 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 18 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 19 * SIZE(BO) + + MADD c11, c11, a3, b1 + LD a2, 5 * SIZE(AO) + MADD c21, c21, a3, b2 + NOP + MADD c31, c31, a3, b3 + NOP + MADD c41, c41, a3, b4 + NOP + + MADD c12, c12, a2, b1 + LD b1, 32 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 21 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 22 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 23 * SIZE(BO) + + MADD c51, c51, a3, b5 + NOP + MADD c61, c61, a3, b2 + LD a4, 6 * SIZE(AO) + MADD c71, c71, a3, b3 + NOP + MADD c81, c81, a3, b4 + LD a3, 12 * SIZE(AO) + + MADD c52, c52, a2, b5 + LD b5, 36 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 25 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 26 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 27 * SIZE(BO) + + MADD c11, c11, a4, b6 + LD a2, 7 * SIZE(AO) + MADD c21, c21, a4, b2 + NOP + MADD c31, c31, a4, b3 + NOP + MADD c41, c41, a4, b4 + NOP + + MADD c12, c12, a2, b6 + LD b6, 40 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 29 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 30 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 31 * SIZE(BO) + + MADD c51, c51, a4, b7 + daddiu BO, BO, 32 * SIZE + MADD c61, c61, a4, b2 + daddiu AO, AO, 8 * SIZE + MADD c71, c71, a4, b3 + NOP + MADD c81, c81, a4, b4 + NOP + + MADD c52, c52, a2, b7 + LD b7, 12 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 1 * SIZE(BO) + MADD c72, 
c72, a2, b3 + LD b3, 2 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 3 * SIZE(BO) + .align 3 + +.L15: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + blez L, .L18 + NOP + .align 3 + +.L16: + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + NOP + MADD c31, c31, a1, b3 + NOP + MADD c41, c41, a1, b4 + NOP + + MADD c12, c12, a2, b1 + LD b1, 8 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD c51, c51, a1, b5 + daddiu L, L, -1 + MADD c61, c61, a1, b2 + daddiu AO, AO, 2 * SIZE + MADD c71, c71, a1, b3 + daddiu BO, BO, 8 * SIZE + MADD c81, c81, a1, b4 + LD a1, 0 * SIZE(AO) + + MADD c52, c52, a2, b5 + LD b5, 4 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 1 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 2 * SIZE(BO) + MADD c82, c82, a2, b4 + bgtz L, .L16 + LD b4, 3 * SIZE(BO) + +.L18: +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -2 +#else + daddiu TEMP, KK, -8 +#endif + + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 3 + BASE_SHIFT + daddu AO, AORIG, L + daddu BO, B, TEMP +#endif + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + SUB c11, b1, c11 + LD b5, 4 * SIZE(BO) + SUB c21, b2, c21 + LD b6, 5 * SIZE(BO) + SUB c31, b3, c31 + LD b7, 6 * SIZE(BO) + SUB c41, b4, c41 + LD b8, 7 * SIZE(BO) + + SUB c51, b5, c51 + LD b1, 8 * SIZE(BO) + SUB c61, b6, c61 + LD b2, 9 * SIZE(BO) + SUB c71, b7, c71 + LD b3, 10 * SIZE(BO) + SUB c81, b8, c81 + LD b4, 11 * SIZE(BO) + + SUB c12, b1, c12 + LD b5, 12 * SIZE(BO) + SUB c22, b2, c22 + LD b6, 13 * SIZE(BO) + SUB c32, b3, c32 + LD b7, 14 * SIZE(BO) + SUB c42, b4, c42 + LD b8, 15 * SIZE(BO) + + SUB c52, b5, c52 +#ifdef LN + LD b1, 3 * SIZE(AO) +#else + LD b1, 0 * SIZE(AO) +#endif + SUB c62, b6, c62 + SUB c72, b7, c72 + SUB c82, b8, c82 +#else + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 2 * SIZE(AO) + LD b4, 3 * SIZE(AO) + + SUB c11, b1, c11 + LD b5, 4 * SIZE(AO) + SUB c12, b2, c12 + LD b6, 5 * SIZE(AO) + SUB c21, b3, c21 + LD b7, 6 * SIZE(AO) + SUB c22, b4, c22 + LD b8, 7 * SIZE(AO) + + SUB c31, b5, c31 + LD b1, 8 * SIZE(AO) + SUB c32, b6, c32 + LD b2, 9 * SIZE(AO) + SUB c41, b7, c41 + LD b3, 10 * SIZE(AO) + SUB c42, b8, c42 + LD b4, 11 * SIZE(AO) + + LD b5, 12 * SIZE(AO) + SUB c51, b1, c51 + LD b6, 13 * SIZE(AO) + SUB c52, b2, c52 + LD b7, 14 * SIZE(AO) + SUB c61, b3, c61 + LD b8, 15 * SIZE(AO) + SUB c62, b4, c62 + + SUB c71, b5, c71 + SUB c72, b6, c72 + SUB c81, b7, c81 + SUB c82, b8, c82 +#endif + +#ifdef LN + MUL c12, b1, c12 + LD b2, 2 * SIZE(AO) + MUL c22, b1, c22 + MUL c32, b1, c32 + MUL c42, b1, c42 + MUL c52, b1, c52 + MUL c62, b1, c62 + MUL c72, b1, c72 + MUL c82, b1, c82 + + NMSUB c11, c11, b2, c12 + LD b3, 0 * SIZE(AO) + NMSUB c21, c21, b2, c22 + NMSUB c31, c31, b2, c32 + NMSUB c41, c41, b2, c42 + NMSUB c51, c51, b2, c52 + NMSUB c61, c61, b2, c62 + NMSUB c71, c71, b2, c72 + NMSUB c81, c81, b2, c82 + + MUL c11, b3, c11 + daddiu CO1, CO1, -2 * SIZE + MUL c21, b3, c21 + daddiu CO2, CO2, -2 * SIZE + MUL c31, b3, c31 + daddiu CO3, CO3, -2 * SIZE + MUL c41, b3, c41 + daddiu CO4, CO4, -2 * SIZE + MUL c51, b3, c51 + daddiu CO5, CO5, -2 * SIZE + MUL c61, b3, c61 + daddiu CO6, CO6, -2 * SIZE + MUL c71, b3, c71 + daddiu CO7, CO7, -2 * SIZE + MUL c81, b3, c81 + daddiu CO8, CO8, -2 * SIZE +#endif + +#ifdef LT + MUL c11, b1, c11 + LD b2, 1 * SIZE(AO) + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + MUL 
c51, b1, c51 + MUL c61, b1, c61 + MUL c71, b1, c71 + MUL c81, b1, c81 + + NMSUB c12, c12, b2, c11 + LD b3, 3 * SIZE(AO) + NMSUB c22, c22, b2, c21 + NMSUB c32, c32, b2, c31 + NMSUB c42, c42, b2, c41 + NMSUB c52, c52, b2, c51 + NMSUB c62, c62, b2, c61 + NMSUB c72, c72, b2, c71 + NMSUB c82, c82, b2, c81 + + MUL c12, b3, c12 + MUL c22, b3, c22 + MUL c32, b3, c32 + MUL c42, b3, c42 + MUL c52, b3, c52 + MUL c62, b3, c62 + MUL c72, b3, c72 + MUL c82, b3, c82 +#endif + +#ifdef RN + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + MUL c11, b1, c11 + MUL c12, b1, c12 + LD b5, 4 * SIZE(BO) + + NMSUB c21, c21, b2, c11 + NMSUB c22, c22, b2, c12 + LD b6, 5 * SIZE(BO) + NMSUB c31, c31, b3, c11 + NMSUB c32, c32, b3, c12 + LD b7, 6 * SIZE(BO) + NMSUB c41, c41, b4, c11 + NMSUB c42, c42, b4, c12 + LD b8, 7 * SIZE(BO) + + NMSUB c51, c51, b5, c11 + NMSUB c52, c52, b5, c12 + LD b2, 9 * SIZE(BO) + NMSUB c61, c61, b6, c11 + NMSUB c62, c62, b6, c12 + LD b3, 10 * SIZE(BO) + NMSUB c71, c71, b7, c11 + NMSUB c72, c72, b7, c12 + LD b4, 11 * SIZE(BO) + NMSUB c81, c81, b8, c11 + NMSUB c82, c82, b8, c12 + LD b5, 12 * SIZE(BO) + + MUL c21, b2, c21 + MUL c22, b2, c22 + LD b6, 13 * SIZE(BO) + + NMSUB c31, c31, b3, c21 + NMSUB c32, c32, b3, c22 + LD b7, 14 * SIZE(BO) + NMSUB c41, c41, b4, c21 + NMSUB c42, c42, b4, c22 + LD b8, 15 * SIZE(BO) + NMSUB c51, c51, b5, c21 + NMSUB c52, c52, b5, c22 + LD b3, 18 * SIZE(BO) + NMSUB c61, c61, b6, c21 + NMSUB c62, c62, b6, c22 + LD b4, 19 * SIZE(BO) + NMSUB c71, c71, b7, c21 + NMSUB c72, c72, b7, c22 + LD b5, 20 * SIZE(BO) + NMSUB c81, c81, b8, c21 + NMSUB c82, c82, b8, c22 + LD b6, 21 * SIZE(BO) + + MUL c31, b3, c31 + MUL c32, b3, c32 + LD b7, 22 * SIZE(BO) + + NMSUB c41, c41, b4, c31 + NMSUB c42, c42, b4, c32 + LD b8, 23 * SIZE(BO) + NMSUB c51, c51, b5, c31 + NMSUB c52, c52, b5, c32 + LD b4, 27 * SIZE(BO) + NMSUB c61, c61, b6, c31 + NMSUB c62, c62, b6, c32 + LD b5, 28 * SIZE(BO) + NMSUB c71, c71, b7, c31 + NMSUB c72, c72, b7, c32 + LD b6, 29 * SIZE(BO) + NMSUB c81, c81, b8, c31 + NMSUB c82, c82, b8, c32 + LD b7, 30 * SIZE(BO) + + MUL c41, b4, c41 + MUL c42, b4, c42 + LD b8, 31 * SIZE(BO) + + NMSUB c51, c51, b5, c41 + NMSUB c52, c52, b5, c42 + LD b5, 36 * SIZE(BO) + NMSUB c61, c61, b6, c41 + NMSUB c62, c62, b6, c42 + LD b6, 37 * SIZE(BO) + NMSUB c71, c71, b7, c41 + NMSUB c72, c72, b7, c42 + LD b7, 38 * SIZE(BO) + NMSUB c81, c81, b8, c41 + NMSUB c82, c82, b8, c42 + LD b8, 39 * SIZE(BO) + + MUL c51, b5, c51 + MUL c52, b5, c52 + + NMSUB c61, c61, b6, c51 + NMSUB c62, c62, b6, c52 + LD b6, 45 * SIZE(BO) + NMSUB c71, c71, b7, c51 + NMSUB c72, c72, b7, c52 + LD b7, 46 * SIZE(BO) + NMSUB c81, c81, b8, c51 + NMSUB c82, c82, b8, c52 + LD b8, 47 * SIZE(BO) + + MUL c61, b6, c61 + MUL c62, b6, c62 + + NMSUB c71, c71, b7, c61 + NMSUB c72, c72, b7, c62 + LD b7, 54 * SIZE(BO) + NMSUB c81, c81, b8, c61 + NMSUB c82, c82, b8, c62 + LD b8, 55 * SIZE(BO) + + MUL c71, b7, c71 + MUL c72, b7, c72 + + NMSUB c81, c81, b8, c71 + NMSUB c82, c82, b8, c72 + LD b8, 63 * SIZE(BO) + + MUL c81, b8, c81 + MUL c82, b8, c82 +#endif + +#ifdef RT + LD b1, 63 * SIZE(BO) + LD b2, 62 * SIZE(BO) + LD b3, 61 * SIZE(BO) + LD b4, 60 * SIZE(BO) + + MUL c81, b1, c81 + MUL c82, b1, c82 + LD b5, 59 * SIZE(BO) + + NMSUB c71, c71, b2, c81 + NMSUB c72, c72, b2, c82 + LD b6, 58 * SIZE(BO) + NMSUB c61, c61, b3, c81 + NMSUB c62, c62, b3, c82 + LD b7, 57 * SIZE(BO) + NMSUB c51, c51, b4, c81 + NMSUB c52, c52, b4, c82 + LD b8, 56 * SIZE(BO) + + NMSUB c41, c41, b5, c81 + NMSUB c42, c42, b5, c82 + LD b2, 
54 * SIZE(BO) + NMSUB c31, c31, b6, c81 + NMSUB c32, c32, b6, c82 + LD b3, 53 * SIZE(BO) + NMSUB c21, c21, b7, c81 + NMSUB c22, c22, b7, c82 + LD b4, 52 * SIZE(BO) + NMSUB c11, c11, b8, c81 + NMSUB c12, c12, b8, c82 + LD b5, 51 * SIZE(BO) + + MUL c71, b2, c71 + MUL c72, b2, c72 + LD b6, 50 * SIZE(BO) + + NMSUB c61, c61, b3, c71 + NMSUB c62, c62, b3, c72 + LD b7, 49 * SIZE(BO) + NMSUB c51, c51, b4, c71 + NMSUB c52, c52, b4, c72 + LD b8, 48 * SIZE(BO) + NMSUB c41, c41, b5, c71 + NMSUB c42, c42, b5, c72 + LD b3, 45 * SIZE(BO) + NMSUB c31, c31, b6, c71 + NMSUB c32, c32, b6, c72 + LD b4, 44 * SIZE(BO) + NMSUB c21, c21, b7, c71 + NMSUB c22, c22, b7, c72 + LD b5, 43 * SIZE(BO) + NMSUB c11, c11, b8, c71 + NMSUB c12, c12, b8, c72 + LD b6, 42 * SIZE(BO) + + MUL c61, b3, c61 + MUL c62, b3, c62 + LD b7, 41 * SIZE(BO) + + NMSUB c51, c51, b4, c61 + NMSUB c52, c52, b4, c62 + LD b8, 40 * SIZE(BO) + NMSUB c41, c41, b5, c61 + NMSUB c42, c42, b5, c62 + LD b4, 36 * SIZE(BO) + NMSUB c31, c31, b6, c61 + NMSUB c32, c32, b6, c62 + LD b5, 35 * SIZE(BO) + NMSUB c21, c21, b7, c61 + NMSUB c22, c22, b7, c62 + LD b6, 34 * SIZE(BO) + NMSUB c11, c11, b8, c61 + NMSUB c12, c12, b8, c62 + LD b7, 33 * SIZE(BO) + + MUL c51, b4, c51 + MUL c52, b4, c52 + LD b8, 32 * SIZE(BO) + + NMSUB c41, c41, b5, c51 + NMSUB c42, c42, b5, c52 + LD b5, 27 * SIZE(BO) + NMSUB c31, c31, b6, c51 + NMSUB c32, c32, b6, c52 + LD b6, 26 * SIZE(BO) + NMSUB c21, c21, b7, c51 + NMSUB c22, c22, b7, c52 + LD b7, 25 * SIZE(BO) + NMSUB c11, c11, b8, c51 + NMSUB c12, c12, b8, c52 + LD b8, 24 * SIZE(BO) + + MUL c41, b5, c41 + MUL c42, b5, c42 + + NMSUB c31, c31, b6, c41 + NMSUB c32, c32, b6, c42 + LD b6, 18 * SIZE(BO) + NMSUB c21, c21, b7, c41 + NMSUB c22, c22, b7, c42 + LD b7, 17 * SIZE(BO) + NMSUB c11, c11, b8, c41 + NMSUB c12, c12, b8, c42 + LD b8, 16 * SIZE(BO) + + MUL c31, b6, c31 + MUL c32, b6, c32 + + NMSUB c21, c21, b7, c31 + NMSUB c22, c22, b7, c32 + LD b7, 9 * SIZE(BO) + NMSUB c11, c11, b8, c31 + NMSUB c12, c12, b8, c32 + LD b8, 8 * SIZE(BO) + + MUL c21, b7, c21 + MUL c22, b7, c22 + + NMSUB c11, c11, b8, c21 + NMSUB c12, c12, b8, c22 + LD b8, 0 * SIZE(BO) + + MUL c11, b8, c11 + MUL c12, b8, c12 +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) + ST c21, 1 * SIZE(BO) + ST c31, 2 * SIZE(BO) + ST c41, 3 * SIZE(BO) + ST c51, 4 * SIZE(BO) + ST c61, 5 * SIZE(BO) + ST c71, 6 * SIZE(BO) + ST c81, 7 * SIZE(BO) + + ST c12, 8 * SIZE(BO) + ST c22, 9 * SIZE(BO) + ST c32, 10 * SIZE(BO) + ST c42, 11 * SIZE(BO) + ST c52, 12 * SIZE(BO) + ST c62, 13 * SIZE(BO) + ST c72, 14 * SIZE(BO) + ST c82, 15 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) + ST c12, 1 * SIZE(AO) + ST c21, 2 * SIZE(AO) + ST c22, 3 * SIZE(AO) + ST c31, 4 * SIZE(AO) + ST c32, 5 * SIZE(AO) + ST c41, 6 * SIZE(AO) + ST c42, 7 * SIZE(AO) + + ST c51, 8 * SIZE(AO) + ST c52, 9 * SIZE(AO) + ST c61, 10 * SIZE(AO) + ST c62, 11 * SIZE(AO) + ST c71, 12 * SIZE(AO) + ST c72, 13 * SIZE(AO) + ST c81, 14 * SIZE(AO) + ST c82, 15 * SIZE(AO) +#endif + + ST c11, 0 * SIZE(CO1) + ST c12, 1 * SIZE(CO1) + ST c21, 0 * SIZE(CO2) + ST c22, 1 * SIZE(CO2) + ST c31, 0 * SIZE(CO3) + ST c32, 1 * SIZE(CO3) + ST c41, 0 * SIZE(CO4) + ST c42, 1 * SIZE(CO4) + ST c51, 0 * SIZE(CO5) + ST c52, 1 * SIZE(CO5) + ST c61, 0 * SIZE(CO6) + ST c62, 1 * SIZE(CO6) + ST c71, 0 * SIZE(CO7) + ST c72, 1 * SIZE(CO7) + ST c81, 0 * SIZE(CO8) + ST c82, 1 * SIZE(CO8) + + MTC $0, a1 + +#ifndef LN + daddiu CO1, CO1, 2 * SIZE + daddiu CO2, CO2, 2 * SIZE + daddiu CO3, CO3, 2 * SIZE + daddiu CO4, CO4, 2 * SIZE + daddiu CO5, CO5, 2 * SIZE + daddiu CO6, CO6, 
2 * SIZE + daddiu CO7, CO7, 2 * SIZE + daddiu CO8, CO8, 2 * SIZE +#endif + + MOV c11, a1 + MOV c21, a1 + +#ifdef RT + dsll TEMP, K, 1 + BASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + + MOV c31, a1 + MOV c41, a1 + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 3 + BASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 2 +#endif + +#ifdef LN + daddiu KK, KK, -2 +#endif + + daddiu I, I, -1 + MOV c51, a1 + + bgtz I, .L11 + MOV c61, a1 + .align 3 + +.L20: + andi I, M, 1 + MOV c61, c11 + blez I, .L29 + MOV c71, c11 + +#if defined(LT) || defined(RN) + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(B) + LD b2, 1 * SIZE(B) + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + LD b5, 4 * SIZE(B) + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + dsra L, KK, 2 + MOV c81, c11 + + blez L, .L25 + move BO, B +#else + +#ifdef LN + dsll TEMP, K, 0 + BASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll L, KK, 0 + BASE_SHIFT + dsll TEMP, KK, 3 + BASE_SHIFT + + daddu AO, AORIG, L + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + + dsra L, TEMP, 2 + MOV c81, c11 + + blez L, .L25 + NOP +#endif + .align 3 + +.L22: + MADD c11, c11, a1, b1 + LD b1, 16 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 5 * SIZE(BO) + MADD c31, c31, a1, b3 + LD b3, 6 * SIZE(BO) + MADD c41, c41, a1, b4 + LD b4, 7 * SIZE(BO) + + MADD c51, c51, a1, b5 + LD b5, 20 * SIZE(BO) + MADD c61, c61, a1, b2 + LD b2, 9 * SIZE(BO) + MADD c71, c71, a1, b3 + LD b3, 10 * SIZE(BO) + MADD c81, c81, a1, b4 + LD b4, 11 * SIZE(BO) + + LD a1, 4 * SIZE(AO) + daddiu L, L, -1 + + MADD c11, c11, a2, b6 + LD b6, 24 * SIZE(BO) + MADD c21, c21, a2, b2 + LD b2, 13 * SIZE(BO) + MADD c31, c31, a2, b3 + LD b3, 14 * SIZE(BO) + MADD c41, c41, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD c51, c51, a2, b7 + LD b7, 28 * SIZE(BO) + MADD c61, c61, a2, b2 + LD b2, 17 * SIZE(BO) + MADD c71, c71, a2, b3 + LD b3, 18 * SIZE(BO) + MADD c81, c81, a2, b4 + LD b4, 19 * SIZE(BO) + + LD a2, 5 * SIZE(AO) + daddiu AO, AO, 4 * SIZE + + MADD c11, c11, a3, b1 + LD b1, 32 * SIZE(BO) + MADD c21, c21, a3, b2 + LD b2, 21 * SIZE(BO) + MADD c31, c31, a3, b3 + LD b3, 22 * SIZE(BO) + MADD c41, c41, a3, b4 + LD b4, 23 * SIZE(BO) + + MADD c51, c51, a3, b5 + LD b5, 36 * SIZE(BO) + MADD c61, c61, a3, b2 + LD b2, 25 * SIZE(BO) + MADD c71, c71, a3, b3 + LD b3, 26 * SIZE(BO) + MADD c81, c81, a3, b4 + LD b4, 27 * SIZE(BO) + + LD a3, 2 * SIZE(AO) + daddiu BO, BO, 32 * SIZE + + MADD c11, c11, a4, b6 + LD b6, 8 * SIZE(BO) + MADD c21, c21, a4, b2 + LD b2, -3 * SIZE(BO) + MADD c31, c31, a4, b3 + LD b3, -2 * SIZE(BO) + MADD c41, c41, a4, b4 + LD b4, -1 * SIZE(BO) + + MADD c51, c51, a4, b7 + LD b7, 12 * SIZE(BO) + MADD c61, c61, a4, b2 + LD b2, 1 * SIZE(BO) + MADD c71, c71, a4, b3 + LD b3, 2 * SIZE(BO) + MADD c81, c81, a4, b4 + LD b4, 3 * SIZE(BO) + bgtz L, .L22 + LD a4, 3 * SIZE(AO) + .align 3 + +.L25: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L28 + NOP + .align 3 + +.L26: + MADD c11, c11, a1, b1 + LD b1, 8 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 5 * SIZE(BO) + MADD c31, c31, a1, b3 + LD b3, 6 * SIZE(BO) + MADD c41, c41, a1, b4 + LD b4, 7 * SIZE(BO) + + daddiu L, L, -1 + MOV a2, a2 + daddiu AO, 
AO, 1 * SIZE + daddiu BO, BO, 8 * SIZE + + MADD c51, c51, a1, b5 + LD b5, 4 * SIZE(BO) + MADD c61, c61, a1, b2 + LD b2, 1 * SIZE(BO) + MADD c71, c71, a1, b3 + LD b3, 2 * SIZE(BO) + MADD c81, c81, a1, b4 + LD a1, 0 * SIZE(AO) + + bgtz L, .L26 + LD b4, 3 * SIZE(BO) + +.L28: +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -1 +#else + daddiu TEMP, KK, -8 +#endif + + dsll L, TEMP, 0 + BASE_SHIFT + dsll TEMP, TEMP, 3 + BASE_SHIFT + daddu AO, AORIG, L + daddu BO, B, TEMP +#endif + + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c51, b5, c51 + SUB c61, b6, c61 + SUB c71, b7, c71 + SUB c81, b8, c81 +#else + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 2 * SIZE(AO) + LD b4, 3 * SIZE(AO) + LD b5, 4 * SIZE(AO) + LD b6, 5 * SIZE(AO) + LD b7, 6 * SIZE(AO) + LD b8, 7 * SIZE(AO) + + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c51, b5, c51 + SUB c61, b6, c61 + SUB c71, b7, c71 + SUB c81, b8, c81 +#endif + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(AO) + + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + MUL c51, b1, c51 + MUL c61, b1, c61 + MUL c71, b1, c71 + MUL c81, b1, c81 +#endif + +#ifdef RN + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + MUL c11, b1, c11 + + NMSUB c21, c21, b2, c11 + NMSUB c31, c31, b3, c11 + NMSUB c41, c41, b4, c11 + NMSUB c51, c51, b5, c11 + NMSUB c61, c61, b6, c11 + NMSUB c71, c71, b7, c11 + NMSUB c81, c81, b8, c11 + + LD b2, 9 * SIZE(BO) + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + LD b5, 12 * SIZE(BO) + LD b6, 13 * SIZE(BO) + LD b7, 14 * SIZE(BO) + LD b8, 15 * SIZE(BO) + + MUL c21, b2, c21 + + NMSUB c31, c31, b3, c21 + NMSUB c41, c41, b4, c21 + NMSUB c51, c51, b5, c21 + NMSUB c61, c61, b6, c21 + NMSUB c71, c71, b7, c21 + NMSUB c81, c81, b8, c21 + + LD b3, 18 * SIZE(BO) + LD b4, 19 * SIZE(BO) + LD b5, 20 * SIZE(BO) + LD b6, 21 * SIZE(BO) + LD b7, 22 * SIZE(BO) + LD b8, 23 * SIZE(BO) + + MUL c31, b3, c31 + + NMSUB c41, c41, b4, c31 + NMSUB c51, c51, b5, c31 + NMSUB c61, c61, b6, c31 + NMSUB c71, c71, b7, c31 + NMSUB c81, c81, b8, c31 + + LD b4, 27 * SIZE(BO) + LD b5, 28 * SIZE(BO) + LD b6, 29 * SIZE(BO) + LD b7, 30 * SIZE(BO) + LD b8, 31 * SIZE(BO) + + MUL c41, b4, c41 + + NMSUB c51, c51, b5, c41 + NMSUB c61, c61, b6, c41 + NMSUB c71, c71, b7, c41 + NMSUB c81, c81, b8, c41 + + LD b5, 36 * SIZE(BO) + LD b6, 37 * SIZE(BO) + LD b7, 38 * SIZE(BO) + LD b8, 39 * SIZE(BO) + + MUL c51, b5, c51 + + NMSUB c61, c61, b6, c51 + NMSUB c71, c71, b7, c51 + NMSUB c81, c81, b8, c51 + + LD b6, 45 * SIZE(BO) + LD b7, 46 * SIZE(BO) + LD b8, 47 * SIZE(BO) + + MUL c61, b6, c61 + + NMSUB c71, c71, b7, c61 + NMSUB c81, c81, b8, c61 + + LD b7, 54 * SIZE(BO) + LD b8, 55 * SIZE(BO) + + MUL c71, b7, c71 + + NMSUB c81, c81, b8, c71 + + LD b8, 63 * SIZE(BO) + + MUL c81, b8, c81 +#endif + +#ifdef RT + LD b1, 63 * SIZE(BO) + LD b2, 62 * SIZE(BO) + LD b3, 61 * SIZE(BO) + LD b4, 60 * SIZE(BO) + LD b5, 59 * SIZE(BO) + LD b6, 58 * SIZE(BO) + LD b7, 57 * SIZE(BO) + LD b8, 56 * SIZE(BO) + + MUL c81, b1, c81 + + NMSUB c71, c71, b2, c81 + NMSUB c61, c61, b3, c81 + NMSUB c51, c51, b4, c81 + NMSUB c41, c41, b5, c81 + NMSUB c31, c31, b6, c81 + NMSUB c21, c21, 
b7, c81 + NMSUB c11, c11, b8, c81 + + LD b2, 54 * SIZE(BO) + LD b3, 53 * SIZE(BO) + LD b4, 52 * SIZE(BO) + LD b5, 51 * SIZE(BO) + LD b6, 50 * SIZE(BO) + LD b7, 49 * SIZE(BO) + LD b8, 48 * SIZE(BO) + + MUL c71, b2, c71 + + NMSUB c61, c61, b3, c71 + NMSUB c51, c51, b4, c71 + NMSUB c41, c41, b5, c71 + NMSUB c31, c31, b6, c71 + NMSUB c21, c21, b7, c71 + NMSUB c11, c11, b8, c71 + + LD b3, 45 * SIZE(BO) + LD b4, 44 * SIZE(BO) + LD b5, 43 * SIZE(BO) + LD b6, 42 * SIZE(BO) + LD b7, 41 * SIZE(BO) + LD b8, 40 * SIZE(BO) + + MUL c61, b3, c61 + + NMSUB c51, c51, b4, c61 + NMSUB c41, c41, b5, c61 + NMSUB c31, c31, b6, c61 + NMSUB c21, c21, b7, c61 + NMSUB c11, c11, b8, c61 + + LD b4, 36 * SIZE(BO) + LD b5, 35 * SIZE(BO) + LD b6, 34 * SIZE(BO) + LD b7, 33 * SIZE(BO) + LD b8, 32 * SIZE(BO) + + MUL c51, b4, c51 + + NMSUB c41, c41, b5, c51 + NMSUB c31, c31, b6, c51 + NMSUB c21, c21, b7, c51 + NMSUB c11, c11, b8, c51 + + LD b5, 27 * SIZE(BO) + LD b6, 26 * SIZE(BO) + LD b7, 25 * SIZE(BO) + LD b8, 24 * SIZE(BO) + + MUL c41, b5, c41 + + NMSUB c31, c31, b6, c41 + NMSUB c21, c21, b7, c41 + NMSUB c11, c11, b8, c41 + + LD b6, 18 * SIZE(BO) + LD b7, 17 * SIZE(BO) + LD b8, 16 * SIZE(BO) + + MUL c31, b6, c31 + + NMSUB c21, c21, b7, c31 + NMSUB c11, c11, b8, c31 + + LD b7, 9 * SIZE(BO) + LD b8, 8 * SIZE(BO) + + MUL c21, b7, c21 + + NMSUB c11, c11, b8, c21 + + LD b8, 0 * SIZE(BO) + + MUL c11, b8, c11 +#endif + +#ifdef LN + daddiu CO1, CO1, -1 * SIZE + daddiu CO2, CO2, -1 * SIZE + daddiu CO3, CO3, -1 * SIZE + daddiu CO4, CO4, -1 * SIZE + daddiu CO5, CO5, -1 * SIZE + daddiu CO6, CO6, -1 * SIZE + daddiu CO7, CO7, -1 * SIZE + daddiu CO8, CO8, -1 * SIZE +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) + ST c21, 1 * SIZE(BO) + ST c31, 2 * SIZE(BO) + ST c41, 3 * SIZE(BO) + ST c51, 4 * SIZE(BO) + ST c61, 5 * SIZE(BO) + ST c71, 6 * SIZE(BO) + ST c81, 7 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) + ST c21, 1 * SIZE(AO) + ST c31, 2 * SIZE(AO) + ST c41, 3 * SIZE(AO) + ST c51, 4 * SIZE(AO) + ST c61, 5 * SIZE(AO) + ST c71, 6 * SIZE(AO) + ST c81, 7 * SIZE(AO) +#endif + + ST c11, 0 * SIZE(CO1) + ST c21, 0 * SIZE(CO2) + ST c31, 0 * SIZE(CO3) + ST c41, 0 * SIZE(CO4) + ST c51, 0 * SIZE(CO5) + ST c61, 0 * SIZE(CO6) + ST c71, 0 * SIZE(CO7) + ST c81, 0 * SIZE(CO8) + +#ifndef LN + daddiu CO1, CO1, 1 * SIZE + daddiu CO2, CO2, 1 * SIZE + daddiu CO3, CO3, 1 * SIZE + daddiu CO4, CO4, 1 * SIZE + daddiu CO5, CO5, 1 * SIZE + daddiu CO6, CO6, 1 * SIZE + daddiu CO7, CO7, 1 * SIZE + daddiu CO8, CO8, 1 * SIZE +#endif + +#ifdef RT + dsll TEMP, K, BASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll L, TEMP, 0 + BASE_SHIFT + dsll TEMP, TEMP, 3 + BASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 1 +#endif + +#ifdef LN + daddiu KK, KK, -1 +#endif + .align 3 + +.L29: +#ifdef LN + dsll TEMP, K, 3 + BASE_SHIFT + daddu B, B, TEMP +#endif + +#if defined(LT) || defined(RN) + move B, BO +#endif + +#ifdef RN + daddiu KK, KK, 8 +#endif + +#ifdef RT + daddiu KK, KK, -8 +#endif + + bgtz J, .L10 + NOP + .align 3 + +.L30: + andi J, N, 4 + blez J, .L50 + move AO, A + +#ifdef RT + dsll TEMP, K, 2 + BASE_SHIFT + dsubu B, B, TEMP + + dsll TEMP, LDC, 2 + dsubu C, C, TEMP +#endif + + move CO1, C + MTC $0, c11 + daddu CO2, C, LDC + daddu CO3, CO2, LDC + daddu CO4, CO3, LDC + MOV c21, c11 + dsra I, M, 1 + MOV c31, c11 + +#ifdef LN + daddu KK, M, OFFSET +#endif + +#ifdef LT + move KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A 
+#endif +#ifndef RT + daddu C, CO4, LDC +#endif + + blez I, .L40 + MOV c41, c11 + +.L31: +#if defined(LT) || defined(RN) + LD a1, 0 * SIZE(AO) + LD a3, 4 * SIZE(AO) + + LD b1, 0 * SIZE(B) + MOV c12, c11 + LD b2, 1 * SIZE(B) + MOV c22, c11 + LD b3, 2 * SIZE(B) + MOV c32, c11 + LD b4, 3 * SIZE(B) + MOV c42, c11 + + LD b5, 4 * SIZE(B) + dsra L, KK, 2 + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + blez L, .L35 + move BO, B +#else +#ifdef LN + dsll TEMP, K, 1 + BASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll L, KK, 1 + BASE_SHIFT + dsll TEMP, KK, 2 + BASE_SHIFT + + daddu AO, AORIG, L + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) + LD a3, 4 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + MOV c12, c11 + LD b2, 1 * SIZE(BO) + MOV c22, c11 + LD b3, 2 * SIZE(BO) + MOV c32, c11 + LD b4, 3 * SIZE(BO) + MOV c42, c11 + + LD b5, 4 * SIZE(BO) + dsra L, TEMP, 2 + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + + blez L, .L35 + NOP +#endif + .align 3 + +.L32: + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + daddiu L, L, -1 + MADD c31, c31, a1, b3 + NOP + MADD c41, c41, a1, b4 + LD a1, 2 * SIZE(AO) + + MADD c12, c12, a2, b1 + LD b1, 16 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD c11, c11, a1, b5 + LD a2, 3 * SIZE(AO) + MADD c21, c21, a1, b2 + NOP + MADD c31, c31, a1, b3 + NOP + MADD c41, c41, a1, b4 + LD a1, 8 * SIZE(AO) + + MADD c12, c12, a2, b5 + LD b5, 20 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 9 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 10 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 11 * SIZE(BO) + + MADD c11, c11, a3, b6 + LD a2, 5 * SIZE(AO) + MADD c21, c21, a3, b2 + NOP + MADD c31, c31, a3, b3 + NOP + MADD c41, c41, a3, b4 + LD a3, 6 * SIZE(AO) + + MADD c12, c12, a2, b6 + LD b6, 24 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 13 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 14 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD c11, c11, a3, b7 + LD a2, 7 * SIZE(AO) + MADD c21, c21, a3, b2 + daddiu AO, AO, 8 * SIZE + MADD c31, c31, a3, b3 + daddiu BO, BO, 16 * SIZE + MADD c41, c41, a3, b4 + LD a3, 4 * SIZE(AO) + + MADD c12, c12, a2, b7 + LD b7, 12 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 1 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 2 * SIZE(BO) + MADD c42, c42, a2, b4 + NOP + + bgtz L, .L32 + LD b4, 3 * SIZE(BO) + .align 3 + +.L35: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L38 + NOP + .align 3 + +.L36: + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + daddiu L, L, -1 + MADD c31, c31, a1, b3 + daddiu AO, AO, 2 * SIZE + MADD c41, c41, a1, b4 + LD a1, 0 * SIZE(AO) + + MADD c12, c12, a2, b1 + LD b1, 4 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + bgtz L, .L36 + daddiu BO, BO, 4 * SIZE + +.L38: +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -2 +#else + daddiu TEMP, KK, -4 +#endif + + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 2 + BASE_SHIFT + daddu AO, AORIG, L + daddu BO, B, TEMP +#endif + + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c12, b5, c12 + SUB c22, b6, c22 + SUB c32, b7, c32 + SUB 
c42, b8, c42 +#else + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 2 * SIZE(AO) + LD b4, 3 * SIZE(AO) + LD b5, 4 * SIZE(AO) + LD b6, 5 * SIZE(AO) + LD b7, 6 * SIZE(AO) + LD b8, 7 * SIZE(AO) + + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c21, b3, c21 + SUB c22, b4, c22 + SUB c31, b5, c31 + SUB c32, b6, c32 + SUB c41, b7, c41 + SUB c42, b8, c42 +#endif + +#ifdef LN + LD b1, 3 * SIZE(AO) + LD b2, 2 * SIZE(AO) + LD b3, 0 * SIZE(AO) + + MUL c12, b1, c12 + MUL c22, b1, c22 + MUL c32, b1, c32 + MUL c42, b1, c42 + + NMSUB c11, c11, b2, c12 + NMSUB c21, c21, b2, c22 + NMSUB c31, c31, b2, c32 + NMSUB c41, c41, b2, c42 + + MUL c11, b3, c11 + MUL c21, b3, c21 + MUL c31, b3, c31 + MUL c41, b3, c41 +#endif + +#ifdef LT + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 3 * SIZE(AO) + + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + + NMSUB c12, c12, b2, c11 + NMSUB c22, c22, b2, c21 + NMSUB c32, c32, b2, c31 + NMSUB c42, c42, b2, c41 + + MUL c12, b3, c12 + MUL c22, b3, c22 + MUL c32, b3, c32 + MUL c42, b3, c42 +#endif + +#ifdef RN + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + MUL c11, b1, c11 + MUL c12, b1, c12 + + NMSUB c21, c21, b2, c11 + NMSUB c22, c22, b2, c12 + NMSUB c31, c31, b3, c11 + NMSUB c32, c32, b3, c12 + NMSUB c41, c41, b4, c11 + NMSUB c42, c42, b4, c12 + + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + MUL c21, b2, c21 + MUL c22, b2, c22 + + NMSUB c31, c31, b3, c21 + NMSUB c32, c32, b3, c22 + NMSUB c41, c41, b4, c21 + NMSUB c42, c42, b4, c22 + + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + + MUL c31, b3, c31 + MUL c32, b3, c32 + + NMSUB c41, c41, b4, c31 + NMSUB c42, c42, b4, c32 + + LD b4, 15 * SIZE(BO) + + MUL c41, b4, c41 + MUL c42, b4, c42 +#endif + +#ifdef RT + LD b5, 15 * SIZE(BO) + LD b6, 14 * SIZE(BO) + LD b7, 13 * SIZE(BO) + LD b8, 12 * SIZE(BO) + + MUL c41, b5, c41 + MUL c42, b5, c42 + + NMSUB c31, c31, b6, c41 + NMSUB c32, c32, b6, c42 + NMSUB c21, c21, b7, c41 + NMSUB c22, c22, b7, c42 + NMSUB c11, c11, b8, c41 + NMSUB c12, c12, b8, c42 + + LD b6, 10 * SIZE(BO) + LD b7, 9 * SIZE(BO) + LD b8, 8 * SIZE(BO) + + MUL c31, b6, c31 + MUL c32, b6, c32 + + NMSUB c21, c21, b7, c31 + NMSUB c22, c22, b7, c32 + NMSUB c11, c11, b8, c31 + NMSUB c12, c12, b8, c32 + + LD b7, 5 * SIZE(BO) + LD b8, 4 * SIZE(BO) + + MUL c21, b7, c21 + MUL c22, b7, c22 + + NMSUB c11, c11, b8, c21 + NMSUB c12, c12, b8, c22 + + LD b8, 0 * SIZE(BO) + + MUL c11, b8, c11 + MUL c12, b8, c12 +#endif + +#ifdef LN + daddiu CO1, CO1, -2 * SIZE + daddiu CO2, CO2, -2 * SIZE + daddiu CO3, CO3, -2 * SIZE + daddiu CO4, CO4, -2 * SIZE +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) + ST c21, 1 * SIZE(BO) + ST c31, 2 * SIZE(BO) + ST c41, 3 * SIZE(BO) + ST c12, 4 * SIZE(BO) + ST c22, 5 * SIZE(BO) + ST c32, 6 * SIZE(BO) + ST c42, 7 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) + ST c12, 1 * SIZE(AO) + ST c21, 2 * SIZE(AO) + ST c22, 3 * SIZE(AO) + ST c31, 4 * SIZE(AO) + ST c32, 5 * SIZE(AO) + ST c41, 6 * SIZE(AO) + ST c42, 7 * SIZE(AO) +#endif + + ST c11, 0 * SIZE(CO1) + ST c12, 1 * SIZE(CO1) + ST c21, 0 * SIZE(CO2) + ST c22, 1 * SIZE(CO2) + ST c31, 0 * SIZE(CO3) + ST c32, 1 * SIZE(CO3) + ST c41, 0 * SIZE(CO4) + ST c42, 1 * SIZE(CO4) + +#ifndef LN + daddiu CO1, CO1, 2 * SIZE + daddiu CO2, CO2, 2 * SIZE + daddiu CO3, CO3, 2 * SIZE + daddiu CO4, CO4, 2 * SIZE +#endif + +#ifdef RT + dsll TEMP, K, 1 + BASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll L, TEMP, 1 + 
BASE_SHIFT + dsll TEMP, TEMP, 2 + BASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 2 +#endif + +#ifdef LN + daddiu KK, KK, -2 +#endif + + MTC $0, a1 + + MOV c11, a1 + MOV c21, a1 + MOV c31, a1 + + daddiu I, I, -1 + + bgtz I, .L31 + MOV c41, c11 + .align 3 + +.L40: + andi I, M, 1 + blez I, .L49 + MOV c61, c11 + +#if defined(LT) || defined(RN) + LD a1, 0 * SIZE(AO) + MOV c71, c11 + LD a2, 1 * SIZE(AO) + MOV c81, c11 + + LD b1, 0 * SIZE(B) + LD b2, 1 * SIZE(B) + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + LD b5, 4 * SIZE(B) + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + dsra L, KK, 2 + + blez L, .L45 + move BO, B +#else +#ifdef LN + dsll TEMP, K, BASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll L, KK, 0 + BASE_SHIFT + dsll TEMP, KK, 2 + BASE_SHIFT + + daddu AO, AORIG, L + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) + MOV c71, c11 + LD a2, 1 * SIZE(AO) + MOV c81, c11 + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + + dsra L, TEMP, 2 + + blez L, .L45 + NOP +#endif + .align 3 + +.L42: + MADD c11, c11, a1, b1 + LD b1, 16 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 5 * SIZE(BO) + MADD c31, c31, a1, b3 + LD b3, 6 * SIZE(BO) + MADD c41, c41, a1, b4 + LD b4, 7 * SIZE(BO) + + LD a1, 4 * SIZE(AO) + daddiu L, L, -1 + + MADD c11, c11, a2, b5 + LD b5, 20 * SIZE(BO) + MADD c21, c21, a2, b2 + LD b2, 9 * SIZE(BO) + MADD c31, c31, a2, b3 + LD b3, 10 * SIZE(BO) + MADD c41, c41, a2, b4 + LD b4, 11 * SIZE(BO) + + LD a2, 2 * SIZE(AO) + daddiu AO, AO, 4 * SIZE + + MADD c11, c11, a2, b6 + LD b6, 24 * SIZE(BO) + MADD c21, c21, a2, b2 + LD b2, 13 * SIZE(BO) + MADD c31, c31, a2, b3 + LD b3, 14 * SIZE(BO) + MADD c41, c41, a2, b4 + LD b4, 15 * SIZE(BO) + + LD a2, -1 * SIZE(AO) + daddiu BO, BO, 16 * SIZE + + MADD c11, c11, a2, b7 + LD b7, 12 * SIZE(BO) + MADD c21, c21, a2, b2 + LD b2, 1 * SIZE(BO) + MADD c31, c31, a2, b3 + LD b3, 2 * SIZE(BO) + MADD c41, c41, a2, b4 + LD b4, 3 * SIZE(BO) + + bgtz L, .L42 + LD a2, 1 * SIZE(AO) + .align 3 + +.L45: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L48 + NOP + .align 3 + +.L46: + MADD c11, c11, a1, b1 + LD b1, 4 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 5 * SIZE(BO) + MADD c31, c31, a1, b3 + LD b3, 6 * SIZE(BO) + MADD c41, c41, a1, b4 + LD a1, 1 * SIZE(AO) + + LD b4, 7 * SIZE(BO) + daddiu L, L, -1 + + daddiu AO, AO, 1 * SIZE + MOV a2, a2 + bgtz L, .L46 + daddiu BO, BO, 4 * SIZE + + +.L48: +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -1 +#else + daddiu TEMP, KK, -4 +#endif + + dsll L, TEMP, 0 + BASE_SHIFT + dsll TEMP, TEMP, 2 + BASE_SHIFT + daddu AO, AORIG, L + daddu BO, B, TEMP +#endif + + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 +#else + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 2 * SIZE(AO) + LD b4, 3 * SIZE(AO) + + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 +#endif + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(AO) + + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 +#endif + +#ifdef RN + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + MUL c11, b1, c11 + + NMSUB c21, c21, b2, c11 + NMSUB c31, c31, b3, c11 + NMSUB c41, c41, b4, c11 + + LD b2, 5 * SIZE(BO) + LD b3, 6 * 
SIZE(BO) + LD b4, 7 * SIZE(BO) + + MUL c21, b2, c21 + + NMSUB c31, c31, b3, c21 + NMSUB c41, c41, b4, c21 + + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + + MUL c31, b3, c31 + + NMSUB c41, c41, b4, c31 + + LD b4, 15 * SIZE(BO) + + MUL c41, b4, c41 +#endif + +#ifdef RT + LD b5, 15 * SIZE(BO) + LD b6, 14 * SIZE(BO) + LD b7, 13 * SIZE(BO) + LD b8, 12 * SIZE(BO) + + MUL c41, b5, c41 + + NMSUB c31, c31, b6, c41 + NMSUB c21, c21, b7, c41 + NMSUB c11, c11, b8, c41 + + LD b6, 10 * SIZE(BO) + LD b7, 9 * SIZE(BO) + LD b8, 8 * SIZE(BO) + + MUL c31, b6, c31 + + NMSUB c21, c21, b7, c31 + NMSUB c11, c11, b8, c31 + + LD b7, 5 * SIZE(BO) + LD b8, 4 * SIZE(BO) + + MUL c21, b7, c21 + + NMSUB c11, c11, b8, c21 + + LD b8, 0 * SIZE(BO) + + MUL c11, b8, c11 +#endif + +#ifdef LN + daddiu CO1, CO1, -1 * SIZE + daddiu CO2, CO2, -1 * SIZE + daddiu CO3, CO3, -1 * SIZE + daddiu CO4, CO4, -1 * SIZE +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) + ST c21, 1 * SIZE(BO) + ST c31, 2 * SIZE(BO) + ST c41, 3 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) + ST c21, 1 * SIZE(AO) + ST c31, 2 * SIZE(AO) + ST c41, 3 * SIZE(AO) +#endif + + ST c11, 0 * SIZE(CO1) + ST c21, 0 * SIZE(CO2) + ST c31, 0 * SIZE(CO3) + ST c41, 0 * SIZE(CO4) + +#ifndef LN + daddiu CO1, CO1, 1 * SIZE + daddiu CO2, CO2, 1 * SIZE + daddiu CO3, CO3, 1 * SIZE + daddiu CO4, CO4, 1 * SIZE +#endif + +#ifdef RT + dsll TEMP, K, BASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll L, TEMP, 0 + BASE_SHIFT + dsll TEMP, TEMP, 2 + BASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 1 +#endif + +#ifdef LN + daddiu KK, KK, -1 +#endif + .align 3 + +.L49: +#ifdef LN + dsll TEMP, K, 2 + BASE_SHIFT + daddu B, B, TEMP +#endif + +#if defined(LT) || defined(RN) + move B, BO +#endif + +#ifdef RN + daddiu KK, KK, 4 +#endif + +#ifdef RT + daddiu KK, KK, -4 +#endif + .align 3 + +.L50: + andi J, N, 2 + blez J, .L70 + +#ifdef RT + dsll TEMP, K, 1 + BASE_SHIFT + dsubu B, B, TEMP + + dsll TEMP, LDC, 1 + dsubu C, C, TEMP +#endif + + move AO, A + move CO1, C + daddu CO2, C, LDC + +#ifdef LN + daddu KK, M, OFFSET +#endif + +#ifdef LT + move KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + daddu C, CO2, LDC +#endif + + dsra I, M, 1 + blez I, .L60 + NOP + +.L51: +#if defined(LT) || defined(RN) + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a5, 4 * SIZE(AO) + + LD b1, 0 * SIZE(B) + MOV c12, c11 + LD b2, 1 * SIZE(B) + MOV c22, c11 + LD b3, 2 * SIZE(B) + LD b5, 4 * SIZE(B) + dsra L, KK, 2 + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + blez L, .L55 + move BO, B + +#else +#ifdef LN + dsll TEMP, K, 1 + BASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll L, KK, 1 + BASE_SHIFT + dsll TEMP, KK, 1 + BASE_SHIFT + + daddu AO, AORIG, L + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a5, 4 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + MOV c12, c11 + LD b2, 1 * SIZE(BO) + MOV c22, c11 + LD b3, 2 * SIZE(BO) + LD b5, 4 * SIZE(BO) + dsra L, TEMP, 2 + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + + blez L, .L55 + NOP +#endif + .align 3 + +.L52: + MADD c11, c11, a1, b1 + LD a3, 2 * SIZE(AO) + MADD c21, c21, a1, b2 + LD b4, 3 * SIZE(BO) + MADD c12, c12, a2, b1 + LD a4, 3 * SIZE(AO) + MADD c22, c22, a2, b2 + LD b1, 8 * SIZE(BO) + + MADD c11, c11, a3, b3 + LD a1, 8 * SIZE(AO) + MADD c21, c21, a3, b4 + LD b2, 5 * SIZE(BO) + MADD c12, c12, a4, b3 + LD a2, 5 * 
SIZE(AO) + MADD c22, c22, a4, b4 + LD b3, 6 * SIZE(BO) + + MADD c11, c11, a5, b5 + LD a3, 6 * SIZE(AO) + MADD c21, c21, a5, b2 + LD b4, 7 * SIZE(BO) + MADD c12, c12, a2, b5 + LD a4, 7 * SIZE(AO) + MADD c22, c22, a2, b2 + LD b5, 12 * SIZE(BO) + + MADD c11, c11, a3, b3 + LD a5, 12 * SIZE(AO) + MADD c21, c21, a3, b4 + LD b2, 9 * SIZE(BO) + MADD c12, c12, a4, b3 + LD a2, 9 * SIZE(AO) + MADD c22, c22, a4, b4 + LD b3, 10 * SIZE(BO) + + daddiu AO, AO, 8 * SIZE + daddiu L, L, -1 + bgtz L, .L52 + daddiu BO, BO, 8 * SIZE + .align 3 + +.L55: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L58 + NOP + .align 3 + +.L56: + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + LD a1, 2 * SIZE(AO) + + MADD c12, c12, a2, b1 + LD b1, 2 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 3 * SIZE(BO) + + daddiu L, L, -1 + daddiu AO, AO, 2 * SIZE + bgtz L, .L56 + daddiu BO, BO, 2 * SIZE + +.L58: +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -2 +#else + daddiu TEMP, KK, -2 +#endif + + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + daddu AO, AORIG, L + daddu BO, B, TEMP +#endif + + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c12, b3, c12 + SUB c22, b4, c22 +#else + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 2 * SIZE(AO) + LD b4, 3 * SIZE(AO) + + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c21, b3, c21 + SUB c22, b4, c22 +#endif + +#ifdef LN + LD b1, 3 * SIZE(AO) + LD b2, 2 * SIZE(AO) + LD b3, 0 * SIZE(AO) + + MUL c12, b1, c12 + MUL c22, b1, c22 + + NMSUB c11, c11, b2, c12 + NMSUB c21, c21, b2, c22 + + MUL c11, b3, c11 + MUL c21, b3, c21 +#endif + +#ifdef LT + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 3 * SIZE(AO) + + MUL c11, b1, c11 + MUL c21, b1, c21 + + NMSUB c12, c12, b2, c11 + NMSUB c22, c22, b2, c21 + + MUL c12, b3, c12 + MUL c22, b3, c22 +#endif + +#ifdef RN + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 3 * SIZE(BO) + + MUL c11, b1, c11 + MUL c12, b1, c12 + + NMSUB c21, c21, b2, c11 + NMSUB c22, c22, b2, c12 + + MUL c21, b3, c21 + MUL c22, b3, c22 +#endif + +#ifdef RT + LD b1, 3 * SIZE(BO) + LD b2, 2 * SIZE(BO) + LD b3, 0 * SIZE(BO) + + MUL c21, b1, c21 + MUL c22, b1, c22 + + NMSUB c11, c11, b2, c21 + NMSUB c12, c12, b2, c22 + + MUL c11, b3, c11 + MUL c12, b3, c12 +#endif + +#ifdef LN + daddiu CO1, CO1, -2 * SIZE + daddiu CO2, CO2, -2 * SIZE +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) + ST c21, 1 * SIZE(BO) + ST c12, 2 * SIZE(BO) + ST c22, 3 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) + ST c12, 1 * SIZE(AO) + ST c21, 2 * SIZE(AO) + ST c22, 3 * SIZE(AO) +#endif + + ST c11, 0 * SIZE(CO1) + ST c12, 1 * SIZE(CO1) + ST c21, 0 * SIZE(CO2) + ST c22, 1 * SIZE(CO2) + +#ifndef LN + daddiu CO1, CO1, 2 * SIZE + daddiu CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + dsll TEMP, K, 1 + BASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll TEMP, TEMP, 1 + BASE_SHIFT + daddu AO, AO, TEMP + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 2 +#endif + +#ifdef LN + daddiu KK, KK, -2 +#endif + + MTC $0, a1 + + MOV c11, a1 + MOV c21, a1 + MOV c31, a1 + + daddiu I, I, -1 + + bgtz I, .L51 + MOV c41, c11 + .align 3 + +.L60: + andi I, M, 1 + blez I, .L69 + NOP + +#if defined(LT) || defined(RN) + dsra L, KK, 2 + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a3, 2 * SIZE(AO) + MOV c31, c11 + LD a4, 
3 * SIZE(AO) + MOV c41, c11 + + LD b1, 0 * SIZE(B) + LD b2, 1 * SIZE(B) + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + LD b5, 4 * SIZE(B) + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + blez L, .L65 + move BO, B +#else +#ifdef LN + dsll TEMP, K, BASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll L, KK, 0 + BASE_SHIFT + dsll TEMP, KK, 1 + BASE_SHIFT + + daddu AO, AORIG, L + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + dsra L, TEMP, 2 + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a3, 2 * SIZE(AO) + MOV c31, c11 + LD a4, 3 * SIZE(AO) + MOV c41, c11 + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + + blez L, .L65 + NOP +#endif + .align 3 + +.L62: + MADD c11, c11, a1, b1 + LD b1, 4 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 5 * SIZE(BO) + MADD c31, c31, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c41, c41, a2, b4 + LD b4, 7 * SIZE(BO) + + LD a1, 4 * SIZE(AO) + LD a2, 5 * SIZE(AO) + + MADD c11, c11, a3, b1 + LD b1, 8 * SIZE(BO) + MADD c21, c21, a3, b2 + LD b2, 9 * SIZE(BO) + MADD c31, c31, a4, b3 + LD b3, 10 * SIZE(BO) + MADD c41, c41, a4, b4 + LD b4, 11 * SIZE(BO) + + LD a3, 6 * SIZE(AO) + LD a4, 7 * SIZE(AO) + + daddiu L, L, -1 + daddiu AO, AO, 4 * SIZE + + bgtz L, .L62 + daddiu BO, BO, 8 * SIZE + .align 3 + +.L65: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L68 + NOP + .align 3 + +.L66: + MADD c11, c11, a1, b1 + LD b1, 2 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 3 * SIZE(BO) + + LD a1, 1 * SIZE(AO) + daddiu L, L, -1 + + daddiu AO, AO, 1 * SIZE + bgtz L, .L66 + daddiu BO, BO, 2 * SIZE + + +.L68: + ADD c11, c11, c31 + ADD c21, c21, c41 + +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -1 +#else + daddiu TEMP, KK, -2 +#endif + + dsll L, TEMP, 0 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + daddu AO, AORIG, L + daddu BO, B, TEMP +#endif + + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + SUB c11, b1, c11 + SUB c21, b2, c21 +#else + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + + SUB c11, b1, c11 + SUB c21, b2, c21 +#endif + +#if defined(LN) || defined(LT) + LD b3, 0 * SIZE(AO) + + MUL c11, b3, c11 + MUL c21, b3, c21 +#endif + +#ifdef RN + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 3 * SIZE(BO) + + MUL c11, b1, c11 + + NMSUB c21, c21, b2, c11 + + MUL c21, b3, c21 +#endif + +#ifdef RT + LD b1, 3 * SIZE(BO) + LD b2, 2 * SIZE(BO) + LD b3, 0 * SIZE(BO) + + MUL c21, b1, c21 + + NMSUB c11, c11, b2, c21 + + MUL c11, b3, c11 +#endif + +#ifdef LN + daddiu CO1, CO1, -1 * SIZE + daddiu CO2, CO2, -1 * SIZE +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) + ST c21, 1 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) + ST c21, 1 * SIZE(AO) +#endif + + ST c11, 0 * SIZE(CO1) + ST c21, 0 * SIZE(CO2) + +#ifndef LN + daddiu CO1, CO1, 1 * SIZE + daddiu CO2, CO2, 1 * SIZE +#endif + +#ifdef RT + dsll TEMP, K, 0 + BASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll L, TEMP, 0 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 1 +#endif + +#ifdef LN + daddiu KK, KK, -1 +#endif + .align 3 + +.L69: +#ifdef LN + dsll TEMP, K, 1 + BASE_SHIFT + daddu B, B, TEMP +#endif + +#if defined(LT) || defined(RN) + move B, BO +#endif + +#ifdef RN + daddiu KK, KK, 2 +#endif + +#ifdef RT + daddiu KK, KK, -2 +#endif + .align 3 + +.L70: + andi J, N, 1 + blez J, .L999 + 
NOP + +#ifdef RT + dsll TEMP, K, BASE_SHIFT + dsubu B, B, TEMP + + dsubu C, C, LDC +#endif + + move AO, A + move CO1, C + +#ifdef LN + daddu KK, M, OFFSET +#endif + +#ifdef LT + move KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + daddu C, CO1, LDC +#endif + + dsra I, M, 1 + blez I, .L80 + NOP + +.L71: +#if defined(LT) || defined(RN) + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a5, 4 * SIZE(AO) + + LD b1, 0 * SIZE(B) + MOV c12, c11 + LD b2, 1 * SIZE(B) + MOV c22, c11 + LD b3, 2 * SIZE(B) + LD b5, 4 * SIZE(B) + dsra L, KK, 2 + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + blez L, .L75 + move BO, B +#else +#ifdef LN + dsll TEMP, K, 1 + BASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll L, KK, 1 + BASE_SHIFT + dsll TEMP, KK, 0 + BASE_SHIFT + + daddu AO, AORIG, L + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a5, 4 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + MOV c12, c11 + LD b2, 1 * SIZE(BO) + MOV c22, c11 + LD b3, 2 * SIZE(BO) + LD b5, 4 * SIZE(BO) + dsra L, TEMP, 2 + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + + blez L, .L75 + NOP +#endif + .align 3 + +.L72: + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + MADD c11, c11, a1, b1 + MADD c12, c12, a2, b1 + + LD a1, 2 * SIZE(AO) + LD a2, 3 * SIZE(AO) + LD b1, 1 * SIZE(BO) + + MADD c11, c11, a1, b1 + MADD c12, c12, a2, b1 + + LD a1, 4 * SIZE(AO) + LD a2, 5 * SIZE(AO) + LD b1, 2 * SIZE(BO) + + MADD c11, c11, a1, b1 + MADD c12, c12, a2, b1 + + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + LD b1, 3 * SIZE(BO) + + MADD c11, c11, a1, b1 + MADD c12, c12, a2, b1 + + daddiu L, L, -1 + daddiu AO, AO, 8 * SIZE + bgtz L, .L72 + daddiu BO, BO, 4 * SIZE + .align 3 + +.L75: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L78 + NOP + .align 3 + +.L76: + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + MADD c11, c11, a1, b1 + MADD c12, c12, a2, b1 + + daddiu L, L, -1 + daddiu AO, AO, 2 * SIZE + bgtz L, .L76 + daddiu BO, BO, 1 * SIZE + +.L78: + ADD c11, c11, c21 + ADD c12, c12, c22 + +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -2 +#else + daddiu TEMP, KK, -1 +#endif + + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 0 + BASE_SHIFT + daddu AO, AORIG, L + daddu BO, B, TEMP +#endif + + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + SUB c11, b1, c11 + SUB c12, b2, c12 +#else + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + + SUB c11, b1, c11 + SUB c12, b2, c12 +#endif + +#ifdef LN + LD b1, 3 * SIZE(AO) + LD b2, 2 * SIZE(AO) + LD b3, 0 * SIZE(AO) + + MUL c12, b1, c12 + NMSUB c11, c11, b2, c12 + MUL c11, b3, c11 +#endif + +#ifdef LT + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 3 * SIZE(AO) + + MUL c11, b1, c11 + NMSUB c12, c12, b2, c11 + MUL c12, b3, c12 +#endif + +#if defined(RN) || defined(RT) + LD b1, 0 * SIZE(BO) + + MUL c11, b1, c11 + MUL c12, b1, c12 +#endif + +#ifdef LN + daddiu CO1, CO1, -2 * SIZE +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) + ST c12, 1 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) + ST c12, 1 * SIZE(AO) +#endif + + ST c11, 0 * SIZE(CO1) + ST c12, 1 * SIZE(CO1) + +#ifndef LN + daddiu CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + dsll TEMP, K, 1 + BASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 0 + BASE_SHIFT + daddu AO, AO, L 
+ daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 2 +#endif + +#ifdef LN + daddiu KK, KK, -2 +#endif + + daddiu I, I, -1 + + bgtz I, .L71 + NOP + .align 3 + +.L80: + andi I, M, 1 + blez I, .L89 + NOP + +#if defined(LT) || defined(RN) + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(B) + LD b2, 1 * SIZE(B) + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + LD b5, 4 * SIZE(B) + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + dsra L, KK, 2 + blez L, .L85 + move BO, B +#else +#ifdef LN + dsll TEMP, K, BASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll TEMP, KK, BASE_SHIFT + + daddu AO, AORIG, TEMP + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + + dsra L, TEMP, 2 + blez L, .L85 + NOP +#endif + .align 3 + +.L82: + LD a1, 0 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + MADD c11, c11, a1, b1 + + LD a1, 1 * SIZE(AO) + LD b1, 1 * SIZE(BO) + + MADD c21, c21, a1, b1 + + LD a1, 2 * SIZE(AO) + LD b1, 2 * SIZE(BO) + + MADD c11, c11, a1, b1 + + LD a1, 3 * SIZE(AO) + LD b1, 3 * SIZE(BO) + + MADD c21, c21, a1, b1 + + daddiu L, L, -1 + daddiu AO, AO, 4 * SIZE + bgtz L, .L82 + daddiu BO, BO, 4 * SIZE + .align 3 + +.L85: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L88 + NOP + .align 3 + +.L86: + LD a1, 0 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + MADD c11, c11, a1, b1 + + daddiu L, L, -1 + daddiu AO, AO, 1 * SIZE + bgtz L, .L86 + daddiu BO, BO, 1 * SIZE + + +.L88: + ADD c11, c11, c21 + +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -1 +#else + daddiu TEMP, KK, -1 +#endif + + dsll TEMP, TEMP, 0 + BASE_SHIFT + daddu AO, AORIG, TEMP + daddu BO, B, TEMP +#endif + + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + + SUB c11, b1, c11 +#else + LD b1, 0 * SIZE(AO) + + SUB c11, b1, c11 +#endif + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(AO) + + MUL c11, b1, c11 +#endif + +#if defined(RN) || defined(RT) + LD b1, 0 * SIZE(BO) + + MUL c11, b1, c11 +#endif + +#ifdef LN + daddiu CO1, CO1, -1 * SIZE +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) +#endif + + ST c11, 0 * SIZE(CO1) + +#ifndef LN + daddiu CO1, CO1, 1 * SIZE +#endif + +#ifdef RT + dsll TEMP, K, BASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll TEMP, TEMP, 0 + BASE_SHIFT + daddu AO, AO, TEMP + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 1 +#endif + +#ifdef LN + daddiu KK, KK, -1 +#endif + .align 3 + +.L89: +#ifdef LN + dsll TEMP, K, BASE_SHIFT + daddu B, B, TEMP +#endif + +#if defined(LT) || defined(RN) + move B, BO +#endif + +#ifdef RN + daddiu KK, KK, 1 +#endif + +#ifdef RT + daddiu KK, KK, -1 +#endif + .align 3 + + +.L999: + LDARG $16, 0($sp) + LDARG $17, 8($sp) + LDARG $18, 16($sp) + LDARG $19, 24($sp) + LDARG $20, 32($sp) + LDARG $21, 40($sp) + ldc1 $f24, 48($sp) + ldc1 $f25, 56($sp) + ldc1 $f26, 64($sp) + ldc1 $f27, 72($sp) + ldc1 $f28, 80($sp) + + LDARG $22, 88($sp) + LDARG $23, 96($sp) + LDARG $24, 104($sp) + LDARG $25, 112($sp) + +#ifndef __64BIT__ + ldc1 $f20,112($sp) + ldc1 $f21,120($sp) + ldc1 $f22,128($sp) + ldc1 $f23,136($sp) +#endif + + j $31 + daddiu $sp, $sp, 144 + + EPILOGUE diff --git 
a/kernel/mips64/trsm_kernel_RT.S b/kernel/mips64/trsm_kernel_RT.S new file mode 100644 index 0000000000..81bbfec0f2 --- /dev/null +++ b/kernel/mips64/trsm_kernel_RT.S @@ -0,0 +1,3529 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M $4 +#define N $5 +#define K $6 +#define A $8 +#define B $9 +#define C $10 +#define LDC $11 + +#define AO $12 +#define BO $13 + +#define I $2 +#define J $3 +#define L $7 + +#define CO1 $14 +#define CO2 $15 +#define CO3 $16 +#define CO4 $17 +#define CO5 $18 +#define CO6 $19 +#define CO7 $20 +#define CO8 $21 + +#define OFFSET $22 +#define KK $23 +#define TEMP $24 +#define AORIG $25 + +#define a1 $f0 +#define a2 $f1 +#define a3 $f27 +#define a4 $f28 + +#define b1 $f2 +#define b2 $f3 +#define b3 $f4 +#define b4 $f5 +#define b5 $f6 +#define b6 $f7 +#define b7 $f8 +#define b8 $f9 + +#define a5 b8 + +#define c11 $f10 +#define c12 $f11 +#define c21 $f12 +#define c22 $f13 +#define c31 $f14 +#define c32 $f16 +#define c41 $f17 +#define c42 $f18 +#define c51 $f19 +#define c52 $f20 +#define c61 $f21 +#define c62 $f22 +#define c71 $f23 +#define c72 $f24 +#define c81 $f25 +#define c82 $f26 + +#define ALPHA $f15 + + PROLOGUE + + daddiu $sp, $sp, -144 + + SDARG $16, 0($sp) + SDARG $17, 8($sp) + SDARG $18, 16($sp) + SDARG $19, 24($sp) + SDARG $20, 32($sp) + SDARG $21, 40($sp) + sdc1 $f24, 48($sp) + sdc1 $f25, 56($sp) + sdc1 $f26, 64($sp) + sdc1 $f27, 72($sp) + sdc1 $f28, 80($sp) + + SDARG $22, 88($sp) + SDARG $23, 96($sp) + SDARG $24, 104($sp) + SDARG $25, 112($sp) + +#ifndef __64BIT__ + sdc1 $f20,112($sp) + sdc1 $f21,120($sp) + sdc1 $f22,128($sp) + sdc1 $f23,136($sp) +#endif + + LDARG OFFSET, 144($sp) + + dsll LDC, LDC, BASE_SHIFT + +#ifdef LN + mult M, K + mflo TEMP + + dsll TEMP, TEMP, BASE_SHIFT + daddu A, A, TEMP + + dsll TEMP, M, BASE_SHIFT + daddu C, C, TEMP +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mult N, K + mflo TEMP + + dsll TEMP, TEMP, BASE_SHIFT + daddu B, B, TEMP + + mult N, LDC + mflo TEMP + daddu C, C, TEMP + + dsubu KK, N, OFFSET +#endif + + andi J, N, 1 + blez J, .L30 + NOP + +#ifdef RT + dsll TEMP, K, BASE_SHIFT + dsubu B, B, TEMP + + dsubu C, C, LDC +#endif + + move AO, A + move CO1, C + +#ifdef LN + daddu KK, M, OFFSET +#endif + +#ifdef LT + move KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + daddu C, CO1, LDC +#endif + + dsra I, M, 1 + blez I, .L80 + NOP + +.L71: +#if defined(LT) || defined(RN) + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a5, 4 * SIZE(AO) + + LD b1, 0 * SIZE(B) + MOV c12, c11 + LD b2, 1 * SIZE(B) + MOV c22, c11 + LD b3, 2 * SIZE(B) + LD b5, 4 * SIZE(B) + dsra L, KK, 2 + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + blez L, .L75 + move BO, B +#else +#ifdef LN + dsll TEMP, K, 1 + BASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll L, KK, 1 + BASE_SHIFT + dsll TEMP, KK, 0 + BASE_SHIFT + + daddu AO, AORIG, L + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a5, 4 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + MOV c12, c11 + LD b2, 1 * SIZE(BO) + MOV c22, c11 + LD b3, 2 * SIZE(BO) + LD b5, 4 * SIZE(BO) + dsra L, TEMP, 2 + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + + blez L, .L75 + NOP +#endif + .align 3 + +.L72: + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + MADD c11, c11, a1, b1 + MADD c12, c12, a2, b1 + + LD a1, 2 * SIZE(AO) + LD a2, 3 * SIZE(AO) + LD b1, 1 * SIZE(BO) + + MADD c11, c11, a1, b1 + MADD c12, c12, a2, b1 + + LD a1, 4 * SIZE(AO) + LD a2, 5 * SIZE(AO) + LD b1, 2 * SIZE(BO) + + MADD c11, c11, a1, b1 + MADD c12, c12, a2, b1 + + LD 
a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + LD b1, 3 * SIZE(BO) + + MADD c11, c11, a1, b1 + MADD c12, c12, a2, b1 + + daddiu L, L, -1 + daddiu AO, AO, 8 * SIZE + bgtz L, .L72 + daddiu BO, BO, 4 * SIZE + .align 3 + +.L75: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L78 + NOP + .align 3 + +.L76: + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + MADD c11, c11, a1, b1 + MADD c12, c12, a2, b1 + + daddiu L, L, -1 + daddiu AO, AO, 2 * SIZE + bgtz L, .L76 + daddiu BO, BO, 1 * SIZE + +.L78: + ADD c11, c11, c21 + ADD c12, c12, c22 + +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -2 +#else + daddiu TEMP, KK, -1 +#endif + + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 0 + BASE_SHIFT + daddu AO, AORIG, L + daddu BO, B, TEMP +#endif + + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + SUB c11, b1, c11 + SUB c12, b2, c12 +#else + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + + SUB c11, b1, c11 + SUB c12, b2, c12 +#endif + +#ifdef LN + LD b1, 3 * SIZE(AO) + LD b2, 2 * SIZE(AO) + LD b3, 0 * SIZE(AO) + + MUL c12, b1, c12 + NMSUB c11, c11, b2, c12 + MUL c11, b3, c11 +#endif + +#ifdef LT + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 3 * SIZE(AO) + + MUL c11, b1, c11 + NMSUB c12, c12, b2, c11 + MUL c12, b3, c12 +#endif + +#if defined(RN) || defined(RT) + LD b1, 0 * SIZE(BO) + + MUL c11, b1, c11 + MUL c12, b1, c12 +#endif + +#ifdef LN + daddiu CO1, CO1, -2 * SIZE +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) + ST c12, 1 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) + ST c12, 1 * SIZE(AO) +#endif + + ST c11, 0 * SIZE(CO1) + ST c12, 1 * SIZE(CO1) + +#ifndef LN + daddiu CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + dsll TEMP, K, 1 + BASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 0 + BASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 2 +#endif + +#ifdef LN + daddiu KK, KK, -2 +#endif + + daddiu I, I, -1 + + bgtz I, .L71 + NOP + .align 3 + +.L80: + andi I, M, 1 + blez I, .L89 + NOP + +#if defined(LT) || defined(RN) + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(B) + LD b2, 1 * SIZE(B) + MOV c21, c11 + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + LD b5, 4 * SIZE(B) + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + dsra L, KK, 2 + blez L, .L85 + move BO, B +#else +#ifdef LN + dsll TEMP, K, BASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll TEMP, KK, BASE_SHIFT + + daddu AO, AORIG, TEMP + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + MOV c21, c11 + LD b5, 4 * SIZE(BO) + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + + dsra L, TEMP, 2 + blez L, .L85 + NOP +#endif + .align 3 + +.L82: + LD a1, 0 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + MADD c11, c11, a1, b1 + + LD a1, 1 * SIZE(AO) + LD b1, 1 * SIZE(BO) + + MADD c21, c21, a1, b1 + + LD a1, 2 * SIZE(AO) + LD b1, 2 * SIZE(BO) + + MADD c11, c11, a1, b1 + + LD a1, 3 * SIZE(AO) + LD b1, 3 * SIZE(BO) + + MADD c21, c21, a1, b1 + + daddiu L, L, -1 + daddiu AO, AO, 4 * SIZE + bgtz L, .L82 + daddiu BO, BO, 4 * SIZE + .align 3 + +.L85: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L88 + NOP + .align 3 + +.L86: 
+ LD a1, 0 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + MADD c11, c11, a1, b1 + + daddiu L, L, -1 + daddiu AO, AO, 1 * SIZE + bgtz L, .L86 + daddiu BO, BO, 1 * SIZE + + +.L88: + ADD c11, c11, c21 + +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -1 +#else + daddiu TEMP, KK, -1 +#endif + + dsll TEMP, TEMP, 0 + BASE_SHIFT + daddu AO, AORIG, TEMP + daddu BO, B, TEMP +#endif + + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + + SUB c11, b1, c11 +#else + LD b1, 0 * SIZE(AO) + + SUB c11, b1, c11 +#endif + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(AO) + + MUL c11, b1, c11 +#endif + +#if defined(RN) || defined(RT) + LD b1, 0 * SIZE(BO) + + MUL c11, b1, c11 +#endif + +#ifdef LN + daddiu CO1, CO1, -1 * SIZE +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) +#endif + + ST c11, 0 * SIZE(CO1) + +#ifndef LN + daddiu CO1, CO1, 1 * SIZE +#endif + +#ifdef RT + dsll TEMP, K, BASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll TEMP, TEMP, 0 + BASE_SHIFT + daddu AO, AO, TEMP + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 1 +#endif + +#ifdef LN + daddiu KK, KK, -1 +#endif + .align 3 + +.L89: +#ifdef LN + dsll TEMP, K, BASE_SHIFT + daddu B, B, TEMP +#endif + +#if defined(LT) || defined(RN) + move B, BO +#endif + +#ifdef RN + daddiu KK, KK, 1 +#endif + +#ifdef RT + daddiu KK, KK, -1 +#endif + .align 3 + +.L30: + andi J, N, 2 + blez J, .L50 + NOP + +#ifdef RT + dsll TEMP, K, 1 + BASE_SHIFT + dsubu B, B, TEMP + + dsll TEMP, LDC, 1 + dsubu C, C, TEMP +#endif + + move AO, A + move CO1, C + daddu CO2, C, LDC + +#ifdef LN + daddu KK, M, OFFSET +#endif + +#ifdef LT + move KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + daddu C, CO2, LDC +#endif + + dsra I, M, 1 + blez I, .L60 + NOP + +.L51: +#if defined(LT) || defined(RN) + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a5, 4 * SIZE(AO) + + LD b1, 0 * SIZE(B) + MOV c12, c11 + LD b2, 1 * SIZE(B) + MOV c22, c11 + LD b3, 2 * SIZE(B) + LD b5, 4 * SIZE(B) + dsra L, KK, 2 + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + blez L, .L55 + move BO, B + +#else +#ifdef LN + dsll TEMP, K, 1 + BASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll L, KK, 1 + BASE_SHIFT + dsll TEMP, KK, 1 + BASE_SHIFT + + daddu AO, AORIG, L + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a5, 4 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + MOV c12, c11 + LD b2, 1 * SIZE(BO) + MOV c22, c11 + LD b3, 2 * SIZE(BO) + LD b5, 4 * SIZE(BO) + dsra L, TEMP, 2 + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + + blez L, .L55 + NOP +#endif + .align 3 + +.L52: + MADD c11, c11, a1, b1 + LD a3, 2 * SIZE(AO) + MADD c21, c21, a1, b2 + LD b4, 3 * SIZE(BO) + MADD c12, c12, a2, b1 + LD a4, 3 * SIZE(AO) + MADD c22, c22, a2, b2 + LD b1, 8 * SIZE(BO) + + MADD c11, c11, a3, b3 + LD a1, 8 * SIZE(AO) + MADD c21, c21, a3, b4 + LD b2, 5 * SIZE(BO) + MADD c12, c12, a4, b3 + LD a2, 5 * SIZE(AO) + MADD c22, c22, a4, b4 + LD b3, 6 * SIZE(BO) + + MADD c11, c11, a5, b5 + LD a3, 6 * SIZE(AO) + MADD c21, c21, a5, b2 + LD b4, 7 * SIZE(BO) + MADD c12, c12, a2, b5 + LD a4, 7 * SIZE(AO) + MADD c22, c22, a2, b2 + LD b5, 12 * SIZE(BO) + + MADD c11, c11, a3, b3 + LD a5, 12 * SIZE(AO) + MADD c21, c21, a3, b4 + LD b2, 9 * SIZE(BO) + MADD c12, c12, a4, b3 + LD a2, 9 * SIZE(AO) + MADD c22, c22, a4, b4 + LD b3, 10 * SIZE(BO) + + daddiu AO, AO, 8 * SIZE + daddiu L, L, -1 
+ bgtz L, .L52 + daddiu BO, BO, 8 * SIZE + .align 3 + +.L55: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L58 + NOP + .align 3 + +.L56: + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + LD a1, 2 * SIZE(AO) + + MADD c12, c12, a2, b1 + LD b1, 2 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 3 * SIZE(BO) + + daddiu L, L, -1 + daddiu AO, AO, 2 * SIZE + bgtz L, .L56 + daddiu BO, BO, 2 * SIZE + +.L58: +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -2 +#else + daddiu TEMP, KK, -2 +#endif + + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + daddu AO, AORIG, L + daddu BO, B, TEMP +#endif + + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c12, b3, c12 + SUB c22, b4, c22 +#else + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 2 * SIZE(AO) + LD b4, 3 * SIZE(AO) + + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c21, b3, c21 + SUB c22, b4, c22 +#endif + +#ifdef LN + LD b1, 3 * SIZE(AO) + LD b2, 2 * SIZE(AO) + LD b3, 0 * SIZE(AO) + + MUL c12, b1, c12 + MUL c22, b1, c22 + + NMSUB c11, c11, b2, c12 + NMSUB c21, c21, b2, c22 + + MUL c11, b3, c11 + MUL c21, b3, c21 +#endif + +#ifdef LT + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 3 * SIZE(AO) + + MUL c11, b1, c11 + MUL c21, b1, c21 + + NMSUB c12, c12, b2, c11 + NMSUB c22, c22, b2, c21 + + MUL c12, b3, c12 + MUL c22, b3, c22 +#endif + +#ifdef RN + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 3 * SIZE(BO) + + MUL c11, b1, c11 + MUL c12, b1, c12 + + NMSUB c21, c21, b2, c11 + NMSUB c22, c22, b2, c12 + + MUL c21, b3, c21 + MUL c22, b3, c22 +#endif + +#ifdef RT + LD b1, 3 * SIZE(BO) + LD b2, 2 * SIZE(BO) + LD b3, 0 * SIZE(BO) + + MUL c21, b1, c21 + MUL c22, b1, c22 + + NMSUB c11, c11, b2, c21 + NMSUB c12, c12, b2, c22 + + MUL c11, b3, c11 + MUL c12, b3, c12 +#endif + +#ifdef LN + daddiu CO1, CO1, -2 * SIZE + daddiu CO2, CO2, -2 * SIZE +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) + ST c21, 1 * SIZE(BO) + ST c12, 2 * SIZE(BO) + ST c22, 3 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) + ST c12, 1 * SIZE(AO) + ST c21, 2 * SIZE(AO) + ST c22, 3 * SIZE(AO) +#endif + + ST c11, 0 * SIZE(CO1) + ST c12, 1 * SIZE(CO1) + ST c21, 0 * SIZE(CO2) + ST c22, 1 * SIZE(CO2) + +#ifndef LN + daddiu CO1, CO1, 2 * SIZE + daddiu CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + dsll TEMP, K, 1 + BASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll TEMP, TEMP, 1 + BASE_SHIFT + daddu AO, AO, TEMP + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 2 +#endif + +#ifdef LN + daddiu KK, KK, -2 +#endif + + MTC $0, a1 + + MOV c11, a1 + MOV c21, a1 + MOV c31, a1 + + daddiu I, I, -1 + + bgtz I, .L51 + MOV c41, c11 + .align 3 + +.L60: + andi I, M, 1 + blez I, .L69 + NOP + +#if defined(LT) || defined(RN) + dsra L, KK, 2 + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a3, 2 * SIZE(AO) + MOV c31, c11 + LD a4, 3 * SIZE(AO) + MOV c41, c11 + + LD b1, 0 * SIZE(B) + LD b2, 1 * SIZE(B) + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + LD b5, 4 * SIZE(B) + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + blez L, .L65 + move BO, B +#else +#ifdef LN + dsll TEMP, K, BASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll L, KK, 0 + BASE_SHIFT + dsll TEMP, KK, 1 + BASE_SHIFT + + daddu AO, AORIG, L + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + dsra L, TEMP, 2 + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD 
a2, 1 * SIZE(AO) + MOV c21, c11 + LD a3, 2 * SIZE(AO) + MOV c31, c11 + LD a4, 3 * SIZE(AO) + MOV c41, c11 + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + + blez L, .L65 + NOP +#endif + .align 3 + +.L62: + MADD c11, c11, a1, b1 + LD b1, 4 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 5 * SIZE(BO) + MADD c31, c31, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c41, c41, a2, b4 + LD b4, 7 * SIZE(BO) + + LD a1, 4 * SIZE(AO) + LD a2, 5 * SIZE(AO) + + MADD c11, c11, a3, b1 + LD b1, 8 * SIZE(BO) + MADD c21, c21, a3, b2 + LD b2, 9 * SIZE(BO) + MADD c31, c31, a4, b3 + LD b3, 10 * SIZE(BO) + MADD c41, c41, a4, b4 + LD b4, 11 * SIZE(BO) + + LD a3, 6 * SIZE(AO) + LD a4, 7 * SIZE(AO) + + daddiu L, L, -1 + daddiu AO, AO, 4 * SIZE + + bgtz L, .L62 + daddiu BO, BO, 8 * SIZE + .align 3 + +.L65: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L68 + NOP + .align 3 + +.L66: + MADD c11, c11, a1, b1 + LD b1, 2 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 3 * SIZE(BO) + + LD a1, 1 * SIZE(AO) + daddiu L, L, -1 + + daddiu AO, AO, 1 * SIZE + bgtz L, .L66 + daddiu BO, BO, 2 * SIZE + + +.L68: + ADD c11, c11, c31 + ADD c21, c21, c41 + +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -1 +#else + daddiu TEMP, KK, -2 +#endif + + dsll L, TEMP, 0 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + daddu AO, AORIG, L + daddu BO, B, TEMP +#endif + + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + SUB c11, b1, c11 + SUB c21, b2, c21 +#else + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + + SUB c11, b1, c11 + SUB c21, b2, c21 +#endif + +#if defined(LN) || defined(LT) + LD b3, 0 * SIZE(AO) + + MUL c11, b3, c11 + MUL c21, b3, c21 +#endif + +#ifdef RN + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 3 * SIZE(BO) + + MUL c11, b1, c11 + + NMSUB c21, c21, b2, c11 + + MUL c21, b3, c21 +#endif + +#ifdef RT + LD b1, 3 * SIZE(BO) + LD b2, 2 * SIZE(BO) + LD b3, 0 * SIZE(BO) + + MUL c21, b1, c21 + + NMSUB c11, c11, b2, c21 + + MUL c11, b3, c11 +#endif + +#ifdef LN + daddiu CO1, CO1, -1 * SIZE + daddiu CO2, CO2, -1 * SIZE +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) + ST c21, 1 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) + ST c21, 1 * SIZE(AO) +#endif + + ST c11, 0 * SIZE(CO1) + ST c21, 0 * SIZE(CO2) + +#ifndef LN + daddiu CO1, CO1, 1 * SIZE + daddiu CO2, CO2, 1 * SIZE +#endif + +#ifdef RT + dsll TEMP, K, 0 + BASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll L, TEMP, 0 + BASE_SHIFT + dsll TEMP, TEMP, 1 + BASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 1 +#endif + +#ifdef LN + daddiu KK, KK, -1 +#endif + .align 3 + +.L69: +#ifdef LN + dsll TEMP, K, 1 + BASE_SHIFT + daddu B, B, TEMP +#endif + +#if defined(LT) || defined(RN) + move B, BO +#endif + +#ifdef RN + daddiu KK, KK, 2 +#endif + +#ifdef RT + daddiu KK, KK, -2 +#endif + .align 3 + +.L50: + andi J, N, 4 + blez J, .L70 + move AO, A + +#ifdef RT + dsll TEMP, K, 2 + BASE_SHIFT + dsubu B, B, TEMP + + dsll TEMP, LDC, 2 + dsubu C, C, TEMP +#endif + + move CO1, C + MTC $0, c11 + daddu CO2, C, LDC + daddu CO3, CO2, LDC + daddu CO4, CO3, LDC + MOV c21, c11 + dsra I, M, 1 + MOV c31, c11 + +#ifdef LN + daddu KK, M, OFFSET +#endif + +#ifdef LT + move KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + daddu C, CO4, LDC +#endif + + blez I, 
.L40 + MOV c41, c11 + +.L31: +#if defined(LT) || defined(RN) + LD a1, 0 * SIZE(AO) + LD a3, 4 * SIZE(AO) + + LD b1, 0 * SIZE(B) + MOV c12, c11 + LD b2, 1 * SIZE(B) + MOV c22, c11 + LD b3, 2 * SIZE(B) + MOV c32, c11 + LD b4, 3 * SIZE(B) + MOV c42, c11 + + LD b5, 4 * SIZE(B) + dsra L, KK, 2 + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + blez L, .L35 + move BO, B +#else +#ifdef LN + dsll TEMP, K, 1 + BASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll L, KK, 1 + BASE_SHIFT + dsll TEMP, KK, 2 + BASE_SHIFT + + daddu AO, AORIG, L + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) + LD a3, 4 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + MOV c12, c11 + LD b2, 1 * SIZE(BO) + MOV c22, c11 + LD b3, 2 * SIZE(BO) + MOV c32, c11 + LD b4, 3 * SIZE(BO) + MOV c42, c11 + + LD b5, 4 * SIZE(BO) + dsra L, TEMP, 2 + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + + blez L, .L35 + NOP +#endif + .align 3 + +.L32: + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + daddiu L, L, -1 + MADD c31, c31, a1, b3 + NOP + MADD c41, c41, a1, b4 + LD a1, 2 * SIZE(AO) + + MADD c12, c12, a2, b1 + LD b1, 16 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD c11, c11, a1, b5 + LD a2, 3 * SIZE(AO) + MADD c21, c21, a1, b2 + NOP + MADD c31, c31, a1, b3 + NOP + MADD c41, c41, a1, b4 + LD a1, 8 * SIZE(AO) + + MADD c12, c12, a2, b5 + LD b5, 20 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 9 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 10 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 11 * SIZE(BO) + + MADD c11, c11, a3, b6 + LD a2, 5 * SIZE(AO) + MADD c21, c21, a3, b2 + NOP + MADD c31, c31, a3, b3 + NOP + MADD c41, c41, a3, b4 + LD a3, 6 * SIZE(AO) + + MADD c12, c12, a2, b6 + LD b6, 24 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 13 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 14 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD c11, c11, a3, b7 + LD a2, 7 * SIZE(AO) + MADD c21, c21, a3, b2 + daddiu AO, AO, 8 * SIZE + MADD c31, c31, a3, b3 + daddiu BO, BO, 16 * SIZE + MADD c41, c41, a3, b4 + LD a3, 4 * SIZE(AO) + + MADD c12, c12, a2, b7 + LD b7, 12 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 1 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 2 * SIZE(BO) + MADD c42, c42, a2, b4 + NOP + + bgtz L, .L32 + LD b4, 3 * SIZE(BO) + .align 3 + +.L35: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L38 + NOP + .align 3 + +.L36: + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + daddiu L, L, -1 + MADD c31, c31, a1, b3 + daddiu AO, AO, 2 * SIZE + MADD c41, c41, a1, b4 + LD a1, 0 * SIZE(AO) + + MADD c12, c12, a2, b1 + LD b1, 4 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + bgtz L, .L36 + daddiu BO, BO, 4 * SIZE + +.L38: +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -2 +#else + daddiu TEMP, KK, -4 +#endif + + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 2 + BASE_SHIFT + daddu AO, AORIG, L + daddu BO, B, TEMP +#endif + + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c12, b5, c12 + SUB c22, b6, c22 + SUB c32, b7, c32 + SUB c42, b8, c42 +#else + LD b1, 0 * SIZE(AO) + LD b2, 1 * 
SIZE(AO) + LD b3, 2 * SIZE(AO) + LD b4, 3 * SIZE(AO) + LD b5, 4 * SIZE(AO) + LD b6, 5 * SIZE(AO) + LD b7, 6 * SIZE(AO) + LD b8, 7 * SIZE(AO) + + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c21, b3, c21 + SUB c22, b4, c22 + SUB c31, b5, c31 + SUB c32, b6, c32 + SUB c41, b7, c41 + SUB c42, b8, c42 +#endif + +#ifdef LN + LD b1, 3 * SIZE(AO) + LD b2, 2 * SIZE(AO) + LD b3, 0 * SIZE(AO) + + MUL c12, b1, c12 + MUL c22, b1, c22 + MUL c32, b1, c32 + MUL c42, b1, c42 + + NMSUB c11, c11, b2, c12 + NMSUB c21, c21, b2, c22 + NMSUB c31, c31, b2, c32 + NMSUB c41, c41, b2, c42 + + MUL c11, b3, c11 + MUL c21, b3, c21 + MUL c31, b3, c31 + MUL c41, b3, c41 +#endif + +#ifdef LT + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 3 * SIZE(AO) + + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + + NMSUB c12, c12, b2, c11 + NMSUB c22, c22, b2, c21 + NMSUB c32, c32, b2, c31 + NMSUB c42, c42, b2, c41 + + MUL c12, b3, c12 + MUL c22, b3, c22 + MUL c32, b3, c32 + MUL c42, b3, c42 +#endif + +#ifdef RN + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + MUL c11, b1, c11 + MUL c12, b1, c12 + + NMSUB c21, c21, b2, c11 + NMSUB c22, c22, b2, c12 + NMSUB c31, c31, b3, c11 + NMSUB c32, c32, b3, c12 + NMSUB c41, c41, b4, c11 + NMSUB c42, c42, b4, c12 + + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + MUL c21, b2, c21 + MUL c22, b2, c22 + + NMSUB c31, c31, b3, c21 + NMSUB c32, c32, b3, c22 + NMSUB c41, c41, b4, c21 + NMSUB c42, c42, b4, c22 + + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + + MUL c31, b3, c31 + MUL c32, b3, c32 + + NMSUB c41, c41, b4, c31 + NMSUB c42, c42, b4, c32 + + LD b4, 15 * SIZE(BO) + + MUL c41, b4, c41 + MUL c42, b4, c42 +#endif + +#ifdef RT + LD b5, 15 * SIZE(BO) + LD b6, 14 * SIZE(BO) + LD b7, 13 * SIZE(BO) + LD b8, 12 * SIZE(BO) + + MUL c41, b5, c41 + MUL c42, b5, c42 + + NMSUB c31, c31, b6, c41 + NMSUB c32, c32, b6, c42 + NMSUB c21, c21, b7, c41 + NMSUB c22, c22, b7, c42 + NMSUB c11, c11, b8, c41 + NMSUB c12, c12, b8, c42 + + LD b6, 10 * SIZE(BO) + LD b7, 9 * SIZE(BO) + LD b8, 8 * SIZE(BO) + + MUL c31, b6, c31 + MUL c32, b6, c32 + + NMSUB c21, c21, b7, c31 + NMSUB c22, c22, b7, c32 + NMSUB c11, c11, b8, c31 + NMSUB c12, c12, b8, c32 + + LD b7, 5 * SIZE(BO) + LD b8, 4 * SIZE(BO) + + MUL c21, b7, c21 + MUL c22, b7, c22 + + NMSUB c11, c11, b8, c21 + NMSUB c12, c12, b8, c22 + + LD b8, 0 * SIZE(BO) + + MUL c11, b8, c11 + MUL c12, b8, c12 +#endif + +#ifdef LN + daddiu CO1, CO1, -2 * SIZE + daddiu CO2, CO2, -2 * SIZE + daddiu CO3, CO3, -2 * SIZE + daddiu CO4, CO4, -2 * SIZE +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) + ST c21, 1 * SIZE(BO) + ST c31, 2 * SIZE(BO) + ST c41, 3 * SIZE(BO) + ST c12, 4 * SIZE(BO) + ST c22, 5 * SIZE(BO) + ST c32, 6 * SIZE(BO) + ST c42, 7 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) + ST c12, 1 * SIZE(AO) + ST c21, 2 * SIZE(AO) + ST c22, 3 * SIZE(AO) + ST c31, 4 * SIZE(AO) + ST c32, 5 * SIZE(AO) + ST c41, 6 * SIZE(AO) + ST c42, 7 * SIZE(AO) +#endif + + ST c11, 0 * SIZE(CO1) + ST c12, 1 * SIZE(CO1) + ST c21, 0 * SIZE(CO2) + ST c22, 1 * SIZE(CO2) + ST c31, 0 * SIZE(CO3) + ST c32, 1 * SIZE(CO3) + ST c41, 0 * SIZE(CO4) + ST c42, 1 * SIZE(CO4) + +#ifndef LN + daddiu CO1, CO1, 2 * SIZE + daddiu CO2, CO2, 2 * SIZE + daddiu CO3, CO3, 2 * SIZE + daddiu CO4, CO4, 2 * SIZE +#endif + +#ifdef RT + dsll TEMP, K, 1 + BASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 2 + BASE_SHIFT + daddu AO, AO, 
L + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 2 +#endif + +#ifdef LN + daddiu KK, KK, -2 +#endif + + MTC $0, a1 + + MOV c11, a1 + MOV c21, a1 + MOV c31, a1 + + daddiu I, I, -1 + + bgtz I, .L31 + MOV c41, c11 + .align 3 + +.L40: + andi I, M, 1 + blez I, .L49 + MOV c61, c11 + +#if defined(LT) || defined(RN) + LD a1, 0 * SIZE(AO) + MOV c71, c11 + LD a2, 1 * SIZE(AO) + MOV c81, c11 + + LD b1, 0 * SIZE(B) + LD b2, 1 * SIZE(B) + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + LD b5, 4 * SIZE(B) + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + dsra L, KK, 2 + + blez L, .L45 + move BO, B +#else +#ifdef LN + dsll TEMP, K, BASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll L, KK, 0 + BASE_SHIFT + dsll TEMP, KK, 2 + BASE_SHIFT + + daddu AO, AORIG, L + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) + MOV c71, c11 + LD a2, 1 * SIZE(AO) + MOV c81, c11 + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + + dsra L, TEMP, 2 + + blez L, .L45 + NOP +#endif + .align 3 + +.L42: + MADD c11, c11, a1, b1 + LD b1, 16 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 5 * SIZE(BO) + MADD c31, c31, a1, b3 + LD b3, 6 * SIZE(BO) + MADD c41, c41, a1, b4 + LD b4, 7 * SIZE(BO) + + LD a1, 4 * SIZE(AO) + daddiu L, L, -1 + + MADD c11, c11, a2, b5 + LD b5, 20 * SIZE(BO) + MADD c21, c21, a2, b2 + LD b2, 9 * SIZE(BO) + MADD c31, c31, a2, b3 + LD b3, 10 * SIZE(BO) + MADD c41, c41, a2, b4 + LD b4, 11 * SIZE(BO) + + LD a2, 2 * SIZE(AO) + daddiu AO, AO, 4 * SIZE + + MADD c11, c11, a2, b6 + LD b6, 24 * SIZE(BO) + MADD c21, c21, a2, b2 + LD b2, 13 * SIZE(BO) + MADD c31, c31, a2, b3 + LD b3, 14 * SIZE(BO) + MADD c41, c41, a2, b4 + LD b4, 15 * SIZE(BO) + + LD a2, -1 * SIZE(AO) + daddiu BO, BO, 16 * SIZE + + MADD c11, c11, a2, b7 + LD b7, 12 * SIZE(BO) + MADD c21, c21, a2, b2 + LD b2, 1 * SIZE(BO) + MADD c31, c31, a2, b3 + LD b3, 2 * SIZE(BO) + MADD c41, c41, a2, b4 + LD b4, 3 * SIZE(BO) + + bgtz L, .L42 + LD a2, 1 * SIZE(AO) + .align 3 + +.L45: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L48 + NOP + .align 3 + +.L46: + MADD c11, c11, a1, b1 + LD b1, 4 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 5 * SIZE(BO) + MADD c31, c31, a1, b3 + LD b3, 6 * SIZE(BO) + MADD c41, c41, a1, b4 + LD a1, 1 * SIZE(AO) + + LD b4, 7 * SIZE(BO) + daddiu L, L, -1 + + daddiu AO, AO, 1 * SIZE + MOV a2, a2 + bgtz L, .L46 + daddiu BO, BO, 4 * SIZE + + +.L48: +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -1 +#else + daddiu TEMP, KK, -4 +#endif + + dsll L, TEMP, 0 + BASE_SHIFT + dsll TEMP, TEMP, 2 + BASE_SHIFT + daddu AO, AORIG, L + daddu BO, B, TEMP +#endif + + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 +#else + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 2 * SIZE(AO) + LD b4, 3 * SIZE(AO) + + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 +#endif + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(AO) + + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 +#endif + +#ifdef RN + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + MUL c11, b1, c11 + + NMSUB c21, c21, b2, c11 + NMSUB c31, c31, b3, c11 + NMSUB c41, c41, b4, c11 + + LD b2, 5 * SIZE(BO) + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + MUL c21, b2, c21 + + NMSUB 
c31, c31, b3, c21 + NMSUB c41, c41, b4, c21 + + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + + MUL c31, b3, c31 + + NMSUB c41, c41, b4, c31 + + LD b4, 15 * SIZE(BO) + + MUL c41, b4, c41 +#endif + +#ifdef RT + LD b5, 15 * SIZE(BO) + LD b6, 14 * SIZE(BO) + LD b7, 13 * SIZE(BO) + LD b8, 12 * SIZE(BO) + + MUL c41, b5, c41 + + NMSUB c31, c31, b6, c41 + NMSUB c21, c21, b7, c41 + NMSUB c11, c11, b8, c41 + + LD b6, 10 * SIZE(BO) + LD b7, 9 * SIZE(BO) + LD b8, 8 * SIZE(BO) + + MUL c31, b6, c31 + + NMSUB c21, c21, b7, c31 + NMSUB c11, c11, b8, c31 + + LD b7, 5 * SIZE(BO) + LD b8, 4 * SIZE(BO) + + MUL c21, b7, c21 + + NMSUB c11, c11, b8, c21 + + LD b8, 0 * SIZE(BO) + + MUL c11, b8, c11 +#endif + +#ifdef LN + daddiu CO1, CO1, -1 * SIZE + daddiu CO2, CO2, -1 * SIZE + daddiu CO3, CO3, -1 * SIZE + daddiu CO4, CO4, -1 * SIZE +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) + ST c21, 1 * SIZE(BO) + ST c31, 2 * SIZE(BO) + ST c41, 3 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) + ST c21, 1 * SIZE(AO) + ST c31, 2 * SIZE(AO) + ST c41, 3 * SIZE(AO) +#endif + + ST c11, 0 * SIZE(CO1) + ST c21, 0 * SIZE(CO2) + ST c31, 0 * SIZE(CO3) + ST c41, 0 * SIZE(CO4) + +#ifndef LN + daddiu CO1, CO1, 1 * SIZE + daddiu CO2, CO2, 1 * SIZE + daddiu CO3, CO3, 1 * SIZE + daddiu CO4, CO4, 1 * SIZE +#endif + +#ifdef RT + dsll TEMP, K, BASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll L, TEMP, 0 + BASE_SHIFT + dsll TEMP, TEMP, 2 + BASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 1 +#endif + +#ifdef LN + daddiu KK, KK, -1 +#endif + .align 3 + +.L49: +#ifdef LN + dsll TEMP, K, 2 + BASE_SHIFT + daddu B, B, TEMP +#endif + +#if defined(LT) || defined(RN) + move B, BO +#endif + +#ifdef RN + daddiu KK, KK, 4 +#endif + +#ifdef RT + daddiu KK, KK, -4 +#endif + .align 3 + +.L70: + dsra J, N, 3 + blez J, .L999 + nop + +.L10: +#ifdef RT + dsll TEMP, K, 3 + BASE_SHIFT + dsubu B, B, TEMP + + dsll TEMP, LDC, 3 + dsubu C, C, TEMP +#endif + + move CO1, C + MTC $0, c11 + daddu CO2, C, LDC + daddu CO3, CO2, LDC + daddiu J, J, -1 + daddu CO4, CO3, LDC + MOV c21, c11 + daddu CO5, CO4, LDC + MOV c31, c11 + daddu CO6, CO5, LDC + MOV c41, c11 + daddu CO7, CO6, LDC + MOV c51, c11 + daddu CO8, CO7, LDC + dsra I, M, 1 + +#ifdef LN + daddu KK, M, OFFSET +#endif + +#ifdef LT + move KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + daddu C, CO8, LDC +#endif + + blez I, .L20 + MOV c61, c11 + +.L11: +#if defined(LT) || defined(RN) + LD a1, 0 * SIZE(AO) + MOV c71, c11 + LD b1, 0 * SIZE(B) + MOV c81, c11 + + LD a3, 4 * SIZE(AO) + MOV c12, c11 + LD b2, 1 * SIZE(B) + MOV c22, c11 + + dsra L, KK, 2 + MOV c32, c11 + LD b3, 2 * SIZE(B) + MOV c42, c11 + + LD b4, 3 * SIZE(B) + MOV c52, c11 + LD b5, 4 * SIZE(B) + MOV c62, c11 + + LD b6, 8 * SIZE(B) + MOV c72, c11 + LD b7, 12 * SIZE(B) + MOV c82, c11 + + blez L, .L15 + move BO, B +#else + +#ifdef LN + dsll TEMP, K, 1 + BASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll L, KK, 1 + BASE_SHIFT + dsll TEMP, KK, 3 + BASE_SHIFT + + daddu AO, AORIG, L + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) + MOV c71, c11 + LD b1, 0 * SIZE(BO) + MOV c81, c11 + + LD a3, 4 * SIZE(AO) + MOV c12, c11 + LD b2, 1 * SIZE(BO) + MOV c22, c11 + + MOV c32, c11 + LD b3, 2 * SIZE(BO) + MOV c42, c11 + + LD b4, 3 * SIZE(BO) + MOV c52, c11 + LD b5, 4 * SIZE(BO) + MOV c62, c11 + + LD b6, 8 * SIZE(BO) + MOV c72, c11 + LD b7, 12 * SIZE(BO) + MOV c82, c11 + + dsra L, TEMP, 2 
+ blez L, .L15 + NOP +#endif + + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + daddiu L, L, -1 + MADD c31, c31, a1, b3 + blez L, .L13 + MADD c41, c41, a1, b4 + NOP + .align 3 + +.L12: + MADD c12, c12, a2, b1 + LD b1, 16 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD c51, c51, a1, b5 + NOP + MADD c61, c61, a1, b2 + LD a4, 2 * SIZE(AO) + MADD c71, c71, a1, b3 + NOP + MADD c81, c81, a1, b4 + LD a1, 8 * SIZE(AO) + + MADD c52, c52, a2, b5 + LD b5, 20 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 9 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 10 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 11 * SIZE(BO) + + MADD c11, c11, a4, b6 + LD a2, 3 * SIZE(AO) + MADD c21, c21, a4, b2 + NOP + MADD c31, c31, a4, b3 + NOP + MADD c41, c41, a4, b4 + NOP + + MADD c12, c12, a2, b6 + LD b6, 24 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 13 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 14 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD c51, c51, a4, b7 + NOP + MADD c61, c61, a4, b2 + NOP + MADD c71, c71, a4, b3 + NOP + MADD c81, c81, a4, b4 + NOP + + MADD c52, c52, a2, b7 + LD b7, 28 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 17 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 18 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 19 * SIZE(BO) + + MADD c11, c11, a3, b1 + LD a2, 5 * SIZE(AO) + MADD c21, c21, a3, b2 + NOP + MADD c31, c31, a3, b3 + NOP + MADD c41, c41, a3, b4 + NOP + + MADD c12, c12, a2, b1 + LD b1, 32 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 21 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 22 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 23 * SIZE(BO) + + MADD c51, c51, a3, b5 + NOP + MADD c61, c61, a3, b2 + LD a4, 6 * SIZE(AO) + MADD c71, c71, a3, b3 + NOP + MADD c81, c81, a3, b4 + LD a3, 12 * SIZE(AO) + + MADD c52, c52, a2, b5 + LD b5, 36 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 25 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 26 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 27 * SIZE(BO) + + MADD c11, c11, a4, b6 + LD a2, 7 * SIZE(AO) + MADD c21, c21, a4, b2 + NOP + MADD c31, c31, a4, b3 + NOP + MADD c41, c41, a4, b4 + daddiu L, L, -1 + + MADD c12, c12, a2, b6 + LD b6, 40 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 29 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 30 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 31 * SIZE(BO) + + MADD c51, c51, a4, b7 + daddiu BO, BO, 32 * SIZE + MADD c61, c61, a4, b2 + daddiu AO, AO, 8 * SIZE + MADD c71, c71, a4, b3 + NOP + MADD c81, c81, a4, b4 + NOP + + MADD c52, c52, a2, b7 + LD b7, 12 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 1 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 2 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 3 * SIZE(BO) + + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + NOP + MADD c31, c31, a1, b3 + bgtz L, .L12 + MADD c41, c41, a1, b4 + NOP + .align 3 + +.L13: + MADD c12, c12, a2, b1 + LD b1, 16 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD c51, c51, a1, b5 + NOP + MADD c61, c61, a1, b2 + LD a4, 2 * SIZE(AO) + MADD c71, c71, a1, b3 + NOP + MADD c81, c81, a1, b4 + LD a1, 8 * SIZE(AO) + + MADD c52, c52, a2, b5 + LD b5, 20 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 9 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 10 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 11 * SIZE(BO) + + MADD c11, c11, a4, b6 + LD a2, 3 * SIZE(AO) + MADD c21, c21, a4, b2 + NOP + MADD c31, c31, a4, b3 + NOP + MADD c41, 
c41, a4, b4 + NOP + + MADD c12, c12, a2, b6 + LD b6, 24 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 13 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 14 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD c51, c51, a4, b7 + NOP + MADD c61, c61, a4, b2 + NOP + MADD c71, c71, a4, b3 + NOP + MADD c81, c81, a4, b4 + NOP + + MADD c52, c52, a2, b7 + LD b7, 28 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 17 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 18 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 19 * SIZE(BO) + + MADD c11, c11, a3, b1 + LD a2, 5 * SIZE(AO) + MADD c21, c21, a3, b2 + NOP + MADD c31, c31, a3, b3 + NOP + MADD c41, c41, a3, b4 + NOP + + MADD c12, c12, a2, b1 + LD b1, 32 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 21 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 22 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 23 * SIZE(BO) + + MADD c51, c51, a3, b5 + NOP + MADD c61, c61, a3, b2 + LD a4, 6 * SIZE(AO) + MADD c71, c71, a3, b3 + NOP + MADD c81, c81, a3, b4 + LD a3, 12 * SIZE(AO) + + MADD c52, c52, a2, b5 + LD b5, 36 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 25 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 26 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 27 * SIZE(BO) + + MADD c11, c11, a4, b6 + LD a2, 7 * SIZE(AO) + MADD c21, c21, a4, b2 + NOP + MADD c31, c31, a4, b3 + NOP + MADD c41, c41, a4, b4 + NOP + + MADD c12, c12, a2, b6 + LD b6, 40 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 29 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 30 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 31 * SIZE(BO) + + MADD c51, c51, a4, b7 + daddiu BO, BO, 32 * SIZE + MADD c61, c61, a4, b2 + daddiu AO, AO, 8 * SIZE + MADD c71, c71, a4, b3 + NOP + MADD c81, c81, a4, b4 + NOP + + MADD c52, c52, a2, b7 + LD b7, 12 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 1 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 2 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 3 * SIZE(BO) + .align 3 + +.L15: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + blez L, .L18 + NOP + .align 3 + +.L16: + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + NOP + MADD c31, c31, a1, b3 + NOP + MADD c41, c41, a1, b4 + NOP + + MADD c12, c12, a2, b1 + LD b1, 8 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD c51, c51, a1, b5 + daddiu L, L, -1 + MADD c61, c61, a1, b2 + daddiu AO, AO, 2 * SIZE + MADD c71, c71, a1, b3 + daddiu BO, BO, 8 * SIZE + MADD c81, c81, a1, b4 + LD a1, 0 * SIZE(AO) + + MADD c52, c52, a2, b5 + LD b5, 4 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 1 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 2 * SIZE(BO) + MADD c82, c82, a2, b4 + bgtz L, .L16 + LD b4, 3 * SIZE(BO) + +.L18: +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -2 +#else + daddiu TEMP, KK, -8 +#endif + + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 3 + BASE_SHIFT + daddu AO, AORIG, L + daddu BO, B, TEMP +#endif + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + SUB c11, b1, c11 + LD b5, 4 * SIZE(BO) + SUB c21, b2, c21 + LD b6, 5 * SIZE(BO) + SUB c31, b3, c31 + LD b7, 6 * SIZE(BO) + SUB c41, b4, c41 + LD b8, 7 * SIZE(BO) + + SUB c51, b5, c51 + LD b1, 8 * SIZE(BO) + SUB c61, b6, c61 + LD b2, 9 * SIZE(BO) + SUB c71, b7, c71 + LD b3, 10 * SIZE(BO) + SUB c81, b8, c81 + LD b4, 11 * SIZE(BO) + + SUB c12, b1, c12 + LD b5, 12 * SIZE(BO) + SUB c22, b2, c22 + LD b6, 13 * SIZE(BO) + SUB c32, b3, c32 + LD b7, 14 * SIZE(BO) + SUB c42, b4, 
c42 + LD b8, 15 * SIZE(BO) + + SUB c52, b5, c52 +#ifdef LN + LD b1, 3 * SIZE(AO) +#else + LD b1, 0 * SIZE(AO) +#endif + SUB c62, b6, c62 + SUB c72, b7, c72 + SUB c82, b8, c82 +#else + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 2 * SIZE(AO) + LD b4, 3 * SIZE(AO) + + SUB c11, b1, c11 + LD b5, 4 * SIZE(AO) + SUB c12, b2, c12 + LD b6, 5 * SIZE(AO) + SUB c21, b3, c21 + LD b7, 6 * SIZE(AO) + SUB c22, b4, c22 + LD b8, 7 * SIZE(AO) + + SUB c31, b5, c31 + LD b1, 8 * SIZE(AO) + SUB c32, b6, c32 + LD b2, 9 * SIZE(AO) + SUB c41, b7, c41 + LD b3, 10 * SIZE(AO) + SUB c42, b8, c42 + LD b4, 11 * SIZE(AO) + + LD b5, 12 * SIZE(AO) + SUB c51, b1, c51 + LD b6, 13 * SIZE(AO) + SUB c52, b2, c52 + LD b7, 14 * SIZE(AO) + SUB c61, b3, c61 + LD b8, 15 * SIZE(AO) + SUB c62, b4, c62 + + SUB c71, b5, c71 + SUB c72, b6, c72 + SUB c81, b7, c81 + SUB c82, b8, c82 +#endif + +#ifdef LN + MUL c12, b1, c12 + LD b2, 2 * SIZE(AO) + MUL c22, b1, c22 + MUL c32, b1, c32 + MUL c42, b1, c42 + MUL c52, b1, c52 + MUL c62, b1, c62 + MUL c72, b1, c72 + MUL c82, b1, c82 + + NMSUB c11, c11, b2, c12 + LD b3, 0 * SIZE(AO) + NMSUB c21, c21, b2, c22 + NMSUB c31, c31, b2, c32 + NMSUB c41, c41, b2, c42 + NMSUB c51, c51, b2, c52 + NMSUB c61, c61, b2, c62 + NMSUB c71, c71, b2, c72 + NMSUB c81, c81, b2, c82 + + MUL c11, b3, c11 + daddiu CO1, CO1, -2 * SIZE + MUL c21, b3, c21 + daddiu CO2, CO2, -2 * SIZE + MUL c31, b3, c31 + daddiu CO3, CO3, -2 * SIZE + MUL c41, b3, c41 + daddiu CO4, CO4, -2 * SIZE + MUL c51, b3, c51 + daddiu CO5, CO5, -2 * SIZE + MUL c61, b3, c61 + daddiu CO6, CO6, -2 * SIZE + MUL c71, b3, c71 + daddiu CO7, CO7, -2 * SIZE + MUL c81, b3, c81 + daddiu CO8, CO8, -2 * SIZE +#endif + +#ifdef LT + MUL c11, b1, c11 + LD b2, 1 * SIZE(AO) + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + MUL c51, b1, c51 + MUL c61, b1, c61 + MUL c71, b1, c71 + MUL c81, b1, c81 + + NMSUB c12, c12, b2, c11 + LD b3, 3 * SIZE(AO) + NMSUB c22, c22, b2, c21 + NMSUB c32, c32, b2, c31 + NMSUB c42, c42, b2, c41 + NMSUB c52, c52, b2, c51 + NMSUB c62, c62, b2, c61 + NMSUB c72, c72, b2, c71 + NMSUB c82, c82, b2, c81 + + MUL c12, b3, c12 + MUL c22, b3, c22 + MUL c32, b3, c32 + MUL c42, b3, c42 + MUL c52, b3, c52 + MUL c62, b3, c62 + MUL c72, b3, c72 + MUL c82, b3, c82 +#endif + +#ifdef RN + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + MUL c11, b1, c11 + MUL c12, b1, c12 + LD b5, 4 * SIZE(BO) + + NMSUB c21, c21, b2, c11 + NMSUB c22, c22, b2, c12 + LD b6, 5 * SIZE(BO) + NMSUB c31, c31, b3, c11 + NMSUB c32, c32, b3, c12 + LD b7, 6 * SIZE(BO) + NMSUB c41, c41, b4, c11 + NMSUB c42, c42, b4, c12 + LD b8, 7 * SIZE(BO) + + NMSUB c51, c51, b5, c11 + NMSUB c52, c52, b5, c12 + LD b2, 9 * SIZE(BO) + NMSUB c61, c61, b6, c11 + NMSUB c62, c62, b6, c12 + LD b3, 10 * SIZE(BO) + NMSUB c71, c71, b7, c11 + NMSUB c72, c72, b7, c12 + LD b4, 11 * SIZE(BO) + NMSUB c81, c81, b8, c11 + NMSUB c82, c82, b8, c12 + LD b5, 12 * SIZE(BO) + + MUL c21, b2, c21 + MUL c22, b2, c22 + LD b6, 13 * SIZE(BO) + + NMSUB c31, c31, b3, c21 + NMSUB c32, c32, b3, c22 + LD b7, 14 * SIZE(BO) + NMSUB c41, c41, b4, c21 + NMSUB c42, c42, b4, c22 + LD b8, 15 * SIZE(BO) + NMSUB c51, c51, b5, c21 + NMSUB c52, c52, b5, c22 + LD b3, 18 * SIZE(BO) + NMSUB c61, c61, b6, c21 + NMSUB c62, c62, b6, c22 + LD b4, 19 * SIZE(BO) + NMSUB c71, c71, b7, c21 + NMSUB c72, c72, b7, c22 + LD b5, 20 * SIZE(BO) + NMSUB c81, c81, b8, c21 + NMSUB c82, c82, b8, c22 + LD b6, 21 * SIZE(BO) + + MUL c31, b3, c31 + MUL c32, b3, c32 + LD b7, 22 * SIZE(BO) + + NMSUB c41, c41, b4, c31 + 
NMSUB c42, c42, b4, c32 + LD b8, 23 * SIZE(BO) + NMSUB c51, c51, b5, c31 + NMSUB c52, c52, b5, c32 + LD b4, 27 * SIZE(BO) + NMSUB c61, c61, b6, c31 + NMSUB c62, c62, b6, c32 + LD b5, 28 * SIZE(BO) + NMSUB c71, c71, b7, c31 + NMSUB c72, c72, b7, c32 + LD b6, 29 * SIZE(BO) + NMSUB c81, c81, b8, c31 + NMSUB c82, c82, b8, c32 + LD b7, 30 * SIZE(BO) + + MUL c41, b4, c41 + MUL c42, b4, c42 + LD b8, 31 * SIZE(BO) + + NMSUB c51, c51, b5, c41 + NMSUB c52, c52, b5, c42 + LD b5, 36 * SIZE(BO) + NMSUB c61, c61, b6, c41 + NMSUB c62, c62, b6, c42 + LD b6, 37 * SIZE(BO) + NMSUB c71, c71, b7, c41 + NMSUB c72, c72, b7, c42 + LD b7, 38 * SIZE(BO) + NMSUB c81, c81, b8, c41 + NMSUB c82, c82, b8, c42 + LD b8, 39 * SIZE(BO) + + MUL c51, b5, c51 + MUL c52, b5, c52 + + NMSUB c61, c61, b6, c51 + NMSUB c62, c62, b6, c52 + LD b6, 45 * SIZE(BO) + NMSUB c71, c71, b7, c51 + NMSUB c72, c72, b7, c52 + LD b7, 46 * SIZE(BO) + NMSUB c81, c81, b8, c51 + NMSUB c82, c82, b8, c52 + LD b8, 47 * SIZE(BO) + + MUL c61, b6, c61 + MUL c62, b6, c62 + + NMSUB c71, c71, b7, c61 + NMSUB c72, c72, b7, c62 + LD b7, 54 * SIZE(BO) + NMSUB c81, c81, b8, c61 + NMSUB c82, c82, b8, c62 + LD b8, 55 * SIZE(BO) + + MUL c71, b7, c71 + MUL c72, b7, c72 + + NMSUB c81, c81, b8, c71 + NMSUB c82, c82, b8, c72 + LD b8, 63 * SIZE(BO) + + MUL c81, b8, c81 + MUL c82, b8, c82 +#endif + +#ifdef RT + LD b1, 63 * SIZE(BO) + LD b2, 62 * SIZE(BO) + LD b3, 61 * SIZE(BO) + LD b4, 60 * SIZE(BO) + + MUL c81, b1, c81 + MUL c82, b1, c82 + LD b5, 59 * SIZE(BO) + + NMSUB c71, c71, b2, c81 + NMSUB c72, c72, b2, c82 + LD b6, 58 * SIZE(BO) + NMSUB c61, c61, b3, c81 + NMSUB c62, c62, b3, c82 + LD b7, 57 * SIZE(BO) + NMSUB c51, c51, b4, c81 + NMSUB c52, c52, b4, c82 + LD b8, 56 * SIZE(BO) + + NMSUB c41, c41, b5, c81 + NMSUB c42, c42, b5, c82 + LD b2, 54 * SIZE(BO) + NMSUB c31, c31, b6, c81 + NMSUB c32, c32, b6, c82 + LD b3, 53 * SIZE(BO) + NMSUB c21, c21, b7, c81 + NMSUB c22, c22, b7, c82 + LD b4, 52 * SIZE(BO) + NMSUB c11, c11, b8, c81 + NMSUB c12, c12, b8, c82 + LD b5, 51 * SIZE(BO) + + MUL c71, b2, c71 + MUL c72, b2, c72 + LD b6, 50 * SIZE(BO) + + NMSUB c61, c61, b3, c71 + NMSUB c62, c62, b3, c72 + LD b7, 49 * SIZE(BO) + NMSUB c51, c51, b4, c71 + NMSUB c52, c52, b4, c72 + LD b8, 48 * SIZE(BO) + NMSUB c41, c41, b5, c71 + NMSUB c42, c42, b5, c72 + LD b3, 45 * SIZE(BO) + NMSUB c31, c31, b6, c71 + NMSUB c32, c32, b6, c72 + LD b4, 44 * SIZE(BO) + NMSUB c21, c21, b7, c71 + NMSUB c22, c22, b7, c72 + LD b5, 43 * SIZE(BO) + NMSUB c11, c11, b8, c71 + NMSUB c12, c12, b8, c72 + LD b6, 42 * SIZE(BO) + + MUL c61, b3, c61 + MUL c62, b3, c62 + LD b7, 41 * SIZE(BO) + + NMSUB c51, c51, b4, c61 + NMSUB c52, c52, b4, c62 + LD b8, 40 * SIZE(BO) + NMSUB c41, c41, b5, c61 + NMSUB c42, c42, b5, c62 + LD b4, 36 * SIZE(BO) + NMSUB c31, c31, b6, c61 + NMSUB c32, c32, b6, c62 + LD b5, 35 * SIZE(BO) + NMSUB c21, c21, b7, c61 + NMSUB c22, c22, b7, c62 + LD b6, 34 * SIZE(BO) + NMSUB c11, c11, b8, c61 + NMSUB c12, c12, b8, c62 + LD b7, 33 * SIZE(BO) + + MUL c51, b4, c51 + MUL c52, b4, c52 + LD b8, 32 * SIZE(BO) + + NMSUB c41, c41, b5, c51 + NMSUB c42, c42, b5, c52 + LD b5, 27 * SIZE(BO) + NMSUB c31, c31, b6, c51 + NMSUB c32, c32, b6, c52 + LD b6, 26 * SIZE(BO) + NMSUB c21, c21, b7, c51 + NMSUB c22, c22, b7, c52 + LD b7, 25 * SIZE(BO) + NMSUB c11, c11, b8, c51 + NMSUB c12, c12, b8, c52 + LD b8, 24 * SIZE(BO) + + MUL c41, b5, c41 + MUL c42, b5, c42 + + NMSUB c31, c31, b6, c41 + NMSUB c32, c32, b6, c42 + LD b6, 18 * SIZE(BO) + NMSUB c21, c21, b7, c41 + NMSUB c22, c22, b7, c42 + LD b7, 17 * SIZE(BO) + NMSUB 
c11, c11, b8, c41 + NMSUB c12, c12, b8, c42 + LD b8, 16 * SIZE(BO) + + MUL c31, b6, c31 + MUL c32, b6, c32 + + NMSUB c21, c21, b7, c31 + NMSUB c22, c22, b7, c32 + LD b7, 9 * SIZE(BO) + NMSUB c11, c11, b8, c31 + NMSUB c12, c12, b8, c32 + LD b8, 8 * SIZE(BO) + + MUL c21, b7, c21 + MUL c22, b7, c22 + + NMSUB c11, c11, b8, c21 + NMSUB c12, c12, b8, c22 + LD b8, 0 * SIZE(BO) + + MUL c11, b8, c11 + MUL c12, b8, c12 +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) + ST c21, 1 * SIZE(BO) + ST c31, 2 * SIZE(BO) + ST c41, 3 * SIZE(BO) + ST c51, 4 * SIZE(BO) + ST c61, 5 * SIZE(BO) + ST c71, 6 * SIZE(BO) + ST c81, 7 * SIZE(BO) + + ST c12, 8 * SIZE(BO) + ST c22, 9 * SIZE(BO) + ST c32, 10 * SIZE(BO) + ST c42, 11 * SIZE(BO) + ST c52, 12 * SIZE(BO) + ST c62, 13 * SIZE(BO) + ST c72, 14 * SIZE(BO) + ST c82, 15 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) + ST c12, 1 * SIZE(AO) + ST c21, 2 * SIZE(AO) + ST c22, 3 * SIZE(AO) + ST c31, 4 * SIZE(AO) + ST c32, 5 * SIZE(AO) + ST c41, 6 * SIZE(AO) + ST c42, 7 * SIZE(AO) + + ST c51, 8 * SIZE(AO) + ST c52, 9 * SIZE(AO) + ST c61, 10 * SIZE(AO) + ST c62, 11 * SIZE(AO) + ST c71, 12 * SIZE(AO) + ST c72, 13 * SIZE(AO) + ST c81, 14 * SIZE(AO) + ST c82, 15 * SIZE(AO) +#endif + + ST c11, 0 * SIZE(CO1) + ST c12, 1 * SIZE(CO1) + ST c21, 0 * SIZE(CO2) + ST c22, 1 * SIZE(CO2) + ST c31, 0 * SIZE(CO3) + ST c32, 1 * SIZE(CO3) + ST c41, 0 * SIZE(CO4) + ST c42, 1 * SIZE(CO4) + ST c51, 0 * SIZE(CO5) + ST c52, 1 * SIZE(CO5) + ST c61, 0 * SIZE(CO6) + ST c62, 1 * SIZE(CO6) + ST c71, 0 * SIZE(CO7) + ST c72, 1 * SIZE(CO7) + ST c81, 0 * SIZE(CO8) + ST c82, 1 * SIZE(CO8) + + MTC $0, a1 + +#ifndef LN + daddiu CO1, CO1, 2 * SIZE + daddiu CO2, CO2, 2 * SIZE + daddiu CO3, CO3, 2 * SIZE + daddiu CO4, CO4, 2 * SIZE + daddiu CO5, CO5, 2 * SIZE + daddiu CO6, CO6, 2 * SIZE + daddiu CO7, CO7, 2 * SIZE + daddiu CO8, CO8, 2 * SIZE +#endif + + MOV c11, a1 + MOV c21, a1 + +#ifdef RT + dsll TEMP, K, 1 + BASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + + MOV c31, a1 + MOV c41, a1 + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll L, TEMP, 1 + BASE_SHIFT + dsll TEMP, TEMP, 3 + BASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 2 +#endif + +#ifdef LN + daddiu KK, KK, -2 +#endif + + daddiu I, I, -1 + MOV c51, a1 + + bgtz I, .L11 + MOV c61, a1 + .align 3 + +.L20: + andi I, M, 1 + MOV c61, c11 + blez I, .L29 + MOV c71, c11 + +#if defined(LT) || defined(RN) + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(B) + LD b2, 1 * SIZE(B) + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + LD b5, 4 * SIZE(B) + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + dsra L, KK, 2 + MOV c81, c11 + + blez L, .L25 + move BO, B +#else + +#ifdef LN + dsll TEMP, K, 0 + BASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll L, KK, 0 + BASE_SHIFT + dsll TEMP, KK, 3 + BASE_SHIFT + + daddu AO, AORIG, L + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 8 * SIZE(BO) + LD b7, 12 * SIZE(BO) + + dsra L, TEMP, 2 + MOV c81, c11 + + blez L, .L25 + NOP +#endif + .align 3 + +.L22: + MADD c11, c11, a1, b1 + LD b1, 16 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 5 * SIZE(BO) + MADD c31, c31, a1, b3 + LD b3, 6 * SIZE(BO) + MADD c41, c41, a1, b4 + LD b4, 7 * SIZE(BO) + + MADD c51, c51, a1, b5 + LD b5, 20 * SIZE(BO) + MADD c61, c61, a1, b2 + LD b2, 9 * 
SIZE(BO) + MADD c71, c71, a1, b3 + LD b3, 10 * SIZE(BO) + MADD c81, c81, a1, b4 + LD b4, 11 * SIZE(BO) + + LD a1, 4 * SIZE(AO) + daddiu L, L, -1 + + MADD c11, c11, a2, b6 + LD b6, 24 * SIZE(BO) + MADD c21, c21, a2, b2 + LD b2, 13 * SIZE(BO) + MADD c31, c31, a2, b3 + LD b3, 14 * SIZE(BO) + MADD c41, c41, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD c51, c51, a2, b7 + LD b7, 28 * SIZE(BO) + MADD c61, c61, a2, b2 + LD b2, 17 * SIZE(BO) + MADD c71, c71, a2, b3 + LD b3, 18 * SIZE(BO) + MADD c81, c81, a2, b4 + LD b4, 19 * SIZE(BO) + + LD a2, 5 * SIZE(AO) + daddiu AO, AO, 4 * SIZE + + MADD c11, c11, a3, b1 + LD b1, 32 * SIZE(BO) + MADD c21, c21, a3, b2 + LD b2, 21 * SIZE(BO) + MADD c31, c31, a3, b3 + LD b3, 22 * SIZE(BO) + MADD c41, c41, a3, b4 + LD b4, 23 * SIZE(BO) + + MADD c51, c51, a3, b5 + LD b5, 36 * SIZE(BO) + MADD c61, c61, a3, b2 + LD b2, 25 * SIZE(BO) + MADD c71, c71, a3, b3 + LD b3, 26 * SIZE(BO) + MADD c81, c81, a3, b4 + LD b4, 27 * SIZE(BO) + + LD a3, 2 * SIZE(AO) + daddiu BO, BO, 32 * SIZE + + MADD c11, c11, a4, b6 + LD b6, 8 * SIZE(BO) + MADD c21, c21, a4, b2 + LD b2, -3 * SIZE(BO) + MADD c31, c31, a4, b3 + LD b3, -2 * SIZE(BO) + MADD c41, c41, a4, b4 + LD b4, -1 * SIZE(BO) + + MADD c51, c51, a4, b7 + LD b7, 12 * SIZE(BO) + MADD c61, c61, a4, b2 + LD b2, 1 * SIZE(BO) + MADD c71, c71, a4, b3 + LD b3, 2 * SIZE(BO) + MADD c81, c81, a4, b4 + LD b4, 3 * SIZE(BO) + bgtz L, .L22 + LD a4, 3 * SIZE(AO) + .align 3 + +.L25: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L28 + NOP + .align 3 + +.L26: + MADD c11, c11, a1, b1 + LD b1, 8 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 5 * SIZE(BO) + MADD c31, c31, a1, b3 + LD b3, 6 * SIZE(BO) + MADD c41, c41, a1, b4 + LD b4, 7 * SIZE(BO) + + daddiu L, L, -1 + MOV a2, a2 + daddiu AO, AO, 1 * SIZE + daddiu BO, BO, 8 * SIZE + + MADD c51, c51, a1, b5 + LD b5, 4 * SIZE(BO) + MADD c61, c61, a1, b2 + LD b2, 1 * SIZE(BO) + MADD c71, c71, a1, b3 + LD b3, 2 * SIZE(BO) + MADD c81, c81, a1, b4 + LD a1, 0 * SIZE(AO) + + bgtz L, .L26 + LD b4, 3 * SIZE(BO) + +.L28: +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -1 +#else + daddiu TEMP, KK, -8 +#endif + + dsll L, TEMP, 0 + BASE_SHIFT + dsll TEMP, TEMP, 3 + BASE_SHIFT + daddu AO, AORIG, L + daddu BO, B, TEMP +#endif + + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c51, b5, c51 + SUB c61, b6, c61 + SUB c71, b7, c71 + SUB c81, b8, c81 +#else + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 2 * SIZE(AO) + LD b4, 3 * SIZE(AO) + LD b5, 4 * SIZE(AO) + LD b6, 5 * SIZE(AO) + LD b7, 6 * SIZE(AO) + LD b8, 7 * SIZE(AO) + + SUB c11, b1, c11 + SUB c21, b2, c21 + SUB c31, b3, c31 + SUB c41, b4, c41 + SUB c51, b5, c51 + SUB c61, b6, c61 + SUB c71, b7, c71 + SUB c81, b8, c81 +#endif + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(AO) + + MUL c11, b1, c11 + MUL c21, b1, c21 + MUL c31, b1, c31 + MUL c41, b1, c41 + MUL c51, b1, c51 + MUL c61, b1, c61 + MUL c71, b1, c71 + MUL c81, b1, c81 +#endif + +#ifdef RN + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + MUL c11, b1, c11 + + NMSUB c21, c21, b2, c11 + NMSUB c31, c31, b3, c11 + NMSUB c41, c41, b4, c11 + NMSUB c51, c51, b5, c11 + NMSUB c61, c61, b6, 
c11 + NMSUB c71, c71, b7, c11 + NMSUB c81, c81, b8, c11 + + LD b2, 9 * SIZE(BO) + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + LD b5, 12 * SIZE(BO) + LD b6, 13 * SIZE(BO) + LD b7, 14 * SIZE(BO) + LD b8, 15 * SIZE(BO) + + MUL c21, b2, c21 + + NMSUB c31, c31, b3, c21 + NMSUB c41, c41, b4, c21 + NMSUB c51, c51, b5, c21 + NMSUB c61, c61, b6, c21 + NMSUB c71, c71, b7, c21 + NMSUB c81, c81, b8, c21 + + LD b3, 18 * SIZE(BO) + LD b4, 19 * SIZE(BO) + LD b5, 20 * SIZE(BO) + LD b6, 21 * SIZE(BO) + LD b7, 22 * SIZE(BO) + LD b8, 23 * SIZE(BO) + + MUL c31, b3, c31 + + NMSUB c41, c41, b4, c31 + NMSUB c51, c51, b5, c31 + NMSUB c61, c61, b6, c31 + NMSUB c71, c71, b7, c31 + NMSUB c81, c81, b8, c31 + + LD b4, 27 * SIZE(BO) + LD b5, 28 * SIZE(BO) + LD b6, 29 * SIZE(BO) + LD b7, 30 * SIZE(BO) + LD b8, 31 * SIZE(BO) + + MUL c41, b4, c41 + + NMSUB c51, c51, b5, c41 + NMSUB c61, c61, b6, c41 + NMSUB c71, c71, b7, c41 + NMSUB c81, c81, b8, c41 + + LD b5, 36 * SIZE(BO) + LD b6, 37 * SIZE(BO) + LD b7, 38 * SIZE(BO) + LD b8, 39 * SIZE(BO) + + MUL c51, b5, c51 + + NMSUB c61, c61, b6, c51 + NMSUB c71, c71, b7, c51 + NMSUB c81, c81, b8, c51 + + LD b6, 45 * SIZE(BO) + LD b7, 46 * SIZE(BO) + LD b8, 47 * SIZE(BO) + + MUL c61, b6, c61 + + NMSUB c71, c71, b7, c61 + NMSUB c81, c81, b8, c61 + + LD b7, 54 * SIZE(BO) + LD b8, 55 * SIZE(BO) + + MUL c71, b7, c71 + + NMSUB c81, c81, b8, c71 + + LD b8, 63 * SIZE(BO) + + MUL c81, b8, c81 +#endif + +#ifdef RT + LD b1, 63 * SIZE(BO) + LD b2, 62 * SIZE(BO) + LD b3, 61 * SIZE(BO) + LD b4, 60 * SIZE(BO) + LD b5, 59 * SIZE(BO) + LD b6, 58 * SIZE(BO) + LD b7, 57 * SIZE(BO) + LD b8, 56 * SIZE(BO) + + MUL c81, b1, c81 + + NMSUB c71, c71, b2, c81 + NMSUB c61, c61, b3, c81 + NMSUB c51, c51, b4, c81 + NMSUB c41, c41, b5, c81 + NMSUB c31, c31, b6, c81 + NMSUB c21, c21, b7, c81 + NMSUB c11, c11, b8, c81 + + LD b2, 54 * SIZE(BO) + LD b3, 53 * SIZE(BO) + LD b4, 52 * SIZE(BO) + LD b5, 51 * SIZE(BO) + LD b6, 50 * SIZE(BO) + LD b7, 49 * SIZE(BO) + LD b8, 48 * SIZE(BO) + + MUL c71, b2, c71 + + NMSUB c61, c61, b3, c71 + NMSUB c51, c51, b4, c71 + NMSUB c41, c41, b5, c71 + NMSUB c31, c31, b6, c71 + NMSUB c21, c21, b7, c71 + NMSUB c11, c11, b8, c71 + + LD b3, 45 * SIZE(BO) + LD b4, 44 * SIZE(BO) + LD b5, 43 * SIZE(BO) + LD b6, 42 * SIZE(BO) + LD b7, 41 * SIZE(BO) + LD b8, 40 * SIZE(BO) + + MUL c61, b3, c61 + + NMSUB c51, c51, b4, c61 + NMSUB c41, c41, b5, c61 + NMSUB c31, c31, b6, c61 + NMSUB c21, c21, b7, c61 + NMSUB c11, c11, b8, c61 + + LD b4, 36 * SIZE(BO) + LD b5, 35 * SIZE(BO) + LD b6, 34 * SIZE(BO) + LD b7, 33 * SIZE(BO) + LD b8, 32 * SIZE(BO) + + MUL c51, b4, c51 + + NMSUB c41, c41, b5, c51 + NMSUB c31, c31, b6, c51 + NMSUB c21, c21, b7, c51 + NMSUB c11, c11, b8, c51 + + LD b5, 27 * SIZE(BO) + LD b6, 26 * SIZE(BO) + LD b7, 25 * SIZE(BO) + LD b8, 24 * SIZE(BO) + + MUL c41, b5, c41 + + NMSUB c31, c31, b6, c41 + NMSUB c21, c21, b7, c41 + NMSUB c11, c11, b8, c41 + + LD b6, 18 * SIZE(BO) + LD b7, 17 * SIZE(BO) + LD b8, 16 * SIZE(BO) + + MUL c31, b6, c31 + + NMSUB c21, c21, b7, c31 + NMSUB c11, c11, b8, c31 + + LD b7, 9 * SIZE(BO) + LD b8, 8 * SIZE(BO) + + MUL c21, b7, c21 + + NMSUB c11, c11, b8, c21 + + LD b8, 0 * SIZE(BO) + + MUL c11, b8, c11 +#endif + +#ifdef LN + daddiu CO1, CO1, -1 * SIZE + daddiu CO2, CO2, -1 * SIZE + daddiu CO3, CO3, -1 * SIZE + daddiu CO4, CO4, -1 * SIZE + daddiu CO5, CO5, -1 * SIZE + daddiu CO6, CO6, -1 * SIZE + daddiu CO7, CO7, -1 * SIZE + daddiu CO8, CO8, -1 * SIZE +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) + ST c21, 1 * SIZE(BO) + ST c31, 2 * SIZE(BO) + 
ST c41, 3 * SIZE(BO) + ST c51, 4 * SIZE(BO) + ST c61, 5 * SIZE(BO) + ST c71, 6 * SIZE(BO) + ST c81, 7 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) + ST c21, 1 * SIZE(AO) + ST c31, 2 * SIZE(AO) + ST c41, 3 * SIZE(AO) + ST c51, 4 * SIZE(AO) + ST c61, 5 * SIZE(AO) + ST c71, 6 * SIZE(AO) + ST c81, 7 * SIZE(AO) +#endif + + ST c11, 0 * SIZE(CO1) + ST c21, 0 * SIZE(CO2) + ST c31, 0 * SIZE(CO3) + ST c41, 0 * SIZE(CO4) + ST c51, 0 * SIZE(CO5) + ST c61, 0 * SIZE(CO6) + ST c71, 0 * SIZE(CO7) + ST c81, 0 * SIZE(CO8) + +#ifndef LN + daddiu CO1, CO1, 1 * SIZE + daddiu CO2, CO2, 1 * SIZE + daddiu CO3, CO3, 1 * SIZE + daddiu CO4, CO4, 1 * SIZE + daddiu CO5, CO5, 1 * SIZE + daddiu CO6, CO6, 1 * SIZE + daddiu CO7, CO7, 1 * SIZE + daddiu CO8, CO8, 1 * SIZE +#endif + +#ifdef RT + dsll TEMP, K, BASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll L, TEMP, 0 + BASE_SHIFT + dsll TEMP, TEMP, 3 + BASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 1 +#endif + +#ifdef LN + daddiu KK, KK, -1 +#endif + .align 3 + +.L29: +#ifdef LN + dsll TEMP, K, 3 + BASE_SHIFT + daddu B, B, TEMP +#endif + +#if defined(LT) || defined(RN) + move B, BO +#endif + +#ifdef RN + daddiu KK, KK, 8 +#endif + +#ifdef RT + daddiu KK, KK, -8 +#endif + + bgtz J, .L10 + NOP + .align 3 + + + +.L999: + LDARG $16, 0($sp) + LDARG $17, 8($sp) + LDARG $18, 16($sp) + LDARG $19, 24($sp) + LDARG $20, 32($sp) + LDARG $21, 40($sp) + ldc1 $f24, 48($sp) + ldc1 $f25, 56($sp) + ldc1 $f26, 64($sp) + ldc1 $f27, 72($sp) + ldc1 $f28, 80($sp) + + LDARG $22, 88($sp) + LDARG $23, 96($sp) + LDARG $24, 104($sp) + LDARG $25, 112($sp) + +#ifndef __64BIT__ + ldc1 $f20,112($sp) + ldc1 $f21,120($sp) + ldc1 $f22,128($sp) + ldc1 $f23,136($sp) +#endif + + j $31 + daddiu $sp, $sp, 144 + + EPILOGUE diff --git a/kernel/mips64/zamax.S b/kernel/mips64/zamax.S new file mode 100644 index 0000000000..e993867efd --- /dev/null +++ b/kernel/mips64/zamax.S @@ -0,0 +1,245 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 +#define X $5 +#define INCX $6 + +#define I $2 +#define TEMP $3 + +#define a1 $f4 +#define a2 $f5 +#define a3 $f6 +#define a4 $f7 +#define a5 $f8 +#define a6 $f9 +#define a7 $f10 +#define a8 $f11 + +#define t1 $f12 +#define t2 $f13 +#define t3 $f14 +#define t4 $f15 +#define t5 $f16 +#define t6 $f17 +#define t7 $f18 +#define t8 $f19 + +#define s1 $f0 +#define s2 $f1 +#define s3 $f2 +#define s4 $f3 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + blez N, .L999 + MTC $0, s1 + + blez INCX, .L999 + dsll INCX, INCX, ZBASE_SHIFT + + LD a1, 0 * SIZE(X) + daddiu N, N, -1 + + LD a2, 1 * SIZE(X) + daddu X, X, INCX + + FABS t1, a1 + FABS t2, a2 + + blez N, .L999 + ADD s1, t1, t2 + + NOP + ADD s2, t1, t2 + + dsra I, N, 2 + ADD s3, t1, t2 + + blez I, .L15 + ADD s4, t1, t2 + + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + daddu X, X, INCX + LD a3, 0 * SIZE(X) + LD a4, 1 * SIZE(X) + daddu X, X, INCX + LD a5, 0 * SIZE(X) + LD a6, 1 * SIZE(X) + daddu X, X, INCX + LD a7, 0 * SIZE(X) + LD a8, 1 * SIZE(X) + daddiu I, I, -1 + + blez I, .L13 + daddu X, X, INCX + .align 3 + +.L12: + FABS t1, a1 + LD a1, 0 * SIZE(X) + FABS t2, a2 + LD a2, 1 * SIZE(X) + + FABS t3, a3 + daddu X, X, INCX + FABS t4, a4 + NOP + + FABS t5, a5 + LD a3, 0 * SIZE(X) + FABS t6, a6 + LD a4, 1 * SIZE(X) + + FABS t7, a7 + daddu X, X, INCX + FABS t8, a8 + NOP + + ADD t1, t1, t2 + LD a5, 0 * SIZE(X) + ADD t3, t3, t4 + LD a6, 1 * SIZE(X) + + ADD t5, t5, t6 + daddu X, X, INCX + ADD t7, t7, t8 + NOP + + CMPLT $fcc0, s1, t1 + LD a7, 0 * SIZE(X) + CMPLT $fcc1, s2, t3 + LD a8, 1 * SIZE(X) + + CMPLT $fcc2, s3, t5 + daddu X, X, INCX + CMPLT $fcc3, s4, t7 + NOP + + CMOVT s1, t1, $fcc0 + daddiu I, I, -1 + CMOVT s2, t3, $fcc1 + NOP + + CMOVT s3, t5, $fcc2 + bgtz I, .L12 + + CMOVT s4, t7, $fcc3 + NOP + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + + FABS t5, a5 + FABS t6, a6 + FABS t7, a7 + FABS t8, a8 + + ADD t1, t1, t2 + ADD t3, t3, t4 + ADD t5, t5, t6 + ADD t7, t7, t8 + + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t3 + CMPLT $fcc2, s3, t5 + CMPLT $fcc3, s4, t7 + + CMOVT s1, t1, $fcc0 + CMOVT s2, t3, $fcc1 + CMOVT s3, t5, $fcc2 + CMOVT s4, t7, $fcc3 + .align 3 + +.L15: + andi I, N, 3 + + blez I, .L998 + NOP + .align 3 + +.L16: + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + + daddiu I, I, -1 + + FABS t1, a1 + FABS t2, a2 + + + ADD t1, t1, t2 + + CMPLT $fcc0, s1, t1 + + CMOVT s1, t1, $fcc0 + + bgtz I, .L16 + daddu X, X, INCX + .align 3 + +.L998: + CMPLT $fcc0, s1, s2 + CMPLT $fcc1, s3, s4 + + CMOVT s1, s2, $fcc0 + CMOVT s3, s4, $fcc1 + 
+ CMPLT $fcc0, s1, s3 + CMOVT s1, s3, $fcc0 + .align 3 + +.L999: + j $31 + NOP + + EPILOGUE diff --git a/kernel/mips64/zamin.S b/kernel/mips64/zamin.S new file mode 100644 index 0000000000..bd1d509f1d --- /dev/null +++ b/kernel/mips64/zamin.S @@ -0,0 +1,245 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 +#define X $5 +#define INCX $6 + +#define I $2 +#define TEMP $3 + +#define a1 $f4 +#define a2 $f5 +#define a3 $f6 +#define a4 $f7 +#define a5 $f8 +#define a6 $f9 +#define a7 $f10 +#define a8 $f11 + +#define t1 $f12 +#define t2 $f13 +#define t3 $f14 +#define t4 $f15 +#define t5 $f16 +#define t6 $f17 +#define t7 $f18 +#define t8 $f19 + +#define s1 $f0 +#define s2 $f1 +#define s3 $f2 +#define s4 $f3 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + blez N, .L999 + MTC $0, s1 + + blez INCX, .L999 + dsll INCX, INCX, ZBASE_SHIFT + + LD a1, 0 * SIZE(X) + daddiu N, N, -1 + + LD a2, 1 * SIZE(X) + daddu X, X, INCX + + FABS t1, a1 + FABS t2, a2 + + blez N, .L999 + ADD s1, t1, t2 + + NOP + ADD s2, t1, t2 + + dsra I, N, 2 + ADD s3, t1, t2 + + blez I, .L15 + ADD s4, t1, t2 + + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + daddu X, X, INCX + LD a3, 0 * SIZE(X) + LD a4, 1 * SIZE(X) + daddu X, X, INCX + LD a5, 0 * SIZE(X) + LD a6, 1 * SIZE(X) + daddu X, X, INCX + LD a7, 0 * SIZE(X) + LD a8, 1 * SIZE(X) + daddiu I, I, -1 + + blez I, .L13 + daddu X, X, INCX + .align 3 + +.L12: + FABS t1, a1 + LD a1, 0 * SIZE(X) + FABS t2, a2 + LD a2, 1 * SIZE(X) + + FABS t3, a3 + daddu X, X, INCX + FABS t4, a4 + NOP + + FABS t5, a5 + LD a3, 0 * SIZE(X) + FABS t6, a6 + LD a4, 1 * SIZE(X) + + FABS t7, a7 + daddu X, X, INCX + FABS t8, a8 + NOP + + ADD t1, t1, t2 + LD a5, 0 * SIZE(X) + ADD t3, t3, t4 + LD a6, 1 * SIZE(X) + + ADD t5, t5, t6 + daddu X, X, INCX + ADD t7, t7, t8 + NOP + + CMPLT $fcc0, t1, s1 + LD a7, 0 * SIZE(X) + CMPLT $fcc1, t3, s2 + LD a8, 1 * SIZE(X) + + CMPLT $fcc2, t5, s3 + daddu X, X, INCX + CMPLT $fcc3, t7, s4 + NOP + + CMOVT s1, t1, $fcc0 + daddiu I, I, -1 + CMOVT s2, t3, $fcc1 + NOP + + CMOVT s3, t5, $fcc2 + bgtz I, .L12 + + CMOVT s4, t7, $fcc3 + NOP + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + + FABS t5, a5 + FABS t6, a6 + FABS t7, a7 + FABS t8, a8 + + ADD t1, t1, t2 + ADD t3, t3, t4 + ADD t5, t5, t6 + ADD t7, t7, t8 + + CMPLT $fcc0, t1, s1 + CMPLT $fcc1, t3, s2 + CMPLT $fcc2, t5, s3 + CMPLT $fcc3, t7, s4 + + CMOVT s1, t1, $fcc0 + CMOVT s2, t3, $fcc1 + CMOVT s3, t5, $fcc2 + CMOVT s4, t7, $fcc3 + .align 3 + +.L15: + andi I, N, 3 + + blez I, .L998 + NOP + .align 3 + +.L16: + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + + daddiu I, I, -1 + + FABS t1, a1 + FABS t2, a2 + + + ADD t1, t1, t2 + + CMPLT $fcc0, t1, s1 + + CMOVT s1, t1, $fcc0 + + bgtz I, .L16 + daddu X, X, INCX + .align 3 + +.L998: + CMPLT $fcc0, s2, s1 + CMPLT $fcc1, s4, s3 + + CMOVT s1, s2, $fcc0 + CMOVT s3, s4, $fcc1 + + CMPLT $fcc0, s3, s1 + CMOVT s1, s3, $fcc0 + .align 3 + +.L999: + j $31 + NOP + + EPILOGUE diff --git a/kernel/mips64/zasum.S b/kernel/mips64/zasum.S new file mode 100644 index 0000000000..d6dc205845 --- /dev/null +++ b/kernel/mips64/zasum.S @@ -0,0 +1,204 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 +#define X $5 +#define INCX $6 + +#define I $2 +#define TEMP $3 + +#define a1 $f2 +#define a2 $f3 +#define a3 $f4 +#define a4 $f5 +#define a5 $f6 +#define a6 $f7 +#define a7 $f8 +#define a8 $f9 + +#define t1 $f10 +#define t2 $f11 +#define t3 $f12 +#define t4 $f13 + +#define s1 $f0 +#define s2 $f1 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + MTC $0, s1 + + MTC $0, s2 + dsll INCX, INCX, ZBASE_SHIFT + + blez N, .L999 + dsra I, N, 2 + + blez I, .L25 + NOP + + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + daddu X, X, INCX + + LD a3, 0 * SIZE(X) + LD a4, 1 * SIZE(X) + daddu X, X, INCX + + LD a5, 0 * SIZE(X) + LD a6, 1 * SIZE(X) + daddu X, X, INCX + + FABS t1, a1 + FABS t2, a2 + + LD a7, 0 * SIZE(X) + LD a8, 1 * SIZE(X) + + FABS t3, a3 + FABS t4, a4 + daddiu I, I, -1 + + blez I, .L24 + daddu X, X, INCX + .align 3 + +.L23: + ADD s1, s1, t1 + LD a1, 0 * SIZE(X) + + FABS t1, a5 + daddiu I, I, -1 + + ADD s2, s2, t2 + LD a2, 1 * SIZE(X) + + FABS t2, a6 + daddu X, X, INCX + + ADD s1, s1, t3 + LD a3, 0 * SIZE(X) + + FABS t3, a7 + NOP + + ADD s2, s2, t4 + LD a4, 1 * SIZE(X) + + FABS t4, a8 + daddu X, X, INCX + + ADD s1, s1, t1 + LD a5, 0 * SIZE(X) + + FABS t1, a1 + NOP + + ADD s2, s2, t2 + LD a6, 1 * SIZE(X) + + FABS t2, a2 + daddu X, X, INCX + + ADD s1, s1, t3 + LD a7, 0 * SIZE(X) + + FABS t3, a3 + LD a8, 1 * SIZE(X) + + ADD s2, s2, t4 + daddu X, X, INCX + + bgtz I, .L23 + FABS t4, a4 + .align 3 + +.L24: + ADD s1, s1, t1 + FABS t1, a5 + + ADD s2, s2, t2 + FABS t2, a6 + + ADD s1, s1, t3 + FABS t3, a7 + + ADD s2, s2, t4 + FABS t4, a8 + + ADD s1, s1, t1 + ADD s2, s2, t2 + ADD s1, s1, t3 + ADD s2, s2, t4 + .align 3 + +.L25: + andi I, N, 3 + + blez I, .L999 + NOP + .align 3 + +.L26: + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + + FABS t1, a1 + daddiu I, I, -1 + FABS t2, a2 + daddu X, X, INCX + + ADD s1, s1, t1 + bgtz I, .L26 + ADD s2, s2, t2 + .align 3 + +.L999: + j $31 + ADD s1, s1, s2 + + EPILOGUE diff --git a/kernel/mips64/zaxpy.S b/kernel/mips64/zaxpy.S new file mode 100644 index 0000000000..8a7b29a768 --- 
/dev/null +++ b/kernel/mips64/zaxpy.S @@ -0,0 +1,438 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 + +#define X $9 +#define INCX $10 +#define Y $11 +#define INCY $8 + +#define I $2 +#define TEMP $3 + +#define YY $5 + +#define ALPHA_R $f15 +#define ALPHA_I $f16 + +#define a1 $f0 +#define a2 $f1 +#define a3 $f2 +#define a4 $f3 +#define a5 $f4 +#define a6 $f5 +#define a7 $f6 +#define a8 $f7 + +#define b1 $f8 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f17 + +#define t1 $f18 +#define t2 $f19 +#define t3 $f20 +#define t4 $f21 + +#ifndef CONJ +#define MADD1 NMSUB +#define MADD2 MADD +#else +#define MADD1 MADD +#define MADD2 NMSUB +#endif + + PROLOGUE + + LDARG INCY, 0($sp) + li TEMP, 2 * SIZE + +#ifndef __64BIT__ + daddiu $sp, $sp, -16 + sdc1 $f20, 0($sp) + sdc1 $f21, 8($sp) +#endif + + blez N, .L999 + dsll INCX, INCX, ZBASE_SHIFT + + bne INCX, TEMP, .L20 + dsll INCY, INCY, ZBASE_SHIFT + + bne INCY, TEMP, .L20 + dsra I, N, 2 + + blez I, .L15 + daddiu I, I, -1 + + LD a1, 0 * SIZE(X) + LD b1, 0 * SIZE(Y) + LD a2, 1 * SIZE(X) + LD b2, 1 * SIZE(Y) + LD a3, 2 * SIZE(X) + LD b3, 2 * SIZE(Y) + LD a4, 3 * SIZE(X) + LD b4, 3 * SIZE(Y) + LD a5, 4 * SIZE(X) + LD b5, 4 * SIZE(Y) + LD a6, 5 * SIZE(X) + LD b6, 5 * SIZE(Y) + LD a7, 6 * SIZE(X) + LD b7, 6 * SIZE(Y) + LD a8, 7 * SIZE(X) + LD b8, 7 * SIZE(Y) + + blez I, .L13 + NOP + .align 3 + +.L12: + MADD t1, b1, ALPHA_R, a1 + LD b1, 8 * SIZE(Y) + MADD t2, b2, ALPHA_I, a1 + LD a1, 8 * SIZE(X) + MADD t3, b3, ALPHA_R, a3 + LD b3, 10 * SIZE(Y) + MADD t4, b4, ALPHA_I, a3 + LD a3, 10 * SIZE(X) + + MADD1 t1, t1, ALPHA_I, a2 + LD b2, 9 * SIZE(Y) + MADD2 t2, t2, ALPHA_R, a2 + LD a2, 9 * SIZE(X) + MADD1 t3, t3, ALPHA_I, a4 + LD b4, 11 * SIZE(Y) + MADD2 t4, t4, ALPHA_R, a4 + LD a4, 11 * SIZE(X) + + ST t1, 0 * SIZE(Y) + ST t2, 1 * SIZE(Y) + ST t3, 2 * SIZE(Y) + ST t4, 3 * SIZE(Y) + + MADD t1, b5, ALPHA_R, a5 + LD b5, 12 * SIZE(Y) + MADD t2, b6, ALPHA_I, a5 + LD a5, 12 * SIZE(X) + MADD t3, b7, ALPHA_R, a7 + LD b7, 14 * SIZE(Y) + MADD t4, b8, ALPHA_I, a7 + LD a7, 14 * SIZE(X) + + MADD1 t1, t1, ALPHA_I, a6 + LD b6, 13 * SIZE(Y) + MADD2 t2, t2, ALPHA_R, a6 + LD a6, 13 * SIZE(X) + MADD1 t3, t3, ALPHA_I, a8 + LD b8, 15 * SIZE(Y) + MADD2 t4, t4, ALPHA_R, a8 + LD a8, 15 * SIZE(X) + + ST t1, 4 * SIZE(Y) + ST t2, 5 * SIZE(Y) + ST t3, 6 * SIZE(Y) + ST t4, 7 * SIZE(Y) + + daddiu I, I, -1 + daddiu Y, Y, 8 * SIZE + + bgtz I, .L12 + daddiu X, X, 8 * SIZE + .align 3 + +.L13: + MADD t1, b1, ALPHA_R, a1 + MADD t2, b2, ALPHA_I, a1 + MADD t3, b3, ALPHA_R, a3 + MADD t4, b4, ALPHA_I, a3 + + MADD1 t1, t1, ALPHA_I, a2 + MADD2 t2, t2, ALPHA_R, a2 + MADD1 t3, t3, ALPHA_I, a4 + MADD2 t4, t4, ALPHA_R, a4 + + ST t1, 0 * SIZE(Y) + MADD t1, b5, ALPHA_R, a5 + ST t2, 1 * SIZE(Y) + MADD t2, b6, ALPHA_I, a5 + ST t3, 2 * SIZE(Y) + MADD t3, b7, ALPHA_R, a7 + ST t4, 3 * SIZE(Y) + MADD t4, b8, ALPHA_I, a7 + + MADD1 t1, t1, ALPHA_I, a6 + MADD2 t2, t2, ALPHA_R, a6 + MADD1 t3, t3, ALPHA_I, a8 + MADD2 t4, t4, ALPHA_R, a8 + + ST t1, 4 * SIZE(Y) + ST t2, 5 * SIZE(Y) + ST t3, 6 * SIZE(Y) + ST t4, 7 * SIZE(Y) + + daddiu X, X, 8 * SIZE + daddiu Y, Y, 8 * SIZE + .align 3 + +.L15: + andi I, N, 3 + + blez I, .L999 + NOP + .align 3 + +.L16: + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + LD b1, 0 * SIZE(Y) + LD b2, 1 * SIZE(Y) + + MADD t1, b1, ALPHA_R, a1 + daddiu X, X, 2 * SIZE + MADD t2, b2, ALPHA_I, a1 + + MADD1 t1, t1, ALPHA_I, a2 + daddiu I, I, -1 + MADD2 t2, t2, ALPHA_R, a2 + daddiu Y, Y, 2 * SIZE + + ST t1, -2 * 
SIZE(Y) + + bgtz I, .L16 + ST t2, -1 * SIZE(Y) + +#ifndef __64BIT__ + ldc1 $f20, 0($sp) + ldc1 $f21, 8($sp) + daddiu $sp, $sp, 16 +#endif + + j $31 + NOP + .align 3 + +.L20: + dsra I, N, 2 + move YY, Y + + blez I, .L25 + daddiu I, I, -1 + + LD a1, 0 * SIZE(X) + LD b1, 0 * SIZE(Y) + LD a2, 1 * SIZE(X) + LD b2, 1 * SIZE(Y) + daddu X, X, INCX + daddu Y, Y, INCY + + LD a3, 0 * SIZE(X) + LD b3, 0 * SIZE(Y) + LD a4, 1 * SIZE(X) + LD b4, 1 * SIZE(Y) + daddu X, X, INCX + daddu Y, Y, INCY + + LD a5, 0 * SIZE(X) + LD b5, 0 * SIZE(Y) + LD a6, 1 * SIZE(X) + LD b6, 1 * SIZE(Y) + daddu X, X, INCX + daddu Y, Y, INCY + + LD a7, 0 * SIZE(X) + blez I, .L23 + LD b7, 0 * SIZE(Y) + .align 3 + +.L22: + MADD t1, b1, ALPHA_R, a1 + LD b8, 1 * SIZE(Y) + daddu Y, Y, INCY + MADD t2, b2, ALPHA_I, a1 + LD a8, 1 * SIZE(X) + daddu X, X, INCX + + MADD t3, b3, ALPHA_R, a3 + LD b1, 0 * SIZE(Y) + MADD t4, b4, ALPHA_I, a3 + LD a1, 0 * SIZE(X) + + MADD1 t1, t1, ALPHA_I, a2 + LD b2, 1 * SIZE(Y) + daddu Y, Y, INCY + MADD2 t2, t2, ALPHA_R, a2 + LD a2, 1 * SIZE(X) + daddu X, X, INCX + + MADD1 t3, t3, ALPHA_I, a4 + LD a3, 0 * SIZE(X) + MADD2 t4, t4, ALPHA_R, a4 + LD b3, 0 * SIZE(Y) + + ST t1, 0 * SIZE(YY) + ST t2, 1 * SIZE(YY) + daddu YY, YY, INCY + ST t3, 0 * SIZE(YY) + ST t4, 1 * SIZE(YY) + daddu YY, YY, INCY + + MADD t1, b5, ALPHA_R, a5 + LD a4, 1 * SIZE(X) + daddu X, X, INCX + MADD t2, b6, ALPHA_I, a5 + LD b4, 1 * SIZE(Y) + daddu Y, Y, INCY + + MADD t3, b7, ALPHA_R, a7 + LD b5, 0 * SIZE(Y) + MADD t4, b8, ALPHA_I, a7 + LD a5, 0 * SIZE(X) + + MADD1 t1, t1, ALPHA_I, a6 + LD b6, 1 * SIZE(Y) + daddu Y, Y, INCY + MADD2 t2, t2, ALPHA_R, a6 + LD a6, 1 * SIZE(X) + daddu X, X, INCX + + MADD1 t3, t3, ALPHA_I, a8 + LD b7, 0 * SIZE(Y) + MADD2 t4, t4, ALPHA_R, a8 + LD a7, 0 * SIZE(X) + + ST t1, 0 * SIZE(YY) + ST t2, 1 * SIZE(YY) + daddu YY, YY, INCY + ST t3, 0 * SIZE(YY) + ST t4, 1 * SIZE(YY) + daddu YY, YY, INCY + + + daddiu I, I, -1 + + bgtz I, .L22 + NOP + .align 3 + +.L23: + MADD t1, b1, ALPHA_R, a1 + LD a8, 1 * SIZE(X) + MADD t2, b2, ALPHA_I, a1 + LD b8, 1 * SIZE(Y) + MADD t3, b3, ALPHA_R, a3 + daddu X, X, INCX + MADD t4, b4, ALPHA_I, a3 + daddu Y, Y, INCY + + MADD1 t1, t1, ALPHA_I, a2 + MADD2 t2, t2, ALPHA_R, a2 + MADD1 t3, t3, ALPHA_I, a4 + MADD2 t4, t4, ALPHA_R, a4 + + ST t1, 0 * SIZE(YY) + MADD t1, b5, ALPHA_R, a5 + ST t2, 1 * SIZE(YY) + MADD t2, b6, ALPHA_I, a5 + daddu YY, YY, INCY + + ST t3, 0 * SIZE(YY) + MADD t3, b7, ALPHA_R, a7 + ST t4, 1 * SIZE(YY) + MADD t4, b8, ALPHA_I, a7 + daddu YY, YY, INCY + + MADD1 t1, t1, ALPHA_I, a6 + MADD2 t2, t2, ALPHA_R, a6 + MADD1 t3, t3, ALPHA_I, a8 + MADD2 t4, t4, ALPHA_R, a8 + + ST t1, 0 * SIZE(YY) + ST t2, 1 * SIZE(YY) + daddu YY, YY, INCY + ST t3, 0 * SIZE(YY) + ST t4, 1 * SIZE(YY) + daddu YY, YY, INCY + .align 3 + +.L25: + andi I, N, 3 + + blez I, .L999 + NOP + .align 3 + +.L26: + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + LD b1, 0 * SIZE(Y) + LD b2, 1 * SIZE(Y) + + MADD t1, b1, ALPHA_R, a1 + MADD t2, b2, ALPHA_I, a1 + daddu X, X, INCX + + MADD1 t1, t1, ALPHA_I, a2 + MADD2 t2, t2, ALPHA_R, a2 + daddiu I, I, -1 + + ST t1, 0 * SIZE(Y) + ST t2, 1 * SIZE(Y) + + bgtz I, .L26 + daddu Y, Y, INCY + .align 3 + +.L999: +#ifndef __64BIT__ + ldc1 $f20, 0($sp) + ldc1 $f21, 8($sp) + daddiu $sp, $sp, 16 +#endif + + j $31 + NOP + + EPILOGUE diff --git a/kernel/mips64/zcopy.S b/kernel/mips64/zcopy.S new file mode 100644 index 0000000000..5a4ce9c98e --- /dev/null +++ b/kernel/mips64/zcopy.S @@ -0,0 +1,265 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The 
University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 +#define X $5 +#define INCX $6 +#define Y $7 +#define INCY $8 + +#define I $2 +#define TEMP $3 + +#define a1 $f0 +#define a2 $f1 +#define a3 $f2 +#define a4 $f3 +#define a5 $f4 +#define a6 $f5 +#define a7 $f6 +#define a8 $f7 + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) + LDINT INCY, 0(INCY) +#endif + + li TEMP, 2 * SIZE + NOP + + blez N, .L999 + dsll INCX, INCX, ZBASE_SHIFT + + bne INCX, TEMP, .L20 + dsll INCY, INCY, ZBASE_SHIFT + + bne INCY, TEMP, .L20 + dsra I, N, 2 + + blez I, .L15 + daddiu I, I, -1 + + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + LD a3, 2 * SIZE(X) + LD a4, 3 * SIZE(X) + LD a5, 4 * SIZE(X) + LD a6, 5 * SIZE(X) + LD a7, 6 * SIZE(X) + LD a8, 7 * SIZE(X) + + blez I, .L13 + NOP + .align 3 + +.L12: + ST a1, 0 * SIZE(Y) + LD a1, 8 * SIZE(X) + + ST a2, 1 * SIZE(Y) + LD a2, 9 * SIZE(X) + + ST a3, 2 * SIZE(Y) + LD a3, 10 * SIZE(X) + + ST a4, 3 * SIZE(Y) + LD a4, 11 * SIZE(X) + + ST a5, 4 * SIZE(Y) + LD a5, 12 * SIZE(X) + + ST a6, 5 * SIZE(Y) + LD a6, 13 * SIZE(X) + + ST a7, 6 * SIZE(Y) + LD a7, 14 * SIZE(X) + + ST a8, 7 * SIZE(Y) + LD a8, 15 * SIZE(X) + + daddiu I, I, -1 + daddiu X, X, 8 * SIZE + + bgtz I, .L12 + daddiu Y, Y, 8 * SIZE + .align 3 + +.L13: + ST a1, 0 * SIZE(Y) + ST a2, 1 * SIZE(Y) + ST a3, 2 * SIZE(Y) + ST a4, 3 * SIZE(Y) + ST a5, 4 * SIZE(Y) + ST a6, 5 * SIZE(Y) + ST a7, 6 * SIZE(Y) + ST a8, 7 * SIZE(Y) + + daddiu X, X, 8 * SIZE + daddiu Y, Y, 8 * SIZE + .align 3 + +.L15: + andi I, N, 3 + + blez I, .L999 + NOP + .align 3 + +.L16: + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + + daddiu X, X, 2 * SIZE + daddiu Y, Y, 2 * SIZE + + ST a1, -2 * SIZE(Y) + daddiu I, I, -1 + + bgtz I, .L16 + 
ST a2, -1 * SIZE(Y) + + j $31 + NOP + .align 3 + +.L20: + dsra I, N, 2 + + blez I, .L25 + daddiu I, I, -1 + + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + daddu X, X, INCX + LD a3, 0 * SIZE(X) + LD a4, 1 * SIZE(X) + daddu X, X, INCX + LD a5, 0 * SIZE(X) + LD a6, 1 * SIZE(X) + daddu X, X, INCX + LD a7, 0 * SIZE(X) + LD a8, 1 * SIZE(X) + + blez I, .L23 + daddu X, X, INCX + .align 3 + +.L22: + ST a1, 0 * SIZE(Y) + LD a1, 0 * SIZE(X) + + ST a2, 1 * SIZE(Y) + daddu Y, Y, INCY + LD a2, 1 * SIZE(X) + daddu X, X, INCX + + ST a3, 0 * SIZE(Y) + LD a3, 0 * SIZE(X) + + ST a4, 1 * SIZE(Y) + daddu Y, Y, INCY + LD a4, 1 * SIZE(X) + daddu X, X, INCX + + ST a5, 0 * SIZE(Y) + LD a5, 0 * SIZE(X) + + ST a6, 1 * SIZE(Y) + daddu Y, Y, INCY + LD a6, 1 * SIZE(X) + daddu X, X, INCX + + ST a7, 0 * SIZE(Y) + LD a7, 0 * SIZE(X) + + ST a8, 1 * SIZE(Y) + daddu Y, Y, INCY + LD a8, 1 * SIZE(X) + + daddiu I, I, -1 + + bgtz I, .L22 + daddu X, X, INCX + .align 3 + +.L23: + ST a1, 0 * SIZE(Y) + ST a2, 1 * SIZE(Y) + daddu Y, Y, INCY + ST a3, 0 * SIZE(Y) + ST a4, 1 * SIZE(Y) + daddu Y, Y, INCY + ST a5, 0 * SIZE(Y) + ST a6, 1 * SIZE(Y) + daddu Y, Y, INCY + ST a7, 0 * SIZE(Y) + ST a8, 1 * SIZE(Y) + daddu Y, Y, INCY + .align 3 + +.L25: + andi I, N, 3 + + blez I, .L999 + NOP + .align 3 + +.L26: + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + daddu X, X, INCX + + daddiu I, I, -1 + ST a1, 0 * SIZE(Y) + ST a2, 1 * SIZE(Y) + + bgtz I, .L26 + daddu Y, Y, INCY + .align 3 + +.L999: + j $31 + NOP + + EPILOGUE diff --git a/kernel/mips64/zdot.S b/kernel/mips64/zdot.S new file mode 100644 index 0000000000..c50fe318e5 --- /dev/null +++ b/kernel/mips64/zdot.S @@ -0,0 +1,402 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 +#define X $5 +#define INCX $6 +#define Y $7 +#define INCY $8 + +#define I $2 +#define TEMP $3 + +#define a1 $f4 +#define a2 $f5 +#define a3 $f6 +#define a4 $f7 +#define b1 $f8 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 + +#define s1 $f0 +#define s2 $f1 +#define s3 $f2 +#define s4 $f3 + + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) + LDINT INCY, 0(INCY) +#endif + + MTC $0, s1 + + MOV s2, s1 + MOV s3, s2 + MOV s4, s3 + + dsll INCX, INCX, ZBASE_SHIFT + li TEMP, 2 * SIZE + + blez N, .L999 + dsll INCY, INCY, ZBASE_SHIFT + + bne INCX, TEMP, .L20 + dsra I, N, 2 + + bne INCY, TEMP, .L20 + NOP + + blez I, .L15 + NOP + + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + LD b1, 0 * SIZE(Y) + daddiu I, I, -1 + + blez I, .L14 + LD b2, 1 * SIZE(Y) + .align 3 + +.L13: + MADD s1, s1, a1, b1 + LD a3, 2 * SIZE(X) + MADD s2, s2, a2, b1 + LD a4, 3 * SIZE(X) + MADD s3, s3, a1, b2 + LD b3, 2 * SIZE(Y) + MADD s4, s4, a2, b2 + LD b4, 3 * SIZE(Y) + + MADD s1, s1, a3, b3 + LD a1, 4 * SIZE(X) + MADD s2, s2, a4, b3 + LD a2, 5 * SIZE(X) + MADD s3, s3, a3, b4 + LD b1, 4 * SIZE(Y) + MADD s4, s4, a4, b4 + LD b2, 5 * SIZE(Y) + + MADD s1, s1, a1, b1 + LD a3, 6 * SIZE(X) + MADD s2, s2, a2, b1 + LD a4, 7 * SIZE(X) + MADD s3, s3, a1, b2 + LD b3, 6 * SIZE(Y) + MADD s4, s4, a2, b2 + LD b4, 7 * SIZE(Y) + + MADD s1, s1, a3, b3 + LD a1, 8 * SIZE(X) + MADD s2, s2, a4, b3 + LD a2, 9 * SIZE(X) + MADD s3, s3, a3, b4 + LD b1, 8 * SIZE(Y) + MADD s4, s4, a4, b4 + LD b2, 9 * SIZE(Y) + + daddiu I, I, -1 + daddiu X, X, 8 * SIZE + + bgtz I, .L13 + daddiu Y, Y, 8 * SIZE + .align 3 + +.L14: + MADD s1, s1, a1, b1 + LD a3, 2 * SIZE(X) + MADD s2, s2, a2, b1 + LD a4, 3 * SIZE(X) + MADD s3, s3, a1, b2 + LD b3, 2 * SIZE(Y) + MADD s4, s4, a2, b2 + LD b4, 3 * SIZE(Y) + + MADD s1, s1, a3, b3 + LD a1, 4 * SIZE(X) + MADD s2, s2, a4, b3 + LD a2, 5 * SIZE(X) + MADD s3, s3, a3, b4 + LD b1, 4 * SIZE(Y) + MADD s4, s4, a4, b4 + LD b2, 5 * SIZE(Y) + + MADD s1, s1, a1, b1 + LD a3, 6 * SIZE(X) + MADD s2, s2, a2, b1 + LD a4, 7 * SIZE(X) + MADD s3, s3, a1, b2 + LD b3, 6 * SIZE(Y) + MADD s4, s4, a2, b2 + LD b4, 7 * SIZE(Y) + + MADD s1, s1, a3, b3 + daddiu X, X, 8 * SIZE + MADD s2, s2, a4, b3 + daddiu Y, Y, 8 * SIZE + MADD s3, s3, a3, b4 + MADD s4, s4, a4, b4 + .align 3 + +.L15: + andi I, N, 3 + + blez I, .L999 + NOP + + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + + LD b1, 0 * SIZE(Y) + daddiu I, I, -1 + + blez I, .L17 + LD b2, 1 * SIZE(Y) + .align 3 + +.L16: + MADD s1, s1, a1, b1 + daddiu I, I, -1 + MADD s2, s2, a2, b1 + LD b1, 2 * SIZE(Y) + MADD s3, s3, a1, b2 + LD a1, 2 * SIZE(X) + MADD s4, s4, a2, b2 + LD a2, 3 * SIZE(X) + + LD b2, 3 * SIZE(Y) + daddiu X, X, 2 * SIZE + + bgtz I, .L16 + daddiu Y, Y, 2 * SIZE + .align 3 + +.L17: + MADD s1, s1, a1, b1 + MADD s2, s2, a2, b1 + NOP + MADD s3, s3, a1, b2 + j .L999 + MADD s4, s4, a2, b2 + .align 3 + +.L20: +#ifdef F_INTERFACE + bgez INCX, .L21 + daddiu TEMP, N, -1 + + mult TEMP, INCX + + mflo TEMP + dsub X, X, TEMP + .align 3 + +.L21: + bgez INCY, .L22 + daddiu TEMP, N, -1 + + mult TEMP, INCY + + mflo TEMP + dsub Y, Y, TEMP + .align 3 + +.L22: +#endif + blez I, .L25 + NOP + + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + LD b1, 0 * SIZE(Y) + LD b2, 1 * SIZE(Y) + + dadd X, X, INCX + daddiu I, I, -1 + + blez I, .L24 + dadd Y, Y, INCY + .align 3 + +.L23: + MADD s1, s1, a1, b1 + LD a3, 0 * SIZE(X) + MADD s2, s2, a2, b1 + LD a4, 1 * SIZE(X) + MADD s3, s3, a1, b2 + LD b3, 0 
* SIZE(Y) + MADD s4, s4, a2, b2 + LD b4, 1 * SIZE(Y) + + dadd X, X, INCX + dadd Y, Y, INCY + + MADD s1, s1, a3, b3 + LD a1, 0 * SIZE(X) + MADD s2, s2, a4, b3 + LD a2, 1 * SIZE(X) + MADD s3, s3, a3, b4 + LD b1, 0 * SIZE(Y) + MADD s4, s4, a4, b4 + LD b2, 1 * SIZE(Y) + + dadd X, X, INCX + dadd Y, Y, INCY + + MADD s1, s1, a1, b1 + LD a3, 0 * SIZE(X) + MADD s2, s2, a2, b1 + LD a4, 1 * SIZE(X) + MADD s3, s3, a1, b2 + LD b3, 0 * SIZE(Y) + MADD s4, s4, a2, b2 + LD b4, 1 * SIZE(Y) + + dadd X, X, INCX + dadd Y, Y, INCY + + MADD s1, s1, a3, b3 + LD a1, 0 * SIZE(X) + MADD s2, s2, a4, b3 + LD a2, 1 * SIZE(X) + MADD s3, s3, a3, b4 + LD b1, 0 * SIZE(Y) + MADD s4, s4, a4, b4 + LD b2, 1 * SIZE(Y) + + dadd X, X, INCX + daddiu I, I, -1 + + bgtz I, .L23 + dadd Y, Y, INCY + .align 3 + +.L24: + MADD s1, s1, a1, b1 + LD a3, 0 * SIZE(X) + MADD s2, s2, a2, b1 + LD a4, 1 * SIZE(X) + MADD s3, s3, a1, b2 + LD b3, 0 * SIZE(Y) + MADD s4, s4, a2, b2 + LD b4, 1 * SIZE(Y) + + dadd X, X, INCX + dadd Y, Y, INCY + + MADD s1, s1, a3, b3 + LD a1, 0 * SIZE(X) + MADD s2, s2, a4, b3 + LD a2, 1 * SIZE(X) + MADD s3, s3, a3, b4 + LD b1, 0 * SIZE(Y) + MADD s4, s4, a4, b4 + LD b2, 1 * SIZE(Y) + + dadd X, X, INCX + dadd Y, Y, INCY + + MADD s1, s1, a1, b1 + LD a3, 0 * SIZE(X) + MADD s2, s2, a2, b1 + LD a4, 1 * SIZE(X) + MADD s3, s3, a1, b2 + LD b3, 0 * SIZE(Y) + MADD s4, s4, a2, b2 + LD b4, 1 * SIZE(Y) + + MADD s1, s1, a3, b3 + dadd X, X, INCX + MADD s2, s2, a4, b3 + dadd Y, Y, INCY + MADD s3, s3, a3, b4 + MADD s4, s4, a4, b4 + .align 3 + +.L25: + andi I, N, 3 + + blez I, .L999 + NOP + .align 3 + +.L26: + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + LD b1, 0 * SIZE(Y) + LD b2, 1 * SIZE(Y) + + MADD s1, s1, a1, b1 + MADD s2, s2, a2, b1 + MADD s3, s3, a1, b2 + MADD s4, s4, a2, b2 + + + dadd X, X, INCX + dadd Y, Y, INCY + + daddiu I, I, -1 + + bgtz I, .L26 + NOP + .align 3 + +.L999: + NOP +#ifndef CONJ + SUB s1, s1, s4 +#else + ADD s1, s1, s4 +#endif + + j $31 +#ifndef CONJ + ADD s3, s3, s2 +#else + SUB s3, s3, s2 +#endif + + EPILOGUE diff --git a/kernel/mips64/zgemm3m_kernel.S b/kernel/mips64/zgemm3m_kernel.S new file mode 100644 index 0000000000..14bb7469c2 --- /dev/null +++ b/kernel/mips64/zgemm3m_kernel.S @@ -0,0 +1,1666 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M $4 +#define N $5 +#define K $6 +#define A $9 +#define B $10 +#define C $11 +#define LDC $8 + +#define AO $12 +#define BO $13 + +#define I $2 +#define J $3 +#define L $7 + +#define CO1 $14 +#define CO2 $15 +#define CO3 $16 +#define CO4 $17 +#define CO5 $18 +#define CO6 $19 +#define CO7 $20 +#define CO8 $21 + +#if defined(TRMMKERNEL) +#define OFFSET $22 +#define KK $23 +#define TEMP $24 +#endif + +#define a1 $f0 +#define a2 $f1 +#define a3 $f28 +#define a4 $f29 + +#define b1 $f2 +#define b2 $f3 +#define b3 $f4 +#define b4 $f5 +#define b5 $f6 +#define b6 $f7 +#define b7 $f8 +#define b8 $f9 + +#define a5 b8 + +#define c11 $f10 +#define c12 $f11 +#define c21 $f12 +#define c22 $f13 +#define c31 $f14 +#define c32 $f17 +#define c41 $f18 +#define c42 $f19 +#define c51 $f20 +#define c52 $f21 +#define c61 $f22 +#define c62 $f23 +#define c71 $f24 +#define c72 $f25 +#define c81 $f26 +#define c82 $f27 + +#define ALPHA_R $f15 +#define ALPHA_I $f16 + + PROLOGUE + + daddiu $sp, $sp, -128 + + SDARG $16, 0($sp) + SDARG $17, 8($sp) + SDARG $18, 16($sp) + SDARG $19, 24($sp) + SDARG $20, 32($sp) + SDARG $21, 40($sp) + sdc1 $f24, 48($sp) + sdc1 $f25, 56($sp) + sdc1 $f26, 64($sp) + sdc1 $f27, 72($sp) + sdc1 $f28, 80($sp) + sdc1 $f29, 88($sp) + + LDARG LDC, 128($sp) + + dsll LDC, LDC, ZBASE_SHIFT + + dsra J, N, 3 + blez J, .L30 + nop + +.L10: + move CO1, C + MTC $0, c11 + daddu CO2, C, LDC + move AO, A + daddu CO3, CO2, LDC + daddiu J, J, -1 + daddu CO4, CO3, LDC + MOV c21, c11 + daddu CO5, CO4, LDC + MOV c31, c11 + daddu CO6, CO5, LDC + MOV c41, c11 + daddu CO7, CO6, LDC + MOV c51, c11 + daddu CO8, CO7, LDC + dsra I, M, 1 + daddu C, CO8, LDC + + blez I, .L20 + MOV c61, c11 + +.L11: + LD a1, 0 * SIZE(AO) + MOV c71, c11 + LD b1, 0 * SIZE(B) + MOV c81, c11 + + LD a3, 4 * SIZE(AO) + MOV c12, c11 + LD b2, 1 * SIZE(B) + MOV c22, c11 + + dsra L, K, 2 + MOV c32, c11 + LD b3, 2 * SIZE(B) + MOV c42, c11 + + LD b4, 3 * SIZE(B) + MOV c52, c11 + LD b5, 4 * SIZE(B) + MOV c62, c11 + + LD b6, 8 * SIZE(B) + MOV c72, c11 + LD b7, 12 * SIZE(B) + MOV c82, c11 + + blez L, .L15 + move BO, B + + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + daddiu L, L, -1 + MADD c31, c31, a1, b3 + blez L, .L13 + MADD c41, c41, a1, b4 + NOP + .align 3 + +.L12: + MADD c12, c12, a2, b1 + LD b1, 16 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD c51, c51, a1, b5 + LD a4, 2 * SIZE(AO) + MADD c61, c61, a1, b2 + NOP + MADD c71, c71, a1, b3 + NOP + MADD c81, c81, 
a1, b4 + LD a1, 8 * SIZE(AO) + + MADD c52, c52, a2, b5 + LD b5, 20 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 9 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 10 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 11 * SIZE(BO) + + MADD c11, c11, a4, b6 + LD a2, 3 * SIZE(AO) + MADD c21, c21, a4, b2 + NOP + MADD c31, c31, a4, b3 + NOP + MADD c41, c41, a4, b4 + NOP + + MADD c12, c12, a2, b6 + LD b6, 24 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 13 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 14 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD c51, c51, a4, b7 + NOP + MADD c61, c61, a4, b2 + NOP + MADD c71, c71, a4, b3 + NOP + MADD c81, c81, a4, b4 + NOP + + MADD c52, c52, a2, b7 + LD b7, 28 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 17 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 18 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 19 * SIZE(BO) + + MADD c11, c11, a3, b1 + LD a2, 5 * SIZE(AO) + MADD c21, c21, a3, b2 + NOP + MADD c31, c31, a3, b3 + NOP + MADD c41, c41, a3, b4 + NOP + + MADD c12, c12, a2, b1 + LD b1, 32 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 21 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 22 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 23 * SIZE(BO) + + MADD c51, c51, a3, b5 + LD a4, 6 * SIZE(AO) + MADD c61, c61, a3, b2 + NOP + MADD c71, c71, a3, b3 + NOP + MADD c81, c81, a3, b4 + LD a3, 12 * SIZE(AO) + + MADD c52, c52, a2, b5 + LD b5, 36 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 25 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 26 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 27 * SIZE(BO) + + MADD c11, c11, a4, b6 + LD a2, 7 * SIZE(AO) + MADD c21, c21, a4, b2 + NOP + MADD c31, c31, a4, b3 + NOP + MADD c41, c41, a4, b4 + daddiu L, L, -1 + + MADD c12, c12, a2, b6 + LD b6, 40 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 29 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 30 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 31 * SIZE(BO) + + MADD c51, c51, a4, b7 + daddiu BO, BO, 32 * SIZE + MADD c61, c61, a4, b2 + daddiu AO, AO, 8 * SIZE + MADD c71, c71, a4, b3 + NOP + MADD c81, c81, a4, b4 + NOP + + MADD c52, c52, a2, b7 + LD b7, 12 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 1 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 2 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 3 * SIZE(BO) + + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + NOP + MADD c31, c31, a1, b3 + bgtz L, .L12 + MADD c41, c41, a1, b4 + NOP + .align 3 + +.L13: + MADD c12, c12, a2, b1 + LD b1, 16 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD c51, c51, a1, b5 + NOP + MADD c61, c61, a1, b2 + LD a4, 2 * SIZE(AO) + MADD c71, c71, a1, b3 + NOP + MADD c81, c81, a1, b4 + LD a1, 8 * SIZE(AO) + + MADD c52, c52, a2, b5 + LD b5, 20 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 9 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 10 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 11 * SIZE(BO) + + MADD c11, c11, a4, b6 + LD a2, 3 * SIZE(AO) + MADD c21, c21, a4, b2 + NOP + MADD c31, c31, a4, b3 + NOP + MADD c41, c41, a4, b4 + NOP + + MADD c12, c12, a2, b6 + LD b6, 24 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 13 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 14 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD c51, c51, a4, b7 + NOP + MADD c61, c61, a4, b2 + NOP + MADD c71, c71, a4, b3 + NOP + MADD c81, c81, a4, b4 + NOP + + MADD c52, c52, a2, b7 + LD b7, 28 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 17 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 18 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 19 * 
SIZE(BO) + + MADD c11, c11, a3, b1 + LD a2, 5 * SIZE(AO) + MADD c21, c21, a3, b2 + NOP + MADD c31, c31, a3, b3 + NOP + MADD c41, c41, a3, b4 + NOP + + MADD c12, c12, a2, b1 + LD b1, 32 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 21 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 22 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 23 * SIZE(BO) + + MADD c51, c51, a3, b5 + NOP + MADD c61, c61, a3, b2 + LD a4, 6 * SIZE(AO) + MADD c71, c71, a3, b3 + NOP + MADD c81, c81, a3, b4 + LD a3, 12 * SIZE(AO) + + MADD c52, c52, a2, b5 + LD b5, 36 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 25 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 26 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 27 * SIZE(BO) + + MADD c11, c11, a4, b6 + LD a2, 7 * SIZE(AO) + MADD c21, c21, a4, b2 + NOP + MADD c31, c31, a4, b3 + NOP + MADD c41, c41, a4, b4 + NOP + + MADD c12, c12, a2, b6 + LD b6, 40 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 29 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 30 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 31 * SIZE(BO) + + MADD c51, c51, a4, b7 + daddiu BO, BO, 32 * SIZE + MADD c61, c61, a4, b2 + daddiu AO, AO, 8 * SIZE + MADD c71, c71, a4, b3 + NOP + MADD c81, c81, a4, b4 + NOP + + MADD c52, c52, a2, b7 + LD b7, 12 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 1 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 2 * SIZE(BO) + MADD c82, c82, a2, b4 + LD b4, 3 * SIZE(BO) + .align 3 + +.L15: + andi L, K, 3 + NOP + blez L, .L18 + NOP + .align 3 + +.L16: + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + NOP + MADD c31, c31, a1, b3 + NOP + MADD c41, c41, a1, b4 + NOP + + MADD c12, c12, a2, b1 + LD b1, 8 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD c51, c51, a1, b5 + daddiu L, L, -1 + MADD c61, c61, a1, b2 + daddiu AO, AO, 2 * SIZE + MADD c71, c71, a1, b3 + daddiu BO, BO, 8 * SIZE + MADD c81, c81, a1, b4 + LD a1, 0 * SIZE(AO) + + MADD c52, c52, a2, b5 + LD b5, 4 * SIZE(BO) + MADD c62, c62, a2, b2 + LD b2, 1 * SIZE(BO) + MADD c72, c72, a2, b3 + LD b3, 2 * SIZE(BO) + MADD c82, c82, a2, b4 + bgtz L, .L16 + LD b4, 3 * SIZE(BO) + +.L18: + LD $f0, 0 * SIZE(CO1) + LD $f1, 1 * SIZE(CO1) + LD $f2, 2 * SIZE(CO1) + LD $f3, 3 * SIZE(CO1) + + LD $f4, 0 * SIZE(CO2) + MADD $f0, $f0, ALPHA_R, c11 + LD $f5, 1 * SIZE(CO2) + MADD $f1, $f1, ALPHA_I, c11 + LD $f6, 2 * SIZE(CO2) + MADD $f2, $f2, ALPHA_R, c12 + LD $f7, 3 * SIZE(CO2) + MADD $f3, $f3, ALPHA_I, c12 + + MADD $f4, $f4, ALPHA_R, c21 + ST $f0, 0 * SIZE(CO1) + MADD $f5, $f5, ALPHA_I, c21 + ST $f1, 1 * SIZE(CO1) + MADD $f6, $f6, ALPHA_R, c22 + ST $f2, 2 * SIZE(CO1) + MADD $f7, $f7, ALPHA_I, c22 + ST $f3, 3 * SIZE(CO1) + + LD $f0, 0 * SIZE(CO3) + LD $f1, 1 * SIZE(CO3) + LD $f2, 2 * SIZE(CO3) + LD $f3, 3 * SIZE(CO3) + + ST $f4, 0 * SIZE(CO2) + ST $f5, 1 * SIZE(CO2) + ST $f6, 2 * SIZE(CO2) + ST $f7, 3 * SIZE(CO2) + + LD $f4, 0 * SIZE(CO4) + LD $f5, 1 * SIZE(CO4) + LD $f6, 2 * SIZE(CO4) + LD $f7, 3 * SIZE(CO4) + + MADD $f0, $f0, ALPHA_R, c31 + MADD $f1, $f1, ALPHA_I, c31 + MADD $f2, $f2, ALPHA_R, c32 + MADD $f3, $f3, ALPHA_I, c32 + + MADD $f4, $f4, ALPHA_R, c41 + ST $f0, 0 * SIZE(CO3) + MADD $f5, $f5, ALPHA_I, c41 + ST $f1, 1 * SIZE(CO3) + MADD $f6, $f6, ALPHA_R, c42 + ST $f2, 2 * SIZE(CO3) + MADD $f7, $f7, ALPHA_I, c42 + ST $f3, 3 * SIZE(CO3) + + LD $f0, 0 * SIZE(CO5) + LD $f1, 1 * SIZE(CO5) + LD $f2, 2 * SIZE(CO5) + LD $f3, 3 * SIZE(CO5) + + ST $f4, 0 * SIZE(CO4) + ST $f5, 1 * SIZE(CO4) + ST $f6, 2 * SIZE(CO4) + ST $f7, 3 * SIZE(CO4) + + LD $f4, 0 * SIZE(CO6) + 
LD $f5, 1 * SIZE(CO6) + LD $f6, 2 * SIZE(CO6) + LD $f7, 3 * SIZE(CO6) + + MADD $f0, $f0, ALPHA_R, c51 + daddiu CO1,CO1, 4 * SIZE + MADD $f1, $f1, ALPHA_I, c51 + daddiu CO2,CO2, 4 * SIZE + MADD $f2, $f2, ALPHA_R, c52 + daddiu CO3,CO3, 4 * SIZE + MADD $f3, $f3, ALPHA_I, c52 + daddiu CO4,CO4, 4 * SIZE + + MADD $f4, $f4, ALPHA_R, c61 + ST $f0, 0 * SIZE(CO5) + MADD $f5, $f5, ALPHA_I, c61 + ST $f1, 1 * SIZE(CO5) + MADD $f6, $f6, ALPHA_R, c62 + ST $f2, 2 * SIZE(CO5) + MADD $f7, $f7, ALPHA_I, c62 + ST $f3, 3 * SIZE(CO5) + + LD $f0, 0 * SIZE(CO7) + LD $f1, 1 * SIZE(CO7) + LD $f2, 2 * SIZE(CO7) + LD $f3, 3 * SIZE(CO7) + + ST $f4, 0 * SIZE(CO6) + ST $f5, 1 * SIZE(CO6) + ST $f6, 2 * SIZE(CO6) + ST $f7, 3 * SIZE(CO6) + + LD $f4, 0 * SIZE(CO8) + daddiu I, I, -1 + LD $f5, 1 * SIZE(CO8) + MTC $0, c11 + LD $f6, 2 * SIZE(CO8) + LD $f7, 3 * SIZE(CO8) + + MADD $f0, $f0, ALPHA_R, c71 + daddiu CO5,CO5, 4 * SIZE + MADD $f1, $f1, ALPHA_I, c71 + daddiu CO6,CO6, 4 * SIZE + MADD $f2, $f2, ALPHA_R, c72 + daddiu CO7,CO7, 4 * SIZE + MADD $f3, $f3, ALPHA_I, c72 + daddiu CO8,CO8, 4 * SIZE + + MADD $f4, $f4, ALPHA_R, c81 + ST $f0, -4 * SIZE(CO7) + MADD $f5, $f5, ALPHA_I, c81 + ST $f1, -3 * SIZE(CO7) + MADD $f6, $f6, ALPHA_R, c82 + ST $f2, -2 * SIZE(CO7) + MADD $f7, $f7, ALPHA_I, c82 + ST $f3, -1 * SIZE(CO7) + + ST $f4, -4 * SIZE(CO8) + MOV c21, c11 + ST $f5, -3 * SIZE(CO8) + MOV c31, c11 + ST $f6, -2 * SIZE(CO8) + MOV c41, c11 + ST $f7, -1 * SIZE(CO8) + MOV c51, c11 + bgtz I, .L11 + MOV c61, c11 + .align 3 + +.L20: + andi I, M, 1 + MOV c61, c11 + blez I, .L29 + MOV c71, c11 + + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(B) + LD b2, 1 * SIZE(B) + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + LD b5, 4 * SIZE(B) + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + dsra L, K, 2 + MOV c81, c11 + + blez L, .L25 + move BO, B + .align 3 + +.L22: + MADD c11, c11, a1, b1 + LD b1, 16 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 5 * SIZE(BO) + MADD c31, c31, a1, b3 + LD b3, 6 * SIZE(BO) + MADD c41, c41, a1, b4 + LD b4, 7 * SIZE(BO) + + MADD c51, c51, a1, b5 + LD b5, 20 * SIZE(BO) + MADD c61, c61, a1, b2 + LD b2, 9 * SIZE(BO) + MADD c71, c71, a1, b3 + LD b3, 10 * SIZE(BO) + MADD c81, c81, a1, b4 + LD b4, 11 * SIZE(BO) + + LD a1, 4 * SIZE(AO) + daddiu L, L, -1 + + MADD c11, c11, a2, b6 + LD b6, 24 * SIZE(BO) + MADD c21, c21, a2, b2 + LD b2, 13 * SIZE(BO) + MADD c31, c31, a2, b3 + LD b3, 14 * SIZE(BO) + MADD c41, c41, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD c51, c51, a2, b7 + LD b7, 28 * SIZE(BO) + MADD c61, c61, a2, b2 + LD b2, 17 * SIZE(BO) + MADD c71, c71, a2, b3 + LD b3, 18 * SIZE(BO) + MADD c81, c81, a2, b4 + LD b4, 19 * SIZE(BO) + + LD a2, 5 * SIZE(AO) + daddiu AO, AO, 4 * SIZE + + MADD c11, c11, a3, b1 + LD b1, 32 * SIZE(BO) + MADD c21, c21, a3, b2 + LD b2, 21 * SIZE(BO) + MADD c31, c31, a3, b3 + LD b3, 22 * SIZE(BO) + MADD c41, c41, a3, b4 + LD b4, 23 * SIZE(BO) + + MADD c51, c51, a3, b5 + LD b5, 36 * SIZE(BO) + MADD c61, c61, a3, b2 + LD b2, 25 * SIZE(BO) + MADD c71, c71, a3, b3 + LD b3, 26 * SIZE(BO) + MADD c81, c81, a3, b4 + LD b4, 27 * SIZE(BO) + + LD a3, 2 * SIZE(AO) + daddiu BO, BO, 32 * SIZE + + MADD c11, c11, a4, b6 + LD b6, 8 * SIZE(BO) + MADD c21, c21, a4, b2 + LD b2, -3 * SIZE(BO) + MADD c31, c31, a4, b3 + LD b3, -2 * SIZE(BO) + MADD c41, c41, a4, b4 + LD b4, -1 * SIZE(BO) + + MADD c51, c51, a4, b7 + LD b7, 12 * SIZE(BO) + MADD c61, c61, a4, b2 + LD b2, 1 * SIZE(BO) + MADD c71, c71, a4, b3 + LD b3, 2 * SIZE(BO) + MADD c81, c81, a4, b4 + LD b4, 3 * SIZE(BO) + bgtz L, 
.L22 + LD a4, 3 * SIZE(AO) + .align 3 + +.L25: + andi L, K, 3 + NOP + blez L, .L28 + NOP + .align 3 + +.L26: + MADD c11, c11, a1, b1 + LD b1, 8 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 5 * SIZE(BO) + MADD c31, c31, a1, b3 + LD b3, 6 * SIZE(BO) + MADD c41, c41, a1, b4 + LD b4, 7 * SIZE(BO) + + daddiu L, L, -1 + MOV a2, a2 + daddiu AO, AO, 1 * SIZE + daddiu BO, BO, 8 * SIZE + + MADD c51, c51, a1, b5 + LD b5, 4 * SIZE(BO) + MADD c61, c61, a1, b2 + LD b2, 1 * SIZE(BO) + MADD c71, c71, a1, b3 + LD b3, 2 * SIZE(BO) + MADD c81, c81, a1, b4 + LD a1, 0 * SIZE(AO) + + bgtz L, .L26 + LD b4, 3 * SIZE(BO) + +.L28: + LD $f0, 0 * SIZE(CO1) + LD $f1, 1 * SIZE(CO1) + LD $f2, 0 * SIZE(CO2) + LD $f3, 1 * SIZE(CO2) + + LD $f4, 0 * SIZE(CO3) + MADD $f0, $f0, ALPHA_R, c11 + LD $f5, 1 * SIZE(CO3) + MADD $f1, $f1, ALPHA_I, c11 + LD $f6, 0 * SIZE(CO4) + MADD $f2, $f2, ALPHA_R, c21 + LD $f7, 1 * SIZE(CO4) + MADD $f3, $f3, ALPHA_I, c21 + + MADD $f4, $f4, ALPHA_R, c31 + ST $f0, 0 * SIZE(CO1) + MADD $f5, $f5, ALPHA_I, c31 + ST $f1, 1 * SIZE(CO1) + MADD $f6, $f6, ALPHA_R, c41 + ST $f2, 0 * SIZE(CO2) + MADD $f7, $f7, ALPHA_I, c41 + ST $f3, 1 * SIZE(CO2) + + LD $f0, 0 * SIZE(CO5) + LD $f1, 1 * SIZE(CO5) + LD $f2, 0 * SIZE(CO6) + LD $f3, 1 * SIZE(CO6) + + ST $f4, 0 * SIZE(CO3) + ST $f5, 1 * SIZE(CO3) + ST $f6, 0 * SIZE(CO4) + ST $f7, 1 * SIZE(CO4) + + LD $f4, 0 * SIZE(CO7) + MADD $f0, $f0, ALPHA_R, c51 + LD $f5, 1 * SIZE(CO7) + MADD $f1, $f1, ALPHA_I, c51 + LD $f6, 0 * SIZE(CO8) + MADD $f2, $f2, ALPHA_R, c61 + LD $f7, 1 * SIZE(CO8) + MADD $f3, $f3, ALPHA_I, c61 + + MADD $f4, $f4, ALPHA_R, c71 + ST $f0, 0 * SIZE(CO5) + MADD $f5, $f5, ALPHA_I, c71 + ST $f1, 1 * SIZE(CO5) + MADD $f6, $f6, ALPHA_R, c81 + ST $f2, 0 * SIZE(CO6) + MADD $f7, $f7, ALPHA_I, c81 + ST $f3, 1 * SIZE(CO6) + + ST $f4, 0 * SIZE(CO7) + ST $f5, 1 * SIZE(CO7) + ST $f6, 0 * SIZE(CO8) + ST $f7, 1 * SIZE(CO8) + .align 3 + +.L29: + bgtz J, .L10 + move B, BO + .align 3 + +.L30: + andi J, N, 4 + blez J, .L50 + move AO, A + + move CO1, C + MTC $0, c11 + daddu CO2, C, LDC + daddu CO3, CO2, LDC + daddu CO4, CO3, LDC + MOV c21, c11 + daddu C, CO4, LDC + MOV c31, c11 + + dsra I, M, 1 + blez I, .L40 + MOV c41, c11 + +.L31: + LD a1, 0 * SIZE(AO) + LD a3, 4 * SIZE(AO) + + LD b1, 0 * SIZE(B) + MOV c12, c11 + LD b2, 1 * SIZE(B) + MOV c22, c11 + LD b3, 2 * SIZE(B) + MOV c32, c11 + LD b4, 3 * SIZE(B) + MOV c42, c11 + + LD b5, 4 * SIZE(B) + dsra L, K, 2 + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + blez L, .L35 + move BO, B + .align 3 + +.L32: + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + daddiu L, L, -1 + MADD c31, c31, a1, b3 + NOP + MADD c41, c41, a1, b4 + LD a1, 2 * SIZE(AO) + + MADD c12, c12, a2, b1 + LD b1, 16 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD c11, c11, a1, b5 + LD a2, 3 * SIZE(AO) + MADD c21, c21, a1, b2 + NOP + MADD c31, c31, a1, b3 + NOP + MADD c41, c41, a1, b4 + LD a1, 8 * SIZE(AO) + + MADD c12, c12, a2, b5 + LD b5, 20 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 9 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 10 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 11 * SIZE(BO) + + MADD c11, c11, a3, b6 + LD a2, 5 * SIZE(AO) + MADD c21, c21, a3, b2 + NOP + MADD c31, c31, a3, b3 + NOP + MADD c41, c41, a3, b4 + LD a3, 6 * SIZE(AO) + + MADD c12, c12, a2, b6 + LD b6, 24 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 13 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 14 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD 
c11, c11, a3, b7 + LD a2, 7 * SIZE(AO) + MADD c21, c21, a3, b2 + daddiu AO, AO, 8 * SIZE + MADD c31, c31, a3, b3 + daddiu BO, BO, 16 * SIZE + MADD c41, c41, a3, b4 + LD a3, 4 * SIZE(AO) + + MADD c12, c12, a2, b7 + LD b7, 12 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 1 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 2 * SIZE(BO) + MADD c42, c42, a2, b4 + NOP + + bgtz L, .L32 + LD b4, 3 * SIZE(BO) + .align 3 + +.L35: + andi L, K, 3 + NOP + blez L, .L38 + NOP + .align 3 + +.L36: + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + daddiu L, L, -1 + MADD c31, c31, a1, b3 + daddiu AO, AO, 2 * SIZE + MADD c41, c41, a1, b4 + LD a1, 0 * SIZE(AO) + + MADD c12, c12, a2, b1 + LD b1, 4 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + bgtz L, .L36 + daddiu BO, BO, 4 * SIZE + +.L38: + LD $f0, 0 * SIZE(CO1) + LD $f1, 1 * SIZE(CO1) + LD $f2, 2 * SIZE(CO1) + LD $f3, 3 * SIZE(CO1) + + LD $f4, 0 * SIZE(CO2) + LD $f5, 1 * SIZE(CO2) + LD $f6, 2 * SIZE(CO2) + LD $f7, 3 * SIZE(CO2) + + MADD $f0, $f0, ALPHA_R, c11 + MADD $f1, $f1, ALPHA_I, c11 + MADD $f2, $f2, ALPHA_R, c12 + MADD $f3, $f3, ALPHA_I, c12 + + MADD $f4, $f4, ALPHA_R, c21 + ST $f0, 0 * SIZE(CO1) + MADD $f5, $f5, ALPHA_I, c21 + ST $f1, 1 * SIZE(CO1) + MADD $f6, $f6, ALPHA_R, c22 + ST $f2, 2 * SIZE(CO1) + MADD $f7, $f7, ALPHA_I, c22 + ST $f3, 3 * SIZE(CO1) + + LD $f0, 0 * SIZE(CO3) + LD $f1, 1 * SIZE(CO3) + LD $f2, 2 * SIZE(CO3) + LD $f3, 3 * SIZE(CO3) + + ST $f4, 0 * SIZE(CO2) + MADD $f0, $f0, ALPHA_R, c31 + ST $f5, 1 * SIZE(CO2) + MADD $f1, $f1, ALPHA_I, c31 + ST $f6, 2 * SIZE(CO2) + MADD $f2, $f2, ALPHA_R, c32 + ST $f7, 3 * SIZE(CO2) + MADD $f3, $f3, ALPHA_I, c32 + + LD $f4, 0 * SIZE(CO4) + LD $f5, 1 * SIZE(CO4) + LD $f6, 2 * SIZE(CO4) + LD $f7, 3 * SIZE(CO4) + + MADD $f4, $f4, ALPHA_R, c41 + daddiu CO1,CO1, 4 * SIZE + MADD $f5, $f5, ALPHA_I, c41 + daddiu CO2,CO2, 4 * SIZE + MADD $f6, $f6, ALPHA_R, c42 + daddiu CO3,CO3, 4 * SIZE + MADD $f7, $f7, ALPHA_I, c42 + daddiu CO4,CO4, 4 * SIZE + + ST $f0, -4 * SIZE(CO3) + daddiu I, I, -1 + ST $f1, -3 * SIZE(CO3) + ST $f2, -2 * SIZE(CO3) + ST $f3, -1 * SIZE(CO3) + + ST $f4, -4 * SIZE(CO4) + MTC $0, c11 + ST $f5, -3 * SIZE(CO4) + MOV c21, c11 + ST $f6, -2 * SIZE(CO4) + MOV c31, c11 + ST $f7, -1 * SIZE(CO4) + bgtz I, .L31 + MOV c41, c11 + .align 3 + +.L40: + andi I, M, 1 + blez I, .L49 + MOV c61, c11 + + LD a1, 0 * SIZE(AO) + MOV c71, c11 + LD a2, 1 * SIZE(AO) + MOV c81, c11 + + LD b1, 0 * SIZE(B) + LD b2, 1 * SIZE(B) + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + LD b5, 4 * SIZE(B) + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + dsra L, K, 2 + + blez L, .L45 + move BO, B + .align 3 + +.L42: + MADD c11, c11, a1, b1 + LD b1, 16 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 5 * SIZE(BO) + MADD c31, c31, a1, b3 + LD b3, 6 * SIZE(BO) + MADD c41, c41, a1, b4 + LD b4, 7 * SIZE(BO) + + LD a1, 4 * SIZE(AO) + daddiu L, L, -1 + + MADD c11, c11, a2, b5 + LD b5, 20 * SIZE(BO) + MADD c21, c21, a2, b2 + LD b2, 9 * SIZE(BO) + MADD c31, c31, a2, b3 + LD b3, 10 * SIZE(BO) + MADD c41, c41, a2, b4 + LD b4, 11 * SIZE(BO) + + LD a2, 2 * SIZE(AO) + daddiu AO, AO, 4 * SIZE + + MADD c11, c11, a2, b6 + LD b6, 24 * SIZE(BO) + MADD c21, c21, a2, b2 + LD b2, 13 * SIZE(BO) + MADD c31, c31, a2, b3 + LD b3, 14 * SIZE(BO) + MADD c41, c41, a2, b4 + LD b4, 15 * SIZE(BO) + + LD a2, -1 * SIZE(AO) + daddiu BO, BO, 16 * SIZE + + MADD c11, c11, a2, b7 + LD b7, 12 * SIZE(BO) + MADD c21, c21, a2, b2 + LD b2, 1 * SIZE(BO) + MADD c31, c31, 
a2, b3 + LD b3, 2 * SIZE(BO) + MADD c41, c41, a2, b4 + LD b4, 3 * SIZE(BO) + + bgtz L, .L42 + LD a2, 1 * SIZE(AO) + .align 3 + +.L45: + andi L, K, 3 + NOP + blez L, .L48 + NOP + .align 3 + +.L46: + MADD c11, c11, a1, b1 + LD b1, 4 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 5 * SIZE(BO) + MADD c31, c31, a1, b3 + LD b3, 6 * SIZE(BO) + MADD c41, c41, a1, b4 + LD a1, 1 * SIZE(AO) + + LD b4, 7 * SIZE(BO) + daddiu L, L, -1 + + daddiu AO, AO, 1 * SIZE + MOV a2, a2 + bgtz L, .L46 + daddiu BO, BO, 4 * SIZE + + +.L48: + LD $f0, 0 * SIZE(CO1) + LD $f1, 1 * SIZE(CO1) + LD $f2, 0 * SIZE(CO2) + LD $f3, 1 * SIZE(CO2) + + LD $f4, 0 * SIZE(CO3) + MADD $f0, $f0, ALPHA_R, c11 + LD $f5, 1 * SIZE(CO3) + MADD $f1, $f1, ALPHA_I, c11 + LD $f6, 0 * SIZE(CO4) + MADD $f2, $f2, ALPHA_R, c21 + LD $f7, 1 * SIZE(CO4) + MADD $f3, $f3, ALPHA_I, c21 + + MADD $f4, $f4, ALPHA_R, c31 + ST $f0, 0 * SIZE(CO1) + MADD $f5, $f5, ALPHA_I, c31 + ST $f1, 1 * SIZE(CO1) + MADD $f6, $f6, ALPHA_R, c41 + ST $f2, 0 * SIZE(CO2) + MADD $f7, $f7, ALPHA_I, c41 + ST $f3, 1 * SIZE(CO2) + + ST $f4, 0 * SIZE(CO3) + ST $f5, 1 * SIZE(CO3) + ST $f6, 0 * SIZE(CO4) + ST $f7, 1 * SIZE(CO4) + .align 3 + +.L49: + move B, BO + .align 3 + +.L50: + andi J, N, 2 + blez J, .L70 + + move AO, A + move CO1, C + daddu CO2, C, LDC + + dsra I, M, 1 + blez I, .L60 + daddu C, CO2, LDC + +.L51: + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a5, 4 * SIZE(AO) + + LD b1, 0 * SIZE(B) + MOV c12, c11 + LD b2, 1 * SIZE(B) + MOV c22, c11 + LD b3, 2 * SIZE(B) + LD b5, 4 * SIZE(B) + dsra L, K, 2 + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + blez L, .L55 + move BO, B + .align 3 + +.L52: + MADD c11, c11, a1, b1 + LD a3, 2 * SIZE(AO) + MADD c21, c21, a1, b2 + LD b4, 3 * SIZE(BO) + MADD c12, c12, a2, b1 + LD a4, 3 * SIZE(AO) + MADD c22, c22, a2, b2 + LD b1, 8 * SIZE(BO) + + MADD c11, c11, a3, b3 + LD a1, 8 * SIZE(AO) + MADD c21, c21, a3, b4 + LD b2, 5 * SIZE(BO) + MADD c12, c12, a4, b3 + LD a2, 5 * SIZE(AO) + MADD c22, c22, a4, b4 + LD b3, 6 * SIZE(BO) + + MADD c11, c11, a5, b5 + LD a3, 6 * SIZE(AO) + MADD c21, c21, a5, b2 + LD b4, 7 * SIZE(BO) + MADD c12, c12, a2, b5 + LD a4, 7 * SIZE(AO) + MADD c22, c22, a2, b2 + LD b5, 12 * SIZE(BO) + + MADD c11, c11, a3, b3 + LD a5, 12 * SIZE(AO) + MADD c21, c21, a3, b4 + LD b2, 9 * SIZE(BO) + MADD c12, c12, a4, b3 + LD a2, 9 * SIZE(AO) + MADD c22, c22, a4, b4 + LD b3, 10 * SIZE(BO) + + daddiu AO, AO, 8 * SIZE + daddiu L, L, -1 + bgtz L, .L52 + daddiu BO, BO, 8 * SIZE + .align 3 + +.L55: + andi L, K, 3 + NOP + blez L, .L58 + NOP + .align 3 + +.L56: + MADD c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD c21, c21, a1, b2 + LD a1, 2 * SIZE(AO) + + MADD c12, c12, a2, b1 + LD b1, 2 * SIZE(BO) + MADD c22, c22, a2, b2 + LD b2, 3 * SIZE(BO) + + daddiu L, L, -1 + daddiu AO, AO, 2 * SIZE + bgtz L, .L56 + daddiu BO, BO, 2 * SIZE + +.L58: + LD $f0, 0 * SIZE(CO1) + LD $f1, 1 * SIZE(CO1) + LD $f2, 2 * SIZE(CO1) + LD $f3, 3 * SIZE(CO1) + + LD $f4, 0 * SIZE(CO2) + LD $f5, 1 * SIZE(CO2) + LD $f6, 2 * SIZE(CO2) + LD $f7, 3 * SIZE(CO2) + + MADD $f0, $f0, ALPHA_R, c11 + daddiu I, I, -1 + MADD $f1, $f1, ALPHA_I, c11 + daddiu CO1,CO1, 4 * SIZE + MADD $f2, $f2, ALPHA_R, c12 + daddiu CO2,CO2, 4 * SIZE + MADD $f3, $f3, ALPHA_I, c12 + MADD $f4, $f4, ALPHA_R, c21 + MADD $f5, $f5, ALPHA_I, c21 + MADD $f6, $f6, ALPHA_R, c22 + MADD $f7, $f7, ALPHA_I, c22 + + ST $f0, -4 * SIZE(CO1) + ST $f1, -3 * SIZE(CO1) + ST $f2, -2 * SIZE(CO1) + ST $f3, -1 * SIZE(CO1) + + ST $f4, -4 * SIZE(CO2) + ST $f5, -3 * SIZE(CO2) + ST $f6, -2 * SIZE(CO2) + bgtz I, .L51 
+ ST $f7, -1 * SIZE(CO2) + .align 3 + +.L60: + andi I, M, 1 + blez I, .L69 + NOP + + dsra L, K, 2 + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a3, 2 * SIZE(AO) + MOV c31, c11 + LD a4, 3 * SIZE(AO) + MOV c41, c11 + + LD b1, 0 * SIZE(B) + LD b2, 1 * SIZE(B) + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + LD b5, 4 * SIZE(B) + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + blez L, .L65 + move BO, B + .align 3 + +.L62: + MADD c11, c11, a1, b1 + LD b1, 4 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 5 * SIZE(BO) + MADD c31, c31, a2, b3 + LD b3, 6 * SIZE(BO) + MADD c41, c41, a2, b4 + LD b4, 7 * SIZE(BO) + + LD a1, 4 * SIZE(AO) + LD a2, 5 * SIZE(AO) + + MADD c11, c11, a3, b1 + LD b1, 8 * SIZE(BO) + MADD c21, c21, a3, b2 + LD b2, 9 * SIZE(BO) + MADD c31, c31, a4, b3 + LD b3, 10 * SIZE(BO) + MADD c41, c41, a4, b4 + LD b4, 11 * SIZE(BO) + + LD a3, 6 * SIZE(AO) + LD a4, 7 * SIZE(AO) + + daddiu L, L, -1 + daddiu AO, AO, 4 * SIZE + + bgtz L, .L62 + daddiu BO, BO, 8 * SIZE + .align 3 + +.L65: + andi L, K, 3 + NOP + blez L, .L68 + NOP + .align 3 + +.L66: + MADD c11, c11, a1, b1 + LD b1, 2 * SIZE(BO) + MADD c21, c21, a1, b2 + LD b2, 3 * SIZE(BO) + + LD a1, 1 * SIZE(AO) + daddiu L, L, -1 + + daddiu AO, AO, 1 * SIZE + bgtz L, .L66 + daddiu BO, BO, 2 * SIZE + + +.L68: + LD $f0, 0 * SIZE(CO1) + LD $f1, 1 * SIZE(CO1) + LD $f2, 0 * SIZE(CO2) + LD $f3, 1 * SIZE(CO2) + + ADD c11, c11, c31 + ADD c21, c21, c41 + + MADD $f0, $f0, ALPHA_R, c11 + MADD $f1, $f1, ALPHA_I, c11 + MADD $f2, $f2, ALPHA_R, c21 + MADD $f3, $f3, ALPHA_I, c21 + + ST $f0, 0 * SIZE(CO1) + ST $f1, 1 * SIZE(CO1) + ST $f2, 0 * SIZE(CO2) + ST $f3, 1 * SIZE(CO2) + .align 3 + +.L69: + move B, BO + .align 3 + +.L70: + andi J, N, 1 + blez J, .L999 + + move AO, A + move CO1, C + + dsra I, M, 1 + blez I, .L80 + daddu C, CO1, LDC + +.L71: + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + LD a5, 4 * SIZE(AO) + + LD b1, 0 * SIZE(B) + MOV c12, c11 + LD b2, 1 * SIZE(B) + MOV c22, c11 + LD b3, 2 * SIZE(B) + LD b5, 4 * SIZE(B) + dsra L, K, 2 + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + blez L, .L75 + move BO, B + .align 3 + +.L72: + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + MADD c11, c11, a1, b1 + MADD c12, c12, a2, b1 + + LD a1, 2 * SIZE(AO) + LD a2, 3 * SIZE(AO) + LD b1, 1 * SIZE(BO) + + MADD c11, c11, a1, b1 + MADD c12, c12, a2, b1 + + LD a1, 4 * SIZE(AO) + LD a2, 5 * SIZE(AO) + LD b1, 2 * SIZE(BO) + + MADD c11, c11, a1, b1 + MADD c12, c12, a2, b1 + + LD a1, 6 * SIZE(AO) + LD a2, 7 * SIZE(AO) + LD b1, 3 * SIZE(BO) + + MADD c11, c11, a1, b1 + MADD c12, c12, a2, b1 + + daddiu L, L, -1 + daddiu AO, AO, 8 * SIZE + bgtz L, .L72 + daddiu BO, BO, 4 * SIZE + .align 3 + +.L75: + andi L, K, 3 + NOP + blez L, .L78 + NOP + .align 3 + +.L76: + LD a1, 0 * SIZE(AO) + LD a2, 1 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + MADD c11, c11, a1, b1 + MADD c12, c12, a2, b1 + + daddiu L, L, -1 + daddiu AO, AO, 2 * SIZE + bgtz L, .L76 + daddiu BO, BO, 1 * SIZE + +.L78: + LD $f0, 0 * SIZE(CO1) + LD $f1, 1 * SIZE(CO1) + LD $f2, 2 * SIZE(CO1) + LD $f3, 3 * SIZE(CO1) + + ADD c11, c11, c21 + daddiu I, I, -1 + ADD c12, c12, c22 + daddiu CO1,CO1, 4 * SIZE + + MADD $f0, $f0, ALPHA_R, c11 + MADD $f1, $f1, ALPHA_I, c11 + MADD $f2, $f2, ALPHA_R, c12 + MADD $f3, $f3, ALPHA_I, c12 + + ST $f0, -4 * SIZE(CO1) + ST $f1, -3 * SIZE(CO1) + ST $f2, -2 * SIZE(CO1) + + bgtz I, .L71 + ST $f3, -1 * SIZE(CO1) + .align 3 + +.L80: + andi I, M, 1 + blez I, .L89 + NOP + + LD a1, 0 * SIZE(AO) + MTC $0, c11 + LD a2, 1 * SIZE(AO) + MOV c21, c11 + 
LD a3, 2 * SIZE(AO) + LD a4, 3 * SIZE(AO) + + LD b1, 0 * SIZE(B) + LD b2, 1 * SIZE(B) + LD b3, 2 * SIZE(B) + LD b4, 3 * SIZE(B) + LD b5, 4 * SIZE(B) + LD b6, 8 * SIZE(B) + LD b7, 12 * SIZE(B) + + dsra L, K, 2 + blez L, .L85 + move BO, B + .align 3 + +.L82: + LD a1, 0 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + MADD c11, c11, a1, b1 + + LD a1, 1 * SIZE(AO) + LD b1, 1 * SIZE(BO) + + MADD c21, c21, a1, b1 + + LD a1, 2 * SIZE(AO) + LD b1, 2 * SIZE(BO) + + MADD c11, c11, a1, b1 + + LD a1, 3 * SIZE(AO) + LD b1, 3 * SIZE(BO) + + MADD c21, c21, a1, b1 + + daddiu L, L, -1 + daddiu AO, AO, 4 * SIZE + bgtz L, .L82 + daddiu BO, BO, 4 * SIZE + .align 3 + +.L85: + andi L, K, 3 + NOP + blez L, .L88 + NOP + .align 3 + +.L86: + LD a1, 0 * SIZE(AO) + LD b1, 0 * SIZE(BO) + + MADD c11, c11, a1, b1 + + daddiu L, L, -1 + daddiu AO, AO, 1 * SIZE + bgtz L, .L86 + daddiu BO, BO, 1 * SIZE + + +.L88: + LD $f0, 0 * SIZE(CO1) + LD $f1, 1 * SIZE(CO1) + + ADD c11, c11, c21 + MADD $f0, $f0, ALPHA_R, c11 + MADD $f1, $f1, ALPHA_I, c11 + + ST $f0, 0 * SIZE(CO1) + ST $f1, 1 * SIZE(CO1) + .align 3 + +.L89: + move B, BO + .align 3 + +.L999: + LDARG $16, 0($sp) + LDARG $17, 8($sp) + LDARG $18, 16($sp) + LDARG $19, 24($sp) + LDARG $20, 32($sp) + LDARG $21, 40($sp) + ldc1 $f24, 48($sp) + ldc1 $f25, 56($sp) + ldc1 $f26, 64($sp) + ldc1 $f27, 72($sp) + ldc1 $f28, 80($sp) + ldc1 $f29, 88($sp) + + j $31 + daddiu $sp, $sp, 128 + + EPILOGUE diff --git a/kernel/mips64/zgemm_kernel.S b/kernel/mips64/zgemm_kernel.S new file mode 100644 index 0000000000..c48519c334 --- /dev/null +++ b/kernel/mips64/zgemm_kernel.S @@ -0,0 +1,1286 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M $4 +#define N $5 +#define K $6 +#define A $9 +#define B $10 +#define C $11 +#define LDC $8 + +#define AO $12 +#define BO $13 + +#define I $2 +#define J $3 +#define L $7 + +#define CO1 $14 +#define CO2 $15 +#define CO3 $16 +#define CO4 $17 + +#if defined(TRMMKERNEL) +#define OFFSET $18 +#define KK $19 +#define TEMP $20 +#endif + +#define a1 $f0 +#define a2 $f1 +#define a3 $f28 +#define a4 $f29 + +#define b1 $f2 +#define b2 $f3 +#define b3 $f4 +#define b4 $f5 +#define b5 $f6 +#define b6 $f7 +#define b7 $f8 +#define b8 $f9 + +#define a5 b8 + +#define c11 $f10 +#define c12 $f11 +#define c21 $f12 +#define c22 $f13 +#define c31 $f14 +#define c32 $f17 +#define c41 $f18 +#define c42 $f19 +#define c51 $f20 +#define c52 $f21 +#define c61 $f22 +#define c62 $f23 +#define c71 $f24 +#define c72 $f25 +#define c81 $f26 +#define c82 $f27 + +#define ALPHA_R $f15 +#define ALPHA_I $f16 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif + +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + LDARG LDC, 0($sp) + daddiu $sp, $sp, -128 + + SDARG $16, 0($sp) + SDARG $17, 8($sp) + sdc1 $f24, 16($sp) + sdc1 $f25, 24($sp) + sdc1 $f26, 32($sp) + sdc1 $f27, 40($sp) + sdc1 $f28, 48($sp) + sdc1 $f29, 56($sp) + +#if defined(TRMMKERNEL) + SDARG $18, 64($sp) + SDARG $19, 72($sp) + SDARG $20, 80($sp) + + LDARG OFFSET, 128 + 8($sp) +#endif + +#ifndef __64BIT__ + sdc1 $f20, 88($sp) + sdc1 $f21, 96($sp) + sdc1 $f22,104($sp) + sdc1 $f23,112($sp) +#endif + + dsll LDC, LDC, ZBASE_SHIFT + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif + + dsra J, N, 2 + blez J, .L20 + nop + +.L10: + move CO1, C + MTC $0, c11 + daddu CO2, C, LDC + move AO, A + daddu CO3, CO2, LDC + daddiu J, J, -1 + daddu CO4, CO3, LDC + MOV c21, c11 + MOV c31, c11 +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + MOV c41, c11 + MOV c51, c11 + move I, M + daddu C, CO4, LDC + + blez I, .L19 + MOV c61, c11 + +.L11: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, ZBASE_SHIFT + dsll TEMP, KK, 2 + ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + + LD a1, 0 * SIZE(AO) + MOV c71, c11 + LD b1, 0 * SIZE(BO) + MOV c81, c11 + + LD a3, 4 * SIZE(AO) + MOV c12, c11 + LD b2, 1 * SIZE(BO) + MOV c22, c11 + + MOV c32, c11 + LD b3, 2 * SIZE(BO) + MOV c42, c11 + + LD b4, 3 * SIZE(BO) + MOV c52, c11 + LD b5, 4 * SIZE(BO) + MOV c62, c11 + + LD b6, 8 * SIZE(BO) + MOV c72, c11 + LD b7, 12 * SIZE(BO) + MOV c82, c11 + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 4 +#endif + dsra L, TEMP, 2 + + blez L, .L15 + NOP +#else + LD a1, 0 * SIZE(AO) + MOV c71, c11 + LD b1, 0 * SIZE(B) + MOV c81, c11 + + LD a3, 4 * SIZE(AO) + MOV c12, c11 + LD b2, 1 * SIZE(B) + MOV c22, c11 + + dsra L, K, 2 + MOV 
c32, c11 + LD b3, 2 * SIZE(B) + MOV c42, c11 + + LD b4, 3 * SIZE(B) + MOV c52, c11 + LD b5, 4 * SIZE(B) + MOV c62, c11 + + LD b6, 8 * SIZE(B) + MOV c72, c11 + LD b7, 12 * SIZE(B) + MOV c82, c11 + + blez L, .L15 + move BO, B +#endif + + MADD1 c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD3 c21, c21, a1, b2 + daddiu L, L, -1 + MADD1 c31, c31, a1, b3 + NOP + blez L, .L13 + MADD3 c41, c41, a1, b4 + .align 3 + +.L12: + MADD2 c12, c12, a2, b1 + LD b1, 16 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD1 c51, c51, a1, b5 + NOP + MADD3 c61, c61, a1, b2 + LD a4, 2 * SIZE(AO) + MADD1 c71, c71, a1, b3 + NOP + MADD3 c81, c81, a1, b4 + LD a1, 8 * SIZE(AO) + + MADD2 c52, c52, a2, b5 + LD b5, 20 * SIZE(BO) + MADD4 c62, c62, a2, b2 + LD b2, 9 * SIZE(BO) + MADD2 c72, c72, a2, b3 + LD b3, 10 * SIZE(BO) + MADD4 c82, c82, a2, b4 + LD b4, 11 * SIZE(BO) + + MADD1 c11, c11, a4, b6 + LD a2, 3 * SIZE(AO) + MADD3 c21, c21, a4, b2 + NOP + MADD1 c31, c31, a4, b3 + NOP + MADD3 c41, c41, a4, b4 + NOP + + MADD2 c12, c12, a2, b6 + LD b6, 24 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 13 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 14 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD1 c51, c51, a4, b7 + NOP + MADD3 c61, c61, a4, b2 + NOP + MADD1 c71, c71, a4, b3 + NOP + MADD3 c81, c81, a4, b4 + NOP + + MADD2 c52, c52, a2, b7 + LD b7, 28 * SIZE(BO) + MADD4 c62, c62, a2, b2 + LD b2, 17 * SIZE(BO) + MADD2 c72, c72, a2, b3 + LD b3, 18 * SIZE(BO) + MADD4 c82, c82, a2, b4 + LD b4, 19 * SIZE(BO) + + MADD1 c11, c11, a3, b1 + LD a2, 5 * SIZE(AO) + MADD3 c21, c21, a3, b2 + NOP + MADD1 c31, c31, a3, b3 + NOP + MADD3 c41, c41, a3, b4 + NOP + + MADD2 c12, c12, a2, b1 + LD b1, 32 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 21 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 22 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 23 * SIZE(BO) + + MADD1 c51, c51, a3, b5 + NOP + MADD3 c61, c61, a3, b2 + LD a4, 6 * SIZE(AO) + MADD1 c71, c71, a3, b3 + NOP + MADD3 c81, c81, a3, b4 + LD a3, 12 * SIZE(AO) + + MADD2 c52, c52, a2, b5 + LD b5, 36 * SIZE(BO) + MADD4 c62, c62, a2, b2 + LD b2, 25 * SIZE(BO) + MADD2 c72, c72, a2, b3 + LD b3, 26 * SIZE(BO) + MADD4 c82, c82, a2, b4 + LD b4, 27 * SIZE(BO) + + MADD1 c11, c11, a4, b6 + LD a2, 7 * SIZE(AO) + MADD3 c21, c21, a4, b2 + NOP + MADD1 c31, c31, a4, b3 + NOP + MADD3 c41, c41, a4, b4 + daddiu L, L, -1 + + MADD2 c12, c12, a2, b6 + LD b6, 40 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 29 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 30 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 31 * SIZE(BO) + + MADD1 c51, c51, a4, b7 + daddiu BO, BO, 32 * SIZE + MADD3 c61, c61, a4, b2 + daddiu AO, AO, 8 * SIZE + MADD1 c71, c71, a4, b3 + NOP + MADD3 c81, c81, a4, b4 + NOP + + MADD2 c52, c52, a2, b7 + LD b7, 12 * SIZE(BO) + MADD4 c62, c62, a2, b2 + LD b2, 1 * SIZE(BO) + MADD2 c72, c72, a2, b3 + LD b3, 2 * SIZE(BO) + MADD4 c82, c82, a2, b4 + LD b4, 3 * SIZE(BO) + + MADD1 c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD3 c21, c21, a1, b2 + NOP + MADD1 c31, c31, a1, b3 + NOP + bgtz L, .L12 + MADD3 c41, c41, a1, b4 + .align 3 + +.L13: + MADD2 c12, c12, a2, b1 + LD b1, 16 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD1 c51, c51, a1, b5 + NOP + MADD3 c61, c61, a1, b2 + LD a4, 2 * SIZE(AO) + MADD1 c71, c71, a1, b3 + NOP + MADD3 c81, c81, a1, b4 + LD a1, 8 * SIZE(AO) + + MADD2 c52, c52, a2, b5 + 
LD b5, 20 * SIZE(BO) + MADD4 c62, c62, a2, b2 + LD b2, 9 * SIZE(BO) + MADD2 c72, c72, a2, b3 + LD b3, 10 * SIZE(BO) + MADD4 c82, c82, a2, b4 + LD b4, 11 * SIZE(BO) + + MADD1 c11, c11, a4, b6 + LD a2, 3 * SIZE(AO) + MADD3 c21, c21, a4, b2 + NOP + MADD1 c31, c31, a4, b3 + NOP + MADD3 c41, c41, a4, b4 + NOP + + MADD2 c12, c12, a2, b6 + LD b6, 24 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 13 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 14 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD1 c51, c51, a4, b7 + NOP + MADD3 c61, c61, a4, b2 + NOP + MADD1 c71, c71, a4, b3 + NOP + MADD3 c81, c81, a4, b4 + NOP + + MADD2 c52, c52, a2, b7 + LD b7, 28 * SIZE(BO) + MADD4 c62, c62, a2, b2 + LD b2, 17 * SIZE(BO) + MADD2 c72, c72, a2, b3 + LD b3, 18 * SIZE(BO) + MADD4 c82, c82, a2, b4 + LD b4, 19 * SIZE(BO) + + MADD1 c11, c11, a3, b1 + LD a2, 5 * SIZE(AO) + MADD3 c21, c21, a3, b2 + NOP + MADD1 c31, c31, a3, b3 + NOP + MADD3 c41, c41, a3, b4 + NOP + + MADD2 c12, c12, a2, b1 + LD b1, 32 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 21 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 22 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 23 * SIZE(BO) + + MADD1 c51, c51, a3, b5 + NOP + MADD3 c61, c61, a3, b2 + LD a4, 6 * SIZE(AO) + MADD1 c71, c71, a3, b3 + NOP + MADD3 c81, c81, a3, b4 + LD a3, 12 * SIZE(AO) + + MADD2 c52, c52, a2, b5 + LD b5, 36 * SIZE(BO) + MADD4 c62, c62, a2, b2 + LD b2, 25 * SIZE(BO) + MADD2 c72, c72, a2, b3 + LD b3, 26 * SIZE(BO) + MADD4 c82, c82, a2, b4 + LD b4, 27 * SIZE(BO) + + MADD1 c11, c11, a4, b6 + LD a2, 7 * SIZE(AO) + MADD3 c21, c21, a4, b2 + NOP + MADD1 c31, c31, a4, b3 + NOP + MADD3 c41, c41, a4, b4 + NOP + + MADD2 c12, c12, a2, b6 + LD b6, 40 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 29 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 30 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 31 * SIZE(BO) + + MADD1 c51, c51, a4, b7 + daddiu BO, BO, 32 * SIZE + MADD3 c61, c61, a4, b2 + daddiu AO, AO, 8 * SIZE + MADD1 c71, c71, a4, b3 + NOP + MADD3 c81, c81, a4, b4 + NOP + + MADD2 c52, c52, a2, b7 + LD b7, 12 * SIZE(BO) + MADD4 c62, c62, a2, b2 + LD b2, 1 * SIZE(BO) + MADD2 c72, c72, a2, b3 + LD b3, 2 * SIZE(BO) + MADD4 c82, c82, a2, b4 + LD b4, 3 * SIZE(BO) + .align 3 + +.L15: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L18 + NOP + .align 3 + +.L16: + MADD1 c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD3 c21, c21, a1, b2 + NOP + MADD1 c31, c31, a1, b3 + NOP + MADD3 c41, c41, a1, b4 + NOP + + MADD2 c12, c12, a2, b1 + LD b1, 8 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD1 c51, c51, a1, b5 + daddiu L, L, -1 + MADD3 c61, c61, a1, b2 + daddiu AO, AO, 2 * SIZE + MADD1 c71, c71, a1, b3 + daddiu BO, BO, 8 * SIZE + MADD3 c81, c81, a1, b4 + LD a1, 0 * SIZE(AO) + + MADD2 c52, c52, a2, b5 + LD b5, 4 * SIZE(BO) + MADD4 c62, c62, a2, b2 + LD b2, 1 * SIZE(BO) + MADD2 c72, c72, a2, b3 + LD b3, 2 * SIZE(BO) + MADD4 c82, c82, a2, b4 + bgtz L, .L16 + LD b4, 3 * SIZE(BO) + +.L18: +#ifndef TRMMKERNEL + LD b1, 0 * SIZE(CO1) + ADD c11, c11, c22 + LD b2, 1 * SIZE(CO1) + ADD c12, c12, c21 + LD b3, 0 * SIZE(CO2) + ADD c31, c31, c42 + LD b4, 1 * SIZE(CO2) + ADD c32, c32, c41 + + LD b5, 0 * SIZE(CO3) + ADD c51, c51, c62 + LD b6, 1 * SIZE(CO3) + ADD c52, c52, c61 + LD b7, 0 * SIZE(CO4) + ADD c71, c71, c82 + LD b8, 1 * SIZE(CO4) + ADD c72, c72, c81 + + MADD b1, b1, ALPHA_R, c11 + daddiu CO1,CO1, 2 * SIZE + MADD b2, b2, ALPHA_R, c12 + daddiu CO2,CO2, 2 * SIZE 
+ MADD b3, b3, ALPHA_R, c31 + daddiu CO3,CO3, 2 * SIZE + MADD b4, b4, ALPHA_R, c32 + daddiu CO4,CO4, 2 * SIZE + + MADD b5, b5, ALPHA_R, c51 + daddiu I, I, -1 + MADD b6, b6, ALPHA_R, c52 + NOP + MADD b7, b7, ALPHA_R, c71 + NOP + MADD b8, b8, ALPHA_R, c72 + NOP + + NMSUB b1, b1, ALPHA_I, c12 + NOP + MADD b2, b2, ALPHA_I, c11 + MTC $0, c11 + NMSUB b3, b3, ALPHA_I, c32 + NOP + MADD b4, b4, ALPHA_I, c31 + NOP + + ST b1, -2 * SIZE(CO1) + NMSUB b5, b5, ALPHA_I, c52 + ST b2, -1 * SIZE(CO1) + MADD b6, b6, ALPHA_I, c51 + ST b3, -2 * SIZE(CO2) + NMSUB b7, b7, ALPHA_I, c72 + ST b4, -1 * SIZE(CO2) + MADD b8, b8, ALPHA_I, c71 + + ST b5, -2 * SIZE(CO3) + MOV c21, c11 + ST b6, -1 * SIZE(CO3) + MOV c31, c11 + ST b7, -2 * SIZE(CO4) + MOV c41, c11 + ST b8, -1 * SIZE(CO4) + MOV c51, c11 + +#else + + ADD c11, c11, c22 + daddiu CO1,CO1, 2 * SIZE + ADD c12, c12, c21 + daddiu CO2,CO2, 2 * SIZE + ADD c31, c31, c42 + daddiu CO3,CO3, 2 * SIZE + ADD c32, c32, c41 + daddiu CO4,CO4, 2 * SIZE + + ADD c51, c51, c62 + daddiu I, I, -1 + ADD c52, c52, c61 + ADD c71, c71, c82 + ADD c72, c72, c81 + + MUL b1, ALPHA_R, c11 + MUL b2, ALPHA_R, c12 + MUL b3, ALPHA_R, c31 + MUL b4, ALPHA_R, c32 + + MUL b5, ALPHA_R, c51 + MUL b6, ALPHA_R, c52 + MUL b7, ALPHA_R, c71 + MUL b8, ALPHA_R, c72 + + NMSUB b1, b1, ALPHA_I, c12 + NOP + MADD b2, b2, ALPHA_I, c11 + MTC $0, c11 + NMSUB b3, b3, ALPHA_I, c32 + NOP + MADD b4, b4, ALPHA_I, c31 + NOP + + ST b1, -2 * SIZE(CO1) + NMSUB b5, b5, ALPHA_I, c52 + ST b2, -1 * SIZE(CO1) + MADD b6, b6, ALPHA_I, c51 + ST b3, -2 * SIZE(CO2) + NMSUB b7, b7, ALPHA_I, c72 + ST b4, -1 * SIZE(CO2) + MADD b8, b8, ALPHA_I, c71 + + ST b5, -2 * SIZE(CO3) + MOV c21, c11 + ST b6, -1 * SIZE(CO3) + MOV c31, c11 + ST b7, -2 * SIZE(CO4) + MOV c41, c11 + ST b8, -1 * SIZE(CO4) + MOV c51, c11 + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -4 +#endif + + dsll L, TEMP, ZBASE_SHIFT + dsll TEMP, TEMP, 2 + ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 1 +#endif +#endif + bgtz I, .L11 + MOV c61, c11 + .align 3 + +.L19: +#if defined(TRMMKERNEL) && !defined(LEFT) + daddiu KK, KK, 4 +#endif + + bgtz J, .L10 + move B, BO + .align 3 + +.L20: + andi J, N, 2 + MTC $0, c11 + blez J, .L30 + move CO1, C + + daddu CO2, C, LDC + daddu C, CO2, LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + + move I, M + blez I, .L29 + move AO, A + .align 3 + +.L21: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll L, KK, ZBASE_SHIFT + dsll TEMP, KK, 1 + ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, B, TEMP +#endif + + LD a1, 0 * SIZE(AO) + MOV c21, c11 + LD b1, 0 * SIZE(BO) + MOV c31, c11 + LD a3, 4 * SIZE(AO) + MOV c41, c11 + LD b2, 1 * SIZE(BO) + + LD b3, 2 * SIZE(BO) + MOV c12, c11 + LD b4, 3 * SIZE(BO) + MOV c22, c11 + LD b5, 4 * SIZE(BO) + MOV c32, c11 + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 2 +#endif + dsra L, TEMP, 2 + blez L, .L25 + MOV c42, c11 + +#else + LD a1, 0 * SIZE(AO) + MOV c21, c11 + LD b1, 0 * SIZE(B) + MOV c31, c11 + LD a3, 4 * SIZE(AO) + MOV c41, c11 + LD b2, 1 * SIZE(B) + dsra L, K, 2 + + LD b3, 2 * SIZE(B) + MOV c12, c11 + LD b4, 3 * SIZE(B) + MOV c22, c11 + LD b5, 4 * SIZE(B) + MOV c32, c11 + + NOP + MOV c42, c11 + blez L, .L25 + move BO, 
B +#endif + .align 3 + +.L22: + MADD1 c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD3 c21, c21, a1, b2 + daddiu L, L, -1 + MADD1 c31, c31, a1, b3 + NOP + MADD3 c41, c41, a1, b4 + LD a1, 2 * SIZE(AO) + + MADD2 c12, c12, a2, b1 + LD b1, 8 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD1 c11, c11, a1, b5 + LD a2, 3 * SIZE(AO) + MADD3 c21, c21, a1, b2 + NOP + MADD1 c31, c31, a1, b3 + NOP + MADD3 c41, c41, a1, b4 + LD a1, 8 * SIZE(AO) + + MADD2 c12, c12, a2, b5 + LD b5, 12 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 9 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 10 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 11 * SIZE(BO) + + MADD1 c11, c11, a3, b1 + LD a2, 5 * SIZE(AO) + MADD3 c21, c21, a3, b2 + NOP + MADD1 c31, c31, a3, b3 + NOP + MADD3 c41, c41, a3, b4 + LD a3, 6 * SIZE(AO) + + MADD2 c12, c12, a2, b1 + LD b1, 16 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 13 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 14 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD1 c11, c11, a3, b5 + LD a2, 7 * SIZE(AO) + MADD3 c21, c21, a3, b2 + daddiu AO, AO, 8 * SIZE + MADD1 c31, c31, a3, b3 + NOP + MADD3 c41, c41, a3, b4 + LD a3, 4 * SIZE(AO) + + MADD2 c12, c12, a2, b5 + LD b5, 20 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 17 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 18 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 19 * SIZE(BO) + + bgtz L, .L22 + daddiu BO, BO, 16 * SIZE + .align 3 + +.L25: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L28 + NOP + .align 3 + +.L26: + MADD1 c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD3 c21, c21, a1, b2 + daddiu L, L, -1 + MADD1 c31, c31, a1, b3 + daddiu BO, BO, 4 * SIZE + MADD3 c41, c41, a1, b4 + LD a1, 2 * SIZE(AO) + + MADD2 c12, c12, a2, b1 + LD b1, 0 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 1 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 2 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 3 * SIZE(BO) + + bgtz L, .L26 + daddiu AO, AO, 2 * SIZE + +.L28: +#ifndef TRMMKERNEL + LD b1, 0 * SIZE(CO1) + ADD c11, c11, c22 + LD b2, 1 * SIZE(CO1) + ADD c12, c12, c21 + LD b3, 0 * SIZE(CO2) + ADD c31, c31, c42 + LD b4, 1 * SIZE(CO2) + ADD c32, c32, c41 + + MADD b1, b1, ALPHA_R, c11 + daddiu CO1,CO1, 2 * SIZE + MADD b2, b2, ALPHA_R, c12 + daddiu CO2,CO2, 2 * SIZE + MADD b3, b3, ALPHA_R, c31 + daddiu I, I, -1 + MADD b4, b4, ALPHA_R, c32 + + NMSUB b1, b1, ALPHA_I, c12 + NOP + MADD b2, b2, ALPHA_I, c11 + MTC $0, c11 + NMSUB b3, b3, ALPHA_I, c32 + NOP + MADD b4, b4, ALPHA_I, c31 + NOP + + ST b1, -2 * SIZE(CO1) + ST b2, -1 * SIZE(CO1) + ST b3, -2 * SIZE(CO2) +#else + ADD c11, c11, c22 + ADD c12, c12, c21 + ADD c31, c31, c42 + ADD c32, c32, c41 + + MUL b1, ALPHA_R, c11 + daddiu CO1,CO1, 2 * SIZE + MUL b2, ALPHA_R, c12 + daddiu CO2,CO2, 2 * SIZE + MUL b3, ALPHA_R, c31 + daddiu I, I, -1 + MUL b4, ALPHA_R, c32 + + NMSUB b1, b1, ALPHA_I, c12 + NOP + MADD b2, b2, ALPHA_I, c11 + MTC $0, c11 + NMSUB b3, b3, ALPHA_I, c32 + NOP + MADD b4, b4, ALPHA_I, c31 + NOP + + ST b1, -2 * SIZE(CO1) + ST b2, -1 * SIZE(CO1) + ST b3, -2 * SIZE(CO2) + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -2 +#endif + + dsll L, TEMP, ZBASE_SHIFT + dsll TEMP, TEMP, 1 + ZBASE_SHIFT + + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 1 +#endif +#endif + bgtz I, .L21 + ST b4, -1 * SIZE(CO2) + .align 3 + +.L29: 
+#if defined(TRMMKERNEL) && !defined(LEFT) + daddiu KK, KK, 2 +#endif + + move B, BO + .align 3 + +.L30: + andi J, N, 1 + MTC $0, c11 + blez J, .L999 + move CO1, C + +#if defined(TRMMKERNEL) && defined(LEFT) + move KK, OFFSET +#endif + + move I, M + daddu C, CO1, LDC + blez I, .L39 + move AO, A + .align 3 + +.L31: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move BO, B +#else + dsll TEMP, KK, ZBASE_SHIFT + + daddu AO, AO, TEMP + daddu BO, B, TEMP +#endif + + LD a1, 0 * SIZE(AO) + MOV c21, c11 + LD b1, 0 * SIZE(BO) + MOV c31, c11 + LD a2, 1 * SIZE(AO) + + MOV c41, c11 + LD b2, 1 * SIZE(BO) + MOV c12, c11 + NOP + + MOV c22, c11 + LD a3, 4 * SIZE(AO) + MOV c32, c11 + LD b3, 4 * SIZE(BO) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + dsubu TEMP, K, KK +#elif defined(LEFT) + daddiu TEMP, KK, 1 +#else + daddiu TEMP, KK, 1 +#endif + dsra L, TEMP, 2 + + blez L, .L35 + MOV c42, c11 +#else + LD a1, 0 * SIZE(AO) + MOV c21, c11 + LD b1, 0 * SIZE(B) + MOV c31, c11 + LD a2, 1 * SIZE(AO) + + MOV c41, c11 + LD b2, 1 * SIZE(B) + MOV c12, c11 + dsra L, K, 2 + + MOV c22, c11 + LD a3, 4 * SIZE(AO) + MOV c32, c11 + LD b3, 4 * SIZE(B) + + NOP + MOV c42, c11 + blez L, .L35 + move BO, B +#endif + .align 3 + +.L32: + MADD1 c11, c11, a1, b1 + LD b4, 3 * SIZE(BO) + MADD3 c21, c21, a1, b2 + LD a1, 2 * SIZE(AO) + MADD2 c12, c12, a2, b1 + LD b1, 2 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD a2, 3 * SIZE(AO) + + MADD1 c11, c11, a1, b1 + LD b2, 5 * SIZE(BO) + MADD3 c21, c21, a1, b4 + LD a1, 8 * SIZE(AO) + MADD2 c12, c12, a2, b1 + LD b1, 8 * SIZE(BO) + MADD4 c22, c22, a2, b4 + LD a2, 5 * SIZE(AO) + + MADD1 c11, c11, a3, b3 + LD b4, 7 * SIZE(BO) + MADD3 c21, c21, a3, b2 + LD a3, 6 * SIZE(AO) + MADD2 c12, c12, a2, b3 + LD b3, 6 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD a2, 7 * SIZE(AO) + + MADD1 c11, c11, a3, b3 + LD b2, 9 * SIZE(BO) + MADD3 c21, c21, a3, b4 + LD a3, 12 * SIZE(AO) + MADD2 c12, c12, a2, b3 + LD b3, 12 * SIZE(BO) + MADD4 c22, c22, a2, b4 + LD a2, 9 * SIZE(AO) + + daddiu AO, AO, 8 * SIZE + daddiu L, L, -1 + + bgtz L, .L32 + daddiu BO, BO, 8 * SIZE + .align 3 + +.L35: +#ifndef TRMMKERNEL + andi L, K, 3 +#else + andi L, TEMP, 3 +#endif + NOP + blez L, .L38 + NOP + .align 3 + +.L36: + MADD1 c11, c11, a1, b1 + daddiu L, L, -1 + MADD3 c21, c21, a1, b2 + LD a1, 2 * SIZE(AO) + MADD2 c12, c12, a2, b1 + LD b1, 2 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD a2, 3 * SIZE(AO) + + LD b2, 3 * SIZE(BO) + daddiu BO, BO, 2 * SIZE + bgtz L, .L36 + daddiu AO, AO, 2 * SIZE + +.L38: +#ifndef TRMMKERNEL + LD b1, 0 * SIZE(CO1) + ADD c11, c11, c22 + LD b2, 1 * SIZE(CO1) + ADD c12, c12, c21 + + MADD b1, b1, ALPHA_R, c11 + daddiu CO1,CO1, 2 * SIZE + MADD b2, b2, ALPHA_R, c12 + daddiu I, I, -1 + + NMSUB b1, b1, ALPHA_I, c12 + NOP + MADD b2, b2, ALPHA_I, c11 + MTC $0, c11 + + ST b1, -2 * SIZE(CO1) + NOP + bgtz I, .L31 + ST b2, -1 * SIZE(CO1) +#else + ADD c11, c11, c22 + ADD c12, c12, c21 + + MUL b1, ALPHA_R, c11 + daddiu CO1,CO1, 2 * SIZE + MUL b2, ALPHA_R, c12 + daddiu I, I, -1 + + NMSUB b1, b1, ALPHA_I, c12 + NOP + MADD b2, b2, ALPHA_I, c11 + MTC $0, c11 + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + dsubu TEMP, K, KK +#ifdef LEFT + daddiu TEMP, TEMP, -1 +#else + daddiu TEMP, TEMP, -1 +#endif + + dsll TEMP, TEMP, ZBASE_SHIFT + + daddu AO, AO, TEMP + daddu BO, BO, TEMP +#endif + +#ifdef LEFT + daddiu KK, KK, 1 +#endif + + ST b1, -2 * SIZE(CO1) + NOP + bgtz I, .L31 + ST b2, -1 * SIZE(CO1) +#endif + .align 3 
+ +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + daddiu KK, KK, 1 +#endif + move B, BO + .align 3 + + +.L999: + LDARG $16, 0($sp) + LDARG $17, 8($sp) + ldc1 $f24, 16($sp) + ldc1 $f25, 24($sp) + ldc1 $f26, 32($sp) + ldc1 $f27, 40($sp) + ldc1 $f28, 48($sp) + ldc1 $f29, 56($sp) + +#if defined(TRMMKERNEL) + LDARG $18, 64($sp) + LDARG $19, 72($sp) + LDARG $20, 80($sp) +#endif + +#ifndef __64BIT__ + ldc1 $f20, 88($sp) + ldc1 $f21, 96($sp) + ldc1 $f22,104($sp) + ldc1 $f23,112($sp) +#endif + + j $31 + daddiu $sp, $sp, 128 + + EPILOGUE diff --git a/kernel/mips64/zgemv_n.S b/kernel/mips64/zgemv_n.S new file mode 100644 index 0000000000..c6cc896151 --- /dev/null +++ b/kernel/mips64/zgemv_n.S @@ -0,0 +1,777 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M $4 +#define N $5 +#define A $9 +#define LDA $10 +#define X $11 +#define INCX $2 +#define Y $6 +#define INCY $7 +#define BUFFER $8 + +#define YORIG $3 +#define XX $12 +#define YY $13 + +#define I $14 +#define J $15 + +#define AO1 $16 +#define AO2 $17 + +#define ALPHA_R $f15 +#define ALPHA_I $f16 + +#define a1 $f0 +#define a2 $f1 +#define a3 $f2 +#define a4 $f3 +#define a5 $f4 +#define a6 $f5 +#define a7 $f6 +#define a8 $f7 + +#define x1 $f8 +#define x2 $f9 +#define x3 $f10 +#define x4 $f11 + +#define y1 $f12 +#define y2 $f13 +#define y3 $f14 +#define y4 $f17 + +#define t1 $f18 +#define t2 $f19 +#define t3 $f20 +#define t4 $f21 +#define t5 $f22 +#define t6 $f23 +#define t7 $f24 +#define t8 $f25 + +#if !defined(CONJ) && !defined(XCONJ) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif + +#if defined(CONJ) && !defined(XCONJ) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif + +#if !defined(CONJ) && defined(XCONJ) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif + +#if defined(CONJ) && defined(XCONJ) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + LDARG INCX, 0($sp) + LDARG Y, 8($sp) + LDARG INCY, 16($sp) + LDARG BUFFER, 24($sp) +#ifndef __64BIT__ + daddiu $sp, $sp, -64 +#else + daddiu $sp, $sp, -32 +#endif + + SDARG $16, 0($sp) + SDARG $17, 8($sp) + + sdc1 $f24, 16($sp) + sdc1 $f25, 24($sp) + +#ifndef __64BIT__ + sdc1 $f20, 32($sp) + sdc1 $f21, 40($sp) + sdc1 $f22, 48($sp) + sdc1 $f23, 56($sp) +#endif + + dsll LDA, LDA, ZBASE_SHIFT + + blez M, .L999 + dsll INCX, INCX, ZBASE_SHIFT + + blez N, .L999 + dsll INCY, INCY, ZBASE_SHIFT + + li YORIG, 2 * SIZE + + beq INCY, YORIG, .L10 + move YORIG, Y + + dsra I, M, 2 + move YORIG, BUFFER + + move XX, Y + + blez I, .L05 + move YY, BUFFER + .align 3 + +.L02: + LD a1, 0 * SIZE(XX) + LD a2, 1 * SIZE(XX) + daddu XX, XX, INCY + LD a3, 0 * SIZE(XX) + LD a4, 1 * SIZE(XX) + daddu XX, XX, INCY + LD a5, 0 * SIZE(XX) + LD a6, 1 * SIZE(XX) + daddu XX, XX, INCY + LD a7, 0 * SIZE(XX) + LD a8, 1 * SIZE(XX) + daddu XX, XX, INCY + + daddiu I, I, -1 + daddiu YY, YY, 8 * SIZE + + ST a1, -8 * SIZE(YY) + ST a2, -7 * SIZE(YY) + ST a3, -6 * SIZE(YY) + ST a4, -5 * SIZE(YY) + ST a5, -4 * SIZE(YY) + ST a6, -3 * SIZE(YY) + ST a7, -2 * SIZE(YY) + + bgtz I, .L02 + ST a8, -1 * SIZE(YY) + .align 3 + +.L05: + andi I, M, 3 + blez I, .L10 + NOP + .align 3 + +.L06: + LD a1, 0 * SIZE(XX) + LD a2, 1 * SIZE(XX) + daddu XX, XX, INCY + + daddiu I, I, -1 + + ST a1, 0 * SIZE(YY) + ST a2, 1 * SIZE(YY) + + bgtz I, .L06 + daddiu YY, YY, 2 * SIZE + .align 3 + +.L10: + dsra J, N, 1 + blez J, .L20 + NOP + .align 3 + +.L11: + LD x1, 0 * SIZE(X) + LD x2, 1 * SIZE(X) + daddu X, X, INCX + LD x3, 0 * SIZE(X) + LD x4, 1 * SIZE(X) + daddu X, X, INCX + + MUL a1, ALPHA_R, x1 + move AO1, A + MUL a2, ALPHA_I, x1 + daddu AO2, A, LDA + MUL a3, ALPHA_R, x3 + daddu A, AO2, LDA + MUL a4, ALPHA_I, x3 + +#ifndef XCONJ + NMSUB x1, a1, ALPHA_I, x2 + MADD x2, a2, ALPHA_R, x2 + NMSUB x3, a3, ALPHA_I, x4 + MADD x4, a4, ALPHA_R, x4 +#else + MADD x1, a1, ALPHA_I, x2 + MSUB x2, a2, ALPHA_R, x2 + MADD x3, a3, ALPHA_I, x4 + MSUB x4, a4, ALPHA_R, x4 +#endif + + dsra I, M, 2 + + blez I, .L15 + move YY, YORIG + + LD y1, 0 * SIZE(YY) + LD a1, 0 * SIZE(AO1) + LD y2, 1 * SIZE(YY) + LD a3, 2 * SIZE(AO1) + LD y3, 2 * SIZE(YY) + LD 
a2, 1 * SIZE(AO1) + LD y4, 3 * SIZE(YY) + LD a4, 3 * SIZE(AO1) + + LD a5, 0 * SIZE(AO2) + LD a6, 1 * SIZE(AO2) + LD a7, 2 * SIZE(AO2) + LD a8, 3 * SIZE(AO2) + + MADD1 t1, y1, x1, a1 + LD y1, 4 * SIZE(YY) + MADD2 t2, y2, x2, a1 + LD a1, 4 * SIZE(AO1) + MADD1 t3, y3, x1, a3 + LD y2, 5 * SIZE(YY) + MADD2 t4, y4, x2, a3 + LD a3, 6 * SIZE(AO1) + + MADD3 t1, t1, x2, a2 + LD y3, 6 * SIZE(YY) + MADD4 t2, t2, x1, a2 + LD a2, 5 * SIZE(AO1) + MADD3 t3, t3, x2, a4 + LD y4, 7 * SIZE(YY) + MADD4 t4, t4, x1, a4 + LD a4, 7 * SIZE(AO1) + + MADD1 t1, t1, x3, a5 + NOP + MADD2 t2, t2, x4, a5 + LD a5, 4 * SIZE(AO2) + MADD1 t3, t3, x3, a7 + NOP + MADD2 t4, t4, x4, a7 + LD a7, 6 * SIZE(AO2) + + MADD3 t1, t1, x4, a6 + NOP + MADD4 t2, t2, x3, a6 + LD a6, 5 * SIZE(AO2) + MADD3 t3, t3, x4, a8 + daddiu I, I, -1 + MADD4 t4, t4, x3, a8 + + blez I, .L13 + LD a8, 7 * SIZE(AO2) + .align 3 + +.L12: + MADD1 t5, y1, x1, a1 + LD y1, 8 * SIZE(YY) + MADD2 t6, y2, x2, a1 + LD a1, 8 * SIZE(AO1) + MADD1 t7, y3, x1, a3 + LD y2, 9 * SIZE(YY) + MADD2 t8, y4, x2, a3 + LD a3, 10 * SIZE(AO1) + + MADD3 t5, t5, x2, a2 + LD y3, 10 * SIZE(YY) + MADD4 t6, t6, x1, a2 + LD a2, 9 * SIZE(AO1) + MADD3 t7, t7, x2, a4 + LD y4, 11 * SIZE(YY) + MADD4 t8, t8, x1, a4 + LD a4, 11 * SIZE(AO1) + + MADD1 t5, t5, x3, a5 + ST t1, 0 * SIZE(YY) + MADD2 t6, t6, x4, a5 + LD a5, 8 * SIZE(AO2) + MADD1 t7, t7, x3, a7 + ST t2, 1 * SIZE(YY) + MADD2 t8, t8, x4, a7 + LD a7, 10 * SIZE(AO2) + + MADD3 t5, t5, x4, a6 + ST t3, 2 * SIZE(YY) + MADD4 t6, t6, x3, a6 + LD a6, 9 * SIZE(AO2) + MADD3 t7, t7, x4, a8 + ST t4, 3 * SIZE(YY) + MADD4 t8, t8, x3, a8 + LD a8, 11 * SIZE(AO2) + + MADD1 t1, y1, x1, a1 + LD y1, 12 * SIZE(YY) + MADD2 t2, y2, x2, a1 + LD a1, 12 * SIZE(AO1) + MADD1 t3, y3, x1, a3 + LD y2, 13 * SIZE(YY) + MADD2 t4, y4, x2, a3 + LD a3, 14 * SIZE(AO1) + + MADD3 t1, t1, x2, a2 + LD y3, 14 * SIZE(YY) + MADD4 t2, t2, x1, a2 + LD a2, 13 * SIZE(AO1) + MADD3 t3, t3, x2, a4 + LD y4, 15 * SIZE(YY) + MADD4 t4, t4, x1, a4 + LD a4, 15 * SIZE(AO1) + + MADD1 t1, t1, x3, a5 + ST t5, 4 * SIZE(YY) + MADD2 t2, t2, x4, a5 + LD a5, 12 * SIZE(AO2) + MADD1 t3, t3, x3, a7 + ST t6, 5 * SIZE(YY) + MADD2 t4, t4, x4, a7 + LD a7, 14 * SIZE(AO2) + + MADD3 t1, t1, x4, a6 + ST t7, 6 * SIZE(YY) + MADD4 t2, t2, x3, a6 + LD a6, 13 * SIZE(AO2) + MADD3 t3, t3, x4, a8 + ST t8, 7 * SIZE(YY) + MADD4 t4, t4, x3, a8 + LD a8, 15 * SIZE(AO2) + + daddiu I, I, -1 + daddiu YY, YY, 8 * SIZE + + daddiu AO1, AO1, 8 * SIZE + bgtz I, .L12 + daddiu AO2, AO2, 8 * SIZE + .align 3 + +.L13: + ST t1, 0 * SIZE(YY) + MADD1 t1, y1, x1, a1 + ST t2, 1 * SIZE(YY) + MADD2 t2, y2, x2, a1 + ST t3, 2 * SIZE(YY) + MADD1 t3, y3, x1, a3 + ST t4, 3 * SIZE(YY) + MADD2 t4, y4, x2, a3 + + MADD3 t1, t1, x2, a2 + MADD4 t2, t2, x1, a2 + MADD3 t3, t3, x2, a4 + MADD4 t4, t4, x1, a4 + + MADD1 t1, t1, x3, a5 + MADD2 t2, t2, x4, a5 + MADD1 t3, t3, x3, a7 + MADD2 t4, t4, x4, a7 + + MADD3 t1, t1, x4, a6 + daddiu AO1, AO1, 8 * SIZE + MADD4 t2, t2, x3, a6 + daddiu AO2, AO2, 8 * SIZE + MADD3 t3, t3, x4, a8 + daddiu YY, YY, 8 * SIZE + MADD4 t4, t4, x3, a8 + NOP + + ST t1, -4 * SIZE(YY) + ST t2, -3 * SIZE(YY) + ST t3, -2 * SIZE(YY) + ST t4, -1 * SIZE(YY) + .align 3 + +.L15: + andi I, M, 2 + NOP + blez I, .L16 + NOP + + LD a1, 0 * SIZE(AO1) + LD y1, 0 * SIZE(YY) + LD a2, 1 * SIZE(AO1) + LD y2, 1 * SIZE(YY) + + LD a3, 2 * SIZE(AO1) + LD y3, 2 * SIZE(YY) + LD a4, 3 * SIZE(AO1) + LD y4, 3 * SIZE(YY) + + MADD1 t1, y1, x1, a1 + LD a5, 0 * SIZE(AO2) + MADD2 t2, y2, x2, a1 + LD a6, 1 * SIZE(AO2) + MADD1 t3, y3, x1, a3 + LD a7, 2 * SIZE(AO2) + MADD2 t4, y4, x2, 
a3 + LD a8, 3 * SIZE(AO2) + + MADD3 t1, t1, x2, a2 + MADD4 t2, t2, x1, a2 + MADD3 t3, t3, x2, a4 + MADD4 t4, t4, x1, a4 + + MADD1 t1, t1, x3, a5 + MADD2 t2, t2, x4, a5 + MADD1 t3, t3, x3, a7 + MADD2 t4, t4, x4, a7 + + MADD3 t1, t1, x4, a6 + daddiu YY, YY, 4 * SIZE + MADD4 t2, t2, x3, a6 + daddiu AO1, AO1, 4 * SIZE + MADD3 t3, t3, x4, a8 + daddiu AO2, AO2, 4 * SIZE + MADD4 t4, t4, x3, a8 + NOP + + ST t1, -4 * SIZE(YY) + ST t2, -3 * SIZE(YY) + ST t3, -2 * SIZE(YY) + ST t4, -1 * SIZE(YY) + .align 3 + +.L16: + andi I, M, 1 + NOP + blez I, .L19 + NOP + + LD y1, 0 * SIZE(YY) + LD y2, 1 * SIZE(YY) + LD a1, 0 * SIZE(AO1) + LD a2, 1 * SIZE(AO1) + + MADD1 t1, y1, x1, a1 + LD a5, 0 * SIZE(AO2) + MADD2 t2, y2, x2, a1 + LD a6, 1 * SIZE(AO2) + MADD3 t1, t1, x2, a2 + MADD4 t2, t2, x1, a2 + + MADD1 t1, t1, x3, a5 + MADD2 t2, t2, x4, a5 + MADD3 t1, t1, x4, a6 + MADD4 t2, t2, x3, a6 + + ST t1, 0 * SIZE(YY) + ST t2, 1 * SIZE(YY) + .align 3 + + +.L19: + daddiu J, J, -1 + + bgtz J, .L11 + NOP + .align 3 + +.L20: + andi J, N, 1 + blez J, .L900 + NOP + + LD x1, 0 * SIZE(X) + LD x2, 1 * SIZE(X) + daddu X, X, INCX + + MUL a1, ALPHA_R, x1 + move AO1, A + MUL a2, ALPHA_I, x1 + +#ifndef XCONJ + NMSUB x1, a1, ALPHA_I, x2 + MADD x2, a2, ALPHA_R, x2 +#else + MADD x1, a1, ALPHA_I, x2 + MSUB x2, a2, ALPHA_R, x2 +#endif + + dsra I, M, 2 + + blez I, .L25 + move YY, YORIG + + LD y1, 0 * SIZE(YY) + LD a1, 0 * SIZE(AO1) + LD y2, 1 * SIZE(YY) + LD a3, 2 * SIZE(AO1) + LD y3, 2 * SIZE(YY) + LD a2, 1 * SIZE(AO1) + LD y4, 3 * SIZE(YY) + LD a4, 3 * SIZE(AO1) + + MADD1 t1, y1, x1, a1 + LD y1, 4 * SIZE(YY) + MADD2 t2, y2, x2, a1 + LD a1, 4 * SIZE(AO1) + MADD1 t3, y3, x1, a3 + LD y2, 5 * SIZE(YY) + MADD2 t4, y4, x2, a3 + LD a3, 6 * SIZE(AO1) + + MADD3 t1, t1, x2, a2 + LD y3, 6 * SIZE(YY) + MADD4 t2, t2, x1, a2 + LD a2, 5 * SIZE(AO1) + MADD3 t3, t3, x2, a4 + LD y4, 7 * SIZE(YY) + MADD4 t4, t4, x1, a4 + daddiu I, I, -1 + + blez I, .L23 + LD a4, 7 * SIZE(AO1) + .align 3 + +.L22: + MADD1 t5, y1, x1, a1 + LD y1, 8 * SIZE(YY) + MADD2 t6, y2, x2, a1 + LD a1, 8 * SIZE(AO1) + MADD1 t7, y3, x1, a3 + LD y2, 9 * SIZE(YY) + MADD2 t8, y4, x2, a3 + LD a3, 10 * SIZE(AO1) + + MADD3 t5, t5, x2, a2 + LD y3, 10 * SIZE(YY) + MADD4 t6, t6, x1, a2 + LD a2, 9 * SIZE(AO1) + MADD3 t7, t7, x2, a4 + LD y4, 11 * SIZE(YY) + MADD4 t8, t8, x1, a4 + LD a4, 11 * SIZE(AO1) + + ST t1, 0 * SIZE(YY) + ST t2, 1 * SIZE(YY) + ST t3, 2 * SIZE(YY) + ST t4, 3 * SIZE(YY) + + MADD1 t1, y1, x1, a1 + LD y1, 12 * SIZE(YY) + MADD2 t2, y2, x2, a1 + LD a1, 12 * SIZE(AO1) + MADD1 t3, y3, x1, a3 + LD y2, 13 * SIZE(YY) + MADD2 t4, y4, x2, a3 + LD a3, 14 * SIZE(AO1) + + MADD3 t1, t1, x2, a2 + LD y3, 14 * SIZE(YY) + MADD4 t2, t2, x1, a2 + LD a2, 13 * SIZE(AO1) + MADD3 t3, t3, x2, a4 + LD y4, 15 * SIZE(YY) + MADD4 t4, t4, x1, a4 + LD a4, 15 * SIZE(AO1) + + ST t5, 4 * SIZE(YY) + ST t6, 5 * SIZE(YY) + ST t7, 6 * SIZE(YY) + ST t8, 7 * SIZE(YY) + + daddiu I, I, -1 + daddiu YY, YY, 8 * SIZE + + bgtz I, .L22 + daddiu AO1, AO1, 8 * SIZE + .align 3 + +.L23: + ST t1, 0 * SIZE(YY) + MADD1 t1, y1, x1, a1 + ST t2, 1 * SIZE(YY) + MADD2 t2, y2, x2, a1 + ST t3, 2 * SIZE(YY) + MADD1 t3, y3, x1, a3 + ST t4, 3 * SIZE(YY) + MADD2 t4, y4, x2, a3 + + MADD3 t1, t1, x2, a2 + daddiu AO1, AO1, 8 * SIZE + MADD4 t2, t2, x1, a2 + daddiu YY, YY, 8 * SIZE + MADD3 t3, t3, x2, a4 + MADD4 t4, t4, x1, a4 + + ST t1, -4 * SIZE(YY) + ST t2, -3 * SIZE(YY) + ST t3, -2 * SIZE(YY) + ST t4, -1 * SIZE(YY) + .align 3 + +.L25: + andi I, M, 2 + NOP + blez I, .L26 + NOP + + LD a1, 0 * SIZE(AO1) + LD y1, 0 * SIZE(YY) + LD a2, 1 * SIZE(AO1) 
+ LD y2, 1 * SIZE(YY) + + LD a3, 2 * SIZE(AO1) + LD y3, 2 * SIZE(YY) + LD a4, 3 * SIZE(AO1) + LD y4, 3 * SIZE(YY) + + MADD1 t1, y1, x1, a1 + MADD2 t2, y2, x2, a1 + MADD1 t3, y3, x1, a3 + MADD2 t4, y4, x2, a3 + + MADD3 t1, t1, x2, a2 + daddiu YY, YY, 4 * SIZE + MADD4 t2, t2, x1, a2 + daddiu AO1, AO1, 4 * SIZE + MADD3 t3, t3, x2, a4 + MADD4 t4, t4, x1, a4 + + ST t1, -4 * SIZE(YY) + ST t2, -3 * SIZE(YY) + ST t3, -2 * SIZE(YY) + ST t4, -1 * SIZE(YY) + .align 3 + +.L26: + andi I, M, 1 + NOP + blez I, .L900 + NOP + + LD y1, 0 * SIZE(YY) + LD y2, 1 * SIZE(YY) + LD a1, 0 * SIZE(AO1) + LD a2, 1 * SIZE(AO1) + + MADD1 t1, y1, x1, a1 + MADD2 t2, y2, x2, a1 + MADD3 t1, t1, x2, a2 + MADD4 t2, t2, x1, a2 + + ST t1, 0 * SIZE(YY) + ST t2, 1 * SIZE(YY) + .align 3 + +.L900: + li YORIG, 2 * SIZE + + beq INCY, YORIG, .L999 + dsra I, M, 2 + + blez I, .L905 + move XX, BUFFER + .align 3 + +.L902: + LD a1, 0 * SIZE(XX) + LD a2, 1 * SIZE(XX) + LD a3, 2 * SIZE(XX) + LD a4, 3 * SIZE(XX) + LD a5, 4 * SIZE(XX) + LD a6, 5 * SIZE(XX) + LD a7, 6 * SIZE(XX) + LD a8, 7 * SIZE(XX) + + daddiu I, I, -1 + + ST a1, 0 * SIZE(Y) + ST a2, 1 * SIZE(Y) + daddu Y, Y, INCY + ST a3, 0 * SIZE(Y) + ST a4, 1 * SIZE(Y) + daddu Y, Y, INCY + ST a5, 0 * SIZE(Y) + ST a6, 1 * SIZE(Y) + daddu Y, Y, INCY + ST a7, 0 * SIZE(Y) + ST a8, 1 * SIZE(Y) + daddu Y, Y, INCY + + bgtz I, .L902 + daddiu XX, XX, 8 * SIZE + .align 3 + +.L905: + andi I, M, 3 + blez I, .L999 + NOP + .align 3 + +.L906: + LD a1, 0 * SIZE(XX) + LD a2, 1 * SIZE(XX) + daddiu XX, XX, 2 * SIZE + + daddiu I, I, -1 + + ST a1, 0 * SIZE(Y) + ST a2, 1 * SIZE(Y) + + bgtz I, .L906 + daddu Y, Y, INCY + .align 3 + +.L999: + LDARG $16, 0($sp) + LDARG $17, 8($sp) + ldc1 $f24, 16($sp) + ldc1 $f25, 24($sp) + +#ifndef __64BIT__ + ldc1 $f20, 32($sp) + ldc1 $f21, 40($sp) + ldc1 $f22, 48($sp) + ldc1 $f23, 56($sp) +#endif + + j $31 +#ifdef __64BIT__ + daddiu $sp, $sp, 32 +#else + daddiu $sp, $sp, 64 +#endif + + EPILOGUE diff --git a/kernel/mips64/zgemv_t.S b/kernel/mips64/zgemv_t.S new file mode 100644 index 0000000000..f7f7fdf99f --- /dev/null +++ b/kernel/mips64/zgemv_t.S @@ -0,0 +1,669 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M $4 +#define N $5 +#define A $9 +#define LDA $10 +#define X $11 +#define INCX $2 +#define Y $6 +#define INCY $7 +#define BUFFER $8 + +#define XORIG $3 +#define XX $12 +#define YY $13 + +#define I $14 +#define J $15 + +#define AO1 $16 +#define AO2 $17 + +#define ALPHA_R $f15 +#define ALPHA_I $f16 + +#define a1 $f0 +#define a2 $f1 +#define a3 $f2 +#define a4 $f3 +#define a5 $f4 +#define a6 $f5 +#define a7 $f6 +#define a8 $f7 + +#define y1 $f8 +#define y2 $f9 +#define y3 $f10 +#define y4 $f11 + +#define x1 $f12 +#define x2 $f13 +#define x3 $f14 +#define x4 $f17 +#define x5 $f18 +#define x6 $f19 +#define x7 $f20 +#define x8 $f21 + +#if !defined(CONJ) && !defined(XCONJ) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif + +#if defined(CONJ) && !defined(XCONJ) +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#endif + +#if !defined(CONJ) && defined(XCONJ) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#endif + +#if defined(CONJ) && defined(XCONJ) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 NMSUB +#define MADD4 NMSUB +#endif + + PROLOGUE + + LDARG INCX, 0($sp) + LDARG Y, 8($sp) + LDARG INCY, 16($sp) + LDARG BUFFER, 24($sp) +#ifdef __64BIT__ + daddiu $sp, $sp, -16 +#else + daddiu $sp, $sp, -32 +#endif + + MTC $0, y1 + SDARG $16, 0($sp) + + SDARG $17, 8($sp) + dsll LDA, LDA, ZBASE_SHIFT + +#ifndef __64BIT__ + sdc1 $f20, 16($sp) + sdc1 $f21, 24($sp) +#endif + + blez M, .L999 + dsll INCX, INCX, ZBASE_SHIFT + + blez N, .L999 + dsll INCY, INCY, ZBASE_SHIFT + + li XORIG, 2 * SIZE + + beq INCX, XORIG, .L10 + move XORIG, X + + dsra I, M, 2 + move XORIG, BUFFER + + blez I, .L05 + move YY, BUFFER + .align 3 + +.L02: + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + daddu X, X, INCX + LD a3, 0 * SIZE(X) + LD a4, 1 * SIZE(X) + daddu X, X, INCX + LD a5, 0 * SIZE(X) + LD a6, 1 * SIZE(X) + daddu X, X, INCX + LD a7, 0 * SIZE(X) + LD a8, 1 * SIZE(X) + daddu X, X, INCX + + daddiu I, I, -1 + daddiu YY, YY, 8 * SIZE + + ST a1, -8 * SIZE(YY) + ST a2, -7 * SIZE(YY) + ST a3, -6 * SIZE(YY) + ST a4, -5 * SIZE(YY) + ST a5, -4 * SIZE(YY) + ST a6, -3 * SIZE(YY) + ST a7, -2 * SIZE(YY) + + bgtz I, .L02 + ST a8, -1 * SIZE(YY) + .align 3 + +.L05: + andi I, M, 3 + blez I, .L10 + NOP + .align 3 + +.L06: + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + daddu X, X, INCX + + ST a1, 0 * SIZE(YY) + ST a2, 1 * SIZE(YY) + daddiu I, I, -1 + + bgtz I, .L06 + daddiu YY, YY, 2 * SIZE + .align 3 + +.L10: + dsra J, N, 1 + blez J, .L20 + move YY, Y + .align 3 + +.L11: + move 
AO1, A + MOV y2, y1 + daddu AO2, A, LDA + MOV y3, y1 + daddu A, AO2, LDA + MOV y4, y1 + + dsra I, M, 2 + blez I, .L15 + move XX, XORIG + + LD x1, 0 * SIZE(XX) + LD x2, 1 * SIZE(XX) + LD x4, 3 * SIZE(XX) + + LD a1, 0 * SIZE(AO1) + LD a3, 0 * SIZE(AO2) + LD a2, 1 * SIZE(AO1) + LD a4, 1 * SIZE(AO2) + + LD a5, 2 * SIZE(AO1) + LD a7, 2 * SIZE(AO2) + LD a6, 3 * SIZE(AO1) + LD a8, 3 * SIZE(AO2) + daddiu I, I, -1 + + blez I, .L13 + NOP + .align 3 + +.L12: + MADD1 y1, y1, x1, a1 + LD x3, 2 * SIZE(XX) + MADD2 y2, y2, x2, a1 + LD a1, 4 * SIZE(AO1) + MADD1 y3, y3, x1, a3 + NOP + MADD2 y4, y4, x2, a3 + LD a3, 4 * SIZE(AO2) + + MADD3 y1, y1, x2, a2 + NOP + MADD4 y2, y2, x1, a2 + LD a2, 5 * SIZE(AO1) + MADD3 y3, y3, x2, a4 + LD x2, 5 * SIZE(XX) + MADD4 y4, y4, x1, a4 + LD a4, 5 * SIZE(AO2) + + MADD1 y1, y1, x3, a5 + LD x1, 4 * SIZE(XX) + MADD2 y2, y2, x4, a5 + LD a5, 6 * SIZE(AO1) + MADD1 y3, y3, x3, a7 + MADD2 y4, y4, x4, a7 + LD a7, 6 * SIZE(AO2) + + MADD3 y1, y1, x4, a6 + daddiu I, I, -1 + MADD4 y2, y2, x3, a6 + LD a6, 7 * SIZE(AO1) + MADD3 y3, y3, x4, a8 + LD x4, 7 * SIZE(XX) + MADD4 y4, y4, x3, a8 + LD a8, 7 * SIZE(AO2) + + MADD1 y1, y1, x1, a1 + LD x3, 6 * SIZE(XX) + MADD2 y2, y2, x2, a1 + LD a1, 8 * SIZE(AO1) + MADD1 y3, y3, x1, a3 + MADD2 y4, y4, x2, a3 + LD a3, 8 * SIZE(AO2) + + MADD3 y1, y1, x2, a2 + MADD4 y2, y2, x1, a2 + LD a2, 9 * SIZE(AO1) + MADD3 y3, y3, x2, a4 + LD x2, 9 * SIZE(XX) + MADD4 y4, y4, x1, a4 + LD a4, 9 * SIZE(AO2) + + MADD1 y1, y1, x3, a5 + LD x1, 8 * SIZE(XX) + MADD2 y2, y2, x4, a5 + LD a5, 10 * SIZE(AO1) + MADD1 y3, y3, x3, a7 + daddiu XX, XX, 8 * SIZE + MADD2 y4, y4, x4, a7 + LD a7, 10 * SIZE(AO2) + + MADD3 y1, y1, x4, a6 + daddiu AO2, AO2, 8 * SIZE + MADD4 y2, y2, x3, a6 + LD a6, 11 * SIZE(AO1) + MADD3 y3, y3, x4, a8 + LD x4, 3 * SIZE(XX) + MADD4 y4, y4, x3, a8 + LD a8, 3 * SIZE(AO2) + + bgtz I, .L12 + daddiu AO1, AO1, 8 * SIZE + .align 3 + +.L13: + MADD1 y1, y1, x1, a1 + LD x3, 2 * SIZE(XX) + MADD2 y2, y2, x2, a1 + LD a1, 4 * SIZE(AO1) + MADD1 y3, y3, x1, a3 + NOP + MADD2 y4, y4, x2, a3 + LD a3, 4 * SIZE(AO2) + + MADD3 y1, y1, x2, a2 + NOP + MADD4 y2, y2, x1, a2 + LD a2, 5 * SIZE(AO1) + MADD3 y3, y3, x2, a4 + LD x2, 5 * SIZE(XX) + MADD4 y4, y4, x1, a4 + LD a4, 5 * SIZE(AO2) + + MADD1 y1, y1, x3, a5 + LD x1, 4 * SIZE(XX) + MADD2 y2, y2, x4, a5 + LD a5, 6 * SIZE(AO1) + MADD1 y3, y3, x3, a7 + MADD2 y4, y4, x4, a7 + LD a7, 6 * SIZE(AO2) + + MADD3 y1, y1, x4, a6 + NOP + MADD4 y2, y2, x3, a6 + LD a6, 7 * SIZE(AO1) + MADD3 y3, y3, x4, a8 + LD x4, 7 * SIZE(XX) + MADD4 y4, y4, x3, a8 + LD a8, 7 * SIZE(AO2) + + MADD1 y1, y1, x1, a1 + LD x3, 6 * SIZE(XX) + MADD2 y2, y2, x2, a1 + NOP + MADD1 y3, y3, x1, a3 + MADD2 y4, y4, x2, a3 + + MADD3 y1, y1, x2, a2 + MADD4 y2, y2, x1, a2 + MADD3 y3, y3, x2, a4 + MADD4 y4, y4, x1, a4 + + MADD1 y1, y1, x3, a5 + MADD2 y2, y2, x4, a5 + MADD1 y3, y3, x3, a7 + MADD2 y4, y4, x4, a7 + + MADD3 y1, y1, x4, a6 + daddiu XX, XX, 8 * SIZE + MADD4 y2, y2, x3, a6 + daddiu AO1, AO1, 8 * SIZE + MADD3 y3, y3, x4, a8 + daddiu AO2, AO2, 8 * SIZE + MADD4 y4, y4, x3, a8 + NOP + .align 3 + +.L15: + andi I, M, 2 + NOP + blez I, .L17 + NOP + + LD x1, 0 * SIZE(XX) + LD x2, 1 * SIZE(XX) + LD x3, 2 * SIZE(XX) + LD x4, 3 * SIZE(XX) + + LD a1, 0 * SIZE(AO1) + LD a3, 0 * SIZE(AO2) + LD a2, 1 * SIZE(AO1) + LD a4, 1 * SIZE(AO2) + + LD a5, 2 * SIZE(AO1) + LD a7, 2 * SIZE(AO2) + LD a6, 3 * SIZE(AO1) + LD a8, 3 * SIZE(AO2) + + MADD1 y1, y1, x1, a1 + MADD2 y2, y2, x2, a1 + MADD1 y3, y3, x1, a3 + MADD2 y4, y4, x2, a3 + + MADD3 y1, y1, x2, a2 + MADD4 y2, y2, x1, a2 + MADD3 y3, y3, 
x2, a4 + MADD4 y4, y4, x1, a4 + + MADD1 y1, y1, x3, a5 + MADD2 y2, y2, x4, a5 + MADD1 y3, y3, x3, a7 + MADD2 y4, y4, x4, a7 + + MADD3 y1, y1, x4, a6 + daddiu XX, XX, 4 * SIZE + MADD4 y2, y2, x3, a6 + daddiu AO1, AO1, 4 * SIZE + MADD3 y3, y3, x4, a8 + daddiu AO2, AO2, 4 * SIZE + MADD4 y4, y4, x3, a8 + NOP + .align 3 + +.L17: + andi I, M, 1 + blez I, .L19 + .align 3 + +.L18: + LD x1, 0 * SIZE(XX) + LD x2, 1 * SIZE(XX) + LD a1, 0 * SIZE(AO1) + LD a3, 0 * SIZE(AO2) + + MADD1 y1, y1, x1, a1 + LD a2, 1 * SIZE(AO1) + MADD2 y2, y2, x2, a1 + LD a4, 1 * SIZE(AO2) + MADD1 y3, y3, x1, a3 + MADD2 y4, y4, x2, a3 + + MADD3 y1, y1, x2, a2 + MADD4 y2, y2, x1, a2 + MADD3 y3, y3, x2, a4 + MADD4 y4, y4, x1, a4 + .align 3 + +.L19: + LD a1, 0 * SIZE(Y) + LD a2, 1 * SIZE(Y) + daddu Y, Y, INCY + LD a3, 0 * SIZE(Y) + LD a4, 1 * SIZE(Y) + daddu Y, Y, INCY + + MADD a1, a1, ALPHA_R, y1 + MADD a2, a2, ALPHA_I, y1 + MADD a3, a3, ALPHA_R, y3 + MADD a4, a4, ALPHA_I, y3 + + NMSUB a1, a1, ALPHA_I, y2 + MADD a2, a2, ALPHA_R, y2 + NMSUB a3, a3, ALPHA_I, y4 + MTC $0, y1 + MADD a4, a4, ALPHA_R, y4 + daddiu J, J, -1 + + ST a1, 0 * SIZE(YY) + ST a2, 1 * SIZE(YY) + daddu YY, YY, INCY + ST a3, 0 * SIZE(YY) + ST a4, 1 * SIZE(YY) + + bgtz J, .L11 + daddu YY, YY, INCY + .align 3 + +.L20: + andi J, N, 1 + MOV y2, y1 + blez J, .L999 + dsra I, M, 2 + + MOV y3, y1 + move AO1, A + MOV y4, y1 + + blez I, .L25 + move XX, XORIG + + LD a1, 0 * SIZE(AO1) + LD x1, 0 * SIZE(XX) + LD a2, 1 * SIZE(AO1) + LD x2, 1 * SIZE(XX) + LD a5, 2 * SIZE(AO1) + LD x4, 3 * SIZE(XX) + daddiu I, I, -1 + + blez I, .L23 + LD a6, 3 * SIZE(AO1) + .align 3 + +.L22: + MADD1 y1, y1, x1, a1 + LD x3, 2 * SIZE(XX) + MADD2 y2, y2, x2, a1 + LD a1, 4 * SIZE(AO1) + + MADD3 y3, y3, x2, a2 + LD x2, 5 * SIZE(XX) + MADD4 y4, y4, x1, a2 + LD a2, 5 * SIZE(AO1) + + MADD1 y1, y1, x3, a5 + LD x1, 4 * SIZE(XX) + MADD2 y2, y2, x4, a5 + LD a5, 6 * SIZE(AO1) + + MADD3 y3, y3, x4, a6 + LD x4, 7 * SIZE(XX) + MADD4 y4, y4, x3, a6 + LD a6, 7 * SIZE(AO1) + + MADD1 y1, y1, x1, a1 + LD x3, 6 * SIZE(XX) + MADD2 y2, y2, x2, a1 + LD a1, 8 * SIZE(AO1) + + MADD3 y3, y3, x2, a2 + LD x2, 9 * SIZE(XX) + MADD4 y4, y4, x1, a2 + LD a2, 9 * SIZE(AO1) + + MADD1 y1, y1, x3, a5 + LD x1, 8 * SIZE(XX) + MADD2 y2, y2, x4, a5 + LD a5, 10 * SIZE(AO1) + + MADD3 y3, y3, x4, a6 + LD x4, 11 * SIZE(XX) + MADD4 y4, y4, x3, a6 + LD a6, 11 * SIZE(AO1) + + daddiu I, I, -1 + daddiu XX, XX, 8 * SIZE + + bgtz I, .L22 + daddiu AO1, AO1, 8 * SIZE + .align 3 + +.L23: + MADD1 y1, y1, x1, a1 + LD x3, 2 * SIZE(XX) + MADD2 y2, y2, x2, a1 + LD a1, 4 * SIZE(AO1) + + MADD3 y3, y3, x2, a2 + LD x2, 5 * SIZE(XX) + MADD4 y4, y4, x1, a2 + LD a2, 5 * SIZE(AO1) + + MADD1 y1, y1, x3, a5 + LD x1, 4 * SIZE(XX) + MADD2 y2, y2, x4, a5 + LD a5, 6 * SIZE(AO1) + + MADD3 y3, y3, x4, a6 + LD x4, 7 * SIZE(XX) + MADD4 y4, y4, x3, a6 + LD a6, 7 * SIZE(AO1) + + MADD1 y1, y1, x1, a1 + LD x3, 6 * SIZE(XX) + MADD2 y2, y2, x2, a1 + NOP + + MADD3 y3, y3, x2, a2 + MADD4 y4, y4, x1, a2 + MADD1 y1, y1, x3, a5 + MADD2 y2, y2, x4, a5 + + MADD3 y3, y3, x4, a6 + daddiu XX, XX, 8 * SIZE + MADD4 y4, y4, x3, a6 + daddiu AO1, AO1, 8 * SIZE + NOP + .align 3 + +.L25: + andi I, M, 2 + NOP + blez I, .L27 + NOP + + LD a1, 0 * SIZE(AO1) + LD x1, 0 * SIZE(XX) + LD a2, 1 * SIZE(AO1) + LD x2, 1 * SIZE(XX) + + LD a5, 2 * SIZE(AO1) + MADD1 y1, y1, x1, a1 + LD x3, 2 * SIZE(XX) + MADD2 y2, y2, x2, a1 + LD a6, 3 * SIZE(AO1) + MADD3 y3, y3, x2, a2 + LD x4, 3 * SIZE(XX) + MADD4 y4, y4, x1, a2 + + MADD1 y1, y1, x3, a5 + MADD2 y2, y2, x4, a5 + + MADD3 y3, y3, x4, a6 + daddiu XX, XX, 4 * SIZE 
+ MADD4 y4, y4, x3, a6 + daddiu AO1, AO1, 4 * SIZE + .align 3 + +.L27: + andi I, M, 1 + blez I, .L29 + .align 3 + +.L28: + LD a1, 0 * SIZE(AO1) + LD x1, 0 * SIZE(XX) + LD a2, 1 * SIZE(AO1) + LD x2, 1 * SIZE(XX) + + MADD1 y1, y1, x1, a1 + MADD2 y2, y2, x2, a1 + + MADD3 y3, y3, x2, a2 + MADD4 y4, y4, x1, a2 + .align 3 + +.L29: + LD a1, 0 * SIZE(Y) + LD a2, 1 * SIZE(Y) + + ADD y1, y1, y3 + ADD y2, y2, y4 + + MADD a1, a1, ALPHA_R, y1 + MADD a2, a2, ALPHA_I, y1 + NMSUB a1, a1, ALPHA_I, y2 + MADD a2, a2, ALPHA_R, y2 + + ST a1, 0 * SIZE(YY) + ST a2, 1 * SIZE(YY) + .align 3 + +.L999: + LDARG $16, 0($sp) + LDARG $17, 8($sp) + +#ifndef __64BIT__ + ldc1 $f20, 16($sp) + ldc1 $f21, 24($sp) +#endif + + j $31 +#ifdef __64BIT__ + daddiu $sp, $sp, 16 +#else + daddiu $sp, $sp, 32 +#endif + + EPILOGUE diff --git a/kernel/mips64/znrm2.S b/kernel/mips64/znrm2.S new file mode 100644 index 0000000000..1f4a90eac9 --- /dev/null +++ b/kernel/mips64/znrm2.S @@ -0,0 +1,378 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 +#define X $5 +#define INCX $6 +#define XX $7 + +#define I $2 +#define TEMP $3 + +#define a1 $f4 +#define a2 $f5 +#define a3 $f6 +#define a4 $f7 +#define a5 $f8 +#define a6 $f9 +#define a7 $f10 +#define a8 $f11 + +#define t1 $f12 +#define t2 $f13 +#define t3 $f14 +#define t4 $f15 + +#define s1 $f0 +#define s2 $f1 +#define s3 $f2 +#define s4 $f3 + +#define ALPHA $f16 +#define max $f17 + + + PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + blez N, .L999 + MTC $0, s1 + + blez INCX, .L999 + dsll INCX, INCX, ZBASE_SHIFT + + move XX, X + MOV s2, s1 + + dsra I, N, 2 + MOV s3, s1 + + blez I, .L15 + MOV s4, s1 + + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + daddu X, X, INCX + LD a3, 0 * SIZE(X) + LD a4, 1 * SIZE(X) + daddu X, X, INCX + LD a5, 0 * SIZE(X) + LD a6, 1 * SIZE(X) + daddu X, X, INCX + LD a7, 0 * SIZE(X) + LD a8, 1 * SIZE(X) + daddiu I, I, -1 + + blez I, .L13 + daddu X, X, INCX + .align 3 + +.L12: + FABS t1, a1 + LD a1, 0 * SIZE(X) + FABS t2, a2 + NOP + + FABS t3, a3 + LD a2, 1 * SIZE(X) + FABS t4, a4 + daddu X, X, INCX + + CMPLT $fcc0, s1, t1 + LD a3, 0 * SIZE(X) + CMPLT $fcc1, s2, t2 + NOP + + CMPLT $fcc2, s3, t3 + LD a4, 1 * SIZE(X) + CMPLT $fcc3, s4, t4 + daddu X, X, INCX + + CMOVT s1, t1, $fcc0 + CMOVT s2, t2, $fcc1 + CMOVT s3, t3, $fcc2 + CMOVT s4, t4, $fcc3 + + FABS t1, a5 + LD a5, 0 * SIZE(X) + FABS t2, a6 + NOP + + FABS t3, a7 + LD a6, 1 * SIZE(X) + FABS t4, a8 + daddu X, X, INCX + + CMPLT $fcc0, s1, t1 + LD a7, 0 * SIZE(X) + CMPLT $fcc1, s2, t2 + NOP + + CMPLT $fcc2, s3, t3 + LD a8, 1 * SIZE(X) + CMPLT $fcc3, s4, t4 + daddu X, X, INCX + + CMOVT s1, t1, $fcc0 + daddiu I, I, -1 + + CMOVT s2, t2, $fcc1 + CMOVT s3, t3, $fcc2 + + bgtz I, .L12 + CMOVT s4, t4, $fcc3 + .align 3 + +.L13: + FABS t1, a1 + FABS t2, a2 + FABS t3, a3 + FABS t4, a4 + + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + + CMOVT s1, t1, $fcc0 + CMOVT s2, t2, $fcc1 + CMOVT s3, t3, $fcc2 + CMOVT s4, t4, $fcc3 + + FABS t1, a5 + FABS t2, a6 + FABS t3, a7 + FABS t4, a8 + + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + CMPLT $fcc2, s3, t3 + CMPLT $fcc3, s4, t4 + + CMOVT s1, t1, $fcc0 + CMOVT s2, t2, $fcc1 + CMOVT s3, t3, $fcc2 + CMOVT s4, t4, $fcc3 + .align 3 + +.L15: + andi I, N, 3 + + blez I, .L100 + NOP + .align 3 + +.L16: + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + daddiu I, I, -1 + + FABS t1, a1 + FABS t2, a2 + + CMPLT $fcc0, s1, t1 + CMPLT $fcc1, s2, t2 + + CMOVT s1, t1, $fcc0 + CMOVT s2, t2, $fcc1 + + bgtz I, .L16 + daddu X, X, INCX + .align 3 + +.L100: + CMPLT $fcc0, s1, s2 + CMPLT $fcc1, s3, s4 + + CMOVT s1, s2, $fcc0 + CMOVT s3, s4, $fcc1 + + CMPLT $fcc0, s1, s3 + CMOVT s1, s3, $fcc0 + + lui TEMP, 0x3f80 + dmtc1 $0, a1 + + mtc1 TEMP, ALPHA + CMPEQ $fcc0, s1, a1 + + bc1t $fcc0, .L999 + cvt.d.s ALPHA, ALPHA + + div.d ALPHA, ALPHA, s1 + MOV max, s1 + + MOV s1, a1 + MOV s2, a1 + MOV s3, a1 + MOV s4, a1 + + dsra I, N, 2 + blez I, .L105 + NOP + + LD a1, 0 * SIZE(XX) + LD a2, 1 * SIZE(XX) + daddu XX, XX, INCX + + LD a3, 0 * SIZE(XX) + LD a4, 1 * SIZE(XX) + daddu XX, XX, INCX + + LD a5, 0 * SIZE(XX) + LD a6, 1 * SIZE(XX) + daddu XX, XX, INCX + + LD a7, 0 * SIZE(XX) + LD a8, 1 * SIZE(XX) + daddiu I, I, -1 + + blez I, .L104 + daddu XX, XX, INCX + .align 3 + +.L103: + MUL t1, ALPHA, a1 + LD a1, 0 * SIZE(XX) + MUL t2, ALPHA, a2 + daddiu I, I, -1 + + MUL t3, ALPHA, a3 + LD a2, 1 * SIZE(XX) + MUL t4, ALPHA, a4 + daddu XX, XX, 
INCX + + MADD s1, s1, t1, t1 + LD a3, 0 * SIZE(XX) + MADD s2, s2, t2, t2 + NOP + + MADD s3, s3, t3, t3 + LD a4, 1 * SIZE(XX) + MADD s4, s4, t4, t4 + daddu XX, XX, INCX + + MUL t1, ALPHA, a5 + LD a5, 0 * SIZE(XX) + MUL t2, ALPHA, a6 + NOP + + MUL t3, ALPHA, a7 + LD a6, 1 * SIZE(XX) + MUL t4, ALPHA, a8 + daddu XX, XX, INCX + + MADD s1, s1, t1, t1 + LD a7, 0 * SIZE(XX) + MADD s2, s2, t2, t2 + LD a8, 1 * SIZE(XX) + + MADD s3, s3, t3, t3 + daddu XX, XX, INCX + bgtz I, .L103 + MADD s4, s4, t4, t4 + .align 3 + +.L104: + MUL t1, ALPHA, a1 + MUL t2, ALPHA, a2 + MUL t3, ALPHA, a3 + MUL t4, ALPHA, a4 + + MADD s1, s1, t1, t1 + MADD s2, s2, t2, t2 + MADD s3, s3, t3, t3 + MADD s4, s4, t4, t4 + + MUL t1, ALPHA, a5 + MUL t2, ALPHA, a6 + MUL t3, ALPHA, a7 + MUL t4, ALPHA, a8 + + MADD s1, s1, t1, t1 + MADD s2, s2, t2, t2 + MADD s3, s3, t3, t3 + MADD s4, s4, t4, t4 + .align 3 + +.L105: + andi I, N, 3 + + blez I, .L998 + NOP + .align 3 + +.L106: + LD a1, 0 * SIZE(XX) + LD a2, 1 * SIZE(XX) + daddiu I, I, -1 + + MUL t1, ALPHA, a1 + MUL t2, ALPHA, a2 + + MADD s1, s1, t1, t1 + daddu XX, XX, INCX + + bgtz I, .L106 + MADD s2, s2, t2, t2 + .align 3 + +.L998: + ADD s1, s1, s2 + ADD s3, s3, s4 + + ADD s1, s1, s3 + + sqrt.d s1, s1 + + j $31 + MUL s1, max, s1 + .align 3 + +.L999: + j $31 + NOP + + EPILOGUE diff --git a/kernel/mips64/zrot.S b/kernel/mips64/zrot.S new file mode 100644 index 0000000000..0a205691c8 --- /dev/null +++ b/kernel/mips64/zrot.S @@ -0,0 +1,350 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 +#define X $5 +#define INCX $6 +#define Y $7 +#define INCY $8 + +#define XX $9 +#define YY $10 + +#define C $f17 +#define S $f18 + +#define I $2 +#define TEMP $3 + +#define a1 $f4 +#define a2 $f5 +#define a3 $f6 +#define a4 $f7 + +#define b1 $f8 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 + +#define t1 $f0 +#define t2 $f1 +#define t3 $f2 +#define t4 $f3 + + PROLOGUE + + dsll INCX, INCX, ZBASE_SHIFT + li TEMP, 2 * SIZE + + blez N, .L999 + dsll INCY, INCY, ZBASE_SHIFT + + bne INCX, TEMP, .L20 + dsra I, N, 1 + + bne INCY, TEMP, .L20 + NOP + + blez I, .L15 + daddiu I, I, -1 + + LD a1, 0 * SIZE(X) + LD b1, 0 * SIZE(Y) + LD a2, 1 * SIZE(X) + LD b2, 1 * SIZE(Y) + + LD a3, 2 * SIZE(X) + LD b3, 2 * SIZE(Y) + MUL t1, S, b1 + + LD a4, 3 * SIZE(X) + MUL t2, C, b1 + LD b4, 3 * SIZE(Y) + MUL t3, S, b2 + + blez I, .L13 + MUL t4, C, b2 + .align 3 + +.L12: + MADD t1, t1, C, a1 + LD b1, 4 * SIZE(Y) + NMSUB t2, t2, S, a1 + LD a1, 4 * SIZE(X) + MADD t3, t3, C, a2 + LD b2, 5 * SIZE(Y) + NMSUB t4, t4, S, a2 + LD a2, 5 * SIZE(X) + + ST t1, 0 * SIZE(X) + MUL t1, S, b3 + ST t2, 0 * SIZE(Y) + MUL t2, C, b3 + ST t3, 1 * SIZE(X) + MUL t3, S, b4 + ST t4, 1 * SIZE(Y) + MUL t4, C, b4 + + MADD t1, t1, C, a3 + LD b3, 6 * SIZE(Y) + NMSUB t2, t2, S, a3 + LD a3, 6 * SIZE(X) + MADD t3, t3, C, a4 + LD b4, 7 * SIZE(Y) + NMSUB t4, t4, S, a4 + LD a4, 7 * SIZE(X) + + ST t1, 2 * SIZE(X) + MUL t1, S, b1 + ST t2, 2 * SIZE(Y) + MUL t2, C, b1 + ST t3, 3 * SIZE(X) + MUL t3, S, b2 + ST t4, 3 * SIZE(Y) + MUL t4, C, b2 + + daddiu I, I, -1 + daddiu X, X, 4 * SIZE + + bgtz I, .L12 + daddiu Y, Y, 4 * SIZE + .align 3 + +.L13: + MADD t1, t1, C, a1 + NMSUB t2, t2, S, a1 + MADD t3, t3, C, a2 + NMSUB t4, t4, S, a2 + + ST t1, 0 * SIZE(X) + MUL t1, S, b3 + ST t2, 0 * SIZE(Y) + MUL t2, C, b3 + ST t3, 1 * SIZE(X) + MUL t3, S, b4 + ST t4, 1 * SIZE(Y) + MUL t4, C, b4 + + MADD t1, t1, C, a3 + NMSUB t2, t2, S, a3 + MADD t3, t3, C, a4 + daddiu X, X, 4 * SIZE + NMSUB t4, t4, S, a4 + daddiu Y, Y, 4 * SIZE + + ST t1, -2 * SIZE(X) + ST t2, -2 * SIZE(Y) + ST t3, -1 * SIZE(X) + ST t4, -1 * SIZE(Y) + .align 3 + +.L15: + andi I, N, 1 + + blez I, .L999 + NOP + + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + LD b1, 0 * SIZE(Y) + LD b2, 1 * SIZE(Y) + + MUL t1, S, b1 + MUL t2, C, b1 + MUL t3, S, b2 + MUL t4, C, b2 + + MADD t1, t1, C, a1 + NMSUB t2, t2, S, a1 + MADD t3, t3, C, a2 + NMSUB t4, t4, S, a2 + + ST t1, 0 * SIZE(X) + ST t2, 0 * SIZE(Y) + ST t3, 1 * SIZE(X) + + j .L999 + ST t4, 1 * SIZE(Y) + .align 3 + +.L20: + move XX, X + move YY, Y + + blez I, .L25 + daddiu I, I, -1 + + LD a1, 0 * SIZE(X) + LD b1, 0 * SIZE(Y) + + LD a2, 1 * SIZE(X) + dadd X, X, INCX + LD b2, 1 * SIZE(Y) + dadd Y, Y, INCY + + LD a3, 0 * SIZE(X) + LD b3, 0 * SIZE(Y) + + LD a4, 1 * SIZE(X) + dadd X, X, INCX + MUL t1, S, b1 + LD b4, 1 * SIZE(Y) + MUL t2, C, b1 + dadd Y, Y, INCY + + MUL t3, S, b2 + blez I, .L23 + MUL t4, C, b2 + .align 3 + +.L22: + MADD t1, t1, C, a1 + LD b1, 0 * SIZE(Y) + NMSUB t2, t2, S, a1 + LD a1, 0 * SIZE(X) + MADD t3, t3, C, a2 + LD b2, 1 * SIZE(Y) + dadd Y, Y, INCY + NMSUB t4, t4, S, a2 + LD a2, 1 * SIZE(X) + dadd X, X, INCX + + ST t1, 0 * SIZE(XX) + MUL t1, S, b3 + ST t2, 0 * SIZE(YY) + MUL t2, C, b3 + ST t3, 1 * SIZE(XX) + dadd XX, XX, INCX + MUL t3, S, b4 + ST t4, 1 * SIZE(YY) + dadd YY, YY, INCY + MUL t4, C, b4 + + MADD t1, t1, C, a3 + LD b3, 0 * SIZE(Y) + NMSUB t2, t2, S, a3 + LD a3, 0 * SIZE(X) + MADD t3, t3, C, a4 + LD b4, 1 * 
SIZE(Y) + dadd Y, Y, INCY + NMSUB t4, t4, S, a4 + LD a4, 1 * SIZE(X) + dadd X, X, INCX + + ST t1, 0 * SIZE(XX) + MUL t1, S, b1 + ST t2, 0 * SIZE(YY) + MUL t2, C, b1 + ST t3, 1 * SIZE(XX) + dadd XX, XX, INCX + MUL t3, S, b2 + ST t4, 1 * SIZE(YY) + MUL t4, C, b2 + daddiu I, I, -1 + + bgtz I, .L22 + dadd YY, YY, INCY + .align 3 + +.L23: + MADD t1, t1, C, a1 + NMSUB t2, t2, S, a1 + MADD t3, t3, C, a2 + NMSUB t4, t4, S, a2 + + ST t1, 0 * SIZE(XX) + MUL t1, S, b3 + ST t2, 0 * SIZE(YY) + MUL t2, C, b3 + ST t3, 1 * SIZE(XX) + dadd XX, XX, INCX + MUL t3, S, b4 + ST t4, 1 * SIZE(YY) + dadd YY, YY, INCY + MUL t4, C, b4 + + MADD t1, t1, C, a3 + NMSUB t2, t2, S, a3 + MADD t3, t3, C, a4 + NMSUB t4, t4, S, a4 + + ST t1, 0 * SIZE(XX) + ST t2, 0 * SIZE(YY) + ST t3, 1 * SIZE(XX) + dadd XX, XX, INCX + ST t4, 1 * SIZE(YY) + dadd YY, YY, INCY + .align 3 + +.L25: + andi I, N, 1 + + blez I, .L999 + NOP + .align 3 + +.L26: + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + LD b1, 0 * SIZE(Y) + LD b2, 1 * SIZE(Y) + + MUL t1, S, b1 + MUL t2, C, b1 + MUL t3, S, b2 + MUL t4, C, b2 + + MADD t1, t1, C, a1 + NMSUB t2, t2, S, a1 + MADD t3, t3, C, a2 + NMSUB t4, t4, S, a2 + + ST t1, 0 * SIZE(X) + ST t2, 0 * SIZE(Y) + ST t3, 1 * SIZE(X) + ST t4, 1 * SIZE(Y) + .align 3 + +.L999: + j $31 + NOP + + EPILOGUE diff --git a/kernel/mips64/zscal.S b/kernel/mips64/zscal.S new file mode 100644 index 0000000000..3feaf5a05e --- /dev/null +++ b/kernel/mips64/zscal.S @@ -0,0 +1,441 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 + +#define X $9 +#define INCX $10 + +#define I $2 +#define TEMP $3 + +#define XX $5 + +#define ALPHA_R $f15 +#define ALPHA_I $f16 + +#define a1 $f0 +#define a2 $f1 +#define a3 $f2 +#define a4 $f3 +#define a5 $f4 +#define a6 $f5 +#define a7 $f6 +#define a8 $f7 + +#define t1 $f8 +#define t2 $f9 +#define t3 $f10 +#define t4 $f11 + + PROLOGUE + + li TEMP, 2 * SIZE + MTC $0, a1 + + blez N, .L999 + dsll INCX, INCX, ZBASE_SHIFT + + CMPEQ $fcc0, ALPHA_R, a1 + CMPEQ $fcc1, ALPHA_I, a1 + + bc1f $fcc0, .L50 + NOP + + bc1f $fcc1, .L50 + NOP + + bne INCX, TEMP, .L20 + dsra I, N, 2 + + blez I, .L15 + NOP + .align 3 + +.L12: + ST a1, 0 * SIZE(X) + ST a1, 1 * SIZE(X) + ST a1, 2 * SIZE(X) + ST a1, 3 * SIZE(X) + ST a1, 4 * SIZE(X) + ST a1, 5 * SIZE(X) + ST a1, 6 * SIZE(X) + ST a1, 7 * SIZE(X) + addiu I, I, -1 + + bgtz I, .L12 + daddiu X, X, 8 * SIZE + .align 3 + +.L15: + andi I, N, 3 + + blez I, .L999 + NOP + .align 3 + +.L16: + ST a1, 0 * SIZE(X) + ST a1, 1 * SIZE(X) + daddiu I, I, -1 + + bgtz I, .L16 + daddiu X, X, 2 * SIZE + + j $31 + NOP + .align 3 + +.L20: + dsra I, N, 2 + blez I, .L25 + NOP + .align 3 + +.L22: + ST a1, 0 * SIZE(X) + ST a1, 1 * SIZE(X) + daddu X, X, INCX + ST a1, 0 * SIZE(X) + ST a1, 1 * SIZE(X) + daddu X, X, INCX + + ST a1, 0 * SIZE(X) + ST a1, 1 * SIZE(X) + daddu X, X, INCX + ST a1, 0 * SIZE(X) + ST a1, 1 * SIZE(X) + daddiu I, I, -1 + + bgtz I, .L22 + daddu X, X, INCX + .align 3 + +.L25: + andi I, N, 3 + + blez I, .L999 + NOP + .align 3 + +.L26: + ST a1, 0 * SIZE(X) + daddiu I, I, -1 + ST a1, 1 * SIZE(X) + + bgtz I, .L26 + daddu X, X, INCX + + j $31 + NOP + .align 3 + +.L50: + bne INCX, TEMP, .L60 + dsra I, N, 2 + + blez I, .L55 + daddiu I, I, -1 + + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + LD a3, 2 * SIZE(X) + LD a4, 3 * SIZE(X) + LD a5, 4 * SIZE(X) + LD a6, 5 * SIZE(X) + MUL t1, ALPHA_R, a1 + LD a7, 6 * SIZE(X) + MUL t2, ALPHA_I, a1 + LD a8, 7 * SIZE(X) + MUL t3, ALPHA_R, a3 + + blez I, .L53 + MUL t4, ALPHA_I, a3 + .align 3 + +.L52: + NMSUB t1, t1, ALPHA_I, a2 + LD a1, 8 * SIZE(X) + MADD t2, t2, ALPHA_R, a2 + LD a2, 9 * SIZE(X) + + NMSUB t3, t3, ALPHA_I, a4 + LD a3, 10 * SIZE(X) + MADD t4, t4, ALPHA_R, a4 + LD a4, 11 * SIZE(X) + + ST t1, 0 * SIZE(X) + MUL t1, ALPHA_R, a5 + ST t2, 1 * SIZE(X) + MUL t2, ALPHA_I, a5 + + ST t3, 2 * SIZE(X) + MUL t3, ALPHA_R, a7 + ST t4, 3 * SIZE(X) + MUL t4, ALPHA_I, a7 + + NMSUB t1, t1, ALPHA_I, a6 + LD a5, 12 * SIZE(X) + MADD t2, t2, ALPHA_R, a6 + LD a6, 13 * SIZE(X) + + NMSUB t3, t3, ALPHA_I, a8 + LD a7, 14 * SIZE(X) + MADD t4, t4, ALPHA_R, a8 + LD a8, 15 * SIZE(X) + + ST t1, 4 * SIZE(X) + MUL t1, ALPHA_R, a1 + ST t2, 5 * SIZE(X) + MUL t2, ALPHA_I, a1 + ST t3, 6 * SIZE(X) + MUL t3, ALPHA_R, a3 + ST t4, 7 * SIZE(X) + MUL t4, ALPHA_I, a3 + + daddiu I, I, -1 + + bgtz I, .L52 + daddiu X, X, 8 * SIZE + .align 3 + +.L53: + NMSUB t1, t1, ALPHA_I, a2 + MADD t2, t2, ALPHA_R, a2 + NMSUB t3, t3, ALPHA_I, a4 + MADD t4, t4, ALPHA_R, a4 + + ST t1, 0 * SIZE(X) + MUL t1, ALPHA_R, a5 + ST t2, 1 * SIZE(X) + MUL t2, ALPHA_I, a5 + ST t3, 2 * SIZE(X) + MUL t3, ALPHA_R, a7 + ST t4, 3 * SIZE(X) + MUL t4, ALPHA_I, a7 + + NMSUB t1, t1, ALPHA_I, a6 + MADD t2, t2, ALPHA_R, a6 + NMSUB t3, t3, ALPHA_I, a8 + MADD t4, t4, ALPHA_R, a8 + + ST t1, 4 * SIZE(X) + ST t2, 5 * SIZE(X) + ST t3, 6 * SIZE(X) + ST t4, 7 * SIZE(X) + + daddiu X, X, 8 * SIZE + .align 3 + +.L55: + andi I, N, 3 + + blez I, .L999 + NOP + .align 3 + +.L56: + LD a1, 0 * SIZE(X) + LD 
a2, 1 * SIZE(X) + + MUL t1, ALPHA_R, a1 + MUL t2, ALPHA_I, a1 + + NMSUB t1, t1, ALPHA_I, a2 + MADD t2, t2, ALPHA_R, a2 + + daddiu X, X, 2 * SIZE + daddiu I, I, -1 + + ST t1, -2 * SIZE(X) + bgtz I, .L56 + ST t2, -1 * SIZE(X) + + j $31 + NOP + .align 3 + +.L60: + dsra I, N, 2 + move XX, X + + blez I, .L65 + daddiu I, I, -1 + + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + daddu X, X, INCX + LD a3, 0 * SIZE(X) + LD a4, 1 * SIZE(X) + daddu X, X, INCX + LD a5, 0 * SIZE(X) + LD a6, 1 * SIZE(X) + daddu X, X, INCX + + MUL t1, ALPHA_R, a1 + LD a7, 0 * SIZE(X) + MUL t2, ALPHA_I, a1 + LD a8, 1 * SIZE(X) + MUL t3, ALPHA_R, a3 + daddu X, X, INCX + + blez I, .L63 + MUL t4, ALPHA_I, a3 + .align 3 + +.L62: + NMSUB t1, t1, ALPHA_I, a2 + LD a1, 0 * SIZE(X) + MADD t2, t2, ALPHA_R, a2 + LD a2, 1 * SIZE(X) + daddu X, X, INCX + + NMSUB t3, t3, ALPHA_I, a4 + LD a3, 0 * SIZE(X) + MADD t4, t4, ALPHA_R, a4 + LD a4, 1 * SIZE(X) + daddu X, X, INCX + + ST t1, 0 * SIZE(XX) + MUL t1, ALPHA_R, a5 + + ST t2, 1 * SIZE(XX) + MUL t2, ALPHA_I, a5 + daddu XX, XX, INCX + + ST t3, 0 * SIZE(XX) + MUL t3, ALPHA_R, a7 + + ST t4, 1 * SIZE(XX) + MUL t4, ALPHA_I, a7 + daddu XX, XX, INCX + + + NMSUB t1, t1, ALPHA_I, a6 + LD a5, 0 * SIZE(X) + MADD t2, t2, ALPHA_R, a6 + LD a6, 1 * SIZE(X) + daddu X, X, INCX + + NMSUB t3, t3, ALPHA_I, a8 + LD a7, 0 * SIZE(X) + MADD t4, t4, ALPHA_R, a8 + LD a8, 1 * SIZE(X) + daddu X, X, INCX + + ST t1, 0 * SIZE(XX) + MUL t1, ALPHA_R, a1 + ST t2, 1 * SIZE(XX) + MUL t2, ALPHA_I, a1 + daddu XX, XX, INCX + + ST t3, 0 * SIZE(XX) + MUL t3, ALPHA_R, a3 + ST t4, 1 * SIZE(XX) + MUL t4, ALPHA_I, a3 + + daddiu I, I, -1 + + bgtz I, .L62 + daddu XX, XX, INCX + .align 3 + +.L63: + NMSUB t1, t1, ALPHA_I, a2 + MADD t2, t2, ALPHA_R, a2 + NMSUB t3, t3, ALPHA_I, a4 + MADD t4, t4, ALPHA_R, a4 + + ST t1, 0 * SIZE(XX) + MUL t1, ALPHA_R, a5 + ST t2, 1 * SIZE(XX) + MUL t2, ALPHA_I, a5 + daddu XX, XX, INCX + + ST t3, 0 * SIZE(XX) + MUL t3, ALPHA_R, a7 + ST t4, 1 * SIZE(XX) + MUL t4, ALPHA_I, a7 + daddu XX, XX, INCX + + NMSUB t1, t1, ALPHA_I, a6 + MADD t2, t2, ALPHA_R, a6 + NMSUB t3, t3, ALPHA_I, a8 + MADD t4, t4, ALPHA_R, a8 + + ST t1, 0 * SIZE(XX) + ST t2, 1 * SIZE(XX) + daddu XX, XX, INCX + ST t3, 0 * SIZE(XX) + ST t4, 1 * SIZE(XX) + daddu XX, XX, INCX + .align 3 + +.L65: + andi I, N, 3 + + blez I, .L999 + NOP + .align 3 + +.L66: + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + + MUL t1, ALPHA_R, a1 + MUL t2, ALPHA_I, a1 + + NMSUB t1, t1, ALPHA_I, a2 + MADD t2, t2, ALPHA_R, a2 + daddiu I, I, -1 + + ST t1, 0 * SIZE(X) + ST t2, 1 * SIZE(X) + + bgtz I, .L66 + daddu X, X, INCX + .align 3 + +.L999: + j $31 + NOP + + EPILOGUE diff --git a/kernel/mips64/zswap.S b/kernel/mips64/zswap.S new file mode 100644 index 0000000000..663da23ff8 --- /dev/null +++ b/kernel/mips64/zswap.S @@ -0,0 +1,361 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N $4 + +#define X $9 +#define INCX $10 +#define Y $11 +#define INCY $8 + +#define I $2 +#define TEMP $3 + +#define XX $5 +#define YY $6 + +#define a1 $f0 +#define a2 $f1 +#define a3 $f2 +#define a4 $f3 +#define a5 $f4 +#define a6 $f5 +#define a7 $f6 +#define a8 $f7 +#define b1 $f8 +#define b2 $f9 +#define b3 $f10 +#define b4 $f11 +#define b5 $f12 +#define b6 $f13 +#define b7 $f14 +#define b8 $f15 + + PROLOGUE + + LDARG INCY, 0($sp) + li TEMP, 2 * SIZE + + blez N, .L999 + dsll INCX, INCX, ZBASE_SHIFT + + bne INCX, TEMP, .L20 + dsll INCY, INCY, ZBASE_SHIFT + + bne INCY, TEMP, .L20 + dsra I, N, 2 + + blez I, .L15 + daddiu I, I, -1 + + LD a1, 0 * SIZE(X) + LD b1, 0 * SIZE(Y) + LD a2, 1 * SIZE(X) + LD b2, 1 * SIZE(Y) + LD a3, 2 * SIZE(X) + LD b3, 2 * SIZE(Y) + LD a4, 3 * SIZE(X) + LD b4, 3 * SIZE(Y) + LD a5, 4 * SIZE(X) + LD b5, 4 * SIZE(Y) + LD a6, 5 * SIZE(X) + LD b6, 5 * SIZE(Y) + LD a7, 6 * SIZE(X) + LD b7, 6 * SIZE(Y) + LD a8, 7 * SIZE(X) + LD b8, 7 * SIZE(Y) + + blez I, .L13 + NOP + .align 3 + +.L12: + ST a1, 0 * SIZE(Y) + LD a1, 8 * SIZE(X) + ST b1, 0 * SIZE(X) + LD b1, 8 * SIZE(Y) + + ST a2, 1 * SIZE(Y) + LD a2, 9 * SIZE(X) + ST b2, 1 * SIZE(X) + LD b2, 9 * SIZE(Y) + + ST a3, 2 * SIZE(Y) + LD a3, 10 * SIZE(X) + ST b3, 2 * SIZE(X) + LD b3, 10 * SIZE(Y) + + ST a4, 3 * SIZE(Y) + LD a4, 11 * SIZE(X) + ST b4, 3 * SIZE(X) + LD b4, 11 * SIZE(Y) + + ST a5, 4 * SIZE(Y) + LD a5, 12 * SIZE(X) + ST b5, 4 * SIZE(X) + LD b5, 12 * SIZE(Y) + + ST a6, 5 * SIZE(Y) + LD a6, 13 * SIZE(X) + ST b6, 5 * SIZE(X) + LD b6, 13 * SIZE(Y) + + ST a7, 6 * SIZE(Y) + LD a7, 14 * SIZE(X) + ST b7, 6 * SIZE(X) + LD b7, 14 * SIZE(Y) + + ST a8, 7 * SIZE(Y) + LD a8, 15 * SIZE(X) + ST b8, 7 * SIZE(X) + LD b8, 15 * SIZE(Y) + + daddiu I, I, -1 + daddiu X, X, 8 * SIZE + + bgtz I, .L12 + daddiu Y, Y, 8 * SIZE + .align 3 + +.L13: + ST a1, 0 * SIZE(Y) + ST b1, 0 * SIZE(X) + ST a2, 1 * SIZE(Y) + ST b2, 1 * SIZE(X) + ST a3, 2 * SIZE(Y) + ST b3, 2 * SIZE(X) + ST a4, 3 * SIZE(Y) + ST b4, 3 * SIZE(X) + ST a5, 4 * SIZE(Y) + ST b5, 4 * SIZE(X) + ST a6, 5 * SIZE(Y) + ST b6, 5 * SIZE(X) + ST a7, 6 * SIZE(Y) + ST b7, 6 * SIZE(X) + ST a8, 7 * SIZE(Y) + ST b8, 7 * SIZE(X) + + daddiu X, X, 8 * SIZE + daddiu Y, Y, 8 * SIZE + .align 3 + +.L15: + andi I, N, 3 + + blez I, .L999 + NOP + .align 3 + +.L16: + LD 
a1, 0 * SIZE(X) + LD b1, 0 * SIZE(Y) + LD a2, 1 * SIZE(X) + LD b2, 1 * SIZE(Y) + + daddiu X, X, 2 * SIZE + daddiu I, I, -1 + daddiu Y, Y, 2 * SIZE + + ST b1, -2 * SIZE(X) + ST b2, -1 * SIZE(X) + ST a1, -2 * SIZE(Y) + bgtz I, .L16 + ST a2, -1 * SIZE(Y) + + j .L999 + NOP + .align 3 + +.L20: + dsra I, N, 2 + move XX, X + move YY, Y + + blez I, .L25 + daddiu I, I, -1 + + LD a1, 0 * SIZE(X) + LD b1, 0 * SIZE(Y) + LD a2, 1 * SIZE(X) + LD b2, 1 * SIZE(Y) + daddu X, X, INCX + daddu Y, Y, INCY + LD a3, 0 * SIZE(X) + LD b3, 0 * SIZE(Y) + LD a4, 1 * SIZE(X) + LD b4, 1 * SIZE(Y) + daddu X, X, INCX + daddu Y, Y, INCY + LD a5, 0 * SIZE(X) + LD b5, 0 * SIZE(Y) + LD a6, 1 * SIZE(X) + LD b6, 1 * SIZE(Y) + daddu X, X, INCX + daddu Y, Y, INCY + LD a7, 0 * SIZE(X) + LD b7, 0 * SIZE(Y) + LD a8, 1 * SIZE(X) + LD b8, 1 * SIZE(Y) + daddu X, X, INCX + daddu Y, Y, INCY + + blez I, .L23 + NOP + .align 3 + +.L22: + ST a1, 0 * SIZE(YY) + LD a1, 0 * SIZE(X) + ST b1, 0 * SIZE(XX) + LD b1, 0 * SIZE(Y) + + ST a2, 1 * SIZE(YY) + daddu YY, YY, INCY + LD a2, 1 * SIZE(X) + daddu X, X, INCX + ST b2, 1 * SIZE(XX) + daddu XX, XX, INCX + LD b2, 1 * SIZE(Y) + daddu Y, Y, INCY + + ST a3, 0 * SIZE(YY) + LD a3, 0 * SIZE(X) + ST b3, 0 * SIZE(XX) + LD b3, 0 * SIZE(Y) + + ST a4, 1 * SIZE(YY) + daddu YY, YY, INCY + LD a4, 1 * SIZE(X) + daddu X, X, INCX + ST b4, 1 * SIZE(XX) + daddu XX, XX, INCX + LD b4, 1 * SIZE(Y) + daddu Y, Y, INCY + + ST a5, 0 * SIZE(YY) + LD a5, 0 * SIZE(X) + ST b5, 0 * SIZE(XX) + LD b5, 0 * SIZE(Y) + + ST a6, 1 * SIZE(YY) + daddu YY, YY, INCY + LD a6, 1 * SIZE(X) + daddu X, X, INCX + ST b6, 1 * SIZE(XX) + daddu XX, XX, INCX + LD b6, 1 * SIZE(Y) + daddu Y, Y, INCY + + ST a7, 0 * SIZE(YY) + LD a7, 0 * SIZE(X) + ST b7, 0 * SIZE(XX) + LD b7, 0 * SIZE(Y) + + ST a8, 1 * SIZE(YY) + daddu YY, YY, INCY + LD a8, 1 * SIZE(X) + daddu X, X, INCX + ST b8, 1 * SIZE(XX) + daddu XX, XX, INCX + LD b8, 1 * SIZE(Y) + daddiu I, I, -1 + + bgtz I, .L22 + daddu Y, Y, INCY + .align 3 + +.L23: + ST a1, 0 * SIZE(YY) + ST b1, 0 * SIZE(XX) + ST a2, 1 * SIZE(YY) + ST b2, 1 * SIZE(XX) + daddu YY, YY, INCY + daddu XX, XX, INCX + ST a3, 0 * SIZE(YY) + ST b3, 0 * SIZE(XX) + ST a4, 1 * SIZE(YY) + ST b4, 1 * SIZE(XX) + daddu YY, YY, INCY + daddu XX, XX, INCX + ST a5, 0 * SIZE(YY) + ST b5, 0 * SIZE(XX) + ST a6, 1 * SIZE(YY) + ST b6, 1 * SIZE(XX) + daddu YY, YY, INCY + daddu XX, XX, INCX + ST a7, 0 * SIZE(YY) + ST b7, 0 * SIZE(XX) + ST a8, 1 * SIZE(YY) + ST b8, 1 * SIZE(XX) + daddu YY, YY, INCY + daddu XX, XX, INCX + .align 3 + +.L25: + andi I, N, 3 + + blez I, .L999 + NOP + .align 3 + +.L26: + LD a1, 0 * SIZE(X) + LD a2, 1 * SIZE(X) + LD b1, 0 * SIZE(Y) + LD b2, 1 * SIZE(Y) + + daddiu I, I, -1 + ST a1, 0 * SIZE(Y) + ST a2, 1 * SIZE(Y) + daddu Y, Y, INCY + + ST b1, 0 * SIZE(X) + ST b2, 1 * SIZE(X) + + bgtz I, .L26 + daddu X, X, INCX + .align 3 + +.L999: + j $31 + NOP + + EPILOGUE diff --git a/kernel/mips64/zsymv_L.S b/kernel/mips64/zsymv_L.S new file mode 100644 index 0000000000..65d5ce31b8 --- /dev/null +++ b/kernel/mips64/zsymv_L.S @@ -0,0 +1,698 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M $4 +#define A $7 +#define LDA $8 +#define X $9 +#define INCX $10 +#define Y $11 +#define INCY $5 +#define BUFFER $6 + +#define XX $12 +#define YY $13 + +#define I $14 +#define IS $15 + +#define AO1 $16 +#define AO2 $17 + +#define Y1 $18 +#define TEMP $19 + +#define II INCX + +#define ALPHA_R $f13 +#define ALPHA_I $f14 + +#define a1 $f0 +#define a2 $f1 +#define a3 $f2 +#define a4 $f3 +#define a5 $f4 +#define a6 $f5 +#define a7 $f6 +#define a8 $f7 + +#define alpha1 $f8 +#define alpha2 $f9 +#define alpha3 $f10 +#define alpha4 $f11 + +#define x1 $f12 +#define x2 $f15 +#define x3 $f16 +#define x4 $f17 + +#define xsum1 $f18 +#define xsum2 $f19 +#define xsum3 $f20 +#define xsum4 $f21 + +#define ysum1 $f22 +#define ysum2 $f23 +#define ysum3 $f24 +#define ysum4 $f25 + +#ifndef HEMV +#define ADD1 NMSUB +#define ADD2 MADD +#else +#define ADD1 MADD +#define ADD2 NMSUB +#endif + + PROLOGUE + + LDARG INCY, 0($sp) + LDARG BUFFER, 8($sp) +#ifdef __64BIT__ + daddiu $sp, $sp, -64 +#else + daddiu $sp, $sp, -80 +#endif + + SDARG $16, 0($sp) + dsll LDA, LDA, ZBASE_SHIFT + SDARG $17, 8($sp) + dsll INCX, INCX, ZBASE_SHIFT + SDARG $18, 16($sp) + dsll INCY, INCY, ZBASE_SHIFT + SDARG $19, 24($sp) + nop + + sdc1 $f24, 32($sp) + sdc1 $f25, 40($sp) + +#ifndef __64BIT__ + sdc1 $f20, 48($sp) + sdc1 $f21, 56($sp) + sdc1 $f22, 64($sp) + sdc1 $f23, 72($sp) +#endif + + blez M, .L999 + li IS, 2 * SIZE + + beq IS, INCX, .L05 + move Y1, Y + + dsra I, M, 2 + move XX, X + + blez I, .L02 + move X, BUFFER + .align 3 + +.L01: + LD a1, 0 * SIZE(XX) + LD a2, 1 * SIZE(XX) + daddu XX, XX, INCX + LD a3, 0 * SIZE(XX) + LD a4, 1 * SIZE(XX) + daddu XX, XX, INCX + LD a5, 0 * SIZE(XX) + LD a6, 1 * SIZE(XX) + daddu XX, XX, INCX + LD a7, 0 * SIZE(XX) + LD a8, 1 * SIZE(XX) + daddu XX, XX, INCX + + ST a1, 0 * SIZE(BUFFER) + ST a2, 1 * SIZE(BUFFER) + ST a3, 2 * SIZE(BUFFER) + ST a4, 3 * SIZE(BUFFER) + ST a5, 4 * SIZE(BUFFER) + ST a6, 5 * SIZE(BUFFER) + ST a7, 6 * SIZE(BUFFER) + ST a8, 7 * SIZE(BUFFER) + + daddiu I, I, -1 + + bgtz I, .L01 + daddiu BUFFER, BUFFER, 8 * SIZE + .align 3 + 
+.L02: + andi I, M, 3 + blez I, .L05 + NOP + .align 3 + +.L03: + LD a1, 0 * SIZE(XX) + LD a2, 1 * SIZE(XX) + daddu XX, XX, INCX + + ST a1, 0 * SIZE(BUFFER) + ST a2, 1 * SIZE(BUFFER) + daddiu I, I, -1 + + bgtz I, .L03 + daddiu BUFFER, BUFFER, 2 * SIZE + .align 3 + +.L05: + beq IS, INCY, .L10 + daddiu BUFFER, BUFFER, 255 + + li TEMP, -256 + and BUFFER, BUFFER, TEMP + + dsra I, M, 2 + move Y1, BUFFER + + blez I, .L07 + move YY, Y + .align 3 + +.L06: + LD a1, 0 * SIZE(YY) + LD a2, 1 * SIZE(YY) + daddu YY, YY, INCY + LD a3, 0 * SIZE(YY) + LD a4, 1 * SIZE(YY) + daddu YY, YY, INCY + LD a5, 0 * SIZE(YY) + LD a6, 1 * SIZE(YY) + daddu YY, YY, INCY + LD a7, 0 * SIZE(YY) + LD a8, 1 * SIZE(YY) + daddu YY, YY, INCY + + ST a1, 0 * SIZE(BUFFER) + ST a2, 1 * SIZE(BUFFER) + ST a3, 2 * SIZE(BUFFER) + ST a4, 3 * SIZE(BUFFER) + ST a5, 4 * SIZE(BUFFER) + ST a6, 5 * SIZE(BUFFER) + ST a7, 6 * SIZE(BUFFER) + ST a8, 7 * SIZE(BUFFER) + daddiu I, I, -1 + + bgtz I, .L06 + daddiu BUFFER, BUFFER, 8 * SIZE + .align 3 + +.L07: + andi I, M, 3 + blez I, .L10 + NOP + .align 3 + +.L08: + LD a1, 0 * SIZE(YY) + LD a2, 1 * SIZE(YY) + daddu YY, YY, INCY + + ST a1, 0 * SIZE(BUFFER) + ST a2, 1 * SIZE(BUFFER) + daddiu I, I, -1 + + bgtz I, .L08 + daddiu BUFFER, BUFFER, 2 * SIZE + .align 3 + +.L10: + slti TEMP, M, 2 + nop + + bgtz TEMP, .L20 + li IS, 0 + .align 3 + +.L11: + dsll TEMP, IS, ZBASE_SHIFT + nop + + daddu XX, X, TEMP + daddu YY, Y1, TEMP + + LD alpha1, 0 * SIZE(XX) + LD alpha2, 1 * SIZE(XX) + LD alpha3, 2 * SIZE(XX) + LD alpha4, 3 * SIZE(XX) + + move AO1, A + daddu AO2, A, LDA + + LD a1, 0 * SIZE(AO1) + LD a2, 1 * SIZE(AO1) + LD a3, 2 * SIZE(AO1) + LD a4, 3 * SIZE(AO1) + + LD a5, 0 * SIZE(AO2) + LD a6, 1 * SIZE(AO2) + LD a7, 2 * SIZE(AO2) + LD a8, 3 * SIZE(AO2) + + MUL xsum1, alpha1, a1 + daddiu XX, XX, 4 * SIZE + MUL xsum2, alpha2, a1 + daddiu YY, YY, 4 * SIZE + MUL xsum3, alpha1, a3 + daddu A, AO2, LDA + MUL xsum4, alpha2, a3 + daddiu A, A, 4 * SIZE + +#ifndef HEMV + NMSUB xsum1, xsum1, alpha2, a2 + MADD xsum2, xsum2, alpha1, a2 +#endif + NMSUB xsum3, xsum3, alpha2, a4 + daddiu AO1, AO1, 4 * SIZE + MADD xsum4, xsum4, alpha1, a4 + daddiu AO2, AO2, 4 * SIZE + + MADD xsum1, xsum1, alpha3, a3 + MADD xsum2, xsum2, alpha4, a3 + MADD xsum3, xsum3, alpha3, a7 + MADD xsum4, xsum4, alpha4, a7 + + ADD1 xsum1, xsum1, alpha4, a4 + ADD2 xsum2, xsum2, alpha3, a4 +#ifndef HEMV + ADD1 xsum3, xsum3, alpha4, a8 + ADD2 xsum4, xsum4, alpha3, a8 +#endif + + MOV x1, alpha1 + dsubu II, M, IS + MOV x2, alpha2 + daddiu II, II, - 2 + MOV x3, alpha3 + dsra I, II, 1 + MOV x4, alpha4 + nop + + MUL alpha1, ALPHA_R, alpha1 + MUL alpha2, ALPHA_R, alpha2 + MUL alpha3, ALPHA_R, alpha3 + MUL alpha4, ALPHA_R, alpha4 + + NMSUB alpha1, alpha1, ALPHA_I, x2 + MADD alpha2, alpha2, ALPHA_I, x1 + NMSUB alpha3, alpha3, ALPHA_I, x4 + MADD alpha4, alpha4, ALPHA_I, x3 + + blez I, .L15 + daddiu I, I, -1 + + LD x1, 0 * SIZE(XX) + LD x2, 1 * SIZE(XX) + LD x4, 3 * SIZE(XX) + + LD a1, 0 * SIZE(AO1) + LD a2, 1 * SIZE(AO1) + LD a3, 2 * SIZE(AO1) + LD a4, 3 * SIZE(AO1) + + LD a5, 0 * SIZE(AO2) + LD a6, 1 * SIZE(AO2) + LD a7, 2 * SIZE(AO2) + LD a8, 3 * SIZE(AO2) + + LD ysum1, 0 * SIZE(YY) + + blez I, .L13 + LD ysum2, 1 * SIZE(YY) + .align 3 + +.L12: + MADD ysum1, ysum1, alpha1, a1 + LD ysum3, 2 * SIZE(YY) + MADD ysum2, ysum2, alpha2, a1 + LD ysum4, 3 * SIZE(YY) + MADD xsum1, xsum1, x1, a1 + LD a8, 3 * SIZE(AO2) + MADD xsum2, xsum2, x2, a1 + LD a1, 4 * SIZE(AO1) + + MADD ysum3, ysum3, alpha1, a3 + LD x3, 2 * SIZE(XX) + MADD ysum4, ysum4, alpha2, a3 + daddiu I, I, -1 + MADD 
xsum3, xsum3, x1, a5 + MADD xsum4, xsum4, x2, a5 + + NMSUB ysum1, ysum1, alpha2, a2 + MADD ysum2, ysum2, alpha1, a2 + ADD1 xsum1, xsum1, x2, a2 + daddiu AO2, AO2, 4 * SIZE + ADD2 xsum2, xsum2, x1, a2 + LD a2, 5 * SIZE(AO1) + + NMSUB ysum3, ysum3, alpha2, a4 + MADD ysum4, ysum4, alpha1, a4 + ADD1 xsum3, xsum3, x2, a6 + LD x2, 5 * SIZE(XX) + ADD2 xsum4, xsum4, x1, a6 + LD x1, 4 * SIZE(XX) + + MADD ysum1, ysum1, alpha3, a5 + MADD ysum2, ysum2, alpha4, a5 + MADD xsum1, xsum1, x3, a3 + LD a5, 0 * SIZE(AO2) + MADD xsum2, xsum2, x4, a3 + LD a3, 6 * SIZE(AO1) + + MADD ysum3, ysum3, alpha3, a7 + MADD ysum4, ysum4, alpha4, a7 + MADD xsum3, xsum3, x3, a7 + daddiu AO1, AO1, 4 * SIZE + MADD xsum4, xsum4, x4, a7 + LD a7, 2 * SIZE(AO2) + + NMSUB ysum1, ysum1, alpha4, a6 + daddiu XX, XX, 4 * SIZE + MADD ysum2, ysum2, alpha3, a6 + LD a6, 1 * SIZE(AO2) + ADD1 xsum1, xsum1, x4, a4 + daddiu YY, YY, 4 * SIZE + ADD2 xsum2, xsum2, x3, a4 + LD a4, 3 * SIZE(AO1) + + NMSUB ysum3, ysum3, alpha4, a8 + ST ysum1,-4 * SIZE(YY) + MADD ysum4, ysum4, alpha3, a8 + ST ysum2,-3 * SIZE(YY) + + LD ysum1, 0 * SIZE(YY) + LD ysum2, 1 * SIZE(YY) + + ADD1 xsum3, xsum3, x4, a8 + LD x4, 3 * SIZE(XX) + ADD2 xsum4, xsum4, x3, a8 + + ST ysum3,-2 * SIZE(YY) + bgtz I, .L12 + ST ysum4,-1 * SIZE(YY) + .align 3 + +.L13: + MADD ysum1, ysum1, alpha1, a1 + LD ysum3, 2 * SIZE(YY) + MADD ysum2, ysum2, alpha2, a1 + LD ysum4, 3 * SIZE(YY) + MADD xsum1, xsum1, x1, a1 + LD a8, 3 * SIZE(AO2) + MADD xsum2, xsum2, x2, a1 + LD x3, 2 * SIZE(XX) + + MADD ysum3, ysum3, alpha1, a3 + MADD ysum4, ysum4, alpha2, a3 + MADD xsum3, xsum3, x1, a5 + MADD xsum4, xsum4, x2, a5 + + NMSUB ysum1, ysum1, alpha2, a2 + MADD ysum2, ysum2, alpha1, a2 + ADD1 xsum1, xsum1, x2, a2 + ADD2 xsum2, xsum2, x1, a2 + + NMSUB ysum3, ysum3, alpha2, a4 + MADD ysum4, ysum4, alpha1, a4 + ADD1 xsum3, xsum3, x2, a6 + ADD2 xsum4, xsum4, x1, a6 + + MADD ysum1, ysum1, alpha3, a5 + MADD ysum2, ysum2, alpha4, a5 + MADD xsum1, xsum1, x3, a3 + MADD xsum2, xsum2, x4, a3 + + MADD ysum3, ysum3, alpha3, a7 + MADD ysum4, ysum4, alpha4, a7 + MADD xsum3, xsum3, x3, a7 + MADD xsum4, xsum4, x4, a7 + + NMSUB ysum1, ysum1, alpha4, a6 + MADD ysum2, ysum2, alpha3, a6 + ADD1 xsum1, xsum1, x4, a4 + ADD2 xsum2, xsum2, x3, a4 + + NMSUB ysum3, ysum3, alpha4, a8 + daddiu XX, XX, 4 * SIZE + MADD ysum4, ysum4, alpha3, a8 + daddiu YY, YY, 4 * SIZE + ADD1 xsum3, xsum3, x4, a8 + daddiu AO1, AO1, 4 * SIZE + ADD2 xsum4, xsum4, x3, a8 + daddiu AO2, AO2, 4 * SIZE + + ST ysum1, -4 * SIZE(YY) + ST ysum2, -3 * SIZE(YY) + ST ysum3, -2 * SIZE(YY) + ST ysum4, -1 * SIZE(YY) + .align 3 + +.L15: + andi I, M, 1 + NOP + blez I, .L16 + NOP + + LD x1, 0 * SIZE(XX) + LD x2, 1 * SIZE(XX) + + LD a1, 0 * SIZE(AO1) + LD a2, 1 * SIZE(AO1) + LD a3, 2 * SIZE(AO1) + LD a4, 3 * SIZE(AO1) + + LD a5, 0 * SIZE(AO2) + LD a6, 1 * SIZE(AO2) + LD a7, 2 * SIZE(AO2) + LD a8, 3 * SIZE(AO2) + + LD ysum1, 0 * SIZE(YY) + LD ysum2, 1 * SIZE(YY) + + MADD ysum1, ysum1, alpha1, a1 + MADD ysum2, ysum2, alpha2, a1 + MADD xsum1, xsum1, x1, a1 + MADD xsum2, xsum2, x2, a1 + + MADD xsum3, xsum3, x1, a5 + MADD xsum4, xsum4, x2, a5 + + NMSUB ysum1, ysum1, alpha2, a2 + MADD ysum2, ysum2, alpha1, a2 + ADD1 xsum1, xsum1, x2, a2 + ADD2 xsum2, xsum2, x1, a2 + + ADD1 xsum3, xsum3, x2, a6 + ADD2 xsum4, xsum4, x1, a6 + + MADD ysum1, ysum1, alpha3, a5 + MADD ysum2, ysum2, alpha4, a5 + + NMSUB ysum1, ysum1, alpha4, a6 + MADD ysum2, ysum2, alpha3, a6 + + daddiu XX, XX, 2 * SIZE + daddiu YY, YY, 2 * SIZE + daddiu AO1, AO1, 2 * SIZE + daddiu AO2, AO2, 2 * SIZE + + ST ysum1, -2 * SIZE(YY) 
+ ST ysum2, -1 * SIZE(YY) + .align 3 + +.L16: + dsll TEMP, IS, ZBASE_SHIFT + daddu TEMP, Y1, TEMP + + LD ysum1, 0 * SIZE(TEMP) + LD ysum2, 1 * SIZE(TEMP) + LD ysum3, 2 * SIZE(TEMP) + LD ysum4, 3 * SIZE(TEMP) + + MADD ysum1, ysum1, ALPHA_R, xsum1 + MADD ysum2, ysum2, ALPHA_I, xsum1 + MADD ysum3, ysum3, ALPHA_R, xsum3 + MADD ysum4, ysum4, ALPHA_I, xsum3 + + NMSUB ysum1, ysum1, ALPHA_I, xsum2 + MADD ysum2, ysum2, ALPHA_R, xsum2 + NMSUB ysum3, ysum3, ALPHA_I, xsum4 + MADD ysum4, ysum4, ALPHA_R, xsum4 + + ST ysum1, 0 * SIZE(TEMP) + ST ysum2, 1 * SIZE(TEMP) + ST ysum3, 2 * SIZE(TEMP) + ST ysum4, 3 * SIZE(TEMP) + + daddiu TEMP, IS, 4 + slt TEMP, M, TEMP + + beqz TEMP, .L11 + daddiu IS, IS, 2 + .align 3 + +.L20: + andi TEMP, M, 1 + nop + blez TEMP, .L900 + nop + + dsll TEMP, IS, ZBASE_SHIFT + nop + + daddu XX, X, TEMP + daddu YY, Y1, TEMP + + LD alpha1, 0 * SIZE(XX) + LD alpha2, 1 * SIZE(XX) + + LD a1, 0 * SIZE(A) + LD a2, 1 * SIZE(A) + + MUL xsum1, alpha1, a1 + LD ysum1, 0 * SIZE(YY) + MUL xsum2, alpha2, a1 + LD ysum2, 1 * SIZE(YY) + +#ifndef HEMV + NMSUB xsum1, xsum1, alpha2, a2 + MADD xsum2, xsum2, alpha1, a2 +#endif + + MOV x1, alpha1 + MOV x2, alpha2 + + MUL alpha1, ALPHA_R, alpha1 + MUL alpha2, ALPHA_R, alpha2 + + NMSUB alpha1, alpha1, ALPHA_I, x2 + MADD alpha2, alpha2, ALPHA_I, x1 + + MADD ysum1, ysum1, ALPHA_R, xsum1 + MADD ysum2, ysum2, ALPHA_I, xsum1 + NMSUB ysum1, ysum1, ALPHA_I, xsum2 + MADD ysum2, ysum2, ALPHA_R, xsum2 + + ST ysum1, 0 * SIZE(YY) + ST ysum2, 1 * SIZE(YY) + .align 3 + +.L900: + li IS, 2 * SIZE + NOP + + beq INCY, IS, .L999 + dsra I, M, 2 + + blez I, .L905 + NOP + .align 3 + +.L902: + LD a1, 0 * SIZE(Y1) + LD a2, 1 * SIZE(Y1) + LD a3, 2 * SIZE(Y1) + LD a4, 3 * SIZE(Y1) + LD a5, 4 * SIZE(Y1) + LD a6, 5 * SIZE(Y1) + LD a7, 6 * SIZE(Y1) + LD a8, 7 * SIZE(Y1) + + ST a1, 0 * SIZE(Y) + ST a2, 1 * SIZE(Y) + daddu Y, Y, INCY + ST a3, 0 * SIZE(Y) + ST a4, 1 * SIZE(Y) + daddu Y, Y, INCY + ST a5, 0 * SIZE(Y) + ST a6, 1 * SIZE(Y) + daddu Y, Y, INCY + ST a7, 0 * SIZE(Y) + ST a8, 1 * SIZE(Y) + daddu Y, Y, INCY + + daddiu I, I, -1 + + bgtz I, .L902 + daddiu Y1, Y1, 8 * SIZE + .align 3 + +.L905: + andi I, M, 3 + blez I, .L999 + NOP + .align 3 + +.L906: + LD a1, 0 * SIZE(Y1) + LD a2, 1 * SIZE(Y1) + daddiu Y1, Y1, 2 * SIZE + + ST a1, 0 * SIZE(Y) + ST a2, 1 * SIZE(Y) + daddiu I, I, -1 + + bgtz I, .L906 + daddu Y, Y, INCY + .align 3 + +.L999: + LDARG $16, 0($sp) + LDARG $17, 8($sp) + LDARG $18, 16($sp) + LDARG $19, 24($sp) + ldc1 $f24, 32($sp) + ldc1 $f25, 40($sp) + +#ifndef __64BIT__ + ldc1 $f20, 48($sp) + ldc1 $f21, 56($sp) + ldc1 $f22, 64($sp) + ldc1 $f23, 72($sp) +#endif + + j $31 +#ifdef __64BIT__ + daddiu $sp, $sp, 64 +#else + daddiu $sp, $sp, 80 +#endif + + EPILOGUE diff --git a/kernel/mips64/zsymv_U.S b/kernel/mips64/zsymv_U.S new file mode 100644 index 0000000000..938d9118c1 --- /dev/null +++ b/kernel/mips64/zsymv_U.S @@ -0,0 +1,717 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M $4 +#define A $7 +#define LDA $8 +#define X $9 +#define INCX $10 +#define Y $11 +#define INCY $5 +#define BUFFER $6 + +#define XX $12 +#define YY $13 + +#define I $14 +#define IS $15 + +#define AO1 $16 +#define AO2 $17 + +#define Y1 $18 +#define TEMP $19 + +#define ALPHA_R $f13 +#define ALPHA_I $f14 + +#define a1 $f0 +#define a2 $f1 +#define a3 $f2 +#define a4 $f3 +#define a5 $f4 +#define a6 $f5 +#define a7 $f6 +#define a8 $f7 + +#define alpha1 $f8 +#define alpha2 $f9 +#define alpha3 $f10 +#define alpha4 $f11 + +#define x1 $f12 +#define x2 $f15 +#define x3 $f16 +#define x4 $f17 + +#define xsum1 $f18 +#define xsum2 $f19 +#define xsum3 $f20 +#define xsum4 $f21 + +#define ysum1 $f22 +#define ysum2 $f23 +#define ysum3 $f24 +#define ysum4 $f25 + +#ifndef HEMV +#define ADD1 NMSUB +#define ADD2 MADD +#else +#define ADD1 MADD +#define ADD2 NMSUB +#endif + + PROLOGUE + + LDARG INCY, 0($sp) + LDARG BUFFER, 8($sp) +#ifdef __64BIT__ + daddiu $sp, $sp, -64 +#else + daddiu $sp, $sp, -80 +#endif + + SDARG $16, 0($sp) + dsll LDA, LDA, ZBASE_SHIFT + SDARG $17, 8($sp) + dsll INCX, INCX, ZBASE_SHIFT + SDARG $18, 16($sp) + dsll INCY, INCY, ZBASE_SHIFT + SDARG $19, 24($sp) + nop + + sdc1 $f24, 32($sp) + sdc1 $f25, 40($sp) + +#ifndef __64BIT__ + sdc1 $f20, 48($sp) + sdc1 $f21, 56($sp) + sdc1 $f22, 64($sp) + sdc1 $f23, 72($sp) +#endif + + blez M, .L999 + li IS, 2 * SIZE + + beq IS, INCX, .L05 + move Y1, Y + + dsra I, M, 2 + move XX, X + + blez I, .L02 + move X, BUFFER + .align 3 + +.L01: + LD a1, 0 * SIZE(XX) + LD a2, 1 * SIZE(XX) + daddu XX, XX, INCX + LD a3, 0 * SIZE(XX) + LD a4, 1 * SIZE(XX) + daddu XX, XX, INCX + LD a5, 0 * SIZE(XX) + LD a6, 1 * SIZE(XX) + daddu XX, XX, INCX + LD a7, 0 * SIZE(XX) + LD a8, 1 * SIZE(XX) + daddu XX, XX, INCX + + ST a1, 0 * SIZE(BUFFER) + ST a2, 1 * SIZE(BUFFER) + ST a3, 2 * SIZE(BUFFER) + ST a4, 3 * SIZE(BUFFER) + ST a5, 4 * SIZE(BUFFER) + ST a6, 5 * SIZE(BUFFER) + ST a7, 6 * SIZE(BUFFER) + ST a8, 7 * SIZE(BUFFER) + + daddiu I, I, -1 + + bgtz I, .L01 + daddiu BUFFER, BUFFER, 8 * SIZE + .align 3 + +.L02: + andi I, M, 3 
+ blez I, .L05 + NOP + .align 3 + +.L03: + LD a1, 0 * SIZE(XX) + LD a2, 1 * SIZE(XX) + daddu XX, XX, INCX + + ST a1, 0 * SIZE(BUFFER) + ST a2, 1 * SIZE(BUFFER) + daddiu I, I, -1 + + bgtz I, .L03 + daddiu BUFFER, BUFFER, 2 * SIZE + .align 3 + +.L05: + beq IS, INCY, .L10 + daddiu BUFFER, BUFFER, 255 + + li TEMP, -256 + and BUFFER, BUFFER, TEMP + + dsra I, M, 2 + move Y1, BUFFER + + blez I, .L07 + move YY, Y + .align 3 + +.L06: + LD a1, 0 * SIZE(YY) + LD a2, 1 * SIZE(YY) + daddu YY, YY, INCY + LD a3, 0 * SIZE(YY) + LD a4, 1 * SIZE(YY) + daddu YY, YY, INCY + LD a5, 0 * SIZE(YY) + LD a6, 1 * SIZE(YY) + daddu YY, YY, INCY + LD a7, 0 * SIZE(YY) + LD a8, 1 * SIZE(YY) + daddu YY, YY, INCY + + ST a1, 0 * SIZE(BUFFER) + ST a2, 1 * SIZE(BUFFER) + ST a3, 2 * SIZE(BUFFER) + ST a4, 3 * SIZE(BUFFER) + ST a5, 4 * SIZE(BUFFER) + ST a6, 5 * SIZE(BUFFER) + ST a7, 6 * SIZE(BUFFER) + ST a8, 7 * SIZE(BUFFER) + daddiu I, I, -1 + + bgtz I, .L06 + daddiu BUFFER, BUFFER, 8 * SIZE + .align 3 + +.L07: + andi I, M, 3 + blez I, .L10 + NOP + .align 3 + +.L08: + LD a1, 0 * SIZE(YY) + LD a2, 1 * SIZE(YY) + daddu YY, YY, INCY + + ST a1, 0 * SIZE(BUFFER) + ST a2, 1 * SIZE(BUFFER) + daddiu I, I, -1 + + bgtz I, .L08 + daddiu BUFFER, BUFFER, 2 * SIZE + .align 3 + +.L10: + slti TEMP, M, 2 + nop + + bgtz TEMP, .L20 + li IS, 0 + .align 3 + +.L11: + dsll TEMP, IS, ZBASE_SHIFT + daddu TEMP, X, TEMP + + LD x1, 0 * SIZE(TEMP) + LD x2, 1 * SIZE(TEMP) + LD x3, 2 * SIZE(TEMP) + LD x4, 3 * SIZE(TEMP) + + MTC $0, xsum1 + MTC $0, xsum2 + MTC $0, xsum3 + MTC $0, xsum4 + + MUL alpha1, ALPHA_R, x1 + move AO1, A + MUL alpha2, ALPHA_I, x1 + dsra I, IS, 1 + MUL alpha3, ALPHA_R, x3 + daddu AO2, A, LDA + MUL alpha4, ALPHA_I, x3 + daddu A, AO2, LDA + + NMSUB alpha1, alpha1, ALPHA_I, x2 + move XX, X + MADD alpha2, alpha2, ALPHA_R, x2 + move YY, Y1 + NMSUB alpha3, alpha3, ALPHA_I, x4 + MADD alpha4, alpha4, ALPHA_R, x4 + + blez I, .L15 + daddiu I, I, -1 + + LD x1, 0 * SIZE(XX) + LD x2, 1 * SIZE(XX) + LD x4, 3 * SIZE(XX) + + LD a1, 0 * SIZE(AO1) + LD a2, 1 * SIZE(AO1) + LD a3, 2 * SIZE(AO1) + LD a4, 3 * SIZE(AO1) + + LD a5, 0 * SIZE(AO2) + LD a6, 1 * SIZE(AO2) + LD a7, 2 * SIZE(AO2) + LD a8, 3 * SIZE(AO2) + + LD ysum1, 0 * SIZE(YY) + + blez I, .L13 + LD ysum2, 1 * SIZE(YY) + .align 3 + +.L12: + MADD ysum1, ysum1, alpha1, a1 + LD ysum3, 2 * SIZE(YY) + MADD ysum2, ysum2, alpha2, a1 + LD ysum4, 3 * SIZE(YY) + MADD xsum1, xsum1, x1, a1 + LD a8, 3 * SIZE(AO2) + MADD xsum2, xsum2, x2, a1 + LD a1, 4 * SIZE(AO1) + + MADD ysum3, ysum3, alpha1, a3 + LD x3, 2 * SIZE(XX) + MADD ysum4, ysum4, alpha2, a3 + daddiu I, I, -1 + MADD xsum3, xsum3, x1, a5 + MADD xsum4, xsum4, x2, a5 + + NMSUB ysum1, ysum1, alpha2, a2 + MADD ysum2, ysum2, alpha1, a2 + ADD1 xsum1, xsum1, x2, a2 + daddiu AO2, AO2, 4 * SIZE + ADD2 xsum2, xsum2, x1, a2 + LD a2, 5 * SIZE(AO1) + + NMSUB ysum3, ysum3, alpha2, a4 + MADD ysum4, ysum4, alpha1, a4 + ADD1 xsum3, xsum3, x2, a6 + LD x2, 5 * SIZE(XX) + ADD2 xsum4, xsum4, x1, a6 + LD x1, 4 * SIZE(XX) + + MADD ysum1, ysum1, alpha3, a5 + MADD ysum2, ysum2, alpha4, a5 + MADD xsum1, xsum1, x3, a3 + LD a5, 0 * SIZE(AO2) + MADD xsum2, xsum2, x4, a3 + LD a3, 6 * SIZE(AO1) + + MADD ysum3, ysum3, alpha3, a7 + MADD ysum4, ysum4, alpha4, a7 + MADD xsum3, xsum3, x3, a7 + daddiu AO1, AO1, 4 * SIZE + MADD xsum4, xsum4, x4, a7 + LD a7, 2 * SIZE(AO2) + + NMSUB ysum1, ysum1, alpha4, a6 + daddiu XX, XX, 4 * SIZE + MADD ysum2, ysum2, alpha3, a6 + LD a6, 1 * SIZE(AO2) + ADD1 xsum1, xsum1, x4, a4 + daddiu YY, YY, 4 * SIZE + ADD2 xsum2, xsum2, x3, a4 + LD a4, 3 * SIZE(AO1) + + 
NMSUB ysum3, ysum3, alpha4, a8 + ST ysum1,-4 * SIZE(YY) + MADD ysum4, ysum4, alpha3, a8 + ST ysum2,-3 * SIZE(YY) + + LD ysum1, 0 * SIZE(YY) + LD ysum2, 1 * SIZE(YY) + + ADD1 xsum3, xsum3, x4, a8 + LD x4, 3 * SIZE(XX) + ADD2 xsum4, xsum4, x3, a8 + + ST ysum3,-2 * SIZE(YY) + bgtz I, .L12 + ST ysum4,-1 * SIZE(YY) + .align 3 + +.L13: + MADD ysum1, ysum1, alpha1, a1 + LD ysum3, 2 * SIZE(YY) + MADD ysum2, ysum2, alpha2, a1 + LD ysum4, 3 * SIZE(YY) + MADD xsum1, xsum1, x1, a1 + LD a8, 3 * SIZE(AO2) + MADD xsum2, xsum2, x2, a1 + LD x3, 2 * SIZE(XX) + + MADD ysum3, ysum3, alpha1, a3 + MADD ysum4, ysum4, alpha2, a3 + MADD xsum3, xsum3, x1, a5 + MADD xsum4, xsum4, x2, a5 + + NMSUB ysum1, ysum1, alpha2, a2 + MADD ysum2, ysum2, alpha1, a2 + ADD1 xsum1, xsum1, x2, a2 + ADD2 xsum2, xsum2, x1, a2 + + NMSUB ysum3, ysum3, alpha2, a4 + MADD ysum4, ysum4, alpha1, a4 + ADD1 xsum3, xsum3, x2, a6 + ADD2 xsum4, xsum4, x1, a6 + + MADD ysum1, ysum1, alpha3, a5 + MADD ysum2, ysum2, alpha4, a5 + MADD xsum1, xsum1, x3, a3 + MADD xsum2, xsum2, x4, a3 + + MADD ysum3, ysum3, alpha3, a7 + MADD ysum4, ysum4, alpha4, a7 + MADD xsum3, xsum3, x3, a7 + MADD xsum4, xsum4, x4, a7 + + NMSUB ysum1, ysum1, alpha4, a6 + MADD ysum2, ysum2, alpha3, a6 + ADD1 xsum1, xsum1, x4, a4 + ADD2 xsum2, xsum2, x3, a4 + + NMSUB ysum3, ysum3, alpha4, a8 + daddiu XX, XX, 4 * SIZE + MADD ysum4, ysum4, alpha3, a8 + daddiu YY, YY, 4 * SIZE + ADD1 xsum3, xsum3, x4, a8 + daddiu AO1, AO1, 4 * SIZE + ADD2 xsum4, xsum4, x3, a8 + daddiu AO2, AO2, 4 * SIZE + + ST ysum1, -4 * SIZE(YY) + ST ysum2, -3 * SIZE(YY) + ST ysum3, -2 * SIZE(YY) + ST ysum4, -1 * SIZE(YY) + .align 3 + +.L15: + dsll TEMP, IS, ZBASE_SHIFT + daddu TEMP, Y1, TEMP + + LD ysum1, 0 * SIZE(TEMP) + LD ysum2, 1 * SIZE(TEMP) + LD ysum3, 2 * SIZE(TEMP) + LD ysum4, 3 * SIZE(TEMP) + + LD a1, 0 * SIZE(AO1) + LD a2, 1 * SIZE(AO1) + LD a3, 2 * SIZE(AO1) + LD a4, 3 * SIZE(AO1) + + LD a5, 0 * SIZE(AO2) + LD a6, 1 * SIZE(AO2) + LD a7, 2 * SIZE(AO2) + LD a8, 3 * SIZE(AO2) + + MOV x1, xsum1 + MOV x2, xsum2 + MOV x3, xsum3 + MOV x4, xsum4 + + MUL xsum1, ALPHA_R, xsum1 + MUL xsum2, ALPHA_R, xsum2 + MUL xsum3, ALPHA_R, xsum3 + MUL xsum4, ALPHA_R, xsum4 + + NMSUB xsum1, xsum1, ALPHA_I, x2 + MADD xsum2, xsum2, ALPHA_I, x1 + NMSUB xsum3, xsum3, ALPHA_I, x4 + MADD xsum4, xsum4, ALPHA_I, x3 + + MADD xsum1, xsum1, alpha1, a1 + MADD xsum2, xsum2, alpha2, a1 + MADD xsum3, xsum3, alpha1, a5 + MADD xsum4, xsum4, alpha2, a5 + +#ifndef HEMV + ADD1 xsum1, xsum1, alpha2, a2 + ADD2 xsum2, xsum2, alpha1, a2 +#endif + ADD1 xsum3, xsum3, alpha2, a6 + ADD2 xsum4, xsum4, alpha1, a6 + + MADD xsum1, xsum1, alpha3, a5 + MADD xsum2, xsum2, alpha4, a5 + MADD xsum3, xsum3, alpha3, a7 + MADD xsum4, xsum4, alpha4, a7 + + NMSUB xsum1, xsum1, alpha4, a6 + MADD xsum2, xsum2, alpha3, a6 +#ifndef HEMV + ADD1 xsum3, xsum3, alpha4, a8 + ADD2 xsum4, xsum4, alpha3, a8 +#endif + + ADD ysum1, ysum1, xsum1 + ADD ysum2, ysum2, xsum2 + ADD ysum3, ysum3, xsum3 + ADD ysum4, ysum4, xsum4 + + ST ysum1, 0 * SIZE(TEMP) + ST ysum2, 1 * SIZE(TEMP) + ST ysum3, 2 * SIZE(TEMP) + ST ysum4, 3 * SIZE(TEMP) + + daddiu TEMP, IS, 4 + slt TEMP, M, TEMP + + beqz TEMP, .L11 + daddiu IS, IS, 2 + .align 3 + +.L20: + andi TEMP, M, 1 + nop + blez TEMP, .L900 + nop + + dsll TEMP, IS, ZBASE_SHIFT + daddu TEMP, X, TEMP + + LD x1, 0 * SIZE(TEMP) + LD x2, 1 * SIZE(TEMP) + + MTC $0, xsum1 + MTC $0, xsum2 + + MUL alpha1, ALPHA_R, x1 + move AO1, A + MUL alpha2, ALPHA_I, x1 + move I, IS + daddu A, AO1, LDA + + NMSUB alpha1, alpha1, ALPHA_I, x2 + move XX, X + MADD alpha2, alpha2, 
ALPHA_R, x2 + move YY, Y1 + + blez I, .L25 + daddiu I, I, -1 + + LD x1, 0 * SIZE(XX) + LD x2, 1 * SIZE(XX) + + LD a1, 0 * SIZE(AO1) + LD a2, 1 * SIZE(AO1) + + LD ysum1, 0 * SIZE(YY) + blez I, .L23 + LD ysum2, 1 * SIZE(YY) + .align 3 + +.L22: + MADD ysum1, ysum1, alpha1, a1 + daddiu XX, XX, 2 * SIZE + MADD ysum2, ysum2, alpha2, a1 + daddiu YY, YY, 2 * SIZE + MADD xsum1, xsum1, x1, a1 + daddiu AO1, AO1, 2 * SIZE + MADD xsum2, xsum2, x2, a1 + daddiu I, I, -1 + + NMSUB ysum1, ysum1, alpha2, a2 + MADD ysum2, ysum2, alpha1, a2 + ADD1 xsum1, xsum1, x2, a2 + LD x2, 1 * SIZE(XX) + ADD2 xsum2, xsum2, x1, a2 + LD x1, 0 * SIZE(XX) + + LD a1, 0 * SIZE(AO1) + LD a2, 1 * SIZE(AO1) + + ST ysum1, -2 * SIZE(YY) + LD ysum1, 0 * SIZE(YY) + ST ysum2, -1 * SIZE(YY) + bgtz I, .L22 + LD ysum2, 1 * SIZE(YY) + .align 3 + +.L23: + MADD ysum1, ysum1, alpha1, a1 + MADD ysum2, ysum2, alpha2, a1 + MADD xsum1, xsum1, x1, a1 + MADD xsum2, xsum2, x2, a1 + + NMSUB ysum1, ysum1, alpha2, a2 + daddiu XX, XX, 2 * SIZE + MADD ysum2, ysum2, alpha1, a2 + daddiu YY, YY, 2 * SIZE + ADD1 xsum1, xsum1, x2, a2 + daddiu AO1, AO1, 2 * SIZE + ADD2 xsum2, xsum2, x1, a2 + nop + + ST ysum1, -2 * SIZE(YY) + ST ysum2, -1 * SIZE(YY) + .align 3 + +.L25: + dsll TEMP, IS, ZBASE_SHIFT + daddu TEMP, Y1, TEMP + + LD ysum1, 0 * SIZE(TEMP) + LD ysum2, 1 * SIZE(TEMP) + + LD a1, 0 * SIZE(AO1) + LD a2, 1 * SIZE(AO1) + + MOV x1, xsum1 + MOV x2, xsum2 + + MUL xsum1, ALPHA_R, xsum1 + MUL xsum2, ALPHA_R, xsum2 + + NMSUB xsum1, xsum1, ALPHA_I, x2 + MADD xsum2, xsum2, ALPHA_I, x1 + + MADD xsum1, xsum1, alpha1, a1 + MADD xsum2, xsum2, alpha2, a1 + +#ifndef HEMV + NMSUB xsum1, xsum1, alpha2, a2 + MADD xsum2, xsum2, alpha1, a2 +#endif + + ADD ysum1, ysum1, xsum1 + ADD ysum2, ysum2, xsum2 + + ST ysum1, 0 * SIZE(TEMP) + ST ysum2, 1 * SIZE(TEMP) + .align 3 + +.L900: + li IS, 2 * SIZE + + beq INCY, IS, .L999 + NOP + + dsra I, M, 2 + blez I, .L905 + NOP + .align 3 + +.L902: + LD a1, 0 * SIZE(Y1) + LD a2, 1 * SIZE(Y1) + LD a3, 2 * SIZE(Y1) + LD a4, 3 * SIZE(Y1) + LD a5, 4 * SIZE(Y1) + LD a6, 5 * SIZE(Y1) + LD a7, 6 * SIZE(Y1) + LD a8, 7 * SIZE(Y1) + + ST a1, 0 * SIZE(Y) + ST a2, 1 * SIZE(Y) + daddu Y, Y, INCY + ST a3, 0 * SIZE(Y) + ST a4, 1 * SIZE(Y) + daddu Y, Y, INCY + ST a5, 0 * SIZE(Y) + ST a6, 1 * SIZE(Y) + daddu Y, Y, INCY + ST a7, 0 * SIZE(Y) + ST a8, 1 * SIZE(Y) + daddu Y, Y, INCY + + daddiu I, I, -1 + + bgtz I, .L902 + daddiu Y1, Y1, 8 * SIZE + .align 3 + +.L905: + andi I, M, 3 + blez I, .L999 + NOP + .align 3 + +.L906: + LD a1, 0 * SIZE(Y1) + LD a2, 1 * SIZE(Y1) + daddiu Y1, Y1, 2 * SIZE + + ST a1, 0 * SIZE(Y) + ST a2, 1 * SIZE(Y) + daddiu I, I, -1 + + bgtz I, .L906 + daddu Y, Y, INCY + .align 3 + +.L999: + LDARG $16, 0($sp) + LDARG $17, 8($sp) + LDARG $18, 16($sp) + LDARG $19, 24($sp) + ldc1 $f24, 32($sp) + ldc1 $f25, 40($sp) + +#ifndef __64BIT__ + ldc1 $f20, 48($sp) + ldc1 $f21, 56($sp) + ldc1 $f22, 64($sp) + ldc1 $f23, 72($sp) +#endif + + j $31 +#ifdef __64BIT__ + daddiu $sp, $sp, 64 +#else + daddiu $sp, $sp, 80 +#endif + + EPILOGUE diff --git a/kernel/mips64/ztrsm_kernel_LT.S b/kernel/mips64/ztrsm_kernel_LT.S new file mode 100644 index 0000000000..0e7011815b --- /dev/null +++ b/kernel/mips64/ztrsm_kernel_LT.S @@ -0,0 +1,1685 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M $4 +#define N $5 +#define K $6 +#define A $9 +#define B $10 +#define C $11 +#define LDC $8 + +#define AO $12 +#define BO $13 + +#define I $2 +#define J $3 +#define L $7 + +#define CO1 $14 +#define CO2 $15 +#define CO3 $16 +#define CO4 $17 + +#define OFFSET $18 +#define KK $19 +#define TEMP $20 +#define AORIG $21 + +#define a1 $f0 +#define a2 $f1 +#define a3 $f26 +#define a4 $f27 + +#define b1 $f2 +#define b2 $f3 +#define b3 $f4 +#define b4 $f5 +#define b5 $f6 +#define b6 $f7 +#define b7 $f8 +#define b8 $f9 + +#define a5 b8 + +#define c11 $f10 +#define c12 $f11 +#define c21 $f12 +#define c22 $f13 +#define c31 $f14 +#define c32 $f15 +#define c41 $f16 +#define c42 $f17 +#define c51 $f18 +#define c52 $f19 +#define c61 $f20 +#define c62 $f21 +#define c71 $f22 +#define c72 $f23 +#define c81 $f24 +#define c82 $f25 + +#ifndef CONJ +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#define MADD5 MSUB +#define MADD6 MADD +#define MADD7 NMSUB +#define MADD8 MADD +#else +#if defined(LN) || defined(LT) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#else +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif +#define MADD5 MADD +#define MADD6 MSUB +#define MADD7 MADD +#define MADD8 NMSUB +#endif + + PROLOGUE + + daddiu $sp, $sp, -128 + + SDARG $16, 0($sp) + SDARG $17, 8($sp) + SDARG $18, 16($sp) + SDARG $19, 24($sp) + SDARG $20, 32($sp) + SDARG $21, 40($sp) + + sdc1 $f24, 48($sp) + sdc1 $f25, 56($sp) + sdc1 $f26, 64($sp) + sdc1 $f27, 72($sp) + +#ifndef __64BIT__ + sdc1 $f20, 88($sp) + sdc1 $f21, 96($sp) + sdc1 $f22,104($sp) + sdc1 $f23,112($sp) +#endif + + LDARG LDC, 128 + 0($sp) + LDARG OFFSET, 128 + 8($sp) 
+ + dsll LDC, LDC, ZBASE_SHIFT + +#ifdef LN + mult M, K + mflo TEMP + + dsll TEMP, TEMP, ZBASE_SHIFT + daddu A, A, TEMP + + dsll TEMP, M, ZBASE_SHIFT + daddu C, C, TEMP +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mult N, K + mflo TEMP + + dsll TEMP, TEMP, ZBASE_SHIFT + daddu B, B, TEMP + + mult N, LDC + mflo TEMP + daddu C, C, TEMP + + dsubu KK, N, OFFSET +#endif + + dsra J, N, 2 + blez J, .L20 + nop + +.L10: +#ifdef RT + dsll TEMP, K, 2 + ZBASE_SHIFT + dsubu B, B, TEMP + + dsll TEMP, LDC, 2 + dsubu C, C, TEMP +#endif + + move CO1, C + MTC $0, c11 + daddu CO2, C, LDC + daddu CO3, CO2, LDC + daddiu J, J, -1 + daddu CO4, CO3, LDC + MOV c21, c11 + MOV c31, c11 + MOV c41, c11 + MOV c51, c11 + move I, M + +#ifdef LN + daddu KK, M, OFFSET +#endif + +#ifdef LT + move KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + daddu C, CO4, LDC +#endif + + blez I, .L19 + MOV c61, c11 + .align 3 + +.L11: +#if defined(LT) || defined(RN) + LD a1, 0 * SIZE(AO) + MOV c71, c11 + LD b1, 0 * SIZE(B) + MOV c81, c11 + + LD a3, 4 * SIZE(AO) + MOV c12, c11 + LD b2, 1 * SIZE(B) + MOV c22, c11 + + dsra L, KK, 2 + MOV c32, c11 + LD b3, 2 * SIZE(B) + MOV c42, c11 + + LD b4, 3 * SIZE(B) + MOV c52, c11 + LD b5, 4 * SIZE(B) + MOV c62, c11 + + LD b6, 8 * SIZE(B) + MOV c72, c11 + LD b7, 12 * SIZE(B) + MOV c82, c11 + + blez L, .L15 + move BO, B +#else +#ifdef LN + dsll TEMP, K, ZBASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll L, KK, ZBASE_SHIFT + dsll TEMP, KK, 2 + ZBASE_SHIFT + + daddu AO, AORIG, L + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) + MOV c71, c11 + LD b1, 0 * SIZE(BO) + MOV c81, c11 + + LD a3, 4 * SIZE(AO) + MOV c12, c11 + LD b2, 1 * SIZE(BO) + MOV c22, c11 + + dsra L, TEMP, 2 + MOV c32, c11 + LD b3, 2 * SIZE(BO) + MOV c42, c11 + + LD b4, 3 * SIZE(BO) + MOV c52, c11 + LD b5, 4 * SIZE(BO) + MOV c62, c11 + + LD b6, 8 * SIZE(BO) + MOV c72, c11 + LD b7, 12 * SIZE(BO) + MOV c82, c11 + + blez L, .L15 + NOP +#endif + + MADD1 c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD3 c21, c21, a1, b2 + daddiu L, L, -1 + MADD1 c31, c31, a1, b3 + NOP + blez L, .L13 + MADD3 c41, c41, a1, b4 + .align 3 + +.L12: + MADD2 c12, c12, a2, b1 + LD b1, 16 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD1 c51, c51, a1, b5 + NOP + MADD3 c61, c61, a1, b2 + LD a4, 2 * SIZE(AO) + MADD1 c71, c71, a1, b3 + NOP + MADD3 c81, c81, a1, b4 + LD a1, 8 * SIZE(AO) + + MADD2 c52, c52, a2, b5 + LD b5, 20 * SIZE(BO) + MADD4 c62, c62, a2, b2 + LD b2, 9 * SIZE(BO) + MADD2 c72, c72, a2, b3 + LD b3, 10 * SIZE(BO) + MADD4 c82, c82, a2, b4 + LD b4, 11 * SIZE(BO) + + MADD1 c11, c11, a4, b6 + LD a2, 3 * SIZE(AO) + MADD3 c21, c21, a4, b2 + NOP + MADD1 c31, c31, a4, b3 + NOP + MADD3 c41, c41, a4, b4 + NOP + + MADD2 c12, c12, a2, b6 + LD b6, 24 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 13 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 14 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD1 c51, c51, a4, b7 + NOP + MADD3 c61, c61, a4, b2 + NOP + MADD1 c71, c71, a4, b3 + NOP + MADD3 c81, c81, a4, b4 + NOP + + MADD2 c52, c52, a2, b7 + LD b7, 28 * SIZE(BO) + MADD4 c62, c62, a2, b2 + LD b2, 17 * SIZE(BO) + MADD2 c72, c72, a2, b3 + LD b3, 18 * SIZE(BO) + MADD4 c82, c82, a2, b4 + LD b4, 19 * SIZE(BO) + + MADD1 c11, c11, a3, b1 + LD a2, 5 * SIZE(AO) + MADD3 c21, c21, a3, b2 + NOP + MADD1 c31, c31, a3, b3 + NOP + MADD3 c41, c41, a3, b4 + NOP + + MADD2 
c12, c12, a2, b1 + LD b1, 32 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 21 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 22 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 23 * SIZE(BO) + + MADD1 c51, c51, a3, b5 + NOP + MADD3 c61, c61, a3, b2 + LD a4, 6 * SIZE(AO) + MADD1 c71, c71, a3, b3 + NOP + MADD3 c81, c81, a3, b4 + LD a3, 12 * SIZE(AO) + + MADD2 c52, c52, a2, b5 + LD b5, 36 * SIZE(BO) + MADD4 c62, c62, a2, b2 + LD b2, 25 * SIZE(BO) + MADD2 c72, c72, a2, b3 + LD b3, 26 * SIZE(BO) + MADD4 c82, c82, a2, b4 + LD b4, 27 * SIZE(BO) + + MADD1 c11, c11, a4, b6 + LD a2, 7 * SIZE(AO) + MADD3 c21, c21, a4, b2 + NOP + MADD1 c31, c31, a4, b3 + NOP + MADD3 c41, c41, a4, b4 + daddiu L, L, -1 + + MADD2 c12, c12, a2, b6 + LD b6, 40 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 29 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 30 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 31 * SIZE(BO) + + MADD1 c51, c51, a4, b7 + daddiu BO, BO, 32 * SIZE + MADD3 c61, c61, a4, b2 + daddiu AO, AO, 8 * SIZE + MADD1 c71, c71, a4, b3 + NOP + MADD3 c81, c81, a4, b4 + NOP + + MADD2 c52, c52, a2, b7 + LD b7, 12 * SIZE(BO) + MADD4 c62, c62, a2, b2 + LD b2, 1 * SIZE(BO) + MADD2 c72, c72, a2, b3 + LD b3, 2 * SIZE(BO) + MADD4 c82, c82, a2, b4 + LD b4, 3 * SIZE(BO) + + MADD1 c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD3 c21, c21, a1, b2 + NOP + MADD1 c31, c31, a1, b3 + NOP + bgtz L, .L12 + MADD3 c41, c41, a1, b4 + .align 3 + +.L13: + MADD2 c12, c12, a2, b1 + LD b1, 16 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD1 c51, c51, a1, b5 + NOP + MADD3 c61, c61, a1, b2 + LD a4, 2 * SIZE(AO) + MADD1 c71, c71, a1, b3 + NOP + MADD3 c81, c81, a1, b4 + LD a1, 8 * SIZE(AO) + + MADD2 c52, c52, a2, b5 + LD b5, 20 * SIZE(BO) + MADD4 c62, c62, a2, b2 + LD b2, 9 * SIZE(BO) + MADD2 c72, c72, a2, b3 + LD b3, 10 * SIZE(BO) + MADD4 c82, c82, a2, b4 + LD b4, 11 * SIZE(BO) + + MADD1 c11, c11, a4, b6 + LD a2, 3 * SIZE(AO) + MADD3 c21, c21, a4, b2 + NOP + MADD1 c31, c31, a4, b3 + NOP + MADD3 c41, c41, a4, b4 + NOP + + MADD2 c12, c12, a2, b6 + LD b6, 24 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 13 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 14 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD1 c51, c51, a4, b7 + NOP + MADD3 c61, c61, a4, b2 + NOP + MADD1 c71, c71, a4, b3 + NOP + MADD3 c81, c81, a4, b4 + NOP + + MADD2 c52, c52, a2, b7 + LD b7, 28 * SIZE(BO) + MADD4 c62, c62, a2, b2 + LD b2, 17 * SIZE(BO) + MADD2 c72, c72, a2, b3 + LD b3, 18 * SIZE(BO) + MADD4 c82, c82, a2, b4 + LD b4, 19 * SIZE(BO) + + MADD1 c11, c11, a3, b1 + LD a2, 5 * SIZE(AO) + MADD3 c21, c21, a3, b2 + NOP + MADD1 c31, c31, a3, b3 + NOP + MADD3 c41, c41, a3, b4 + NOP + + MADD2 c12, c12, a2, b1 + LD b1, 32 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 21 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 22 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 23 * SIZE(BO) + + MADD1 c51, c51, a3, b5 + NOP + MADD3 c61, c61, a3, b2 + LD a4, 6 * SIZE(AO) + MADD1 c71, c71, a3, b3 + NOP + MADD3 c81, c81, a3, b4 + LD a3, 12 * SIZE(AO) + + MADD2 c52, c52, a2, b5 + LD b5, 36 * SIZE(BO) + MADD4 c62, c62, a2, b2 + LD b2, 25 * SIZE(BO) + MADD2 c72, c72, a2, b3 + LD b3, 26 * SIZE(BO) + MADD4 c82, c82, a2, b4 + LD b4, 27 * SIZE(BO) + + MADD1 c11, c11, a4, b6 + LD a2, 7 * SIZE(AO) + MADD3 c21, c21, a4, b2 + NOP + MADD1 c31, c31, a4, b3 + NOP + MADD3 c41, c41, a4, b4 + NOP + + MADD2 c12, c12, a2, b6 + LD b6, 40 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 29 * SIZE(BO) + MADD2 c32, 
c32, a2, b3 + LD b3, 30 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 31 * SIZE(BO) + + MADD1 c51, c51, a4, b7 + daddiu BO, BO, 32 * SIZE + MADD3 c61, c61, a4, b2 + daddiu AO, AO, 8 * SIZE + MADD1 c71, c71, a4, b3 + NOP + MADD3 c81, c81, a4, b4 + NOP + + MADD2 c52, c52, a2, b7 + LD b7, 12 * SIZE(BO) + MADD4 c62, c62, a2, b2 + LD b2, 1 * SIZE(BO) + MADD2 c72, c72, a2, b3 + LD b3, 2 * SIZE(BO) + MADD4 c82, c82, a2, b4 + LD b4, 3 * SIZE(BO) + .align 3 + +.L15: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + blez L, .L18 + NOP + .align 3 + +.L16: + MADD1 c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD3 c21, c21, a1, b2 + NOP + MADD1 c31, c31, a1, b3 + NOP + MADD3 c41, c41, a1, b4 + NOP + + MADD2 c12, c12, a2, b1 + LD b1, 8 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD1 c51, c51, a1, b5 + daddiu L, L, -1 + MADD3 c61, c61, a1, b2 + daddiu AO, AO, 2 * SIZE + MADD1 c71, c71, a1, b3 + daddiu BO, BO, 8 * SIZE + MADD3 c81, c81, a1, b4 + LD a1, 0 * SIZE(AO) + + MADD2 c52, c52, a2, b5 + LD b5, 4 * SIZE(BO) + MADD4 c62, c62, a2, b2 + LD b2, 1 * SIZE(BO) + MADD2 c72, c72, a2, b3 + LD b3, 2 * SIZE(BO) + MADD4 c82, c82, a2, b4 + bgtz L, .L16 + LD b4, 3 * SIZE(BO) + +.L18: + ADD c11, c11, c22 + ADD c12, c12, c21 + ADD c31, c31, c42 + ADD c32, c32, c41 + + ADD c51, c51, c62 + ADD c52, c52, c61 + ADD c71, c71, c82 + ADD c72, c72, c81 + +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -1 +#else + daddiu TEMP, KK, -4 +#endif + + dsll L, TEMP, ZBASE_SHIFT + dsll TEMP, TEMP, 2 + ZBASE_SHIFT + daddu AO, AORIG, L + daddu BO, B, TEMP +#endif + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 + SUB c51, b5, c51 + SUB c52, b6, c52 + SUB c71, b7, c71 + SUB c72, b8, c72 + +#else + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 2 * SIZE(AO) + LD b4, 3 * SIZE(AO) + LD b5, 4 * SIZE(AO) + LD b6, 5 * SIZE(AO) + LD b7, 6 * SIZE(AO) + LD b8, 7 * SIZE(AO) + + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 + SUB c51, b5, c51 + SUB c52, b6, c52 + SUB c71, b7, c71 + SUB c72, b8, c72 +#endif + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + + MUL a1, b2, c12 + MUL a2, b2, c11 + MUL a3, b2, c32 + MUL a4, b2, c31 + + MADD5 c11, a1, b1, c11 + MADD6 c12, a2, b1, c12 + MADD5 c31, a3, b1, c31 + MADD6 c32, a4, b1, c32 + + MUL a1, b2, c52 + MUL a2, b2, c51 + MUL a3, b2, c72 + MUL a4, b2, c71 + + MADD5 c51, a1, b1, c51 + MADD6 c52, a2, b1, c52 + MADD5 c71, a3, b1, c71 + MADD6 c72, a4, b1, c72 +#endif + +#ifdef RN + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + MUL a1, b2, c12 + MUL a2, b2, c11 + + MADD5 c11, a1, b1, c11 + MADD6 c12, a2, b1, c12 + + NMSUB c31, c31, b3, c11 + MADD7 c32, c32, b4, c11 + NMSUB c51, c51, b5, c11 + MADD7 c52, c52, b6, c11 + NMSUB c71, c71, b7, c11 + MADD7 c72, c72, b8, c11 + + MADD8 c31, c31, b4, c12 + NMSUB c32, c32, b3, c12 + MADD8 c51, c51, b6, c12 + NMSUB c52, c52, b5, c12 + MADD8 c71, c71, b8, c12 + NMSUB c72, c72, b7, c12 + + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + LD b5, 12 * SIZE(BO) + LD b6, 13 * SIZE(BO) + LD b7, 14 * 
SIZE(BO) + LD b8, 15 * SIZE(BO) + + MUL a1, b4, c32 + MUL a2, b4, c31 + + MADD5 c31, a1, b3, c31 + MADD6 c32, a2, b3, c32 + + NMSUB c51, c51, b5, c31 + MADD7 c52, c52, b6, c31 + NMSUB c71, c71, b7, c31 + MADD7 c72, c72, b8, c31 + + MADD8 c51, c51, b6, c32 + NMSUB c52, c52, b5, c32 + MADD8 c71, c71, b8, c32 + NMSUB c72, c72, b7, c32 + + LD b5, 20 * SIZE(BO) + LD b6, 21 * SIZE(BO) + LD b7, 22 * SIZE(BO) + LD b8, 23 * SIZE(BO) + + MUL a1, b6, c52 + MUL a2, b6, c51 + + MADD5 c51, a1, b5, c51 + MADD6 c52, a2, b5, c52 + + NMSUB c71, c71, b7, c51 + MADD7 c72, c72, b8, c51 + + MADD8 c71, c71, b8, c52 + NMSUB c72, c72, b7, c52 + + LD b7, 30 * SIZE(BO) + LD b8, 31 * SIZE(BO) + + MUL a1, b8, c72 + MUL a2, b8, c71 + + MADD5 c71, a1, b7, c71 + MADD6 c72, a2, b7, c72 +#endif + +#ifdef RT + LD b1, 30 * SIZE(BO) + LD b2, 31 * SIZE(BO) + LD b3, 28 * SIZE(BO) + LD b4, 29 * SIZE(BO) + LD b5, 26 * SIZE(BO) + LD b6, 27 * SIZE(BO) + LD b7, 24 * SIZE(BO) + LD b8, 25 * SIZE(BO) + + MUL a1, b2, c72 + MUL a2, b2, c71 + + MADD5 c71, a1, b1, c71 + MADD6 c72, a2, b1, c72 + + NMSUB c51, c51, b3, c71 + MADD7 c52, c52, b4, c71 + NMSUB c31, c31, b5, c71 + MADD7 c32, c32, b6, c71 + NMSUB c11, c11, b7, c71 + MADD7 c12, c12, b8, c71 + + MADD8 c51, c51, b4, c72 + NMSUB c52, c52, b3, c72 + MADD8 c31, c31, b6, c72 + NMSUB c32, c32, b5, c72 + MADD8 c11, c11, b8, c72 + NMSUB c12, c12, b7, c72 + + LD b3, 20 * SIZE(BO) + LD b4, 21 * SIZE(BO) + LD b5, 18 * SIZE(BO) + LD b6, 19 * SIZE(BO) + LD b7, 16 * SIZE(BO) + LD b8, 17 * SIZE(BO) + + MUL a1, b4, c52 + MUL a2, b4, c51 + + MADD5 c51, a1, b3, c51 + MADD6 c52, a2, b3, c52 + + NMSUB c31, c31, b5, c51 + MADD7 c32, c32, b6, c51 + NMSUB c11, c11, b7, c51 + MADD7 c12, c12, b8, c51 + + MADD8 c31, c31, b6, c52 + NMSUB c32, c32, b5, c52 + MADD8 c11, c11, b8, c52 + NMSUB c12, c12, b7, c52 + + LD b5, 10 * SIZE(BO) + LD b6, 11 * SIZE(BO) + LD b7, 8 * SIZE(BO) + LD b8, 9 * SIZE(BO) + + MUL a1, b6, c32 + MUL a2, b6, c31 + + MADD5 c31, a1, b5, c31 + MADD6 c32, a2, b5, c32 + + NMSUB c11, c11, b7, c31 + MADD7 c12, c12, b8, c31 + + MADD8 c11, c11, b8, c32 + NMSUB c12, c12, b7, c32 + + LD b7, 0 * SIZE(BO) + LD b8, 1 * SIZE(BO) + + MUL a1, b8, c12 + MUL a2, b8, c11 + + MADD5 c11, a1, b7, c11 + MADD6 c12, a2, b7, c12 +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) + ST c12, 1 * SIZE(BO) + ST c31, 2 * SIZE(BO) + ST c32, 3 * SIZE(BO) + ST c51, 4 * SIZE(BO) + ST c52, 5 * SIZE(BO) + ST c71, 6 * SIZE(BO) + ST c72, 7 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) + ST c12, 1 * SIZE(AO) + ST c31, 2 * SIZE(AO) + ST c32, 3 * SIZE(AO) + ST c51, 4 * SIZE(AO) + ST c52, 5 * SIZE(AO) + ST c71, 6 * SIZE(AO) + ST c72, 7 * SIZE(AO) +#endif + +#ifdef LN + daddiu CO1,CO1, -2 * SIZE + daddiu CO2,CO2, -2 * SIZE + daddiu CO3,CO3, -2 * SIZE + daddiu CO4,CO4, -2 * SIZE +#endif + + ST c11, 0 * SIZE(CO1) + ST c12, 1 * SIZE(CO1) + ST c31, 0 * SIZE(CO2) + ST c32, 1 * SIZE(CO2) + ST c51, 0 * SIZE(CO3) + ST c52, 1 * SIZE(CO3) + ST c71, 0 * SIZE(CO4) + ST c72, 1 * SIZE(CO4) + +#ifndef LN + daddiu CO1,CO1, 2 * SIZE + daddiu CO2,CO2, 2 * SIZE + daddiu CO3,CO3, 2 * SIZE + daddiu CO4,CO4, 2 * SIZE +#endif + + +#ifdef RT + dsll TEMP, K, ZBASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll L, TEMP, ZBASE_SHIFT + dsll TEMP, TEMP, 2 + ZBASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 1 +#endif + +#ifdef LN + daddiu KK, KK, -1 +#endif + + MTC $0, c11 + + daddiu I, I, -1 + + + MOV c21, c11 + MOV c31, c11 + MOV c41, c11 + MOV c51, c11 + + 
bgtz I, .L11 + MOV c61, c11 + .align 3 + +.L19: +#ifdef LN + dsll TEMP, K, 2 + ZBASE_SHIFT + daddu B, B, TEMP +#endif + +#if defined(LT) || defined(RN) + move B, BO +#endif + +#ifdef RN + daddiu KK, KK, 4 +#endif + +#ifdef RT + daddiu KK, KK, -4 +#endif + + bgtz J, .L10 + NOP + .align 3 + +.L20: + andi J, N, 2 + blez J, .L30 + NOP + +#ifdef RT + dsll TEMP, K, 1 + ZBASE_SHIFT + dsubu B, B, TEMP + + dsll TEMP, LDC, 1 + dsubu C, C, TEMP +#endif + + MTC $0, c11 + + move CO1, C + daddu CO2, C, LDC + +#ifdef LN + daddu KK, M, OFFSET +#endif + +#ifdef LT + move KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + daddu C, CO2, LDC +#endif + + move I, M + blez I, .L29 + NOP + .align 3 + +.L21: +#if defined(LT) || defined(RN) + LD a1, 0 * SIZE(AO) + MOV c21, c11 + LD b1, 0 * SIZE(B) + MOV c31, c11 + LD a3, 4 * SIZE(AO) + MOV c41, c11 + LD b2, 1 * SIZE(B) + dsra L, KK, 2 + + LD b3, 2 * SIZE(B) + MOV c12, c11 + LD b4, 3 * SIZE(B) + MOV c22, c11 + LD b5, 4 * SIZE(B) + MOV c32, c11 + + NOP + MOV c42, c11 + blez L, .L25 + move BO, B +#else +#ifdef LN + dsll TEMP, K, ZBASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll L, KK, ZBASE_SHIFT + dsll TEMP, KK, 1 + ZBASE_SHIFT + + daddu AO, AORIG, L + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) + MOV c21, c11 + LD b1, 0 * SIZE(BO) + MOV c31, c11 + LD a3, 4 * SIZE(AO) + MOV c41, c11 + LD b2, 1 * SIZE(BO) + dsra L, TEMP, 2 + + LD b3, 2 * SIZE(BO) + MOV c12, c11 + LD b4, 3 * SIZE(BO) + MOV c22, c11 + LD b5, 4 * SIZE(BO) + MOV c32, c11 + + blez L, .L25 + MOV c42, c11 +#endif + .align 3 + +.L22: + MADD1 c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD3 c21, c21, a1, b2 + daddiu L, L, -1 + MADD1 c31, c31, a1, b3 + NOP + MADD3 c41, c41, a1, b4 + LD a1, 2 * SIZE(AO) + + MADD2 c12, c12, a2, b1 + LD b1, 8 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD1 c11, c11, a1, b5 + LD a2, 3 * SIZE(AO) + MADD3 c21, c21, a1, b2 + NOP + MADD1 c31, c31, a1, b3 + NOP + MADD3 c41, c41, a1, b4 + LD a1, 8 * SIZE(AO) + + MADD2 c12, c12, a2, b5 + LD b5, 12 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 9 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 10 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 11 * SIZE(BO) + + MADD1 c11, c11, a3, b1 + LD a2, 5 * SIZE(AO) + MADD3 c21, c21, a3, b2 + NOP + MADD1 c31, c31, a3, b3 + NOP + MADD3 c41, c41, a3, b4 + LD a3, 6 * SIZE(AO) + + MADD2 c12, c12, a2, b1 + LD b1, 16 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 13 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 14 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD1 c11, c11, a3, b5 + LD a2, 7 * SIZE(AO) + MADD3 c21, c21, a3, b2 + daddiu AO, AO, 8 * SIZE + MADD1 c31, c31, a3, b3 + NOP + MADD3 c41, c41, a3, b4 + LD a3, 4 * SIZE(AO) + + MADD2 c12, c12, a2, b5 + LD b5, 20 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 17 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 18 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 19 * SIZE(BO) + + bgtz L, .L22 + daddiu BO, BO, 16 * SIZE + .align 3 + +.L25: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + blez L, .L28 + NOP + .align 3 + +.L26: + MADD1 c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD3 c21, c21, a1, b2 + daddiu L, L, -1 + MADD1 c31, c31, a1, b3 + daddiu BO, BO, 4 * SIZE + MADD3 c41, c41, a1, b4 + LD a1, 2 * SIZE(AO) + + MADD2 c12, c12, a2, b1 + LD b1, 0 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 1 * SIZE(BO) + MADD2 c32, c32, a2, b3 
+ LD b3, 2 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 3 * SIZE(BO) + + bgtz L, .L26 + daddiu AO, AO, 2 * SIZE + +.L28: + ADD c11, c11, c22 + ADD c12, c12, c21 + ADD c31, c31, c42 + ADD c32, c32, c41 + +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -1 +#else + daddiu TEMP, KK, -2 +#endif + + dsll L, TEMP, ZBASE_SHIFT + dsll TEMP, TEMP, 1 + ZBASE_SHIFT + daddu AO, AORIG, L + daddu BO, B, TEMP +#endif + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 +#else + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 2 * SIZE(AO) + LD b4, 3 * SIZE(AO) + + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 +#endif + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + + MUL a1, b2, c12 + MUL a2, b2, c11 + MUL a3, b2, c32 + MUL a4, b2, c31 + + MADD5 c11, a1, b1, c11 + MADD6 c12, a2, b1, c12 + MADD5 c31, a3, b1, c31 + MADD6 c32, a4, b1, c32 +#endif + +#ifdef RN + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + MUL a1, b2, c12 + MUL a2, b2, c11 + + MADD5 c11, a1, b1, c11 + MADD6 c12, a2, b1, c12 + + NMSUB c31, c31, b3, c11 + MADD7 c32, c32, b4, c11 + + MADD8 c31, c31, b4, c12 + NMSUB c32, c32, b3, c12 + + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + MUL a1, b4, c32 + MUL a2, b4, c31 + + MADD5 c31, a1, b3, c31 + MADD6 c32, a2, b3, c32 +#endif + +#ifdef RT + LD b5, 6 * SIZE(BO) + LD b6, 7 * SIZE(BO) + LD b7, 4 * SIZE(BO) + LD b8, 5 * SIZE(BO) + + MUL a1, b6, c32 + MUL a2, b6, c31 + + MADD5 c31, a1, b5, c31 + MADD6 c32, a2, b5, c32 + + NMSUB c11, c11, b7, c31 + MADD7 c12, c12, b8, c31 + + MADD8 c11, c11, b8, c32 + NMSUB c12, c12, b7, c32 + + LD b7, 0 * SIZE(BO) + LD b8, 1 * SIZE(BO) + + MUL a1, b8, c12 + MUL a2, b8, c11 + + MADD5 c11, a1, b7, c11 + MADD6 c12, a2, b7, c12 +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) + ST c12, 1 * SIZE(BO) + ST c31, 2 * SIZE(BO) + ST c32, 3 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) + ST c12, 1 * SIZE(AO) + ST c31, 2 * SIZE(AO) + ST c32, 3 * SIZE(AO) +#endif + +#ifdef LN + daddiu CO1,CO1, -2 * SIZE + daddiu CO2,CO2, -2 * SIZE +#endif + + ST c11, 0 * SIZE(CO1) + ST c12, 1 * SIZE(CO1) + ST c31, 0 * SIZE(CO2) + ST c32, 1 * SIZE(CO2) + +#ifndef LN + daddiu CO1,CO1, 2 * SIZE + daddiu CO2,CO2, 2 * SIZE +#endif + + MTC $0, c11 + +#ifdef RT + dsll TEMP, K, ZBASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll L, TEMP, ZBASE_SHIFT + dsll TEMP, TEMP, 1 + ZBASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 1 +#endif + +#ifdef LN + daddiu KK, KK, -1 +#endif + + daddiu I, I, -1 + + bgtz I, .L21 + NOP + .align 3 + +.L29: +#ifdef LN + dsll TEMP, K, 1 + ZBASE_SHIFT + daddu B, B, TEMP +#endif + +#if defined(LT) || defined(RN) + move B, BO +#endif + +#ifdef RN + daddiu KK, KK, 2 +#endif + +#ifdef RT + daddiu KK, KK, -2 +#endif + .align 3 + +.L30: + andi J, N, 1 + blez J, .L999 + NOP + + +#ifdef RT + dsll TEMP, K, ZBASE_SHIFT + dsubu B, B, TEMP + + dsubu C, C, LDC +#endif + + MTC $0, c11 + + move CO1, C + +#ifdef LN + daddu KK, M, OFFSET +#endif + +#ifdef LT + move KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + daddu C, CO1, LDC +#endif + + move I, M + blez I, .L39 + NOP + .align 3 + +.L31: +#if defined(LT) || defined(RN) + LD a1, 0 * SIZE(AO) + MOV c21, c11 + LD b1, 0 * 
SIZE(B) + MOV c31, c11 + LD a2, 1 * SIZE(AO) + + MOV c41, c11 + LD b2, 1 * SIZE(B) + MOV c12, c11 + dsra L, KK, 2 + + MOV c22, c11 + LD a3, 4 * SIZE(AO) + MOV c32, c11 + LD b3, 4 * SIZE(B) + + NOP + MOV c42, c11 + blez L, .L35 + move BO, B +#else +#ifdef LN + dsll TEMP, K, ZBASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + dsll TEMP, KK, ZBASE_SHIFT + + daddu AO, AORIG, TEMP + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) + MOV c21, c11 + LD b1, 0 * SIZE(BO) + MOV c31, c11 + LD a2, 1 * SIZE(AO) + + MOV c41, c11 + LD b2, 1 * SIZE(BO) + MOV c12, c11 + dsra L, TEMP, 2 + + MOV c22, c11 + LD a3, 4 * SIZE(AO) + MOV c32, c11 + LD b3, 4 * SIZE(BO) + + blez L, .L35 + MOV c42, c11 +#endif + .align 3 + +.L32: + MADD1 c11, c11, a1, b1 + LD b4, 3 * SIZE(BO) + MADD3 c21, c21, a1, b2 + LD a1, 2 * SIZE(AO) + MADD2 c12, c12, a2, b1 + LD b1, 2 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD a2, 3 * SIZE(AO) + + MADD1 c11, c11, a1, b1 + LD b2, 5 * SIZE(BO) + MADD3 c21, c21, a1, b4 + LD a1, 8 * SIZE(AO) + MADD2 c12, c12, a2, b1 + LD b1, 8 * SIZE(BO) + MADD4 c22, c22, a2, b4 + LD a2, 5 * SIZE(AO) + + MADD1 c11, c11, a3, b3 + LD b4, 7 * SIZE(BO) + MADD3 c21, c21, a3, b2 + LD a3, 6 * SIZE(AO) + MADD2 c12, c12, a2, b3 + LD b3, 6 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD a2, 7 * SIZE(AO) + + MADD1 c11, c11, a3, b3 + LD b2, 9 * SIZE(BO) + MADD3 c21, c21, a3, b4 + LD a3, 12 * SIZE(AO) + MADD2 c12, c12, a2, b3 + LD b3, 12 * SIZE(BO) + MADD4 c22, c22, a2, b4 + LD a2, 9 * SIZE(AO) + + daddiu AO, AO, 8 * SIZE + daddiu L, L, -1 + + bgtz L, .L32 + daddiu BO, BO, 8 * SIZE + .align 3 + +.L35: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + blez L, .L38 + NOP + .align 3 + +.L36: + MADD1 c11, c11, a1, b1 + daddiu L, L, -1 + MADD3 c21, c21, a1, b2 + LD a1, 2 * SIZE(AO) + MADD2 c12, c12, a2, b1 + LD b1, 2 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD a2, 3 * SIZE(AO) + + LD b2, 3 * SIZE(BO) + daddiu BO, BO, 2 * SIZE + bgtz L, .L36 + daddiu AO, AO, 2 * SIZE + +.L38: + ADD c11, c11, c22 + ADD c12, c12, c21 + +#if defined(LN) || defined(RT) + daddiu TEMP, KK, -1 + + dsll TEMP, TEMP, ZBASE_SHIFT + daddu AO, AORIG, TEMP + daddu BO, B, TEMP +#endif + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + SUB c11, b1, c11 + SUB c12, b2, c12 +#else + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + + SUB c11, b1, c11 + SUB c12, b2, c12 +#endif + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + + MUL a1, b2, c12 + MUL a2, b2, c11 + + MADD5 c11, a1, b1, c11 + MADD6 c12, a2, b1, c12 +#endif + +#if defined(RN) || defined(RT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + MUL a1, b2, c12 + MUL a2, b2, c11 + + MADD5 c11, a1, b1, c11 + MADD6 c12, a2, b1, c12 +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) + ST c12, 1 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) + ST c12, 1 * SIZE(AO) +#endif + +#ifdef LN + daddiu CO1,CO1, -2 * SIZE +#endif + + ST c11, 0 * SIZE(CO1) + ST c12, 1 * SIZE(CO1) + +#ifndef LN + daddiu CO1,CO1, 2 * SIZE +#endif + + MTC $0, c11 + +#ifdef RT + dsll TEMP, K, ZBASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll TEMP, TEMP, ZBASE_SHIFT + daddu AO, AO, TEMP + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 1 +#endif + +#ifdef LN + daddiu KK, KK, -1 +#endif + + daddiu I, I, -1 + + bgtz I, .L31 + NOP + .align 3 + +.L39: +#ifdef LN + dsll TEMP, K, ZBASE_SHIFT + daddu B, B, TEMP +#endif + +#if defined(LT) || defined(RN) + move B, BO +#endif + +#ifdef RN + daddiu 
KK, KK, 1 +#endif + +#ifdef RT + daddiu KK, KK, -1 +#endif + .align 3 + +.L999: + LDARG $16, 0($sp) + LDARG $17, 8($sp) + LDARG $18, 16($sp) + LDARG $19, 24($sp) + LDARG $20, 32($sp) + LDARG $21, 40($sp) + + ldc1 $f24, 48($sp) + ldc1 $f25, 56($sp) + ldc1 $f26, 64($sp) + ldc1 $f27, 72($sp) + +#ifndef __64BIT__ + ldc1 $f20, 88($sp) + ldc1 $f21, 96($sp) + ldc1 $f22,104($sp) + ldc1 $f23,112($sp) +#endif + + j $31 + daddiu $sp, $sp, 128 + + EPILOGUE diff --git a/kernel/mips64/ztrsm_kernel_RT.S b/kernel/mips64/ztrsm_kernel_RT.S new file mode 100644 index 0000000000..1fc268466f --- /dev/null +++ b/kernel/mips64/ztrsm_kernel_RT.S @@ -0,0 +1,1684 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M $4 +#define N $5 +#define K $6 +#define A $9 +#define B $10 +#define C $11 +#define LDC $8 + +#define AO $12 +#define BO $13 + +#define I $2 +#define J $3 +#define L $7 + +#define CO1 $14 +#define CO2 $15 +#define CO3 $16 +#define CO4 $17 + +#define OFFSET $18 +#define KK $19 +#define TEMP $20 +#define AORIG $21 + +#define a1 $f0 +#define a2 $f1 +#define a3 $f26 +#define a4 $f27 + +#define b1 $f2 +#define b2 $f3 +#define b3 $f4 +#define b4 $f5 +#define b5 $f6 +#define b6 $f7 +#define b7 $f8 +#define b8 $f9 + +#define a5 b8 + +#define c11 $f10 +#define c12 $f11 +#define c21 $f12 +#define c22 $f13 +#define c31 $f14 +#define c32 $f15 +#define c41 $f16 +#define c42 $f17 +#define c51 $f18 +#define c52 $f19 +#define c61 $f20 +#define c62 $f21 +#define c71 $f22 +#define c72 $f23 +#define c81 $f24 +#define c82 $f25 + +#ifndef CONJ +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 MADD +#define MADD4 NMSUB +#define MADD5 MSUB +#define MADD6 MADD +#define MADD7 NMSUB +#define MADD8 MADD +#else +#if defined(LN) || defined(LT) +#define MADD1 MADD +#define MADD2 NMSUB +#define MADD3 MADD +#define MADD4 MADD +#else +#define MADD1 MADD +#define MADD2 MADD +#define MADD3 NMSUB +#define MADD4 MADD +#endif +#define MADD5 MADD +#define MADD6 MSUB +#define MADD7 MADD +#define MADD8 NMSUB +#endif + + PROLOGUE + + daddiu $sp, $sp, -128 + + SDARG $16, 0($sp) + SDARG $17, 8($sp) + SDARG $18, 16($sp) + SDARG $19, 24($sp) + SDARG $20, 32($sp) + SDARG $21, 40($sp) + + sdc1 $f24, 48($sp) + sdc1 $f25, 56($sp) + sdc1 $f26, 64($sp) + sdc1 $f27, 72($sp) + +#ifndef __64BIT__ + sdc1 $f20, 88($sp) + sdc1 $f21, 96($sp) + sdc1 $f22,104($sp) + sdc1 $f23,112($sp) +#endif + + LDARG LDC, 128 + 0($sp) + LDARG OFFSET, 128 + 8($sp) + + dsll LDC, LDC, ZBASE_SHIFT + +#ifdef LN + mult M, K + mflo TEMP + + dsll TEMP, TEMP, ZBASE_SHIFT + daddu A, A, TEMP + + dsll TEMP, M, ZBASE_SHIFT + daddu C, C, TEMP +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mult N, K + mflo TEMP + + dsll TEMP, TEMP, ZBASE_SHIFT + daddu B, B, TEMP + + mult N, LDC + mflo TEMP + daddu C, C, TEMP + + dsubu KK, N, OFFSET +#endif + + andi J, N, 1 + blez J, .L20 + NOP + +#ifdef RT + dsll TEMP, K, ZBASE_SHIFT + dsubu B, B, TEMP + + dsubu C, C, LDC +#endif + + MTC $0, c11 + + move CO1, C + +#ifdef LN + daddu KK, M, OFFSET +#endif + +#ifdef LT + move KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + daddu C, CO1, LDC +#endif + + move I, M + blez I, .L39 + NOP + .align 3 + +.L31: +#if defined(LT) || defined(RN) + LD a1, 0 * SIZE(AO) + MOV c21, c11 + LD b1, 0 * SIZE(B) + MOV c31, c11 + LD a2, 1 * SIZE(AO) + + MOV c41, c11 + LD b2, 1 * SIZE(B) + MOV c12, c11 + dsra L, KK, 2 + + MOV c22, c11 + LD a3, 4 * SIZE(AO) + MOV c32, c11 + LD b3, 4 * SIZE(B) + + NOP + MOV c42, c11 + blez L, .L35 + move BO, B +#else +#ifdef LN + dsll TEMP, K, ZBASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + dsll TEMP, KK, ZBASE_SHIFT + + daddu AO, AORIG, TEMP + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) + MOV c21, c11 + LD b1, 0 * SIZE(BO) + MOV c31, c11 + LD a2, 1 * SIZE(AO) + + MOV c41, c11 + LD b2, 1 * SIZE(BO) + MOV c12, c11 + dsra L, TEMP, 2 + + MOV c22, c11 + LD a3, 4 * SIZE(AO) + MOV c32, c11 + LD b3, 4 * SIZE(BO) + + blez L, .L35 + MOV c42, c11 +#endif + .align 3 + +.L32: + MADD1 c11, c11, a1, b1 + LD b4, 3 * SIZE(BO) + MADD3 c21, c21, a1, b2 + LD a1, 2 * SIZE(AO) + 
MADD2 c12, c12, a2, b1 + LD b1, 2 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD a2, 3 * SIZE(AO) + + MADD1 c11, c11, a1, b1 + LD b2, 5 * SIZE(BO) + MADD3 c21, c21, a1, b4 + LD a1, 8 * SIZE(AO) + MADD2 c12, c12, a2, b1 + LD b1, 8 * SIZE(BO) + MADD4 c22, c22, a2, b4 + LD a2, 5 * SIZE(AO) + + MADD1 c11, c11, a3, b3 + LD b4, 7 * SIZE(BO) + MADD3 c21, c21, a3, b2 + LD a3, 6 * SIZE(AO) + MADD2 c12, c12, a2, b3 + LD b3, 6 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD a2, 7 * SIZE(AO) + + MADD1 c11, c11, a3, b3 + LD b2, 9 * SIZE(BO) + MADD3 c21, c21, a3, b4 + LD a3, 12 * SIZE(AO) + MADD2 c12, c12, a2, b3 + LD b3, 12 * SIZE(BO) + MADD4 c22, c22, a2, b4 + LD a2, 9 * SIZE(AO) + + daddiu AO, AO, 8 * SIZE + daddiu L, L, -1 + + bgtz L, .L32 + daddiu BO, BO, 8 * SIZE + .align 3 + +.L35: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + blez L, .L38 + NOP + .align 3 + +.L36: + MADD1 c11, c11, a1, b1 + daddiu L, L, -1 + MADD3 c21, c21, a1, b2 + LD a1, 2 * SIZE(AO) + MADD2 c12, c12, a2, b1 + LD b1, 2 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD a2, 3 * SIZE(AO) + + LD b2, 3 * SIZE(BO) + daddiu BO, BO, 2 * SIZE + bgtz L, .L36 + daddiu AO, AO, 2 * SIZE + +.L38: + ADD c11, c11, c22 + ADD c12, c12, c21 + +#if defined(LN) || defined(RT) + daddiu TEMP, KK, -1 + + dsll TEMP, TEMP, ZBASE_SHIFT + daddu AO, AORIG, TEMP + daddu BO, B, TEMP +#endif + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + SUB c11, b1, c11 + SUB c12, b2, c12 +#else + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + + SUB c11, b1, c11 + SUB c12, b2, c12 +#endif + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + + MUL a1, b2, c12 + MUL a2, b2, c11 + + MADD5 c11, a1, b1, c11 + MADD6 c12, a2, b1, c12 +#endif + +#if defined(RN) || defined(RT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + + MUL a1, b2, c12 + MUL a2, b2, c11 + + MADD5 c11, a1, b1, c11 + MADD6 c12, a2, b1, c12 +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) + ST c12, 1 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) + ST c12, 1 * SIZE(AO) +#endif + +#ifdef LN + daddiu CO1,CO1, -2 * SIZE +#endif + + ST c11, 0 * SIZE(CO1) + ST c12, 1 * SIZE(CO1) + +#ifndef LN + daddiu CO1,CO1, 2 * SIZE +#endif + + MTC $0, c11 + +#ifdef RT + dsll TEMP, K, ZBASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll TEMP, TEMP, ZBASE_SHIFT + daddu AO, AO, TEMP + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 1 +#endif + +#ifdef LN + daddiu KK, KK, -1 +#endif + + daddiu I, I, -1 + + bgtz I, .L31 + NOP + .align 3 + +.L39: +#ifdef LN + dsll TEMP, K, ZBASE_SHIFT + daddu B, B, TEMP +#endif + +#if defined(LT) || defined(RN) + move B, BO +#endif + +#ifdef RN + daddiu KK, KK, 1 +#endif + +#ifdef RT + daddiu KK, KK, -1 +#endif + .align 3 + +.L20: + andi J, N, 2 + blez J, .L30 + NOP + +#ifdef RT + dsll TEMP, K, 1 + ZBASE_SHIFT + dsubu B, B, TEMP + + dsll TEMP, LDC, 1 + dsubu C, C, TEMP +#endif + + MTC $0, c11 + + move CO1, C + daddu CO2, C, LDC + +#ifdef LN + daddu KK, M, OFFSET +#endif + +#ifdef LT + move KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + daddu C, CO2, LDC +#endif + + move I, M + blez I, .L29 + NOP + .align 3 + +.L21: +#if defined(LT) || defined(RN) + LD a1, 0 * SIZE(AO) + MOV c21, c11 + LD b1, 0 * SIZE(B) + MOV c31, c11 + LD a3, 4 * SIZE(AO) + MOV c41, c11 + LD b2, 1 * SIZE(B) + dsra L, KK, 2 + + LD b3, 2 * SIZE(B) + MOV c12, c11 + LD b4, 3 * SIZE(B) + MOV c22, c11 + LD b5, 4 * SIZE(B) + MOV c32, 
c11 + + NOP + MOV c42, c11 + blez L, .L25 + move BO, B +#else +#ifdef LN + dsll TEMP, K, ZBASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll L, KK, ZBASE_SHIFT + dsll TEMP, KK, 1 + ZBASE_SHIFT + + daddu AO, AORIG, L + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) + MOV c21, c11 + LD b1, 0 * SIZE(BO) + MOV c31, c11 + LD a3, 4 * SIZE(AO) + MOV c41, c11 + LD b2, 1 * SIZE(BO) + dsra L, TEMP, 2 + + LD b3, 2 * SIZE(BO) + MOV c12, c11 + LD b4, 3 * SIZE(BO) + MOV c22, c11 + LD b5, 4 * SIZE(BO) + MOV c32, c11 + + blez L, .L25 + MOV c42, c11 +#endif + .align 3 + +.L22: + MADD1 c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD3 c21, c21, a1, b2 + daddiu L, L, -1 + MADD1 c31, c31, a1, b3 + NOP + MADD3 c41, c41, a1, b4 + LD a1, 2 * SIZE(AO) + + MADD2 c12, c12, a2, b1 + LD b1, 8 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD1 c11, c11, a1, b5 + LD a2, 3 * SIZE(AO) + MADD3 c21, c21, a1, b2 + NOP + MADD1 c31, c31, a1, b3 + NOP + MADD3 c41, c41, a1, b4 + LD a1, 8 * SIZE(AO) + + MADD2 c12, c12, a2, b5 + LD b5, 12 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 9 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 10 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 11 * SIZE(BO) + + MADD1 c11, c11, a3, b1 + LD a2, 5 * SIZE(AO) + MADD3 c21, c21, a3, b2 + NOP + MADD1 c31, c31, a3, b3 + NOP + MADD3 c41, c41, a3, b4 + LD a3, 6 * SIZE(AO) + + MADD2 c12, c12, a2, b1 + LD b1, 16 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 13 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 14 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD1 c11, c11, a3, b5 + LD a2, 7 * SIZE(AO) + MADD3 c21, c21, a3, b2 + daddiu AO, AO, 8 * SIZE + MADD1 c31, c31, a3, b3 + NOP + MADD3 c41, c41, a3, b4 + LD a3, 4 * SIZE(AO) + + MADD2 c12, c12, a2, b5 + LD b5, 20 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 17 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 18 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 19 * SIZE(BO) + + bgtz L, .L22 + daddiu BO, BO, 16 * SIZE + .align 3 + +.L25: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + blez L, .L28 + NOP + .align 3 + +.L26: + MADD1 c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD3 c21, c21, a1, b2 + daddiu L, L, -1 + MADD1 c31, c31, a1, b3 + daddiu BO, BO, 4 * SIZE + MADD3 c41, c41, a1, b4 + LD a1, 2 * SIZE(AO) + + MADD2 c12, c12, a2, b1 + LD b1, 0 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 1 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 2 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 3 * SIZE(BO) + + bgtz L, .L26 + daddiu AO, AO, 2 * SIZE + +.L28: + ADD c11, c11, c22 + ADD c12, c12, c21 + ADD c31, c31, c42 + ADD c32, c32, c41 + +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -1 +#else + daddiu TEMP, KK, -2 +#endif + + dsll L, TEMP, ZBASE_SHIFT + dsll TEMP, TEMP, 1 + ZBASE_SHIFT + daddu AO, AORIG, L + daddu BO, B, TEMP +#endif + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 +#else + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 2 * SIZE(AO) + LD b4, 3 * SIZE(AO) + + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 +#endif + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + + MUL a1, b2, c12 + MUL a2, b2, c11 + MUL a3, b2, c32 + MUL a4, b2, c31 + + MADD5 c11, a1, b1, c11 + MADD6 c12, a2, b1, c12 + MADD5 c31, a3, b1, c31 + MADD6 
c32, a4, b1, c32 +#endif + +#ifdef RN + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + + MUL a1, b2, c12 + MUL a2, b2, c11 + + MADD5 c11, a1, b1, c11 + MADD6 c12, a2, b1, c12 + + NMSUB c31, c31, b3, c11 + MADD7 c32, c32, b4, c11 + + MADD8 c31, c31, b4, c12 + NMSUB c32, c32, b3, c12 + + LD b3, 6 * SIZE(BO) + LD b4, 7 * SIZE(BO) + + MUL a1, b4, c32 + MUL a2, b4, c31 + + MADD5 c31, a1, b3, c31 + MADD6 c32, a2, b3, c32 +#endif + +#ifdef RT + LD b5, 6 * SIZE(BO) + LD b6, 7 * SIZE(BO) + LD b7, 4 * SIZE(BO) + LD b8, 5 * SIZE(BO) + + MUL a1, b6, c32 + MUL a2, b6, c31 + + MADD5 c31, a1, b5, c31 + MADD6 c32, a2, b5, c32 + + NMSUB c11, c11, b7, c31 + MADD7 c12, c12, b8, c31 + + MADD8 c11, c11, b8, c32 + NMSUB c12, c12, b7, c32 + + LD b7, 0 * SIZE(BO) + LD b8, 1 * SIZE(BO) + + MUL a1, b8, c12 + MUL a2, b8, c11 + + MADD5 c11, a1, b7, c11 + MADD6 c12, a2, b7, c12 +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) + ST c12, 1 * SIZE(BO) + ST c31, 2 * SIZE(BO) + ST c32, 3 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) + ST c12, 1 * SIZE(AO) + ST c31, 2 * SIZE(AO) + ST c32, 3 * SIZE(AO) +#endif + +#ifdef LN + daddiu CO1,CO1, -2 * SIZE + daddiu CO2,CO2, -2 * SIZE +#endif + + ST c11, 0 * SIZE(CO1) + ST c12, 1 * SIZE(CO1) + ST c31, 0 * SIZE(CO2) + ST c32, 1 * SIZE(CO2) + +#ifndef LN + daddiu CO1,CO1, 2 * SIZE + daddiu CO2,CO2, 2 * SIZE +#endif + + MTC $0, c11 + +#ifdef RT + dsll TEMP, K, ZBASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll L, TEMP, ZBASE_SHIFT + dsll TEMP, TEMP, 1 + ZBASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 1 +#endif + +#ifdef LN + daddiu KK, KK, -1 +#endif + + daddiu I, I, -1 + + bgtz I, .L21 + NOP + .align 3 + +.L29: +#ifdef LN + dsll TEMP, K, 1 + ZBASE_SHIFT + daddu B, B, TEMP +#endif + +#if defined(LT) || defined(RN) + move B, BO +#endif + +#ifdef RN + daddiu KK, KK, 2 +#endif + +#ifdef RT + daddiu KK, KK, -2 +#endif + .align 3 + +.L30: + dsra J, N, 2 + blez J, .L999 + nop + +.L10: +#ifdef RT + dsll TEMP, K, 2 + ZBASE_SHIFT + dsubu B, B, TEMP + + dsll TEMP, LDC, 2 + dsubu C, C, TEMP +#endif + + move CO1, C + MTC $0, c11 + daddu CO2, C, LDC + daddu CO3, CO2, LDC + daddiu J, J, -1 + daddu CO4, CO3, LDC + MOV c21, c11 + MOV c31, c11 + MOV c41, c11 + MOV c51, c11 + move I, M + +#ifdef LN + daddu KK, M, OFFSET +#endif + +#ifdef LT + move KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + move AORIG, A +#else + move AO, A +#endif +#ifndef RT + daddu C, CO4, LDC +#endif + + blez I, .L19 + MOV c61, c11 + .align 3 + +.L11: +#if defined(LT) || defined(RN) + LD a1, 0 * SIZE(AO) + MOV c71, c11 + LD b1, 0 * SIZE(B) + MOV c81, c11 + + LD a3, 4 * SIZE(AO) + MOV c12, c11 + LD b2, 1 * SIZE(B) + MOV c22, c11 + + dsra L, KK, 2 + MOV c32, c11 + LD b3, 2 * SIZE(B) + MOV c42, c11 + + LD b4, 3 * SIZE(B) + MOV c52, c11 + LD b5, 4 * SIZE(B) + MOV c62, c11 + + LD b6, 8 * SIZE(B) + MOV c72, c11 + LD b7, 12 * SIZE(B) + MOV c82, c11 + + blez L, .L15 + move BO, B +#else +#ifdef LN + dsll TEMP, K, ZBASE_SHIFT + dsubu AORIG, AORIG, TEMP +#endif + + dsll L, KK, ZBASE_SHIFT + dsll TEMP, KK, 2 + ZBASE_SHIFT + + daddu AO, AORIG, L + daddu BO, B, TEMP + + dsubu TEMP, K, KK + + LD a1, 0 * SIZE(AO) + MOV c71, c11 + LD b1, 0 * SIZE(BO) + MOV c81, c11 + + LD a3, 4 * SIZE(AO) + MOV c12, c11 + LD b2, 1 * SIZE(BO) + MOV c22, c11 + + dsra L, TEMP, 2 + MOV c32, c11 + LD b3, 2 * SIZE(BO) + MOV c42, c11 + + LD b4, 3 * SIZE(BO) + MOV c52, c11 + LD b5, 4 * SIZE(BO) + MOV c62, c11 + + 
LD b6, 8 * SIZE(BO) + MOV c72, c11 + LD b7, 12 * SIZE(BO) + MOV c82, c11 + + blez L, .L15 + NOP +#endif + + MADD1 c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD3 c21, c21, a1, b2 + daddiu L, L, -1 + MADD1 c31, c31, a1, b3 + NOP + blez L, .L13 + MADD3 c41, c41, a1, b4 + .align 3 + +.L12: + MADD2 c12, c12, a2, b1 + LD b1, 16 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD1 c51, c51, a1, b5 + NOP + MADD3 c61, c61, a1, b2 + LD a4, 2 * SIZE(AO) + MADD1 c71, c71, a1, b3 + NOP + MADD3 c81, c81, a1, b4 + LD a1, 8 * SIZE(AO) + + MADD2 c52, c52, a2, b5 + LD b5, 20 * SIZE(BO) + MADD4 c62, c62, a2, b2 + LD b2, 9 * SIZE(BO) + MADD2 c72, c72, a2, b3 + LD b3, 10 * SIZE(BO) + MADD4 c82, c82, a2, b4 + LD b4, 11 * SIZE(BO) + + MADD1 c11, c11, a4, b6 + LD a2, 3 * SIZE(AO) + MADD3 c21, c21, a4, b2 + NOP + MADD1 c31, c31, a4, b3 + NOP + MADD3 c41, c41, a4, b4 + NOP + + MADD2 c12, c12, a2, b6 + LD b6, 24 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 13 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 14 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD1 c51, c51, a4, b7 + NOP + MADD3 c61, c61, a4, b2 + NOP + MADD1 c71, c71, a4, b3 + NOP + MADD3 c81, c81, a4, b4 + NOP + + MADD2 c52, c52, a2, b7 + LD b7, 28 * SIZE(BO) + MADD4 c62, c62, a2, b2 + LD b2, 17 * SIZE(BO) + MADD2 c72, c72, a2, b3 + LD b3, 18 * SIZE(BO) + MADD4 c82, c82, a2, b4 + LD b4, 19 * SIZE(BO) + + MADD1 c11, c11, a3, b1 + LD a2, 5 * SIZE(AO) + MADD3 c21, c21, a3, b2 + NOP + MADD1 c31, c31, a3, b3 + NOP + MADD3 c41, c41, a3, b4 + NOP + + MADD2 c12, c12, a2, b1 + LD b1, 32 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 21 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 22 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 23 * SIZE(BO) + + MADD1 c51, c51, a3, b5 + NOP + MADD3 c61, c61, a3, b2 + LD a4, 6 * SIZE(AO) + MADD1 c71, c71, a3, b3 + NOP + MADD3 c81, c81, a3, b4 + LD a3, 12 * SIZE(AO) + + MADD2 c52, c52, a2, b5 + LD b5, 36 * SIZE(BO) + MADD4 c62, c62, a2, b2 + LD b2, 25 * SIZE(BO) + MADD2 c72, c72, a2, b3 + LD b3, 26 * SIZE(BO) + MADD4 c82, c82, a2, b4 + LD b4, 27 * SIZE(BO) + + MADD1 c11, c11, a4, b6 + LD a2, 7 * SIZE(AO) + MADD3 c21, c21, a4, b2 + NOP + MADD1 c31, c31, a4, b3 + NOP + MADD3 c41, c41, a4, b4 + daddiu L, L, -1 + + MADD2 c12, c12, a2, b6 + LD b6, 40 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 29 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 30 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 31 * SIZE(BO) + + MADD1 c51, c51, a4, b7 + daddiu BO, BO, 32 * SIZE + MADD3 c61, c61, a4, b2 + daddiu AO, AO, 8 * SIZE + MADD1 c71, c71, a4, b3 + NOP + MADD3 c81, c81, a4, b4 + NOP + + MADD2 c52, c52, a2, b7 + LD b7, 12 * SIZE(BO) + MADD4 c62, c62, a2, b2 + LD b2, 1 * SIZE(BO) + MADD2 c72, c72, a2, b3 + LD b3, 2 * SIZE(BO) + MADD4 c82, c82, a2, b4 + LD b4, 3 * SIZE(BO) + + MADD1 c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD3 c21, c21, a1, b2 + NOP + MADD1 c31, c31, a1, b3 + NOP + bgtz L, .L12 + MADD3 c41, c41, a1, b4 + .align 3 + +.L13: + MADD2 c12, c12, a2, b1 + LD b1, 16 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD1 c51, c51, a1, b5 + NOP + MADD3 c61, c61, a1, b2 + LD a4, 2 * SIZE(AO) + MADD1 c71, c71, a1, b3 + NOP + MADD3 c81, c81, a1, b4 + LD a1, 8 * SIZE(AO) + + MADD2 c52, c52, a2, b5 + LD b5, 20 * SIZE(BO) + MADD4 c62, c62, a2, b2 + LD b2, 9 * SIZE(BO) + MADD2 c72, c72, a2, b3 + LD b3, 10 * SIZE(BO) + MADD4 c82, 
c82, a2, b4 + LD b4, 11 * SIZE(BO) + + MADD1 c11, c11, a4, b6 + LD a2, 3 * SIZE(AO) + MADD3 c21, c21, a4, b2 + NOP + MADD1 c31, c31, a4, b3 + NOP + MADD3 c41, c41, a4, b4 + NOP + + MADD2 c12, c12, a2, b6 + LD b6, 24 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 13 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 14 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 15 * SIZE(BO) + + MADD1 c51, c51, a4, b7 + NOP + MADD3 c61, c61, a4, b2 + NOP + MADD1 c71, c71, a4, b3 + NOP + MADD3 c81, c81, a4, b4 + NOP + + MADD2 c52, c52, a2, b7 + LD b7, 28 * SIZE(BO) + MADD4 c62, c62, a2, b2 + LD b2, 17 * SIZE(BO) + MADD2 c72, c72, a2, b3 + LD b3, 18 * SIZE(BO) + MADD4 c82, c82, a2, b4 + LD b4, 19 * SIZE(BO) + + MADD1 c11, c11, a3, b1 + LD a2, 5 * SIZE(AO) + MADD3 c21, c21, a3, b2 + NOP + MADD1 c31, c31, a3, b3 + NOP + MADD3 c41, c41, a3, b4 + NOP + + MADD2 c12, c12, a2, b1 + LD b1, 32 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 21 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 22 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 23 * SIZE(BO) + + MADD1 c51, c51, a3, b5 + NOP + MADD3 c61, c61, a3, b2 + LD a4, 6 * SIZE(AO) + MADD1 c71, c71, a3, b3 + NOP + MADD3 c81, c81, a3, b4 + LD a3, 12 * SIZE(AO) + + MADD2 c52, c52, a2, b5 + LD b5, 36 * SIZE(BO) + MADD4 c62, c62, a2, b2 + LD b2, 25 * SIZE(BO) + MADD2 c72, c72, a2, b3 + LD b3, 26 * SIZE(BO) + MADD4 c82, c82, a2, b4 + LD b4, 27 * SIZE(BO) + + MADD1 c11, c11, a4, b6 + LD a2, 7 * SIZE(AO) + MADD3 c21, c21, a4, b2 + NOP + MADD1 c31, c31, a4, b3 + NOP + MADD3 c41, c41, a4, b4 + NOP + + MADD2 c12, c12, a2, b6 + LD b6, 40 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 29 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 30 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 31 * SIZE(BO) + + MADD1 c51, c51, a4, b7 + daddiu BO, BO, 32 * SIZE + MADD3 c61, c61, a4, b2 + daddiu AO, AO, 8 * SIZE + MADD1 c71, c71, a4, b3 + NOP + MADD3 c81, c81, a4, b4 + NOP + + MADD2 c52, c52, a2, b7 + LD b7, 12 * SIZE(BO) + MADD4 c62, c62, a2, b2 + LD b2, 1 * SIZE(BO) + MADD2 c72, c72, a2, b3 + LD b3, 2 * SIZE(BO) + MADD4 c82, c82, a2, b4 + LD b4, 3 * SIZE(BO) + .align 3 + +.L15: +#if defined(LT) || defined(RN) + andi L, KK, 3 +#else + andi L, TEMP, 3 +#endif + blez L, .L18 + NOP + .align 3 + +.L16: + MADD1 c11, c11, a1, b1 + LD a2, 1 * SIZE(AO) + MADD3 c21, c21, a1, b2 + NOP + MADD1 c31, c31, a1, b3 + NOP + MADD3 c41, c41, a1, b4 + NOP + + MADD2 c12, c12, a2, b1 + LD b1, 8 * SIZE(BO) + MADD4 c22, c22, a2, b2 + LD b2, 5 * SIZE(BO) + MADD2 c32, c32, a2, b3 + LD b3, 6 * SIZE(BO) + MADD4 c42, c42, a2, b4 + LD b4, 7 * SIZE(BO) + + MADD1 c51, c51, a1, b5 + daddiu L, L, -1 + MADD3 c61, c61, a1, b2 + daddiu AO, AO, 2 * SIZE + MADD1 c71, c71, a1, b3 + daddiu BO, BO, 8 * SIZE + MADD3 c81, c81, a1, b4 + LD a1, 0 * SIZE(AO) + + MADD2 c52, c52, a2, b5 + LD b5, 4 * SIZE(BO) + MADD4 c62, c62, a2, b2 + LD b2, 1 * SIZE(BO) + MADD2 c72, c72, a2, b3 + LD b3, 2 * SIZE(BO) + MADD4 c82, c82, a2, b4 + bgtz L, .L16 + LD b4, 3 * SIZE(BO) + +.L18: + ADD c11, c11, c22 + ADD c12, c12, c21 + ADD c31, c31, c42 + ADD c32, c32, c41 + + ADD c51, c51, c62 + ADD c52, c52, c61 + ADD c71, c71, c82 + ADD c72, c72, c81 + +#if defined(LN) || defined(RT) +#ifdef LN + daddiu TEMP, KK, -1 +#else + daddiu TEMP, KK, -4 +#endif + + dsll L, TEMP, ZBASE_SHIFT + dsll TEMP, TEMP, 2 + ZBASE_SHIFT + daddu AO, AORIG, L + daddu BO, B, TEMP +#endif + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + SUB 
c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 + SUB c51, b5, c51 + SUB c52, b6, c52 + SUB c71, b7, c71 + SUB c72, b8, c72 + +#else + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + LD b3, 2 * SIZE(AO) + LD b4, 3 * SIZE(AO) + LD b5, 4 * SIZE(AO) + LD b6, 5 * SIZE(AO) + LD b7, 6 * SIZE(AO) + LD b8, 7 * SIZE(AO) + + SUB c11, b1, c11 + SUB c12, b2, c12 + SUB c31, b3, c31 + SUB c32, b4, c32 + SUB c51, b5, c51 + SUB c52, b6, c52 + SUB c71, b7, c71 + SUB c72, b8, c72 +#endif + +#if defined(LN) || defined(LT) + LD b1, 0 * SIZE(AO) + LD b2, 1 * SIZE(AO) + + MUL a1, b2, c12 + MUL a2, b2, c11 + MUL a3, b2, c32 + MUL a4, b2, c31 + + MADD5 c11, a1, b1, c11 + MADD6 c12, a2, b1, c12 + MADD5 c31, a3, b1, c31 + MADD6 c32, a4, b1, c32 + + MUL a1, b2, c52 + MUL a2, b2, c51 + MUL a3, b2, c72 + MUL a4, b2, c71 + + MADD5 c51, a1, b1, c51 + MADD6 c52, a2, b1, c52 + MADD5 c71, a3, b1, c71 + MADD6 c72, a4, b1, c72 +#endif + +#ifdef RN + LD b1, 0 * SIZE(BO) + LD b2, 1 * SIZE(BO) + LD b3, 2 * SIZE(BO) + LD b4, 3 * SIZE(BO) + LD b5, 4 * SIZE(BO) + LD b6, 5 * SIZE(BO) + LD b7, 6 * SIZE(BO) + LD b8, 7 * SIZE(BO) + + MUL a1, b2, c12 + MUL a2, b2, c11 + + MADD5 c11, a1, b1, c11 + MADD6 c12, a2, b1, c12 + + NMSUB c31, c31, b3, c11 + MADD7 c32, c32, b4, c11 + NMSUB c51, c51, b5, c11 + MADD7 c52, c52, b6, c11 + NMSUB c71, c71, b7, c11 + MADD7 c72, c72, b8, c11 + + MADD8 c31, c31, b4, c12 + NMSUB c32, c32, b3, c12 + MADD8 c51, c51, b6, c12 + NMSUB c52, c52, b5, c12 + MADD8 c71, c71, b8, c12 + NMSUB c72, c72, b7, c12 + + LD b3, 10 * SIZE(BO) + LD b4, 11 * SIZE(BO) + LD b5, 12 * SIZE(BO) + LD b6, 13 * SIZE(BO) + LD b7, 14 * SIZE(BO) + LD b8, 15 * SIZE(BO) + + MUL a1, b4, c32 + MUL a2, b4, c31 + + MADD5 c31, a1, b3, c31 + MADD6 c32, a2, b3, c32 + + NMSUB c51, c51, b5, c31 + MADD7 c52, c52, b6, c31 + NMSUB c71, c71, b7, c31 + MADD7 c72, c72, b8, c31 + + MADD8 c51, c51, b6, c32 + NMSUB c52, c52, b5, c32 + MADD8 c71, c71, b8, c32 + NMSUB c72, c72, b7, c32 + + LD b5, 20 * SIZE(BO) + LD b6, 21 * SIZE(BO) + LD b7, 22 * SIZE(BO) + LD b8, 23 * SIZE(BO) + + MUL a1, b6, c52 + MUL a2, b6, c51 + + MADD5 c51, a1, b5, c51 + MADD6 c52, a2, b5, c52 + + NMSUB c71, c71, b7, c51 + MADD7 c72, c72, b8, c51 + + MADD8 c71, c71, b8, c52 + NMSUB c72, c72, b7, c52 + + LD b7, 30 * SIZE(BO) + LD b8, 31 * SIZE(BO) + + MUL a1, b8, c72 + MUL a2, b8, c71 + + MADD5 c71, a1, b7, c71 + MADD6 c72, a2, b7, c72 +#endif + +#ifdef RT + LD b1, 30 * SIZE(BO) + LD b2, 31 * SIZE(BO) + LD b3, 28 * SIZE(BO) + LD b4, 29 * SIZE(BO) + LD b5, 26 * SIZE(BO) + LD b6, 27 * SIZE(BO) + LD b7, 24 * SIZE(BO) + LD b8, 25 * SIZE(BO) + + MUL a1, b2, c72 + MUL a2, b2, c71 + + MADD5 c71, a1, b1, c71 + MADD6 c72, a2, b1, c72 + + NMSUB c51, c51, b3, c71 + MADD7 c52, c52, b4, c71 + NMSUB c31, c31, b5, c71 + MADD7 c32, c32, b6, c71 + NMSUB c11, c11, b7, c71 + MADD7 c12, c12, b8, c71 + + MADD8 c51, c51, b4, c72 + NMSUB c52, c52, b3, c72 + MADD8 c31, c31, b6, c72 + NMSUB c32, c32, b5, c72 + MADD8 c11, c11, b8, c72 + NMSUB c12, c12, b7, c72 + + LD b3, 20 * SIZE(BO) + LD b4, 21 * SIZE(BO) + LD b5, 18 * SIZE(BO) + LD b6, 19 * SIZE(BO) + LD b7, 16 * SIZE(BO) + LD b8, 17 * SIZE(BO) + + MUL a1, b4, c52 + MUL a2, b4, c51 + + MADD5 c51, a1, b3, c51 + MADD6 c52, a2, b3, c52 + + NMSUB c31, c31, b5, c51 + MADD7 c32, c32, b6, c51 + NMSUB c11, c11, b7, c51 + MADD7 c12, c12, b8, c51 + + MADD8 c31, c31, b6, c52 + NMSUB c32, c32, b5, c52 + MADD8 c11, c11, b8, c52 + NMSUB c12, c12, b7, c52 + + LD b5, 10 * SIZE(BO) + LD b6, 11 * SIZE(BO) + LD b7, 8 * SIZE(BO) + LD b8, 9 * SIZE(BO) + + MUL a1, 
b6, c32 + MUL a2, b6, c31 + + MADD5 c31, a1, b5, c31 + MADD6 c32, a2, b5, c32 + + NMSUB c11, c11, b7, c31 + MADD7 c12, c12, b8, c31 + + MADD8 c11, c11, b8, c32 + NMSUB c12, c12, b7, c32 + + LD b7, 0 * SIZE(BO) + LD b8, 1 * SIZE(BO) + + MUL a1, b8, c12 + MUL a2, b8, c11 + + MADD5 c11, a1, b7, c11 + MADD6 c12, a2, b7, c12 +#endif + +#if defined(LN) || defined(LT) + ST c11, 0 * SIZE(BO) + ST c12, 1 * SIZE(BO) + ST c31, 2 * SIZE(BO) + ST c32, 3 * SIZE(BO) + ST c51, 4 * SIZE(BO) + ST c52, 5 * SIZE(BO) + ST c71, 6 * SIZE(BO) + ST c72, 7 * SIZE(BO) +#else + ST c11, 0 * SIZE(AO) + ST c12, 1 * SIZE(AO) + ST c31, 2 * SIZE(AO) + ST c32, 3 * SIZE(AO) + ST c51, 4 * SIZE(AO) + ST c52, 5 * SIZE(AO) + ST c71, 6 * SIZE(AO) + ST c72, 7 * SIZE(AO) +#endif + +#ifdef LN + daddiu CO1,CO1, -2 * SIZE + daddiu CO2,CO2, -2 * SIZE + daddiu CO3,CO3, -2 * SIZE + daddiu CO4,CO4, -2 * SIZE +#endif + + ST c11, 0 * SIZE(CO1) + ST c12, 1 * SIZE(CO1) + ST c31, 0 * SIZE(CO2) + ST c32, 1 * SIZE(CO2) + ST c51, 0 * SIZE(CO3) + ST c52, 1 * SIZE(CO3) + ST c71, 0 * SIZE(CO4) + ST c72, 1 * SIZE(CO4) + +#ifndef LN + daddiu CO1,CO1, 2 * SIZE + daddiu CO2,CO2, 2 * SIZE + daddiu CO3,CO3, 2 * SIZE + daddiu CO4,CO4, 2 * SIZE +#endif + + +#ifdef RT + dsll TEMP, K, ZBASE_SHIFT + daddu AORIG, AORIG, TEMP +#endif + +#if defined(LT) || defined(RN) + dsubu TEMP, K, KK + dsll L, TEMP, ZBASE_SHIFT + dsll TEMP, TEMP, 2 + ZBASE_SHIFT + daddu AO, AO, L + daddu BO, BO, TEMP +#endif + +#ifdef LT + daddiu KK, KK, 1 +#endif + +#ifdef LN + daddiu KK, KK, -1 +#endif + + MTC $0, c11 + + daddiu I, I, -1 + + + MOV c21, c11 + MOV c31, c11 + MOV c41, c11 + MOV c51, c11 + + bgtz I, .L11 + MOV c61, c11 + .align 3 + +.L19: +#ifdef LN + dsll TEMP, K, 2 + ZBASE_SHIFT + daddu B, B, TEMP +#endif + +#if defined(LT) || defined(RN) + move B, BO +#endif + +#ifdef RN + daddiu KK, KK, 4 +#endif + +#ifdef RT + daddiu KK, KK, -4 +#endif + + bgtz J, .L10 + NOP + .align 3 + +.L999: + LDARG $16, 0($sp) + LDARG $17, 8($sp) + LDARG $18, 16($sp) + LDARG $19, 24($sp) + LDARG $20, 32($sp) + LDARG $21, 40($sp) + + ldc1 $f24, 48($sp) + ldc1 $f25, 56($sp) + ldc1 $f26, 64($sp) + ldc1 $f27, 72($sp) + +#ifndef __64BIT__ + ldc1 $f20, 88($sp) + ldc1 $f21, 96($sp) + ldc1 $f22,104($sp) + ldc1 $f23,112($sp) +#endif + + j $31 + daddiu $sp, $sp, 128 + + EPILOGUE diff --git a/kernel/power/KERNEL b/kernel/power/KERNEL new file mode 100644 index 0000000000..cb9ed848b7 --- /dev/null +++ b/kernel/power/KERNEL @@ -0,0 +1,86 @@ +SGEMM_BETA = gemm_beta.S +DGEMM_BETA = gemm_beta.S +CGEMM_BETA = zgemm_beta.S +ZGEMM_BETA = zgemm_beta.S + + +ifndef SSYMV_U_KERNEL +SSYMV_U_KERNEL = symv_U.S +endif + +ifndef SSYMV_L_KERNEL +SSYMV_L_KERNEL = symv_L.S +endif + +ifndef DSYMV_U_KERNEL +DSYMV_U_KERNEL = symv_U.S +endif + +ifndef DSYMV_L_KERNEL +DSYMV_L_KERNEL = symv_L.S +endif + +ifndef CSYMV_U_KERNEL +CSYMV_U_KERNEL = zsymv_U.S +endif + +ifndef CSYMV_L_KERNEL +CSYMV_L_KERNEL = zsymv_L.S +endif + +ifndef ZSYMV_U_KERNEL +ZSYMV_U_KERNEL = zsymv_U.S +endif + +ifndef ZSYMV_L_KERNEL +ZSYMV_L_KERNEL = zsymv_L.S +endif + +ifndef CHEMV_U_KERNEL +CHEMV_U_KERNEL = zsymv_U.S +endif + +ifndef CHEMV_L_KERNEL +CHEMV_L_KERNEL = zsymv_L.S +endif + +ifndef ZHEMV_U_KERNEL +ZHEMV_U_KERNEL = zsymv_U.S +endif + +ifndef ZHEMV_L_KERNEL +ZHEMV_L_KERNEL = zsymv_L.S +endif + +ifndef STRSMKERNEL_LN +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +endif + +ifndef STRSMKERNEL_LT +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +endif + +ifndef STRSMKERNEL_RN +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +endif + +ifndef STRSMKERNEL_RT 
+STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif + +ifndef CTRSMKERNEL_LN +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +endif + +ifndef CTRSMKERNEL_LT +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +endif + +ifndef CTRSMKERNEL_RN +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +endif + +ifndef CTRSMKERNEL_RT +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif + diff --git a/kernel/power/KERNEL.CELL b/kernel/power/KERNEL.CELL new file mode 100644 index 0000000000..745e16e895 --- /dev/null +++ b/kernel/power/KERNEL.CELL @@ -0,0 +1,76 @@ +SAMAXKERNEL = amax_cell.S +DAMAXKERNEL = amax_cell.S +CAMAXKERNEL = zamax_cell.S +ZAMAXKERNEL = zamax_cell.S + +SAMINKERNEL = amin_cell.S +DAMINKERNEL = amin_cell.S +CAMINKERNEL = zamin_cell.S +ZAMINKERNEL = zamin_cell.S + +SASUMKERNEL = asum_cell.S +DASUMKERNEL = asum_cell.S +CASUMKERNEL = zasum_cell.S +ZASUMKERNEL = zasum_cell.S + +SDOTKERNEL = dot_cell.S +DDOTKERNEL = dot_cell.S +CDOTKERNEL = zdot_cell.S +ZDOTKERNEL = zdot_cell.S + +SGEMMKERNEL = gemm_kernel_altivec_cell.S +SGEMMINCOPY = ../generic/gemm_ncopy_16.c +SGEMMITCOPY = ../generic/gemm_tcopy_16.c +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_cell.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = gemm_ncopy_4.S +DGEMMOTCOPY = gemm_tcopy_4.S +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_altivec_cell.S +CGEMMINCOPY = ../generic/zgemm_ncopy_8.c +CGEMMITCOPY = ../generic/zgemm_tcopy_8.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_cell.S +ZGEMMINCOPY = +ZGEMMITCOPY = +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMINCOPYOBJ = +ZGEMMITCOPYOBJ = +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +#STRSMKERNEL_LN = trsm_kernel_LN.S +#STRSMKERNEL_LT = trsm_kernel_LT.S +#STRSMKERNEL_RN = trsm_kernel_LT.S +#STRSMKERNEL_RT = trsm_kernel_RT.S + +DTRSMKERNEL_LN = trsm_kernel_cell_LN.S +DTRSMKERNEL_LT = trsm_kernel_cell_LT.S +DTRSMKERNEL_RN = trsm_kernel_cell_LT.S +DTRSMKERNEL_RT = trsm_kernel_cell_RT.S + +#CTRSMKERNEL_LN = ztrsm_kernel_LN.S +#CTRSMKERNEL_LT = ztrsm_kernel_LT.S +#CTRSMKERNEL_RN = ztrsm_kernel_LT.S +#CTRSMKERNEL_RT = ztrsm_kernel_RT.S + +ZTRSMKERNEL_LN = ztrsm_kernel_cell_LN.S +ZTRSMKERNEL_LT = ztrsm_kernel_cell_LT.S +ZTRSMKERNEL_RN = ztrsm_kernel_cell_LT.S +ZTRSMKERNEL_RT = ztrsm_kernel_cell_RT.S diff --git a/kernel/power/KERNEL.POWER3 b/kernel/power/KERNEL.POWER3 new file mode 100644 index 0000000000..188eab8d30 --- /dev/null +++ b/kernel/power/KERNEL.POWER3 @@ -0,0 +1,2 @@ +include $(KERNELDIR)/KERNEL.POWER5 + diff --git a/kernel/power/KERNEL.POWER4 b/kernel/power/KERNEL.POWER4 new file mode 100644 index 0000000000..932dbe54b1 --- /dev/null +++ b/kernel/power/KERNEL.POWER4 @@ -0,0 +1 @@ +include $(KERNELDIR)/KERNEL.POWER5 diff --git a/kernel/power/KERNEL.POWER5 b/kernel/power/KERNEL.POWER5 new file mode 100644 index 0000000000..af0960d1f7 --- /dev/null 
+++ b/kernel/power/KERNEL.POWER5 @@ -0,0 +1,56 @@ +SGEMMKERNEL = gemm_kernel.S +SGEMMINCOPY = +SGEMMITCOPY = +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel.S +CGEMMINCOPY = +CGEMMITCOPY = +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMINCOPYOBJ = +CGEMMITCOPYOBJ = +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel.S +ZGEMMINCOPY = +ZGEMMITCOPY = +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMINCOPYOBJ = +ZGEMMITCOPYOBJ = +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN.S +STRSMKERNEL_LT = trsm_kernel_LT.S +STRSMKERNEL_RN = trsm_kernel_LT.S +STRSMKERNEL_RT = trsm_kernel_RT.S + +DTRSMKERNEL_LN = trsm_kernel_LN.S +DTRSMKERNEL_LT = trsm_kernel_LT.S +DTRSMKERNEL_RN = trsm_kernel_LT.S +DTRSMKERNEL_RT = trsm_kernel_RT.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN.S +CTRSMKERNEL_LT = ztrsm_kernel_LT.S +CTRSMKERNEL_RN = ztrsm_kernel_LT.S +CTRSMKERNEL_RT = ztrsm_kernel_RT.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LN.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT.S diff --git a/kernel/power/KERNEL.POWER6 b/kernel/power/KERNEL.POWER6 new file mode 100644 index 0000000000..ef5f744722 --- /dev/null +++ b/kernel/power/KERNEL.POWER6 @@ -0,0 +1,56 @@ +SGEMMKERNEL = gemm_kernel_power6.S +SGEMMINCOPY = +SGEMMITCOPY = +SGEMMONCOPY = gemm_ncopy_4.S +SGEMMOTCOPY = gemm_tcopy_4.S +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_power6.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = gemm_ncopy_4.S +DGEMMOTCOPY = gemm_tcopy_4.S +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_power6.S +CGEMMINCOPY = ../generic/zgemm_ncopy_2.c +CGEMMITCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_4.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_power6.S +ZGEMMINCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMITCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_power6_LN.S +STRSMKERNEL_LT = trsm_kernel_power6_LT.S +STRSMKERNEL_RN = trsm_kernel_power6_LT.S +STRSMKERNEL_RT = trsm_kernel_power6_RT.S + +DTRSMKERNEL_LN = trsm_kernel_power6_LN.S +DTRSMKERNEL_LT = trsm_kernel_power6_LT.S +DTRSMKERNEL_RN = 
trsm_kernel_power6_LT.S +DTRSMKERNEL_RT = trsm_kernel_power6_RT.S + +CTRSMKERNEL_LN = ztrsm_kernel_power6_LN.S +CTRSMKERNEL_LT = ztrsm_kernel_power6_LT.S +CTRSMKERNEL_RN = ztrsm_kernel_power6_LT.S +CTRSMKERNEL_RT = ztrsm_kernel_power6_RT.S + +ZTRSMKERNEL_LN = ztrsm_kernel_power6_LN.S +ZTRSMKERNEL_LT = ztrsm_kernel_power6_LT.S +ZTRSMKERNEL_RN = ztrsm_kernel_power6_LT.S +ZTRSMKERNEL_RT = ztrsm_kernel_power6_RT.S diff --git a/kernel/power/KERNEL.PPC440 b/kernel/power/KERNEL.PPC440 new file mode 100644 index 0000000000..5e2a7f9e45 --- /dev/null +++ b/kernel/power/KERNEL.PPC440 @@ -0,0 +1,118 @@ +SAMAXKERNEL = amax_ppc440.S +DAMAXKERNEL = amax_ppc440.S +CAMAXKERNEL = zamax_ppc440.S +ZAMAXKERNEL = zamax_ppc440.S + +SAMINKERNEL = amin_ppc440.S +DAMINKERNEL = amin_ppc440.S +CAMINKERNEL = zamin_ppc440.S +ZAMINKERNEL = zamin_ppc440.S + +SASUMKERNEL = asum_ppc440.S +DASUMKERNEL = asum_ppc440.S +CASUMKERNEL = zasum_ppc440.S +ZASUMKERNEL = zasum_ppc440.S + +SAXPYKERNEL = axpy_ppc440.S +DAXPYKERNEL = axpy_ppc440.S +CAXPYKERNEL = zaxpy_ppc440.S +ZAXPYKERNEL = zaxpy_ppc440.S + +SDOTKERNEL = dot_ppc440.S +DDOTKERNEL = dot_ppc440.S +CDOTKERNEL = zdot_ppc440.S +ZDOTKERNEL = zdot_ppc440.S + +ISAMAXKERNEL = iamax_ppc440.S +IDAMAXKERNEL = iamax_ppc440.S +ICAMAXKERNEL = izamax_ppc440.S +IZAMAXKERNEL = izamax_ppc440.S + +ISAMINKERNEL = iamin_ppc440.S +IDAMINKERNEL = iamin_ppc440.S +ICAMINKERNEL = izamin_ppc440.S +IZAMINKERNEL = izamin_ppc440.S + +ISMAXKERNEL = imax_ppc440.S +IDMAXKERNEL = imax_ppc440.S + +ISMINKERNEL = imin_ppc440.S +IDMINKERNEL = imin_ppc440.S + +SMAXKERNEL = max_ppc440.S +DMAXKERNEL = max_ppc440.S + +SMINKERNEL = min_ppc440.S +DMINKERNEL = min_ppc440.S + +SNRM2KERNEL = snrm2_ppc440.S +DNRM2KERNEL = dnrm2_ppc440.S +CNRM2KERNEL = cnrm2_ppc440.S +ZNRM2KERNEL = znrm2_ppc440.S + +SROTKERNEL = rot_ppc440.S +DROTKERNEL = rot_ppc440.S +CROTKERNEL = zrot_ppc440.S +ZROTKERNEL = zrot_ppc440.S + +SSCALKERNEL = scal_ppc440.S +DSCALKERNEL = scal_ppc440.S +CSCALKERNEL = zscal_ppc440.S +ZSCALKERNEL = zscal_ppc440.S + +SGEMMKERNEL = gemm_kernel_ppc440.S +SGEMMINCOPY = +SGEMMITCOPY = +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_ppc440.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_ppc440.S +CGEMMINCOPY = +CGEMMITCOPY = +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMINCOPYOBJ = +CGEMMITCOPYOBJ = +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_ppc440.S +ZGEMMINCOPY = +ZGEMMITCOPY = +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMINCOPYOBJ = +ZGEMMITCOPYOBJ = +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_ppc440_LN.S +STRSMKERNEL_LT = trsm_kernel_ppc440_LT.S +STRSMKERNEL_RN = trsm_kernel_ppc440_LT.S +STRSMKERNEL_RT = trsm_kernel_ppc440_RT.S + +DTRSMKERNEL_LN = trsm_kernel_ppc440_LN.S +DTRSMKERNEL_LT = trsm_kernel_ppc440_LT.S +DTRSMKERNEL_RN = trsm_kernel_ppc440_LT.S +DTRSMKERNEL_RT = trsm_kernel_ppc440_RT.S + +CTRSMKERNEL_LN = 
ztrsm_kernel_ppc440_LN.S +CTRSMKERNEL_LT = ztrsm_kernel_ppc440_LT.S +CTRSMKERNEL_RN = ztrsm_kernel_ppc440_LT.S +CTRSMKERNEL_RT = ztrsm_kernel_ppc440_RT.S + +ZTRSMKERNEL_LN = ztrsm_kernel_ppc440_LN.S +ZTRSMKERNEL_LT = ztrsm_kernel_ppc440_LT.S +ZTRSMKERNEL_RN = ztrsm_kernel_ppc440_LT.S +ZTRSMKERNEL_RT = ztrsm_kernel_ppc440_RT.S diff --git a/kernel/power/KERNEL.PPC440FP2 b/kernel/power/KERNEL.PPC440FP2 new file mode 100644 index 0000000000..3359385b6c --- /dev/null +++ b/kernel/power/KERNEL.PPC440FP2 @@ -0,0 +1,128 @@ +SAMAXKERNEL = amax_hummer.S +DAMAXKERNEL = amax_hummer.S +CAMAXKERNEL = zamax_hummer.S +ZAMAXKERNEL = zamax_hummer.S + +SAMINKERNEL = amin_hummer.S +DAMINKERNEL = amin_hummer.S +CAMINKERNEL = zamin_hummer.S +ZAMINKERNEL = zamin_hummer.S + +SASUMKERNEL = asum_hummer.S +DASUMKERNEL = asum_hummer.S +CASUMKERNEL = zasum_hummer.S +ZASUMKERNEL = zasum_hummer.S + +SAXPYKERNEL = axpy_hummer.S +DAXPYKERNEL = axpy_hummer.S +CAXPYKERNEL = zaxpy_hummer.S +ZAXPYKERNEL = zaxpy_hummer.S + +SCOPYKERNEL = copy_hummer.S +DCOPYKERNEL = copy_hummer.S +CCOPYKERNEL = zcopy_hummer.S +ZCOPYKERNEL = zcopy_hummer.S + +SDOTKERNEL = dot_hummer.S +DDOTKERNEL = dot_hummer.S +CDOTKERNEL = zdot_hummer.S +ZDOTKERNEL = zdot_hummer.S + +ISAMAXKERNEL = iamax_hummer.S +IDAMAXKERNEL = iamax_hummer.S +ICAMAXKERNEL = izamax_hummer.S +IZAMAXKERNEL = izamax_hummer.S + +ISAMINKERNEL = iamin_hummer.S +IDAMINKERNEL = iamin_hummer.S +ICAMINKERNEL = izamin_hummer.S +IZAMINKERNEL = izamin_hummer.S + +ISMAXKERNEL = imax_hummer.S +IDMAXKERNEL = imax_hummer.S + +ISMINKERNEL = imin_hummer.S +IDMINKERNEL = imin_hummer.S + +SMAXKERNEL = max_hummer.S +DMAXKERNEL = max_hummer.S + +SMINKERNEL = min_hummer.S +DMINKERNEL = min_hummer.S + +SNRM2KERNEL = snrm2_hummer.S +DNRM2KERNEL = dnrm2_hummer.S +CNRM2KERNEL = cnrm2_hummer.S +ZNRM2KERNEL = znrm2_hummer.S + +SROTKERNEL = rot_ppc440.S +DROTKERNEL = rot_ppc440.S +CROTKERNEL = zrot_ppc440.S +ZROTKERNEL = zrot_ppc440.S + +SSCALKERNEL = scal_hummer.S +DSCALKERNEL = scal_hummer.S +CSCALKERNEL = zscal_hummer.S +ZSCALKERNEL = zscal_hummer.S + +SSWAPKERNEL = swap_hummer.S +DSWAPKERNEL = swap_hummer.S +CSWAPKERNEL = zswap_hummer.S +ZSWAPKERNEL = zswap_hummer.S + +SGEMMKERNEL = gemm_kernel_hummer.S +SGEMMINCOPY = gemm_ncopy_hummer_8.S +SGEMMITCOPY = gemm_tcopy_hummer_8.S +SGEMMONCOPY = gemm_ncopy_hummer_4.S +SGEMMOTCOPY = gemm_tcopy_hummer_4.S +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_hummer.S +DGEMMINCOPY = gemm_ncopy_hummer_8.S +DGEMMITCOPY = gemm_tcopy_hummer_8.S +DGEMMONCOPY = gemm_ncopy_hummer_4.S +DGEMMOTCOPY = gemm_tcopy_hummer_4.S +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_hummer.S +CGEMMINCOPY = zgemm_ncopy_hummer_4.S +CGEMMITCOPY = zgemm_tcopy_hummer_4.S +CGEMMONCOPY = zgemm_ncopy_hummer_2.S +CGEMMOTCOPY = zgemm_tcopy_hummer_2.S +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_hummer.S +ZGEMMINCOPY = zgemm_ncopy_hummer_4.S +ZGEMMITCOPY = zgemm_tcopy_hummer_4.S +ZGEMMONCOPY = zgemm_ncopy_hummer_2.S +ZGEMMOTCOPY = zgemm_tcopy_hummer_2.S 
+ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_hummer_LN.S +STRSMKERNEL_LT = trsm_kernel_hummer_LT.S +STRSMKERNEL_RN = trsm_kernel_hummer_LT.S +STRSMKERNEL_RT = trsm_kernel_hummer_RT.S + +DTRSMKERNEL_LN = trsm_kernel_hummer_LN.S +DTRSMKERNEL_LT = trsm_kernel_hummer_LT.S +DTRSMKERNEL_RN = trsm_kernel_hummer_LT.S +DTRSMKERNEL_RT = trsm_kernel_hummer_RT.S + +CTRSMKERNEL_LN = ztrsm_kernel_hummer_LN.S +CTRSMKERNEL_LT = ztrsm_kernel_hummer_LT.S +CTRSMKERNEL_RN = ztrsm_kernel_hummer_LT.S +CTRSMKERNEL_RT = ztrsm_kernel_hummer_RT.S + +ZTRSMKERNEL_LN = ztrsm_kernel_hummer_LN.S +ZTRSMKERNEL_LT = ztrsm_kernel_hummer_LT.S +ZTRSMKERNEL_RN = ztrsm_kernel_hummer_LT.S +ZTRSMKERNEL_RT = ztrsm_kernel_hummer_RT.S diff --git a/kernel/power/KERNEL.PPC970 b/kernel/power/KERNEL.PPC970 new file mode 100644 index 0000000000..bfa43b7e82 --- /dev/null +++ b/kernel/power/KERNEL.PPC970 @@ -0,0 +1,56 @@ +SGEMMKERNEL = gemm_kernel_altivec.S +SGEMMINCOPY = ../generic/gemm_ncopy_16.c +SGEMMITCOPY = ../generic/gemm_tcopy_16.c +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = gemm_ncopy_4.S +DGEMMOTCOPY = gemm_tcopy_4.S +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_altivec.S +CGEMMINCOPY = ../generic/zgemm_ncopy_8.c +CGEMMITCOPY = ../generic/zgemm_tcopy_8.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel.S +ZGEMMINCOPY = +ZGEMMITCOPY = +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMINCOPYOBJ = +ZGEMMITCOPYOBJ = +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +#STRSMKERNEL_LN = trsm_kernel_LN.S +#STRSMKERNEL_LT = trsm_kernel_LT.S +#STRSMKERNEL_RN = trsm_kernel_LT.S +#STRSMKERNEL_RT = trsm_kernel_RT.S + +DTRSMKERNEL_LN = trsm_kernel_LN.S +DTRSMKERNEL_LT = trsm_kernel_LT.S +DTRSMKERNEL_RN = trsm_kernel_LT.S +DTRSMKERNEL_RT = trsm_kernel_RT.S + +#CTRSMKERNEL_LN = ztrsm_kernel_LN.S +#CTRSMKERNEL_LT = ztrsm_kernel_LT.S +#CTRSMKERNEL_RN = ztrsm_kernel_LT.S +#CTRSMKERNEL_RT = ztrsm_kernel_RT.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LN.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT.S diff --git a/kernel/power/KERNEL.PPCG4 b/kernel/power/KERNEL.PPCG4 new file mode 100644 index 0000000000..c41df975a6 --- /dev/null +++ b/kernel/power/KERNEL.PPCG4 @@ -0,0 +1,118 @@ +SAMAXKERNEL = amax_ppc440.S +DAMAXKERNEL = amax_ppc440.S +CAMAXKERNEL = zamax_ppc440.S +ZAMAXKERNEL = zamax_ppc440.S + +SAMINKERNEL = amin_ppc440.S +DAMINKERNEL = amin_ppc440.S +CAMINKERNEL = zamin_ppc440.S +ZAMINKERNEL = zamin_ppc440.S + +SASUMKERNEL = asum_ppc440.S +DASUMKERNEL = asum_ppc440.S +CASUMKERNEL = zasum_ppc440.S +ZASUMKERNEL = zasum_ppc440.S 
+ +SAXPYKERNEL = axpy_ppc440.S +DAXPYKERNEL = axpy_ppc440.S +CAXPYKERNEL = zaxpy_ppc440.S +ZAXPYKERNEL = zaxpy_ppc440.S + +SDOTKERNEL = dot_ppc440.S +DDOTKERNEL = dot_ppc440.S +CDOTKERNEL = zdot_ppc440.S +ZDOTKERNEL = zdot_ppc440.S + +ISAMAXKERNEL = iamax_ppc440.S +IDAMAXKERNEL = iamax_ppc440.S +ICAMAXKERNEL = izamax_ppc440.S +IZAMAXKERNEL = izamax_ppc440.S + +ISAMINKERNEL = iamin_ppc440.S +IDAMINKERNEL = iamin_ppc440.S +ICAMINKERNEL = izamin_ppc440.S +IZAMINKERNEL = izamin_ppc440.S + +ISMAXKERNEL = imax_ppc440.S +IDMAXKERNEL = imax_ppc440.S + +ISMINKERNEL = imin_ppc440.S +IDMINKERNEL = imin_ppc440.S + +SMAXKERNEL = max_ppc440.S +DMAXKERNEL = max_ppc440.S + +SMINKERNEL = min_ppc440.S +DMINKERNEL = min_ppc440.S + +SNRM2KERNEL = snrm2_ppc440.S +DNRM2KERNEL = dnrm2_ppc440.S +CNRM2KERNEL = cnrm2_ppc440.S +ZNRM2KERNEL = znrm2_ppc440.S + +SROTKERNEL = rot_ppc440.S +DROTKERNEL = rot_ppc440.S +CROTKERNEL = zrot_ppc440.S +ZROTKERNEL = zrot_ppc440.S + +SSCALKERNEL = scal_ppc440.S +DSCALKERNEL = scal_ppc440.S +CSCALKERNEL = zscal_ppc440.S +ZSCALKERNEL = zscal_ppc440.S + +SGEMMKERNEL = gemm_kernel_altivec_g4.S +SGEMMINCOPY = ../generic/gemm_ncopy_16.c +SGEMMITCOPY = ../generic/gemm_tcopy_16.c +SGEMMONCOPY = gemm_ncopy_4.S +SGEMMOTCOPY = gemm_tcopy_4.S +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_g4.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = gemm_ncopy_4.S +DGEMMOTCOPY = gemm_tcopy_4.S +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_altivec_g4.S +CGEMMINCOPY = ../generic/zgemm_ncopy_8.c +CGEMMITCOPY = ../generic/zgemm_tcopy_8.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_g4.S +ZGEMMINCOPY = +ZGEMMITCOPY = +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMINCOPYOBJ = +ZGEMMITCOPYOBJ = +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +#STRSMKERNEL_LN = trsm_kernel_ppc440_LN.S +#STRSMKERNEL_LT = trsm_kernel_ppc440_LT.S +#STRSMKERNEL_RN = trsm_kernel_ppc440_LT.S +#STRSMKERNEL_RT = trsm_kernel_ppc440_RT.S + +DTRSMKERNEL_LN = trsm_kernel_ppc440_LN.S +DTRSMKERNEL_LT = trsm_kernel_ppc440_LT.S +DTRSMKERNEL_RN = trsm_kernel_ppc440_LT.S +DTRSMKERNEL_RT = trsm_kernel_ppc440_RT.S + +#CTRSMKERNEL_LN = ztrsm_kernel_ppc440_LN.S +#CTRSMKERNEL_LT = ztrsm_kernel_ppc440_LT.S +#CTRSMKERNEL_RN = ztrsm_kernel_ppc440_LT.S +#CTRSMKERNEL_RT = ztrsm_kernel_ppc440_RT.S + +ZTRSMKERNEL_LN = ztrsm_kernel_ppc440_LN.S +ZTRSMKERNEL_LT = ztrsm_kernel_ppc440_LT.S +ZTRSMKERNEL_RN = ztrsm_kernel_ppc440_LT.S +ZTRSMKERNEL_RT = ztrsm_kernel_ppc440_RT.S diff --git a/kernel/power/Makefile b/kernel/power/Makefile new file mode 100644 index 0000000000..520349bd69 --- /dev/null +++ b/kernel/power/Makefile @@ -0,0 +1 @@ +clean :: diff --git a/kernel/power/amax.S b/kernel/power/amax.S new file mode 100644 index 0000000000..7fbe39e7f6 --- /dev/null +++ b/kernel/power/amax.S @@ -0,0 +1,523 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 
The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define PREA r8 + +#define FZERO f1 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, BASE_SHIFT + + li PREA, L1_PREFETCHSIZE + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFD f1, 0 * SIZE(X) + add X, X, INCX + + fabs f0, f1 + fabs f2, f1 + fabs f3, f1 + fabs f4, f1 + fabs f5, f1 + fabs f6, f1 + fabs f7, f1 + fabs f1, f1 + + subi N, N, 1 + + cmpwi cr0, INCX, SIZE + bne- cr0, LL(100) + + srawi. 
r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(50) + + LFD f24, 0 * SIZE(X) + LFD f25, 1 * SIZE(X) + LFD f26, 2 * SIZE(X) + LFD f27, 3 * SIZE(X) + LFD f28, 4 * SIZE(X) + LFD f29, 5 * SIZE(X) + LFD f30, 6 * SIZE(X) + LFD f31, 7 * SIZE(X) + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFD f24, 8 * SIZE(X) + LFD f25, 9 * SIZE(X) + LFD f26, 10 * SIZE(X) + LFD f27, 11 * SIZE(X) + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFD f28, 12 * SIZE(X) + LFD f29, 13 * SIZE(X) + LFD f30, 14 * SIZE(X) + LFD f31, 15 * SIZE(X) + bdz LL(20) + .align 4 + +LL(10): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + LFD f24, 16 * SIZE(X) + LFD f25, 17 * SIZE(X) + LFD f26, 18 * SIZE(X) + LFD f27, 19 * SIZE(X) + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + LFD f28, 20 * SIZE(X) + LFD f29, 21 * SIZE(X) + LFD f30, 22 * SIZE(X) + LFD f31, 23 * SIZE(X) + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + LFD f24, 24 * SIZE(X) + LFD f25, 25 * SIZE(X) + LFD f26, 26 * SIZE(X) + LFD f27, 27 * SIZE(X) + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + LFD f28, 28 * SIZE(X) + LFD f29, 29 * SIZE(X) + LFD f30, 30 * SIZE(X) + LFD f31, 31 * SIZE(X) + +#ifndef POWER6 + L1_PREFETCH X, PREA +#endif + addi X, X, 16 * SIZE +#ifdef POWER6 + L1_PREFETCH X, PREA +#endif + + bdnz LL(10) + .align 4 + +LL(20): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + fsel f2, f18, f2, f10 + fsel f3, f19, f3, f11 + fsel f4, f20, f4, f12 + fsel f5, f21, f5, f13 + fsel f6, f22, f6, f14 + fsel f7, f23, f7, f15 + addi X, X, 16 * SIZE + .align 4 + +LL(50): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + addi X, X, 1 * SIZE + fabs f8, f8 + fsub f16, f1, f8 + fsel f1, f16, f1, f8 + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): + sub X, X, INCX + + srawi. 
r0, N, 4 + mtspr CTR, r0 + beq- LL(150) + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + bdz LL(120) + .align 4 + +LL(110): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + bdnz LL(110) + .align 4 + +LL(120): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + fsel f2, f18, f2, f10 + fsel f3, f19, f3, f11 + fsel f4, f20, f4, f12 + fsel f5, f21, f5, f13 + fsel f6, f22, f6, f14 + fsel f7, f23, f7, f15 + .align 4 + +LL(150): + andi. 
r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + fabs f8, f8 + fsub f16, f1, f8 + fsel f1, f16, f1, f8 + bdnz LL(160) + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + fsub f10, f4, f5 + fsub f11, f6, f7 + + fsel f0, f8, f0, f1 + fsel f2, f9, f2, f3 + fsel f4, f10, f4, f5 + fsel f6, f11, f6, f7 + + fsub f8, f0, f2 + fsub f9, f4, f6 + fsel f0, f8, f0, f2 + fsel f4, f9, f4, f6 + + fsub f8, f0, f4 + fsel f1, f8, f0, f4 + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/amax_cell.S b/kernel/power/amax_cell.S new file mode 100644 index 0000000000..3f25e75c76 --- /dev/null +++ b/kernel/power/amax_cell.S @@ -0,0 +1,691 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define PREA r8 + +#define FZERO f1 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, BASE_SHIFT + + li PREA, 10 * 16 * SIZE + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFD f1, 0 * SIZE(X) + add X, X, INCX + + fabs f0, f1 + fabs f2, f1 + fabs f3, f1 + fabs f4, f1 + fabs f5, f1 + fabs f6, f1 + fabs f7, f1 + fabs f1, f1 + + subi N, N, 1 + + cmpwi cr0, INCX, SIZE + bne- cr0, LL(20) + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(15) + + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + + bdz LL(13) + .align 4 + +LL(12): + fabs f8, f8 + LFD f10, 2 * SIZE(X) + fabs f9, f9 + LFD f11, 3 * SIZE(X) + fabs f10, f10 + LFD f12, 4 * SIZE(X) + fabs f11, f11 + LFD f13, 5 * SIZE(X) + fabs f12, f12 + LFD f14, 6 * SIZE(X) + fabs f13, f13 + LFD f15, 7 * SIZE(X) + + fabs f14, f14 + dcbt X, PREA + fabs f15, f15 + nop + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + fsel f2, f18, f2, f10 + fsel f3, f19, f3, f11 + fsel f4, f20, f4, f12 + fsel f5, f21, f5, f13 + + fsel f6, f22, f6, f14 + LFD f8, 8 * SIZE(X) + fsel f7, f23, f7, f15 + LFD f9, 9 * SIZE(X) + + fabs f8, f8 + LFD f10, 10 * SIZE(X) + fabs f9, f9 + LFD f11, 11 * SIZE(X) + fabs f10, f10 + LFD f12, 12 * SIZE(X) + fabs f11, f11 + LFD f13, 13 * SIZE(X) + fabs f12, f12 + LFD f14, 14 * SIZE(X) + fabs f13, f13 + LFD f15, 15 * SIZE(X) + + fabs f14, f14 + addi X, X, 16 * SIZE + fabs f15, f15 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + fsel f2, f18, f2, f10 + fsel f3, f19, f3, f11 + fsel f4, f20, f4, f12 + fsel f5, f21, f5, f13 + LFD f8, 0 * SIZE(X) + + fsel f6, f22, f6, f14 + LFD f9, 1 * SIZE(X) + fsel f7, f23, f7, f15 + bdnz LL(12) + .align 4 + +LL(13): + fabs f8, f8 + LFD f10, 2 * SIZE(X) + fabs f9, f9 + LFD f11, 3 * SIZE(X) + fabs f10, f10 + LFD f12, 4 * SIZE(X) + fabs f11, f11 + LFD f13, 5 * SIZE(X) + fabs f12, f12 + LFD f14, 6 * SIZE(X) + fabs f13, f13 + LFD f15, 7 * SIZE(X) + + fabs f14, f14 + fabs f15, f15 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + fsel f2, f18, f2, f10 + fsel f3, f19, f3, f11 + fsel f4, f20, f4, f12 + fsel f5, f21, f5, f13 + + fsel f6, f22, f6, f14 + LFD f8, 8 * SIZE(X) + fsel f7, f23, f7, f15 + LFD f9, 9 * SIZE(X) + + fabs f8, f8 + LFD f10, 10 * SIZE(X) + fabs f9, f9 + LFD f11, 11 * SIZE(X) + fabs f10, f10 + LFD f12, 12 * SIZE(X) + fabs f11, f11 + LFD f13, 13 * SIZE(X) + fabs f12, f12 + LFD f14, 14 * SIZE(X) + fabs f13, f13 + LFD f15, 15 * SIZE(X) 
+ + fabs f14, f14 + addi X, X, 16 * SIZE + fabs f15, f15 + nop + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + fsel f2, f18, f2, f10 + fsel f3, f19, f3, f11 + fsel f4, f20, f4, f12 + fsel f5, f21, f5, f13 + fsel f6, f22, f6, f14 + fsel f7, f23, f7, f15 + .align 4 + +LL(15): + andi. r0, N, 15 + beq LL(999) + + andi. r0, N, 8 + beq LL(16) + + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + + fabs f8, f8 + LFD f10, 2 * SIZE(X) + fabs f9, f9 + LFD f11, 3 * SIZE(X) + fabs f10, f10 + LFD f12, 4 * SIZE(X) + fabs f11, f11 + LFD f13, 5 * SIZE(X) + + fabs f12, f12 + LFD f14, 6 * SIZE(X) + fabs f13, f13 + LFD f15, 7 * SIZE(X) + fabs f14, f14 + fabs f15, f15 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + fsel f2, f18, f2, f10 + fsel f3, f19, f3, f11 + + fsel f4, f20, f4, f12 + fsel f5, f21, f5, f13 + fsel f6, f22, f6, f14 + nop + fsel f7, f23, f7, f15 + addi X, X, 8 * SIZE + .align 4 + +LL(16): + andi. r0, N, 4 + beq LL(17) + + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + + fabs f8, f8 + LFD f10, 2 * SIZE(X) + fabs f9, f9 + LFD f11, 3 * SIZE(X) + fabs f10, f10 + fabs f11, f11 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + fsel f2, f18, f2, f10 + nop + fsel f3, f19, f3, f11 + addi X, X, 4 * SIZE + .align 4 + +LL(17): + andi. r0, N, 2 + beq LL(18) + + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + + fabs f8, f8 + fabs f9, f9 + fsub f16, f0, f8 + fsub f17, f1, f9 + + fsel f0, f16, f0, f8 + nop + fsel f1, f17, f1, f9 + addi X, X, 2 * SIZE + .align 4 + +LL(18): + andi. r0, N, 1 + beq LL(999) + + LFD f8, 0 * SIZE(X) + fabs f8, f8 + fsub f16, f0, f8 + fsel f0, f16, f0, f8 + b LL(999) + .align 4 + +LL(20): + sub X, X, INCX + + srawi. 
r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(25) + + LFDUX f8, X, INCX + LFDUX f9, X, INCX + + bdz LL(23) + .align 4 + +LL(22): + fabs f8, f8 + LFDUX f10, X, INCX + fabs f9, f9 + LFDUX f11, X, INCX + fabs f10, f10 + LFDUX f12, X, INCX + fabs f11, f11 + LFDUX f13, X, INCX + fabs f12, f12 + LFDUX f14, X, INCX + fabs f13, f13 + LFDUX f15, X, INCX + + fabs f14, f14 + fabs f15, f15 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + fsel f2, f18, f2, f10 + fsel f3, f19, f3, f11 + fsel f4, f20, f4, f12 + fsel f5, f21, f5, f13 + + fsel f6, f22, f6, f14 + LFDUX f8, X, INCX + fsel f7, f23, f7, f15 + LFDUX f9, X, INCX + + fabs f8, f8 + LFDUX f10, X, INCX + fabs f9, f9 + LFDUX f11, X, INCX + fabs f10, f10 + LFDUX f12, X, INCX + fabs f11, f11 + LFDUX f13, X, INCX + fabs f12, f12 + LFDUX f14, X, INCX + fabs f13, f13 + LFDUX f15, X, INCX + + fabs f14, f14 + fabs f15, f15 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + fsel f2, f18, f2, f10 + fsel f3, f19, f3, f11 + fsel f4, f20, f4, f12 + fsel f5, f21, f5, f13 + LFDUX f8, X, INCX + + fsel f6, f22, f6, f14 + LFDUX f9, X, INCX + fsel f7, f23, f7, f15 + bdnz LL(22) + .align 4 + +LL(23): + fabs f8, f8 + LFDUX f10, X, INCX + fabs f9, f9 + LFDUX f11, X, INCX + fabs f10, f10 + LFDUX f12, X, INCX + fabs f11, f11 + LFDUX f13, X, INCX + fabs f12, f12 + LFDUX f14, X, INCX + fabs f13, f13 + LFDUX f15, X, INCX + + fabs f14, f14 + fabs f15, f15 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + fsel f2, f18, f2, f10 + fsel f3, f19, f3, f11 + fsel f4, f20, f4, f12 + fsel f5, f21, f5, f13 + + fsel f6, f22, f6, f14 + LFDUX f8, X, INCX + fsel f7, f23, f7, f15 + LFDUX f9, X, INCX + + fabs f8, f8 + LFDUX f10, X, INCX + fabs f9, f9 + LFDUX f11, X, INCX + fabs f10, f10 + LFDUX f12, X, INCX + fabs f11, f11 + LFDUX f13, X, INCX + fabs f12, f12 + LFDUX f14, X, INCX + fabs f13, f13 + LFDUX f15, X, INCX + + fabs f14, f14 + fabs f15, f15 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + fsel f2, f18, f2, f10 + fsel f3, f19, f3, f11 + fsel f4, f20, f4, f12 + fsel f5, f21, f5, f13 + fsel f6, f22, f6, f14 + fsel f7, f23, f7, f15 + .align 4 + +LL(25): + andi. r0, N, 15 + beq LL(999) + + andi. r0, N, 8 + beq LL(26) + + LFDUX f8, X, INCX + LFDUX f9, X, INCX + + fabs f8, f8 + LFDUX f10, X, INCX + fabs f9, f9 + LFDUX f11, X, INCX + fabs f10, f10 + LFDUX f12, X, INCX + fabs f11, f11 + LFDUX f13, X, INCX + + fabs f12, f12 + LFDUX f14, X, INCX + fabs f13, f13 + LFDUX f15, X, INCX + fabs f14, f14 + fabs f15, f15 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + fsel f2, f18, f2, f10 + fsel f3, f19, f3, f11 + + fsel f4, f20, f4, f12 + fsel f5, f21, f5, f13 + fsel f6, f22, f6, f14 + fsel f7, f23, f7, f15 + .align 4 + +LL(26): + andi. 
r0, N, 4 + beq LL(27) + + LFDUX f8, X, INCX + LFDUX f9, X, INCX + + fabs f8, f8 + LFDUX f10, X, INCX + fabs f9, f9 + LFDUX f11, X, INCX + fabs f10, f10 + fabs f11, f11 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + fsel f2, f18, f2, f10 + fsel f3, f19, f3, f11 + .align 4 + +LL(27): + andi. r0, N, 2 + beq LL(28) + + LFDUX f8, X, INCX + LFDUX f9, X, INCX + + fabs f8, f8 + fabs f9, f9 + fsub f16, f0, f8 + fsub f17, f1, f9 + + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + .align 4 + +LL(28): + andi. r0, N, 1 + beq LL(999) + + LFDUX f8, X, INCX + fabs f8, f8 + fsub f16, f0, f8 + fsel f0, f16, f0, f8 + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + fsub f10, f4, f5 + fsub f11, f6, f7 + + fsel f0, f8, f0, f1 + fsel f2, f9, f2, f3 + fsel f4, f10, f4, f5 + fsel f6, f11, f6, f7 + + fsub f8, f0, f2 + fsub f9, f4, f6 + fsel f0, f8, f0, f2 + fsel f4, f9, f4, f6 + + fsub f8, f0, f4 + fsel f1, f8, f0, f4 + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/amax_hummer.S b/kernel/power/amax_hummer.S new file mode 100644 index 0000000000..0d8b97db83 --- /dev/null +++ b/kernel/power/amax_hummer.S @@ -0,0 +1,540 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define INCX2 r6 +#define X2 r7 + +#define C1 f1 +#define C2 f0 +#define C3 f2 +#define C4 f3 + +#define A1 f4 +#define A2 f5 +#define A3 f6 +#define A4 f7 +#define A5 f8 +#define A6 f9 +#define A7 f10 +#define A8 f11 + +#define F1 f12 +#define F2 f13 +#define F3 f14 +#define F4 f15 +#define F5 f16 +#define F6 f17 +#define F7 f18 +#define F8 f19 + +#define T1 f20 +#define T2 f21 +#define T3 f22 +#define T4 f23 +#define T5 f24 +#define T6 f25 +#define T7 f26 +#define T8 f27 + + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + + stfpdux f16, SP, r10 + stfpdux f17, SP, r10 + stfpdux f18, SP, r10 + stfpdux f19, SP, r10 + + stfpdux f20, SP, r10 + stfpdux f21, SP, r10 + stfpdux f22, SP, r10 + stfpdux f23, SP, r10 + + stfpdux f24, SP, r10 + stfpdux f25, SP, r10 + stfpdux f26, SP, r10 + stfpdux f27, SP, r10 + + li r10, 0 + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + lfpdx C1, SP, r10 # Zero clear + + slwi INCX, INCX, BASE_SHIFT + add INCX2, INCX, INCX + + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, INCX, 0 + ble LL(999) + + LFD C1, 0 * SIZE(X) + add X, X, INCX + + addi N, N, -1 + cmpwi cr0, N, 0 + fabs C1, C1 + ble LL(999) + + fsmfp C1, C1 + fpmr C2, C1 + fpmr C3, C1 + fpmr C4, C1 + + cmpwi cr0, INCX, SIZE + bne LL(100) + + andi. r0, X, 2 * SIZE - 1 + beq LL(05) + + LFD C2, 0 * SIZE(X) + add X, X, INCX + + addi N, N, -1 + cmpwi cr0, N, 0 + fabs C2, C2 + ble LL(998) + .align 4 + +LL(05): + sub X, X, INCX2 + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(15) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + LFPDUX A5, X, INCX2 + fpabs T1, A1 + LFPDUX A6, X, INCX2 + fpabs T2, A2 + LFPDUX A7, X, INCX2 + fpabs T3, A3 + LFPDUX A8, X, INCX2 + fpabs T4, A4 + bdz LL(13) + .align 4 + +LL(12): + fpsub F1, C1, T1 + LFPDUX A1, X, INCX2 + fpsub F2, C2, T2 + LFPDUX A2, X, INCX2 + fpsub F3, C3, T3 + LFPDUX A3, X, INCX2 + fpsub F4, C4, T4 + LFPDUX A4, X, INCX2 + + fpabs T5, A5 + fpabs T6, A6 + fpabs T7, A7 + fpabs T8, A8 + + fpsel C1, F1, C1, T1 + LFPDUX A5, X, INCX2 + fpsel C2, F2, C2, T2 + LFPDUX A6, X, INCX2 + fpsel C3, F3, C3, T3 + LFPDUX A7, X, INCX2 + fpsel C4, F4, C4, T4 + LFPDUX A8, X, INCX2 + + fpsub F5, C1, T5 + fpsub F6, C2, T6 + fpsub F7, C3, T7 + fpsub F8, C4, T8 + + fpabs T1, A1 + fpabs T2, A2 + fpabs T3, A3 + fpabs T4, A4 + + fpsel C1, F5, C1, T5 + fpsel C2, F6, C2, T6 + fpsel C3, F7, C3, T7 + fpsel C4, F8, C4, T8 + bdnz LL(12) + .align 4 + +LL(13): + fpabs T5, A5 + fpabs T6, A6 + fpabs T7, A7 + fpabs T8, A8 + + fpsub F1, C1, T1 + fpsub F2, C2, T2 + fpsub F3, C3, T3 + fpsub F4, C4, T4 + + fpsel C1, F1, C1, T1 + fpsel C2, F2, C2, T2 + fpsel C3, F3, C3, T3 + fpsel C4, F4, C4, T4 + + fpsub F5, C1, T5 + fpsub F6, C2, T6 + fpsub F7, C3, T7 + fpsub F8, C4, T8 + + fpsel C1, F5, C1, T5 + fpsel C2, F6, C2, T6 + fpsel C3, F7, C3, T7 + fpsel C4, F8, C4, T8 + .align 4 + +LL(15): + andi. r0, N, 15 + beq LL(998) + + andi. 
r0, N, 8 + beq LL(16) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + fpabs A1, A1 + fpabs A2, A2 + fpabs A3, A3 + fpabs A4, A4 + + fpsub F1, C1, A1 + fpsub F2, C2, A2 + fpsub F3, C3, A3 + fpsub F4, C4, A4 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + .align 4 + +LL(16): + andi. r0, N, 4 + beq LL(17) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + + fpabs A1, A1 + fpabs A2, A2 + + fpsub F1, C1, A1 + fpsub F2, C2, A2 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + .align 4 + +LL(17): + andi. r0, N, 2 + beq LL(18) + + LFPDUX A1, X, INCX2 + fpabs A1, A1 + fpsub F1, C1, A1 + fpsel C1, F1, C1, A1 + .align 4 + +LL(18): + andi. r0, N, 1 + beq LL(998) + + LFDUX A1, X, INCX2 + fabs A1, A1 + fsub F1, C1, A1 + fsel C1, F1, C1, A1 + b LL(998) + .align 4 + + +LL(100): + sub X, X, INCX + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(105) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + LFSDUX A1, X, INCX + LFSDUX A2, X, INCX + LFSDUX A3, X, INCX + LFSDUX A4, X, INCX + + LFDUX A5, X, INCX + LFDUX A6, X, INCX + LFDUX A7, X, INCX + LFDUX A8, X, INCX + LFSDUX A5, X, INCX + fpabs T1, A1 + LFSDUX A6, X, INCX + fpabs T2, A2 + LFSDUX A7, X, INCX + fpabs T3, A3 + LFSDUX A8, X, INCX + fpabs T4, A4 + bdz LL(103) + .align 4 + +LL(102): + fpsub F1, C1, T1 + LFDUX A1, X, INCX + fpsub F2, C2, T2 + LFDUX A2, X, INCX + fpsub F3, C3, T3 + LFDUX A3, X, INCX + fpsub F4, C4, T4 + LFDUX A4, X, INCX + + fpabs T5, A5 + LFSDUX A1, X, INCX + fpabs T6, A6 + LFSDUX A2, X, INCX + fpabs T7, A7 + LFSDUX A3, X, INCX + fpabs T8, A8 + LFSDUX A4, X, INCX + + fpsel C1, F1, C1, T1 + LFDUX A5, X, INCX + fpsel C2, F2, C2, T2 + LFDUX A6, X, INCX + fpsel C3, F3, C3, T3 + LFDUX A7, X, INCX + fpsel C4, F4, C4, T4 + LFDUX A8, X, INCX + + fpsub F5, C1, T5 + LFSDUX A5, X, INCX + fpsub F6, C2, T6 + LFSDUX A6, X, INCX + fpsub F7, C3, T7 + LFSDUX A7, X, INCX + fpsub F8, C4, T8 + LFSDUX A8, X, INCX + + fpabs T1, A1 + fpabs T2, A2 + fpabs T3, A3 + fpabs T4, A4 + + fpsel C1, F5, C1, T5 + fpsel C2, F6, C2, T6 + fpsel C3, F7, C3, T7 + fpsel C4, F8, C4, T8 + bdnz LL(102) + .align 4 + +LL(103): + fpabs T5, A5 + fpabs T6, A6 + fpabs T7, A7 + fpabs T8, A8 + + fpsub F1, C1, T1 + fpsub F2, C2, T2 + fpsub F3, C3, T3 + fpsub F4, C4, T4 + + fpsel C1, F1, C1, T1 + fpsel C2, F2, C2, T2 + fpsel C3, F3, C3, T3 + fpsel C4, F4, C4, T4 + + fpsub F5, C1, T5 + fpsub F6, C2, T6 + fpsub F7, C3, T7 + fpsub F8, C4, T8 + + fpsel C1, F5, C1, T5 + fpsel C2, F6, C2, T6 + fpsel C3, F7, C3, T7 + fpsel C4, F8, C4, T8 + .align 4 + +LL(105): + andi. r0, N, 15 + beq LL(998) + + andi. r0, N, 8 + beq LL(106) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + LFSDUX A1, X, INCX + LFSDUX A2, X, INCX + LFSDUX A3, X, INCX + LFSDUX A4, X, INCX + + fpabs A1, A1 + fpabs A2, A2 + fpabs A3, A3 + fpabs A4, A4 + + fpsub F1, C1, A1 + fpsub F2, C2, A2 + fpsub F3, C3, A3 + fpsub F4, C4, A4 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + .align 4 + +LL(106): + andi. r0, N, 4 + beq LL(107) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + + fabs A1, A1 + fabs A2, A2 + fabs A3, A3 + fabs A4, A4 + + fsub F1, C1, A1 + fsub F2, C2, A2 + fsub F3, C3, A3 + fsub F4, C4, A4 + + fsel C1, F1, C1, A1 + fsel C2, F2, C2, A2 + fsel C3, F3, C3, A3 + fsel C4, F4, C4, A4 + .align 4 + +LL(107): + andi. 
r0, N, 2 + beq LL(108) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + fabs A1, A1 + fabs A2, A2 + fsub F1, C1, A1 + fsub F2, C2, A2 + fsel C1, F1, C1, A1 + fsel C2, F2, C2, A2 + .align 4 + +LL(108): + andi. r0, N, 1 + beq LL(998) + + LFDUX A1, X, INCX + fabs A1, A1 + fsub F1, C1, A1 + fsel C1, F1, C1, A1 + .align 4 + + +LL(998): + fpsub F1, C1, C2 + fpsub F2, C3, C4 + + fpsel C1, F1, C1, C2 + fpsel C3, F2, C3, C4 + + fpsub F1, C1, C3 + fpsel C1, F1, C1, C3 + + fsmtp C2, C1 + + fsub F1, C1, C2 + fsel C1, F1, C1, C2 + .align 4 + +LL(999): + li r10, 16 + + lfpdux f27, SP, r10 + lfpdux f26, SP, r10 + lfpdux f25, SP, r10 + lfpdux f24, SP, r10 + + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + lfpdux f20, SP, r10 + + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + blr + + EPILOGUE diff --git a/kernel/power/amax_ppc440.S b/kernel/power/amax_ppc440.S new file mode 100644 index 0000000000..0184493047 --- /dev/null +++ b/kernel/power/amax_ppc440.S @@ -0,0 +1,332 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define PREX r8 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, BASE_SHIFT + + sub X, X, INCX + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFDUX f1, X, INCX + + fabs f0, f1 + li PREX, 3 * 16 * SIZE + fabs f2, f1 + fabs f3, f1 + fabs f4, f1 + fabs f5, f1 + fabs f6, f1 + fabs f7, f1 + fabs f1, f1 + + subi N, N, 1 + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(150) + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + + fabs f8, f24 + LFDUX f24, X, INCX + fabs f9, f25 + LFDUX f25, X, INCX + fabs f10, f26 + LFDUX f26, X, INCX + fabs f11, f27 + LFDUX f27, X, INCX + + fabs f12, f28 + LFDUX f28, X, INCX + fabs f13, f29 + LFDUX f29, X, INCX + fabs f14, f30 + LFDUX f30, X, INCX + fabs f15, f31 + LFDUX f31, X, INCX + bdz LL(120) + .align 4 + +LL(110): + fsub f16, f0, f8 +#ifdef PPCG4 + dcbt X, PREX +#endif + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + LFDUX f24, X, INCX + fsel f1, f17, f1, f9 + fabs f9, f25 + LFDUX f25, X, INCX + fsel f2, f18, f2, f10 + fabs f10, f26 + LFDUX f26, X, INCX + fsel f3, f19, f3, f11 + fabs f11, f27 + LFDUX f27, X, INCX + + fsel f4, f20, f4, f12 +#ifdef PPCG4 + dcbt X, PREX +#endif + fabs f12, f28 + LFDUX f28, X, INCX + fsel f5, f21, f5, f13 + fabs f13, f29 + LFDUX f29, X, INCX + fsel f6, f22, f6, f14 + fabs f14, f30 + LFDUX f30, X, INCX + fsel f7, f23, f7, f15 + fabs f15, f31 + LFDUX f31, X, INCX + + fsub f16, f0, f8 +#ifdef PPCG4 + dcbt X, PREX +#endif + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + LFDUX f24, X, INCX + fsel f1, f17, f1, f9 + fabs f9, f25 + LFDUX f25, X, INCX + fsel f2, f18, f2, f10 + fabs f10, f26 + LFDUX f26, X, INCX + fsel f3, f19, f3, f11 + fabs f11, f27 + LFDUX f27, X, INCX + + fsel f4, f20, f4, f12 +#ifdef PPCG4 + dcbt X, PREX +#endif + fabs f12, f28 + LFDUX f28, X, INCX + fsel f5, f21, f5, f13 + fabs f13, f29 + LFDUX f29, X, INCX + fsel f6, f22, f6, f14 + fabs f14, f30 + LFDUX f30, X, INCX + fsel f7, f23, f7, f15 + fabs f15, f31 + LFDUX f31, X, INCX + bdnz LL(110) + .align 4 + +LL(120): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, 
f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + fsel f2, f18, f2, f10 + fsel f3, f19, f3, f11 + fsel f4, f20, f4, f12 + fsel f5, f21, f5, f13 + fsel f6, f22, f6, f14 + fsel f7, f23, f7, f15 + .align 4 + +LL(150): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + fabs f8, f8 + fsub f16, f1, f8 + fsel f1, f16, f1, f8 + bdnz LL(160) + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + fsub f10, f4, f5 + fsub f11, f6, f7 + + fsel f0, f8, f0, f1 + fsel f2, f9, f2, f3 + fsel f4, f10, f4, f5 + fsel f6, f11, f6, f7 + + fsub f8, f0, f2 + fsub f9, f4, f6 + fsel f0, f8, f0, f2 + fsel f4, f9, f4, f6 + + fsub f8, f0, f4 + fsel f1, f8, f0, f4 + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/amin.S b/kernel/power/amin.S new file mode 100644 index 0000000000..01056c3d98 --- /dev/null +++ b/kernel/power/amin.S @@ -0,0 +1,523 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define PREA r8 + +#define FZERO f1 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, BASE_SHIFT + + li PREA, L1_PREFETCHSIZE + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFD f1, 0 * SIZE(X) + add X, X, INCX + + fabs f0, f1 + fabs f2, f1 + fabs f3, f1 + fabs f4, f1 + fabs f5, f1 + fabs f6, f1 + fabs f7, f1 + fabs f1, f1 + + subi N, N, 1 + + cmpwi cr0, INCX, SIZE + bne- cr0, LL(100) + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(50) + + LFD f24, 0 * SIZE(X) + LFD f25, 1 * SIZE(X) + LFD f26, 2 * SIZE(X) + LFD f27, 3 * SIZE(X) + LFD f28, 4 * SIZE(X) + LFD f29, 5 * SIZE(X) + LFD f30, 6 * SIZE(X) + LFD f31, 7 * SIZE(X) + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFD f24, 8 * SIZE(X) + LFD f25, 9 * SIZE(X) + LFD f26, 10 * SIZE(X) + LFD f27, 11 * SIZE(X) + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFD f28, 12 * SIZE(X) + LFD f29, 13 * SIZE(X) + LFD f30, 14 * SIZE(X) + LFD f31, 15 * SIZE(X) + bdz LL(20) + .align 4 + +LL(10): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fabs f8, f24 + fsel f1, f17, f9, f1 + fabs f9, f25 + fsel f2, f18, f10, f2 + fabs f10, f26 + fsel f3, f19, f11, f3 + fabs f11, f27 + + LFD f24, 16 * SIZE(X) + LFD f25, 17 * SIZE(X) + LFD f26, 18 * SIZE(X) + LFD f27, 19 * SIZE(X) + + fsel f4, f20, f12, f4 + fabs f12, f28 + fsel f5, f21, f13, f5 + fabs f13, f29 + fsel f6, f22, f14, f6 + fabs f14, f30 + fsel f7, f23, f15, f7 + fabs f15, f31 + + LFD f28, 20 * SIZE(X) + LFD f29, 21 * SIZE(X) + LFD f30, 22 * SIZE(X) + LFD f31, 23 * SIZE(X) + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fabs f8, f24 + fsel f1, f17, f9, f1 + fabs f9, f25 + fsel f2, f18, f10, f2 + fabs f10, f26 + fsel f3, f19, f11, f3 + fabs f11, f27 + + LFD f24, 24 * SIZE(X) + LFD f25, 25 * SIZE(X) + LFD f26, 26 * SIZE(X) + LFD f27, 27 * SIZE(X) + + fsel f4, f20, f12, f4 + fabs f12, f28 + fsel f5, f21, f13, f5 + fabs f13, f29 + fsel f6, f22, f14, f6 + fabs f14, f30 + fsel f7, f23, f15, f7 + fabs f15, f31 + + LFD f28, 28 * SIZE(X) + LFD f29, 29 * SIZE(X) + LFD f30, 30 * SIZE(X) + LFD f31, 31 * SIZE(X) + +#ifndef POWER6 + L1_PREFETCH X, PREA +#endif + addi X, X, 16 * SIZE +#ifdef POWER6 + L1_PREFETCH X, PREA +#endif + + bdnz LL(10) + .align 4 + +LL(20): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fabs f8, f24 + fsel f1, f17, f9, f1 + fabs f9, f25 + fsel f2, f18, f10, f2 + fabs f10, f26 + fsel f3, f19, f11, f3 + fabs f11, f27 + + fsel f4, f20, f12, f4 + 
fabs f12, f28 + fsel f5, f21, f13, f5 + fabs f13, f29 + fsel f6, f22, f14, f6 + fabs f14, f30 + fsel f7, f23, f15, f7 + fabs f15, f31 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fsel f1, f17, f9, f1 + fsel f2, f18, f10, f2 + fsel f3, f19, f11, f3 + fsel f4, f20, f12, f4 + fsel f5, f21, f13, f5 + fsel f6, f22, f14, f6 + fsel f7, f23, f15, f7 + addi X, X, 16 * SIZE + .align 4 + +LL(50): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + addi X, X, 1 * SIZE + fabs f8, f8 + fsub f16, f1, f8 + fsel f1, f16, f8, f1 + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): + sub X, X, INCX + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(150) + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + bdz LL(120) + .align 4 + +LL(110): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fabs f8, f24 + fsel f1, f17, f9, f1 + fabs f9, f25 + fsel f2, f18, f10, f2 + fabs f10, f26 + fsel f3, f19, f11, f3 + fabs f11, f27 + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + + fsel f4, f20, f12, f4 + fabs f12, f28 + fsel f5, f21, f13, f5 + fabs f13, f29 + fsel f6, f22, f14, f6 + fabs f14, f30 + fsel f7, f23, f15, f7 + fabs f15, f31 + + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fabs f8, f24 + fsel f1, f17, f9, f1 + fabs f9, f25 + fsel f2, f18, f10, f2 + fabs f10, f26 + fsel f3, f19, f11, f3 + fabs f11, f27 + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + + fsel f4, f20, f12, f4 + fabs f12, f28 + fsel f5, f21, f13, f5 + fabs f13, f29 + fsel f6, f22, f14, f6 + fabs f14, f30 + fsel f7, f23, f15, f7 + fabs f15, f31 + + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + bdnz LL(110) + .align 4 + +LL(120): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fabs f8, f24 + fsel f1, f17, f9, f1 + fabs f9, f25 + fsel f2, f18, f10, f2 + fabs f10, f26 + fsel f3, f19, f11, f3 + fabs f11, f27 + + fsel f4, f20, f12, f4 + fabs f12, f28 + fsel f5, f21, f13, f5 + fabs f13, f29 + fsel f6, f22, f14, f6 + fabs f14, f30 + fsel f7, f23, f15, f7 + fabs f15, f31 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fsel f1, f17, f9, f1 + fsel f2, f18, f10, f2 + fsel f3, f19, f11, f3 + fsel f4, f20, f12, f4 + fsel f5, f21, f13, f5 + fsel f6, f22, f14, f6 + fsel f7, f23, f15, f7 + .align 4 + +LL(150): + andi. 
r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + fabs f8, f8 + fsub f16, f1, f8 + fsel f1, f16, f8, f1 + bdnz LL(160) + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + fsub f10, f4, f5 + fsub f11, f6, f7 + + fsel f0, f8, f1, f0 + fsel f2, f9, f3, f2 + fsel f4, f10, f5, f4 + fsel f6, f11, f7, f6 + + fsub f8, f0, f2 + fsub f9, f4, f6 + fsel f0, f8, f2, f0 + fsel f4, f9, f6, f4 + + fsub f8, f0, f4 + fsel f1, f8, f4, f0 + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/amin_cell.S b/kernel/power/amin_cell.S new file mode 100644 index 0000000000..e4179f52a5 --- /dev/null +++ b/kernel/power/amin_cell.S @@ -0,0 +1,691 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define PREA r8 + +#define FZERO f1 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, BASE_SHIFT + + li PREA, 10 * 16 * SIZE + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFD f1, 0 * SIZE(X) + add X, X, INCX + + fabs f0, f1 + fabs f2, f1 + fabs f3, f1 + fabs f4, f1 + fabs f5, f1 + fabs f6, f1 + fabs f7, f1 + fabs f1, f1 + + subi N, N, 1 + + cmpwi cr0, INCX, SIZE + bne- cr0, LL(20) + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(15) + + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + + bdz LL(13) + .align 4 + +LL(12): + fabs f8, f8 + LFD f10, 2 * SIZE(X) + fabs f9, f9 + LFD f11, 3 * SIZE(X) + fabs f10, f10 + LFD f12, 4 * SIZE(X) + fabs f11, f11 + LFD f13, 5 * SIZE(X) + fabs f12, f12 + LFD f14, 6 * SIZE(X) + fabs f13, f13 + LFD f15, 7 * SIZE(X) + + fabs f14, f14 + dcbt X, PREA + fabs f15, f15 + nop + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fsel f1, f17, f9, f1 + fsel f2, f18, f10, f2 + fsel f3, f19, f11, f3 + fsel f4, f20, f12, f4 + fsel f5, f21, f13, f5 + + fsel f6, f22, f14, f6 + LFD f8, 8 * SIZE(X) + fsel f7, f23, f15, f7 + LFD f9, 9 * SIZE(X) + + fabs f8, f8 + LFD f10, 10 * SIZE(X) + fabs f9, f9 + LFD f11, 11 * SIZE(X) + fabs f10, f10 + LFD f12, 12 * SIZE(X) + fabs f11, f11 + LFD f13, 13 * SIZE(X) + fabs f12, f12 + LFD f14, 14 * SIZE(X) + fabs f13, f13 + LFD f15, 15 * SIZE(X) + + fabs f14, f14 + addi X, X, 16 * SIZE + fabs f15, f15 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fsel f1, f17, f9, f1 + fsel f2, f18, f10, f2 + fsel f3, f19, f11, f3 + fsel f4, f20, f12, f4 + fsel f5, f21, f13, f5 + LFD f8, 0 * SIZE(X) + + fsel f6, f22, f14, f6 + LFD f9, 1 * SIZE(X) + fsel f7, f23, f15, f7 + bdnz LL(12) + .align 4 + +LL(13): + fabs f8, f8 + LFD f10, 2 * SIZE(X) + fabs f9, f9 + LFD f11, 3 * SIZE(X) + fabs f10, f10 + LFD f12, 4 * SIZE(X) + fabs f11, f11 + LFD f13, 5 * SIZE(X) + fabs f12, f12 + LFD f14, 6 * SIZE(X) + fabs f13, f13 + LFD f15, 7 * SIZE(X) + + fabs f14, f14 + fabs f15, f15 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fsel f1, f17, f9, f1 + fsel f2, f18, f10, f2 + fsel f3, f19, f11, f3 + fsel f4, f20, f12, f4 + fsel f5, f21, f13, f5 + + fsel f6, f22, f14, f6 + LFD f8, 8 * SIZE(X) + fsel f7, f23, f15, f7 + LFD f9, 9 * SIZE(X) + + fabs f8, f8 + LFD f10, 10 * SIZE(X) + fabs f9, f9 + LFD f11, 11 * SIZE(X) + fabs f10, f10 + LFD f12, 12 * SIZE(X) + fabs f11, f11 + LFD f13, 13 * SIZE(X) + fabs f12, f12 + LFD f14, 14 * SIZE(X) + fabs f13, f13 + LFD f15, 15 * SIZE(X) 
+ + fabs f14, f14 + addi X, X, 16 * SIZE + fabs f15, f15 + nop + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fsel f1, f17, f9, f1 + fsel f2, f18, f10, f2 + fsel f3, f19, f11, f3 + fsel f4, f20, f12, f4 + fsel f5, f21, f13, f5 + fsel f6, f22, f14, f6 + fsel f7, f23, f15, f7 + .align 4 + +LL(15): + andi. r0, N, 15 + beq LL(999) + + andi. r0, N, 8 + beq LL(16) + + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + + fabs f8, f8 + LFD f10, 2 * SIZE(X) + fabs f9, f9 + LFD f11, 3 * SIZE(X) + fabs f10, f10 + LFD f12, 4 * SIZE(X) + fabs f11, f11 + LFD f13, 5 * SIZE(X) + + fabs f12, f12 + LFD f14, 6 * SIZE(X) + fabs f13, f13 + LFD f15, 7 * SIZE(X) + fabs f14, f14 + fabs f15, f15 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fsel f1, f17, f9, f1 + fsel f2, f18, f10, f2 + fsel f3, f19, f11, f3 + + fsel f4, f20, f12, f4 + fsel f5, f21, f13, f5 + fsel f6, f22, f14, f6 + nop + fsel f7, f23, f15, f7 + addi X, X, 8 * SIZE + .align 4 + +LL(16): + andi. r0, N, 4 + beq LL(17) + + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + + fabs f8, f8 + LFD f10, 2 * SIZE(X) + fabs f9, f9 + LFD f11, 3 * SIZE(X) + fabs f10, f10 + fabs f11, f11 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + + fsel f0, f16, f8, f0 + fsel f1, f17, f9, f1 + fsel f2, f18, f10, f2 + nop + fsel f3, f19, f11, f3 + addi X, X, 4 * SIZE + .align 4 + +LL(17): + andi. r0, N, 2 + beq LL(18) + + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + + fabs f8, f8 + fabs f9, f9 + fsub f16, f0, f8 + fsub f17, f1, f9 + + fsel f0, f16, f8, f0 + nop + fsel f1, f17, f9, f1 + addi X, X, 2 * SIZE + .align 4 + +LL(18): + andi. r0, N, 1 + beq LL(999) + + LFD f8, 0 * SIZE(X) + fabs f8, f8 + fsub f16, f0, f8 + fsel f0, f16, f8, f0 + b LL(999) + .align 4 + +LL(20): + sub X, X, INCX + + srawi. 
r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(25) + + LFDUX f8, X, INCX + LFDUX f9, X, INCX + + bdz LL(23) + .align 4 + +LL(22): + fabs f8, f8 + LFDUX f10, X, INCX + fabs f9, f9 + LFDUX f11, X, INCX + fabs f10, f10 + LFDUX f12, X, INCX + fabs f11, f11 + LFDUX f13, X, INCX + fabs f12, f12 + LFDUX f14, X, INCX + fabs f13, f13 + LFDUX f15, X, INCX + + fabs f14, f14 + fabs f15, f15 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fsel f1, f17, f9, f1 + fsel f2, f18, f10, f2 + fsel f3, f19, f11, f3 + fsel f4, f20, f12, f4 + fsel f5, f21, f13, f5 + + fsel f6, f22, f14, f6 + LFDUX f8, X, INCX + fsel f7, f23, f15, f7 + LFDUX f9, X, INCX + + fabs f8, f8 + LFDUX f10, X, INCX + fabs f9, f9 + LFDUX f11, X, INCX + fabs f10, f10 + LFDUX f12, X, INCX + fabs f11, f11 + LFDUX f13, X, INCX + fabs f12, f12 + LFDUX f14, X, INCX + fabs f13, f13 + LFDUX f15, X, INCX + + fabs f14, f14 + fabs f15, f15 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fsel f1, f17, f9, f1 + fsel f2, f18, f10, f2 + fsel f3, f19, f11, f3 + fsel f4, f20, f12, f4 + fsel f5, f21, f13, f5 + LFDUX f8, X, INCX + + fsel f6, f22, f14, f6 + LFDUX f9, X, INCX + fsel f7, f23, f15, f7 + bdnz LL(22) + .align 4 + +LL(23): + fabs f8, f8 + LFDUX f10, X, INCX + fabs f9, f9 + LFDUX f11, X, INCX + fabs f10, f10 + LFDUX f12, X, INCX + fabs f11, f11 + LFDUX f13, X, INCX + fabs f12, f12 + LFDUX f14, X, INCX + fabs f13, f13 + LFDUX f15, X, INCX + + fabs f14, f14 + fabs f15, f15 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fsel f1, f17, f9, f1 + fsel f2, f18, f10, f2 + fsel f3, f19, f11, f3 + fsel f4, f20, f12, f4 + fsel f5, f21, f13, f5 + + fsel f6, f22, f14, f6 + LFDUX f8, X, INCX + fsel f7, f23, f15, f7 + LFDUX f9, X, INCX + + fabs f8, f8 + LFDUX f10, X, INCX + fabs f9, f9 + LFDUX f11, X, INCX + fabs f10, f10 + LFDUX f12, X, INCX + fabs f11, f11 + LFDUX f13, X, INCX + fabs f12, f12 + LFDUX f14, X, INCX + fabs f13, f13 + LFDUX f15, X, INCX + + fabs f14, f14 + fabs f15, f15 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fsel f1, f17, f9, f1 + fsel f2, f18, f10, f2 + fsel f3, f19, f11, f3 + fsel f4, f20, f12, f4 + fsel f5, f21, f13, f5 + fsel f6, f22, f14, f6 + fsel f7, f23, f15, f7 + .align 4 + +LL(25): + andi. r0, N, 15 + beq LL(999) + + andi. r0, N, 8 + beq LL(26) + + LFDUX f8, X, INCX + LFDUX f9, X, INCX + + fabs f8, f8 + LFDUX f10, X, INCX + fabs f9, f9 + LFDUX f11, X, INCX + fabs f10, f10 + LFDUX f12, X, INCX + fabs f11, f11 + LFDUX f13, X, INCX + + fabs f12, f12 + LFDUX f14, X, INCX + fabs f13, f13 + LFDUX f15, X, INCX + fabs f14, f14 + fabs f15, f15 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fsel f1, f17, f9, f1 + fsel f2, f18, f10, f2 + fsel f3, f19, f11, f3 + + fsel f4, f20, f12, f4 + fsel f5, f21, f13, f5 + fsel f6, f22, f14, f6 + fsel f7, f23, f15, f7 + .align 4 + +LL(26): + andi. 
r0, N, 4 + beq LL(27) + + LFDUX f8, X, INCX + LFDUX f9, X, INCX + + fabs f8, f8 + LFDUX f10, X, INCX + fabs f9, f9 + LFDUX f11, X, INCX + fabs f10, f10 + fabs f11, f11 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + + fsel f0, f16, f8, f0 + fsel f1, f17, f9, f1 + fsel f2, f18, f10, f2 + fsel f3, f19, f11, f3 + .align 4 + +LL(27): + andi. r0, N, 2 + beq LL(28) + + LFDUX f8, X, INCX + LFDUX f9, X, INCX + + fabs f8, f8 + fabs f9, f9 + fsub f16, f0, f8 + fsub f17, f1, f9 + + fsel f0, f16, f8, f0 + fsel f1, f17, f9, f1 + .align 4 + +LL(28): + andi. r0, N, 1 + beq LL(999) + + LFDUX f8, X, INCX + fabs f8, f8 + fsub f16, f0, f8 + fsel f0, f16, f8, f0 + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + fsub f10, f4, f5 + fsub f11, f6, f7 + + fsel f0, f8, f1, f0 + fsel f2, f9, f3, f2 + fsel f4, f10, f5, f4 + fsel f6, f11, f7, f6 + + fsub f8, f0, f2 + fsub f9, f4, f6 + fsel f0, f8, f2, f0 + fsel f4, f9, f6, f4 + + fsub f8, f0, f4 + fsel f1, f8, f4, f0 + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/amin_hummer.S b/kernel/power/amin_hummer.S new file mode 100644 index 0000000000..f4bbf070b5 --- /dev/null +++ b/kernel/power/amin_hummer.S @@ -0,0 +1,539 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define INCX2 r6 +#define X2 r7 + +#define C1 f1 +#define C2 f0 +#define C3 f2 +#define C4 f3 + +#define A1 f4 +#define A2 f5 +#define A3 f6 +#define A4 f7 +#define A5 f8 +#define A6 f9 +#define A7 f10 +#define A8 f11 + +#define F1 f12 +#define F2 f13 +#define F3 f14 +#define F4 f15 +#define F5 f16 +#define F6 f17 +#define F7 f18 +#define F8 f19 + +#define T1 f20 +#define T2 f21 +#define T3 f22 +#define T4 f23 +#define T5 f24 +#define T6 f25 +#define T7 f26 +#define T8 f27 + + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + + stfpdux f16, SP, r10 + stfpdux f17, SP, r10 + stfpdux f18, SP, r10 + stfpdux f19, SP, r10 + + stfpdux f20, SP, r10 + stfpdux f21, SP, r10 + stfpdux f22, SP, r10 + stfpdux f23, SP, r10 + + stfpdux f24, SP, r10 + stfpdux f25, SP, r10 + stfpdux f26, SP, r10 + stfpdux f27, SP, r10 + + li r10, 0 + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + lfpdx C1, SP, r10 # Zero clear + + slwi INCX, INCX, BASE_SHIFT + add INCX2, INCX, INCX + + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, INCX, 0 + ble LL(999) + + LFD C1, 0 * SIZE(X) + add X, X, INCX + + addi N, N, -1 + cmpwi cr0, N, 0 + fabs C1, C1 + ble LL(999) + + fsmfp C1, C1 + fpmr C2, C1 + fpmr C3, C1 + fpmr C4, C1 + + cmpwi cr0, INCX, SIZE + bne LL(100) + + andi. r0, X, 2 * SIZE - 1 + beq LL(05) + + LFD C2, 0 * SIZE(X) + add X, X, INCX + + addi N, N, -1 + cmpwi cr0, N, 0 + fabs C2, C2 + ble LL(998) + .align 4 + +LL(05): + sub X, X, INCX2 + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(15) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + LFPDUX A5, X, INCX2 + fpabs T1, A1 + LFPDUX A6, X, INCX2 + fpabs T2, A2 + LFPDUX A7, X, INCX2 + fpabs T3, A3 + LFPDUX A8, X, INCX2 + fpabs T4, A4 + bdz LL(13) + .align 4 + +LL(12): + fpsub F1, T1, C1 + LFPDUX A1, X, INCX2 + fpsub F2, T2, C2 + LFPDUX A2, X, INCX2 + fpsub F3, T3, C3 + LFPDUX A3, X, INCX2 + fpsub F4, T4, C4 + LFPDUX A4, X, INCX2 + + fpabs T5, A5 + fpabs T6, A6 + fpabs T7, A7 + fpabs T8, A8 + + fpsel C1, F1, C1, T1 + LFPDUX A5, X, INCX2 + fpsel C2, F2, C2, T2 + LFPDUX A6, X, INCX2 + fpsel C3, F3, C3, T3 + LFPDUX A7, X, INCX2 + fpsel C4, F4, C4, T4 + LFPDUX A8, X, INCX2 + + fpsub F5, T5, C1 + fpsub F6, T6, C2 + fpsub F7, T7, C3 + fpsub F8, T8, C4 + + fpabs T1, A1 + fpabs T2, A2 + fpabs T3, A3 + fpabs T4, A4 + + fpsel C1, F5, C1, T5 + fpsel C2, F6, C2, T6 + fpsel C3, F7, C3, T7 + fpsel C4, F8, C4, T8 + bdnz LL(12) + .align 4 + +LL(13): + fpabs T5, A5 + fpabs T6, A6 + fpabs T7, A7 + fpabs T8, A8 + + fpsub F1, T1, C1 + fpsub F2, T2, C2 + fpsub F3, T3, C3 + fpsub F4, T4, C4 + + fpsel C1, F1, C1, T1 + fpsel C2, F2, C2, T2 + fpsel C3, F3, C3, T3 + fpsel C4, F4, C4, T4 + + fpsub F5, T5, C1 + fpsub F6, T6, C2 + fpsub F7, T7, C3 + fpsub F8, T8, C4 + + fpsel C1, F5, C1, T5 + fpsel C2, F6, C2, T6 + fpsel C3, F7, C3, T7 + fpsel C4, F8, C4, T8 + .align 4 + +LL(15): + andi. r0, N, 15 + beq LL(998) + + andi. 
r0, N, 8 + beq LL(16) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + fpabs A1, A1 + fpabs A2, A2 + fpabs A3, A3 + fpabs A4, A4 + + fpsub F1, A1, C1 + fpsub F2, A2, C2 + fpsub F3, A3, C3 + fpsub F4, A4, C4 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + .align 4 + +LL(16): + andi. r0, N, 4 + beq LL(17) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + + fpabs A1, A1 + fpabs A2, A2 + + fpsub F1, A1, C1 + fpsub F2, A2, C2 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + .align 4 + +LL(17): + andi. r0, N, 2 + beq LL(18) + + LFPDUX A1, X, INCX2 + fpabs A1, A1 + fpsub F1, A1, C1 + fpsel C1, F1, C1, A1 + .align 4 + +LL(18): + andi. r0, N, 1 + beq LL(998) + + LFDUX A1, X, INCX2 + fabs A1, A1 + fsub F1, A1, C1 + fsel C1, F1, C1, A1 + b LL(998) + .align 4 + + +LL(100): + sub X, X, INCX + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(105) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + LFSDUX A1, X, INCX + LFSDUX A2, X, INCX + LFSDUX A3, X, INCX + LFSDUX A4, X, INCX + + LFDUX A5, X, INCX + LFDUX A6, X, INCX + LFDUX A7, X, INCX + LFDUX A8, X, INCX + LFSDUX A5, X, INCX + fpabs T1, A1 + LFSDUX A6, X, INCX + fpabs T2, A2 + LFSDUX A7, X, INCX + fpabs T3, A3 + LFSDUX A8, X, INCX + fpabs T4, A4 + bdz LL(103) + .align 4 + +LL(102): + fpsub F1, T1, C1 + LFDUX A1, X, INCX + fpsub F2, T2, C2 + LFDUX A2, X, INCX + fpsub F3, T3, C3 + LFDUX A3, X, INCX + fpsub F4, T4, C4 + LFDUX A4, X, INCX + + fpabs T5, A5 + LFSDUX A1, X, INCX + fpabs T6, A6 + LFSDUX A2, X, INCX + fpabs T7, A7 + LFSDUX A3, X, INCX + fpabs T8, A8 + LFSDUX A4, X, INCX + + fpsel C1, F1, C1, T1 + LFDUX A5, X, INCX + fpsel C2, F2, C2, T2 + LFDUX A6, X, INCX + fpsel C3, F3, C3, T3 + LFDUX A7, X, INCX + fpsel C4, F4, C4, T4 + LFDUX A8, X, INCX + + fpsub F5, T5, C1 + LFSDUX A5, X, INCX + fpsub F6, T6, C2 + LFSDUX A6, X, INCX + fpsub F7, T7, C3 + LFSDUX A7, X, INCX + fpsub F8, T8, C4 + LFSDUX A8, X, INCX + + fpabs T1, A1 + fpabs T2, A2 + fpabs T3, A3 + fpabs T4, A4 + + fpsel C1, F5, C1, T5 + fpsel C2, F6, C2, T6 + fpsel C3, F7, C3, T7 + fpsel C4, F8, C4, T8 + bdnz LL(102) + .align 4 + +LL(103): + fpabs T5, A5 + fpabs T6, A6 + fpabs T7, A7 + fpabs T8, A8 + + fpsub F1, T1, C1 + fpsub F2, T2, C2 + fpsub F3, T3, C3 + fpsub F4, T4, C4 + + fpsel C1, F1, C1, T1 + fpsel C2, F2, C2, T2 + fpsel C3, F3, C3, T3 + fpsel C4, F4, C4, T4 + + fpsub F5, T5, C1 + fpsub F6, T6, C2 + fpsub F7, T7, C3 + fpsub F8, T8, C4 + + fpsel C1, F5, C1, T5 + fpsel C2, F6, C2, T6 + fpsel C3, F7, C3, T7 + fpsel C4, F8, C4, T8 + .align 4 + +LL(105): + andi. r0, N, 15 + beq LL(998) + + andi. r0, N, 8 + beq LL(106) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + LFSDUX A1, X, INCX + LFSDUX A2, X, INCX + LFSDUX A3, X, INCX + LFSDUX A4, X, INCX + + fpabs A1, A1 + fpabs A2, A2 + fpabs A3, A3 + fpabs A4, A4 + + fpsub F1, A1, C1 + fpsub F2, A2, C2 + fpsub F3, A3, C3 + fpsub F4, A4, C4 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + .align 4 + +LL(106): + andi. r0, N, 4 + beq LL(107) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + + fabs A1, A1 + fabs A2, A2 + fabs A3, A3 + fabs A4, A4 + + fsub F1, A1, C1 + fsub F2, A2, C2 + fsub F3, A3, C3 + fsub F4, A4, C4 + + fsel C1, F1, C1, A1 + fsel C2, F2, C2, A2 + fsel C3, F3, C3, A3 + fsel C4, F4, C4, A4 + .align 4 + +LL(107): + andi. 
r0, N, 2 + beq LL(108) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + fabs A1, A1 + fabs A2, A2 + fsub F1, A1, C1 + fsub F2, A2, C2 + fsel C1, F1, C1, A1 + fsel C2, F2, C2, A2 + .align 4 + +LL(108): + andi. r0, N, 1 + beq LL(998) + + LFDUX A1, X, INCX + fabs A1, A1 + fsub F1, A1, C1 + fsel C1, F1, C1, A1 + .align 4 + +LL(998): + fpsub F1, C2, C1 + fpsub F2, C4, C3 + + fpsel C1, F1, C1, C2 + fpsel C3, F2, C3, C4 + + fpsub F1, C3, C1 + fpsel C1, F1, C1, C3 + + fsmtp C2, C1 + + fsub F1, C2, C1 + fsel C1, F1, C1, C2 + .align 4 + +LL(999): + li r10, 16 + + lfpdux f27, SP, r10 + lfpdux f26, SP, r10 + lfpdux f25, SP, r10 + lfpdux f24, SP, r10 + + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + lfpdux f20, SP, r10 + + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + blr + + EPILOGUE diff --git a/kernel/power/amin_ppc440.S b/kernel/power/amin_ppc440.S new file mode 100644 index 0000000000..b47742bb70 --- /dev/null +++ b/kernel/power/amin_ppc440.S @@ -0,0 +1,333 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 +#define INC1 r6 + +#define PREX r8 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, BASE_SHIFT + + sub X, X, INCX + li INC1, SIZE + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFDUX f1, X, INCX + + fabs f0, f1 + li PREX, 3 * 16 * SIZE + fabs f2, f1 + fabs f3, f1 + fabs f4, f1 + fabs f5, f1 + subi N, N, 1 + fabs f6, f1 + srawi. r0, N, 4 + fabs f7, f1 + mtspr CTR, r0 + fabs f1, f1 + beq- LL(150) + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + + fabs f8, f24 + LFDUX f24, X, INCX + fabs f9, f25 + LFDUX f25, X, INCX + fabs f10, f26 + LFDUX f26, X, INCX + fabs f11, f27 + LFDUX f27, X, INCX + + fabs f12, f28 + LFDUX f28, X, INCX + fabs f13, f29 + LFDUX f29, X, INCX + fabs f14, f30 + LFDUX f30, X, INCX + fabs f15, f31 + LFDUX f31, X, INCX + bdz LL(120) + .align 4 + +LL(110): + fsub f16, f0, f8 +#ifdef PPCG4 + dcbt X, PREX +#endif + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fabs f8, f24 + LFDUX f24, X, INCX + fsel f1, f17, f9, f1 + fabs f9, f25 + LFDUX f25, X, INCX + fsel f2, f18, f10, f2 + fabs f10, f26 + LFDUX f26, X, INCX + fsel f3, f19, f11, f3 + fabs f11, f27 + LFDUX f27, X, INCX + + fsel f4, f20, f12, f4 +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PREX +#endif + fabs f12, f28 + LFDUX f28, X, INCX + fsel f5, f21, f13, f5 + fabs f13, f29 + LFDUX f29, X, INCX + fsel f6, f22, f14, f6 + fabs f14, f30 + LFDUX f30, X, INCX + fsel f7, f23, f15, f7 + fabs f15, f31 + LFDUX f31, X, INCX + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 +#ifdef PPCG4 + dcbt X, PREX +#endif + fabs f8, f24 + LFDUX f24, X, INCX + fsel f1, f17, f9, f1 + fabs f9, f25 + LFDUX f25, X, INCX + fsel f2, f18, f10, f2 + fabs f10, f26 + LFDUX f26, X, INCX + fsel f3, f19, f11, f3 + fabs f11, f27 + LFDUX f27, X, INCX + + fsel f4, f20, f12, f4 +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PREX +#endif + fabs f12, f28 + LFDUX f28, X, INCX + fsel f5, f21, f13, f5 + fabs f13, f29 + LFDUX f29, X, INCX + fsel f6, f22, f14, f6 + fabs f14, f30 + LFDUX f30, X, INCX + fsel f7, f23, f15, f7 + fabs f15, f31 + LFDUX f31, X, INCX + bdnz LL(110) + .align 4 + +LL(120): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fabs f8, f24 + fsel f1, f17, f9, f1 + fabs f9, f25 + fsel f2, f18, f10, f2 + fabs f10, f26 + fsel f3, f19, f11, f3 + fabs f11, f27 + + fsel f4, f20, f12, f4 + fabs f12, f28 + fsel f5, f21, f13, f5 + fabs f13, f29 + fsel f6, f22, f14, f6 + fabs f14, f30 + 
fsel f7, f23, f15, f7 + fabs f15, f31 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fsel f1, f17, f9, f1 + fsel f2, f18, f10, f2 + fsel f3, f19, f11, f3 + fsel f4, f20, f12, f4 + fsel f5, f21, f13, f5 + fsel f6, f22, f14, f6 + fsel f7, f23, f15, f7 + .align 4 + +LL(150): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + fabs f8, f8 + fsub f16, f1, f8 + fsel f1, f16, f8, f1 + bdnz LL(160) + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + fsub f10, f4, f5 + fsub f11, f6, f7 + + fsel f0, f8, f1, f0 + fsel f2, f9, f3, f2 + fsel f4, f10, f5, f4 + fsel f6, f11, f7, f6 + + fsub f8, f0, f2 + fsub f9, f4, f6 + fsel f0, f8, f2, f0 + fsel f4, f9, f6, f4 + + fsub f8, f0, f4 + fsel f1, f8, f4, f0 + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/asum.S b/kernel/power/asum.S new file mode 100644 index 0000000000..1188aa5c10 --- /dev/null +++ b/kernel/power/asum.S @@ -0,0 +1,448 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define PREA r8 + +#define FZERO f0 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, BASE_SHIFT + + fmr f1, FZERO + fmr f2, FZERO + fmr f3, FZERO + fmr f4, FZERO + fmr f5, FZERO + fmr f6, FZERO + fmr f7, FZERO + + li PREA, L1_PREFETCHSIZE + + cmpwi cr0, N, 0 + ble- LL(999) + + cmpwi cr0, INCX, 0 + ble- LL(999) + + cmpwi cr0, INCX, SIZE + bne- cr0, LL(100) + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(50) + .align 4 + + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + LFD f10, 2 * SIZE(X) + LFD f11, 3 * SIZE(X) + LFD f12, 4 * SIZE(X) + LFD f13, 5 * SIZE(X) + LFD f14, 6 * SIZE(X) + LFD f15, 7 * SIZE(X) + + LFD f24, 8 * SIZE(X) + LFD f25, 9 * SIZE(X) + LFD f26, 10 * SIZE(X) + LFD f27, 11 * SIZE(X) + LFD f28, 12 * SIZE(X) + LFD f29, 13 * SIZE(X) + LFD f30, 14 * SIZE(X) + LFD f31, 15 * SIZE(X) + + fabs f16, f8 + fabs f17, f9 + fabs f18, f10 + fabs f19, f11 + + fabs f20, f12 + fabs f21, f13 + fabs f22, f14 + fabs f23, f15 + bdz LL(20) + .align 4 + +LL(10): + FADD f0, f0, f16 + fabs f16, f24 + FADD f1, f1, f17 + fabs f17, f25 + + FADD f2, f2, f18 + fabs f18, f26 + FADD f3, f3, f19 + fabs f19, f27 + + LFD f8, 16 * SIZE(X) + LFD f9, 17 * SIZE(X) + LFD f10, 18 * SIZE(X) + LFD f11, 19 * SIZE(X) + + FADD f4, f4, f20 + fabs f20, f28 + FADD f5, f5, f21 + fabs f21, f29 + + FADD f6, f6, f22 + fabs f22, f30 + FADD f7, f7, f23 + fabs f23, f31 + + LFD f12, 20 * SIZE(X) + LFD f13, 21 * SIZE(X) + LFD f14, 22 * SIZE(X) + LFD f15, 23 * SIZE(X) + + FADD f0, f0, f16 + fabs f16, f8 + FADD f1, f1, f17 + fabs f17, f9 + + FADD f2, f2, f18 + fabs f18, f10 + FADD f3, f3, f19 + fabs f19, f11 + + LFD f24, 24 * SIZE(X) + LFD f25, 25 * SIZE(X) + LFD f26, 26 * SIZE(X) + LFD f27, 27 * SIZE(X) + + FADD f4, f4, f20 + fabs f20, f12 + FADD f5, f5, f21 + fabs f21, f13 + + FADD f6, f6, f22 + fabs f22, f14 + FADD f7, f7, f23 + fabs f23, f15 + + LFD f28, 28 * SIZE(X) + LFD f29, 29 * SIZE(X) + LFD f30, 30 * SIZE(X) + LFD f31, 31 * SIZE(X) + +#ifndef POWER6 + L1_PREFETCH X, PREA +#endif + addi X, X, 16 * SIZE +#ifdef POWER6 + L1_PREFETCH X, PREA +#endif + + bdnz LL(10) + .align 4 + +LL(20): + FADD f0, f0, f16 + fabs f16, f24 + FADD f1, f1, f17 + fabs f17, f25 + + FADD f2, f2, f18 + fabs f18, f26 + FADD f3, f3, f19 + fabs f19, f27 + + FADD f4, f4, f20 + fabs f20, f28 + FADD f5, f5, f21 + fabs f21, f29 + + FADD f6, f6, f22 + fabs f22, f30 + FADD f7, f7, f23 + fabs f23, f31 + + FADD f0, f0, f16 + FADD f1, f1, f17 + FADD f2, f2, f18 + FADD f3, f3, f19 + + FADD f4, f4, f20 + FADD f5, f5, f21 + FADD f6, f6, f22 + FADD f7, f7, f23 + addi X, X, 16 * SIZE + .align 4 + +LL(50): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + addi X, X, 1 * SIZE + + fabs f8, f8 + FADD f0, f0, f8 + + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): + sub X, X, INCX + + srawi. 
r0, N, 4 + mtspr CTR, r0 + beq- LL(150) + + LFDUX f8, X, INCX + LFDUX f9, X, INCX + LFDUX f10, X, INCX + LFDUX f11, X, INCX + LFDUX f12, X, INCX + LFDUX f13, X, INCX + LFDUX f14, X, INCX + LFDUX f15, X, INCX + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + + fabs f16, f8 + fabs f17, f9 + fabs f18, f10 + fabs f19, f11 + + fabs f20, f12 + fabs f21, f13 + fabs f22, f14 + fabs f23, f15 + bdz LL(120) + .align 4 + +LL(110): + FADD f0, f0, f16 + fabs f16, f24 + FADD f1, f1, f17 + fabs f17, f25 + + FADD f2, f2, f18 + fabs f18, f26 + FADD f3, f3, f19 + fabs f19, f27 + + LFDUX f8, X, INCX + LFDUX f9, X, INCX + LFDUX f10, X, INCX + LFDUX f11, X, INCX + + FADD f4, f4, f20 + fabs f20, f28 + FADD f5, f5, f21 + fabs f21, f29 + + FADD f6, f6, f22 + fabs f22, f30 + FADD f7, f7, f23 + fabs f23, f31 + + LFDUX f12, X, INCX + LFDUX f13, X, INCX + LFDUX f14, X, INCX + LFDUX f15, X, INCX + + FADD f0, f0, f16 + fabs f16, f8 + FADD f1, f1, f17 + fabs f17, f9 + + FADD f2, f2, f18 + fabs f18, f10 + FADD f3, f3, f19 + fabs f19, f11 + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + + FADD f4, f4, f20 + fabs f20, f12 + FADD f5, f5, f21 + fabs f21, f13 + + FADD f6, f6, f22 + fabs f22, f14 + FADD f7, f7, f23 + fabs f23, f15 + + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + bdnz LL(110) + .align 4 + +LL(120): + FADD f0, f0, f16 + fabs f16, f24 + FADD f1, f1, f17 + fabs f17, f25 + + FADD f2, f2, f18 + fabs f18, f26 + FADD f3, f3, f19 + fabs f19, f27 + + FADD f4, f4, f20 + fabs f20, f28 + FADD f5, f5, f21 + fabs f21, f29 + + FADD f6, f6, f22 + fabs f22, f30 + FADD f7, f7, f23 + fabs f23, f31 + + FADD f0, f0, f16 + FADD f1, f1, f17 + FADD f2, f2, f18 + FADD f3, f3, f19 + + FADD f4, f4, f20 + FADD f5, f5, f21 + FADD f6, f6, f22 + FADD f7, f7, f23 + .align 4 + +LL(150): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + fabs f8, f8 + FADD f0, f0, f8 + bdnz LL(160) + .align 4 + +LL(999): + FADD f0, f0, f1 + FADD f2, f2, f3 + FADD f4, f4, f5 + FADD f6, f6, f7 + + FADD f0, f0, f2 + FADD f4, f4, f6 + FADD f1, f0, f4 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/asum_cell.S b/kernel/power/asum_cell.S new file mode 100644 index 0000000000..076651f33e --- /dev/null +++ b/kernel/power/asum_cell.S @@ -0,0 +1,599 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define PREA r8 + +#define FZERO f0 + +#define STACKSIZE 16 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stw r0, 0(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + lfs FZERO, 0(SP) + + slwi INCX, INCX, BASE_SHIFT + fmr f1, FZERO + li PREA, 8 * 16 * SIZE + fmr f2, FZERO + + cmpwi cr0, N, 0 + fmr f3, FZERO + ble- LL(999) + + cmpwi cr0, INCX, 0 + ble- LL(999) + + cmpwi cr0, INCX, SIZE + bne- cr0, LL(20) + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(15) + .align 4 + + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + + fabs f4, f8 + LFD f10, 2 * SIZE(X) + fabs f5, f9 + LFD f11, 3 * SIZE(X) + fabs f6, f10 + LFD f8, 4 * SIZE(X) + fabs f7, f11 + bdz LL(13) + .align 4 + +LL(12): + FADD f0, f0, f4 + dcbt X, PREA + fabs f4, f8 + LFD f9, 5 * SIZE(X) + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFD f10, 6 * SIZE(X) + + FADD f2, f2, f6 + nop + fabs f6, f10 + LFD f11, 7 * SIZE(X) + + FADD f3, f3, f7 + nop + fabs f7, f11 + LFD f8, 8 * SIZE(X) + + FADD f0, f0, f4 + nop + fabs f4, f8 + LFD f9, 9 * SIZE(X) + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFD f10, 10 * SIZE(X) + + FADD f2, f2, f6 + nop + fabs f6, f10 + LFD f11, 11 * SIZE(X) + + FADD f3, f3, f7 + nop + fabs f7, f11 + LFD f8, 12 * SIZE(X) + + FADD f0, f0, f4 + nop + fabs f4, f8 + LFD f9, 13 * SIZE(X) + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFD f10, 14 * SIZE(X) + + FADD f2, f2, f6 + nop + fabs f6, f10 + LFD f11, 15 * SIZE(X) + + FADD f3, f3, f7 + nop + fabs f7, f11 + LFD f8, 16 * SIZE(X) + + FADD f0, f0, f4 + nop + fabs f4, f8 + LFD f9, 17 * SIZE(X) + + FADD f1, f1, f5 + addi X, X, 16 * SIZE + fabs f5, f9 + LFD f10, 2 * SIZE(X) + + FADD f2, f2, f6 + nop + fabs f6, f10 + LFD f11, 3 * SIZE(X) + + FADD f3, f3, f7 + LFD f8, 4 * SIZE(X) + fabs f7, f11 + bdnz LL(12) + .align 4 + +LL(13): + FADD f0, f0, f4 + nop + fabs f4, f8 + LFD f9, 5 * SIZE(X) + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFD f10, 6 * SIZE(X) + + FADD f2, f2, f6 + nop + fabs f6, f10 + LFD f11, 7 * SIZE(X) + + FADD f3, f3, f7 + nop + fabs f7, f11 + LFD f8, 8 * SIZE(X) + + FADD f0, f0, f4 + nop + fabs f4, f8 + LFD f9, 9 * SIZE(X) + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFD f10, 10 * SIZE(X) + + FADD f2, f2, f6 + nop + fabs f6, f10 + LFD f11, 11 * SIZE(X) + + FADD f3, 
f3, f7 + nop + fabs f7, f11 + LFD f8, 12 * SIZE(X) + + FADD f0, f0, f4 + nop + fabs f4, f8 + LFD f9, 13 * SIZE(X) + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFD f10, 14 * SIZE(X) + + FADD f2, f2, f6 + addi X, X, 16 * SIZE + fabs f6, f10 + LFD f11, -1 * SIZE(X) + + FADD f3, f3, f7 + fabs f7, f11 + + FADD f0, f0, f4 + FADD f1, f1, f5 + FADD f2, f2, f6 + FADD f3, f3, f7 + .align 4 + +LL(15): + andi. r0, N, 15 + beq LL(999) + + andi. r0, N, 8 + beq LL(16) + + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + + fabs f4, f8 + LFD f10, 2 * SIZE(X) + fabs f5, f9 + LFD f11, 3 * SIZE(X) + fabs f6, f10 + LFD f8, 4 * SIZE(X) + fabs f7, f11 + + FADD f0, f0, f4 + nop + fabs f4, f8 + LFD f9, 5 * SIZE(X) + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFD f10, 6 * SIZE(X) + + FADD f2, f2, f6 + addi X, X, 8 * SIZE + fabs f6, f10 + LFD f11, -1 * SIZE(X) + + FADD f3, f3, f7 + fabs f7, f11 + + FADD f0, f0, f4 + FADD f1, f1, f5 + FADD f2, f2, f6 + FADD f3, f3, f7 + .align 4 + +LL(16): + andi. r0, N, 4 + beq LL(17) + + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + + fabs f4, f8 + LFD f10, 2 * SIZE(X) + fabs f5, f9 + LFD f11, 3 * SIZE(X) + fabs f6, f10 + addi X, X, 4 * SIZE + fabs f7, f11 + nop + + FADD f0, f0, f4 + FADD f1, f1, f5 + FADD f2, f2, f6 + FADD f3, f3, f7 + .align 4 + +LL(17): + andi. r0, N, 2 + beq LL(18) + + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + + fabs f4, f8 + fabs f5, f9 + + FADD f0, f0, f4 + addi X, X, 2 * SIZE + FADD f1, f1, f5 + nop + .align 4 + +LL(18): + andi. r0, N, 1 + beq LL(999) + + LFD f8, 0 * SIZE(X) + fabs f4, f8 + FADD f0, f0, f4 + b LL(999) + .align 4 + +LL(20): + sub X, X, INCX + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(25) + .align 4 + + LFDUX f8, X, INCX + LFDUX f9, X, INCX + + fabs f4, f8 + LFDUX f10, X, INCX + fabs f5, f9 + LFDUX f11, X, INCX + fabs f6, f10 + LFDUX f8, X, INCX + fabs f7, f11 + bdz LL(23) + .align 4 + +LL(22): + FADD f0, f0, f4 + dcbt X, PREA + fabs f4, f8 + LFDUX f9, X, INCX + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFDUX f10, X, INCX + + FADD f2, f2, f6 + nop + fabs f6, f10 + LFDUX f11, X, INCX + + FADD f3, f3, f7 + nop + fabs f7, f11 + LFDUX f8, X, INCX + + FADD f0, f0, f4 + nop + fabs f4, f8 + LFDUX f9, X, INCX + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFDUX f10, X, INCX + + FADD f2, f2, f6 + nop + fabs f6, f10 + LFDUX f11, X, INCX + + FADD f3, f3, f7 + nop + fabs f7, f11 + LFDUX f8, X, INCX + + FADD f0, f0, f4 + nop + fabs f4, f8 + LFDUX f9, X, INCX + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFDUX f10, X, INCX + + FADD f2, f2, f6 + nop + fabs f6, f10 + LFDUX f11, X, INCX + + FADD f3, f3, f7 + nop + fabs f7, f11 + LFDUX f8, X, INCX + + FADD f0, f0, f4 + nop + fabs f4, f8 + LFDUX f9, X, INCX + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFDUX f10, X, INCX + + FADD f2, f2, f6 + nop + fabs f6, f10 + LFDUX f11, X, INCX + + FADD f3, f3, f7 + LFDUX f8, X, INCX + fabs f7, f11 + bdnz LL(22) + .align 4 + +LL(23): + FADD f0, f0, f4 + nop + fabs f4, f8 + LFDUX f9, X, INCX + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFDUX f10, X, INCX + + FADD f2, f2, f6 + nop + fabs f6, f10 + LFDUX f11, X, INCX + + FADD f3, f3, f7 + nop + fabs f7, f11 + LFDUX f8, X, INCX + + FADD f0, f0, f4 + nop + fabs f4, f8 + LFDUX f9, X, INCX + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFDUX f10, X, INCX + + FADD f2, f2, f6 + nop + fabs f6, f10 + LFDUX f11, X, INCX + + FADD f3, f3, f7 + nop + fabs f7, f11 + LFDUX f8, X, INCX + + FADD f0, f0, f4 + nop + fabs f4, f8 + LFDUX f9, X, INCX + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFDUX f10, X, INCX + + FADD f2, f2, f6 + nop + fabs f6, f10 + LFDUX f11, X, INCX + + 
FADD f3, f3, f7 + fabs f7, f11 + + FADD f0, f0, f4 + FADD f1, f1, f5 + FADD f2, f2, f6 + FADD f3, f3, f7 + .align 4 + +LL(25): + andi. r0, N, 15 + beq LL(999) + + andi. r0, N, 8 + beq LL(26) + + LFDUX f8, X, INCX + LFDUX f9, X, INCX + + fabs f4, f8 + LFDUX f10, X, INCX + fabs f5, f9 + LFDUX f11, X, INCX + fabs f6, f10 + LFDUX f8, X, INCX + fabs f7, f11 + + FADD f0, f0, f4 + nop + fabs f4, f8 + LFDUX f9, X, INCX + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFDUX f10, X, INCX + + FADD f2, f2, f6 + fabs f6, f10 + LFDUX f11, X, INCX + + FADD f3, f3, f7 + fabs f7, f11 + + FADD f0, f0, f4 + FADD f1, f1, f5 + FADD f2, f2, f6 + FADD f3, f3, f7 + .align 4 + +LL(26): + andi. r0, N, 4 + beq LL(27) + + LFDUX f8, X, INCX + LFDUX f9, X, INCX + + fabs f4, f8 + LFDUX f10, X, INCX + fabs f5, f9 + LFDUX f11, X, INCX + + fabs f6, f10 + fabs f7, f11 + + FADD f0, f0, f4 + FADD f1, f1, f5 + FADD f2, f2, f6 + FADD f3, f3, f7 + .align 4 + +LL(27): + andi. r0, N, 2 + beq LL(28) + + LFDUX f8, X, INCX + LFDUX f9, X, INCX + + fabs f4, f8 + fabs f5, f9 + + FADD f0, f0, f4 + FADD f1, f1, f5 + .align 4 + +LL(28): + andi. r0, N, 1 + beq LL(999) + + LFDUX f8, X, INCX + fabs f4, f8 + FADD f0, f0, f4 + .align 4 + +LL(999): + FADD f0, f0, f1 + FADD f2, f2, f3 + + FADD f1, f0, f2 + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/asum_hummer.S b/kernel/power/asum_hummer.S new file mode 100644 index 0000000000..9906a44479 --- /dev/null +++ b/kernel/power/asum_hummer.S @@ -0,0 +1,455 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define INCX2 r6 +#define X2 r7 + +#define C1 f1 +#define C2 f0 +#define C3 f2 +#define C4 f3 + +#define A1 f4 +#define A2 f5 +#define A3 f6 +#define A4 f7 +#define A5 f8 +#define A6 f9 +#define A7 f10 +#define A8 f11 + +#define T1 f12 +#define T2 f13 +#define T3 f14 +#define T4 f15 + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + + li r10, 0 + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + lfpdx C1, SP, r10 # Zero clear + + slwi INCX, INCX, BASE_SHIFT + add INCX2, INCX, INCX + + fpmr C2, C1 + fpmr C3, C1 + fpmr C4, C1 + + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, INCX, 0 + ble LL(999) + + cmpwi cr0, INCX, SIZE + bne LL(100) + + andi. r0, X, 2 * SIZE - 1 + beq LL(05) + + LFD C1, 0(X) + addi X, X, 1 * SIZE + addi N, N, -1 + cmpwi cr0, N, 0 + fabs C1, C1 + ble LL(999) + .align 4 + +LL(05): + srawi. r0, N, 4 + sub X, X, INCX2 + mtspr CTR, r0 + beq- LL(15) + + LFPDUX A1, X, INCX2 + fpmr T1, C2 + LFPDUX A2, X, INCX2 + fpmr T2, C2 + LFPDUX A3, X, INCX2 + fpmr T3, C2 + LFPDUX A4, X, INCX2 + fpmr T4, C2 + LFPDUX A5, X, INCX2 + LFPDUX A6, X, INCX2 + LFPDUX A7, X, INCX2 + LFPDUX A8, X, INCX2 + bdz LL(13) + .align 4 + +LL(12): + fpadd C1, C1, T1 + nop + fpabs T1, A1 + LFPDUX A1, X, INCX2 + + fpadd C2, C2, T2 + nop + fpabs T2, A2 + LFPDUX A2, X, INCX2 + + fpadd C3, C3, T3 + nop + fpabs T3, A3 + LFPDUX A3, X, INCX2 + + fpadd C4, C4, T4 + nop + fpabs T4, A4 + LFPDUX A4, X, INCX2 + + fpadd C1, C1, T1 + nop + fpabs T1, A5 + LFPDUX A5, X, INCX2 + + fpadd C2, C2, T2 + nop + fpabs T2, A6 + LFPDUX A6, X, INCX2 + + fpadd C3, C3, T3 + nop + fpabs T3, A7 + LFPDUX A7, X, INCX2 + + fpadd C4, C4, T4 + fpabs T4, A8 + LFPDUX A8, X, INCX2 + bdnz LL(12) + .align 4 + +LL(13): + fpadd C1, C1, T1 + fpabs T1, A1 + fpadd C2, C2, T2 + fpabs T2, A2 + fpadd C3, C3, T3 + fpabs T3, A3 + fpadd C4, C4, T4 + fpabs T4, A4 + + fpadd C1, C1, T1 + fpabs T1, A5 + fpadd C2, C2, T2 + fpabs T2, A6 + fpadd C3, C3, T3 + fpabs T3, A7 + fpadd C4, C4, T4 + fpabs T4, A8 + + fpadd C1, C1, T1 + fpadd C2, C2, T2 + fpadd C3, C3, T3 + fpadd C4, C4, T4 + .align 4 + +LL(15): + andi. r0, N, 15 + beq LL(999) + andi. r0, N, 8 + beq LL(16) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + fpabs T1, A1 + fpabs T2, A2 + fpabs T3, A3 + fpabs T4, A4 + + fpadd C1, C1, T1 + fpadd C2, C2, T2 + fpadd C3, C3, T3 + fpadd C4, C4, T4 + .align 4 + +LL(16): + andi. r0, N, 4 + beq LL(17) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + fpabs T1, A1 + fpabs T2, A2 + + fpadd C1, C1, T1 + fpadd C2, C2, T2 + .align 4 + +LL(17): + andi. r0, N, 2 + beq LL(18) + + LFPDUX A1, X, INCX2 + fpabs T1, A1 + fpadd C1, C1, T1 + .align 4 + +LL(18): + andi. r0, N, 1 + beq LL(999) + + LFDX A1, X, INCX2 + fabs T1, A1 + fadd C1, C1, T1 + b LL(999) + .align 4 + +LL(100): + sub X2, X, INCX + sub X, X, INCX2 + + srawi. 
r0, N, 4 + mtspr CTR, r0 + beq- LL(115) + + + LFDUX A1, X, INCX2 + fpmr T1, C2 + LFDUX A2, X, INCX2 + fpmr T2, C2 + LFDUX A3, X, INCX2 + fpmr T3, C2 + LFDUX A4, X, INCX2 + fpmr T4, C2 + + LFDUX A5, X, INCX2 + LFSDUX A1, X2, INCX2 + + LFDUX A6, X, INCX2 + LFSDUX A2, X2, INCX2 + + LFDUX A7, X, INCX2 + LFSDUX A3, X2, INCX2 + + LFDUX A8, X, INCX2 + LFSDUX A4, X2, INCX2 + bdz LL(113) + .align 4 + +LL(112): + fpadd C1, C1, T1 + LFSDUX A5, X2, INCX2 + fpabs T1, A1 + LFDUX A1, X, INCX2 + + fpadd C2, C2, T2 + LFSDUX A6, X2, INCX2 + fpabs T2, A2 + LFDUX A2, X, INCX2 + + fpadd C3, C3, T3 + LFSDUX A7, X2, INCX2 + fpabs T3, A3 + LFDUX A3, X, INCX2 + + fpadd C4, C4, T4 + LFSDUX A8, X2, INCX2 + fpabs T4, A4 + LFDUX A4, X, INCX2 + + fpadd C1, C1, T1 + LFSDUX A1, X2, INCX2 + fpabs T1, A5 + LFDUX A5, X, INCX2 + fpadd C2, C2, T2 + LFSDUX A2, X2, INCX2 + fpabs T2, A6 + LFDUX A6, X, INCX2 + + fpadd C3, C3, T3 + LFSDUX A3, X2, INCX2 + fpabs T3, A7 + LFDUX A7, X, INCX2 + fpadd C4, C4, T4 + LFSDUX A4, X2, INCX2 + fpabs T4, A8 + LFDUX A8, X, INCX2 + + bdnz LL(112) + .align 4 + +LL(113): + fpadd C1, C1, T1 + nop + fpabs T1, A1 + LFSDUX A5, X2, INCX2 + fpadd C2, C2, T2 + nop + fpabs T2, A2 + LFSDUX A6, X2, INCX2 + fpadd C3, C3, T3 + + nop + fpabs T3, A3 + LFSDUX A7, X2, INCX2 + fpadd C4, C4, T4 + nop + fpabs T4, A4 + LFSDUX A8, X2, INCX2 + + fpadd C1, C1, T1 + fpabs T1, A5 + fpadd C2, C2, T2 + fpabs T2, A6 + fpadd C3, C3, T3 + fpabs T3, A7 + fpadd C4, C4, T4 + fpabs T4, A8 + + fpadd C1, C1, T1 + fpadd C2, C2, T2 + fpadd C3, C3, T3 + fpadd C4, C4, T4 + .align 4 + +LL(115): + andi. r0, N, 15 + beq LL(999) + andi. r0, N, 8 + beq LL(116) + + LFDUX A1, X, INCX2 + LFDUX A2, X2, INCX2 + LFDUX A3, X, INCX2 + LFDUX A4, X2, INCX2 + + fabs T1, A1 + LFDUX A5, X, INCX2 + fabs T2, A2 + LFDUX A6, X2, INCX2 + fabs T3, A3 + LFDUX A7, X, INCX2 + fabs T4, A4 + LFDUX A8, X2, INCX2 + + fadd C1, C1, T1 + fabs T1, A5 + fadd C2, C2, T2 + fabs T2, A6 + + fadd C3, C3, T3 + fabs T3, A7 + fadd C4, C4, T4 + fabs T4, A8 + + fadd C1, C1, T1 + fadd C2, C2, T2 + fadd C3, C3, T3 + fadd C4, C4, T4 + .align 4 + +LL(116): + andi. r0, N, 4 + beq LL(117) + + LFDUX A1, X, INCX2 + LFDUX A2, X2, INCX2 + LFDUX A3, X, INCX2 + LFDUX A4, X2, INCX2 + + fabs T1, A1 + fabs T2, A2 + fabs T3, A3 + fabs T4, A4 + + fadd C1, C1, T1 + fadd C2, C2, T2 + fadd C3, C3, T3 + fadd C4, C4, T4 + .align 4 + +LL(117): + andi. r0, N, 2 + beq LL(118) + + LFDUX A1, X, INCX2 + LFDUX A2, X2, INCX2 + + fabs T1, A1 + fabs T2, A2 + fadd C1, C1, T1 + fadd C2, C2, T2 + .align 4 + +LL(118): + andi. r0, N, 1 + beq LL(999) + + LFDX A1, X, INCX2 + fabs T1, A1 + fadd C1, C1, T1 + .align 4 + +LL(999): + fpadd C1, C1, C2 + li r10, 16 + fpadd C3, C3, C4 + fpadd C1, C1, C3 + lfpdux f15, SP, r10 + fsmtp C2, C1 + lfpdux f14, SP, r10 + addi SP, SP, 16 + fadd C1, C2, C1 + blr + + EPILOGUE diff --git a/kernel/power/asum_ppc440.S b/kernel/power/asum_ppc440.S new file mode 100644 index 0000000000..c6ad0f0668 --- /dev/null +++ b/kernel/power/asum_ppc440.S @@ -0,0 +1,313 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 +#define PREX r6 + +#define ATTR r7 + +#define FZERO f0 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, BASE_SHIFT + fmr f1, FZERO + li PREX, 3 * 16 * SIZE + fmr f2, FZERO + sub X, X, INCX + fmr f3, FZERO + fmr f4, FZERO + fmr f5, FZERO + fmr f6, FZERO + cmpwi cr0, N, 0 + fmr f7, FZERO + ble- LL(999) + + cmpwi cr0, INCX, 0 + ble- LL(999) + + srawi. 
r0, N, 4 + mtspr CTR, r0 + beq- LL(150) + + LFDUX f8, X, INCX + LFDUX f9, X, INCX + LFDUX f10, X, INCX + LFDUX f11, X, INCX + LFDUX f12, X, INCX + LFDUX f13, X, INCX + LFDUX f14, X, INCX + LFDUX f15, X, INCX + fabs f16, f8 + + LFDUX f24, X, INCX + fabs f17, f9 + LFDUX f25, X, INCX + fabs f18, f10 + LFDUX f26, X, INCX + fabs f19, f11 + LFDUX f27, X, INCX + fabs f20, f12 + LFDUX f28, X, INCX + fabs f21, f13 + LFDUX f29, X, INCX + fabs f22, f14 + LFDUX f30, X, INCX + fabs f23, f15 + LFDUX f31, X, INCX + bdz LL(120) + .align 4 + +LL(110): + LFDUX f8, X, INCX + FADD f0, f0, f16 +#ifdef PPCG4 + dcbt X, PREX +#else + nop +#endif + fabs f16, f24 + + LFDUX f9, X, INCX + FADD f1, f1, f17 + nop + fabs f17, f25 + + LFDUX f10, X, INCX + FADD f2, f2, f18 + nop + fabs f18, f26 + LFDUX f11, X, INCX + FADD f3, f3, f19 + nop + fabs f19, f27 + + LFDUX f12, X, INCX + FADD f4, f4, f20 +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PREX +#else + nop +#endif + fabs f20, f28 + + LFDUX f13, X, INCX + FADD f5, f5, f21 + nop + fabs f21, f29 + + LFDUX f14, X, INCX + FADD f6, f6, f22 + nop + fabs f22, f30 + LFDUX f15, X, INCX + FADD f7, f7, f23 + nop + fabs f23, f31 + + LFDUX f24, X, INCX + FADD f0, f0, f16 +#ifdef PPCG4 + dcbt X, PREX +#else + nop +#endif + fabs f16, f8 + LFDUX f25, X, INCX + FADD f1, f1, f17 + nop + fabs f17, f9 + + LFDUX f26, X, INCX + FADD f2, f2, f18 + nop + fabs f18, f10 + LFDUX f27, X, INCX + FADD f3, f3, f19 + nop + fabs f19, f11 + + LFDUX f28, X, INCX + FADD f4, f4, f20 +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PREX +#else + nop +#endif + fabs f20, f12 + + LFDUX f29, X, INCX + FADD f5, f5, f21 + nop + fabs f21, f13 + + LFDUX f30, X, INCX + FADD f6, f6, f22 + nop + fabs f22, f14 + + LFDUX f31, X, INCX + FADD f7, f7, f23 + fabs f23, f15 + bdnz LL(110) + .align 4 + +LL(120): + FADD f0, f0, f16 + fabs f16, f24 + FADD f1, f1, f17 + fabs f17, f25 + + FADD f2, f2, f18 + fabs f18, f26 + FADD f3, f3, f19 + fabs f19, f27 + + FADD f4, f4, f20 + fabs f20, f28 + FADD f5, f5, f21 + fabs f21, f29 + + FADD f6, f6, f22 + fabs f22, f30 + FADD f7, f7, f23 + fabs f23, f31 + + FADD f0, f0, f16 + FADD f1, f1, f17 + FADD f2, f2, f18 + FADD f3, f3, f19 + + FADD f4, f4, f20 + FADD f5, f5, f21 + FADD f6, f6, f22 + FADD f7, f7, f23 + .align 4 + +LL(150): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + fabs f8, f8 + FADD f0, f0, f8 + bdnz LL(160) + .align 4 + +LL(999): + FADD f0, f0, f1 + FADD f2, f2, f3 + FADD f4, f4, f5 + FADD f6, f6, f7 + + FADD f0, f0, f2 + FADD f4, f4, f6 + FADD f1, f0, f4 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/axpy.S b/kernel/power/axpy.S new file mode 100644 index 0000000000..9f9605f91e --- /dev/null +++ b/kernel/power/axpy.S @@ -0,0 +1,550 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef linux +#ifndef __64BIT__ +#define N r3 +#define X r6 +#define INCX r7 +#define Y r8 +#define INCY r9 +#define PREA r4 +#define YY r5 +#else +#define N r3 +#define X r7 +#define INCX r8 +#define Y r9 +#define INCY r10 +#define PREA r4 +#define YY r5 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define N r3 +#define X r8 +#define INCX r9 +#define Y r10 +#define INCY r4 +#define PREA r5 +#define YY r6 +#else +#define N r3 +#define X r7 +#define INCX r8 +#define Y r9 +#define INCY r10 +#define PREA r4 +#define YY r5 +#endif +#endif + +#define ALPHA f24 + +#ifndef NEEDPARAM + +#define STACKSIZE 96 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + +#if (defined(_AIX) || defined(__APPLE__)) && !defined(__64BIT__) && defined(DOUBLE) + lwz INCY, 56 + STACKSIZE(SP) +#endif + + fmr ALPHA, f1 + slwi INCX, INCX, BASE_SHIFT + slwi INCY, INCY, BASE_SHIFT + +#ifdef L1_DUALFETCH + li PREA, (L1_PREFETCHSIZE) / 2 +#else + li PREA, (L1_PREFETCHSIZE) +#endif + + cmpwi cr0, N, 0 + ble- LL(999) + + cmpwi cr0, INCX, SIZE + bne- cr0, LL(100) + cmpwi cr0, INCY, SIZE + bne- cr0, LL(100) + + srawi. 
r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(50) + .align 4 + + LFD f0, 0 * SIZE(X) + LFD f1, 1 * SIZE(X) + LFD f2, 2 * SIZE(X) + LFD f3, 3 * SIZE(X) + + LFD f8, 0 * SIZE(Y) + LFD f9, 1 * SIZE(Y) + LFD f10, 2 * SIZE(Y) + LFD f11, 3 * SIZE(Y) + + LFD f4, 4 * SIZE(X) + LFD f5, 5 * SIZE(X) + LFD f6, 6 * SIZE(X) + LFD f7, 7 * SIZE(X) + + LFD f12, 4 * SIZE(Y) + LFD f13, 5 * SIZE(Y) + LFD f14, 6 * SIZE(Y) + LFD f15, 7 * SIZE(Y) + bdz LL(20) + .align 4 + +LL(10): + FMADD f16, ALPHA, f0, f8 + FMADD f17, ALPHA, f1, f9 + FMADD f18, ALPHA, f2, f10 + FMADD f19, ALPHA, f3, f11 + + LFD f0, 8 * SIZE(X) + LFD f1, 9 * SIZE(X) + LFD f2, 10 * SIZE(X) + LFD f3, 11 * SIZE(X) + + LFD f8, 8 * SIZE(Y) + LFD f9, 9 * SIZE(Y) + LFD f10, 10 * SIZE(Y) + LFD f11, 11 * SIZE(Y) + + STFD f16, 0 * SIZE(Y) + STFD f17, 1 * SIZE(Y) + STFD f18, 2 * SIZE(Y) + STFD f19, 3 * SIZE(Y) + + FMADD f20, ALPHA, f4, f12 + FMADD f21, ALPHA, f5, f13 + FMADD f22, ALPHA, f6, f14 + FMADD f23, ALPHA, f7, f15 + + LFD f4, 12 * SIZE(X) + LFD f5, 13 * SIZE(X) + LFD f6, 14 * SIZE(X) + LFD f7, 15 * SIZE(X) + + LFD f12, 12 * SIZE(Y) + LFD f13, 13 * SIZE(Y) + LFD f14, 14 * SIZE(Y) + LFD f15, 15 * SIZE(Y) + + STFD f20, 4 * SIZE(Y) + STFD f21, 5 * SIZE(Y) + STFD f22, 6 * SIZE(Y) + STFD f23, 7 * SIZE(Y) + + FMADD f16, ALPHA, f0, f8 + FMADD f17, ALPHA, f1, f9 + FMADD f18, ALPHA, f2, f10 + FMADD f19, ALPHA, f3, f11 + + LFD f0, 16 * SIZE(X) + LFD f1, 17 * SIZE(X) + LFD f2, 18 * SIZE(X) + LFD f3, 19 * SIZE(X) + + LFD f8, 16 * SIZE(Y) + LFD f9, 17 * SIZE(Y) + LFD f10, 18 * SIZE(Y) + LFD f11, 19 * SIZE(Y) + + STFD f16, 8 * SIZE(Y) + STFD f17, 9 * SIZE(Y) + STFD f18, 10 * SIZE(Y) + STFD f19, 11 * SIZE(Y) + + FMADD f20, ALPHA, f4, f12 + FMADD f21, ALPHA, f5, f13 + FMADD f22, ALPHA, f6, f14 + FMADD f23, ALPHA, f7, f15 + + LFD f4, 20 * SIZE(X) + LFD f5, 21 * SIZE(X) + LFD f6, 22 * SIZE(X) + LFD f7, 23 * SIZE(X) + + LFD f12, 20 * SIZE(Y) + LFD f13, 21 * SIZE(Y) + LFD f14, 22 * SIZE(Y) + LFD f15, 23 * SIZE(Y) + + STFD f20, 12 * SIZE(Y) + STFD f21, 13 * SIZE(Y) + STFD f22, 14 * SIZE(Y) + STFD f23, 15 * SIZE(Y) + +#ifndef POWER6 + dcbtst Y, PREA +#ifdef L1_DUALFETCH + dcbt X, PREA +#endif +#endif + addi X, X, 16 * SIZE + addi Y, Y, 16 * SIZE + +#ifdef POWER6 + dcbtst Y, PREA + L1_PREFETCH X, PREA +#endif + + bdnz LL(10) + .align 4 + +LL(20): + FMADD f16, ALPHA, f0, f8 + FMADD f17, ALPHA, f1, f9 + FMADD f18, ALPHA, f2, f10 + FMADD f19, ALPHA, f3, f11 + + LFD f0, 8 * SIZE(X) + LFD f1, 9 * SIZE(X) + LFD f2, 10 * SIZE(X) + LFD f3, 11 * SIZE(X) + + LFD f8, 8 * SIZE(Y) + LFD f9, 9 * SIZE(Y) + LFD f10, 10 * SIZE(Y) + LFD f11, 11 * SIZE(Y) + + FMADD f20, ALPHA, f4, f12 + FMADD f21, ALPHA, f5, f13 + FMADD f22, ALPHA, f6, f14 + FMADD f23, ALPHA, f7, f15 + + LFD f4, 12 * SIZE(X) + LFD f5, 13 * SIZE(X) + LFD f6, 14 * SIZE(X) + LFD f7, 15 * SIZE(X) + + LFD f12, 12 * SIZE(Y) + LFD f13, 13 * SIZE(Y) + LFD f14, 14 * SIZE(Y) + LFD f15, 15 * SIZE(Y) + + STFD f16, 0 * SIZE(Y) + STFD f17, 1 * SIZE(Y) + STFD f18, 2 * SIZE(Y) + STFD f19, 3 * SIZE(Y) + + FMADD f16, ALPHA, f0, f8 + FMADD f17, ALPHA, f1, f9 + FMADD f18, ALPHA, f2, f10 + FMADD f19, ALPHA, f3, f11 + + STFD f20, 4 * SIZE(Y) + STFD f21, 5 * SIZE(Y) + STFD f22, 6 * SIZE(Y) + STFD f23, 7 * SIZE(Y) + + FMADD f20, ALPHA, f4, f12 + FMADD f21, ALPHA, f5, f13 + FMADD f22, ALPHA, f6, f14 + FMADD f23, ALPHA, f7, f15 + + STFD f16, 8 * SIZE(Y) + STFD f17, 9 * SIZE(Y) + STFD f18, 10 * SIZE(Y) + STFD f19, 11 * SIZE(Y) + + STFD f20, 12 * SIZE(Y) + STFD f21, 13 * SIZE(Y) + STFD f22, 14 * SIZE(Y) + STFD f23, 15 * SIZE(Y) + + addi X, X, 16 * SIZE + 
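
[Editorial note, not part of the imported sources] The recurring FMADD f16, ALPHA, f0, f8 pattern in kernel/power/axpy.S computes y := alpha*x + y. A minimal C sketch of the operation, covering both the unit-stride path here and the strided LL(100) path further down; the function name and the absence of unrolling are illustrative:

#include <stddef.h>

/* illustrative reference for the axpy kernels: y := alpha*x + y */
void axpy_ref(size_t n, double alpha,
              const double *x, size_t incx,
              double *y, size_t incy)
{
    /* one fused multiply-add per element; the kernel unrolls this by 16
       and keeps its loads running ahead of the stores */
    for (size_t i = 0; i < n; i++)
        y[i * incy] += alpha * x[i * incx];
}

In the strided path the kernel appears to keep a separate YY store pointer precisely so the loads from Y can stay ahead of the stores; the scalar sketch has no such distinction.
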
addi Y, Y, 16 * SIZE + .align 4 + +LL(50): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f0, 0 * SIZE(X) + LFD f8, 0 * SIZE(Y) + + FMADD f16, ALPHA, f0, f8 + + STFD f16, 0 * SIZE(Y) + addi X, X, 1 * SIZE + addi Y, Y, 1 * SIZE + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): + sub X, X, INCX + sub Y, Y, INCY + mr YY, Y + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(150) + .align 4 + + LFDUX f0, X, INCX + LFDUX f1, X, INCX + LFDUX f2, X, INCX + LFDUX f3, X, INCX + + LFDUX f8, Y, INCY + LFDUX f9, Y, INCY + LFDUX f10, Y, INCY + LFDUX f11, Y, INCY + + LFDUX f4, X, INCX + LFDUX f5, X, INCX + LFDUX f6, X, INCX + LFDUX f7, X, INCX + + LFDUX f12, Y, INCY + LFDUX f13, Y, INCY + LFDUX f14, Y, INCY + LFDUX f15, Y, INCY + bdz LL(120) + .align 4 + +LL(110): + FMADD f16, ALPHA, f0, f8 + FMADD f17, ALPHA, f1, f9 + FMADD f18, ALPHA, f2, f10 + FMADD f19, ALPHA, f3, f11 + + LFDUX f0, X, INCX + LFDUX f1, X, INCX + LFDUX f2, X, INCX + LFDUX f3, X, INCX + + LFDUX f8, Y, INCY + LFDUX f9, Y, INCY + LFDUX f10, Y, INCY + LFDUX f11, Y, INCY + + FMADD f20, ALPHA, f4, f12 + FMADD f21, ALPHA, f5, f13 + FMADD f22, ALPHA, f6, f14 + FMADD f23, ALPHA, f7, f15 + + LFDUX f4, X, INCX + LFDUX f5, X, INCX + LFDUX f6, X, INCX + LFDUX f7, X, INCX + + LFDUX f12, Y, INCY + LFDUX f13, Y, INCY + LFDUX f14, Y, INCY + LFDUX f15, Y, INCY + + STFDUX f16, YY, INCY + STFDUX f17, YY, INCY + STFDUX f18, YY, INCY + STFDUX f19, YY, INCY + + FMADD f16, ALPHA, f0, f8 + FMADD f17, ALPHA, f1, f9 + FMADD f18, ALPHA, f2, f10 + FMADD f19, ALPHA, f3, f11 + + LFDUX f0, X, INCX + LFDUX f1, X, INCX + LFDUX f2, X, INCX + LFDUX f3, X, INCX + + LFDUX f8, Y, INCY + LFDUX f9, Y, INCY + LFDUX f10, Y, INCY + LFDUX f11, Y, INCY + + STFDUX f20, YY, INCY + STFDUX f21, YY, INCY + STFDUX f22, YY, INCY + STFDUX f23, YY, INCY + + FMADD f20, ALPHA, f4, f12 + FMADD f21, ALPHA, f5, f13 + FMADD f22, ALPHA, f6, f14 + FMADD f23, ALPHA, f7, f15 + + LFDUX f4, X, INCX + LFDUX f5, X, INCX + LFDUX f6, X, INCX + LFDUX f7, X, INCX + + LFDUX f12, Y, INCY + LFDUX f13, Y, INCY + LFDUX f14, Y, INCY + LFDUX f15, Y, INCY + + STFDUX f16, YY, INCY + STFDUX f17, YY, INCY + STFDUX f18, YY, INCY + STFDUX f19, YY, INCY + + STFDUX f20, YY, INCY + STFDUX f21, YY, INCY + STFDUX f22, YY, INCY + STFDUX f23, YY, INCY + bdnz LL(110) + .align 4 + +LL(120): + FMADD f16, ALPHA, f0, f8 + FMADD f17, ALPHA, f1, f9 + FMADD f18, ALPHA, f2, f10 + FMADD f19, ALPHA, f3, f11 + + LFDUX f0, X, INCX + LFDUX f1, X, INCX + LFDUX f2, X, INCX + LFDUX f3, X, INCX + + LFDUX f8, Y, INCY + LFDUX f9, Y, INCY + LFDUX f10, Y, INCY + LFDUX f11, Y, INCY + + FMADD f20, ALPHA, f4, f12 + FMADD f21, ALPHA, f5, f13 + FMADD f22, ALPHA, f6, f14 + FMADD f23, ALPHA, f7, f15 + + LFDUX f4, X, INCX + LFDUX f5, X, INCX + LFDUX f6, X, INCX + LFDUX f7, X, INCX + + LFDUX f12, Y, INCY + LFDUX f13, Y, INCY + LFDUX f14, Y, INCY + LFDUX f15, Y, INCY + + STFDUX f16, YY, INCY + STFDUX f17, YY, INCY + STFDUX f18, YY, INCY + STFDUX f19, YY, INCY + + FMADD f16, ALPHA, f0, f8 + FMADD f17, ALPHA, f1, f9 + FMADD f18, ALPHA, f2, f10 + FMADD f19, ALPHA, f3, f11 + + STFDUX f20, YY, INCY + STFDUX f21, YY, INCY + STFDUX f22, YY, INCY + STFDUX f23, YY, INCY + + FMADD f20, ALPHA, f4, f12 + FMADD f21, ALPHA, f5, f13 + FMADD f22, ALPHA, f6, f14 + FMADD f23, ALPHA, f7, f15 + + STFDUX f16, YY, INCY + STFDUX f17, YY, INCY + STFDUX f18, YY, INCY + STFDUX f19, YY, INCY + + STFDUX f20, YY, INCY + STFDUX f21, YY, INCY + STFDUX f22, YY, INCY + STFDUX f23, YY, INCY + .align 4 + +LL(150): + andi. 
r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f0, X, INCX + LFDUX f8, Y, INCY + + FMADD f16, ALPHA, f0, f8 + + STFDUX f16, YY, INCY + bdnz LL(160) + .align 4 + +LL(999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + addi SP, SP, STACKSIZE + blr + + EPILOGUE +#endif diff --git a/kernel/power/axpy_hummer.S b/kernel/power/axpy_hummer.S new file mode 100644 index 0000000000..372a846f69 --- /dev/null +++ b/kernel/power/axpy_hummer.S @@ -0,0 +1,656 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r6 +#define INCX r7 +#define Y r8 +#define INCY r9 + +#define YY r4 +#define INCX2 r5 +#define INCY2 r10 + +#define ALPHA f1 + +#define A1 f0 +#define A2 f8 +#define A3 f2 +#define A4 f3 +#define A5 f4 +#define A6 f5 +#define A7 f6 +#define A8 f7 +#define A9 f25 + +#define B1 f9 +#define B2 f10 +#define B3 f11 +#define B4 f12 +#define B5 f13 +#define B6 f14 +#define B7 f15 +#define B8 f16 + +#define C1 f17 +#define C2 f18 +#define C3 f19 +#define C4 f20 +#define C5 f21 +#define C6 f22 +#define C7 f23 +#define C8 f24 + + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + stfpdux f16, SP, r10 + stfpdux f17, SP, r10 + + stfpdux f18, SP, r10 + stfpdux f19, SP, r10 + stfpdux f20, SP, r10 + stfpdux f21, SP, r10 + + stfpdux f22, SP, r10 + stfpdux f23, SP, r10 + stfpdux f24, SP, r10 + stfpdux f25, SP, r10 + + fsmfp ALPHA, ALPHA + + slwi INCX, INCX, BASE_SHIFT + slwi INCY, INCY, BASE_SHIFT + + add INCX2, INCX, INCX + add INCY2, INCY, INCY + + cmpwi cr0, N, 0 + ble LL(999) + + cmpwi cr0, INCX, SIZE + bne LL(100) + cmpwi cr0, INCY, SIZE + bne LL(100) + + andi. r0, Y, 2 * SIZE - 1 + beq LL(05) + + LFD A1, 0 * SIZE(X) + LFD B1, 0 * SIZE(Y) + + addi X, X, SIZE + addi Y, Y, SIZE + + fmadd C1, ALPHA, A1, B1 + addi N, N, -1 + STFD C1, -1 * SIZE(Y) + +LL(05): + andi. r0, X, 2 * SIZE - 1 + bne LL(20) + + sub X, X, INCX2 + sub Y, Y, INCY2 + mr YY, Y + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(15) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + LFPDUX A2, X, INCX2 + LFPDUX B2, Y, INCY2 + LFPDUX A3, X, INCX2 + LFPDUX B3, Y, INCY2 + LFPDUX A4, X, INCX2 + LFPDUX B4, Y, INCY2 + + LFPDUX A5, X, INCX2 + LFPDUX B5, Y, INCY2 + LFPDUX A6, X, INCX2 + LFPDUX B6, Y, INCY2 + LFPDUX A7, X, INCX2 + LFPDUX B7, Y, INCY2 + LFPDUX A8, X, INCX2 + LFPDUX B8, Y, INCY2 + bdz LL(13) + .align 4 + +LL(12): + fpmadd C1, ALPHA, A1, B1 + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + fpmadd C2, ALPHA, A2, B2 + LFPDUX A2, X, INCX2 + LFPDUX B2, Y, INCY2 + + fpmadd C3, ALPHA, A3, B3 + LFPDUX A3, X, INCX2 + LFPDUX B3, Y, INCY2 + fpmadd C4, ALPHA, A4, B4 + LFPDUX A4, X, INCX2 + LFPDUX B4, Y, INCY2 + + fpmadd C5, ALPHA, A5, B5 + LFPDUX A5, X, INCX2 + LFPDUX B5, Y, INCY2 + fpmadd C6, ALPHA, A6, B6 + LFPDUX A6, X, INCX2 + LFPDUX B6, Y, INCY2 + + fpmadd C7, ALPHA, A7, B7 + LFPDUX A7, X, INCX2 + LFPDUX B7, Y, INCY2 + fpmadd C8, ALPHA, A8, B8 + LFPDUX A8, X, INCX2 + LFPDUX B8, Y, INCY2 + + STFPDUX C1, YY, INCY2 + STFPDUX C2, YY, INCY2 + STFPDUX C3, YY, INCY2 + STFPDUX C4, YY, INCY2 + + STFPDUX C5, YY, INCY2 + STFPDUX C6, YY, INCY2 + STFPDUX C7, YY, INCY2 + STFPDUX C8, YY, INCY2 + bdnz LL(12) + .align 4 + +LL(13): + fpmadd C1, ALPHA, A1, B1 + fpmadd C2, ALPHA, A2, B2 + fpmadd C3, ALPHA, A3, B3 + fpmadd C4, ALPHA, A4, B4 + + fpmadd C5, ALPHA, A5, B5 + fpmadd C6, ALPHA, A6, B6 + STFPDUX C1, YY, INCY2 + fpmadd C7, ALPHA, A7, B7 + STFPDUX C2, YY, INCY2 + fpmadd C8, ALPHA, A8, B8 + STFPDUX C3, YY, INCY2 + STFPDUX C4, YY, INCY2 + + STFPDUX C5, YY, INCY2 + STFPDUX C6, YY, INCY2 + STFPDUX C7, YY, INCY2 + STFPDUX C8, YY, INCY2 + .align 4 + +LL(15): + andi. r0, N, 15 + beq LL(999) + + andi. 
r0, N, 8 + beq LL(16) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + LFPDUX A2, X, INCX2 + LFPDUX B2, Y, INCY2 + LFPDUX A3, X, INCX2 + LFPDUX B3, Y, INCY2 + LFPDUX A4, X, INCX2 + LFPDUX B4, Y, INCY2 + + fpmadd C1, ALPHA, A1, B1 + fpmadd C2, ALPHA, A2, B2 + fpmadd C3, ALPHA, A3, B3 + fpmadd C4, ALPHA, A4, B4 + + STFPDUX C1, YY, INCY2 + STFPDUX C2, YY, INCY2 + STFPDUX C3, YY, INCY2 + STFPDUX C4, YY, INCY2 + .align 4 + +LL(16): + andi. r0, N, 4 + beq LL(17) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + LFPDUX A2, X, INCX2 + LFPDUX B2, Y, INCY2 + + fpmadd C1, ALPHA, A1, B1 + fpmadd C2, ALPHA, A2, B2 + + STFPDUX C1, YY, INCY2 + STFPDUX C2, YY, INCY2 + .align 4 + +LL(17): + andi. r0, N, 2 + beq LL(18) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + + fpmadd C1, ALPHA, A1, B1 + + STFPDUX C1, YY, INCY2 + .align 4 + +LL(18): + andi. r0, N, 1 + beq LL(999) + + LFDUX A1, X, INCX2 + LFDUX B1, Y, INCY2 + + fmadd C1, ALPHA, A1, B1 + STFDUX C1, YY, INCY2 + b LL(999) + .align 4 + +/* X is unaliged */ + +LL(20): + LFD A1, 0 * SIZE(X) + addi X, X, SIZE + sub X, X, INCX2 + sub Y, Y, INCY2 + mr YY, Y + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(25) + + LFXDUX A2, X, INCX2 + LFPDUX B1, Y, INCY2 + LFXDUX A3, X, INCX2 + LFPDUX B2, Y, INCY2 + LFXDUX A4, X, INCX2 + LFPDUX B3, Y, INCY2 + LFXDUX A5, X, INCX2 + LFPDUX B4, Y, INCY2 + + LFXDUX A6, X, INCX2 + LFPDUX B5, Y, INCY2 + LFXDUX A7, X, INCX2 + LFPDUX B6, Y, INCY2 + fsmr A1, A2 + LFXDUX A8, X, INCX2 + fsmr A2, A3 + LFPDUX B7, Y, INCY2 + fsmr A3, A4 + LFXDUX A9, X, INCX2 + fsmr A4, A5 + LFPDUX B8, Y, INCY2 + bdz LL(23) + .align 4 + +LL(22): + fpmadd C1, ALPHA, A1, B1 + fsmr A5, A6 + LFPDUX B1, Y, INCY2 + fpmadd C2, ALPHA, A2, B2 + LFXDUX A2, X, INCX2 + fsmr A6, A7 + LFPDUX B2, Y, INCY2 + fpmadd C3, ALPHA, A3, B3 + LFXDUX A3, X, INCX2 + fsmr A7, A8 + LFPDUX B3, Y, INCY2 + fpmadd C4, ALPHA, A4, B4 + LFXDUX A4, X, INCX2 + fsmr A8, A9 + LFPDUX B4, Y, INCY2 + + fpmadd C5, ALPHA, A5, B5 + LFXDUX A5, X, INCX2 + LFPDUX B5, Y, INCY2 + fpmadd C6, ALPHA, A6, B6 + LFXDUX A6, X, INCX2 + LFPDUX B6, Y, INCY2 + + fpmadd C7, ALPHA, A7, B7 + LFXDUX A7, X, INCX2 + LFPDUX B7, Y, INCY2 + fpmadd C8, ALPHA, A8, B8 + LFXDUX A8, X, INCX2 + LFPDUX B8, Y, INCY2 + + fpmr A1, A9 + LFXDUX A9, X, INCX2 + + STFPDUX C1, YY, INCY2 + STFPDUX C2, YY, INCY2 + STFPDUX C3, YY, INCY2 + STFPDUX C4, YY, INCY2 + fsmr A1, A2 + + STFPDUX C5, YY, INCY2 + fsmr A2, A3 + STFPDUX C6, YY, INCY2 + fsmr A3, A4 + STFPDUX C7, YY, INCY2 + fsmr A4, A5 + STFPDUX C8, YY, INCY2 + bdnz LL(22) + .align 4 + +LL(23): + fpmadd C1, ALPHA, A1, B1 + fsmr A5, A6 + fpmadd C2, ALPHA, A2, B2 + fsmr A6, A7 + fpmadd C3, ALPHA, A3, B3 + fsmr A7, A8 + fpmadd C4, ALPHA, A4, B4 + fsmr A8, A9 + + fpmadd C5, ALPHA, A5, B5 + fpmadd C6, ALPHA, A6, B6 + fpmadd C7, ALPHA, A7, B7 + fpmadd C8, ALPHA, A8, B8 + fpmr A1, A9 + + STFPDUX C1, YY, INCY2 + STFPDUX C2, YY, INCY2 + STFPDUX C3, YY, INCY2 + STFPDUX C4, YY, INCY2 + + STFPDUX C5, YY, INCY2 + STFPDUX C6, YY, INCY2 + STFPDUX C7, YY, INCY2 + STFPDUX C8, YY, INCY2 + .align 4 + +LL(25): + andi. r0, N, 15 + beq LL(999) + + andi. 
r0, N, 8 + beq LL(26) + + LFXDUX A2, X, INCX2 + LFPDUX B1, Y, INCY2 + LFXDUX A3, X, INCX2 + LFPDUX B2, Y, INCY2 + LFXDUX A4, X, INCX2 + LFPDUX B3, Y, INCY2 + LFXDUX A5, X, INCX2 + LFPDUX B4, Y, INCY2 + + fsmr A1, A2 + fsmr A2, A3 + fsmr A3, A4 + fsmr A4, A5 + + fpmadd C1, ALPHA, A1, B1 + fpmadd C2, ALPHA, A2, B2 + fpmadd C3, ALPHA, A3, B3 + fpmadd C4, ALPHA, A4, B4 + fpmr A1, A5 + + STFPDUX C1, YY, INCY2 + STFPDUX C2, YY, INCY2 + STFPDUX C3, YY, INCY2 + STFPDUX C4, YY, INCY2 + .align 4 + +LL(26): + andi. r0, N, 4 + beq LL(27) + + LFXDUX A2, X, INCX2 + LFPDUX B1, Y, INCY2 + LFXDUX A3, X, INCX2 + LFPDUX B2, Y, INCY2 + + fsmr A1, A2 + fsmr A2, A3 + fpmadd C1, ALPHA, A1, B1 + fpmadd C2, ALPHA, A2, B2 + fpmr A1, A3 + + STFPDUX C1, YY, INCY2 + STFPDUX C2, YY, INCY2 + .align 4 + +LL(27): + andi. r0, N, 2 + beq LL(28) + + LFXDUX A2, X, INCX2 + LFPDUX B1, Y, INCY2 + + fsmr A1, A2 + fpmadd C1, ALPHA, A1, B1 + fpmr A1, A2 + + STFPDUX C1, YY, INCY2 + .align 4 + +LL(28): + andi. r0, N, 1 + beq LL(999) + + LFDUX B1, Y, INCY2 + + fmadd C1, ALPHA, A1, B1 + STFDUX C1, YY, INCY2 + b LL(999) + .align 4 +#### + + +LL(100): + sub X, X, INCX + sub Y, Y, INCY + mr YY, Y + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(115) + + LFDUX A1, X, INCX + LFDUX B1, Y, INCY + LFDUX A2, X, INCX + LFDUX B2, Y, INCY + + LFDUX A3, X, INCX + LFDUX B3, Y, INCY + LFDUX A4, X, INCX + LFDUX B4, Y, INCY + + LFDUX A5, X, INCX + LFDUX B5, Y, INCY + LFDUX A6, X, INCX + LFDUX B6, Y, INCY + + LFDUX A7, X, INCX + LFDUX B7, Y, INCY + LFDUX A8, X, INCX + LFDUX B8, Y, INCY + bdz LL(113) + .align 4 + +LL(112): + fmadd C1, ALPHA, A1, B1 + LFDUX A1, X, INCX + LFDUX B1, Y, INCY + + fmadd C2, ALPHA, A2, B2 + LFDUX A2, X, INCX + LFDUX B2, Y, INCY + + fmadd C3, ALPHA, A3, B3 + LFDUX A3, X, INCX + LFDUX B3, Y, INCY + + fmadd C4, ALPHA, A4, B4 + LFDUX A4, X, INCX + LFDUX B4, Y, INCY + + fmadd C5, ALPHA, A5, B5 + LFDUX A5, X, INCX + LFDUX B5, Y, INCY + fmadd C6, ALPHA, A6, B6 + LFDUX A6, X, INCX + LFDUX B6, Y, INCY + fmadd C7, ALPHA, A7, B7 + LFDUX A7, X, INCX + LFDUX B7, Y, INCY + fmadd C8, ALPHA, A8, B8 + LFDUX A8, X, INCX + LFDUX B8, Y, INCY + + STFDUX C1, YY, INCY + STFDUX C2, YY, INCY + STFDUX C3, YY, INCY + STFDUX C4, YY, INCY + + STFDUX C5, YY, INCY + STFDUX C6, YY, INCY + STFDUX C7, YY, INCY + STFDUX C8, YY, INCY + bdnz LL(112) + .align 4 + +LL(113): + fmadd C1, ALPHA, A1, B1 + fmadd C2, ALPHA, A2, B2 + fmadd C3, ALPHA, A3, B3 + fmadd C4, ALPHA, A4, B4 + + fmadd C5, ALPHA, A5, B5 + fmadd C6, ALPHA, A6, B6 + STFDUX C1, YY, INCY + fmadd C7, ALPHA, A7, B7 + STFDUX C2, YY, INCY + fmadd C8, ALPHA, A8, B8 + STFDUX C3, YY, INCY + + STFDUX C4, YY, INCY + STFDUX C5, YY, INCY + STFDUX C6, YY, INCY + STFDUX C7, YY, INCY + STFDUX C8, YY, INCY + .align 4 + +LL(115): + andi. r0, N, 7 + beq LL(999) + andi. r0, N, 4 + beq LL(117) + + LFDUX A1, X, INCX + LFDUX B1, Y, INCY + LFDUX A2, X, INCX + LFDUX B2, Y, INCY + + LFDUX A3, X, INCX + LFDUX B3, Y, INCY + LFDUX A4, X, INCX + LFDUX B4, Y, INCY + + fmadd C1, ALPHA, A1, B1 + fmadd C2, ALPHA, A2, B2 + fmadd C3, ALPHA, A3, B3 + fmadd C4, ALPHA, A4, B4 + + STFDUX C1, YY, INCY + STFDUX C2, YY, INCY + STFDUX C3, YY, INCY + STFDUX C4, YY, INCY + .align 4 + +LL(117): + andi. r0, N, 2 + beq LL(118) + + LFDUX A1, X, INCX + LFDUX B1, Y, INCY + LFDUX A2, X, INCX + LFDUX B2, Y, INCY + + fmadd C1, ALPHA, A1, B1 + fmadd C2, ALPHA, A2, B2 + + STFDUX C1, YY, INCY + STFDUX C2, YY, INCY + .align 4 + +LL(118): + andi. 
r0, N, 1 + beq LL(999) + + LFDUX A1, X, INCX + LFDUX B1, Y, INCY + + fmadd C1, ALPHA, A1, B1 + STFDUX C1, YY, INCY + .align 4 + +LL(999): + li r10, 16 + subi SP, SP, 16 + + lfpdux f25, SP, r10 + lfpdux f24, SP, r10 + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + + lfpdux f21, SP, r10 + lfpdux f20, SP, r10 + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + + addi SP, SP, 16 + blr + + EPILOGUE diff --git a/kernel/power/axpy_ppc440.S b/kernel/power/axpy_ppc440.S new file mode 100644 index 0000000000..cc2605cc03 --- /dev/null +++ b/kernel/power/axpy_ppc440.S @@ -0,0 +1,337 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
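
[Editorial note, not part of the imported sources] The axpy_ppc440 kernel that follows, like the asum PPC440 variant earlier, issues dcbt (touch for load) and dcbtst (touch for store) hints a fixed distance ahead of the streaming X and Y accesses when PPCG4 is defined. Roughly the same effect expressed with the GCC/Clang builtin; the prefetch distance and the function name below are illustrative and are not the values the kernel uses:

#include <stddef.h>

/* illustrative sketch of software prefetching in a streaming axpy loop */
void axpy_prefetch_sketch(size_t n, double alpha, const double *x, double *y)
{
    const size_t dist = 32;                       /* illustrative prefetch distance */

    for (size_t i = 0; i < n; i++) {
        if (i + dist < n) {
            __builtin_prefetch(&x[i + dist], 0);  /* analogous to dcbt X, PRE (read)    */
            __builtin_prefetch(&y[i + dist], 1);  /* analogous to dcbtst Y, PRE (write) */
        }
        y[i] += alpha * x[i];
    }
}
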
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef linux +#ifndef __64BIT__ +#define N r3 +#define X r6 +#define INCX r7 +#define Y r8 +#define INCY r9 +#define YY r5 +#define PRE r4 +#else +#define N r3 +#define X r7 +#define INCX r8 +#define Y r9 +#define INCY r10 +#define YY r5 +#define PRE r4 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define N r3 +#define X r8 +#define INCX r9 +#define Y r10 +#define INCY r4 +#define YY r6 +#define PRE r5 +#else +#define N r3 +#define X r7 +#define INCX r8 +#define Y r9 +#define INCY r10 +#define YY r5 +#define PRE r4 +#endif +#endif + +#define ALPHA f24 + +#define STACKSIZE 96 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + +#if (defined(_AIX) || defined(__APPLE__)) && !defined(__64BIT__) && defined(DOUBLE) + lwz INCY, 56 + STACKSIZE(SP) +#endif + + fmr ALPHA, f1 + slwi INCX, INCX, BASE_SHIFT + slwi INCY, INCY, BASE_SHIFT + li PRE, 2 * 16 * SIZE + + cmpwi cr0, N, 0 + ble- LL(999) + + sub X, X, INCX + sub Y, Y, INCY + mr YY, Y + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(150) + .align 4 + + LFDUX f0, X, INCX + LFDUX f1, X, INCX + LFDUX f2, X, INCX + LFDUX f3, X, INCX + + LFDUX f8, Y, INCY + LFDUX f9, Y, INCY + LFDUX f10, Y, INCY + LFDUX f11, Y, INCY + + LFDUX f4, X, INCX + LFDUX f5, X, INCX + LFDUX f6, X, INCX + LFDUX f7, X, INCX + + LFDUX f12, Y, INCY + LFDUX f13, Y, INCY + LFDUX f14, Y, INCY + LFDUX f15, Y, INCY + bdz LL(120) + .align 4 + +LL(110): + FMADD f16, ALPHA, f0, f8 + LFDUX f0, X, INCX + LFDUX f8, Y, INCY +#ifdef PPCG4 + dcbt X, PRE +#endif + FMADD f17, ALPHA, f1, f9 + LFDUX f1, X, INCX + LFDUX f9, Y, INCY + FMADD f18, ALPHA, f2, f10 + LFDUX f2, X, INCX + LFDUX f10, Y, INCY +#ifdef PPCG4 + dcbtst Y, PRE +#endif + FMADD f19, ALPHA, f3, f11 + LFDUX f3, X, INCX + LFDUX f11, Y, INCY + + FMADD f20, ALPHA, f4, f12 + LFDUX f4, X, INCX + LFDUX f12, Y, INCY +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PRE +#endif + FMADD f21, ALPHA, f5, f13 + LFDUX f5, X, INCX + LFDUX f13, Y, INCY + FMADD f22, ALPHA, f6, f14 + LFDUX f6, X, INCX + LFDUX f14, Y, INCY +#if defined(PPCG4) && defined(DOUBLE) + dcbtst Y, PRE +#endif + FMADD f23, ALPHA, f7, f15 + LFDUX f7, X, INCX + LFDUX f15, Y, INCY + + STFDUX f16, YY, INCY + STFDUX f17, YY, INCY + STFDUX f18, YY, INCY + STFDUX f19, YY, INCY + + FMADD f16, ALPHA, f0, f8 + LFDUX f0, X, INCX + LFDUX f8, Y, INCY +#ifdef PPCG4 + dcbt X, PRE +#endif + FMADD f17, ALPHA, f1, f9 + LFDUX f1, X, INCX + LFDUX f9, Y, INCY + FMADD f18, ALPHA, f2, f10 + LFDUX f2, X, INCX + LFDUX f10, Y, INCY +#ifdef PPCG4 + dcbtst Y, PRE +#endif + FMADD f19, ALPHA, f3, f11 + LFDUX f3, X, INCX + LFDUX f11, Y, INCY + + STFDUX f20, YY, INCY + STFDUX f21, YY, INCY + STFDUX f22, YY, INCY + STFDUX f23, YY, INCY + + FMADD f20, ALPHA, f4, f12 + LFDUX f4, X, INCX + LFDUX f12, Y, INCY +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PRE +#endif + FMADD f21, ALPHA, f5, f13 + LFDUX f5, X, INCX + LFDUX f13, Y, INCY + FMADD f22, ALPHA, f6, f14 + LFDUX f6, X, INCX + LFDUX f14, Y, INCY +#if defined(PPCG4) && defined(DOUBLE) + dcbtst Y, PRE +#endif + FMADD f23, ALPHA, f7, f15 + LFDUX f7, X, INCX + LFDUX f15, Y, INCY + + STFDUX f16, YY, INCY + STFDUX f17, YY, INCY + STFDUX f18, YY, INCY + STFDUX f19, 
YY, INCY + + STFDUX f20, YY, INCY + STFDUX f21, YY, INCY + STFDUX f22, YY, INCY + STFDUX f23, YY, INCY + bdnz LL(110) + .align 4 + +LL(120): + FMADD f16, ALPHA, f0, f8 + LFDUX f0, X, INCX + LFDUX f8, Y, INCY + FMADD f17, ALPHA, f1, f9 + LFDUX f1, X, INCX + LFDUX f9, Y, INCY + FMADD f18, ALPHA, f2, f10 + LFDUX f2, X, INCX + LFDUX f10, Y, INCY + FMADD f19, ALPHA, f3, f11 + LFDUX f3, X, INCX + LFDUX f11, Y, INCY + + FMADD f20, ALPHA, f4, f12 + LFDUX f4, X, INCX + LFDUX f12, Y, INCY + FMADD f21, ALPHA, f5, f13 + LFDUX f5, X, INCX + LFDUX f13, Y, INCY + FMADD f22, ALPHA, f6, f14 + LFDUX f6, X, INCX + LFDUX f14, Y, INCY + FMADD f23, ALPHA, f7, f15 + LFDUX f7, X, INCX + LFDUX f15, Y, INCY + + STFDUX f16, YY, INCY + STFDUX f17, YY, INCY + STFDUX f18, YY, INCY + STFDUX f19, YY, INCY + + FMADD f16, ALPHA, f0, f8 + STFDUX f20, YY, INCY + FMADD f17, ALPHA, f1, f9 + STFDUX f21, YY, INCY + FMADD f18, ALPHA, f2, f10 + STFDUX f22, YY, INCY + FMADD f19, ALPHA, f3, f11 + STFDUX f23, YY, INCY + + FMADD f20, ALPHA, f4, f12 + STFDUX f16, YY, INCY + FMADD f21, ALPHA, f5, f13 + STFDUX f17, YY, INCY + FMADD f22, ALPHA, f6, f14 + STFDUX f18, YY, INCY + FMADD f23, ALPHA, f7, f15 + STFDUX f19, YY, INCY + + STFDUX f20, YY, INCY + STFDUX f21, YY, INCY + STFDUX f22, YY, INCY + STFDUX f23, YY, INCY + .align 4 + +LL(150): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f0, X, INCX + LFDUX f8, Y, INCY + + FMADD f16, ALPHA, f0, f8 + + STFDUX f16, YY, INCY + bdnz LL(160) + .align 4 + +LL(999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/cabs.S b/kernel/power/cabs.S new file mode 100644 index 0000000000..28ae70318f --- /dev/null +++ b/kernel/power/cabs.S @@ -0,0 +1,54 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + + PROLOGUE + PROFCODE + + LFD f0, 0 * SIZE(r3) + LFD f1, 1 * SIZE(r3) + fabs f0, f0 + fabs f1, f1 + fadd f1, f0, f1 + blr + + EPILOGUE + + diff --git a/kernel/power/cnrm2.S b/kernel/power/cnrm2.S new file mode 100644 index 0000000000..930ea29e27 --- /dev/null +++ b/kernel/power/cnrm2.S @@ -0,0 +1,418 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
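
[Editorial note, not part of the imported sources] The kernel/power/cabs.S routine added a few lines above returns fabs(real) + fabs(imag) in f1, i.e. the cheap one-norm style complex modulus rather than the Euclidean modulus sqrt(re*re + im*im). In C terms (function name illustrative):

#include <math.h>

/* value produced by kernel/power/cabs.S: |Re z| + |Im z| */
double cabs1_ref(const double z[2])   /* z[0] = real part, z[1] = imaginary part */
{
    return fabs(z[0]) + fabs(z[1]);
}
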
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define PREA r8 +#define INCXM1 r9 + +#define FZERO 144(SP) +#define FONE 148(SP) + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r10, 0 + lis r11, 0x3f80 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r10, FZERO + stw r11, FONE + + lfs f1, FZERO + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, ZBASE_SHIFT + subi INCXM1, INCX, SIZE + + li PREA, 4 * 16 * SIZE + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + fmr f0, f1 + fmr f2, f1 + fmr f3, f1 + fmr f4, f1 + fmr f5, f1 + fmr f6, f1 + fmr f7, f1 + fmr f8, f1 + fmr f9, f1 + fmr f10, f1 + fmr f11, f1 + fmr f12, f1 + fmr f13, f1 + fmr f14, f1 + fmr f15, f1 + + cmpwi cr0, INCX, 2 * SIZE + bne- cr0, LL(1000) + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- cr0, LL(150) + + LFD f16, 0 * SIZE(X) + LFD f17, 1 * SIZE(X) + LFD f18, 2 * SIZE(X) + LFD f19, 3 * SIZE(X) + LFD f20, 4 * SIZE(X) + LFD f21, 5 * SIZE(X) + LFD f22, 6 * SIZE(X) + LFD f23, 7 * SIZE(X) + + LFD f24, 8 * SIZE(X) + LFD f25, 9 * SIZE(X) + LFD f26, 10 * SIZE(X) + LFD f27, 11 * SIZE(X) + LFD f28, 12 * SIZE(X) + LFD f29, 13 * SIZE(X) + LFD f30, 14 * SIZE(X) + LFD f31, 15 * SIZE(X) + + bdz LL(120) + .align 4 + +LL(110): + fmadd f0, f16, f16, f0 + fmadd f1, f17, f17, f1 + fmadd f2, f18, f18, f2 + fmadd f3, f19, f19, f3 + + LFD f16, 16 * SIZE(X) + LFD f17, 17 * SIZE(X) + LFD f18, 18 * SIZE(X) + LFD f19, 19 * SIZE(X) + + fmadd f4, f20, f20, f4 + fmadd f5, f21, f21, f5 + fmadd f6, f22, f22, f6 + fmadd f7, f23, f23, f7 + + LFD f20, 20 * SIZE(X) + LFD f21, 21 * SIZE(X) + LFD f22, 22 * SIZE(X) + LFD f23, 23 * SIZE(X) + + fmadd f8, f24, f24, f8 + fmadd f9, f25, f25, f9 + fmadd f10, f26, f26, f10 + fmadd f11, f27, f27, f11 + + LFD f24, 24 * SIZE(X) + LFD f25, 25 * SIZE(X) + LFD f26, 26 * SIZE(X) + LFD f27, 27 * SIZE(X) + + fmadd f12, f28, f28, f12 + fmadd f13, f29, f29, f13 + fmadd f14, f30, f30, f14 + fmadd f15, f31, f31, f15 + + LFD f28, 28 * SIZE(X) + LFD f29, 29 * SIZE(X) + LFD f30, 30 * SIZE(X) + LFD f31, 31 * SIZE(X) + +#ifndef POWER6 + L1_PREFETCH X, PREA +#endif + addi X, X, 16 * SIZE +#ifdef POWER6 + L1_PREFETCH X, PREA +#endif + + bdnz LL(110) + .align 4 + +LL(120): + fmadd f0, f16, f16, f0 + fmadd f1, f17, f17, f1 + fmadd f2, f18, f18, f2 + fmadd f3, f19, f19, f3 + fmadd f4, f20, f20, f4 + fmadd f5, f21, f21, f5 + fmadd f6, f22, f22, f6 + fmadd f7, f23, f23, f7 + fmadd f8, f24, f24, f8 + fmadd f9, f25, f25, f9 + fmadd f10, f26, f26, f10 + fmadd f11, f27, f27, f11 + fmadd f12, f28, f28, f12 + fmadd f13, f29, f29, f13 + fmadd f14, f30, f30, f14 + fmadd f15, f31, f31, f15 + addi X, X, 16 * SIZE + .align 4 + +LL(150): + andi. 
r0, N, 7 + mtspr CTR, r0 + beq- cr0, LL(170) + .align 4 + +LL(160): + LFD f16, 0 * SIZE(X) + LFD f17, 1 * SIZE(X) + addi X, X, 2 * SIZE + fmadd f0, f16, f16, f0 + fmadd f1, f17, f17, f1 + bdnz LL(160) + .align 4 + +LL(170): + fadd f0, f0, f1 + fadd f2, f2, f3 + fadd f4, f4, f5 + fadd f6, f6, f7 + + fadd f8, f8, f9 + fadd f10, f10, f11 + fadd f12, f12, f13 + fadd f14, f14, f15 + + fadd f0, f0, f2 + fadd f4, f4, f6 + fadd f8, f8, f10 + fadd f12, f12, f14 + + fadd f0, f0, f4 + fadd f8, f8, f12 + + fadd f0, f0, f8 + + fsqrt f1, f0 + b LL(9999) + .align 4 + +LL(1000): + sub X, X, INCXM1 + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- cr0, LL(1150) + + LFDX f16, X, INCXM1 + LFDUX f17, X, INCX + LFDX f18, X, INCXM1 + LFDUX f19, X, INCX + LFDX f20, X, INCXM1 + LFDUX f21, X, INCX + LFDX f22, X, INCXM1 + LFDUX f23, X, INCX + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + bdz LL(1120) + .align 4 + +LL(1110): + fmadd f0, f16, f16, f0 + fmadd f1, f17, f17, f1 + fmadd f2, f18, f18, f2 + fmadd f3, f19, f19, f3 + + LFDX f16, X, INCXM1 + LFDUX f17, X, INCX + LFDX f18, X, INCXM1 + LFDUX f19, X, INCX + + fmadd f4, f20, f20, f4 + fmadd f5, f21, f21, f5 + fmadd f6, f22, f22, f6 + fmadd f7, f23, f23, f7 + + LFDX f20, X, INCXM1 + LFDUX f21, X, INCX + LFDX f22, X, INCXM1 + LFDUX f23, X, INCX + + fmadd f8, f24, f24, f8 + fmadd f9, f25, f25, f9 + fmadd f10, f26, f26, f10 + fmadd f11, f27, f27, f11 + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + + fmadd f12, f28, f28, f12 + fmadd f13, f29, f29, f13 + fmadd f14, f30, f30, f14 + fmadd f15, f31, f31, f15 + + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + bdnz LL(1110) + .align 4 + +LL(1120): + fmadd f0, f16, f16, f0 + fmadd f1, f17, f17, f1 + fmadd f2, f18, f18, f2 + fmadd f3, f19, f19, f3 + + fmadd f4, f20, f20, f4 + fmadd f5, f21, f21, f5 + fmadd f6, f22, f22, f6 + fmadd f7, f23, f23, f7 + + fmadd f8, f24, f24, f8 + fmadd f9, f25, f25, f9 + fmadd f10, f26, f26, f10 + fmadd f11, f27, f27, f11 + + fmadd f12, f28, f28, f12 + fmadd f13, f29, f29, f13 + fmadd f14, f30, f30, f14 + fmadd f15, f31, f31, f15 + .align 4 + +LL(1150): + andi. r0, N, 7 + mtspr CTR, r0 + beq- cr0, LL(1170) + .align 4 + +LL(1160): + LFDX f16, X, INCXM1 + LFDUX f17, X, INCX + fmadd f0, f16, f16, f0 + fmadd f1, f17, f17, f1 + bdnz LL(1160) + .align 4 + +LL(1170): + fadd f0, f0, f1 + fadd f2, f2, f3 + fadd f4, f4, f5 + fadd f6, f6, f7 + + fadd f8, f8, f9 + fadd f10, f10, f11 + fadd f12, f12, f13 + fadd f14, f14, f15 + + fadd f0, f0, f2 + fadd f4, f4, f6 + fadd f8, f8, f10 + fadd f12, f12, f14 + + fadd f0, f0, f4 + fadd f8, f8, f12 + + fadd f0, f0, f8 + + fsqrt f1, f0 + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/cnrm2_hummer.S b/kernel/power/cnrm2_hummer.S new file mode 100644 index 0000000000..e6b022f11f --- /dev/null +++ b/kernel/power/cnrm2_hummer.S @@ -0,0 +1,812 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. 
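
[Editorial note, not part of the imported sources] The cnrm2 kernel just above (and the cnrm2_hummer.S variant whose header begins here) spreads a plain sum of squares over many partial accumulators (f0..f15, or the paired C1..C8 registers in the Hummer version) and finishes with a single fsqrt; no scaling pass of the kind the reference BLAS nrm2 uses to avoid overflow and underflow is visible in this code. A scalar C sketch of the same reduction; the accumulation type and names are illustrative:

#include <math.h>
#include <stddef.h>

/* illustrative reference: Euclidean norm of n single-precision complex values,
   incx given in complex elements */
float cnrm2_ref(size_t n, const float *x, size_t incx)
{
    double s = 0.0;                    /* the kernel keeps 16 partial sums instead */

    for (size_t i = 0; i < n; i++) {
        double re = x[2 * i * incx];
        double im = x[2 * i * incx + 1];
        s += re * re + im * im;        /* corresponds to fmadd f, x, x, f */
    }
    return (float)sqrt(s);             /* corresponds to fsqrt f1, f0 */
}
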
*/ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define INCX2 r6 +#define X2 r7 + +#define C1 f1 +#define C2 f0 +#define C3 f2 +#define C4 f3 +#define C5 f4 +#define C6 f5 +#define C7 f6 +#define C8 f7 + +#define A1 f8 +#define A2 f9 +#define A3 f10 +#define A4 f11 +#define A5 f12 +#define A6 f13 +#define A7 f14 +#define A8 f15 + +#define A9 f16 +#define A10 f17 +#define A11 f18 +#define A12 f19 +#define A13 f20 +#define A14 f21 +#define A15 f22 +#define A16 f23 + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + stfpdux f16, SP, r10 + stfpdux f17, SP, r10 + + stfpdux f18, SP, r10 + stfpdux f19, SP, r10 + stfpdux f20, SP, r10 + stfpdux f21, SP, r10 + + stfpdux f22, SP, r10 + stfpdux f23, SP, r10 + + li r10, 0 + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + lfpdx C1, SP, r10 # Zero clear + + slwi INCX, INCX, BASE_SHIFT + add INCX2, INCX, INCX + + fpmr C2, C1 + fpmr C3, C1 + fpmr C4, C1 + + fpmr C5, C1 + fpmr C6, C1 + fpmr C7, C1 + fpmr C8, C1 + + cmpwi cr0, N, 0 + ble LL(99) + cmpwi cr0, INCX, 0 + ble LL(99) + + andi. r0, X, 2 * SIZE - 1 + bne LL(100) + + srawi. 
r0, N, 4 + sub X, X, INCX2 + mtspr CTR, r0 + beq- LL(15) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + LFPDUX A5, X, INCX2 + LFPDUX A6, X, INCX2 + LFPDUX A7, X, INCX2 + LFPDUX A8, X, INCX2 + + LFPDUX A9, X, INCX2 + LFPDUX A10, X, INCX2 + LFPDUX A11, X, INCX2 + LFPDUX A12, X, INCX2 + LFPDUX A13, X, INCX2 + LFPDUX A14, X, INCX2 + LFPDUX A15, X, INCX2 + LFPDUX A16, X, INCX2 + bdz LL(13) + .align 4 + +LL(12): + fpmadd C1, A1, A1, C1 + LFPDUX A1, X, INCX2 + fpmadd C2, A2, A2, C2 + LFPDUX A2, X, INCX2 + fpmadd C3, A3, A3, C3 + LFPDUX A3, X, INCX2 + fpmadd C4, A4, A4, C4 + LFPDUX A4, X, INCX2 + + fpmadd C5, A5, A5, C5 + LFPDUX A5, X, INCX2 + fpmadd C6, A6, A6, C6 + LFPDUX A6, X, INCX2 + fpmadd C7, A7, A7, C7 + LFPDUX A7, X, INCX2 + fpmadd C8, A8, A8, C8 + LFPDUX A8, X, INCX2 + + fpmadd C1, A9, A9, C1 + LFPDUX A9, X, INCX2 + fpmadd C2, A10, A10, C2 + LFPDUX A10, X, INCX2 + fpmadd C3, A11, A11, C3 + LFPDUX A11, X, INCX2 + fpmadd C4, A12, A12, C4 + LFPDUX A12, X, INCX2 + + fpmadd C5, A13, A13, C5 + LFPDUX A13, X, INCX2 + fpmadd C6, A14, A14, C6 + LFPDUX A14, X, INCX2 + fpmadd C7, A15, A15, C7 + LFPDUX A15, X, INCX2 + fpmadd C8, A16, A16, C8 + LFPDUX A16, X, INCX2 + + bdnz LL(12) + .align 4 + +LL(13): + fpmadd C1, A1, A1, C1 + fpmadd C2, A2, A2, C2 + fpmadd C3, A3, A3, C3 + fpmadd C4, A4, A4, C4 + + fpmadd C5, A5, A5, C5 + fpmadd C6, A6, A6, C6 + fpmadd C7, A7, A7, C7 + fpmadd C8, A8, A8, C8 + + fpmadd C1, A9, A9, C1 + fpmadd C2, A10, A10, C2 + fpmadd C3, A11, A11, C3 + fpmadd C4, A12, A12, C4 + + fpmadd C5, A13, A13, C5 + fpmadd C6, A14, A14, C6 + fpmadd C7, A15, A15, C7 + fpmadd C8, A16, A16, C8 + .align 4 + +LL(15): + andi. r0, N, 15 + beq LL(98) + + andi. r0, N, 8 + beq LL(16) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + LFPDUX A5, X, INCX2 + LFPDUX A6, X, INCX2 + LFPDUX A7, X, INCX2 + LFPDUX A8, X, INCX2 + + fpmadd C1, A1, A1, C1 + fpmadd C2, A2, A2, C2 + fpmadd C3, A3, A3, C3 + fpmadd C4, A4, A4, C4 + + fpmadd C5, A5, A5, C5 + fpmadd C6, A6, A6, C6 + fpmadd C7, A7, A7, C7 + fpmadd C8, A8, A8, C8 + .align 4 + +LL(16): + andi. r0, N, 4 + beq LL(17) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + fpmadd C1, A1, A1, C1 + fpmadd C2, A2, A2, C2 + fpmadd C3, A3, A3, C3 + fpmadd C4, A4, A4, C4 + .align 4 + +LL(17): + andi. r0, N, 2 + beq LL(18) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + fpmadd C1, A1, A1, C1 + fpmadd C2, A2, A2, C2 + .align 4 + +LL(18): + andi. 
r0, N, 1 + beq LL(98) + + LFPDUX A1, X, INCX2 + fpmadd C3, A1, A1, C3 + .align 4 + +LL(98): + fpadd C1, C1, C5 + lis r3, 0x3f00 + fpadd C2, C2, C6 + lis r4, 0x4040 + fpadd C3, C3, C7 + stw r3, 4(SP) + fpadd C4, C4, C8 + stw r4, 8(SP) + + fpadd C1, C1, C2 + lfs f10, 0(SP) + fpadd C3, C3, C4 + lfs f11, 4(SP) + + fpadd C1, C1, C3 + lfs f12, 8(SP) + + fsmtp C2, C1 + fadd C1, C2, C1 + + fcmpu cr0, f10, C1 + beq cr0, LL(99) + +#ifndef HUMMER_EMULATOR + frsqrte f9, f1 + li r10, 16 + + fmul f2, f1, f9 + lfpdux f23, SP, r10 + fmul f3, f9, f11 + lfpdux f22, SP, r10 + fnmsub f4, f2, f9, f12 + lfpdux f21, SP, r10 + fmul f9, f3, f4 + lfpdux f20, SP, r10 + fadd f13, f11, f11 + lfpdux f19, SP, r10 + fmul f12, f1, f9 + lfpdux f18, SP, r10 + fmul f11, f12, f11 + lfpdux f17, SP, r10 + fnmsub f1, f12, f9, f13 + lfpdux f16, SP, r10 + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + fmadd f1, f11, f1, f12 + blr +#else + fsqrt f1, f1 + + li r10, 16 + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + lfpdux f20, SP, r10 + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + blr +#endif + .align 4 + +LL(99): + li r10, 16 + + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + lfpdux f20, SP, r10 + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + blr + .align 4 + +LL(100): + cmpwi cr0, INCX, SIZE + bne LL(200) + + LFD C1, 0(X) + addi X, X, 1 * SIZE + addi N, N, -1 + cmpwi cr0, N, 0 + fmul C1, C1, C1 + sub X, X, INCX2 + ble LL(198) + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(115) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + LFPDUX A5, X, INCX2 + LFPDUX A6, X, INCX2 + LFPDUX A7, X, INCX2 + LFPDUX A8, X, INCX2 + + LFPDUX A9, X, INCX2 + LFPDUX A10, X, INCX2 + LFPDUX A11, X, INCX2 + LFPDUX A12, X, INCX2 + LFPDUX A13, X, INCX2 + LFPDUX A14, X, INCX2 + LFPDUX A15, X, INCX2 + LFPDUX A16, X, INCX2 + bdz LL(113) + .align 4 + +LL(112): + fpmadd C1, A1, A1, C1 + LFPDUX A1, X, INCX2 + fpmadd C2, A2, A2, C2 + LFPDUX A2, X, INCX2 + fpmadd C3, A3, A3, C3 + LFPDUX A3, X, INCX2 + fpmadd C4, A4, A4, C4 + LFPDUX A4, X, INCX2 + + fpmadd C5, A5, A5, C5 + LFPDUX A5, X, INCX2 + fpmadd C6, A6, A6, C6 + LFPDUX A6, X, INCX2 + fpmadd C7, A7, A7, C7 + LFPDUX A7, X, INCX2 + fpmadd C8, A8, A8, C8 + LFPDUX A8, X, INCX2 + + fpmadd C1, A9, A9, C1 + LFPDUX A9, X, INCX2 + fpmadd C2, A10, A10, C2 + LFPDUX A10, X, INCX2 + fpmadd C3, A11, A11, C3 + LFPDUX A11, X, INCX2 + fpmadd C4, A12, A12, C4 + LFPDUX A12, X, INCX2 + + fpmadd C5, A13, A13, C5 + LFPDUX A13, X, INCX2 + fpmadd C6, A14, A14, C6 + LFPDUX A14, X, INCX2 + fpmadd C7, A15, A15, C7 + LFPDUX A15, X, INCX2 + fpmadd C8, A16, A16, C8 + LFPDUX A16, X, INCX2 + + bdnz LL(112) + .align 4 + +LL(113): + fpmadd C1, A1, A1, C1 + fpmadd C2, A2, A2, C2 + fpmadd C3, A3, A3, C3 + fpmadd C4, A4, A4, C4 + + fpmadd C5, A5, A5, C5 + fpmadd C6, A6, A6, C6 + fpmadd C7, A7, A7, C7 + fpmadd C8, A8, A8, C8 + + fpmadd C1, A9, A9, C1 + fpmadd C2, A10, A10, C2 + fpmadd C3, A11, A11, C3 + fpmadd C4, A12, A12, C4 + + fpmadd C5, A13, A13, C5 + fpmadd C6, A14, A14, C6 + fpmadd C7, A15, A15, C7 + fpmadd C8, A16, A16, C8 + .align 4 + +LL(115): + andi. r0, N, 15 + beq LL(198) + + andi. 
r0, N, 8 + beq LL(116) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + LFPDUX A5, X, INCX2 + LFPDUX A6, X, INCX2 + LFPDUX A7, X, INCX2 + LFPDUX A8, X, INCX2 + + fpmadd C1, A1, A1, C1 + fpmadd C2, A2, A2, C2 + fpmadd C3, A3, A3, C3 + fpmadd C4, A4, A4, C4 + + fpmadd C5, A5, A5, C5 + fpmadd C6, A6, A6, C6 + fpmadd C7, A7, A7, C7 + fpmadd C8, A8, A8, C8 + .align 4 + +LL(116): + andi. r0, N, 4 + beq LL(117) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + fpmadd C1, A1, A1, C1 + fpmadd C2, A2, A2, C2 + fpmadd C3, A3, A3, C3 + fpmadd C4, A4, A4, C4 + .align 4 + +LL(117): + andi. r0, N, 2 + beq LL(118) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + fpmadd C1, A1, A1, C1 + fpmadd C2, A2, A2, C2 + .align 4 + +LL(118): + andi. r0, N, 1 + beq LL(198) + + LFPDUX A1, X, INCX2 + fpmadd C3, A1, A1, C3 + .align 4 + +LL(198): + LFDX A1, X, INCX2 + fmadd C4, A1, A1, C4 + + fpadd C1, C1, C5 + lis r3, 0x3f00 + fpadd C2, C2, C6 + lis r4, 0x4040 + fpadd C3, C3, C7 + stw r3, 4(SP) + fpadd C4, C4, C8 + stw r4, 8(SP) + + fpadd C1, C1, C2 + lfs f10, 0(SP) + fpadd C3, C3, C4 + lfs f11, 4(SP) + + fpadd C1, C1, C3 + lfs f12, 8(SP) + + fsmtp C2, C1 + fadd C1, C2, C1 + + fcmpu cr0, f10, C1 + beq cr0, LL(199) + +#ifndef HUMMER_EMULATOR + frsqrte f9, f1 + li r10, 16 + + fmul f2, f1, f9 + lfpdux f23, SP, r10 + fmul f3, f9, f11 + lfpdux f22, SP, r10 + fnmsub f4, f2, f9, f12 + lfpdux f21, SP, r10 + fmul f9, f3, f4 + lfpdux f20, SP, r10 + fadd f13, f11, f11 + lfpdux f19, SP, r10 + fmul f12, f1, f9 + lfpdux f18, SP, r10 + fmul f11, f12, f11 + lfpdux f17, SP, r10 + fnmsub f1, f12, f9, f13 + lfpdux f16, SP, r10 + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + fmadd f1, f11, f1, f12 + blr +#else + fsqrt f1, f1 + + li r10, 16 + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + lfpdux f20, SP, r10 + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + blr +#endif + .align 4 + +LL(199): + li r10, 16 + + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + lfpdux f20, SP, r10 + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + blr + .align 4 + +LL(200): + sub X, X, INCX2 + addi X2, X, SIZE + + srawi. 
r0, N, 3 + mtspr CTR, r0 + beq- LL(215) + + + LFDUX A1, X, INCX2 + LFDUX A2, X2, INCX2 + LFDUX A3, X, INCX2 + LFDUX A4, X2, INCX2 + + LFDUX A5, X, INCX2 + LFDUX A6, X2, INCX2 + LFDUX A7, X, INCX2 + LFDUX A8, X2, INCX2 + + LFDUX A9, X, INCX2 + LFDUX A10, X2, INCX2 + LFDUX A11, X, INCX2 + LFDUX A12, X2, INCX2 + + LFDUX A13, X, INCX2 + LFDUX A14, X2, INCX2 + LFDUX A15, X, INCX2 + LFDUX A16, X2, INCX2 + bdz LL(213) + .align 4 + +LL(212): + fmadd C1, A1, A1, C1 + LFDUX A1, X, INCX2 + fmadd C2, A2, A2, C2 + LFDUX A2, X2, INCX2 + fmadd C3, A3, A3, C3 + LFDUX A3, X, INCX2 + fmadd C4, A4, A4, C4 + LFDUX A4, X2, INCX2 + + fmadd C5, A5, A5, C5 + LFDUX A5, X, INCX2 + fmadd C6, A6, A6, C6 + LFDUX A6, X2, INCX2 + fmadd C7, A7, A7, C7 + LFDUX A7, X, INCX2 + fmadd C8, A8, A8, C8 + LFDUX A8, X2, INCX2 + + fmadd C1, A9, A9, C1 + LFDUX A9, X, INCX2 + fmadd C2, A10, A10, C2 + LFDUX A10, X2, INCX2 + fmadd C3, A11, A11, C3 + LFDUX A11, X, INCX2 + fmadd C4, A12, A12, C4 + LFDUX A12, X2, INCX2 + + fmadd C5, A13, A13, C5 + LFDUX A13, X, INCX2 + fmadd C6, A14, A14, C6 + LFDUX A14, X2, INCX2 + fmadd C7, A15, A15, C7 + LFDUX A15, X, INCX2 + fmadd C8, A16, A16, C8 + LFDUX A16, X2, INCX2 + + bdnz LL(212) + .align 4 + +LL(213): + fmadd C1, A1, A1, C1 + fmadd C2, A2, A2, C2 + fmadd C3, A3, A3, C3 + fmadd C4, A4, A4, C4 + + fmadd C5, A5, A5, C5 + fmadd C6, A6, A6, C6 + fmadd C7, A7, A7, C7 + fmadd C8, A8, A8, C8 + + fmadd C1, A9, A9, C1 + fmadd C2, A10, A10, C2 + fmadd C3, A11, A11, C3 + fmadd C4, A12, A12, C4 + + fmadd C5, A13, A13, C5 + fmadd C6, A14, A14, C6 + fmadd C7, A15, A15, C7 + fmadd C8, A16, A16, C8 + .align 4 + +LL(215): + andi. r0, N, 7 + beq LL(998) + andi. r0, N, 4 + beq LL(216) + + LFDUX A1, X, INCX2 + LFDUX A2, X2, INCX2 + LFDUX A3, X, INCX2 + LFDUX A4, X2, INCX2 + + LFDUX A5, X, INCX2 + LFDUX A6, X2, INCX2 + LFDUX A7, X, INCX2 + LFDUX A8, X2, INCX2 + + fmadd C1, A1, A1, C1 + fmadd C2, A2, A2, C2 + fmadd C3, A3, A3, C3 + fmadd C4, A4, A4, C4 + + fmadd C5, A5, A5, C5 + fmadd C6, A6, A6, C6 + fmadd C7, A7, A7, C7 + fmadd C8, A8, A8, C8 + .align 4 + +LL(216): + andi. r0, N, 2 + beq LL(217) + + LFDUX A1, X, INCX2 + LFDUX A2, X2, INCX2 + LFDUX A3, X, INCX2 + LFDUX A4, X2, INCX2 + + fmadd C1, A1, A1, C1 + fmadd C2, A2, A2, C2 + fmadd C3, A3, A3, C3 + fmadd C4, A4, A4, C4 + .align 4 + +LL(217): + andi. 
r0, N, 1 + beq LL(998) + + LFDUX A1, X, INCX2 + LFDUX A2, X2, INCX2 + + fmadd C1, A1, A1, C1 + fmadd C2, A2, A2, C2 + .align 4 + +LL(998): + fadd C1, C1, C5 + lis r3, 0x3f00 + fadd C2, C2, C6 + lis r4, 0x4040 + fadd C3, C3, C7 + stw r3, 4(SP) + fadd C4, C4, C8 + stw r4, 8(SP) + + fadd C1, C1, C2 + lfs f10, 0(SP) + fadd C3, C3, C4 + lfs f11, 4(SP) + fadd C1, C1, C3 + lfs f12, 8(SP) + + fcmpu cr0, f10, C1 + beq cr0, LL(99) + + frsqrte f9, f1 + li r10, 16 + + fmul f2, f1, f9 + lfpdux f23, SP, r10 + fmul f3, f9, f11 + lfpdux f22, SP, r10 + fnmsub f4, f2, f9, f12 + lfpdux f21, SP, r10 + fmul f9, f3, f4 + lfpdux f20, SP, r10 + fadd f13, f11, f11 + lfpdux f19, SP, r10 + fmul f12, f1, f9 + lfpdux f18, SP, r10 + fmul f11, f12, f11 + lfpdux f17, SP, r10 + fnmsub f1, f12, f9, f13 + lfpdux f16, SP, r10 + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + fmadd f1, f11, f1, f12 + blr + +LL(999): + li r10, 16 + + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + lfpdux f20, SP, r10 + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + blr + + EPILOGUE diff --git a/kernel/power/cnrm2_ppc440.S b/kernel/power/cnrm2_ppc440.S new file mode 100644 index 0000000000..5ead681572 --- /dev/null +++ b/kernel/power/cnrm2_ppc440.S @@ -0,0 +1,301 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define PRE r8 +#define INC1 r9 + +#define FZERO 144(SP) +#define FONE 148(SP) +#define C1 152(SP) +#define C2 156(SP) + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r10, 0 + lis r11, 0x3f80 + lis r6, 0x3f00 + lis r7, 0x4040 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r10, FZERO + stw r11, FONE + stw r6, C1 + stw r7, C2 + + lfs f1, FZERO + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, ZBASE_SHIFT + li INC1, SIZE + li PRE, 3 * 16 * SIZE + + cmpwi cr0, N, 0 + ble- LL(999) + cmpwi cr0, INCX, 0 + ble- LL(999) + + fmr f0, f1 + sub X, X, INCX + fmr f2, f1 + fmr f3, f1 + fmr f4, f1 + fmr f5, f1 + fmr f6, f1 + fmr f7, f1 + fmr f8, f1 + fmr f9, f1 + fmr f10, f1 + fmr f11, f1 + fmr f12, f1 + fmr f13, f1 + fmr f14, f1 + fmr f15, f1 + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- cr0, LL(1150) + + LFDUX f16, X, INCX + LFDX f17, X, INC1 + LFDUX f18, X, INCX + LFDX f19, X, INC1 + LFDUX f20, X, INCX + LFDX f21, X, INC1 + LFDUX f22, X, INCX + LFDX f23, X, INC1 + + LFDUX f24, X, INCX + LFDX f25, X, INC1 + LFDUX f26, X, INCX + LFDX f27, X, INC1 + LFDUX f28, X, INCX + LFDX f29, X, INC1 + LFDUX f30, X, INCX + LFDX f31, X, INC1 + bdz LL(1120) + .align 4 + +LL(1110): + fmadd f0, f16, f16, f0 + LFDUX f16, X, INCX + fmadd f1, f17, f17, f1 + LFDX f17, X, INC1 + fmadd f2, f18, f18, f2 + LFDUX f18, X, INCX + fmadd f3, f19, f19, f3 + LFDX f19, X, INC1 + +#ifdef PPCG4 + dcbt X, PRE +#endif + + fmadd f4, f20, f20, f4 + LFDUX f20, X, INCX + fmadd f5, f21, f21, f5 + LFDX f21, X, INC1 + fmadd f6, f22, f22, f6 + LFDUX f22, X, INCX + fmadd f7, f23, f23, f7 + LFDX f23, X, INC1 + + fmadd f8, f24, f24, f8 + LFDUX f24, X, INCX + fmadd f9, f25, f25, f9 + LFDX f25, X, INC1 + fmadd f10, f26, f26, f10 + LFDUX f26, X, INCX + fmadd f11, f27, f27, f11 + LFDX f27, X, INC1 + +#ifdef PPCG4 + dcbt X, PRE +#endif + + fmadd f12, f28, f28, f12 + LFDUX f28, X, INCX + fmadd f13, f29, f29, f13 + LFDX f29, X, INC1 + fmadd f14, f30, f30, f14 + LFDUX f30, X, INCX + fmadd f15, f31, f31, f15 + LFDX f31, X, INC1 + bdnz LL(1110) + .align 4 + +LL(1120): + fmadd f0, f16, f16, f0 + fmadd f1, f17, f17, f1 + fmadd f2, f18, f18, f2 + fmadd f3, f19, f19, f3 + + fmadd f4, f20, f20, f4 + fmadd f5, f21, f21, f5 + fmadd f6, f22, f22, f6 + fmadd f7, f23, f23, f7 + + fmadd f8, f24, f24, f8 + fmadd f9, f25, f25, f9 + fmadd f10, f26, f26, f10 + fmadd f11, f27, f27, f11 + + fmadd f12, f28, f28, f12 + fmadd f13, f29, f29, f13 + fmadd f14, f30, f30, f14 + fmadd f15, f31, f31, f15 + .align 4 + +LL(1150): + andi. 
r0, N, 7 + mtspr CTR, r0 + beq- cr0, LL(1170) + .align 4 + +LL(1160): + LFDUX f16, X, INCX + LFDX f17, X, INC1 + fmadd f0, f16, f16, f0 + fmadd f1, f17, f17, f1 + bdnz LL(1160) + .align 4 + +LL(1170): + fadd f0, f0, f1 + fadd f2, f2, f3 + fadd f4, f4, f5 + fadd f6, f6, f7 + + fadd f8, f8, f9 + fadd f10, f10, f11 + fadd f12, f12, f13 + fadd f14, f14, f15 + + fadd f0, f0, f2 + fadd f4, f4, f6 + fadd f8, f8, f10 + fadd f12, f12, f14 + + fadd f0, f0, f4 + fadd f8, f8, f12 + + fadd f1, f0, f8 + lfs f4, FZERO + + fcmpu cr0, f1, f4 + beq cr0, LL(999) + + frsqrte f0, f1 + lfs f8, C1 + lfs f9, C2 + + fmul f2, f1, f0 + fadd f7, f8, f8 + fmul f3, f0, f8 + fnmsub f4, f2, f0, f9 + fmul f0, f3, f4 + + fmul f5, f1, f0 + fmul f2, f5, f8 + fnmsub f3, f5, f0, f7 + fmadd f1, f2, f3, f5 + .align 4 + +LL(999): + lfd f14, 0(SP) + lfd f15, 8(SP) + + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/copy.S b/kernel/power/copy.S new file mode 100644 index 0000000000..5a6c610c23 --- /dev/null +++ b/kernel/power/copy.S @@ -0,0 +1,226 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 +#define Y r6 +#define INCY r7 +#define PREA r8 + +#define STACKSIZE 16 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + + slwi INCX, INCX, BASE_SHIFT + slwi INCY, INCY, BASE_SHIFT + +#ifdef L1_DUALFETCH + li PREA, (L1_PREFETCHSIZE) / 2 +#else + li PREA, (L1_PREFETCHSIZE) +#endif + + cmpwi cr0, N, 0 + ble- LL(999) + + cmpwi cr0, INCX, SIZE + bne- cr0, LL(100) + cmpwi cr0, INCY, SIZE + bne- cr0, LL(100) + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(50) + .align 4 + +LL(10): + LFD f0, 0 * SIZE(X) + LFD f1, 1 * SIZE(X) + LFD f2, 2 * SIZE(X) + LFD f3, 3 * SIZE(X) + + STFD f0, 0 * SIZE(Y) + STFD f1, 1 * SIZE(Y) + STFD f2, 2 * SIZE(Y) + STFD f3, 3 * SIZE(Y) + + LFD f4, 4 * SIZE(X) + LFD f5, 5 * SIZE(X) + LFD f6, 6 * SIZE(X) + LFD f7, 7 * SIZE(X) + + STFD f4, 4 * SIZE(Y) + STFD f5, 5 * SIZE(Y) + STFD f6, 6 * SIZE(Y) + STFD f7, 7 * SIZE(Y) + + LFD f8, 8 * SIZE(X) + LFD f9, 9 * SIZE(X) + LFD f10, 10 * SIZE(X) + LFD f11, 11 * SIZE(X) + + STFD f8, 8 * SIZE(Y) + STFD f9, 9 * SIZE(Y) + STFD f10, 10 * SIZE(Y) + STFD f11, 11 * SIZE(Y) + + LFD f12, 12 * SIZE(X) + LFD f13, 13 * SIZE(X) + LFD f14, 14 * SIZE(X) + LFD f15, 15 * SIZE(X) + + STFD f12, 12 * SIZE(Y) + STFD f13, 13 * SIZE(Y) + STFD f14, 14 * SIZE(Y) + STFD f15, 15 * SIZE(Y) + +#ifndef POWER6 + dcbtst Y, PREA +#ifdef L1_DUALFETCH + dcbt X, PREA +#endif +#endif + addi X, X, 16 * SIZE + addi Y, Y, 16 * SIZE + +#ifdef POWER6 + dcbtst Y, PREA + L1_PREFETCH X, PREA +#endif + + bdnz LL(10) + .align 4 + +LL(50): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + addi X, X, 1 * SIZE + + STFD f8, 0 * SIZE(Y) + addi Y, Y, 1 * SIZE + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): + sub X, X, INCX + sub Y, Y, INCY + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(150) + .align 4 + +LL(110): + LFDUX f0, X, INCX + LFDUX f1, X, INCX + LFDUX f2, X, INCX + LFDUX f3, X, INCX + + LFDUX f4, X, INCX + LFDUX f5, X, INCX + LFDUX f6, X, INCX + LFDUX f7, X, INCX + + LFDUX f8, X, INCX + LFDUX f9, X, INCX + LFDUX f10, X, INCX + LFDUX f11, X, INCX + + LFDUX f12, X, INCX + LFDUX f13, X, INCX + LFDUX f14, X, INCX + LFDUX f15, X, INCX + + STFDUX f0, Y, INCY + STFDUX f1, Y, INCY + STFDUX f2, Y, INCY + STFDUX f3, Y, INCY + + STFDUX f4, Y, INCY + STFDUX f5, Y, INCY + STFDUX f6, Y, INCY + STFDUX f7, Y, INCY + + STFDUX f8, Y, INCY + STFDUX f9, Y, INCY + STFDUX f10, Y, INCY + STFDUX f11, Y, INCY + + STFDUX f12, Y, INCY + STFDUX f13, Y, INCY + STFDUX f14, Y, INCY + STFDUX f15, Y, INCY + bdnz LL(110) + .align 4 + +LL(150): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + STFDUX f8, Y, INCY + bdnz LL(160) + .align 4 + +LL(999): + lfd f14, 0(SP) + lfd f15, 8(SP) + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/copy_hummer.S b/kernel/power/copy_hummer.S new file mode 100644 index 0000000000..1efa6fb6d9 --- /dev/null +++ b/kernel/power/copy_hummer.S @@ -0,0 +1,958 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 +#define Y r6 +#define INCY r7 + +#define INCX2 r8 +#define INCY2 r9 +#define X2 r10 +#define Y2 r11 + +#define A1 f0 +#define A2 f1 +#define A3 f2 +#define A4 f3 +#define A5 f4 +#define A6 f5 +#define A7 f6 +#define A8 f7 +#define A9 f8 + +#define T1 f9 +#define T2 f10 +#define T3 f11 +#define T4 f12 +#define T5 f13 +#define T6 f14 +#define T7 f15 + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + + slwi INCX, INCX, BASE_SHIFT + slwi INCY, INCY, BASE_SHIFT + add INCX2, INCX, INCX + add INCY2, INCY, INCY + + cmpwi cr0, N, 0 + ble LL(999) + + cmpwi cr0, INCY, SIZE + bne LL(60) + + cmpwi cr0, INCX, SIZE + bne LL(50) + + sub X, X, INCX2 + sub Y, Y, INCY2 + + andi. r0, X, 2 * SIZE - 1 + bne LL(30) + andi. r0, Y, 2 * SIZE - 1 + bne LL(20) + .align 4 + +LL(10): /* X : aligned Y : aligned */ + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(15) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + LFPDUX A5, X, INCX2 + LFPDUX A6, X, INCX2 + LFPDUX A7, X, INCX2 + LFPDUX A8, X, INCX2 + bdz LL(13) + .align 4 + +LL(12): + STFPDUX A1, Y, INCY2 + LFPDUX A1, X, INCX2 + STFPDUX A2, Y, INCY2 + LFPDUX A2, X, INCX2 + STFPDUX A3, Y, INCY2 + LFPDUX A3, X, INCX2 + STFPDUX A4, Y, INCY2 + LFPDUX A4, X, INCX2 + + STFPDUX A5, Y, INCY2 + LFPDUX A5, X, INCX2 + STFPDUX A6, Y, INCY2 + LFPDUX A6, X, INCX2 + STFPDUX A7, Y, INCY2 + LFPDUX A7, X, INCX2 + STFPDUX A8, Y, INCY2 + LFPDUX A8, X, INCX2 + bdnz LL(12) + .align 4 + +LL(13): + STFPDUX A1, Y, INCY2 + STFPDUX A2, Y, INCY2 + STFPDUX A3, Y, INCY2 + STFPDUX A4, Y, INCY2 + STFPDUX A5, Y, INCY2 + STFPDUX A6, Y, INCY2 + STFPDUX A7, Y, INCY2 + STFPDUX A8, Y, INCY2 + .align 4 + +LL(15): + andi. r0, N, 15 + beq LL(999) + + andi. 
r0, N, 8 + beq LL(16) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + STFPDUX A1, Y, INCY2 + STFPDUX A2, Y, INCY2 + STFPDUX A3, Y, INCY2 + STFPDUX A4, Y, INCY2 + .align 4 + +LL(16): + andi. r0, N, 4 + beq LL(17) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + + STFPDUX A1, Y, INCY2 + STFPDUX A2, Y, INCY2 + .align 4 + +LL(17): + andi. r0, N, 2 + beq LL(18) + + LFPDUX A1, X, INCX2 + STFPDUX A1, Y, INCY2 + .align 4 + +LL(18): + andi. r0, N, 1 + beq LL(999) + + LFDUX A1, X, INCX2 + STFDUX A1, Y, INCY2 + .align 4 + b LL(999) + .align 4 + +LL(20): /* X ): aligned Y ): unaligned */ + + LFXDUX A1, X, INCX2 + addi N, N, -1 + cmpwi cr0, N, 0 + STFSDX A1, Y, INCY2 + add Y, Y, INCY + ble LL(999) + .align 4 + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(25) + + LFXDUX T1, X, INCX2 + LFXDUX T2, X, INCX2 + LFXDUX T3, X, INCX2 + LFXDUX T4, X, INCX2 + + LFPDUX A6, X, INCX2 + fsmr A1, T1 + LFPDUX A7, X, INCX2 + fsmr T1, T2 + LFPDUX A8, X, INCX2 + fsmr T2, T3 + LFPDUX A9, X, INCX2 + fsmr T3, T4 + bdz LL(23) + .align 4 + +LL(22): + STFPDUX A1, Y, INCY2 + fxmr T5, A6 + STFPDUX T1, Y, INCY2 + fxmr T6, A7 + STFPDUX T2, Y, INCY2 + fxmr T7, A8 + STFPDUX T3, Y, INCY2 + fxmr A1, A9 + + fsmr T4, T5 + LFPDUX A2, X, INCX2 + fsmr T5, T6 + LFPDUX A3, X, INCX2 + fsmr T6, T7 + LFPDUX A4, X, INCX2 + fsmr T7, A1 + LFPDUX A5, X, INCX2 + + STFPDUX T4, Y, INCY2 + fxmr T1, A2 + STFPDUX T5, Y, INCY2 + fxmr T2, A3 + STFPDUX T6, Y, INCY2 + fxmr T3, A4 + STFPDUX T7, Y, INCY2 + fxmr T4, A5 + + LFPDUX A6, X, INCX2 + fsmr A1, T1 + LFPDUX A7, X, INCX2 + fsmr T1, T2 + LFPDUX A8, X, INCX2 + fsmr T2, T3 + LFPDUX A9, X, INCX2 + fsmr T3, T4 + bdnz LL(22) + .align 4 + +LL(23): + STFPDUX A1, Y, INCY2 + fxmr T5, A6 + STFPDUX T1, Y, INCY2 + fxmr T6, A7 + STFPDUX T2, Y, INCY2 + fxmr T7, A8 + STFPDUX T3, Y, INCY2 + fxmr A1, A9 + + fsmr T4, T5 + fsmr T5, T6 + fsmr T6, T7 + fsmr T7, A1 + + STFPDUX T4, Y, INCY2 + STFPDUX T5, Y, INCY2 + STFPDUX T6, Y, INCY2 + STFPDUX T7, Y, INCY2 + .align 4 + +LL(25): + andi. r0, N, 15 + beq LL(999) + + andi. r0, N, 8 + beq LL(26) + + LFXDUX A2, X, INCX2 + LFXDUX A3, X, INCX2 + LFXDUX A4, X, INCX2 + LFXDUX A5, X, INCX2 + + fsmr A1, A2 + fsmr A2, A3 + fsmr A3, A4 + fsmr A4, A5 + + STFPDUX A1, Y, INCY2 + STFPDUX A2, Y, INCY2 + STFPDUX A3, Y, INCY2 + STFPDUX A4, Y, INCY2 + fpmr A1, A5 + .align 4 + +LL(26): + andi. r0, N, 4 + beq LL(27) + + LFXDUX A2, X, INCX2 + LFXDUX A3, X, INCX2 + fsmr A1, A2 + fsmr A2, A3 + STFPDUX A1, Y, INCY2 + STFPDUX A2, Y, INCY2 + fpmr A1, A3 + .align 4 + +LL(27): + andi. r0, N, 2 + beq LL(28) + + LFXDUX A2, X, INCX2 + fsmr A1, A2 + STFPDUX A1, Y, INCY2 + fpmr A1, A2 + .align 4 + +LL(28): + andi. r0, N, 1 + beq LL(999) + + STFDUX A1, Y, INCY2 + b LL(999) + .align 4 + +LL(30): /* X : unaligned Y : aligned */ + andi. r0, Y, 2 * SIZE - 1 + bne LL(40) + + LFDX A1, X, INCX2 + add X, X, INCX + + srawi. 
r0, N, 4 + mtspr CTR, r0 + beq- LL(35) + + LFXDUX T1, X, INCX2 + LFXDUX T2, X, INCX2 + LFXDUX T3, X, INCX2 + LFXDUX T4, X, INCX2 + + LFPDUX A6, X, INCX2 + fsmr A1, T1 + LFPDUX A7, X, INCX2 + fsmr T1, T2 + LFPDUX A8, X, INCX2 + fsmr T2, T3 + LFPDUX A9, X, INCX2 + fsmr T3, T4 + bdz LL(33) + .align 4 + +LL(32): + fxmr T5, A6 + STFPDUX A1, Y, INCY2 + fxmr T6, A7 + STFPDUX T1, Y, INCY2 + fxmr T7, A8 + STFPDUX T2, Y, INCY2 + fxmr A1, A9 + STFPDUX T3, Y, INCY2 + + fsmr T4, T5 + LFPDUX A2, X, INCX2 + fsmr T5, T6 + LFPDUX A3, X, INCX2 + fsmr T6, T7 + LFPDUX A4, X, INCX2 + fsmr T7, A1 + LFPDUX A5, X, INCX2 + + STFPDUX T4, Y, INCY2 + fxmr T1, A2 + STFPDUX T5, Y, INCY2 + fxmr T2, A3 + STFPDUX T6, Y, INCY2 + fxmr T3, A4 + STFPDUX T7, Y, INCY2 + fxmr T4, A5 + + LFPDUX A6, X, INCX2 + fsmr A1, T1 + LFPDUX A7, X, INCX2 + fsmr T1, T2 + LFPDUX A8, X, INCX2 + fsmr T2, T3 + LFPDUX A9, X, INCX2 + fsmr T3, T4 + + bdnz LL(32) + .align 4 + +LL(33): + STFPDUX A1, Y, INCY2 + fxmr T5, A6 + STFPDUX T1, Y, INCY2 + fxmr T6, A7 + STFPDUX T2, Y, INCY2 + fxmr T7, A8 + STFPDUX T3, Y, INCY2 + fxmr A1, A9 + + fsmr T4, T5 + fsmr T5, T6 + fsmr T6, T7 + fsmr T7, A1 + + STFPDUX T4, Y, INCY2 + STFPDUX T5, Y, INCY2 + STFPDUX T6, Y, INCY2 + STFPDUX T7, Y, INCY2 + .align 4 + +LL(35): + andi. r0, N, 15 + beq LL(999) + + andi. r0, N, 8 + beq LL(36) + + LFXDUX A2, X, INCX2 + LFXDUX A3, X, INCX2 + LFXDUX A4, X, INCX2 + LFXDUX A5, X, INCX2 + + fsmr A1, A2 + fsmr A2, A3 + fsmr A3, A4 + fsmr A4, A5 + + STFPDUX A1, Y, INCY2 + STFPDUX A2, Y, INCY2 + STFPDUX A3, Y, INCY2 + STFPDUX A4, Y, INCY2 + fpmr A1, A5 + .align 4 + +LL(36): + andi. r0, N, 4 + beq LL(37) + + LFXDUX A2, X, INCX2 + LFXDUX A3, X, INCX2 + fsmr A1, A2 + fsmr A2, A3 + STFPDUX A1, Y, INCY2 + STFPDUX A2, Y, INCY2 + fpmr A1, A3 + .align 4 + +LL(37): + andi. r0, N, 2 + beq LL(38) + + LFXDUX A2, X, INCX2 + fsmr A1, A2 + STFPDUX A1, Y, INCY2 + fpmr A1, A2 + .align 4 + +LL(38): + andi. r0, N, 1 + beq LL(999) + + STFDUX A1, Y, INCY2 + b LL(999) + .align 4 + +LL(40): /* X : unaligned Y : unaligned */ + + LFDX A1, X, INCX2 + add X, X, INCX + addi N, N, -1 + cmpwi cr0, N, 0 + STFDX A1, Y, INCY2 + add Y, Y, INCY + ble LL(999) + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(45) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + LFPDUX A5, X, INCX2 + LFPDUX A6, X, INCX2 + LFPDUX A7, X, INCX2 + LFPDUX A8, X, INCX2 + bdz LL(43) + .align 4 + +LL(42): + STFPDUX A1, Y, INCY2 + LFPDUX A1, X, INCX2 + STFPDUX A2, Y, INCY2 + LFPDUX A2, X, INCX2 + STFPDUX A3, Y, INCY2 + LFPDUX A3, X, INCX2 + STFPDUX A4, Y, INCY2 + LFPDUX A4, X, INCX2 + + STFPDUX A5, Y, INCY2 + LFPDUX A5, X, INCX2 + STFPDUX A6, Y, INCY2 + LFPDUX A6, X, INCX2 + STFPDUX A7, Y, INCY2 + LFPDUX A7, X, INCX2 + STFPDUX A8, Y, INCY2 + LFPDUX A8, X, INCX2 + bdnz LL(42) + .align 4 + +LL(43): + STFPDUX A1, Y, INCY2 + STFPDUX A2, Y, INCY2 + STFPDUX A3, Y, INCY2 + STFPDUX A4, Y, INCY2 + STFPDUX A5, Y, INCY2 + STFPDUX A6, Y, INCY2 + STFPDUX A7, Y, INCY2 + STFPDUX A8, Y, INCY2 + .align 4 + +LL(45): + andi. r0, N, 15 + beq LL(999) + + andi. r0, N, 8 + beq LL(46) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + STFPDUX A1, Y, INCY2 + STFPDUX A2, Y, INCY2 + STFPDUX A3, Y, INCY2 + STFPDUX A4, Y, INCY2 + .align 4 + +LL(46): + andi. r0, N, 4 + beq LL(47) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + + STFPDUX A1, Y, INCY2 + STFPDUX A2, Y, INCY2 + .align 4 + +LL(47): + andi. r0, N, 2 + beq LL(48) + + LFPDUX A1, X, INCX2 + STFPDUX A1, Y, INCY2 + .align 4 + +LL(48): + andi. 
r0, N, 1 + beq LL(999) + + LFDUX A1, X, INCX2 + STFDUX A1, Y, INCY2 + .align 4 + b LL(999) + .align 4 + +# INCX != 1, INCY == 1 +LL(50): + andi. r0, Y, 2 * SIZE - 1 + beq LL(51) + + LFD A1, 0 * SIZE(X) + add X, X, INCX + STFD A1, 0 * SIZE(Y) + add Y, Y, INCY + + addi N, N, -1 + cmpwi cr0, N, 0 + ble LL(999) + .align 4 + +LL(51): + sub X, X, INCX + sub Y, Y, INCY2 + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(55) + .align 4 + +LL(52): + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + LFDUX A5, X, INCX + LFDUX A6, X, INCX + LFDUX A7, X, INCX + LFDUX A8, X, INCX + + LFDUX A9, X, INCX + LFDUX T1, X, INCX + LFDUX T2, X, INCX + LFDUX T3, X, INCX + fsmfp A1, A2 + LFDUX T4, X, INCX + fsmfp A3, A4 + LFDUX T5, X, INCX + fsmfp A5, A6 + LFDUX T6, X, INCX + fsmfp A7, A8 + LFDUX T7, X, INCX + fsmfp A9, T1 + + STFPDUX A1, Y, INCY2 + fsmfp T2, T3 + STFPDUX A3, Y, INCY2 + fsmfp T4, T5 + STFPDUX A5, Y, INCY2 + fsmfp T6, T7 + STFPDUX A7, Y, INCY2 + STFPDUX A9, Y, INCY2 + STFPDUX T2, Y, INCY2 + STFPDUX T4, Y, INCY2 + STFPDUX T6, Y, INCY2 + bdnz LL(52) + .align 4 + +LL(55): + andi. r0, N, 15 + beq LL(999) + + andi. r0, N, 8 + beq LL(56) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + LFDUX A5, X, INCX + LFDUX A6, X, INCX + LFDUX A7, X, INCX + LFDUX A8, X, INCX + + fsmfp A1, A2 + fsmfp A3, A4 + fsmfp A5, A6 + fsmfp A7, A8 + + STFPDUX A1, Y, INCY2 + STFPDUX A3, Y, INCY2 + STFPDUX A5, Y, INCY2 + STFPDUX A7, Y, INCY2 + .align 4 + +LL(56): + andi. r0, N, 4 + beq LL(57) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + + fsmfp A1, A2 + fsmfp A3, A4 + + STFPDUX A1, Y, INCY2 + STFPDUX A3, Y, INCY2 + .align 4 + +LL(57): + andi. r0, N, 2 + beq LL(58) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + + fsmfp A1, A2 + + STFPDUX A1, Y, INCY2 + .align 4 + +LL(58): + andi. r0, N, 1 + beq LL(999) + + LFDUX A1, X, INCX + STFDUX A1, Y, INCY2 + b LL(999) + .align 4 + + +# INCX == 1, INCY != 1 +LL(60): + cmpwi cr0, INCY, SIZE + bne LL(100) + + andi. r0, X, 2 * SIZE - 1 + beq LL(61) + + LFD A1, 0 * SIZE(X) + add X, X, INCX + STFD A1, 0 * SIZE(Y) + add Y, Y, INCY + + addi N, N, -1 + cmpwi cr0, N, 0 + ble LL(999) + .align 4 + +LL(61): + sub X, X, INCX2 + sub Y, Y, INCY + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(65) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + LFPDUX A5, X, INCX2 + LFPDUX A6, X, INCX2 + LFPDUX A7, X, INCX2 + LFPDUX A8, X, INCX2 + bdz LL(63) + .align 4 + +LL(62): + STFDUX A1, Y, INCY + STFSDUX A1, Y, INCY + LFPDUX A1, X, INCX2 + + STFDUX A2, Y, INCY + STFSDUX A2, Y, INCY + LFPDUX A2, X, INCX2 + + STFDUX A3, Y, INCY + STFSDUX A3, Y, INCY + LFPDUX A3, X, INCX2 + + STFDUX A4, Y, INCY + STFSDUX A4, Y, INCY + LFPDUX A4, X, INCX2 + + STFDUX A5, Y, INCY + STFSDUX A5, Y, INCY + LFPDUX A5, X, INCX2 + + STFDUX A6, Y, INCY + STFSDUX A6, Y, INCY + LFPDUX A6, X, INCX2 + + STFDUX A7, Y, INCY + STFSDUX A7, Y, INCY + LFPDUX A7, X, INCX2 + + STFDUX A8, Y, INCY + STFSDUX A8, Y, INCY + LFPDUX A8, X, INCX2 + bdnz LL(62) + .align 4 + +LL(63): + STFDUX A1, Y, INCY + STFSDUX A1, Y, INCY + STFDUX A2, Y, INCY + STFSDUX A2, Y, INCY + STFDUX A3, Y, INCY + STFSDUX A3, Y, INCY + STFDUX A4, Y, INCY + STFSDUX A4, Y, INCY + STFDUX A5, Y, INCY + STFSDUX A5, Y, INCY + STFDUX A6, Y, INCY + STFSDUX A6, Y, INCY + STFDUX A7, Y, INCY + STFSDUX A7, Y, INCY + STFDUX A8, Y, INCY + STFSDUX A8, Y, INCY + .align 4 + +LL(65): + andi. r0, N, 15 + beq LL(999) + + andi. 
r0, N, 8 + beq LL(66) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + STFDUX A1, Y, INCY + STFSDUX A1, Y, INCY + STFDUX A2, Y, INCY + STFSDUX A2, Y, INCY + STFDUX A3, Y, INCY + STFSDUX A3, Y, INCY + STFDUX A4, Y, INCY + STFSDUX A4, Y, INCY + .align 4 + +LL(66): + andi. r0, N, 4 + beq LL(67) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + + STFDUX A1, Y, INCY + STFSDUX A1, Y, INCY + STFDUX A2, Y, INCY + STFSDUX A2, Y, INCY + .align 4 + +LL(67): + andi. r0, N, 2 + beq LL(68) + + LFPDUX A1, X, INCX2 + + STFDUX A1, Y, INCY + STFSDUX A1, Y, INCY + .align 4 + +LL(68): + andi. r0, N, 1 + beq LL(999) + + LFDUX A1, X, INCX2 + STFDUX A1, Y, INCY + b LL(999) + .align 4 + +LL(100): + sub X, X, INCX + sub Y, Y, INCY + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(115) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + LFDUX A5, X, INCX + LFDUX A6, X, INCX + LFDUX A7, X, INCX + LFDUX A8, X, INCX + bdz LL(113) + .align 4 + +LL(112): + STFDUX A1, Y, INCY + LFDUX A1, X, INCX + STFDUX A2, Y, INCY + LFDUX A2, X, INCX + STFDUX A3, Y, INCY + LFDUX A3, X, INCX + STFDUX A4, Y, INCY + LFDUX A4, X, INCX + + STFDUX A5, Y, INCY + LFDUX A5, X, INCX + STFDUX A6, Y, INCY + LFDUX A6, X, INCX + STFDUX A7, Y, INCY + LFDUX A7, X, INCX + STFDUX A8, Y, INCY + LFDUX A8, X, INCX + bdnz LL(112) + .align 4 + +LL(113): + STFDUX A1, Y, INCY + STFDUX A2, Y, INCY + STFDUX A3, Y, INCY + STFDUX A4, Y, INCY + STFDUX A5, Y, INCY + STFDUX A6, Y, INCY + STFDUX A7, Y, INCY + STFDUX A8, Y, INCY + .align 4 + +LL(115): + andi. r0, N, 7 + beq LL(999) + andi. r0, N, 4 + beq LL(117) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + + STFDUX A1, Y, INCY + STFDUX A2, Y, INCY + STFDUX A3, Y, INCY + STFDUX A4, Y, INCY + .align 4 + +LL(117): + andi. r0, N, 2 + beq LL(118) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + + STFDUX A1, Y, INCY + STFDUX A2, Y, INCY + .align 4 + +LL(118): + andi. r0, N, 1 + beq LL(999) + + LFDUX A1, X, INCX + STFDUX A1, Y, INCY + .align 4 + +LL(999): + li r10, 16 + addi SP, SP, -16 + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + + addi SP, SP, 16 + blr + + EPILOGUE diff --git a/kernel/power/dnrm2_hummer.S b/kernel/power/dnrm2_hummer.S new file mode 100644 index 0000000000..4faa6c96cc --- /dev/null +++ b/kernel/power/dnrm2_hummer.S @@ -0,0 +1,1066 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define INCX2 r6 +#define X2 r7 + +#define XX r8 + +#define C1 f1 +#define C2 f0 +#define C3 f2 +#define C4 f3 + +#define ALPHA f4 +#define ALPHA_R f5 + +#define A1 f6 +#define A2 f7 +#define A3 f8 +#define A4 f9 +#define A5 f10 +#define A6 f11 +#define A7 f12 +#define A8 f13 + +#define F1 f14 +#define F2 f15 +#define F3 f16 +#define F4 f17 +#define F5 f18 +#define F6 f19 +#define F7 f20 +#define F8 f21 + +#define T1 f22 +#define T2 f23 +#define T3 f24 +#define T4 f25 +#define T5 f26 +#define T6 f27 +#define T7 f28 +#define T8 f29 + + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + + stfpdux f16, SP, r10 + stfpdux f17, SP, r10 + stfpdux f18, SP, r10 + stfpdux f19, SP, r10 + + stfpdux f20, SP, r10 + stfpdux f21, SP, r10 + stfpdux f22, SP, r10 + stfpdux f23, SP, r10 + + stfpdux f24, SP, r10 + stfpdux f25, SP, r10 + stfpdux f26, SP, r10 + stfpdux f27, SP, r10 + + stfpdux f28, SP, r10 + stfpdux f29, SP, r10 + + li r10, 0 + lis r11, 0x3f80 + stwu r11, -4(SP) + stwu r11, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + lfpsx C1, SP, r10 # Zero clear + + slwi INCX, INCX, BASE_SHIFT + add INCX2, INCX, INCX + + fpmr C2, C1 + fpmr C3, C1 + fpmr C4, C1 + + cmpwi cr0, N, 0 + ble LL(99) + cmpwi cr0, INCX, 0 + ble LL(99) + + mr XX, X + + cmpwi cr0, INCX, SIZE + bne LL(100) + + andi. r0, X, 2 * SIZE - 1 + beq LL(05) + + LFD C1, 0 * SIZE(X) + add X, X, INCX + + addi N, N, -1 + cmpwi cr0, N, 0 + fabs C1, C1 + ble LL(20) + .align 4 + +LL(05): + sub X, X, INCX2 + + srawi. 
r0, N, 4 + mtspr CTR, r0 + beq- LL(15) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + LFPDUX A5, X, INCX2 + fpabs T1, A1 + LFPDUX A6, X, INCX2 + fpabs T2, A2 + LFPDUX A7, X, INCX2 + fpabs T3, A3 + LFPDUX A8, X, INCX2 + fpabs T4, A4 + bdz LL(13) + .align 4 + +LL(12): + fpsub F1, C1, T1 + LFPDUX A1, X, INCX2 + fpsub F2, C2, T2 + LFPDUX A2, X, INCX2 + fpsub F3, C3, T3 + LFPDUX A3, X, INCX2 + fpsub F4, C4, T4 + LFPDUX A4, X, INCX2 + + fpabs T5, A5 + fpabs T6, A6 + fpabs T7, A7 + fpabs T8, A8 + + fpsel C1, F1, C1, T1 + LFPDUX A5, X, INCX2 + fpsel C2, F2, C2, T2 + LFPDUX A6, X, INCX2 + fpsel C3, F3, C3, T3 + LFPDUX A7, X, INCX2 + fpsel C4, F4, C4, T4 + LFPDUX A8, X, INCX2 + + fpsub F5, C1, T5 + fpsub F6, C2, T6 + fpsub F7, C3, T7 + fpsub F8, C4, T8 + + fpabs T1, A1 + fpabs T2, A2 + fpabs T3, A3 + fpabs T4, A4 + + fpsel C1, F5, C1, T5 + fpsel C2, F6, C2, T6 + fpsel C3, F7, C3, T7 + fpsel C4, F8, C4, T8 + bdnz LL(12) + .align 4 + +LL(13): + fpabs T5, A5 + fpabs T6, A6 + fpabs T7, A7 + fpabs T8, A8 + + fpsub F1, C1, T1 + fpsub F2, C2, T2 + fpsub F3, C3, T3 + fpsub F4, C4, T4 + + fpsel C1, F1, C1, T1 + fpsel C2, F2, C2, T2 + fpsel C3, F3, C3, T3 + fpsel C4, F4, C4, T4 + + fpsub F5, C1, T5 + fpsub F6, C2, T6 + fpsub F7, C3, T7 + fpsub F8, C4, T8 + + fpsel C1, F5, C1, T5 + fpsel C2, F6, C2, T6 + fpsel C3, F7, C3, T7 + fpsel C4, F8, C4, T8 + .align 4 + +LL(15): + andi. r0, N, 15 + beq LL(20) + + andi. r0, N, 8 + beq LL(16) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + fpabs A1, A1 + fpabs A2, A2 + fpabs A3, A3 + fpabs A4, A4 + + fpsub F1, C1, A1 + fpsub F2, C2, A2 + fpsub F3, C3, A3 + fpsub F4, C4, A4 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + .align 4 + +LL(16): + andi. r0, N, 4 + beq LL(17) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + + fpabs A1, A1 + fpabs A2, A2 + + fpsub F1, C1, A1 + fpsub F2, C2, A2 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + .align 4 + +LL(17): + andi. r0, N, 2 + beq LL(18) + + LFPDUX A1, X, INCX2 + fpabs A1, A1 + fpsub F1, C1, A1 + fpsel C1, F1, C1, A1 + .align 4 + +LL(18): + andi. r0, N, 1 + beq LL(20) + + LFDUX A1, X, INCX2 + fabs A1, A1 + fsub F1, C1, A1 + fsel C1, F1, C1, A1 + .align 4 + +LL(20): + fpsub F1, C1, C2 + fpsub F2, C3, C4 + + fpsel C1, F1, C1, C2 + fpsel C3, F2, C3, C4 + + fpsub F1, C1, C3 + fpsel C1, F1, C1, C3 + + fsmtp C2, C1 + + fsub F1, C1, C2 + fsel ALPHA, F1, C1, C2 + + li r10, 0 + + lfs ALPHA_R, 8(SP) # load 1.0 + fdiv ALPHA_R, ALPHA_R, ALPHA + + lfpsx C1, SP, r10 # Zero clear + + fpmr C2, C1 + fpmr C3, C1 + fpmr C4, C1 + fsmfp ALPHA_R, ALPHA_R + + andi. r0, XX, 2 * SIZE - 1 + beq LL(21) + + LFD C1, 0 * SIZE(XX) + add XX, XX, INCX + + cmpwi cr0, N, 0 + fmul C1, ALPHA_R, C1 + fmul C1, C1, C1 + ble LL(998) + .align 4 + +LL(21): + sub XX, XX, INCX2 + + srawi. 
r0, N, 4 + mtspr CTR, r0 + beq- LL(25) + + LFPDUX A1, XX, INCX2 + LFPDUX A2, XX, INCX2 + LFPDUX A3, XX, INCX2 + LFPDUX A4, XX, INCX2 + + LFPDUX A5, XX, INCX2 + LFPDUX A6, XX, INCX2 + LFPDUX A7, XX, INCX2 + LFPDUX A8, XX, INCX2 + + fpmul T1, ALPHA_R, A1 + fpmul T2, ALPHA_R, A2 + fpmul T3, ALPHA_R, A3 + fpmul T4, ALPHA_R, A4 + + bdz LL(23) + .align 4 + +LL(22): + fpmadd C1, T1, T1, C1 + LFPDUX A1, XX, INCX2 + fpmul T1, ALPHA_R, A5 + LFPDUX A2, XX, INCX2 + + fpmadd C2, T2, T2, C2 + LFPDUX A3, XX, INCX2 + fpmul T2, ALPHA_R, A6 + LFPDUX A4, XX, INCX2 + + fpmadd C3, T3, T3, C3 + fpmul T3, ALPHA_R, A7 + fpmadd C4, T4, T4, C4 + fpmul T4, ALPHA_R, A8 + + fpmadd C1, T1, T1, C1 + LFPDUX A5, XX, INCX2 + fpmul T1, ALPHA_R, A1 + LFPDUX A6, XX, INCX2 + + fpmadd C2, T2, T2, C2 + LFPDUX A7, XX, INCX2 + fpmul T2, ALPHA_R, A2 + LFPDUX A8, XX, INCX2 + + fpmadd C3, T3, T3, C3 + fpmul T3, ALPHA_R, A3 + fpmadd C4, T4, T4, C4 + fpmul T4, ALPHA_R, A4 + bdnz LL(22) + .align 4 + +LL(23): + fpmadd C1, T1, T1, C1 + fpmul T1, ALPHA_R, A5 + fpmadd C2, T2, T2, C2 + fpmul T2, ALPHA_R, A6 + + fpmadd C3, T3, T3, C3 + fpmul T3, ALPHA_R, A7 + fpmadd C4, T4, T4, C4 + fpmul T4, ALPHA_R, A8 + + fpmadd C1, T1, T1, C1 + fpmadd C2, T2, T2, C2 + fpmadd C3, T3, T3, C3 + fpmadd C4, T4, T4, C4 + .align 4 + +LL(25): + andi. r0, N, 15 + beq LL(98) + + andi. r0, N, 8 + beq LL(26) + + LFPDUX A1, XX, INCX2 + LFPDUX A2, XX, INCX2 + LFPDUX A3, XX, INCX2 + LFPDUX A4, XX, INCX2 + + fpmul A1, ALPHA_R, A1 + fpmul A2, ALPHA_R, A2 + fpmul A3, ALPHA_R, A3 + fpmul A4, ALPHA_R, A4 + + fpmadd C1, A1, A1, C1 + fpmadd C2, A2, A2, C2 + fpmadd C3, A3, A3, C3 + fpmadd C4, A4, A4, C4 + .align 4 + +LL(26): + andi. r0, N, 4 + beq LL(27) + + LFPDUX A1, XX, INCX2 + LFPDUX A2, XX, INCX2 + fpmul A1, ALPHA_R, A1 + fpmul A2, ALPHA_R, A2 + + fpmadd C1, A1, A1, C1 + fpmadd C2, A2, A2, C2 + .align 4 + +LL(27): + andi. r0, N, 2 + beq LL(28) + + LFPDUX A1, XX, INCX2 + fpmul A1, ALPHA_R, A1 + fpmadd C1, A1, A1, C1 + .align 4 + +LL(28): + andi. 
r0, N, 1 + beq LL(98) + + LFDUX A1, XX, INCX2 + fmul A1, ALPHA_R, A1 + fmadd C1, A1, A1, C1 + .align 4 + +LL(98): + fpadd C1, C1, C2 + lis r3, 0x3f00 + fpadd C3, C3, C4 + lis r4, 0x4040 + + stw r3, 4(SP) + stw r4, 8(SP) + + fpadd C1, C1, C3 + lfs f10, 0(SP) + + fsmtp C2, C1 + lfs f11, 4(SP) + fadd C1, C2, C1 + lfs f12, 8(SP) + + fcmpu cr0, f10, C1 + beq cr0, LL(99) + +#ifndef HUMMER_EMULATOR + frsqrte f9, C1 + li r10, 16 + + fmul f2, f1, f9 + lfpdux f29, SP, r10 + fmul f3, f9, f11 + lfpdux f28, SP, r10 + fnmsub f7, f2, f9, f12 + lfpdux f27, SP, r10 + fmul f9, f3, f7 + lfpdux f26, SP, r10 + fadd f13, f11, f11 + lfpdux f25, SP, r10 + fmul f12, f1, f9 + lfpdux f24, SP, r10 + fmul f11, f12, f11 + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + fnmsub f1, f12, f9, f13 + lfpdux f21, SP, r10 + + lfpdux f20, SP, r10 + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + fmadd f1, f11, f1, f12 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + fmul C1, ALPHA, C1 + addi SP, SP, 16 + blr +#else + fsqrt C1, C1 + + li r10, 16 + + lfpdux f29, SP, r10 + lfpdux f28, SP, r10 + lfpdux f27, SP, r10 + lfpdux f26, SP, r10 + lfpdux f25, SP, r10 + lfpdux f24, SP, r10 + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + + lfpdux f20, SP, r10 + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + fmul C1, ALPHA, C1 + addi SP, SP, 16 + blr +#endif + .align 4 + +LL(99): + li r10, 16 + + lfpdux f29, SP, r10 + lfpdux f28, SP, r10 + lfpdux f27, SP, r10 + lfpdux f26, SP, r10 + lfpdux f25, SP, r10 + lfpdux f24, SP, r10 + + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + lfpdux f20, SP, r10 + + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + blr + .align 4 + +LL(100): + sub X, X, INCX + + srawi. 
r0, N, 4 + mtspr CTR, r0 + beq- LL(105) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + LFSDUX A1, X, INCX + LFSDUX A2, X, INCX + LFSDUX A3, X, INCX + LFSDUX A4, X, INCX + + LFDUX A5, X, INCX + LFDUX A6, X, INCX + LFDUX A7, X, INCX + LFDUX A8, X, INCX + LFSDUX A5, X, INCX + fpabs T1, A1 + LFSDUX A6, X, INCX + fpabs T2, A2 + LFSDUX A7, X, INCX + fpabs T3, A3 + LFSDUX A8, X, INCX + fpabs T4, A4 + bdz LL(103) + .align 4 + +LL(102): + fpsub F1, C1, T1 + LFDUX A1, X, INCX + fpsub F2, C2, T2 + LFDUX A2, X, INCX + fpsub F3, C3, T3 + LFDUX A3, X, INCX + fpsub F4, C4, T4 + LFDUX A4, X, INCX + + fpabs T5, A5 + LFSDUX A1, X, INCX + fpabs T6, A6 + LFSDUX A2, X, INCX + fpabs T7, A7 + LFSDUX A3, X, INCX + fpabs T8, A8 + LFSDUX A4, X, INCX + + fpsel C1, F1, C1, T1 + LFDUX A5, X, INCX + fpsel C2, F2, C2, T2 + LFDUX A6, X, INCX + fpsel C3, F3, C3, T3 + LFDUX A7, X, INCX + fpsel C4, F4, C4, T4 + LFDUX A8, X, INCX + + fpsub F5, C1, T5 + LFSDUX A5, X, INCX + fpsub F6, C2, T6 + LFSDUX A6, X, INCX + fpsub F7, C3, T7 + LFSDUX A7, X, INCX + fpsub F8, C4, T8 + LFSDUX A8, X, INCX + + fpabs T1, A1 + fpabs T2, A2 + fpabs T3, A3 + fpabs T4, A4 + + fpsel C1, F5, C1, T5 + fpsel C2, F6, C2, T6 + fpsel C3, F7, C3, T7 + fpsel C4, F8, C4, T8 + bdnz LL(102) + .align 4 + +LL(103): + fpabs T5, A5 + fpabs T6, A6 + fpabs T7, A7 + fpabs T8, A8 + + fpsub F1, C1, T1 + fpsub F2, C2, T2 + fpsub F3, C3, T3 + fpsub F4, C4, T4 + + fpsel C1, F1, C1, T1 + fpsel C2, F2, C2, T2 + fpsel C3, F3, C3, T3 + fpsel C4, F4, C4, T4 + + fpsub F5, C1, T5 + fpsub F6, C2, T6 + fpsub F7, C3, T7 + fpsub F8, C4, T8 + + fpsel C1, F5, C1, T5 + fpsel C2, F6, C2, T6 + fpsel C3, F7, C3, T7 + fpsel C4, F8, C4, T8 + .align 4 + +LL(105): + andi. r0, N, 15 + beq LL(120) + + andi. r0, N, 8 + beq LL(106) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + LFSDUX A1, X, INCX + LFSDUX A2, X, INCX + LFSDUX A3, X, INCX + LFSDUX A4, X, INCX + + fpabs A1, A1 + fpabs A2, A2 + fpabs A3, A3 + fpabs A4, A4 + + fpsub F1, C1, A1 + fpsub F2, C2, A2 + fpsub F3, C3, A3 + fpsub F4, C4, A4 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + .align 4 + +LL(106): + andi. r0, N, 4 + beq LL(107) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + + fabs A1, A1 + fabs A2, A2 + fabs A3, A3 + fabs A4, A4 + + fsub F1, C1, A1 + fsub F2, C2, A2 + fsub F3, C3, A3 + fsub F4, C4, A4 + + fsel C1, F1, C1, A1 + fsel C2, F2, C2, A2 + fsel C3, F3, C3, A3 + fsel C4, F4, C4, A4 + .align 4 + +LL(107): + andi. r0, N, 2 + beq LL(108) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + fabs A1, A1 + fabs A2, A2 + fsub F1, C1, A1 + fsub F2, C2, A2 + fsel C1, F1, C1, A1 + fsel C2, F2, C2, A2 + .align 4 + +LL(108): + andi. r0, N, 1 + beq LL(120) + + LFDUX A1, X, INCX + fabs A1, A1 + fsub F1, C1, A1 + fsel C1, F1, C1, A1 + .align 4 + +LL(120): + fpsub F1, C1, C2 + fpsub F2, C3, C4 + + fpsel C1, F1, C1, C2 + fpsel C3, F2, C3, C4 + + fpsub F1, C1, C3 + fpsel C1, F1, C1, C3 + + fsmtp C2, C1 + + fsub F1, C1, C2 + fsel ALPHA, F1, C1, C2 + + li r10, 0 + + lfs ALPHA_R, 8(SP) # load 1.0 + fdiv ALPHA_R, ALPHA_R, ALPHA + + lfpsx C1, SP, r10 # Zero clear + + fpmr C2, C1 + fpmr C3, C1 + fpmr C4, C1 + fsmfp ALPHA_R, ALPHA_R + + sub XX, XX, INCX + + srawi. 
r0, N, 4 + mtspr CTR, r0 + beq- LL(125) + + LFDUX A1, XX, INCX + LFDUX A2, XX, INCX + LFDUX A3, XX, INCX + LFDUX A4, XX, INCX + LFSDUX A1, XX, INCX + LFSDUX A2, XX, INCX + LFSDUX A3, XX, INCX + LFSDUX A4, XX, INCX + + LFDUX A5, XX, INCX + LFDUX A6, XX, INCX + LFDUX A7, XX, INCX + LFDUX A8, XX, INCX + LFSDUX A5, XX, INCX + fpmul T1, ALPHA_R, A1 + LFSDUX A6, XX, INCX + fpmul T2, ALPHA_R, A2 + LFSDUX A7, XX, INCX + fpmul T3, ALPHA_R, A3 + LFSDUX A8, XX, INCX + fpmul T4, ALPHA_R, A4 + bdz LL(123) + .align 4 + +LL(122): + fpmadd C1, T1, T1, C1 + LFDUX A1, XX, INCX + fpmul T1, ALPHA_R, A5 + LFDUX A2, XX, INCX + + fpmadd C2, T2, T2, C2 + LFDUX A3, XX, INCX + fpmul T2, ALPHA_R, A6 + LFDUX A4, XX, INCX + + fpmadd C3, T3, T3, C3 + LFSDUX A1, XX, INCX + fpmul T3, ALPHA_R, A7 + LFSDUX A2, XX, INCX + + fpmadd C4, T4, T4, C4 + LFSDUX A3, XX, INCX + fpmul T4, ALPHA_R, A8 + LFSDUX A4, XX, INCX + + fpmadd C1, T1, T1, C1 + LFDUX A5, XX, INCX + fpmul T1, ALPHA_R, A1 + LFDUX A6, XX, INCX + + fpmadd C2, T2, T2, C2 + LFDUX A7, XX, INCX + fpmul T2, ALPHA_R, A2 + LFDUX A8, XX, INCX + + fpmadd C3, T3, T3, C3 + LFSDUX A5, XX, INCX + fpmul T3, ALPHA_R, A3 + LFSDUX A6, XX, INCX + fpmadd C4, T4, T4, C4 + LFSDUX A7, XX, INCX + fpmul T4, ALPHA_R, A4 + LFSDUX A8, XX, INCX + bdnz LL(122) + .align 4 + +LL(123): + fpmadd C1, T1, T1, C1 + fpmul T1, ALPHA_R, A5 + fpmadd C2, T2, T2, C2 + fpmul T2, ALPHA_R, A6 + fpmadd C3, T3, T3, C3 + fpmul T3, ALPHA_R, A7 + fpmadd C4, T4, T4, C4 + fpmul T4, ALPHA_R, A8 + + fpmadd C1, T1, T1, C1 + fpmadd C2, T2, T2, C2 + fpmadd C3, T3, T3, C3 + fpmadd C4, T4, T4, C4 + .align 4 + +LL(125): + andi. r0, N, 15 + beq LL(998) + + andi. r0, N, 8 + beq LL(126) + + LFDUX A1, XX, INCX + LFDUX A2, XX, INCX + LFDUX A3, XX, INCX + LFDUX A4, XX, INCX + LFSDUX A1, XX, INCX + LFSDUX A2, XX, INCX + LFSDUX A3, XX, INCX + LFSDUX A4, XX, INCX + + fpmul A1, ALPHA_R, A1 + fpmul A2, ALPHA_R, A2 + fpmul A3, ALPHA_R, A3 + fpmul A4, ALPHA_R, A4 + + fpmadd C1, A1, A1, C1 + fpmadd C2, A2, A2, C2 + fpmadd C3, A3, A3, C3 + fpmadd C4, A4, A4, C4 + .align 4 + +LL(126): + andi. r0, N, 4 + beq LL(127) + + LFDUX A1, XX, INCX + LFDUX A2, XX, INCX + LFDUX A3, XX, INCX + LFDUX A4, XX, INCX + + fmul A1, ALPHA_R, A1 + fmul A2, ALPHA_R, A2 + fmul A3, ALPHA_R, A3 + fmul A4, ALPHA_R, A4 + + fmadd C1, A1, A1, C1 + fmadd C2, A2, A2, C2 + fmadd C3, A3, A3, C3 + fmadd C4, A4, A4, C4 + .align 4 + +LL(127): + andi. r0, N, 2 + beq LL(128) + + LFDUX A1, XX, INCX + LFDUX A2, XX, INCX + + fmul A1, ALPHA_R, A1 + fmul A2, ALPHA_R, A2 + fmadd C1, A1, A1, C1 + fmadd C2, A2, A2, C2 + .align 4 + +LL(128): + andi. 
r0, N, 1 + beq LL(998) + + LFDUX A1, XX, INCX + fmul A1, ALPHA_R, A1 + fmadd C1, A1, A1, C1 + .align 4 + +LL(998): + fpadd C1, C1, C2 + lis r3, 0x3f00 + fpadd C3, C3, C4 + lis r4, 0x4040 + + stw r3, 4(SP) + stw r4, 8(SP) + + fpadd C1, C1, C3 + lfs f10, 0(SP) + + fsmtp C2, C1 + lfs f11, 4(SP) + fadd C1, C2, C1 + lfs f12, 8(SP) + + fcmpu cr0, f10, C1 + beq cr0, LL(999) + +#ifndef HUMMER_EMULATOR + frsqrte f9, C1 + li r10, 16 + + fmul f2, f1, f9 + lfpdux f29, SP, r10 + fmul f3, f9, f11 + lfpdux f28, SP, r10 + fnmsub f7, f2, f9, f12 + lfpdux f27, SP, r10 + fmul f9, f3, f7 + lfpdux f26, SP, r10 + fadd f13, f11, f11 + lfpdux f25, SP, r10 + fmul f12, f1, f9 + lfpdux f24, SP, r10 + fmul f11, f12, f11 + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + fnmsub f1, f12, f9, f13 + + lfpdux f20, SP, r10 + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + fmadd f1, f11, f1, f12 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + fmul C1, ALPHA, C1 + addi SP, SP, 16 + blr +#else + fsqrt C1, C1 + li r10, 16 + + lfpdux f29, SP, r10 + lfpdux f28, SP, r10 + lfpdux f27, SP, r10 + lfpdux f26, SP, r10 + lfpdux f25, SP, r10 + lfpdux f24, SP, r10 + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + lfpdux f20, SP, r10 + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + fmul C1, ALPHA, C1 + addi SP, SP, 16 + blr +#endif + .align 4 + +LL(999): + li r10, 16 + + lfpdux f29, SP, r10 + lfpdux f28, SP, r10 + lfpdux f27, SP, r10 + lfpdux f26, SP, r10 + lfpdux f25, SP, r10 + lfpdux f24, SP, r10 + + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + lfpdux f20, SP, r10 + + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + blr + EPILOGUE diff --git a/kernel/power/dnrm2_ppc440.S b/kernel/power/dnrm2_ppc440.S new file mode 100644 index 0000000000..6be9eadf38 --- /dev/null +++ b/kernel/power/dnrm2_ppc440.S @@ -0,0 +1,556 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define NN r6 +#define XX r7 + +#define PRE r8 + +#define FZERO 144(SP) +#define FONE 148(SP) +#define FMAX 152(SP) +#define C1 156(SP) +#define C2 160(SP) + +#define STACKSIZE 168 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r10, 0 + lis r11, 0x3f80 + lis r12, 0x5fe0 + lis r6, 0x3f00 + lis r7, 0x4040 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r10, FZERO + stw r11, FONE + stw r12, FMAX + stw r10, 4 + FMAX + stw r6, C1 + stw r7, C2 + + lfs f1, FZERO + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, BASE_SHIFT + sub X, X, INCX + + li PRE, 3 * 16 * SIZE + + cmpwi cr0, N, 0 + ble- LL(999) + cmpwi cr0, INCX, 0 + ble- LL(999) + + mr NN, N + mr XX, X + + LFDUX f1, X, INCX + + fabs f0, f1 + fabs f2, f1 + fabs f3, f1 + fabs f4, f1 + fabs f5, f1 + fabs f6, f1 + fabs f7, f1 + fabs f1, f1 + subi N, N, 1 + + cmpwi cr0, N, 0 + ble- LL(999) + + srawi. 
r0, N, 4 + mtspr CTR, r0 + beq- LL(50) + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + + fabs f8, f24 + LFDUX f24, X, INCX + fabs f9, f25 + LFDUX f25, X, INCX + fabs f10, f26 + LFDUX f26, X, INCX + fabs f11, f27 + LFDUX f27, X, INCX + + fabs f12, f28 + LFDUX f28, X, INCX + fabs f13, f29 + LFDUX f29, X, INCX + fabs f14, f30 + LFDUX f30, X, INCX + fabs f15, f31 + LFDUX f31, X, INCX + bdz LL(20) + .align 4 + +LL(10): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 +#ifdef PPCG4 + dcbt X, PRE +#endif + fabs f8, f24 + LFDUX f24, X, INCX + fsel f1, f17, f1, f9 + fabs f9, f25 + LFDUX f25, X, INCX + fsel f2, f18, f2, f10 + fabs f10, f26 + LFDUX f26, X, INCX + fsel f3, f19, f3, f11 + fabs f11, f27 + LFDUX f27, X, INCX + + fsel f4, f20, f4, f12 +#ifdef PPCG4 + dcbt X, PRE +#endif + fabs f12, f28 + LFDUX f28, X, INCX + fsel f5, f21, f5, f13 + fabs f13, f29 + LFDUX f29, X, INCX + fsel f6, f22, f6, f14 + fabs f14, f30 + LFDUX f30, X, INCX + fsel f7, f23, f7, f15 + fabs f15, f31 + LFDUX f31, X, INCX + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 +#ifdef PPCG4 + dcbt X, PRE +#endif + fabs f8, f24 + LFDUX f24, X, INCX + fsel f1, f17, f1, f9 + fabs f9, f25 + LFDUX f25, X, INCX + fsel f2, f18, f2, f10 + fabs f10, f26 + LFDUX f26, X, INCX + fsel f3, f19, f3, f11 + fabs f11, f27 + LFDUX f27, X, INCX + + fsel f4, f20, f4, f12 +#ifdef PPCG4 + dcbt X, PRE +#endif + fabs f12, f28 + LFDUX f28, X, INCX + fsel f5, f21, f5, f13 + fabs f13, f29 + LFDUX f29, X, INCX + fsel f6, f22, f6, f14 + fabs f14, f30 + LFDUX f30, X, INCX + fsel f7, f23, f7, f15 + fabs f15, f31 + LFDUX f31, X, INCX + bdnz LL(10) + .align 4 + +LL(20): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + fsel f2, f18, f2, f10 + fsel f3, f19, f3, f11 + fsel f4, f20, f4, f12 + fsel f5, f21, f5, f13 + fsel f6, f22, f6, f14 + fsel f7, f23, f7, f15 + .align 4 + +LL(50): + andi. 
r0, N, 15 + mtspr CTR, r0 + beq LL(99) + .align 4 + +LL(60): + LFDUX f8, X, INCX + fabs f8, f8 + fsub f16, f1, f8 + fsel f1, f16, f1, f8 + bdnz LL(60) + .align 4 + +LL(99): + fsub f8, f0, f1 + fsub f9, f2, f3 + fsub f10, f4, f5 + fsub f11, f6, f7 + + fsel f0, f8, f0, f1 + fsel f2, f9, f2, f3 + fsel f4, f10, f4, f5 + fsel f6, f11, f6, f7 + + fsub f8, f0, f2 + fsub f9, f4, f6 + fsel f0, f8, f0, f2 + fsel f4, f9, f4, f6 + + fsub f8, f0, f4 + fsel f31, f8, f0, f4 + + lfs f1, FZERO + lfs f0, FONE + lfd f2, FMAX + + fcmpu cr0, f1, f31 + beq- cr0, LL(999) + + fdiv f30, f0, f31 + + fmr f0, f1 + fmr f2, f1 + fmr f3, f1 + fmr f4, f1 + fmr f5, f1 + fmr f6, f1 + fmr f7, f1 + + srawi. r0, NN, 4 + mtspr CTR, r0 + beq- cr0, LL(150) + + LFDUX f8, XX, INCX + LFDUX f9, XX, INCX + LFDUX f10, XX, INCX + LFDUX f11, XX, INCX + LFDUX f12, XX, INCX + LFDUX f13, XX, INCX + LFDUX f14, XX, INCX + LFDUX f15, XX, INCX + + fmul f16, f30, f8 + LFDUX f8, XX, INCX + fmul f17, f30, f9 + LFDUX f9, XX, INCX + fmul f18, f30, f10 + LFDUX f10, XX, INCX + fmul f19, f30, f11 + LFDUX f11, XX, INCX + + fmul f20, f30, f12 + LFDUX f12, XX, INCX + fmul f21, f30, f13 + LFDUX f13, XX, INCX + fmul f22, f30, f14 + LFDUX f14, XX, INCX + fmul f23, f30, f15 + LFDUX f15, XX, INCX + bdz LL(120) + .align 4 + +LL(110): + fmadd f0, f16, f16, f0 +#ifdef PPCG4 + dcbt XX, PRE +#endif + fmul f16, f30, f8 + LFDUX f8, XX, INCX + fmadd f1, f17, f17, f1 + fmul f17, f30, f9 + LFDUX f9, XX, INCX + fmadd f2, f18, f18, f2 + fmul f18, f30, f10 + LFDUX f10, XX, INCX + fmadd f3, f19, f19, f3 + fmul f19, f30, f11 + LFDUX f11, XX, INCX + + fmadd f4, f20, f20, f4 +#ifdef PPCG4 + dcbt XX, PRE +#endif + fmul f20, f30, f12 + LFDUX f12, XX, INCX + fmadd f5, f21, f21, f5 + fmul f21, f30, f13 + LFDUX f13, XX, INCX + fmadd f6, f22, f22, f6 + fmul f22, f30, f14 + LFDUX f14, XX, INCX + fmadd f7, f23, f23, f7 + fmul f23, f30, f15 + LFDUX f15, XX, INCX + + fmadd f0, f16, f16, f0 +#ifdef PPCG4 + dcbt XX, PRE +#endif + fmul f16, f30, f8 + LFDUX f8, XX, INCX + fmadd f1, f17, f17, f1 + fmul f17, f30, f9 + LFDUX f9, XX, INCX + fmadd f2, f18, f18, f2 + fmul f18, f30, f10 + LFDUX f10, XX, INCX + fmadd f3, f19, f19, f3 + fmul f19, f30, f11 + LFDUX f11, XX, INCX + + fmadd f4, f20, f20, f4 +#ifdef PPCG4 + dcbt XX, PRE +#endif + fmul f20, f30, f12 + LFDUX f12, XX, INCX + fmadd f5, f21, f21, f5 + fmul f21, f30, f13 + LFDUX f13, XX, INCX + fmadd f6, f22, f22, f6 + fmul f22, f30, f14 + LFDUX f14, XX, INCX + fmadd f7, f23, f23, f7 + fmul f23, f30, f15 + LFDUX f15, XX, INCX + bdnz LL(110) + .align 4 + +LL(120): + fmadd f0, f16, f16, f0 + fmul f16, f30, f8 + fmadd f1, f17, f17, f1 + fmul f17, f30, f9 + fmadd f2, f18, f18, f2 + fmul f18, f30, f10 + fmadd f3, f19, f19, f3 + fmul f19, f30, f11 + + fmadd f4, f20, f20, f4 + fmul f20, f30, f12 + fmadd f5, f21, f21, f5 + fmul f21, f30, f13 + fmadd f6, f22, f22, f6 + fmul f22, f30, f14 + fmadd f7, f23, f23, f7 + fmul f23, f30, f15 + + fmadd f0, f16, f16, f0 + fmadd f1, f17, f17, f1 + fmadd f2, f18, f18, f2 + fmadd f3, f19, f19, f3 + fmadd f4, f20, f20, f4 + fmadd f5, f21, f21, f5 + fmadd f6, f22, f22, f6 + fmadd f7, f23, f23, f7 + .align 4 + +LL(150): + andi. 
r0, NN, 15 + mtspr CTR, r0 + beq- cr0, LL(170) + .align 4 + +LL(160): + LFDUX f8, XX, INCX + + fmul f16, f30, f8 + fmadd f0, f16, f16, f0 + bdnz LL(160) + .align 4 + +LL(170): + fadd f0, f0, f1 + fadd f2, f2, f3 + fadd f4, f4, f5 + fadd f6, f6, f7 + + fadd f0, f0, f2 + fadd f4, f4, f6 + + fadd f1, f0, f4 + + frsqrte f0, f1 + lfs f8, C1 + lfs f9, C2 + + fmul f2, f1, f0 + fadd f7, f8, f8 + fmul f3, f0, f8 + fnmsub f4, f2, f0, f9 + fmul f0, f3, f4 + + fmul f2, f1, f0 + fmul f3, f0, f8 + fnmsub f4, f2, f0, f9 + fmul f0, f3, f4 + + fmul f2, f1, f0 + fmul f3, f0, f8 + fnmsub f4, f2, f0, f9 + fmul f0, f3, f4 + + fmul f5, f1, f0 + fmul f2, f5, f8 + fnmsub f3, f5, f0, f7 + fmadd f1, f2, f3, f5 + fmul f1, f31, f1 + .align 4 + +LL(999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + EPILOGUE diff --git a/kernel/power/dot.S b/kernel/power/dot.S new file mode 100644 index 0000000000..724b0c3c15 --- /dev/null +++ b/kernel/power/dot.S @@ -0,0 +1,468 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
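Both dnrm2 kernels above (the hummer variant and this ppc440 variant) implement the same overflow-safe two-pass scheme: a first sweep over X finds the largest absolute value with fabs/fsel (or the paired fpabs/fpsel), a second sweep accumulates the squares of the elements scaled by the reciprocal of that maximum, and the result is the maximum times the square root of the accumulated sum; the square root itself comes from an frsqrte estimate refined by Newton-Raphson steps using the 0.5 and 3.0 constants loaded near the end. A minimal scalar C sketch of that scheme, with an illustrative function name and none of the unrolling or prefetching:

#include <math.h>

/* Overflow-safe two-pass Euclidean norm, mirroring the structure of the
 * kernels above: pass 1 finds max|x_i|, pass 2 sums the squares of the
 * elements scaled by 1/max, and the result is max * sqrt(sum).
 * Assumes n > 0 and incx > 0; the kernels branch to their epilogue for
 * other inputs. Function name is illustrative. */
double dnrm2_ref(long n, const double *x, long incx)
{
    double xmax = 0.0, sum = 0.0, scale;
    long i;

    if (n <= 0 || incx <= 0) return 0.0;

    /* Pass 1: largest absolute value (the fabs/fsel reduction). */
    for (i = 0; i < n; i++) {
        double a = fabs(x[i * incx]);
        if (a > xmax) xmax = a;
    }
    if (xmax == 0.0) return 0.0;

    /* Pass 2: sum of squares of the scaled elements (the fmul/fmadd loop). */
    scale = 1.0 / xmax;
    for (i = 0; i < n; i++) {
        double a = x[i * incx] * scale;
        sum += a * a;
    }
    return xmax * sqrt(sum);   /* the kernels use frsqrte + Newton-Raphson */
}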
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 +#define Y r6 +#define INCY r7 +#define PREA r8 + +#define FZERO f0 + +#define STACKSIZE 96 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + + stw r0, 80(SP) + lfs FZERO, 80(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) + LDINT INCY, 0(INCY) +#endif + + slwi INCX, INCX, BASE_SHIFT + slwi INCY, INCY, BASE_SHIFT + + fmr f1, FZERO + fmr f2, FZERO + fmr f3, FZERO + fmr f4, FZERO + fmr f5, FZERO + fmr f6, FZERO + fmr f7, FZERO + +#ifdef L1_DUALFETCH + li PREA, (L1_PREFETCHSIZE) / 2 +#else + li PREA, (L1_PREFETCHSIZE) +#endif + + cmpwi cr0, N, 0 + ble- cr0, LL(999) + + cmpwi cr0, INCX, SIZE + bne cr0, LL(100) + cmpwi cr0, INCY, SIZE + bne cr0, LL(100) + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(50) + .align 4 + + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + LFD f10, 2 * SIZE(X) + LFD f11, 3 * SIZE(X) + + LFD f16, 0 * SIZE(Y) + LFD f17, 1 * SIZE(Y) + LFD f18, 2 * SIZE(Y) + LFD f19, 3 * SIZE(Y) + + LFD f12, 4 * SIZE(X) + LFD f13, 5 * SIZE(X) + LFD f14, 6 * SIZE(X) + LFD f15, 7 * SIZE(X) + + LFD f20, 4 * SIZE(Y) + LFD f21, 5 * SIZE(Y) + LFD f22, 6 * SIZE(Y) + LFD f23, 7 * SIZE(Y) + bdz LL(20) + .align 4 + +LL(10): + FMADD f0, f8, f16, f0 + FMADD f1, f9, f17, f1 + FMADD f2, f10, f18, f2 + FMADD f3, f11, f19, f3 + + LFD f8, 8 * SIZE(X) + LFD f9, 9 * SIZE(X) + LFD f10, 10 * SIZE(X) + LFD f11, 11 * SIZE(X) + + LFD f16, 8 * SIZE(Y) + LFD f17, 9 * SIZE(Y) + LFD f18, 10 * SIZE(Y) + LFD f19, 11 * SIZE(Y) + + FMADD f4, f12, f20, f4 + FMADD f5, f13, f21, f5 + FMADD f6, f14, f22, f6 + FMADD f7, f15, f23, f7 + + LFD f12, 12 * SIZE(X) + LFD f13, 13 * SIZE(X) + LFD f14, 14 * SIZE(X) + LFD f15, 15 * SIZE(X) + + LFD f20, 12 * SIZE(Y) + LFD f21, 13 * SIZE(Y) + LFD f22, 14 * SIZE(Y) + LFD f23, 15 * SIZE(Y) + + FMADD f0, f8, f16, f0 + FMADD f1, f9, f17, f1 + FMADD f2, f10, f18, f2 + FMADD f3, f11, f19, f3 + + LFD f8, 16 * SIZE(X) + LFD f9, 17 * SIZE(X) + LFD f10, 18 * SIZE(X) + LFD f11, 19 * SIZE(X) + + LFD f16, 16 * SIZE(Y) + LFD f17, 17 * SIZE(Y) + LFD f18, 18 * SIZE(Y) + LFD f19, 19 * SIZE(Y) + + FMADD f4, f12, f20, f4 + FMADD f5, f13, f21, f5 + FMADD f6, f14, f22, f6 + FMADD f7, f15, f23, f7 + + LFD f12, 20 * SIZE(X) + LFD f13, 21 * SIZE(X) + LFD f14, 22 * SIZE(X) + LFD f15, 23 * SIZE(X) + + LFD f20, 20 * SIZE(Y) + LFD f21, 21 * SIZE(Y) + LFD f22, 22 * SIZE(Y) + LFD f23, 23 * SIZE(Y) + +#ifndef POWER6 + L1_PREFETCH X, PREA +#ifdef L1_DUALFETCH + L1_PREFETCH Y, PREA +#endif +#endif + addi X, X, 16 * SIZE + addi Y, Y, 16 * SIZE + +#ifdef POWER6 + L1_PREFETCH X, PREA +#ifdef L1_DUALFETCH + L1_PREFETCH Y, PREA +#endif +#endif + bdnz LL(10) + .align 4 + +LL(20): + FMADD f0, f8, f16, f0 + FMADD f1, f9, f17, f1 + FMADD f2, f10, f18, f2 + FMADD f3, f11, f19, f3 + + LFD f8, 8 * SIZE(X) + LFD f9, 9 * SIZE(X) + LFD f10, 10 * SIZE(X) + LFD f11, 11 * SIZE(X) + + LFD f16, 8 * SIZE(Y) + LFD f17, 9 * SIZE(Y) + LFD f18, 10 * SIZE(Y) + LFD f19, 11 * SIZE(Y) + + FMADD f4, f12, f20, f4 + FMADD f5, f13, f21, f5 + FMADD f6, f14, f22, f6 + FMADD f7, f15, f23, f7 + + LFD f12, 12 * SIZE(X) + LFD f13, 13 * SIZE(X) + LFD f14, 14 * SIZE(X) + LFD f15, 15 * SIZE(X) + + LFD f20, 12 * SIZE(Y) + LFD f21, 13 * SIZE(Y) + LFD f22, 14 * SIZE(Y) + LFD f23, 15 
* SIZE(Y) + + FMADD f0, f8, f16, f0 + FMADD f1, f9, f17, f1 + FMADD f2, f10, f18, f2 + FMADD f3, f11, f19, f3 + + FMADD f4, f12, f20, f4 + FMADD f5, f13, f21, f5 + FMADD f6, f14, f22, f6 + FMADD f7, f15, f23, f7 + + addi X, X, 16 * SIZE + addi Y, Y, 16 * SIZE + .align 4 + +LL(50): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + LFD f16, 0 * SIZE(Y) + addi X, X, 1 * SIZE + addi Y, Y, 1 * SIZE + + FMADD f0, f8, f16, f0 + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): +#ifdef F_INTERFACE + cmpwi cr0, INCX, 0 + bge+ LL(102) + + subi r0, N, 1 + mullw r0, r0, INCX + sub X, X, r0 + .align 4 + +LL(102): + cmpwi cr0, INCY, 0 + bge+ LL(104) + + subi r0, N, 1 + mullw r0, r0, INCY + sub Y, Y, r0 + .align 4 + +LL(104): +#endif + sub X, X, INCX + sub Y, Y, INCY + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(150) + + LFDUX f8, X, INCX + LFDUX f16, Y, INCY + LFDUX f9, X, INCX + LFDUX f17, Y, INCY + + LFDUX f10, X, INCX + LFDUX f18, Y, INCY + LFDUX f11, X, INCX + LFDUX f19, Y, INCY + + LFDUX f12, X, INCX + LFDUX f20, Y, INCY + LFDUX f13, X, INCX + LFDUX f21, Y, INCY + + LFDUX f14, X, INCX + LFDUX f22, Y, INCY + LFDUX f15, X, INCX + LFDUX f23, Y, INCY + bdz LL(120) + .align 4 + +LL(110): + FMADD f0, f8, f16, f0 + FMADD f1, f9, f17, f1 + FMADD f2, f10, f18, f2 + FMADD f3, f11, f19, f3 + + LFDUX f8, X, INCX + LFDUX f16, Y, INCY + LFDUX f9, X, INCX + LFDUX f17, Y, INCY + + LFDUX f10, X, INCX + LFDUX f18, Y, INCY + LFDUX f11, X, INCX + LFDUX f19, Y, INCY + + FMADD f4, f12, f20, f4 + FMADD f5, f13, f21, f5 + FMADD f6, f14, f22, f6 + FMADD f7, f15, f23, f7 + + LFDUX f12, X, INCX + LFDUX f20, Y, INCY + LFDUX f13, X, INCX + LFDUX f21, Y, INCY + + LFDUX f14, X, INCX + LFDUX f22, Y, INCY + LFDUX f15, X, INCX + LFDUX f23, Y, INCY + + FMADD f0, f8, f16, f0 + FMADD f1, f9, f17, f1 + FMADD f2, f10, f18, f2 + FMADD f3, f11, f19, f3 + + LFDUX f8, X, INCX + LFDUX f16, Y, INCY + LFDUX f9, X, INCX + LFDUX f17, Y, INCY + + LFDUX f10, X, INCX + LFDUX f18, Y, INCY + LFDUX f11, X, INCX + LFDUX f19, Y, INCY + + FMADD f4, f12, f20, f4 + FMADD f5, f13, f21, f5 + FMADD f6, f14, f22, f6 + FMADD f7, f15, f23, f7 + + LFDUX f12, X, INCX + LFDUX f20, Y, INCY + LFDUX f13, X, INCX + LFDUX f21, Y, INCY + + LFDUX f14, X, INCX + LFDUX f22, Y, INCY + LFDUX f15, X, INCX + LFDUX f23, Y, INCY + + bdnz LL(110) + .align 4 + +LL(120): + FMADD f0, f8, f16, f0 + FMADD f1, f9, f17, f1 + FMADD f2, f10, f18, f2 + FMADD f3, f11, f19, f3 + + LFDUX f8, X, INCX + LFDUX f16, Y, INCY + LFDUX f9, X, INCX + LFDUX f17, Y, INCY + + LFDUX f10, X, INCX + LFDUX f18, Y, INCY + LFDUX f11, X, INCX + LFDUX f19, Y, INCY + + FMADD f4, f12, f20, f4 + FMADD f5, f13, f21, f5 + FMADD f6, f14, f22, f6 + FMADD f7, f15, f23, f7 + + LFDUX f12, X, INCX + LFDUX f20, Y, INCY + LFDUX f13, X, INCX + LFDUX f21, Y, INCY + + LFDUX f14, X, INCX + LFDUX f22, Y, INCY + LFDUX f15, X, INCX + LFDUX f23, Y, INCY + + FMADD f0, f8, f16, f0 + FMADD f1, f9, f17, f1 + FMADD f2, f10, f18, f2 + FMADD f3, f11, f19, f3 + FMADD f4, f12, f20, f4 + FMADD f5, f13, f21, f5 + FMADD f6, f14, f22, f6 + FMADD f7, f15, f23, f7 + .align 4 + +LL(150): + andi. 
r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + LFDUX f16, Y, INCY + FMADD f0, f8, f16, f0 + bdnz LL(160) + .align 4 + +LL(999): + FADD f0, f0, f1 + FADD f2, f2, f3 + FADD f4, f4, f5 + FADD f6, f6, f7 + + FADD f0, f0, f2 + FADD f4, f4, f6 + FADD f1, f0, f4 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/dot_cell.S b/kernel/power/dot_cell.S new file mode 100644 index 0000000000..617fb13563 --- /dev/null +++ b/kernel/power/dot_cell.S @@ -0,0 +1,458 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 +#define Y r6 +#define INCY r7 +#define PREA r8 + +#define FZERO f0 + +#define STACKSIZE 96 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + + stw r0, 80(SP) + lfs FZERO, 80(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) + LDINT INCY, 0(INCY) +#endif + + slwi INCX, INCX, BASE_SHIFT + slwi INCY, INCY, BASE_SHIFT + + fmr f1, FZERO + fmr f2, FZERO + fmr f3, FZERO + fmr f4, FZERO + fmr f5, FZERO + fmr f6, FZERO + fmr f7, FZERO + + li PREA, 16 * 20 * SIZE + + cmpwi cr0, N, 0 + ble- cr0, LL(999) + + cmpwi cr0, INCX, SIZE + bne cr0, LL(100) + cmpwi cr0, INCY, SIZE + bne cr0, LL(100) + + srawi. 
r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(50) + .align 4 + + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + LFD f10, 2 * SIZE(X) + LFD f11, 3 * SIZE(X) + + LFD f16, 0 * SIZE(Y) + LFD f17, 1 * SIZE(Y) + LFD f18, 2 * SIZE(Y) + LFD f19, 3 * SIZE(Y) + + LFD f12, 4 * SIZE(X) + LFD f13, 5 * SIZE(X) + LFD f14, 6 * SIZE(X) + LFD f15, 7 * SIZE(X) + + LFD f20, 4 * SIZE(Y) + LFD f21, 5 * SIZE(Y) + LFD f22, 6 * SIZE(Y) + LFD f23, 7 * SIZE(Y) + bdz LL(20) + .align 4 + +LL(10): + FMADD f0, f8, f16, f0 + LFD f8, 8 * SIZE(X) + LFD f16, 8 * SIZE(Y) + + FMADD f1, f9, f17, f1 + LFD f9, 9 * SIZE(X) + LFD f17, 9 * SIZE(Y) + + FMADD f2, f10, f18, f2 + LFD f10, 10 * SIZE(X) + LFD f18, 10 * SIZE(Y) + + FMADD f3, f11, f19, f3 + LFD f11, 11 * SIZE(X) + LFD f19, 11 * SIZE(Y) + + FMADD f4, f12, f20, f4 + LFD f12, 12 * SIZE(X) + LFD f20, 12 * SIZE(Y) + + FMADD f5, f13, f21, f5 + LFD f13, 13 * SIZE(X) + LFD f21, 13 * SIZE(Y) + + FMADD f6, f14, f22, f6 + LFD f14, 14 * SIZE(X) + LFD f22, 14 * SIZE(Y) + + FMADD f7, f15, f23, f7 + LFD f15, 15 * SIZE(X) + LFD f23, 15 * SIZE(Y) + + FMADD f0, f8, f16, f0 + LFD f8, 16 * SIZE(X) + LFD f16, 16 * SIZE(Y) + + FMADD f1, f9, f17, f1 + LFD f9, 17 * SIZE(X) + LFD f17, 17 * SIZE(Y) + + FMADD f2, f10, f18, f2 + LFD f10, 18 * SIZE(X) + LFD f18, 18 * SIZE(Y) + + FMADD f3, f11, f19, f3 + LFD f11, 19 * SIZE(X) + LFD f19, 19 * SIZE(Y) + + FMADD f4, f12, f20, f4 + LFD f12, 20 * SIZE(X) + LFD f20, 20 * SIZE(Y) + + FMADD f5, f13, f21, f5 + LFD f13, 21 * SIZE(X) + LFD f21, 21 * SIZE(Y) + + FMADD f6, f14, f22, f6 + LFD f14, 22 * SIZE(X) + LFD f22, 22 * SIZE(Y) + + FMADD f7, f15, f23, f7 + LFD f15, 23 * SIZE(X) + LFD f23, 23 * SIZE(Y) + + dcbt X, PREA + addi X, X, 16 * SIZE + dcbt Y, PREA + addi Y, Y, 16 * SIZE + bdnz LL(10) + .align 4 + +LL(20): + FMADD f0, f8, f16, f0 + LFD f8, 8 * SIZE(X) + LFD f16, 8 * SIZE(Y) + + FMADD f1, f9, f17, f1 + LFD f9, 9 * SIZE(X) + LFD f17, 9 * SIZE(Y) + + FMADD f2, f10, f18, f2 + LFD f10, 10 * SIZE(X) + LFD f18, 10 * SIZE(Y) + + FMADD f3, f11, f19, f3 + LFD f11, 11 * SIZE(X) + LFD f19, 11 * SIZE(Y) + + FMADD f4, f12, f20, f4 + LFD f12, 12 * SIZE(X) + LFD f20, 12 * SIZE(Y) + + FMADD f5, f13, f21, f5 + LFD f13, 13 * SIZE(X) + LFD f21, 13 * SIZE(Y) + + FMADD f6, f14, f22, f6 + LFD f14, 14 * SIZE(X) + LFD f22, 14 * SIZE(Y) + + FMADD f7, f15, f23, f7 + LFD f15, 15 * SIZE(X) + LFD f23, 15 * SIZE(Y) + + FMADD f0, f8, f16, f0 + FMADD f1, f9, f17, f1 + FMADD f2, f10, f18, f2 + FMADD f3, f11, f19, f3 + + FMADD f4, f12, f20, f4 + FMADD f5, f13, f21, f5 + FMADD f6, f14, f22, f6 + addi X, X, 16 * SIZE + FMADD f7, f15, f23, f7 + addi Y, Y, 16 * SIZE + .align 4 + +LL(50): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + LFD f16, 0 * SIZE(Y) + addi X, X, 1 * SIZE + addi Y, Y, 1 * SIZE + + FMADD f0, f8, f16, f0 + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): +#ifdef F_INTERFACE + cmpwi cr0, INCX, 0 + bge+ LL(102) + + subi r0, N, 1 + mullw r0, r0, INCX + sub X, X, r0 + .align 4 + +LL(102): + cmpwi cr0, INCY, 0 + bge+ LL(104) + + subi r0, N, 1 + mullw r0, r0, INCY + sub Y, Y, r0 + .align 4 + +LL(104): +#endif + sub X, X, INCX + sub Y, Y, INCY + + srawi. 
r0, N, 4 + mtspr CTR, r0 + beq- LL(150) + + LFDUX f8, X, INCX + LFDUX f16, Y, INCY + LFDUX f9, X, INCX + LFDUX f17, Y, INCY + + LFDUX f10, X, INCX + LFDUX f18, Y, INCY + LFDUX f11, X, INCX + LFDUX f19, Y, INCY + + LFDUX f12, X, INCX + LFDUX f20, Y, INCY + LFDUX f13, X, INCX + LFDUX f21, Y, INCY + + LFDUX f14, X, INCX + LFDUX f22, Y, INCY + LFDUX f15, X, INCX + LFDUX f23, Y, INCY + bdz LL(120) + .align 4 + +LL(110): + FMADD f0, f8, f16, f0 + FMADD f1, f9, f17, f1 + FMADD f2, f10, f18, f2 + FMADD f3, f11, f19, f3 + + LFDUX f8, X, INCX + LFDUX f16, Y, INCY + LFDUX f9, X, INCX + LFDUX f17, Y, INCY + + LFDUX f10, X, INCX + LFDUX f18, Y, INCY + LFDUX f11, X, INCX + LFDUX f19, Y, INCY + + FMADD f4, f12, f20, f4 + FMADD f5, f13, f21, f5 + FMADD f6, f14, f22, f6 + FMADD f7, f15, f23, f7 + + LFDUX f12, X, INCX + LFDUX f20, Y, INCY + LFDUX f13, X, INCX + LFDUX f21, Y, INCY + + LFDUX f14, X, INCX + LFDUX f22, Y, INCY + LFDUX f15, X, INCX + LFDUX f23, Y, INCY + + FMADD f0, f8, f16, f0 + FMADD f1, f9, f17, f1 + FMADD f2, f10, f18, f2 + FMADD f3, f11, f19, f3 + + LFDUX f8, X, INCX + LFDUX f16, Y, INCY + LFDUX f9, X, INCX + LFDUX f17, Y, INCY + + LFDUX f10, X, INCX + LFDUX f18, Y, INCY + LFDUX f11, X, INCX + LFDUX f19, Y, INCY + + FMADD f4, f12, f20, f4 + FMADD f5, f13, f21, f5 + FMADD f6, f14, f22, f6 + FMADD f7, f15, f23, f7 + + LFDUX f12, X, INCX + LFDUX f20, Y, INCY + LFDUX f13, X, INCX + LFDUX f21, Y, INCY + + LFDUX f14, X, INCX + LFDUX f22, Y, INCY + LFDUX f15, X, INCX + LFDUX f23, Y, INCY + + bdnz LL(110) + .align 4 + +LL(120): + FMADD f0, f8, f16, f0 + FMADD f1, f9, f17, f1 + FMADD f2, f10, f18, f2 + FMADD f3, f11, f19, f3 + + LFDUX f8, X, INCX + LFDUX f16, Y, INCY + LFDUX f9, X, INCX + LFDUX f17, Y, INCY + + LFDUX f10, X, INCX + LFDUX f18, Y, INCY + LFDUX f11, X, INCX + LFDUX f19, Y, INCY + + FMADD f4, f12, f20, f4 + FMADD f5, f13, f21, f5 + FMADD f6, f14, f22, f6 + FMADD f7, f15, f23, f7 + + LFDUX f12, X, INCX + LFDUX f20, Y, INCY + LFDUX f13, X, INCX + LFDUX f21, Y, INCY + + LFDUX f14, X, INCX + LFDUX f22, Y, INCY + LFDUX f15, X, INCX + LFDUX f23, Y, INCY + + FMADD f0, f8, f16, f0 + FMADD f1, f9, f17, f1 + FMADD f2, f10, f18, f2 + FMADD f3, f11, f19, f3 + FMADD f4, f12, f20, f4 + FMADD f5, f13, f21, f5 + FMADD f6, f14, f22, f6 + FMADD f7, f15, f23, f7 + .align 4 + +LL(150): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + LFDUX f16, Y, INCY + FMADD f0, f8, f16, f0 + bdnz LL(160) + .align 4 + +LL(999): + FADD f0, f0, f1 + FADD f2, f2, f3 + FADD f4, f4, f5 + FADD f6, f6, f7 + + FADD f0, f0, f2 + FADD f4, f4, f6 + FADD f1, f0, f4 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/dot_hummer.S b/kernel/power/dot_hummer.S new file mode 100644 index 0000000000..14a378090f --- /dev/null +++ b/kernel/power/dot_hummer.S @@ -0,0 +1,879 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 +#define Y r6 +#define INCY r7 + +#define INCX2 r8 +#define INCY2 r9 + +#define C1 f1 +#define C2 f0 +#define C3 f2 +#define C4 f3 + +#define A1 f4 +#define A2 f5 +#define A3 f6 +#define A4 f7 +#define A5 f8 +#define A6 f9 +#define A7 f10 +#define A8 f11 +#define A9 f20 + +#define B1 f12 +#define B2 f13 +#define B3 f14 +#define B4 f15 +#define B5 f16 +#define B6 f17 +#define B7 f18 +#define B8 f19 +#define B9 f20 + + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + + stfpdux f16, SP, r10 + stfpdux f17, SP, r10 + stfpdux f18, SP, r10 + stfpdux f19, SP, r10 + + stfpdux f20, SP, r10 + + li r10, 0 + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) + LDINT INCY, 0(INCY) +#endif + + lfpdx C1, SP, r10 # Zero clear + + slwi INCX, INCX, BASE_SHIFT + add INCX2, INCX, INCX + fpmr C2, C1 + + slwi INCY, INCY, BASE_SHIFT + fpmr C3, C1 + add INCY2, INCY, INCY + fpmr C4, C1 + + cmpwi cr0, N, 0 + ble LL(999) + + cmpwi cr0, INCX, SIZE + bne LL(100) + cmpwi cr0, INCY, SIZE + bne LL(100) + + +/* X is aligned, Y is aligned */ +LL(10): + andi. r0, X, 2 * SIZE - 1 + bne LL(30) + + andi. r0, Y, 2 * SIZE - 1 + bne LL(20) + + sub X, X, INCX2 + sub Y, Y, INCY2 + + srawi. 
r0, N, 4 + mtspr CTR, r0 + beq- LL(15) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + LFPDUX A2, X, INCX2 + LFPDUX B2, Y, INCY2 + + LFPDUX A3, X, INCX2 + LFPDUX B3, Y, INCY2 + LFPDUX A4, X, INCX2 + LFPDUX B4, Y, INCY2 + + LFPDUX A5, X, INCX2 + LFPDUX B5, Y, INCY2 + LFPDUX A6, X, INCX2 + LFPDUX B6, Y, INCY2 + + LFPDUX A7, X, INCX2 + LFPDUX B7, Y, INCY2 + LFPDUX A8, X, INCX2 + LFPDUX B8, Y, INCY2 + bdz LL(14) + .align 4 + +LL(13): + fpmadd C1, A1, B1, C1 + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + fpmadd C2, A2, B2, C2 + LFPDUX A2, X, INCX2 + LFPDUX B2, Y, INCY2 + fpmadd C3, A3, B3, C3 + LFPDUX A3, X, INCX2 + LFPDUX B3, Y, INCY2 + fpmadd C4, A4, B4, C4 + LFPDUX A4, X, INCX2 + LFPDUX B4, Y, INCY2 + + fpmadd C1, A5, B5, C1 + LFPDUX A5, X, INCX2 + LFPDUX B5, Y, INCY2 + fpmadd C2, A6, B6, C2 + LFPDUX A6, X, INCX2 + LFPDUX B6, Y, INCY2 + fpmadd C3, A7, B7, C3 + LFPDUX A7, X, INCX2 + LFPDUX B7, Y, INCY2 + fpmadd C4, A8, B8, C4 + LFPDUX A8, X, INCX2 + LFPDUX B8, Y, INCY2 + + bdnz LL(13) + .align 4 + +LL(14): + fpmadd C1, A1, B1, C1 + fpmadd C2, A2, B2, C2 + fpmadd C3, A3, B3, C3 + fpmadd C4, A4, B4, C4 + fpmadd C1, A5, B5, C1 + fpmadd C2, A6, B6, C2 + fpmadd C3, A7, B7, C3 + fpmadd C4, A8, B8, C4 + .align 4 + +LL(15): + andi. r0, N, 15 + beq LL(999) + + andi. r0, N, 8 + beq LL(16) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + LFPDUX A2, X, INCX2 + LFPDUX B2, Y, INCY2 + LFPDUX A3, X, INCX2 + LFPDUX B3, Y, INCY2 + LFPDUX A4, X, INCX2 + LFPDUX B4, Y, INCY2 + + fpmadd C1, A1, B1, C1 + fpmadd C2, A2, B2, C2 + fpmadd C3, A3, B3, C3 + fpmadd C4, A4, B4, C4 + .align 4 + +LL(16): + andi. r0, N, 4 + beq LL(17) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + LFPDUX A2, X, INCX2 + LFPDUX B2, Y, INCY2 + + fpmadd C1, A1, B1, C1 + fpmadd C2, A2, B2, C2 + .align 4 + +LL(17): + andi. r0, N, 2 + beq LL(18) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + + fpmadd C1, A1, B1, C1 + .align 4 + +LL(18): + andi. r0, N, 1 + beq LL(999) + + LFDUX A1, X, INCX2 + LFDUX B1, Y, INCY2 + + fmadd C1, A1, B1, C1 + b LL(999) + .align 4 + +/* X is aligned, Y is NOT aligned */ + +LL(20): + LFD B1, 0 * SIZE(Y) + sub X, X, INCX2 + sub Y, Y, INCY + + srawi. 
r0, N, 4 + mtspr CTR, r0 + beq- LL(25) + + LFPDUX A1, X, INCX2 + LFXDUX B2, Y, INCY2 + LFPDUX A2, X, INCX2 + LFXDUX B3, Y, INCY2 + + LFPDUX A3, X, INCX2 + LFXDUX B4, Y, INCY2 + LFPDUX A4, X, INCX2 + LFXDUX B5, Y, INCY2 + + LFPDUX A5, X, INCX2 + LFXDUX B6, Y, INCY2 + LFPDUX A6, X, INCX2 + LFXDUX B7, Y, INCY2 + + LFPDUX A7, X, INCX2 + fsmr B1, B2 + LFXDUX B8, Y, INCY2 + fsmr B2, B3 + LFPDUX A8, X, INCX2 + fsmr B3, B4 + bdz LL(24) + .align 4 + +LL(23): + fpmadd C1, A1, B1, C1 + LFPDUX A1, X, INCX2 + fsmr B4, B5 + LFXDUX B9, Y, INCY2 + + fpmadd C2, A2, B2, C2 + LFPDUX A2, X, INCX2 + fsmr B5, B6 + LFXDUX B2, Y, INCY2 + + fpmadd C3, A3, B3, C3 + LFXDUX B3, Y, INCY2 + fsmr B6, B7 + LFPDUX A3, X, INCX2 + + fpmadd C4, A4, B4, C4 + LFXDUX B4, Y, INCY2 + fsmr B7, B8 + LFPDUX A4, X, INCX2 + + fpmadd C1, A5, B5, C1 + LFXDUX B5, Y, INCY2 + fsmr B8, B9 + LFPDUX A5, X, INCX2 + + fpmadd C2, A6, B6, C2 + LFXDUX B6, Y, INCY2 + fpmr B1, B9 + LFPDUX A6, X, INCX2 + + fpmadd C3, A7, B7, C3 + LFXDUX B7, Y, INCY2 + fsmr B1, B2 + LFPDUX A7, X, INCX2 + + fpmadd C4, A8, B8, C4 + LFXDUX B8, Y, INCY2 + fsmr B2, B3 + LFPDUX A8, X, INCX2 + + fsmr B3, B4 + bdnz LL(23) + .align 4 + +LL(24): + LFXDUX B9, Y, INCY2 + fpmadd C1, A1, B1, C1 + fsmr B4, B5 + fpmadd C2, A2, B2, C2 + fsmr B5, B6 + fpmadd C3, A3, B3, C3 + fsmr B6, B7 + fpmadd C4, A4, B4, C4 + fsmr B7, B8 + fpmadd C1, A5, B5, C1 + fsmr B8, B9 + fpmadd C2, A6, B6, C2 + fpmr B1, B9 + fpmadd C3, A7, B7, C3 + fpmadd C4, A8, B8, C4 + .align 4 + +LL(25): + andi. r0, N, 15 + beq LL(999) + + andi. r0, N, 8 + beq LL(26) + + LFPDUX A1, X, INCX2 + LFXDUX B2, Y, INCY2 + LFPDUX A2, X, INCX2 + LFXDUX B3, Y, INCY2 + LFPDUX A3, X, INCX2 + LFXDUX B4, Y, INCY2 + LFPDUX A4, X, INCX2 + LFXDUX B5, Y, INCY2 + + fsmr B1, B2 + fsmr B2, B3 + fsmr B3, B4 + fsmr B4, B5 + + fpmadd C1, A1, B1, C1 + fpmadd C2, A2, B2, C2 + fpmadd C3, A3, B3, C3 + fpmadd C4, A4, B4, C4 + fpmr B1, B5 + .align 4 + +LL(26): + andi. r0, N, 4 + beq LL(27) + + LFPDUX A1, X, INCX2 + LFXDUX B2, Y, INCY2 + LFPDUX A2, X, INCX2 + LFXDUX B3, Y, INCY2 + + fsmr B1, B2 + fsmr B2, B3 + fpmadd C1, A1, B1, C1 + fpmr B1, B3 + fpmadd C2, A2, B2, C2 + .align 4 + +LL(27): + andi. r0, N, 2 + beq LL(28) + + LFPDUX A1, X, INCX2 + LFXDUX B2, Y, INCY2 + fsmr B1, B2 + fpmadd C1, A1, B1, C1 + fpmr B1, B2 + .align 4 + +LL(28): + andi. r0, N, 1 + beq LL(999) + + LFDUX A1, X, INCX2 + fmadd C1, A1, B1, C1 + b LL(999) + .align 4 + +/* X is not aligned, Y is aligned */ +LL(30): + andi. r0, Y, 2 * SIZE - 1 + bne LL(40) + + LFD A1, 0 * SIZE(X) + sub X, X, INCX + sub Y, Y, INCY2 + + srawi. 
r0, N, 4 + mtspr CTR, r0 + beq- LL(35) + + LFXDUX A2, X, INCX2 + LFPDUX B1, Y, INCY2 + LFXDUX A3, X, INCX2 + LFPDUX B2, Y, INCY2 + + LFXDUX A4, X, INCX2 + LFPDUX B3, Y, INCY2 + LFXDUX A5, X, INCX2 + LFPDUX B4, Y, INCY2 + + LFXDUX A6, X, INCX2 + LFPDUX B5, Y, INCY2 + LFXDUX A7, X, INCX2 + LFPDUX B6, Y, INCY2 + + LFXDUX A8, X, INCX2 + fsmr A1, A2 + LFPDUX B7, Y, INCY2 + fsmr A2, A3 + LFPDUX B8, Y, INCY2 + fsmr A3, A4 + bdz LL(34) + .align 4 + +LL(33): + fpmadd C1, A1, B1, C1 + LFXDUX A9, X, INCX2 + fsmr A4, A5 + LFPDUX B1, Y, INCY2 + + fpmadd C2, A2, B2, C2 + LFXDUX A2, X, INCX2 + fsmr A5, A6 + LFPDUX B2, Y, INCY2 + + fpmadd C3, A3, B3, C3 + LFXDUX A3, X, INCX2 + fsmr A6, A7 + LFPDUX B3, Y, INCY2 + + fpmadd C4, A4, B4, C4 + LFXDUX A4, X, INCX2 + fsmr A7, A8 + LFPDUX B4, Y, INCY2 + + fpmadd C1, A5, B5, C1 + LFXDUX A5, X, INCX2 + fsmr A8, A9 + LFPDUX B5, Y, INCY2 + + fpmadd C2, A6, B6, C2 + LFXDUX A6, X, INCX2 + fpmr A1, A9 + LFPDUX B6, Y, INCY2 + + fpmadd C3, A7, B7, C3 + LFXDUX A7, X, INCX2 + fsmr A1, A2 + LFPDUX B7, Y, INCY2 + + fpmadd C4, A8, B8, C4 + LFXDUX A8, X, INCX2 + fsmr A2, A3 + LFPDUX B8, Y, INCY2 + + fsmr A3, A4 + bdnz LL(33) + .align 4 + +LL(34): + LFXDUX A9, X, INCX2 + fpmadd C1, A1, B1, C1 + fsmr A4, A5 + fpmadd C2, A2, B2, C2 + fsmr A5, A6 + fpmadd C3, A3, B3, C3 + fsmr A6, A7 + fpmadd C4, A4, B4, C4 + fsmr A7, A8 + fpmadd C1, A5, B5, C1 + fsmr A8, A9 + fpmadd C2, A6, B6, C2 + fpmr A1, A9 + fpmadd C3, A7, B7, C3 + fpmadd C4, A8, B8, C4 + .align 4 + +LL(35): + andi. r0, N, 15 + beq LL(999) + + andi. r0, N, 8 + beq LL(36) + + LFXDUX A2, X, INCX2 + LFPDUX B1, Y, INCY2 + LFXDUX A3, X, INCX2 + LFPDUX B2, Y, INCY2 + LFXDUX A4, X, INCX2 + LFPDUX B3, Y, INCY2 + LFXDUX A5, X, INCX2 + LFPDUX B4, Y, INCY2 + + fsmr A1, A2 + fsmr A2, A3 + fsmr A3, A4 + fsmr A4, A5 + + fpmadd C1, A1, B1, C1 + fpmr A1, A5 + fpmadd C2, A2, B2, C2 + fpmadd C3, A3, B3, C3 + fpmadd C4, A4, B4, C4 + .align 4 + +LL(36): + andi. r0, N, 4 + beq LL(37) + + LFXDUX A2, X, INCX2 + LFPDUX B1, Y, INCY2 + LFXDUX A3, X, INCX2 + LFPDUX B2, Y, INCY2 + + fsmr A1, A2 + fsmr A2, A3 + fpmadd C1, A1, B1, C1 + fpmr A1, A3 + fpmadd C2, A2, B2, C2 + .align 4 + +LL(37): + andi. r0, N, 2 + beq LL(38) + + LFXDUX A2, X, INCX2 + LFPDUX B1, Y, INCY2 + + fsmr A1, A2 + fpmadd C1, A1, B1, C1 + fpmr A1, A2 + .align 4 + +LL(38): + andi. r0, N, 1 + beq LL(999) + + LFDUX B1, Y, INCY2 + fmadd C1, A1, B1, C1 + b LL(999) + .align 4 + +/* X is NOT aligned, Y is NOT aligned */ +LL(40): + LFD A1, 0 * SIZE(X) + LFD B1, 0 * SIZE(Y) + + sub X, X, INCX + sub Y, Y, INCY + + addi N, N, -1 + cmpwi cr0, N, 0 + fmadd C1, A1, B1, C1 + ble LL(999) + + srawi. 
r0, N, 4 + mtspr CTR, r0 + beq- LL(45) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + LFPDUX A2, X, INCX2 + LFPDUX B2, Y, INCY2 + + LFPDUX A3, X, INCX2 + LFPDUX B3, Y, INCY2 + LFPDUX A4, X, INCX2 + LFPDUX B4, Y, INCY2 + + LFPDUX A5, X, INCX2 + LFPDUX B5, Y, INCY2 + LFPDUX A6, X, INCX2 + LFPDUX B6, Y, INCY2 + + LFPDUX A7, X, INCX2 + LFPDUX B7, Y, INCY2 + LFPDUX A8, X, INCX2 + LFPDUX B8, Y, INCY2 + bdz LL(44) + .align 4 + +LL(43): + fpmadd C1, A1, B1, C1 + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + fpmadd C2, A2, B2, C2 + LFPDUX A2, X, INCX2 + LFPDUX B2, Y, INCY2 + fpmadd C3, A3, B3, C3 + LFPDUX A3, X, INCX2 + LFPDUX B3, Y, INCY2 + fpmadd C4, A4, B4, C4 + LFPDUX A4, X, INCX2 + LFPDUX B4, Y, INCY2 + fpmadd C1, A5, B5, C1 + LFPDUX A5, X, INCX2 + LFPDUX B5, Y, INCY2 + fpmadd C2, A6, B6, C2 + LFPDUX A6, X, INCX2 + LFPDUX B6, Y, INCY2 + fpmadd C3, A7, B7, C3 + LFPDUX A7, X, INCX2 + LFPDUX B7, Y, INCY2 + fpmadd C4, A8, B8, C4 + LFPDUX A8, X, INCX2 + LFPDUX B8, Y, INCY2 + bdnz LL(43) + .align 4 + +LL(44): + fpmadd C1, A1, B1, C1 + fpmadd C2, A2, B2, C2 + fpmadd C3, A3, B3, C3 + fpmadd C4, A4, B4, C4 + fpmadd C1, A5, B5, C1 + fpmadd C2, A6, B6, C2 + fpmadd C3, A7, B7, C3 + fpmadd C4, A8, B8, C4 + .align 4 + +LL(45): + andi. r0, N, 15 + beq LL(999) + + andi. r0, N, 8 + beq LL(46) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + LFPDUX A2, X, INCX2 + LFPDUX B2, Y, INCY2 + LFPDUX A3, X, INCX2 + LFPDUX B3, Y, INCY2 + LFPDUX A4, X, INCX2 + LFPDUX B4, Y, INCY2 + + fpmadd C1, A1, B1, C1 + fpmadd C2, A2, B2, C2 + fpmadd C3, A3, B3, C3 + fpmadd C4, A4, B4, C4 + .align 4 + +LL(46): + andi. r0, N, 4 + beq LL(47) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + LFPDUX A2, X, INCX2 + LFPDUX B2, Y, INCY2 + + fpmadd C1, A1, B1, C1 + fpmadd C2, A2, B2, C2 + .align 4 + +LL(47): + andi. r0, N, 2 + beq LL(48) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + + fpmadd C1, A1, B1, C1 + .align 4 + +LL(48): + andi. r0, N, 1 + beq LL(999) + + LFDUX A1, X, INCX2 + LFDUX B1, Y, INCY2 + + fmadd C1, A1, B1, C1 + b LL(999) + .align 4 + +LL(100): +#ifdef F_INTERFACE + cmpwi cr0, INCX, 0 + bge+ LL(101) + + subi r0, N, 1 + mullw r0, r0, INCX + sub X, X, r0 + .align 4 + +LL(101): + cmpwi cr0, INCY, 0 + bge+ LL(102) + + subi r0, N, 1 + mullw r0, r0, INCY + sub Y, Y, r0 + .align 4 + +LL(102): +#endif + sub X, X, INCX + sub Y, Y, INCY + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(105) + + LFDUX A1, X, INCX + LFDUX B1, Y, INCY + LFDUX A2, X, INCX + LFDUX B2, Y, INCY + + LFDUX A3, X, INCX + LFDUX B3, Y, INCY + LFDUX A4, X, INCX + LFDUX B4, Y, INCY + + LFDUX A5, X, INCX + LFDUX B5, Y, INCY + LFDUX A6, X, INCX + LFDUX B6, Y, INCY + + LFDUX A7, X, INCX + LFDUX B7, Y, INCY + LFDUX A8, X, INCX + LFDUX B8, Y, INCY + bdz LL(104) + .align 4 + +LL(103): + fmadd C1, A1, B1, C1 + LFDUX A1, X, INCX + LFDUX B1, Y, INCY + fmadd C2, A2, B2, C2 + LFDUX A2, X, INCX + LFDUX B2, Y, INCY + + fmadd C3, A3, B3, C3 + LFDUX A3, X, INCX + LFDUX B3, Y, INCY + fmadd C4, A4, B4, C4 + LFDUX A4, X, INCX + LFDUX B4, Y, INCY + + fmadd C1, A5, B5, C1 + LFDUX A5, X, INCX + LFDUX B5, Y, INCY + fmadd C2, A6, B6, C2 + LFDUX A6, X, INCX + LFDUX B6, Y, INCY + + fmadd C3, A7, B7, C3 + LFDUX A7, X, INCX + LFDUX B7, Y, INCY + fmadd C4, A8, B8, C4 + LFDUX A8, X, INCX + LFDUX B8, Y, INCY + + bdnz LL(103) + .align 4 + +LL(104): + fmadd C1, A1, B1, C1 + fmadd C2, A2, B2, C2 + fmadd C3, A3, B3, C3 + fmadd C4, A4, B4, C4 + fmadd C1, A5, B5, C1 + fmadd C2, A6, B6, C2 + fmadd C3, A7, B7, C3 + fmadd C4, A8, B8, C4 + .align 4 + +LL(105): + andi. r0, N, 7 + beq LL(999) + + andi. 
r0, N, 4 + beq LL(107) + + LFDUX A1, X, INCX + LFDUX B1, Y, INCY + LFDUX A2, X, INCX + LFDUX B2, Y, INCY + + LFDUX A3, X, INCX + LFDUX B3, Y, INCY + LFDUX A4, X, INCX + LFDUX B4, Y, INCY + + fmadd C1, A1, B1, C1 + fmadd C2, A2, B2, C2 + fmadd C3, A3, B3, C3 + fmadd C4, A4, B4, C4 + .align 4 + +LL(107): + andi. r0, N, 2 + beq LL(108) + + LFDUX A1, X, INCX + LFDUX B1, Y, INCY + + LFDUX A2, X, INCX + LFDUX B2, Y, INCY + + fmadd C1, A1, B1, C1 + fmadd C2, A2, B2, C2 + .align 4 + +LL(108): + andi. r0, N, 1 + beq LL(999) + + LFDUX A1, X, INCX + LFDUX B1, Y, INCY + + fmadd C1, A1, B1, C1 + .align 4 + +LL(999): + li r10, 16 + + fpadd C1, C1, C2 + fpadd C3, C3, C4 + fpadd C1, C1, C3 + lfpdux f20, SP, r10 + + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + fsmtp C2, C1 + lfpdux f16, SP, r10 + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + fadd C1, C1, C2 + addi SP, SP, 16 + blr + + EPILOGUE diff --git a/kernel/power/dot_ppc440.S b/kernel/power/dot_ppc440.S new file mode 100644 index 0000000000..b3f3efc0e7 --- /dev/null +++ b/kernel/power/dot_ppc440.S @@ -0,0 +1,301 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
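The dot kernels above (dot.S, dot_cell.S, dot_hummer.S) and the ppc440 variant whose header begins here all compute the same reduction; they differ only in prefetch strategy, unroll depth, and how the hummer version pairs loads when X or Y is misaligned for the double-width FPU. What they share is an unrolled fused-multiply-add loop that keeps several independent partial sums live and folds them together at the end. A plain C sketch of that structure, with four partial sums instead of eight and assuming positive increments:

/* Dot product with independent partial sums, the structure shared by the
 * kernels above (which unroll 16-way across eight accumulators and add
 * dcbt prefetch). Assumes incx, incy >= 1; the kernels re-base X and Y
 * for negative Fortran increments under F_INTERFACE. */
double ddot_ref(long n, const double *x, long incx,
                        const double *y, long incy)
{
    double s0 = 0.0, s1 = 0.0, s2 = 0.0, s3 = 0.0;
    long i = 0;

    if (n <= 0) return 0.0;

    /* Main loop: four independent multiply-adds per iteration. */
    for (; i + 4 <= n; i += 4) {
        s0 += x[(i + 0) * incx] * y[(i + 0) * incy];
        s1 += x[(i + 1) * incx] * y[(i + 1) * incy];
        s2 += x[(i + 2) * incx] * y[(i + 2) * incy];
        s3 += x[(i + 3) * incx] * y[(i + 3) * incy];
    }
    /* Remainder, like the andi./mtspr CTR tail loops in the kernels. */
    for (; i < n; i++)
        s0 += x[i * incx] * y[i * incy];

    return (s0 + s1) + (s2 + s3);
}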
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 +#define Y r6 +#define INCY r7 +#define PRE r8 + +#define FZERO f0 + +#define STACKSIZE 96 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + + stw r0, 80(SP) + lfs FZERO, 80(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) + LDINT INCY, 0(INCY) +#endif + + slwi INCX, INCX, BASE_SHIFT + slwi INCY, INCY, BASE_SHIFT + + fmr f1, FZERO + fmr f2, FZERO + fmr f3, FZERO + fmr f4, FZERO + fmr f5, FZERO + fmr f6, FZERO + fmr f7, FZERO + + li PRE, 3 * 16 * SIZE + + cmpwi cr0, N, 0 + ble- LL(999) + +#ifdef F_INTERFACE + cmpwi cr0, INCX, 0 + bge+ LL(102) + + subi r0, N, 1 + mullw r0, r0, INCX + sub X, X, r0 + .align 4 + +LL(102): + cmpwi cr0, INCY, 0 + bge+ LL(104) + + subi r0, N, 1 + mullw r0, r0, INCY + sub Y, Y, r0 + .align 4 + +LL(104): +#endif + sub X, X, INCX + sub Y, Y, INCY + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(150) + + LFDUX f8, X, INCX + LFDUX f16, Y, INCY + LFDUX f9, X, INCX + LFDUX f17, Y, INCY + + LFDUX f10, X, INCX + LFDUX f18, Y, INCY + LFDUX f11, X, INCX + LFDUX f19, Y, INCY + + LFDUX f12, X, INCX + LFDUX f20, Y, INCY + LFDUX f13, X, INCX + LFDUX f21, Y, INCY + + LFDUX f14, X, INCX + LFDUX f22, Y, INCY + LFDUX f15, X, INCX + LFDUX f23, Y, INCY + bdz LL(120) + .align 4 + +LL(110): + FMADD f0, f8, f16, f0 + LFDUX f8, X, INCX + LFDUX f16, Y, INCY +#ifdef PPCG4 + dcbt X, PRE +#endif + FMADD f1, f9, f17, f1 + LFDUX f9, X, INCX + LFDUX f17, Y, INCY + FMADD f2, f10, f18, f2 + LFDUX f10, X, INCX + LFDUX f18, Y, INCY +#ifdef PPCG4 + dcbt Y, PRE +#endif + FMADD f3, f11, f19, f3 + LFDUX f11, X, INCX + LFDUX f19, Y, INCY + + FMADD f4, f12, f20, f4 + LFDUX f12, X, INCX + LFDUX f20, Y, INCY +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PRE +#endif + FMADD f5, f13, f21, f5 + LFDUX f13, X, INCX + LFDUX f21, Y, INCY + FMADD f6, f14, f22, f6 + LFDUX f14, X, INCX + LFDUX f22, Y, INCY +#if defined(PPCG4) && defined(DOUBLE) + dcbt Y, PRE +#endif + FMADD f7, f15, f23, f7 + LFDUX f15, X, INCX + LFDUX f23, Y, INCY + + FMADD f0, f8, f16, f0 + LFDUX f8, X, INCX + LFDUX f16, Y, INCY +#ifdef PPCG4 + dcbt X, PRE +#endif + FMADD f1, f9, f17, f1 + LFDUX f9, X, INCX + LFDUX f17, Y, INCY + FMADD f2, f10, f18, f2 + LFDUX f10, X, INCX + LFDUX f18, Y, INCY +#ifdef PPCG4 + dcbt Y, PRE +#endif + FMADD f3, f11, f19, f3 + LFDUX f11, X, INCX + LFDUX f19, Y, INCY + + FMADD f4, f12, f20, f4 + LFDUX f12, X, INCX + LFDUX f20, Y, INCY +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PRE +#endif + FMADD f5, f13, f21, f5 + LFDUX f13, X, INCX + LFDUX f21, Y, INCY + FMADD f6, f14, f22, f6 + LFDUX f14, X, INCX + LFDUX f22, Y, INCY +#if defined(PPCG4) && defined(DOUBLE) + dcbt Y, PRE +#endif + FMADD f7, f15, f23, f7 + LFDUX f15, X, INCX + LFDUX f23, Y, INCY + bdnz LL(110) + .align 4 + +LL(120): + FMADD f0, f8, f16, f0 + LFDUX f8, X, INCX + LFDUX f16, Y, INCY + FMADD f1, f9, f17, f1 + LFDUX f9, X, INCX + LFDUX f17, Y, INCY + FMADD f2, f10, f18, f2 + LFDUX f10, X, INCX + LFDUX f18, Y, INCY + FMADD f3, f11, f19, f3 + LFDUX f11, X, INCX + LFDUX f19, Y, INCY + + FMADD f4, f12, f20, f4 + LFDUX f12, X, INCX + LFDUX f20, Y, INCY + FMADD f5, f13, f21, f5 + LFDUX f13, X, INCX + LFDUX f21, Y, INCY + FMADD f6, f14, f22, f6 + LFDUX f14, X, INCX + LFDUX f22, 
Y, INCY + FMADD f7, f15, f23, f7 + LFDUX f15, X, INCX + LFDUX f23, Y, INCY + + FMADD f0, f8, f16, f0 + FMADD f1, f9, f17, f1 + FMADD f2, f10, f18, f2 + FMADD f3, f11, f19, f3 + FMADD f4, f12, f20, f4 + FMADD f5, f13, f21, f5 + FMADD f6, f14, f22, f6 + FMADD f7, f15, f23, f7 + .align 4 + +LL(150): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + LFDUX f16, Y, INCY + FMADD f0, f8, f16, f0 + bdnz LL(160) + .align 4 + +LL(999): + FADD f0, f0, f1 + FADD f2, f2, f3 + FADD f4, f4, f5 + FADD f6, f6, f7 + + FADD f0, f0, f2 + FADD f4, f4, f6 + FADD f1, f0, f4 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/exfunc.S b/kernel/power/exfunc.S new file mode 100644 index 0000000000..257736c943 --- /dev/null +++ b/kernel/power/exfunc.S @@ -0,0 +1,66 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + + .machine "any" + + .globl .rpcc +.rpcc: + mftb r3 + rlinm r3, r3, 3, 0, 31 # ldc(scaling) + bcr BO_ALWAYS,CR0_LT + + .globl .blas_lock +.blas_lock: + cal r7, 1(r0) +LL(0): + l r6, 0(r3) + cmpi CR0, r6, 0 + bne LL(2) + lwarx r6, r0, r3 + cmpwi CR6, r6, 0 + bne LL(2) + stwcx. 
r7, r0, r3 + bne- LL(0) +LL(1): + bcr BO_ALWAYS,CR0_LT + +LL(2): + b LL(0) diff --git a/kernel/power/gemm_beta.S b/kernel/power/gemm_beta.S new file mode 100644 index 0000000000..e531bde6f9 --- /dev/null +++ b/kernel/power/gemm_beta.S @@ -0,0 +1,253 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M r3 +#define N r4 +#define C r10 +#define LDC r11 +#define J r5 +#define PRE r6 +#define CO1 r7 + +#define ALPHA f31 + +#define STACKSIZE 32 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f31, 16(SP) + stw r0, 24(SP) + +#ifdef linux +#ifndef __64BIT__ + lwz LDC, 8 + STACKSIZE(SP) +#else + ld C, 112 + STACKSIZE(SP) + ld LDC, 120 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld C, 112 + STACKSIZE(SP) + ld LDC, 120 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz C, 60 + STACKSIZE(SP) + lwz LDC, 64 + STACKSIZE(SP) +#else + lwz C, 56 + STACKSIZE(SP) + lwz LDC, 60 + STACKSIZE(SP) +#endif +#endif +#endif + + slwi LDC, LDC, BASE_SHIFT + + fmr ALPHA, f1 + lfs f0, 24(SP) + + cmpwi cr0, M, 0 + ble- LL(999) + cmpwi cr0, N, 0 + ble- LL(999) + + mr J, N + fcmpu cr7, f1, f0 + bne cr7, LL(20) + .align 4 + +LL(10): + mr CO1, C + add C, C, LDC + addi PRE, 0, 32 * SIZE + + srawi. 
r0, M, 4 + mtspr CTR, r0 + ble LL(15) + .align 4 + +LL(12): + STFD f0, 0 * SIZE(CO1) + STFD f0, 1 * SIZE(CO1) + STFD f0, 2 * SIZE(CO1) + STFD f0, 3 * SIZE(CO1) + STFD f0, 4 * SIZE(CO1) + STFD f0, 5 * SIZE(CO1) + STFD f0, 6 * SIZE(CO1) + STFD f0, 7 * SIZE(CO1) + STFD f0, 8 * SIZE(CO1) + STFD f0, 9 * SIZE(CO1) + STFD f0, 10 * SIZE(CO1) + STFD f0, 11 * SIZE(CO1) + STFD f0, 12 * SIZE(CO1) + STFD f0, 13 * SIZE(CO1) + STFD f0, 14 * SIZE(CO1) + STFD f0, 15 * SIZE(CO1) + + dcbst PRE, CO1 + addi CO1, CO1, 16 * SIZE + bdnz LL(12) + .align 4 + +LL(15): + andi. r0, M, 15 + mtspr CTR, r0 + beq LL(19) + .align 4 + +LL(16): + STFD f0, 0 * SIZE(CO1) + addi CO1, CO1, 1 * SIZE + bdnz LL(16) + .align 4 + +LL(19): + addic. J, J, -1 + bgt LL(10) + b LL(999) + .align 4 + +LL(20): + mr CO1, C + add C, C, LDC + addi PRE, 0, 16 * SIZE + + srawi. r0, M, 4 + mtspr CTR, r0 + ble LL(25) + .align 4 + +LL(22): + LFD f0, 0 * SIZE(CO1) + LFD f1, 1 * SIZE(CO1) + LFD f2, 2 * SIZE(CO1) + LFD f3, 3 * SIZE(CO1) + LFD f4, 4 * SIZE(CO1) + LFD f5, 5 * SIZE(CO1) + LFD f6, 6 * SIZE(CO1) + LFD f7, 7 * SIZE(CO1) + + LFD f8, 8 * SIZE(CO1) + LFD f9, 9 * SIZE(CO1) + LFD f10, 10 * SIZE(CO1) + LFD f11, 11 * SIZE(CO1) + LFD f12, 12 * SIZE(CO1) + LFD f13, 13 * SIZE(CO1) + LFD f14, 14 * SIZE(CO1) + LFD f15, 15 * SIZE(CO1) + + FMUL f0, ALPHA, f0 + FMUL f1, ALPHA, f1 + FMUL f2, ALPHA, f2 + FMUL f3, ALPHA, f3 + FMUL f4, ALPHA, f4 + FMUL f5, ALPHA, f5 + FMUL f6, ALPHA, f6 + FMUL f7, ALPHA, f7 + + FMUL f8, ALPHA, f8 + FMUL f9, ALPHA, f9 + FMUL f10, ALPHA, f10 + FMUL f11, ALPHA, f11 + FMUL f12, ALPHA, f12 + FMUL f13, ALPHA, f13 + FMUL f14, ALPHA, f14 + FMUL f15, ALPHA, f15 + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + STFD f4, 4 * SIZE(CO1) + STFD f5, 5 * SIZE(CO1) + STFD f6, 6 * SIZE(CO1) + STFD f7, 7 * SIZE(CO1) + + STFD f8, 8 * SIZE(CO1) + STFD f9, 9 * SIZE(CO1) + STFD f10, 10 * SIZE(CO1) + STFD f11, 11 * SIZE(CO1) + STFD f12, 12 * SIZE(CO1) + STFD f13, 13 * SIZE(CO1) + STFD f14, 14 * SIZE(CO1) + STFD f15, 15 * SIZE(CO1) + + addi CO1, CO1, 16 * SIZE + dcbtst PRE, CO1 + bdnz LL(22) + .align 4 + +LL(25): + andi. r0, M, 15 + mtspr CTR, r0 + ble LL(29) + .align 4 + +LL(26): + LFD f0, 0 * SIZE(CO1) + FMUL f0, f0, ALPHA + STFD f0, 0 * SIZE(CO1) + addi CO1, CO1, 1 * SIZE + bdnz LL(26) + .align 4 + +LL(29): + addic. J, J, -1 + bgt LL(20) + .align 4 + +LL(999): + li r3, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f31, 16(SP) + addi SP, SP, STACKSIZE + + blr + EPILOGUE diff --git a/kernel/power/gemm_kernel.S b/kernel/power/gemm_kernel.S new file mode 100644 index 0000000000..2b7d1d99ad --- /dev/null +++ b/kernel/power/gemm_kernel.S @@ -0,0 +1,2705 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA 296(SP) +#define FZERO 304(SP) +#else +#define STACKSIZE 240 +#define ALPHA 224(SP) +#define FZERO 232(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#define OFFSET r6 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#define TEMP r18 +#define KK r19 +#define BB r20 +#define I r21 +#define J r22 +#define AO r23 +#define BO r24 +#define CO1 r25 +#define CO2 r26 +#define CO3 r27 +#define CO4 r28 + +#define PREA r29 +#define PREB r30 +#define PREC r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) +#if defined(TRMMKERNEL) + std r19, 240(SP) + std r18, 248(SP) +#endif +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) +#if defined(TRMMKERNEL) + stw r19, 192(SP) + stw r18, 196(SP) +#endif +#endif + + stfd f1, ALPHA + stw r0, FZERO + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif + + slwi LDC, LDC, BASE_SHIFT + +#if defined(TRMMKERNEL) 
+#if defined(linux) && defined(__64BIT__) + ld OFFSET, 112 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 60 + STACKSIZE(SP) +#else + lwz OFFSET, 56 + STACKSIZE(SP) +#endif +#endif +#endif +#endif + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + +#ifndef PREFETCHTEST +/* Normal prefetch */ +#ifdef PPC970 + li PREC, 4 * SIZE +#endif +#ifdef POWER4 + li PREC, 4 * SIZE /* is 12 best? */ +#endif +#ifdef POWER5 + li PREC, 3 * SIZE +#endif + +#else + +#ifdef linux +#ifndef __64BIT__ + mr PREA, r10 + lwz PREB, 8 + STACKSIZE(SP) + lwz PREC, 12 + STACKSIZE(SP) +#else + ld PREA, 112 + STACKSIZE(SP) + ld PREB, 120 + STACKSIZE(SP) + ld PREC, 128 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld PREA, 112 + STACKSIZE(SP) + ld PREB, 120 + STACKSIZE(SP) + ld PREC, 128 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz PREA, 60 + STACKSIZE(SP) + lwz PREB, 64 + STACKSIZE(SP) + lwz PREC, 68 + STACKSIZE(SP) +#else + lwz PREA, 56 + STACKSIZE(SP) + lwz PREB, 60 + STACKSIZE(SP) + lwz PREC, 64 + STACKSIZE(SP) +#endif +#endif +#endif + +#endif + +#ifndef PREFETCHTEST +#ifdef PPC970 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 1 * SIZE) + li PREB, (16 * 5 * SIZE) +#else + li PREA, (16 * 19 * SIZE) + li PREB, (16 * 8 * SIZE) +#endif +#endif +#ifdef POWER4 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 1 * SIZE) + li PREB, (16 * 1 * SIZE) +#else + li PREA, (16 * 2 * SIZE) + li PREB, (16 * 2 * SIZE) +#endif +#endif +#ifdef POWER5 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 7 * SIZE) + li PREB, (16 * 7 * SIZE) +#else + li PREA, (16 * 12 * SIZE) + li PREB, (16 * 6 * SIZE) +#endif +#endif +#endif + + srawi. J, N, 2 + ble LL(40) + .align 4 + +LL(10): + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + slwi BB, K, BASE_SHIFT + 2 + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + srawi. I, M, 2 + mr AO, A + add C, CO4, LDC + ble LL(20) + .align 4 + +LL(11): +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + +#ifdef POWER5 + LFD f28, 4 * SIZE(B) + LFD f29, 5 * SIZE(B) + LFD f30, 6 * SIZE(B) + LFD f31, 7 * SIZE(B) +#endif + mr BO, B +#else + slwi r0, KK, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, B, r0 + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + +#ifdef POWER5 + LFD f28, 4 * SIZE(BO) + LFD f29, 5 * SIZE(BO) + LFD f30, 6 * SIZE(BO) + LFD f31, 7 * SIZE(BO) +#endif +#endif + + DCBTST(CO1, PREC) + DCBTST(CO2, PREC) + DCBTST(CO3, PREC) + DCBTST(CO4, PREC) + + dcbt B, BB + addi BB, BB, 16 * SIZE + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 4 +#endif + srawi. 
TEMP, TEMP, 2 + mtspr CTR, TEMP + ble LL(15) + +#else + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + +#ifdef POWER5 + LFD f28, 4 * SIZE(B) + LFD f29, 5 * SIZE(B) + LFD f30, 6 * SIZE(B) + LFD f31, 7 * SIZE(B) +#endif + + DCBTST(CO1, PREC) + DCBTST(CO2, PREC) + DCBTST(CO3, PREC) + DCBTST(CO4, PREC) + + dcbt B, BB + addi BB, BB, 16 * SIZE + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B + ble LL(15) +#endif + .align 4 + +LL(12): + FMADD f0, f16, f20, f0 + FMADD f5, f17, f21, f5 + FMADD f10, f18, f22, f10 + FMADD f15, f19, f23, f15 + +#if defined(ALLOC_HUGETLB) && !defined(POWER5) + LFD f28, 4 * SIZE(BO) + LFD f29, 5 * SIZE(BO) + LFD f30, 6 * SIZE(BO) + LFD f31, 7 * SIZE(BO) +#endif + + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + FMADD f4, f16, f21, f4 + +#if !defined(ALLOC_HUGETLB) && !defined(POWER5) + LFD f28, 4 * SIZE(BO) + LFD f29, 5 * SIZE(BO) + LFD f30, 6 * SIZE(BO) + LFD f31, 7 * SIZE(BO) +#endif + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + + FMADD f11, f19, f22, f11 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + FMADD f14, f18, f23, f14 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f24, f28, f0 + FMADD f5, f25, f29, f5 + FMADD f10, f26, f30, f10 + FMADD f15, f27, f31, f15 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMADD f1, f25, f28, f1 + FMADD f2, f26, f28, f2 + FMADD f3, f27, f28, f3 + FMADD f4, f24, f29, f4 + + FMADD f6, f26, f29, f6 + FMADD f7, f27, f29, f7 + FMADD f8, f24, f30, f8 + FMADD f9, f25, f30, f9 + + FMADD f11, f27, f30, f11 + FMADD f12, f24, f31, f12 + FMADD f13, f25, f31, f13 + FMADD f14, f26, f31, f14 + + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f5, f17, f21, f5 + FMADD f10, f18, f22, f10 + FMADD f15, f19, f23, f15 + + LFD f24, 12 * SIZE(AO) + LFD f25, 13 * SIZE(AO) + LFD f26, 14 * SIZE(AO) + LFD f27, 15 * SIZE(AO) + + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + FMADD f4, f16, f21, f4 + + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + + FMADD f11, f19, f22, f11 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + FMADD f14, f18, f23, f14 + +#ifndef POWER5 + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) +#else + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) +#endif + + FMADD f0, f24, f28, f0 + FMADD f5, f25, f29, f5 + FMADD f10, f26, f30, f10 + FMADD f15, f27, f31, f15 + +#ifndef POWER5 + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) +#else + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) +#endif + + FMADD f1, f25, f28, f1 + FMADD f2, f26, f28, f2 + FMADD f3, f27, f28, f3 + FMADD f4, f24, f29, f4 + + FMADD f6, f26, f29, f6 + FMADD f7, f27, f29, f7 + FMADD f8, f24, f30, f8 + FMADD f9, f25, f30, f9 + + FMADD f11, f27, f30, f11 + FMADD f12, f24, f31, f12 + FMADD f13, f25, f31, f13 + FMADD f14, f26, f31, f14 + +#if (L2_SIZE == 
1024976) && defined (ALLOC_HUGETLB) + nop + nop + nop + nop +#endif + +#ifdef POWER5 + LFD f28, 20 * SIZE(BO) + LFD f29, 21 * SIZE(BO) + LFD f30, 22 * SIZE(BO) + LFD f31, 23 * SIZE(BO) +#endif + + addi AO, AO, 16 * SIZE + addi BO, BO, 16 * SIZE + +#ifdef PPC970 +#ifndef ALLOC_HUGETLB + DCBT(AO, PREA) +#endif + DCBT(BO, PREB) +#endif + +#ifdef POWER4 +#ifndef ALLOC_HUGETLB + DCBT(AO, PREA) +#endif + DCBT(BO, PREB) +#endif + +#ifdef POWER5 +#ifndef ALLOC_HUGETLB + DCBT(BO, PREB) + DCBT(AO, PREA) +#endif +#endif + bdnz LL(12) + .align 4 + +LL(15): + lfd f30, ALPHA + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 4 +#endif + + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP +#else + + andi. r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ LL(18) + .align 4 + +LL(16): + FMADD f0, f16, f20, f0 + FMADD f5, f17, f21, f5 + FMADD f10, f18, f22, f10 + FMADD f15, f19, f23, f15 + + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + FMADD f4, f16, f21, f4 + + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + + FMADD f11, f19, f22, f11 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + FMADD f14, f18, f23, f14 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(16) + .align 4 + +LL(18): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) + + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 + FMADD f2, f2, f30, f18 + FMADD f3, f3, f30, f19 + + FMADD f4, f4, f30, f20 + FMADD f5, f5, f30, f21 + FMADD f6, f6, f30, f22 + FMADD f7, f7, f30, f23 + + LFD f16, 0 * SIZE(CO3) + LFD f17, 1 * SIZE(CO3) + LFD f18, 2 * SIZE(CO3) + LFD f19, 3 * SIZE(CO3) + + LFD f20, 0 * SIZE(CO4) + LFD f21, 1 * SIZE(CO4) + LFD f22, 2 * SIZE(CO4) + LFD f23, 3 * SIZE(CO4) + + FMADD f8, f8, f30, f16 + FMADD f9, f9, f30, f17 + FMADD f10, f10, f30, f18 + FMADD f11, f11, f30, f19 + + FMADD f12, f12, f30, f20 + FMADD f13, f13, f30, f21 + FMADD f14, f14, f30, f22 + FMADD f15, f15, f30, f23 + +#else + + FMUL f0, f0, f30 + FMUL f1, f1, f30 + FMUL f2, f2, f30 + FMUL f3, f3, f30 + + FMUL f4, f4, f30 + FMUL f5, f5, f30 + FMUL f6, f6, f30 + FMUL f7, f7, f30 + + FMUL f8, f8, f30 + FMUL f9, f9, f30 + FMUL f10, f10, f30 + FMUL f11, f11, f30 + + FMUL f12, f12, f30 + FMUL f13, f13, f30 + FMUL f14, f14, f30 + FMUL f15, f15, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f10, 2 * SIZE(CO3) + STFD f11, 3 * SIZE(CO3) + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + STFD f14, 2 * SIZE(CO4) + STFD f15, 3 * SIZE(CO4) + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + addi CO3, CO3, 4 * SIZE + addi CO4, CO4, 4 * SIZE + +#ifdef TRMMKERNEL 
+#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -4 +#else + addi TEMP, TEMP, -4 +#endif + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 4 +#endif +#endif + + addic. I, I, -1 + bgt+ LL(11) + .align 4 + +LL(20): + andi. I, M, 2 + ble LL(30) + +#if defined(TRMMKERNEL) +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 4 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble LL(25) + .align 5 + +LL(22): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f2, f18, f24, f2 + FMADD f3, f19, f24, f3 + FMADD f6, f18, f25, f6 + FMADD f7, f19, f25, f7 + + FMADD f10, f18, f26, f10 + FMADD f11, f19, f26, f11 + FMADD f14, f18, f27, f14 + FMADD f15, f19, f27, f15 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f2, f18, f24, f2 + FMADD f3, f19, f24, f3 + FMADD f6, f18, f25, f6 + FMADD f7, f19, f25, f7 + + FMADD f10, f18, f26, f10 + FMADD f11, f19, f26, f11 + FMADD f14, f18, f27, f14 + FMADD f15, f19, f27, f15 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 16 * SIZE + DCBT(BO, PREB) + bdnz LL(22) + + fadd f0, f2, f0 + fadd f1, f3, f1 + fadd f4, f6, f4 + fadd f5, f7, f5 + fadd f8, f10, f8 + fadd f9, f11, f9 + fadd f12, f14, f12 + fadd f13, f15, f13 + .align 4 + +LL(25): + lfd f30, ALPHA + +#if defined(TRMMKERNEL) + +#if 
(defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 4 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + + andi. r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ LL(28) + .align 4 + +LL(26): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(26) + .align 4 + +LL(28): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 0 * SIZE(CO2) + LFD f19, 1 * SIZE(CO2) + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 + FMADD f4, f4, f30, f18 + FMADD f5, f5, f30, f19 + + LFD f20, 0 * SIZE(CO3) + LFD f21, 1 * SIZE(CO3) + LFD f22, 0 * SIZE(CO4) + LFD f23, 1 * SIZE(CO4) + + FMADD f8, f8, f30, f20 + FMADD f9, f9, f30, f21 + FMADD f12, f12, f30, f22 + FMADD f13, f13, f30, f23 +#else + FMUL f0, f0, f30 + FMUL f1, f1, f30 + FMUL f4, f4, f30 + FMUL f5, f5, f30 + + FMUL f8, f8, f30 + FMUL f9, f9, f30 + FMUL f12, f12, f30 + FMUL f13, f13, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + addi CO3, CO3, 2 * SIZE + addi CO4, CO4, 2 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -4 +#endif + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + .align 4 + +LL(30): + andi. I, M, 1 + ble LL(39) + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 4 +#endif + + srawi. 
TEMP, TEMP, 2 + mtspr CTR, TEMP + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble LL(35) + .align 5 + +LL(32): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f1, f17, f24, f1 + FMADD f5, f17, f25, f5 + FMADD f9, f17, f26, f9 + FMADD f13, f17, f27, f13 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMADD f0, f18, f20, f0 + FMADD f4, f18, f21, f4 + FMADD f8, f18, f22, f8 + FMADD f12, f18, f23, f12 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f1, f19, f24, f1 + FMADD f5, f19, f25, f5 + FMADD f9, f19, f26, f9 + FMADD f13, f19, f27, f13 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 16 * SIZE + DCBT(BO, PREB) + bdnz LL(32) + + fadd f0, f1, f0 + fadd f4, f5, f4 + fadd f8, f9, f8 + fadd f12, f13, f12 + .align 4 + +LL(35): + lfd f30, ALPHA +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 4 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + andi. r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ LL(38) + .align 4 + +LL(36): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f16, 1 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(36) + .align 4 + +LL(38): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f18, 0 * SIZE(CO2) + LFD f20, 0 * SIZE(CO3) + LFD f22, 0 * SIZE(CO4) + + FMADD f0, f0, f30, f16 + FMADD f4, f4, f30, f18 + FMADD f8, f8, f30, f20 + FMADD f12, f12, f30, f22 +#else + FMUL f0, f0, f30 + FMUL f4, f4, f30 + FMUL f8, f8, f30 + FMUL f12, f12, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f8, 0 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f12, f0 + fmr f13, f0 + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -1 +#else + addi TEMP, TEMP, -4 +#endif + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + .align 4 + + +LL(39): +#if defined(TRMMKERNEL) && !defined(LEFT) + addi KK, KK, 4 +#endif + + mr B, BO + addic. J, J, -1 + bgt LL(10) + .align 4 + +LL(40): + mr CO1, C + add CO2, C, LDC + andi. J, N, 2 + ble LL(70) + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. 
I, M, 2 + add C, CO2, LDC + mr AO, A + ble LL(50) + .align 4 + +LL(41): +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) +#endif + + DCBTST(CO1, PREC) + DCBTST(CO2, PREC) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 2 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + +#else + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + DCBTST(CO1, PREC) + DCBTST(CO2, PREC) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble LL(45) + .align 5 + +LL(42): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + FMADD f4, f16, f23, f4 + FMADD f5, f17, f23, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 12 * SIZE(AO) + LFD f17, 13 * SIZE(AO) + LFD f18, 14 * SIZE(AO) + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + FMADD f4, f16, f23, f4 + FMADD f5, f17, f23, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 8 * SIZE + DCBT(BO, PREB) + bdnz LL(42) + .align 4 + +LL(45): + lfd f30, ALPHA +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP +#else + andi. 
r0, K, 3 + mtspr CTR, r0 +#endif + ble+ LL(48) + .align 4 + +LL(46): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(46) + .align 4 + +LL(48): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) + + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 + FMADD f2, f2, f30, f18 + FMADD f3, f3, f30, f19 + + FMADD f4, f4, f30, f20 + FMADD f5, f5, f30, f21 + FMADD f6, f6, f30, f22 + FMADD f7, f7, f30, f23 +#else + FMUL f0, f0, f30 + FMUL f1, f1, f30 + FMUL f2, f2, f30 + FMUL f3, f3, f30 + + FMUL f4, f4, f30 + FMUL f5, f5, f30 + FMUL f6, f6, f30 + FMUL f7, f7, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -4 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 4 +#endif +#endif + + addic. I, I, -1 + bgt+ LL(41) + .align 4 + +LL(50): + andi. I, M, 2 + ble LL(60) + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. 
r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble LL(55) + .align 5 + +LL(52): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f16, f21, f2 + FMADD f3, f17, f21, f3 + + FMADD f4, f18, f22, f4 + FMADD f5, f19, f22, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f16, f24, f0 + FMADD f1, f17, f24, f1 + FMADD f2, f16, f25, f2 + FMADD f3, f17, f25, f3 + + FMADD f4, f18, f26, f4 + FMADD f5, f19, f26, f5 + FMADD f6, f18, f27, f6 + FMADD f7, f19, f27, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + DCBT(BO, PREB) + bdnz LL(52) + .align 4 + +LL(55): + lfd f30, ALPHA +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + andi. r0, K, 3 + mtspr CTR, r0 +#endif + ble+ LL(58) + .align 4 + +LL(56): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f16, f21, f2 + FMADD f3, f17, f21, f3 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(56) + .align 4 + +LL(58): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 0 * SIZE(CO2) + LFD f19, 1 * SIZE(CO2) + + FADD f0, f4, f0 + FADD f1, f5, f1 + FADD f2, f6, f2 + FADD f3, f7, f3 + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 + FMADD f2, f2, f30, f18 + FMADD f3, f3, f30, f19 +#else + FADD f0, f4, f0 + FADD f1, f5, f1 + FADD f2, f6, f2 + FADD f3, f7, f3 + + FMUL f0, f0, f30 + FMUL f1, f1, f30 + FMUL f2, f2, f30 + FMUL f3, f3, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + .align 4 + +LL(60): + andi. 
I, M, 1 + ble LL(69) + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble LL(65) + .align 5 + +LL(62): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f17, f22, f2 + FMADD f3, f17, f23, f3 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f18, f24, f0 + FMADD f1, f18, f25, f1 + FMADD f2, f19, f26, f2 + FMADD f3, f19, f27, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(62) + .align 4 + +LL(65): + lfd f30, ALPHA + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + andi. r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ LL(68) + .align 4 + +LL(66): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + + LFD f16, 1 * SIZE(AO) + + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(66) + .align 4 + +LL(68): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f18, 0 * SIZE(CO2) + + FADD f0, f2, f0 + FADD f1, f3, f1 + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f18 +#else + FADD f0, f2, f0 + FADD f1, f3, f1 + + FMUL f0, f0, f30 + FMUL f1, f1, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 0 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -1 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 1 +#endif +#endif + .align 4 + +LL(69): +#if defined(TRMMKERNEL) && !defined(LEFT) + addi KK, KK, 2 +#endif + + mr B, BO + .align 4 + +LL(70): + mr CO1, C + andi. 
J, N, 1 + ble LL(999) + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + srawi. I, M, 2 + mr AO, A + ble LL(80) + .align 4 + +LL(71): +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) +#endif + + DCBTST(CO1, PREC) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 1 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + DCBTST(CO1, PREC) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble LL(75) + .align 5 + +LL(72): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f21, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f21, f2 + FMADD f3, f19, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + LFD f16, 12 * SIZE(AO) + LFD f17, 13 * SIZE(AO) + LFD f18, 14 * SIZE(AO) + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f23, f0 + FMADD f1, f17, f23, f1 + FMADD f2, f18, f23, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 4 * SIZE + DCBT(BO, PREB) + bdnz LL(72) + .align 4 + +LL(75): + lfd f30, ALPHA +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + andi. 
r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ LL(78) + .align 4 + +LL(76): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 1 * SIZE(BO) + + addi BO, BO, 1 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(76) + .align 4 + +LL(78): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 + FMADD f2, f2, f30, f18 + FMADD f3, f3, f30, f19 +#else + FMUL f0, f0, f30 + FMUL f1, f1, f30 + FMUL f2, f2, f30 + FMUL f3, f3, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -4 +#else + addi TEMP, TEMP, -1 +#endif + slwi r0 , TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 4 +#endif +#endif + + addi CO1, CO1, 4 * SIZE + addic. I, I, -1 + bgt+ LL(71) + .align 4 + +LL(80): + andi. I, M, 2 + ble LL(90) + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B + +#endif + ble LL(85) + .align 5 + +LL(82): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f21, f2 + FMADD f3, f19, f21, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f23, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 4 * SIZE + DCBT(BO, PREB) + bdnz LL(82) + .align 4 + +LL(85): + lfd f30, ALPHA +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + + andi. 
r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ LL(88) + .align 4 + +LL(86): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 1 * SIZE(BO) + + addi BO, BO, 1 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(86) + .align 4 + +LL(88): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + + FADD f0, f2, f0 + FADD f1, f3, f1 + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 +#else + FADD f0, f2, f0 + FADD f1, f3, f1 + + FMUL f0, f0, f30 + FMUL f1, f1, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + addi CO1, CO1, 2 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -1 +#endif + slwi r0 , TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + .align 4 + +LL(90): + andi. I, M, 1 + ble LL(999) + + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + srawi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, K, 3 + mtspr CTR, r0 + mr BO, B +#endif + ble LL(95) + .align 5 + +LL(92): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(92) + .align 4 + +LL(95): + lfd f30, ALPHA + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 7 + mtspr CTR, TEMP + +#else + + andi. 
r0, K, 7 + mtspr CTR, r0 + +#endif + ble+ LL(98) + .align 4 + +LL(96): + FMADD f0, f16, f20, f0 + LFD f16, 1 * SIZE(AO) + LFD f20, 1 * SIZE(BO) + addi BO, BO, 1 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(96) + .align 4 + +LL(98): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + + FADD f0, f1, f0 + FADD f2, f3, f2 + FADD f0, f2, f0 + + FMADD f0, f0, f30, f16 +#else + FADD f0, f1, f0 + FADD f2, f3, f2 + FADD f0, f2, f0 + + FMUL f0, f0, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + .align 4 + +LL(999): + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) +#if defined(TRMMKERNEL) || defined(TRSMKERNEL) + ld r19, 240(SP) + ld r18, 248(SP) +#endif +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) +#if defined(TRMMKERNEL) || defined(TRSMKERNEL) + lwz r19, 192(SP) + lwz r18, 196(SP) +#endif +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/gemm_kernel_altivec.S b/kernel/power/gemm_kernel_altivec.S new file mode 100644 index 0000000000..6f5c3624fb --- /dev/null +++ b/kernel/power/gemm_kernel_altivec.S @@ -0,0 +1,2708 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 360 +#else +#define STACKSIZE 272 +#endif + +#define ALPHA 0 +#define FZERO 16 + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#endif +#endif + +#define STACK r11 + +#define I r21 +#define J r22 +#define AO r23 +#define BO r24 +#define CO1 r25 +#define CO2 r26 +#define CO3 r27 +#define CO4 r28 + +#define PREA r29 +#define PREB r29 +#define PREC r30 +#define VREG r31 + +#define LOAD_A lvx +#define LOAD_B lvx + +#define OFFSET_0 0 +#define OFFSET_1 r14 +#define OFFSET_2 r15 +#define OFFSET_3 r16 +#define OFFSET_4 r17 +#define OFFSET_5 r18 +#define OFFSET_6 r19 +#define OFFSET_7 r20 + +#define c01 v0 +#define c02 v1 +#define c03 v2 +#define c04 v3 +#define c05 v4 +#define c06 v5 +#define c07 v6 +#define c08 v7 +#define c09 v8 +#define c10 v9 +#define c11 v10 +#define c12 v11 +#define c13 v12 +#define c14 v13 +#define c15 v14 +#define c16 v15 + +#define a1 v16 +#define a2 v17 +#define a3 v18 +#define a4 v19 +#define a5 v20 +#define a6 v21 +#define a7 v22 +#define a8 v23 + +#define b1 v24 +#define b2 v25 +#define bp1 v26 +#define bp2 v27 + +#define C1 v16 +#define C2 v17 +#define C3 v18 +#define C4 v19 +#define C5 v20 +#define C6 v21 +#define C7 v22 +#define C8 v23 +#define C9 v24 + +#define c00 v25 + +#define PERMRSHIFT1 v26 +#define PERMRSHIFT2 v27 +#define PERMRSHIFT3 v28 +#define PERMRSHIFT4 v29 + +#define VZERO v30 +#define alpha v31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + mr STACK, SP + + li r0, 0 * 16 + stvx v20, SP, r0 + li r0, 1 * 16 + stvx v21, SP, r0 + li r0, 2 * 16 + stvx v22, SP, r0 + li r0, 3 * 16 + stvx v23, SP, r0 + li r0, 4 * 16 + stvx v24, SP, r0 + li r0, 5 * 16 + stvx v25, SP, r0 + li r0, 6 * 16 + stvx v26, SP, r0 + li r0, 7 * 16 + stvx v27, SP, r0 + li r0, 8 * 16 + stvx v28, SP, r0 + li r0, 9 * 16 + stvx v29, SP, r0 + li r0, 10 * 16 + stvx v30, SP, r0 + li r0, 11 * 16 + stvx v31, SP, r0 + +#ifdef __64BIT__ + std r31, 192(SP) + std r30, 200(SP) + std r29, 208(SP) + std r28, 216(SP) + std r27, 224(SP) + std r26, 232(SP) + std r25, 240(SP) + std r24, 248(SP) + std r23, 256(SP) + std r22, 264(SP) + std r21, 272(SP) + std r20, 280(SP) + std r19, 288(SP) + std r18, 296(SP) + std r17, 304(SP) + std r16, 312(SP) + std r15, 320(SP) + std r14, 328(SP) +#else + stw r31, 192(SP) + stw r30, 196(SP) + stw r29, 200(SP) + stw r28, 204(SP) + stw r27, 208(SP) + stw r26, 212(SP) + stw r25, 216(SP) + stw r24, 220(SP) + stw r23, 224(SP) + stw r22, 228(SP) + stw r21, 232(SP) + stw r20, 236(SP) + stw r19, 240(SP) + stw r18, 244(SP) + stw r17, 248(SP) + stw r16, 252(SP) + stw r15, 256(SP) + stw r14, 260(SP) +#endif + + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + 
lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif + + li r0, -1 + + mfspr VREG, VRsave + mtspr VRsave, r0 + + addi SP, SP, -128 + li r0, -128 + and SP, SP, r0 + + li OFFSET_1, 4 * SIZE + li OFFSET_2, 8 * SIZE + li OFFSET_3, 12 * SIZE + li OFFSET_4, 16 * SIZE + li OFFSET_5, 20 * SIZE + li OFFSET_6, 24 * SIZE + li OFFSET_7, 28 * SIZE + + stfs f1, ALPHA + 0(SP) + stfs f1, ALPHA + 4(SP) + stfs f1, ALPHA + 8(SP) + stfs f1, ALPHA + 12(SP) + + li r29, 0 + stw r29, FZERO(SP) + + slwi LDC, LDC, BASE_SHIFT + + li PREC, (15 * SIZE) +#ifdef CELL + li PREB, (3 * 32 * SIZE) +#else + li PREB, (5 * 32 * SIZE) +#endif + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + + srawi. J, N, 2 + ble LL(60) + .align 4 + +LL(01): + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + add C, CO4, LDC + + mr AO, A + srawi. I, M, 4 + ble LL(20) + .align 4 + +LL(11): + vxor c01, c01, c01 + LOAD_B b1, OFFSET_0, B + vxor c02, c02, c02 + LOAD_A a1, OFFSET_0, AO + vxor c03, c03, c03 + LOAD_A a2, OFFSET_1, AO + vxor c04, c04, c04 + LOAD_A a3, OFFSET_2, AO + vxor c05, c05, c05 + LOAD_A a4, OFFSET_3, AO + vxor c06, c06, c06 + LOAD_A a5, OFFSET_4, AO + vxor c07, c07, c07 + nop + vxor c08, c08, c08 + + vxor c09, c09, c09 + dcbtst CO1, PREC + vxor c10, c10, c10 + dcbtst CO2, PREC + vxor c11, c11, c11 + dcbtst CO3, PREC + vxor c12, c12, c12 + dcbtst CO4, PREC + vxor c13, c13, c13 + mr BO, B + vxor c14, c14, c14 + srawi. r0, K, 2 + vxor c15, c15, c15 + mtspr CTR, r0 + vxor c16, c16, c16 + vspltw bp1, b1, 0 + ble LL(13) + .align 4 + +#define NOP1 mr r3, r3 +#define NOP2 mr r4, r4 + +LL(12): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + DCBT(A, PREA) + vmaddfp c03, a3, bp1, c03 + NOP1 + vmaddfp c04, a4, bp1, c04 + vspltw bp1, b1, 2 + + vmaddfp c05, a1, bp2, c05 + DCBT(B, PREB) + vmaddfp c06, a2, bp2, c06 + NOP2 + vmaddfp c07, a3, bp2, c07 + NOP1 + vmaddfp c08, a4, bp2, c08 + vspltw bp2, b1, 3 + + vmaddfp c09, a1, bp1, c09 + NOP1 + vmaddfp c10, a2, bp1, c10 + LOAD_B b2, OFFSET_1, BO + vmaddfp c11, a3, bp1, c11 + addi BO, BO, 8 * SIZE + vmaddfp c12, a4, bp1, c12 + vspltw bp1, b2, 0 + + vmaddfp c13, a1, bp2, c13 + NOP1 + vmaddfp c14, a2, bp2, c14 + LOAD_A a5, OFFSET_4, AO + vmaddfp c15, a3, bp2, c15 + LOAD_A a6, OFFSET_5, AO + vmaddfp c16, a4, bp2, c16 + vspltw bp2, b2, 1 + + vmaddfp c01, a5, bp1, c01 + LOAD_A a7, OFFSET_6, AO + vmaddfp c02, a6, bp1, c02 + LOAD_A a8, OFFSET_7, AO + vmaddfp c03, a7, bp1, c03 + NOP1 + vmaddfp c04, a8, bp1, c04 + NOP2 + + vmaddfp c05, a5, bp2, c05 + vspltw bp1, b2, 2 + vmaddfp c06, a6, bp2, c06 + addi AO, AO, 32 * SIZE + vmaddfp c07, a7, bp2, c07 + LOAD_B b1, OFFSET_0, BO + vmaddfp c08, a8, bp2, c08 + vspltw bp2, b2, 3 + + vmaddfp c09, a5, bp1, c09 + NOP1 + vmaddfp c10, a6, bp1, c10 + NOP2 + vmaddfp c11, a7, bp1, c11 + NOP1 + vmaddfp c12, a8, bp1, c12 + vspltw bp1, b1, 0 + + vmaddfp c13, a5, bp2, c13 + DCBT(A, PREA) + vmaddfp c14, a6, bp2, c14 + LOAD_A a1, OFFSET_0, AO + vmaddfp c15, a7, bp2, c15 + LOAD_A a2, OFFSET_1, AO + vmaddfp c16, a8, bp2, c16 + vspltw bp2, b1, 1 + + vmaddfp c01, a1, bp1, c01 + LOAD_A a3, OFFSET_2, AO + vmaddfp c02, a2, bp1, c02 + LOAD_A a4, OFFSET_3, AO + vmaddfp c03, a3, bp1, c03 + NOP1 + vmaddfp c04, a4, bp1, c04 + vspltw bp1, b1, 2 + + vmaddfp c05, a1, bp2, c05 + NOP1 + vmaddfp c06, a2, bp2, c06 + NOP2 + vmaddfp c07, a3, bp2, c07 + NOP1 + vmaddfp c08, a4, bp2, c08 + vspltw bp2, b1, 3 + + vmaddfp c09, a1, bp1, c09 + LOAD_B b2, OFFSET_1, BO + vmaddfp c10, a2, bp1, c10 + NOP2 + vmaddfp c11, 
a3, bp1, c11 + NOP1 + vmaddfp c12, a4, bp1, c12 + addi BO, BO, 8 * SIZE + + vmaddfp c13, a1, bp2, c13 + vspltw bp1, b2, 0 + vmaddfp c14, a2, bp2, c14 + LOAD_A a5, OFFSET_4, AO + vmaddfp c15, a3, bp2, c15 + LOAD_A a6, OFFSET_5, AO + vmaddfp c16, a4, bp2, c16 + vspltw bp2, b2, 1 + + vmaddfp c01, a5, bp1, c01 + LOAD_A a7, OFFSET_6, AO + vmaddfp c02, a6, bp1, c02 + LOAD_A a8, OFFSET_7, AO + vmaddfp c03, a7, bp1, c03 + addi AO, AO, 32 * SIZE + vmaddfp c04, a8, bp1, c04 + NOP2 + + vmaddfp c05, a5, bp2, c05 + vspltw bp1, b2, 2 + vmaddfp c06, a6, bp2, c06 + NOP2 + vmaddfp c07, a7, bp2, c07 + NOP1 + vmaddfp c08, a8, bp2, c08 + LOAD_B b1, OFFSET_0, BO + + vmaddfp c09, a5, bp1, c09 + vspltw bp2, b2, 3 + vmaddfp c10, a6, bp1, c10 + LOAD_A a1, OFFSET_0, AO // + vmaddfp c11, a7, bp1, c11 + LOAD_A a2, OFFSET_1, AO + vmaddfp c12, a8, bp1, c12 + NOP2 + + vmaddfp c13, a5, bp2, c13 + vspltw bp1, b1, 0 + vmaddfp c14, a6, bp2, c14 + LOAD_A a3, OFFSET_2, AO + vmaddfp c15, a7, bp2, c15 + LOAD_A a4, OFFSET_3, AO + vmaddfp c16, a8, bp2, c16 + bdnz+ LL(12) + .align 4 + +LL(13): + andi. r0, K, 2 + nop + nop + ble+ LL(15) + .align 4 + + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + NOP2 + vmaddfp c03, a3, bp1, c03 + NOP1 + vmaddfp c04, a4, bp1, c04 + NOP2 + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + NOP2 + vmaddfp c07, a3, bp2, c07 + NOP1 + vmaddfp c08, a4, bp2, c08 + LOAD_B b2, OFFSET_1, BO + + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c10, a2, bp1, c10 + LOAD_A a5, OFFSET_4, AO + vmaddfp c11, a3, bp1, c11 + LOAD_A a6, OFFSET_5, AO + vmaddfp c12, a4, bp1, c12 + addi BO, BO, 8 * SIZE + + vmaddfp c13, a1, bp2, c13 + vspltw bp1, b2, 0 + vmaddfp c14, a2, bp2, c14 + LOAD_A a7, OFFSET_6, AO + vmaddfp c15, a3, bp2, c15 + LOAD_A a8, OFFSET_7, AO + vmaddfp c16, a4, bp2, c16 + addi AO, AO, 32 * SIZE + + vmaddfp c01, a5, bp1, c01 + vspltw bp2, b2, 1 + vmaddfp c02, a6, bp1, c02 + NOP2 + vmaddfp c03, a7, bp1, c03 + NOP1 + vmaddfp c04, a8, bp1, c04 + NOP2 + + vmaddfp c05, a5, bp2, c05 + vspltw bp1, b2, 2 + vmaddfp c06, a6, bp2, c06 + NOP2 + vmaddfp c07, a7, bp2, c07 + NOP1 + vmaddfp c08, a8, bp2, c08 + LOAD_B b1, OFFSET_0, BO + + vmaddfp c09, a5, bp1, c09 + vspltw bp2, b2, 3 + vmaddfp c10, a6, bp1, c10 + LOAD_A a1, OFFSET_0, AO + vmaddfp c11, a7, bp1, c11 + LOAD_A a2, OFFSET_1, AO + vmaddfp c12, a8, bp1, c12 + NOP2 + + vmaddfp c13, a5, bp2, c13 + vspltw bp1, b1, 0 + vmaddfp c14, a6, bp2, c14 + LOAD_A a3, OFFSET_2, AO + vmaddfp c15, a7, bp2, c15 + LOAD_A a4, OFFSET_3, AO + vmaddfp c16, a8, bp2, c16 + .align 4 + +LL(15): + andi. 
r0, K, 1 + lvx alpha, OFFSET_0, SP + vxor VZERO, VZERO, VZERO + ble+ LL(18) + .align 4 + + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + nop + vmaddfp c03, a3, bp1, c03 + nop + vmaddfp c04, a4, bp1, c04 + nop + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + nop + vmaddfp c07, a3, bp2, c07 + nop + vmaddfp c08, a4, bp2, c08 + nop + + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c10, a2, bp1, c10 + addi AO, AO, 16 * SIZE + vmaddfp c11, a3, bp1, c11 + addi BO, BO, 4 * SIZE + vmaddfp c12, a4, bp1, c12 + nop + + vmaddfp c13, a1, bp2, c13 + vmaddfp c14, a2, bp2, c14 + vmaddfp c15, a3, bp2, c15 + vmaddfp c16, a4, bp2, c16 + .align 4 + +LL(18): + lvx C1, OFFSET_0, CO1 + cmpwi cr0, LDC, 32 * SIZE + lvx C2, OFFSET_1, CO1 + lvsr PERMRSHIFT1, 0, CO1 + lvx C3, OFFSET_2, CO1 + lvsr PERMRSHIFT2, 0, CO2 + lvx C4, OFFSET_3, CO1 + lvsr PERMRSHIFT3, 0, CO3 + lvx C5, OFFSET_4, CO1 + lvsr PERMRSHIFT4, 0, CO4 + ble LL(19) + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, c03, PERMRSHIFT1 + vperm c03, c03, c04, PERMRSHIFT1 + vperm c04, c04, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + lvx C1, OFFSET_0, CO2 + vmaddfp c01, alpha, c01, C2 + lvx C6, OFFSET_1, CO2 + vmaddfp c02, alpha, c02, C3 + lvx C7, OFFSET_2, CO2 + vmaddfp c03, alpha, c03, C4 + lvx C8, OFFSET_3, CO2 + vmaddfp c04, alpha, c04, C5 + lvx C9, OFFSET_4, CO2 + + stvx c00, OFFSET_0, CO1 + vperm c00, VZERO, c05, PERMRSHIFT2 + stvx c01, OFFSET_1, CO1 + vperm c05, c05, c06, PERMRSHIFT2 + stvx c02, OFFSET_2, CO1 + vperm c06, c06, c07, PERMRSHIFT2 + stvx c03, OFFSET_3, CO1 + vperm c07, c07, c08, PERMRSHIFT2 + stvx c04, OFFSET_4, CO1 + vperm c08, c08, VZERO, PERMRSHIFT2 + + vmaddfp c00, alpha, c00, C1 + lvx C1, OFFSET_0, CO3 + vmaddfp c05, alpha, c05, C6 + lvx C2, OFFSET_1, CO3 + vmaddfp c06, alpha, c06, C7 + lvx C3, OFFSET_2, CO3 + vmaddfp c07, alpha, c07, C8 + lvx C4, OFFSET_3, CO3 + vmaddfp c08, alpha, c08, C9 + lvx C5, OFFSET_4, CO3 + + stvx c00, OFFSET_0, CO2 + vperm c00, VZERO, c09, PERMRSHIFT3 + stvx c05, OFFSET_1, CO2 + vperm c09, c09, c10, PERMRSHIFT3 + stvx c06, OFFSET_2, CO2 + vperm c10, c10, c11, PERMRSHIFT3 + stvx c07, OFFSET_3, CO2 + vperm c11, c11, c12, PERMRSHIFT3 + stvx c08, OFFSET_4, CO2 + vperm c12, c12, VZERO, PERMRSHIFT3 + + vmaddfp c00, alpha, c00, C1 + lvx C9, OFFSET_4, CO4 + vmaddfp c09, alpha, c09, C2 + lvx C1, OFFSET_0, CO4 + vmaddfp c10, alpha, c10, C3 + lvx C6, OFFSET_1, CO4 + vmaddfp c11, alpha, c11, C4 + lvx C7, OFFSET_2, CO4 + vmaddfp c12, alpha, c12, C5 + lvx C8, OFFSET_3, CO4 + + stvx c00, OFFSET_0, CO3 + vperm c00, VZERO, c13, PERMRSHIFT4 + stvx c09, OFFSET_1, CO3 + vperm c13, c13, c14, PERMRSHIFT4 + stvx c10, OFFSET_2, CO3 + vperm c14, c14, c15, PERMRSHIFT4 + stvx c11, OFFSET_3, CO3 + vperm c15, c15, c16, PERMRSHIFT4 + stvx c12, OFFSET_4, CO3 + vperm c16, c16, VZERO, PERMRSHIFT4 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c13, alpha, c13, C6 + vmaddfp c14, alpha, c14, C7 + vmaddfp c15, alpha, c15, C8 + vmaddfp c16, alpha, c16, C9 + + stvx c00, OFFSET_0, CO4 + stvx c13, OFFSET_1, CO4 + stvx c14, OFFSET_2, CO4 + stvx c15, OFFSET_3, CO4 + stvx c16, OFFSET_4, CO4 + + addi CO1, CO1, 16 * SIZE + addi CO2, CO2, 16 * SIZE + addi CO3, CO3, 16 * SIZE + addi CO4, CO4, 16 * SIZE + + addic. 
I, I, -1 + bgt+ LL(11) + b LL(20) + .align 4 + +LL(19): + lvx C6, OFFSET_1, CO2 + lvx C7, OFFSET_2, CO2 + lvx C8, OFFSET_3, CO2 + lvx C9, OFFSET_4, CO2 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, c03, PERMRSHIFT1 + vperm c03, c03, c04, PERMRSHIFT1 + vperm c04, c04, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c01, alpha, c01, C2 + lvx C2, OFFSET_1, CO3 + vmaddfp c02, alpha, c02, C3 + lvx C3, OFFSET_2, CO3 + vmaddfp c03, alpha, c03, C4 + lvx C4, OFFSET_3, CO3 + vmaddfp c04, alpha, c04, C5 + lvx C5, OFFSET_4, CO3 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + stvx c03, OFFSET_3, CO1 + stvx c04, OFFSET_4, CO1 + + lvx C1, OFFSET_0, CO2 + + vperm c00, VZERO, c05, PERMRSHIFT2 + vperm c05, c05, c06, PERMRSHIFT2 + vperm c06, c06, c07, PERMRSHIFT2 + vperm c07, c07, c08, PERMRSHIFT2 + vperm c08, c08, VZERO, PERMRSHIFT2 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c05, alpha, c05, C6 + lvx C6, OFFSET_1, CO4 + vmaddfp c06, alpha, c06, C7 + lvx C7, OFFSET_2, CO4 + vmaddfp c07, alpha, c07, C8 + lvx C8, OFFSET_3, CO4 + vmaddfp c08, alpha, c08, C9 + lvx C9, OFFSET_4, CO4 + + stvx c00, OFFSET_0, CO2 + stvx c05, OFFSET_1, CO2 + stvx c06, OFFSET_2, CO2 + stvx c07, OFFSET_3, CO2 + stvx c08, OFFSET_4, CO2 + + lvx C1, OFFSET_0, CO3 + + vperm c00, VZERO, c09, PERMRSHIFT3 + vperm c09, c09, c10, PERMRSHIFT3 + vperm c10, c10, c11, PERMRSHIFT3 + vperm c11, c11, c12, PERMRSHIFT3 + vperm c12, c12, VZERO, PERMRSHIFT3 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c09, alpha, c09, C2 + vmaddfp c10, alpha, c10, C3 + vmaddfp c11, alpha, c11, C4 + vmaddfp c12, alpha, c12, C5 + + stvx c00, OFFSET_0, CO3 + stvx c09, OFFSET_1, CO3 + stvx c10, OFFSET_2, CO3 + stvx c11, OFFSET_3, CO3 + stvx c12, OFFSET_4, CO3 + + lvx C1, OFFSET_0, CO4 + + vperm c00, VZERO, c13, PERMRSHIFT4 + vperm c13, c13, c14, PERMRSHIFT4 + vperm c14, c14, c15, PERMRSHIFT4 + vperm c15, c15, c16, PERMRSHIFT4 + vperm c16, c16, VZERO, PERMRSHIFT4 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c13, alpha, c13, C6 + vmaddfp c14, alpha, c14, C7 + vmaddfp c15, alpha, c15, C8 + vmaddfp c16, alpha, c16, C9 + + stvx c00, OFFSET_0, CO4 + stvx c13, OFFSET_1, CO4 + stvx c14, OFFSET_2, CO4 + stvx c15, OFFSET_3, CO4 + stvx c16, OFFSET_4, CO4 + + addi CO1, CO1, 16 * SIZE + addi CO2, CO2, 16 * SIZE + addi CO3, CO3, 16 * SIZE + addi CO4, CO4, 16 * SIZE + + addic. I, I, -1 + bgt+ LL(11) + .align 4 + +LL(20): + andi. I, M, 8 + ble LL(30) + + vxor c01, c01, c01 + LOAD_A a1, OFFSET_0, AO + vxor c02, c02, c02 + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + LOAD_A a3, OFFSET_2, AO + vxor c06, c06, c06 + LOAD_A a4, OFFSET_3, AO + vxor c09, c09, c09 + LOAD_B b1, OFFSET_0, B + vxor c10, c10, c10 + LOAD_B b2, OFFSET_1, B + vxor c13, c13, c13 + vxor c14, c14, c14 + mr BO, B + vspltw bp1, b1, 0 + + srawi. 
r0, K, 1 + mtspr CTR, r0 + ble LL(25) + .align 4 + +LL(22): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + addi AO, AO, 16 * SIZE + vmaddfp c02, a2, bp1, c02 + addi BO, BO, 8 * SIZE + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + LOAD_B b1, OFFSET_0, BO + vmaddfp c10, a2, bp1, c10 + + vmaddfp c13, a1, bp2, c13 + LOAD_A a1, OFFSET_0, AO + vspltw bp1, b2, 0 + vmaddfp c14, a2, bp2, c14 + LOAD_A a2, OFFSET_1, AO + + vmaddfp c01, a3, bp1, c01 + vspltw bp2, b2, 1 + vmaddfp c02, a4, bp1, c02 + + vmaddfp c05, a3, bp2, c05 + vspltw bp1, b2, 2 + vmaddfp c06, a4, bp2, c06 + + vmaddfp c09, a3, bp1, c09 + vspltw bp2, b2, 3 + LOAD_B b2, OFFSET_1, BO + vmaddfp c10, a4, bp1, c10 + + vmaddfp c13, a3, bp2, c13 + LOAD_A a3, OFFSET_2, AO + vmaddfp c14, a4, bp2, c14 + LOAD_A a4, OFFSET_3, AO + vspltw bp1, b1, 0 + bdnz LL(22) + .align 4 + +LL(25): + andi. r0, K, 1 + lvx alpha, OFFSET_0, SP + vxor VZERO, VZERO, VZERO + ble+ LL(28) + .align 4 + +LL(26): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + nop + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + nop + + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c10, a2, bp1, c10 + addi AO, AO, 8 * SIZE + + vmaddfp c13, a1, bp2, c13 + addi BO, BO, 4 * SIZE + vmaddfp c14, a2, bp2, c14 + nop + .align 4 + +LL(28): + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + lvx C3, OFFSET_2, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + lvsr PERMRSHIFT2, 0, CO2 + lvsr PERMRSHIFT3, 0, CO3 + lvsr PERMRSHIFT4, 0, CO4 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c01, alpha, c01, C2 + vmaddfp c02, alpha, c02, C3 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + + lvx C1, OFFSET_0, CO2 + lvx C2, OFFSET_1, CO2 + lvx C3, OFFSET_2, CO2 + + vperm c00, VZERO, c05, PERMRSHIFT2 + vperm c05, c05, c06, PERMRSHIFT2 + vperm c06, c06, VZERO, PERMRSHIFT2 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c05, alpha, c05, C2 + vmaddfp c06, alpha, c06, C3 + + stvx c00, OFFSET_0, CO2 + stvx c05, OFFSET_1, CO2 + stvx c06, OFFSET_2, CO2 + + lvx C1, OFFSET_0, CO3 + lvx C2, OFFSET_1, CO3 + lvx C3, OFFSET_2, CO3 + + vperm c00, VZERO, c09, PERMRSHIFT3 + vperm c09, c09, c10, PERMRSHIFT3 + vperm c10, c10, VZERO, PERMRSHIFT3 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c09, alpha, c09, C2 + vmaddfp c10, alpha, c10, C3 + + stvx c00, OFFSET_0, CO3 + stvx c09, OFFSET_1, CO3 + stvx c10, OFFSET_2, CO3 + + lvx C1, OFFSET_0, CO4 + lvx C2, OFFSET_1, CO4 + lvx C3, OFFSET_2, CO4 + + vperm c00, VZERO, c13, PERMRSHIFT4 + vperm c13, c13, c14, PERMRSHIFT4 + vperm c14, c14, VZERO, PERMRSHIFT4 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c13, alpha, c13, C2 + vmaddfp c14, alpha, c14, C3 + + stvx c00, OFFSET_0, CO4 + stvx c13, OFFSET_1, CO4 + stvx c14, OFFSET_2, CO4 + + addi CO1, CO1, 8 * SIZE + addi CO2, CO2, 8 * SIZE + addi CO3, CO3, 8 * SIZE + addi CO4, CO4, 8 * SIZE + .align 4 + +LL(30): + andi. I, M, 4 + ble LL(40) + + vxor c01, c01, c01 + LOAD_A a1, OFFSET_0, AO + vxor c02, c02, c02 + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + LOAD_B b1, OFFSET_0, B + vxor c06, c06, c06 + LOAD_B b2, OFFSET_1, B + vxor c09, c09, c09 + vxor c10, c10, c10 + vxor c13, c13, c13 + vxor c14, c14, c14 + + vspltw bp1, b1, 0 + mr BO, B + + srawi. 
r0, K, 1 + mtspr CTR, r0 + ble LL(35) + .align 4 + +LL(32): + vmaddfp c01, a1, bp1, c01 + addi AO, AO, 8 * SIZE + vspltw bp2, b1, 1 + vmaddfp c05, a1, bp2, c05 + addi BO, BO, 8 * SIZE + vspltw bp1, b1, 2 + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c13, a1, bp2, c13 + LOAD_A a1, OFFSET_0, AO + vspltw bp1, b2, 0 + LOAD_B b1, OFFSET_0, BO + + vmaddfp c02, a2, bp1, c02 + vspltw bp2, b2, 1 + vmaddfp c06, a2, bp2, c06 + vspltw bp1, b2, 2 + vmaddfp c10, a2, bp1, c10 + vspltw bp2, b2, 3 + LOAD_B b2, OFFSET_1, BO + vmaddfp c14, a2, bp2, c14 + LOAD_A a2, OFFSET_1, AO + + vspltw bp1, b1, 0 + bdnz LL(32) + .align 4 + +LL(35): + andi. r0, K, 1 + lvx alpha, OFFSET_0, SP + vxor VZERO, VZERO, VZERO + ble+ LL(38) + .align 4 + +LL(36): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c13, a1, bp2, c13 + addi AO, AO, 4 * SIZE + addi BO, BO, 4 * SIZE + .align 4 + +LL(38): + vaddfp c01, c01, c02 + vaddfp c05, c05, c06 + vaddfp c09, c09, c10 + vaddfp c13, c13, c14 + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + lvsr PERMRSHIFT2, 0, CO2 + lvsr PERMRSHIFT3, 0, CO3 + lvsr PERMRSHIFT4, 0, CO4 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c01, alpha, c01, C2 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + + lvx C1, OFFSET_0, CO2 + lvx C2, OFFSET_1, CO2 + + vperm c00, VZERO, c05, PERMRSHIFT2 + vperm c05, c05, VZERO, PERMRSHIFT2 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c05, alpha, c05, C2 + + stvx c00, OFFSET_0, CO2 + stvx c05, OFFSET_1, CO2 + + lvx C1, OFFSET_0, CO3 + lvx C2, OFFSET_1, CO3 + + vperm c00, VZERO, c09, PERMRSHIFT3 + vperm c09, c09, VZERO, PERMRSHIFT3 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c09, alpha, c09, C2 + + stvx c00, OFFSET_0, CO3 + stvx c09, OFFSET_1, CO3 + + lvx C1, OFFSET_0, CO4 + lvx C2, OFFSET_1, CO4 + + vperm c00, VZERO, c13, PERMRSHIFT4 + vperm c13, c13, VZERO, PERMRSHIFT4 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c13, alpha, c13, C2 + + stvx c00, OFFSET_0, CO4 + stvx c13, OFFSET_1, CO4 + + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + addi CO3, CO3, 4 * SIZE + addi CO4, CO4, 4 * SIZE + .align 4 + +LL(40): + andi. I, M, 2 + ble LL(50) + + mr BO, B + + LFD f8, 0 * SIZE(AO) + LFD f9, 1 * SIZE(AO) + + LFD f10, 0 * SIZE(B) + LFD f11, 1 * SIZE(B) + LFD f12, 2 * SIZE(B) + LFD f13, 3 * SIZE(B) + + lfs f0, FZERO(SP) + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(45) + .align 4 + +LL(42): + FMADD f0, f8, f10, f0 + FMADD f2, f8, f11, f2 + FMADD f4, f8, f12, f4 + FMADD f6, f8, f13, f6 + + FMADD f1, f9, f10, f1 + FMADD f3, f9, f11, f3 + FMADD f5, f9, f12, f5 + FMADD f7, f9, f13, f7 + + LFD f8, 2 * SIZE(AO) + LFD f9, 3 * SIZE(AO) + + LFD f10, 4 * SIZE(BO) + LFD f11, 5 * SIZE(BO) + LFD f12, 6 * SIZE(BO) + LFD f13, 7 * SIZE(BO) + + FMADD f0, f8, f10, f0 + FMADD f2, f8, f11, f2 + FMADD f4, f8, f12, f4 + FMADD f6, f8, f13, f6 + + FMADD f1, f9, f10, f1 + FMADD f3, f9, f11, f3 + FMADD f5, f9, f12, f5 + FMADD f7, f9, f13, f7 + + LFD f8, 4 * SIZE(AO) + LFD f9, 5 * SIZE(AO) + + LFD f10, 8 * SIZE(BO) + LFD f11, 9 * SIZE(BO) + LFD f12, 10 * SIZE(BO) + LFD f13, 11 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(42) + .align 4 + +LL(45): + andi. 
r0, K, 1 + ble LL(48) + .align 4 + +LL(46): + FMADD f0, f8, f10, f0 + FMADD f2, f8, f11, f2 + FMADD f4, f8, f12, f4 + FMADD f6, f8, f13, f6 + + FMADD f1, f9, f10, f1 + FMADD f3, f9, f11, f3 + FMADD f5, f9, f12, f5 + FMADD f7, f9, f13, f7 + + LFD f8, 2 * SIZE(AO) + LFD f9, 3 * SIZE(AO) + + LFD f10, 4 * SIZE(BO) + LFD f11, 5 * SIZE(BO) + LFD f12, 6 * SIZE(BO) + LFD f13, 7 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 4 * SIZE + .align 4 + +LL(48): + lfs f13, ALPHA(SP) + + LFD f8, 0 * SIZE(CO1) + LFD f9, 1 * SIZE(CO1) + LFD f10, 0 * SIZE(CO2) + LFD f11, 1 * SIZE(CO2) + + FMADD f0, f0, f13, f8 + FMADD f1, f1, f13, f9 + FMADD f2, f2, f13, f10 + FMADD f3, f3, f13, f11 + + LFD f8, 0 * SIZE(CO3) + LFD f9, 1 * SIZE(CO3) + LFD f10, 0 * SIZE(CO4) + LFD f11, 1 * SIZE(CO4) + + FMADD f4, f4, f13, f8 + FMADD f5, f5, f13, f9 + FMADD f6, f6, f13, f10 + FMADD f7, f7, f13, f11 + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + + STFD f4, 0 * SIZE(CO3) + STFD f5, 1 * SIZE(CO3) + STFD f6, 0 * SIZE(CO4) + STFD f7, 1 * SIZE(CO4) + + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + addi CO3, CO3, 2 * SIZE + addi CO4, CO4, 2 * SIZE + .align 4 + +LL(50): + andi. I, M, 1 + ble LL(59) + + mr BO, B + + LFD f8, 0 * SIZE(AO) + LFD f9, 1 * SIZE(AO) + + LFD f10, 0 * SIZE(B) + LFD f11, 1 * SIZE(B) + LFD f12, 2 * SIZE(B) + LFD f13, 3 * SIZE(B) + + lfs f0, FZERO(SP) + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(55) + .align 4 + +LL(52): + FMADD f0, f8, f10, f0 + FMADD f1, f8, f11, f1 + FMADD f2, f8, f12, f2 + FMADD f3, f8, f13, f3 + + LFD f8, 2 * SIZE(AO) + + LFD f10, 4 * SIZE(BO) + LFD f11, 5 * SIZE(BO) + LFD f12, 6 * SIZE(BO) + LFD f13, 7 * SIZE(BO) + + FMADD f0, f9, f10, f0 + FMADD f1, f9, f11, f1 + FMADD f2, f9, f12, f2 + FMADD f3, f9, f13, f3 + + LFD f9, 3 * SIZE(AO) + + LFD f10, 8 * SIZE(BO) + LFD f11, 9 * SIZE(BO) + LFD f12, 10 * SIZE(BO) + LFD f13, 11 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(52) + .align 4 + +LL(55): + andi. r0, K, 1 + ble LL(58) + .align 4 + +LL(56): + FMADD f0, f8, f10, f0 + FMADD f1, f8, f11, f1 + FMADD f2, f8, f12, f2 + FMADD f3, f8, f13, f3 + + LFD f8, 2 * SIZE(AO) + + LFD f10, 4 * SIZE(BO) + LFD f11, 5 * SIZE(BO) + LFD f12, 6 * SIZE(BO) + LFD f13, 7 * SIZE(BO) + + addi AO, AO, 1 * SIZE + addi BO, BO, 4 * SIZE + .align 4 + +LL(58): + lfs f13, ALPHA(SP) + + LFD f8, 0 * SIZE(CO1) + LFD f9, 0 * SIZE(CO2) + LFD f10, 0 * SIZE(CO3) + LFD f11, 0 * SIZE(CO4) + + FMADD f0, f0, f13, f8 + FMADD f1, f1, f13, f9 + FMADD f2, f2, f13, f10 + FMADD f3, f3, f13, f11 + + STFD f0, 0 * SIZE(CO1) + STFD f1, 0 * SIZE(CO2) + STFD f2, 0 * SIZE(CO3) + STFD f3, 0 * SIZE(CO4) + .align 4 + +LL(59): + mr B, BO + + addic. J, J, -1 + bgt LL(01) + .align 4 + +LL(60): + andi. r0, N, 2 + ble LL(120) + + mr CO1, C + add CO2, C, LDC + add C, CO2, LDC + + mr AO, A + srawi. I, M, 4 + ble LL(80) + .align 4 + +LL(71): + vxor c01, c01, c01 + LOAD_B b1, OFFSET_0, B + vxor c02, c02, c02 + vxor c03, c03, c03 + LOAD_A a1, OFFSET_0, AO + vxor c04, c04, c04 + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + LOAD_A a3, OFFSET_2, AO + vxor c06, c06, c06 + LOAD_A a4, OFFSET_3, AO + vxor c07, c07, c07 + vxor c08, c08, c08 + + mr BO, B + dcbtst CO1, PREC + dcbtst CO2, PREC + + vspltw bp1, b1, 0 + + srawi. 
r0, K, 1 + mtspr CTR, r0 + ble LL(75) + .align 4 + +LL(72): + LOAD_A a5, OFFSET_4, AO + LOAD_A a6, OFFSET_5, AO + LOAD_A a7, OFFSET_6, AO + LOAD_A a8, OFFSET_7, AO + + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + vmaddfp c03, a3, bp1, c03 + vmaddfp c04, a4, bp1, c04 + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + vmaddfp c07, a3, bp2, c07 + vmaddfp c08, a4, bp2, c08 + + vmaddfp c01, a5, bp1, c01 + vspltw bp2, b1, 3 + vmaddfp c02, a6, bp1, c02 + vmaddfp c03, a7, bp1, c03 + vmaddfp c04, a8, bp1, c04 + + LOAD_B b1, OFFSET_1, BO + vspltw bp1, b1, 0 + + vmaddfp c05, a5, bp2, c05 + vmaddfp c06, a6, bp2, c06 + vmaddfp c07, a7, bp2, c07 + vmaddfp c08, a8, bp2, c08 + + addi AO, AO, 32 * SIZE + addi BO, BO, 4 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + bdnz LL(72) + .align 4 + +LL(75): + andi. r0, K, 1 + lvx alpha, OFFSET_0, SP + vxor VZERO, VZERO, VZERO + ble+ LL(78) + .align 4 + +LL(76): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + addi AO, AO, 16 * SIZE + vmaddfp c03, a3, bp1, c03 + addi BO, BO, 2 * SIZE + vmaddfp c04, a4, bp1, c04 + nop + + vmaddfp c05, a1, bp2, c05 + vmaddfp c06, a2, bp2, c06 + vmaddfp c07, a3, bp2, c07 + vmaddfp c08, a4, bp2, c08 + .align 4 + +LL(78): + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + lvx C3, OFFSET_2, CO1 + lvx C4, OFFSET_3, CO1 + lvx C5, OFFSET_4, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + lvsr PERMRSHIFT2, 0, CO2 + lvsr PERMRSHIFT3, 0, CO3 + lvsr PERMRSHIFT4, 0, CO4 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, c03, PERMRSHIFT1 + vperm c03, c03, c04, PERMRSHIFT1 + vperm c04, c04, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c01, alpha, c01, C2 + vmaddfp c02, alpha, c02, C3 + vmaddfp c03, alpha, c03, C4 + vmaddfp c04, alpha, c04, C5 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + stvx c03, OFFSET_3, CO1 + stvx c04, OFFSET_4, CO1 + + lvx C1, OFFSET_0, CO2 + lvx C2, OFFSET_1, CO2 + lvx C3, OFFSET_2, CO2 + lvx C4, OFFSET_3, CO2 + lvx C5, OFFSET_4, CO2 + + vperm c00, VZERO, c05, PERMRSHIFT2 + vperm c05, c05, c06, PERMRSHIFT2 + vperm c06, c06, c07, PERMRSHIFT2 + vperm c07, c07, c08, PERMRSHIFT2 + vperm c08, c08, VZERO, PERMRSHIFT2 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c05, alpha, c05, C2 + vmaddfp c06, alpha, c06, C3 + vmaddfp c07, alpha, c07, C4 + vmaddfp c08, alpha, c08, C5 + + stvx c00, OFFSET_0, CO2 + stvx c05, OFFSET_1, CO2 + stvx c06, OFFSET_2, CO2 + stvx c07, OFFSET_3, CO2 + stvx c08, OFFSET_4, CO2 + + addi CO1, CO1, 16 * SIZE + addi CO2, CO2, 16 * SIZE + addic. I, I, -1 + bgt+ LL(71) + .align 4 + +LL(80): + andi. I, M, 8 + ble LL(90) + + vxor c01, c01, c01 + LOAD_B b1, OFFSET_0, B + vxor c02, c02, c02 + vxor c03, c03, c03 + LOAD_A a1, OFFSET_0, AO + vxor c04, c04, c04 + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + LOAD_A a3, OFFSET_2, AO + vxor c06, c06, c06 + LOAD_A a4, OFFSET_3, AO + vxor c07, c07, c07 + vxor c08, c08, c08 + + mr BO, B + + vspltw bp1, b1, 0 + srawi. 
r0, K, 1 + mtspr CTR, r0 + ble LL(85) + .align 4 + +LL(82): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + + vmaddfp c03, a3, bp1, c03 + vspltw bp2, b1, 3 + vmaddfp c04, a4, bp1, c04 + + LOAD_B b1, OFFSET_1, BO + vspltw bp1, b1, 0 + + vmaddfp c07, a3, bp2, c07 + vmaddfp c08, a4, bp2, c08 + + addi AO, AO, 16 * SIZE + addi BO, BO, 4 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + bdnz LL(82) + .align 4 + +LL(85): + andi. r0, K, 1 + lvx alpha, OFFSET_0, SP + vxor VZERO, VZERO, VZERO + ble+ LL(88) + .align 4 + +LL(86): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + addi AO, AO, 8 * SIZE + vmaddfp c05, a1, bp2, c05 + addi BO, BO, 2 * SIZE + vmaddfp c06, a2, bp2, c06 + .align 4 + +LL(88): + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + lvx C3, OFFSET_2, CO1 + + vaddfp c01, c01, c03 + vaddfp c02, c02, c04 + vaddfp c05, c05, c07 + vaddfp c06, c06, c08 + + lvsr PERMRSHIFT1, 0, CO1 + lvsr PERMRSHIFT2, 0, CO2 + lvsr PERMRSHIFT3, 0, CO3 + lvsr PERMRSHIFT4, 0, CO4 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c01, alpha, c01, C2 + vmaddfp c02, alpha, c02, C3 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + + lvx C1, OFFSET_0, CO2 + lvx C2, OFFSET_1, CO2 + lvx C3, OFFSET_2, CO2 + + vperm c00, VZERO, c05, PERMRSHIFT2 + vperm c05, c05, c06, PERMRSHIFT2 + vperm c06, c06, VZERO, PERMRSHIFT2 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c05, alpha, c05, C2 + vmaddfp c06, alpha, c06, C3 + + stvx c00, OFFSET_0, CO2 + stvx c05, OFFSET_1, CO2 + stvx c06, OFFSET_2, CO2 + + addi CO1, CO1, 8 * SIZE + addi CO2, CO2, 8 * SIZE + .align 4 + +LL(90): + andi. I, M, 4 + ble LL(100) + + vxor c01, c01, c01 + LOAD_B b1, OFFSET_0, B + vxor c02, c02, c02 + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + vxor c06, c06, c06 + + mr BO, B + + vspltw bp1, b1, 0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(95) + .align 4 + +LL(92): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + + vmaddfp c02, a2, bp1, c02 + vspltw bp2, b1, 3 + + LOAD_B b1, OFFSET_1, BO + vspltw bp1, b1, 0 + + vmaddfp c06, a2, bp2, c06 + + addi AO, AO, 8 * SIZE + addi BO, BO, 4 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + bdnz LL(92) + .align 4 + +LL(95): + andi. 
r0, K, 1 + lvx alpha, OFFSET_0, SP + vxor VZERO, VZERO, VZERO + ble+ LL(98) + .align 4 + +LL(96): + vspltw bp2, b1, 1 + vmaddfp c01, a1, bp1, c01 + vmaddfp c05, a1, bp2, c05 + addi AO, AO, 4 * SIZE + addi BO, BO, 2 * SIZE + .align 4 + +LL(98): + vaddfp c01, c01, c02 + vaddfp c05, c05, c06 + vaddfp c09, c09, c10 + vaddfp c13, c13, c14 + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + lvsr PERMRSHIFT2, 0, CO2 + lvsr PERMRSHIFT3, 0, CO3 + lvsr PERMRSHIFT4, 0, CO4 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c01, alpha, c01, C2 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + + lvx C1, OFFSET_0, CO2 + lvx C2, OFFSET_1, CO2 + + vperm c00, VZERO, c05, PERMRSHIFT2 + vperm c05, c05, VZERO, PERMRSHIFT2 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c05, alpha, c05, C2 + + stvx c00, OFFSET_0, CO2 + stvx c05, OFFSET_1, CO2 + + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + .align 4 + +LL(100): + andi. I, M, 2 + ble LL(110) + + mr BO, B + + LFD f8, 0 * SIZE(AO) + LFD f9, 1 * SIZE(AO) + + LFD f10, 0 * SIZE(B) + LFD f11, 1 * SIZE(B) + LFD f12, 2 * SIZE(B) + LFD f13, 3 * SIZE(B) + + lfs f0, FZERO(SP) + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(105) + .align 4 + +LL(102): + FMADD f0, f8, f10, f0 + FMADD f1, f9, f10, f1 + FMADD f2, f8, f11, f2 + FMADD f3, f9, f11, f3 + + LFD f8, 2 * SIZE(AO) + LFD f9, 3 * SIZE(AO) + + FMADD f4, f8, f12, f4 + FMADD f5, f9, f12, f5 + FMADD f6, f8, f13, f6 + FMADD f7, f9, f13, f7 + + LFD f8, 4 * SIZE(AO) + LFD f9, 5 * SIZE(AO) + + LFD f10, 4 * SIZE(BO) + LFD f11, 5 * SIZE(BO) + LFD f12, 6 * SIZE(BO) + LFD f13, 7 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 4 * SIZE + bdnz LL(102) + .align 4 + +LL(105): + andi. r0, K, 1 + lfs f13, ALPHA(SP) + ble LL(108) + .align 4 + +LL(106): + FMADD f0, f8, f10, f0 + FMADD f1, f9, f10, f1 + FMADD f2, f8, f11, f2 + FMADD f3, f9, f11, f3 + + LFD f8, 2 * SIZE(AO) + LFD f9, 3 * SIZE(AO) + + LFD f10, 2 * SIZE(BO) + LFD f11, 3 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 2 * SIZE + .align 4 + +LL(108): + LFD f8, 0 * SIZE(CO1) + LFD f9, 1 * SIZE(CO1) + LFD f10, 0 * SIZE(CO2) + LFD f11, 1 * SIZE(CO2) + + FADD f0, f0, f4 + FADD f1, f1, f5 + FADD f2, f2, f6 + FADD f3, f3, f7 + + FMADD f0, f0, f13, f8 + FMADD f1, f1, f13, f9 + FMADD f2, f2, f13, f10 + FMADD f3, f3, f13, f11 + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + .align 4 + +LL(110): + andi. I, M, 1 + ble LL(119) + + mr BO, B + + LFD f8, 0 * SIZE(AO) + LFD f9, 1 * SIZE(AO) + + LFD f10, 0 * SIZE(B) + LFD f11, 1 * SIZE(B) + LFD f12, 2 * SIZE(B) + LFD f13, 3 * SIZE(B) + + lfs f0, FZERO(SP) + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(115) + .align 4 + +LL(112): + FMADD f0, f8, f10, f0 + FMADD f1, f8, f11, f1 + FMADD f2, f9, f12, f2 + FMADD f3, f9, f13, f3 + + LFD f8, 2 * SIZE(AO) + LFD f9, 3 * SIZE(AO) + + LFD f10, 4 * SIZE(BO) + LFD f11, 5 * SIZE(BO) + LFD f12, 6 * SIZE(BO) + LFD f13, 7 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 4 * SIZE + bdnz LL(112) + .align 4 + +LL(115): + andi. 
r0, K, 1 + lfs f13, ALPHA(SP) + ble LL(118) + .align 4 + +LL(116): + FMADD f0, f8, f10, f0 + FMADD f1, f8, f11, f1 + + LFD f8, 1 * SIZE(AO) + + LFD f10, 2 * SIZE(BO) + LFD f11, 3 * SIZE(BO) + + addi AO, AO, 1 * SIZE + addi BO, BO, 2 * SIZE + .align 4 + +LL(118): + LFD f8, 0 * SIZE(CO1) + LFD f9, 0 * SIZE(CO2) + + FADD f0, f0, f2 + FADD f1, f1, f3 + + FMADD f0, f0, f13, f8 + FMADD f1, f1, f13, f9 + + STFD f0, 0 * SIZE(CO1) + STFD f1, 0 * SIZE(CO2) + .align 4 + +LL(119): + mr B, BO + .align 4 + +LL(120): + andi. r0, N, 1 + ble LL(999) + + mr CO1, C + mr AO, A + srawi. I, M, 4 + ble LL(140) + .align 4 + +LL(130): + vxor c01, c01, c01 + vxor c02, c02, c02 + vxor c03, c03, c03 + vxor c04, c04, c04 + + mr BO, B + + dcbtst CO1, PREC + + mr J, K + + andi. r0, B, 15 + ble+ LL(131) + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + LOAD_B b1, OFFSET_0, BO + vspltw bp1, b1, 2 + vspltw bp2, b1, 3 + + addi AO, AO, 16 * SIZE + addi BO, BO, SIZE + + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + vmaddfp c03, a3, bp1, c03 + vmaddfp c04, a4, bp1, c04 + subi J, J, 1 + cmpwi cr0, J, 0 + ble LL(138) + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + + addi AO, AO, 16 * SIZE + addi BO, BO, SIZE + + vmaddfp c01, a1, bp2, c01 + vmaddfp c02, a2, bp2, c02 + vmaddfp c03, a3, bp2, c03 + vmaddfp c04, a4, bp2, c04 + subi J, J, 1 + cmpwi cr0, J, 0 + ble LL(138) + .align 4 + + +LL(131): + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + LOAD_A a5, OFFSET_4, AO + LOAD_A a6, OFFSET_5, AO + LOAD_A a7, OFFSET_6, AO + LOAD_A a8, OFFSET_7, AO + + LOAD_B b1, OFFSET_0, BO + + srawi. r0, J, 2 + mtspr CTR, r0 + ble LL(135) + .align 4 + +LL(133): + vspltw bp1, b1, 0 + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + vmaddfp c03, a3, bp1, c03 + vmaddfp c04, a4, bp1, c04 + + vspltw bp2, b1, 1 + vmaddfp c01, a5, bp2, c01 + vmaddfp c02, a6, bp2, c02 + vmaddfp c03, a7, bp2, c03 + vmaddfp c04, a8, bp2, c04 + + addi AO, AO, 32 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + + vspltw bp1, b1, 2 + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + vmaddfp c03, a3, bp1, c03 + vmaddfp c04, a4, bp1, c04 + + LOAD_A a5, OFFSET_4, AO + LOAD_A a6, OFFSET_5, AO + LOAD_A a7, OFFSET_6, AO + LOAD_A a8, OFFSET_7, AO + + vspltw bp2, b1, 3 + vmaddfp c01, a5, bp2, c01 + vmaddfp c02, a6, bp2, c02 + vmaddfp c03, a7, bp2, c03 + vmaddfp c04, a8, bp2, c04 + + addi AO, AO, 32 * SIZE + addi BO, BO, 4 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + + LOAD_A a5, OFFSET_4, AO + LOAD_A a6, OFFSET_5, AO + LOAD_A a7, OFFSET_6, AO + LOAD_A a8, OFFSET_7, AO + + LOAD_B b1, OFFSET_0, BO + + bdnz LL(133) + .align 4 + +LL(135): + andi. 
r0, J, 3 + ble+ LL(138) + + cmpwi cr0, r0, 3 + bne LL(136) + + vspltw bp1, b1, 0 + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + vmaddfp c03, a3, bp1, c03 + vmaddfp c04, a4, bp1, c04 + + addi AO, AO, 16 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + + vspltw bp2, b1, 1 + vmaddfp c01, a1, bp2, c01 + vmaddfp c02, a2, bp2, c02 + vmaddfp c03, a3, bp2, c03 + vmaddfp c04, a4, bp2, c04 + + addi AO, AO, 16 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + + vspltw bp1, b1, 2 + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + vmaddfp c03, a3, bp1, c03 + vmaddfp c04, a4, bp1, c04 + + addi AO, AO, 16 * SIZE + addi BO, BO, 3 * SIZE + b LL(138) + .align 4 + +LL(136): + cmpwi cr0, r0, 2 + bne LL(137) + + vspltw bp1, b1, 0 + vspltw bp2, b1, 1 + + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + vmaddfp c03, a3, bp1, c03 + vmaddfp c04, a4, bp1, c04 + + LOAD_A a1, OFFSET_4, AO + LOAD_A a2, OFFSET_5, AO + LOAD_A a3, OFFSET_6, AO + LOAD_A a4, OFFSET_7, AO + + vmaddfp c01, a1, bp2, c01 + vmaddfp c02, a2, bp2, c02 + vmaddfp c03, a3, bp2, c03 + vmaddfp c04, a4, bp2, c04 + + addi AO, AO, 32 * SIZE + addi BO, BO, 2 * SIZE + b LL(138) + .align 4 + +LL(137): + cmpwi cr0, r0, 1 + bne LL(138) + + vspltw bp1, b1, 0 + + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + vmaddfp c03, a3, bp1, c03 + vmaddfp c04, a4, bp1, c04 + + addi AO, AO, 16 * SIZE + addi BO, BO, 1 * SIZE + .align 4 + +LL(138): + lvx alpha, OFFSET_0, SP + vxor VZERO, VZERO, VZERO + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + lvx C3, OFFSET_2, CO1 + lvx C4, OFFSET_3, CO1 + lvx C5, OFFSET_4, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, c03, PERMRSHIFT1 + vperm c03, c03, c04, PERMRSHIFT1 + vperm c04, c04, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c01, alpha, c01, C2 + vmaddfp c02, alpha, c02, C3 + vmaddfp c03, alpha, c03, C4 + vmaddfp c04, alpha, c04, C5 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + stvx c03, OFFSET_3, CO1 + stvx c04, OFFSET_4, CO1 + + addi CO1, CO1, 16 * SIZE + addic. I, I, -1 + bgt+ LL(130) + .align 4 + +LL(140): + andi. I, M, 8 + ble LL(150) + + vxor c01, c01, c01 + vxor c02, c02, c02 + + mr BO, B + + mr J, K + + andi. r0, B, 15 + ble+ LL(141) + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_B b1, OFFSET_0, BO + vspltw bp1, b1, 2 + vspltw bp2, b1, 3 + + addi AO, AO, 8 * SIZE + addi BO, BO, SIZE + + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + subi J, J, 1 + cmpwi cr0, J, 0 + ble LL(148) + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + + addi AO, AO, 8 * SIZE + addi BO, BO, SIZE + + vmaddfp c01, a1, bp2, c01 + vmaddfp c02, a2, bp2, c02 + subi J, J, 1 + cmpwi cr0, J, 0 + ble LL(148) + .align 4 + + +LL(141): + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + LOAD_A a5, OFFSET_4, AO + LOAD_A a6, OFFSET_5, AO + LOAD_A a7, OFFSET_6, AO + LOAD_A a8, OFFSET_7, AO + + LOAD_B b1, OFFSET_0, BO + + srawi. 
r0, J, 2 + mtspr CTR, r0 + ble LL(145) + .align 4 + +LL(143): + vspltw bp1, b1, 0 + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + + vspltw bp2, b1, 1 + vmaddfp c01, a3, bp2, c01 + vmaddfp c02, a4, bp2, c02 + + vspltw bp1, b1, 2 + vmaddfp c01, a5, bp1, c01 + vmaddfp c02, a6, bp1, c02 + + vspltw bp2, b1, 3 + vmaddfp c01, a7, bp2, c01 + vmaddfp c02, a8, bp2, c02 + + addi AO, AO, 32 * SIZE + addi BO, BO, 4 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + + LOAD_A a5, OFFSET_4, AO + LOAD_A a6, OFFSET_5, AO + LOAD_A a7, OFFSET_6, AO + LOAD_A a8, OFFSET_7, AO + + LOAD_B b1, OFFSET_0, BO + + bdnz LL(143) + .align 4 + +LL(145): + andi. r0, J, 3 + ble+ LL(148) + + cmpwi cr0, r0, 3 + bne LL(146) + + vspltw bp1, b1, 0 + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + + vspltw bp2, b1, 1 + vmaddfp c01, a3, bp2, c01 + vmaddfp c02, a4, bp2, c02 + + LOAD_A a1, OFFSET_4, AO + LOAD_A a2, OFFSET_5, AO + + vspltw bp1, b1, 2 + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + + + addi AO, AO, 24 * SIZE + addi BO, BO, 3 * SIZE + b LL(148) + .align 4 + +LL(146): + cmpwi cr0, r0, 2 + bne LL(147) + + vspltw bp1, b1, 0 + vspltw bp2, b1, 1 + + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + + vmaddfp c01, a3, bp2, c01 + vmaddfp c02, a4, bp2, c02 + + addi AO, AO, 16 * SIZE + addi BO, BO, 2 * SIZE + b LL(148) + .align 4 + +LL(147): + cmpwi cr0, r0, 1 + bne LL(148) + + vspltw bp1, b1, 0 + + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + + addi AO, AO, 8 * SIZE + addi BO, BO, 1 * SIZE + .align 4 + +LL(148): + lvx alpha, OFFSET_0, SP + vxor VZERO, VZERO, VZERO + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + lvx C3, OFFSET_2, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c01, alpha, c01, C2 + vmaddfp c02, alpha, c02, C3 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + addi CO1, CO1, 8 * SIZE + .align 4 + +LL(150): + andi. I, M, 4 + ble LL(160) + + vxor c01, c01, c01 + + mr BO, B + + mr J, K + + andi. r0, B, 15 + ble+ LL(151) + + LOAD_A a1, OFFSET_0, AO + LOAD_B b1, OFFSET_0, BO + vspltw bp1, b1, 2 + vspltw bp2, b1, 3 + + addi AO, AO, 4 * SIZE + addi BO, BO, SIZE + + vmaddfp c01, a1, bp1, c01 + subi J, J, 1 + cmpwi cr0, J, 0 + ble LL(158) + + LOAD_A a1, OFFSET_0, AO + addi AO, AO, 4 * SIZE + addi BO, BO, SIZE + + vmaddfp c01, a1, bp2, c01 + subi J, J, 1 + cmpwi cr0, J, 0 + ble LL(158) + .align 4 + + +LL(151): + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + LOAD_B b1, OFFSET_0, BO + + srawi. r0, J, 2 + mtspr CTR, r0 + ble LL(155) + .align 4 + +LL(153): + vspltw bp1, b1, 0 + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c01, a2, bp2, c01 + vspltw bp1, b1, 2 + vmaddfp c01, a3, bp1, c01 + vspltw bp2, b1, 3 + vmaddfp c01, a4, bp2, c01 + + addi AO, AO, 16 * SIZE + addi BO, BO, 4 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + + LOAD_B b1, OFFSET_0, BO + + bdnz LL(153) + .align 4 + +LL(155): + andi. 
r0, J, 3 + ble+ LL(158) + + cmpwi cr0, r0, 3 + bne LL(156) + + vspltw bp1, b1, 0 + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c01, a2, bp2, c01 + vspltw bp1, b1, 2 + vmaddfp c01, a3, bp1, c01 + + addi AO, AO, 12 * SIZE + addi BO, BO, 3 * SIZE + b LL(158) + .align 4 + +LL(156): + cmpwi cr0, r0, 2 + bne LL(157) + + vspltw bp1, b1, 0 + vspltw bp2, b1, 1 + + vmaddfp c01, a1, bp1, c01 + vmaddfp c01, a2, bp2, c01 + + addi AO, AO, 8 * SIZE + addi BO, BO, 2 * SIZE + b LL(158) + .align 4 + +LL(157): + cmpwi cr0, r0, 1 + bne LL(158) + + vspltw bp1, b1, 0 + + vmaddfp c01, a1, bp1, c01 + + addi AO, AO, 4 * SIZE + addi BO, BO, 1 * SIZE + .align 4 + +LL(158): + lvx alpha, OFFSET_0, SP + vxor VZERO, VZERO, VZERO + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c01, alpha, c01, C2 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + addi CO1, CO1, 4 * SIZE + .align 4 + +LL(160): + andi. I, M, 2 + ble LL(170) + + mr BO, B + + LFD f8, 0 * SIZE(AO) + LFD f9, 1 * SIZE(AO) + LFD f10, 2 * SIZE(AO) + LFD f11, 3 * SIZE(AO) + + LFD f12, 0 * SIZE(B) + LFD f13, 1 * SIZE(B) + + lfs f0, FZERO(SP) + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(165) + .align 4 + +LL(162): + FMADD f0, f8, f12, f0 + FMADD f1, f9, f12, f1 + FMADD f2, f10, f13, f2 + FMADD f3, f11, f13, f3 + + LFD f8, 4 * SIZE(AO) + LFD f9, 5 * SIZE(AO) + LFD f10, 6 * SIZE(AO) + LFD f11, 7 * SIZE(AO) + + LFD f12, 2 * SIZE(BO) + LFD f13, 3 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 2 * SIZE + bdnz LL(162) + .align 4 + +LL(165): + andi. r0, K, 1 + lfs f13, ALPHA(SP) + ble LL(168) + .align 4 + +LL(166): + FMADD f0, f8, f12, f0 + FMADD f1, f9, f12, f1 + + addi AO, AO, 2 * SIZE + addi BO, BO, 1 * SIZE + .align 4 + +LL(168): + LFD f8, 0 * SIZE(CO1) + LFD f9, 1 * SIZE(CO1) + + FADD f0, f0, f2 + FADD f1, f1, f3 + + FMADD f0, f0, f13, f8 + FMADD f1, f1, f13, f9 + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + + addi CO1, CO1, 2 * SIZE + .align 4 + +LL(170): + andi. I, M, 1 + ble LL(999) + + mr BO, B + + LFD f8, 0 * SIZE(AO) + LFD f9, 1 * SIZE(AO) + + LFD f10, 0 * SIZE(B) + LFD f11, 1 * SIZE(B) + + lfs f0, FZERO(SP) + fmr f1, f0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(175) + .align 4 + +LL(172): + FMADD f0, f8, f10, f0 + FMADD f1, f9, f11, f1 + + LFD f8, 2 * SIZE(AO) + LFD f9, 3 * SIZE(AO) + LFD f10, 2 * SIZE(BO) + LFD f11, 3 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 2 * SIZE + bdnz LL(172) + .align 4 + +LL(175): + andi. 
r0, K, 1 + lfs f13, ALPHA(SP) + ble LL(178) + .align 4 + +LL(176): + FMADD f0, f8, f10, f0 + + addi AO, AO, 1 * SIZE + addi BO, BO, 1 * SIZE + .align 4 + +LL(178): + LFD f8, 0 * SIZE(CO1) + + FADD f0, f0, f1 + + FMADD f0, f0, f13, f8 + + STFD f0, 0 * SIZE(CO1) + .align 4 + +LL(999): + mr SP, STACK + + li r0, 0 * 16 + lvx v20, SP, r0 + li r0, 1 * 16 + lvx v21, SP, r0 + li r0, 2 * 16 + lvx v22, SP, r0 + li r0, 3 * 16 + lvx v23, SP, r0 + li r0, 4 * 16 + lvx v24, SP, r0 + li r0, 5 * 16 + lvx v25, SP, r0 + li r0, 6 * 16 + lvx v26, SP, r0 + li r0, 7 * 16 + lvx v27, SP, r0 + li r0, 8 * 16 + lvx v28, SP, r0 + li r0, 9 * 16 + lvx v29, SP, r0 + li r0, 10 * 16 + lvx v30, SP, r0 + li r0, 11 * 16 + lvx v31, SP, r0 + + mtspr VRsave, VREG + +#ifdef __64BIT__ + ld r31, 192(SP) + ld r30, 200(SP) + ld r29, 208(SP) + ld r28, 216(SP) + ld r27, 224(SP) + ld r26, 232(SP) + ld r25, 240(SP) + ld r24, 248(SP) + ld r23, 256(SP) + ld r22, 264(SP) + ld r21, 272(SP) + ld r20, 280(SP) + ld r19, 288(SP) + ld r18, 296(SP) + ld r17, 304(SP) + ld r16, 312(SP) + ld r15, 320(SP) + ld r14, 328(SP) +#else + lwz r31, 192(SP) + lwz r30, 196(SP) + lwz r29, 200(SP) + lwz r28, 204(SP) + lwz r27, 208(SP) + lwz r26, 212(SP) + lwz r25, 216(SP) + lwz r24, 220(SP) + lwz r23, 224(SP) + lwz r22, 228(SP) + lwz r21, 232(SP) + lwz r20, 236(SP) + lwz r19, 240(SP) + lwz r18, 244(SP) + lwz r17, 248(SP) + lwz r16, 252(SP) + lwz r15, 256(SP) + lwz r14, 260(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/gemm_kernel_altivec_cell.S b/kernel/power/gemm_kernel_altivec_cell.S new file mode 100644 index 0000000000..010ed39459 --- /dev/null +++ b/kernel/power/gemm_kernel_altivec_cell.S @@ -0,0 +1,2711 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 360 +#else +#define STACKSIZE 272 +#endif + +#define ALPHA 0 +#define FZERO 16 + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#endif +#endif + +#define STACK r11 + +#define I r21 +#define J r22 +#define AO r23 +#define BO r24 +#define CO1 r25 +#define CO2 r26 +#define CO3 r27 +#define CO4 r28 + +#define PREA r29 +#define PREB r29 +#define PREC r30 +#define VREG r31 + +#define LOAD_A lvx +#define LOAD_B lvx + +#define OFFSET_0 0 +#define OFFSET_1 r14 +#define OFFSET_2 r15 +#define OFFSET_3 r16 +#define OFFSET_4 r17 +#define OFFSET_5 r18 +#define OFFSET_6 r19 +#define OFFSET_7 r20 + +#define c01 v0 +#define c02 v1 +#define c03 v2 +#define c04 v3 +#define c05 v4 +#define c06 v5 +#define c07 v6 +#define c08 v7 +#define c09 v8 +#define c10 v9 +#define c11 v10 +#define c12 v11 +#define c13 v12 +#define c14 v13 +#define c15 v14 +#define c16 v15 + +#define a1 v16 +#define a2 v17 +#define a3 v18 +#define a4 v19 +#define a5 v20 +#define a6 v21 +#define a7 v22 +#define a8 v23 + +#define b1 v24 +#define b2 v25 +#define bp1 v26 +#define bp2 v27 + +#define C1 v16 +#define C2 v17 +#define C3 v18 +#define C4 v19 +#define C5 v20 +#define C6 v21 +#define C7 v22 +#define C8 v23 +#define C9 v24 + +#define c00 v25 + +#define PERMRSHIFT1 v26 +#define PERMRSHIFT2 v27 +#define PERMRSHIFT3 v28 +#define PERMRSHIFT4 v29 + +#define VZERO v30 +#define alpha v31 + +#ifndef NEEDPARAM + +#ifndef DOUBLE +#include "../sparam.h" +#else +#include "../dparam.h" +#endif + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + mr STACK, SP + + li r0, 0 * 16 + stvx v20, SP, r0 + li r0, 1 * 16 + stvx v21, SP, r0 + li r0, 2 * 16 + stvx v22, SP, r0 + li r0, 3 * 16 + stvx v23, SP, r0 + li r0, 4 * 16 + stvx v24, SP, r0 + li r0, 5 * 16 + stvx v25, SP, r0 + li r0, 6 * 16 + stvx v26, SP, r0 + li r0, 7 * 16 + stvx v27, SP, r0 + li r0, 8 * 16 + stvx v28, SP, r0 + li r0, 9 * 16 + stvx v29, SP, r0 + li r0, 10 * 16 + stvx v30, SP, r0 + li r0, 11 * 16 + stvx v31, SP, r0 + +#ifdef __64BIT__ + std r31, 192(SP) + std r30, 200(SP) + std r29, 208(SP) + std r28, 216(SP) + std r27, 224(SP) + std r26, 232(SP) + std r25, 240(SP) + std r24, 248(SP) + std r23, 256(SP) + std r22, 264(SP) + std r21, 272(SP) + std r20, 280(SP) + std r19, 288(SP) + std r18, 296(SP) + std r17, 304(SP) + std r16, 312(SP) + std r15, 320(SP) + std r14, 328(SP) +#else + stw r31, 192(SP) + stw r30, 196(SP) + stw r29, 200(SP) + stw r28, 204(SP) + stw r27, 208(SP) + stw r26, 212(SP) + stw r25, 216(SP) + stw r24, 220(SP) + stw r23, 224(SP) + stw r22, 228(SP) + stw r21, 232(SP) + stw r20, 236(SP) + stw r19, 240(SP) + stw r18, 244(SP) + stw r17, 248(SP) + stw r16, 252(SP) + stw r15, 256(SP) + stw r14, 260(SP) +#endif + + +#if 
defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif + + li r0, -1 + + mfspr VREG, VRsave + mtspr VRsave, r0 + + addi SP, SP, -128 + li r0, -128 + and SP, SP, r0 + + li OFFSET_1, 4 * SIZE + li OFFSET_2, 8 * SIZE + li OFFSET_3, 12 * SIZE + li OFFSET_4, 16 * SIZE + li OFFSET_5, 20 * SIZE + li OFFSET_6, 24 * SIZE + li OFFSET_7, 28 * SIZE + + stfs f1, ALPHA + 0(SP) + stfs f1, ALPHA + 4(SP) + stfs f1, ALPHA + 8(SP) + stfs f1, ALPHA + 12(SP) + + li r29, 0 + stw r29, FZERO(SP) + + slwi LDC, LDC, BASE_SHIFT + + li PREC, (15 * SIZE) +#ifdef CELL + li PREB, (5 * 32 * SIZE) +#else + li PREB, (5 * 32 * SIZE) +#endif + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + + srawi. J, N, 2 + ble LL(60) + .align 4 + +LL(01): + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + add C, CO4, LDC + + mr AO, A + srawi. I, M, 4 + ble LL(20) + .align 4 + +LL(11): + vxor c01, c01, c01 + LOAD_B b1, OFFSET_0, B + vxor c02, c02, c02 + LOAD_A a1, OFFSET_0, AO + vxor c03, c03, c03 + LOAD_A a2, OFFSET_1, AO + vxor c04, c04, c04 + LOAD_A a3, OFFSET_2, AO + + vxor c05, c05, c05 + vxor c06, c06, c06 + vxor c07, c07, c07 + vxor c08, c08, c08 + + vxor c09, c09, c09 + dcbtst CO1, PREC + vxor c10, c10, c10 + dcbtst CO2, PREC + vxor c11, c11, c11 + dcbtst CO3, PREC + vxor c12, c12, c12 + dcbtst CO4, PREC + vxor c13, c13, c13 + mr BO, B + vxor c14, c14, c14 + srawi. r0, K, 2 + vxor c15, c15, c15 + mtspr CTR, r0 + vxor c16, c16, c16 + vspltw bp1, b1, 0 + ble LL(13) + .align 4 + +#define NOP1 mr r3, r3 +#define NOP2 mr r4, r4 + +LL(12): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + LOAD_A a4, OFFSET_3, AO + vmaddfp c03, a3, bp1, c03 + dcbt AO, PREA + vmaddfp c04, a4, bp1, c04 + NOP2 + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + NOP2 + vmaddfp c07, a3, bp2, c07 + NOP1 + vmaddfp c08, a4, bp2, c08 + dcbt BO, PREB + + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c10, a2, bp1, c10 + LOAD_B b2, OFFSET_1, BO + vmaddfp c11, a3, bp1, c11 + addi BO, BO, 8 * SIZE + vmaddfp c12, a4, bp1, c12 + NOP1 + + vmaddfp c13, a1, bp2, c13 + vspltw bp1, b2, 0 + vmaddfp c14, a2, bp2, c14 + LOAD_A a5, OFFSET_4, AO + vmaddfp c15, a3, bp2, c15 + LOAD_A a6, OFFSET_5, AO + vmaddfp c16, a4, bp2, c16 + vspltw bp2, b2, 1 + + vmaddfp c01, a5, bp1, c01 + LOAD_A a7, OFFSET_6, AO + vmaddfp c02, a6, bp1, c02 + LOAD_A a8, OFFSET_7, AO + vmaddfp c03, a7, bp1, c03 + NOP1 + vmaddfp c04, a8, bp1, c04 + NOP2 + + vmaddfp c05, a5, bp2, c05 + vspltw bp1, b2, 2 + vmaddfp c06, a6, bp2, c06 + addi AO, AO, 32 * SIZE + vmaddfp c07, a7, bp2, c07 + LOAD_B b1, OFFSET_0, BO + vmaddfp c08, a8, bp2, c08 + NOP1 + + vmaddfp c09, a5, bp1, c09 + vspltw bp2, b2, 3 + vmaddfp c10, a6, bp1, c10 + NOP2 + vmaddfp c11, a7, bp1, c11 + NOP1 + vmaddfp c12, a8, bp1, c12 + dcbt AO, PREA + + vmaddfp c13, a5, bp2, c13 + vspltw bp1, b1, 0 + vmaddfp c14, a6, bp2, c14 + LOAD_A a1, OFFSET_0, AO // + vmaddfp c15, a7, bp2, c15 + LOAD_A a2, OFFSET_1, AO + vmaddfp c16, a8, bp2, c16 + vspltw bp2, b1, 1 + + vmaddfp c01, a1, bp1, c01 + LOAD_A a3, OFFSET_2, AO + vmaddfp c02, a2, bp1, c02 + LOAD_A a4, OFFSET_3, AO + vmaddfp c03, a3, bp1, c03 + NOP1 + vmaddfp c04, a4, bp1, c04 + NOP2 + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + NOP2 + vmaddfp c07, a3, bp2, c07 + NOP1 + vmaddfp c08, a4, bp2, c08 + LOAD_B b2, OFFSET_1, BO + + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + 
vmaddfp c10, a2, bp1, c10 + NOP2 + vmaddfp c11, a3, bp1, c11 + NOP1 + vmaddfp c12, a4, bp1, c12 + addi BO, BO, 8 * SIZE + + vmaddfp c13, a1, bp2, c13 + vspltw bp1, b2, 0 + vmaddfp c14, a2, bp2, c14 + LOAD_A a5, OFFSET_4, AO + vmaddfp c15, a3, bp2, c15 + LOAD_A a6, OFFSET_5, AO + vmaddfp c16, a4, bp2, c16 + vspltw bp2, b2, 1 + + vmaddfp c01, a5, bp1, c01 + LOAD_A a7, OFFSET_6, AO + vmaddfp c02, a6, bp1, c02 + LOAD_A a8, OFFSET_7, AO + vmaddfp c03, a7, bp1, c03 + addi AO, AO, 32 * SIZE + vmaddfp c04, a8, bp1, c04 + NOP2 + + vmaddfp c05, a5, bp2, c05 + vspltw bp1, b2, 2 + vmaddfp c06, a6, bp2, c06 + NOP2 + vmaddfp c07, a7, bp2, c07 + NOP1 + vmaddfp c08, a8, bp2, c08 + LOAD_B b1, OFFSET_0, BO + + vmaddfp c09, a5, bp1, c09 + vspltw bp2, b2, 3 + vmaddfp c10, a6, bp1, c10 + LOAD_A a1, OFFSET_0, AO // + vmaddfp c11, a7, bp1, c11 + NOP2 + vmaddfp c12, a8, bp1, c12 + vspltw bp1, b1, 0 + + vmaddfp c13, a5, bp2, c13 + LOAD_A a2, OFFSET_1, AO + vmaddfp c14, a6, bp2, c14 + LOAD_A a3, OFFSET_2, AO + vmaddfp c15, a7, bp2, c15 + NOP1 + vmaddfp c16, a8, bp2, c16 + bdnz+ LL(12) + .align 4 + +LL(13): + andi. r0, K, 2 + nop + nop + ble+ LL(15) + .align 4 + + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + LOAD_A a4, OFFSET_3, AO + vmaddfp c03, a3, bp1, c03 + NOP1 + vmaddfp c04, a4, bp1, c04 + NOP2 + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + NOP2 + vmaddfp c07, a3, bp2, c07 + NOP1 + vmaddfp c08, a4, bp2, c08 + LOAD_B b2, OFFSET_1, BO + + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c10, a2, bp1, c10 + LOAD_A a5, OFFSET_4, AO + vmaddfp c11, a3, bp1, c11 + LOAD_A a6, OFFSET_5, AO + vmaddfp c12, a4, bp1, c12 + addi BO, BO, 8 * SIZE + + vmaddfp c13, a1, bp2, c13 + vspltw bp1, b2, 0 + vmaddfp c14, a2, bp2, c14 + LOAD_A a7, OFFSET_6, AO + vmaddfp c15, a3, bp2, c15 + LOAD_A a8, OFFSET_7, AO + vmaddfp c16, a4, bp2, c16 + addi AO, AO, 32 * SIZE + + vmaddfp c01, a5, bp1, c01 + vspltw bp2, b2, 1 + vmaddfp c02, a6, bp1, c02 + NOP2 + vmaddfp c03, a7, bp1, c03 + NOP1 + vmaddfp c04, a8, bp1, c04 + NOP2 + + vmaddfp c05, a5, bp2, c05 + vspltw bp1, b2, 2 + vmaddfp c06, a6, bp2, c06 + NOP2 + vmaddfp c07, a7, bp2, c07 + NOP1 + vmaddfp c08, a8, bp2, c08 + LOAD_B b1, OFFSET_0, BO + + vmaddfp c09, a5, bp1, c09 + vspltw bp2, b2, 3 + vmaddfp c10, a6, bp1, c10 + LOAD_A a1, OFFSET_0, AO + vmaddfp c11, a7, bp1, c11 + LOAD_A a2, OFFSET_1, AO + vmaddfp c12, a8, bp1, c12 + NOP2 + + vmaddfp c13, a5, bp2, c13 + vspltw bp1, b1, 0 + vmaddfp c14, a6, bp2, c14 + LOAD_A a3, OFFSET_2, AO + vmaddfp c15, a7, bp2, c15 + vmaddfp c16, a8, bp2, c16 + .align 4 + +LL(15): + andi. 
r0, K, 1 + lvx alpha, OFFSET_0, SP + vxor VZERO, VZERO, VZERO + ble+ LL(18) + .align 4 + + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + LOAD_A a4, OFFSET_3, AO + vmaddfp c03, a3, bp1, c03 + nop + vmaddfp c04, a4, bp1, c04 + nop + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + nop + vmaddfp c07, a3, bp2, c07 + nop + vmaddfp c08, a4, bp2, c08 + nop + + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c10, a2, bp1, c10 + addi AO, AO, 16 * SIZE + vmaddfp c11, a3, bp1, c11 + addi BO, BO, 4 * SIZE + vmaddfp c12, a4, bp1, c12 + nop + + vmaddfp c13, a1, bp2, c13 + vmaddfp c14, a2, bp2, c14 + vmaddfp c15, a3, bp2, c15 + vmaddfp c16, a4, bp2, c16 + .align 4 + +LL(18): + lvx C1, OFFSET_0, CO1 + cmpwi cr0, LDC, 32 * SIZE + lvx C2, OFFSET_1, CO1 + lvsr PERMRSHIFT1, 0, CO1 + lvx C3, OFFSET_2, CO1 + lvsr PERMRSHIFT2, 0, CO2 + lvx C4, OFFSET_3, CO1 + lvsr PERMRSHIFT3, 0, CO3 + lvx C5, OFFSET_4, CO1 + lvsr PERMRSHIFT4, 0, CO4 + ble LL(19) + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, c03, PERMRSHIFT1 + vperm c03, c03, c04, PERMRSHIFT1 + vperm c04, c04, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + lvx C1, OFFSET_0, CO2 + vmaddfp c01, alpha, c01, C2 + lvx C6, OFFSET_1, CO2 + vmaddfp c02, alpha, c02, C3 + lvx C7, OFFSET_2, CO2 + vmaddfp c03, alpha, c03, C4 + lvx C8, OFFSET_3, CO2 + vmaddfp c04, alpha, c04, C5 + lvx C9, OFFSET_4, CO2 + + stvx c00, OFFSET_0, CO1 + vperm c00, VZERO, c05, PERMRSHIFT2 + stvx c01, OFFSET_1, CO1 + vperm c05, c05, c06, PERMRSHIFT2 + stvx c02, OFFSET_2, CO1 + vperm c06, c06, c07, PERMRSHIFT2 + stvx c03, OFFSET_3, CO1 + vperm c07, c07, c08, PERMRSHIFT2 + stvx c04, OFFSET_4, CO1 + vperm c08, c08, VZERO, PERMRSHIFT2 + + vmaddfp c00, alpha, c00, C1 + lvx C1, OFFSET_0, CO3 + vmaddfp c05, alpha, c05, C6 + lvx C2, OFFSET_1, CO3 + vmaddfp c06, alpha, c06, C7 + lvx C3, OFFSET_2, CO3 + vmaddfp c07, alpha, c07, C8 + lvx C4, OFFSET_3, CO3 + vmaddfp c08, alpha, c08, C9 + lvx C5, OFFSET_4, CO3 + + stvx c00, OFFSET_0, CO2 + vperm c00, VZERO, c09, PERMRSHIFT3 + stvx c05, OFFSET_1, CO2 + vperm c09, c09, c10, PERMRSHIFT3 + stvx c06, OFFSET_2, CO2 + vperm c10, c10, c11, PERMRSHIFT3 + stvx c07, OFFSET_3, CO2 + vperm c11, c11, c12, PERMRSHIFT3 + stvx c08, OFFSET_4, CO2 + vperm c12, c12, VZERO, PERMRSHIFT3 + + vmaddfp c00, alpha, c00, C1 + lvx C9, OFFSET_4, CO4 + vmaddfp c09, alpha, c09, C2 + lvx C1, OFFSET_0, CO4 + vmaddfp c10, alpha, c10, C3 + lvx C6, OFFSET_1, CO4 + vmaddfp c11, alpha, c11, C4 + lvx C7, OFFSET_2, CO4 + vmaddfp c12, alpha, c12, C5 + lvx C8, OFFSET_3, CO4 + + stvx c00, OFFSET_0, CO3 + vperm c00, VZERO, c13, PERMRSHIFT4 + stvx c09, OFFSET_1, CO3 + vperm c13, c13, c14, PERMRSHIFT4 + stvx c10, OFFSET_2, CO3 + vperm c14, c14, c15, PERMRSHIFT4 + stvx c11, OFFSET_3, CO3 + vperm c15, c15, c16, PERMRSHIFT4 + stvx c12, OFFSET_4, CO3 + vperm c16, c16, VZERO, PERMRSHIFT4 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c13, alpha, c13, C6 + vmaddfp c14, alpha, c14, C7 + vmaddfp c15, alpha, c15, C8 + vmaddfp c16, alpha, c16, C9 + + stvx c00, OFFSET_0, CO4 + stvx c13, OFFSET_1, CO4 + stvx c14, OFFSET_2, CO4 + stvx c15, OFFSET_3, CO4 + stvx c16, OFFSET_4, CO4 + + addi CO1, CO1, 16 * SIZE + addi CO2, CO2, 16 * SIZE + addi CO3, CO3, 16 * SIZE + addi CO4, CO4, 16 * SIZE + + addic. 
I, I, -1 + bgt+ LL(11) + b LL(20) + .align 4 + +LL(19): + lvx C6, OFFSET_1, CO2 + lvx C7, OFFSET_2, CO2 + lvx C8, OFFSET_3, CO2 + lvx C9, OFFSET_4, CO2 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, c03, PERMRSHIFT1 + vperm c03, c03, c04, PERMRSHIFT1 + vperm c04, c04, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c01, alpha, c01, C2 + lvx C2, OFFSET_1, CO3 + vmaddfp c02, alpha, c02, C3 + lvx C3, OFFSET_2, CO3 + vmaddfp c03, alpha, c03, C4 + lvx C4, OFFSET_3, CO3 + vmaddfp c04, alpha, c04, C5 + lvx C5, OFFSET_4, CO3 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + stvx c03, OFFSET_3, CO1 + stvx c04, OFFSET_4, CO1 + + lvx C1, OFFSET_0, CO2 + + vperm c00, VZERO, c05, PERMRSHIFT2 + vperm c05, c05, c06, PERMRSHIFT2 + vperm c06, c06, c07, PERMRSHIFT2 + vperm c07, c07, c08, PERMRSHIFT2 + vperm c08, c08, VZERO, PERMRSHIFT2 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c05, alpha, c05, C6 + lvx C6, OFFSET_1, CO4 + vmaddfp c06, alpha, c06, C7 + lvx C7, OFFSET_2, CO4 + vmaddfp c07, alpha, c07, C8 + lvx C8, OFFSET_3, CO4 + vmaddfp c08, alpha, c08, C9 + lvx C9, OFFSET_4, CO4 + + stvx c00, OFFSET_0, CO2 + stvx c05, OFFSET_1, CO2 + stvx c06, OFFSET_2, CO2 + stvx c07, OFFSET_3, CO2 + stvx c08, OFFSET_4, CO2 + + lvx C1, OFFSET_0, CO3 + + vperm c00, VZERO, c09, PERMRSHIFT3 + vperm c09, c09, c10, PERMRSHIFT3 + vperm c10, c10, c11, PERMRSHIFT3 + vperm c11, c11, c12, PERMRSHIFT3 + vperm c12, c12, VZERO, PERMRSHIFT3 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c09, alpha, c09, C2 + vmaddfp c10, alpha, c10, C3 + vmaddfp c11, alpha, c11, C4 + vmaddfp c12, alpha, c12, C5 + + stvx c00, OFFSET_0, CO3 + stvx c09, OFFSET_1, CO3 + stvx c10, OFFSET_2, CO3 + stvx c11, OFFSET_3, CO3 + stvx c12, OFFSET_4, CO3 + + lvx C1, OFFSET_0, CO4 + + vperm c00, VZERO, c13, PERMRSHIFT4 + vperm c13, c13, c14, PERMRSHIFT4 + vperm c14, c14, c15, PERMRSHIFT4 + vperm c15, c15, c16, PERMRSHIFT4 + vperm c16, c16, VZERO, PERMRSHIFT4 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c13, alpha, c13, C6 + vmaddfp c14, alpha, c14, C7 + vmaddfp c15, alpha, c15, C8 + vmaddfp c16, alpha, c16, C9 + + stvx c00, OFFSET_0, CO4 + stvx c13, OFFSET_1, CO4 + stvx c14, OFFSET_2, CO4 + stvx c15, OFFSET_3, CO4 + stvx c16, OFFSET_4, CO4 + + addi CO1, CO1, 16 * SIZE + addi CO2, CO2, 16 * SIZE + addi CO3, CO3, 16 * SIZE + addi CO4, CO4, 16 * SIZE + + addic. I, I, -1 + bgt+ LL(11) + .align 4 + +LL(20): + andi. I, M, 8 + ble LL(30) + + vxor c01, c01, c01 + LOAD_A a1, OFFSET_0, AO + vxor c02, c02, c02 + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + LOAD_A a3, OFFSET_2, AO + vxor c06, c06, c06 + LOAD_A a4, OFFSET_3, AO + vxor c09, c09, c09 + LOAD_B b1, OFFSET_0, B + vxor c10, c10, c10 + LOAD_B b2, OFFSET_1, B + vxor c13, c13, c13 + vxor c14, c14, c14 + mr BO, B + vspltw bp1, b1, 0 + + srawi. 
r0, K, 1 + mtspr CTR, r0 + ble LL(25) + .align 4 + +LL(22): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + addi AO, AO, 16 * SIZE + vmaddfp c02, a2, bp1, c02 + addi BO, BO, 8 * SIZE + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + LOAD_B b1, OFFSET_0, BO + vmaddfp c10, a2, bp1, c10 + + vmaddfp c13, a1, bp2, c13 + LOAD_A a1, OFFSET_0, AO + vspltw bp1, b2, 0 + vmaddfp c14, a2, bp2, c14 + LOAD_A a2, OFFSET_1, AO + + vmaddfp c01, a3, bp1, c01 + vspltw bp2, b2, 1 + vmaddfp c02, a4, bp1, c02 + + vmaddfp c05, a3, bp2, c05 + vspltw bp1, b2, 2 + vmaddfp c06, a4, bp2, c06 + + vmaddfp c09, a3, bp1, c09 + vspltw bp2, b2, 3 + LOAD_B b2, OFFSET_1, BO + vmaddfp c10, a4, bp1, c10 + + vmaddfp c13, a3, bp2, c13 + LOAD_A a3, OFFSET_2, AO + vmaddfp c14, a4, bp2, c14 + LOAD_A a4, OFFSET_3, AO + vspltw bp1, b1, 0 + bdnz LL(22) + .align 4 + +LL(25): + andi. r0, K, 1 + lvx alpha, OFFSET_0, SP + vxor VZERO, VZERO, VZERO + ble+ LL(28) + .align 4 + +LL(26): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + nop + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + nop + + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c10, a2, bp1, c10 + addi AO, AO, 8 * SIZE + + vmaddfp c13, a1, bp2, c13 + addi BO, BO, 4 * SIZE + vmaddfp c14, a2, bp2, c14 + nop + .align 4 + +LL(28): + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + lvx C3, OFFSET_2, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + lvsr PERMRSHIFT2, 0, CO2 + lvsr PERMRSHIFT3, 0, CO3 + lvsr PERMRSHIFT4, 0, CO4 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c01, alpha, c01, C2 + vmaddfp c02, alpha, c02, C3 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + + lvx C1, OFFSET_0, CO2 + lvx C2, OFFSET_1, CO2 + lvx C3, OFFSET_2, CO2 + + vperm c00, VZERO, c05, PERMRSHIFT2 + vperm c05, c05, c06, PERMRSHIFT2 + vperm c06, c06, VZERO, PERMRSHIFT2 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c05, alpha, c05, C2 + vmaddfp c06, alpha, c06, C3 + + stvx c00, OFFSET_0, CO2 + stvx c05, OFFSET_1, CO2 + stvx c06, OFFSET_2, CO2 + + lvx C1, OFFSET_0, CO3 + lvx C2, OFFSET_1, CO3 + lvx C3, OFFSET_2, CO3 + + vperm c00, VZERO, c09, PERMRSHIFT3 + vperm c09, c09, c10, PERMRSHIFT3 + vperm c10, c10, VZERO, PERMRSHIFT3 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c09, alpha, c09, C2 + vmaddfp c10, alpha, c10, C3 + + stvx c00, OFFSET_0, CO3 + stvx c09, OFFSET_1, CO3 + stvx c10, OFFSET_2, CO3 + + lvx C1, OFFSET_0, CO4 + lvx C2, OFFSET_1, CO4 + lvx C3, OFFSET_2, CO4 + + vperm c00, VZERO, c13, PERMRSHIFT4 + vperm c13, c13, c14, PERMRSHIFT4 + vperm c14, c14, VZERO, PERMRSHIFT4 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c13, alpha, c13, C2 + vmaddfp c14, alpha, c14, C3 + + stvx c00, OFFSET_0, CO4 + stvx c13, OFFSET_1, CO4 + stvx c14, OFFSET_2, CO4 + + addi CO1, CO1, 8 * SIZE + addi CO2, CO2, 8 * SIZE + addi CO3, CO3, 8 * SIZE + addi CO4, CO4, 8 * SIZE + .align 4 + +LL(30): + andi. I, M, 4 + ble LL(40) + + vxor c01, c01, c01 + LOAD_A a1, OFFSET_0, AO + vxor c02, c02, c02 + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + LOAD_B b1, OFFSET_0, B + vxor c06, c06, c06 + LOAD_B b2, OFFSET_1, B + vxor c09, c09, c09 + vxor c10, c10, c10 + vxor c13, c13, c13 + vxor c14, c14, c14 + + vspltw bp1, b1, 0 + mr BO, B + + srawi. 
r0, K, 1 + mtspr CTR, r0 + ble LL(35) + .align 4 + +LL(32): + vmaddfp c01, a1, bp1, c01 + addi AO, AO, 8 * SIZE + vspltw bp2, b1, 1 + vmaddfp c05, a1, bp2, c05 + addi BO, BO, 8 * SIZE + vspltw bp1, b1, 2 + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c13, a1, bp2, c13 + LOAD_A a1, OFFSET_0, AO + vspltw bp1, b2, 0 + LOAD_B b1, OFFSET_0, BO + + vmaddfp c02, a2, bp1, c02 + vspltw bp2, b2, 1 + vmaddfp c06, a2, bp2, c06 + vspltw bp1, b2, 2 + vmaddfp c10, a2, bp1, c10 + vspltw bp2, b2, 3 + LOAD_B b2, OFFSET_1, BO + vmaddfp c14, a2, bp2, c14 + LOAD_A a2, OFFSET_1, AO + + vspltw bp1, b1, 0 + bdnz LL(32) + .align 4 + +LL(35): + andi. r0, K, 1 + lvx alpha, OFFSET_0, SP + vxor VZERO, VZERO, VZERO + ble+ LL(38) + .align 4 + +LL(36): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c13, a1, bp2, c13 + addi AO, AO, 4 * SIZE + addi BO, BO, 4 * SIZE + .align 4 + +LL(38): + vaddfp c01, c01, c02 + vaddfp c05, c05, c06 + vaddfp c09, c09, c10 + vaddfp c13, c13, c14 + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + lvsr PERMRSHIFT2, 0, CO2 + lvsr PERMRSHIFT3, 0, CO3 + lvsr PERMRSHIFT4, 0, CO4 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c01, alpha, c01, C2 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + + lvx C1, OFFSET_0, CO2 + lvx C2, OFFSET_1, CO2 + + vperm c00, VZERO, c05, PERMRSHIFT2 + vperm c05, c05, VZERO, PERMRSHIFT2 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c05, alpha, c05, C2 + + stvx c00, OFFSET_0, CO2 + stvx c05, OFFSET_1, CO2 + + lvx C1, OFFSET_0, CO3 + lvx C2, OFFSET_1, CO3 + + vperm c00, VZERO, c09, PERMRSHIFT3 + vperm c09, c09, VZERO, PERMRSHIFT3 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c09, alpha, c09, C2 + + stvx c00, OFFSET_0, CO3 + stvx c09, OFFSET_1, CO3 + + lvx C1, OFFSET_0, CO4 + lvx C2, OFFSET_1, CO4 + + vperm c00, VZERO, c13, PERMRSHIFT4 + vperm c13, c13, VZERO, PERMRSHIFT4 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c13, alpha, c13, C2 + + stvx c00, OFFSET_0, CO4 + stvx c13, OFFSET_1, CO4 + + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + addi CO3, CO3, 4 * SIZE + addi CO4, CO4, 4 * SIZE + .align 4 + +LL(40): + andi. I, M, 2 + ble LL(50) + + mr BO, B + + LFD f8, 0 * SIZE(AO) + LFD f9, 1 * SIZE(AO) + + LFD f10, 0 * SIZE(B) + LFD f11, 1 * SIZE(B) + LFD f12, 2 * SIZE(B) + LFD f13, 3 * SIZE(B) + + lfs f0, FZERO(SP) + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(45) + .align 4 + +LL(42): + FMADD f0, f8, f10, f0 + FMADD f2, f8, f11, f2 + FMADD f4, f8, f12, f4 + FMADD f6, f8, f13, f6 + + FMADD f1, f9, f10, f1 + FMADD f3, f9, f11, f3 + FMADD f5, f9, f12, f5 + FMADD f7, f9, f13, f7 + + LFD f8, 2 * SIZE(AO) + LFD f9, 3 * SIZE(AO) + + LFD f10, 4 * SIZE(BO) + LFD f11, 5 * SIZE(BO) + LFD f12, 6 * SIZE(BO) + LFD f13, 7 * SIZE(BO) + + FMADD f0, f8, f10, f0 + FMADD f2, f8, f11, f2 + FMADD f4, f8, f12, f4 + FMADD f6, f8, f13, f6 + + FMADD f1, f9, f10, f1 + FMADD f3, f9, f11, f3 + FMADD f5, f9, f12, f5 + FMADD f7, f9, f13, f7 + + LFD f8, 4 * SIZE(AO) + LFD f9, 5 * SIZE(AO) + + LFD f10, 8 * SIZE(BO) + LFD f11, 9 * SIZE(BO) + LFD f12, 10 * SIZE(BO) + LFD f13, 11 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(42) + .align 4 + +LL(45): + andi. 
r0, K, 1 + ble LL(48) + .align 4 + +LL(46): + FMADD f0, f8, f10, f0 + FMADD f2, f8, f11, f2 + FMADD f4, f8, f12, f4 + FMADD f6, f8, f13, f6 + + FMADD f1, f9, f10, f1 + FMADD f3, f9, f11, f3 + FMADD f5, f9, f12, f5 + FMADD f7, f9, f13, f7 + + LFD f8, 2 * SIZE(AO) + LFD f9, 3 * SIZE(AO) + + LFD f10, 4 * SIZE(BO) + LFD f11, 5 * SIZE(BO) + LFD f12, 6 * SIZE(BO) + LFD f13, 7 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 4 * SIZE + .align 4 + +LL(48): + lfs f13, ALPHA(SP) + + LFD f8, 0 * SIZE(CO1) + LFD f9, 1 * SIZE(CO1) + LFD f10, 0 * SIZE(CO2) + LFD f11, 1 * SIZE(CO2) + + FMADD f0, f0, f13, f8 + FMADD f1, f1, f13, f9 + FMADD f2, f2, f13, f10 + FMADD f3, f3, f13, f11 + + LFD f8, 0 * SIZE(CO3) + LFD f9, 1 * SIZE(CO3) + LFD f10, 0 * SIZE(CO4) + LFD f11, 1 * SIZE(CO4) + + FMADD f4, f4, f13, f8 + FMADD f5, f5, f13, f9 + FMADD f6, f6, f13, f10 + FMADD f7, f7, f13, f11 + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + + STFD f4, 0 * SIZE(CO3) + STFD f5, 1 * SIZE(CO3) + STFD f6, 0 * SIZE(CO4) + STFD f7, 1 * SIZE(CO4) + + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + addi CO3, CO3, 2 * SIZE + addi CO4, CO4, 2 * SIZE + .align 4 + +LL(50): + andi. I, M, 1 + ble LL(59) + + mr BO, B + + LFD f8, 0 * SIZE(AO) + LFD f9, 1 * SIZE(AO) + + LFD f10, 0 * SIZE(B) + LFD f11, 1 * SIZE(B) + LFD f12, 2 * SIZE(B) + LFD f13, 3 * SIZE(B) + + lfs f0, FZERO(SP) + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(55) + .align 4 + +LL(52): + FMADD f0, f8, f10, f0 + FMADD f1, f8, f11, f1 + FMADD f2, f8, f12, f2 + FMADD f3, f8, f13, f3 + + LFD f8, 2 * SIZE(AO) + + LFD f10, 4 * SIZE(BO) + LFD f11, 5 * SIZE(BO) + LFD f12, 6 * SIZE(BO) + LFD f13, 7 * SIZE(BO) + + FMADD f0, f9, f10, f0 + FMADD f1, f9, f11, f1 + FMADD f2, f9, f12, f2 + FMADD f3, f9, f13, f3 + + LFD f9, 3 * SIZE(AO) + + LFD f10, 8 * SIZE(BO) + LFD f11, 9 * SIZE(BO) + LFD f12, 10 * SIZE(BO) + LFD f13, 11 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(52) + .align 4 + +LL(55): + andi. r0, K, 1 + ble LL(58) + .align 4 + +LL(56): + FMADD f0, f8, f10, f0 + FMADD f1, f8, f11, f1 + FMADD f2, f8, f12, f2 + FMADD f3, f8, f13, f3 + + LFD f8, 2 * SIZE(AO) + + LFD f10, 4 * SIZE(BO) + LFD f11, 5 * SIZE(BO) + LFD f12, 6 * SIZE(BO) + LFD f13, 7 * SIZE(BO) + + addi AO, AO, 1 * SIZE + addi BO, BO, 4 * SIZE + .align 4 + +LL(58): + lfs f13, ALPHA(SP) + + LFD f8, 0 * SIZE(CO1) + LFD f9, 0 * SIZE(CO2) + LFD f10, 0 * SIZE(CO3) + LFD f11, 0 * SIZE(CO4) + + FMADD f0, f0, f13, f8 + FMADD f1, f1, f13, f9 + FMADD f2, f2, f13, f10 + FMADD f3, f3, f13, f11 + + STFD f0, 0 * SIZE(CO1) + STFD f1, 0 * SIZE(CO2) + STFD f2, 0 * SIZE(CO3) + STFD f3, 0 * SIZE(CO4) + .align 4 + +LL(59): + mr B, BO + + addic. J, J, -1 + bgt LL(01) + .align 4 + +LL(60): + andi. r0, N, 2 + ble LL(120) + + mr CO1, C + add CO2, C, LDC + add C, CO2, LDC + + mr AO, A + srawi. I, M, 4 + ble LL(80) + .align 4 + +LL(71): + vxor c01, c01, c01 + LOAD_B b1, OFFSET_0, B + vxor c02, c02, c02 + vxor c03, c03, c03 + LOAD_A a1, OFFSET_0, AO + vxor c04, c04, c04 + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + LOAD_A a3, OFFSET_2, AO + vxor c06, c06, c06 + LOAD_A a4, OFFSET_3, AO + vxor c07, c07, c07 + vxor c08, c08, c08 + + mr BO, B + dcbtst CO1, PREC + dcbtst CO2, PREC + + vspltw bp1, b1, 0 + + srawi. 
r0, K, 1 + mtspr CTR, r0 + ble LL(75) + .align 4 + +LL(72): + LOAD_A a5, OFFSET_4, AO + LOAD_A a6, OFFSET_5, AO + LOAD_A a7, OFFSET_6, AO + LOAD_A a8, OFFSET_7, AO + + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + vmaddfp c03, a3, bp1, c03 + vmaddfp c04, a4, bp1, c04 + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + vmaddfp c07, a3, bp2, c07 + vmaddfp c08, a4, bp2, c08 + + vmaddfp c01, a5, bp1, c01 + vspltw bp2, b1, 3 + vmaddfp c02, a6, bp1, c02 + vmaddfp c03, a7, bp1, c03 + vmaddfp c04, a8, bp1, c04 + + LOAD_B b1, OFFSET_1, BO + vspltw bp1, b1, 0 + + vmaddfp c05, a5, bp2, c05 + vmaddfp c06, a6, bp2, c06 + vmaddfp c07, a7, bp2, c07 + vmaddfp c08, a8, bp2, c08 + + addi AO, AO, 32 * SIZE + addi BO, BO, 4 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + bdnz LL(72) + .align 4 + +LL(75): + andi. r0, K, 1 + lvx alpha, OFFSET_0, SP + vxor VZERO, VZERO, VZERO + ble+ LL(78) + .align 4 + +LL(76): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + addi AO, AO, 16 * SIZE + vmaddfp c03, a3, bp1, c03 + addi BO, BO, 2 * SIZE + vmaddfp c04, a4, bp1, c04 + nop + + vmaddfp c05, a1, bp2, c05 + vmaddfp c06, a2, bp2, c06 + vmaddfp c07, a3, bp2, c07 + vmaddfp c08, a4, bp2, c08 + .align 4 + +LL(78): + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + lvx C3, OFFSET_2, CO1 + lvx C4, OFFSET_3, CO1 + lvx C5, OFFSET_4, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + lvsr PERMRSHIFT2, 0, CO2 + lvsr PERMRSHIFT3, 0, CO3 + lvsr PERMRSHIFT4, 0, CO4 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, c03, PERMRSHIFT1 + vperm c03, c03, c04, PERMRSHIFT1 + vperm c04, c04, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c01, alpha, c01, C2 + vmaddfp c02, alpha, c02, C3 + vmaddfp c03, alpha, c03, C4 + vmaddfp c04, alpha, c04, C5 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + stvx c03, OFFSET_3, CO1 + stvx c04, OFFSET_4, CO1 + + lvx C1, OFFSET_0, CO2 + lvx C2, OFFSET_1, CO2 + lvx C3, OFFSET_2, CO2 + lvx C4, OFFSET_3, CO2 + lvx C5, OFFSET_4, CO2 + + vperm c00, VZERO, c05, PERMRSHIFT2 + vperm c05, c05, c06, PERMRSHIFT2 + vperm c06, c06, c07, PERMRSHIFT2 + vperm c07, c07, c08, PERMRSHIFT2 + vperm c08, c08, VZERO, PERMRSHIFT2 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c05, alpha, c05, C2 + vmaddfp c06, alpha, c06, C3 + vmaddfp c07, alpha, c07, C4 + vmaddfp c08, alpha, c08, C5 + + stvx c00, OFFSET_0, CO2 + stvx c05, OFFSET_1, CO2 + stvx c06, OFFSET_2, CO2 + stvx c07, OFFSET_3, CO2 + stvx c08, OFFSET_4, CO2 + + addi CO1, CO1, 16 * SIZE + addi CO2, CO2, 16 * SIZE + addic. I, I, -1 + bgt+ LL(71) + .align 4 + +LL(80): + andi. I, M, 8 + ble LL(90) + + vxor c01, c01, c01 + LOAD_B b1, OFFSET_0, B + vxor c02, c02, c02 + vxor c03, c03, c03 + LOAD_A a1, OFFSET_0, AO + vxor c04, c04, c04 + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + LOAD_A a3, OFFSET_2, AO + vxor c06, c06, c06 + LOAD_A a4, OFFSET_3, AO + vxor c07, c07, c07 + vxor c08, c08, c08 + + mr BO, B + + vspltw bp1, b1, 0 + srawi. 
r0, K, 1 + mtspr CTR, r0 + ble LL(85) + .align 4 + +LL(82): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + + vmaddfp c03, a3, bp1, c03 + vspltw bp2, b1, 3 + vmaddfp c04, a4, bp1, c04 + + LOAD_B b1, OFFSET_1, BO + vspltw bp1, b1, 0 + + vmaddfp c07, a3, bp2, c07 + vmaddfp c08, a4, bp2, c08 + + addi AO, AO, 16 * SIZE + addi BO, BO, 4 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + bdnz LL(82) + .align 4 + +LL(85): + andi. r0, K, 1 + lvx alpha, OFFSET_0, SP + vxor VZERO, VZERO, VZERO + ble+ LL(88) + .align 4 + +LL(86): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + addi AO, AO, 8 * SIZE + vmaddfp c05, a1, bp2, c05 + addi BO, BO, 2 * SIZE + vmaddfp c06, a2, bp2, c06 + .align 4 + +LL(88): + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + lvx C3, OFFSET_2, CO1 + + vaddfp c01, c01, c03 + vaddfp c02, c02, c04 + vaddfp c05, c05, c07 + vaddfp c06, c06, c08 + + lvsr PERMRSHIFT1, 0, CO1 + lvsr PERMRSHIFT2, 0, CO2 + lvsr PERMRSHIFT3, 0, CO3 + lvsr PERMRSHIFT4, 0, CO4 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c01, alpha, c01, C2 + vmaddfp c02, alpha, c02, C3 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + + lvx C1, OFFSET_0, CO2 + lvx C2, OFFSET_1, CO2 + lvx C3, OFFSET_2, CO2 + + vperm c00, VZERO, c05, PERMRSHIFT2 + vperm c05, c05, c06, PERMRSHIFT2 + vperm c06, c06, VZERO, PERMRSHIFT2 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c05, alpha, c05, C2 + vmaddfp c06, alpha, c06, C3 + + stvx c00, OFFSET_0, CO2 + stvx c05, OFFSET_1, CO2 + stvx c06, OFFSET_2, CO2 + + addi CO1, CO1, 8 * SIZE + addi CO2, CO2, 8 * SIZE + .align 4 + +LL(90): + andi. I, M, 4 + ble LL(100) + + vxor c01, c01, c01 + LOAD_B b1, OFFSET_0, B + vxor c02, c02, c02 + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + vxor c06, c06, c06 + + mr BO, B + + vspltw bp1, b1, 0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(95) + .align 4 + +LL(92): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + + vmaddfp c02, a2, bp1, c02 + vspltw bp2, b1, 3 + + LOAD_B b1, OFFSET_1, BO + vspltw bp1, b1, 0 + + vmaddfp c06, a2, bp2, c06 + + addi AO, AO, 8 * SIZE + addi BO, BO, 4 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + bdnz LL(92) + .align 4 + +LL(95): + andi. 
r0, K, 1 + lvx alpha, OFFSET_0, SP + vxor VZERO, VZERO, VZERO + ble+ LL(98) + .align 4 + +LL(96): + vspltw bp2, b1, 1 + vmaddfp c01, a1, bp1, c01 + vmaddfp c05, a1, bp2, c05 + addi AO, AO, 4 * SIZE + addi BO, BO, 2 * SIZE + .align 4 + +LL(98): + vaddfp c01, c01, c02 + vaddfp c05, c05, c06 + vaddfp c09, c09, c10 + vaddfp c13, c13, c14 + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + lvsr PERMRSHIFT2, 0, CO2 + lvsr PERMRSHIFT3, 0, CO3 + lvsr PERMRSHIFT4, 0, CO4 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c01, alpha, c01, C2 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + + lvx C1, OFFSET_0, CO2 + lvx C2, OFFSET_1, CO2 + + vperm c00, VZERO, c05, PERMRSHIFT2 + vperm c05, c05, VZERO, PERMRSHIFT2 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c05, alpha, c05, C2 + + stvx c00, OFFSET_0, CO2 + stvx c05, OFFSET_1, CO2 + + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + .align 4 + +LL(100): + andi. I, M, 2 + ble LL(110) + + mr BO, B + + LFD f8, 0 * SIZE(AO) + LFD f9, 1 * SIZE(AO) + + LFD f10, 0 * SIZE(B) + LFD f11, 1 * SIZE(B) + LFD f12, 2 * SIZE(B) + LFD f13, 3 * SIZE(B) + + lfs f0, FZERO(SP) + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(105) + .align 4 + +LL(102): + FMADD f0, f8, f10, f0 + FMADD f1, f9, f10, f1 + FMADD f2, f8, f11, f2 + FMADD f3, f9, f11, f3 + + LFD f8, 2 * SIZE(AO) + LFD f9, 3 * SIZE(AO) + + FMADD f4, f8, f12, f4 + FMADD f5, f9, f12, f5 + FMADD f6, f8, f13, f6 + FMADD f7, f9, f13, f7 + + LFD f8, 4 * SIZE(AO) + LFD f9, 5 * SIZE(AO) + + LFD f10, 4 * SIZE(BO) + LFD f11, 5 * SIZE(BO) + LFD f12, 6 * SIZE(BO) + LFD f13, 7 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 4 * SIZE + bdnz LL(102) + .align 4 + +LL(105): + andi. r0, K, 1 + lfs f13, ALPHA(SP) + ble LL(108) + .align 4 + +LL(106): + FMADD f0, f8, f10, f0 + FMADD f1, f9, f10, f1 + FMADD f2, f8, f11, f2 + FMADD f3, f9, f11, f3 + + LFD f8, 2 * SIZE(AO) + LFD f9, 3 * SIZE(AO) + + LFD f10, 2 * SIZE(BO) + LFD f11, 3 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 2 * SIZE + .align 4 + +LL(108): + LFD f8, 0 * SIZE(CO1) + LFD f9, 1 * SIZE(CO1) + LFD f10, 0 * SIZE(CO2) + LFD f11, 1 * SIZE(CO2) + + FADD f0, f0, f4 + FADD f1, f1, f5 + FADD f2, f2, f6 + FADD f3, f3, f7 + + FMADD f0, f0, f13, f8 + FMADD f1, f1, f13, f9 + FMADD f2, f2, f13, f10 + FMADD f3, f3, f13, f11 + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + .align 4 + +LL(110): + andi. I, M, 1 + ble LL(119) + + mr BO, B + + LFD f8, 0 * SIZE(AO) + LFD f9, 1 * SIZE(AO) + + LFD f10, 0 * SIZE(B) + LFD f11, 1 * SIZE(B) + LFD f12, 2 * SIZE(B) + LFD f13, 3 * SIZE(B) + + lfs f0, FZERO(SP) + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(115) + .align 4 + +LL(112): + FMADD f0, f8, f10, f0 + FMADD f1, f8, f11, f1 + FMADD f2, f9, f12, f2 + FMADD f3, f9, f13, f3 + + LFD f8, 2 * SIZE(AO) + LFD f9, 3 * SIZE(AO) + + LFD f10, 4 * SIZE(BO) + LFD f11, 5 * SIZE(BO) + LFD f12, 6 * SIZE(BO) + LFD f13, 7 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 4 * SIZE + bdnz LL(112) + .align 4 + +LL(115): + andi. 
r0, K, 1 + lfs f13, ALPHA(SP) + ble LL(118) + .align 4 + +LL(116): + FMADD f0, f8, f10, f0 + FMADD f1, f8, f11, f1 + + LFD f8, 1 * SIZE(AO) + + LFD f10, 2 * SIZE(BO) + LFD f11, 3 * SIZE(BO) + + addi AO, AO, 1 * SIZE + addi BO, BO, 2 * SIZE + .align 4 + +LL(118): + LFD f8, 0 * SIZE(CO1) + LFD f9, 0 * SIZE(CO2) + + FADD f0, f0, f2 + FADD f1, f1, f3 + + FMADD f0, f0, f13, f8 + FMADD f1, f1, f13, f9 + + STFD f0, 0 * SIZE(CO1) + STFD f1, 0 * SIZE(CO2) + .align 4 + +LL(119): + mr B, BO + .align 4 + +LL(120): + andi. r0, N, 1 + ble LL(999) + + mr CO1, C + mr AO, A + srawi. I, M, 4 + ble LL(140) + .align 4 + +LL(130): + vxor c01, c01, c01 + vxor c02, c02, c02 + vxor c03, c03, c03 + vxor c04, c04, c04 + + mr BO, B + + dcbtst CO1, PREC + + mr J, K + + andi. r0, B, 15 + ble+ LL(131) + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + LOAD_B b1, OFFSET_0, BO + vspltw bp1, b1, 2 + vspltw bp2, b1, 3 + + addi AO, AO, 16 * SIZE + addi BO, BO, SIZE + + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + vmaddfp c03, a3, bp1, c03 + vmaddfp c04, a4, bp1, c04 + subi J, J, 1 + cmpwi cr0, J, 0 + ble LL(138) + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + + addi AO, AO, 16 * SIZE + addi BO, BO, SIZE + + vmaddfp c01, a1, bp2, c01 + vmaddfp c02, a2, bp2, c02 + vmaddfp c03, a3, bp2, c03 + vmaddfp c04, a4, bp2, c04 + subi J, J, 1 + cmpwi cr0, J, 0 + ble LL(138) + .align 4 + + +LL(131): + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + LOAD_A a5, OFFSET_4, AO + LOAD_A a6, OFFSET_5, AO + LOAD_A a7, OFFSET_6, AO + LOAD_A a8, OFFSET_7, AO + + LOAD_B b1, OFFSET_0, BO + + srawi. r0, J, 2 + mtspr CTR, r0 + ble LL(135) + .align 4 + +LL(133): + vspltw bp1, b1, 0 + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + vmaddfp c03, a3, bp1, c03 + vmaddfp c04, a4, bp1, c04 + + vspltw bp2, b1, 1 + vmaddfp c01, a5, bp2, c01 + vmaddfp c02, a6, bp2, c02 + vmaddfp c03, a7, bp2, c03 + vmaddfp c04, a8, bp2, c04 + + addi AO, AO, 32 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + + vspltw bp1, b1, 2 + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + vmaddfp c03, a3, bp1, c03 + vmaddfp c04, a4, bp1, c04 + + LOAD_A a5, OFFSET_4, AO + LOAD_A a6, OFFSET_5, AO + LOAD_A a7, OFFSET_6, AO + LOAD_A a8, OFFSET_7, AO + + vspltw bp2, b1, 3 + vmaddfp c01, a5, bp2, c01 + vmaddfp c02, a6, bp2, c02 + vmaddfp c03, a7, bp2, c03 + vmaddfp c04, a8, bp2, c04 + + addi AO, AO, 32 * SIZE + addi BO, BO, 4 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + + LOAD_A a5, OFFSET_4, AO + LOAD_A a6, OFFSET_5, AO + LOAD_A a7, OFFSET_6, AO + LOAD_A a8, OFFSET_7, AO + + LOAD_B b1, OFFSET_0, BO + + bdnz LL(133) + .align 4 + +LL(135): + andi. 
r0, J, 3 + ble+ LL(138) + + cmpwi cr0, r0, 3 + bne LL(136) + + vspltw bp1, b1, 0 + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + vmaddfp c03, a3, bp1, c03 + vmaddfp c04, a4, bp1, c04 + + addi AO, AO, 16 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + + vspltw bp2, b1, 1 + vmaddfp c01, a1, bp2, c01 + vmaddfp c02, a2, bp2, c02 + vmaddfp c03, a3, bp2, c03 + vmaddfp c04, a4, bp2, c04 + + addi AO, AO, 16 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + + vspltw bp1, b1, 2 + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + vmaddfp c03, a3, bp1, c03 + vmaddfp c04, a4, bp1, c04 + + addi AO, AO, 16 * SIZE + addi BO, BO, 3 * SIZE + b LL(138) + .align 4 + +LL(136): + cmpwi cr0, r0, 2 + bne LL(137) + + vspltw bp1, b1, 0 + vspltw bp2, b1, 1 + + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + vmaddfp c03, a3, bp1, c03 + vmaddfp c04, a4, bp1, c04 + + LOAD_A a1, OFFSET_4, AO + LOAD_A a2, OFFSET_5, AO + LOAD_A a3, OFFSET_6, AO + LOAD_A a4, OFFSET_7, AO + + vmaddfp c01, a1, bp2, c01 + vmaddfp c02, a2, bp2, c02 + vmaddfp c03, a3, bp2, c03 + vmaddfp c04, a4, bp2, c04 + + addi AO, AO, 32 * SIZE + addi BO, BO, 2 * SIZE + b LL(138) + .align 4 + +LL(137): + cmpwi cr0, r0, 1 + bne LL(138) + + vspltw bp1, b1, 0 + + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + vmaddfp c03, a3, bp1, c03 + vmaddfp c04, a4, bp1, c04 + + addi AO, AO, 16 * SIZE + addi BO, BO, 1 * SIZE + .align 4 + +LL(138): + lvx alpha, OFFSET_0, SP + vxor VZERO, VZERO, VZERO + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + lvx C3, OFFSET_2, CO1 + lvx C4, OFFSET_3, CO1 + lvx C5, OFFSET_4, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, c03, PERMRSHIFT1 + vperm c03, c03, c04, PERMRSHIFT1 + vperm c04, c04, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c01, alpha, c01, C2 + vmaddfp c02, alpha, c02, C3 + vmaddfp c03, alpha, c03, C4 + vmaddfp c04, alpha, c04, C5 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + stvx c03, OFFSET_3, CO1 + stvx c04, OFFSET_4, CO1 + + addi CO1, CO1, 16 * SIZE + addic. I, I, -1 + bgt+ LL(130) + .align 4 + +LL(140): + andi. I, M, 8 + ble LL(150) + + vxor c01, c01, c01 + vxor c02, c02, c02 + + mr BO, B + + mr J, K + + andi. r0, B, 15 + ble+ LL(141) + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_B b1, OFFSET_0, BO + vspltw bp1, b1, 2 + vspltw bp2, b1, 3 + + addi AO, AO, 8 * SIZE + addi BO, BO, SIZE + + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + subi J, J, 1 + cmpwi cr0, J, 0 + ble LL(148) + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + + addi AO, AO, 8 * SIZE + addi BO, BO, SIZE + + vmaddfp c01, a1, bp2, c01 + vmaddfp c02, a2, bp2, c02 + subi J, J, 1 + cmpwi cr0, J, 0 + ble LL(148) + .align 4 + + +LL(141): + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + LOAD_A a5, OFFSET_4, AO + LOAD_A a6, OFFSET_5, AO + LOAD_A a7, OFFSET_6, AO + LOAD_A a8, OFFSET_7, AO + + LOAD_B b1, OFFSET_0, BO + + srawi. 
r0, J, 2 + mtspr CTR, r0 + ble LL(145) + .align 4 + +LL(143): + vspltw bp1, b1, 0 + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + + vspltw bp2, b1, 1 + vmaddfp c01, a3, bp2, c01 + vmaddfp c02, a4, bp2, c02 + + vspltw bp1, b1, 2 + vmaddfp c01, a5, bp1, c01 + vmaddfp c02, a6, bp1, c02 + + vspltw bp2, b1, 3 + vmaddfp c01, a7, bp2, c01 + vmaddfp c02, a8, bp2, c02 + + addi AO, AO, 32 * SIZE + addi BO, BO, 4 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + + LOAD_A a5, OFFSET_4, AO + LOAD_A a6, OFFSET_5, AO + LOAD_A a7, OFFSET_6, AO + LOAD_A a8, OFFSET_7, AO + + LOAD_B b1, OFFSET_0, BO + + bdnz LL(143) + .align 4 + +LL(145): + andi. r0, J, 3 + ble+ LL(148) + + cmpwi cr0, r0, 3 + bne LL(146) + + vspltw bp1, b1, 0 + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + + vspltw bp2, b1, 1 + vmaddfp c01, a3, bp2, c01 + vmaddfp c02, a4, bp2, c02 + + LOAD_A a1, OFFSET_4, AO + LOAD_A a2, OFFSET_5, AO + + vspltw bp1, b1, 2 + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + + + addi AO, AO, 24 * SIZE + addi BO, BO, 3 * SIZE + b LL(148) + .align 4 + +LL(146): + cmpwi cr0, r0, 2 + bne LL(147) + + vspltw bp1, b1, 0 + vspltw bp2, b1, 1 + + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + + vmaddfp c01, a3, bp2, c01 + vmaddfp c02, a4, bp2, c02 + + addi AO, AO, 16 * SIZE + addi BO, BO, 2 * SIZE + b LL(148) + .align 4 + +LL(147): + cmpwi cr0, r0, 1 + bne LL(148) + + vspltw bp1, b1, 0 + + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + + addi AO, AO, 8 * SIZE + addi BO, BO, 1 * SIZE + .align 4 + +LL(148): + lvx alpha, OFFSET_0, SP + vxor VZERO, VZERO, VZERO + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + lvx C3, OFFSET_2, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c01, alpha, c01, C2 + vmaddfp c02, alpha, c02, C3 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + addi CO1, CO1, 8 * SIZE + .align 4 + +LL(150): + andi. I, M, 4 + ble LL(160) + + vxor c01, c01, c01 + + mr BO, B + + mr J, K + + andi. r0, B, 15 + ble+ LL(151) + + LOAD_A a1, OFFSET_0, AO + LOAD_B b1, OFFSET_0, BO + vspltw bp1, b1, 2 + vspltw bp2, b1, 3 + + addi AO, AO, 4 * SIZE + addi BO, BO, SIZE + + vmaddfp c01, a1, bp1, c01 + subi J, J, 1 + cmpwi cr0, J, 0 + ble LL(158) + + LOAD_A a1, OFFSET_0, AO + addi AO, AO, 4 * SIZE + addi BO, BO, SIZE + + vmaddfp c01, a1, bp2, c01 + subi J, J, 1 + cmpwi cr0, J, 0 + ble LL(158) + .align 4 + + +LL(151): + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + LOAD_B b1, OFFSET_0, BO + + srawi. r0, J, 2 + mtspr CTR, r0 + ble LL(155) + .align 4 + +LL(153): + vspltw bp1, b1, 0 + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c01, a2, bp2, c01 + vspltw bp1, b1, 2 + vmaddfp c01, a3, bp1, c01 + vspltw bp2, b1, 3 + vmaddfp c01, a4, bp2, c01 + + addi AO, AO, 16 * SIZE + addi BO, BO, 4 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + + LOAD_B b1, OFFSET_0, BO + + bdnz LL(153) + .align 4 + +LL(155): + andi. 
r0, J, 3 + ble+ LL(158) + + cmpwi cr0, r0, 3 + bne LL(156) + + vspltw bp1, b1, 0 + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c01, a2, bp2, c01 + vspltw bp1, b1, 2 + vmaddfp c01, a3, bp1, c01 + + addi AO, AO, 12 * SIZE + addi BO, BO, 3 * SIZE + b LL(158) + .align 4 + +LL(156): + cmpwi cr0, r0, 2 + bne LL(157) + + vspltw bp1, b1, 0 + vspltw bp2, b1, 1 + + vmaddfp c01, a1, bp1, c01 + vmaddfp c01, a2, bp2, c01 + + addi AO, AO, 8 * SIZE + addi BO, BO, 2 * SIZE + b LL(158) + .align 4 + +LL(157): + cmpwi cr0, r0, 1 + bne LL(158) + + vspltw bp1, b1, 0 + + vmaddfp c01, a1, bp1, c01 + + addi AO, AO, 4 * SIZE + addi BO, BO, 1 * SIZE + .align 4 + +LL(158): + lvx alpha, OFFSET_0, SP + vxor VZERO, VZERO, VZERO + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c01, alpha, c01, C2 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + addi CO1, CO1, 4 * SIZE + .align 4 + +LL(160): + andi. I, M, 2 + ble LL(170) + + mr BO, B + + LFD f8, 0 * SIZE(AO) + LFD f9, 1 * SIZE(AO) + LFD f10, 2 * SIZE(AO) + LFD f11, 3 * SIZE(AO) + + LFD f12, 0 * SIZE(B) + LFD f13, 1 * SIZE(B) + + lfs f0, FZERO(SP) + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(165) + .align 4 + +LL(162): + FMADD f0, f8, f12, f0 + FMADD f1, f9, f12, f1 + FMADD f2, f10, f13, f2 + FMADD f3, f11, f13, f3 + + LFD f8, 4 * SIZE(AO) + LFD f9, 5 * SIZE(AO) + LFD f10, 6 * SIZE(AO) + LFD f11, 7 * SIZE(AO) + + LFD f12, 2 * SIZE(BO) + LFD f13, 3 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 2 * SIZE + bdnz LL(162) + .align 4 + +LL(165): + andi. r0, K, 1 + lfs f13, ALPHA(SP) + ble LL(168) + .align 4 + +LL(166): + FMADD f0, f8, f12, f0 + FMADD f1, f9, f12, f1 + + addi AO, AO, 2 * SIZE + addi BO, BO, 1 * SIZE + .align 4 + +LL(168): + LFD f8, 0 * SIZE(CO1) + LFD f9, 1 * SIZE(CO1) + + FADD f0, f0, f2 + FADD f1, f1, f3 + + FMADD f0, f0, f13, f8 + FMADD f1, f1, f13, f9 + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + + addi CO1, CO1, 2 * SIZE + .align 4 + +LL(170): + andi. I, M, 1 + ble LL(999) + + mr BO, B + + LFD f8, 0 * SIZE(AO) + LFD f9, 1 * SIZE(AO) + + LFD f10, 0 * SIZE(B) + LFD f11, 1 * SIZE(B) + + lfs f0, FZERO(SP) + fmr f1, f0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(175) + .align 4 + +LL(172): + FMADD f0, f8, f10, f0 + FMADD f1, f9, f11, f1 + + LFD f8, 2 * SIZE(AO) + LFD f9, 3 * SIZE(AO) + LFD f10, 2 * SIZE(BO) + LFD f11, 3 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 2 * SIZE + bdnz LL(172) + .align 4 + +LL(175): + andi. 
r0, K, 1 + lfs f13, ALPHA(SP) + ble LL(178) + .align 4 + +LL(176): + FMADD f0, f8, f10, f0 + + addi AO, AO, 1 * SIZE + addi BO, BO, 1 * SIZE + .align 4 + +LL(178): + LFD f8, 0 * SIZE(CO1) + + FADD f0, f0, f1 + + FMADD f0, f0, f13, f8 + + STFD f0, 0 * SIZE(CO1) + .align 4 + +LL(999): + mr SP, STACK + + li r0, 0 * 16 + lvx v20, SP, r0 + li r0, 1 * 16 + lvx v21, SP, r0 + li r0, 2 * 16 + lvx v22, SP, r0 + li r0, 3 * 16 + lvx v23, SP, r0 + li r0, 4 * 16 + lvx v24, SP, r0 + li r0, 5 * 16 + lvx v25, SP, r0 + li r0, 6 * 16 + lvx v26, SP, r0 + li r0, 7 * 16 + lvx v27, SP, r0 + li r0, 8 * 16 + lvx v28, SP, r0 + li r0, 9 * 16 + lvx v29, SP, r0 + li r0, 10 * 16 + lvx v30, SP, r0 + li r0, 11 * 16 + lvx v31, SP, r0 + + mtspr VRsave, VREG + +#ifdef __64BIT__ + ld r31, 192(SP) + ld r30, 200(SP) + ld r29, 208(SP) + ld r28, 216(SP) + ld r27, 224(SP) + ld r26, 232(SP) + ld r25, 240(SP) + ld r24, 248(SP) + ld r23, 256(SP) + ld r22, 264(SP) + ld r21, 272(SP) + ld r20, 280(SP) + ld r19, 288(SP) + ld r18, 296(SP) + ld r17, 304(SP) + ld r16, 312(SP) + ld r15, 320(SP) + ld r14, 328(SP) +#else + lwz r31, 192(SP) + lwz r30, 196(SP) + lwz r29, 200(SP) + lwz r28, 204(SP) + lwz r27, 208(SP) + lwz r26, 212(SP) + lwz r25, 216(SP) + lwz r24, 220(SP) + lwz r23, 224(SP) + lwz r22, 228(SP) + lwz r21, 232(SP) + lwz r20, 236(SP) + lwz r19, 240(SP) + lwz r18, 244(SP) + lwz r17, 248(SP) + lwz r16, 252(SP) + lwz r15, 256(SP) + lwz r14, 260(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/gemm_kernel_altivec_g4.S b/kernel/power/gemm_kernel_altivec_g4.S new file mode 100644 index 0000000000..24d437d19c --- /dev/null +++ b/kernel/power/gemm_kernel_altivec_g4.S @@ -0,0 +1,2647 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 360 +#else +#define STACKSIZE 272 +#endif + +#define ALPHA 0 +#define FZERO 16 + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#endif +#endif + +#define STACK r11 + +#define I r21 +#define J r22 +#define AO r23 +#define BO r24 +#define CO1 r25 +#define CO2 r26 +#define CO3 r27 +#define CO4 r28 + +#define PREA r29 +#define PREB r29 +#define PREC r30 +#define VREG r31 + +#define LOAD_A lvx +#define LOAD_B lvx + +#define OFFSET_0 0 +#define OFFSET_1 r14 +#define OFFSET_2 r15 +#define OFFSET_3 r16 +#define OFFSET_4 r17 +#define OFFSET_5 r18 +#define OFFSET_6 r19 +#define OFFSET_7 r20 + +#define c01 v0 +#define c02 v1 +#define c03 v2 +#define c04 v3 +#define c05 v4 +#define c06 v5 +#define c07 v6 +#define c08 v7 +#define c09 v8 +#define c10 v9 +#define c11 v10 +#define c12 v11 +#define c13 v12 +#define c14 v13 +#define c15 v14 +#define c16 v15 + +#define a1 v16 +#define a2 v17 +#define a3 v18 +#define a4 v19 +#define a5 v20 +#define a6 v21 +#define a7 v22 +#define a8 v23 + +#define b1 v24 +#define b2 v25 +#define bp1 v26 +#define bp2 v27 + +#define C1 v16 +#define C2 v17 +#define C3 v18 +#define C4 v19 +#define C5 v20 +#define C6 v21 +#define C7 v22 +#define C8 v23 +#define C9 v24 + +#define c00 v25 + +#define PERMRSHIFT1 v26 +#define PERMRSHIFT2 v27 +#define PERMRSHIFT3 v28 +#define PERMRSHIFT4 v29 + +#define VZERO v30 +#define alpha v31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + mr STACK, SP + + li r0, 0 * 16 + stvx v20, SP, r0 + li r0, 1 * 16 + stvx v21, SP, r0 + li r0, 2 * 16 + stvx v22, SP, r0 + li r0, 3 * 16 + stvx v23, SP, r0 + li r0, 4 * 16 + stvx v24, SP, r0 + li r0, 5 * 16 + stvx v25, SP, r0 + li r0, 6 * 16 + stvx v26, SP, r0 + li r0, 7 * 16 + stvx v27, SP, r0 + li r0, 8 * 16 + stvx v28, SP, r0 + li r0, 9 * 16 + stvx v29, SP, r0 + li r0, 10 * 16 + stvx v30, SP, r0 + li r0, 11 * 16 + stvx v31, SP, r0 + +#ifdef __64BIT__ + std r31, 192(SP) + std r30, 200(SP) + std r29, 208(SP) + std r28, 216(SP) + std r27, 224(SP) + std r26, 232(SP) + std r25, 240(SP) + std r24, 248(SP) + std r23, 256(SP) + std r22, 264(SP) + std r21, 272(SP) + std r20, 280(SP) + std r19, 288(SP) + std r18, 296(SP) + std r17, 304(SP) + std r16, 312(SP) + std r15, 320(SP) + std r14, 328(SP) +#else + stw r31, 192(SP) + stw r30, 196(SP) + stw r29, 200(SP) + stw r28, 204(SP) + stw r27, 208(SP) + stw r26, 212(SP) + stw r25, 216(SP) + stw r24, 220(SP) + stw r23, 224(SP) + stw r22, 228(SP) + stw r21, 232(SP) + stw r20, 236(SP) + stw r19, 240(SP) + stw r18, 244(SP) + stw r17, 248(SP) + stw r16, 252(SP) + stw r15, 256(SP) + stw r14, 260(SP) +#endif + + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + 
lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif + + li r0, -1 + + mfspr VREG, VRsave + mtspr VRsave, r0 + + addi SP, SP, -128 + li r0, -128 + and SP, SP, r0 + + li OFFSET_1, 4 * SIZE + li OFFSET_2, 8 * SIZE + li OFFSET_3, 12 * SIZE + li OFFSET_4, 16 * SIZE + li OFFSET_5, 20 * SIZE + li OFFSET_6, 24 * SIZE + li OFFSET_7, 28 * SIZE + + stfs f1, ALPHA + 0(SP) + stfs f1, ALPHA + 4(SP) + stfs f1, ALPHA + 8(SP) + stfs f1, ALPHA + 12(SP) + + li r29, 0 + stw r29, FZERO(SP) + + slwi LDC, LDC, BASE_SHIFT + + li PREC, (15 * SIZE) + li PREB, (25 * 8 * SIZE) + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + + srawi. J, N, 2 + ble LL(60) + .align 4 + +LL(01): + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + add C, CO4, LDC + + mr AO, A + srawi. I, M, 4 + ble LL(20) + .align 4 + +LL(11): + vxor c01, c01, c01 + LOAD_B b1, OFFSET_0, B + vxor c02, c02, c02 + LOAD_A a1, OFFSET_0, AO + vxor c03, c03, c03 + LOAD_A a2, OFFSET_1, AO + vxor c04, c04, c04 + LOAD_A a3, OFFSET_2, AO + vxor c05, c05, c05 + LOAD_A a4, OFFSET_3, AO + vxor c06, c06, c06 + LOAD_B b2, OFFSET_2, B + vxor c07, c07, c07 + LOAD_A a5, OFFSET_4, AO + vxor c08, c08, c08 + LOAD_A a6, OFFSET_5, AO + vxor c09, c09, c09 + dcbtst CO1, PREC + vxor c10, c10, c10 + dcbtst CO2, PREC + vxor c11, c11, c11 + dcbtst CO3, PREC + vxor c12, c12, c12 + dcbtst CO4, PREC + vxor c13, c13, c13 + mr BO, B + vxor c14, c14, c14 + srawi. r0, K, 2 + vxor c15, c15, c15 + mtspr CTR, r0 + vxor c16, c16, c16 + vspltw bp1, b1, 0 + ble LL(15) + .align 4 + +LL(12): +/* 1 */ + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + addi AO, AO, 8 * SIZE + vmaddfp c03, a3, bp1, c03 + LOAD_A a7, OFFSET_4, AO + vmaddfp c04, a4, bp1, c04 + LOAD_A a8, OFFSET_5, AO + +/* 2 */ + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + dcbt BO, PREB + vmaddfp c07, a3, bp2, c07 + dcbt AO, PREB + vmaddfp c08, a4, bp2, c08 + addi AO, AO, 8 * SIZE + +/* 3 */ + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c10, a2, bp1, c10 + LOAD_B b1, OFFSET_1, BO + vmaddfp c11, a3, bp1, c11 + dcbt AO, PREB + vmaddfp c12, a4, bp1, c12 + addi AO, AO, 8 * SIZE + +/* 4 */ + vmaddfp c13, a1, bp2, c13 + vspltw bp1, b1, 0 + vmaddfp c14, a2, bp2, c14 + LOAD_A a1, OFFSET_2, AO + vmaddfp c15, a3, bp2, c15 + dcbt AO, PREB + vmaddfp c16, a4, bp2, c16 + addi AO, AO, 8 * SIZE + +/* 5 */ + vmaddfp c01, a5, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a6, bp1, c02 + LOAD_A a2, OFFSET_1, AO + vmaddfp c03, a7, bp1, c03 + LOAD_A a3, OFFSET_2, AO + vmaddfp c04, a8, bp1, c04 + LOAD_A a4, OFFSET_3, AO + +/* 6 */ + vmaddfp c05, a5, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a6, bp2, c06 + nop + vmaddfp c07, a7, bp2, c07 + dcbt AO, PREA + vmaddfp c08, a8, bp2, c08 + addi AO, AO, 8 * SIZE + +/* 7 */ + vmaddfp c09, a5, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c10, a6, bp1, c10 + LOAD_B b1, OFFSET_4, BO + vmaddfp c11, a7, bp1, c11 + nop + vmaddfp c12, a8, bp1, c12 + nop + +/* 8 */ + vmaddfp c13, a5, bp2, c13 + vspltw bp1, b2, 0 + vmaddfp c14, a6, bp2, c14 + LOAD_A a5, OFFSET_2, AO + vmaddfp c15, a7, bp2, c15 + LOAD_A a6, OFFSET_3, AO + vmaddfp c16, a8, bp2, c16 + LOAD_A a7, OFFSET_4, AO + +/* 9 */ + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b2, 1 + vmaddfp c02, a2, bp1, c02 + LOAD_A a8, OFFSET_5, AO + vmaddfp c03, a3, bp1, c03 + addi BO, BO, 8 * SIZE + vmaddfp c04, a4, bp1, c04 + nop + +/* 10 */ + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b2, 2 + vmaddfp c06, a2, bp2, c06 + nop + vmaddfp c07, a3, bp2, c07 + nop + vmaddfp 
c08, a4, bp2, c08 + nop + +/* 11 */ + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b2, 3 + vmaddfp c10, a2, bp1, c10 + LOAD_B b2, OFFSET_1, BO + vmaddfp c11, a3, bp1, c11 + dcbt AO, PREA + vmaddfp c12, a4, bp1, c12 + addi AO, AO, 8 * SIZE + +/* 12 */ + vmaddfp c13, a1, bp2, c13 + vspltw bp1, b2, 0 + vmaddfp c14, a2, bp2, c14 + LOAD_A a1, OFFSET_4, AO + vmaddfp c15, a3, bp2, c15 + LOAD_A a2, OFFSET_5, AO + vmaddfp c16, a4, bp2, c16 + LOAD_A a3, OFFSET_6, AO + +/* 13 */ + vmaddfp c01, a5, bp1, c01 + vspltw bp2, b2, 1 + vmaddfp c02, a6, bp1, c02 + LOAD_A a4, OFFSET_7, AO + vmaddfp c03, a7, bp1, c03 + dcbt AO, PREA + vmaddfp c04, a8, bp1, c04 + addi AO, AO, 8 * SIZE + +/* 14 */ + vmaddfp c05, a5, bp2, c05 + vspltw bp1, b2, 2 + vmaddfp c06, a6, bp2, c06 + nop + vmaddfp c07, a7, bp2, c07 + dcbt AO, PREA + vmaddfp c08, a8, bp2, c08 + addi AO, AO, 8 * SIZE + +/* 15 */ + vmaddfp c09, a5, bp1, c09 + vspltw bp2, b2, 3 + vmaddfp c10, a6, bp1, c10 + LOAD_B b2, OFFSET_4, BO + vmaddfp c11, a7, bp1, c11 + dcbt AO, PREA + vmaddfp c12, a8, bp1, c12 + addi BO, BO, 8 * SIZE + +/* 16 */ + vmaddfp c13, a5, bp2, c13 + vspltw bp1, b1, 0 + vmaddfp c14, a6, bp2, c14 + LOAD_A a5, OFFSET_4, AO + vmaddfp c15, a7, bp2, c15 + LOAD_A a6, OFFSET_5, AO + vmaddfp c16, a8, bp2, c16 + bdnz+ LL(12) + .align 4 + +LL(15): + andi. r0, K, 3 + lvx alpha, OFFSET_0, SP + vxor VZERO, VZERO, VZERO + mtspr CTR, r0 + ble+ LL(18) + .align 4 + +LL(16): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + nop + vmaddfp c03, a3, bp1, c03 + nop + vmaddfp c04, a4, bp1, c04 + nop + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + nop + vmaddfp c07, a3, bp2, c07 + nop + vmaddfp c08, a4, bp2, c08 + nop + + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c10, a2, bp1, c10 + LOAD_B b1, OFFSET_1, BO + vmaddfp c11, a3, bp1, c11 + addi AO, AO, 16 * SIZE + vmaddfp c12, a4, bp1, c12 + addi BO, BO, 4 * SIZE + + vmaddfp c13, a1, bp2, c13 + vspltw bp1, b1, 0 + vmaddfp c14, a2, bp2, c14 + LOAD_A a1, OFFSET_0, AO + vmaddfp c15, a3, bp2, c15 + LOAD_A a2, OFFSET_1, AO + vmaddfp c16, a4, bp2, c16 + LOAD_A a3, OFFSET_2, AO + + LOAD_A a4, OFFSET_3, AO + bdnz+ LL(16) + .align 4 + +LL(18): + lvx C1, OFFSET_0, CO1 + cmpwi cr0, LDC, 32 * SIZE + lvx C2, OFFSET_1, CO1 + lvsr PERMRSHIFT1, 0, CO1 + lvx C3, OFFSET_2, CO1 + lvsr PERMRSHIFT2, 0, CO2 + lvx C4, OFFSET_3, CO1 + lvsr PERMRSHIFT3, 0, CO3 + lvx C5, OFFSET_4, CO1 + lvsr PERMRSHIFT4, 0, CO4 + ble LL(19) + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, c03, PERMRSHIFT1 + vperm c03, c03, c04, PERMRSHIFT1 + vperm c04, c04, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + lvx C1, OFFSET_0, CO2 + vmaddfp c01, alpha, c01, C2 + lvx C6, OFFSET_1, CO2 + vmaddfp c02, alpha, c02, C3 + lvx C7, OFFSET_2, CO2 + vmaddfp c03, alpha, c03, C4 + lvx C8, OFFSET_3, CO2 + vmaddfp c04, alpha, c04, C5 + lvx C9, OFFSET_4, CO2 + + stvx c00, OFFSET_0, CO1 + vperm c00, VZERO, c05, PERMRSHIFT2 + stvx c01, OFFSET_1, CO1 + vperm c05, c05, c06, PERMRSHIFT2 + stvx c02, OFFSET_2, CO1 + vperm c06, c06, c07, PERMRSHIFT2 + stvx c03, OFFSET_3, CO1 + vperm c07, c07, c08, PERMRSHIFT2 + stvx c04, OFFSET_4, CO1 + vperm c08, c08, VZERO, PERMRSHIFT2 + + vmaddfp c00, alpha, c00, C1 + lvx C1, OFFSET_0, CO3 + vmaddfp c05, alpha, c05, C6 + lvx C2, OFFSET_1, CO3 + vmaddfp c06, alpha, c06, C7 + lvx C3, OFFSET_2, CO3 + vmaddfp c07, alpha, c07, C8 + lvx C4, OFFSET_3, CO3 + vmaddfp c08, alpha, c08, C9 + lvx C5, OFFSET_4, CO3 + + stvx c00, OFFSET_0, CO2 + vperm c00, 
VZERO, c09, PERMRSHIFT3 + stvx c05, OFFSET_1, CO2 + vperm c09, c09, c10, PERMRSHIFT3 + stvx c06, OFFSET_2, CO2 + vperm c10, c10, c11, PERMRSHIFT3 + stvx c07, OFFSET_3, CO2 + vperm c11, c11, c12, PERMRSHIFT3 + stvx c08, OFFSET_4, CO2 + vperm c12, c12, VZERO, PERMRSHIFT3 + + vmaddfp c00, alpha, c00, C1 + lvx C9, OFFSET_4, CO4 + vmaddfp c09, alpha, c09, C2 + lvx C1, OFFSET_0, CO4 + vmaddfp c10, alpha, c10, C3 + lvx C6, OFFSET_1, CO4 + vmaddfp c11, alpha, c11, C4 + lvx C7, OFFSET_2, CO4 + vmaddfp c12, alpha, c12, C5 + lvx C8, OFFSET_3, CO4 + + stvx c00, OFFSET_0, CO3 + vperm c00, VZERO, c13, PERMRSHIFT4 + stvx c09, OFFSET_1, CO3 + vperm c13, c13, c14, PERMRSHIFT4 + stvx c10, OFFSET_2, CO3 + vperm c14, c14, c15, PERMRSHIFT4 + stvx c11, OFFSET_3, CO3 + vperm c15, c15, c16, PERMRSHIFT4 + stvx c12, OFFSET_4, CO3 + vperm c16, c16, VZERO, PERMRSHIFT4 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c13, alpha, c13, C6 + vmaddfp c14, alpha, c14, C7 + vmaddfp c15, alpha, c15, C8 + vmaddfp c16, alpha, c16, C9 + + stvx c00, OFFSET_0, CO4 + stvx c13, OFFSET_1, CO4 + stvx c14, OFFSET_2, CO4 + stvx c15, OFFSET_3, CO4 + stvx c16, OFFSET_4, CO4 + + addi CO1, CO1, 16 * SIZE + addi CO2, CO2, 16 * SIZE + addi CO3, CO3, 16 * SIZE + addi CO4, CO4, 16 * SIZE + + addic. I, I, -1 + bgt+ LL(11) + b LL(20) + .align 4 + +LL(19): + lvx C6, OFFSET_1, CO2 + lvx C7, OFFSET_2, CO2 + lvx C8, OFFSET_3, CO2 + lvx C9, OFFSET_4, CO2 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, c03, PERMRSHIFT1 + vperm c03, c03, c04, PERMRSHIFT1 + vperm c04, c04, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c01, alpha, c01, C2 + lvx C2, OFFSET_1, CO3 + vmaddfp c02, alpha, c02, C3 + lvx C3, OFFSET_2, CO3 + vmaddfp c03, alpha, c03, C4 + lvx C4, OFFSET_3, CO3 + vmaddfp c04, alpha, c04, C5 + lvx C5, OFFSET_4, CO3 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + stvx c03, OFFSET_3, CO1 + stvx c04, OFFSET_4, CO1 + + lvx C1, OFFSET_0, CO2 + + vperm c00, VZERO, c05, PERMRSHIFT2 + vperm c05, c05, c06, PERMRSHIFT2 + vperm c06, c06, c07, PERMRSHIFT2 + vperm c07, c07, c08, PERMRSHIFT2 + vperm c08, c08, VZERO, PERMRSHIFT2 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c05, alpha, c05, C6 + lvx C6, OFFSET_1, CO4 + vmaddfp c06, alpha, c06, C7 + lvx C7, OFFSET_2, CO4 + vmaddfp c07, alpha, c07, C8 + lvx C8, OFFSET_3, CO4 + vmaddfp c08, alpha, c08, C9 + lvx C9, OFFSET_4, CO4 + + stvx c00, OFFSET_0, CO2 + stvx c05, OFFSET_1, CO2 + stvx c06, OFFSET_2, CO2 + stvx c07, OFFSET_3, CO2 + stvx c08, OFFSET_4, CO2 + + lvx C1, OFFSET_0, CO3 + + vperm c00, VZERO, c09, PERMRSHIFT3 + vperm c09, c09, c10, PERMRSHIFT3 + vperm c10, c10, c11, PERMRSHIFT3 + vperm c11, c11, c12, PERMRSHIFT3 + vperm c12, c12, VZERO, PERMRSHIFT3 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c09, alpha, c09, C2 + vmaddfp c10, alpha, c10, C3 + vmaddfp c11, alpha, c11, C4 + vmaddfp c12, alpha, c12, C5 + + stvx c00, OFFSET_0, CO3 + stvx c09, OFFSET_1, CO3 + stvx c10, OFFSET_2, CO3 + stvx c11, OFFSET_3, CO3 + stvx c12, OFFSET_4, CO3 + + lvx C1, OFFSET_0, CO4 + + vperm c00, VZERO, c13, PERMRSHIFT4 + vperm c13, c13, c14, PERMRSHIFT4 + vperm c14, c14, c15, PERMRSHIFT4 + vperm c15, c15, c16, PERMRSHIFT4 + vperm c16, c16, VZERO, PERMRSHIFT4 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c13, alpha, c13, C6 + vmaddfp c14, alpha, c14, C7 + vmaddfp c15, alpha, c15, C8 + vmaddfp c16, alpha, c16, C9 + + stvx c00, OFFSET_0, CO4 + stvx c13, OFFSET_1, CO4 + stvx c14, OFFSET_2, CO4 + stvx c15, OFFSET_3, CO4 + stvx c16, OFFSET_4, CO4 + + addi CO1, 
CO1, 16 * SIZE + addi CO2, CO2, 16 * SIZE + addi CO3, CO3, 16 * SIZE + addi CO4, CO4, 16 * SIZE + + addic. I, I, -1 + bgt+ LL(11) + .align 4 + +LL(20): + andi. I, M, 8 + ble LL(30) + + vxor c01, c01, c01 + LOAD_A a1, OFFSET_0, AO + vxor c02, c02, c02 + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + LOAD_A a3, OFFSET_2, AO + vxor c06, c06, c06 + LOAD_A a4, OFFSET_3, AO + vxor c09, c09, c09 + LOAD_B b1, OFFSET_0, B + vxor c10, c10, c10 + LOAD_B b2, OFFSET_1, B + vxor c13, c13, c13 + vxor c14, c14, c14 + mr BO, B + vspltw bp1, b1, 0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(25) + .align 4 + +LL(22): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + addi AO, AO, 16 * SIZE + vmaddfp c02, a2, bp1, c02 + addi BO, BO, 8 * SIZE + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + LOAD_B b1, OFFSET_0, BO + vmaddfp c10, a2, bp1, c10 + + vmaddfp c13, a1, bp2, c13 + LOAD_A a1, OFFSET_0, AO + vspltw bp1, b2, 0 + vmaddfp c14, a2, bp2, c14 + LOAD_A a2, OFFSET_1, AO + + vmaddfp c01, a3, bp1, c01 + vspltw bp2, b2, 1 + vmaddfp c02, a4, bp1, c02 + + vmaddfp c05, a3, bp2, c05 + vspltw bp1, b2, 2 + vmaddfp c06, a4, bp2, c06 + + vmaddfp c09, a3, bp1, c09 + vspltw bp2, b2, 3 + LOAD_B b2, OFFSET_1, BO + vmaddfp c10, a4, bp1, c10 + + vmaddfp c13, a3, bp2, c13 + LOAD_A a3, OFFSET_2, AO + vmaddfp c14, a4, bp2, c14 + LOAD_A a4, OFFSET_3, AO + vspltw bp1, b1, 0 + bdnz LL(22) + .align 4 + +LL(25): + andi. r0, K, 1 + lvx alpha, OFFSET_0, SP + vxor VZERO, VZERO, VZERO + ble+ LL(28) + .align 4 + +LL(26): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + nop + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + nop + + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c10, a2, bp1, c10 + addi AO, AO, 8 * SIZE + + vmaddfp c13, a1, bp2, c13 + addi BO, BO, 4 * SIZE + vmaddfp c14, a2, bp2, c14 + nop + .align 4 + +LL(28): + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + lvx C3, OFFSET_2, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + lvsr PERMRSHIFT2, 0, CO2 + lvsr PERMRSHIFT3, 0, CO3 + lvsr PERMRSHIFT4, 0, CO4 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c01, alpha, c01, C2 + vmaddfp c02, alpha, c02, C3 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + + lvx C1, OFFSET_0, CO2 + lvx C2, OFFSET_1, CO2 + lvx C3, OFFSET_2, CO2 + + vperm c00, VZERO, c05, PERMRSHIFT2 + vperm c05, c05, c06, PERMRSHIFT2 + vperm c06, c06, VZERO, PERMRSHIFT2 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c05, alpha, c05, C2 + vmaddfp c06, alpha, c06, C3 + + stvx c00, OFFSET_0, CO2 + stvx c05, OFFSET_1, CO2 + stvx c06, OFFSET_2, CO2 + + lvx C1, OFFSET_0, CO3 + lvx C2, OFFSET_1, CO3 + lvx C3, OFFSET_2, CO3 + + vperm c00, VZERO, c09, PERMRSHIFT3 + vperm c09, c09, c10, PERMRSHIFT3 + vperm c10, c10, VZERO, PERMRSHIFT3 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c09, alpha, c09, C2 + vmaddfp c10, alpha, c10, C3 + + stvx c00, OFFSET_0, CO3 + stvx c09, OFFSET_1, CO3 + stvx c10, OFFSET_2, CO3 + + lvx C1, OFFSET_0, CO4 + lvx C2, OFFSET_1, CO4 + lvx C3, OFFSET_2, CO4 + + vperm c00, VZERO, c13, PERMRSHIFT4 + vperm c13, c13, c14, PERMRSHIFT4 + vperm c14, c14, VZERO, PERMRSHIFT4 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c13, alpha, c13, C2 + vmaddfp c14, alpha, c14, C3 + + stvx c00, OFFSET_0, CO4 + stvx c13, OFFSET_1, CO4 + stvx c14, OFFSET_2, CO4 + + addi CO1, CO1, 8 * SIZE + addi CO2, CO2, 8 * SIZE + addi 
CO3, CO3, 8 * SIZE + addi CO4, CO4, 8 * SIZE + .align 4 + +LL(30): + andi. I, M, 4 + ble LL(40) + + vxor c01, c01, c01 + LOAD_A a1, OFFSET_0, AO + vxor c02, c02, c02 + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + LOAD_B b1, OFFSET_0, B + vxor c06, c06, c06 + LOAD_B b2, OFFSET_1, B + vxor c09, c09, c09 + vxor c10, c10, c10 + vxor c13, c13, c13 + vxor c14, c14, c14 + + vspltw bp1, b1, 0 + mr BO, B + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(35) + .align 4 + +LL(32): + vmaddfp c01, a1, bp1, c01 + addi AO, AO, 8 * SIZE + vspltw bp2, b1, 1 + vmaddfp c05, a1, bp2, c05 + addi BO, BO, 8 * SIZE + vspltw bp1, b1, 2 + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c13, a1, bp2, c13 + LOAD_A a1, OFFSET_0, AO + vspltw bp1, b2, 0 + LOAD_B b1, OFFSET_0, BO + + vmaddfp c02, a2, bp1, c02 + vspltw bp2, b2, 1 + vmaddfp c06, a2, bp2, c06 + vspltw bp1, b2, 2 + vmaddfp c10, a2, bp1, c10 + vspltw bp2, b2, 3 + LOAD_B b2, OFFSET_1, BO + vmaddfp c14, a2, bp2, c14 + LOAD_A a2, OFFSET_1, AO + + vspltw bp1, b1, 0 + bdnz LL(32) + .align 4 + +LL(35): + andi. r0, K, 1 + lvx alpha, OFFSET_0, SP + vxor VZERO, VZERO, VZERO + ble+ LL(38) + .align 4 + +LL(36): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c13, a1, bp2, c13 + addi AO, AO, 4 * SIZE + addi BO, BO, 4 * SIZE + .align 4 + +LL(38): + vaddfp c01, c01, c02 + vaddfp c05, c05, c06 + vaddfp c09, c09, c10 + vaddfp c13, c13, c14 + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + lvsr PERMRSHIFT2, 0, CO2 + lvsr PERMRSHIFT3, 0, CO3 + lvsr PERMRSHIFT4, 0, CO4 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c01, alpha, c01, C2 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + + lvx C1, OFFSET_0, CO2 + lvx C2, OFFSET_1, CO2 + + vperm c00, VZERO, c05, PERMRSHIFT2 + vperm c05, c05, VZERO, PERMRSHIFT2 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c05, alpha, c05, C2 + + stvx c00, OFFSET_0, CO2 + stvx c05, OFFSET_1, CO2 + + lvx C1, OFFSET_0, CO3 + lvx C2, OFFSET_1, CO3 + + vperm c00, VZERO, c09, PERMRSHIFT3 + vperm c09, c09, VZERO, PERMRSHIFT3 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c09, alpha, c09, C2 + + stvx c00, OFFSET_0, CO3 + stvx c09, OFFSET_1, CO3 + + lvx C1, OFFSET_0, CO4 + lvx C2, OFFSET_1, CO4 + + vperm c00, VZERO, c13, PERMRSHIFT4 + vperm c13, c13, VZERO, PERMRSHIFT4 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c13, alpha, c13, C2 + + stvx c00, OFFSET_0, CO4 + stvx c13, OFFSET_1, CO4 + + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + addi CO3, CO3, 4 * SIZE + addi CO4, CO4, 4 * SIZE + .align 4 + +LL(40): + andi. I, M, 2 + ble LL(50) + + mr BO, B + + LFD f8, 0 * SIZE(AO) + LFD f9, 1 * SIZE(AO) + + LFD f10, 0 * SIZE(B) + LFD f11, 1 * SIZE(B) + LFD f12, 2 * SIZE(B) + LFD f13, 3 * SIZE(B) + + lfs f0, FZERO(SP) + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. 
r0, K, 1 + mtspr CTR, r0 + ble LL(45) + .align 4 + +LL(42): + FMADD f0, f8, f10, f0 + FMADD f2, f8, f11, f2 + FMADD f4, f8, f12, f4 + FMADD f6, f8, f13, f6 + + FMADD f1, f9, f10, f1 + FMADD f3, f9, f11, f3 + FMADD f5, f9, f12, f5 + FMADD f7, f9, f13, f7 + + LFD f8, 2 * SIZE(AO) + LFD f9, 3 * SIZE(AO) + + LFD f10, 4 * SIZE(BO) + LFD f11, 5 * SIZE(BO) + LFD f12, 6 * SIZE(BO) + LFD f13, 7 * SIZE(BO) + + FMADD f0, f8, f10, f0 + FMADD f2, f8, f11, f2 + FMADD f4, f8, f12, f4 + FMADD f6, f8, f13, f6 + + FMADD f1, f9, f10, f1 + FMADD f3, f9, f11, f3 + FMADD f5, f9, f12, f5 + FMADD f7, f9, f13, f7 + + LFD f8, 4 * SIZE(AO) + LFD f9, 5 * SIZE(AO) + + LFD f10, 8 * SIZE(BO) + LFD f11, 9 * SIZE(BO) + LFD f12, 10 * SIZE(BO) + LFD f13, 11 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(42) + .align 4 + +LL(45): + andi. r0, K, 1 + ble LL(48) + .align 4 + +LL(46): + FMADD f0, f8, f10, f0 + FMADD f2, f8, f11, f2 + FMADD f4, f8, f12, f4 + FMADD f6, f8, f13, f6 + + FMADD f1, f9, f10, f1 + FMADD f3, f9, f11, f3 + FMADD f5, f9, f12, f5 + FMADD f7, f9, f13, f7 + + LFD f8, 2 * SIZE(AO) + LFD f9, 3 * SIZE(AO) + + LFD f10, 4 * SIZE(BO) + LFD f11, 5 * SIZE(BO) + LFD f12, 6 * SIZE(BO) + LFD f13, 7 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 4 * SIZE + .align 4 + +LL(48): + lfs f13, ALPHA(SP) + + LFD f8, 0 * SIZE(CO1) + LFD f9, 1 * SIZE(CO1) + LFD f10, 0 * SIZE(CO2) + LFD f11, 1 * SIZE(CO2) + + FMADD f0, f0, f13, f8 + FMADD f1, f1, f13, f9 + FMADD f2, f2, f13, f10 + FMADD f3, f3, f13, f11 + + LFD f8, 0 * SIZE(CO3) + LFD f9, 1 * SIZE(CO3) + LFD f10, 0 * SIZE(CO4) + LFD f11, 1 * SIZE(CO4) + + FMADD f4, f4, f13, f8 + FMADD f5, f5, f13, f9 + FMADD f6, f6, f13, f10 + FMADD f7, f7, f13, f11 + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + + STFD f4, 0 * SIZE(CO3) + STFD f5, 1 * SIZE(CO3) + STFD f6, 0 * SIZE(CO4) + STFD f7, 1 * SIZE(CO4) + + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + addi CO3, CO3, 2 * SIZE + addi CO4, CO4, 2 * SIZE + .align 4 + +LL(50): + andi. I, M, 1 + ble LL(59) + + mr BO, B + + LFD f8, 0 * SIZE(AO) + LFD f9, 1 * SIZE(AO) + + LFD f10, 0 * SIZE(B) + LFD f11, 1 * SIZE(B) + LFD f12, 2 * SIZE(B) + LFD f13, 3 * SIZE(B) + + lfs f0, FZERO(SP) + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(55) + .align 4 + +LL(52): + FMADD f0, f8, f10, f0 + FMADD f1, f8, f11, f1 + FMADD f2, f8, f12, f2 + FMADD f3, f8, f13, f3 + + LFD f8, 2 * SIZE(AO) + + LFD f10, 4 * SIZE(BO) + LFD f11, 5 * SIZE(BO) + LFD f12, 6 * SIZE(BO) + LFD f13, 7 * SIZE(BO) + + FMADD f0, f9, f10, f0 + FMADD f1, f9, f11, f1 + FMADD f2, f9, f12, f2 + FMADD f3, f9, f13, f3 + + LFD f9, 3 * SIZE(AO) + + LFD f10, 8 * SIZE(BO) + LFD f11, 9 * SIZE(BO) + LFD f12, 10 * SIZE(BO) + LFD f13, 11 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(52) + .align 4 + +LL(55): + andi. 
r0, K, 1 + ble LL(58) + .align 4 + +LL(56): + FMADD f0, f8, f10, f0 + FMADD f1, f8, f11, f1 + FMADD f2, f8, f12, f2 + FMADD f3, f8, f13, f3 + + LFD f8, 2 * SIZE(AO) + + LFD f10, 4 * SIZE(BO) + LFD f11, 5 * SIZE(BO) + LFD f12, 6 * SIZE(BO) + LFD f13, 7 * SIZE(BO) + + addi AO, AO, 1 * SIZE + addi BO, BO, 4 * SIZE + .align 4 + +LL(58): + lfs f13, ALPHA(SP) + + LFD f8, 0 * SIZE(CO1) + LFD f9, 0 * SIZE(CO2) + LFD f10, 0 * SIZE(CO3) + LFD f11, 0 * SIZE(CO4) + + FMADD f0, f0, f13, f8 + FMADD f1, f1, f13, f9 + FMADD f2, f2, f13, f10 + FMADD f3, f3, f13, f11 + + STFD f0, 0 * SIZE(CO1) + STFD f1, 0 * SIZE(CO2) + STFD f2, 0 * SIZE(CO3) + STFD f3, 0 * SIZE(CO4) + .align 4 + +LL(59): + mr B, BO + + addic. J, J, -1 + bgt LL(01) + .align 4 + +LL(60): + andi. r0, N, 2 + ble LL(120) + + mr CO1, C + add CO2, C, LDC + add C, CO2, LDC + + mr AO, A + srawi. I, M, 4 + ble LL(80) + .align 4 + +LL(71): + vxor c01, c01, c01 + LOAD_B b1, OFFSET_0, B + vxor c02, c02, c02 + vxor c03, c03, c03 + LOAD_A a1, OFFSET_0, AO + vxor c04, c04, c04 + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + LOAD_A a3, OFFSET_2, AO + vxor c06, c06, c06 + LOAD_A a4, OFFSET_3, AO + vxor c07, c07, c07 + vxor c08, c08, c08 + + mr BO, B + dcbtst CO1, PREC + dcbtst CO2, PREC + + vspltw bp1, b1, 0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(75) + .align 4 + +LL(72): + LOAD_A a5, OFFSET_4, AO + LOAD_A a6, OFFSET_5, AO + LOAD_A a7, OFFSET_6, AO + LOAD_A a8, OFFSET_7, AO + + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + vmaddfp c03, a3, bp1, c03 + vmaddfp c04, a4, bp1, c04 + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + vmaddfp c07, a3, bp2, c07 + vmaddfp c08, a4, bp2, c08 + + vmaddfp c01, a5, bp1, c01 + vspltw bp2, b1, 3 + vmaddfp c02, a6, bp1, c02 + vmaddfp c03, a7, bp1, c03 + vmaddfp c04, a8, bp1, c04 + + LOAD_B b1, OFFSET_1, BO + vspltw bp1, b1, 0 + + vmaddfp c05, a5, bp2, c05 + vmaddfp c06, a6, bp2, c06 + vmaddfp c07, a7, bp2, c07 + vmaddfp c08, a8, bp2, c08 + + addi AO, AO, 32 * SIZE + addi BO, BO, 4 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + bdnz LL(72) + .align 4 + +LL(75): + andi. 
r0, K, 1 + lvx alpha, OFFSET_0, SP + vxor VZERO, VZERO, VZERO + ble+ LL(78) + .align 4 + +LL(76): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + addi AO, AO, 16 * SIZE + vmaddfp c03, a3, bp1, c03 + addi BO, BO, 2 * SIZE + vmaddfp c04, a4, bp1, c04 + nop + + vmaddfp c05, a1, bp2, c05 + vmaddfp c06, a2, bp2, c06 + vmaddfp c07, a3, bp2, c07 + vmaddfp c08, a4, bp2, c08 + .align 4 + +LL(78): + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + lvx C3, OFFSET_2, CO1 + lvx C4, OFFSET_3, CO1 + lvx C5, OFFSET_4, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + lvsr PERMRSHIFT2, 0, CO2 + lvsr PERMRSHIFT3, 0, CO3 + lvsr PERMRSHIFT4, 0, CO4 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, c03, PERMRSHIFT1 + vperm c03, c03, c04, PERMRSHIFT1 + vperm c04, c04, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c01, alpha, c01, C2 + vmaddfp c02, alpha, c02, C3 + vmaddfp c03, alpha, c03, C4 + vmaddfp c04, alpha, c04, C5 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + stvx c03, OFFSET_3, CO1 + stvx c04, OFFSET_4, CO1 + + lvx C1, OFFSET_0, CO2 + lvx C2, OFFSET_1, CO2 + lvx C3, OFFSET_2, CO2 + lvx C4, OFFSET_3, CO2 + lvx C5, OFFSET_4, CO2 + + vperm c00, VZERO, c05, PERMRSHIFT2 + vperm c05, c05, c06, PERMRSHIFT2 + vperm c06, c06, c07, PERMRSHIFT2 + vperm c07, c07, c08, PERMRSHIFT2 + vperm c08, c08, VZERO, PERMRSHIFT2 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c05, alpha, c05, C2 + vmaddfp c06, alpha, c06, C3 + vmaddfp c07, alpha, c07, C4 + vmaddfp c08, alpha, c08, C5 + + stvx c00, OFFSET_0, CO2 + stvx c05, OFFSET_1, CO2 + stvx c06, OFFSET_2, CO2 + stvx c07, OFFSET_3, CO2 + stvx c08, OFFSET_4, CO2 + + addi CO1, CO1, 16 * SIZE + addi CO2, CO2, 16 * SIZE + addic. I, I, -1 + bgt+ LL(71) + .align 4 + +LL(80): + andi. I, M, 8 + ble LL(90) + + vxor c01, c01, c01 + LOAD_B b1, OFFSET_0, B + vxor c02, c02, c02 + vxor c03, c03, c03 + LOAD_A a1, OFFSET_0, AO + vxor c04, c04, c04 + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + LOAD_A a3, OFFSET_2, AO + vxor c06, c06, c06 + LOAD_A a4, OFFSET_3, AO + vxor c07, c07, c07 + vxor c08, c08, c08 + + mr BO, B + + vspltw bp1, b1, 0 + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(85) + .align 4 + +LL(82): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + + vmaddfp c03, a3, bp1, c03 + vspltw bp2, b1, 3 + vmaddfp c04, a4, bp1, c04 + + LOAD_B b1, OFFSET_1, BO + vspltw bp1, b1, 0 + + vmaddfp c07, a3, bp2, c07 + vmaddfp c08, a4, bp2, c08 + + addi AO, AO, 16 * SIZE + addi BO, BO, 4 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + bdnz LL(82) + .align 4 + +LL(85): + andi. 
r0, K, 1 + lvx alpha, OFFSET_0, SP + vxor VZERO, VZERO, VZERO + ble+ LL(88) + .align 4 + +LL(86): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + addi AO, AO, 8 * SIZE + vmaddfp c05, a1, bp2, c05 + addi BO, BO, 2 * SIZE + vmaddfp c06, a2, bp2, c06 + .align 4 + +LL(88): + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + lvx C3, OFFSET_2, CO1 + + vaddfp c01, c01, c03 + vaddfp c02, c02, c04 + vaddfp c05, c05, c07 + vaddfp c06, c06, c08 + + lvsr PERMRSHIFT1, 0, CO1 + lvsr PERMRSHIFT2, 0, CO2 + lvsr PERMRSHIFT3, 0, CO3 + lvsr PERMRSHIFT4, 0, CO4 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c01, alpha, c01, C2 + vmaddfp c02, alpha, c02, C3 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + + lvx C1, OFFSET_0, CO2 + lvx C2, OFFSET_1, CO2 + lvx C3, OFFSET_2, CO2 + + vperm c00, VZERO, c05, PERMRSHIFT2 + vperm c05, c05, c06, PERMRSHIFT2 + vperm c06, c06, VZERO, PERMRSHIFT2 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c05, alpha, c05, C2 + vmaddfp c06, alpha, c06, C3 + + stvx c00, OFFSET_0, CO2 + stvx c05, OFFSET_1, CO2 + stvx c06, OFFSET_2, CO2 + + addi CO1, CO1, 8 * SIZE + addi CO2, CO2, 8 * SIZE + .align 4 + +LL(90): + andi. I, M, 4 + ble LL(100) + + vxor c01, c01, c01 + LOAD_B b1, OFFSET_0, B + vxor c02, c02, c02 + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + vxor c06, c06, c06 + + mr BO, B + + vspltw bp1, b1, 0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(95) + .align 4 + +LL(92): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + + vmaddfp c02, a2, bp1, c02 + vspltw bp2, b1, 3 + + LOAD_B b1, OFFSET_1, BO + vspltw bp1, b1, 0 + + vmaddfp c06, a2, bp2, c06 + + addi AO, AO, 8 * SIZE + addi BO, BO, 4 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + bdnz LL(92) + .align 4 + +LL(95): + andi. r0, K, 1 + lvx alpha, OFFSET_0, SP + vxor VZERO, VZERO, VZERO + ble+ LL(98) + .align 4 + +LL(96): + vspltw bp2, b1, 1 + vmaddfp c01, a1, bp1, c01 + vmaddfp c05, a1, bp2, c05 + addi AO, AO, 4 * SIZE + addi BO, BO, 2 * SIZE + .align 4 + +LL(98): + vaddfp c01, c01, c02 + vaddfp c05, c05, c06 + vaddfp c09, c09, c10 + vaddfp c13, c13, c14 + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + lvsr PERMRSHIFT2, 0, CO2 + lvsr PERMRSHIFT3, 0, CO3 + lvsr PERMRSHIFT4, 0, CO4 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c01, alpha, c01, C2 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + + lvx C1, OFFSET_0, CO2 + lvx C2, OFFSET_1, CO2 + + vperm c00, VZERO, c05, PERMRSHIFT2 + vperm c05, c05, VZERO, PERMRSHIFT2 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c05, alpha, c05, C2 + + stvx c00, OFFSET_0, CO2 + stvx c05, OFFSET_1, CO2 + + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + .align 4 + +LL(100): + andi. I, M, 2 + ble LL(110) + + mr BO, B + + LFD f8, 0 * SIZE(AO) + LFD f9, 1 * SIZE(AO) + + LFD f10, 0 * SIZE(B) + LFD f11, 1 * SIZE(B) + LFD f12, 2 * SIZE(B) + LFD f13, 3 * SIZE(B) + + lfs f0, FZERO(SP) + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. 
r0, K, 1 + mtspr CTR, r0 + ble LL(105) + .align 4 + +LL(102): + FMADD f0, f8, f10, f0 + FMADD f1, f9, f10, f1 + FMADD f2, f8, f11, f2 + FMADD f3, f9, f11, f3 + + LFD f8, 2 * SIZE(AO) + LFD f9, 3 * SIZE(AO) + + FMADD f4, f8, f12, f4 + FMADD f5, f9, f12, f5 + FMADD f6, f8, f13, f6 + FMADD f7, f9, f13, f7 + + LFD f8, 4 * SIZE(AO) + LFD f9, 5 * SIZE(AO) + + LFD f10, 4 * SIZE(BO) + LFD f11, 5 * SIZE(BO) + LFD f12, 6 * SIZE(BO) + LFD f13, 7 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 4 * SIZE + bdnz LL(102) + .align 4 + +LL(105): + andi. r0, K, 1 + lfs f13, ALPHA(SP) + ble LL(108) + .align 4 + +LL(106): + FMADD f0, f8, f10, f0 + FMADD f1, f9, f10, f1 + FMADD f2, f8, f11, f2 + FMADD f3, f9, f11, f3 + + LFD f8, 2 * SIZE(AO) + LFD f9, 3 * SIZE(AO) + + LFD f10, 2 * SIZE(BO) + LFD f11, 3 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 2 * SIZE + .align 4 + +LL(108): + LFD f8, 0 * SIZE(CO1) + LFD f9, 1 * SIZE(CO1) + LFD f10, 0 * SIZE(CO2) + LFD f11, 1 * SIZE(CO2) + + FADD f0, f0, f4 + FADD f1, f1, f5 + FADD f2, f2, f6 + FADD f3, f3, f7 + + FMADD f0, f0, f13, f8 + FMADD f1, f1, f13, f9 + FMADD f2, f2, f13, f10 + FMADD f3, f3, f13, f11 + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + .align 4 + +LL(110): + andi. I, M, 1 + ble LL(119) + + mr BO, B + + LFD f8, 0 * SIZE(AO) + LFD f9, 1 * SIZE(AO) + + LFD f10, 0 * SIZE(B) + LFD f11, 1 * SIZE(B) + LFD f12, 2 * SIZE(B) + LFD f13, 3 * SIZE(B) + + lfs f0, FZERO(SP) + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(115) + .align 4 + +LL(112): + FMADD f0, f8, f10, f0 + FMADD f1, f8, f11, f1 + FMADD f2, f9, f12, f2 + FMADD f3, f9, f13, f3 + + LFD f8, 2 * SIZE(AO) + LFD f9, 3 * SIZE(AO) + + LFD f10, 4 * SIZE(BO) + LFD f11, 5 * SIZE(BO) + LFD f12, 6 * SIZE(BO) + LFD f13, 7 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 4 * SIZE + bdnz LL(112) + .align 4 + +LL(115): + andi. r0, K, 1 + lfs f13, ALPHA(SP) + ble LL(118) + .align 4 + +LL(116): + FMADD f0, f8, f10, f0 + FMADD f1, f8, f11, f1 + + LFD f8, 1 * SIZE(AO) + + LFD f10, 2 * SIZE(BO) + LFD f11, 3 * SIZE(BO) + + addi AO, AO, 1 * SIZE + addi BO, BO, 2 * SIZE + .align 4 + +LL(118): + LFD f8, 0 * SIZE(CO1) + LFD f9, 0 * SIZE(CO2) + + FADD f0, f0, f2 + FADD f1, f1, f3 + + FMADD f0, f0, f13, f8 + FMADD f1, f1, f13, f9 + + STFD f0, 0 * SIZE(CO1) + STFD f1, 0 * SIZE(CO2) + .align 4 + +LL(119): + mr B, BO + .align 4 + +LL(120): + andi. r0, N, 1 + ble LL(999) + + mr CO1, C + mr AO, A + srawi. I, M, 4 + ble LL(140) + .align 4 + +LL(130): + vxor c01, c01, c01 + vxor c02, c02, c02 + vxor c03, c03, c03 + vxor c04, c04, c04 + + mr BO, B + + dcbtst CO1, PREC + + mr J, K + + andi. 
r0, B, 15 + ble+ LL(131) + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + LOAD_B b1, OFFSET_0, BO + vspltw bp1, b1, 2 + vspltw bp2, b1, 3 + + addi AO, AO, 16 * SIZE + addi BO, BO, SIZE + + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + vmaddfp c03, a3, bp1, c03 + vmaddfp c04, a4, bp1, c04 + subi J, J, 1 + cmpwi cr0, J, 0 + ble LL(138) + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + + addi AO, AO, 16 * SIZE + addi BO, BO, SIZE + + vmaddfp c01, a1, bp2, c01 + vmaddfp c02, a2, bp2, c02 + vmaddfp c03, a3, bp2, c03 + vmaddfp c04, a4, bp2, c04 + subi J, J, 1 + cmpwi cr0, J, 0 + ble LL(138) + .align 4 + + +LL(131): + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + LOAD_A a5, OFFSET_4, AO + LOAD_A a6, OFFSET_5, AO + LOAD_A a7, OFFSET_6, AO + LOAD_A a8, OFFSET_7, AO + + LOAD_B b1, OFFSET_0, BO + + srawi. r0, J, 2 + mtspr CTR, r0 + ble LL(135) + .align 4 + +LL(133): + vspltw bp1, b1, 0 + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + vmaddfp c03, a3, bp1, c03 + vmaddfp c04, a4, bp1, c04 + + vspltw bp2, b1, 1 + vmaddfp c01, a5, bp2, c01 + vmaddfp c02, a6, bp2, c02 + vmaddfp c03, a7, bp2, c03 + vmaddfp c04, a8, bp2, c04 + + addi AO, AO, 32 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + + vspltw bp1, b1, 2 + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + vmaddfp c03, a3, bp1, c03 + vmaddfp c04, a4, bp1, c04 + + LOAD_A a5, OFFSET_4, AO + LOAD_A a6, OFFSET_5, AO + LOAD_A a7, OFFSET_6, AO + LOAD_A a8, OFFSET_7, AO + + vspltw bp2, b1, 3 + vmaddfp c01, a5, bp2, c01 + vmaddfp c02, a6, bp2, c02 + vmaddfp c03, a7, bp2, c03 + vmaddfp c04, a8, bp2, c04 + + addi AO, AO, 32 * SIZE + addi BO, BO, 4 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + + LOAD_A a5, OFFSET_4, AO + LOAD_A a6, OFFSET_5, AO + LOAD_A a7, OFFSET_6, AO + LOAD_A a8, OFFSET_7, AO + + LOAD_B b1, OFFSET_0, BO + + bdnz LL(133) + .align 4 + +LL(135): + andi. 
r0, J, 3 + ble+ LL(138) + + cmpwi cr0, r0, 3 + bne LL(136) + + vspltw bp1, b1, 0 + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + vmaddfp c03, a3, bp1, c03 + vmaddfp c04, a4, bp1, c04 + + addi AO, AO, 16 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + + vspltw bp2, b1, 1 + vmaddfp c01, a1, bp2, c01 + vmaddfp c02, a2, bp2, c02 + vmaddfp c03, a3, bp2, c03 + vmaddfp c04, a4, bp2, c04 + + addi AO, AO, 16 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + + vspltw bp1, b1, 2 + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + vmaddfp c03, a3, bp1, c03 + vmaddfp c04, a4, bp1, c04 + + addi AO, AO, 16 * SIZE + addi BO, BO, 3 * SIZE + b LL(138) + .align 4 + +LL(136): + cmpwi cr0, r0, 2 + bne LL(137) + + vspltw bp1, b1, 0 + vspltw bp2, b1, 1 + + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + vmaddfp c03, a3, bp1, c03 + vmaddfp c04, a4, bp1, c04 + + LOAD_A a1, OFFSET_4, AO + LOAD_A a2, OFFSET_5, AO + LOAD_A a3, OFFSET_6, AO + LOAD_A a4, OFFSET_7, AO + + vmaddfp c01, a1, bp2, c01 + vmaddfp c02, a2, bp2, c02 + vmaddfp c03, a3, bp2, c03 + vmaddfp c04, a4, bp2, c04 + + addi AO, AO, 32 * SIZE + addi BO, BO, 2 * SIZE + b LL(138) + .align 4 + +LL(137): + cmpwi cr0, r0, 1 + bne LL(138) + + vspltw bp1, b1, 0 + + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + vmaddfp c03, a3, bp1, c03 + vmaddfp c04, a4, bp1, c04 + + addi AO, AO, 16 * SIZE + addi BO, BO, 1 * SIZE + .align 4 + +LL(138): + lvx alpha, OFFSET_0, SP + vxor VZERO, VZERO, VZERO + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + lvx C3, OFFSET_2, CO1 + lvx C4, OFFSET_3, CO1 + lvx C5, OFFSET_4, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, c03, PERMRSHIFT1 + vperm c03, c03, c04, PERMRSHIFT1 + vperm c04, c04, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c01, alpha, c01, C2 + vmaddfp c02, alpha, c02, C3 + vmaddfp c03, alpha, c03, C4 + vmaddfp c04, alpha, c04, C5 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + stvx c03, OFFSET_3, CO1 + stvx c04, OFFSET_4, CO1 + + addi CO1, CO1, 16 * SIZE + addic. I, I, -1 + bgt+ LL(130) + .align 4 + +LL(140): + andi. I, M, 8 + ble LL(150) + + vxor c01, c01, c01 + vxor c02, c02, c02 + + mr BO, B + + mr J, K + + andi. r0, B, 15 + ble+ LL(141) + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_B b1, OFFSET_0, BO + vspltw bp1, b1, 2 + vspltw bp2, b1, 3 + + addi AO, AO, 8 * SIZE + addi BO, BO, SIZE + + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + subi J, J, 1 + cmpwi cr0, J, 0 + ble LL(148) + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + + addi AO, AO, 8 * SIZE + addi BO, BO, SIZE + + vmaddfp c01, a1, bp2, c01 + vmaddfp c02, a2, bp2, c02 + subi J, J, 1 + cmpwi cr0, J, 0 + ble LL(148) + .align 4 + + +LL(141): + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + LOAD_A a5, OFFSET_4, AO + LOAD_A a6, OFFSET_5, AO + LOAD_A a7, OFFSET_6, AO + LOAD_A a8, OFFSET_7, AO + + LOAD_B b1, OFFSET_0, BO + + srawi. 
r0, J, 2 + mtspr CTR, r0 + ble LL(145) + .align 4 + +LL(143): + vspltw bp1, b1, 0 + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + + vspltw bp2, b1, 1 + vmaddfp c01, a3, bp2, c01 + vmaddfp c02, a4, bp2, c02 + + vspltw bp1, b1, 2 + vmaddfp c01, a5, bp1, c01 + vmaddfp c02, a6, bp1, c02 + + vspltw bp2, b1, 3 + vmaddfp c01, a7, bp2, c01 + vmaddfp c02, a8, bp2, c02 + + addi AO, AO, 32 * SIZE + addi BO, BO, 4 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + + LOAD_A a5, OFFSET_4, AO + LOAD_A a6, OFFSET_5, AO + LOAD_A a7, OFFSET_6, AO + LOAD_A a8, OFFSET_7, AO + + LOAD_B b1, OFFSET_0, BO + + bdnz LL(143) + .align 4 + +LL(145): + andi. r0, J, 3 + ble+ LL(148) + + cmpwi cr0, r0, 3 + bne LL(146) + + vspltw bp1, b1, 0 + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + + vspltw bp2, b1, 1 + vmaddfp c01, a3, bp2, c01 + vmaddfp c02, a4, bp2, c02 + + LOAD_A a1, OFFSET_4, AO + LOAD_A a2, OFFSET_5, AO + + vspltw bp1, b1, 2 + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + + + addi AO, AO, 24 * SIZE + addi BO, BO, 3 * SIZE + b LL(148) + .align 4 + +LL(146): + cmpwi cr0, r0, 2 + bne LL(147) + + vspltw bp1, b1, 0 + vspltw bp2, b1, 1 + + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + + vmaddfp c01, a3, bp2, c01 + vmaddfp c02, a4, bp2, c02 + + addi AO, AO, 16 * SIZE + addi BO, BO, 2 * SIZE + b LL(148) + .align 4 + +LL(147): + cmpwi cr0, r0, 1 + bne LL(148) + + vspltw bp1, b1, 0 + + vmaddfp c01, a1, bp1, c01 + vmaddfp c02, a2, bp1, c02 + + addi AO, AO, 8 * SIZE + addi BO, BO, 1 * SIZE + .align 4 + +LL(148): + lvx alpha, OFFSET_0, SP + vxor VZERO, VZERO, VZERO + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + lvx C3, OFFSET_2, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c01, alpha, c01, C2 + vmaddfp c02, alpha, c02, C3 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + addi CO1, CO1, 8 * SIZE + .align 4 + +LL(150): + andi. I, M, 4 + ble LL(160) + + vxor c01, c01, c01 + + mr BO, B + + mr J, K + + andi. r0, B, 15 + ble+ LL(151) + + LOAD_A a1, OFFSET_0, AO + LOAD_B b1, OFFSET_0, BO + vspltw bp1, b1, 2 + vspltw bp2, b1, 3 + + addi AO, AO, 4 * SIZE + addi BO, BO, SIZE + + vmaddfp c01, a1, bp1, c01 + subi J, J, 1 + cmpwi cr0, J, 0 + ble LL(158) + + LOAD_A a1, OFFSET_0, AO + addi AO, AO, 4 * SIZE + addi BO, BO, SIZE + + vmaddfp c01, a1, bp2, c01 + subi J, J, 1 + cmpwi cr0, J, 0 + ble LL(158) + .align 4 + + +LL(151): + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + LOAD_B b1, OFFSET_0, BO + + srawi. r0, J, 2 + mtspr CTR, r0 + ble LL(155) + .align 4 + +LL(153): + vspltw bp1, b1, 0 + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c01, a2, bp2, c01 + vspltw bp1, b1, 2 + vmaddfp c01, a3, bp1, c01 + vspltw bp2, b1, 3 + vmaddfp c01, a4, bp2, c01 + + addi AO, AO, 16 * SIZE + addi BO, BO, 4 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + + LOAD_B b1, OFFSET_0, BO + + bdnz LL(153) + .align 4 + +LL(155): + andi. 
r0, J, 3 + ble+ LL(158) + + cmpwi cr0, r0, 3 + bne LL(156) + + vspltw bp1, b1, 0 + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c01, a2, bp2, c01 + vspltw bp1, b1, 2 + vmaddfp c01, a3, bp1, c01 + + addi AO, AO, 12 * SIZE + addi BO, BO, 3 * SIZE + b LL(158) + .align 4 + +LL(156): + cmpwi cr0, r0, 2 + bne LL(157) + + vspltw bp1, b1, 0 + vspltw bp2, b1, 1 + + vmaddfp c01, a1, bp1, c01 + vmaddfp c01, a2, bp2, c01 + + addi AO, AO, 8 * SIZE + addi BO, BO, 2 * SIZE + b LL(158) + .align 4 + +LL(157): + cmpwi cr0, r0, 1 + bne LL(158) + + vspltw bp1, b1, 0 + + vmaddfp c01, a1, bp1, c01 + + addi AO, AO, 4 * SIZE + addi BO, BO, 1 * SIZE + .align 4 + +LL(158): + lvx alpha, OFFSET_0, SP + vxor VZERO, VZERO, VZERO + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, VZERO, PERMRSHIFT1 + + vmaddfp c00, alpha, c00, C1 + vmaddfp c01, alpha, c01, C2 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + addi CO1, CO1, 4 * SIZE + .align 4 + +LL(160): + andi. I, M, 2 + ble LL(170) + + mr BO, B + + LFD f8, 0 * SIZE(AO) + LFD f9, 1 * SIZE(AO) + LFD f10, 2 * SIZE(AO) + LFD f11, 3 * SIZE(AO) + + LFD f12, 0 * SIZE(B) + LFD f13, 1 * SIZE(B) + + lfs f0, FZERO(SP) + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(165) + .align 4 + +LL(162): + FMADD f0, f8, f12, f0 + FMADD f1, f9, f12, f1 + FMADD f2, f10, f13, f2 + FMADD f3, f11, f13, f3 + + LFD f8, 4 * SIZE(AO) + LFD f9, 5 * SIZE(AO) + LFD f10, 6 * SIZE(AO) + LFD f11, 7 * SIZE(AO) + + LFD f12, 2 * SIZE(BO) + LFD f13, 3 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 2 * SIZE + bdnz LL(162) + .align 4 + +LL(165): + andi. r0, K, 1 + lfs f13, ALPHA(SP) + ble LL(168) + .align 4 + +LL(166): + FMADD f0, f8, f12, f0 + FMADD f1, f9, f12, f1 + + addi AO, AO, 2 * SIZE + addi BO, BO, 1 * SIZE + .align 4 + +LL(168): + LFD f8, 0 * SIZE(CO1) + LFD f9, 1 * SIZE(CO1) + + FADD f0, f0, f2 + FADD f1, f1, f3 + + FMADD f0, f0, f13, f8 + FMADD f1, f1, f13, f9 + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + + addi CO1, CO1, 2 * SIZE + .align 4 + +LL(170): + andi. I, M, 1 + ble LL(999) + + mr BO, B + + LFD f8, 0 * SIZE(AO) + LFD f9, 1 * SIZE(AO) + + LFD f10, 0 * SIZE(B) + LFD f11, 1 * SIZE(B) + + lfs f0, FZERO(SP) + fmr f1, f0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(175) + .align 4 + +LL(172): + FMADD f0, f8, f10, f0 + FMADD f1, f9, f11, f1 + + LFD f8, 2 * SIZE(AO) + LFD f9, 3 * SIZE(AO) + LFD f10, 2 * SIZE(BO) + LFD f11, 3 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 2 * SIZE + bdnz LL(172) + .align 4 + +LL(175): + andi. 
r0, K, 1 + lfs f13, ALPHA(SP) + ble LL(178) + .align 4 + +LL(176): + FMADD f0, f8, f10, f0 + + addi AO, AO, 1 * SIZE + addi BO, BO, 1 * SIZE + .align 4 + +LL(178): + LFD f8, 0 * SIZE(CO1) + + FADD f0, f0, f1 + + FMADD f0, f0, f13, f8 + + STFD f0, 0 * SIZE(CO1) + .align 4 + +LL(999): + mr SP, STACK + + li r0, 0 * 16 + lvx v20, SP, r0 + li r0, 1 * 16 + lvx v21, SP, r0 + li r0, 2 * 16 + lvx v22, SP, r0 + li r0, 3 * 16 + lvx v23, SP, r0 + li r0, 4 * 16 + lvx v24, SP, r0 + li r0, 5 * 16 + lvx v25, SP, r0 + li r0, 6 * 16 + lvx v26, SP, r0 + li r0, 7 * 16 + lvx v27, SP, r0 + li r0, 8 * 16 + lvx v28, SP, r0 + li r0, 9 * 16 + lvx v29, SP, r0 + li r0, 10 * 16 + lvx v30, SP, r0 + li r0, 11 * 16 + lvx v31, SP, r0 + + mtspr VRsave, VREG + +#ifdef __64BIT__ + ld r31, 192(SP) + ld r30, 200(SP) + ld r29, 208(SP) + ld r28, 216(SP) + ld r27, 224(SP) + ld r26, 232(SP) + ld r25, 240(SP) + ld r24, 248(SP) + ld r23, 256(SP) + ld r22, 264(SP) + ld r21, 272(SP) + ld r20, 280(SP) + ld r19, 288(SP) + ld r18, 296(SP) + ld r17, 304(SP) + ld r16, 312(SP) + ld r15, 320(SP) + ld r14, 328(SP) +#else + lwz r31, 192(SP) + lwz r30, 196(SP) + lwz r29, 200(SP) + lwz r28, 204(SP) + lwz r27, 208(SP) + lwz r26, 212(SP) + lwz r25, 216(SP) + lwz r24, 220(SP) + lwz r23, 224(SP) + lwz r22, 228(SP) + lwz r21, 232(SP) + lwz r20, 236(SP) + lwz r19, 240(SP) + lwz r18, 244(SP) + lwz r17, 248(SP) + lwz r16, 252(SP) + lwz r15, 256(SP) + lwz r14, 260(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/gemm_kernel_cell.S b/kernel/power/gemm_kernel_cell.S new file mode 100644 index 0000000000..0b0d75f501 --- /dev/null +++ b/kernel/power/gemm_kernel_cell.S @@ -0,0 +1,2642 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA 296(SP) +#define FZERO 304(SP) +#else +#define STACKSIZE 240 +#define ALPHA 224(SP) +#define FZERO 232(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#define OFFSET r6 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#define AORIG r18 +#define TEMP r19 +#define KK r20 +#define I r21 +#define J r22 +#define AO r23 +#define BO r24 +#define CO1 r25 +#define CO2 r26 +#define CO3 r27 +#define CO4 r28 + +#define PREA r29 +#define PREB r30 +#define PREC r31 + +#ifndef NEEDPARAM + +#ifndef DOUBLE +#include "../sparam.h" +#else +#include "../dparam.h" +#endif + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) +#if defined(TRMMKERNEL) + std r19, 240(SP) + std r18, 248(SP) +#endif +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) +#if defined(TRMMKERNEL) + stw r19, 192(SP) + stw r18, 196(SP) +#endif +#endif + + stfd f1, ALPHA + stw r0, FZERO + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif + + slwi LDC, LDC, BASE_SHIFT + +#if defined(TRMMKERNEL) +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 112 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 60 + STACKSIZE(SP) +#else + lwz OFFSET, 56 + STACKSIZE(SP) +#endif +#endif +#endif +#endif + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + +#ifndef PREFETCHTEST +/* Normal prefetch */ +#ifdef CELL + li PREC, 4 * SIZE +#endif + +#ifdef linux +#ifndef __64BIT__ + mr PREA, r10 + lwz PREB, 8 + STACKSIZE(SP) + lwz PREC, 12 + STACKSIZE(SP) +#else + ld PREA, 112 + STACKSIZE(SP) + ld PREB, 120 + STACKSIZE(SP) + ld PREC, 128 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ +xc ld PREA, 112 + STACKSIZE(SP) + ld PREB, 120 + STACKSIZE(SP) + ld PREC, 128 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz PREA, 60 + STACKSIZE(SP) + lwz PREB, 64 + STACKSIZE(SP) + lwz PREC, 68 + 
STACKSIZE(SP) +#else + lwz PREA, 56 + STACKSIZE(SP) + lwz PREB, 60 + STACKSIZE(SP) + lwz PREC, 64 + STACKSIZE(SP) +#endif +#endif +#endif + +#endif + +#ifndef PREFETCHTEST + li PREC, 3 * SIZE + li PREA, 16 * 12 * SIZE + li PREB, 16 * 12 * SIZE +#endif + + srawi. J, N, 2 + ble LL(40) + .align 4 + +LL(10): + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + srawi. I, M, 2 + mr AO, A + add C, CO4, LDC + ble LL(20) + .align 4 + +LL(11): +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + + LFD f28, 4 * SIZE(B) + LFD f29, 5 * SIZE(B) + LFD f30, 6 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, B, r0 + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + + LFD f28, 4 * SIZE(BO) + LFD f29, 5 * SIZE(BO) + LFD f30, 6 * SIZE(BO) +#endif + + dcbtst CO1, PREC + dcbtst CO2, PREC + dcbtst CO3, PREC + dcbtst CO4, PREC + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 4 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + ble LL(15) + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + + LFD f28, 4 * SIZE(B) + LFD f29, 5 * SIZE(B) + LFD f30, 6 * SIZE(B) + + dcbtst CO1, PREC + dcbtst CO2, PREC + dcbtst CO3, PREC + dcbtst CO4, PREC + + srawi. 
r0, K, 2 + mtspr CTR, r0 + mr BO, B + ble LL(15) +#endif + .align 4 + +#define NOP1 mr r18, r18 +#define NOP2 mr r19, r19 + +LL(12): + FMADD f0, f16, f20, f0 + dcbt AO, PREA + FMADD f4, f16, f21, f4 + dcbt BO, PREB + FMADD f8, f16, f22, f8 + LFD f31, 7 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFD f27, 7 * SIZE(AO) + + FMADD f1, f17, f20, f1 + LFD f16, 8 * SIZE(AO) + FMADD f5, f17, f21, f5 + NOP2 + FMADD f9, f17, f22, f9 + NOP1 + FMADD f13, f17, f23, f13 + LFD f17, 9 * SIZE(AO) + + FMADD f2, f18, f20, f2 + NOP1 + FMADD f6, f18, f21, f6 + NOP2 + FMADD f10, f18, f22, f10 + NOP1 + FMADD f14, f18, f23, f14 + LFD f18, 10 * SIZE(AO) + + FMADD f3, f19, f20, f3 + LFD f20, 8 * SIZE(BO) + FMADD f7, f19, f21, f7 + LFD f21, 9 * SIZE(BO) + FMADD f11, f19, f22, f11 + LFD f22, 10 * SIZE(BO) + FMADD f15, f19, f23, f15 + LFD f19, 11 * SIZE(AO) + + FMADD f0, f24, f28, f0 + LFD f23, 11 * SIZE(BO) + FMADD f4, f24, f29, f4 + NOP2 + FMADD f8, f24, f30, f8 + NOP1 + FMADD f12, f24, f31, f12 + LFD f24, 12 * SIZE(AO) + + FMADD f1, f25, f28, f1 + NOP1 + FMADD f5, f25, f29, f5 + NOP2 + FMADD f9, f25, f30, f9 + NOP1 + FMADD f13, f25, f31, f13 + LFD f25, 13 * SIZE(AO) + + FMADD f2, f26, f28, f2 + NOP1 + FMADD f6, f26, f29, f6 + NOP2 + FMADD f10, f26, f30, f10 + NOP1 + FMADD f14, f26, f31, f14 + LFD f26, 14 * SIZE(AO) + + FMADD f3, f27, f28, f3 + LFD f28, 12 * SIZE(BO) + FMADD f7, f27, f29, f7 + LFD f29, 13 * SIZE(BO) + FMADD f11, f27, f30, f11 + LFD f30, 14 * SIZE(BO) + FMADD f15, f27, f31, f15 + LFD f27, 15 * SIZE(AO) + + FMADD f0, f16, f20, f0 + LFD f31, 15 * SIZE(BO) + FMADD f4, f16, f21, f4 + NOP2 + FMADD f8, f16, f22, f8 + NOP1 + FMADD f12, f16, f23, f12 + LFD f16, 16 * SIZE(AO) + + FMADD f1, f17, f20, f1 + NOP1 + FMADD f5, f17, f21, f5 + NOP2 + FMADD f9, f17, f22, f9 + NOP1 + FMADD f13, f17, f23, f13 + LFD f17, 17 * SIZE(AO) + + FMADD f2, f18, f20, f2 + NOP1 + FMADD f6, f18, f21, f6 + NOP2 + FMADD f10, f18, f22, f10 + NOP1 + FMADD f14, f18, f23, f14 + LFD f18, 18 * SIZE(AO) + + FMADD f3, f19, f20, f3 + LFD f20, 16 * SIZE(BO) + FMADD f7, f19, f21, f7 + LFD f21, 17 * SIZE(BO) + FMADD f11, f19, f22, f11 + LFD f22, 18 * SIZE(BO) + FMADD f15, f19, f23, f15 + LFD f19, 19 * SIZE(AO) + + FMADD f0, f24, f28, f0 + LFD f23, 19 * SIZE(BO) + FMADD f4, f24, f29, f4 + NOP2 + FMADD f8, f24, f30, f8 + NOP1 + FMADD f12, f24, f31, f12 + LFD f24, 20 * SIZE(AO) + + FMADD f1, f25, f28, f1 + NOP1 + FMADD f5, f25, f29, f5 + NOP2 + FMADD f9, f25, f30, f9 + NOP1 + FMADD f13, f25, f31, f13 + LFD f25, 21 * SIZE(AO) + + FMADD f2, f26, f28, f2 + NOP1 + FMADD f6, f26, f29, f6 + NOP2 + FMADD f10, f26, f30, f10 + NOP1 + FMADD f14, f26, f31, f14 + LFD f26, 22 * SIZE(AO) + + FMADD f3, f27, f28, f3 + LFD f28, 20 * SIZE(BO) + FMADD f7, f27, f29, f7 + LFD f29, 21 * SIZE(BO) + FMADD f11, f27, f30, f11 + LFD f30, 22 * SIZE(BO) + FMADD f15, f27, f31, f15 + addi AO, AO, 16 * SIZE + + addi BO, BO, 16 * SIZE + bdnz LL(12) + .align 4 + +LL(15): + lfd f30, ALPHA + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 4 +#endif + + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP +#else + + andi. 
r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ LL(18) + .align 4 + +LL(16): + FMADD f0, f16, f20, f0 + FMADD f5, f17, f21, f5 + FMADD f10, f18, f22, f10 + FMADD f15, f19, f23, f15 + + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + FMADD f4, f16, f21, f4 + + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + + FMADD f11, f19, f22, f11 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + FMADD f14, f18, f23, f14 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(16) + .align 4 + +LL(18): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) + + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 + FMADD f2, f2, f30, f18 + FMADD f3, f3, f30, f19 + + FMADD f4, f4, f30, f20 + FMADD f5, f5, f30, f21 + FMADD f6, f6, f30, f22 + FMADD f7, f7, f30, f23 + + LFD f16, 0 * SIZE(CO3) + LFD f17, 1 * SIZE(CO3) + LFD f18, 2 * SIZE(CO3) + LFD f19, 3 * SIZE(CO3) + + LFD f20, 0 * SIZE(CO4) + LFD f21, 1 * SIZE(CO4) + LFD f22, 2 * SIZE(CO4) + LFD f23, 3 * SIZE(CO4) + + FMADD f8, f8, f30, f16 + FMADD f9, f9, f30, f17 + FMADD f10, f10, f30, f18 + FMADD f11, f11, f30, f19 + + FMADD f12, f12, f30, f20 + FMADD f13, f13, f30, f21 + FMADD f14, f14, f30, f22 + FMADD f15, f15, f30, f23 + +#else + + FMUL f0, f0, f30 + FMUL f1, f1, f30 + FMUL f2, f2, f30 + FMUL f3, f3, f30 + + FMUL f4, f4, f30 + FMUL f5, f5, f30 + FMUL f6, f6, f30 + FMUL f7, f7, f30 + + FMUL f8, f8, f30 + FMUL f9, f9, f30 + FMUL f10, f10, f30 + FMUL f11, f11, f30 + + FMUL f12, f12, f30 + FMUL f13, f13, f30 + FMUL f14, f14, f30 + FMUL f15, f15, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f10, 2 * SIZE(CO3) + STFD f11, 3 * SIZE(CO3) + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + STFD f14, 2 * SIZE(CO4) + STFD f15, 3 * SIZE(CO4) + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + addi CO3, CO3, 4 * SIZE + addi CO4, CO4, 4 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -4 +#else + addi TEMP, TEMP, -4 +#endif + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 4 +#endif +#endif + + addic. I, I, -1 + bgt+ LL(11) + .align 4 + +LL(20): + andi. 
I, M, 2 + ble LL(30) + +#if defined(TRMMKERNEL) +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 4 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble LL(25) + .align 5 + +LL(22): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f2, f18, f24, f2 + FMADD f3, f19, f24, f3 + FMADD f6, f18, f25, f6 + FMADD f7, f19, f25, f7 + + FMADD f10, f18, f26, f10 + FMADD f11, f19, f26, f11 + FMADD f14, f18, f27, f14 + FMADD f15, f19, f27, f15 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f2, f18, f24, f2 + FMADD f3, f19, f24, f3 + FMADD f6, f18, f25, f6 + FMADD f7, f19, f25, f7 + + FMADD f10, f18, f26, f10 + FMADD f11, f19, f26, f11 + FMADD f14, f18, f27, f14 + FMADD f15, f19, f27, f15 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 16 * SIZE + dcbt 0, BO, PREB + bdnz LL(22) + + fadd f0, f2, f0 + fadd f1, f3, f1 + fadd f4, f6, f4 + fadd f5, f7, f5 + fadd f8, f10, f8 + fadd f9, f11, f9 + fadd f12, f14, f12 + fadd f13, f15, f13 + .align 4 + +LL(25): + lfd f30, ALPHA + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 4 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + + andi. 
r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ LL(28) + .align 4 + +LL(26): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(26) + .align 4 + +LL(28): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 0 * SIZE(CO2) + LFD f19, 1 * SIZE(CO2) + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 + FMADD f4, f4, f30, f18 + FMADD f5, f5, f30, f19 + + LFD f20, 0 * SIZE(CO3) + LFD f21, 1 * SIZE(CO3) + LFD f22, 0 * SIZE(CO4) + LFD f23, 1 * SIZE(CO4) + + FMADD f8, f8, f30, f20 + FMADD f9, f9, f30, f21 + FMADD f12, f12, f30, f22 + FMADD f13, f13, f30, f23 +#else + FMUL f0, f0, f30 + FMUL f1, f1, f30 + FMUL f4, f4, f30 + FMUL f5, f5, f30 + + FMUL f8, f8, f30 + FMUL f9, f9, f30 + FMUL f12, f12, f30 + FMUL f13, f13, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + addi CO3, CO3, 2 * SIZE + addi CO4, CO4, 2 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -4 +#endif + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + .align 4 + +LL(30): + andi. I, M, 1 + ble LL(39) + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 4 +#endif + + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. 
r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble LL(35) + .align 5 + +LL(32): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f1, f17, f24, f1 + FMADD f5, f17, f25, f5 + FMADD f9, f17, f26, f9 + FMADD f13, f17, f27, f13 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMADD f0, f18, f20, f0 + FMADD f4, f18, f21, f4 + FMADD f8, f18, f22, f8 + FMADD f12, f18, f23, f12 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f1, f19, f24, f1 + FMADD f5, f19, f25, f5 + FMADD f9, f19, f26, f9 + FMADD f13, f19, f27, f13 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 16 * SIZE + dcbt 0, BO, PREB + bdnz LL(32) + + fadd f0, f1, f0 + fadd f4, f5, f4 + fadd f8, f9, f8 + fadd f12, f13, f12 + .align 4 + +LL(35): + lfd f30, ALPHA +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 4 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + andi. r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ LL(38) + .align 4 + +LL(36): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f16, 1 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(36) + .align 4 + +LL(38): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f18, 0 * SIZE(CO2) + LFD f20, 0 * SIZE(CO3) + LFD f22, 0 * SIZE(CO4) + + FMADD f0, f0, f30, f16 + FMADD f4, f4, f30, f18 + FMADD f8, f8, f30, f20 + FMADD f12, f12, f30, f22 +#else + FMUL f0, f0, f30 + FMUL f4, f4, f30 + FMUL f8, f8, f30 + FMUL f12, f12, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f8, 0 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f12, f0 + fmr f13, f0 + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -1 +#else + addi TEMP, TEMP, -4 +#endif + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + .align 4 + + +LL(39): +#if defined(TRMMKERNEL) && !defined(LEFT) + addi KK, KK, 4 +#endif + + mr B, BO + addic. J, J, -1 + bgt LL(10) + .align 4 + +LL(40): + mr CO1, C + add CO2, C, LDC + andi. J, N, 2 + ble LL(70) + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. 
I, M, 2 + add C, CO2, LDC + mr AO, A + ble LL(50) + .align 4 + +LL(41): +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) +#endif + + dcbt CO1, PREC + dcbt CO2, PREC + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 2 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + +#else + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbt CO1, PREC + dcbt CO2, PREC + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble LL(45) + .align 5 + +LL(42): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + FMADD f4, f16, f23, f4 + FMADD f5, f17, f23, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 12 * SIZE(AO) + LFD f17, 13 * SIZE(AO) + LFD f18, 14 * SIZE(AO) + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + FMADD f4, f16, f23, f4 + FMADD f5, f17, f23, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 8 * SIZE + dcbt 0, BO, PREB + bdnz LL(42) + .align 4 + +LL(45): + lfd f30, ALPHA +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP +#else + andi. 
r0, K, 3 + mtspr CTR, r0 +#endif + ble+ LL(48) + .align 4 + +LL(46): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(46) + .align 4 + +LL(48): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) + + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 + FMADD f2, f2, f30, f18 + FMADD f3, f3, f30, f19 + + FMADD f4, f4, f30, f20 + FMADD f5, f5, f30, f21 + FMADD f6, f6, f30, f22 + FMADD f7, f7, f30, f23 +#else + FMUL f0, f0, f30 + FMUL f1, f1, f30 + FMUL f2, f2, f30 + FMUL f3, f3, f30 + + FMUL f4, f4, f30 + FMUL f5, f5, f30 + FMUL f6, f6, f30 + FMUL f7, f7, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -4 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 4 +#endif +#endif + + addic. I, I, -1 + bgt+ LL(41) + .align 4 + +LL(50): + andi. I, M, 2 + ble LL(60) + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. 
r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble LL(55) + .align 5 + +LL(52): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f16, f21, f2 + FMADD f3, f17, f21, f3 + + FMADD f4, f18, f22, f4 + FMADD f5, f19, f22, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f16, f24, f0 + FMADD f1, f17, f24, f1 + FMADD f2, f16, f25, f2 + FMADD f3, f17, f25, f3 + + FMADD f4, f18, f26, f4 + FMADD f5, f19, f26, f5 + FMADD f6, f18, f27, f6 + FMADD f7, f19, f27, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + dcbt 0, BO, PREB + bdnz LL(52) + .align 4 + +LL(55): + lfd f30, ALPHA +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + andi. r0, K, 3 + mtspr CTR, r0 +#endif + ble+ LL(58) + .align 4 + +LL(56): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f16, f21, f2 + FMADD f3, f17, f21, f3 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(56) + .align 4 + +LL(58): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 0 * SIZE(CO2) + LFD f19, 1 * SIZE(CO2) + + FADD f0, f4, f0 + FADD f1, f5, f1 + FADD f2, f6, f2 + FADD f3, f7, f3 + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 + FMADD f2, f2, f30, f18 + FMADD f3, f3, f30, f19 +#else + FADD f0, f4, f0 + FADD f1, f5, f1 + FADD f2, f6, f2 + FADD f3, f7, f3 + + FMUL f0, f0, f30 + FMUL f1, f1, f30 + FMUL f2, f2, f30 + FMUL f3, f3, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + .align 4 + +LL(60): + andi. 
I, M, 1 + ble LL(69) + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble LL(65) + .align 5 + +LL(62): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f17, f22, f2 + FMADD f3, f17, f23, f3 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f18, f24, f0 + FMADD f1, f18, f25, f1 + FMADD f2, f19, f26, f2 + FMADD f3, f19, f27, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(62) + .align 4 + +LL(65): + lfd f30, ALPHA + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + andi. r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ LL(68) + .align 4 + +LL(66): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + + LFD f16, 1 * SIZE(AO) + + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(66) + .align 4 + +LL(68): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f18, 0 * SIZE(CO2) + + FADD f0, f2, f0 + FADD f1, f3, f1 + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f18 +#else + FADD f0, f2, f0 + FADD f1, f3, f1 + + FMUL f0, f0, f30 + FMUL f1, f1, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 0 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -1 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 1 +#endif +#endif + .align 4 + +LL(69): +#if defined(TRMMKERNEL) && !defined(LEFT) + addi KK, KK, 2 +#endif + + mr B, BO + .align 4 + +LL(70): + mr CO1, C + andi. 
J, N, 1 + ble LL(999) + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + srawi. I, M, 2 + mr AO, A + ble LL(80) + .align 4 + +LL(71): +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) +#endif + + dcbt CO1, PREC + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 1 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbt CO1, PREC + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble LL(75) + .align 5 + +LL(72): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f21, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f21, f2 + FMADD f3, f19, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + LFD f16, 12 * SIZE(AO) + LFD f17, 13 * SIZE(AO) + LFD f18, 14 * SIZE(AO) + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f23, f0 + FMADD f1, f17, f23, f1 + FMADD f2, f18, f23, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 4 * SIZE + dcbt 0, BO, PREB + bdnz LL(72) + .align 4 + +LL(75): + lfd f30, ALPHA +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + andi. 
r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ LL(78) + .align 4 + +LL(76): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 1 * SIZE(BO) + + addi BO, BO, 1 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(76) + .align 4 + +LL(78): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 + FMADD f2, f2, f30, f18 + FMADD f3, f3, f30, f19 +#else + FMUL f0, f0, f30 + FMUL f1, f1, f30 + FMUL f2, f2, f30 + FMUL f3, f3, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -4 +#else + addi TEMP, TEMP, -1 +#endif + slwi r0 , TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 4 +#endif +#endif + + addi CO1, CO1, 4 * SIZE + addic. I, I, -1 + bgt+ LL(71) + .align 4 + +LL(80): + andi. I, M, 2 + ble LL(90) + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B + +#endif + ble LL(85) + .align 5 + +LL(82): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f21, f2 + FMADD f3, f19, f21, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f23, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 4 * SIZE + dcbt 0, BO, PREB + bdnz LL(82) + .align 4 + +LL(85): + lfd f30, ALPHA +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + + andi. 
r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ LL(88) + .align 4 + +LL(86): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 1 * SIZE(BO) + + addi BO, BO, 1 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(86) + .align 4 + +LL(88): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + + FADD f0, f2, f0 + FADD f1, f3, f1 + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 +#else + FADD f0, f2, f0 + FADD f1, f3, f1 + + FMUL f0, f0, f30 + FMUL f1, f1, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + addi CO1, CO1, 2 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -1 +#endif + slwi r0 , TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + .align 4 + +LL(90): + andi. I, M, 1 + ble LL(999) + + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + srawi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, K, 3 + mtspr CTR, r0 + mr BO, B +#endif + ble LL(95) + .align 5 + +LL(92): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(92) + .align 4 + +LL(95): + lfd f30, ALPHA + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 7 + mtspr CTR, TEMP + +#else + + andi. 
r0, K, 7 + mtspr CTR, r0 + +#endif + ble+ LL(98) + .align 4 + +LL(96): + FMADD f0, f16, f20, f0 + LFD f16, 1 * SIZE(AO) + LFD f20, 1 * SIZE(BO) + addi BO, BO, 1 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(96) + .align 4 + +LL(98): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + + FADD f0, f1, f0 + FADD f2, f3, f2 + FADD f0, f2, f0 + + FMADD f0, f0, f30, f16 +#else + FADD f0, f1, f0 + FADD f2, f3, f2 + FADD f0, f2, f0 + + FMUL f0, f0, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + .align 4 + +LL(999): + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) +#if defined(TRMMKERNEL) || defined(TRSMKERNEL) + ld r19, 240(SP) + ld r18, 248(SP) +#endif +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) +#if defined(TRMMKERNEL) || defined(TRSMKERNEL) + lwz r19, 192(SP) + lwz r18, 196(SP) +#endif +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/gemm_kernel_g4.S b/kernel/power/gemm_kernel_g4.S new file mode 100644 index 0000000000..1ee4b2853c --- /dev/null +++ b/kernel/power/gemm_kernel_g4.S @@ -0,0 +1,2412 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA 296(SP) +#define FZERO 304(SP) +#else +#define STACKSIZE 240 +#define ALPHA 224(SP) +#define FZERO 232(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#define OFFSET r6 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#define AORIG r18 +#define TEMP r19 +#define KK r20 +#define I r21 +#define J r22 +#define AO r23 +#define BO r24 +#define CO1 r25 +#define CO2 r26 +#define CO3 r27 +#define CO4 r28 + +#define PREA r29 +#define PREC r30 + + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) +#if defined(TRMMKERNEL) + std r19, 240(SP) + std r18, 248(SP) +#endif +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) +#if defined(TRMMKERNEL) + stw r19, 192(SP) + stw r18, 196(SP) +#endif +#endif + + stfd f1, ALPHA + stw r0, FZERO + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif + + slwi LDC, LDC, BASE_SHIFT + +#if defined(TRMMKERNEL) +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 112 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 60 + STACKSIZE(SP) +#else + lwz OFFSET, 56 + STACKSIZE(SP) +#endif +#endif +#endif +#endif + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif + + li PREA, 8 * 8 * SIZE + li PREC, 3 * SIZE + + cmpwi cr0, M, 0 + ble .L999 + cmpwi cr0, N, 0 + ble .L999 + cmpwi cr0, K, 0 + ble .L999 + + srawi. 
J, N, 2 + ble .L40 + .align 4 + +#define A1 f16 +#define A2 f17 +#define A3 f18 +#define A4 f19 +#define A5 f20 +#define A6 f21 +#define B1 f22 +#define B2 f23 +#define B3 f24 +#define B4 f25 +#define B5 f26 +#define B6 f27 +#define B7 f28 +#define B8 f29 +#define B9 f30 +#define B10 f31 + + +.L10: + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + srawi. I, M, 2 + mr AO, A + add C, CO4, LDC + ble .L20 + .align 4 + +.L11: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD A1, 0 * SIZE(AO) + LFD A2, 1 * SIZE(AO) + LFD A3, 2 * SIZE(AO) + LFDU A5, 4 * SIZE(AO) + + LFD B1, 0 * SIZE(B) + LFD B2, 1 * SIZE(B) + LFD B3, 2 * SIZE(B) + LFD B4, 3 * SIZE(B) + mr BO, B +#else + slwi r0, KK, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, B, r0 + + LFD A1, 0 * SIZE(AO) + LFD A2, 1 * SIZE(AO) + LFD A3, 2 * SIZE(AO) + LFDU A5, 4 * SIZE(AO) + + LFD B1, 0 * SIZE(BO) + LFD B2, 1 * SIZE(BO) + LFD B3, 2 * SIZE(BO) + LFD B4, 3 * SIZE(BO) +#endif + + dcbtst CO1, PREC + dcbtst CO2, PREC + dcbtst CO3, PREC + dcbtst CO4, PREC + + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 4 +#endif + srawi. TEMP, TEMP, 1 + mtspr CTR, TEMP + ble .L15 + +#else + LFD A1, 0 * SIZE(AO) + LFD A2, 1 * SIZE(AO) + LFD A3, 2 * SIZE(AO) + LFDU A5, 4 * SIZE(AO) + + LFD B1, 0 * SIZE(B) + LFD B2, 1 * SIZE(B) + LFD B3, 2 * SIZE(B) + LFD B4, 3 * SIZE(B) + + dcbtst CO1, PREC + dcbtst CO2, PREC + dcbtst CO3, PREC + dcbtst CO4, PREC + + srawi. 
r0, K, 1 + mtspr CTR, r0 + mr BO, B + ble .L15 +#endif + .align 4 + +.L12: + FMADD f0, A1, B1, f0 + LFDU B5, 4 * SIZE(BO) + FMADD f4, A1, B2, f4 + dcbt AO, PREA + + FMADD f8, A1, B3, f8 + LFD A4, -1 * SIZE(AO) + FMADD f12, A1, B4, f12 + dcbt BO, PREA + + FMADD f1, A2, B1, f1 + LFD B6, 1 * SIZE(BO) + FMADD f5, A2, B2, f5 + nop + + FMADD f9, A2, B3, f9 + LFDU A1, 4 * SIZE(AO) + FMADD f13, A2, B4, f13 + nop + + FMADD f2, A3, B1, f2 + LFD B7, 2 * SIZE(BO) + FMADD f6, A3, B2, f6 + nop + + FMADD f10, A3, B3, f10 + LFD A2, -3 * SIZE(AO) + FMADD f14, A3, B4, f14 + nop + + FMADD f3, A4, B1, f3 + LFD B8, 3 * SIZE(BO) + FMADD f7, A4, B2, f7 + nop + + FMADD f11, A4, B3, f11 + LFD A3, -2 * SIZE(AO) + FMADD f15, A4, B4, f15 + nop + + FMADD f0, A5, B5, f0 + LFDU B1, 4 * SIZE(BO) + FMADD f4, A5, B6, f4 + nop + + FMADD f8, A5, B7, f8 + LFD A4, -1 * SIZE(AO) + FMADD f12, A5, B8, f12 +#ifdef DOUBLE + dcbt BO, PREA +#else + nop +#endif + + FMADD f1, A2, B5, f1 + LFD B2, 1 * SIZE(BO) + FMADD f5, A2, B6, f5 + nop + + FMADD f9, A2, B7, f9 + LFDU A5, 4 * SIZE(AO) + FMADD f13, A2, B8, f13 +#ifdef DOUBLE + dcbt AO, PREA +#else + nop +#endif + + FMADD f2, A3, B5, f2 + LFD B3, 2 * SIZE(BO) + FMADD f6, A3, B6, f6 + nop + + FMADD f10, A3, B7, f10 + LFD A2, -3 * SIZE(AO) + FMADD f14, A3, B8, f14 + nop + + FMADD f3, A4, B5, f3 + LFD B4, 3 * SIZE(BO) + FMADD f7, A4, B6, f7 + nop + + FMADD f11, A4, B7, f11 + LFD A3, -2 * SIZE(AO) + FMADD f15, A4, B8, f15 + bdnz .L12 + .align 4 + +.L15: + addi AO, AO, -4 * SIZE + lfd f30, ALPHA + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 4 +#endif + + andi. TEMP, TEMP, 1 +#else + + andi. r0, K, 1 +#endif + ble+ .L18 + +.L16: + LFD A4, 3 * SIZE(AO) + + FMADD f0, A1, B1, f0 + FMADD f4, A1, B2, f4 + FMADD f8, A1, B3, f8 + FMADD f12, A1, B4, f12 + + FMADD f1, A2, B1, f1 + FMADD f5, A2, B2, f5 + FMADD f9, A2, B3, f9 + FMADD f13, A2, B4, f13 + + FMADD f2, A3, B1, f2 + FMADD f6, A3, B2, f6 + FMADD f10, A3, B3, f10 + FMADD f14, A3, B4, f14 + + FMADD f3, A4, B1, f3 + FMADD f7, A4, B2, f7 + FMADD f11, A4, B3, f11 + FMADD f15, A4, B4, f15 + addi AO, AO, 4 * SIZE + addi BO, BO, 4 * SIZE + + .align 4 + +.L18: +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) + + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) + + FMADD f0, f0, f30, f16 + LFD f16, 0 * SIZE(CO3) + FMADD f1, f1, f30, f17 + LFD f17, 1 * SIZE(CO3) + FMADD f2, f2, f30, f18 + LFD f18, 2 * SIZE(CO3) + FMADD f3, f3, f30, f19 + LFD f19, 3 * SIZE(CO3) + + FMADD f4, f4, f30, f20 + LFD f20, 0 * SIZE(CO4) + FMADD f5, f5, f30, f21 + LFD f21, 1 * SIZE(CO4) + FMADD f6, f6, f30, f22 + LFD f22, 2 * SIZE(CO4) + FMADD f7, f7, f30, f23 + LFD f23, 3 * SIZE(CO4) + + FMADD f8, f8, f30, f16 + FMADD f9, f9, f30, f17 + FMADD f10, f10, f30, f18 + FMADD f11, f11, f30, f19 + + FMADD f12, f12, f30, f20 + FMADD f13, f13, f30, f21 + FMADD f14, f14, f30, f22 + FMADD f15, f15, f30, f23 + +#else + + FMUL f0, f0, f30 + FMUL f1, f1, f30 + FMUL f2, f2, f30 + FMUL f3, f3, f30 + + FMUL f4, f4, f30 + FMUL f5, f5, f30 + FMUL f6, f6, f30 + FMUL f7, f7, f30 + + FMUL f8, f8, f30 + FMUL f9, f9, f30 + FMUL f10, f10, f30 + FMUL f11, f11, f30 + + FMUL f12, f12, f30 + FMUL f13, f13, f30 + FMUL f14, f14, f30 + FMUL f15, f15, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * 
SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + STFD f4, 0 * SIZE(CO2) + fmr f4, f0 + STFD f5, 1 * SIZE(CO2) + fmr f5, f0 + STFD f6, 2 * SIZE(CO2) + fmr f6, f0 + STFD f7, 3 * SIZE(CO2) + fmr f7, f0 + + STFD f8, 0 * SIZE(CO3) + fmr f8, f0 + STFD f9, 1 * SIZE(CO3) + fmr f9, f0 + STFD f10, 2 * SIZE(CO3) + fmr f10, f0 + STFD f11, 3 * SIZE(CO3) + fmr f11, f0 + + STFD f12, 0 * SIZE(CO4) + fmr f12, f0 + STFD f13, 1 * SIZE(CO4) + fmr f13, f0 + STFD f14, 2 * SIZE(CO4) + fmr f14, f0 + STFD f15, 3 * SIZE(CO4) + fmr f15, f0 + + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + addi CO3, CO3, 4 * SIZE + addi CO4, CO4, 4 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -4 +#else + addi TEMP, TEMP, -4 +#endif + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 4 +#endif +#endif + + addic. I, I, -1 + bgt+ .L11 + .align 4 + +.L20: + andi. I, M, 2 + ble .L30 + +#if defined(TRMMKERNEL) +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 4 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. 
r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble .L25 + .align 5 + +.L22: + FMADD f0, f16, f20, f0 + nop + FMADD f1, f17, f20, f1 + LFD f20, 8 * SIZE(BO) + FMADD f4, f16, f21, f4 + nop + FMADD f5, f17, f21, f5 + LFD f21, 9 * SIZE(BO) + + FMADD f8, f16, f22, f8 + nop + FMADD f9, f17, f22, f9 + LFD f22, 10 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFD f16, 4 * SIZE(AO) + FMADD f13, f17, f23, f13 + LFD f23, 11 * SIZE(BO) + + FMADD f2, f18, f24, f2 + LFD f17, 5 * SIZE(AO) + FMADD f3, f19, f24, f3 + LFD f24, 12 * SIZE(BO) + FMADD f6, f18, f25, f6 + nop + FMADD f7, f19, f25, f7 + LFD f25, 13 * SIZE(BO) + + FMADD f10, f18, f26, f10 + nop + FMADD f11, f19, f26, f11 + LFD f26, 14 * SIZE(BO) + FMADD f14, f18, f27, f14 + LFD f18, 6 * SIZE(AO) + FMADD f15, f19, f27, f15 + LFD f27, 15 * SIZE(BO) + + FMADD f0, f16, f20, f0 + LFD f19, 7 * SIZE(AO) + FMADD f1, f17, f20, f1 + LFDU f20, 16 * SIZE(BO) + FMADD f4, f16, f21, f4 + nop + FMADD f5, f17, f21, f5 + LFD f21, 1 * SIZE(BO) + + FMADD f8, f16, f22, f8 + nop + FMADD f9, f17, f22, f9 + LFD f22, 2 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFDU f16, 8 * SIZE(AO) + FMADD f13, f17, f23, f13 + LFD f23, 3 * SIZE(BO) + + FMADD f2, f18, f24, f2 + LFD f17, 1 * SIZE(AO) + FMADD f3, f19, f24, f3 + LFD f24, 4 * SIZE(BO) + FMADD f6, f18, f25, f6 + nop + FMADD f7, f19, f25, f7 + LFD f25, 5 * SIZE(BO) + + FMADD f10, f18, f26, f10 + nop + FMADD f11, f19, f26, f11 + LFD f26, 6 * SIZE(BO) + FMADD f14, f18, f27, f14 + LFD f18, 2 * SIZE(AO) + FMADD f15, f19, f27, f15 + LFD f19, 3 * SIZE(AO) + LFD f27, 7 * SIZE(BO) + bdnz .L22 + + fadd f0, f2, f0 + fadd f1, f3, f1 + fadd f4, f6, f4 + fadd f5, f7, f5 + fadd f8, f10, f8 + fadd f9, f11, f9 + fadd f12, f14, f12 + fadd f13, f15, f13 + .align 4 + +.L25: + lfd f30, ALPHA + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 4 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + + andi. 
r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ .L28 + .align 4 + +.L26: + FMADD f0, f16, f20, f0 + nop + FMADD f1, f17, f20, f1 + LFDU f20, 4 * SIZE(BO) + FMADD f4, f16, f21, f4 + nop + FMADD f5, f17, f21, f5 + LFD f21, 1 * SIZE(BO) + + FMADD f8, f16, f22, f8 + nop + FMADD f9, f17, f22, f9 + LFD f22, 2 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFDU f16, 2 * SIZE(AO) + FMADD f13, f17, f23, f13 + LFD f17, 1 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + bdnz .L26 + .align 4 + +.L28: +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 0 * SIZE(CO2) + LFD f19, 1 * SIZE(CO2) + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 + FMADD f4, f4, f30, f18 + FMADD f5, f5, f30, f19 + + LFD f20, 0 * SIZE(CO3) + LFD f21, 1 * SIZE(CO3) + LFD f22, 0 * SIZE(CO4) + LFD f23, 1 * SIZE(CO4) + + FMADD f8, f8, f30, f20 + FMADD f9, f9, f30, f21 + FMADD f12, f12, f30, f22 + FMADD f13, f13, f30, f23 +#else + FMUL f0, f0, f30 + FMUL f1, f1, f30 + FMUL f4, f4, f30 + FMUL f5, f5, f30 + + FMUL f8, f8, f30 + FMUL f9, f9, f30 + FMUL f12, f12, f30 + FMUL f13, f13, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + addi CO3, CO3, 2 * SIZE + addi CO4, CO4, 2 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -4 +#endif + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + .align 4 + +.L30: + andi. I, M, 1 + ble .L39 + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 4 +#endif + + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. 
r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble .L35 + .align 5 + +.L32: + FMADD f0, f16, f20, f0 + LFD f20, 8 * SIZE(BO) + FMADD f4, f16, f21, f4 + LFD f21, 9 * SIZE(BO) + FMADD f8, f16, f22, f8 + LFD f22, 10 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFD f23, 11 * SIZE(BO) + LFDU f16, 4 * SIZE(AO) + + FMADD f1, f17, f24, f1 + LFD f24, 12 * SIZE(BO) + FMADD f5, f17, f25, f5 + LFD f25, 13 * SIZE(BO) + FMADD f9, f17, f26, f9 + LFD f26, 14 * SIZE(BO) + FMADD f13, f17, f27, f13 + LFD f27, 15 * SIZE(BO) + LFD f17, 1 * SIZE(AO) + + FMADD f0, f18, f20, f0 + LFDU f20, 16 * SIZE(BO) + FMADD f4, f18, f21, f4 + LFD f21, 1 * SIZE(BO) + FMADD f8, f18, f22, f8 + LFD f22, 2 * SIZE(BO) + FMADD f12, f18, f23, f12 + LFD f23, 3 * SIZE(BO) + LFD f18, 2 * SIZE(AO) + + FMADD f1, f19, f24, f1 + LFD f24, 4 * SIZE(BO) + FMADD f5, f19, f25, f5 + LFD f25, 5 * SIZE(BO) + FMADD f9, f19, f26, f9 + LFD f26, 6 * SIZE(BO) + FMADD f13, f19, f27, f13 + LFD f27, 7 * SIZE(BO) + LFD f19, 3 * SIZE(AO) + bdnz .L32 + + fadd f0, f1, f0 + fadd f4, f5, f4 + fadd f8, f9, f8 + fadd f12, f13, f12 + .align 4 + +.L35: + lfd f30, ALPHA +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 4 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + andi. r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ .L38 + .align 4 + +.L36: + FMADD f0, f16, f20, f0 + LFDU f20, 4 * SIZE(BO) + FMADD f4, f16, f21, f4 + LFD f21, 1 * SIZE(BO) + FMADD f8, f16, f22, f8 + LFD f22, 2 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFDU f16, 1 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + bdnz .L36 + .align 4 + +.L38: +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f18, 0 * SIZE(CO2) + LFD f20, 0 * SIZE(CO3) + LFD f22, 0 * SIZE(CO4) + + FMADD f0, f0, f30, f16 + FMADD f4, f4, f30, f18 + FMADD f8, f8, f30, f20 + FMADD f12, f12, f30, f22 +#else + FMUL f0, f0, f30 + FMUL f4, f4, f30 + FMUL f8, f8, f30 + FMUL f12, f12, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f8, 0 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f12, f0 + fmr f13, f0 + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -1 +#else + addi TEMP, TEMP, -4 +#endif + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + .align 4 + + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi KK, KK, 4 +#endif + + mr B, BO + addic. J, J, -1 + bgt .L10 + .align 4 + +.L40: + mr CO1, C + add CO2, C, LDC + andi. J, N, 2 + ble .L70 + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. 
I, M, 2 + add C, CO2, LDC + mr AO, A + ble .L50 + .align 4 + +.L41: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 2 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + +#else + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble .L45 + .align 5 + +.L42: + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + LFD f20, 4 * SIZE(BO) + + FMADD f4, f16, f21, f4 + LFD f16, 4 * SIZE(AO) + FMADD f5, f17, f21, f5 + LFD f17, 5 * SIZE(AO) + FMADD f6, f18, f21, f6 + LFD f18, 6 * SIZE(AO) + FMADD f7, f19, f21, f7 + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + LFD f21, 5 * SIZE(BO) + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + LFD f22, 6 * SIZE(BO) + + FMADD f4, f16, f23, f4 + LFD f16, 8 * SIZE(AO) + FMADD f5, f17, f23, f5 + LFD f17, 9 * SIZE(AO) + FMADD f6, f18, f23, f6 + LFD f18, 10 * SIZE(AO) + FMADD f7, f19, f23, f7 + LFD f19, 11 * SIZE(AO) + + FMADD f0, f16, f20, f0 + LFD f23, 7 * SIZE(BO) + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + LFDU f20, 8 * SIZE(BO) + + FMADD f4, f16, f21, f4 + LFD f16, 12 * SIZE(AO) + FMADD f5, f17, f21, f5 + LFD f17, 13 * SIZE(AO) + FMADD f6, f18, f21, f6 + LFD f18, 14 * SIZE(AO) + FMADD f7, f19, f21, f7 + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f22, f0 + LFD f21, 1 * SIZE(BO) + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + LFD f22, 2 * SIZE(BO) + + FMADD f4, f16, f23, f4 + LFDU f16, 16 * SIZE(AO) + FMADD f5, f17, f23, f5 + LFD f17, 1 * SIZE(AO) + FMADD f6, f18, f23, f6 + LFD f18, 2 * SIZE(AO) + FMADD f7, f19, f23, f7 + LFD f19, 3 * SIZE(AO) + + LFD f23, 3 * SIZE(BO) + bdnz .L42 + .align 4 + +.L45: + lfd f30, ALPHA +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP +#else + andi. 
r0, K, 3 + mtspr CTR, r0 +#endif + ble+ .L48 + .align 4 + +.L46: + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + LFDU f20, 2 * SIZE(BO) + + FMADD f4, f16, f21, f4 + LFDU f16, 4 * SIZE(AO) + FMADD f5, f17, f21, f5 + LFD f17, 1 * SIZE(AO) + FMADD f6, f18, f21, f6 + LFD f18, 2 * SIZE(AO) + FMADD f7, f19, f21, f7 + LFD f19, 3 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + bdnz .L46 + .align 4 + +.L48: +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) + + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 + FMADD f2, f2, f30, f18 + FMADD f3, f3, f30, f19 + + FMADD f4, f4, f30, f20 + FMADD f5, f5, f30, f21 + FMADD f6, f6, f30, f22 + FMADD f7, f7, f30, f23 +#else + FMUL f0, f0, f30 + FMUL f1, f1, f30 + FMUL f2, f2, f30 + FMUL f3, f3, f30 + + FMUL f4, f4, f30 + FMUL f5, f5, f30 + FMUL f6, f6, f30 + FMUL f7, f7, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -4 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 4 +#endif +#endif + + addic. I, I, -1 + bgt+ .L41 + .align 4 + +.L50: + andi. I, M, 2 + ble .L60 + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. 
r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble .L55 + .align 5 + +.L52: + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + LFDU f20, 8 * SIZE(BO) + FMADD f2, f16, f21, f2 + LFD f16, 4 * SIZE(AO) + FMADD f3, f17, f21, f3 + LFD f17, 5 * SIZE(AO) + + FMADD f4, f18, f22, f4 + LFD f21, 1 * SIZE(BO) + FMADD f5, f19, f22, f5 + LFD f22, 2 * SIZE(BO) + FMADD f6, f18, f23, f6 + LFD f18, 6 * SIZE(AO) + FMADD f7, f19, f23, f7 + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f24, f0 + LFD f23, 3 * SIZE(BO) + FMADD f1, f17, f24, f1 + LFD f24, 4 * SIZE(BO) + FMADD f2, f16, f25, f2 + LFDU f16, 8 * SIZE(AO) + FMADD f3, f17, f25, f3 + LFD f17, 1 * SIZE(AO) + + FMADD f4, f18, f26, f4 + LFD f25, 5 * SIZE(BO) + FMADD f5, f19, f26, f5 + LFD f26, 6 * SIZE(BO) + FMADD f6, f18, f27, f6 + LFD f18, 2 * SIZE(AO) + FMADD f7, f19, f27, f7 + LFD f19, 3 * SIZE(AO) + + LFD f27, 7 * SIZE(BO) + bdnz .L52 + .align 4 + +.L55: + lfd f30, ALPHA +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + andi. r0, K, 3 + mtspr CTR, r0 +#endif + ble+ .L58 + .align 4 + +.L56: + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + LFDU f20, 2 * SIZE(BO) + FMADD f2, f16, f21, f2 + LFDU f16, 2 * SIZE(AO) + FMADD f3, f17, f21, f3 + LFD f17, 1 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + bdnz .L56 + .align 4 + +.L58: +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 0 * SIZE(CO2) + LFD f19, 1 * SIZE(CO2) + + FADD f0, f4, f0 + FADD f1, f5, f1 + FADD f2, f6, f2 + FADD f3, f7, f3 + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 + FMADD f2, f2, f30, f18 + FMADD f3, f3, f30, f19 +#else + FADD f0, f4, f0 + FADD f1, f5, f1 + FADD f2, f6, f2 + FADD f3, f7, f3 + + FMUL f0, f0, f30 + FMUL f1, f1, f30 + FMUL f2, f2, f30 + FMUL f3, f3, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + .align 4 + +.L60: + andi. 
I, M, 1 + ble .L69 + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble .L65 + .align 5 + +.L62: + FMADD f0, f16, f20, f0 + LFDU f20, 8 * SIZE(BO) + FMADD f1, f16, f21, f1 + LFDU f16, 4 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + FMADD f2, f17, f22, f2 + LFD f22, 2 * SIZE(BO) + FMADD f3, f17, f23, f3 + LFD f17, 1 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + + FMADD f0, f18, f24, f0 + LFD f24, 4 * SIZE(BO) + FMADD f1, f18, f25, f1 + LFD f18, 2 * SIZE(AO) + LFD f25, 5 * SIZE(BO) + FMADD f2, f19, f26, f2 + LFD f26, 6 * SIZE(BO) + FMADD f3, f19, f27, f3 + LFD f19, 3 * SIZE(AO) + LFD f27, 7 * SIZE(BO) + bdnz .L62 + .align 4 + +.L65: + lfd f30, ALPHA + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + andi. r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ .L68 + .align 4 + +.L66: + FMADD f0, f16, f20, f0 + LFDU f20, 2 * SIZE(BO) + FMADD f1, f16, f21, f1 + LFDU f16, 1 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + bdnz .L66 + .align 4 + +.L68: +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f18, 0 * SIZE(CO2) + + FADD f0, f2, f0 + FADD f1, f3, f1 + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f18 +#else + FADD f0, f2, f0 + FADD f1, f3, f1 + + FMUL f0, f0, f30 + FMUL f1, f1, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 0 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -1 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 1 +#endif +#endif + .align 4 + +.L69: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi KK, KK, 2 +#endif + + mr B, BO + .align 4 + +.L70: + mr CO1, C + andi. J, N, 1 + ble .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + srawi. 
I, M, 2 + mr AO, A + ble .L80 + .align 4 + +.L71: +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 1 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B + ble .L75 + +#endif + ble .L75 + .align 5 + +.L72: + FMADD f0, f16, f20, f0 + LFD f16, 4 * SIZE(AO) + FMADD f1, f17, f20, f1 + LFD f17, 5 * SIZE(AO) + FMADD f2, f18, f20, f2 + LFD f18, 6 * SIZE(AO) + FMADD f3, f19, f20, f3 + LFD f19, 7 * SIZE(AO) + LFDU f20, 4 * SIZE(BO) + + FMADD f0, f16, f21, f0 + LFD f16, 8 * SIZE(AO) + FMADD f1, f17, f21, f1 + LFD f17, 9 * SIZE(AO) + FMADD f2, f18, f21, f2 + LFD f18, 10 * SIZE(AO) + FMADD f3, f19, f21, f3 + LFD f19, 11 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + + FMADD f0, f16, f22, f0 + LFD f16, 12 * SIZE(AO) + FMADD f1, f17, f22, f1 + LFD f17, 13 * SIZE(AO) + FMADD f2, f18, f22, f2 + LFD f18, 14 * SIZE(AO) + FMADD f3, f19, f22, f3 + LFD f19, 15 * SIZE(AO) + LFD f22, 2 * SIZE(BO) + + FMADD f0, f16, f23, f0 + LFDU f16, 16 * SIZE(AO) + FMADD f1, f17, f23, f1 + LFD f17, 1 * SIZE(AO) + FMADD f2, f18, f23, f2 + LFD f18, 2 * SIZE(AO) + FMADD f3, f19, f23, f3 + LFD f19, 3 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + bdnz .L72 + .align 4 + +.L75: + lfd f30, ALPHA +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + andi. 
r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ .L78 + .align 4 + +.L76: + FMADD f0, f16, f20, f0 + LFDU f16, 4 * SIZE(AO) + FMADD f1, f17, f20, f1 + LFD f17, 1 * SIZE(AO) + FMADD f2, f18, f20, f2 + LFD f18, 2 * SIZE(AO) + FMADD f3, f19, f20, f3 + LFDU f20, 1 * SIZE(BO) + LFD f19, 3 * SIZE(AO) + bdnz .L76 + .align 4 + +.L78: +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 + FMADD f2, f2, f30, f18 + FMADD f3, f3, f30, f19 +#else + FMUL f0, f0, f30 + FMUL f1, f1, f30 + FMUL f2, f2, f30 + FMUL f3, f3, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -4 +#else + addi TEMP, TEMP, -1 +#endif + slwi r0 , TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 4 +#endif +#endif + + addi CO1, CO1, 4 * SIZE + addic. I, I, -1 + bgt+ .L71 + .align 4 + +.L80: + andi. I, M, 2 + ble .L90 + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B + +#endif + ble .L85 + .align 5 + +.L82: + FMADD f0, f16, f20, f0 + LFD f16, 4 * SIZE(AO) + FMADD f1, f17, f20, f1 + LFDU f20, 4 * SIZE(BO) + LFD f17, 5 * SIZE(AO) + FMADD f2, f18, f21, f2 + LFD f18, 6 * SIZE(AO) + FMADD f3, f19, f21, f3 + LFD f21, 1 * SIZE(BO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + LFDU f16, 8 * SIZE(AO) + FMADD f1, f17, f22, f1 + LFD f22, 2 * SIZE(BO) + LFD f17, 1 * SIZE(AO) + FMADD f2, f18, f23, f2 + LFD f18, 2 * SIZE(AO) + FMADD f3, f19, f23, f3 + LFD f23, 3 * SIZE(BO) + LFD f19, 3 * SIZE(AO) + bdnz .L82 + .align 4 + +.L85: + lfd f30, ALPHA +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + + andi. 
r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ .L88 + .align 4 + +.L86: + FMADD f0, f16, f20, f0 + LFDU f16, 2 * SIZE(AO) + FMADD f1, f17, f20, f1 + LFDU f20, 1 * SIZE(BO) + LFD f17, 1 * SIZE(AO) + bdnz .L86 + .align 4 + +.L88: +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + + FADD f0, f2, f0 + FADD f1, f3, f1 + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 +#else + FADD f0, f2, f0 + FADD f1, f3, f1 + + FMUL f0, f0, f30 + FMUL f1, f1, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + addi CO1, CO1, 2 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -1 +#endif + slwi r0 , TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + .align 4 + +.L90: + andi. I, M, 1 + ble .L999 + + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + srawi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, K, 3 + mtspr CTR, r0 + mr BO, B +#endif + ble .L95 + .align 5 + +.L92: + FMADD f0, f16, f20, f0 + LFD f16, 4 * SIZE(AO) + LFD f20, 4 * SIZE(BO) + FMADD f1, f17, f21, f1 + LFD f17, 5 * SIZE(AO) + LFD f21, 5 * SIZE(BO) + FMADD f2, f18, f22, f2 + LFD f18, 6 * SIZE(AO) + LFD f22, 6 * SIZE(BO) + FMADD f3, f19, f23, f3 + LFD f19, 7 * SIZE(AO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + LFDU f16, 8 * SIZE(AO) + LFDU f20, 8 * SIZE(BO) + FMADD f1, f17, f21, f1 + LFD f17, 1 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + FMADD f2, f18, f22, f2 + LFD f18, 2 * SIZE(AO) + LFD f22, 2 * SIZE(BO) + FMADD f3, f19, f23, f3 + LFD f19, 3 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + bdnz .L92 + .align 4 + +.L95: + lfd f30, ALPHA + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 7 + mtspr CTR, TEMP + +#else + + andi. 
r0, K, 7 + mtspr CTR, r0 + +#endif + ble+ .L98 + .align 4 + +.L96: + FMADD f0, f16, f20, f0 + LFDU f16, 1 * SIZE(AO) + LFDU f20, 1 * SIZE(BO) + bdnz .L96 + .align 4 + +.L98: +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + + FADD f0, f1, f0 + FADD f2, f3, f2 + FADD f0, f2, f0 + + FMADD f0, f0, f30, f16 +#else + FADD f0, f1, f0 + FADD f2, f3, f2 + FADD f0, f2, f0 + + FMUL f0, f0, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + .align 4 + +.L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) +#if defined(TRMMKERNEL) || defined(TRSMKERNEL) + ld r19, 240(SP) + ld r18, 248(SP) +#endif +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) +#if defined(TRMMKERNEL) || defined(TRSMKERNEL) + lwz r19, 192(SP) + lwz r18, 196(SP) +#endif +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE diff --git a/kernel/power/gemm_kernel_hummer.S b/kernel/power/gemm_kernel_hummer.S new file mode 100644 index 0000000000..6b4e6b9a06 --- /dev/null +++ b/kernel/power/gemm_kernel_hummer.S @@ -0,0 +1,7006 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define ALPHA 0 +#define FZERO 8 + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#endif + +#define TEMP r11 +#define KK r14 +#define INCM1 r15 +#define INCM3 r16 +#define INCM5 r17 +#define INCM7 r18 +#define INC2 r19 +#define INC r20 +#define INC4 r21 + +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define AO2 r26 +#define BO2 r27 + +#define CO1 r28 +#define CO2 r29 +#define CO3 r30 +#define CO4 r31 + +#ifndef NEEDPARAM + +#define A1 f16 +#define A2 f17 +#define A3 f18 +#define A4 f19 +#define A5 f20 +#define A6 f21 +#define A7 f22 +#define A8 f23 +#define A9 f24 +#define A10 f25 + +#define B1 f26 +#define B2 f27 +#define B3 f28 +#define B4 f29 +#define B5 f30 +#define B6 f31 + +#define AP B6 + + + PROLOGUE + PROFCODE + + li r0, -16 + + stfpdux f14, SP, r0 + stfpdux f15, SP, r0 + stfpdux f16, SP, r0 + stfpdux f17, SP, r0 + stfpdux f18, SP, r0 + stfpdux f19, SP, r0 + stfpdux f20, SP, r0 + stfpdux f21, SP, r0 + stfpdux f22, SP, r0 + stfpdux f23, SP, r0 + stfpdux f24, SP, r0 + stfpdux f25, SP, r0 + stfpdux f26, SP, r0 + stfpdux f27, SP, r0 + stfpdux f28, SP, r0 + stfpdux f29, SP, r0 + stfpdux f30, SP, r0 + stfpdux f31, SP, r0 + + stwu r31, -4(SP) + stwu r30, -4(SP) + stwu r29, -4(SP) + stwu r28, -4(SP) + + stwu r27, -4(SP) + stwu r26, -4(SP) + stwu r25, -4(SP) + stwu r24, -4(SP) + + stwu r23, -4(SP) + stwu r22, -4(SP) + stwu r21, -4(SP) + stwu r20, -4(SP) + + stwu r19, -4(SP) + stwu r18, -4(SP) + stwu r17, -4(SP) + stwu r16, -4(SP) + + stwu r15, -4(SP) + stwu r14, -4(SP) # dummy + + li r0, 0 + + stwu r0, -4(SP) + stwu r0, -4(SP) + stfdu f1, -8(SP) + + slwi LDC, LDC, BASE_SHIFT + + cmpwi cr0, M, 0 + ble .L999 + cmpwi cr0, N, 0 + ble .L999 + cmpwi cr0, K, 0 + ble .L999 + + li INC, 1 * SIZE + li INC2, 2 * SIZE + li INC4, 4 * SIZE + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif + + andi. r0, C, 2 * SIZE - 1 + bne .L1000 + andi. r0, LDC, 2 * SIZE - 1 + bne .L1000 + +/* High performance version */ + + li INCM3, -2 * SIZE + li INCM5, -5 * SIZE + li INCM7, -6 * SIZE + + addi C, C, - 2 * SIZE + srawi. J, N, 2 + ble .L50 + .align 4 + +.L10: + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + add C, CO4, LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + addi AO, A, -4 * SIZE + + li r0, FZERO + lfpsx f0, SP, r0 + + srawi. I, M, 3 + ble .L20 + .align 4 + +.L11: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 +#else + slwi TEMP, KK, 3 + BASE_SHIFT + slwi r0, KK, 2 + BASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, BO, - 4 * SIZE + fpmr f8, f0 + addi BO2, BO, 2 * SIZE + fpmr f12, f0 +#endif + + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 8 +#else + addi TEMP, KK, 4 +#endif + srawi. 
TEMP, TEMP, 2 + fpmr f1, f0 + mtspr CTR, TEMP + ble .L14 + +#else + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 + + srawi. r0, K, 2 + fpmr f1, f0 + mtspr CTR, r0 + ble .L14 +#endif + + LFPDUX A1, AO, INC4 + fpmr f5, f0 + LFPDUX A3, AO, INC4 + fpmr f9, f0 + LFPDUX B1, BO, INC4 + fpmr f13, f0 + + LFPDUX A5, AO, INC4 + fpmr f2, f0 + LFPDUX A6, AO, INC4 + fpmr f6, f0 + LFPDUX B3, BO, INC4 + fpmr f10, f0 + LFPDUX A7, AO, INC4 + fpmr f14, f0 + + LFPDUX A8, AO, INC4 + fpmr f3, f0 + LFPDUX B5, BO, INC4 + fpmr f7, f0 + LFPDUX A9, AO, INC4 + fpmr f11, f0 + LFPDUX A2, AO2, INC4 + fpmr f15, f0 + LFPDUX B2, BO2, INC4 + bdz- .L13 + .align 4 + +.L12: + +## 1 ## + fxcpmadd f0, B1, A1, f0 + nop + fxcsmadd f4, B1, A1, f4 + nop + fxcpmadd f8, B2, A1, f8 + LFPDUX B4, BO2, INC4 + fxcsmadd f12, B2, A1, f12 + LFPDUX B6, BO, INC4 + + fxcpmadd f1, B1, A2, f1 + nop + fxcsmadd f5, B1, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B2, A2, f9 + LFPDUX A10, AO, INC4 + fxcsmadd f13, B2, A2, f13 + nop + + fxcpmadd f2, B1, A3, f2 + nop + fxcsmadd f6, B1, A3, f6 + nop + fxcpmadd f10, B2, A3, f10 + nop + fxcsmadd f14, B2, A3, f14 + nop + + fxcpmadd f3, B1, A4, f3 + nop + fxcsmadd f7, B1, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B2, A4, f11 + LFPDUX A1, AO, INC4 + fxcsmadd f15, B2, A4, f15 + nop + +## 2 ## + + fxcpmadd f0, B3, A5, f0 + nop + fxcsmadd f4, B3, A5, f4 + nop + fxcpmadd f8, B4, A5, f8 + LFPDUX B2, BO2, INC4 + fxcsmadd f12, B4, A5, f12 + LFPDUX B1, BO, INC4 + + fxcpmadd f1, B3, A2, f1 + nop + fxcsmadd f5, B3, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B4, A2, f9 + LFPDUX A3, AO, INC4 + fxcsmadd f13, B4, A2, f13 + nop + + fxcpmadd f2, B3, A6, f2 + nop + fxcsmadd f6, B3, A6, f6 + nop + fxcpmadd f10, B4, A6, f10 + nop + fxcsmadd f14, B4, A6, f14 + nop + + fxcpmadd f3, B3, A4, f3 + nop + fxcsmadd f7, B3, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B4, A4, f11 + LFPDUX A5, AO, INC4 + fxcsmadd f15, B4, A4, f15 + nop + +## 3 ## + + fxcpmadd f0, B5, A7, f0 + nop + fxcsmadd f4, B5, A7, f4 + nop + fxcpmadd f8, B2, A7, f8 + LFPDUX B4, BO2, INC4 + fxcsmadd f12, B2, A7, f12 + LFPDUX B3, BO, INC4 + + fxcpmadd f1, B5, A2, f1 + nop + fxcsmadd f5, B5, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B2, A2, f9 + LFPDUX A6, AO, INC4 + fxcsmadd f13, B2, A2, f13 + nop + + fxcpmadd f2, B5, A8, f2 + nop + fxcsmadd f6, B5, A8, f6 + nop + fxcpmadd f10, B2, A8, f10 + nop + fxcsmadd f14, B2, A8, f14 + nop + + fxcpmadd f3, B5, A4, f3 + nop + fxcsmadd f7, B5, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B2, A4, f11 + LFPDUX A7, AO, INC4 + fxcsmadd f15, B2, A4, f15 + nop + +## 4 ## + fxcpmadd f0, B6, A9, f0 + nop + fxcsmadd f4, B6, A9, f4 + nop + fxcpmadd f8, B4, A9, f8 + LFPDUX B2, BO2, INC4 + fxcsmadd f12, B4, A9, f12 + LFPDUX B5, BO, INC4 + + fxcpmadd f1, B6, A2, f1 + nop + fxcsmadd f5, B6, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B4, A2, f9 + LFPDUX A8, AO, INC4 + fxcsmadd f13, B4, A2, f13 + nop + + fxcpmadd f2, B6, A10, f2 + nop + fxcsmadd f6, B6, A10, f6 + nop + fxcpmadd f10, B4, A10, f10 + nop + fxcsmadd f14, B4, A10, f14 + nop + + fxcpmadd f3, B6, A4, f3 + LFPDUX A2, AO2, INC4 + fxcsmadd f7, B6, A4, f7 + LFPDUX A9, AO, INC4 + fxcpmadd f11, B4, A4, f11 + nop + fxcsmadd f15, B4, A4, f15 + bdnz+ .L12 + .align 4 + +.L13: +## 1 ## + + fxcpmadd f0, B1, A1, f0 + nop + fxcsmadd f4, B1, A1, f4 + nop + fxcpmadd f8, B2, A1, f8 + LFPDUX B4, BO2, INC4 + fxcsmadd f12, B2, A1, f12 + LFPDUX B6, BO, INC4 + + fxcpmadd f1, B1, A2, f1 + nop + fxcsmadd f5, B1, A2, f5 + 
LFPDUX A4, AO2, INC4 + fxcpmadd f9, B2, A2, f9 + LFPDUX A10, AO, INC4 + fxcsmadd f13, B2, A2, f13 + nop + + fxcpmadd f2, B1, A3, f2 + nop + fxcsmadd f6, B1, A3, f6 + nop + fxcpmadd f10, B2, A3, f10 + nop + fxcsmadd f14, B2, A3, f14 + nop + + fxcpmadd f3, B1, A4, f3 + nop + fxcsmadd f7, B1, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B2, A4, f11 +#ifndef TRMMKERNEL + LFPDUX A1, CO1, INC2 +#else + nop +#endif + fxcsmadd f15, B2, A4, f15 + nop + +## 2 ## + + fxcpmadd f0, B3, A5, f0 + nop + fxcsmadd f4, B3, A5, f4 + nop + fxcpmadd f8, B4, A5, f8 + LFPDUX B2, BO2, INC4 + fxcsmadd f12, B4, A5, f12 +#ifndef TRMMKERNEL + LFPDUX B1, CO1, INC4 +#else + nop +#endif + + fxcpmadd f1, B3, A2, f1 + nop + fxcsmadd f5, B3, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B4, A2, f9 +#ifndef TRMMKERNEL + LFPDUX A3, CO2, INC2 +#else + nop +#endif + fxcsmadd f13, B4, A2, f13 + nop + + fxcpmadd f2, B3, A6, f2 + nop + fxcsmadd f6, B3, A6, f6 + nop + fxcpmadd f10, B4, A6, f10 + nop + fxcsmadd f14, B4, A6, f14 + nop + + fxcpmadd f3, B3, A4, f3 + nop + fxcsmadd f7, B3, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B4, A4, f11 +#ifndef TRMMKERNEL + LFPDUX A5, CO2, INC4 +#else + nop +#endif + fxcsmadd f15, B4, A4, f15 + nop + +## 3 ## + + fxcpmadd f0, B5, A7, f0 + nop + fxcsmadd f4, B5, A7, f4 + nop + fxcpmadd f8, B2, A7, f8 + LFPDUX B4, BO2, INC4 + fxcsmadd f12, B2, A7, f12 +#ifndef TRMMKERNEL + LFPDUX B3, CO3, INC2 +#else + nop +#endif + + fxcpmadd f1, B5, A2, f1 + nop + fxcsmadd f5, B5, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B2, A2, f9 +#ifndef TRMMKERNEL + LFPDUX A6, CO3, INC4 +#else + nop +#endif + fxcsmadd f13, B2, A2, f13 + nop + + fxcpmadd f2, B5, A8, f2 + nop + fxcsmadd f6, B5, A8, f6 + nop + fxcpmadd f10, B2, A8, f10 + nop + fxcsmadd f14, B2, A8, f14 + nop + + fxcpmadd f3, B5, A4, f3 + nop + fxcsmadd f7, B5, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B2, A4, f11 +#ifndef TRMMKERNEL + LFPDUX A7, CO4, INC2 +#else + nop +#endif + fxcsmadd f15, B2, A4, f15 + nop + +## 4 ## + + fxcpmadd f0, B6, A9, f0 + nop + fxcsmadd f4, B6, A9, f4 + nop + fxcpmadd f8, B4, A9, f8 + nop + fxcsmadd f12, B4, A9, f12 +#ifndef TRMMKERNEL + LFPDUX B2, CO4, INC4 +#else + nop +#endif + + fxcpmadd f1, B6, A2, f1 + nop + fxcsmadd f5, B6, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B4, A2, f9 +#ifndef TRMMKERNEL + LFPDUX B5, CO1, INCM3 +#else + nop +#endif + fxcsmadd f13, B4, A2, f13 + nop + + fxcpmadd f2, B6, A10, f2 + nop + fxcsmadd f6, B6, A10, f6 + nop + fxcpmadd f10, B4, A10, f10 + nop + fxcsmadd f14, B4, A10, f14 +#ifndef TRMMKERNEL + LFPDUX A8, CO1, INC4 +#else + nop +#endif + + fxcpmadd f3, B6, A4, f3 + nop + fxcsmadd f7, B6, A4, f7 + nop + fxcpmadd f11, B4, A4, f11 + nop + fxcsmadd f15, B4, A4, f15 +#ifndef TRMMKERNEL + LFPDUX A9, CO2, INCM3 +#else + nop +#endif + .align 4 + +.L14: + lfd AP, ALPHA(SP) +#ifdef TRMMKERNEL + fsmfp AP, AP +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 8 +#else + addi TEMP, KK, 4 +#endif + andi. r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L18 + + cmpwi cr0, TEMP, 3 + bgt+ .L15 +#else + andi. 
r0, K, 3 + mtspr CTR, r0 + ble+ .L18 + + cmpwi cr0, K, 3 + bgt+ .L15 +#endif + +#ifndef TRMMKERNEL + LFPDUX A1, CO1, INC2 + fpmr f5, f0 + LFPDUX B1, CO1, INC4 + fpmr f9, f0 + LFPDUX A3, CO2, INC2 + fpmr f13, f0 + LFPDUX A5, CO2, INC4 + fpmr f2, f0 + + LFPDUX B3, CO3, INC2 + fpmr f6, f0 + LFPDUX A6, CO3, INC4 + fpmr f10, f0 + LFPDUX A7, CO4, INC2 + fpmr f14, f0 + LFPDUX B2, CO4, INC4 + fpmr f3, f0 + + LFPDUX B5, CO1, INCM3 + fpmr f7, f0 + LFPDUX A8, CO1, INC4 + fpmr f11, f0 + LFPDUX A9, CO2, INCM3 + fpmr f15, f0 +#else + fpmr f5, f0 + fpmr f9, f0 + fpmr f13, f0 + fpmr f2, f0 + + fpmr f6, f0 + fpmr f10, f0 + fpmr f14, f0 + fpmr f3, f0 + + fpmr f7, f0 + fpmr f11, f0 + fpmr f15, f0 + nop +#endif + .align 4 + +.L15: + LFPDUX A2, AO, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A10, BO, INC4 + LFPDUX B4, BO2, INC4 + bdz- .L17 + .align 4 + +.L16: + fxcpmadd f0, A10, A2, f0 + fxcsmadd f4, A10, A2, f4 + fxcpmadd f8, B4, A2, f8 + fxcsmadd f12, B4, A2, f12 + LFPDUX A2, AO, INC4 + + fxcpmadd f1, A10, A4, f1 + fxcsmadd f5, A10, A4, f5 + fxcpmadd f9, B4, A4, f9 + fxcsmadd f13, B4, A4, f13 + LFPDUX A4, AO2, INC4 + + fxcpmadd f2, A10, A2, f2 + fxcsmadd f6, A10, A2, f6 + fxcpmadd f10, B4, A2, f10 + fxcsmadd f14, B4, A2, f14 + LFPDUX A2, AO, INC4 + + fxcpmadd f3, A10, A4, f3 + fxcsmadd f7, A10, A4, f7 + LFPDUX A10, BO, INC4 + fxcpmadd f11, B4, A4, f11 + fxcsmadd f15, B4, A4, f15 + LFPDUX A4, AO2, INC4 + LFPDUX B4, BO2, INC4 + bdnz+ .L16 + .align 4 + +.L17: + fxcpmadd f0, A10, A2, f0 + fxcsmadd f4, A10, A2, f4 + fxcpmadd f8, B4, A2, f8 + fxcsmadd f12, B4, A2, f12 + LFPDUX A2, AO, INC4 + + fxcpmadd f1, A10, A4, f1 + fxcsmadd f5, A10, A4, f5 + fxcpmadd f9, B4, A4, f9 + fxcsmadd f13, B4, A4, f13 + LFPDUX A4, AO2, INC4 + + fxcpmadd f2, A10, A2, f2 + fxcsmadd f6, A10, A2, f6 + fxcpmadd f10, B4, A2, f10 + fxcsmadd f14, B4, A2, f14 + + fxcpmadd f3, A10, A4, f3 + fxcsmadd f7, A10, A4, f7 + fxcpmadd f11, B4, A4, f11 + fxcsmadd f15, B4, A4, f15 + .align 4 + +.L18: +#ifndef TRMMKERNEL + fxcpmadd f0, AP, f0, A1 + LFPDUX B4, CO2, INC4 + fxcpmadd f1, AP, f1, B5 + LFPDUX A2, CO3, INCM3 + + fxcpmadd f2, AP, f2, B1 + LFPDUX A4, CO3, INC4 + fxcpmadd f3, AP, f3, A8 + LFPDUX A10, CO4, INCM3 + + fxcpmadd f4, AP, f4, A3 + LFPDUX A1, CO4, INC4 + fxcpmadd f5, AP, f5, A9 + STFPDUX f0, CO1, INCM7 + + fxcpmadd f6, AP, f6, A5 + STFPDUX f1, CO1, INC2 + fxcpmadd f7, AP, f7, B4 + STFPDUX f2, CO1, INC2 + + fxcpmadd f8, AP, f8, B3 + STFPDUX f3, CO1, INC2 + fxcpmadd f9, AP, f9, A2 + STFPDUX f4, CO2, INCM7 + + fxcpmadd f10, AP, f10, A6 + STFPDUX f5, CO2, INC2 + fxcpmadd f11, AP, f11, A4 + STFPDUX f6, CO2, INC2 + + fxcpmadd f12, AP, f12, A7 + STFPDUX f7, CO2, INC2 + fxcpmadd f13, AP, f13, A10 + STFPDUX f8, CO3, INCM7 + + fxcpmadd f14, AP, f14, B2 + STFPDUX f9, CO3, INC2 + fxcpmadd f15, AP, f15, A1 + STFPDUX f10, CO3, INC2 + + STFPDUX f11, CO3, INC2 + STFPDUX f12, CO4, INCM7 + STFPDUX f13, CO4, INC2 + STFPDUX f14, CO4, INC2 + STFPDUX f15, CO4, INC2 +#else + fpmul f0, AP, f0 + fpmul f1, AP, f1 + fpmul f2, AP, f2 + fpmul f3, AP, f3 + + fpmul f4, AP, f4 + fpmul f5, AP, f5 + STFPDUX f0, CO1, INC2 + + fpmul f6, AP, f6 + STFPDUX f1, CO1, INC2 + fpmul f7, AP, f7 + STFPDUX f2, CO1, INC2 + + fpmul f8, AP, f8 + STFPDUX f3, CO1, INC2 + fpmul f9, AP, f9 + STFPDUX f4, CO2, INC2 + + fpmul f10, AP, f10 + STFPDUX f5, CO2, INC2 + fpmul f11, AP, f11 + STFPDUX f6, CO2, INC2 + + fpmul f12, AP, f12 + STFPDUX f7, CO2, INC2 + fpmul f13, AP, f13 + STFPDUX f8, CO3, INC2 + + fpmul f14, AP, f14 + STFPDUX f9, CO3, INC2 + fpmul f15, AP, f15 + STFPDUX f10, CO3, INC2 + + STFPDUX 
f11, CO3, INC2 + STFPDUX f12, CO4, INC2 + STFPDUX f13, CO4, INC2 + STFPDUX f14, CO4, INC2 + STFPDUX f15, CO4, INC2 +#endif + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -8 +#else + addi TEMP, TEMP, -4 +#endif + slwi r0, TEMP, 3 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 8 +#endif +#endif + + addic. I, I, -1 + li r0, FZERO + + lfpsx f0, SP, r0 + bgt+ .L11 + .align 4 + +.L20: + andi. I, M, 4 + beq .L30 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 +#else + slwi TEMP, KK, 2 + BASE_SHIFT + slwi r0, KK, 2 + BASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, BO, - 4 * SIZE + fpmr f8, f0 + addi BO2, BO, 2 * SIZE + fpmr f12, f0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 4 +#endif + + srawi. TEMP, TEMP, 2 + fpmr f1, f0 + fpmr f5, f0 + fpmr f9, f0 + mtspr CTR, TEMP + fpmr f13, f0 + ble .L24 +#else + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 + + srawi. r0, K, 2 + fpmr f1, f0 + fpmr f5, f0 + fpmr f9, f0 + mtspr CTR, r0 + fpmr f13, f0 + ble .L24 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B2, BO2, INC4 + LFPDUX A3, AO, INC4 + LFPDUX B3, BO, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX B4, BO2, INC4 + + LFPDUX A5, AO, INC4 + LFPDUX B5, BO, INC4 + LFPDUX A6, AO2, INC4 + LFPDUX B6, BO2, INC4 + LFPDUX A7, AO, INC4 + LFPDUX A9, BO, INC4 + LFPDUX A10, BO2, INC4 + bdz- .L23 + .align 4 + +.L22: + fxcpmadd f0, B1, A1, f0 + nop + fxcsmadd f4, B1, A1, f4 + LFPDUX A8, AO2, INC4 + fxcpmadd f8, B2, A1, f8 + nop + fxcsmadd f12, B2, A1, f12 + LFPDUX A1, AO, INC4 + + fxcpmadd f1, B1, A2, f1 + nop + fxcsmadd f5, B1, A2, f5 + LFPDUX B1, BO, INC4 + fxcpmadd f9, B2, A2, f9 + nop + fxcsmadd f13, B2, A2, f13 + LFPDUX B2, BO2, INC4 + + fxcpmadd f0, B3, A3, f0 + nop + fxcsmadd f4, B3, A3, f4 + LFPDUX A2, AO2, INC4 + fxcpmadd f8, B4, A3, f8 + nop + fxcsmadd f12, B4, A3, f12 + LFPDUX A3, AO, INC4 + + fxcpmadd f1, B3, A4, f1 + nop + fxcsmadd f5, B3, A4, f5 + LFPDUX B3, BO, INC4 + fxcpmadd f9, B4, A4, f9 + nop + fxcsmadd f13, B4, A4, f13 + LFPDUX B4, BO2, INC4 + + fxcpmadd f0, B5, A5, f0 + nop + fxcsmadd f4, B5, A5, f4 + LFPDUX A4, AO2, INC4 + fxcpmadd f8, B6, A5, f8 + nop + fxcsmadd f12, B6, A5, f12 + LFPDUX A5, AO, INC4 + + fxcpmadd f1, B5, A6, f1 + nop + fxcsmadd f5, B5, A6, f5 + LFPDUX B5, BO, INC4 + fxcpmadd f9, B6, A6, f9 + nop + fxcsmadd f13, B6, A6, f13 + LFPDUX B6, BO2, INC4 + + fxcpmadd f0, A9, A7, f0 + nop + fxcsmadd f4, A9, A7, f4 + LFPDUX A6, AO2, INC4 + fxcpmadd f8, A10, A7, f8 + nop + fxcsmadd f12, A10, A7, f12 + LFPDUX A7, AO, INC4 + + fxcpmadd f1, A9, A8, f1 + nop + fxcsmadd f5, A9, A8, f5 + LFPDUX A9, BO, INC4 + fxcpmadd f9, A10, A8, f9 + nop + fxcsmadd f13, A10, A8, f13 + LFPDUX A10, BO2, INC4 + bdnz+ .L22 + .align 4 + +.L23: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + LFPDUX A8, AO2, INC4 + fxcpmadd f8, B2, A1, f8 + fxcsmadd f12, B2, A1, f12 + + fxcpmadd f1, B1, A2, f1 + fxcsmadd f5, B1, A2, f5 + fxcpmadd f9, B2, A2, f9 + fxcsmadd f13, B2, A2, f13 + + fxcpmadd 
f0, B3, A3, f0 + fxcsmadd f4, B3, A3, f4 + fxcpmadd f8, B4, A3, f8 + fxcsmadd f12, B4, A3, f12 + + fxcpmadd f1, B3, A4, f1 + fxcsmadd f5, B3, A4, f5 + fxcpmadd f9, B4, A4, f9 + fxcsmadd f13, B4, A4, f13 + + fxcpmadd f0, B5, A5, f0 + fxcsmadd f4, B5, A5, f4 + fxcpmadd f8, B6, A5, f8 + fxcsmadd f12, B6, A5, f12 + + fxcpmadd f1, B5, A6, f1 + fxcsmadd f5, B5, A6, f5 + fxcpmadd f9, B6, A6, f9 + fxcsmadd f13, B6, A6, f13 + + fxcpmadd f0, A9, A7, f0 + fxcsmadd f4, A9, A7, f4 + fxcpmadd f8, A10, A7, f8 + fxcsmadd f12, A10, A7, f12 + + fxcpmadd f1, A9, A8, f1 + fxcsmadd f5, A9, A8, f5 + fxcpmadd f9, A10, A8, f9 + fxcsmadd f13, A10, A8, f13 + .align 4 + +.L24: + lfd AP, ALPHA(SP) +#ifdef TRMMKERNEL + fsmfp AP, AP +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 4 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP +#else + andi. r0, K, 3 + mtspr CTR, r0 +#endif + ble+ .L28 + + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + bdz- .L27 + .align 4 + +.L26: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + fxcpmadd f8, B2, A1, f8 + fxcsmadd f12, B2, A1, f12 + LFPDUX A1, AO, INC4 + + fxcpmadd f1, B1, A2, f1 + fxcsmadd f5, B1, A2, f5 + LFPDUX B1, BO, INC4 + fxcpmadd f9, B2, A2, f9 + fxcsmadd f13, B2, A2, f13 + LFPDUX A2, AO2, INC4 + LFPDUX B2, BO2, INC4 + bdnz+ .L26 + .align 4 + +.L27: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + fxcpmadd f8, B2, A1, f8 + fxcsmadd f12, B2, A1, f12 + + fxcpmadd f1, B1, A2, f1 + fxcsmadd f5, B1, A2, f5 + fxcpmadd f9, B2, A2, f9 + fxcsmadd f13, B2, A2, f13 + .align 4 + +.L28: +#ifndef TRMMKERNEL + LFPDUX A1, CO1, INC2 + LFPDUX B1, CO1, INC2 + LFPDUX B3, CO2, INC2 + LFPDUX A6, CO2, INC2 + + LFPDUX B5, CO3, INC2 + LFPDUX A8, CO3, INC2 + LFPDUX A2, CO4, INC2 + LFPDUX A4, CO4, INC2 + + fxcpmadd f0, AP, f0, A1 + fxcpmadd f1, AP, f1, B1 + fxcpmadd f4, AP, f4, B3 + fxcpmadd f5, AP, f5, A6 + + fxcpmadd f8, AP, f8, B5 + fxcpmadd f9, AP, f9, A8 + STFPDUX f0, CO1, INCM3 + fxcpmadd f12, AP, f12, A2 + STFPDUX f1, CO1, INC2 + fxcpmadd f13, AP, f13, A4 + STFPDUX f4, CO2, INCM3 + + STFPDUX f5, CO2, INC2 + STFPDUX f8, CO3, INCM3 + STFPDUX f9, CO3, INC2 + STFPDUX f12, CO4, INCM3 + STFPDUX f13, CO4, INC2 +#else + fpmul f0, AP, f0 + fpmul f1, AP, f1 + fpmul f4, AP, f4 + fpmul f5, AP, f5 + + fpmul f8, AP, f8 + fpmul f9, AP, f9 + STFPDUX f0, CO1, INC2 + fpmul f12, AP, f12 + STFPDUX f1, CO1, INC2 + fpmul f13, AP, f13 + STFPDUX f4, CO2, INC2 + + STFPDUX f5, CO2, INC2 + STFPDUX f8, CO3, INC2 + STFPDUX f9, CO3, INC2 + STFPDUX f12, CO4, INC2 + STFPDUX f13, CO4, INC2 +#endif + + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -4 +#else + addi TEMP, TEMP, -4 +#endif + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 4 +#endif +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L30: + andi. 
I, M, 2 + beq .L40 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, B, - 4 * SIZE + fpmr f2, f0 + addi BO2, B, - 2 * SIZE + fpmr f3, f0 +#else + slwi TEMP, KK, 1 + BASE_SHIFT + slwi r0, KK, 2 + BASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, BO, - 4 * SIZE + fpmr f2, f0 + addi BO2, BO, 2 * SIZE + fpmr f3, f0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 4 +#endif + + srawi. r0, TEMP, 2 + mtspr CTR, r0 + ble .L34 + +#else + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, B, - 4 * SIZE + fpmr f2, f0 + addi BO2, B, - 2 * SIZE + fpmr f3, f0 + + srawi. r0, K, 2 + mtspr CTR, r0 + ble .L34 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B3, BO, INC4 + LFPDUX B4, BO2, INC4 + + LFPDUX A3, AO, INC4 + LFPDUX A5, BO, INC4 + LFPDUX A6, BO2, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A7, BO, INC4 + LFPDUX A8, BO2, INC4 + bdz- .L33 + .align 4 + +.L32: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + LFPDUX B1, BO, INC4 + fxcpmadd f2, B2, A1, f2 + fxcsmadd f3, B2, A1, f3 + LFPDUX B2, BO2, INC4 + LFPDUX A1, AO, INC4 + + fxcpmadd f0, B3, A2, f0 + fxcsmadd f1, B3, A2, f1 + LFPDUX B3, BO, INC4 + fxcpmadd f2, B4, A2, f2 + fxcsmadd f3, B4, A2, f3 + LFPDUX B4, BO2, INC4 + LFPDUX A2, AO2, INC4 + + fxcpmadd f0, A5, A3, f0 + fxcsmadd f1, A5, A3, f1 + LFPDUX A5, BO, INC4 + fxcpmadd f2, A6, A3, f2 + fxcsmadd f3, A6, A3, f3 + LFPDUX A6, BO2, INC4 + LFPDUX A3, AO, INC4 + + fxcpmadd f0, A7, A4, f0 + fxcsmadd f1, A7, A4, f1 + LFPDUX A7, BO, INC4 + fxcpmadd f2, A8, A4, f2 + fxcsmadd f3, A8, A4, f3 + LFPDUX A8, BO2, INC4 + LFPDUX A4, AO2, INC4 + bdnz+ .L32 + .align 4 + +.L33: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + fxcpmadd f2, B2, A1, f2 + fxcsmadd f3, B2, A1, f3 + + fxcpmadd f0, B3, A2, f0 + fxcsmadd f1, B3, A2, f1 + fxcpmadd f2, B4, A2, f2 + fxcsmadd f3, B4, A2, f3 + + fxcpmadd f0, A5, A3, f0 + fxcsmadd f1, A5, A3, f1 + fxcpmadd f2, A6, A3, f2 + fxcsmadd f3, A6, A3, f3 + + fxcpmadd f0, A7, A4, f0 + fxcsmadd f1, A7, A4, f1 + fxcpmadd f2, A8, A4, f2 + fxcsmadd f3, A8, A4, f3 + .align 4 + +.L34: + lfd AP, ALPHA(SP) +#ifdef TRMMKERNEL + fsmfp AP, AP +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 4 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP +#else + andi. 
r0, K, 3 + mtspr CTR, r0 +#endif + ble+ .L38 + + LFPDX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC2 + bdz- .L37 + .align 4 + +.L36: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + LFPDUX B1, BO, INC4 + fxcpmadd f2, B2, A1, f2 + fxcsmadd f3, B2, A1, f3 + LFPDX A1, AO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC2 + bdnz+ .L36 + .align 4 + +.L37: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + fxcpmadd f2, B2, A1, f2 + fxcsmadd f3, B2, A1, f3 + .align 4 + +.L38: +#ifndef TRMMKERNEL + LFPDX A1, CO1, INC2 + LFPDX A2, CO2, INC2 + LFPDX A3, CO3, INC2 + LFPDX A4, CO4, INC2 + + fxcpmadd f0, AP, f0, A1 + fxcpmadd f1, AP, f1, A2 + fxcpmadd f2, AP, f2, A3 + fxcpmadd f3, AP, f3, A4 +#else + fpmul f0, AP, f0 + fpmul f1, AP, f1 + fpmul f2, AP, f2 + fpmul f3, AP, f3 +#endif + + STFPDUX f0, CO1, INC2 + STFPDUX f1, CO2, INC2 + STFPDUX f2, CO3, INC2 + STFPDUX f3, CO4, INC2 + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -4 +#endif + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L40: + andi. I, M, 1 + beq .L49 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, B, - 4 * SIZE + fpmr f2, f0 + addi BO2, B, - 2 * SIZE + fpmr f3, f0 +#else + slwi TEMP, KK, 0 + BASE_SHIFT + slwi r0, KK, 2 + BASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, BO, - 4 * SIZE + fpmr f2, f0 + addi BO2, BO, 2 * SIZE + fpmr f3, f0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 4 +#endif + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L44 + +#else + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, B, - 4 * SIZE + fpmr f2, f0 + addi BO2, B, - 2 * SIZE + fpmr f3, f0 + + srawi. 
r0, K, 3 + mtspr CTR, r0 + ble .L44 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B3, BO, INC4 + LFPDUX B4, BO2, INC4 + + LFPDUX A3, AO, INC4 + LFPDUX A5, BO, INC4 + LFPDUX A6, BO2, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A7, BO, INC4 + LFPDUX A8, BO2, INC4 + bdz- .L43 + .align 4 + +.L42: + fxcpmadd f0, A1, B1, f0 + LFPDUX B1, BO, INC4 + fxcpmadd f1, A1, B2, f1 + LFPDUX B2, BO2, INC4 + fxcsmadd f2, A1, B3, f2 + LFPDUX B3, BO, INC4 + fxcsmadd f3, A1, B4, f3 + LFPDUX B4, BO2, INC4 + LFPDUX A1, AO, INC4 + + fxcpmadd f0, A2, A5, f0 + LFPDUX A5, BO, INC4 + fxcpmadd f1, A2, A6, f1 + LFPDUX A6, BO2, INC4 + fxcsmadd f2, A2, A7, f2 + LFPDUX A7, BO, INC4 + fxcsmadd f3, A2, A8, f3 + LFPDUX A8, BO2, INC4 + LFPDUX A2, AO2, INC4 + + fxcpmadd f0, A3, B1, f0 + LFPDUX B1, BO, INC4 + fxcpmadd f1, A3, B2, f1 + LFPDUX B2, BO2, INC4 + fxcsmadd f2, A3, B3, f2 + LFPDUX B3, BO, INC4 + fxcsmadd f3, A3, B4, f3 + LFPDUX B4, BO2, INC4 + LFPDUX A3, AO, INC4 + + fxcpmadd f0, A4, A5, f0 + LFPDUX A5, BO, INC4 + fxcpmadd f1, A4, A6, f1 + LFPDUX A6, BO2, INC4 + fxcsmadd f2, A4, A7, f2 + LFPDUX A7, BO, INC4 + fxcsmadd f3, A4, A8, f3 + LFPDUX A8, BO2, INC4 + LFPDUX A4, AO2, INC4 + bdnz+ .L42 + .align 4 + +.L43: + fxcpmadd f0, A1, B1, f0 + LFPDUX B1, BO, INC4 + fxcpmadd f1, A1, B2, f1 + LFPDUX B2, BO2, INC4 + fxcsmadd f2, A1, B3, f2 + LFPDUX B3, BO, INC4 + fxcsmadd f3, A1, B4, f3 + LFPDUX B4, BO2, INC4 + + fxcpmadd f0, A2, A5, f0 + LFPDUX A5, BO, INC4 + fxcpmadd f1, A2, A6, f1 + LFPDUX A6, BO2, INC4 + fxcsmadd f2, A2, A7, f2 + LFPDUX A7, BO, INC4 + fxcsmadd f3, A2, A8, f3 + LFPDUX A8, BO2, INC4 + + fxcpmadd f0, A3, B1, f0 + fxcpmadd f1, A3, B2, f1 + fxcsmadd f2, A3, B3, f2 + fxcsmadd f3, A3, B4, f3 + + fxcpmadd f0, A4, A5, f0 + fxcpmadd f1, A4, A6, f1 + fxcsmadd f2, A4, A7, f2 + fxcsmadd f3, A4, A8, f3 + .align 4 + +.L44: + lfd AP, ALPHA(SP) +#ifdef TRMMKERNEL + fsmfp AP, AP +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 4 +#endif + andi. TEMP, TEMP, 7 + mtspr CTR, TEMP +#else + andi. r0, K, 7 + mtspr CTR, r0 +#endif + ble+ .L48 + + LFDX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC + bdz- .L47 + .align 4 + +.L46: + fxcpmadd f0, A1, B1, f0 + LFPDUX B1, BO, INC4 + fxcpmadd f1, A1, B2, f1 + LFDX A1, AO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC + bdnz+ .L46 + .align 4 + +.L47: + fxcpmadd f0, A1, B1, f0 + fxcpmadd f1, A1, B2, f1 + .align 4 + +.L48: +#ifndef TRMMKERNEL + LFDX A1, CO1, INC2 + LFDX A2, CO2, INC2 + LFDX A3, CO3, INC2 + LFDX A4, CO4, INC2 + + fpadd f0, f0, f2 + fpadd f1, f1, f3 + + fsmfp A1, A2 + fsmfp A3, A4 + + fxcpmadd f0, AP, f0, A1 + fxcpmadd f1, AP, f1, A3 +#else + fpadd f0, f0, f2 + fpadd f1, f1, f3 + + fpmul f0, AP, f0 + fpmul f1, AP, f1 +#endif + + STFDX f0, CO1, INC2 + STFSDX f0, CO2, INC2 + STFDX f1, CO3, INC2 + STFSDX f1, CO4, INC2 + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -1 +#else + addi TEMP, TEMP, -4 +#endif + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 1 +#endif +#endif + .align 4 + +.L49: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi KK, KK, 4 +#endif + + addi B, BO, 4 * SIZE + + addic. J, J, -1 + bgt+ .L10 + .align 4 + +.L50: + andi. 
J, N, 2 + beq .L90 + + mr CO1, C + add CO2, C, LDC + add C, CO2, LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + addi AO, A, -2 * SIZE + + li r0, FZERO + lfpsx f0, SP, r0 + + srawi. I, M, 3 + ble .L60 + .align 4 + +.L51: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + fpmr f4, f0 + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f5, f0 + fpmr f2, f0 + fpmr f6, f0 +#else + slwi TEMP, KK, 3 + BASE_SHIFT + slwi r0, KK, 1 + BASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + fpmr f4, f0 + addi BO, BO, - 2 * SIZE + fpmr f1, f0 + fpmr f5, f0 + fpmr f2, f0 + fpmr f6, f0 +#endif + + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 8 +#else + addi TEMP, KK, 2 +#endif + srawi. r0, TEMP, 2 + fpmr f3, f0 + mtspr CTR, r0 + fpmr f7, f0 + ble .L54 +#else + fpmr f4, f0 + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f5, f0 + fpmr f2, f0 + fpmr f6, f0 + + srawi. r0, K, 2 + fpmr f3, f0 + mtspr CTR, r0 + fpmr f7, f0 + ble .L54 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX B3, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + bdz- .L53 + .align 4 + +.L52: + fxcpmadd f0, B1, A1, f0 + LFPDUX B4, BO, INC2 + fxcsmadd f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + nop + fxcsmadd f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + fxcpmadd f2, B1, A3, f2 + nop + fxcsmadd f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + nop + fxcsmadd f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + + fxcpmadd f0, B2, A5, f0 + LFPDUX B1, BO, INC2 + fxcsmadd f4, B2, A5, f4 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B2, A6, f1 + nop + fxcsmadd f5, B2, A6, f5 + LFPDUX A6, AO, INC2 + + fxcpmadd f2, B2, A7, f2 + nop + fxcsmadd f6, B2, A7, f6 + LFPDUX A7, AO, INC2 + fxcpmadd f3, B2, A8, f3 + nop + fxcsmadd f7, B2, A8, f7 + LFPDUX A8, AO, INC2 + + fxcpmadd f0, B3, A1, f0 + LFPDUX B2, BO, INC2 + fxcsmadd f4, B3, A1, f4 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B3, A2, f1 + nop + fxcsmadd f5, B3, A2, f5 + LFPDUX A2, AO, INC2 + + fxcpmadd f2, B3, A3, f2 + nop + fxcsmadd f6, B3, A3, f6 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B3, A4, f3 + nop + fxcsmadd f7, B3, A4, f7 + LFPDUX A4, AO, INC2 + + fxcpmadd f0, B4, A5, f0 + LFPDUX B3, BO, INC2 + fxcsmadd f4, B4, A5, f4 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B4, A6, f1 + nop + fxcsmadd f5, B4, A6, f5 + LFPDUX A6, AO, INC2 + + fxcpmadd f2, B4, A7, f2 + nop + fxcsmadd f6, B4, A7, f6 + LFPDUX A7, AO, INC2 + fxcpmadd f3, B4, A8, f3 + nop + fxcsmadd f7, B4, A8, f7 + LFPDUX A8, AO, INC2 + bdnz+ .L52 + .align 4 + +.L53: + fxcpmadd f0, B1, A1, f0 + LFPDUX B4, BO, INC2 + fxcsmadd f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + nop + fxcsmadd f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + fxcpmadd f2, B1, A3, f2 + nop + fxcsmadd f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + nop + fxcsmadd f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + + fxcpmadd f0, B2, A5, f0 + nop + fxcsmadd f4, B2, A5, f4 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B2, A6, f1 + nop + fxcsmadd f5, B2, A6, f5 + LFPDUX A6, AO, INC2 + + fxcpmadd f2, B2, A7, f2 + nop + fxcsmadd f6, B2, A7, f6 + LFPDUX A7, AO, INC2 + fxcpmadd f3, B2, A8, f3 + nop + fxcsmadd f7, B2, A8, f7 + LFPDUX A8, AO, INC2 + + fxcpmadd f0, B3, A1, f0 + fxcsmadd f4, B3, A1, f4 + fxcpmadd f1, B3, A2, f1 + fxcsmadd f5, B3, A2, f5 + + fxcpmadd f2, B3, 
A3, f2 + fxcsmadd f6, B3, A3, f6 + fxcpmadd f3, B3, A4, f3 + fxcsmadd f7, B3, A4, f7 + + fxcpmadd f0, B4, A5, f0 + fxcsmadd f4, B4, A5, f4 + fxcpmadd f1, B4, A6, f1 + fxcsmadd f5, B4, A6, f5 + + fxcpmadd f2, B4, A7, f2 + fxcsmadd f6, B4, A7, f6 + fxcpmadd f3, B4, A8, f3 + fxcsmadd f7, B4, A8, f7 + .align 4 + +.L54: + lfd AP, ALPHA(SP) +#ifdef TRMMKERNEL + fsmfp AP, AP +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 8 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP +#else + andi. r0, K, 3 + mtspr CTR, r0 +#endif + ble+ .L58 + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + bdz- .L57 + .align 4 + +.L56: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + fxcpmadd f2, B1, A3, f2 + fxcsmadd f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + fxcsmadd f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + LFPDUX B1, BO, INC2 + bdnz+ .L56 + .align 4 + +.L57: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f5, B1, A2, f5 + + fxcpmadd f2, B1, A3, f2 + fxcsmadd f6, B1, A3, f6 + fxcpmadd f3, B1, A4, f3 + fxcsmadd f7, B1, A4, f7 + .align 4 + +.L58: +#ifndef TRMMKERNEL + LFPDUX A1, CO1, INC2 + LFPDUX B1, CO1, INC2 + LFPDUX A3, CO1, INC2 + LFPDUX A5, CO1, INC2 + + LFPDUX B3, CO2, INC2 + LFPDUX A6, CO2, INC2 + LFPDUX A7, CO2, INC2 + LFPDUX B2, CO2, INC2 + + fxcpmadd f0, AP, f0, A1 + fxcpmadd f1, AP, f1, B1 + fxcpmadd f2, AP, f2, A3 + fxcpmadd f3, AP, f3, A5 + + fxcpmadd f4, AP, f4, B3 + fxcpmadd f5, AP, f5, A6 + STFPDUX f0, CO1, INCM7 + fxcpmadd f6, AP, f6, A7 + STFPDUX f1, CO1, INC2 + fxcpmadd f7, AP, f7, B2 + STFPDUX f2, CO1, INC2 + STFPDUX f3, CO1, INC2 + STFPDUX f4, CO2, INCM7 + + STFPDUX f5, CO2, INC2 + STFPDUX f6, CO2, INC2 + STFPDUX f7, CO2, INC2 +#else + fpmul f0, AP, f0 + fpmul f1, AP, f1 + fpmul f2, AP, f2 + fpmul f3, AP, f3 + + fpmul f4, AP, f4 + fpmul f5, AP, f5 + STFPDUX f0, CO1, INC2 + fpmul f6, AP, f6 + STFPDUX f1, CO1, INC2 + fpmul f7, AP, f7 + STFPDUX f2, CO1, INC2 + STFPDUX f3, CO1, INC2 + STFPDUX f4, CO2, INC2 + + STFPDUX f5, CO2, INC2 + STFPDUX f6, CO2, INC2 + STFPDUX f7, CO2, INC2 +#endif + + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -8 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 3 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 8 +#endif +#endif + + addic. I, I, -1 + li r0, FZERO + + lfpsx f0, SP, r0 + bgt+ .L51 + .align 4 + +.L60: + andi. I, M, 4 + beq .L70 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi BO, B, - 2 * SIZE + fpmr f1, f0 +#else + slwi TEMP, KK, 2 + BASE_SHIFT + slwi r0, KK, 1 + BASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + addi BO, BO, - 2 * SIZE + fpmr f1, f0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 2 +#endif + fpmr f2, f0 + srawi. r0, TEMP, 2 + mtspr CTR, r0 + fpmr f3, f0 + ble .L64 +#else + srawi. 
r0, K, 2 + fpmr f1, f0 + addi BO, B, - 2 * SIZE + fpmr f2, f0 + mtspr CTR, r0 + fpmr f3, f0 + ble .L64 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX B3, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX B4, BO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + bdz- .L63 + .align 4 + +.L62: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f2, B1, A1, f2 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f3, B1, A2, f3 + LFPDUX A2, AO, INC2 + LFPDUX B1, BO, INC2 + + fxcpmadd f0, B2, A3, f0 + fxcsmadd f2, B2, A3, f2 + LFPDUX A3, AO, INC2 + fxcpmadd f1, B2, A4, f1 + fxcsmadd f3, B2, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + + fxcpmadd f0, B3, A5, f0 + fxcsmadd f2, B3, A5, f2 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B3, A6, f1 + fxcsmadd f3, B3, A6, f3 + LFPDUX A6, AO, INC2 + LFPDUX B3, BO, INC2 + + fxcpmadd f0, B4, A7, f0 + fxcsmadd f2, B4, A7, f2 + LFPDUX A7, AO, INC2 + fxcpmadd f1, B4, A8, f1 + fxcsmadd f3, B4, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B4, BO, INC2 + bdnz+ .L62 + .align 4 + +.L63: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f2, B1, A1, f2 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f3, B1, A2, f3 + + fxcpmadd f0, B2, A3, f0 + fxcsmadd f2, B2, A3, f2 + fxcpmadd f1, B2, A4, f1 + fxcsmadd f3, B2, A4, f3 + + fxcpmadd f0, B3, A5, f0 + fxcsmadd f2, B3, A5, f2 + fxcpmadd f1, B3, A6, f1 + fxcsmadd f3, B3, A6, f3 + + fxcpmadd f0, B4, A7, f0 + fxcsmadd f2, B4, A7, f2 + fxcpmadd f1, B4, A8, f1 + fxcsmadd f3, B4, A8, f3 + .align 4 + +.L64: + lfd AP, ALPHA(SP) +#ifdef TRMMKERNEL + fsmfp AP, AP +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP +#else + andi. r0, K, 3 + mtspr CTR, r0 +#endif + ble+ .L68 + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + bdz- .L67 + .align 4 + +.L66: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f2, B1, A1, f2 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f3, B1, A2, f3 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + bdnz+ .L66 + .align 4 + +.L67: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f2, B1, A1, f2 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f3, B1, A2, f3 + .align 4 + +.L68: +#ifndef TRMMKERNEL + LFPDUX A1, CO1, INC2 + LFPDUX A2, CO1, INC2 + LFPDUX A3, CO2, INC2 + LFPDUX A4, CO2, INC2 + + fxcpmadd f0, AP, f0, A1 + fxcpmadd f1, AP, f1, A2 + fxcpmadd f2, AP, f2, A3 + fxcpmadd f3, AP, f3, A4 + + STFPDUX f0, CO1, INCM3 + STFPDUX f1, CO1, INC2 + STFPDUX f2, CO2, INCM3 + STFPDUX f3, CO2, INC2 +#else + fpmul f0, AP, f0 + fpmul f1, AP, f1 + fpmul f2, AP, f2 + fpmul f3, AP, f3 + + STFPDUX f0, CO1, INC2 + STFPDUX f1, CO1, INC2 + STFPDUX f2, CO2, INC2 + STFPDUX f3, CO2, INC2 +#endif + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -4 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 4 +#endif +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L70: + andi. 
I, M, 2 + beq .L80 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi BO, B, - 2 * SIZE + fpmr f1, f0 +#else + slwi TEMP, KK, 1 + BASE_SHIFT + slwi r0, KK, 1 + BASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + addi BO, BO, - 2 * SIZE + fpmr f1, f0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + srawi. r0, TEMP, 3 + fpmr f2, f0 + mtspr CTR, r0 + fpmr f3, f0 + ble .L74 +#else + addi BO, B, - 2 * SIZE + fpmr f1, f0 + + srawi. r0, K, 3 + fpmr f2, f0 + mtspr CTR, r0 + fpmr f3, f0 + ble .L74 +#endif + + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + + LFPDUX A5, AO, INC2 + LFPDUX B5, BO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX B6, BO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A9, BO, INC2 + LFPDUX A8, AO, INC2 + LFPDUX A10, BO, INC2 + bdz- .L73 + .align 4 + +.L72: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + fxcpmadd f2, B2, A2, f2 + fxcsmadd f3, B2, A2, f3 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + + fxcpmadd f0, B3, A3, f0 + fxcsmadd f1, B3, A3, f1 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + fxcpmadd f2, B4, A4, f2 + fxcsmadd f3, B4, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + + fxcpmadd f0, B5, A5, f0 + fxcsmadd f1, B5, A5, f1 + LFPDUX A5, AO, INC2 + LFPDUX B5, BO, INC2 + fxcpmadd f2, B6, A6, f2 + fxcsmadd f3, B6, A6, f3 + LFPDUX A6, AO, INC2 + LFPDUX B6, BO, INC2 + + fxcpmadd f0, A9, A7, f0 + fxcsmadd f1, A9, A7, f1 + LFPDUX A7, AO, INC2 + LFPDUX A9, BO, INC2 + fxcpmadd f2, A10, A8, f2 + fxcsmadd f3, A10, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX A10, BO, INC2 + bdnz+ .L72 + .align 4 + +.L73: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + fxcpmadd f2, B2, A2, f2 + fxcsmadd f3, B2, A2, f3 + + fxcpmadd f0, B3, A3, f0 + fxcsmadd f1, B3, A3, f1 + fxcpmadd f2, B4, A4, f2 + fxcsmadd f3, B4, A4, f3 + + fxcpmadd f0, B5, A5, f0 + fxcsmadd f1, B5, A5, f1 + fxcpmadd f2, B6, A6, f2 + fxcsmadd f3, B6, A6, f3 + + fxcpmadd f0, A9, A7, f0 + fxcsmadd f1, A9, A7, f1 + fxcpmadd f2, A10, A8, f2 + fxcsmadd f3, A10, A8, f3 + .align 4 + +.L74: + lfd AP, ALPHA(SP) +#ifdef TRMMKERNEL + fsmfp AP, AP +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 7 + mtspr CTR, TEMP +#else + andi. 
r0, K, 7 + mtspr CTR, r0 +#endif + ble+ .L78 + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + bdz- .L77 + .align 4 + +.L76: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + bdnz+ .L76 + .align 4 + +.L77: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + .align 4 + +.L78: +#ifndef TRMMKERNEL + LFPDX A1, CO1, INC2 + LFPDX B3, CO2, INC2 + + fpadd f0, f0, f2 + fpadd f1, f1, f3 + + fxcpmadd f0, AP, f0, A1 + fxcpmadd f1, AP, f1, B3 +#else + fpadd f0, f0, f2 + fpadd f1, f1, f3 + + fpmul f0, AP, f0 + fpmul f1, AP, f1 +#endif + + STFPDUX f0, CO1, INC2 + STFPDUX f1, CO2, INC2 + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L80: + andi. I, M, 1 + beq .L89 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 +#else + slwi TEMP, KK, 0 + BASE_SHIFT + slwi r0, KK, 1 + BASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + addi BO, BO, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L84 +#else + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, K, 3 + mtspr CTR, r0 + ble .L84 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX B3, BO, INC2 + LFPDUX B4, BO, INC2 + bdz- .L83 + .align 4 + +.L82: + fxcpmadd f0, A1, B1, f0 + LFPDUX B1, BO, INC2 + fxcsmadd f1, A1, B2, f1 + LFPDUX B2, BO, INC2 + LFPDUX A1, AO, INC2 + fxcpmadd f2, A2, B3, f2 + LFPDUX B3, BO, INC2 + fxcsmadd f3, A2, B4, f3 + LFPDUX B4, BO, INC2 + LFPDUX A2, AO, INC2 + + fxcpmadd f0, A3, B1, f0 + LFPDUX B1, BO, INC2 + fxcsmadd f1, A3, B2, f1 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + fxcpmadd f2, A4, B3, f2 + LFPDUX B3, BO, INC2 + fxcsmadd f3, A4, B4, f3 + LFPDUX B4, BO, INC2 + LFPDUX A4, AO, INC2 + bdnz+ .L82 + .align 4 + +.L83: + fxcpmadd f0, A1, B1, f0 + LFPDUX B1, BO, INC2 + fxcsmadd f1, A1, B2, f1 + LFPDUX B2, BO, INC2 + fxcpmadd f2, A2, B3, f2 + LFPDUX B3, BO, INC2 + fxcsmadd f3, A2, B4, f3 + LFPDUX B4, BO, INC2 + + fxcpmadd f0, A3, B1, f0 + fxcsmadd f1, A3, B2, f1 + fxcpmadd f2, A4, B3, f2 + fxcsmadd f3, A4, B4, f3 + .align 4 + +.L84: + lfd AP, ALPHA(SP) +#ifdef TRMMKERNEL + fsmfp AP, AP +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 7 + mtspr CTR, TEMP +#else + andi. 
r0, K, 7 + mtspr CTR, r0 +#endif + ble+ .L88 + + LFDX A1, AO, INC2 + LFPDUX B1, BO, INC2 + add AO, AO, INC + bdz- .L87 + .align 4 + +.L86: + fxcpmadd f0, A1, B1, f0 + LFDX A1, AO, INC2 + LFPDUX B1, BO, INC2 + add AO, AO, INC + bdnz+ .L86 + .align 4 + +.L87: + fxcpmadd f0, A1, B1, f0 + .align 4 + +.L88: +#ifndef TRMMKERNEL + LFDX A1, CO1, INC2 + LFDX A2, CO2, INC2 + + fpadd f0, f0, f1 + fpadd f2, f2, f3 + fsmfp A1, A2 + fpadd f0, f0, f2 + fxcpmadd f0, AP, f0, A1 +#else + fpadd f0, f0, f1 + fpadd f2, f2, f3 + fsmfp A1, A2 + fpadd f0, f0, f2 + fpmul f0, AP, f0 +#endif + + STFDX f0, CO1, INC2 + STFSDX f0, CO2, INC2 + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -1 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 1 +#endif +#endif + .align 4 + +.L89: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi KK, KK, 2 +#endif + + addi B, BO, 2 * SIZE + .align 4 + +.L90: + andi. J, N, 1 + beq .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + mr CO1, C + addi AO, A, -2 * SIZE + + li r0, FZERO + lfpsx f0, SP, r0 + + srawi. I, M, 3 + ble .L100 + .align 4 + +.L91: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi BO, B, - 2 * SIZE + fpmr f1, f0 +#else + slwi TEMP, KK, 3 + BASE_SHIFT + slwi r0, KK, 0 + BASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + addi BO, BO, - 2 * SIZE + fpmr f1, f0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 8 +#else + addi TEMP, KK, 1 +#endif + fpmr f2, f0 + srawi. r0, TEMP, 2 + fpmr f3, f0 + mtspr CTR, r0 + ble .L94 + +#else + srawi. 
r0, K, 2 + fpmr f1, f0 + addi BO, B, - 2 * SIZE + fpmr f2, f0 + fpmr f3, f0 + mtspr CTR, r0 + ble .L94 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + bdz- .L93 + .align 4 + +.L92: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + fxcpmadd f2, B1, A3, f2 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + LFPDUX A4, AO, INC2 + + fxcsmadd f0, B1, A5, f0 + LFPDUX A5, AO, INC2 + fxcsmadd f1, B1, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B1, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B1, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B1, BO, INC2 + + fxcpmadd f0, B2, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B2, A2, f1 + LFPDUX A2, AO, INC2 + fxcpmadd f2, B2, A3, f2 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B2, A4, f3 + LFPDUX A4, AO, INC2 + + fxcsmadd f0, B2, A5, f0 + LFPDUX A5, AO, INC2 + fxcsmadd f1, B2, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B2, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B2, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B2, BO, INC2 + bdnz+ .L92 + .align 4 + +.L93: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + fxcpmadd f2, B1, A3, f2 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + LFPDUX A4, AO, INC2 + + fxcsmadd f0, B1, A5, f0 + LFPDUX A5, AO, INC2 + fxcsmadd f1, B1, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B1, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B1, A8, f3 + LFPDUX A8, AO, INC2 + + fxcpmadd f0, B2, A1, f0 + fxcpmadd f1, B2, A2, f1 + fxcpmadd f2, B2, A3, f2 + fxcpmadd f3, B2, A4, f3 + + fxcsmadd f0, B2, A5, f0 + fxcsmadd f1, B2, A6, f1 + fxcsmadd f2, B2, A7, f2 + fxcsmadd f3, B2, A8, f3 + .align 4 + +.L94: + lfd AP, ALPHA(SP) +#ifdef TRMMKERNEL + fsmfp AP, AP +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 8 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP +#else + andi. 
r0, K, 3 + mtspr CTR, r0 +#endif + ble+ .L98 + + LFDX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + add BO, BO, INC + bdz- .L97 + .align 4 + +.L96: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + fxcpmadd f2, B1, A3, f2 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + LFDX B1, BO, INC2 + LFPDUX A4, AO, INC2 + add BO, BO, INC + bdnz+ .L96 + .align 4 + +.L97: + fxcpmadd f0, B1, A1, f0 + fxcpmadd f1, B1, A2, f1 + fxcpmadd f2, B1, A3, f2 + fxcpmadd f3, B1, A4, f3 + .align 4 + +.L98: +#ifndef TRMMKERNEL + LFPDUX A1, CO1, INC2 + LFPDUX B1, CO1, INC2 + LFPDUX A3, CO1, INC2 + LFPDUX A5, CO1, INC2 + + fxcpmadd f0, AP, f0, A1 + fxcpmadd f1, AP, f1, B1 + fxcpmadd f2, AP, f2, A3 + fxcpmadd f3, AP, f3, A5 + + STFPDUX f0, CO1, INCM7 + STFPDUX f1, CO1, INC2 + STFPDUX f2, CO1, INC2 + STFPDUX f3, CO1, INC2 +#else + fpmul f0, AP, f0 + fpmul f1, AP, f1 + fpmul f2, AP, f2 + fpmul f3, AP, f3 + + STFPDUX f0, CO1, INC2 + STFPDUX f1, CO1, INC2 + STFPDUX f2, CO1, INC2 + STFPDUX f3, CO1, INC2 +#endif + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -8 +#else + addi TEMP, TEMP, -1 +#endif + slwi r0, TEMP, 3 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 8 +#endif +#endif + + addic. I, I, -1 + li r0, FZERO + + lfpsx f0, SP, r0 + bgt+ .L91 + .align 4 + +.L100: + andi. I, M, 4 + beq .L110 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 +#else + slwi TEMP, KK, 2 + BASE_SHIFT + slwi r0, KK, 0 + BASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + fpmr f1, f0 + addi BO, BO, - 2 * SIZE + fpmr f2, f0 + fpmr f3, f0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 1 +#endif + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L104 +#else + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. 
r0, K, 3 + mtspr CTR, r0 + ble .L104 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + LFPDUX B3, BO, INC2 + LFPDUX B4, BO, INC2 + + bdz- .L103 + .align 4 + +.L102: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + fxcsmadd f2, B1, A3, f2 + LFPDUX A3, AO, INC2 + fxcsmadd f3, B1, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B1, BO, INC2 + + fxcpmadd f0, B2, A5, f0 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B2, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B2, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B2, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B2, BO, INC2 + + fxcpmadd f0, B3, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B3, A2, f1 + LFPDUX A2, AO, INC2 + fxcsmadd f2, B3, A3, f2 + LFPDUX A3, AO, INC2 + fxcsmadd f3, B3, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B3, BO, INC2 + + fxcpmadd f0, B4, A5, f0 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B4, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B4, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B4, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B4, BO, INC2 + bdnz+ .L102 + .align 4 + +.L103: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + fxcsmadd f2, B1, A3, f2 + LFPDUX A3, AO, INC2 + fxcsmadd f3, B1, A4, f3 + LFPDUX A4, AO, INC2 + + fxcpmadd f0, B2, A5, f0 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B2, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B2, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B2, A8, f3 + LFPDUX A8, AO, INC2 + + fxcpmadd f0, B3, A1, f0 + fxcpmadd f1, B3, A2, f1 + fxcsmadd f2, B3, A3, f2 + fxcsmadd f3, B3, A4, f3 + + fxcpmadd f0, B4, A5, f0 + fxcpmadd f1, B4, A6, f1 + fxcsmadd f2, B4, A7, f2 + fxcsmadd f3, B4, A8, f3 + .align 4 + +.L104: + lfd AP, ALPHA(SP) +#ifdef TRMMKERNEL + fsmfp AP, AP +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 7 + mtspr CTR, TEMP +#else + andi. r0, K, 7 + mtspr CTR, r0 +#endif + ble+ .L108 + + LFPDUX A1, AO, INC2 + LFDX B1, BO, INC2 + LFPDUX A2, AO, INC2 + add BO, BO, INC + bdz- .L107 + .align 4 + +.L106: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFDX B1, BO, INC2 + LFPDUX A2, AO, INC2 + add BO, BO, INC + bdnz+ .L106 + .align 4 + +.L107: + fxcpmadd f0, B1, A1, f0 + fxcpmadd f1, B1, A2, f1 + .align 4 + +.L108: +#ifndef TRMMKERNEL + LFPDUX A1, CO1, INC2 + LFPDUX B1, CO1, INC2 + + fpadd f0, f0, f2 + fpadd f1, f1, f3 + + fxcpmadd f0, AP, f0, A1 + fxcpmadd f1, AP, f1, B1 + + STFPDUX f0, CO1, INCM3 + STFPDUX f1, CO1, INC2 +#else + fpadd f0, f0, f2 + fpadd f1, f1, f3 + + fpmul f0, AP, f0 + fpmul f1, AP, f1 + + STFPDUX f0, CO1, INC2 + STFPDUX f1, CO1, INC2 +#endif + + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -4 +#else + addi TEMP, TEMP, -1 +#endif + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 4 +#endif +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L110: + andi. 
I, M, 2 + beq .L120 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 +#else + slwi TEMP, KK, 1 + BASE_SHIFT + slwi r0, KK, 0 + BASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + fpmr f1, f0 + addi BO, BO, - 2 * SIZE + fpmr f2, f0 + fpmr f3, f0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L114 +#else + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, K, 3 + mtspr CTR, r0 + ble .L114 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B1, BO, INC2 + + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX B3, BO, INC2 + + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + LFPDUX B4, BO, INC2 + bdz- .L113 + .align 4 + +.L112: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcsmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + LFPDUX B1, BO, INC2 + fxcpmadd f2, B2, A3, f2 + LFPDUX A3, AO, INC2 + fxcsmadd f3, B2, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + fxcpmadd f0, B3, A5, f0 + LFPDUX A5, AO, INC2 + fxcsmadd f1, B3, A6, f1 + LFPDUX A6, AO, INC2 + LFPDUX B3, BO, INC2 + fxcpmadd f2, B4, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B4, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B4, BO, INC2 + bdnz+ .L112 + .align 4 + +.L113: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A2, f1 + fxcpmadd f2, B2, A3, f2 + fxcsmadd f3, B2, A4, f3 + fxcpmadd f0, B3, A5, f0 + fxcsmadd f1, B3, A6, f1 + fxcpmadd f2, B4, A7, f2 + fxcsmadd f3, B4, A8, f3 + .align 4 + +.L114: + lfd AP, ALPHA(SP) +#ifdef TRMMKERNEL + fsmfp AP, AP +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 7 + mtspr CTR, TEMP +#else + andi. r0, K, 7 + mtspr CTR, r0 +#endif + ble+ .L118 + + LFPDUX A1, AO, INC2 + LFDX B1, BO, INC2 + add BO, BO, INC + bdz- .L117 + .align 4 + +.L116: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + LFDX B1, BO, INC2 + add BO, BO, INC + bdnz+ .L116 + .align 4 + +.L117: + fxcpmadd f0, B1, A1, f0 + .align 4 + +.L118: +#ifndef TRMMKERNEL + LFPDX A1, CO1, INC2 + + fpadd f0, f0, f1 + fpadd f2, f3, f2 + fpadd f0, f0, f2 + fxcpmadd f1, AP, f0, A1 + + li r0, FZERO + lfpsx f0, SP, r0 + + STFPDUX f1, CO1, INC2 +#else + fpadd f0, f0, f1 + fpadd f2, f3, f2 + fpadd f0, f0, f2 + fpmul f1, AP, f0 + + li r0, FZERO + lfpsx f0, SP, r0 + + STFPDUX f1, CO1, INC2 +#endif + + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -1 +#endif + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + .align 4 + +.L120: + andi. 
I, M, 1 + beq .L999 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 +#else + slwi TEMP, KK, 0 + BASE_SHIFT + slwi r0, KK, 0 + BASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + fpmr f1, f0 + addi BO, BO, - 2 * SIZE + fpmr f2, f0 + fpmr f3, f0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L124 +#else + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, K, 3 + mtspr CTR, r0 + ble .L124 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + bdz- .L123 + .align 4 + +.L122: + fpmadd f0, A1, B1, f0 + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + fpmadd f1, A2, B2, f1 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + fpmadd f2, A3, B3, f2 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + fpmadd f3, A4, B4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + bdnz+ .L122 + .align 4 + +.L123: + fpmadd f0, A1, B1, f0 + fpmadd f1, A2, B2, f1 + fpmadd f2, A3, B3, f2 + fpmadd f3, A4, B4, f3 + .align 4 + +.L124: + lfd AP, ALPHA(SP) +#ifdef TRMMKERNEL + fsmfp AP, AP +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 7 + mtspr CTR, TEMP +#else + andi. r0, K, 7 + mtspr CTR, r0 +#endif + ble+ .L128 + + LFDX A1, AO, INC2 + LFDX B1, BO, INC2 + add AO, AO, INC + add BO, BO, INC + bdz- .L127 + .align 4 + +.L126: + fmadd f0, A1, B1, f0 + LFDX A1, AO, INC2 + LFDX B1, BO, INC2 + add AO, AO, INC + add BO, BO, INC + bdnz+ .L126 + .align 4 + +.L127: + fmadd f0, A1, B1, f0 + .align 4 + +.L128: +#ifndef TRMMKERNEL + LFDX A1, CO1, INC2 + fpadd f0, f0, f1 + fpadd f2, f2, f3 + fpadd f0, f0, f2 + fsmtp f1, f0 + fadd f0, f0, f1 + fmadd f0, AP, f0, A1 +#else + fpadd f0, f0, f1 + fpadd f2, f2, f3 + fpadd f0, f0, f2 + fsmtp f1, f0 + fadd f0, f0, f1 + fpmul f0, AP, f0 +#endif + STFDUX f0, CO1, INC2 + .align 4 + +.L999: + addi SP, SP, 12 + + lwzu r14, 4(SP) + lwzu r15, 4(SP) + + lwzu r16, 4(SP) + lwzu r17, 4(SP) + lwzu r18, 4(SP) + lwzu r19, 4(SP) + + lwzu r20, 4(SP) + lwzu r21, 4(SP) + lwzu r22, 4(SP) + lwzu r23, 4(SP) + + lwzu r24, 4(SP) + lwzu r25, 4(SP) + lwzu r26, 4(SP) + lwzu r27, 4(SP) + + lwzu r28, 4(SP) + lwzu r29, 4(SP) + lwzu r30, 4(SP) + lwzu r31, 4(SP) + + subi SP, SP, 12 + li r0, 16 + + lfpdux f31, SP, r0 + lfpdux f30, SP, r0 + lfpdux f29, SP, r0 + lfpdux f28, SP, r0 + lfpdux f27, SP, r0 + lfpdux f26, SP, r0 + lfpdux f25, SP, r0 + lfpdux f24, SP, r0 + lfpdux f23, SP, r0 + lfpdux f22, SP, r0 + lfpdux f21, SP, r0 + lfpdux f20, SP, r0 + lfpdux f19, SP, r0 + lfpdux f18, SP, r0 + lfpdux f17, SP, r0 + lfpdux f16, SP, r0 + lfpdux f15, SP, r0 + lfpdux f14, SP, r0 + addi SP, SP, 16 + blr + .align 4 + +.L1000: + li INCM1, -1 * SIZE + li INCM3, -3 * SIZE + li INCM5, -5 * SIZE + li INCM7, -7 * SIZE + + addi C, C, - 1 * SIZE + srawi. J, N, 2 + ble .L1050 + .align 4 + +.L1010: + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + add C, CO4, LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + addi AO, A, -4 * SIZE + + li r0, FZERO + lfpsx f0, SP, r0 + + srawi. 
I, M, 3 + ble .L1020 + .align 4 + +.L1011: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 +#else + slwi TEMP, KK, 3 + BASE_SHIFT + slwi r0, KK, 2 + BASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, BO, - 4 * SIZE + fpmr f8, f0 + addi BO2, BO, 2 * SIZE + fpmr f12, f0 +#endif + + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 8 +#else + addi TEMP, KK, 4 +#endif + srawi. TEMP, TEMP, 2 + fpmr f1, f0 + mtspr CTR, TEMP + ble .L1014 + +#else + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 + + srawi. r0, K, 2 + fpmr f1, f0 + mtspr CTR, r0 + ble .L1014 +#endif + + LFPDUX A1, AO, INC4 + fpmr f5, f0 + LFPDUX A3, AO, INC4 + fpmr f9, f0 + LFPDUX B1, BO, INC4 + fpmr f13, f0 + + LFPDUX A5, AO, INC4 + fpmr f2, f0 + LFPDUX A6, AO, INC4 + fpmr f6, f0 + LFPDUX B3, BO, INC4 + fpmr f10, f0 + LFPDUX A7, AO, INC4 + fpmr f14, f0 + + LFPDUX A8, AO, INC4 + fpmr f3, f0 + LFPDUX B5, BO, INC4 + fpmr f7, f0 + LFPDUX A9, AO, INC4 + fpmr f11, f0 + LFPDUX A2, AO2, INC4 + fpmr f15, f0 + LFPDUX B2, BO2, INC4 + bdz- .L1013 + .align 4 + +.L1012: + +## 1 ## + fxcpmadd f0, B1, A1, f0 + nop + fxcsmadd f4, B1, A1, f4 + nop + fxcpmadd f8, B2, A1, f8 + LFPDUX B4, BO2, INC4 + fxcsmadd f12, B2, A1, f12 + LFPDUX B6, BO, INC4 + + fxcpmadd f1, B1, A2, f1 + nop + fxcsmadd f5, B1, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B2, A2, f9 + LFPDUX A10, AO, INC4 + fxcsmadd f13, B2, A2, f13 + nop + + fxcpmadd f2, B1, A3, f2 + nop + fxcsmadd f6, B1, A3, f6 + nop + fxcpmadd f10, B2, A3, f10 + nop + fxcsmadd f14, B2, A3, f14 + nop + + fxcpmadd f3, B1, A4, f3 + nop + fxcsmadd f7, B1, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B2, A4, f11 + LFPDUX A1, AO, INC4 + fxcsmadd f15, B2, A4, f15 + nop + +## 2 ## + + fxcpmadd f0, B3, A5, f0 + nop + fxcsmadd f4, B3, A5, f4 + nop + fxcpmadd f8, B4, A5, f8 + LFPDUX B2, BO2, INC4 + fxcsmadd f12, B4, A5, f12 + LFPDUX B1, BO, INC4 + + fxcpmadd f1, B3, A2, f1 + nop + fxcsmadd f5, B3, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B4, A2, f9 + LFPDUX A3, AO, INC4 + fxcsmadd f13, B4, A2, f13 + nop + + fxcpmadd f2, B3, A6, f2 + nop + fxcsmadd f6, B3, A6, f6 + nop + fxcpmadd f10, B4, A6, f10 + nop + fxcsmadd f14, B4, A6, f14 + nop + + fxcpmadd f3, B3, A4, f3 + nop + fxcsmadd f7, B3, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B4, A4, f11 + LFPDUX A5, AO, INC4 + fxcsmadd f15, B4, A4, f15 + nop + +## 3 ## + + fxcpmadd f0, B5, A7, f0 + nop + fxcsmadd f4, B5, A7, f4 + nop + fxcpmadd f8, B2, A7, f8 + LFPDUX B4, BO2, INC4 + fxcsmadd f12, B2, A7, f12 + LFPDUX B3, BO, INC4 + + fxcpmadd f1, B5, A2, f1 + nop + fxcsmadd f5, B5, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B2, A2, f9 + LFPDUX A6, AO, INC4 + fxcsmadd f13, B2, A2, f13 + nop + + fxcpmadd f2, B5, A8, f2 + nop + fxcsmadd f6, B5, A8, f6 + nop + fxcpmadd f10, B2, A8, f10 + nop + fxcsmadd f14, B2, A8, f14 + nop + + fxcpmadd f3, B5, A4, f3 + nop + fxcsmadd f7, B5, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B2, A4, f11 + LFPDUX A7, AO, INC4 + fxcsmadd f15, B2, A4, f15 + nop + +## 4 ## + fxcpmadd f0, B6, A9, f0 + nop + fxcsmadd f4, B6, A9, f4 + nop + fxcpmadd f8, B4, A9, f8 + LFPDUX B2, BO2, INC4 + fxcsmadd f12, B4, A9, f12 + LFPDUX B5, BO, INC4 + + fxcpmadd f1, B6, A2, f1 + nop + fxcsmadd 
f5, B6, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B4, A2, f9 + LFPDUX A8, AO, INC4 + fxcsmadd f13, B4, A2, f13 + nop + + fxcpmadd f2, B6, A10, f2 + nop + fxcsmadd f6, B6, A10, f6 + nop + fxcpmadd f10, B4, A10, f10 + nop + fxcsmadd f14, B4, A10, f14 + nop + + fxcpmadd f3, B6, A4, f3 + LFPDUX A2, AO2, INC4 + fxcsmadd f7, B6, A4, f7 + LFPDUX A9, AO, INC4 + fxcpmadd f11, B4, A4, f11 + nop + fxcsmadd f15, B4, A4, f15 + bdnz+ .L1012 + .align 4 + +.L1013: +## 1 ## + + fxcpmadd f0, B1, A1, f0 + nop + fxcsmadd f4, B1, A1, f4 + nop + fxcpmadd f8, B2, A1, f8 + LFPDUX B4, BO2, INC4 + fxcsmadd f12, B2, A1, f12 + LFPDUX B6, BO, INC4 + + fxcpmadd f1, B1, A2, f1 + nop + fxcsmadd f5, B1, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B2, A2, f9 + LFPDUX A10, AO, INC4 + fxcsmadd f13, B2, A2, f13 + nop + + fxcpmadd f2, B1, A3, f2 + nop + fxcsmadd f6, B1, A3, f6 + nop + fxcpmadd f10, B2, A3, f10 + nop + fxcsmadd f14, B2, A3, f14 + nop + + fxcpmadd f3, B1, A4, f3 + nop + fxcsmadd f7, B1, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B2, A4, f11 +#ifndef TRMMKERNEL + LFDUX A1, CO1, INC +#else + nop +#endif + fxcsmadd f15, B2, A4, f15 + nop + +## 2 ## + + fxcpmadd f0, B3, A5, f0 + nop + fxcsmadd f4, B3, A5, f4 + nop + fxcpmadd f8, B4, A5, f8 + LFPDUX B2, BO2, INC4 + fxcsmadd f12, B4, A5, f12 +#ifndef TRMMKERNEL + LFDUX B1, CO1, INC2 +#else + nop +#endif + + fxcpmadd f1, B3, A2, f1 + nop + fxcsmadd f5, B3, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B4, A2, f9 +#ifndef TRMMKERNEL + LFDUX A3, CO1, INC2 +#else + nop +#endif + fxcsmadd f13, B4, A2, f13 + nop + + fxcpmadd f2, B3, A6, f2 + nop + fxcsmadd f6, B3, A6, f6 + nop + fxcpmadd f10, B4, A6, f10 + nop + fxcsmadd f14, B4, A6, f14 + nop + + fxcpmadd f3, B3, A4, f3 + nop + fxcsmadd f7, B3, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B4, A4, f11 +#ifndef TRMMKERNEL + LFDUX A5, CO1, INC2 +#else + nop +#endif + fxcsmadd f15, B4, A4, f15 + nop + +## 3 ## + + fxcpmadd f0, B5, A7, f0 + nop + fxcsmadd f4, B5, A7, f4 + nop + fxcpmadd f8, B2, A7, f8 + LFPDUX B4, BO2, INC4 + fxcsmadd f12, B2, A7, f12 +#ifndef TRMMKERNEL + LFSDUX A1, CO1, INCM5 +#else + nop +#endif + + fxcpmadd f1, B5, A2, f1 + nop + fxcsmadd f5, B5, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B2, A2, f9 +#ifndef TRMMKERNEL + LFSDUX B1, CO1, INC2 +#else + nop +#endif + fxcsmadd f13, B2, A2, f13 + nop + + fxcpmadd f2, B5, A8, f2 + nop + fxcsmadd f6, B5, A8, f6 + nop + fxcpmadd f10, B2, A8, f10 + nop + fxcsmadd f14, B2, A8, f14 + nop + + fxcpmadd f3, B5, A4, f3 + nop + fxcsmadd f7, B5, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B2, A4, f11 +#ifndef TRMMKERNEL + LFSDUX A3, CO1, INC2 +#else + nop +#endif + fxcsmadd f15, B2, A4, f15 + nop + +## 4 ## + + fxcpmadd f0, B6, A9, f0 + nop + fxcsmadd f4, B6, A9, f4 + nop + fxcpmadd f8, B4, A9, f8 +#ifndef TRMMKERNEL + LFSDUX A5, CO1, INC2 +#else + nop +#endif + fxcsmadd f12, B4, A9, f12 +#ifndef TRMMKERNEL + LFDUX B3, CO2, INC +#else + nop +#endif + + fxcpmadd f1, B6, A2, f1 + nop + fxcsmadd f5, B6, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B4, A2, f9 +#ifndef TRMMKERNEL + LFDUX A6, CO2, INC2 +#else + nop +#endif + fxcsmadd f13, B4, A2, f13 + nop + + fxcpmadd f2, B6, A10, f2 + nop + fxcsmadd f6, B6, A10, f6 + nop + fxcpmadd f10, B4, A10, f10 + nop + fxcsmadd f14, B4, A10, f14 +#ifndef TRMMKERNEL + LFDUX A7, CO2, INC2 +#else + nop +#endif + + fxcpmadd f3, B6, A4, f3 + nop + fxcsmadd f7, B6, A4, f7 + nop + fxcpmadd f11, B4, A4, f11 + nop + fxcsmadd f15, B4, A4, f15 +#ifndef TRMMKERNEL + LFDUX B2, CO2, INC2 +#else + nop +#endif + .align 4 + +.L1014: + lfd AP, 
ALPHA(SP) +#ifdef TRMMKERNEL + fsmfp AP, AP +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 8 +#else + addi TEMP, KK, 4 +#endif + andi. r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L1018 + + cmpwi cr0, TEMP, 3 + bgt+ .L1015 +#else + andi. r0, K, 3 + mtspr CTR, r0 + ble+ .L1018 + + cmpwi cr0, K, 3 + bgt+ .L1015 +#endif + +#ifndef TRMMKERNEL + LFDUX A1, CO1, INC + fpmr f5, f0 + LFDUX B1, CO1, INC2 + fpmr f9, f0 + LFDUX A3, CO1, INC2 + fpmr f13, f0 + LFDUX A5, CO1, INC2 + fpmr f2, f0 + + LFSDUX A1, CO1, INCM5 + fpmr f6, f0 + LFSDUX B1, CO1, INC2 + fpmr f10, f0 + LFSDUX A3, CO1, INC2 + fpmr f14, f0 + LFSDUX A5, CO1, INC2 + fpmr f3, f0 + + LFDUX B3, CO2, INC + fpmr f7, f0 + LFDUX A6, CO2, INC2 + fpmr f11, f0 + LFDUX A7, CO2, INC2 + fpmr f15, f0 + LFDUX B2, CO2, INC2 +#else + fpmr f5, f0 + fpmr f9, f0 + fpmr f13, f0 + fpmr f2, f0 + + fpmr f6, f0 + fpmr f10, f0 + fpmr f14, f0 + fpmr f3, f0 + + fpmr f7, f0 + fpmr f11, f0 + fpmr f15, f0 + nop +#endif + .align 4 + +.L1015: + LFPDUX A2, AO, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A10, BO, INC4 + LFPDUX B4, BO2, INC4 + bdz- .L1017 + .align 4 + +.L1016: + fxcpmadd f0, A10, A2, f0 + fxcsmadd f4, A10, A2, f4 + fxcpmadd f8, B4, A2, f8 + fxcsmadd f12, B4, A2, f12 + LFPDUX A2, AO, INC4 + + fxcpmadd f1, A10, A4, f1 + fxcsmadd f5, A10, A4, f5 + fxcpmadd f9, B4, A4, f9 + fxcsmadd f13, B4, A4, f13 + LFPDUX A4, AO2, INC4 + + fxcpmadd f2, A10, A2, f2 + fxcsmadd f6, A10, A2, f6 + fxcpmadd f10, B4, A2, f10 + fxcsmadd f14, B4, A2, f14 + LFPDUX A2, AO, INC4 + + fxcpmadd f3, A10, A4, f3 + fxcsmadd f7, A10, A4, f7 + LFPDUX A10, BO, INC4 + fxcpmadd f11, B4, A4, f11 + fxcsmadd f15, B4, A4, f15 + LFPDUX A4, AO2, INC4 + LFPDUX B4, BO2, INC4 + bdnz+ .L1016 + .align 4 + +.L1017: + fxcpmadd f0, A10, A2, f0 + fxcsmadd f4, A10, A2, f4 + fxcpmadd f8, B4, A2, f8 + fxcsmadd f12, B4, A2, f12 + LFPDUX A2, AO, INC4 + + fxcpmadd f1, A10, A4, f1 + fxcsmadd f5, A10, A4, f5 + fxcpmadd f9, B4, A4, f9 + fxcsmadd f13, B4, A4, f13 + LFPDUX A4, AO2, INC4 + + fxcpmadd f2, A10, A2, f2 + fxcsmadd f6, A10, A2, f6 + fxcpmadd f10, B4, A2, f10 + fxcsmadd f14, B4, A2, f14 + + fxcpmadd f3, A10, A4, f3 + fxcsmadd f7, A10, A4, f7 + fxcpmadd f11, B4, A4, f11 + fxcsmadd f15, B4, A4, f15 + .align 4 + +.L1018: +#ifndef TRMMKERNEL + LFSDUX B3, CO2, INCM5 + LFSDUX A6, CO2, INC2 + LFSDUX A7, CO2, INC2 + LFSDUX B2, CO2, INC2 + + LFDUX B5, CO3, INC + LFDUX A8, CO3, INC2 + LFDUX A9, CO3, INC2 + LFDUX B4, CO3, INC2 + + LFSDUX B5, CO3, INCM5 + LFSDUX A8, CO3, INC2 + LFSDUX A9, CO3, INC2 + LFSDUX B4, CO3, INC2 + + LFDUX A2, CO4, INC + LFDUX A4, CO4, INC2 + + fxcpmadd f0, AP, f0, A1 + LFDUX A10, CO4, INC2 + LFDUX A1, CO4, INC2 + + fxcpmadd f1, AP, f1, B1 + LFSDUX A2, CO4, INCM5 + LFSDUX A4, CO4, INC2 + + fxcpmadd f2, AP, f2, A3 + LFSDUX A10, CO4, INC2 + LFSDUX A1, CO4, INC2 + + fxcpmadd f3, AP, f3, A5 + STFDUX f0, CO1, INCM7 + STFSDUX f0, CO1, INC + + fxcpmadd f4, AP, f4, B3 + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + + fxcpmadd f5, AP, f5, A6 + STFDUX f2, CO1, INC + STFSDUX f2, CO1, INC + + fxcpmadd f6, AP, f6, A7 + STFDUX f3, CO1, INC + STFSDUX f3, CO1, INC + + fxcpmadd f7, AP, f7, B2 + STFDUX f4, CO2, INCM7 + STFSDUX f4, CO2, INC + + fxcpmadd f8, AP, f8, B5 + STFDUX f5, CO2, INC + STFSDUX f5, CO2, INC + + fxcpmadd f9, AP, f9, A8 + STFDUX f6, CO2, INC + STFSDUX f6, CO2, INC + + fxcpmadd f10, AP, f10, A9 + STFDUX f7, CO2, INC + STFSDUX f7, CO2, INC + + fxcpmadd f11, AP, f11, B4 + STFDUX f8, CO3, INCM7 + 
STFSDUX f8, CO3, INC + + fxcpmadd f12, AP, f12, A2 + STFDUX f9, CO3, INC + STFSDUX f9, CO3, INC + + fxcpmadd f13, AP, f13, A4 + STFDUX f10, CO3, INC + STFSDUX f10, CO3, INC + + fxcpmadd f14, AP, f14, A10 + STFDUX f11, CO3, INC + STFSDUX f11, CO3, INC + + fxcpmadd f15, AP, f15, A1 + STFDUX f12, CO4, INCM7 +#else + fpmul f0, AP, f0 + fpmul f1, AP, f1 + fpmul f2, AP, f2 + fpmul f3, AP, f3 + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + + fpmul f4, AP, f4 + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + + fpmul f5, AP, f5 + STFDUX f2, CO1, INC + STFSDUX f2, CO1, INC + + fpmul f6, AP, f6 + STFDUX f3, CO1, INC + STFSDUX f3, CO1, INC + + fpmul f7, AP, f7 + STFDUX f4, CO2, INC + STFSDUX f4, CO2, INC + + fpmul f8, AP, f8 + STFDUX f5, CO2, INC + STFSDUX f5, CO2, INC + + fpmul f9, AP, f9 + STFDUX f6, CO2, INC + STFSDUX f6, CO2, INC + + fpmul f10, AP, f10 + STFDUX f7, CO2, INC + STFSDUX f7, CO2, INC + + fpmul f11, AP, f11 + STFDUX f8, CO3, INC + STFSDUX f8, CO3, INC + + fpmul f12, AP, f12 + STFDUX f9, CO3, INC + STFSDUX f9, CO3, INC + + fpmul f13, AP, f13 + STFDUX f10, CO3, INC + STFSDUX f10, CO3, INC + + fpmul f14, AP, f14 + STFDUX f11, CO3, INC + STFSDUX f11, CO3, INC + + fpmul f15, AP, f15 + STFDUX f12, CO4, INC +#endif + + STFSDUX f12, CO4, INC + STFDUX f13, CO4, INC + STFSDUX f13, CO4, INC + STFDUX f14, CO4, INC + STFSDUX f14, CO4, INC + STFDUX f15, CO4, INC + STFSDUX f15, CO4, INC + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -8 +#else + addi TEMP, TEMP, -4 +#endif + slwi r0, TEMP, 3 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 8 +#endif +#endif + + addic. I, I, -1 + li r0, FZERO + + lfpsx f0, SP, r0 + bgt+ .L1011 + .align 4 + +.L1020: + andi. I, M, 4 + beq .L1030 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 +#else + slwi TEMP, KK, 2 + BASE_SHIFT + slwi r0, KK, 2 + BASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, BO, - 4 * SIZE + fpmr f8, f0 + addi BO2, BO, 2 * SIZE + fpmr f12, f0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 4 +#endif + + srawi. TEMP, TEMP, 2 + fpmr f1, f0 + fpmr f5, f0 + fpmr f9, f0 + mtspr CTR, TEMP + fpmr f13, f0 + ble .L1024 +#else + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 + + srawi. 
r0, K, 2 + fpmr f1, f0 + fpmr f5, f0 + fpmr f9, f0 + mtspr CTR, r0 + fpmr f13, f0 + ble .L1024 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B2, BO2, INC4 + LFPDUX A3, AO, INC4 + LFPDUX B3, BO, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX B4, BO2, INC4 + + LFPDUX A5, AO, INC4 + LFPDUX B5, BO, INC4 + LFPDUX A6, AO2, INC4 + LFPDUX B6, BO2, INC4 + LFPDUX A7, AO, INC4 + LFPDUX A9, BO, INC4 + LFPDUX A10, BO2, INC4 + bdz- .L1023 + .align 4 + +.L1022: + fxcpmadd f0, B1, A1, f0 + nop + fxcsmadd f4, B1, A1, f4 + LFPDUX A8, AO2, INC4 + fxcpmadd f8, B2, A1, f8 + nop + fxcsmadd f12, B2, A1, f12 + LFPDUX A1, AO, INC4 + + fxcpmadd f1, B1, A2, f1 + nop + fxcsmadd f5, B1, A2, f5 + LFPDUX B1, BO, INC4 + fxcpmadd f9, B2, A2, f9 + nop + fxcsmadd f13, B2, A2, f13 + LFPDUX B2, BO2, INC4 + + fxcpmadd f0, B3, A3, f0 + nop + fxcsmadd f4, B3, A3, f4 + LFPDUX A2, AO2, INC4 + fxcpmadd f8, B4, A3, f8 + nop + fxcsmadd f12, B4, A3, f12 + LFPDUX A3, AO, INC4 + + fxcpmadd f1, B3, A4, f1 + nop + fxcsmadd f5, B3, A4, f5 + LFPDUX B3, BO, INC4 + fxcpmadd f9, B4, A4, f9 + nop + fxcsmadd f13, B4, A4, f13 + LFPDUX B4, BO2, INC4 + + fxcpmadd f0, B5, A5, f0 + nop + fxcsmadd f4, B5, A5, f4 + LFPDUX A4, AO2, INC4 + fxcpmadd f8, B6, A5, f8 + nop + fxcsmadd f12, B6, A5, f12 + LFPDUX A5, AO, INC4 + + fxcpmadd f1, B5, A6, f1 + nop + fxcsmadd f5, B5, A6, f5 + LFPDUX B5, BO, INC4 + fxcpmadd f9, B6, A6, f9 + nop + fxcsmadd f13, B6, A6, f13 + LFPDUX B6, BO2, INC4 + + fxcpmadd f0, A9, A7, f0 + nop + fxcsmadd f4, A9, A7, f4 + LFPDUX A6, AO2, INC4 + fxcpmadd f8, A10, A7, f8 + nop + fxcsmadd f12, A10, A7, f12 + LFPDUX A7, AO, INC4 + + fxcpmadd f1, A9, A8, f1 + nop + fxcsmadd f5, A9, A8, f5 + LFPDUX A9, BO, INC4 + fxcpmadd f9, A10, A8, f9 + nop + fxcsmadd f13, A10, A8, f13 + LFPDUX A10, BO2, INC4 + bdnz+ .L1022 + .align 4 + +.L1023: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + LFPDUX A8, AO2, INC4 + fxcpmadd f8, B2, A1, f8 + fxcsmadd f12, B2, A1, f12 + + fxcpmadd f1, B1, A2, f1 + fxcsmadd f5, B1, A2, f5 + fxcpmadd f9, B2, A2, f9 + fxcsmadd f13, B2, A2, f13 + + fxcpmadd f0, B3, A3, f0 + fxcsmadd f4, B3, A3, f4 + fxcpmadd f8, B4, A3, f8 + fxcsmadd f12, B4, A3, f12 + + fxcpmadd f1, B3, A4, f1 + fxcsmadd f5, B3, A4, f5 + fxcpmadd f9, B4, A4, f9 + fxcsmadd f13, B4, A4, f13 + + fxcpmadd f0, B5, A5, f0 + fxcsmadd f4, B5, A5, f4 + fxcpmadd f8, B6, A5, f8 + fxcsmadd f12, B6, A5, f12 + + fxcpmadd f1, B5, A6, f1 + fxcsmadd f5, B5, A6, f5 + fxcpmadd f9, B6, A6, f9 + fxcsmadd f13, B6, A6, f13 + + fxcpmadd f0, A9, A7, f0 + fxcsmadd f4, A9, A7, f4 + fxcpmadd f8, A10, A7, f8 + fxcsmadd f12, A10, A7, f12 + + fxcpmadd f1, A9, A8, f1 + fxcsmadd f5, A9, A8, f5 + fxcpmadd f9, A10, A8, f9 + fxcsmadd f13, A10, A8, f13 + .align 4 + +.L1024: + lfd AP, ALPHA(SP) +#ifdef TRMMKERNEL + fsmfp AP, AP +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 4 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP +#else + andi. 
r0, K, 3 + mtspr CTR, r0 +#endif + ble+ .L1028 + + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + bdz- .L1027 + .align 4 + +.L1026: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + fxcpmadd f8, B2, A1, f8 + fxcsmadd f12, B2, A1, f12 + LFPDUX A1, AO, INC4 + + fxcpmadd f1, B1, A2, f1 + fxcsmadd f5, B1, A2, f5 + LFPDUX B1, BO, INC4 + fxcpmadd f9, B2, A2, f9 + fxcsmadd f13, B2, A2, f13 + LFPDUX A2, AO2, INC4 + LFPDUX B2, BO2, INC4 + bdnz+ .L1026 + .align 4 + +.L1027: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + fxcpmadd f8, B2, A1, f8 + fxcsmadd f12, B2, A1, f12 + + fxcpmadd f1, B1, A2, f1 + fxcsmadd f5, B1, A2, f5 + fxcpmadd f9, B2, A2, f9 + fxcsmadd f13, B2, A2, f13 + .align 4 + +.L1028: +#ifndef TRMMKERNEL + LFDUX A1, CO1, INC + LFDUX B1, CO1, INC2 + LFDUX B3, CO2, INC + LFDUX A6, CO2, INC2 + + LFSDUX A1, CO1, INCM1 + LFSDUX B1, CO1, INC2 + LFSDUX B3, CO2, INCM1 + LFSDUX A6, CO2, INC2 + + LFDUX B5, CO3, INC + LFDUX A8, CO3, INC2 + LFDUX A2, CO4, INC + LFDUX A4, CO4, INC2 + + fxcpmadd f0, AP, f0, A1 + LFSDUX B5, CO3, INCM1 + LFSDUX A8, CO3, INC2 + + fxcpmadd f1, AP, f1, B1 + LFSDUX A2, CO4, INCM1 + LFSDUX A4, CO4, INC2 + + fxcpmadd f4, AP, f4, B3 + STFDUX f0, CO1, INCM3 + STFSDUX f0, CO1, INC + + fxcpmadd f5, AP, f5, A6 + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + + fxcpmadd f8, AP, f8, B5 + STFDUX f4, CO2, INCM3 + STFSDUX f4, CO2, INC + + fxcpmadd f9, AP, f9, A8 + STFDUX f5, CO2, INC + STFSDUX f5, CO2, INC + + fxcpmadd f12, AP, f12, A2 + STFDUX f8, CO3, INCM3 + STFSDUX f8, CO3, INC + + fxcpmadd f13, AP, f13, A4 + STFDUX f9, CO3, INC + STFSDUX f9, CO3, INC + + STFDUX f12, CO4, INCM3 + STFSDUX f12, CO4, INC + + STFDUX f13, CO4, INC + STFSDUX f13, CO4, INC +#else + fpmul f0, AP, f0 + fpmul f1, AP, f1 + + fpmul f4, AP, f4 + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + + fpmul f5, AP, f5 + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + + fpmul f8, AP, f8 + STFDUX f4, CO2, INC + STFSDUX f4, CO2, INC + + fpmul f9, AP, f9 + STFDUX f5, CO2, INC + STFSDUX f5, CO2, INC + + fpmul f12, AP, f12 + STFDUX f8, CO3, INC + STFSDUX f8, CO3, INC + + fpmul f13, AP, f13 + STFDUX f9, CO3, INC + STFSDUX f9, CO3, INC + + STFDUX f12, CO4, INC + STFSDUX f12, CO4, INC + + STFDUX f13, CO4, INC + STFSDUX f13, CO4, INC +#endif + + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -4 +#else + addi TEMP, TEMP, -4 +#endif + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 4 +#endif +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L1030: + andi. I, M, 2 + beq .L1040 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, B, - 4 * SIZE + fpmr f2, f0 + addi BO2, B, - 2 * SIZE + fpmr f3, f0 +#else + slwi TEMP, KK, 1 + BASE_SHIFT + slwi r0, KK, 2 + BASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, BO, - 4 * SIZE + fpmr f2, f0 + addi BO2, BO, 2 * SIZE + fpmr f3, f0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 4 +#endif + + srawi. r0, TEMP, 2 + mtspr CTR, r0 + ble .L1034 + +#else + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, B, - 4 * SIZE + fpmr f2, f0 + addi BO2, B, - 2 * SIZE + fpmr f3, f0 + + srawi. 
r0, K, 2 + mtspr CTR, r0 + ble .L1034 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B3, BO, INC4 + LFPDUX B4, BO2, INC4 + + LFPDUX A3, AO, INC4 + LFPDUX A5, BO, INC4 + LFPDUX A6, BO2, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A7, BO, INC4 + LFPDUX A8, BO2, INC4 + bdz- .L1033 + .align 4 + +.L1032: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + LFPDUX B1, BO, INC4 + fxcpmadd f2, B2, A1, f2 + fxcsmadd f3, B2, A1, f3 + LFPDUX B2, BO2, INC4 + LFPDUX A1, AO, INC4 + + fxcpmadd f0, B3, A2, f0 + fxcsmadd f1, B3, A2, f1 + LFPDUX B3, BO, INC4 + fxcpmadd f2, B4, A2, f2 + fxcsmadd f3, B4, A2, f3 + LFPDUX B4, BO2, INC4 + LFPDUX A2, AO2, INC4 + + fxcpmadd f0, A5, A3, f0 + fxcsmadd f1, A5, A3, f1 + LFPDUX A5, BO, INC4 + fxcpmadd f2, A6, A3, f2 + fxcsmadd f3, A6, A3, f3 + LFPDUX A6, BO2, INC4 + LFPDUX A3, AO, INC4 + + fxcpmadd f0, A7, A4, f0 + fxcsmadd f1, A7, A4, f1 + LFPDUX A7, BO, INC4 + fxcpmadd f2, A8, A4, f2 + fxcsmadd f3, A8, A4, f3 + LFPDUX A8, BO2, INC4 + LFPDUX A4, AO2, INC4 + bdnz+ .L1032 + .align 4 + +.L1033: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + fxcpmadd f2, B2, A1, f2 + fxcsmadd f3, B2, A1, f3 + + fxcpmadd f0, B3, A2, f0 + fxcsmadd f1, B3, A2, f1 + fxcpmadd f2, B4, A2, f2 + fxcsmadd f3, B4, A2, f3 + + fxcpmadd f0, A5, A3, f0 + fxcsmadd f1, A5, A3, f1 + fxcpmadd f2, A6, A3, f2 + fxcsmadd f3, A6, A3, f3 + + fxcpmadd f0, A7, A4, f0 + fxcsmadd f1, A7, A4, f1 + fxcpmadd f2, A8, A4, f2 + fxcsmadd f3, A8, A4, f3 + .align 4 + +.L1034: + lfd AP, ALPHA(SP) +#ifdef TRMMKERNEL + fsmfp AP, AP +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 4 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP +#else + andi. 
r0, K, 3 + mtspr CTR, r0 +#endif + ble+ .L1038 + + LFPDX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC2 + bdz- .L1037 + .align 4 + +.L1036: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + LFPDUX B1, BO, INC4 + fxcpmadd f2, B2, A1, f2 + fxcsmadd f3, B2, A1, f3 + LFPDX A1, AO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC2 + bdnz+ .L1036 + .align 4 + +.L1037: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + fxcpmadd f2, B2, A1, f2 + fxcsmadd f3, B2, A1, f3 + .align 4 + +.L1038: +#ifndef TRMMKERNEL + LFDUX A1, CO1, INC + LFDUX A2, CO2, INC + LFDUX A3, CO3, INC + LFDUX A4, CO4, INC + + LFSDUX A1, CO1, INC + LFSDUX A2, CO2, INC + LFSDUX A3, CO3, INC + LFSDUX A4, CO4, INC + + fxcpmadd f0, AP, f0, A1 + fxcpmadd f1, AP, f1, A2 + fxcpmadd f2, AP, f2, A3 + fxcpmadd f3, AP, f3, A4 + + STFDUX f0, CO1, INCM1 + STFSDUX f0, CO1, INC + + STFDUX f1, CO2, INCM1 + STFSDUX f1, CO2, INC + + STFDUX f2, CO3, INCM1 + STFSDUX f2, CO3, INC + + STFDUX f3, CO4, INCM1 + STFSDUX f3, CO4, INC +#else + fpmul f0, AP, f0 + fpmul f1, AP, f1 + fpmul f2, AP, f2 + fpmul f3, AP, f3 + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + + STFDUX f1, CO2, INC + STFSDUX f1, CO2, INC + + STFDUX f2, CO3, INC + STFSDUX f2, CO3, INC + + STFDUX f3, CO4, INC + STFSDUX f3, CO4, INC +#endif + + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -4 +#endif + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L1040: + andi. I, M, 1 + beq .L1049 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, B, - 4 * SIZE + fpmr f2, f0 + addi BO2, B, - 2 * SIZE + fpmr f3, f0 +#else + slwi TEMP, KK, 0 + BASE_SHIFT + slwi r0, KK, 2 + BASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, BO, - 4 * SIZE + fpmr f2, f0 + addi BO2, BO, 2 * SIZE + fpmr f3, f0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 4 +#endif + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L1044 + +#else + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, B, - 4 * SIZE + fpmr f2, f0 + addi BO2, B, - 2 * SIZE + fpmr f3, f0 + + srawi. 
r0, K, 3 + mtspr CTR, r0 + ble .L1044 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B3, BO, INC4 + LFPDUX B4, BO2, INC4 + + LFPDUX A3, AO, INC4 + LFPDUX A5, BO, INC4 + LFPDUX A6, BO2, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A7, BO, INC4 + LFPDUX A8, BO2, INC4 + bdz- .L1043 + .align 4 + +.L1042: + fxcpmadd f0, A1, B1, f0 + LFPDUX B1, BO, INC4 + fxcpmadd f1, A1, B2, f1 + LFPDUX B2, BO2, INC4 + fxcsmadd f2, A1, B3, f2 + LFPDUX B3, BO, INC4 + fxcsmadd f3, A1, B4, f3 + LFPDUX B4, BO2, INC4 + LFPDUX A1, AO, INC4 + + fxcpmadd f0, A2, A5, f0 + LFPDUX A5, BO, INC4 + fxcpmadd f1, A2, A6, f1 + LFPDUX A6, BO2, INC4 + fxcsmadd f2, A2, A7, f2 + LFPDUX A7, BO, INC4 + fxcsmadd f3, A2, A8, f3 + LFPDUX A8, BO2, INC4 + LFPDUX A2, AO2, INC4 + + fxcpmadd f0, A3, B1, f0 + LFPDUX B1, BO, INC4 + fxcpmadd f1, A3, B2, f1 + LFPDUX B2, BO2, INC4 + fxcsmadd f2, A3, B3, f2 + LFPDUX B3, BO, INC4 + fxcsmadd f3, A3, B4, f3 + LFPDUX B4, BO2, INC4 + LFPDUX A3, AO, INC4 + + fxcpmadd f0, A4, A5, f0 + LFPDUX A5, BO, INC4 + fxcpmadd f1, A4, A6, f1 + LFPDUX A6, BO2, INC4 + fxcsmadd f2, A4, A7, f2 + LFPDUX A7, BO, INC4 + fxcsmadd f3, A4, A8, f3 + LFPDUX A8, BO2, INC4 + LFPDUX A4, AO2, INC4 + bdnz+ .L1042 + .align 4 + +.L1043: + fxcpmadd f0, A1, B1, f0 + LFPDUX B1, BO, INC4 + fxcpmadd f1, A1, B2, f1 + LFPDUX B2, BO2, INC4 + fxcsmadd f2, A1, B3, f2 + LFPDUX B3, BO, INC4 + fxcsmadd f3, A1, B4, f3 + LFPDUX B4, BO2, INC4 + + fxcpmadd f0, A2, A5, f0 + LFPDUX A5, BO, INC4 + fxcpmadd f1, A2, A6, f1 + LFPDUX A6, BO2, INC4 + fxcsmadd f2, A2, A7, f2 + LFPDUX A7, BO, INC4 + fxcsmadd f3, A2, A8, f3 + LFPDUX A8, BO2, INC4 + + fxcpmadd f0, A3, B1, f0 + fxcpmadd f1, A3, B2, f1 + fxcsmadd f2, A3, B3, f2 + fxcsmadd f3, A3, B4, f3 + + fxcpmadd f0, A4, A5, f0 + fxcpmadd f1, A4, A6, f1 + fxcsmadd f2, A4, A7, f2 + fxcsmadd f3, A4, A8, f3 + .align 4 + +.L1044: + lfd AP, ALPHA(SP) +#ifdef TRMMKERNEL + fsmfp AP, AP +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 4 +#endif + andi. TEMP, TEMP, 7 + mtspr CTR, TEMP +#else + andi. r0, K, 7 + mtspr CTR, r0 +#endif + ble+ .L1048 + + LFDX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC + bdz- .L1047 + .align 4 + +.L1046: + fxcpmadd f0, A1, B1, f0 + LFPDUX B1, BO, INC4 + fxcpmadd f1, A1, B2, f1 + LFDX A1, AO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC + bdnz+ .L1046 + .align 4 + +.L1047: + fxcpmadd f0, A1, B1, f0 + fxcpmadd f1, A1, B2, f1 + .align 4 + +.L1048: +#ifndef TRMMKERNEL + LFDX A1, CO1, INC + LFDX B3, CO3, INC + LFSDX A1, CO2, INC + LFSDX B3, CO4, INC + + fpadd f0, f0, f2 + fpadd f1, f1, f3 + + fxcpmadd f0, AP, f0, A1 + fxcpmadd f1, AP, f1, B3 +#else + fpadd f0, f0, f2 + fpadd f1, f1, f3 + + fpmul f0, AP, f0 + fpmul f1, AP, f1 +#endif + + STFDUX f0, CO1, INC + STFSDUX f0, CO2, INC + STFDUX f1, CO3, INC + STFSDUX f1, CO4, INC + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -1 +#else + addi TEMP, TEMP, -4 +#endif + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 1 +#endif +#endif + .align 4 + +.L1049: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi KK, KK, 4 +#endif + + addi B, BO, 4 * SIZE + + addic. J, J, -1 + bgt+ .L1010 + .align 4 + +.L1050: + andi. 
J, N, 2 + beq .L1090 + + mr CO1, C + add CO2, C, LDC + add C, CO2, LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + addi AO, A, -2 * SIZE + + li r0, FZERO + lfpsx f0, SP, r0 + + srawi. I, M, 3 + ble .L1060 + .align 4 + +.L1051: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + fpmr f4, f0 + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f5, f0 + fpmr f2, f0 + fpmr f6, f0 +#else + slwi TEMP, KK, 3 + BASE_SHIFT + slwi r0, KK, 1 + BASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + fpmr f4, f0 + addi BO, BO, - 2 * SIZE + fpmr f1, f0 + fpmr f5, f0 + fpmr f2, f0 + fpmr f6, f0 +#endif + + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 8 +#else + addi TEMP, KK, 2 +#endif + srawi. r0, TEMP, 2 + fpmr f3, f0 + mtspr CTR, r0 + fpmr f7, f0 + ble .L1054 +#else + fpmr f4, f0 + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f5, f0 + fpmr f2, f0 + fpmr f6, f0 + + srawi. r0, K, 2 + fpmr f3, f0 + mtspr CTR, r0 + fpmr f7, f0 + ble .L1054 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX B3, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + bdz- .L1053 + .align 4 + +.L1052: + fxcpmadd f0, B1, A1, f0 + LFPDUX B4, BO, INC2 + fxcsmadd f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + nop + fxcsmadd f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + fxcpmadd f2, B1, A3, f2 + nop + fxcsmadd f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + nop + fxcsmadd f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + + fxcpmadd f0, B2, A5, f0 + LFPDUX B1, BO, INC2 + fxcsmadd f4, B2, A5, f4 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B2, A6, f1 + nop + fxcsmadd f5, B2, A6, f5 + LFPDUX A6, AO, INC2 + + fxcpmadd f2, B2, A7, f2 + nop + fxcsmadd f6, B2, A7, f6 + LFPDUX A7, AO, INC2 + fxcpmadd f3, B2, A8, f3 + nop + fxcsmadd f7, B2, A8, f7 + LFPDUX A8, AO, INC2 + + fxcpmadd f0, B3, A1, f0 + LFPDUX B2, BO, INC2 + fxcsmadd f4, B3, A1, f4 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B3, A2, f1 + nop + fxcsmadd f5, B3, A2, f5 + LFPDUX A2, AO, INC2 + + fxcpmadd f2, B3, A3, f2 + nop + fxcsmadd f6, B3, A3, f6 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B3, A4, f3 + nop + fxcsmadd f7, B3, A4, f7 + LFPDUX A4, AO, INC2 + + fxcpmadd f0, B4, A5, f0 + LFPDUX B3, BO, INC2 + fxcsmadd f4, B4, A5, f4 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B4, A6, f1 + nop + fxcsmadd f5, B4, A6, f5 + LFPDUX A6, AO, INC2 + + fxcpmadd f2, B4, A7, f2 + nop + fxcsmadd f6, B4, A7, f6 + LFPDUX A7, AO, INC2 + fxcpmadd f3, B4, A8, f3 + nop + fxcsmadd f7, B4, A8, f7 + LFPDUX A8, AO, INC2 + bdnz+ .L1052 + .align 4 + +.L1053: + fxcpmadd f0, B1, A1, f0 + LFPDUX B4, BO, INC2 + fxcsmadd f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + nop + fxcsmadd f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + fxcpmadd f2, B1, A3, f2 + nop + fxcsmadd f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + nop + fxcsmadd f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + + fxcpmadd f0, B2, A5, f0 + nop + fxcsmadd f4, B2, A5, f4 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B2, A6, f1 + nop + fxcsmadd f5, B2, A6, f5 + LFPDUX A6, AO, INC2 + + fxcpmadd f2, B2, A7, f2 + nop + fxcsmadd f6, B2, A7, f6 + LFPDUX A7, AO, INC2 + fxcpmadd f3, B2, A8, f3 + nop + fxcsmadd f7, B2, A8, f7 + LFPDUX A8, AO, INC2 + + fxcpmadd f0, B3, A1, f0 + fxcsmadd f4, B3, A1, f4 + fxcpmadd f1, B3, A2, f1 + fxcsmadd f5, B3, A2, f5 + + 
fxcpmadd f2, B3, A3, f2 + fxcsmadd f6, B3, A3, f6 + fxcpmadd f3, B3, A4, f3 + fxcsmadd f7, B3, A4, f7 + + fxcpmadd f0, B4, A5, f0 + fxcsmadd f4, B4, A5, f4 + fxcpmadd f1, B4, A6, f1 + fxcsmadd f5, B4, A6, f5 + + fxcpmadd f2, B4, A7, f2 + fxcsmadd f6, B4, A7, f6 + fxcpmadd f3, B4, A8, f3 + fxcsmadd f7, B4, A8, f7 + .align 4 + +.L1054: + lfd AP, ALPHA(SP) +#ifdef TRMMKERNEL + fsmfp AP, AP +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 8 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP +#else + andi. r0, K, 3 + mtspr CTR, r0 +#endif + ble+ .L1058 + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + bdz- .L1057 + .align 4 + +.L1056: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + fxcpmadd f2, B1, A3, f2 + fxcsmadd f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + fxcsmadd f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + LFPDUX B1, BO, INC2 + bdnz+ .L1056 + .align 4 + +.L1057: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f5, B1, A2, f5 + + fxcpmadd f2, B1, A3, f2 + fxcsmadd f6, B1, A3, f6 + fxcpmadd f3, B1, A4, f3 + fxcsmadd f7, B1, A4, f7 + .align 4 + +.L1058: +#ifndef TRMMKERNEL + LFDUX A1, CO1, INC + LFDUX B1, CO1, INC2 + LFDUX A3, CO1, INC2 + LFDUX A5, CO1, INC2 + + LFSDUX A1, CO1, INCM5 + LFSDUX B1, CO1, INC2 + LFSDUX A3, CO1, INC2 + LFSDUX A5, CO1, INC2 + + LFDUX B3, CO2, INC + LFDUX A6, CO2, INC2 + LFDUX A7, CO2, INC2 + LFDUX B2, CO2, INC2 + + fxcpmadd f0, AP, f0, A1 + LFSDUX B3, CO2, INCM5 + LFSDUX A6, CO2, INC2 + fxcpmadd f1, AP, f1, B1 + LFSDUX A7, CO2, INC2 + LFSDUX B2, CO2, INC2 + + fxcpmadd f2, AP, f2, A3 + STFDUX f0, CO1, INCM7 + STFSDUX f0, CO1, INC + + fxcpmadd f3, AP, f3, A5 + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + + fxcpmadd f4, AP, f4, B3 + STFDUX f2, CO1, INC + STFSDUX f2, CO1, INC + + fxcpmadd f5, AP, f5, A6 + STFDUX f3, CO1, INC + STFSDUX f3, CO1, INC + + fxcpmadd f6, AP, f6, A7 + STFDUX f4, CO2, INCM7 + STFSDUX f4, CO2, INC + + fxcpmadd f7, AP, f7, B2 + STFDUX f5, CO2, INC + STFSDUX f5, CO2, INC + + STFDUX f6, CO2, INC + STFSDUX f6, CO2, INC + + STFDUX f7, CO2, INC + STFSDUX f7, CO2, INC +#else + fpmul f0, AP, f0 + fpmul f1, AP, f1 + + fpmul f2, AP, f2 + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + + fpmul f3, AP, f3 + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + + fpmul f4, AP, f4 + STFDUX f2, CO1, INC + STFSDUX f2, CO1, INC + + fpmul f5, AP, f5 + STFDUX f3, CO1, INC + STFSDUX f3, CO1, INC + + fpmul f6, AP, f6 + STFDUX f4, CO2, INC + STFSDUX f4, CO2, INC + + fpmul f7, AP, f7 + STFDUX f5, CO2, INC + STFSDUX f5, CO2, INC + + STFDUX f6, CO2, INC + STFSDUX f6, CO2, INC + + STFDUX f7, CO2, INC + STFSDUX f7, CO2, INC +#endif + + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -8 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 3 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 8 +#endif +#endif + + addic. I, I, -1 + li r0, FZERO + + lfpsx f0, SP, r0 + bgt+ .L1051 + .align 4 + +.L1060: + andi. 
I, M, 4 + beq .L1070 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi BO, B, - 2 * SIZE + fpmr f1, f0 +#else + slwi TEMP, KK, 2 + BASE_SHIFT + slwi r0, KK, 1 + BASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + addi BO, BO, - 2 * SIZE + fpmr f1, f0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 2 +#endif + fpmr f2, f0 + srawi. r0, TEMP, 2 + mtspr CTR, r0 + fpmr f3, f0 + ble .L1064 +#else + srawi. r0, K, 2 + fpmr f1, f0 + addi BO, B, - 2 * SIZE + fpmr f2, f0 + mtspr CTR, r0 + fpmr f3, f0 + ble .L1064 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX B3, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX B4, BO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + bdz- .L1063 + .align 4 + +.L1062: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f2, B1, A1, f2 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f3, B1, A2, f3 + LFPDUX A2, AO, INC2 + LFPDUX B1, BO, INC2 + + fxcpmadd f0, B2, A3, f0 + fxcsmadd f2, B2, A3, f2 + LFPDUX A3, AO, INC2 + fxcpmadd f1, B2, A4, f1 + fxcsmadd f3, B2, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + + fxcpmadd f0, B3, A5, f0 + fxcsmadd f2, B3, A5, f2 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B3, A6, f1 + fxcsmadd f3, B3, A6, f3 + LFPDUX A6, AO, INC2 + LFPDUX B3, BO, INC2 + + fxcpmadd f0, B4, A7, f0 + fxcsmadd f2, B4, A7, f2 + LFPDUX A7, AO, INC2 + fxcpmadd f1, B4, A8, f1 + fxcsmadd f3, B4, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B4, BO, INC2 + bdnz+ .L1062 + .align 4 + +.L1063: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f2, B1, A1, f2 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f3, B1, A2, f3 + + fxcpmadd f0, B2, A3, f0 + fxcsmadd f2, B2, A3, f2 + fxcpmadd f1, B2, A4, f1 + fxcsmadd f3, B2, A4, f3 + + fxcpmadd f0, B3, A5, f0 + fxcsmadd f2, B3, A5, f2 + fxcpmadd f1, B3, A6, f1 + fxcsmadd f3, B3, A6, f3 + + fxcpmadd f0, B4, A7, f0 + fxcsmadd f2, B4, A7, f2 + fxcpmadd f1, B4, A8, f1 + fxcsmadd f3, B4, A8, f3 + .align 4 + +.L1064: + lfd AP, ALPHA(SP) +#ifdef TRMMKERNEL + fsmfp AP, AP +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP +#else + andi. 
r0, K, 3 + mtspr CTR, r0 +#endif + ble+ .L1068 + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + bdz- .L1067 + .align 4 + +.L1066: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f2, B1, A1, f2 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f3, B1, A2, f3 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + bdnz+ .L1066 + .align 4 + +.L1067: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f2, B1, A1, f2 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f3, B1, A2, f3 + .align 4 + +.L1068: +#ifndef TRMMKERNEL + LFDUX A1, CO1, INC + LFDUX A2, CO1, INC2 + LFDUX A3, CO2, INC + LFDUX A4, CO2, INC2 + + LFSDUX A1, CO1, INCM1 + LFSDUX A2, CO1, INC2 + LFSDUX A3, CO2, INCM1 + LFSDUX A4, CO2, INC2 + + fxcpmadd f0, AP, f0, A1 + fxcpmadd f1, AP, f1, A2 + fxcpmadd f2, AP, f2, A3 + STFDUX f0, CO1, INCM3 + STFSDUX f0, CO1, INC + + fxcpmadd f3, AP, f3, A4 + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + + STFDUX f2, CO2, INCM3 + STFSDUX f2, CO2, INC + + STFDUX f3, CO2, INC + STFSDUX f3, CO2, INC +#else + fpmul f0, AP, f0 + fpmul f1, AP, f1 + fpmul f2, AP, f2 + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + + fpmul f3, AP, f3 + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + + STFDUX f2, CO2, INC + STFSDUX f2, CO2, INC + + STFDUX f3, CO2, INC + STFSDUX f3, CO2, INC +#endif + + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -4 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 4 +#endif +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L1070: + andi. I, M, 2 + beq .L1080 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi BO, B, - 2 * SIZE + fpmr f1, f0 +#else + slwi TEMP, KK, 1 + BASE_SHIFT + slwi r0, KK, 1 + BASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + addi BO, BO, - 2 * SIZE + fpmr f1, f0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + srawi. r0, TEMP, 3 + fpmr f2, f0 + mtspr CTR, r0 + fpmr f3, f0 + ble .L1074 +#else + addi BO, B, - 2 * SIZE + fpmr f1, f0 + + srawi. 
r0, K, 3 + fpmr f2, f0 + mtspr CTR, r0 + fpmr f3, f0 + ble .L1074 +#endif + + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + + LFPDUX A5, AO, INC2 + LFPDUX B5, BO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX B6, BO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A9, BO, INC2 + LFPDUX A8, AO, INC2 + LFPDUX A10, BO, INC2 + bdz- .L1073 + .align 4 + +.L1072: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + fxcpmadd f2, B2, A2, f2 + fxcsmadd f3, B2, A2, f3 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + + fxcpmadd f0, B3, A3, f0 + fxcsmadd f1, B3, A3, f1 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + fxcpmadd f2, B4, A4, f2 + fxcsmadd f3, B4, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + + fxcpmadd f0, B5, A5, f0 + fxcsmadd f1, B5, A5, f1 + LFPDUX A5, AO, INC2 + LFPDUX B5, BO, INC2 + fxcpmadd f2, B6, A6, f2 + fxcsmadd f3, B6, A6, f3 + LFPDUX A6, AO, INC2 + LFPDUX B6, BO, INC2 + + fxcpmadd f0, A9, A7, f0 + fxcsmadd f1, A9, A7, f1 + LFPDUX A7, AO, INC2 + LFPDUX A9, BO, INC2 + fxcpmadd f2, A10, A8, f2 + fxcsmadd f3, A10, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX A10, BO, INC2 + bdnz+ .L1072 + .align 4 + +.L1073: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + fxcpmadd f2, B2, A2, f2 + fxcsmadd f3, B2, A2, f3 + + fxcpmadd f0, B3, A3, f0 + fxcsmadd f1, B3, A3, f1 + fxcpmadd f2, B4, A4, f2 + fxcsmadd f3, B4, A4, f3 + + fxcpmadd f0, B5, A5, f0 + fxcsmadd f1, B5, A5, f1 + fxcpmadd f2, B6, A6, f2 + fxcsmadd f3, B6, A6, f3 + + fxcpmadd f0, A9, A7, f0 + fxcsmadd f1, A9, A7, f1 + fxcpmadd f2, A10, A8, f2 + fxcsmadd f3, A10, A8, f3 + .align 4 + +.L1074: + lfd AP, ALPHA(SP) +#ifdef TRMMKERNEL + fsmfp AP, AP +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 7 + mtspr CTR, TEMP +#else + andi. r0, K, 7 + mtspr CTR, r0 +#endif + ble+ .L1078 + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + bdz- .L1077 + .align 4 + +.L1076: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + bdnz+ .L1076 + .align 4 + +.L1077: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + .align 4 + +.L1078: +#ifndef TRMMKERNEL + LFDUX A1, CO1, INC + LFDUX B3, CO2, INC + LFSDUX A1, CO1, INC + LFSDUX B3, CO2, INC + + fpadd f0, f0, f2 + fpadd f1, f1, f3 + + fxcpmadd f0, AP, f0, A1 + fxcpmadd f1, AP, f1, B3 + + STFDUX f0, CO1, INCM1 + STFSDUX f0, CO1, INC + STFDUX f1, CO2, INCM1 + STFSDUX f1, CO2, INC +#else + fpadd f0, f0, f2 + fpadd f1, f1, f3 + + fpmul f0, AP, f0 + fpmul f1, AP, f1 + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO2, INC + STFSDUX f1, CO2, INC +#endif + + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L1080: + andi. 
I, M, 1 + beq .L1089 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 +#else + slwi TEMP, KK, 0 + BASE_SHIFT + slwi r0, KK, 1 + BASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + addi BO, BO, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L1084 +#else + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, K, 3 + mtspr CTR, r0 + ble .L1084 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX B3, BO, INC2 + LFPDUX B4, BO, INC2 + bdz- .L1083 + .align 4 + +.L1082: + fxcpmadd f0, A1, B1, f0 + LFPDUX B1, BO, INC2 + fxcsmadd f1, A1, B2, f1 + LFPDUX B2, BO, INC2 + LFPDUX A1, AO, INC2 + fxcpmadd f2, A2, B3, f2 + LFPDUX B3, BO, INC2 + fxcsmadd f3, A2, B4, f3 + LFPDUX B4, BO, INC2 + LFPDUX A2, AO, INC2 + + fxcpmadd f0, A3, B1, f0 + LFPDUX B1, BO, INC2 + fxcsmadd f1, A3, B2, f1 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + fxcpmadd f2, A4, B3, f2 + LFPDUX B3, BO, INC2 + fxcsmadd f3, A4, B4, f3 + LFPDUX B4, BO, INC2 + LFPDUX A4, AO, INC2 + bdnz+ .L1082 + .align 4 + +.L1083: + fxcpmadd f0, A1, B1, f0 + LFPDUX B1, BO, INC2 + fxcsmadd f1, A1, B2, f1 + LFPDUX B2, BO, INC2 + fxcpmadd f2, A2, B3, f2 + LFPDUX B3, BO, INC2 + fxcsmadd f3, A2, B4, f3 + LFPDUX B4, BO, INC2 + + fxcpmadd f0, A3, B1, f0 + fxcsmadd f1, A3, B2, f1 + fxcpmadd f2, A4, B3, f2 + fxcsmadd f3, A4, B4, f3 + .align 4 + +.L1084: + lfd AP, ALPHA(SP) +#ifdef TRMMKERNEL + fsmfp AP, AP +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 7 + mtspr CTR, TEMP +#else + andi. r0, K, 7 + mtspr CTR, r0 +#endif + ble+ .L1088 + + LFDX A1, AO, INC2 + LFPDUX B1, BO, INC2 + add AO, AO, INC + bdz- .L1087 + .align 4 + +.L1086: + fxcpmadd f0, A1, B1, f0 + LFDX A1, AO, INC2 + LFPDUX B1, BO, INC2 + add AO, AO, INC + bdnz+ .L1086 + .align 4 + +.L1087: + fxcpmadd f0, A1, B1, f0 + .align 4 + +.L1088: +#ifndef TRMMKERNEL + LFDX A1, CO1, INC + LFDX A2, CO2, INC + + fpadd f0, f0, f1 + fpadd f2, f2, f3 + fsmfp A1, A2 + fpadd f0, f0, f2 + fxcpmadd f0, AP, f0, A1 +#else + fpadd f0, f0, f1 + fpadd f2, f2, f3 + fsmfp A1, A2 + fpadd f0, f0, f2 + fpmul f0, AP, f0 +#endif + + STFDUX f0, CO1, INC + STFSDUX f0, CO2, INC + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -1 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 1 +#endif +#endif + .align 4 + +.L1089: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi KK, KK, 2 +#endif + + addi B, BO, 2 * SIZE + .align 4 + +.L1090: + andi. J, N, 1 + beq .L10999 + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + mr CO1, C + addi AO, A, -2 * SIZE + + li r0, FZERO + lfpsx f0, SP, r0 + + srawi. 
I, M, 3 + ble .L10100 + .align 4 + +.L1091: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi BO, B, - 2 * SIZE + fpmr f1, f0 +#else + slwi TEMP, KK, 3 + BASE_SHIFT + slwi r0, KK, 0 + BASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + addi BO, BO, - 2 * SIZE + fpmr f1, f0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 8 +#else + addi TEMP, KK, 1 +#endif + fpmr f2, f0 + srawi. r0, TEMP, 2 + fpmr f3, f0 + mtspr CTR, r0 + ble .L1094 + +#else + srawi. r0, K, 2 + fpmr f1, f0 + addi BO, B, - 2 * SIZE + fpmr f2, f0 + fpmr f3, f0 + mtspr CTR, r0 + ble .L1094 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + bdz- .L1093 + .align 4 + +.L1092: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + fxcpmadd f2, B1, A3, f2 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + LFPDUX A4, AO, INC2 + + fxcsmadd f0, B1, A5, f0 + LFPDUX A5, AO, INC2 + fxcsmadd f1, B1, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B1, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B1, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B1, BO, INC2 + + fxcpmadd f0, B2, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B2, A2, f1 + LFPDUX A2, AO, INC2 + fxcpmadd f2, B2, A3, f2 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B2, A4, f3 + LFPDUX A4, AO, INC2 + + fxcsmadd f0, B2, A5, f0 + LFPDUX A5, AO, INC2 + fxcsmadd f1, B2, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B2, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B2, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B2, BO, INC2 + bdnz+ .L1092 + .align 4 + +.L1093: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + fxcpmadd f2, B1, A3, f2 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + LFPDUX A4, AO, INC2 + + fxcsmadd f0, B1, A5, f0 + LFPDUX A5, AO, INC2 + fxcsmadd f1, B1, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B1, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B1, A8, f3 + LFPDUX A8, AO, INC2 + + fxcpmadd f0, B2, A1, f0 + fxcpmadd f1, B2, A2, f1 + fxcpmadd f2, B2, A3, f2 + fxcpmadd f3, B2, A4, f3 + + fxcsmadd f0, B2, A5, f0 + fxcsmadd f1, B2, A6, f1 + fxcsmadd f2, B2, A7, f2 + fxcsmadd f3, B2, A8, f3 + .align 4 + +.L1094: + lfd AP, ALPHA(SP) +#ifdef TRMMKERNEL + fsmfp AP, AP +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 8 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP +#else + andi. 
r0, K, 3 + mtspr CTR, r0 +#endif + ble+ .L1098 + + LFDX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + add BO, BO, INC + bdz- .L1097 + .align 4 + +.L1096: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + fxcpmadd f2, B1, A3, f2 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + LFDX B1, BO, INC2 + LFPDUX A4, AO, INC2 + add BO, BO, INC + bdnz+ .L1096 + .align 4 + +.L1097: + fxcpmadd f0, B1, A1, f0 + fxcpmadd f1, B1, A2, f1 + fxcpmadd f2, B1, A3, f2 + fxcpmadd f3, B1, A4, f3 + .align 4 + +.L1098: +#ifndef TRMMKERNEL + LFDUX A1, CO1, INC + LFDUX B1, CO1, INC2 + LFDUX A3, CO1, INC2 + LFDUX A5, CO1, INC2 + + LFSDUX A1, CO1, INCM5 + LFSDUX B1, CO1, INC2 + LFSDUX A3, CO1, INC2 + LFSDUX A5, CO1, INC2 + + fxcpmadd f0, AP, f0, A1 + fxcpmadd f1, AP, f1, B1 + fxcpmadd f2, AP, f2, A3 + STFDUX f0, CO1, INCM7 + STFSDUX f0, CO1, INC + + fxcpmadd f3, AP, f3, A5 +#else + fpmul f0, AP, f0 + fpmul f1, AP, f1 + fpmul f2, AP, f2 + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + + fpmul f3, AP, f3 +#endif + + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + + STFDUX f2, CO1, INC + STFSDUX f2, CO1, INC + + STFDUX f3, CO1, INC + STFSDUX f3, CO1, INC + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -8 +#else + addi TEMP, TEMP, -1 +#endif + slwi r0, TEMP, 3 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 8 +#endif +#endif + + addic. I, I, -1 + li r0, FZERO + + lfpsx f0, SP, r0 + bgt+ .L1091 + .align 4 + +.L10100: + andi. I, M, 4 + beq .L10110 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 +#else + slwi TEMP, KK, 2 + BASE_SHIFT + slwi r0, KK, 0 + BASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + fpmr f1, f0 + addi BO, BO, - 2 * SIZE + fpmr f2, f0 + fpmr f3, f0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 1 +#endif + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L10104 +#else + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. 
r0, K, 3 + mtspr CTR, r0 + ble .L10104 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + LFPDUX B3, BO, INC2 + LFPDUX B4, BO, INC2 + + bdz- .L10103 + .align 4 + +.L10102: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + fxcsmadd f2, B1, A3, f2 + LFPDUX A3, AO, INC2 + fxcsmadd f3, B1, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B1, BO, INC2 + + fxcpmadd f0, B2, A5, f0 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B2, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B2, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B2, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B2, BO, INC2 + + fxcpmadd f0, B3, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B3, A2, f1 + LFPDUX A2, AO, INC2 + fxcsmadd f2, B3, A3, f2 + LFPDUX A3, AO, INC2 + fxcsmadd f3, B3, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B3, BO, INC2 + + fxcpmadd f0, B4, A5, f0 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B4, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B4, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B4, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B4, BO, INC2 + bdnz+ .L10102 + .align 4 + +.L10103: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + fxcsmadd f2, B1, A3, f2 + LFPDUX A3, AO, INC2 + fxcsmadd f3, B1, A4, f3 + LFPDUX A4, AO, INC2 + + fxcpmadd f0, B2, A5, f0 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B2, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B2, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B2, A8, f3 + LFPDUX A8, AO, INC2 + + fxcpmadd f0, B3, A1, f0 + fxcpmadd f1, B3, A2, f1 + fxcsmadd f2, B3, A3, f2 + fxcsmadd f3, B3, A4, f3 + + fxcpmadd f0, B4, A5, f0 + fxcpmadd f1, B4, A6, f1 + fxcsmadd f2, B4, A7, f2 + fxcsmadd f3, B4, A8, f3 + .align 4 + +.L10104: + lfd AP, ALPHA(SP) +#ifdef TRMMKERNEL + fsmfp AP, AP +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 7 + mtspr CTR, TEMP +#else + andi. r0, K, 7 + mtspr CTR, r0 +#endif + ble+ .L10108 + + LFPDUX A1, AO, INC2 + LFDX B1, BO, INC2 + LFPDUX A2, AO, INC2 + add BO, BO, INC + bdz- .L10107 + .align 4 + +.L10106: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFDX B1, BO, INC2 + LFPDUX A2, AO, INC2 + add BO, BO, INC + bdnz+ .L10106 + .align 4 + +.L10107: + fxcpmadd f0, B1, A1, f0 + fxcpmadd f1, B1, A2, f1 + .align 4 + +.L10108: +#ifndef TRMMKERNEL + LFDUX A1, CO1, INC + LFDUX B1, CO1, INC2 + LFSDUX A1, CO1, INCM1 + LFSDUX B1, CO1, INC2 + + fpadd f0, f0, f2 + fpadd f1, f1, f3 + + fxcpmadd f0, AP, f0, A1 + fxcpmadd f1, AP, f1, B1 + + STFDUX f0, CO1, INCM3 + STFSDUX f0, CO1, INC +#else + fpadd f0, f0, f2 + fpadd f1, f1, f3 + + fpmul f0, AP, f0 + fpmul f1, AP, f1 + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC +#endif + + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -4 +#else + addi TEMP, TEMP, -1 +#endif + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 4 +#endif +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L10110: + andi. 
I, M, 2 + beq .L10120 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 +#else + slwi TEMP, KK, 1 + BASE_SHIFT + slwi r0, KK, 0 + BASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + fpmr f1, f0 + addi BO, BO, - 2 * SIZE + fpmr f2, f0 + fpmr f3, f0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L10114 +#else + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, K, 3 + mtspr CTR, r0 + ble .L10114 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B1, BO, INC2 + + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX B3, BO, INC2 + + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + LFPDUX B4, BO, INC2 + bdz- .L10113 + .align 4 + +.L10112: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcsmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + LFPDUX B1, BO, INC2 + fxcpmadd f2, B2, A3, f2 + LFPDUX A3, AO, INC2 + fxcsmadd f3, B2, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + fxcpmadd f0, B3, A5, f0 + LFPDUX A5, AO, INC2 + fxcsmadd f1, B3, A6, f1 + LFPDUX A6, AO, INC2 + LFPDUX B3, BO, INC2 + fxcpmadd f2, B4, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B4, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B4, BO, INC2 + bdnz+ .L10112 + .align 4 + +.L10113: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A2, f1 + fxcpmadd f2, B2, A3, f2 + fxcsmadd f3, B2, A4, f3 + fxcpmadd f0, B3, A5, f0 + fxcsmadd f1, B3, A6, f1 + fxcpmadd f2, B4, A7, f2 + fxcsmadd f3, B4, A8, f3 + .align 4 + +.L10114: + lfd AP, ALPHA(SP) +#ifdef TRMMKERNEL + fsmfp AP, AP +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 7 + mtspr CTR, TEMP +#else + andi. r0, K, 7 + mtspr CTR, r0 +#endif + ble+ .L10118 + + LFPDUX A1, AO, INC2 + LFDX B1, BO, INC2 + add BO, BO, INC + bdz- .L10117 + .align 4 + +.L10116: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + LFDX B1, BO, INC2 + add BO, BO, INC + bdnz+ .L10116 + .align 4 + +.L10117: + fxcpmadd f0, B1, A1, f0 + .align 4 + +.L10118: +#ifndef TRMMKERNEL + LFDUX A1, CO1, INC + LFDUX A2, CO1, INC + + fpadd f0, f0, f1 + fpadd f2, f3, f2 + fsmfp A1, A2 + fpadd f0, f0, f2 + fxcpmadd f1, AP, f0, A1 + + li r0, FZERO + lfpsx f0, SP, r0 + + STFDUX f1, CO1, INCM1 + STFSDUX f1, CO1, INC +#else + fpadd f0, f0, f1 + fpadd f2, f3, f2 + fsmfp A1, A2 + fpadd f0, f0, f2 + fpmul f1, AP, f0 + + li r0, FZERO + lfpsx f0, SP, r0 + + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC +#endif + + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -1 +#endif + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + .align 4 + +.L10120: + andi. 
I, M, 1 + beq .L10999 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 +#else + slwi TEMP, KK, 0 + BASE_SHIFT + slwi r0, KK, 0 + BASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + fpmr f1, f0 + addi BO, BO, - 2 * SIZE + fpmr f2, f0 + fpmr f3, f0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L10124 +#else + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, K, 3 + mtspr CTR, r0 + ble .L10124 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + bdz- .L10123 + .align 4 + +.L10122: + fpmadd f0, A1, B1, f0 + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + fpmadd f1, A2, B2, f1 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + fpmadd f2, A3, B3, f2 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + fpmadd f3, A4, B4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + bdnz+ .L10122 + .align 4 + +.L10123: + fpmadd f0, A1, B1, f0 + fpmadd f1, A2, B2, f1 + fpmadd f2, A3, B3, f2 + fpmadd f3, A4, B4, f3 + .align 4 + +.L10124: + lfd AP, ALPHA(SP) +#ifdef TRMMKERNEL + fsmfp AP, AP +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 7 + mtspr CTR, TEMP +#else + andi. r0, K, 7 + mtspr CTR, r0 +#endif + ble+ .L10128 + + LFDX A1, AO, INC2 + LFDX B1, BO, INC2 + add AO, AO, INC + add BO, BO, INC + bdz- .L10127 + .align 4 + +.L10126: + fmadd f0, A1, B1, f0 + LFDX A1, AO, INC2 + LFDX B1, BO, INC2 + add AO, AO, INC + add BO, BO, INC + bdnz+ .L10126 + .align 4 + +.L10127: + fmadd f0, A1, B1, f0 + .align 4 + +.L10128: +#ifndef TRMMKERNEL + LFDX A1, CO1, INC + fpadd f0, f0, f1 + fpadd f2, f2, f3 + fpadd f0, f0, f2 + fsmtp f1, f0 + fadd f0, f0, f1 + fmadd f0, AP, f0, A1 + STFDUX f0, CO1, INC +#else + fpadd f0, f0, f1 + fpadd f2, f2, f3 + fpadd f0, f0, f2 + fsmtp f1, f0 + fadd f0, f0, f1 + fmul f0, AP, f0 + STFDUX f0, CO1, INC +#endif + .align 4 + +.L10999: + addi SP, SP, 12 + + lwzu r14, 4(SP) + lwzu r15, 4(SP) + + lwzu r16, 4(SP) + lwzu r17, 4(SP) + lwzu r18, 4(SP) + lwzu r19, 4(SP) + + lwzu r20, 4(SP) + lwzu r21, 4(SP) + lwzu r22, 4(SP) + lwzu r23, 4(SP) + + lwzu r24, 4(SP) + lwzu r25, 4(SP) + lwzu r26, 4(SP) + lwzu r27, 4(SP) + + lwzu r28, 4(SP) + lwzu r29, 4(SP) + lwzu r30, 4(SP) + lwzu r31, 4(SP) + + subi SP, SP, 12 + li r0, 16 + + lfpdux f31, SP, r0 + lfpdux f30, SP, r0 + lfpdux f29, SP, r0 + lfpdux f28, SP, r0 + lfpdux f27, SP, r0 + lfpdux f26, SP, r0 + lfpdux f25, SP, r0 + lfpdux f24, SP, r0 + lfpdux f23, SP, r0 + lfpdux f22, SP, r0 + lfpdux f21, SP, r0 + lfpdux f20, SP, r0 + lfpdux f19, SP, r0 + lfpdux f18, SP, r0 + lfpdux f17, SP, r0 + lfpdux f16, SP, r0 + lfpdux f15, SP, r0 + lfpdux f14, SP, r0 + addi SP, SP, 16 + blr + + + EPILOGUE +#endif diff --git a/kernel/power/gemm_kernel_power3.S b/kernel/power/gemm_kernel_power3.S new file mode 100644 index 0000000000..92e8e9f5f7 --- /dev/null +++ b/kernel/power/gemm_kernel_power3.S @@ -0,0 +1,1664 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. 
*/ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA 296(SP) +#define FZERO 304(SP) +#else +#define STACKSIZE 240 +#define ALPHA 224(SP) +#define FZERO 232(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#endif +#endif + +#define I r21 +#define J r22 +#define AO r23 +#define BO r24 +#define CO1 r25 +#define CO2 r26 +#define CO3 r27 +#define CO4 r28 + +#define PREA r29 +#define PREB r30 +#define PREC r31 + +#ifndef NEEDPARAM + +#ifndef DOUBLE +#include "../sparam.h" +#else +#include "../dparam.h" +#endif + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 
156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) +#endif + + stfd f1, ALPHA + stw r0, FZERO + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif + + slwi LDC, LDC, BASE_SHIFT + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + +#ifndef PREFETCHTEST + li PREA, (16 * 5 * SIZE + 16) + li PREB, (16 * 5 * SIZE + 16) + li PREC, 4 * SIZE +#else + +#ifdef linux +#ifndef __64BIT__ + mr PREA, r10 + lwz PREB, 8 + STACKSIZE(SP) + lwz PREC, 12 + STACKSIZE(SP) +#else + ld PREA, 112 + STACKSIZE(SP) + ld PREB, 120 + STACKSIZE(SP) + ld PREC, 128 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld PREA, 112 + STACKSIZE(SP) + ld PREB, 120 + STACKSIZE(SP) + ld PREC, 128 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz PREA, 60 + STACKSIZE(SP) + lwz PREB, 64 + STACKSIZE(SP) + lwz PREC, 68 + STACKSIZE(SP) +#else + lwz PREA, 56 + STACKSIZE(SP) + lwz PREB, 60 + STACKSIZE(SP) + lwz PREC, 64 + STACKSIZE(SP) +#endif +#endif +#endif + +#endif + + lfs f0, FZERO + + srawi. J, N, 2 + ble LL(40) + .align 4 + +LL(10): + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + srawi. I, M, 2 + mr AO, A + add C, CO4, LDC + ble LL(20) + .align 4 + +LL(11): + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + +#if 0 + PREFETCH_C1 + PREFETCH_C2 + PREFETCH_C3 + PREFETCH_C4 +#endif + + srawi. 
r0, K, 2 + mtspr CTR, r0 + mr BO, B + ble LL(15) + .align 4 + +LL(12): + fmadd f0, f16, f20, f0 + fmadd f4, f16, f21, f4 + LFD f28, 4 * SIZE(BO) + fmadd f8, f16, f22, f8 + fmadd f12, f16, f23, f12 + LFD f16, 8 * SIZE(AO) + + fmadd f1, f17, f20, f1 + fmadd f5, f17, f21, f5 + LFD f29, 5 * SIZE(BO) + fmadd f9, f17, f22, f9 + fmadd f13, f17, f23, f13 + LFD f17, 9 * SIZE(AO) + + fmadd f2, f18, f20, f2 + fmadd f6, f18, f21, f6 + LFD f30, 6 * SIZE(BO) + fmadd f10, f18, f22, f10 + fmadd f14, f18, f23, f14 + LFD f18, 10 * SIZE(AO) + + fmadd f3, f19, f20, f3 + fmadd f7, f19, f21, f7 + LFD f31, 7 * SIZE(BO) + fmadd f11, f19, f22, f11 + fmadd f15, f19, f23, f15 + LFD f19, 11 * SIZE(AO) + + fmadd f0, f24, f28, f0 + fmadd f4, f24, f29, f4 + LFD f20, 8 * SIZE(BO) + fmadd f8, f24, f30, f8 + fmadd f12, f24, f31, f12 + LFD f24, 12 * SIZE(AO) + + fmadd f1, f25, f28, f1 + fmadd f5, f25, f29, f5 + LFD f21, 9 * SIZE(BO) + fmadd f9, f25, f30, f9 + fmadd f13, f25, f31, f13 + LFD f25, 13 * SIZE(AO) + + fmadd f2, f26, f28, f2 + fmadd f6, f26, f29, f6 + LFD f22, 10 * SIZE(BO) + fmadd f10, f26, f30, f10 + fmadd f14, f26, f31, f14 + LFD f26, 14 * SIZE(AO) + + fmadd f3, f27, f28, f3 + fmadd f7, f27, f29, f7 + LFD f23, 11 * SIZE(BO) + fmadd f11, f27, f30, f11 + fmadd f15, f27, f31, f15 + LFD f27, 15 * SIZE(AO) + + fmadd f0, f16, f20, f0 + fmadd f4, f16, f21, f4 + LFD f28, 12 * SIZE(BO) + fmadd f8, f16, f22, f8 + fmadd f12, f16, f23, f12 + LFDU f16, 16 * SIZE(AO) + + fmadd f1, f17, f20, f1 + fmadd f5, f17, f21, f5 + LFD f29, 13 * SIZE(BO) + fmadd f9, f17, f22, f9 + fmadd f13, f17, f23, f13 + LFD f17, 1 * SIZE(AO) + + fmadd f2, f18, f20, f2 + fmadd f6, f18, f21, f6 + LFD f30, 14 * SIZE(BO) + fmadd f10, f18, f22, f10 + fmadd f14, f18, f23, f14 + LFD f18, 2 * SIZE(AO) + + fmadd f3, f19, f20, f3 + fmadd f7, f19, f21, f7 + LFD f31, 15 * SIZE(BO) + fmadd f11, f19, f22, f11 + fmadd f15, f19, f23, f15 + LFD f19, 3 * SIZE(AO) + + fmadd f0, f24, f28, f0 + fmadd f4, f24, f29, f4 + LFDU f20, 16 * SIZE(BO) + fmadd f8, f24, f30, f8 + fmadd f12, f24, f31, f12 + LFD f24, 4 * SIZE(AO) + + fmadd f1, f25, f28, f1 + fmadd f5, f25, f29, f5 + LFD f21, 1 * SIZE(BO) + fmadd f9, f25, f30, f9 + fmadd f13, f25, f31, f13 + LFD f25, 5 * SIZE(AO) + + fmadd f2, f26, f28, f2 + fmadd f6, f26, f29, f6 + LFD f22, 2 * SIZE(BO) + fmadd f10, f26, f30, f10 + fmadd f14, f26, f31, f14 + LFD f26, 6 * SIZE(AO) + + fmadd f3, f27, f28, f3 + fmadd f7, f27, f29, f7 + LFD f23, 3 * SIZE(BO) + + fmadd f11, f27, f30, f11 + fmadd f15, f27, f31, f15 + LFD f27, 7 * SIZE(AO) + bdnz LL(12) + .align 4 + +LL(15): + andi. 
r0, K, 3 + lfd f30, ALPHA + lfs f31, FZERO + mtspr CTR, r0 + ble+ LL(18) + .align 4 + +LL(16): + fmadd f0, f16, f20, f0 + fmadd f4, f16, f21, f4 + fmadd f8, f16, f22, f8 + fmadd f12, f16, f23, f12 + LFD f16, 4 * SIZE(AO) + + fmadd f1, f17, f20, f1 + fmadd f5, f17, f21, f5 + fmadd f9, f17, f22, f9 + fmadd f13, f17, f23, f13 + LFD f17, 5 * SIZE(AO) + + fmadd f2, f18, f20, f2 + fmadd f6, f18, f21, f6 + fmadd f10, f18, f22, f10 + fmadd f14, f18, f23, f14 + LFD f18, 6 * SIZE(AO) + + fmadd f3, f19, f20, f3 + LFD f20, 4 * SIZE(BO) + fmadd f7, f19, f21, f7 + LFD f21, 5 * SIZE(BO) + fmadd f11, f19, f22, f11 + LFD f22, 6 * SIZE(BO) + fmadd f15, f19, f23, f15 + LFD f19, 7 * SIZE(AO) + + LFD f23, 7 * SIZE(BO) + addi BO, BO, 4 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(16) + .align 4 + +LL(18): + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) + + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) + + fmadd f0, f0, f30, f16 + LFD f16, 0 * SIZE(CO3) + fmadd f1, f1, f30, f17 + LFD f17, 1 * SIZE(CO3) + fmadd f2, f2, f30, f18 + LFD f18, 2 * SIZE(CO3) + fmadd f3, f3, f30, f19 + LFD f19, 3 * SIZE(CO3) + + fmadd f4, f4, f30, f20 + LFD f20, 0 * SIZE(CO4) + fmadd f5, f5, f30, f21 + LFD f21, 1 * SIZE(CO4) + fmadd f6, f6, f30, f22 + LFD f22, 2 * SIZE(CO4) + fmadd f7, f7, f30, f23 + LFD f23, 3 * SIZE(CO4) + + fmadd f8, f8, f30, f16 + fmadd f9, f9, f30, f17 + STFD f0, 0 * SIZE(CO1) + + fmadd f10, f10, f30, f18 + fmadd f11, f11, f30, f19 + STFD f1, 1 * SIZE(CO1) + + fmadd f12, f12, f30, f20 + fmadd f13, f13, f30, f21 + STFD f2, 2 * SIZE(CO1) + + fmadd f14, f14, f30, f22 + fmadd f15, f15, f30, f23 + STFD f3, 3 * SIZE(CO1) + + STFD f4, 0 * SIZE(CO2) + fmr f0, f31 + fmr f1, f31 + STFD f5, 1 * SIZE(CO2) + fmr f2, f31 + fmr f3, f31 + + STFD f6, 2 * SIZE(CO2) + fmr f4, f31 + fmr f5, f31 + STFD f7, 3 * SIZE(CO2) + fmr f6, f31 + fmr f7, f31 + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + addi CO1, CO1, 4 * SIZE + fmr f8, f31 + fmr f9, f31 + + STFD f10, 2 * SIZE(CO3) + STFD f11, 3 * SIZE(CO3) + addi CO2, CO2, 4 * SIZE + fmr f10, f31 + fmr f11, f31 + + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + addi CO3, CO3, 4 * SIZE + fmr f12, f31 + fmr f13, f31 + + STFD f14, 2 * SIZE(CO4) + STFD f15, 3 * SIZE(CO4) + addi CO4, CO4, 4 * SIZE + fmr f14, f31 + fmr f15, f31 + + addic. I, I, -1 + bgt+ LL(11) + .align 4 + +LL(20): + andi. I, M, 2 + ble LL(30) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. 
r0, K, 2 + mtspr CTR, r0 + mr BO, B + ble LL(25) + .align 5 + +LL(22): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f20, f1 + fmadd f4, f16, f21, f4 + fmadd f5, f17, f21, f5 + + fmadd f8, f16, f22, f8 + fmadd f9, f17, f22, f9 + fmadd f12, f16, f23, f12 + fmadd f13, f17, f23, f13 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + fmadd f2, f18, f24, f2 + fmadd f3, f19, f24, f3 + fmadd f6, f18, f25, f6 + fmadd f7, f19, f25, f7 + + fmadd f10, f18, f26, f10 + fmadd f11, f19, f26, f11 + fmadd f14, f18, f27, f14 + fmadd f15, f19, f27, f15 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + fmadd f0, f16, f20, f0 + fmadd f1, f17, f20, f1 + fmadd f4, f16, f21, f4 + fmadd f5, f17, f21, f5 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + fmadd f8, f16, f22, f8 + fmadd f9, f17, f22, f9 + fmadd f12, f16, f23, f12 + fmadd f13, f17, f23, f13 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + fmadd f2, f18, f24, f2 + fmadd f3, f19, f24, f3 + fmadd f6, f18, f25, f6 + fmadd f7, f19, f25, f7 + + fmadd f10, f18, f26, f10 + fmadd f11, f19, f26, f11 + fmadd f14, f18, f27, f14 + fmadd f15, f19, f27, f15 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 16 * SIZE + DCBT(BO, PREB) + bdnz LL(22) + + fadd f0, f2, f0 + fadd f1, f3, f1 + fadd f4, f6, f4 + fadd f5, f7, f5 + fadd f8, f10, f8 + fadd f9, f11, f9 + fadd f12, f14, f12 + fadd f13, f15, f13 + .align 4 + +LL(25): + lfd f30, ALPHA + andi. r0, K, 3 + mtspr CTR, r0 + ble+ LL(28) + .align 4 + +LL(26): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f20, f1 + fmadd f4, f16, f21, f4 + fmadd f5, f17, f21, f5 + + fmadd f8, f16, f22, f8 + fmadd f9, f17, f22, f9 + fmadd f12, f16, f23, f12 + fmadd f13, f17, f23, f13 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(26) + .align 4 + +LL(28): + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 0 * SIZE(CO2) + LFD f19, 1 * SIZE(CO2) + + fmadd f0, f0, f30, f16 + fmadd f1, f1, f30, f17 + fmadd f4, f4, f30, f18 + fmadd f5, f5, f30, f19 + + LFD f20, 0 * SIZE(CO3) + LFD f21, 1 * SIZE(CO3) + LFD f22, 0 * SIZE(CO4) + LFD f23, 1 * SIZE(CO4) + + fmadd f8, f8, f30, f20 + fmadd f9, f9, f30, f21 + fmadd f12, f12, f30, f22 + fmadd f13, f13, f30, f23 + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + addi CO3, CO3, 2 * SIZE + addi CO4, CO4, 2 * SIZE + .align 4 + +LL(30): + andi. 
I, M, 1 + ble LL(39) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B + ble LL(35) + .align 5 + +LL(32): + fmadd f0, f16, f20, f0 + fmadd f4, f16, f21, f4 + fmadd f8, f16, f22, f8 + fmadd f12, f16, f23, f12 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + fmadd f1, f17, f24, f1 + fmadd f5, f17, f25, f5 + fmadd f9, f17, f26, f9 + fmadd f13, f17, f27, f13 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + fmadd f0, f18, f20, f0 + fmadd f4, f18, f21, f4 + fmadd f8, f18, f22, f8 + fmadd f12, f18, f23, f12 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + fmadd f1, f19, f24, f1 + fmadd f5, f19, f25, f5 + fmadd f9, f19, f26, f9 + fmadd f13, f19, f27, f13 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 16 * SIZE + DCBT(BO, PREB) + bdnz LL(32) + + fadd f0, f1, f0 + fadd f4, f5, f4 + fadd f8, f9, f8 + fadd f12, f13, f12 + .align 4 + +LL(35): + lfd f30, ALPHA + andi. r0, K, 3 + mtspr CTR, r0 + ble+ LL(38) + .align 4 + +LL(36): + fmadd f0, f16, f20, f0 + fmadd f4, f16, f21, f4 + fmadd f8, f16, f22, f8 + fmadd f12, f16, f23, f12 + + LFD f16, 1 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(36) + .align 4 + +LL(38): + LFD f16, 0 * SIZE(CO1) + LFD f18, 0 * SIZE(CO2) + LFD f20, 0 * SIZE(CO3) + LFD f22, 0 * SIZE(CO4) + + fmadd f0, f0, f30, f16 + fmadd f4, f4, f30, f18 + fmadd f8, f8, f30, f20 + fmadd f12, f12, f30, f22 + + STFD f0, 0 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f8, 0 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f12, f0 + fmr f13, f0 + .align 4 + +LL(39): + mr B, BO + addic. J, J, -1 + lfs f0, FZERO + bgt LL(10) + .align 4 + +LL(40): + mr CO1, C + add CO2, C, LDC + andi. J, N, 2 + ble LL(70) + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. I, M, 2 + add C, CO2, LDC + mr AO, A + ble LL(50) + .align 4 + +LL(41): + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + PREFETCH_C1 + PREFETCH_C2 + + srawi. 
r0, K, 2 + mtspr CTR, r0 + mr BO, B + ble LL(45) + .align 5 + +LL(42): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f20, f1 + fmadd f2, f18, f20, f2 + fmadd f3, f19, f20, f3 + + fmadd f4, f16, f21, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f18, f21, f6 + fmadd f7, f19, f21, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + fmadd f0, f16, f22, f0 + fmadd f1, f17, f22, f1 + fmadd f2, f18, f22, f2 + fmadd f3, f19, f22, f3 + + fmadd f4, f16, f23, f4 + fmadd f5, f17, f23, f5 + fmadd f6, f18, f23, f6 + fmadd f7, f19, f23, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + fmadd f0, f16, f20, f0 + fmadd f1, f17, f20, f1 + fmadd f2, f18, f20, f2 + fmadd f3, f19, f20, f3 + + fmadd f4, f16, f21, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f18, f21, f6 + fmadd f7, f19, f21, f7 + + LFD f16, 12 * SIZE(AO) + LFD f17, 13 * SIZE(AO) + LFD f18, 14 * SIZE(AO) + LFD f19, 15 * SIZE(AO) + + fmadd f0, f16, f22, f0 + fmadd f1, f17, f22, f1 + fmadd f2, f18, f22, f2 + fmadd f3, f19, f22, f3 + + fmadd f4, f16, f23, f4 + fmadd f5, f17, f23, f5 + fmadd f6, f18, f23, f6 + fmadd f7, f19, f23, f7 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 8 * SIZE + DCBT(BO, PREB) + bdnz LL(42) + .align 4 + +LL(45): + lfd f30, ALPHA + andi. r0, K, 3 + mtspr CTR, r0 + ble+ LL(48) + .align 4 + +LL(46): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f20, f1 + fmadd f2, f18, f20, f2 + fmadd f3, f19, f20, f3 + + fmadd f4, f16, f21, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f18, f21, f6 + fmadd f7, f19, f21, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(46) + .align 4 + +LL(48): + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) + + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) + + fmadd f0, f0, f30, f16 + fmadd f1, f1, f30, f17 + fmadd f2, f2, f30, f18 + fmadd f3, f3, f30, f19 + + fmadd f4, f4, f30, f20 + fmadd f5, f5, f30, f21 + fmadd f6, f6, f30, f22 + fmadd f7, f7, f30, f23 + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + addic. I, I, -1 + bgt+ LL(41) + .align 4 + +LL(50): + andi. I, M, 2 + ble LL(60) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. 
r0, K, 2 + mtspr CTR, r0 + mr BO, B + ble LL(55) + .align 5 + +LL(52): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f20, f1 + fmadd f2, f16, f21, f2 + fmadd f3, f17, f21, f3 + + fmadd f4, f18, f22, f4 + fmadd f5, f19, f22, f5 + fmadd f6, f18, f23, f6 + fmadd f7, f19, f23, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + fmadd f0, f16, f24, f0 + fmadd f1, f17, f24, f1 + fmadd f2, f16, f25, f2 + fmadd f3, f17, f25, f3 + + fmadd f4, f18, f26, f4 + fmadd f5, f19, f26, f5 + fmadd f6, f18, f27, f6 + fmadd f7, f19, f27, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + DCBT(BO, PREB) + bdnz LL(52) + .align 4 + +LL(55): + lfd f30, ALPHA + andi. r0, K, 3 + mtspr CTR, r0 + ble+ LL(58) + .align 4 + +LL(56): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f20, f1 + fmadd f2, f16, f21, f2 + fmadd f3, f17, f21, f3 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(56) + .align 4 + +LL(58): + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 0 * SIZE(CO2) + LFD f19, 1 * SIZE(CO2) + + fadd f0, f4, f0 + fadd f1, f5, f1 + fadd f2, f6, f2 + fadd f3, f7, f3 + + fmadd f0, f0, f30, f16 + fmadd f1, f1, f30, f17 + fmadd f2, f2, f30, f18 + fmadd f3, f3, f30, f19 + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + .align 4 + +LL(60): + andi. I, M, 1 + ble LL(69) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B + ble LL(65) + .align 5 + +LL(62): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f17, f22, f2 + fmadd f3, f17, f23, f3 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f19, f26, f2 + fmadd f3, f19, f27, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(62) + .align 4 + +LL(65): + lfd f30, ALPHA + andi. r0, K, 3 + mtspr CTR, r0 + ble+ LL(68) + .align 4 + +LL(66): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + + LFD f16, 1 * SIZE(AO) + + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(66) + .align 4 + +LL(68): + LFD f16, 0 * SIZE(CO1) + LFD f18, 0 * SIZE(CO2) + + fadd f0, f2, f0 + fadd f1, f3, f1 + + fmadd f0, f0, f30, f16 + fmadd f1, f1, f30, f18 + + STFD f0, 0 * SIZE(CO1) + STFD f1, 0 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + .align 4 + +LL(69): + mr B, BO + lfs f0, FZERO + .align 4 + +LL(70): + mr CO1, C + andi. 
J, N, 1 + ble LL(999) + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + srawi. I, M, 2 + mr AO, A + ble LL(80) + .align 4 + +LL(71): + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + PREFETCH_C1 + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B + ble LL(75) + .align 5 + +LL(72): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f20, f1 + fmadd f2, f18, f20, f2 + fmadd f3, f19, f20, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + fmadd f0, f16, f21, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f18, f21, f2 + fmadd f3, f19, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + fmadd f0, f16, f22, f0 + fmadd f1, f17, f22, f1 + fmadd f2, f18, f22, f2 + fmadd f3, f19, f22, f3 + + LFD f16, 12 * SIZE(AO) + LFD f17, 13 * SIZE(AO) + LFD f18, 14 * SIZE(AO) + LFD f19, 15 * SIZE(AO) + + fmadd f0, f16, f23, f0 + fmadd f1, f17, f23, f1 + fmadd f2, f18, f23, f2 + fmadd f3, f19, f23, f3 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 4 * SIZE + DCBT(BO, PREB) + bdnz LL(72) + .align 4 + +LL(75): + lfd f30, ALPHA + andi. r0, K, 3 + mtspr CTR, r0 + ble+ LL(78) + .align 4 + +LL(76): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f20, f1 + fmadd f2, f18, f20, f2 + fmadd f3, f19, f20, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 1 * SIZE(BO) + + addi BO, BO, 1 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(76) + .align 4 + +LL(78): + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) + + fmadd f0, f0, f30, f16 + fmadd f1, f1, f30, f17 + fmadd f2, f2, f30, f18 + fmadd f3, f3, f30, f19 + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + addi CO1, CO1, 4 * SIZE + addic. I, I, -1 + bgt+ LL(71) + .align 4 + +LL(80): + andi. I, M, 2 + ble LL(90) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B + ble LL(85) + .align 5 + +LL(82): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f20, f1 + fmadd f2, f18, f21, f2 + fmadd f3, f19, f21, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + fmadd f0, f16, f22, f0 + fmadd f1, f17, f22, f1 + fmadd f2, f18, f23, f2 + fmadd f3, f19, f23, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 4 * SIZE + DCBT(BO, PREB) + bdnz LL(82) + .align 4 + +LL(85): + lfd f30, ALPHA + andi. 
r0, K, 3 + mtspr CTR, r0 + ble+ LL(88) + .align 4 + +LL(86): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f20, f1 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 1 * SIZE(BO) + + addi BO, BO, 1 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(86) + .align 4 + +LL(88): + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + + fadd f0, f2, f0 + fadd f1, f3, f1 + + fmadd f0, f0, f30, f16 + fmadd f1, f1, f30, f17 + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + + addi CO1, CO1, 2 * SIZE + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + .align 4 + +LL(90): + andi. I, M, 1 + ble LL(999) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, K, 3 + mtspr CTR, r0 + mr BO, B + ble LL(95) + .align 5 + +LL(92): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f18, f22, f2 + fmadd f3, f19, f23, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f18, f22, f2 + fmadd f3, f19, f23, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(92) + .align 4 + +LL(95): + lfd f30, ALPHA + andi. r0, K, 7 + mtspr CTR, r0 + ble+ LL(98) + .align 4 + +LL(96): + fmadd f0, f16, f20, f0 + LFD f16, 1 * SIZE(AO) + LFD f20, 1 * SIZE(BO) + addi BO, BO, 1 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(96) + .align 4 + +LL(98): + LFD f16, 0 * SIZE(CO1) + + fadd f0, f1, f0 + fadd f2, f3, f2 + fadd f0, f2, f0 + + fmadd f0, f0, f30, f16 + STFD f0, 0 * SIZE(CO1) + .align 4 + +LL(999): + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/gemm_kernel_power6.S b/kernel/power/gemm_kernel_power6.S new file mode 100644 index 0000000000..b10a042dca --- /dev/null +++ b/kernel/power/gemm_kernel_power6.S @@ -0,0 +1,2667 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA 296(SP) +#define FZERO 304(SP) +#else +#define STACKSIZE 240 +#define ALPHA 224(SP) +#define FZERO 232(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#define OFFSET r6 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#define TEMP r19 +#define KK r20 +#define BB r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO1 r26 +#define CO2 r27 +#define CO3 r28 +#define CO4 r29 + +#define PREA r30 +#define PREC r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) +#ifdef TRMMKERNEL + std r20, 232(SP) + std r19, 240(SP) +#endif +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) +#ifdef TRMMKERNEL + stw r20, 188(SP) + stw r19, 192(SP) +#endif +#endif + + stfd f1, ALPHA + stw r0, FZERO + +#if defined(_AIX) || 
defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif + + slwi LDC, LDC, BASE_SHIFT + +#if defined(TRMMKERNEL) +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 112 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 60 + STACKSIZE(SP) +#else + lwz OFFSET, 56 + STACKSIZE(SP) +#endif +#endif +#endif +#endif + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + + lfs f0, FZERO + li PREA, (16 * 3) * SIZE + srawi. J, N, 2 + + li PREC, 3 * SIZE + ble LL(40) + .align 4 + +LL(10): + mr CO1, C + fmr f1, f0 + add CO2, C, LDC + fmr f2, f0 + add CO3, CO2, LDC + fmr f3, f0 + add CO4, CO3, LDC + fmr f4, f0 + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + slwi BB, K, BASE_SHIFT + 2 + fmr f5, f0 + + srawi. I, M, 2 + fmr f6, f0 + + mr AO, A + fmr f7, f0 + add C, CO4, LDC + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + ble LL(20) + .align 4 + +LL(11): +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + LFD f16, 0 * SIZE(AO) + LFD f20, 0 * SIZE(B) + LFD f17, 1 * SIZE(AO) + LFD f21, 1 * SIZE(B) + + LFD f18, 2 * SIZE(AO) + LFD f22, 2 * SIZE(B) + LFD f19, 3 * SIZE(AO) + LFD f23, 3 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, B, r0 + + LFD f16, 0 * SIZE(AO) + LFD f20, 0 * SIZE(BO) + LFD f17, 1 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + + LFD f18, 2 * SIZE(AO) + LFD f22, 2 * SIZE(BO) + LFD f19, 3 * SIZE(AO) + LFD f23, 3 * SIZE(BO) +#endif + + dcbtst CO1, PREC + dcbtst CO2, PREC + dcbtst CO3, PREC + dcbtst CO4, PREC + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 4 +#endif + srawi. TEMP, TEMP, 3 + mtspr CTR, TEMP + ble LL(15) + +#else + + LFD f16, 0 * SIZE(AO) + LFD f20, 0 * SIZE(B) + LFD f17, 1 * SIZE(AO) + LFD f21, 1 * SIZE(B) + + LFD f18, 2 * SIZE(AO) + LFD f22, 2 * SIZE(B) + LFD f19, 3 * SIZE(AO) + LFD f23, 3 * SIZE(B) + + dcbtst CO1, PREC + dcbtst CO2, PREC + dcbtst CO3, PREC + dcbtst CO4, PREC + + srawi. 
r0, K, 3 + mtctr r0 + mr BO, B + ble LL(15) +#endif + .align 4 + +LL(12): + dcbt AO, PREA + FMADD f0, f16, f20, f0 + nop + FMADD f4, f16, f21, f4 + + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + LFD f28, 4 * SIZE(BO) + LFD f29, 5 * SIZE(BO) + + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + LFD f30, 6 * SIZE(BO) + LFD f31, 7 * SIZE(BO) + + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + FMADD f2, f26, f28, f2 + FMADD f6, f26, f29, f6 + + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + LFD f24, 12 * SIZE(AO) + LFD f25, 13 * SIZE(AO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + LFD f26, 14 * SIZE(AO) + LFD f27, 15 * SIZE(AO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + FMADD f2, f26, f28, f2 + FMADD f6, f26, f29, f6 + + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + LFD f24, 20 * SIZE(AO) + LFD f25, 21 * SIZE(AO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + LFD f28, 20 * SIZE(BO) + LFD f29, 21 * SIZE(BO) + + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + LFD f26, 22 * SIZE(AO) + LFD f27, 23 * SIZE(AO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + LFD f30, 22 * SIZE(BO) + LFD f31, 23 * SIZE(BO) + + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + LFD f16, 24 * SIZE(AO) + LFD f17, 25 * SIZE(AO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + LFD f20, 24 * SIZE(BO) + LFD f21, 25 * SIZE(BO) + + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + FMADD f2, f26, f28, f2 + FMADD f6, 
f26, f29, f6 + + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + LFD f18, 26 * SIZE(AO) + LFD f19, 27 * SIZE(AO) + + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + LFD f22, 26 * SIZE(BO) + LFD f23, 27 * SIZE(BO) + + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + LFD f24, 28 * SIZE(AO) + LFD f25, 29 * SIZE(AO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + LFD f28, 28 * SIZE(BO) + LFD f29, 29 * SIZE(BO) + + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + LFD f26, 30 * SIZE(AO) + LFD f27, 31 * SIZE(AO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + LFD f30, 30 * SIZE(BO) + LFD f31, 31 * SIZE(BO) + + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + LFD f16, 32 * SIZE(AO) + LFD f17, 33 * SIZE(AO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + LFD f20, 32 * SIZE(BO) + LFD f21, 33 * SIZE(BO) + + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + LFD f18, 34 * SIZE(AO) + LFD f19, 35 * SIZE(AO) + + FMADD f2, f26, f28, f2 + FMADD f6, f26, f29, f6 + LFD f22, 34 * SIZE(BO) + LFD f23, 35 * SIZE(BO) + + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + addi AO, AO, 32 * SIZE + addi BO, BO, 32 * SIZE + bdnz LL(12) + .align 4 + +LL(15): + lfd f30, ALPHA + + dcbtst B, BB + addi BB, BB, 16 * SIZE + dcbtst B, BB + addi BB, BB, 16 * SIZE + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 4 +#endif + + andi. TEMP, TEMP, 7 + mtspr CTR, TEMP +#else + + andi. 
r0, K, 7 + mtspr CTR, r0 + +#endif + ble+ LL(18) + .align 4 + +LL(16): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 4 * SIZE + nop + bdnz LL(16) + .align 4 + +LL(18): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) + + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 + FMADD f2, f2, f30, f18 + FMADD f3, f3, f30, f19 + + FMADD f4, f4, f30, f20 + FMADD f5, f5, f30, f21 + FMADD f6, f6, f30, f22 + FMADD f7, f7, f30, f23 + + LFD f16, 0 * SIZE(CO3) + LFD f17, 1 * SIZE(CO3) + LFD f18, 2 * SIZE(CO3) + LFD f19, 3 * SIZE(CO3) + + LFD f20, 0 * SIZE(CO4) + LFD f21, 1 * SIZE(CO4) + LFD f22, 2 * SIZE(CO4) + LFD f23, 3 * SIZE(CO4) + + FMADD f8, f8, f30, f16 + FMADD f9, f9, f30, f17 + FMADD f10, f10, f30, f18 + FMADD f11, f11, f30, f19 + + FMADD f12, f12, f30, f20 + FMADD f13, f13, f30, f21 + FMADD f14, f14, f30, f22 + FMADD f15, f15, f30, f23 + +#else + FMUL f0, f0, f30 + FMUL f1, f1, f30 + FMUL f2, f2, f30 + FMUL f3, f3, f30 + + FMUL f4, f4, f30 + FMUL f5, f5, f30 + FMUL f6, f6, f30 + FMUL f7, f7, f30 + + FMUL f8, f8, f30 + FMUL f9, f9, f30 + FMUL f10, f10, f30 + FMUL f11, f11, f30 + + FMUL f12, f12, f30 + FMUL f13, f13, f30 + FMUL f14, f14, f30 + FMUL f15, f15, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f10, 2 * SIZE(CO3) + STFD f11, 3 * SIZE(CO3) + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + STFD f14, 2 * SIZE(CO4) + STFD f15, 3 * SIZE(CO4) + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + addi CO3, CO3, 4 * SIZE + addi CO4, CO4, 4 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -4 +#else + addi TEMP, TEMP, -4 +#endif + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 4 +#endif +#endif + + addic. I, I, -1 + bgt+ LL(11) + .align 4 + +LL(20): + andi. 
I, M, 2 + ble LL(30) + +#if defined(TRMMKERNEL) +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 4 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble LL(25) + .align 5 + +LL(22): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f2, f18, f24, f2 + FMADD f3, f19, f24, f3 + FMADD f6, f18, f25, f6 + FMADD f7, f19, f25, f7 + + FMADD f10, f18, f26, f10 + FMADD f11, f19, f26, f11 + FMADD f14, f18, f27, f14 + FMADD f15, f19, f27, f15 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f2, f18, f24, f2 + FMADD f3, f19, f24, f3 + FMADD f6, f18, f25, f6 + FMADD f7, f19, f25, f7 + + FMADD f10, f18, f26, f10 + FMADD f11, f19, f26, f11 + FMADD f14, f18, f27, f14 + FMADD f15, f19, f27, f15 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 16 * SIZE + bdnz LL(22) + + fadd f0, f2, f0 + fadd f1, f3, f1 + fadd f4, f6, f4 + fadd f5, f7, f5 + fadd f8, f10, f8 + fadd f9, f11, f9 + fadd f12, f14, f12 + fadd f13, f15, f13 + .align 4 + +LL(25): + lfd f30, ALPHA + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 4 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + + andi. 
r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ LL(28) + .align 4 + +LL(26): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(26) + .align 4 + +LL(28): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 0 * SIZE(CO2) + LFD f19, 1 * SIZE(CO2) + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 + FMADD f4, f4, f30, f18 + FMADD f5, f5, f30, f19 + + LFD f20, 0 * SIZE(CO3) + LFD f21, 1 * SIZE(CO3) + LFD f22, 0 * SIZE(CO4) + LFD f23, 1 * SIZE(CO4) + + FMADD f8, f8, f30, f20 + FMADD f9, f9, f30, f21 + FMADD f12, f12, f30, f22 + FMADD f13, f13, f30, f23 +#else + FMUL f0, f0, f30 + FMUL f1, f1, f30 + FMUL f4, f4, f30 + FMUL f5, f5, f30 + + FMUL f8, f8, f30 + FMUL f9, f9, f30 + FMUL f12, f12, f30 + FMUL f13, f13, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + addi CO3, CO3, 2 * SIZE + addi CO4, CO4, 2 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -4 +#endif + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + .align 4 + +LL(30): + andi. I, M, 1 + ble LL(39) + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 4 +#endif + + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. 
r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble LL(35) + .align 5 + +LL(32): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f1, f17, f24, f1 + FMADD f5, f17, f25, f5 + FMADD f9, f17, f26, f9 + FMADD f13, f17, f27, f13 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMADD f0, f18, f20, f0 + FMADD f4, f18, f21, f4 + FMADD f8, f18, f22, f8 + FMADD f12, f18, f23, f12 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f1, f19, f24, f1 + FMADD f5, f19, f25, f5 + FMADD f9, f19, f26, f9 + FMADD f13, f19, f27, f13 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 16 * SIZE + bdnz LL(32) + + fadd f0, f1, f0 + fadd f4, f5, f4 + fadd f8, f9, f8 + fadd f12, f13, f12 + .align 4 + +LL(35): + lfd f30, ALPHA +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 4 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + andi. r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ LL(38) + .align 4 + +LL(36): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f16, 1 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(36) + .align 4 + +LL(38): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f18, 0 * SIZE(CO2) + LFD f20, 0 * SIZE(CO3) + LFD f22, 0 * SIZE(CO4) + + FMADD f0, f0, f30, f16 + FMADD f4, f4, f30, f18 + FMADD f8, f8, f30, f20 + FMADD f12, f12, f30, f22 +#else + FMUL f0, f0, f30 + FMUL f4, f4, f30 + FMUL f8, f8, f30 + FMUL f12, f12, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f8, 0 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f12, f0 + fmr f13, f0 + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -1 +#else + addi TEMP, TEMP, -4 +#endif + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + .align 4 + + +LL(39): +#if defined(TRMMKERNEL) && !defined(LEFT) + addi KK, KK, 4 +#endif + + lfs f0, FZERO + + mr B, BO + addic. J, J, -1 + bgt LL(10) + .align 4 + +LL(40): + mr CO1, C + add CO2, C, LDC + andi. J, N, 2 + ble LL(70) + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. 
I, M, 2 + add C, CO2, LDC + mr AO, A + ble LL(50) + .align 4 + +LL(41): +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) +#endif + + dcbtst CO1, PREC + dcbtst CO2, PREC + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 2 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + +#else + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbtst CO1, PREC + dcbtst CO2, PREC + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble LL(45) + .align 5 + +LL(42): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + FMADD f4, f16, f23, f4 + FMADD f5, f17, f23, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 12 * SIZE(AO) + LFD f17, 13 * SIZE(AO) + LFD f18, 14 * SIZE(AO) + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + FMADD f4, f16, f23, f4 + FMADD f5, f17, f23, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(42) + .align 4 + +LL(45): + lfd f30, ALPHA +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP +#else + andi. 
r0, K, 3 + mtspr CTR, r0 +#endif + ble+ LL(48) + .align 4 + +LL(46): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(46) + .align 4 + +LL(48): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) + + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 + FMADD f2, f2, f30, f18 + FMADD f3, f3, f30, f19 + + FMADD f4, f4, f30, f20 + FMADD f5, f5, f30, f21 + FMADD f6, f6, f30, f22 + FMADD f7, f7, f30, f23 +#else + FMUL f0, f0, f30 + FMUL f1, f1, f30 + FMUL f2, f2, f30 + FMUL f3, f3, f30 + + FMUL f4, f4, f30 + FMUL f5, f5, f30 + FMUL f6, f6, f30 + FMUL f7, f7, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -4 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 4 +#endif +#endif + + addic. I, I, -1 + bgt+ LL(41) + .align 4 + +LL(50): + andi. I, M, 2 + ble LL(60) + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. 
r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble LL(55) + .align 5 + +LL(52): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f16, f21, f2 + FMADD f3, f17, f21, f3 + + FMADD f4, f18, f22, f4 + FMADD f5, f19, f22, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f16, f24, f0 + FMADD f1, f17, f24, f1 + FMADD f2, f16, f25, f2 + FMADD f3, f17, f25, f3 + + FMADD f4, f18, f26, f4 + FMADD f5, f19, f26, f5 + FMADD f6, f18, f27, f6 + FMADD f7, f19, f27, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(52) + .align 4 + +LL(55): + lfd f30, ALPHA +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + andi. r0, K, 3 + mtspr CTR, r0 +#endif + ble+ LL(58) + .align 4 + +LL(56): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f16, f21, f2 + FMADD f3, f17, f21, f3 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(56) + .align 4 + +LL(58): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 0 * SIZE(CO2) + LFD f19, 1 * SIZE(CO2) + + FADD f0, f4, f0 + FADD f1, f5, f1 + FADD f2, f6, f2 + FADD f3, f7, f3 + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 + FMADD f2, f2, f30, f18 + FMADD f3, f3, f30, f19 +#else + FADD f0, f4, f0 + FADD f1, f5, f1 + FADD f2, f6, f2 + FADD f3, f7, f3 + + FMUL f0, f0, f30 + FMUL f1, f1, f30 + FMUL f2, f2, f30 + FMUL f3, f3, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + .align 4 + +LL(60): + andi. 
I, M, 1 + ble LL(69) + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble LL(65) + .align 5 + +LL(62): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f17, f22, f2 + FMADD f3, f17, f23, f3 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f18, f24, f0 + FMADD f1, f18, f25, f1 + FMADD f2, f19, f26, f2 + FMADD f3, f19, f27, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(62) + .align 4 + +LL(65): + lfd f30, ALPHA + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + andi. r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ LL(68) + .align 4 + +LL(66): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + + LFD f16, 1 * SIZE(AO) + + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(66) + .align 4 + +LL(68): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f18, 0 * SIZE(CO2) + + FADD f0, f2, f0 + FADD f1, f3, f1 + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f18 +#else + FADD f0, f2, f0 + FADD f1, f3, f1 + + FMUL f0, f0, f30 + FMUL f1, f1, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 0 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -1 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 1 +#endif +#endif + .align 4 + +LL(69): +#if defined(TRMMKERNEL) && !defined(LEFT) + addi KK, KK, 2 +#endif + + mr B, BO + .align 4 + +LL(70): + mr CO1, C + andi. 
J, N, 1 + ble LL(999) + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + srawi. I, M, 2 + mr AO, A + ble LL(80) + .align 4 + +LL(71): +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) +#endif + + dcbtst CO1, PREC + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 1 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbtst CO1, PREC + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble LL(75) + .align 5 + +LL(72): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f21, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f21, f2 + FMADD f3, f19, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + LFD f16, 12 * SIZE(AO) + LFD f17, 13 * SIZE(AO) + LFD f18, 14 * SIZE(AO) + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f23, f0 + FMADD f1, f17, f23, f1 + FMADD f2, f18, f23, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 4 * SIZE + bdnz LL(72) + .align 4 + +LL(75): + lfd f30, ALPHA +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + andi. 
r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ LL(78) + .align 4 + +LL(76): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 1 * SIZE(BO) + + addi BO, BO, 1 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(76) + .align 4 + +LL(78): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 + FMADD f2, f2, f30, f18 + FMADD f3, f3, f30, f19 +#else + FMUL f0, f0, f30 + FMUL f1, f1, f30 + FMUL f2, f2, f30 + FMUL f3, f3, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -4 +#else + addi TEMP, TEMP, -1 +#endif + slwi r0 , TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 4 +#endif +#endif + + addi CO1, CO1, 4 * SIZE + addic. I, I, -1 + bgt+ LL(71) + .align 4 + +LL(80): + andi. I, M, 2 + ble LL(90) + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B + +#endif + ble LL(85) + .align 5 + +LL(82): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f21, f2 + FMADD f3, f19, f21, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f23, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 4 * SIZE + bdnz LL(82) + .align 4 + +LL(85): + lfd f30, ALPHA +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + + andi. 
r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ LL(88) + .align 4 + +LL(86): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 1 * SIZE(BO) + + addi BO, BO, 1 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(86) + .align 4 + +LL(88): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + + FADD f0, f2, f0 + FADD f1, f3, f1 + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 +#else + FADD f0, f2, f0 + FADD f1, f3, f1 + + FMUL f0, f0, f30 + FMUL f1, f1, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + addi CO1, CO1, 2 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -1 +#endif + slwi r0 , TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + .align 4 + +LL(90): + andi. I, M, 1 + ble LL(999) + + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + srawi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, K, 3 + mtspr CTR, r0 + mr BO, B +#endif + ble LL(95) + .align 5 + +LL(92): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(92) + .align 4 + +LL(95): + lfd f30, ALPHA + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 7 + mtspr CTR, TEMP + +#else + + andi. 
r0, K, 7 + mtspr CTR, r0 + +#endif + ble+ LL(98) + .align 4 + +LL(96): + FMADD f0, f16, f20, f0 + LFD f16, 1 * SIZE(AO) + LFD f20, 1 * SIZE(BO) + addi BO, BO, 1 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(96) + .align 4 + +LL(98): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + + FADD f0, f1, f0 + FADD f2, f3, f2 + FADD f0, f2, f0 + + FMADD f0, f0, f30, f16 +#else + FADD f0, f1, f0 + FADD f2, f3, f2 + FADD f0, f2, f0 + + FMUL f0, f0, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + .align 4 + +LL(999): + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) +#ifdef TRMMKERNEL + ld r20, 232(SP) + ld r19, 240(SP) +#endif +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) +#ifdef TRMMKERNEL + lwz r20, 188(SP) + lwz r19, 192(SP) +#endif +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/gemm_kernel_ppc440.S b/kernel/power/gemm_kernel_ppc440.S new file mode 100644 index 0000000000..5d3b3066ba --- /dev/null +++ b/kernel/power/gemm_kernel_ppc440.S @@ -0,0 +1,2470 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA 296(SP) +#define FZERO 304(SP) +#else +#define STACKSIZE 240 +#define ALPHA 224(SP) +#define FZERO 232(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#define OFFSET r6 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#define AORIG r18 +#define TEMP r19 +#define KK r20 +#define I r21 +#define J r22 +#define AO r23 +#define BO r24 +#define CO1 r25 +#define CO2 r26 +#define CO3 r27 +#define CO4 r28 + +#define PREA r29 + + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) +#if defined(TRMMKERNEL) + std r19, 240(SP) + std r18, 248(SP) +#endif +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) +#if defined(TRMMKERNEL) + stw r19, 192(SP) + stw r18, 196(SP) +#endif +#endif + + stfd f1, ALPHA + stw r0, FZERO + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif + + slwi LDC, LDC, BASE_SHIFT + +#if defined(TRMMKERNEL) +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 112 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 60 + STACKSIZE(SP) +#else + lwz OFFSET, 56 + STACKSIZE(SP) +#endif +#endif +#endif +#endif + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif + + cmpwi cr0, M, 0 + ble .L999 + cmpwi cr0, N, 0 + ble .L999 + cmpwi cr0, K, 0 + ble .L999 + + srawi. J, N, 2 + ble .L40 + .align 4 + +#define A1 f16 +#define A2 f17 +#define A3 f18 +#define A4 f19 +#define A5 f20 +#define A6 f21 +#define B1 f22 +#define B2 f23 +#define B3 f24 +#define B4 f25 +#define B5 f26 +#define B6 f27 +#define B7 f28 +#define B8 f29 +#define B9 f30 +#define B10 f31 + + +.L10: + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + srawi. 
I, M, 2 + mr AO, A + add C, CO4, LDC + ble .L20 + .align 4 + +.L11: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD A1, 0 * SIZE(AO) + LFD A2, 1 * SIZE(AO) + LFD A4, 4 * SIZE(AO) + LFD A5, 8 * SIZE(AO) + + LFD B1, 0 * SIZE(B) + LFD B2, 1 * SIZE(B) + LFD B3, 2 * SIZE(B) + LFD B4, 3 * SIZE(B) + LFD B5, 4 * SIZE(B) + LFD B6, 8 * SIZE(B) + LFD B7, 12 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, B, r0 + + LFD A1, 0 * SIZE(AO) + LFD A2, 1 * SIZE(AO) + LFD A4, 4 * SIZE(AO) + LFD A5, 8 * SIZE(AO) + + LFD B1, 0 * SIZE(BO) + LFD B2, 1 * SIZE(BO) + LFD B3, 2 * SIZE(BO) + LFD B4, 3 * SIZE(BO) + LFD B5, 4 * SIZE(BO) + LFD B6, 8 * SIZE(BO) + LFD B7, 12 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 4 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + ble .L15 + +#else + LFD A1, 0 * SIZE(AO) + LFD A2, 1 * SIZE(AO) + LFD A4, 4 * SIZE(AO) + LFD A5, 8 * SIZE(AO) + + LFD B1, 0 * SIZE(B) + LFD B2, 1 * SIZE(B) + LFD B3, 2 * SIZE(B) + LFD B4, 3 * SIZE(B) + LFD B5, 4 * SIZE(B) + LFD B6, 8 * SIZE(B) + LFD B7, 12 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B + ble .L15 +#endif + .align 4 + +.L12: + FMADD f0, A1, B1, f0 + LFD A3, 2 * SIZE(AO) + FMADD f4, A1, B2, f4 + LFD A6, 12 * SIZE(AO) + FMADD f8, A1, B3, f8 + nop + FMADD f12, A1, B4, f12 + nop + + FMADD f1, A2, B1, f1 + LFD A1, 3 * SIZE(AO) + FMADD f5, A2, B2, f5 + nop + FMADD f9, A2, B3, f9 + nop + FMADD f13, A2, B4, f13 + nop + + FMADD f2, A3, B1, f2 + nop + FMADD f6, A3, B2, f6 + LFD B8, 5 * SIZE(BO) + FMADD f10, A3, B3, f10 + LFD B9, 6 * SIZE(BO) + FMADD f14, A3, B4, f14 + LFD B10, 7 * SIZE(BO) + + FMADD f3, A1, B1, f3 + LFD A2, 5 * SIZE(AO) + FMADD f7, A1, B2, f7 + LFD B1, 16 * SIZE(BO) + FMADD f11, A1, B3, f11 + nop + FMADD f15, A1, B4, f15 + nop + + FMADD f0, A4, B5, f0 + LFD A3, 6 * SIZE(AO) + FMADD f4, A4, B8, f4 + LFD A1, 16 * SIZE(AO) + FMADD f8, A4, B9, f8 + nop + FMADD f12, A4, B10, f12 + nop + + FMADD f1, A2, B5, f1 + LFD A4, 7 * SIZE(AO) + FMADD f5, A2, B8, f5 + nop + FMADD f9, A2, B9, f9 + nop + FMADD f13, A2, B10, f13 + nop + + FMADD f2, A3, B5, f2 + nop + FMADD f6, A3, B8, f6 + LFD B2, 9 * SIZE(BO) + FMADD f10, A3, B9, f10 + LFD B3, 10 * SIZE(BO) + FMADD f14, A3, B10, f14 + LFD B4, 11 * SIZE(BO) + + FMADD f3, A4, B5, f3 + LFD A2, 9 * SIZE(AO) + FMADD f7, A4, B8, f7 + LFD B5, 20 * SIZE(BO) + FMADD f11, A4, B9, f11 + nop + FMADD f15, A4, B10, f15 + nop + + FMADD f0, A5, B6, f0 + LFD A3, 10 * SIZE(AO) + FMADD f4, A5, B2, f4 + LFD A4, 20 * SIZE(AO) + FMADD f8, A5, B3, f8 + nop + FMADD f12, A5, B4, f12 + nop + + FMADD f1, A2, B6, f1 + LFD A5, 11 * SIZE(AO) + FMADD f5, A2, B2, f5 + nop + FMADD f9, A2, B3, f9 + nop + FMADD f13, A2, B4, f13 + nop + + FMADD f2, A3, B6, f2 + nop + FMADD f6, A3, B2, f6 + LFD B8, 13 * SIZE(BO) + FMADD f10, A3, B3, f10 + LFD B9, 14 * SIZE(BO) + FMADD f14, A3, B4, f14 + LFD B10,15 * SIZE(BO) + + FMADD f3, A5, B6, f3 + LFD A2, 13 * SIZE(AO) + FMADD f7, A5, B2, f7 + LFD B6, 24 * SIZE(BO) + FMADD f11, A5, B3, f11 + nop + FMADD f15, A5, B4, f15 + nop + + FMADD f0, A6, B7, f0 + LFD A3, 14 * SIZE(AO) + FMADD f4, A6, B8, f4 + LFD A5, 24 * SIZE(AO) + FMADD f8, A6, B9, f8 + nop + FMADD f12, A6, B10, f12 + nop + + FMADD f1, A2, B7, f1 + LFD A6, 15 * SIZE(AO) + FMADD f5, A2, B8, f5 + nop + FMADD f9, A2, B9, f9 + nop + FMADD f13, A2, B10, f13 + nop + + FMADD f2, A3, B7, f2 + addi AO, 
AO, 16 * SIZE + FMADD f6, A3, B8, f6 + LFD B2, 17 * SIZE(BO) + FMADD f10, A3, B9, f10 + LFD B3, 18 * SIZE(BO) + FMADD f14, A3, B10, f14 + LFD B4, 19 * SIZE(BO) + + FMADD f3, A6, B7, f3 + LFD A2, 1 * SIZE(AO) + FMADD f7, A6, B8, f7 + LFD B7, 28 * SIZE(BO) + FMADD f11, A6, B9, f11 + addi BO, BO, 16 * SIZE + FMADD f15, A6, B10, f15 + bdnz .L12 + .align 4 + +.L15: + lfd f30, ALPHA + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 4 +#endif + + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP +#else + + andi. r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ .L18 + .align 4 + +.L16: + FMADD f0, A1, B1, f0 + LFD A3, 2 * SIZE(AO) + FMADD f4, A1, B2, f4 + FMADD f8, A1, B3, f8 + FMADD f12, A1, B4, f12 + LFD A4, 3 * SIZE(AO) + + FMADD f1, A2, B1, f1 + FMADD f5, A2, B2, f5 + FMADD f9, A2, B3, f9 + FMADD f13, A2, B4, f13 + LFDU A1, 4 * SIZE(AO) + + FMADD f2, A3, B1, f2 + FMADD f6, A3, B2, f6 + FMADD f10, A3, B3, f10 + FMADD f14, A3, B4, f14 + LFD A2, 1 * SIZE(AO) + + FMADD f3, A4, B1, f3 + LFDU B1, 4 * SIZE(BO) + FMADD f7, A4, B2, f7 + LFD B2, 1 * SIZE(BO) + FMADD f11, A4, B3, f11 + LFD B3, 2 * SIZE(BO) + FMADD f15, A4, B4, f15 + LFD B4, 3 * SIZE(BO) + bdnz .L16 + .align 4 + +.L18: +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) + + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) + + FMADD f0, f0, f30, f16 + LFD f16, 0 * SIZE(CO3) + FMADD f1, f1, f30, f17 + LFD f17, 1 * SIZE(CO3) + FMADD f2, f2, f30, f18 + LFD f18, 2 * SIZE(CO3) + FMADD f3, f3, f30, f19 + LFD f19, 3 * SIZE(CO3) + + FMADD f4, f4, f30, f20 + LFD f20, 0 * SIZE(CO4) + FMADD f5, f5, f30, f21 + LFD f21, 1 * SIZE(CO4) + FMADD f6, f6, f30, f22 + LFD f22, 2 * SIZE(CO4) + FMADD f7, f7, f30, f23 + LFD f23, 3 * SIZE(CO4) + + FMADD f8, f8, f30, f16 + FMADD f9, f9, f30, f17 + FMADD f10, f10, f30, f18 + FMADD f11, f11, f30, f19 + + FMADD f12, f12, f30, f20 + FMADD f13, f13, f30, f21 + FMADD f14, f14, f30, f22 + FMADD f15, f15, f30, f23 + +#else + + FMUL f0, f0, f30 + FMUL f1, f1, f30 + FMUL f2, f2, f30 + FMUL f3, f3, f30 + + FMUL f4, f4, f30 + FMUL f5, f5, f30 + FMUL f6, f6, f30 + FMUL f7, f7, f30 + + FMUL f8, f8, f30 + FMUL f9, f9, f30 + FMUL f10, f10, f30 + FMUL f11, f11, f30 + + FMUL f12, f12, f30 + FMUL f13, f13, f30 + FMUL f14, f14, f30 + FMUL f15, f15, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + STFD f4, 0 * SIZE(CO2) + fmr f4, f0 + STFD f5, 1 * SIZE(CO2) + fmr f5, f0 + STFD f6, 2 * SIZE(CO2) + fmr f6, f0 + STFD f7, 3 * SIZE(CO2) + fmr f7, f0 + + STFD f8, 0 * SIZE(CO3) + fmr f8, f0 + STFD f9, 1 * SIZE(CO3) + fmr f9, f0 + STFD f10, 2 * SIZE(CO3) + fmr f10, f0 + STFD f11, 3 * SIZE(CO3) + fmr f11, f0 + + STFD f12, 0 * SIZE(CO4) + fmr f12, f0 + STFD f13, 1 * SIZE(CO4) + fmr f13, f0 + STFD f14, 2 * SIZE(CO4) + fmr f14, f0 + STFD f15, 3 * SIZE(CO4) + fmr f15, f0 + + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + addi CO3, CO3, 4 * SIZE + addi CO4, CO4, 4 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -4 +#else + addi TEMP, TEMP, -4 +#endif + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 4 +#endif 
+#endif + + addic. I, I, -1 + bgt+ .L11 + .align 4 + +.L20: + andi. I, M, 2 + ble .L30 + +#if defined(TRMMKERNEL) +#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 4 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble .L25 + .align 5 + +.L22: + FMADD f0, f16, f20, f0 + nop + FMADD f1, f17, f20, f1 + LFD f20, 8 * SIZE(BO) + FMADD f4, f16, f21, f4 + nop + FMADD f5, f17, f21, f5 + LFD f21, 9 * SIZE(BO) + + FMADD f8, f16, f22, f8 + nop + FMADD f9, f17, f22, f9 + LFD f22, 10 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFD f16, 4 * SIZE(AO) + FMADD f13, f17, f23, f13 + LFD f23, 11 * SIZE(BO) + + FMADD f2, f18, f24, f2 + LFD f17, 5 * SIZE(AO) + FMADD f3, f19, f24, f3 + LFD f24, 12 * SIZE(BO) + FMADD f6, f18, f25, f6 + nop + FMADD f7, f19, f25, f7 + LFD f25, 13 * SIZE(BO) + + FMADD f10, f18, f26, f10 + nop + FMADD f11, f19, f26, f11 + LFD f26, 14 * SIZE(BO) + FMADD f14, f18, f27, f14 + LFD f18, 6 * SIZE(AO) + FMADD f15, f19, f27, f15 + LFD f27, 15 * SIZE(BO) + + FMADD f0, f16, f20, f0 + LFD f19, 7 * SIZE(AO) + FMADD f1, f17, f20, f1 + LFDU f20, 16 * SIZE(BO) + FMADD f4, f16, f21, f4 + nop + FMADD f5, f17, f21, f5 + LFD f21, 1 * SIZE(BO) + + FMADD f8, f16, f22, f8 + nop + FMADD f9, f17, f22, f9 + LFD f22, 2 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFDU f16, 8 * SIZE(AO) + FMADD f13, f17, f23, f13 + LFD f23, 3 * SIZE(BO) + + FMADD f2, f18, f24, f2 + LFD f17, 1 * SIZE(AO) + FMADD f3, f19, f24, f3 + LFD f24, 4 * SIZE(BO) + FMADD f6, f18, f25, f6 + nop + FMADD f7, f19, f25, f7 + LFD f25, 5 * SIZE(BO) + + FMADD f10, f18, f26, f10 + nop + FMADD f11, f19, f26, f11 + LFD f26, 6 * SIZE(BO) + FMADD f14, f18, f27, f14 + LFD f18, 2 * SIZE(AO) + FMADD f15, f19, f27, f15 + LFD f19, 3 * SIZE(AO) + LFD f27, 7 * SIZE(BO) + bdnz .L22 + + fadd f0, f2, f0 + fadd f1, f3, f1 + fadd f4, f6, f4 + fadd f5, f7, f5 + fadd f8, f10, f8 + fadd f9, f11, f9 + fadd f12, f14, f12 + fadd f13, f15, f13 + .align 4 + +.L25: + lfd f30, ALPHA + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 4 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + + andi. 
r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ .L28 + .align 4 + +.L26: + FMADD f0, f16, f20, f0 + nop + FMADD f1, f17, f20, f1 + LFDU f20, 4 * SIZE(BO) + FMADD f4, f16, f21, f4 + nop + FMADD f5, f17, f21, f5 + LFD f21, 1 * SIZE(BO) + + FMADD f8, f16, f22, f8 + nop + FMADD f9, f17, f22, f9 + LFD f22, 2 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFDU f16, 2 * SIZE(AO) + FMADD f13, f17, f23, f13 + LFD f17, 1 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + bdnz .L26 + .align 4 + +.L28: +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 0 * SIZE(CO2) + LFD f19, 1 * SIZE(CO2) + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 + FMADD f4, f4, f30, f18 + FMADD f5, f5, f30, f19 + + LFD f20, 0 * SIZE(CO3) + LFD f21, 1 * SIZE(CO3) + LFD f22, 0 * SIZE(CO4) + LFD f23, 1 * SIZE(CO4) + + FMADD f8, f8, f30, f20 + FMADD f9, f9, f30, f21 + FMADD f12, f12, f30, f22 + FMADD f13, f13, f30, f23 +#else + FMUL f0, f0, f30 + FMUL f1, f1, f30 + FMUL f4, f4, f30 + FMUL f5, f5, f30 + + FMUL f8, f8, f30 + FMUL f9, f9, f30 + FMUL f12, f12, f30 + FMUL f13, f13, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + addi CO3, CO3, 2 * SIZE + addi CO4, CO4, 2 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -4 +#endif + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + .align 4 + +.L30: + andi. I, M, 1 + ble .L39 + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 4 +#endif + + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. 
r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble .L35 + .align 5 + +.L32: + FMADD f0, f16, f20, f0 + LFD f20, 8 * SIZE(BO) + FMADD f4, f16, f21, f4 + LFD f21, 9 * SIZE(BO) + FMADD f8, f16, f22, f8 + LFD f22, 10 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFD f23, 11 * SIZE(BO) + LFDU f16, 4 * SIZE(AO) + + FMADD f1, f17, f24, f1 + LFD f24, 12 * SIZE(BO) + FMADD f5, f17, f25, f5 + LFD f25, 13 * SIZE(BO) + FMADD f9, f17, f26, f9 + LFD f26, 14 * SIZE(BO) + FMADD f13, f17, f27, f13 + LFD f27, 15 * SIZE(BO) + LFD f17, 1 * SIZE(AO) + + FMADD f0, f18, f20, f0 + LFDU f20, 16 * SIZE(BO) + FMADD f4, f18, f21, f4 + LFD f21, 1 * SIZE(BO) + FMADD f8, f18, f22, f8 + LFD f22, 2 * SIZE(BO) + FMADD f12, f18, f23, f12 + LFD f23, 3 * SIZE(BO) + LFD f18, 2 * SIZE(AO) + + FMADD f1, f19, f24, f1 + LFD f24, 4 * SIZE(BO) + FMADD f5, f19, f25, f5 + LFD f25, 5 * SIZE(BO) + FMADD f9, f19, f26, f9 + LFD f26, 6 * SIZE(BO) + FMADD f13, f19, f27, f13 + LFD f27, 7 * SIZE(BO) + LFD f19, 3 * SIZE(AO) + bdnz .L32 + + fadd f0, f1, f0 + fadd f4, f5, f4 + fadd f8, f9, f8 + fadd f12, f13, f12 + .align 4 + +.L35: + lfd f30, ALPHA +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 4 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + andi. r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ .L38 + .align 4 + +.L36: + FMADD f0, f16, f20, f0 + LFDU f20, 4 * SIZE(BO) + FMADD f4, f16, f21, f4 + LFD f21, 1 * SIZE(BO) + FMADD f8, f16, f22, f8 + LFD f22, 2 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFDU f16, 1 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + bdnz .L36 + .align 4 + +.L38: +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f18, 0 * SIZE(CO2) + LFD f20, 0 * SIZE(CO3) + LFD f22, 0 * SIZE(CO4) + + FMADD f0, f0, f30, f16 + FMADD f4, f4, f30, f18 + FMADD f8, f8, f30, f20 + FMADD f12, f12, f30, f22 +#else + FMUL f0, f0, f30 + FMUL f4, f4, f30 + FMUL f8, f8, f30 + FMUL f12, f12, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f8, 0 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f12, f0 + fmr f13, f0 + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -1 +#else + addi TEMP, TEMP, -4 +#endif + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + .align 4 + + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi KK, KK, 4 +#endif + + mr B, BO + addic. J, J, -1 + bgt .L10 + .align 4 + +.L40: + mr CO1, C + add CO2, C, LDC + andi. J, N, 2 + ble .L70 + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. 
I, M, 2 + add C, CO2, LDC + mr AO, A + ble .L50 + .align 4 + +.L41: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 2 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + +#else + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble .L45 + .align 5 + +.L42: + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + LFD f20, 4 * SIZE(BO) + + FMADD f4, f16, f21, f4 + LFD f16, 4 * SIZE(AO) + FMADD f5, f17, f21, f5 + LFD f17, 5 * SIZE(AO) + FMADD f6, f18, f21, f6 + LFD f18, 6 * SIZE(AO) + FMADD f7, f19, f21, f7 + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + LFD f21, 5 * SIZE(BO) + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + LFD f22, 6 * SIZE(BO) + + FMADD f4, f16, f23, f4 + LFD f16, 8 * SIZE(AO) + FMADD f5, f17, f23, f5 + LFD f17, 9 * SIZE(AO) + FMADD f6, f18, f23, f6 + LFD f18, 10 * SIZE(AO) + FMADD f7, f19, f23, f7 + LFD f19, 11 * SIZE(AO) + + FMADD f0, f16, f20, f0 + LFD f23, 7 * SIZE(BO) + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + LFDU f20, 8 * SIZE(BO) + + FMADD f4, f16, f21, f4 + LFD f16, 12 * SIZE(AO) + FMADD f5, f17, f21, f5 + LFD f17, 13 * SIZE(AO) + FMADD f6, f18, f21, f6 + LFD f18, 14 * SIZE(AO) + FMADD f7, f19, f21, f7 + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f22, f0 + LFD f21, 1 * SIZE(BO) + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + LFD f22, 2 * SIZE(BO) + + FMADD f4, f16, f23, f4 + LFDU f16, 16 * SIZE(AO) + FMADD f5, f17, f23, f5 + LFD f17, 1 * SIZE(AO) + FMADD f6, f18, f23, f6 + LFD f18, 2 * SIZE(AO) + FMADD f7, f19, f23, f7 + LFD f19, 3 * SIZE(AO) + + LFD f23, 3 * SIZE(BO) + bdnz .L42 + .align 4 + +.L45: + lfd f30, ALPHA +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP +#else + andi. 
r0, K, 3 + mtspr CTR, r0 +#endif + ble+ .L48 + .align 4 + +.L46: + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + LFDU f20, 2 * SIZE(BO) + + FMADD f4, f16, f21, f4 + LFDU f16, 4 * SIZE(AO) + FMADD f5, f17, f21, f5 + LFD f17, 1 * SIZE(AO) + FMADD f6, f18, f21, f6 + LFD f18, 2 * SIZE(AO) + FMADD f7, f19, f21, f7 + LFD f19, 3 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + bdnz .L46 + .align 4 + +.L48: +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) + + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 + FMADD f2, f2, f30, f18 + FMADD f3, f3, f30, f19 + + FMADD f4, f4, f30, f20 + FMADD f5, f5, f30, f21 + FMADD f6, f6, f30, f22 + FMADD f7, f7, f30, f23 +#else + FMUL f0, f0, f30 + FMUL f1, f1, f30 + FMUL f2, f2, f30 + FMUL f3, f3, f30 + + FMUL f4, f4, f30 + FMUL f5, f5, f30 + FMUL f6, f6, f30 + FMUL f7, f7, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -4 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 4 +#endif +#endif + + addic. I, I, -1 + bgt+ .L41 + .align 4 + +.L50: + andi. I, M, 2 + ble .L60 + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. 
r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble .L55 + .align 5 + +.L52: + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + LFDU f20, 8 * SIZE(BO) + FMADD f2, f16, f21, f2 + LFD f16, 4 * SIZE(AO) + FMADD f3, f17, f21, f3 + LFD f17, 5 * SIZE(AO) + + FMADD f4, f18, f22, f4 + LFD f21, 1 * SIZE(BO) + FMADD f5, f19, f22, f5 + LFD f22, 2 * SIZE(BO) + FMADD f6, f18, f23, f6 + LFD f18, 6 * SIZE(AO) + FMADD f7, f19, f23, f7 + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f24, f0 + LFD f23, 3 * SIZE(BO) + FMADD f1, f17, f24, f1 + LFD f24, 4 * SIZE(BO) + FMADD f2, f16, f25, f2 + LFDU f16, 8 * SIZE(AO) + FMADD f3, f17, f25, f3 + LFD f17, 1 * SIZE(AO) + + FMADD f4, f18, f26, f4 + LFD f25, 5 * SIZE(BO) + FMADD f5, f19, f26, f5 + LFD f26, 6 * SIZE(BO) + FMADD f6, f18, f27, f6 + LFD f18, 2 * SIZE(AO) + FMADD f7, f19, f27, f7 + LFD f19, 3 * SIZE(AO) + + LFD f27, 7 * SIZE(BO) + bdnz .L52 + .align 4 + +.L55: + lfd f30, ALPHA +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + andi. r0, K, 3 + mtspr CTR, r0 +#endif + ble+ .L58 + .align 4 + +.L56: + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + LFDU f20, 2 * SIZE(BO) + FMADD f2, f16, f21, f2 + LFDU f16, 2 * SIZE(AO) + FMADD f3, f17, f21, f3 + LFD f17, 1 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + bdnz .L56 + .align 4 + +.L58: +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 0 * SIZE(CO2) + LFD f19, 1 * SIZE(CO2) + + FADD f0, f4, f0 + FADD f1, f5, f1 + FADD f2, f6, f2 + FADD f3, f7, f3 + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 + FMADD f2, f2, f30, f18 + FMADD f3, f3, f30, f19 +#else + FADD f0, f4, f0 + FADD f1, f5, f1 + FADD f2, f6, f2 + FADD f3, f7, f3 + + FMUL f0, f0, f30 + FMUL f1, f1, f30 + FMUL f2, f2, f30 + FMUL f3, f3, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + .align 4 + +.L60: + andi. 
I, M, 1 + ble .L69 + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B +#endif + ble .L65 + .align 5 + +.L62: + FMADD f0, f16, f20, f0 + LFDU f20, 8 * SIZE(BO) + FMADD f1, f16, f21, f1 + LFDU f16, 4 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + FMADD f2, f17, f22, f2 + LFD f22, 2 * SIZE(BO) + FMADD f3, f17, f23, f3 + LFD f17, 1 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + + FMADD f0, f18, f24, f0 + LFD f24, 4 * SIZE(BO) + FMADD f1, f18, f25, f1 + LFD f18, 2 * SIZE(AO) + LFD f25, 5 * SIZE(BO) + FMADD f2, f19, f26, f2 + LFD f26, 6 * SIZE(BO) + FMADD f3, f19, f27, f3 + LFD f19, 3 * SIZE(AO) + LFD f27, 7 * SIZE(BO) + bdnz .L62 + .align 4 + +.L65: + lfd f30, ALPHA + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + andi. r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ .L68 + .align 4 + +.L66: + FMADD f0, f16, f20, f0 + LFDU f20, 2 * SIZE(BO) + FMADD f1, f16, f21, f1 + LFDU f16, 1 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + bdnz .L66 + .align 4 + +.L68: +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f18, 0 * SIZE(CO2) + + FADD f0, f2, f0 + FADD f1, f3, f1 + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f18 +#else + FADD f0, f2, f0 + FADD f1, f3, f1 + + FMUL f0, f0, f30 + FMUL f1, f1, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 0 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -1 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 1 +#endif +#endif + .align 4 + +.L69: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi KK, KK, 2 +#endif + + mr B, BO + .align 4 + +.L70: + mr CO1, C + andi. J, N, 1 + ble .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + srawi. 
I, M, 2 + mr AO, A + ble .L80 + .align 4 + +.L71: +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 1 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B + ble .L75 + +#endif + ble .L75 + .align 5 + +.L72: + FMADD f0, f16, f20, f0 + LFD f16, 4 * SIZE(AO) + FMADD f1, f17, f20, f1 + LFD f17, 5 * SIZE(AO) + FMADD f2, f18, f20, f2 + LFD f18, 6 * SIZE(AO) + FMADD f3, f19, f20, f3 + LFD f19, 7 * SIZE(AO) + LFDU f20, 4 * SIZE(BO) + + FMADD f0, f16, f21, f0 + LFD f16, 8 * SIZE(AO) + FMADD f1, f17, f21, f1 + LFD f17, 9 * SIZE(AO) + FMADD f2, f18, f21, f2 + LFD f18, 10 * SIZE(AO) + FMADD f3, f19, f21, f3 + LFD f19, 11 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + + FMADD f0, f16, f22, f0 + LFD f16, 12 * SIZE(AO) + FMADD f1, f17, f22, f1 + LFD f17, 13 * SIZE(AO) + FMADD f2, f18, f22, f2 + LFD f18, 14 * SIZE(AO) + FMADD f3, f19, f22, f3 + LFD f19, 15 * SIZE(AO) + LFD f22, 2 * SIZE(BO) + + FMADD f0, f16, f23, f0 + LFDU f16, 16 * SIZE(AO) + FMADD f1, f17, f23, f1 + LFD f17, 1 * SIZE(AO) + FMADD f2, f18, f23, f2 + LFD f18, 2 * SIZE(AO) + FMADD f3, f19, f23, f3 + LFD f19, 3 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + bdnz .L72 + .align 4 + +.L75: + lfd f30, ALPHA +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + andi. 
r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ .L78 + .align 4 + +.L76: + FMADD f0, f16, f20, f0 + LFDU f16, 4 * SIZE(AO) + FMADD f1, f17, f20, f1 + LFD f17, 1 * SIZE(AO) + FMADD f2, f18, f20, f2 + LFD f18, 2 * SIZE(AO) + FMADD f3, f19, f20, f3 + LFDU f20, 1 * SIZE(BO) + LFD f19, 3 * SIZE(AO) + bdnz .L76 + .align 4 + +.L78: +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 + FMADD f2, f2, f30, f18 + FMADD f3, f3, f30, f19 +#else + FMUL f0, f0, f30 + FMUL f1, f1, f30 + FMUL f2, f2, f30 + FMUL f3, f3, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -4 +#else + addi TEMP, TEMP, -1 +#endif + slwi r0 , TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 4 +#endif +#endif + + addi CO1, CO1, 4 * SIZE + addic. I, I, -1 + bgt+ .L71 + .align 4 + +.L80: + andi. I, M, 2 + ble .L90 + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, K, 2 + mtspr CTR, r0 + mr BO, B + +#endif + ble .L85 + .align 5 + +.L82: + FMADD f0, f16, f20, f0 + LFD f16, 4 * SIZE(AO) + FMADD f1, f17, f20, f1 + LFDU f20, 4 * SIZE(BO) + LFD f17, 5 * SIZE(AO) + FMADD f2, f18, f21, f2 + LFD f18, 6 * SIZE(AO) + FMADD f3, f19, f21, f3 + LFD f21, 1 * SIZE(BO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + LFDU f16, 8 * SIZE(AO) + FMADD f1, f17, f22, f1 + LFD f22, 2 * SIZE(BO) + LFD f17, 1 * SIZE(AO) + FMADD f2, f18, f23, f2 + LFD f18, 2 * SIZE(AO) + FMADD f3, f19, f23, f3 + LFD f23, 3 * SIZE(BO) + LFD f19, 3 * SIZE(AO) + bdnz .L82 + .align 4 + +.L85: + lfd f30, ALPHA +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + + andi. 
r0, K, 3 + mtspr CTR, r0 + +#endif + ble+ .L88 + .align 4 + +.L86: + FMADD f0, f16, f20, f0 + LFDU f16, 2 * SIZE(AO) + FMADD f1, f17, f20, f1 + LFDU f20, 1 * SIZE(BO) + LFD f17, 1 * SIZE(AO) + bdnz .L86 + .align 4 + +.L88: +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + + FADD f0, f2, f0 + FADD f1, f3, f1 + + FMADD f0, f0, f30, f16 + FMADD f1, f1, f30, f17 +#else + FADD f0, f2, f0 + FADD f1, f3, f1 + + FMUL f0, f0, f30 + FMUL f1, f1, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + addi CO1, CO1, 2 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -1 +#endif + slwi r0 , TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + .align 4 + +.L90: + andi. I, M, 1 + ble .L999 + + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + srawi. TEMP, TEMP, 3 + mtspr CTR, TEMP + +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, K, 3 + mtspr CTR, r0 + mr BO, B +#endif + ble .L95 + .align 5 + +.L92: + FMADD f0, f16, f20, f0 + LFD f16, 4 * SIZE(AO) + LFD f20, 4 * SIZE(BO) + FMADD f1, f17, f21, f1 + LFD f17, 5 * SIZE(AO) + LFD f21, 5 * SIZE(BO) + FMADD f2, f18, f22, f2 + LFD f18, 6 * SIZE(AO) + LFD f22, 6 * SIZE(BO) + FMADD f3, f19, f23, f3 + LFD f19, 7 * SIZE(AO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + LFDU f16, 8 * SIZE(AO) + LFDU f20, 8 * SIZE(BO) + FMADD f1, f17, f21, f1 + LFD f17, 1 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + FMADD f2, f18, f22, f2 + LFD f18, 2 * SIZE(AO) + LFD f22, 2 * SIZE(BO) + FMADD f3, f19, f23, f3 + LFD f19, 3 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + bdnz .L92 + .align 4 + +.L95: + lfd f30, ALPHA + +#if defined(TRMMKERNEL) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 7 + mtspr CTR, TEMP + +#else + + andi. 
r0, K, 7 + mtspr CTR, r0 + +#endif + ble+ .L98 + .align 4 + +.L96: + FMADD f0, f16, f20, f0 + LFDU f16, 1 * SIZE(AO) + LFDU f20, 1 * SIZE(BO) + bdnz .L96 + .align 4 + +.L98: +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + + FADD f0, f1, f0 + FADD f2, f3, f2 + FADD f0, f2, f0 + + FMADD f0, f0, f30, f16 +#else + FADD f0, f1, f0 + FADD f2, f3, f2 + FADD f0, f2, f0 + + FMUL f0, f0, f30 +#endif + + STFD f0, 0 * SIZE(CO1) + .align 4 + +.L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) +#if defined(TRMMKERNEL) || defined(TRSMKERNEL) + ld r19, 240(SP) + ld r18, 248(SP) +#endif +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) +#if defined(TRMMKERNEL) || defined(TRSMKERNEL) + lwz r19, 192(SP) + lwz r18, 196(SP) +#endif +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE diff --git a/kernel/power/gemm_ncopy_4.S b/kernel/power/gemm_ncopy_4.S new file mode 100644 index 0000000000..93c687bff8 --- /dev/null +++ b/kernel/power/gemm_ncopy_4.S @@ -0,0 +1,366 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M r3 +#define N r4 +#define A r5 +#define LDA r6 +#define B r7 + +#define AO1 r8 +#define AO2 r9 +#define AO3 r10 +#define AO4 r11 + +#define J r12 + +#define PREA r14 +#define PREB1 r15 + +#define c01 f0 +#define c02 f1 +#define c03 f2 +#define c04 f3 +#define c05 f4 +#define c06 f5 +#define c07 f6 +#define c08 f7 +#define c09 f8 +#define c10 f9 +#define c11 f10 +#define c12 f11 +#define c13 f12 +#define c14 f13 +#define c15 f14 +#define c16 f15 + +#define STACKSIZE 32 + +#ifdef CELL +#define PREFETCHSIZE 16 +#define PREFETCHWSIZE 72 +#endif + +#ifdef PPC970 +#define PREFETCHSIZE 16 +#define PREFETCHWSIZE 72 +#endif + +#ifdef PPC440 +#define PREFETCHSIZE 16 +#define PREFETCHWSIZE 72 +#endif + +#ifdef POWER4 +#define PREFETCHSIZE 16 +#define PREFETCHWSIZE 72 +#endif + +#ifdef POWER5 +#define PREFETCHSIZE 16 +#define PREFETCHWSIZE 72 +#endif + +#ifdef POWER6 +#define PREFETCHSIZE 16 +#define PREFETCHWSIZE 72 +#endif + +#ifdef PPCG4 +#define PREFETCHSIZE 16 +#define PREFETCHWSIZE 72 +#endif + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + +#ifdef __64BIT__ + std r14, 16(SP) + std r15, 24(SP) +#else + stw r14, 16(SP) + stw r15, 20(SP) +#endif + + slwi LDA, LDA, BASE_SHIFT + + li PREA, PREFETCHSIZE * SIZE + li PREB1, (PREFETCHWSIZE + 0) * SIZE + + cmpwi cr0, M, 0 + ble- LL(999) + cmpwi cr0, N, 0 + ble- LL(999) + + srawi. J, N, 2 + ble LL(20) + .align 4 + +LL(10): + mr AO1, A + add AO2, A, LDA + add AO3, AO2, LDA + add AO4, AO3, LDA + add A, AO4, LDA + + srawi. r0, M, 2 + mtspr CTR, r0 + ble LL(15) + .align 4 + +LL(12): + LFD c01, 0 * SIZE(AO1) + LFD c02, 1 * SIZE(AO1) + LFD c03, 2 * SIZE(AO1) + LFD c04, 3 * SIZE(AO1) + + LFD c05, 0 * SIZE(AO2) + LFD c06, 1 * SIZE(AO2) + LFD c07, 2 * SIZE(AO2) + LFD c08, 3 * SIZE(AO2) + + LFD c09, 0 * SIZE(AO3) + LFD c10, 1 * SIZE(AO3) + LFD c11, 2 * SIZE(AO3) + LFD c12, 3 * SIZE(AO3) + + LFD c13, 0 * SIZE(AO4) + LFD c14, 1 * SIZE(AO4) + LFD c15, 2 * SIZE(AO4) + LFD c16, 3 * SIZE(AO4) + + STFD c01, 0 * SIZE(B) + STFD c05, 1 * SIZE(B) + STFD c09, 2 * SIZE(B) + STFD c13, 3 * SIZE(B) + + STFD c02, 4 * SIZE(B) + STFD c06, 5 * SIZE(B) + STFD c10, 6 * SIZE(B) + STFD c14, 7 * SIZE(B) + + STFD c03, 8 * SIZE(B) + STFD c07, 9 * SIZE(B) + STFD c11, 10 * SIZE(B) + STFD c15, 11 * SIZE(B) + + STFD c04, 12 * SIZE(B) + STFD c08, 13 * SIZE(B) + STFD c12, 14 * SIZE(B) + STFD c16, 15 * SIZE(B) + +#ifdef POWER6 + dcbtst PREA, AO1 + dcbtst PREA, AO2 + dcbtst PREA, AO3 + dcbtst PREA, AO4 +#else + dcbt PREA, AO1 + dcbt PREA, AO2 + dcbt PREA, AO3 + dcbt PREA, AO4 +#endif + + dcbtst PREB1, B + + addi AO1, AO1, 4 * SIZE + addi AO2, AO2, 4 * SIZE + addi AO3, AO3, 4 * SIZE + addi AO4, AO4, 4 * SIZE + addi B, B, 16 * SIZE + bdnz LL(12) + .align 4 + +LL(15): + andi. r0, M, 3 + mtspr CTR, r0 + ble LL(17) + .align 4 + +LL(16): + LFD c01, 0 * SIZE(AO1) + LFD c05, 0 * SIZE(AO2) + LFD c09, 0 * SIZE(AO3) + LFD c13, 0 * SIZE(AO4) + + STFD c01, 0 * SIZE(B) + STFD c05, 1 * SIZE(B) + STFD c09, 2 * SIZE(B) + STFD c13, 3 * SIZE(B) + + addi AO1, AO1, 1 * SIZE + addi AO2, AO2, 1 * SIZE + addi AO3, AO3, 1 * SIZE + addi AO4, AO4, 1 * SIZE + addi B, B, 4 * SIZE + bdnz LL(16) + .align 4 + +LL(17): + addic. J, J, -1 + bgt LL(10) + .align 4 + +LL(20): + andi. J, N, 2 + ble LL(30) + + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + srawi. 
r0, M, 2 + mtspr CTR, r0 + ble LL(25) + .align 4 + +LL(22): + LFD c01, 0 * SIZE(AO1) + LFD c02, 1 * SIZE(AO1) + LFD c03, 2 * SIZE(AO1) + LFD c04, 3 * SIZE(AO1) + + LFD c05, 0 * SIZE(AO2) + LFD c06, 1 * SIZE(AO2) + LFD c07, 2 * SIZE(AO2) + LFD c08, 3 * SIZE(AO2) + + STFD c01, 0 * SIZE(B) + STFD c05, 1 * SIZE(B) + STFD c02, 2 * SIZE(B) + STFD c06, 3 * SIZE(B) + + STFD c03, 4 * SIZE(B) + STFD c07, 5 * SIZE(B) + STFD c04, 6 * SIZE(B) + STFD c08, 7 * SIZE(B) + + addi AO1, AO1, 4 * SIZE + addi AO2, AO2, 4 * SIZE + addi B, B, 8 * SIZE + bdnz LL(22) + .align 4 + +LL(25): + andi. r0, M, 3 + mtspr CTR, r0 + ble LL(30) + .align 4 + +LL(26): + LFD c01, 0 * SIZE(AO1) + LFD c05, 0 * SIZE(AO2) + + STFD c01, 0 * SIZE(B) + STFD c05, 1 * SIZE(B) + + addi AO1, AO1, 1 * SIZE + addi AO2, AO2, 1 * SIZE + addi B, B, 2 * SIZE + bdnz LL(26) + .align 4 + +LL(30): + andi. J, N, 1 + ble LL(999) + + mr AO1, A + + srawi. r0, M, 2 + mtspr CTR, r0 + ble LL(35) + .align 4 + +LL(32): + LFD c01, 0 * SIZE(AO1) + LFD c02, 1 * SIZE(AO1) + LFD c03, 2 * SIZE(AO1) + LFD c04, 3 * SIZE(AO1) + + STFD c01, 0 * SIZE(B) + STFD c02, 1 * SIZE(B) + STFD c03, 2 * SIZE(B) + STFD c04, 3 * SIZE(B) + + addi AO1, AO1, 4 * SIZE + addi B, B, 4 * SIZE + bdnz LL(32) + .align 4 + +LL(35): + andi. r0, M, 3 + mtspr CTR, r0 + ble LL(999) + .align 4 + +LL(36): + LFD c01, 0 * SIZE(AO1) + + STFD c01, 0 * SIZE(B) + + addi AO1, AO1, 1 * SIZE + addi B, B, 1 * SIZE + bdnz LL(36) + .align 4 + +LL(999): + li r3, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + +#ifdef __64BIT__ + ld r14, 16(SP) + ld r15, 24(SP) +#else + lwz r14, 16(SP) + lwz r15, 20(SP) +#endif + addi SP, SP, STACKSIZE + + blr + EPILOGUE diff --git a/kernel/power/gemm_ncopy_hummer_4.S b/kernel/power/gemm_ncopy_hummer_4.S new file mode 100644 index 0000000000..f05fdaae56 --- /dev/null +++ b/kernel/power/gemm_ncopy_hummer_4.S @@ -0,0 +1,798 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M r3 +#define N r4 +#define A r5 +#define LDA r6 +#define B r7 + +#define AO1 r8 +#define AO2 r9 +#define AO3 r10 +#define AO4 r11 + +#define J r12 + +#define INC r30 +#define INC2 r31 + +#define c01 f0 +#define c02 f1 +#define c03 f2 +#define c04 f3 +#define c05 f4 +#define c06 f5 +#define c07 f6 +#define c08 f7 +#define c09 f8 +#define c10 f9 +#define c11 f10 +#define c12 f11 +#define c13 f12 +#define c14 f13 +#define c15 f14 +#define c16 f15 + +#define sel_p f16 +#define sel_s f17 + +#define c17 f18 +#define c18 f19 + + + PROLOGUE + PROFCODE + + li r0, -16 + + stfpdux f14, SP, r0 + stfpdux f15, SP, r0 + + stfpdux f16, SP, r0 + stfpdux f17, SP, r0 + stfpdux f18, SP, r0 + stfpdux f19, SP, r0 + + stwu r31, -4(SP) + stwu r30, -4(SP) + + lis r9, 0x3f80 + lis r10, 0xbf80 + + stwu r9, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r9, -4(SP) + + slwi LDA, LDA, BASE_SHIFT + + cmpwi cr0, M, 0 + ble- .L99 + cmpwi cr0, N, 0 + ble- .L99 + + andi. r0, A, 2 * SIZE - 1 + bne .L100 + andi. r0, LDA, 2 * SIZE - 1 + bne .L100 + + li r0, 8 + addi SP, SP, -8 + + lfpsux sel_p, SP, r0 + lfpsux sel_s, SP, r0 + + li INC, 1 * SIZE + li INC2, 2 * SIZE + + subi A, A, 2 * SIZE + subi B, B, 2 * SIZE + + srawi. J, N, 2 + ble .L20 + .align 4 +.L11: + mr AO1, A + add AO2, A, LDA + add AO3, AO2, LDA + add AO4, AO3, LDA + add A, AO4, LDA + + srawi. r0, M, 3 + mtspr CTR, r0 + ble .L15 + .align 4 + +.L12: + LFPDUX c01, AO1, INC2 + LFXDUX c05, AO2, INC2 + LFPDUX c09, AO3, INC2 + LFXDUX c13, AO4, INC2 + + LFPDUX c02, AO1, INC2 + LFXDUX c06, AO2, INC2 + LFPDUX c10, AO3, INC2 + LFXDUX c14, AO4, INC2 + + LFPDUX c03, AO1, INC2 + LFXDUX c07, AO2, INC2 + LFPDUX c11, AO3, INC2 + LFXDUX c15, AO4, INC2 + + LFPDUX c04, AO1, INC2 + LFXDUX c08, AO2, INC2 + LFPDUX c12, AO3, INC2 + LFXDUX c16, AO4, INC2 + + fpsel c17, sel_p, c01, c05 + fpsel c18, sel_p, c09, c13 + fpsel c01, sel_s, c01, c05 + fpsel c05, sel_s, c09, c13 + + fpsel c09, sel_p, c02, c06 + fpsel c13, sel_p, c10, c14 + STFPDUX c17, B, INC2 + fpsel c02, sel_s, c02, c06 + STFPDUX c18, B, INC2 + fpsel c06, sel_s, c10, c14 + STFXDUX c01, B, INC2 + + fpsel c10, sel_p, c03, c07 + STFXDUX c05, B, INC2 + fpsel c14, sel_p, c11, c15 + STFPDUX c09, B, INC2 + fpsel c03, sel_s, c03, c07 + STFPDUX c13, B, INC2 + fpsel c07, sel_s, c11, c15 + STFXDUX c02, B, INC2 + + fpsel c11, sel_p, c04, c08 + STFXDUX c06, B, INC2 + fpsel c15, sel_p, c12, c16 + STFPDUX c10, B, INC2 + fpsel c04, sel_s, c04, c08 + STFPDUX c14, B, INC2 + fpsel c08, sel_s, c12, c16 + STFXDUX c03, B, INC2 + + STFXDUX c07, B, INC2 + STFPDUX c11, B, INC2 + STFPDUX c15, B, INC2 + STFXDUX c04, B, INC2 + STFXDUX c08, B, INC2 + bdnz .L12 + .align 4 + +.L15: + andi. r0, M, 7 + ble .L19 + + andi. 
r0, M, 4 + beq .L16 + + LFPDUX c01, AO1, INC2 + LFXDUX c05, AO2, INC2 + LFPDUX c09, AO3, INC2 + LFXDUX c13, AO4, INC2 + + LFPDUX c02, AO1, INC2 + LFXDUX c06, AO2, INC2 + LFPDUX c10, AO3, INC2 + LFXDUX c14, AO4, INC2 + + fpsel c17, sel_p, c01, c05 + fpsel c18, sel_p, c09, c13 + fpsel c01, sel_s, c01, c05 + fpsel c05, sel_s, c09, c13 + + fpsel c09, sel_p, c02, c06 + fpsel c13, sel_p, c10, c14 + STFPDUX c17, B, INC2 + fpsel c02, sel_s, c02, c06 + STFPDUX c18, B, INC2 + fpsel c06, sel_s, c10, c14 + STFXDUX c01, B, INC2 + STFXDUX c05, B, INC2 + STFPDUX c09, B, INC2 + STFPDUX c13, B, INC2 + STFXDUX c02, B, INC2 + STFXDUX c06, B, INC2 + .align 4 + +.L16: + andi. r0, M, 2 + beq .L17 + + LFPDUX c01, AO1, INC2 + LFXDUX c05, AO2, INC2 + LFPDUX c09, AO3, INC2 + LFXDUX c13, AO4, INC2 + + fpsel c17, sel_p, c01, c05 + fpsel c18, sel_p, c09, c13 + fpsel c01, sel_s, c01, c05 + fpsel c05, sel_s, c09, c13 + + STFPDUX c17, B, INC2 + STFPDUX c18, B, INC2 + STFXDUX c01, B, INC2 + STFXDUX c05, B, INC2 + .align 4 + +.L17: + andi. r0, M, 1 + beq .L19 + + LFDUX c01, AO1, INC2 + LFDUX c02, AO2, INC2 + LFDUX c03, AO3, INC2 + LFDUX c04, AO4, INC2 + + fsmfp c01, c02 + fsmfp c03, c04 + + STFPDUX c01, B, INC2 + STFPDUX c03, B, INC2 + .align 4 + +.L19: + addic. J, J, -1 + bgt .L11 + .align 4 + +.L20: + andi. J, N, 2 + ble .L30 + + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + srawi. r0, M, 3 + mtspr CTR, r0 + ble .L25 + .align 4 + +.L22: + LFPDUX c01, AO1, INC2 + LFXDUX c05, AO2, INC2 + LFPDUX c02, AO1, INC2 + LFXDUX c06, AO2, INC2 + + LFPDUX c03, AO1, INC2 + LFXDUX c07, AO2, INC2 + LFPDUX c04, AO1, INC2 + LFXDUX c08, AO2, INC2 + + fpsel c17, sel_p, c01, c05 + fpsel c01, sel_s, c01, c05 + fpsel c09, sel_p, c02, c06 + fpsel c02, sel_s, c02, c06 + + fpsel c10, sel_p, c03, c07 + fpsel c03, sel_s, c03, c07 + STFPDUX c17, B, INC2 + fpsel c11, sel_p, c04, c08 + STFXDUX c01, B, INC2 + fpsel c04, sel_s, c04, c08 + STFPDUX c09, B, INC2 + + STFXDUX c02, B, INC2 + STFPDUX c10, B, INC2 + STFXDUX c03, B, INC2 + STFPDUX c11, B, INC2 + STFXDUX c04, B, INC2 + bdnz .L22 + .align 4 + +.L25: + andi. r0, M, 7 + ble .L30 + + andi. r0, M, 4 + beq .L26 + + LFPDUX c01, AO1, INC2 + LFXDUX c05, AO2, INC2 + LFPDUX c02, AO1, INC2 + LFXDUX c06, AO2, INC2 + + fpsel c17, sel_p, c01, c05 + fpsel c01, sel_s, c01, c05 + fpsel c09, sel_p, c02, c06 + fpsel c02, sel_s, c02, c06 + + STFPDUX c17, B, INC2 + STFXDUX c01, B, INC2 + STFPDUX c09, B, INC2 + STFXDUX c02, B, INC2 + .align 4 + +.L26: + andi. r0, M, 2 + beq .L27 + + LFPDUX c01, AO1, INC2 + LFXDUX c05, AO2, INC2 + + fpsel c17, sel_p, c01, c05 + fpsel c01, sel_s, c01, c05 + + STFPDUX c17, B, INC2 + STFXDUX c01, B, INC2 + .align 4 + +.L27: + andi. r0, M, 1 + beq .L30 + + LFDUX c01, AO1, INC2 + LFDUX c02, AO2, INC2 + + fsmfp c01, c02 + STFPDUX c01, B, INC2 + .align 4 + +.L30: + andi. J, N, 1 + ble .L99 + + mr AO1, A + + srawi. r0, M, 3 + mtspr CTR, r0 + ble .L35 + .align 4 + +.L32: + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO1, INC2 + LFPDUX c03, AO1, INC2 + LFPDUX c04, AO1, INC2 + + STFPDUX c01, B, INC2 + STFPDUX c02, B, INC2 + STFPDUX c03, B, INC2 + STFPDUX c04, B, INC2 + bdnz .L32 + .align 4 + +.L35: + andi. r0, M, 7 + ble .L99 + + andi. r0, M, 4 + beq .L36 + + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO1, INC2 + + STFPDUX c01, B, INC2 + STFPDUX c02, B, INC2 + .align 4 + +.L36: + andi. r0, M, 2 + beq .L37 + + LFPDUX c01, AO1, INC2 + + STFPDUX c01, B, INC2 + .align 4 + +.L37: + andi. 
r0, M, 1 + beq .L99 + + LFDX c01, AO1, INC2 + STFDX c01, B, INC2 + .align 4 + +.L99: + addi SP, SP, 4 + + lwzu r30, 4(SP) + lwzu r31, 4(SP) + + subi SP, SP, 12 + li r0, 16 + + lfpdux f19, SP, r0 + lfpdux f18, SP, r0 + lfpdux f17, SP, r0 + lfpdux f16, SP, r0 + + lfpdux f15, SP, r0 + lfpdux f14, SP, r0 + addi SP, SP, 16 + blr + .align 4 + +.L100: + li INC, 1 * SIZE + li INC2, 2 * SIZE + + subi A, A, 1 * SIZE + subi B, B, 2 * SIZE + + srawi. J, N, 2 + ble .L120 + .align 4 +.L111: + mr AO1, A + add AO2, A, LDA + add AO3, AO2, LDA + add AO4, AO3, LDA + add A, AO4, LDA + + srawi. r0, M, 3 + mtspr CTR, r0 + ble .L115 + .align 4 + +.L112: + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + + LFDUX c09, AO1, INC + LFDUX c10, AO1, INC + LFDUX c11, AO1, INC + LFDUX c12, AO1, INC + + LFSDUX c01, AO2, INC + LFSDUX c02, AO2, INC + LFSDUX c03, AO2, INC + LFSDUX c04, AO2, INC + + LFSDUX c09, AO2, INC + LFSDUX c10, AO2, INC + LFSDUX c11, AO2, INC + LFSDUX c12, AO2, INC + + LFDUX c05, AO3, INC + LFDUX c06, AO3, INC + LFDUX c07, AO3, INC + LFDUX c08, AO3, INC + + LFDUX c13, AO3, INC + LFDUX c14, AO3, INC + LFDUX c15, AO3, INC + LFDUX c16, AO3, INC + + LFSDUX c05, AO4, INC + LFSDUX c06, AO4, INC + LFSDUX c07, AO4, INC + LFSDUX c08, AO4, INC + + LFSDUX c13, AO4, INC + LFSDUX c14, AO4, INC + LFSDUX c15, AO4, INC + LFSDUX c16, AO4, INC + + STFPDUX c01, B, INC2 + STFPDUX c05, B, INC2 + STFPDUX c02, B, INC2 + STFPDUX c06, B, INC2 + STFPDUX c03, B, INC2 + STFPDUX c07, B, INC2 + STFPDUX c04, B, INC2 + STFPDUX c08, B, INC2 + + STFPDUX c09, B, INC2 + STFPDUX c13, B, INC2 + STFPDUX c10, B, INC2 + STFPDUX c14, B, INC2 + STFPDUX c11, B, INC2 + STFPDUX c15, B, INC2 + STFPDUX c12, B, INC2 + STFPDUX c16, B, INC2 + bdnz .L112 + .align 4 + +.L115: + andi. r0, M, 7 + ble .L119 + + andi. r0, M, 4 + beq .L116 + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + + LFSDUX c01, AO2, INC + LFSDUX c02, AO2, INC + LFSDUX c03, AO2, INC + LFSDUX c04, AO2, INC + + LFDUX c05, AO3, INC + LFDUX c06, AO3, INC + LFDUX c07, AO3, INC + LFDUX c08, AO3, INC + + LFSDUX c05, AO4, INC + LFSDUX c06, AO4, INC + LFSDUX c07, AO4, INC + LFSDUX c08, AO4, INC + + STFPDUX c01, B, INC2 + STFPDUX c05, B, INC2 + STFPDUX c02, B, INC2 + STFPDUX c06, B, INC2 + STFPDUX c03, B, INC2 + STFPDUX c07, B, INC2 + STFPDUX c04, B, INC2 + STFPDUX c08, B, INC2 + .align 4 + +.L116: + andi. r0, M, 2 + beq .L117 + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + + LFSDUX c01, AO2, INC + LFSDUX c02, AO2, INC + + LFDUX c05, AO3, INC + LFDUX c06, AO3, INC + + LFSDUX c05, AO4, INC + LFSDUX c06, AO4, INC + + STFPDUX c01, B, INC2 + STFPDUX c05, B, INC2 + STFPDUX c02, B, INC2 + STFPDUX c06, B, INC2 + .align 4 + +.L117: + andi. r0, M, 1 + beq .L119 + + LFDUX c01, AO1, INC + LFDUX c05, AO3, INC + + nop + nop + + LFSDUX c01, AO2, INC + LFSDUX c05, AO4, INC + + STFPDUX c01, B, INC2 + STFPDUX c05, B, INC2 + .align 4 + +.L119: + addic. J, J, -1 + bgt .L111 + .align 4 + +.L120: + andi. J, N, 2 + ble .L130 + + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + srawi. 
r0, M, 3 + mtspr CTR, r0 + ble .L125 + .align 4 + +.L122: + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + + LFDUX c09, AO1, INC + LFDUX c10, AO1, INC + LFDUX c11, AO1, INC + LFDUX c12, AO1, INC + + LFSDUX c01, AO2, INC + LFSDUX c02, AO2, INC + LFSDUX c03, AO2, INC + LFSDUX c04, AO2, INC + + LFSDUX c09, AO2, INC + LFSDUX c10, AO2, INC + LFSDUX c11, AO2, INC + LFSDUX c12, AO2, INC + + STFPDUX c01, B, INC2 + STFPDUX c02, B, INC2 + STFPDUX c03, B, INC2 + STFPDUX c04, B, INC2 + + STFPDUX c09, B, INC2 + STFPDUX c10, B, INC2 + STFPDUX c11, B, INC2 + STFPDUX c12, B, INC2 + bdnz .L122 + .align 4 + +.L125: + andi. r0, M, 7 + ble .L130 + + andi. r0, M, 4 + beq .L126 + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + + LFSDUX c01, AO2, INC + LFSDUX c02, AO2, INC + LFSDUX c03, AO2, INC + LFSDUX c04, AO2, INC + + STFPDUX c01, B, INC2 + STFPDUX c02, B, INC2 + STFPDUX c03, B, INC2 + STFPDUX c04, B, INC2 + .align 4 + +.L126: + andi. r0, M, 2 + beq .L127 + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + + LFSDUX c01, AO2, INC + LFSDUX c02, AO2, INC + + STFPDUX c01, B, INC2 + STFPDUX c02, B, INC2 + .align 4 + +.L127: + andi. r0, M, 1 + beq .L130 + + LFDUX c01, AO1, INC + LFDUX c02, AO2, INC + + fsmfp c01, c02 + STFPDUX c01, B, INC2 + .align 4 + +.L130: + andi. J, N, 1 + ble .L999 + + mr AO1, A + + srawi. r0, M, 3 + mtspr CTR, r0 + ble .L135 + .align 4 + +.L132: + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + + LFDUX c05, AO1, INC + LFDUX c06, AO1, INC + LFDUX c07, AO1, INC + LFDUX c08, AO1, INC + + fsmfp c01, c02 + fsmfp c03, c04 + fsmfp c05, c06 + fsmfp c07, c08 + + STFPDUX c01, B, INC2 + STFPDUX c03, B, INC2 + STFPDUX c05, B, INC2 + STFPDUX c07, B, INC2 + bdnz .L132 + .align 4 + +.L135: + andi. r0, M, 7 + ble .L999 + + andi. r0, M, 4 + beq .L136 + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + + fsmfp c01, c02 + fsmfp c03, c04 + + STFPDUX c01, B, INC2 + STFPDUX c03, B, INC2 + .align 4 + +.L136: + andi. r0, M, 2 + beq .L137 + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + + fsmfp c01, c02 + STFPDUX c01, B, INC2 + .align 4 + +.L137: + andi. r0, M, 1 + beq .L999 + + LFDX c01, AO1, INC + STFDX c01, B, INC2 + .align 4 + +.L999: + addi SP, SP, 12 + + lwzu r30, 4(SP) + lwzu r31, 4(SP) + + subi SP, SP, 12 + li r0, 16 + + lfpdux f19, SP, r0 + lfpdux f18, SP, r0 + lfpdux f17, SP, r0 + lfpdux f16, SP, r0 + + lfpdux f15, SP, r0 + lfpdux f14, SP, r0 + addi SP, SP, 16 + blr + EPILOGUE diff --git a/kernel/power/gemm_ncopy_hummer_8.S b/kernel/power/gemm_ncopy_hummer_8.S new file mode 100644 index 0000000000..fec7c139c5 --- /dev/null +++ b/kernel/power/gemm_ncopy_hummer_8.S @@ -0,0 +1,1217 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M r3 +#define N r4 +#define A r5 +#define LDA r6 +#define B r7 + +#define AO1 r8 +#define AO2 r9 +#define AO3 r10 +#define AO4 r11 + +#define J r12 + +#define AO5 r26 +#define AO6 r27 +#define AO7 r28 +#define AO8 r29 +#define INC r30 +#define INC2 r31 + +#define c01 f0 +#define c02 f1 +#define c03 f2 +#define c04 f3 +#define c05 f4 +#define c06 f5 +#define c07 f6 +#define c08 f7 +#define c09 f8 +#define c10 f9 +#define c11 f10 +#define c12 f11 +#define c13 f12 +#define c14 f13 +#define c15 f14 +#define c16 f15 + +#define c17 f16 +#define c18 f17 +#define c19 f18 +#define c20 f19 +#define c21 f20 +#define c22 f21 +#define c23 f22 +#define c24 f23 +#define c25 f24 +#define c26 f25 +#define c27 f26 +#define c28 f27 +#define c29 f28 +#define c30 f29 +#define c31 f30 +#define c32 f31 + +#define sel_p f30 +#define sel_s f31 + + + PROLOGUE + PROFCODE + + li r0, -16 + + stfpdux f14, SP, r0 + stfpdux f15, SP, r0 + stfpdux f16, SP, r0 + stfpdux f17, SP, r0 + stfpdux f18, SP, r0 + stfpdux f19, SP, r0 + stfpdux f20, SP, r0 + stfpdux f21, SP, r0 + stfpdux f22, SP, r0 + stfpdux f23, SP, r0 + stfpdux f24, SP, r0 + stfpdux f25, SP, r0 + stfpdux f26, SP, r0 + stfpdux f27, SP, r0 + stfpdux f28, SP, r0 + stfpdux f29, SP, r0 + stfpdux f30, SP, r0 + stfpdux f31, SP, r0 + + stwu r31, -4(SP) + stwu r30, -4(SP) + stwu r29, -4(SP) + stwu r28, -4(SP) + + stwu r27, -4(SP) + stwu r26, -4(SP) + + lis r9, 0x3f80 + lis r10, 0xbf80 + + stwu r9, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r9, -4(SP) + + slwi LDA, LDA, BASE_SHIFT + + li r0, 0 + lfpsux sel_p, SP, r0 + li r0, 8 + lfpsux sel_s, SP, r0 + + cmpwi cr0, M, 0 + ble- .L999 + cmpwi cr0, N, 0 + ble- .L999 + + li INC, 1 * SIZE + li INC2, 2 * SIZE + + subi B, B, 2 * SIZE + + andi. r0, A, 2 * SIZE - 1 + bne .L100 + andi. r0, LDA, 2 * SIZE - 1 + bne .L100 + + subi A, A, 2 * SIZE + srawi. J, N, 3 + ble .L20 + .align 4 +.L11: + mr AO1, A + add AO2, A, LDA + add AO3, AO2, LDA + add AO4, AO3, LDA + add AO5, AO4, LDA + add AO6, AO5, LDA + add AO7, AO6, LDA + add AO8, AO7, LDA + add A, AO8, LDA + + srawi. 
r0, M, 2 + mtspr CTR, r0 + ble .L15 + .align 4 + +.L12: + LFPDUX c01, AO1, INC2 + LFXDUX c02, AO2, INC2 + LFPDUX c03, AO3, INC2 + LFXDUX c04, AO4, INC2 + + LFPDUX c05, AO5, INC2 + LFXDUX c06, AO6, INC2 + LFPDUX c07, AO7, INC2 + LFXDUX c08, AO8, INC2 + + LFPDUX c09, AO1, INC2 + LFXDUX c10, AO2, INC2 + LFPDUX c11, AO3, INC2 + LFXDUX c12, AO4, INC2 + fpsel c17, sel_p, c01, c02 + + LFPDUX c13, AO5, INC2 + fpsel c18, sel_p, c03, c04 + LFXDUX c14, AO6, INC2 + fpsel c19, sel_p, c05, c06 + LFPDUX c15, AO7, INC2 + fpsel c20, sel_p, c07, c08 + LFXDUX c16, AO8, INC2 + fpsel c21, sel_s, c01, c02 + + fpsel c22, sel_s, c03, c04 + STFPDUX c17, B, INC2 + fpsel c23, sel_s, c05, c06 + STFPDUX c18, B, INC2 + fpsel c24, sel_s, c07, c08 + STFPDUX c19, B, INC2 + + fpsel c01, sel_p, c09, c10 + STFPDUX c20, B, INC2 + fpsel c02, sel_p, c11, c12 + STFXDUX c21, B, INC2 + fpsel c03, sel_p, c13, c14 + STFXDUX c22, B, INC2 + fpsel c04, sel_p, c15, c16 + STFXDUX c23, B, INC2 + + fpsel c05, sel_s, c09, c10 + STFXDUX c24, B, INC2 + fpsel c06, sel_s, c11, c12 + STFPDUX c01, B, INC2 + fpsel c07, sel_s, c13, c14 + STFPDUX c02, B, INC2 + fpsel c08, sel_s, c15, c16 + STFPDUX c03, B, INC2 + + STFPDUX c04, B, INC2 + STFXDUX c05, B, INC2 + STFXDUX c06, B, INC2 + STFXDUX c07, B, INC2 + STFXDUX c08, B, INC2 + bdnz .L12 + .align 4 + +.L15: + andi. r0, M, 3 + ble .L19 + + andi. r0, M, 2 + beq .L17 + + LFPDUX c01, AO1, INC2 + LFXDUX c02, AO2, INC2 + LFPDUX c03, AO3, INC2 + LFXDUX c04, AO4, INC2 + + LFPDUX c05, AO5, INC2 + fpsel c09, sel_p, c01, c02 + LFXDUX c06, AO6, INC2 + fpsel c10, sel_p, c03, c04 + LFPDUX c07, AO7, INC2 + fpsel c11, sel_p, c05, c06 + LFXDUX c08, AO8, INC2 + fpsel c12, sel_p, c07, c08 + + fpsel c13, sel_s, c01, c02 + fpsel c14, sel_s, c03, c04 + STFPDUX c09, B, INC2 + fpsel c15, sel_s, c05, c06 + STFPDUX c10, B, INC2 + fpsel c16, sel_s, c07, c08 + STFPDUX c11, B, INC2 + + STFPDUX c12, B, INC2 + STFXDUX c13, B, INC2 + STFXDUX c14, B, INC2 + STFXDUX c15, B, INC2 + STFXDUX c16, B, INC2 + .align 4 + +.L17: + andi. r0, M, 1 + beq .L19 + + LFDUX c01, AO1, INC2 + LFDUX c02, AO3, INC2 + LFDUX c03, AO5, INC2 + LFDUX c04, AO7, INC2 + + LFSDUX c01, AO2, INC2 + LFSDUX c02, AO4, INC2 + LFSDUX c03, AO6, INC2 + LFSDUX c04, AO8, INC2 + + STFPDUX c01, B, INC2 + STFPDUX c02, B, INC2 + STFPDUX c03, B, INC2 + STFPDUX c04, B, INC2 + .align 4 + +.L19: + addic. J, J, -1 + bgt .L11 + .align 4 + +.L20: + andi. J, N, 4 + ble .L30 + .align 4 +.L21: + mr AO1, A + add AO2, A, LDA + add AO3, AO2, LDA + add AO4, AO3, LDA + add A, AO4, LDA + + srawi. 
r0, M, 3 + mtspr CTR, r0 + ble .L25 + .align 4 + +.L22: + LFPDUX c01, AO1, INC2 + LFXDUX c02, AO2, INC2 + LFPDUX c03, AO3, INC2 + LFXDUX c04, AO4, INC2 + + LFPDUX c05, AO1, INC2 + LFXDUX c06, AO2, INC2 + LFPDUX c07, AO3, INC2 + LFXDUX c08, AO4, INC2 + + LFPDUX c09, AO1, INC2 + LFXDUX c10, AO2, INC2 + LFPDUX c11, AO3, INC2 + LFXDUX c12, AO4, INC2 + fpsel c17, sel_p, c01, c02 + + LFPDUX c13, AO1, INC2 + fpsel c18, sel_p, c03, c04 + LFXDUX c14, AO2, INC2 + fpsel c19, sel_s, c01, c02 + LFPDUX c15, AO3, INC2 + fpsel c20, sel_s, c03, c04 + LFXDUX c16, AO4, INC2 + fpsel c21, sel_p, c05, c06 + + fpsel c22, sel_p, c07, c08 + STFPDUX c17, B, INC2 + fpsel c23, sel_s, c05, c06 + STFPDUX c18, B, INC2 + fpsel c24, sel_s, c07, c08 + STFXDUX c19, B, INC2 + + fpsel c01, sel_p, c09, c10 + STFXDUX c20, B, INC2 + fpsel c02, sel_p, c11, c12 + STFPDUX c21, B, INC2 + fpsel c03, sel_s, c09, c10 + STFPDUX c22, B, INC2 + fpsel c04, sel_s, c11, c12 + STFXDUX c23, B, INC2 + + fpsel c05, sel_p, c13, c14 + STFXDUX c24, B, INC2 + fpsel c06, sel_p, c15, c16 + STFPDUX c01, B, INC2 + fpsel c07, sel_s, c13, c14 + STFPDUX c02, B, INC2 + fpsel c08, sel_s, c15, c16 + STFXDUX c03, B, INC2 + + STFXDUX c04, B, INC2 + STFPDUX c05, B, INC2 + STFPDUX c06, B, INC2 + STFXDUX c07, B, INC2 + STFXDUX c08, B, INC2 + bdnz .L22 + .align 4 + +.L25: + andi. r0, M, 7 + ble .L30 + + andi. r0, M, 4 + beq .L26 + + LFPDUX c01, AO1, INC2 + LFXDUX c02, AO2, INC2 + LFPDUX c03, AO3, INC2 + LFXDUX c04, AO4, INC2 + + LFPDUX c05, AO1, INC2 + fpsel c09, sel_p, c01, c02 + LFXDUX c06, AO2, INC2 + fpsel c10, sel_p, c03, c04 + LFPDUX c07, AO3, INC2 + fpsel c11, sel_s, c01, c02 + LFXDUX c08, AO4, INC2 + fpsel c12, sel_s, c03, c04 + + fpsel c13, sel_p, c05, c06 + fpsel c14, sel_p, c07, c08 + STFPDUX c09, B, INC2 + fpsel c15, sel_s, c05, c06 + STFPDUX c10, B, INC2 + fpsel c16, sel_s, c07, c08 + STFXDUX c11, B, INC2 + + STFXDUX c12, B, INC2 + STFPDUX c13, B, INC2 + STFPDUX c14, B, INC2 + STFXDUX c15, B, INC2 + STFXDUX c16, B, INC2 + .align 4 + +.L26: + andi. r0, M, 2 + beq .L27 + + LFPDUX c01, AO1, INC2 + LFXDUX c02, AO2, INC2 + LFPDUX c03, AO3, INC2 + LFXDUX c04, AO4, INC2 + + fpsel c05, sel_p, c01, c02 + fpsel c06, sel_p, c03, c04 + fpsel c07, sel_s, c01, c02 + fpsel c08, sel_s, c03, c04 + + STFPDUX c05, B, INC2 + STFPDUX c06, B, INC2 + STFXDUX c07, B, INC2 + STFXDUX c08, B, INC2 + .align 4 + +.L27: + andi. r0, M, 1 + beq .L30 + + LFDUX c01, AO1, INC2 + LFDUX c02, AO2, INC2 + LFDUX c03, AO3, INC2 + LFDUX c04, AO4, INC2 + + fsmfp c01, c02 + fsmfp c03, c04 + + STFPDUX c01, B, INC2 + STFPDUX c03, B, INC2 + .align 4 + + +.L30: + andi. J, N, 2 + ble .L40 + + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + srawi. r0, M, 3 + mtspr CTR, r0 + ble .L35 + .align 4 + +.L32: + LFPDUX c01, AO1, INC2 + LFXDUX c05, AO2, INC2 + LFPDUX c02, AO1, INC2 + LFXDUX c06, AO2, INC2 + + LFPDUX c03, AO1, INC2 + fpsel c09, sel_p, c01, c05 + LFXDUX c07, AO2, INC2 + fpsel c10, sel_s, c01, c05 + LFPDUX c04, AO1, INC2 + fpsel c11, sel_p, c02, c06 + LFXDUX c08, AO2, INC2 + fpsel c12, sel_s, c02, c06 + + fpsel c13, sel_p, c03, c07 + fpsel c14, sel_s, c03, c07 + STFPDUX c09, B, INC2 + fpsel c15, sel_p, c04, c08 + STFXDUX c10, B, INC2 + fpsel c16, sel_s, c04, c08 + STFPDUX c11, B, INC2 + STFXDUX c12, B, INC2 + + STFPDUX c13, B, INC2 + STFXDUX c14, B, INC2 + STFPDUX c15, B, INC2 + STFXDUX c16, B, INC2 + bdnz .L32 + .align 4 + +.L35: + andi. r0, M, 7 + ble .L40 + + andi. 
r0, M, 4 + beq .L36 + + LFPDUX c01, AO1, INC2 + LFXDUX c03, AO2, INC2 + LFPDUX c02, AO1, INC2 + LFXDUX c04, AO2, INC2 + + fpsel c05, sel_p, c01, c03 + fpsel c06, sel_s, c01, c03 + fpsel c07, sel_p, c02, c04 + fpsel c08, sel_s, c02, c04 + + STFPDUX c05, B, INC2 + STFXDUX c06, B, INC2 + STFPDUX c07, B, INC2 + STFXDUX c08, B, INC2 + .align 4 + +.L36: + andi. r0, M, 2 + beq .L37 + + LFPDUX c01, AO1, INC2 + LFXDUX c02, AO2, INC2 + + fpsel c03, sel_p, c01, c02 + fpsel c04, sel_s, c01, c02 + + STFPDUX c03, B, INC2 + STFXDUX c04, B, INC2 + .align 4 + +.L37: + andi. r0, M, 1 + beq .L40 + + LFDUX c01, AO1, INC2 + LFDUX c02, AO2, INC2 + + fsmfp c01, c02 + STFPDUX c01, B, INC2 + .align 4 + +.L40: + andi. J, N, 1 + ble .L999 + + mr AO1, A + + srawi. r0, M, 3 + mtspr CTR, r0 + ble .L45 + .align 4 + +.L42: + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO1, INC2 + LFPDUX c03, AO1, INC2 + LFPDUX c04, AO1, INC2 + + STFPDUX c01, B, INC2 + STFPDUX c02, B, INC2 + STFPDUX c03, B, INC2 + STFPDUX c04, B, INC2 + bdnz .L42 + .align 4 + +.L45: + andi. r0, M, 7 + ble .L999 + + andi. r0, M, 4 + beq .L46 + + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO1, INC2 + + STFPDUX c01, B, INC2 + STFPDUX c02, B, INC2 + .align 4 + +.L46: + andi. r0, M, 2 + beq .L47 + + LFPDUX c01, AO1, INC2 + STFPDUX c01, B, INC2 + .align 4 + +.L47: + andi. r0, M, 1 + beq .L999 + + LFDX c01, AO1, INC2 + STFDX c01, B, INC2 + b .L999 + .align 4 + + +.L100: + subi A, A, 1 * SIZE + srawi. J, N, 3 + ble .L120 + .align 4 +.L111: + mr AO1, A + add AO2, A, LDA + add AO3, AO2, LDA + add AO4, AO3, LDA + add AO5, AO4, LDA + add AO6, AO5, LDA + add AO7, AO6, LDA + add AO8, AO7, LDA + add A, AO8, LDA + + srawi. r0, M, 3 + mtspr CTR, r0 + ble .L115 + .align 4 + +.L112: + LFDUX c01, AO1, INC + LFDUX c05, AO1, INC + LFDUX c09, AO1, INC + LFDUX c13, AO1, INC + + LFDUX c17, AO1, INC + LFDUX c21, AO1, INC + LFDUX c25, AO1, INC + LFDUX c29, AO1, INC + + LFSDUX c01, AO2, INC + LFSDUX c05, AO2, INC + LFSDUX c09, AO2, INC + LFSDUX c13, AO2, INC + + LFSDUX c17, AO2, INC + LFSDUX c21, AO2, INC + LFSDUX c25, AO2, INC + LFSDUX c29, AO2, INC + + LFDUX c02, AO3, INC + LFDUX c06, AO3, INC + LFDUX c10, AO3, INC + LFDUX c14, AO3, INC + + LFDUX c18, AO3, INC + LFDUX c22, AO3, INC + LFDUX c26, AO3, INC + LFDUX c30, AO3, INC + + LFSDUX c02, AO4, INC + LFSDUX c06, AO4, INC + LFSDUX c10, AO4, INC + LFSDUX c14, AO4, INC + + LFSDUX c18, AO4, INC + LFSDUX c22, AO4, INC + LFSDUX c26, AO4, INC + LFSDUX c30, AO4, INC + + LFDUX c03, AO5, INC + LFDUX c07, AO5, INC + LFDUX c11, AO5, INC + LFDUX c15, AO5, INC + + LFDUX c19, AO5, INC + LFDUX c23, AO5, INC + LFDUX c27, AO5, INC + LFDUX c31, AO5, INC + + LFSDUX c03, AO6, INC + LFSDUX c07, AO6, INC + LFSDUX c11, AO6, INC + LFSDUX c15, AO6, INC + + LFSDUX c19, AO6, INC + LFSDUX c23, AO6, INC + LFSDUX c27, AO6, INC + LFSDUX c31, AO6, INC + + LFDUX c04, AO7, INC + LFDUX c08, AO7, INC + LFDUX c12, AO7, INC + LFDUX c16, AO7, INC + + LFDUX c20, AO7, INC + LFDUX c24, AO7, INC + LFDUX c28, AO7, INC + LFDUX c32, AO7, INC + + LFSDUX c04, AO8, INC + LFSDUX c08, AO8, INC + LFSDUX c12, AO8, INC + LFSDUX c16, AO8, INC + + LFSDUX c20, AO8, INC + LFSDUX c24, AO8, INC + LFSDUX c28, AO8, INC + LFSDUX c32, AO8, INC + + STFPDUX c01, B, INC2 + STFPDUX c02, B, INC2 + STFPDUX c03, B, INC2 + STFPDUX c04, B, INC2 + STFPDUX c05, B, INC2 + STFPDUX c06, B, INC2 + STFPDUX c07, B, INC2 + STFPDUX c08, B, INC2 + + STFPDUX c09, B, INC2 + STFPDUX c10, B, INC2 + STFPDUX c11, B, INC2 + STFPDUX c12, B, INC2 + STFPDUX c13, B, INC2 + STFPDUX c14, B, INC2 + STFPDUX c15, B, INC2 + STFPDUX c16, B, INC2 
+ + STFPDUX c17, B, INC2 + STFPDUX c18, B, INC2 + STFPDUX c19, B, INC2 + STFPDUX c20, B, INC2 + STFPDUX c21, B, INC2 + STFPDUX c22, B, INC2 + STFPDUX c23, B, INC2 + STFPDUX c24, B, INC2 + + STFPDUX c25, B, INC2 + STFPDUX c26, B, INC2 + STFPDUX c27, B, INC2 + STFPDUX c28, B, INC2 + STFPDUX c29, B, INC2 + STFPDUX c30, B, INC2 + STFPDUX c31, B, INC2 + STFPDUX c32, B, INC2 + bdnz .L112 + .align 4 + +.L115: + andi. r0, M, 7 + ble .L119 + + andi. r0, M, 4 + beq .L116 + + LFDUX c01, AO1, INC + LFDUX c05, AO1, INC + LFDUX c09, AO1, INC + LFDUX c13, AO1, INC + + LFSDUX c01, AO2, INC + LFSDUX c05, AO2, INC + LFSDUX c09, AO2, INC + LFSDUX c13, AO2, INC + + LFDUX c02, AO3, INC + LFDUX c06, AO3, INC + LFDUX c10, AO3, INC + LFDUX c14, AO3, INC + + LFSDUX c02, AO4, INC + LFSDUX c06, AO4, INC + LFSDUX c10, AO4, INC + LFSDUX c14, AO4, INC + + LFDUX c03, AO5, INC + LFDUX c07, AO5, INC + LFDUX c11, AO5, INC + LFDUX c15, AO5, INC + + LFSDUX c03, AO6, INC + LFSDUX c07, AO6, INC + LFSDUX c11, AO6, INC + LFSDUX c15, AO6, INC + + LFDUX c04, AO7, INC + LFDUX c08, AO7, INC + LFDUX c12, AO7, INC + LFDUX c16, AO7, INC + + LFSDUX c04, AO8, INC + LFSDUX c08, AO8, INC + LFSDUX c12, AO8, INC + LFSDUX c16, AO8, INC + + STFPDUX c01, B, INC2 + STFPDUX c02, B, INC2 + STFPDUX c03, B, INC2 + STFPDUX c04, B, INC2 + STFPDUX c05, B, INC2 + STFPDUX c06, B, INC2 + STFPDUX c07, B, INC2 + STFPDUX c08, B, INC2 + + STFPDUX c09, B, INC2 + STFPDUX c10, B, INC2 + STFPDUX c11, B, INC2 + STFPDUX c12, B, INC2 + STFPDUX c13, B, INC2 + STFPDUX c14, B, INC2 + STFPDUX c15, B, INC2 + STFPDUX c16, B, INC2 + .align 4 + +.L116: + andi. r0, M, 2 + beq .L117 + + LFDUX c01, AO1, INC + LFDUX c05, AO1, INC + LFDUX c02, AO3, INC + LFDUX c06, AO3, INC + + LFSDUX c01, AO2, INC + LFSDUX c05, AO2, INC + LFSDUX c02, AO4, INC + LFSDUX c06, AO4, INC + + LFDUX c03, AO5, INC + LFDUX c07, AO5, INC + LFDUX c04, AO7, INC + LFDUX c08, AO7, INC + + LFSDUX c03, AO6, INC + LFSDUX c07, AO6, INC + LFSDUX c04, AO8, INC + LFSDUX c08, AO8, INC + + STFPDUX c01, B, INC2 + STFPDUX c02, B, INC2 + STFPDUX c03, B, INC2 + STFPDUX c04, B, INC2 + STFPDUX c05, B, INC2 + STFPDUX c06, B, INC2 + STFPDUX c07, B, INC2 + STFPDUX c08, B, INC2 + .align 4 + +.L117: + andi. r0, M, 1 + beq .L119 + + LFDUX c01, AO1, INC + LFDUX c02, AO3, INC + LFDUX c03, AO5, INC + LFDUX c04, AO7, INC + + LFSDUX c01, AO2, INC + LFSDUX c02, AO4, INC + LFSDUX c03, AO6, INC + LFSDUX c04, AO8, INC + + STFPDUX c01, B, INC2 + STFPDUX c02, B, INC2 + STFPDUX c03, B, INC2 + STFPDUX c04, B, INC2 + .align 4 + +.L119: + addic. J, J, -1 + bgt .L111 + .align 4 + +.L120: + andi. J, N, 4 + ble .L130 + .align 4 +.L121: + mr AO1, A + add AO2, A, LDA + add AO3, AO2, LDA + add AO4, AO3, LDA + add A, AO4, LDA + + srawi. 
r0, M, 3 + mtspr CTR, r0 + ble .L125 + .align 4 + +.L122: + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + + LFDUX c09, AO1, INC + LFDUX c10, AO1, INC + LFDUX c11, AO1, INC + LFDUX c12, AO1, INC + + LFSDUX c01, AO2, INC + LFSDUX c02, AO2, INC + LFSDUX c03, AO2, INC + LFSDUX c04, AO2, INC + + LFSDUX c09, AO2, INC + LFSDUX c10, AO2, INC + LFSDUX c11, AO2, INC + LFSDUX c12, AO2, INC + + LFDUX c05, AO3, INC + LFDUX c06, AO3, INC + LFDUX c07, AO3, INC + LFDUX c08, AO3, INC + + LFDUX c13, AO3, INC + LFDUX c14, AO3, INC + LFDUX c15, AO3, INC + LFDUX c16, AO3, INC + + LFSDUX c05, AO4, INC + LFSDUX c06, AO4, INC + LFSDUX c07, AO4, INC + LFSDUX c08, AO4, INC + + LFSDUX c13, AO4, INC + LFSDUX c14, AO4, INC + LFSDUX c15, AO4, INC + LFSDUX c16, AO4, INC + + STFPDUX c01, B, INC2 + STFPDUX c05, B, INC2 + STFPDUX c02, B, INC2 + STFPDUX c06, B, INC2 + STFPDUX c03, B, INC2 + STFPDUX c07, B, INC2 + STFPDUX c04, B, INC2 + STFPDUX c08, B, INC2 + + STFPDUX c09, B, INC2 + STFPDUX c13, B, INC2 + STFPDUX c10, B, INC2 + STFPDUX c14, B, INC2 + STFPDUX c11, B, INC2 + STFPDUX c15, B, INC2 + STFPDUX c12, B, INC2 + STFPDUX c16, B, INC2 + bdnz .L122 + .align 4 + +.L125: + andi. r0, M, 7 + ble .L130 + + andi. r0, M, 4 + beq .L126 + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + + LFSDUX c01, AO2, INC + LFSDUX c02, AO2, INC + LFSDUX c03, AO2, INC + LFSDUX c04, AO2, INC + + LFDUX c05, AO3, INC + LFDUX c06, AO3, INC + LFDUX c07, AO3, INC + LFDUX c08, AO3, INC + + LFSDUX c05, AO4, INC + LFSDUX c06, AO4, INC + LFSDUX c07, AO4, INC + LFSDUX c08, AO4, INC + + STFPDUX c01, B, INC2 + STFPDUX c05, B, INC2 + STFPDUX c02, B, INC2 + STFPDUX c06, B, INC2 + STFPDUX c03, B, INC2 + STFPDUX c07, B, INC2 + STFPDUX c04, B, INC2 + STFPDUX c08, B, INC2 + .align 4 + +.L126: + andi. r0, M, 2 + beq .L127 + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + + LFSDUX c01, AO2, INC + LFSDUX c02, AO2, INC + + LFDUX c05, AO3, INC + LFDUX c06, AO3, INC + + LFSDUX c05, AO4, INC + LFSDUX c06, AO4, INC + + STFPDUX c01, B, INC2 + STFPDUX c05, B, INC2 + STFPDUX c02, B, INC2 + STFPDUX c06, B, INC2 + .align 4 + +.L127: + andi. r0, M, 1 + beq .L130 + + LFDUX c01, AO1, INC + LFDUX c05, AO3, INC + + nop + nop + + LFSDUX c01, AO2, INC + LFSDUX c05, AO4, INC + + STFPDUX c01, B, INC2 + STFPDUX c05, B, INC2 + .align 4 + + +.L130: + andi. J, N, 2 + ble .L140 + + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + srawi. r0, M, 3 + mtspr CTR, r0 + ble .L135 + .align 4 + +.L132: + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + + LFDUX c09, AO1, INC + LFDUX c10, AO1, INC + LFDUX c11, AO1, INC + LFDUX c12, AO1, INC + + LFSDUX c01, AO2, INC + LFSDUX c02, AO2, INC + LFSDUX c03, AO2, INC + LFSDUX c04, AO2, INC + + LFSDUX c09, AO2, INC + LFSDUX c10, AO2, INC + LFSDUX c11, AO2, INC + LFSDUX c12, AO2, INC + + STFPDUX c01, B, INC2 + STFPDUX c02, B, INC2 + STFPDUX c03, B, INC2 + STFPDUX c04, B, INC2 + + STFPDUX c09, B, INC2 + STFPDUX c10, B, INC2 + STFPDUX c11, B, INC2 + STFPDUX c12, B, INC2 + bdnz .L132 + .align 4 + +.L135: + andi. r0, M, 7 + ble .L140 + + andi. r0, M, 4 + beq .L136 + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + + LFSDUX c01, AO2, INC + LFSDUX c02, AO2, INC + LFSDUX c03, AO2, INC + LFSDUX c04, AO2, INC + + STFPDUX c01, B, INC2 + STFPDUX c02, B, INC2 + STFPDUX c03, B, INC2 + STFPDUX c04, B, INC2 + .align 4 + +.L136: + andi. 
r0, M, 2 + beq .L137 + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + + LFSDUX c01, AO2, INC + LFSDUX c02, AO2, INC + + STFPDUX c01, B, INC2 + STFPDUX c02, B, INC2 + .align 4 + +.L137: + andi. r0, M, 1 + beq .L140 + + LFDUX c01, AO1, INC + LFDUX c02, AO2, INC + + fsmfp c01, c02 + STFPDUX c01, B, INC2 + .align 4 + +.L140: + andi. J, N, 1 + ble .L999 + + mr AO1, A + + srawi. r0, M, 3 + mtspr CTR, r0 + ble .L145 + .align 4 + +.L142: + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + + LFDUX c05, AO1, INC + LFDUX c06, AO1, INC + LFDUX c07, AO1, INC + LFDUX c08, AO1, INC + + fsmfp c01, c02 + fsmfp c03, c04 + fsmfp c05, c06 + fsmfp c07, c08 + + STFPDUX c01, B, INC2 + STFPDUX c03, B, INC2 + STFPDUX c05, B, INC2 + STFPDUX c07, B, INC2 + bdnz .L142 + .align 4 + +.L145: + andi. r0, M, 7 + ble .L999 + + andi. r0, M, 4 + beq .L146 + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + + fsmfp c01, c02 + fsmfp c03, c04 + + STFPDUX c01, B, INC2 + STFPDUX c03, B, INC2 + .align 4 + +.L146: + andi. r0, M, 2 + beq .L147 + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + + fsmfp c01, c02 + STFPDUX c01, B, INC2 + .align 4 + +.L147: + andi. r0, M, 1 + beq .L999 + + LFDX c01, AO1, INC + STFDX c01, B, INC2 + .align 4 + +.L999: + addi SP, SP, 4 + + lwzu r26, 4(SP) + lwzu r27, 4(SP) + + lwzu r28, 4(SP) + lwzu r29, 4(SP) + lwzu r30, 4(SP) + lwzu r31, 4(SP) + + subi SP, SP, 12 + li r0, 16 + + lfpdux f31, SP, r0 + lfpdux f30, SP, r0 + lfpdux f29, SP, r0 + lfpdux f28, SP, r0 + lfpdux f27, SP, r0 + lfpdux f26, SP, r0 + lfpdux f25, SP, r0 + lfpdux f24, SP, r0 + lfpdux f23, SP, r0 + lfpdux f22, SP, r0 + lfpdux f21, SP, r0 + lfpdux f20, SP, r0 + lfpdux f19, SP, r0 + lfpdux f18, SP, r0 + lfpdux f17, SP, r0 + lfpdux f16, SP, r0 + lfpdux f15, SP, r0 + lfpdux f14, SP, r0 + addi SP, SP, 16 + blr + EPILOGUE diff --git a/kernel/power/gemm_tcopy_4.S b/kernel/power/gemm_tcopy_4.S new file mode 100644 index 0000000000..712420f482 --- /dev/null +++ b/kernel/power/gemm_tcopy_4.S @@ -0,0 +1,452 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M r3 +#define N r4 +#define A r5 +#define LDA r6 +#define B r7 + +#define AO1 r8 +#define AO2 r9 +#define AO3 r10 +#define AO4 r11 + +#define J r12 + +#define PREA r14 +#define PREB1 r15 +#define B1 r16 +#define B2 r17 +#define B3 r18 +#define M4 r19 + +#define c01 f0 +#define c02 f1 +#define c03 f2 +#define c04 f3 +#define c05 f4 +#define c06 f5 +#define c07 f6 +#define c08 f7 +#define c09 f8 +#define c10 f9 +#define c11 f10 +#define c12 f11 +#define c13 f12 +#define c14 f13 +#define c15 f14 +#define c16 f15 + +#define STACKSIZE 64 + +#ifdef CELL +#define PREFETCHSIZE 16 +#define PREFETCHWSIZE 48 +#endif + +#ifdef PPC970 +#define PREFETCHSIZE 16 +#define PREFETCHWSIZE 48 +#endif + +#ifdef PPC440 +#define PREFETCHSIZE 16 +#define PREFETCHWSIZE 48 +#endif + +#ifdef POWER4 +#define PREFETCHSIZE 16 +#define PREFETCHWSIZE 48 +#endif + +#ifdef POWER5 +#define PREFETCHSIZE 16 +#define PREFETCHWSIZE 48 +#endif + +#ifdef POWER6 +#define PREFETCHSIZE 16 +#define PREFETCHWSIZE 48 +#endif + +#ifdef PPCG4 +#define PREFETCHSIZE 16 +#define PREFETCHWSIZE 48 +#endif + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + +#ifdef __64BIT__ + std r14, 16(SP) + std r15, 24(SP) + std r16, 32(SP) + std r17, 40(SP) + std r18, 48(SP) + std r19, 56(SP) +#else + stw r14, 16(SP) + stw r15, 20(SP) + stw r16, 24(SP) + stw r17, 28(SP) + stw r18, 32(SP) + stw r19, 36(SP) +#endif + + slwi LDA, LDA, BASE_SHIFT + slwi M4, M, 2 + BASE_SHIFT + + li PREA, -4 + li PREB1, -2 + + and B2, N, PREA + and B3, N, PREB1 + + mullw B2, B2, M + mullw B3, B3, M + + slwi B2, B2, BASE_SHIFT + slwi B3, B3, BASE_SHIFT + + add B2, B2, B + add B3, B3, B + + li PREA, PREFETCHSIZE * SIZE + li PREB1, (PREFETCHWSIZE + 0) * SIZE + + cmpwi cr0, M, 0 + ble- LL(999) + cmpwi cr0, N, 0 + ble- LL(999) + + srawi. J, M, 2 + ble LL(20) + .align 4 + +LL(10): + mr AO1, A + add AO2, A, LDA + add AO3, AO2, LDA + add AO4, AO3, LDA + add A, AO4, LDA + + mr B1, B + addi B, B, 16 * SIZE + + srawi. 
r0, N, 2 + mtspr CTR, r0 + ble LL(13) + .align 4 + +LL(12): + LFD c01, 0 * SIZE(AO1) + LFD c02, 1 * SIZE(AO1) + LFD c03, 2 * SIZE(AO1) + LFD c04, 3 * SIZE(AO1) + + LFD c05, 0 * SIZE(AO2) + LFD c06, 1 * SIZE(AO2) + LFD c07, 2 * SIZE(AO2) + LFD c08, 3 * SIZE(AO2) + + LFD c09, 0 * SIZE(AO3) + LFD c10, 1 * SIZE(AO3) + LFD c11, 2 * SIZE(AO3) + LFD c12, 3 * SIZE(AO3) + + LFD c13, 0 * SIZE(AO4) + LFD c14, 1 * SIZE(AO4) + LFD c15, 2 * SIZE(AO4) + LFD c16, 3 * SIZE(AO4) + + STFD c01, 0 * SIZE(B1) + STFD c02, 1 * SIZE(B1) + STFD c03, 2 * SIZE(B1) + STFD c04, 3 * SIZE(B1) + + STFD c05, 4 * SIZE(B1) + STFD c06, 5 * SIZE(B1) + STFD c07, 6 * SIZE(B1) + STFD c08, 7 * SIZE(B1) + + STFD c09, 8 * SIZE(B1) + STFD c10, 9 * SIZE(B1) + STFD c11, 10 * SIZE(B1) + STFD c12, 11 * SIZE(B1) + + STFD c13, 12 * SIZE(B1) + STFD c14, 13 * SIZE(B1) + STFD c15, 14 * SIZE(B1) + STFD c16, 15 * SIZE(B1) + +#ifdef POWER6 + dcbtst PREA, AO1 + dcbtst PREA, AO2 + dcbtst PREA, AO3 + dcbtst PREA, AO4 +#else + dcbt PREA, AO1 + dcbt PREA, AO2 + dcbt PREA, AO3 + dcbt PREA, AO4 +#endif + + dcbtst PREB1, B + + addi AO1, AO1, 4 * SIZE + addi AO2, AO2, 4 * SIZE + addi AO3, AO3, 4 * SIZE + addi AO4, AO4, 4 * SIZE + add B1, B1, M4 + bdnz LL(12) + .align 4 + +LL(13): + andi. r0, N, 2 + ble LL(14) + + LFD c01, 0 * SIZE(AO1) + LFD c02, 1 * SIZE(AO1) + LFD c03, 0 * SIZE(AO2) + LFD c04, 1 * SIZE(AO2) + + LFD c05, 0 * SIZE(AO3) + LFD c06, 1 * SIZE(AO3) + LFD c07, 0 * SIZE(AO4) + LFD c08, 1 * SIZE(AO4) + + STFD c01, 0 * SIZE(B2) + STFD c02, 1 * SIZE(B2) + STFD c03, 2 * SIZE(B2) + STFD c04, 3 * SIZE(B2) + + STFD c05, 4 * SIZE(B2) + STFD c06, 5 * SIZE(B2) + STFD c07, 6 * SIZE(B2) + STFD c08, 7 * SIZE(B2) + + addi AO1, AO1, 2 * SIZE + addi AO2, AO2, 2 * SIZE + addi AO3, AO3, 2 * SIZE + addi AO4, AO4, 2 * SIZE + addi B2, B2, 8 * SIZE + .align 4 + +LL(14): + andi. r0, N, 1 + ble LL(17) + + LFD c01, 0 * SIZE(AO1) + LFD c02, 0 * SIZE(AO2) + LFD c03, 0 * SIZE(AO3) + LFD c04, 0 * SIZE(AO4) + + STFD c01, 0 * SIZE(B3) + STFD c02, 1 * SIZE(B3) + STFD c03, 2 * SIZE(B3) + STFD c04, 3 * SIZE(B3) + + addi B3, B3, 4 * SIZE + .align 4 + +LL(17): + addic. J, J, -1 + bgt LL(10) + .align 4 + +LL(20): + andi. J, M, 2 + ble LL(30) + + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + mr B1, B + addi B, B, 8 * SIZE + + srawi. r0, N, 2 + mtspr CTR, r0 + ble LL(23) + .align 4 + +LL(22): + LFD c01, 0 * SIZE(AO1) + LFD c02, 1 * SIZE(AO1) + LFD c03, 2 * SIZE(AO1) + LFD c04, 3 * SIZE(AO1) + + LFD c05, 0 * SIZE(AO2) + LFD c06, 1 * SIZE(AO2) + LFD c07, 2 * SIZE(AO2) + LFD c08, 3 * SIZE(AO2) + + STFD c01, 0 * SIZE(B1) + STFD c02, 1 * SIZE(B1) + STFD c03, 2 * SIZE(B1) + STFD c04, 3 * SIZE(B1) + + STFD c05, 4 * SIZE(B1) + STFD c06, 5 * SIZE(B1) + STFD c07, 6 * SIZE(B1) + STFD c08, 7 * SIZE(B1) + + addi AO1, AO1, 4 * SIZE + addi AO2, AO2, 4 * SIZE + add B1, B1, M4 + bdnz LL(22) + .align 4 + +LL(23): + andi. r0, N, 2 + ble LL(24) + + LFD c01, 0 * SIZE(AO1) + LFD c02, 1 * SIZE(AO1) + LFD c03, 0 * SIZE(AO2) + LFD c04, 1 * SIZE(AO2) + + STFD c01, 0 * SIZE(B2) + STFD c02, 1 * SIZE(B2) + STFD c03, 2 * SIZE(B2) + STFD c04, 3 * SIZE(B2) + + addi AO1, AO1, 2 * SIZE + addi AO2, AO2, 2 * SIZE + addi B2, B2, 4 * SIZE + .align 4 + +LL(24): + andi. r0, N, 1 + ble LL(30) + + LFD c01, 0 * SIZE(AO1) + LFD c02, 0 * SIZE(AO2) + + STFD c01, 0 * SIZE(B3) + STFD c02, 1 * SIZE(B3) + + addi B3, B3, 2 * SIZE + .align 4 + +LL(30): + andi. J, M, 1 + ble LL(999) + + mr AO1, A + + mr B1, B + + srawi. 
r0, N, 2 + mtspr CTR, r0 + ble LL(33) + .align 4 + +LL(32): + LFD c01, 0 * SIZE(AO1) + LFD c02, 1 * SIZE(AO1) + LFD c03, 2 * SIZE(AO1) + LFD c04, 3 * SIZE(AO1) + + STFD c01, 0 * SIZE(B1) + STFD c02, 1 * SIZE(B1) + STFD c03, 2 * SIZE(B1) + STFD c04, 3 * SIZE(B1) + + addi AO1, AO1, 4 * SIZE + add B1, B1, M4 + bdnz LL(32) + .align 4 + +LL(33): + andi. r0, N, 2 + ble LL(34) + + LFD c01, 0 * SIZE(AO1) + LFD c02, 1 * SIZE(AO1) + + STFD c01, 0 * SIZE(B2) + STFD c02, 1 * SIZE(B2) + + addi AO1, AO1, 2 * SIZE + addi B2, B2, 2 * SIZE + .align 4 + +LL(34): + andi. r0, N, 1 + ble LL(999) + + LFD c01, 0 * SIZE(AO1) + STFD c01, 0 * SIZE(B3) + .align 4 + +LL(999): + li r3, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + +#ifdef __64BIT__ + ld r14, 16(SP) + ld r15, 24(SP) + ld r16, 32(SP) + ld r17, 40(SP) + ld r18, 48(SP) + ld r19, 56(SP) +#else + lwz r14, 16(SP) + lwz r15, 20(SP) + lwz r16, 24(SP) + lwz r17, 28(SP) + lwz r18, 32(SP) + lwz r19, 36(SP) +#endif + addi SP, SP, STACKSIZE + + blr + EPILOGUE diff --git a/kernel/power/gemm_tcopy_hummer_4.S b/kernel/power/gemm_tcopy_hummer_4.S new file mode 100644 index 0000000000..dc94b046f1 --- /dev/null +++ b/kernel/power/gemm_tcopy_hummer_4.S @@ -0,0 +1,521 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M r3 +#define N r4 +#define A r5 +#define LDA r6 +#define B r7 + +#define AO1 r8 +#define AO2 r9 +#define AO3 r10 +#define AO4 r11 + +#define J r25 +#define B1 r26 +#define B2 r27 +#define B3 r28 +#define M4 r29 +#define INC r30 +#define INC2 r31 + +#define c01 f0 +#define c02 f1 +#define c03 f2 +#define c04 f3 +#define c05 f4 +#define c06 f5 +#define c07 f6 +#define c08 f7 + + PROLOGUE + PROFCODE + + stwu r31, -4(SP) + stwu r30, -4(SP) + stwu r29, -4(SP) + stwu r28, -4(SP) + + stwu r27, -4(SP) + stwu r26, -4(SP) + stwu r25, -4(SP) + + slwi LDA, LDA, BASE_SHIFT + slwi M4, M, 2 + BASE_SHIFT + + li r8, -4 + li r9, -2 + + and B2, N, r8 + and B3, N, r9 + + mullw B2, B2, M + mullw B3, B3, M + + slwi B2, B2, BASE_SHIFT + slwi B3, B3, BASE_SHIFT + + add B2, B2, B + add B3, B3, B + + cmpwi cr0, M, 0 + ble- .L99 + cmpwi cr0, N, 0 + ble- .L99 + + subi B2, B2, 2 * SIZE + subi B3, B3, 2 * SIZE + subi M4, M4, 14 * SIZE + + li INC, 1 * SIZE + li INC2, 2 * SIZE + + andi. r0, A, 2 * SIZE - 1 + bne .L100 + andi. r0, LDA, 2 * SIZE - 1 + bne .L100 + + subi A, A, 2 * SIZE + srawi. J, M, 2 + ble .L20 + .align 4 + +.L10: + mr AO1, A + add AO2, A, LDA + add AO3, AO2, LDA + add AO4, AO3, LDA + add A, AO4, LDA + + sub B1, B, M4 + addi B, B, 16 * SIZE + + srawi. r0, N, 2 + mtspr CTR, r0 + ble .L15 + .align 4 + +.L12: + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO1, INC2 + LFPDUX c03, AO2, INC2 + LFPDUX c04, AO2, INC2 + LFPDUX c05, AO3, INC2 + LFPDUX c06, AO3, INC2 + LFPDUX c07, AO4, INC2 + LFPDUX c08, AO4, INC2 + + STFPDUX c01, B1, M4 + STFPDUX c02, B1, INC2 + STFPDUX c03, B1, INC2 + STFPDUX c04, B1, INC2 + STFPDUX c05, B1, INC2 + STFPDUX c06, B1, INC2 + STFPDUX c07, B1, INC2 + STFPDUX c08, B1, INC2 + bdnz .L12 + .align 4 + +.L15: + andi. r0, N, 3 + ble .L19 + + andi. r0, N, 2 + ble .L17 + + LFPDUX c01, AO1, INC2 + LFPDUX c03, AO2, INC2 + LFPDUX c05, AO3, INC2 + LFPDUX c07, AO4, INC2 + + STFPDUX c01, B2, INC2 + STFPDUX c03, B2, INC2 + STFPDUX c05, B2, INC2 + STFPDUX c07, B2, INC2 + .align 4 + +.L17: + andi. r0, N, 1 + ble .L19 + + LFDUX c01, AO1, INC2 + LFDUX c02, AO2, INC2 + LFDUX c03, AO3, INC2 + LFDUX c04, AO4, INC2 + + fsmfp c01, c02 + fsmfp c03, c04 + + STFPDUX c01, B3, INC2 + STFPDUX c03, B3, INC2 + .align 4 + +.L19: + addic. J, J, -1 + bgt .L10 + .align 4 + +.L20: + andi. J, M, 2 + addi M4, M4, 8 * SIZE + + ble .L30 + + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + sub B1, B, M4 + addi B, B, 8 * SIZE + + srawi. r0, N, 2 + mtspr CTR, r0 + ble .L23 + .align 4 + +.L22: + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO1, INC2 + LFPDUX c03, AO2, INC2 + LFPDUX c04, AO2, INC2 + + STFPDUX c01, B1, M4 + STFPDUX c02, B1, INC2 + STFPDUX c03, B1, INC2 + STFPDUX c04, B1, INC2 + bdnz .L22 + .align 4 + +.L23: + andi. r0, N, 2 + ble .L24 + + LFPDUX c01, AO1, INC2 + LFPDUX c03, AO2, INC2 + + STFPDUX c01, B2, INC2 + STFPDUX c03, B2, INC2 + .align 4 + +.L24: + andi. r0, N, 1 + ble .L30 + + LFDUX c01, AO1, INC2 + LFDUX c02, AO2, INC2 + + fsmfp c01, c02 + STFPDUX c01, B3, INC2 + .align 4 + +.L30: + andi. J, M, 1 + addi M4, M4, 4 * SIZE + ble .L99 + + mr AO1, A + sub B1, B, M4 + + srawi. r0, N, 2 + mtspr CTR, r0 + ble .L33 + .align 4 + +.L32: + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO1, INC2 + + STFPDUX c01, B1, M4 + STFPDUX c02, B1, INC2 + bdnz .L32 + .align 4 + +.L33: + andi. r0, N, 2 + ble .L34 + + LFPDUX c01, AO1, INC2 + + STFPDUX c01, B2, INC2 + .align 4 + +.L34: + andi. 
r0, N, 1 + ble .L99 + + LFDX c01, AO1, INC2 + STFDX c01, B3, INC2 + .align 4 + +.L99: + addi SP, SP, -4 + + lwzu r25, 4(SP) + lwzu r26, 4(SP) + lwzu r27, 4(SP) + + lwzu r28, 4(SP) + lwzu r29, 4(SP) + lwzu r30, 4(SP) + lwzu r31, 4(SP) + + addi SP, SP, 4 + blr + +.L100: + subi A, A, SIZE + srawi. J, M, 2 + ble .L120 + .align 4 + +.L110: + mr AO1, A + add AO2, A, LDA + add AO3, AO2, LDA + add AO4, AO3, LDA + add A, AO4, LDA + + sub B1, B, M4 + addi B, B, 16 * SIZE + + srawi. r0, N, 2 + mtspr CTR, r0 + ble .L115 + .align 4 + +.L112: + LFDUX c01, AO1, INC + LFDUX c03, AO2, INC + LFDUX c05, AO3, INC + LFDUX c07, AO4, INC + + LFSDUX c01, AO1, INC + LFSDUX c03, AO2, INC + LFSDUX c05, AO3, INC + LFSDUX c07, AO4, INC + + LFDUX c02, AO1, INC + LFDUX c04, AO2, INC + LFDUX c06, AO3, INC + LFDUX c08, AO4, INC + + LFSDUX c02, AO1, INC + LFSDUX c04, AO2, INC + LFSDUX c06, AO3, INC + LFSDUX c08, AO4, INC + + STFPDUX c01, B1, M4 + STFPDUX c02, B1, INC2 + STFPDUX c03, B1, INC2 + STFPDUX c04, B1, INC2 + STFPDUX c05, B1, INC2 + STFPDUX c06, B1, INC2 + STFPDUX c07, B1, INC2 + STFPDUX c08, B1, INC2 + bdnz .L112 + .align 4 + +.L115: + andi. r0, N, 3 + ble .L119 + + andi. r0, N, 2 + ble .L117 + + LFDUX c01, AO1, INC + LFDUX c03, AO2, INC + LFDUX c05, AO3, INC + LFDUX c07, AO4, INC + + LFSDUX c01, AO1, INC + LFSDUX c03, AO2, INC + LFSDUX c05, AO3, INC + LFSDUX c07, AO4, INC + + STFPDUX c01, B2, INC2 + STFPDUX c03, B2, INC2 + STFPDUX c05, B2, INC2 + STFPDUX c07, B2, INC2 + .align 4 + +.L117: + andi. r0, N, 1 + ble .L119 + + LFDUX c01, AO1, INC + LFDUX c02, AO2, INC + LFDUX c03, AO3, INC + LFDUX c04, AO4, INC + + fsmfp c01, c02 + fsmfp c03, c04 + + STFPDUX c01, B3, INC2 + STFPDUX c03, B3, INC2 + .align 4 + +.L119: + addic. J, J, -1 + bgt .L110 + .align 4 + +.L120: + andi. J, M, 2 + addi M4, M4, 8 * SIZE + + ble .L130 + + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + sub B1, B, M4 + addi B, B, 8 * SIZE + + srawi. r0, N, 2 + mtspr CTR, r0 + ble .L123 + .align 4 + +.L122: + LFDUX c01, AO1, INC + LFDUX c03, AO2, INC + LFSDUX c01, AO1, INC + LFSDUX c03, AO2, INC + + LFDUX c02, AO1, INC + LFDUX c04, AO2, INC + LFSDUX c02, AO1, INC + LFSDUX c04, AO2, INC + + STFPDUX c01, B1, M4 + STFPDUX c02, B1, INC2 + STFPDUX c03, B1, INC2 + STFPDUX c04, B1, INC2 + bdnz .L122 + .align 4 + +.L123: + andi. r0, N, 2 + ble .L124 + + LFDUX c01, AO1, INC + LFDUX c03, AO2, INC + LFSDUX c01, AO1, INC + LFSDUX c03, AO2, INC + + STFPDUX c01, B2, INC2 + STFPDUX c03, B2, INC2 + .align 4 + +.L124: + andi. r0, N, 1 + ble .L130 + + LFDUX c01, AO1, INC + LFDUX c02, AO2, INC + + fsmfp c01, c02 + STFPDUX c01, B3, INC2 + .align 4 + +.L130: + andi. J, M, 1 + addi M4, M4, 4 * SIZE + ble .L999 + + mr AO1, A + sub B1, B, M4 + + srawi. r0, N, 2 + mtspr CTR, r0 + ble .L133 + .align 4 + +.L132: + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + + fsmfp c01, c02 + fsmfp c03, c04 + + STFPDUX c01, B1, M4 + STFPDUX c03, B1, INC2 + bdnz .L132 + .align 4 + +.L133: + andi. r0, N, 2 + ble .L134 + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + + fsmfp c01, c02 + STFPDUX c01, B2, INC2 + .align 4 + +.L134: + andi. 
r0, N, 1 + ble .L999 + + LFDX c01, AO1, INC + STFDX c01, B3, INC2 + .align 4 + +.L999: + addi SP, SP, -4 + + lwzu r25, 4(SP) + lwzu r26, 4(SP) + lwzu r27, 4(SP) + + lwzu r28, 4(SP) + lwzu r29, 4(SP) + lwzu r30, 4(SP) + lwzu r31, 4(SP) + + addi SP, SP, 4 + blr + EPILOGUE diff --git a/kernel/power/gemm_tcopy_hummer_8.S b/kernel/power/gemm_tcopy_hummer_8.S new file mode 100644 index 0000000000..5062f65366 --- /dev/null +++ b/kernel/power/gemm_tcopy_hummer_8.S @@ -0,0 +1,1285 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M r3 +#define N r4 +#define A r5 +#define LDA r6 +#define B r7 + +#define AO1 r8 +#define AO2 r9 +#define AO3 r10 +#define AO4 r11 + +#define J r12 + +#define B1 r21 +#define B2 r22 +#define B3 r23 +#define B4 r24 +#define M8 r25 + +#define AO5 r26 +#define AO6 r27 +#define AO7 r28 +#define AO8 r29 +#define INC r30 +#define INC2 r31 + +#define c01 f0 +#define c02 f1 +#define c03 f2 +#define c04 f3 +#define c05 f4 +#define c06 f5 +#define c07 f6 +#define c08 f7 +#define c09 f8 +#define c10 f9 +#define c11 f10 +#define c12 f11 +#define c13 f12 +#define c14 f13 +#define c15 f14 +#define c16 f15 + +#define c17 f16 +#define c18 f17 +#define c19 f18 +#define c20 f19 +#define c21 f20 +#define c22 f21 +#define c23 f22 +#define c24 f23 +#define c25 f24 +#define c26 f25 +#define c27 f26 +#define c28 f27 +#define c29 f28 +#define c30 f29 +#define c31 f30 +#define c32 f31 + +#define STACKSIZE 64 + + PROLOGUE + PROFCODE + + li r0, -16 + + stfpdux f14, SP, r0 + stfpdux f15, SP, r0 + stfpdux f16, SP, r0 + stfpdux f17, SP, r0 + stfpdux f18, SP, r0 + stfpdux f19, SP, r0 + stfpdux f20, SP, r0 + stfpdux f21, SP, r0 + stfpdux f22, SP, r0 + stfpdux f23, SP, r0 + stfpdux f24, SP, r0 + stfpdux f25, SP, r0 + stfpdux f26, SP, r0 + stfpdux f27, SP, r0 + stfpdux f28, SP, r0 + stfpdux f29, SP, r0 + stfpdux f30, SP, r0 + stfpdux f31, SP, r0 + + stwu r31, -4(SP) + stwu r30, -4(SP) + stwu r29, -4(SP) + stwu r28, -4(SP) + + stwu r27, -4(SP) + stwu r26, -4(SP) + stwu r25, -4(SP) + stwu r24, -4(SP) + + stwu r23, -4(SP) + stwu r22, -4(SP) + stwu r21, -4(SP) + + slwi LDA, LDA, BASE_SHIFT + slwi M8, M, 3 + BASE_SHIFT + + li r8, -8 + li r9, -4 + li r10, -2 + + and B2, N, r8 + and B3, N, r9 + and B4, N, r10 + + mullw B2, B2, M + mullw B3, B3, M + mullw B4, B4, M + + slwi B2, B2, BASE_SHIFT + slwi B3, B3, BASE_SHIFT + slwi B4, B4, BASE_SHIFT + + add B2, B2, B + add B3, B3, B + add B4, B4, B + + cmpwi cr0, M, 0 + ble- .L999 + cmpwi cr0, N, 0 + ble- .L999 + + subi B2, B2, 2 * SIZE + subi B3, B3, 2 * SIZE + subi B4, B4, 2 * SIZE + + subi M8, M8, 62 * SIZE + + li INC, 1 * SIZE + li INC2, 2 * SIZE + + andi. r0, A, 2 * SIZE - 1 + bne .L100 + andi. r0, LDA, 2 * SIZE - 1 + bne .L100 + + subi A, A, 2 * SIZE + srawi. J, M, 3 + ble .L20 + .align 4 + +.L10: + mr AO1, A + add AO2, A, LDA + add AO3, AO2, LDA + add AO4, AO3, LDA + add AO5, AO4, LDA + add AO6, AO5, LDA + add AO7, AO6, LDA + add AO8, AO7, LDA + add A, AO8, LDA + + sub B1, B, M8 + addi B, B, 64 * SIZE + + srawi. 
r0, N, 3 + mtspr CTR, r0 + ble .L15 + .align 4 + +.L12: + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO1, INC2 + LFPDUX c03, AO1, INC2 + LFPDUX c04, AO1, INC2 + + LFPDUX c05, AO2, INC2 + LFPDUX c06, AO2, INC2 + LFPDUX c07, AO2, INC2 + LFPDUX c08, AO2, INC2 + + LFPDUX c09, AO3, INC2 + LFPDUX c10, AO3, INC2 + LFPDUX c11, AO3, INC2 + LFPDUX c12, AO3, INC2 + + LFPDUX c13, AO4, INC2 + LFPDUX c14, AO4, INC2 + LFPDUX c15, AO4, INC2 + LFPDUX c16, AO4, INC2 + + LFPDUX c17, AO5, INC2 + LFPDUX c18, AO5, INC2 + LFPDUX c19, AO5, INC2 + LFPDUX c20, AO5, INC2 + + LFPDUX c21, AO6, INC2 + LFPDUX c22, AO6, INC2 + LFPDUX c23, AO6, INC2 + LFPDUX c24, AO6, INC2 + + LFPDUX c25, AO7, INC2 + LFPDUX c26, AO7, INC2 + LFPDUX c27, AO7, INC2 + LFPDUX c28, AO7, INC2 + + LFPDUX c29, AO8, INC2 + LFPDUX c30, AO8, INC2 + LFPDUX c31, AO8, INC2 + LFPDUX c32, AO8, INC2 + + STFPDUX c01, B1, M8 + STFPDUX c02, B1, INC2 + STFPDUX c03, B1, INC2 + STFPDUX c04, B1, INC2 + STFPDUX c05, B1, INC2 + STFPDUX c06, B1, INC2 + STFPDUX c07, B1, INC2 + STFPDUX c08, B1, INC2 + + STFPDUX c09, B1, INC2 + STFPDUX c10, B1, INC2 + STFPDUX c11, B1, INC2 + STFPDUX c12, B1, INC2 + STFPDUX c13, B1, INC2 + STFPDUX c14, B1, INC2 + STFPDUX c15, B1, INC2 + STFPDUX c16, B1, INC2 + + STFPDUX c17, B1, INC2 + STFPDUX c18, B1, INC2 + STFPDUX c19, B1, INC2 + STFPDUX c20, B1, INC2 + STFPDUX c21, B1, INC2 + STFPDUX c22, B1, INC2 + STFPDUX c23, B1, INC2 + STFPDUX c24, B1, INC2 + + STFPDUX c25, B1, INC2 + STFPDUX c26, B1, INC2 + STFPDUX c27, B1, INC2 + STFPDUX c28, B1, INC2 + STFPDUX c29, B1, INC2 + STFPDUX c30, B1, INC2 + STFPDUX c31, B1, INC2 + STFPDUX c32, B1, INC2 + bdnz .L12 + .align 4 + +.L15: + andi. r0, N, 7 + ble .L19 + + andi. r0, N, 4 + ble .L16 + + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO1, INC2 + LFPDUX c03, AO2, INC2 + LFPDUX c04, AO2, INC2 + + LFPDUX c05, AO3, INC2 + LFPDUX c06, AO3, INC2 + LFPDUX c07, AO4, INC2 + LFPDUX c08, AO4, INC2 + + LFPDUX c09, AO5, INC2 + LFPDUX c10, AO5, INC2 + LFPDUX c11, AO6, INC2 + LFPDUX c12, AO6, INC2 + + LFPDUX c13, AO7, INC2 + LFPDUX c14, AO7, INC2 + LFPDUX c15, AO8, INC2 + LFPDUX c16, AO8, INC2 + + STFPDUX c01, B2, INC2 + STFPDUX c02, B2, INC2 + STFPDUX c03, B2, INC2 + STFPDUX c04, B2, INC2 + STFPDUX c05, B2, INC2 + STFPDUX c06, B2, INC2 + STFPDUX c07, B2, INC2 + STFPDUX c08, B2, INC2 + STFPDUX c09, B2, INC2 + STFPDUX c10, B2, INC2 + STFPDUX c11, B2, INC2 + STFPDUX c12, B2, INC2 + STFPDUX c13, B2, INC2 + STFPDUX c14, B2, INC2 + STFPDUX c15, B2, INC2 + STFPDUX c16, B2, INC2 + .align 4 + +.L16: + andi. r0, N, 2 + ble .L17 + + LFPDUX c01, AO1, INC2 + LFPDUX c03, AO2, INC2 + LFPDUX c05, AO3, INC2 + LFPDUX c07, AO4, INC2 + + LFPDUX c09, AO5, INC2 + LFPDUX c11, AO6, INC2 + LFPDUX c13, AO7, INC2 + LFPDUX c15, AO8, INC2 + + STFPDUX c01, B3, INC2 + STFPDUX c03, B3, INC2 + STFPDUX c05, B3, INC2 + STFPDUX c07, B3, INC2 + STFPDUX c09, B3, INC2 + STFPDUX c11, B3, INC2 + STFPDUX c13, B3, INC2 + STFPDUX c15, B3, INC2 + .align 4 + +.L17: + andi. r0, N, 1 + ble .L19 + + LFDUX c01, AO1, INC2 + LFDUX c02, AO3, INC2 + LFDUX c03, AO5, INC2 + LFDUX c04, AO7, INC2 + + LFSDUX c01, AO2, INC2 + LFSDUX c02, AO4, INC2 + LFSDUX c03, AO6, INC2 + LFSDUX c04, AO8, INC2 + + STFPDUX c01, B4, INC2 + STFPDUX c02, B4, INC2 + STFPDUX c03, B4, INC2 + STFPDUX c04, B4, INC2 + .align 4 + +.L19: + addic. J, J, -1 + bgt .L10 + .align 4 + +.L20: + andi. J, M, 4 + addi M8, M8, 32 * SIZE + ble .L30 + + mr AO1, A + add AO2, A, LDA + add AO3, AO2, LDA + add AO4, AO3, LDA + add A, AO4, LDA + + sub B1, B, M8 + addi B, B, 32 * SIZE + + srawi. 
r0, N, 3 + mtspr CTR, r0 + ble .L25 + .align 4 + +.L22: + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO1, INC2 + LFPDUX c03, AO1, INC2 + LFPDUX c04, AO1, INC2 + + LFPDUX c05, AO2, INC2 + LFPDUX c06, AO2, INC2 + LFPDUX c07, AO2, INC2 + LFPDUX c08, AO2, INC2 + + LFPDUX c09, AO3, INC2 + LFPDUX c10, AO3, INC2 + LFPDUX c11, AO3, INC2 + LFPDUX c12, AO3, INC2 + + LFPDUX c13, AO4, INC2 + LFPDUX c14, AO4, INC2 + LFPDUX c15, AO4, INC2 + LFPDUX c16, AO4, INC2 + + STFPDUX c01, B1, M8 + STFPDUX c02, B1, INC2 + STFPDUX c03, B1, INC2 + STFPDUX c04, B1, INC2 + STFPDUX c05, B1, INC2 + STFPDUX c06, B1, INC2 + STFPDUX c07, B1, INC2 + STFPDUX c08, B1, INC2 + + STFPDUX c09, B1, INC2 + STFPDUX c10, B1, INC2 + STFPDUX c11, B1, INC2 + STFPDUX c12, B1, INC2 + STFPDUX c13, B1, INC2 + STFPDUX c14, B1, INC2 + STFPDUX c15, B1, INC2 + STFPDUX c16, B1, INC2 + bdnz .L22 + .align 4 + +.L25: + andi. r0, N, 7 + ble .L30 + + andi. r0, N, 4 + ble .L26 + + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO1, INC2 + LFPDUX c03, AO2, INC2 + LFPDUX c04, AO2, INC2 + + LFPDUX c05, AO3, INC2 + LFPDUX c06, AO3, INC2 + LFPDUX c07, AO4, INC2 + LFPDUX c08, AO4, INC2 + + STFPDUX c01, B2, INC2 + STFPDUX c02, B2, INC2 + STFPDUX c03, B2, INC2 + STFPDUX c04, B2, INC2 + STFPDUX c05, B2, INC2 + STFPDUX c06, B2, INC2 + STFPDUX c07, B2, INC2 + STFPDUX c08, B2, INC2 + .align 4 + +.L26: + andi. r0, N, 2 + ble .L27 + + LFPDUX c01, AO1, INC2 + LFPDUX c03, AO2, INC2 + LFPDUX c05, AO3, INC2 + LFPDUX c07, AO4, INC2 + + STFPDUX c01, B3, INC2 + STFPDUX c03, B3, INC2 + STFPDUX c05, B3, INC2 + STFPDUX c07, B3, INC2 + .align 4 + +.L27: + andi. r0, N, 1 + ble .L30 + + LFDUX c01, AO1, INC2 + LFDUX c02, AO2, INC2 + LFDUX c03, AO3, INC2 + LFDUX c04, AO4, INC2 + + fsmfp c01, c02 + fsmfp c03, c04 + + STFPDUX c01, B4, INC2 + STFPDUX c03, B4, INC2 + .align 4 + +.L30: + andi. J, M, 2 + addi M8, M8, 16 * SIZE + ble .L40 + + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + sub B1, B, M8 + addi B, B, 16 * SIZE + + srawi. r0, N, 3 + mtspr CTR, r0 + ble .L35 + .align 4 + +.L32: + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO1, INC2 + LFPDUX c03, AO1, INC2 + LFPDUX c04, AO1, INC2 + + LFPDUX c05, AO2, INC2 + LFPDUX c06, AO2, INC2 + LFPDUX c07, AO2, INC2 + LFPDUX c08, AO2, INC2 + + STFPDUX c01, B1, M8 + STFPDUX c02, B1, INC2 + STFPDUX c03, B1, INC2 + STFPDUX c04, B1, INC2 + STFPDUX c05, B1, INC2 + STFPDUX c06, B1, INC2 + STFPDUX c07, B1, INC2 + STFPDUX c08, B1, INC2 + bdnz .L32 + .align 4 + +.L35: + andi. r0, N, 7 + ble .L40 + + andi. r0, N, 4 + ble .L36 + + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO1, INC2 + LFPDUX c03, AO2, INC2 + LFPDUX c04, AO2, INC2 + + STFPDUX c01, B2, INC2 + STFPDUX c02, B2, INC2 + STFPDUX c03, B2, INC2 + STFPDUX c04, B2, INC2 + .align 4 + +.L36: + andi. r0, N, 2 + ble .L37 + + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO2, INC2 + + STFPDUX c01, B3, INC2 + STFPDUX c02, B3, INC2 + .align 4 + +.L37: + andi. r0, N, 1 + ble .L40 + + LFDUX c01, AO1, INC2 + LFDUX c02, AO2, INC2 + + fsmfp c01, c02 + STFPDUX c01, B4, INC2 + .align 4 + +.L40: + andi. J, M, 1 + addi M8, M8, 8 * SIZE + ble .L999 + + mr AO1, A + + sub B1, B, M8 + + srawi. r0, N, 3 + mtspr CTR, r0 + ble .L45 + .align 4 + +.L42: + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO1, INC2 + LFPDUX c03, AO1, INC2 + LFPDUX c04, AO1, INC2 + + STFPDUX c01, B1, M8 + STFPDUX c02, B1, INC2 + STFPDUX c03, B1, INC2 + STFPDUX c04, B1, INC2 + bdnz .L42 + .align 4 + +.L45: + andi. r0, N, 7 + ble .L999 + + andi. r0, N, 4 + ble .L46 + + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO1, INC2 + + STFPDUX c01, B2, INC2 + STFPDUX c02, B2, INC2 + .align 4 + +.L46: + andi. 
r0, N, 2 + ble .L47 + + LFPDUX c01, AO1, INC2 + STFPDUX c01, B3, INC2 + .align 4 + +.L47: + andi. r0, N, 1 + ble .L999 + + LFDX c01, AO1, INC2 + STFDX c01, B4, INC2 + b .L999 + .align 4 + + +.L100: + subi A, A, SIZE + srawi. J, M, 3 + ble .L120 + .align 4 + +.L110: + mr AO1, A + add AO2, A, LDA + add AO3, AO2, LDA + add AO4, AO3, LDA + add AO5, AO4, LDA + add AO6, AO5, LDA + add AO7, AO6, LDA + add AO8, AO7, LDA + add A, AO8, LDA + + sub B1, B, M8 + addi B, B, 64 * SIZE + + srawi. r0, N, 3 + mtspr CTR, r0 + ble .L115 + .align 4 + +.L112: + LFDUX c01, AO1, INC + LFDUX c05, AO2, INC + LFDUX c09, AO3, INC + LFDUX c13, AO4, INC + + LFSDUX c01, AO1, INC + LFSDUX c05, AO2, INC + LFSDUX c09, AO3, INC + LFSDUX c13, AO4, INC + + LFDUX c02, AO1, INC + LFDUX c06, AO2, INC + LFDUX c10, AO3, INC + LFDUX c14, AO4, INC + + LFSDUX c02, AO1, INC + LFSDUX c06, AO2, INC + LFSDUX c10, AO3, INC + LFSDUX c14, AO4, INC + + LFDUX c03, AO1, INC + LFDUX c07, AO2, INC + LFDUX c11, AO3, INC + LFDUX c15, AO4, INC + + LFSDUX c03, AO1, INC + LFSDUX c07, AO2, INC + LFSDUX c11, AO3, INC + LFSDUX c15, AO4, INC + + LFDUX c04, AO1, INC + LFDUX c08, AO2, INC + LFDUX c12, AO3, INC + LFDUX c16, AO4, INC + + LFSDUX c04, AO1, INC + LFSDUX c08, AO2, INC + LFSDUX c12, AO3, INC + LFSDUX c16, AO4, INC + + + LFDUX c17, AO5, INC + LFDUX c21, AO6, INC + LFDUX c25, AO7, INC + LFDUX c29, AO8, INC + + LFSDUX c17, AO5, INC + LFSDUX c21, AO6, INC + LFSDUX c25, AO7, INC + LFSDUX c29, AO8, INC + + LFDUX c18, AO5, INC + LFDUX c22, AO6, INC + LFDUX c26, AO7, INC + LFDUX c30, AO8, INC + + LFSDUX c18, AO5, INC + LFSDUX c22, AO6, INC + LFSDUX c26, AO7, INC + LFSDUX c30, AO8, INC + + LFDUX c19, AO5, INC + LFDUX c23, AO6, INC + LFDUX c27, AO7, INC + LFDUX c31, AO8, INC + + LFSDUX c19, AO5, INC + LFSDUX c23, AO6, INC + LFSDUX c27, AO7, INC + LFSDUX c31, AO8, INC + + LFDUX c20, AO5, INC + LFDUX c24, AO6, INC + LFDUX c28, AO7, INC + LFDUX c32, AO8, INC + + LFSDUX c20, AO5, INC + LFSDUX c24, AO6, INC + LFSDUX c28, AO7, INC + LFSDUX c32, AO8, INC + + STFPDUX c01, B1, M8 + STFPDUX c02, B1, INC2 + STFPDUX c03, B1, INC2 + STFPDUX c04, B1, INC2 + STFPDUX c05, B1, INC2 + STFPDUX c06, B1, INC2 + STFPDUX c07, B1, INC2 + STFPDUX c08, B1, INC2 + + STFPDUX c09, B1, INC2 + STFPDUX c10, B1, INC2 + STFPDUX c11, B1, INC2 + STFPDUX c12, B1, INC2 + STFPDUX c13, B1, INC2 + STFPDUX c14, B1, INC2 + STFPDUX c15, B1, INC2 + STFPDUX c16, B1, INC2 + + STFPDUX c17, B1, INC2 + STFPDUX c18, B1, INC2 + STFPDUX c19, B1, INC2 + STFPDUX c20, B1, INC2 + STFPDUX c21, B1, INC2 + STFPDUX c22, B1, INC2 + STFPDUX c23, B1, INC2 + STFPDUX c24, B1, INC2 + + STFPDUX c25, B1, INC2 + STFPDUX c26, B1, INC2 + STFPDUX c27, B1, INC2 + STFPDUX c28, B1, INC2 + STFPDUX c29, B1, INC2 + STFPDUX c30, B1, INC2 + STFPDUX c31, B1, INC2 + STFPDUX c32, B1, INC2 + bdnz .L112 + .align 4 + +.L115: + andi. r0, N, 7 + ble .L119 + + andi. 
r0, N, 4 + ble .L116 + + LFDUX c01, AO1, INC + LFDUX c03, AO2, INC + LFDUX c05, AO3, INC + LFDUX c07, AO4, INC + + LFSDUX c01, AO1, INC + LFSDUX c03, AO2, INC + LFSDUX c05, AO3, INC + LFSDUX c07, AO4, INC + + LFDUX c02, AO1, INC + LFDUX c04, AO2, INC + LFDUX c06, AO3, INC + LFDUX c08, AO4, INC + + LFSDUX c02, AO1, INC + LFSDUX c04, AO2, INC + LFSDUX c06, AO3, INC + LFSDUX c08, AO4, INC + + LFDUX c09, AO5, INC + LFDUX c11, AO6, INC + LFDUX c13, AO7, INC + LFDUX c15, AO8, INC + + LFSDUX c09, AO5, INC + LFSDUX c11, AO6, INC + LFSDUX c13, AO7, INC + LFSDUX c15, AO8, INC + + LFDUX c10, AO5, INC + LFDUX c12, AO6, INC + LFDUX c14, AO7, INC + LFDUX c16, AO8, INC + + LFSDUX c10, AO5, INC + LFSDUX c12, AO6, INC + LFSDUX c14, AO7, INC + LFSDUX c16, AO8, INC + + STFPDUX c01, B2, INC2 + STFPDUX c02, B2, INC2 + STFPDUX c03, B2, INC2 + STFPDUX c04, B2, INC2 + STFPDUX c05, B2, INC2 + STFPDUX c06, B2, INC2 + STFPDUX c07, B2, INC2 + STFPDUX c08, B2, INC2 + STFPDUX c09, B2, INC2 + STFPDUX c10, B2, INC2 + STFPDUX c11, B2, INC2 + STFPDUX c12, B2, INC2 + STFPDUX c13, B2, INC2 + STFPDUX c14, B2, INC2 + STFPDUX c15, B2, INC2 + STFPDUX c16, B2, INC2 + .align 4 + +.L116: + andi. r0, N, 2 + ble .L117 + + LFDUX c01, AO1, INC + LFDUX c03, AO2, INC + LFDUX c05, AO3, INC + LFDUX c07, AO4, INC + + LFSDUX c01, AO1, INC + LFSDUX c03, AO2, INC + LFSDUX c05, AO3, INC + LFSDUX c07, AO4, INC + + LFDUX c09, AO5, INC + LFDUX c11, AO6, INC + LFDUX c13, AO7, INC + LFDUX c15, AO8, INC + + LFSDUX c09, AO5, INC + LFSDUX c11, AO6, INC + LFSDUX c13, AO7, INC + LFSDUX c15, AO8, INC + + STFPDUX c01, B3, INC2 + STFPDUX c03, B3, INC2 + STFPDUX c05, B3, INC2 + STFPDUX c07, B3, INC2 + STFPDUX c09, B3, INC2 + STFPDUX c11, B3, INC2 + STFPDUX c13, B3, INC2 + STFPDUX c15, B3, INC2 + .align 4 + +.L117: + andi. r0, N, 1 + ble .L119 + + LFDUX c01, AO1, INC + LFDUX c02, AO3, INC + LFDUX c03, AO5, INC + LFDUX c04, AO7, INC + + LFSDUX c01, AO2, INC + LFSDUX c02, AO4, INC + LFSDUX c03, AO6, INC + LFSDUX c04, AO8, INC + + STFPDUX c01, B4, INC2 + STFPDUX c02, B4, INC2 + STFPDUX c03, B4, INC2 + STFPDUX c04, B4, INC2 + .align 4 + +.L119: + addic. J, J, -1 + bgt .L110 + .align 4 + +.L120: + andi. J, M, 4 + addi M8, M8, 32 * SIZE + ble .L130 + + mr AO1, A + add AO2, A, LDA + add AO3, AO2, LDA + add AO4, AO3, LDA + add A, AO4, LDA + + sub B1, B, M8 + addi B, B, 32 * SIZE + + srawi. 
r0, N, 3 + mtspr CTR, r0 + ble .L125 + .align 4 + +.L122: + LFDUX c01, AO1, INC + LFDUX c05, AO2, INC + LFDUX c09, AO3, INC + LFDUX c13, AO4, INC + + LFSDUX c01, AO1, INC + LFSDUX c05, AO2, INC + LFSDUX c09, AO3, INC + LFSDUX c13, AO4, INC + + LFDUX c02, AO1, INC + LFDUX c06, AO2, INC + LFDUX c10, AO3, INC + LFDUX c14, AO4, INC + + LFSDUX c02, AO1, INC + LFSDUX c06, AO2, INC + LFSDUX c10, AO3, INC + LFSDUX c14, AO4, INC + + LFDUX c03, AO1, INC + LFDUX c07, AO2, INC + LFDUX c11, AO3, INC + LFDUX c15, AO4, INC + + LFSDUX c03, AO1, INC + LFSDUX c07, AO2, INC + LFSDUX c11, AO3, INC + LFSDUX c15, AO4, INC + + LFDUX c04, AO1, INC + LFDUX c08, AO2, INC + LFDUX c12, AO3, INC + LFDUX c16, AO4, INC + + LFSDUX c04, AO1, INC + LFSDUX c08, AO2, INC + LFSDUX c12, AO3, INC + LFSDUX c16, AO4, INC + + STFPDUX c01, B1, M8 + STFPDUX c02, B1, INC2 + STFPDUX c03, B1, INC2 + STFPDUX c04, B1, INC2 + STFPDUX c05, B1, INC2 + STFPDUX c06, B1, INC2 + STFPDUX c07, B1, INC2 + STFPDUX c08, B1, INC2 + + STFPDUX c09, B1, INC2 + STFPDUX c10, B1, INC2 + STFPDUX c11, B1, INC2 + STFPDUX c12, B1, INC2 + STFPDUX c13, B1, INC2 + STFPDUX c14, B1, INC2 + STFPDUX c15, B1, INC2 + STFPDUX c16, B1, INC2 + bdnz .L122 + .align 4 + +.L125: + andi. r0, N, 7 + ble .L130 + + andi. r0, N, 4 + ble .L126 + + LFDUX c01, AO1, INC + LFDUX c03, AO2, INC + LFDUX c05, AO3, INC + LFDUX c07, AO4, INC + + LFSDUX c01, AO1, INC + LFSDUX c03, AO2, INC + LFSDUX c05, AO3, INC + LFSDUX c07, AO4, INC + + LFDUX c02, AO1, INC + LFDUX c04, AO2, INC + LFDUX c06, AO3, INC + LFDUX c08, AO4, INC + + LFSDUX c02, AO1, INC + LFSDUX c04, AO2, INC + LFSDUX c06, AO3, INC + LFSDUX c08, AO4, INC + + STFPDUX c01, B2, INC2 + STFPDUX c02, B2, INC2 + STFPDUX c03, B2, INC2 + STFPDUX c04, B2, INC2 + STFPDUX c05, B2, INC2 + STFPDUX c06, B2, INC2 + STFPDUX c07, B2, INC2 + STFPDUX c08, B2, INC2 + .align 4 + +.L126: + andi. r0, N, 2 + ble .L127 + + LFDUX c01, AO1, INC + LFDUX c03, AO2, INC + LFDUX c05, AO3, INC + LFDUX c07, AO4, INC + + LFSDUX c01, AO1, INC + LFSDUX c03, AO2, INC + LFSDUX c05, AO3, INC + LFSDUX c07, AO4, INC + + STFPDUX c01, B3, INC2 + STFPDUX c03, B3, INC2 + STFPDUX c05, B3, INC2 + STFPDUX c07, B3, INC2 + .align 4 + +.L127: + andi. r0, N, 1 + ble .L130 + + LFDUX c01, AO1, INC + LFDUX c02, AO2, INC + LFDUX c03, AO3, INC + LFDUX c04, AO4, INC + + fsmfp c01, c02 + fsmfp c03, c04 + + STFPDUX c01, B4, INC2 + STFPDUX c03, B4, INC2 + .align 4 + +.L130: + andi. J, M, 2 + addi M8, M8, 16 * SIZE + ble .L140 + + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + sub B1, B, M8 + addi B, B, 16 * SIZE + + srawi. r0, N, 3 + mtspr CTR, r0 + ble .L135 + .align 4 + +.L132: + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + LFDUX c05, AO1, INC + LFDUX c06, AO1, INC + LFDUX c07, AO1, INC + LFDUX c08, AO1, INC + + LFDUX c09, AO2, INC + LFDUX c10, AO2, INC + LFDUX c11, AO2, INC + LFDUX c12, AO2, INC + LFDUX c13, AO2, INC + LFDUX c14, AO2, INC + LFDUX c15, AO2, INC + LFDUX c16, AO2, INC + + fsmfp c01, c02 + fsmfp c03, c04 + fsmfp c05, c06 + fsmfp c07, c08 + fsmfp c09, c10 + fsmfp c11, c12 + fsmfp c13, c14 + fsmfp c15, c16 + + STFPDUX c01, B1, M8 + STFPDUX c03, B1, INC2 + STFPDUX c05, B1, INC2 + STFPDUX c07, B1, INC2 + STFPDUX c09, B1, INC2 + STFPDUX c11, B1, INC2 + STFPDUX c13, B1, INC2 + STFPDUX c15, B1, INC2 + bdnz .L132 + .align 4 + +.L135: + andi. r0, N, 7 + ble .L140 + + andi. 
r0, N, 4 + ble .L136 + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + + LFDUX c09, AO2, INC + LFDUX c10, AO2, INC + LFDUX c11, AO2, INC + LFDUX c12, AO2, INC + + fsmfp c01, c02 + fsmfp c03, c04 + fsmfp c09, c10 + fsmfp c11, c12 + + STFPDUX c01, B2, INC2 + STFPDUX c03, B2, INC2 + STFPDUX c09, B2, INC2 + STFPDUX c11, B2, INC2 + .align 4 + +.L136: + andi. r0, N, 2 + ble .L137 + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c09, AO2, INC + LFDUX c10, AO2, INC + + fsmfp c01, c02 + fsmfp c09, c10 + + STFPDUX c01, B3, INC2 + STFPDUX c09, B3, INC2 + .align 4 + +.L137: + andi. r0, N, 1 + ble .L140 + + LFDUX c01, AO1, INC + LFDUX c02, AO2, INC + + fsmfp c01, c02 + STFPDUX c01, B4, INC2 + .align 4 + +.L140: + andi. J, M, 1 + addi M8, M8, 8 * SIZE + ble .L999 + + mr AO1, A + + sub B1, B, M8 + + srawi. r0, N, 3 + mtspr CTR, r0 + ble .L145 + .align 4 + +.L142: + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + LFDUX c05, AO1, INC + LFDUX c06, AO1, INC + LFDUX c07, AO1, INC + LFDUX c08, AO1, INC + + fsmfp c01, c02 + fsmfp c03, c04 + fsmfp c05, c06 + fsmfp c07, c08 + + STFPDUX c01, B1, M8 + STFPDUX c03, B1, INC2 + STFPDUX c05, B1, INC2 + STFPDUX c07, B1, INC2 + bdnz .L142 + .align 4 + +.L145: + andi. r0, N, 7 + ble .L999 + + andi. r0, N, 4 + ble .L146 + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + + fsmfp c01, c02 + fsmfp c03, c04 + + STFPDUX c01, B2, INC2 + STFPDUX c03, B2, INC2 + .align 4 + +.L146: + andi. r0, N, 2 + ble .L147 + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + + fsmfp c01, c02 + + STFPDUX c01, B3, INC2 + .align 4 + +.L147: + andi. r0, N, 1 + ble .L999 + + LFDX c01, AO1, INC + STFDX c01, B4, INC2 + .align 4 + +.L999: + addi SP, SP, -4 + + lwzu r21, 4(SP) + lwzu r22, 4(SP) + lwzu r23, 4(SP) + + lwzu r24, 4(SP) + lwzu r25, 4(SP) + lwzu r26, 4(SP) + lwzu r27, 4(SP) + + lwzu r28, 4(SP) + lwzu r29, 4(SP) + lwzu r30, 4(SP) + lwzu r31, 4(SP) + + subi SP, SP, 12 + li r0, 16 + + lfpdux f31, SP, r0 + lfpdux f30, SP, r0 + lfpdux f29, SP, r0 + lfpdux f28, SP, r0 + lfpdux f27, SP, r0 + lfpdux f26, SP, r0 + lfpdux f25, SP, r0 + lfpdux f24, SP, r0 + lfpdux f23, SP, r0 + lfpdux f22, SP, r0 + lfpdux f21, SP, r0 + lfpdux f20, SP, r0 + lfpdux f19, SP, r0 + lfpdux f18, SP, r0 + lfpdux f17, SP, r0 + lfpdux f16, SP, r0 + lfpdux f15, SP, r0 + lfpdux f14, SP, r0 + addi SP, SP, 16 + blr + EPILOGUE diff --git a/kernel/power/gemv_hummer_n.S b/kernel/power/gemv_hummer_n.S new file mode 100644 index 0000000000..a9340bebe2 --- /dev/null +++ b/kernel/power/gemv_hummer_n.S @@ -0,0 +1,1780 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M r3 +#define N r4 +#define A r6 +#define LDA r7 +#define X r8 +#define INCX r9 +#define Y r10 +#define INCY r5 + +#define I r11 +#define J r12 + +#define INCY2 r24 +#define A1 r25 +#define A2 r26 +#define A3 r27 +#define A4 r28 + +#define YL r29 +#define YS r30 +#define INC2 r31 + +#define yl1 f0 +#define yl2 f2 +#define yl3 f3 +#define yl4 f4 +#define ys1 f5 +#define ys2 f6 +#define ys3 f7 +#define ys4 f8 +#define yl5 f27 +#define ys5 f28 + +#define alpha1 f9 +#define alpha2 f10 + +#define a1 f11 +#define a2 f12 +#define a3 f13 +#define a4 f14 +#define a5 f15 +#define a6 f16 +#define a7 f17 +#define a8 f18 + +#define a9 f19 +#define a10 f20 +#define a11 f21 +#define a12 f22 +#define a13 f23 +#define a14 f24 +#define a15 f25 +#define a16 f26 + +#define alpha f1 + + PROLOGUE + PROFCODE + + li r0, -16 + lwz INCY, 8(SP) + + stfpdux f14, SP, r0 + stfpdux f15, SP, r0 + stfpdux f16, SP, r0 + stfpdux f17, SP, r0 + stfpdux f18, SP, r0 + stfpdux f19, SP, r0 + stfpdux f20, SP, r0 + stfpdux f21, SP, r0 + stfpdux f22, SP, r0 + stfpdux f23, SP, r0 + stfpdux f24, SP, r0 + stfpdux f25, SP, r0 + stfpdux f26, SP, r0 + stfpdux f27, SP, r0 + stfpdux f28, SP, r0 + stfpdux f29, SP, r0 + stfpdux f30, SP, r0 + stfpdux f31, SP, r0 + + stwu r31, -4(SP) + stwu r30, -4(SP) + stwu r29, -4(SP) + stwu r28, -4(SP) + + stwu r27, -4(SP) + stwu r26, -4(SP) + stwu r25, -4(SP) + stwu r24, -4(SP) + + stwu r23, -4(SP) + stwu r22, -4(SP) + stwu r21, -4(SP) + stwu r20, -4(SP) + + stwu r19, -4(SP) + stwu r18, -4(SP) + stwu r17, -4(SP) + stwu r16, -4(SP) + + slwi LDA, LDA, BASE_SHIFT + slwi INCX, INCX, BASE_SHIFT + slwi INCY, INCY, BASE_SHIFT + + fsmfp alpha, alpha + + cmpwi cr0, M, 0 + ble- .L999 + cmpwi cr0, N, 0 + ble- .L999 + + add INCY2, INCY, INCY + li INC2, 2 * SIZE + sub X, X, INCX + + andi. r0, A, 2 * SIZE - 1 +# bne .L100 + +# All cases for aligned A, even LDA + + cmpwi cr0, INCY, SIZE + bne .L70 + + andi. r0, Y, 2 * SIZE - 1 + bne .L40 + +# A : aligned LDA : even Y : Unit Aligned + + sub A, A, INC2 + sub Y, Y, INCY2 + + srawi. 
J, N, 2 + ble .L20 + .align 4 + +.L11: + LFDUX alpha1, X, INCX + mr A1, A + add A2, A, LDA + add A3, A2, LDA + LFSDUX alpha1, X, INCX + LFDUX alpha2, X, INCX + add A4, A3, LDA + add A, A4, LDA + mr YL, Y + LFSDUX alpha2, X, INCX + fpmul alpha1, alpha, alpha1 + mr YS, Y + srawi. r0, M, 3 + mtspr CTR, r0 + fpmul alpha2, alpha, alpha2 + ble .L15 + + LFPDUX yl1, YL, INCY2 + LFPDUX yl2, YL, INCY2 + LFPDUX yl3, YL, INCY2 + LFPDUX yl4, YL, INCY2 + + LFPDUX a1, A1, INC2 + LFPDUX a5, A1, INC2 + LFPDUX a9, A1, INC2 + LFPDUX a13, A1, INC2 + + LFPDUX a2, A2, INC2 + LFPDUX a6, A2, INC2 + LFPDUX a10, A2, INC2 + LFPDUX a14, A2, INC2 + + LFPDUX a3, A3, INC2 + LFPDUX a7, A3, INC2 + LFPDUX a11, A3, INC2 + LFPDUX a15, A3, INC2 + + LFPDUX a4, A4, INC2 + fxcpmadd ys1, alpha1, a1, yl1 + LFPDUX a8, A4, INC2 + fxcpmadd ys2, alpha1, a5, yl2 + LFPDUX a12, A4, INC2 + fxcpmadd ys3, alpha1, a9, yl3 + LFPDUX a16, A4, INC2 + fxcpmadd ys4, alpha1, a13, yl4 + bdz .L13 + .align 4 + +.L12: + LFPDUX yl1, YL, INCY2 + + fxcsmadd ys1, alpha1, a2, ys1 + LFPDUX a1, A1, INC2 + fxcsmadd ys2, alpha1, a6, ys2 + LFPDUX a5, A1, INC2 + fxcsmadd ys3, alpha1, a10, ys3 + LFPDUX a9, A1, INC2 + fxcsmadd ys4, alpha1, a14, ys4 + LFPDUX a13, A1, INC2 + + LFPDUX yl2, YL, INCY2 + + fxcpmadd ys1, alpha2, a3, ys1 + LFPDUX a2, A2, INC2 + fxcpmadd ys2, alpha2, a7, ys2 + LFPDUX a6, A2, INC2 + fxcpmadd ys3, alpha2, a11, ys3 + LFPDUX a10, A2, INC2 + fxcpmadd ys4, alpha2, a15, ys4 + LFPDUX a14, A2, INC2 + + LFPDUX yl3, YL, INCY2 + + fxcsmadd ys1, alpha2, a4, ys1 + LFPDUX a3, A3, INC2 + fxcsmadd ys2, alpha2, a8, ys2 + LFPDUX a7, A3, INC2 + fxcsmadd ys3, alpha2, a12, ys3 + LFPDUX a11, A3, INC2 + fxcsmadd ys4, alpha2, a16, ys4 + LFPDUX a15, A3, INC2 + + LFPDUX yl4, YL, INCY2 + + STFPDUX ys1, YS, INCY2 + STFPDUX ys2, YS, INCY2 + STFPDUX ys3, YS, INCY2 + STFPDUX ys4, YS, INCY2 + + LFPDUX a4, A4, INC2 + fxcpmadd ys1, alpha1, a1, yl1 + LFPDUX a8, A4, INC2 + fxcpmadd ys2, alpha1, a5, yl2 + LFPDUX a12, A4, INC2 + fxcpmadd ys3, alpha1, a9, yl3 + LFPDUX a16, A4, INC2 + fxcpmadd ys4, alpha1, a13, yl4 + bdnz .L12 + .align 4 + +.L13: + fxcsmadd ys1, alpha1, a2, ys1 + fxcsmadd ys2, alpha1, a6, ys2 + fxcsmadd ys3, alpha1, a10, ys3 + fxcsmadd ys4, alpha1, a14, ys4 + + fxcpmadd ys1, alpha2, a3, ys1 + fxcpmadd ys2, alpha2, a7, ys2 + fxcpmadd ys3, alpha2, a11, ys3 + fxcpmadd ys4, alpha2, a15, ys4 + + fxcsmadd ys1, alpha2, a4, ys1 + fxcsmadd ys2, alpha2, a8, ys2 + fxcsmadd ys3, alpha2, a12, ys3 + fxcsmadd ys4, alpha2, a16, ys4 + + STFPDUX ys1, YS, INCY2 + STFPDUX ys2, YS, INCY2 + STFPDUX ys3, YS, INCY2 + STFPDUX ys4, YS, INCY2 + .align 4 + +.L15: + andi. r0, M, 7 + ble .L19 + + andi. r0, M, 4 + ble .L17 + + LFPDUX yl1, YL, INCY2 + LFPDUX a1, A1, INC2 + LFPDUX yl2, YL, INCY2 + LFPDUX a5, A1, INC2 + + LFPDUX a2, A2, INC2 + LFPDUX a6, A2, INC2 + LFPDUX a3, A3, INC2 + LFPDUX a7, A3, INC2 + + LFPDUX a4, A4, INC2 + LFPDUX a8, A4, INC2 + + fxcpmadd ys1, alpha1, a1, yl1 + fxcpmadd ys2, alpha1, a5, yl2 + fxcsmadd ys1, alpha1, a2, ys1 + fxcsmadd ys2, alpha1, a6, ys2 + + fxcpmadd ys1, alpha2, a3, ys1 + fxcpmadd ys2, alpha2, a7, ys2 + fxcsmadd ys1, alpha2, a4, ys1 + fxcsmadd ys2, alpha2, a8, ys2 + + STFPDUX ys1, YS, INCY2 + STFPDUX ys2, YS, INCY2 + .align 4 + +.L17: + andi. 
r0, M, 2 + ble .L18 + + LFPDUX yl1, YL, INCY2 + + LFPDUX a1, A1, INC2 + LFPDUX a2, A2, INC2 + LFPDUX a3, A3, INC2 + LFPDUX a4, A4, INC2 + + fxcpmadd ys1, alpha1, a1, yl1 + fxcsmadd ys1, alpha1, a2, ys1 + fxcpmadd ys1, alpha2, a3, ys1 + fxcsmadd ys1, alpha2, a4, ys1 + + STFPDUX ys1, YS, INCY2 + .align 4 + +.L18: + andi. r0, M, 1 + ble .L19 + + LFDUX yl1, YL, INCY2 + + LFDUX a1, A1, INC2 + LFDUX a2, A2, INC2 + LFDUX a3, A3, INC2 + LFDUX a4, A4, INC2 + + fxcpmadd ys1, alpha1, a1, yl1 + fxcsmadd ys1, alpha1, a2, ys1 + fxcpmadd ys1, alpha2, a3, ys1 + fxcsmadd ys1, alpha2, a4, ys1 + + STFDUX ys1, YS, INCY2 + .align 4 + +.L19: + addi J, J, -1 + cmpi cr0, 0, J, 0 + bgt .L11 + .align 4 + +.L20: + andi. J, N, 2 + ble .L30 + + LFDUX alpha1, X, INCX + + mr A1, A + add A2, A, LDA + add A, A2, LDA + LFSDUX alpha1, X, INCX + + mr YL, Y + mr YS, Y + fpmul alpha1, alpha, alpha1 + + srawi. r0, M, 3 + mtspr CTR, r0 + ble .L25 + + LFPDUX yl1, YL, INCY2 + LFPDUX a1, A1, INC2 + LFPDUX yl2, YL, INCY2 + LFPDUX a5, A1, INC2 + + LFPDUX yl3, YL, INCY2 + LFPDUX a9, A1, INC2 + LFPDUX yl4, YL, INCY2 + LFPDUX a13, A1, INC2 + + LFPDUX a2, A2, INC2 + LFPDUX a6, A2, INC2 + LFPDUX a10, A2, INC2 + LFPDUX a14, A2, INC2 + bdz .L23 + .align 4 + +.L22: + fxcpmadd ys1, alpha1, a1, yl1 + LFPDUX a1, A1, INC2 + LFPDUX yl1, YL, INCY2 + fxcpmadd ys2, alpha1, a5, yl2 + LFPDUX a5, A1, INC2 + LFPDUX yl2, YL, INCY2 + fxcpmadd ys3, alpha1, a9, yl3 + LFPDUX a9, A1, INC2 + LFPDUX yl3, YL, INCY2 + fxcpmadd ys4, alpha1, a13, yl4 + LFPDUX a13, A1, INC2 + LFPDUX yl4, YL, INCY2 + + fxcsmadd ys1, alpha1, a2, ys1 + LFPDUX a2, A2, INC2 + fxcsmadd ys2, alpha1, a6, ys2 + LFPDUX a6, A2, INC2 + fxcsmadd ys3, alpha1, a10, ys3 + LFPDUX a10, A2, INC2 + fxcsmadd ys4, alpha1, a14, ys4 + LFPDUX a14, A2, INC2 + + STFPDUX ys1, YS, INCY2 + STFPDUX ys2, YS, INCY2 + STFPDUX ys3, YS, INCY2 + STFPDUX ys4, YS, INCY2 + bdnz .L22 + .align 4 + +.L23: + fxcpmadd ys1, alpha1, a1, yl1 + fxcpmadd ys2, alpha1, a5, yl2 + fxcpmadd ys3, alpha1, a9, yl3 + fxcpmadd ys4, alpha1, a13, yl4 + + fxcsmadd ys1, alpha1, a2, ys1 + fxcsmadd ys2, alpha1, a6, ys2 + fxcsmadd ys3, alpha1, a10, ys3 + fxcsmadd ys4, alpha1, a14, ys4 + + STFPDUX ys1, YS, INCY2 + STFPDUX ys2, YS, INCY2 + STFPDUX ys3, YS, INCY2 + STFPDUX ys4, YS, INCY2 + .align 4 + +.L25: + andi. r0, M, 7 + ble .L30 + + andi. r0, M, 4 + ble .L27 + + LFPDUX yl1, YL, INCY2 + LFPDUX a1, A1, INC2 + LFPDUX a2, A2, INC2 + + LFPDUX yl2, YL, INCY2 + LFPDUX a5, A1, INC2 + LFPDUX a6, A2, INC2 + + fxcpmadd ys1, alpha1, a1, yl1 + fxcsmadd ys1, alpha1, a2, ys1 + fxcpmadd ys2, alpha1, a5, yl2 + fxcsmadd ys2, alpha1, a6, ys2 + + STFPDUX ys1, YS, INCY2 + STFPDUX ys2, YS, INCY2 + .align 4 + +.L27: + andi. r0, M, 2 + ble .L28 + + LFPDUX yl1, YL, INCY2 + LFPDUX a1, A1, INC2 + LFPDUX a2, A2, INC2 + + fxcpmadd ys1, alpha1, a1, yl1 + fxcsmadd ys1, alpha1, a2, ys1 + + STFPDUX ys1, YS, INCY2 + .align 4 + +.L28: + andi. r0, M, 1 + ble .L30 + + LFDUX yl1, YL, INCY2 + LFDUX a1, A1, INC2 + LFDUX a2, A2, INC2 + + fxcpmadd ys1, alpha1, a1, yl1 + fxcsmadd ys1, alpha1, a2, ys1 + + STFDUX ys1, YS, INCY2 + .align 4 + +.L30: + andi. J, N, 1 + ble .L999 + + LFDUX alpha1, X, INCX + + mr A1, A + mr YL, Y + mr YS, Y + fmul alpha1, alpha, alpha1 + + srawi. 
r0, M, 3 + mtspr CTR, r0 + ble .L35 + + LFPDUX yl1, YL, INCY2 + LFPDUX a1, A1, INC2 + LFPDUX yl2, YL, INCY2 + LFPDUX a5, A1, INC2 + + LFPDUX yl3, YL, INCY2 + LFPDUX a9, A1, INC2 + LFPDUX yl4, YL, INCY2 + LFPDUX a13, A1, INC2 + bdz .L33 + .align 4 + +.L32: + fxcpmadd ys1, alpha1, a1, yl1 + LFPDUX yl1, YL, INCY2 + LFPDUX a1, A1, INC2 + fxcpmadd ys2, alpha1, a5, yl2 + LFPDUX yl2, YL, INCY2 + LFPDUX a5, A1, INC2 + fxcpmadd ys3, alpha1, a9, yl3 + LFPDUX yl3, YL, INCY2 + LFPDUX a9, A1, INC2 + fxcpmadd ys4, alpha1, a13, yl4 + LFPDUX yl4, YL, INCY2 + LFPDUX a13, A1, INC2 + + STFPDUX ys1, YS, INCY2 + STFPDUX ys2, YS, INCY2 + STFPDUX ys3, YS, INCY2 + STFPDUX ys4, YS, INCY2 + bdnz .L32 + .align 4 + +.L33: + fxcpmadd ys1, alpha1, a1, yl1 + fxcpmadd ys2, alpha1, a5, yl2 + fxcpmadd ys3, alpha1, a9, yl3 + fxcpmadd ys4, alpha1, a13, yl4 + + STFPDUX ys1, YS, INCY2 + STFPDUX ys2, YS, INCY2 + STFPDUX ys3, YS, INCY2 + STFPDUX ys4, YS, INCY2 + .align 4 + +.L35: + andi. r0, M, 7 + ble .L999 + + andi. r0, M, 4 + ble .L37 + + LFPDUX yl1, YL, INCY2 + LFPDUX a1, A1, INC2 + + LFPDUX yl2, YL, INCY2 + LFPDUX a5, A1, INC2 + + fxcpmadd ys1, alpha1, a1, yl1 + fxcpmadd ys2, alpha1, a5, yl2 + + STFPDUX ys1, YS, INCY2 + STFPDUX ys2, YS, INCY2 + .align 4 + +.L37: + andi. r0, M, 2 + ble .L38 + + LFPDUX yl1, YL, INCY2 + LFPDUX a1, A1, INC2 + + fxcpmadd ys1, alpha1, a1, yl1 + + STFPDUX ys1, YS, INCY2 + .align 4 + +.L38: + andi. r0, M, 1 + ble .L999 + + LFDUX yl1, YL, INCY2 + LFDUX a1, A1, INC2 + + fxcpmadd ys1, alpha1, a1, yl1 + + STFDUX ys1, YS, INCY2 + b .L999 + .align 4 + +.L40: +# A : aligned LDA : even Y : Unaligned + + sub A, A, INC2 + sub Y, Y, INCY + + srawi. J, N, 2 + ble .L50 + .align 4 + +.L41: + LFDUX alpha1, X, INCX + LFSDUX alpha1, X, INCX + LFDUX alpha2, X, INCX + LFSDUX alpha2, X, INCX + + fpmul alpha1, alpha, alpha1 + fpmul alpha2, alpha, alpha2 + + mr A1, A + add A2, A, LDA + add A3, A2, LDA + add A4, A3, LDA + add A, A4, LDA + + mr YL, Y + sub YS, Y, INCY2 + + LFSDX ys1, YS, INCY2 + LFDX yl1, YL, INCY + + srawi. 
r0, M, 3 + mtspr CTR, r0 + ble .L45 + + LFPDUX a1, A1, INC2 + LFPDUX a5, A1, INC2 + LFPDUX a9, A1, INC2 + LFPDUX a13, A1, INC2 + + LFXDUX yl2, YL, INCY2 + LFXDUX yl3, YL, INCY2 + LFXDUX yl4, YL, INCY2 + LFXDUX yl5, YL, INCY2 + + LFPDUX a2, A2, INC2 + LFPDUX a6, A2, INC2 + LFPDUX a10, A2, INC2 + LFPDUX a14, A2, INC2 + + LFPDUX a3, A3, INC2 + LFPDUX a7, A3, INC2 + LFPDUX a11, A3, INC2 + LFPDUX a15, A3, INC2 + + LFPDUX a4, A4, INC2 + fsmr yl1, yl2 + LFPDUX a8, A4, INC2 + fsmr yl2, yl3 + LFPDUX a12, A4, INC2 + fsmr yl3, yl4 + LFPDUX a16, A4, INC2 + fsmr yl4, yl5 + bdz .L43 + .align 4 + +.L42: + fxcpmadd ys2, alpha1, a1, yl1 + LFPDUX a1, A1, INC2 + fxcpmadd ys3, alpha1, a5, yl2 + LFPDUX a5, A1, INC2 + fxcpmadd ys4, alpha1, a9, yl3 + LFPDUX a9, A1, INC2 + fxcpmadd ys5, alpha1, a13, yl4 + LFPDUX a13, A1, INC2 + + fxcsmadd ys2, alpha1, a2, ys2 + LFPDUX a2, A2, INC2 + fxcsmadd ys3, alpha1, a6, ys3 + LFPDUX a6, A2, INC2 + fxcsmadd ys4, alpha1, a10, ys4 + LFPDUX a10, A2, INC2 + fxcsmadd ys5, alpha1, a14, ys5 + LFPDUX a14, A2, INC2 + + fxcpmadd ys2, alpha2, a3, ys2 + LFPDUX a3, A3, INC2 + fxcpmadd ys3, alpha2, a7, ys3 + LFPDUX a7, A3, INC2 + fxcpmadd ys4, alpha2, a11, ys4 + LFPDUX a11, A3, INC2 + fxcpmadd ys5, alpha2, a15, ys5 + LFPDUX a15, A3, INC2 + + fxcsmadd ys2, alpha2, a4, ys2 + LFPDUX a4, A4, INC2 + fxcsmadd ys3, alpha2, a8, ys3 + LFPDUX a8, A4, INC2 + fxcsmadd ys4, alpha2, a12, ys4 + LFPDUX a12, A4, INC2 + fxcsmadd ys5, alpha2, a16, ys5 + LFPDUX a16, A4, INC2 + + fmr yl1, yl5 + LFXDUX yl2, YL, INCY2 + fmr ys1, ys2 + LFXDUX yl3, YL, INCY2 + fmr ys2, ys3 + LFXDUX yl4, YL, INCY2 + fmr ys3, ys4 + LFXDUX yl5, YL, INCY2 + fmr ys4, ys5 + + STFXDUX ys1, YS, INCY2 + fsmr ys1, ys5 + STFXDUX ys2, YS, INCY2 + fsmr yl1, yl2 + STFXDUX ys3, YS, INCY2 + fsmr yl2, yl3 + STFXDUX ys4, YS, INCY2 + fsmr yl3, yl4 + + fsmr yl4, yl5 + bdnz .L42 + .align 4 + +.L43: + fxcpmadd ys2, alpha1, a1, yl1 + fxcpmadd ys3, alpha1, a5, yl2 + fxcpmadd ys4, alpha1, a9, yl3 + fxcpmadd ys5, alpha1, a13, yl4 + + fxcsmadd ys2, alpha1, a2, ys2 + fxcsmadd ys3, alpha1, a6, ys3 + fxcsmadd ys4, alpha1, a10, ys4 + fxcsmadd ys5, alpha1, a14, ys5 + + fxcpmadd ys2, alpha2, a3, ys2 + fxcpmadd ys3, alpha2, a7, ys3 + fxcpmadd ys4, alpha2, a11, ys4 + fxcpmadd ys5, alpha2, a15, ys5 + + fxcsmadd ys2, alpha2, a4, ys2 + fxcsmadd ys3, alpha2, a8, ys3 + fxcsmadd ys4, alpha2, a12, ys4 + fxcsmadd ys5, alpha2, a16, ys5 + + fmr ys1, ys2 + fmr ys2, ys3 + fmr ys3, ys4 + fmr ys4, ys5 + fmr yl1, yl5 + + STFXDUX ys1, YS, INCY2 + fsmr ys1, ys5 + STFXDUX ys2, YS, INCY2 + STFXDUX ys3, YS, INCY2 + STFXDUX ys4, YS, INCY2 + .align 4 + +.L45: + andi. r0, M, 7 + ble .L48 + + andi. r0, M, 4 + ble .L46 + + LFXDUX yl2, YL, INCY2 + LFXDUX yl3, YL, INCY2 + + LFPDUX a1, A1, INC2 + LFPDUX a5, A1, INC2 + + LFPDUX a2, A2, INC2 + LFPDUX a6, A2, INC2 + LFPDUX a3, A3, INC2 + LFPDUX a7, A3, INC2 + + LFPDUX a4, A4, INC2 + fsmr yl1, yl2 + LFPDUX a8, A4, INC2 + fsmr yl2, yl3 + + fxcpmadd ys2, alpha1, a1, yl1 + fxcpmadd ys3, alpha1, a5, yl2 + fxcsmadd ys2, alpha1, a2, ys2 + fxcsmadd ys3, alpha1, a6, ys3 + + fxcpmadd ys2, alpha2, a3, ys2 + fxcpmadd ys3, alpha2, a7, ys3 + fxcsmadd ys2, alpha2, a4, ys2 + fxcsmadd ys3, alpha2, a8, ys3 + + fmr yl1, yl3 + fmr ys1, ys2 + fmr ys2, ys3 + + STFXDUX ys1, YS, INCY2 + fsmr ys1, ys3 + STFXDUX ys2, YS, INCY2 + .align 4 + +.L46: + andi. 
r0, M, 2 + ble .L47 + + LFXDUX yl2, YL, INCY2 + + LFPDUX a1, A1, INC2 + LFPDUX a2, A2, INC2 + LFPDUX a3, A3, INC2 + LFPDUX a4, A4, INC2 + + fsmr yl1, yl2 + fxcpmadd ys2, alpha1, a1, yl1 + fxcsmadd ys2, alpha1, a2, ys2 + fxcpmadd ys2, alpha2, a3, ys2 + fxcsmadd ys2, alpha2, a4, ys2 + fmr yl1, yl2 + + fmr ys1, ys2 + STFXDUX ys1, YS, INCY2 + fsmr ys1, ys2 + .align 4 + +.L47: + andi. r0, M, 1 + ble .L48 + + LFDUX a1, A1, INC2 + LFDUX a2, A2, INC2 + LFDUX a3, A3, INC2 + LFDUX a4, A4, INC2 + + fxcpmadd ys2, alpha1, a1, yl1 + fxcsmadd ys2, alpha1, a2, ys2 + fxcpmadd ys2, alpha2, a3, ys2 + fxcsmadd ys2, alpha2, a4, ys2 + + STFSDX ys1, YS, INCY2 + add YS, YS, INCY + STFDX ys2, YS, INCY2 + b .L49 + .align 4 + +.L48: + STFSDUX ys1, YS, INCY2 + .align 4 + +.L49: + addi J, J, -1 + cmpi cr0, 0, J, 0 + bgt .L41 + .align 4 + +.L50: + andi. J, N, 2 + ble .L60 + + LFDUX alpha1, X, INCX + + mr A1, A + add A2, A, LDA + add A, A2, LDA + LFSDUX alpha1, X, INCX + + mr YL, Y + sub YS, Y, INCY2 + fpmul alpha1, alpha, alpha1 + + LFSDX ys1, YS, INCY2 + LFDX yl1, YL, INCY + + srawi. r0, M, 3 + mtspr CTR, r0 + ble .L55 + + LFPDUX a1, A1, INC2 + LFPDUX a5, A1, INC2 + LFPDUX a9, A1, INC2 + LFPDUX a13, A1, INC2 + + LFXDUX yl2, YL, INCY2 + LFXDUX yl3, YL, INCY2 + LFXDUX yl4, YL, INCY2 + LFXDUX yl5, YL, INCY2 + + LFPDUX a2, A2, INC2 + fsmr yl1, yl2 + LFPDUX a6, A2, INC2 + fsmr yl2, yl3 + LFPDUX a10, A2, INC2 + fsmr yl3, yl4 + LFPDUX a14, A2, INC2 + fsmr yl4, yl5 + bdz .L53 + .align 4 + +.L52: + fxcpmadd ys2, alpha1, a1, yl1 + LFPDUX a1, A1, INC2 + fxcpmadd ys3, alpha1, a5, yl2 + LFPDUX a5, A1, INC2 + fxcpmadd ys4, alpha1, a9, yl3 + LFPDUX a9, A1, INC2 + fxcpmadd ys5, alpha1, a13, yl4 + LFPDUX a13, A1, INC2 + + fxcsmadd ys2, alpha1, a2, ys2 + LFPDUX a2, A2, INC2 + fxcsmadd ys3, alpha1, a6, ys3 + LFPDUX a6, A2, INC2 + fxcsmadd ys4, alpha1, a10, ys4 + LFPDUX a10, A2, INC2 + fxcsmadd ys5, alpha1, a14, ys5 + LFPDUX a14, A2, INC2 + + fmr yl1, yl5 + LFXDUX yl2, YL, INCY2 + fmr ys1, ys2 + LFXDUX yl3, YL, INCY2 + fmr ys2, ys3 + LFXDUX yl4, YL, INCY2 + fmr ys3, ys4 + LFXDUX yl5, YL, INCY2 + fmr ys4, ys5 + + STFXDUX ys1, YS, INCY2 + fsmr ys1, ys5 + STFXDUX ys2, YS, INCY2 + fsmr yl1, yl2 + STFXDUX ys3, YS, INCY2 + fsmr yl2, yl3 + STFXDUX ys4, YS, INCY2 + fsmr yl3, yl4 + + fsmr yl4, yl5 + bdnz .L52 + .align 4 + +.L53: + fxcpmadd ys2, alpha1, a1, yl1 + fxcpmadd ys3, alpha1, a5, yl2 + fxcpmadd ys4, alpha1, a9, yl3 + fxcpmadd ys5, alpha1, a13, yl4 + + fxcsmadd ys2, alpha1, a2, ys2 + fxcsmadd ys3, alpha1, a6, ys3 + fxcsmadd ys4, alpha1, a10, ys4 + fxcsmadd ys5, alpha1, a14, ys5 + + fmr yl1, yl5 + fmr ys1, ys2 + fmr ys2, ys3 + fmr ys3, ys4 + fmr ys4, ys5 + + STFXDUX ys1, YS, INCY2 + fsmr ys1, ys5 + STFXDUX ys2, YS, INCY2 + STFXDUX ys3, YS, INCY2 + STFXDUX ys4, YS, INCY2 + .align 4 + +.L55: + andi. r0, M, 7 + ble .L59 + + andi. r0, M, 4 + ble .L57 + + LFXDUX yl2, YL, INCY2 + LFXDUX yl3, YL, INCY2 + + LFPDUX a1, A1, INC2 + LFPDUX a2, A2, INC2 + + LFPDUX a5, A1, INC2 + LFPDUX a6, A2, INC2 + + fsmr yl1, yl2 + fsmr yl2, yl3 + + fxcpmadd ys2, alpha1, a1, yl1 + fxcsmadd ys2, alpha1, a2, ys2 + fxcpmadd ys3, alpha1, a5, yl2 + fxcsmadd ys3, alpha1, a6, ys3 + + fmr yl1, yl3 + fmr ys1, ys2 + fmr ys2, ys3 + + STFXDUX ys1, YS, INCY2 + STFXDUX ys2, YS, INCY2 + fsmr ys1, ys3 + .align 4 + +.L57: + andi. 
r0, M, 2 + ble .L58 + + LFXDUX yl2, YL, INCY2 + LFPDUX a1, A1, INC2 + LFPDUX a2, A2, INC2 + + fsmr yl1, yl2 + fxcpmadd ys2, alpha1, a1, yl1 + fxcsmadd ys2, alpha1, a2, ys2 + fmr yl1, yl2 + + fmr ys1, ys2 + STFXDUX ys1, YS, INCY2 + fsmr ys1, ys2 + .align 4 + +.L58: + andi. r0, M, 1 + ble .L59 + + LFDUX a1, A1, INC2 + LFDUX a2, A2, INC2 + + fxmr alpha2, alpha1 + fmadd ys1, alpha1, a1, yl1 + fmadd ys1, alpha2, a2, ys1 + + STFXDUX ys1, YS, INCY2 + b .L60 + .align 4 + +.L59: + STFSDUX ys1, YS, INCY2 + .align 4 + +.L60: + andi. J, N, 1 + ble .L999 + + LFDUX alpha1, X, INCX + mr A1, A + + mr YL, Y + sub YS, Y, INCY2 + + fmul alpha1, alpha, alpha1 + + LFSDX ys1, YS, INCY2 + LFDX yl1, YL, INCY + + srawi. r0, M, 3 + mtspr CTR, r0 + ble .L65 + + LFXDUX yl2, YL, INCY2 + LFXDUX yl3, YL, INCY2 + LFXDUX yl4, YL, INCY2 + LFXDUX yl5, YL, INCY2 + + LFPDUX a1, A1, INC2 + LFPDUX a5, A1, INC2 + LFPDUX a9, A1, INC2 + LFPDUX a13, A1, INC2 + + fsmr yl1, yl2 + fsmr yl2, yl3 + fsmr yl3, yl4 + fsmr yl4, yl5 + bdz .L63 + .align 4 + +.L62: + fxcpmadd ys2, alpha1, a1, yl1 + LFPDUX a1, A1, INC2 + fxcpmadd ys3, alpha1, a5, yl2 + LFXDUX yl2, YL, INCY2 + fxcpmadd ys4, alpha1, a9, yl3 + LFXDUX yl3, YL, INCY2 + fxcpmadd ys5, alpha1, a13, yl4 + LFXDUX yl4, YL, INCY2 + + fmr yl1, yl5 + LFXDUX yl5, YL, INCY2 + fmr ys1, ys2 + LFPDUX a5, A1, INC2 + fmr ys2, ys3 + LFPDUX a9, A1, INC2 + fmr ys3, ys4 + LFPDUX a13, A1, INC2 + fmr ys4, ys5 + + STFXDUX ys1, YS, INCY2 + fsmr ys1, ys5 + STFXDUX ys2, YS, INCY2 + fsmr yl1, yl2 + STFXDUX ys3, YS, INCY2 + fsmr yl2, yl3 + STFXDUX ys4, YS, INCY2 + fsmr yl3, yl4 + + fsmr yl4, yl5 + bdnz .L62 + .align 4 + +.L63: + fxcpmadd ys2, alpha1, a1, yl1 + fxcpmadd ys3, alpha1, a5, yl2 + fxcpmadd ys4, alpha1, a9, yl3 + fxcpmadd ys5, alpha1, a13, yl4 + + fmr yl1, yl5 + fmr ys1, ys2 + fmr ys2, ys3 + fmr ys3, ys4 + fmr ys4, ys5 + + STFXDUX ys1, YS, INCY2 + fsmr ys1, ys5 + STFXDUX ys2, YS, INCY2 + STFXDUX ys3, YS, INCY2 + STFXDUX ys4, YS, INCY2 + .align 4 + +.L65: + andi. r0, M, 7 + ble .L69 + + andi. r0, M, 4 + ble .L67 + + LFXDUX yl2, YL, INCY2 + LFXDUX yl3, YL, INCY2 + + LFPDUX a1, A1, INC2 + LFPDUX a5, A1, INC2 + + fsmr yl1, yl2 + fsmr yl2, yl3 + + fxcpmadd ys2, alpha1, a1, yl1 + fxcpmadd ys3, alpha1, a5, yl2 + + fmr yl1, yl3 + fmr ys1, ys2 + fmr ys2, ys3 + + STFXDUX ys1, YS, INCY2 + fsmr ys1, ys3 + STFXDUX ys2, YS, INCY2 + .align 4 + +.L67: + andi. r0, M, 2 + ble .L68 + + LFPDUX a1, A1, INC2 + LFXDUX yl2, YL, INCY2 + + fsmr yl1, yl2 + fxcpmadd ys2, alpha1, a1, yl1 + fmr yl1, yl2 + fmr ys1, ys2 + STFXDUX ys1, YS, INCY2 + fsmr ys1, ys2 + .align 4 + +.L68: + andi. r0, M, 1 + ble .L69 + + LFDUX a1, A1, INC2 + fmadd ys1, alpha1, a1, yl1 + STFXDUX ys1, YS, INCY2 + b .L999 + .align 4 + +.L69: + STFSDUX ys1, YS, INCY2 + b .L999 + .align 4 + +.L70: + sub A, A, INC2 + sub Y, Y, INCY + srawi. J, N, 2 + ble .L80 + .align 4 + +.L71: + LFDUX alpha1, X, INCX + mr A1, A + add A2, A, LDA + add A3, A2, LDA + LFSDUX alpha1, X, INCX + LFDUX alpha2, X, INCX + add A4, A3, LDA + add A, A4, LDA + mr YL, Y + LFSDUX alpha2, X, INCX + fpmul alpha1, alpha, alpha1 + mr YS, Y + srawi. 
r0, M, 3 + mtspr CTR, r0 + fpmul alpha2, alpha, alpha2 + ble .L75 + + LFDUX yl1, YL, INCY + LFPDUX a1, A1, INC2 + LFPDUX a5, A1, INC2 + LFPDUX a9, A1, INC2 + LFPDUX a13, A1, INC2 + LFSDUX yl1, YL, INCY + + LFDUX yl2, YL, INCY + LFPDUX a2, A2, INC2 + LFPDUX a6, A2, INC2 + LFPDUX a10, A2, INC2 + LFPDUX a14, A2, INC2 + LFSDUX yl2, YL, INCY + + LFDUX yl3, YL, INCY + LFPDUX a3, A3, INC2 + LFPDUX a7, A3, INC2 + LFPDUX a11, A3, INC2 + LFPDUX a15, A3, INC2 + LFSDUX yl3, YL, INCY + + LFDUX yl4, YL, INCY + LFPDUX a4, A4, INC2 + LFPDUX a8, A4, INC2 + LFPDUX a12, A4, INC2 + LFPDUX a16, A4, INC2 + LFSDUX yl4, YL, INCY + bdz .L73 + .align 4 + +.L72: + fxcpmadd ys1, alpha1, a1, yl1 + LFPDUX a1, A1, INC2 + LFDUX yl1, YL, INCY + fxcpmadd ys2, alpha1, a5, yl2 + LFPDUX a5, A1, INC2 + fxcpmadd ys3, alpha1, a9, yl3 + LFPDUX a9, A1, INC2 + fxcpmadd ys4, alpha1, a13, yl4 + LFPDUX a13, A1, INC2 + LFSDUX yl1, YL, INCY + + fxcsmadd ys1, alpha1, a2, ys1 + LFPDUX a2, A2, INC2 + LFDUX yl2, YL, INCY + fxcsmadd ys2, alpha1, a6, ys2 + LFPDUX a6, A2, INC2 + fxcsmadd ys3, alpha1, a10, ys3 + LFPDUX a10, A2, INC2 + fxcsmadd ys4, alpha1, a14, ys4 + LFPDUX a14, A2, INC2 + LFSDUX yl2, YL, INCY + + fxcpmadd ys1, alpha2, a3, ys1 + LFPDUX a3, A3, INC2 + LFDUX yl3, YL, INCY + fxcpmadd ys2, alpha2, a7, ys2 + LFPDUX a7, A3, INC2 + fxcpmadd ys3, alpha2, a11, ys3 + LFPDUX a11, A3, INC2 + fxcpmadd ys4, alpha2, a15, ys4 + LFPDUX a15, A3, INC2 + LFSDUX yl3, YL, INCY + + fxcsmadd ys1, alpha2, a4, ys1 + LFPDUX a4, A4, INC2 + LFDUX yl4, YL, INCY + fxcsmadd ys2, alpha2, a8, ys2 + LFPDUX a8, A4, INC2 + fxcsmadd ys3, alpha2, a12, ys3 + LFPDUX a12, A4, INC2 + fxcsmadd ys4, alpha2, a16, ys4 + LFPDUX a16, A4, INC2 + LFSDUX yl4, YL, INCY + + STFDUX ys1, YS, INCY + STFSDUX ys1, YS, INCY + STFDUX ys2, YS, INCY + STFSDUX ys2, YS, INCY + STFDUX ys3, YS, INCY + STFSDUX ys3, YS, INCY + STFDUX ys4, YS, INCY + STFSDUX ys4, YS, INCY + bdnz .L72 + .align 4 + +.L73: + fxcpmadd ys1, alpha1, a1, yl1 + fxcpmadd ys2, alpha1, a5, yl2 + fxcpmadd ys3, alpha1, a9, yl3 + fxcpmadd ys4, alpha1, a13, yl4 + + fxcsmadd ys1, alpha1, a2, ys1 + fxcsmadd ys2, alpha1, a6, ys2 + fxcsmadd ys3, alpha1, a10, ys3 + fxcsmadd ys4, alpha1, a14, ys4 + + fxcpmadd ys1, alpha2, a3, ys1 + fxcpmadd ys2, alpha2, a7, ys2 + fxcpmadd ys3, alpha2, a11, ys3 + fxcpmadd ys4, alpha2, a15, ys4 + + fxcsmadd ys1, alpha2, a4, ys1 + fxcsmadd ys2, alpha2, a8, ys2 + fxcsmadd ys3, alpha2, a12, ys3 + fxcsmadd ys4, alpha2, a16, ys4 + + STFDUX ys1, YS, INCY + STFSDUX ys1, YS, INCY + STFDUX ys2, YS, INCY + STFSDUX ys2, YS, INCY + STFDUX ys3, YS, INCY + STFSDUX ys3, YS, INCY + STFDUX ys4, YS, INCY + STFSDUX ys4, YS, INCY + .align 4 + +.L75: + andi. r0, M, 7 + ble .L79 + + andi. r0, M, 4 + ble .L77 + + LFDUX yl1, YL, INCY + LFPDUX a1, A1, INC2 + LFPDUX a5, A1, INC2 + LFSDUX yl1, YL, INCY + LFPDUX a2, A2, INC2 + LFPDUX a6, A2, INC2 + + LFDUX yl2, YL, INCY + LFPDUX a3, A3, INC2 + LFPDUX a7, A3, INC2 + LFSDUX yl2, YL, INCY + LFPDUX a4, A4, INC2 + LFPDUX a8, A4, INC2 + + fxcpmadd ys1, alpha1, a1, yl1 + fxcpmadd ys2, alpha1, a5, yl2 + fxcsmadd ys1, alpha1, a2, ys1 + fxcsmadd ys2, alpha1, a6, ys2 + + fxcpmadd ys1, alpha2, a3, ys1 + fxcpmadd ys2, alpha2, a7, ys2 + fxcsmadd ys1, alpha2, a4, ys1 + fxcsmadd ys2, alpha2, a8, ys2 + + STFDUX ys1, YS, INCY + STFSDUX ys1, YS, INCY + STFDUX ys2, YS, INCY + STFSDUX ys2, YS, INCY + .align 4 + +.L77: + andi. 
r0, M, 2 + ble .L78 + + LFDUX yl1, YL, INCY + LFPDUX a1, A1, INC2 + LFPDUX a2, A2, INC2 + LFSDUX yl1, YL, INCY + LFPDUX a3, A3, INC2 + LFPDUX a4, A4, INC2 + + fxcpmadd ys1, alpha1, a1, yl1 + fxcsmadd ys1, alpha1, a2, ys1 + fxcpmadd ys1, alpha2, a3, ys1 + fxcsmadd ys1, alpha2, a4, ys1 + + STFDUX ys1, YS, INCY + STFSDUX ys1, YS, INCY + .align 4 + +.L78: + andi. r0, M, 1 + ble .L79 + + LFDUX yl1, YL, INCY + + LFDUX a1, A1, INC2 + LFDUX a2, A2, INC2 + LFDUX a3, A3, INC2 + LFDUX a4, A4, INC2 + + fxcpmadd ys1, alpha1, a1, yl1 + fxcsmadd ys1, alpha1, a2, ys1 + fxcpmadd ys1, alpha2, a3, ys1 + fxcsmadd ys1, alpha2, a4, ys1 + + STFDUX ys1, YS, INCY + .align 4 + +.L79: + addi J, J, -1 + cmpi cr0, 0, J, 0 + bgt .L71 + .align 4 + +.L80: + andi. J, N, 2 + ble .L90 + + LFDUX alpha1, X, INCX + + mr A1, A + add A2, A, LDA + add A, A2, LDA + LFSDUX alpha1, X, INCX + + mr YL, Y + mr YS, Y + fpmul alpha1, alpha, alpha1 + + srawi. r0, M, 3 + mtspr CTR, r0 + ble .L85 + + LFDUX yl1, YL, INCY + LFDUX a9, YL, INCY + LFDUX yl2, YL, INCY + LFDUX a10, YL, INCY + + LFPDUX a1, A1, INC2 + LFPDUX a5, A1, INC2 + LFPDUX a3, A1, INC2 + LFPDUX a7, A1, INC2 + + LFDUX yl3, YL, INCY + LFDUX a11, YL, INCY + LFDUX yl4, YL, INCY + LFDUX a12, YL, INCY + + LFPDUX a2, A2, INC2 + LFPDUX a6, A2, INC2 + LFPDUX a4, A2, INC2 + LFPDUX a8, A2, INC2 + + bdz .L83 + .align 4 + +.L82: + fsmfp yl1, a9 + fsmfp yl2, a10 + fsmfp yl3, a11 + fsmfp yl4, a12 + + fxcpmadd ys1, alpha1, a1, yl1 + LFDUX yl1, YL, INCY + LFDUX a9, YL, INCY + LFPDUX a1, A1, INC2 + fxcpmadd ys2, alpha1, a5, yl2 + LFDUX yl2, YL, INCY + LFDUX a10, YL, INCY + LFPDUX a5, A1, INC2 + fxcpmadd ys3, alpha1, a3, yl3 + LFDUX yl3, YL, INCY + LFDUX a11, YL, INCY + LFPDUX a3, A1, INC2 + fxcpmadd ys4, alpha1, a7, yl4 + LFDUX yl4, YL, INCY + LFDUX a12, YL, INCY + LFPDUX a7, A1, INC2 + + fxcsmadd ys1, alpha1, a2, ys1 + LFPDUX a2, A2, INC2 + fxcsmadd ys2, alpha1, a6, ys2 + LFPDUX a6, A2, INC2 + fxcsmadd ys3, alpha1, a4, ys3 + LFPDUX a4, A2, INC2 + fxcsmadd ys4, alpha1, a8, ys4 + LFPDUX a8, A2, INC2 + + STFDUX ys1, YS, INCY + STFSDUX ys1, YS, INCY + STFDUX ys2, YS, INCY + STFSDUX ys2, YS, INCY + + STFDUX ys3, YS, INCY + STFSDUX ys3, YS, INCY + STFDUX ys4, YS, INCY + STFSDUX ys4, YS, INCY + bdnz .L82 + .align 4 + +.L83: + fsmfp yl1, a9 + fsmfp yl2, a10 + fsmfp yl3, a11 + fsmfp yl4, a12 + + fxcpmadd ys1, alpha1, a1, yl1 + fxcpmadd ys2, alpha1, a5, yl2 + fxcpmadd ys3, alpha1, a3, yl3 + fxcpmadd ys4, alpha1, a7, yl4 + + fxcsmadd ys1, alpha1, a2, ys1 + fxcsmadd ys2, alpha1, a6, ys2 + fxcsmadd ys3, alpha1, a4, ys3 + fxcsmadd ys4, alpha1, a8, ys4 + + STFDUX ys1, YS, INCY + STFSDUX ys1, YS, INCY + STFDUX ys2, YS, INCY + STFSDUX ys2, YS, INCY + STFDUX ys3, YS, INCY + STFSDUX ys3, YS, INCY + STFDUX ys4, YS, INCY + STFSDUX ys4, YS, INCY + .align 4 + +.L85: + andi. r0, M, 7 + ble .L90 + + andi. r0, M, 4 + ble .L87 + + LFDUX yl1, YL, INCY + LFPDUX a1, A1, INC2 + LFPDUX a2, A2, INC2 + LFSDUX yl1, YL, INCY + LFDUX yl2, YL, INCY + LFPDUX a5, A1, INC2 + LFPDUX a6, A2, INC2 + LFSDUX yl2, YL, INCY + + fxcpmadd ys1, alpha1, a1, yl1 + fxcpmadd ys2, alpha1, a5, yl2 + fxcsmadd ys1, alpha1, a2, ys1 + fxcsmadd ys2, alpha1, a6, ys2 + + STFDUX ys1, YS, INCY + STFSDUX ys1, YS, INCY + STFDUX ys2, YS, INCY + STFSDUX ys2, YS, INCY + .align 4 + +.L87: + andi. r0, M, 2 + ble .L88 + + LFDUX yl1, YL, INCY + LFPDUX a1, A1, INC2 + LFPDUX a2, A2, INC2 + LFSDUX yl1, YL, INCY + + fxcpmadd ys1, alpha1, a1, yl1 + fxcsmadd ys1, alpha1, a2, ys1 + + STFDUX ys1, YS, INCY + STFSDUX ys1, YS, INCY + .align 4 + +.L88: + andi. 
r0, M, 1 + ble .L90 + + LFDUX yl1, YL, INCY + LFDUX a1, A1, INC2 + LFDUX a2, A2, INC2 + + fxcpmadd ys1, alpha1, a1, yl1 + fxcsmadd ys1, alpha1, a2, ys1 + + STFDUX ys1, YS, INCY + .align 4 + +.L90: + andi. J, N, 1 + ble .L999 + + LFDUX alpha1, X, INCX + + mr A1, A + mr YL, Y + mr YS, Y + fmul alpha1, alpha, alpha1 + + srawi. r0, M, 3 + mtspr CTR, r0 + ble .L95 + + LFDUX yl1, YL, INCY + LFSDUX a2, YL, INCY + LFDUX yl2, YL, INCY + LFSDUX a4, YL, INCY + LFDUX yl3, YL, INCY + LFSDUX a6, YL, INCY + LFDUX yl4, YL, INCY + LFSDUX a8, YL, INCY + + LFPDUX a1, A1, INC2 + LFPDUX a5, A1, INC2 + LFPDUX a9, A1, INC2 + LFPDUX a13, A1, INC2 + bdz .L93 + .align 4 + +.L92: + fmr a2, yl1 + fmr a4, yl2 + fmr a6, yl3 + fmr a8, yl4 + + fxcpmadd ys1, alpha1, a1, a2 + LFDUX yl1, YL, INCY + LFSDUX a2, YL, INCY + fxcpmadd ys2, alpha1, a5, a4 + LFDUX yl2, YL, INCY + LFSDUX a4, YL, INCY + fxcpmadd ys3, alpha1, a9, a6 + LFDUX yl3, YL, INCY + LFSDUX a6, YL, INCY + fxcpmadd ys4, alpha1, a13, a8 + LFDUX yl4, YL, INCY + LFSDUX a8, YL, INCY + + LFPDUX a1, A1, INC2 + LFPDUX a5, A1, INC2 + LFPDUX a9, A1, INC2 + LFPDUX a13, A1, INC2 + + STFDUX ys1, YS, INCY + STFSDUX ys1, YS, INCY + STFDUX ys2, YS, INCY + STFSDUX ys2, YS, INCY + STFDUX ys3, YS, INCY + STFSDUX ys3, YS, INCY + STFDUX ys4, YS, INCY + STFSDUX ys4, YS, INCY + bdnz .L92 + .align 4 + +.L93: + fmr a2, yl1 + fmr a4, yl2 + fmr a6, yl3 + fmr a8, yl4 + + fxcpmadd ys1, alpha1, a1, a2 + fxcpmadd ys2, alpha1, a5, a4 + fxcpmadd ys3, alpha1, a9, a6 + fxcpmadd ys4, alpha1, a13, a8 + + STFDUX ys1, YS, INCY + STFSDUX ys1, YS, INCY + STFDUX ys2, YS, INCY + STFSDUX ys2, YS, INCY + STFDUX ys3, YS, INCY + STFSDUX ys3, YS, INCY + STFDUX ys4, YS, INCY + STFSDUX ys4, YS, INCY + .align 4 + +.L95: + andi. r0, M, 7 + ble .L999 + + andi. r0, M, 4 + ble .L97 + + LFPDUX a1, A1, INC2 + LFDUX yl1, YL, INCY + LFDUX yl2, YL, INCY + LFPDUX a2, A1, INC2 + LFDUX yl3, YL, INCY + LFDUX yl4, YL, INCY + + fxcpmadd ys1, a1, alpha1, yl1 + fxcsmadd ys2, a1, alpha1, yl2 + fxcpmadd ys3, a2, alpha1, yl3 + fxcsmadd ys4, a2, alpha1, yl4 + + STFDUX ys1, YS, INCY + STFDUX ys2, YS, INCY + STFDUX ys3, YS, INCY + STFDUX ys4, YS, INCY + .align 4 + +.L97: + andi. r0, M, 2 + ble .L98 + + LFPDUX a1, A1, INC2 + LFDUX yl1, YL, INCY + LFDUX yl2, YL, INCY + + fxcpmadd ys1, a1, alpha1, yl1 + fxcsmadd ys2, a1, alpha1, yl2 + + STFDUX ys1, YS, INCY + STFDUX ys2, YS, INCY + .align 4 + +.L98: + andi. 
r0, M, 1 + ble .L999 + + LFDUX yl1, YL, INCY + LFDUX a1, A1, INC2 + + fxcpmadd ys1, alpha1, a1, yl1 + + STFDUX ys1, YS, INCY + b .L999 + .align 4 + + +.L999: + addi SP, SP, -4 + + lwzu r16, 4(SP) + lwzu r17, 4(SP) + lwzu r18, 4(SP) + lwzu r19, 4(SP) + + lwzu r20, 4(SP) + lwzu r21, 4(SP) + lwzu r22, 4(SP) + lwzu r23, 4(SP) + + lwzu r24, 4(SP) + lwzu r25, 4(SP) + lwzu r26, 4(SP) + lwzu r27, 4(SP) + + lwzu r28, 4(SP) + lwzu r29, 4(SP) + lwzu r30, 4(SP) + lwzu r31, 4(SP) + + subi SP, SP, 12 + li r0, 16 + + lfpdux f31, SP, r0 + lfpdux f30, SP, r0 + lfpdux f29, SP, r0 + lfpdux f28, SP, r0 + lfpdux f27, SP, r0 + lfpdux f26, SP, r0 + lfpdux f25, SP, r0 + lfpdux f24, SP, r0 + lfpdux f23, SP, r0 + lfpdux f22, SP, r0 + lfpdux f21, SP, r0 + lfpdux f20, SP, r0 + lfpdux f19, SP, r0 + lfpdux f18, SP, r0 + lfpdux f17, SP, r0 + lfpdux f16, SP, r0 + lfpdux f15, SP, r0 + lfpdux f14, SP, r0 + addi SP, SP, 16 + blr + + EPILOGUE diff --git a/kernel/power/gemv_n.S b/kernel/power/gemv_n.S new file mode 100644 index 0000000000..b66caa75c0 --- /dev/null +++ b/kernel/power/gemv_n.S @@ -0,0 +1,3090 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef linux +#ifndef __64BIT__ +#define M r3 +#define N r4 +#define A r6 +#define LDA r7 +#define X r8 +#define INCX r9 +#define Y r10 +#define INCY r5 +#else +#define M r3 +#define N r4 +#define A r7 +#define LDA r8 +#define X r9 +#define INCX r10 +#define Y r5 +#define INCY r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define M r3 +#define N r4 +#define A r8 +#define LDA r9 +#define X r10 +#define INCX r5 +#define Y r6 +#define INCY r7 +#else +#define M r3 +#define N r4 +#define A r7 +#define LDA r8 +#define X r9 +#define INCX r10 +#define Y r5 +#define INCY r6 +#endif +#endif + +#define I r11 +#define J r12 + +#define AO1 r14 +#define AO2 r15 +#define AO3 r16 +#define AO4 r17 +#define AO5 r18 +#define AO6 r19 +#define AO7 r20 +#define AO8 r21 +#define LDA8 r22 + +#define Y1 r23 +#define PREA r24 +#define PREC r25 +#define YY r26 +#define BUFFER r27 + +#define y01 f0 +#define y02 f1 +#define y03 f2 +#define y04 f3 +#define y05 f4 +#define y06 f5 +#define y07 f6 +#define y08 f7 +#define y09 f8 +#define y10 f9 +#define y11 f10 +#define y12 f11 +#define y13 f12 +#define y14 f13 +#define y15 f14 +#define y16 f15 + +#define alpha1 f16 +#define alpha2 f17 +#define alpha3 f18 +#define alpha4 f19 +#define alpha5 f20 +#define alpha6 f21 +#define alpha7 f22 +#define alpha8 f23 + +#define a1 f24 +#define a2 f25 +#define a3 f26 +#define a4 f27 +#define a5 f28 +#define a6 f29 +#define a7 f30 +#define a8 f31 + +#define alpha f31 + +#if defined(PPCG4) +#define PREFETCHSIZE_A 24 +#define PREFETCHSIZE_C 16 +#endif + +#if defined(PPC440) || defined(PPC440FP2) +#define PREFETCHSIZE_A 24 +#define PREFETCHSIZE_C 16 +#endif + +#ifdef PPC970 +#define PREFETCHSIZE_A 16 +#define PREFETCHSIZE_C 16 +#endif + +#ifdef CELL +#define PREFETCHSIZE_A 16 +#define PREFETCHSIZE_C 16 +#endif + +#ifdef POWER4 +#define PREFETCHSIZE_A 16 +#define PREFETCHSIZE_C 16 +#endif + +#ifdef POWER5 +#define PREFETCHSIZE_A 40 +#define PREFETCHSIZE_C 24 +#endif + +#ifdef POWER6 +#define PREFETCHSIZE_A 96 +#define PREFETCHSIZE_C 40 +#endif + +#ifndef NEEDPARAM + +#ifndef __64BIT__ +#define STACKSIZE 224 +#define ALPHA 200(SP) +#define FZERO 208(SP) +#else +#define STACKSIZE 280 +#define ALPHA 256(SP) +#define FZERO 264(SP) +#endif + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r0, FZERO + std r14, 144(SP) + std r15, 152(SP) + std r16, 160(SP) + std r17, 168(SP) + std r18, 176(SP) + std r19, 184(SP) + std r20, 192(SP) + std r21, 200(SP) + std r22, 208(SP) + std r23, 216(SP) + std r24, 224(SP) + std r25, 232(SP) + std r26, 240(SP) + std r27, 248(SP) +#else + stw r0, 0 + FZERO + stw r0, 4 + FZERO + stw r14, 144(SP) + stw r15, 148(SP) + stw r16, 152(SP) + stw r17, 156(SP) + stw r18, 160(SP) + stw r19, 164(SP) + stw r20, 168(SP) + stw r21, 172(SP) + stw r22, 176(SP) + stw r23, 180(SP) + stw r24, 184(SP) + stw r25, 188(SP) + stw r26, 192(SP) + stw r27, 196(SP) +#endif + +#ifdef linux +#ifndef __64BIT__ + lwz INCY, 8 + STACKSIZE(SP) + lwz BUFFER, 12 + STACKSIZE(SP) +#else + ld Y, 112 
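For orientation: the gemv_n.S kernel introduced here computes the column-major update y := y + alpha*A*x, with the column loop unrolled by eight (alpha1..alpha8 hold alpha*x[j..j+7]) and the row loop by sixteen, plus DCBT prefetch hints on the A columns and on Y. The plain-C sketch below is only an editorial reference for that semantics, not part of the imported source; the name gemv_n_ref and its argument order are illustrative.

    /* Reference semantics for the gemv_n kernel (editorial sketch).
       A is column-major with leading dimension lda; the assembly blocks
       j by 8 and i by 16, while this sketch keeps the plain loop nest. */
    static void gemv_n_ref(int m, int n, double alpha,
                           const double *a, int lda,
                           const double *x, int incx,
                           double *y, int incy)
    {
        for (int j = 0; j < n; j++) {
            double t = alpha * x[j * incx];       /* alpha1..alpha8 in the kernel */
            for (int i = 0; i < m; i++)
                y[i * incy] += t * a[i + j * lda]; /* FMADD yNN, alphaJ, aNN, yNN */
        }
    }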
+ STACKSIZE(SP) + ld INCY, 120 + STACKSIZE(SP) + ld BUFFER, 128 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifndef __64BIT__ +#ifdef DOUBLE + lwz INCX, 56 + STACKSIZE(SP) + lwz Y, 60 + STACKSIZE(SP) + lwz INCY, 64 + STACKSIZE(SP) + lwz BUFFER, 68 + STACKSIZE(SP) +#else + lwz Y, 56 + STACKSIZE(SP) + lwz INCY, 60 + STACKSIZE(SP) + lwz BUFFER, 64 + STACKSIZE(SP) +#endif +#else + ld Y, 112 + STACKSIZE(SP) + ld INCY, 120 + STACKSIZE(SP) + ld BUFFER, 128 + STACKSIZE(SP) +#endif +#endif + + stfd f1, ALPHA + fmr alpha, f1 + + slwi LDA8, LDA, BASE_SHIFT + 3 + slwi LDA, LDA, BASE_SHIFT + slwi INCX, INCX, BASE_SHIFT + slwi INCY, INCY, BASE_SHIFT + + li PREA, PREFETCHSIZE_A * SIZE + li PREC, PREFETCHSIZE_C * SIZE + + cmpwi cr0, M, 0 + ble- LL(999) + + cmpwi cr0, N, 0 + ble- LL(999) + + mr YY, Y + lfd f0, FZERO + + cmpi cr0, 0, INCY, SIZE + beq LL(10) + + mr YY, BUFFER + mr Y1, BUFFER + + addi r0, M, 7 + srawi. r0, r0, 3 + mtspr CTR, r0 + .align 4 + +LL(02): + STFD f0, 0 * SIZE(Y1) + STFD f0, 1 * SIZE(Y1) + STFD f0, 2 * SIZE(Y1) + STFD f0, 3 * SIZE(Y1) + STFD f0, 4 * SIZE(Y1) + STFD f0, 5 * SIZE(Y1) + STFD f0, 6 * SIZE(Y1) + STFD f0, 7 * SIZE(Y1) + addi Y1, Y1, 8 * SIZE + bdnz LL(02) + .align 4 + +LL(10): + srawi. J, N, 3 + ble LL(20) + .align 4 + +LL(11): + LFD alpha1, 0 * SIZE(X) + add X, X, INCX + LFD alpha2, 0 * SIZE(X) + add X, X, INCX + LFD alpha3, 0 * SIZE(X) + add X, X, INCX + LFD alpha4, 0 * SIZE(X) + add X, X, INCX + LFD alpha5, 0 * SIZE(X) + add X, X, INCX + LFD alpha6, 0 * SIZE(X) + add X, X, INCX + LFD alpha7, 0 * SIZE(X) + add X, X, INCX + LFD alpha8, 0 * SIZE(X) + add X, X, INCX + + FMUL alpha1, alpha, alpha1 + FMUL alpha2, alpha, alpha2 + FMUL alpha3, alpha, alpha3 + FMUL alpha4, alpha, alpha4 + FMUL alpha5, alpha, alpha5 + FMUL alpha6, alpha, alpha6 + FMUL alpha7, alpha, alpha7 + FMUL alpha8, alpha, alpha8 + + mr AO1, A + add AO2, A, LDA + add AO3, AO2, LDA + add AO4, AO3, LDA + add AO5, AO4, LDA + add AO6, AO5, LDA + add AO7, AO6, LDA + add AO8, AO7, LDA + add A, AO8, LDA + + mr Y1, YY + + srawi. 
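The LL(02) loop above zeroes BUFFER in groups of eight and points YY at it whenever INCY is not unit stride, so the column loops can accumulate into contiguous storage; the strided y is then updated from the buffer later in the file, past the end of this hunk. A hedged C rendering of that strategy, reusing gemv_n_ref from the previous sketch and using a hypothetical gemv_n_buffered name, might look like:

    /* Editorial sketch of the BUFFER strategy (names hypothetical).
       buffer must hold at least ((m + 7) & ~7) doubles of scratch. */
    static void gemv_n_buffered(int m, int n, double alpha,
                                const double *a, int lda,
                                const double *x, int incx,
                                double *y, int incy,
                                double *buffer)
    {
        double *yy = y;
        if (incy != 1) {                      /* mirrors the cmpi INCY, SIZE test */
            for (int i = 0; i < ((m + 7) & ~7); i++)
                buffer[i] = 0.0;              /* LL(02): zero in groups of eight  */
            yy = buffer;                      /* compute loops store contiguously */
        }

        gemv_n_ref(m, n, alpha, a, lda, x, incx, yy, 1);  /* stand-in for LL(11)+ */

        if (incy != 1)
            for (int i = 0; i < m; i++)
                y[i * incy] += buffer[i];     /* fold the buffer back into the
                                                 strided y; in the real file this
                                                 happens after the compute loops,
                                                 beyond the end of this hunk      */
    }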
r0, M, 4 + mtspr CTR, r0 + ble LL(15) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + LFD y03, 2 * SIZE(Y1) + LFD y04, 3 * SIZE(Y1) + LFD y05, 4 * SIZE(Y1) + LFD y06, 5 * SIZE(Y1) + LFD y07, 6 * SIZE(Y1) + LFD y08, 7 * SIZE(Y1) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + LFD y09, 8 * SIZE(Y1) + LFD y10, 9 * SIZE(Y1) + LFD y11, 10 * SIZE(Y1) + LFD y12, 11 * SIZE(Y1) + LFD y13, 12 * SIZE(Y1) + LFD y14, 13 * SIZE(Y1) + LFD y15, 14 * SIZE(Y1) + LFD y16, 15 * SIZE(Y1) + + FMADD y01, alpha1, a1, y01 + FMADD y02, alpha1, a2, y02 + FMADD y03, alpha1, a3, y03 + FMADD y04, alpha1, a4, y04 + + LFD a1, 8 * SIZE(AO1) + LFD a2, 9 * SIZE(AO1) + LFD a3, 10 * SIZE(AO1) + LFD a4, 11 * SIZE(AO1) + + FMADD y05, alpha1, a5, y05 + FMADD y06, alpha1, a6, y06 + FMADD y07, alpha1, a7, y07 + FMADD y08, alpha1, a8, y08 + + LFD a5, 12 * SIZE(AO1) + LFD a6, 13 * SIZE(AO1) + LFD a7, 14 * SIZE(AO1) + LFD a8, 15 * SIZE(AO1) + + addi AO1, AO1, 16 * SIZE + nop + nop + DCBT(AO1, PREA) + + FMADD y09, alpha1, a1, y09 + FMADD y10, alpha1, a2, y10 + FMADD y11, alpha1, a3, y11 + FMADD y12, alpha1, a4, y12 + + LFD a1, 0 * SIZE(AO2) + LFD a2, 1 * SIZE(AO2) + LFD a3, 2 * SIZE(AO2) + LFD a4, 3 * SIZE(AO2) + + FMADD y13, alpha1, a5, y13 + FMADD y14, alpha1, a6, y14 + FMADD y15, alpha1, a7, y15 + FMADD y16, alpha1, a8, y16 + + LFD a5, 4 * SIZE(AO2) + LFD a6, 5 * SIZE(AO2) + LFD a7, 6 * SIZE(AO2) + LFD a8, 7 * SIZE(AO2) + + FMADD y01, alpha2, a1, y01 + FMADD y02, alpha2, a2, y02 + FMADD y03, alpha2, a3, y03 + FMADD y04, alpha2, a4, y04 + + LFD a1, 8 * SIZE(AO2) + LFD a2, 9 * SIZE(AO2) + LFD a3, 10 * SIZE(AO2) + LFD a4, 11 * SIZE(AO2) + + FMADD y05, alpha2, a5, y05 + FMADD y06, alpha2, a6, y06 + FMADD y07, alpha2, a7, y07 + FMADD y08, alpha2, a8, y08 + + LFD a5, 12 * SIZE(AO2) + LFD a6, 13 * SIZE(AO2) + LFD a7, 14 * SIZE(AO2) + LFD a8, 15 * SIZE(AO2) + + addi AO2, AO2, 16 * SIZE + nop + nop + DCBT(AO2, PREA) + + FMADD y09, alpha2, a1, y09 + FMADD y10, alpha2, a2, y10 + FMADD y11, alpha2, a3, y11 + FMADD y12, alpha2, a4, y12 + + LFD a1, 0 * SIZE(AO3) + LFD a2, 1 * SIZE(AO3) + LFD a3, 2 * SIZE(AO3) + LFD a4, 3 * SIZE(AO3) + + FMADD y13, alpha2, a5, y13 + FMADD y14, alpha2, a6, y14 + FMADD y15, alpha2, a7, y15 + FMADD y16, alpha2, a8, y16 + + LFD a5, 4 * SIZE(AO3) + LFD a6, 5 * SIZE(AO3) + LFD a7, 6 * SIZE(AO3) + LFD a8, 7 * SIZE(AO3) + + FMADD y01, alpha3, a1, y01 + FMADD y02, alpha3, a2, y02 + FMADD y03, alpha3, a3, y03 + FMADD y04, alpha3, a4, y04 + + LFD a1, 8 * SIZE(AO3) + LFD a2, 9 * SIZE(AO3) + LFD a3, 10 * SIZE(AO3) + LFD a4, 11 * SIZE(AO3) + + FMADD y05, alpha3, a5, y05 + FMADD y06, alpha3, a6, y06 + FMADD y07, alpha3, a7, y07 + FMADD y08, alpha3, a8, y08 + + LFD a5, 12 * SIZE(AO3) + LFD a6, 13 * SIZE(AO3) + LFD a7, 14 * SIZE(AO3) + LFD a8, 15 * SIZE(AO3) + + addi AO3, AO3, 16 * SIZE + nop + nop + DCBT(AO3, PREA) + + FMADD y09, alpha3, a1, y09 + FMADD y10, alpha3, a2, y10 + FMADD y11, alpha3, a3, y11 + FMADD y12, alpha3, a4, y12 + + LFD a1, 0 * SIZE(AO4) + LFD a2, 1 * SIZE(AO4) + LFD a3, 2 * SIZE(AO4) + LFD a4, 3 * SIZE(AO4) + + FMADD y13, alpha3, a5, y13 + FMADD y14, alpha3, a6, y14 + FMADD y15, alpha3, a7, y15 + FMADD y16, alpha3, a8, y16 + + LFD a5, 4 * SIZE(AO4) + LFD a6, 5 * SIZE(AO4) + LFD a7, 6 * SIZE(AO4) + LFD a8, 7 * SIZE(AO4) + + FMADD y01, alpha4, a1, y01 + FMADD y02, alpha4, a2, y02 + FMADD y03, alpha4, a3, y03 + FMADD y04, alpha4, a4, y04 + + LFD a1, 8 * 
SIZE(AO4) + LFD a2, 9 * SIZE(AO4) + LFD a3, 10 * SIZE(AO4) + LFD a4, 11 * SIZE(AO4) + + FMADD y05, alpha4, a5, y05 + FMADD y06, alpha4, a6, y06 + FMADD y07, alpha4, a7, y07 + FMADD y08, alpha4, a8, y08 + + LFD a5, 12 * SIZE(AO4) + LFD a6, 13 * SIZE(AO4) + LFD a7, 14 * SIZE(AO4) + LFD a8, 15 * SIZE(AO4) + + addi AO4, AO4, 16 * SIZE + nop + nop + DCBT(AO4, PREA) + + FMADD y09, alpha4, a1, y09 + FMADD y10, alpha4, a2, y10 + FMADD y11, alpha4, a3, y11 + FMADD y12, alpha4, a4, y12 + + LFD a1, 0 * SIZE(AO5) + LFD a2, 1 * SIZE(AO5) + LFD a3, 2 * SIZE(AO5) + LFD a4, 3 * SIZE(AO5) + + FMADD y13, alpha4, a5, y13 + FMADD y14, alpha4, a6, y14 + FMADD y15, alpha4, a7, y15 + FMADD y16, alpha4, a8, y16 + + LFD a5, 4 * SIZE(AO5) + LFD a6, 5 * SIZE(AO5) + LFD a7, 6 * SIZE(AO5) + LFD a8, 7 * SIZE(AO5) + + FMADD y01, alpha5, a1, y01 + FMADD y02, alpha5, a2, y02 + FMADD y03, alpha5, a3, y03 + FMADD y04, alpha5, a4, y04 + + LFD a1, 8 * SIZE(AO5) + LFD a2, 9 * SIZE(AO5) + LFD a3, 10 * SIZE(AO5) + LFD a4, 11 * SIZE(AO5) + + FMADD y05, alpha5, a5, y05 + FMADD y06, alpha5, a6, y06 + FMADD y07, alpha5, a7, y07 + FMADD y08, alpha5, a8, y08 + + LFD a5, 12 * SIZE(AO5) + LFD a6, 13 * SIZE(AO5) + LFD a7, 14 * SIZE(AO5) + LFD a8, 15 * SIZE(AO5) + + addi AO5, AO5, 16 * SIZE + nop + nop + DCBT(AO5, PREA) + + FMADD y09, alpha5, a1, y09 + FMADD y10, alpha5, a2, y10 + FMADD y11, alpha5, a3, y11 + FMADD y12, alpha5, a4, y12 + + LFD a1, 0 * SIZE(AO6) + LFD a2, 1 * SIZE(AO6) + LFD a3, 2 * SIZE(AO6) + LFD a4, 3 * SIZE(AO6) + + FMADD y13, alpha5, a5, y13 + FMADD y14, alpha5, a6, y14 + FMADD y15, alpha5, a7, y15 + FMADD y16, alpha5, a8, y16 + + LFD a5, 4 * SIZE(AO6) + LFD a6, 5 * SIZE(AO6) + LFD a7, 6 * SIZE(AO6) + LFD a8, 7 * SIZE(AO6) + + FMADD y01, alpha6, a1, y01 + FMADD y02, alpha6, a2, y02 + FMADD y03, alpha6, a3, y03 + FMADD y04, alpha6, a4, y04 + + LFD a1, 8 * SIZE(AO6) + LFD a2, 9 * SIZE(AO6) + LFD a3, 10 * SIZE(AO6) + LFD a4, 11 * SIZE(AO6) + + FMADD y05, alpha6, a5, y05 + FMADD y06, alpha6, a6, y06 + FMADD y07, alpha6, a7, y07 + FMADD y08, alpha6, a8, y08 + + LFD a5, 12 * SIZE(AO6) + LFD a6, 13 * SIZE(AO6) + LFD a7, 14 * SIZE(AO6) + LFD a8, 15 * SIZE(AO6) + + addi AO6, AO6, 16 * SIZE + nop + nop + DCBT(AO6, PREA) + + FMADD y09, alpha6, a1, y09 + FMADD y10, alpha6, a2, y10 + FMADD y11, alpha6, a3, y11 + FMADD y12, alpha6, a4, y12 + + LFD a1, 0 * SIZE(AO7) + LFD a2, 1 * SIZE(AO7) + LFD a3, 2 * SIZE(AO7) + LFD a4, 3 * SIZE(AO7) + + FMADD y13, alpha6, a5, y13 + FMADD y14, alpha6, a6, y14 + FMADD y15, alpha6, a7, y15 + FMADD y16, alpha6, a8, y16 + + LFD a5, 4 * SIZE(AO7) + LFD a6, 5 * SIZE(AO7) + LFD a7, 6 * SIZE(AO7) + LFD a8, 7 * SIZE(AO7) + + FMADD y01, alpha7, a1, y01 + FMADD y02, alpha7, a2, y02 + FMADD y03, alpha7, a3, y03 + FMADD y04, alpha7, a4, y04 + + LFD a1, 8 * SIZE(AO7) + LFD a2, 9 * SIZE(AO7) + LFD a3, 10 * SIZE(AO7) + LFD a4, 11 * SIZE(AO7) + + FMADD y05, alpha7, a5, y05 + FMADD y06, alpha7, a6, y06 + FMADD y07, alpha7, a7, y07 + FMADD y08, alpha7, a8, y08 + + LFD a5, 12 * SIZE(AO7) + LFD a6, 13 * SIZE(AO7) + LFD a7, 14 * SIZE(AO7) + LFD a8, 15 * SIZE(AO7) + + addi AO7, AO7, 16 * SIZE + nop + nop + DCBT(AO7, PREA) + + FMADD y09, alpha7, a1, y09 + FMADD y10, alpha7, a2, y10 + FMADD y11, alpha7, a3, y11 + FMADD y12, alpha7, a4, y12 + + LFD a1, 0 * SIZE(AO8) + LFD a2, 1 * SIZE(AO8) + LFD a3, 2 * SIZE(AO8) + LFD a4, 3 * SIZE(AO8) + + FMADD y13, alpha7, a5, y13 + FMADD y14, alpha7, a6, y14 + FMADD y15, alpha7, a7, y15 + FMADD y16, alpha7, a8, y16 + + LFD a5, 4 * SIZE(AO8) + LFD a6, 5 * SIZE(AO8) + LFD a7, 6 * 
SIZE(AO8) + LFD a8, 7 * SIZE(AO8) + + FMADD y01, alpha8, a1, y01 + FMADD y02, alpha8, a2, y02 + FMADD y03, alpha8, a3, y03 + FMADD y04, alpha8, a4, y04 + + LFD a1, 8 * SIZE(AO8) + LFD a2, 9 * SIZE(AO8) + LFD a3, 10 * SIZE(AO8) + LFD a4, 11 * SIZE(AO8) + + FMADD y05, alpha8, a5, y05 + FMADD y06, alpha8, a6, y06 + FMADD y07, alpha8, a7, y07 + FMADD y08, alpha8, a8, y08 + + LFD a5, 12 * SIZE(AO8) + LFD a6, 13 * SIZE(AO8) + LFD a7, 14 * SIZE(AO8) + LFD a8, 15 * SIZE(AO8) + + addi AO8, AO8, 16 * SIZE + nop + nop + DCBT(AO8, PREA) + + FMADD y09, alpha8, a1, y09 + FMADD y10, alpha8, a2, y10 + FMADD y11, alpha8, a3, y11 + FMADD y12, alpha8, a4, y12 + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + + FMADD y13, alpha8, a5, y13 + FMADD y14, alpha8, a6, y14 + FMADD y15, alpha8, a7, y15 + FMADD y16, alpha8, a8, y16 + + LFD a5, 4 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + STFD y01, 0 * SIZE(Y1) + STFD y02, 1 * SIZE(Y1) + STFD y03, 2 * SIZE(Y1) + STFD y04, 3 * SIZE(Y1) + + LFD y01, 16 * SIZE(Y1) + LFD y02, 17 * SIZE(Y1) + LFD y03, 18 * SIZE(Y1) + LFD y04, 19 * SIZE(Y1) + + DCBT(Y1, PREC) + bdz LL(13) + .align 4 + +LL(12): + FMADD y01, alpha1, a1, y01 + FMADD y02, alpha1, a2, y02 + FMADD y03, alpha1, a3, y03 + FMADD y04, alpha1, a4, y04 + + LFD a1, 8 * SIZE(AO1) + LFD a2, 9 * SIZE(AO1) + LFD a3, 10 * SIZE(AO1) + LFD a4, 11 * SIZE(AO1) + + STFD y05, 4 * SIZE(Y1) + STFD y06, 5 * SIZE(Y1) + STFD y07, 6 * SIZE(Y1) + STFD y08, 7 * SIZE(Y1) + + LFD y05, 20 * SIZE(Y1) + LFD y06, 21 * SIZE(Y1) + LFD y07, 22 * SIZE(Y1) + LFD y08, 23 * SIZE(Y1) + + FMADD y05, alpha1, a5, y05 + FMADD y06, alpha1, a6, y06 + FMADD y07, alpha1, a7, y07 + FMADD y08, alpha1, a8, y08 + + LFD a5, 12 * SIZE(AO1) + LFD a6, 13 * SIZE(AO1) + LFD a7, 14 * SIZE(AO1) + LFD a8, 15 * SIZE(AO1) + + STFD y09, 8 * SIZE(Y1) + STFD y10, 9 * SIZE(Y1) + STFD y11, 10 * SIZE(Y1) + STFD y12, 11 * SIZE(Y1) + + LFD y09, 24 * SIZE(Y1) + LFD y10, 25 * SIZE(Y1) + LFD y11, 26 * SIZE(Y1) + LFD y12, 27 * SIZE(Y1) + + FMADD y09, alpha1, a1, y09 + FMADD y10, alpha1, a2, y10 + FMADD y11, alpha1, a3, y11 + FMADD y12, alpha1, a4, y12 + + LFD a1, 0 * SIZE(AO2) + LFD a2, 1 * SIZE(AO2) + LFD a3, 2 * SIZE(AO2) + LFD a4, 3 * SIZE(AO2) + + STFD y13, 12 * SIZE(Y1) + STFD y14, 13 * SIZE(Y1) + STFD y15, 14 * SIZE(Y1) + STFD y16, 15 * SIZE(Y1) + + LFD y13, 28 * SIZE(Y1) + LFD y14, 29 * SIZE(Y1) + LFD y15, 30 * SIZE(Y1) + LFD y16, 31 * SIZE(Y1) + + FMADD y13, alpha1, a5, y13 + FMADD y14, alpha1, a6, y14 + FMADD y15, alpha1, a7, y15 + FMADD y16, alpha1, a8, y16 + + LFD a5, 4 * SIZE(AO2) + LFD a6, 5 * SIZE(AO2) + LFD a7, 6 * SIZE(AO2) + LFD a8, 7 * SIZE(AO2) + + FMADD y01, alpha2, a1, y01 + FMADD y02, alpha2, a2, y02 + FMADD y03, alpha2, a3, y03 + FMADD y04, alpha2, a4, y04 + + LFD a1, 8 * SIZE(AO2) + LFD a2, 9 * SIZE(AO2) + LFD a3, 10 * SIZE(AO2) + LFD a4, 11 * SIZE(AO2) + + FMADD y05, alpha2, a5, y05 + FMADD y06, alpha2, a6, y06 + FMADD y07, alpha2, a7, y07 + FMADD y08, alpha2, a8, y08 + + LFD a5, 12 * SIZE(AO2) + LFD a6, 13 * SIZE(AO2) + LFD a7, 14 * SIZE(AO2) + LFD a8, 15 * SIZE(AO2) + + FMADD y09, alpha2, a1, y09 + FMADD y10, alpha2, a2, y10 + FMADD y11, alpha2, a3, y11 + FMADD y12, alpha2, a4, y12 + + LFD a1, 0 * SIZE(AO3) + LFD a2, 1 * SIZE(AO3) + LFD a3, 2 * SIZE(AO3) + LFD a4, 3 * SIZE(AO3) + + FMADD y13, alpha2, a5, y13 + FMADD y14, alpha2, a6, y14 + FMADD y15, alpha2, a7, y15 + FMADD y16, alpha2, a8, y16 + + LFD a5, 4 * SIZE(AO3) + LFD a6, 5 * SIZE(AO3) + LFD a7, 6 * SIZE(AO3) + 
LFD a8, 7 * SIZE(AO3) + + FMADD y01, alpha3, a1, y01 + FMADD y02, alpha3, a2, y02 + FMADD y03, alpha3, a3, y03 + FMADD y04, alpha3, a4, y04 + + LFD a1, 8 * SIZE(AO3) + LFD a2, 9 * SIZE(AO3) + LFD a3, 10 * SIZE(AO3) + LFD a4, 11 * SIZE(AO3) + + FMADD y05, alpha3, a5, y05 + FMADD y06, alpha3, a6, y06 + FMADD y07, alpha3, a7, y07 + FMADD y08, alpha3, a8, y08 + + LFD a5, 12 * SIZE(AO3) + LFD a6, 13 * SIZE(AO3) + LFD a7, 14 * SIZE(AO3) + LFD a8, 15 * SIZE(AO3) + + FMADD y09, alpha3, a1, y09 + FMADD y10, alpha3, a2, y10 + FMADD y11, alpha3, a3, y11 + FMADD y12, alpha3, a4, y12 + + LFD a1, 0 * SIZE(AO4) + LFD a2, 1 * SIZE(AO4) + LFD a3, 2 * SIZE(AO4) + LFD a4, 3 * SIZE(AO4) + + FMADD y13, alpha3, a5, y13 + FMADD y14, alpha3, a6, y14 + FMADD y15, alpha3, a7, y15 + FMADD y16, alpha3, a8, y16 + + LFD a5, 4 * SIZE(AO4) + LFD a6, 5 * SIZE(AO4) + LFD a7, 6 * SIZE(AO4) + LFD a8, 7 * SIZE(AO4) + + FMADD y01, alpha4, a1, y01 + FMADD y02, alpha4, a2, y02 + FMADD y03, alpha4, a3, y03 + FMADD y04, alpha4, a4, y04 + + LFD a1, 8 * SIZE(AO4) + LFD a2, 9 * SIZE(AO4) + LFD a3, 10 * SIZE(AO4) + LFD a4, 11 * SIZE(AO4) + + FMADD y05, alpha4, a5, y05 + FMADD y06, alpha4, a6, y06 + FMADD y07, alpha4, a7, y07 + FMADD y08, alpha4, a8, y08 + + LFD a5, 12 * SIZE(AO4) + LFD a6, 13 * SIZE(AO4) + LFD a7, 14 * SIZE(AO4) + LFD a8, 15 * SIZE(AO4) + + addi AO1, AO1, 16 * SIZE + addi AO2, AO2, 16 * SIZE + addi AO3, AO3, 16 * SIZE + addi AO4, AO4, 16 * SIZE + + DCBT(AO1, PREA) + DCBT(AO2, PREA) + DCBT(AO3, PREA) + DCBT(AO4, PREA) + + FMADD y09, alpha4, a1, y09 + FMADD y10, alpha4, a2, y10 + FMADD y11, alpha4, a3, y11 + FMADD y12, alpha4, a4, y12 + + LFD a1, 0 * SIZE(AO5) + LFD a2, 1 * SIZE(AO5) + LFD a3, 2 * SIZE(AO5) + LFD a4, 3 * SIZE(AO5) + + FMADD y13, alpha4, a5, y13 + FMADD y14, alpha4, a6, y14 + FMADD y15, alpha4, a7, y15 + FMADD y16, alpha4, a8, y16 + + LFD a5, 4 * SIZE(AO5) + LFD a6, 5 * SIZE(AO5) + LFD a7, 6 * SIZE(AO5) + LFD a8, 7 * SIZE(AO5) + + FMADD y01, alpha5, a1, y01 + FMADD y02, alpha5, a2, y02 + FMADD y03, alpha5, a3, y03 + FMADD y04, alpha5, a4, y04 + + LFD a1, 8 * SIZE(AO5) + LFD a2, 9 * SIZE(AO5) + LFD a3, 10 * SIZE(AO5) + LFD a4, 11 * SIZE(AO5) + + FMADD y05, alpha5, a5, y05 + FMADD y06, alpha5, a6, y06 + FMADD y07, alpha5, a7, y07 + FMADD y08, alpha5, a8, y08 + + LFD a5, 12 * SIZE(AO5) + LFD a6, 13 * SIZE(AO5) + LFD a7, 14 * SIZE(AO5) + LFD a8, 15 * SIZE(AO5) + + FMADD y09, alpha5, a1, y09 + FMADD y10, alpha5, a2, y10 + FMADD y11, alpha5, a3, y11 + FMADD y12, alpha5, a4, y12 + + LFD a1, 0 * SIZE(AO6) + LFD a2, 1 * SIZE(AO6) + LFD a3, 2 * SIZE(AO6) + LFD a4, 3 * SIZE(AO6) + + FMADD y13, alpha5, a5, y13 + FMADD y14, alpha5, a6, y14 + FMADD y15, alpha5, a7, y15 + FMADD y16, alpha5, a8, y16 + + LFD a5, 4 * SIZE(AO6) + LFD a6, 5 * SIZE(AO6) + LFD a7, 6 * SIZE(AO6) + LFD a8, 7 * SIZE(AO6) + + FMADD y01, alpha6, a1, y01 + FMADD y02, alpha6, a2, y02 + FMADD y03, alpha6, a3, y03 + FMADD y04, alpha6, a4, y04 + + LFD a1, 8 * SIZE(AO6) + LFD a2, 9 * SIZE(AO6) + LFD a3, 10 * SIZE(AO6) + LFD a4, 11 * SIZE(AO6) + + FMADD y05, alpha6, a5, y05 + FMADD y06, alpha6, a6, y06 + FMADD y07, alpha6, a7, y07 + FMADD y08, alpha6, a8, y08 + + LFD a5, 12 * SIZE(AO6) + LFD a6, 13 * SIZE(AO6) + LFD a7, 14 * SIZE(AO6) + LFD a8, 15 * SIZE(AO6) + + FMADD y09, alpha6, a1, y09 + FMADD y10, alpha6, a2, y10 + FMADD y11, alpha6, a3, y11 + FMADD y12, alpha6, a4, y12 + + LFD a1, 0 * SIZE(AO7) + LFD a2, 1 * SIZE(AO7) + LFD a3, 2 * SIZE(AO7) + LFD a4, 3 * SIZE(AO7) + + FMADD y13, alpha6, a5, y13 + FMADD y14, alpha6, a6, y14 + FMADD y15, alpha6, a7, 
y15 + FMADD y16, alpha6, a8, y16 + + LFD a5, 4 * SIZE(AO7) + LFD a6, 5 * SIZE(AO7) + LFD a7, 6 * SIZE(AO7) + LFD a8, 7 * SIZE(AO7) + + FMADD y01, alpha7, a1, y01 + FMADD y02, alpha7, a2, y02 + FMADD y03, alpha7, a3, y03 + FMADD y04, alpha7, a4, y04 + + LFD a1, 8 * SIZE(AO7) + LFD a2, 9 * SIZE(AO7) + LFD a3, 10 * SIZE(AO7) + LFD a4, 11 * SIZE(AO7) + + FMADD y05, alpha7, a5, y05 + FMADD y06, alpha7, a6, y06 + FMADD y07, alpha7, a7, y07 + FMADD y08, alpha7, a8, y08 + + LFD a5, 12 * SIZE(AO7) + LFD a6, 13 * SIZE(AO7) + LFD a7, 14 * SIZE(AO7) + LFD a8, 15 * SIZE(AO7) + + FMADD y09, alpha7, a1, y09 + FMADD y10, alpha7, a2, y10 + FMADD y11, alpha7, a3, y11 + FMADD y12, alpha7, a4, y12 + + LFD a1, 0 * SIZE(AO8) + LFD a2, 1 * SIZE(AO8) + LFD a3, 2 * SIZE(AO8) + LFD a4, 3 * SIZE(AO8) + + FMADD y13, alpha7, a5, y13 + FMADD y14, alpha7, a6, y14 + FMADD y15, alpha7, a7, y15 + FMADD y16, alpha7, a8, y16 + + LFD a5, 4 * SIZE(AO8) + LFD a6, 5 * SIZE(AO8) + LFD a7, 6 * SIZE(AO8) + LFD a8, 7 * SIZE(AO8) + + FMADD y01, alpha8, a1, y01 + FMADD y02, alpha8, a2, y02 + FMADD y03, alpha8, a3, y03 + FMADD y04, alpha8, a4, y04 + + LFD a1, 8 * SIZE(AO8) + LFD a2, 9 * SIZE(AO8) + LFD a3, 10 * SIZE(AO8) + LFD a4, 11 * SIZE(AO8) + + FMADD y05, alpha8, a5, y05 + FMADD y06, alpha8, a6, y06 + FMADD y07, alpha8, a7, y07 + FMADD y08, alpha8, a8, y08 + + LFD a5, 12 * SIZE(AO8) + LFD a6, 13 * SIZE(AO8) + LFD a7, 14 * SIZE(AO8) + LFD a8, 15 * SIZE(AO8) + + addi AO5, AO5, 16 * SIZE + addi AO6, AO6, 16 * SIZE + addi AO7, AO7, 16 * SIZE + addi AO8, AO8, 16 * SIZE + + DCBT(AO5, PREA) + DCBT(AO6, PREA) + DCBT(AO7, PREA) + DCBT(AO8, PREA) + + FMADD y09, alpha8, a1, y09 + FMADD y10, alpha8, a2, y10 + FMADD y11, alpha8, a3, y11 + FMADD y12, alpha8, a4, y12 + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + + FMADD y13, alpha8, a5, y13 + FMADD y14, alpha8, a6, y14 + FMADD y15, alpha8, a7, y15 + FMADD y16, alpha8, a8, y16 + + LFD a5, 4 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + STFD y01, 16 * SIZE(Y1) + STFD y02, 17 * SIZE(Y1) + STFD y03, 18 * SIZE(Y1) + STFD y04, 19 * SIZE(Y1) + + LFD y01, 32 * SIZE(Y1) + LFD y02, 33 * SIZE(Y1) + LFD y03, 34 * SIZE(Y1) + LFD y04, 35 * SIZE(Y1) + + DCBT(Y1, PREC) + addi Y1, Y1, 16 * SIZE + bdnz LL(12) + .align 4 + +LL(13): + STFD y05, 4 * SIZE(Y1) + STFD y06, 5 * SIZE(Y1) + STFD y07, 6 * SIZE(Y1) + STFD y08, 7 * SIZE(Y1) + + STFD y09, 8 * SIZE(Y1) + STFD y10, 9 * SIZE(Y1) + STFD y11, 10 * SIZE(Y1) + STFD y12, 11 * SIZE(Y1) + + STFD y13, 12 * SIZE(Y1) + STFD y14, 13 * SIZE(Y1) + STFD y15, 14 * SIZE(Y1) + STFD y16, 15 * SIZE(Y1) + + addi Y1, Y1, 16 * SIZE + .align 4 + +LL(15): + andi. r0, M, 15 + ble LL(19) + + andi. 
r0, M, 8 + ble LL(16) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + LFD y03, 2 * SIZE(Y1) + LFD y04, 3 * SIZE(Y1) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + + LFD y05, 4 * SIZE(Y1) + LFD y06, 5 * SIZE(Y1) + LFD y07, 6 * SIZE(Y1) + LFD y08, 7 * SIZE(Y1) + + LFD a5, 4 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + FMADD y01, alpha1, a1, y01 + LFD a1, 0 * SIZE(AO2) + FMADD y02, alpha1, a2, y02 + LFD a2, 1 * SIZE(AO2) + FMADD y03, alpha1, a3, y03 + LFD a3, 2 * SIZE(AO2) + FMADD y04, alpha1, a4, y04 + LFD a4, 3 * SIZE(AO2) + + FMADD y05, alpha1, a5, y05 + LFD a5, 4 * SIZE(AO2) + FMADD y06, alpha1, a6, y06 + LFD a6, 5 * SIZE(AO2) + FMADD y07, alpha1, a7, y07 + LFD a7, 6 * SIZE(AO2) + FMADD y08, alpha1, a8, y08 + LFD a8, 7 * SIZE(AO2) + + FMADD y01, alpha2, a1, y01 + LFD a1, 0 * SIZE(AO3) + FMADD y02, alpha2, a2, y02 + LFD a2, 1 * SIZE(AO3) + + FMADD y03, alpha2, a3, y03 + LFD a3, 2 * SIZE(AO3) + FMADD y04, alpha2, a4, y04 + LFD a4, 3 * SIZE(AO3) + + FMADD y05, alpha2, a5, y05 + LFD a5, 4 * SIZE(AO3) + FMADD y06, alpha2, a6, y06 + LFD a6, 5 * SIZE(AO3) + + FMADD y07, alpha2, a7, y07 + LFD a7, 6 * SIZE(AO3) + FMADD y08, alpha2, a8, y08 + LFD a8, 7 * SIZE(AO3) + + FMADD y01, alpha3, a1, y01 + LFD a1, 0 * SIZE(AO4) + FMADD y02, alpha3, a2, y02 + LFD a2, 1 * SIZE(AO4) + + FMADD y03, alpha3, a3, y03 + LFD a3, 2 * SIZE(AO4) + FMADD y04, alpha3, a4, y04 + LFD a4, 3 * SIZE(AO4) + + FMADD y05, alpha3, a5, y05 + LFD a5, 4 * SIZE(AO4) + FMADD y06, alpha3, a6, y06 + LFD a6, 5 * SIZE(AO4) + + FMADD y07, alpha3, a7, y07 + LFD a7, 6 * SIZE(AO4) + FMADD y08, alpha3, a8, y08 + LFD a8, 7 * SIZE(AO4) + + FMADD y01, alpha4, a1, y01 + LFD a1, 0 * SIZE(AO5) + FMADD y02, alpha4, a2, y02 + LFD a2, 1 * SIZE(AO5) + + FMADD y03, alpha4, a3, y03 + LFD a3, 2 * SIZE(AO5) + FMADD y04, alpha4, a4, y04 + LFD a4, 3 * SIZE(AO5) + + FMADD y05, alpha4, a5, y05 + LFD a5, 4 * SIZE(AO5) + FMADD y06, alpha4, a6, y06 + LFD a6, 5 * SIZE(AO5) + + FMADD y07, alpha4, a7, y07 + LFD a7, 6 * SIZE(AO5) + FMADD y08, alpha4, a8, y08 + LFD a8, 7 * SIZE(AO5) + + FMADD y01, alpha5, a1, y01 + LFD a1, 0 * SIZE(AO6) + FMADD y02, alpha5, a2, y02 + LFD a2, 1 * SIZE(AO6) + + FMADD y03, alpha5, a3, y03 + LFD a3, 2 * SIZE(AO6) + FMADD y04, alpha5, a4, y04 + LFD a4, 3 * SIZE(AO6) + + FMADD y05, alpha5, a5, y05 + LFD a5, 4 * SIZE(AO6) + FMADD y06, alpha5, a6, y06 + LFD a6, 5 * SIZE(AO6) + + FMADD y07, alpha5, a7, y07 + LFD a7, 6 * SIZE(AO6) + FMADD y08, alpha5, a8, y08 + LFD a8, 7 * SIZE(AO6) + + FMADD y01, alpha6, a1, y01 + LFD a1, 0 * SIZE(AO7) + FMADD y02, alpha6, a2, y02 + LFD a2, 1 * SIZE(AO7) + + FMADD y03, alpha6, a3, y03 + LFD a3, 2 * SIZE(AO7) + FMADD y04, alpha6, a4, y04 + LFD a4, 3 * SIZE(AO7) + + FMADD y05, alpha6, a5, y05 + LFD a5, 4 * SIZE(AO7) + FMADD y06, alpha6, a6, y06 + LFD a6, 5 * SIZE(AO7) + + FMADD y07, alpha6, a7, y07 + LFD a7, 6 * SIZE(AO7) + FMADD y08, alpha6, a8, y08 + LFD a8, 7 * SIZE(AO7) + + FMADD y01, alpha7, a1, y01 + LFD a1, 0 * SIZE(AO8) + FMADD y02, alpha7, a2, y02 + LFD a2, 1 * SIZE(AO8) + + FMADD y03, alpha7, a3, y03 + LFD a3, 2 * SIZE(AO8) + FMADD y04, alpha7, a4, y04 + LFD a4, 3 * SIZE(AO8) + + FMADD y05, alpha7, a5, y05 + LFD a5, 4 * SIZE(AO8) + FMADD y06, alpha7, a6, y06 + LFD a6, 5 * SIZE(AO8) + + FMADD y07, alpha7, a7, y07 + LFD a7, 6 * SIZE(AO8) + FMADD y08, alpha7, a8, y08 + LFD a8, 7 * SIZE(AO8) + + FMADD y01, alpha8, a1, y01 + addi AO1, AO1, 8 * SIZE + FMADD y02, alpha8, a2, y02 + addi AO2, AO2, 8 * SIZE + FMADD 
y03, alpha8, a3, y03 + addi AO3, AO3, 8 * SIZE + FMADD y04, alpha8, a4, y04 + addi AO4, AO4, 8 * SIZE + + STFD y01, 0 * SIZE(Y1) + STFD y02, 1 * SIZE(Y1) + STFD y03, 2 * SIZE(Y1) + STFD y04, 3 * SIZE(Y1) + + FMADD y05, alpha8, a5, y05 + addi AO5, AO5, 8 * SIZE + FMADD y06, alpha8, a6, y06 + addi AO6, AO6, 8 * SIZE + FMADD y07, alpha8, a7, y07 + addi AO7, AO7, 8 * SIZE + FMADD y08, alpha8, a8, y08 + addi AO8, AO8, 8 * SIZE + + STFD y05, 4 * SIZE(Y1) + STFD y06, 5 * SIZE(Y1) + STFD y07, 6 * SIZE(Y1) + STFD y08, 7 * SIZE(Y1) + addi Y1, Y1, 8 * SIZE + .align 4 + +LL(16): + andi. r0, M, 4 + ble LL(17) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + LFD y03, 2 * SIZE(Y1) + LFD y04, 3 * SIZE(Y1) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + + LFD a5, 0 * SIZE(AO2) + LFD a6, 1 * SIZE(AO2) + LFD a7, 2 * SIZE(AO2) + LFD a8, 3 * SIZE(AO2) + + FMADD y01, alpha1, a1, y01 + LFD a1, 0 * SIZE(AO3) + FMADD y02, alpha1, a2, y02 + LFD a2, 1 * SIZE(AO3) + FMADD y03, alpha1, a3, y03 + LFD a3, 2 * SIZE(AO3) + FMADD y04, alpha1, a4, y04 + LFD a4, 3 * SIZE(AO3) + + FMADD y01, alpha2, a5, y01 + LFD a5, 0 * SIZE(AO4) + FMADD y02, alpha2, a6, y02 + LFD a6, 1 * SIZE(AO4) + FMADD y03, alpha2, a7, y03 + LFD a7, 2 * SIZE(AO4) + FMADD y04, alpha2, a8, y04 + LFD a8, 3 * SIZE(AO4) + + FMADD y01, alpha3, a1, y01 + LFD a1, 0 * SIZE(AO5) + FMADD y02, alpha3, a2, y02 + LFD a2, 1 * SIZE(AO5) + FMADD y03, alpha3, a3, y03 + LFD a3, 2 * SIZE(AO5) + FMADD y04, alpha3, a4, y04 + LFD a4, 3 * SIZE(AO5) + + FMADD y01, alpha4, a5, y01 + LFD a5, 0 * SIZE(AO6) + FMADD y02, alpha4, a6, y02 + LFD a6, 1 * SIZE(AO6) + FMADD y03, alpha4, a7, y03 + LFD a7, 2 * SIZE(AO6) + FMADD y04, alpha4, a8, y04 + LFD a8, 3 * SIZE(AO6) + + FMADD y01, alpha5, a1, y01 + LFD a1, 0 * SIZE(AO7) + FMADD y02, alpha5, a2, y02 + LFD a2, 1 * SIZE(AO7) + FMADD y03, alpha5, a3, y03 + LFD a3, 2 * SIZE(AO7) + FMADD y04, alpha5, a4, y04 + LFD a4, 3 * SIZE(AO7) + + FMADD y01, alpha6, a5, y01 + LFD a5, 0 * SIZE(AO8) + FMADD y02, alpha6, a6, y02 + LFD a6, 1 * SIZE(AO8) + FMADD y03, alpha6, a7, y03 + LFD a7, 2 * SIZE(AO8) + FMADD y04, alpha6, a8, y04 + LFD a8, 3 * SIZE(AO8) + + FMADD y01, alpha7, a1, y01 + addi AO1, AO1, 4 * SIZE + FMADD y02, alpha7, a2, y02 + addi AO2, AO2, 4 * SIZE + FMADD y03, alpha7, a3, y03 + addi AO3, AO3, 4 * SIZE + FMADD y04, alpha7, a4, y04 + addi AO4, AO4, 4 * SIZE + + FMADD y01, alpha8, a5, y01 + addi AO5, AO5, 4 * SIZE + FMADD y02, alpha8, a6, y02 + addi AO6, AO6, 4 * SIZE + FMADD y03, alpha8, a7, y03 + addi AO7, AO7, 4 * SIZE + FMADD y04, alpha8, a8, y04 + addi AO8, AO8, 4 * SIZE + + STFD y01, 0 * SIZE(Y1) + STFD y02, 1 * SIZE(Y1) + STFD y03, 2 * SIZE(Y1) + STFD y04, 3 * SIZE(Y1) + addi Y1, Y1, 4 * SIZE + .align 4 + +LL(17): + andi. 
r0, M, 2 + ble LL(18) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 0 * SIZE(AO2) + LFD a4, 1 * SIZE(AO2) + + LFD a5, 0 * SIZE(AO3) + LFD a6, 1 * SIZE(AO3) + LFD a7, 0 * SIZE(AO4) + LFD a8, 1 * SIZE(AO4) + + FMADD y01, alpha1, a1, y01 + LFD a1, 0 * SIZE(AO5) + FMADD y02, alpha1, a2, y02 + LFD a2, 1 * SIZE(AO5) + FMADD y01, alpha2, a3, y01 + LFD a3, 0 * SIZE(AO6) + FMADD y02, alpha2, a4, y02 + LFD a4, 1 * SIZE(AO6) + + FMADD y01, alpha3, a5, y01 + LFD a5, 0 * SIZE(AO7) + FMADD y02, alpha3, a6, y02 + LFD a6, 1 * SIZE(AO7) + FMADD y01, alpha4, a7, y01 + LFD a7, 0 * SIZE(AO8) + FMADD y02, alpha4, a8, y02 + LFD a8, 1 * SIZE(AO8) + + FMADD y01, alpha5, a1, y01 + addi AO1, AO1, 2 * SIZE + FMADD y02, alpha5, a2, y02 + addi AO2, AO2, 2 * SIZE + FMADD y01, alpha6, a3, y01 + addi AO3, AO3, 2 * SIZE + FMADD y02, alpha6, a4, y02 + addi AO4, AO4, 2 * SIZE + + FMADD y01, alpha7, a5, y01 + addi AO5, AO5, 2 * SIZE + FMADD y02, alpha7, a6, y02 + addi AO6, AO6, 2 * SIZE + FMADD y01, alpha8, a7, y01 + addi AO7, AO7, 2 * SIZE + FMADD y02, alpha8, a8, y02 + addi AO8, AO8, 2 * SIZE + + STFD y01, 0 * SIZE(Y1) + STFD y02, 1 * SIZE(Y1) + addi Y1, Y1, 2 * SIZE + .align 4 + +LL(18): + andi. r0, M, 1 + ble LL(19) + + LFD y01, 0 * SIZE(Y1) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 0 * SIZE(AO2) + LFD a3, 0 * SIZE(AO3) + LFD a4, 0 * SIZE(AO4) + LFD a5, 0 * SIZE(AO5) + LFD a6, 0 * SIZE(AO6) + LFD a7, 0 * SIZE(AO7) + LFD a8, 0 * SIZE(AO8) + + FMADD y01, alpha1, a1, y01 + FMADD y01, alpha2, a2, y01 + FMADD y01, alpha3, a3, y01 + FMADD y01, alpha4, a4, y01 + + FMADD y01, alpha5, a5, y01 + FMADD y01, alpha6, a6, y01 + FMADD y01, alpha7, a7, y01 + FMADD y01, alpha8, a8, y01 + + STFD y01, 0 * SIZE(Y1) + .align 4 + +LL(19): + addi J, J, -1 + lfd alpha, ALPHA + cmpi cr0, 0, J, 0 + bgt LL(11) + .align 4 + +LL(20): + andi. J, N, 4 + mr AO1, A + add AO2, A, LDA + ble LL(30) + .align 4 + + LFD alpha1, 0 * SIZE(X) + add X, X, INCX + LFD alpha2, 0 * SIZE(X) + add X, X, INCX + LFD alpha3, 0 * SIZE(X) + add X, X, INCX + LFD alpha4, 0 * SIZE(X) + add X, X, INCX + + FMUL alpha1, alpha, alpha1 + add AO3, AO2, LDA + FMUL alpha2, alpha, alpha2 + add AO4, AO3, LDA + FMUL alpha3, alpha, alpha3 + add A, AO4, LDA + FMUL alpha4, alpha, alpha4 + mr Y1, YY + + srawi. 
r0, M, 4 + mtspr CTR, r0 + ble LL(25) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + LFD y03, 2 * SIZE(Y1) + LFD y04, 3 * SIZE(Y1) + LFD y05, 4 * SIZE(Y1) + LFD y06, 5 * SIZE(Y1) + LFD y07, 6 * SIZE(Y1) + LFD y08, 7 * SIZE(Y1) + LFD y09, 8 * SIZE(Y1) + LFD y10, 9 * SIZE(Y1) + LFD y11, 10 * SIZE(Y1) + LFD y12, 11 * SIZE(Y1) + LFD y13, 12 * SIZE(Y1) + LFD y14, 13 * SIZE(Y1) + LFD y15, 14 * SIZE(Y1) + LFD y16, 15 * SIZE(Y1) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + bdz LL(23) + .align 4 + +LL(22): + FMADD y01, alpha1, a1, y01 + LFD a1, 8 * SIZE(AO1) + FMADD y02, alpha1, a2, y02 + LFD a2, 9 * SIZE(AO1) + FMADD y03, alpha1, a3, y03 + LFD a3, 10 * SIZE(AO1) + FMADD y04, alpha1, a4, y04 + LFD a4, 11 * SIZE(AO1) + + FMADD y05, alpha1, a5, y05 + LFD a5, 12 * SIZE(AO1) + FMADD y06, alpha1, a6, y06 + LFD a6, 13 * SIZE(AO1) + FMADD y07, alpha1, a7, y07 + LFD a7, 14 * SIZE(AO1) + FMADD y08, alpha1, a8, y08 + LFD a8, 15 * SIZE(AO1) + + FMADD y09, alpha1, a1, y09 + LFD a1, 0 * SIZE(AO2) + FMADD y10, alpha1, a2, y10 + LFD a2, 1 * SIZE(AO2) + FMADD y11, alpha1, a3, y11 + LFD a3, 2 * SIZE(AO2) + FMADD y12, alpha1, a4, y12 + LFD a4, 3 * SIZE(AO2) + + FMADD y13, alpha1, a5, y13 + LFD a5, 4 * SIZE(AO2) + FMADD y14, alpha1, a6, y14 + LFD a6, 5 * SIZE(AO2) + FMADD y15, alpha1, a7, y15 + LFD a7, 6 * SIZE(AO2) + FMADD y16, alpha1, a8, y16 + LFD a8, 7 * SIZE(AO2) + + FMADD y01, alpha2, a1, y01 + LFD a1, 8 * SIZE(AO2) + FMADD y02, alpha2, a2, y02 + LFD a2, 9 * SIZE(AO2) + FMADD y03, alpha2, a3, y03 + LFD a3, 10 * SIZE(AO2) + FMADD y04, alpha2, a4, y04 + LFD a4, 11 * SIZE(AO2) + + FMADD y05, alpha2, a5, y05 + LFD a5, 12 * SIZE(AO2) + FMADD y06, alpha2, a6, y06 + LFD a6, 13 * SIZE(AO2) + FMADD y07, alpha2, a7, y07 + LFD a7, 14 * SIZE(AO2) + FMADD y08, alpha2, a8, y08 + LFD a8, 15 * SIZE(AO2) + + addi AO1, AO1, 16 * SIZE + addi AO2, AO2, 16 * SIZE + DCBT(AO1, PREA) + DCBT(AO2, PREA) + + FMADD y09, alpha2, a1, y09 + LFD a1, 0 * SIZE(AO3) + FMADD y10, alpha2, a2, y10 + LFD a2, 1 * SIZE(AO3) + FMADD y11, alpha2, a3, y11 + LFD a3, 2 * SIZE(AO3) + FMADD y12, alpha2, a4, y12 + LFD a4, 3 * SIZE(AO3) + + FMADD y13, alpha2, a5, y13 + LFD a5, 4 * SIZE(AO3) + FMADD y14, alpha2, a6, y14 + LFD a6, 5 * SIZE(AO3) + FMADD y15, alpha2, a7, y15 + LFD a7, 6 * SIZE(AO3) + FMADD y16, alpha2, a8, y16 + LFD a8, 7 * SIZE(AO3) + + FMADD y01, alpha3, a1, y01 + LFD a1, 8 * SIZE(AO3) + FMADD y02, alpha3, a2, y02 + LFD a2, 9 * SIZE(AO3) + FMADD y03, alpha3, a3, y03 + LFD a3, 10 * SIZE(AO3) + FMADD y04, alpha3, a4, y04 + LFD a4, 11 * SIZE(AO3) + + FMADD y05, alpha3, a5, y05 + LFD a5, 12 * SIZE(AO3) + FMADD y06, alpha3, a6, y06 + LFD a6, 13 * SIZE(AO3) + FMADD y07, alpha3, a7, y07 + LFD a7, 14 * SIZE(AO3) + FMADD y08, alpha3, a8, y08 + LFD a8, 15 * SIZE(AO3) + + FMADD y09, alpha3, a1, y09 + LFD a1, 0 * SIZE(AO4) + FMADD y10, alpha3, a2, y10 + LFD a2, 1 * SIZE(AO4) + FMADD y11, alpha3, a3, y11 + LFD a3, 2 * SIZE(AO4) + FMADD y12, alpha3, a4, y12 + LFD a4, 3 * SIZE(AO4) + + FMADD y13, alpha3, a5, y13 + LFD a5, 4 * SIZE(AO4) + FMADD y14, alpha3, a6, y14 + LFD a6, 5 * SIZE(AO4) + FMADD y15, alpha3, a7, y15 + LFD a7, 6 * SIZE(AO4) + FMADD y16, alpha3, a8, y16 + LFD a8, 7 * SIZE(AO4) + + FMADD y01, alpha4, a1, y01 + LFD a1, 8 * SIZE(AO4) + FMADD y02, alpha4, a2, y02 + LFD a2, 9 * SIZE(AO4) + FMADD y03, alpha4, a3, y03 + LFD a3, 10 * SIZE(AO4) + FMADD y04, alpha4, a4, y04 + LFD a4, 11 * SIZE(AO4) + + 
STFD y01, 0 * SIZE(Y1) + STFD y02, 1 * SIZE(Y1) + STFD y03, 2 * SIZE(Y1) + STFD y04, 3 * SIZE(Y1) + + LFD y01, 16 * SIZE(Y1) + LFD y02, 17 * SIZE(Y1) + LFD y03, 18 * SIZE(Y1) + LFD y04, 19 * SIZE(Y1) + + FMADD y05, alpha4, a5, y05 + LFD a5, 12 * SIZE(AO4) + FMADD y06, alpha4, a6, y06 + LFD a6, 13 * SIZE(AO4) + FMADD y07, alpha4, a7, y07 + LFD a7, 14 * SIZE(AO4) + FMADD y08, alpha4, a8, y08 + LFD a8, 15 * SIZE(AO4) + + STFD y05, 4 * SIZE(Y1) + STFD y06, 5 * SIZE(Y1) + STFD y07, 6 * SIZE(Y1) + STFD y08, 7 * SIZE(Y1) + + LFD y05, 20 * SIZE(Y1) + LFD y06, 21 * SIZE(Y1) + LFD y07, 22 * SIZE(Y1) + LFD y08, 23 * SIZE(Y1) + + addi AO3, AO3, 16 * SIZE + addi AO4, AO4, 16 * SIZE + DCBT(AO3, PREA) + DCBT(AO4, PREA) + + FMADD y09, alpha4, a1, y09 + LFD a1, 0 * SIZE(AO1) + FMADD y10, alpha4, a2, y10 + LFD a2, 1 * SIZE(AO1) + FMADD y11, alpha4, a3, y11 + LFD a3, 2 * SIZE(AO1) + FMADD y12, alpha4, a4, y12 + LFD a4, 3 * SIZE(AO1) + + STFD y09, 8 * SIZE(Y1) + STFD y10, 9 * SIZE(Y1) + STFD y11, 10 * SIZE(Y1) + STFD y12, 11 * SIZE(Y1) + + LFD y09, 24 * SIZE(Y1) + LFD y10, 25 * SIZE(Y1) + LFD y11, 26 * SIZE(Y1) + LFD y12, 27 * SIZE(Y1) + + FMADD y13, alpha4, a5, y13 + LFD a5, 4 * SIZE(AO1) + FMADD y14, alpha4, a6, y14 + LFD a6, 5 * SIZE(AO1) + FMADD y15, alpha4, a7, y15 + LFD a7, 6 * SIZE(AO1) + FMADD y16, alpha4, a8, y16 + LFD a8, 7 * SIZE(AO1) + + STFD y13, 12 * SIZE(Y1) + STFD y14, 13 * SIZE(Y1) + STFD y15, 14 * SIZE(Y1) + STFD y16, 15 * SIZE(Y1) + + LFD y13, 28 * SIZE(Y1) + LFD y14, 29 * SIZE(Y1) + LFD y15, 30 * SIZE(Y1) + LFD y16, 31 * SIZE(Y1) + + addi Y1, Y1, 16 * SIZE + DCBT(Y1, PREC) + bdnz LL(22) + .align 4 + +LL(23): + FMADD y01, alpha1, a1, y01 + LFD a1, 8 * SIZE(AO1) + FMADD y02, alpha1, a2, y02 + LFD a2, 9 * SIZE(AO1) + FMADD y03, alpha1, a3, y03 + LFD a3, 10 * SIZE(AO1) + FMADD y04, alpha1, a4, y04 + LFD a4, 11 * SIZE(AO1) + + FMADD y05, alpha1, a5, y05 + LFD a5, 12 * SIZE(AO1) + FMADD y06, alpha1, a6, y06 + LFD a6, 13 * SIZE(AO1) + FMADD y07, alpha1, a7, y07 + LFD a7, 14 * SIZE(AO1) + FMADD y08, alpha1, a8, y08 + LFD a8, 15 * SIZE(AO1) + + FMADD y09, alpha1, a1, y09 + LFD a1, 0 * SIZE(AO2) + FMADD y10, alpha1, a2, y10 + LFD a2, 1 * SIZE(AO2) + FMADD y11, alpha1, a3, y11 + LFD a3, 2 * SIZE(AO2) + FMADD y12, alpha1, a4, y12 + LFD a4, 3 * SIZE(AO2) + + FMADD y13, alpha1, a5, y13 + LFD a5, 4 * SIZE(AO2) + FMADD y14, alpha1, a6, y14 + LFD a6, 5 * SIZE(AO2) + FMADD y15, alpha1, a7, y15 + LFD a7, 6 * SIZE(AO2) + FMADD y16, alpha1, a8, y16 + LFD a8, 7 * SIZE(AO2) + + FMADD y01, alpha2, a1, y01 + LFD a1, 8 * SIZE(AO2) + FMADD y02, alpha2, a2, y02 + LFD a2, 9 * SIZE(AO2) + FMADD y03, alpha2, a3, y03 + LFD a3, 10 * SIZE(AO2) + FMADD y04, alpha2, a4, y04 + LFD a4, 11 * SIZE(AO2) + + FMADD y05, alpha2, a5, y05 + LFD a5, 12 * SIZE(AO2) + FMADD y06, alpha2, a6, y06 + LFD a6, 13 * SIZE(AO2) + FMADD y07, alpha2, a7, y07 + LFD a7, 14 * SIZE(AO2) + FMADD y08, alpha2, a8, y08 + LFD a8, 15 * SIZE(AO2) + + FMADD y09, alpha2, a1, y09 + LFD a1, 0 * SIZE(AO3) + FMADD y10, alpha2, a2, y10 + LFD a2, 1 * SIZE(AO3) + FMADD y11, alpha2, a3, y11 + LFD a3, 2 * SIZE(AO3) + FMADD y12, alpha2, a4, y12 + LFD a4, 3 * SIZE(AO3) + + FMADD y13, alpha2, a5, y13 + LFD a5, 4 * SIZE(AO3) + FMADD y14, alpha2, a6, y14 + LFD a6, 5 * SIZE(AO3) + FMADD y15, alpha2, a7, y15 + LFD a7, 6 * SIZE(AO3) + FMADD y16, alpha2, a8, y16 + LFD a8, 7 * SIZE(AO3) + + FMADD y01, alpha3, a1, y01 + LFD a1, 8 * SIZE(AO3) + FMADD y02, alpha3, a2, y02 + LFD a2, 9 * SIZE(AO3) + FMADD y03, alpha3, a3, y03 + LFD a3, 10 * SIZE(AO3) + FMADD y04, alpha3, a4, y04 + LFD 
a4, 11 * SIZE(AO3) + + FMADD y05, alpha3, a5, y05 + LFD a5, 12 * SIZE(AO3) + FMADD y06, alpha3, a6, y06 + LFD a6, 13 * SIZE(AO3) + FMADD y07, alpha3, a7, y07 + LFD a7, 14 * SIZE(AO3) + FMADD y08, alpha3, a8, y08 + LFD a8, 15 * SIZE(AO3) + + FMADD y09, alpha3, a1, y09 + LFD a1, 0 * SIZE(AO4) + FMADD y10, alpha3, a2, y10 + LFD a2, 1 * SIZE(AO4) + FMADD y11, alpha3, a3, y11 + LFD a3, 2 * SIZE(AO4) + FMADD y12, alpha3, a4, y12 + LFD a4, 3 * SIZE(AO4) + + FMADD y13, alpha3, a5, y13 + LFD a5, 4 * SIZE(AO4) + FMADD y14, alpha3, a6, y14 + LFD a6, 5 * SIZE(AO4) + FMADD y15, alpha3, a7, y15 + LFD a7, 6 * SIZE(AO4) + FMADD y16, alpha3, a8, y16 + LFD a8, 7 * SIZE(AO4) + + FMADD y01, alpha4, a1, y01 + LFD a1, 8 * SIZE(AO4) + FMADD y02, alpha4, a2, y02 + LFD a2, 9 * SIZE(AO4) + FMADD y03, alpha4, a3, y03 + LFD a3, 10 * SIZE(AO4) + FMADD y04, alpha4, a4, y04 + LFD a4, 11 * SIZE(AO4) + + FMADD y05, alpha4, a5, y05 + LFD a5, 12 * SIZE(AO4) + FMADD y06, alpha4, a6, y06 + LFD a6, 13 * SIZE(AO4) + FMADD y07, alpha4, a7, y07 + LFD a7, 14 * SIZE(AO4) + FMADD y08, alpha4, a8, y08 + LFD a8, 15 * SIZE(AO4) + + FMADD y09, alpha4, a1, y09 + addi AO1, AO1, 16 * SIZE + FMADD y10, alpha4, a2, y10 + addi AO2, AO2, 16 * SIZE + FMADD y11, alpha4, a3, y11 + addi AO3, AO3, 16 * SIZE + FMADD y12, alpha4, a4, y12 + addi AO4, AO4, 16 * SIZE + + FMADD y13, alpha4, a5, y13 + FMADD y14, alpha4, a6, y14 + FMADD y15, alpha4, a7, y15 + FMADD y16, alpha4, a8, y16 + + STFD y01, 0 * SIZE(Y1) + STFD y02, 1 * SIZE(Y1) + STFD y03, 2 * SIZE(Y1) + STFD y04, 3 * SIZE(Y1) + STFD y05, 4 * SIZE(Y1) + STFD y06, 5 * SIZE(Y1) + STFD y07, 6 * SIZE(Y1) + STFD y08, 7 * SIZE(Y1) + STFD y09, 8 * SIZE(Y1) + STFD y10, 9 * SIZE(Y1) + STFD y11, 10 * SIZE(Y1) + STFD y12, 11 * SIZE(Y1) + STFD y13, 12 * SIZE(Y1) + STFD y14, 13 * SIZE(Y1) + STFD y15, 14 * SIZE(Y1) + STFD y16, 15 * SIZE(Y1) + addi Y1, Y1, 16 * SIZE + .align 4 + +LL(25): + andi. r0, M, 15 + ble LL(30) + + andi. 
r0, M, 8 + ble LL(26) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + LFD y03, 2 * SIZE(Y1) + LFD y04, 3 * SIZE(Y1) + LFD y05, 4 * SIZE(Y1) + LFD y06, 5 * SIZE(Y1) + LFD y07, 6 * SIZE(Y1) + LFD y08, 7 * SIZE(Y1) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + FMADD y01, alpha1, a1, y01 + LFD a1, 0 * SIZE(AO2) + FMADD y02, alpha1, a2, y02 + LFD a2, 1 * SIZE(AO2) + FMADD y03, alpha1, a3, y03 + LFD a3, 2 * SIZE(AO2) + FMADD y04, alpha1, a4, y04 + LFD a4, 3 * SIZE(AO2) + + FMADD y05, alpha1, a5, y05 + LFD a5, 4 * SIZE(AO2) + FMADD y06, alpha1, a6, y06 + LFD a6, 5 * SIZE(AO2) + FMADD y07, alpha1, a7, y07 + LFD a7, 6 * SIZE(AO2) + FMADD y08, alpha1, a8, y08 + LFD a8, 7 * SIZE(AO2) + + FMADD y01, alpha2, a1, y01 + LFD a1, 0 * SIZE(AO3) + FMADD y02, alpha2, a2, y02 + LFD a2, 1 * SIZE(AO3) + FMADD y03, alpha2, a3, y03 + LFD a3, 2 * SIZE(AO3) + FMADD y04, alpha2, a4, y04 + LFD a4, 3 * SIZE(AO3) + FMADD y05, alpha2, a5, y05 + LFD a5, 4 * SIZE(AO3) + FMADD y06, alpha2, a6, y06 + LFD a6, 5 * SIZE(AO3) + FMADD y07, alpha2, a7, y07 + LFD a7, 6 * SIZE(AO3) + FMADD y08, alpha2, a8, y08 + LFD a8, 7 * SIZE(AO3) + + FMADD y01, alpha3, a1, y01 + LFD a1, 0 * SIZE(AO4) + FMADD y02, alpha3, a2, y02 + LFD a2, 1 * SIZE(AO4) + FMADD y03, alpha3, a3, y03 + LFD a3, 2 * SIZE(AO4) + FMADD y04, alpha3, a4, y04 + LFD a4, 3 * SIZE(AO4) + + FMADD y05, alpha3, a5, y05 + LFD a5, 4 * SIZE(AO4) + FMADD y06, alpha3, a6, y06 + LFD a6, 5 * SIZE(AO4) + FMADD y07, alpha3, a7, y07 + LFD a7, 6 * SIZE(AO4) + FMADD y08, alpha3, a8, y08 + LFD a8, 7 * SIZE(AO4) + + FMADD y01, alpha4, a1, y01 + addi AO1, AO1, 8 * SIZE + FMADD y02, alpha4, a2, y02 + addi AO2, AO2, 8 * SIZE + FMADD y03, alpha4, a3, y03 + addi AO3, AO3, 8 * SIZE + FMADD y04, alpha4, a4, y04 + addi AO4, AO4, 8 * SIZE + + STFD y01, 0 * SIZE(Y1) + STFD y02, 1 * SIZE(Y1) + STFD y03, 2 * SIZE(Y1) + STFD y04, 3 * SIZE(Y1) + + FMADD y05, alpha4, a5, y05 + FMADD y06, alpha4, a6, y06 + FMADD y07, alpha4, a7, y07 + FMADD y08, alpha4, a8, y08 + + STFD y05, 4 * SIZE(Y1) + STFD y06, 5 * SIZE(Y1) + STFD y07, 6 * SIZE(Y1) + STFD y08, 7 * SIZE(Y1) + addi Y1, Y1, 8 * SIZE + .align 4 + +LL(26): + andi. r0, M, 4 + ble LL(27) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + LFD y03, 2 * SIZE(Y1) + LFD y04, 3 * SIZE(Y1) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + + LFD a5, 0 * SIZE(AO2) + LFD a6, 1 * SIZE(AO2) + LFD a7, 2 * SIZE(AO2) + LFD a8, 3 * SIZE(AO2) + + FMADD y01, alpha1, a1, y01 + LFD a1, 0 * SIZE(AO3) + FMADD y02, alpha1, a2, y02 + LFD a2, 1 * SIZE(AO3) + FMADD y03, alpha1, a3, y03 + LFD a3, 2 * SIZE(AO3) + FMADD y04, alpha1, a4, y04 + LFD a4, 3 * SIZE(AO3) + + FMADD y01, alpha2, a5, y01 + LFD a5, 0 * SIZE(AO4) + FMADD y02, alpha2, a6, y02 + LFD a6, 1 * SIZE(AO4) + FMADD y03, alpha2, a7, y03 + LFD a7, 2 * SIZE(AO4) + FMADD y04, alpha2, a8, y04 + LFD a8, 3 * SIZE(AO4) + + FMADD y01, alpha3, a1, y01 + addi AO1, AO1, 4 * SIZE + FMADD y02, alpha3, a2, y02 + addi AO2, AO2, 4 * SIZE + FMADD y03, alpha3, a3, y03 + addi AO3, AO3, 4 * SIZE + FMADD y04, alpha3, a4, y04 + addi AO4, AO4, 4 * SIZE + + FMADD y01, alpha4, a5, y01 + FMADD y02, alpha4, a6, y02 + FMADD y03, alpha4, a7, y03 + FMADD y04, alpha4, a8, y04 + + STFD y01, 0 * SIZE(Y1) + STFD y02, 1 * SIZE(Y1) + STFD y03, 2 * SIZE(Y1) + STFD y04, 3 * SIZE(Y1) + addi Y1, Y1, 4 * SIZE + .align 4 + +LL(27): + andi. 
r0, M, 2 + ble LL(28) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 0 * SIZE(AO2) + LFD a4, 1 * SIZE(AO2) + + LFD a5, 0 * SIZE(AO3) + LFD a6, 1 * SIZE(AO3) + LFD a7, 0 * SIZE(AO4) + LFD a8, 1 * SIZE(AO4) + + FMADD y01, alpha1, a1, y01 + addi AO1, AO1, 2 * SIZE + FMADD y02, alpha1, a2, y02 + addi AO2, AO2, 2 * SIZE + FMADD y01, alpha2, a3, y01 + addi AO3, AO3, 2 * SIZE + FMADD y02, alpha2, a4, y02 + addi AO4, AO4, 2 * SIZE + + FMADD y01, alpha3, a5, y01 + FMADD y02, alpha3, a6, y02 + FMADD y01, alpha4, a7, y01 + FMADD y02, alpha4, a8, y02 + + STFD y01, 0 * SIZE(Y1) + STFD y02, 1 * SIZE(Y1) + addi Y1, Y1, 2 * SIZE + .align 4 + +LL(28): + andi. r0, M, 1 + ble LL(30) + + LFD y01, 0 * SIZE(Y1) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 0 * SIZE(AO2) + LFD a3, 0 * SIZE(AO3) + LFD a4, 0 * SIZE(AO4) + + FMADD y01, alpha1, a1, y01 + FMADD y01, alpha2, a2, y01 + FMADD y01, alpha3, a3, y01 + FMADD y01, alpha4, a4, y01 + + STFD y01, 0 * SIZE(Y1) + .align 4 + +LL(30): + andi. J, N, 2 + lfd alpha, ALPHA + ble LL(40) + .align 4 + + LFD alpha1, 0 * SIZE(X) + add X, X, INCX + LFD alpha2, 0 * SIZE(X) + add X, X, INCX + + FMUL alpha1, alpha, alpha1 + FMUL alpha2, alpha, alpha2 + + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + mr Y1, YY + + srawi. r0, M, 4 + mtspr CTR, r0 + ble LL(35) + + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + LFD y03, 2 * SIZE(Y1) + LFD y04, 3 * SIZE(Y1) + LFD y05, 4 * SIZE(Y1) + LFD y06, 5 * SIZE(Y1) + LFD y07, 6 * SIZE(Y1) + LFD y08, 7 * SIZE(Y1) + LFD y09, 8 * SIZE(Y1) + LFD y10, 9 * SIZE(Y1) + LFD y11, 10 * SIZE(Y1) + LFD y12, 11 * SIZE(Y1) + LFD y13, 12 * SIZE(Y1) + LFD y14, 13 * SIZE(Y1) + LFD y15, 14 * SIZE(Y1) + LFD y16, 15 * SIZE(Y1) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + bdz LL(33) + .align 4 + +LL(32): + FMADD y01, alpha1, a1, y01 + LFD a1, 8 * SIZE(AO1) + FMADD y02, alpha1, a2, y02 + LFD a2, 9 * SIZE(AO1) + FMADD y03, alpha1, a3, y03 + LFD a3, 10 * SIZE(AO1) + FMADD y04, alpha1, a4, y04 + LFD a4, 11 * SIZE(AO1) + + FMADD y05, alpha1, a5, y05 + LFD a5, 12 * SIZE(AO1) + FMADD y06, alpha1, a6, y06 + LFD a6, 13 * SIZE(AO1) + FMADD y07, alpha1, a7, y07 + LFD a7, 14 * SIZE(AO1) + FMADD y08, alpha1, a8, y08 + LFD a8, 15 * SIZE(AO1) + + FMADD y09, alpha1, a1, y09 + LFD a1, 0 * SIZE(AO2) + FMADD y10, alpha1, a2, y10 + LFD a2, 1 * SIZE(AO2) + FMADD y11, alpha1, a3, y11 + LFD a3, 2 * SIZE(AO2) + FMADD y12, alpha1, a4, y12 + LFD a4, 3 * SIZE(AO2) + + FMADD y13, alpha1, a5, y13 + LFD a5, 4 * SIZE(AO2) + FMADD y14, alpha1, a6, y14 + LFD a6, 5 * SIZE(AO2) + FMADD y15, alpha1, a7, y15 + LFD a7, 6 * SIZE(AO2) + FMADD y16, alpha1, a8, y16 + LFD a8, 7 * SIZE(AO2) + + FMADD y01, alpha2, a1, y01 + LFD a1, 8 * SIZE(AO2) + FMADD y02, alpha2, a2, y02 + LFD a2, 9 * SIZE(AO2) + FMADD y03, alpha2, a3, y03 + LFD a3, 10 * SIZE(AO2) + FMADD y04, alpha2, a4, y04 + LFD a4, 11 * SIZE(AO2) + + FMADD y05, alpha2, a5, y05 + LFD a5, 12 * SIZE(AO2) + FMADD y06, alpha2, a6, y06 + LFD a6, 13 * SIZE(AO2) + FMADD y07, alpha2, a7, y07 + LFD a7, 14 * SIZE(AO2) + FMADD y08, alpha2, a8, y08 + LFD a8, 15 * SIZE(AO2) + + FMADD y09, alpha2, a1, y09 + LFD a1, 16 * SIZE(AO1) + FMADD y10, alpha2, a2, y10 + LFD a2, 17 * SIZE(AO1) + FMADD y11, alpha2, a3, y11 + LFD a3, 18 * SIZE(AO1) + FMADD y12, alpha2, a4, y12 + LFD a4, 19 * SIZE(AO1) + + FMADD y13, alpha2, a5, y13 + LFD a5, 20 * SIZE(AO1) + FMADD y14, 
alpha2, a6, y14 + LFD a6, 21 * SIZE(AO1) + FMADD y15, alpha2, a7, y15 + LFD a7, 22 * SIZE(AO1) + FMADD y16, alpha2, a8, y16 + LFD a8, 23 * SIZE(AO1) + + STFD y01, 0 * SIZE(Y1) + STFD y02, 1 * SIZE(Y1) + STFD y03, 2 * SIZE(Y1) + STFD y04, 3 * SIZE(Y1) + + LFD y01, 16 * SIZE(Y1) + LFD y02, 17 * SIZE(Y1) + LFD y03, 18 * SIZE(Y1) + LFD y04, 19 * SIZE(Y1) + + STFD y05, 4 * SIZE(Y1) + STFD y06, 5 * SIZE(Y1) + STFD y07, 6 * SIZE(Y1) + STFD y08, 7 * SIZE(Y1) + + LFD y05, 20 * SIZE(Y1) + LFD y06, 21 * SIZE(Y1) + LFD y07, 22 * SIZE(Y1) + LFD y08, 23 * SIZE(Y1) + + STFD y09, 8 * SIZE(Y1) + STFD y10, 9 * SIZE(Y1) + STFD y11, 10 * SIZE(Y1) + STFD y12, 11 * SIZE(Y1) + + LFD y09, 24 * SIZE(Y1) + LFD y10, 25 * SIZE(Y1) + LFD y11, 26 * SIZE(Y1) + LFD y12, 27 * SIZE(Y1) + + STFD y13, 12 * SIZE(Y1) + STFD y14, 13 * SIZE(Y1) + STFD y15, 14 * SIZE(Y1) + STFD y16, 15 * SIZE(Y1) + + LFD y13, 28 * SIZE(Y1) + LFD y14, 29 * SIZE(Y1) + LFD y15, 30 * SIZE(Y1) + LFD y16, 31 * SIZE(Y1) + + addi AO1, AO1, 16 * SIZE + addi AO2, AO2, 16 * SIZE + addi Y1, Y1, 16 * SIZE + + DCBT(AO1, PREA) + DCBT(AO2, PREA) + DCBT(Y1, PREC) + + bdnz LL(32) + .align 4 + +LL(33): + FMADD y01, alpha1, a1, y01 + LFD a1, 8 * SIZE(AO1) + FMADD y02, alpha1, a2, y02 + LFD a2, 9 * SIZE(AO1) + FMADD y03, alpha1, a3, y03 + LFD a3, 10 * SIZE(AO1) + FMADD y04, alpha1, a4, y04 + LFD a4, 11 * SIZE(AO1) + + FMADD y05, alpha1, a5, y05 + LFD a5, 12 * SIZE(AO1) + FMADD y06, alpha1, a6, y06 + LFD a6, 13 * SIZE(AO1) + FMADD y07, alpha1, a7, y07 + LFD a7, 14 * SIZE(AO1) + FMADD y08, alpha1, a8, y08 + LFD a8, 15 * SIZE(AO1) + + FMADD y09, alpha1, a1, y09 + LFD a1, 0 * SIZE(AO2) + FMADD y10, alpha1, a2, y10 + LFD a2, 1 * SIZE(AO2) + FMADD y11, alpha1, a3, y11 + LFD a3, 2 * SIZE(AO2) + FMADD y12, alpha1, a4, y12 + LFD a4, 3 * SIZE(AO2) + + FMADD y13, alpha1, a5, y13 + LFD a5, 4 * SIZE(AO2) + FMADD y14, alpha1, a6, y14 + LFD a6, 5 * SIZE(AO2) + FMADD y15, alpha1, a7, y15 + LFD a7, 6 * SIZE(AO2) + FMADD y16, alpha1, a8, y16 + LFD a8, 7 * SIZE(AO2) + + FMADD y01, alpha2, a1, y01 + LFD a1, 8 * SIZE(AO2) + FMADD y02, alpha2, a2, y02 + LFD a2, 9 * SIZE(AO2) + FMADD y03, alpha2, a3, y03 + LFD a3, 10 * SIZE(AO2) + FMADD y04, alpha2, a4, y04 + LFD a4, 11 * SIZE(AO2) + + FMADD y05, alpha2, a5, y05 + LFD a5, 12 * SIZE(AO2) + FMADD y06, alpha2, a6, y06 + LFD a6, 13 * SIZE(AO2) + FMADD y07, alpha2, a7, y07 + LFD a7, 14 * SIZE(AO2) + FMADD y08, alpha2, a8, y08 + LFD a8, 15 * SIZE(AO2) + + FMADD y09, alpha2, a1, y09 + FMADD y10, alpha2, a2, y10 + FMADD y11, alpha2, a3, y11 + FMADD y12, alpha2, a4, y12 + FMADD y13, alpha2, a5, y13 + FMADD y14, alpha2, a6, y14 + FMADD y15, alpha2, a7, y15 + FMADD y16, alpha2, a8, y16 + + STFD y01, 0 * SIZE(Y1) + STFD y02, 1 * SIZE(Y1) + STFD y03, 2 * SIZE(Y1) + STFD y04, 3 * SIZE(Y1) + STFD y05, 4 * SIZE(Y1) + STFD y06, 5 * SIZE(Y1) + STFD y07, 6 * SIZE(Y1) + STFD y08, 7 * SIZE(Y1) + STFD y09, 8 * SIZE(Y1) + STFD y10, 9 * SIZE(Y1) + STFD y11, 10 * SIZE(Y1) + STFD y12, 11 * SIZE(Y1) + STFD y13, 12 * SIZE(Y1) + STFD y14, 13 * SIZE(Y1) + STFD y15, 14 * SIZE(Y1) + STFD y16, 15 * SIZE(Y1) + + addi AO1, AO1, 16 * SIZE + addi AO2, AO2, 16 * SIZE + addi Y1, Y1, 16 * SIZE + .align 4 + +LL(35): + andi. r0, M, 15 + ble LL(40) + + andi. 
r0, M, 8 + ble LL(36) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + LFD y03, 2 * SIZE(Y1) + LFD y04, 3 * SIZE(Y1) + LFD y05, 4 * SIZE(Y1) + LFD y06, 5 * SIZE(Y1) + LFD y07, 6 * SIZE(Y1) + LFD y08, 7 * SIZE(Y1) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + FMADD y01, alpha1, a1, y01 + LFD a1, 0 * SIZE(AO2) + FMADD y02, alpha1, a2, y02 + LFD a2, 1 * SIZE(AO2) + FMADD y03, alpha1, a3, y03 + LFD a3, 2 * SIZE(AO2) + FMADD y04, alpha1, a4, y04 + LFD a4, 3 * SIZE(AO2) + FMADD y05, alpha1, a5, y05 + LFD a5, 4 * SIZE(AO2) + FMADD y06, alpha1, a6, y06 + LFD a6, 5 * SIZE(AO2) + FMADD y07, alpha1, a7, y07 + LFD a7, 6 * SIZE(AO2) + FMADD y08, alpha1, a8, y08 + LFD a8, 7 * SIZE(AO2) + + FMADD y01, alpha2, a1, y01 + FMADD y02, alpha2, a2, y02 + FMADD y03, alpha2, a3, y03 + FMADD y04, alpha2, a4, y04 + FMADD y05, alpha2, a5, y05 + FMADD y06, alpha2, a6, y06 + FMADD y07, alpha2, a7, y07 + FMADD y08, alpha2, a8, y08 + + STFD y01, 0 * SIZE(Y1) + STFD y02, 1 * SIZE(Y1) + STFD y03, 2 * SIZE(Y1) + STFD y04, 3 * SIZE(Y1) + STFD y05, 4 * SIZE(Y1) + STFD y06, 5 * SIZE(Y1) + STFD y07, 6 * SIZE(Y1) + STFD y08, 7 * SIZE(Y1) + + addi AO1, AO1, 8 * SIZE + addi AO2, AO2, 8 * SIZE + addi Y1, Y1, 8 * SIZE + .align 4 + +LL(36): + andi. r0, M, 4 + ble LL(37) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + LFD y03, 2 * SIZE(Y1) + LFD y04, 3 * SIZE(Y1) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + + LFD a5, 0 * SIZE(AO2) + LFD a6, 1 * SIZE(AO2) + LFD a7, 2 * SIZE(AO2) + LFD a8, 3 * SIZE(AO2) + + FMADD y01, alpha1, a1, y01 + FMADD y02, alpha1, a2, y02 + FMADD y03, alpha1, a3, y03 + FMADD y04, alpha1, a4, y04 + + FMADD y01, alpha2, a5, y01 + FMADD y02, alpha2, a6, y02 + FMADD y03, alpha2, a7, y03 + FMADD y04, alpha2, a8, y04 + + STFD y01, 0 * SIZE(Y1) + STFD y02, 1 * SIZE(Y1) + STFD y03, 2 * SIZE(Y1) + STFD y04, 3 * SIZE(Y1) + + addi AO1, AO1, 4 * SIZE + addi AO2, AO2, 4 * SIZE + addi Y1, Y1, 4 * SIZE + .align 4 + +LL(37): + andi. r0, M, 2 + ble LL(38) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 0 * SIZE(AO2) + LFD a4, 1 * SIZE(AO2) + + FMADD y01, alpha1, a1, y01 + FMADD y02, alpha1, a2, y02 + FMADD y01, alpha2, a3, y01 + FMADD y02, alpha2, a4, y02 + + STFD y01, 0 * SIZE(Y1) + STFD y02, 1 * SIZE(Y1) + + addi AO1, AO1, 2 * SIZE + addi AO2, AO2, 2 * SIZE + addi Y1, Y1, 2 * SIZE + .align 4 + +LL(38): + andi. r0, M, 1 + ble LL(40) + + LFD y01, 0 * SIZE(Y1) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 0 * SIZE(AO2) + + FMADD y01, alpha1, a1, y01 + FMADD y01, alpha2, a2, y01 + + STFD y01, 0 * SIZE(Y1) + .align 4 + +LL(40): + andi. J, N, 1 + lfd alpha, ALPHA + ble LL(990) + .align 4 + + LFD alpha1, 0 * SIZE(X) + FMUL alpha1, alpha, alpha1 + + mr AO1, A + mr Y1, YY + + srawi. 
r0, M, 4 + mtspr CTR, r0 + ble LL(45) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + LFD y03, 2 * SIZE(Y1) + LFD y04, 3 * SIZE(Y1) + LFD y05, 4 * SIZE(Y1) + LFD y06, 5 * SIZE(Y1) + LFD y07, 6 * SIZE(Y1) + LFD y08, 7 * SIZE(Y1) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + LFD y09, 8 * SIZE(Y1) + LFD y10, 9 * SIZE(Y1) + LFD y11, 10 * SIZE(Y1) + LFD y12, 11 * SIZE(Y1) + LFD y13, 12 * SIZE(Y1) + LFD y14, 13 * SIZE(Y1) + LFD y15, 14 * SIZE(Y1) + LFD y16, 15 * SIZE(Y1) + + bdz LL(43) + .align 4 + +LL(42): + FMADD y01, alpha1, a1, y01 + LFD a1, 8 * SIZE(AO1) + FMADD y02, alpha1, a2, y02 + LFD a2, 9 * SIZE(AO1) + FMADD y03, alpha1, a3, y03 + LFD a3, 10 * SIZE(AO1) + FMADD y04, alpha1, a4, y04 + LFD a4, 11 * SIZE(AO1) + + FMADD y05, alpha1, a5, y05 + LFD a5, 12 * SIZE(AO1) + FMADD y06, alpha1, a6, y06 + LFD a6, 13 * SIZE(AO1) + FMADD y07, alpha1, a7, y07 + LFD a7, 14 * SIZE(AO1) + FMADD y08, alpha1, a8, y08 + LFD a8, 15 * SIZE(AO1) + + FMADD y09, alpha1, a1, y09 + LFD a1, 16 * SIZE(AO1) + FMADD y10, alpha1, a2, y10 + LFD a2, 17 * SIZE(AO1) + FMADD y11, alpha1, a3, y11 + LFD a3, 18 * SIZE(AO1) + FMADD y12, alpha1, a4, y12 + LFD a4, 19 * SIZE(AO1) + + FMADD y13, alpha1, a5, y13 + LFD a5, 20 * SIZE(AO1) + FMADD y14, alpha1, a6, y14 + LFD a6, 21 * SIZE(AO1) + FMADD y15, alpha1, a7, y15 + LFD a7, 22 * SIZE(AO1) + FMADD y16, alpha1, a8, y16 + LFD a8, 23 * SIZE(AO1) + + STFD y01, 0 * SIZE(Y1) + LFD y01, 16 * SIZE(Y1) + STFD y02, 1 * SIZE(Y1) + LFD y02, 17 * SIZE(Y1) + + STFD y03, 2 * SIZE(Y1) + LFD y03, 18 * SIZE(Y1) + STFD y04, 3 * SIZE(Y1) + LFD y04, 19 * SIZE(Y1) + + STFD y05, 4 * SIZE(Y1) + LFD y05, 20 * SIZE(Y1) + STFD y06, 5 * SIZE(Y1) + LFD y06, 21 * SIZE(Y1) + + STFD y07, 6 * SIZE(Y1) + LFD y07, 22 * SIZE(Y1) + STFD y08, 7 * SIZE(Y1) + LFD y08, 23 * SIZE(Y1) + + STFD y09, 8 * SIZE(Y1) + LFD y09, 24 * SIZE(Y1) + STFD y10, 9 * SIZE(Y1) + LFD y10, 25 * SIZE(Y1) + + STFD y11, 10 * SIZE(Y1) + LFD y11, 26 * SIZE(Y1) + STFD y12, 11 * SIZE(Y1) + LFD y12, 27 * SIZE(Y1) + + STFD y13, 12 * SIZE(Y1) + LFD y13, 28 * SIZE(Y1) + STFD y14, 13 * SIZE(Y1) + LFD y14, 29 * SIZE(Y1) + + STFD y15, 14 * SIZE(Y1) + LFD y15, 30 * SIZE(Y1) + STFD y16, 15 * SIZE(Y1) + LFD y16, 31 * SIZE(Y1) + + addi AO1, AO1, 16 * SIZE + addi Y1, Y1, 16 * SIZE + + DCBT(AO1, PREA) + DCBT(Y1, PREC) + + bdnz LL(42) + .align 4 + +LL(43): + FMADD y01, alpha1, a1, y01 + LFD a1, 8 * SIZE(AO1) + FMADD y02, alpha1, a2, y02 + LFD a2, 9 * SIZE(AO1) + FMADD y03, alpha1, a3, y03 + LFD a3, 10 * SIZE(AO1) + FMADD y04, alpha1, a4, y04 + LFD a4, 11 * SIZE(AO1) + + FMADD y05, alpha1, a5, y05 + LFD a5, 12 * SIZE(AO1) + FMADD y06, alpha1, a6, y06 + LFD a6, 13 * SIZE(AO1) + FMADD y07, alpha1, a7, y07 + LFD a7, 14 * SIZE(AO1) + FMADD y08, alpha1, a8, y08 + LFD a8, 15 * SIZE(AO1) + + FMADD y09, alpha1, a1, y09 + FMADD y10, alpha1, a2, y10 + FMADD y11, alpha1, a3, y11 + FMADD y12, alpha1, a4, y12 + FMADD y13, alpha1, a5, y13 + FMADD y14, alpha1, a6, y14 + FMADD y15, alpha1, a7, y15 + FMADD y16, alpha1, a8, y16 + + STFD y01, 0 * SIZE(Y1) + STFD y02, 1 * SIZE(Y1) + STFD y03, 2 * SIZE(Y1) + STFD y04, 3 * SIZE(Y1) + STFD y05, 4 * SIZE(Y1) + STFD y06, 5 * SIZE(Y1) + STFD y07, 6 * SIZE(Y1) + STFD y08, 7 * SIZE(Y1) + + STFD y09, 8 * SIZE(Y1) + STFD y10, 9 * SIZE(Y1) + STFD y11, 10 * SIZE(Y1) + STFD y12, 11 * SIZE(Y1) + STFD y13, 12 * SIZE(Y1) + STFD y14, 13 * SIZE(Y1) + STFD y15, 14 * SIZE(Y1) + STFD y16, 15 * 
SIZE(Y1) + + addi AO1, AO1, 16 * SIZE + addi Y1, Y1, 16 * SIZE + .align 4 + +LL(45): + andi. r0, M, 15 + ble LL(990) + + andi. r0, M, 8 + ble LL(46) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + LFD y03, 2 * SIZE(Y1) + LFD y04, 3 * SIZE(Y1) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + + LFD y05, 4 * SIZE(Y1) + LFD y06, 5 * SIZE(Y1) + LFD y07, 6 * SIZE(Y1) + LFD y08, 7 * SIZE(Y1) + + LFD a5, 4 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + FMADD y01, alpha1, a1, y01 + FMADD y02, alpha1, a2, y02 + FMADD y03, alpha1, a3, y03 + FMADD y04, alpha1, a4, y04 + + FMADD y05, alpha1, a5, y05 + FMADD y06, alpha1, a6, y06 + FMADD y07, alpha1, a7, y07 + FMADD y08, alpha1, a8, y08 + + STFD y01, 0 * SIZE(Y1) + STFD y02, 1 * SIZE(Y1) + STFD y03, 2 * SIZE(Y1) + STFD y04, 3 * SIZE(Y1) + + STFD y05, 4 * SIZE(Y1) + STFD y06, 5 * SIZE(Y1) + STFD y07, 6 * SIZE(Y1) + STFD y08, 7 * SIZE(Y1) + + addi AO1, AO1, 8 * SIZE + addi Y1, Y1, 8 * SIZE + .align 4 + +LL(46): + andi. r0, M, 4 + ble LL(47) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + LFD y03, 2 * SIZE(Y1) + LFD y04, 3 * SIZE(Y1) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + + FMADD y01, alpha1, a1, y01 + FMADD y02, alpha1, a2, y02 + FMADD y03, alpha1, a3, y03 + FMADD y04, alpha1, a4, y04 + + STFD y01, 0 * SIZE(Y1) + STFD y02, 1 * SIZE(Y1) + STFD y03, 2 * SIZE(Y1) + STFD y04, 3 * SIZE(Y1) + + addi AO1, AO1, 4 * SIZE + addi Y1, Y1, 4 * SIZE + .align 4 + +LL(47): + andi. r0, M, 2 + ble LL(48) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + + FMADD y01, alpha1, a1, y01 + FMADD y02, alpha1, a2, y02 + + STFD y01, 0 * SIZE(Y1) + STFD y02, 1 * SIZE(Y1) + + addi AO1, AO1, 2 * SIZE + addi Y1, Y1, 2 * SIZE + .align 4 + +LL(48): + andi. r0, M, 1 + ble LL(990) + + LFD y01, 0 * SIZE(Y1) + LFD a1, 0 * SIZE(AO1) + + FMADD y01, alpha1, a1, y01 + STFD y01, 0 * SIZE(Y1) + .align 4 + +LL(990): + cmpi cr0, 0, INCY, SIZE + beq LL(999) + + mr YY, BUFFER + mr Y1, Y + + srawi. r0, M, 3 + mtspr CTR, r0 + ble LL(995) + .align 4 + +LL(991): + LFD f0, 0 * SIZE(Y) + add Y, Y, INCY + LFD f1, 0 * SIZE(Y) + add Y, Y, INCY + LFD f2, 0 * SIZE(Y) + add Y, Y, INCY + LFD f3, 0 * SIZE(Y) + add Y, Y, INCY + LFD f4, 0 * SIZE(Y) + add Y, Y, INCY + LFD f5, 0 * SIZE(Y) + add Y, Y, INCY + LFD f6, 0 * SIZE(Y) + add Y, Y, INCY + LFD f7, 0 * SIZE(Y) + add Y, Y, INCY + + LFD f8, 0 * SIZE(YY) + LFD f9, 1 * SIZE(YY) + LFD f10, 2 * SIZE(YY) + LFD f11, 3 * SIZE(YY) + LFD f12, 4 * SIZE(YY) + LFD f13, 5 * SIZE(YY) + LFD f14, 6 * SIZE(YY) + LFD f15, 7 * SIZE(YY) + addi YY, YY, 8 * SIZE + + FADD f8, f8, f0 + FADD f9, f9, f1 + FADD f10, f10, f2 + FADD f11, f11, f3 + FADD f12, f12, f4 + FADD f13, f13, f5 + FADD f14, f14, f6 + FADD f15, f15, f7 + + STFD f8, 0 * SIZE(Y1) + add Y1, Y1, INCY + STFD f9, 0 * SIZE(Y1) + add Y1, Y1, INCY + STFD f10, 0 * SIZE(Y1) + add Y1, Y1, INCY + STFD f11, 0 * SIZE(Y1) + add Y1, Y1, INCY + STFD f12, 0 * SIZE(Y1) + add Y1, Y1, INCY + STFD f13, 0 * SIZE(Y1) + add Y1, Y1, INCY + STFD f14, 0 * SIZE(Y1) + add Y1, Y1, INCY + STFD f15, 0 * SIZE(Y1) + add Y1, Y1, INCY + bdnz LL(991) + .align 4 + +LL(995): + andi. 
J, M, 4 + ble LL(996) + + LFD f0, 0 * SIZE(Y) + add Y, Y, INCY + LFD f1, 0 * SIZE(Y) + add Y, Y, INCY + LFD f2, 0 * SIZE(Y) + add Y, Y, INCY + LFD f3, 0 * SIZE(Y) + add Y, Y, INCY + + LFD f8, 0 * SIZE(YY) + LFD f9, 1 * SIZE(YY) + LFD f10, 2 * SIZE(YY) + LFD f11, 3 * SIZE(YY) + addi YY, YY, 4 * SIZE + + FADD f8, f8, f0 + FADD f9, f9, f1 + FADD f10, f10, f2 + FADD f11, f11, f3 + + STFD f8, 0 * SIZE(Y1) + add Y1, Y1, INCY + STFD f9, 0 * SIZE(Y1) + add Y1, Y1, INCY + STFD f10, 0 * SIZE(Y1) + add Y1, Y1, INCY + STFD f11, 0 * SIZE(Y1) + add Y1, Y1, INCY + .align 4 + +LL(996): + andi. J, M, 2 + ble LL(997) + + LFD f0, 0 * SIZE(Y) + add Y, Y, INCY + LFD f1, 0 * SIZE(Y) + add Y, Y, INCY + + LFD f8, 0 * SIZE(YY) + LFD f9, 1 * SIZE(YY) + addi YY, YY, 2 * SIZE + + FADD f8, f8, f0 + FADD f9, f9, f1 + + STFD f8, 0 * SIZE(Y1) + add Y1, Y1, INCY + STFD f9, 0 * SIZE(Y1) + add Y1, Y1, INCY + .align 4 + +LL(997): + andi. J, M, 1 + ble LL(999) + + LFD f0, 0 * SIZE(Y) + LFD f8, 0 * SIZE(YY) + + FADD f8, f8, f0 + + STFD f8, 0 * SIZE(Y1) + .align 4 + +LL(999): + li r3, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r14, 144(SP) + ld r15, 152(SP) + ld r16, 160(SP) + ld r17, 168(SP) + ld r18, 176(SP) + ld r19, 184(SP) + ld r20, 192(SP) + ld r21, 200(SP) + ld r22, 208(SP) + ld r23, 216(SP) + ld r24, 224(SP) + ld r25, 232(SP) + ld r26, 240(SP) + ld r27, 248(SP) +#else + lwz r14, 144(SP) + lwz r15, 148(SP) + lwz r16, 152(SP) + lwz r17, 156(SP) + lwz r18, 160(SP) + lwz r19, 164(SP) + lwz r20, 168(SP) + lwz r21, 172(SP) + lwz r22, 176(SP) + lwz r23, 180(SP) + lwz r24, 184(SP) + lwz r25, 188(SP) + lwz r26, 192(SP) + lwz r27, 196(SP) +#endif + + addi SP, SP, STACKSIZE + blr + + EPILOGUE +#endif diff --git a/kernel/power/gemv_n_ppc440.S b/kernel/power/gemv_n_ppc440.S new file mode 100644 index 0000000000..baedebc2b9 --- /dev/null +++ b/kernel/power/gemv_n_ppc440.S @@ -0,0 +1,1185 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef linux +#ifndef __64BIT__ +#define M r3 +#define N r4 +#define A r6 +#define LDA r7 +#define X r8 +#define INCX r9 +#define Y r10 +#define INCY r5 +#else +#define M r3 +#define N r4 +#define A r7 +#define LDA r8 +#define X r9 +#define INCX r10 +#define Y r5 +#define INCY r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define M r3 +#define N r4 +#define A r8 +#define LDA r9 +#define X r10 +#define INCX r5 +#define Y r6 +#define INCY r7 +#else +#define M r3 +#define N r4 +#define A r7 +#define LDA r8 +#define X r9 +#define INCX r10 +#define Y r5 +#define INCY r6 +#endif +#endif + +#define I r11 +#define J r12 + +#define AO1 r14 +#define AO2 r15 +#define AO3 r16 +#define AO4 r17 +#define LDA8 r18 + +#define Y1 r19 +#define Y2 r20 +#define PREA r21 +#define YY r22 +#define BUFFER r23 + +#define y01 f0 +#define y02 f1 +#define y03 f2 +#define y04 f3 +#define y05 f4 +#define y06 f5 +#define y07 f6 +#define y08 f7 +#define y09 f8 +#define y10 f9 +#define y11 f10 +#define y12 f11 +#define y13 f12 +#define y14 f13 +#define y15 f14 +#define y16 f15 + +#define alpha1 f16 +#define alpha2 f17 +#define alpha3 f18 +#define alpha4 f19 + +#define a1 f20 +#define a2 f21 +#define a3 f22 +#define a4 f23 +#define a5 f24 +#define a6 f25 +#define a7 f26 +#define a8 f27 + +#define alpha f27 + +#if defined(PPCG4) +#define PREFETCHSIZE_A (3 * 4) +#endif + +#if defined(POWER6) +#define PREFETCHSIZE_A (3 * 4) +#endif + +#ifndef NEEDPARAM + +#ifndef __64BIT__ +#define STACKSIZE 224 +#define ALPHA 200(SP) +#define FZERO 208(SP) +#else +#define STACKSIZE 280 +#define ALPHA 256(SP) +#define FZERO 264(SP) +#endif + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + stfd f26, 96(SP) + stfd f27, 104(SP) + +#ifdef __64BIT__ + std r0, FZERO + std r14, 144(SP) + std r15, 152(SP) + std r16, 160(SP) + std r17, 168(SP) + std r18, 176(SP) + std r19, 184(SP) + std r20, 192(SP) + std r21, 200(SP) + std r22, 208(SP) + std r23, 216(SP) +#else + stw r0, 0 + FZERO + stw r0, 4 + FZERO + stw r14, 144(SP) + stw r15, 148(SP) + stw r16, 152(SP) + stw r17, 156(SP) + stw r18, 160(SP) + stw r19, 164(SP) + stw r20, 168(SP) + stw r21, 172(SP) + stw r22, 176(SP) + stw r23, 180(SP) +#endif + +#ifdef linux +#ifndef __64BIT__ + lwz INCY, 8 + STACKSIZE(SP) + lwz BUFFER, 12 + STACKSIZE(SP) +#else 
+ ld Y, 112 + STACKSIZE(SP) + ld INCY, 120 + STACKSIZE(SP) + ld BUFFER, 128 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifndef __64BIT__ +#ifdef DOUBLE + lwz INCX, 56 + STACKSIZE(SP) + lwz Y, 60 + STACKSIZE(SP) + lwz INCY, 64 + STACKSIZE(SP) + lwz BUFFER, 68 + STACKSIZE(SP) +#else + lwz Y, 56 + STACKSIZE(SP) + lwz INCY, 60 + STACKSIZE(SP) + lwz BUFFER, 64 + STACKSIZE(SP) +#endif +#else + ld Y, 112 + STACKSIZE(SP) + ld INCY, 120 + STACKSIZE(SP) + ld BUFFER, 128 + STACKSIZE(SP) +#endif +#endif + + stfd f1, ALPHA + fmr alpha, f1 + + slwi LDA, LDA, BASE_SHIFT + slwi INCX, INCX, BASE_SHIFT + slwi INCY, INCY, BASE_SHIFT + + li PREA, PREFETCHSIZE_A * SIZE + + cmpwi cr0, M, 0 + ble- LL(999) + + cmpwi cr0, N, 0 + ble- LL(999) + + addi A, A, -SIZE + sub X, X, INCX + sub Y, Y, INCY + + mr YY, Y + lfd f0, FZERO + + cmpi cr0, 0, INCY, SIZE + beq LL(10) + + addi YY, BUFFER, -SIZE + addi Y1, BUFFER, -SIZE + + addi r0, M, 7 + srawi. r0, r0, 3 + mtspr CTR, r0 + .align 4 + +LL(02): + STFDU f0, 1 * SIZE(Y1) + STFDU f0, 1 * SIZE(Y1) + STFDU f0, 1 * SIZE(Y1) + STFDU f0, 1 * SIZE(Y1) + STFDU f0, 1 * SIZE(Y1) + STFDU f0, 1 * SIZE(Y1) + STFDU f0, 1 * SIZE(Y1) + STFDU f0, 1 * SIZE(Y1) + bdnz LL(02) + .align 4 + +LL(10): + srawi. J, N, 2 + ble LL(30) + .align 4 + +LL(21): + mr AO1, A + add AO2, A, LDA + + LFDUX alpha1, X, INCX + LFDUX alpha2, X, INCX + LFDUX alpha3, X, INCX + LFDUX alpha4, X, INCX + + FMUL alpha1, alpha, alpha1 + add AO3, AO2, LDA + FMUL alpha2, alpha, alpha2 + add AO4, AO3, LDA + FMUL alpha3, alpha, alpha3 + add A, AO4, LDA + FMUL alpha4, alpha, alpha4 + mr Y1, YY + mr Y2, YY + + srawi. r0, M, 3 + mtspr CTR, r0 + ble LL(25) + + LFDU y01, 1 * SIZE(Y1) + LFDU a1, 1 * SIZE(AO1) + LFDU y02, 1 * SIZE(Y1) + LFDU a2, 1 * SIZE(AO1) + LFDU y03, 1 * SIZE(Y1) + LFDU a3, 1 * SIZE(AO1) + LFDU y04, 1 * SIZE(Y1) + LFDU a4, 1 * SIZE(AO1) + LFDU y05, 1 * SIZE(Y1) + LFDU a5, 1 * SIZE(AO1) + LFDU y06, 1 * SIZE(Y1) + LFDU a6, 1 * SIZE(AO1) + LFDU y07, 1 * SIZE(Y1) + LFDU a7, 1 * SIZE(AO1) + LFDU y08, 1 * SIZE(Y1) + LFDU a8, 1 * SIZE(AO1) + bdz LL(23) + .align 4 + +LL(22): +#ifdef PPCG4 + dcbtst Y1, PREA +#endif + + FMADD y09, alpha1, a1, y01 + LFDU a1, 1 * SIZE(AO2) + FMADD y10, alpha1, a2, y02 + LFDU a2, 1 * SIZE(AO2) + FMADD y11, alpha1, a3, y03 + LFDU a3, 1 * SIZE(AO2) + FMADD y12, alpha1, a4, y04 + LFDU a4, 1 * SIZE(AO2) + + LFDU y01, 1 * SIZE(Y1) +#ifdef PPCG4 + dcbt AO2, PREA +#endif + + FMADD y13, alpha1, a5, y05 + LFDU a5, 1 * SIZE(AO2) + FMADD y14, alpha1, a6, y06 + LFDU a6, 1 * SIZE(AO2) + FMADD y15, alpha1, a7, y07 + LFDU a7, 1 * SIZE(AO2) + FMADD y16, alpha1, a8, y08 + LFDU a8, 1 * SIZE(AO2) + + LFDU y02, 1 * SIZE(Y1) +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO2, PREA +#endif + + FMADD y09, alpha2, a1, y09 + LFDU a1, 1 * SIZE(AO3) + FMADD y10, alpha2, a2, y10 + LFDU a2, 1 * SIZE(AO3) + FMADD y11, alpha2, a3, y11 + LFDU a3, 1 * SIZE(AO3) + FMADD y12, alpha2, a4, y12 + LFDU a4, 1 * SIZE(AO3) + + LFDU y03, 1 * SIZE(Y1) +#ifdef PPCG4 + dcbt AO3, PREA +#endif + + FMADD y13, alpha2, a5, y13 + LFDU a5, 1 * SIZE(AO3) + FMADD y14, alpha2, a6, y14 + LFDU a6, 1 * SIZE(AO3) + FMADD y15, alpha2, a7, y15 + LFDU a7, 1 * SIZE(AO3) + FMADD y16, alpha2, a8, y16 + LFDU a8, 1 * SIZE(AO3) + + LFDU y04, 1 * SIZE(Y1) +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO3, PREA +#endif + + FMADD y09, alpha3, a1, y09 + LFDU a1, 1 * SIZE(AO4) + FMADD y10, alpha3, a2, y10 + LFDU a2, 1 * SIZE(AO4) + FMADD y11, alpha3, a3, y11 + LFDU a3, 1 * SIZE(AO4) + FMADD y12, alpha3, a4, y12 + LFDU a4, 1 * 
SIZE(AO4) + +#if defined(PPCG4) && defined(DOUBLE) + dcbtst Y1, PREA +#endif + LFDU y05, 1 * SIZE(Y1) +#ifdef PPCG4 + dcbt AO4, PREA +#endif + + FMADD y13, alpha3, a5, y13 + LFDU a5, 1 * SIZE(AO4) + FMADD y14, alpha3, a6, y14 + LFDU a6, 1 * SIZE(AO4) + FMADD y15, alpha3, a7, y15 + LFDU a7, 1 * SIZE(AO4) + FMADD y16, alpha3, a8, y16 + LFDU a8, 1 * SIZE(AO4) + + LFDU y06, 1 * SIZE(Y1) +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO4, PREA +#endif + + FMADD y09, alpha4, a1, y09 + LFDU a1, 1 * SIZE(AO1) + FMADD y10, alpha4, a2, y10 + LFDU a2, 1 * SIZE(AO1) + FMADD y11, alpha4, a3, y11 + LFDU a3, 1 * SIZE(AO1) + FMADD y12, alpha4, a4, y12 + LFDU a4, 1 * SIZE(AO1) + + LFDU y07, 1 * SIZE(Y1) +#ifdef PPCG4 + dcbt AO1, PREA +#endif + + STFDU y09, 1 * SIZE(Y2) + STFDU y10, 1 * SIZE(Y2) + STFDU y11, 1 * SIZE(Y2) + STFDU y12, 1 * SIZE(Y2) + + FMADD y13, alpha4, a5, y13 + LFDU a5, 1 * SIZE(AO1) + FMADD y14, alpha4, a6, y14 + LFDU a6, 1 * SIZE(AO1) + FMADD y15, alpha4, a7, y15 + LFDU a7, 1 * SIZE(AO1) + FMADD y16, alpha4, a8, y16 + LFDU a8, 1 * SIZE(AO1) + + LFDU y08, 1 * SIZE(Y1) +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO1, PREA +#endif + + STFDU y13, 1 * SIZE(Y2) + STFDU y14, 1 * SIZE(Y2) + STFDU y15, 1 * SIZE(Y2) + STFDU y16, 1 * SIZE(Y2) + bdnz LL(22) + .align 4 + +LL(23): + FMADD y01, alpha1, a1, y01 + LFDU a1, 1 * SIZE(AO2) + FMADD y02, alpha1, a2, y02 + LFDU a2, 1 * SIZE(AO2) + FMADD y03, alpha1, a3, y03 + LFDU a3, 1 * SIZE(AO2) + FMADD y04, alpha1, a4, y04 + LFDU a4, 1 * SIZE(AO2) + + FMADD y05, alpha1, a5, y05 + LFDU a5, 1 * SIZE(AO2) + FMADD y06, alpha1, a6, y06 + LFDU a6, 1 * SIZE(AO2) + FMADD y07, alpha1, a7, y07 + LFDU a7, 1 * SIZE(AO2) + FMADD y08, alpha1, a8, y08 + LFDU a8, 1 * SIZE(AO2) + + FMADD y01, alpha2, a1, y01 + LFDU a1, 1 * SIZE(AO3) + FMADD y02, alpha2, a2, y02 + LFDU a2, 1 * SIZE(AO3) + FMADD y03, alpha2, a3, y03 + LFDU a3, 1 * SIZE(AO3) + FMADD y04, alpha2, a4, y04 + LFDU a4, 1 * SIZE(AO3) + + FMADD y05, alpha2, a5, y05 + LFDU a5, 1 * SIZE(AO3) + FMADD y06, alpha2, a6, y06 + LFDU a6, 1 * SIZE(AO3) + FMADD y07, alpha2, a7, y07 + LFDU a7, 1 * SIZE(AO3) + FMADD y08, alpha2, a8, y08 + LFDU a8, 1 * SIZE(AO3) + + FMADD y01, alpha3, a1, y01 + LFDU a1, 1 * SIZE(AO4) + FMADD y02, alpha3, a2, y02 + LFDU a2, 1 * SIZE(AO4) + FMADD y03, alpha3, a3, y03 + LFDU a3, 1 * SIZE(AO4) + FMADD y04, alpha3, a4, y04 + LFDU a4, 1 * SIZE(AO4) + + FMADD y05, alpha3, a5, y05 + LFDU a5, 1 * SIZE(AO4) + FMADD y06, alpha3, a6, y06 + LFDU a6, 1 * SIZE(AO4) + FMADD y07, alpha3, a7, y07 + LFDU a7, 1 * SIZE(AO4) + FMADD y08, alpha3, a8, y08 + LFDU a8, 1 * SIZE(AO4) + + FMADD y01, alpha4, a1, y01 + FMADD y02, alpha4, a2, y02 + FMADD y03, alpha4, a3, y03 + FMADD y04, alpha4, a4, y04 + + FMADD y05, alpha4, a5, y05 + STFDU y01, 1 * SIZE(Y2) + FMADD y06, alpha4, a6, y06 + STFDU y02, 1 * SIZE(Y2) + FMADD y07, alpha4, a7, y07 + STFDU y03, 1 * SIZE(Y2) + FMADD y08, alpha4, a8, y08 + STFDU y04, 1 * SIZE(Y2) + + STFDU y05, 1 * SIZE(Y2) + STFDU y06, 1 * SIZE(Y2) + STFDU y07, 1 * SIZE(Y2) + STFDU y08, 1 * SIZE(Y2) + .align 4 + +LL(25): + andi. r0, M, 7 + ble LL(29) + + andi. 
r0, M, 4 + ble LL(27) + + LFDU a1, 1 * SIZE(AO1) + LFDU y01, 1 * SIZE(Y1) + LFDU a2, 1 * SIZE(AO1) + LFDU y02, 1 * SIZE(Y1) + LFDU a3, 1 * SIZE(AO1) + LFDU y03, 1 * SIZE(Y1) + LFDU a4, 1 * SIZE(AO1) + LFDU y04, 1 * SIZE(Y1) + + FMADD y01, alpha1, a1, y01 + LFDU a5, 1 * SIZE(AO2) + FMADD y02, alpha1, a2, y02 + LFDU a6, 1 * SIZE(AO2) + FMADD y03, alpha1, a3, y03 + LFDU a7, 1 * SIZE(AO2) + FMADD y04, alpha1, a4, y04 + LFDU a8, 1 * SIZE(AO2) + + FMADD y01, alpha2, a5, y01 + LFDU a1, 1 * SIZE(AO3) + FMADD y02, alpha2, a6, y02 + LFDU a2, 1 * SIZE(AO3) + FMADD y03, alpha2, a7, y03 + LFDU a3, 1 * SIZE(AO3) + FMADD y04, alpha2, a8, y04 + LFDU a4, 1 * SIZE(AO3) + + FMADD y01, alpha3, a1, y01 + LFDU a5, 1 * SIZE(AO4) + FMADD y02, alpha3, a2, y02 + LFDU a6, 1 * SIZE(AO4) + FMADD y03, alpha3, a3, y03 + LFDU a7, 1 * SIZE(AO4) + FMADD y04, alpha3, a4, y04 + LFDU a8, 1 * SIZE(AO4) + + FMADD y01, alpha4, a5, y01 + FMADD y02, alpha4, a6, y02 + FMADD y03, alpha4, a7, y03 + FMADD y04, alpha4, a8, y04 + + STFDU y01, 1 * SIZE(Y2) + STFDU y02, 1 * SIZE(Y2) + STFDU y03, 1 * SIZE(Y2) + STFDU y04, 1 * SIZE(Y2) + .align 4 + +LL(27): + andi. r0, M, 2 + ble LL(28) + + LFDU a1, 1 * SIZE(AO1) + LFDU y01, 1 * SIZE(Y1) + LFDU a2, 1 * SIZE(AO1) + LFDU y02, 1 * SIZE(Y1) + LFDU a3, 1 * SIZE(AO2) + LFDU a4, 1 * SIZE(AO2) + + FMADD y01, alpha1, a1, y01 + LFDU a5, 1 * SIZE(AO3) + FMADD y02, alpha1, a2, y02 + LFDU a6, 1 * SIZE(AO3) + FMADD y01, alpha2, a3, y01 + LFDU a7, 1 * SIZE(AO4) + FMADD y02, alpha2, a4, y02 + LFDU a8, 1 * SIZE(AO4) + + FMADD y01, alpha3, a5, y01 + FMADD y02, alpha3, a6, y02 + FMADD y01, alpha4, a7, y01 + FMADD y02, alpha4, a8, y02 + + STFDU y01, 1 * SIZE(Y2) + STFDU y02, 1 * SIZE(Y2) + .align 4 + +LL(28): + andi. r0, M, 1 + ble LL(29) + + LFDU a1, 1 * SIZE(AO1) + LFDU y01, 1 * SIZE(Y1) + LFDU a2, 1 * SIZE(AO2) + LFDU a3, 1 * SIZE(AO3) + LFDU a4, 1 * SIZE(AO4) + + FMADD y01, alpha1, a1, y01 + FMADD y01, alpha2, a2, y01 + FMADD y01, alpha3, a3, y01 + FMADD y01, alpha4, a4, y01 + + STFDU y01, 1 * SIZE(Y2) + .align 4 + +LL(29): + addi J, J, -1 + lfd alpha, ALPHA + cmpi cr0, 0, J, 0 + bgt LL(21) + .align 4 + +LL(30): + andi. J, N, 2 + ble LL(40) + + LFDUX alpha1, X, INCX + LFDUX alpha2, X, INCX + + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + FMUL alpha1, alpha, alpha1 + mr Y1, YY + FMUL alpha2, alpha, alpha2 + mr Y2, YY + + srawi. 
r0, M, 3 + mtspr CTR, r0 + ble LL(35) + + LFDU y01, 1 * SIZE(Y1) + LFDU a1, 1 * SIZE(AO1) + LFDU y02, 1 * SIZE(Y1) + LFDU a2, 1 * SIZE(AO1) + LFDU y03, 1 * SIZE(Y1) + LFDU a3, 1 * SIZE(AO1) + LFDU y04, 1 * SIZE(Y1) + LFDU a4, 1 * SIZE(AO1) + LFDU y05, 1 * SIZE(Y1) + LFDU a5, 1 * SIZE(AO1) + LFDU y06, 1 * SIZE(Y1) + LFDU a6, 1 * SIZE(AO1) + LFDU y07, 1 * SIZE(Y1) + LFDU a7, 1 * SIZE(AO1) + LFDU y08, 1 * SIZE(Y1) + LFDU a8, 1 * SIZE(AO1) + bdz LL(33) + .align 4 + +LL(32): +#ifdef PPCG4 + dcbtst Y1, PREA +#endif + + FMADD y09, alpha1, a1, y01 + LFDU a1, 1 * SIZE(AO2) + FMADD y10, alpha1, a2, y02 + LFDU a2, 1 * SIZE(AO2) + FMADD y11, alpha1, a3, y03 + LFDU a3, 1 * SIZE(AO2) + FMADD y12, alpha1, a4, y04 + LFDU a4, 1 * SIZE(AO2) + + LFDU y01, 1 * SIZE(Y1) + LFDU y02, 1 * SIZE(Y1) +#ifdef PPCG4 + dcbt AO2, PREA +#endif + + FMADD y13, alpha1, a5, y05 + LFDU a5, 1 * SIZE(AO2) + FMADD y14, alpha1, a6, y06 + LFDU a6, 1 * SIZE(AO2) + FMADD y15, alpha1, a7, y07 + LFDU a7, 1 * SIZE(AO2) + FMADD y16, alpha1, a8, y08 + LFDU a8, 1 * SIZE(AO2) + + LFDU y03, 1 * SIZE(Y1) + LFDU y04, 1 * SIZE(Y1) +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO2, PREA +#endif + + FMADD y09, alpha2, a1, y09 + LFDU a1, 1 * SIZE(AO1) + FMADD y10, alpha2, a2, y10 + LFDU a2, 1 * SIZE(AO1) + FMADD y11, alpha2, a3, y11 + LFDU a3, 1 * SIZE(AO1) + FMADD y12, alpha2, a4, y12 + LFDU a4, 1 * SIZE(AO1) + +#if defined(PPCG4) && defined(DOUBLE) + dcbtst Y1, PREA +#endif + LFDU y05, 1 * SIZE(Y1) + LFDU y06, 1 * SIZE(Y1) +#ifdef PPCG4 + dcbt AO1, PREA +#endif + + FMADD y13, alpha2, a5, y13 + LFDU a5, 1 * SIZE(AO1) + FMADD y14, alpha2, a6, y14 + LFDU a6, 1 * SIZE(AO1) + FMADD y15, alpha2, a7, y15 + LFDU a7, 1 * SIZE(AO1) + FMADD y16, alpha2, a8, y16 + LFDU a8, 1 * SIZE(AO1) + + LFDU y07, 1 * SIZE(Y1) + LFDU y08, 1 * SIZE(Y1) +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO1, PREA +#endif + + STFDU y09, 1 * SIZE(Y2) + STFDU y10, 1 * SIZE(Y2) + STFDU y11, 1 * SIZE(Y2) + STFDU y12, 1 * SIZE(Y2) + + STFDU y13, 1 * SIZE(Y2) + STFDU y14, 1 * SIZE(Y2) + STFDU y15, 1 * SIZE(Y2) + STFDU y16, 1 * SIZE(Y2) + bdnz LL(32) + .align 4 + +LL(33): + FMADD y01, alpha1, a1, y01 + LFDU a1, 1 * SIZE(AO2) + FMADD y02, alpha1, a2, y02 + LFDU a2, 1 * SIZE(AO2) + FMADD y03, alpha1, a3, y03 + LFDU a3, 1 * SIZE(AO2) + FMADD y04, alpha1, a4, y04 + LFDU a4, 1 * SIZE(AO2) + + FMADD y05, alpha1, a5, y05 + LFDU a5, 1 * SIZE(AO2) + FMADD y06, alpha1, a6, y06 + LFDU a6, 1 * SIZE(AO2) + FMADD y07, alpha1, a7, y07 + LFDU a7, 1 * SIZE(AO2) + FMADD y08, alpha1, a8, y08 + LFDU a8, 1 * SIZE(AO2) + + FMADD y01, alpha2, a1, y01 + FMADD y02, alpha2, a2, y02 + FMADD y03, alpha2, a3, y03 + FMADD y04, alpha2, a4, y04 + + FMADD y05, alpha2, a5, y05 + STFDU y01, 1 * SIZE(Y2) + FMADD y06, alpha2, a6, y06 + STFDU y02, 1 * SIZE(Y2) + FMADD y07, alpha2, a7, y07 + STFDU y03, 1 * SIZE(Y2) + FMADD y08, alpha2, a8, y08 + STFDU y04, 1 * SIZE(Y2) + + STFDU y05, 1 * SIZE(Y2) + STFDU y06, 1 * SIZE(Y2) + STFDU y07, 1 * SIZE(Y2) + STFDU y08, 1 * SIZE(Y2) + .align 4 + +LL(35): + andi. r0, M, 7 + ble LL(40) + + andi. 
r0, M, 4 + ble LL(37) + + LFDU a1, 1 * SIZE(AO1) + LFDU y01, 1 * SIZE(Y1) + LFDU a2, 1 * SIZE(AO1) + LFDU y02, 1 * SIZE(Y1) + LFDU a3, 1 * SIZE(AO1) + LFDU y03, 1 * SIZE(Y1) + LFDU a4, 1 * SIZE(AO1) + LFDU y04, 1 * SIZE(Y1) + + FMADD y01, alpha1, a1, y01 + LFDU a5, 1 * SIZE(AO2) + FMADD y02, alpha1, a2, y02 + LFDU a6, 1 * SIZE(AO2) + FMADD y03, alpha1, a3, y03 + LFDU a7, 1 * SIZE(AO2) + FMADD y04, alpha1, a4, y04 + LFDU a8, 1 * SIZE(AO2) + + FMADD y01, alpha2, a5, y01 + FMADD y02, alpha2, a6, y02 + FMADD y03, alpha2, a7, y03 + FMADD y04, alpha2, a8, y04 + + STFDU y01, 1 * SIZE(Y2) + STFDU y02, 1 * SIZE(Y2) + STFDU y03, 1 * SIZE(Y2) + STFDU y04, 1 * SIZE(Y2) + .align 4 + +LL(37): + andi. r0, M, 2 + ble LL(38) + + LFDU a1, 1 * SIZE(AO1) + LFDU y01, 1 * SIZE(Y1) + LFDU a2, 1 * SIZE(AO1) + LFDU y02, 1 * SIZE(Y1) + LFDU a3, 1 * SIZE(AO2) + LFDU a4, 1 * SIZE(AO2) + + FMADD y01, alpha1, a1, y01 + FMADD y02, alpha1, a2, y02 + FMADD y01, alpha2, a3, y01 + FMADD y02, alpha2, a4, y02 + + STFDU y01, 1 * SIZE(Y2) + STFDU y02, 1 * SIZE(Y2) + .align 4 + +LL(38): + andi. r0, M, 1 + ble LL(40) + + LFDU a1, 1 * SIZE(AO1) + LFDU y01, 1 * SIZE(Y1) + LFDU a2, 1 * SIZE(AO2) + + FMADD y01, alpha1, a1, y01 + FMADD y01, alpha2, a2, y01 + + STFDU y01, 1 * SIZE(Y2) + .align 4 + +LL(40): + andi. J, N, 1 + lfd alpha, ALPHA + ble LL(990) + + LFDUX alpha1, X, INCX + + mr AO1, A + add A, A, LDA + + FMUL alpha1, alpha, alpha1 + mr Y1, YY + mr Y2, YY + + srawi. r0, M, 3 + mtspr CTR, r0 + ble LL(45) + + LFDU y01, 1 * SIZE(Y1) + LFDU a1, 1 * SIZE(AO1) + LFDU y02, 1 * SIZE(Y1) + LFDU a2, 1 * SIZE(AO1) + LFDU y03, 1 * SIZE(Y1) + LFDU a3, 1 * SIZE(AO1) + LFDU y04, 1 * SIZE(Y1) + LFDU a4, 1 * SIZE(AO1) + LFDU y05, 1 * SIZE(Y1) + LFDU a5, 1 * SIZE(AO1) + LFDU y06, 1 * SIZE(Y1) + LFDU a6, 1 * SIZE(AO1) + LFDU y07, 1 * SIZE(Y1) + LFDU a7, 1 * SIZE(AO1) + LFDU y08, 1 * SIZE(Y1) + LFDU a8, 1 * SIZE(AO1) + bdz LL(43) + .align 4 + +LL(42): +#ifdef PPCG4 + dcbtst Y1, PREA +#endif + + FMADD y09, alpha1, a1, y01 + LFDU a1, 1 * SIZE(AO1) + FMADD y10, alpha1, a2, y02 + LFDU a2, 1 * SIZE(AO1) + FMADD y11, alpha1, a3, y03 + LFDU a3, 1 * SIZE(AO1) + FMADD y12, alpha1, a4, y04 + LFDU a4, 1 * SIZE(AO1) + + LFDU y01, 1 * SIZE(Y1) + LFDU y02, 1 * SIZE(Y1) + LFDU y03, 1 * SIZE(Y1) + LFDU y04, 1 * SIZE(Y1) +#ifdef PPCG4 + dcbt AO1, PREA +#endif + + FMADD y13, alpha1, a5, y05 + LFDU a5, 1 * SIZE(AO1) + FMADD y14, alpha1, a6, y06 + LFDU a6, 1 * SIZE(AO1) + FMADD y15, alpha1, a7, y07 + LFDU a7, 1 * SIZE(AO1) + FMADD y16, alpha1, a8, y08 + LFDU a8, 1 * SIZE(AO1) + +#if defined(PPCG4) && defined(DOUBLE) + dcbtst Y1, PREA +#endif + LFDU y05, 1 * SIZE(Y1) + LFDU y06, 1 * SIZE(Y1) + LFDU y07, 1 * SIZE(Y1) + LFDU y08, 1 * SIZE(Y1) +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO1, PREA +#endif + + STFDU y09, 1 * SIZE(Y2) + STFDU y10, 1 * SIZE(Y2) + STFDU y11, 1 * SIZE(Y2) + STFDU y12, 1 * SIZE(Y2) + + STFDU y13, 1 * SIZE(Y2) + STFDU y14, 1 * SIZE(Y2) + STFDU y15, 1 * SIZE(Y2) + STFDU y16, 1 * SIZE(Y2) + bdnz LL(42) + .align 4 + +LL(43): + FMADD y01, alpha1, a1, y01 + FMADD y02, alpha1, a2, y02 + FMADD y03, alpha1, a3, y03 + FMADD y04, alpha1, a4, y04 + + FMADD y05, alpha1, a5, y05 + STFDU y01, 1 * SIZE(Y2) + FMADD y06, alpha1, a6, y06 + STFDU y02, 1 * SIZE(Y2) + FMADD y07, alpha1, a7, y07 + STFDU y03, 1 * SIZE(Y2) + FMADD y08, alpha1, a8, y08 + STFDU y04, 1 * SIZE(Y2) + + STFDU y05, 1 * SIZE(Y2) + STFDU y06, 1 * SIZE(Y2) + STFDU y07, 1 * SIZE(Y2) + STFDU y08, 1 * SIZE(Y2) + .align 4 + +LL(45): + andi. r0, M, 7 + ble LL(990) + + andi. 
r0, M, 4 + ble LL(47) + + LFDU a1, 1 * SIZE(AO1) + LFDU y01, 1 * SIZE(Y1) + LFDU a2, 1 * SIZE(AO1) + LFDU y02, 1 * SIZE(Y1) + LFDU a3, 1 * SIZE(AO1) + LFDU y03, 1 * SIZE(Y1) + LFDU a4, 1 * SIZE(AO1) + LFDU y04, 1 * SIZE(Y1) + + FMADD y01, alpha1, a1, y01 + FMADD y02, alpha1, a2, y02 + FMADD y03, alpha1, a3, y03 + FMADD y04, alpha1, a4, y04 + + STFDU y01, 1 * SIZE(Y2) + STFDU y02, 1 * SIZE(Y2) + STFDU y03, 1 * SIZE(Y2) + STFDU y04, 1 * SIZE(Y2) + .align 4 + +LL(47): + andi. r0, M, 2 + ble LL(48) + + LFDU a1, 1 * SIZE(AO1) + LFDU y01, 1 * SIZE(Y1) + LFDU a2, 1 * SIZE(AO1) + LFDU y02, 1 * SIZE(Y1) + + FMADD y01, alpha1, a1, y01 + FMADD y02, alpha1, a2, y02 + + STFDU y01, 1 * SIZE(Y2) + STFDU y02, 1 * SIZE(Y2) + .align 4 + +LL(48): + andi. r0, M, 1 + ble LL(990) + + LFDU a1, 1 * SIZE(AO1) + LFDU y01, 1 * SIZE(Y1) + + FMADD y01, alpha1, a1, y01 + + STFDU y01, 1 * SIZE(Y2) + .align 4 + +LL(990): + cmpi cr0, 0, INCY, SIZE + beq LL(999) + + addi YY, BUFFER, -SIZE + mr Y1, Y + + srawi. r0, M, 3 + mtspr CTR, r0 + ble LL(995) + .align 4 + +LL(991): + LFDUX f0, Y, INCY + LFDUX f1, Y, INCY + LFDUX f2, Y, INCY + LFDUX f3, Y, INCY + LFDUX f4, Y, INCY + LFDUX f5, Y, INCY + LFDUX f6, Y, INCY + LFDUX f7, Y, INCY + + LFDU f8, 1 * SIZE(YY) + LFDU f9, 1 * SIZE(YY) + LFDU f10, 1 * SIZE(YY) + LFDU f11, 1 * SIZE(YY) + LFDU f12, 1 * SIZE(YY) + LFDU f13, 1 * SIZE(YY) + LFDU f14, 1 * SIZE(YY) + LFDU f15, 1 * SIZE(YY) + + FADD f8, f8, f0 + FADD f9, f9, f1 + FADD f10, f10, f2 + FADD f11, f11, f3 + FADD f12, f12, f4 + FADD f13, f13, f5 + FADD f14, f14, f6 + FADD f15, f15, f7 + + STFDUX f8, Y1, INCY + STFDUX f9, Y1, INCY + STFDUX f10, Y1, INCY + STFDUX f11, Y1, INCY + STFDUX f12, Y1, INCY + STFDUX f13, Y1, INCY + STFDUX f14, Y1, INCY + STFDUX f15, Y1, INCY + bdnz LL(991) + .align 4 + +LL(995): + andi. J, M, 4 + ble LL(996) + + LFDUX f0, Y, INCY + LFDUX f1, Y, INCY + LFDUX f2, Y, INCY + LFDUX f3, Y, INCY + + LFDU f8, 1 * SIZE(YY) + LFDU f9, 1 * SIZE(YY) + LFDU f10, 1 * SIZE(YY) + LFDU f11, 1 * SIZE(YY) + + FADD f8, f8, f0 + FADD f9, f9, f1 + FADD f10, f10, f2 + FADD f11, f11, f3 + + STFDUX f8, Y1, INCY + STFDUX f9, Y1, INCY + STFDUX f10, Y1, INCY + STFDUX f11, Y1, INCY + .align 4 + +LL(996): + andi. J, M, 2 + ble LL(997) + + LFDUX f0, Y, INCY + LFDUX f1, Y, INCY + + LFDU f8, 1 * SIZE(YY) + LFDU f9, 1 * SIZE(YY) + + FADD f8, f8, f0 + FADD f9, f9, f1 + + STFDUX f8, Y1, INCY + STFDUX f9, Y1, INCY + .align 4 + +LL(997): + andi. 
J, M, 1 + ble LL(999) + + LFDUX f0, Y, INCY + LFDU f8, 1 * SIZE(YY) + + FADD f8, f8, f0 + + STFDUX f8, Y1, INCY + .align 4 + +LL(999): + li r3, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + lfd f26, 96(SP) + lfd f27, 104(SP) + +#ifdef __64BIT__ + ld r14, 144(SP) + ld r15, 152(SP) + ld r16, 160(SP) + ld r17, 168(SP) + ld r18, 176(SP) + ld r19, 184(SP) + ld r20, 192(SP) + ld r21, 200(SP) + ld r22, 208(SP) + ld r23, 216(SP) +#else + lwz r14, 144(SP) + lwz r15, 148(SP) + lwz r16, 152(SP) + lwz r17, 156(SP) + lwz r18, 160(SP) + lwz r19, 164(SP) + lwz r20, 168(SP) + lwz r21, 172(SP) + lwz r22, 176(SP) + lwz r23, 180(SP) +#endif + + addi SP, SP, STACKSIZE + blr + + EPILOGUE +#endif diff --git a/kernel/power/gemv_t.S b/kernel/power/gemv_t.S new file mode 100644 index 0000000000..a70e8b8a08 --- /dev/null +++ b/kernel/power/gemv_t.S @@ -0,0 +1,2964 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef linux +#ifndef __64BIT__ +#define M r3 +#define N r4 +#define A r6 +#define LDA r7 +#define X r8 +#define INCX r9 +#define Y r10 +#define INCY r5 +#else +#define M r3 +#define N r4 +#define A r7 +#define LDA r8 +#define X r9 +#define INCX r10 +#define Y r5 +#define INCY r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define M r3 +#define N r4 +#define A r8 +#define LDA r9 +#define X r10 +#define INCX r5 +#define Y r6 +#define INCY r7 +#else +#define M r3 +#define N r4 +#define A r7 +#define LDA r8 +#define X r9 +#define INCX r10 +#define Y r5 +#define INCY r6 +#endif +#endif + +#define BUFFER r11 +#define XP r12 +#define AO1 r14 +#define AO2 r15 +#define AO3 r16 +#define AO4 r17 +#define AO5 r18 +#define AO6 r19 +#define AO7 r20 +#define AO8 r21 +#define MIN_N r22 +#define J r23 +#define CO r24 +#define PREA r25 +#define PREC r26 +#define BO r27 +#define PLDA_M r28 +#define IS r29 + +#define Y1 CO + +#if defined(PPCG4) +#define PREFETCHSIZE_A 42 +#define PREFETCHSIZE_C 16 +#endif + +#if defined(PPC440) || defined(PPC440FP2) +#define PREFETCHSIZE_A 42 +#define PREFETCHSIZE_C 16 +#endif + +#ifdef PPC970 +#define PREFETCHSIZE_A 42 +#define PREFETCHSIZE_C 16 +#endif + +#ifdef CELL +#define PREFETCHSIZE_A 42 +#define PREFETCHSIZE_C 16 +#endif + +#ifdef POWER4 +#define PREFETCHSIZE_A 48 +#define PREFETCHSIZE_C 16 +#endif + +#ifdef POWER5 +#define PREFETCHSIZE_A 40 +#define PREFETCHSIZE_C 8 +#endif + +#ifdef POWER6 +#define PREFETCHSIZE_A 96 +#define PREFETCHSIZE_C 8 +#endif + +#define y01 f0 +#define y02 f1 +#define y03 f2 +#define y04 f3 +#define y05 f4 +#define y06 f5 +#define y07 f6 +#define y08 f7 +#define y09 f8 +#define y10 f9 +#define y11 f10 +#define y12 f11 +#define y13 f12 +#define y14 f13 +#define y15 f14 +#define y16 f15 + +#define a1 f16 +#define a2 f17 +#define a3 f18 +#define a4 f19 +#define a5 f20 +#define a6 f21 +#define a7 f22 +#define a8 f23 + +#define b1 f24 +#define b2 f25 +#define b3 f26 +#define b4 f27 +#define b5 f28 +#define b6 f29 +#define b7 f30 +#define b8 f31 + +#define alpha f31 + +#ifndef NEEDPARAM + +#define P 2048 + +#ifndef __64BIT__ +#define STACKSIZE 224 +#else +#define STACKSIZE 288 +#endif + +#define FZERO 144(SP) +#define ALPHA 152(SP) + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r0, FZERO + stfd f1, ALPHA + std r14, 160(SP) + std r15, 168(SP) + std r16, 176(SP) + std r17, 184(SP) + std r18, 192(SP) + std r19, 200(SP) + std r20, 208(SP) + std r21, 216(SP) + std r22, 224(SP) + std r23, 232(SP) + std r24, 240(SP) + std r25, 248(SP) + std r26, 256(SP) + std r27, 264(SP) + std r28, 272(SP) + std r29, 280(SP) +#else + stw r0, 0 + FZERO + stw r0, 4 + FZERO + stfd f1, ALPHA + stw r14, 160(SP) + stw r15, 164(SP) + stw r16, 168(SP) + stw r17, 172(SP) + stw r18, 176(SP) + stw r19, 180(SP) + stw r20, 184(SP) + stw r21, 188(SP) + stw r22, 192(SP) + stw r23, 196(SP) + stw r24, 200(SP) + stw r25, 204(SP) + stw r26, 208(SP) + stw r27, 212(SP) + stw r28, 216(SP) + stw r29, 220(SP) +#endif + +#ifdef linux 
+#ifndef __64BIT__ + lwz INCY, 8 + STACKSIZE(SP) + lwz BUFFER, 12 + STACKSIZE(SP) +#else + ld Y, 112 + STACKSIZE(SP) + ld INCY, 120 + STACKSIZE(SP) + ld BUFFER, 128 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifndef __64BIT__ +#ifdef DOUBLE + lwz INCX, 56 + STACKSIZE(SP) + lwz Y, 60 + STACKSIZE(SP) + lwz INCY, 64 + STACKSIZE(SP) + lwz BUFFER, 68 + STACKSIZE(SP) +#else + lwz Y, 56 + STACKSIZE(SP) + lwz INCY, 60 + STACKSIZE(SP) + lwz BUFFER, 64 + STACKSIZE(SP) +#endif +#else + ld Y, 112 + STACKSIZE(SP) + ld INCY, 120 + STACKSIZE(SP) + ld BUFFER, 128 + STACKSIZE(SP) +#endif +#endif + + mullw PLDA_M, LDA, N + li XP, P + subf PLDA_M, XP, PLDA_M + slwi PLDA_M, PLDA_M, BASE_SHIFT + + slwi LDA, LDA, BASE_SHIFT + slwi INCX, INCX, BASE_SHIFT + slwi INCY, INCY, BASE_SHIFT + + subf Y, INCY, Y + + li IS, 0 + + addi A, A, -SIZE + + li PREA, PREFETCHSIZE_A * SIZE + li PREC, PREFETCHSIZE_C * SIZE + + cmpi cr0, 0, M, 0 + ble LL(999) + + cmpi cr0, 0, N, 0 + ble LL(999) + .align 4 + +LL(ISLoop): + subf MIN_N, IS, M + slwi r0, IS, BASE_SHIFT + cmpi cr0, 0, MIN_N, P + ble+ LL(min_nP) + li MIN_N, P +LL(min_nP): + add XP, X, r0 + cmpi cr0, 0, INCX, SIZE + beq LL(10) + + mr XP, BUFFER + addi CO, BUFFER, -SIZE + + srawi. r0, MIN_N, 3 + mtspr CTR, r0 + ble LL(CopyRemain) + .align 4 + +LL(CopyKernel): + LFD f0, 0 * SIZE(X) + add X, X, INCX + LFD f1, 0 * SIZE(X) + add X, X, INCX + LFD f2, 0 * SIZE(X) + add X, X, INCX + LFD f3, 0 * SIZE(X) + add X, X, INCX + LFD f4, 0 * SIZE(X) + add X, X, INCX + LFD f5, 0 * SIZE(X) + add X, X, INCX + LFD f6, 0 * SIZE(X) + add X, X, INCX + LFD f7, 0 * SIZE(X) + add X, X, INCX + + STFD f0, 1 * SIZE(CO) + STFD f1, 2 * SIZE(CO) + STFD f2, 3 * SIZE(CO) + STFD f3, 4 * SIZE(CO) + STFD f4, 5 * SIZE(CO) + STFD f5, 6 * SIZE(CO) + STFD f6, 7 * SIZE(CO) + STFDU f7, 8 * SIZE(CO) + bdnz LL(CopyKernel) + .align 4 + +LL(CopyRemain): + andi. r0, MIN_N, 7 + mtspr CTR, r0 + ble LL(10) + .align 4 + +LL(CopySub): + LFD f0, 0 * SIZE(X) + add X, X, INCX + STFDU f0, 1 * SIZE(CO) + bdnz LL(CopySub) + .align 4 + +LL(10): + mr CO, Y + addi XP, XP, -SIZE + srawi. J, N, 3 + ble LL(20) + .align 4 + +LL(11): + mr AO1, A + add AO2, A, LDA + add AO3, AO2, LDA + add AO4, AO3, LDA + add AO5, AO4, LDA + add AO6, AO5, LDA + add AO7, AO6, LDA + add AO8, AO7, LDA + add A, AO8, LDA + + mr BO, XP + + lfd y01, FZERO + fmr y02, y01 + fmr y03, y01 + fmr y04, y01 + fmr y05, y01 + fmr y06, y01 + fmr y07, y01 + fmr y08, y01 + fmr y09, y01 + fmr y10, y01 + fmr y11, y01 + fmr y12, y01 + fmr y13, y01 + fmr y14, y01 + fmr y15, y01 + fmr y16, y01 + + DCBT(Y1, PREC) + + srawi. 
r0, MIN_N, 4 + mtspr CTR, r0 + ble LL(14) + + LFD a1, 1 * SIZE(AO1) + LFD a2, 1 * SIZE(AO2) + LFD a3, 1 * SIZE(AO3) + LFD a4, 1 * SIZE(AO4) + LFD a5, 1 * SIZE(AO5) + LFD a6, 1 * SIZE(AO6) + LFD a7, 1 * SIZE(AO7) + LFD a8, 1 * SIZE(AO8) + + LFD b1, 1 * SIZE(BO) + LFD b2, 2 * SIZE(BO) + LFD b3, 3 * SIZE(BO) + LFD b4, 4 * SIZE(BO) + LFD b5, 5 * SIZE(BO) + LFD b6, 6 * SIZE(BO) + LFD b7, 7 * SIZE(BO) + LFD b8, 8 * SIZE(BO) + bdz LL(13) + .align 4 + +LL(12): + FMADD y01, a1, b1, y01 + LFD a1, 2 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFD a2, 2 * SIZE(AO2) + + FMADD y03, a3, b1, y03 + LFD a3, 2 * SIZE(AO3) + FMADD y04, a4, b1, y04 + LFD a4, 2 * SIZE(AO4) + + FMADD y05, a5, b1, y05 + LFD a5, 2 * SIZE(AO5) + FMADD y06, a6, b1, y06 + LFD a6, 2 * SIZE(AO6) + + FMADD y07, a7, b1, y07 + LFD a7, 2 * SIZE(AO7) + FMADD y08, a8, b1, y08 + LFD a8, 2 * SIZE(AO8) + + FMADD y09, a1, b2, y09 + LFD a1, 3 * SIZE(AO1) + FMADD y10, a2, b2, y10 + LFD a2, 3 * SIZE(AO2) + + FMADD y11, a3, b2, y11 + LFD a3, 3 * SIZE(AO3) + FMADD y12, a4, b2, y12 + LFD a4, 3 * SIZE(AO4) + + FMADD y13, a5, b2, y13 + LFD a5, 3 * SIZE(AO5) + FMADD y14, a6, b2, y14 + LFD a6, 3 * SIZE(AO6) + + FMADD y15, a7, b2, y15 + LFD a7, 3 * SIZE(AO7) + FMADD y16, a8, b2, y16 + LFD a8, 3 * SIZE(AO8) + + FMADD y01, a1, b3, y01 + LFD a1, 4 * SIZE(AO1) + FMADD y02, a2, b3, y02 + LFD a2, 4 * SIZE(AO2) + + FMADD y03, a3, b3, y03 + LFD a3, 4 * SIZE(AO3) + FMADD y04, a4, b3, y04 + LFD a4, 4 * SIZE(AO4) + + FMADD y05, a5, b3, y05 + LFD a5, 4 * SIZE(AO5) + FMADD y06, a6, b3, y06 + LFD a6, 4 * SIZE(AO6) + + FMADD y07, a7, b3, y07 + LFD a7, 4 * SIZE(AO7) + FMADD y08, a8, b3, y08 + LFD a8, 4 * SIZE(AO8) + + FMADD y09, a1, b4, y09 + LFD a1, 5 * SIZE(AO1) + FMADD y10, a2, b4, y10 + LFD a2, 5 * SIZE(AO2) + + FMADD y11, a3, b4, y11 + LFD a3, 5 * SIZE(AO3) + FMADD y12, a4, b4, y12 + LFD a4, 5 * SIZE(AO4) + + FMADD y13, a5, b4, y13 + LFD a5, 5 * SIZE(AO5) + FMADD y14, a6, b4, y14 + LFD a6, 5 * SIZE(AO6) + + FMADD y15, a7, b4, y15 + LFD a7, 5 * SIZE(AO7) + FMADD y16, a8, b4, y16 + LFD a8, 5 * SIZE(AO8) + + LFD b1, 9 * SIZE(BO) + LFD b2, 10 * SIZE(BO) + LFD b3, 11 * SIZE(BO) + LFD b4, 12 * SIZE(BO) + + FMADD y01, a1, b5, y01 + LFD a1, 6 * SIZE(AO1) + FMADD y02, a2, b5, y02 + LFD a2, 6 * SIZE(AO2) + + FMADD y03, a3, b5, y03 + LFD a3, 6 * SIZE(AO3) + FMADD y04, a4, b5, y04 + LFD a4, 6 * SIZE(AO4) + + FMADD y05, a5, b5, y05 + LFD a5, 6 * SIZE(AO5) + FMADD y06, a6, b5, y06 + LFD a6, 6 * SIZE(AO6) + + FMADD y07, a7, b5, y07 + LFD a7, 6 * SIZE(AO7) + FMADD y08, a8, b5, y08 + LFD a8, 6 * SIZE(AO8) + + FMADD y09, a1, b6, y09 + LFD a1, 7 * SIZE(AO1) + FMADD y10, a2, b6, y10 + LFD a2, 7 * SIZE(AO2) + + FMADD y11, a3, b6, y11 + LFD a3, 7 * SIZE(AO3) + FMADD y12, a4, b6, y12 + LFD a4, 7 * SIZE(AO4) + + FMADD y13, a5, b6, y13 + LFD a5, 7 * SIZE(AO5) + FMADD y14, a6, b6, y14 + LFD a6, 7 * SIZE(AO6) + + FMADD y15, a7, b6, y15 + LFD a7, 7 * SIZE(AO7) + FMADD y16, a8, b6, y16 + LFD a8, 7 * SIZE(AO8) + + FMADD y01, a1, b7, y01 + LFD a1, 8 * SIZE(AO1) + FMADD y02, a2, b7, y02 + LFD a2, 8 * SIZE(AO2) + + FMADD y03, a3, b7, y03 + LFD a3, 8 * SIZE(AO3) + FMADD y04, a4, b7, y04 + LFD a4, 8 * SIZE(AO4) + + FMADD y05, a5, b7, y05 + LFD a5, 8 * SIZE(AO5) + FMADD y06, a6, b7, y06 + LFD a6, 8 * SIZE(AO6) + + FMADD y07, a7, b7, y07 + LFD a7, 8 * SIZE(AO7) + FMADD y08, a8, b7, y08 + LFD a8, 8 * SIZE(AO8) + + FMADD y09, a1, b8, y09 + LFD a1, 9 * SIZE(AO1) + FMADD y10, a2, b8, y10 + LFD a2, 9 * SIZE(AO2) + + FMADD y11, a3, b8, y11 + LFD a3, 9 * SIZE(AO3) + FMADD y12, a4, b8, y12 + LFD a4, 9 * SIZE(AO4) + 
+ FMADD y13, a5, b8, y13 + LFD a5, 9 * SIZE(AO5) + FMADD y14, a6, b8, y14 + LFD a6, 9 * SIZE(AO6) + + FMADD y15, a7, b8, y15 + LFD a7, 9 * SIZE(AO7) + FMADD y16, a8, b8, y16 + LFD a8, 9 * SIZE(AO8) + + LFD b5, 13 * SIZE(BO) + LFD b6, 14 * SIZE(BO) + LFD b7, 15 * SIZE(BO) + LFD b8, 16 * SIZE(BO) + + DCBT(AO1, PREA) + DCBT(AO2, PREA) + DCBT(AO3, PREA) + DCBT(AO4, PREA) + + FMADD y01, a1, b1, y01 + LFD a1, 10 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFD a2, 10 * SIZE(AO2) + + FMADD y03, a3, b1, y03 + LFD a3, 10 * SIZE(AO3) + FMADD y04, a4, b1, y04 + LFD a4, 10 * SIZE(AO4) + + FMADD y05, a5, b1, y05 + LFD a5, 10 * SIZE(AO5) + FMADD y06, a6, b1, y06 + LFD a6, 10 * SIZE(AO6) + + FMADD y07, a7, b1, y07 + LFD a7, 10 * SIZE(AO7) + FMADD y08, a8, b1, y08 + LFD a8, 10 * SIZE(AO8) + + FMADD y09, a1, b2, y09 + LFD a1, 11 * SIZE(AO1) + FMADD y10, a2, b2, y10 + LFD a2, 11 * SIZE(AO2) + + FMADD y11, a3, b2, y11 + LFD a3, 11 * SIZE(AO3) + FMADD y12, a4, b2, y12 + LFD a4, 11 * SIZE(AO4) + + FMADD y13, a5, b2, y13 + LFD a5, 11 * SIZE(AO5) + FMADD y14, a6, b2, y14 + LFD a6, 11 * SIZE(AO6) + + FMADD y15, a7, b2, y15 + LFD a7, 11 * SIZE(AO7) + FMADD y16, a8, b2, y16 + LFD a8, 11 * SIZE(AO8) + + FMADD y01, a1, b3, y01 + LFD a1, 12 * SIZE(AO1) + FMADD y02, a2, b3, y02 + LFD a2, 12 * SIZE(AO2) + + FMADD y03, a3, b3, y03 + LFD a3, 12 * SIZE(AO3) + FMADD y04, a4, b3, y04 + LFD a4, 12 * SIZE(AO4) + + FMADD y05, a5, b3, y05 + LFD a5, 12 * SIZE(AO5) + FMADD y06, a6, b3, y06 + LFD a6, 12 * SIZE(AO6) + + FMADD y07, a7, b3, y07 + LFD a7, 12 * SIZE(AO7) + FMADD y08, a8, b3, y08 + LFD a8, 12 * SIZE(AO8) + + FMADD y09, a1, b4, y09 + LFD a1, 13 * SIZE(AO1) + FMADD y10, a2, b4, y10 + LFD a2, 13 * SIZE(AO2) + + FMADD y11, a3, b4, y11 + LFD a3, 13 * SIZE(AO3) + FMADD y12, a4, b4, y12 + LFD a4, 13 * SIZE(AO4) + + FMADD y13, a5, b4, y13 + LFD a5, 13 * SIZE(AO5) + FMADD y14, a6, b4, y14 + LFD a6, 13 * SIZE(AO6) + + FMADD y15, a7, b4, y15 + LFD a7, 13 * SIZE(AO7) + FMADD y16, a8, b4, y16 + LFD a8, 13 * SIZE(AO8) + + LFD b1, 17 * SIZE(BO) + LFD b2, 18 * SIZE(BO) + LFD b3, 19 * SIZE(BO) + LFD b4, 20 * SIZE(BO) + + FMADD y01, a1, b5, y01 + LFD a1, 14 * SIZE(AO1) + FMADD y02, a2, b5, y02 + LFD a2, 14 * SIZE(AO2) + + FMADD y03, a3, b5, y03 + LFD a3, 14 * SIZE(AO3) + FMADD y04, a4, b5, y04 + LFD a4, 14 * SIZE(AO4) + + FMADD y05, a5, b5, y05 + LFD a5, 14 * SIZE(AO5) + FMADD y06, a6, b5, y06 + LFD a6, 14 * SIZE(AO6) + + FMADD y07, a7, b5, y07 + LFD a7, 14 * SIZE(AO7) + FMADD y08, a8, b5, y08 + LFD a8, 14 * SIZE(AO8) + + FMADD y09, a1, b6, y09 + LFD a1, 15 * SIZE(AO1) + FMADD y10, a2, b6, y10 + LFD a2, 15 * SIZE(AO2) + + FMADD y11, a3, b6, y11 + LFD a3, 15 * SIZE(AO3) + FMADD y12, a4, b6, y12 + LFD a4, 15 * SIZE(AO4) + + FMADD y13, a5, b6, y13 + LFD a5, 15 * SIZE(AO5) + FMADD y14, a6, b6, y14 + LFD a6, 15 * SIZE(AO6) + + FMADD y15, a7, b6, y15 + LFD a7, 15 * SIZE(AO7) + FMADD y16, a8, b6, y16 + LFD a8, 15 * SIZE(AO8) + + FMADD y01, a1, b7, y01 + LFD a1, 16 * SIZE(AO1) + FMADD y02, a2, b7, y02 + LFD a2, 16 * SIZE(AO2) + + FMADD y03, a3, b7, y03 + LFD a3, 16 * SIZE(AO3) + FMADD y04, a4, b7, y04 + LFD a4, 16 * SIZE(AO4) + + FMADD y05, a5, b7, y05 + LFD a5, 16 * SIZE(AO5) + FMADD y06, a6, b7, y06 + LFD a6, 16 * SIZE(AO6) + + FMADD y07, a7, b7, y07 + LFD a7, 16 * SIZE(AO7) + FMADD y08, a8, b7, y08 + LFD a8, 16 * SIZE(AO8) + + FMADD y09, a1, b8, y09 + LFD a1, 17 * SIZE(AO1) + FMADD y10, a2, b8, y10 + LFD a2, 17 * SIZE(AO2) + + FMADD y11, a3, b8, y11 + LFD a3, 17 * SIZE(AO3) + FMADD y12, a4, b8, y12 + LFD a4, 17 * SIZE(AO4) + + addi AO1, AO1, 16 * 
SIZE + addi AO2, AO2, 16 * SIZE + addi AO3, AO3, 16 * SIZE + addi AO4, AO4, 16 * SIZE + + FMADD y13, a5, b8, y13 + LFD a5, 17 * SIZE(AO5) + FMADD y14, a6, b8, y14 + LFD a6, 17 * SIZE(AO6) + + FMADD y15, a7, b8, y15 + LFD a7, 17 * SIZE(AO7) + FMADD y16, a8, b8, y16 + LFD a8, 17 * SIZE(AO8) + + LFD b5, 21 * SIZE(BO) + LFD b6, 22 * SIZE(BO) + LFD b7, 23 * SIZE(BO) + LFD b8, 24 * SIZE(BO) + + addi AO5, AO5, 16 * SIZE + addi AO6, AO6, 16 * SIZE + DCBT(AO5, PREA) + DCBT(AO6, PREA) + + addi AO7, AO7, 16 * SIZE + addi AO8, AO8, 16 * SIZE + DCBT(AO7, PREA) + DCBT(AO8, PREA) + + addi BO, BO, 16 * SIZE + bdnz LL(12) + .align 4 + +LL(13): + FMADD y01, a1, b1, y01 + LFD a1, 2 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFD a2, 2 * SIZE(AO2) + + FMADD y03, a3, b1, y03 + LFD a3, 2 * SIZE(AO3) + FMADD y04, a4, b1, y04 + LFD a4, 2 * SIZE(AO4) + + FMADD y05, a5, b1, y05 + LFD a5, 2 * SIZE(AO5) + FMADD y06, a6, b1, y06 + LFD a6, 2 * SIZE(AO6) + + FMADD y07, a7, b1, y07 + LFD a7, 2 * SIZE(AO7) + FMADD y08, a8, b1, y08 + LFD a8, 2 * SIZE(AO8) + + FMADD y09, a1, b2, y09 + LFD a1, 3 * SIZE(AO1) + FMADD y10, a2, b2, y10 + LFD a2, 3 * SIZE(AO2) + + FMADD y11, a3, b2, y11 + LFD a3, 3 * SIZE(AO3) + FMADD y12, a4, b2, y12 + LFD a4, 3 * SIZE(AO4) + + FMADD y13, a5, b2, y13 + LFD a5, 3 * SIZE(AO5) + FMADD y14, a6, b2, y14 + LFD a6, 3 * SIZE(AO6) + + FMADD y15, a7, b2, y15 + LFD a7, 3 * SIZE(AO7) + FMADD y16, a8, b2, y16 + LFD a8, 3 * SIZE(AO8) + + FMADD y01, a1, b3, y01 + LFD a1, 4 * SIZE(AO1) + FMADD y02, a2, b3, y02 + LFD a2, 4 * SIZE(AO2) + + FMADD y03, a3, b3, y03 + LFD a3, 4 * SIZE(AO3) + FMADD y04, a4, b3, y04 + LFD a4, 4 * SIZE(AO4) + + FMADD y05, a5, b3, y05 + LFD a5, 4 * SIZE(AO5) + FMADD y06, a6, b3, y06 + LFD a6, 4 * SIZE(AO6) + + FMADD y07, a7, b3, y07 + LFD a7, 4 * SIZE(AO7) + FMADD y08, a8, b3, y08 + LFD a8, 4 * SIZE(AO8) + + FMADD y09, a1, b4, y09 + LFD a1, 5 * SIZE(AO1) + FMADD y10, a2, b4, y10 + LFD a2, 5 * SIZE(AO2) + + FMADD y11, a3, b4, y11 + LFD a3, 5 * SIZE(AO3) + FMADD y12, a4, b4, y12 + LFD a4, 5 * SIZE(AO4) + + FMADD y13, a5, b4, y13 + LFD a5, 5 * SIZE(AO5) + FMADD y14, a6, b4, y14 + LFD a6, 5 * SIZE(AO6) + + FMADD y15, a7, b4, y15 + LFD a7, 5 * SIZE(AO7) + FMADD y16, a8, b4, y16 + LFD a8, 5 * SIZE(AO8) + + LFD b1, 9 * SIZE(BO) + LFD b2, 10 * SIZE(BO) + LFD b3, 11 * SIZE(BO) + LFD b4, 12 * SIZE(BO) + + FMADD y01, a1, b5, y01 + LFD a1, 6 * SIZE(AO1) + FMADD y02, a2, b5, y02 + LFD a2, 6 * SIZE(AO2) + + FMADD y03, a3, b5, y03 + LFD a3, 6 * SIZE(AO3) + FMADD y04, a4, b5, y04 + LFD a4, 6 * SIZE(AO4) + + FMADD y05, a5, b5, y05 + LFD a5, 6 * SIZE(AO5) + FMADD y06, a6, b5, y06 + LFD a6, 6 * SIZE(AO6) + + FMADD y07, a7, b5, y07 + LFD a7, 6 * SIZE(AO7) + FMADD y08, a8, b5, y08 + LFD a8, 6 * SIZE(AO8) + + FMADD y09, a1, b6, y09 + LFD a1, 7 * SIZE(AO1) + FMADD y10, a2, b6, y10 + LFD a2, 7 * SIZE(AO2) + + FMADD y11, a3, b6, y11 + LFD a3, 7 * SIZE(AO3) + FMADD y12, a4, b6, y12 + LFD a4, 7 * SIZE(AO4) + + FMADD y13, a5, b6, y13 + LFD a5, 7 * SIZE(AO5) + FMADD y14, a6, b6, y14 + LFD a6, 7 * SIZE(AO6) + + FMADD y15, a7, b6, y15 + LFD a7, 7 * SIZE(AO7) + FMADD y16, a8, b6, y16 + LFD a8, 7 * SIZE(AO8) + + FMADD y01, a1, b7, y01 + LFD a1, 8 * SIZE(AO1) + FMADD y02, a2, b7, y02 + LFD a2, 8 * SIZE(AO2) + + FMADD y03, a3, b7, y03 + LFD a3, 8 * SIZE(AO3) + FMADD y04, a4, b7, y04 + LFD a4, 8 * SIZE(AO4) + + FMADD y05, a5, b7, y05 + LFD a5, 8 * SIZE(AO5) + FMADD y06, a6, b7, y06 + LFD a6, 8 * SIZE(AO6) + + FMADD y07, a7, b7, y07 + LFD a7, 8 * SIZE(AO7) + FMADD y08, a8, b7, y08 + LFD a8, 8 * SIZE(AO8) + + FMADD y09, a1, b8, 
y09 + LFD a1, 9 * SIZE(AO1) + FMADD y10, a2, b8, y10 + LFD a2, 9 * SIZE(AO2) + + FMADD y11, a3, b8, y11 + LFD a3, 9 * SIZE(AO3) + FMADD y12, a4, b8, y12 + LFD a4, 9 * SIZE(AO4) + + FMADD y13, a5, b8, y13 + LFD a5, 9 * SIZE(AO5) + FMADD y14, a6, b8, y14 + LFD a6, 9 * SIZE(AO6) + + FMADD y15, a7, b8, y15 + LFD a7, 9 * SIZE(AO7) + FMADD y16, a8, b8, y16 + LFD a8, 9 * SIZE(AO8) + + LFD b5, 13 * SIZE(BO) + LFD b6, 14 * SIZE(BO) + LFD b7, 15 * SIZE(BO) + LFD b8, 16 * SIZE(BO) + + FMADD y01, a1, b1, y01 + LFD a1, 10 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFD a2, 10 * SIZE(AO2) + + FMADD y03, a3, b1, y03 + LFD a3, 10 * SIZE(AO3) + FMADD y04, a4, b1, y04 + LFD a4, 10 * SIZE(AO4) + + FMADD y05, a5, b1, y05 + LFD a5, 10 * SIZE(AO5) + FMADD y06, a6, b1, y06 + LFD a6, 10 * SIZE(AO6) + + FMADD y07, a7, b1, y07 + LFD a7, 10 * SIZE(AO7) + FMADD y08, a8, b1, y08 + LFD a8, 10 * SIZE(AO8) + + FMADD y09, a1, b2, y09 + LFD a1, 11 * SIZE(AO1) + FMADD y10, a2, b2, y10 + LFD a2, 11 * SIZE(AO2) + + FMADD y11, a3, b2, y11 + LFD a3, 11 * SIZE(AO3) + FMADD y12, a4, b2, y12 + LFD a4, 11 * SIZE(AO4) + + FMADD y13, a5, b2, y13 + LFD a5, 11 * SIZE(AO5) + FMADD y14, a6, b2, y14 + LFD a6, 11 * SIZE(AO6) + + FMADD y15, a7, b2, y15 + LFD a7, 11 * SIZE(AO7) + FMADD y16, a8, b2, y16 + LFD a8, 11 * SIZE(AO8) + + FMADD y01, a1, b3, y01 + LFD a1, 12 * SIZE(AO1) + FMADD y02, a2, b3, y02 + LFD a2, 12 * SIZE(AO2) + + FMADD y03, a3, b3, y03 + LFD a3, 12 * SIZE(AO3) + FMADD y04, a4, b3, y04 + LFD a4, 12 * SIZE(AO4) + + FMADD y05, a5, b3, y05 + LFD a5, 12 * SIZE(AO5) + FMADD y06, a6, b3, y06 + LFD a6, 12 * SIZE(AO6) + + FMADD y07, a7, b3, y07 + LFD a7, 12 * SIZE(AO7) + FMADD y08, a8, b3, y08 + LFD a8, 12 * SIZE(AO8) + + FMADD y09, a1, b4, y09 + LFD a1, 13 * SIZE(AO1) + FMADD y10, a2, b4, y10 + LFD a2, 13 * SIZE(AO2) + + FMADD y11, a3, b4, y11 + LFD a3, 13 * SIZE(AO3) + FMADD y12, a4, b4, y12 + LFD a4, 13 * SIZE(AO4) + + FMADD y13, a5, b4, y13 + LFD a5, 13 * SIZE(AO5) + FMADD y14, a6, b4, y14 + LFD a6, 13 * SIZE(AO6) + + FMADD y15, a7, b4, y15 + LFD a7, 13 * SIZE(AO7) + FMADD y16, a8, b4, y16 + LFD a8, 13 * SIZE(AO8) + + FMADD y01, a1, b5, y01 + LFD a1, 14 * SIZE(AO1) + FMADD y02, a2, b5, y02 + LFD a2, 14 * SIZE(AO2) + + FMADD y03, a3, b5, y03 + LFD a3, 14 * SIZE(AO3) + FMADD y04, a4, b5, y04 + LFD a4, 14 * SIZE(AO4) + + FMADD y05, a5, b5, y05 + LFD a5, 14 * SIZE(AO5) + FMADD y06, a6, b5, y06 + LFD a6, 14 * SIZE(AO6) + + FMADD y07, a7, b5, y07 + LFD a7, 14 * SIZE(AO7) + FMADD y08, a8, b5, y08 + LFD a8, 14 * SIZE(AO8) + + FMADD y09, a1, b6, y09 + LFD a1, 15 * SIZE(AO1) + FMADD y10, a2, b6, y10 + LFD a2, 15 * SIZE(AO2) + + FMADD y11, a3, b6, y11 + LFD a3, 15 * SIZE(AO3) + FMADD y12, a4, b6, y12 + LFD a4, 15 * SIZE(AO4) + + FMADD y13, a5, b6, y13 + LFD a5, 15 * SIZE(AO5) + FMADD y14, a6, b6, y14 + LFD a6, 15 * SIZE(AO6) + + FMADD y15, a7, b6, y15 + LFD a7, 15 * SIZE(AO7) + FMADD y16, a8, b6, y16 + LFD a8, 15 * SIZE(AO8) + + FMADD y01, a1, b7, y01 + LFD a1, 16 * SIZE(AO1) + FMADD y02, a2, b7, y02 + LFD a2, 16 * SIZE(AO2) + + FMADD y03, a3, b7, y03 + LFD a3, 16 * SIZE(AO3) + FMADD y04, a4, b7, y04 + LFD a4, 16 * SIZE(AO4) + + FMADD y05, a5, b7, y05 + LFD a5, 16 * SIZE(AO5) + FMADD y06, a6, b7, y06 + LFD a6, 16 * SIZE(AO6) + + FMADD y07, a7, b7, y07 + LFD a7, 16 * SIZE(AO7) + FMADD y08, a8, b7, y08 + LFD a8, 16 * SIZE(AO8) + + FMADD y09, a1, b8, y09 + FMADD y10, a2, b8, y10 + FMADD y11, a3, b8, y11 + FMADD y12, a4, b8, y12 + + addi AO1, AO1, 16 * SIZE + addi AO2, AO2, 16 * SIZE + addi AO3, AO3, 16 * SIZE + addi AO4, AO4, 16 * SIZE + + FMADD 
y13, a5, b8, y13 + FMADD y14, a6, b8, y14 + FMADD y15, a7, b8, y15 + FMADD y16, a8, b8, y16 + + addi AO5, AO5, 16 * SIZE + addi AO6, AO6, 16 * SIZE + addi AO7, AO7, 16 * SIZE + addi AO8, AO8, 16 * SIZE + addi BO, BO, 16 * SIZE + .align 4 + +LL(14): + andi. r0, MIN_N, 15 + ble LL(18) + + andi. r0, MIN_N, 8 + ble LL(15) + + LFD a1, 1 * SIZE(AO1) + LFD b1, 1 * SIZE(BO) + LFD a2, 1 * SIZE(AO2) + LFD a3, 1 * SIZE(AO3) + LFD a4, 1 * SIZE(AO4) + LFD a5, 1 * SIZE(AO5) + LFD a6, 1 * SIZE(AO6) + LFD a7, 1 * SIZE(AO7) + LFD a8, 1 * SIZE(AO8) + + LFD b2, 2 * SIZE(BO) + LFD b3, 3 * SIZE(BO) + LFD b4, 4 * SIZE(BO) + + FMADD y01, a1, b1, y01 + LFD a1, 2 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFD a2, 2 * SIZE(AO2) + FMADD y03, a3, b1, y03 + LFD a3, 2 * SIZE(AO3) + FMADD y04, a4, b1, y04 + LFD a4, 2 * SIZE(AO4) + FMADD y05, a5, b1, y05 + LFD a5, 2 * SIZE(AO5) + FMADD y06, a6, b1, y06 + LFD a6, 2 * SIZE(AO6) + FMADD y07, a7, b1, y07 + LFD a7, 2 * SIZE(AO7) + FMADD y08, a8, b1, y08 + LFD a8, 2 * SIZE(AO8) + + FMADD y09, a1, b2, y09 + LFD a1, 3 * SIZE(AO1) + FMADD y10, a2, b2, y10 + LFD a2, 3 * SIZE(AO2) + FMADD y11, a3, b2, y11 + LFD a3, 3 * SIZE(AO3) + FMADD y12, a4, b2, y12 + LFD a4, 3 * SIZE(AO4) + FMADD y13, a5, b2, y13 + LFD a5, 3 * SIZE(AO5) + FMADD y14, a6, b2, y14 + LFD a6, 3 * SIZE(AO6) + FMADD y15, a7, b2, y15 + LFD a7, 3 * SIZE(AO7) + FMADD y16, a8, b2, y16 + LFD a8, 3 * SIZE(AO8) + + LFD b5, 5 * SIZE(BO) + LFD b6, 6 * SIZE(BO) + LFD b7, 7 * SIZE(BO) + LFD b8, 8 * SIZE(BO) + + FMADD y01, a1, b3, y01 + LFD a1, 4 * SIZE(AO1) + FMADD y02, a2, b3, y02 + LFD a2, 4 * SIZE(AO2) + FMADD y03, a3, b3, y03 + LFD a3, 4 * SIZE(AO3) + FMADD y04, a4, b3, y04 + LFD a4, 4 * SIZE(AO4) + FMADD y05, a5, b3, y05 + LFD a5, 4 * SIZE(AO5) + FMADD y06, a6, b3, y06 + LFD a6, 4 * SIZE(AO6) + FMADD y07, a7, b3, y07 + LFD a7, 4 * SIZE(AO7) + FMADD y08, a8, b3, y08 + LFD a8, 4 * SIZE(AO8) + + FMADD y09, a1, b4, y09 + LFD a1, 5 * SIZE(AO1) + FMADD y10, a2, b4, y10 + LFD a2, 5 * SIZE(AO2) + FMADD y11, a3, b4, y11 + LFD a3, 5 * SIZE(AO3) + FMADD y12, a4, b4, y12 + LFD a4, 5 * SIZE(AO4) + FMADD y13, a5, b4, y13 + LFD a5, 5 * SIZE(AO5) + FMADD y14, a6, b4, y14 + LFD a6, 5 * SIZE(AO6) + FMADD y15, a7, b4, y15 + LFD a7, 5 * SIZE(AO7) + FMADD y16, a8, b4, y16 + LFD a8, 5 * SIZE(AO8) + + FMADD y01, a1, b5, y01 + LFD a1, 6 * SIZE(AO1) + FMADD y02, a2, b5, y02 + LFD a2, 6 * SIZE(AO2) + FMADD y03, a3, b5, y03 + LFD a3, 6 * SIZE(AO3) + FMADD y04, a4, b5, y04 + LFD a4, 6 * SIZE(AO4) + FMADD y05, a5, b5, y05 + LFD a5, 6 * SIZE(AO5) + FMADD y06, a6, b5, y06 + LFD a6, 6 * SIZE(AO6) + FMADD y07, a7, b5, y07 + LFD a7, 6 * SIZE(AO7) + FMADD y08, a8, b5, y08 + LFD a8, 6 * SIZE(AO8) + + FMADD y09, a1, b6, y09 + LFD a1, 7 * SIZE(AO1) + FMADD y10, a2, b6, y10 + LFD a2, 7 * SIZE(AO2) + FMADD y11, a3, b6, y11 + LFD a3, 7 * SIZE(AO3) + FMADD y12, a4, b6, y12 + LFD a4, 7 * SIZE(AO4) + FMADD y13, a5, b6, y13 + LFD a5, 7 * SIZE(AO5) + FMADD y14, a6, b6, y14 + LFD a6, 7 * SIZE(AO6) + FMADD y15, a7, b6, y15 + LFD a7, 7 * SIZE(AO7) + FMADD y16, a8, b6, y16 + LFD a8, 7 * SIZE(AO8) + + FMADD y01, a1, b7, y01 + LFD a1, 8 * SIZE(AO1) + FMADD y02, a2, b7, y02 + LFD a2, 8 * SIZE(AO2) + FMADD y03, a3, b7, y03 + LFD a3, 8 * SIZE(AO3) + FMADD y04, a4, b7, y04 + LFD a4, 8 * SIZE(AO4) + FMADD y05, a5, b7, y05 + LFD a5, 8 * SIZE(AO5) + FMADD y06, a6, b7, y06 + LFD a6, 8 * SIZE(AO6) + FMADD y07, a7, b7, y07 + LFD a7, 8 * SIZE(AO7) + FMADD y08, a8, b7, y08 + LFD a8, 8 * SIZE(AO8) + + FMADD y09, a1, b8, y09 + addi AO1, AO1, 8 * SIZE + FMADD y10, a2, b8, y10 + addi AO2, AO2, 
8 * SIZE + FMADD y11, a3, b8, y11 + addi AO3, AO3, 8 * SIZE + FMADD y12, a4, b8, y12 + addi AO4, AO4, 8 * SIZE + FMADD y13, a5, b8, y13 + addi AO5, AO5, 8 * SIZE + FMADD y14, a6, b8, y14 + addi AO6, AO6, 8 * SIZE + FMADD y15, a7, b8, y15 + addi AO7, AO7, 8 * SIZE + FMADD y16, a8, b8, y16 + addi AO8, AO8, 8 * SIZE + addi BO, BO, 8 * SIZE + .align 4 + +LL(15): + andi. r0, MIN_N, 4 + ble LL(16) + + LFD a1, 1 * SIZE(AO1) + LFD b1, 1 * SIZE(BO) + LFD a2, 1 * SIZE(AO2) + LFD a3, 1 * SIZE(AO3) + LFD a4, 1 * SIZE(AO4) + LFD a5, 1 * SIZE(AO5) + LFD a6, 1 * SIZE(AO6) + LFD a7, 1 * SIZE(AO7) + LFD a8, 1 * SIZE(AO8) + + LFD b2, 2 * SIZE(BO) + LFD b3, 3 * SIZE(BO) + LFD b4, 4 * SIZE(BO) + + FMADD y01, a1, b1, y01 + LFD a1, 2 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFD a2, 2 * SIZE(AO2) + FMADD y03, a3, b1, y03 + LFD a3, 2 * SIZE(AO3) + FMADD y04, a4, b1, y04 + LFD a4, 2 * SIZE(AO4) + FMADD y05, a5, b1, y05 + LFD a5, 2 * SIZE(AO5) + FMADD y06, a6, b1, y06 + LFD a6, 2 * SIZE(AO6) + FMADD y07, a7, b1, y07 + LFD a7, 2 * SIZE(AO7) + FMADD y08, a8, b1, y08 + LFD a8, 2 * SIZE(AO8) + + FMADD y09, a1, b2, y09 + LFD a1, 3 * SIZE(AO1) + FMADD y10, a2, b2, y10 + LFD a2, 3 * SIZE(AO2) + FMADD y11, a3, b2, y11 + LFD a3, 3 * SIZE(AO3) + FMADD y12, a4, b2, y12 + LFD a4, 3 * SIZE(AO4) + FMADD y13, a5, b2, y13 + LFD a5, 3 * SIZE(AO5) + FMADD y14, a6, b2, y14 + LFD a6, 3 * SIZE(AO6) + FMADD y15, a7, b2, y15 + LFD a7, 3 * SIZE(AO7) + FMADD y16, a8, b2, y16 + LFD a8, 3 * SIZE(AO8) + + FMADD y01, a1, b3, y01 + LFD a1, 4 * SIZE(AO1) + FMADD y02, a2, b3, y02 + LFD a2, 4 * SIZE(AO2) + FMADD y03, a3, b3, y03 + LFD a3, 4 * SIZE(AO3) + FMADD y04, a4, b3, y04 + LFD a4, 4 * SIZE(AO4) + + FMADD y05, a5, b3, y05 + LFD a5, 4 * SIZE(AO5) + FMADD y06, a6, b3, y06 + LFD a6, 4 * SIZE(AO6) + FMADD y07, a7, b3, y07 + LFD a7, 4 * SIZE(AO7) + FMADD y08, a8, b3, y08 + LFD a8, 4 * SIZE(AO8) + + FMADD y09, a1, b4, y09 + addi AO1, AO1, 4 * SIZE + FMADD y10, a2, b4, y10 + addi AO2, AO2, 4 * SIZE + FMADD y11, a3, b4, y11 + addi AO3, AO3, 4 * SIZE + FMADD y12, a4, b4, y12 + addi AO4, AO4, 4 * SIZE + FMADD y13, a5, b4, y13 + addi AO5, AO5, 4 * SIZE + FMADD y14, a6, b4, y14 + addi AO6, AO6, 4 * SIZE + FMADD y15, a7, b4, y15 + addi AO7, AO7, 4 * SIZE + FMADD y16, a8, b4, y16 + addi AO8, AO8, 4 * SIZE + addi BO, BO, 4 * SIZE + .align 4 + +LL(16): + andi. r0, MIN_N, 2 + ble LL(17) + + LFD a1, 1 * SIZE(AO1) + LFD b1, 1 * SIZE(BO) + LFD a2, 1 * SIZE(AO2) + LFD a3, 1 * SIZE(AO3) + LFD a4, 1 * SIZE(AO4) + LFD a5, 1 * SIZE(AO5) + LFD a6, 1 * SIZE(AO6) + LFD a7, 1 * SIZE(AO7) + LFD a8, 1 * SIZE(AO8) + + LFD b2, 2 * SIZE(BO) + + FMADD y01, a1, b1, y01 + LFD a1, 2 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFD a2, 2 * SIZE(AO2) + FMADD y03, a3, b1, y03 + LFD a3, 2 * SIZE(AO3) + FMADD y04, a4, b1, y04 + LFD a4, 2 * SIZE(AO4) + FMADD y05, a5, b1, y05 + LFD a5, 2 * SIZE(AO5) + FMADD y06, a6, b1, y06 + LFD a6, 2 * SIZE(AO6) + FMADD y07, a7, b1, y07 + LFD a7, 2 * SIZE(AO7) + FMADD y08, a8, b1, y08 + LFD a8, 2 * SIZE(AO8) + + FMADD y09, a1, b2, y09 + addi AO1, AO1, 2 * SIZE + addi AO2, AO2, 2 * SIZE + FMADD y10, a2, b2, y10 + addi AO3, AO3, 2 * SIZE + addi AO4, AO4, 2 * SIZE + FMADD y11, a3, b2, y11 + FMADD y12, a4, b2, y12 + addi AO5, AO5, 2 * SIZE + addi AO6, AO6, 2 * SIZE + FMADD y13, a5, b2, y13 + FMADD y14, a6, b2, y14 + addi AO7, AO7, 2 * SIZE + addi AO8, AO8, 2 * SIZE + FMADD y15, a7, b2, y15 + FMADD y16, a8, b2, y16 + addi BO, BO, 2 * SIZE + .align 4 + +LL(17): + andi. 
r0, MIN_N, 1 + ble LL(18) + + LFD a1, 1 * SIZE(AO1) + LFD b1, 1 * SIZE(BO) + LFD a2, 1 * SIZE(AO2) + LFD a3, 1 * SIZE(AO3) + LFD a4, 1 * SIZE(AO4) + LFD a5, 1 * SIZE(AO5) + LFD a6, 1 * SIZE(AO6) + LFD a7, 1 * SIZE(AO7) + LFD a8, 1 * SIZE(AO8) + + FMADD y01, a1, b1, y01 + FMADD y02, a2, b1, y02 + FMADD y03, a3, b1, y03 + FMADD y04, a4, b1, y04 + FMADD y05, a5, b1, y05 + FMADD y06, a6, b1, y06 + FMADD y07, a7, b1, y07 + FMADD y08, a8, b1, y08 + .align 4 + +LL(18): + mr BO, CO + lfd alpha, ALPHA + cmpi cr0, 0, INCY, SIZE + bne LL(19) + + LFD a1, 1 * SIZE(CO) + LFD a2, 2 * SIZE(CO) + LFD a3, 3 * SIZE(CO) + LFD a4, 4 * SIZE(CO) + LFD a5, 5 * SIZE(CO) + LFD a6, 6 * SIZE(CO) + LFD a7, 7 * SIZE(CO) + LFD a8, 8 * SIZE(CO) + + FADD y01, y09, y01 + FADD y02, y10, y02 + FADD y03, y11, y03 + FADD y04, y12, y04 + FADD y05, y13, y05 + FADD y06, y14, y06 + FADD y07, y15, y07 + FADD y08, y16, y08 + + FMADD a1, alpha, y01, a1 + FMADD a2, alpha, y02, a2 + FMADD a3, alpha, y03, a3 + FMADD a4, alpha, y04, a4 + FMADD a5, alpha, y05, a5 + FMADD a6, alpha, y06, a6 + FMADD a7, alpha, y07, a7 + FMADD a8, alpha, y08, a8 + + STFD a1, 1 * SIZE(CO) + STFD a2, 2 * SIZE(CO) + STFD a3, 3 * SIZE(CO) + STFD a4, 4 * SIZE(CO) + STFD a5, 5 * SIZE(CO) + STFD a6, 6 * SIZE(CO) + STFD a7, 7 * SIZE(CO) + STFD a8, 8 * SIZE(CO) + + addi J, J, -1 + addi CO, CO, 8 * SIZE + cmpi cr0, 0, J, 0 + bgt LL(11) + b LL(20) + .align 4 + +LL(19): + LFDUX a1, CO, INCY + LFDUX a2, CO, INCY + LFDUX a3, CO, INCY + LFDUX a4, CO, INCY + LFDUX a5, CO, INCY + LFDUX a6, CO, INCY + LFDUX a7, CO, INCY + LFDUX a8, CO, INCY + + FADD y01, y09, y01 + FADD y02, y10, y02 + FADD y03, y11, y03 + FADD y04, y12, y04 + FADD y05, y13, y05 + FADD y06, y14, y06 + FADD y07, y15, y07 + FADD y08, y16, y08 + + FMADD a1, alpha, f0, a1 + FMADD a2, alpha, f1, a2 + FMADD a3, alpha, f2, a3 + FMADD a4, alpha, f3, a4 + FMADD a5, alpha, f4, a5 + FMADD a6, alpha, f5, a6 + FMADD a7, alpha, f6, a7 + FMADD a8, alpha, f7, a8 + + STFDUX a1, BO, INCY + STFDUX a2, BO, INCY + STFDUX a3, BO, INCY + STFDUX a4, BO, INCY + STFDUX a5, BO, INCY + STFDUX a6, BO, INCY + STFDUX a7, BO, INCY + STFDUX a8, BO, INCY + + addi J, J, -1 + cmpi cr0, 0, J, 0 + bgt LL(11) + .align 4 + +LL(20): + andi. J, N, 7 + ble LL(99) + andi. J, N, 4 + ble LL(30) + + mr AO1, A + add AO2, A, LDA + add AO3, AO2, LDA + add AO4, AO3, LDA + add A, AO4, LDA + + mr BO, XP + + lfd y01, FZERO + fmr y02, y01 + fmr y03, y01 + fmr y04, y01 + fmr y09, y01 + fmr y10, y01 + fmr y11, y01 + fmr y12, y01 + + DCBT(Y1, PREC) + + srawi. 
r0, MIN_N, 4 + mtspr CTR, r0 + ble LL(24) + + LFD a1, 1 * SIZE(AO1) + LFD a2, 1 * SIZE(AO2) + LFD a3, 1 * SIZE(AO3) + LFD a4, 1 * SIZE(AO4) + LFD a5, 2 * SIZE(AO1) + LFD a6, 2 * SIZE(AO2) + LFD a7, 2 * SIZE(AO3) + LFD a8, 2 * SIZE(AO4) + + LFD b1, 1 * SIZE(BO) + LFD b2, 2 * SIZE(BO) + LFD b3, 3 * SIZE(BO) + LFD b4, 4 * SIZE(BO) + LFD b5, 5 * SIZE(BO) + LFD b6, 6 * SIZE(BO) + LFD b7, 7 * SIZE(BO) + LFD b8, 8 * SIZE(BO) + bdz LL(23) + .align 4 + +LL(22): + FMADD y01, a1, b1, y01 + LFD a1, 3 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFD a2, 3 * SIZE(AO2) + FMADD y03, a3, b1, y03 + LFD a3, 3 * SIZE(AO3) + FMADD y04, a4, b1, y04 + LFD a4, 3 * SIZE(AO4) + + FMADD y09, a5, b2, y09 + LFD a5, 4 * SIZE(AO1) + FMADD y10, a6, b2, y10 + LFD a6, 4 * SIZE(AO2) + FMADD y11, a7, b2, y11 + LFD a7, 4 * SIZE(AO3) + FMADD y12, a8, b2, y12 + LFD a8, 4 * SIZE(AO4) + + FMADD y01, a1, b3, y01 + LFD a1, 5 * SIZE(AO1) + FMADD y02, a2, b3, y02 + LFD a2, 5 * SIZE(AO2) + FMADD y03, a3, b3, y03 + LFD a3, 5 * SIZE(AO3) + FMADD y04, a4, b3, y04 + LFD a4, 5 * SIZE(AO4) + + FMADD y09, a5, b4, y09 + LFD a5, 6 * SIZE(AO1) + FMADD y10, a6, b4, y10 + LFD a6, 6 * SIZE(AO2) + FMADD y11, a7, b4, y11 + LFD a7, 6 * SIZE(AO3) + FMADD y12, a8, b4, y12 + LFD a8, 6 * SIZE(AO4) + + LFD b1, 9 * SIZE(BO) + LFD b2, 10 * SIZE(BO) + LFD b3, 11 * SIZE(BO) + LFD b4, 12 * SIZE(BO) + + FMADD y01, a1, b5, y01 + LFD a1, 7 * SIZE(AO1) + FMADD y02, a2, b5, y02 + LFD a2, 7 * SIZE(AO2) + FMADD y03, a3, b5, y03 + LFD a3, 7 * SIZE(AO3) + FMADD y04, a4, b5, y04 + LFD a4, 7 * SIZE(AO4) + + FMADD y09, a5, b6, y09 + LFD a5, 8 * SIZE(AO1) + FMADD y10, a6, b6, y10 + LFD a6, 8 * SIZE(AO2) + FMADD y11, a7, b6, y11 + LFD a7, 8 * SIZE(AO3) + FMADD y12, a8, b6, y12 + LFD a8, 8 * SIZE(AO4) + + FMADD y01, a1, b7, y01 + LFD a1, 9 * SIZE(AO1) + FMADD y02, a2, b7, y02 + LFD a2, 9 * SIZE(AO2) + FMADD y03, a3, b7, y03 + LFD a3, 9 * SIZE(AO3) + FMADD y04, a4, b7, y04 + LFD a4, 9 * SIZE(AO4) + + FMADD y09, a5, b8, y09 + LFD a5, 10 * SIZE(AO1) + FMADD y10, a6, b8, y10 + LFD a6, 10 * SIZE(AO2) + FMADD y11, a7, b8, y11 + LFD a7, 10 * SIZE(AO3) + FMADD y12, a8, b8, y12 + LFD a8, 10 * SIZE(AO4) + + LFD b5, 13 * SIZE(BO) + LFD b6, 14 * SIZE(BO) + LFD b7, 15 * SIZE(BO) + LFD b8, 16 * SIZE(BO) + + FMADD y01, a1, b1, y01 + LFD a1, 11 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFD a2, 11 * SIZE(AO2) + FMADD y03, a3, b1, y03 + LFD a3, 11 * SIZE(AO3) + FMADD y04, a4, b1, y04 + LFD a4, 11 * SIZE(AO4) + + FMADD y09, a5, b2, y09 + LFD a5, 12 * SIZE(AO1) + FMADD y10, a6, b2, y10 + LFD a6, 12 * SIZE(AO2) + FMADD y11, a7, b2, y11 + LFD a7, 12 * SIZE(AO3) + FMADD y12, a8, b2, y12 + LFD a8, 12 * SIZE(AO4) + + FMADD y01, a1, b3, y01 + LFD a1, 13 * SIZE(AO1) + FMADD y02, a2, b3, y02 + LFD a2, 13 * SIZE(AO2) + FMADD y03, a3, b3, y03 + LFD a3, 13 * SIZE(AO3) + FMADD y04, a4, b3, y04 + LFD a4, 13 * SIZE(AO4) + + FMADD y09, a5, b4, y09 + LFD a5, 14 * SIZE(AO1) + FMADD y10, a6, b4, y10 + LFD a6, 14 * SIZE(AO2) + FMADD y11, a7, b4, y11 + LFD a7, 14 * SIZE(AO3) + FMADD y12, a8, b4, y12 + LFD a8, 14 * SIZE(AO4) + + LFD b1, 17 * SIZE(BO) + LFD b2, 18 * SIZE(BO) + LFD b3, 19 * SIZE(BO) + LFD b4, 20 * SIZE(BO) + + FMADD y01, a1, b5, y01 + LFD a1, 15 * SIZE(AO1) + FMADD y02, a2, b5, y02 + LFD a2, 15 * SIZE(AO2) + FMADD y03, a3, b5, y03 + LFD a3, 15 * SIZE(AO3) + FMADD y04, a4, b5, y04 + LFD a4, 15 * SIZE(AO4) + + FMADD y09, a5, b6, y09 + LFD a5, 16 * SIZE(AO1) + FMADD y10, a6, b6, y10 + LFD a6, 16 * SIZE(AO2) + FMADD y11, a7, b6, y11 + LFD a7, 16 * SIZE(AO3) + FMADD y12, a8, b6, y12 + LFD a8, 16 * SIZE(AO4) + + 
FMADD y01, a1, b7, y01 + LFD a1, 17 * SIZE(AO1) + FMADD y02, a2, b7, y02 + LFD a2, 17 * SIZE(AO2) + FMADD y03, a3, b7, y03 + LFD a3, 17 * SIZE(AO3) + FMADD y04, a4, b7, y04 + LFD a4, 17 * SIZE(AO4) + + FMADD y09, a5, b8, y09 + LFD a5, 18 * SIZE(AO1) + FMADD y10, a6, b8, y10 + LFD a6, 18 * SIZE(AO2) + FMADD y11, a7, b8, y11 + LFD a7, 18 * SIZE(AO3) + FMADD y12, a8, b8, y12 + LFD a8, 18 * SIZE(AO4) + + LFD b5, 21 * SIZE(BO) + LFD b6, 22 * SIZE(BO) + LFD b7, 23 * SIZE(BO) + LFD b8, 24 * SIZE(BO) + + addi AO1, AO1, 16 * SIZE + addi AO2, AO2, 16 * SIZE + DCBT(AO1, PREA) + DCBT(AO2, PREA) + + addi AO3, AO3, 16 * SIZE + addi AO4, AO4, 16 * SIZE + DCBT(AO3, PREA) + DCBT(AO4, PREA) + + addi BO, BO, 16 * SIZE + bdnz LL(22) + .align 4 + +LL(23): + FMADD y01, a1, b1, y01 + LFD a1, 3 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFD a2, 3 * SIZE(AO2) + FMADD y03, a3, b1, y03 + LFD a3, 3 * SIZE(AO3) + FMADD y04, a4, b1, y04 + LFD a4, 3 * SIZE(AO4) + + FMADD y09, a5, b2, y09 + LFD a5, 4 * SIZE(AO1) + FMADD y10, a6, b2, y10 + LFD a6, 4 * SIZE(AO2) + FMADD y11, a7, b2, y11 + LFD a7, 4 * SIZE(AO3) + FMADD y12, a8, b2, y12 + LFD a8, 4 * SIZE(AO4) + + FMADD y01, a1, b3, y01 + LFD a1, 5 * SIZE(AO1) + FMADD y02, a2, b3, y02 + LFD a2, 5 * SIZE(AO2) + FMADD y03, a3, b3, y03 + LFD a3, 5 * SIZE(AO3) + FMADD y04, a4, b3, y04 + LFD a4, 5 * SIZE(AO4) + + FMADD y09, a5, b4, y09 + LFD a5, 6 * SIZE(AO1) + FMADD y10, a6, b4, y10 + LFD a6, 6 * SIZE(AO2) + FMADD y11, a7, b4, y11 + LFD a7, 6 * SIZE(AO3) + FMADD y12, a8, b4, y12 + LFD a8, 6 * SIZE(AO4) + + LFD b1, 9 * SIZE(BO) + LFD b2, 10 * SIZE(BO) + LFD b3, 11 * SIZE(BO) + LFD b4, 12 * SIZE(BO) + + FMADD y01, a1, b5, y01 + LFD a1, 7 * SIZE(AO1) + FMADD y02, a2, b5, y02 + LFD a2, 7 * SIZE(AO2) + FMADD y03, a3, b5, y03 + LFD a3, 7 * SIZE(AO3) + FMADD y04, a4, b5, y04 + LFD a4, 7 * SIZE(AO4) + + FMADD y09, a5, b6, y09 + LFD a5, 8 * SIZE(AO1) + FMADD y10, a6, b6, y10 + LFD a6, 8 * SIZE(AO2) + FMADD y11, a7, b6, y11 + LFD a7, 8 * SIZE(AO3) + FMADD y12, a8, b6, y12 + LFD a8, 8 * SIZE(AO4) + + FMADD y01, a1, b7, y01 + LFD a1, 9 * SIZE(AO1) + FMADD y02, a2, b7, y02 + LFD a2, 9 * SIZE(AO2) + FMADD y03, a3, b7, y03 + LFD a3, 9 * SIZE(AO3) + FMADD y04, a4, b7, y04 + LFD a4, 9 * SIZE(AO4) + + FMADD y09, a5, b8, y09 + LFD a5, 10 * SIZE(AO1) + FMADD y10, a6, b8, y10 + LFD a6, 10 * SIZE(AO2) + FMADD y11, a7, b8, y11 + LFD a7, 10 * SIZE(AO3) + FMADD y12, a8, b8, y12 + LFD a8, 10 * SIZE(AO4) + + LFD b5, 13 * SIZE(BO) + LFD b6, 14 * SIZE(BO) + LFD b7, 15 * SIZE(BO) + LFD b8, 16 * SIZE(BO) + + FMADD y01, a1, b1, y01 + LFD a1, 11 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFD a2, 11 * SIZE(AO2) + FMADD y03, a3, b1, y03 + LFD a3, 11 * SIZE(AO3) + FMADD y04, a4, b1, y04 + LFD a4, 11 * SIZE(AO4) + + FMADD y09, a5, b2, y09 + LFD a5, 12 * SIZE(AO1) + FMADD y10, a6, b2, y10 + LFD a6, 12 * SIZE(AO2) + FMADD y11, a7, b2, y11 + LFD a7, 12 * SIZE(AO3) + FMADD y12, a8, b2, y12 + LFD a8, 12 * SIZE(AO4) + + FMADD y01, a1, b3, y01 + LFD a1, 13 * SIZE(AO1) + FMADD y02, a2, b3, y02 + LFD a2, 13 * SIZE(AO2) + FMADD y03, a3, b3, y03 + LFD a3, 13 * SIZE(AO3) + FMADD y04, a4, b3, y04 + LFD a4, 13 * SIZE(AO4) + + FMADD y09, a5, b4, y09 + LFD a5, 14 * SIZE(AO1) + FMADD y10, a6, b4, y10 + LFD a6, 14 * SIZE(AO2) + FMADD y11, a7, b4, y11 + LFD a7, 14 * SIZE(AO3) + FMADD y12, a8, b4, y12 + LFD a8, 14 * SIZE(AO4) + + FMADD y01, a1, b5, y01 + LFD a1, 15 * SIZE(AO1) + FMADD y02, a2, b5, y02 + LFD a2, 15 * SIZE(AO2) + FMADD y03, a3, b5, y03 + LFD a3, 15 * SIZE(AO3) + FMADD y04, a4, b5, y04 + LFD a4, 15 * SIZE(AO4) + + FMADD y09, a5, 
b6, y09 + LFD a5, 16 * SIZE(AO1) + FMADD y10, a6, b6, y10 + LFD a6, 16 * SIZE(AO2) + FMADD y11, a7, b6, y11 + LFD a7, 16 * SIZE(AO3) + FMADD y12, a8, b6, y12 + LFD a8, 16 * SIZE(AO4) + + FMADD y01, a1, b7, y01 + FMADD y02, a2, b7, y02 + FMADD y03, a3, b7, y03 + FMADD y04, a4, b7, y04 + + FMADD y09, a5, b8, y09 + FMADD y10, a6, b8, y10 + FMADD y11, a7, b8, y11 + FMADD y12, a8, b8, y12 + + addi AO1, AO1, 16 * SIZE + addi AO2, AO2, 16 * SIZE + addi AO3, AO3, 16 * SIZE + addi AO4, AO4, 16 * SIZE + addi BO, BO, 16 * SIZE + .align 4 + +LL(24): + andi. r0, MIN_N, 15 + ble LL(28) + + andi. r0, MIN_N, 8 + ble LL(25) + + LFD a1, 1 * SIZE(AO1) + LFD a2, 1 * SIZE(AO2) + LFD a3, 1 * SIZE(AO3) + LFD a4, 1 * SIZE(AO4) + + LFD b1, 1 * SIZE(BO) + LFD b2, 2 * SIZE(BO) + LFD b3, 3 * SIZE(BO) + LFD b4, 4 * SIZE(BO) + + LFD a5, 2 * SIZE(AO1) + LFD a6, 2 * SIZE(AO2) + LFD a7, 2 * SIZE(AO3) + LFD a8, 2 * SIZE(AO4) + + FMADD y01, a1, b1, y01 + LFD a1, 3 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFD a2, 3 * SIZE(AO2) + FMADD y03, a3, b1, y03 + LFD a3, 3 * SIZE(AO3) + FMADD y04, a4, b1, y04 + LFD a4, 3 * SIZE(AO4) + + FMADD y09, a5, b2, y09 + LFD a5, 4 * SIZE(AO1) + FMADD y10, a6, b2, y10 + LFD a6, 4 * SIZE(AO2) + FMADD y11, a7, b2, y11 + LFD a7, 4 * SIZE(AO3) + FMADD y12, a8, b2, y12 + LFD a8, 4 * SIZE(AO4) + + FMADD y01, a1, b3, y01 + LFD a1, 5 * SIZE(AO1) + FMADD y02, a2, b3, y02 + LFD a2, 5 * SIZE(AO2) + FMADD y03, a3, b3, y03 + LFD a3, 5 * SIZE(AO3) + FMADD y04, a4, b3, y04 + LFD a4, 5 * SIZE(AO4) + + FMADD y09, a5, b4, y09 + LFD a5, 6 * SIZE(AO1) + FMADD y10, a6, b4, y10 + LFD a6, 6 * SIZE(AO2) + FMADD y11, a7, b4, y11 + LFD a7, 6 * SIZE(AO3) + FMADD y12, a8, b4, y12 + LFD a8, 6 * SIZE(AO4) + + LFD b1, 5 * SIZE(BO) + LFD b2, 6 * SIZE(BO) + LFD b3, 7 * SIZE(BO) + LFD b4, 8 * SIZE(BO) + + FMADD y01, a1, b1, y01 + LFD a1, 7 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFD a2, 7 * SIZE(AO2) + FMADD y03, a3, b1, y03 + LFD a3, 7 * SIZE(AO3) + FMADD y04, a4, b1, y04 + LFD a4, 7 * SIZE(AO4) + + FMADD y09, a5, b2, y09 + LFD a5, 8 * SIZE(AO1) + FMADD y10, a6, b2, y10 + LFD a6, 8 * SIZE(AO2) + FMADD y11, a7, b2, y11 + LFD a7, 8 * SIZE(AO3) + FMADD y12, a8, b2, y12 + LFD a8, 8 * SIZE(AO4) + + FMADD y01, a1, b3, y01 + FMADD y02, a2, b3, y02 + FMADD y03, a3, b3, y03 + FMADD y04, a4, b3, y04 + + FMADD y09, a5, b4, y09 + addi AO1, AO1, 8 * SIZE + FMADD y10, a6, b4, y10 + addi AO2, AO2, 8 * SIZE + FMADD y11, a7, b4, y11 + addi AO3, AO3, 8 * SIZE + FMADD y12, a8, b4, y12 + addi AO4, AO4, 8 * SIZE + + addi BO, BO, 8 * SIZE + .align 4 + +LL(25): + andi. 
r0, MIN_N, 4 + ble LL(26) + + LFD a1, 1 * SIZE(AO1) + LFD a2, 1 * SIZE(AO2) + LFD a3, 1 * SIZE(AO3) + LFD a4, 1 * SIZE(AO4) + + LFD b1, 1 * SIZE(BO) + LFD b2, 2 * SIZE(BO) + LFD b3, 3 * SIZE(BO) + LFD b4, 4 * SIZE(BO) + + LFD a5, 2 * SIZE(AO1) + LFD a6, 2 * SIZE(AO2) + LFD a7, 2 * SIZE(AO3) + LFD a8, 2 * SIZE(AO4) + + FMADD y01, a1, b1, y01 + LFD a1, 3 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFD a2, 3 * SIZE(AO2) + FMADD y03, a3, b1, y03 + LFD a3, 3 * SIZE(AO3) + FMADD y04, a4, b1, y04 + LFD a4, 3 * SIZE(AO4) + + FMADD y09, a5, b2, y09 + LFD a5, 4 * SIZE(AO1) + FMADD y10, a6, b2, y10 + LFD a6, 4 * SIZE(AO2) + FMADD y11, a7, b2, y11 + LFD a7, 4 * SIZE(AO3) + FMADD y12, a8, b2, y12 + LFD a8, 4 * SIZE(AO4) + + FMADD y01, a1, b3, y01 + FMADD y02, a2, b3, y02 + FMADD y03, a3, b3, y03 + FMADD y04, a4, b3, y04 + + FMADD y09, a5, b4, y09 + addi AO1, AO1, 4 * SIZE + FMADD y10, a6, b4, y10 + addi AO2, AO2, 4 * SIZE + FMADD y11, a7, b4, y11 + addi AO3, AO3, 4 * SIZE + FMADD y12, a8, b4, y12 + addi AO4, AO4, 4 * SIZE + addi BO, BO, 4 * SIZE + .align 4 + +LL(26): + andi. r0, MIN_N, 2 + ble LL(27) + + LFD a1, 1 * SIZE(AO1) + LFD a2, 1 * SIZE(AO2) + LFD b1, 1 * SIZE(BO) + LFD b2, 2 * SIZE(BO) + + LFD a3, 1 * SIZE(AO3) + LFD a4, 1 * SIZE(AO4) + + LFD a5, 2 * SIZE(AO1) + LFD a6, 2 * SIZE(AO2) + LFD a7, 2 * SIZE(AO3) + LFD a8, 2 * SIZE(AO4) + + FMADD y01, a1, b1, y01 + FMADD y02, a2, b1, y02 + FMADD y03, a3, b1, y03 + FMADD y04, a4, b1, y04 + + FMADD y09, a5, b2, y09 + addi AO1, AO1, 2 * SIZE + FMADD y10, a6, b2, y10 + addi AO2, AO2, 2 * SIZE + FMADD y11, a7, b2, y11 + addi AO3, AO3, 2 * SIZE + FMADD y12, a8, b2, y12 + addi AO4, AO4, 2 * SIZE + addi BO, BO, 2 * SIZE + .align 4 + +LL(27): + andi. r0, MIN_N, 1 + ble LL(28) + + LFD a1, 1 * SIZE(AO1) + LFD b1, 1 * SIZE(BO) + + LFD a2, 1 * SIZE(AO2) + LFD a3, 1 * SIZE(AO3) + LFD a4, 1 * SIZE(AO4) + + FMADD y01, a1, b1, y01 + FMADD y02, a2, b1, y02 + FMADD y03, a3, b1, y03 + FMADD y04, a4, b1, y04 + .align 4 + +LL(28): + mr BO, CO + lfd alpha, ALPHA + cmpi cr0, 0, INCY, SIZE + bne LL(29) + + LFD a1, 1 * SIZE(CO) + LFD a2, 2 * SIZE(CO) + LFD a3, 3 * SIZE(CO) + LFD a4, 4 * SIZE(CO) + + FADD y01, y09, y01 + FADD y02, y10, y02 + FADD y03, y11, y03 + FADD y04, y12, y04 + + FMADD a1, alpha, y01, a1 + FMADD a2, alpha, y02, a2 + FMADD a3, alpha, y03, a3 + FMADD a4, alpha, y04, a4 + + STFD a1, 1 * SIZE(CO) + STFD a2, 2 * SIZE(CO) + STFD a3, 3 * SIZE(CO) + STFD a4, 4 * SIZE(CO) + + addi CO, CO, 4 * SIZE + b LL(30) + .align 4 + +LL(29): + LFDUX a1, CO, INCY + LFDUX a2, CO, INCY + LFDUX a3, CO, INCY + LFDUX a4, CO, INCY + + FADD y01, y09, y01 + FADD y02, y10, y02 + FADD y03, y11, y03 + FADD y04, y12, y04 + + FMADD a1, alpha, f0, a1 + FMADD a2, alpha, f1, a2 + FMADD a3, alpha, f2, a3 + FMADD a4, alpha, f3, a4 + + STFDUX a1, BO, INCY + STFDUX a2, BO, INCY + STFDUX a3, BO, INCY + STFDUX a4, BO, INCY + .align 4 + +LL(30): + andi. J, N, 2 + ble LL(40) + + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + mr BO, XP + + lfd y01, FZERO + fmr y02, y01 + fmr y03, y01 + fmr y04, y01 + fmr y09, y01 + fmr y10, y01 + fmr y11, y01 + fmr y12, y01 + + DCBT(Y1, PREC) + + srawi. 
r0, MIN_N, 4 + mtspr CTR, r0 + ble LL(34) + + LFD a1, 1 * SIZE(AO1) + LFD a2, 1 * SIZE(AO2) + LFD a3, 2 * SIZE(AO1) + LFD a4, 2 * SIZE(AO2) + LFD a5, 3 * SIZE(AO1) + LFD a6, 3 * SIZE(AO2) + LFD a7, 4 * SIZE(AO1) + LFD a8, 4 * SIZE(AO2) + + LFD b1, 1 * SIZE(BO) + LFD b2, 2 * SIZE(BO) + LFD b3, 3 * SIZE(BO) + LFD b4, 4 * SIZE(BO) + LFD b5, 5 * SIZE(BO) + LFD b6, 6 * SIZE(BO) + LFD b7, 7 * SIZE(BO) + LFD b8, 8 * SIZE(BO) + bdz LL(33) + .align 4 + +LL(32): + FMADD y01, a1, b1, y01 + LFD a1, 5 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFD a2, 5 * SIZE(AO2) + FMADD y03, a3, b2, y03 + LFD a3, 6 * SIZE(AO1) + FMADD y04, a4, b2, y04 + LFD a4, 6 * SIZE(AO2) + + FMADD y09, a5, b3, y09 + LFD a5, 7 * SIZE(AO1) + FMADD y10, a6, b3, y10 + LFD a6, 7 * SIZE(AO2) + FMADD y11, a7, b4, y11 + LFD a7, 8 * SIZE(AO1) + FMADD y12, a8, b4, y12 + LFD a8, 8 * SIZE(AO2) + + LFD b1, 9 * SIZE(BO) + LFD b2, 10 * SIZE(BO) + LFD b3, 11 * SIZE(BO) + LFD b4, 12 * SIZE(BO) + + FMADD y01, a1, b5, y01 + LFD a1, 9 * SIZE(AO1) + FMADD y02, a2, b5, y02 + LFD a2, 9 * SIZE(AO2) + FMADD y03, a3, b6, y03 + LFD a3, 10 * SIZE(AO1) + FMADD y04, a4, b6, y04 + LFD a4, 10 * SIZE(AO2) + + FMADD y09, a5, b7, y09 + LFD a5, 11 * SIZE(AO1) + FMADD y10, a6, b7, y10 + LFD a6, 11 * SIZE(AO2) + FMADD y11, a7, b8, y11 + LFD a7, 12 * SIZE(AO1) + FMADD y12, a8, b8, y12 + LFD a8, 12 * SIZE(AO2) + + LFD b5, 13 * SIZE(BO) + LFD b6, 14 * SIZE(BO) + LFD b7, 15 * SIZE(BO) + LFD b8, 16 * SIZE(BO) + + FMADD y01, a1, b1, y01 + LFD a1, 13 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFD a2, 13 * SIZE(AO2) + FMADD y03, a3, b2, y03 + LFD a3, 14 * SIZE(AO1) + FMADD y04, a4, b2, y04 + LFD a4, 14 * SIZE(AO2) + + FMADD y09, a5, b3, y09 + LFD a5, 15 * SIZE(AO1) + FMADD y10, a6, b3, y10 + LFD a6, 15 * SIZE(AO2) + FMADD y11, a7, b4, y11 + LFD a7, 16 * SIZE(AO1) + FMADD y12, a8, b4, y12 + LFD a8, 16 * SIZE(AO2) + + LFD b1, 17 * SIZE(BO) + LFD b2, 18 * SIZE(BO) + LFD b3, 19 * SIZE(BO) + LFD b4, 20 * SIZE(BO) + + FMADD y01, a1, b5, y01 + LFD a1, 17 * SIZE(AO1) + FMADD y02, a2, b5, y02 + LFD a2, 17 * SIZE(AO2) + FMADD y03, a3, b6, y03 + LFD a3, 18 * SIZE(AO1) + FMADD y04, a4, b6, y04 + LFD a4, 18 * SIZE(AO2) + + FMADD y09, a5, b7, y09 + LFD a5, 19 * SIZE(AO1) + FMADD y10, a6, b7, y10 + LFD a6, 19 * SIZE(AO2) + FMADD y11, a7, b8, y11 + LFD a7, 20 * SIZE(AO1) + FMADD y12, a8, b8, y12 + LFD a8, 20 * SIZE(AO2) + + LFD b5, 21 * SIZE(BO) + LFD b6, 22 * SIZE(BO) + LFD b7, 23 * SIZE(BO) + LFD b8, 24 * SIZE(BO) + + addi AO1, AO1, 16 * SIZE + addi AO2, AO2, 16 * SIZE + DCBT(AO1, PREA) + DCBT(AO2, PREA) + + addi BO, BO, 16 * SIZE + bdnz LL(32) + .align 4 + +LL(33): + FMADD y01, a1, b1, y01 + LFD a1, 5 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFD a2, 5 * SIZE(AO2) + FMADD y03, a3, b2, y03 + LFD a3, 6 * SIZE(AO1) + FMADD y04, a4, b2, y04 + LFD a4, 6 * SIZE(AO2) + + FMADD y09, a5, b3, y09 + LFD a5, 7 * SIZE(AO1) + FMADD y10, a6, b3, y10 + LFD a6, 7 * SIZE(AO2) + FMADD y11, a7, b4, y11 + LFD a7, 8 * SIZE(AO1) + FMADD y12, a8, b4, y12 + LFD a8, 8 * SIZE(AO2) + + LFD b1, 9 * SIZE(BO) + LFD b2, 10 * SIZE(BO) + LFD b3, 11 * SIZE(BO) + LFD b4, 12 * SIZE(BO) + + FMADD y01, a1, b5, y01 + LFD a1, 9 * SIZE(AO1) + FMADD y02, a2, b5, y02 + LFD a2, 9 * SIZE(AO2) + FMADD y03, a3, b6, y03 + LFD a3, 10 * SIZE(AO1) + FMADD y04, a4, b6, y04 + LFD a4, 10 * SIZE(AO2) + + FMADD y09, a5, b7, y09 + LFD a5, 11 * SIZE(AO1) + FMADD y10, a6, b7, y10 + LFD a6, 11 * SIZE(AO2) + FMADD y11, a7, b8, y11 + LFD a7, 12 * SIZE(AO1) + FMADD y12, a8, b8, y12 + LFD a8, 12 * SIZE(AO2) + + LFD b5, 13 * SIZE(BO) + LFD b6, 14 * SIZE(BO) + 
LFD b7, 15 * SIZE(BO) + LFD b8, 16 * SIZE(BO) + + FMADD y01, a1, b1, y01 + LFD a1, 13 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFD a2, 13 * SIZE(AO2) + FMADD y03, a3, b2, y03 + LFD a3, 14 * SIZE(AO1) + FMADD y04, a4, b2, y04 + LFD a4, 14 * SIZE(AO2) + + FMADD y09, a5, b3, y09 + LFD a5, 15 * SIZE(AO1) + FMADD y10, a6, b3, y10 + LFD a6, 15 * SIZE(AO2) + FMADD y11, a7, b4, y11 + LFD a7, 16 * SIZE(AO1) + FMADD y12, a8, b4, y12 + LFD a8, 16 * SIZE(AO2) + + FMADD y01, a1, b5, y01 + FMADD y02, a2, b5, y02 + FMADD y03, a3, b6, y03 + FMADD y04, a4, b6, y04 + + FMADD y09, a5, b7, y09 + FMADD y10, a6, b7, y10 + FMADD y11, a7, b8, y11 + FMADD y12, a8, b8, y12 + + addi AO1, AO1, 16 * SIZE + addi AO2, AO2, 16 * SIZE + addi BO, BO, 16 * SIZE + .align 4 + +LL(34): + andi. r0, MIN_N, 15 + ble LL(38) + andi. r0, MIN_N, 8 + ble LL(35) + + LFD a1, 1 * SIZE(AO1) + LFD a2, 1 * SIZE(AO2) + LFD a3, 2 * SIZE(AO1) + LFD a4, 2 * SIZE(AO2) + + LFD b1, 1 * SIZE(BO) + LFD b2, 2 * SIZE(BO) + LFD b3, 3 * SIZE(BO) + LFD b4, 4 * SIZE(BO) + + LFD a5, 3 * SIZE(AO1) + LFD a6, 3 * SIZE(AO2) + LFD a7, 4 * SIZE(AO1) + LFD a8, 4 * SIZE(AO2) + + LFD b5, 5 * SIZE(BO) + LFD b6, 6 * SIZE(BO) + LFD b7, 7 * SIZE(BO) + LFD b8, 8 * SIZE(BO) + + FMADD y01, a1, b1, y01 + LFD a1, 5 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFD a2, 5 * SIZE(AO2) + FMADD y09, a3, b2, y09 + LFD a3, 6 * SIZE(AO1) + FMADD y10, a4, b2, y10 + LFD a4, 6 * SIZE(AO2) + + FMADD y01, a5, b3, y01 + LFD a5, 7 * SIZE(AO1) + FMADD y02, a6, b3, y02 + LFD a6, 7 * SIZE(AO2) + FMADD y09, a7, b4, y09 + LFD a7, 8 * SIZE(AO1) + FMADD y10, a8, b4, y10 + LFD a8, 8 * SIZE(AO2) + + FMADD y01, a1, b5, y01 + FMADD y02, a2, b5, y02 + FMADD y09, a3, b6, y09 + FMADD y10, a4, b6, y10 + + FMADD y01, a5, b7, y01 + addi AO1, AO1, 8 * SIZE + FMADD y02, a6, b7, y02 + addi AO2, AO2, 8 * SIZE + FMADD y09, a7, b8, y09 + addi BO, BO, 8 * SIZE + FMADD y10, a8, b8, y10 + nop + .align 4 + +LL(35): + andi. r0, MIN_N, 4 + ble LL(36) + + LFD a1, 1 * SIZE(AO1) + LFD a2, 1 * SIZE(AO2) + LFD a3, 2 * SIZE(AO1) + LFD a4, 2 * SIZE(AO2) + + LFD a5, 3 * SIZE(AO1) + LFD a6, 3 * SIZE(AO2) + LFD a7, 4 * SIZE(AO1) + LFD a8, 4 * SIZE(AO2) + + LFD b1, 1 * SIZE(BO) + LFD b2, 2 * SIZE(BO) + LFD b3, 3 * SIZE(BO) + LFD b4, 4 * SIZE(BO) + + FMADD y01, a1, b1, y01 + FMADD y02, a2, b1, y02 + FMADD y09, a3, b2, y09 + FMADD y10, a4, b2, y10 + + FMADD y01, a5, b3, y01 + addi AO1, AO1, 4 * SIZE + FMADD y02, a6, b3, y02 + addi AO2, AO2, 4 * SIZE + + FMADD y09, a7, b4, y09 + addi BO, BO, 4 * SIZE + FMADD y10, a8, b4, y10 + .align 4 + +LL(36): + andi. r0, MIN_N, 2 + ble LL(37) + + LFD a1, 1 * SIZE(AO1) + LFD a2, 1 * SIZE(AO2) + LFD b1, 1 * SIZE(BO) + LFD b2, 2 * SIZE(BO) + + LFD a3, 2 * SIZE(AO1) + LFD a4, 2 * SIZE(AO2) + + FMADD y01, a1, b1, y01 + FMADD y02, a2, b1, y02 + FMADD y09, a3, b2, y09 + FMADD y10, a4, b2, y10 + + addi AO1, AO1, 2 * SIZE + addi AO2, AO2, 2 * SIZE + addi BO, BO, 2 * SIZE + .align 4 + +LL(37): + andi. 
r0, MIN_N, 1 + ble LL(38) + + LFD a1, 1 * SIZE(AO1) + LFD b1, 1 * SIZE(BO) + LFD a2, 1 * SIZE(AO2) + + FMADD y01, a1, b1, y01 + FMADD y02, a2, b1, y02 + .align 4 + +LL(38): + mr BO, CO + lfd alpha, ALPHA + cmpi cr0, 0, INCY, SIZE + bne LL(39) + + LFD a1, 1 * SIZE(CO) + LFD a2, 2 * SIZE(CO) + + FADD y01, y03, y01 + FADD y02, y04, y02 + FADD y09, y11, y09 + FADD y10, y12, y10 + + FADD y01, y09, y01 + FADD y02, y10, y02 + + FMADD a1, alpha, y01, a1 + FMADD a2, alpha, y02, a2 + + STFD a1, 1 * SIZE(CO) + STFD a2, 2 * SIZE(CO) + + addi CO, CO, 2 * SIZE + b LL(40) + .align 4 + +LL(39): + LFDUX a1, CO, INCY + LFDUX a2, CO, INCY + + FADD y01, y03, y01 + FADD y02, y04, y02 + FADD y09, y11, y09 + FADD y10, y12, y10 + + FADD y01, y09, y01 + FADD y02, y10, y02 + + FMADD a1, alpha, f0, a1 + FMADD a2, alpha, f1, a2 + + STFDUX a1, BO, INCY + STFDUX a2, BO, INCY + .align 4 + +LL(40): + andi. J, N, 1 + ble LL(99) + + mr AO1, A + add A, A, LDA + mr BO, XP + + lfd y01, FZERO + fmr y02, y01 + fmr y03, y01 + fmr y04, y01 + fmr y09, y01 + fmr y10, y01 + fmr y11, y01 + fmr y12, y01 + + DCBT(Y1, PREC) + + srawi. r0, MIN_N, 4 + mtspr CTR, r0 + ble LL(44) + + LFD a1, 1 * SIZE(AO1) + LFD a2, 2 * SIZE(AO1) + LFD a3, 3 * SIZE(AO1) + LFD a4, 4 * SIZE(AO1) + LFD a5, 5 * SIZE(AO1) + LFD a6, 6 * SIZE(AO1) + LFD a7, 7 * SIZE(AO1) + LFD a8, 8 * SIZE(AO1) + + LFD b1, 1 * SIZE(BO) + LFD b2, 2 * SIZE(BO) + LFD b3, 3 * SIZE(BO) + LFD b4, 4 * SIZE(BO) + LFD b5, 5 * SIZE(BO) + LFD b6, 6 * SIZE(BO) + LFD b7, 7 * SIZE(BO) + LFD b8, 8 * SIZE(BO) + bdz LL(43) + .align 4 + +LL(42): + FMADD y01, a1, b1, y01 + nop + LFD a1, 9 * SIZE(AO1) + LFD b1, 9 * SIZE(BO) + + FMADD y02, a2, b2, y02 + nop + LFD a2, 10 * SIZE(AO1) + LFD b2, 10 * SIZE(BO) + + FMADD y03, a3, b3, y03 + nop + LFD a3, 11 * SIZE(AO1) + LFD b3, 11 * SIZE(BO) + + FMADD y04, a4, b4, y04 + nop + LFD a4, 12 * SIZE(AO1) + LFD b4, 12 * SIZE(BO) + + FMADD y01, a5, b5, y01 + nop + LFD a5, 13 * SIZE(AO1) + LFD b5, 13 * SIZE(BO) + + FMADD y02, a6, b6, y02 + nop + LFD a6, 14 * SIZE(AO1) + LFD b6, 14 * SIZE(BO) + + FMADD y03, a7, b7, y03 + nop + LFD a7, 15 * SIZE(AO1) + LFD b7, 15 * SIZE(BO) + + FMADD y04, a8, b8, y04 + nop + LFD a8, 16 * SIZE(AO1) + LFD b8, 16 * SIZE(BO) + + FMADD y01, a1, b1, y01 + nop + LFD a1, 17 * SIZE(AO1) + LFD b1, 17 * SIZE(BO) + + FMADD y02, a2, b2, y02 + nop + LFD a2, 18 * SIZE(AO1) + LFD b2, 18 * SIZE(BO) + + FMADD y03, a3, b3, y03 + nop + LFD a3, 19 * SIZE(AO1) + LFD b3, 19 * SIZE(BO) + + FMADD y04, a4, b4, y04 + nop + LFD a4, 20 * SIZE(AO1) + LFD b4, 20 * SIZE(BO) + + FMADD y01, a5, b5, y01 + nop + LFD a5, 21 * SIZE(AO1) + LFD b5, 21 * SIZE(BO) + + FMADD y02, a6, b6, y02 + nop + LFD a6, 22 * SIZE(AO1) + LFD b6, 22 * SIZE(BO) + + FMADD y03, a7, b7, y03 + nop + LFD a7, 23 * SIZE(AO1) + LFD b7, 23 * SIZE(BO) + + FMADD y04, a8, b8, y04 + nop + LFD a8, 24 * SIZE(AO1) + LFD b8, 24 * SIZE(BO) + + addi AO1, AO1, 16 * SIZE + addi BO, BO, 16 * SIZE + DCBT(AO1, PREA) + bdnz LL(42) + .align 4 + +LL(43): + FMADD y01, a1, b1, y01 + nop + LFD a1, 9 * SIZE(AO1) + LFD b1, 9 * SIZE(BO) + + FMADD y02, a2, b2, y02 + nop + LFD a2, 10 * SIZE(AO1) + LFD b2, 10 * SIZE(BO) + + FMADD y03, a3, b3, y03 + nop + LFD a3, 11 * SIZE(AO1) + LFD b3, 11 * SIZE(BO) + + FMADD y04, a4, b4, y04 + nop + LFD a4, 12 * SIZE(AO1) + LFD b4, 12 * SIZE(BO) + + FMADD y01, a5, b5, y01 + nop + LFD a5, 13 * SIZE(AO1) + LFD b5, 13 * SIZE(BO) + + FMADD y02, a6, b6, y02 + nop + LFD a6, 14 * SIZE(AO1) + LFD b6, 14 * SIZE(BO) + + FMADD y03, a7, b7, y03 + nop + LFD a7, 15 * SIZE(AO1) + LFD b7, 15 * SIZE(BO) + + 
FMADD y04, a8, b8, y04 + nop + LFD a8, 16 * SIZE(AO1) + LFD b8, 16 * SIZE(BO) + + FMADD y01, a1, b1, y01 + FMADD y02, a2, b2, y02 + FMADD y03, a3, b3, y03 + FMADD y04, a4, b4, y04 + + FMADD y01, a5, b5, y01 + addi AO1, AO1, 16 * SIZE + FMADD y02, a6, b6, y02 + addi BO, BO, 16 * SIZE + + FMADD y03, a7, b7, y03 + nop + FMADD y04, a8, b8, y04 + nop + .align 4 + +LL(44): + andi. r0, MIN_N, 15 + ble LL(48) + andi. r0, MIN_N, 8 + ble LL(45) + + LFD a1, 1 * SIZE(AO1) + LFD a2, 2 * SIZE(AO1) + LFD a3, 3 * SIZE(AO1) + LFD a4, 4 * SIZE(AO1) + + LFD b1, 1 * SIZE(BO) + LFD b2, 2 * SIZE(BO) + LFD b3, 3 * SIZE(BO) + LFD b4, 4 * SIZE(BO) + + LFD a5, 5 * SIZE(AO1) + LFD a6, 6 * SIZE(AO1) + LFD a7, 7 * SIZE(AO1) + LFD a8, 8 * SIZE(AO1) + + LFD b5, 5 * SIZE(BO) + LFD b6, 6 * SIZE(BO) + LFD b7, 7 * SIZE(BO) + LFD b8, 8 * SIZE(BO) + + FMADD y01, a1, b1, y01 + FMADD y02, a2, b2, y02 + FMADD y03, a3, b3, y03 + FMADD y04, a4, b4, y04 + + FMADD y01, a5, b5, y01 + addi AO1, AO1, 8 * SIZE + FMADD y02, a6, b6, y02 + addi BO, BO, 8 * SIZE + FMADD y03, a7, b7, y03 + nop + FMADD y04, a8, b8, y04 + nop + .align 4 + +LL(45): + andi. r0, MIN_N, 4 + ble LL(46) + + LFD a1, 1 * SIZE(AO1) + LFD b1, 1 * SIZE(BO) + LFD a2, 2 * SIZE(AO1) + LFD b2, 2 * SIZE(BO) + + LFD a3, 3 * SIZE(AO1) + LFD b3, 3 * SIZE(BO) + LFD a4, 4 * SIZE(AO1) + LFD b4, 4 * SIZE(BO) + + FMADD y01, a1, b1, y01 + addi AO1, AO1, 4 * SIZE + FMADD y02, a2, b2, y02 + addi AO2, AO2, 4 * SIZE + + FMADD y03, a3, b3, y03 + addi BO, BO, 4 * SIZE + FMADD y04, a4, b4, y04 + nop + .align 4 + +LL(46): + andi. r0, MIN_N, 2 + ble LL(47) + + LFD a1, 1 * SIZE(AO1) + LFD b1, 1 * SIZE(BO) + LFD a2, 2 * SIZE(AO1) + LFD b2, 2 * SIZE(BO) + + FMADD y01, a1, b1, y01 + addi AO1, AO1, 2 * SIZE + FMADD y02, a2, b2, y02 + addi BO, BO, 2 * SIZE + .align 4 + +LL(47): + andi. 
r0, MIN_N, 1 + ble LL(48) + + LFD a1, 1 * SIZE(AO1) + LFD b1, 1 * SIZE(BO) + FMADD y01, a1, b1, y01 + .align 4 + +LL(48): + mr BO, CO + lfd alpha, ALPHA + cmpi cr0, 0, INCY, SIZE + bne LL(49) + + LFD a1, 1 * SIZE(CO) + + FADD y01, y02, y01 + FADD y03, y04, y03 + FADD y01, y03, y01 + + FMADD a1, alpha, y01, a1 + STFD a1, 1 * SIZE(CO) + b LL(99) + .align 4 + +LL(49): + LFDUX a1, CO, INCY + FADD y01, y02, y01 + FADD y03, y04, y03 + FADD y01, y03, y01 + FMADD a1, alpha, f0, a1 + STFDUX a1, BO, INCY + .align 4 + +LL(99): + subf A, PLDA_M, A + addi IS, IS, P + cmp cr0, 0, IS, M + blt LL(ISLoop) + .align 4 + +LL(999): + li r3, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r14, 160(SP) + ld r15, 168(SP) + ld r16, 176(SP) + ld r17, 184(SP) + ld r18, 192(SP) + ld r19, 200(SP) + ld r20, 208(SP) + ld r21, 216(SP) + ld r22, 224(SP) + ld r23, 232(SP) + ld r24, 240(SP) + ld r25, 248(SP) + ld r26, 256(SP) + ld r27, 264(SP) + ld r28, 272(SP) + ld r29, 280(SP) +#else + lwz r14, 160(SP) + lwz r15, 164(SP) + lwz r16, 168(SP) + lwz r17, 172(SP) + lwz r18, 176(SP) + lwz r19, 180(SP) + lwz r20, 184(SP) + lwz r21, 188(SP) + lwz r22, 192(SP) + lwz r23, 196(SP) + lwz r24, 200(SP) + lwz r25, 204(SP) + lwz r26, 208(SP) + lwz r27, 212(SP) + lwz r28, 216(SP) + lwz r29, 220(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE + +#endif diff --git a/kernel/power/gemv_t_ppc440.S b/kernel/power/gemv_t_ppc440.S new file mode 100644 index 0000000000..1aa59b2147 --- /dev/null +++ b/kernel/power/gemv_t_ppc440.S @@ -0,0 +1,1089 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef linux +#ifndef __64BIT__ +#define M r3 +#define N r4 +#define A r6 +#define LDA r7 +#define X r8 +#define INCX r9 +#define Y r10 +#define INCY r5 +#else +#define M r3 +#define N r4 +#define A r7 +#define LDA r8 +#define X r9 +#define INCX r10 +#define Y r5 +#define INCY r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define M r3 +#define N r4 +#define A r8 +#define LDA r9 +#define X r10 +#define INCX r5 +#define Y r6 +#define INCY r7 +#else +#define M r3 +#define N r4 +#define A r7 +#define LDA r8 +#define X r9 +#define INCX r10 +#define Y r5 +#define INCY r6 +#endif +#endif + +#define BUFFER r11 +#define XP r12 +#define AO1 r14 +#define AO2 r15 +#define AO3 r16 +#define AO4 r17 +#define J r18 +#define YY r19 +#define PREA r20 +#define PREC r21 +#define X1 r22 + + +#if defined(PPCG4) +#define PREFETCHSIZE_A 42 +#define PREFETCHSIZE_C 7 +#endif + +#if defined(POWER6) +#define PREFETCHSIZE_A 42 +#define PREFETCHSIZE_C 7 +#endif + +#define y01 f0 +#define y02 f1 +#define y03 f2 +#define y04 f3 +#define y05 f4 +#define y06 f5 +#define y07 f6 +#define y08 f7 + +#define a1 f8 +#define a2 f9 +#define a3 f10 +#define a4 f11 +#define a5 f12 +#define a6 f13 +#define a7 f14 +#define a8 f15 + +#define b1 f16 +#define b2 f17 +#define b3 f18 +#define b4 f19 +#define b5 f20 +#define b6 f21 +#define b7 f22 +#define b8 f23 + +#define alpha f23 + +#ifndef NEEDPARAM + +#ifndef __64BIT__ +#define STACKSIZE 224 +#else +#define STACKSIZE 288 +#endif + +#define FZERO 144(SP) +#define ALPHA 152(SP) + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + stfd f22, 64(SP) + stfd f23, 72(SP) + +#ifdef __64BIT__ + std r0, FZERO + stfd f1, ALPHA + std r14, 160(SP) + std r15, 168(SP) + std r16, 176(SP) + std r17, 184(SP) + std r18, 192(SP) + std r19, 200(SP) + std r20, 208(SP) + std r21, 216(SP) + std r22, 224(SP) +#else + stw r0, 0 + FZERO + stw r0, 4 + FZERO + stfd f1, ALPHA + stw r14, 160(SP) + stw r15, 164(SP) + stw r16, 168(SP) + stw r17, 172(SP) + stw r18, 176(SP) + stw r19, 180(SP) + stw r20, 184(SP) + stw r21, 188(SP) + stw r22, 192(SP) +#endif + +#ifdef linux +#ifndef __64BIT__ + lwz INCY, 8 + STACKSIZE(SP) + lwz BUFFER, 12 + STACKSIZE(SP) +#else + ld Y, 112 + STACKSIZE(SP) + ld INCY, 120 + STACKSIZE(SP) + ld BUFFER, 128 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifndef __64BIT__ +#ifdef DOUBLE + lwz INCX, 56 + STACKSIZE(SP) + lwz Y, 60 + STACKSIZE(SP) + lwz INCY, 64 + STACKSIZE(SP) + lwz BUFFER, 68 + STACKSIZE(SP) +#else + lwz Y, 56 + STACKSIZE(SP) + lwz INCY, 60 + STACKSIZE(SP) + lwz BUFFER, 64 + STACKSIZE(SP) +#endif +#else + ld Y, 112 + STACKSIZE(SP) + ld INCY, 120 + STACKSIZE(SP) + ld BUFFER, 128 + STACKSIZE(SP) +#endif +#endif + + slwi LDA, LDA, BASE_SHIFT + slwi INCX, INCX, BASE_SHIFT + slwi INCY, INCY, BASE_SHIFT + + addi A, A, -SIZE + sub X, X, INCX + sub Y, Y, INCY + + li PREA, PREFETCHSIZE_A * SIZE + li PREC, PREFETCHSIZE_C * SIZE + + cmpi cr0, 0, M, 0 + ble LL(999) 
+ cmpi cr0, 0, N, 0 + ble LL(999) + + mr XP, X + + cmpi cr0, 0, INCX, SIZE + beq LL(10) + + addi XP, BUFFER, -SIZE + addi X1, BUFFER, -SIZE + + srawi. r0, M, 3 + mtspr CTR, r0 + ble LL(CopyRemain) + .align 4 + +LL(CopyKernel): + LFDUX f0, X, INCX + LFDUX f1, X, INCX + LFDUX f2, X, INCX + LFDUX f3, X, INCX + LFDUX f4, X, INCX + LFDUX f5, X, INCX + LFDUX f6, X, INCX + LFDUX f7, X, INCX + + STFDU f0, 1 * SIZE(X1) + STFDU f1, 1 * SIZE(X1) + STFDU f2, 1 * SIZE(X1) + STFDU f3, 1 * SIZE(X1) + STFDU f4, 1 * SIZE(X1) + STFDU f5, 1 * SIZE(X1) + STFDU f6, 1 * SIZE(X1) + STFDU f7, 1 * SIZE(X1) + bdnz LL(CopyKernel) + .align 4 + +LL(CopyRemain): + andi. r0, M, 7 + mtspr CTR, r0 + ble LL(10) + .align 4 + +LL(CopySub): + LFDUX f0, X, INCX + STFDU f0, 1 * SIZE(X1) + bdnz LL(CopySub) + .align 4 + +LL(10): + mr YY, Y + + srawi. J, N, 2 + ble LL(30) + .align 4 + +LL(21): + mr AO1, A + add AO2, A, LDA + add AO3, AO2, LDA + add AO4, AO3, LDA + add A, AO4, LDA + + mr X1, XP + + lfd y01, FZERO + fmr y02, y01 + fmr y03, y01 + fmr y04, y01 + fmr y05, y01 + fmr y06, y01 + fmr y07, y01 + fmr y08, y01 + + dcbtst Y, PREC + + srawi. r0, M, 3 + mtspr CTR, r0 + ble LL(24) + + LFDU a1, 1 * SIZE(AO1) + LFDU a2, 1 * SIZE(AO2) + LFDU a3, 1 * SIZE(AO3) + LFDU a4, 1 * SIZE(AO4) + + LFDU b1, 1 * SIZE(X1) + LFDU b2, 1 * SIZE(X1) + + LFDU a5, 1 * SIZE(AO1) + LFDU a6, 1 * SIZE(AO2) + LFDU a7, 1 * SIZE(AO3) + LFDU a8, 1 * SIZE(AO4) + + LFDU b3, 1 * SIZE(X1) + LFDU b4, 1 * SIZE(X1) + bdz LL(23) + .align 4 + +LL(22): +#ifdef PPCG4 + dcbt X1, PREA +#endif + + FMADD y01, a1, b1, y01 + LFDU a1, 1 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFDU a2, 1 * SIZE(AO2) + FMADD y03, a3, b1, y03 + LFDU a3, 1 * SIZE(AO3) + FMADD y04, a4, b1, y04 + LFDU a4, 1 * SIZE(AO4) + + LFDU b1, 1 * SIZE(X1) +#ifdef PPCG4 + dcbt AO1, PREA +#endif + + FMADD y05, a5, b2, y05 + LFDU a5, 1 * SIZE(AO1) + FMADD y06, a6, b2, y06 + LFDU a6, 1 * SIZE(AO2) + FMADD y07, a7, b2, y07 + LFDU a7, 1 * SIZE(AO3) + FMADD y08, a8, b2, y08 + LFDU a8, 1 * SIZE(AO4) + + LFDU b2, 1 * SIZE(X1) +#ifdef PPCG4 + dcbt AO2, PREA +#endif + + FMADD y01, a1, b3, y01 + LFDU a1, 1 * SIZE(AO1) + FMADD y02, a2, b3, y02 + LFDU a2, 1 * SIZE(AO2) + FMADD y03, a3, b3, y03 + LFDU a3, 1 * SIZE(AO3) + FMADD y04, a4, b3, y04 + LFDU a4, 1 * SIZE(AO4) + + LFDU b3, 1 * SIZE(X1) +#ifdef PPCG4 + dcbt AO3, PREA +#endif + + FMADD y05, a5, b4, y05 + LFDU a5, 1 * SIZE(AO1) + FMADD y06, a6, b4, y06 + LFDU a6, 1 * SIZE(AO2) + FMADD y07, a7, b4, y07 + LFDU a7, 1 * SIZE(AO3) + FMADD y08, a8, b4, y08 + LFDU a8, 1 * SIZE(AO4) + +#ifdef PPCG4 + dcbt AO4, PREA +#endif + LFDU b4, 1 * SIZE(X1) +#if defined(PPCG4) && defined(DOUBLE) + dcbt X1, PREA +#endif + + FMADD y01, a1, b1, y01 + LFDU a1, 1 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFDU a2, 1 * SIZE(AO2) + FMADD y03, a3, b1, y03 + LFDU a3, 1 * SIZE(AO3) + FMADD y04, a4, b1, y04 + LFDU a4, 1 * SIZE(AO4) + + LFDU b1, 1 * SIZE(X1) +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO1, PREA +#endif + + FMADD y05, a5, b2, y05 + LFDU a5, 1 * SIZE(AO1) + FMADD y06, a6, b2, y06 + LFDU a6, 1 * SIZE(AO2) + FMADD y07, a7, b2, y07 + LFDU a7, 1 * SIZE(AO3) + FMADD y08, a8, b2, y08 + LFDU a8, 1 * SIZE(AO4) + + LFDU b2, 1 * SIZE(X1) +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO2, PREA +#endif + + FMADD y01, a1, b3, y01 + LFDU a1, 1 * SIZE(AO1) + FMADD y02, a2, b3, y02 + LFDU a2, 1 * SIZE(AO2) + FMADD y03, a3, b3, y03 + LFDU a3, 1 * SIZE(AO3) + FMADD y04, a4, b3, y04 + LFDU a4, 1 * SIZE(AO4) + + LFDU b3, 1 * SIZE(X1) +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO3, PREA +#endif + + FMADD 
y05, a5, b4, y05 + LFDU a5, 1 * SIZE(AO1) + FMADD y06, a6, b4, y06 + LFDU a6, 1 * SIZE(AO2) + FMADD y07, a7, b4, y07 + LFDU a7, 1 * SIZE(AO3) + FMADD y08, a8, b4, y08 + LFDU a8, 1 * SIZE(AO4) + + LFDU b4, 1 * SIZE(X1) +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO4, PREA +#endif + + bdnz LL(22) + .align 4 + +LL(23): + FMADD y01, a1, b1, y01 + LFDU a1, 1 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFDU a2, 1 * SIZE(AO2) + FMADD y03, a3, b1, y03 + LFDU a3, 1 * SIZE(AO3) + FMADD y04, a4, b1, y04 + LFDU a4, 1 * SIZE(AO4) + + LFDU b1, 1 * SIZE(X1) + + FMADD y05, a5, b2, y05 + LFDU a5, 1 * SIZE(AO1) + FMADD y06, a6, b2, y06 + LFDU a6, 1 * SIZE(AO2) + FMADD y07, a7, b2, y07 + LFDU a7, 1 * SIZE(AO3) + FMADD y08, a8, b2, y08 + LFDU a8, 1 * SIZE(AO4) + + LFDU b2, 1 * SIZE(X1) + + FMADD y01, a1, b3, y01 + LFDU a1, 1 * SIZE(AO1) + FMADD y02, a2, b3, y02 + LFDU a2, 1 * SIZE(AO2) + FMADD y03, a3, b3, y03 + LFDU a3, 1 * SIZE(AO3) + FMADD y04, a4, b3, y04 + LFDU a4, 1 * SIZE(AO4) + + LFDU b3, 1 * SIZE(X1) + + FMADD y05, a5, b4, y05 + LFDU a5, 1 * SIZE(AO1) + FMADD y06, a6, b4, y06 + LFDU a6, 1 * SIZE(AO2) + FMADD y07, a7, b4, y07 + LFDU a7, 1 * SIZE(AO3) + FMADD y08, a8, b4, y08 + LFDU a8, 1 * SIZE(AO4) + + LFDU b4, 1 * SIZE(X1) + + FMADD y01, a1, b1, y01 + LFDU a1, 1 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFDU a2, 1 * SIZE(AO2) + FMADD y03, a3, b1, y03 + LFDU a3, 1 * SIZE(AO3) + FMADD y04, a4, b1, y04 + LFDU a4, 1 * SIZE(AO4) + + FMADD y05, a5, b2, y05 + LFDU a5, 1 * SIZE(AO1) + FMADD y06, a6, b2, y06 + LFDU a6, 1 * SIZE(AO2) + FMADD y07, a7, b2, y07 + LFDU a7, 1 * SIZE(AO3) + FMADD y08, a8, b2, y08 + LFDU a8, 1 * SIZE(AO4) + + FMADD y01, a1, b3, y01 + FMADD y02, a2, b3, y02 + FMADD y03, a3, b3, y03 + FMADD y04, a4, b3, y04 + + FMADD y05, a5, b4, y05 + FMADD y06, a6, b4, y06 + FMADD y07, a7, b4, y07 + FMADD y08, a8, b4, y08 + .align 4 + +LL(24): + andi. r0, M, 7 + ble LL(28) + + andi. r0, M, 4 + ble LL(26) + + LFDU a1, 1 * SIZE(AO1) + LFDU a2, 1 * SIZE(AO2) + LFDU b1, 1 * SIZE(X1) + LFDU a3, 1 * SIZE(AO3) + LFDU a4, 1 * SIZE(AO4) + LFDU b2, 1 * SIZE(X1) + + FMADD y01, a1, b1, y01 + LFDU a5, 1 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFDU a6, 1 * SIZE(AO2) + FMADD y03, a3, b1, y03 + LFDU a7, 1 * SIZE(AO3) + FMADD y04, a4, b1, y04 + LFDU a8, 1 * SIZE(AO4) + + LFDU b3, 1 * SIZE(X1) + + FMADD y05, a5, b2, y05 + LFDU a1, 1 * SIZE(AO1) + FMADD y06, a6, b2, y06 + LFDU a2, 1 * SIZE(AO2) + FMADD y07, a7, b2, y07 + LFDU a3, 1 * SIZE(AO3) + FMADD y08, a8, b2, y08 + LFDU a4, 1 * SIZE(AO4) + + LFDU b4, 1 * SIZE(X1) + + FMADD y01, a1, b3, y01 + LFDU a5, 1 * SIZE(AO1) + FMADD y02, a2, b3, y02 + LFDU a6, 1 * SIZE(AO2) + FMADD y03, a3, b3, y03 + LFDU a7, 1 * SIZE(AO3) + FMADD y04, a4, b3, y04 + LFDU a8, 1 * SIZE(AO4) + + FMADD y05, a5, b4, y05 + FMADD y06, a6, b4, y06 + FMADD y07, a7, b4, y07 + FMADD y08, a8, b4, y08 + .align 4 + +LL(26): + andi. r0, M, 2 + ble LL(27) + + LFDU b1, 1 * SIZE(X1) + LFDU a1, 1 * SIZE(AO1) + LFDU a2, 1 * SIZE(AO2) + LFDU a3, 1 * SIZE(AO3) + LFDU a4, 1 * SIZE(AO4) + LFDU b2, 1 * SIZE(X1) + + FMADD y01, a1, b1, y01 + LFDU a5, 1 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFDU a6, 1 * SIZE(AO2) + FMADD y03, a3, b1, y03 + LFDU a7, 1 * SIZE(AO3) + FMADD y04, a4, b1, y04 + LFDU a8, 1 * SIZE(AO4) + + FMADD y05, a5, b2, y05 + FMADD y06, a6, b2, y06 + FMADD y07, a7, b2, y07 + FMADD y08, a8, b2, y08 + .align 4 + +LL(27): + andi. 
r0, M, 1 + ble LL(28) + + LFDU a1, 1 * SIZE(AO1) + LFDU b1, 1 * SIZE(X1) + + LFDU a2, 1 * SIZE(AO2) + LFDU a3, 1 * SIZE(AO3) + LFDU a4, 1 * SIZE(AO4) + + FMADD y01, a1, b1, y01 + FMADD y02, a2, b1, y02 + FMADD y03, a3, b1, y03 + FMADD y04, a4, b1, y04 + .align 4 + +LL(28): + lfd alpha, ALPHA + + LFDUX a1, Y, INCY + LFDUX a2, Y, INCY + LFDUX a3, Y, INCY + LFDUX a4, Y, INCY + + FADD y01, y05, y01 + FADD y02, y06, y02 + FADD y03, y07, y03 + FADD y04, y08, y04 + + FMADD a1, alpha, f0, a1 + FMADD a2, alpha, f1, a2 + FMADD a3, alpha, f2, a3 + FMADD a4, alpha, f3, a4 + + STFDUX a1, YY, INCY + addi J, J, -1 + STFDUX a2, YY, INCY + cmpi cr0, 0, J, 0 + STFDUX a3, YY, INCY + STFDUX a4, YY, INCY + bgt LL(21) + .align 4 + +LL(30): + andi. J, N, 2 + ble LL(40) + + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + mr X1, XP + + lfd y01, FZERO + fmr y02, y01 + fmr y03, y01 + fmr y04, y01 + + srawi. r0, M, 3 + mtspr CTR, r0 + ble LL(34) + + LFDU a1, 1 * SIZE(AO1) + LFDU a2, 1 * SIZE(AO2) + LFDU b1, 1 * SIZE(X1) + LFDU b2, 1 * SIZE(X1) + + LFDU a5, 1 * SIZE(AO1) + LFDU a6, 1 * SIZE(AO2) + LFDU b3, 1 * SIZE(X1) + LFDU b4, 1 * SIZE(X1) + bdz LL(33) + .align 4 + +LL(32): +#ifdef PPCG4 + dcbt X1, PREA +#endif + + FMADD y01, a1, b1, y01 + LFDU a1, 1 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFDU a2, 1 * SIZE(AO2) + + LFDU b1, 1 * SIZE(X1) +#ifdef PPCG4 + dcbt AO1, PREA +#endif + + FMADD y03, a5, b2, y03 + LFDU a5, 1 * SIZE(AO1) + FMADD y04, a6, b2, y04 + LFDU a6, 1 * SIZE(AO2) + + LFDU b2, 1 * SIZE(X1) + + FMADD y01, a1, b3, y01 + LFDU a1, 1 * SIZE(AO1) + FMADD y02, a2, b3, y02 + LFDU a2, 1 * SIZE(AO2) + + LFDU b3, 1 * SIZE(X1) +#ifdef PPCG4 + dcbt AO2, PREA +#endif + + FMADD y03, a5, b4, y03 + LFDU a5, 1 * SIZE(AO1) + FMADD y04, a6, b4, y04 + LFDU a6, 1 * SIZE(AO2) + + LFDU b4, 1 * SIZE(X1) + + FMADD y01, a1, b1, y01 + LFDU a1, 1 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFDU a2, 1 * SIZE(AO2) + +#if defined(PPCG4) && defined(DOUBLE) + dcbt X1, PREA +#endif + LFDU b1, 1 * SIZE(X1) +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO1, PREA +#endif + + FMADD y03, a5, b2, y03 + LFDU a5, 1 * SIZE(AO1) + FMADD y04, a6, b2, y04 + LFDU a6, 1 * SIZE(AO2) + + LFDU b2, 1 * SIZE(X1) + + FMADD y01, a1, b3, y01 + LFDU a1, 1 * SIZE(AO1) + FMADD y02, a2, b3, y02 + LFDU a2, 1 * SIZE(AO2) + + LFDU b3, 1 * SIZE(X1) +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO2, PREA +#endif + + FMADD y03, a5, b4, y03 + LFDU a5, 1 * SIZE(AO1) + FMADD y04, a6, b4, y04 + LFDU a6, 1 * SIZE(AO2) + + LFDU b4, 1 * SIZE(X1) + bdnz LL(32) + .align 4 + +LL(33): + FMADD y01, a1, b1, y01 + LFDU a1, 1 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFDU a2, 1 * SIZE(AO2) + + LFDU b1, 1 * SIZE(X1) + + FMADD y03, a5, b2, y03 + LFDU a5, 1 * SIZE(AO1) + FMADD y04, a6, b2, y04 + LFDU a6, 1 * SIZE(AO2) + + LFDU b2, 1 * SIZE(X1) + + FMADD y01, a1, b3, y01 + LFDU a1, 1 * SIZE(AO1) + FMADD y02, a2, b3, y02 + LFDU a2, 1 * SIZE(AO2) + + LFDU b3, 1 * SIZE(X1) + + FMADD y03, a5, b4, y03 + LFDU a5, 1 * SIZE(AO1) + FMADD y04, a6, b4, y04 + LFDU a6, 1 * SIZE(AO2) + + LFDU b4, 1 * SIZE(X1) + + FMADD y01, a1, b1, y01 + LFDU a1, 1 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFDU a2, 1 * SIZE(AO2) + + FMADD y03, a5, b2, y03 + LFDU a5, 1 * SIZE(AO1) + FMADD y04, a6, b2, y04 + LFDU a6, 1 * SIZE(AO2) + + FMADD y01, a1, b3, y01 + FMADD y02, a2, b3, y02 + + FMADD y03, a5, b4, y03 + FMADD y04, a6, b4, y04 + .align 4 + +LL(34): + andi. r0, M, 7 + ble LL(38) + + andi. 
r0, M, 4 + ble LL(36) + + LFDU a1, 1 * SIZE(AO1) + LFDU a2, 1 * SIZE(AO2) + LFDU b1, 1 * SIZE(X1) + + LFDU b2, 1 * SIZE(X1) + FMADD y01, a1, b1, y01 + LFDU a5, 1 * SIZE(AO1) + FMADD y02, a2, b1, y02 + LFDU a6, 1 * SIZE(AO2) + + LFDU b3, 1 * SIZE(X1) + FMADD y03, a5, b2, y03 + LFDU a1, 1 * SIZE(AO1) + FMADD y04, a6, b2, y04 + LFDU a2, 1 * SIZE(AO2) + + LFDU b4, 1 * SIZE(X1) + FMADD y01, a1, b3, y01 + LFDU a5, 1 * SIZE(AO1) + FMADD y02, a2, b3, y02 + LFDU a6, 1 * SIZE(AO2) + + FMADD y03, a5, b4, y03 + FMADD y04, a6, b4, y04 + .align 4 + +LL(36): + andi. r0, M, 2 + ble LL(37) + + LFDU b1, 1 * SIZE(X1) + LFDU a1, 1 * SIZE(AO1) + LFDU a2, 1 * SIZE(AO2) + LFDU b2, 1 * SIZE(X1) + LFDU a3, 1 * SIZE(AO1) + LFDU a4, 1 * SIZE(AO2) + + FMADD y01, a1, b1, y01 + FMADD y02, a2, b1, y02 + FMADD y03, a3, b2, y03 + FMADD y04, a4, b2, y04 + .align 4 + +LL(37): + andi. r0, M, 1 + ble LL(38) + + LFDU a1, 1 * SIZE(AO1) + LFDU b1, 1 * SIZE(X1) + LFDU a2, 1 * SIZE(AO2) + + FMADD y01, a1, b1, y01 + FMADD y02, a2, b1, y02 + .align 4 + +LL(38): + lfd alpha, ALPHA + + LFDUX a1, Y, INCY + LFDUX a2, Y, INCY + + FADD y01, y03, y01 + FADD y02, y04, y02 + + FMADD a1, alpha, f0, a1 + FMADD a2, alpha, f1, a2 + + STFDUX a1, YY, INCY + STFDUX a2, YY, INCY + .align 4 + +LL(40): + andi. J, N, 1 + ble LL(999) + + mr AO1, A + add A, A, LDA + + mr X1, XP + + lfd y01, FZERO + fmr y02, y01 + + srawi. r0, M, 3 + mtspr CTR, r0 + ble LL(44) + + LFDU a1, 1 * SIZE(AO1) + LFDU a2, 1 * SIZE(AO1) + LFDU a3, 1 * SIZE(AO1) + LFDU a4, 1 * SIZE(AO1) + + LFDU b1, 1 * SIZE(X1) + LFDU b2, 1 * SIZE(X1) + LFDU b3, 1 * SIZE(X1) + LFDU b4, 1 * SIZE(X1) + bdz LL(43) + .align 4 + +LL(42): + FMADD y01, a1, b1, y01 + LFDU a1, 1 * SIZE(AO1) + LFDU b1, 1 * SIZE(X1) + +#ifdef PPCG4 + dcbt X1, PREA +#endif + + FMADD y02, a2, b2, y02 + LFDU a2, 1 * SIZE(AO1) + LFDU b2, 1 * SIZE(X1) + +#ifdef PPCG4 + dcbt AO1, PREA +#endif + + FMADD y01, a3, b3, y01 + LFDU a3, 1 * SIZE(AO1) + LFDU b3, 1 * SIZE(X1) + + FMADD y02, a4, b4, y02 + LFDU a4, 1 * SIZE(AO1) + LFDU b4, 1 * SIZE(X1) + + FMADD y01, a1, b1, y01 + LFDU a1, 1 * SIZE(AO1) + LFDU b1, 1 * SIZE(X1) + + FMADD y02, a2, b2, y02 + LFDU a2, 1 * SIZE(AO1) + LFDU b2, 1 * SIZE(X1) + +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO1, PREA +#endif + + FMADD y01, a3, b3, y01 + LFDU a3, 1 * SIZE(AO1) + LFDU b3, 1 * SIZE(X1) + +#if defined(PPCG4) && defined(DOUBLE) + dcbt X1, PREA +#endif + + FMADD y02, a4, b4, y02 + LFDU a4, 1 * SIZE(AO1) + LFDU b4, 1 * SIZE(X1) + + bdnz LL(42) + .align 4 + +LL(43): + FMADD y01, a1, b1, y01 + LFDU a1, 1 * SIZE(AO1) + LFDU b1, 1 * SIZE(X1) + + FMADD y02, a2, b2, y02 + LFDU a2, 1 * SIZE(AO1) + LFDU b2, 1 * SIZE(X1) + + FMADD y01, a3, b3, y01 + LFDU a3, 1 * SIZE(AO1) + LFDU b3, 1 * SIZE(X1) + + FMADD y02, a4, b4, y02 + LFDU a4, 1 * SIZE(AO1) + LFDU b4, 1 * SIZE(X1) + + FMADD y01, a1, b1, y01 + FMADD y02, a2, b2, y02 + FMADD y01, a3, b3, y01 + FMADD y02, a4, b4, y02 + .align 4 + +LL(44): + andi. r0, M, 7 + ble LL(48) + + andi. r0, M, 4 + ble LL(46) + + LFDU a1, 1 * SIZE(AO1) + LFDU b1, 1 * SIZE(X1) + LFDU a2, 1 * SIZE(AO1) + LFDU b2, 1 * SIZE(X1) + + FMADD y01, a1, b1, y01 + LFDU a3, 1 * SIZE(AO1) + LFDU b3, 1 * SIZE(X1) + + FMADD y02, a2, b2, y02 + LFDU a4, 1 * SIZE(AO1) + LFDU b4, 1 * SIZE(X1) + + FMADD y01, a3, b3, y01 + FMADD y02, a4, b4, y02 + .align 4 + +LL(46): + andi. r0, M, 2 + ble LL(47) + + LFDU b1, 1 * SIZE(X1) + LFDU a1, 1 * SIZE(AO1) + LFDU b2, 1 * SIZE(X1) + LFDU a2, 1 * SIZE(AO1) + + FMADD y01, a1, b1, y01 + FMADD y02, a2, b2, y02 + .align 4 + +LL(47): + andi. 
r0, M, 1 + ble LL(48) + + LFDU a1, 1 * SIZE(AO1) + LFDU b1, 1 * SIZE(X1) + + FMADD y01, a1, b1, y01 + .align 4 + +LL(48): + lfd alpha, ALPHA + + LFDUX a1, Y, INCY + + FADD y01, y02, y01 + + FMADD a1, alpha, f0, a1 + + STFDUX a1, YY, INCY + .align 4 + +LL(999): + li r3, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + lfd f22, 64(SP) + lfd f23, 72(SP) + +#ifdef __64BIT__ + ld r14, 160(SP) + ld r15, 168(SP) + ld r16, 176(SP) + ld r17, 184(SP) + ld r18, 192(SP) + ld r19, 200(SP) + ld r20, 208(SP) + ld r21, 216(SP) + ld r22, 224(SP) +#else + lwz r14, 160(SP) + lwz r15, 164(SP) + lwz r16, 168(SP) + lwz r17, 172(SP) + lwz r18, 176(SP) + lwz r19, 180(SP) + lwz r20, 184(SP) + lwz r21, 188(SP) + lwz r22, 192(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE + +#endif diff --git a/kernel/power/ger.S b/kernel/power/ger.S new file mode 100644 index 0000000000..00685693ac --- /dev/null +++ b/kernel/power/ger.S @@ -0,0 +1,1209 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef NEEDPARAM +#ifndef DOUBLE +#include "sparam.h" +#else +#include "dparam.h" +#endif +#endif + +#ifdef linux +#ifndef __64BIT__ +#define M r3 +#define N r4 +#define X r6 +#define INCX r7 +#define Y r8 +#define INCY r9 +#define A r10 +#define LDA r5 +#else +#define M r3 +#define N r4 +#define X r7 +#define INCX r8 +#define Y r9 +#define INCY r10 +#define A r5 +#define LDA r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define M r3 +#define N r4 +#define X r8 +#define INCX r9 +#define Y r10 +#define INCY r5 +#define A r6 +#define LDA r7 +#else +#define M r3 +#define N r4 +#define X r7 +#define INCX r8 +#define Y r9 +#define INCY r10 +#define A r5 +#define LDA r6 +#endif +#endif + +#define I r11 +#define J r12 + +#define AO1 r14 +#define AO2 r15 +#define AO3 r16 +#define AO4 r17 +#define AO5 r18 +#define AO6 r19 +#define AO7 r20 +#define AO8 r21 + +#define X1 r22 +#define PREA r23 +#define PREC r24 +#define XX r25 +#define BUFFER r26 + +#define y01 f0 +#define y02 f1 +#define y03 f2 +#define y04 f3 +#define y05 f4 +#define y06 f5 +#define y07 f6 +#define y08 f7 + +#define alpha1 f8 +#define alpha2 f9 + +#define a1 f12 +#define a2 f13 +#define a3 f14 +#define a4 f15 +#define a5 f16 +#define a6 f17 +#define a7 f18 +#define a8 f19 +#define a9 f20 +#define a10 f21 +#define a11 f22 +#define a12 f23 +#define a13 f24 +#define a14 f25 +#define a15 f26 +#define a16 f27 + +#define alpha f31 + +#if defined(PPC440) || defined(PPC440FP2) +#define PREFETCHSIZE_A 24 +#define PREFETCHSIZE_C 16 +#endif + +#ifdef PPC970 +#define PREFETCHSIZE_A 16 +#define PREFETCHSIZE_C 16 +#endif + +#ifdef POWER4 +#define PREFETCHSIZE_A 16 +#define PREFETCHSIZE_C 16 +#endif + +#ifdef POWER5 +#define PREFETCHSIZE_A 16 +#define PREFETCHSIZE_C 16 +#endif + +#ifndef NEEDPARAM + +#ifndef __64BIT__ +#define STACKSIZE 224 +#else +#define STACKSIZE 280 +#endif + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r14, 144(SP) + std r15, 152(SP) + std r16, 160(SP) + std r17, 168(SP) + std r18, 176(SP) + std r19, 184(SP) + std r20, 192(SP) + std r21, 200(SP) + std r22, 208(SP) + std r23, 216(SP) + std r24, 224(SP) + std r25, 232(SP) + std r26, 240(SP) + std r27, 248(SP) +#else + stw r14, 144(SP) + stw r15, 148(SP) + stw r16, 152(SP) + stw r17, 156(SP) + stw r18, 160(SP) + stw r19, 164(SP) + stw r20, 168(SP) + stw r21, 172(SP) + stw r22, 176(SP) + stw r23, 180(SP) + stw r24, 184(SP) + stw r25, 188(SP) + stw r26, 192(SP) + stw r27, 196(SP) +#endif + +#ifdef linux +#ifndef __64BIT__ + lwz LDA, 8 + STACKSIZE(SP) + lwz BUFFER, 12 + STACKSIZE(SP) +#else + ld A, 112 + STACKSIZE(SP) + ld LDA, 120 + STACKSIZE(SP) + ld BUFFER, 128 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifndef __64BIT__ +#ifdef DOUBLE + lwz INCY, 56 + STACKSIZE(SP) + lwz A, 60 + STACKSIZE(SP) + lwz LDA, 64 + STACKSIZE(SP) + lwz BUFFER, 68 + STACKSIZE(SP) +#else + lwz A, 56 + STACKSIZE(SP) + lwz LDA, 60 + STACKSIZE(SP) + lwz BUFFER, 64 + STACKSIZE(SP) +#endif +#else + ld A, 112 + STACKSIZE(SP) + 
ld LDA, 120 + STACKSIZE(SP) + ld BUFFER, 128 + STACKSIZE(SP) +#endif +#endif + + fmr alpha, f1 + + slwi LDA, LDA, BASE_SHIFT + slwi INCX, INCX, BASE_SHIFT + slwi INCY, INCY, BASE_SHIFT + + li PREA, PREFETCHSIZE_A * SIZE + li PREC, PREFETCHSIZE_C * SIZE + + cmpwi cr0, M, 0 + ble- LL(999) + + cmpwi cr0, N, 0 + ble- LL(999) + + mr XX, X + + cmpi cr0, 0, INCX, SIZE + beq LL(10) + + mr XX, BUFFER + mr X1, BUFFER + + srawi. r0, M, 3 + mtspr CTR, r0 + ble LL(05) + .align 4 + +LL(01): + LFD a1, 0 * SIZE(X) + add X, X, INCX + LFD a2, 0 * SIZE(X) + add X, X, INCX + LFD a3, 0 * SIZE(X) + add X, X, INCX + LFD a4, 0 * SIZE(X) + add X, X, INCX + LFD a5, 0 * SIZE(X) + add X, X, INCX + LFD a6, 0 * SIZE(X) + add X, X, INCX + LFD a7, 0 * SIZE(X) + add X, X, INCX + LFD a8, 0 * SIZE(X) + add X, X, INCX + + STFD a1, 0 * SIZE(X1) + STFD a2, 1 * SIZE(X1) + STFD a3, 2 * SIZE(X1) + STFD a4, 3 * SIZE(X1) + STFD a5, 4 * SIZE(X1) + STFD a6, 5 * SIZE(X1) + STFD a7, 6 * SIZE(X1) + STFD a8, 7 * SIZE(X1) + + addi X1, X1, 8 * SIZE + bdnz+ LL(01) + .align 4 + +LL(05): + andi. r0, M, 7 + mtspr CTR, r0 + ble LL(10) + .align 4 + +LL(06): + LFD a1, 0 * SIZE(X) + add X, X, INCX + STFD a1, 0 * SIZE(X1) + addi X1, X1, SIZE + bdnz+ LL(06) + .align 4 + +LL(10): + srawi. J, N, 1 + ble LL(20) + .align 4 + +LL(11): + LFD alpha1, 0 * SIZE(Y) + add Y, Y, INCY + LFD alpha2, 0 * SIZE(Y) + add Y, Y, INCY + + FMUL alpha1, alpha, alpha1 + FMUL alpha2, alpha, alpha2 + + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + mr X1, XX + + srawi. r0, M, 4 + mtspr CTR, r0 + ble LL(15) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + + LFD a5, 4 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + LFD y01, 0 * SIZE(X1) + LFD y02, 1 * SIZE(X1) + LFD y03, 2 * SIZE(X1) + LFD y04, 3 * SIZE(X1) + + LFD y05, 4 * SIZE(X1) + LFD y06, 5 * SIZE(X1) + LFD y07, 6 * SIZE(X1) + LFD y08, 7 * SIZE(X1) + + LFD a9, 0 * SIZE(AO2) + LFD a10, 1 * SIZE(AO2) + LFD a11, 2 * SIZE(AO2) + LFD a12, 3 * SIZE(AO2) + + LFD a13, 4 * SIZE(AO2) + LFD a14, 5 * SIZE(AO2) + LFD a15, 6 * SIZE(AO2) + LFD a16, 7 * SIZE(AO2) + bdz LL(13) + .align 4 + +LL(12): + FMADD a1, alpha1, y01, a1 + FMADD a2, alpha1, y02, a2 + FMADD a3, alpha1, y03, a3 + FMADD a4, alpha1, y04, a4 + + FMADD a5, alpha1, y05, a5 + FMADD a6, alpha1, y06, a6 + FMADD a7, alpha1, y07, a7 + FMADD a8, alpha1, y08, a8 + + STFD a1, 0 * SIZE(AO1) + STFD a2, 1 * SIZE(AO1) + STFD a3, 2 * SIZE(AO1) + STFD a4, 3 * SIZE(AO1) + + LFD a1, 8 * SIZE(AO1) + LFD a2, 9 * SIZE(AO1) + LFD a3, 10 * SIZE(AO1) + LFD a4, 11 * SIZE(AO1) + + STFD a5, 4 * SIZE(AO1) + STFD a6, 5 * SIZE(AO1) + STFD a7, 6 * SIZE(AO1) + STFD a8, 7 * SIZE(AO1) + + LFD a5, 12 * SIZE(AO1) + LFD a6, 13 * SIZE(AO1) + LFD a7, 14 * SIZE(AO1) + LFD a8, 15 * SIZE(AO1) + + FMADD a9, alpha2, y01, a9 + FMADD a10, alpha2, y02, a10 + FMADD a11, alpha2, y03, a11 + FMADD a12, alpha2, y04, a12 + + LFD y01, 8 * SIZE(X1) + LFD y02, 9 * SIZE(X1) + LFD y03, 10 * SIZE(X1) + LFD y04, 11 * SIZE(X1) + + FMADD a13, alpha2, y05, a13 + FMADD a14, alpha2, y06, a14 + FMADD a15, alpha2, y07, a15 + FMADD a16, alpha2, y08, a16 + + LFD y05, 12 * SIZE(X1) + LFD y06, 13 * SIZE(X1) + LFD y07, 14 * SIZE(X1) + LFD y08, 15 * SIZE(X1) + + STFD a9, 0 * SIZE(AO2) + STFD a10, 1 * SIZE(AO2) + STFD a11, 2 * SIZE(AO2) + STFD a12, 3 * SIZE(AO2) + + LFD a9, 8 * SIZE(AO2) + LFD a10, 9 * SIZE(AO2) + LFD a11, 10 * SIZE(AO2) + LFD a12, 11 * SIZE(AO2) + + STFD a13, 4 * SIZE(AO2) + STFD a14, 5 * SIZE(AO2) + STFD a15, 6 * SIZE(AO2) + STFD 
a16, 7 * SIZE(AO2) + + LFD a13, 12 * SIZE(AO2) + LFD a14, 13 * SIZE(AO2) + LFD a15, 14 * SIZE(AO2) + LFD a16, 15 * SIZE(AO2) + + FMADD a1, alpha1, y01, a1 + FMADD a2, alpha1, y02, a2 + FMADD a3, alpha1, y03, a3 + FMADD a4, alpha1, y04, a4 + + FMADD a5, alpha1, y05, a5 + FMADD a6, alpha1, y06, a6 + FMADD a7, alpha1, y07, a7 + FMADD a8, alpha1, y08, a8 + + STFD a1, 8 * SIZE(AO1) + STFD a2, 9 * SIZE(AO1) + STFD a3, 10 * SIZE(AO1) + STFD a4, 11 * SIZE(AO1) + + LFD a1, 16 * SIZE(AO1) + LFD a2, 17 * SIZE(AO1) + LFD a3, 18 * SIZE(AO1) + LFD a4, 19 * SIZE(AO1) + + STFD a5, 12 * SIZE(AO1) + STFD a6, 13 * SIZE(AO1) + STFD a7, 14 * SIZE(AO1) + STFD a8, 15 * SIZE(AO1) + + LFD a5, 20 * SIZE(AO1) + LFD a6, 21 * SIZE(AO1) + LFD a7, 22 * SIZE(AO1) + LFD a8, 23 * SIZE(AO1) + + FMADD a9, alpha2, y01, a9 + FMADD a10, alpha2, y02, a10 + FMADD a11, alpha2, y03, a11 + FMADD a12, alpha2, y04, a12 + + LFD y01, 16 * SIZE(X1) + LFD y02, 17 * SIZE(X1) + LFD y03, 18 * SIZE(X1) + LFD y04, 19 * SIZE(X1) + + FMADD a13, alpha2, y05, a13 + FMADD a14, alpha2, y06, a14 + FMADD a15, alpha2, y07, a15 + FMADD a16, alpha2, y08, a16 + + LFD y05, 20 * SIZE(X1) + LFD y06, 21 * SIZE(X1) + LFD y07, 22 * SIZE(X1) + LFD y08, 23 * SIZE(X1) + + STFD a9, 8 * SIZE(AO2) + STFD a10, 9 * SIZE(AO2) + STFD a11, 10 * SIZE(AO2) + STFD a12, 11 * SIZE(AO2) + + LFD a9, 16 * SIZE(AO2) + LFD a10, 17 * SIZE(AO2) + LFD a11, 18 * SIZE(AO2) + LFD a12, 19 * SIZE(AO2) + + STFD a13, 12 * SIZE(AO2) + STFD a14, 13 * SIZE(AO2) + STFD a15, 14 * SIZE(AO2) + STFD a16, 15 * SIZE(AO2) + + LFD a13, 20 * SIZE(AO2) + LFD a14, 21 * SIZE(AO2) + LFD a15, 22 * SIZE(AO2) + LFD a16, 23 * SIZE(AO2) + + addi AO1, AO1, 16 * SIZE + addi AO2, AO2, 16 * SIZE + addi X1, X1, 16 * SIZE + + DCBT(AO1, PREA) + DCBT(AO2, PREA) + DCBT(Y1, PREY) + + bdnz+ LL(12) + .align 4 + +LL(13): + FMADD a1, alpha1, y01, a1 + FMADD a2, alpha1, y02, a2 + FMADD a3, alpha1, y03, a3 + FMADD a4, alpha1, y04, a4 + + FMADD a5, alpha1, y05, a5 + FMADD a6, alpha1, y06, a6 + FMADD a7, alpha1, y07, a7 + FMADD a8, alpha1, y08, a8 + + STFD a1, 0 * SIZE(AO1) + STFD a2, 1 * SIZE(AO1) + STFD a3, 2 * SIZE(AO1) + STFD a4, 3 * SIZE(AO1) + + LFD a1, 8 * SIZE(AO1) + LFD a2, 9 * SIZE(AO1) + LFD a3, 10 * SIZE(AO1) + LFD a4, 11 * SIZE(AO1) + + STFD a5, 4 * SIZE(AO1) + STFD a6, 5 * SIZE(AO1) + STFD a7, 6 * SIZE(AO1) + STFD a8, 7 * SIZE(AO1) + + LFD a5, 12 * SIZE(AO1) + LFD a6, 13 * SIZE(AO1) + LFD a7, 14 * SIZE(AO1) + LFD a8, 15 * SIZE(AO1) + + FMADD a9, alpha2, y01, a9 + FMADD a10, alpha2, y02, a10 + FMADD a11, alpha2, y03, a11 + FMADD a12, alpha2, y04, a12 + + LFD y01, 8 * SIZE(X1) + LFD y02, 9 * SIZE(X1) + LFD y03, 10 * SIZE(X1) + LFD y04, 11 * SIZE(X1) + + FMADD a13, alpha2, y05, a13 + FMADD a14, alpha2, y06, a14 + FMADD a15, alpha2, y07, a15 + FMADD a16, alpha2, y08, a16 + + LFD y05, 12 * SIZE(X1) + LFD y06, 13 * SIZE(X1) + LFD y07, 14 * SIZE(X1) + LFD y08, 15 * SIZE(X1) + + STFD a9, 0 * SIZE(AO2) + STFD a10, 1 * SIZE(AO2) + STFD a11, 2 * SIZE(AO2) + STFD a12, 3 * SIZE(AO2) + + LFD a9, 8 * SIZE(AO2) + LFD a10, 9 * SIZE(AO2) + LFD a11, 10 * SIZE(AO2) + LFD a12, 11 * SIZE(AO2) + + STFD a13, 4 * SIZE(AO2) + STFD a14, 5 * SIZE(AO2) + STFD a15, 6 * SIZE(AO2) + STFD a16, 7 * SIZE(AO2) + + LFD a13, 12 * SIZE(AO2) + LFD a14, 13 * SIZE(AO2) + LFD a15, 14 * SIZE(AO2) + LFD a16, 15 * SIZE(AO2) + + FMADD a1, alpha1, y01, a1 + FMADD a2, alpha1, y02, a2 + FMADD a3, alpha1, y03, a3 + FMADD a4, alpha1, y04, a4 + + FMADD a5, alpha1, y05, a5 + FMADD a6, alpha1, y06, a6 + FMADD a7, alpha1, y07, a7 + FMADD a8, alpha1, y08, a8 + + STFD a1, 8 
* SIZE(AO1) + STFD a2, 9 * SIZE(AO1) + STFD a3, 10 * SIZE(AO1) + STFD a4, 11 * SIZE(AO1) + + LFD a1, 16 * SIZE(AO1) + LFD a2, 17 * SIZE(AO1) + LFD a3, 18 * SIZE(AO1) + LFD a4, 19 * SIZE(AO1) + + STFD a5, 12 * SIZE(AO1) + STFD a6, 13 * SIZE(AO1) + STFD a7, 14 * SIZE(AO1) + STFD a8, 15 * SIZE(AO1) + + LFD a5, 20 * SIZE(AO1) + LFD a6, 21 * SIZE(AO1) + LFD a7, 22 * SIZE(AO1) + LFD a8, 23 * SIZE(AO1) + + FMADD a9, alpha2, y01, a9 + FMADD a10, alpha2, y02, a10 + FMADD a11, alpha2, y03, a11 + FMADD a12, alpha2, y04, a12 + + FMADD a13, alpha2, y05, a13 + FMADD a14, alpha2, y06, a14 + FMADD a15, alpha2, y07, a15 + FMADD a16, alpha2, y08, a16 + + STFD a9, 8 * SIZE(AO2) + STFD a10, 9 * SIZE(AO2) + STFD a11, 10 * SIZE(AO2) + STFD a12, 11 * SIZE(AO2) + + STFD a13, 12 * SIZE(AO2) + STFD a14, 13 * SIZE(AO2) + STFD a15, 14 * SIZE(AO2) + STFD a16, 15 * SIZE(AO2) + + addi AO1, AO1, 16 * SIZE + addi AO2, AO2, 16 * SIZE + addi X1, X1, 16 * SIZE + .align 4 + + +LL(15): + andi. r0, M, 15 + ble LL(19) + + andi. r0, M, 8 + ble LL(16) + + LFD y01, 0 * SIZE(X1) + LFD y02, 1 * SIZE(X1) + LFD y03, 2 * SIZE(X1) + LFD y04, 3 * SIZE(X1) + LFD y05, 4 * SIZE(X1) + LFD y06, 5 * SIZE(X1) + LFD y07, 6 * SIZE(X1) + LFD y08, 7 * SIZE(X1) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + LFD a9, 0 * SIZE(AO2) + LFD a10, 1 * SIZE(AO2) + LFD a11, 2 * SIZE(AO2) + LFD a12, 3 * SIZE(AO2) + LFD a13, 4 * SIZE(AO2) + LFD a14, 5 * SIZE(AO2) + LFD a15, 6 * SIZE(AO2) + LFD a16, 7 * SIZE(AO2) + + FMADD a1, alpha1, y01, a1 + FMADD a2, alpha1, y02, a2 + FMADD a3, alpha1, y03, a3 + FMADD a4, alpha1, y04, a4 + + STFD a1, 0 * SIZE(AO1) + STFD a2, 1 * SIZE(AO1) + STFD a3, 2 * SIZE(AO1) + STFD a4, 3 * SIZE(AO1) + + FMADD a5, alpha1, y05, a5 + FMADD a6, alpha1, y06, a6 + FMADD a7, alpha1, y07, a7 + FMADD a8, alpha1, y08, a8 + + STFD a5, 4 * SIZE(AO1) + STFD a6, 5 * SIZE(AO1) + STFD a7, 6 * SIZE(AO1) + STFD a8, 7 * SIZE(AO1) + + FMADD a9, alpha2, y01, a9 + FMADD a10, alpha2, y02, a10 + FMADD a11, alpha2, y03, a11 + FMADD a12, alpha2, y04, a12 + + STFD a9, 0 * SIZE(AO2) + STFD a10, 1 * SIZE(AO2) + STFD a11, 2 * SIZE(AO2) + STFD a12, 3 * SIZE(AO2) + + FMADD a13, alpha2, y05, a13 + FMADD a14, alpha2, y06, a14 + FMADD a15, alpha2, y07, a15 + FMADD a16, alpha2, y08, a16 + + STFD a13, 4 * SIZE(AO2) + STFD a14, 5 * SIZE(AO2) + STFD a15, 6 * SIZE(AO2) + STFD a16, 7 * SIZE(AO2) + + addi AO1, AO1, 8 * SIZE + addi AO2, AO2, 8 * SIZE + addi X1, X1, 8 * SIZE + .align 4 + +LL(16): + andi. r0, M, 4 + ble LL(17) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + + LFD y01, 0 * SIZE(X1) + LFD y02, 1 * SIZE(X1) + LFD y03, 2 * SIZE(X1) + LFD y04, 3 * SIZE(X1) + + LFD a5, 0 * SIZE(AO2) + LFD a6, 1 * SIZE(AO2) + LFD a7, 2 * SIZE(AO2) + LFD a8, 3 * SIZE(AO2) + + FMADD a1, alpha1, y01, a1 + FMADD a2, alpha1, y02, a2 + FMADD a3, alpha1, y03, a3 + FMADD a4, alpha1, y04, a4 + + STFD a1, 0 * SIZE(AO1) + STFD a2, 1 * SIZE(AO1) + STFD a3, 2 * SIZE(AO1) + STFD a4, 3 * SIZE(AO1) + + FMADD a5, alpha2, y01, a5 + FMADD a6, alpha2, y02, a6 + FMADD a7, alpha2, y03, a7 + FMADD a8, alpha2, y04, a8 + + STFD a5, 0 * SIZE(AO2) + STFD a6, 1 * SIZE(AO2) + STFD a7, 2 * SIZE(AO2) + STFD a8, 3 * SIZE(AO2) + + addi AO1, AO1, 4 * SIZE + addi AO2, AO2, 4 * SIZE + addi X1, X1, 4 * SIZE + .align 4 + +LL(17): + andi. 
r0, M, 2 + ble LL(18) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 0 * SIZE(AO2) + LFD a4, 1 * SIZE(AO2) + + LFD y01, 0 * SIZE(X1) + LFD y02, 1 * SIZE(X1) + + FMADD a1, alpha1, y01, a1 + FMADD a2, alpha1, y02, a2 + FMADD a3, alpha2, y01, a3 + FMADD a4, alpha2, y02, a4 + + STFD a1, 0 * SIZE(AO1) + STFD a2, 1 * SIZE(AO1) + STFD a3, 0 * SIZE(AO2) + STFD a4, 1 * SIZE(AO2) + + addi AO1, AO1, 2 * SIZE + addi AO2, AO2, 2 * SIZE + + addi X1, X1, 2 * SIZE + .align 4 + +LL(18): + andi. r0, M, 1 + ble LL(19) + + LFD y01, 0 * SIZE(X1) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 0 * SIZE(AO2) + + FMADD a1, alpha1, y01, a1 + FMADD a2, alpha2, y01, a2 + + STFD a1, 0 * SIZE(AO1) + STFD a2, 0 * SIZE(AO2) + .align 4 + +LL(19): + addi J, J, -1 + cmpi cr0, 0, J, 0 + bgt LL(11) + .align 4 + +LL(20): + andi. J, N, 1 + ble LL(999) + .align 4 + +LL(21): + LFD alpha1, 0 * SIZE(Y) + FMUL alpha1, alpha, alpha1 + + mr AO1, A + mr X1, XX + + srawi. r0, M, 4 + mtspr CTR, r0 + ble LL(25) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + + LFD a5, 4 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + LFD y01, 0 * SIZE(X1) + LFD y02, 1 * SIZE(X1) + LFD y03, 2 * SIZE(X1) + LFD y04, 3 * SIZE(X1) + + LFD y05, 4 * SIZE(X1) + LFD y06, 5 * SIZE(X1) + LFD y07, 6 * SIZE(X1) + LFD y08, 7 * SIZE(X1) + + bdz LL(23) + .align 4 + +LL(22): + FMADD a1, alpha1, y01, a1 + FMADD a2, alpha1, y02, a2 + FMADD a3, alpha1, y03, a3 + FMADD a4, alpha1, y04, a4 + + FMADD a5, alpha1, y05, a5 + FMADD a6, alpha1, y06, a6 + FMADD a7, alpha1, y07, a7 + FMADD a8, alpha1, y08, a8 + + STFD a1, 0 * SIZE(AO1) + STFD a2, 1 * SIZE(AO1) + STFD a3, 2 * SIZE(AO1) + STFD a4, 3 * SIZE(AO1) + + LFD a1, 8 * SIZE(AO1) + LFD a2, 9 * SIZE(AO1) + LFD a3, 10 * SIZE(AO1) + LFD a4, 11 * SIZE(AO1) + + STFD a5, 4 * SIZE(AO1) + STFD a6, 5 * SIZE(AO1) + STFD a7, 6 * SIZE(AO1) + STFD a8, 7 * SIZE(AO1) + + LFD a5, 12 * SIZE(AO1) + LFD a6, 13 * SIZE(AO1) + LFD a7, 14 * SIZE(AO1) + LFD a8, 15 * SIZE(AO1) + + LFD y01, 8 * SIZE(X1) + LFD y02, 9 * SIZE(X1) + LFD y03, 10 * SIZE(X1) + LFD y04, 11 * SIZE(X1) + + LFD y05, 12 * SIZE(X1) + LFD y06, 13 * SIZE(X1) + LFD y07, 14 * SIZE(X1) + LFD y08, 15 * SIZE(X1) + + FMADD a1, alpha1, y01, a1 + FMADD a2, alpha1, y02, a2 + FMADD a3, alpha1, y03, a3 + FMADD a4, alpha1, y04, a4 + + FMADD a5, alpha1, y05, a5 + FMADD a6, alpha1, y06, a6 + FMADD a7, alpha1, y07, a7 + FMADD a8, alpha1, y08, a8 + + STFD a1, 8 * SIZE(AO1) + STFD a2, 9 * SIZE(AO1) + STFD a3, 10 * SIZE(AO1) + STFD a4, 11 * SIZE(AO1) + + LFD a1, 16 * SIZE(AO1) + LFD a2, 17 * SIZE(AO1) + LFD a3, 18 * SIZE(AO1) + LFD a4, 19 * SIZE(AO1) + + STFD a5, 12 * SIZE(AO1) + STFD a6, 13 * SIZE(AO1) + STFD a7, 14 * SIZE(AO1) + STFD a8, 15 * SIZE(AO1) + + LFD a5, 20 * SIZE(AO1) + LFD a6, 21 * SIZE(AO1) + LFD a7, 22 * SIZE(AO1) + LFD a8, 23 * SIZE(AO1) + + LFD y01, 16 * SIZE(X1) + LFD y02, 17 * SIZE(X1) + LFD y03, 18 * SIZE(X1) + LFD y04, 19 * SIZE(X1) + + LFD y05, 20 * SIZE(X1) + LFD y06, 21 * SIZE(X1) + LFD y07, 22 * SIZE(X1) + LFD y08, 23 * SIZE(X1) + + addi AO1, AO1, 16 * SIZE + addi X1, X1, 16 * SIZE + + DCBT(AO1, PREA) + DCBT(Y1, PREY) + + bdnz+ LL(22) + .align 4 + +LL(23): + FMADD a1, alpha1, y01, a1 + FMADD a2, alpha1, y02, a2 + FMADD a3, alpha1, y03, a3 + FMADD a4, alpha1, y04, a4 + + FMADD a5, alpha1, y05, a5 + FMADD a6, alpha1, y06, a6 + FMADD a7, alpha1, y07, a7 + FMADD a8, alpha1, y08, a8 + + STFD a1, 0 * SIZE(AO1) + STFD a2, 1 * SIZE(AO1) + STFD a3, 2 * SIZE(AO1) + STFD a4, 3 * SIZE(AO1) + + LFD 
a1, 8 * SIZE(AO1) + LFD a2, 9 * SIZE(AO1) + LFD a3, 10 * SIZE(AO1) + LFD a4, 11 * SIZE(AO1) + + STFD a5, 4 * SIZE(AO1) + STFD a6, 5 * SIZE(AO1) + STFD a7, 6 * SIZE(AO1) + STFD a8, 7 * SIZE(AO1) + + LFD a5, 12 * SIZE(AO1) + LFD a6, 13 * SIZE(AO1) + LFD a7, 14 * SIZE(AO1) + LFD a8, 15 * SIZE(AO1) + + LFD y01, 8 * SIZE(X1) + LFD y02, 9 * SIZE(X1) + LFD y03, 10 * SIZE(X1) + LFD y04, 11 * SIZE(X1) + + LFD y05, 12 * SIZE(X1) + LFD y06, 13 * SIZE(X1) + LFD y07, 14 * SIZE(X1) + LFD y08, 15 * SIZE(X1) + + FMADD a1, alpha1, y01, a1 + FMADD a2, alpha1, y02, a2 + FMADD a3, alpha1, y03, a3 + FMADD a4, alpha1, y04, a4 + + FMADD a5, alpha1, y05, a5 + FMADD a6, alpha1, y06, a6 + FMADD a7, alpha1, y07, a7 + FMADD a8, alpha1, y08, a8 + + STFD a1, 8 * SIZE(AO1) + STFD a2, 9 * SIZE(AO1) + STFD a3, 10 * SIZE(AO1) + STFD a4, 11 * SIZE(AO1) + + LFD a1, 16 * SIZE(AO1) + LFD a2, 17 * SIZE(AO1) + LFD a3, 18 * SIZE(AO1) + LFD a4, 19 * SIZE(AO1) + + STFD a5, 12 * SIZE(AO1) + STFD a6, 13 * SIZE(AO1) + STFD a7, 14 * SIZE(AO1) + STFD a8, 15 * SIZE(AO1) + + LFD a5, 20 * SIZE(AO1) + LFD a6, 21 * SIZE(AO1) + LFD a7, 22 * SIZE(AO1) + LFD a8, 23 * SIZE(AO1) + + addi AO1, AO1, 16 * SIZE + addi X1, X1, 16 * SIZE + .align 4 + +LL(25): + andi. r0, M, 15 + ble LL(999) + + andi. r0, M, 8 + ble LL(26) + + LFD y01, 0 * SIZE(X1) + LFD y02, 1 * SIZE(X1) + LFD y03, 2 * SIZE(X1) + LFD y04, 3 * SIZE(X1) + LFD y05, 4 * SIZE(X1) + LFD y06, 5 * SIZE(X1) + LFD y07, 6 * SIZE(X1) + LFD y08, 7 * SIZE(X1) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + FMADD a1, alpha1, y01, a1 + FMADD a2, alpha1, y02, a2 + FMADD a3, alpha1, y03, a3 + FMADD a4, alpha1, y04, a4 + + STFD a1, 0 * SIZE(AO1) + STFD a2, 1 * SIZE(AO1) + STFD a3, 2 * SIZE(AO1) + STFD a4, 3 * SIZE(AO1) + + FMADD a5, alpha1, y05, a5 + FMADD a6, alpha1, y06, a6 + FMADD a7, alpha1, y07, a7 + FMADD a8, alpha1, y08, a8 + + STFD a5, 4 * SIZE(AO1) + STFD a6, 5 * SIZE(AO1) + STFD a7, 6 * SIZE(AO1) + STFD a8, 7 * SIZE(AO1) + + addi AO1, AO1, 8 * SIZE + addi X1, X1, 8 * SIZE + .align 4 + +LL(26): + andi. r0, M, 4 + ble LL(27) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + + LFD y01, 0 * SIZE(X1) + LFD y02, 1 * SIZE(X1) + LFD y03, 2 * SIZE(X1) + LFD y04, 3 * SIZE(X1) + + FMADD a1, alpha1, y01, a1 + FMADD a2, alpha1, y02, a2 + FMADD a3, alpha1, y03, a3 + FMADD a4, alpha1, y04, a4 + + STFD a1, 0 * SIZE(AO1) + STFD a2, 1 * SIZE(AO1) + STFD a3, 2 * SIZE(AO1) + STFD a4, 3 * SIZE(AO1) + + addi AO1, AO1, 4 * SIZE + addi X1, X1, 4 * SIZE + .align 4 + +LL(27): + andi. r0, M, 2 + ble LL(28) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + + LFD y01, 0 * SIZE(X1) + LFD y02, 1 * SIZE(X1) + + FMADD a1, alpha1, y01, a1 + FMADD a2, alpha1, y02, a2 + + STFD a1, 0 * SIZE(AO1) + STFD a2, 1 * SIZE(AO1) + + addi AO1, AO1, 2 * SIZE + addi X1, X1, 2 * SIZE + .align 4 + +LL(28): + andi. 
r0, M, 1 + ble LL(999) + + LFD y01, 0 * SIZE(X1) + LFD a1, 0 * SIZE(AO1) + + FMADD a1, alpha1, y01, a1 + + STFD a1, 0 * SIZE(AO1) + .align 4 + +LL(999): + li r3, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r14, 144(SP) + ld r15, 152(SP) + ld r16, 160(SP) + ld r17, 168(SP) + ld r18, 176(SP) + ld r19, 184(SP) + ld r20, 192(SP) + ld r21, 200(SP) + ld r22, 208(SP) + ld r23, 216(SP) + ld r24, 224(SP) + ld r25, 232(SP) + ld r26, 240(SP) + ld r27, 248(SP) +#else + lwz r14, 144(SP) + lwz r15, 148(SP) + lwz r16, 152(SP) + lwz r17, 156(SP) + lwz r18, 160(SP) + lwz r19, 164(SP) + lwz r20, 168(SP) + lwz r21, 172(SP) + lwz r22, 176(SP) + lwz r23, 180(SP) + lwz r24, 184(SP) + lwz r25, 188(SP) + lwz r26, 192(SP) + lwz r27, 196(SP) +#endif + + addi SP, SP, STACKSIZE + blr + + EPILOGUE +#endif diff --git a/kernel/power/iamax.S b/kernel/power/iamax.S new file mode 100644 index 0000000000..cdc57fa382 --- /dev/null +++ b/kernel/power/iamax.S @@ -0,0 +1,802 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define RET r3 +#define X r4 +#define INCX r5 + +#define N r6 +#define NN r7 +#define XX r8 +#define PREA r9 + +#define FZERO f1 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(r3) + LDINT INCX, 0(INCX) +#else + mr N, r3 +#endif + li RET, 0 + mr NN, N + mr XX, X + + slwi INCX, INCX, BASE_SHIFT + + li PREA, L1_PREFETCHSIZE + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFD f1, 0 * SIZE(X) + add X, X, INCX + + fabs f0, f1 + fabs f2, f1 + fabs f3, f1 + fabs f4, f1 + fabs f5, f1 + fabs f6, f1 + fabs f7, f1 + fabs f1, f1 + + subi N, N, 1 + + cmpwi cr0, INCX, SIZE + bne- cr0, LL(100) + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(50) + + LFD f24, 0 * SIZE(X) + LFD f25, 1 * SIZE(X) + LFD f26, 2 * SIZE(X) + LFD f27, 3 * SIZE(X) + LFD f28, 4 * SIZE(X) + LFD f29, 5 * SIZE(X) + LFD f30, 6 * SIZE(X) + LFD f31, 7 * SIZE(X) + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFD f24, 8 * SIZE(X) + LFD f25, 9 * SIZE(X) + LFD f26, 10 * SIZE(X) + LFD f27, 11 * SIZE(X) + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFD f28, 12 * SIZE(X) + LFD f29, 13 * SIZE(X) + LFD f30, 14 * SIZE(X) + LFD f31, 15 * SIZE(X) + bdz LL(20) + .align 4 + +LL(10): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + LFD f24, 16 * SIZE(X) + LFD f25, 17 * SIZE(X) + LFD f26, 18 * SIZE(X) + LFD f27, 19 * SIZE(X) + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + LFD f28, 20 * SIZE(X) + LFD f29, 21 * SIZE(X) + LFD f30, 22 * SIZE(X) + LFD f31, 23 * SIZE(X) + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + LFD f24, 24 * SIZE(X) + LFD f25, 25 * SIZE(X) + LFD f26, 26 * SIZE(X) + LFD f27, 27 * SIZE(X) + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + LFD f28, 28 * SIZE(X) + LFD f29, 29 * SIZE(X) + LFD f30, 30 * SIZE(X) + LFD f31, 31 * SIZE(X) + +#ifndef POWER6 + L1_PREFETCH X, PREA +#endif + addi X, X, 16 * SIZE +#ifdef POWER6 + L1_PREFETCH X, PREA +#endif + bdnz LL(10) + .align 4 + +LL(20): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, 
f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + fsel f2, f18, f2, f10 + fsel f3, f19, f3, f11 + fsel f4, f20, f4, f12 + fsel f5, f21, f5, f13 + fsel f6, f22, f6, f14 + fsel f7, f23, f7, f15 + addi X, X, 16 * SIZE + .align 4 + +LL(50): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + addi X, X, 1 * SIZE + fabs f8, f8 + fsub f16, f1, f8 + fsel f1, f16, f1, f8 + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): + sub X, X, INCX + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(150) + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + bdz LL(120) + .align 4 + +LL(110): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + bdnz LL(110) + .align 4 + +LL(120): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + fsel f2, f18, f2, f10 + fsel f3, f19, f3, f11 + fsel f4, f20, f4, f12 + fsel f5, f21, f5, f13 + fsel 
f6, f22, f6, f14 + fsel f7, f23, f7, f15 + .align 4 + +LL(150): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + fabs f8, f8 + fsub f16, f1, f8 + fsel f1, f16, f1, f8 + bdnz LL(160) + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + fsub f10, f4, f5 + fsub f11, f6, f7 + + fsel f0, f8, f0, f1 + fsel f2, f9, f2, f3 + fsel f4, f10, f4, f5 + fsel f6, f11, f6, f7 + + fsub f8, f0, f2 + fsub f9, f4, f6 + fsel f0, f8, f0, f2 + fsel f4, f9, f4, f6 + + fsub f8, f0, f4 + fsel f1, f8, f0, f4 + .align 4 + +LL(1000): + cmpwi cr0, INCX, SIZE + bne- cr0, LL(1100) + + srawi. r0, NN, 3 + mtspr CTR, r0 + beq- cr0, LL(1050) + + LFD f24, 0 * SIZE(XX) + LFD f25, 1 * SIZE(XX) + LFD f26, 2 * SIZE(XX) + LFD f27, 3 * SIZE(XX) + LFD f28, 4 * SIZE(XX) + LFD f29, 5 * SIZE(XX) + LFD f30, 6 * SIZE(XX) + LFD f31, 7 * SIZE(XX) + bdz LL(1020) + .align 4 + +LL(1010): + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFD f24, 8 * SIZE(XX) + LFD f25, 9 * SIZE(XX) + LFD f26, 10 * SIZE(XX) + LFD f27, 11 * SIZE(XX) + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFD f28, 12 * SIZE(XX) + LFD f29, 13 * SIZE(XX) + LFD f30, 14 * SIZE(XX) + LFD f31, 15 * SIZE(XX) + + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f9 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f10 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f11 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f12 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f13 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f14 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f15 + beq cr0, LL(9999) + + addi XX, XX, 8 * SIZE + bdnz LL(1010) + .align 4 + +LL(1020): + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + addi XX, XX, 8 * SIZE + + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f9 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f10 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f11 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f12 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f13 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f14 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f15 + beq cr0, LL(9999) + .align 4 + +LL(1050): + andi. r0, NN, 7 + mtspr CTR, r0 + beq LL(9999) + .align 4 + +LL(1060): + LFD f8, 0 * SIZE(XX) + addi XX, XX, 1 * SIZE + fabs f8, f8 + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + bdnz LL(1060) + b LL(9999) + .align 4 + +LL(1100): + sub XX, XX, INCX + + srawi. 
r0, NN, 3 + mtspr CTR, r0 + beq- LL(1150) + + LFDUX f24, XX, INCX + LFDUX f25, XX, INCX + LFDUX f26, XX, INCX + LFDUX f27, XX, INCX + LFDUX f28, XX, INCX + LFDUX f29, XX, INCX + LFDUX f30, XX, INCX + LFDUX f31, XX, INCX + bdz LL(1120) + .align 4 + +LL(1110): + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFDUX f24, XX, INCX + LFDUX f25, XX, INCX + LFDUX f26, XX, INCX + LFDUX f27, XX, INCX + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDUX f28, XX, INCX + LFDUX f29, XX, INCX + LFDUX f30, XX, INCX + LFDUX f31, XX, INCX + + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f9 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f10 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f11 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f12 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f13 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f14 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f15 + beq cr0, LL(9999) + + bdnz LL(1110) + .align 4 + +LL(1120): + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f9 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f10 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f11 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f12 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f13 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f14 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f15 + beq cr0, LL(9999) + .align 4 + +LL(1150): + andi. r0, NN, 7 + mtspr CTR, r0 + beq LL(9999) + .align 4 + +LL(1160): + LFDUX f8, XX, INCX + fabs f8, f8 + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + bdnz LL(1160) + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/iamax_hummer.S b/kernel/power/iamax_hummer.S new file mode 100644 index 0000000000..9b23709707 --- /dev/null +++ b/kernel/power/iamax_hummer.S @@ -0,0 +1,1015 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define INCX2 r6 +#define X2 r7 + +#define XX r8 +#define RET r9 +#define NN r10 + +#define C1 f1 +#define C2 f0 +#define C3 f2 +#define C4 f3 + +#define A1 f4 +#define A2 f5 +#define A3 f6 +#define A4 f7 +#define A5 f8 +#define A6 f9 +#define A7 f10 +#define A8 f11 + +#define F1 f12 +#define F2 f13 +#define F3 f14 +#define F4 f15 +#define F5 f16 +#define F6 f17 +#define F7 f18 +#define F8 f19 + +#define T1 f20 +#define T2 f21 +#define T3 f22 +#define T4 f23 +#define T5 f24 +#define T6 f25 +#define T7 f26 +#define T8 f27 + + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + + stfpdux f16, SP, r10 + stfpdux f17, SP, r10 + stfpdux f18, SP, r10 + stfpdux f19, SP, r10 + + stfpdux f20, SP, r10 + stfpdux f21, SP, r10 + stfpdux f22, SP, r10 + stfpdux f23, SP, r10 + + stfpdux f24, SP, r10 + stfpdux f25, SP, r10 + stfpdux f26, SP, r10 + stfpdux f27, SP, r10 + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, BASE_SHIFT + add INCX2, INCX, INCX + + li RET, 0 + cmpwi cr0, N, 0 + ble LL(999) + mr NN, N + cmpwi cr0, INCX, 0 + ble LL(999) + + LFD C1, 0 * SIZE(X) + + addi N, N, -1 + cmpwi cr0, N, 0 + li RET, 1 + fabs C1, C1 + ble LL(999) + + fsmfp C1, C1 + mr XX, X + fpmr C2, C1 + add X, X, INCX + fpmr C3, C1 + fpmr C4, C1 + + cmpwi cr0, INCX, SIZE + bne LL(100) + + andi. r0, X, 2 * SIZE - 1 + beq LL(05) + + LFD C2, 0 * SIZE(X) + add X, X, INCX + + addi N, N, -1 + cmpwi cr0, N, 0 + fabs C2, C2 + ble LL(20) + .align 4 + +LL(05): + sub X, X, INCX2 + + srawi. 
r0, N, 4 + mtspr CTR, r0 + beq- LL(15) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + LFPDUX A5, X, INCX2 + fpabs T1, A1 + LFPDUX A6, X, INCX2 + fpabs T2, A2 + LFPDUX A7, X, INCX2 + fpabs T3, A3 + LFPDUX A8, X, INCX2 + fpabs T4, A4 + bdz LL(13) + .align 4 + +LL(12): + fpsub F1, C1, T1 + LFPDUX A1, X, INCX2 + fpsub F2, C2, T2 + LFPDUX A2, X, INCX2 + fpsub F3, C3, T3 + LFPDUX A3, X, INCX2 + fpsub F4, C4, T4 + LFPDUX A4, X, INCX2 + + fpabs T5, A5 + fpabs T6, A6 + fpabs T7, A7 + fpabs T8, A8 + + fpsel C1, F1, C1, T1 + LFPDUX A5, X, INCX2 + fpsel C2, F2, C2, T2 + LFPDUX A6, X, INCX2 + fpsel C3, F3, C3, T3 + LFPDUX A7, X, INCX2 + fpsel C4, F4, C4, T4 + LFPDUX A8, X, INCX2 + + fpsub F5, C1, T5 + fpsub F6, C2, T6 + fpsub F7, C3, T7 + fpsub F8, C4, T8 + + fpabs T1, A1 + fpabs T2, A2 + fpabs T3, A3 + fpabs T4, A4 + + fpsel C1, F5, C1, T5 + fpsel C2, F6, C2, T6 + fpsel C3, F7, C3, T7 + fpsel C4, F8, C4, T8 + bdnz LL(12) + .align 4 + +LL(13): + fpabs T5, A5 + fpabs T6, A6 + fpabs T7, A7 + fpabs T8, A8 + + fpsub F1, C1, T1 + fpsub F2, C2, T2 + fpsub F3, C3, T3 + fpsub F4, C4, T4 + + fpsel C1, F1, C1, T1 + fpsel C2, F2, C2, T2 + fpsel C3, F3, C3, T3 + fpsel C4, F4, C4, T4 + + fpsub F5, C1, T5 + fpsub F6, C2, T6 + fpsub F7, C3, T7 + fpsub F8, C4, T8 + + fpsel C1, F5, C1, T5 + fpsel C2, F6, C2, T6 + fpsel C3, F7, C3, T7 + fpsel C4, F8, C4, T8 + .align 4 + +LL(15): + andi. r0, N, 15 + beq LL(20) + + andi. r0, N, 8 + beq LL(16) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + fpabs A1, A1 + fpabs A2, A2 + fpabs A3, A3 + fpabs A4, A4 + + fpsub F1, C1, A1 + fpsub F2, C2, A2 + fpsub F3, C3, A3 + fpsub F4, C4, A4 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + .align 4 + +LL(16): + andi. r0, N, 4 + beq LL(17) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + + fpabs A1, A1 + fpabs A2, A2 + + fpsub F1, C1, A1 + fpsub F2, C2, A2 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + .align 4 + +LL(17): + andi. r0, N, 2 + beq LL(18) + + LFPDUX A1, X, INCX2 + fpabs A1, A1 + fpsub F1, C1, A1 + fpsel C1, F1, C1, A1 + .align 4 + +LL(18): + andi. r0, N, 1 + beq LL(20) + + LFDUX A1, X, INCX2 + fabs A1, A1 + fsub F1, C1, A1 + fsel C1, F1, C1, A1 + .align 4 + +LL(20): + fpsub F1, C1, C2 + fpsub F2, C3, C4 + + fpsel C1, F1, C1, C2 + fpsel C3, F2, C3, C4 + + fpsub F1, C1, C3 + fpsel C1, F1, C1, C3 + + fsmtp C2, C1 + + fsub F1, C1, C2 + fsel C1, F1, C1, C2 + li RET, 0 + + fsmfp C1, C1 + andi. r0, XX, 2 * SIZE - 1 + beq LL(21) + + LFD A1, 0 * SIZE(XX) + add XX, XX, INCX + + addi NN, NN, -1 + addi RET, RET, 1 + + fabs A1, A1 + fcmpu cr0, C1, A1 + beq cr0, LL(999) + .align 4 + +LL(21): + sub XX, XX, INCX2 + + srawi. 
r0, NN, 4 + mtspr CTR, r0 + beq- LL(25) + + LFPDUX A1, XX, INCX2 + LFPDUX A2, XX, INCX2 + LFPDUX A3, XX, INCX2 + LFPDUX A4, XX, INCX2 + + LFPDUX A5, XX, INCX2 + LFPDUX A6, XX, INCX2 + LFPDUX A7, XX, INCX2 + LFPDUX A8, XX, INCX2 + + fpabs T1, A1 + fpabs T2, A2 + fpabs T3, A3 + fpabs T4, A4 + + fpabs T5, A5 + fpabs T6, A6 + fpabs T7, A7 + fpabs T8, A8 + + bdz LL(23) + .align 4 + +LL(22): + addi RET, RET, 1 + fcmpu cr0, C1, T1 + LFPDUX A1, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T1 + LFPDUX A2, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T2 + LFPDUX A3, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T2 + LFPDUX A4, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T3 + LFPDUX A5, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T3 + LFPDUX A6, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T4 + LFPDUX A7, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T4 + LFPDUX A8, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T5 + fpabs T1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T5 + fpabs T2, A2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T6 + fpabs T3, A3 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T6 + fpabs T4, A4 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T7 + fpabs T5, A5 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T7 + fpabs T6, A6 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T8 + fpabs T7, A7 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T8 + fpabs T8, A8 + beq cr0, LL(999) + bdnz LL(22) + .align 4 + +LL(23): + addi RET, RET, 1 + fcmpu cr0, C1, T1 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T2 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T3 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T3 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T4 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T4 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T5 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T5 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T6 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T6 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T7 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T7 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T8 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T8 + beq cr0, LL(999) + .align 4 + +LL(25): + andi. r0, NN, 8 + beq LL(26) + + LFPDUX A1, XX, INCX2 + LFPDUX A2, XX, INCX2 + LFPDUX A3, XX, INCX2 + LFPDUX A4, XX, INCX2 + + fpabs T1, A1 + fpabs T2, A2 + fpabs T3, A3 + fpabs T4, A4 + + addi RET, RET, 1 + fcmpu cr0, C1, T1 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T2 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T3 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T3 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T4 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T4 + beq cr0, LL(999) + .align 4 + +LL(26): + andi. 
r0, NN, 4 + beq LL(27) + + LFPDUX A1, XX, INCX2 + LFPDUX A2, XX, INCX2 + + fpabs T1, A1 + fpabs T2, A2 + + addi RET, RET, 1 + fcmpu cr0, C1, T1 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T2 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T2 + beq cr0, LL(999) + .align 4 + +LL(27): + andi. r0, NN, 2 + beq LL(28) + + LFPDUX A1, XX, INCX2 + + fpabs T1, A1 + + addi RET, RET, 1 + fcmpu cr0, C1, T1 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T1 + beq cr0, LL(999) + .align 4 + +LL(28): + andi. r0, NN, 1 + beq LL(999) + addi RET, RET, 1 + b LL(999) + .align 4 + +LL(100): + sub X, X, INCX + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(105) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + LFSDUX A1, X, INCX + LFSDUX A2, X, INCX + LFSDUX A3, X, INCX + LFSDUX A4, X, INCX + + LFDUX A5, X, INCX + LFDUX A6, X, INCX + LFDUX A7, X, INCX + LFDUX A8, X, INCX + LFSDUX A5, X, INCX + fpabs T1, A1 + LFSDUX A6, X, INCX + fpabs T2, A2 + LFSDUX A7, X, INCX + fpabs T3, A3 + LFSDUX A8, X, INCX + fpabs T4, A4 + bdz LL(103) + .align 4 + +LL(102): + fpsub F1, C1, T1 + LFDUX A1, X, INCX + fpsub F2, C2, T2 + LFDUX A2, X, INCX + fpsub F3, C3, T3 + LFDUX A3, X, INCX + fpsub F4, C4, T4 + LFDUX A4, X, INCX + + fpabs T5, A5 + LFSDUX A1, X, INCX + fpabs T6, A6 + LFSDUX A2, X, INCX + fpabs T7, A7 + LFSDUX A3, X, INCX + fpabs T8, A8 + LFSDUX A4, X, INCX + + fpsel C1, F1, C1, T1 + LFDUX A5, X, INCX + fpsel C2, F2, C2, T2 + LFDUX A6, X, INCX + fpsel C3, F3, C3, T3 + LFDUX A7, X, INCX + fpsel C4, F4, C4, T4 + LFDUX A8, X, INCX + + fpsub F5, C1, T5 + LFSDUX A5, X, INCX + fpsub F6, C2, T6 + LFSDUX A6, X, INCX + fpsub F7, C3, T7 + LFSDUX A7, X, INCX + fpsub F8, C4, T8 + LFSDUX A8, X, INCX + + fpabs T1, A1 + fpabs T2, A2 + fpabs T3, A3 + fpabs T4, A4 + + fpsel C1, F5, C1, T5 + fpsel C2, F6, C2, T6 + fpsel C3, F7, C3, T7 + fpsel C4, F8, C4, T8 + bdnz LL(102) + .align 4 + +LL(103): + fpabs T5, A5 + fpabs T6, A6 + fpabs T7, A7 + fpabs T8, A8 + + fpsub F1, C1, T1 + fpsub F2, C2, T2 + fpsub F3, C3, T3 + fpsub F4, C4, T4 + + fpsel C1, F1, C1, T1 + fpsel C2, F2, C2, T2 + fpsel C3, F3, C3, T3 + fpsel C4, F4, C4, T4 + + fpsub F5, C1, T5 + fpsub F6, C2, T6 + fpsub F7, C3, T7 + fpsub F8, C4, T8 + + fpsel C1, F5, C1, T5 + fpsel C2, F6, C2, T6 + fpsel C3, F7, C3, T7 + fpsel C4, F8, C4, T8 + .align 4 + +LL(105): + andi. r0, N, 15 + beq LL(120) + + andi. r0, N, 8 + beq LL(106) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + LFSDUX A1, X, INCX + LFSDUX A2, X, INCX + LFSDUX A3, X, INCX + LFSDUX A4, X, INCX + + fpabs A1, A1 + fpabs A2, A2 + fpabs A3, A3 + fpabs A4, A4 + + fpsub F1, C1, A1 + fpsub F2, C2, A2 + fpsub F3, C3, A3 + fpsub F4, C4, A4 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + .align 4 + +LL(106): + andi. r0, N, 4 + beq LL(107) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + + fabs A1, A1 + fabs A2, A2 + fabs A3, A3 + fabs A4, A4 + + fsub F1, C1, A1 + fsub F2, C2, A2 + fsub F3, C3, A3 + fsub F4, C4, A4 + + fsel C1, F1, C1, A1 + fsel C2, F2, C2, A2 + fsel C3, F3, C3, A3 + fsel C4, F4, C4, A4 + .align 4 + +LL(107): + andi. r0, N, 2 + beq LL(108) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + fabs A1, A1 + fabs A2, A2 + fsub F1, C1, A1 + fsub F2, C2, A2 + fsel C1, F1, C1, A1 + fsel C2, F2, C2, A2 + .align 4 + +LL(108): + andi. 
r0, N, 1 + beq LL(120) + + LFDUX A1, X, INCX + fabs A1, A1 + fsub F1, C1, A1 + fsel C1, F1, C1, A1 + .align 4 + +LL(120): + fpsub F1, C1, C2 + fpsub F2, C3, C4 + + fpsel C1, F1, C1, C2 + fpsel C3, F2, C3, C4 + + fpsub F1, C1, C3 + fpsel C1, F1, C1, C3 + + fsmtp C2, C1 + + fsub F1, C1, C2 + fsel C1, F1, C1, C2 + + li RET, 0 + + sub XX, XX, INCX + + srawi. r0, NN, 3 + mtspr CTR, r0 + beq- LL(126) + + LFDUX A1, XX, INCX + LFDUX A2, XX, INCX + LFDUX A3, XX, INCX + LFDUX A4, XX, INCX + + fabs T1, A1 + fabs T2, A2 + + LFDUX A5, XX, INCX + LFDUX A6, XX, INCX + LFDUX A7, XX, INCX + LFDUX A8, XX, INCX + bdz LL(123) + .align 4 + +LL(122): + LFDUX A1, XX, INCX + fabs T3, A3 + addi RET, RET, 1 + fcmpu cr0, C1, T1 + beq cr0, LL(999) + + LFDUX A2, XX, INCX + fabs T4, A4 + addi RET, RET, 1 + fcmpu cr0, C1, T2 + beq cr0, LL(999) + + LFDUX A3, XX, INCX + fabs T1, A5 + addi RET, RET, 1 + fcmpu cr0, C1, T3 + beq cr0, LL(999) + + LFDUX A4, XX, INCX + fabs T2, A6 + addi RET, RET, 1 + fcmpu cr0, C1, T4 + beq cr0, LL(999) + + LFDUX A5, XX, INCX + fabs T3, A7 + addi RET, RET, 1 + fcmpu cr0, C1, T1 + beq cr0, LL(999) + + LFDUX A6, XX, INCX + fabs T4, A8 + addi RET, RET, 1 + fcmpu cr0, C1, T2 + beq cr0, LL(999) + + LFDUX A7, XX, INCX + fabs T1, A1 + addi RET, RET, 1 + fcmpu cr0, C1, T3 + beq cr0, LL(999) + + LFDUX A8, XX, INCX + fabs T2, A2 + addi RET, RET, 1 + fcmpu cr0, C1, T4 + beq cr0, LL(999) + bdnz LL(122) + .align 4 + +LL(123): + fabs T3, A3 + fabs T4, A4 + + addi RET, RET, 1 + fcmpu cr0, C1, T1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T2 + beq cr0, LL(999) + + fabs T1, A5 + addi RET, RET, 1 + fcmpu cr0, C1, T3 + beq cr0, LL(999) + + fabs T2, A6 + addi RET, RET, 1 + fcmpu cr0, C1, T4 + beq cr0, LL(999) + + fabs T3, A7 + addi RET, RET, 1 + fcmpu cr0, C1, T1 + beq cr0, LL(999) + + fabs T4, A8 + addi RET, RET, 1 + fcmpu cr0, C1, T2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T3 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T4 + beq cr0, LL(999) + .align 4 + +LL(126): + andi. r0, NN, 4 + beq LL(127) + + LFDUX A1, XX, INCX + LFDUX A2, XX, INCX + LFDUX A3, XX, INCX + LFDUX A4, XX, INCX + + fabs T1, A1 + fabs T2, A2 + fabs T3, A3 + fabs T4, A4 + + addi RET, RET, 1 + fcmpu cr0, C1, T1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T3 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T4 + beq cr0, LL(999) + .align 4 + +LL(127): + andi. r0, NN, 2 + beq LL(128) + + LFDUX A1, XX, INCX + LFDUX A2, XX, INCX + + fabs T1, A1 + fabs T2, A2 + + addi RET, RET, 1 + fcmpu cr0, C1, T1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T2 + beq cr0, LL(999) + .align 4 + +LL(128): + addi RET, RET, 1 + .align 4 + +LL(999): + li r10, 16 + addi SP, SP, -16 + mr r3, RET + + lfpdux f27, SP, r10 + lfpdux f26, SP, r10 + lfpdux f25, SP, r10 + lfpdux f24, SP, r10 + + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + lfpdux f20, SP, r10 + + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + blr + + EPILOGUE diff --git a/kernel/power/iamax_ppc440.S b/kernel/power/iamax_ppc440.S new file mode 100644 index 0000000000..11ea4cb742 --- /dev/null +++ b/kernel/power/iamax_ppc440.S @@ -0,0 +1,482 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define RET r3 +#define X r4 +#define INCX r5 + +#define N r6 +#define NN r7 +#define XX r8 +#define PRE r9 + +#define FZERO f1 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(r3) + LDINT INCX, 0(INCX) +#else + mr N, r3 +#endif + li RET, 0 + + slwi INCX, INCX, BASE_SHIFT + sub X, X, INCX + li PRE, 3 * 16 * SIZE + + mr NN, N + mr XX, X + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFDUX f1, X, INCX + + fabs f0, f1 + fabs f2, f1 + fabs f3, f1 + fabs f4, f1 + fabs f5, f1 + subi N, N, 1 + fabs f6, f1 + srawi. 
r0, N, 4 + fabs f7, f1 + mtspr CTR, r0 + fabs f1, f1 + beq- LL(150) + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + + fabs f8, f24 + LFDUX f24, X, INCX + fabs f9, f25 + LFDUX f25, X, INCX + fabs f10, f26 + LFDUX f26, X, INCX + fabs f11, f27 + LFDUX f27, X, INCX + + fabs f12, f28 + LFDUX f28, X, INCX + fabs f13, f29 + LFDUX f29, X, INCX + fabs f14, f30 + LFDUX f30, X, INCX + fabs f15, f31 + LFDUX f31, X, INCX + bdz LL(120) + .align 4 + +LL(110): + fsub f16, f0, f8 +#ifdef PPCG4 + dcbt X, PRE +#endif + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + LFDUX f24, X, INCX + fsel f1, f17, f1, f9 + fabs f9, f25 + LFDUX f25, X, INCX + fsel f2, f18, f2, f10 + fabs f10, f26 + LFDUX f26, X, INCX + fsel f3, f19, f3, f11 + fabs f11, f27 + LFDUX f27, X, INCX + + fsel f4, f20, f4, f12 +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PRE +#endif + fabs f12, f28 + LFDUX f28, X, INCX + fsel f5, f21, f5, f13 + fabs f13, f29 + LFDUX f29, X, INCX + fsel f6, f22, f6, f14 + fabs f14, f30 + LFDUX f30, X, INCX + fsel f7, f23, f7, f15 + fabs f15, f31 + LFDUX f31, X, INCX + + fsub f16, f0, f8 +#ifdef PPCG4 + dcbt X, PRE +#endif + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + LFDUX f24, X, INCX + fsel f1, f17, f1, f9 + fabs f9, f25 + LFDUX f25, X, INCX + fsel f2, f18, f2, f10 + fabs f10, f26 + LFDUX f26, X, INCX + fsel f3, f19, f3, f11 + fabs f11, f27 + LFDUX f27, X, INCX + + fsel f4, f20, f4, f12 +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PRE +#endif + fabs f12, f28 + LFDUX f28, X, INCX + fsel f5, f21, f5, f13 + fabs f13, f29 + LFDUX f29, X, INCX + fsel f6, f22, f6, f14 + fabs f14, f30 + LFDUX f30, X, INCX + fsel f7, f23, f7, f15 + fabs f15, f31 + LFDUX f31, X, INCX + bdnz LL(110) + .align 4 + +LL(120): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + fsel f2, f18, f2, f10 + fsel f3, f19, f3, f11 + fsel f4, f20, f4, f12 + fsel f5, f21, f5, f13 + fsel f6, f22, f6, f14 + fsel f7, f23, f7, f15 + .align 4 + +LL(150): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + fabs f8, f8 + fsub f16, f1, f8 + fsel f1, f16, f1, f8 + bdnz LL(160) + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + fsub f10, f4, f5 + fsub f11, f6, f7 + + fsel f0, f8, f0, f1 + fsel f2, f9, f2, f3 + fsel f4, f10, f4, f5 + fsel f6, f11, f6, f7 + + fsub f8, f0, f2 + fsub f9, f4, f6 + fsel f0, f8, f0, f2 + fsel f4, f9, f4, f6 + + fsub f8, f0, f4 + fsel f1, f8, f0, f4 + .align 4 + +LL(1000): + srawi. 
r0, NN, 3 + mtspr CTR, r0 + beq- LL(1150) + + LFDUX f24, XX, INCX + LFDUX f25, XX, INCX + LFDUX f26, XX, INCX + LFDUX f27, XX, INCX + LFDUX f28, XX, INCX + LFDUX f29, XX, INCX + LFDUX f30, XX, INCX + LFDUX f31, XX, INCX + bdz LL(1120) + .align 4 + +LL(1110): + fabs f8, f24 + LFDUX f24, XX, INCX + fabs f9, f25 + LFDUX f25, XX, INCX + fabs f10, f26 + LFDUX f26, XX, INCX + fabs f11, f27 + LFDUX f27, XX, INCX + +#ifdef PPCG4 + dcbt XX, PRE +#endif + + fabs f12, f28 + LFDUX f28, XX, INCX + fabs f13, f29 + LFDUX f29, XX, INCX + + fabs f14, f30 + LFDUX f30, XX, INCX + fabs f15, f31 + LFDUX f31, XX, INCX + +#if defined(PPCG4) && defined(DOUBLE) + dcbt XX, PRE +#endif + + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f9 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f10 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f11 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f12 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f13 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f14 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f15 + beq cr0, LL(9999) + + bdnz LL(1110) + .align 4 + +LL(1120): + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f9 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f10 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f11 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f12 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f13 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f14 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f15 + beq cr0, LL(9999) + .align 4 + +LL(1150): + andi. r0, NN, 7 + mtspr CTR, r0 + beq LL(9999) + .align 4 + +LL(1160): + LFDUX f8, XX, INCX + fabs f8, f8 + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + bdnz LL(1160) + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/iamin.S b/kernel/power/iamin.S new file mode 100644 index 0000000000..c3dbb848a6 --- /dev/null +++ b/kernel/power/iamin.S @@ -0,0 +1,803 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define RET r3 +#define X r4 +#define INCX r5 + +#define N r6 +#define NN r7 +#define XX r8 +#define PREA r9 + +#define FZERO f1 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(r3) + LDINT INCX, 0(INCX) +#else + mr N, r3 +#endif + li RET, 0 + mr NN, N + mr XX, X + + slwi INCX, INCX, BASE_SHIFT + + li PREA, L1_PREFETCHSIZE + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFD f1, 0 * SIZE(X) + add X, X, INCX + + fabs f0, f1 + fabs f2, f1 + fabs f3, f1 + fabs f4, f1 + fabs f5, f1 + fabs f6, f1 + fabs f7, f1 + fabs f1, f1 + + subi N, N, 1 + + cmpwi cr0, INCX, SIZE + bne- cr0, LL(100) + + srawi. 
r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(50) + + LFD f24, 0 * SIZE(X) + LFD f25, 1 * SIZE(X) + LFD f26, 2 * SIZE(X) + LFD f27, 3 * SIZE(X) + LFD f28, 4 * SIZE(X) + LFD f29, 5 * SIZE(X) + LFD f30, 6 * SIZE(X) + LFD f31, 7 * SIZE(X) + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFD f24, 8 * SIZE(X) + LFD f25, 9 * SIZE(X) + LFD f26, 10 * SIZE(X) + LFD f27, 11 * SIZE(X) + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFD f28, 12 * SIZE(X) + LFD f29, 13 * SIZE(X) + LFD f30, 14 * SIZE(X) + LFD f31, 15 * SIZE(X) + bdz LL(20) + .align 4 + +LL(10): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fabs f8, f24 + fsel f1, f17, f9, f1 + fabs f9, f25 + fsel f2, f18, f10, f2 + fabs f10, f26 + fsel f3, f19, f11, f3 + fabs f11, f27 + + LFD f24, 16 * SIZE(X) + LFD f25, 17 * SIZE(X) + LFD f26, 18 * SIZE(X) + LFD f27, 19 * SIZE(X) + + fsel f4, f20, f12, f4 + fabs f12, f28 + fsel f5, f21, f13, f5 + fabs f13, f29 + fsel f6, f22, f14, f6 + fabs f14, f30 + fsel f7, f23, f15, f7 + fabs f15, f31 + + LFD f28, 20 * SIZE(X) + LFD f29, 21 * SIZE(X) + LFD f30, 22 * SIZE(X) + LFD f31, 23 * SIZE(X) + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fabs f8, f24 + fsel f1, f17, f9, f1 + fabs f9, f25 + fsel f2, f18, f10, f2 + fabs f10, f26 + fsel f3, f19, f11, f3 + fabs f11, f27 + + LFD f24, 24 * SIZE(X) + LFD f25, 25 * SIZE(X) + LFD f26, 26 * SIZE(X) + LFD f27, 27 * SIZE(X) + + fsel f4, f20, f12, f4 + fabs f12, f28 + fsel f5, f21, f13, f5 + fabs f13, f29 + fsel f6, f22, f14, f6 + fabs f14, f30 + fsel f7, f23, f15, f7 + fabs f15, f31 + + LFD f28, 28 * SIZE(X) + LFD f29, 29 * SIZE(X) + LFD f30, 30 * SIZE(X) + LFD f31, 31 * SIZE(X) + +#ifndef POWER6 + L1_PREFETCH X, PREA +#endif + addi X, X, 16 * SIZE +#ifdef POWER6 + L1_PREFETCH X, PREA +#endif + + bdnz LL(10) + .align 4 + +LL(20): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fabs f8, f24 + fsel f1, f17, f9, f1 + fabs f9, f25 + fsel f2, f18, f10, f2 + fabs f10, f26 + fsel f3, f19, f11, f3 + fabs f11, f27 + + fsel f4, f20, f12, f4 + fabs f12, f28 + fsel f5, f21, f13, f5 + fabs f13, f29 + fsel f6, f22, f14, f6 + fabs f14, f30 + fsel f7, f23, f15, f7 + fabs f15, f31 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fsel f1, f17, f9, f1 + fsel f2, f18, f10, f2 + fsel f3, f19, f11, f3 + fsel f4, f20, f12, f4 + fsel f5, f21, f13, f5 + fsel f6, f22, f14, f6 + fsel f7, f23, f15, f7 + addi X, X, 16 * SIZE + .align 4 + +LL(50): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + addi X, X, 1 * SIZE + fabs f8, f8 + fsub f16, f1, f8 + fsel f1, f16, f8, f1 + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): + sub X, X, INCX + + srawi. 
r0, N, 4 + mtspr CTR, r0 + beq- LL(150) + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + bdz LL(120) + .align 4 + +LL(110): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fabs f8, f24 + fsel f1, f17, f9, f1 + fabs f9, f25 + fsel f2, f18, f10, f2 + fabs f10, f26 + fsel f3, f19, f11, f3 + fabs f11, f27 + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + + fsel f4, f20, f12, f4 + fabs f12, f28 + fsel f5, f21, f13, f5 + fabs f13, f29 + fsel f6, f22, f14, f6 + fabs f14, f30 + fsel f7, f23, f15, f7 + fabs f15, f31 + + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fabs f8, f24 + fsel f1, f17, f9, f1 + fabs f9, f25 + fsel f2, f18, f10, f2 + fabs f10, f26 + fsel f3, f19, f11, f3 + fabs f11, f27 + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + + fsel f4, f20, f12, f4 + fabs f12, f28 + fsel f5, f21, f13, f5 + fabs f13, f29 + fsel f6, f22, f14, f6 + fabs f14, f30 + fsel f7, f23, f15, f7 + fabs f15, f31 + + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + bdnz LL(110) + .align 4 + +LL(120): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fabs f8, f24 + fsel f1, f17, f9, f1 + fabs f9, f25 + fsel f2, f18, f10, f2 + fabs f10, f26 + fsel f3, f19, f11, f3 + fabs f11, f27 + + fsel f4, f20, f12, f4 + fabs f12, f28 + fsel f5, f21, f13, f5 + fabs f13, f29 + fsel f6, f22, f14, f6 + fabs f14, f30 + fsel f7, f23, f15, f7 + fabs f15, f31 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fsel f1, f17, f9, f1 + fsel f2, f18, f10, f2 + fsel f3, f19, f11, f3 + fsel f4, f20, f12, f4 + fsel f5, f21, f13, f5 + fsel f6, f22, f14, f6 + fsel f7, f23, f15, f7 + .align 4 + +LL(150): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + fabs f8, f8 + fsub f16, f1, f8 + fsel f1, f16, f8, f1 + bdnz LL(160) + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + fsub f10, f4, f5 + fsub f11, f6, f7 + + fsel f0, f8, f1, f0 + fsel f2, f9, f3, f2 + fsel f4, f10, f5, f4 + fsel f6, f11, f7, f6 + + fsub f8, f0, f2 + fsub f9, f4, f6 + fsel f0, f8, f2, f0 + fsel f4, f9, f6, f4 + + fsub f8, f0, f4 + fsel f1, f8, f4, f0 + .align 4 + +LL(1000): + cmpwi cr0, INCX, SIZE + bne- cr0, LL(1100) + + srawi. 
r0, NN, 3 + mtspr CTR, r0 + beq- cr0, LL(1050) + + LFD f24, 0 * SIZE(XX) + LFD f25, 1 * SIZE(XX) + LFD f26, 2 * SIZE(XX) + LFD f27, 3 * SIZE(XX) + LFD f28, 4 * SIZE(XX) + LFD f29, 5 * SIZE(XX) + LFD f30, 6 * SIZE(XX) + LFD f31, 7 * SIZE(XX) + bdz LL(1020) + .align 4 + +LL(1010): + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFD f24, 8 * SIZE(XX) + LFD f25, 9 * SIZE(XX) + LFD f26, 10 * SIZE(XX) + LFD f27, 11 * SIZE(XX) + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFD f28, 12 * SIZE(XX) + LFD f29, 13 * SIZE(XX) + LFD f30, 14 * SIZE(XX) + LFD f31, 15 * SIZE(XX) + + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f9 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f10 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f11 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f12 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f13 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f14 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f15 + beq cr0, LL(9999) + + addi XX, XX, 8 * SIZE + bdnz LL(1010) + .align 4 + +LL(1020): + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + addi XX, XX, 8 * SIZE + + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f9 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f10 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f11 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f12 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f13 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f14 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f15 + beq cr0, LL(9999) + .align 4 + +LL(1050): + andi. r0, NN, 7 + mtspr CTR, r0 + beq LL(9999) + .align 4 + +LL(1060): + LFD f8, 0 * SIZE(XX) + addi XX, XX, 1 * SIZE + fabs f8, f8 + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + bdnz LL(1060) + b LL(9999) + .align 4 + +LL(1100): + sub XX, XX, INCX + + srawi. 
r0, NN, 3 + mtspr CTR, r0 + beq- LL(1150) + + LFDUX f24, XX, INCX + LFDUX f25, XX, INCX + LFDUX f26, XX, INCX + LFDUX f27, XX, INCX + LFDUX f28, XX, INCX + LFDUX f29, XX, INCX + LFDUX f30, XX, INCX + LFDUX f31, XX, INCX + bdz LL(1120) + .align 4 + +LL(1110): + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFDUX f24, XX, INCX + LFDUX f25, XX, INCX + LFDUX f26, XX, INCX + LFDUX f27, XX, INCX + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDUX f28, XX, INCX + LFDUX f29, XX, INCX + LFDUX f30, XX, INCX + LFDUX f31, XX, INCX + + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f9 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f10 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f11 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f12 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f13 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f14 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f15 + beq cr0, LL(9999) + + bdnz LL(1110) + .align 4 + +LL(1120): + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f9 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f10 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f11 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f12 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f13 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f14 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f15 + beq cr0, LL(9999) + .align 4 + +LL(1150): + andi. r0, NN, 7 + mtspr CTR, r0 + beq LL(9999) + .align 4 + +LL(1160): + LFDUX f8, XX, INCX + fabs f8, f8 + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + bdnz LL(1160) + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/iamin_hummer.S b/kernel/power/iamin_hummer.S new file mode 100644 index 0000000000..6dad3bec50 --- /dev/null +++ b/kernel/power/iamin_hummer.S @@ -0,0 +1,1016 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define INCX2 r6 +#define X2 r7 + +#define XX r8 +#define RET r9 +#define NN r10 + +#define C1 f1 +#define C2 f0 +#define C3 f2 +#define C4 f3 + +#define A1 f4 +#define A2 f5 +#define A3 f6 +#define A4 f7 +#define A5 f8 +#define A6 f9 +#define A7 f10 +#define A8 f11 + +#define F1 f12 +#define F2 f13 +#define F3 f14 +#define F4 f15 +#define F5 f16 +#define F6 f17 +#define F7 f18 +#define F8 f19 + +#define T1 f20 +#define T2 f21 +#define T3 f22 +#define T4 f23 +#define T5 f24 +#define T6 f25 +#define T7 f26 +#define T8 f27 + + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + + stfpdux f16, SP, r10 + stfpdux f17, SP, r10 + stfpdux f18, SP, r10 + stfpdux f19, SP, r10 + + stfpdux f20, SP, r10 + stfpdux f21, SP, r10 + stfpdux f22, SP, r10 + stfpdux f23, SP, r10 + + stfpdux f24, SP, r10 + stfpdux f25, SP, r10 + stfpdux f26, SP, r10 + stfpdux f27, SP, r10 + + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, BASE_SHIFT + add INCX2, INCX, INCX + + li RET, 0 + cmpwi cr0, N, 0 + ble LL(999) + mr NN, N + cmpwi cr0, INCX, 0 + ble LL(999) + + LFD C1, 0 * SIZE(X) + + addi N, N, -1 + cmpwi cr0, N, 0 + li RET, 1 + fabs C1, C1 + ble LL(999) + + fsmfp C1, C1 + mr XX, X + fpmr C2, C1 + add X, X, INCX + fpmr C3, C1 + fpmr C4, C1 + + cmpwi cr0, INCX, SIZE + bne LL(100) + + andi. r0, X, 2 * SIZE - 1 + beq LL(05) + + LFD C2, 0 * SIZE(X) + add X, X, INCX + + addi N, N, -1 + cmpwi cr0, N, 0 + fabs C2, C2 + ble LL(20) + .align 4 + +LL(05): + sub X, X, INCX2 + + srawi. 
r0, N, 4 + mtspr CTR, r0 + beq- LL(15) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + LFPDUX A5, X, INCX2 + fpabs T1, A1 + LFPDUX A6, X, INCX2 + fpabs T2, A2 + LFPDUX A7, X, INCX2 + fpabs T3, A3 + LFPDUX A8, X, INCX2 + fpabs T4, A4 + bdz LL(13) + .align 4 + +LL(12): + fpsub F1, T1, C1 + LFPDUX A1, X, INCX2 + fpsub F2, T2, C2 + LFPDUX A2, X, INCX2 + fpsub F3, T3, C3 + LFPDUX A3, X, INCX2 + fpsub F4, T4, C4 + LFPDUX A4, X, INCX2 + + fpabs T5, A5 + fpabs T6, A6 + fpabs T7, A7 + fpabs T8, A8 + + fpsel C1, F1, C1, T1 + LFPDUX A5, X, INCX2 + fpsel C2, F2, C2, T2 + LFPDUX A6, X, INCX2 + fpsel C3, F3, C3, T3 + LFPDUX A7, X, INCX2 + fpsel C4, F4, C4, T4 + LFPDUX A8, X, INCX2 + + fpsub F5, T5, C1 + fpsub F6, T6, C2 + fpsub F7, T7, C3 + fpsub F8, T8, C4 + + fpabs T1, A1 + fpabs T2, A2 + fpabs T3, A3 + fpabs T4, A4 + + fpsel C1, F5, C1, T5 + fpsel C2, F6, C2, T6 + fpsel C3, F7, C3, T7 + fpsel C4, F8, C4, T8 + bdnz LL(12) + .align 4 + +LL(13): + fpabs T5, A5 + fpabs T6, A6 + fpabs T7, A7 + fpabs T8, A8 + + fpsub F1, T1, C1 + fpsub F2, T2, C2 + fpsub F3, T3, C3 + fpsub F4, T4, C4 + + fpsel C1, F1, C1, T1 + fpsel C2, F2, C2, T2 + fpsel C3, F3, C3, T3 + fpsel C4, F4, C4, T4 + + fpsub F5, T5, C1 + fpsub F6, T6, C2 + fpsub F7, T7, C3 + fpsub F8, T8, C4 + + fpsel C1, F5, C1, T5 + fpsel C2, F6, C2, T6 + fpsel C3, F7, C3, T7 + fpsel C4, F8, C4, T8 + .align 4 + +LL(15): + andi. r0, N, 15 + beq LL(20) + + andi. r0, N, 8 + beq LL(16) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + fpabs A1, A1 + fpabs A2, A2 + fpabs A3, A3 + fpabs A4, A4 + + fpsub F1, A1, C1 + fpsub F2, A2, C2 + fpsub F3, A3, C3 + fpsub F4, A4, C4 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + .align 4 + +LL(16): + andi. r0, N, 4 + beq LL(17) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + + fpabs A1, A1 + fpabs A2, A2 + + fpsub F1, A1, C1 + fpsub F2, A2, C2 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + .align 4 + +LL(17): + andi. r0, N, 2 + beq LL(18) + + LFPDUX A1, X, INCX2 + fpabs A1, A1 + fpsub F1, A1, C1 + fpsel C1, F1, C1, A1 + .align 4 + +LL(18): + andi. r0, N, 1 + beq LL(20) + + LFDUX A1, X, INCX2 + fabs A1, A1 + fsub F1, A1, C1 + fsel C1, F1, C1, A1 + .align 4 + +LL(20): + fpsub F1, C2, C1 + fpsub F2, C4, C3 + + fpsel C1, F1, C1, C2 + fpsel C3, F2, C3, C4 + + fpsub F1, C3, C1 + fpsel C1, F1, C1, C3 + + fsmtp C2, C1 + + fsub F1, C2, C1 + fsel C1, F1, C1, C2 + li RET, 0 + + fsmfp C1, C1 + andi. r0, XX, 2 * SIZE - 1 + beq LL(21) + + LFD A1, 0 * SIZE(XX) + add XX, XX, INCX + + addi NN, NN, -1 + addi RET, RET, 1 + + fabs A1, A1 + fcmpu cr0, C1, A1 + beq cr0, LL(999) + .align 4 + +LL(21): + sub XX, XX, INCX2 + + srawi. 
r0, NN, 4 + mtspr CTR, r0 + beq- LL(25) + + LFPDUX A1, XX, INCX2 + LFPDUX A2, XX, INCX2 + LFPDUX A3, XX, INCX2 + LFPDUX A4, XX, INCX2 + + LFPDUX A5, XX, INCX2 + LFPDUX A6, XX, INCX2 + LFPDUX A7, XX, INCX2 + LFPDUX A8, XX, INCX2 + + fpabs T1, A1 + fpabs T2, A2 + fpabs T3, A3 + fpabs T4, A4 + + fpabs T5, A5 + fpabs T6, A6 + fpabs T7, A7 + fpabs T8, A8 + + bdz LL(23) + .align 4 + +LL(22): + addi RET, RET, 1 + fcmpu cr0, C1, T1 + LFPDUX A1, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T1 + LFPDUX A2, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T2 + LFPDUX A3, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T2 + LFPDUX A4, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T3 + LFPDUX A5, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T3 + LFPDUX A6, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T4 + LFPDUX A7, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T4 + LFPDUX A8, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T5 + fpabs T1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T5 + fpabs T2, A2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T6 + fpabs T3, A3 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T6 + fpabs T4, A4 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T7 + fpabs T5, A5 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T7 + fpabs T6, A6 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T8 + fpabs T7, A7 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T8 + fpabs T8, A8 + beq cr0, LL(999) + bdnz LL(22) + .align 4 + +LL(23): + addi RET, RET, 1 + fcmpu cr0, C1, T1 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T2 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T3 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T3 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T4 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T4 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T5 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T5 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T6 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T6 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T7 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T7 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T8 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T8 + beq cr0, LL(999) + .align 4 + +LL(25): + andi. r0, NN, 8 + beq LL(26) + + LFPDUX A1, XX, INCX2 + LFPDUX A2, XX, INCX2 + LFPDUX A3, XX, INCX2 + LFPDUX A4, XX, INCX2 + + fpabs T1, A1 + fpabs T2, A2 + fpabs T3, A3 + fpabs T4, A4 + + addi RET, RET, 1 + fcmpu cr0, C1, T1 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T2 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T3 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T3 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T4 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T4 + beq cr0, LL(999) + .align 4 + +LL(26): + andi. 
r0, NN, 4 + beq LL(27) + + LFPDUX A1, XX, INCX2 + LFPDUX A2, XX, INCX2 + + fpabs T1, A1 + fpabs T2, A2 + + addi RET, RET, 1 + fcmpu cr0, C1, T1 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T2 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T2 + beq cr0, LL(999) + .align 4 + +LL(27): + andi. r0, NN, 2 + beq LL(28) + + LFPDUX A1, XX, INCX2 + + fpabs T1, A1 + + addi RET, RET, 1 + fcmpu cr0, C1, T1 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, T1 + beq cr0, LL(999) + .align 4 + +LL(28): + andi. r0, NN, 1 + beq LL(999) + addi RET, RET, 1 + b LL(999) + .align 4 + +LL(100): + sub X, X, INCX + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(105) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + LFSDUX A1, X, INCX + LFSDUX A2, X, INCX + LFSDUX A3, X, INCX + LFSDUX A4, X, INCX + + LFDUX A5, X, INCX + LFDUX A6, X, INCX + LFDUX A7, X, INCX + LFDUX A8, X, INCX + LFSDUX A5, X, INCX + fpabs T1, A1 + LFSDUX A6, X, INCX + fpabs T2, A2 + LFSDUX A7, X, INCX + fpabs T3, A3 + LFSDUX A8, X, INCX + fpabs T4, A4 + bdz LL(103) + .align 4 + +LL(102): + fpsub F1, T1, C1 + LFDUX A1, X, INCX + fpsub F2, T2, C2 + LFDUX A2, X, INCX + fpsub F3, T3, C3 + LFDUX A3, X, INCX + fpsub F4, T4, C4 + LFDUX A4, X, INCX + + fpabs T5, A5 + LFSDUX A1, X, INCX + fpabs T6, A6 + LFSDUX A2, X, INCX + fpabs T7, A7 + LFSDUX A3, X, INCX + fpabs T8, A8 + LFSDUX A4, X, INCX + + fpsel C1, F1, C1, T1 + LFDUX A5, X, INCX + fpsel C2, F2, C2, T2 + LFDUX A6, X, INCX + fpsel C3, F3, C3, T3 + LFDUX A7, X, INCX + fpsel C4, F4, C4, T4 + LFDUX A8, X, INCX + + fpsub F5, T5, C1 + LFSDUX A5, X, INCX + fpsub F6, T6, C2 + LFSDUX A6, X, INCX + fpsub F7, T7, C3 + LFSDUX A7, X, INCX + fpsub F8, T8, C4 + LFSDUX A8, X, INCX + + fpabs T1, A1 + fpabs T2, A2 + fpabs T3, A3 + fpabs T4, A4 + + fpsel C1, F5, C1, T5 + fpsel C2, F6, C2, T6 + fpsel C3, F7, C3, T7 + fpsel C4, F8, C4, T8 + bdnz LL(102) + .align 4 + +LL(103): + fpabs T5, A5 + fpabs T6, A6 + fpabs T7, A7 + fpabs T8, A8 + + fpsub F1, T1, C1 + fpsub F2, T2, C2 + fpsub F3, T3, C3 + fpsub F4, T4, C4 + + fpsel C1, F1, C1, T1 + fpsel C2, F2, C2, T2 + fpsel C3, F3, C3, T3 + fpsel C4, F4, C4, T4 + + fpsub F5, T5, C1 + fpsub F6, T6, C2 + fpsub F7, T7, C3 + fpsub F8, T8, C4 + + fpsel C1, F5, C1, T5 + fpsel C2, F6, C2, T6 + fpsel C3, F7, C3, T7 + fpsel C4, F8, C4, T8 + .align 4 + +LL(105): + andi. r0, N, 15 + beq LL(120) + + andi. r0, N, 8 + beq LL(106) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + LFSDUX A1, X, INCX + LFSDUX A2, X, INCX + LFSDUX A3, X, INCX + LFSDUX A4, X, INCX + + fpabs A1, A1 + fpabs A2, A2 + fpabs A3, A3 + fpabs A4, A4 + + fpsub F1, A1, C1 + fpsub F2, A2, C2 + fpsub F3, A3, C3 + fpsub F4, A4, C4 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + .align 4 + +LL(106): + andi. r0, N, 4 + beq LL(107) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + + fabs A1, A1 + fabs A2, A2 + fabs A3, A3 + fabs A4, A4 + + fsub F1, A1, C1 + fsub F2, A2, C2 + fsub F3, A3, C3 + fsub F4, A4, C4 + + fsel C1, F1, C1, A1 + fsel C2, F2, C2, A2 + fsel C3, F3, C3, A3 + fsel C4, F4, C4, A4 + .align 4 + +LL(107): + andi. r0, N, 2 + beq LL(108) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + fabs A1, A1 + fabs A2, A2 + fsub F1, A1, C1 + fsub F2, A2, C2 + fsel C1, F1, C1, A1 + fsel C2, F2, C2, A2 + .align 4 + +LL(108): + andi. 
r0, N, 1 + beq LL(120) + + LFDUX A1, X, INCX + fabs A1, A1 + fsub F1, A1, C1 + fsel C1, F1, C1, A1 + .align 4 + +LL(120): + fpsub F1, C2, C1 + fpsub F2, C4, C3 + + fpsel C1, F1, C1, C2 + fpsel C3, F2, C3, C4 + + fpsub F1, C3, C1 + fpsel C1, F1, C1, C3 + + fsmtp C2, C1 + + fsub F1, C2, C1 + fsel C1, F1, C1, C2 + + li RET, 0 + + sub XX, XX, INCX + + srawi. r0, NN, 3 + mtspr CTR, r0 + beq- LL(126) + + LFDUX A1, XX, INCX + LFDUX A2, XX, INCX + LFDUX A3, XX, INCX + LFDUX A4, XX, INCX + + fabs T1, A1 + fabs T2, A2 + + LFDUX A5, XX, INCX + LFDUX A6, XX, INCX + LFDUX A7, XX, INCX + LFDUX A8, XX, INCX + bdz LL(123) + .align 4 + +LL(122): + LFDUX A1, XX, INCX + fabs T3, A3 + addi RET, RET, 1 + fcmpu cr0, C1, T1 + beq cr0, LL(999) + + LFDUX A2, XX, INCX + fabs T4, A4 + addi RET, RET, 1 + fcmpu cr0, C1, T2 + beq cr0, LL(999) + + LFDUX A3, XX, INCX + fabs T1, A5 + addi RET, RET, 1 + fcmpu cr0, C1, T3 + beq cr0, LL(999) + + LFDUX A4, XX, INCX + fabs T2, A6 + addi RET, RET, 1 + fcmpu cr0, C1, T4 + beq cr0, LL(999) + + LFDUX A5, XX, INCX + fabs T3, A7 + addi RET, RET, 1 + fcmpu cr0, C1, T1 + beq cr0, LL(999) + + LFDUX A6, XX, INCX + fabs T4, A8 + addi RET, RET, 1 + fcmpu cr0, C1, T2 + beq cr0, LL(999) + + LFDUX A7, XX, INCX + fabs T1, A1 + addi RET, RET, 1 + fcmpu cr0, C1, T3 + beq cr0, LL(999) + + LFDUX A8, XX, INCX + fabs T2, A2 + addi RET, RET, 1 + fcmpu cr0, C1, T4 + beq cr0, LL(999) + bdnz LL(122) + .align 4 + +LL(123): + fabs T3, A3 + fabs T4, A4 + + addi RET, RET, 1 + fcmpu cr0, C1, T1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T2 + beq cr0, LL(999) + + fabs T1, A5 + addi RET, RET, 1 + fcmpu cr0, C1, T3 + beq cr0, LL(999) + + fabs T2, A6 + addi RET, RET, 1 + fcmpu cr0, C1, T4 + beq cr0, LL(999) + + fabs T3, A7 + addi RET, RET, 1 + fcmpu cr0, C1, T1 + beq cr0, LL(999) + + fabs T4, A8 + addi RET, RET, 1 + fcmpu cr0, C1, T2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T3 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T4 + beq cr0, LL(999) + .align 4 + +LL(126): + andi. r0, NN, 4 + beq LL(127) + + LFDUX A1, XX, INCX + LFDUX A2, XX, INCX + LFDUX A3, XX, INCX + LFDUX A4, XX, INCX + + fabs T1, A1 + fabs T2, A2 + fabs T3, A3 + fabs T4, A4 + + addi RET, RET, 1 + fcmpu cr0, C1, T1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T3 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T4 + beq cr0, LL(999) + .align 4 + +LL(127): + andi. r0, NN, 2 + beq LL(128) + + LFDUX A1, XX, INCX + LFDUX A2, XX, INCX + + fabs T1, A1 + fabs T2, A2 + + addi RET, RET, 1 + fcmpu cr0, C1, T1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, T2 + beq cr0, LL(999) + .align 4 + +LL(128): + addi RET, RET, 1 + .align 4 + +LL(999): + li r10, 16 + addi SP, SP, -16 + mr r3, RET + + lfpdux f27, SP, r10 + lfpdux f26, SP, r10 + lfpdux f25, SP, r10 + lfpdux f24, SP, r10 + + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + lfpdux f20, SP, r10 + + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + blr + + EPILOGUE diff --git a/kernel/power/iamin_ppc440.S b/kernel/power/iamin_ppc440.S new file mode 100644 index 0000000000..888e74a24b --- /dev/null +++ b/kernel/power/iamin_ppc440.S @@ -0,0 +1,482 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define RET r3 +#define X r4 +#define INCX r5 + +#define N r6 +#define NN r7 +#define XX r8 +#define PRE r9 + +#define FZERO f1 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(r3) + LDINT INCX, 0(INCX) +#else + mr N, r3 +#endif + li RET, 0 + + slwi INCX, INCX, BASE_SHIFT + sub X, X, INCX + li PRE, 3 * 16 * SIZE + + mr NN, N + mr XX, X + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFDUX f1, X, INCX + + fabs f0, f1 + fabs f2, f1 + fabs f3, f1 + fabs f4, f1 + fabs f5, f1 + subi N, N, 1 + fabs f6, f1 + srawi. 
r0, N, 4 + fabs f7, f1 + mtspr CTR, r0 + fabs f1, f1 + beq- LL(150) + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + + fabs f8, f24 + LFDUX f24, X, INCX + fabs f9, f25 + LFDUX f25, X, INCX + fabs f10, f26 + LFDUX f26, X, INCX + fabs f11, f27 + LFDUX f27, X, INCX + + fabs f12, f28 + LFDUX f28, X, INCX + fabs f13, f29 + LFDUX f29, X, INCX + fabs f14, f30 + LFDUX f30, X, INCX + fabs f15, f31 + LFDUX f31, X, INCX + bdz LL(120) + .align 4 + +LL(110): + fsub f16, f0, f8 +#ifdef PPCG4 + dcbt X, PRE +#endif + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fabs f8, f24 + LFDUX f24, X, INCX + fsel f1, f17, f9, f1 + fabs f9, f25 + LFDUX f25, X, INCX + fsel f2, f18, f10, f2 + fabs f10, f26 + LFDUX f26, X, INCX + fsel f3, f19, f11, f3 + fabs f11, f27 + LFDUX f27, X, INCX + + fsel f4, f20, f12, f4 +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PRE +#endif + fabs f12, f28 + LFDUX f28, X, INCX + fsel f5, f21, f13, f5 + fabs f13, f29 + LFDUX f29, X, INCX + fsel f6, f22, f14, f6 + fabs f14, f30 + LFDUX f30, X, INCX + fsel f7, f23, f15, f7 + fabs f15, f31 + LFDUX f31, X, INCX + + fsub f16, f0, f8 +#ifdef PPCG4 + dcbt X, PRE +#endif + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fabs f8, f24 + LFDUX f24, X, INCX + fsel f1, f17, f9, f1 + fabs f9, f25 + LFDUX f25, X, INCX + fsel f2, f18, f10, f2 + fabs f10, f26 + LFDUX f26, X, INCX + fsel f3, f19, f11, f3 + fabs f11, f27 + LFDUX f27, X, INCX + + fsel f4, f20, f12, f4 +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PRE +#endif + fabs f12, f28 + LFDUX f28, X, INCX + fsel f5, f21, f13, f5 + fabs f13, f29 + LFDUX f29, X, INCX + fsel f6, f22, f14, f6 + fabs f14, f30 + LFDUX f30, X, INCX + fsel f7, f23, f15, f7 + fabs f15, f31 + LFDUX f31, X, INCX + bdnz LL(110) + .align 4 + +LL(120): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fabs f8, f24 + fsel f1, f17, f9, f1 + fabs f9, f25 + fsel f2, f18, f10, f2 + fabs f10, f26 + fsel f3, f19, f11, f3 + fabs f11, f27 + + fsel f4, f20, f12, f4 + fabs f12, f28 + fsel f5, f21, f13, f5 + fabs f13, f29 + fsel f6, f22, f14, f6 + fabs f14, f30 + fsel f7, f23, f15, f7 + fabs f15, f31 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f8, f0 + fsel f1, f17, f9, f1 + fsel f2, f18, f10, f2 + fsel f3, f19, f11, f3 + fsel f4, f20, f12, f4 + fsel f5, f21, f13, f5 + fsel f6, f22, f14, f6 + fsel f7, f23, f15, f7 + .align 4 + +LL(150): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + fabs f8, f8 + fsub f16, f1, f8 + fsel f1, f16, f8, f1 + bdnz LL(160) + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + fsub f10, f4, f5 + fsub f11, f6, f7 + + fsel f0, f8, f1, f0 + fsel f2, f9, f3, f2 + fsel f4, f10, f5, f4 + fsel f6, f11, f7, f6 + + fsub f8, f0, f2 + fsub f9, f4, f6 + fsel f0, f8, f2, f0 + fsel f4, f9, f6, f4 + + fsub f8, f0, f4 + fsel f1, f8, f4, f0 + .align 4 + +LL(1000): + srawi. 
r0, NN, 3 + mtspr CTR, r0 + beq- LL(1150) + + LFDUX f24, XX, INCX + LFDUX f25, XX, INCX + LFDUX f26, XX, INCX + LFDUX f27, XX, INCX + LFDUX f28, XX, INCX + LFDUX f29, XX, INCX + LFDUX f30, XX, INCX + LFDUX f31, XX, INCX + bdz LL(1120) + .align 4 + +LL(1110): + fabs f8, f24 + LFDUX f24, XX, INCX + fabs f9, f25 + LFDUX f25, XX, INCX + fabs f10, f26 + LFDUX f26, XX, INCX + fabs f11, f27 + LFDUX f27, XX, INCX + +#ifdef PPCG4 + dcbt XX, PRE +#endif + + fabs f12, f28 + LFDUX f28, XX, INCX + fabs f13, f29 + LFDUX f29, XX, INCX + + fabs f14, f30 + LFDUX f30, XX, INCX + fabs f15, f31 + LFDUX f31, XX, INCX + +#if defined(PPCG4) && defined(DOUBLE) + dcbt XX, PRE +#endif + + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f9 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f10 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f11 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f12 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f13 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f14 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f15 + beq cr0, LL(9999) + + bdnz LL(1110) + .align 4 + +LL(1120): + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f9 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f10 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f11 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f12 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f13 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f14 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f15 + beq cr0, LL(9999) + .align 4 + +LL(1150): + andi. r0, NN, 7 + mtspr CTR, r0 + beq LL(9999) + .align 4 + +LL(1160): + LFDUX f8, XX, INCX + fabs f8, f8 + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + bdnz LL(1160) + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/imax.S b/kernel/power/imax.S new file mode 100644 index 0000000000..6b6cd45608 --- /dev/null +++ b/kernel/power/imax.S @@ -0,0 +1,684 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define RET r3 +#define X r4 +#define INCX r5 + +#define N r6 +#define NN r7 +#define XX r8 +#define PREA r9 + +#define FZERO f1 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(r3) + LDINT INCX, 0(INCX) +#else + mr N, r3 +#endif + + li RET, 0 + mr NN, N + mr XX, X + + slwi INCX, INCX, BASE_SHIFT + + li PREA, L1_PREFETCHSIZE + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFD f1, 0 * SIZE(X) + add X, X, INCX + + fmr f0, f1 + fmr f2, f1 + fmr f3, f1 + fmr f4, f1 + fmr f5, f1 + fmr f6, f1 + fmr f7, f1 + + subi N, N, 1 + + cmpwi cr0, INCX, SIZE + bne- cr0, LL(100) + + srawi. 
r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(50) + + LFD f16, 0 * SIZE(X) + LFD f17, 1 * SIZE(X) + LFD f18, 2 * SIZE(X) + LFD f19, 3 * SIZE(X) + LFD f20, 4 * SIZE(X) + LFD f21, 5 * SIZE(X) + LFD f22, 6 * SIZE(X) + LFD f23, 7 * SIZE(X) + + LFD f24, 8 * SIZE(X) + LFD f25, 9 * SIZE(X) + LFD f26, 10 * SIZE(X) + LFD f27, 11 * SIZE(X) + LFD f28, 12 * SIZE(X) + LFD f29, 13 * SIZE(X) + LFD f30, 14 * SIZE(X) + LFD f31, 15 * SIZE(X) + + fsub f8, f0, f16 + fsub f9, f1, f17 + fsub f10, f2, f18 + fsub f11, f3, f19 + fsub f12, f4, f20 + fsub f13, f5, f21 + fsub f14, f6, f22 + fsub f15, f7, f23 + bdz LL(20) + .align 4 + +LL(10): + fsel f0, f8, f0, f16 + fsub f8, f0, f24 + fsel f1, f9, f1, f17 + fsub f9, f1, f25 + fsel f2, f10, f2, f18 + fsub f10, f2, f26 + fsel f3, f11, f3, f19 + fsub f11, f3, f27 + + LFD f16, 16 * SIZE(X) + LFD f17, 17 * SIZE(X) + LFD f18, 18 * SIZE(X) + LFD f19, 19 * SIZE(X) + + fsel f4, f12, f4, f20 + fsub f12, f4, f28 + fsel f5, f13, f5, f21 + fsub f13, f5, f29 + fsel f6, f14, f6, f22 + fsub f14, f6, f30 + fsel f7, f15, f7, f23 + fsub f15, f7, f31 + + LFD f20, 20 * SIZE(X) + LFD f21, 21 * SIZE(X) + LFD f22, 22 * SIZE(X) + LFD f23, 23 * SIZE(X) + + fsel f0, f8, f0, f24 + fsub f8, f0, f16 + fsel f1, f9, f1, f25 + fsub f9, f1, f17 + fsel f2, f10, f2, f26 + fsub f10, f2, f18 + fsel f3, f11, f3, f27 + fsub f11, f3, f19 + + LFD f24, 24 * SIZE(X) + LFD f25, 25 * SIZE(X) + LFD f26, 26 * SIZE(X) + LFD f27, 27 * SIZE(X) + + fsel f4, f12, f4, f28 + fsub f12, f4, f20 + fsel f5, f13, f5, f29 + fsub f13, f5, f21 + fsel f6, f14, f6, f30 + fsub f14, f6, f22 + fsel f7, f15, f7, f31 + fsub f15, f7, f23 + + LFD f28, 28 * SIZE(X) + LFD f29, 29 * SIZE(X) + LFD f30, 30 * SIZE(X) + LFD f31, 31 * SIZE(X) + +#ifndef POWER6 + L1_PREFETCH X, PREA +#endif + addi X, X, 16 * SIZE +#ifdef POWER6 + L1_PREFETCH X, PREA +#endif + + bdnz LL(10) + .align 4 + +LL(20): + fsel f0, f8, f0, f16 + fsub f8, f0, f24 + fsel f1, f9, f1, f17 + fsub f9, f1, f25 + fsel f2, f10, f2, f18 + fsub f10, f2, f26 + fsel f3, f11, f3, f19 + fsub f11, f3, f27 + + fsel f4, f12, f4, f20 + fsub f12, f4, f28 + fsel f5, f13, f5, f21 + fsub f13, f5, f29 + fsel f6, f14, f6, f22 + fsub f14, f6, f30 + fsel f7, f15, f7, f23 + fsub f15, f7, f31 + + fsel f0, f8, f0, f24 + fsel f1, f9, f1, f25 + fsel f2, f10, f2, f26 + fsel f3, f11, f3, f27 + fsel f4, f12, f4, f28 + fsel f5, f13, f5, f29 + fsel f6, f14, f6, f30 + fsel f7, f15, f7, f31 + + addi X, X, 16 * SIZE + .align 4 + +LL(50): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + addi X, X, 1 * SIZE + fsub f16, f1, f8 + fsel f1, f16, f1, f8 + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): + sub X, X, INCX + + srawi. 
r0, N, 4 + mtspr CTR, r0 + beq- LL(150) + + LFDUX f16, X, INCX + LFDUX f17, X, INCX + LFDUX f18, X, INCX + LFDUX f19, X, INCX + LFDUX f20, X, INCX + LFDUX f21, X, INCX + LFDUX f22, X, INCX + LFDUX f23, X, INCX + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + + fsub f8, f0, f16 + fsub f9, f1, f17 + fsub f10, f2, f18 + fsub f11, f3, f19 + fsub f12, f4, f20 + fsub f13, f5, f21 + fsub f14, f6, f22 + fsub f15, f7, f23 + bdz LL(120) + .align 4 + +LL(110): + fsel f0, f8, f0, f16 + fsub f8, f0, f24 + fsel f1, f9, f1, f17 + fsub f9, f1, f25 + fsel f2, f10, f2, f18 + fsub f10, f2, f26 + fsel f3, f11, f3, f19 + fsub f11, f3, f27 + + LFDUX f16, X, INCX + LFDUX f17, X, INCX + LFDUX f18, X, INCX + LFDUX f19, X, INCX + + fsel f4, f12, f4, f20 + fsub f12, f4, f28 + fsel f5, f13, f5, f21 + fsub f13, f5, f29 + fsel f6, f14, f6, f22 + fsub f14, f6, f30 + fsel f7, f15, f7, f23 + fsub f15, f7, f31 + + LFDUX f20, X, INCX + LFDUX f21, X, INCX + LFDUX f22, X, INCX + LFDUX f23, X, INCX + + fsel f0, f8, f0, f24 + fsub f8, f0, f16 + fsel f1, f9, f1, f25 + fsub f9, f1, f17 + fsel f2, f10, f2, f26 + fsub f10, f2, f18 + fsel f3, f11, f3, f27 + fsub f11, f3, f19 + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + + fsel f4, f12, f4, f28 + fsub f12, f4, f20 + fsel f5, f13, f5, f29 + fsub f13, f5, f21 + fsel f6, f14, f6, f30 + fsub f14, f6, f22 + fsel f7, f15, f7, f31 + fsub f15, f7, f23 + + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + bdnz LL(110) + .align 4 + +LL(120): + fsel f0, f8, f0, f16 + fsub f8, f0, f24 + fsel f1, f9, f1, f17 + fsub f9, f1, f25 + fsel f2, f10, f2, f18 + fsub f10, f2, f26 + fsel f3, f11, f3, f19 + fsub f11, f3, f27 + + fsel f4, f12, f4, f20 + fsub f12, f4, f28 + fsel f5, f13, f5, f21 + fsub f13, f5, f29 + fsel f6, f14, f6, f22 + fsub f14, f6, f30 + fsel f7, f15, f7, f23 + fsub f15, f7, f31 + + fsel f0, f8, f0, f24 + fsel f1, f9, f1, f25 + fsel f2, f10, f2, f26 + fsel f3, f11, f3, f27 + fsel f4, f12, f4, f28 + fsel f5, f13, f5, f29 + fsel f6, f14, f6, f30 + fsel f7, f15, f7, f31 + .align 4 + +LL(150): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + fsub f16, f1, f8 + fsel f1, f16, f1, f8 + bdnz LL(160) + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + fsub f10, f4, f5 + fsub f11, f6, f7 + + fsel f0, f8, f0, f1 + fsel f2, f9, f2, f3 + fsel f4, f10, f4, f5 + fsel f6, f11, f6, f7 + + fsub f8, f0, f2 + fsub f9, f4, f6 + fsel f0, f8, f0, f2 + fsel f4, f9, f4, f6 + + fsub f8, f0, f4 + fsel f1, f8, f0, f4 + .align 4 + +LL(1000): + cmpwi cr0, INCX, SIZE + bne- cr0, LL(1100) + + srawi. 
r0, NN, 3 + mtspr CTR, r0 + beq- cr0, LL(1050) + + LFD f8, 0 * SIZE(XX) + LFD f9, 1 * SIZE(XX) + LFD f10, 2 * SIZE(XX) + LFD f11, 3 * SIZE(XX) + LFD f12, 4 * SIZE(XX) + LFD f13, 5 * SIZE(XX) + LFD f14, 6 * SIZE(XX) + LFD f15, 7 * SIZE(XX) + bdz LL(1020) + .align 4 + +LL(1010): + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f9 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f10 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f11 + beq cr0, LL(9999) + + LFD f8, 8 * SIZE(XX) + LFD f9, 9 * SIZE(XX) + LFD f10, 10 * SIZE(XX) + LFD f11, 11 * SIZE(XX) + + addi RET, RET, 1 + fcmpu cr0, f1, f12 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f13 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f14 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f15 + beq cr0, LL(9999) + + LFD f12, 12 * SIZE(XX) + LFD f13, 13 * SIZE(XX) + LFD f14, 14 * SIZE(XX) + LFD f15, 15 * SIZE(XX) + + addi XX, XX, 8 * SIZE + bdnz LL(1010) + .align 4 + +LL(1020): + addi XX, XX, 8 * SIZE + + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f9 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f10 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f11 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f12 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f13 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f14 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f15 + beq cr0, LL(9999) + .align 4 + +LL(1050): + andi. r0, NN, 7 + mtspr CTR, r0 + beq LL(9999) + .align 4 + +LL(1060): + LFD f8, 0 * SIZE(XX) + addi XX, XX, 1 * SIZE + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + bdnz LL(1060) + b LL(9999) + .align 4 + +LL(1100): + sub XX, XX, INCX + + srawi. r0, NN, 3 + mtspr CTR, r0 + beq- LL(1150) + + LFDUX f8, XX, INCX + LFDUX f9, XX, INCX + LFDUX f10, XX, INCX + LFDUX f11, XX, INCX + LFDUX f12, XX, INCX + LFDUX f13, XX, INCX + LFDUX f14, XX, INCX + LFDUX f15, XX, INCX + bdz LL(1120) + .align 4 + +LL(1110): + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f9 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f10 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f11 + beq cr0, LL(9999) + + LFDUX f8, XX, INCX + LFDUX f9, XX, INCX + LFDUX f10, XX, INCX + LFDUX f11, XX, INCX + + addi RET, RET, 1 + fcmpu cr0, f1, f12 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f13 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f14 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f15 + beq cr0, LL(9999) + + LFDUX f12, XX, INCX + LFDUX f13, XX, INCX + LFDUX f14, XX, INCX + LFDUX f15, XX, INCX + + bdnz LL(1110) + .align 4 + +LL(1120): + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f9 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f10 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f11 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f12 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f13 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f14 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f15 + beq cr0, LL(9999) + .align 4 + +LL(1150): + andi. 
r0, NN, 7 + mtspr CTR, r0 + beq LL(9999) + .align 4 + +LL(1160): + LFDUX f8, XX, INCX + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + bdnz LL(1160) + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/imax_hummer.S b/kernel/power/imax_hummer.S new file mode 100644 index 0000000000..110dc18b8d --- /dev/null +++ b/kernel/power/imax_hummer.S @@ -0,0 +1,867 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define INCX2 r6 +#define X2 r7 + +#define XX r8 +#define RET r9 +#define NN r10 + +#define C1 f1 +#define C2 f0 +#define C3 f2 +#define C4 f3 + +#define A1 f4 +#define A2 f5 +#define A3 f6 +#define A4 f7 +#define A5 f8 +#define A6 f9 +#define A7 f10 +#define A8 f11 + +#define F1 f12 +#define F2 f13 +#define F3 f14 +#define F4 f15 +#define F5 f16 +#define F6 f17 +#define F7 f18 +#define F8 f19 + + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + + stfpdux f16, SP, r10 + stfpdux f17, SP, r10 + stfpdux f18, SP, r10 + stfpdux f19, SP, r10 + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, BASE_SHIFT + add INCX2, INCX, INCX + + li RET, 0 + cmpwi cr0, N, 0 + ble LL(999) + mr NN, N + cmpwi cr0, INCX, 0 + ble LL(999) + + LFD C1, 0 * SIZE(X) + + addi N, N, -1 + cmpwi cr0, N, 0 + li RET, 1 + ble LL(999) + + fsmfp C1, C1 + mr XX, X + fpmr C2, C1 + add X, X, INCX + fpmr C3, C1 + fpmr C4, C1 + + cmpwi cr0, INCX, SIZE + bne LL(100) + + andi. r0, X, 2 * SIZE - 1 + beq LL(05) + + LFD C2, 0 * SIZE(X) + add X, X, INCX + + addi N, N, -1 + cmpwi cr0, N, 0 + ble LL(20) + .align 4 + +LL(05): + sub X, X, INCX2 + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(15) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + LFPDUX A5, X, INCX2 + LFPDUX A6, X, INCX2 + LFPDUX A7, X, INCX2 + LFPDUX A8, X, INCX2 + bdz LL(13) + .align 4 + +LL(12): + fpsub F1, C1, A1 + fpsub F2, C2, A2 + fpsub F3, C3, A3 + fpsub F4, C4, A4 + + fpsel C1, F1, C1, A1 + LFPDUX A1, X, INCX2 + fpsel C2, F2, C2, A2 + LFPDUX A2, X, INCX2 + fpsel C3, F3, C3, A3 + LFPDUX A3, X, INCX2 + fpsel C4, F4, C4, A4 + LFPDUX A4, X, INCX2 + + fpsub F5, C1, A5 + fpsub F6, C2, A6 + fpsub F7, C3, A7 + fpsub F8, C4, A8 + + fpsel C1, F5, C1, A5 + LFPDUX A5, X, INCX2 + fpsel C2, F6, C2, A6 + LFPDUX A6, X, INCX2 + fpsel C3, F7, C3, A7 + LFPDUX A7, X, INCX2 + fpsel C4, F8, C4, A8 + LFPDUX A8, X, INCX2 + + bdnz LL(12) + .align 4 + +LL(13): + fpsub F1, C1, A1 + fpsub F2, C2, A2 + fpsub F3, C3, A3 + fpsub F4, C4, A4 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + + fpsub F5, C1, A5 + fpsub F6, C2, A6 + fpsub F7, C3, A7 + fpsub F8, C4, A8 + + fpsel C1, F5, C1, A5 + fpsel C2, F6, C2, A6 + fpsel C3, F7, C3, A7 + fpsel C4, F8, C4, A8 + .align 4 + +LL(15): + andi. r0, N, 15 + beq LL(20) + + andi. r0, N, 8 + beq LL(16) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + fpsub F1, C1, A1 + fpsub F2, C2, A2 + fpsub F3, C3, A3 + fpsub F4, C4, A4 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + .align 4 + +LL(16): + andi. r0, N, 4 + beq LL(17) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + + fpsub F1, C1, A1 + fpsub F2, C2, A2 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + .align 4 + +LL(17): + andi. r0, N, 2 + beq LL(18) + + LFPDUX A1, X, INCX2 + fpsub F1, C1, A1 + fpsel C1, F1, C1, A1 + .align 4 + +LL(18): + andi. r0, N, 1 + beq LL(20) + + LFDUX A1, X, INCX2 + fsub F1, C1, A1 + fsel C1, F1, C1, A1 + .align 4 + +LL(20): + fpsub F1, C1, C2 + fpsub F2, C3, C4 + + fpsel C1, F1, C1, C2 + fpsel C3, F2, C3, C4 + + fpsub F1, C1, C3 + fpsel C1, F1, C1, C3 + + fsmtp C2, C1 + + fsub F1, C1, C2 + fsel C1, F1, C1, C2 + li RET, 0 + + fsmfp C1, C1 + andi. 
r0, XX, 2 * SIZE - 1 + beq LL(21) + + LFD A1, 0 * SIZE(XX) + add XX, XX, INCX + + addi NN, NN, -1 + addi RET, RET, 1 + + fcmpu cr0, C1, A1 + beq cr0, LL(999) + .align 4 + +LL(21): + sub XX, XX, INCX2 + + srawi. r0, NN, 4 + mtspr CTR, r0 + beq- LL(25) + + LFPDUX A1, XX, INCX2 + LFPDUX A2, XX, INCX2 + LFPDUX A3, XX, INCX2 + LFPDUX A4, XX, INCX2 + + LFPDUX A5, XX, INCX2 + LFPDUX A6, XX, INCX2 + LFPDUX A7, XX, INCX2 + LFPDUX A8, XX, INCX2 + bdz LL(23) + .align 4 + +LL(22): + addi RET, RET, 1 + fcmpu cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A1 + LFPDUX A1, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A2 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A2 + LFPDUX A2, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A3 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A3 + LFPDUX A3, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A4 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A4 + LFPDUX A4, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A5 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A5 + LFPDUX A5, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A6 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A6 + LFPDUX A6, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A7 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A7 + LFPDUX A7, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A8 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A8 + LFPDUX A8, XX, INCX2 + beq cr0, LL(999) + bdnz LL(22) + .align 4 + +LL(23): + addi RET, RET, 1 + fcmpu cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A2 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A3 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A3 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A4 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A4 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A5 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A5 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A6 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A6 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A7 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A7 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A8 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A8 + beq cr0, LL(999) + .align 4 + +LL(25): + andi. r0, NN, 8 + beq LL(26) + + LFPDUX A1, XX, INCX2 + LFPDUX A2, XX, INCX2 + LFPDUX A3, XX, INCX2 + LFPDUX A4, XX, INCX2 + + addi RET, RET, 1 + fcmpu cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A2 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A3 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A3 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A4 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A4 + beq cr0, LL(999) + .align 4 + +LL(26): + andi. 
r0, NN, 4 + beq LL(27) + + LFPDUX A1, XX, INCX2 + LFPDUX A2, XX, INCX2 + + addi RET, RET, 1 + fcmpu cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A2 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A2 + beq cr0, LL(999) + .align 4 + +LL(27): + andi. r0, NN, 2 + beq LL(28) + + LFPDUX A1, XX, INCX2 + + addi RET, RET, 1 + fcmpu cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A1 + beq cr0, LL(999) + .align 4 + +LL(28): + addi RET, RET, 1 + b LL(999) + .align 4 + +LL(100): + sub X, X, INCX + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(105) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + LFSDUX A1, X, INCX + LFSDUX A2, X, INCX + LFSDUX A3, X, INCX + LFSDUX A4, X, INCX + + LFDUX A5, X, INCX + LFDUX A6, X, INCX + LFDUX A7, X, INCX + LFDUX A8, X, INCX + bdz LL(103) + .align 4 + +LL(102): + fpsub F1, C1, A1 + LFSDUX A5, X, INCX + fpsub F2, C2, A2 + LFSDUX A6, X, INCX + fpsub F3, C3, A3 + LFSDUX A7, X, INCX + fpsub F4, C4, A4 + LFSDUX A8, X, INCX + + fpsel C1, F1, C1, A1 + LFDUX A1, X, INCX + fpsel C2, F2, C2, A2 + LFDUX A2, X, INCX + fpsel C3, F3, C3, A3 + LFDUX A3, X, INCX + fpsel C4, F4, C4, A4 + LFDUX A4, X, INCX + + fpsub F5, C1, A5 + LFSDUX A1, X, INCX + fpsub F6, C2, A6 + LFSDUX A2, X, INCX + fpsub F7, C3, A7 + LFSDUX A3, X, INCX + fpsub F8, C4, A8 + LFSDUX A4, X, INCX + + fpsel C1, F5, C1, A5 + LFDUX A5, X, INCX + fpsel C2, F6, C2, A6 + LFDUX A6, X, INCX + fpsel C3, F7, C3, A7 + LFDUX A7, X, INCX + fpsel C4, F8, C4, A8 + LFDUX A8, X, INCX + bdnz LL(102) + .align 4 + +LL(103): + fpsub F1, C1, A1 + LFSDUX A5, X, INCX + fpsub F2, C2, A2 + LFSDUX A6, X, INCX + fpsub F3, C3, A3 + LFSDUX A7, X, INCX + fpsub F4, C4, A4 + LFSDUX A8, X, INCX + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + + fpsub F5, C1, A5 + fpsub F6, C2, A6 + fpsub F7, C3, A7 + fpsub F8, C4, A8 + + fpsel C1, F5, C1, A5 + fpsel C2, F6, C2, A6 + fpsel C3, F7, C3, A7 + fpsel C4, F8, C4, A8 + .align 4 + +LL(105): + andi. r0, N, 15 + beq LL(120) + + andi. r0, N, 8 + beq LL(106) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + LFSDUX A1, X, INCX + LFSDUX A2, X, INCX + LFSDUX A3, X, INCX + LFSDUX A4, X, INCX + + fpsub F1, C1, A1 + fpsub F2, C2, A2 + fpsub F3, C3, A3 + fpsub F4, C4, A4 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + .align 4 + +LL(106): + andi. r0, N, 4 + beq LL(107) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + + fsub F1, C1, A1 + fsub F2, C2, A2 + fsub F3, C3, A3 + fsub F4, C4, A4 + + fsel C1, F1, C1, A1 + fsel C2, F2, C2, A2 + fsel C3, F3, C3, A3 + fsel C4, F4, C4, A4 + .align 4 + +LL(107): + andi. r0, N, 2 + beq LL(108) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + + fsub F1, C1, A1 + fsub F2, C2, A2 + fsel C1, F1, C1, A1 + fsel C2, F2, C2, A2 + .align 4 + +LL(108): + andi. r0, N, 1 + beq LL(120) + + LFDUX A1, X, INCX + fsub F1, C1, A1 + fsel C1, F1, C1, A1 + .align 4 + +LL(120): + fpsub F1, C1, C2 + fpsub F2, C3, C4 + + fpsel C1, F1, C1, C2 + fpsel C3, F2, C3, C4 + + fpsub F1, C1, C3 + fpsel C1, F1, C1, C3 + + fsmtp C2, C1 + + fsub F1, C1, C2 + fsel C1, F1, C1, C2 + + li RET, 0 + + sub XX, XX, INCX + + srawi. 
r0, NN, 3 + mtspr CTR, r0 + beq- LL(126) + + LFDUX A1, XX, INCX + LFDUX A2, XX, INCX + LFDUX A3, XX, INCX + LFDUX A4, XX, INCX + + LFDUX A5, XX, INCX + LFDUX A6, XX, INCX + LFDUX A7, XX, INCX + LFDUX A8, XX, INCX + bdz LL(123) + .align 4 + +LL(122): + addi RET, RET, 1 + fcmpu cr0, C1, A1 + LFDUX A1, XX, INCX + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A2 + LFDUX A2, XX, INCX + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A3 + LFDUX A3, XX, INCX + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A4 + LFDUX A4, XX, INCX + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A5 + LFDUX A5, XX, INCX + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A6 + LFDUX A6, XX, INCX + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A7 + LFDUX A7, XX, INCX + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A8 + LFDUX A8, XX, INCX + beq cr0, LL(999) + bdnz LL(122) + .align 4 + +LL(123): + addi RET, RET, 1 + fcmpu cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A3 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A4 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A5 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A6 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A7 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A8 + beq cr0, LL(999) + .align 4 + +LL(126): + andi. r0, NN, 4 + beq LL(127) + + LFDUX A1, XX, INCX + LFDUX A2, XX, INCX + LFDUX A3, XX, INCX + LFDUX A4, XX, INCX + + addi RET, RET, 1 + fcmpu cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A3 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A4 + beq cr0, LL(999) + .align 4 + +LL(127): + andi. r0, NN, 2 + beq LL(128) + + LFDUX A1, XX, INCX + LFDUX A2, XX, INCX + + addi RET, RET, 1 + fcmpu cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A2 + beq cr0, LL(999) + .align 4 + +LL(128): + addi RET, RET, 1 + .align 4 + +LL(999): + li r10, 16 + addi SP, SP, -16 + mr r3, RET + + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + blr + + EPILOGUE diff --git a/kernel/power/imax_ppc440.S b/kernel/power/imax_ppc440.S new file mode 100644 index 0000000000..b4a6449748 --- /dev/null +++ b/kernel/power/imax_ppc440.S @@ -0,0 +1,429 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define RET r3 +#define X r4 +#define INCX r5 + +#define N r6 +#define NN r7 +#define XX r8 +#define PRE r9 + +#define FZERO f1 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(r3) + LDINT INCX, 0(INCX) +#else + mr N, r3 +#endif + + li RET, 0 + li PRE, 3 * 16 * SIZE + + slwi INCX, INCX, BASE_SHIFT + sub X, X, INCX + + mr NN, N + mr XX, X + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFDUX f1, X, INCX + + fmr f0, f1 + fmr f2, f1 + fmr f3, f1 + fmr f4, f1 + subi N, N, 1 + fmr f5, f1 + srawi. 
r0, N, 4 + fmr f6, f1 + mtspr CTR, r0 + fmr f7, f1 + beq- LL(150) + + LFDUX f16, X, INCX + LFDUX f17, X, INCX + LFDUX f18, X, INCX + LFDUX f19, X, INCX + LFDUX f20, X, INCX + LFDUX f21, X, INCX + LFDUX f22, X, INCX + LFDUX f23, X, INCX + + LFDUX f24, X, INCX + fsub f8, f0, f16 + LFDUX f25, X, INCX + fsub f9, f1, f17 + LFDUX f26, X, INCX + fsub f10, f2, f18 + LFDUX f27, X, INCX + fsub f11, f3, f19 + LFDUX f28, X, INCX + fsub f12, f4, f20 + LFDUX f29, X, INCX + fsub f13, f5, f21 + LFDUX f30, X, INCX + fsub f14, f6, f22 + LFDUX f31, X, INCX + fsub f15, f7, f23 + bdz LL(120) + .align 4 + +LL(110): + fsel f0, f8, f0, f16 + LFDUX f16, X, INCX + fsub f8, f0, f24 +#ifdef PPCG4 + dcbt X, PRE +#endif + fsel f1, f9, f1, f17 + LFDUX f17, X, INCX + fsub f9, f1, f25 + fsel f2, f10, f2, f18 + LFDUX f18, X, INCX + fsub f10, f2, f26 + fsel f3, f11, f3, f19 + LFDUX f19, X, INCX + fsub f11, f3, f27 + + fsel f4, f12, f4, f20 + LFDUX f20, X, INCX + fsub f12, f4, f28 +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PRE +#endif + fsel f5, f13, f5, f21 + LFDUX f21, X, INCX + fsub f13, f5, f29 + fsel f6, f14, f6, f22 + LFDUX f22, X, INCX + fsub f14, f6, f30 + fsel f7, f15, f7, f23 + LFDUX f23, X, INCX + fsub f15, f7, f31 + + fsel f0, f8, f0, f24 + LFDUX f24, X, INCX + fsub f8, f0, f16 +#ifdef PPCG4 + dcbt X, PRE +#endif + fsel f1, f9, f1, f25 + LFDUX f25, X, INCX + fsub f9, f1, f17 + fsel f2, f10, f2, f26 + LFDUX f26, X, INCX + fsub f10, f2, f18 + fsel f3, f11, f3, f27 + LFDUX f27, X, INCX + fsub f11, f3, f19 + + fsel f4, f12, f4, f28 + LFDUX f28, X, INCX + fsub f12, f4, f20 +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PRE +#endif + fsel f5, f13, f5, f29 + LFDUX f29, X, INCX + fsub f13, f5, f21 + fsel f6, f14, f6, f30 + LFDUX f30, X, INCX + fsub f14, f6, f22 + fsel f7, f15, f7, f31 + LFDUX f31, X, INCX + fsub f15, f7, f23 + bdnz LL(110) + .align 4 + +LL(120): + fsel f0, f8, f0, f16 + fsub f8, f0, f24 + fsel f1, f9, f1, f17 + fsub f9, f1, f25 + fsel f2, f10, f2, f18 + fsub f10, f2, f26 + fsel f3, f11, f3, f19 + fsub f11, f3, f27 + + fsel f4, f12, f4, f20 + fsub f12, f4, f28 + fsel f5, f13, f5, f21 + fsub f13, f5, f29 + fsel f6, f14, f6, f22 + fsub f14, f6, f30 + fsel f7, f15, f7, f23 + fsub f15, f7, f31 + + fsel f0, f8, f0, f24 + fsel f1, f9, f1, f25 + fsel f2, f10, f2, f26 + fsel f3, f11, f3, f27 + fsel f4, f12, f4, f28 + fsel f5, f13, f5, f29 + fsel f6, f14, f6, f30 + fsel f7, f15, f7, f31 + .align 4 + +LL(150): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + fsub f16, f1, f8 + fsel f1, f16, f1, f8 + bdnz LL(160) + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + fsub f10, f4, f5 + fsub f11, f6, f7 + + fsel f0, f8, f0, f1 + fsel f2, f9, f2, f3 + fsel f4, f10, f4, f5 + fsel f6, f11, f6, f7 + + fsub f8, f0, f2 + fsub f9, f4, f6 + fsel f0, f8, f0, f2 + fsel f4, f9, f4, f6 + + fsub f8, f0, f4 + fsel f1, f8, f0, f4 + .align 4 + +LL(1000): + srawi. 
r0, NN, 3 + mtspr CTR, r0 + beq- LL(1150) + + LFDUX f8, XX, INCX + LFDUX f9, XX, INCX + LFDUX f10, XX, INCX + LFDUX f11, XX, INCX + LFDUX f12, XX, INCX + LFDUX f13, XX, INCX + LFDUX f14, XX, INCX + LFDUX f15, XX, INCX + bdz LL(1120) + .align 4 + +LL(1110): + addi RET, RET, 1 + fcmpu cr0, f1, f8 + LFDUX f8, XX, INCX + beq cr0, LL(9999) + +#ifdef PPCG4 + dcbt XX, PRE +#endif + + addi RET, RET, 1 + fcmpu cr0, f1, f9 + LFDUX f9, XX, INCX + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f10 + LFDUX f10, XX, INCX + beq cr0, LL(9999) + +#ifdef PPCG4 + dcbt XX, PRE +#endif + + addi RET, RET, 1 + fcmpu cr0, f1, f11 + LFDUX f11, XX, INCX + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f12 + LFDUX f12, XX, INCX + beq cr0, LL(9999) + +#ifdef PPCG4 + dcbt XX, PRE +#endif + + addi RET, RET, 1 + fcmpu cr0, f1, f13 + LFDUX f13, XX, INCX + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f14 + LFDUX f14, XX, INCX + beq cr0, LL(9999) + +#ifdef PPCG4 + dcbt XX, PRE +#endif + + addi RET, RET, 1 + fcmpu cr0, f1, f15 + LFDUX f15, XX, INCX + beq cr0, LL(9999) + bdnz LL(1110) + .align 4 + +LL(1120): + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f9 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f10 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f11 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f12 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f13 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f14 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f15 + beq cr0, LL(9999) + .align 4 + +LL(1150): + andi. r0, NN, 7 + mtspr CTR, r0 + beq LL(9999) + .align 4 + +LL(1160): + LFDUX f8, XX, INCX + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + bdnz LL(1160) + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/imin.S b/kernel/power/imin.S new file mode 100644 index 0000000000..2dd774d102 --- /dev/null +++ b/kernel/power/imin.S @@ -0,0 +1,684 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define RET r3 +#define X r4 +#define INCX r5 + +#define N r6 +#define NN r7 +#define XX r8 +#define PREA r9 + +#define FZERO f1 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(r3) + LDINT INCX, 0(INCX) +#else + mr N, r3 +#endif + + li RET, 0 + mr NN, N + mr XX, X + + slwi INCX, INCX, BASE_SHIFT + + li PREA, L1_PREFETCHSIZE + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFD f1, 0 * SIZE(X) + add X, X, INCX + + fmr f0, f1 + fmr f2, f1 + fmr f3, f1 + fmr f4, f1 + fmr f5, f1 + fmr f6, f1 + fmr f7, f1 + + subi N, N, 1 + + cmpwi cr0, INCX, SIZE + bne- cr0, LL(100) + + srawi. 
r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(50) + + LFD f16, 0 * SIZE(X) + LFD f17, 1 * SIZE(X) + LFD f18, 2 * SIZE(X) + LFD f19, 3 * SIZE(X) + LFD f20, 4 * SIZE(X) + LFD f21, 5 * SIZE(X) + LFD f22, 6 * SIZE(X) + LFD f23, 7 * SIZE(X) + + LFD f24, 8 * SIZE(X) + LFD f25, 9 * SIZE(X) + LFD f26, 10 * SIZE(X) + LFD f27, 11 * SIZE(X) + LFD f28, 12 * SIZE(X) + LFD f29, 13 * SIZE(X) + LFD f30, 14 * SIZE(X) + LFD f31, 15 * SIZE(X) + + fsub f8, f0, f16 + fsub f9, f1, f17 + fsub f10, f2, f18 + fsub f11, f3, f19 + fsub f12, f4, f20 + fsub f13, f5, f21 + fsub f14, f6, f22 + fsub f15, f7, f23 + bdz LL(20) + .align 4 + +LL(10): + fsel f0, f8, f16, f0 + fsub f8, f0, f24 + fsel f1, f9, f17, f1 + fsub f9, f1, f25 + fsel f2, f10, f18, f2 + fsub f10, f2, f26 + fsel f3, f11, f19, f3 + fsub f11, f3, f27 + + LFD f16, 16 * SIZE(X) + LFD f17, 17 * SIZE(X) + LFD f18, 18 * SIZE(X) + LFD f19, 19 * SIZE(X) + + fsel f4, f12, f20, f4 + fsub f12, f4, f28 + fsel f5, f13, f21, f5 + fsub f13, f5, f29 + fsel f6, f14, f22, f6 + fsub f14, f6, f30 + fsel f7, f15, f23, f7 + fsub f15, f7, f31 + + LFD f20, 20 * SIZE(X) + LFD f21, 21 * SIZE(X) + LFD f22, 22 * SIZE(X) + LFD f23, 23 * SIZE(X) + + fsel f0, f8, f24, f0 + fsub f8, f0, f16 + fsel f1, f9, f25, f1 + fsub f9, f1, f17 + fsel f2, f10, f26, f2 + fsub f10, f2, f18 + fsel f3, f11, f27, f3 + fsub f11, f3, f19 + + LFD f24, 24 * SIZE(X) + LFD f25, 25 * SIZE(X) + LFD f26, 26 * SIZE(X) + LFD f27, 27 * SIZE(X) + + fsel f4, f12, f28, f4 + fsub f12, f4, f20 + fsel f5, f13, f29, f5 + fsub f13, f5, f21 + fsel f6, f14, f30, f6 + fsub f14, f6, f22 + fsel f7, f15, f31, f7 + fsub f15, f7, f23 + + LFD f28, 28 * SIZE(X) + LFD f29, 29 * SIZE(X) + LFD f30, 30 * SIZE(X) + LFD f31, 31 * SIZE(X) + +#ifndef POWER6 + L1_PREFETCH X, PREA +#endif + addi X, X, 16 * SIZE +#ifdef POWER6 + L1_PREFETCH X, PREA +#endif + + bdnz LL(10) + .align 4 + +LL(20): + fsel f0, f8, f16, f0 + fsub f8, f0, f24 + fsel f1, f9, f17, f1 + fsub f9, f1, f25 + fsel f2, f10, f18, f2 + fsub f10, f2, f26 + fsel f3, f11, f19, f3 + fsub f11, f3, f27 + + fsel f4, f12, f20, f4 + fsub f12, f4, f28 + fsel f5, f13, f21, f5 + fsub f13, f5, f29 + fsel f6, f14, f22, f6 + fsub f14, f6, f30 + fsel f7, f15, f23, f7 + fsub f15, f7, f31 + + fsel f0, f8, f24, f0 + fsel f1, f9, f25, f1 + fsel f2, f10, f26, f2 + fsel f3, f11, f27, f3 + fsel f4, f12, f28, f4 + fsel f5, f13, f29, f5 + fsel f6, f14, f30, f6 + fsel f7, f15, f31, f7 + + addi X, X, 16 * SIZE + .align 4 + +LL(50): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + addi X, X, 1 * SIZE + fsub f16, f1, f8 + fsel f1, f16, f8, f1 + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): + sub X, X, INCX + + srawi. 
r0, N, 4 + mtspr CTR, r0 + beq- LL(150) + + LFDUX f16, X, INCX + LFDUX f17, X, INCX + LFDUX f18, X, INCX + LFDUX f19, X, INCX + LFDUX f20, X, INCX + LFDUX f21, X, INCX + LFDUX f22, X, INCX + LFDUX f23, X, INCX + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + + fsub f8, f0, f16 + fsub f9, f1, f17 + fsub f10, f2, f18 + fsub f11, f3, f19 + fsub f12, f4, f20 + fsub f13, f5, f21 + fsub f14, f6, f22 + fsub f15, f7, f23 + bdz LL(120) + .align 4 + +LL(110): + fsel f0, f8, f16, f0 + fsub f8, f0, f24 + fsel f1, f9, f17, f1 + fsub f9, f1, f25 + fsel f2, f10, f18, f2 + fsub f10, f2, f26 + fsel f3, f11, f19, f3 + fsub f11, f3, f27 + + LFDUX f16, X, INCX + LFDUX f17, X, INCX + LFDUX f18, X, INCX + LFDUX f19, X, INCX + + fsel f4, f12, f20, f4 + fsub f12, f4, f28 + fsel f5, f13, f21, f5 + fsub f13, f5, f29 + fsel f6, f14, f22, f6 + fsub f14, f6, f30 + fsel f7, f15, f23, f7 + fsub f15, f7, f31 + + LFDUX f20, X, INCX + LFDUX f21, X, INCX + LFDUX f22, X, INCX + LFDUX f23, X, INCX + + fsel f0, f8, f24, f0 + fsub f8, f0, f16 + fsel f1, f9, f25, f1 + fsub f9, f1, f17 + fsel f2, f10, f26, f2 + fsub f10, f2, f18 + fsel f3, f11, f27, f3 + fsub f11, f3, f19 + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + + fsel f4, f12, f28, f4 + fsub f12, f4, f20 + fsel f5, f13, f29, f5 + fsub f13, f5, f21 + fsel f6, f14, f30, f6 + fsub f14, f6, f22 + fsel f7, f15, f31, f7 + fsub f15, f7, f23 + + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + bdnz LL(110) + .align 4 + +LL(120): + fsel f0, f8, f16, f0 + fsub f8, f0, f24 + fsel f1, f9, f17, f1 + fsub f9, f1, f25 + fsel f2, f10, f18, f2 + fsub f10, f2, f26 + fsel f3, f11, f19, f3 + fsub f11, f3, f27 + + fsel f4, f12, f20, f4 + fsub f12, f4, f28 + fsel f5, f13, f21, f5 + fsub f13, f5, f29 + fsel f6, f14, f22, f6 + fsub f14, f6, f30 + fsel f7, f15, f23, f7 + fsub f15, f7, f31 + + fsel f0, f8, f24, f0 + fsel f1, f9, f25, f1 + fsel f2, f10, f26, f2 + fsel f3, f11, f27, f3 + fsel f4, f12, f28, f4 + fsel f5, f13, f29, f5 + fsel f6, f14, f30, f6 + fsel f7, f15, f31, f7 + .align 4 + +LL(150): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + fsub f16, f1, f8 + fsel f1, f16, f8, f1 + bdnz LL(160) + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + fsub f10, f4, f5 + fsub f11, f6, f7 + + fsel f0, f8, f1, f0 + fsel f2, f9, f3, f2 + fsel f4, f10, f5, f4 + fsel f6, f11, f7, f6 + + fsub f8, f0, f2 + fsub f9, f4, f6 + fsel f0, f8, f2, f0 + fsel f4, f9, f6, f4 + + fsub f8, f0, f4 + fsel f1, f8, f4, f0 + .align 4 + +LL(1000): + cmpwi cr0, INCX, SIZE + bne- cr0, LL(1100) + + srawi. 
r0, NN, 3 + mtspr CTR, r0 + beq- cr0, LL(1050) + + LFD f8, 0 * SIZE(XX) + LFD f9, 1 * SIZE(XX) + LFD f10, 2 * SIZE(XX) + LFD f11, 3 * SIZE(XX) + LFD f12, 4 * SIZE(XX) + LFD f13, 5 * SIZE(XX) + LFD f14, 6 * SIZE(XX) + LFD f15, 7 * SIZE(XX) + bdz LL(1020) + .align 4 + +LL(1010): + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f9 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f10 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f11 + beq cr0, LL(9999) + + LFD f8, 8 * SIZE(XX) + LFD f9, 9 * SIZE(XX) + LFD f10, 10 * SIZE(XX) + LFD f11, 11 * SIZE(XX) + + addi RET, RET, 1 + fcmpu cr0, f1, f12 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f13 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f14 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f15 + beq cr0, LL(9999) + + LFD f12, 12 * SIZE(XX) + LFD f13, 13 * SIZE(XX) + LFD f14, 14 * SIZE(XX) + LFD f15, 15 * SIZE(XX) + + addi XX, XX, 8 * SIZE + bdnz LL(1010) + .align 4 + +LL(1020): + addi XX, XX, 8 * SIZE + + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f9 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f10 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f11 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f12 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f13 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f14 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f15 + beq cr0, LL(9999) + .align 4 + +LL(1050): + andi. r0, NN, 7 + mtspr CTR, r0 + beq LL(9999) + .align 4 + +LL(1060): + LFD f8, 0 * SIZE(XX) + addi XX, XX, 1 * SIZE + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + bdnz LL(1060) + b LL(9999) + .align 4 + +LL(1100): + sub XX, XX, INCX + + srawi. r0, NN, 3 + mtspr CTR, r0 + beq- LL(1150) + + LFDUX f8, XX, INCX + LFDUX f9, XX, INCX + LFDUX f10, XX, INCX + LFDUX f11, XX, INCX + LFDUX f12, XX, INCX + LFDUX f13, XX, INCX + LFDUX f14, XX, INCX + LFDUX f15, XX, INCX + bdz LL(1120) + .align 4 + +LL(1110): + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f9 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f10 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f11 + beq cr0, LL(9999) + + LFDUX f8, XX, INCX + LFDUX f9, XX, INCX + LFDUX f10, XX, INCX + LFDUX f11, XX, INCX + + addi RET, RET, 1 + fcmpu cr0, f1, f12 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f13 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f14 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f15 + beq cr0, LL(9999) + + LFDUX f12, XX, INCX + LFDUX f13, XX, INCX + LFDUX f14, XX, INCX + LFDUX f15, XX, INCX + + bdnz LL(1110) + .align 4 + +LL(1120): + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f9 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f10 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f11 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f12 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f13 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f14 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f15 + beq cr0, LL(9999) + .align 4 + +LL(1150): + andi. 
r0, NN, 7 + mtspr CTR, r0 + beq LL(9999) + .align 4 + +LL(1160): + LFDUX f8, XX, INCX + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + bdnz LL(1160) + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/imin_hummer.S b/kernel/power/imin_hummer.S new file mode 100644 index 0000000000..d333329f6c --- /dev/null +++ b/kernel/power/imin_hummer.S @@ -0,0 +1,867 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define INCX2 r6 +#define X2 r7 + +#define XX r8 +#define RET r9 +#define NN r10 + +#define C1 f1 +#define C2 f0 +#define C3 f2 +#define C4 f3 + +#define A1 f4 +#define A2 f5 +#define A3 f6 +#define A4 f7 +#define A5 f8 +#define A6 f9 +#define A7 f10 +#define A8 f11 + +#define F1 f12 +#define F2 f13 +#define F3 f14 +#define F4 f15 +#define F5 f16 +#define F6 f17 +#define F7 f18 +#define F8 f19 + + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + + stfpdux f16, SP, r10 + stfpdux f17, SP, r10 + stfpdux f18, SP, r10 + stfpdux f19, SP, r10 + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, BASE_SHIFT + add INCX2, INCX, INCX + + li RET, 0 + cmpwi cr0, N, 0 + ble LL(999) + mr NN, N + cmpwi cr0, INCX, 0 + ble LL(999) + + LFD C1, 0 * SIZE(X) + + addi N, N, -1 + cmpwi cr0, N, 0 + li RET, 1 + ble LL(999) + + fsmfp C1, C1 + mr XX, X + fpmr C2, C1 + add X, X, INCX + fpmr C3, C1 + fpmr C4, C1 + + cmpwi cr0, INCX, SIZE + bne LL(100) + + andi. r0, X, 2 * SIZE - 1 + beq LL(05) + + LFD C2, 0 * SIZE(X) + add X, X, INCX + + addi N, N, -1 + cmpwi cr0, N, 0 + ble LL(20) + .align 4 + +LL(05): + sub X, X, INCX2 + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(15) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + LFPDUX A5, X, INCX2 + LFPDUX A6, X, INCX2 + LFPDUX A7, X, INCX2 + LFPDUX A8, X, INCX2 + bdz LL(13) + .align 4 + +LL(12): + fpsub F1, A1, C1 + fpsub F2, A2, C2 + fpsub F3, A3, C3 + fpsub F4, A4, C4 + + fpsel C1, F1, C1, A1 + LFPDUX A1, X, INCX2 + fpsel C2, F2, C2, A2 + LFPDUX A2, X, INCX2 + fpsel C3, F3, C3, A3 + LFPDUX A3, X, INCX2 + fpsel C4, F4, C4, A4 + LFPDUX A4, X, INCX2 + + fpsub F5, A5, C1 + fpsub F6, A6, C2 + fpsub F7, A7, C3 + fpsub F8, A8, C4 + + fpsel C1, F5, C1, A5 + LFPDUX A5, X, INCX2 + fpsel C2, F6, C2, A6 + LFPDUX A6, X, INCX2 + fpsel C3, F7, C3, A7 + LFPDUX A7, X, INCX2 + fpsel C4, F8, C4, A8 + LFPDUX A8, X, INCX2 + + bdnz LL(12) + .align 4 + +LL(13): + fpsub F1, A1, C1 + fpsub F2, A2, C2 + fpsub F3, A3, C3 + fpsub F4, A4, C4 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + + fpsub F5, A5, C1 + fpsub F6, A6, C2 + fpsub F7, A7, C3 + fpsub F8, A8, C4 + + fpsel C1, F5, C1, A5 + fpsel C2, F6, C2, A6 + fpsel C3, F7, C3, A7 + fpsel C4, F8, C4, A8 + .align 4 + +LL(15): + andi. r0, N, 15 + beq LL(20) + + andi. r0, N, 8 + beq LL(16) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + fpsub F1, A1, C1 + fpsub F2, A2, C2 + fpsub F3, A3, C3 + fpsub F4, A4, C4 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + .align 4 + +LL(16): + andi. r0, N, 4 + beq LL(17) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + + fpsub F1, A1, C1 + fpsub F2, A2, C2 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + .align 4 + +LL(17): + andi. r0, N, 2 + beq LL(18) + + LFPDUX A1, X, INCX2 + fpsub F1, A1, C1 + fpsel C1, F1, C1, A1 + .align 4 + +LL(18): + andi. r0, N, 1 + beq LL(20) + + LFDUX A1, X, INCX2 + fsub F1, A1, C1 + fsel C1, F1, C1, A1 + .align 4 + +LL(20): + fpsub F1, C2, C1 + fpsub F2, C4, C3 + + fpsel C1, F1, C1, C2 + fpsel C3, F2, C3, C4 + + fpsub F1, C3, C1 + fpsel C1, F1, C1, C3 + + fsmtp C2, C1 + + fsub F1, C2, C1 + fsel C1, F1, C1, C2 + li RET, 0 + + fsmfp C1, C1 + andi. 
r0, XX, 2 * SIZE - 1 + beq LL(21) + + LFD A1, 0 * SIZE(XX) + add XX, XX, INCX + + addi NN, NN, -1 + addi RET, RET, 1 + + fcmpu cr0, C1, A1 + beq cr0, LL(999) + .align 4 + +LL(21): + sub XX, XX, INCX2 + + srawi. r0, NN, 4 + mtspr CTR, r0 + beq- LL(25) + + LFPDUX A1, XX, INCX2 + LFPDUX A2, XX, INCX2 + LFPDUX A3, XX, INCX2 + LFPDUX A4, XX, INCX2 + + LFPDUX A5, XX, INCX2 + LFPDUX A6, XX, INCX2 + LFPDUX A7, XX, INCX2 + LFPDUX A8, XX, INCX2 + bdz LL(23) + .align 4 + +LL(22): + addi RET, RET, 1 + fcmpu cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A1 + LFPDUX A1, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A2 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A2 + LFPDUX A2, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A3 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A3 + LFPDUX A3, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A4 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A4 + LFPDUX A4, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A5 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A5 + LFPDUX A5, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A6 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A6 + LFPDUX A6, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A7 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A7 + LFPDUX A7, XX, INCX2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A8 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A8 + LFPDUX A8, XX, INCX2 + beq cr0, LL(999) + bdnz LL(22) + .align 4 + +LL(23): + addi RET, RET, 1 + fcmpu cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A2 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A3 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A3 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A4 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A4 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A5 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A5 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A6 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A6 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A7 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A7 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A8 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A8 + beq cr0, LL(999) + .align 4 + +LL(25): + andi. r0, NN, 8 + beq LL(26) + + LFPDUX A1, XX, INCX2 + LFPDUX A2, XX, INCX2 + LFPDUX A3, XX, INCX2 + LFPDUX A4, XX, INCX2 + + addi RET, RET, 1 + fcmpu cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A2 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A3 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A3 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A4 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A4 + beq cr0, LL(999) + .align 4 + +LL(26): + andi. 
r0, NN, 4 + beq LL(27) + + LFPDUX A1, XX, INCX2 + LFPDUX A2, XX, INCX2 + + addi RET, RET, 1 + fcmpu cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A2 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A2 + beq cr0, LL(999) + .align 4 + +LL(27): + andi. r0, NN, 2 + beq LL(28) + + LFPDUX A1, XX, INCX2 + + addi RET, RET, 1 + fcmpu cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A1 + beq cr0, LL(999) + .align 4 + +LL(28): + addi RET, RET, 1 + b LL(999) + .align 4 + +LL(100): + sub X, X, INCX + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(105) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + LFSDUX A1, X, INCX + LFSDUX A2, X, INCX + LFSDUX A3, X, INCX + LFSDUX A4, X, INCX + + LFDUX A5, X, INCX + LFDUX A6, X, INCX + LFDUX A7, X, INCX + LFDUX A8, X, INCX + bdz LL(103) + .align 4 + +LL(102): + fpsub F1, A1, C1 + LFSDUX A5, X, INCX + fpsub F2, A2, C2 + LFSDUX A6, X, INCX + fpsub F3, A3, C3 + LFSDUX A7, X, INCX + fpsub F4, A4, C4 + LFSDUX A8, X, INCX + + fpsel C1, F1, C1, A1 + LFDUX A1, X, INCX + fpsel C2, F2, C2, A2 + LFDUX A2, X, INCX + fpsel C3, F3, C3, A3 + LFDUX A3, X, INCX + fpsel C4, F4, C4, A4 + LFDUX A4, X, INCX + + fpsub F5, A5, C1 + LFSDUX A1, X, INCX + fpsub F6, A6, C2 + LFSDUX A2, X, INCX + fpsub F7, A7, C3 + LFSDUX A3, X, INCX + fpsub F8, A8, C4 + LFSDUX A4, X, INCX + + fpsel C1, F5, C1, A5 + LFDUX A5, X, INCX + fpsel C2, F6, C2, A6 + LFDUX A6, X, INCX + fpsel C3, F7, C3, A7 + LFDUX A7, X, INCX + fpsel C4, F8, C4, A8 + LFDUX A8, X, INCX + bdnz LL(102) + .align 4 + +LL(103): + fpsub F1, A1, C1 + LFSDUX A5, X, INCX + fpsub F2, A2, C2 + LFSDUX A6, X, INCX + fpsub F3, A3, C3 + LFSDUX A7, X, INCX + fpsub F4, A4, C4 + LFSDUX A8, X, INCX + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + + fpsub F5, A5, C1 + fpsub F6, A6, C2 + fpsub F7, A7, C3 + fpsub F8, A8, C4 + + fpsel C1, F5, C1, A5 + fpsel C2, F6, C2, A6 + fpsel C3, F7, C3, A7 + fpsel C4, F8, C4, A8 + .align 4 + +LL(105): + andi. r0, N, 15 + beq LL(120) + + andi. r0, N, 8 + beq LL(106) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + LFSDUX A1, X, INCX + LFSDUX A2, X, INCX + LFSDUX A3, X, INCX + LFSDUX A4, X, INCX + + fpsub F1, A1, C1 + fpsub F2, A2, C2 + fpsub F3, A3, C3 + fpsub F4, A4, C4 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + .align 4 + +LL(106): + andi. r0, N, 4 + beq LL(107) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + + fsub F1, A1, C1 + fsub F2, A2, C2 + fsub F3, A3, C3 + fsub F4, A4, C4 + + fsel C1, F1, C1, A1 + fsel C2, F2, C2, A2 + fsel C3, F3, C3, A3 + fsel C4, F4, C4, A4 + .align 4 + +LL(107): + andi. r0, N, 2 + beq LL(108) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + + fsub F1, A1, C1 + fsub F2, A2, C2 + fsel C1, F1, C1, A1 + fsel C2, F2, C2, A2 + .align 4 + +LL(108): + andi. r0, N, 1 + beq LL(120) + + LFDUX A1, X, INCX + fsub F1, A1, C1 + fsel C1, F1, C1, A1 + .align 4 + +LL(120): + fpsub F1, C2, C1 + fpsub F2, C4, C3 + + fpsel C1, F1, C1, C2 + fpsel C3, F2, C3, C4 + + fpsub F1, C3, C1 + fpsel C1, F1, C1, C3 + + fsmtp C2, C1 + + fsub F1, C2, C1 + fsel C1, F1, C1, C2 + + li RET, 0 + + sub XX, XX, INCX + + srawi. 
r0, NN, 3 + mtspr CTR, r0 + beq- LL(126) + + LFDUX A1, XX, INCX + LFDUX A2, XX, INCX + LFDUX A3, XX, INCX + LFDUX A4, XX, INCX + + LFDUX A5, XX, INCX + LFDUX A6, XX, INCX + LFDUX A7, XX, INCX + LFDUX A8, XX, INCX + bdz LL(123) + .align 4 + +LL(122): + addi RET, RET, 1 + fcmpu cr0, C1, A1 + LFDUX A1, XX, INCX + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A2 + LFDUX A2, XX, INCX + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A3 + LFDUX A3, XX, INCX + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A4 + LFDUX A4, XX, INCX + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A5 + LFDUX A5, XX, INCX + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A6 + LFDUX A6, XX, INCX + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A7 + LFDUX A7, XX, INCX + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A8 + LFDUX A8, XX, INCX + beq cr0, LL(999) + bdnz LL(122) + .align 4 + +LL(123): + addi RET, RET, 1 + fcmpu cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A3 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A4 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A5 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A6 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A7 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A8 + beq cr0, LL(999) + .align 4 + +LL(126): + andi. r0, NN, 4 + beq LL(127) + + LFDUX A1, XX, INCX + LFDUX A2, XX, INCX + LFDUX A3, XX, INCX + LFDUX A4, XX, INCX + + addi RET, RET, 1 + fcmpu cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A2 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A3 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A4 + beq cr0, LL(999) + .align 4 + +LL(127): + andi. r0, NN, 2 + beq LL(128) + + LFDUX A1, XX, INCX + LFDUX A2, XX, INCX + + addi RET, RET, 1 + fcmpu cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A2 + beq cr0, LL(999) + .align 4 + +LL(128): + addi RET, RET, 1 + .align 4 + +LL(999): + li r10, 16 + addi SP, SP, -16 + mr r3, RET + + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + blr + + EPILOGUE diff --git a/kernel/power/imin_ppc440.S b/kernel/power/imin_ppc440.S new file mode 100644 index 0000000000..4e1185d1ab --- /dev/null +++ b/kernel/power/imin_ppc440.S @@ -0,0 +1,414 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define RET r3 +#define X r4 +#define INCX r5 + +#define N r6 +#define NN r7 +#define XX r8 +#define PRE r9 + +#define FZERO f1 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(r3) + LDINT INCX, 0(INCX) +#else + mr N, r3 +#endif + + li RET, 0 + li PRE, 3 * 16 * SIZE + + slwi INCX, INCX, BASE_SHIFT + sub X, X, INCX + + mr NN, N + mr XX, X + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFDUX f1, X, INCX + + fmr f0, f1 + fmr f2, f1 + fmr f3, f1 + fmr f4, f1 + subi N, N, 1 + fmr f5, f1 + srawi. 
r0, N, 4 + fmr f6, f1 + mtspr CTR, r0 + fmr f7, f1 + beq- LL(150) + + LFDUX f16, X, INCX + LFDUX f17, X, INCX + LFDUX f18, X, INCX + LFDUX f19, X, INCX + LFDUX f20, X, INCX + LFDUX f21, X, INCX + LFDUX f22, X, INCX + LFDUX f23, X, INCX + + LFDUX f24, X, INCX + fsub f8, f0, f16 + LFDUX f25, X, INCX + fsub f9, f1, f17 + LFDUX f26, X, INCX + fsub f10, f2, f18 + LFDUX f27, X, INCX + fsub f11, f3, f19 + LFDUX f28, X, INCX + fsub f12, f4, f20 + LFDUX f29, X, INCX + fsub f13, f5, f21 + LFDUX f30, X, INCX + fsub f14, f6, f22 + LFDUX f31, X, INCX + fsub f15, f7, f23 + bdz LL(120) + .align 4 + +LL(110): + fsel f0, f8, f16, f0 + LFDUX f16, X, INCX + fsub f8, f0, f24 +#ifdef PPCG4 + dcbt X, PRE +#endif + fsel f1, f9, f17, f1 + LFDUX f17, X, INCX + fsub f9, f1, f25 + fsel f2, f10, f18, f2 + LFDUX f18, X, INCX + fsub f10, f2, f26 + fsel f3, f11, f19, f3 + LFDUX f19, X, INCX + fsub f11, f3, f27 + + fsel f4, f12, f20, f4 + LFDUX f20, X, INCX + fsub f12, f4, f28 +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PRE +#endif + fsel f5, f13, f21, f5 + LFDUX f21, X, INCX + fsub f13, f5, f29 + fsel f6, f14, f22, f6 + LFDUX f22, X, INCX + fsub f14, f6, f30 + fsel f7, f15, f23, f7 + LFDUX f23, X, INCX + fsub f15, f7, f31 + + fsel f0, f8, f24, f0 + LFDUX f24, X, INCX + fsub f8, f0, f16 +#ifdef PPCG4 + dcbt X, PRE +#endif + fsel f1, f9, f25, f1 + LFDUX f25, X, INCX + fsub f9, f1, f17 + fsel f2, f10, f26, f2 + LFDUX f26, X, INCX + fsub f10, f2, f18 + fsel f3, f11, f27, f3 + LFDUX f27, X, INCX + fsub f11, f3, f19 + + fsel f4, f12, f28, f4 + LFDUX f28, X, INCX + fsub f12, f4, f20 +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PRE +#endif + fsel f5, f13, f29, f5 + LFDUX f29, X, INCX + fsub f13, f5, f21 + fsel f6, f14, f30, f6 + LFDUX f30, X, INCX + fsub f14, f6, f22 + fsel f7, f15, f31, f7 + LFDUX f31, X, INCX + fsub f15, f7, f23 + bdnz LL(110) + .align 4 + +LL(120): + fsel f0, f8, f16, f0 + fsub f8, f0, f24 + fsel f1, f9, f17, f1 + fsub f9, f1, f25 + fsel f2, f10, f18, f2 + fsub f10, f2, f26 + fsel f3, f11, f19, f3 + fsub f11, f3, f27 + + fsel f4, f12, f20, f4 + fsub f12, f4, f28 + fsel f5, f13, f21, f5 + fsub f13, f5, f29 + fsel f6, f14, f22, f6 + fsub f14, f6, f30 + fsel f7, f15, f23, f7 + fsub f15, f7, f31 + + fsel f0, f8, f24, f0 + fsel f1, f9, f25, f1 + fsel f2, f10, f26, f2 + fsel f3, f11, f27, f3 + fsel f4, f12, f28, f4 + fsel f5, f13, f29, f5 + fsel f6, f14, f30, f6 + fsel f7, f15, f31, f7 + .align 4 + +LL(150): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + fsub f16, f1, f8 + fsel f1, f16, f8, f1 + bdnz LL(160) + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + fsub f10, f4, f5 + fsub f11, f6, f7 + + fsel f0, f8, f1, f0 + fsel f2, f9, f3, f2 + fsel f4, f10, f5, f4 + fsel f6, f11, f7, f6 + + fsub f8, f0, f2 + fsub f9, f4, f6 + fsel f0, f8, f2, f0 + fsel f4, f9, f6, f4 + + fsub f8, f0, f4 + fsel f1, f8, f4, f0 + .align 4 + +LL(1000): + srawi. 
r0, NN, 3 + mtspr CTR, r0 + beq- LL(1150) + + LFDUX f8, XX, INCX + LFDUX f9, XX, INCX + LFDUX f10, XX, INCX + LFDUX f11, XX, INCX + LFDUX f12, XX, INCX + LFDUX f13, XX, INCX + LFDUX f14, XX, INCX + LFDUX f15, XX, INCX + bdz LL(1120) + .align 4 + +LL(1110): + addi RET, RET, 1 + fcmpu cr0, f1, f8 + LFDUX f8, XX, INCX + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f9 + LFDUX f9, XX, INCX + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f10 + LFDUX f10, XX, INCX + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f11 + LFDUX f11, XX, INCX + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f12 + LFDUX f12, XX, INCX + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f13 + LFDUX f13, XX, INCX + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f14 + LFDUX f14, XX, INCX + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f15 + LFDUX f15, XX, INCX + beq cr0, LL(9999) + + bdnz LL(1110) + .align 4 + +LL(1120): + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f9 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f10 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f11 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f12 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f13 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f14 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f15 + beq cr0, LL(9999) + .align 4 + +LL(1150): + andi. r0, NN, 7 + mtspr CTR, r0 + beq LL(9999) + .align 4 + +LL(1160): + LFDUX f8, XX, INCX + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + bdnz LL(1160) + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/izamax.S b/kernel/power/izamax.S new file mode 100644 index 0000000000..48510477d1 --- /dev/null +++ b/kernel/power/izamax.S @@ -0,0 +1,919 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define RET r3 +#define X r4 +#define INCX r5 + +#define N r6 +#define NN r7 +#define XX r8 +#define PREA r9 +#define INCXM1 r10 + +#define FZERO f1 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(r3) + LDINT INCX, 0(INCX) +#else + mr N, r3 +#endif + + li RET, 0 + mr NN, N + mr XX, X + + slwi INCX, INCX, ZBASE_SHIFT + subi INCXM1, INCX, SIZE + + li PREA, L1_PREFETCHSIZE + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFD f1, 0 * SIZE(X) + LFD f2, 1 * SIZE(X) + add X, X, INCX + + fabs f1, f1 + fabs f2, f2 + fadd f1, f1, f2 + + fmr f0, f1 + fmr f2, f1 + fmr f3, f1 + + subi N, N, 1 + + cmpwi cr0, INCX, 2 * SIZE + bne- cr0, LL(100) + + srawi. 
r0, N, 3 + mtspr CTR, r0 + beq- cr0, LL(50) + .align 4 + + LFD f24, 0 * SIZE(X) + LFD f25, 1 * SIZE(X) + LFD f26, 2 * SIZE(X) + LFD f27, 3 * SIZE(X) + LFD f28, 4 * SIZE(X) + LFD f29, 5 * SIZE(X) + LFD f30, 6 * SIZE(X) + LFD f31, 7 * SIZE(X) + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFD f24, 8 * SIZE(X) + LFD f25, 9 * SIZE(X) + LFD f26, 10 * SIZE(X) + LFD f27, 11 * SIZE(X) + + LFD f28, 12 * SIZE(X) + LFD f29, 13 * SIZE(X) + LFD f30, 14 * SIZE(X) + LFD f31, 15 * SIZE(X) + bdz LL(20) + .align 4 + +LL(10): + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFD f24, 16 * SIZE(X) + LFD f25, 17 * SIZE(X) + LFD f26, 18 * SIZE(X) + LFD f27, 19 * SIZE(X) + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFD f28, 20 * SIZE(X) + LFD f29, 21 * SIZE(X) + LFD f30, 22 * SIZE(X) + LFD f31, 23 * SIZE(X) + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFD f24, 24 * SIZE(X) + LFD f25, 25 * SIZE(X) + LFD f26, 26 * SIZE(X) + LFD f27, 27 * SIZE(X) + + fsel f0, f16, f0, f4 + fsel f1, f17, f1, f5 + fsel f2, f18, f2, f6 + fsel f3, f19, f3, f7 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFD f28, 28 * SIZE(X) + LFD f29, 29 * SIZE(X) + LFD f30, 30 * SIZE(X) + LFD f31, 31 * SIZE(X) + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f0, f20 + fsel f1, f17, f1, f21 + fsel f2, f18, f2, f22 + fsel f3, f19, f3, f23 + +#ifndef POWER6 + L1_PREFETCH X, PREA +#endif + addi X, X, 16 * SIZE +#ifdef POWER6 + L1_PREFETCH X, PREA +#endif + bdnz LL(10) + .align 4 + +LL(20): + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fsel f0, f16, f0, f4 + fsel f1, f17, f1, f5 + fsel f2, f18, f2, f6 + fsel f3, f19, f3, f7 + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f0, f20 + fsel f1, f17, f1, f21 + fsel f2, f18, f2, f22 + fsel f3, f19, f3, f23 + + addi X, X, 16 * SIZE + .align 4 + +LL(50): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + addi X, X, 2 * SIZE + + fabs f8, f8 + fabs f9, f9 + fadd f8, f8, f9 + fsub f16, f1, f8 + fsel f1, f16, f1, f8 + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): + sub X, X, INCXM1 + + srawi. 
r0, N, 3 + mtspr CTR, r0 + beq- LL(150) + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + bdz LL(120) + .align 4 + +LL(110): + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + + fsel f0, f16, f0, f4 + fsel f1, f17, f1, f5 + fsel f2, f18, f2, f6 + fsel f3, f19, f3, f7 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f0, f20 + fsel f1, f17, f1, f21 + fsel f2, f18, f2, f22 + fsel f3, f19, f3, f23 + bdnz LL(110) + .align 4 + +LL(120): + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fsel f0, f16, f0, f4 + fsel f1, f17, f1, f5 + fsel f2, f18, f2, f6 + fsel f3, f19, f3, f7 + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f0, f20 + fsel f1, f17, f1, f21 + fsel f2, f18, f2, f22 + fsel f3, f19, f3, f23 + .align 4 + +LL(150): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDX f8, X, INCXM1 + LFDUX f9, X, INCX + + fabs f8, f8 + fabs f9, f9 + fadd f8, f8, f9 + fsub f16, f1, f8 + fsel f1, f16, f1, f8 + bdnz LL(160) + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + + fsel f0, f8, f0, f1 + fsel f2, f9, f2, f3 + fsub f8, f0, f2 + fsel f1, f8, f0, f2 + .align 4 + + +LL(1000): + cmpwi cr0, INCX, SIZE * 2 + bne- cr0, LL(1100) + + srawi. 
r0, NN, 3 + mtspr CTR, r0 + beq- cr0, LL(1050) + + LFD f24, 0 * SIZE(XX) + LFD f25, 1 * SIZE(XX) + LFD f26, 2 * SIZE(XX) + LFD f27, 3 * SIZE(XX) + LFD f28, 4 * SIZE(XX) + LFD f29, 5 * SIZE(XX) + LFD f30, 6 * SIZE(XX) + LFD f31, 7 * SIZE(XX) + bdz LL(1020) + .align 4 + +LL(1010): + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFD f24, 8 * SIZE(XX) + LFD f25, 9 * SIZE(XX) + LFD f26, 10 * SIZE(XX) + LFD f27, 11 * SIZE(XX) + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFD f28, 12 * SIZE(XX) + LFD f29, 13 * SIZE(XX) + LFD f30, 14 * SIZE(XX) + LFD f31, 15 * SIZE(XX) + + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + addi RET, RET, 1 + fcmpu cr0, f1, f4 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f5 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f6 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f7 + beq cr0, LL(9999) + + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFD f24, 16 * SIZE(XX) + LFD f25, 17 * SIZE(XX) + LFD f26, 18 * SIZE(XX) + LFD f27, 19 * SIZE(XX) + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFD f28, 20 * SIZE(XX) + LFD f29, 21 * SIZE(XX) + LFD f30, 22 * SIZE(XX) + LFD f31, 23 * SIZE(XX) + + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + addi RET, RET, 1 + fcmpu cr0, f1, f4 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f5 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f6 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f7 + beq cr0, LL(9999) + + addi XX, XX, 16 * SIZE + bdnz LL(1010) + .align 4 + +LL(1020): + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFD f24, 8 * SIZE(XX) + LFD f25, 9 * SIZE(XX) + LFD f26, 10 * SIZE(XX) + LFD f27, 11 * SIZE(XX) + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFD f28, 12 * SIZE(XX) + LFD f29, 13 * SIZE(XX) + LFD f30, 14 * SIZE(XX) + LFD f31, 15 * SIZE(XX) + + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + addi RET, RET, 1 + fcmpu cr0, f1, f4 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f5 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f6 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f7 + beq cr0, LL(9999) + + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + addi RET, RET, 1 + fcmpu cr0, f1, f4 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f5 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f6 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f7 + beq cr0, LL(9999) + + addi XX, XX, 16 * SIZE + .align 4 + +LL(1050): + andi. r0, NN, 7 + mtspr CTR, r0 + beq LL(9999) + .align 4 + +LL(1060): + LFD f8, 0 * SIZE(XX) + LFD f9, 1 * SIZE(XX) + addi XX, XX, 2 * SIZE + + fabs f8, f8 + fabs f9, f9 + fadd f8, f8, f9 + + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + bdnz LL(1060) + b LL(9999) + .align 4 + +LL(1100): + sub XX, XX, INCXM1 + + srawi. 
r0, NN, 3 + mtspr CTR, r0 + beq- LL(1150) + + LFDX f24, XX, INCXM1 + LFDUX f25, XX, INCX + LFDX f26, XX, INCXM1 + LFDUX f27, XX, INCX + LFDX f28, XX, INCXM1 + LFDUX f29, XX, INCX + LFDX f30, XX, INCXM1 + LFDUX f31, XX, INCX + bdz LL(1120) + .align 4 + +LL(1110): + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFDX f24, XX, INCXM1 + LFDUX f25, XX, INCX + LFDX f26, XX, INCXM1 + LFDUX f27, XX, INCX + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDX f28, XX, INCXM1 + LFDUX f29, XX, INCX + LFDX f30, XX, INCXM1 + LFDUX f31, XX, INCX + + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + addi RET, RET, 1 + fcmpu cr0, f1, f4 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f5 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f6 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f7 + beq cr0, LL(9999) + + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFDX f24, XX, INCXM1 + LFDUX f25, XX, INCX + LFDX f26, XX, INCXM1 + LFDUX f27, XX, INCX + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDX f28, XX, INCXM1 + LFDUX f29, XX, INCX + LFDX f30, XX, INCXM1 + LFDUX f31, XX, INCX + + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + addi RET, RET, 1 + fcmpu cr0, f1, f4 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f5 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f6 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f7 + beq cr0, LL(9999) + + bdnz LL(1110) + .align 4 + +LL(1120): + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFDX f24, XX, INCXM1 + LFDUX f25, XX, INCX + LFDX f26, XX, INCXM1 + LFDUX f27, XX, INCX + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDX f28, XX, INCXM1 + LFDUX f29, XX, INCX + LFDX f30, XX, INCXM1 + LFDUX f31, XX, INCX + + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + addi RET, RET, 1 + fcmpu cr0, f1, f4 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f5 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f6 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f7 + beq cr0, LL(9999) + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + addi RET, RET, 1 + fcmpu cr0, f1, f4 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f5 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f6 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f7 + beq cr0, LL(9999) + .align 4 + +LL(1150): + andi. 
r0, NN, 7 + mtspr CTR, r0 + beq LL(9999) + .align 4 + +LL(1160): + LFDX f8, XX, INCXM1 + LFDUX f9, XX, INCX + + fabs f8, f8 + fabs f9, f9 + fadd f8, f8, f9 + + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + bdnz LL(1160) + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/izamax_hummer.S b/kernel/power/izamax_hummer.S new file mode 100644 index 0000000000..8dffa0c0c8 --- /dev/null +++ b/kernel/power/izamax_hummer.S @@ -0,0 +1,566 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define INCX2 r6 +#define X2 r7 + +#define XX r8 +#define RET r9 +#define NN r10 + +#define C1 f1 +#define C2 f0 +#define C3 f2 +#define C4 f3 + +#define A1 f4 +#define A2 f5 +#define A3 f6 +#define A4 f7 +#define A5 f8 +#define A6 f9 +#define A7 f10 +#define A8 f11 + +#define F1 f12 +#define F2 f13 +#define F3 f14 +#define F4 f15 + +#define T1 f16 +#define T2 f17 +#define T3 f18 +#define T4 f19 + +#define B1 f20 +#define B2 f21 +#define B3 f22 +#define B4 f23 +#define B5 f24 +#define B6 f25 +#define B7 f26 +#define B8 f27 + + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + + stfpdux f16, SP, r10 + stfpdux f17, SP, r10 + stfpdux f18, SP, r10 + stfpdux f19, SP, r10 + + stfpdux f20, SP, r10 + stfpdux f21, SP, r10 + stfpdux f22, SP, r10 + stfpdux f23, SP, r10 + + stfpdux f24, SP, r10 + stfpdux f25, SP, r10 + stfpdux f26, SP, r10 + stfpdux f27, SP, r10 + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, BASE_SHIFT + add INCX2, INCX, INCX + + li RET, 0 + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, INCX, 0 + mr NN, N + ble LL(999) + + mr XX, X + + LFD A1, 0 * SIZE(X) + LFD A2, 1 * SIZE(X) + add X, X, INCX2 + li RET, 1 + + fabs A1, A1 + fabs A2, A2 + + subi INCX2, INCX2, SIZE + + addi N, N, -1 + cmpwi cr0, N, 0 + fadd C1, A1, A2 + ble LL(999) + + fsmfp C1, C1 + li INCX, SIZE + fpmr C2, C1 + sub X, X, INCX2 + fpmr C3, C1 + srawi. r0, N, 3 + fpmr C4, C1 + mtspr CTR, r0 + beq- LL(105) + + LFDUX A1, X, INCX2 + LFDUX A2, X, INCX + LFDUX A3, X, INCX2 + LFDUX A4, X, INCX + + LFSDUX A1, X, INCX2 + LFSDUX A2, X, INCX + LFSDUX A3, X, INCX2 + LFSDUX A4, X, INCX + + LFDUX A5, X, INCX2 + LFDUX A6, X, INCX + LFDUX A7, X, INCX2 + LFDUX A8, X, INCX + + LFSDUX A5, X, INCX2 + LFSDUX A6, X, INCX + LFSDUX A7, X, INCX2 + LFSDUX A8, X, INCX + bdz LL(103) + .align 4 + +LL(102): + fpabs B1, A1 + LFDUX A1, X, INCX2 + fpabs B2, A2 + LFDUX A2, X, INCX + fpabs B3, A3 + LFDUX A3, X, INCX2 + fpabs B4, A4 + LFDUX A4, X, INCX + + fpabs B5, A5 + LFSDUX A1, X, INCX2 + fpabs B6, A6 + LFSDUX A2, X, INCX + fpabs B7, A7 + LFSDUX A3, X, INCX2 + fpabs B8, A8 + LFSDUX A4, X, INCX + + fpadd T1, B1, B2 + LFDUX A5, X, INCX2 + fpadd T2, B3, B4 + LFDUX A6, X, INCX + fpadd T3, B5, B6 + LFDUX A7, X, INCX2 + fpadd T4, B7, B8 + LFDUX A8, X, INCX + + fpsub F1, C1, T1 + LFSDUX A5, X, INCX2 + fpsub F2, C2, T2 + LFSDUX A6, X, INCX + fpsub F3, C3, T3 + LFSDUX A7, X, INCX2 + fpsub F4, C4, T4 + LFSDUX A8, X, INCX + + fpsel C1, F1, C1, T1 + fpsel C2, F2, C2, T2 + fpsel C3, F3, C3, T3 + fpsel C4, F4, C4, T4 + bdnz LL(102) + .align 4 + +LL(103): + fpabs B1, A1 + fpabs B2, A2 + fpabs B3, A3 + fpabs B4, A4 + + fpabs B5, A5 + fpabs B6, A6 + fpabs B7, A7 + fpabs B8, A8 + + fpadd T1, B1, B2 + fpadd T2, B3, B4 + fpadd T3, B5, B6 + fpadd T4, B7, B8 + + fpsub F1, C1, T1 + fpsub F2, C2, T2 + fpsub F3, C3, T3 + fpsub F4, C4, T4 + + fpsel C1, F1, C1, T1 + fpsel C2, F2, C2, T2 + fpsel C3, F3, C3, T3 + fpsel C4, F4, C4, T4 + .align 4 + +LL(105): + andi. r0, N, 7 + beq LL(120) + + andi. 
r0, N, 4 + beq LL(106) + + LFDUX A1, X, INCX2 + LFDUX A2, X, INCX + LFDUX A3, X, INCX2 + LFDUX A4, X, INCX + + LFSDUX A1, X, INCX2 + LFSDUX A2, X, INCX + LFSDUX A3, X, INCX2 + LFSDUX A4, X, INCX + + fpabs A1, A1 + fpabs A2, A2 + fpabs A3, A3 + fpabs A4, A4 + + fpadd A1, A1, A2 + fpadd A3, A3, A4 + + fpsub F1, C1, A1 + fpsub F2, C2, A3 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A3 + .align 4 + +LL(106): + andi. r0, N, 2 + beq LL(107) + + LFDUX A1, X, INCX2 + LFDUX A2, X, INCX + LFSDUX A1, X, INCX2 + LFSDUX A2, X, INCX + + fpabs A1, A1 + fpabs A2, A2 + + fpadd A1, A1, A2 + + fpsub F1, C1, A1 + fpsel C1, F1, C1, A1 + .align 4 + +LL(107): + andi. r0, N, 1 + beq LL(120) + + LFDUX A1, X, INCX2 + LFDUX A2, X, INCX + + fabs A1, A1 + fabs A2, A2 + + fadd A1, A1, A2 + + fsub F1, C1, A1 + fsel C1, F1, C1, A1 + .align 4 + +LL(120): + fpsub F1, C1, C2 + fpsub F2, C3, C4 + + fpsel C1, F1, C1, C2 + fpsel C3, F2, C3, C4 + + fpsub F1, C1, C3 + fpsel C1, F1, C1, C3 + + fsmtp C2, C1 + + li RET, 0 + fsub F1, C1, C2 + fsel C1, F1, C1, C2 + + fsmfp C1, C1 + + sub XX, XX, INCX2 + + srawi. r0, NN, 3 + mtspr CTR, r0 + beq- LL(125) + + LFDUX A1, XX, INCX2 + LFDUX A2, XX, INCX + LFDUX A3, XX, INCX2 + LFDUX A4, XX, INCX + + LFSDUX A1, XX, INCX2 + LFSDUX A2, XX, INCX + LFSDUX A3, XX, INCX2 + LFSDUX A4, XX, INCX + + LFDUX A5, XX, INCX2 + LFDUX A6, XX, INCX + LFDUX A7, XX, INCX2 + LFDUX A8, XX, INCX + + LFSDUX A5, XX, INCX2 + LFSDUX A6, XX, INCX + LFSDUX A7, XX, INCX2 + LFSDUX A8, XX, INCX + + fpabs T1, A1 + fpabs T2, A2 + fpabs T3, A3 + fpabs T4, A4 + + fpadd B1, T1, T2 + fpadd B2, T3, T4 + + bdz LL(123) + .align 4 + +LL(122): + LFDUX A1, XX, INCX2 + fpabs T1, A5 + addi RET, RET, 1 + fcmpu cr0, C1, B1 + LFDUX A2, XX, INCX + beq cr0, LL(999) + + LFDUX A3, XX, INCX2 + fpabs T2, A6 + addi RET, RET, 1 + fcmpu cr0, C1, B2 + LFDUX A4, XX, INCX + beq cr0, LL(999) + + LFSDUX A1, XX, INCX2 + fpabs T3, A7 + addi RET, RET, 1 + fscmp cr0, C1, B1 + LFSDUX A2, XX, INCX + beq cr0, LL(999) + + LFSDUX A3, XX, INCX2 + fpabs T4, A8 + addi RET, RET, 1 + fscmp cr0, C1, B2 + LFSDUX A4, XX, INCX + beq cr0, LL(999) + + fpadd B3, T1, T2 + fpadd B4, T3, T4 + + LFDUX A5, XX, INCX2 + fpabs T1, A1 + addi RET, RET, 1 + fcmpu cr0, C1, B3 + LFDUX A6, XX, INCX + beq cr0, LL(999) + + LFDUX A7, XX, INCX2 + fpabs T2, A2 + addi RET, RET, 1 + fcmpu cr0, C1, B4 + LFDUX A8, XX, INCX + beq cr0, LL(999) + + LFSDUX A5, XX, INCX2 + fpabs T3, A3 + addi RET, RET, 1 + fscmp cr0, C1, B3 + LFSDUX A6, XX, INCX + beq cr0, LL(999) + + LFSDUX A7, XX, INCX2 + fpabs T4, A4 + addi RET, RET, 1 + fscmp cr0, C1, B4 + LFSDUX A8, XX, INCX + beq cr0, LL(999) + + fpadd B1, T1, T2 + fpadd B2, T3, T4 + bdnz LL(122) + .align 4 + +LL(123): + fpabs T1, A5 + addi RET, RET, 1 + fcmpu cr0, C1, B1 + beq cr0, LL(999) + + fpabs T2, A6 + addi RET, RET, 1 + fcmpu cr0, C1, B2 + beq cr0, LL(999) + + fpabs T3, A7 + addi RET, RET, 1 + fscmp cr0, C1, B1 + beq cr0, LL(999) + + fpabs T4, A8 + addi RET, RET, 1 + fscmp cr0, C1, B2 + beq cr0, LL(999) + + fpadd B3, T1, T2 + fpadd B4, T3, T4 + + addi RET, RET, 1 + fcmpu cr0, C1, B3 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, B4 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, B3 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, B4 + beq cr0, LL(999) + .align 4 + +LL(125): + andi. 
r0, NN, 4 + beq LL(126) + + LFDUX A1, XX, INCX2 + LFDUX A2, XX, INCX + LFDUX A3, XX, INCX2 + LFDUX A4, XX, INCX + + LFSDUX A1, XX, INCX2 + LFSDUX A2, XX, INCX + LFSDUX A3, XX, INCX2 + LFSDUX A4, XX, INCX + + fpabs A1, A1 + fpabs A2, A2 + fpabs A3, A3 + fpabs A4, A4 + + fpadd A1, A1, A2 + fpadd A3, A3, A4 + + addi RET, RET, 1 + fcmpu cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A3 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A3 + beq cr0, LL(999) + .align 4 + +LL(126): + andi. r0, NN, 2 + beq LL(127) + + LFDUX A1, XX, INCX2 + LFDUX A2, XX, INCX + LFDUX A3, XX, INCX2 + LFDUX A4, XX, INCX + + fabs A1, A1 + fabs A2, A2 + fabs A3, A3 + fabs A4, A4 + + fadd A1, A1, A2 + fadd A3, A3, A4 + + addi RET, RET, 1 + fcmpu cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A3 + beq cr0, LL(999) + .align 4 + +LL(127): + addi RET, RET, 1 + .align 4 + +LL(999): + li r10, 16 + addi SP, SP, -16 + mr r3, RET + + lfpdux f27, SP, r10 + lfpdux f26, SP, r10 + lfpdux f25, SP, r10 + lfpdux f24, SP, r10 + + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + lfpdux f20, SP, r10 + + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + blr + + EPILOGUE diff --git a/kernel/power/izamax_ppc440.S b/kernel/power/izamax_ppc440.S new file mode 100644 index 0000000000..f80c9ad172 --- /dev/null +++ b/kernel/power/izamax_ppc440.S @@ -0,0 +1,538 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define RET r3 +#define X r4 +#define INCX r5 + +#define N r6 +#define NN r7 +#define XX r8 +#define PRE r9 +#define INC1 r10 + +#define FZERO f1 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(r3) + LDINT INCX, 0(INCX) +#else + mr N, r3 +#endif + + li RET, 0 + + slwi INCX, INCX, ZBASE_SHIFT + sub X, X, INCX + li INC1, SIZE + li PRE, 3 * 16 * SIZE + + mr NN, N + mr XX, X + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFDUX f1, X, INCX + LFDX f2, X, INC1 + + fabs f1, f1 + fabs f2, f2 + fadd f1, f1, f2 + + subi N, N, 1 + fmr f0, f1 + srawi. r0, N, 3 + fmr f2, f1 + mtspr CTR, r0 + fmr f3, f1 + beq- LL(150) + + LFDUX f24, X, INCX + LFDX f25, X, INC1 + LFDUX f26, X, INCX + LFDX f27, X, INC1 + LFDUX f28, X, INCX + LFDX f29, X, INC1 + LFDUX f30, X, INCX + LFDX f31, X, INC1 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDUX f24, X, INCX + LFDX f25, X, INC1 + LFDUX f26, X, INCX + LFDX f27, X, INC1 + LFDUX f28, X, INCX + LFDX f29, X, INC1 + LFDUX f30, X, INCX + LFDX f31, X, INC1 + + bdz LL(120) + .align 4 + +LL(110): + fadd f4, f8, f9 +#ifdef PPCG4 + dcbt X, PRE +#endif + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + LFDUX f24, X, INCX + fabs f9, f25 + LFDX f25, X, INC1 + fabs f10, f26 + LFDUX f26, X, INCX + fabs f11, f27 + LFDX f27, X, INC1 + + fabs f12, f28 +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PRE +#endif + fabs f13, f29 + LFDUX f28, X, INCX + fabs f14, f30 + LFDX f29, X, INC1 + fabs f15, f31 + LFDUX f30, X, INCX + + fsub f16, f0, f4 + LFDX f31, X, INC1 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 +#ifdef PPCG4 + dcbt X, PRE +#endif + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fabs f8, f24 + LFDUX f24, X, INCX + fabs f9, f25 + LFDX f25, X, INC1 + fabs f10, f26 + LFDUX f26, X, INCX + fabs f11, f27 + LFDX f27, X, INC1 + + fsel f0, f16, f0, f4 +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PRE +#endif + fsel f1, f17, f1, f5 + fsel f2, f18, f2, f6 + fsel f3, f19, f3, f7 + + fabs f12, f28 + LFDUX f28, X, INCX + fabs f13, f29 + LFDX f29, X, INC1 + fabs f14, f30 + LFDUX f30, X, INCX + fabs f15, f31 + LFDX f31, X, INC1 + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f0, f20 + fsel f1, f17, f1, f21 + fsel f2, f18, f2, f22 + fsel f3, f19, f3, f23 + bdnz LL(110) + .align 4 + +LL(120): + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fsel f0, f16, f0, f4 + fsel f1, f17, f1, f5 + fsel f2, f18, f2, f6 + fsel f3, f19, f3, f7 + + fsub f16, f0, f20 + 
fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f0, f20 + fsel f1, f17, f1, f21 + fsel f2, f18, f2, f22 + fsel f3, f19, f3, f23 + .align 4 + +LL(150): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + LFDX f9, X, INC1 + + fabs f8, f8 + fabs f9, f9 + fadd f8, f8, f9 + fsub f16, f1, f8 + fsel f1, f16, f1, f8 + bdnz LL(160) + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + + fsel f0, f8, f0, f1 + fsel f2, f9, f2, f3 + fsub f8, f0, f2 + fsel f1, f8, f0, f2 + .align 4 + +LL(1000): + srawi. r0, NN, 3 + mtspr CTR, r0 + beq- LL(1150) + + LFDUX f24, XX, INCX + LFDX f25, XX, INC1 + LFDUX f26, XX, INCX + LFDX f27, XX, INC1 + LFDUX f28, XX, INCX + LFDX f29, XX, INC1 + LFDUX f30, XX, INCX + LFDX f31, XX, INC1 + bdz LL(1120) + .align 4 + +LL(1110): + fabs f8, f24 + LFDUX f24, XX, INCX + fabs f9, f25 + LFDX f25, XX, INC1 + fabs f10, f26 + LFDUX f26, XX, INCX + fabs f11, f27 + LFDX f27, XX, INC1 + +#ifdef PPCG4 + dcbt XX, PRE +#endif + + fabs f12, f28 + LFDUX f28, XX, INCX + fabs f13, f29 + LFDX f29, XX, INC1 + fabs f14, f30 + LFDUX f30, XX, INCX + fabs f15, f31 + LFDX f31, XX, INC1 + + fadd f4, f8, f9 +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PRE +#endif + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + addi RET, RET, 1 + fcmpu cr0, f1, f4 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f5 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f6 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f7 + beq cr0, LL(9999) + + + fabs f8, f24 + LFDUX f24, XX, INCX + fabs f9, f25 + LFDX f25, XX, INC1 + fabs f10, f26 + LFDUX f26, XX, INCX + fabs f11, f27 + LFDX f27, XX, INC1 + +#ifdef PPCG4 + dcbt XX, PRE +#endif + + fabs f12, f28 + LFDUX f28, XX, INCX + fabs f13, f29 + LFDX f29, XX, INC1 + fabs f14, f30 + LFDUX f30, XX, INCX + fabs f15, f31 + LFDX f31, XX, INC1 + + fadd f4, f8, f9 +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PRE +#endif + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + addi RET, RET, 1 + fcmpu cr0, f1, f4 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f5 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f6 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f7 + beq cr0, LL(9999) + + bdnz LL(1110) + .align 4 + +LL(1120): + fabs f8, f24 + LFDUX f24, XX, INCX + fabs f9, f25 + LFDX f25, XX, INC1 + fabs f10, f26 + LFDUX f26, XX, INCX + fabs f11, f27 + LFDX f27, XX, INC1 + + fabs f12, f28 + LFDUX f28, XX, INCX + fabs f13, f29 + LFDX f29, XX, INC1 + fabs f14, f30 + LFDUX f30, XX, INCX + fabs f15, f31 + LFDX f31, XX, INC1 + + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + addi RET, RET, 1 + fcmpu cr0, f1, f4 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f5 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f6 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f7 + beq cr0, LL(9999) + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + addi RET, RET, 1 + fcmpu cr0, f1, f4 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f5 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f6 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f7 + beq cr0, LL(9999) + .align 4 + +LL(1150): + andi. 
r0, NN, 7 + mtspr CTR, r0 + beq LL(9999) + .align 4 + +LL(1160): + LFDUX f8, XX, INCX + LFDX f9, XX, INC1 + + fabs f8, f8 + fabs f9, f9 + fadd f8, f8, f9 + + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + bdnz LL(1160) + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/izamin.S b/kernel/power/izamin.S new file mode 100644 index 0000000000..17275fcecb --- /dev/null +++ b/kernel/power/izamin.S @@ -0,0 +1,920 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define RET r3 +#define X r4 +#define INCX r5 + +#define N r6 +#define NN r7 +#define XX r8 +#define PREA r9 +#define INCXM1 r10 + +#define FZERO f1 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(r3) + LDINT INCX, 0(INCX) +#else + mr N, r3 +#endif + + li RET, 0 + mr NN, N + mr XX, X + + slwi INCX, INCX, ZBASE_SHIFT + subi INCXM1, INCX, SIZE + + li PREA, L1_PREFETCHSIZE + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFD f1, 0 * SIZE(X) + LFD f2, 1 * SIZE(X) + add X, X, INCX + + fabs f1, f1 + fabs f2, f2 + fadd f1, f1, f2 + + fmr f0, f1 + fmr f2, f1 + fmr f3, f1 + + subi N, N, 1 + + cmpwi cr0, INCX, 2 * SIZE + bne- cr0, LL(100) + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- cr0, LL(50) + .align 4 + + LFD f24, 0 * SIZE(X) + LFD f25, 1 * SIZE(X) + LFD f26, 2 * SIZE(X) + LFD f27, 3 * SIZE(X) + LFD f28, 4 * SIZE(X) + LFD f29, 5 * SIZE(X) + LFD f30, 6 * SIZE(X) + LFD f31, 7 * SIZE(X) + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFD f24, 8 * SIZE(X) + LFD f25, 9 * SIZE(X) + LFD f26, 10 * SIZE(X) + LFD f27, 11 * SIZE(X) + + LFD f28, 12 * SIZE(X) + LFD f29, 13 * SIZE(X) + LFD f30, 14 * SIZE(X) + LFD f31, 15 * SIZE(X) + bdz LL(20) + .align 4 + +LL(10): + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFD f24, 16 * SIZE(X) + LFD f25, 17 * SIZE(X) + LFD f26, 18 * SIZE(X) + LFD f27, 19 * SIZE(X) + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFD f28, 20 * SIZE(X) + LFD f29, 21 * SIZE(X) + LFD f30, 22 * SIZE(X) + LFD f31, 23 * SIZE(X) + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFD f24, 24 * SIZE(X) + LFD f25, 25 * SIZE(X) + LFD f26, 26 * SIZE(X) + LFD f27, 27 * SIZE(X) + + fsel f0, f16, f4, f0 + fsel f1, f17, f5, f1 + fsel f2, f18, f6, f2 + fsel f3, f19, f7, f3 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFD f28, 28 * SIZE(X) + LFD f29, 29 * SIZE(X) + LFD f30, 30 * SIZE(X) + LFD f31, 31 * SIZE(X) + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f20, f0 + fsel f1, f17, f21, f1 + fsel f2, f18, f22, f2 + fsel f3, f19, f23, f3 + +#ifndef POWER6 + L1_PREFETCH X, PREA +#endif + addi X, X, 16 * SIZE +#ifdef POWER6 + L1_PREFETCH X, PREA +#endif + + bdnz LL(10) + .align 4 + +LL(20): + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 
+ + fsel f0, f16, f4, f0 + fsel f1, f17, f5, f1 + fsel f2, f18, f6, f2 + fsel f3, f19, f7, f3 + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f20, f0 + fsel f1, f17, f21, f1 + fsel f2, f18, f22, f2 + fsel f3, f19, f23, f3 + + addi X, X, 16 * SIZE + .align 4 + +LL(50): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + addi X, X, 2 * SIZE + + fabs f8, f8 + fabs f9, f9 + fadd f8, f8, f9 + fsub f16, f1, f8 + fsel f1, f16, f8, f1 + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): + sub X, X, INCXM1 + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(150) + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + bdz LL(120) + .align 4 + +LL(110): + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + + fsel f0, f16, f4, f0 + fsel f1, f17, f5, f1 + fsel f2, f18, f6, f2 + fsel f3, f19, f7, f3 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f20, f0 + fsel f1, f17, f21, f1 + fsel f2, f18, f22, f2 + fsel f3, f19, f23, f3 + bdnz LL(110) + .align 4 + +LL(120): + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fsel f0, f16, f4, f0 + fsel f1, f17, f5, f1 + fsel f2, f18, f6, f2 + fsel f3, f19, f7, f3 + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f20, f0 + fsel f1, f17, f21, f1 + fsel f2, f18, f22, f2 + fsel f3, f19, f23, f3 + .align 4 + +LL(150): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDX f8, X, INCXM1 + LFDUX f9, X, INCX + + fabs f8, f8 + fabs f9, f9 + fadd f8, f8, f9 + fsub f16, f1, f8 + fsel f1, f16, f8, f1 + bdnz LL(160) + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + + fsel f0, f8, f1, f0 + fsel f2, f9, f3, f2 + fsub f8, f0, f2 + fsel f1, f8, f2, f0 + .align 4 + + +LL(1000): + cmpwi cr0, INCX, SIZE * 2 + bne- cr0, LL(1100) + + srawi. 
r0, NN, 3 + mtspr CTR, r0 + beq- cr0, LL(1050) + + LFD f24, 0 * SIZE(XX) + LFD f25, 1 * SIZE(XX) + LFD f26, 2 * SIZE(XX) + LFD f27, 3 * SIZE(XX) + LFD f28, 4 * SIZE(XX) + LFD f29, 5 * SIZE(XX) + LFD f30, 6 * SIZE(XX) + LFD f31, 7 * SIZE(XX) + bdz LL(1020) + .align 4 + +LL(1010): + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFD f24, 8 * SIZE(XX) + LFD f25, 9 * SIZE(XX) + LFD f26, 10 * SIZE(XX) + LFD f27, 11 * SIZE(XX) + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFD f28, 12 * SIZE(XX) + LFD f29, 13 * SIZE(XX) + LFD f30, 14 * SIZE(XX) + LFD f31, 15 * SIZE(XX) + + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + addi RET, RET, 1 + fcmpu cr0, f1, f4 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f5 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f6 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f7 + beq cr0, LL(9999) + + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFD f24, 16 * SIZE(XX) + LFD f25, 17 * SIZE(XX) + LFD f26, 18 * SIZE(XX) + LFD f27, 19 * SIZE(XX) + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFD f28, 20 * SIZE(XX) + LFD f29, 21 * SIZE(XX) + LFD f30, 22 * SIZE(XX) + LFD f31, 23 * SIZE(XX) + + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + addi RET, RET, 1 + fcmpu cr0, f1, f4 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f5 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f6 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f7 + beq cr0, LL(9999) + + addi XX, XX, 16 * SIZE + bdnz LL(1010) + .align 4 + +LL(1020): + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFD f24, 8 * SIZE(XX) + LFD f25, 9 * SIZE(XX) + LFD f26, 10 * SIZE(XX) + LFD f27, 11 * SIZE(XX) + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFD f28, 12 * SIZE(XX) + LFD f29, 13 * SIZE(XX) + LFD f30, 14 * SIZE(XX) + LFD f31, 15 * SIZE(XX) + + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + addi RET, RET, 1 + fcmpu cr0, f1, f4 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f5 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f6 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f7 + beq cr0, LL(9999) + + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + addi RET, RET, 1 + fcmpu cr0, f1, f4 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f5 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f6 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f7 + beq cr0, LL(9999) + + addi XX, XX, 16 * SIZE + .align 4 + +LL(1050): + andi. r0, NN, 7 + mtspr CTR, r0 + beq LL(9999) + .align 4 + +LL(1060): + LFD f8, 0 * SIZE(XX) + LFD f9, 1 * SIZE(XX) + addi XX, XX, 2 * SIZE + + fabs f8, f8 + fabs f9, f9 + fadd f8, f8, f9 + + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + bdnz LL(1060) + b LL(9999) + .align 4 + +LL(1100): + sub XX, XX, INCXM1 + + srawi. 
r0, NN, 3 + mtspr CTR, r0 + beq- LL(1150) + + LFDX f24, XX, INCXM1 + LFDUX f25, XX, INCX + LFDX f26, XX, INCXM1 + LFDUX f27, XX, INCX + LFDX f28, XX, INCXM1 + LFDUX f29, XX, INCX + LFDX f30, XX, INCXM1 + LFDUX f31, XX, INCX + bdz LL(1120) + .align 4 + +LL(1110): + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFDX f24, XX, INCXM1 + LFDUX f25, XX, INCX + LFDX f26, XX, INCXM1 + LFDUX f27, XX, INCX + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDX f28, XX, INCXM1 + LFDUX f29, XX, INCX + LFDX f30, XX, INCXM1 + LFDUX f31, XX, INCX + + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + addi RET, RET, 1 + fcmpu cr0, f1, f4 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f5 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f6 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f7 + beq cr0, LL(9999) + + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFDX f24, XX, INCXM1 + LFDUX f25, XX, INCX + LFDX f26, XX, INCXM1 + LFDUX f27, XX, INCX + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDX f28, XX, INCXM1 + LFDUX f29, XX, INCX + LFDX f30, XX, INCXM1 + LFDUX f31, XX, INCX + + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + addi RET, RET, 1 + fcmpu cr0, f1, f4 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f5 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f6 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f7 + beq cr0, LL(9999) + + bdnz LL(1110) + .align 4 + +LL(1120): + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFDX f24, XX, INCXM1 + LFDUX f25, XX, INCX + LFDX f26, XX, INCXM1 + LFDUX f27, XX, INCX + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDX f28, XX, INCXM1 + LFDUX f29, XX, INCX + LFDX f30, XX, INCXM1 + LFDUX f31, XX, INCX + + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + addi RET, RET, 1 + fcmpu cr0, f1, f4 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f5 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f6 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f7 + beq cr0, LL(9999) + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + addi RET, RET, 1 + fcmpu cr0, f1, f4 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f5 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f6 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f7 + beq cr0, LL(9999) + .align 4 + +LL(1150): + andi. 
r0, NN, 7 + mtspr CTR, r0 + beq LL(9999) + .align 4 + +LL(1160): + LFDX f8, XX, INCXM1 + LFDUX f9, XX, INCX + + fabs f8, f8 + fabs f9, f9 + fadd f8, f8, f9 + + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + bdnz LL(1160) + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/izamin_hummer.S b/kernel/power/izamin_hummer.S new file mode 100644 index 0000000000..75145abf5c --- /dev/null +++ b/kernel/power/izamin_hummer.S @@ -0,0 +1,566 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define INCX2 r6 +#define X2 r7 + +#define XX r8 +#define RET r9 +#define NN r10 + +#define C1 f1 +#define C2 f0 +#define C3 f2 +#define C4 f3 + +#define A1 f4 +#define A2 f5 +#define A3 f6 +#define A4 f7 +#define A5 f8 +#define A6 f9 +#define A7 f10 +#define A8 f11 + +#define F1 f12 +#define F2 f13 +#define F3 f14 +#define F4 f15 + +#define T1 f16 +#define T2 f17 +#define T3 f18 +#define T4 f19 + +#define B1 f20 +#define B2 f21 +#define B3 f22 +#define B4 f23 +#define B5 f24 +#define B6 f25 +#define B7 f26 +#define B8 f27 + + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + + stfpdux f16, SP, r10 + stfpdux f17, SP, r10 + stfpdux f18, SP, r10 + stfpdux f19, SP, r10 + + stfpdux f20, SP, r10 + stfpdux f21, SP, r10 + stfpdux f22, SP, r10 + stfpdux f23, SP, r10 + + stfpdux f24, SP, r10 + stfpdux f25, SP, r10 + stfpdux f26, SP, r10 + stfpdux f27, SP, r10 + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, BASE_SHIFT + add INCX2, INCX, INCX + + li RET, 0 + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, INCX, 0 + mr NN, N + ble LL(999) + + mr XX, X + + LFD A1, 0 * SIZE(X) + LFD A2, 1 * SIZE(X) + add X, X, INCX2 + li RET, 1 + + fabs A1, A1 + fabs A2, A2 + + subi INCX2, INCX2, SIZE + + addi N, N, -1 + cmpwi cr0, N, 0 + fadd C1, A1, A2 + ble LL(999) + + fsmfp C1, C1 + li INCX, SIZE + fpmr C2, C1 + sub X, X, INCX2 + fpmr C3, C1 + srawi. r0, N, 3 + fpmr C4, C1 + mtspr CTR, r0 + beq- LL(105) + + LFDUX A1, X, INCX2 + LFDUX A2, X, INCX + LFDUX A3, X, INCX2 + LFDUX A4, X, INCX + + LFSDUX A1, X, INCX2 + LFSDUX A2, X, INCX + LFSDUX A3, X, INCX2 + LFSDUX A4, X, INCX + + LFDUX A5, X, INCX2 + LFDUX A6, X, INCX + LFDUX A7, X, INCX2 + LFDUX A8, X, INCX + + LFSDUX A5, X, INCX2 + LFSDUX A6, X, INCX + LFSDUX A7, X, INCX2 + LFSDUX A8, X, INCX + bdz LL(103) + .align 4 + +LL(102): + fpabs B1, A1 + LFDUX A1, X, INCX2 + fpabs B2, A2 + LFDUX A2, X, INCX + fpabs B3, A3 + LFDUX A3, X, INCX2 + fpabs B4, A4 + LFDUX A4, X, INCX + + fpabs B5, A5 + LFSDUX A1, X, INCX2 + fpabs B6, A6 + LFSDUX A2, X, INCX + fpabs B7, A7 + LFSDUX A3, X, INCX2 + fpabs B8, A8 + LFSDUX A4, X, INCX + + fpadd T1, B1, B2 + LFDUX A5, X, INCX2 + fpadd T2, B3, B4 + LFDUX A6, X, INCX + fpadd T3, B5, B6 + LFDUX A7, X, INCX2 + fpadd T4, B7, B8 + LFDUX A8, X, INCX + + fpsub F1, T1, C1 + LFSDUX A5, X, INCX2 + fpsub F2, T2, C2 + LFSDUX A6, X, INCX + fpsub F3, T3, C3 + LFSDUX A7, X, INCX2 + fpsub F4, T4, C4 + LFSDUX A8, X, INCX + + fpsel C1, F1, C1, T1 + fpsel C2, F2, C2, T2 + fpsel C3, F3, C3, T3 + fpsel C4, F4, C4, T4 + bdnz LL(102) + .align 4 + +LL(103): + fpabs B1, A1 + fpabs B2, A2 + fpabs B3, A3 + fpabs B4, A4 + + fpabs B5, A5 + fpabs B6, A6 + fpabs B7, A7 + fpabs B8, A8 + + fpadd T1, B1, B2 + fpadd T2, B3, B4 + fpadd T3, B5, B6 + fpadd T4, B7, B8 + + fpsub F1, T1, C1 + fpsub F2, T2, C2 + fpsub F3, T3, C3 + fpsub F4, T4, C4 + + fpsel C1, F1, C1, T1 + fpsel C2, F2, C2, T2 + fpsel C3, F3, C3, T3 + fpsel C4, F4, C4, T4 + .align 4 + +LL(105): + andi. r0, N, 7 + beq LL(120) + + andi. 
r0, N, 4 + beq LL(106) + + LFDUX A1, X, INCX2 + LFDUX A2, X, INCX + LFDUX A3, X, INCX2 + LFDUX A4, X, INCX + + LFSDUX A1, X, INCX2 + LFSDUX A2, X, INCX + LFSDUX A3, X, INCX2 + LFSDUX A4, X, INCX + + fpabs A1, A1 + fpabs A2, A2 + fpabs A3, A3 + fpabs A4, A4 + + fpadd A1, A1, A2 + fpadd A3, A3, A4 + + fpsub F1, A1, C1 + fpsub F2, A3, C2 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A3 + .align 4 + +LL(106): + andi. r0, N, 2 + beq LL(107) + + LFDUX A1, X, INCX2 + LFDUX A2, X, INCX + LFSDUX A1, X, INCX2 + LFSDUX A2, X, INCX + + fpabs A1, A1 + fpabs A2, A2 + + fpadd A1, A1, A2 + + fpsub F1, A1, C1 + fpsel C1, F1, C1, A1 + .align 4 + +LL(107): + andi. r0, N, 1 + beq LL(120) + + LFDUX A1, X, INCX2 + LFDUX A2, X, INCX + + fabs A1, A1 + fabs A2, A2 + + fadd A1, A1, A2 + + fsub F1, A1, C1 + fsel C1, F1, C1, A1 + .align 4 + +LL(120): + fpsub F1, C2, C1 + fpsub F2, C4, C3 + + fpsel C1, F1, C1, C2 + fpsel C3, F2, C3, C4 + + fpsub F1, C3, C1 + fpsel C1, F1, C1, C3 + + fsmtp C2, C1 + + li RET, 0 + fsub F1, C2, C1 + fsel C1, F1, C1, C2 + + fsmfp C1, C1 + + sub XX, XX, INCX2 + + srawi. r0, NN, 3 + mtspr CTR, r0 + beq- LL(125) + + LFDUX A1, XX, INCX2 + LFDUX A2, XX, INCX + LFDUX A3, XX, INCX2 + LFDUX A4, XX, INCX + + LFSDUX A1, XX, INCX2 + LFSDUX A2, XX, INCX + LFSDUX A3, XX, INCX2 + LFSDUX A4, XX, INCX + + LFDUX A5, XX, INCX2 + LFDUX A6, XX, INCX + LFDUX A7, XX, INCX2 + LFDUX A8, XX, INCX + + LFSDUX A5, XX, INCX2 + LFSDUX A6, XX, INCX + LFSDUX A7, XX, INCX2 + LFSDUX A8, XX, INCX + + fpabs T1, A1 + fpabs T2, A2 + fpabs T3, A3 + fpabs T4, A4 + + fpadd B1, T1, T2 + fpadd B2, T3, T4 + + bdz LL(123) + .align 4 + +LL(122): + LFDUX A1, XX, INCX2 + fpabs T1, A5 + addi RET, RET, 1 + fcmpu cr0, C1, B1 + LFDUX A2, XX, INCX + beq cr0, LL(999) + + LFDUX A3, XX, INCX2 + fpabs T2, A6 + addi RET, RET, 1 + fcmpu cr0, C1, B2 + LFDUX A4, XX, INCX + beq cr0, LL(999) + + LFSDUX A1, XX, INCX2 + fpabs T3, A7 + addi RET, RET, 1 + fscmp cr0, C1, B1 + LFSDUX A2, XX, INCX + beq cr0, LL(999) + + LFSDUX A3, XX, INCX2 + fpabs T4, A8 + addi RET, RET, 1 + fscmp cr0, C1, B2 + LFSDUX A4, XX, INCX + beq cr0, LL(999) + + fpadd B3, T1, T2 + fpadd B4, T3, T4 + + LFDUX A5, XX, INCX2 + fpabs T1, A1 + addi RET, RET, 1 + fcmpu cr0, C1, B3 + LFDUX A6, XX, INCX + beq cr0, LL(999) + + LFDUX A7, XX, INCX2 + fpabs T2, A2 + addi RET, RET, 1 + fcmpu cr0, C1, B4 + LFDUX A8, XX, INCX + beq cr0, LL(999) + + LFSDUX A5, XX, INCX2 + fpabs T3, A3 + addi RET, RET, 1 + fscmp cr0, C1, B3 + LFSDUX A6, XX, INCX + beq cr0, LL(999) + + LFSDUX A7, XX, INCX2 + fpabs T4, A4 + addi RET, RET, 1 + fscmp cr0, C1, B4 + LFSDUX A8, XX, INCX + beq cr0, LL(999) + + fpadd B1, T1, T2 + fpadd B2, T3, T4 + bdnz LL(122) + .align 4 + +LL(123): + fpabs T1, A5 + addi RET, RET, 1 + fcmpu cr0, C1, B1 + beq cr0, LL(999) + + fpabs T2, A6 + addi RET, RET, 1 + fcmpu cr0, C1, B2 + beq cr0, LL(999) + + fpabs T3, A7 + addi RET, RET, 1 + fscmp cr0, C1, B1 + beq cr0, LL(999) + + fpabs T4, A8 + addi RET, RET, 1 + fscmp cr0, C1, B2 + beq cr0, LL(999) + + fpadd B3, T1, T2 + fpadd B4, T3, T4 + + addi RET, RET, 1 + fcmpu cr0, C1, B3 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, B4 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, B3 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, B4 + beq cr0, LL(999) + .align 4 + +LL(125): + andi. 
r0, NN, 4 + beq LL(126) + + LFDUX A1, XX, INCX2 + LFDUX A2, XX, INCX + LFDUX A3, XX, INCX2 + LFDUX A4, XX, INCX + + LFSDUX A1, XX, INCX2 + LFSDUX A2, XX, INCX + LFSDUX A3, XX, INCX2 + LFSDUX A4, XX, INCX + + fpabs A1, A1 + fpabs A2, A2 + fpabs A3, A3 + fpabs A4, A4 + + fpadd A1, A1, A2 + fpadd A3, A3, A4 + + addi RET, RET, 1 + fcmpu cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A3 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fscmp cr0, C1, A3 + beq cr0, LL(999) + .align 4 + +LL(126): + andi. r0, NN, 2 + beq LL(127) + + LFDUX A1, XX, INCX2 + LFDUX A2, XX, INCX + LFDUX A3, XX, INCX2 + LFDUX A4, XX, INCX + + fabs A1, A1 + fabs A2, A2 + fabs A3, A3 + fabs A4, A4 + + fadd A1, A1, A2 + fadd A3, A3, A4 + + addi RET, RET, 1 + fcmpu cr0, C1, A1 + beq cr0, LL(999) + + addi RET, RET, 1 + fcmpu cr0, C1, A3 + beq cr0, LL(999) + .align 4 + +LL(127): + addi RET, RET, 1 + .align 4 + +LL(999): + li r10, 16 + addi SP, SP, -16 + mr r3, RET + + lfpdux f27, SP, r10 + lfpdux f26, SP, r10 + lfpdux f25, SP, r10 + lfpdux f24, SP, r10 + + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + lfpdux f20, SP, r10 + + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + blr + + EPILOGUE diff --git a/kernel/power/izamin_ppc440.S b/kernel/power/izamin_ppc440.S new file mode 100644 index 0000000000..2cdb8bf38a --- /dev/null +++ b/kernel/power/izamin_ppc440.S @@ -0,0 +1,538 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define RET r3 +#define X r4 +#define INCX r5 + +#define N r6 +#define NN r7 +#define XX r8 +#define PRE r9 +#define INC1 r10 + +#define FZERO f1 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(r3) + LDINT INCX, 0(INCX) +#else + mr N, r3 +#endif + + li RET, 0 + + slwi INCX, INCX, ZBASE_SHIFT + sub X, X, INCX + li INC1, SIZE + li PRE, 3 * 16 * SIZE + + mr NN, N + mr XX, X + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFDUX f1, X, INCX + LFDX f2, X, INC1 + + fabs f1, f1 + fabs f2, f2 + fadd f1, f1, f2 + + subi N, N, 1 + fmr f0, f1 + srawi. r0, N, 3 + fmr f2, f1 + mtspr CTR, r0 + fmr f3, f1 + beq- LL(150) + + LFDUX f24, X, INCX + LFDX f25, X, INC1 + LFDUX f26, X, INCX + LFDX f27, X, INC1 + LFDUX f28, X, INCX + LFDX f29, X, INC1 + LFDUX f30, X, INCX + LFDX f31, X, INC1 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDUX f24, X, INCX + LFDX f25, X, INC1 + LFDUX f26, X, INCX + LFDX f27, X, INC1 + LFDUX f28, X, INCX + LFDX f29, X, INC1 + LFDUX f30, X, INCX + LFDX f31, X, INC1 + + bdz LL(120) + .align 4 + +LL(110): + fadd f4, f8, f9 +#ifdef PPCG4 + dcbt X, PRE +#endif + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + LFDUX f24, X, INCX + fabs f9, f25 + LFDX f25, X, INC1 + fabs f10, f26 + LFDUX f26, X, INCX + fabs f11, f27 + LFDX f27, X, INC1 + + fabs f12, f28 +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PRE +#endif + fabs f13, f29 + LFDUX f28, X, INCX + fabs f14, f30 + LFDX f29, X, INC1 + fabs f15, f31 + LFDUX f30, X, INCX + + fsub f16, f0, f4 + LFDX f31, X, INC1 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 +#ifdef PPCG4 + dcbt X, PRE +#endif + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fabs f8, f24 + LFDUX f24, X, INCX + fabs f9, f25 + LFDX f25, X, INC1 + fabs f10, f26 + LFDUX f26, X, INCX + fabs f11, f27 + LFDX f27, X, INC1 + + fsel f0, f16, f4, f0 +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PRE +#endif + fsel f1, f17, f5, f1 + fsel f2, f18, f6, f2 + fsel f3, f19, f7, f3 + + fabs f12, f28 + LFDUX f28, X, INCX + fabs f13, f29 + LFDX f29, X, INC1 + fabs f14, f30 + LFDUX f30, X, INCX + fabs f15, f31 + LFDX f31, X, INC1 + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f20, f0 + fsel f1, f17, f21, f1 + fsel f2, f18, f22, f2 + fsel f3, f19, f23, f3 + bdnz LL(110) + .align 4 + +LL(120): + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fsel f0, f16, f4, f0 + fsel f1, f17, f5, f1 + fsel f2, f18, f6, f2 + fsel f3, f19, f7, f3 + + fsub f16, f0, f20 + 
fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f20, f0 + fsel f1, f17, f21, f1 + fsel f2, f18, f22, f2 + fsel f3, f19, f23, f3 + .align 4 + +LL(150): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + LFDX f9, X, INC1 + + fabs f8, f8 + fabs f9, f9 + fadd f8, f8, f9 + fsub f16, f1, f8 + fsel f1, f16, f8, f1 + bdnz LL(160) + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + + fsel f0, f8, f1, f0 + fsel f2, f9, f3, f2 + fsub f8, f0, f2 + fsel f1, f8, f2, f0 + .align 4 + +LL(1000): + srawi. r0, NN, 3 + mtspr CTR, r0 + beq- LL(1150) + + LFDUX f24, XX, INCX + LFDX f25, XX, INC1 + LFDUX f26, XX, INCX + LFDX f27, XX, INC1 + LFDUX f28, XX, INCX + LFDX f29, XX, INC1 + LFDUX f30, XX, INCX + LFDX f31, XX, INC1 + bdz LL(1120) + .align 4 + +LL(1110): + fabs f8, f24 + LFDUX f24, XX, INCX + fabs f9, f25 + LFDX f25, XX, INC1 + fabs f10, f26 + LFDUX f26, XX, INCX + fabs f11, f27 + LFDX f27, XX, INC1 + +#ifdef PPCG4 + dcbt XX, PRE +#endif + + fabs f12, f28 + LFDUX f28, XX, INCX + fabs f13, f29 + LFDX f29, XX, INC1 + fabs f14, f30 + LFDUX f30, XX, INCX + fabs f15, f31 + LFDX f31, XX, INC1 + + fadd f4, f8, f9 +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PRE +#endif + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + addi RET, RET, 1 + fcmpu cr0, f1, f4 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f5 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f6 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f7 + beq cr0, LL(9999) + + + fabs f8, f24 + LFDUX f24, XX, INCX + fabs f9, f25 + LFDX f25, XX, INC1 + fabs f10, f26 + LFDUX f26, XX, INCX + fabs f11, f27 + LFDX f27, XX, INC1 + +#ifdef PPCG4 + dcbt XX, PRE +#endif + + fabs f12, f28 + LFDUX f28, XX, INCX + fabs f13, f29 + LFDX f29, XX, INC1 + fabs f14, f30 + LFDUX f30, XX, INCX + fabs f15, f31 + LFDX f31, XX, INC1 + + fadd f4, f8, f9 +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PRE +#endif + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + addi RET, RET, 1 + fcmpu cr0, f1, f4 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f5 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f6 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f7 + beq cr0, LL(9999) + + bdnz LL(1110) + .align 4 + +LL(1120): + fabs f8, f24 + LFDUX f24, XX, INCX + fabs f9, f25 + LFDX f25, XX, INC1 + fabs f10, f26 + LFDUX f26, XX, INCX + fabs f11, f27 + LFDX f27, XX, INC1 + + fabs f12, f28 + LFDUX f28, XX, INCX + fabs f13, f29 + LFDX f29, XX, INC1 + fabs f14, f30 + LFDUX f30, XX, INCX + fabs f15, f31 + LFDX f31, XX, INC1 + + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + addi RET, RET, 1 + fcmpu cr0, f1, f4 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f5 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f6 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f7 + beq cr0, LL(9999) + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + addi RET, RET, 1 + fcmpu cr0, f1, f4 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f5 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f6 + beq cr0, LL(9999) + + addi RET, RET, 1 + fcmpu cr0, f1, f7 + beq cr0, LL(9999) + .align 4 + +LL(1150): + andi. 
r0, NN, 7 + mtspr CTR, r0 + beq LL(9999) + .align 4 + +LL(1160): + LFDUX f8, XX, INCX + LFDX f9, XX, INC1 + + fabs f8, f8 + fabs f9, f9 + fadd f8, f8, f9 + + addi RET, RET, 1 + fcmpu cr0, f1, f8 + beq cr0, LL(9999) + bdnz LL(1160) + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/lock.c b/kernel/power/lock.c new file mode 100644 index 0000000000..51348d63c7 --- /dev/null +++ b/kernel/power/lock.c @@ -0,0 +1,61 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +static void __inline blas_lock(volatile BLASULONG *address){ + +#ifdef __GNUC__ + + BLASLONG int ret, val = 1; + + __asm__ __volatile__ ( + " .machine \"any\" ;" + "0: lwarx %0,0, %1 ;" + " cmpwi 0,%0,0;" + " bne 1f;" + " stwcx. %2,0, %1 ;" + " bne- 0b;" + "1: " + : "=&r"(ret) + : "r"(address), "r" (val) + : "cr0", "memory"); + +#else + while (*address) {}; + *address = 1; +#endif +} diff --git a/kernel/power/lsame.S b/kernel/power/lsame.S new file mode 100644 index 0000000000..51d21b041f --- /dev/null +++ b/kernel/power/lsame.S @@ -0,0 +1,72 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + + PROLOGUE + PROFCODE + + lbz r3, 0(r3) + lbz r4, 0(r4) + + cmplwi cr0, r3, 96 + cmplwi cr6, r4, 96 + addi r0, r3, -32 + addi r11,r4, -32 + + ble- cr0, LL(2) +#ifdef __64BIT__ + rldicl r3, r0, 0, 56 +#else + rlwinm r3, r0, 0, 0xff +#endif +LL(2): + ble- cr6, LL(3) +#ifdef __64BIT__ + rldicl r4, r11, 0, 56 +#else + rlwinm r4, r11, 0, 0xff +#endif +LL(3): + xor r3, r3, r4 + subfic r0, r3, 0 + adde r3, r0, r3 + blr + + EPILOGUE diff --git a/kernel/power/max.S b/kernel/power/max.S new file mode 100644 index 0000000000..5862bc9305 --- /dev/null +++ b/kernel/power/max.S @@ -0,0 +1,445 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define PREA r8 + +#define FZERO f1 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, BASE_SHIFT + + li PREA, L1_PREFETCHSIZE + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFD f1, 0 * SIZE(X) + add X, X, INCX + + fmr f0, f1 + fmr f2, f1 + fmr f3, f1 + fmr f4, f1 + fmr f5, f1 + fmr f6, f1 + fmr f7, f1 + + subi N, N, 1 + + cmpwi cr0, INCX, SIZE + bne- cr0, LL(100) + + srawi. 
r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(50) + + LFD f16, 0 * SIZE(X) + LFD f17, 1 * SIZE(X) + LFD f18, 2 * SIZE(X) + LFD f19, 3 * SIZE(X) + LFD f20, 4 * SIZE(X) + LFD f21, 5 * SIZE(X) + LFD f22, 6 * SIZE(X) + LFD f23, 7 * SIZE(X) + + LFD f24, 8 * SIZE(X) + LFD f25, 9 * SIZE(X) + LFD f26, 10 * SIZE(X) + LFD f27, 11 * SIZE(X) + LFD f28, 12 * SIZE(X) + LFD f29, 13 * SIZE(X) + LFD f30, 14 * SIZE(X) + LFD f31, 15 * SIZE(X) + + fsub f8, f0, f16 + fsub f9, f1, f17 + fsub f10, f2, f18 + fsub f11, f3, f19 + fsub f12, f4, f20 + fsub f13, f5, f21 + fsub f14, f6, f22 + fsub f15, f7, f23 + bdz LL(20) + .align 4 + +LL(10): + fsel f0, f8, f0, f16 + fsub f8, f0, f24 + fsel f1, f9, f1, f17 + fsub f9, f1, f25 + fsel f2, f10, f2, f18 + fsub f10, f2, f26 + fsel f3, f11, f3, f19 + fsub f11, f3, f27 + + LFD f16, 16 * SIZE(X) + LFD f17, 17 * SIZE(X) + LFD f18, 18 * SIZE(X) + LFD f19, 19 * SIZE(X) + + fsel f4, f12, f4, f20 + fsub f12, f4, f28 + fsel f5, f13, f5, f21 + fsub f13, f5, f29 + fsel f6, f14, f6, f22 + fsub f14, f6, f30 + fsel f7, f15, f7, f23 + fsub f15, f7, f31 + + LFD f20, 20 * SIZE(X) + LFD f21, 21 * SIZE(X) + LFD f22, 22 * SIZE(X) + LFD f23, 23 * SIZE(X) + + fsel f0, f8, f0, f24 + fsub f8, f0, f16 + fsel f1, f9, f1, f25 + fsub f9, f1, f17 + fsel f2, f10, f2, f26 + fsub f10, f2, f18 + fsel f3, f11, f3, f27 + fsub f11, f3, f19 + + LFD f24, 24 * SIZE(X) + LFD f25, 25 * SIZE(X) + LFD f26, 26 * SIZE(X) + LFD f27, 27 * SIZE(X) + + fsel f4, f12, f4, f28 + fsub f12, f4, f20 + fsel f5, f13, f5, f29 + fsub f13, f5, f21 + fsel f6, f14, f6, f30 + fsub f14, f6, f22 + fsel f7, f15, f7, f31 + fsub f15, f7, f23 + + LFD f28, 28 * SIZE(X) + LFD f29, 29 * SIZE(X) + LFD f30, 30 * SIZE(X) + LFD f31, 31 * SIZE(X) + +#ifndef POWER6 + L1_PREFETCH X, PREA +#endif + addi X, X, 16 * SIZE +#ifdef POWER6 + L1_PREFETCH X, PREA +#endif + + bdnz LL(10) + .align 4 + +LL(20): + fsel f0, f8, f0, f16 + fsub f8, f0, f24 + fsel f1, f9, f1, f17 + fsub f9, f1, f25 + fsel f2, f10, f2, f18 + fsub f10, f2, f26 + fsel f3, f11, f3, f19 + fsub f11, f3, f27 + + fsel f4, f12, f4, f20 + fsub f12, f4, f28 + fsel f5, f13, f5, f21 + fsub f13, f5, f29 + fsel f6, f14, f6, f22 + fsub f14, f6, f30 + fsel f7, f15, f7, f23 + fsub f15, f7, f31 + + fsel f0, f8, f0, f24 + fsel f1, f9, f1, f25 + fsel f2, f10, f2, f26 + fsel f3, f11, f3, f27 + fsel f4, f12, f4, f28 + fsel f5, f13, f5, f29 + fsel f6, f14, f6, f30 + fsel f7, f15, f7, f31 + + addi X, X, 16 * SIZE + .align 4 + +LL(50): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + addi X, X, 1 * SIZE + fsub f16, f1, f8 + fsel f1, f16, f1, f8 + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): + sub X, X, INCX + + srawi. 
r0, N, 4 + mtspr CTR, r0 + beq- LL(150) + + LFDUX f16, X, INCX + LFDUX f17, X, INCX + LFDUX f18, X, INCX + LFDUX f19, X, INCX + LFDUX f20, X, INCX + LFDUX f21, X, INCX + LFDUX f22, X, INCX + LFDUX f23, X, INCX + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + + fsub f8, f0, f16 + fsub f9, f1, f17 + fsub f10, f2, f18 + fsub f11, f3, f19 + fsub f12, f4, f20 + fsub f13, f5, f21 + fsub f14, f6, f22 + fsub f15, f7, f23 + bdz LL(120) + .align 4 + +LL(110): + fsel f0, f8, f0, f16 + fsub f8, f0, f24 + fsel f1, f9, f1, f17 + fsub f9, f1, f25 + fsel f2, f10, f2, f18 + fsub f10, f2, f26 + fsel f3, f11, f3, f19 + fsub f11, f3, f27 + + LFDUX f16, X, INCX + LFDUX f17, X, INCX + LFDUX f18, X, INCX + LFDUX f19, X, INCX + + fsel f4, f12, f4, f20 + fsub f12, f4, f28 + fsel f5, f13, f5, f21 + fsub f13, f5, f29 + fsel f6, f14, f6, f22 + fsub f14, f6, f30 + fsel f7, f15, f7, f23 + fsub f15, f7, f31 + + LFDUX f20, X, INCX + LFDUX f21, X, INCX + LFDUX f22, X, INCX + LFDUX f23, X, INCX + + fsel f0, f8, f0, f24 + fsub f8, f0, f16 + fsel f1, f9, f1, f25 + fsub f9, f1, f17 + fsel f2, f10, f2, f26 + fsub f10, f2, f18 + fsel f3, f11, f3, f27 + fsub f11, f3, f19 + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + + fsel f4, f12, f4, f28 + fsub f12, f4, f20 + fsel f5, f13, f5, f29 + fsub f13, f5, f21 + fsel f6, f14, f6, f30 + fsub f14, f6, f22 + fsel f7, f15, f7, f31 + fsub f15, f7, f23 + + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + bdnz LL(110) + .align 4 + +LL(120): + fsel f0, f8, f0, f16 + fsub f8, f0, f24 + fsel f1, f9, f1, f17 + fsub f9, f1, f25 + fsel f2, f10, f2, f18 + fsub f10, f2, f26 + fsel f3, f11, f3, f19 + fsub f11, f3, f27 + + fsel f4, f12, f4, f20 + fsub f12, f4, f28 + fsel f5, f13, f5, f21 + fsub f13, f5, f29 + fsel f6, f14, f6, f22 + fsub f14, f6, f30 + fsel f7, f15, f7, f23 + fsub f15, f7, f31 + + fsel f0, f8, f0, f24 + fsel f1, f9, f1, f25 + fsel f2, f10, f2, f26 + fsel f3, f11, f3, f27 + fsel f4, f12, f4, f28 + fsel f5, f13, f5, f29 + fsel f6, f14, f6, f30 + fsel f7, f15, f7, f31 + .align 4 + +LL(150): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + fsub f16, f1, f8 + fsel f1, f16, f1, f8 + bdnz LL(160) + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + fsub f10, f4, f5 + fsub f11, f6, f7 + + fsel f0, f8, f0, f1 + fsel f2, f9, f2, f3 + fsel f4, f10, f4, f5 + fsel f6, f11, f6, f7 + + fsub f8, f0, f2 + fsub f9, f4, f6 + fsel f0, f8, f0, f2 + fsel f4, f9, f4, f6 + + fsub f8, f0, f4 + fsel f1, f8, f0, f4 + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/max_hummer.S b/kernel/power/max_hummer.S new file mode 100644 index 0000000000..01ff907e67 --- /dev/null +++ b/kernel/power/max_hummer.S @@ -0,0 +1,477 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define INCX2 r6 +#define X2 r7 + +#define C1 f1 +#define C2 f0 +#define C3 f2 +#define C4 f3 + +#define A1 f4 +#define A2 f5 +#define A3 f6 +#define A4 f7 +#define A5 f8 +#define A6 f9 +#define A7 f10 +#define A8 f11 + +#define F1 f12 +#define F2 f13 +#define F3 f14 +#define F4 f15 +#define F5 f16 +#define F6 f17 +#define F7 f18 +#define F8 f19 + +#define T1 f20 +#define T2 f21 +#define T3 f22 +#define T4 f23 +#define T5 f24 +#define T6 f25 +#define T7 f26 +#define T8 f27 + + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + + stfpdux f16, SP, r10 + stfpdux f17, SP, r10 + stfpdux f18, SP, r10 + stfpdux f19, SP, r10 + + stfpdux f20, SP, r10 + stfpdux f21, SP, r10 + stfpdux f22, SP, r10 + stfpdux f23, SP, r10 + + stfpdux f24, SP, r10 + stfpdux f25, SP, r10 + stfpdux f26, SP, r10 + stfpdux f27, SP, r10 + + li r10, 0 + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + lfpdx C1, SP, r10 # Zero clear + + slwi INCX, INCX, BASE_SHIFT + add INCX2, INCX, INCX + + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, INCX, 0 + ble LL(999) + + LFD C1, 0 * SIZE(X) + add X, X, INCX + + addi N, N, -1 + cmpwi cr0, N, 0 + ble LL(999) + + fsmfp C1, C1 + fpmr C2, C1 + fpmr C3, C1 + fpmr C4, C1 + + cmpwi cr0, INCX, SIZE + bne LL(100) + + andi. r0, X, 2 * SIZE - 1 + beq LL(05) + + LFD C2, 0 * SIZE(X) + add X, X, INCX + + addi N, N, -1 + cmpwi cr0, N, 0 + ble LL(998) + .align 4 + +LL(05): + sub X, X, INCX2 + + srawi. 
r0, N, 4 + mtspr CTR, r0 + beq- LL(15) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + LFPDUX A5, X, INCX2 + LFPDUX A6, X, INCX2 + LFPDUX A7, X, INCX2 + LFPDUX A8, X, INCX2 + bdz LL(13) + .align 4 + +LL(12): + fpsub F1, C1, A1 + fpsub F2, C2, A2 + fpsub F3, C3, A3 + fpsub F4, C4, A4 + + fpsel C1, F1, C1, A1 + LFPDUX A1, X, INCX2 + fpsel C2, F2, C2, A2 + LFPDUX A2, X, INCX2 + fpsel C3, F3, C3, A3 + LFPDUX A3, X, INCX2 + fpsel C4, F4, C4, A4 + LFPDUX A4, X, INCX2 + + fpsub F5, C1, A5 + fpsub F6, C2, A6 + fpsub F7, C3, A7 + fpsub F8, C4, A8 + + fpsel C1, F5, C1, A5 + LFPDUX A5, X, INCX2 + fpsel C2, F6, C2, A6 + LFPDUX A6, X, INCX2 + fpsel C3, F7, C3, A7 + LFPDUX A7, X, INCX2 + fpsel C4, F8, C4, A8 + LFPDUX A8, X, INCX2 + bdnz LL(12) + .align 4 + +LL(13): + fpsub F1, C1, A1 + fpsub F2, C2, A2 + fpsub F3, C3, A3 + fpsub F4, C4, A4 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + + fpsub F5, C1, A5 + fpsub F6, C2, A6 + fpsub F7, C3, A7 + fpsub F8, C4, A8 + + fpsel C1, F5, C1, A5 + fpsel C2, F6, C2, A6 + fpsel C3, F7, C3, A7 + fpsel C4, F8, C4, A8 + .align 4 + +LL(15): + andi. r0, N, 15 + beq LL(998) + + andi. r0, N, 8 + beq LL(16) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + fpsub F1, C1, A1 + fpsub F2, C2, A2 + fpsub F3, C3, A3 + fpsub F4, C4, A4 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + .align 4 + +LL(16): + andi. r0, N, 4 + beq LL(17) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + + fpsub F1, C1, A1 + fpsub F2, C2, A2 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + .align 4 + +LL(17): + andi. r0, N, 2 + beq LL(18) + + LFPDUX A1, X, INCX2 + fpsub F1, C1, A1 + fpsel C1, F1, C1, A1 + .align 4 + +LL(18): + andi. r0, N, 1 + beq LL(998) + + LFDUX A1, X, INCX2 + fsub F1, C1, A1 + fsel C1, F1, C1, A1 + b LL(998) + .align 4 + + +LL(100): + sub X, X, INCX + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(105) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + LFSDUX A1, X, INCX + LFSDUX A2, X, INCX + LFSDUX A3, X, INCX + LFSDUX A4, X, INCX + + LFDUX A5, X, INCX + LFDUX A6, X, INCX + LFDUX A7, X, INCX + LFDUX A8, X, INCX + LFSDUX A5, X, INCX + LFSDUX A6, X, INCX + LFSDUX A7, X, INCX + LFSDUX A8, X, INCX + fpsub F1, C1, A1 + fpsub F2, C2, A2 + fpsub F3, C3, A3 + fpsub F4, C4, A4 + + bdz LL(103) + .align 4 + +LL(102): + fpsel C1, F1, C1, A1 + LFDUX A1, X, INCX + fpsel C2, F2, C2, A2 + LFDUX A2, X, INCX + fpsel C3, F3, C3, A3 + LFDUX A3, X, INCX + fpsel C4, F4, C4, A4 + LFDUX A4, X, INCX + + fpsub F5, C1, A5 + LFSDUX A1, X, INCX + fpsub F6, C2, A6 + LFSDUX A2, X, INCX + fpsub F7, C3, A7 + LFSDUX A3, X, INCX + fpsub F8, C4, A8 + LFSDUX A4, X, INCX + + fpsel C1, F5, C1, A5 + LFDUX A5, X, INCX + fpsel C2, F6, C2, A6 + LFDUX A6, X, INCX + fpsel C3, F7, C3, A7 + LFDUX A7, X, INCX + fpsel C4, F8, C4, A8 + LFDUX A8, X, INCX + + fpsub F1, C1, A1 + LFSDUX A5, X, INCX + fpsub F2, C2, A2 + LFSDUX A6, X, INCX + fpsub F3, C3, A3 + LFSDUX A7, X, INCX + fpsub F4, C4, A4 + LFSDUX A8, X, INCX + bdnz LL(102) + .align 4 + +LL(103): + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + + fpsub F5, C1, A5 + fpsub F6, C2, A6 + fpsub F7, C3, A7 + fpsub F8, C4, A8 + + fpsel C1, F5, C1, A5 + fpsel C2, F6, C2, A6 + fpsel C3, F7, C3, A7 + fpsel C4, F8, C4, A8 + .align 4 + +LL(105): + andi. r0, N, 15 + beq LL(998) + + andi. 
r0, N, 8 + beq LL(106) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + LFSDUX A1, X, INCX + LFSDUX A2, X, INCX + LFSDUX A3, X, INCX + LFSDUX A4, X, INCX + + fpsub F1, C1, A1 + fpsub F2, C2, A2 + fpsub F3, C3, A3 + fpsub F4, C4, A4 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + .align 4 + +LL(106): + andi. r0, N, 4 + beq LL(107) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + + fsub F1, C1, A1 + fsub F2, C2, A2 + fsub F3, C3, A3 + fsub F4, C4, A4 + + fsel C1, F1, C1, A1 + fsel C2, F2, C2, A2 + fsel C3, F3, C3, A3 + fsel C4, F4, C4, A4 + .align 4 + +LL(107): + andi. r0, N, 2 + beq LL(108) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + fsub F1, C1, A1 + fsub F2, C2, A2 + fsel C1, F1, C1, A1 + fsel C2, F2, C2, A2 + .align 4 + +LL(108): + andi. r0, N, 1 + beq LL(998) + + LFDUX A1, X, INCX + fsub F1, C1, A1 + fsel C1, F1, C1, A1 + .align 4 + + +LL(998): + fpsub F1, C1, C2 + fpsub F2, C3, C4 + + fpsel C1, F1, C1, C2 + fpsel C3, F2, C3, C4 + + fpsub F1, C1, C3 + fpsel C1, F1, C1, C3 + + fsmtp C2, C1 + + fsub F1, C1, C2 + fsel C1, F1, C1, C2 + .align 4 + +LL(999): + li r10, 16 + + lfpdux f27, SP, r10 + lfpdux f26, SP, r10 + lfpdux f25, SP, r10 + lfpdux f24, SP, r10 + + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + lfpdux f20, SP, r10 + + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + blr + + EPILOGUE diff --git a/kernel/power/max_ppc440.S b/kernel/power/max_ppc440.S new file mode 100644 index 0000000000..7afdf566ed --- /dev/null +++ b/kernel/power/max_ppc440.S @@ -0,0 +1,284 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define PREA r8 + +#define FZERO f1 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, BASE_SHIFT + + sub X, X, INCX + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFDUX f1, X, INCX + + fmr f0, f1 + fmr f2, f1 + subi N, N, 1 + fmr f3, f1 + fmr f4, f1 + fmr f5, f1 + srawi. r0, N, 4 + fmr f6, f1 + mtspr CTR, r0 + fmr f7, f1 + beq- LL(150) + + LFDUX f16, X, INCX + LFDUX f17, X, INCX + LFDUX f18, X, INCX + LFDUX f19, X, INCX + LFDUX f20, X, INCX + LFDUX f21, X, INCX + LFDUX f22, X, INCX + LFDUX f23, X, INCX + + LFDUX f24, X, INCX + fsub f8, f0, f16 + LFDUX f25, X, INCX + fsub f9, f1, f17 + LFDUX f26, X, INCX + fsub f10, f2, f18 + LFDUX f27, X, INCX + fsub f11, f3, f19 + LFDUX f28, X, INCX + fsub f12, f4, f20 + LFDUX f29, X, INCX + fsub f13, f5, f21 + LFDUX f30, X, INCX + fsub f14, f6, f22 + LFDUX f31, X, INCX + fsub f15, f7, f23 + bdz LL(120) + .align 4 + +LL(110): + fsel f0, f8, f0, f16 + LFDUX f16, X, INCX + fsub f8, f0, f24 + fsel f1, f9, f1, f17 + LFDUX f17, X, INCX + fsub f9, f1, f25 + fsel f2, f10, f2, f18 + LFDUX f18, X, INCX + fsub f10, f2, f26 + fsel f3, f11, f3, f19 + LFDUX f19, X, INCX + fsub f11, f3, f27 + + fsel f4, f12, f4, f20 + LFDUX f20, X, INCX + fsub f12, f4, f28 + fsel f5, f13, f5, f21 + LFDUX f21, X, INCX + fsub f13, f5, f29 + fsel f6, f14, f6, f22 + LFDUX f22, X, INCX + fsub f14, f6, f30 + fsel f7, f15, f7, f23 + LFDUX f23, X, INCX + fsub f15, f7, f31 + + fsel f0, f8, f0, f24 + LFDUX f24, X, INCX + fsub f8, f0, f16 + fsel f1, f9, f1, f25 + LFDUX f25, X, INCX + fsub f9, f1, f17 + fsel f2, f10, f2, f26 + LFDUX f26, X, INCX + fsub f10, f2, f18 + fsel f3, f11, f3, f27 + LFDUX f27, X, INCX + fsub f11, f3, f19 + + fsel f4, f12, f4, f28 + LFDUX f28, X, INCX + fsub f12, f4, f20 + fsel f5, f13, f5, f29 + LFDUX f29, X, INCX + fsub f13, f5, f21 + fsel f6, f14, f6, f30 + LFDUX f30, X, INCX + fsub f14, f6, f22 + fsel f7, f15, f7, f31 + LFDUX f31, X, INCX + fsub f15, f7, f23 + bdnz LL(110) + .align 4 + +LL(120): + fsel f0, f8, f0, f16 + fsub f8, f0, f24 + fsel f1, f9, f1, f17 + fsub f9, f1, f25 + fsel f2, f10, f2, f18 + fsub f10, f2, f26 + fsel f3, f11, f3, f19 + fsub f11, f3, f27 + + fsel f4, f12, f4, f20 + fsub f12, f4, f28 + fsel f5, f13, f5, f21 + fsub f13, f5, f29 + fsel f6, f14, f6, f22 + fsub f14, f6, f30 + fsel f7, f15, f7, f23 + fsub f15, f7, f31 + + fsel f0, f8, f0, f24 + fsel f1, f9, f1, f25 + fsel f2, f10, f2, f26 + fsel f3, f11, f3, f27 + fsel f4, f12, f4, f28 + fsel f5, f13, f5, f29 + fsel f6, f14, f6, f30 + fsel f7, f15, f7, f31 + .align 4 + +LL(150): + andi. 
r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + fsub f16, f1, f8 + fsel f1, f16, f1, f8 + bdnz LL(160) + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + fsub f10, f4, f5 + fsub f11, f6, f7 + + fsel f0, f8, f0, f1 + fsel f2, f9, f2, f3 + fsel f4, f10, f4, f5 + fsel f6, f11, f6, f7 + + fsub f8, f0, f2 + fsub f9, f4, f6 + fsel f0, f8, f0, f2 + fsel f4, f9, f4, f6 + + fsub f8, f0, f4 + fsel f1, f8, f0, f4 + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/min.S b/kernel/power/min.S new file mode 100644 index 0000000000..727a6a7b11 --- /dev/null +++ b/kernel/power/min.S @@ -0,0 +1,445 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define PREA r8 + +#define FZERO f1 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, BASE_SHIFT + + li PREA, L1_PREFETCHSIZE + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFD f1, 0 * SIZE(X) + add X, X, INCX + + fmr f0, f1 + fmr f2, f1 + fmr f3, f1 + fmr f4, f1 + fmr f5, f1 + fmr f6, f1 + fmr f7, f1 + + subi N, N, 1 + + cmpwi cr0, INCX, SIZE + bne- cr0, LL(100) + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(50) + + LFD f16, 0 * SIZE(X) + LFD f17, 1 * SIZE(X) + LFD f18, 2 * SIZE(X) + LFD f19, 3 * SIZE(X) + LFD f20, 4 * SIZE(X) + LFD f21, 5 * SIZE(X) + LFD f22, 6 * SIZE(X) + LFD f23, 7 * SIZE(X) + + LFD f24, 8 * SIZE(X) + LFD f25, 9 * SIZE(X) + LFD f26, 10 * SIZE(X) + LFD f27, 11 * SIZE(X) + LFD f28, 12 * SIZE(X) + LFD f29, 13 * SIZE(X) + LFD f30, 14 * SIZE(X) + LFD f31, 15 * SIZE(X) + + fsub f8, f0, f16 + fsub f9, f1, f17 + fsub f10, f2, f18 + fsub f11, f3, f19 + fsub f12, f4, f20 + fsub f13, f5, f21 + fsub f14, f6, f22 + fsub f15, f7, f23 + bdz LL(20) + .align 4 + +LL(10): + fsel f0, f8, f16, f0 + fsub f8, f0, f24 + fsel f1, f9, f17, f1 + fsub f9, f1, f25 + fsel f2, f10, f18, f2 + fsub f10, f2, f26 + fsel f3, f11, f19, f3 + fsub f11, f3, f27 + + LFD f16, 16 * SIZE(X) + LFD f17, 17 * SIZE(X) + LFD f18, 18 * SIZE(X) + LFD f19, 19 * SIZE(X) + + fsel f4, f12, f20, f4 + fsub f12, f4, f28 + fsel f5, f13, f21, f5 + fsub f13, f5, f29 + fsel f6, f14, f22, f6 + fsub f14, f6, f30 + fsel f7, f15, f23, f7 + fsub f15, f7, f31 + + LFD f20, 20 * SIZE(X) + LFD f21, 21 * SIZE(X) + LFD f22, 22 * SIZE(X) + LFD f23, 23 * SIZE(X) + + fsel f0, f8, f24, f0 + fsub f8, f0, f16 + fsel f1, f9, f25, f1 + fsub f9, f1, f17 + fsel f2, f10, f26, f2 + fsub f10, f2, f18 + fsel f3, f11, f27, f3 + fsub f11, f3, f19 + + LFD f24, 24 * SIZE(X) + LFD f25, 25 * SIZE(X) + LFD f26, 26 * SIZE(X) + LFD f27, 27 * SIZE(X) + + fsel f4, f12, f28, f4 + fsub f12, f4, f20 + fsel f5, f13, f29, f5 + fsub f13, f5, f21 + fsel f6, f14, f30, f6 + fsub f14, f6, f22 + fsel f7, f15, f31, f7 + fsub f15, f7, f23 + + LFD f28, 28 * SIZE(X) + LFD f29, 29 * SIZE(X) + LFD f30, 30 * SIZE(X) + LFD f31, 31 * SIZE(X) + +#ifndef POWER6 + L1_PREFETCH X, PREA +#endif + addi X, X, 16 * SIZE +#ifdef POWER6 + L1_PREFETCH X, PREA +#endif + + bdnz LL(10) + .align 4 + +LL(20): + fsel f0, f8, f16, f0 + fsub f8, f0, f24 + fsel f1, f9, f17, f1 + fsub f9, f1, f25 + fsel f2, f10, f18, f2 + fsub f10, f2, f26 + fsel f3, f11, f19, f3 + fsub f11, f3, f27 + + fsel f4, f12, f20, f4 + fsub f12, f4, f28 + fsel f5, f13, f21, f5 + fsub f13, f5, f29 + fsel f6, f14, f22, f6 + fsub f14, f6, f30 + fsel f7, f15, f23, f7 + fsub f15, f7, f31 + + fsel f0, f8, f24, f0 + fsel f1, f9, f25, f1 + fsel f2, f10, f26, f2 + fsel f3, f11, f27, f3 + fsel f4, f12, f28, f4 + fsel f5, f13, f29, f5 + fsel f6, f14, f30, f6 + fsel f7, f15, f31, f7 + + addi X, X, 16 * SIZE + .align 4 + +LL(50): + 
andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + addi X, X, 1 * SIZE + fsub f16, f1, f8 + fsel f1, f16, f8, f1 + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): + sub X, X, INCX + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(150) + + LFDUX f16, X, INCX + LFDUX f17, X, INCX + LFDUX f18, X, INCX + LFDUX f19, X, INCX + LFDUX f20, X, INCX + LFDUX f21, X, INCX + LFDUX f22, X, INCX + LFDUX f23, X, INCX + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + + fsub f8, f0, f16 + fsub f9, f1, f17 + fsub f10, f2, f18 + fsub f11, f3, f19 + fsub f12, f4, f20 + fsub f13, f5, f21 + fsub f14, f6, f22 + fsub f15, f7, f23 + bdz LL(120) + .align 4 + +LL(110): + fsel f0, f8, f16, f0 + fsub f8, f0, f24 + fsel f1, f9, f17, f1 + fsub f9, f1, f25 + fsel f2, f10, f18, f2 + fsub f10, f2, f26 + fsel f3, f11, f19, f3 + fsub f11, f3, f27 + + LFDUX f16, X, INCX + LFDUX f17, X, INCX + LFDUX f18, X, INCX + LFDUX f19, X, INCX + + fsel f4, f12, f20, f4 + fsub f12, f4, f28 + fsel f5, f13, f21, f5 + fsub f13, f5, f29 + fsel f6, f14, f22, f6 + fsub f14, f6, f30 + fsel f7, f15, f23, f7 + fsub f15, f7, f31 + + LFDUX f20, X, INCX + LFDUX f21, X, INCX + LFDUX f22, X, INCX + LFDUX f23, X, INCX + + fsel f0, f8, f24, f0 + fsub f8, f0, f16 + fsel f1, f9, f25, f1 + fsub f9, f1, f17 + fsel f2, f10, f26, f2 + fsub f10, f2, f18 + fsel f3, f11, f27, f3 + fsub f11, f3, f19 + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + + fsel f4, f12, f28, f4 + fsub f12, f4, f20 + fsel f5, f13, f29, f5 + fsub f13, f5, f21 + fsel f6, f14, f30, f6 + fsub f14, f6, f22 + fsel f7, f15, f31, f7 + fsub f15, f7, f23 + + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + bdnz LL(110) + .align 4 + +LL(120): + fsel f0, f8, f16, f0 + fsub f8, f0, f24 + fsel f1, f9, f17, f1 + fsub f9, f1, f25 + fsel f2, f10, f18, f2 + fsub f10, f2, f26 + fsel f3, f11, f19, f3 + fsub f11, f3, f27 + + fsel f4, f12, f20, f4 + fsub f12, f4, f28 + fsel f5, f13, f21, f5 + fsub f13, f5, f29 + fsel f6, f14, f22, f6 + fsub f14, f6, f30 + fsel f7, f15, f23, f7 + fsub f15, f7, f31 + + fsel f0, f8, f24, f0 + fsel f1, f9, f25, f1 + fsel f2, f10, f26, f2 + fsel f3, f11, f27, f3 + fsel f4, f12, f28, f4 + fsel f5, f13, f29, f5 + fsel f6, f14, f30, f6 + fsel f7, f15, f31, f7 + .align 4 + +LL(150): + andi. 
r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + fsub f16, f1, f8 + fsel f1, f16, f8, f1 + bdnz LL(160) + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + fsub f10, f4, f5 + fsub f11, f6, f7 + + fsel f0, f8, f1, f0 + fsel f2, f9, f3, f2 + fsel f4, f10, f5, f4 + fsel f6, f11, f7, f6 + + fsub f8, f0, f2 + fsub f9, f4, f6 + fsel f0, f8, f2, f0 + fsel f4, f9, f6, f4 + + fsub f8, f0, f4 + fsel f1, f8, f4, f0 + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/min_hummer.S b/kernel/power/min_hummer.S new file mode 100644 index 0000000000..bd82687113 --- /dev/null +++ b/kernel/power/min_hummer.S @@ -0,0 +1,477 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define INCX2 r6 +#define X2 r7 + +#define C1 f1 +#define C2 f0 +#define C3 f2 +#define C4 f3 + +#define A1 f4 +#define A2 f5 +#define A3 f6 +#define A4 f7 +#define A5 f8 +#define A6 f9 +#define A7 f10 +#define A8 f11 + +#define F1 f12 +#define F2 f13 +#define F3 f14 +#define F4 f15 +#define F5 f16 +#define F6 f17 +#define F7 f18 +#define F8 f19 + +#define T1 f20 +#define T2 f21 +#define T3 f22 +#define T4 f23 +#define T5 f24 +#define T6 f25 +#define T7 f26 +#define T8 f27 + + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + + stfpdux f16, SP, r10 + stfpdux f17, SP, r10 + stfpdux f18, SP, r10 + stfpdux f19, SP, r10 + + stfpdux f20, SP, r10 + stfpdux f21, SP, r10 + stfpdux f22, SP, r10 + stfpdux f23, SP, r10 + + stfpdux f24, SP, r10 + stfpdux f25, SP, r10 + stfpdux f26, SP, r10 + stfpdux f27, SP, r10 + + li r10, 0 + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + lfpdx C1, SP, r10 # Zero clear + + slwi INCX, INCX, BASE_SHIFT + add INCX2, INCX, INCX + + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, INCX, 0 + ble LL(999) + + LFD C1, 0 * SIZE(X) + add X, X, INCX + + addi N, N, -1 + cmpwi cr0, N, 0 + ble LL(999) + + fsmfp C1, C1 + fpmr C2, C1 + fpmr C3, C1 + fpmr C4, C1 + + cmpwi cr0, INCX, SIZE + bne LL(100) + + andi. r0, X, 2 * SIZE - 1 + beq LL(05) + + LFD C2, 0 * SIZE(X) + add X, X, INCX + + addi N, N, -1 + cmpwi cr0, N, 0 + ble LL(998) + .align 4 + +LL(05): + sub X, X, INCX2 + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(15) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + LFPDUX A5, X, INCX2 + LFPDUX A6, X, INCX2 + LFPDUX A7, X, INCX2 + LFPDUX A8, X, INCX2 + bdz LL(13) + .align 4 + +LL(12): + fpsub F1, A1, C1 + fpsub F2, A2, C2 + fpsub F3, A3, C3 + fpsub F4, A4, C4 + + fpsel C1, F1, C1, A1 + LFPDUX A1, X, INCX2 + fpsel C2, F2, C2, A2 + LFPDUX A2, X, INCX2 + fpsel C3, F3, C3, A3 + LFPDUX A3, X, INCX2 + fpsel C4, F4, C4, A4 + LFPDUX A4, X, INCX2 + + fpsub F5, A5, C1 + fpsub F6, A6, C2 + fpsub F7, A7, C3 + fpsub F8, A8, C4 + + fpsel C1, F5, C1, A5 + LFPDUX A5, X, INCX2 + fpsel C2, F6, C2, A6 + LFPDUX A6, X, INCX2 + fpsel C3, F7, C3, A7 + LFPDUX A7, X, INCX2 + fpsel C4, F8, C4, A8 + LFPDUX A8, X, INCX2 + bdnz LL(12) + .align 4 + +LL(13): + fpsub F1, A1, C1 + fpsub F2, A2, C2 + fpsub F3, A3, C3 + fpsub F4, A4, C4 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + + fpsub F5, A5, C1 + fpsub F6, A6, C2 + fpsub F7, A7, C3 + fpsub F8, A8, C4 + + fpsel C1, F5, C1, A5 + fpsel C2, F6, C2, A6 + fpsel C3, F7, C3, A7 + fpsel C4, F8, C4, A8 + .align 4 + +LL(15): + andi. r0, N, 15 + beq LL(998) + + andi. r0, N, 8 + beq LL(16) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + fpsub F1, A1, C1 + fpsub F2, A2, C2 + fpsub F3, A3, C3 + fpsub F4, A4, C4 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + .align 4 + +LL(16): + andi. r0, N, 4 + beq LL(17) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + + fpsub F1, A1, C1 + fpsub F2, A2, C2 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + .align 4 + +LL(17): + andi. r0, N, 2 + beq LL(18) + + LFPDUX A1, X, INCX2 + fpsub F1, A1, C1 + fpsel C1, F1, C1, A1 + .align 4 + +LL(18): + andi. 
r0, N, 1 + beq LL(998) + + LFDUX A1, X, INCX2 + fsub F1, A1, C1 + fsel C1, F1, C1, A1 + b LL(998) + .align 4 + + +LL(100): + sub X, X, INCX + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(105) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + LFSDUX A1, X, INCX + LFSDUX A2, X, INCX + LFSDUX A3, X, INCX + LFSDUX A4, X, INCX + + LFDUX A5, X, INCX + LFDUX A6, X, INCX + LFDUX A7, X, INCX + LFDUX A8, X, INCX + LFSDUX A5, X, INCX + LFSDUX A6, X, INCX + LFSDUX A7, X, INCX + LFSDUX A8, X, INCX + fpsub F1, A1, C1 + fpsub F2, A2, C2 + fpsub F3, A3, C3 + fpsub F4, A4, C4 + + bdz LL(103) + .align 4 + +LL(102): + fpsel C1, F1, C1, A1 + LFDUX A1, X, INCX + fpsel C2, F2, C2, A2 + LFDUX A2, X, INCX + fpsel C3, F3, C3, A3 + LFDUX A3, X, INCX + fpsel C4, F4, C4, A4 + LFDUX A4, X, INCX + + fpsub F5, A5, C1 + LFSDUX A1, X, INCX + fpsub F6, A6, C2 + LFSDUX A2, X, INCX + fpsub F7, A7, C3 + LFSDUX A3, X, INCX + fpsub F8, A8, C4 + LFSDUX A4, X, INCX + + fpsel C1, F5, C1, A5 + LFDUX A5, X, INCX + fpsel C2, F6, C2, A6 + LFDUX A6, X, INCX + fpsel C3, F7, C3, A7 + LFDUX A7, X, INCX + fpsel C4, F8, C4, A8 + LFDUX A8, X, INCX + + fpsub F1, A1, C1 + LFSDUX A5, X, INCX + fpsub F2, A2, C2 + LFSDUX A6, X, INCX + fpsub F3, A3, C3 + LFSDUX A7, X, INCX + fpsub F4, A4, C4 + LFSDUX A8, X, INCX + bdnz LL(102) + .align 4 + +LL(103): + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + + fpsub F5, A5, C1 + fpsub F6, A6, C2 + fpsub F7, A7, C3 + fpsub F8, A8, C4 + + fpsel C1, F5, C1, A5 + fpsel C2, F6, C2, A6 + fpsel C3, F7, C3, A7 + fpsel C4, F8, C4, A8 + .align 4 + +LL(105): + andi. r0, N, 15 + beq LL(998) + + andi. r0, N, 8 + beq LL(106) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + LFSDUX A1, X, INCX + LFSDUX A2, X, INCX + LFSDUX A3, X, INCX + LFSDUX A4, X, INCX + + fpsub F1, A1, C1 + fpsub F2, A2, C2 + fpsub F3, A3, C3 + fpsub F4, A4, C4 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + .align 4 + +LL(106): + andi. r0, N, 4 + beq LL(107) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + + fsub F1, A1, C1 + fsub F2, A2, C2 + fsub F3, A3, C3 + fsub F4, A4, C4 + + fsel C1, F1, C1, A1 + fsel C2, F2, C2, A2 + fsel C3, F3, C3, A3 + fsel C4, F4, C4, A4 + .align 4 + +LL(107): + andi. r0, N, 2 + beq LL(108) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + fsub F1, A1, C1 + fsub F2, A2, C2 + fsel C1, F1, C1, A1 + fsel C2, F2, C2, A2 + .align 4 + +LL(108): + andi. r0, N, 1 + beq LL(998) + + LFDUX A1, X, INCX + fsub F1, A1, C1 + fsel C1, F1, C1, A1 + .align 4 + + +LL(998): + fpsub F1, C2, C1 + fpsub F2, C4, C3 + + fpsel C1, F1, C1, C2 + fpsel C3, F2, C3, C4 + + fpsub F1, C3, C1 + fpsel C1, F1, C1, C3 + + fsmtp C2, C1 + + fsub F1, C2, C1 + fsel C1, F1, C1, C2 + .align 4 + +LL(999): + li r10, 16 + + lfpdux f27, SP, r10 + lfpdux f26, SP, r10 + lfpdux f25, SP, r10 + lfpdux f24, SP, r10 + + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + lfpdux f20, SP, r10 + + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + blr + + EPILOGUE diff --git a/kernel/power/min_ppc440.S b/kernel/power/min_ppc440.S new file mode 100644 index 0000000000..ab67bbc8ef --- /dev/null +++ b/kernel/power/min_ppc440.S @@ -0,0 +1,284 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. 
*/ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define PREA r8 + +#define FZERO f1 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, BASE_SHIFT + + sub X, X, INCX + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFDUX f1, X, INCX + + fmr f0, f1 + subi N, N, 1 + fmr f2, f1 + fmr f3, f1 + fmr f4, f1 + fmr f5, f1 + srawi. 
r0, N, 4 + fmr f6, f1 + mtspr CTR, r0 + fmr f7, f1 + beq- LL(150) + + LFDUX f16, X, INCX + LFDUX f17, X, INCX + LFDUX f18, X, INCX + LFDUX f19, X, INCX + LFDUX f20, X, INCX + LFDUX f21, X, INCX + LFDUX f22, X, INCX + LFDUX f23, X, INCX + + LFDUX f24, X, INCX + fsub f8, f0, f16 + LFDUX f25, X, INCX + fsub f9, f1, f17 + LFDUX f26, X, INCX + fsub f10, f2, f18 + LFDUX f27, X, INCX + fsub f11, f3, f19 + LFDUX f28, X, INCX + fsub f12, f4, f20 + LFDUX f29, X, INCX + fsub f13, f5, f21 + LFDUX f30, X, INCX + fsub f14, f6, f22 + LFDUX f31, X, INCX + fsub f15, f7, f23 + bdz LL(120) + .align 4 + +LL(110): + fsel f0, f8, f16, f0 + LFDUX f16, X, INCX + fsub f8, f0, f24 + fsel f1, f9, f17, f1 + LFDUX f17, X, INCX + fsub f9, f1, f25 + fsel f2, f10, f18, f2 + LFDUX f18, X, INCX + fsub f10, f2, f26 + fsel f3, f11, f19, f3 + LFDUX f19, X, INCX + fsub f11, f3, f27 + + fsel f4, f12, f20, f4 + LFDUX f20, X, INCX + fsub f12, f4, f28 + fsel f5, f13, f21, f5 + LFDUX f21, X, INCX + fsub f13, f5, f29 + fsel f6, f14, f22, f6 + LFDUX f22, X, INCX + fsub f14, f6, f30 + fsel f7, f15, f23, f7 + LFDUX f23, X, INCX + fsub f15, f7, f31 + + fsel f0, f8, f24, f0 + LFDUX f24, X, INCX + fsub f8, f0, f16 + fsel f1, f9, f25, f1 + LFDUX f25, X, INCX + fsub f9, f1, f17 + fsel f2, f10, f26, f2 + LFDUX f26, X, INCX + fsub f10, f2, f18 + fsel f3, f11, f27, f3 + LFDUX f27, X, INCX + fsub f11, f3, f19 + + fsel f4, f12, f28, f4 + LFDUX f28, X, INCX + fsub f12, f4, f20 + fsel f5, f13, f29, f5 + LFDUX f29, X, INCX + fsub f13, f5, f21 + fsel f6, f14, f30, f6 + LFDUX f30, X, INCX + fsub f14, f6, f22 + fsel f7, f15, f31, f7 + LFDUX f31, X, INCX + fsub f15, f7, f23 + bdnz LL(110) + .align 4 + +LL(120): + fsel f0, f8, f16, f0 + fsub f8, f0, f24 + fsel f1, f9, f17, f1 + fsub f9, f1, f25 + fsel f2, f10, f18, f2 + fsub f10, f2, f26 + fsel f3, f11, f19, f3 + fsub f11, f3, f27 + + fsel f4, f12, f20, f4 + fsub f12, f4, f28 + fsel f5, f13, f21, f5 + fsub f13, f5, f29 + fsel f6, f14, f22, f6 + fsub f14, f6, f30 + fsel f7, f15, f23, f7 + fsub f15, f7, f31 + + fsel f0, f8, f24, f0 + fsel f1, f9, f25, f1 + fsel f2, f10, f26, f2 + fsel f3, f11, f27, f3 + fsel f4, f12, f28, f4 + fsel f5, f13, f29, f5 + fsel f6, f14, f30, f6 + fsel f7, f15, f31, f7 + .align 4 + +LL(150): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + fsub f16, f1, f8 + fsel f1, f16, f8, f1 + bdnz LL(160) + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + fsub f10, f4, f5 + fsub f11, f6, f7 + + fsel f0, f8, f1, f0 + fsel f2, f9, f3, f2 + fsel f4, f10, f5, f4 + fsel f6, f11, f7, f6 + + fsub f8, f0, f2 + fsub f9, f4, f6 + fsel f0, f8, f2, f0 + fsel f4, f9, f6, f4 + + fsub f8, f0, f4 + fsel f1, f8, f4, f0 + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/nrm2.S b/kernel/power/nrm2.S new file mode 100644 index 0000000000..e2b635ee70 --- /dev/null +++ b/kernel/power/nrm2.S @@ -0,0 +1,908 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
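For reference, the MIN kernels above (the generic tail, min_hummer.S, and min_ppc440.S) all compute the same reduction: the smallest element of a vector of length N with stride INCX, using branch-free fsel/fpsel selects on several independent partial minima that are folded together in the LL(999)/LL(998) epilogues. The C sketch below only mirrors that structure; min_ref is a hypothetical name, the 4-way split stands in for the kernels' deeper unrolling, and N >= 1 is assumed because the assembly handles N == 0 separately.

#include <stddef.h>

/* Smallest element of x[0], x[incx], ..., x[(n-1)*incx]; requires n >= 1. */
/* Four partial minima emulate the kernels' independent fsub/fsel chains.  */
double min_ref(size_t n, const double *x, size_t incx)
{
    double m0 = x[0], m1 = x[0], m2 = x[0], m3 = x[0];
    size_t i = 1;

    for (; i + 3 < n; i += 4) {            /* unrolled body */
        double a0 = x[(i + 0) * incx];
        double a1 = x[(i + 1) * incx];
        double a2 = x[(i + 2) * incx];
        double a3 = x[(i + 3) * incx];
        m0 = (m0 - a0 >= 0.0) ? a0 : m0;   /* fsub + fsel: keep the smaller */
        m1 = (m1 - a1 >= 0.0) ? a1 : m1;
        m2 = (m2 - a2 >= 0.0) ? a2 : m2;
        m3 = (m3 - a3 >= 0.0) ? a3 : m3;
    }
    for (; i < n; i++) {                   /* remainder, one element at a time */
        double a = x[i * incx];
        m0 = (m0 - a >= 0.0) ? a : m0;
    }
    /* final fold of the partial minima, as in the kernels' epilogue */
    m0 = (m0 - m1 >= 0.0) ? m1 : m0;
    m2 = (m2 - m3 >= 0.0) ? m3 : m2;
    return (m0 - m2 >= 0.0) ? m2 : m0;
}

Keeping several partial minima live is what lets the assembly issue one fsub/fsel pair per element without waiting on the previous select's result.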
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define NN r6 +#define XX r7 +#define PREA r8 + +#define FZERO 144(SP) +#define FONE 148(SP) +#define FMAX 152(SP) + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r10, 0 + lis r11, 0x3f80 + lis r12, 0x5fe0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r10, FZERO + stw r11, FONE + stw r12, FMAX + stw r10, 4 + FMAX + + lfs f1, FZERO + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, BASE_SHIFT + + li PREA, L1_PREFETCHSIZE + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + mr NN, N + mr XX, X + + LFD f1, 0 * SIZE(X) + add X, X, INCX + + fabs f0, f1 + fabs f2, f1 + fabs f3, f1 + fabs f4, f1 + fabs f5, f1 + fabs f6, f1 + fabs f7, f1 + fabs f1, f1 + subi N, N, 1 + + cmpwi cr0, N, 0 + ble- LL(9999) + + cmpwi cr0, INCX, SIZE + bne- cr0, LL(1000) + + srawi. 
r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(50) + + LFD f24, 0 * SIZE(X) + LFD f25, 1 * SIZE(X) + LFD f26, 2 * SIZE(X) + LFD f27, 3 * SIZE(X) + LFD f28, 4 * SIZE(X) + LFD f29, 5 * SIZE(X) + LFD f30, 6 * SIZE(X) + LFD f31, 7 * SIZE(X) + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFD f24, 8 * SIZE(X) + LFD f25, 9 * SIZE(X) + LFD f26, 10 * SIZE(X) + LFD f27, 11 * SIZE(X) + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFD f28, 12 * SIZE(X) + LFD f29, 13 * SIZE(X) + LFD f30, 14 * SIZE(X) + LFD f31, 15 * SIZE(X) + bdz LL(20) + .align 4 + +LL(10): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + LFD f24, 16 * SIZE(X) + LFD f25, 17 * SIZE(X) + LFD f26, 18 * SIZE(X) + LFD f27, 19 * SIZE(X) + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + LFD f28, 20 * SIZE(X) + LFD f29, 21 * SIZE(X) + LFD f30, 22 * SIZE(X) + LFD f31, 23 * SIZE(X) + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + LFD f24, 24 * SIZE(X) + LFD f25, 25 * SIZE(X) + LFD f26, 26 * SIZE(X) + LFD f27, 27 * SIZE(X) + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + LFD f28, 28 * SIZE(X) + LFD f29, 29 * SIZE(X) + LFD f30, 30 * SIZE(X) + LFD f31, 31 * SIZE(X) + +#ifndef POWER6 + L1_PREFETCH X, PREA +#endif + addi X, X, 16 * SIZE +#ifdef POWER6 + L1_PREFETCH X, PREA +#endif + + bdnz LL(10) + .align 4 + +LL(20): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + fsel f2, f18, f2, f10 + fsel f3, f19, f3, f11 + fsel f4, f20, f4, f12 + fsel f5, f21, f5, f13 + fsel f6, f22, f6, f14 + fsel f7, f23, f7, f15 + addi X, X, 16 * SIZE + .align 4 + +LL(50): + andi. 
r0, N, 15 + mtspr CTR, r0 + beq LL(100) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + addi X, X, 1 * SIZE + fabs f8, f8 + fsub f16, f1, f8 + fsel f1, f16, f1, f8 + bdnz LL(60) + .align 4 + +LL(100): + fsub f8, f0, f1 + fsub f9, f2, f3 + fsub f10, f4, f5 + fsub f11, f6, f7 + + fsel f0, f8, f0, f1 + fsel f2, f9, f2, f3 + fsel f4, f10, f4, f5 + fsel f6, f11, f6, f7 + + fsub f8, f0, f2 + fsub f9, f4, f6 + fsel f0, f8, f0, f2 + fsel f4, f9, f4, f6 + + fsub f8, f0, f4 + fsel f31, f8, f0, f4 + + lfs f1, FZERO + lfs f0, FONE + + fcmpu cr0, f1, f31 + beq- cr0, LL(9999) + + fdiv f30, f0, f31 + + fmr f0, f1 + fmr f2, f1 + fmr f3, f1 + fmr f4, f1 + fmr f5, f1 + fmr f6, f1 + fmr f7, f1 + + srawi. r0, NN, 4 + mtspr CTR, r0 + beq- cr0, LL(250) + + LFD f8, 0 * SIZE(XX) + LFD f9, 1 * SIZE(XX) + LFD f10, 2 * SIZE(XX) + LFD f11, 3 * SIZE(XX) + LFD f12, 4 * SIZE(XX) + LFD f13, 5 * SIZE(XX) + LFD f14, 6 * SIZE(XX) + LFD f15, 7 * SIZE(XX) + + fmul f16, f30, f8 + fmul f17, f30, f9 + fmul f18, f30, f10 + fmul f19, f30, f11 + + LFD f8, 8 * SIZE(XX) + LFD f9, 9 * SIZE(XX) + LFD f10, 10 * SIZE(XX) + LFD f11, 11 * SIZE(XX) + + fmul f20, f30, f12 + fmul f21, f30, f13 + fmul f22, f30, f14 + fmul f23, f30, f15 + + LFD f12, 12 * SIZE(XX) + LFD f13, 13 * SIZE(XX) + LFD f14, 14 * SIZE(XX) + LFD f15, 15 * SIZE(XX) + bdz LL(220) + .align 4 + +LL(210): + fmadd f0, f16, f16, f0 + fmul f16, f30, f8 + fmadd f1, f17, f17, f1 + fmul f17, f30, f9 + fmadd f2, f18, f18, f2 + fmul f18, f30, f10 + fmadd f3, f19, f19, f3 + fmul f19, f30, f11 + + LFD f8, 16 * SIZE(XX) + LFD f9, 17 * SIZE(XX) + LFD f10, 18 * SIZE(XX) + LFD f11, 19 * SIZE(XX) + + fmadd f4, f20, f20, f4 + fmul f20, f30, f12 + fmadd f5, f21, f21, f5 + fmul f21, f30, f13 + fmadd f6, f22, f22, f6 + fmul f22, f30, f14 + fmadd f7, f23, f23, f7 + fmul f23, f30, f15 + + LFD f12, 20 * SIZE(XX) + LFD f13, 21 * SIZE(XX) + LFD f14, 22 * SIZE(XX) + LFD f15, 23 * SIZE(XX) + + fmadd f0, f16, f16, f0 + fmul f16, f30, f8 + fmadd f1, f17, f17, f1 + fmul f17, f30, f9 + fmadd f2, f18, f18, f2 + fmul f18, f30, f10 + fmadd f3, f19, f19, f3 + fmul f19, f30, f11 + + LFD f8, 24 * SIZE(XX) + LFD f9, 25 * SIZE(XX) + LFD f10, 26 * SIZE(XX) + LFD f11, 27 * SIZE(XX) + + fmadd f4, f20, f20, f4 + fmul f20, f30, f12 + fmadd f5, f21, f21, f5 + fmul f21, f30, f13 + fmadd f6, f22, f22, f6 + fmul f22, f30, f14 + fmadd f7, f23, f23, f7 + fmul f23, f30, f15 + + LFD f12, 28 * SIZE(XX) + LFD f13, 29 * SIZE(XX) + LFD f14, 30 * SIZE(XX) + LFD f15, 31 * SIZE(XX) + +#ifndef POWER6 + L1_PREFETCH XX, PREA +#endif + addi XX, XX, 16 * SIZE +#ifdef POWER6 + L1_PREFETCH XX, PREA +#endif + + bdnz LL(210) + .align 4 + +LL(220): + fmadd f0, f16, f16, f0 + fmul f16, f30, f8 + fmadd f1, f17, f17, f1 + fmul f17, f30, f9 + fmadd f2, f18, f18, f2 + fmul f18, f30, f10 + fmadd f3, f19, f19, f3 + fmul f19, f30, f11 + + fmadd f4, f20, f20, f4 + fmul f20, f30, f12 + fmadd f5, f21, f21, f5 + fmul f21, f30, f13 + fmadd f6, f22, f22, f6 + fmul f22, f30, f14 + fmadd f7, f23, f23, f7 + fmul f23, f30, f15 + + fmadd f0, f16, f16, f0 + fmadd f1, f17, f17, f1 + fmadd f2, f18, f18, f2 + fmadd f3, f19, f19, f3 + fmadd f4, f20, f20, f4 + fmadd f5, f21, f21, f5 + fmadd f6, f22, f22, f6 + fmadd f7, f23, f23, f7 + + addi XX, XX, 16 * SIZE + .align 4 + +LL(250): + andi. 
r0, NN, 15 + mtspr CTR, r0 + beq- cr0, LL(270) + .align 4 + +LL(260): + LFD f8, 0 * SIZE(XX) + addi XX, XX, 1 * SIZE + + fmul f16, f30, f8 + fmadd f0, f16, f16, f0 + bdnz LL(260) + .align 4 + +LL(270): + fadd f0, f0, f1 + fadd f2, f2, f3 + fadd f4, f4, f5 + fadd f6, f6, f7 + + fadd f0, f0, f2 + fadd f4, f4, f6 + + fadd f0, f0, f4 + + fsqrt f0, f0 + fmul f1, f31, f0 + b LL(9999) + .align 4 + +LL(1000): + sub X, X, INCX + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(1050) + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + bdz LL(1020) + .align 4 + +LL(1010): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + bdnz LL(1010) + .align 4 + +LL(1020): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + fsel f2, f18, f2, f10 + fsel f3, f19, f3, f11 + fsel f4, f20, f4, f12 + fsel f5, f21, f5, f13 + fsel f6, f22, f6, f14 + fsel f7, f23, f7, f15 + .align 4 + +LL(1050): + andi. 
r0, N, 15 + mtspr CTR, r0 + beq LL(1999) + .align 4 + +LL(1060): + LFDUX f8, X, INCX + fabs f8, f8 + fsub f16, f1, f8 + fsel f1, f16, f1, f8 + bdnz LL(1060) + .align 4 + +LL(1999): + fsub f8, f0, f1 + fsub f9, f2, f3 + fsub f10, f4, f5 + fsub f11, f6, f7 + + fsel f0, f8, f0, f1 + fsel f2, f9, f2, f3 + fsel f4, f10, f4, f5 + fsel f6, f11, f6, f7 + + fsub f8, f0, f2 + fsub f9, f4, f6 + fsel f0, f8, f0, f2 + fsel f4, f9, f4, f6 + + fsub f8, f0, f4 + fsel f31, f8, f0, f4 + + lfs f1, FZERO + lfs f0, FONE + lfd f2, FMAX + + fcmpu cr0, f1, f31 + beq- cr0, LL(9999) + + fdiv f30, f0, f31 + + fmr f0, f1 + fmr f2, f1 + fmr f3, f1 + fmr f4, f1 + fmr f5, f1 + fmr f6, f1 + fmr f7, f1 + + sub XX, XX, INCX + + srawi. r0, NN, 4 + mtspr CTR, r0 + beq- cr0, LL(2150) + + LFDUX f8, XX, INCX + LFDUX f9, XX, INCX + LFDUX f10, XX, INCX + LFDUX f11, XX, INCX + LFDUX f12, XX, INCX + LFDUX f13, XX, INCX + LFDUX f14, XX, INCX + LFDUX f15, XX, INCX + + fmul f16, f30, f8 + fmul f17, f30, f9 + fmul f18, f30, f10 + fmul f19, f30, f11 + + LFDUX f8, XX, INCX + LFDUX f9, XX, INCX + LFDUX f10, XX, INCX + LFDUX f11, XX, INCX + + fmul f20, f30, f12 + fmul f21, f30, f13 + fmul f22, f30, f14 + fmul f23, f30, f15 + + LFDUX f12, XX, INCX + LFDUX f13, XX, INCX + LFDUX f14, XX, INCX + LFDUX f15, XX, INCX + bdz LL(2120) + .align 4 + +LL(2110): + fmadd f0, f16, f16, f0 + fmul f16, f30, f8 + fmadd f1, f17, f17, f1 + fmul f17, f30, f9 + fmadd f2, f18, f18, f2 + fmul f18, f30, f10 + fmadd f3, f19, f19, f3 + fmul f19, f30, f11 + + LFDUX f8, XX, INCX + LFDUX f9, XX, INCX + LFDUX f10, XX, INCX + LFDUX f11, XX, INCX + + fmadd f4, f20, f20, f4 + fmul f20, f30, f12 + fmadd f5, f21, f21, f5 + fmul f21, f30, f13 + fmadd f6, f22, f22, f6 + fmul f22, f30, f14 + fmadd f7, f23, f23, f7 + fmul f23, f30, f15 + + LFDUX f12, XX, INCX + LFDUX f13, XX, INCX + LFDUX f14, XX, INCX + LFDUX f15, XX, INCX + + fmadd f0, f16, f16, f0 + fmul f16, f30, f8 + fmadd f1, f17, f17, f1 + fmul f17, f30, f9 + fmadd f2, f18, f18, f2 + fmul f18, f30, f10 + fmadd f3, f19, f19, f3 + fmul f19, f30, f11 + + LFDUX f8, XX, INCX + LFDUX f9, XX, INCX + LFDUX f10, XX, INCX + LFDUX f11, XX, INCX + + fmadd f4, f20, f20, f4 + fmul f20, f30, f12 + fmadd f5, f21, f21, f5 + fmul f21, f30, f13 + fmadd f6, f22, f22, f6 + fmul f22, f30, f14 + fmadd f7, f23, f23, f7 + fmul f23, f30, f15 + + LFDUX f12, XX, INCX + LFDUX f13, XX, INCX + LFDUX f14, XX, INCX + LFDUX f15, XX, INCX + + bdnz LL(2110) + .align 4 + +LL(2120): + fmadd f0, f16, f16, f0 + fmul f16, f30, f8 + fmadd f1, f17, f17, f1 + fmul f17, f30, f9 + fmadd f2, f18, f18, f2 + fmul f18, f30, f10 + fmadd f3, f19, f19, f3 + fmul f19, f30, f11 + + fmadd f4, f20, f20, f4 + fmul f20, f30, f12 + fmadd f5, f21, f21, f5 + fmul f21, f30, f13 + fmadd f6, f22, f22, f6 + fmul f22, f30, f14 + fmadd f7, f23, f23, f7 + fmul f23, f30, f15 + + fmadd f0, f16, f16, f0 + fmadd f1, f17, f17, f1 + fmadd f2, f18, f18, f2 + fmadd f3, f19, f19, f3 + fmadd f4, f20, f20, f4 + fmadd f5, f21, f21, f5 + fmadd f6, f22, f22, f6 + fmadd f7, f23, f23, f7 + .align 4 + +LL(2150): + andi. 
r0, NN, 15 + mtspr CTR, r0 + beq- cr0, LL(2170) + .align 4 + +LL(2160): + LFDUX f8, XX, INCX + + fmul f16, f30, f8 + fmadd f0, f16, f16, f0 + bdnz LL(2160) + .align 4 + +LL(2170): + fadd f0, f0, f1 + fadd f2, f2, f3 + fadd f4, f4, f5 + fadd f6, f6, f7 + + fadd f0, f0, f2 + fadd f4, f4, f6 + + fadd f0, f0, f4 + + fsqrt f0, f0 + fmul f1, f31, f0 + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/rot.S b/kernel/power/rot.S new file mode 100644 index 0000000000..b9e9338ac7 --- /dev/null +++ b/kernel/power/rot.S @@ -0,0 +1,571 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 +#define Y r6 +#define INCY r7 +#define PREA r8 +#define XX r9 +#define YY r10 + +#define C f1 +#define S f2 + +#define STACKSIZE 32 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + slwi INCX, INCX, BASE_SHIFT + slwi INCY, INCY, BASE_SHIFT + + li PREA, L1_PREFETCHSIZE + + cmpwi cr0, N, 0 + ble- LL(999) + + cmpwi cr0, INCX, SIZE + bne- cr0, LL(100) + cmpwi cr0, INCY, SIZE + bne- cr0, LL(100) + + srawi. 
r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(50) + + LFD f0, 0 * SIZE(X) + LFD f4, 1 * SIZE(X) + LFD f6, 2 * SIZE(X) + LFD f8, 3 * SIZE(X) + + LFD f3, 0 * SIZE(Y) + LFD f5, 1 * SIZE(Y) + LFD f7, 2 * SIZE(Y) + LFD f9, 3 * SIZE(Y) + bdz LL(12) + .align 4 + +LL(10): + FMUL f10, C, f0 + FMUL f11, C, f3 + FMUL f12, C, f4 + FMUL f13, C, f5 + + FMUL f14, C, f6 + FMUL f15, C, f7 + FMUL f16, C, f8 + FMUL f17, C, f9 + + FMADD f10, S, f3, f10 + FNMSUB f11, S, f0, f11 + FMADD f12, S, f5, f12 + FNMSUB f13, S, f4, f13 + + FMADD f14, S, f7, f14 + FNMSUB f15, S, f6, f15 + FMADD f16, S, f9, f16 + FNMSUB f17, S, f8, f17 + + LFD f0, 4 * SIZE(X) + LFD f4, 5 * SIZE(X) + LFD f6, 6 * SIZE(X) + LFD f8, 7 * SIZE(X) + + LFD f3, 4 * SIZE(Y) + LFD f5, 5 * SIZE(Y) + LFD f7, 6 * SIZE(Y) + LFD f9, 7 * SIZE(Y) + + STFD f10, 0 * SIZE(X) + STFD f12, 1 * SIZE(X) + STFD f14, 2 * SIZE(X) + STFD f16, 3 * SIZE(X) + + STFD f11, 0 * SIZE(Y) + STFD f13, 1 * SIZE(Y) + STFD f15, 2 * SIZE(Y) + STFD f17, 3 * SIZE(Y) + + FMUL f10, C, f0 + FMUL f11, C, f3 + FMUL f12, C, f4 + FMUL f13, C, f5 + + FMUL f14, C, f6 + FMUL f15, C, f7 + FMUL f16, C, f8 + FMUL f17, C, f9 + + FMADD f10, S, f3, f10 + FNMSUB f11, S, f0, f11 + FMADD f12, S, f5, f12 + FNMSUB f13, S, f4, f13 + + FMADD f14, S, f7, f14 + FNMSUB f15, S, f6, f15 + FMADD f16, S, f9, f16 + FNMSUB f17, S, f8, f17 + + LFD f0, 8 * SIZE(X) + LFD f4, 9 * SIZE(X) + LFD f6, 10 * SIZE(X) + LFD f8, 11 * SIZE(X) + + LFD f3, 8 * SIZE(Y) + LFD f5, 9 * SIZE(Y) + LFD f7, 10 * SIZE(Y) + LFD f9, 11 * SIZE(Y) + + STFD f10, 4 * SIZE(X) + STFD f12, 5 * SIZE(X) + STFD f14, 6 * SIZE(X) + STFD f16, 7 * SIZE(X) + + STFD f11, 4 * SIZE(Y) + STFD f13, 5 * SIZE(Y) + STFD f15, 6 * SIZE(Y) + STFD f17, 7 * SIZE(Y) + + FMUL f10, C, f0 + FMUL f11, C, f3 + FMUL f12, C, f4 + FMUL f13, C, f5 + + FMUL f14, C, f6 + FMUL f15, C, f7 + FMUL f16, C, f8 + FMUL f17, C, f9 + + FMADD f10, S, f3, f10 + FNMSUB f11, S, f0, f11 + FMADD f12, S, f5, f12 + FNMSUB f13, S, f4, f13 + + FMADD f14, S, f7, f14 + FNMSUB f15, S, f6, f15 + FMADD f16, S, f9, f16 + FNMSUB f17, S, f8, f17 + + LFD f0, 12 * SIZE(X) + LFD f4, 13 * SIZE(X) + LFD f6, 14 * SIZE(X) + LFD f8, 15 * SIZE(X) + + LFD f3, 12 * SIZE(Y) + LFD f5, 13 * SIZE(Y) + LFD f7, 14 * SIZE(Y) + LFD f9, 15 * SIZE(Y) + + STFD f10, 8 * SIZE(X) + STFD f12, 9 * SIZE(X) + STFD f14, 10 * SIZE(X) + STFD f16, 11 * SIZE(X) + + STFD f11, 8 * SIZE(Y) + STFD f13, 9 * SIZE(Y) + STFD f15, 10 * SIZE(Y) + STFD f17, 11 * SIZE(Y) + + FMUL f10, C, f0 + FMUL f11, C, f3 + FMUL f12, C, f4 + FMUL f13, C, f5 + + FMUL f14, C, f6 + FMUL f15, C, f7 + FMUL f16, C, f8 + FMUL f17, C, f9 + + FMADD f10, S, f3, f10 + FNMSUB f11, S, f0, f11 + FMADD f12, S, f5, f12 + FNMSUB f13, S, f4, f13 + + FMADD f14, S, f7, f14 + FNMSUB f15, S, f6, f15 + FMADD f16, S, f9, f16 + FNMSUB f17, S, f8, f17 + + LFD f0, 16 * SIZE(X) + LFD f4, 17 * SIZE(X) + LFD f6, 18 * SIZE(X) + LFD f8, 19 * SIZE(X) + + LFD f3, 16 * SIZE(Y) + LFD f5, 17 * SIZE(Y) + LFD f7, 18 * SIZE(Y) + LFD f9, 19 * SIZE(Y) + + STFD f10, 12 * SIZE(X) + STFD f12, 13 * SIZE(X) + STFD f14, 14 * SIZE(X) + STFD f16, 15 * SIZE(X) + + STFD f11, 12 * SIZE(Y) + STFD f13, 13 * SIZE(Y) + STFD f15, 14 * SIZE(Y) + STFD f17, 15 * SIZE(Y) + +#ifndef POWER6 + dcbtst X, PREA +#endif + addi X, X, 16 * SIZE + addi Y, Y, 16 * SIZE + +#ifdef POWER6 + dcbtst X, PREA + dcbtst X, PREA +#endif + bdnz LL(10) + .align 4 + +LL(12): + FMUL f10, C, f0 + FMUL f11, C, f3 + FMUL f12, C, f4 + FMUL f13, C, f5 + + FMUL f14, C, f6 + FMUL f15, C, f7 + FMUL f16, C, f8 + FMUL f17, C, f9 + + FMADD f10, S, f3, f10 + FNMSUB f11, 
S, f0, f11 + FMADD f12, S, f5, f12 + FNMSUB f13, S, f4, f13 + + FMADD f14, S, f7, f14 + FNMSUB f15, S, f6, f15 + FMADD f16, S, f9, f16 + FNMSUB f17, S, f8, f17 + + STFD f10, 0 * SIZE(X) + STFD f12, 1 * SIZE(X) + STFD f14, 2 * SIZE(X) + STFD f16, 3 * SIZE(X) + + STFD f11, 0 * SIZE(Y) + STFD f13, 1 * SIZE(Y) + STFD f15, 2 * SIZE(Y) + STFD f17, 3 * SIZE(Y) + + LFD f0, 4 * SIZE(X) + LFD f4, 5 * SIZE(X) + LFD f6, 6 * SIZE(X) + LFD f8, 7 * SIZE(X) + + LFD f3, 4 * SIZE(Y) + LFD f5, 5 * SIZE(Y) + LFD f7, 6 * SIZE(Y) + LFD f9, 7 * SIZE(Y) + + FMUL f10, C, f0 + FMUL f11, C, f3 + FMUL f12, C, f4 + FMUL f13, C, f5 + + FMUL f14, C, f6 + FMUL f15, C, f7 + FMUL f16, C, f8 + FMUL f17, C, f9 + + FMADD f10, S, f3, f10 + FNMSUB f11, S, f0, f11 + FMADD f12, S, f5, f12 + FNMSUB f13, S, f4, f13 + + FMADD f14, S, f7, f14 + FNMSUB f15, S, f6, f15 + FMADD f16, S, f9, f16 + FNMSUB f17, S, f8, f17 + + STFD f10, 4 * SIZE(X) + STFD f12, 5 * SIZE(X) + STFD f14, 6 * SIZE(X) + STFD f16, 7 * SIZE(X) + + STFD f11, 4 * SIZE(Y) + STFD f13, 5 * SIZE(Y) + STFD f15, 6 * SIZE(Y) + STFD f17, 7 * SIZE(Y) + + LFD f0, 8 * SIZE(X) + LFD f4, 9 * SIZE(X) + LFD f6, 10 * SIZE(X) + LFD f8, 11 * SIZE(X) + + LFD f3, 8 * SIZE(Y) + LFD f5, 9 * SIZE(Y) + LFD f7, 10 * SIZE(Y) + LFD f9, 11 * SIZE(Y) + + FMUL f10, C, f0 + FMUL f11, C, f3 + FMUL f12, C, f4 + FMUL f13, C, f5 + + FMUL f14, C, f6 + FMUL f15, C, f7 + FMUL f16, C, f8 + FMUL f17, C, f9 + + FMADD f10, S, f3, f10 + FNMSUB f11, S, f0, f11 + FMADD f12, S, f5, f12 + FNMSUB f13, S, f4, f13 + + FMADD f14, S, f7, f14 + FNMSUB f15, S, f6, f15 + FMADD f16, S, f9, f16 + FNMSUB f17, S, f8, f17 + + STFD f10, 8 * SIZE(X) + STFD f12, 9 * SIZE(X) + STFD f14, 10 * SIZE(X) + STFD f16, 11 * SIZE(X) + + STFD f11, 8 * SIZE(Y) + STFD f13, 9 * SIZE(Y) + STFD f15, 10 * SIZE(Y) + STFD f17, 11 * SIZE(Y) + + LFD f0, 12 * SIZE(X) + LFD f4, 13 * SIZE(X) + LFD f6, 14 * SIZE(X) + LFD f8, 15 * SIZE(X) + + LFD f3, 12 * SIZE(Y) + LFD f5, 13 * SIZE(Y) + LFD f7, 14 * SIZE(Y) + LFD f9, 15 * SIZE(Y) + + FMUL f10, C, f0 + FMUL f11, C, f3 + FMUL f12, C, f4 + FMUL f13, C, f5 + + FMUL f14, C, f6 + FMUL f15, C, f7 + FMUL f16, C, f8 + FMUL f17, C, f9 + + FMADD f10, S, f3, f10 + FNMSUB f11, S, f0, f11 + FMADD f12, S, f5, f12 + FNMSUB f13, S, f4, f13 + + FMADD f14, S, f7, f14 + FNMSUB f15, S, f6, f15 + FMADD f16, S, f9, f16 + FNMSUB f17, S, f8, f17 + + STFD f10, 12 * SIZE(X) + STFD f12, 13 * SIZE(X) + STFD f14, 14 * SIZE(X) + STFD f16, 15 * SIZE(X) + + STFD f11, 12 * SIZE(Y) + STFD f13, 13 * SIZE(Y) + STFD f15, 14 * SIZE(Y) + STFD f17, 15 * SIZE(Y) + + addi X, X, 16 * SIZE + addi Y, Y, 16 * SIZE + .align 4 + +LL(50): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f3, 0 * SIZE(X) + LFD f4, 0 * SIZE(Y) + + FMUL f10, C, f3 + FMUL f11, C, f4 + + FMADD f10, S, f4, f10 + FNMSUB f11, S, f3, f11 + + STFD f10, 0 * SIZE(X) + STFD f11, 0 * SIZE(Y) + + addi X, X, 1 * SIZE + addi Y, Y, 1 * SIZE + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): + sub X, X, INCX + sub Y, Y, INCY + + mr XX, X + mr YY, Y + + srawi. 
r0, N, 3 + mtspr CTR, r0 + beq- LL(150) + .align 4 + +LL(110): + LFDUX f0, X, INCX + LFDUX f3, Y, INCY + LFDUX f4, X, INCX + LFDUX f5, Y, INCY + LFDUX f6, X, INCX + LFDUX f7, Y, INCY + LFDUX f8, X, INCX + LFDUX f9, Y, INCY + + FMUL f10, C, f0 + FMUL f11, C, f3 + FMUL f12, C, f4 + FMUL f13, C, f5 + FMUL f14, C, f6 + FMUL f15, C, f7 + FMUL f16, C, f8 + FMUL f17, C, f9 + + FMADD f10, S, f3, f10 + FNMSUB f11, S, f0, f11 + FMADD f12, S, f5, f12 + FNMSUB f13, S, f4, f13 + FMADD f14, S, f7, f14 + FNMSUB f15, S, f6, f15 + FMADD f16, S, f9, f16 + FNMSUB f17, S, f8, f17 + + STFDUX f10, XX, INCX + STFDUX f11, YY, INCY + STFDUX f12, XX, INCX + STFDUX f13, YY, INCY + STFDUX f14, XX, INCX + STFDUX f15, YY, INCY + STFDUX f16, XX, INCX + STFDUX f17, YY, INCY + + LFDUX f0, X, INCX + LFDUX f3, Y, INCY + LFDUX f4, X, INCX + LFDUX f5, Y, INCY + LFDUX f6, X, INCX + LFDUX f7, Y, INCY + LFDUX f8, X, INCX + LFDUX f9, Y, INCY + + FMUL f10, C, f0 + FMUL f11, C, f3 + FMUL f12, C, f4 + FMUL f13, C, f5 + FMUL f14, C, f6 + FMUL f15, C, f7 + FMUL f16, C, f8 + FMUL f17, C, f9 + + FMADD f10, S, f3, f10 + FNMSUB f11, S, f0, f11 + FMADD f12, S, f5, f12 + FNMSUB f13, S, f4, f13 + FMADD f14, S, f7, f14 + FNMSUB f15, S, f6, f15 + FMADD f16, S, f9, f16 + FNMSUB f17, S, f8, f17 + + STFDUX f10, XX, INCX + STFDUX f11, YY, INCY + STFDUX f12, XX, INCX + STFDUX f13, YY, INCY + STFDUX f14, XX, INCX + STFDUX f15, YY, INCY + STFDUX f16, XX, INCX + STFDUX f17, YY, INCY + + bdnz LL(110) + .align 4 + +LL(150): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f0, X, INCX + LFDUX f3, Y, INCY + + FMUL f10, C, f0 + FMUL f11, C, f3 + + FMADD f10, S, f3, f10 + FNMSUB f11, S, f0, f11 + + STFDUX f10, XX, INCX + STFDUX f11, YY, INCY + bdnz LL(160) + .align 4 + +LL(999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE +#endif diff --git a/kernel/power/rot_ppc440.S b/kernel/power/rot_ppc440.S new file mode 100644 index 0000000000..bb19583b7d --- /dev/null +++ b/kernel/power/rot_ppc440.S @@ -0,0 +1,286 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
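The nrm2.S kernel earlier in this patch computes the Euclidean norm in two passes to keep the intermediate squares in range: pass one scans for the largest absolute value (fabs plus fsel), and if it is nonzero, pass two accumulates the squares of the elements multiplied by its reciprocal (one fdiv, then fmadd chains), with the final result rescaled as max * sqrt(sum). A minimal C sketch of that scheme follows; nrm2_ref is a hypothetical name and unit stride is assumed for brevity.

#include <math.h>
#include <stddef.h>

/* Two-pass scaled 2-norm: pass 1 finds max|x[i]|, pass 2 sums (x[i]/max)^2. */
double nrm2_ref(size_t n, const double *x)
{
    if (n == 0) return 0.0;

    double amax = fabs(x[0]);
    for (size_t i = 1; i < n; i++) {       /* pass 1: largest magnitude */
        double a = fabs(x[i]);
        if (a > amax) amax = a;
    }
    if (amax == 0.0) return 0.0;           /* all zero: norm is zero */

    double scale = 1.0 / amax;             /* the kernel's fdiv of FONE by the max */
    double sum = 0.0;
    for (size_t i = 0; i < n; i++) {       /* pass 2: scaled sum of squares */
        double t = x[i] * scale;
        sum += t * t;                      /* one fmadd per element */
    }
    return amax * sqrt(sum);               /* undo the scaling */
}

Dividing by the maximum keeps every squared term at or below 1.0, so the partial sums stay well inside the representable range even when the unscaled squares would overflow.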
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 +#define Y r6 +#define INCY r7 +#define PRE r8 +#define XX r9 +#define YY r10 + +#define C f1 +#define S f2 + +#define STACKSIZE 32 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + slwi INCX, INCX, BASE_SHIFT + slwi INCY, INCY, BASE_SHIFT + + li PRE, 2 * 16 * SIZE + + cmpwi cr0, N, 0 + ble- LL(999) + + sub X, X, INCX + sub Y, Y, INCY + + mr XX, X + mr YY, Y + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(150) + + LFDUX f0, X, INCX + LFDUX f3, Y, INCY + LFDUX f4, X, INCX + + FMUL f10, C, f0 + LFDUX f5, Y, INCY + FMUL f11, C, f3 + LFDUX f6, X, INCX + FMUL f12, C, f4 + LFDUX f7, Y, INCY + FMUL f13, C, f5 + LFDUX f8, X, INCX + + FMADD f10, S, f3, f10 + LFDUX f9, Y, INCY + FNMSUB f11, S, f0, f11 + LFDUX f0, X, INCX + FMADD f12, S, f5, f12 + LFDUX f3, Y, INCY + FNMSUB f13, S, f4, f13 + LFDUX f4, X, INCX + + bdz LL(111) + .align 4 + +LL(110): + FMUL f14, C, f6 + LFDUX f5, Y, INCY + FMUL f15, C, f7 + STFDUX f10, XX, INCX + FMUL f16, C, f8 + STFDUX f11, YY, INCY + FMUL f17, C, f9 + STFDUX f12, XX, INCX + +#ifdef PPCG4 + dcbtst X, PRE +#endif + + FMADD f14, S, f7, f14 + STFDUX f13, YY, INCY + FNMSUB f15, S, f6, f15 + LFDUX f6, X, INCX + FMADD f16, S, f9, f16 + LFDUX f7, Y, INCY + FNMSUB f17, S, f8, f17 + LFDUX f8, X, INCX + + FMUL f10, C, f0 + LFDUX f9, Y, INCY + FMUL f11, C, f3 + STFDUX f14, XX, INCX + FMUL f12, C, f4 + STFDUX f15, YY, INCY + FMUL f13, C, f5 + STFDUX f16, XX, INCX + +#ifdef PPCG4 + dcbtst Y, PRE +#endif + + FMADD f10, S, f3, f10 + STFDUX f17, YY, INCY + FNMSUB f11, S, f0, f11 + LFDUX f0, X, INCX + FMADD f12, S, f5, f12 + LFDUX f3, Y, INCY + FNMSUB f13, S, f4, f13 + LFDUX f4, X, INCX + + FMUL f14, C, f6 + LFDUX f5, Y, INCY + FMUL f15, C, f7 + STFDUX f10, XX, INCX + FMUL f16, C, f8 + STFDUX f11, YY, INCY + FMUL f17, C, f9 + STFDUX f12, XX, INCX + +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PRE +#endif + + FMADD f14, S, f7, f14 + STFDUX f13, YY, INCY + FNMSUB f15, S, f6, f15 + LFDUX f6, X, INCX + FMADD f16, S, f9, f16 + LFDUX f7, Y, INCY + FNMSUB f17, S, f8, f17 + LFDUX f8, X, INCX + + FMUL f10, C, f0 + LFDUX f9, Y, INCY + FMUL f11, C, f3 + STFDUX f14, XX, INCX + FMUL f12, C, f4 + STFDUX f15, YY, INCY + FMUL f13, C, f5 + STFDUX f16, XX, INCX + +#if defined(PPCG4) && defined(DOUBLE) + dcbtst Y, PRE +#endif + + FMADD f10, S, f3, f10 + STFDUX f17, YY, INCY + FNMSUB f11, S, f0, f11 + LFDUX f0, X, INCX + FMADD f12, S, f5, f12 + LFDUX f3, Y, INCY + FNMSUB f13, S, f4, f13 + LFDUX f4, X, INCX + + 
bdnz LL(110) + .align 4 + +LL(111): + FMUL f14, C, f6 + LFDUX f5, Y, INCY + FMUL f15, C, f7 + STFDUX f10, XX, INCX + FMUL f16, C, f8 + STFDUX f11, YY, INCY + FMUL f17, C, f9 + STFDUX f12, XX, INCX + + FMADD f14, S, f7, f14 + STFDUX f13, YY, INCY + FNMSUB f15, S, f6, f15 + LFDUX f6, X, INCX + FMADD f16, S, f9, f16 + LFDUX f7, Y, INCY + FNMSUB f17, S, f8, f17 + LFDUX f8, X, INCX + + FMUL f10, C, f0 + LFDUX f9, Y, INCY + FMUL f11, C, f3 + STFDUX f14, XX, INCX + FMUL f12, C, f4 + STFDUX f15, YY, INCY + FMUL f13, C, f5 + STFDUX f16, XX, INCX + + FMUL f14, C, f6 + STFDUX f17, YY, INCY + FMUL f15, C, f7 + FMUL f16, C, f8 + FMUL f17, C, f9 + + FMADD f10, S, f3, f10 + FNMSUB f11, S, f0, f11 + FMADD f12, S, f5, f12 + FNMSUB f13, S, f4, f13 + + FMADD f14, S, f7, f14 + STFDUX f10, XX, INCX + FNMSUB f15, S, f6, f15 + STFDUX f11, YY, INCY + FMADD f16, S, f9, f16 + STFDUX f12, XX, INCX + FNMSUB f17, S, f8, f17 + STFDUX f13, YY, INCY + + STFDUX f14, XX, INCX + STFDUX f15, YY, INCY + STFDUX f16, XX, INCX + STFDUX f17, YY, INCY + .align 4 + +LL(150): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f0, X, INCX + LFDUX f3, Y, INCY + + FMUL f10, C, f0 + FMUL f11, C, f3 + + FMADD f10, S, f3, f10 + FNMSUB f11, S, f0, f11 + + STFDUX f10, XX, INCX + STFDUX f11, YY, INCY + bdnz LL(160) + .align 4 + +LL(999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/scal.S b/kernel/power/scal.S new file mode 100644 index 0000000000..f242f083c1 --- /dev/null +++ b/kernel/power/scal.S @@ -0,0 +1,401 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
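rot.S and rot_ppc440.S above both apply the plane (Givens) rotation of the BLAS ROT routine: each pair (x[i], y[i]) is replaced by (c*x[i] + s*y[i], c*y[i] - s*x[i]), which is what the FMUL/FMADD/FNMSUB groups compute before the results are stored back through XX/YY. A hedged C sketch with a hypothetical helper name, assuming nonnegative strides:

#include <stddef.h>

/* Apply the rotation [ c  s ; -s  c ] to the pairs (x[i], y[i]). */
void rot_ref(size_t n, double *x, size_t incx,
                       double *y, size_t incy, double c, double s)
{
    for (size_t i = 0; i < n; i++) {
        double xi = x[i * incx];
        double yi = y[i * incy];
        x[i * incx] = c * xi + s * yi;     /* FMUL + FMADD  */
        y[i * incy] = c * yi - s * xi;     /* FMUL + FNMSUB */
    }
}

Both old values are read into registers before either store, which is why the assembly loads a whole group of elements ahead of the corresponding stores instead of updating one value at a time in place.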
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define XX r4 +#define PREA r5 + +#ifdef linux +#ifndef __64BIT__ +#define X r6 +#define INCX r7 +#else +#define X r7 +#define INCX r8 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define X r8 +#define INCX r9 +#else +#define X r7 +#define INCX r8 +#endif +#endif + +#define FZERO f0 +#define ALPHA f1 + + PROLOGUE + PROFCODE + + addi SP, SP, -8 + li r0, 0 + + stw r0, 0(SP) + lfs FZERO, 0(SP) + + addi SP, SP, 8 + + slwi INCX, INCX, BASE_SHIFT + + li PREA, L1_PREFETCHSIZE + + cmpwi cr0, N, 0 + blelr- cr0 + + fcmpu cr0, FZERO, ALPHA + bne- cr0, LL(A1I1) + + cmpwi cr0, INCX, SIZE + bne- cr0, LL(A0IN) + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(A0I1_Remain) + .align 4 + +LL(A0I1_kernel): + STFD FZERO, 0 * SIZE(X) + STFD FZERO, 1 * SIZE(X) + STFD FZERO, 2 * SIZE(X) + STFD FZERO, 3 * SIZE(X) + STFD FZERO, 4 * SIZE(X) + STFD FZERO, 5 * SIZE(X) + STFD FZERO, 6 * SIZE(X) + STFD FZERO, 7 * SIZE(X) + + STFD FZERO, 8 * SIZE(X) + STFD FZERO, 9 * SIZE(X) + STFD FZERO, 10 * SIZE(X) + STFD FZERO, 11 * SIZE(X) + STFD FZERO, 12 * SIZE(X) + STFD FZERO, 13 * SIZE(X) + STFD FZERO, 14 * SIZE(X) + STFD FZERO, 15 * SIZE(X) + + addi X, X, 16 * SIZE + bdnz LL(A0I1_kernel) + .align 4 + +LL(A0I1_Remain): + andi. r0, N, 15 + mtspr CTR, r0 + beqlr+ + .align 4 + +LL(A0I1_RemainKernel): + STFD FZERO, 0 * SIZE(X) + addi X, X, 1 * SIZE + bdnz LL(A0I1_RemainKernel) + blr + .align 4 + +LL(A0IN): + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(A0IN_Remain) + .align 4 + +LL(A0IN_Kernel): + dcbtst X, PREA + STFD FZERO, 0 * SIZE(X) + add X, X, INCX + STFD FZERO, 0 * SIZE(X) + add X, X, INCX + STFD FZERO, 0 * SIZE(X) + add X, X, INCX + STFD FZERO, 0 * SIZE(X) + add X, X, INCX + STFD FZERO, 0 * SIZE(X) + add X, X, INCX + STFD FZERO, 0 * SIZE(X) + add X, X, INCX + STFD FZERO, 0 * SIZE(X) + add X, X, INCX + STFD FZERO, 0 * SIZE(X) + add X, X, INCX + bdnz LL(A0IN_Kernel) + .align 4 + +LL(A0IN_Remain): + andi. r0, N, 7 + mtspr CTR, r0 + beqlr+ + .align 4 + +LL(A0IN_RemainKernel): + STFD FZERO, 0 * SIZE(X) + add X, X, INCX + bdnz LL(A0IN_RemainKernel) + blr + .align 4 + +LL(A1I1): + cmpwi cr0, INCX, SIZE + bne- LL(A1IN) + + mr XX, X + + srawi. 
r0, N, 4 + mtspr CTR, r0 + beq+ LL(A1I1_Remain) + + LFD f2, 0 * SIZE(X) + LFD f3, 1 * SIZE(X) + LFD f4, 2 * SIZE(X) + LFD f5, 3 * SIZE(X) + LFD f6, 4 * SIZE(X) + LFD f7, 5 * SIZE(X) + LFD f8, 6 * SIZE(X) + LFD f9, 7 * SIZE(X) + bdz LL(13) + .align 4 + +LL(A1I1_kernel): + FMUL f10, ALPHA, f2 + FMUL f11, ALPHA, f3 + FMUL f12, ALPHA, f4 + FMUL f13, ALPHA, f5 + + LFD f2, 8 * SIZE(X) + LFD f3, 9 * SIZE(X) + LFD f4, 10 * SIZE(X) + LFD f5, 11 * SIZE(X) + + STFD f10, 0 * SIZE(X) + STFD f11, 1 * SIZE(X) + STFD f12, 2 * SIZE(X) + STFD f13, 3 * SIZE(X) + + FMUL f10, ALPHA, f6 + FMUL f11, ALPHA, f7 + FMUL f12, ALPHA, f8 + FMUL f13, ALPHA, f9 + + LFD f6, 12 * SIZE(X) + LFD f7, 13 * SIZE(X) + LFD f8, 14 * SIZE(X) + LFD f9, 15 * SIZE(X) + + STFD f10, 4 * SIZE(X) + STFD f11, 5 * SIZE(X) + STFD f12, 6 * SIZE(X) + STFD f13, 7 * SIZE(X) + + FMUL f10, ALPHA, f2 + FMUL f11, ALPHA, f3 + FMUL f12, ALPHA, f4 + FMUL f13, ALPHA, f5 + + LFD f2, 16 * SIZE(X) + LFD f3, 17 * SIZE(X) + LFD f4, 18 * SIZE(X) + LFD f5, 19 * SIZE(X) + + STFD f10, 8 * SIZE(X) + STFD f11, 9 * SIZE(X) + STFD f12, 10 * SIZE(X) + STFD f13, 11 * SIZE(X) + + FMUL f10, ALPHA, f6 + FMUL f11, ALPHA, f7 + FMUL f12, ALPHA, f8 + FMUL f13, ALPHA, f9 + + LFD f6, 20 * SIZE(X) + LFD f7, 21 * SIZE(X) + LFD f8, 22 * SIZE(X) + LFD f9, 23 * SIZE(X) + + STFD f10, 12 * SIZE(X) + STFD f11, 13 * SIZE(X) + STFD f12, 14 * SIZE(X) + STFD f13, 15 * SIZE(X) + + addi X, X, 16 * SIZE + dcbtst X, PREA + bdnz LL(A1I1_kernel) + .align 4 + +LL(13): + FMUL f10, ALPHA, f2 + FMUL f11, ALPHA, f3 + FMUL f12, ALPHA, f4 + FMUL f13, ALPHA, f5 + + LFD f2, 8 * SIZE(X) + LFD f3, 9 * SIZE(X) + LFD f4, 10 * SIZE(X) + LFD f5, 11 * SIZE(X) + + STFD f10, 0 * SIZE(X) + STFD f11, 1 * SIZE(X) + STFD f12, 2 * SIZE(X) + STFD f13, 3 * SIZE(X) + + FMUL f10, ALPHA, f6 + FMUL f11, ALPHA, f7 + FMUL f12, ALPHA, f8 + FMUL f13, ALPHA, f9 + + LFD f6, 12 * SIZE(X) + LFD f7, 13 * SIZE(X) + LFD f8, 14 * SIZE(X) + LFD f9, 15 * SIZE(X) + + STFD f10, 4 * SIZE(X) + STFD f11, 5 * SIZE(X) + STFD f12, 6 * SIZE(X) + STFD f13, 7 * SIZE(X) + + FMUL f10, ALPHA, f2 + FMUL f11, ALPHA, f3 + FMUL f12, ALPHA, f4 + FMUL f13, ALPHA, f5 + + STFD f10, 8 * SIZE(X) + STFD f11, 9 * SIZE(X) + STFD f12, 10 * SIZE(X) + STFD f13, 11 * SIZE(X) + + FMUL f10, ALPHA, f6 + FMUL f11, ALPHA, f7 + FMUL f12, ALPHA, f8 + FMUL f13, ALPHA, f9 + + STFD f10, 12 * SIZE(X) + STFD f11, 13 * SIZE(X) + STFD f12, 14 * SIZE(X) + STFD f13, 15 * SIZE(X) + + addi X, X, 16 * SIZE + .align 4 + +LL(A1I1_Remain): + andi. r0, N, 15 + mtspr CTR, r0 + beqlr+ + .align 4 + +LL(A1I1_RemainKernel): + LFD f2, 0 * SIZE(X) + FMUL f2, ALPHA, f2 + STFD f2, 0 * SIZE(X) + addi X, X, 1 * SIZE + bdnz LL(A1I1_RemainKernel) + blr + .align 4 + +LL(A1IN): + mr XX, X + + srawi. 
r0, N, 3 + mtspr CTR, r0 + beq- LL(A1IN_Remain) + .align 4 + +LL(A1IN_Kernel): + LFD f2, 0 * SIZE(XX) + add XX, XX, INCX + LFD f3, 0 * SIZE(XX) + add XX, XX, INCX + LFD f4, 0 * SIZE(XX) + add XX, XX, INCX + LFD f5, 0 * SIZE(XX) + add XX, XX, INCX + + FMUL f2, ALPHA, f2 + FMUL f3, ALPHA, f3 + FMUL f4, ALPHA, f4 + FMUL f5, ALPHA, f5 + + LFD f6, 0 * SIZE(XX) + add XX, XX, INCX + LFD f7, 0 * SIZE(XX) + add XX, XX, INCX + LFD f8, 0 * SIZE(XX) + add XX, XX, INCX + LFD f9, 0 * SIZE(XX) + add XX, XX, INCX + + FMUL f6, ALPHA, f6 + FMUL f7, ALPHA, f7 + FMUL f8, ALPHA, f8 + FMUL f9, ALPHA, f9 + + STFD f2, 0 * SIZE(X) + add X, X, INCX + STFD f3, 0 * SIZE(X) + add X, X, INCX + STFD f4, 0 * SIZE(X) + add X, X, INCX + STFD f5, 0 * SIZE(X) + add X, X, INCX + STFD f6, 0 * SIZE(X) + add X, X, INCX + STFD f7, 0 * SIZE(X) + add X, X, INCX + STFD f8, 0 * SIZE(X) + add X, X, INCX + STFD f9, 0 * SIZE(X) + add X, X, INCX + bdnz LL(A1IN_Kernel) + .align 4 + +LL(A1IN_Remain): + andi. r0, N, 7 + mtspr CTR, r0 + beqlr+ + .align 4 + +LL(A1IN_RemainKernel): + LFD f2, 0 * SIZE(XX) + add XX, XX, INCX + FMUL f2, ALPHA, f2 + STFD f2, 0 * SIZE(X) + add X, X, INCX + bdnz LL(A1IN_RemainKernel) + blr + + EPILOGUE diff --git a/kernel/power/scal_hummer.S b/kernel/power/scal_hummer.S new file mode 100644 index 0000000000..0b584862ab --- /dev/null +++ b/kernel/power/scal_hummer.S @@ -0,0 +1,477 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
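scal.S above (and the scal_hummer.S variant that continues below) scales a vector in place by ALPHA, with a separate fast path for ALPHA == 0 that only stores FZERO and never loads the old values. A minimal C sketch of that behaviour; scal_ref is a hypothetical name and the stride is assumed nonnegative.

#include <stddef.h>

/* x[i*incx] *= alpha for i = 0..n-1, with a store-only path for alpha == 0. */
void scal_ref(size_t n, double alpha, double *x, size_t incx)
{
    if (alpha == 0.0) {
        for (size_t i = 0; i < n; i++)     /* zero fill, no loads needed */
            x[i * incx] = 0.0;
        return;
    }
    for (size_t i = 0; i < n; i++)         /* general path: load, multiply, store */
        x[i * incx] *= alpha;
}

Splitting out the alpha == 0 case roughly halves the memory traffic for the common "clear a vector" use, which is why the kernel compares ALPHA against FZERO before choosing a loop.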
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r6 +#define INCX r7 + +#define INCX2 r4 +#define X2 r5 + +#define ALPHA f1 + +#define A1 f0 +#define A2 f16 +#define A3 f2 +#define A4 f3 +#define A5 f4 +#define A6 f5 +#define A7 f6 +#define A8 f7 + +#define B1 f8 +#define B2 f9 +#define B3 f10 +#define B4 f11 +#define B5 f12 +#define B6 f13 +#define B7 f14 +#define B8 f15 + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + stfpdux f16, SP, r10 + + li r10, 0 + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + + lfpdx A1, SP, r10 # Zero clear + fsmfp ALPHA, ALPHA + + slwi INCX, INCX, BASE_SHIFT + add INCX2, INCX, INCX + + cmpwi cr0, N, 0 + ble LL(999) + + cmpwi cr0, INCX, SIZE + bne LL(100) + + fcmpu cr7, ALPHA, A1 + bne cr7, LL(50) + + sub X, X, INCX2 + + andi. r0, X, 2 * SIZE - 1 + beq LL(11) + + STFDX A1, X, INCX2 + addi X, X, 1 * SIZE + addi N, N, -1 + cmpwi cr0, N, 0 + ble LL(999) + .align 4 + +LL(11): + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(15) + .align 4 + +LL(12): + STFPDUX A1, X, INCX2 + STFPDUX A1, X, INCX2 + STFPDUX A1, X, INCX2 + STFPDUX A1, X, INCX2 + STFPDUX A1, X, INCX2 + STFPDUX A1, X, INCX2 + STFPDUX A1, X, INCX2 + STFPDUX A1, X, INCX2 + bdnz LL(12) + .align 4 + +LL(15): + andi. r0, N, 15 + beq LL(999) + andi. r0, N, 8 + beq LL(16) + + STFPDUX A1, X, INCX2 + STFPDUX A1, X, INCX2 + STFPDUX A1, X, INCX2 + STFPDUX A1, X, INCX2 + .align 4 + +LL(16): + andi. r0, N, 4 + beq LL(17) + + STFPDUX A1, X, INCX2 + STFPDUX A1, X, INCX2 + .align 4 + +LL(17): + andi. r0, N, 2 + beq LL(18) + + STFPDUX A1, X, INCX2 + .align 4 + +LL(18): + andi. r0, N, 1 + beq LL(999) + STFDUX A1, X, INCX2 + b LL(999) + .align 4 + +LL(50): + sub X2, X, INCX2 + sub X, X, INCX2 + + andi. r0, X, 2 * SIZE - 1 + beq LL(51) + + LFDX A1, X, INCX2 + addi X, X, 1 * SIZE + + fmul B1, ALPHA, A1 + addi N, N, -1 + cmpwi cr0, N, 0 + + STFDX B1, X2, INCX2 + addi X2, X2, 1 * SIZE + ble LL(999) + .align 4 + +LL(51): + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(55) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + LFPDUX A5, X, INCX2 + LFPDUX A6, X, INCX2 + LFPDUX A7, X, INCX2 + LFPDUX A8, X, INCX2 + bdz LL(53) + .align 4 + +LL(52): + fpmul B1, ALPHA, A1 + LFPDUX A1, X, INCX2 + fpmul B2, ALPHA, A2 + LFPDUX A2, X, INCX2 + fpmul B3, ALPHA, A3 + LFPDUX A3, X, INCX2 + fpmul B4, ALPHA, A4 + LFPDUX A4, X, INCX2 + fpmul B5, ALPHA, A5 + LFPDUX A5, X, INCX2 + fpmul B6, ALPHA, A6 + LFPDUX A6, X, INCX2 + fpmul B7, ALPHA, A7 + LFPDUX A7, X, INCX2 + fpmul B8, ALPHA, A8 + LFPDUX A8, X, INCX2 + + STFPDUX B1, X2, INCX2 + STFPDUX B2, X2, INCX2 + STFPDUX B3, X2, INCX2 + STFPDUX B4, X2, INCX2 + STFPDUX B5, X2, INCX2 + STFPDUX B6, X2, INCX2 + STFPDUX B7, X2, INCX2 + STFPDUX B8, X2, INCX2 + bdnz LL(52) + .align 4 + +LL(53): + fpmul B1, ALPHA, A1 + fpmul B2, ALPHA, A2 + fpmul B3, ALPHA, A3 + fpmul B4, ALPHA, A4 + fpmul B5, ALPHA, A5 + fpmul B6, ALPHA, A6 + STFPDUX B1, X2, INCX2 + fpmul B7, ALPHA, A7 + STFPDUX B2, X2, INCX2 + fpmul B8, ALPHA, A8 + STFPDUX B3, X2, INCX2 + + STFPDUX B4, X2, INCX2 + STFPDUX B5, X2, INCX2 + STFPDUX B6, X2, INCX2 + STFPDUX B7, X2, INCX2 + STFPDUX B8, X2, INCX2 + .align 4 + +LL(55): + andi. r0, N, 15 + beq LL(999) + andi. 
r0, N, 8 + beq LL(56) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + fpmul B1, ALPHA, A1 + fpmul B2, ALPHA, A2 + fpmul B3, ALPHA, A3 + fpmul B4, ALPHA, A4 + + STFPDUX B1, X2, INCX2 + STFPDUX B2, X2, INCX2 + STFPDUX B3, X2, INCX2 + STFPDUX B4, X2, INCX2 + .align 4 + +LL(56): + andi. r0, N, 4 + beq LL(57) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + fpmul B1, ALPHA, A1 + fpmul B2, ALPHA, A2 + STFPDUX B1, X2, INCX2 + STFPDUX B2, X2, INCX2 + .align 4 + +LL(57): + andi. r0, N, 2 + beq LL(58) + + LFPDUX A1, X, INCX2 + fpmul B1, ALPHA, A1 + STFPDUX B1, X2, INCX2 + .align 4 + +LL(58): + andi. r0, N, 1 + beq LL(999) + + LFDX A1, X, INCX2 + fmul B1, ALPHA, A1 + STFDX B1, X2, INCX2 + b LL(999) + .align 4 + + +LL(100): + fcmpu cr7, ALPHA, A1 + bne cr7, LL(200) + + sub X, X, INCX + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(115) + .align 4 + +LL(112): + STFDUX A1, X, INCX + STFDUX A1, X, INCX + STFDUX A1, X, INCX + STFDUX A1, X, INCX + STFDUX A1, X, INCX + STFDUX A1, X, INCX + STFDUX A1, X, INCX + STFDUX A1, X, INCX + bdnz LL(112) + .align 4 + +LL(115): + andi. r0, N, 7 + beq LL(999) + andi. r0, N, 4 + beq LL(117) + + STFDUX A1, X, INCX + STFDUX A1, X, INCX + STFDUX A1, X, INCX + STFDUX A1, X, INCX + .align 4 + +LL(117): + andi. r0, N, 2 + beq LL(118) + + STFDUX A1, X, INCX + STFDUX A1, X, INCX + .align 4 + +LL(118): + andi. r0, N, 1 + beq LL(999) + STFDUX A1, X, INCX + b LL(999) + .align 4 + +LL(200): + sub X2, X, INCX + sub X, X, INCX + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(215) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + LFDUX A5, X, INCX + LFDUX A6, X, INCX + LFDUX A7, X, INCX + LFDUX A8, X, INCX + bdz LL(213) + .align 4 + +LL(212): + fmul B1, ALPHA, A1 + LFDUX A1, X, INCX + fmul B2, ALPHA, A2 + LFDUX A2, X, INCX + + fmul B3, ALPHA, A3 + LFDUX A3, X, INCX + fmul B4, ALPHA, A4 + LFDUX A4, X, INCX + + fmul B5, ALPHA, A5 + LFDUX A5, X, INCX + fmul B6, ALPHA, A6 + LFDUX A6, X, INCX + + fmul B7, ALPHA, A7 + LFDUX A7, X, INCX + fmul B8, ALPHA, A8 + LFDUX A8, X, INCX + + STFDUX B1, X2, INCX + STFDUX B2, X2, INCX + STFDUX B3, X2, INCX + STFDUX B4, X2, INCX + STFDUX B5, X2, INCX + STFDUX B6, X2, INCX + STFDUX B7, X2, INCX + STFDUX B8, X2, INCX + bdnz LL(212) + .align 4 + +LL(213): + fmul B1, ALPHA, A1 + fmul B2, ALPHA, A2 + fmul B3, ALPHA, A3 + fmul B4, ALPHA, A4 + fmul B5, ALPHA, A5 + + fmul B6, ALPHA, A6 + STFDUX B1, X2, INCX + fmul B7, ALPHA, A7 + STFDUX B2, X2, INCX + fmul B8, ALPHA, A8 + STFDUX B3, X2, INCX + STFDUX B4, X2, INCX + STFDUX B5, X2, INCX + STFDUX B6, X2, INCX + STFDUX B7, X2, INCX + STFDUX B8, X2, INCX + .align 4 + +LL(215): + andi. r0, N, 7 + beq LL(999) + andi. r0, N, 4 + beq LL(217) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX A3, X, INCX + LFDUX A4, X, INCX + + fmul B1, ALPHA, A1 + fmul B2, ALPHA, A2 + fmul B3, ALPHA, A3 + fmul B4, ALPHA, A4 + + STFDUX B1, X2, INCX + STFDUX B2, X2, INCX + STFDUX B3, X2, INCX + STFDUX B4, X2, INCX + .align 4 + +LL(217): + andi. r0, N, 2 + beq LL(218) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + + fmul B1, ALPHA, A1 + fmul B2, ALPHA, A2 + + STFDUX B1, X2, INCX + STFDUX B2, X2, INCX + .align 4 + +LL(218): + andi. 
r0, N, 1 + beq LL(999) + + LFDUX A1, X, INCX + fmul B1, ALPHA, A1 + STFDUX B1, X2, INCX + .align 4 + +LL(999): + li r10, 16 + + lfpdux f16, SP, r10 + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + + addi SP, SP, 16 + blr + + EPILOGUE diff --git a/kernel/power/scal_ppc440.S b/kernel/power/scal_ppc440.S new file mode 100644 index 0000000000..8b9e271cfe --- /dev/null +++ b/kernel/power/scal_ppc440.S @@ -0,0 +1,239 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define XX r4 +#define PRE r5 + +#ifdef linux +#ifndef __64BIT__ +#define X r6 +#define INCX r7 +#else +#define X r7 +#define INCX r8 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define X r8 +#define INCX r9 +#else +#define X r7 +#define INCX r8 +#endif +#endif + +#define FZERO f0 +#define ALPHA f1 + + PROLOGUE + PROFCODE + + addi SP, SP, -8 + li r0, 0 + + stw r0, 0(SP) + lfs FZERO, 0(SP) + + addi SP, SP, 8 + + slwi INCX, INCX, BASE_SHIFT + li PRE, 3 * 16 * SIZE + + cmpwi cr0, N, 0 + blelr- cr0 + + sub X, X, INCX + + fcmpu cr0, FZERO, ALPHA + bne- cr0, LL(A1I1) + + srawi. 
r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(A0I1_Remain) + .align 4 + +LL(A0I1_kernel): +#ifdef PPCG4 + dcbtst X, PRE +#endif + + STFDUX FZERO, X, INCX + STFDUX FZERO, X, INCX + STFDUX FZERO, X, INCX + STFDUX FZERO, X, INCX + +#if defined(PPCG4) && defined(DOUBLE) + dcbtst X, PRE +#endif + + STFDUX FZERO, X, INCX + STFDUX FZERO, X, INCX + STFDUX FZERO, X, INCX + STFDUX FZERO, X, INCX + +#ifdef PPCG4 + dcbtst X, PRE +#endif + + STFDUX FZERO, X, INCX + STFDUX FZERO, X, INCX + STFDUX FZERO, X, INCX + STFDUX FZERO, X, INCX + +#if defined(PPCG4) && defined(DOUBLE) + dcbtst X, PRE +#endif + + STFDUX FZERO, X, INCX + STFDUX FZERO, X, INCX + STFDUX FZERO, X, INCX + STFDUX FZERO, X, INCX + bdnz LL(A0I1_kernel) + .align 4 + +LL(A0I1_Remain): + andi. r0, N, 15 + mtspr CTR, r0 + beqlr+ + .align 4 + +LL(A0I1_RemainKernel): + STFDUX FZERO, X, INCX + bdnz LL(A0I1_RemainKernel) + blr + .align 4 + +LL(A1I1): + mr XX, X + + srawi. r0, N, 3 + mtspr CTR, r0 + beq+ LL(A1I1_Remain) + + LFDUX f2, X, INCX + LFDUX f3, X, INCX + LFDUX f4, X, INCX + LFDUX f5, X, INCX + bdz LL(12) + .align 4 + +LL(11): + LFDUX f6, X, INCX + FMUL f2, ALPHA, f2 + LFDUX f7, X, INCX + FMUL f3, ALPHA, f3 + LFDUX f8, X, INCX + FMUL f4, ALPHA, f4 + LFDUX f9, X, INCX + FMUL f5, ALPHA, f5 + +#ifdef PPCG4 + dcbtst X, PRE +#endif + STFDUX f2, XX, INCX + STFDUX f3, XX, INCX + STFDUX f4, XX, INCX + STFDUX f5, XX, INCX + + LFDUX f2, X, INCX + FMUL f6, ALPHA, f6 + LFDUX f3, X, INCX + FMUL f7, ALPHA, f7 + LFDUX f4, X, INCX + FMUL f8, ALPHA, f8 + LFDUX f5, X, INCX + FMUL f9, ALPHA, f9 + + STFDUX f6, XX, INCX + STFDUX f7, XX, INCX + STFDUX f8, XX, INCX + STFDUX f9, XX, INCX + +#if defined(PPCG4) && defined(DOUBLE) + dcbtst X, PRE +#endif + + bdnz LL(11) + .align 4 + +LL(12): + LFDUX f6, X, INCX + FMUL f2, ALPHA, f2 + LFDUX f7, X, INCX + FMUL f3, ALPHA, f3 + LFDUX f8, X, INCX + FMUL f4, ALPHA, f4 + LFDUX f9, X, INCX + FMUL f5, ALPHA, f5 + + STFDUX f2, XX, INCX + FMUL f6, ALPHA, f6 + STFDUX f3, XX, INCX + FMUL f7, ALPHA, f7 + STFDUX f4, XX, INCX + FMUL f8, ALPHA, f8 + STFDUX f5, XX, INCX + FMUL f9, ALPHA, f9 + + STFDUX f6, XX, INCX + STFDUX f7, XX, INCX + STFDUX f8, XX, INCX + STFDUX f9, XX, INCX + .align 4 + +LL(A1I1_Remain): + andi. r0, N, 7 + mtspr CTR, r0 + beqlr+ + .align 4 + +LL(A1I1_RemainKernel): + LFDUX f2, X, INCX + FMUL f2, ALPHA, f2 + STFDUX f2, XX, INCX + bdnz LL(A1I1_RemainKernel) + blr + .align 4 + + EPILOGUE diff --git a/kernel/power/snrm2.S b/kernel/power/snrm2.S new file mode 100644 index 0000000000..f235c67684 --- /dev/null +++ b/kernel/power/snrm2.S @@ -0,0 +1,412 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
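[Editorial sketch, not part of the imported patch.] The scal kernels imported above (scal_hummer.S and scal_ppc440.S, following the generic scal variant) all compute the BLAS SCAL operation x := alpha * x over N elements with stride INCX: a fast path simply stores zeros when alpha == 0, and the general path runs an unrolled load/multiply/store loop followed by a scalar remainder loop. A minimal C sketch of the same semantics, assuming a positive stride; the function name is illustrative only:

/* Illustrative reference for the SCAL kernels above (not imported code). */
static void scal_ref(long n, double alpha, double *x, long incx)
{
    if (n <= 0) return;
    if (alpha == 0.0) {                      /* alpha == 0 fast path: just store zeros */
        for (long i = 0; i < n; i++) x[i * incx] = 0.0;
        return;
    }
    for (long i = 0; i < n; i++)             /* general path: multiply in place */
        x[i * incx] *= alpha;
}

In the non-unit-stride assembly path the extra XX register is only a second cursor over the same vector, so the loads can be issued ahead of the stores.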
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define PREA r8 + +#define FZERO 144(SP) +#define FONE 148(SP) + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r10, 0 + lis r11, 0x3f80 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r10, FZERO + stw r11, FONE + + lfs f1, FZERO + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, BASE_SHIFT + + li PREA, 4 * 16 * SIZE + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + fmr f0, f1 + fmr f2, f1 + fmr f3, f1 + fmr f4, f1 + fmr f5, f1 + fmr f6, f1 + fmr f7, f1 + fmr f8, f1 + fmr f9, f1 + fmr f10, f1 + fmr f11, f1 + fmr f12, f1 + fmr f13, f1 + fmr f14, f1 + fmr f15, f1 + + cmpwi cr0, INCX, SIZE + bne- cr0, LL(1000) + + srawi. 
r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(150) + + LFD f16, 0 * SIZE(X) + LFD f17, 1 * SIZE(X) + LFD f18, 2 * SIZE(X) + LFD f19, 3 * SIZE(X) + LFD f20, 4 * SIZE(X) + LFD f21, 5 * SIZE(X) + LFD f22, 6 * SIZE(X) + LFD f23, 7 * SIZE(X) + + LFD f24, 8 * SIZE(X) + LFD f25, 9 * SIZE(X) + LFD f26, 10 * SIZE(X) + LFD f27, 11 * SIZE(X) + LFD f28, 12 * SIZE(X) + LFD f29, 13 * SIZE(X) + LFD f30, 14 * SIZE(X) + LFD f31, 15 * SIZE(X) + + bdz LL(120) + .align 4 + +LL(110): + fmadd f0, f16, f16, f0 + fmadd f1, f17, f17, f1 + fmadd f2, f18, f18, f2 + fmadd f3, f19, f19, f3 + + LFD f16, 16 * SIZE(X) + LFD f17, 17 * SIZE(X) + LFD f18, 18 * SIZE(X) + LFD f19, 19 * SIZE(X) + + fmadd f4, f20, f20, f4 + fmadd f5, f21, f21, f5 + fmadd f6, f22, f22, f6 + fmadd f7, f23, f23, f7 + + LFD f20, 20 * SIZE(X) + LFD f21, 21 * SIZE(X) + LFD f22, 22 * SIZE(X) + LFD f23, 23 * SIZE(X) + + fmadd f8, f24, f24, f8 + fmadd f9, f25, f25, f9 + fmadd f10, f26, f26, f10 + fmadd f11, f27, f27, f11 + + LFD f24, 24 * SIZE(X) + LFD f25, 25 * SIZE(X) + LFD f26, 26 * SIZE(X) + LFD f27, 27 * SIZE(X) + + fmadd f12, f28, f28, f12 + fmadd f13, f29, f29, f13 + fmadd f14, f30, f30, f14 + fmadd f15, f31, f31, f15 + + LFD f28, 28 * SIZE(X) + LFD f29, 29 * SIZE(X) + LFD f30, 30 * SIZE(X) + LFD f31, 31 * SIZE(X) + +#ifndef POWER6 + L1_PREFETCH X, PREA +#endif + addi X, X, 16 * SIZE +#ifdef POWER6 + L1_PREFETCH X, PREA +#endif + + bdnz LL(110) + .align 4 + +LL(120): + fmadd f0, f16, f16, f0 + fmadd f1, f17, f17, f1 + fmadd f2, f18, f18, f2 + fmadd f3, f19, f19, f3 + fmadd f4, f20, f20, f4 + fmadd f5, f21, f21, f5 + fmadd f6, f22, f22, f6 + fmadd f7, f23, f23, f7 + fmadd f8, f24, f24, f8 + fmadd f9, f25, f25, f9 + fmadd f10, f26, f26, f10 + fmadd f11, f27, f27, f11 + fmadd f12, f28, f28, f12 + fmadd f13, f29, f29, f13 + fmadd f14, f30, f30, f14 + fmadd f15, f31, f31, f15 + addi X, X, 16 * SIZE + .align 4 + +LL(150): + andi. r0, N, 15 + mtspr CTR, r0 + beq- cr0, LL(170) + .align 4 + +LL(160): + LFD f16, 0 * SIZE(X) + addi X, X, 1 * SIZE + fmadd f0, f16, f16, f0 + bdnz LL(160) + .align 4 + +LL(170): + fadd f0, f0, f1 + fadd f2, f2, f3 + fadd f4, f4, f5 + fadd f6, f6, f7 + + fadd f8, f8, f9 + fadd f10, f10, f11 + fadd f12, f12, f13 + fadd f14, f14, f15 + + fadd f0, f0, f2 + fadd f4, f4, f6 + fadd f8, f8, f10 + fadd f12, f12, f14 + + fadd f0, f0, f4 + fadd f8, f8, f12 + + fadd f0, f0, f8 + + fsqrts f1, f0 + b LL(9999) + .align 4 + +LL(1000): + sub X, X, INCX + + srawi. 
r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(1150) + + LFDUX f16, X, INCX + LFDUX f17, X, INCX + LFDUX f18, X, INCX + LFDUX f19, X, INCX + LFDUX f20, X, INCX + LFDUX f21, X, INCX + LFDUX f22, X, INCX + LFDUX f23, X, INCX + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + bdz LL(1120) + .align 4 + +LL(1110): + fmadd f0, f16, f16, f0 + fmadd f1, f17, f17, f1 + fmadd f2, f18, f18, f2 + fmadd f3, f19, f19, f3 + + LFDUX f16, X, INCX + LFDUX f17, X, INCX + LFDUX f18, X, INCX + LFDUX f19, X, INCX + + fmadd f4, f20, f20, f4 + fmadd f5, f21, f21, f5 + fmadd f6, f22, f22, f6 + fmadd f7, f23, f23, f7 + + LFDUX f20, X, INCX + LFDUX f21, X, INCX + LFDUX f22, X, INCX + LFDUX f23, X, INCX + + fmadd f8, f24, f24, f8 + fmadd f9, f25, f25, f9 + fmadd f10, f26, f26, f10 + fmadd f11, f27, f27, f11 + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + + fmadd f12, f28, f28, f12 + fmadd f13, f29, f29, f13 + fmadd f14, f30, f30, f14 + fmadd f15, f31, f31, f15 + + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + bdnz LL(1110) + .align 4 + +LL(1120): + fmadd f0, f16, f16, f0 + fmadd f1, f17, f17, f1 + fmadd f2, f18, f18, f2 + fmadd f3, f19, f19, f3 + + fmadd f4, f20, f20, f4 + fmadd f5, f21, f21, f5 + fmadd f6, f22, f22, f6 + fmadd f7, f23, f23, f7 + + fmadd f8, f24, f24, f8 + fmadd f9, f25, f25, f9 + fmadd f10, f26, f26, f10 + fmadd f11, f27, f27, f11 + + fmadd f12, f28, f28, f12 + fmadd f13, f29, f29, f13 + fmadd f14, f30, f30, f14 + fmadd f15, f31, f31, f15 + .align 4 + +LL(1150): + andi. r0, N, 15 + mtspr CTR, r0 + beq- cr0, LL(1170) + .align 4 + +LL(1160): + LFDUX f16, X, INCX + fmadd f0, f16, f16, f0 + bdnz LL(1160) + .align 4 + +LL(1170): + fadd f0, f0, f1 + fadd f2, f2, f3 + fadd f4, f4, f5 + fadd f6, f6, f7 + + fadd f8, f8, f9 + fadd f10, f10, f11 + fadd f12, f12, f13 + fadd f14, f14, f15 + + fadd f0, f0, f2 + fadd f4, f4, f6 + fadd f8, f8, f10 + fadd f12, f12, f14 + + fadd f0, f0, f4 + fadd f8, f8, f12 + + fadd f0, f0, f8 + + fsqrts f1, f0 + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/snrm2_hummer.S b/kernel/power/snrm2_hummer.S new file mode 100644 index 0000000000..a0024926ff --- /dev/null +++ b/kernel/power/snrm2_hummer.S @@ -0,0 +1,614 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define INCX2 r6 +#define X2 r7 + +#define C1 f1 +#define C2 f0 +#define C3 f2 +#define C4 f3 +#define C5 f4 +#define C6 f5 +#define C7 f6 +#define C8 f7 + +#define A1 f8 +#define A2 f9 +#define A3 f10 +#define A4 f11 +#define A5 f12 +#define A6 f13 +#define A7 f14 +#define A8 f15 + +#define A9 f16 +#define A10 f17 +#define A11 f18 +#define A12 f19 +#define A13 f20 +#define A14 f21 +#define A15 f22 +#define A16 f23 + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + stfpdux f16, SP, r10 + stfpdux f17, SP, r10 + + stfpdux f18, SP, r10 + stfpdux f19, SP, r10 + stfpdux f20, SP, r10 + stfpdux f21, SP, r10 + + stfpdux f22, SP, r10 + stfpdux f23, SP, r10 + + li r10, 0 + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + lfpdx C1, SP, r10 # Zero clear + + slwi INCX, INCX, BASE_SHIFT + add INCX2, INCX, INCX + + fpmr C2, C1 + fpmr C3, C1 + fpmr C4, C1 + + fpmr C5, C1 + fpmr C6, C1 + fpmr C7, C1 + fpmr C8, C1 + + cmpwi cr0, N, 0 + ble LL(99) + cmpwi cr0, INCX, 0 + ble LL(99) + + cmpwi cr0, INCX, SIZE + bne LL(100) + + andi. r0, X, 2 * SIZE - 1 + beq LL(05) + + LFD C1, 0(X) + addi X, X, 1 * SIZE + addi N, N, -1 + cmpwi cr0, N, 0 + fmul C1, C1, C1 + ble LL(998) + .align 4 + +LL(05): + srawi. 
r0, N, 5 + sub X, X, INCX2 + mtspr CTR, r0 + beq- LL(15) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + LFPDUX A5, X, INCX2 + LFPDUX A6, X, INCX2 + LFPDUX A7, X, INCX2 + LFPDUX A8, X, INCX2 + + LFPDUX A9, X, INCX2 + LFPDUX A10, X, INCX2 + LFPDUX A11, X, INCX2 + LFPDUX A12, X, INCX2 + LFPDUX A13, X, INCX2 + LFPDUX A14, X, INCX2 + LFPDUX A15, X, INCX2 + LFPDUX A16, X, INCX2 + bdz LL(13) + .align 4 + +LL(12): + fpmadd C1, A1, A1, C1 + LFPDUX A1, X, INCX2 + fpmadd C2, A2, A2, C2 + LFPDUX A2, X, INCX2 + fpmadd C3, A3, A3, C3 + LFPDUX A3, X, INCX2 + fpmadd C4, A4, A4, C4 + LFPDUX A4, X, INCX2 + + fpmadd C5, A5, A5, C5 + LFPDUX A5, X, INCX2 + fpmadd C6, A6, A6, C6 + LFPDUX A6, X, INCX2 + fpmadd C7, A7, A7, C7 + LFPDUX A7, X, INCX2 + fpmadd C8, A8, A8, C8 + LFPDUX A8, X, INCX2 + + fpmadd C1, A9, A9, C1 + LFPDUX A9, X, INCX2 + fpmadd C2, A10, A10, C2 + LFPDUX A10, X, INCX2 + fpmadd C3, A11, A11, C3 + LFPDUX A11, X, INCX2 + fpmadd C4, A12, A12, C4 + LFPDUX A12, X, INCX2 + + fpmadd C5, A13, A13, C5 + LFPDUX A13, X, INCX2 + fpmadd C6, A14, A14, C6 + LFPDUX A14, X, INCX2 + fpmadd C7, A15, A15, C7 + LFPDUX A15, X, INCX2 + fpmadd C8, A16, A16, C8 + LFPDUX A16, X, INCX2 + + bdnz LL(12) + .align 4 + +LL(13): + fpmadd C1, A1, A1, C1 + fpmadd C2, A2, A2, C2 + fpmadd C3, A3, A3, C3 + fpmadd C4, A4, A4, C4 + + fpmadd C5, A5, A5, C5 + fpmadd C6, A6, A6, C6 + fpmadd C7, A7, A7, C7 + fpmadd C8, A8, A8, C8 + + fpmadd C1, A9, A9, C1 + fpmadd C2, A10, A10, C2 + fpmadd C3, A11, A11, C3 + fpmadd C4, A12, A12, C4 + + fpmadd C5, A13, A13, C5 + fpmadd C6, A14, A14, C6 + fpmadd C7, A15, A15, C7 + fpmadd C8, A16, A16, C8 + .align 4 + +LL(15): + andi. r0, N, 31 + beq LL(98) + + andi. r0, N, 16 + beq LL(16) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + LFPDUX A5, X, INCX2 + LFPDUX A6, X, INCX2 + LFPDUX A7, X, INCX2 + LFPDUX A8, X, INCX2 + + fpmadd C1, A1, A1, C1 + fpmadd C2, A2, A2, C2 + fpmadd C3, A3, A3, C3 + fpmadd C4, A4, A4, C4 + + fpmadd C5, A5, A5, C5 + fpmadd C6, A6, A6, C6 + fpmadd C7, A7, A7, C7 + fpmadd C8, A8, A8, C8 + .align 4 + +LL(16): + andi. r0, N, 8 + beq LL(17) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + fpmadd C1, A1, A1, C1 + fpmadd C2, A2, A2, C2 + fpmadd C3, A3, A3, C3 + fpmadd C4, A4, A4, C4 + .align 4 + +LL(17): + andi. r0, N, 4 + beq LL(18) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + fpmadd C1, A1, A1, C1 + fpmadd C2, A2, A2, C2 + .align 4 + +LL(18): + andi. r0, N, 2 + beq LL(19) + + LFPDUX A1, X, INCX2 + fpmadd C3, A1, A1, C3 + .align 4 + +LL(19): + andi. 
r0, N, 1 + beq LL(98) + + LFDX A1, X, INCX2 + fmadd C4, A1, A1, C4 + .align 4 + +LL(98): + fpadd C1, C1, C5 + lis r3, 0x3f00 + fpadd C2, C2, C6 + lis r4, 0x4040 + fpadd C3, C3, C7 + stw r3, 4(SP) + fpadd C4, C4, C8 + stw r4, 8(SP) + + fpadd C1, C1, C2 + fpadd C3, C3, C4 + lfs f10, 4(SP) + + fpadd C1, C1, C3 + lfs f11, 4(SP) + lfs f12, 8(SP) + + fsmtp C2, C1 + fadd C1, C2, C1 + + fcmpu cr0, f10, C1 + beq cr0, LL(99) + +#ifndef HUMMER_EMULATOR + frsqrte f9, f1 + li r10, 16 + + fmul f2, f1, f9 + lfpdux f23, SP, r10 + fmul f3, f9, f11 + lfpdux f22, SP, r10 + fnmsub f4, f2, f9, f12 + lfpdux f21, SP, r10 + fmul f9, f3, f4 + lfpdux f20, SP, r10 + fadd f13, f11, f11 + lfpdux f19, SP, r10 + fmul f12, f1, f9 + lfpdux f18, SP, r10 + fmul f11, f12, f11 + lfpdux f17, SP, r10 + fnmsub f1, f12, f9, f13 + lfpdux f16, SP, r10 + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + fmadd f1, f11, f1, f12 + blr +#else + fsqrt f1, f1 + li r10, 16 + + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + lfpdux f20, SP, r10 + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + blr +#endif + + .align 4 + +LL(99): + li r10, 16 + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + lfpdux f20, SP, r10 + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + blr + .align 4 + +LL(100): + sub X2, X, INCX + sub X, X, INCX2 + + srawi. r0, N, 4 + mtspr CTR, r0 + beq- LL(115) + + + LFDUX A1, X, INCX2 + LFDUX A2, X2, INCX2 + LFDUX A3, X, INCX2 + LFDUX A4, X2, INCX2 + + LFDUX A5, X, INCX2 + LFDUX A6, X2, INCX2 + LFDUX A7, X, INCX2 + LFDUX A8, X2, INCX2 + + LFDUX A9, X, INCX2 + LFDUX A10, X2, INCX2 + LFDUX A11, X, INCX2 + LFDUX A12, X2, INCX2 + + LFDUX A13, X, INCX2 + LFDUX A14, X2, INCX2 + LFDUX A15, X, INCX2 + LFDUX A16, X2, INCX2 + bdz LL(113) + .align 4 + +LL(112): + fmadd C1, A1, A1, C1 + LFDUX A1, X, INCX2 + fmadd C2, A2, A2, C2 + LFDUX A2, X2, INCX2 + fmadd C3, A3, A3, C3 + LFDUX A3, X, INCX2 + fmadd C4, A4, A4, C4 + LFDUX A4, X2, INCX2 + + fmadd C5, A5, A5, C5 + LFDUX A5, X, INCX2 + fmadd C6, A6, A6, C6 + LFDUX A6, X2, INCX2 + fmadd C7, A7, A7, C7 + LFDUX A7, X, INCX2 + fmadd C8, A8, A8, C8 + LFDUX A8, X2, INCX2 + + fmadd C1, A9, A9, C1 + LFDUX A9, X, INCX2 + fmadd C2, A10, A10, C2 + LFDUX A10, X2, INCX2 + fmadd C3, A11, A11, C3 + LFDUX A11, X, INCX2 + fmadd C4, A12, A12, C4 + LFDUX A12, X2, INCX2 + + fmadd C5, A13, A13, C5 + LFDUX A13, X, INCX2 + fmadd C6, A14, A14, C6 + LFDUX A14, X2, INCX2 + fmadd C7, A15, A15, C7 + LFDUX A15, X, INCX2 + fmadd C8, A16, A16, C8 + LFDUX A16, X2, INCX2 + + bdnz LL(112) + .align 4 + +LL(113): + fmadd C1, A1, A1, C1 + fmadd C2, A2, A2, C2 + fmadd C3, A3, A3, C3 + fmadd C4, A4, A4, C4 + + fmadd C5, A5, A5, C5 + fmadd C6, A6, A6, C6 + fmadd C7, A7, A7, C7 + fmadd C8, A8, A8, C8 + + fmadd C1, A9, A9, C1 + fmadd C2, A10, A10, C2 + fmadd C3, A11, A11, C3 + fmadd C4, A12, A12, C4 + + fmadd C5, A13, A13, C5 + fmadd C6, A14, A14, C6 + fmadd C7, A15, A15, C7 + fmadd C8, A16, A16, C8 + .align 4 + +LL(115): + andi. r0, N, 15 + beq LL(998) + andi. 
r0, N, 8 + beq LL(116) + + LFDUX A1, X, INCX2 + LFDUX A2, X2, INCX2 + LFDUX A3, X, INCX2 + LFDUX A4, X2, INCX2 + + LFDUX A5, X, INCX2 + LFDUX A6, X2, INCX2 + LFDUX A7, X, INCX2 + LFDUX A8, X2, INCX2 + + fmadd C1, A1, A1, C1 + fmadd C2, A2, A2, C2 + fmadd C3, A3, A3, C3 + fmadd C4, A4, A4, C4 + + fmadd C5, A5, A5, C5 + fmadd C6, A6, A6, C6 + fmadd C7, A7, A7, C7 + fmadd C8, A8, A8, C8 + .align 4 + +LL(116): + andi. r0, N, 4 + beq LL(117) + + LFDUX A1, X, INCX2 + LFDUX A2, X2, INCX2 + LFDUX A3, X, INCX2 + LFDUX A4, X2, INCX2 + + fmadd C1, A1, A1, C1 + fmadd C2, A2, A2, C2 + fmadd C3, A3, A3, C3 + fmadd C4, A4, A4, C4 + .align 4 + +LL(117): + andi. r0, N, 2 + beq LL(118) + + LFDUX A1, X, INCX2 + LFDUX A2, X2, INCX2 + + fmadd C1, A1, A1, C1 + fmadd C2, A2, A2, C2 + .align 4 + +LL(118): + andi. r0, N, 1 + beq LL(998) + + LFDX A1, X, INCX2 + fmadd C1, A1, A1, C1 + .align 4 + +LL(998): + fadd C1, C1, C5 + lis r3, 0x3f00 + fadd C2, C2, C6 + lis r4, 0x4040 + fadd C3, C3, C7 + stw r3, 4(SP) + fadd C4, C4, C8 + stw r4, 8(SP) + + fadd C1, C1, C2 + lfs f10, 0(SP) + fadd C3, C3, C4 + lfs f11, 4(SP) + lfs f12, 8(SP) + + fadd C1, C1, C3 + + fcmpu cr0, f10, C1 + beq cr0, LL(999) + +#ifndef HUMMER_EMULATOR + frsqrte f9, f1 + li r10, 16 + + fmul f2, f1, f9 + lfpdux f23, SP, r10 + fmul f3, f9, f11 + lfpdux f22, SP, r10 + fnmsub f4, f2, f9, f12 + lfpdux f21, SP, r10 + fmul f9, f3, f4 + lfpdux f20, SP, r10 + fadd f13, f11, f11 + lfpdux f19, SP, r10 + fmul f12, f1, f9 + lfpdux f18, SP, r10 + fmul f11, f12, f11 + lfpdux f17, SP, r10 + fnmsub f1, f12, f9, f13 + lfpdux f16, SP, r10 + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + fmadd f1, f11, f1, f12 + blr +#else + fsqrt f1, f1 + + li r10, 16 + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + lfpdux f20, SP, r10 + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + blr +#endif + .align 4 + +LL(999): + li r10, 16 + + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + lfpdux f20, SP, r10 + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + blr + + EPILOGUE diff --git a/kernel/power/snrm2_ppc440.S b/kernel/power/snrm2_ppc440.S new file mode 100644 index 0000000000..ffda99ed44 --- /dev/null +++ b/kernel/power/snrm2_ppc440.S @@ -0,0 +1,301 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
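[Editorial sketch, not part of the imported patch.] The *_hummer kernels above target the Blue Gene double FPU, whose paired loads and stores (LFPDUX/STFPDUX) require 2 * SIZE alignment; each kernel therefore peels one scalar element when X is only SIZE-aligned (the andi. check against 2 * SIZE - 1), then processes element pairs, with a scalar tail for an odd count. A sketch of that peeling pattern in C; the function name and the sum-of-squares payload are illustrative only:

#include <stdint.h>

/* Illustrative peel-then-pairs pattern used by the hummer kernels above. */
static double sumsq_pairs(long n, const double *x)
{
    double acc = 0.0;
    /* Paired accesses need 2*sizeof(double) alignment: peel one element
       when the pointer is only 8-byte aligned, then work on pairs.      */
    if (n > 0 && ((uintptr_t)x & (2 * sizeof(*x) - 1)) != 0) {
        acc += x[0] * x[0];
        x++;
        n--;
    }
    long i = 0;
    for (; i + 1 < n; i += 2)                 /* aligned pairs            */
        acc += x[i] * x[i] + x[i + 1] * x[i + 1];
    if (i < n)                                /* odd trailing element     */
        acc += x[i] * x[i];
    return acc;
}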
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define PRE r8 + +#define FZERO 144(SP) +#define FONE 148(SP) +#define C1 152(SP) +#define C2 156(SP) + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + + li r10, 0 + lis r11, 0x3f80 + lis r6, 0x3f00 + lis r7, 0x4040 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r10, FZERO + stw r11, FONE + stw r6, C1 + stw r7, C2 + + lfs f1, FZERO + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, BASE_SHIFT + li PRE, 3 * 16 * SIZE + + sub X, X, INCX + + cmpwi cr0, N, 0 + ble- LL(999) + cmpwi cr0, INCX, 0 + ble- LL(999) + + fmr f0, f1 + fmr f2, f1 + fmr f3, f1 + fmr f4, f1 + fmr f5, f1 + fmr f6, f1 + fmr f7, f1 + fmr f8, f1 + fmr f9, f1 + fmr f10, f1 + fmr f11, f1 + fmr f12, f1 + fmr f13, f1 + fmr f14, f1 + fmr f15, f1 + + srawi. 
r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(50) + + LFDUX f16, X, INCX + LFDUX f17, X, INCX + LFDUX f18, X, INCX + LFDUX f19, X, INCX + LFDUX f20, X, INCX + LFDUX f21, X, INCX + LFDUX f22, X, INCX + LFDUX f23, X, INCX + + LFDUX f24, X, INCX + LFDUX f25, X, INCX + LFDUX f26, X, INCX + LFDUX f27, X, INCX + + LFDUX f28, X, INCX + LFDUX f29, X, INCX + LFDUX f30, X, INCX + LFDUX f31, X, INCX + bdz LL(20) + .align 4 + +LL(10): + fmadd f0, f16, f16, f0 + LFDUX f16, X, INCX + fmadd f1, f17, f17, f1 + LFDUX f17, X, INCX + fmadd f2, f18, f18, f2 + LFDUX f18, X, INCX + fmadd f3, f19, f19, f3 + LFDUX f19, X, INCX + +#ifdef PPCG4 + dcbt X, PRE +#endif + + fmadd f4, f20, f20, f4 + LFDUX f20, X, INCX + fmadd f5, f21, f21, f5 + LFDUX f21, X, INCX + fmadd f6, f22, f22, f6 + LFDUX f22, X, INCX + fmadd f7, f23, f23, f7 + LFDUX f23, X, INCX + + fmadd f8, f24, f24, f8 + LFDUX f24, X, INCX + fmadd f9, f25, f25, f9 + LFDUX f25, X, INCX + fmadd f10, f26, f26, f10 + LFDUX f26, X, INCX + fmadd f11, f27, f27, f11 + LFDUX f27, X, INCX + +#ifdef PPCG4 + dcbt X, PRE +#endif + + fmadd f12, f28, f28, f12 + LFDUX f28, X, INCX + fmadd f13, f29, f29, f13 + LFDUX f29, X, INCX + fmadd f14, f30, f30, f14 + LFDUX f30, X, INCX + fmadd f15, f31, f31, f15 + LFDUX f31, X, INCX + + bdnz LL(10) + .align 4 + +LL(20): + fmadd f0, f16, f16, f0 + fmadd f1, f17, f17, f1 + fmadd f2, f18, f18, f2 + fmadd f3, f19, f19, f3 + + fmadd f4, f20, f20, f4 + fmadd f5, f21, f21, f5 + fmadd f6, f22, f22, f6 + fmadd f7, f23, f23, f7 + + fmadd f8, f24, f24, f8 + fmadd f9, f25, f25, f9 + fmadd f10, f26, f26, f10 + fmadd f11, f27, f27, f11 + + fmadd f12, f28, f28, f12 + fmadd f13, f29, f29, f13 + fmadd f14, f30, f30, f14 + fmadd f15, f31, f31, f15 + .align 4 + +LL(50): + andi. r0, N, 15 + mtspr CTR, r0 + beq- cr0, LL(70) + .align 4 + +LL(60): + LFDUX f16, X, INCX + fmadd f0, f16, f16, f0 + bdnz LL(60) + .align 4 + +LL(70): + fadd f0, f0, f1 + fadd f2, f2, f3 + fadd f4, f4, f5 + fadd f6, f6, f7 + + fadd f8, f8, f9 + fadd f10, f10, f11 + fadd f12, f12, f13 + fadd f14, f14, f15 + + fadd f0, f0, f2 + fadd f4, f4, f6 + fadd f8, f8, f10 + fadd f12, f12, f14 + + fadd f0, f0, f4 + fadd f8, f8, f12 + + fadd f1, f0, f8 + lfs f4, FZERO + + fcmpu cr0, f1, f4 + beq cr0, LL(999) + + frsqrte f0, f1 + lfs f8, C1 + lfs f9, C2 + + fmul f2, f1, f0 + fadd f7, f8, f8 + fmul f3, f0, f8 + fnmsub f4, f2, f0, f9 + fmul f0, f3, f4 + + fmul f5, f1, f0 + fmul f2, f5, f8 + fnmsub f3, f5, f0, f7 + fmadd f1, f2, f3, f5 + .align 4 + +LL(999): + lfd f14, 0(SP) + lfd f15, 8(SP) + + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/staticbuffer.S b/kernel/power/staticbuffer.S new file mode 100644 index 0000000000..7bbd23d891 --- /dev/null +++ b/kernel/power/staticbuffer.S @@ -0,0 +1,45 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. 
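[Editorial sketch, not part of the imported patch.] The snrm2 kernels imported above (snrm2.S, snrm2_hummer.S, snrm2_ppc440.S) compute the Euclidean norm by accumulating x[i]^2 into sixteen independent floating-point registers to hide FMA latency, reducing them with a tree of adds, and finally taking a square root, either with fsqrts or with frsqrte followed by Newton-Raphson refinement using the 0.5 and 3.0 constants staged on the stack. The single-precision inputs are squared and summed in the 64-bit FPRs, and no overflow/underflow scaling pass is performed. A reduced C sketch of the same computation, with an illustrative name and only four partial sums:

#include <math.h>

/* Illustrative reference for the SNRM2 kernels above (not imported code). */
static float snrm2_ref(long n, const float *x, long incx)
{
    double acc[4] = {0.0, 0.0, 0.0, 0.0};     /* independent partial sums     */
    if (n <= 0 || incx <= 0) return 0.0f;
    for (long i = 0; i < n; i++) {
        double xi = x[i * incx];
        acc[i & 3] += xi * xi;                /* the fmadd chain in the asm   */
    }
    double sum = (acc[0] + acc[1]) + (acc[2] + acc[3]);
    return (float)sqrt(sum);                  /* fsqrts, or frsqrte + Newton  */
}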
*/ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef ALLOC_STATIC + .align 8 + .comm alloc_area, (NUM_BUFFERS * BUFFER_SIZE), 16384 +#endif diff --git a/kernel/power/swap.S b/kernel/power/swap.S new file mode 100644 index 0000000000..a0d150f3ec --- /dev/null +++ b/kernel/power/swap.S @@ -0,0 +1,387 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef linux +#ifndef __64BIT__ +#define N r3 +#define X r6 +#define INCX r7 +#define Y r8 +#define INCY r9 +#define PREA r4 +#define XX r10 +#define YY r11 +#else +#define N r3 +#define X r7 +#define INCX r8 +#define Y r9 +#define INCY r10 +#define PREA r4 +#define XX r5 +#define YY r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define N r3 +#define X r8 +#define INCX r9 +#define Y r10 +#define INCY r4 +#define PREA r5 +#define XX r6 +#define YY r11 +#else +#define N r3 +#define X r7 +#define INCX r8 +#define Y r9 +#define INCY r10 +#define PREA r4 +#define XX r5 +#define YY r6 +#endif +#endif + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#if (defined(_AIX) || defined(__APPLE__)) && !defined(__64BIT__) && defined(DOUBLE) + lwz INCY, 56 + STACKSIZE(SP) +#endif + + slwi INCX, INCX, BASE_SHIFT + slwi INCY, INCY, BASE_SHIFT + +#ifdef L1_DUALFETCH + li PREA, (L1_PREFETCHSIZE) / 2 +#else + li PREA, (L1_PREFETCHSIZE) +#endif + + cmpwi cr0, N, 0 + ble- LL(999) + + cmpwi cr0, INCX, SIZE + bne- cr0, LL(100) + cmpwi cr0, INCY, SIZE + bne- cr0, LL(100) + + srawi. 
r0, N, 4 + mtspr CTR, r0 + beq- cr0, LL(50) + .align 4 + +LL(10): + LFD f0, 0 * SIZE(X) + LFD f1, 1 * SIZE(X) + LFD f2, 2 * SIZE(X) + LFD f3, 3 * SIZE(X) + + LFD f16, 0 * SIZE(Y) + LFD f17, 1 * SIZE(Y) + LFD f18, 2 * SIZE(Y) + LFD f19, 3 * SIZE(Y) + + LFD f4, 4 * SIZE(X) + LFD f5, 5 * SIZE(X) + LFD f6, 6 * SIZE(X) + LFD f7, 7 * SIZE(X) + + LFD f20, 4 * SIZE(Y) + LFD f21, 5 * SIZE(Y) + LFD f22, 6 * SIZE(Y) + LFD f23, 7 * SIZE(Y) + + LFD f8, 8 * SIZE(X) + LFD f9, 9 * SIZE(X) + LFD f10, 10 * SIZE(X) + LFD f11, 11 * SIZE(X) + + LFD f24, 8 * SIZE(Y) + LFD f25, 9 * SIZE(Y) + LFD f26, 10 * SIZE(Y) + LFD f27, 11 * SIZE(Y) + + LFD f12, 12 * SIZE(X) + LFD f13, 13 * SIZE(X) + LFD f14, 14 * SIZE(X) + LFD f15, 15 * SIZE(X) + + LFD f28, 12 * SIZE(Y) + LFD f29, 13 * SIZE(Y) + LFD f30, 14 * SIZE(Y) + LFD f31, 15 * SIZE(Y) + + STFD f16, 0 * SIZE(X) + STFD f17, 1 * SIZE(X) + STFD f18, 2 * SIZE(X) + STFD f19, 3 * SIZE(X) + + STFD f0, 0 * SIZE(Y) + STFD f1, 1 * SIZE(Y) + STFD f2, 2 * SIZE(Y) + STFD f3, 3 * SIZE(Y) + + STFD f20, 4 * SIZE(X) + STFD f21, 5 * SIZE(X) + STFD f22, 6 * SIZE(X) + STFD f23, 7 * SIZE(X) + + STFD f4, 4 * SIZE(Y) + STFD f5, 5 * SIZE(Y) + STFD f6, 6 * SIZE(Y) + STFD f7, 7 * SIZE(Y) + + STFD f24, 8 * SIZE(X) + STFD f25, 9 * SIZE(X) + STFD f26, 10 * SIZE(X) + STFD f27, 11 * SIZE(X) + + STFD f8, 8 * SIZE(Y) + STFD f9, 9 * SIZE(Y) + STFD f10, 10 * SIZE(Y) + STFD f11, 11 * SIZE(Y) + + STFD f28, 12 * SIZE(X) + STFD f29, 13 * SIZE(X) + STFD f30, 14 * SIZE(X) + STFD f31, 15 * SIZE(X) + + STFD f12, 12 * SIZE(Y) + STFD f13, 13 * SIZE(Y) + STFD f14, 14 * SIZE(Y) + STFD f15, 15 * SIZE(Y) + + addi X, X, 16 * SIZE + addi Y, Y, 16 * SIZE + dcbtst X, PREA +#ifdef L1_DUALFETCH + dcbtst Y, PREA +#endif + bdnz LL(10) + .align 4 + +LL(50): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + LFD f9, 0 * SIZE(Y) + + STFD f9, 0 * SIZE(X) + STFD f8, 0 * SIZE(Y) + + addi X, X, 1 * SIZE + addi Y, Y, 1 * SIZE + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): + sub X, X, INCX + sub Y, Y, INCY + + mr XX, X + mr YY, Y + + srawi. 
r0, N, 4 + mtspr CTR, r0 + beq- LL(150) + .align 4 + +LL(110): + LFDUX f0, X, INCX + LFDUX f1, X, INCX + LFDUX f2, X, INCX + LFDUX f3, X, INCX + + LFDUX f16, Y, INCY + LFDUX f17, Y, INCY + LFDUX f18, Y, INCY + LFDUX f19, Y, INCY + + LFDUX f4, X, INCX + LFDUX f5, X, INCX + LFDUX f6, X, INCX + LFDUX f7, X, INCX + + LFDUX f20, Y, INCY + LFDUX f21, Y, INCY + LFDUX f22, Y, INCY + LFDUX f23, Y, INCY + + LFDUX f8, X, INCX + LFDUX f9, X, INCX + LFDUX f10, X, INCX + LFDUX f11, X, INCX + + LFDUX f24, Y, INCY + LFDUX f25, Y, INCY + LFDUX f26, Y, INCY + LFDUX f27, Y, INCY + + LFDUX f12, X, INCX + LFDUX f13, X, INCX + LFDUX f14, X, INCX + LFDUX f15, X, INCX + + LFDUX f28, Y, INCY + LFDUX f29, Y, INCY + LFDUX f30, Y, INCY + LFDUX f31, Y, INCY + + STFDUX f16, XX, INCX + STFDUX f17, XX, INCX + STFDUX f18, XX, INCX + STFDUX f19, XX, INCX + + STFDUX f0, YY, INCY + STFDUX f1, YY, INCY + STFDUX f2, YY, INCY + STFDUX f3, YY, INCY + + STFDUX f20, XX, INCX + STFDUX f21, XX, INCX + STFDUX f22, XX, INCX + STFDUX f23, XX, INCX + + STFDUX f4, YY, INCY + STFDUX f5, YY, INCY + STFDUX f6, YY, INCY + STFDUX f7, YY, INCY + + STFDUX f24, XX, INCX + STFDUX f25, XX, INCX + STFDUX f26, XX, INCX + STFDUX f27, XX, INCX + + STFDUX f8, YY, INCY + STFDUX f9, YY, INCY + STFDUX f10, YY, INCY + STFDUX f11, YY, INCY + + STFDUX f28, XX, INCX + STFDUX f29, XX, INCX + STFDUX f30, XX, INCX + STFDUX f31, XX, INCX + + STFDUX f12, YY, INCY + STFDUX f13, YY, INCY + STFDUX f14, YY, INCY + STFDUX f15, YY, INCY + bdnz LL(110) + .align 4 + +LL(150): + andi. r0, N, 15 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + LFDUX f9, Y, INCY + STFDUX f9, XX, INCX + STFDUX f8, YY, INCY + bdnz LL(160) + .align 4 + +LL(999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/swap_hummer.S b/kernel/power/swap_hummer.S new file mode 100644 index 0000000000..293a28bec1 --- /dev/null +++ b/kernel/power/swap_hummer.S @@ -0,0 +1,703 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
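[Editorial sketch, not part of the imported patch.] swap.S above (and the hummer variant whose header follows) implements the BLAS SWAP operation, exchanging N elements of X and Y. The unit-stride path loads a 16-element block from both vectors before storing either back, and the strided path keeps separate read cursors (X, Y) and write cursors (XX, YY) so the loads can run ahead of the stores. The plain semantics, assuming positive strides, are simply:

/* Illustrative reference for the SWAP kernels above (not imported code). */
static void swap_ref(long n, double *x, long incx, double *y, long incy)
{
    if (n <= 0) return;
    for (long i = 0; i < n; i++) {            /* exchange x[i] and y[i]  */
        double t = x[i * incx];
        x[i * incx] = y[i * incy];
        y[i * incy] = t;
    }
}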
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r6 +#define INCX r7 +#define Y r8 +#define INCY r9 + +#define INCX2 r4 +#define INCY2 r5 +#define X2 r10 +#define Y2 r11 + +#define A1 f0 +#define A2 f1 +#define A3 f2 +#define A4 f3 +#define A5 f4 + +#define B1 f5 +#define B2 f6 +#define B3 f7 +#define B4 f8 +#define B5 f9 + +#define T1 f10 +#define T2 f11 +#define T3 f12 +#define T4 f13 +#define T5 f14 +#define T6 f15 +#define T7 f16 + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + stfpdux f16, SP, r10 + + slwi INCX, INCX, BASE_SHIFT + slwi INCY, INCY, BASE_SHIFT + add INCX2, INCX, INCX + add INCY2, INCY, INCY + + cmpwi cr0, N, 0 + ble LL(999) + + cmpwi cr0, INCX, SIZE + bne LL(100) + cmpwi cr0, INCY, SIZE + bne LL(100) + + sub X, X, INCX2 + sub Y, Y, INCY2 + + mr X2, X + mr Y2, Y + + andi. r0, X, 2 * SIZE - 1 + bne LL(30) + andi. r0, Y, 2 * SIZE - 1 + bne LL(20) + .align 4 + +LL(10): /* X : aligned Y : aligned */ + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(15) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + LFPDUX A2, X, INCX2 + LFPDUX B2, Y, INCY2 + LFPDUX A3, X, INCX2 + LFPDUX B3, Y, INCY2 + LFPDUX A4, X, INCX2 + LFPDUX B4, Y, INCY2 + bdz LL(13) + .align 4 + +LL(12): + STFPDUX B1, X2, INCY2 + LFPDUX B1, Y, INCY2 + STFPDUX A1, Y2, INCY2 + LFPDUX A1, X, INCX2 + + STFPDUX B2, X2, INCY2 + LFPDUX B2, Y, INCY2 + STFPDUX A2, Y2, INCY2 + LFPDUX A2, X, INCX2 + + STFPDUX B3, X2, INCY2 + LFPDUX B3, Y, INCY2 + STFPDUX A3, Y2, INCY2 + LFPDUX A3, X, INCX2 + + STFPDUX B4, X2, INCY2 + LFPDUX B4, Y, INCY2 + STFPDUX A4, Y2, INCY2 + LFPDUX A4, X, INCX2 + bdnz LL(12) + .align 4 + +LL(13): + STFPDUX B1, X2, INCY2 + STFPDUX A1, Y2, INCY2 + STFPDUX B2, X2, INCY2 + STFPDUX A2, Y2, INCY2 + STFPDUX B3, X2, INCY2 + STFPDUX A3, Y2, INCY2 + STFPDUX B4, X2, INCY2 + STFPDUX A4, Y2, INCY2 + .align 4 + +LL(15): + andi. r0, N, 7 + beq LL(999) + + andi. r0, N, 4 + beq LL(16) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + LFPDUX A2, X, INCX2 + LFPDUX B2, Y, INCY2 + + STFPDUX B1, X2, INCY2 + STFPDUX A1, Y2, INCY2 + STFPDUX B2, X2, INCY2 + STFPDUX A2, Y2, INCY2 + .align 4 + +LL(16): + andi. r0, N, 2 + beq LL(17) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + + STFPDUX B1, X2, INCY2 + STFPDUX A1, Y2, INCY2 + .align 4 + +LL(17): + andi. 
r0, N, 1 + beq LL(999) + + LFDUX A1, X, INCX2 + LFDUX B1, Y, INCY2 + + STFDUX B1, X2, INCY2 + STFDUX A1, Y2, INCY2 + b LL(999) + .align 4 + +LL(20): /* X : aligned Y : unaligned */ + + LFXDUX A1, X, INCX2 + LFDX B1, Y, INCY2 + + STFSDX A1, Y2, INCY2 + + add Y, Y, INCY + add Y2, Y2, INCY + + addi N, N, -1 + cmpwi cr0, N, 0 + ble LL(29) + .align 4 + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(25) + + LFXDUX T1, X, INCX2 + LFXDUX T2, Y, INCY2 + LFXDUX T3, X, INCX2 + LFXDUX T4, Y, INCY2 + + LFPDUX A4, X, INCX2 + fsmr A1, T1 + LFPDUX B4, Y, INCY2 + fsmr B1, T2 + LFPDUX A5, X, INCX2 + fsmr T1, T3 + LFPDUX B5, Y, INCY2 + fsmr T2, T4 + bdz LL(23) + .align 4 + +LL(22): + fxmr T5, A4 + STFPDUX A1, Y2, INCY2 + fxmr T6, B4 + STFPDUX B1, X2, INCX2 + fxmr A1, A5 + STFPDUX T1, Y2, INCY2 + fxmr B1, B5 + STFPDUX T2, X2, INCX2 + + fsmr T3, T5 + LFPDUX A2, X, INCX2 + fsmr T4, T6 + LFPDUX B2, Y, INCY2 + fsmr T5, A1 + LFPDUX A3, X, INCX2 + fsmr T6, B1 + LFPDUX B3, Y, INCY2 + + fxmr T1, A2 + STFPDUX T3, Y2, INCY2 + fxmr T2, B2 + STFPDUX T4, X2, INCX2 + fxmr T3, A3 + STFPDUX T5, Y2, INCY2 + fxmr T4, B3 + STFPDUX T6, X2, INCX2 + + fsmr A1, T1 + LFPDUX A4, X, INCX2 + fsmr B1, T2 + LFPDUX B4, Y, INCY2 + fsmr T1, T3 + LFPDUX A5, X, INCX2 + fsmr T2, T4 + LFPDUX B5, Y, INCY2 + bdnz LL(22) + .align 4 + +LL(23): + fxmr T5, A4 + STFPDUX A1, Y2, INCY2 + fxmr T6, B4 + STFPDUX B1, X2, INCX2 + fxmr A1, A5 + STFPDUX T1, Y2, INCY2 + fxmr B1, B5 + STFPDUX T2, X2, INCX2 + + fsmr T3, T5 + fsmr T4, T6 + fsmr T5, A1 + fsmr T6, B1 + + STFPDUX T3, Y2, INCY2 + STFPDUX T4, X2, INCX2 + STFPDUX T5, Y2, INCY2 + STFPDUX T6, X2, INCX2 + .align 4 + +LL(25): + andi. r0, N, 7 + beq LL(29) + + andi. r0, N, 4 + beq LL(27) + + LFXDUX A2, X, INCX2 + LFXDUX B2, Y, INCY2 + LFXDUX A3, X, INCX2 + LFXDUX B3, Y, INCY2 + + fsmr A1, A2 + fsmr B1, B2 + fsmr A2, A3 + fsmr B2, B3 + + STFPDUX A1, Y2, INCY2 + STFPDUX B1, X2, INCX2 + STFPDUX A2, Y2, INCY2 + fpmr A1, A3 + STFPDUX B2, X2, INCX2 + fpmr B1, B3 + .align 4 + +LL(27): + andi. r0, N, 2 + beq LL(28) + + LFXDUX A2, X, INCX2 + LFXDUX B2, Y, INCY2 + fsmr A1, A2 + fsmr B1, B2 + STFPDUX A1, Y2, INCY2 + fpmr A1, A2 + STFPDUX B1, X2, INCX2 + fpmr B1, B2 + .align 4 + +LL(28): + andi. r0, N, 1 + beq LL(29) + + LFSDX B1, Y, INCY2 + STFDX A1, Y2, INCY2 + STFDX B1, X2, INCX2 + add X2, X2, INCX + fsmtp B1, B1 + .align 4 + +LL(29): + STFDX B1, X2, INCX2 + b LL(999) + .align 4 + + +LL(30): /* X : unaligned Y : aligned */ + + andi. r0, Y, 2 * SIZE - 1 + bne LL(40) + + LFXDUX A1, Y, INCY2 + LFDX B1, X, INCX2 + + STFSDX A1, X2, INCX2 + + add X, X, INCX + add X2, X2, INCX + + addi N, N, -1 + cmpwi cr0, N, 0 + ble LL(39) + .align 4 + + srawi. 
r0, N, 3 + mtspr CTR, r0 + beq- LL(35) + + LFXDUX T1, Y, INCY2 + LFXDUX T2, X, INCX2 + LFXDUX T3, Y, INCY2 + LFXDUX T4, X, INCX2 + + LFPDUX A4, Y, INCY2 + fsmr A1, T1 + LFPDUX B4, X, INCX2 + fsmr B1, T2 + LFPDUX A5, Y, INCY2 + fsmr T1, T3 + LFPDUX B5, X, INCX2 + fsmr T2, T4 + bdz LL(33) + .align 4 + +LL(32): + fxmr T5, A4 + STFPDUX A1, X2, INCX2 + fxmr T6, B4 + STFPDUX B1, Y2, INCY2 + fxmr A1, A5 + STFPDUX T1, X2, INCX2 + fxmr B1, B5 + STFPDUX T2, Y2, INCY2 + + fsmr T3, T5 + LFPDUX A2, Y, INCY2 + fsmr T4, T6 + LFPDUX B2, X, INCX2 + fsmr T5, A1 + LFPDUX A3, Y, INCY2 + fsmr T6, B1 + LFPDUX B3, X, INCX2 + + fxmr T1, A2 + STFPDUX T3, X2, INCX2 + fxmr T2, B2 + STFPDUX T4, Y2, INCY2 + fxmr T3, A3 + STFPDUX T5, X2, INCX2 + fxmr T4, B3 + STFPDUX T6, Y2, INCY2 + + fsmr A1, T1 + LFPDUX A4, Y, INCY2 + fsmr B1, T2 + LFPDUX B4, X, INCX2 + fsmr T1, T3 + LFPDUX A5, Y, INCY2 + fsmr T2, T4 + LFPDUX B5, X, INCX2 + bdnz LL(32) + .align 4 + +LL(33): + fxmr T5, A4 + STFPDUX A1, X2, INCX2 + fxmr T6, B4 + STFPDUX B1, Y2, INCY2 + fxmr A1, A5 + STFPDUX T1, X2, INCX2 + fxmr B1, B5 + STFPDUX T2, Y2, INCY2 + + fsmr T3, T5 + fsmr T4, T6 + fsmr T5, A1 + fsmr T6, B1 + + STFPDUX T3, X2, INCX2 + STFPDUX T4, Y2, INCY2 + STFPDUX T5, X2, INCX2 + STFPDUX T6, Y2, INCY2 + .align 4 + +LL(35): + andi. r0, N, 7 + beq LL(39) + + andi. r0, N, 4 + beq LL(37) + + LFXDUX A2, Y, INCY2 + LFXDUX B2, X, INCX2 + LFXDUX A3, Y, INCY2 + LFXDUX B3, X, INCX2 + + fsmr A1, A2 + fsmr B1, B2 + fsmr A2, A3 + fsmr B2, B3 + + STFPDUX A1, X2, INCX2 + STFPDUX B1, Y2, INCY2 + STFPDUX A2, X2, INCX2 + fpmr A1, A3 + STFPDUX B2, Y2, INCY2 + fpmr B1, B3 + .align 4 + +LL(37): + andi. r0, N, 2 + beq LL(38) + + LFXDUX A2, Y, INCY2 + LFXDUX B2, X, INCX2 + fsmr A1, A2 + fsmr B1, B2 + STFPDUX A1, X2, INCX2 + fpmr A1, A2 + STFPDUX B1, Y2, INCY2 + fpmr B1, B2 + .align 4 + +LL(38): + andi. r0, N, 1 + beq LL(39) + + LFSDX B1, X, INCX2 + STFDX A1, X2, INCX2 + STFDX B1, Y2, INCY2 + add Y2, Y2, INCY + fsmtp B1, B1 + .align 4 + +LL(39): + STFDX B1, Y2, INCY2 + b LL(999) + .align 4 + +LL(40): /* X : unaligned Y : unaligned */ + + LFDX A1, Y, INCY2 + LFDX B1, X, INCX2 + add X, X, INCX + add Y, Y, INCY + + addi N, N, -1 + cmpwi cr0, N, 0 + + STFDX A1, X2, INCX2 + STFDX B1, Y2, INCY2 + add X2, X2, INCX + add Y2, Y2, INCY + ble LL(999) + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(45) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + LFPDUX A2, X, INCX2 + LFPDUX B2, Y, INCY2 + LFPDUX A3, X, INCX2 + LFPDUX B3, Y, INCY2 + LFPDUX A4, X, INCX2 + LFPDUX B4, Y, INCY2 + bdz LL(43) + .align 4 + +LL(42): + STFPDUX B1, X2, INCY2 + LFPDUX B1, Y, INCY2 + STFPDUX A1, Y2, INCY2 + LFPDUX A1, X, INCX2 + + STFPDUX B2, X2, INCY2 + LFPDUX B2, Y, INCY2 + STFPDUX A2, Y2, INCY2 + LFPDUX A2, X, INCX2 + + STFPDUX B3, X2, INCY2 + LFPDUX B3, Y, INCY2 + STFPDUX A3, Y2, INCY2 + LFPDUX A3, X, INCX2 + + STFPDUX B4, X2, INCY2 + LFPDUX B4, Y, INCY2 + STFPDUX A4, Y2, INCY2 + LFPDUX A4, X, INCX2 + bdnz LL(42) + .align 4 + +LL(43): + STFPDUX B1, X2, INCY2 + STFPDUX A1, Y2, INCY2 + STFPDUX B2, X2, INCY2 + STFPDUX A2, Y2, INCY2 + STFPDUX B3, X2, INCY2 + STFPDUX A3, Y2, INCY2 + STFPDUX B4, X2, INCY2 + STFPDUX A4, Y2, INCY2 + .align 4 + +LL(45): + andi. r0, N, 7 + beq LL(999) + + andi. r0, N, 4 + beq LL(46) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + LFPDUX A2, X, INCX2 + LFPDUX B2, Y, INCY2 + + STFPDUX B1, X2, INCY2 + STFPDUX A1, Y2, INCY2 + STFPDUX B2, X2, INCY2 + STFPDUX A2, Y2, INCY2 + .align 4 + +LL(46): + andi. 
r0, N, 2 + beq LL(47) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + + STFPDUX B1, X2, INCY2 + STFPDUX A1, Y2, INCY2 + .align 4 + +LL(47): + andi. r0, N, 1 + beq LL(999) + + LFDUX A1, X, INCX2 + LFDUX B1, Y, INCY2 + + STFDUX B1, X2, INCY2 + STFDUX A1, Y2, INCY2 + b LL(999) + .align 4 + +LL(100): + sub X, X, INCX + sub Y, Y, INCY + + mr X2, X + mr Y2, Y + + srawi. r0, N, 2 + mtspr CTR, r0 + beq- LL(115) + + LFDUX A1, X, INCX + LFDUX B1, Y, INCY + LFDUX A2, X, INCX + LFDUX B2, Y, INCY + LFDUX A3, X, INCX + LFDUX B3, Y, INCY + LFDUX A4, X, INCX + LFDUX B4, Y, INCY + bdz LL(113) + .align 4 + +LL(112): + STFDUX B1, X2, INCX + LFDUX B1, Y, INCY + STFDUX A1, Y2, INCY + LFDUX A1, X, INCX + + STFDUX B2, X2, INCX + LFDUX B2, Y, INCY + STFDUX A2, Y2, INCY + LFDUX A2, X, INCX + + STFDUX B3, X2, INCX + LFDUX B3, Y, INCY + STFDUX A3, Y2, INCY + LFDUX A3, X, INCX + + STFDUX B4, X2, INCX + LFDUX B4, Y, INCY + STFDUX A4, Y2, INCY + LFDUX A4, X, INCX + bdnz LL(112) + .align 4 + +LL(113): + STFDUX B1, X2, INCX + STFDUX A1, Y2, INCY + STFDUX B2, X2, INCX + STFDUX A2, Y2, INCY + + STFDUX B3, X2, INCX + STFDUX A3, Y2, INCY + STFDUX B4, X2, INCX + STFDUX A4, Y2, INCY + .align 4 + +LL(115): + andi. r0, N, 3 + beq LL(999) + andi. r0, N, 2 + beq LL(117) + + LFDUX A1, X, INCX + LFDUX A2, X, INCX + LFDUX B1, Y, INCY + LFDUX B2, Y, INCY + + STFDUX B1, X2, INCX + STFDUX B2, X2, INCX + STFDUX A1, Y2, INCY + STFDUX A2, Y2, INCY + .align 4 + +LL(117): + andi. r0, N, 1 + beq LL(999) + + LFDUX A1, X, INCX + LFDUX B1, Y, INCY + STFDUX B1, X2, INCX + STFDUX A1, Y2, INCY + .align 4 + +LL(999): + li r10, 16 + addi SP, SP, -16 + + lfpdux f16, SP, r10 + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + + addi SP, SP, 16 + blr + + EPILOGUE diff --git a/kernel/power/symv_L.S b/kernel/power/symv_L.S new file mode 100644 index 0000000000..91bfb5e0bf --- /dev/null +++ b/kernel/power/symv_L.S @@ -0,0 +1,1521 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef linux +#ifndef __64BIT__ +#define M r3 +#define N r4 +#define A r5 +#define LDA r6 +#define X r7 +#define INCX r8 +#define Y r9 +#define INCY r10 +#define BUFFER r14 +#else +#define M r3 +#define N r4 +#define A r6 +#define LDA r7 +#define X r8 +#define INCX r9 +#define Y r10 +#define INCY r5 +#define BUFFER r14 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define M r3 +#define N r4 +#define A r7 +#define LDA r8 +#define X r9 +#define INCX r10 +#define Y r5 +#define INCY r6 +#define BUFFER r14 +#else +#define M r3 +#define N r4 +#define A r6 +#define LDA r7 +#define X r8 +#define INCX r9 +#define Y r10 +#define INCY r5 +#define BUFFER r14 +#endif +#endif + +#define I r11 +#define J r12 + +#define AO1 r15 +#define AO2 r16 +#define AO3 r17 +#define AO4 r18 +#define XX r19 +#define YY r20 +#define NEW_Y r21 +#define TEMP r22 +#define PREA r24 +#define IS r25 + +#define y01 f0 +#define y02 f1 +#define y03 f2 +#define y04 f3 + +#define atemp1 f4 +#define atemp2 f5 +#define atemp3 f6 +#define atemp4 f7 + +#define xtemp1 f8 +#define xtemp2 f9 +#define xtemp3 f10 +#define xtemp4 f11 + +#define xsum1 f12 +#define xsum2 f13 +#define xsum3 f14 +#define xsum4 f15 + +#define a1 f16 +#define a2 f17 +#define a3 f18 +#define a4 f19 +#define a5 f20 +#define a6 f21 +#define a7 f22 +#define a8 f23 +#define a9 f24 +#define a10 f25 +#define a11 f26 +#define a12 f27 +#define a13 f28 +#define a14 f29 +#define a15 f30 +#define a16 f31 + +#define alpha f1 + +#if defined(PPCG4) +#define PREFETCHSIZE_A 24 +#endif + +#if defined(PPC440) || defined(PPC440FP2) +#define PREFETCHSIZE_A 24 +#endif + +#ifdef PPC970 +#define PREFETCHSIZE_A 64 +#endif + +#ifdef CELL +#define PREFETCHSIZE_A 72 +#endif + +#ifdef POWER4 +#define PREFETCHSIZE_A 16 +#endif + +#ifdef POWER5 +#define PREFETCHSIZE_A 96 +#endif + +#ifdef POWER6 +#define PREFETCHSIZE_A 40 +#endif + +#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) +#define NOP1 +#define NOP2 +#else +#define NOP1 mr LDA, LDA +#define NOP2 mr INCX, INCX +#endif + +#ifndef NEEDPARAM + +#ifndef __64BIT__ +#define STACKSIZE 224 +#define ALPHA 200(SP) +#define FZERO 208(SP) +#else +#define STACKSIZE 280 +#define ALPHA 256(SP) +#define FZERO 264(SP) +#endif + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r0, FZERO + std r14, 144(SP) + std r15, 152(SP) + std r16, 160(SP) + std r17, 168(SP) + std r18, 176(SP) + std r19, 184(SP) + std r20, 192(SP) + std r21, 200(SP) + std r22, 208(SP) + std r23, 216(SP) + std r24, 224(SP) + std r25, 232(SP) + std r26, 240(SP) + std r27, 248(SP) +#else + stw r0, 0 + FZERO + stw r0, 4 + FZERO + stw r14, 144(SP) + stw r15, 148(SP) + stw r16, 152(SP) + stw r17, 156(SP) + stw r18, 160(SP) + stw r19, 164(SP) + stw r20, 
168(SP) + stw r21, 172(SP) + stw r22, 176(SP) + stw r23, 180(SP) + stw r24, 184(SP) + stw r25, 188(SP) + stw r26, 192(SP) + stw r27, 196(SP) +#endif + +#ifdef linux +#ifndef __64BIT__ + lwz BUFFER, 56 + STACKSIZE(SP) +#else + ld INCY, 112 + STACKSIZE(SP) + ld BUFFER, 120 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifndef __64BIT__ +#ifdef DOUBLE + lwz Y, 56 + STACKSIZE(SP) + lwz INCY, 60 + STACKSIZE(SP) + lwz BUFFER, 64 + STACKSIZE(SP) +#else + lwz INCY, 56 + STACKSIZE(SP) + lwz BUFFER, 60 + STACKSIZE(SP) +#endif +#else + ld INCY, 112 + STACKSIZE(SP) + ld BUFFER, 120 + STACKSIZE(SP) +#endif +#endif + + STFD alpha, ALPHA + + slwi LDA, LDA, BASE_SHIFT + slwi INCX, INCX, BASE_SHIFT + slwi INCY, INCY, BASE_SHIFT + + li PREA, PREFETCHSIZE_A * SIZE + + cmpwi cr0, M, 0 + ble- LL(999) + + cmpwi cr0, INCX, SIZE + beq LL(05) + + mr XX, X + mr X, BUFFER + + srawi. r0, M, 3 + mtspr CTR, r0 + ble LL(03) + .align 4 + +LL(01): + LFD a1, 0 * SIZE(XX) + add XX, XX, INCX + LFD a2, 0 * SIZE(XX) + add XX, XX, INCX + LFD a3, 0 * SIZE(XX) + add XX, XX, INCX + LFD a4, 0 * SIZE(XX) + add XX, XX, INCX + LFD a5, 0 * SIZE(XX) + add XX, XX, INCX + LFD a6, 0 * SIZE(XX) + add XX, XX, INCX + LFD a7, 0 * SIZE(XX) + add XX, XX, INCX + LFD a8, 0 * SIZE(XX) + add XX, XX, INCX + + dcbt XX, PREA + dcbtst BUFFER, PREA + + STFD a1, 0 * SIZE(BUFFER) + STFD a2, 1 * SIZE(BUFFER) + STFD a3, 2 * SIZE(BUFFER) + STFD a4, 3 * SIZE(BUFFER) + STFD a5, 4 * SIZE(BUFFER) + STFD a6, 5 * SIZE(BUFFER) + STFD a7, 6 * SIZE(BUFFER) + STFD a8, 7 * SIZE(BUFFER) + + addi BUFFER, BUFFER, 8 * SIZE + bdnz LL(01) + .align 4 + +LL(03): + andi. r0, M, 7 + mtspr CTR, r0 + ble LL(05) + .align 4 + +LL(04): + LFD a1, 0 * SIZE(XX) + add XX, XX, INCX + + STFD a1, 0 * SIZE(BUFFER) + addi BUFFER, BUFFER, 1 * SIZE + bdnz LL(04) + .align 4 + +LL(05): + mr NEW_Y, Y + lfd f0, FZERO + + cmpwi cr0, INCY, SIZE + beq LL(10) + + mr NEW_Y, BUFFER + + addi r0, M, 7 + srawi. 
r0, r0, 3 + mtspr CTR, r0 + .align 4 + +LL(06): + STFD f0, 0 * SIZE(BUFFER) + STFD f0, 1 * SIZE(BUFFER) + STFD f0, 2 * SIZE(BUFFER) + STFD f0, 3 * SIZE(BUFFER) + STFD f0, 4 * SIZE(BUFFER) + STFD f0, 5 * SIZE(BUFFER) + STFD f0, 6 * SIZE(BUFFER) + STFD f0, 7 * SIZE(BUFFER) + addi BUFFER, BUFFER, 8 * SIZE + bdnz LL(06) + .align 4 + +LL(10): + li IS, 0 + + cmpwi cr0, N, 4 + blt LL(20) + .align 4 + +LL(11): + mr AO1, A + add AO2, A, LDA + add AO3, AO2, LDA + add AO4, AO3, LDA + add A, AO4, LDA + addi A, A, 4 * SIZE + + slwi TEMP, IS, BASE_SHIFT + add XX, X, TEMP + add YY, NEW_Y, TEMP + + LFD atemp1, 0 * SIZE(XX) + LFD atemp2, 1 * SIZE(XX) + LFD atemp3, 2 * SIZE(XX) + LFD atemp4, 3 * SIZE(XX) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + + LFD a6, 1 * SIZE(AO2) + LFD a7, 2 * SIZE(AO2) + LFD a8, 3 * SIZE(AO2) + + LFD a11, 2 * SIZE(AO3) + LFD a12, 3 * SIZE(AO3) + + LFD a16, 3 * SIZE(AO4) + + LFD a5, ALPHA + + FMUL xsum1, atemp1, a1 + FMUL xsum2, atemp1, a2 + FMUL xsum3, atemp1, a3 + FMUL xsum4, atemp1, a4 + + FMADD xsum1, atemp2, a2, xsum1 + FMADD xsum2, atemp2, a6, xsum2 + FMADD xsum3, atemp2, a7, xsum3 + FMADD xsum4, atemp2, a8, xsum4 + + FMADD xsum1, atemp3, a3, xsum1 + FMADD xsum2, atemp3, a7, xsum2 + FMADD xsum3, atemp3, a11, xsum3 + FMADD xsum4, atemp3, a12, xsum4 + + FMADD xsum1, atemp4, a4, xsum1 + FMADD xsum2, atemp4, a8, xsum2 + FMADD xsum3, atemp4, a12, xsum3 + FMADD xsum4, atemp4, a16, xsum4 + + FMUL atemp1, a5, atemp1 + FMUL atemp2, a5, atemp2 + FMUL atemp3, a5, atemp3 + FMUL atemp4, a5, atemp4 + + LFD xtemp1, 4 * SIZE(XX) + LFD xtemp2, 5 * SIZE(XX) + LFD xtemp3, 6 * SIZE(XX) + LFD xtemp4, 7 * SIZE(XX) + + LFD y01, 4 * SIZE(YY) + LFD y02, 5 * SIZE(YY) + LFD y03, 6 * SIZE(YY) + LFD y04, 7 * SIZE(YY) + + LFD a1, 4 * SIZE(AO1) + LFD a2, 5 * SIZE(AO1) + LFD a3, 6 * SIZE(AO1) + LFD a4, 7 * SIZE(AO1) + + LFD a5, 4 * SIZE(AO2) + LFD a6, 5 * SIZE(AO2) + LFD a7, 6 * SIZE(AO2) + LFD a8, 7 * SIZE(AO2) + + LFD a9, 4 * SIZE(AO3) + LFD a10, 5 * SIZE(AO3) + LFD a11, 6 * SIZE(AO3) + LFD a12, 7 * SIZE(AO3) + + LFD a13, 4 * SIZE(AO4) + LFD a14, 5 * SIZE(AO4) + LFD a15, 6 * SIZE(AO4) + LFD a16, 7 * SIZE(AO4) + + addi AO1, AO1, 4 * SIZE + addi AO2, AO2, 4 * SIZE + addi AO3, AO3, 4 * SIZE + addi AO4, AO4, 4 * SIZE + + addi XX, XX, 4 * SIZE + addi YY, YY, 4 * SIZE + + sub TEMP, M, IS + addi TEMP, TEMP, -4 + srawi. 
r0, TEMP, 4 + mtspr CTR, r0 + ble LL(14) + .align 4 + +LL(12): + FMADD xsum1, xtemp1, a1, xsum1 + DCBT(AO1, PREA) + FMADD y01, atemp1, a1, y01 + LFD a1, 4 * SIZE(AO1) + + FMADD xsum2, xtemp1, a5, xsum2 + NOP1 + FMADD y02, atemp1, a2, y02 + NOP2 + + FMADD xsum3, xtemp1, a9, xsum3 + NOP1 + FMADD y03, atemp1, a3, y03 + NOP2 + + FMADD xsum4, xtemp1, a13, xsum4 + LFD xtemp1, 4 * SIZE(XX) + FMADD y04, atemp1, a4, y04 + NOP2 + + FMADD xsum1, xtemp2, a2, xsum1 + LFD a2, 5 * SIZE(AO1) + FMADD y01, atemp2, a5, y01 + LFD a5, 4 * SIZE(AO2) + + FMADD xsum2, xtemp2, a6, xsum2 + NOP1 + FMADD y02, atemp2, a6, y02 + LFD a6, 5 * SIZE(AO2) + + FMADD xsum3, xtemp2, a10, xsum3 + NOP1 + FMADD y03, atemp2, a7, y03 + NOP2 + + FMADD xsum4, xtemp2, a14, xsum4 + LFD xtemp2, 5 * SIZE(XX) + FMADD y04, atemp2, a8, y04 +# DCBT(X, PREX) + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD a3, 6 * SIZE(AO1) + FMADD y01, atemp3, a9, y01 + LFD a9, 4 * SIZE(AO3) + + FMADD xsum2, xtemp3, a7, xsum2 + LFD a7, 6 * SIZE(AO2) + FMADD y02, atemp3, a10, y02 + LFD a10, 5 * SIZE(AO3) + + FMADD xsum3, xtemp3, a11, xsum3 + NOP1 + FMADD y03, atemp3, a11, y03 + LFD a11, 6 * SIZE(AO3) + + FMADD xsum4, xtemp3, a15, xsum4 + LFD xtemp3, 6 * SIZE(XX) + FMADD y04, atemp3, a12, y04 + NOP2 + + FMADD xsum1, xtemp4, a4, xsum1 + LFD a4, 7 * SIZE(AO1) + FMADD y01, atemp4, a13, y01 + LFD a13, 4 * SIZE(AO4) + + FMADD xsum2, xtemp4, a8, xsum2 + LFD a8, 7 * SIZE(AO2) + FMADD y02, atemp4, a14, y02 + LFD a14, 5 * SIZE(AO4) + + FMADD xsum3, xtemp4, a12, xsum3 + LFD a12, 7 * SIZE(AO3) + FMADD y03, atemp4, a15, y03 + LFD a15, 6 * SIZE(AO4) + + FMADD xsum4, xtemp4, a16, xsum4 + LFD xtemp4, 7 * SIZE(XX) + FMADD y04, atemp4, a16, y04 + LFD a16, 7 * SIZE(AO4) + + STFD y01, 0 * SIZE(YY) + LFD y01, 4 * SIZE(YY) + STFD y02, 1 * SIZE(YY) + LFD y02, 5 * SIZE(YY) + + STFD y03, 2 * SIZE(YY) + LFD y03, 6 * SIZE(YY) + STFD y04, 3 * SIZE(YY) + LFD y04, 7 * SIZE(YY) + + FMADD xsum1, xtemp1, a1, xsum1 + DCBT(AO2, PREA) + FMADD y01, atemp1, a1, y01 + LFD a1, 8 * SIZE(AO1) + + FMADD xsum2, xtemp1, a5, xsum2 + NOP1 + FMADD y02, atemp1, a2, y02 + NOP2 + + FMADD xsum3, xtemp1, a9, xsum3 + NOP1 + FMADD y03, atemp1, a3, y03 + NOP2 + + FMADD xsum4, xtemp1, a13, xsum4 + LFD xtemp1, 8 * SIZE(XX) + FMADD y04, atemp1, a4, y04 + NOP2 + + FMADD xsum1, xtemp2, a2, xsum1 + LFD a2, 9 * SIZE(AO1) + FMADD y01, atemp2, a5, y01 + LFD a5, 8 * SIZE(AO2) + + FMADD xsum2, xtemp2, a6, xsum2 + NOP1 + FMADD y02, atemp2, a6, y02 + LFD a6, 9 * SIZE(AO2) + + FMADD xsum3, xtemp2, a10, xsum3 + NOP1 + FMADD y03, atemp2, a7, y03 + NOP2 + + FMADD xsum4, xtemp2, a14, xsum4 + LFD xtemp2, 9 * SIZE(XX) + FMADD y04, atemp2, a8, y04 + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD a3, 10 * SIZE(AO1) + FMADD y01, atemp3, a9, y01 + LFD a9, 8 * SIZE(AO3) + + FMADD xsum2, xtemp3, a7, xsum2 + LFD a7, 10 * SIZE(AO2) + FMADD y02, atemp3, a10, y02 + LFD a10, 9 * SIZE(AO3) + + FMADD xsum3, xtemp3, a11, xsum3 + NOP1 + FMADD y03, atemp3, a11, y03 + LFD a11, 10 * SIZE(AO3) + + FMADD xsum4, xtemp3, a15, xsum4 + LFD xtemp3, 10 * SIZE(XX) + FMADD y04, atemp3, a12, y04 + NOP2 + + FMADD xsum1, xtemp4, a4, xsum1 + LFD a4, 11 * SIZE(AO1) + FMADD y01, atemp4, a13, y01 + LFD a13, 8 * SIZE(AO4) + + FMADD xsum2, xtemp4, a8, xsum2 + LFD a8, 11 * SIZE(AO2) + FMADD y02, atemp4, a14, y02 + LFD a14, 9 * SIZE(AO4) + + FMADD xsum3, xtemp4, a12, xsum3 + LFD a12, 11 * SIZE(AO3) + FMADD y03, atemp4, a15, y03 + LFD a15, 10 * SIZE(AO4) + + FMADD xsum4, xtemp4, a16, xsum4 + LFD xtemp4, 11 * SIZE(XX) + FMADD y04, atemp4, a16, y04 + LFD a16, 11 * SIZE(AO4) + 
+ STFD y01, 4 * SIZE(YY) + LFD y01, 8 * SIZE(YY) + STFD y02, 5 * SIZE(YY) + LFD y02, 9 * SIZE(YY) + + STFD y03, 6 * SIZE(YY) + LFD y03, 10 * SIZE(YY) + STFD y04, 7 * SIZE(YY) + LFD y04, 11 * SIZE(YY) + + + FMADD xsum1, xtemp1, a1, xsum1 + DCBT(AO3, PREA) + FMADD y01, atemp1, a1, y01 + LFD a1, 12 * SIZE(AO1) + + FMADD xsum2, xtemp1, a5, xsum2 + NOP1 + FMADD y02, atemp1, a2, y02 + NOP2 + + FMADD xsum3, xtemp1, a9, xsum3 + NOP1 + FMADD y03, atemp1, a3, y03 + NOP2 + + FMADD xsum4, xtemp1, a13, xsum4 + LFD xtemp1, 12 * SIZE(XX) + FMADD y04, atemp1, a4, y04 + NOP2 + + FMADD xsum1, xtemp2, a2, xsum1 + LFD a2, 13 * SIZE(AO1) + FMADD y01, atemp2, a5, y01 + LFD a5, 12 * SIZE(AO2) + + FMADD xsum2, xtemp2, a6, xsum2 + NOP1 + FMADD y02, atemp2, a6, y02 + LFD a6, 13 * SIZE(AO2) + + FMADD xsum3, xtemp2, a10, xsum3 + NOP1 + FMADD y03, atemp2, a7, y03 +# DCBT(Y1, PREY) + NOP2 + + FMADD xsum4, xtemp2, a14, xsum4 + LFD xtemp2, 13 * SIZE(XX) + FMADD y04, atemp2, a8, y04 + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD a3, 14 * SIZE(AO1) + FMADD y01, atemp3, a9, y01 + LFD a9, 12 * SIZE(AO3) + + FMADD xsum2, xtemp3, a7, xsum2 + LFD a7, 14 * SIZE(AO2) + FMADD y02, atemp3, a10, y02 + LFD a10,13 * SIZE(AO3) + + FMADD xsum3, xtemp3, a11, xsum3 + NOP1 + FMADD y03, atemp3, a11, y03 + LFD a11, 14 * SIZE(AO3) + + FMADD xsum4, xtemp3, a15, xsum4 + LFD xtemp3, 14 * SIZE(XX) + FMADD y04, atemp3, a12, y04 + NOP2 + + FMADD xsum1, xtemp4, a4, xsum1 + LFD a4, 15 * SIZE(AO1) + FMADD y01, atemp4, a13, y01 + LFD a13,12 * SIZE(AO4) + + FMADD xsum2, xtemp4, a8, xsum2 + LFD a8, 15 * SIZE(AO2) + FMADD y02, atemp4, a14, y02 + LFD a14, 13 * SIZE(AO4) + + FMADD xsum3, xtemp4, a12, xsum3 + LFD a12, 15 * SIZE(AO3) + FMADD y03, atemp4, a15, y03 + LFD a15, 14 * SIZE(AO4) + + FMADD xsum4, xtemp4, a16, xsum4 + LFD xtemp4, 15 * SIZE(XX) + FMADD y04, atemp4, a16, y04 + LFD a16, 15 * SIZE(AO4) + + STFD y01, 8 * SIZE(YY) + LFD y01, 12 * SIZE(YY) + STFD y02, 9 * SIZE(YY) + LFD y02, 13 * SIZE(YY) + + STFD y03, 10 * SIZE(YY) + LFD y03, 14 * SIZE(YY) + STFD y04, 11 * SIZE(YY) + LFD y04, 15 * SIZE(YY) + + FMADD xsum1, xtemp1, a1, xsum1 + DCBT(AO4, PREA) + FMADD y01, atemp1, a1, y01 + LFD a1, 16 * SIZE(AO1) + + FMADD xsum2, xtemp1, a5, xsum2 + NOP1 + FMADD y02, atemp1, a2, y02 + NOP2 + + FMADD xsum3, xtemp1, a9, xsum3 + NOP1 + FMADD y03, atemp1, a3, y03 + NOP2 + + FMADD xsum4, xtemp1, a13, xsum4 + LFD xtemp1, 16 * SIZE(XX) + FMADD y04, atemp1, a4, y04 + addi YY, YY, 16 * SIZE + + FMADD xsum1, xtemp2, a2, xsum1 + LFD a2, 17 * SIZE(AO1) + FMADD y01, atemp2, a5, y01 + LFD a5, 16 * SIZE(AO2) + + FMADD xsum2, xtemp2, a6, xsum2 + addi AO3, AO3, 16 * SIZE + FMADD y02, atemp2, a6, y02 + LFD a6, 17 * SIZE(AO2) + + FMADD xsum3, xtemp2, a10, xsum3 + addi AO1, AO1, 16 * SIZE + FMADD y03, atemp2, a7, y03 + addi AO2, AO2, 16 * SIZE + + FMADD xsum4, xtemp2, a14, xsum4 + LFD xtemp2, 17 * SIZE(XX) + FMADD y04, atemp2, a8, y04 + addi AO4, AO4, 16 * SIZE + + FMADD xsum1, xtemp3, a3, xsum1 + LFD a3, 2 * SIZE(AO1) + FMADD y01, atemp3, a9, y01 + LFD a9, 0 * SIZE(AO3) + + FMADD xsum2, xtemp3, a7, xsum2 + LFD a7, 2 * SIZE(AO2) + FMADD y02, atemp3, a10, y02 + LFD a10, 1 * SIZE(AO3) + + FMADD xsum3, xtemp3, a11, xsum3 + NOP1 + FMADD y03, atemp3, a11, y03 + LFD a11, 2 * SIZE(AO3) + + FMADD xsum4, xtemp3, a15, xsum4 + LFD xtemp3, 18 * SIZE(XX) + FMADD y04, atemp3, a12, y04 + addi XX, XX, 16 * SIZE + + FMADD xsum1, xtemp4, a4, xsum1 + LFD a4, 3 * SIZE(AO1) + FMADD y01, atemp4, a13, y01 + LFD a13, 0 * SIZE(AO4) + + FMADD xsum2, xtemp4, a8, xsum2 + LFD a8, 3 * SIZE(AO2) + FMADD y02, 
atemp4, a14, y02 + LFD a14, 1 * SIZE(AO4) + + FMADD xsum3, xtemp4, a12, xsum3 + LFD a12, 3 * SIZE(AO3) + FMADD y03, atemp4, a15, y03 + LFD a15, 2 * SIZE(AO4) + + FMADD xsum4, xtemp4, a16, xsum4 + LFD xtemp4, 3 * SIZE(XX) + FMADD y04, atemp4, a16, y04 + LFD a16, 3 * SIZE(AO4) + + STFD y01, -4 * SIZE(YY) + LFD y01, 0 * SIZE(YY) + STFD y02, -3 * SIZE(YY) + LFD y02, 1 * SIZE(YY) + + STFD y03, -2 * SIZE(YY) + LFD y03, 2 * SIZE(YY) + STFD y04, -1 * SIZE(YY) + LFD y04, 3 * SIZE(YY) + bdnz LL(12) + .align 4 + +LL(14): + sub TEMP, M, IS + addi TEMP, TEMP, -4 + andi. r0, TEMP, 8 + ble LL(15) + + FMADD xsum1, xtemp1, a1, xsum1 + NOP1 + FMADD y01, atemp1, a1, y01 + LFD a1, 4 * SIZE(AO1) + + FMADD xsum2, xtemp1, a5, xsum2 + NOP1 + FMADD y02, atemp1, a2, y02 + NOP2 + + FMADD xsum3, xtemp1, a9, xsum3 + NOP1 + FMADD y03, atemp1, a3, y03 + NOP2 + + FMADD xsum4, xtemp1, a13, xsum4 + LFD xtemp1, 4 * SIZE(XX) + FMADD y04, atemp1, a4, y04 + NOP2 + + FMADD xsum1, xtemp2, a2, xsum1 + LFD a2, 5 * SIZE(AO1) + FMADD y01, atemp2, a5, y01 + LFD a5, 4 * SIZE(AO2) + + FMADD xsum2, xtemp2, a6, xsum2 + NOP1 + FMADD y02, atemp2, a6, y02 + LFD a6, 5 * SIZE(AO2) + + FMADD xsum3, xtemp2, a10, xsum3 + NOP1 + FMADD y03, atemp2, a7, y03 + NOP2 + + FMADD xsum4, xtemp2, a14, xsum4 + LFD xtemp2, 5 * SIZE(XX) + FMADD y04, atemp2, a8, y04 + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD a3, 6 * SIZE(AO1) + FMADD y01, atemp3, a9, y01 + LFD a9, 4 * SIZE(AO3) + + FMADD xsum2, xtemp3, a7, xsum2 + LFD a7, 6 * SIZE(AO2) + FMADD y02, atemp3, a10, y02 + LFD a10, 5 * SIZE(AO3) + + FMADD xsum3, xtemp3, a11, xsum3 + NOP1 + FMADD y03, atemp3, a11, y03 + LFD a11, 6 * SIZE(AO3) + + FMADD xsum4, xtemp3, a15, xsum4 + LFD xtemp3, 6 * SIZE(XX) + FMADD y04, atemp3, a12, y04 + NOP2 + + FMADD xsum1, xtemp4, a4, xsum1 + LFD a4, 7 * SIZE(AO1) + FMADD y01, atemp4, a13, y01 + LFD a13, 4 * SIZE(AO4) + + FMADD xsum2, xtemp4, a8, xsum2 + LFD a8, 7 * SIZE(AO2) + FMADD y02, atemp4, a14, y02 + LFD a14, 5 * SIZE(AO4) + + FMADD xsum3, xtemp4, a12, xsum3 + LFD a12, 7 * SIZE(AO3) + FMADD y03, atemp4, a15, y03 + LFD a15, 6 * SIZE(AO4) + + FMADD xsum4, xtemp4, a16, xsum4 + LFD xtemp4, 7 * SIZE(XX) + FMADD y04, atemp4, a16, y04 + LFD a16, 7 * SIZE(AO4) + + STFD y01, 0 * SIZE(YY) + LFD y01, 4 * SIZE(YY) + STFD y02, 1 * SIZE(YY) + LFD y02, 5 * SIZE(YY) + + STFD y03, 2 * SIZE(YY) + LFD y03, 6 * SIZE(YY) + STFD y04, 3 * SIZE(YY) + LFD y04, 7 * SIZE(YY) + + FMADD xsum1, xtemp1, a1, xsum1 + NOP1 + FMADD y01, atemp1, a1, y01 + LFD a1, 8 * SIZE(AO1) + + FMADD xsum2, xtemp1, a5, xsum2 + NOP1 + FMADD y02, atemp1, a2, y02 + NOP2 + + FMADD xsum3, xtemp1, a9, xsum3 + NOP1 + FMADD y03, atemp1, a3, y03 + NOP2 + + FMADD xsum4, xtemp1, a13, xsum4 + LFD xtemp1, 8 * SIZE(XX) + FMADD y04, atemp1, a4, y04 + NOP2 + + FMADD xsum1, xtemp2, a2, xsum1 + LFD a2, 9 * SIZE(AO1) + FMADD y01, atemp2, a5, y01 + LFD a5, 8 * SIZE(AO2) + + FMADD xsum2, xtemp2, a6, xsum2 + NOP1 + FMADD y02, atemp2, a6, y02 + LFD a6, 9 * SIZE(AO2) + + FMADD xsum3, xtemp2, a10, xsum3 + NOP1 + FMADD y03, atemp2, a7, y03 + NOP2 + + FMADD xsum4, xtemp2, a14, xsum4 + LFD xtemp2, 9 * SIZE(XX) + FMADD y04, atemp2, a8, y04 + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD a3, 10 * SIZE(AO1) + FMADD y01, atemp3, a9, y01 + LFD a9, 8 * SIZE(AO3) + + FMADD xsum2, xtemp3, a7, xsum2 + LFD a7, 10 * SIZE(AO2) + FMADD y02, atemp3, a10, y02 + LFD a10, 9 * SIZE(AO3) + + FMADD xsum3, xtemp3, a11, xsum3 + NOP1 + FMADD y03, atemp3, a11, y03 + LFD a11, 10 * SIZE(AO3) + + FMADD xsum4, xtemp3, a15, xsum4 + LFD xtemp3, 10 * SIZE(XX) + FMADD y04, 
atemp3, a12, y04 + NOP2 + + FMADD xsum1, xtemp4, a4, xsum1 + LFD a4, 11 * SIZE(AO1) + FMADD y01, atemp4, a13, y01 + LFD a13, 8 * SIZE(AO4) + + FMADD xsum2, xtemp4, a8, xsum2 + LFD a8, 11 * SIZE(AO2) + FMADD y02, atemp4, a14, y02 + LFD a14, 9 * SIZE(AO4) + + FMADD xsum3, xtemp4, a12, xsum3 + LFD a12, 11 * SIZE(AO3) + FMADD y03, atemp4, a15, y03 + LFD a15, 10 * SIZE(AO4) + + FMADD xsum4, xtemp4, a16, xsum4 + LFD xtemp4, 11 * SIZE(XX) + FMADD y04, atemp4, a16, y04 + LFD a16, 11 * SIZE(AO4) + + addi AO1, AO1, 8 * SIZE + addi AO2, AO2, 8 * SIZE + addi AO3, AO3, 8 * SIZE + addi AO4, AO4, 8 * SIZE + + STFD y01, 4 * SIZE(YY) + LFD y01, 8 * SIZE(YY) + STFD y02, 5 * SIZE(YY) + LFD y02, 9 * SIZE(YY) + + STFD y03, 6 * SIZE(YY) + LFD y03, 10 * SIZE(YY) + STFD y04, 7 * SIZE(YY) + LFD y04, 11 * SIZE(YY) + + addi XX, XX, 8 * SIZE + addi YY, YY, 8 * SIZE + .align 4 + +LL(15): + sub TEMP, M, IS + addi TEMP, TEMP, -4 + andi. r0, TEMP, 4 + ble LL(16) + + FMADD xsum1, xtemp1, a1, xsum1 + NOP1 + FMADD y01, atemp1, a1, y01 + LFD a1, 4 * SIZE(AO1) + + FMADD xsum2, xtemp1, a5, xsum2 + NOP1 + FMADD y02, atemp1, a2, y02 + NOP2 + + FMADD xsum3, xtemp1, a9, xsum3 + NOP1 + FMADD y03, atemp1, a3, y03 + NOP2 + + FMADD xsum4, xtemp1, a13, xsum4 + LFD xtemp1, 4 * SIZE(XX) + FMADD y04, atemp1, a4, y04 + NOP2 + + FMADD xsum1, xtemp2, a2, xsum1 + LFD a2, 5 * SIZE(AO1) + FMADD y01, atemp2, a5, y01 + LFD a5, 4 * SIZE(AO2) + + FMADD xsum2, xtemp2, a6, xsum2 + NOP1 + FMADD y02, atemp2, a6, y02 + LFD a6, 5 * SIZE(AO2) + + FMADD xsum3, xtemp2, a10, xsum3 + NOP1 + FMADD y03, atemp2, a7, y03 + NOP2 + + FMADD xsum4, xtemp2, a14, xsum4 + LFD xtemp2, 5 * SIZE(XX) + FMADD y04, atemp2, a8, y04 + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD a3, 6 * SIZE(AO1) + FMADD y01, atemp3, a9, y01 + LFD a9, 4 * SIZE(AO3) + + FMADD xsum2, xtemp3, a7, xsum2 + LFD a7, 6 * SIZE(AO2) + FMADD y02, atemp3, a10, y02 + LFD a10, 5 * SIZE(AO3) + + FMADD xsum3, xtemp3, a11, xsum3 + NOP1 + FMADD y03, atemp3, a11, y03 + LFD a11, 6 * SIZE(AO3) + + FMADD xsum4, xtemp3, a15, xsum4 + LFD xtemp3, 6 * SIZE(XX) + FMADD y04, atemp3, a12, y04 + NOP2 + + FMADD xsum1, xtemp4, a4, xsum1 + LFD a4, 7 * SIZE(AO1) + FMADD y01, atemp4, a13, y01 + LFD a13, 4 * SIZE(AO4) + + FMADD xsum2, xtemp4, a8, xsum2 + LFD a8, 7 * SIZE(AO2) + FMADD y02, atemp4, a14, y02 + LFD a14, 5 * SIZE(AO4) + + FMADD xsum3, xtemp4, a12, xsum3 + LFD a12, 7 * SIZE(AO3) + FMADD y03, atemp4, a15, y03 + LFD a15, 6 * SIZE(AO4) + + FMADD xsum4, xtemp4, a16, xsum4 + LFD xtemp4, 7 * SIZE(XX) + FMADD y04, atemp4, a16, y04 + LFD a16, 7 * SIZE(AO4) + + addi AO1, AO1, 4 * SIZE + addi AO2, AO2, 4 * SIZE + addi AO3, AO3, 4 * SIZE + addi AO4, AO4, 4 * SIZE + + STFD y01, 0 * SIZE(YY) + LFD y01, 4 * SIZE(YY) + STFD y02, 1 * SIZE(YY) + LFD y02, 5 * SIZE(YY) + + STFD y03, 2 * SIZE(YY) + LFD y03, 6 * SIZE(YY) + STFD y04, 3 * SIZE(YY) + LFD y04, 7 * SIZE(YY) + + addi XX, XX, 4 * SIZE + addi YY, YY, 4 * SIZE + .align 4 + +LL(16): + andi. 
r0, M, 2 + ble LL(17) + + FMADD xsum1, xtemp1, a1, xsum1 + FMADD y01, atemp1, a1, y01 + LFD a1, 2 * SIZE(AO1) + + FMADD xsum2, xtemp1, a5, xsum2 + FMADD y02, atemp1, a2, y02 + + FMADD xsum3, xtemp1, a9, xsum3 + FMADD y01, atemp2, a5, y01 + LFD a5, 2 * SIZE(AO2) + + FMADD xsum4, xtemp1, a13, xsum4 + LFD xtemp1, 2 * SIZE(XX) + FMADD y02, atemp2, a6, y02 + + FMADD xsum1, xtemp2, a2, xsum1 + FMADD y01, atemp3, a9, y01 + LFD a9, 2 * SIZE(AO3) + + FMADD xsum2, xtemp2, a6, xsum2 + FMADD y02, atemp3, a10, y02 + + FMADD xsum3, xtemp2, a10, xsum3 + FMADD y01, atemp4, a13, y01 + LFD a13, 2 * SIZE(AO4) + + FMADD xsum4, xtemp2, a14, xsum4 + FMADD y02, atemp4, a14, y02 + + STFD y01, 0 * SIZE(YY) + LFD y01, 2 * SIZE(YY) + STFD y02, 1 * SIZE(YY) + addi YY, YY, 2 * SIZE + .align 4 + +LL(17): + andi. r0, M, 1 + ble LL(18) + + FMADD xsum1, xtemp1, a1, xsum1 + FMADD y01, atemp1, a1, y01 + FMADD xsum2, xtemp1, a5, xsum2 + FMADD y01, atemp2, a5, y01 + FMADD xsum3, xtemp1, a9, xsum3 + FMADD y01, atemp3, a9, y01 + FMADD xsum4, xtemp1, a13, xsum4 + FMADD y01, atemp4, a13, y01 + + STFD y01, 0 * SIZE(YY) + .align 4 + +LL(18): + slwi TEMP, IS, BASE_SHIFT + add YY, NEW_Y, TEMP + + LFD y01, 0 * SIZE(YY) + LFD y02, 1 * SIZE(YY) + LFD y03, 2 * SIZE(YY) + LFD y04, 3 * SIZE(YY) + + LFD xtemp1, ALPHA + + FMUL xsum1, xtemp1, xsum1 + FMUL xsum2, xtemp1, xsum2 + FMUL xsum3, xtemp1, xsum3 + FMUL xsum4, xtemp1, xsum4 + + FADD y01, y01, xsum1 + FADD y02, y02, xsum2 + FADD y03, y03, xsum3 + FADD y04, y04, xsum4 + + STFD y01, 0 * SIZE(YY) + STFD y02, 1 * SIZE(YY) + STFD y03, 2 * SIZE(YY) + STFD y04, 3 * SIZE(YY) + + addi TEMP, IS, 8 + addi IS, IS, 4 + cmpw cr0, TEMP, N + ble LL(11) + .align 4 + +LL(20): + andi. TEMP, N, 2 + ble LL(30) + + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + addi A, A, 2 * SIZE + + slwi TEMP, IS, BASE_SHIFT + add XX, X, TEMP + add YY, NEW_Y, TEMP + + LFD atemp1, 0 * SIZE(XX) + LFD atemp2, 1 * SIZE(XX) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a6, 1 * SIZE(AO2) + + LFD a5, ALPHA + + FMUL xsum1, atemp1, a1 + FMUL xsum2, atemp1, a2 + + FMADD xsum1, atemp2, a2, xsum1 + FMADD xsum2, atemp2, a6, xsum2 + + FMUL atemp1, a5, atemp1 + FMUL atemp2, a5, atemp2 + + LFD xtemp1, 2 * SIZE(XX) + LFD y01, 2 * SIZE(YY) + LFD a1, 2 * SIZE(AO1) + LFD a5, 2 * SIZE(AO2) + + andi. r0, M, 1 + ble LL(28) + + FMADD xsum1, xtemp1, a1, xsum1 + FMADD y01, atemp1, a1, y01 + FMADD xsum2, xtemp1, a5, xsum2 + FMADD y01, atemp2, a5, y01 + + STFD y01, 2 * SIZE(YY) + .align 4 + +LL(28): + slwi TEMP, IS, BASE_SHIFT + add YY, NEW_Y, TEMP + + LFD y01, 0 * SIZE(YY) + LFD y02, 1 * SIZE(YY) + + LFD xtemp1, ALPHA + + FMUL xsum1, xtemp1, xsum1 + FMUL xsum2, xtemp1, xsum2 + + FADD y01, y01, xsum1 + FADD y02, y02, xsum2 + + STFD y01, 0 * SIZE(YY) + STFD y02, 1 * SIZE(YY) + + addi IS, IS, 2 + .align 4 + +LL(30): + andi. TEMP, N, 1 + ble LL(990) + + mr AO1, A + + slwi TEMP, IS, BASE_SHIFT + add XX, X, TEMP + add YY, NEW_Y, TEMP + + LFD atemp1, 0 * SIZE(XX) + LFD a1, 0 * SIZE(AO1) + LFD xtemp1, ALPHA + LFD y01, 0 * SIZE(YY) + + FMUL xsum1, atemp1, a1 + FMUL xsum1, xtemp1, xsum1 + + FADD y01, y01, xsum1 + + STFD y01, 0 * SIZE(YY) + .align 4 + +LL(990): + cmpwi cr0, INCY, SIZE + beq LL(999) + + mr YY, Y + + srawi. 
r0, M, 3 + mtspr CTR, r0 + ble LL(995) + .align 4 + +LL(991): + LFD f0, 0 * SIZE(Y) + add Y, Y, INCY + LFD f1, 0 * SIZE(Y) + add Y, Y, INCY + LFD f2, 0 * SIZE(Y) + add Y, Y, INCY + LFD f3, 0 * SIZE(Y) + add Y, Y, INCY + LFD f4, 0 * SIZE(Y) + add Y, Y, INCY + LFD f5, 0 * SIZE(Y) + add Y, Y, INCY + LFD f6, 0 * SIZE(Y) + add Y, Y, INCY + LFD f7, 0 * SIZE(Y) + add Y, Y, INCY + + LFD f8, 0 * SIZE(NEW_Y) + LFD f9, 1 * SIZE(NEW_Y) + LFD f10, 2 * SIZE(NEW_Y) + LFD f11, 3 * SIZE(NEW_Y) + LFD f12, 4 * SIZE(NEW_Y) + LFD f13, 5 * SIZE(NEW_Y) + LFD f14, 6 * SIZE(NEW_Y) + LFD f15, 7 * SIZE(NEW_Y) + addi NEW_Y, NEW_Y, 8 * SIZE + + FADD f8, f8, f0 + FADD f9, f9, f1 + FADD f10, f10, f2 + FADD f11, f11, f3 + FADD f12, f12, f4 + FADD f13, f13, f5 + FADD f14, f14, f6 + FADD f15, f15, f7 + + STFD f8, 0 * SIZE(YY) + add YY, YY, INCY + STFD f9, 0 * SIZE(YY) + add YY, YY, INCY + STFD f10, 0 * SIZE(YY) + add YY, YY, INCY + STFD f11, 0 * SIZE(YY) + add YY, YY, INCY + STFD f12, 0 * SIZE(YY) + add YY, YY, INCY + STFD f13, 0 * SIZE(YY) + add YY, YY, INCY + STFD f14, 0 * SIZE(YY) + add YY, YY, INCY + STFD f15, 0 * SIZE(YY) + add YY, YY, INCY + bdnz LL(991) + .align 4 + +LL(995): + andi. J, M, 4 + ble LL(996) + + LFD f0, 0 * SIZE(Y) + add Y, Y, INCY + LFD f1, 0 * SIZE(Y) + add Y, Y, INCY + LFD f2, 0 * SIZE(Y) + add Y, Y, INCY + LFD f3, 0 * SIZE(Y) + add Y, Y, INCY + + LFD f8, 0 * SIZE(NEW_Y) + LFD f9, 1 * SIZE(NEW_Y) + LFD f10, 2 * SIZE(NEW_Y) + LFD f11, 3 * SIZE(NEW_Y) + addi NEW_Y, NEW_Y, 4 * SIZE + + FADD f8, f8, f0 + FADD f9, f9, f1 + FADD f10, f10, f2 + FADD f11, f11, f3 + + STFD f8, 0 * SIZE(YY) + add YY, YY, INCY + STFD f9, 0 * SIZE(YY) + add YY, YY, INCY + STFD f10, 0 * SIZE(YY) + add YY, YY, INCY + STFD f11, 0 * SIZE(YY) + add YY, YY, INCY + .align 4 + +LL(996): + andi. J, M, 2 + ble LL(997) + + LFD f0, 0 * SIZE(Y) + add Y, Y, INCY + LFD f1, 0 * SIZE(Y) + add Y, Y, INCY + + LFD f8, 0 * SIZE(NEW_Y) + LFD f9, 1 * SIZE(NEW_Y) + addi NEW_Y, NEW_Y, 2 * SIZE + + FADD f8, f8, f0 + FADD f9, f9, f1 + + STFD f8, 0 * SIZE(YY) + add YY, YY, INCY + STFD f9, 0 * SIZE(YY) + add YY, YY, INCY + .align 4 + +LL(997): + andi. J, M, 1 + ble LL(999) + + LFD f0, 0 * SIZE(Y) + LFD f8, 0 * SIZE(NEW_Y) + + FADD f8, f8, f0 + + STFD f8, 0 * SIZE(YY) + .align 4 + +LL(999): + li r3, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r14, 144(SP) + ld r15, 152(SP) + ld r16, 160(SP) + ld r17, 168(SP) + ld r18, 176(SP) + ld r19, 184(SP) + ld r20, 192(SP) + ld r21, 200(SP) + ld r22, 208(SP) + ld r23, 216(SP) + ld r24, 224(SP) + ld r25, 232(SP) + ld r26, 240(SP) + ld r27, 248(SP) +#else + lwz r14, 144(SP) + lwz r15, 148(SP) + lwz r16, 152(SP) + lwz r17, 156(SP) + lwz r18, 160(SP) + lwz r19, 164(SP) + lwz r20, 168(SP) + lwz r21, 172(SP) + lwz r22, 176(SP) + lwz r23, 180(SP) + lwz r24, 184(SP) + lwz r25, 188(SP) + lwz r26, 192(SP) + lwz r27, 196(SP) +#endif + + addi SP, SP, STACKSIZE + blr + + EPILOGUE +#endif diff --git a/kernel/power/symv_U.S b/kernel/power/symv_U.S new file mode 100644 index 0000000000..76cbd6461e --- /dev/null +++ b/kernel/power/symv_U.S @@ -0,0 +1,1506 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef linux +#ifndef __64BIT__ +#define M r3 +#define IS r4 +#define A r5 +#define LDA r6 +#define X r7 +#define INCX r8 +#define Y r9 +#define INCY r10 +#define BUFFER r14 +#else +#define M r3 +#define IS r4 +#define A r6 +#define LDA r7 +#define X r8 +#define INCX r9 +#define Y r10 +#define INCY r5 +#define BUFFER r14 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define M r3 +#define IS r4 +#define A r7 +#define LDA r8 +#define X r9 +#define INCX r10 +#define Y r5 +#define INCY r6 +#define BUFFER r14 +#else +#define M r3 +#define IS r4 +#define A r6 +#define LDA r7 +#define X r8 +#define INCX r9 +#define Y r10 +#define INCY r5 +#define BUFFER r14 +#endif +#endif + +#define I r11 +#define J r12 + +#define AO1 r15 +#define AO2 r16 +#define AO3 r17 +#define AO4 r18 +#define XX r19 +#define YY r20 +#define NEW_Y r21 +#define TEMP r22 +#define PREA r24 + +#define y01 f0 +#define y02 f1 +#define y03 f2 +#define y04 f3 + +#define atemp1 f4 +#define atemp2 f5 +#define atemp3 f6 +#define atemp4 f7 + +#define xtemp1 f8 +#define xtemp2 f9 +#define xtemp3 f10 +#define xtemp4 f11 + +#define xsum1 f12 +#define xsum2 f13 +#define xsum3 f14 +#define xsum4 f15 + +#define a1 f16 +#define a2 f17 +#define a3 f18 +#define a4 f19 +#define a5 f20 +#define a6 f21 +#define a7 f22 +#define a8 f23 +#define a9 f24 +#define a10 f25 +#define a11 f26 +#define a12 f27 +#define a13 f28 +#define a14 f29 +#define a15 f30 +#define a16 f31 + +#define alpha f1 + +#if defined(PPCG4) +#define PREFETCHSIZE_A 24 +#endif + +#if defined(PPC440) || defined(PPC440FP2) +#define PREFETCHSIZE_A 24 +#endif + +#ifdef PPC970 +#define PREFETCHSIZE_A 64 +#endif + +#ifdef CELL +#define 
PREFETCHSIZE_A 72 +#endif + +#ifdef POWER4 +#define PREFETCHSIZE_A 16 +#endif + +#ifdef POWER5 +#define PREFETCHSIZE_A 96 +#endif + +#ifdef POWER6 +#define PREFETCHSIZE_A 40 +#endif + +#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) +#define NOP1 +#define NOP2 +#else +#define NOP1 mr LDA, LDA +#define NOP2 mr INCX, INCX +#endif + +#ifndef NEEDPARAM + +#ifndef __64BIT__ +#define STACKSIZE 224 +#define ALPHA 200(SP) +#define FZERO 208(SP) +#else +#define STACKSIZE 280 +#define ALPHA 256(SP) +#define FZERO 264(SP) +#endif + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r0, FZERO + std r14, 144(SP) + std r15, 152(SP) + std r16, 160(SP) + std r17, 168(SP) + std r18, 176(SP) + std r19, 184(SP) + std r20, 192(SP) + std r21, 200(SP) + std r22, 208(SP) + std r23, 216(SP) + std r24, 224(SP) + std r25, 232(SP) + std r26, 240(SP) + std r27, 248(SP) +#else + stw r0, 0 + FZERO + stw r0, 4 + FZERO + stw r14, 144(SP) + stw r15, 148(SP) + stw r16, 152(SP) + stw r17, 156(SP) + stw r18, 160(SP) + stw r19, 164(SP) + stw r20, 168(SP) + stw r21, 172(SP) + stw r22, 176(SP) + stw r23, 180(SP) + stw r24, 184(SP) + stw r25, 188(SP) + stw r26, 192(SP) + stw r27, 196(SP) +#endif + +#ifdef linux +#ifndef __64BIT__ + lwz BUFFER, 56 + STACKSIZE(SP) +#else + ld INCY, 112 + STACKSIZE(SP) + ld BUFFER, 120 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifndef __64BIT__ +#ifdef DOUBLE + lwz Y, 56 + STACKSIZE(SP) + lwz INCY, 60 + STACKSIZE(SP) + lwz BUFFER, 64 + STACKSIZE(SP) +#else + lwz INCY, 56 + STACKSIZE(SP) + lwz BUFFER, 60 + STACKSIZE(SP) +#endif +#else + ld INCY, 112 + STACKSIZE(SP) + ld BUFFER, 120 + STACKSIZE(SP) +#endif +#endif + + STFD alpha, ALPHA + + slwi LDA, LDA, BASE_SHIFT + slwi INCX, INCX, BASE_SHIFT + slwi INCY, INCY, BASE_SHIFT + + li PREA, PREFETCHSIZE_A * SIZE + sub IS, M, IS + + cmpwi cr0, M, 0 + ble- LL(999) + + mullw TEMP, IS, LDA + add A, A, TEMP + + cmpwi cr0, INCX, SIZE + beq LL(05) + + mr XX, X + mr X, BUFFER + + srawi. r0, M, 3 + mtspr CTR, r0 + ble LL(03) + .align 4 + +LL(01): + LFD a1, 0 * SIZE(XX) + add XX, XX, INCX + LFD a2, 0 * SIZE(XX) + add XX, XX, INCX + LFD a3, 0 * SIZE(XX) + add XX, XX, INCX + LFD a4, 0 * SIZE(XX) + add XX, XX, INCX + LFD a5, 0 * SIZE(XX) + add XX, XX, INCX + LFD a6, 0 * SIZE(XX) + add XX, XX, INCX + LFD a7, 0 * SIZE(XX) + add XX, XX, INCX + LFD a8, 0 * SIZE(XX) + add XX, XX, INCX + + dcbt XX, PREA + dcbtst BUFFER, PREA + + STFD a1, 0 * SIZE(BUFFER) + STFD a2, 1 * SIZE(BUFFER) + STFD a3, 2 * SIZE(BUFFER) + STFD a4, 3 * SIZE(BUFFER) + STFD a5, 4 * SIZE(BUFFER) + STFD a6, 5 * SIZE(BUFFER) + STFD a7, 6 * SIZE(BUFFER) + STFD a8, 7 * SIZE(BUFFER) + + addi BUFFER, BUFFER, 8 * SIZE + bdnz LL(01) + .align 4 + +LL(03): + andi. r0, M, 7 + mtspr CTR, r0 + ble LL(05) + .align 4 + +LL(04): + LFD a1, 0 * SIZE(XX) + add XX, XX, INCX + + STFD a1, 0 * SIZE(BUFFER) + addi BUFFER, BUFFER, 1 * SIZE + bdnz LL(04) + .align 4 + +LL(05): + mr NEW_Y, Y + lfd f0, FZERO + + cmpwi cr0, INCY, SIZE + beq LL(10) + + mr NEW_Y, BUFFER + + addi r0, M, 7 + srawi. 
r0, r0, 3 + mtspr CTR, r0 + .align 4 + +LL(06): + STFD f0, 0 * SIZE(BUFFER) + STFD f0, 1 * SIZE(BUFFER) + STFD f0, 2 * SIZE(BUFFER) + STFD f0, 3 * SIZE(BUFFER) + STFD f0, 4 * SIZE(BUFFER) + STFD f0, 5 * SIZE(BUFFER) + STFD f0, 6 * SIZE(BUFFER) + STFD f0, 7 * SIZE(BUFFER) + addi BUFFER, BUFFER, 8 * SIZE + bdnz LL(06) + .align 4 + +LL(10): + addi TEMP, IS, 4 + cmpw cr0, TEMP, M + bgt LL(20) + .align 4 + +LL(11): + mr AO1, A + add AO2, A, LDA + add AO3, AO2, LDA + add AO4, AO3, LDA + add A, AO4, LDA + + slwi TEMP, IS, BASE_SHIFT + add TEMP, X, TEMP + + LFD a16, ALPHA + lfd xsum1, FZERO + + LFD atemp1, 0 * SIZE(TEMP) + LFD atemp2, 1 * SIZE(TEMP) + LFD atemp3, 2 * SIZE(TEMP) + LFD atemp4, 3 * SIZE(TEMP) + + LFD xtemp1, 0 * SIZE(X) + LFD xtemp2, 1 * SIZE(X) + LFD xtemp3, 2 * SIZE(X) + LFD xtemp4, 3 * SIZE(X) + + LFD y01, 0 * SIZE(NEW_Y) + LFD y02, 1 * SIZE(NEW_Y) + LFD y03, 2 * SIZE(NEW_Y) + LFD y04, 3 * SIZE(NEW_Y) + + LFD a1, 0 * SIZE(AO1) + FMUL atemp1, a16, atemp1 + LFD a2, 1 * SIZE(AO1) + FMUL atemp2, a16, atemp2 + LFD a3, 2 * SIZE(AO1) + FMUL atemp3, a16, atemp3 + LFD a4, 3 * SIZE(AO1) + FMUL atemp4, a16, atemp4 + + LFD a5, 0 * SIZE(AO2) + fmr xsum2, xsum1 + LFD a6, 1 * SIZE(AO2) + fmr xsum3, xsum1 + LFD a7, 2 * SIZE(AO2) + fmr xsum4, xsum1 + LFD a8, 3 * SIZE(AO2) + + LFD a9, 0 * SIZE(AO3) + LFD a10, 1 * SIZE(AO3) + LFD a11, 2 * SIZE(AO3) + LFD a12, 3 * SIZE(AO3) + + LFD a13, 0 * SIZE(AO4) + LFD a14, 1 * SIZE(AO4) + LFD a15, 2 * SIZE(AO4) + LFD a16, 3 * SIZE(AO4) + + mr XX, X + mr YY, NEW_Y + + srawi. r0, IS, 4 + mtspr CTR, r0 + ble LL(14) + .align 4 + +LL(12): + FMADD xsum1, xtemp1, a1, xsum1 + DCBT(AO1, PREA) + FMADD y01, atemp1, a1, y01 + LFD a1, 4 * SIZE(AO1) + + FMADD xsum2, xtemp1, a5, xsum2 + NOP1 + FMADD y02, atemp1, a2, y02 + NOP2 + + FMADD xsum3, xtemp1, a9, xsum3 + NOP1 + FMADD y03, atemp1, a3, y03 + NOP2 + + FMADD xsum4, xtemp1, a13, xsum4 + LFD xtemp1, 4 * SIZE(XX) + FMADD y04, atemp1, a4, y04 + NOP2 + + FMADD xsum1, xtemp2, a2, xsum1 + LFD a2, 5 * SIZE(AO1) + FMADD y01, atemp2, a5, y01 + LFD a5, 4 * SIZE(AO2) + + FMADD xsum2, xtemp2, a6, xsum2 + NOP1 + FMADD y02, atemp2, a6, y02 + LFD a6, 5 * SIZE(AO2) + + FMADD xsum3, xtemp2, a10, xsum3 + NOP1 + FMADD y03, atemp2, a7, y03 + NOP2 + + FMADD xsum4, xtemp2, a14, xsum4 + LFD xtemp2, 5 * SIZE(XX) + FMADD y04, atemp2, a8, y04 +# DCBT(X, PREX) + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD a3, 6 * SIZE(AO1) + FMADD y01, atemp3, a9, y01 + LFD a9, 4 * SIZE(AO3) + + FMADD xsum2, xtemp3, a7, xsum2 + LFD a7, 6 * SIZE(AO2) + FMADD y02, atemp3, a10, y02 + LFD a10, 5 * SIZE(AO3) + + FMADD xsum3, xtemp3, a11, xsum3 + NOP1 + FMADD y03, atemp3, a11, y03 + LFD a11, 6 * SIZE(AO3) + + FMADD xsum4, xtemp3, a15, xsum4 + LFD xtemp3, 6 * SIZE(XX) + FMADD y04, atemp3, a12, y04 + NOP2 + + FMADD xsum1, xtemp4, a4, xsum1 + LFD a4, 7 * SIZE(AO1) + FMADD y01, atemp4, a13, y01 + LFD a13, 4 * SIZE(AO4) + + FMADD xsum2, xtemp4, a8, xsum2 + LFD a8, 7 * SIZE(AO2) + FMADD y02, atemp4, a14, y02 + LFD a14, 5 * SIZE(AO4) + + FMADD xsum3, xtemp4, a12, xsum3 + LFD a12, 7 * SIZE(AO3) + FMADD y03, atemp4, a15, y03 + LFD a15, 6 * SIZE(AO4) + + FMADD xsum4, xtemp4, a16, xsum4 + LFD xtemp4, 7 * SIZE(XX) + FMADD y04, atemp4, a16, y04 + LFD a16, 7 * SIZE(AO4) + + STFD y01, 0 * SIZE(YY) + LFD y01, 4 * SIZE(YY) + STFD y02, 1 * SIZE(YY) + LFD y02, 5 * SIZE(YY) + + STFD y03, 2 * SIZE(YY) + LFD y03, 6 * SIZE(YY) + STFD y04, 3 * SIZE(YY) + LFD y04, 7 * SIZE(YY) + + FMADD xsum1, xtemp1, a1, xsum1 + DCBT(AO2, PREA) + FMADD y01, atemp1, a1, y01 + LFD a1, 8 * SIZE(AO1) + + FMADD 
xsum2, xtemp1, a5, xsum2 + NOP1 + FMADD y02, atemp1, a2, y02 + NOP2 + + FMADD xsum3, xtemp1, a9, xsum3 + NOP1 + FMADD y03, atemp1, a3, y03 + NOP2 + + FMADD xsum4, xtemp1, a13, xsum4 + LFD xtemp1, 8 * SIZE(XX) + FMADD y04, atemp1, a4, y04 + NOP2 + + FMADD xsum1, xtemp2, a2, xsum1 + LFD a2, 9 * SIZE(AO1) + FMADD y01, atemp2, a5, y01 + LFD a5, 8 * SIZE(AO2) + + FMADD xsum2, xtemp2, a6, xsum2 + NOP1 + FMADD y02, atemp2, a6, y02 + LFD a6, 9 * SIZE(AO2) + + FMADD xsum3, xtemp2, a10, xsum3 + NOP1 + FMADD y03, atemp2, a7, y03 + NOP2 + + FMADD xsum4, xtemp2, a14, xsum4 + LFD xtemp2, 9 * SIZE(XX) + FMADD y04, atemp2, a8, y04 + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD a3, 10 * SIZE(AO1) + FMADD y01, atemp3, a9, y01 + LFD a9, 8 * SIZE(AO3) + + FMADD xsum2, xtemp3, a7, xsum2 + LFD a7, 10 * SIZE(AO2) + FMADD y02, atemp3, a10, y02 + LFD a10, 9 * SIZE(AO3) + + FMADD xsum3, xtemp3, a11, xsum3 + NOP1 + FMADD y03, atemp3, a11, y03 + LFD a11, 10 * SIZE(AO3) + + FMADD xsum4, xtemp3, a15, xsum4 + LFD xtemp3, 10 * SIZE(XX) + FMADD y04, atemp3, a12, y04 + NOP2 + + FMADD xsum1, xtemp4, a4, xsum1 + LFD a4, 11 * SIZE(AO1) + FMADD y01, atemp4, a13, y01 + LFD a13, 8 * SIZE(AO4) + + FMADD xsum2, xtemp4, a8, xsum2 + LFD a8, 11 * SIZE(AO2) + FMADD y02, atemp4, a14, y02 + LFD a14, 9 * SIZE(AO4) + + FMADD xsum3, xtemp4, a12, xsum3 + LFD a12, 11 * SIZE(AO3) + FMADD y03, atemp4, a15, y03 + LFD a15, 10 * SIZE(AO4) + + FMADD xsum4, xtemp4, a16, xsum4 + LFD xtemp4, 11 * SIZE(XX) + FMADD y04, atemp4, a16, y04 + LFD a16, 11 * SIZE(AO4) + + STFD y01, 4 * SIZE(YY) + LFD y01, 8 * SIZE(YY) + STFD y02, 5 * SIZE(YY) + LFD y02, 9 * SIZE(YY) + + STFD y03, 6 * SIZE(YY) + LFD y03, 10 * SIZE(YY) + STFD y04, 7 * SIZE(YY) + LFD y04, 11 * SIZE(YY) + + + FMADD xsum1, xtemp1, a1, xsum1 + DCBT(AO3, PREA) + FMADD y01, atemp1, a1, y01 + LFD a1, 12 * SIZE(AO1) + + FMADD xsum2, xtemp1, a5, xsum2 + NOP1 + FMADD y02, atemp1, a2, y02 + NOP2 + + FMADD xsum3, xtemp1, a9, xsum3 + NOP1 + FMADD y03, atemp1, a3, y03 + NOP2 + + FMADD xsum4, xtemp1, a13, xsum4 + LFD xtemp1, 12 * SIZE(XX) + FMADD y04, atemp1, a4, y04 + NOP2 + + FMADD xsum1, xtemp2, a2, xsum1 + LFD a2, 13 * SIZE(AO1) + FMADD y01, atemp2, a5, y01 + LFD a5, 12 * SIZE(AO2) + + FMADD xsum2, xtemp2, a6, xsum2 + NOP1 + FMADD y02, atemp2, a6, y02 + LFD a6, 13 * SIZE(AO2) + + FMADD xsum3, xtemp2, a10, xsum3 + NOP1 + FMADD y03, atemp2, a7, y03 +# DCBT(Y1, PREY) + NOP2 + + FMADD xsum4, xtemp2, a14, xsum4 + LFD xtemp2, 13 * SIZE(XX) + FMADD y04, atemp2, a8, y04 + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD a3, 14 * SIZE(AO1) + FMADD y01, atemp3, a9, y01 + LFD a9, 12 * SIZE(AO3) + + FMADD xsum2, xtemp3, a7, xsum2 + LFD a7, 14 * SIZE(AO2) + FMADD y02, atemp3, a10, y02 + LFD a10,13 * SIZE(AO3) + + FMADD xsum3, xtemp3, a11, xsum3 + NOP1 + FMADD y03, atemp3, a11, y03 + LFD a11, 14 * SIZE(AO3) + + FMADD xsum4, xtemp3, a15, xsum4 + LFD xtemp3, 14 * SIZE(XX) + FMADD y04, atemp3, a12, y04 + NOP2 + + FMADD xsum1, xtemp4, a4, xsum1 + LFD a4, 15 * SIZE(AO1) + FMADD y01, atemp4, a13, y01 + LFD a13,12 * SIZE(AO4) + + FMADD xsum2, xtemp4, a8, xsum2 + LFD a8, 15 * SIZE(AO2) + FMADD y02, atemp4, a14, y02 + LFD a14, 13 * SIZE(AO4) + + FMADD xsum3, xtemp4, a12, xsum3 + LFD a12, 15 * SIZE(AO3) + FMADD y03, atemp4, a15, y03 + LFD a15, 14 * SIZE(AO4) + + FMADD xsum4, xtemp4, a16, xsum4 + LFD xtemp4, 15 * SIZE(XX) + FMADD y04, atemp4, a16, y04 + LFD a16, 15 * SIZE(AO4) + + STFD y01, 8 * SIZE(YY) + LFD y01, 12 * SIZE(YY) + STFD y02, 9 * SIZE(YY) + LFD y02, 13 * SIZE(YY) + + STFD y03, 10 * SIZE(YY) + LFD y03, 14 * SIZE(YY) + 
STFD y04, 11 * SIZE(YY) + LFD y04, 15 * SIZE(YY) + + FMADD xsum1, xtemp1, a1, xsum1 + DCBT(AO4, PREA) + FMADD y01, atemp1, a1, y01 + LFD a1, 16 * SIZE(AO1) + + FMADD xsum2, xtemp1, a5, xsum2 + NOP1 + FMADD y02, atemp1, a2, y02 + NOP2 + + FMADD xsum3, xtemp1, a9, xsum3 + NOP1 + FMADD y03, atemp1, a3, y03 + NOP2 + + FMADD xsum4, xtemp1, a13, xsum4 + LFD xtemp1, 16 * SIZE(XX) + FMADD y04, atemp1, a4, y04 + addi YY, YY, 16 * SIZE + + FMADD xsum1, xtemp2, a2, xsum1 + LFD a2, 17 * SIZE(AO1) + FMADD y01, atemp2, a5, y01 + LFD a5, 16 * SIZE(AO2) + + FMADD xsum2, xtemp2, a6, xsum2 + addi AO3, AO3, 16 * SIZE + FMADD y02, atemp2, a6, y02 + LFD a6, 17 * SIZE(AO2) + + FMADD xsum3, xtemp2, a10, xsum3 + addi AO1, AO1, 16 * SIZE + FMADD y03, atemp2, a7, y03 + addi AO2, AO2, 16 * SIZE + + FMADD xsum4, xtemp2, a14, xsum4 + LFD xtemp2, 17 * SIZE(XX) + FMADD y04, atemp2, a8, y04 + addi AO4, AO4, 16 * SIZE + + FMADD xsum1, xtemp3, a3, xsum1 + LFD a3, 2 * SIZE(AO1) + FMADD y01, atemp3, a9, y01 + LFD a9, 0 * SIZE(AO3) + + FMADD xsum2, xtemp3, a7, xsum2 + LFD a7, 2 * SIZE(AO2) + FMADD y02, atemp3, a10, y02 + LFD a10, 1 * SIZE(AO3) + + FMADD xsum3, xtemp3, a11, xsum3 + NOP1 + FMADD y03, atemp3, a11, y03 + LFD a11, 2 * SIZE(AO3) + + FMADD xsum4, xtemp3, a15, xsum4 + LFD xtemp3, 18 * SIZE(XX) + FMADD y04, atemp3, a12, y04 + addi XX, XX, 16 * SIZE + + FMADD xsum1, xtemp4, a4, xsum1 + LFD a4, 3 * SIZE(AO1) + FMADD y01, atemp4, a13, y01 + LFD a13, 0 * SIZE(AO4) + + FMADD xsum2, xtemp4, a8, xsum2 + LFD a8, 3 * SIZE(AO2) + FMADD y02, atemp4, a14, y02 + LFD a14, 1 * SIZE(AO4) + + FMADD xsum3, xtemp4, a12, xsum3 + LFD a12, 3 * SIZE(AO3) + FMADD y03, atemp4, a15, y03 + LFD a15, 2 * SIZE(AO4) + + FMADD xsum4, xtemp4, a16, xsum4 + LFD xtemp4, 3 * SIZE(XX) + FMADD y04, atemp4, a16, y04 + LFD a16, 3 * SIZE(AO4) + + STFD y01, -4 * SIZE(YY) + LFD y01, 0 * SIZE(YY) + STFD y02, -3 * SIZE(YY) + LFD y02, 1 * SIZE(YY) + + STFD y03, -2 * SIZE(YY) + LFD y03, 2 * SIZE(YY) + STFD y04, -1 * SIZE(YY) + LFD y04, 3 * SIZE(YY) + bdnz LL(12) + .align 4 + +LL(14): + andi. 
r0, IS, 8 + ble LL(15) + + FMADD xsum1, xtemp1, a1, xsum1 + NOP1 + FMADD y01, atemp1, a1, y01 + LFD a1, 4 * SIZE(AO1) + + FMADD xsum2, xtemp1, a5, xsum2 + NOP1 + FMADD y02, atemp1, a2, y02 + NOP2 + + FMADD xsum3, xtemp1, a9, xsum3 + NOP1 + FMADD y03, atemp1, a3, y03 + NOP2 + + FMADD xsum4, xtemp1, a13, xsum4 + LFD xtemp1, 4 * SIZE(XX) + FMADD y04, atemp1, a4, y04 + NOP2 + + FMADD xsum1, xtemp2, a2, xsum1 + LFD a2, 5 * SIZE(AO1) + FMADD y01, atemp2, a5, y01 + LFD a5, 4 * SIZE(AO2) + + FMADD xsum2, xtemp2, a6, xsum2 + NOP1 + FMADD y02, atemp2, a6, y02 + LFD a6, 5 * SIZE(AO2) + + FMADD xsum3, xtemp2, a10, xsum3 + NOP1 + FMADD y03, atemp2, a7, y03 + NOP2 + + FMADD xsum4, xtemp2, a14, xsum4 + LFD xtemp2, 5 * SIZE(XX) + FMADD y04, atemp2, a8, y04 + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD a3, 6 * SIZE(AO1) + FMADD y01, atemp3, a9, y01 + LFD a9, 4 * SIZE(AO3) + + FMADD xsum2, xtemp3, a7, xsum2 + LFD a7, 6 * SIZE(AO2) + FMADD y02, atemp3, a10, y02 + LFD a10, 5 * SIZE(AO3) + + FMADD xsum3, xtemp3, a11, xsum3 + NOP1 + FMADD y03, atemp3, a11, y03 + LFD a11, 6 * SIZE(AO3) + + FMADD xsum4, xtemp3, a15, xsum4 + LFD xtemp3, 6 * SIZE(XX) + FMADD y04, atemp3, a12, y04 + NOP2 + + FMADD xsum1, xtemp4, a4, xsum1 + LFD a4, 7 * SIZE(AO1) + FMADD y01, atemp4, a13, y01 + LFD a13, 4 * SIZE(AO4) + + FMADD xsum2, xtemp4, a8, xsum2 + LFD a8, 7 * SIZE(AO2) + FMADD y02, atemp4, a14, y02 + LFD a14, 5 * SIZE(AO4) + + FMADD xsum3, xtemp4, a12, xsum3 + LFD a12, 7 * SIZE(AO3) + FMADD y03, atemp4, a15, y03 + LFD a15, 6 * SIZE(AO4) + + FMADD xsum4, xtemp4, a16, xsum4 + LFD xtemp4, 7 * SIZE(XX) + FMADD y04, atemp4, a16, y04 + LFD a16, 7 * SIZE(AO4) + + STFD y01, 0 * SIZE(YY) + LFD y01, 4 * SIZE(YY) + STFD y02, 1 * SIZE(YY) + LFD y02, 5 * SIZE(YY) + + STFD y03, 2 * SIZE(YY) + LFD y03, 6 * SIZE(YY) + STFD y04, 3 * SIZE(YY) + LFD y04, 7 * SIZE(YY) + + FMADD xsum1, xtemp1, a1, xsum1 + NOP1 + FMADD y01, atemp1, a1, y01 + LFD a1, 8 * SIZE(AO1) + + FMADD xsum2, xtemp1, a5, xsum2 + NOP1 + FMADD y02, atemp1, a2, y02 + NOP2 + + FMADD xsum3, xtemp1, a9, xsum3 + NOP1 + FMADD y03, atemp1, a3, y03 + NOP2 + + FMADD xsum4, xtemp1, a13, xsum4 + LFD xtemp1, 8 * SIZE(XX) + FMADD y04, atemp1, a4, y04 + NOP2 + + FMADD xsum1, xtemp2, a2, xsum1 + LFD a2, 9 * SIZE(AO1) + FMADD y01, atemp2, a5, y01 + LFD a5, 8 * SIZE(AO2) + + FMADD xsum2, xtemp2, a6, xsum2 + NOP1 + FMADD y02, atemp2, a6, y02 + LFD a6, 9 * SIZE(AO2) + + FMADD xsum3, xtemp2, a10, xsum3 + NOP1 + FMADD y03, atemp2, a7, y03 + NOP2 + + FMADD xsum4, xtemp2, a14, xsum4 + LFD xtemp2, 9 * SIZE(XX) + FMADD y04, atemp2, a8, y04 + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD a3, 10 * SIZE(AO1) + FMADD y01, atemp3, a9, y01 + LFD a9, 8 * SIZE(AO3) + + FMADD xsum2, xtemp3, a7, xsum2 + LFD a7, 10 * SIZE(AO2) + FMADD y02, atemp3, a10, y02 + LFD a10, 9 * SIZE(AO3) + + FMADD xsum3, xtemp3, a11, xsum3 + NOP1 + FMADD y03, atemp3, a11, y03 + LFD a11, 10 * SIZE(AO3) + + FMADD xsum4, xtemp3, a15, xsum4 + LFD xtemp3, 10 * SIZE(XX) + FMADD y04, atemp3, a12, y04 + NOP2 + + FMADD xsum1, xtemp4, a4, xsum1 + LFD a4, 11 * SIZE(AO1) + FMADD y01, atemp4, a13, y01 + LFD a13, 8 * SIZE(AO4) + + FMADD xsum2, xtemp4, a8, xsum2 + LFD a8, 11 * SIZE(AO2) + FMADD y02, atemp4, a14, y02 + LFD a14, 9 * SIZE(AO4) + + FMADD xsum3, xtemp4, a12, xsum3 + LFD a12, 11 * SIZE(AO3) + FMADD y03, atemp4, a15, y03 + LFD a15, 10 * SIZE(AO4) + + FMADD xsum4, xtemp4, a16, xsum4 + LFD xtemp4, 11 * SIZE(XX) + FMADD y04, atemp4, a16, y04 + LFD a16, 11 * SIZE(AO4) + + addi AO1, AO1, 8 * SIZE + addi AO2, AO2, 8 * SIZE + addi AO3, AO3, 8 * SIZE 
+ addi AO4, AO4, 8 * SIZE + + STFD y01, 4 * SIZE(YY) + LFD y01, 8 * SIZE(YY) + STFD y02, 5 * SIZE(YY) + LFD y02, 9 * SIZE(YY) + + STFD y03, 6 * SIZE(YY) + LFD y03, 10 * SIZE(YY) + STFD y04, 7 * SIZE(YY) + LFD y04, 11 * SIZE(YY) + + addi XX, XX, 8 * SIZE + addi YY, YY, 8 * SIZE + .align 4 + +LL(15): + andi. r0, IS, 4 + ble LL(18) + + FMADD xsum1, xtemp1, a1, xsum1 + NOP1 + FMADD y01, atemp1, a1, y01 + LFD a1, 4 * SIZE(AO1) + + FMADD xsum2, xtemp1, a5, xsum2 + NOP1 + FMADD y02, atemp1, a2, y02 + NOP2 + + FMADD xsum3, xtemp1, a9, xsum3 + NOP1 + FMADD y03, atemp1, a3, y03 + NOP2 + + FMADD xsum4, xtemp1, a13, xsum4 + LFD xtemp1, 4 * SIZE(XX) + FMADD y04, atemp1, a4, y04 + NOP2 + + FMADD xsum1, xtemp2, a2, xsum1 + LFD a2, 5 * SIZE(AO1) + FMADD y01, atemp2, a5, y01 + LFD a5, 4 * SIZE(AO2) + + FMADD xsum2, xtemp2, a6, xsum2 + NOP1 + FMADD y02, atemp2, a6, y02 + LFD a6, 5 * SIZE(AO2) + + FMADD xsum3, xtemp2, a10, xsum3 + NOP1 + FMADD y03, atemp2, a7, y03 + NOP2 + + FMADD xsum4, xtemp2, a14, xsum4 + LFD xtemp2, 5 * SIZE(XX) + FMADD y04, atemp2, a8, y04 + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD a3, 6 * SIZE(AO1) + FMADD y01, atemp3, a9, y01 + LFD a9, 4 * SIZE(AO3) + + FMADD xsum2, xtemp3, a7, xsum2 + LFD a7, 6 * SIZE(AO2) + FMADD y02, atemp3, a10, y02 + LFD a10, 5 * SIZE(AO3) + + FMADD xsum3, xtemp3, a11, xsum3 + NOP1 + FMADD y03, atemp3, a11, y03 + LFD a11, 6 * SIZE(AO3) + + FMADD xsum4, xtemp3, a15, xsum4 + LFD xtemp3, 6 * SIZE(XX) + FMADD y04, atemp3, a12, y04 + NOP2 + + FMADD xsum1, xtemp4, a4, xsum1 + LFD a4, 7 * SIZE(AO1) + FMADD y01, atemp4, a13, y01 + LFD a13, 4 * SIZE(AO4) + + FMADD xsum2, xtemp4, a8, xsum2 + LFD a8, 7 * SIZE(AO2) + FMADD y02, atemp4, a14, y02 + LFD a14, 5 * SIZE(AO4) + + FMADD xsum3, xtemp4, a12, xsum3 + LFD a12, 7 * SIZE(AO3) + FMADD y03, atemp4, a15, y03 + LFD a15, 6 * SIZE(AO4) + + FMADD xsum4, xtemp4, a16, xsum4 + LFD xtemp4, 7 * SIZE(XX) + FMADD y04, atemp4, a16, y04 + LFD a16, 7 * SIZE(AO4) + + addi AO1, AO1, 4 * SIZE + addi AO2, AO2, 4 * SIZE + addi AO3, AO3, 4 * SIZE + addi AO4, AO4, 4 * SIZE + + STFD y01, 0 * SIZE(YY) + LFD y01, 4 * SIZE(YY) + STFD y02, 1 * SIZE(YY) + LFD y02, 5 * SIZE(YY) + + STFD y03, 2 * SIZE(YY) + LFD y03, 6 * SIZE(YY) + STFD y04, 3 * SIZE(YY) + LFD y04, 7 * SIZE(YY) + + addi XX, XX, 4 * SIZE + addi YY, YY, 4 * SIZE + .align 4 + +LL(18): + LFD xtemp1, ALPHA + + FMUL xsum1, xtemp1, xsum1 + FMUL xsum2, xtemp1, xsum2 + FMUL xsum3, xtemp1, xsum3 + FMUL xsum4, xtemp1, xsum4 + + FMADD xsum1, atemp1, a1, xsum1 + FMADD xsum2, atemp1, a5, xsum2 + FMADD xsum3, atemp1, a9, xsum3 + FMADD xsum4, atemp1, a13, xsum4 + + FMADD xsum1, atemp2, a5, xsum1 + FMADD xsum2, atemp2, a6, xsum2 + FMADD xsum3, atemp2, a10, xsum3 + FMADD xsum4, atemp2, a14, xsum4 + + FMADD xsum1, atemp3, a9, xsum1 + FMADD xsum2, atemp3, a10, xsum2 + FMADD xsum3, atemp3, a11, xsum3 + FMADD xsum4, atemp3, a15, xsum4 + + FMADD xsum1, atemp4, a13, xsum1 + FMADD xsum2, atemp4, a14, xsum2 + FMADD xsum3, atemp4, a15, xsum3 + FMADD xsum4, atemp4, a16, xsum4 + + FADD y01, y01, xsum1 + FADD y02, y02, xsum2 + FADD y03, y03, xsum3 + FADD y04, y04, xsum4 + + STFD y01, 0 * SIZE(YY) + STFD y02, 1 * SIZE(YY) + STFD y03, 2 * SIZE(YY) + STFD y04, 3 * SIZE(YY) + + addi TEMP, IS, 8 + addi IS, IS, 4 + cmpw cr0, TEMP, M + ble LL(11) + .align 4 + +LL(20): + andi. 
TEMP, M, 2 + ble LL(30) + + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + slwi TEMP, IS, BASE_SHIFT + add TEMP, X, TEMP + + LFD atemp1, 0 * SIZE(TEMP) + LFD atemp2, 1 * SIZE(TEMP) + + LFD a1, ALPHA + + FMUL atemp1, a1, atemp1 + FMUL atemp2, a1, atemp2 + + lfd xsum1, FZERO + fmr xsum2, xsum1 + + mr XX, X + mr YY, NEW_Y + + LFD xtemp1, 0 * SIZE(XX) + LFD xtemp2, 1 * SIZE(XX) + + LFD y01, 0 * SIZE(YY) + LFD y02, 1 * SIZE(YY) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + + LFD a5, 0 * SIZE(AO2) + LFD a6, 1 * SIZE(AO2) + + srawi. r0, IS, 1 + mtspr CTR, r0 + ble LL(28) + .align 4 + +LL(22): + FMADD xsum1, xtemp1, a1, xsum1 + FMADD xsum2, xtemp1, a5, xsum2 + + FMADD xsum1, xtemp2, a2, xsum1 + FMADD xsum2, xtemp2, a6, xsum2 + + FMADD y01, atemp1, a1, y01 + FMADD y02, atemp1, a2, y02 + FMADD y01, atemp2, a5, y01 + FMADD y02, atemp2, a6, y02 + + LFD xtemp1, 2 * SIZE(XX) + LFD xtemp2, 3 * SIZE(XX) + + LFD a1, 2 * SIZE(AO1) + LFD a2, 3 * SIZE(AO1) + + LFD a5, 2 * SIZE(AO2) + LFD a6, 3 * SIZE(AO2) + + STFD y01, 0 * SIZE(YY) + STFD y02, 1 * SIZE(YY) + + LFD y01, 2 * SIZE(YY) + LFD y02, 3 * SIZE(YY) + + addi AO1, AO1, 2 * SIZE + addi AO2, AO2, 2 * SIZE + + addi XX, XX, 2 * SIZE + addi YY, YY, 2 * SIZE + + bdnz LL(22) + .align 4 + +LL(28): + LFD xtemp1, ALPHA + + FMUL xsum1, xtemp1, xsum1 + FMUL xsum2, xtemp1, xsum2 + + FMADD xsum1, atemp1, a1, xsum1 + FMADD xsum2, atemp1, a5, xsum2 + FMADD xsum1, atemp2, a5, xsum1 + FMADD xsum2, atemp2, a6, xsum2 + + FADD y01, y01, xsum1 + FADD y02, y02, xsum2 + + STFD y01, 0 * SIZE(YY) + STFD y02, 1 * SIZE(YY) + + addi IS, IS, 2 + .align 4 + +LL(30): + andi. TEMP, M, 1 + ble LL(990) + + mr AO1, A + + slwi TEMP, IS, BASE_SHIFT + add TEMP, X, TEMP + + LFD atemp1, 0 * SIZE(TEMP) + + LFD a1, ALPHA + + FMUL atemp1, a1, atemp1 + + lfd xsum1, FZERO + + mr XX, X + mr YY, NEW_Y + + LFD xtemp1, 0 * SIZE(XX) + LFD y01, 0 * SIZE(YY) + + LFD a1, 0 * SIZE(AO1) + + mtspr CTR, IS + cmpwi cr0, IS, 0 + ble LL(38) + .align 4 + +LL(32): + FMADD xsum1, xtemp1, a1, xsum1 + + FMADD y01, atemp1, a1, y01 + + LFD xtemp1, 1 * SIZE(XX) + + LFD a1, 1 * SIZE(AO1) + + STFD y01, 0 * SIZE(YY) + + LFD y01, 1 * SIZE(YY) + + addi AO1, AO1, 1 * SIZE + + addi XX, XX, 1 * SIZE + addi YY, YY, 1 * SIZE + + bdnz LL(32) + .align 4 + +LL(38): + LFD xtemp1, ALPHA + + FMUL xsum1, xtemp1, xsum1 + + FMADD xsum1, atemp1, a1, xsum1 + + FADD y01, y01, xsum1 + + STFD y01, 0 * SIZE(YY) + .align 4 + +LL(990): + cmpwi cr0, INCY, SIZE + beq LL(999) + + mr YY, Y + + srawi. 
r0, M, 3 + mtspr CTR, r0 + ble LL(995) + .align 4 + +LL(991): + LFD f0, 0 * SIZE(Y) + add Y, Y, INCY + LFD f1, 0 * SIZE(Y) + add Y, Y, INCY + LFD f2, 0 * SIZE(Y) + add Y, Y, INCY + LFD f3, 0 * SIZE(Y) + add Y, Y, INCY + LFD f4, 0 * SIZE(Y) + add Y, Y, INCY + LFD f5, 0 * SIZE(Y) + add Y, Y, INCY + LFD f6, 0 * SIZE(Y) + add Y, Y, INCY + LFD f7, 0 * SIZE(Y) + add Y, Y, INCY + + LFD f8, 0 * SIZE(NEW_Y) + LFD f9, 1 * SIZE(NEW_Y) + LFD f10, 2 * SIZE(NEW_Y) + LFD f11, 3 * SIZE(NEW_Y) + LFD f12, 4 * SIZE(NEW_Y) + LFD f13, 5 * SIZE(NEW_Y) + LFD f14, 6 * SIZE(NEW_Y) + LFD f15, 7 * SIZE(NEW_Y) + addi NEW_Y, NEW_Y, 8 * SIZE + + FADD f8, f8, f0 + FADD f9, f9, f1 + FADD f10, f10, f2 + FADD f11, f11, f3 + FADD f12, f12, f4 + FADD f13, f13, f5 + FADD f14, f14, f6 + FADD f15, f15, f7 + + STFD f8, 0 * SIZE(YY) + add YY, YY, INCY + STFD f9, 0 * SIZE(YY) + add YY, YY, INCY + STFD f10, 0 * SIZE(YY) + add YY, YY, INCY + STFD f11, 0 * SIZE(YY) + add YY, YY, INCY + STFD f12, 0 * SIZE(YY) + add YY, YY, INCY + STFD f13, 0 * SIZE(YY) + add YY, YY, INCY + STFD f14, 0 * SIZE(YY) + add YY, YY, INCY + STFD f15, 0 * SIZE(YY) + add YY, YY, INCY + bdnz LL(991) + .align 4 + +LL(995): + andi. J, M, 4 + ble LL(996) + + LFD f0, 0 * SIZE(Y) + add Y, Y, INCY + LFD f1, 0 * SIZE(Y) + add Y, Y, INCY + LFD f2, 0 * SIZE(Y) + add Y, Y, INCY + LFD f3, 0 * SIZE(Y) + add Y, Y, INCY + + LFD f8, 0 * SIZE(NEW_Y) + LFD f9, 1 * SIZE(NEW_Y) + LFD f10, 2 * SIZE(NEW_Y) + LFD f11, 3 * SIZE(NEW_Y) + addi NEW_Y, NEW_Y, 4 * SIZE + + FADD f8, f8, f0 + FADD f9, f9, f1 + FADD f10, f10, f2 + FADD f11, f11, f3 + + STFD f8, 0 * SIZE(YY) + add YY, YY, INCY + STFD f9, 0 * SIZE(YY) + add YY, YY, INCY + STFD f10, 0 * SIZE(YY) + add YY, YY, INCY + STFD f11, 0 * SIZE(YY) + add YY, YY, INCY + .align 4 + +LL(996): + andi. J, M, 2 + ble LL(997) + + LFD f0, 0 * SIZE(Y) + add Y, Y, INCY + LFD f1, 0 * SIZE(Y) + add Y, Y, INCY + + LFD f8, 0 * SIZE(NEW_Y) + LFD f9, 1 * SIZE(NEW_Y) + addi NEW_Y, NEW_Y, 2 * SIZE + + FADD f8, f8, f0 + FADD f9, f9, f1 + + STFD f8, 0 * SIZE(YY) + add YY, YY, INCY + STFD f9, 0 * SIZE(YY) + add YY, YY, INCY + .align 4 + +LL(997): + andi. J, M, 1 + ble LL(999) + + LFD f0, 0 * SIZE(Y) + LFD f8, 0 * SIZE(NEW_Y) + + FADD f8, f8, f0 + + STFD f8, 0 * SIZE(YY) + .align 4 + +LL(999): + li r3, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r14, 144(SP) + ld r15, 152(SP) + ld r16, 160(SP) + ld r17, 168(SP) + ld r18, 176(SP) + ld r19, 184(SP) + ld r20, 192(SP) + ld r21, 200(SP) + ld r22, 208(SP) + ld r23, 216(SP) + ld r24, 224(SP) + ld r25, 232(SP) + ld r26, 240(SP) + ld r27, 248(SP) +#else + lwz r14, 144(SP) + lwz r15, 148(SP) + lwz r16, 152(SP) + lwz r17, 156(SP) + lwz r18, 160(SP) + lwz r19, 164(SP) + lwz r20, 168(SP) + lwz r21, 172(SP) + lwz r22, 176(SP) + lwz r23, 180(SP) + lwz r24, 184(SP) + lwz r25, 188(SP) + lwz r26, 192(SP) + lwz r27, 196(SP) +#endif + + addi SP, SP, STACKSIZE + blr + + EPILOGUE +#endif diff --git a/kernel/power/trsm_kernel_LN.S b/kernel/power/trsm_kernel_LN.S new file mode 100644 index 0000000000..6be8e286d6 --- /dev/null +++ b/kernel/power/trsm_kernel_LN.S @@ -0,0 +1,3652 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. 
*/ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA 296(SP) +#define FZERO 304(SP) +#else +#define STACKSIZE 240 +#define ALPHA 224(SP) +#define FZERO 232(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#define OFFSET r6 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#define AORIG r18 +#define TEMP r19 +#define KK r20 +#define I r21 +#define J r22 +#define AO r23 +#define BO r24 +#define CO1 r25 +#define CO2 r26 +#define CO3 r27 +#define CO4 r28 + +#define PREA r29 +#define PREB r30 +#define PREC r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 
240(SP) + std r18, 248(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) +#endif + + stw r0, FZERO + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif + + slwi LDC, LDC, BASE_SHIFT + +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 112 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 60 + STACKSIZE(SP) +#else + lwz OFFSET, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#ifdef LN + mullw r0, M, K + slwi r0, r0, BASE_SHIFT + add A, A, r0 + + slwi r0, M, BASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, BASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + +#ifndef PREFETCHTEST +#ifdef LN + li PREC, -4 * SIZE +#else + li PREC, 4 * SIZE +#endif + +#else + +#ifdef linux +#ifndef __64BIT__ + mr PREA, r10 + lwz PREB, 8 + STACKSIZE(SP) + lwz PREC, 12 + STACKSIZE(SP) +#else + ld PREA, 112 + STACKSIZE(SP) + ld PREB, 120 + STACKSIZE(SP) + ld PREC, 128 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld PREA, 112 + STACKSIZE(SP) + ld PREB, 120 + STACKSIZE(SP) + ld PREC, 128 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz PREA, 60 + STACKSIZE(SP) + lwz PREB, 64 + STACKSIZE(SP) + lwz PREC, 68 + STACKSIZE(SP) +#else + lwz PREA, 56 + STACKSIZE(SP) + lwz PREB, 60 + STACKSIZE(SP) + lwz PREC, 64 + STACKSIZE(SP) +#endif +#endif +#endif + +#endif + +#ifndef PREFETCHTEST +#ifdef PPC970 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 5 * SIZE | 1) + li PREB, (16 * 5 * SIZE | 3) +#else + li PREA, (16 * 14 * SIZE | 1) + li PREB, (16 * 8 * SIZE | 3) +#endif +#endif +#ifdef POWER4 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 1 * SIZE + 16) + li PREB, (16 * 1 * SIZE + 16) +#else + li PREA, (16 * 2 * SIZE + 16) + li PREB, (16 * 2 * SIZE + 16) +#endif +#endif +#ifdef POWER5 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 7 * SIZE | 1) + li PREB, (16 * 7 * SIZE | 3) +#else + li PREA, (16 * 12 * SIZE | 1) + li PREB, (16 * 6 * SIZE | 3) +#endif +#endif +#endif + + lfs f0, FZERO + + srawi. J, N, 2 + ble LL(40) + .align 4 + +LL(10): + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 2 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO4, LDC +#endif + +LL(30): + andi. I, M, 1 + ble LL(20) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. 
r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(35) + .align 5 + +LL(32): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f1, f17, f24, f1 + FMADD f5, f17, f25, f5 + FMADD f9, f17, f26, f9 + FMADD f13, f17, f27, f13 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMADD f0, f18, f20, f0 + FMADD f4, f18, f21, f4 + FMADD f8, f18, f22, f8 + FMADD f12, f18, f23, f12 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f1, f19, f24, f1 + FMADD f5, f19, f25, f5 + FMADD f9, f19, f26, f9 + FMADD f13, f19, f27, f13 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 16 * SIZE + DCBT(BO, PREB) + bdnz LL(32) + + fadd f0, f1, f0 + fadd f4, f5, f4 + fadd f8, f9, f8 + fadd f12, f13, f12 + .align 4 + +LL(35): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(38) + .align 4 + +LL(36): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f16, 1 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(36) + .align 4 + +LL(38): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 +#else + LFD f16, 0 * SIZE(AO) + LFD f20, 1 * SIZE(AO) + LFD f24, 2 * SIZE(AO) + LFD f28, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f4, f20, f4 + FSUB f8, f24, f8 + FSUB f12, f28, f12 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FNMSUB f4, f17, f0, f4 + FNMSUB f8, f18, f0, f8 + FNMSUB f12, f19, f0, f12 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FMUL f4, f16, f4 + FNMSUB f8, f17, f4, f8 + FNMSUB f12, f18, f4, f12 + FMUL f8, f19, f8 + FNMSUB f12, f20, f8, f12 + FMUL f12, f21, f12 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) 
+ LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FNMSUB f8, f17, f12, f8 + FNMSUB f4, f18, f12, f4 + FNMSUB f0, f19, f12, f0 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + + FMUL f8, f16, f8 + + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FNMSUB f4, f17, f8, f4 + FNMSUB f0, f18, f8, f0 + + FMUL f4, f19, f4 + FNMSUB f0, f20, f4, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE + subi CO3, CO3, 1 * SIZE + subi CO4, CO4, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f4, 1 * SIZE(AO) + STFD f8, 2 * SIZE(AO) + STFD f12, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f8, 0 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f12, f0 + fmr f13, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE + addi CO2, CO2, 1 * SIZE + addi CO3, CO3, 1 * SIZE + addi CO4, CO4, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +LL(20): + andi. I, M, 2 + ble LL(09) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(25) + .align 5 + +LL(22): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f2, f18, f24, f2 + FMADD f3, f19, f24, f3 + FMADD f6, f18, f25, f6 + FMADD f7, f19, f25, f7 + + FMADD f10, f18, f26, f10 + FMADD f11, f19, f26, f11 + FMADD f14, f18, f27, f14 + FMADD f15, f19, f27, f15 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f2, f18, f24, f2 + FMADD f3, f19, f24, f3 + FMADD f6, f18, f25, f6 + FMADD f7, f19, f25, f7 + + FMADD f10, f18, f26, f10 + FMADD f11, f19, f26, f11 + FMADD f14, f18, f27, f14 + FMADD f15, f19, f27, f15 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 16 * SIZE + DCBT(BO, PREB) + bdnz LL(22) + + fadd f0, f2, f0 + fadd f1, f3, f1 + fadd f4, f6, f4 + fadd f5, f7, f5 + fadd f8, f10, f8 + fadd f9, f11, f9 + fadd f12, f14, f12 + fadd f13, f15, f13 + .align 4 + +LL(25): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(28) + .align 4 + +LL(26): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(26) + .align 4 + +LL(28): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 + + FSUB f1, f20, f1 + FSUB f5, f21, f5 + FSUB f9, f22, f9 + FSUB f13, f23, f13 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f28, 6 * SIZE(AO) + LFD f29, 7 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f4, f20, f4 + FSUB f5, f21, f5 + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f12, f28, f12 + FSUB f13, f29, f13 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FMUL f9, f19, f9 + FMUL f13, f19, f13 + + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FNMSUB f8, f20, f9, f8 + FNMSUB f12, f20, f13, f12 + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 + + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + FNMSUB f9, f17, f8, f9 + FNMSUB f13, f17, f12, f13 + + LFD f17, 3 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f5, f17, f5 + FMUL f9, f17, f9 + FMUL f13, f17, f13 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f12, f19, f0, f12 + FNMSUB f13, f19, f1, f13 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FMUL f4, f16, f4 + FMUL f5, f16, f5 + FNMSUB f8, f17, f4, f8 + FNMSUB f9, f17, f5, f9 + FNMSUB f12, f18, f4, f12 + FNMSUB f13, f18, f5, f13 + + FMUL f8, f19, f8 + FMUL f9, f19, f9 + FNMSUB f12, f20, f8, f12 + FNMSUB f13, f20, f9, f13 + FMUL f12, f21, f12 + FMUL f13, f21, f13 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FMUL f13, f16, f13 + FNMSUB f8, f17, f12, f8 + FNMSUB f9, f17, f13, f9 + FNMSUB f4, f18, f12, f4 + FNMSUB f5, f18, f13, f5 + FNMSUB f0, f19, f12, f0 + FNMSUB f1, f19, f13, f1 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f8, f16, f8 + FMUL f9, f16, f9 + FNMSUB f4, f17, f8, f4 + FNMSUB f5, f17, f9, f5 + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + + FMUL f4, f19, f4 + 
FMUL f5, f19, f5 + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE + subi CO3, CO3, 2 * SIZE + subi CO4, CO4, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) + + STFD f1, 4 * SIZE(BO) + STFD f5, 5 * SIZE(BO) + STFD f9, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f4, 2 * SIZE(AO) + STFD f5, 3 * SIZE(AO) + + STFD f8, 4 * SIZE(AO) + STFD f9, 5 * SIZE(AO) + STFD f12, 6 * SIZE(AO) + STFD f13, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + addi CO3, CO3, 2 * SIZE + addi CO4, CO4, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +LL(09): + srawi. I, M, 2 + ble LL(39) + .align 4 + +LL(11): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbt CO1, PREC + dcbt CO2, PREC + dcbt CO3, PREC + dcbt CO4, PREC + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + dcbt CO1, PREC + dcbt CO2, PREC + dcbt CO3, PREC + dcbt CO4, PREC + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(15) + .align 4 + +LL(12): + FMADD f0, f16, f20, f0 + FMADD f5, f17, f21, f5 + FMADD f10, f18, f22, f10 + FMADD f15, f19, f23, f15 + + LFD f28, 4 * SIZE(BO) + LFD f29, 5 * SIZE(BO) + LFD f30, 6 * SIZE(BO) + LFD f31, 7 * SIZE(BO) + + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + FMADD f4, f16, f21, f4 + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + + FMADD f11, f19, f22, f11 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + FMADD f14, f18, f23, f14 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f24, f28, f0 + FMADD f5, f25, f29, f5 + FMADD f10, f26, f30, f10 + FMADD f15, f27, f31, f15 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMADD f1, f25, f28, f1 + FMADD f2, f26, f28, f2 + FMADD f3, f27, f28, f3 + FMADD f4, f24, f29, f4 + + FMADD f6, f26, f29, f6 + FMADD f7, f27, f29, f7 + FMADD f8, f24, f30, f8 + FMADD f9, f25, f30, f9 + + FMADD f11, f27, f30, f11 + FMADD f12, f24, f31, f12 + FMADD f13, f25, f31, f13 + FMADD f14, f26, f31, f14 + + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f5, f17, f21, f5 + FMADD f10, f18, f22, f10 + FMADD f15, f19, f23, f15 + + LFD f24, 12 * SIZE(AO) + LFD f25, 13 * SIZE(AO) + LFD f26, 14 * SIZE(AO) + LFD f27, 15 * SIZE(AO) + + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + FMADD f4, f16, f21, f4 + + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + + FMADD f11, f19, f22, f11 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + FMADD f14, f18, f23, f14 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f0, f24, f28, f0 + FMADD f5, f25, f29, f5 + FMADD f10, f26, f30, f10 + FMADD f15, f27, f31, f15 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + FMADD f1, f25, f28, f1 + FMADD f2, f26, f28, f2 + FMADD f3, f27, f28, f3 + FMADD f4, f24, f29, f4 + + FMADD f6, f26, f29, f6 + FMADD f7, f27, f29, f7 + FMADD f8, f24, f30, f8 + FMADD f9, f25, f30, f9 + + FMADD f11, f27, f30, f11 + FMADD f12, f24, f31, f12 + FMADD f13, f25, f31, f13 + FMADD f14, f26, f31, f14 + + addi AO, AO, 16 * SIZE + addi BO, BO, 16 * SIZE + +#ifdef PPC970 +#ifndef ALLOC_HUGETLB + DCBT(AO, PREA) +#endif + DCBT(BO, PREB) +#endif + +#ifdef POWER4 +#ifndef ALLOC_HUGETLB + DCBT(AO, PREA) +#endif + DCBT(BO, PREB) +#endif + +#ifdef POWER5 + DCBT(AO, PREA) + DCBT(BO, PREB) +#endif + bdnz LL(12) + .align 4 + +LL(15): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(18) + .align 4 + +LL(16): + FMADD f0, f16, f20, f0 + FMADD f5, f17, f21, f5 + FMADD f10, f18, f22, f10 + FMADD f15, f19, f23, f15 + + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + FMADD f4, f16, f21, f4 + + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + + FMADD f11, f19, f22, f11 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + FMADD f14, f18, f23, f14 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(16) + .align 4 + +LL(18): +#if defined(LN) || defined(RT) + subi r0, KK, 4 + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + LFD f24, 8 * SIZE(BO) + LFD f25, 9 * SIZE(BO) + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 + + FSUB f1, f20, f1 + FSUB f5, f21, f5 + FSUB f9, f22, f9 + FSUB f13, f23, f13 + + FSUB f2, f24, f2 + FSUB f6, f25, f6 + FSUB f10, f26, f10 + FSUB f14, f27, f14 + + FSUB f3, f28, f3 + FSUB f7, f29, f7 + FSUB f11, f30, f11 + FSUB f15, f31, f15 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + LFD f24, 8 * SIZE(AO) + LFD f25, 9 * SIZE(AO) + LFD f26, 10 * SIZE(AO) + LFD f27, 11 * SIZE(AO) + + LFD f28, 12 * SIZE(AO) + LFD f29, 13 * SIZE(AO) + LFD f30, 14 * SIZE(AO) + LFD f31, 15 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f4, f20, f4 + FSUB f5, f21, f5 + FSUB f6, f22, f6 + FSUB f7, f23, f7 + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f10, f26, f10 + FSUB f11, f27, f11 + + FSUB f12, f28, f12 + FSUB f13, f29, f13 + FSUB f14, f30, f14 + FSUB f15, f31, f15 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FMUL f7, f16, f7 + FMUL f11, f16, f11 + FMUL f15, f16, f15 + + FNMSUB f2, f17, f3, f2 + FNMSUB f6, f17, f7, f6 + FNMSUB f10, f17, f11, f10 + FNMSUB f14, f17, f15, f14 + + FNMSUB f1, f18, f3, f1 + FNMSUB f5, f18, f7, f5 + FNMSUB f9, f18, f11, f9 + FNMSUB f13, f18, f15, f13 + + FNMSUB f0, f19, f3, f0 + FNMSUB f4, f19, f7, f4 + FNMSUB f8, f19, f11, f8 + FNMSUB f12, f19, f15, f12 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + FMUL f2, f16, f2 + FMUL f6, f16, f6 + FMUL f10, f16, f10 + FMUL f14, f16, f14 + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FNMSUB f1, f17, f2, f1 + FNMSUB f5, f17, f6, f5 + FNMSUB f9, f17, f10, f9 + FNMSUB f13, f17, f14, f13 + + FNMSUB f0, f18, f2, f0 + FNMSUB f4, f18, f6, f4 + FNMSUB f8, f18, f10, f8 + FNMSUB f12, f18, f14, f12 + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FMUL f9, f19, f9 + FMUL f13, f19, f13 + + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FNMSUB f8, f20, f9, f8 + FNMSUB f12, f20, f13, f12 + + FMUL f0, f21, f0 + FMUL 
f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 + + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + FNMSUB f9, f17, f8, f9 + FNMSUB f13, f17, f12, f13 + + FNMSUB f2, f18, f0, f2 + FNMSUB f6, f18, f4, f6 + FNMSUB f10, f18, f8, f10 + FNMSUB f14, f18, f12, f14 + + FNMSUB f3, f19, f0, f3 + FNMSUB f7, f19, f4, f7 + FNMSUB f11, f19, f8, f11 + FNMSUB f15, f19, f12, f15 + + LFD f16, 5 * SIZE(AO) + LFD f17, 6 * SIZE(AO) + LFD f18, 7 * SIZE(AO) + LFD f19, 10 * SIZE(AO) + + FMUL f1, f16, f1 + FMUL f5, f16, f5 + FMUL f9, f16, f9 + FMUL f13, f16, f13 + + LFD f20, 11 * SIZE(AO) + LFD f21, 15 * SIZE(AO) + + FNMSUB f2, f17, f1, f2 + FNMSUB f6, f17, f5, f6 + FNMSUB f10, f17, f9, f10 + FNMSUB f14, f17, f13, f14 + + FNMSUB f3, f18, f1, f3 + FNMSUB f7, f18, f5, f7 + FNMSUB f11, f18, f9, f11 + FNMSUB f15, f18, f13, f15 + + FMUL f2, f19, f2 + FMUL f6, f19, f6 + FMUL f10, f19, f10 + FMUL f14, f19, f14 + + FNMSUB f3, f20, f2, f3 + FNMSUB f7, f20, f6, f7 + FNMSUB f11, f20, f10, f11 + FNMSUB f15, f20, f14, f15 + + FMUL f3, f21, f3 + FMUL f7, f21, f7 + FMUL f11, f21, f11 + FMUL f15, f21, f15 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 + + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f6, f17, f2, f6 + FNMSUB f7, f17, f3, f7 + + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f10, f18, f2, f10 + FNMSUB f11, f18, f3, f11 + + FNMSUB f12, f19, f0, f12 + FNMSUB f13, f19, f1, f13 + FNMSUB f14, f19, f2, f14 + FNMSUB f15, f19, f3, f15 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + FMUL f4, f16, f4 + FMUL f5, f16, f5 + FMUL f6, f16, f6 + FMUL f7, f16, f7 + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FNMSUB f8, f17, f4, f8 + FNMSUB f9, f17, f5, f9 + FNMSUB f10, f17, f6, f10 + FNMSUB f11, f17, f7, f11 + + FNMSUB f12, f18, f4, f12 + FNMSUB f13, f18, f5, f13 + FNMSUB f14, f18, f6, f14 + FNMSUB f15, f18, f7, f15 + + FMUL f8, f19, f8 + FMUL f9, f19, f9 + FMUL f10, f19, f10 + FMUL f11, f19, f11 + + FNMSUB f12, f20, f8, f12 + FNMSUB f13, f20, f9, f13 + FNMSUB f14, f20, f10, f14 + FNMSUB f15, f20, f11, f15 + + FMUL f12, f21, f12 + FMUL f13, f21, f13 + FMUL f14, f21, f14 + FMUL f15, f21, f15 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FMUL f13, f16, f13 + FMUL f14, f16, f14 + FMUL f15, f16, f15 + + FNMSUB f8, f17, f12, f8 + FNMSUB f9, f17, f13, f9 + FNMSUB f10, f17, f14, f10 + FNMSUB f11, f17, f15, f11 + + FNMSUB f4, f18, f12, f4 + FNMSUB f5, f18, f13, f5 + FNMSUB f6, f18, f14, f6 + FNMSUB f7, f18, f15, f7 + + FNMSUB f0, f19, f12, f0 + FNMSUB f1, f19, f13, f1 + FNMSUB f2, f19, f14, f2 + FNMSUB f3, f19, f15, f3 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + + FMUL f8, f16, f8 + FMUL f9, f16, f9 + FMUL f10, f16, f10 + FMUL f11, f16, f11 + + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FNMSUB f4, f17, f8, f4 + FNMSUB f5, f17, f9, f5 + FNMSUB f6, f17, f10, f6 + FNMSUB f7, f17, f11, f7 + + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + FNMSUB f2, f18, f10, f2 + FNMSUB f3, f18, f11, f3 + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FMUL f6, f19, f6 + 
FMUL f7, f19, f7 + + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + FNMSUB f2, f20, f6, f2 + FNMSUB f3, f20, f7, f3 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE + subi CO3, CO3, 4 * SIZE + subi CO4, CO4, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) + + STFD f1, 4 * SIZE(BO) + STFD f5, 5 * SIZE(BO) + STFD f9, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) + + STFD f2, 8 * SIZE(BO) + STFD f6, 9 * SIZE(BO) + STFD f10, 10 * SIZE(BO) + STFD f14, 11 * SIZE(BO) + + STFD f3, 12 * SIZE(BO) + STFD f7, 13 * SIZE(BO) + STFD f11, 14 * SIZE(BO) + STFD f15, 15 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f4, 4 * SIZE(AO) + STFD f5, 5 * SIZE(AO) + STFD f6, 6 * SIZE(AO) + STFD f7, 7 * SIZE(AO) + + STFD f8, 8 * SIZE(AO) + STFD f9, 9 * SIZE(AO) + STFD f10, 10 * SIZE(AO) + STFD f11, 11 * SIZE(AO) + + STFD f12, 12 * SIZE(AO) + STFD f13, 13 * SIZE(AO) + STFD f14, 14 * SIZE(AO) + STFD f15, 15 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f10, 2 * SIZE(CO3) + STFD f11, 3 * SIZE(CO3) + + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + STFD f14, 2 * SIZE(CO4) + STFD f15, 3 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + addi CO3, CO3, 4 * SIZE + addi CO4, CO4, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ LL(11) + .align 4 + + +LL(39): +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 4 +#endif + +#ifdef RT + subi KK, KK, 4 +#endif + + addic. J, J, -1 + lfs f0, FZERO + bgt LL(10) + .align 4 + +LL(40): + andi. J, N, 2 + ble LL(70) + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO2, LDC +#endif + +LL(60): + andi. I, M, 1 + ble LL(50) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. 
r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(65) + .align 5 + +LL(62): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f17, f22, f2 + FMADD f3, f17, f23, f3 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f18, f24, f0 + FMADD f1, f18, f25, f1 + FMADD f2, f19, f26, f2 + FMADD f3, f19, f27, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(62) + .align 4 + +LL(65): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(68) + .align 4 + +LL(66): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + + LFD f16, 1 * SIZE(AO) + + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(66) + .align 4 + +LL(68): + FADD f0, f2, f0 + FADD f1, f3, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f20, 1 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + FMUL f1, f18, f1 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 0 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE + addi CO2, CO2, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +LL(50): + andi. 
I, M, 2 + ble LL(41) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(55) + .align 5 + +LL(52): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f16, f21, f2 + FMADD f3, f17, f21, f3 + + FMADD f4, f18, f22, f4 + FMADD f5, f19, f22, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f16, f24, f0 + FMADD f1, f17, f24, f1 + FMADD f2, f16, f25, f2 + FMADD f3, f17, f25, f3 + + FMADD f4, f18, f26, f4 + FMADD f5, f19, f26, f5 + FMADD f6, f18, f27, f6 + FMADD f7, f19, f27, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + DCBT(BO, PREB) + bdnz LL(52) + .align 4 + +LL(55): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(58) + .align 4 + +LL(56): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f16, f21, f2 + FMADD f3, f17, f21, f3 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(56) + .align 4 + +LL(58): + FADD f0, f4, f0 + FADD f1, f5, f1 + FADD f2, f6, f2 + FADD f3, f7, f3 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f2, f17, f2 + FSUB f1, f20, f1 + FSUB f3, f21, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f20, f2 + FSUB f3, f21, f3 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FMUL f3, f19, f3 + + FNMSUB f0, f20, f1, f0 + FNMSUB f2, f20, f3, f2 + + FMUL f0, f21, f0 + FMUL f2, f21, f2 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f2, f16, f2 + FNMSUB f1, f17, f0, f1 + FNMSUB f3, f17, f2, f3 + + LFD f17, 3 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f3, f17, f3 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + + FNMSUB f2, f17, f0, f2 + FNMSUB f3, f17, f1, f3 + FMUL f2, f18, f2 + FMUL f3, f18, f3 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f2, f19, f2 + FMUL f3, f19, f3 + FNMSUB f0, f20, f2, f0 + FNMSUB f1, f20, f3, f1 + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f2, 1 * SIZE(BO) + STFD f1, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +LL(41): + srawi. I, M, 2 + ble LL(69) + .align 4 + +LL(42): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbt CO1, PREC + dcbt CO2, PREC + + srawi. 
r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + dcbt CO1, PREC + dcbt CO2, PREC + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(45) + .align 5 + +LL(43): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + FMADD f4, f16, f23, f4 + FMADD f5, f17, f23, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 12 * SIZE(AO) + LFD f17, 13 * SIZE(AO) + LFD f18, 14 * SIZE(AO) + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + FMADD f4, f16, f23, f4 + FMADD f5, f17, f23, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 8 * SIZE + DCBT(BO, PREB) + bdnz LL(43) + .align 4 + +LL(45): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(48) + .align 4 + +LL(46): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(46) + .align 4 + +LL(48): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f28, 6 * SIZE(BO) + LFD f29, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f1, f20, f1 + FSUB f5, f21, f5 + + FSUB f2, f24, f2 + FSUB f6, f25, f6 + FSUB f3, f28, f3 + FSUB f7, f29, f7 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f4, f20, f4 + FSUB f5, f21, f5 + FSUB f6, f22, f6 + FSUB f7, f23, f7 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FMUL f7, f16, f7 + FNMSUB f2, f17, f3, f2 + FNMSUB f6, f17, f7, f6 + FNMSUB f1, f18, f3, f1 + FNMSUB f5, f18, f7, f5 + FNMSUB f0, f19, f3, f0 + FNMSUB f4, f19, f7, f4 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f2, f16, f2 + FMUL f6, f16, f6 + FNMSUB f1, f17, f2, f1 + FNMSUB f5, f17, f6, f5 + FNMSUB f0, f18, f2, f0 + FNMSUB f4, f18, f6, f4 + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FMUL f0, f21, f0 + FMUL f4, f21, f4 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + + FNMSUB f2, f18, f0, f2 + FNMSUB f6, f18, f4, f6 + FNMSUB f3, f19, f0, f3 + FNMSUB f7, f19, f4, f7 + + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f5, f17, f5 + + FNMSUB f2, f18, f1, f2 + FNMSUB f6, f18, f5, f6 + + FNMSUB f3, f19, f1, f3 + FNMSUB f7, f19, f5, f7 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMUL f2, f18, f2 + FMUL f6, f18, f6 + + FNMSUB f3, f19, f2, f3 + FNMSUB f7, f19, f6, f7 + + LFD f19, 15 * SIZE(AO) + + FMUL f3, f19, f3 + FMUL f7, f19, f7 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 + + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f6, f17, f2, f6 + FNMSUB f7, f17, f3, f7 + + FMUL f4, f18, f4 + FMUL f5, f18, f5 + FMUL f6, f18, f6 + FMUL f7, f18, f7 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FMUL f6, f19, f6 + FMUL f7, f19, f7 + + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + FNMSUB f2, f20, f6, f2 + FNMSUB f3, f20, f7, f3 + + FMUL f0, 
f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f1, 2 * SIZE(BO) + STFD f5, 3 * SIZE(BO) + + STFD f2, 4 * SIZE(BO) + STFD f6, 5 * SIZE(BO) + STFD f3, 6 * SIZE(BO) + STFD f7, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f4, 4 * SIZE(AO) + STFD f5, 5 * SIZE(AO) + STFD f6, 6 * SIZE(AO) + STFD f7, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ LL(42) + .align 4 + +LL(69): +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + lfs f0, FZERO + .align 4 + +LL(70): + andi. J, N, 1 + ble LL(999) + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO1, LDC +#endif + .align 4 + +LL(90): + andi. I, M, 1 + ble LL(80) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, KK, 3 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 3 + mtspr CTR, r0 +#endif + ble LL(95) + .align 5 + +LL(92): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(92) + .align 4 + +LL(95): +#if defined(LT) || defined(RN) + andi. r0, KK, 7 +#else + andi. 
r0, TEMP, 7 +#endif + mtspr CTR, r0 + ble+ LL(98) + .align 4 + +LL(96): + FMADD f0, f16, f20, f0 + LFD f16, 1 * SIZE(AO) + LFD f20, 1 * SIZE(BO) + addi BO, BO, 1 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(96) + .align 4 + +LL(98): + FADD f0, f1, f0 + FADD f2, f3, f2 + FADD f0, f2, f0 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + FSUB f0, f16, f0 +#else + LFD f16, 0 * SIZE(AO) + FSUB f0, f16, f0 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + FMUL f0, f16, f0 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + FMUL f0, f16, f0 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +LL(80): + andi. I, M, 2 + ble LL(71) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(85) + .align 5 + +LL(82): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f21, f2 + FMADD f3, f19, f21, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f23, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 4 * SIZE + DCBT(BO, PREB) + bdnz LL(82) + .align 4 + +LL(85): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(88) + .align 4 + +LL(86): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 1 * SIZE(BO) + + addi BO, BO, 1 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(86) + .align 4 + +LL(88): + FADD f0, f2, f0 + FADD f1, f3, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f20, 1 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + + LFD f17, 3 * SIZE(AO) + FMUL f1, f17, f1 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +LL(71): + srawi. I, M, 2 + ble LL(999) + .align 4 + +LL(72): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbt CO1, PREC + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + dcbt CO1, PREC + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(75) + .align 5 + +LL(73): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f21, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f21, f2 + FMADD f3, f19, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + LFD f16, 12 * SIZE(AO) + LFD f17, 13 * SIZE(AO) + LFD f18, 14 * SIZE(AO) + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f23, f0 + FMADD f1, f17, f23, f1 + FMADD f2, f18, f23, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 4 * SIZE + DCBT(BO, PREB) + bdnz LL(73) + .align 4 + +LL(75): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(78) + .align 4 + +LL(76): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 1 * SIZE(BO) + + addi BO, BO, 1 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(76) + .align 4 + +LL(78): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f20, 1 * SIZE(BO) + LFD f24, 2 * SIZE(BO) + LFD f28, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 + FSUB f2, f24, f2 + FSUB f3, f28, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FNMSUB f2, f17, f3, f2 + FNMSUB f1, f18, f3, f1 + FNMSUB f0, f19, f3, f0 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f2, f16, f2 + FNMSUB f1, f17, f2, f1 + FNMSUB f0, f18, f2, f0 + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f19, f0, f3 + + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMUL f1, f17, f1 + FNMSUB f2, f18, f1, f2 + FNMSUB f3, f19, f1, f3 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMUL f2, f18, f2 + FNMSUB f3, f19, f2, f3 + + LFD f19, 15 * SIZE(AO) + + FMUL f3, f19, f3 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) 
+#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ LL(72) + .align 4 + +LL(999): + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/trsm_kernel_LT.S b/kernel/power/trsm_kernel_LT.S new file mode 100644 index 0000000000..0d287440b8 --- /dev/null +++ b/kernel/power/trsm_kernel_LT.S @@ -0,0 +1,3665 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA 296(SP) +#define FZERO 304(SP) +#else +#define STACKSIZE 240 +#define ALPHA 224(SP) +#define FZERO 232(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#define OFFSET r6 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#define AORIG r18 +#define TEMP r19 +#define KK r20 +#define I r21 +#define J r22 +#define AO r23 +#define BO r24 +#define CO1 r25 +#define CO2 r26 +#define CO3 r27 +#define CO4 r28 + +#define PREA r29 +#define PREB r30 +#define PREC r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) +#endif + + stw r0, FZERO + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif + + slwi LDC, LDC, BASE_SHIFT + +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 112 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 60 + STACKSIZE(SP) +#else + lwz OFFSET, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#ifdef LN + mullw r0, M, K + slwi r0, r0, BASE_SHIFT + add A, A, r0 + + slwi r0, M, BASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, BASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + +#ifndef PREFETCHTEST +#if defined(TRSMKERNEL) && defined(LN) +/* Direction is special */ +#ifdef PPC970 + li PREC, -4 * SIZE +#endif +#ifdef POWER4 + li PREC, -4 * SIZE +#endif +#ifdef POWER5 + li PREC, -4 * SIZE +#endif +#ifdef CELL + li PREC, -4 * SIZE +#endif +#else +/* Normal 
prefetch */ +#ifdef PPC970 + li PREC, 4 * SIZE +#endif +#ifdef POWER4 + li PREC, 4 * SIZE /* is 12 best? */ +#endif +#ifdef POWER5 + li PREC, 3 * SIZE +#endif +#endif + +#else + +#ifdef linux +#ifndef __64BIT__ + mr PREA, r10 + lwz PREB, 8 + STACKSIZE(SP) + lwz PREC, 12 + STACKSIZE(SP) +#else + ld PREA, 112 + STACKSIZE(SP) + ld PREB, 120 + STACKSIZE(SP) + ld PREC, 128 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld PREA, 112 + STACKSIZE(SP) + ld PREB, 120 + STACKSIZE(SP) + ld PREC, 128 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz PREA, 60 + STACKSIZE(SP) + lwz PREB, 64 + STACKSIZE(SP) + lwz PREC, 68 + STACKSIZE(SP) +#else + lwz PREA, 56 + STACKSIZE(SP) + lwz PREB, 60 + STACKSIZE(SP) + lwz PREC, 64 + STACKSIZE(SP) +#endif +#endif +#endif + +#endif + +#ifndef PREFETCHTEST +#ifdef PPC970 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 5 * SIZE | 1) + li PREB, (16 * 5 * SIZE | 3) +#else + li PREA, (16 * 14 * SIZE | 1) + li PREB, (16 * 8 * SIZE | 3) +#endif +#endif +#ifdef POWER4 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 1 * SIZE + 16) + li PREB, (16 * 1 * SIZE + 16) +#else + li PREA, (16 * 2 * SIZE + 16) + li PREB, (16 * 2 * SIZE + 16) +#endif +#endif +#ifdef POWER5 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 7 * SIZE | 1) + li PREB, (16 * 7 * SIZE | 3) +#else + li PREA, (16 * 12 * SIZE | 1) + li PREB, (16 * 6 * SIZE | 3) +#endif +#endif +#ifdef CELL + li PREA, (16 * 12 * SIZE) + li PREB, (16 * 12 * SIZE) +#endif +#endif + + lfs f0, FZERO + + srawi. J, N, 2 + ble LL(40) + .align 4 + +LL(10): + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 2 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + srawi. I, M, 2 + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO4, LDC +#endif + ble LL(20) + .align 4 + +LL(11): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbt CO1, PREC + dcbt CO2, PREC + dcbt CO3, PREC + dcbt CO4, PREC + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + dcbt CO1, PREC + dcbt CO2, PREC + dcbt CO3, PREC + dcbt CO4, PREC + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(15) + .align 4 + +LL(12): + FMADD f0, f16, f20, f0 + FMADD f5, f17, f21, f5 + FMADD f10, f18, f22, f10 + FMADD f15, f19, f23, f15 + + LFD f28, 4 * SIZE(BO) + LFD f29, 5 * SIZE(BO) + LFD f30, 6 * SIZE(BO) + LFD f31, 7 * SIZE(BO) + + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + FMADD f4, f16, f21, f4 + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + + FMADD f11, f19, f22, f11 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + FMADD f14, f18, f23, f14 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f24, f28, f0 + FMADD f5, f25, f29, f5 + FMADD f10, f26, f30, f10 + FMADD f15, f27, f31, f15 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMADD f1, f25, f28, f1 + FMADD f2, f26, f28, f2 + FMADD f3, f27, f28, f3 + FMADD f4, f24, f29, f4 + + FMADD f6, f26, f29, f6 + FMADD f7, f27, f29, f7 + FMADD f8, f24, f30, f8 + FMADD f9, f25, f30, f9 + + FMADD f11, f27, f30, f11 + FMADD f12, f24, f31, f12 + FMADD f13, f25, f31, f13 + FMADD f14, f26, f31, f14 + + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f5, f17, f21, f5 + FMADD f10, f18, f22, f10 + FMADD f15, f19, f23, f15 + + LFD f24, 12 * SIZE(AO) + LFD f25, 13 * SIZE(AO) + LFD f26, 14 * SIZE(AO) + LFD f27, 15 * SIZE(AO) + + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + FMADD f4, f16, f21, f4 + + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + + FMADD f11, f19, f22, f11 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + FMADD f14, f18, f23, f14 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f0, f24, f28, f0 + FMADD f5, f25, f29, f5 + FMADD f10, f26, f30, f10 + FMADD f15, f27, f31, f15 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + FMADD f1, f25, f28, f1 + FMADD f2, f26, f28, f2 + FMADD f3, f27, f28, f3 + FMADD f4, f24, f29, f4 + + FMADD f6, f26, f29, f6 + FMADD f7, f27, f29, f7 + FMADD f8, f24, f30, f8 + FMADD f9, f25, f30, f9 + + FMADD f11, f27, f30, f11 + FMADD f12, f24, f31, f12 + FMADD f13, f25, f31, f13 + FMADD f14, f26, f31, f14 + + addi AO, AO, 16 * SIZE + addi BO, BO, 16 * SIZE + +#ifdef PPC970 +#ifndef ALLOC_HUGETLB + DCBT(AO, PREA) +#endif + DCBT(BO, PREB) +#endif + +#ifdef POWER4 +#ifndef ALLOC_HUGETLB + DCBT(AO, PREA) +#endif + DCBT(BO, PREB) +#endif + +#ifdef POWER5 + DCBT(AO, PREA) + DCBT(BO, PREB) +#endif + bdnz LL(12) + .align 4 + +LL(15): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(18) + .align 4 + +LL(16): + FMADD f0, f16, f20, f0 + FMADD f5, f17, f21, f5 + FMADD f10, f18, f22, f10 + FMADD f15, f19, f23, f15 + + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + FMADD f4, f16, f21, f4 + + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + + FMADD f11, f19, f22, f11 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + FMADD f14, f18, f23, f14 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(16) + .align 4 + +LL(18): +#if defined(LN) || defined(RT) + subi r0, KK, 4 + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + LFD f24, 8 * SIZE(BO) + LFD f25, 9 * SIZE(BO) + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 + + FSUB f1, f20, f1 + FSUB f5, f21, f5 + FSUB f9, f22, f9 + FSUB f13, f23, f13 + + FSUB f2, f24, f2 + FSUB f6, f25, f6 + FSUB f10, f26, f10 + FSUB f14, f27, f14 + + FSUB f3, f28, f3 + FSUB f7, f29, f7 + FSUB f11, f30, f11 + FSUB f15, f31, f15 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + LFD f24, 8 * SIZE(AO) + LFD f25, 9 * SIZE(AO) + LFD f26, 10 * SIZE(AO) + LFD f27, 11 * SIZE(AO) + + LFD f28, 12 * SIZE(AO) + LFD f29, 13 * SIZE(AO) + LFD f30, 14 * SIZE(AO) + LFD f31, 15 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f4, f20, f4 + FSUB f5, f21, f5 + FSUB f6, f22, f6 + FSUB f7, f23, f7 + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f10, f26, f10 + FSUB f11, f27, f11 + + FSUB f12, f28, f12 + FSUB f13, f29, f13 + FSUB f14, f30, f14 + FSUB f15, f31, f15 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FMUL f7, f16, f7 + FMUL f11, f16, f11 + FMUL f15, f16, f15 + + FNMSUB f2, f17, f3, f2 + FNMSUB f6, f17, f7, f6 + FNMSUB f10, f17, f11, f10 + FNMSUB f14, f17, f15, f14 + + FNMSUB f1, f18, f3, f1 + FNMSUB f5, f18, f7, f5 + FNMSUB f9, f18, f11, f9 + FNMSUB f13, f18, f15, f13 + + FNMSUB f0, f19, f3, f0 + FNMSUB f4, f19, f7, f4 + FNMSUB f8, f19, f11, f8 + FNMSUB f12, f19, f15, f12 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + FMUL f2, f16, f2 + FMUL f6, f16, f6 + FMUL f10, f16, f10 + FMUL f14, f16, f14 + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FNMSUB f1, f17, f2, f1 + FNMSUB f5, f17, f6, f5 + FNMSUB f9, f17, f10, f9 + FNMSUB f13, f17, f14, f13 + + FNMSUB f0, f18, f2, f0 + FNMSUB f4, f18, f6, f4 + FNMSUB f8, f18, f10, f8 + FNMSUB f12, f18, f14, f12 + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FMUL f9, f19, f9 + FMUL f13, f19, f13 + + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FNMSUB f8, f20, f9, f8 + FNMSUB f12, f20, f13, f12 + + FMUL f0, f21, f0 + FMUL 
f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 + + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + FNMSUB f9, f17, f8, f9 + FNMSUB f13, f17, f12, f13 + + FNMSUB f2, f18, f0, f2 + FNMSUB f6, f18, f4, f6 + FNMSUB f10, f18, f8, f10 + FNMSUB f14, f18, f12, f14 + + FNMSUB f3, f19, f0, f3 + FNMSUB f7, f19, f4, f7 + FNMSUB f11, f19, f8, f11 + FNMSUB f15, f19, f12, f15 + + LFD f16, 5 * SIZE(AO) + LFD f17, 6 * SIZE(AO) + LFD f18, 7 * SIZE(AO) + LFD f19, 10 * SIZE(AO) + + FMUL f1, f16, f1 + FMUL f5, f16, f5 + FMUL f9, f16, f9 + FMUL f13, f16, f13 + + LFD f20, 11 * SIZE(AO) + LFD f21, 15 * SIZE(AO) + + FNMSUB f2, f17, f1, f2 + FNMSUB f6, f17, f5, f6 + FNMSUB f10, f17, f9, f10 + FNMSUB f14, f17, f13, f14 + + FNMSUB f3, f18, f1, f3 + FNMSUB f7, f18, f5, f7 + FNMSUB f11, f18, f9, f11 + FNMSUB f15, f18, f13, f15 + + FMUL f2, f19, f2 + FMUL f6, f19, f6 + FMUL f10, f19, f10 + FMUL f14, f19, f14 + + FNMSUB f3, f20, f2, f3 + FNMSUB f7, f20, f6, f7 + FNMSUB f11, f20, f10, f11 + FNMSUB f15, f20, f14, f15 + + FMUL f3, f21, f3 + FMUL f7, f21, f7 + FMUL f11, f21, f11 + FMUL f15, f21, f15 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 + + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f6, f17, f2, f6 + FNMSUB f7, f17, f3, f7 + + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f10, f18, f2, f10 + FNMSUB f11, f18, f3, f11 + + FNMSUB f12, f19, f0, f12 + FNMSUB f13, f19, f1, f13 + FNMSUB f14, f19, f2, f14 + FNMSUB f15, f19, f3, f15 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + FMUL f4, f16, f4 + FMUL f5, f16, f5 + FMUL f6, f16, f6 + FMUL f7, f16, f7 + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FNMSUB f8, f17, f4, f8 + FNMSUB f9, f17, f5, f9 + FNMSUB f10, f17, f6, f10 + FNMSUB f11, f17, f7, f11 + + FNMSUB f12, f18, f4, f12 + FNMSUB f13, f18, f5, f13 + FNMSUB f14, f18, f6, f14 + FNMSUB f15, f18, f7, f15 + + FMUL f8, f19, f8 + FMUL f9, f19, f9 + FMUL f10, f19, f10 + FMUL f11, f19, f11 + + FNMSUB f12, f20, f8, f12 + FNMSUB f13, f20, f9, f13 + FNMSUB f14, f20, f10, f14 + FNMSUB f15, f20, f11, f15 + + FMUL f12, f21, f12 + FMUL f13, f21, f13 + FMUL f14, f21, f14 + FMUL f15, f21, f15 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FMUL f13, f16, f13 + FMUL f14, f16, f14 + FMUL f15, f16, f15 + + FNMSUB f8, f17, f12, f8 + FNMSUB f9, f17, f13, f9 + FNMSUB f10, f17, f14, f10 + FNMSUB f11, f17, f15, f11 + + FNMSUB f4, f18, f12, f4 + FNMSUB f5, f18, f13, f5 + FNMSUB f6, f18, f14, f6 + FNMSUB f7, f18, f15, f7 + + FNMSUB f0, f19, f12, f0 + FNMSUB f1, f19, f13, f1 + FNMSUB f2, f19, f14, f2 + FNMSUB f3, f19, f15, f3 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + + FMUL f8, f16, f8 + FMUL f9, f16, f9 + FMUL f10, f16, f10 + FMUL f11, f16, f11 + + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FNMSUB f4, f17, f8, f4 + FNMSUB f5, f17, f9, f5 + FNMSUB f6, f17, f10, f6 + FNMSUB f7, f17, f11, f7 + + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + FNMSUB f2, f18, f10, f2 + FNMSUB f3, f18, f11, f3 + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FMUL f6, f19, f6 + 
FMUL f7, f19, f7 + + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + FNMSUB f2, f20, f6, f2 + FNMSUB f3, f20, f7, f3 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE + subi CO3, CO3, 4 * SIZE + subi CO4, CO4, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) + + STFD f1, 4 * SIZE(BO) + STFD f5, 5 * SIZE(BO) + STFD f9, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) + + STFD f2, 8 * SIZE(BO) + STFD f6, 9 * SIZE(BO) + STFD f10, 10 * SIZE(BO) + STFD f14, 11 * SIZE(BO) + + STFD f3, 12 * SIZE(BO) + STFD f7, 13 * SIZE(BO) + STFD f11, 14 * SIZE(BO) + STFD f15, 15 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f4, 4 * SIZE(AO) + STFD f5, 5 * SIZE(AO) + STFD f6, 6 * SIZE(AO) + STFD f7, 7 * SIZE(AO) + + STFD f8, 8 * SIZE(AO) + STFD f9, 9 * SIZE(AO) + STFD f10, 10 * SIZE(AO) + STFD f11, 11 * SIZE(AO) + + STFD f12, 12 * SIZE(AO) + STFD f13, 13 * SIZE(AO) + STFD f14, 14 * SIZE(AO) + STFD f15, 15 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f10, 2 * SIZE(CO3) + STFD f11, 3 * SIZE(CO3) + + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + STFD f14, 2 * SIZE(CO4) + STFD f15, 3 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + addi CO3, CO3, 4 * SIZE + addi CO4, CO4, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ LL(11) + .align 4 + +LL(20): + andi. I, M, 2 + ble LL(30) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(25) + .align 5 + +LL(22): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f2, f18, f24, f2 + FMADD f3, f19, f24, f3 + FMADD f6, f18, f25, f6 + FMADD f7, f19, f25, f7 + + FMADD f10, f18, f26, f10 + FMADD f11, f19, f26, f11 + FMADD f14, f18, f27, f14 + FMADD f15, f19, f27, f15 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f2, f18, f24, f2 + FMADD f3, f19, f24, f3 + FMADD f6, f18, f25, f6 + FMADD f7, f19, f25, f7 + + FMADD f10, f18, f26, f10 + FMADD f11, f19, f26, f11 + FMADD f14, f18, f27, f14 + FMADD f15, f19, f27, f15 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 16 * SIZE + DCBT(BO, PREB) + bdnz LL(22) + + fadd f0, f2, f0 + fadd f1, f3, f1 + fadd f4, f6, f4 + fadd f5, f7, f5 + fadd f8, f10, f8 + fadd f9, f11, f9 + fadd f12, f14, f12 + fadd f13, f15, f13 + .align 4 + +LL(25): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(28) + .align 4 + +LL(26): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(26) + .align 4 + +LL(28): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 + + FSUB f1, f20, f1 + FSUB f5, f21, f5 + FSUB f9, f22, f9 + FSUB f13, f23, f13 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f28, 6 * SIZE(AO) + LFD f29, 7 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f4, f20, f4 + FSUB f5, f21, f5 + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f12, f28, f12 + FSUB f13, f29, f13 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FMUL f9, f19, f9 + FMUL f13, f19, f13 + + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FNMSUB f8, f20, f9, f8 + FNMSUB f12, f20, f13, f12 + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 + + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + FNMSUB f9, f17, f8, f9 + FNMSUB f13, f17, f12, f13 + + LFD f17, 3 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f5, f17, f5 + FMUL f9, f17, f9 + FMUL f13, f17, f13 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f12, f19, f0, f12 + FNMSUB f13, f19, f1, f13 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FMUL f4, f16, f4 + FMUL f5, f16, f5 + FNMSUB f8, f17, f4, f8 + FNMSUB f9, f17, f5, f9 + FNMSUB f12, f18, f4, f12 + FNMSUB f13, f18, f5, f13 + + FMUL f8, f19, f8 + FMUL f9, f19, f9 + FNMSUB f12, f20, f8, f12 + FNMSUB f13, f20, f9, f13 + FMUL f12, f21, f12 + FMUL f13, f21, f13 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FMUL f13, f16, f13 + FNMSUB f8, f17, f12, f8 + FNMSUB f9, f17, f13, f9 + FNMSUB f4, f18, f12, f4 + FNMSUB f5, f18, f13, f5 + FNMSUB f0, f19, f12, f0 + FNMSUB f1, f19, f13, f1 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f8, f16, f8 + FMUL f9, f16, f9 + FNMSUB f4, f17, f8, f4 + FNMSUB f5, f17, f9, f5 + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + + FMUL f4, f19, f4 + 
FMUL f5, f19, f5 + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE + subi CO3, CO3, 2 * SIZE + subi CO4, CO4, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) + + STFD f1, 4 * SIZE(BO) + STFD f5, 5 * SIZE(BO) + STFD f9, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f4, 2 * SIZE(AO) + STFD f5, 3 * SIZE(AO) + + STFD f8, 4 * SIZE(AO) + STFD f9, 5 * SIZE(AO) + STFD f12, 6 * SIZE(AO) + STFD f13, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + addi CO3, CO3, 2 * SIZE + addi CO4, CO4, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +LL(30): + andi. I, M, 1 + ble LL(39) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(35) + .align 5 + +LL(32): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f1, f17, f24, f1 + FMADD f5, f17, f25, f5 + FMADD f9, f17, f26, f9 + FMADD f13, f17, f27, f13 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMADD f0, f18, f20, f0 + FMADD f4, f18, f21, f4 + FMADD f8, f18, f22, f8 + FMADD f12, f18, f23, f12 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f1, f19, f24, f1 + FMADD f5, f19, f25, f5 + FMADD f9, f19, f26, f9 + FMADD f13, f19, f27, f13 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 16 * SIZE + DCBT(BO, PREB) + bdnz LL(32) + + fadd f0, f1, f0 + fadd f4, f5, f4 + fadd f8, f9, f8 + fadd f12, f13, f12 + .align 4 + +LL(35): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(38) + .align 4 + +LL(36): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f16, 1 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(36) + .align 4 + +LL(38): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 +#else + LFD f16, 0 * SIZE(AO) + LFD f20, 1 * SIZE(AO) + LFD f24, 2 * SIZE(AO) + LFD f28, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f4, f20, f4 + FSUB f8, f24, f8 + FSUB f12, f28, f12 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FNMSUB f4, f17, f0, f4 + FNMSUB f8, f18, f0, f8 + FNMSUB f12, f19, f0, f12 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FMUL f4, f16, f4 + FNMSUB f8, f17, f4, f8 + FNMSUB f12, f18, f4, f12 + FMUL f8, f19, f8 + FNMSUB f12, f20, f8, f12 + FMUL f12, f21, f12 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FNMSUB f8, f17, f12, f8 + FNMSUB f4, f18, f12, f4 + FNMSUB f0, f19, f12, f0 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + + FMUL f8, f16, f8 + + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FNMSUB f4, f17, f8, f4 + FNMSUB f0, f18, f8, f0 + + FMUL f4, f19, f4 + FNMSUB f0, f20, f4, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE + 
subi CO3, CO3, 1 * SIZE + subi CO4, CO4, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f4, 1 * SIZE(AO) + STFD f8, 2 * SIZE(AO) + STFD f12, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f8, 0 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f12, f0 + fmr f13, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE + addi CO2, CO2, 1 * SIZE + addi CO3, CO3, 1 * SIZE + addi CO4, CO4, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +LL(39): +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 4 +#endif + +#ifdef RT + subi KK, KK, 4 +#endif + + addic. J, J, -1 + lfs f0, FZERO + bgt LL(10) + .align 4 + +LL(40): + andi. J, N, 2 + ble LL(70) + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. I, M, 2 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO2, LDC +#endif + ble LL(50) + .align 4 + +LL(41): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbt CO1, PREC + dcbt CO2, PREC + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + dcbt CO1, PREC + dcbt CO2, PREC + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(45) + .align 5 + +LL(42): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + FMADD f4, f16, f23, f4 + FMADD f5, f17, f23, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 12 * SIZE(AO) + LFD f17, 13 * SIZE(AO) + LFD f18, 14 * SIZE(AO) + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + FMADD f4, f16, f23, f4 + FMADD f5, f17, f23, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 8 * SIZE + DCBT(BO, PREB) + bdnz LL(42) + .align 4 + +LL(45): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(48) + .align 4 + +LL(46): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(46) + .align 4 + +LL(48): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f28, 6 * SIZE(BO) + LFD f29, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f1, f20, f1 + FSUB f5, f21, f5 + + FSUB f2, f24, f2 + FSUB f6, f25, f6 + FSUB f3, f28, f3 + FSUB f7, f29, f7 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f4, f20, f4 + FSUB f5, f21, f5 + FSUB f6, f22, f6 + FSUB f7, f23, f7 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FMUL f7, f16, f7 + FNMSUB f2, f17, f3, f2 + FNMSUB f6, f17, f7, f6 + FNMSUB f1, f18, f3, f1 + FNMSUB f5, f18, f7, f5 + FNMSUB f0, f19, f3, f0 + FNMSUB f4, f19, f7, f4 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * 
SIZE(AO) + + FMUL f2, f16, f2 + FMUL f6, f16, f6 + FNMSUB f1, f17, f2, f1 + FNMSUB f5, f17, f6, f5 + FNMSUB f0, f18, f2, f0 + FNMSUB f4, f18, f6, f4 + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FMUL f0, f21, f0 + FMUL f4, f21, f4 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + + FNMSUB f2, f18, f0, f2 + FNMSUB f6, f18, f4, f6 + FNMSUB f3, f19, f0, f3 + FNMSUB f7, f19, f4, f7 + + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f5, f17, f5 + + FNMSUB f2, f18, f1, f2 + FNMSUB f6, f18, f5, f6 + + FNMSUB f3, f19, f1, f3 + FNMSUB f7, f19, f5, f7 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMUL f2, f18, f2 + FMUL f6, f18, f6 + + FNMSUB f3, f19, f2, f3 + FNMSUB f7, f19, f6, f7 + + LFD f19, 15 * SIZE(AO) + + FMUL f3, f19, f3 + FMUL f7, f19, f7 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 + + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f6, f17, f2, f6 + FNMSUB f7, f17, f3, f7 + + FMUL f4, f18, f4 + FMUL f5, f18, f5 + FMUL f6, f18, f6 + FMUL f7, f18, f7 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FMUL f6, f19, f6 + FMUL f7, f19, f7 + + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + FNMSUB f2, f20, f6, f2 + FNMSUB f3, f20, f7, f3 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f1, 2 * SIZE(BO) + STFD f5, 3 * SIZE(BO) + + STFD f2, 4 * SIZE(BO) + STFD f6, 5 * SIZE(BO) + STFD f3, 6 * SIZE(BO) + STFD f7, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f4, 4 * SIZE(AO) + STFD f5, 5 * SIZE(AO) + STFD f6, 6 * SIZE(AO) + STFD f7, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ LL(41) + .align 4 + +LL(50): + andi. I, M, 2 + ble LL(60) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. 
r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(55) + .align 5 + +LL(52): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f16, f21, f2 + FMADD f3, f17, f21, f3 + + FMADD f4, f18, f22, f4 + FMADD f5, f19, f22, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f16, f24, f0 + FMADD f1, f17, f24, f1 + FMADD f2, f16, f25, f2 + FMADD f3, f17, f25, f3 + + FMADD f4, f18, f26, f4 + FMADD f5, f19, f26, f5 + FMADD f6, f18, f27, f6 + FMADD f7, f19, f27, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + DCBT(BO, PREB) + bdnz LL(52) + .align 4 + +LL(55): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(58) + .align 4 + +LL(56): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f16, f21, f2 + FMADD f3, f17, f21, f3 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(56) + .align 4 + +LL(58): + FADD f0, f4, f0 + FADD f1, f5, f1 + FADD f2, f6, f2 + FADD f3, f7, f3 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f2, f17, f2 + FSUB f1, f20, f1 + FSUB f3, f21, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f20, f2 + FSUB f3, f21, f3 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FMUL f3, f19, f3 + + FNMSUB f0, f20, f1, f0 + FNMSUB f2, f20, f3, f2 + + FMUL f0, f21, f0 + FMUL f2, f21, f2 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f2, f16, f2 + FNMSUB f1, f17, f0, f1 + FNMSUB f3, f17, f2, f3 + + LFD f17, 3 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f3, f17, f3 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + + FNMSUB f2, f17, f0, f2 + FNMSUB f3, f17, f1, f3 + FMUL f2, f18, f2 + FMUL f3, f18, f3 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f2, f19, f2 + FMUL f3, f19, f3 + FNMSUB f0, f20, f2, f0 + FNMSUB f1, f20, f3, f1 + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + 
subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f2, 1 * SIZE(BO) + STFD f1, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +LL(60): + andi. I, M, 1 + ble LL(69) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(65) + .align 5 + +LL(62): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f17, f22, f2 + FMADD f3, f17, f23, f3 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f18, f24, f0 + FMADD f1, f18, f25, f1 + FMADD f2, f19, f26, f2 + FMADD f3, f19, f27, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(62) + .align 4 + +LL(65): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(68) + .align 4 + +LL(66): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + + LFD f16, 1 * SIZE(AO) + + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(66) + .align 4 + +LL(68): + FADD f0, f2, f0 + FADD f1, f3, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f20, 1 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + FMUL f1, f18, f1 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 0 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE + addi CO2, CO2, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +LL(69): +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + lfs f0, FZERO + .align 4 + +LL(70): + andi. J, N, 1 + ble LL(999) + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + srawi. I, M, 2 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO1, LDC +#endif + ble LL(80) + .align 4 + +LL(71): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbt CO1, PREC + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + dcbt CO1, PREC + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(75) + .align 5 + +LL(72): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f21, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f21, f2 + FMADD f3, f19, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + LFD f16, 12 * SIZE(AO) + LFD f17, 13 * SIZE(AO) + LFD f18, 14 * SIZE(AO) + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f23, f0 + FMADD f1, f17, f23, f1 + FMADD f2, f18, f23, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 4 * SIZE + DCBT(BO, PREB) + bdnz LL(72) + .align 4 + +LL(75): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(78) + .align 4 + +LL(76): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 1 * SIZE(BO) + + addi BO, BO, 1 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(76) + .align 4 + +LL(78): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f20, 1 * SIZE(BO) + LFD f24, 2 * SIZE(BO) + LFD f28, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 + FSUB f2, f24, f2 + FSUB f3, f28, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FNMSUB f2, f17, f3, f2 + FNMSUB f1, f18, f3, f1 + FNMSUB f0, f19, f3, f0 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f2, f16, f2 + FNMSUB f1, f17, f2, f1 + FNMSUB f0, f18, f2, f0 + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f19, f0, f3 + + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMUL f1, f17, f1 + FNMSUB f2, f18, f1, f2 + FNMSUB f3, f19, f1, f3 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMUL f2, f18, f2 + FNMSUB f3, f19, f2, f3 + + LFD f19, 15 * SIZE(AO) + + FMUL f3, f19, f3 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) 
+#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ LL(71) + .align 4 + +LL(80): + andi. I, M, 2 + ble LL(90) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(85) + .align 5 + +LL(82): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f21, f2 + FMADD f3, f19, f21, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f23, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 4 * SIZE + DCBT(BO, PREB) + bdnz LL(82) + .align 4 + +LL(85): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(88) + .align 4 + +LL(86): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 1 * SIZE(BO) + + addi BO, BO, 1 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(86) + .align 4 + +LL(88): + FADD f0, f2, f0 + FADD f1, f3, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f20, 1 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + + LFD f17, 3 * SIZE(AO) + FMUL f1, f17, f1 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +LL(90): + andi. I, M, 1 + ble LL(999) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, KK, 3 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 3 + mtspr CTR, r0 +#endif + ble LL(95) + .align 5 + +LL(92): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(92) + .align 4 + +LL(95): +#if defined(LT) || defined(RN) + andi. r0, KK, 7 +#else + andi. 
r0, TEMP, 7 +#endif + mtspr CTR, r0 + ble+ LL(98) + .align 4 + +LL(96): + FMADD f0, f16, f20, f0 + LFD f16, 1 * SIZE(AO) + LFD f20, 1 * SIZE(BO) + addi BO, BO, 1 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(96) + .align 4 + +LL(98): + FADD f0, f1, f0 + FADD f2, f3, f2 + FADD f0, f2, f0 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + FSUB f0, f16, f0 +#else + LFD f16, 0 * SIZE(AO) + FSUB f0, f16, f0 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + FMUL f0, f16, f0 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + FMUL f0, f16, f0 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +LL(999): + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/trsm_kernel_RT.S b/kernel/power/trsm_kernel_RT.S new file mode 100644 index 0000000000..533f29953e --- /dev/null +++ b/kernel/power/trsm_kernel_RT.S @@ -0,0 +1,3679 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA 296(SP) +#define FZERO 304(SP) +#else +#define STACKSIZE 240 +#define ALPHA 224(SP) +#define FZERO 232(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#define OFFSET r6 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#define AORIG r18 +#define TEMP r19 +#define KK r20 +#define I r21 +#define J r22 +#define AO r23 +#define BO r24 +#define CO1 r25 +#define CO2 r26 +#define CO3 r27 +#define CO4 r28 + +#define PREA r29 +#define PREB r30 +#define PREC r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) +#endif + + stw r0, FZERO + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif + + slwi LDC, LDC, BASE_SHIFT + +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 112 + STACKSIZE(SP) +#endif + +#if defined(_AIX) 
|| defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 60 + STACKSIZE(SP) +#else + lwz OFFSET, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#ifdef LN + mullw r0, M, K + slwi r0, r0, BASE_SHIFT + add A, A, r0 + + slwi r0, M, BASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, BASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + +#ifndef PREFETCHTEST +#if defined(TRSMKERNEL) && defined(LN) +/* Direction is special */ +#ifdef PPC970 + li PREC, -4 * SIZE +#endif +#ifdef POWER4 + li PREC, -4 * SIZE +#endif +#ifdef POWER5 + li PREC, -4 * SIZE +#endif +#else +/* Normal prefetch */ +#ifdef PPC970 + li PREC, 4 * SIZE +#endif +#ifdef POWER4 + li PREC, 4 * SIZE /* is 12 best? */ +#endif +#ifdef POWER5 + li PREC, 3 * SIZE +#endif +#endif + +#else + +#ifdef linux +#ifndef __64BIT__ + mr PREA, r10 + lwz PREB, 8 + STACKSIZE(SP) + lwz PREC, 12 + STACKSIZE(SP) +#else + ld PREA, 112 + STACKSIZE(SP) + ld PREB, 120 + STACKSIZE(SP) + ld PREC, 128 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld PREA, 112 + STACKSIZE(SP) + ld PREB, 120 + STACKSIZE(SP) + ld PREC, 128 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz PREA, 60 + STACKSIZE(SP) + lwz PREB, 64 + STACKSIZE(SP) + lwz PREC, 68 + STACKSIZE(SP) +#else + lwz PREA, 56 + STACKSIZE(SP) + lwz PREB, 60 + STACKSIZE(SP) + lwz PREC, 64 + STACKSIZE(SP) +#endif +#endif +#endif + +#endif + +#ifndef PREFETCHTEST +#ifdef PPC970 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 5 * SIZE | 1) + li PREB, (16 * 5 * SIZE | 3) +#else + li PREA, (16 * 14 * SIZE | 1) + li PREB, (16 * 8 * SIZE | 3) +#endif +#endif +#ifdef POWER4 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 1 * SIZE + 16) + li PREB, (16 * 1 * SIZE + 16) +#else + li PREA, (16 * 2 * SIZE + 16) + li PREB, (16 * 2 * SIZE + 16) +#endif +#endif +#ifdef POWER5 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 7 * SIZE | 1) + li PREB, (16 * 7 * SIZE | 3) +#else + li PREA, (16 * 12 * SIZE | 1) + li PREB, (16 * 6 * SIZE | 3) +#endif +#endif +#endif + lfs f0, FZERO + +LL(70): + andi. J, N, 1 + ble LL(40) + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + srawi. I, M, 2 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO1, LDC +#endif + ble LL(80) + .align 4 + +LL(71): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbt CO1, PREC + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + dcbt CO1, PREC + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(75) + .align 5 + +LL(72): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f21, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f21, f2 + FMADD f3, f19, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + LFD f16, 12 * SIZE(AO) + LFD f17, 13 * SIZE(AO) + LFD f18, 14 * SIZE(AO) + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f23, f0 + FMADD f1, f17, f23, f1 + FMADD f2, f18, f23, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 4 * SIZE + DCBT(BO, PREB) + bdnz LL(72) + .align 4 + +LL(75): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(78) + .align 4 + +LL(76): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 1 * SIZE(BO) + + addi BO, BO, 1 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(76) + .align 4 + +LL(78): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f20, 1 * SIZE(BO) + LFD f24, 2 * SIZE(BO) + LFD f28, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 + FSUB f2, f24, f2 + FSUB f3, f28, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FNMSUB f2, f17, f3, f2 + FNMSUB f1, f18, f3, f1 + FNMSUB f0, f19, f3, f0 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f2, f16, f2 + FNMSUB f1, f17, f2, f1 + FNMSUB f0, f18, f2, f0 + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f19, f0, f3 + + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMUL f1, f17, f1 + FNMSUB f2, f18, f1, f2 + FNMSUB f3, f19, f1, f3 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMUL f2, f18, f2 + FNMSUB f3, f19, f2, f3 + + LFD f19, 15 * SIZE(AO) + + FMUL f3, f19, f3 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) 
+#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ LL(71) + .align 4 + +LL(80): + andi. I, M, 2 + ble LL(90) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(85) + .align 5 + +LL(82): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f21, f2 + FMADD f3, f19, f21, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f23, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 4 * SIZE + DCBT(BO, PREB) + bdnz LL(82) + .align 4 + +LL(85): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(88) + .align 4 + +LL(86): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 1 * SIZE(BO) + + addi BO, BO, 1 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(86) + .align 4 + +LL(88): + FADD f0, f2, f0 + FADD f1, f3, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f20, 1 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + + LFD f17, 3 * SIZE(AO) + FMUL f1, f17, f1 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +LL(90): + andi. I, M, 1 + ble LL(99) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, KK, 3 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 3 + mtspr CTR, r0 +#endif + ble LL(95) + .align 5 + +LL(92): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(92) + .align 4 + +LL(95): +#if defined(LT) || defined(RN) + andi. r0, KK, 7 +#else + andi. 
r0, TEMP, 7 +#endif + mtspr CTR, r0 + ble+ LL(98) + .align 4 + +LL(96): + FMADD f0, f16, f20, f0 + LFD f16, 1 * SIZE(AO) + LFD f20, 1 * SIZE(BO) + addi BO, BO, 1 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(96) + .align 4 + +LL(98): + FADD f0, f1, f0 + FADD f2, f3, f2 + FADD f0, f2, f0 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + FSUB f0, f16, f0 +#else + LFD f16, 0 * SIZE(AO) + FSUB f0, f16, f0 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + FMUL f0, f16, f0 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + FMUL f0, f16, f0 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + + lfs f0, FZERO + +#ifndef LN + addi CO1, CO1, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +LL(99): +#ifdef LN + slwi r0, K, 0 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 1 +#endif + +#ifdef RT + subi KK, KK, 1 +#endif + .align 4 + +LL(40): + andi. J, N, 2 + ble LL(09) + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. I, M, 2 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO2, LDC +#endif + ble LL(50) + .align 4 + +LL(41): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbt CO1, PREC + dcbt CO2, PREC + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + dcbt CO1, PREC + dcbt CO2, PREC + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(45) + .align 5 + +LL(42): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + FMADD f4, f16, f23, f4 + FMADD f5, f17, f23, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 12 * SIZE(AO) + LFD f17, 13 * SIZE(AO) + LFD f18, 14 * SIZE(AO) + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + FMADD f4, f16, f23, f4 + FMADD f5, f17, f23, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 8 * SIZE + DCBT(BO, PREB) + bdnz LL(42) + .align 4 + +LL(45): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(48) + .align 4 + +LL(46): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(46) + .align 4 + +LL(48): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f28, 6 * SIZE(BO) + LFD f29, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f1, f20, f1 + FSUB f5, f21, f5 + + FSUB f2, f24, f2 + FSUB f6, f25, f6 + FSUB f3, f28, f3 + FSUB f7, f29, f7 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f4, f20, f4 + FSUB f5, f21, f5 + FSUB f6, f22, f6 + FSUB f7, f23, f7 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FMUL f7, f16, f7 + FNMSUB f2, f17, f3, f2 + FNMSUB f6, f17, f7, f6 + FNMSUB f1, f18, f3, f1 + FNMSUB f5, f18, f7, f5 + FNMSUB f0, f19, f3, f0 + FNMSUB f4, f19, f7, f4 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * 
SIZE(AO) + + FMUL f2, f16, f2 + FMUL f6, f16, f6 + FNMSUB f1, f17, f2, f1 + FNMSUB f5, f17, f6, f5 + FNMSUB f0, f18, f2, f0 + FNMSUB f4, f18, f6, f4 + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FMUL f0, f21, f0 + FMUL f4, f21, f4 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + + FNMSUB f2, f18, f0, f2 + FNMSUB f6, f18, f4, f6 + FNMSUB f3, f19, f0, f3 + FNMSUB f7, f19, f4, f7 + + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f5, f17, f5 + + FNMSUB f2, f18, f1, f2 + FNMSUB f6, f18, f5, f6 + + FNMSUB f3, f19, f1, f3 + FNMSUB f7, f19, f5, f7 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMUL f2, f18, f2 + FMUL f6, f18, f6 + + FNMSUB f3, f19, f2, f3 + FNMSUB f7, f19, f6, f7 + + LFD f19, 15 * SIZE(AO) + + FMUL f3, f19, f3 + FMUL f7, f19, f7 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 + + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f6, f17, f2, f6 + FNMSUB f7, f17, f3, f7 + + FMUL f4, f18, f4 + FMUL f5, f18, f5 + FMUL f6, f18, f6 + FMUL f7, f18, f7 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FMUL f6, f19, f6 + FMUL f7, f19, f7 + + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + FNMSUB f2, f20, f6, f2 + FNMSUB f3, f20, f7, f3 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f1, 2 * SIZE(BO) + STFD f5, 3 * SIZE(BO) + + STFD f2, 4 * SIZE(BO) + STFD f6, 5 * SIZE(BO) + STFD f3, 6 * SIZE(BO) + STFD f7, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f4, 4 * SIZE(AO) + STFD f5, 5 * SIZE(AO) + STFD f6, 6 * SIZE(AO) + STFD f7, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ LL(41) + .align 4 + +LL(50): + andi. I, M, 2 + ble LL(60) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. 
r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(55) + .align 5 + +LL(52): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f16, f21, f2 + FMADD f3, f17, f21, f3 + + FMADD f4, f18, f22, f4 + FMADD f5, f19, f22, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f16, f24, f0 + FMADD f1, f17, f24, f1 + FMADD f2, f16, f25, f2 + FMADD f3, f17, f25, f3 + + FMADD f4, f18, f26, f4 + FMADD f5, f19, f26, f5 + FMADD f6, f18, f27, f6 + FMADD f7, f19, f27, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + DCBT(BO, PREB) + bdnz LL(52) + .align 4 + +LL(55): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(58) + .align 4 + +LL(56): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f16, f21, f2 + FMADD f3, f17, f21, f3 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(56) + .align 4 + +LL(58): + FADD f0, f4, f0 + FADD f1, f5, f1 + FADD f2, f6, f2 + FADD f3, f7, f3 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f2, f17, f2 + FSUB f1, f20, f1 + FSUB f3, f21, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f20, f2 + FSUB f3, f21, f3 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FMUL f3, f19, f3 + + FNMSUB f0, f20, f1, f0 + FNMSUB f2, f20, f3, f2 + + FMUL f0, f21, f0 + FMUL f2, f21, f2 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f2, f16, f2 + FNMSUB f1, f17, f0, f1 + FNMSUB f3, f17, f2, f3 + + LFD f17, 3 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f3, f17, f3 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + + FNMSUB f2, f17, f0, f2 + FNMSUB f3, f17, f1, f3 + FMUL f2, f18, f2 + FMUL f3, f18, f3 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f2, f19, f2 + FMUL f3, f19, f3 + FNMSUB f0, f20, f2, f0 + FNMSUB f1, f20, f3, f1 + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + 
subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f2, 1 * SIZE(BO) + STFD f1, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +LL(60): + andi. I, M, 1 + ble LL(69) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(65) + .align 5 + +LL(62): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f17, f22, f2 + FMADD f3, f17, f23, f3 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f18, f24, f0 + FMADD f1, f18, f25, f1 + FMADD f2, f19, f26, f2 + FMADD f3, f19, f27, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(62) + .align 4 + +LL(65): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(68) + .align 4 + +LL(66): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + + LFD f16, 1 * SIZE(AO) + + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(66) + .align 4 + +LL(68): + FADD f0, f2, f0 + FADD f1, f3, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f20, 1 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + FMUL f1, f18, f1 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 0 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE + addi CO2, CO2, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +LL(69): +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + lfs f0, FZERO + .align 4 + +LL(09): + srawi. J, N, 2 + ble LL(999) + .align 4 + +LL(10): + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 2 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + srawi. I, M, 2 + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO4, LDC +#endif + ble LL(20) + .align 4 + +LL(11): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbt CO1, PREC + dcbt CO2, PREC + dcbt CO3, PREC + dcbt CO4, PREC + + srawi. 
r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + dcbt CO1, PREC + dcbt CO2, PREC + dcbt CO3, PREC + dcbt CO4, PREC + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(15) + .align 4 + +LL(12): + FMADD f0, f16, f20, f0 + FMADD f5, f17, f21, f5 + FMADD f10, f18, f22, f10 + FMADD f15, f19, f23, f15 + + LFD f28, 4 * SIZE(BO) + LFD f29, 5 * SIZE(BO) + LFD f30, 6 * SIZE(BO) + LFD f31, 7 * SIZE(BO) + + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + FMADD f4, f16, f21, f4 + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + + FMADD f11, f19, f22, f11 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + FMADD f14, f18, f23, f14 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f24, f28, f0 + FMADD f5, f25, f29, f5 + FMADD f10, f26, f30, f10 + FMADD f15, f27, f31, f15 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMADD f1, f25, f28, f1 + FMADD f2, f26, f28, f2 + FMADD f3, f27, f28, f3 + FMADD f4, f24, f29, f4 + + FMADD f6, f26, f29, f6 + FMADD f7, f27, f29, f7 + FMADD f8, f24, f30, f8 + FMADD f9, f25, f30, f9 + + FMADD f11, f27, f30, f11 + FMADD f12, f24, f31, f12 + FMADD f13, f25, f31, f13 + FMADD f14, f26, f31, f14 + + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f5, f17, f21, f5 + FMADD f10, f18, f22, f10 + FMADD f15, f19, f23, f15 + + LFD f24, 12 * SIZE(AO) + LFD f25, 13 * SIZE(AO) + LFD f26, 14 * SIZE(AO) + LFD f27, 15 * SIZE(AO) + + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + FMADD f4, f16, f21, f4 + + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + + FMADD f11, f19, f22, f11 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + FMADD f14, f18, f23, f14 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f0, f24, f28, f0 + FMADD f5, f25, f29, f5 + FMADD f10, f26, f30, f10 + FMADD f15, f27, f31, f15 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + FMADD f1, f25, f28, f1 + FMADD f2, f26, f28, f2 + FMADD f3, f27, f28, f3 + FMADD f4, f24, f29, f4 + + FMADD f6, f26, f29, f6 + FMADD f7, f27, f29, f7 + FMADD f8, f24, f30, f8 + FMADD f9, f25, f30, f9 + + FMADD f11, f27, f30, f11 + FMADD f12, f24, f31, f12 + FMADD f13, f25, f31, f13 + FMADD f14, f26, f31, f14 + + addi AO, AO, 16 * SIZE + addi BO, BO, 16 * SIZE + +#ifdef PPC970 +#ifndef ALLOC_HUGETLB + DCBT(AO, PREA) +#endif + DCBT(BO, PREB) +#endif + +#ifdef POWER4 +#ifndef ALLOC_HUGETLB + DCBT(AO, PREA) +#endif + DCBT(BO, PREB) +#endif + +#ifdef POWER5 + DCBT(AO, PREA) + DCBT(BO, PREB) +#endif + bdnz LL(12) + .align 4 + +LL(15): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(18) + .align 4 + +LL(16): + FMADD f0, f16, f20, f0 + FMADD f5, f17, f21, f5 + FMADD f10, f18, f22, f10 + FMADD f15, f19, f23, f15 + + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + FMADD f4, f16, f21, f4 + + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + + FMADD f11, f19, f22, f11 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + FMADD f14, f18, f23, f14 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(16) + .align 4 + +LL(18): +#if defined(LN) || defined(RT) + subi r0, KK, 4 + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + LFD f24, 8 * SIZE(BO) + LFD f25, 9 * SIZE(BO) + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 + + FSUB f1, f20, f1 + FSUB f5, f21, f5 + FSUB f9, f22, f9 + FSUB f13, f23, f13 + + FSUB f2, f24, f2 + FSUB f6, f25, f6 + FSUB f10, f26, f10 + FSUB f14, f27, f14 + + FSUB f3, f28, f3 + FSUB f7, f29, f7 + FSUB f11, f30, f11 + FSUB f15, f31, f15 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + LFD f24, 8 * SIZE(AO) + LFD f25, 9 * SIZE(AO) + LFD f26, 10 * SIZE(AO) + LFD f27, 11 * SIZE(AO) + + LFD f28, 12 * SIZE(AO) + LFD f29, 13 * SIZE(AO) + LFD f30, 14 * SIZE(AO) + LFD f31, 15 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f4, f20, f4 + FSUB f5, f21, f5 + FSUB f6, f22, f6 + FSUB f7, f23, f7 + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f10, f26, f10 + FSUB f11, f27, f11 + + FSUB f12, f28, f12 + FSUB f13, f29, f13 + FSUB f14, f30, f14 + FSUB f15, f31, f15 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FMUL f7, f16, f7 + FMUL f11, f16, f11 + FMUL f15, f16, f15 + + FNMSUB f2, f17, f3, f2 + FNMSUB f6, f17, f7, f6 + FNMSUB f10, f17, f11, f10 + FNMSUB f14, f17, f15, f14 + + FNMSUB f1, f18, f3, f1 + FNMSUB f5, f18, f7, f5 + FNMSUB f9, f18, f11, f9 + FNMSUB f13, f18, f15, f13 + + FNMSUB f0, f19, f3, f0 + FNMSUB f4, f19, f7, f4 + FNMSUB f8, f19, f11, f8 + FNMSUB f12, f19, f15, f12 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + FMUL f2, f16, f2 + FMUL f6, f16, f6 + FMUL f10, f16, f10 + FMUL f14, f16, f14 + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FNMSUB f1, f17, f2, f1 + FNMSUB f5, f17, f6, f5 + FNMSUB f9, f17, f10, f9 + FNMSUB f13, f17, f14, f13 + + FNMSUB f0, f18, f2, f0 + FNMSUB f4, f18, f6, f4 + FNMSUB f8, f18, f10, f8 + FNMSUB f12, f18, f14, f12 + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FMUL f9, f19, f9 + FMUL f13, f19, f13 + + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FNMSUB f8, f20, f9, f8 + FNMSUB f12, f20, f13, f12 + + FMUL f0, f21, f0 + FMUL 
f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 + + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + FNMSUB f9, f17, f8, f9 + FNMSUB f13, f17, f12, f13 + + FNMSUB f2, f18, f0, f2 + FNMSUB f6, f18, f4, f6 + FNMSUB f10, f18, f8, f10 + FNMSUB f14, f18, f12, f14 + + FNMSUB f3, f19, f0, f3 + FNMSUB f7, f19, f4, f7 + FNMSUB f11, f19, f8, f11 + FNMSUB f15, f19, f12, f15 + + LFD f16, 5 * SIZE(AO) + LFD f17, 6 * SIZE(AO) + LFD f18, 7 * SIZE(AO) + LFD f19, 10 * SIZE(AO) + + FMUL f1, f16, f1 + FMUL f5, f16, f5 + FMUL f9, f16, f9 + FMUL f13, f16, f13 + + LFD f20, 11 * SIZE(AO) + LFD f21, 15 * SIZE(AO) + + FNMSUB f2, f17, f1, f2 + FNMSUB f6, f17, f5, f6 + FNMSUB f10, f17, f9, f10 + FNMSUB f14, f17, f13, f14 + + FNMSUB f3, f18, f1, f3 + FNMSUB f7, f18, f5, f7 + FNMSUB f11, f18, f9, f11 + FNMSUB f15, f18, f13, f15 + + FMUL f2, f19, f2 + FMUL f6, f19, f6 + FMUL f10, f19, f10 + FMUL f14, f19, f14 + + FNMSUB f3, f20, f2, f3 + FNMSUB f7, f20, f6, f7 + FNMSUB f11, f20, f10, f11 + FNMSUB f15, f20, f14, f15 + + FMUL f3, f21, f3 + FMUL f7, f21, f7 + FMUL f11, f21, f11 + FMUL f15, f21, f15 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 + + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f6, f17, f2, f6 + FNMSUB f7, f17, f3, f7 + + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f10, f18, f2, f10 + FNMSUB f11, f18, f3, f11 + + FNMSUB f12, f19, f0, f12 + FNMSUB f13, f19, f1, f13 + FNMSUB f14, f19, f2, f14 + FNMSUB f15, f19, f3, f15 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + FMUL f4, f16, f4 + FMUL f5, f16, f5 + FMUL f6, f16, f6 + FMUL f7, f16, f7 + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FNMSUB f8, f17, f4, f8 + FNMSUB f9, f17, f5, f9 + FNMSUB f10, f17, f6, f10 + FNMSUB f11, f17, f7, f11 + + FNMSUB f12, f18, f4, f12 + FNMSUB f13, f18, f5, f13 + FNMSUB f14, f18, f6, f14 + FNMSUB f15, f18, f7, f15 + + FMUL f8, f19, f8 + FMUL f9, f19, f9 + FMUL f10, f19, f10 + FMUL f11, f19, f11 + + FNMSUB f12, f20, f8, f12 + FNMSUB f13, f20, f9, f13 + FNMSUB f14, f20, f10, f14 + FNMSUB f15, f20, f11, f15 + + FMUL f12, f21, f12 + FMUL f13, f21, f13 + FMUL f14, f21, f14 + FMUL f15, f21, f15 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FMUL f13, f16, f13 + FMUL f14, f16, f14 + FMUL f15, f16, f15 + + FNMSUB f8, f17, f12, f8 + FNMSUB f9, f17, f13, f9 + FNMSUB f10, f17, f14, f10 + FNMSUB f11, f17, f15, f11 + + FNMSUB f4, f18, f12, f4 + FNMSUB f5, f18, f13, f5 + FNMSUB f6, f18, f14, f6 + FNMSUB f7, f18, f15, f7 + + FNMSUB f0, f19, f12, f0 + FNMSUB f1, f19, f13, f1 + FNMSUB f2, f19, f14, f2 + FNMSUB f3, f19, f15, f3 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + + FMUL f8, f16, f8 + FMUL f9, f16, f9 + FMUL f10, f16, f10 + FMUL f11, f16, f11 + + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FNMSUB f4, f17, f8, f4 + FNMSUB f5, f17, f9, f5 + FNMSUB f6, f17, f10, f6 + FNMSUB f7, f17, f11, f7 + + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + FNMSUB f2, f18, f10, f2 + FNMSUB f3, f18, f11, f3 + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FMUL f6, f19, f6 + 
FMUL f7, f19, f7 + + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + FNMSUB f2, f20, f6, f2 + FNMSUB f3, f20, f7, f3 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE + subi CO3, CO3, 4 * SIZE + subi CO4, CO4, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) + + STFD f1, 4 * SIZE(BO) + STFD f5, 5 * SIZE(BO) + STFD f9, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) + + STFD f2, 8 * SIZE(BO) + STFD f6, 9 * SIZE(BO) + STFD f10, 10 * SIZE(BO) + STFD f14, 11 * SIZE(BO) + + STFD f3, 12 * SIZE(BO) + STFD f7, 13 * SIZE(BO) + STFD f11, 14 * SIZE(BO) + STFD f15, 15 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f4, 4 * SIZE(AO) + STFD f5, 5 * SIZE(AO) + STFD f6, 6 * SIZE(AO) + STFD f7, 7 * SIZE(AO) + + STFD f8, 8 * SIZE(AO) + STFD f9, 9 * SIZE(AO) + STFD f10, 10 * SIZE(AO) + STFD f11, 11 * SIZE(AO) + + STFD f12, 12 * SIZE(AO) + STFD f13, 13 * SIZE(AO) + STFD f14, 14 * SIZE(AO) + STFD f15, 15 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f10, 2 * SIZE(CO3) + STFD f11, 3 * SIZE(CO3) + + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + STFD f14, 2 * SIZE(CO4) + STFD f15, 3 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + addi CO3, CO3, 4 * SIZE + addi CO4, CO4, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ LL(11) + .align 4 + +LL(20): + andi. I, M, 2 + ble LL(30) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(25) + .align 5 + +LL(22): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f2, f18, f24, f2 + FMADD f3, f19, f24, f3 + FMADD f6, f18, f25, f6 + FMADD f7, f19, f25, f7 + + FMADD f10, f18, f26, f10 + FMADD f11, f19, f26, f11 + FMADD f14, f18, f27, f14 + FMADD f15, f19, f27, f15 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f2, f18, f24, f2 + FMADD f3, f19, f24, f3 + FMADD f6, f18, f25, f6 + FMADD f7, f19, f25, f7 + + FMADD f10, f18, f26, f10 + FMADD f11, f19, f26, f11 + FMADD f14, f18, f27, f14 + FMADD f15, f19, f27, f15 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 16 * SIZE + DCBT(BO, PREB) + bdnz LL(22) + + fadd f0, f2, f0 + fadd f1, f3, f1 + fadd f4, f6, f4 + fadd f5, f7, f5 + fadd f8, f10, f8 + fadd f9, f11, f9 + fadd f12, f14, f12 + fadd f13, f15, f13 + .align 4 + +LL(25): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(28) + .align 4 + +LL(26): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(26) + .align 4 + +LL(28): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 + + FSUB f1, f20, f1 + FSUB f5, f21, f5 + FSUB f9, f22, f9 + FSUB f13, f23, f13 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f28, 6 * SIZE(AO) + LFD f29, 7 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f4, f20, f4 + FSUB f5, f21, f5 + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f12, f28, f12 + FSUB f13, f29, f13 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FMUL f9, f19, f9 + FMUL f13, f19, f13 + + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FNMSUB f8, f20, f9, f8 + FNMSUB f12, f20, f13, f12 + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 + + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + FNMSUB f9, f17, f8, f9 + FNMSUB f13, f17, f12, f13 + + LFD f17, 3 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f5, f17, f5 + FMUL f9, f17, f9 + FMUL f13, f17, f13 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f12, f19, f0, f12 + FNMSUB f13, f19, f1, f13 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FMUL f4, f16, f4 + FMUL f5, f16, f5 + FNMSUB f8, f17, f4, f8 + FNMSUB f9, f17, f5, f9 + FNMSUB f12, f18, f4, f12 + FNMSUB f13, f18, f5, f13 + + FMUL f8, f19, f8 + FMUL f9, f19, f9 + FNMSUB f12, f20, f8, f12 + FNMSUB f13, f20, f9, f13 + FMUL f12, f21, f12 + FMUL f13, f21, f13 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FMUL f13, f16, f13 + FNMSUB f8, f17, f12, f8 + FNMSUB f9, f17, f13, f9 + FNMSUB f4, f18, f12, f4 + FNMSUB f5, f18, f13, f5 + FNMSUB f0, f19, f12, f0 + FNMSUB f1, f19, f13, f1 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f8, f16, f8 + FMUL f9, f16, f9 + FNMSUB f4, f17, f8, f4 + FNMSUB f5, f17, f9, f5 + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + + FMUL f4, f19, f4 + 
FMUL f5, f19, f5 + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE + subi CO3, CO3, 2 * SIZE + subi CO4, CO4, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) + + STFD f1, 4 * SIZE(BO) + STFD f5, 5 * SIZE(BO) + STFD f9, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f4, 2 * SIZE(AO) + STFD f5, 3 * SIZE(AO) + + STFD f8, 4 * SIZE(AO) + STFD f9, 5 * SIZE(AO) + STFD f12, 6 * SIZE(AO) + STFD f13, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + addi CO3, CO3, 2 * SIZE + addi CO4, CO4, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +LL(30): + andi. I, M, 1 + ble LL(39) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(35) + .align 5 + +LL(32): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f1, f17, f24, f1 + FMADD f5, f17, f25, f5 + FMADD f9, f17, f26, f9 + FMADD f13, f17, f27, f13 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMADD f0, f18, f20, f0 + FMADD f4, f18, f21, f4 + FMADD f8, f18, f22, f8 + FMADD f12, f18, f23, f12 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f1, f19, f24, f1 + FMADD f5, f19, f25, f5 + FMADD f9, f19, f26, f9 + FMADD f13, f19, f27, f13 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 16 * SIZE + DCBT(BO, PREB) + bdnz LL(32) + + fadd f0, f1, f0 + fadd f4, f5, f4 + fadd f8, f9, f8 + fadd f12, f13, f12 + .align 4 + +LL(35): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(38) + .align 4 + +LL(36): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f16, 1 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(36) + .align 4 + +LL(38): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 +#else + LFD f16, 0 * SIZE(AO) + LFD f20, 1 * SIZE(AO) + LFD f24, 2 * SIZE(AO) + LFD f28, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f4, f20, f4 + FSUB f8, f24, f8 + FSUB f12, f28, f12 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FNMSUB f4, f17, f0, f4 + FNMSUB f8, f18, f0, f8 + FNMSUB f12, f19, f0, f12 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FMUL f4, f16, f4 + FNMSUB f8, f17, f4, f8 + FNMSUB f12, f18, f4, f12 + FMUL f8, f19, f8 + FNMSUB f12, f20, f8, f12 + FMUL f12, f21, f12 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FNMSUB f8, f17, f12, f8 + FNMSUB f4, f18, f12, f4 + FNMSUB f0, f19, f12, f0 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + + FMUL f8, f16, f8 + + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FNMSUB f4, f17, f8, f4 + FNMSUB f0, f18, f8, f0 + + FMUL f4, f19, f4 + FNMSUB f0, f20, f4, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE + 
subi CO3, CO3, 1 * SIZE + subi CO4, CO4, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f4, 1 * SIZE(AO) + STFD f8, 2 * SIZE(AO) + STFD f12, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f8, 0 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f12, f0 + fmr f13, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE + addi CO2, CO2, 1 * SIZE + addi CO3, CO3, 1 * SIZE + addi CO4, CO4, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +LL(39): +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 4 +#endif + +#ifdef RT + subi KK, KK, 4 +#endif + + addic. J, J, -1 + lfs f0, FZERO + bgt LL(10) + .align 4 + +LL(999): + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/trsm_kernel_cell_LN.S b/kernel/power/trsm_kernel_cell_LN.S new file mode 100644 index 0000000000..179db31d24 --- /dev/null +++ b/kernel/power/trsm_kernel_cell_LN.S @@ -0,0 +1,3666 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA 296(SP) +#define FZERO 304(SP) +#else +#define STACKSIZE 240 +#define ALPHA 224(SP) +#define FZERO 232(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#define OFFSET r6 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#define AORIG r18 +#define TEMP r19 +#define KK r20 +#define I r21 +#define J r22 +#define AO r23 +#define BO r24 +#define CO1 r25 +#define CO2 r26 +#define CO3 r27 +#define CO4 r28 + +#define PREA r29 +#define PREB r30 +#define PREC r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) +#endif + + stw r0, FZERO + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif + + slwi LDC, LDC, BASE_SHIFT + +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 112 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 60 + STACKSIZE(SP) +#else + lwz OFFSET, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#ifdef LN + mullw r0, M, K + slwi r0, r0, BASE_SHIFT + add A, A, r0 + + slwi r0, M, 
BASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, BASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + +#ifndef PREFETCHTEST + li PREC, -4 * SIZE +#else + +#ifdef linux +#ifndef __64BIT__ + mr PREA, r10 + lwz PREB, 8 + STACKSIZE(SP) + lwz PREC, 12 + STACKSIZE(SP) +#else + ld PREA, 112 + STACKSIZE(SP) + ld PREB, 120 + STACKSIZE(SP) + ld PREC, 128 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld PREA, 112 + STACKSIZE(SP) + ld PREB, 120 + STACKSIZE(SP) + ld PREC, 128 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz PREA, 60 + STACKSIZE(SP) + lwz PREB, 64 + STACKSIZE(SP) + lwz PREC, 68 + STACKSIZE(SP) +#else + lwz PREA, 56 + STACKSIZE(SP) + lwz PREB, 60 + STACKSIZE(SP) + lwz PREC, 64 + STACKSIZE(SP) +#endif +#endif +#endif + +#endif + +#ifndef PREFETCHTEST +#ifdef PPC970 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 5 * SIZE | 1) + li PREB, (16 * 5 * SIZE | 3) +#else + li PREA, (16 * 14 * SIZE | 1) + li PREB, (16 * 8 * SIZE | 3) +#endif +#endif +#ifdef POWER4 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 1 * SIZE + 16) + li PREB, (16 * 1 * SIZE + 16) +#else + li PREA, (16 * 2 * SIZE + 16) + li PREB, (16 * 2 * SIZE + 16) +#endif +#endif +#ifdef POWER5 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 7 * SIZE | 1) + li PREB, (16 * 7 * SIZE | 3) +#else + li PREA, (16 * 12 * SIZE | 1) + li PREB, (16 * 6 * SIZE | 3) +#endif +#endif +#ifdef CELL + li PREA, (16 * 12 * SIZE) + li PREB, (16 * 12 * SIZE) +#endif +#endif + + lfs f0, FZERO + + srawi. J, N, 2 + ble LL(40) + .align 4 + +LL(10): + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 2 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO4, LDC +#endif + +LL(30): + andi. I, M, 1 + ble LL(20) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(35) + .align 5 + +LL(32): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f1, f17, f24, f1 + FMADD f5, f17, f25, f5 + FMADD f9, f17, f26, f9 + FMADD f13, f17, f27, f13 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMADD f0, f18, f20, f0 + FMADD f4, f18, f21, f4 + FMADD f8, f18, f22, f8 + FMADD f12, f18, f23, f12 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f1, f19, f24, f1 + FMADD f5, f19, f25, f5 + FMADD f9, f19, f26, f9 + FMADD f13, f19, f27, f13 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 16 * SIZE + DCBT(BO, PREB) + bdnz LL(32) + + fadd f0, f1, f0 + fadd f4, f5, f4 + fadd f8, f9, f8 + fadd f12, f13, f12 + .align 4 + +LL(35): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(38) + .align 4 + +LL(36): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f16, 1 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(36) + .align 4 + +LL(38): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 +#else + LFD f16, 0 * SIZE(AO) + LFD f20, 1 * SIZE(AO) + LFD f24, 2 * SIZE(AO) + LFD f28, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f4, f20, f4 + FSUB f8, f24, f8 + FSUB f12, f28, f12 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FNMSUB f4, f17, f0, f4 + FNMSUB f8, f18, f0, f8 + FNMSUB f12, f19, f0, f12 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FMUL f4, f16, f4 + FNMSUB f8, f17, f4, f8 + FNMSUB f12, f18, f4, f12 + FMUL f8, f19, f8 + FNMSUB f12, f20, f8, f12 + FMUL f12, f21, f12 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FNMSUB f8, f17, f12, f8 + FNMSUB f4, f18, f12, f4 + FNMSUB f0, f19, f12, f0 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + + FMUL f8, f16, f8 + + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FNMSUB f4, f17, f8, f4 + FNMSUB f0, f18, f8, f0 + + FMUL f4, f19, f4 + FNMSUB f0, f20, f4, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE + 
subi CO3, CO3, 1 * SIZE + subi CO4, CO4, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f4, 1 * SIZE(AO) + STFD f8, 2 * SIZE(AO) + STFD f12, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f8, 0 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f12, f0 + fmr f13, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE + addi CO2, CO2, 1 * SIZE + addi CO3, CO3, 1 * SIZE + addi CO4, CO4, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +LL(20): + andi. I, M, 2 + ble LL(09) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(25) + .align 5 + +LL(22): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f2, f18, f24, f2 + FMADD f3, f19, f24, f3 + FMADD f6, f18, f25, f6 + FMADD f7, f19, f25, f7 + + FMADD f10, f18, f26, f10 + FMADD f11, f19, f26, f11 + FMADD f14, f18, f27, f14 + FMADD f15, f19, f27, f15 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f2, f18, f24, f2 + FMADD f3, f19, f24, f3 + FMADD f6, f18, f25, f6 + FMADD f7, f19, f25, f7 + + FMADD f10, f18, f26, f10 + FMADD f11, f19, f26, f11 + FMADD f14, f18, f27, f14 + FMADD f15, f19, f27, f15 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 16 * SIZE + DCBT(BO, PREB) + bdnz LL(22) + + fadd f0, f2, f0 + fadd f1, f3, f1 + fadd f4, f6, f4 + fadd f5, f7, f5 + fadd f8, f10, f8 + fadd f9, f11, f9 + fadd f12, f14, f12 + fadd f13, f15, f13 + .align 4 + +LL(25): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(28) + .align 4 + +LL(26): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(26) + .align 4 + +LL(28): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 + + FSUB f1, f20, f1 + FSUB f5, f21, f5 + FSUB f9, f22, f9 + FSUB f13, f23, f13 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f28, 6 * SIZE(AO) + LFD f29, 7 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f4, f20, f4 + FSUB f5, f21, f5 + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f12, f28, f12 + FSUB f13, f29, f13 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FMUL f9, f19, f9 + FMUL f13, f19, f13 + + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FNMSUB f8, f20, f9, f8 + FNMSUB f12, f20, f13, f12 + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 + + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + FNMSUB f9, f17, f8, f9 + FNMSUB f13, f17, f12, f13 + + LFD f17, 3 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f5, f17, f5 + FMUL f9, f17, f9 + FMUL f13, f17, f13 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f12, f19, f0, f12 + FNMSUB f13, f19, f1, f13 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FMUL f4, f16, f4 + FMUL f5, f16, f5 + FNMSUB f8, f17, f4, f8 + FNMSUB f9, f17, f5, f9 + FNMSUB f12, f18, f4, f12 + FNMSUB f13, f18, f5, f13 + + FMUL f8, f19, f8 + FMUL f9, f19, f9 + FNMSUB f12, f20, f8, f12 + FNMSUB f13, f20, f9, f13 + FMUL f12, f21, f12 + FMUL f13, f21, f13 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FMUL f13, f16, f13 + FNMSUB f8, f17, f12, f8 + FNMSUB f9, f17, f13, f9 + FNMSUB f4, f18, f12, f4 + FNMSUB f5, f18, f13, f5 + FNMSUB f0, f19, f12, f0 + FNMSUB f1, f19, f13, f1 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f8, f16, f8 + FMUL f9, f16, f9 + FNMSUB f4, f17, f8, f4 + FNMSUB f5, f17, f9, f5 + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + + FMUL f4, f19, f4 + 
FMUL f5, f19, f5 + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE + subi CO3, CO3, 2 * SIZE + subi CO4, CO4, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) + + STFD f1, 4 * SIZE(BO) + STFD f5, 5 * SIZE(BO) + STFD f9, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f4, 2 * SIZE(AO) + STFD f5, 3 * SIZE(AO) + + STFD f8, 4 * SIZE(AO) + STFD f9, 5 * SIZE(AO) + STFD f12, 6 * SIZE(AO) + STFD f13, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + addi CO3, CO3, 2 * SIZE + addi CO4, CO4, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +LL(09): + srawi. I, M, 2 + ble LL(39) + .align 4 + +LL(11): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbt CO1, PREC + dcbt CO2, PREC + dcbt CO3, PREC + dcbt CO4, PREC + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + + LFD f28, 4 * SIZE(BO) + LFD f29, 5 * SIZE(BO) + LFD f30, 6 * SIZE(BO) + + dcbtst CO1, PREC + dcbtst CO2, PREC + dcbtst CO3, PREC + dcbtst CO4, PREC + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(15) + .align 4 + +#define NOP1 mr r18, r18 +#define NOP2 mr r19, r19 + +LL(12): + FMADD f0, f16, f20, f0 + dcbt AO, PREA + FMADD f4, f16, f21, f4 + dcbt BO, PREB + FMADD f8, f16, f22, f8 + LFD f31, 7 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFD f27, 7 * SIZE(AO) + + FMADD f1, f17, f20, f1 + LFD f16, 8 * SIZE(AO) + FMADD f5, f17, f21, f5 + NOP2 + FMADD f9, f17, f22, f9 + NOP1 + FMADD f13, f17, f23, f13 + LFD f17, 9 * SIZE(AO) + + FMADD f2, f18, f20, f2 + NOP1 + FMADD f6, f18, f21, f6 + NOP2 + FMADD f10, f18, f22, f10 + NOP1 + FMADD f14, f18, f23, f14 + LFD f18, 10 * SIZE(AO) + + FMADD f3, f19, f20, f3 + LFD f20, 8 * SIZE(BO) + FMADD f7, f19, f21, f7 + LFD f21, 9 * SIZE(BO) + FMADD f11, f19, f22, f11 + LFD f22, 10 * SIZE(BO) + FMADD f15, f19, f23, f15 + LFD f19, 11 * SIZE(AO) + + FMADD f0, f24, f28, f0 + LFD f23, 11 * SIZE(BO) + FMADD f4, f24, f29, f4 + NOP2 + FMADD f8, f24, f30, f8 + NOP1 + FMADD f12, f24, f31, f12 + LFD f24, 12 * SIZE(AO) + + FMADD f1, f25, f28, f1 + NOP1 + FMADD f5, f25, f29, f5 + NOP2 + FMADD f9, f25, f30, f9 + NOP1 + FMADD f13, f25, f31, f13 + LFD f25, 13 * SIZE(AO) + + FMADD f2, f26, f28, f2 + NOP1 + FMADD f6, f26, f29, f6 + NOP2 + FMADD f10, f26, f30, f10 + NOP1 + FMADD f14, f26, f31, f14 + LFD f26, 14 * SIZE(AO) + + FMADD f3, f27, f28, f3 + LFD f28, 12 * SIZE(BO) + FMADD f7, f27, f29, f7 + LFD f29, 13 * SIZE(BO) + FMADD f11, f27, f30, f11 + LFD f30, 14 * SIZE(BO) + FMADD f15, f27, f31, f15 + LFD f27, 15 * SIZE(AO) + + FMADD f0, f16, f20, f0 + LFD f31, 15 * SIZE(BO) + FMADD f4, f16, f21, f4 + NOP2 + FMADD f8, f16, f22, f8 + NOP1 + FMADD f12, f16, f23, f12 + LFD f16, 16 * SIZE(AO) + + FMADD f1, f17, f20, f1 + NOP1 + FMADD f5, f17, f21, f5 + NOP2 + FMADD f9, f17, f22, f9 + NOP1 + FMADD f13, f17, f23, f13 + LFD f17, 17 * SIZE(AO) + + FMADD f2, f18, f20, f2 + NOP1 + FMADD f6, f18, f21, f6 + NOP2 + FMADD f10, f18, f22, f10 + NOP1 + FMADD f14, f18, f23, f14 + LFD f18, 18 * SIZE(AO) + + FMADD f3, f19, f20, f3 + LFD f20, 16 * SIZE(BO) + FMADD f7, f19, f21, f7 + LFD f21, 17 * SIZE(BO) + FMADD f11, f19, f22, f11 + LFD f22, 18 * SIZE(BO) + FMADD f15, f19, f23, f15 + LFD f19, 19 * SIZE(AO) + + FMADD f0, f24, f28, f0 + LFD f23, 19 * SIZE(BO) + FMADD f4, f24, f29, f4 + NOP2 + FMADD f8, f24, f30, f8 + NOP1 + FMADD f12, f24, f31, f12 + LFD f24, 20 * SIZE(AO) + + FMADD f1, f25, f28, f1 + NOP1 + FMADD f5, f25, f29, f5 + NOP2 + FMADD f9, f25, f30, f9 + NOP1 + FMADD f13, f25, f31, f13 + LFD f25, 21 * SIZE(AO) + + FMADD f2, f26, f28, f2 + NOP1 + FMADD f6, f26, f29, f6 + NOP2 + FMADD f10, f26, f30, f10 + NOP1 + FMADD f14, f26, f31, f14 + LFD f26, 22 * SIZE(AO) + + FMADD f3, f27, f28, f3 + LFD f28, 20 * SIZE(BO) + FMADD f7, f27, f29, f7 + LFD f29, 21 * SIZE(BO) + FMADD f11, f27, f30, f11 + LFD f30, 22 * SIZE(BO) + FMADD f15, f27, f31, f15 + addi AO, AO, 16 * SIZE + + addi BO, BO, 16 * SIZE + bdnz LL(12) + .align 4 + +LL(15): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(18) + .align 4 + +LL(16): + FMADD f0, f16, f20, f0 + FMADD f5, f17, f21, f5 + FMADD f10, f18, f22, f10 + FMADD f15, f19, f23, f15 + + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + FMADD f4, f16, f21, f4 + + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + + FMADD f11, f19, f22, f11 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + FMADD f14, f18, f23, f14 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(16) + .align 4 + +LL(18): +#if defined(LN) || defined(RT) + subi r0, KK, 4 + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + LFD f24, 8 * SIZE(BO) + LFD f25, 9 * SIZE(BO) + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 + + FSUB f1, f20, f1 + FSUB f5, f21, f5 + FSUB f9, f22, f9 + FSUB f13, f23, f13 + + FSUB f2, f24, f2 + FSUB f6, f25, f6 + FSUB f10, f26, f10 + FSUB f14, f27, f14 + + FSUB f3, f28, f3 + FSUB f7, f29, f7 + FSUB f11, f30, f11 + FSUB f15, f31, f15 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + LFD f24, 8 * SIZE(AO) + LFD f25, 9 * SIZE(AO) + LFD f26, 10 * SIZE(AO) + LFD f27, 11 * SIZE(AO) + + LFD f28, 12 * SIZE(AO) + LFD f29, 13 * SIZE(AO) + LFD f30, 14 * SIZE(AO) + LFD f31, 15 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f4, f20, f4 + FSUB f5, f21, f5 + FSUB f6, f22, f6 + FSUB f7, f23, f7 + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f10, f26, f10 + FSUB f11, f27, f11 + + FSUB f12, f28, f12 + FSUB f13, f29, f13 + FSUB f14, f30, f14 + FSUB f15, f31, f15 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FMUL f7, f16, f7 + FMUL f11, f16, f11 + FMUL f15, f16, f15 + + FNMSUB f2, f17, f3, f2 + FNMSUB f6, f17, f7, f6 + FNMSUB f10, f17, f11, f10 + FNMSUB f14, f17, f15, f14 + + FNMSUB f1, f18, f3, f1 + FNMSUB f5, f18, f7, f5 + FNMSUB f9, f18, f11, f9 + FNMSUB f13, f18, f15, f13 + + FNMSUB f0, f19, f3, f0 + FNMSUB f4, f19, f7, f4 + FNMSUB f8, f19, f11, f8 + FNMSUB f12, f19, f15, f12 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + FMUL f2, f16, f2 + FMUL f6, f16, f6 + FMUL f10, f16, f10 + FMUL f14, f16, f14 + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FNMSUB f1, f17, f2, f1 + FNMSUB f5, f17, f6, f5 + FNMSUB f9, f17, f10, f9 + FNMSUB f13, f17, f14, f13 + + FNMSUB f0, f18, f2, f0 + FNMSUB f4, f18, f6, f4 + FNMSUB f8, f18, f10, f8 + FNMSUB f12, f18, f14, f12 + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FMUL f9, f19, f9 + FMUL f13, f19, f13 + + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FNMSUB f8, f20, f9, f8 + FNMSUB f12, f20, f13, f12 + + FMUL f0, f21, f0 + FMUL 
f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 + + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + FNMSUB f9, f17, f8, f9 + FNMSUB f13, f17, f12, f13 + + FNMSUB f2, f18, f0, f2 + FNMSUB f6, f18, f4, f6 + FNMSUB f10, f18, f8, f10 + FNMSUB f14, f18, f12, f14 + + FNMSUB f3, f19, f0, f3 + FNMSUB f7, f19, f4, f7 + FNMSUB f11, f19, f8, f11 + FNMSUB f15, f19, f12, f15 + + LFD f16, 5 * SIZE(AO) + LFD f17, 6 * SIZE(AO) + LFD f18, 7 * SIZE(AO) + LFD f19, 10 * SIZE(AO) + + FMUL f1, f16, f1 + FMUL f5, f16, f5 + FMUL f9, f16, f9 + FMUL f13, f16, f13 + + LFD f20, 11 * SIZE(AO) + LFD f21, 15 * SIZE(AO) + + FNMSUB f2, f17, f1, f2 + FNMSUB f6, f17, f5, f6 + FNMSUB f10, f17, f9, f10 + FNMSUB f14, f17, f13, f14 + + FNMSUB f3, f18, f1, f3 + FNMSUB f7, f18, f5, f7 + FNMSUB f11, f18, f9, f11 + FNMSUB f15, f18, f13, f15 + + FMUL f2, f19, f2 + FMUL f6, f19, f6 + FMUL f10, f19, f10 + FMUL f14, f19, f14 + + FNMSUB f3, f20, f2, f3 + FNMSUB f7, f20, f6, f7 + FNMSUB f11, f20, f10, f11 + FNMSUB f15, f20, f14, f15 + + FMUL f3, f21, f3 + FMUL f7, f21, f7 + FMUL f11, f21, f11 + FMUL f15, f21, f15 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 + + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f6, f17, f2, f6 + FNMSUB f7, f17, f3, f7 + + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f10, f18, f2, f10 + FNMSUB f11, f18, f3, f11 + + FNMSUB f12, f19, f0, f12 + FNMSUB f13, f19, f1, f13 + FNMSUB f14, f19, f2, f14 + FNMSUB f15, f19, f3, f15 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + FMUL f4, f16, f4 + FMUL f5, f16, f5 + FMUL f6, f16, f6 + FMUL f7, f16, f7 + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FNMSUB f8, f17, f4, f8 + FNMSUB f9, f17, f5, f9 + FNMSUB f10, f17, f6, f10 + FNMSUB f11, f17, f7, f11 + + FNMSUB f12, f18, f4, f12 + FNMSUB f13, f18, f5, f13 + FNMSUB f14, f18, f6, f14 + FNMSUB f15, f18, f7, f15 + + FMUL f8, f19, f8 + FMUL f9, f19, f9 + FMUL f10, f19, f10 + FMUL f11, f19, f11 + + FNMSUB f12, f20, f8, f12 + FNMSUB f13, f20, f9, f13 + FNMSUB f14, f20, f10, f14 + FNMSUB f15, f20, f11, f15 + + FMUL f12, f21, f12 + FMUL f13, f21, f13 + FMUL f14, f21, f14 + FMUL f15, f21, f15 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FMUL f13, f16, f13 + FMUL f14, f16, f14 + FMUL f15, f16, f15 + + FNMSUB f8, f17, f12, f8 + FNMSUB f9, f17, f13, f9 + FNMSUB f10, f17, f14, f10 + FNMSUB f11, f17, f15, f11 + + FNMSUB f4, f18, f12, f4 + FNMSUB f5, f18, f13, f5 + FNMSUB f6, f18, f14, f6 + FNMSUB f7, f18, f15, f7 + + FNMSUB f0, f19, f12, f0 + FNMSUB f1, f19, f13, f1 + FNMSUB f2, f19, f14, f2 + FNMSUB f3, f19, f15, f3 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + + FMUL f8, f16, f8 + FMUL f9, f16, f9 + FMUL f10, f16, f10 + FMUL f11, f16, f11 + + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FNMSUB f4, f17, f8, f4 + FNMSUB f5, f17, f9, f5 + FNMSUB f6, f17, f10, f6 + FNMSUB f7, f17, f11, f7 + + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + FNMSUB f2, f18, f10, f2 + FNMSUB f3, f18, f11, f3 + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FMUL f6, f19, f6 + 
FMUL f7, f19, f7 + + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + FNMSUB f2, f20, f6, f2 + FNMSUB f3, f20, f7, f3 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE + subi CO3, CO3, 4 * SIZE + subi CO4, CO4, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) + + STFD f1, 4 * SIZE(BO) + STFD f5, 5 * SIZE(BO) + STFD f9, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) + + STFD f2, 8 * SIZE(BO) + STFD f6, 9 * SIZE(BO) + STFD f10, 10 * SIZE(BO) + STFD f14, 11 * SIZE(BO) + + STFD f3, 12 * SIZE(BO) + STFD f7, 13 * SIZE(BO) + STFD f11, 14 * SIZE(BO) + STFD f15, 15 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f4, 4 * SIZE(AO) + STFD f5, 5 * SIZE(AO) + STFD f6, 6 * SIZE(AO) + STFD f7, 7 * SIZE(AO) + + STFD f8, 8 * SIZE(AO) + STFD f9, 9 * SIZE(AO) + STFD f10, 10 * SIZE(AO) + STFD f11, 11 * SIZE(AO) + + STFD f12, 12 * SIZE(AO) + STFD f13, 13 * SIZE(AO) + STFD f14, 14 * SIZE(AO) + STFD f15, 15 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f10, 2 * SIZE(CO3) + STFD f11, 3 * SIZE(CO3) + + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + STFD f14, 2 * SIZE(CO4) + STFD f15, 3 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + addi CO3, CO3, 4 * SIZE + addi CO4, CO4, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ LL(11) + .align 4 + + +LL(39): +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 4 +#endif + +#ifdef RT + subi KK, KK, 4 +#endif + + addic. J, J, -1 + lfs f0, FZERO + bgt LL(10) + .align 4 + +LL(40): + andi. J, N, 2 + ble LL(70) + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO2, LDC +#endif + +LL(60): + andi. I, M, 1 + ble LL(50) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. 
r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(65) + .align 5 + +LL(62): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f17, f22, f2 + FMADD f3, f17, f23, f3 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f18, f24, f0 + FMADD f1, f18, f25, f1 + FMADD f2, f19, f26, f2 + FMADD f3, f19, f27, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(62) + .align 4 + +LL(65): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(68) + .align 4 + +LL(66): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + + LFD f16, 1 * SIZE(AO) + + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(66) + .align 4 + +LL(68): + FADD f0, f2, f0 + FADD f1, f3, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f20, 1 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + FMUL f1, f18, f1 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 0 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE + addi CO2, CO2, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +LL(50): + andi. 
I, M, 2 + ble LL(41) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(55) + .align 5 + +LL(52): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f16, f21, f2 + FMADD f3, f17, f21, f3 + + FMADD f4, f18, f22, f4 + FMADD f5, f19, f22, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f16, f24, f0 + FMADD f1, f17, f24, f1 + FMADD f2, f16, f25, f2 + FMADD f3, f17, f25, f3 + + FMADD f4, f18, f26, f4 + FMADD f5, f19, f26, f5 + FMADD f6, f18, f27, f6 + FMADD f7, f19, f27, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + DCBT(BO, PREB) + bdnz LL(52) + .align 4 + +LL(55): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(58) + .align 4 + +LL(56): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f16, f21, f2 + FMADD f3, f17, f21, f3 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(56) + .align 4 + +LL(58): + FADD f0, f4, f0 + FADD f1, f5, f1 + FADD f2, f6, f2 + FADD f3, f7, f3 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f2, f17, f2 + FSUB f1, f20, f1 + FSUB f3, f21, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f20, f2 + FSUB f3, f21, f3 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FMUL f3, f19, f3 + + FNMSUB f0, f20, f1, f0 + FNMSUB f2, f20, f3, f2 + + FMUL f0, f21, f0 + FMUL f2, f21, f2 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f2, f16, f2 + FNMSUB f1, f17, f0, f1 + FNMSUB f3, f17, f2, f3 + + LFD f17, 3 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f3, f17, f3 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + + FNMSUB f2, f17, f0, f2 + FNMSUB f3, f17, f1, f3 + FMUL f2, f18, f2 + FMUL f3, f18, f3 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f2, f19, f2 + FMUL f3, f19, f3 + FNMSUB f0, f20, f2, f0 + FNMSUB f1, f20, f3, f1 + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f2, 1 * SIZE(BO) + STFD f1, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +LL(41): + srawi. I, M, 2 + ble LL(69) + .align 4 + +LL(42): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbt CO1, PREC + dcbt CO2, PREC + + srawi. 
r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + dcbt CO1, PREC + dcbt CO2, PREC + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(45) + .align 5 + +LL(43): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + FMADD f4, f16, f23, f4 + FMADD f5, f17, f23, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 12 * SIZE(AO) + LFD f17, 13 * SIZE(AO) + LFD f18, 14 * SIZE(AO) + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + FMADD f4, f16, f23, f4 + FMADD f5, f17, f23, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 8 * SIZE + DCBT(BO, PREB) + bdnz LL(43) + .align 4 + +LL(45): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(48) + .align 4 + +LL(46): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(46) + .align 4 + +LL(48): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f28, 6 * SIZE(BO) + LFD f29, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f1, f20, f1 + FSUB f5, f21, f5 + + FSUB f2, f24, f2 + FSUB f6, f25, f6 + FSUB f3, f28, f3 + FSUB f7, f29, f7 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f4, f20, f4 + FSUB f5, f21, f5 + FSUB f6, f22, f6 + FSUB f7, f23, f7 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FMUL f7, f16, f7 + FNMSUB f2, f17, f3, f2 + FNMSUB f6, f17, f7, f6 + FNMSUB f1, f18, f3, f1 + FNMSUB f5, f18, f7, f5 + FNMSUB f0, f19, f3, f0 + FNMSUB f4, f19, f7, f4 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f2, f16, f2 + FMUL f6, f16, f6 + FNMSUB f1, f17, f2, f1 + FNMSUB f5, f17, f6, f5 + FNMSUB f0, f18, f2, f0 + FNMSUB f4, f18, f6, f4 + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FMUL f0, f21, f0 + FMUL f4, f21, f4 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + + FNMSUB f2, f18, f0, f2 + FNMSUB f6, f18, f4, f6 + FNMSUB f3, f19, f0, f3 + FNMSUB f7, f19, f4, f7 + + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f5, f17, f5 + + FNMSUB f2, f18, f1, f2 + FNMSUB f6, f18, f5, f6 + + FNMSUB f3, f19, f1, f3 + FNMSUB f7, f19, f5, f7 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMUL f2, f18, f2 + FMUL f6, f18, f6 + + FNMSUB f3, f19, f2, f3 + FNMSUB f7, f19, f6, f7 + + LFD f19, 15 * SIZE(AO) + + FMUL f3, f19, f3 + FMUL f7, f19, f7 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 + + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f6, f17, f2, f6 + FNMSUB f7, f17, f3, f7 + + FMUL f4, f18, f4 + FMUL f5, f18, f5 + FMUL f6, f18, f6 + FMUL f7, f18, f7 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FMUL f6, f19, f6 + FMUL f7, f19, f7 + + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + FNMSUB f2, f20, f6, f2 + FNMSUB f3, f20, f7, f3 + + FMUL f0, 
f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f1, 2 * SIZE(BO) + STFD f5, 3 * SIZE(BO) + + STFD f2, 4 * SIZE(BO) + STFD f6, 5 * SIZE(BO) + STFD f3, 6 * SIZE(BO) + STFD f7, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f4, 4 * SIZE(AO) + STFD f5, 5 * SIZE(AO) + STFD f6, 6 * SIZE(AO) + STFD f7, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ LL(42) + .align 4 + +LL(69): +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + lfs f0, FZERO + .align 4 + +LL(70): + andi. J, N, 1 + ble LL(999) + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO1, LDC +#endif + .align 4 + +LL(90): + andi. I, M, 1 + ble LL(80) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, KK, 3 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 3 + mtspr CTR, r0 +#endif + ble LL(95) + .align 5 + +LL(92): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(92) + .align 4 + +LL(95): +#if defined(LT) || defined(RN) + andi. r0, KK, 7 +#else + andi. 
r0, TEMP, 7 +#endif + mtspr CTR, r0 + ble+ LL(98) + .align 4 + +LL(96): + FMADD f0, f16, f20, f0 + LFD f16, 1 * SIZE(AO) + LFD f20, 1 * SIZE(BO) + addi BO, BO, 1 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(96) + .align 4 + +LL(98): + FADD f0, f1, f0 + FADD f2, f3, f2 + FADD f0, f2, f0 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + FSUB f0, f16, f0 +#else + LFD f16, 0 * SIZE(AO) + FSUB f0, f16, f0 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + FMUL f0, f16, f0 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + FMUL f0, f16, f0 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +LL(80): + andi. I, M, 2 + ble LL(71) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(85) + .align 5 + +LL(82): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f21, f2 + FMADD f3, f19, f21, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f23, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 4 * SIZE + DCBT(BO, PREB) + bdnz LL(82) + .align 4 + +LL(85): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(88) + .align 4 + +LL(86): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 1 * SIZE(BO) + + addi BO, BO, 1 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(86) + .align 4 + +LL(88): + FADD f0, f2, f0 + FADD f1, f3, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f20, 1 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + + LFD f17, 3 * SIZE(AO) + FMUL f1, f17, f1 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +LL(71): + srawi. I, M, 2 + ble LL(999) + .align 4 + +LL(72): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbt CO1, PREC + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + dcbt CO1, PREC + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(75) + .align 5 + +LL(73): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f21, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f21, f2 + FMADD f3, f19, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + LFD f16, 12 * SIZE(AO) + LFD f17, 13 * SIZE(AO) + LFD f18, 14 * SIZE(AO) + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f23, f0 + FMADD f1, f17, f23, f1 + FMADD f2, f18, f23, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 4 * SIZE + DCBT(BO, PREB) + bdnz LL(73) + .align 4 + +LL(75): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(78) + .align 4 + +LL(76): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 1 * SIZE(BO) + + addi BO, BO, 1 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(76) + .align 4 + +LL(78): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f20, 1 * SIZE(BO) + LFD f24, 2 * SIZE(BO) + LFD f28, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 + FSUB f2, f24, f2 + FSUB f3, f28, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FNMSUB f2, f17, f3, f2 + FNMSUB f1, f18, f3, f1 + FNMSUB f0, f19, f3, f0 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f2, f16, f2 + FNMSUB f1, f17, f2, f1 + FNMSUB f0, f18, f2, f0 + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f19, f0, f3 + + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMUL f1, f17, f1 + FNMSUB f2, f18, f1, f2 + FNMSUB f3, f19, f1, f3 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMUL f2, f18, f2 + FNMSUB f3, f19, f2, f3 + + LFD f19, 15 * SIZE(AO) + + FMUL f3, f19, f3 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) 
+#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ LL(72) + .align 4 + +LL(999): + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/trsm_kernel_cell_LT.S b/kernel/power/trsm_kernel_cell_LT.S new file mode 100644 index 0000000000..06b3d9ef12 --- /dev/null +++ b/kernel/power/trsm_kernel_cell_LT.S @@ -0,0 +1,3680 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA 296(SP) +#define FZERO 304(SP) +#else +#define STACKSIZE 240 +#define ALPHA 224(SP) +#define FZERO 232(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#define OFFSET r6 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#define AORIG r18 +#define TEMP r19 +#define KK r20 +#define I r21 +#define J r22 +#define AO r23 +#define BO r24 +#define CO1 r25 +#define CO2 r26 +#define CO3 r27 +#define CO4 r28 + +#define PREA r29 +#define PREB r30 +#define PREC r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) +#endif + + stw r0, FZERO + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif + + slwi LDC, LDC, BASE_SHIFT + +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 112 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 60 + STACKSIZE(SP) +#else + lwz OFFSET, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#ifdef LN + mullw r0, M, K + slwi r0, r0, BASE_SHIFT + add A, A, r0 + + slwi r0, M, BASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, BASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + +#ifndef PREFETCHTEST +#if defined(TRSMKERNEL) && defined(LN) +/* Direction is special */ +#ifdef PPC970 + li PREC, -4 * SIZE +#endif +#ifdef POWER4 + li PREC, -4 * SIZE +#endif +#ifdef POWER5 + li PREC, -4 * SIZE +#endif +#ifdef CELL + li PREC, -4 * SIZE +#endif +#else +/* Normal 
prefetch */ +#ifdef PPC970 + li PREC, 4 * SIZE +#endif +#ifdef POWER4 + li PREC, 4 * SIZE /* is 12 best? */ +#endif +#ifdef POWER5 + li PREC, 3 * SIZE +#endif +#endif + +#else + +#ifdef linux +#ifndef __64BIT__ + mr PREA, r10 + lwz PREB, 8 + STACKSIZE(SP) + lwz PREC, 12 + STACKSIZE(SP) +#else + ld PREA, 112 + STACKSIZE(SP) + ld PREB, 120 + STACKSIZE(SP) + ld PREC, 128 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld PREA, 112 + STACKSIZE(SP) + ld PREB, 120 + STACKSIZE(SP) + ld PREC, 128 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz PREA, 60 + STACKSIZE(SP) + lwz PREB, 64 + STACKSIZE(SP) + lwz PREC, 68 + STACKSIZE(SP) +#else + lwz PREA, 56 + STACKSIZE(SP) + lwz PREB, 60 + STACKSIZE(SP) + lwz PREC, 64 + STACKSIZE(SP) +#endif +#endif +#endif + +#endif + +#ifndef PREFETCHTEST +#ifdef PPC970 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 5 * SIZE | 1) + li PREB, (16 * 5 * SIZE | 3) +#else + li PREA, (16 * 14 * SIZE | 1) + li PREB, (16 * 8 * SIZE | 3) +#endif +#endif +#ifdef POWER4 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 1 * SIZE + 16) + li PREB, (16 * 1 * SIZE + 16) +#else + li PREA, (16 * 2 * SIZE + 16) + li PREB, (16 * 2 * SIZE + 16) +#endif +#endif +#ifdef POWER5 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 7 * SIZE | 1) + li PREB, (16 * 7 * SIZE | 3) +#else + li PREA, (16 * 12 * SIZE | 1) + li PREB, (16 * 6 * SIZE | 3) +#endif +#endif +#ifdef CELL + li PREA, (16 * 12 * SIZE) + li PREB, (16 * 12 * SIZE) +#endif +#endif + + lfs f0, FZERO + + srawi. J, N, 2 + ble LL(40) + .align 4 + +LL(10): + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 2 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + srawi. I, M, 2 + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO4, LDC +#endif + ble LL(20) + .align 4 + +LL(11): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + + LFD f28, 4 * SIZE(B) + LFD f29, 5 * SIZE(B) + LFD f30, 6 * SIZE(B) + + dcbtst CO1, PREC + dcbtst CO2, PREC + dcbtst CO3, PREC + dcbtst CO4, PREC + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + dcbt CO1, PREC + dcbt CO2, PREC + dcbt CO3, PREC + dcbt CO4, PREC + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(15) + .align 4 + +#define NOP1 mr r18, r18 +#define NOP2 mr r19, r19 + +LL(12): + FMADD f0, f16, f20, f0 + dcbt AO, PREA + FMADD f4, f16, f21, f4 + dcbt BO, PREB + FMADD f8, f16, f22, f8 + LFD f31, 7 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFD f27, 7 * SIZE(AO) + + FMADD f1, f17, f20, f1 + LFD f16, 8 * SIZE(AO) + FMADD f5, f17, f21, f5 + NOP2 + FMADD f9, f17, f22, f9 + NOP1 + FMADD f13, f17, f23, f13 + LFD f17, 9 * SIZE(AO) + + FMADD f2, f18, f20, f2 + NOP1 + FMADD f6, f18, f21, f6 + NOP2 + FMADD f10, f18, f22, f10 + NOP1 + FMADD f14, f18, f23, f14 + LFD f18, 10 * SIZE(AO) + + FMADD f3, f19, f20, f3 + LFD f20, 8 * SIZE(BO) + FMADD f7, f19, f21, f7 + LFD f21, 9 * SIZE(BO) + FMADD f11, f19, f22, f11 + LFD f22, 10 * SIZE(BO) + FMADD f15, f19, f23, f15 + LFD f19, 11 * SIZE(AO) + + FMADD f0, f24, f28, f0 + LFD f23, 11 * SIZE(BO) + FMADD f4, f24, f29, f4 + NOP2 + FMADD f8, f24, f30, f8 + NOP1 + FMADD f12, f24, f31, f12 + LFD f24, 12 * SIZE(AO) + + FMADD f1, f25, f28, f1 + NOP1 + FMADD f5, f25, f29, f5 + NOP2 + FMADD f9, f25, f30, f9 + NOP1 + FMADD f13, f25, f31, f13 + LFD f25, 13 * SIZE(AO) + + FMADD f2, f26, f28, f2 + NOP1 + FMADD f6, f26, f29, f6 + NOP2 + FMADD f10, f26, f30, f10 + NOP1 + FMADD f14, f26, f31, f14 + LFD f26, 14 * SIZE(AO) + + FMADD f3, f27, f28, f3 + LFD f28, 12 * SIZE(BO) + FMADD f7, f27, f29, f7 + LFD f29, 13 * SIZE(BO) + FMADD f11, f27, f30, f11 + LFD f30, 14 * SIZE(BO) + FMADD f15, f27, f31, f15 + LFD f27, 15 * SIZE(AO) + + FMADD f0, f16, f20, f0 + LFD f31, 15 * SIZE(BO) + FMADD f4, f16, f21, f4 + NOP2 + FMADD f8, f16, f22, f8 + NOP1 + FMADD f12, f16, f23, f12 + LFD f16, 16 * SIZE(AO) + + FMADD f1, f17, f20, f1 + NOP1 + FMADD f5, f17, f21, f5 + NOP2 + FMADD f9, f17, f22, f9 + NOP1 + FMADD f13, f17, f23, f13 + LFD f17, 17 * SIZE(AO) + + FMADD f2, f18, f20, f2 + NOP1 + FMADD f6, f18, f21, f6 + NOP2 + FMADD f10, f18, f22, f10 + NOP1 + FMADD f14, f18, f23, f14 + LFD f18, 18 * SIZE(AO) + + FMADD f3, f19, f20, f3 + LFD f20, 16 * SIZE(BO) + FMADD f7, f19, f21, f7 + LFD f21, 17 * SIZE(BO) + FMADD f11, f19, f22, f11 + LFD f22, 18 * SIZE(BO) + FMADD f15, f19, f23, f15 + LFD f19, 19 * SIZE(AO) + + FMADD f0, f24, f28, f0 + LFD f23, 19 * SIZE(BO) + FMADD f4, f24, f29, f4 + NOP2 + FMADD f8, f24, f30, f8 + NOP1 + FMADD f12, f24, f31, f12 + LFD f24, 20 * SIZE(AO) + + FMADD f1, f25, f28, f1 + NOP1 + FMADD f5, f25, f29, f5 + NOP2 + FMADD f9, f25, f30, f9 + NOP1 + FMADD f13, f25, f31, f13 + LFD f25, 21 * SIZE(AO) + + FMADD f2, f26, f28, f2 + NOP1 + FMADD f6, f26, f29, f6 + NOP2 + FMADD f10, f26, f30, f10 + NOP1 + FMADD f14, f26, f31, f14 + LFD f26, 22 * SIZE(AO) + + FMADD f3, f27, f28, f3 + LFD f28, 20 * SIZE(BO) + FMADD f7, f27, f29, f7 + LFD f29, 21 * SIZE(BO) + FMADD f11, f27, f30, f11 + LFD f30, 22 * SIZE(BO) + FMADD f15, f27, f31, f15 + addi AO, AO, 16 * SIZE + + addi BO, BO, 16 * SIZE + bdnz LL(12) + .align 4 + +LL(15): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(18) + .align 4 + +LL(16): + FMADD f0, f16, f20, f0 + FMADD f5, f17, f21, f5 + FMADD f10, f18, f22, f10 + FMADD f15, f19, f23, f15 + + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + FMADD f4, f16, f21, f4 + + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + + FMADD f11, f19, f22, f11 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + FMADD f14, f18, f23, f14 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(16) + .align 4 + +LL(18): +#if defined(LN) || defined(RT) + subi r0, KK, 4 + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + LFD f24, 8 * SIZE(BO) + LFD f25, 9 * SIZE(BO) + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 + + FSUB f1, f20, f1 + FSUB f5, f21, f5 + FSUB f9, f22, f9 + FSUB f13, f23, f13 + + FSUB f2, f24, f2 + FSUB f6, f25, f6 + FSUB f10, f26, f10 + FSUB f14, f27, f14 + + FSUB f3, f28, f3 + FSUB f7, f29, f7 + FSUB f11, f30, f11 + FSUB f15, f31, f15 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + LFD f24, 8 * SIZE(AO) + LFD f25, 9 * SIZE(AO) + LFD f26, 10 * SIZE(AO) + LFD f27, 11 * SIZE(AO) + + LFD f28, 12 * SIZE(AO) + LFD f29, 13 * SIZE(AO) + LFD f30, 14 * SIZE(AO) + LFD f31, 15 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f4, f20, f4 + FSUB f5, f21, f5 + FSUB f6, f22, f6 + FSUB f7, f23, f7 + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f10, f26, f10 + FSUB f11, f27, f11 + + FSUB f12, f28, f12 + FSUB f13, f29, f13 + FSUB f14, f30, f14 + FSUB f15, f31, f15 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FMUL f7, f16, f7 + FMUL f11, f16, f11 + FMUL f15, f16, f15 + + FNMSUB f2, f17, f3, f2 + FNMSUB f6, f17, f7, f6 + FNMSUB f10, f17, f11, f10 + FNMSUB f14, f17, f15, f14 + + FNMSUB f1, f18, f3, f1 + FNMSUB f5, f18, f7, f5 + FNMSUB f9, f18, f11, f9 + FNMSUB f13, f18, f15, f13 + + FNMSUB f0, f19, f3, f0 + FNMSUB f4, f19, f7, f4 + FNMSUB f8, f19, f11, f8 + FNMSUB f12, f19, f15, f12 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + FMUL f2, f16, f2 + FMUL f6, f16, f6 + FMUL f10, f16, f10 + FMUL f14, f16, f14 + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FNMSUB f1, f17, f2, f1 + FNMSUB f5, f17, f6, f5 + FNMSUB f9, f17, f10, f9 + FNMSUB f13, f17, f14, f13 + + FNMSUB f0, f18, f2, f0 + FNMSUB f4, f18, f6, f4 + FNMSUB f8, f18, f10, f8 + FNMSUB f12, f18, f14, f12 + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FMUL f9, f19, f9 + FMUL f13, f19, f13 + + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FNMSUB f8, f20, f9, f8 + FNMSUB f12, f20, f13, f12 + + FMUL f0, f21, f0 + FMUL 
f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 + + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + FNMSUB f9, f17, f8, f9 + FNMSUB f13, f17, f12, f13 + + FNMSUB f2, f18, f0, f2 + FNMSUB f6, f18, f4, f6 + FNMSUB f10, f18, f8, f10 + FNMSUB f14, f18, f12, f14 + + FNMSUB f3, f19, f0, f3 + FNMSUB f7, f19, f4, f7 + FNMSUB f11, f19, f8, f11 + FNMSUB f15, f19, f12, f15 + + LFD f16, 5 * SIZE(AO) + LFD f17, 6 * SIZE(AO) + LFD f18, 7 * SIZE(AO) + LFD f19, 10 * SIZE(AO) + + FMUL f1, f16, f1 + FMUL f5, f16, f5 + FMUL f9, f16, f9 + FMUL f13, f16, f13 + + LFD f20, 11 * SIZE(AO) + LFD f21, 15 * SIZE(AO) + + FNMSUB f2, f17, f1, f2 + FNMSUB f6, f17, f5, f6 + FNMSUB f10, f17, f9, f10 + FNMSUB f14, f17, f13, f14 + + FNMSUB f3, f18, f1, f3 + FNMSUB f7, f18, f5, f7 + FNMSUB f11, f18, f9, f11 + FNMSUB f15, f18, f13, f15 + + FMUL f2, f19, f2 + FMUL f6, f19, f6 + FMUL f10, f19, f10 + FMUL f14, f19, f14 + + FNMSUB f3, f20, f2, f3 + FNMSUB f7, f20, f6, f7 + FNMSUB f11, f20, f10, f11 + FNMSUB f15, f20, f14, f15 + + FMUL f3, f21, f3 + FMUL f7, f21, f7 + FMUL f11, f21, f11 + FMUL f15, f21, f15 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 + + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f6, f17, f2, f6 + FNMSUB f7, f17, f3, f7 + + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f10, f18, f2, f10 + FNMSUB f11, f18, f3, f11 + + FNMSUB f12, f19, f0, f12 + FNMSUB f13, f19, f1, f13 + FNMSUB f14, f19, f2, f14 + FNMSUB f15, f19, f3, f15 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + FMUL f4, f16, f4 + FMUL f5, f16, f5 + FMUL f6, f16, f6 + FMUL f7, f16, f7 + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FNMSUB f8, f17, f4, f8 + FNMSUB f9, f17, f5, f9 + FNMSUB f10, f17, f6, f10 + FNMSUB f11, f17, f7, f11 + + FNMSUB f12, f18, f4, f12 + FNMSUB f13, f18, f5, f13 + FNMSUB f14, f18, f6, f14 + FNMSUB f15, f18, f7, f15 + + FMUL f8, f19, f8 + FMUL f9, f19, f9 + FMUL f10, f19, f10 + FMUL f11, f19, f11 + + FNMSUB f12, f20, f8, f12 + FNMSUB f13, f20, f9, f13 + FNMSUB f14, f20, f10, f14 + FNMSUB f15, f20, f11, f15 + + FMUL f12, f21, f12 + FMUL f13, f21, f13 + FMUL f14, f21, f14 + FMUL f15, f21, f15 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FMUL f13, f16, f13 + FMUL f14, f16, f14 + FMUL f15, f16, f15 + + FNMSUB f8, f17, f12, f8 + FNMSUB f9, f17, f13, f9 + FNMSUB f10, f17, f14, f10 + FNMSUB f11, f17, f15, f11 + + FNMSUB f4, f18, f12, f4 + FNMSUB f5, f18, f13, f5 + FNMSUB f6, f18, f14, f6 + FNMSUB f7, f18, f15, f7 + + FNMSUB f0, f19, f12, f0 + FNMSUB f1, f19, f13, f1 + FNMSUB f2, f19, f14, f2 + FNMSUB f3, f19, f15, f3 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + + FMUL f8, f16, f8 + FMUL f9, f16, f9 + FMUL f10, f16, f10 + FMUL f11, f16, f11 + + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FNMSUB f4, f17, f8, f4 + FNMSUB f5, f17, f9, f5 + FNMSUB f6, f17, f10, f6 + FNMSUB f7, f17, f11, f7 + + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + FNMSUB f2, f18, f10, f2 + FNMSUB f3, f18, f11, f3 + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FMUL f6, f19, f6 + 
FMUL f7, f19, f7 + + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + FNMSUB f2, f20, f6, f2 + FNMSUB f3, f20, f7, f3 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE + subi CO3, CO3, 4 * SIZE + subi CO4, CO4, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) + + STFD f1, 4 * SIZE(BO) + STFD f5, 5 * SIZE(BO) + STFD f9, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) + + STFD f2, 8 * SIZE(BO) + STFD f6, 9 * SIZE(BO) + STFD f10, 10 * SIZE(BO) + STFD f14, 11 * SIZE(BO) + + STFD f3, 12 * SIZE(BO) + STFD f7, 13 * SIZE(BO) + STFD f11, 14 * SIZE(BO) + STFD f15, 15 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f4, 4 * SIZE(AO) + STFD f5, 5 * SIZE(AO) + STFD f6, 6 * SIZE(AO) + STFD f7, 7 * SIZE(AO) + + STFD f8, 8 * SIZE(AO) + STFD f9, 9 * SIZE(AO) + STFD f10, 10 * SIZE(AO) + STFD f11, 11 * SIZE(AO) + + STFD f12, 12 * SIZE(AO) + STFD f13, 13 * SIZE(AO) + STFD f14, 14 * SIZE(AO) + STFD f15, 15 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f10, 2 * SIZE(CO3) + STFD f11, 3 * SIZE(CO3) + + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + STFD f14, 2 * SIZE(CO4) + STFD f15, 3 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + addi CO3, CO3, 4 * SIZE + addi CO4, CO4, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ LL(11) + .align 4 + +LL(20): + andi. I, M, 2 + ble LL(30) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(25) + .align 5 + +LL(22): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f2, f18, f24, f2 + FMADD f3, f19, f24, f3 + FMADD f6, f18, f25, f6 + FMADD f7, f19, f25, f7 + + FMADD f10, f18, f26, f10 + FMADD f11, f19, f26, f11 + FMADD f14, f18, f27, f14 + FMADD f15, f19, f27, f15 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f2, f18, f24, f2 + FMADD f3, f19, f24, f3 + FMADD f6, f18, f25, f6 + FMADD f7, f19, f25, f7 + + FMADD f10, f18, f26, f10 + FMADD f11, f19, f26, f11 + FMADD f14, f18, f27, f14 + FMADD f15, f19, f27, f15 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 16 * SIZE + DCBT(BO, PREB) + bdnz LL(22) + + fadd f0, f2, f0 + fadd f1, f3, f1 + fadd f4, f6, f4 + fadd f5, f7, f5 + fadd f8, f10, f8 + fadd f9, f11, f9 + fadd f12, f14, f12 + fadd f13, f15, f13 + .align 4 + +LL(25): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(28) + .align 4 + +LL(26): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(26) + .align 4 + +LL(28): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 + + FSUB f1, f20, f1 + FSUB f5, f21, f5 + FSUB f9, f22, f9 + FSUB f13, f23, f13 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f28, 6 * SIZE(AO) + LFD f29, 7 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f4, f20, f4 + FSUB f5, f21, f5 + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f12, f28, f12 + FSUB f13, f29, f13 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FMUL f9, f19, f9 + FMUL f13, f19, f13 + + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FNMSUB f8, f20, f9, f8 + FNMSUB f12, f20, f13, f12 + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 + + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + FNMSUB f9, f17, f8, f9 + FNMSUB f13, f17, f12, f13 + + LFD f17, 3 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f5, f17, f5 + FMUL f9, f17, f9 + FMUL f13, f17, f13 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f12, f19, f0, f12 + FNMSUB f13, f19, f1, f13 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FMUL f4, f16, f4 + FMUL f5, f16, f5 + FNMSUB f8, f17, f4, f8 + FNMSUB f9, f17, f5, f9 + FNMSUB f12, f18, f4, f12 + FNMSUB f13, f18, f5, f13 + + FMUL f8, f19, f8 + FMUL f9, f19, f9 + FNMSUB f12, f20, f8, f12 + FNMSUB f13, f20, f9, f13 + FMUL f12, f21, f12 + FMUL f13, f21, f13 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FMUL f13, f16, f13 + FNMSUB f8, f17, f12, f8 + FNMSUB f9, f17, f13, f9 + FNMSUB f4, f18, f12, f4 + FNMSUB f5, f18, f13, f5 + FNMSUB f0, f19, f12, f0 + FNMSUB f1, f19, f13, f1 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f8, f16, f8 + FMUL f9, f16, f9 + FNMSUB f4, f17, f8, f4 + FNMSUB f5, f17, f9, f5 + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + + FMUL f4, f19, f4 + 
FMUL f5, f19, f5 + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE + subi CO3, CO3, 2 * SIZE + subi CO4, CO4, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) + + STFD f1, 4 * SIZE(BO) + STFD f5, 5 * SIZE(BO) + STFD f9, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f4, 2 * SIZE(AO) + STFD f5, 3 * SIZE(AO) + + STFD f8, 4 * SIZE(AO) + STFD f9, 5 * SIZE(AO) + STFD f12, 6 * SIZE(AO) + STFD f13, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + addi CO3, CO3, 2 * SIZE + addi CO4, CO4, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +LL(30): + andi. I, M, 1 + ble LL(39) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(35) + .align 5 + +LL(32): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f1, f17, f24, f1 + FMADD f5, f17, f25, f5 + FMADD f9, f17, f26, f9 + FMADD f13, f17, f27, f13 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMADD f0, f18, f20, f0 + FMADD f4, f18, f21, f4 + FMADD f8, f18, f22, f8 + FMADD f12, f18, f23, f12 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f1, f19, f24, f1 + FMADD f5, f19, f25, f5 + FMADD f9, f19, f26, f9 + FMADD f13, f19, f27, f13 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 16 * SIZE + DCBT(BO, PREB) + bdnz LL(32) + + fadd f0, f1, f0 + fadd f4, f5, f4 + fadd f8, f9, f8 + fadd f12, f13, f12 + .align 4 + +LL(35): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(38) + .align 4 + +LL(36): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f16, 1 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(36) + .align 4 + +LL(38): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 +#else + LFD f16, 0 * SIZE(AO) + LFD f20, 1 * SIZE(AO) + LFD f24, 2 * SIZE(AO) + LFD f28, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f4, f20, f4 + FSUB f8, f24, f8 + FSUB f12, f28, f12 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FNMSUB f4, f17, f0, f4 + FNMSUB f8, f18, f0, f8 + FNMSUB f12, f19, f0, f12 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FMUL f4, f16, f4 + FNMSUB f8, f17, f4, f8 + FNMSUB f12, f18, f4, f12 + FMUL f8, f19, f8 + FNMSUB f12, f20, f8, f12 + FMUL f12, f21, f12 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FNMSUB f8, f17, f12, f8 + FNMSUB f4, f18, f12, f4 + FNMSUB f0, f19, f12, f0 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + + FMUL f8, f16, f8 + + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FNMSUB f4, f17, f8, f4 + FNMSUB f0, f18, f8, f0 + + FMUL f4, f19, f4 + FNMSUB f0, f20, f4, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE + 
subi CO3, CO3, 1 * SIZE + subi CO4, CO4, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f4, 1 * SIZE(AO) + STFD f8, 2 * SIZE(AO) + STFD f12, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f8, 0 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f12, f0 + fmr f13, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE + addi CO2, CO2, 1 * SIZE + addi CO3, CO3, 1 * SIZE + addi CO4, CO4, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +LL(39): +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 4 +#endif + +#ifdef RT + subi KK, KK, 4 +#endif + + addic. J, J, -1 + lfs f0, FZERO + bgt LL(10) + .align 4 + +LL(40): + andi. J, N, 2 + ble LL(70) + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. I, M, 2 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO2, LDC +#endif + ble LL(50) + .align 4 + +LL(41): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbt CO1, PREC + dcbt CO2, PREC + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + dcbt CO1, PREC + dcbt CO2, PREC + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(45) + .align 5 + +LL(42): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + FMADD f4, f16, f23, f4 + FMADD f5, f17, f23, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 12 * SIZE(AO) + LFD f17, 13 * SIZE(AO) + LFD f18, 14 * SIZE(AO) + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + FMADD f4, f16, f23, f4 + FMADD f5, f17, f23, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 8 * SIZE + DCBT(BO, PREB) + bdnz LL(42) + .align 4 + +LL(45): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(48) + .align 4 + +LL(46): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(46) + .align 4 + +LL(48): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f28, 6 * SIZE(BO) + LFD f29, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f1, f20, f1 + FSUB f5, f21, f5 + + FSUB f2, f24, f2 + FSUB f6, f25, f6 + FSUB f3, f28, f3 + FSUB f7, f29, f7 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f4, f20, f4 + FSUB f5, f21, f5 + FSUB f6, f22, f6 + FSUB f7, f23, f7 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FMUL f7, f16, f7 + FNMSUB f2, f17, f3, f2 + FNMSUB f6, f17, f7, f6 + FNMSUB f1, f18, f3, f1 + FNMSUB f5, f18, f7, f5 + FNMSUB f0, f19, f3, f0 + FNMSUB f4, f19, f7, f4 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * 
SIZE(AO) + + FMUL f2, f16, f2 + FMUL f6, f16, f6 + FNMSUB f1, f17, f2, f1 + FNMSUB f5, f17, f6, f5 + FNMSUB f0, f18, f2, f0 + FNMSUB f4, f18, f6, f4 + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FMUL f0, f21, f0 + FMUL f4, f21, f4 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + + FNMSUB f2, f18, f0, f2 + FNMSUB f6, f18, f4, f6 + FNMSUB f3, f19, f0, f3 + FNMSUB f7, f19, f4, f7 + + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f5, f17, f5 + + FNMSUB f2, f18, f1, f2 + FNMSUB f6, f18, f5, f6 + + FNMSUB f3, f19, f1, f3 + FNMSUB f7, f19, f5, f7 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMUL f2, f18, f2 + FMUL f6, f18, f6 + + FNMSUB f3, f19, f2, f3 + FNMSUB f7, f19, f6, f7 + + LFD f19, 15 * SIZE(AO) + + FMUL f3, f19, f3 + FMUL f7, f19, f7 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 + + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f6, f17, f2, f6 + FNMSUB f7, f17, f3, f7 + + FMUL f4, f18, f4 + FMUL f5, f18, f5 + FMUL f6, f18, f6 + FMUL f7, f18, f7 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FMUL f6, f19, f6 + FMUL f7, f19, f7 + + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + FNMSUB f2, f20, f6, f2 + FNMSUB f3, f20, f7, f3 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f1, 2 * SIZE(BO) + STFD f5, 3 * SIZE(BO) + + STFD f2, 4 * SIZE(BO) + STFD f6, 5 * SIZE(BO) + STFD f3, 6 * SIZE(BO) + STFD f7, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f4, 4 * SIZE(AO) + STFD f5, 5 * SIZE(AO) + STFD f6, 6 * SIZE(AO) + STFD f7, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ LL(41) + .align 4 + +LL(50): + andi. I, M, 2 + ble LL(60) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. 
r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(55) + .align 5 + +LL(52): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f16, f21, f2 + FMADD f3, f17, f21, f3 + + FMADD f4, f18, f22, f4 + FMADD f5, f19, f22, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f16, f24, f0 + FMADD f1, f17, f24, f1 + FMADD f2, f16, f25, f2 + FMADD f3, f17, f25, f3 + + FMADD f4, f18, f26, f4 + FMADD f5, f19, f26, f5 + FMADD f6, f18, f27, f6 + FMADD f7, f19, f27, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + DCBT(BO, PREB) + bdnz LL(52) + .align 4 + +LL(55): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(58) + .align 4 + +LL(56): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f16, f21, f2 + FMADD f3, f17, f21, f3 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(56) + .align 4 + +LL(58): + FADD f0, f4, f0 + FADD f1, f5, f1 + FADD f2, f6, f2 + FADD f3, f7, f3 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f2, f17, f2 + FSUB f1, f20, f1 + FSUB f3, f21, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f20, f2 + FSUB f3, f21, f3 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FMUL f3, f19, f3 + + FNMSUB f0, f20, f1, f0 + FNMSUB f2, f20, f3, f2 + + FMUL f0, f21, f0 + FMUL f2, f21, f2 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f2, f16, f2 + FNMSUB f1, f17, f0, f1 + FNMSUB f3, f17, f2, f3 + + LFD f17, 3 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f3, f17, f3 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + + FNMSUB f2, f17, f0, f2 + FNMSUB f3, f17, f1, f3 + FMUL f2, f18, f2 + FMUL f3, f18, f3 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f2, f19, f2 + FMUL f3, f19, f3 + FNMSUB f0, f20, f2, f0 + FNMSUB f1, f20, f3, f1 + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + 
subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f2, 1 * SIZE(BO) + STFD f1, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +LL(60): + andi. I, M, 1 + ble LL(69) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(65) + .align 5 + +LL(62): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f17, f22, f2 + FMADD f3, f17, f23, f3 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f18, f24, f0 + FMADD f1, f18, f25, f1 + FMADD f2, f19, f26, f2 + FMADD f3, f19, f27, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(62) + .align 4 + +LL(65): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(68) + .align 4 + +LL(66): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + + LFD f16, 1 * SIZE(AO) + + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(66) + .align 4 + +LL(68): + FADD f0, f2, f0 + FADD f1, f3, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f20, 1 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + FMUL f1, f18, f1 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 0 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE + addi CO2, CO2, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +LL(69): +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + lfs f0, FZERO + .align 4 + +LL(70): + andi. J, N, 1 + ble LL(999) + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + srawi. I, M, 2 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO1, LDC +#endif + ble LL(80) + .align 4 + +LL(71): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbt CO1, PREC + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + dcbt CO1, PREC + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(75) + .align 5 + +LL(72): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f21, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f21, f2 + FMADD f3, f19, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + LFD f16, 12 * SIZE(AO) + LFD f17, 13 * SIZE(AO) + LFD f18, 14 * SIZE(AO) + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f23, f0 + FMADD f1, f17, f23, f1 + FMADD f2, f18, f23, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 4 * SIZE + DCBT(BO, PREB) + bdnz LL(72) + .align 4 + +LL(75): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(78) + .align 4 + +LL(76): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 1 * SIZE(BO) + + addi BO, BO, 1 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(76) + .align 4 + +LL(78): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f20, 1 * SIZE(BO) + LFD f24, 2 * SIZE(BO) + LFD f28, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 + FSUB f2, f24, f2 + FSUB f3, f28, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FNMSUB f2, f17, f3, f2 + FNMSUB f1, f18, f3, f1 + FNMSUB f0, f19, f3, f0 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f2, f16, f2 + FNMSUB f1, f17, f2, f1 + FNMSUB f0, f18, f2, f0 + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f19, f0, f3 + + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMUL f1, f17, f1 + FNMSUB f2, f18, f1, f2 + FNMSUB f3, f19, f1, f3 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMUL f2, f18, f2 + FNMSUB f3, f19, f2, f3 + + LFD f19, 15 * SIZE(AO) + + FMUL f3, f19, f3 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) 
+#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ LL(71) + .align 4 + +LL(80): + andi. I, M, 2 + ble LL(90) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(85) + .align 5 + +LL(82): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f21, f2 + FMADD f3, f19, f21, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f23, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 4 * SIZE + DCBT(BO, PREB) + bdnz LL(82) + .align 4 + +LL(85): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(88) + .align 4 + +LL(86): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 1 * SIZE(BO) + + addi BO, BO, 1 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(86) + .align 4 + +LL(88): + FADD f0, f2, f0 + FADD f1, f3, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f20, 1 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + + LFD f17, 3 * SIZE(AO) + FMUL f1, f17, f1 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +LL(90): + andi. I, M, 1 + ble LL(999) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, KK, 3 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 3 + mtspr CTR, r0 +#endif + ble LL(95) + .align 5 + +LL(92): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(92) + .align 4 + +LL(95): +#if defined(LT) || defined(RN) + andi. r0, KK, 7 +#else + andi. 
r0, TEMP, 7 +#endif + mtspr CTR, r0 + ble+ LL(98) + .align 4 + +LL(96): + FMADD f0, f16, f20, f0 + LFD f16, 1 * SIZE(AO) + LFD f20, 1 * SIZE(BO) + addi BO, BO, 1 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(96) + .align 4 + +LL(98): + FADD f0, f1, f0 + FADD f2, f3, f2 + FADD f0, f2, f0 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + FSUB f0, f16, f0 +#else + LFD f16, 0 * SIZE(AO) + FSUB f0, f16, f0 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + FMUL f0, f16, f0 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + FMUL f0, f16, f0 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +LL(999): + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/trsm_kernel_cell_RT.S b/kernel/power/trsm_kernel_cell_RT.S new file mode 100644 index 0000000000..51e7bc48b1 --- /dev/null +++ b/kernel/power/trsm_kernel_cell_RT.S @@ -0,0 +1,3675 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA 296(SP) +#define FZERO 304(SP) +#else +#define STACKSIZE 240 +#define ALPHA 224(SP) +#define FZERO 232(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#define OFFSET r6 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#define AORIG r18 +#define TEMP r19 +#define KK r20 +#define I r21 +#define J r22 +#define AO r23 +#define BO r24 +#define CO1 r25 +#define CO2 r26 +#define CO3 r27 +#define CO4 r28 + +#define PREA r29 +#define PREB r30 +#define PREC r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) +#endif + + stw r0, FZERO + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif + + slwi LDC, LDC, BASE_SHIFT + +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 112 + STACKSIZE(SP) +#endif + +#if defined(_AIX) 
|| defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 60 + STACKSIZE(SP) +#else + lwz OFFSET, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#ifdef LN + mullw r0, M, K + slwi r0, r0, BASE_SHIFT + add A, A, r0 + + slwi r0, M, BASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, BASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + +#ifndef PREFETCHTEST + li PREC, -4 * SIZE +#else + +#ifdef linux +#ifndef __64BIT__ + mr PREA, r10 + lwz PREB, 8 + STACKSIZE(SP) + lwz PREC, 12 + STACKSIZE(SP) +#else + ld PREA, 112 + STACKSIZE(SP) + ld PREB, 120 + STACKSIZE(SP) + ld PREC, 128 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld PREA, 112 + STACKSIZE(SP) + ld PREB, 120 + STACKSIZE(SP) + ld PREC, 128 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz PREA, 60 + STACKSIZE(SP) + lwz PREB, 64 + STACKSIZE(SP) + lwz PREC, 68 + STACKSIZE(SP) +#else + lwz PREA, 56 + STACKSIZE(SP) + lwz PREB, 60 + STACKSIZE(SP) + lwz PREC, 64 + STACKSIZE(SP) +#endif +#endif +#endif + +#endif + +#ifndef PREFETCHTEST +#ifdef PPC970 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 5 * SIZE | 1) + li PREB, (16 * 5 * SIZE | 3) +#else + li PREA, (16 * 14 * SIZE | 1) + li PREB, (16 * 8 * SIZE | 3) +#endif +#endif +#ifdef POWER4 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 1 * SIZE + 16) + li PREB, (16 * 1 * SIZE + 16) +#else + li PREA, (16 * 2 * SIZE + 16) + li PREB, (16 * 2 * SIZE + 16) +#endif +#endif +#ifdef POWER5 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 7 * SIZE | 1) + li PREB, (16 * 7 * SIZE | 3) +#else + li PREA, (16 * 12 * SIZE | 1) + li PREB, (16 * 6 * SIZE | 3) +#endif +#endif +#ifdef CELL + li PREA, (16 * 12 * SIZE) + li PREB, (16 * 12 * SIZE) +#endif +#endif + lfs f0, FZERO + +LL(70): + andi. J, N, 1 + ble LL(40) + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + srawi. I, M, 2 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO1, LDC +#endif + ble LL(80) + .align 4 + +LL(71): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbt CO1, PREC + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + dcbt CO1, PREC + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(75) + .align 5 + +LL(72): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f21, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f21, f2 + FMADD f3, f19, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + LFD f16, 12 * SIZE(AO) + LFD f17, 13 * SIZE(AO) + LFD f18, 14 * SIZE(AO) + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f23, f0 + FMADD f1, f17, f23, f1 + FMADD f2, f18, f23, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 4 * SIZE + DCBT(BO, PREB) + bdnz LL(72) + .align 4 + +LL(75): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(78) + .align 4 + +LL(76): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 1 * SIZE(BO) + + addi BO, BO, 1 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(76) + .align 4 + +LL(78): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f20, 1 * SIZE(BO) + LFD f24, 2 * SIZE(BO) + LFD f28, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 + FSUB f2, f24, f2 + FSUB f3, f28, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FNMSUB f2, f17, f3, f2 + FNMSUB f1, f18, f3, f1 + FNMSUB f0, f19, f3, f0 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f2, f16, f2 + FNMSUB f1, f17, f2, f1 + FNMSUB f0, f18, f2, f0 + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f19, f0, f3 + + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMUL f1, f17, f1 + FNMSUB f2, f18, f1, f2 + FNMSUB f3, f19, f1, f3 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMUL f2, f18, f2 + FNMSUB f3, f19, f2, f3 + + LFD f19, 15 * SIZE(AO) + + FMUL f3, f19, f3 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) 
+#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ LL(71) + .align 4 + +LL(80): + andi. I, M, 2 + ble LL(90) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(85) + .align 5 + +LL(82): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f21, f2 + FMADD f3, f19, f21, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f23, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 4 * SIZE + DCBT(BO, PREB) + bdnz LL(82) + .align 4 + +LL(85): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(88) + .align 4 + +LL(86): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 1 * SIZE(BO) + + addi BO, BO, 1 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(86) + .align 4 + +LL(88): + FADD f0, f2, f0 + FADD f1, f3, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f20, 1 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + + LFD f17, 3 * SIZE(AO) + FMUL f1, f17, f1 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +LL(90): + andi. I, M, 1 + ble LL(99) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, KK, 3 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 3 + mtspr CTR, r0 +#endif + ble LL(95) + .align 5 + +LL(92): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(92) + .align 4 + +LL(95): +#if defined(LT) || defined(RN) + andi. r0, KK, 7 +#else + andi. 
r0, TEMP, 7 +#endif + mtspr CTR, r0 + ble+ LL(98) + .align 4 + +LL(96): + FMADD f0, f16, f20, f0 + LFD f16, 1 * SIZE(AO) + LFD f20, 1 * SIZE(BO) + addi BO, BO, 1 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(96) + .align 4 + +LL(98): + FADD f0, f1, f0 + FADD f2, f3, f2 + FADD f0, f2, f0 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + FSUB f0, f16, f0 +#else + LFD f16, 0 * SIZE(AO) + FSUB f0, f16, f0 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + FMUL f0, f16, f0 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + FMUL f0, f16, f0 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + + lfs f0, FZERO + +#ifndef LN + addi CO1, CO1, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +LL(99): +#ifdef LN + slwi r0, K, 0 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 1 +#endif + +#ifdef RT + subi KK, KK, 1 +#endif + .align 4 + +LL(40): + andi. J, N, 2 + ble LL(09) + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. I, M, 2 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO2, LDC +#endif + ble LL(50) + .align 4 + +LL(41): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbt CO1, PREC + dcbt CO2, PREC + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + dcbt CO1, PREC + dcbt CO2, PREC + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(45) + .align 5 + +LL(42): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + FMADD f4, f16, f23, f4 + FMADD f5, f17, f23, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 12 * SIZE(AO) + LFD f17, 13 * SIZE(AO) + LFD f18, 14 * SIZE(AO) + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + FMADD f4, f16, f23, f4 + FMADD f5, f17, f23, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 8 * SIZE + DCBT(BO, PREB) + bdnz LL(42) + .align 4 + +LL(45): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(48) + .align 4 + +LL(46): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(46) + .align 4 + +LL(48): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f28, 6 * SIZE(BO) + LFD f29, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f1, f20, f1 + FSUB f5, f21, f5 + + FSUB f2, f24, f2 + FSUB f6, f25, f6 + FSUB f3, f28, f3 + FSUB f7, f29, f7 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f4, f20, f4 + FSUB f5, f21, f5 + FSUB f6, f22, f6 + FSUB f7, f23, f7 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FMUL f7, f16, f7 + FNMSUB f2, f17, f3, f2 + FNMSUB f6, f17, f7, f6 + FNMSUB f1, f18, f3, f1 + FNMSUB f5, f18, f7, f5 + FNMSUB f0, f19, f3, f0 + FNMSUB f4, f19, f7, f4 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * 
SIZE(AO) + + FMUL f2, f16, f2 + FMUL f6, f16, f6 + FNMSUB f1, f17, f2, f1 + FNMSUB f5, f17, f6, f5 + FNMSUB f0, f18, f2, f0 + FNMSUB f4, f18, f6, f4 + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FMUL f0, f21, f0 + FMUL f4, f21, f4 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + + FNMSUB f2, f18, f0, f2 + FNMSUB f6, f18, f4, f6 + FNMSUB f3, f19, f0, f3 + FNMSUB f7, f19, f4, f7 + + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f5, f17, f5 + + FNMSUB f2, f18, f1, f2 + FNMSUB f6, f18, f5, f6 + + FNMSUB f3, f19, f1, f3 + FNMSUB f7, f19, f5, f7 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMUL f2, f18, f2 + FMUL f6, f18, f6 + + FNMSUB f3, f19, f2, f3 + FNMSUB f7, f19, f6, f7 + + LFD f19, 15 * SIZE(AO) + + FMUL f3, f19, f3 + FMUL f7, f19, f7 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 + + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f6, f17, f2, f6 + FNMSUB f7, f17, f3, f7 + + FMUL f4, f18, f4 + FMUL f5, f18, f5 + FMUL f6, f18, f6 + FMUL f7, f18, f7 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FMUL f6, f19, f6 + FMUL f7, f19, f7 + + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + FNMSUB f2, f20, f6, f2 + FNMSUB f3, f20, f7, f3 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f1, 2 * SIZE(BO) + STFD f5, 3 * SIZE(BO) + + STFD f2, 4 * SIZE(BO) + STFD f6, 5 * SIZE(BO) + STFD f3, 6 * SIZE(BO) + STFD f7, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f4, 4 * SIZE(AO) + STFD f5, 5 * SIZE(AO) + STFD f6, 6 * SIZE(AO) + STFD f7, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ LL(41) + .align 4 + +LL(50): + andi. I, M, 2 + ble LL(60) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. 
r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(55) + .align 5 + +LL(52): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f16, f21, f2 + FMADD f3, f17, f21, f3 + + FMADD f4, f18, f22, f4 + FMADD f5, f19, f22, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f16, f24, f0 + FMADD f1, f17, f24, f1 + FMADD f2, f16, f25, f2 + FMADD f3, f17, f25, f3 + + FMADD f4, f18, f26, f4 + FMADD f5, f19, f26, f5 + FMADD f6, f18, f27, f6 + FMADD f7, f19, f27, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + DCBT(BO, PREB) + bdnz LL(52) + .align 4 + +LL(55): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(58) + .align 4 + +LL(56): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f16, f21, f2 + FMADD f3, f17, f21, f3 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(56) + .align 4 + +LL(58): + FADD f0, f4, f0 + FADD f1, f5, f1 + FADD f2, f6, f2 + FADD f3, f7, f3 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f2, f17, f2 + FSUB f1, f20, f1 + FSUB f3, f21, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f20, f2 + FSUB f3, f21, f3 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FMUL f3, f19, f3 + + FNMSUB f0, f20, f1, f0 + FNMSUB f2, f20, f3, f2 + + FMUL f0, f21, f0 + FMUL f2, f21, f2 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f2, f16, f2 + FNMSUB f1, f17, f0, f1 + FNMSUB f3, f17, f2, f3 + + LFD f17, 3 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f3, f17, f3 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + + FNMSUB f2, f17, f0, f2 + FNMSUB f3, f17, f1, f3 + FMUL f2, f18, f2 + FMUL f3, f18, f3 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f2, f19, f2 + FMUL f3, f19, f3 + FNMSUB f0, f20, f2, f0 + FNMSUB f1, f20, f3, f1 + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + 
subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f2, 1 * SIZE(BO) + STFD f1, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +LL(60): + andi. I, M, 1 + ble LL(69) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(65) + .align 5 + +LL(62): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f17, f22, f2 + FMADD f3, f17, f23, f3 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f18, f24, f0 + FMADD f1, f18, f25, f1 + FMADD f2, f19, f26, f2 + FMADD f3, f19, f27, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(62) + .align 4 + +LL(65): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(68) + .align 4 + +LL(66): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + + LFD f16, 1 * SIZE(AO) + + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(66) + .align 4 + +LL(68): + FADD f0, f2, f0 + FADD f1, f3, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f20, 1 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + FMUL f1, f18, f1 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 0 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE + addi CO2, CO2, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +LL(69): +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + lfs f0, FZERO + .align 4 + +LL(09): + srawi. J, N, 2 + ble LL(999) + .align 4 + +LL(10): + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 2 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + srawi. I, M, 2 + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO4, LDC +#endif + ble LL(20) + .align 4 + +LL(11): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbt CO1, PREC + dcbt CO2, PREC + dcbt CO3, PREC + dcbt CO4, PREC + + srawi. 
r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + + LFD f28, 4 * SIZE(BO) + LFD f29, 5 * SIZE(BO) + LFD f30, 6 * SIZE(BO) + + dcbt CO1, PREC + dcbt CO2, PREC + dcbt CO3, PREC + dcbt CO4, PREC + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(15) + .align 4 + +#define NOP1 mr r18, r18 +#define NOP2 mr r19, r19 + +LL(12): + FMADD f0, f16, f20, f0 + dcbt AO, PREA + FMADD f4, f16, f21, f4 + dcbt BO, PREB + FMADD f8, f16, f22, f8 + LFD f31, 7 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFD f27, 7 * SIZE(AO) + + FMADD f1, f17, f20, f1 + LFD f16, 8 * SIZE(AO) + FMADD f5, f17, f21, f5 + NOP2 + FMADD f9, f17, f22, f9 + NOP1 + FMADD f13, f17, f23, f13 + LFD f17, 9 * SIZE(AO) + + FMADD f2, f18, f20, f2 + NOP1 + FMADD f6, f18, f21, f6 + NOP2 + FMADD f10, f18, f22, f10 + NOP1 + FMADD f14, f18, f23, f14 + LFD f18, 10 * SIZE(AO) + + FMADD f3, f19, f20, f3 + LFD f20, 8 * SIZE(BO) + FMADD f7, f19, f21, f7 + LFD f21, 9 * SIZE(BO) + FMADD f11, f19, f22, f11 + LFD f22, 10 * SIZE(BO) + FMADD f15, f19, f23, f15 + LFD f19, 11 * SIZE(AO) + + FMADD f0, f24, f28, f0 + LFD f23, 11 * SIZE(BO) + FMADD f4, f24, f29, f4 + NOP2 + FMADD f8, f24, f30, f8 + NOP1 + FMADD f12, f24, f31, f12 + LFD f24, 12 * SIZE(AO) + + FMADD f1, f25, f28, f1 + NOP1 + FMADD f5, f25, f29, f5 + NOP2 + FMADD f9, f25, f30, f9 + NOP1 + FMADD f13, f25, f31, f13 + LFD f25, 13 * SIZE(AO) + + FMADD f2, f26, f28, f2 + NOP1 + FMADD f6, f26, f29, f6 + NOP2 + FMADD f10, f26, f30, f10 + NOP1 + FMADD f14, f26, f31, f14 + LFD f26, 14 * SIZE(AO) + + FMADD f3, f27, f28, f3 + LFD f28, 12 * SIZE(BO) + FMADD f7, f27, f29, f7 + LFD f29, 13 * SIZE(BO) + FMADD f11, f27, f30, f11 + LFD f30, 14 * SIZE(BO) + FMADD f15, f27, f31, f15 + LFD f27, 15 * SIZE(AO) + + FMADD f0, f16, f20, f0 + LFD f31, 15 * SIZE(BO) + FMADD f4, f16, f21, f4 + NOP2 + FMADD f8, f16, f22, f8 + NOP1 + FMADD f12, f16, f23, f12 + LFD f16, 16 * SIZE(AO) + + FMADD f1, f17, f20, f1 + NOP1 + FMADD f5, f17, f21, f5 + NOP2 + FMADD f9, f17, f22, f9 + NOP1 + FMADD f13, f17, f23, f13 + LFD f17, 17 * SIZE(AO) + + FMADD f2, f18, f20, f2 + NOP1 + FMADD f6, f18, f21, f6 + NOP2 + FMADD f10, f18, f22, f10 + NOP1 + FMADD f14, f18, f23, f14 + LFD f18, 18 * SIZE(AO) + + FMADD f3, f19, f20, f3 + LFD f20, 16 * SIZE(BO) + FMADD f7, f19, f21, f7 + LFD f21, 17 * SIZE(BO) + FMADD f11, f19, f22, f11 + LFD f22, 18 * SIZE(BO) + FMADD f15, f19, f23, f15 + LFD f19, 19 * SIZE(AO) + + FMADD f0, f24, f28, f0 + LFD f23, 19 * SIZE(BO) + FMADD f4, f24, f29, f4 + NOP2 + FMADD f8, f24, f30, f8 + NOP1 + FMADD f12, f24, f31, f12 + LFD f24, 20 * SIZE(AO) + + FMADD f1, f25, f28, f1 + NOP1 + FMADD f5, f25, f29, f5 + NOP2 + FMADD f9, f25, f30, f9 + NOP1 + FMADD f13, f25, f31, f13 + LFD f25, 21 * SIZE(AO) + + FMADD f2, f26, f28, f2 + NOP1 + FMADD f6, f26, f29, f6 + NOP2 + FMADD f10, f26, f30, f10 + NOP1 + FMADD f14, f26, f31, f14 + LFD f26, 22 * SIZE(AO) + + FMADD f3, f27, f28, f3 + LFD f28, 20 * SIZE(BO) + FMADD f7, f27, f29, f7 + LFD f29, 21 * SIZE(BO) + FMADD f11, f27, f30, f11 + LFD f30, 22 * SIZE(BO) + FMADD f15, f27, f31, f15 + addi AO, AO, 16 * SIZE + + addi BO, BO, 16 * SIZE + bdnz LL(12) + .align 4 + 
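+/* LL(15)-LL(16): tail of the k-loop.  The unrolled loop above consumes */
+/* four k-iterations per pass; the leftover iterations (the loop count  */
+/* taken modulo 4) are processed here one at a time, each adding a      */
+/* single rank-1 update into the 4x4 accumulator f0-f15 before the      */
+/* triangular solve that starts at LL(18).                              */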
+LL(15): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(18) + .align 4 + +LL(16): + FMADD f0, f16, f20, f0 + FMADD f5, f17, f21, f5 + FMADD f10, f18, f22, f10 + FMADD f15, f19, f23, f15 + + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + FMADD f4, f16, f21, f4 + + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + + FMADD f11, f19, f22, f11 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + FMADD f14, f18, f23, f14 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(16) + .align 4 + +LL(18): +#if defined(LN) || defined(RT) + subi r0, KK, 4 + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + LFD f24, 8 * SIZE(BO) + LFD f25, 9 * SIZE(BO) + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 + + FSUB f1, f20, f1 + FSUB f5, f21, f5 + FSUB f9, f22, f9 + FSUB f13, f23, f13 + + FSUB f2, f24, f2 + FSUB f6, f25, f6 + FSUB f10, f26, f10 + FSUB f14, f27, f14 + + FSUB f3, f28, f3 + FSUB f7, f29, f7 + FSUB f11, f30, f11 + FSUB f15, f31, f15 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + LFD f24, 8 * SIZE(AO) + LFD f25, 9 * SIZE(AO) + LFD f26, 10 * SIZE(AO) + LFD f27, 11 * SIZE(AO) + + LFD f28, 12 * SIZE(AO) + LFD f29, 13 * SIZE(AO) + LFD f30, 14 * SIZE(AO) + LFD f31, 15 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f4, f20, f4 + FSUB f5, f21, f5 + FSUB f6, f22, f6 + FSUB f7, f23, f7 + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f10, f26, f10 + FSUB f11, f27, f11 + + FSUB f12, f28, f12 + FSUB f13, f29, f13 + FSUB f14, f30, f14 + FSUB f15, f31, f15 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FMUL f7, f16, f7 + FMUL f11, f16, f11 + FMUL f15, f16, f15 + + FNMSUB f2, f17, f3, f2 + FNMSUB f6, f17, f7, f6 + FNMSUB f10, f17, f11, f10 + FNMSUB f14, f17, f15, f14 + + FNMSUB f1, f18, f3, f1 + FNMSUB f5, f18, f7, f5 + FNMSUB f9, f18, f11, f9 + FNMSUB f13, f18, f15, f13 + + FNMSUB f0, f19, f3, f0 + FNMSUB f4, f19, f7, f4 + FNMSUB f8, f19, f11, f8 + FNMSUB f12, f19, f15, f12 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + FMUL f2, f16, f2 + FMUL f6, f16, f6 + FMUL f10, f16, f10 + FMUL f14, f16, f14 + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FNMSUB f1, f17, f2, f1 + FNMSUB f5, f17, f6, f5 + FNMSUB f9, f17, f10, f9 + FNMSUB f13, f17, f14, f13 + + FNMSUB f0, f18, f2, f0 + FNMSUB f4, f18, f6, f4 + FNMSUB f8, f18, f10, f8 + FNMSUB f12, f18, f14, f12 + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FMUL f9, f19, f9 + FMUL f13, f19, f13 + + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + 
FNMSUB f8, f20, f9, f8 + FNMSUB f12, f20, f13, f12 + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 + + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + FNMSUB f9, f17, f8, f9 + FNMSUB f13, f17, f12, f13 + + FNMSUB f2, f18, f0, f2 + FNMSUB f6, f18, f4, f6 + FNMSUB f10, f18, f8, f10 + FNMSUB f14, f18, f12, f14 + + FNMSUB f3, f19, f0, f3 + FNMSUB f7, f19, f4, f7 + FNMSUB f11, f19, f8, f11 + FNMSUB f15, f19, f12, f15 + + LFD f16, 5 * SIZE(AO) + LFD f17, 6 * SIZE(AO) + LFD f18, 7 * SIZE(AO) + LFD f19, 10 * SIZE(AO) + + FMUL f1, f16, f1 + FMUL f5, f16, f5 + FMUL f9, f16, f9 + FMUL f13, f16, f13 + + LFD f20, 11 * SIZE(AO) + LFD f21, 15 * SIZE(AO) + + FNMSUB f2, f17, f1, f2 + FNMSUB f6, f17, f5, f6 + FNMSUB f10, f17, f9, f10 + FNMSUB f14, f17, f13, f14 + + FNMSUB f3, f18, f1, f3 + FNMSUB f7, f18, f5, f7 + FNMSUB f11, f18, f9, f11 + FNMSUB f15, f18, f13, f15 + + FMUL f2, f19, f2 + FMUL f6, f19, f6 + FMUL f10, f19, f10 + FMUL f14, f19, f14 + + FNMSUB f3, f20, f2, f3 + FNMSUB f7, f20, f6, f7 + FNMSUB f11, f20, f10, f11 + FNMSUB f15, f20, f14, f15 + + FMUL f3, f21, f3 + FMUL f7, f21, f7 + FMUL f11, f21, f11 + FMUL f15, f21, f15 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 + + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f6, f17, f2, f6 + FNMSUB f7, f17, f3, f7 + + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f10, f18, f2, f10 + FNMSUB f11, f18, f3, f11 + + FNMSUB f12, f19, f0, f12 + FNMSUB f13, f19, f1, f13 + FNMSUB f14, f19, f2, f14 + FNMSUB f15, f19, f3, f15 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + FMUL f4, f16, f4 + FMUL f5, f16, f5 + FMUL f6, f16, f6 + FMUL f7, f16, f7 + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FNMSUB f8, f17, f4, f8 + FNMSUB f9, f17, f5, f9 + FNMSUB f10, f17, f6, f10 + FNMSUB f11, f17, f7, f11 + + FNMSUB f12, f18, f4, f12 + FNMSUB f13, f18, f5, f13 + FNMSUB f14, f18, f6, f14 + FNMSUB f15, f18, f7, f15 + + FMUL f8, f19, f8 + FMUL f9, f19, f9 + FMUL f10, f19, f10 + FMUL f11, f19, f11 + + FNMSUB f12, f20, f8, f12 + FNMSUB f13, f20, f9, f13 + FNMSUB f14, f20, f10, f14 + FNMSUB f15, f20, f11, f15 + + FMUL f12, f21, f12 + FMUL f13, f21, f13 + FMUL f14, f21, f14 + FMUL f15, f21, f15 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FMUL f13, f16, f13 + FMUL f14, f16, f14 + FMUL f15, f16, f15 + + FNMSUB f8, f17, f12, f8 + FNMSUB f9, f17, f13, f9 + FNMSUB f10, f17, f14, f10 + FNMSUB f11, f17, f15, f11 + + FNMSUB f4, f18, f12, f4 + FNMSUB f5, f18, f13, f5 + FNMSUB f6, f18, f14, f6 + FNMSUB f7, f18, f15, f7 + + FNMSUB f0, f19, f12, f0 + FNMSUB f1, f19, f13, f1 + FNMSUB f2, f19, f14, f2 + FNMSUB f3, f19, f15, f3 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + + FMUL f8, f16, f8 + FMUL f9, f16, f9 + FMUL f10, f16, f10 + FMUL f11, f16, f11 + + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FNMSUB f4, f17, f8, f4 + FNMSUB f5, f17, f9, f5 + FNMSUB f6, f17, f10, f6 + FNMSUB f7, f17, f11, f7 + + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + FNMSUB f2, f18, f10, f2 + FNMSUB 
f3, f18, f11, f3 + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FMUL f6, f19, f6 + FMUL f7, f19, f7 + + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + FNMSUB f2, f20, f6, f2 + FNMSUB f3, f20, f7, f3 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE + subi CO3, CO3, 4 * SIZE + subi CO4, CO4, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) + + STFD f1, 4 * SIZE(BO) + STFD f5, 5 * SIZE(BO) + STFD f9, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) + + STFD f2, 8 * SIZE(BO) + STFD f6, 9 * SIZE(BO) + STFD f10, 10 * SIZE(BO) + STFD f14, 11 * SIZE(BO) + + STFD f3, 12 * SIZE(BO) + STFD f7, 13 * SIZE(BO) + STFD f11, 14 * SIZE(BO) + STFD f15, 15 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f4, 4 * SIZE(AO) + STFD f5, 5 * SIZE(AO) + STFD f6, 6 * SIZE(AO) + STFD f7, 7 * SIZE(AO) + + STFD f8, 8 * SIZE(AO) + STFD f9, 9 * SIZE(AO) + STFD f10, 10 * SIZE(AO) + STFD f11, 11 * SIZE(AO) + + STFD f12, 12 * SIZE(AO) + STFD f13, 13 * SIZE(AO) + STFD f14, 14 * SIZE(AO) + STFD f15, 15 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f10, 2 * SIZE(CO3) + STFD f11, 3 * SIZE(CO3) + + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + STFD f14, 2 * SIZE(CO4) + STFD f15, 3 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + addi CO3, CO3, 4 * SIZE + addi CO4, CO4, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ LL(11) + .align 4 + +LL(20): + andi. I, M, 2 + ble LL(30) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(25) + .align 5 + +LL(22): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f2, f18, f24, f2 + FMADD f3, f19, f24, f3 + FMADD f6, f18, f25, f6 + FMADD f7, f19, f25, f7 + + FMADD f10, f18, f26, f10 + FMADD f11, f19, f26, f11 + FMADD f14, f18, f27, f14 + FMADD f15, f19, f27, f15 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f2, f18, f24, f2 + FMADD f3, f19, f24, f3 + FMADD f6, f18, f25, f6 + FMADD f7, f19, f25, f7 + + FMADD f10, f18, f26, f10 + FMADD f11, f19, f26, f11 + FMADD f14, f18, f27, f14 + FMADD f15, f19, f27, f15 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 16 * SIZE + DCBT(BO, PREB) + bdnz LL(22) + + fadd f0, f2, f0 + fadd f1, f3, f1 + fadd f4, f6, f4 + fadd f5, f7, f5 + fadd f8, f10, f8 + fadd f9, f11, f9 + fadd f12, f14, f12 + fadd f13, f15, f13 + .align 4 + +LL(25): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(28) + .align 4 + +LL(26): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(26) + .align 4 + +LL(28): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 + + FSUB f1, f20, f1 + FSUB f5, f21, f5 + FSUB f9, f22, f9 + FSUB f13, f23, f13 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f28, 6 * SIZE(AO) + LFD f29, 7 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f4, f20, f4 + FSUB f5, f21, f5 + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f12, f28, f12 + FSUB f13, f29, f13 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FMUL f9, f19, f9 + FMUL f13, f19, f13 + + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FNMSUB f8, f20, f9, f8 + FNMSUB f12, f20, f13, f12 + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 + + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + FNMSUB f9, f17, f8, f9 + FNMSUB f13, f17, f12, f13 + + LFD f17, 3 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f5, f17, f5 + FMUL f9, f17, f9 + FMUL f13, f17, f13 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f12, f19, f0, f12 + FNMSUB f13, f19, f1, f13 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FMUL f4, f16, f4 + FMUL f5, f16, f5 + FNMSUB f8, f17, f4, f8 + FNMSUB f9, f17, f5, f9 + FNMSUB f12, f18, f4, f12 + FNMSUB f13, f18, f5, f13 + + FMUL f8, f19, f8 + FMUL f9, f19, f9 + FNMSUB f12, f20, f8, f12 + FNMSUB f13, f20, f9, f13 + FMUL f12, f21, f12 + FMUL f13, f21, f13 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FMUL f13, f16, f13 + FNMSUB f8, f17, f12, f8 + FNMSUB f9, f17, f13, f9 + FNMSUB f4, f18, f12, f4 + FNMSUB f5, f18, f13, f5 + FNMSUB f0, f19, f12, f0 + FNMSUB f1, f19, f13, f1 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f8, f16, f8 + FMUL f9, f16, f9 + FNMSUB f4, f17, f8, f4 + FNMSUB f5, f17, f9, f5 + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + + FMUL f4, f19, f4 + 
FMUL f5, f19, f5 + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE + subi CO3, CO3, 2 * SIZE + subi CO4, CO4, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) + + STFD f1, 4 * SIZE(BO) + STFD f5, 5 * SIZE(BO) + STFD f9, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f4, 2 * SIZE(AO) + STFD f5, 3 * SIZE(AO) + + STFD f8, 4 * SIZE(AO) + STFD f9, 5 * SIZE(AO) + STFD f12, 6 * SIZE(AO) + STFD f13, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + addi CO3, CO3, 2 * SIZE + addi CO4, CO4, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +LL(30): + andi. I, M, 1 + ble LL(39) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(35) + .align 5 + +LL(32): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f1, f17, f24, f1 + FMADD f5, f17, f25, f5 + FMADD f9, f17, f26, f9 + FMADD f13, f17, f27, f13 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMADD f0, f18, f20, f0 + FMADD f4, f18, f21, f4 + FMADD f8, f18, f22, f8 + FMADD f12, f18, f23, f12 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f1, f19, f24, f1 + FMADD f5, f19, f25, f5 + FMADD f9, f19, f26, f9 + FMADD f13, f19, f27, f13 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 16 * SIZE + DCBT(BO, PREB) + bdnz LL(32) + + fadd f0, f1, f0 + fadd f4, f5, f4 + fadd f8, f9, f8 + fadd f12, f13, f12 + .align 4 + +LL(35): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(38) + .align 4 + +LL(36): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f16, 1 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(36) + .align 4 + +LL(38): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 +#else + LFD f16, 0 * SIZE(AO) + LFD f20, 1 * SIZE(AO) + LFD f24, 2 * SIZE(AO) + LFD f28, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f4, f20, f4 + FSUB f8, f24, f8 + FSUB f12, f28, f12 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FNMSUB f4, f17, f0, f4 + FNMSUB f8, f18, f0, f8 + FNMSUB f12, f19, f0, f12 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FMUL f4, f16, f4 + FNMSUB f8, f17, f4, f8 + FNMSUB f12, f18, f4, f12 + FMUL f8, f19, f8 + FNMSUB f12, f20, f8, f12 + FMUL f12, f21, f12 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FNMSUB f8, f17, f12, f8 + FNMSUB f4, f18, f12, f4 + FNMSUB f0, f19, f12, f0 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + + FMUL f8, f16, f8 + + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FNMSUB f4, f17, f8, f4 + FNMSUB f0, f18, f8, f0 + + FMUL f4, f19, f4 + FNMSUB f0, f20, f4, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE + 
subi CO3, CO3, 1 * SIZE + subi CO4, CO4, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f4, 1 * SIZE(AO) + STFD f8, 2 * SIZE(AO) + STFD f12, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f8, 0 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f12, f0 + fmr f13, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE + addi CO2, CO2, 1 * SIZE + addi CO3, CO3, 1 * SIZE + addi CO4, CO4, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +LL(39): +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 4 +#endif + +#ifdef RT + subi KK, KK, 4 +#endif + + addic. J, J, -1 + lfs f0, FZERO + bgt LL(10) + .align 4 + +LL(999): + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/trsm_kernel_hummer_LN.S b/kernel/power/trsm_kernel_hummer_LN.S new file mode 100644 index 0000000000..32f4d0d735 --- /dev/null +++ b/kernel/power/trsm_kernel_hummer_LN.S @@ -0,0 +1,5695 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define ALPHA 0 +#define FZERO 8 + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#endif + +#define TEMP r11 +#define AORIG r12 +#define KK r14 +#define INCM1 r15 +#define INCM4 r16 +#define INCM2 r17 +#define INC2 r19 +#define INC r20 +#define INC4 r21 + +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define AO2 r26 +#define BO2 r27 + +#define CO1 r28 +#define CO2 r29 +#define CO3 r30 +#define CO4 r31 + +#ifndef NEEDPARAM + +#define A1 f16 +#define A2 f17 +#define A3 f18 +#define A4 f19 +#define A5 f20 +#define A6 f21 +#define A7 f22 +#define A8 f23 +#define A9 f24 +#define A10 f25 + +#define B1 f26 +#define B2 f27 +#define B3 f28 +#define B4 f29 +#define B5 f30 +#define B6 f31 + +#define AP B6 + + + PROLOGUE + PROFCODE + + li r0, -16 + + stfpdux f14, SP, r0 + stfpdux f15, SP, r0 + stfpdux f16, SP, r0 + stfpdux f17, SP, r0 + stfpdux f18, SP, r0 + stfpdux f19, SP, r0 + stfpdux f20, SP, r0 + stfpdux f21, SP, r0 + stfpdux f22, SP, r0 + stfpdux f23, SP, r0 + stfpdux f24, SP, r0 + stfpdux f25, SP, r0 + stfpdux f26, SP, r0 + stfpdux f27, SP, r0 + stfpdux f28, SP, r0 + stfpdux f29, SP, r0 + stfpdux f30, SP, r0 + stfpdux f31, SP, r0 + + stwu r31, -4(SP) + stwu r30, -4(SP) + stwu r29, -4(SP) + stwu r28, -4(SP) + + stwu r27, -4(SP) + stwu r26, -4(SP) + stwu r25, -4(SP) + stwu r24, -4(SP) + + stwu r23, -4(SP) + stwu r22, -4(SP) + stwu r21, -4(SP) + stwu r20, -4(SP) + + stwu r19, -4(SP) + stwu r18, -4(SP) + stwu r17, -4(SP) + stwu r16, -4(SP) + + stwu r15, -4(SP) + stwu r14, -4(SP) # dummy + + li r0, 0 + + stwu r0, -4(SP) + stwu r0, -4(SP) + stfdu f1, -8(SP) + + slwi LDC, LDC, BASE_SHIFT + + cmpwi cr0, M, 0 + ble .L999 + cmpwi cr0, N, 0 + ble .L999 + cmpwi cr0, K, 0 + ble .L999 + + li INC, 1 * SIZE + li INC2, 2 * SIZE + li INC4, 4 * SIZE + + li INCM1, -1 * SIZE + li INCM2, -2 * SIZE + li INCM4, -4 * SIZE + + addi C, C, - 1 * SIZE + +#ifdef LN + mullw r0, M, K + slwi r0, r0, BASE_SHIFT + add A, A, r0 + + slwi r0, M, BASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, BASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + srawi. 
J, N, 2 + ble .L50 + .align 4 + +.L10: +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 2 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + addi AORIG, A, -4 * SIZE +#else + addi AO, A, -4 * SIZE +#endif +#ifndef RT + add C, CO4, LDC +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + + andi. I, M, 1 + beq .L20 + +#if defined(LT) || defined(RN) + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, B, - 4 * SIZE + fpmr f2, f0 + addi BO2, B, - 2 * SIZE + fpmr f3, f0 + + srawi. r0, KK, 3 + mtspr CTR, r0 + ble .L44 +#else + +#ifdef LN + slwi r0, K, 0 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 0 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, BO, - 4 * SIZE + fpmr f2, f0 + addi BO2, BO, 2 * SIZE + fpmr f3, f0 + + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L44 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B3, BO, INC4 + LFPDUX B4, BO2, INC4 + + LFPDUX A3, AO, INC4 + LFPDUX A5, BO, INC4 + LFPDUX A6, BO2, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A7, BO, INC4 + LFPDUX A8, BO2, INC4 + bdz- .L43 + .align 4 + +.L42: + fxcpmadd f0, A1, B1, f0 + LFPDUX B1, BO, INC4 + fxcpmadd f1, A1, B2, f1 + LFPDUX B2, BO2, INC4 + fxcsmadd f2, A1, B3, f2 + LFPDUX B3, BO, INC4 + fxcsmadd f3, A1, B4, f3 + LFPDUX B4, BO2, INC4 + LFPDUX A1, AO, INC4 + + fxcpmadd f0, A2, A5, f0 + LFPDUX A5, BO, INC4 + fxcpmadd f1, A2, A6, f1 + LFPDUX A6, BO2, INC4 + fxcsmadd f2, A2, A7, f2 + LFPDUX A7, BO, INC4 + fxcsmadd f3, A2, A8, f3 + LFPDUX A8, BO2, INC4 + LFPDUX A2, AO2, INC4 + + fxcpmadd f0, A3, B1, f0 + LFPDUX B1, BO, INC4 + fxcpmadd f1, A3, B2, f1 + LFPDUX B2, BO2, INC4 + fxcsmadd f2, A3, B3, f2 + LFPDUX B3, BO, INC4 + fxcsmadd f3, A3, B4, f3 + LFPDUX B4, BO2, INC4 + LFPDUX A3, AO, INC4 + + fxcpmadd f0, A4, A5, f0 + LFPDUX A5, BO, INC4 + fxcpmadd f1, A4, A6, f1 + LFPDUX A6, BO2, INC4 + fxcsmadd f2, A4, A7, f2 + LFPDUX A7, BO, INC4 + fxcsmadd f3, A4, A8, f3 + LFPDUX A8, BO2, INC4 + LFPDUX A4, AO2, INC4 + bdnz+ .L42 + .align 4 + +.L43: + fxcpmadd f0, A1, B1, f0 + LFPDUX B1, BO, INC4 + fxcpmadd f1, A1, B2, f1 + LFPDUX B2, BO2, INC4 + fxcsmadd f2, A1, B3, f2 + LFPDUX B3, BO, INC4 + fxcsmadd f3, A1, B4, f3 + LFPDUX B4, BO2, INC4 + + fxcpmadd f0, A2, A5, f0 + LFPDUX A5, BO, INC4 + fxcpmadd f1, A2, A6, f1 + LFPDUX A6, BO2, INC4 + fxcsmadd f2, A2, A7, f2 + LFPDUX A7, BO, INC4 + fxcsmadd f3, A2, A8, f3 + LFPDUX A8, BO2, INC4 + + fxcpmadd f0, A3, B1, f0 + fxcpmadd f1, A3, B2, f1 + fxcsmadd f2, A3, B3, f2 + fxcsmadd f3, A3, B4, f3 + + fxcpmadd f0, A4, A5, f0 + fxcpmadd f1, A4, A6, f1 + fxcsmadd f2, A4, A7, f2 + fxcsmadd f3, A4, A8, f3 + .align 4 + +.L44: +#if defined(LT) || defined(RN) + andi. r0, KK, 7 + mtspr CTR, r0 + ble+ .L48 +#else + andi. 
r0, TEMP, 7 + mtspr CTR, r0 + ble+ .L48 +#endif + + LFDX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC + bdz- .L47 + .align 4 + +.L46: + fxcpmadd f0, A1, B1, f0 + LFPDUX B1, BO, INC4 + fxcpmadd f1, A1, B2, f1 + LFDX A1, AO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC + bdnz+ .L46 + .align 4 + +.L47: + fxcpmadd f0, A1, B1, f0 + fxcpmadd f1, A1, B2, f1 + addi AO2, AO, 2 * SIZE + .align 4 + +.L48: + fpadd f0, f0, f2 + fpadd f1, f1, f3 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi AO2, AO, 2 * SIZE + addi BO, BO, - 4 * SIZE + addi BO2, BO, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDX f16, BO, INC4 + LFPDX f17, BO2, INC4 + + fpsub f0, f16, f0 + fpsub f1, f17, f1 +#else + LFPDX f16, AO, INC4 + LFPDX f17, AO2, INC4 + + fpsub f0, f16, f0 + fpsub f1, f17, f1 +#endif + +#if defined(LN) || defined(LT) + LFPDX A1, AO, INC4 + + fxpmul f0, A1, f0 + fxpmul f1, A1, f1 +#endif + +#ifdef RN + LFD A1, (4 + 0) * SIZE(BO) + LFD A2, (4 + 1) * SIZE(BO) + LFD A3, (4 + 2) * SIZE(BO) + LFD A4, (4 + 3) * SIZE(BO) + + LFD A5, (4 + 5) * SIZE(BO) + LFD A6, (4 + 6) * SIZE(BO) + LFD A7, (4 + 7) * SIZE(BO) + LFD A8, (4 + 10) * SIZE(BO) + + LFD A9, (4 + 11) * SIZE(BO) + LFD A10, (4 + 15) * SIZE(BO) + + fsmtp f2, f0 + fsmtp f3, f1 + + fmul f0, A1, f0 + fnmsub f2, A2, f0, f2 + fnmsub f1, A3, f0, f1 + fnmsub f3, A4, f0, f3 + + fmul f2, A5, f2 + fnmsub f1, A6, f2, f1 + fnmsub f3, A7, f2, f3 + + fmul f1, A8, f1 + fnmsub f3, A9, f1, f3 + + fmul f3, A10, f3 + + fsmfp f0, f2 + fsmfp f1, f3 +#endif + +#ifdef RT + LFD A1, (4 + 15) * SIZE(BO) + LFD A2, (4 + 14) * SIZE(BO) + LFD A3, (4 + 13) * SIZE(BO) + LFD A4, (4 + 12) * SIZE(BO) + + LFD A5, (4 + 10) * SIZE(BO) + LFD A6, (4 + 9) * SIZE(BO) + LFD A7, (4 + 8) * SIZE(BO) + LFD A8, (4 + 5) * SIZE(BO) + + LFD A9, (4 + 4) * SIZE(BO) + LFD A10, (4 + 0) * SIZE(BO) + + fsmtp f2, f0 + fsmtp f3, f1 + + fmul f3, A1, f3 + fnmsub f1, A2, f3, f1 + fnmsub f2, A3, f3, f2 + fnmsub f0, A4, f3, f0 + + fmul f1, A5, f1 + fnmsub f2, A6, f1, f2 + fnmsub f0, A7, f1, f0 + + fmul f2, A8, f2 + fnmsub f0, A9, f2, f0 + + fmul f0, A10, f0 + + fsmfp f0, f2 + fsmfp f1, f3 +#endif + +#if defined(LN) || defined(LT) + STFPDX f0, BO, INC4 + STFPDX f1, BO2, INC4 +#else + STFPDX f0, AO, INC4 + STFPDX f1, AO2, INC4 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE + subi CO3, CO3, 1 * SIZE + subi CO4, CO4, 1 * SIZE +#endif + + STFDX f0, CO1, INC + STFSDX f0, CO2, INC + STFDX f1, CO3, INC + STFSDX f1, CO4, INC + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L20: + andi. I, M, 2 + beq .L30 + +#if defined(LT) || defined(RN) + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 + + srawi. 
r0, KK, 2 + mtspr CTR, r0 + ble .L34 +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 1 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, BO, - 4 * SIZE + fpmr f8, f0 + addi BO2, BO, 2 * SIZE + fpmr f12, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 + ble .L34 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B3, BO, INC4 + LFPDUX B4, BO2, INC4 + + LFPDUX A3, AO, INC4 + LFPDUX A5, BO, INC4 + LFPDUX A6, BO2, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A7, BO, INC4 + LFPDUX A8, BO2, INC4 + bdz- .L33 + .align 4 + +.L32: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + LFPDUX B1, BO, INC4 + fxcpmadd f8, B2, A1, f8 + fxcsmadd f12, B2, A1, f12 + LFPDUX B2, BO2, INC4 + LFPDUX A1, AO, INC4 + + fxcpmadd f0, B3, A2, f0 + fxcsmadd f4, B3, A2, f4 + LFPDUX B3, BO, INC4 + fxcpmadd f8, B4, A2, f8 + fxcsmadd f12, B4, A2, f12 + LFPDUX B4, BO2, INC4 + LFPDUX A2, AO2, INC4 + + fxcpmadd f0, A5, A3, f0 + fxcsmadd f4, A5, A3, f4 + LFPDUX A5, BO, INC4 + fxcpmadd f8, A6, A3, f8 + fxcsmadd f12, A6, A3, f12 + LFPDUX A6, BO2, INC4 + LFPDUX A3, AO, INC4 + + fxcpmadd f0, A7, A4, f0 + fxcsmadd f4, A7, A4, f4 + LFPDUX A7, BO, INC4 + fxcpmadd f8, A8, A4, f8 + fxcsmadd f12, A8, A4, f12 + LFPDUX A8, BO2, INC4 + LFPDUX A4, AO2, INC4 + bdnz+ .L32 + .align 4 + +.L33: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + fxcpmadd f8, B2, A1, f8 + fxcsmadd f12, B2, A1, f12 + + fxcpmadd f0, B3, A2, f0 + fxcsmadd f4, B3, A2, f4 + fxcpmadd f8, B4, A2, f8 + fxcsmadd f12, B4, A2, f12 + + fxcpmadd f0, A5, A3, f0 + fxcsmadd f4, A5, A3, f4 + fxcpmadd f8, A6, A3, f8 + fxcsmadd f12, A6, A3, f12 + + fxcpmadd f0, A7, A4, f0 + fxcsmadd f4, A7, A4, f4 + fxcpmadd f8, A8, A4, f8 + fxcsmadd f12, A8, A4, f12 + .align 4 + +.L34: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L38 +#else + andi. 
r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L38 +#endif + + LFPDX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC2 + bdz- .L37 + .align 4 + +.L36: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + LFPDUX B1, BO, INC4 + fxcpmadd f8, B2, A1, f8 + fxcsmadd f12, B2, A1, f12 + LFPDX A1, AO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC2 + bdnz+ .L36 + .align 4 + +.L37: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + fxcpmadd f8, B2, A1, f8 + fxcsmadd f12, B2, A1, f12 + .align 4 + +.L38: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi AO2, AO, 2 * SIZE + addi BO, BO, - 4 * SIZE + addi BO2, BO, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + fpmr f24, f0 + fpmr f28, f8 + + fsmfp f0, f4 + fsmfp f8, f12 + fsmtp f4, f24 + fsmtp f12, f28 + + LFPDUX f16, BO, INC4 + LFPDUX f17, BO2, INC4 + LFPDUX f18, BO, INC4 + LFPDUX f19, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE + + fpsub f0, f16, f0 + fpsub f8, f17, f8 + fpsub f4, f18, f4 + fpsub f12, f19, f12 +#else + LFPDUX f16, AO, INC4 + LFPDUX f17, AO2, INC4 + LFPDUX f18, AO, INC4 + LFPDUX f19, AO2, INC4 + + subi AO, AO, 8 * SIZE + subi AO2, AO2, 8 * SIZE + + fpsub f0, f16, f0 + fpsub f4, f17, f4 + fpsub f8, f18, f8 + fpsub f12, f19, f12 +#endif + +#ifdef LN + addi AO, AO, 8 * SIZE + addi AO2, AO2, 8 * SIZE + + LFPDUX A1, AO2, INCM4 + LFPDUX A2, AO, INCM4 + + addi AO, AO, -4 * SIZE + addi AO2, AO2, -4 * SIZE + + fxsmul f4, A1, f4 + fxsmul f12, A1, f12 + + fxcpnmsub f0, A1, f4, f0 + fxcpnmsub f8, A1, f12, f8 + + fxpmul f0, A2, f0 + fxpmul f8, A2, f8 +#endif + +#ifdef LT + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + + subi AO, AO, 4 * SIZE + subi AO2, AO2, 4 * SIZE + + fxpmul f0, A1, f0 + fxpmul f8, A1, f8 + + fxcsnmsub f4, A1, f0, f4 + fxcsnmsub f12, A1, f8, f12 + + fxsmul f4, A2, f4 + fxsmul f12, A2, f12 +#endif + +#ifdef RN + LFPDUX A1, BO, INC4 + LFPDUX A2, BO2, INC4 + LFPDUX A3, BO, INC4 + LFPDUX A4, BO2, INC4 + + add BO, BO, INC4 + LFPDUX A5, BO2, INC4 + + add BO, BO, INC4 + LFPDUX A6, BO2, INC4 + + subi BO, BO, 16 * SIZE + subi BO2, BO2, 16 * SIZE + + fxpmul f0, A1, f0 + fxcsnmsub f4, A1, f0, f4 + fxcpnmsub f8, A2, f0, f8 + fxcsnmsub f12, A2, f0, f12 + + fxsmul f4, A3, f4 + fxcpnmsub f8, A4, f4, f8 + fxcsnmsub f12, A4, f4, f12 + + fxpmul f8, A5, f8 + fxcsnmsub f12, A5, f8, f12 + fxsmul f12, A6, f12 +#endif + +#ifdef RT + addi BO, BO, 20 * SIZE + addi BO2, BO2, 20 * SIZE + + LFPDUX A1, BO2, INCM4 + LFPDUX A2, BO, INCM4 + + LFPDUX A3, BO2, INCM4 + LFPDUX A4, BO, INCM4 + + add BO2, BO2, INCM4 + LFPDUX A5, BO, INCM4 + + add BO2, BO2, INCM4 + LFPDUX A6, BO, INCM4 + subi BO, BO, 4 * SIZE + subi BO2, BO2, 4 * SIZE + + fxsmul f12, A1, f12 + fxcpnmsub f8, A1, f12, f8 + fxcsnmsub f4, A2, f12, f4 + fxcpnmsub f0, A2, f12, f0 + + fxpmul f8, A3, f8 + fxcsnmsub f4, A4, f8, f4 + fxcpnmsub f0, A4, f8, f0 + + fxsmul f4, A5, f4 + fxcpnmsub f0, A5, f4, f0 + fxpmul f0, A6, f0 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE + subi CO3, CO3, 2 * SIZE + subi CO4, CO4, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC4 + STFPDUX f8, BO2, INC4 + STFPDUX f4, BO, INC4 + STFPDUX f12, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE + + STFDUX f0, CO1, INC + STFDUX f4, CO1, INC + STFSDUX f0, CO2, INC + STFSDUX f4, CO2, INC + + STFDUX f8, CO3, INC + STFDUX f12, CO3, INC + STFSDUX f8, CO4, INC + STFSDUX f12, CO4, INC + 
+#else + STFPDUX f0, AO, INC4 + STFPDUX f4, AO2, INC4 + STFPDUX f8, AO, INC4 + STFPDUX f12, AO2, INC4 + + subi AO, AO, 8 * SIZE + subi AO2, AO2, 8 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f4, CO2, INC + STFSDUX f4, CO2, INC + + STFDUX f8, CO3, INC + STFSDUX f8, CO3, INC + STFDUX f12, CO4, INC + STFSDUX f12, CO4, INC +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE + subi CO3, CO3, 2 * SIZE + subi CO4, CO4, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L30: + andi. I, M, 4 + beq .L40 + +#if defined(LT) || defined(RN) + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 + + srawi. r0, KK, 2 + fpmr f1, f0 + fpmr f5, f0 + fpmr f9, f0 + mtspr CTR, r0 + fpmr f13, f0 + ble .L24 +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 2 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, BO, - 4 * SIZE + fpmr f8, f0 + addi BO2, BO, 2 * SIZE + fpmr f12, f0 + + srawi. r0, TEMP, 2 + fpmr f1, f0 + fpmr f5, f0 + fpmr f9, f0 + mtspr CTR, r0 + fpmr f13, f0 + ble .L24 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B2, BO2, INC4 + LFPDUX A3, AO, INC4 + LFPDUX B3, BO, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX B4, BO2, INC4 + + LFPDUX A5, AO, INC4 + LFPDUX B5, BO, INC4 + LFPDUX A6, AO2, INC4 + LFPDUX B6, BO2, INC4 + LFPDUX A7, AO, INC4 + LFPDUX A9, BO, INC4 + LFPDUX A10, BO2, INC4 + bdz- .L23 + .align 4 + +.L22: + fxcpmadd f0, B1, A1, f0 + nop + fxcsmadd f4, B1, A1, f4 + LFPDUX A8, AO2, INC4 + fxcpmadd f8, B2, A1, f8 + nop + fxcsmadd f12, B2, A1, f12 + LFPDUX A1, AO, INC4 + + fxcpmadd f1, B1, A2, f1 + nop + fxcsmadd f5, B1, A2, f5 + LFPDUX B1, BO, INC4 + fxcpmadd f9, B2, A2, f9 + nop + fxcsmadd f13, B2, A2, f13 + LFPDUX B2, BO2, INC4 + + fxcpmadd f0, B3, A3, f0 + nop + fxcsmadd f4, B3, A3, f4 + LFPDUX A2, AO2, INC4 + fxcpmadd f8, B4, A3, f8 + nop + fxcsmadd f12, B4, A3, f12 + LFPDUX A3, AO, INC4 + + fxcpmadd f1, B3, A4, f1 + nop + fxcsmadd f5, B3, A4, f5 + LFPDUX B3, BO, INC4 + fxcpmadd f9, B4, A4, f9 + nop + fxcsmadd f13, B4, A4, f13 + LFPDUX B4, BO2, INC4 + + fxcpmadd f0, B5, A5, f0 + nop + fxcsmadd f4, B5, A5, f4 + LFPDUX A4, AO2, INC4 + fxcpmadd f8, B6, A5, f8 + nop + fxcsmadd f12, B6, A5, f12 + LFPDUX A5, AO, INC4 + + fxcpmadd f1, B5, A6, f1 + nop + fxcsmadd f5, B5, A6, f5 + LFPDUX B5, BO, INC4 + fxcpmadd f9, B6, A6, f9 + nop + fxcsmadd f13, B6, A6, f13 + LFPDUX B6, BO2, INC4 + + fxcpmadd f0, A9, A7, f0 + nop + fxcsmadd f4, A9, A7, f4 + LFPDUX A6, AO2, INC4 + fxcpmadd f8, A10, A7, f8 + nop + fxcsmadd f12, A10, A7, f12 + LFPDUX A7, AO, INC4 + + fxcpmadd f1, A9, A8, f1 + nop + fxcsmadd f5, A9, A8, f5 + LFPDUX A9, BO, INC4 + fxcpmadd f9, A10, A8, f9 + nop + fxcsmadd f13, A10, A8, f13 + LFPDUX A10, BO2, INC4 + bdnz+ .L22 + .align 4 + +.L23: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + LFPDUX A8, AO2, INC4 + fxcpmadd f8, B2, A1, f8 + fxcsmadd f12, B2, A1, f12 + + fxcpmadd f1, B1, A2, f1 + fxcsmadd f5, B1, A2, f5 + fxcpmadd f9, B2, A2, f9 + fxcsmadd f13, B2, A2, f13 + + fxcpmadd f0, B3, 
A3, f0 + fxcsmadd f4, B3, A3, f4 + fxcpmadd f8, B4, A3, f8 + fxcsmadd f12, B4, A3, f12 + + fxcpmadd f1, B3, A4, f1 + fxcsmadd f5, B3, A4, f5 + fxcpmadd f9, B4, A4, f9 + fxcsmadd f13, B4, A4, f13 + + fxcpmadd f0, B5, A5, f0 + fxcsmadd f4, B5, A5, f4 + fxcpmadd f8, B6, A5, f8 + fxcsmadd f12, B6, A5, f12 + + fxcpmadd f1, B5, A6, f1 + fxcsmadd f5, B5, A6, f5 + fxcpmadd f9, B6, A6, f9 + fxcsmadd f13, B6, A6, f13 + + fxcpmadd f0, A9, A7, f0 + fxcsmadd f4, A9, A7, f4 + fxcpmadd f8, A10, A7, f8 + fxcsmadd f12, A10, A7, f12 + + fxcpmadd f1, A9, A8, f1 + fxcsmadd f5, A9, A8, f5 + fxcpmadd f9, A10, A8, f9 + fxcsmadd f13, A10, A8, f13 + .align 4 + +.L24: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L28 +#else + andi. r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L28 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + bdz- .L27 + .align 4 + +.L26: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + fxcpmadd f8, B2, A1, f8 + fxcsmadd f12, B2, A1, f12 + LFPDUX A1, AO, INC4 + + fxcpmadd f1, B1, A2, f1 + fxcsmadd f5, B1, A2, f5 + LFPDUX B1, BO, INC4 + fxcpmadd f9, B2, A2, f9 + fxcsmadd f13, B2, A2, f13 + LFPDUX A2, AO2, INC4 + LFPDUX B2, BO2, INC4 + bdnz+ .L26 + .align 4 + +.L27: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + fxcpmadd f8, B2, A1, f8 + fxcsmadd f12, B2, A1, f12 + + fxcpmadd f1, B1, A2, f1 + fxcsmadd f5, B1, A2, f5 + fxcpmadd f9, B2, A2, f9 + fxcsmadd f13, B2, A2, f13 + .align 4 + +.L28: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi AO2, AO, 2 * SIZE + addi BO, BO, - 4 * SIZE + addi BO2, BO, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + fpmr f24, f0 + fpmr f25, f1 + fpmr f28, f8 + fpmr f29, f9 + + fsmfp f0, f4 + fsmfp f1, f5 + fsmfp f8, f12 + fsmfp f9, f13 + + fsmtp f4, f24 + fsmtp f5, f25 + fsmtp f12, f28 + fsmtp f13, f29 + + LFPDUX f16, BO, INC4 + LFPDUX f17, BO2, INC4 + LFPDUX f18, BO, INC4 + LFPDUX f19, BO2, INC4 + + LFPDUX f20, BO, INC4 + LFPDUX f21, BO2, INC4 + LFPDUX f22, BO, INC4 + LFPDUX f23, BO2, INC4 + + subi BO, BO, 16 * SIZE + subi BO2, BO2, 16 * SIZE + + fpsub f0, f16, f0 + fpsub f8, f17, f8 + fpsub f4, f18, f4 + fpsub f12, f19, f12 + + fpsub f1, f20, f1 + fpsub f9, f21, f9 + fpsub f5, f22, f5 + fpsub f13, f23, f13 +#else + LFPDUX f16, AO, INC4 + LFPDUX f17, AO2, INC4 + LFPDUX f18, AO, INC4 + LFPDUX f19, AO2, INC4 + LFPDUX f20, AO, INC4 + LFPDUX f21, AO2, INC4 + LFPDUX f22, AO, INC4 + LFPDUX f23, AO2, INC4 + + subi AO, AO, 16 * SIZE + subi AO2, AO2, 16 * SIZE + + fpsub f0, f16, f0 + fpsub f1, f17, f1 + fpsub f4, f18, f4 + fpsub f5, f19, f5 + + fpsub f8, f20, f8 + fpsub f9, f21, f9 + fpsub f12, f22, f12 + fpsub f13, f23, f13 +#endif + +#ifdef LN + addi AO, AO, 20 * SIZE + addi AO2, AO2, 20 * SIZE + + LFPDUX A1, AO2, INCM4 + LFPDUX A2, AO, INCM4 + LFPDUX A3, AO2, INCM4 + LFPDUX A4, AO, INCM4 + + add AO2, AO2, INCM4 + LFPDUX A5, AO, INCM4 + add AO2, AO2, INCM4 + LFPDUX A6, AO, INCM4 + + addi AO, AO, -4 * SIZE + addi AO2, AO2, -4 * SIZE + + fxsmul f5, A1, f5 + fxsmul f13, A1, f13 + + fxcpnmsub f1, A1, f5, f1 + fxcpnmsub f9, A1, f13, f9 + + fxcsnmsub f4, A2, f5, f4 + fxcsnmsub f12, A2, f13, f12 + + fxcpnmsub f0, A2, f5, f0 + fxcpnmsub f8, A2, f13, f8 + + fxpmul f1, A3, f1 + fxpmul f9, A3, f9 + + fxcsnmsub f4, A4, f1, f4 + fxcsnmsub f12, A4, f9, f12 + + fxcpnmsub f0, A4, f1, f0 + fxcpnmsub f8, A4, f9, f8 + + fxsmul f4, A5, f4 + fxsmul f12, A5, f12 + 
+ fxcpnmsub f0, A5, f4, f0 + fxcpnmsub f8, A5, f12, f8 + + fxpmul f0, A6, f0 + fxpmul f8, A6, f8 +#endif + +#ifdef LT + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX A3, AO, INC4 + LFPDUX A4, AO2, INC4 + + add AO, AO, INC4 + LFPDUX A5, AO2, INC4 + add AO, AO, INC4 + LFPDUX A6, AO2, INC4 + + subi AO, AO, 16 * SIZE + subi AO2, AO2, 16 * SIZE + + fxpmul f0, A1, f0 + fxpmul f8, A1, f8 + + fxcsnmsub f4, A1, f0, f4 + fxcsnmsub f12, A1, f8, f12 + + fxcpnmsub f1, A2, f0, f1 + fxcpnmsub f9, A2, f8, f9 + + fxcsnmsub f5, A2, f0, f5 + fxcsnmsub f13, A2, f8, f13 + + fxsmul f4, A3, f4 + fxsmul f12, A3, f12 + + fxcpnmsub f1, A4, f4, f1 + fxcpnmsub f9, A4, f12, f9 + + fxcsnmsub f5, A4, f4, f5 + fxcsnmsub f13, A4, f12, f13 + + fxpmul f1, A5, f1 + fxpmul f9, A5, f9 + + fxcsnmsub f5, A5, f1, f5 + fxcsnmsub f13, A5, f9, f13 + + fxsmul f5, A6, f5 + fxsmul f13, A6, f13 +#endif + +#ifdef RN + LFPDUX A1, BO, INC4 + LFPDUX A2, BO2, INC4 + LFPDUX A3, BO, INC4 + LFPDUX A4, BO2, INC4 + + add BO, BO, INC4 + LFPDUX A5, BO2, INC4 + + add BO, BO, INC4 + LFPDUX A6, BO2, INC4 + + subi BO, BO, 16 * SIZE + subi BO2, BO2, 16 * SIZE + + fxpmul f0, A1, f0 + fxpmul f1, A1, f1 + fxcsnmsub f4, A1, f0, f4 + fxcsnmsub f5, A1, f1, f5 + + fxcpnmsub f8, A2, f0, f8 + fxcpnmsub f9, A2, f1, f9 + fxcsnmsub f12, A2, f0, f12 + fxcsnmsub f13, A2, f1, f13 + + fxsmul f4, A3, f4 + fxsmul f5, A3, f5 + fxcpnmsub f8, A4, f4, f8 + fxcpnmsub f9, A4, f5, f9 + + fxcsnmsub f12, A4, f4, f12 + fxcsnmsub f13, A4, f5, f13 + + fxpmul f8, A5, f8 + fxpmul f9, A5, f9 + fxcsnmsub f12, A5, f8, f12 + fxcsnmsub f13, A5, f9, f13 + + fxsmul f12, A6, f12 + fxsmul f13, A6, f13 +#endif + +#ifdef RT + addi BO, BO, 20 * SIZE + addi BO2, BO2, 20 * SIZE + + LFPDUX A1, BO2, INCM4 + LFPDUX A2, BO, INCM4 + + LFPDUX A3, BO2, INCM4 + LFPDUX A4, BO, INCM4 + + add BO2, BO2, INCM4 + LFPDUX A5, BO, INCM4 + + add BO2, BO2, INCM4 + LFPDUX A6, BO, INCM4 + subi BO, BO, 4 * SIZE + subi BO2, BO2, 4 * SIZE + + fxsmul f12, A1, f12 + fxsmul f13, A1, f13 + fxcpnmsub f8, A1, f12, f8 + fxcpnmsub f9, A1, f13, f9 + + fxcsnmsub f4, A2, f12, f4 + fxcsnmsub f5, A2, f13, f5 + fxcpnmsub f0, A2, f12, f0 + fxcpnmsub f1, A2, f13, f1 + + fxpmul f8, A3, f8 + fxpmul f9, A3, f9 + fxcsnmsub f4, A4, f8, f4 + fxcsnmsub f5, A4, f9, f5 + + fxcpnmsub f0, A4, f8, f0 + fxcpnmsub f1, A4, f9, f1 + + fxsmul f4, A5, f4 + fxsmul f5, A5, f5 + fxcpnmsub f0, A5, f4, f0 + fxcpnmsub f1, A5, f5, f1 + + fxpmul f0, A6, f0 + fxpmul f1, A6, f1 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE + subi CO3, CO3, 4 * SIZE + subi CO4, CO4, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC4 + STFPDUX f8, BO2, INC4 + STFPDUX f4, BO, INC4 + STFPDUX f12, BO2, INC4 + STFPDUX f1, BO, INC4 + STFPDUX f9, BO2, INC4 + STFPDUX f5, BO, INC4 + STFPDUX f13, BO2, INC4 + + subi BO, BO, 16 * SIZE + subi BO2, BO2, 16 * SIZE + + STFDUX f0, CO1, INC + STFDUX f4, CO1, INC + STFDUX f1, CO1, INC + STFDUX f5, CO1, INC + + STFSDUX f0, CO2, INC + STFSDUX f4, CO2, INC + STFSDUX f1, CO2, INC + STFSDUX f5, CO2, INC + + STFDUX f8, CO3, INC + STFDUX f12, CO3, INC + STFDUX f9, CO3, INC + STFDUX f13, CO3, INC + + STFSDUX f8, CO4, INC + STFSDUX f12, CO4, INC + STFSDUX f9, CO4, INC + STFSDUX f13, CO4, INC +#else + STFPDUX f0, AO, INC4 + STFPDUX f1, AO2, INC4 + STFPDUX f4, AO, INC4 + STFPDUX f5, AO2, INC4 + STFPDUX f8, AO, INC4 + STFPDUX f9, AO2, INC4 + STFPDUX f12, AO, INC4 + STFPDUX f13, AO2, INC4 + + subi AO, AO, 16 * SIZE + subi AO2, AO2, 16 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC 
+ STFSDUX f1, CO1, INC + STFDUX f4, CO2, INC + STFSDUX f4, CO2, INC + STFDUX f5, CO2, INC + STFSDUX f5, CO2, INC + + STFDUX f8, CO3, INC + STFSDUX f8, CO3, INC + STFDUX f9, CO3, INC + STFSDUX f9, CO3, INC + STFDUX f12, CO4, INC + STFSDUX f12, CO4, INC + STFDUX f13, CO4, INC + STFSDUX f13, CO4, INC +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE + subi CO3, CO3, 4 * SIZE + subi CO4, CO4, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L40: + srawi. I, M, 3 + ble .L49 + .align 4 + +.L11: +#if defined(LT) || defined(RN) + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 + + fpmr f5, f0 + fpmr f9, f0 + fpmr f13, f0 + fpmr f2, f0 + + fpmr f6, f0 + fpmr f10, f0 + fpmr f14, f0 + fpmr f3, f0 + + fpmr f7, f0 + fpmr f11, f0 + fpmr f15, f0 + nop + + srawi. r0, KK, 2 + fpmr f1, f0 + mtspr CTR, r0 + ble .L14 +#else + +#ifdef LN + slwi r0, K, 3 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 3 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, BO, - 4 * SIZE + fpmr f8, f0 + addi BO2, BO, 2 * SIZE + fpmr f12, f0 + + fpmr f5, f0 + fpmr f9, f0 + fpmr f13, f0 + fpmr f2, f0 + + fpmr f6, f0 + fpmr f10, f0 + fpmr f14, f0 + fpmr f3, f0 + + fpmr f7, f0 + fpmr f11, f0 + fpmr f15, f0 + nop + + srawi. r0, TEMP, 2 + fpmr f1, f0 + mtspr CTR, r0 + ble .L14 +#endif + + LFPDUX A1, AO, INC4 + fpmr f5, f0 + LFPDUX A3, AO, INC4 + fpmr f9, f0 + LFPDUX B1, BO, INC4 + fpmr f13, f0 + + LFPDUX A5, AO, INC4 + fpmr f2, f0 + LFPDUX A6, AO, INC4 + fpmr f6, f0 + LFPDUX B3, BO, INC4 + fpmr f10, f0 + LFPDUX A7, AO, INC4 + fpmr f14, f0 + + LFPDUX A8, AO, INC4 + fpmr f3, f0 + LFPDUX B5, BO, INC4 + fpmr f7, f0 + LFPDUX A9, AO, INC4 + fpmr f11, f0 + LFPDUX A2, AO2, INC4 + fpmr f15, f0 + LFPDUX B2, BO2, INC4 + bdz- .L13 + .align 4 + +.L12: + +## 1 ## + fxcpmadd f0, B1, A1, f0 + nop + fxcsmadd f4, B1, A1, f4 + nop + fxcpmadd f8, B2, A1, f8 + LFPDUX B4, BO2, INC4 + fxcsmadd f12, B2, A1, f12 + LFPDUX B6, BO, INC4 + + fxcpmadd f1, B1, A2, f1 + nop + fxcsmadd f5, B1, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B2, A2, f9 + LFPDUX A10, AO, INC4 + fxcsmadd f13, B2, A2, f13 + nop + + fxcpmadd f2, B1, A3, f2 + nop + fxcsmadd f6, B1, A3, f6 + nop + fxcpmadd f10, B2, A3, f10 + nop + fxcsmadd f14, B2, A3, f14 + nop + + fxcpmadd f3, B1, A4, f3 + nop + fxcsmadd f7, B1, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B2, A4, f11 + LFPDUX A1, AO, INC4 + fxcsmadd f15, B2, A4, f15 + nop + +## 2 ## + + fxcpmadd f0, B3, A5, f0 + nop + fxcsmadd f4, B3, A5, f4 + nop + fxcpmadd f8, B4, A5, f8 + LFPDUX B2, BO2, INC4 + fxcsmadd f12, B4, A5, f12 + LFPDUX B1, BO, INC4 + + fxcpmadd f1, B3, A2, f1 + nop + fxcsmadd f5, B3, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B4, A2, f9 + LFPDUX A3, AO, INC4 + fxcsmadd f13, B4, A2, f13 + nop + + fxcpmadd f2, B3, A6, f2 + nop + fxcsmadd f6, B3, A6, f6 + nop + fxcpmadd f10, B4, A6, f10 + nop + fxcsmadd f14, B4, A6, f14 + nop + + fxcpmadd f3, B3, A4, f3 + nop + fxcsmadd f7, B3, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B4, A4, f11 + LFPDUX A5, AO, INC4 + fxcsmadd f15, B4, A4, f15 + nop + +## 3 ## 
+ + fxcpmadd f0, B5, A7, f0 + nop + fxcsmadd f4, B5, A7, f4 + nop + fxcpmadd f8, B2, A7, f8 + LFPDUX B4, BO2, INC4 + fxcsmadd f12, B2, A7, f12 + LFPDUX B3, BO, INC4 + + fxcpmadd f1, B5, A2, f1 + nop + fxcsmadd f5, B5, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B2, A2, f9 + LFPDUX A6, AO, INC4 + fxcsmadd f13, B2, A2, f13 + nop + + fxcpmadd f2, B5, A8, f2 + nop + fxcsmadd f6, B5, A8, f6 + nop + fxcpmadd f10, B2, A8, f10 + nop + fxcsmadd f14, B2, A8, f14 + nop + + fxcpmadd f3, B5, A4, f3 + nop + fxcsmadd f7, B5, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B2, A4, f11 + LFPDUX A7, AO, INC4 + fxcsmadd f15, B2, A4, f15 + nop + +## 4 ## + fxcpmadd f0, B6, A9, f0 + nop + fxcsmadd f4, B6, A9, f4 + nop + fxcpmadd f8, B4, A9, f8 + LFPDUX B2, BO2, INC4 + fxcsmadd f12, B4, A9, f12 + LFPDUX B5, BO, INC4 + + fxcpmadd f1, B6, A2, f1 + nop + fxcsmadd f5, B6, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B4, A2, f9 + LFPDUX A8, AO, INC4 + fxcsmadd f13, B4, A2, f13 + nop + + fxcpmadd f2, B6, A10, f2 + nop + fxcsmadd f6, B6, A10, f6 + nop + fxcpmadd f10, B4, A10, f10 + nop + fxcsmadd f14, B4, A10, f14 + nop + + fxcpmadd f3, B6, A4, f3 + LFPDUX A2, AO2, INC4 + fxcsmadd f7, B6, A4, f7 + LFPDUX A9, AO, INC4 + fxcpmadd f11, B4, A4, f11 + nop + fxcsmadd f15, B4, A4, f15 + bdnz+ .L12 + .align 4 + +.L13: +## 1 ## + + fxcpmadd f0, B1, A1, f0 + nop + fxcsmadd f4, B1, A1, f4 + nop + fxcpmadd f8, B2, A1, f8 + LFPDUX B4, BO2, INC4 + fxcsmadd f12, B2, A1, f12 + LFPDUX B6, BO, INC4 + + fxcpmadd f1, B1, A2, f1 + nop + fxcsmadd f5, B1, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B2, A2, f9 + LFPDUX A10, AO, INC4 + fxcsmadd f13, B2, A2, f13 + nop + + fxcpmadd f2, B1, A3, f2 + nop + fxcsmadd f6, B1, A3, f6 + nop + fxcpmadd f10, B2, A3, f10 + nop + fxcsmadd f14, B2, A3, f14 + nop + + fxcpmadd f3, B1, A4, f3 + nop + fxcsmadd f7, B1, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B2, A4, f11 + nop + fxcsmadd f15, B2, A4, f15 + nop + +## 2 ## + + fxcpmadd f0, B3, A5, f0 + nop + fxcsmadd f4, B3, A5, f4 + nop + fxcpmadd f8, B4, A5, f8 + LFPDUX B2, BO2, INC4 + fxcsmadd f12, B4, A5, f12 + nop + + fxcpmadd f1, B3, A2, f1 + nop + fxcsmadd f5, B3, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B4, A2, f9 + nop + fxcsmadd f13, B4, A2, f13 + nop + + fxcpmadd f2, B3, A6, f2 + nop + fxcsmadd f6, B3, A6, f6 + nop + fxcpmadd f10, B4, A6, f10 + nop + fxcsmadd f14, B4, A6, f14 + nop + + fxcpmadd f3, B3, A4, f3 + nop + fxcsmadd f7, B3, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B4, A4, f11 + nop + fxcsmadd f15, B4, A4, f15 + nop + +## 3 ## + + fxcpmadd f0, B5, A7, f0 + nop + fxcsmadd f4, B5, A7, f4 + nop + fxcpmadd f8, B2, A7, f8 + LFPDUX B4, BO2, INC4 + fxcsmadd f12, B2, A7, f12 + nop + + fxcpmadd f1, B5, A2, f1 + nop + fxcsmadd f5, B5, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B2, A2, f9 + nop + + fxcsmadd f13, B2, A2, f13 + + fxcpmadd f2, B5, A8, f2 + nop + fxcsmadd f6, B5, A8, f6 + nop + fxcpmadd f10, B2, A8, f10 + nop + fxcsmadd f14, B2, A8, f14 + nop + + fxcpmadd f3, B5, A4, f3 + nop + fxcsmadd f7, B5, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B2, A4, f11 + nop + fxcsmadd f15, B2, A4, f15 + nop + +## 4 ## + + fxcpmadd f0, B6, A9, f0 + nop + fxcsmadd f4, B6, A9, f4 + nop + fxcpmadd f8, B4, A9, f8 + nop + fxcsmadd f12, B4, A9, f12 + nop + + fxcpmadd f1, B6, A2, f1 + nop + fxcsmadd f5, B6, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B4, A2, f9 + nop + fxcsmadd f13, B4, A2, f13 + nop + + fxcpmadd f2, B6, A10, f2 + nop + fxcsmadd f6, B6, A10, f6 + nop + fxcpmadd f10, B4, A10, f10 + nop + fxcsmadd f14, B4, A10, f14 + nop + + fxcpmadd 
f3, B6, A4, f3 + nop + fxcsmadd f7, B6, A4, f7 + nop + fxcpmadd f11, B4, A4, f11 + nop + fxcsmadd f15, B4, A4, f15 + nop + .align 4 + +.L14: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L18 +#else + andi. r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L18 +#endif + .align 4 + +.L15: + LFPDUX A2, AO, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A10, BO, INC4 + LFPDUX B4, BO2, INC4 + bdz- .L17 + .align 4 + +.L16: + fxcpmadd f0, A10, A2, f0 + fxcsmadd f4, A10, A2, f4 + fxcpmadd f8, B4, A2, f8 + fxcsmadd f12, B4, A2, f12 + LFPDUX A2, AO, INC4 + + fxcpmadd f1, A10, A4, f1 + fxcsmadd f5, A10, A4, f5 + fxcpmadd f9, B4, A4, f9 + fxcsmadd f13, B4, A4, f13 + LFPDUX A4, AO2, INC4 + + fxcpmadd f2, A10, A2, f2 + fxcsmadd f6, A10, A2, f6 + fxcpmadd f10, B4, A2, f10 + fxcsmadd f14, B4, A2, f14 + LFPDUX A2, AO, INC4 + + fxcpmadd f3, A10, A4, f3 + fxcsmadd f7, A10, A4, f7 + LFPDUX A10, BO, INC4 + fxcpmadd f11, B4, A4, f11 + fxcsmadd f15, B4, A4, f15 + LFPDUX A4, AO2, INC4 + LFPDUX B4, BO2, INC4 + bdnz+ .L16 + .align 4 + +.L17: + fxcpmadd f0, A10, A2, f0 + fxcsmadd f4, A10, A2, f4 + fxcpmadd f8, B4, A2, f8 + fxcsmadd f12, B4, A2, f12 + LFPDUX A2, AO, INC4 + + fxcpmadd f1, A10, A4, f1 + fxcsmadd f5, A10, A4, f5 + fxcpmadd f9, B4, A4, f9 + fxcsmadd f13, B4, A4, f13 + LFPDUX A4, AO2, INC4 + + fxcpmadd f2, A10, A2, f2 + fxcsmadd f6, A10, A2, f6 + fxcpmadd f10, B4, A2, f10 + fxcsmadd f14, B4, A2, f14 + + fxcpmadd f3, A10, A4, f3 + fxcsmadd f7, A10, A4, f7 + fxcpmadd f11, B4, A4, f11 + fxcsmadd f15, B4, A4, f15 + .align 4 + +.L18: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 8 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 3 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi AO2, AO, 2 * SIZE + addi BO, BO, - 4 * SIZE + addi BO2, BO, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + fpmr f24, f0 + LFPDUX f16, BO, INC4 + fpmr f25, f1 + nop + fpmr f26, f2 + LFPDUX f17, BO2, INC4 + fpmr f27, f3 + nop + + fpmr f28, f8 + LFPDUX f18, BO, INC4 + fpmr f29, f9 + nop + fpmr f30, f10 + LFPDUX f19, BO2, INC4 + fpmr f31, f11 + nop + + fsmfp f0, f4 + LFPDUX f20, BO, INC4 + fsmfp f1, f5 + nop + fsmfp f2, f6 + LFPDUX f21, BO2, INC4 + fsmfp f3, f7 + nop + + fsmfp f8, f12 + LFPDUX f22, BO, INC4 + fsmfp f9, f13 + nop + fsmfp f10, f14 + LFPDUX f23, BO2, INC4 + fsmfp f11, f15 + nop + + fsmtp f4, f24 + LFPDUX f24, BO, INC4 + fsmtp f5, f25 + nop + fsmtp f6, f26 + LFPDUX f25, BO2, INC4 + fsmtp f7, f27 + nop + + fsmtp f12, f28 + LFPDUX f26, BO, INC4 + fsmtp f13, f29 + nop + fsmtp f14, f30 + LFPDUX f27, BO2, INC4 + fsmtp f15, f31 + nop + + fpsub f0, f16, f0 + LFPDUX f28, BO, INC4 + fpsub f8, f17, f8 + nop + fpsub f4, f18, f4 + LFPDUX f29, BO2, INC4 + fpsub f12, f19, f12 + nop + + fpsub f1, f20, f1 + LFPDUX f30, BO, INC4 + fpsub f9, f21, f9 + subi BO, BO, 32 * SIZE + fpsub f5, f22, f5 + LFPDUX f31, BO2, INC4 + fpsub f13, f23, f13 + subi BO2, BO2, 32 * SIZE + + fpsub f2, f24, f2 + fpsub f10, f25, f10 + fpsub f6, f26, f6 + fpsub f14, f27, f14 + fpsub f3, f28, f3 + fpsub f11, f29, f11 + fpsub f7, f30, f7 + fpsub f15, f31, f15 + +#else + LFPDUX f16, AO, INC4 + LFPDUX f17, AO2, INC4 + LFPDUX f18, AO, INC4 + LFPDUX f19, AO2, INC4 + LFPDUX f20, AO, INC4 + LFPDUX f21, AO2, INC4 + LFPDUX f22, AO, INC4 + LFPDUX f23, AO2, INC4 + + fpsub f0, f16, f0 + LFPDUX f24, AO, INC4 + fpsub f1, f17, f1 + LFPDUX f25, AO2, INC4 + fpsub f2, f18, f2 + LFPDUX f26, AO, INC4 + fpsub f3, f19, f3 + LFPDUX f27, AO2, INC4 + fpsub f4, f20, f4 + LFPDUX f28, AO, INC4 + fpsub f5, f21, f5 + LFPDUX f29, AO2, INC4 + 
fpsub f6, f22, f6 + LFPDUX f30, AO, INC4 + fpsub f7, f23, f7 + LFPDUX f31, AO2, INC4 + + fpsub f8, f24, f8 + subi AO, AO, 32 * SIZE + fpsub f9, f25, f9 + subi AO2, AO2, 32 * SIZE + fpsub f10, f26, f10 + fpsub f11, f27, f11 + fpsub f12, f28, f12 + fpsub f13, f29, f13 + fpsub f14, f30, f14 + fpsub f15, f31, f15 +#endif + +#ifdef LN + addi AO, AO, 68 * SIZE + addi AO2, AO2, 68 * SIZE + + LFPDUX A1, AO2, INCM4 + LFPDUX A2, AO, INCM4 + LFPDUX A3, AO2, INCM4 + LFPDUX A4, AO, INCM4 + LFPDUX A5, AO2, INCM4 + LFPDUX A6, AO, INCM4 + LFPDUX A7, AO2, INCM4 + LFPDUX A8, AO, INCM4 + + fxsmul f7, A1, f7 + fxsmul f15, A1, f15 + + fxcpnmsub f3, A1, f7, f3 + fxcpnmsub f11, A1, f15, f11 + + fxcsnmsub f6, A2, f7, f6 + fxcsnmsub f14, A2, f15, f14 + + fxcpnmsub f2, A2, f7, f2 + fxcpnmsub f10, A2, f15, f10 + + fxcsnmsub f5, A3, f7, f5 + fxcsnmsub f13, A3, f15, f13 + + fxcpnmsub f1, A3, f7, f1 + fxcpnmsub f9, A3, f15, f9 + + fxcsnmsub f4, A4, f7, f4 + fxcsnmsub f12, A4, f15, f12 + + fxcpnmsub f0, A4, f7, f0 + fxcpnmsub f8, A4, f15, f8 + + fxpmul f3, A5, f3 + fxpmul f11, A5, f11 + + fxcsnmsub f6, A6, f3, f6 + fxcsnmsub f14, A6, f11, f14 + + fxcpnmsub f2, A6, f3, f2 + fxcpnmsub f10, A6, f11, f10 + + fxcsnmsub f5, A7, f3, f5 + fxcsnmsub f13, A7, f11, f13 + + fxcpnmsub f1, A7, f3, f1 + fxcpnmsub f9, A7, f11, f9 + + fxcsnmsub f4, A8, f3, f4 + fxcsnmsub f12, A8, f11, f12 + + fxcpnmsub f0, A8, f3, f0 + fxcpnmsub f8, A8, f11, f8 + + add AO2, AO2, INCM4 + LFPDUX A1, AO, INCM4 + LFPDUX A2, AO2, INCM4 + LFPDUX A3, AO, INCM4 + + add AO2, AO2, INCM4 + LFPDUX A4, AO, INCM4 + LFPDUX A5, AO2, INCM4 + LFPDUX A6, AO, INCM4 + + add AO2, AO2, INCM4 + add AO, AO, INCM4 + LFPDUX A7, AO2, INCM4 + LFPDUX A8, AO, INCM4 + + + fxsmul f6, A1, f6 + fxsmul f14, A1, f14 + + fxcpnmsub f2, A1, f6, f2 + fxcpnmsub f10, A1, f14, f10 + + fxcsnmsub f5, A2, f6, f5 + fxcsnmsub f13, A2, f14, f13 + + fxcpnmsub f1, A2, f6, f1 + fxcpnmsub f9, A2, f14, f9 + + fxcsnmsub f4, A3, f6, f4 + fxcsnmsub f12, A3, f14, f12 + + fxcpnmsub f0, A3, f6, f0 + fxcpnmsub f8, A3, f14, f8 + + fxpmul f2, A4, f2 + fxpmul f10, A4, f10 + + fxcsnmsub f5, A5, f2, f5 + fxcsnmsub f13, A5, f10, f13 + + fxcpnmsub f1, A5, f2, f1 + fxcpnmsub f9, A5, f10, f9 + + fxcsnmsub f4, A6, f2, f4 + fxcsnmsub f12, A6, f10, f12 + + fxcpnmsub f0, A6, f2, f0 + fxcpnmsub f8, A6, f10, f8 + + fxsmul f5, A7, f5 + fxsmul f13, A7, f13 + + fxcpnmsub f1, A7, f5, f1 + fxcpnmsub f9, A7, f13, f9 + + fxcsnmsub f4, A8, f5, f4 + fxcsnmsub f12, A8, f13, f12 + + fxcpnmsub f0, A8, f5, f0 + fxcpnmsub f8, A8, f13, f8 + + add AO2, AO2, INCM4 + add AO, AO, INCM4 + LFPDUX A1, AO2, INCM4 + LFPDUX A2, AO, INCM4 + + subi AO2, AO2, 8 * SIZE + add AO, AO, INCM4 + LFPDUX A3, AO, INCM4 + + subi AO2, AO2, 8 * SIZE + add AO, AO, INCM4 + LFPDUX A4, AO, INCM4 + + addi AO, AO, -4 * SIZE + addi AO2, AO2, -4 * SIZE + + fxpmul f1, A1, f1 + fxpmul f9, A1, f9 + + fxcsnmsub f4, A2, f1, f4 + fxcsnmsub f12, A2, f9, f12 + + fxcpnmsub f0, A2, f1, f0 + fxcpnmsub f8, A2, f9, f8 + + fxsmul f4, A3, f4 + fxsmul f12, A3, f12 + + fxcpnmsub f0, A3, f4, f0 + fxcpnmsub f8, A3, f12, f8 + + fxpmul f0, A4, f0 + fxpmul f8, A4, f8 + +#endif + +#ifdef LT + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX A3, AO, INC4 + LFPDUX A4, AO2, INC4 + + LFPDUX A5, AO, INC4 + LFPDUX A6, AO2, INC4 + LFPDUX A7, AO, INC4 + LFPDUX A8, AO2, INC4 + + fxpmul f0, A1, f0 + fxpmul f8, A1, f8 + + fxcsnmsub f4, A1, f0, f4 + fxcsnmsub f12, A1, f8, f12 + + fxcpnmsub f1, A2, f0, f1 + fxcpnmsub f9, A2, f8, f9 + + fxcsnmsub f5, A2, f0, f5 + fxcsnmsub f13, A2, f8, f13 + + fxcpnmsub f2, 
A3, f0, f2 + fxcpnmsub f10, A3, f8, f10 + + fxcsnmsub f6, A3, f0, f6 + fxcsnmsub f14, A3, f8, f14 + + fxcpnmsub f3, A4, f0, f3 + fxcpnmsub f11, A4, f8, f11 + + fxcsnmsub f7, A4, f0, f7 + fxcsnmsub f15, A4, f8, f15 + + fxsmul f4, A5, f4 + fxsmul f12, A5, f12 + + fxcpnmsub f1, A6, f4, f1 + fxcpnmsub f9, A6, f12, f9 + + fxcsnmsub f5, A6, f4, f5 + fxcsnmsub f13, A6, f12, f13 + + fxcpnmsub f2, A7, f4, f2 + fxcpnmsub f10, A7, f12, f10 + + fxcsnmsub f6, A7, f4, f6 + fxcsnmsub f14, A7, f12, f14 + + fxcpnmsub f3, A8, f4, f3 + fxcpnmsub f11, A8, f12, f11 + + fxcsnmsub f7, A8, f4, f7 + fxcsnmsub f15, A8, f12, f15 + + add AO, AO, INC4 + LFPDUX A1, AO2, INC4 + LFPDUX A2, AO, INC4 + LFPDUX A3, AO2, INC4 + + add AO, AO, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A5, AO, INC4 + LFPDUX A6, AO2, INC4 + + add AO, AO, INC4 + add AO2, AO2, INC4 + LFPDUX A7, AO, INC4 + LFPDUX A8, AO2, INC4 + + fxpmul f1, A1, f1 + fxpmul f9, A1, f9 + + fxcsnmsub f5, A1, f1, f5 + fxcsnmsub f13, A1, f9, f13 + + fxcpnmsub f2, A2, f1, f2 + fxcpnmsub f10, A2, f9, f10 + + fxcsnmsub f6, A2, f1, f6 + fxcsnmsub f14, A2, f9, f14 + + fxcpnmsub f3, A3, f1, f3 + fxcpnmsub f11, A3, f9, f11 + + fxcsnmsub f7, A3, f1, f7 + fxcsnmsub f15, A3, f9, f15 + + fxsmul f5, A4, f5 + fxsmul f13, A4, f13 + + fxcpnmsub f2, A5, f5, f2 + fxcpnmsub f10, A5, f13, f10 + + fxcsnmsub f6, A5, f5, f6 + fxcsnmsub f14, A5, f13, f14 + + fxcpnmsub f3, A6, f5, f3 + fxcpnmsub f11, A6, f13, f11 + + fxcsnmsub f7, A6, f5, f7 + fxcsnmsub f15, A6, f13, f15 + + fxpmul f2, A7, f2 + fxpmul f10, A7, f10 + + fxcsnmsub f6, A7, f2, f6 + fxcsnmsub f14, A7, f10, f14 + + fxcpnmsub f3, A8, f2, f3 + fxcpnmsub f11, A8, f10, f11 + + fxcsnmsub f7, A8, f2, f7 + fxcsnmsub f15, A8, f10, f15 + + add AO, AO, INC4 + add AO2, AO2, INC4 + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + + addi AO, AO, 8 * SIZE + addi AO2, AO2, 4 * SIZE + LFPDUX A3, AO2, INC4 + + addi AO, AO, 8 * SIZE + addi AO2, AO2, 4 * SIZE + LFPDUX A4, AO2, INC4 + + subi AO, AO, 64 * SIZE + subi AO2, AO2, 64 * SIZE + + fxsmul f6, A1, f6 + fxsmul f14, A1, f14 + + fxcpnmsub f3, A2, f6, f3 + fxcpnmsub f11, A2, f14, f11 + + fxcsnmsub f7, A2, f6, f7 + fxcsnmsub f15, A2, f14, f15 + + fxpmul f3, A3, f3 + fxpmul f11, A3, f11 + + fxcsnmsub f7, A3, f3, f7 + fxcsnmsub f15, A3, f11, f15 + + fxsmul f7, A4, f7 + fxsmul f15, A4, f15 +#endif + +#ifdef RN + LFPDUX A1, BO, INC4 + LFPDUX A2, BO2, INC4 + LFPDUX A3, BO, INC4 + LFPDUX A4, BO2, INC4 + + add BO, BO, INC4 + LFPDUX A5, BO2, INC4 + + add BO, BO, INC4 + LFPDUX A6, BO2, INC4 + subi BO, BO, 16 * SIZE + subi BO2, BO2, 16 * SIZE + + fxpmul f0, A1, f0 + fxpmul f1, A1, f1 + fxpmul f2, A1, f2 + fxpmul f3, A1, f3 + + fxcsnmsub f4, A1, f0, f4 + fxcsnmsub f5, A1, f1, f5 + fxcsnmsub f6, A1, f2, f6 + fxcsnmsub f7, A1, f3, f7 + + fxcpnmsub f8, A2, f0, f8 + fxcpnmsub f9, A2, f1, f9 + fxcpnmsub f10, A2, f2, f10 + fxcpnmsub f11, A2, f3, f11 + + fxcsnmsub f12, A2, f0, f12 + fxcsnmsub f13, A2, f1, f13 + fxcsnmsub f14, A2, f2, f14 + fxcsnmsub f15, A2, f3, f15 + + fxsmul f4, A3, f4 + fxsmul f5, A3, f5 + fxsmul f6, A3, f6 + fxsmul f7, A3, f7 + + fxcpnmsub f8, A4, f4, f8 + fxcpnmsub f9, A4, f5, f9 + fxcpnmsub f10, A4, f6, f10 + fxcpnmsub f11, A4, f7, f11 + + fxcsnmsub f12, A4, f4, f12 + fxcsnmsub f13, A4, f5, f13 + fxcsnmsub f14, A4, f6, f14 + fxcsnmsub f15, A4, f7, f15 + + fxpmul f8, A5, f8 + fxpmul f9, A5, f9 + fxpmul f10, A5, f10 + fxpmul f11, A5, f11 + + fxcsnmsub f12, A5, f8, f12 + fxcsnmsub f13, A5, f9, f13 + fxcsnmsub f14, A5, f10, f14 + fxcsnmsub f15, A5, f11, f15 + + fxsmul f12, A6, f12 + fxsmul f13, A6, f13 + 
fxsmul f14, A6, f14 + fxsmul f15, A6, f15 + +#endif + +#ifdef RT + addi BO, BO, 20 * SIZE + addi BO2, BO2, 20 * SIZE + + LFPDUX A1, BO2, INCM4 + LFPDUX A2, BO, INCM4 + + LFPDUX A3, BO2, INCM4 + LFPDUX A4, BO, INCM4 + + add BO2, BO2, INCM4 + LFPDUX A5, BO, INCM4 + + add BO2, BO2, INCM4 + LFPDUX A6, BO, INCM4 + subi BO, BO, 4 * SIZE + subi BO2, BO2, 4 * SIZE + + fxsmul f12, A1, f12 + fxsmul f13, A1, f13 + fxsmul f14, A1, f14 + fxsmul f15, A1, f15 + + fxcpnmsub f8, A1, f12, f8 + fxcpnmsub f9, A1, f13, f9 + fxcpnmsub f10, A1, f14, f10 + fxcpnmsub f11, A1, f15, f11 + + fxcsnmsub f4, A2, f12, f4 + fxcsnmsub f5, A2, f13, f5 + fxcsnmsub f6, A2, f14, f6 + fxcsnmsub f7, A2, f15, f7 + + fxcpnmsub f0, A2, f12, f0 + fxcpnmsub f1, A2, f13, f1 + fxcpnmsub f2, A2, f14, f2 + fxcpnmsub f3, A2, f15, f3 + + fxpmul f8, A3, f8 + fxpmul f9, A3, f9 + fxpmul f10, A3, f10 + fxpmul f11, A3, f11 + + fxcsnmsub f4, A4, f8, f4 + fxcsnmsub f5, A4, f9, f5 + fxcsnmsub f6, A4, f10, f6 + fxcsnmsub f7, A4, f11, f7 + + fxcpnmsub f0, A4, f8, f0 + fxcpnmsub f1, A4, f9, f1 + fxcpnmsub f2, A4, f10, f2 + fxcpnmsub f3, A4, f11, f3 + + fxsmul f4, A5, f4 + fxsmul f5, A5, f5 + fxsmul f6, A5, f6 + fxsmul f7, A5, f7 + + fxcpnmsub f0, A5, f4, f0 + fxcpnmsub f1, A5, f5, f1 + fxcpnmsub f2, A5, f6, f2 + fxcpnmsub f3, A5, f7, f3 + + fxpmul f0, A6, f0 + fxpmul f1, A6, f1 + fxpmul f2, A6, f2 + fxpmul f3, A6, f3 + +#endif + +#ifdef LN + subi CO1, CO1, 8 * SIZE + subi CO2, CO2, 8 * SIZE + subi CO3, CO3, 8 * SIZE + subi CO4, CO4, 8 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC4 + STFPDUX f8, BO2, INC4 + STFPDUX f4, BO, INC4 + STFPDUX f12, BO2, INC4 + STFPDUX f1, BO, INC4 + STFPDUX f9, BO2, INC4 + STFPDUX f5, BO, INC4 + STFPDUX f13, BO2, INC4 + STFPDUX f2, BO, INC4 + STFPDUX f10, BO2, INC4 + STFPDUX f6, BO, INC4 + STFPDUX f14, BO2, INC4 + STFPDUX f3, BO, INC4 + STFPDUX f11, BO2, INC4 + STFPDUX f7, BO, INC4 + STFPDUX f15, BO2, INC4 + + subi BO, BO, 32 * SIZE + subi BO2, BO2, 32 * SIZE + + STFDUX f0, CO1, INC + STFDUX f4, CO1, INC + STFDUX f1, CO1, INC + STFDUX f5, CO1, INC + STFDUX f2, CO1, INC + STFDUX f6, CO1, INC + STFDUX f3, CO1, INC + STFDUX f7, CO1, INC + + STFSDUX f0, CO2, INC + STFSDUX f4, CO2, INC + STFSDUX f1, CO2, INC + STFSDUX f5, CO2, INC + STFSDUX f2, CO2, INC + STFSDUX f6, CO2, INC + STFSDUX f3, CO2, INC + STFSDUX f7, CO2, INC + + STFDUX f8, CO3, INC + STFDUX f12, CO3, INC + STFDUX f9, CO3, INC + STFDUX f13, CO3, INC + STFDUX f10, CO3, INC + STFDUX f14, CO3, INC + STFDUX f11, CO3, INC + STFDUX f15, CO3, INC + + STFSDUX f8, CO4, INC + STFSDUX f12, CO4, INC + STFSDUX f9, CO4, INC + STFSDUX f13, CO4, INC + STFSDUX f10, CO4, INC + STFSDUX f14, CO4, INC + STFSDUX f11, CO4, INC + STFSDUX f15, CO4, INC + +#else + STFPDUX f0, AO, INC4 + STFPDUX f1, AO2, INC4 + STFPDUX f2, AO, INC4 + STFPDUX f3, AO2, INC4 + STFPDUX f4, AO, INC4 + STFPDUX f5, AO2, INC4 + STFPDUX f6, AO, INC4 + STFPDUX f7, AO2, INC4 + STFPDUX f8, AO, INC4 + STFPDUX f9, AO2, INC4 + STFPDUX f10, AO, INC4 + STFPDUX f11, AO2, INC4 + STFPDUX f12, AO, INC4 + STFPDUX f13, AO2, INC4 + STFPDUX f14, AO, INC4 + STFPDUX f15, AO2, INC4 + + subi AO, AO, 32 * SIZE + subi AO2, AO2, 32 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + STFDUX f2, CO1, INC + STFSDUX f2, CO1, INC + STFDUX f3, CO1, INC + STFSDUX f3, CO1, INC + + STFDUX f4, CO2, INC + STFSDUX f4, CO2, INC + STFDUX f5, CO2, INC + STFSDUX f5, CO2, INC + STFDUX f6, CO2, INC + STFSDUX f6, CO2, INC + STFDUX f7, CO2, INC + STFSDUX f7, CO2, INC + + STFDUX f8, CO3, 
INC + STFSDUX f8, CO3, INC + STFDUX f9, CO3, INC + STFSDUX f9, CO3, INC + STFDUX f10, CO3, INC + STFSDUX f10, CO3, INC + STFDUX f11, CO3, INC + STFSDUX f11, CO3, INC + + STFDUX f12, CO4, INC + STFSDUX f12, CO4, INC + STFDUX f13, CO4, INC + STFSDUX f13, CO4, INC + STFDUX f14, CO4, INC + STFSDUX f14, CO4, INC + STFDUX f15, CO4, INC + STFSDUX f15, CO4, INC +#endif + +#ifdef LN + subi CO1, CO1, 8 * SIZE + subi CO2, CO2, 8 * SIZE + subi CO3, CO3, 8 * SIZE + subi CO4, CO4, 8 * SIZE +#endif + +#ifdef RT + slwi r0, K, 3 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 3 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 8 +#endif + +#ifdef LN + subi KK, KK, 8 +#endif + + addic. I, I, -1 + li r0, FZERO + + lfpsx f0, SP, r0 + bgt+ .L11 + .align 4 + +.L49: +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + addi B, BO, 4 * SIZE +#endif + +#ifdef RN + addi KK, KK, 4 +#endif + +#ifdef RT + subi KK, KK, 4 +#endif + + addic. J, J, -1 + bgt+ .L10 + .align 4 + +.L50: + andi. J, N, 2 + beq .L90 + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + +#if defined(LN) || defined(RT) + addi AORIG, A, -2 * SIZE +#else + addi AO, A, -2 * SIZE +#endif +#ifndef RT + add C, CO2, LDC +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + + andi. I, M, 1 + beq .L60 + +#if defined(LT) || defined(RN) + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, KK, 3 + mtspr CTR, r0 + ble .L84 +#else + +#ifdef LN + slwi r0, K, 0 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 0 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi BO, BO, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L84 + +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX B3, BO, INC2 + LFPDUX B4, BO, INC2 + bdz- .L83 + .align 4 + +.L82: + fxcpmadd f0, A1, B1, f0 + LFPDUX B1, BO, INC2 + fxcsmadd f1, A1, B2, f1 + LFPDUX B2, BO, INC2 + LFPDUX A1, AO, INC2 + fxcpmadd f2, A2, B3, f2 + LFPDUX B3, BO, INC2 + fxcsmadd f3, A2, B4, f3 + LFPDUX B4, BO, INC2 + LFPDUX A2, AO, INC2 + + fxcpmadd f0, A3, B1, f0 + LFPDUX B1, BO, INC2 + fxcsmadd f1, A3, B2, f1 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + fxcpmadd f2, A4, B3, f2 + LFPDUX B3, BO, INC2 + fxcsmadd f3, A4, B4, f3 + LFPDUX B4, BO, INC2 + LFPDUX A4, AO, INC2 + bdnz+ .L82 + .align 4 + +.L83: + fxcpmadd f0, A1, B1, f0 + LFPDUX B1, BO, INC2 + fxcsmadd f1, A1, B2, f1 + LFPDUX B2, BO, INC2 + fxcpmadd f2, A2, B3, f2 + LFPDUX B3, BO, INC2 + fxcsmadd f3, A2, B4, f3 + LFPDUX B4, BO, INC2 + + fxcpmadd f0, A3, B1, f0 + fxcsmadd f1, A3, B2, f1 + fxcpmadd f2, A4, B3, f2 + fxcsmadd f3, A4, B4, f3 + .align 4 + +.L84: +#if defined(LT) || defined(RN) + andi. r0, KK, 7 + mtspr CTR, r0 + ble+ .L88 +#else + andi. 
r0, TEMP, 7 + mtspr CTR, r0 + ble+ .L88 +#endif + + LFDX A1, AO, INC2 + LFPDUX B1, BO, INC2 + add AO, AO, INC + bdz- .L87 + .align 4 + +.L86: + fxcpmadd f0, A1, B1, f0 + LFDX A1, AO, INC2 + LFPDUX B1, BO, INC2 + add AO, AO, INC + bdnz+ .L86 + .align 4 + +.L87: + fxcpmadd f0, A1, B1, f0 + .align 4 + +.L88: + fpadd f0, f0, f1 + fpadd f2, f2, f3 + fpadd f0, f0, f2 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDX f16, BO, INC2 + + fpsub f0, f16, f0 +#else + LFPDX f16, AO, INC2 + + fpsub f0, f16, f0 +#endif + +#ifdef LN + LFPDX A1, AO, INC2 + + fxpmul f0, A1, f0 +#endif + +#ifdef LT + LFPDX A1, AO, INC2 + + fxpmul f0, A1, f0 +#endif + +#ifdef RN + LFD A1, (2 + 0) * SIZE(BO) + LFD A2, (2 + 1) * SIZE(BO) + LFD A3, (2 + 3) * SIZE(BO) + + fsmtp f1, f0 + + fmul f0, A1, f0 + fnmsub f1, A2, f0, f1 + + fmul f1, A3, f1 + fsmfp f0, f1 +#endif + +#ifdef RT + LFD A1, (2 + 3) * SIZE(BO) + LFD A2, (2 + 2) * SIZE(BO) + LFD A3, (2 + 0) * SIZE(BO) + + fsmtp f1, f0 + + fmul f1, A1, f1 + fnmsub f0, A2, f1, f0 + + fmul f0, A3, f0 + fsmfp f0, f1 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDX f0, BO, INC2 + + STFDUX f0, CO1, INC + STFSDUX f0, CO2, INC +#else + STFPDX f0, AO, INC2 + + STFDUX f0, CO1, INC + STFDUX f1, CO2, INC +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L60: + andi. I, M, 2 + beq .L70 + +#if defined(LT) || defined(RN) + addi BO, B, - 2 * SIZE + fpmr f1, f0 + + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, KK, 3 + mtspr CTR, r0 + ble .L74 +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 1 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi BO, BO, - 2 * SIZE + fpmr f1, f0 + + fpmr f2, f0 + fpmr f3, f0 + + srawi. 
r0, TEMP, 3 + mtspr CTR, r0 + ble .L74 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + + LFPDUX A5, AO, INC2 + LFPDUX B5, BO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX B6, BO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A9, BO, INC2 + LFPDUX A8, AO, INC2 + LFPDUX A10, BO, INC2 + bdz- .L73 + .align 4 + +.L72: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + fxcpmadd f2, B2, A2, f2 + fxcsmadd f3, B2, A2, f3 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + + fxcpmadd f0, B3, A3, f0 + fxcsmadd f1, B3, A3, f1 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + fxcpmadd f2, B4, A4, f2 + fxcsmadd f3, B4, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + + fxcpmadd f0, B5, A5, f0 + fxcsmadd f1, B5, A5, f1 + LFPDUX A5, AO, INC2 + LFPDUX B5, BO, INC2 + fxcpmadd f2, B6, A6, f2 + fxcsmadd f3, B6, A6, f3 + LFPDUX A6, AO, INC2 + LFPDUX B6, BO, INC2 + + fxcpmadd f0, A9, A7, f0 + fxcsmadd f1, A9, A7, f1 + LFPDUX A7, AO, INC2 + LFPDUX A9, BO, INC2 + fxcpmadd f2, A10, A8, f2 + fxcsmadd f3, A10, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX A10, BO, INC2 + bdnz+ .L72 + .align 4 + +.L73: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + fxcpmadd f2, B2, A2, f2 + fxcsmadd f3, B2, A2, f3 + + fxcpmadd f0, B3, A3, f0 + fxcsmadd f1, B3, A3, f1 + fxcpmadd f2, B4, A4, f2 + fxcsmadd f3, B4, A4, f3 + + fxcpmadd f0, B5, A5, f0 + fxcsmadd f1, B5, A5, f1 + fxcpmadd f2, B6, A6, f2 + fxcsmadd f3, B6, A6, f3 + + fxcpmadd f0, A9, A7, f0 + fxcsmadd f1, A9, A7, f1 + fxcpmadd f2, A10, A8, f2 + fxcsmadd f3, A10, A8, f3 + .align 4 + +.L74: +#if defined(LT) || defined(RN) + andi. r0, KK, 7 + mtspr CTR, r0 + ble+ .L78 +#else + andi. 
r0, TEMP, 7 + mtspr CTR, r0 + ble+ .L78 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + bdz- .L77 + .align 4 + +.L76: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + bdnz+ .L76 + .align 4 + +.L77: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + .align 4 + +.L78: + fpadd f0, f0, f2 + fpadd f1, f1, f3 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + fpmr f24, f0 + fsmfp f0, f1 + fsmtp f1, f24 + + LFPDUX f16, BO, INC2 + LFPDUX f17, BO, INC2 + + subi BO, BO, 4 * SIZE + + fpsub f0, f16, f0 + fpsub f1, f17, f1 +#else + LFPDUX f16, AO, INC2 + LFPDUX f17, AO, INC2 + + subi AO, AO, 4 * SIZE + + fpsub f0, f16, f0 + fpsub f1, f17, f1 +#endif + +#ifdef LN + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + + addi AO, AO, -4 * SIZE + + fxsmul f1, A2, f1 + fxcpnmsub f0, A2, f1, f0 + fxpmul f0, A1, f0 +#endif + +#ifdef LT + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + + addi AO, AO, -4 * SIZE + + fxpmul f0, A1, f0 + fxcsnmsub f1, A1, f0, f1 + + fxsmul f1, A2, f1 +#endif + +#ifdef RN + LFPDUX A1, BO, INC2 + LFPDUX A2, BO, INC2 + + subi BO, BO, 4 * SIZE + + fxpmul f0, A1, f0 + fxcsnmsub f1, A1, f0, f1 + + fxsmul f1, A2, f1 +#endif + +#ifdef RT + LFPDUX A2, BO, INC2 + LFPDUX A1, BO, INC2 + + subi BO, BO, 4 * SIZE + + fxsmul f1, A1, f1 + fxcpnmsub f0, A1, f1, f0 + fxpmul f0, A2, f0 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC2 + STFPDUX f1, BO, INC2 + + subi BO, BO, 4 * SIZE + + STFDUX f0, CO1, INC + STFDUX f1, CO1, INC + + STFSDUX f0, CO2, INC + STFSDUX f1, CO2, INC +#else + STFPDUX f0, AO, INC2 + STFPDUX f1, AO, INC2 + + subi AO, AO, 4 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + + STFDUX f1, CO2, INC + STFSDUX f1, CO2, INC +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L70: + andi. I, M, 4 + beq .L80 + +#if defined(LT) || defined(RN) + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, KK, 2 + mtspr CTR, r0 + ble .L64 +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 2 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + fpmr f1, f0 + addi BO, BO, - 2 * SIZE + fpmr f2, f0 + fpmr f3, f0 + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 + ble .L64 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX B3, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX B4, BO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + bdz- .L63 + .align 4 + +.L62: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f2, B1, A1, f2 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f3, B1, A2, f3 + LFPDUX A2, AO, INC2 + LFPDUX B1, BO, INC2 + + fxcpmadd f0, B2, A3, f0 + fxcsmadd f2, B2, A3, f2 + LFPDUX A3, AO, INC2 + fxcpmadd f1, B2, A4, f1 + fxcsmadd f3, B2, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + + fxcpmadd f0, B3, A5, f0 + fxcsmadd f2, B3, A5, f2 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B3, A6, f1 + fxcsmadd f3, B3, A6, f3 + LFPDUX A6, AO, INC2 + LFPDUX B3, BO, INC2 + + fxcpmadd f0, B4, A7, f0 + fxcsmadd f2, B4, A7, f2 + LFPDUX A7, AO, INC2 + fxcpmadd f1, B4, A8, f1 + fxcsmadd f3, B4, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B4, BO, INC2 + bdnz+ .L62 + .align 4 + +.L63: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f2, B1, A1, f2 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f3, B1, A2, f3 + + fxcpmadd f0, B2, A3, f0 + fxcsmadd f2, B2, A3, f2 + fxcpmadd f1, B2, A4, f1 + fxcsmadd f3, B2, A4, f3 + + fxcpmadd f0, B3, A5, f0 + fxcsmadd f2, B3, A5, f2 + fxcpmadd f1, B3, A6, f1 + fxcsmadd f3, B3, A6, f3 + + fxcpmadd f0, B4, A7, f0 + fxcsmadd f2, B4, A7, f2 + fxcpmadd f1, B4, A8, f1 + fxcsmadd f3, B4, A8, f3 + .align 4 + +.L64: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L68 +#else + andi. r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L68 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + bdz- .L67 + .align 4 + +.L66: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f2, B1, A1, f2 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f3, B1, A2, f3 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + bdnz+ .L66 + .align 4 + +.L67: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f2, B1, A1, f2 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f3, B1, A2, f3 + .align 4 + +.L68: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + fpmr f24, f0 + fpmr f25, f1 + + fsmfp f0, f2 + fsmfp f1, f3 + fsmtp f2, f24 + fsmtp f3, f25 + + LFPDUX f16, BO, INC2 + LFPDUX f17, BO, INC2 + LFPDUX f18, BO, INC2 + LFPDUX f19, BO, INC2 + + subi BO, BO, 8 * SIZE + + fpsub f0, f16, f0 + fpsub f2, f17, f2 + fpsub f1, f18, f1 + fpsub f3, f19, f3 +#else + LFPDUX f16, AO, INC2 + LFPDUX f17, AO, INC2 + LFPDUX f18, AO, INC2 + LFPDUX f19, AO, INC2 + + subi AO, AO, 8 * SIZE + + fpsub f0, f16, f0 + fpsub f1, f17, f1 + fpsub f2, f18, f2 + fpsub f3, f19, f3 +#endif + +#ifdef LN + addi AO, AO, 18 * SIZE + + LFPDUX A1, AO, INCM2 + LFPDUX A2, AO, INCM2 + LFPDUX A3, AO, INCM2 + LFPDUX A4, AO, INCM2 + add AO, AO, INCM2 + LFPDUX A5, AO, INCM2 + add AO, AO, INCM2 + LFPDUX A6, AO, INCM2 + + subi AO, AO, 2 * SIZE + + fxsmul f3, A1, f3 + fxcpnmsub f1, A1, f3, f1 + fxcsnmsub f2, A2, f3, f2 + fxcpnmsub f0, A2, f3, f0 + + fxpmul f1, A3, f1 + fxcsnmsub f2, A4, f1, f2 + fxcpnmsub f0, A4, f1, f0 + + fxsmul f2, A5, f2 + fxcpnmsub f0, A5, f2, f0 + + fxpmul f0, A6, f0 +#endif + +#ifdef LT + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + add AO, AO, INC2 + LFPDUX A5, AO, INC2 + add AO, AO, INC2 + LFPDUX A6, AO, INC2 + + subi AO, AO, 16 
* SIZE + + fxpmul f0, A1, f0 + fxcsnmsub f2, A1, f0, f2 + fxcpnmsub f1, A2, f0, f1 + fxcsnmsub f3, A2, f0, f3 + + fxsmul f2, A3, f2 + fxcpnmsub f1, A4, f2, f1 + fxcsnmsub f3, A4, f2, f3 + + fxpmul f1, A5, f1 + fxcsnmsub f3, A5, f1, f3 + + fxsmul f3, A6, f3 +#endif + +#ifdef RN + LFPDUX A1, BO, INC2 + LFPDUX A2, BO, INC2 + + subi BO, BO, 4 * SIZE + + fxpmul f0, A1, f0 + fxpmul f1, A1, f1 + + fxcsnmsub f2, A1, f0, f2 + fxcsnmsub f3, A1, f1, f3 + + fxsmul f2, A2, f2 + fxsmul f3, A2, f3 +#endif + +#ifdef RT + LFPDUX A2, BO, INC2 + LFPDUX A1, BO, INC2 + + subi BO, BO, 4 * SIZE + + fxsmul f2, A1, f2 + fxsmul f3, A1, f3 + + fxcpnmsub f0, A1, f2, f0 + fxcpnmsub f1, A1, f3, f1 + + fxpmul f0, A2, f0 + fxpmul f1, A2, f1 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC2 + STFPDUX f2, BO, INC2 + STFPDUX f1, BO, INC2 + STFPDUX f3, BO, INC2 + + subi BO, BO, 8 * SIZE + + STFDUX f0, CO1, INC + STFDUX f2, CO1, INC + STFDUX f1, CO1, INC + STFDUX f3, CO1, INC + + STFSDUX f0, CO2, INC + STFSDUX f2, CO2, INC + STFSDUX f1, CO2, INC + STFSDUX f3, CO2, INC +#else + STFPDUX f0, AO, INC2 + STFPDUX f1, AO, INC2 + STFPDUX f2, AO, INC2 + STFPDUX f3, AO, INC2 + + subi AO, AO, 8 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + + STFDUX f2, CO2, INC + STFSDUX f2, CO2, INC + STFDUX f3, CO2, INC + STFSDUX f3, CO2, INC +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L80: + srawi. I, M, 3 + ble .L89 + .align 4 + +.L51: +#if defined(LT) || defined(RN) + fpmr f4, f0 + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f5, f0 + fpmr f2, f0 + fpmr f6, f0 + + srawi. r0, KK, 2 + fpmr f3, f0 + mtspr CTR, r0 + fpmr f7, f0 + ble .L54 +#else + +#ifdef LN + slwi r0, K, 3 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 3 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + fpmr f4, f0 + addi BO, BO, - 2 * SIZE + fpmr f1, f0 + fpmr f5, f0 + fpmr f2, f0 + fpmr f6, f0 + + srawi. 
r0, TEMP, 2 + fpmr f3, f0 + mtspr CTR, r0 + fpmr f7, f0 + ble .L54 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX B3, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + bdz- .L53 + .align 4 + +.L52: + fxcpmadd f0, B1, A1, f0 + LFPDUX B4, BO, INC2 + fxcsmadd f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + nop + fxcsmadd f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + fxcpmadd f2, B1, A3, f2 + nop + fxcsmadd f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + nop + fxcsmadd f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + + fxcpmadd f0, B2, A5, f0 + LFPDUX B1, BO, INC2 + fxcsmadd f4, B2, A5, f4 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B2, A6, f1 + nop + fxcsmadd f5, B2, A6, f5 + LFPDUX A6, AO, INC2 + + fxcpmadd f2, B2, A7, f2 + nop + fxcsmadd f6, B2, A7, f6 + LFPDUX A7, AO, INC2 + fxcpmadd f3, B2, A8, f3 + nop + fxcsmadd f7, B2, A8, f7 + LFPDUX A8, AO, INC2 + + fxcpmadd f0, B3, A1, f0 + LFPDUX B2, BO, INC2 + fxcsmadd f4, B3, A1, f4 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B3, A2, f1 + nop + fxcsmadd f5, B3, A2, f5 + LFPDUX A2, AO, INC2 + + fxcpmadd f2, B3, A3, f2 + nop + fxcsmadd f6, B3, A3, f6 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B3, A4, f3 + nop + fxcsmadd f7, B3, A4, f7 + LFPDUX A4, AO, INC2 + + fxcpmadd f0, B4, A5, f0 + LFPDUX B3, BO, INC2 + fxcsmadd f4, B4, A5, f4 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B4, A6, f1 + nop + fxcsmadd f5, B4, A6, f5 + LFPDUX A6, AO, INC2 + + fxcpmadd f2, B4, A7, f2 + nop + fxcsmadd f6, B4, A7, f6 + LFPDUX A7, AO, INC2 + fxcpmadd f3, B4, A8, f3 + nop + fxcsmadd f7, B4, A8, f7 + LFPDUX A8, AO, INC2 + bdnz+ .L52 + .align 4 + +.L53: + fxcpmadd f0, B1, A1, f0 + LFPDUX B4, BO, INC2 + fxcsmadd f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + nop + fxcsmadd f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + fxcpmadd f2, B1, A3, f2 + nop + fxcsmadd f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + nop + fxcsmadd f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + + fxcpmadd f0, B2, A5, f0 + nop + fxcsmadd f4, B2, A5, f4 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B2, A6, f1 + nop + fxcsmadd f5, B2, A6, f5 + LFPDUX A6, AO, INC2 + + fxcpmadd f2, B2, A7, f2 + nop + fxcsmadd f6, B2, A7, f6 + LFPDUX A7, AO, INC2 + fxcpmadd f3, B2, A8, f3 + nop + fxcsmadd f7, B2, A8, f7 + LFPDUX A8, AO, INC2 + + fxcpmadd f0, B3, A1, f0 + fxcsmadd f4, B3, A1, f4 + fxcpmadd f1, B3, A2, f1 + fxcsmadd f5, B3, A2, f5 + + fxcpmadd f2, B3, A3, f2 + fxcsmadd f6, B3, A3, f6 + fxcpmadd f3, B3, A4, f3 + fxcsmadd f7, B3, A4, f7 + + fxcpmadd f0, B4, A5, f0 + fxcsmadd f4, B4, A5, f4 + fxcpmadd f1, B4, A6, f1 + fxcsmadd f5, B4, A6, f5 + + fxcpmadd f2, B4, A7, f2 + fxcsmadd f6, B4, A7, f6 + fxcpmadd f3, B4, A8, f3 + fxcsmadd f7, B4, A8, f7 + .align 4 + +.L54: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L58 +#else + andi. 
r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L58 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + bdz- .L57 + .align 4 + +.L56: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + fxcpmadd f2, B1, A3, f2 + fxcsmadd f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + fxcsmadd f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + LFPDUX B1, BO, INC2 + bdnz+ .L56 + .align 4 + +.L57: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f5, B1, A2, f5 + + fxcpmadd f2, B1, A3, f2 + fxcsmadd f6, B1, A3, f6 + fxcpmadd f3, B1, A4, f3 + fxcsmadd f7, B1, A4, f7 + .align 4 + +.L58: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 8 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 3 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + fpmr f24, f0 + fpmr f25, f1 + fpmr f26, f2 + fpmr f27, f3 + + fsmfp f0, f4 + fsmfp f1, f5 + fsmfp f2, f6 + fsmfp f3, f7 + + fsmtp f4, f24 + fsmtp f5, f25 + fsmtp f6, f26 + fsmtp f7, f27 + + LFPDUX f16, BO, INC2 + LFPDUX f17, BO, INC2 + LFPDUX f18, BO, INC2 + LFPDUX f19, BO, INC2 + + LFPDUX f20, BO, INC2 + LFPDUX f21, BO, INC2 + LFPDUX f22, BO, INC2 + LFPDUX f23, BO, INC2 + + subi BO, BO, 16 * SIZE + + fpsub f0, f16, f0 + fpsub f4, f17, f4 + fpsub f1, f18, f1 + fpsub f5, f19, f5 + + fpsub f2, f20, f2 + fpsub f6, f21, f6 + fpsub f3, f22, f3 + fpsub f7, f23, f7 + +#else + LFPDUX f16, AO, INC2 + LFPDUX f17, AO, INC2 + LFPDUX f18, AO, INC2 + LFPDUX f19, AO, INC2 + + LFPDUX f20, AO, INC2 + LFPDUX f21, AO, INC2 + LFPDUX f22, AO, INC2 + LFPDUX f23, AO, INC2 + + subi AO, AO, 16 * SIZE + + fpsub f0, f16, f0 + fpsub f1, f17, f1 + fpsub f2, f18, f2 + fpsub f3, f19, f3 + fpsub f4, f20, f4 + fpsub f5, f21, f5 + fpsub f6, f22, f6 + fpsub f7, f23, f7 +#endif + +#ifdef LN + addi AO, AO, 66 * SIZE + + LFPDUX A1, AO, INCM2 + LFPDUX A2, AO, INCM2 + LFPDUX A3, AO, INCM2 + LFPDUX A4, AO, INCM2 + LFPDUX A5, AO, INCM2 + LFPDUX A6, AO, INCM2 + LFPDUX A7, AO, INCM2 + LFPDUX A8, AO, INCM2 + + fxsmul f7, A1, f7 + fxcpnmsub f3, A1, f7, f3 + fxcsnmsub f6, A2, f7, f6 + fxcpnmsub f2, A2, f7, f2 + + fxcsnmsub f5, A3, f7, f5 + fxcpnmsub f1, A3, f7, f1 + fxcsnmsub f4, A4, f7, f4 + fxcpnmsub f0, A4, f7, f0 + + fxpmul f3, A5, f3 + fxcsnmsub f6, A6, f3, f6 + fxcpnmsub f2, A6, f3, f2 + + fxcsnmsub f5, A7, f3, f5 + fxcpnmsub f1, A7, f3, f1 + fxcsnmsub f4, A8, f3, f4 + fxcpnmsub f0, A8, f3, f0 + + add AO, AO, INCM2 + LFPDUX A1, AO, INCM2 + LFPDUX A2, AO, INCM2 + LFPDUX A3, AO, INCM2 + + add AO, AO, INCM2 + LFPDUX A4, AO, INCM2 + LFPDUX A5, AO, INCM2 + LFPDUX A6, AO, INCM2 + + add AO, AO, INCM2 + add AO, AO, INCM2 + LFPDUX A7, AO, INCM2 + LFPDUX A8, AO, INCM2 + + fxsmul f6, A1, f6 + fxcpnmsub f2, A1, f6, f2 + fxcsnmsub f5, A2, f6, f5 + fxcpnmsub f1, A2, f6, f1 + fxcsnmsub f4, A3, f6, f4 + fxcpnmsub f0, A3, f6, f0 + + fxpmul f2, A4, f2 + fxcsnmsub f5, A5, f2, f5 + fxcpnmsub f1, A5, f2, f1 + fxcsnmsub f4, A6, f2, f4 + fxcpnmsub f0, A6, f2, f0 + + fxsmul f5, A7, f5 + fxcpnmsub f1, A7, f5, f1 + fxcsnmsub f4, A8, f5, f4 + fxcpnmsub f0, A8, f5, f0 + + add AO, AO, INCM2 + add AO, AO, INCM2 + LFPDUX A1, AO, INCM2 + LFPDUX A2, AO, INCM2 + + subi AO, AO, 6 * SIZE + LFPDUX A3, AO, INCM2 + subi AO, AO, 6 * SIZE + LFPDUX A4, AO, INCM2 + + addi AO, AO, -2 * SIZE + + fxpmul f1, A1, f1 + fxcsnmsub f4, A2, f1, f4 + fxcpnmsub f0, 
A2, f1, f0 + + fxsmul f4, A3, f4 + fxcpnmsub f0, A3, f4, f0 + + fxpmul f0, A4, f0 +#endif + +#ifdef LT + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + + fxpmul f0, A1, f0 + fxcsnmsub f4, A1, f0, f4 + fxcpnmsub f1, A2, f0, f1 + fxcsnmsub f5, A2, f0, f5 + fxcpnmsub f2, A3, f0, f2 + fxcsnmsub f6, A3, f0, f6 + fxcpnmsub f3, A4, f0, f3 + fxcsnmsub f7, A4, f0, f7 + + fxsmul f4, A5, f4 + fxcpnmsub f1, A6, f4, f1 + fxcsnmsub f5, A6, f4, f5 + fxcpnmsub f2, A7, f4, f2 + fxcsnmsub f6, A7, f4, f6 + fxcpnmsub f3, A8, f4, f3 + fxcsnmsub f7, A8, f4, f7 + + add AO, AO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + + add AO, AO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + + add AO, AO, INC2 + add AO, AO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + + fxpmul f1, A1, f1 + fxcsnmsub f5, A1, f1, f5 + fxcpnmsub f2, A2, f1, f2 + fxcsnmsub f6, A2, f1, f6 + fxcpnmsub f3, A3, f1, f3 + fxcsnmsub f7, A3, f1, f7 + + fxsmul f5, A4, f5 + fxcpnmsub f2, A5, f5, f2 + fxcsnmsub f6, A5, f5, f6 + fxcpnmsub f3, A6, f5, f3 + fxcsnmsub f7, A6, f5, f7 + + fxpmul f2, A7, f2 + fxcsnmsub f6, A7, f2, f6 + fxcpnmsub f3, A8, f2, f3 + fxcsnmsub f7, A8, f2, f7 + + add AO, AO, INC2 + add AO, AO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + + addi AO, AO, 6 * SIZE + LFPDUX A3, AO, INC2 + addi AO, AO, 6 * SIZE + LFPDUX A4, AO, INC2 + + subi AO, AO, 64 * SIZE + + fxsmul f6, A1, f6 + fxcpnmsub f3, A2, f6, f3 + fxcsnmsub f7, A2, f6, f7 + + fxpmul f3, A3, f3 + fxcsnmsub f7, A3, f3, f7 + + fxsmul f7, A4, f7 +#endif + +#ifdef RN + LFPDUX A1, BO, INC2 + LFPDUX A2, BO, INC2 + + subi BO, BO, 4 * SIZE + + fxpmul f0, A1, f0 + fxpmul f1, A1, f1 + fxpmul f2, A1, f2 + fxpmul f3, A1, f3 + + fxcsnmsub f4, A1, f0, f4 + fxcsnmsub f5, A1, f1, f5 + fxcsnmsub f6, A1, f2, f6 + fxcsnmsub f7, A1, f3, f7 + + fxsmul f4, A2, f4 + fxsmul f5, A2, f5 + fxsmul f6, A2, f6 + fxsmul f7, A2, f7 +#endif + +#ifdef RT + LFPDUX A2, BO, INC2 + LFPDUX A1, BO, INC2 + + subi BO, BO, 4 * SIZE + + fxsmul f4, A1, f4 + fxsmul f5, A1, f5 + fxsmul f6, A1, f6 + fxsmul f7, A1, f7 + + fxcpnmsub f0, A1, f4, f0 + fxcpnmsub f1, A1, f5, f1 + fxcpnmsub f2, A1, f6, f2 + fxcpnmsub f3, A1, f7, f3 + + fxpmul f0, A2, f0 + fxpmul f1, A2, f1 + fxpmul f2, A2, f2 + fxpmul f3, A2, f3 + +#endif + +#ifdef LN + subi CO1, CO1, 8 * SIZE + subi CO2, CO2, 8 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC2 + STFPDUX f4, BO, INC2 + STFPDUX f1, BO, INC2 + STFPDUX f5, BO, INC2 + STFPDUX f2, BO, INC2 + STFPDUX f6, BO, INC2 + STFPDUX f3, BO, INC2 + STFPDUX f7, BO, INC2 + + subi BO, BO, 16 * SIZE + + STFDUX f0, CO1, INC + STFDUX f4, CO1, INC + STFDUX f1, CO1, INC + STFDUX f5, CO1, INC + STFDUX f2, CO1, INC + STFDUX f6, CO1, INC + STFDUX f3, CO1, INC + STFDUX f7, CO1, INC + + STFSDUX f0, CO2, INC + STFSDUX f4, CO2, INC + STFSDUX f1, CO2, INC + STFSDUX f5, CO2, INC + STFSDUX f2, CO2, INC + STFSDUX f6, CO2, INC + STFSDUX f3, CO2, INC + STFSDUX f7, CO2, INC +#else + STFPDUX f0, AO, INC2 + STFPDUX f1, AO, INC2 + STFPDUX f2, AO, INC2 + STFPDUX f3, AO, INC2 + STFPDUX f4, AO, INC2 + STFPDUX f5, AO, INC2 + STFPDUX f6, AO, INC2 + STFPDUX f7, AO, INC2 + + subi AO, AO, 16 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + STFDUX f2, CO1, INC + STFSDUX f2, CO1, INC + STFDUX f3, CO1, INC + STFSDUX f3, CO1, INC + + STFDUX f4, CO2, INC + STFSDUX f4, CO2, INC + STFDUX 
f5, CO2, INC + STFSDUX f5, CO2, INC + STFDUX f6, CO2, INC + STFSDUX f6, CO2, INC + STFDUX f7, CO2, INC + STFSDUX f7, CO2, INC +#endif + +#ifdef LN + subi CO1, CO1, 8 * SIZE + subi CO2, CO2, 8 * SIZE +#endif + +#ifdef RT + slwi r0, K, 3 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 3 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 8 +#endif + +#ifdef LN + subi KK, KK, 8 +#endif + + addic. I, I, -1 + li r0, FZERO + + lfpsx f0, SP, r0 + bgt+ .L51 + .align 4 + +.L89: +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + addi B, BO, 2 * SIZE +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + .align 4 + +.L90: + andi. J, N, 1 + beq .L999 + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + addi AORIG, A, -2 * SIZE +#else + addi AO, A, -2 * SIZE +#endif +#ifndef RT + add C, CO1, LDC +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + + andi. I, M, 1 + beq .L100 + +#if defined(LT) || defined(RN) + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, KK, 3 + mtspr CTR, r0 + ble .L124 +#else + +#ifdef LN + slwi r0, K, 0 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 0 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi BO, BO, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L124 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + bdz- .L123 + .align 4 + +.L122: + fpmadd f0, A1, B1, f0 + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + fpmadd f1, A2, B2, f1 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + fpmadd f2, A3, B3, f2 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + fpmadd f3, A4, B4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + bdnz+ .L122 + .align 4 + +.L123: + fpmadd f0, A1, B1, f0 + fpmadd f1, A2, B2, f1 + fpmadd f2, A3, B3, f2 + fpmadd f3, A4, B4, f3 + .align 4 + +.L124: +#if defined(LT) || defined(RN) + andi. r0, KK, 7 + mtspr CTR, r0 + ble+ .L128 +#else + andi. 
r0, TEMP, 7 + mtspr CTR, r0 + ble+ .L128 +#endif + + LFDX A1, AO, INC2 + LFDX B1, BO, INC2 + add AO, AO, INC + add BO, BO, INC + bdz- .L127 + .align 4 + +.L126: + fmadd f0, A1, B1, f0 + LFDX A1, AO, INC2 + LFDX B1, BO, INC2 + add AO, AO, INC + add BO, BO, INC + bdnz+ .L126 + .align 4 + +.L127: + fmadd f0, A1, B1, f0 + .align 4 + +.L128: + fpadd f0, f0, f1 + fpadd f2, f2, f3 + fpadd f0, f0, f2 + fsmtp f1, f0 + + fadd f0, f0, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFDX f16, BO, INC2 + + fsub f0, f16, f0 +#else + LFDX f16, AO, INC2 + + fsub f0, f16, f0 +#endif + +#ifdef LN + LFD A1, (2 + 0) * SIZE(AO) + + fmul f0, A1, f0 +#endif + +#ifdef LT + LFD A1, (2 + 0) * SIZE(AO) + + fmul f0, A1, f0 +#endif + +#ifdef RN + LFDX A1, BO, INC2 + + fmul f0, A1, f0 +#endif + +#ifdef RT + LFDX A1, BO, INC2 + + fmul f0, A1, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFDX f0, BO, INC2 + + STFDUX f0, CO1, INC +#else + STFDX f0, AO, INC2 + + STFDUX f0, CO1, INC +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L100: + andi. I, M, 2 + beq .L110 + +#if defined(LT) || defined(RN) + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, KK, 3 + mtspr CTR, r0 + ble .L114 +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 1 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi BO, BO, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L114 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B1, BO, INC2 + + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX B3, BO, INC2 + + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + LFPDUX B4, BO, INC2 + bdz- .L113 + .align 4 + +.L112: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcsmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + LFPDUX B1, BO, INC2 + fxcpmadd f2, B2, A3, f2 + LFPDUX A3, AO, INC2 + fxcsmadd f3, B2, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + fxcpmadd f0, B3, A5, f0 + LFPDUX A5, AO, INC2 + fxcsmadd f1, B3, A6, f1 + LFPDUX A6, AO, INC2 + LFPDUX B3, BO, INC2 + fxcpmadd f2, B4, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B4, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B4, BO, INC2 + bdnz+ .L112 + .align 4 + +.L113: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A2, f1 + fxcpmadd f2, B2, A3, f2 + fxcsmadd f3, B2, A4, f3 + fxcpmadd f0, B3, A5, f0 + fxcsmadd f1, B3, A6, f1 + fxcpmadd f2, B4, A7, f2 + fxcsmadd f3, B4, A8, f3 + .align 4 + +.L114: +#if defined(LT) || defined(RN) + andi. r0, KK, 7 + mtspr CTR, r0 + ble+ .L118 +#else + andi. 
r0, TEMP, 7 + mtspr CTR, r0 + ble+ .L118 +#endif + + LFPDUX A1, AO, INC2 + LFDX B1, BO, INC2 + add BO, BO, INC + bdz- .L117 + .align 4 + +.L116: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + LFDX B1, BO, INC2 + add BO, BO, INC + bdnz+ .L116 + .align 4 + +.L117: + fxcpmadd f0, B1, A1, f0 + .align 4 + +.L118: + fpadd f0, f0, f1 + fpadd f2, f3, f2 + fpadd f0, f0, f2 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDX f16, BO, INC2 + + fpsub f0, f16, f0 +#else + LFPDX f16, AO, INC2 + + fpsub f0, f16, f0 +#endif + +#ifdef LN + fsmtp f4, f0 + + LFD A1, (2 + 3) * SIZE(AO) + LFD A2, (2 + 2) * SIZE(AO) + LFD A3, (2 + 0) * SIZE(AO) + + fmul f4, A1, f4 + fnmsub f0, A2, f4, f0 + fmul f0, A3, f0 + fsmfp f0, f4 +#endif + +#ifdef LT + fsmtp f4, f0 + + LFD A1, (2 + 0) * SIZE(AO) + LFD A2, (2 + 1) * SIZE(AO) + LFD A3, (2 + 3) * SIZE(AO) + + fmul f0, A1, f0 + fnmsub f4, A2, f0, f4 + fmul f4, A3, f4 + + fsmfp f0, f4 +#endif + +#ifdef RN + LFPDX A1, BO, INC2 + + fxpmul f0, A1, f0 +#endif + +#ifdef RT + LFPDX A1, BO, INC2 + + fxpmul f0, A1, f0 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDX f0, BO, INC2 + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC +#else + STFPDX f0, AO, INC2 + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L110: + andi. I, M, 4 + beq .L120 + +#if defined(LT) || defined(RN) + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, KK, 3 + mtspr CTR, r0 + ble .L104 +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 2 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi BO, BO, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. 
r0, TEMP, 3 + mtspr CTR, r0 + ble .L104 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + LFPDUX B3, BO, INC2 + LFPDUX B4, BO, INC2 + + bdz- .L103 + .align 4 + +.L102: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + fxcsmadd f2, B1, A3, f2 + LFPDUX A3, AO, INC2 + fxcsmadd f3, B1, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B1, BO, INC2 + + fxcpmadd f0, B2, A5, f0 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B2, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B2, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B2, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B2, BO, INC2 + + fxcpmadd f0, B3, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B3, A2, f1 + LFPDUX A2, AO, INC2 + fxcsmadd f2, B3, A3, f2 + LFPDUX A3, AO, INC2 + fxcsmadd f3, B3, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B3, BO, INC2 + + fxcpmadd f0, B4, A5, f0 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B4, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B4, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B4, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B4, BO, INC2 + bdnz+ .L102 + .align 4 + +.L103: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + fxcsmadd f2, B1, A3, f2 + LFPDUX A3, AO, INC2 + fxcsmadd f3, B1, A4, f3 + LFPDUX A4, AO, INC2 + + fxcpmadd f0, B2, A5, f0 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B2, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B2, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B2, A8, f3 + LFPDUX A8, AO, INC2 + + fxcpmadd f0, B3, A1, f0 + fxcpmadd f1, B3, A2, f1 + fxcsmadd f2, B3, A3, f2 + fxcsmadd f3, B3, A4, f3 + + fxcpmadd f0, B4, A5, f0 + fxcpmadd f1, B4, A6, f1 + fxcsmadd f2, B4, A7, f2 + fxcsmadd f3, B4, A8, f3 + .align 4 + +.L104: +#if defined(LT) || defined(RN) + andi. r0, KK, 7 + mtspr CTR, r0 + ble+ .L108 +#else + andi. 
r0, TEMP, 7 + mtspr CTR, r0 + ble+ .L108 +#endif + + LFPDUX A1, AO, INC2 + LFDX B1, BO, INC2 + LFPDUX A2, AO, INC2 + add BO, BO, INC + bdz- .L107 + .align 4 + +.L106: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFDX B1, BO, INC2 + LFPDUX A2, AO, INC2 + add BO, BO, INC + bdnz+ .L106 + .align 4 + +.L107: + fxcpmadd f0, B1, A1, f0 + fxcpmadd f1, B1, A2, f1 + .align 4 + +.L108: + fpadd f0, f0, f2 + fpadd f1, f1, f3 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDUX f16, BO, INC2 + LFPDUX f17, BO, INC2 + + subi BO, BO, 4 * SIZE + + fpsub f0, f16, f0 + fpsub f1, f17, f1 +#else + LFPDUX f16, AO, INC2 + LFPDUX f17, AO, INC2 + + subi AO, AO, 4 * SIZE + + fpsub f0, f16, f0 + fpsub f1, f17, f1 +#endif + +#ifdef LN + fsmtp f4, f0 + fsmtp f5, f1 + + LFD A1, (2 + 15) * SIZE(AO) + LFD A2, (2 + 14) * SIZE(AO) + LFD A3, (2 + 13) * SIZE(AO) + LFD A4, (2 + 12) * SIZE(AO) + + fmul f5, A1, f5 + fnmsub f1, A2, f5, f1 + fnmsub f4, A3, f5, f4 + fnmsub f0, A4, f5, f0 + + LFD A1, (2 + 10) * SIZE(AO) + LFD A2, (2 + 9) * SIZE(AO) + LFD A3, (2 + 8) * SIZE(AO) + + fmul f1, A1, f1 + fnmsub f4, A2, f1, f4 + fnmsub f0, A3, f1, f0 + + LFD A1, (2 + 5) * SIZE(AO) + LFD A2, (2 + 4) * SIZE(AO) + + fmul f4, A1, f4 + fnmsub f0, A2, f4, f0 + + LFD A1, (2 + 0) * SIZE(AO) + + fmul f0, A1, f0 + + fsmfp f0, f4 + fsmfp f1, f5 +#endif + +#ifdef LT + fsmtp f4, f0 + fsmtp f5, f1 + + LFD A1, (2 + 0) * SIZE(AO) + LFD A2, (2 + 1) * SIZE(AO) + LFD A3, (2 + 2) * SIZE(AO) + LFD A4, (2 + 3) * SIZE(AO) + + fmul f0, A1, f0 + fnmsub f4, A2, f0, f4 + fnmsub f1, A3, f0, f1 + fnmsub f5, A4, f0, f5 + + LFD A1, (2 + 5) * SIZE(AO) + LFD A2, (2 + 6) * SIZE(AO) + LFD A3, (2 + 7) * SIZE(AO) + + fmul f4, A1, f4 + fnmsub f1, A2, f4, f1 + fnmsub f5, A3, f4, f5 + + LFD A1, (2 + 10) * SIZE(AO) + LFD A2, (2 + 11) * SIZE(AO) + + fmul f1, A1, f1 + fnmsub f5, A2, f1, f5 + + LFD A1, (2 + 15) * SIZE(AO) + + fmul f5, A1, f5 + + fsmfp f0, f4 + fsmfp f1, f5 +#endif + +#ifdef RN + LFPDX A1, BO, INC2 + + fxpmul f0, A1, f0 + fxpmul f1, A1, f1 +#endif + +#ifdef RT + LFPDX A1, BO, INC2 + + fxpmul f0, A1, f0 + fxpmul f1, A1, f1 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC2 + STFPDUX f1, BO, INC2 + + subi BO, BO, 4 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC +#else + STFPDUX f0, AO, INC2 + STFPDUX f1, AO, INC2 + + subi AO, AO, 4 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L120: + srawi. I, M, 3 + ble .L129 + .align 4 + +.L91: +#if defined(LT) || defined(RN) + fpmr f1, f0 + addi BO, B, - 2 * SIZE + fpmr f2, f0 + fpmr f3, f0 + + srawi. 
r0, KK, 2 + mtspr CTR, r0 + ble .L94 +#else + +#ifdef LN + slwi r0, K, 3 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 3 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + fpmr f1, f0 + addi BO, BO, - 2 * SIZE + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 + ble .L94 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + bdz- .L93 + .align 4 + +.L92: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + fxcpmadd f2, B1, A3, f2 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + LFPDUX A4, AO, INC2 + + fxcsmadd f0, B1, A5, f0 + LFPDUX A5, AO, INC2 + fxcsmadd f1, B1, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B1, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B1, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B1, BO, INC2 + + fxcpmadd f0, B2, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B2, A2, f1 + LFPDUX A2, AO, INC2 + fxcpmadd f2, B2, A3, f2 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B2, A4, f3 + LFPDUX A4, AO, INC2 + + fxcsmadd f0, B2, A5, f0 + LFPDUX A5, AO, INC2 + fxcsmadd f1, B2, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B2, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B2, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B2, BO, INC2 + bdnz+ .L92 + .align 4 + +.L93: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + fxcpmadd f2, B1, A3, f2 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + LFPDUX A4, AO, INC2 + + fxcsmadd f0, B1, A5, f0 + LFPDUX A5, AO, INC2 + fxcsmadd f1, B1, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B1, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B1, A8, f3 + LFPDUX A8, AO, INC2 + + fxcpmadd f0, B2, A1, f0 + fxcpmadd f1, B2, A2, f1 + fxcpmadd f2, B2, A3, f2 + fxcpmadd f3, B2, A4, f3 + + fxcsmadd f0, B2, A5, f0 + fxcsmadd f1, B2, A6, f1 + fxcsmadd f2, B2, A7, f2 + fxcsmadd f3, B2, A8, f3 + .align 4 + +.L94: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L98 +#else + andi. 
r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L98 +#endif + + LFDX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + add BO, BO, INC + bdz- .L97 + .align 4 + +.L96: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + fxcpmadd f2, B1, A3, f2 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + LFDX B1, BO, INC2 + LFPDUX A4, AO, INC2 + add BO, BO, INC + bdnz+ .L96 + .align 4 + +.L97: + fxcpmadd f0, B1, A1, f0 + fxcpmadd f1, B1, A2, f1 + fxcpmadd f2, B1, A3, f2 + fxcpmadd f3, B1, A4, f3 + .align 4 + +.L98: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 8 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 3 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDUX f16, BO, INC2 + LFPDUX f17, BO, INC2 + LFPDUX f18, BO, INC2 + LFPDUX f19, BO, INC2 + + subi BO, BO, 8 * SIZE + + fpsub f0, f16, f0 + fpsub f1, f17, f1 + fpsub f2, f18, f2 + fpsub f3, f19, f3 +#else + LFPDUX f16, AO, INC2 + LFPDUX f17, AO, INC2 + LFPDUX f18, AO, INC2 + LFPDUX f19, AO, INC2 + + subi AO, AO, 8 * SIZE + + fpsub f0, f16, f0 + fpsub f1, f17, f1 + fpsub f2, f18, f2 + fpsub f3, f19, f3 +#endif + +#ifdef LN + fsmtp f4, f0 + fsmtp f5, f1 + fsmtp f6, f2 + fsmtp f7, f3 + + LFD A1, (2 + 63) * SIZE(AO) + LFD A2, (2 + 62) * SIZE(AO) + LFD A3, (2 + 61) * SIZE(AO) + LFD A4, (2 + 60) * SIZE(AO) + LFD A5, (2 + 59) * SIZE(AO) + LFD A6, (2 + 58) * SIZE(AO) + LFD A7, (2 + 57) * SIZE(AO) + LFD A8, (2 + 56) * SIZE(AO) + + fmul f7, A1, f7 + fnmsub f3, A2, f7, f3 + fnmsub f6, A3, f7, f6 + fnmsub f2, A4, f7, f2 + fnmsub f5, A5, f7, f5 + fnmsub f1, A6, f7, f1 + fnmsub f4, A7, f7, f4 + fnmsub f0, A8, f7, f0 + + LFD A1, (2 + 54) * SIZE(AO) + LFD A2, (2 + 53) * SIZE(AO) + LFD A3, (2 + 52) * SIZE(AO) + LFD A4, (2 + 51) * SIZE(AO) + LFD A5, (2 + 50) * SIZE(AO) + LFD A6, (2 + 49) * SIZE(AO) + LFD A7, (2 + 48) * SIZE(AO) + + fmul f3, A1, f3 + fnmsub f6, A2, f3, f6 + fnmsub f2, A3, f3, f2 + fnmsub f5, A4, f3, f5 + fnmsub f1, A5, f3, f1 + fnmsub f4, A6, f3, f4 + fnmsub f0, A7, f3, f0 + + LFD A1, (2 + 45) * SIZE(AO) + LFD A2, (2 + 44) * SIZE(AO) + LFD A3, (2 + 43) * SIZE(AO) + LFD A4, (2 + 42) * SIZE(AO) + LFD A5, (2 + 41) * SIZE(AO) + LFD A6, (2 + 40) * SIZE(AO) + + fmul f6, A1, f6 + fnmsub f2, A2, f6, f2 + fnmsub f5, A3, f6, f5 + fnmsub f1, A4, f6, f1 + fnmsub f4, A5, f6, f4 + fnmsub f0, A6, f6, f0 + + LFD A1, (2 + 36) * SIZE(AO) + LFD A2, (2 + 35) * SIZE(AO) + LFD A3, (2 + 34) * SIZE(AO) + LFD A4, (2 + 33) * SIZE(AO) + LFD A5, (2 + 32) * SIZE(AO) + + fmul f2, A1, f2 + fnmsub f5, A2, f2, f5 + fnmsub f1, A3, f2, f1 + fnmsub f4, A4, f2, f4 + fnmsub f0, A5, f2, f0 + + LFD A1, (2 + 27) * SIZE(AO) + LFD A2, (2 + 26) * SIZE(AO) + LFD A3, (2 + 25) * SIZE(AO) + LFD A4, (2 + 24) * SIZE(AO) + + fmul f5, A1, f5 + fnmsub f1, A2, f5, f1 + fnmsub f4, A3, f5, f4 + fnmsub f0, A4, f5, f0 + + LFD A1, (2 + 18) * SIZE(AO) + LFD A2, (2 + 17) * SIZE(AO) + LFD A3, (2 + 16) * SIZE(AO) + + fmul f1, A1, f1 + fnmsub f4, A2, f1, f4 + fnmsub f0, A3, f1, f0 + + LFD A1, (2 + 9) * SIZE(AO) + LFD A2, (2 + 8) * SIZE(AO) + + fmul f4, A1, f4 + fnmsub f0, A2, f4, f0 + + LFD A1, (2 + 0) * SIZE(AO) + + fmul f0, A1, f0 + + fsmfp f0, f4 + fsmfp f1, f5 + fsmfp f2, f6 + fsmfp f3, f7 +#endif + +#ifdef LT + fsmtp f4, f0 + fsmtp f5, f1 + fsmtp f6, f2 + fsmtp f7, f3 + + LFD A1, (2 + 0) * SIZE(AO) + LFD A2, (2 + 1) * SIZE(AO) + LFD A3, (2 + 2) * SIZE(AO) + LFD A4, (2 + 3) * SIZE(AO) + LFD A5, (2 + 4) * 
SIZE(AO) + LFD A6, (2 + 5) * SIZE(AO) + LFD A7, (2 + 6) * SIZE(AO) + LFD A8, (2 + 7) * SIZE(AO) + + fmul f0, A1, f0 + fnmsub f4, A2, f0, f4 + fnmsub f1, A3, f0, f1 + fnmsub f5, A4, f0, f5 + fnmsub f2, A5, f0, f2 + fnmsub f6, A6, f0, f6 + fnmsub f3, A7, f0, f3 + fnmsub f7, A8, f0, f7 + + LFD A1, (2 + 9) * SIZE(AO) + LFD A2, (2 + 10) * SIZE(AO) + LFD A3, (2 + 11) * SIZE(AO) + LFD A4, (2 + 12) * SIZE(AO) + LFD A5, (2 + 13) * SIZE(AO) + LFD A6, (2 + 14) * SIZE(AO) + LFD A7, (2 + 15) * SIZE(AO) + + fmul f4, A1, f4 + fnmsub f1, A2, f4, f1 + fnmsub f5, A3, f4, f5 + fnmsub f2, A4, f4, f2 + fnmsub f6, A5, f4, f6 + fnmsub f3, A6, f4, f3 + fnmsub f7, A7, f4, f7 + + LFD A1, (2 + 18) * SIZE(AO) + LFD A2, (2 + 19) * SIZE(AO) + LFD A3, (2 + 20) * SIZE(AO) + LFD A4, (2 + 21) * SIZE(AO) + LFD A5, (2 + 22) * SIZE(AO) + LFD A6, (2 + 23) * SIZE(AO) + + fmul f1, A1, f1 + fnmsub f5, A2, f1, f5 + fnmsub f2, A3, f1, f2 + fnmsub f6, A4, f1, f6 + fnmsub f3, A5, f1, f3 + fnmsub f7, A6, f1, f7 + + LFD A1, (2 + 27) * SIZE(AO) + LFD A2, (2 + 28) * SIZE(AO) + LFD A3, (2 + 29) * SIZE(AO) + LFD A4, (2 + 30) * SIZE(AO) + LFD A5, (2 + 31) * SIZE(AO) + + fmul f5, A1, f5 + fnmsub f2, A2, f5, f2 + fnmsub f6, A3, f5, f6 + fnmsub f3, A4, f5, f3 + fnmsub f7, A5, f5, f7 + + LFD A1, (2 + 36) * SIZE(AO) + LFD A2, (2 + 37) * SIZE(AO) + LFD A3, (2 + 38) * SIZE(AO) + LFD A4, (2 + 39) * SIZE(AO) + + fmul f2, A1, f2 + fnmsub f6, A2, f2, f6 + fnmsub f3, A3, f2, f3 + fnmsub f7, A4, f2, f7 + + LFD A1, (2 + 45) * SIZE(AO) + LFD A2, (2 + 46) * SIZE(AO) + LFD A3, (2 + 47) * SIZE(AO) + + fmul f6, A1, f6 + fnmsub f3, A2, f6, f3 + fnmsub f7, A3, f6, f7 + + LFD A1, (2 + 54) * SIZE(AO) + LFD A2, (2 + 55) * SIZE(AO) + + fmul f3, A1, f3 + fnmsub f7, A2, f3, f7 + + LFD A1, (2 + 63) * SIZE(AO) + + fmul f7, A1, f7 + + fsmfp f0, f4 + fsmfp f1, f5 + fsmfp f2, f6 + fsmfp f3, f7 +#endif + +#ifdef RN + LFPDX A1, BO, INC2 + + fxpmul f0, A1, f0 + fxpmul f1, A1, f1 + fxpmul f2, A1, f2 + fxpmul f3, A1, f3 +#endif + +#ifdef RT + LFPDX A1, BO, INC2 + + fxpmul f0, A1, f0 + fxpmul f1, A1, f1 + fxpmul f2, A1, f2 + fxpmul f3, A1, f3 + +#endif + +#ifdef LN + subi CO1, CO1, 8 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC2 + STFPDUX f1, BO, INC2 + STFPDUX f2, BO, INC2 + STFPDUX f3, BO, INC2 + + subi BO, BO, 8 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + STFDUX f2, CO1, INC + STFSDUX f2, CO1, INC + STFDUX f3, CO1, INC + STFSDUX f3, CO1, INC +#else + STFPDUX f0, AO, INC2 + STFPDUX f1, AO, INC2 + STFPDUX f2, AO, INC2 + STFPDUX f3, AO, INC2 + + subi AO, AO, 8 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + STFDUX f2, CO1, INC + STFSDUX f2, CO1, INC + STFDUX f3, CO1, INC + STFSDUX f3, CO1, INC +#endif + +#ifdef LN + subi CO1, CO1, 8 * SIZE +#endif + +#ifdef RT + slwi r0, K, 3 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 3 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 8 +#endif + +#ifdef LN + subi KK, KK, 8 +#endif + + addic. 
I, I, -1 + li r0, FZERO + + lfpsx f0, SP, r0 + bgt+ .L91 + .align 4 + +.L129: +#ifdef LN + slwi r0, K, 0 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + addi B, BO, 2 * SIZE +#endif + +#ifdef RN + addi KK, KK, 1 +#endif + +#ifdef RT + subi KK, KK, 1 +#endif + .align 4 + + +.L999: + addi SP, SP, 12 + + lwzu r14, 4(SP) + lwzu r15, 4(SP) + + lwzu r16, 4(SP) + lwzu r17, 4(SP) + lwzu r18, 4(SP) + lwzu r19, 4(SP) + + lwzu r20, 4(SP) + lwzu r21, 4(SP) + lwzu r22, 4(SP) + lwzu r23, 4(SP) + + lwzu r24, 4(SP) + lwzu r25, 4(SP) + lwzu r26, 4(SP) + lwzu r27, 4(SP) + + lwzu r28, 4(SP) + lwzu r29, 4(SP) + lwzu r30, 4(SP) + lwzu r31, 4(SP) + + subi SP, SP, 12 + li r0, 16 + + lfpdux f31, SP, r0 + lfpdux f30, SP, r0 + lfpdux f29, SP, r0 + lfpdux f28, SP, r0 + lfpdux f27, SP, r0 + lfpdux f26, SP, r0 + lfpdux f25, SP, r0 + lfpdux f24, SP, r0 + lfpdux f23, SP, r0 + lfpdux f22, SP, r0 + lfpdux f21, SP, r0 + lfpdux f20, SP, r0 + lfpdux f19, SP, r0 + lfpdux f18, SP, r0 + lfpdux f17, SP, r0 + lfpdux f16, SP, r0 + lfpdux f15, SP, r0 + lfpdux f14, SP, r0 + addi SP, SP, 16 + blr + + + EPILOGUE +#endif diff --git a/kernel/power/trsm_kernel_hummer_LT.S b/kernel/power/trsm_kernel_hummer_LT.S new file mode 100644 index 0000000000..027fcf0f14 --- /dev/null +++ b/kernel/power/trsm_kernel_hummer_LT.S @@ -0,0 +1,5697 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define ALPHA 0 +#define FZERO 8 + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#endif + +#define TEMP r11 +#define AORIG r12 +#define KK r14 +#define INCM1 r15 +#define INCM4 r16 +#define INCM2 r17 +#define INC2 r19 +#define INC r20 +#define INC4 r21 + +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define AO2 r26 +#define BO2 r27 + +#define CO1 r28 +#define CO2 r29 +#define CO3 r30 +#define CO4 r31 + +#ifndef NEEDPARAM + +#define A1 f16 +#define A2 f17 +#define A3 f18 +#define A4 f19 +#define A5 f20 +#define A6 f21 +#define A7 f22 +#define A8 f23 +#define A9 f24 +#define A10 f25 + +#define B1 f26 +#define B2 f27 +#define B3 f28 +#define B4 f29 +#define B5 f30 +#define B6 f31 + +#define AP B6 + + + PROLOGUE + PROFCODE + + li r0, -16 + + stfpdux f14, SP, r0 + stfpdux f15, SP, r0 + stfpdux f16, SP, r0 + stfpdux f17, SP, r0 + stfpdux f18, SP, r0 + stfpdux f19, SP, r0 + stfpdux f20, SP, r0 + stfpdux f21, SP, r0 + stfpdux f22, SP, r0 + stfpdux f23, SP, r0 + stfpdux f24, SP, r0 + stfpdux f25, SP, r0 + stfpdux f26, SP, r0 + stfpdux f27, SP, r0 + stfpdux f28, SP, r0 + stfpdux f29, SP, r0 + stfpdux f30, SP, r0 + stfpdux f31, SP, r0 + + stwu r31, -4(SP) + stwu r30, -4(SP) + stwu r29, -4(SP) + stwu r28, -4(SP) + + stwu r27, -4(SP) + stwu r26, -4(SP) + stwu r25, -4(SP) + stwu r24, -4(SP) + + stwu r23, -4(SP) + stwu r22, -4(SP) + stwu r21, -4(SP) + stwu r20, -4(SP) + + stwu r19, -4(SP) + stwu r18, -4(SP) + stwu r17, -4(SP) + stwu r16, -4(SP) + + stwu r15, -4(SP) + stwu r14, -4(SP) # dummy + + li r0, 0 + + stwu r0, -4(SP) + stwu r0, -4(SP) + stfdu f1, -8(SP) + + slwi LDC, LDC, BASE_SHIFT + + cmpwi cr0, M, 0 + ble .L999 + cmpwi cr0, N, 0 + ble .L999 + cmpwi cr0, K, 0 + ble .L999 + + li INC, 1 * SIZE + li INC2, 2 * SIZE + li INC4, 4 * SIZE + + li INCM1, -1 * SIZE + li INCM2, -2 * SIZE + li INCM4, -4 * SIZE + + addi C, C, - 1 * SIZE + +#ifdef LN + mullw r0, M, K + slwi r0, r0, BASE_SHIFT + add A, A, r0 + + slwi r0, M, BASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, BASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + srawi. J, N, 2 + ble .L50 + .align 4 + +.L10: +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 2 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + +#if defined(LN) || defined(RT) + addi AORIG, A, -4 * SIZE +#else + addi AO, A, -4 * SIZE +#endif +#ifndef RT + add C, CO4, LDC +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + + srawi. I, M, 3 + ble .L20 + .align 4 + +.L11: +#if defined(LT) || defined(RN) + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 + + fpmr f5, f0 + fpmr f9, f0 + fpmr f13, f0 + fpmr f2, f0 + + fpmr f6, f0 + fpmr f10, f0 + fpmr f14, f0 + fpmr f3, f0 + + fpmr f7, f0 + fpmr f11, f0 + fpmr f15, f0 + nop + + srawi. 
r0, KK, 2 + fpmr f1, f0 + mtspr CTR, r0 + ble .L14 +#else + +#ifdef LN + slwi r0, K, 3 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 3 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, BO, - 4 * SIZE + fpmr f8, f0 + addi BO2, BO, 2 * SIZE + fpmr f12, f0 + + fpmr f5, f0 + fpmr f9, f0 + fpmr f13, f0 + fpmr f2, f0 + + fpmr f6, f0 + fpmr f10, f0 + fpmr f14, f0 + fpmr f3, f0 + + fpmr f7, f0 + fpmr f11, f0 + fpmr f15, f0 + nop + + srawi. r0, TEMP, 2 + fpmr f1, f0 + mtspr CTR, r0 + ble .L14 +#endif + + LFPDUX A1, AO, INC4 + fpmr f5, f0 + LFPDUX A3, AO, INC4 + fpmr f9, f0 + LFPDUX B1, BO, INC4 + fpmr f13, f0 + + LFPDUX A5, AO, INC4 + fpmr f2, f0 + LFPDUX A6, AO, INC4 + fpmr f6, f0 + LFPDUX B3, BO, INC4 + fpmr f10, f0 + LFPDUX A7, AO, INC4 + fpmr f14, f0 + + LFPDUX A8, AO, INC4 + fpmr f3, f0 + LFPDUX B5, BO, INC4 + fpmr f7, f0 + LFPDUX A9, AO, INC4 + fpmr f11, f0 + LFPDUX A2, AO2, INC4 + fpmr f15, f0 + LFPDUX B2, BO2, INC4 + bdz- .L13 + .align 4 + +.L12: + +## 1 ## + fxcpmadd f0, B1, A1, f0 + nop + fxcsmadd f4, B1, A1, f4 + nop + fxcpmadd f8, B2, A1, f8 + LFPDUX B4, BO2, INC4 + fxcsmadd f12, B2, A1, f12 + LFPDUX B6, BO, INC4 + + fxcpmadd f1, B1, A2, f1 + nop + fxcsmadd f5, B1, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B2, A2, f9 + LFPDUX A10, AO, INC4 + fxcsmadd f13, B2, A2, f13 + nop + + fxcpmadd f2, B1, A3, f2 + nop + fxcsmadd f6, B1, A3, f6 + nop + fxcpmadd f10, B2, A3, f10 + nop + fxcsmadd f14, B2, A3, f14 + nop + + fxcpmadd f3, B1, A4, f3 + nop + fxcsmadd f7, B1, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B2, A4, f11 + LFPDUX A1, AO, INC4 + fxcsmadd f15, B2, A4, f15 + nop + +## 2 ## + + fxcpmadd f0, B3, A5, f0 + nop + fxcsmadd f4, B3, A5, f4 + nop + fxcpmadd f8, B4, A5, f8 + LFPDUX B2, BO2, INC4 + fxcsmadd f12, B4, A5, f12 + LFPDUX B1, BO, INC4 + + fxcpmadd f1, B3, A2, f1 + nop + fxcsmadd f5, B3, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B4, A2, f9 + LFPDUX A3, AO, INC4 + fxcsmadd f13, B4, A2, f13 + nop + + fxcpmadd f2, B3, A6, f2 + nop + fxcsmadd f6, B3, A6, f6 + nop + fxcpmadd f10, B4, A6, f10 + nop + fxcsmadd f14, B4, A6, f14 + nop + + fxcpmadd f3, B3, A4, f3 + nop + fxcsmadd f7, B3, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B4, A4, f11 + LFPDUX A5, AO, INC4 + fxcsmadd f15, B4, A4, f15 + nop + +## 3 ## + + fxcpmadd f0, B5, A7, f0 + nop + fxcsmadd f4, B5, A7, f4 + nop + fxcpmadd f8, B2, A7, f8 + LFPDUX B4, BO2, INC4 + fxcsmadd f12, B2, A7, f12 + LFPDUX B3, BO, INC4 + + fxcpmadd f1, B5, A2, f1 + nop + fxcsmadd f5, B5, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B2, A2, f9 + LFPDUX A6, AO, INC4 + fxcsmadd f13, B2, A2, f13 + nop + + fxcpmadd f2, B5, A8, f2 + nop + fxcsmadd f6, B5, A8, f6 + nop + fxcpmadd f10, B2, A8, f10 + nop + fxcsmadd f14, B2, A8, f14 + nop + + fxcpmadd f3, B5, A4, f3 + nop + fxcsmadd f7, B5, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B2, A4, f11 + LFPDUX A7, AO, INC4 + fxcsmadd f15, B2, A4, f15 + nop + +## 4 ## + fxcpmadd f0, B6, A9, f0 + nop + fxcsmadd f4, B6, A9, f4 + nop + fxcpmadd f8, B4, A9, f8 + LFPDUX B2, BO2, INC4 + fxcsmadd f12, B4, A9, f12 + LFPDUX B5, BO, INC4 + + fxcpmadd f1, B6, A2, f1 + nop + fxcsmadd f5, B6, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B4, A2, f9 + LFPDUX A8, AO, INC4 + fxcsmadd f13, B4, A2, f13 + nop + + fxcpmadd f2, B6, A10, f2 + nop + fxcsmadd f6, B6, A10, f6 + nop + fxcpmadd f10, B4, A10, f10 + nop + fxcsmadd f14, B4, A10, f14 + nop + + fxcpmadd f3, B6, A4, f3 + LFPDUX A2, AO2, INC4 + fxcsmadd f7, B6, 
A4, f7 + LFPDUX A9, AO, INC4 + fxcpmadd f11, B4, A4, f11 + nop + fxcsmadd f15, B4, A4, f15 + bdnz+ .L12 + .align 4 + +.L13: +## 1 ## + + fxcpmadd f0, B1, A1, f0 + nop + fxcsmadd f4, B1, A1, f4 + nop + fxcpmadd f8, B2, A1, f8 + LFPDUX B4, BO2, INC4 + fxcsmadd f12, B2, A1, f12 + LFPDUX B6, BO, INC4 + + fxcpmadd f1, B1, A2, f1 + nop + fxcsmadd f5, B1, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B2, A2, f9 + LFPDUX A10, AO, INC4 + fxcsmadd f13, B2, A2, f13 + nop + + fxcpmadd f2, B1, A3, f2 + nop + fxcsmadd f6, B1, A3, f6 + nop + fxcpmadd f10, B2, A3, f10 + nop + fxcsmadd f14, B2, A3, f14 + nop + + fxcpmadd f3, B1, A4, f3 + nop + fxcsmadd f7, B1, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B2, A4, f11 + nop + fxcsmadd f15, B2, A4, f15 + nop + +## 2 ## + + fxcpmadd f0, B3, A5, f0 + nop + fxcsmadd f4, B3, A5, f4 + nop + fxcpmadd f8, B4, A5, f8 + LFPDUX B2, BO2, INC4 + fxcsmadd f12, B4, A5, f12 + nop + + fxcpmadd f1, B3, A2, f1 + nop + fxcsmadd f5, B3, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B4, A2, f9 + nop + fxcsmadd f13, B4, A2, f13 + nop + + fxcpmadd f2, B3, A6, f2 + nop + fxcsmadd f6, B3, A6, f6 + nop + fxcpmadd f10, B4, A6, f10 + nop + fxcsmadd f14, B4, A6, f14 + nop + + fxcpmadd f3, B3, A4, f3 + nop + fxcsmadd f7, B3, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B4, A4, f11 + nop + fxcsmadd f15, B4, A4, f15 + nop + +## 3 ## + + fxcpmadd f0, B5, A7, f0 + nop + fxcsmadd f4, B5, A7, f4 + nop + fxcpmadd f8, B2, A7, f8 + LFPDUX B4, BO2, INC4 + fxcsmadd f12, B2, A7, f12 + nop + + fxcpmadd f1, B5, A2, f1 + nop + fxcsmadd f5, B5, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B2, A2, f9 + nop + + fxcsmadd f13, B2, A2, f13 + + fxcpmadd f2, B5, A8, f2 + nop + fxcsmadd f6, B5, A8, f6 + nop + fxcpmadd f10, B2, A8, f10 + nop + fxcsmadd f14, B2, A8, f14 + nop + + fxcpmadd f3, B5, A4, f3 + nop + fxcsmadd f7, B5, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B2, A4, f11 + nop + fxcsmadd f15, B2, A4, f15 + nop + +## 4 ## + + fxcpmadd f0, B6, A9, f0 + nop + fxcsmadd f4, B6, A9, f4 + nop + fxcpmadd f8, B4, A9, f8 + nop + fxcsmadd f12, B4, A9, f12 + nop + + fxcpmadd f1, B6, A2, f1 + nop + fxcsmadd f5, B6, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B4, A2, f9 + nop + fxcsmadd f13, B4, A2, f13 + nop + + fxcpmadd f2, B6, A10, f2 + nop + fxcsmadd f6, B6, A10, f6 + nop + fxcpmadd f10, B4, A10, f10 + nop + fxcsmadd f14, B4, A10, f14 + nop + + fxcpmadd f3, B6, A4, f3 + nop + fxcsmadd f7, B6, A4, f7 + nop + fxcpmadd f11, B4, A4, f11 + nop + fxcsmadd f15, B4, A4, f15 + nop + .align 4 + +.L14: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L18 +#else + andi. 
r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L18 +#endif + .align 4 + +.L15: + LFPDUX A2, AO, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A10, BO, INC4 + LFPDUX B4, BO2, INC4 + bdz- .L17 + .align 4 + +.L16: + fxcpmadd f0, A10, A2, f0 + fxcsmadd f4, A10, A2, f4 + fxcpmadd f8, B4, A2, f8 + fxcsmadd f12, B4, A2, f12 + LFPDUX A2, AO, INC4 + + fxcpmadd f1, A10, A4, f1 + fxcsmadd f5, A10, A4, f5 + fxcpmadd f9, B4, A4, f9 + fxcsmadd f13, B4, A4, f13 + LFPDUX A4, AO2, INC4 + + fxcpmadd f2, A10, A2, f2 + fxcsmadd f6, A10, A2, f6 + fxcpmadd f10, B4, A2, f10 + fxcsmadd f14, B4, A2, f14 + LFPDUX A2, AO, INC4 + + fxcpmadd f3, A10, A4, f3 + fxcsmadd f7, A10, A4, f7 + LFPDUX A10, BO, INC4 + fxcpmadd f11, B4, A4, f11 + fxcsmadd f15, B4, A4, f15 + LFPDUX A4, AO2, INC4 + LFPDUX B4, BO2, INC4 + bdnz+ .L16 + .align 4 + +.L17: + fxcpmadd f0, A10, A2, f0 + fxcsmadd f4, A10, A2, f4 + fxcpmadd f8, B4, A2, f8 + fxcsmadd f12, B4, A2, f12 + LFPDUX A2, AO, INC4 + + fxcpmadd f1, A10, A4, f1 + fxcsmadd f5, A10, A4, f5 + fxcpmadd f9, B4, A4, f9 + fxcsmadd f13, B4, A4, f13 + LFPDUX A4, AO2, INC4 + + fxcpmadd f2, A10, A2, f2 + fxcsmadd f6, A10, A2, f6 + fxcpmadd f10, B4, A2, f10 + fxcsmadd f14, B4, A2, f14 + + fxcpmadd f3, A10, A4, f3 + fxcsmadd f7, A10, A4, f7 + fxcpmadd f11, B4, A4, f11 + fxcsmadd f15, B4, A4, f15 + .align 4 + +.L18: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 8 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 3 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi AO2, AO, 2 * SIZE + addi BO, BO, - 4 * SIZE + addi BO2, BO, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + fpmr f24, f0 + LFPDUX f16, BO, INC4 + fpmr f25, f1 + nop + fpmr f26, f2 + LFPDUX f17, BO2, INC4 + fpmr f27, f3 + nop + + fpmr f28, f8 + LFPDUX f18, BO, INC4 + fpmr f29, f9 + nop + fpmr f30, f10 + LFPDUX f19, BO2, INC4 + fpmr f31, f11 + nop + + fsmfp f0, f4 + LFPDUX f20, BO, INC4 + fsmfp f1, f5 + nop + fsmfp f2, f6 + LFPDUX f21, BO2, INC4 + fsmfp f3, f7 + nop + + fsmfp f8, f12 + LFPDUX f22, BO, INC4 + fsmfp f9, f13 + nop + fsmfp f10, f14 + LFPDUX f23, BO2, INC4 + fsmfp f11, f15 + nop + + fsmtp f4, f24 + LFPDUX f24, BO, INC4 + fsmtp f5, f25 + nop + fsmtp f6, f26 + LFPDUX f25, BO2, INC4 + fsmtp f7, f27 + nop + + fsmtp f12, f28 + LFPDUX f26, BO, INC4 + fsmtp f13, f29 + nop + fsmtp f14, f30 + LFPDUX f27, BO2, INC4 + fsmtp f15, f31 + nop + + fpsub f0, f16, f0 + LFPDUX f28, BO, INC4 + fpsub f8, f17, f8 + nop + fpsub f4, f18, f4 + LFPDUX f29, BO2, INC4 + fpsub f12, f19, f12 + nop + + fpsub f1, f20, f1 + LFPDUX f30, BO, INC4 + fpsub f9, f21, f9 + subi BO, BO, 32 * SIZE + fpsub f5, f22, f5 + LFPDUX f31, BO2, INC4 + fpsub f13, f23, f13 + subi BO2, BO2, 32 * SIZE + + fpsub f2, f24, f2 + fpsub f10, f25, f10 + fpsub f6, f26, f6 + fpsub f14, f27, f14 + fpsub f3, f28, f3 + fpsub f11, f29, f11 + fpsub f7, f30, f7 + fpsub f15, f31, f15 + +#else + LFPDUX f16, AO, INC4 + LFPDUX f17, AO2, INC4 + LFPDUX f18, AO, INC4 + LFPDUX f19, AO2, INC4 + LFPDUX f20, AO, INC4 + LFPDUX f21, AO2, INC4 + LFPDUX f22, AO, INC4 + LFPDUX f23, AO2, INC4 + + fpsub f0, f16, f0 + LFPDUX f24, AO, INC4 + fpsub f1, f17, f1 + LFPDUX f25, AO2, INC4 + fpsub f2, f18, f2 + LFPDUX f26, AO, INC4 + fpsub f3, f19, f3 + LFPDUX f27, AO2, INC4 + fpsub f4, f20, f4 + LFPDUX f28, AO, INC4 + fpsub f5, f21, f5 + LFPDUX f29, AO2, INC4 + fpsub f6, f22, f6 + LFPDUX f30, AO, INC4 + fpsub f7, f23, f7 + LFPDUX f31, AO2, INC4 + + fpsub f8, f24, f8 + subi AO, AO, 32 * SIZE + fpsub f9, f25, f9 + subi AO2, AO2, 32 * SIZE + fpsub f10, f26, f10 + fpsub f11, f27, f11 + fpsub 
f12, f28, f12 + fpsub f13, f29, f13 + fpsub f14, f30, f14 + fpsub f15, f31, f15 +#endif + +#ifdef LN + addi AO, AO, 68 * SIZE + addi AO2, AO2, 68 * SIZE + + LFPDUX A1, AO2, INCM4 + LFPDUX A2, AO, INCM4 + LFPDUX A3, AO2, INCM4 + LFPDUX A4, AO, INCM4 + LFPDUX A5, AO2, INCM4 + LFPDUX A6, AO, INCM4 + LFPDUX A7, AO2, INCM4 + LFPDUX A8, AO, INCM4 + + fxsmul f7, A1, f7 + fxsmul f15, A1, f15 + + fxcpnmsub f3, A1, f7, f3 + fxcpnmsub f11, A1, f15, f11 + + fxcsnmsub f6, A2, f7, f6 + fxcsnmsub f14, A2, f15, f14 + + fxcpnmsub f2, A2, f7, f2 + fxcpnmsub f10, A2, f15, f10 + + fxcsnmsub f5, A3, f7, f5 + fxcsnmsub f13, A3, f15, f13 + + fxcpnmsub f1, A3, f7, f1 + fxcpnmsub f9, A3, f15, f9 + + fxcsnmsub f4, A4, f7, f4 + fxcsnmsub f12, A4, f15, f12 + + fxcpnmsub f0, A4, f7, f0 + fxcpnmsub f8, A4, f15, f8 + + fxpmul f3, A5, f3 + fxpmul f11, A5, f11 + + fxcsnmsub f6, A6, f3, f6 + fxcsnmsub f14, A6, f11, f14 + + fxcpnmsub f2, A6, f3, f2 + fxcpnmsub f10, A6, f11, f10 + + fxcsnmsub f5, A7, f3, f5 + fxcsnmsub f13, A7, f11, f13 + + fxcpnmsub f1, A7, f3, f1 + fxcpnmsub f9, A7, f11, f9 + + fxcsnmsub f4, A8, f3, f4 + fxcsnmsub f12, A8, f11, f12 + + fxcpnmsub f0, A8, f3, f0 + fxcpnmsub f8, A8, f11, f8 + + add AO2, AO2, INCM4 + LFPDUX A1, AO, INCM4 + LFPDUX A2, AO2, INCM4 + LFPDUX A3, AO, INCM4 + + add AO2, AO2, INCM4 + LFPDUX A4, AO, INCM4 + LFPDUX A5, AO2, INCM4 + LFPDUX A6, AO, INCM4 + + add AO2, AO2, INCM4 + add AO, AO, INCM4 + LFPDUX A7, AO2, INCM4 + LFPDUX A8, AO, INCM4 + + + fxsmul f6, A1, f6 + fxsmul f14, A1, f14 + + fxcpnmsub f2, A1, f6, f2 + fxcpnmsub f10, A1, f14, f10 + + fxcsnmsub f5, A2, f6, f5 + fxcsnmsub f13, A2, f14, f13 + + fxcpnmsub f1, A2, f6, f1 + fxcpnmsub f9, A2, f14, f9 + + fxcsnmsub f4, A3, f6, f4 + fxcsnmsub f12, A3, f14, f12 + + fxcpnmsub f0, A3, f6, f0 + fxcpnmsub f8, A3, f14, f8 + + fxpmul f2, A4, f2 + fxpmul f10, A4, f10 + + fxcsnmsub f5, A5, f2, f5 + fxcsnmsub f13, A5, f10, f13 + + fxcpnmsub f1, A5, f2, f1 + fxcpnmsub f9, A5, f10, f9 + + fxcsnmsub f4, A6, f2, f4 + fxcsnmsub f12, A6, f10, f12 + + fxcpnmsub f0, A6, f2, f0 + fxcpnmsub f8, A6, f10, f8 + + fxsmul f5, A7, f5 + fxsmul f13, A7, f13 + + fxcpnmsub f1, A7, f5, f1 + fxcpnmsub f9, A7, f13, f9 + + fxcsnmsub f4, A8, f5, f4 + fxcsnmsub f12, A8, f13, f12 + + fxcpnmsub f0, A8, f5, f0 + fxcpnmsub f8, A8, f13, f8 + + add AO2, AO2, INCM4 + add AO, AO, INCM4 + LFPDUX A1, AO2, INCM4 + LFPDUX A2, AO, INCM4 + + subi AO2, AO2, 8 * SIZE + add AO, AO, INCM4 + LFPDUX A3, AO, INCM4 + + subi AO2, AO2, 8 * SIZE + add AO, AO, INCM4 + LFPDUX A4, AO, INCM4 + + addi AO, AO, -4 * SIZE + addi AO2, AO2, -4 * SIZE + + fxpmul f1, A1, f1 + fxpmul f9, A1, f9 + + fxcsnmsub f4, A2, f1, f4 + fxcsnmsub f12, A2, f9, f12 + + fxcpnmsub f0, A2, f1, f0 + fxcpnmsub f8, A2, f9, f8 + + fxsmul f4, A3, f4 + fxsmul f12, A3, f12 + + fxcpnmsub f0, A3, f4, f0 + fxcpnmsub f8, A3, f12, f8 + + fxpmul f0, A4, f0 + fxpmul f8, A4, f8 + +#endif + +#ifdef LT + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX A3, AO, INC4 + LFPDUX A4, AO2, INC4 + + LFPDUX A5, AO, INC4 + LFPDUX A6, AO2, INC4 + LFPDUX A7, AO, INC4 + LFPDUX A8, AO2, INC4 + + fxpmul f0, A1, f0 + fxpmul f8, A1, f8 + + fxcsnmsub f4, A1, f0, f4 + fxcsnmsub f12, A1, f8, f12 + + fxcpnmsub f1, A2, f0, f1 + fxcpnmsub f9, A2, f8, f9 + + fxcsnmsub f5, A2, f0, f5 + fxcsnmsub f13, A2, f8, f13 + + fxcpnmsub f2, A3, f0, f2 + fxcpnmsub f10, A3, f8, f10 + + fxcsnmsub f6, A3, f0, f6 + fxcsnmsub f14, A3, f8, f14 + + fxcpnmsub f3, A4, f0, f3 + fxcpnmsub f11, A4, f8, f11 + + fxcsnmsub f7, A4, f0, f7 + fxcsnmsub f15, A4, f8, f15 + + fxsmul f4, A5, 
f4 + fxsmul f12, A5, f12 + + fxcpnmsub f1, A6, f4, f1 + fxcpnmsub f9, A6, f12, f9 + + fxcsnmsub f5, A6, f4, f5 + fxcsnmsub f13, A6, f12, f13 + + fxcpnmsub f2, A7, f4, f2 + fxcpnmsub f10, A7, f12, f10 + + fxcsnmsub f6, A7, f4, f6 + fxcsnmsub f14, A7, f12, f14 + + fxcpnmsub f3, A8, f4, f3 + fxcpnmsub f11, A8, f12, f11 + + fxcsnmsub f7, A8, f4, f7 + fxcsnmsub f15, A8, f12, f15 + + add AO, AO, INC4 + LFPDUX A1, AO2, INC4 + LFPDUX A2, AO, INC4 + LFPDUX A3, AO2, INC4 + + add AO, AO, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A5, AO, INC4 + LFPDUX A6, AO2, INC4 + + add AO, AO, INC4 + add AO2, AO2, INC4 + LFPDUX A7, AO, INC4 + LFPDUX A8, AO2, INC4 + + fxpmul f1, A1, f1 + fxpmul f9, A1, f9 + + fxcsnmsub f5, A1, f1, f5 + fxcsnmsub f13, A1, f9, f13 + + fxcpnmsub f2, A2, f1, f2 + fxcpnmsub f10, A2, f9, f10 + + fxcsnmsub f6, A2, f1, f6 + fxcsnmsub f14, A2, f9, f14 + + fxcpnmsub f3, A3, f1, f3 + fxcpnmsub f11, A3, f9, f11 + + fxcsnmsub f7, A3, f1, f7 + fxcsnmsub f15, A3, f9, f15 + + fxsmul f5, A4, f5 + fxsmul f13, A4, f13 + + fxcpnmsub f2, A5, f5, f2 + fxcpnmsub f10, A5, f13, f10 + + fxcsnmsub f6, A5, f5, f6 + fxcsnmsub f14, A5, f13, f14 + + fxcpnmsub f3, A6, f5, f3 + fxcpnmsub f11, A6, f13, f11 + + fxcsnmsub f7, A6, f5, f7 + fxcsnmsub f15, A6, f13, f15 + + fxpmul f2, A7, f2 + fxpmul f10, A7, f10 + + fxcsnmsub f6, A7, f2, f6 + fxcsnmsub f14, A7, f10, f14 + + fxcpnmsub f3, A8, f2, f3 + fxcpnmsub f11, A8, f10, f11 + + fxcsnmsub f7, A8, f2, f7 + fxcsnmsub f15, A8, f10, f15 + + add AO, AO, INC4 + add AO2, AO2, INC4 + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + + addi AO, AO, 8 * SIZE + addi AO2, AO2, 4 * SIZE + LFPDUX A3, AO2, INC4 + + addi AO, AO, 8 * SIZE + addi AO2, AO2, 4 * SIZE + LFPDUX A4, AO2, INC4 + + subi AO, AO, 64 * SIZE + subi AO2, AO2, 64 * SIZE + + fxsmul f6, A1, f6 + fxsmul f14, A1, f14 + + fxcpnmsub f3, A2, f6, f3 + fxcpnmsub f11, A2, f14, f11 + + fxcsnmsub f7, A2, f6, f7 + fxcsnmsub f15, A2, f14, f15 + + fxpmul f3, A3, f3 + fxpmul f11, A3, f11 + + fxcsnmsub f7, A3, f3, f7 + fxcsnmsub f15, A3, f11, f15 + + fxsmul f7, A4, f7 + fxsmul f15, A4, f15 +#endif + +#ifdef RN + LFPDUX A1, BO, INC4 + LFPDUX A2, BO2, INC4 + LFPDUX A3, BO, INC4 + LFPDUX A4, BO2, INC4 + + add BO, BO, INC4 + LFPDUX A5, BO2, INC4 + + add BO, BO, INC4 + LFPDUX A6, BO2, INC4 + subi BO, BO, 16 * SIZE + subi BO2, BO2, 16 * SIZE + + fxpmul f0, A1, f0 + fxpmul f1, A1, f1 + fxpmul f2, A1, f2 + fxpmul f3, A1, f3 + + fxcsnmsub f4, A1, f0, f4 + fxcsnmsub f5, A1, f1, f5 + fxcsnmsub f6, A1, f2, f6 + fxcsnmsub f7, A1, f3, f7 + + fxcpnmsub f8, A2, f0, f8 + fxcpnmsub f9, A2, f1, f9 + fxcpnmsub f10, A2, f2, f10 + fxcpnmsub f11, A2, f3, f11 + + fxcsnmsub f12, A2, f0, f12 + fxcsnmsub f13, A2, f1, f13 + fxcsnmsub f14, A2, f2, f14 + fxcsnmsub f15, A2, f3, f15 + + fxsmul f4, A3, f4 + fxsmul f5, A3, f5 + fxsmul f6, A3, f6 + fxsmul f7, A3, f7 + + fxcpnmsub f8, A4, f4, f8 + fxcpnmsub f9, A4, f5, f9 + fxcpnmsub f10, A4, f6, f10 + fxcpnmsub f11, A4, f7, f11 + + fxcsnmsub f12, A4, f4, f12 + fxcsnmsub f13, A4, f5, f13 + fxcsnmsub f14, A4, f6, f14 + fxcsnmsub f15, A4, f7, f15 + + fxpmul f8, A5, f8 + fxpmul f9, A5, f9 + fxpmul f10, A5, f10 + fxpmul f11, A5, f11 + + fxcsnmsub f12, A5, f8, f12 + fxcsnmsub f13, A5, f9, f13 + fxcsnmsub f14, A5, f10, f14 + fxcsnmsub f15, A5, f11, f15 + + fxsmul f12, A6, f12 + fxsmul f13, A6, f13 + fxsmul f14, A6, f14 + fxsmul f15, A6, f15 + +#endif + +#ifdef RT + addi BO, BO, 20 * SIZE + addi BO2, BO2, 20 * SIZE + + LFPDUX A1, BO2, INCM4 + LFPDUX A2, BO, INCM4 + + LFPDUX A3, BO2, INCM4 + LFPDUX A4, BO, INCM4 + + add BO2, BO2, 
INCM4 + LFPDUX A5, BO, INCM4 + + add BO2, BO2, INCM4 + LFPDUX A6, BO, INCM4 + subi BO, BO, 4 * SIZE + subi BO2, BO2, 4 * SIZE + + fxsmul f12, A1, f12 + fxsmul f13, A1, f13 + fxsmul f14, A1, f14 + fxsmul f15, A1, f15 + + fxcpnmsub f8, A1, f12, f8 + fxcpnmsub f9, A1, f13, f9 + fxcpnmsub f10, A1, f14, f10 + fxcpnmsub f11, A1, f15, f11 + + fxcsnmsub f4, A2, f12, f4 + fxcsnmsub f5, A2, f13, f5 + fxcsnmsub f6, A2, f14, f6 + fxcsnmsub f7, A2, f15, f7 + + fxcpnmsub f0, A2, f12, f0 + fxcpnmsub f1, A2, f13, f1 + fxcpnmsub f2, A2, f14, f2 + fxcpnmsub f3, A2, f15, f3 + + fxpmul f8, A3, f8 + fxpmul f9, A3, f9 + fxpmul f10, A3, f10 + fxpmul f11, A3, f11 + + fxcsnmsub f4, A4, f8, f4 + fxcsnmsub f5, A4, f9, f5 + fxcsnmsub f6, A4, f10, f6 + fxcsnmsub f7, A4, f11, f7 + + fxcpnmsub f0, A4, f8, f0 + fxcpnmsub f1, A4, f9, f1 + fxcpnmsub f2, A4, f10, f2 + fxcpnmsub f3, A4, f11, f3 + + fxsmul f4, A5, f4 + fxsmul f5, A5, f5 + fxsmul f6, A5, f6 + fxsmul f7, A5, f7 + + fxcpnmsub f0, A5, f4, f0 + fxcpnmsub f1, A5, f5, f1 + fxcpnmsub f2, A5, f6, f2 + fxcpnmsub f3, A5, f7, f3 + + fxpmul f0, A6, f0 + fxpmul f1, A6, f1 + fxpmul f2, A6, f2 + fxpmul f3, A6, f3 + +#endif + +#ifdef LN + subi CO1, CO1, 8 * SIZE + subi CO2, CO2, 8 * SIZE + subi CO3, CO3, 8 * SIZE + subi CO4, CO4, 8 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC4 + STFPDUX f8, BO2, INC4 + STFPDUX f4, BO, INC4 + STFPDUX f12, BO2, INC4 + STFPDUX f1, BO, INC4 + STFPDUX f9, BO2, INC4 + STFPDUX f5, BO, INC4 + STFPDUX f13, BO2, INC4 + STFPDUX f2, BO, INC4 + STFPDUX f10, BO2, INC4 + STFPDUX f6, BO, INC4 + STFPDUX f14, BO2, INC4 + STFPDUX f3, BO, INC4 + STFPDUX f11, BO2, INC4 + STFPDUX f7, BO, INC4 + STFPDUX f15, BO2, INC4 + + subi BO, BO, 32 * SIZE + subi BO2, BO2, 32 * SIZE + + STFDUX f0, CO1, INC + STFDUX f4, CO1, INC + STFDUX f1, CO1, INC + STFDUX f5, CO1, INC + STFDUX f2, CO1, INC + STFDUX f6, CO1, INC + STFDUX f3, CO1, INC + STFDUX f7, CO1, INC + + STFSDUX f0, CO2, INC + STFSDUX f4, CO2, INC + STFSDUX f1, CO2, INC + STFSDUX f5, CO2, INC + STFSDUX f2, CO2, INC + STFSDUX f6, CO2, INC + STFSDUX f3, CO2, INC + STFSDUX f7, CO2, INC + + STFDUX f8, CO3, INC + STFDUX f12, CO3, INC + STFDUX f9, CO3, INC + STFDUX f13, CO3, INC + STFDUX f10, CO3, INC + STFDUX f14, CO3, INC + STFDUX f11, CO3, INC + STFDUX f15, CO3, INC + + STFSDUX f8, CO4, INC + STFSDUX f12, CO4, INC + STFSDUX f9, CO4, INC + STFSDUX f13, CO4, INC + STFSDUX f10, CO4, INC + STFSDUX f14, CO4, INC + STFSDUX f11, CO4, INC + STFSDUX f15, CO4, INC + +#else + STFPDUX f0, AO, INC4 + STFPDUX f1, AO2, INC4 + STFPDUX f2, AO, INC4 + STFPDUX f3, AO2, INC4 + STFPDUX f4, AO, INC4 + STFPDUX f5, AO2, INC4 + STFPDUX f6, AO, INC4 + STFPDUX f7, AO2, INC4 + STFPDUX f8, AO, INC4 + STFPDUX f9, AO2, INC4 + STFPDUX f10, AO, INC4 + STFPDUX f11, AO2, INC4 + STFPDUX f12, AO, INC4 + STFPDUX f13, AO2, INC4 + STFPDUX f14, AO, INC4 + STFPDUX f15, AO2, INC4 + + subi AO, AO, 32 * SIZE + subi AO2, AO2, 32 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + STFDUX f2, CO1, INC + STFSDUX f2, CO1, INC + STFDUX f3, CO1, INC + STFSDUX f3, CO1, INC + + STFDUX f4, CO2, INC + STFSDUX f4, CO2, INC + STFDUX f5, CO2, INC + STFSDUX f5, CO2, INC + STFDUX f6, CO2, INC + STFSDUX f6, CO2, INC + STFDUX f7, CO2, INC + STFSDUX f7, CO2, INC + + STFDUX f8, CO3, INC + STFSDUX f8, CO3, INC + STFDUX f9, CO3, INC + STFSDUX f9, CO3, INC + STFDUX f10, CO3, INC + STFSDUX f10, CO3, INC + STFDUX f11, CO3, INC + STFSDUX f11, CO3, INC + + STFDUX f12, CO4, INC + STFSDUX f12, CO4, INC + STFDUX f13, CO4, 
INC + STFSDUX f13, CO4, INC + STFDUX f14, CO4, INC + STFSDUX f14, CO4, INC + STFDUX f15, CO4, INC + STFSDUX f15, CO4, INC +#endif + +#ifdef LN + subi CO1, CO1, 8 * SIZE + subi CO2, CO2, 8 * SIZE + subi CO3, CO3, 8 * SIZE + subi CO4, CO4, 8 * SIZE +#endif + +#ifdef RT + slwi r0, K, 3 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 3 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 8 +#endif + +#ifdef LN + subi KK, KK, 8 +#endif + + addic. I, I, -1 + li r0, FZERO + + lfpsx f0, SP, r0 + bgt+ .L11 + .align 4 + +.L20: + andi. I, M, 4 + beq .L30 + +#if defined(LT) || defined(RN) + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 + + srawi. r0, KK, 2 + fpmr f1, f0 + fpmr f5, f0 + fpmr f9, f0 + mtspr CTR, r0 + fpmr f13, f0 + ble .L24 +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 2 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, BO, - 4 * SIZE + fpmr f8, f0 + addi BO2, BO, 2 * SIZE + fpmr f12, f0 + + srawi. r0, TEMP, 2 + fpmr f1, f0 + fpmr f5, f0 + fpmr f9, f0 + mtspr CTR, r0 + fpmr f13, f0 + ble .L24 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B2, BO2, INC4 + LFPDUX A3, AO, INC4 + LFPDUX B3, BO, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX B4, BO2, INC4 + + LFPDUX A5, AO, INC4 + LFPDUX B5, BO, INC4 + LFPDUX A6, AO2, INC4 + LFPDUX B6, BO2, INC4 + LFPDUX A7, AO, INC4 + LFPDUX A9, BO, INC4 + LFPDUX A10, BO2, INC4 + bdz- .L23 + .align 4 + +.L22: + fxcpmadd f0, B1, A1, f0 + nop + fxcsmadd f4, B1, A1, f4 + LFPDUX A8, AO2, INC4 + fxcpmadd f8, B2, A1, f8 + nop + fxcsmadd f12, B2, A1, f12 + LFPDUX A1, AO, INC4 + + fxcpmadd f1, B1, A2, f1 + nop + fxcsmadd f5, B1, A2, f5 + LFPDUX B1, BO, INC4 + fxcpmadd f9, B2, A2, f9 + nop + fxcsmadd f13, B2, A2, f13 + LFPDUX B2, BO2, INC4 + + fxcpmadd f0, B3, A3, f0 + nop + fxcsmadd f4, B3, A3, f4 + LFPDUX A2, AO2, INC4 + fxcpmadd f8, B4, A3, f8 + nop + fxcsmadd f12, B4, A3, f12 + LFPDUX A3, AO, INC4 + + fxcpmadd f1, B3, A4, f1 + nop + fxcsmadd f5, B3, A4, f5 + LFPDUX B3, BO, INC4 + fxcpmadd f9, B4, A4, f9 + nop + fxcsmadd f13, B4, A4, f13 + LFPDUX B4, BO2, INC4 + + fxcpmadd f0, B5, A5, f0 + nop + fxcsmadd f4, B5, A5, f4 + LFPDUX A4, AO2, INC4 + fxcpmadd f8, B6, A5, f8 + nop + fxcsmadd f12, B6, A5, f12 + LFPDUX A5, AO, INC4 + + fxcpmadd f1, B5, A6, f1 + nop + fxcsmadd f5, B5, A6, f5 + LFPDUX B5, BO, INC4 + fxcpmadd f9, B6, A6, f9 + nop + fxcsmadd f13, B6, A6, f13 + LFPDUX B6, BO2, INC4 + + fxcpmadd f0, A9, A7, f0 + nop + fxcsmadd f4, A9, A7, f4 + LFPDUX A6, AO2, INC4 + fxcpmadd f8, A10, A7, f8 + nop + fxcsmadd f12, A10, A7, f12 + LFPDUX A7, AO, INC4 + + fxcpmadd f1, A9, A8, f1 + nop + fxcsmadd f5, A9, A8, f5 + LFPDUX A9, BO, INC4 + fxcpmadd f9, A10, A8, f9 + nop + fxcsmadd f13, A10, A8, f13 + LFPDUX A10, BO2, INC4 + bdnz+ .L22 + .align 4 + +.L23: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + LFPDUX A8, AO2, INC4 + fxcpmadd f8, B2, A1, f8 + fxcsmadd f12, B2, A1, f12 + + fxcpmadd f1, B1, A2, f1 + fxcsmadd f5, B1, A2, f5 + fxcpmadd f9, B2, A2, f9 + fxcsmadd f13, B2, A2, f13 + + fxcpmadd f0, B3, A3, f0 + fxcsmadd f4, B3, A3, f4 + fxcpmadd f8, B4, A3, f8 + fxcsmadd f12, B4, A3, f12 + + fxcpmadd f1, B3, A4, f1 + fxcsmadd f5, B3, A4, f5 + fxcpmadd f9, B4, A4, f9 + fxcsmadd f13, B4, 
A4, f13 + + fxcpmadd f0, B5, A5, f0 + fxcsmadd f4, B5, A5, f4 + fxcpmadd f8, B6, A5, f8 + fxcsmadd f12, B6, A5, f12 + + fxcpmadd f1, B5, A6, f1 + fxcsmadd f5, B5, A6, f5 + fxcpmadd f9, B6, A6, f9 + fxcsmadd f13, B6, A6, f13 + + fxcpmadd f0, A9, A7, f0 + fxcsmadd f4, A9, A7, f4 + fxcpmadd f8, A10, A7, f8 + fxcsmadd f12, A10, A7, f12 + + fxcpmadd f1, A9, A8, f1 + fxcsmadd f5, A9, A8, f5 + fxcpmadd f9, A10, A8, f9 + fxcsmadd f13, A10, A8, f13 + .align 4 + +.L24: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L28 +#else + andi. r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L28 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + bdz- .L27 + .align 4 + +.L26: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + fxcpmadd f8, B2, A1, f8 + fxcsmadd f12, B2, A1, f12 + LFPDUX A1, AO, INC4 + + fxcpmadd f1, B1, A2, f1 + fxcsmadd f5, B1, A2, f5 + LFPDUX B1, BO, INC4 + fxcpmadd f9, B2, A2, f9 + fxcsmadd f13, B2, A2, f13 + LFPDUX A2, AO2, INC4 + LFPDUX B2, BO2, INC4 + bdnz+ .L26 + .align 4 + +.L27: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + fxcpmadd f8, B2, A1, f8 + fxcsmadd f12, B2, A1, f12 + + fxcpmadd f1, B1, A2, f1 + fxcsmadd f5, B1, A2, f5 + fxcpmadd f9, B2, A2, f9 + fxcsmadd f13, B2, A2, f13 + .align 4 + +.L28: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi AO2, AO, 2 * SIZE + addi BO, BO, - 4 * SIZE + addi BO2, BO, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + fpmr f24, f0 + fpmr f25, f1 + fpmr f28, f8 + fpmr f29, f9 + + fsmfp f0, f4 + fsmfp f1, f5 + fsmfp f8, f12 + fsmfp f9, f13 + + fsmtp f4, f24 + fsmtp f5, f25 + fsmtp f12, f28 + fsmtp f13, f29 + + LFPDUX f16, BO, INC4 + LFPDUX f17, BO2, INC4 + LFPDUX f18, BO, INC4 + LFPDUX f19, BO2, INC4 + + LFPDUX f20, BO, INC4 + LFPDUX f21, BO2, INC4 + LFPDUX f22, BO, INC4 + LFPDUX f23, BO2, INC4 + + subi BO, BO, 16 * SIZE + subi BO2, BO2, 16 * SIZE + + fpsub f0, f16, f0 + fpsub f8, f17, f8 + fpsub f4, f18, f4 + fpsub f12, f19, f12 + + fpsub f1, f20, f1 + fpsub f9, f21, f9 + fpsub f5, f22, f5 + fpsub f13, f23, f13 +#else + LFPDUX f16, AO, INC4 + LFPDUX f17, AO2, INC4 + LFPDUX f18, AO, INC4 + LFPDUX f19, AO2, INC4 + LFPDUX f20, AO, INC4 + LFPDUX f21, AO2, INC4 + LFPDUX f22, AO, INC4 + LFPDUX f23, AO2, INC4 + + subi AO, AO, 16 * SIZE + subi AO2, AO2, 16 * SIZE + + fpsub f0, f16, f0 + fpsub f1, f17, f1 + fpsub f4, f18, f4 + fpsub f5, f19, f5 + + fpsub f8, f20, f8 + fpsub f9, f21, f9 + fpsub f12, f22, f12 + fpsub f13, f23, f13 +#endif + +#ifdef LN + addi AO, AO, 20 * SIZE + addi AO2, AO2, 20 * SIZE + + LFPDUX A1, AO2, INCM4 + LFPDUX A2, AO, INCM4 + LFPDUX A3, AO2, INCM4 + LFPDUX A4, AO, INCM4 + + add AO2, AO2, INCM4 + LFPDUX A5, AO, INCM4 + add AO2, AO2, INCM4 + LFPDUX A6, AO, INCM4 + + addi AO, AO, -4 * SIZE + addi AO2, AO2, -4 * SIZE + + fxsmul f5, A1, f5 + fxsmul f13, A1, f13 + + fxcpnmsub f1, A1, f5, f1 + fxcpnmsub f9, A1, f13, f9 + + fxcsnmsub f4, A2, f5, f4 + fxcsnmsub f12, A2, f13, f12 + + fxcpnmsub f0, A2, f5, f0 + fxcpnmsub f8, A2, f13, f8 + + fxpmul f1, A3, f1 + fxpmul f9, A3, f9 + + fxcsnmsub f4, A4, f1, f4 + fxcsnmsub f12, A4, f9, f12 + + fxcpnmsub f0, A4, f1, f0 + fxcpnmsub f8, A4, f9, f8 + + fxsmul f4, A5, f4 + fxsmul f12, A5, f12 + + fxcpnmsub f0, A5, f4, f0 + fxcpnmsub f8, A5, f12, f8 + + fxpmul f0, A6, f0 + fxpmul f8, A6, f8 +#endif + +#ifdef LT + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX A3, AO, INC4 + 
LFPDUX A4, AO2, INC4 + + add AO, AO, INC4 + LFPDUX A5, AO2, INC4 + add AO, AO, INC4 + LFPDUX A6, AO2, INC4 + + subi AO, AO, 16 * SIZE + subi AO2, AO2, 16 * SIZE + + fxpmul f0, A1, f0 + fxpmul f8, A1, f8 + + fxcsnmsub f4, A1, f0, f4 + fxcsnmsub f12, A1, f8, f12 + + fxcpnmsub f1, A2, f0, f1 + fxcpnmsub f9, A2, f8, f9 + + fxcsnmsub f5, A2, f0, f5 + fxcsnmsub f13, A2, f8, f13 + + fxsmul f4, A3, f4 + fxsmul f12, A3, f12 + + fxcpnmsub f1, A4, f4, f1 + fxcpnmsub f9, A4, f12, f9 + + fxcsnmsub f5, A4, f4, f5 + fxcsnmsub f13, A4, f12, f13 + + fxpmul f1, A5, f1 + fxpmul f9, A5, f9 + + fxcsnmsub f5, A5, f1, f5 + fxcsnmsub f13, A5, f9, f13 + + fxsmul f5, A6, f5 + fxsmul f13, A6, f13 +#endif + +#ifdef RN + LFPDUX A1, BO, INC4 + LFPDUX A2, BO2, INC4 + LFPDUX A3, BO, INC4 + LFPDUX A4, BO2, INC4 + + add BO, BO, INC4 + LFPDUX A5, BO2, INC4 + + add BO, BO, INC4 + LFPDUX A6, BO2, INC4 + + subi BO, BO, 16 * SIZE + subi BO2, BO2, 16 * SIZE + + fxpmul f0, A1, f0 + fxpmul f1, A1, f1 + fxcsnmsub f4, A1, f0, f4 + fxcsnmsub f5, A1, f1, f5 + + fxcpnmsub f8, A2, f0, f8 + fxcpnmsub f9, A2, f1, f9 + fxcsnmsub f12, A2, f0, f12 + fxcsnmsub f13, A2, f1, f13 + + fxsmul f4, A3, f4 + fxsmul f5, A3, f5 + fxcpnmsub f8, A4, f4, f8 + fxcpnmsub f9, A4, f5, f9 + + fxcsnmsub f12, A4, f4, f12 + fxcsnmsub f13, A4, f5, f13 + + fxpmul f8, A5, f8 + fxpmul f9, A5, f9 + fxcsnmsub f12, A5, f8, f12 + fxcsnmsub f13, A5, f9, f13 + + fxsmul f12, A6, f12 + fxsmul f13, A6, f13 +#endif + +#ifdef RT + addi BO, BO, 20 * SIZE + addi BO2, BO2, 20 * SIZE + + LFPDUX A1, BO2, INCM4 + LFPDUX A2, BO, INCM4 + + LFPDUX A3, BO2, INCM4 + LFPDUX A4, BO, INCM4 + + add BO2, BO2, INCM4 + LFPDUX A5, BO, INCM4 + + add BO2, BO2, INCM4 + LFPDUX A6, BO, INCM4 + subi BO, BO, 4 * SIZE + subi BO2, BO2, 4 * SIZE + + fxsmul f12, A1, f12 + fxsmul f13, A1, f13 + fxcpnmsub f8, A1, f12, f8 + fxcpnmsub f9, A1, f13, f9 + + fxcsnmsub f4, A2, f12, f4 + fxcsnmsub f5, A2, f13, f5 + fxcpnmsub f0, A2, f12, f0 + fxcpnmsub f1, A2, f13, f1 + + fxpmul f8, A3, f8 + fxpmul f9, A3, f9 + fxcsnmsub f4, A4, f8, f4 + fxcsnmsub f5, A4, f9, f5 + + fxcpnmsub f0, A4, f8, f0 + fxcpnmsub f1, A4, f9, f1 + + fxsmul f4, A5, f4 + fxsmul f5, A5, f5 + fxcpnmsub f0, A5, f4, f0 + fxcpnmsub f1, A5, f5, f1 + + fxpmul f0, A6, f0 + fxpmul f1, A6, f1 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE + subi CO3, CO3, 4 * SIZE + subi CO4, CO4, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC4 + STFPDUX f8, BO2, INC4 + STFPDUX f4, BO, INC4 + STFPDUX f12, BO2, INC4 + STFPDUX f1, BO, INC4 + STFPDUX f9, BO2, INC4 + STFPDUX f5, BO, INC4 + STFPDUX f13, BO2, INC4 + + subi BO, BO, 16 * SIZE + subi BO2, BO2, 16 * SIZE + + STFDUX f0, CO1, INC + STFDUX f4, CO1, INC + STFDUX f1, CO1, INC + STFDUX f5, CO1, INC + + STFSDUX f0, CO2, INC + STFSDUX f4, CO2, INC + STFSDUX f1, CO2, INC + STFSDUX f5, CO2, INC + + STFDUX f8, CO3, INC + STFDUX f12, CO3, INC + STFDUX f9, CO3, INC + STFDUX f13, CO3, INC + + STFSDUX f8, CO4, INC + STFSDUX f12, CO4, INC + STFSDUX f9, CO4, INC + STFSDUX f13, CO4, INC +#else + STFPDUX f0, AO, INC4 + STFPDUX f1, AO2, INC4 + STFPDUX f4, AO, INC4 + STFPDUX f5, AO2, INC4 + STFPDUX f8, AO, INC4 + STFPDUX f9, AO2, INC4 + STFPDUX f12, AO, INC4 + STFPDUX f13, AO2, INC4 + + subi AO, AO, 16 * SIZE + subi AO2, AO2, 16 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + STFDUX f4, CO2, INC + STFSDUX f4, CO2, INC + STFDUX f5, CO2, INC + STFSDUX f5, CO2, INC + + STFDUX f8, CO3, INC + STFSDUX f8, CO3, INC + STFDUX f9, CO3, INC + 
STFSDUX f9, CO3, INC + STFDUX f12, CO4, INC + STFSDUX f12, CO4, INC + STFDUX f13, CO4, INC + STFSDUX f13, CO4, INC +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE + subi CO3, CO3, 4 * SIZE + subi CO4, CO4, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L30: + andi. I, M, 2 + beq .L40 + +#if defined(LT) || defined(RN) + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 + + srawi. r0, KK, 2 + mtspr CTR, r0 + ble .L34 +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 1 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, BO, - 4 * SIZE + fpmr f8, f0 + addi BO2, BO, 2 * SIZE + fpmr f12, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 + ble .L34 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B3, BO, INC4 + LFPDUX B4, BO2, INC4 + + LFPDUX A3, AO, INC4 + LFPDUX A5, BO, INC4 + LFPDUX A6, BO2, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A7, BO, INC4 + LFPDUX A8, BO2, INC4 + bdz- .L33 + .align 4 + +.L32: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + LFPDUX B1, BO, INC4 + fxcpmadd f8, B2, A1, f8 + fxcsmadd f12, B2, A1, f12 + LFPDUX B2, BO2, INC4 + LFPDUX A1, AO, INC4 + + fxcpmadd f0, B3, A2, f0 + fxcsmadd f4, B3, A2, f4 + LFPDUX B3, BO, INC4 + fxcpmadd f8, B4, A2, f8 + fxcsmadd f12, B4, A2, f12 + LFPDUX B4, BO2, INC4 + LFPDUX A2, AO2, INC4 + + fxcpmadd f0, A5, A3, f0 + fxcsmadd f4, A5, A3, f4 + LFPDUX A5, BO, INC4 + fxcpmadd f8, A6, A3, f8 + fxcsmadd f12, A6, A3, f12 + LFPDUX A6, BO2, INC4 + LFPDUX A3, AO, INC4 + + fxcpmadd f0, A7, A4, f0 + fxcsmadd f4, A7, A4, f4 + LFPDUX A7, BO, INC4 + fxcpmadd f8, A8, A4, f8 + fxcsmadd f12, A8, A4, f12 + LFPDUX A8, BO2, INC4 + LFPDUX A4, AO2, INC4 + bdnz+ .L32 + .align 4 + +.L33: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + fxcpmadd f8, B2, A1, f8 + fxcsmadd f12, B2, A1, f12 + + fxcpmadd f0, B3, A2, f0 + fxcsmadd f4, B3, A2, f4 + fxcpmadd f8, B4, A2, f8 + fxcsmadd f12, B4, A2, f12 + + fxcpmadd f0, A5, A3, f0 + fxcsmadd f4, A5, A3, f4 + fxcpmadd f8, A6, A3, f8 + fxcsmadd f12, A6, A3, f12 + + fxcpmadd f0, A7, A4, f0 + fxcsmadd f4, A7, A4, f4 + fxcpmadd f8, A8, A4, f8 + fxcsmadd f12, A8, A4, f12 + .align 4 + +.L34: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L38 +#else + andi. 
r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L38 +#endif + + LFPDX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC2 + bdz- .L37 + .align 4 + +.L36: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + LFPDUX B1, BO, INC4 + fxcpmadd f8, B2, A1, f8 + fxcsmadd f12, B2, A1, f12 + LFPDX A1, AO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC2 + bdnz+ .L36 + .align 4 + +.L37: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + fxcpmadd f8, B2, A1, f8 + fxcsmadd f12, B2, A1, f12 + .align 4 + +.L38: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi AO2, AO, 2 * SIZE + addi BO, BO, - 4 * SIZE + addi BO2, BO, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + fpmr f24, f0 + fpmr f28, f8 + + fsmfp f0, f4 + fsmfp f8, f12 + fsmtp f4, f24 + fsmtp f12, f28 + + LFPDUX f16, BO, INC4 + LFPDUX f17, BO2, INC4 + LFPDUX f18, BO, INC4 + LFPDUX f19, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE + + fpsub f0, f16, f0 + fpsub f8, f17, f8 + fpsub f4, f18, f4 + fpsub f12, f19, f12 +#else + LFPDUX f16, AO, INC4 + LFPDUX f17, AO2, INC4 + LFPDUX f18, AO, INC4 + LFPDUX f19, AO2, INC4 + + subi AO, AO, 8 * SIZE + subi AO2, AO2, 8 * SIZE + + fpsub f0, f16, f0 + fpsub f4, f17, f4 + fpsub f8, f18, f8 + fpsub f12, f19, f12 +#endif + +#ifdef LN + addi AO, AO, 8 * SIZE + addi AO2, AO2, 8 * SIZE + + LFPDUX A1, AO2, INCM4 + LFPDUX A2, AO, INCM4 + + addi AO, AO, -4 * SIZE + addi AO2, AO2, -4 * SIZE + + fxsmul f4, A1, f4 + fxsmul f12, A1, f12 + + fxcpnmsub f0, A1, f4, f0 + fxcpnmsub f8, A1, f12, f8 + + fxpmul f0, A2, f0 + fxpmul f8, A2, f8 +#endif + +#ifdef LT + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + + subi AO, AO, 4 * SIZE + subi AO2, AO2, 4 * SIZE + + fxpmul f0, A1, f0 + fxpmul f8, A1, f8 + + fxcsnmsub f4, A1, f0, f4 + fxcsnmsub f12, A1, f8, f12 + + fxsmul f4, A2, f4 + fxsmul f12, A2, f12 +#endif + +#ifdef RN + LFPDUX A1, BO, INC4 + LFPDUX A2, BO2, INC4 + LFPDUX A3, BO, INC4 + LFPDUX A4, BO2, INC4 + + add BO, BO, INC4 + LFPDUX A5, BO2, INC4 + + add BO, BO, INC4 + LFPDUX A6, BO2, INC4 + + subi BO, BO, 16 * SIZE + subi BO2, BO2, 16 * SIZE + + fxpmul f0, A1, f0 + fxcsnmsub f4, A1, f0, f4 + fxcpnmsub f8, A2, f0, f8 + fxcsnmsub f12, A2, f0, f12 + + fxsmul f4, A3, f4 + fxcpnmsub f8, A4, f4, f8 + fxcsnmsub f12, A4, f4, f12 + + fxpmul f8, A5, f8 + fxcsnmsub f12, A5, f8, f12 + fxsmul f12, A6, f12 +#endif + +#ifdef RT + addi BO, BO, 20 * SIZE + addi BO2, BO2, 20 * SIZE + + LFPDUX A1, BO2, INCM4 + LFPDUX A2, BO, INCM4 + + LFPDUX A3, BO2, INCM4 + LFPDUX A4, BO, INCM4 + + add BO2, BO2, INCM4 + LFPDUX A5, BO, INCM4 + + add BO2, BO2, INCM4 + LFPDUX A6, BO, INCM4 + subi BO, BO, 4 * SIZE + subi BO2, BO2, 4 * SIZE + + fxsmul f12, A1, f12 + fxcpnmsub f8, A1, f12, f8 + fxcsnmsub f4, A2, f12, f4 + fxcpnmsub f0, A2, f12, f0 + + fxpmul f8, A3, f8 + fxcsnmsub f4, A4, f8, f4 + fxcpnmsub f0, A4, f8, f0 + + fxsmul f4, A5, f4 + fxcpnmsub f0, A5, f4, f0 + fxpmul f0, A6, f0 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE + subi CO3, CO3, 2 * SIZE + subi CO4, CO4, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC4 + STFPDUX f8, BO2, INC4 + STFPDUX f4, BO, INC4 + STFPDUX f12, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE + + STFDUX f0, CO1, INC + STFDUX f4, CO1, INC + STFSDUX f0, CO2, INC + STFSDUX f4, CO2, INC + + STFDUX f8, CO3, INC + STFDUX f12, CO3, INC + STFSDUX f8, CO4, INC + STFSDUX f12, CO4, INC + 
+#else + STFPDUX f0, AO, INC4 + STFPDUX f4, AO2, INC4 + STFPDUX f8, AO, INC4 + STFPDUX f12, AO2, INC4 + + subi AO, AO, 8 * SIZE + subi AO2, AO2, 8 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f4, CO2, INC + STFSDUX f4, CO2, INC + + STFDUX f8, CO3, INC + STFSDUX f8, CO3, INC + STFDUX f12, CO4, INC + STFSDUX f12, CO4, INC +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE + subi CO3, CO3, 2 * SIZE + subi CO4, CO4, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L40: + andi. I, M, 1 + beq .L49 + +#if defined(LT) || defined(RN) + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, B, - 4 * SIZE + fpmr f2, f0 + addi BO2, B, - 2 * SIZE + fpmr f3, f0 + + srawi. r0, KK, 3 + mtspr CTR, r0 + ble .L44 +#else + +#ifdef LN + slwi r0, K, 0 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 0 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, BO, - 4 * SIZE + fpmr f2, f0 + addi BO2, BO, 2 * SIZE + fpmr f3, f0 + + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L44 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B3, BO, INC4 + LFPDUX B4, BO2, INC4 + + LFPDUX A3, AO, INC4 + LFPDUX A5, BO, INC4 + LFPDUX A6, BO2, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A7, BO, INC4 + LFPDUX A8, BO2, INC4 + bdz- .L43 + .align 4 + +.L42: + fxcpmadd f0, A1, B1, f0 + LFPDUX B1, BO, INC4 + fxcpmadd f1, A1, B2, f1 + LFPDUX B2, BO2, INC4 + fxcsmadd f2, A1, B3, f2 + LFPDUX B3, BO, INC4 + fxcsmadd f3, A1, B4, f3 + LFPDUX B4, BO2, INC4 + LFPDUX A1, AO, INC4 + + fxcpmadd f0, A2, A5, f0 + LFPDUX A5, BO, INC4 + fxcpmadd f1, A2, A6, f1 + LFPDUX A6, BO2, INC4 + fxcsmadd f2, A2, A7, f2 + LFPDUX A7, BO, INC4 + fxcsmadd f3, A2, A8, f3 + LFPDUX A8, BO2, INC4 + LFPDUX A2, AO2, INC4 + + fxcpmadd f0, A3, B1, f0 + LFPDUX B1, BO, INC4 + fxcpmadd f1, A3, B2, f1 + LFPDUX B2, BO2, INC4 + fxcsmadd f2, A3, B3, f2 + LFPDUX B3, BO, INC4 + fxcsmadd f3, A3, B4, f3 + LFPDUX B4, BO2, INC4 + LFPDUX A3, AO, INC4 + + fxcpmadd f0, A4, A5, f0 + LFPDUX A5, BO, INC4 + fxcpmadd f1, A4, A6, f1 + LFPDUX A6, BO2, INC4 + fxcsmadd f2, A4, A7, f2 + LFPDUX A7, BO, INC4 + fxcsmadd f3, A4, A8, f3 + LFPDUX A8, BO2, INC4 + LFPDUX A4, AO2, INC4 + bdnz+ .L42 + .align 4 + +.L43: + fxcpmadd f0, A1, B1, f0 + LFPDUX B1, BO, INC4 + fxcpmadd f1, A1, B2, f1 + LFPDUX B2, BO2, INC4 + fxcsmadd f2, A1, B3, f2 + LFPDUX B3, BO, INC4 + fxcsmadd f3, A1, B4, f3 + LFPDUX B4, BO2, INC4 + + fxcpmadd f0, A2, A5, f0 + LFPDUX A5, BO, INC4 + fxcpmadd f1, A2, A6, f1 + LFPDUX A6, BO2, INC4 + fxcsmadd f2, A2, A7, f2 + LFPDUX A7, BO, INC4 + fxcsmadd f3, A2, A8, f3 + LFPDUX A8, BO2, INC4 + + fxcpmadd f0, A3, B1, f0 + fxcpmadd f1, A3, B2, f1 + fxcsmadd f2, A3, B3, f2 + fxcsmadd f3, A3, B4, f3 + + fxcpmadd f0, A4, A5, f0 + fxcpmadd f1, A4, A6, f1 + fxcsmadd f2, A4, A7, f2 + fxcsmadd f3, A4, A8, f3 + .align 4 + +.L44: +#if defined(LT) || defined(RN) + andi. r0, KK, 7 + mtspr CTR, r0 + ble+ .L48 +#else + andi. 
r0, TEMP, 7 + mtspr CTR, r0 + ble+ .L48 +#endif + + LFDX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC + bdz- .L47 + .align 4 + +.L46: + fxcpmadd f0, A1, B1, f0 + LFPDUX B1, BO, INC4 + fxcpmadd f1, A1, B2, f1 + LFDX A1, AO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC + bdnz+ .L46 + .align 4 + +.L47: + fxcpmadd f0, A1, B1, f0 + fxcpmadd f1, A1, B2, f1 + addi AO2, AO, 2 * SIZE + .align 4 + +.L48: + fpadd f0, f0, f2 + fpadd f1, f1, f3 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi AO2, AO, 2 * SIZE + addi BO, BO, - 4 * SIZE + addi BO2, BO, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDX f16, BO, INC4 + LFPDX f17, BO2, INC4 + + fpsub f0, f16, f0 + fpsub f1, f17, f1 +#else + LFPDX f16, AO, INC4 + LFPDX f17, AO2, INC4 + + fpsub f0, f16, f0 + fpsub f1, f17, f1 +#endif + +#if defined(LN) || defined(LT) + LFPDX A1, AO, INC4 + + fxpmul f0, A1, f0 + fxpmul f1, A1, f1 +#endif + +#ifdef RN + LFD A1, (4 + 0) * SIZE(BO) + LFD A2, (4 + 1) * SIZE(BO) + LFD A3, (4 + 2) * SIZE(BO) + LFD A4, (4 + 3) * SIZE(BO) + + LFD A5, (4 + 5) * SIZE(BO) + LFD A6, (4 + 6) * SIZE(BO) + LFD A7, (4 + 7) * SIZE(BO) + LFD A8, (4 + 10) * SIZE(BO) + + LFD A9, (4 + 11) * SIZE(BO) + LFD A10, (4 + 15) * SIZE(BO) + + fsmtp f2, f0 + fsmtp f3, f1 + + fmul f0, A1, f0 + fnmsub f2, A2, f0, f2 + fnmsub f1, A3, f0, f1 + fnmsub f3, A4, f0, f3 + + fmul f2, A5, f2 + fnmsub f1, A6, f2, f1 + fnmsub f3, A7, f2, f3 + + fmul f1, A8, f1 + fnmsub f3, A9, f1, f3 + + fmul f3, A10, f3 + + fsmfp f0, f2 + fsmfp f1, f3 +#endif + +#ifdef RT + LFD A1, (4 + 15) * SIZE(BO) + LFD A2, (4 + 14) * SIZE(BO) + LFD A3, (4 + 13) * SIZE(BO) + LFD A4, (4 + 12) * SIZE(BO) + + LFD A5, (4 + 10) * SIZE(BO) + LFD A6, (4 + 9) * SIZE(BO) + LFD A7, (4 + 8) * SIZE(BO) + LFD A8, (4 + 5) * SIZE(BO) + + LFD A9, (4 + 4) * SIZE(BO) + LFD A10, (4 + 0) * SIZE(BO) + + fsmtp f2, f0 + fsmtp f3, f1 + + fmul f3, A1, f3 + fnmsub f1, A2, f3, f1 + fnmsub f2, A3, f3, f2 + fnmsub f0, A4, f3, f0 + + fmul f1, A5, f1 + fnmsub f2, A6, f1, f2 + fnmsub f0, A7, f1, f0 + + fmul f2, A8, f2 + fnmsub f0, A9, f2, f0 + + fmul f0, A10, f0 + + fsmfp f0, f2 + fsmfp f1, f3 +#endif + +#if defined(LN) || defined(LT) + STFPDX f0, BO, INC4 + STFPDX f1, BO2, INC4 +#else + STFPDX f0, AO, INC4 + STFPDX f1, AO2, INC4 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE + subi CO3, CO3, 1 * SIZE + subi CO4, CO4, 1 * SIZE +#endif + + STFDUX f0, CO1, INC + STFSDUX f0, CO2, INC + STFDUX f1, CO3, INC + STFSDUX f1, CO4, INC + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE + subi CO3, CO3, 1 * SIZE + subi CO4, CO4, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +.L49: +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + addi B, BO, 4 * SIZE +#endif + +#ifdef RN + addi KK, KK, 4 +#endif + +#ifdef RT + subi KK, KK, 4 +#endif + + addic. J, J, -1 + bgt+ .L10 + .align 4 + +.L50: + andi. 
J, N, 2 + beq .L90 + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + +#if defined(LN) || defined(RT) + addi AORIG, A, -2 * SIZE +#else + addi AO, A, -2 * SIZE +#endif +#ifndef RT + add C, CO2, LDC +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + + srawi. I, M, 3 + ble .L60 + .align 4 + +.L51: +#if defined(LT) || defined(RN) + fpmr f4, f0 + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f5, f0 + fpmr f2, f0 + fpmr f6, f0 + + srawi. r0, KK, 2 + fpmr f3, f0 + mtspr CTR, r0 + fpmr f7, f0 + ble .L54 +#else + +#ifdef LN + slwi r0, K, 3 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 3 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + fpmr f4, f0 + addi BO, BO, - 2 * SIZE + fpmr f1, f0 + fpmr f5, f0 + fpmr f2, f0 + fpmr f6, f0 + + srawi. r0, TEMP, 2 + fpmr f3, f0 + mtspr CTR, r0 + fpmr f7, f0 + ble .L54 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX B3, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + bdz- .L53 + .align 4 + +.L52: + fxcpmadd f0, B1, A1, f0 + LFPDUX B4, BO, INC2 + fxcsmadd f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + nop + fxcsmadd f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + fxcpmadd f2, B1, A3, f2 + nop + fxcsmadd f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + nop + fxcsmadd f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + + fxcpmadd f0, B2, A5, f0 + LFPDUX B1, BO, INC2 + fxcsmadd f4, B2, A5, f4 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B2, A6, f1 + nop + fxcsmadd f5, B2, A6, f5 + LFPDUX A6, AO, INC2 + + fxcpmadd f2, B2, A7, f2 + nop + fxcsmadd f6, B2, A7, f6 + LFPDUX A7, AO, INC2 + fxcpmadd f3, B2, A8, f3 + nop + fxcsmadd f7, B2, A8, f7 + LFPDUX A8, AO, INC2 + + fxcpmadd f0, B3, A1, f0 + LFPDUX B2, BO, INC2 + fxcsmadd f4, B3, A1, f4 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B3, A2, f1 + nop + fxcsmadd f5, B3, A2, f5 + LFPDUX A2, AO, INC2 + + fxcpmadd f2, B3, A3, f2 + nop + fxcsmadd f6, B3, A3, f6 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B3, A4, f3 + nop + fxcsmadd f7, B3, A4, f7 + LFPDUX A4, AO, INC2 + + fxcpmadd f0, B4, A5, f0 + LFPDUX B3, BO, INC2 + fxcsmadd f4, B4, A5, f4 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B4, A6, f1 + nop + fxcsmadd f5, B4, A6, f5 + LFPDUX A6, AO, INC2 + + fxcpmadd f2, B4, A7, f2 + nop + fxcsmadd f6, B4, A7, f6 + LFPDUX A7, AO, INC2 + fxcpmadd f3, B4, A8, f3 + nop + fxcsmadd f7, B4, A8, f7 + LFPDUX A8, AO, INC2 + bdnz+ .L52 + .align 4 + +.L53: + fxcpmadd f0, B1, A1, f0 + LFPDUX B4, BO, INC2 + fxcsmadd f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + nop + fxcsmadd f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + fxcpmadd f2, B1, A3, f2 + nop + fxcsmadd f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + nop + fxcsmadd f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + + fxcpmadd f0, B2, A5, f0 + nop + fxcsmadd f4, B2, A5, f4 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B2, A6, f1 + nop + fxcsmadd f5, B2, A6, f5 + LFPDUX A6, AO, INC2 + + fxcpmadd f2, B2, A7, f2 + nop + fxcsmadd f6, B2, A7, f6 + LFPDUX A7, AO, INC2 + fxcpmadd f3, B2, A8, f3 + nop + fxcsmadd f7, B2, A8, f7 + LFPDUX A8, AO, INC2 + + fxcpmadd f0, B3, A1, f0 + fxcsmadd f4, B3, A1, f4 + fxcpmadd f1, B3, A2, f1 + fxcsmadd f5, B3, A2, f5 + + fxcpmadd f2, B3, A3, f2 + fxcsmadd f6, B3, A3, f6 + fxcpmadd f3, B3, 
A4, f3 + fxcsmadd f7, B3, A4, f7 + + fxcpmadd f0, B4, A5, f0 + fxcsmadd f4, B4, A5, f4 + fxcpmadd f1, B4, A6, f1 + fxcsmadd f5, B4, A6, f5 + + fxcpmadd f2, B4, A7, f2 + fxcsmadd f6, B4, A7, f6 + fxcpmadd f3, B4, A8, f3 + fxcsmadd f7, B4, A8, f7 + .align 4 + +.L54: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L58 +#else + andi. r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L58 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + bdz- .L57 + .align 4 + +.L56: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + fxcpmadd f2, B1, A3, f2 + fxcsmadd f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + fxcsmadd f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + LFPDUX B1, BO, INC2 + bdnz+ .L56 + .align 4 + +.L57: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f5, B1, A2, f5 + + fxcpmadd f2, B1, A3, f2 + fxcsmadd f6, B1, A3, f6 + fxcpmadd f3, B1, A4, f3 + fxcsmadd f7, B1, A4, f7 + .align 4 + +.L58: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 8 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 3 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + fpmr f24, f0 + fpmr f25, f1 + fpmr f26, f2 + fpmr f27, f3 + + fsmfp f0, f4 + fsmfp f1, f5 + fsmfp f2, f6 + fsmfp f3, f7 + + fsmtp f4, f24 + fsmtp f5, f25 + fsmtp f6, f26 + fsmtp f7, f27 + + LFPDUX f16, BO, INC2 + LFPDUX f17, BO, INC2 + LFPDUX f18, BO, INC2 + LFPDUX f19, BO, INC2 + + LFPDUX f20, BO, INC2 + LFPDUX f21, BO, INC2 + LFPDUX f22, BO, INC2 + LFPDUX f23, BO, INC2 + + subi BO, BO, 16 * SIZE + + fpsub f0, f16, f0 + fpsub f4, f17, f4 + fpsub f1, f18, f1 + fpsub f5, f19, f5 + + fpsub f2, f20, f2 + fpsub f6, f21, f6 + fpsub f3, f22, f3 + fpsub f7, f23, f7 + +#else + LFPDUX f16, AO, INC2 + LFPDUX f17, AO, INC2 + LFPDUX f18, AO, INC2 + LFPDUX f19, AO, INC2 + + LFPDUX f20, AO, INC2 + LFPDUX f21, AO, INC2 + LFPDUX f22, AO, INC2 + LFPDUX f23, AO, INC2 + + subi AO, AO, 16 * SIZE + + fpsub f0, f16, f0 + fpsub f1, f17, f1 + fpsub f2, f18, f2 + fpsub f3, f19, f3 + fpsub f4, f20, f4 + fpsub f5, f21, f5 + fpsub f6, f22, f6 + fpsub f7, f23, f7 +#endif + +#ifdef LN + addi AO, AO, 66 * SIZE + + LFPDUX A1, AO, INCM2 + LFPDUX A2, AO, INCM2 + LFPDUX A3, AO, INCM2 + LFPDUX A4, AO, INCM2 + LFPDUX A5, AO, INCM2 + LFPDUX A6, AO, INCM2 + LFPDUX A7, AO, INCM2 + LFPDUX A8, AO, INCM2 + + fxsmul f7, A1, f7 + fxcpnmsub f3, A1, f7, f3 + fxcsnmsub f6, A2, f7, f6 + fxcpnmsub f2, A2, f7, f2 + + fxcsnmsub f5, A3, f7, f5 + fxcpnmsub f1, A3, f7, f1 + fxcsnmsub f4, A4, f7, f4 + fxcpnmsub f0, A4, f7, f0 + + fxpmul f3, A5, f3 + fxcsnmsub f6, A6, f3, f6 + fxcpnmsub f2, A6, f3, f2 + + fxcsnmsub f5, A7, f3, f5 + fxcpnmsub f1, A7, f3, f1 + fxcsnmsub f4, A8, f3, f4 + fxcpnmsub f0, A8, f3, f0 + + add AO, AO, INCM2 + LFPDUX A1, AO, INCM2 + LFPDUX A2, AO, INCM2 + LFPDUX A3, AO, INCM2 + + add AO, AO, INCM2 + LFPDUX A4, AO, INCM2 + LFPDUX A5, AO, INCM2 + LFPDUX A6, AO, INCM2 + + add AO, AO, INCM2 + add AO, AO, INCM2 + LFPDUX A7, AO, INCM2 + LFPDUX A8, AO, INCM2 + + fxsmul f6, A1, f6 + fxcpnmsub f2, A1, f6, f2 + fxcsnmsub f5, A2, f6, f5 + fxcpnmsub f1, A2, f6, f1 + fxcsnmsub f4, A3, f6, f4 + fxcpnmsub f0, A3, f6, f0 + + fxpmul f2, A4, f2 + fxcsnmsub f5, A5, f2, f5 + fxcpnmsub f1, A5, f2, f1 + fxcsnmsub f4, A6, f2, f4 + fxcpnmsub f0, A6, f2, f0 + + fxsmul f5, A7, f5 
+ fxcpnmsub f1, A7, f5, f1 + fxcsnmsub f4, A8, f5, f4 + fxcpnmsub f0, A8, f5, f0 + + add AO, AO, INCM2 + add AO, AO, INCM2 + LFPDUX A1, AO, INCM2 + LFPDUX A2, AO, INCM2 + + subi AO, AO, 6 * SIZE + LFPDUX A3, AO, INCM2 + subi AO, AO, 6 * SIZE + LFPDUX A4, AO, INCM2 + + addi AO, AO, -2 * SIZE + + fxpmul f1, A1, f1 + fxcsnmsub f4, A2, f1, f4 + fxcpnmsub f0, A2, f1, f0 + + fxsmul f4, A3, f4 + fxcpnmsub f0, A3, f4, f0 + + fxpmul f0, A4, f0 +#endif + +#ifdef LT + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + + fxpmul f0, A1, f0 + fxcsnmsub f4, A1, f0, f4 + fxcpnmsub f1, A2, f0, f1 + fxcsnmsub f5, A2, f0, f5 + fxcpnmsub f2, A3, f0, f2 + fxcsnmsub f6, A3, f0, f6 + fxcpnmsub f3, A4, f0, f3 + fxcsnmsub f7, A4, f0, f7 + + fxsmul f4, A5, f4 + fxcpnmsub f1, A6, f4, f1 + fxcsnmsub f5, A6, f4, f5 + fxcpnmsub f2, A7, f4, f2 + fxcsnmsub f6, A7, f4, f6 + fxcpnmsub f3, A8, f4, f3 + fxcsnmsub f7, A8, f4, f7 + + add AO, AO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + + add AO, AO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + + add AO, AO, INC2 + add AO, AO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + + fxpmul f1, A1, f1 + fxcsnmsub f5, A1, f1, f5 + fxcpnmsub f2, A2, f1, f2 + fxcsnmsub f6, A2, f1, f6 + fxcpnmsub f3, A3, f1, f3 + fxcsnmsub f7, A3, f1, f7 + + fxsmul f5, A4, f5 + fxcpnmsub f2, A5, f5, f2 + fxcsnmsub f6, A5, f5, f6 + fxcpnmsub f3, A6, f5, f3 + fxcsnmsub f7, A6, f5, f7 + + fxpmul f2, A7, f2 + fxcsnmsub f6, A7, f2, f6 + fxcpnmsub f3, A8, f2, f3 + fxcsnmsub f7, A8, f2, f7 + + add AO, AO, INC2 + add AO, AO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + + addi AO, AO, 6 * SIZE + LFPDUX A3, AO, INC2 + addi AO, AO, 6 * SIZE + LFPDUX A4, AO, INC2 + + subi AO, AO, 64 * SIZE + + fxsmul f6, A1, f6 + fxcpnmsub f3, A2, f6, f3 + fxcsnmsub f7, A2, f6, f7 + + fxpmul f3, A3, f3 + fxcsnmsub f7, A3, f3, f7 + + fxsmul f7, A4, f7 +#endif + +#ifdef RN + LFPDUX A1, BO, INC2 + LFPDUX A2, BO, INC2 + + subi BO, BO, 4 * SIZE + + fxpmul f0, A1, f0 + fxpmul f1, A1, f1 + fxpmul f2, A1, f2 + fxpmul f3, A1, f3 + + fxcsnmsub f4, A1, f0, f4 + fxcsnmsub f5, A1, f1, f5 + fxcsnmsub f6, A1, f2, f6 + fxcsnmsub f7, A1, f3, f7 + + fxsmul f4, A2, f4 + fxsmul f5, A2, f5 + fxsmul f6, A2, f6 + fxsmul f7, A2, f7 +#endif + +#ifdef RT + LFPDUX A2, BO, INC2 + LFPDUX A1, BO, INC2 + + subi BO, BO, 4 * SIZE + + fxsmul f4, A1, f4 + fxsmul f5, A1, f5 + fxsmul f6, A1, f6 + fxsmul f7, A1, f7 + + fxcpnmsub f0, A1, f4, f0 + fxcpnmsub f1, A1, f5, f1 + fxcpnmsub f2, A1, f6, f2 + fxcpnmsub f3, A1, f7, f3 + + fxpmul f0, A2, f0 + fxpmul f1, A2, f1 + fxpmul f2, A2, f2 + fxpmul f3, A2, f3 + +#endif + +#ifdef LN + subi CO1, CO1, 8 * SIZE + subi CO2, CO2, 8 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC2 + STFPDUX f4, BO, INC2 + STFPDUX f1, BO, INC2 + STFPDUX f5, BO, INC2 + STFPDUX f2, BO, INC2 + STFPDUX f6, BO, INC2 + STFPDUX f3, BO, INC2 + STFPDUX f7, BO, INC2 + + subi BO, BO, 16 * SIZE + + STFDUX f0, CO1, INC + STFDUX f4, CO1, INC + STFDUX f1, CO1, INC + STFDUX f5, CO1, INC + STFDUX f2, CO1, INC + STFDUX f6, CO1, INC + STFDUX f3, CO1, INC + STFDUX f7, CO1, INC + + STFSDUX f0, CO2, INC + STFSDUX f4, CO2, INC + STFSDUX f1, CO2, INC + STFSDUX f5, CO2, INC + STFSDUX f2, CO2, INC + STFSDUX f6, CO2, INC + STFSDUX f3, CO2, INC + STFSDUX f7, CO2, INC +#else + STFPDUX f0, AO, INC2 + STFPDUX f1, AO, INC2 + STFPDUX f2, AO, INC2 + STFPDUX f3, AO, INC2 
+ STFPDUX f4, AO, INC2 + STFPDUX f5, AO, INC2 + STFPDUX f6, AO, INC2 + STFPDUX f7, AO, INC2 + + subi AO, AO, 16 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + STFDUX f2, CO1, INC + STFSDUX f2, CO1, INC + STFDUX f3, CO1, INC + STFSDUX f3, CO1, INC + + STFDUX f4, CO2, INC + STFSDUX f4, CO2, INC + STFDUX f5, CO2, INC + STFSDUX f5, CO2, INC + STFDUX f6, CO2, INC + STFSDUX f6, CO2, INC + STFDUX f7, CO2, INC + STFSDUX f7, CO2, INC +#endif + +#ifdef LN + subi CO1, CO1, 8 * SIZE + subi CO2, CO2, 8 * SIZE +#endif + +#ifdef RT + slwi r0, K, 3 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 3 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 8 +#endif + +#ifdef LN + subi KK, KK, 8 +#endif + + addic. I, I, -1 + li r0, FZERO + + lfpsx f0, SP, r0 + bgt+ .L51 + .align 4 + +.L60: + andi. I, M, 4 + beq .L70 + +#if defined(LT) || defined(RN) + fpmr f1, f0 + addi BO, B, - 2 * SIZE + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, KK, 2 + mtspr CTR, r0 + ble .L64 +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 2 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + fpmr f1, f0 + addi BO, B, - 2 * SIZE + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 + ble .L64 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX B3, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX B4, BO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + bdz- .L63 + .align 4 + +.L62: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f2, B1, A1, f2 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f3, B1, A2, f3 + LFPDUX A2, AO, INC2 + LFPDUX B1, BO, INC2 + + fxcpmadd f0, B2, A3, f0 + fxcsmadd f2, B2, A3, f2 + LFPDUX A3, AO, INC2 + fxcpmadd f1, B2, A4, f1 + fxcsmadd f3, B2, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + + fxcpmadd f0, B3, A5, f0 + fxcsmadd f2, B3, A5, f2 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B3, A6, f1 + fxcsmadd f3, B3, A6, f3 + LFPDUX A6, AO, INC2 + LFPDUX B3, BO, INC2 + + fxcpmadd f0, B4, A7, f0 + fxcsmadd f2, B4, A7, f2 + LFPDUX A7, AO, INC2 + fxcpmadd f1, B4, A8, f1 + fxcsmadd f3, B4, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B4, BO, INC2 + bdnz+ .L62 + .align 4 + +.L63: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f2, B1, A1, f2 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f3, B1, A2, f3 + + fxcpmadd f0, B2, A3, f0 + fxcsmadd f2, B2, A3, f2 + fxcpmadd f1, B2, A4, f1 + fxcsmadd f3, B2, A4, f3 + + fxcpmadd f0, B3, A5, f0 + fxcsmadd f2, B3, A5, f2 + fxcpmadd f1, B3, A6, f1 + fxcsmadd f3, B3, A6, f3 + + fxcpmadd f0, B4, A7, f0 + fxcsmadd f2, B4, A7, f2 + fxcpmadd f1, B4, A8, f1 + fxcsmadd f3, B4, A8, f3 + .align 4 + +.L64: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L68 +#else + andi. 
r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L68 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + bdz- .L67 + .align 4 + +.L66: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f2, B1, A1, f2 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f3, B1, A2, f3 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + bdnz+ .L66 + .align 4 + +.L67: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f2, B1, A1, f2 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f3, B1, A2, f3 + .align 4 + +.L68: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + fpmr f24, f0 + fpmr f25, f1 + + fsmfp f0, f2 + fsmfp f1, f3 + fsmtp f2, f24 + fsmtp f3, f25 + + LFPDUX f16, BO, INC2 + LFPDUX f17, BO, INC2 + LFPDUX f18, BO, INC2 + LFPDUX f19, BO, INC2 + + subi BO, BO, 8 * SIZE + + fpsub f0, f16, f0 + fpsub f2, f17, f2 + fpsub f1, f18, f1 + fpsub f3, f19, f3 +#else + LFPDUX f16, AO, INC2 + LFPDUX f17, AO, INC2 + LFPDUX f18, AO, INC2 + LFPDUX f19, AO, INC2 + + subi AO, AO, 8 * SIZE + + fpsub f0, f16, f0 + fpsub f1, f17, f1 + fpsub f2, f18, f2 + fpsub f3, f19, f3 +#endif + +#ifdef LN + addi AO, AO, 18 * SIZE + + LFPDUX A1, AO, INCM2 + LFPDUX A2, AO, INCM2 + LFPDUX A3, AO, INCM2 + LFPDUX A4, AO, INCM2 + add AO, AO, INCM2 + LFPDUX A5, AO, INCM2 + add AO, AO, INCM2 + LFPDUX A6, AO, INCM2 + + subi AO, AO, 2 * SIZE + + fxsmul f3, A1, f3 + fxcpnmsub f1, A1, f3, f1 + fxcsnmsub f2, A2, f3, f2 + fxcpnmsub f0, A2, f3, f0 + + fxpmul f1, A3, f1 + fxcsnmsub f2, A4, f1, f2 + fxcpnmsub f0, A4, f1, f0 + + fxsmul f2, A5, f2 + fxcpnmsub f0, A5, f2, f0 + + fxpmul f0, A6, f0 +#endif + +#ifdef LT + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + add AO, AO, INC2 + LFPDUX A5, AO, INC2 + add AO, AO, INC2 + LFPDUX A6, AO, INC2 + + subi AO, AO, 16 * SIZE + + fxpmul f0, A1, f0 + fxcsnmsub f2, A1, f0, f2 + fxcpnmsub f1, A2, f0, f1 + fxcsnmsub f3, A2, f0, f3 + + fxsmul f2, A3, f2 + fxcpnmsub f1, A4, f2, f1 + fxcsnmsub f3, A4, f2, f3 + + fxpmul f1, A5, f1 + fxcsnmsub f3, A5, f1, f3 + + fxsmul f3, A6, f3 +#endif + +#ifdef RN + LFPDUX A1, BO, INC2 + LFPDUX A2, BO, INC2 + + subi BO, BO, 4 * SIZE + + fxpmul f0, A1, f0 + fxpmul f1, A1, f1 + + fxcsnmsub f2, A1, f0, f2 + fxcsnmsub f3, A1, f1, f3 + + fxsmul f2, A2, f2 + fxsmul f3, A2, f3 +#endif + +#ifdef RT + LFPDUX A2, BO, INC2 + LFPDUX A1, BO, INC2 + + subi BO, BO, 4 * SIZE + + fxsmul f2, A1, f2 + fxsmul f3, A1, f3 + + fxcpnmsub f0, A1, f2, f0 + fxcpnmsub f1, A1, f3, f1 + + fxpmul f0, A2, f0 + fxpmul f1, A2, f1 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC2 + STFPDUX f2, BO, INC2 + STFPDUX f1, BO, INC2 + STFPDUX f3, BO, INC2 + + subi BO, BO, 8 * SIZE + + STFDUX f0, CO1, INC + STFDUX f2, CO1, INC + STFDUX f1, CO1, INC + STFDUX f3, CO1, INC + + STFSDUX f0, CO2, INC + STFSDUX f2, CO2, INC + STFSDUX f1, CO2, INC + STFSDUX f3, CO2, INC +#else + STFPDUX f0, AO, INC2 + STFPDUX f1, AO, INC2 + STFPDUX f2, AO, INC2 + STFPDUX f3, AO, INC2 + + subi AO, AO, 8 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + + STFDUX f2, CO2, INC + STFSDUX f2, CO2, INC + STFDUX f3, CO2, INC + STFSDUX f3, CO2, INC +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + 
+#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L70: + andi. I, M, 2 + beq .L80 + +#if defined(LT) || defined(RN) + addi BO, B, - 2 * SIZE + fpmr f1, f0 + + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, KK, 3 + mtspr CTR, r0 + ble .L74 +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 1 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi BO, B, - 2 * SIZE + fpmr f1, f0 + + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L74 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + + LFPDUX A5, AO, INC2 + LFPDUX B5, BO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX B6, BO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A9, BO, INC2 + LFPDUX A8, AO, INC2 + LFPDUX A10, BO, INC2 + bdz- .L73 + .align 4 + +.L72: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + fxcpmadd f2, B2, A2, f2 + fxcsmadd f3, B2, A2, f3 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + + fxcpmadd f0, B3, A3, f0 + fxcsmadd f1, B3, A3, f1 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + fxcpmadd f2, B4, A4, f2 + fxcsmadd f3, B4, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + + fxcpmadd f0, B5, A5, f0 + fxcsmadd f1, B5, A5, f1 + LFPDUX A5, AO, INC2 + LFPDUX B5, BO, INC2 + fxcpmadd f2, B6, A6, f2 + fxcsmadd f3, B6, A6, f3 + LFPDUX A6, AO, INC2 + LFPDUX B6, BO, INC2 + + fxcpmadd f0, A9, A7, f0 + fxcsmadd f1, A9, A7, f1 + LFPDUX A7, AO, INC2 + LFPDUX A9, BO, INC2 + fxcpmadd f2, A10, A8, f2 + fxcsmadd f3, A10, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX A10, BO, INC2 + bdnz+ .L72 + .align 4 + +.L73: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + fxcpmadd f2, B2, A2, f2 + fxcsmadd f3, B2, A2, f3 + + fxcpmadd f0, B3, A3, f0 + fxcsmadd f1, B3, A3, f1 + fxcpmadd f2, B4, A4, f2 + fxcsmadd f3, B4, A4, f3 + + fxcpmadd f0, B5, A5, f0 + fxcsmadd f1, B5, A5, f1 + fxcpmadd f2, B6, A6, f2 + fxcsmadd f3, B6, A6, f3 + + fxcpmadd f0, A9, A7, f0 + fxcsmadd f1, A9, A7, f1 + fxcpmadd f2, A10, A8, f2 + fxcsmadd f3, A10, A8, f3 + .align 4 + +.L74: +#if defined(LT) || defined(RN) + andi. r0, KK, 7 + mtspr CTR, r0 + ble+ .L78 +#else + andi. 
r0, TEMP, 7 + mtspr CTR, r0 + ble+ .L78 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + bdz- .L77 + .align 4 + +.L76: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + bdnz+ .L76 + .align 4 + +.L77: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + .align 4 + +.L78: + fpadd f0, f0, f2 + fpadd f1, f1, f3 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + fpmr f24, f0 + fsmfp f0, f1 + fsmtp f1, f24 + + LFPDUX f16, BO, INC2 + LFPDUX f17, BO, INC2 + + subi BO, BO, 4 * SIZE + + fpsub f0, f16, f0 + fpsub f1, f17, f1 +#else + LFPDUX f16, AO, INC2 + LFPDUX f17, AO, INC2 + + subi AO, AO, 4 * SIZE + + fpsub f0, f16, f0 + fpsub f1, f17, f1 +#endif + +#ifdef LN + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + + addi AO, AO, -4 * SIZE + + fxsmul f1, A2, f1 + fxcpnmsub f0, A2, f1, f0 + fxpmul f0, A1, f0 +#endif + +#ifdef LT + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + + addi AO, AO, -4 * SIZE + + fxpmul f0, A1, f0 + fxcsnmsub f1, A1, f0, f1 + + fxsmul f1, A2, f1 +#endif + +#ifdef RN + LFPDUX A1, BO, INC2 + LFPDUX A2, BO, INC2 + + subi BO, BO, 4 * SIZE + + fxpmul f0, A1, f0 + fxcsnmsub f1, A1, f0, f1 + + fxsmul f1, A2, f1 +#endif + +#ifdef RT + LFPDUX A2, BO, INC2 + LFPDUX A1, BO, INC2 + + subi BO, BO, 4 * SIZE + + fxsmul f1, A1, f1 + fxcpnmsub f0, A1, f1, f0 + fxpmul f0, A2, f0 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC2 + STFPDUX f1, BO, INC2 + + subi BO, BO, 4 * SIZE + + STFDUX f0, CO1, INC + STFDUX f1, CO1, INC + + STFSDUX f0, CO2, INC + STFSDUX f1, CO2, INC +#else + STFPDUX f0, AO, INC2 + STFPDUX f1, AO, INC2 + + subi AO, AO, 4 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + + STFDUX f1, CO2, INC + STFSDUX f1, CO2, INC +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L80: + andi. I, M, 1 + beq .L89 + +#if defined(LT) || defined(RN) + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, KK, 3 + mtspr CTR, r0 + ble .L84 +#else + +#ifdef LN + slwi r0, K, 0 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 0 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. 
r0, TEMP, 3 + mtspr CTR, r0 + ble .L84 + +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX B3, BO, INC2 + LFPDUX B4, BO, INC2 + bdz- .L83 + .align 4 + +.L82: + fxcpmadd f0, A1, B1, f0 + LFPDUX B1, BO, INC2 + fxcsmadd f1, A1, B2, f1 + LFPDUX B2, BO, INC2 + LFPDUX A1, AO, INC2 + fxcpmadd f2, A2, B3, f2 + LFPDUX B3, BO, INC2 + fxcsmadd f3, A2, B4, f3 + LFPDUX B4, BO, INC2 + LFPDUX A2, AO, INC2 + + fxcpmadd f0, A3, B1, f0 + LFPDUX B1, BO, INC2 + fxcsmadd f1, A3, B2, f1 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + fxcpmadd f2, A4, B3, f2 + LFPDUX B3, BO, INC2 + fxcsmadd f3, A4, B4, f3 + LFPDUX B4, BO, INC2 + LFPDUX A4, AO, INC2 + bdnz+ .L82 + .align 4 + +.L83: + fxcpmadd f0, A1, B1, f0 + LFPDUX B1, BO, INC2 + fxcsmadd f1, A1, B2, f1 + LFPDUX B2, BO, INC2 + fxcpmadd f2, A2, B3, f2 + LFPDUX B3, BO, INC2 + fxcsmadd f3, A2, B4, f3 + LFPDUX B4, BO, INC2 + + fxcpmadd f0, A3, B1, f0 + fxcsmadd f1, A3, B2, f1 + fxcpmadd f2, A4, B3, f2 + fxcsmadd f3, A4, B4, f3 + .align 4 + +.L84: +#if defined(LT) || defined(RN) + andi. r0, KK, 7 + mtspr CTR, r0 + ble+ .L88 +#else + andi. r0, TEMP, 7 + mtspr CTR, r0 + ble+ .L88 +#endif + + LFDX A1, AO, INC2 + LFPDUX B1, BO, INC2 + add AO, AO, INC + bdz- .L87 + .align 4 + +.L86: + fxcpmadd f0, A1, B1, f0 + LFDX A1, AO, INC2 + LFPDUX B1, BO, INC2 + add AO, AO, INC + bdnz+ .L86 + .align 4 + +.L87: + fxcpmadd f0, A1, B1, f0 + .align 4 + +.L88: + fpadd f0, f0, f1 + fpadd f2, f2, f3 + fpadd f0, f0, f2 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDX f16, BO, INC2 + + fpsub f0, f16, f0 +#else + LFPDX f16, AO, INC2 + + fpsub f0, f16, f0 +#endif + +#ifdef LN + LFPDX A1, AO, INC2 + + fxpmul f0, A1, f0 +#endif + +#ifdef LT + LFPDX A1, AO, INC2 + + fxpmul f0, A1, f0 +#endif + +#ifdef RN + LFD A1, (2 + 0) * SIZE(BO) + LFD A2, (2 + 1) * SIZE(BO) + LFD A3, (2 + 3) * SIZE(BO) + + fsmtp f1, f0 + + fmul f0, A1, f0 + fnmsub f1, A2, f0, f1 + + fmul f1, A3, f1 + fsmfp f0, f1 +#endif + +#ifdef RT + LFD A1, (2 + 3) * SIZE(BO) + LFD A2, (2 + 2) * SIZE(BO) + LFD A3, (2 + 0) * SIZE(BO) + + fsmtp f1, f0 + + fmul f1, A1, f1 + fnmsub f0, A2, f1, f0 + + fmul f0, A3, f0 + fsmfp f0, f1 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDX f0, BO, INC2 + + STFDUX f0, CO1, INC + STFSDUX f0, CO2, INC +#else + STFPDX f0, AO, INC2 + + STFDUX f0, CO1, INC + STFDUX f1, CO2, INC +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +.L89: +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + addi B, BO, 2 * SIZE +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + .align 4 + +.L90: + andi. 
J, N, 1 + beq .L999 + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + addi AORIG, A, -2 * SIZE +#else + addi AO, A, -2 * SIZE +#endif +#ifndef RT + add C, CO1, LDC +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + + srawi. I, M, 3 + ble .L100 + .align 4 + +.L91: +#if defined(LT) || defined(RN) + fpmr f1, f0 + addi BO, B, - 2 * SIZE + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, KK, 2 + mtspr CTR, r0 + ble .L94 +#else + +#ifdef LN + slwi r0, K, 3 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 3 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + fpmr f1, f0 + addi BO, B, - 2 * SIZE + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 + ble .L94 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + bdz- .L93 + .align 4 + +.L92: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + fxcpmadd f2, B1, A3, f2 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + LFPDUX A4, AO, INC2 + + fxcsmadd f0, B1, A5, f0 + LFPDUX A5, AO, INC2 + fxcsmadd f1, B1, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B1, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B1, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B1, BO, INC2 + + fxcpmadd f0, B2, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B2, A2, f1 + LFPDUX A2, AO, INC2 + fxcpmadd f2, B2, A3, f2 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B2, A4, f3 + LFPDUX A4, AO, INC2 + + fxcsmadd f0, B2, A5, f0 + LFPDUX A5, AO, INC2 + fxcsmadd f1, B2, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B2, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B2, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B2, BO, INC2 + bdnz+ .L92 + .align 4 + +.L93: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + fxcpmadd f2, B1, A3, f2 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + LFPDUX A4, AO, INC2 + + fxcsmadd f0, B1, A5, f0 + LFPDUX A5, AO, INC2 + fxcsmadd f1, B1, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B1, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B1, A8, f3 + LFPDUX A8, AO, INC2 + + fxcpmadd f0, B2, A1, f0 + fxcpmadd f1, B2, A2, f1 + fxcpmadd f2, B2, A3, f2 + fxcpmadd f3, B2, A4, f3 + + fxcsmadd f0, B2, A5, f0 + fxcsmadd f1, B2, A6, f1 + fxcsmadd f2, B2, A7, f2 + fxcsmadd f3, B2, A8, f3 + .align 4 + +.L94: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L98 +#else + andi. 
r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L98 +#endif + + LFDX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + add BO, BO, INC + bdz- .L97 + .align 4 + +.L96: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + fxcpmadd f2, B1, A3, f2 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + LFDX B1, BO, INC2 + LFPDUX A4, AO, INC2 + add BO, BO, INC + bdnz+ .L96 + .align 4 + +.L97: + fxcpmadd f0, B1, A1, f0 + fxcpmadd f1, B1, A2, f1 + fxcpmadd f2, B1, A3, f2 + fxcpmadd f3, B1, A4, f3 + .align 4 + +.L98: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 8 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 3 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDUX f16, BO, INC2 + LFPDUX f17, BO, INC2 + LFPDUX f18, BO, INC2 + LFPDUX f19, BO, INC2 + + subi BO, BO, 8 * SIZE + + fpsub f0, f16, f0 + fpsub f1, f17, f1 + fpsub f2, f18, f2 + fpsub f3, f19, f3 +#else + LFPDUX f16, AO, INC2 + LFPDUX f17, AO, INC2 + LFPDUX f18, AO, INC2 + LFPDUX f19, AO, INC2 + + subi AO, AO, 8 * SIZE + + fpsub f0, f16, f0 + fpsub f1, f17, f1 + fpsub f2, f18, f2 + fpsub f3, f19, f3 +#endif + +#ifdef LN + fsmtp f4, f0 + fsmtp f5, f1 + fsmtp f6, f2 + fsmtp f7, f3 + + LFD A1, (2 + 63) * SIZE(AO) + LFD A2, (2 + 62) * SIZE(AO) + LFD A3, (2 + 61) * SIZE(AO) + LFD A4, (2 + 60) * SIZE(AO) + LFD A5, (2 + 59) * SIZE(AO) + LFD A6, (2 + 58) * SIZE(AO) + LFD A7, (2 + 57) * SIZE(AO) + LFD A8, (2 + 56) * SIZE(AO) + + fmul f7, A1, f7 + fnmsub f3, A2, f7, f3 + fnmsub f6, A3, f7, f6 + fnmsub f2, A4, f7, f2 + fnmsub f5, A5, f7, f5 + fnmsub f1, A6, f7, f1 + fnmsub f4, A7, f7, f4 + fnmsub f0, A8, f7, f0 + + LFD A1, (2 + 54) * SIZE(AO) + LFD A2, (2 + 53) * SIZE(AO) + LFD A3, (2 + 52) * SIZE(AO) + LFD A4, (2 + 51) * SIZE(AO) + LFD A5, (2 + 50) * SIZE(AO) + LFD A6, (2 + 49) * SIZE(AO) + LFD A7, (2 + 48) * SIZE(AO) + + fmul f3, A1, f3 + fnmsub f6, A2, f3, f6 + fnmsub f2, A3, f3, f2 + fnmsub f5, A4, f3, f5 + fnmsub f1, A5, f3, f1 + fnmsub f4, A6, f3, f4 + fnmsub f0, A7, f3, f0 + + LFD A1, (2 + 45) * SIZE(AO) + LFD A2, (2 + 44) * SIZE(AO) + LFD A3, (2 + 43) * SIZE(AO) + LFD A4, (2 + 42) * SIZE(AO) + LFD A5, (2 + 41) * SIZE(AO) + LFD A6, (2 + 40) * SIZE(AO) + + fmul f6, A1, f6 + fnmsub f2, A2, f6, f2 + fnmsub f5, A3, f6, f5 + fnmsub f1, A4, f6, f1 + fnmsub f4, A5, f6, f4 + fnmsub f0, A6, f6, f0 + + LFD A1, (2 + 36) * SIZE(AO) + LFD A2, (2 + 35) * SIZE(AO) + LFD A3, (2 + 34) * SIZE(AO) + LFD A4, (2 + 33) * SIZE(AO) + LFD A5, (2 + 32) * SIZE(AO) + + fmul f2, A1, f2 + fnmsub f5, A2, f2, f5 + fnmsub f1, A3, f2, f1 + fnmsub f4, A4, f2, f4 + fnmsub f0, A5, f2, f0 + + LFD A1, (2 + 27) * SIZE(AO) + LFD A2, (2 + 26) * SIZE(AO) + LFD A3, (2 + 25) * SIZE(AO) + LFD A4, (2 + 24) * SIZE(AO) + + fmul f5, A1, f5 + fnmsub f1, A2, f5, f1 + fnmsub f4, A3, f5, f4 + fnmsub f0, A4, f5, f0 + + LFD A1, (2 + 18) * SIZE(AO) + LFD A2, (2 + 17) * SIZE(AO) + LFD A3, (2 + 16) * SIZE(AO) + + fmul f1, A1, f1 + fnmsub f4, A2, f1, f4 + fnmsub f0, A3, f1, f0 + + LFD A1, (2 + 9) * SIZE(AO) + LFD A2, (2 + 8) * SIZE(AO) + + fmul f4, A1, f4 + fnmsub f0, A2, f4, f0 + + LFD A1, (2 + 0) * SIZE(AO) + + fmul f0, A1, f0 + + fsmfp f0, f4 + fsmfp f1, f5 + fsmfp f2, f6 + fsmfp f3, f7 +#endif + +#ifdef LT + fsmtp f4, f0 + fsmtp f5, f1 + fsmtp f6, f2 + fsmtp f7, f3 + + LFD A1, (2 + 0) * SIZE(AO) + LFD A2, (2 + 1) * SIZE(AO) + LFD A3, (2 + 2) * SIZE(AO) + LFD A4, (2 + 3) * SIZE(AO) + LFD A5, (2 + 4) * 
SIZE(AO) + LFD A6, (2 + 5) * SIZE(AO) + LFD A7, (2 + 6) * SIZE(AO) + LFD A8, (2 + 7) * SIZE(AO) + + fmul f0, A1, f0 + fnmsub f4, A2, f0, f4 + fnmsub f1, A3, f0, f1 + fnmsub f5, A4, f0, f5 + fnmsub f2, A5, f0, f2 + fnmsub f6, A6, f0, f6 + fnmsub f3, A7, f0, f3 + fnmsub f7, A8, f0, f7 + + LFD A1, (2 + 9) * SIZE(AO) + LFD A2, (2 + 10) * SIZE(AO) + LFD A3, (2 + 11) * SIZE(AO) + LFD A4, (2 + 12) * SIZE(AO) + LFD A5, (2 + 13) * SIZE(AO) + LFD A6, (2 + 14) * SIZE(AO) + LFD A7, (2 + 15) * SIZE(AO) + + fmul f4, A1, f4 + fnmsub f1, A2, f4, f1 + fnmsub f5, A3, f4, f5 + fnmsub f2, A4, f4, f2 + fnmsub f6, A5, f4, f6 + fnmsub f3, A6, f4, f3 + fnmsub f7, A7, f4, f7 + + LFD A1, (2 + 18) * SIZE(AO) + LFD A2, (2 + 19) * SIZE(AO) + LFD A3, (2 + 20) * SIZE(AO) + LFD A4, (2 + 21) * SIZE(AO) + LFD A5, (2 + 22) * SIZE(AO) + LFD A6, (2 + 23) * SIZE(AO) + + fmul f1, A1, f1 + fnmsub f5, A2, f1, f5 + fnmsub f2, A3, f1, f2 + fnmsub f6, A4, f1, f6 + fnmsub f3, A5, f1, f3 + fnmsub f7, A6, f1, f7 + + LFD A1, (2 + 27) * SIZE(AO) + LFD A2, (2 + 28) * SIZE(AO) + LFD A3, (2 + 29) * SIZE(AO) + LFD A4, (2 + 30) * SIZE(AO) + LFD A5, (2 + 31) * SIZE(AO) + + fmul f5, A1, f5 + fnmsub f2, A2, f5, f2 + fnmsub f6, A3, f5, f6 + fnmsub f3, A4, f5, f3 + fnmsub f7, A5, f5, f7 + + LFD A1, (2 + 36) * SIZE(AO) + LFD A2, (2 + 37) * SIZE(AO) + LFD A3, (2 + 38) * SIZE(AO) + LFD A4, (2 + 39) * SIZE(AO) + + fmul f2, A1, f2 + fnmsub f6, A2, f2, f6 + fnmsub f3, A3, f2, f3 + fnmsub f7, A4, f2, f7 + + LFD A1, (2 + 45) * SIZE(AO) + LFD A2, (2 + 46) * SIZE(AO) + LFD A3, (2 + 47) * SIZE(AO) + + fmul f6, A1, f6 + fnmsub f3, A2, f6, f3 + fnmsub f7, A3, f6, f7 + + LFD A1, (2 + 54) * SIZE(AO) + LFD A2, (2 + 55) * SIZE(AO) + + fmul f3, A1, f3 + fnmsub f7, A2, f3, f7 + + LFD A1, (2 + 63) * SIZE(AO) + + fmul f7, A1, f7 + + fsmfp f0, f4 + fsmfp f1, f5 + fsmfp f2, f6 + fsmfp f3, f7 +#endif + +#ifdef RN + LFPDX A1, BO, INC2 + + fxpmul f0, A1, f0 + fxpmul f1, A1, f1 + fxpmul f2, A1, f2 + fxpmul f3, A1, f3 +#endif + +#ifdef RT + LFPDX A1, BO, INC2 + + fxpmul f0, A1, f0 + fxpmul f1, A1, f1 + fxpmul f2, A1, f2 + fxpmul f3, A1, f3 + +#endif + +#ifdef LN + subi CO1, CO1, 8 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC2 + STFPDUX f1, BO, INC2 + STFPDUX f2, BO, INC2 + STFPDUX f3, BO, INC2 + + subi BO, BO, 8 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + STFDUX f2, CO1, INC + STFSDUX f2, CO1, INC + STFDUX f3, CO1, INC + STFSDUX f3, CO1, INC +#else + STFPDUX f0, AO, INC2 + STFPDUX f1, AO, INC2 + STFPDUX f2, AO, INC2 + STFPDUX f3, AO, INC2 + + subi AO, AO, 8 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + STFDUX f2, CO1, INC + STFSDUX f2, CO1, INC + STFDUX f3, CO1, INC + STFSDUX f3, CO1, INC +#endif + +#ifdef LN + subi CO1, CO1, 8 * SIZE +#endif + +#ifdef RT + slwi r0, K, 3 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 3 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 8 +#endif + +#ifdef LN + subi KK, KK, 8 +#endif + + addic. I, I, -1 + li r0, FZERO + + lfpsx f0, SP, r0 + bgt+ .L91 + .align 4 + +.L100: + andi. I, M, 4 + beq .L110 + +#if defined(LT) || defined(RN) + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. 
r0, KK, 3 + mtspr CTR, r0 + ble .L104 +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 2 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L104 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + LFPDUX B3, BO, INC2 + LFPDUX B4, BO, INC2 + + bdz- .L103 + .align 4 + +.L102: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + fxcsmadd f2, B1, A3, f2 + LFPDUX A3, AO, INC2 + fxcsmadd f3, B1, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B1, BO, INC2 + + fxcpmadd f0, B2, A5, f0 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B2, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B2, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B2, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B2, BO, INC2 + + fxcpmadd f0, B3, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B3, A2, f1 + LFPDUX A2, AO, INC2 + fxcsmadd f2, B3, A3, f2 + LFPDUX A3, AO, INC2 + fxcsmadd f3, B3, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B3, BO, INC2 + + fxcpmadd f0, B4, A5, f0 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B4, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B4, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B4, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B4, BO, INC2 + bdnz+ .L102 + .align 4 + +.L103: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + fxcsmadd f2, B1, A3, f2 + LFPDUX A3, AO, INC2 + fxcsmadd f3, B1, A4, f3 + LFPDUX A4, AO, INC2 + + fxcpmadd f0, B2, A5, f0 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B2, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B2, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B2, A8, f3 + LFPDUX A8, AO, INC2 + + fxcpmadd f0, B3, A1, f0 + fxcpmadd f1, B3, A2, f1 + fxcsmadd f2, B3, A3, f2 + fxcsmadd f3, B3, A4, f3 + + fxcpmadd f0, B4, A5, f0 + fxcpmadd f1, B4, A6, f1 + fxcsmadd f2, B4, A7, f2 + fxcsmadd f3, B4, A8, f3 + .align 4 + +.L104: +#if defined(LT) || defined(RN) + andi. r0, KK, 7 + mtspr CTR, r0 + ble+ .L108 +#else + andi. 
r0, TEMP, 7 + mtspr CTR, r0 + ble+ .L108 +#endif + + LFPDUX A1, AO, INC2 + LFDX B1, BO, INC2 + LFPDUX A2, AO, INC2 + add BO, BO, INC + bdz- .L107 + .align 4 + +.L106: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFDX B1, BO, INC2 + LFPDUX A2, AO, INC2 + add BO, BO, INC + bdnz+ .L106 + .align 4 + +.L107: + fxcpmadd f0, B1, A1, f0 + fxcpmadd f1, B1, A2, f1 + .align 4 + +.L108: + fpadd f0, f0, f2 + fpadd f1, f1, f3 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDUX f16, BO, INC2 + LFPDUX f17, BO, INC2 + + subi BO, BO, 4 * SIZE + + fpsub f0, f16, f0 + fpsub f1, f17, f1 +#else + LFPDUX f16, AO, INC2 + LFPDUX f17, AO, INC2 + + subi AO, AO, 4 * SIZE + + fpsub f0, f16, f0 + fpsub f1, f17, f1 +#endif + +#ifdef LN + fsmtp f4, f0 + fsmtp f5, f1 + + LFD A1, (2 + 15) * SIZE(AO) + LFD A2, (2 + 14) * SIZE(AO) + LFD A3, (2 + 13) * SIZE(AO) + LFD A4, (2 + 12) * SIZE(AO) + + fmul f5, A1, f5 + fnmsub f1, A2, f5, f1 + fnmsub f4, A3, f5, f4 + fnmsub f0, A4, f5, f0 + + LFD A1, (2 + 10) * SIZE(AO) + LFD A2, (2 + 9) * SIZE(AO) + LFD A3, (2 + 8) * SIZE(AO) + + fmul f1, A1, f1 + fnmsub f4, A2, f1, f4 + fnmsub f0, A3, f1, f0 + + LFD A1, (2 + 5) * SIZE(AO) + LFD A2, (2 + 4) * SIZE(AO) + + fmul f4, A1, f4 + fnmsub f0, A2, f4, f0 + + LFD A1, (2 + 0) * SIZE(AO) + + fmul f0, A1, f0 + + fsmfp f0, f4 + fsmfp f1, f5 +#endif + +#ifdef LT + fsmtp f4, f0 + fsmtp f5, f1 + + LFD A1, (2 + 0) * SIZE(AO) + LFD A2, (2 + 1) * SIZE(AO) + LFD A3, (2 + 2) * SIZE(AO) + LFD A4, (2 + 3) * SIZE(AO) + + fmul f0, A1, f0 + fnmsub f4, A2, f0, f4 + fnmsub f1, A3, f0, f1 + fnmsub f5, A4, f0, f5 + + LFD A1, (2 + 5) * SIZE(AO) + LFD A2, (2 + 6) * SIZE(AO) + LFD A3, (2 + 7) * SIZE(AO) + + fmul f4, A1, f4 + fnmsub f1, A2, f4, f1 + fnmsub f5, A3, f4, f5 + + LFD A1, (2 + 10) * SIZE(AO) + LFD A2, (2 + 11) * SIZE(AO) + + fmul f1, A1, f1 + fnmsub f5, A2, f1, f5 + + LFD A1, (2 + 15) * SIZE(AO) + + fmul f5, A1, f5 + + fsmfp f0, f4 + fsmfp f1, f5 +#endif + +#ifdef RN + LFPDX A1, BO, INC2 + + fxpmul f0, A1, f0 + fxpmul f1, A1, f1 +#endif + +#ifdef RT + LFPDX A1, BO, INC2 + + fxpmul f0, A1, f0 + fxpmul f1, A1, f1 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC2 + STFPDUX f1, BO, INC2 + + subi BO, BO, 4 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC +#else + STFPDUX f0, AO, INC2 + STFPDUX f1, AO, INC2 + + subi AO, AO, 4 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L110: + andi. I, M, 2 + beq .L120 + +#if defined(LT) || defined(RN) + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. 
r0, KK, 3 + mtspr CTR, r0 + ble .L114 +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 1 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L114 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B1, BO, INC2 + + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX B3, BO, INC2 + + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + LFPDUX B4, BO, INC2 + bdz- .L113 + .align 4 + +.L112: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcsmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + LFPDUX B1, BO, INC2 + fxcpmadd f2, B2, A3, f2 + LFPDUX A3, AO, INC2 + fxcsmadd f3, B2, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + fxcpmadd f0, B3, A5, f0 + LFPDUX A5, AO, INC2 + fxcsmadd f1, B3, A6, f1 + LFPDUX A6, AO, INC2 + LFPDUX B3, BO, INC2 + fxcpmadd f2, B4, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B4, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B4, BO, INC2 + bdnz+ .L112 + .align 4 + +.L113: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A2, f1 + fxcpmadd f2, B2, A3, f2 + fxcsmadd f3, B2, A4, f3 + fxcpmadd f0, B3, A5, f0 + fxcsmadd f1, B3, A6, f1 + fxcpmadd f2, B4, A7, f2 + fxcsmadd f3, B4, A8, f3 + .align 4 + +.L114: +#if defined(LT) || defined(RN) + andi. r0, KK, 7 + mtspr CTR, r0 + ble+ .L118 +#else + andi. r0, TEMP, 7 + mtspr CTR, r0 + ble+ .L118 +#endif + + LFPDUX A1, AO, INC2 + LFDX B1, BO, INC2 + add BO, BO, INC + bdz- .L117 + .align 4 + +.L116: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + LFDX B1, BO, INC2 + add BO, BO, INC + bdnz+ .L116 + .align 4 + +.L117: + fxcpmadd f0, B1, A1, f0 + .align 4 + +.L118: + fpadd f0, f0, f1 + fpadd f2, f3, f2 + fpadd f0, f0, f2 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDX f16, BO, INC2 + + fpsub f0, f16, f0 +#else + LFPDX f16, AO, INC2 + + fpsub f0, f16, f0 +#endif + +#ifdef LN + fsmtp f4, f0 + + LFD A1, (2 + 3) * SIZE(AO) + LFD A2, (2 + 2) * SIZE(AO) + LFD A3, (2 + 0) * SIZE(AO) + + fmul f4, A1, f4 + fnmsub f0, A2, f4, f0 + fmul f0, A3, f0 + fsmfp f0, f4 +#endif + +#ifdef LT + fsmtp f4, f0 + + LFD A1, (2 + 0) * SIZE(AO) + LFD A2, (2 + 1) * SIZE(AO) + LFD A3, (2 + 3) * SIZE(AO) + + fmul f0, A1, f0 + fnmsub f4, A2, f0, f4 + fmul f4, A3, f4 + + fsmfp f0, f4 +#endif + +#ifdef RN + LFPDX A1, BO, INC2 + + fxpmul f0, A1, f0 +#endif + +#ifdef RT + LFPDX A1, BO, INC2 + + fxpmul f0, A1, f0 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDX f0, BO, INC2 + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC +#else + STFPDX f0, AO, INC2 + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L120: + andi. 
I, M, 1 + beq .L129 + +#if defined(LT) || defined(RN) + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, KK, 3 + mtspr CTR, r0 + ble .L124 +#else + +#ifdef LN + slwi r0, K, 0 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 0 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L124 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + bdz- .L123 + .align 4 + +.L122: + fpmadd f0, A1, B1, f0 + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + fpmadd f1, A2, B2, f1 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + fpmadd f2, A3, B3, f2 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + fpmadd f3, A4, B4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + bdnz+ .L122 + .align 4 + +.L123: + fpmadd f0, A1, B1, f0 + fpmadd f1, A2, B2, f1 + fpmadd f2, A3, B3, f2 + fpmadd f3, A4, B4, f3 + .align 4 + +.L124: +#if defined(LT) || defined(RN) + andi. r0, KK, 7 + mtspr CTR, r0 + ble+ .L128 +#else + andi. r0, TEMP, 7 + mtspr CTR, r0 + ble+ .L128 +#endif + + LFDX A1, AO, INC2 + LFDX B1, BO, INC2 + add AO, AO, INC + add BO, BO, INC + bdz- .L127 + .align 4 + +.L126: + fmadd f0, A1, B1, f0 + LFDX A1, AO, INC2 + LFDX B1, BO, INC2 + add AO, AO, INC + add BO, BO, INC + bdnz+ .L126 + .align 4 + +.L127: + fmadd f0, A1, B1, f0 + .align 4 + +.L128: + fpadd f0, f0, f1 + fpadd f2, f2, f3 + fpadd f0, f0, f2 + fsmtp f1, f0 + + fadd f0, f0, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFDX f16, BO, INC2 + + fsub f0, f16, f0 +#else + LFDX f16, AO, INC2 + + fsub f0, f16, f0 +#endif + +#ifdef LN + LFD A1, (2 + 0) * SIZE(AO) + + fmul f0, A1, f0 +#endif + +#ifdef LT + LFD A1, (2 + 0) * SIZE(AO) + + fmul f0, A1, f0 +#endif + +#ifdef RN + LFDX A1, BO, INC2 + + fmul f0, A1, f0 +#endif + +#ifdef RT + LFDX A1, BO, INC2 + + fmul f0, A1, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFDX f0, BO, INC2 + + STFDUX f0, CO1, INC +#else + STFDX f0, AO, INC2 + + STFDUX f0, CO1, INC +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +.L129: +#ifdef LN + slwi r0, K, 0 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + addi B, BO, 2 * SIZE +#endif + +#ifdef RN + addi KK, KK, 1 +#endif + +#ifdef RT + subi KK, KK, 1 +#endif + .align 4 + + +.L999: + addi SP, SP, 12 + + lwzu r14, 4(SP) + lwzu r15, 4(SP) + + lwzu r16, 4(SP) + lwzu r17, 4(SP) + lwzu r18, 4(SP) + lwzu r19, 4(SP) + + lwzu r20, 4(SP) + lwzu r21, 4(SP) + lwzu r22, 4(SP) + lwzu r23, 4(SP) + + lwzu r24, 4(SP) + lwzu r25, 4(SP) + lwzu r26, 4(SP) + lwzu r27, 4(SP) + + lwzu r28, 4(SP) + lwzu r29, 4(SP) + lwzu r30, 4(SP) + lwzu r31, 4(SP) + + subi SP, SP, 12 + li r0, 16 + + lfpdux f31, SP, r0 + lfpdux f30, SP, r0 + lfpdux f29, SP, r0 + lfpdux f28, SP, r0 + 
lfpdux f27, SP, r0 + lfpdux f26, SP, r0 + lfpdux f25, SP, r0 + lfpdux f24, SP, r0 + lfpdux f23, SP, r0 + lfpdux f22, SP, r0 + lfpdux f21, SP, r0 + lfpdux f20, SP, r0 + lfpdux f19, SP, r0 + lfpdux f18, SP, r0 + lfpdux f17, SP, r0 + lfpdux f16, SP, r0 + lfpdux f15, SP, r0 + lfpdux f14, SP, r0 + addi SP, SP, 16 + blr + + + EPILOGUE +#endif diff --git a/kernel/power/trsm_kernel_hummer_RT.S b/kernel/power/trsm_kernel_hummer_RT.S new file mode 100644 index 0000000000..e0b5d21f87 --- /dev/null +++ b/kernel/power/trsm_kernel_hummer_RT.S @@ -0,0 +1,5696 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define ALPHA 0 +#define FZERO 8 + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#endif + +#define TEMP r11 +#define AORIG r12 +#define KK r14 +#define INCM1 r15 +#define INCM4 r16 +#define INCM2 r17 +#define INC2 r19 +#define INC r20 +#define INC4 r21 + +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define AO2 r26 +#define BO2 r27 + +#define CO1 r28 +#define CO2 r29 +#define CO3 r30 +#define CO4 r31 + +#ifndef NEEDPARAM + +#define A1 f16 +#define A2 f17 +#define A3 f18 +#define A4 f19 +#define A5 f20 +#define A6 f21 +#define A7 f22 +#define A8 f23 +#define A9 f24 +#define A10 f25 + +#define B1 f26 +#define B2 f27 +#define B3 f28 +#define B4 f29 +#define B5 f30 +#define B6 f31 + +#define AP B6 + + + PROLOGUE + PROFCODE + + li r0, -16 + + stfpdux f14, SP, r0 + stfpdux f15, SP, r0 + stfpdux f16, SP, r0 + stfpdux f17, SP, r0 + stfpdux f18, SP, r0 + stfpdux f19, SP, r0 + stfpdux f20, SP, r0 + stfpdux f21, SP, r0 + stfpdux f22, SP, r0 + stfpdux f23, SP, r0 + stfpdux f24, SP, r0 + stfpdux f25, SP, r0 + stfpdux f26, SP, r0 + stfpdux f27, SP, r0 + stfpdux f28, SP, r0 + stfpdux f29, SP, r0 + stfpdux f30, SP, r0 + stfpdux f31, SP, r0 + + stwu r31, -4(SP) + stwu r30, -4(SP) + stwu r29, -4(SP) + stwu r28, -4(SP) + + stwu r27, -4(SP) + stwu r26, -4(SP) + stwu r25, -4(SP) + stwu r24, -4(SP) + + stwu r23, -4(SP) + stwu r22, -4(SP) + stwu r21, -4(SP) + stwu r20, -4(SP) + + stwu r19, -4(SP) + stwu r18, -4(SP) + stwu r17, -4(SP) + stwu r16, -4(SP) + + stwu r15, -4(SP) + stwu r14, -4(SP) # dummy + + li r0, 0 + + stwu r0, -4(SP) + stwu r0, -4(SP) + stfdu f1, -8(SP) + + slwi LDC, LDC, BASE_SHIFT + + cmpwi cr0, M, 0 + ble .L999 + cmpwi cr0, N, 0 + ble .L999 + cmpwi cr0, K, 0 + ble .L999 + + li INC, 1 * SIZE + li INC2, 2 * SIZE + li INC4, 4 * SIZE + + li INCM1, -1 * SIZE + li INCM2, -2 * SIZE + li INCM4, -4 * SIZE + + addi C, C, - 1 * SIZE + +#ifdef LN + mullw r0, M, K + slwi r0, r0, BASE_SHIFT + add A, A, r0 + + slwi r0, M, BASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, BASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + andi. J, N, 1 + beq .L50 + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + addi AORIG, A, -2 * SIZE +#else + addi AO, A, -2 * SIZE +#endif +#ifndef RT + add C, CO1, LDC +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + + srawi. I, M, 3 + ble .L100 + .align 4 + +.L91: +#if defined(LT) || defined(RN) + fpmr f1, f0 + addi BO, B, - 2 * SIZE + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, KK, 2 + mtspr CTR, r0 + ble .L94 +#else + +#ifdef LN + slwi r0, K, 3 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 3 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + fpmr f1, f0 + addi BO, BO, - 2 * SIZE + fpmr f2, f0 + fpmr f3, f0 + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 + ble .L94 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + bdz- .L93 + .align 4 + +.L92: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + fxcpmadd f2, B1, A3, f2 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + LFPDUX A4, AO, INC2 + + fxcsmadd f0, B1, A5, f0 + LFPDUX A5, AO, INC2 + fxcsmadd f1, B1, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B1, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B1, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B1, BO, INC2 + + fxcpmadd f0, B2, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B2, A2, f1 + LFPDUX A2, AO, INC2 + fxcpmadd f2, B2, A3, f2 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B2, A4, f3 + LFPDUX A4, AO, INC2 + + fxcsmadd f0, B2, A5, f0 + LFPDUX A5, AO, INC2 + fxcsmadd f1, B2, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B2, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B2, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B2, BO, INC2 + bdnz+ .L92 + .align 4 + +.L93: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + fxcpmadd f2, B1, A3, f2 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + LFPDUX A4, AO, INC2 + + fxcsmadd f0, B1, A5, f0 + LFPDUX A5, AO, INC2 + fxcsmadd f1, B1, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B1, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B1, A8, f3 + LFPDUX A8, AO, INC2 + + fxcpmadd f0, B2, A1, f0 + fxcpmadd f1, B2, A2, f1 + fxcpmadd f2, B2, A3, f2 + fxcpmadd f3, B2, A4, f3 + + fxcsmadd f0, B2, A5, f0 + fxcsmadd f1, B2, A6, f1 + fxcsmadd f2, B2, A7, f2 + fxcsmadd f3, B2, A8, f3 + .align 4 + +.L94: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L98 +#else + andi. 
r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L98 +#endif + + LFDX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + add BO, BO, INC + bdz- .L97 + .align 4 + +.L96: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + fxcpmadd f2, B1, A3, f2 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + LFDX B1, BO, INC2 + LFPDUX A4, AO, INC2 + add BO, BO, INC + bdnz+ .L96 + .align 4 + +.L97: + fxcpmadd f0, B1, A1, f0 + fxcpmadd f1, B1, A2, f1 + fxcpmadd f2, B1, A3, f2 + fxcpmadd f3, B1, A4, f3 + .align 4 + +.L98: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 8 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 3 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDUX f16, BO, INC2 + LFPDUX f17, BO, INC2 + LFPDUX f18, BO, INC2 + LFPDUX f19, BO, INC2 + + subi BO, BO, 8 * SIZE + + fpsub f0, f16, f0 + fpsub f1, f17, f1 + fpsub f2, f18, f2 + fpsub f3, f19, f3 +#else + LFPDUX f16, AO, INC2 + LFPDUX f17, AO, INC2 + LFPDUX f18, AO, INC2 + LFPDUX f19, AO, INC2 + + subi AO, AO, 8 * SIZE + + fpsub f0, f16, f0 + fpsub f1, f17, f1 + fpsub f2, f18, f2 + fpsub f3, f19, f3 +#endif + +#ifdef LN + fsmtp f4, f0 + fsmtp f5, f1 + fsmtp f6, f2 + fsmtp f7, f3 + + LFD A1, (2 + 63) * SIZE(AO) + LFD A2, (2 + 62) * SIZE(AO) + LFD A3, (2 + 61) * SIZE(AO) + LFD A4, (2 + 60) * SIZE(AO) + LFD A5, (2 + 59) * SIZE(AO) + LFD A6, (2 + 58) * SIZE(AO) + LFD A7, (2 + 57) * SIZE(AO) + LFD A8, (2 + 56) * SIZE(AO) + + fmul f7, A1, f7 + fnmsub f3, A2, f7, f3 + fnmsub f6, A3, f7, f6 + fnmsub f2, A4, f7, f2 + fnmsub f5, A5, f7, f5 + fnmsub f1, A6, f7, f1 + fnmsub f4, A7, f7, f4 + fnmsub f0, A8, f7, f0 + + LFD A1, (2 + 54) * SIZE(AO) + LFD A2, (2 + 53) * SIZE(AO) + LFD A3, (2 + 52) * SIZE(AO) + LFD A4, (2 + 51) * SIZE(AO) + LFD A5, (2 + 50) * SIZE(AO) + LFD A6, (2 + 49) * SIZE(AO) + LFD A7, (2 + 48) * SIZE(AO) + + fmul f3, A1, f3 + fnmsub f6, A2, f3, f6 + fnmsub f2, A3, f3, f2 + fnmsub f5, A4, f3, f5 + fnmsub f1, A5, f3, f1 + fnmsub f4, A6, f3, f4 + fnmsub f0, A7, f3, f0 + + LFD A1, (2 + 45) * SIZE(AO) + LFD A2, (2 + 44) * SIZE(AO) + LFD A3, (2 + 43) * SIZE(AO) + LFD A4, (2 + 42) * SIZE(AO) + LFD A5, (2 + 41) * SIZE(AO) + LFD A6, (2 + 40) * SIZE(AO) + + fmul f6, A1, f6 + fnmsub f2, A2, f6, f2 + fnmsub f5, A3, f6, f5 + fnmsub f1, A4, f6, f1 + fnmsub f4, A5, f6, f4 + fnmsub f0, A6, f6, f0 + + LFD A1, (2 + 36) * SIZE(AO) + LFD A2, (2 + 35) * SIZE(AO) + LFD A3, (2 + 34) * SIZE(AO) + LFD A4, (2 + 33) * SIZE(AO) + LFD A5, (2 + 32) * SIZE(AO) + + fmul f2, A1, f2 + fnmsub f5, A2, f2, f5 + fnmsub f1, A3, f2, f1 + fnmsub f4, A4, f2, f4 + fnmsub f0, A5, f2, f0 + + LFD A1, (2 + 27) * SIZE(AO) + LFD A2, (2 + 26) * SIZE(AO) + LFD A3, (2 + 25) * SIZE(AO) + LFD A4, (2 + 24) * SIZE(AO) + + fmul f5, A1, f5 + fnmsub f1, A2, f5, f1 + fnmsub f4, A3, f5, f4 + fnmsub f0, A4, f5, f0 + + LFD A1, (2 + 18) * SIZE(AO) + LFD A2, (2 + 17) * SIZE(AO) + LFD A3, (2 + 16) * SIZE(AO) + + fmul f1, A1, f1 + fnmsub f4, A2, f1, f4 + fnmsub f0, A3, f1, f0 + + LFD A1, (2 + 9) * SIZE(AO) + LFD A2, (2 + 8) * SIZE(AO) + + fmul f4, A1, f4 + fnmsub f0, A2, f4, f0 + + LFD A1, (2 + 0) * SIZE(AO) + + fmul f0, A1, f0 + + fsmfp f0, f4 + fsmfp f1, f5 + fsmfp f2, f6 + fsmfp f3, f7 +#endif + +#ifdef LT + fsmtp f4, f0 + fsmtp f5, f1 + fsmtp f6, f2 + fsmtp f7, f3 + + LFD A1, (2 + 0) * SIZE(AO) + LFD A2, (2 + 1) * SIZE(AO) + LFD A3, (2 + 2) * SIZE(AO) + LFD A4, (2 + 3) * SIZE(AO) + LFD A5, (2 + 4) * 
SIZE(AO) + LFD A6, (2 + 5) * SIZE(AO) + LFD A7, (2 + 6) * SIZE(AO) + LFD A8, (2 + 7) * SIZE(AO) + + fmul f0, A1, f0 + fnmsub f4, A2, f0, f4 + fnmsub f1, A3, f0, f1 + fnmsub f5, A4, f0, f5 + fnmsub f2, A5, f0, f2 + fnmsub f6, A6, f0, f6 + fnmsub f3, A7, f0, f3 + fnmsub f7, A8, f0, f7 + + LFD A1, (2 + 9) * SIZE(AO) + LFD A2, (2 + 10) * SIZE(AO) + LFD A3, (2 + 11) * SIZE(AO) + LFD A4, (2 + 12) * SIZE(AO) + LFD A5, (2 + 13) * SIZE(AO) + LFD A6, (2 + 14) * SIZE(AO) + LFD A7, (2 + 15) * SIZE(AO) + + fmul f4, A1, f4 + fnmsub f1, A2, f4, f1 + fnmsub f5, A3, f4, f5 + fnmsub f2, A4, f4, f2 + fnmsub f6, A5, f4, f6 + fnmsub f3, A6, f4, f3 + fnmsub f7, A7, f4, f7 + + LFD A1, (2 + 18) * SIZE(AO) + LFD A2, (2 + 19) * SIZE(AO) + LFD A3, (2 + 20) * SIZE(AO) + LFD A4, (2 + 21) * SIZE(AO) + LFD A5, (2 + 22) * SIZE(AO) + LFD A6, (2 + 23) * SIZE(AO) + + fmul f1, A1, f1 + fnmsub f5, A2, f1, f5 + fnmsub f2, A3, f1, f2 + fnmsub f6, A4, f1, f6 + fnmsub f3, A5, f1, f3 + fnmsub f7, A6, f1, f7 + + LFD A1, (2 + 27) * SIZE(AO) + LFD A2, (2 + 28) * SIZE(AO) + LFD A3, (2 + 29) * SIZE(AO) + LFD A4, (2 + 30) * SIZE(AO) + LFD A5, (2 + 31) * SIZE(AO) + + fmul f5, A1, f5 + fnmsub f2, A2, f5, f2 + fnmsub f6, A3, f5, f6 + fnmsub f3, A4, f5, f3 + fnmsub f7, A5, f5, f7 + + LFD A1, (2 + 36) * SIZE(AO) + LFD A2, (2 + 37) * SIZE(AO) + LFD A3, (2 + 38) * SIZE(AO) + LFD A4, (2 + 39) * SIZE(AO) + + fmul f2, A1, f2 + fnmsub f6, A2, f2, f6 + fnmsub f3, A3, f2, f3 + fnmsub f7, A4, f2, f7 + + LFD A1, (2 + 45) * SIZE(AO) + LFD A2, (2 + 46) * SIZE(AO) + LFD A3, (2 + 47) * SIZE(AO) + + fmul f6, A1, f6 + fnmsub f3, A2, f6, f3 + fnmsub f7, A3, f6, f7 + + LFD A1, (2 + 54) * SIZE(AO) + LFD A2, (2 + 55) * SIZE(AO) + + fmul f3, A1, f3 + fnmsub f7, A2, f3, f7 + + LFD A1, (2 + 63) * SIZE(AO) + + fmul f7, A1, f7 + + fsmfp f0, f4 + fsmfp f1, f5 + fsmfp f2, f6 + fsmfp f3, f7 +#endif + +#ifdef RN + LFPDX A1, BO, INC2 + + fxpmul f0, A1, f0 + fxpmul f1, A1, f1 + fxpmul f2, A1, f2 + fxpmul f3, A1, f3 +#endif + +#ifdef RT + LFPDX A1, BO, INC2 + + fxpmul f0, A1, f0 + fxpmul f1, A1, f1 + fxpmul f2, A1, f2 + fxpmul f3, A1, f3 + +#endif + +#ifdef LN + subi CO1, CO1, 8 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC2 + STFPDUX f1, BO, INC2 + STFPDUX f2, BO, INC2 + STFPDUX f3, BO, INC2 + + subi BO, BO, 8 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + STFDUX f2, CO1, INC + STFSDUX f2, CO1, INC + STFDUX f3, CO1, INC + STFSDUX f3, CO1, INC +#else + STFPDUX f0, AO, INC2 + STFPDUX f1, AO, INC2 + STFPDUX f2, AO, INC2 + STFPDUX f3, AO, INC2 + + subi AO, AO, 8 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + STFDUX f2, CO1, INC + STFSDUX f2, CO1, INC + STFDUX f3, CO1, INC + STFSDUX f3, CO1, INC +#endif + +#ifdef LN + subi CO1, CO1, 8 * SIZE +#endif + +#ifdef RT + slwi r0, K, 3 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 3 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 8 +#endif + +#ifdef LN + subi KK, KK, 8 +#endif + + addic. I, I, -1 + li r0, FZERO + + lfpsx f0, SP, r0 + bgt+ .L91 + .align 4 + +.L100: + andi. I, M, 4 + beq .L110 + +#if defined(LT) || defined(RN) + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. 
r0, KK, 3 + mtspr CTR, r0 + ble .L104 +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 2 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi BO, BO, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L104 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + LFPDUX B3, BO, INC2 + LFPDUX B4, BO, INC2 + + bdz- .L103 + .align 4 + +.L102: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + fxcsmadd f2, B1, A3, f2 + LFPDUX A3, AO, INC2 + fxcsmadd f3, B1, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B1, BO, INC2 + + fxcpmadd f0, B2, A5, f0 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B2, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B2, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B2, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B2, BO, INC2 + + fxcpmadd f0, B3, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B3, A2, f1 + LFPDUX A2, AO, INC2 + fxcsmadd f2, B3, A3, f2 + LFPDUX A3, AO, INC2 + fxcsmadd f3, B3, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B3, BO, INC2 + + fxcpmadd f0, B4, A5, f0 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B4, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B4, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B4, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B4, BO, INC2 + bdnz+ .L102 + .align 4 + +.L103: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + fxcsmadd f2, B1, A3, f2 + LFPDUX A3, AO, INC2 + fxcsmadd f3, B1, A4, f3 + LFPDUX A4, AO, INC2 + + fxcpmadd f0, B2, A5, f0 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B2, A6, f1 + LFPDUX A6, AO, INC2 + fxcsmadd f2, B2, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B2, A8, f3 + LFPDUX A8, AO, INC2 + + fxcpmadd f0, B3, A1, f0 + fxcpmadd f1, B3, A2, f1 + fxcsmadd f2, B3, A3, f2 + fxcsmadd f3, B3, A4, f3 + + fxcpmadd f0, B4, A5, f0 + fxcpmadd f1, B4, A6, f1 + fxcsmadd f2, B4, A7, f2 + fxcsmadd f3, B4, A8, f3 + .align 4 + +.L104: +#if defined(LT) || defined(RN) + andi. r0, KK, 7 + mtspr CTR, r0 + ble+ .L108 +#else + andi. 
r0, TEMP, 7 + mtspr CTR, r0 + ble+ .L108 +#endif + + LFPDUX A1, AO, INC2 + LFDX B1, BO, INC2 + LFPDUX A2, AO, INC2 + add BO, BO, INC + bdz- .L107 + .align 4 + +.L106: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + LFDX B1, BO, INC2 + LFPDUX A2, AO, INC2 + add BO, BO, INC + bdnz+ .L106 + .align 4 + +.L107: + fxcpmadd f0, B1, A1, f0 + fxcpmadd f1, B1, A2, f1 + .align 4 + +.L108: + fpadd f0, f0, f2 + fpadd f1, f1, f3 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDUX f16, BO, INC2 + LFPDUX f17, BO, INC2 + + subi BO, BO, 4 * SIZE + + fpsub f0, f16, f0 + fpsub f1, f17, f1 +#else + LFPDUX f16, AO, INC2 + LFPDUX f17, AO, INC2 + + subi AO, AO, 4 * SIZE + + fpsub f0, f16, f0 + fpsub f1, f17, f1 +#endif + +#ifdef LN + fsmtp f4, f0 + fsmtp f5, f1 + + LFD A1, (2 + 15) * SIZE(AO) + LFD A2, (2 + 14) * SIZE(AO) + LFD A3, (2 + 13) * SIZE(AO) + LFD A4, (2 + 12) * SIZE(AO) + + fmul f5, A1, f5 + fnmsub f1, A2, f5, f1 + fnmsub f4, A3, f5, f4 + fnmsub f0, A4, f5, f0 + + LFD A1, (2 + 10) * SIZE(AO) + LFD A2, (2 + 9) * SIZE(AO) + LFD A3, (2 + 8) * SIZE(AO) + + fmul f1, A1, f1 + fnmsub f4, A2, f1, f4 + fnmsub f0, A3, f1, f0 + + LFD A1, (2 + 5) * SIZE(AO) + LFD A2, (2 + 4) * SIZE(AO) + + fmul f4, A1, f4 + fnmsub f0, A2, f4, f0 + + LFD A1, (2 + 0) * SIZE(AO) + + fmul f0, A1, f0 + + fsmfp f0, f4 + fsmfp f1, f5 +#endif + +#ifdef LT + fsmtp f4, f0 + fsmtp f5, f1 + + LFD A1, (2 + 0) * SIZE(AO) + LFD A2, (2 + 1) * SIZE(AO) + LFD A3, (2 + 2) * SIZE(AO) + LFD A4, (2 + 3) * SIZE(AO) + + fmul f0, A1, f0 + fnmsub f4, A2, f0, f4 + fnmsub f1, A3, f0, f1 + fnmsub f5, A4, f0, f5 + + LFD A1, (2 + 5) * SIZE(AO) + LFD A2, (2 + 6) * SIZE(AO) + LFD A3, (2 + 7) * SIZE(AO) + + fmul f4, A1, f4 + fnmsub f1, A2, f4, f1 + fnmsub f5, A3, f4, f5 + + LFD A1, (2 + 10) * SIZE(AO) + LFD A2, (2 + 11) * SIZE(AO) + + fmul f1, A1, f1 + fnmsub f5, A2, f1, f5 + + LFD A1, (2 + 15) * SIZE(AO) + + fmul f5, A1, f5 + + fsmfp f0, f4 + fsmfp f1, f5 +#endif + +#ifdef RN + LFPDX A1, BO, INC2 + + fxpmul f0, A1, f0 + fxpmul f1, A1, f1 +#endif + +#ifdef RT + LFPDX A1, BO, INC2 + + fxpmul f0, A1, f0 + fxpmul f1, A1, f1 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC2 + STFPDUX f1, BO, INC2 + + subi BO, BO, 4 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC +#else + STFPDUX f0, AO, INC2 + STFPDUX f1, AO, INC2 + + subi AO, AO, 4 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L110: + andi. I, M, 2 + beq .L120 + +#if defined(LT) || defined(RN) + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. 
r0, KK, 3 + mtspr CTR, r0 + ble .L114 +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 1 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi BO, BO, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L114 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B1, BO, INC2 + + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX B3, BO, INC2 + + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + LFPDUX B4, BO, INC2 + bdz- .L113 + .align 4 + +.L112: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + fxcsmadd f1, B1, A2, f1 + LFPDUX A2, AO, INC2 + LFPDUX B1, BO, INC2 + fxcpmadd f2, B2, A3, f2 + LFPDUX A3, AO, INC2 + fxcsmadd f3, B2, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + fxcpmadd f0, B3, A5, f0 + LFPDUX A5, AO, INC2 + fxcsmadd f1, B3, A6, f1 + LFPDUX A6, AO, INC2 + LFPDUX B3, BO, INC2 + fxcpmadd f2, B4, A7, f2 + LFPDUX A7, AO, INC2 + fxcsmadd f3, B4, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B4, BO, INC2 + bdnz+ .L112 + .align 4 + +.L113: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A2, f1 + fxcpmadd f2, B2, A3, f2 + fxcsmadd f3, B2, A4, f3 + fxcpmadd f0, B3, A5, f0 + fxcsmadd f1, B3, A6, f1 + fxcpmadd f2, B4, A7, f2 + fxcsmadd f3, B4, A8, f3 + .align 4 + +.L114: +#if defined(LT) || defined(RN) + andi. r0, KK, 7 + mtspr CTR, r0 + ble+ .L118 +#else + andi. r0, TEMP, 7 + mtspr CTR, r0 + ble+ .L118 +#endif + + LFPDUX A1, AO, INC2 + LFDX B1, BO, INC2 + add BO, BO, INC + bdz- .L117 + .align 4 + +.L116: + fxcpmadd f0, B1, A1, f0 + LFPDUX A1, AO, INC2 + LFDX B1, BO, INC2 + add BO, BO, INC + bdnz+ .L116 + .align 4 + +.L117: + fxcpmadd f0, B1, A1, f0 + .align 4 + +.L118: + fpadd f0, f0, f1 + fpadd f2, f3, f2 + fpadd f0, f0, f2 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDX f16, BO, INC2 + + fpsub f0, f16, f0 +#else + LFPDX f16, AO, INC2 + + fpsub f0, f16, f0 +#endif + +#ifdef LN + fsmtp f4, f0 + + LFD A1, (2 + 3) * SIZE(AO) + LFD A2, (2 + 2) * SIZE(AO) + LFD A3, (2 + 0) * SIZE(AO) + + fmul f4, A1, f4 + fnmsub f0, A2, f4, f0 + fmul f0, A3, f0 + fsmfp f0, f4 +#endif + +#ifdef LT + fsmtp f4, f0 + + LFD A1, (2 + 0) * SIZE(AO) + LFD A2, (2 + 1) * SIZE(AO) + LFD A3, (2 + 3) * SIZE(AO) + + fmul f0, A1, f0 + fnmsub f4, A2, f0, f4 + fmul f4, A3, f4 + + fsmfp f0, f4 +#endif + +#ifdef RN + LFPDX A1, BO, INC2 + + fxpmul f0, A1, f0 +#endif + +#ifdef RT + LFPDX A1, BO, INC2 + + fxpmul f0, A1, f0 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDX f0, BO, INC2 + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC +#else + STFPDX f0, AO, INC2 + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L120: + andi. 
I, M, 1 + beq .L129 + +#if defined(LT) || defined(RN) + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, KK, 3 + mtspr CTR, r0 + ble .L124 +#else + +#ifdef LN + slwi r0, K, 0 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 0 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi BO, BO, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L124 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + bdz- .L123 + .align 4 + +.L122: + fpmadd f0, A1, B1, f0 + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + fpmadd f1, A2, B2, f1 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + fpmadd f2, A3, B3, f2 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + fpmadd f3, A4, B4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + bdnz+ .L122 + .align 4 + +.L123: + fpmadd f0, A1, B1, f0 + fpmadd f1, A2, B2, f1 + fpmadd f2, A3, B3, f2 + fpmadd f3, A4, B4, f3 + .align 4 + +.L124: +#if defined(LT) || defined(RN) + andi. r0, KK, 7 + mtspr CTR, r0 + ble+ .L128 +#else + andi. r0, TEMP, 7 + mtspr CTR, r0 + ble+ .L128 +#endif + + LFDX A1, AO, INC2 + LFDX B1, BO, INC2 + add AO, AO, INC + add BO, BO, INC + bdz- .L127 + .align 4 + +.L126: + fmadd f0, A1, B1, f0 + LFDX A1, AO, INC2 + LFDX B1, BO, INC2 + add AO, AO, INC + add BO, BO, INC + bdnz+ .L126 + .align 4 + +.L127: + fmadd f0, A1, B1, f0 + .align 4 + +.L128: + fpadd f0, f0, f1 + fpadd f2, f2, f3 + fpadd f0, f0, f2 + fsmtp f1, f0 + + fadd f0, f0, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFDX f16, BO, INC2 + + fsub f0, f16, f0 +#else + LFDX f16, AO, INC2 + + fsub f0, f16, f0 +#endif + +#ifdef LN + LFD A1, (2 + 0) * SIZE(AO) + + fmul f0, A1, f0 +#endif + +#ifdef LT + LFD A1, (2 + 0) * SIZE(AO) + + fmul f0, A1, f0 +#endif + +#ifdef RN + LFDX A1, BO, INC2 + + fmul f0, A1, f0 +#endif + +#ifdef RT + LFDX A1, BO, INC2 + + fmul f0, A1, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFDX f0, BO, INC2 + + STFDUX f0, CO1, INC +#else + STFDX f0, AO, INC2 + + STFDUX f0, CO1, INC +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +.L129: +#ifdef LN + slwi r0, K, 0 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + addi B, BO, 2 * SIZE +#endif + +#ifdef RN + addi KK, KK, 1 +#endif + +#ifdef RT + subi KK, KK, 1 +#endif + .align 4 + +.L50: + andi. J, N, 2 + beq .L90 + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + +#if defined(LN) || defined(RT) + addi AORIG, A, -2 * SIZE +#else + addi AO, A, -2 * SIZE +#endif +#ifndef RT + add C, CO2, LDC +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + + srawi. 
I, M, 3 + ble .L60 + .align 4 + +.L51: +#if defined(LT) || defined(RN) + fpmr f4, f0 + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f5, f0 + fpmr f2, f0 + fpmr f6, f0 + + srawi. r0, KK, 2 + fpmr f3, f0 + mtspr CTR, r0 + fpmr f7, f0 + ble .L54 +#else + +#ifdef LN + slwi r0, K, 3 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 3 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + fpmr f4, f0 + addi BO, BO, - 2 * SIZE + fpmr f1, f0 + fpmr f5, f0 + fpmr f2, f0 + fpmr f6, f0 + + srawi. r0, TEMP, 2 + fpmr f3, f0 + mtspr CTR, r0 + fpmr f7, f0 + ble .L54 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX B3, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + bdz- .L53 + .align 4 + +.L52: + fxcpmadd f0, B1, A1, f0 + LFPDUX B4, BO, INC2 + fxcsmadd f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + nop + fxcsmadd f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + fxcpmadd f2, B1, A3, f2 + nop + fxcsmadd f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + nop + fxcsmadd f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + + fxcpmadd f0, B2, A5, f0 + LFPDUX B1, BO, INC2 + fxcsmadd f4, B2, A5, f4 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B2, A6, f1 + nop + fxcsmadd f5, B2, A6, f5 + LFPDUX A6, AO, INC2 + + fxcpmadd f2, B2, A7, f2 + nop + fxcsmadd f6, B2, A7, f6 + LFPDUX A7, AO, INC2 + fxcpmadd f3, B2, A8, f3 + nop + fxcsmadd f7, B2, A8, f7 + LFPDUX A8, AO, INC2 + + fxcpmadd f0, B3, A1, f0 + LFPDUX B2, BO, INC2 + fxcsmadd f4, B3, A1, f4 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B3, A2, f1 + nop + fxcsmadd f5, B3, A2, f5 + LFPDUX A2, AO, INC2 + + fxcpmadd f2, B3, A3, f2 + nop + fxcsmadd f6, B3, A3, f6 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B3, A4, f3 + nop + fxcsmadd f7, B3, A4, f7 + LFPDUX A4, AO, INC2 + + fxcpmadd f0, B4, A5, f0 + LFPDUX B3, BO, INC2 + fxcsmadd f4, B4, A5, f4 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B4, A6, f1 + nop + fxcsmadd f5, B4, A6, f5 + LFPDUX A6, AO, INC2 + + fxcpmadd f2, B4, A7, f2 + nop + fxcsmadd f6, B4, A7, f6 + LFPDUX A7, AO, INC2 + fxcpmadd f3, B4, A8, f3 + nop + fxcsmadd f7, B4, A8, f7 + LFPDUX A8, AO, INC2 + bdnz+ .L52 + .align 4 + +.L53: + fxcpmadd f0, B1, A1, f0 + LFPDUX B4, BO, INC2 + fxcsmadd f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + nop + fxcsmadd f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + fxcpmadd f2, B1, A3, f2 + nop + fxcsmadd f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + nop + fxcsmadd f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + + fxcpmadd f0, B2, A5, f0 + nop + fxcsmadd f4, B2, A5, f4 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B2, A6, f1 + nop + fxcsmadd f5, B2, A6, f5 + LFPDUX A6, AO, INC2 + + fxcpmadd f2, B2, A7, f2 + nop + fxcsmadd f6, B2, A7, f6 + LFPDUX A7, AO, INC2 + fxcpmadd f3, B2, A8, f3 + nop + fxcsmadd f7, B2, A8, f7 + LFPDUX A8, AO, INC2 + + fxcpmadd f0, B3, A1, f0 + fxcsmadd f4, B3, A1, f4 + fxcpmadd f1, B3, A2, f1 + fxcsmadd f5, B3, A2, f5 + + fxcpmadd f2, B3, A3, f2 + fxcsmadd f6, B3, A3, f6 + fxcpmadd f3, B3, A4, f3 + fxcsmadd f7, B3, A4, f7 + + fxcpmadd f0, B4, A5, f0 + fxcsmadd f4, B4, A5, f4 + fxcpmadd f1, B4, A6, f1 + fxcsmadd f5, B4, A6, f5 + + fxcpmadd f2, B4, A7, f2 + fxcsmadd f6, B4, A7, f6 + fxcpmadd f3, B4, A8, f3 + fxcsmadd f7, B4, A8, f7 + .align 4 + +.L54: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L58 +#else + andi. 
r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L58 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + bdz- .L57 + .align 4 + +.L56: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + fxcpmadd f2, B1, A3, f2 + fxcsmadd f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + fxcpmadd f3, B1, A4, f3 + fxcsmadd f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + LFPDUX B1, BO, INC2 + bdnz+ .L56 + .align 4 + +.L57: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f5, B1, A2, f5 + + fxcpmadd f2, B1, A3, f2 + fxcsmadd f6, B1, A3, f6 + fxcpmadd f3, B1, A4, f3 + fxcsmadd f7, B1, A4, f7 + .align 4 + +.L58: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 8 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 3 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + fpmr f24, f0 + fpmr f25, f1 + fpmr f26, f2 + fpmr f27, f3 + + fsmfp f0, f4 + fsmfp f1, f5 + fsmfp f2, f6 + fsmfp f3, f7 + + fsmtp f4, f24 + fsmtp f5, f25 + fsmtp f6, f26 + fsmtp f7, f27 + + LFPDUX f16, BO, INC2 + LFPDUX f17, BO, INC2 + LFPDUX f18, BO, INC2 + LFPDUX f19, BO, INC2 + + LFPDUX f20, BO, INC2 + LFPDUX f21, BO, INC2 + LFPDUX f22, BO, INC2 + LFPDUX f23, BO, INC2 + + subi BO, BO, 16 * SIZE + + fpsub f0, f16, f0 + fpsub f4, f17, f4 + fpsub f1, f18, f1 + fpsub f5, f19, f5 + + fpsub f2, f20, f2 + fpsub f6, f21, f6 + fpsub f3, f22, f3 + fpsub f7, f23, f7 + +#else + LFPDUX f16, AO, INC2 + LFPDUX f17, AO, INC2 + LFPDUX f18, AO, INC2 + LFPDUX f19, AO, INC2 + + LFPDUX f20, AO, INC2 + LFPDUX f21, AO, INC2 + LFPDUX f22, AO, INC2 + LFPDUX f23, AO, INC2 + + subi AO, AO, 16 * SIZE + + fpsub f0, f16, f0 + fpsub f1, f17, f1 + fpsub f2, f18, f2 + fpsub f3, f19, f3 + fpsub f4, f20, f4 + fpsub f5, f21, f5 + fpsub f6, f22, f6 + fpsub f7, f23, f7 +#endif + +#ifdef LN + addi AO, AO, 66 * SIZE + + LFPDUX A1, AO, INCM2 + LFPDUX A2, AO, INCM2 + LFPDUX A3, AO, INCM2 + LFPDUX A4, AO, INCM2 + LFPDUX A5, AO, INCM2 + LFPDUX A6, AO, INCM2 + LFPDUX A7, AO, INCM2 + LFPDUX A8, AO, INCM2 + + fxsmul f7, A1, f7 + fxcpnmsub f3, A1, f7, f3 + fxcsnmsub f6, A2, f7, f6 + fxcpnmsub f2, A2, f7, f2 + + fxcsnmsub f5, A3, f7, f5 + fxcpnmsub f1, A3, f7, f1 + fxcsnmsub f4, A4, f7, f4 + fxcpnmsub f0, A4, f7, f0 + + fxpmul f3, A5, f3 + fxcsnmsub f6, A6, f3, f6 + fxcpnmsub f2, A6, f3, f2 + + fxcsnmsub f5, A7, f3, f5 + fxcpnmsub f1, A7, f3, f1 + fxcsnmsub f4, A8, f3, f4 + fxcpnmsub f0, A8, f3, f0 + + add AO, AO, INCM2 + LFPDUX A1, AO, INCM2 + LFPDUX A2, AO, INCM2 + LFPDUX A3, AO, INCM2 + + add AO, AO, INCM2 + LFPDUX A4, AO, INCM2 + LFPDUX A5, AO, INCM2 + LFPDUX A6, AO, INCM2 + + add AO, AO, INCM2 + add AO, AO, INCM2 + LFPDUX A7, AO, INCM2 + LFPDUX A8, AO, INCM2 + + fxsmul f6, A1, f6 + fxcpnmsub f2, A1, f6, f2 + fxcsnmsub f5, A2, f6, f5 + fxcpnmsub f1, A2, f6, f1 + fxcsnmsub f4, A3, f6, f4 + fxcpnmsub f0, A3, f6, f0 + + fxpmul f2, A4, f2 + fxcsnmsub f5, A5, f2, f5 + fxcpnmsub f1, A5, f2, f1 + fxcsnmsub f4, A6, f2, f4 + fxcpnmsub f0, A6, f2, f0 + + fxsmul f5, A7, f5 + fxcpnmsub f1, A7, f5, f1 + fxcsnmsub f4, A8, f5, f4 + fxcpnmsub f0, A8, f5, f0 + + add AO, AO, INCM2 + add AO, AO, INCM2 + LFPDUX A1, AO, INCM2 + LFPDUX A2, AO, INCM2 + + subi AO, AO, 6 * SIZE + LFPDUX A3, AO, INCM2 + subi AO, AO, 6 * SIZE + LFPDUX A4, AO, INCM2 + + addi AO, AO, -2 * SIZE + + fxpmul f1, A1, f1 + fxcsnmsub f4, A2, f1, f4 + fxcpnmsub f0, 
A2, f1, f0 + + fxsmul f4, A3, f4 + fxcpnmsub f0, A3, f4, f0 + + fxpmul f0, A4, f0 +#endif + +#ifdef LT + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + + fxpmul f0, A1, f0 + fxcsnmsub f4, A1, f0, f4 + fxcpnmsub f1, A2, f0, f1 + fxcsnmsub f5, A2, f0, f5 + fxcpnmsub f2, A3, f0, f2 + fxcsnmsub f6, A3, f0, f6 + fxcpnmsub f3, A4, f0, f3 + fxcsnmsub f7, A4, f0, f7 + + fxsmul f4, A5, f4 + fxcpnmsub f1, A6, f4, f1 + fxcsnmsub f5, A6, f4, f5 + fxcpnmsub f2, A7, f4, f2 + fxcsnmsub f6, A7, f4, f6 + fxcpnmsub f3, A8, f4, f3 + fxcsnmsub f7, A8, f4, f7 + + add AO, AO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + + add AO, AO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + + add AO, AO, INC2 + add AO, AO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + + fxpmul f1, A1, f1 + fxcsnmsub f5, A1, f1, f5 + fxcpnmsub f2, A2, f1, f2 + fxcsnmsub f6, A2, f1, f6 + fxcpnmsub f3, A3, f1, f3 + fxcsnmsub f7, A3, f1, f7 + + fxsmul f5, A4, f5 + fxcpnmsub f2, A5, f5, f2 + fxcsnmsub f6, A5, f5, f6 + fxcpnmsub f3, A6, f5, f3 + fxcsnmsub f7, A6, f5, f7 + + fxpmul f2, A7, f2 + fxcsnmsub f6, A7, f2, f6 + fxcpnmsub f3, A8, f2, f3 + fxcsnmsub f7, A8, f2, f7 + + add AO, AO, INC2 + add AO, AO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + + addi AO, AO, 6 * SIZE + LFPDUX A3, AO, INC2 + addi AO, AO, 6 * SIZE + LFPDUX A4, AO, INC2 + + subi AO, AO, 64 * SIZE + + fxsmul f6, A1, f6 + fxcpnmsub f3, A2, f6, f3 + fxcsnmsub f7, A2, f6, f7 + + fxpmul f3, A3, f3 + fxcsnmsub f7, A3, f3, f7 + + fxsmul f7, A4, f7 +#endif + +#ifdef RN + LFPDUX A1, BO, INC2 + LFPDUX A2, BO, INC2 + + subi BO, BO, 4 * SIZE + + fxpmul f0, A1, f0 + fxpmul f1, A1, f1 + fxpmul f2, A1, f2 + fxpmul f3, A1, f3 + + fxcsnmsub f4, A1, f0, f4 + fxcsnmsub f5, A1, f1, f5 + fxcsnmsub f6, A1, f2, f6 + fxcsnmsub f7, A1, f3, f7 + + fxsmul f4, A2, f4 + fxsmul f5, A2, f5 + fxsmul f6, A2, f6 + fxsmul f7, A2, f7 +#endif + +#ifdef RT + LFPDUX A2, BO, INC2 + LFPDUX A1, BO, INC2 + + subi BO, BO, 4 * SIZE + + fxsmul f4, A1, f4 + fxsmul f5, A1, f5 + fxsmul f6, A1, f6 + fxsmul f7, A1, f7 + + fxcpnmsub f0, A1, f4, f0 + fxcpnmsub f1, A1, f5, f1 + fxcpnmsub f2, A1, f6, f2 + fxcpnmsub f3, A1, f7, f3 + + fxpmul f0, A2, f0 + fxpmul f1, A2, f1 + fxpmul f2, A2, f2 + fxpmul f3, A2, f3 + +#endif + +#ifdef LN + subi CO1, CO1, 8 * SIZE + subi CO2, CO2, 8 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC2 + STFPDUX f4, BO, INC2 + STFPDUX f1, BO, INC2 + STFPDUX f5, BO, INC2 + STFPDUX f2, BO, INC2 + STFPDUX f6, BO, INC2 + STFPDUX f3, BO, INC2 + STFPDUX f7, BO, INC2 + + subi BO, BO, 16 * SIZE + + STFDUX f0, CO1, INC + STFDUX f4, CO1, INC + STFDUX f1, CO1, INC + STFDUX f5, CO1, INC + STFDUX f2, CO1, INC + STFDUX f6, CO1, INC + STFDUX f3, CO1, INC + STFDUX f7, CO1, INC + + STFSDUX f0, CO2, INC + STFSDUX f4, CO2, INC + STFSDUX f1, CO2, INC + STFSDUX f5, CO2, INC + STFSDUX f2, CO2, INC + STFSDUX f6, CO2, INC + STFSDUX f3, CO2, INC + STFSDUX f7, CO2, INC +#else + STFPDUX f0, AO, INC2 + STFPDUX f1, AO, INC2 + STFPDUX f2, AO, INC2 + STFPDUX f3, AO, INC2 + STFPDUX f4, AO, INC2 + STFPDUX f5, AO, INC2 + STFPDUX f6, AO, INC2 + STFPDUX f7, AO, INC2 + + subi AO, AO, 16 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + STFDUX f2, CO1, INC + STFSDUX f2, CO1, INC + STFDUX f3, CO1, INC + STFSDUX f3, CO1, INC + + STFDUX f4, CO2, INC + STFSDUX f4, CO2, INC + STFDUX 
f5, CO2, INC + STFSDUX f5, CO2, INC + STFDUX f6, CO2, INC + STFSDUX f6, CO2, INC + STFDUX f7, CO2, INC + STFSDUX f7, CO2, INC +#endif + +#ifdef LN + subi CO1, CO1, 8 * SIZE + subi CO2, CO2, 8 * SIZE +#endif + +#ifdef RT + slwi r0, K, 3 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 3 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 8 +#endif + +#ifdef LN + subi KK, KK, 8 +#endif + + addic. I, I, -1 + li r0, FZERO + + lfpsx f0, SP, r0 + bgt+ .L51 + .align 4 + +.L60: + andi. I, M, 4 + beq .L70 + +#if defined(LT) || defined(RN) + fpmr f1, f0 + addi BO, B, - 2 * SIZE + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, KK, 2 + mtspr CTR, r0 + ble .L64 +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 2 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + fpmr f1, f0 + addi BO, BO, - 2 * SIZE + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 + ble .L64 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX B3, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX B4, BO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + bdz- .L63 + .align 4 + +.L62: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f2, B1, A1, f2 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f3, B1, A2, f3 + LFPDUX A2, AO, INC2 + LFPDUX B1, BO, INC2 + + fxcpmadd f0, B2, A3, f0 + fxcsmadd f2, B2, A3, f2 + LFPDUX A3, AO, INC2 + fxcpmadd f1, B2, A4, f1 + fxcsmadd f3, B2, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + + fxcpmadd f0, B3, A5, f0 + fxcsmadd f2, B3, A5, f2 + LFPDUX A5, AO, INC2 + fxcpmadd f1, B3, A6, f1 + fxcsmadd f3, B3, A6, f3 + LFPDUX A6, AO, INC2 + LFPDUX B3, BO, INC2 + + fxcpmadd f0, B4, A7, f0 + fxcsmadd f2, B4, A7, f2 + LFPDUX A7, AO, INC2 + fxcpmadd f1, B4, A8, f1 + fxcsmadd f3, B4, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B4, BO, INC2 + bdnz+ .L62 + .align 4 + +.L63: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f2, B1, A1, f2 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f3, B1, A2, f3 + + fxcpmadd f0, B2, A3, f0 + fxcsmadd f2, B2, A3, f2 + fxcpmadd f1, B2, A4, f1 + fxcsmadd f3, B2, A4, f3 + + fxcpmadd f0, B3, A5, f0 + fxcsmadd f2, B3, A5, f2 + fxcpmadd f1, B3, A6, f1 + fxcsmadd f3, B3, A6, f3 + + fxcpmadd f0, B4, A7, f0 + fxcsmadd f2, B4, A7, f2 + fxcpmadd f1, B4, A8, f1 + fxcsmadd f3, B4, A8, f3 + .align 4 + +.L64: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L68 +#else + andi. 
r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L68 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + bdz- .L67 + .align 4 + +.L66: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f2, B1, A1, f2 + LFPDUX A1, AO, INC2 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f3, B1, A2, f3 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + bdnz+ .L66 + .align 4 + +.L67: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f2, B1, A1, f2 + fxcpmadd f1, B1, A2, f1 + fxcsmadd f3, B1, A2, f3 + .align 4 + +.L68: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + fpmr f24, f0 + fpmr f25, f1 + + fsmfp f0, f2 + fsmfp f1, f3 + fsmtp f2, f24 + fsmtp f3, f25 + + LFPDUX f16, BO, INC2 + LFPDUX f17, BO, INC2 + LFPDUX f18, BO, INC2 + LFPDUX f19, BO, INC2 + + subi BO, BO, 8 * SIZE + + fpsub f0, f16, f0 + fpsub f2, f17, f2 + fpsub f1, f18, f1 + fpsub f3, f19, f3 +#else + LFPDUX f16, AO, INC2 + LFPDUX f17, AO, INC2 + LFPDUX f18, AO, INC2 + LFPDUX f19, AO, INC2 + + subi AO, AO, 8 * SIZE + + fpsub f0, f16, f0 + fpsub f1, f17, f1 + fpsub f2, f18, f2 + fpsub f3, f19, f3 +#endif + +#ifdef LN + addi AO, AO, 18 * SIZE + + LFPDUX A1, AO, INCM2 + LFPDUX A2, AO, INCM2 + LFPDUX A3, AO, INCM2 + LFPDUX A4, AO, INCM2 + add AO, AO, INCM2 + LFPDUX A5, AO, INCM2 + add AO, AO, INCM2 + LFPDUX A6, AO, INCM2 + + subi AO, AO, 2 * SIZE + + fxsmul f3, A1, f3 + fxcpnmsub f1, A1, f3, f1 + fxcsnmsub f2, A2, f3, f2 + fxcpnmsub f0, A2, f3, f0 + + fxpmul f1, A3, f1 + fxcsnmsub f2, A4, f1, f2 + fxcpnmsub f0, A4, f1, f0 + + fxsmul f2, A5, f2 + fxcpnmsub f0, A5, f2, f0 + + fxpmul f0, A6, f0 +#endif + +#ifdef LT + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + add AO, AO, INC2 + LFPDUX A5, AO, INC2 + add AO, AO, INC2 + LFPDUX A6, AO, INC2 + + subi AO, AO, 16 * SIZE + + fxpmul f0, A1, f0 + fxcsnmsub f2, A1, f0, f2 + fxcpnmsub f1, A2, f0, f1 + fxcsnmsub f3, A2, f0, f3 + + fxsmul f2, A3, f2 + fxcpnmsub f1, A4, f2, f1 + fxcsnmsub f3, A4, f2, f3 + + fxpmul f1, A5, f1 + fxcsnmsub f3, A5, f1, f3 + + fxsmul f3, A6, f3 +#endif + +#ifdef RN + LFPDUX A1, BO, INC2 + LFPDUX A2, BO, INC2 + + subi BO, BO, 4 * SIZE + + fxpmul f0, A1, f0 + fxpmul f1, A1, f1 + + fxcsnmsub f2, A1, f0, f2 + fxcsnmsub f3, A1, f1, f3 + + fxsmul f2, A2, f2 + fxsmul f3, A2, f3 +#endif + +#ifdef RT + LFPDUX A2, BO, INC2 + LFPDUX A1, BO, INC2 + + subi BO, BO, 4 * SIZE + + fxsmul f2, A1, f2 + fxsmul f3, A1, f3 + + fxcpnmsub f0, A1, f2, f0 + fxcpnmsub f1, A1, f3, f1 + + fxpmul f0, A2, f0 + fxpmul f1, A2, f1 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC2 + STFPDUX f2, BO, INC2 + STFPDUX f1, BO, INC2 + STFPDUX f3, BO, INC2 + + subi BO, BO, 8 * SIZE + + STFDUX f0, CO1, INC + STFDUX f2, CO1, INC + STFDUX f1, CO1, INC + STFDUX f3, CO1, INC + + STFSDUX f0, CO2, INC + STFSDUX f2, CO2, INC + STFSDUX f1, CO2, INC + STFSDUX f3, CO2, INC +#else + STFPDUX f0, AO, INC2 + STFPDUX f1, AO, INC2 + STFPDUX f2, AO, INC2 + STFPDUX f3, AO, INC2 + + subi AO, AO, 8 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + + STFDUX f2, CO2, INC + STFSDUX f2, CO2, INC + STFDUX f3, CO2, INC + STFSDUX f3, CO2, INC +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + 
+#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L70: + andi. I, M, 2 + beq .L80 + +#if defined(LT) || defined(RN) + addi BO, B, - 2 * SIZE + fpmr f1, f0 + + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, KK, 3 + mtspr CTR, r0 + ble .L74 +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 1 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi BO, BO, - 2 * SIZE + fpmr f1, f0 + + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L74 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + + LFPDUX A5, AO, INC2 + LFPDUX B5, BO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX B6, BO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A9, BO, INC2 + LFPDUX A8, AO, INC2 + LFPDUX A10, BO, INC2 + bdz- .L73 + .align 4 + +.L72: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + fxcpmadd f2, B2, A2, f2 + fxcsmadd f3, B2, A2, f3 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + + fxcpmadd f0, B3, A3, f0 + fxcsmadd f1, B3, A3, f1 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + fxcpmadd f2, B4, A4, f2 + fxcsmadd f3, B4, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + + fxcpmadd f0, B5, A5, f0 + fxcsmadd f1, B5, A5, f1 + LFPDUX A5, AO, INC2 + LFPDUX B5, BO, INC2 + fxcpmadd f2, B6, A6, f2 + fxcsmadd f3, B6, A6, f3 + LFPDUX A6, AO, INC2 + LFPDUX B6, BO, INC2 + + fxcpmadd f0, A9, A7, f0 + fxcsmadd f1, A9, A7, f1 + LFPDUX A7, AO, INC2 + LFPDUX A9, BO, INC2 + fxcpmadd f2, A10, A8, f2 + fxcsmadd f3, A10, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX A10, BO, INC2 + bdnz+ .L72 + .align 4 + +.L73: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + fxcpmadd f2, B2, A2, f2 + fxcsmadd f3, B2, A2, f3 + + fxcpmadd f0, B3, A3, f0 + fxcsmadd f1, B3, A3, f1 + fxcpmadd f2, B4, A4, f2 + fxcsmadd f3, B4, A4, f3 + + fxcpmadd f0, B5, A5, f0 + fxcsmadd f1, B5, A5, f1 + fxcpmadd f2, B6, A6, f2 + fxcsmadd f3, B6, A6, f3 + + fxcpmadd f0, A9, A7, f0 + fxcsmadd f1, A9, A7, f1 + fxcpmadd f2, A10, A8, f2 + fxcsmadd f3, A10, A8, f3 + .align 4 + +.L74: +#if defined(LT) || defined(RN) + andi. r0, KK, 7 + mtspr CTR, r0 + ble+ .L78 +#else + andi. 
r0, TEMP, 7 + mtspr CTR, r0 + ble+ .L78 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + bdz- .L77 + .align 4 + +.L76: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + bdnz+ .L76 + .align 4 + +.L77: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f1, B1, A1, f1 + .align 4 + +.L78: + fpadd f0, f0, f2 + fpadd f1, f1, f3 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + fpmr f24, f0 + fsmfp f0, f1 + fsmtp f1, f24 + + LFPDUX f16, BO, INC2 + LFPDUX f17, BO, INC2 + + subi BO, BO, 4 * SIZE + + fpsub f0, f16, f0 + fpsub f1, f17, f1 +#else + LFPDUX f16, AO, INC2 + LFPDUX f17, AO, INC2 + + subi AO, AO, 4 * SIZE + + fpsub f0, f16, f0 + fpsub f1, f17, f1 +#endif + +#ifdef LN + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + + addi AO, AO, -4 * SIZE + + fxsmul f1, A2, f1 + fxcpnmsub f0, A2, f1, f0 + fxpmul f0, A1, f0 +#endif + +#ifdef LT + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + + addi AO, AO, -4 * SIZE + + fxpmul f0, A1, f0 + fxcsnmsub f1, A1, f0, f1 + + fxsmul f1, A2, f1 +#endif + +#ifdef RN + LFPDUX A1, BO, INC2 + LFPDUX A2, BO, INC2 + + subi BO, BO, 4 * SIZE + + fxpmul f0, A1, f0 + fxcsnmsub f1, A1, f0, f1 + + fxsmul f1, A2, f1 +#endif + +#ifdef RT + LFPDUX A2, BO, INC2 + LFPDUX A1, BO, INC2 + + subi BO, BO, 4 * SIZE + + fxsmul f1, A1, f1 + fxcpnmsub f0, A1, f1, f0 + fxpmul f0, A2, f0 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC2 + STFPDUX f1, BO, INC2 + + subi BO, BO, 4 * SIZE + + STFDUX f0, CO1, INC + STFDUX f1, CO1, INC + + STFSDUX f0, CO2, INC + STFSDUX f1, CO2, INC +#else + STFPDUX f0, AO, INC2 + STFPDUX f1, AO, INC2 + + subi AO, AO, 4 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + + STFDUX f1, CO2, INC + STFSDUX f1, CO2, INC +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L80: + andi. I, M, 1 + beq .L89 + +#if defined(LT) || defined(RN) + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. r0, KK, 3 + mtspr CTR, r0 + ble .L84 +#else + +#ifdef LN + slwi r0, K, 0 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 0 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi BO, BO, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + + srawi. 
r0, TEMP, 3 + mtspr CTR, r0 + ble .L84 + +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX B3, BO, INC2 + LFPDUX B4, BO, INC2 + bdz- .L83 + .align 4 + +.L82: + fxcpmadd f0, A1, B1, f0 + LFPDUX B1, BO, INC2 + fxcsmadd f1, A1, B2, f1 + LFPDUX B2, BO, INC2 + LFPDUX A1, AO, INC2 + fxcpmadd f2, A2, B3, f2 + LFPDUX B3, BO, INC2 + fxcsmadd f3, A2, B4, f3 + LFPDUX B4, BO, INC2 + LFPDUX A2, AO, INC2 + + fxcpmadd f0, A3, B1, f0 + LFPDUX B1, BO, INC2 + fxcsmadd f1, A3, B2, f1 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + fxcpmadd f2, A4, B3, f2 + LFPDUX B3, BO, INC2 + fxcsmadd f3, A4, B4, f3 + LFPDUX B4, BO, INC2 + LFPDUX A4, AO, INC2 + bdnz+ .L82 + .align 4 + +.L83: + fxcpmadd f0, A1, B1, f0 + LFPDUX B1, BO, INC2 + fxcsmadd f1, A1, B2, f1 + LFPDUX B2, BO, INC2 + fxcpmadd f2, A2, B3, f2 + LFPDUX B3, BO, INC2 + fxcsmadd f3, A2, B4, f3 + LFPDUX B4, BO, INC2 + + fxcpmadd f0, A3, B1, f0 + fxcsmadd f1, A3, B2, f1 + fxcpmadd f2, A4, B3, f2 + fxcsmadd f3, A4, B4, f3 + .align 4 + +.L84: +#if defined(LT) || defined(RN) + andi. r0, KK, 7 + mtspr CTR, r0 + ble+ .L88 +#else + andi. r0, TEMP, 7 + mtspr CTR, r0 + ble+ .L88 +#endif + + LFDX A1, AO, INC2 + LFPDUX B1, BO, INC2 + add AO, AO, INC + bdz- .L87 + .align 4 + +.L86: + fxcpmadd f0, A1, B1, f0 + LFDX A1, AO, INC2 + LFPDUX B1, BO, INC2 + add AO, AO, INC + bdnz+ .L86 + .align 4 + +.L87: + fxcpmadd f0, A1, B1, f0 + .align 4 + +.L88: + fpadd f0, f0, f1 + fpadd f2, f2, f3 + fpadd f0, f0, f2 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDX f16, BO, INC2 + + fpsub f0, f16, f0 +#else + LFPDX f16, AO, INC2 + + fpsub f0, f16, f0 +#endif + +#ifdef LN + LFPDX A1, AO, INC2 + + fxpmul f0, A1, f0 +#endif + +#ifdef LT + LFPDX A1, AO, INC2 + + fxpmul f0, A1, f0 +#endif + +#ifdef RN + LFD A1, (2 + 0) * SIZE(BO) + LFD A2, (2 + 1) * SIZE(BO) + LFD A3, (2 + 3) * SIZE(BO) + + fsmtp f1, f0 + + fmul f0, A1, f0 + fnmsub f1, A2, f0, f1 + + fmul f1, A3, f1 + fsmfp f0, f1 +#endif + +#ifdef RT + LFD A1, (2 + 3) * SIZE(BO) + LFD A2, (2 + 2) * SIZE(BO) + LFD A3, (2 + 0) * SIZE(BO) + + fsmtp f1, f0 + + fmul f1, A1, f1 + fnmsub f0, A2, f1, f0 + + fmul f0, A3, f0 + fsmfp f0, f1 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDX f0, BO, INC2 + + STFDUX f0, CO1, INC + STFSDUX f0, CO2, INC +#else + STFPDX f0, AO, INC2 + + STFDUX f0, CO1, INC + STFDUX f1, CO2, INC +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +.L89: +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + addi B, BO, 2 * SIZE +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + .align 4 + +.L90: + srawi. 
J, N, 2 + ble .L999 + .align 4 + +.L10: +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 2 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + +#if defined(LN) || defined(RT) + addi AORIG, A, -4 * SIZE +#else + addi AO, A, -4 * SIZE +#endif +#ifndef RT + add C, CO4, LDC +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + + srawi. I, M, 3 + ble .L20 + .align 4 + +.L11: +#if defined(LT) || defined(RN) + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 + + fpmr f5, f0 + fpmr f9, f0 + fpmr f13, f0 + fpmr f2, f0 + + fpmr f6, f0 + fpmr f10, f0 + fpmr f14, f0 + fpmr f3, f0 + + fpmr f7, f0 + fpmr f11, f0 + fpmr f15, f0 + nop + + srawi. r0, KK, 2 + fpmr f1, f0 + mtspr CTR, r0 + ble .L14 +#else + +#ifdef LN + slwi r0, K, 3 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 3 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, BO, - 4 * SIZE + fpmr f8, f0 + addi BO2, BO, 2 * SIZE + fpmr f12, f0 + + fpmr f5, f0 + fpmr f9, f0 + fpmr f13, f0 + fpmr f2, f0 + + fpmr f6, f0 + fpmr f10, f0 + fpmr f14, f0 + fpmr f3, f0 + + fpmr f7, f0 + fpmr f11, f0 + fpmr f15, f0 + nop + + srawi. r0, TEMP, 2 + fpmr f1, f0 + mtspr CTR, r0 + ble .L14 +#endif + + LFPDUX A1, AO, INC4 + fpmr f5, f0 + LFPDUX A3, AO, INC4 + fpmr f9, f0 + LFPDUX B1, BO, INC4 + fpmr f13, f0 + + LFPDUX A5, AO, INC4 + fpmr f2, f0 + LFPDUX A6, AO, INC4 + fpmr f6, f0 + LFPDUX B3, BO, INC4 + fpmr f10, f0 + LFPDUX A7, AO, INC4 + fpmr f14, f0 + + LFPDUX A8, AO, INC4 + fpmr f3, f0 + LFPDUX B5, BO, INC4 + fpmr f7, f0 + LFPDUX A9, AO, INC4 + fpmr f11, f0 + LFPDUX A2, AO2, INC4 + fpmr f15, f0 + LFPDUX B2, BO2, INC4 + bdz- .L13 + .align 4 + +.L12: + +## 1 ## + fxcpmadd f0, B1, A1, f0 + nop + fxcsmadd f4, B1, A1, f4 + nop + fxcpmadd f8, B2, A1, f8 + LFPDUX B4, BO2, INC4 + fxcsmadd f12, B2, A1, f12 + LFPDUX B6, BO, INC4 + + fxcpmadd f1, B1, A2, f1 + nop + fxcsmadd f5, B1, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B2, A2, f9 + LFPDUX A10, AO, INC4 + fxcsmadd f13, B2, A2, f13 + nop + + fxcpmadd f2, B1, A3, f2 + nop + fxcsmadd f6, B1, A3, f6 + nop + fxcpmadd f10, B2, A3, f10 + nop + fxcsmadd f14, B2, A3, f14 + nop + + fxcpmadd f3, B1, A4, f3 + nop + fxcsmadd f7, B1, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B2, A4, f11 + LFPDUX A1, AO, INC4 + fxcsmadd f15, B2, A4, f15 + nop + +## 2 ## + + fxcpmadd f0, B3, A5, f0 + nop + fxcsmadd f4, B3, A5, f4 + nop + fxcpmadd f8, B4, A5, f8 + LFPDUX B2, BO2, INC4 + fxcsmadd f12, B4, A5, f12 + LFPDUX B1, BO, INC4 + + fxcpmadd f1, B3, A2, f1 + nop + fxcsmadd f5, B3, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B4, A2, f9 + LFPDUX A3, AO, INC4 + fxcsmadd f13, B4, A2, f13 + nop + + fxcpmadd f2, B3, A6, f2 + nop + fxcsmadd f6, B3, A6, f6 + nop + fxcpmadd f10, B4, A6, f10 + nop + fxcsmadd f14, B4, A6, f14 + nop + + fxcpmadd f3, B3, A4, f3 + nop + fxcsmadd f7, B3, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B4, A4, f11 + LFPDUX A5, AO, INC4 + fxcsmadd f15, B4, A4, f15 + nop + +## 3 ## + + fxcpmadd f0, B5, A7, f0 + nop + fxcsmadd f4, B5, A7, f4 + nop + fxcpmadd f8, B2, A7, f8 + LFPDUX B4, BO2, INC4 + fxcsmadd f12, B2, A7, f12 + LFPDUX B3, BO, INC4 + + fxcpmadd f1, B5, A2, f1 + nop + fxcsmadd f5, B5, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B2, A2, f9 + LFPDUX A6, AO, INC4 + fxcsmadd f13, B2, A2, f13 + nop + 
+ fxcpmadd f2, B5, A8, f2 + nop + fxcsmadd f6, B5, A8, f6 + nop + fxcpmadd f10, B2, A8, f10 + nop + fxcsmadd f14, B2, A8, f14 + nop + + fxcpmadd f3, B5, A4, f3 + nop + fxcsmadd f7, B5, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B2, A4, f11 + LFPDUX A7, AO, INC4 + fxcsmadd f15, B2, A4, f15 + nop + +## 4 ## + fxcpmadd f0, B6, A9, f0 + nop + fxcsmadd f4, B6, A9, f4 + nop + fxcpmadd f8, B4, A9, f8 + LFPDUX B2, BO2, INC4 + fxcsmadd f12, B4, A9, f12 + LFPDUX B5, BO, INC4 + + fxcpmadd f1, B6, A2, f1 + nop + fxcsmadd f5, B6, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B4, A2, f9 + LFPDUX A8, AO, INC4 + fxcsmadd f13, B4, A2, f13 + nop + + fxcpmadd f2, B6, A10, f2 + nop + fxcsmadd f6, B6, A10, f6 + nop + fxcpmadd f10, B4, A10, f10 + nop + fxcsmadd f14, B4, A10, f14 + nop + + fxcpmadd f3, B6, A4, f3 + LFPDUX A2, AO2, INC4 + fxcsmadd f7, B6, A4, f7 + LFPDUX A9, AO, INC4 + fxcpmadd f11, B4, A4, f11 + nop + fxcsmadd f15, B4, A4, f15 + bdnz+ .L12 + .align 4 + +.L13: +## 1 ## + + fxcpmadd f0, B1, A1, f0 + nop + fxcsmadd f4, B1, A1, f4 + nop + fxcpmadd f8, B2, A1, f8 + LFPDUX B4, BO2, INC4 + fxcsmadd f12, B2, A1, f12 + LFPDUX B6, BO, INC4 + + fxcpmadd f1, B1, A2, f1 + nop + fxcsmadd f5, B1, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B2, A2, f9 + LFPDUX A10, AO, INC4 + fxcsmadd f13, B2, A2, f13 + nop + + fxcpmadd f2, B1, A3, f2 + nop + fxcsmadd f6, B1, A3, f6 + nop + fxcpmadd f10, B2, A3, f10 + nop + fxcsmadd f14, B2, A3, f14 + nop + + fxcpmadd f3, B1, A4, f3 + nop + fxcsmadd f7, B1, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B2, A4, f11 + nop + fxcsmadd f15, B2, A4, f15 + nop + +## 2 ## + + fxcpmadd f0, B3, A5, f0 + nop + fxcsmadd f4, B3, A5, f4 + nop + fxcpmadd f8, B4, A5, f8 + LFPDUX B2, BO2, INC4 + fxcsmadd f12, B4, A5, f12 + nop + + fxcpmadd f1, B3, A2, f1 + nop + fxcsmadd f5, B3, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B4, A2, f9 + nop + fxcsmadd f13, B4, A2, f13 + nop + + fxcpmadd f2, B3, A6, f2 + nop + fxcsmadd f6, B3, A6, f6 + nop + fxcpmadd f10, B4, A6, f10 + nop + fxcsmadd f14, B4, A6, f14 + nop + + fxcpmadd f3, B3, A4, f3 + nop + fxcsmadd f7, B3, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B4, A4, f11 + nop + fxcsmadd f15, B4, A4, f15 + nop + +## 3 ## + + fxcpmadd f0, B5, A7, f0 + nop + fxcsmadd f4, B5, A7, f4 + nop + fxcpmadd f8, B2, A7, f8 + LFPDUX B4, BO2, INC4 + fxcsmadd f12, B2, A7, f12 + nop + + fxcpmadd f1, B5, A2, f1 + nop + fxcsmadd f5, B5, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B2, A2, f9 + nop + + fxcsmadd f13, B2, A2, f13 + + fxcpmadd f2, B5, A8, f2 + nop + fxcsmadd f6, B5, A8, f6 + nop + fxcpmadd f10, B2, A8, f10 + nop + fxcsmadd f14, B2, A8, f14 + nop + + fxcpmadd f3, B5, A4, f3 + nop + fxcsmadd f7, B5, A4, f7 + LFPDUX A2, AO2, INC4 + fxcpmadd f11, B2, A4, f11 + nop + fxcsmadd f15, B2, A4, f15 + nop + +## 4 ## + + fxcpmadd f0, B6, A9, f0 + nop + fxcsmadd f4, B6, A9, f4 + nop + fxcpmadd f8, B4, A9, f8 + nop + fxcsmadd f12, B4, A9, f12 + nop + + fxcpmadd f1, B6, A2, f1 + nop + fxcsmadd f5, B6, A2, f5 + LFPDUX A4, AO2, INC4 + fxcpmadd f9, B4, A2, f9 + nop + fxcsmadd f13, B4, A2, f13 + nop + + fxcpmadd f2, B6, A10, f2 + nop + fxcsmadd f6, B6, A10, f6 + nop + fxcpmadd f10, B4, A10, f10 + nop + fxcsmadd f14, B4, A10, f14 + nop + + fxcpmadd f3, B6, A4, f3 + nop + fxcsmadd f7, B6, A4, f7 + nop + fxcpmadd f11, B4, A4, f11 + nop + fxcsmadd f15, B4, A4, f15 + nop + .align 4 + +.L14: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L18 +#else + andi. 
r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L18 +#endif + .align 4 + +.L15: + LFPDUX A2, AO, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A10, BO, INC4 + LFPDUX B4, BO2, INC4 + bdz- .L17 + .align 4 + +.L16: + fxcpmadd f0, A10, A2, f0 + fxcsmadd f4, A10, A2, f4 + fxcpmadd f8, B4, A2, f8 + fxcsmadd f12, B4, A2, f12 + LFPDUX A2, AO, INC4 + + fxcpmadd f1, A10, A4, f1 + fxcsmadd f5, A10, A4, f5 + fxcpmadd f9, B4, A4, f9 + fxcsmadd f13, B4, A4, f13 + LFPDUX A4, AO2, INC4 + + fxcpmadd f2, A10, A2, f2 + fxcsmadd f6, A10, A2, f6 + fxcpmadd f10, B4, A2, f10 + fxcsmadd f14, B4, A2, f14 + LFPDUX A2, AO, INC4 + + fxcpmadd f3, A10, A4, f3 + fxcsmadd f7, A10, A4, f7 + LFPDUX A10, BO, INC4 + fxcpmadd f11, B4, A4, f11 + fxcsmadd f15, B4, A4, f15 + LFPDUX A4, AO2, INC4 + LFPDUX B4, BO2, INC4 + bdnz+ .L16 + .align 4 + +.L17: + fxcpmadd f0, A10, A2, f0 + fxcsmadd f4, A10, A2, f4 + fxcpmadd f8, B4, A2, f8 + fxcsmadd f12, B4, A2, f12 + LFPDUX A2, AO, INC4 + + fxcpmadd f1, A10, A4, f1 + fxcsmadd f5, A10, A4, f5 + fxcpmadd f9, B4, A4, f9 + fxcsmadd f13, B4, A4, f13 + LFPDUX A4, AO2, INC4 + + fxcpmadd f2, A10, A2, f2 + fxcsmadd f6, A10, A2, f6 + fxcpmadd f10, B4, A2, f10 + fxcsmadd f14, B4, A2, f14 + + fxcpmadd f3, A10, A4, f3 + fxcsmadd f7, A10, A4, f7 + fxcpmadd f11, B4, A4, f11 + fxcsmadd f15, B4, A4, f15 + .align 4 + +.L18: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 8 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 3 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi AO2, AO, 2 * SIZE + addi BO, BO, - 4 * SIZE + addi BO2, BO, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + fpmr f24, f0 + LFPDUX f16, BO, INC4 + fpmr f25, f1 + nop + fpmr f26, f2 + LFPDUX f17, BO2, INC4 + fpmr f27, f3 + nop + + fpmr f28, f8 + LFPDUX f18, BO, INC4 + fpmr f29, f9 + nop + fpmr f30, f10 + LFPDUX f19, BO2, INC4 + fpmr f31, f11 + nop + + fsmfp f0, f4 + LFPDUX f20, BO, INC4 + fsmfp f1, f5 + nop + fsmfp f2, f6 + LFPDUX f21, BO2, INC4 + fsmfp f3, f7 + nop + + fsmfp f8, f12 + LFPDUX f22, BO, INC4 + fsmfp f9, f13 + nop + fsmfp f10, f14 + LFPDUX f23, BO2, INC4 + fsmfp f11, f15 + nop + + fsmtp f4, f24 + LFPDUX f24, BO, INC4 + fsmtp f5, f25 + nop + fsmtp f6, f26 + LFPDUX f25, BO2, INC4 + fsmtp f7, f27 + nop + + fsmtp f12, f28 + LFPDUX f26, BO, INC4 + fsmtp f13, f29 + nop + fsmtp f14, f30 + LFPDUX f27, BO2, INC4 + fsmtp f15, f31 + nop + + fpsub f0, f16, f0 + LFPDUX f28, BO, INC4 + fpsub f8, f17, f8 + nop + fpsub f4, f18, f4 + LFPDUX f29, BO2, INC4 + fpsub f12, f19, f12 + nop + + fpsub f1, f20, f1 + LFPDUX f30, BO, INC4 + fpsub f9, f21, f9 + subi BO, BO, 32 * SIZE + fpsub f5, f22, f5 + LFPDUX f31, BO2, INC4 + fpsub f13, f23, f13 + subi BO2, BO2, 32 * SIZE + + fpsub f2, f24, f2 + fpsub f10, f25, f10 + fpsub f6, f26, f6 + fpsub f14, f27, f14 + fpsub f3, f28, f3 + fpsub f11, f29, f11 + fpsub f7, f30, f7 + fpsub f15, f31, f15 + +#else + LFPDUX f16, AO, INC4 + LFPDUX f17, AO2, INC4 + LFPDUX f18, AO, INC4 + LFPDUX f19, AO2, INC4 + LFPDUX f20, AO, INC4 + LFPDUX f21, AO2, INC4 + LFPDUX f22, AO, INC4 + LFPDUX f23, AO2, INC4 + + fpsub f0, f16, f0 + LFPDUX f24, AO, INC4 + fpsub f1, f17, f1 + LFPDUX f25, AO2, INC4 + fpsub f2, f18, f2 + LFPDUX f26, AO, INC4 + fpsub f3, f19, f3 + LFPDUX f27, AO2, INC4 + fpsub f4, f20, f4 + LFPDUX f28, AO, INC4 + fpsub f5, f21, f5 + LFPDUX f29, AO2, INC4 + fpsub f6, f22, f6 + LFPDUX f30, AO, INC4 + fpsub f7, f23, f7 + LFPDUX f31, AO2, INC4 + + fpsub f8, f24, f8 + subi AO, AO, 32 * SIZE + fpsub f9, f25, f9 + subi AO2, AO2, 32 * SIZE + fpsub f10, f26, f10 + fpsub f11, f27, f11 + fpsub 
f12, f28, f12 + fpsub f13, f29, f13 + fpsub f14, f30, f14 + fpsub f15, f31, f15 +#endif + +#ifdef LN + addi AO, AO, 68 * SIZE + addi AO2, AO2, 68 * SIZE + + LFPDUX A1, AO2, INCM4 + LFPDUX A2, AO, INCM4 + LFPDUX A3, AO2, INCM4 + LFPDUX A4, AO, INCM4 + LFPDUX A5, AO2, INCM4 + LFPDUX A6, AO, INCM4 + LFPDUX A7, AO2, INCM4 + LFPDUX A8, AO, INCM4 + + fxsmul f7, A1, f7 + fxsmul f15, A1, f15 + + fxcpnmsub f3, A1, f7, f3 + fxcpnmsub f11, A1, f15, f11 + + fxcsnmsub f6, A2, f7, f6 + fxcsnmsub f14, A2, f15, f14 + + fxcpnmsub f2, A2, f7, f2 + fxcpnmsub f10, A2, f15, f10 + + fxcsnmsub f5, A3, f7, f5 + fxcsnmsub f13, A3, f15, f13 + + fxcpnmsub f1, A3, f7, f1 + fxcpnmsub f9, A3, f15, f9 + + fxcsnmsub f4, A4, f7, f4 + fxcsnmsub f12, A4, f15, f12 + + fxcpnmsub f0, A4, f7, f0 + fxcpnmsub f8, A4, f15, f8 + + fxpmul f3, A5, f3 + fxpmul f11, A5, f11 + + fxcsnmsub f6, A6, f3, f6 + fxcsnmsub f14, A6, f11, f14 + + fxcpnmsub f2, A6, f3, f2 + fxcpnmsub f10, A6, f11, f10 + + fxcsnmsub f5, A7, f3, f5 + fxcsnmsub f13, A7, f11, f13 + + fxcpnmsub f1, A7, f3, f1 + fxcpnmsub f9, A7, f11, f9 + + fxcsnmsub f4, A8, f3, f4 + fxcsnmsub f12, A8, f11, f12 + + fxcpnmsub f0, A8, f3, f0 + fxcpnmsub f8, A8, f11, f8 + + add AO2, AO2, INCM4 + LFPDUX A1, AO, INCM4 + LFPDUX A2, AO2, INCM4 + LFPDUX A3, AO, INCM4 + + add AO2, AO2, INCM4 + LFPDUX A4, AO, INCM4 + LFPDUX A5, AO2, INCM4 + LFPDUX A6, AO, INCM4 + + add AO2, AO2, INCM4 + add AO, AO, INCM4 + LFPDUX A7, AO2, INCM4 + LFPDUX A8, AO, INCM4 + + + fxsmul f6, A1, f6 + fxsmul f14, A1, f14 + + fxcpnmsub f2, A1, f6, f2 + fxcpnmsub f10, A1, f14, f10 + + fxcsnmsub f5, A2, f6, f5 + fxcsnmsub f13, A2, f14, f13 + + fxcpnmsub f1, A2, f6, f1 + fxcpnmsub f9, A2, f14, f9 + + fxcsnmsub f4, A3, f6, f4 + fxcsnmsub f12, A3, f14, f12 + + fxcpnmsub f0, A3, f6, f0 + fxcpnmsub f8, A3, f14, f8 + + fxpmul f2, A4, f2 + fxpmul f10, A4, f10 + + fxcsnmsub f5, A5, f2, f5 + fxcsnmsub f13, A5, f10, f13 + + fxcpnmsub f1, A5, f2, f1 + fxcpnmsub f9, A5, f10, f9 + + fxcsnmsub f4, A6, f2, f4 + fxcsnmsub f12, A6, f10, f12 + + fxcpnmsub f0, A6, f2, f0 + fxcpnmsub f8, A6, f10, f8 + + fxsmul f5, A7, f5 + fxsmul f13, A7, f13 + + fxcpnmsub f1, A7, f5, f1 + fxcpnmsub f9, A7, f13, f9 + + fxcsnmsub f4, A8, f5, f4 + fxcsnmsub f12, A8, f13, f12 + + fxcpnmsub f0, A8, f5, f0 + fxcpnmsub f8, A8, f13, f8 + + add AO2, AO2, INCM4 + add AO, AO, INCM4 + LFPDUX A1, AO2, INCM4 + LFPDUX A2, AO, INCM4 + + subi AO2, AO2, 8 * SIZE + add AO, AO, INCM4 + LFPDUX A3, AO, INCM4 + + subi AO2, AO2, 8 * SIZE + add AO, AO, INCM4 + LFPDUX A4, AO, INCM4 + + addi AO, AO, -4 * SIZE + addi AO2, AO2, -4 * SIZE + + fxpmul f1, A1, f1 + fxpmul f9, A1, f9 + + fxcsnmsub f4, A2, f1, f4 + fxcsnmsub f12, A2, f9, f12 + + fxcpnmsub f0, A2, f1, f0 + fxcpnmsub f8, A2, f9, f8 + + fxsmul f4, A3, f4 + fxsmul f12, A3, f12 + + fxcpnmsub f0, A3, f4, f0 + fxcpnmsub f8, A3, f12, f8 + + fxpmul f0, A4, f0 + fxpmul f8, A4, f8 + +#endif + +#ifdef LT + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX A3, AO, INC4 + LFPDUX A4, AO2, INC4 + + LFPDUX A5, AO, INC4 + LFPDUX A6, AO2, INC4 + LFPDUX A7, AO, INC4 + LFPDUX A8, AO2, INC4 + + fxpmul f0, A1, f0 + fxpmul f8, A1, f8 + + fxcsnmsub f4, A1, f0, f4 + fxcsnmsub f12, A1, f8, f12 + + fxcpnmsub f1, A2, f0, f1 + fxcpnmsub f9, A2, f8, f9 + + fxcsnmsub f5, A2, f0, f5 + fxcsnmsub f13, A2, f8, f13 + + fxcpnmsub f2, A3, f0, f2 + fxcpnmsub f10, A3, f8, f10 + + fxcsnmsub f6, A3, f0, f6 + fxcsnmsub f14, A3, f8, f14 + + fxcpnmsub f3, A4, f0, f3 + fxcpnmsub f11, A4, f8, f11 + + fxcsnmsub f7, A4, f0, f7 + fxcsnmsub f15, A4, f8, f15 + + fxsmul f4, A5, 
f4 + fxsmul f12, A5, f12 + + fxcpnmsub f1, A6, f4, f1 + fxcpnmsub f9, A6, f12, f9 + + fxcsnmsub f5, A6, f4, f5 + fxcsnmsub f13, A6, f12, f13 + + fxcpnmsub f2, A7, f4, f2 + fxcpnmsub f10, A7, f12, f10 + + fxcsnmsub f6, A7, f4, f6 + fxcsnmsub f14, A7, f12, f14 + + fxcpnmsub f3, A8, f4, f3 + fxcpnmsub f11, A8, f12, f11 + + fxcsnmsub f7, A8, f4, f7 + fxcsnmsub f15, A8, f12, f15 + + add AO, AO, INC4 + LFPDUX A1, AO2, INC4 + LFPDUX A2, AO, INC4 + LFPDUX A3, AO2, INC4 + + add AO, AO, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A5, AO, INC4 + LFPDUX A6, AO2, INC4 + + add AO, AO, INC4 + add AO2, AO2, INC4 + LFPDUX A7, AO, INC4 + LFPDUX A8, AO2, INC4 + + fxpmul f1, A1, f1 + fxpmul f9, A1, f9 + + fxcsnmsub f5, A1, f1, f5 + fxcsnmsub f13, A1, f9, f13 + + fxcpnmsub f2, A2, f1, f2 + fxcpnmsub f10, A2, f9, f10 + + fxcsnmsub f6, A2, f1, f6 + fxcsnmsub f14, A2, f9, f14 + + fxcpnmsub f3, A3, f1, f3 + fxcpnmsub f11, A3, f9, f11 + + fxcsnmsub f7, A3, f1, f7 + fxcsnmsub f15, A3, f9, f15 + + fxsmul f5, A4, f5 + fxsmul f13, A4, f13 + + fxcpnmsub f2, A5, f5, f2 + fxcpnmsub f10, A5, f13, f10 + + fxcsnmsub f6, A5, f5, f6 + fxcsnmsub f14, A5, f13, f14 + + fxcpnmsub f3, A6, f5, f3 + fxcpnmsub f11, A6, f13, f11 + + fxcsnmsub f7, A6, f5, f7 + fxcsnmsub f15, A6, f13, f15 + + fxpmul f2, A7, f2 + fxpmul f10, A7, f10 + + fxcsnmsub f6, A7, f2, f6 + fxcsnmsub f14, A7, f10, f14 + + fxcpnmsub f3, A8, f2, f3 + fxcpnmsub f11, A8, f10, f11 + + fxcsnmsub f7, A8, f2, f7 + fxcsnmsub f15, A8, f10, f15 + + add AO, AO, INC4 + add AO2, AO2, INC4 + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + + addi AO, AO, 8 * SIZE + addi AO2, AO2, 4 * SIZE + LFPDUX A3, AO2, INC4 + + addi AO, AO, 8 * SIZE + addi AO2, AO2, 4 * SIZE + LFPDUX A4, AO2, INC4 + + subi AO, AO, 64 * SIZE + subi AO2, AO2, 64 * SIZE + + fxsmul f6, A1, f6 + fxsmul f14, A1, f14 + + fxcpnmsub f3, A2, f6, f3 + fxcpnmsub f11, A2, f14, f11 + + fxcsnmsub f7, A2, f6, f7 + fxcsnmsub f15, A2, f14, f15 + + fxpmul f3, A3, f3 + fxpmul f11, A3, f11 + + fxcsnmsub f7, A3, f3, f7 + fxcsnmsub f15, A3, f11, f15 + + fxsmul f7, A4, f7 + fxsmul f15, A4, f15 +#endif + +#ifdef RN + LFPDUX A1, BO, INC4 + LFPDUX A2, BO2, INC4 + LFPDUX A3, BO, INC4 + LFPDUX A4, BO2, INC4 + + add BO, BO, INC4 + LFPDUX A5, BO2, INC4 + + add BO, BO, INC4 + LFPDUX A6, BO2, INC4 + subi BO, BO, 16 * SIZE + subi BO2, BO2, 16 * SIZE + + fxpmul f0, A1, f0 + fxpmul f1, A1, f1 + fxpmul f2, A1, f2 + fxpmul f3, A1, f3 + + fxcsnmsub f4, A1, f0, f4 + fxcsnmsub f5, A1, f1, f5 + fxcsnmsub f6, A1, f2, f6 + fxcsnmsub f7, A1, f3, f7 + + fxcpnmsub f8, A2, f0, f8 + fxcpnmsub f9, A2, f1, f9 + fxcpnmsub f10, A2, f2, f10 + fxcpnmsub f11, A2, f3, f11 + + fxcsnmsub f12, A2, f0, f12 + fxcsnmsub f13, A2, f1, f13 + fxcsnmsub f14, A2, f2, f14 + fxcsnmsub f15, A2, f3, f15 + + fxsmul f4, A3, f4 + fxsmul f5, A3, f5 + fxsmul f6, A3, f6 + fxsmul f7, A3, f7 + + fxcpnmsub f8, A4, f4, f8 + fxcpnmsub f9, A4, f5, f9 + fxcpnmsub f10, A4, f6, f10 + fxcpnmsub f11, A4, f7, f11 + + fxcsnmsub f12, A4, f4, f12 + fxcsnmsub f13, A4, f5, f13 + fxcsnmsub f14, A4, f6, f14 + fxcsnmsub f15, A4, f7, f15 + + fxpmul f8, A5, f8 + fxpmul f9, A5, f9 + fxpmul f10, A5, f10 + fxpmul f11, A5, f11 + + fxcsnmsub f12, A5, f8, f12 + fxcsnmsub f13, A5, f9, f13 + fxcsnmsub f14, A5, f10, f14 + fxcsnmsub f15, A5, f11, f15 + + fxsmul f12, A6, f12 + fxsmul f13, A6, f13 + fxsmul f14, A6, f14 + fxsmul f15, A6, f15 + +#endif + +#ifdef RT + addi BO, BO, 20 * SIZE + addi BO2, BO2, 20 * SIZE + + LFPDUX A1, BO2, INCM4 + LFPDUX A2, BO, INCM4 + + LFPDUX A3, BO2, INCM4 + LFPDUX A4, BO, INCM4 + + add BO2, BO2, 
INCM4 + LFPDUX A5, BO, INCM4 + + add BO2, BO2, INCM4 + LFPDUX A6, BO, INCM4 + subi BO, BO, 4 * SIZE + subi BO2, BO2, 4 * SIZE + + fxsmul f12, A1, f12 + fxsmul f13, A1, f13 + fxsmul f14, A1, f14 + fxsmul f15, A1, f15 + + fxcpnmsub f8, A1, f12, f8 + fxcpnmsub f9, A1, f13, f9 + fxcpnmsub f10, A1, f14, f10 + fxcpnmsub f11, A1, f15, f11 + + fxcsnmsub f4, A2, f12, f4 + fxcsnmsub f5, A2, f13, f5 + fxcsnmsub f6, A2, f14, f6 + fxcsnmsub f7, A2, f15, f7 + + fxcpnmsub f0, A2, f12, f0 + fxcpnmsub f1, A2, f13, f1 + fxcpnmsub f2, A2, f14, f2 + fxcpnmsub f3, A2, f15, f3 + + fxpmul f8, A3, f8 + fxpmul f9, A3, f9 + fxpmul f10, A3, f10 + fxpmul f11, A3, f11 + + fxcsnmsub f4, A4, f8, f4 + fxcsnmsub f5, A4, f9, f5 + fxcsnmsub f6, A4, f10, f6 + fxcsnmsub f7, A4, f11, f7 + + fxcpnmsub f0, A4, f8, f0 + fxcpnmsub f1, A4, f9, f1 + fxcpnmsub f2, A4, f10, f2 + fxcpnmsub f3, A4, f11, f3 + + fxsmul f4, A5, f4 + fxsmul f5, A5, f5 + fxsmul f6, A5, f6 + fxsmul f7, A5, f7 + + fxcpnmsub f0, A5, f4, f0 + fxcpnmsub f1, A5, f5, f1 + fxcpnmsub f2, A5, f6, f2 + fxcpnmsub f3, A5, f7, f3 + + fxpmul f0, A6, f0 + fxpmul f1, A6, f1 + fxpmul f2, A6, f2 + fxpmul f3, A6, f3 + +#endif + +#ifdef LN + subi CO1, CO1, 8 * SIZE + subi CO2, CO2, 8 * SIZE + subi CO3, CO3, 8 * SIZE + subi CO4, CO4, 8 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC4 + STFPDUX f8, BO2, INC4 + STFPDUX f4, BO, INC4 + STFPDUX f12, BO2, INC4 + STFPDUX f1, BO, INC4 + STFPDUX f9, BO2, INC4 + STFPDUX f5, BO, INC4 + STFPDUX f13, BO2, INC4 + STFPDUX f2, BO, INC4 + STFPDUX f10, BO2, INC4 + STFPDUX f6, BO, INC4 + STFPDUX f14, BO2, INC4 + STFPDUX f3, BO, INC4 + STFPDUX f11, BO2, INC4 + STFPDUX f7, BO, INC4 + STFPDUX f15, BO2, INC4 + + subi BO, BO, 32 * SIZE + subi BO2, BO2, 32 * SIZE + + STFDUX f0, CO1, INC + STFDUX f4, CO1, INC + STFDUX f1, CO1, INC + STFDUX f5, CO1, INC + STFDUX f2, CO1, INC + STFDUX f6, CO1, INC + STFDUX f3, CO1, INC + STFDUX f7, CO1, INC + + STFSDUX f0, CO2, INC + STFSDUX f4, CO2, INC + STFSDUX f1, CO2, INC + STFSDUX f5, CO2, INC + STFSDUX f2, CO2, INC + STFSDUX f6, CO2, INC + STFSDUX f3, CO2, INC + STFSDUX f7, CO2, INC + + STFDUX f8, CO3, INC + STFDUX f12, CO3, INC + STFDUX f9, CO3, INC + STFDUX f13, CO3, INC + STFDUX f10, CO3, INC + STFDUX f14, CO3, INC + STFDUX f11, CO3, INC + STFDUX f15, CO3, INC + + STFSDUX f8, CO4, INC + STFSDUX f12, CO4, INC + STFSDUX f9, CO4, INC + STFSDUX f13, CO4, INC + STFSDUX f10, CO4, INC + STFSDUX f14, CO4, INC + STFSDUX f11, CO4, INC + STFSDUX f15, CO4, INC + +#else + STFPDUX f0, AO, INC4 + STFPDUX f1, AO2, INC4 + STFPDUX f2, AO, INC4 + STFPDUX f3, AO2, INC4 + STFPDUX f4, AO, INC4 + STFPDUX f5, AO2, INC4 + STFPDUX f6, AO, INC4 + STFPDUX f7, AO2, INC4 + STFPDUX f8, AO, INC4 + STFPDUX f9, AO2, INC4 + STFPDUX f10, AO, INC4 + STFPDUX f11, AO2, INC4 + STFPDUX f12, AO, INC4 + STFPDUX f13, AO2, INC4 + STFPDUX f14, AO, INC4 + STFPDUX f15, AO2, INC4 + + subi AO, AO, 32 * SIZE + subi AO2, AO2, 32 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + STFDUX f2, CO1, INC + STFSDUX f2, CO1, INC + STFDUX f3, CO1, INC + STFSDUX f3, CO1, INC + + STFDUX f4, CO2, INC + STFSDUX f4, CO2, INC + STFDUX f5, CO2, INC + STFSDUX f5, CO2, INC + STFDUX f6, CO2, INC + STFSDUX f6, CO2, INC + STFDUX f7, CO2, INC + STFSDUX f7, CO2, INC + + STFDUX f8, CO3, INC + STFSDUX f8, CO3, INC + STFDUX f9, CO3, INC + STFSDUX f9, CO3, INC + STFDUX f10, CO3, INC + STFSDUX f10, CO3, INC + STFDUX f11, CO3, INC + STFSDUX f11, CO3, INC + + STFDUX f12, CO4, INC + STFSDUX f12, CO4, INC + STFDUX f13, CO4, 
INC + STFSDUX f13, CO4, INC + STFDUX f14, CO4, INC + STFSDUX f14, CO4, INC + STFDUX f15, CO4, INC + STFSDUX f15, CO4, INC +#endif + +#ifdef LN + subi CO1, CO1, 8 * SIZE + subi CO2, CO2, 8 * SIZE + subi CO3, CO3, 8 * SIZE + subi CO4, CO4, 8 * SIZE +#endif + +#ifdef RT + slwi r0, K, 3 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 3 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 8 +#endif + +#ifdef LN + subi KK, KK, 8 +#endif + + addic. I, I, -1 + li r0, FZERO + + lfpsx f0, SP, r0 + bgt+ .L11 + .align 4 + +.L20: + andi. I, M, 4 + beq .L30 + +#if defined(LT) || defined(RN) + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 + + srawi. r0, KK, 2 + fpmr f1, f0 + fpmr f5, f0 + fpmr f9, f0 + mtspr CTR, r0 + fpmr f13, f0 + ble .L24 +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 2 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, BO, - 4 * SIZE + fpmr f8, f0 + addi BO2, BO, 2 * SIZE + fpmr f12, f0 + + srawi. r0, TEMP, 2 + fpmr f1, f0 + fpmr f5, f0 + fpmr f9, f0 + mtspr CTR, r0 + fpmr f13, f0 + ble .L24 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B2, BO2, INC4 + LFPDUX A3, AO, INC4 + LFPDUX B3, BO, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX B4, BO2, INC4 + + LFPDUX A5, AO, INC4 + LFPDUX B5, BO, INC4 + LFPDUX A6, AO2, INC4 + LFPDUX B6, BO2, INC4 + LFPDUX A7, AO, INC4 + LFPDUX A9, BO, INC4 + LFPDUX A10, BO2, INC4 + bdz- .L23 + .align 4 + +.L22: + fxcpmadd f0, B1, A1, f0 + nop + fxcsmadd f4, B1, A1, f4 + LFPDUX A8, AO2, INC4 + fxcpmadd f8, B2, A1, f8 + nop + fxcsmadd f12, B2, A1, f12 + LFPDUX A1, AO, INC4 + + fxcpmadd f1, B1, A2, f1 + nop + fxcsmadd f5, B1, A2, f5 + LFPDUX B1, BO, INC4 + fxcpmadd f9, B2, A2, f9 + nop + fxcsmadd f13, B2, A2, f13 + LFPDUX B2, BO2, INC4 + + fxcpmadd f0, B3, A3, f0 + nop + fxcsmadd f4, B3, A3, f4 + LFPDUX A2, AO2, INC4 + fxcpmadd f8, B4, A3, f8 + nop + fxcsmadd f12, B4, A3, f12 + LFPDUX A3, AO, INC4 + + fxcpmadd f1, B3, A4, f1 + nop + fxcsmadd f5, B3, A4, f5 + LFPDUX B3, BO, INC4 + fxcpmadd f9, B4, A4, f9 + nop + fxcsmadd f13, B4, A4, f13 + LFPDUX B4, BO2, INC4 + + fxcpmadd f0, B5, A5, f0 + nop + fxcsmadd f4, B5, A5, f4 + LFPDUX A4, AO2, INC4 + fxcpmadd f8, B6, A5, f8 + nop + fxcsmadd f12, B6, A5, f12 + LFPDUX A5, AO, INC4 + + fxcpmadd f1, B5, A6, f1 + nop + fxcsmadd f5, B5, A6, f5 + LFPDUX B5, BO, INC4 + fxcpmadd f9, B6, A6, f9 + nop + fxcsmadd f13, B6, A6, f13 + LFPDUX B6, BO2, INC4 + + fxcpmadd f0, A9, A7, f0 + nop + fxcsmadd f4, A9, A7, f4 + LFPDUX A6, AO2, INC4 + fxcpmadd f8, A10, A7, f8 + nop + fxcsmadd f12, A10, A7, f12 + LFPDUX A7, AO, INC4 + + fxcpmadd f1, A9, A8, f1 + nop + fxcsmadd f5, A9, A8, f5 + LFPDUX A9, BO, INC4 + fxcpmadd f9, A10, A8, f9 + nop + fxcsmadd f13, A10, A8, f13 + LFPDUX A10, BO2, INC4 + bdnz+ .L22 + .align 4 + +.L23: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + LFPDUX A8, AO2, INC4 + fxcpmadd f8, B2, A1, f8 + fxcsmadd f12, B2, A1, f12 + + fxcpmadd f1, B1, A2, f1 + fxcsmadd f5, B1, A2, f5 + fxcpmadd f9, B2, A2, f9 + fxcsmadd f13, B2, A2, f13 + + fxcpmadd f0, B3, A3, f0 + fxcsmadd f4, B3, A3, f4 + fxcpmadd f8, B4, A3, f8 + fxcsmadd f12, B4, A3, f12 + + fxcpmadd f1, B3, A4, f1 + fxcsmadd f5, B3, A4, f5 + fxcpmadd f9, B4, A4, f9 + fxcsmadd f13, B4, 
A4, f13 + + fxcpmadd f0, B5, A5, f0 + fxcsmadd f4, B5, A5, f4 + fxcpmadd f8, B6, A5, f8 + fxcsmadd f12, B6, A5, f12 + + fxcpmadd f1, B5, A6, f1 + fxcsmadd f5, B5, A6, f5 + fxcpmadd f9, B6, A6, f9 + fxcsmadd f13, B6, A6, f13 + + fxcpmadd f0, A9, A7, f0 + fxcsmadd f4, A9, A7, f4 + fxcpmadd f8, A10, A7, f8 + fxcsmadd f12, A10, A7, f12 + + fxcpmadd f1, A9, A8, f1 + fxcsmadd f5, A9, A8, f5 + fxcpmadd f9, A10, A8, f9 + fxcsmadd f13, A10, A8, f13 + .align 4 + +.L24: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L28 +#else + andi. r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L28 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + bdz- .L27 + .align 4 + +.L26: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + fxcpmadd f8, B2, A1, f8 + fxcsmadd f12, B2, A1, f12 + LFPDUX A1, AO, INC4 + + fxcpmadd f1, B1, A2, f1 + fxcsmadd f5, B1, A2, f5 + LFPDUX B1, BO, INC4 + fxcpmadd f9, B2, A2, f9 + fxcsmadd f13, B2, A2, f13 + LFPDUX A2, AO2, INC4 + LFPDUX B2, BO2, INC4 + bdnz+ .L26 + .align 4 + +.L27: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + fxcpmadd f8, B2, A1, f8 + fxcsmadd f12, B2, A1, f12 + + fxcpmadd f1, B1, A2, f1 + fxcsmadd f5, B1, A2, f5 + fxcpmadd f9, B2, A2, f9 + fxcsmadd f13, B2, A2, f13 + .align 4 + +.L28: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi AO2, AO, 2 * SIZE + addi BO, BO, - 4 * SIZE + addi BO2, BO, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + fpmr f24, f0 + fpmr f25, f1 + fpmr f28, f8 + fpmr f29, f9 + + fsmfp f0, f4 + fsmfp f1, f5 + fsmfp f8, f12 + fsmfp f9, f13 + + fsmtp f4, f24 + fsmtp f5, f25 + fsmtp f12, f28 + fsmtp f13, f29 + + LFPDUX f16, BO, INC4 + LFPDUX f17, BO2, INC4 + LFPDUX f18, BO, INC4 + LFPDUX f19, BO2, INC4 + + LFPDUX f20, BO, INC4 + LFPDUX f21, BO2, INC4 + LFPDUX f22, BO, INC4 + LFPDUX f23, BO2, INC4 + + subi BO, BO, 16 * SIZE + subi BO2, BO2, 16 * SIZE + + fpsub f0, f16, f0 + fpsub f8, f17, f8 + fpsub f4, f18, f4 + fpsub f12, f19, f12 + + fpsub f1, f20, f1 + fpsub f9, f21, f9 + fpsub f5, f22, f5 + fpsub f13, f23, f13 +#else + LFPDUX f16, AO, INC4 + LFPDUX f17, AO2, INC4 + LFPDUX f18, AO, INC4 + LFPDUX f19, AO2, INC4 + LFPDUX f20, AO, INC4 + LFPDUX f21, AO2, INC4 + LFPDUX f22, AO, INC4 + LFPDUX f23, AO2, INC4 + + subi AO, AO, 16 * SIZE + subi AO2, AO2, 16 * SIZE + + fpsub f0, f16, f0 + fpsub f1, f17, f1 + fpsub f4, f18, f4 + fpsub f5, f19, f5 + + fpsub f8, f20, f8 + fpsub f9, f21, f9 + fpsub f12, f22, f12 + fpsub f13, f23, f13 +#endif + +#ifdef LN + addi AO, AO, 20 * SIZE + addi AO2, AO2, 20 * SIZE + + LFPDUX A1, AO2, INCM4 + LFPDUX A2, AO, INCM4 + LFPDUX A3, AO2, INCM4 + LFPDUX A4, AO, INCM4 + + add AO2, AO2, INCM4 + LFPDUX A5, AO, INCM4 + add AO2, AO2, INCM4 + LFPDUX A6, AO, INCM4 + + addi AO, AO, -4 * SIZE + addi AO2, AO2, -4 * SIZE + + fxsmul f5, A1, f5 + fxsmul f13, A1, f13 + + fxcpnmsub f1, A1, f5, f1 + fxcpnmsub f9, A1, f13, f9 + + fxcsnmsub f4, A2, f5, f4 + fxcsnmsub f12, A2, f13, f12 + + fxcpnmsub f0, A2, f5, f0 + fxcpnmsub f8, A2, f13, f8 + + fxpmul f1, A3, f1 + fxpmul f9, A3, f9 + + fxcsnmsub f4, A4, f1, f4 + fxcsnmsub f12, A4, f9, f12 + + fxcpnmsub f0, A4, f1, f0 + fxcpnmsub f8, A4, f9, f8 + + fxsmul f4, A5, f4 + fxsmul f12, A5, f12 + + fxcpnmsub f0, A5, f4, f0 + fxcpnmsub f8, A5, f12, f8 + + fxpmul f0, A6, f0 + fxpmul f8, A6, f8 +#endif + +#ifdef LT + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX A3, AO, INC4 + 
LFPDUX A4, AO2, INC4 + + add AO, AO, INC4 + LFPDUX A5, AO2, INC4 + add AO, AO, INC4 + LFPDUX A6, AO2, INC4 + + subi AO, AO, 16 * SIZE + subi AO2, AO2, 16 * SIZE + + fxpmul f0, A1, f0 + fxpmul f8, A1, f8 + + fxcsnmsub f4, A1, f0, f4 + fxcsnmsub f12, A1, f8, f12 + + fxcpnmsub f1, A2, f0, f1 + fxcpnmsub f9, A2, f8, f9 + + fxcsnmsub f5, A2, f0, f5 + fxcsnmsub f13, A2, f8, f13 + + fxsmul f4, A3, f4 + fxsmul f12, A3, f12 + + fxcpnmsub f1, A4, f4, f1 + fxcpnmsub f9, A4, f12, f9 + + fxcsnmsub f5, A4, f4, f5 + fxcsnmsub f13, A4, f12, f13 + + fxpmul f1, A5, f1 + fxpmul f9, A5, f9 + + fxcsnmsub f5, A5, f1, f5 + fxcsnmsub f13, A5, f9, f13 + + fxsmul f5, A6, f5 + fxsmul f13, A6, f13 +#endif + +#ifdef RN + LFPDUX A1, BO, INC4 + LFPDUX A2, BO2, INC4 + LFPDUX A3, BO, INC4 + LFPDUX A4, BO2, INC4 + + add BO, BO, INC4 + LFPDUX A5, BO2, INC4 + + add BO, BO, INC4 + LFPDUX A6, BO2, INC4 + + subi BO, BO, 16 * SIZE + subi BO2, BO2, 16 * SIZE + + fxpmul f0, A1, f0 + fxpmul f1, A1, f1 + fxcsnmsub f4, A1, f0, f4 + fxcsnmsub f5, A1, f1, f5 + + fxcpnmsub f8, A2, f0, f8 + fxcpnmsub f9, A2, f1, f9 + fxcsnmsub f12, A2, f0, f12 + fxcsnmsub f13, A2, f1, f13 + + fxsmul f4, A3, f4 + fxsmul f5, A3, f5 + fxcpnmsub f8, A4, f4, f8 + fxcpnmsub f9, A4, f5, f9 + + fxcsnmsub f12, A4, f4, f12 + fxcsnmsub f13, A4, f5, f13 + + fxpmul f8, A5, f8 + fxpmul f9, A5, f9 + fxcsnmsub f12, A5, f8, f12 + fxcsnmsub f13, A5, f9, f13 + + fxsmul f12, A6, f12 + fxsmul f13, A6, f13 +#endif + +#ifdef RT + addi BO, BO, 20 * SIZE + addi BO2, BO2, 20 * SIZE + + LFPDUX A1, BO2, INCM4 + LFPDUX A2, BO, INCM4 + + LFPDUX A3, BO2, INCM4 + LFPDUX A4, BO, INCM4 + + add BO2, BO2, INCM4 + LFPDUX A5, BO, INCM4 + + add BO2, BO2, INCM4 + LFPDUX A6, BO, INCM4 + subi BO, BO, 4 * SIZE + subi BO2, BO2, 4 * SIZE + + fxsmul f12, A1, f12 + fxsmul f13, A1, f13 + fxcpnmsub f8, A1, f12, f8 + fxcpnmsub f9, A1, f13, f9 + + fxcsnmsub f4, A2, f12, f4 + fxcsnmsub f5, A2, f13, f5 + fxcpnmsub f0, A2, f12, f0 + fxcpnmsub f1, A2, f13, f1 + + fxpmul f8, A3, f8 + fxpmul f9, A3, f9 + fxcsnmsub f4, A4, f8, f4 + fxcsnmsub f5, A4, f9, f5 + + fxcpnmsub f0, A4, f8, f0 + fxcpnmsub f1, A4, f9, f1 + + fxsmul f4, A5, f4 + fxsmul f5, A5, f5 + fxcpnmsub f0, A5, f4, f0 + fxcpnmsub f1, A5, f5, f1 + + fxpmul f0, A6, f0 + fxpmul f1, A6, f1 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE + subi CO3, CO3, 4 * SIZE + subi CO4, CO4, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC4 + STFPDUX f8, BO2, INC4 + STFPDUX f4, BO, INC4 + STFPDUX f12, BO2, INC4 + STFPDUX f1, BO, INC4 + STFPDUX f9, BO2, INC4 + STFPDUX f5, BO, INC4 + STFPDUX f13, BO2, INC4 + + subi BO, BO, 16 * SIZE + subi BO2, BO2, 16 * SIZE + + STFDUX f0, CO1, INC + STFDUX f4, CO1, INC + STFDUX f1, CO1, INC + STFDUX f5, CO1, INC + + STFSDUX f0, CO2, INC + STFSDUX f4, CO2, INC + STFSDUX f1, CO2, INC + STFSDUX f5, CO2, INC + + STFDUX f8, CO3, INC + STFDUX f12, CO3, INC + STFDUX f9, CO3, INC + STFDUX f13, CO3, INC + + STFSDUX f8, CO4, INC + STFSDUX f12, CO4, INC + STFSDUX f9, CO4, INC + STFSDUX f13, CO4, INC +#else + STFPDUX f0, AO, INC4 + STFPDUX f1, AO2, INC4 + STFPDUX f4, AO, INC4 + STFPDUX f5, AO2, INC4 + STFPDUX f8, AO, INC4 + STFPDUX f9, AO2, INC4 + STFPDUX f12, AO, INC4 + STFPDUX f13, AO2, INC4 + + subi AO, AO, 16 * SIZE + subi AO2, AO2, 16 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + STFDUX f4, CO2, INC + STFSDUX f4, CO2, INC + STFDUX f5, CO2, INC + STFSDUX f5, CO2, INC + + STFDUX f8, CO3, INC + STFSDUX f8, CO3, INC + STFDUX f9, CO3, INC + 
STFSDUX f9, CO3, INC + STFDUX f12, CO4, INC + STFSDUX f12, CO4, INC + STFDUX f13, CO4, INC + STFSDUX f13, CO4, INC +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE + subi CO3, CO3, 4 * SIZE + subi CO4, CO4, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L30: + andi. I, M, 2 + beq .L40 + +#if defined(LT) || defined(RN) + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 + + srawi. r0, KK, 2 + mtspr CTR, r0 + ble .L34 +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 1 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, BO, - 4 * SIZE + fpmr f8, f0 + addi BO2, BO, 2 * SIZE + fpmr f12, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 + ble .L34 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B3, BO, INC4 + LFPDUX B4, BO2, INC4 + + LFPDUX A3, AO, INC4 + LFPDUX A5, BO, INC4 + LFPDUX A6, BO2, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A7, BO, INC4 + LFPDUX A8, BO2, INC4 + bdz- .L33 + .align 4 + +.L32: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + LFPDUX B1, BO, INC4 + fxcpmadd f8, B2, A1, f8 + fxcsmadd f12, B2, A1, f12 + LFPDUX B2, BO2, INC4 + LFPDUX A1, AO, INC4 + + fxcpmadd f0, B3, A2, f0 + fxcsmadd f4, B3, A2, f4 + LFPDUX B3, BO, INC4 + fxcpmadd f8, B4, A2, f8 + fxcsmadd f12, B4, A2, f12 + LFPDUX B4, BO2, INC4 + LFPDUX A2, AO2, INC4 + + fxcpmadd f0, A5, A3, f0 + fxcsmadd f4, A5, A3, f4 + LFPDUX A5, BO, INC4 + fxcpmadd f8, A6, A3, f8 + fxcsmadd f12, A6, A3, f12 + LFPDUX A6, BO2, INC4 + LFPDUX A3, AO, INC4 + + fxcpmadd f0, A7, A4, f0 + fxcsmadd f4, A7, A4, f4 + LFPDUX A7, BO, INC4 + fxcpmadd f8, A8, A4, f8 + fxcsmadd f12, A8, A4, f12 + LFPDUX A8, BO2, INC4 + LFPDUX A4, AO2, INC4 + bdnz+ .L32 + .align 4 + +.L33: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + fxcpmadd f8, B2, A1, f8 + fxcsmadd f12, B2, A1, f12 + + fxcpmadd f0, B3, A2, f0 + fxcsmadd f4, B3, A2, f4 + fxcpmadd f8, B4, A2, f8 + fxcsmadd f12, B4, A2, f12 + + fxcpmadd f0, A5, A3, f0 + fxcsmadd f4, A5, A3, f4 + fxcpmadd f8, A6, A3, f8 + fxcsmadd f12, A6, A3, f12 + + fxcpmadd f0, A7, A4, f0 + fxcsmadd f4, A7, A4, f4 + fxcpmadd f8, A8, A4, f8 + fxcsmadd f12, A8, A4, f12 + .align 4 + +.L34: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L38 +#else + andi. 
r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L38 +#endif + + LFPDX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC2 + bdz- .L37 + .align 4 + +.L36: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + LFPDUX B1, BO, INC4 + fxcpmadd f8, B2, A1, f8 + fxcsmadd f12, B2, A1, f12 + LFPDX A1, AO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC2 + bdnz+ .L36 + .align 4 + +.L37: + fxcpmadd f0, B1, A1, f0 + fxcsmadd f4, B1, A1, f4 + fxcpmadd f8, B2, A1, f8 + fxcsmadd f12, B2, A1, f12 + .align 4 + +.L38: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi AO2, AO, 2 * SIZE + addi BO, BO, - 4 * SIZE + addi BO2, BO, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + fpmr f24, f0 + fpmr f28, f8 + + fsmfp f0, f4 + fsmfp f8, f12 + fsmtp f4, f24 + fsmtp f12, f28 + + LFPDUX f16, BO, INC4 + LFPDUX f17, BO2, INC4 + LFPDUX f18, BO, INC4 + LFPDUX f19, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE + + fpsub f0, f16, f0 + fpsub f8, f17, f8 + fpsub f4, f18, f4 + fpsub f12, f19, f12 +#else + LFPDUX f16, AO, INC4 + LFPDUX f17, AO2, INC4 + LFPDUX f18, AO, INC4 + LFPDUX f19, AO2, INC4 + + subi AO, AO, 8 * SIZE + subi AO2, AO2, 8 * SIZE + + fpsub f0, f16, f0 + fpsub f4, f17, f4 + fpsub f8, f18, f8 + fpsub f12, f19, f12 +#endif + +#ifdef LN + addi AO, AO, 8 * SIZE + addi AO2, AO2, 8 * SIZE + + LFPDUX A1, AO2, INCM4 + LFPDUX A2, AO, INCM4 + + addi AO, AO, -4 * SIZE + addi AO2, AO2, -4 * SIZE + + fxsmul f4, A1, f4 + fxsmul f12, A1, f12 + + fxcpnmsub f0, A1, f4, f0 + fxcpnmsub f8, A1, f12, f8 + + fxpmul f0, A2, f0 + fxpmul f8, A2, f8 +#endif + +#ifdef LT + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + + subi AO, AO, 4 * SIZE + subi AO2, AO2, 4 * SIZE + + fxpmul f0, A1, f0 + fxpmul f8, A1, f8 + + fxcsnmsub f4, A1, f0, f4 + fxcsnmsub f12, A1, f8, f12 + + fxsmul f4, A2, f4 + fxsmul f12, A2, f12 +#endif + +#ifdef RN + LFPDUX A1, BO, INC4 + LFPDUX A2, BO2, INC4 + LFPDUX A3, BO, INC4 + LFPDUX A4, BO2, INC4 + + add BO, BO, INC4 + LFPDUX A5, BO2, INC4 + + add BO, BO, INC4 + LFPDUX A6, BO2, INC4 + + subi BO, BO, 16 * SIZE + subi BO2, BO2, 16 * SIZE + + fxpmul f0, A1, f0 + fxcsnmsub f4, A1, f0, f4 + fxcpnmsub f8, A2, f0, f8 + fxcsnmsub f12, A2, f0, f12 + + fxsmul f4, A3, f4 + fxcpnmsub f8, A4, f4, f8 + fxcsnmsub f12, A4, f4, f12 + + fxpmul f8, A5, f8 + fxcsnmsub f12, A5, f8, f12 + fxsmul f12, A6, f12 +#endif + +#ifdef RT + addi BO, BO, 20 * SIZE + addi BO2, BO2, 20 * SIZE + + LFPDUX A1, BO2, INCM4 + LFPDUX A2, BO, INCM4 + + LFPDUX A3, BO2, INCM4 + LFPDUX A4, BO, INCM4 + + add BO2, BO2, INCM4 + LFPDUX A5, BO, INCM4 + + add BO2, BO2, INCM4 + LFPDUX A6, BO, INCM4 + subi BO, BO, 4 * SIZE + subi BO2, BO2, 4 * SIZE + + fxsmul f12, A1, f12 + fxcpnmsub f8, A1, f12, f8 + fxcsnmsub f4, A2, f12, f4 + fxcpnmsub f0, A2, f12, f0 + + fxpmul f8, A3, f8 + fxcsnmsub f4, A4, f8, f4 + fxcpnmsub f0, A4, f8, f0 + + fxsmul f4, A5, f4 + fxcpnmsub f0, A5, f4, f0 + fxpmul f0, A6, f0 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE + subi CO3, CO3, 2 * SIZE + subi CO4, CO4, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC4 + STFPDUX f8, BO2, INC4 + STFPDUX f4, BO, INC4 + STFPDUX f12, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE + + STFDUX f0, CO1, INC + STFDUX f4, CO1, INC + STFSDUX f0, CO2, INC + STFSDUX f4, CO2, INC + + STFDUX f8, CO3, INC + STFDUX f12, CO3, INC + STFSDUX f8, CO4, INC + STFSDUX f12, CO4, INC + 
+#else + STFPDUX f0, AO, INC4 + STFPDUX f4, AO2, INC4 + STFPDUX f8, AO, INC4 + STFPDUX f12, AO2, INC4 + + subi AO, AO, 8 * SIZE + subi AO2, AO2, 8 * SIZE + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f4, CO2, INC + STFSDUX f4, CO2, INC + + STFDUX f8, CO3, INC + STFSDUX f8, CO3, INC + STFDUX f12, CO4, INC + STFSDUX f12, CO4, INC +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE + subi CO3, CO3, 2 * SIZE + subi CO4, CO4, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L40: + andi. I, M, 1 + beq .L49 + +#if defined(LT) || defined(RN) + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, B, - 4 * SIZE + fpmr f2, f0 + addi BO2, B, - 2 * SIZE + fpmr f3, f0 + + srawi. r0, KK, 3 + mtspr CTR, r0 + ble .L44 +#else + +#ifdef LN + slwi r0, K, 0 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 0 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, BO, - 4 * SIZE + fpmr f2, f0 + addi BO2, BO, 2 * SIZE + fpmr f3, f0 + + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L44 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B3, BO, INC4 + LFPDUX B4, BO2, INC4 + + LFPDUX A3, AO, INC4 + LFPDUX A5, BO, INC4 + LFPDUX A6, BO2, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A7, BO, INC4 + LFPDUX A8, BO2, INC4 + bdz- .L43 + .align 4 + +.L42: + fxcpmadd f0, A1, B1, f0 + LFPDUX B1, BO, INC4 + fxcpmadd f1, A1, B2, f1 + LFPDUX B2, BO2, INC4 + fxcsmadd f2, A1, B3, f2 + LFPDUX B3, BO, INC4 + fxcsmadd f3, A1, B4, f3 + LFPDUX B4, BO2, INC4 + LFPDUX A1, AO, INC4 + + fxcpmadd f0, A2, A5, f0 + LFPDUX A5, BO, INC4 + fxcpmadd f1, A2, A6, f1 + LFPDUX A6, BO2, INC4 + fxcsmadd f2, A2, A7, f2 + LFPDUX A7, BO, INC4 + fxcsmadd f3, A2, A8, f3 + LFPDUX A8, BO2, INC4 + LFPDUX A2, AO2, INC4 + + fxcpmadd f0, A3, B1, f0 + LFPDUX B1, BO, INC4 + fxcpmadd f1, A3, B2, f1 + LFPDUX B2, BO2, INC4 + fxcsmadd f2, A3, B3, f2 + LFPDUX B3, BO, INC4 + fxcsmadd f3, A3, B4, f3 + LFPDUX B4, BO2, INC4 + LFPDUX A3, AO, INC4 + + fxcpmadd f0, A4, A5, f0 + LFPDUX A5, BO, INC4 + fxcpmadd f1, A4, A6, f1 + LFPDUX A6, BO2, INC4 + fxcsmadd f2, A4, A7, f2 + LFPDUX A7, BO, INC4 + fxcsmadd f3, A4, A8, f3 + LFPDUX A8, BO2, INC4 + LFPDUX A4, AO2, INC4 + bdnz+ .L42 + .align 4 + +.L43: + fxcpmadd f0, A1, B1, f0 + LFPDUX B1, BO, INC4 + fxcpmadd f1, A1, B2, f1 + LFPDUX B2, BO2, INC4 + fxcsmadd f2, A1, B3, f2 + LFPDUX B3, BO, INC4 + fxcsmadd f3, A1, B4, f3 + LFPDUX B4, BO2, INC4 + + fxcpmadd f0, A2, A5, f0 + LFPDUX A5, BO, INC4 + fxcpmadd f1, A2, A6, f1 + LFPDUX A6, BO2, INC4 + fxcsmadd f2, A2, A7, f2 + LFPDUX A7, BO, INC4 + fxcsmadd f3, A2, A8, f3 + LFPDUX A8, BO2, INC4 + + fxcpmadd f0, A3, B1, f0 + fxcpmadd f1, A3, B2, f1 + fxcsmadd f2, A3, B3, f2 + fxcsmadd f3, A3, B4, f3 + + fxcpmadd f0, A4, A5, f0 + fxcpmadd f1, A4, A6, f1 + fxcsmadd f2, A4, A7, f2 + fxcsmadd f3, A4, A8, f3 + .align 4 + +.L44: +#if defined(LT) || defined(RN) + andi. r0, KK, 7 + mtspr CTR, r0 + ble+ .L48 +#else + andi. 
r0, TEMP, 7 + mtspr CTR, r0 + ble+ .L48 +#endif + + LFDX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC + bdz- .L47 + .align 4 + +.L46: + fxcpmadd f0, A1, B1, f0 + LFPDUX B1, BO, INC4 + fxcpmadd f1, A1, B2, f1 + LFDX A1, AO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC + bdnz+ .L46 + .align 4 + +.L47: + fxcpmadd f0, A1, B1, f0 + fxcpmadd f1, A1, B2, f1 + addi AO2, AO, 2 * SIZE + .align 4 + +.L48: + fpadd f0, f0, f2 + fpadd f1, f1, f3 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi AO2, AO, 2 * SIZE + addi BO, BO, - 4 * SIZE + addi BO2, BO, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDX f16, BO, INC4 + LFPDX f17, BO2, INC4 + + fpsub f0, f16, f0 + fpsub f1, f17, f1 +#else + LFPDX f16, AO, INC4 + LFPDX f17, AO2, INC4 + + fpsub f0, f16, f0 + fpsub f1, f17, f1 +#endif + +#if defined(LN) || defined(LT) + LFPDX A1, AO, INC4 + + fxpmul f0, A1, f0 + fxpmul f1, A1, f1 +#endif + +#ifdef RN + LFD A1, (4 + 0) * SIZE(BO) + LFD A2, (4 + 1) * SIZE(BO) + LFD A3, (4 + 2) * SIZE(BO) + LFD A4, (4 + 3) * SIZE(BO) + + LFD A5, (4 + 5) * SIZE(BO) + LFD A6, (4 + 6) * SIZE(BO) + LFD A7, (4 + 7) * SIZE(BO) + LFD A8, (4 + 10) * SIZE(BO) + + LFD A9, (4 + 11) * SIZE(BO) + LFD A10, (4 + 15) * SIZE(BO) + + fsmtp f2, f0 + fsmtp f3, f1 + + fmul f0, A1, f0 + fnmsub f2, A2, f0, f2 + fnmsub f1, A3, f0, f1 + fnmsub f3, A4, f0, f3 + + fmul f2, A5, f2 + fnmsub f1, A6, f2, f1 + fnmsub f3, A7, f2, f3 + + fmul f1, A8, f1 + fnmsub f3, A9, f1, f3 + + fmul f3, A10, f3 + + fsmfp f0, f2 + fsmfp f1, f3 +#endif + +#ifdef RT + LFD A1, (4 + 15) * SIZE(BO) + LFD A2, (4 + 14) * SIZE(BO) + LFD A3, (4 + 13) * SIZE(BO) + LFD A4, (4 + 12) * SIZE(BO) + + LFD A5, (4 + 10) * SIZE(BO) + LFD A6, (4 + 9) * SIZE(BO) + LFD A7, (4 + 8) * SIZE(BO) + LFD A8, (4 + 5) * SIZE(BO) + + LFD A9, (4 + 4) * SIZE(BO) + LFD A10, (4 + 0) * SIZE(BO) + + fsmtp f2, f0 + fsmtp f3, f1 + + fmul f3, A1, f3 + fnmsub f1, A2, f3, f1 + fnmsub f2, A3, f3, f2 + fnmsub f0, A4, f3, f0 + + fmul f1, A5, f1 + fnmsub f2, A6, f1, f2 + fnmsub f0, A7, f1, f0 + + fmul f2, A8, f2 + fnmsub f0, A9, f2, f0 + + fmul f0, A10, f0 + + fsmfp f0, f2 + fsmfp f1, f3 +#endif + +#if defined(LN) || defined(LT) + STFPDX f0, BO, INC4 + STFPDX f1, BO2, INC4 +#else + STFPDX f0, AO, INC4 + STFPDX f1, AO2, INC4 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE + subi CO3, CO3, 1 * SIZE + subi CO4, CO4, 1 * SIZE +#endif + + STFDUX f0, CO1, INC + STFSDUX f0, CO2, INC + STFDUX f1, CO3, INC + STFSDUX f1, CO4, INC + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE + subi CO3, CO3, 1 * SIZE + subi CO4, CO4, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +.L49: +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + addi B, BO, 4 * SIZE +#endif + +#ifdef RN + addi KK, KK, 4 +#endif + +#ifdef RT + subi KK, KK, 4 +#endif + + addic. 
J, J, -1 + bgt+ .L10 + .align 4 + +.L999: + addi SP, SP, 12 + + lwzu r14, 4(SP) + lwzu r15, 4(SP) + + lwzu r16, 4(SP) + lwzu r17, 4(SP) + lwzu r18, 4(SP) + lwzu r19, 4(SP) + + lwzu r20, 4(SP) + lwzu r21, 4(SP) + lwzu r22, 4(SP) + lwzu r23, 4(SP) + + lwzu r24, 4(SP) + lwzu r25, 4(SP) + lwzu r26, 4(SP) + lwzu r27, 4(SP) + + lwzu r28, 4(SP) + lwzu r29, 4(SP) + lwzu r30, 4(SP) + lwzu r31, 4(SP) + + subi SP, SP, 12 + li r0, 16 + + lfpdux f31, SP, r0 + lfpdux f30, SP, r0 + lfpdux f29, SP, r0 + lfpdux f28, SP, r0 + lfpdux f27, SP, r0 + lfpdux f26, SP, r0 + lfpdux f25, SP, r0 + lfpdux f24, SP, r0 + lfpdux f23, SP, r0 + lfpdux f22, SP, r0 + lfpdux f21, SP, r0 + lfpdux f20, SP, r0 + lfpdux f19, SP, r0 + lfpdux f18, SP, r0 + lfpdux f17, SP, r0 + lfpdux f16, SP, r0 + lfpdux f15, SP, r0 + lfpdux f14, SP, r0 + addi SP, SP, 16 + blr + + + EPILOGUE +#endif diff --git a/kernel/power/trsm_kernel_power6_LN.S b/kernel/power/trsm_kernel_power6_LN.S new file mode 100644 index 0000000000..60ba587809 --- /dev/null +++ b/kernel/power/trsm_kernel_power6_LN.S @@ -0,0 +1,3688 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA 296(SP) +#define FZERO 304(SP) +#else +#define STACKSIZE 240 +#define ALPHA 224(SP) +#define FZERO 232(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#define OFFSET r6 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#define AORIG r18 +#define TEMP r19 +#define KK r20 +#define I r21 +#define J r22 +#define AO r23 +#define BO r24 +#define CO1 r25 +#define CO2 r26 +#define CO3 r27 +#define CO4 r28 + +#define PREA r29 +#define PREC r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) +#endif + + stw r0, FZERO + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif + + slwi LDC, LDC, BASE_SHIFT + +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 112 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 60 + STACKSIZE(SP) +#else + lwz OFFSET, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#ifdef LN + mullw r0, M, K + slwi r0, r0, BASE_SHIFT + add A, A, r0 + + slwi r0, M, BASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, BASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + + li PREA, (16 * 3 * SIZE) + li PREC, -4 * SIZE + + lfs f0, FZERO + + srawi. 
J, N, 2 + ble LL(40) + .align 4 + +LL(10): + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 2 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO4, LDC +#endif + +LL(30): + andi. I, M, 1 + ble LL(20) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(35) + .align 5 + +LL(32): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f1, f17, f24, f1 + FMADD f5, f17, f25, f5 + FMADD f9, f17, f26, f9 + FMADD f13, f17, f27, f13 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMADD f0, f18, f20, f0 + FMADD f4, f18, f21, f4 + FMADD f8, f18, f22, f8 + FMADD f12, f18, f23, f12 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f1, f19, f24, f1 + FMADD f5, f19, f25, f5 + FMADD f9, f19, f26, f9 + FMADD f13, f19, f27, f13 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 16 * SIZE + dcbtst AO, PREA + bdnz LL(32) + + fadd f0, f1, f0 + fadd f4, f5, f4 + fadd f8, f9, f8 + fadd f12, f13, f12 + .align 4 + +LL(35): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(38) + .align 4 + +LL(36): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f16, 1 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(36) + .align 4 + +LL(38): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 +#else + LFD f16, 0 * SIZE(AO) + LFD f20, 1 * SIZE(AO) + LFD f24, 2 * SIZE(AO) + LFD f28, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f4, f20, f4 + FSUB f8, f24, f8 + FSUB f12, f28, f12 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FNMSUB f4, f17, f0, f4 + FNMSUB f8, f18, f0, f8 + FNMSUB f12, f19, f0, f12 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FMUL f4, f16, f4 + FNMSUB f8, f17, f4, f8 + FNMSUB f12, f18, f4, f12 + FMUL f8, f19, f8 + FNMSUB f12, f20, f8, f12 + FMUL f12, f21, f12 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FNMSUB f8, f17, f12, f8 + FNMSUB f4, f18, f12, f4 + FNMSUB f0, f19, f12, f0 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + + FMUL f8, f16, f8 + + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FNMSUB f4, f17, f8, f4 + FNMSUB f0, f18, f8, f0 + + FMUL f4, f19, f4 + FNMSUB f0, f20, f4, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE + subi CO3, CO3, 1 * SIZE + subi CO4, CO4, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f4, 1 * SIZE(AO) + STFD f8, 2 * SIZE(AO) + STFD f12, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f8, 0 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f12, f0 + fmr f13, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE + addi CO2, CO2, 1 * SIZE + addi CO3, CO3, 1 * SIZE + addi CO4, CO4, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +LL(20): + andi. 
I, M, 2 + ble LL(09) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(25) + .align 5 + +LL(22): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f2, f18, f24, f2 + FMADD f3, f19, f24, f3 + FMADD f6, f18, f25, f6 + FMADD f7, f19, f25, f7 + + FMADD f10, f18, f26, f10 + FMADD f11, f19, f26, f11 + FMADD f14, f18, f27, f14 + FMADD f15, f19, f27, f15 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f2, f18, f24, f2 + FMADD f3, f19, f24, f3 + FMADD f6, f18, f25, f6 + FMADD f7, f19, f25, f7 + + FMADD f10, f18, f26, f10 + FMADD f11, f19, f26, f11 + FMADD f14, f18, f27, f14 + FMADD f15, f19, f27, f15 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 16 * SIZE + dcbtst AO, PREA + bdnz LL(22) + + fadd f0, f2, f0 + fadd f1, f3, f1 + fadd f4, f6, f4 + fadd f5, f7, f5 + fadd f8, f10, f8 + fadd f9, f11, f9 + fadd f12, f14, f12 + fadd f13, f15, f13 + .align 4 + +LL(25): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(28) + .align 4 + +LL(26): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(26) + .align 4 + +LL(28): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 + + FSUB f1, f20, f1 + FSUB f5, f21, f5 + FSUB f9, f22, f9 + FSUB f13, f23, f13 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f28, 6 * SIZE(AO) + LFD f29, 7 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f4, f20, f4 + FSUB f5, f21, f5 + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f12, f28, f12 + FSUB f13, f29, f13 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FMUL f9, f19, f9 + FMUL f13, f19, f13 + + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FNMSUB f8, f20, f9, f8 + FNMSUB f12, f20, f13, f12 + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 + + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + FNMSUB f9, f17, f8, f9 + FNMSUB f13, f17, f12, f13 + + LFD f17, 3 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f5, f17, f5 + FMUL f9, f17, f9 + FMUL f13, f17, f13 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f12, f19, f0, f12 + FNMSUB f13, f19, f1, f13 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FMUL f4, f16, f4 + FMUL f5, f16, f5 + FNMSUB f8, f17, f4, f8 + FNMSUB f9, f17, f5, f9 + FNMSUB f12, f18, f4, f12 + FNMSUB f13, f18, f5, f13 + + FMUL f8, f19, f8 + FMUL f9, f19, f9 + FNMSUB f12, f20, f8, f12 + FNMSUB f13, f20, f9, f13 + FMUL f12, f21, f12 + FMUL f13, f21, f13 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FMUL f13, f16, f13 + FNMSUB f8, f17, f12, f8 + FNMSUB f9, f17, f13, f9 + FNMSUB f4, f18, f12, f4 + FNMSUB f5, f18, f13, f5 + FNMSUB f0, f19, f12, f0 + FNMSUB f1, f19, f13, f1 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f8, f16, f8 + FMUL f9, f16, f9 + FNMSUB f4, f17, f8, f4 + FNMSUB f5, f17, f9, f5 + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + + FMUL f4, f19, f4 + 
FMUL f5, f19, f5 + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE + subi CO3, CO3, 2 * SIZE + subi CO4, CO4, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) + + STFD f1, 4 * SIZE(BO) + STFD f5, 5 * SIZE(BO) + STFD f9, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f4, 2 * SIZE(AO) + STFD f5, 3 * SIZE(AO) + + STFD f8, 4 * SIZE(AO) + STFD f9, 5 * SIZE(AO) + STFD f12, 6 * SIZE(AO) + STFD f13, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + addi CO3, CO3, 2 * SIZE + addi CO4, CO4, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +LL(09): + srawi. I, M, 2 + ble LL(39) + .align 4 + +LL(11): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbtst CO1, PREC + dcbtst CO2, PREC + dcbtst CO3, PREC + dcbtst CO4, PREC + + srawi. r0, KK, 3 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + dcbtst CO1, PREC + dcbtst CO2, PREC + dcbtst CO3, PREC + dcbtst CO4, PREC + + srawi. 
r0, TEMP, 3 + mtspr CTR, r0 +#endif + ble LL(15) + .align 4 + +LL(12): + dcbt AO, PREA + dcbtst BO, PREA + + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f24, 4 * SIZE(AO) + LFD f28, 4 * SIZE(BO) + LFD f25, 5 * SIZE(AO) + LFD f29, 5 * SIZE(BO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + LFD f26, 6 * SIZE(AO) + LFD f30, 6 * SIZE(BO) + LFD f27, 7 * SIZE(AO) + LFD f31, 7 * SIZE(BO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + + LFD f16, 8 * SIZE(AO) + LFD f20, 8 * SIZE(BO) + LFD f17, 9 * SIZE(AO) + LFD f21, 9 * SIZE(BO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + + FMADD f2, f26, f28, f2 + FMADD f6, f26, f29, f6 + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + + LFD f18, 10 * SIZE(AO) + LFD f22, 10 * SIZE(BO) + LFD f19, 11 * SIZE(AO) + LFD f23, 11 * SIZE(BO) + + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f24, 12 * SIZE(AO) + LFD f28, 12 * SIZE(BO) + LFD f25, 13 * SIZE(AO) + LFD f29, 13 * SIZE(BO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + LFD f26, 14 * SIZE(AO) + LFD f30, 14 * SIZE(BO) + LFD f27, 15 * SIZE(AO) + LFD f31, 15 * SIZE(BO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + + LFD f16, 16 * SIZE(AO) + LFD f20, 16 * SIZE(BO) + LFD f17, 17 * SIZE(AO) + LFD f21, 17 * SIZE(BO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + + FMADD f2, f26, f28, f2 + FMADD f6, f26, f29, f6 + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + + LFD f18, 18 * SIZE(AO) + LFD f22, 18 * SIZE(BO) + LFD f19, 19 * SIZE(AO) + LFD f23, 19 * SIZE(BO) + + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f24, 20 * SIZE(AO) + LFD f28, 20 * SIZE(BO) + LFD f25, 21 * SIZE(AO) + LFD f29, 21 * SIZE(BO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + LFD f26, 22 * SIZE(AO) + LFD f30, 22 * SIZE(BO) + LFD f27, 23 * SIZE(AO) + LFD f31, 23 * SIZE(BO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + + LFD f16, 24 * SIZE(AO) + LFD f20, 24 * SIZE(BO) + LFD f17, 25 * SIZE(AO) + LFD f21, 25 * SIZE(BO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + + FMADD f2, f26, f28, f2 + 
FMADD f6, f26, f29, f6 + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + + LFD f18, 26 * SIZE(AO) + LFD f22, 26 * SIZE(BO) + LFD f19, 27 * SIZE(AO) + LFD f23, 27 * SIZE(BO) + + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f24, 28 * SIZE(AO) + LFD f28, 28 * SIZE(BO) + LFD f25, 29 * SIZE(AO) + LFD f29, 29 * SIZE(BO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + LFD f26, 30 * SIZE(AO) + LFD f30, 30 * SIZE(BO) + LFD f27, 31 * SIZE(AO) + LFD f31, 31 * SIZE(BO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + + LFD f16, 32 * SIZE(AO) + LFD f20, 32 * SIZE(BO) + LFD f17, 33 * SIZE(AO) + LFD f21, 33 * SIZE(BO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + + FMADD f2, f26, f28, f2 + FMADD f6, f26, f29, f6 + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + + LFD f18, 34 * SIZE(AO) + LFD f22, 34 * SIZE(BO) + LFD f19, 35 * SIZE(AO) + LFD f23, 35 * SIZE(BO) + + addi AO, AO, 32 * SIZE + addi BO, BO, 32 * SIZE + + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + + bdnz LL(12) + .align 4 + +LL(15): +#if defined(LT) || defined(RN) + andi. r0, KK, 7 +#else + andi. r0, TEMP, 7 +#endif + mtspr CTR, r0 + ble+ LL(18) + .align 4 + +LL(16): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(16) + .align 4 + +LL(18): +#if defined(LN) || defined(RT) + subi r0, KK, 4 + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + LFD f24, 8 * SIZE(BO) + LFD f25, 9 * SIZE(BO) + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 + + FSUB f1, f20, f1 + FSUB f5, f21, f5 + FSUB f9, f22, f9 + FSUB f13, f23, f13 + + FSUB f2, f24, f2 + FSUB f6, f25, f6 + FSUB f10, f26, f10 + FSUB f14, f27, f14 + + FSUB f3, f28, f3 + FSUB f7, f29, f7 + FSUB f11, f30, f11 + FSUB f15, f31, f15 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + LFD f24, 8 * 
SIZE(AO) + LFD f25, 9 * SIZE(AO) + LFD f26, 10 * SIZE(AO) + LFD f27, 11 * SIZE(AO) + + LFD f28, 12 * SIZE(AO) + LFD f29, 13 * SIZE(AO) + LFD f30, 14 * SIZE(AO) + LFD f31, 15 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f4, f20, f4 + FSUB f5, f21, f5 + FSUB f6, f22, f6 + FSUB f7, f23, f7 + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f10, f26, f10 + FSUB f11, f27, f11 + + FSUB f12, f28, f12 + FSUB f13, f29, f13 + FSUB f14, f30, f14 + FSUB f15, f31, f15 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FMUL f7, f16, f7 + FMUL f11, f16, f11 + FMUL f15, f16, f15 + + FNMSUB f2, f17, f3, f2 + FNMSUB f6, f17, f7, f6 + FNMSUB f10, f17, f11, f10 + FNMSUB f14, f17, f15, f14 + + FNMSUB f1, f18, f3, f1 + FNMSUB f5, f18, f7, f5 + FNMSUB f9, f18, f11, f9 + FNMSUB f13, f18, f15, f13 + + FNMSUB f0, f19, f3, f0 + FNMSUB f4, f19, f7, f4 + FNMSUB f8, f19, f11, f8 + FNMSUB f12, f19, f15, f12 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + FMUL f2, f16, f2 + FMUL f6, f16, f6 + FMUL f10, f16, f10 + FMUL f14, f16, f14 + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FNMSUB f1, f17, f2, f1 + FNMSUB f5, f17, f6, f5 + FNMSUB f9, f17, f10, f9 + FNMSUB f13, f17, f14, f13 + + FNMSUB f0, f18, f2, f0 + FNMSUB f4, f18, f6, f4 + FNMSUB f8, f18, f10, f8 + FNMSUB f12, f18, f14, f12 + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FMUL f9, f19, f9 + FMUL f13, f19, f13 + + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FNMSUB f8, f20, f9, f8 + FNMSUB f12, f20, f13, f12 + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 + + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + FNMSUB f9, f17, f8, f9 + FNMSUB f13, f17, f12, f13 + + FNMSUB f2, f18, f0, f2 + FNMSUB f6, f18, f4, f6 + FNMSUB f10, f18, f8, f10 + FNMSUB f14, f18, f12, f14 + + FNMSUB f3, f19, f0, f3 + FNMSUB f7, f19, f4, f7 + FNMSUB f11, f19, f8, f11 + FNMSUB f15, f19, f12, f15 + + LFD f16, 5 * SIZE(AO) + LFD f17, 6 * SIZE(AO) + LFD f18, 7 * SIZE(AO) + LFD f19, 10 * SIZE(AO) + + FMUL f1, f16, f1 + FMUL f5, f16, f5 + FMUL f9, f16, f9 + FMUL f13, f16, f13 + + LFD f20, 11 * SIZE(AO) + LFD f21, 15 * SIZE(AO) + + FNMSUB f2, f17, f1, f2 + FNMSUB f6, f17, f5, f6 + FNMSUB f10, f17, f9, f10 + FNMSUB f14, f17, f13, f14 + + FNMSUB f3, f18, f1, f3 + FNMSUB f7, f18, f5, f7 + FNMSUB f11, f18, f9, f11 + FNMSUB f15, f18, f13, f15 + + FMUL f2, f19, f2 + FMUL f6, f19, f6 + FMUL f10, f19, f10 + FMUL f14, f19, f14 + + FNMSUB f3, f20, f2, f3 + FNMSUB f7, f20, f6, f7 + FNMSUB f11, f20, f10, f11 + FNMSUB f15, f20, f14, f15 + + FMUL f3, f21, f3 + FMUL f7, f21, f7 + FMUL f11, f21, f11 + FMUL f15, f21, f15 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 + + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f6, f17, f2, f6 + FNMSUB f7, f17, f3, f7 + + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f10, f18, f2, f10 + FNMSUB f11, f18, f3, f11 + + FNMSUB f12, f19, f0, f12 + FNMSUB f13, f19, f1, f13 + FNMSUB f14, f19, f2, f14 + FNMSUB f15, f19, f3, f15 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * 
SIZE(BO) + LFD f19, 10 * SIZE(BO) + + FMUL f4, f16, f4 + FMUL f5, f16, f5 + FMUL f6, f16, f6 + FMUL f7, f16, f7 + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FNMSUB f8, f17, f4, f8 + FNMSUB f9, f17, f5, f9 + FNMSUB f10, f17, f6, f10 + FNMSUB f11, f17, f7, f11 + + FNMSUB f12, f18, f4, f12 + FNMSUB f13, f18, f5, f13 + FNMSUB f14, f18, f6, f14 + FNMSUB f15, f18, f7, f15 + + FMUL f8, f19, f8 + FMUL f9, f19, f9 + FMUL f10, f19, f10 + FMUL f11, f19, f11 + + FNMSUB f12, f20, f8, f12 + FNMSUB f13, f20, f9, f13 + FNMSUB f14, f20, f10, f14 + FNMSUB f15, f20, f11, f15 + + FMUL f12, f21, f12 + FMUL f13, f21, f13 + FMUL f14, f21, f14 + FMUL f15, f21, f15 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FMUL f13, f16, f13 + FMUL f14, f16, f14 + FMUL f15, f16, f15 + + FNMSUB f8, f17, f12, f8 + FNMSUB f9, f17, f13, f9 + FNMSUB f10, f17, f14, f10 + FNMSUB f11, f17, f15, f11 + + FNMSUB f4, f18, f12, f4 + FNMSUB f5, f18, f13, f5 + FNMSUB f6, f18, f14, f6 + FNMSUB f7, f18, f15, f7 + + FNMSUB f0, f19, f12, f0 + FNMSUB f1, f19, f13, f1 + FNMSUB f2, f19, f14, f2 + FNMSUB f3, f19, f15, f3 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + + FMUL f8, f16, f8 + FMUL f9, f16, f9 + FMUL f10, f16, f10 + FMUL f11, f16, f11 + + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FNMSUB f4, f17, f8, f4 + FNMSUB f5, f17, f9, f5 + FNMSUB f6, f17, f10, f6 + FNMSUB f7, f17, f11, f7 + + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + FNMSUB f2, f18, f10, f2 + FNMSUB f3, f18, f11, f3 + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FMUL f6, f19, f6 + FMUL f7, f19, f7 + + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + FNMSUB f2, f20, f6, f2 + FNMSUB f3, f20, f7, f3 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE + subi CO3, CO3, 4 * SIZE + subi CO4, CO4, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) + + STFD f1, 4 * SIZE(BO) + STFD f5, 5 * SIZE(BO) + STFD f9, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) + + STFD f2, 8 * SIZE(BO) + STFD f6, 9 * SIZE(BO) + STFD f10, 10 * SIZE(BO) + STFD f14, 11 * SIZE(BO) + + STFD f3, 12 * SIZE(BO) + STFD f7, 13 * SIZE(BO) + STFD f11, 14 * SIZE(BO) + STFD f15, 15 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f4, 4 * SIZE(AO) + STFD f5, 5 * SIZE(AO) + STFD f6, 6 * SIZE(AO) + STFD f7, 7 * SIZE(AO) + + STFD f8, 8 * SIZE(AO) + STFD f9, 9 * SIZE(AO) + STFD f10, 10 * SIZE(AO) + STFD f11, 11 * SIZE(AO) + + STFD f12, 12 * SIZE(AO) + STFD f13, 13 * SIZE(AO) + STFD f14, 14 * SIZE(AO) + STFD f15, 15 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f10, 2 * SIZE(CO3) + STFD f11, 3 * SIZE(CO3) + + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + STFD f14, 2 * SIZE(CO4) + STFD f15, 3 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE + 
addi CO2, CO2, 4 * SIZE + addi CO3, CO3, 4 * SIZE + addi CO4, CO4, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ LL(11) + .align 4 + + +LL(39): +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 4 +#endif + +#ifdef RT + subi KK, KK, 4 +#endif + + addic. J, J, -1 + lfs f0, FZERO + bgt LL(10) + .align 4 + +LL(40): + andi. J, N, 2 + ble LL(70) + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO2, LDC +#endif + +LL(60): + andi. I, M, 1 + ble LL(50) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(65) + .align 5 + +LL(62): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f17, f22, f2 + FMADD f3, f17, f23, f3 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f18, f24, f0 + FMADD f1, f18, f25, f1 + FMADD f2, f19, f26, f2 + FMADD f3, f19, f27, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(62) + .align 4 + +LL(65): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(68) + .align 4 + +LL(66): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + + LFD f16, 1 * SIZE(AO) + + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(66) + .align 4 + +LL(68): + FADD f0, f2, f0 + FADD f1, f3, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f20, 1 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + FMUL f1, f18, f1 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 0 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE + addi CO2, CO2, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +LL(50): + andi. I, M, 2 + ble LL(41) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(55) + .align 5 + +LL(52): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f16, f21, f2 + FMADD f3, f17, f21, f3 + + FMADD f4, f18, f22, f4 + FMADD f5, f19, f22, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f16, f24, f0 + FMADD f1, f17, f24, f1 + FMADD f2, f16, f25, f2 + FMADD f3, f17, f25, f3 + + FMADD f4, f18, f26, f4 + FMADD f5, f19, f26, f5 + FMADD f6, f18, f27, f6 + FMADD f7, f19, f27, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + dcbtst AO, PREA + bdnz LL(52) + .align 4 + +LL(55): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(58) + .align 4 + +LL(56): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f16, f21, f2 + FMADD f3, f17, f21, f3 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(56) + .align 4 + +LL(58): + FADD f0, f4, f0 + FADD f1, f5, f1 + FADD f2, f6, f2 + FADD f3, f7, f3 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f2, f17, f2 + FSUB f1, f20, f1 + FSUB f3, f21, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f20, f2 + FSUB f3, f21, f3 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FMUL f3, f19, f3 + + FNMSUB f0, f20, f1, f0 + FNMSUB f2, f20, f3, f2 + + FMUL f0, f21, f0 + FMUL f2, f21, f2 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f2, f16, f2 + FNMSUB f1, f17, f0, f1 + FNMSUB f3, f17, f2, f3 + + LFD f17, 3 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f3, f17, f3 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + + FNMSUB f2, f17, f0, f2 + FNMSUB f3, f17, f1, f3 + FMUL f2, f18, f2 + FMUL f3, f18, f3 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f2, f19, f2 + FMUL f3, f19, f3 + FNMSUB f0, f20, f2, f0 + FNMSUB f1, f20, f3, f1 + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f2, 1 * SIZE(BO) + STFD f1, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * 
SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +LL(41): + srawi. I, M, 2 + ble LL(69) + .align 4 + +LL(42): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbt CO1, PREC + dcbt CO2, PREC + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + dcbt CO1, PREC + dcbt CO2, PREC + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(45) + .align 5 + +LL(43): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + FMADD f4, f16, f23, f4 + FMADD f5, f17, f23, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 12 * SIZE(AO) + LFD f17, 13 * SIZE(AO) + LFD f18, 14 * SIZE(AO) + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + FMADD f4, f16, f23, f4 + FMADD f5, f17, f23, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 8 * SIZE + dcbtst AO, PREA + bdnz LL(43) + .align 4 + +LL(45): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(48) + .align 4 + +LL(46): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(46) + .align 4 + +LL(48): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f28, 6 * SIZE(BO) + LFD f29, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f1, f20, f1 + FSUB f5, f21, f5 + + FSUB f2, f24, f2 + FSUB f6, f25, f6 + FSUB f3, f28, f3 + FSUB f7, f29, f7 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f4, f20, f4 + FSUB f5, f21, f5 + FSUB f6, f22, f6 + FSUB f7, f23, f7 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FMUL f7, f16, f7 + FNMSUB f2, f17, f3, f2 + FNMSUB f6, f17, f7, f6 + FNMSUB f1, f18, f3, f1 + FNMSUB f5, f18, f7, f5 + FNMSUB f0, f19, f3, f0 + FNMSUB f4, f19, f7, f4 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f2, f16, f2 + FMUL f6, f16, f6 + FNMSUB f1, f17, f2, f1 + FNMSUB f5, f17, f6, f5 + FNMSUB f0, f18, f2, f0 + FNMSUB f4, f18, f6, f4 + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FMUL f0, f21, f0 + FMUL f4, f21, f4 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + + FNMSUB f2, f18, f0, f2 + FNMSUB f6, f18, f4, f6 + FNMSUB f3, f19, f0, f3 + FNMSUB f7, f19, f4, f7 + + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f5, f17, f5 + + FNMSUB f2, f18, f1, f2 + FNMSUB f6, f18, f5, f6 + + FNMSUB f3, f19, f1, f3 + FNMSUB f7, f19, f5, f7 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMUL f2, f18, f2 + FMUL f6, f18, f6 + + FNMSUB f3, f19, f2, f3 + FNMSUB f7, f19, f6, f7 + + LFD f19, 15 * SIZE(AO) + + FMUL f3, f19, f3 + FMUL f7, f19, f7 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 + + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f6, f17, f2, f6 + FNMSUB f7, f17, f3, f7 + + FMUL f4, f18, f4 + FMUL f5, f18, f5 + FMUL f6, f18, f6 + FMUL f7, f18, f7 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FMUL f6, f19, f6 + FMUL f7, f19, f7 + + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + FNMSUB f2, f20, f6, f2 + FNMSUB f3, f20, f7, f3 + + FMUL f0, 
f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f1, 2 * SIZE(BO) + STFD f5, 3 * SIZE(BO) + + STFD f2, 4 * SIZE(BO) + STFD f6, 5 * SIZE(BO) + STFD f3, 6 * SIZE(BO) + STFD f7, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f4, 4 * SIZE(AO) + STFD f5, 5 * SIZE(AO) + STFD f6, 6 * SIZE(AO) + STFD f7, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ LL(42) + .align 4 + +LL(69): +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + lfs f0, FZERO + .align 4 + +LL(70): + andi. J, N, 1 + ble LL(999) + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO1, LDC +#endif + .align 4 + +LL(90): + andi. I, M, 1 + ble LL(80) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, KK, 3 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 3 + mtspr CTR, r0 +#endif + ble LL(95) + .align 5 + +LL(92): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(92) + .align 4 + +LL(95): +#if defined(LT) || defined(RN) + andi. r0, KK, 7 +#else + andi. 
r0, TEMP, 7 +#endif + mtspr CTR, r0 + ble+ LL(98) + .align 4 + +LL(96): + FMADD f0, f16, f20, f0 + LFD f16, 1 * SIZE(AO) + LFD f20, 1 * SIZE(BO) + addi BO, BO, 1 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(96) + .align 4 + +LL(98): + FADD f0, f1, f0 + FADD f2, f3, f2 + FADD f0, f2, f0 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + FSUB f0, f16, f0 +#else + LFD f16, 0 * SIZE(AO) + FSUB f0, f16, f0 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + FMUL f0, f16, f0 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + FMUL f0, f16, f0 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +LL(80): + andi. I, M, 2 + ble LL(71) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(85) + .align 5 + +LL(82): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f21, f2 + FMADD f3, f19, f21, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f23, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 4 * SIZE + dcbtst AO, PREA + bdnz LL(82) + .align 4 + +LL(85): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(88) + .align 4 + +LL(86): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 1 * SIZE(BO) + + addi BO, BO, 1 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(86) + .align 4 + +LL(88): + FADD f0, f2, f0 + FADD f1, f3, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f20, 1 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + + LFD f17, 3 * SIZE(AO) + FMUL f1, f17, f1 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +LL(71): + srawi. I, M, 2 + ble LL(999) + .align 4 + +LL(72): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbt CO1, PREC + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + dcbt CO1, PREC + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(75) + .align 5 + +LL(73): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f21, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f21, f2 + FMADD f3, f19, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + LFD f16, 12 * SIZE(AO) + LFD f17, 13 * SIZE(AO) + LFD f18, 14 * SIZE(AO) + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f23, f0 + FMADD f1, f17, f23, f1 + FMADD f2, f18, f23, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 4 * SIZE + dcbtst AO, PREA + bdnz LL(73) + .align 4 + +LL(75): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(78) + .align 4 + +LL(76): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 1 * SIZE(BO) + + addi BO, BO, 1 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(76) + .align 4 + +LL(78): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f20, 1 * SIZE(BO) + LFD f24, 2 * SIZE(BO) + LFD f28, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 + FSUB f2, f24, f2 + FSUB f3, f28, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FNMSUB f2, f17, f3, f2 + FNMSUB f1, f18, f3, f1 + FNMSUB f0, f19, f3, f0 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f2, f16, f2 + FNMSUB f1, f17, f2, f1 + FNMSUB f0, f18, f2, f0 + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f19, f0, f3 + + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMUL f1, f17, f1 + FNMSUB f2, f18, f1, f2 + FNMSUB f3, f19, f1, f3 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMUL f2, f18, f2 + FNMSUB f3, f19, f2, f3 + + LFD f19, 15 * SIZE(AO) + + FMUL f3, f19, f3 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) 
+#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ LL(72) + .align 4 + +LL(999): + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/trsm_kernel_power6_LT.S b/kernel/power/trsm_kernel_power6_LT.S new file mode 100644 index 0000000000..448b16369d --- /dev/null +++ b/kernel/power/trsm_kernel_power6_LT.S @@ -0,0 +1,3676 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA 296(SP) +#define FZERO 304(SP) +#else +#define STACKSIZE 240 +#define ALPHA 224(SP) +#define FZERO 232(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#define OFFSET r6 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#define AORIG r18 +#define TEMP r19 +#define BB r20 +#define KK r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO1 r26 +#define CO2 r27 +#define CO3 r28 +#define CO4 r29 + +#define PREA r30 +#define PREC r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) +#endif + + stw r0, FZERO + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif + + slwi LDC, LDC, BASE_SHIFT + +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 112 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 60 + STACKSIZE(SP) +#else + lwz OFFSET, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#ifdef LN + mullw r0, M, K + slwi r0, r0, BASE_SHIFT + add A, A, r0 + + slwi r0, M, BASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, BASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + + li PREA, (16 * 3 * SIZE) + li PREC, 4 * SIZE + lfs f0, FZERO + + srawi. 
J, N, 2 + ble LL(40) + .align 4 + +LL(10): + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 2 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + srawi. I, M, 2 + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO4, LDC +#endif + ble LL(20) + .align 4 + +LL(11): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbtst CO1, PREC + dcbtst CO2, PREC + dcbtst CO3, PREC + dcbtst CO4, PREC + + srawi. r0, KK, 3 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + dcbtst CO1, PREC + dcbtst CO2, PREC + dcbtst CO3, PREC + dcbtst CO4, PREC + + srawi. r0, TEMP, 3 + mtspr CTR, r0 +#endif + ble LL(15) + .align 4 + +LL(12): + dcbt AO, PREA + dcbtst BO, PREA + + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f24, 4 * SIZE(AO) + LFD f28, 4 * SIZE(BO) + LFD f25, 5 * SIZE(AO) + LFD f29, 5 * SIZE(BO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + LFD f26, 6 * SIZE(AO) + LFD f30, 6 * SIZE(BO) + LFD f27, 7 * SIZE(AO) + LFD f31, 7 * SIZE(BO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + + LFD f16, 8 * SIZE(AO) + LFD f20, 8 * SIZE(BO) + LFD f17, 9 * SIZE(AO) + LFD f21, 9 * SIZE(BO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + + FMADD f2, f26, f28, f2 + FMADD f6, f26, f29, f6 + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + + LFD f18, 10 * SIZE(AO) + LFD f22, 10 * SIZE(BO) + LFD f19, 11 * SIZE(AO) + LFD f23, 11 * SIZE(BO) + + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f24, 12 * SIZE(AO) + LFD f28, 12 * SIZE(BO) + LFD f25, 13 * SIZE(AO) + LFD f29, 13 * SIZE(BO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + LFD f26, 14 * SIZE(AO) + LFD f30, 14 * SIZE(BO) + LFD f27, 15 * SIZE(AO) + LFD f31, 15 * SIZE(BO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + + LFD f16, 16 
* SIZE(AO) + LFD f20, 16 * SIZE(BO) + LFD f17, 17 * SIZE(AO) + LFD f21, 17 * SIZE(BO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + + FMADD f2, f26, f28, f2 + FMADD f6, f26, f29, f6 + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + + LFD f18, 18 * SIZE(AO) + LFD f22, 18 * SIZE(BO) + LFD f19, 19 * SIZE(AO) + LFD f23, 19 * SIZE(BO) + + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f24, 20 * SIZE(AO) + LFD f28, 20 * SIZE(BO) + LFD f25, 21 * SIZE(AO) + LFD f29, 21 * SIZE(BO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + LFD f26, 22 * SIZE(AO) + LFD f30, 22 * SIZE(BO) + LFD f27, 23 * SIZE(AO) + LFD f31, 23 * SIZE(BO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + + LFD f16, 24 * SIZE(AO) + LFD f20, 24 * SIZE(BO) + LFD f17, 25 * SIZE(AO) + LFD f21, 25 * SIZE(BO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + + FMADD f2, f26, f28, f2 + FMADD f6, f26, f29, f6 + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + + LFD f18, 26 * SIZE(AO) + LFD f22, 26 * SIZE(BO) + LFD f19, 27 * SIZE(AO) + LFD f23, 27 * SIZE(BO) + + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f24, 28 * SIZE(AO) + LFD f28, 28 * SIZE(BO) + LFD f25, 29 * SIZE(AO) + LFD f29, 29 * SIZE(BO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + LFD f26, 30 * SIZE(AO) + LFD f30, 30 * SIZE(BO) + LFD f27, 31 * SIZE(AO) + LFD f31, 31 * SIZE(BO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + + LFD f16, 32 * SIZE(AO) + LFD f20, 32 * SIZE(BO) + LFD f17, 33 * SIZE(AO) + LFD f21, 33 * SIZE(BO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + + FMADD f2, f26, f28, f2 + FMADD f6, f26, f29, f6 + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + + LFD f18, 34 * SIZE(AO) + LFD f22, 34 * SIZE(BO) + LFD f19, 35 * SIZE(AO) + LFD f23, 35 * SIZE(BO) + + addi AO, AO, 32 * SIZE + addi BO, BO, 32 * SIZE + + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + + bdnz LL(12) + .align 4 + +LL(15): +#if defined(LT) || defined(RN) + andi. r0, KK, 7 +#else + andi. 
r0, TEMP, 7 +#endif + mtspr CTR, r0 + ble+ LL(18) + .align 4 + +LL(16): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(16) + .align 4 + +LL(18): +#if defined(LN) || defined(RT) + subi r0, KK, 4 + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + LFD f24, 8 * SIZE(BO) + LFD f25, 9 * SIZE(BO) + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 + + FSUB f1, f20, f1 + FSUB f5, f21, f5 + FSUB f9, f22, f9 + FSUB f13, f23, f13 + + FSUB f2, f24, f2 + FSUB f6, f25, f6 + FSUB f10, f26, f10 + FSUB f14, f27, f14 + + FSUB f3, f28, f3 + FSUB f7, f29, f7 + FSUB f11, f30, f11 + FSUB f15, f31, f15 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + LFD f24, 8 * SIZE(AO) + LFD f25, 9 * SIZE(AO) + LFD f26, 10 * SIZE(AO) + LFD f27, 11 * SIZE(AO) + + LFD f28, 12 * SIZE(AO) + LFD f29, 13 * SIZE(AO) + LFD f30, 14 * SIZE(AO) + LFD f31, 15 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f4, f20, f4 + FSUB f5, f21, f5 + FSUB f6, f22, f6 + FSUB f7, f23, f7 + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f10, f26, f10 + FSUB f11, f27, f11 + + FSUB f12, f28, f12 + FSUB f13, f29, f13 + FSUB f14, f30, f14 + FSUB f15, f31, f15 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FMUL f7, f16, f7 + FMUL f11, f16, f11 + FMUL f15, f16, f15 + + FNMSUB f2, f17, f3, f2 + FNMSUB f6, f17, f7, f6 + FNMSUB f10, f17, f11, f10 + FNMSUB f14, f17, f15, f14 + + FNMSUB f1, f18, f3, f1 + FNMSUB f5, f18, f7, f5 + FNMSUB f9, f18, f11, f9 + FNMSUB f13, f18, f15, f13 + + FNMSUB f0, f19, f3, f0 + FNMSUB f4, f19, f7, f4 + FNMSUB f8, f19, f11, f8 + FNMSUB f12, f19, f15, f12 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + FMUL f2, f16, f2 + FMUL f6, f16, f6 + FMUL f10, f16, f10 + FMUL f14, f16, f14 + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FNMSUB f1, f17, f2, f1 + FNMSUB f5, f17, f6, f5 + FNMSUB f9, f17, f10, f9 + FNMSUB f13, f17, f14, f13 + + FNMSUB f0, f18, f2, f0 + FNMSUB f4, f18, f6, f4 + FNMSUB f8, f18, f10, f8 + FNMSUB f12, f18, f14, f12 + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FMUL f9, f19, f9 + FMUL f13, f19, f13 + + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FNMSUB f8, f20, f9, f8 + FNMSUB f12, f20, f13, f12 + + FMUL f0, f21, f0 + FMUL 
f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 + + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + FNMSUB f9, f17, f8, f9 + FNMSUB f13, f17, f12, f13 + + FNMSUB f2, f18, f0, f2 + FNMSUB f6, f18, f4, f6 + FNMSUB f10, f18, f8, f10 + FNMSUB f14, f18, f12, f14 + + FNMSUB f3, f19, f0, f3 + FNMSUB f7, f19, f4, f7 + FNMSUB f11, f19, f8, f11 + FNMSUB f15, f19, f12, f15 + + LFD f16, 5 * SIZE(AO) + LFD f17, 6 * SIZE(AO) + LFD f18, 7 * SIZE(AO) + LFD f19, 10 * SIZE(AO) + + FMUL f1, f16, f1 + FMUL f5, f16, f5 + FMUL f9, f16, f9 + FMUL f13, f16, f13 + + LFD f20, 11 * SIZE(AO) + LFD f21, 15 * SIZE(AO) + + FNMSUB f2, f17, f1, f2 + FNMSUB f6, f17, f5, f6 + FNMSUB f10, f17, f9, f10 + FNMSUB f14, f17, f13, f14 + + FNMSUB f3, f18, f1, f3 + FNMSUB f7, f18, f5, f7 + FNMSUB f11, f18, f9, f11 + FNMSUB f15, f18, f13, f15 + + FMUL f2, f19, f2 + FMUL f6, f19, f6 + FMUL f10, f19, f10 + FMUL f14, f19, f14 + + FNMSUB f3, f20, f2, f3 + FNMSUB f7, f20, f6, f7 + FNMSUB f11, f20, f10, f11 + FNMSUB f15, f20, f14, f15 + + FMUL f3, f21, f3 + FMUL f7, f21, f7 + FMUL f11, f21, f11 + FMUL f15, f21, f15 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 + + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f6, f17, f2, f6 + FNMSUB f7, f17, f3, f7 + + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f10, f18, f2, f10 + FNMSUB f11, f18, f3, f11 + + FNMSUB f12, f19, f0, f12 + FNMSUB f13, f19, f1, f13 + FNMSUB f14, f19, f2, f14 + FNMSUB f15, f19, f3, f15 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + FMUL f4, f16, f4 + FMUL f5, f16, f5 + FMUL f6, f16, f6 + FMUL f7, f16, f7 + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FNMSUB f8, f17, f4, f8 + FNMSUB f9, f17, f5, f9 + FNMSUB f10, f17, f6, f10 + FNMSUB f11, f17, f7, f11 + + FNMSUB f12, f18, f4, f12 + FNMSUB f13, f18, f5, f13 + FNMSUB f14, f18, f6, f14 + FNMSUB f15, f18, f7, f15 + + FMUL f8, f19, f8 + FMUL f9, f19, f9 + FMUL f10, f19, f10 + FMUL f11, f19, f11 + + FNMSUB f12, f20, f8, f12 + FNMSUB f13, f20, f9, f13 + FNMSUB f14, f20, f10, f14 + FNMSUB f15, f20, f11, f15 + + FMUL f12, f21, f12 + FMUL f13, f21, f13 + FMUL f14, f21, f14 + FMUL f15, f21, f15 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FMUL f13, f16, f13 + FMUL f14, f16, f14 + FMUL f15, f16, f15 + + FNMSUB f8, f17, f12, f8 + FNMSUB f9, f17, f13, f9 + FNMSUB f10, f17, f14, f10 + FNMSUB f11, f17, f15, f11 + + FNMSUB f4, f18, f12, f4 + FNMSUB f5, f18, f13, f5 + FNMSUB f6, f18, f14, f6 + FNMSUB f7, f18, f15, f7 + + FNMSUB f0, f19, f12, f0 + FNMSUB f1, f19, f13, f1 + FNMSUB f2, f19, f14, f2 + FNMSUB f3, f19, f15, f3 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + + FMUL f8, f16, f8 + FMUL f9, f16, f9 + FMUL f10, f16, f10 + FMUL f11, f16, f11 + + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FNMSUB f4, f17, f8, f4 + FNMSUB f5, f17, f9, f5 + FNMSUB f6, f17, f10, f6 + FNMSUB f7, f17, f11, f7 + + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + FNMSUB f2, f18, f10, f2 + FNMSUB f3, f18, f11, f3 + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FMUL f6, f19, f6 + 
FMUL f7, f19, f7 + + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + FNMSUB f2, f20, f6, f2 + FNMSUB f3, f20, f7, f3 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE + subi CO3, CO3, 4 * SIZE + subi CO4, CO4, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) + + STFD f1, 4 * SIZE(BO) + STFD f5, 5 * SIZE(BO) + STFD f9, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) + + STFD f2, 8 * SIZE(BO) + STFD f6, 9 * SIZE(BO) + STFD f10, 10 * SIZE(BO) + STFD f14, 11 * SIZE(BO) + + STFD f3, 12 * SIZE(BO) + STFD f7, 13 * SIZE(BO) + STFD f11, 14 * SIZE(BO) + STFD f15, 15 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f4, 4 * SIZE(AO) + STFD f5, 5 * SIZE(AO) + STFD f6, 6 * SIZE(AO) + STFD f7, 7 * SIZE(AO) + + STFD f8, 8 * SIZE(AO) + STFD f9, 9 * SIZE(AO) + STFD f10, 10 * SIZE(AO) + STFD f11, 11 * SIZE(AO) + + STFD f12, 12 * SIZE(AO) + STFD f13, 13 * SIZE(AO) + STFD f14, 14 * SIZE(AO) + STFD f15, 15 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f10, 2 * SIZE(CO3) + STFD f11, 3 * SIZE(CO3) + + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + STFD f14, 2 * SIZE(CO4) + STFD f15, 3 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + addi CO3, CO3, 4 * SIZE + addi CO4, CO4, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ LL(11) + .align 4 + +LL(20): + andi. I, M, 2 + ble LL(30) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(25) + .align 5 + +LL(22): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f2, f18, f24, f2 + FMADD f3, f19, f24, f3 + FMADD f6, f18, f25, f6 + FMADD f7, f19, f25, f7 + + FMADD f10, f18, f26, f10 + FMADD f11, f19, f26, f11 + FMADD f14, f18, f27, f14 + FMADD f15, f19, f27, f15 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f2, f18, f24, f2 + FMADD f3, f19, f24, f3 + FMADD f6, f18, f25, f6 + FMADD f7, f19, f25, f7 + + FMADD f10, f18, f26, f10 + FMADD f11, f19, f26, f11 + FMADD f14, f18, f27, f14 + FMADD f15, f19, f27, f15 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 16 * SIZE + dcbtst AO, PREA + bdnz LL(22) + + fadd f0, f2, f0 + fadd f1, f3, f1 + fadd f4, f6, f4 + fadd f5, f7, f5 + fadd f8, f10, f8 + fadd f9, f11, f9 + fadd f12, f14, f12 + fadd f13, f15, f13 + .align 4 + +LL(25): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(28) + .align 4 + +LL(26): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(26) + .align 4 + +LL(28): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 + + FSUB f1, f20, f1 + FSUB f5, f21, f5 + FSUB f9, f22, f9 + FSUB f13, f23, f13 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f28, 6 * SIZE(AO) + LFD f29, 7 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f4, f20, f4 + FSUB f5, f21, f5 + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f12, f28, f12 + FSUB f13, f29, f13 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FMUL f9, f19, f9 + FMUL f13, f19, f13 + + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FNMSUB f8, f20, f9, f8 + FNMSUB f12, f20, f13, f12 + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 + + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + FNMSUB f9, f17, f8, f9 + FNMSUB f13, f17, f12, f13 + + LFD f17, 3 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f5, f17, f5 + FMUL f9, f17, f9 + FMUL f13, f17, f13 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f12, f19, f0, f12 + FNMSUB f13, f19, f1, f13 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FMUL f4, f16, f4 + FMUL f5, f16, f5 + FNMSUB f8, f17, f4, f8 + FNMSUB f9, f17, f5, f9 + FNMSUB f12, f18, f4, f12 + FNMSUB f13, f18, f5, f13 + + FMUL f8, f19, f8 + FMUL f9, f19, f9 + FNMSUB f12, f20, f8, f12 + FNMSUB f13, f20, f9, f13 + FMUL f12, f21, f12 + FMUL f13, f21, f13 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FMUL f13, f16, f13 + FNMSUB f8, f17, f12, f8 + FNMSUB f9, f17, f13, f9 + FNMSUB f4, f18, f12, f4 + FNMSUB f5, f18, f13, f5 + FNMSUB f0, f19, f12, f0 + FNMSUB f1, f19, f13, f1 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f8, f16, f8 + FMUL f9, f16, f9 + FNMSUB f4, f17, f8, f4 + FNMSUB f5, f17, f9, f5 + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + + FMUL f4, f19, f4 + 
FMUL f5, f19, f5 + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE + subi CO3, CO3, 2 * SIZE + subi CO4, CO4, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) + + STFD f1, 4 * SIZE(BO) + STFD f5, 5 * SIZE(BO) + STFD f9, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f4, 2 * SIZE(AO) + STFD f5, 3 * SIZE(AO) + + STFD f8, 4 * SIZE(AO) + STFD f9, 5 * SIZE(AO) + STFD f12, 6 * SIZE(AO) + STFD f13, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + addi CO3, CO3, 2 * SIZE + addi CO4, CO4, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +LL(30): + andi. I, M, 1 + ble LL(39) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(35) + .align 5 + +LL(32): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f1, f17, f24, f1 + FMADD f5, f17, f25, f5 + FMADD f9, f17, f26, f9 + FMADD f13, f17, f27, f13 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMADD f0, f18, f20, f0 + FMADD f4, f18, f21, f4 + FMADD f8, f18, f22, f8 + FMADD f12, f18, f23, f12 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f1, f19, f24, f1 + FMADD f5, f19, f25, f5 + FMADD f9, f19, f26, f9 + FMADD f13, f19, f27, f13 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 16 * SIZE + dcbtst AO, PREA + bdnz LL(32) + + fadd f0, f1, f0 + fadd f4, f5, f4 + fadd f8, f9, f8 + fadd f12, f13, f12 + .align 4 + +LL(35): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(38) + .align 4 + +LL(36): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f16, 1 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(36) + .align 4 + +LL(38): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 +#else + LFD f16, 0 * SIZE(AO) + LFD f20, 1 * SIZE(AO) + LFD f24, 2 * SIZE(AO) + LFD f28, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f4, f20, f4 + FSUB f8, f24, f8 + FSUB f12, f28, f12 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FNMSUB f4, f17, f0, f4 + FNMSUB f8, f18, f0, f8 + FNMSUB f12, f19, f0, f12 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FMUL f4, f16, f4 + FNMSUB f8, f17, f4, f8 + FNMSUB f12, f18, f4, f12 + FMUL f8, f19, f8 + FNMSUB f12, f20, f8, f12 + FMUL f12, f21, f12 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FNMSUB f8, f17, f12, f8 + FNMSUB f4, f18, f12, f4 + FNMSUB f0, f19, f12, f0 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + + FMUL f8, f16, f8 + + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FNMSUB f4, f17, f8, f4 + FNMSUB f0, f18, f8, f0 + + FMUL f4, f19, f4 + FNMSUB f0, f20, f4, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE + 
subi CO3, CO3, 1 * SIZE + subi CO4, CO4, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f4, 1 * SIZE(AO) + STFD f8, 2 * SIZE(AO) + STFD f12, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f8, 0 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f12, f0 + fmr f13, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE + addi CO2, CO2, 1 * SIZE + addi CO3, CO3, 1 * SIZE + addi CO4, CO4, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +LL(39): +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 4 +#endif + +#ifdef RT + subi KK, KK, 4 +#endif + + addic. J, J, -1 + lfs f0, FZERO + bgt LL(10) + .align 4 + +LL(40): + andi. J, N, 2 + ble LL(70) + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. I, M, 2 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO2, LDC +#endif + ble LL(50) + .align 4 + +LL(41): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbt CO1, PREC + dcbt CO2, PREC + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + dcbt CO1, PREC + dcbt CO2, PREC + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(45) + .align 5 + +LL(42): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + FMADD f4, f16, f23, f4 + FMADD f5, f17, f23, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 12 * SIZE(AO) + LFD f17, 13 * SIZE(AO) + LFD f18, 14 * SIZE(AO) + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + FMADD f4, f16, f23, f4 + FMADD f5, f17, f23, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 8 * SIZE + dcbtst AO, PREA + bdnz LL(42) + .align 4 + +LL(45): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(48) + .align 4 + +LL(46): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(46) + .align 4 + +LL(48): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f28, 6 * SIZE(BO) + LFD f29, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f1, f20, f1 + FSUB f5, f21, f5 + + FSUB f2, f24, f2 + FSUB f6, f25, f6 + FSUB f3, f28, f3 + FSUB f7, f29, f7 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f4, f20, f4 + FSUB f5, f21, f5 + FSUB f6, f22, f6 + FSUB f7, f23, f7 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FMUL f7, f16, f7 + FNMSUB f2, f17, f3, f2 + FNMSUB f6, f17, f7, f6 + FNMSUB f1, f18, f3, f1 + FNMSUB f5, f18, f7, f5 + FNMSUB f0, f19, f3, f0 + FNMSUB f4, f19, f7, f4 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * 
SIZE(AO) + + FMUL f2, f16, f2 + FMUL f6, f16, f6 + FNMSUB f1, f17, f2, f1 + FNMSUB f5, f17, f6, f5 + FNMSUB f0, f18, f2, f0 + FNMSUB f4, f18, f6, f4 + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FMUL f0, f21, f0 + FMUL f4, f21, f4 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + + FNMSUB f2, f18, f0, f2 + FNMSUB f6, f18, f4, f6 + FNMSUB f3, f19, f0, f3 + FNMSUB f7, f19, f4, f7 + + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f5, f17, f5 + + FNMSUB f2, f18, f1, f2 + FNMSUB f6, f18, f5, f6 + + FNMSUB f3, f19, f1, f3 + FNMSUB f7, f19, f5, f7 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMUL f2, f18, f2 + FMUL f6, f18, f6 + + FNMSUB f3, f19, f2, f3 + FNMSUB f7, f19, f6, f7 + + LFD f19, 15 * SIZE(AO) + + FMUL f3, f19, f3 + FMUL f7, f19, f7 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 + + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f6, f17, f2, f6 + FNMSUB f7, f17, f3, f7 + + FMUL f4, f18, f4 + FMUL f5, f18, f5 + FMUL f6, f18, f6 + FMUL f7, f18, f7 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FMUL f6, f19, f6 + FMUL f7, f19, f7 + + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + FNMSUB f2, f20, f6, f2 + FNMSUB f3, f20, f7, f3 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f1, 2 * SIZE(BO) + STFD f5, 3 * SIZE(BO) + + STFD f2, 4 * SIZE(BO) + STFD f6, 5 * SIZE(BO) + STFD f3, 6 * SIZE(BO) + STFD f7, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f4, 4 * SIZE(AO) + STFD f5, 5 * SIZE(AO) + STFD f6, 6 * SIZE(AO) + STFD f7, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ LL(41) + .align 4 + +LL(50): + andi. I, M, 2 + ble LL(60) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. 
r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(55) + .align 5 + +LL(52): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f16, f21, f2 + FMADD f3, f17, f21, f3 + + FMADD f4, f18, f22, f4 + FMADD f5, f19, f22, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f16, f24, f0 + FMADD f1, f17, f24, f1 + FMADD f2, f16, f25, f2 + FMADD f3, f17, f25, f3 + + FMADD f4, f18, f26, f4 + FMADD f5, f19, f26, f5 + FMADD f6, f18, f27, f6 + FMADD f7, f19, f27, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + dcbtst AO, PREA + bdnz LL(52) + .align 4 + +LL(55): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(58) + .align 4 + +LL(56): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f16, f21, f2 + FMADD f3, f17, f21, f3 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(56) + .align 4 + +LL(58): + FADD f0, f4, f0 + FADD f1, f5, f1 + FADD f2, f6, f2 + FADD f3, f7, f3 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f2, f17, f2 + FSUB f1, f20, f1 + FSUB f3, f21, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f20, f2 + FSUB f3, f21, f3 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FMUL f3, f19, f3 + + FNMSUB f0, f20, f1, f0 + FNMSUB f2, f20, f3, f2 + + FMUL f0, f21, f0 + FMUL f2, f21, f2 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f2, f16, f2 + FNMSUB f1, f17, f0, f1 + FNMSUB f3, f17, f2, f3 + + LFD f17, 3 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f3, f17, f3 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + + FNMSUB f2, f17, f0, f2 + FNMSUB f3, f17, f1, f3 + FMUL f2, f18, f2 + FMUL f3, f18, f3 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f2, f19, f2 + FMUL f3, f19, f3 + FNMSUB f0, f20, f2, f0 + FNMSUB f1, f20, f3, f1 + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + 
subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f2, 1 * SIZE(BO) + STFD f1, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +LL(60): + andi. I, M, 1 + ble LL(69) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(65) + .align 5 + +LL(62): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f17, f22, f2 + FMADD f3, f17, f23, f3 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f18, f24, f0 + FMADD f1, f18, f25, f1 + FMADD f2, f19, f26, f2 + FMADD f3, f19, f27, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(62) + .align 4 + +LL(65): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(68) + .align 4 + +LL(66): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + + LFD f16, 1 * SIZE(AO) + + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(66) + .align 4 + +LL(68): + FADD f0, f2, f0 + FADD f1, f3, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f20, 1 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + FMUL f1, f18, f1 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 0 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE + addi CO2, CO2, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +LL(69): +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + lfs f0, FZERO + .align 4 + +LL(70): + andi. J, N, 1 + ble LL(999) + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + srawi. I, M, 2 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO1, LDC +#endif + ble LL(80) + .align 4 + +LL(71): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbt CO1, PREC + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + dcbt CO1, PREC + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(75) + .align 5 + +LL(72): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f21, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f21, f2 + FMADD f3, f19, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + LFD f16, 12 * SIZE(AO) + LFD f17, 13 * SIZE(AO) + LFD f18, 14 * SIZE(AO) + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f23, f0 + FMADD f1, f17, f23, f1 + FMADD f2, f18, f23, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 4 * SIZE + dcbtst AO, PREA + bdnz LL(72) + .align 4 + +LL(75): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(78) + .align 4 + +LL(76): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 1 * SIZE(BO) + + addi BO, BO, 1 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(76) + .align 4 + +LL(78): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f20, 1 * SIZE(BO) + LFD f24, 2 * SIZE(BO) + LFD f28, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 + FSUB f2, f24, f2 + FSUB f3, f28, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FNMSUB f2, f17, f3, f2 + FNMSUB f1, f18, f3, f1 + FNMSUB f0, f19, f3, f0 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f2, f16, f2 + FNMSUB f1, f17, f2, f1 + FNMSUB f0, f18, f2, f0 + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f19, f0, f3 + + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMUL f1, f17, f1 + FNMSUB f2, f18, f1, f2 + FNMSUB f3, f19, f1, f3 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMUL f2, f18, f2 + FNMSUB f3, f19, f2, f3 + + LFD f19, 15 * SIZE(AO) + + FMUL f3, f19, f3 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) 
+#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ LL(71) + .align 4 + +LL(80): + andi. I, M, 2 + ble LL(90) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(85) + .align 5 + +LL(82): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f21, f2 + FMADD f3, f19, f21, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f23, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 4 * SIZE + dcbtst AO, PREA + bdnz LL(82) + .align 4 + +LL(85): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(88) + .align 4 + +LL(86): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 1 * SIZE(BO) + + addi BO, BO, 1 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(86) + .align 4 + +LL(88): + FADD f0, f2, f0 + FADD f1, f3, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f20, 1 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + + LFD f17, 3 * SIZE(AO) + FMUL f1, f17, f1 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +LL(90): + andi. I, M, 1 + ble LL(999) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, KK, 3 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 3 + mtspr CTR, r0 +#endif + ble LL(95) + .align 5 + +LL(92): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(92) + .align 4 + +LL(95): +#if defined(LT) || defined(RN) + andi. r0, KK, 7 +#else + andi. 
r0, TEMP, 7 +#endif + mtspr CTR, r0 + ble+ LL(98) + .align 4 + +LL(96): + FMADD f0, f16, f20, f0 + LFD f16, 1 * SIZE(AO) + LFD f20, 1 * SIZE(BO) + addi BO, BO, 1 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(96) + .align 4 + +LL(98): + FADD f0, f1, f0 + FADD f2, f3, f2 + FADD f0, f2, f0 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + FSUB f0, f16, f0 +#else + LFD f16, 0 * SIZE(AO) + FSUB f0, f16, f0 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + FMUL f0, f16, f0 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + FMUL f0, f16, f0 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +LL(999): + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/trsm_kernel_power6_RT.S b/kernel/power/trsm_kernel_power6_RT.S new file mode 100644 index 0000000000..1f36d17df1 --- /dev/null +++ b/kernel/power/trsm_kernel_power6_RT.S @@ -0,0 +1,3696 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA 296(SP) +#define FZERO 304(SP) +#else +#define STACKSIZE 240 +#define ALPHA 224(SP) +#define FZERO 232(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#define OFFSET r6 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#define AORIG r18 +#define TEMP r19 +#define KK r20 +#define I r21 +#define J r22 +#define AO r23 +#define BO r24 +#define CO1 r25 +#define CO2 r26 +#define CO3 r27 +#define CO4 r28 + +#define PREA r29 +#define PREC r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) +#endif + + stw r0, FZERO + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif + + slwi LDC, LDC, BASE_SHIFT + +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 112 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || 
defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 60 + STACKSIZE(SP) +#else + lwz OFFSET, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#ifdef LN + mullw r0, M, K + slwi r0, r0, BASE_SHIFT + add A, A, r0 + + slwi r0, M, BASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, BASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + + li PREA, (16 * 3 * SIZE) + li PREC, 4 * SIZE + lfs f0, FZERO + + andi. J, N, 1 + ble LL(40) + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + srawi. I, M, 2 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO1, LDC +#endif + ble LL(80) + .align 4 + +LL(71): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbtst CO1, PREC + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + dcbtst CO1, PREC + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(75) + .align 5 + +LL(72): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f21, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f21, f2 + FMADD f3, f19, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + LFD f16, 12 * SIZE(AO) + LFD f17, 13 * SIZE(AO) + LFD f18, 14 * SIZE(AO) + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f23, f0 + FMADD f1, f17, f23, f1 + FMADD f2, f18, f23, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 4 * SIZE + dcbtst AO, PREA + bdnz LL(72) + .align 4 + +LL(75): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(78) + .align 4 + +LL(76): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 1 * SIZE(BO) + + addi BO, BO, 1 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(76) + .align 4 + +LL(78): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f20, 1 * SIZE(BO) + LFD f24, 2 * SIZE(BO) + LFD f28, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 + FSUB f2, f24, f2 + FSUB f3, f28, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FNMSUB f2, f17, f3, f2 + FNMSUB f1, f18, f3, f1 + FNMSUB f0, f19, f3, f0 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f2, f16, f2 + FNMSUB f1, f17, f2, f1 + FNMSUB f0, f18, f2, f0 + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f19, f0, f3 + + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMUL f1, f17, f1 + FNMSUB f2, f18, f1, f2 + FNMSUB f3, f19, f1, f3 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMUL f2, f18, f2 + FNMSUB f3, f19, f2, f3 + + LFD f19, 15 * SIZE(AO) + + FMUL f3, f19, f3 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ LL(71) + .align 4 + +LL(80): + andi. I, M, 2 + ble LL(90) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. 
r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(85) + .align 5 + +LL(82): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f21, f2 + FMADD f3, f19, f21, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f23, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 4 * SIZE + dcbt AO, PREA + bdnz LL(82) + .align 4 + +LL(85): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(88) + .align 4 + +LL(86): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 1 * SIZE(BO) + + addi BO, BO, 1 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(86) + .align 4 + +LL(88): + FADD f0, f2, f0 + FADD f1, f3, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f20, 1 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + + LFD f17, 3 * SIZE(AO) + FMUL f1, f17, f1 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +LL(90): + andi. I, M, 1 + ble LL(99) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. 
r0, KK, 3 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 3 + mtspr CTR, r0 +#endif + ble LL(95) + .align 5 + +LL(92): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f23, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(92) + .align 4 + +LL(95): +#if defined(LT) || defined(RN) + andi. r0, KK, 7 +#else + andi. r0, TEMP, 7 +#endif + mtspr CTR, r0 + ble+ LL(98) + .align 4 + +LL(96): + FMADD f0, f16, f20, f0 + LFD f16, 1 * SIZE(AO) + LFD f20, 1 * SIZE(BO) + addi BO, BO, 1 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(96) + .align 4 + +LL(98): + FADD f0, f1, f0 + FADD f2, f3, f2 + FADD f0, f2, f0 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + FSUB f0, f16, f0 +#else + LFD f16, 0 * SIZE(AO) + FSUB f0, f16, f0 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + FMUL f0, f16, f0 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + FMUL f0, f16, f0 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + + lfs f0, FZERO + +#ifndef LN + addi CO1, CO1, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +LL(99): +#ifdef LN + slwi r0, K, 0 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 1 +#endif + +#ifdef RT + subi KK, KK, 1 +#endif + .align 4 + +LL(40): + andi. J, N, 2 + ble LL(09) + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. 
I, M, 2 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO2, LDC +#endif + ble LL(50) + .align 4 + +LL(41): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbtst CO1, PREC + dcbtst CO2, PREC + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + dcbtst CO1, PREC + dcbtst CO2, PREC + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(45) + .align 5 + +LL(42): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + FMADD f4, f16, f23, f4 + FMADD f5, f17, f23, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 12 * SIZE(AO) + LFD f17, 13 * SIZE(AO) + LFD f18, 14 * SIZE(AO) + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f22, f0 + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + + FMADD f4, f16, f23, f4 + FMADD f5, f17, f23, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 8 * SIZE + dcbtst AO, PREA + bdnz LL(42) + .align 4 + +LL(45): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(48) + .align 4 + +LL(46): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(46) + .align 4 + +LL(48): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f28, 6 * SIZE(BO) + LFD f29, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f1, f20, f1 + FSUB f5, f21, f5 + + FSUB f2, f24, f2 + FSUB f6, f25, f6 + FSUB f3, f28, f3 + FSUB f7, f29, f7 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f4, f20, f4 + FSUB f5, f21, f5 + FSUB f6, f22, f6 + FSUB f7, f23, f7 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FMUL f7, f16, f7 + FNMSUB f2, f17, f3, f2 + FNMSUB f6, f17, f7, f6 + FNMSUB f1, f18, f3, f1 + FNMSUB f5, f18, f7, f5 + FNMSUB f0, f19, f3, f0 + FNMSUB f4, f19, f7, f4 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f2, f16, f2 + FMUL f6, f16, f6 + FNMSUB f1, f17, f2, f1 + FNMSUB f5, f17, f6, f5 + FNMSUB f0, f18, f2, f0 + FNMSUB f4, f18, f6, f4 + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FMUL f0, f21, f0 + FMUL f4, f21, f4 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + + FNMSUB f2, f18, f0, f2 + FNMSUB f6, f18, f4, f6 + FNMSUB f3, f19, f0, f3 + FNMSUB f7, f19, f4, f7 + + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f5, f17, f5 + + FNMSUB f2, f18, f1, f2 + FNMSUB f6, f18, f5, f6 + + FNMSUB f3, f19, f1, f3 + FNMSUB f7, f19, f5, f7 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMUL f2, f18, f2 + FMUL f6, f18, f6 + + FNMSUB f3, f19, f2, f3 + FNMSUB f7, f19, f6, f7 + + LFD f19, 15 * SIZE(AO) + + FMUL f3, f19, f3 + FMUL f7, f19, f7 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 + + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f6, f17, f2, f6 + FNMSUB f7, f17, f3, f7 + + FMUL f4, f18, f4 + FMUL f5, f18, f5 + FMUL f6, f18, f6 + FMUL f7, f18, f7 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FMUL f6, f19, f6 + FMUL f7, f19, f7 + + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + FNMSUB f2, f20, f6, f2 + FNMSUB f3, f20, f7, f3 + + FMUL f0, 
f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f1, 2 * SIZE(BO) + STFD f5, 3 * SIZE(BO) + + STFD f2, 4 * SIZE(BO) + STFD f6, 5 * SIZE(BO) + STFD f3, 6 * SIZE(BO) + STFD f7, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f4, 4 * SIZE(AO) + STFD f5, 5 * SIZE(AO) + STFD f6, 6 * SIZE(AO) + STFD f7, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ LL(41) + .align 4 + +LL(50): + andi. I, M, 2 + ble LL(60) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(55) + .align 5 + +LL(52): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f16, f21, f2 + FMADD f3, f17, f21, f3 + + FMADD f4, f18, f22, f4 + FMADD f5, f19, f22, f5 + FMADD f6, f18, f23, f6 + FMADD f7, f19, f23, f7 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f16, f24, f0 + FMADD f1, f17, f24, f1 + FMADD f2, f16, f25, f2 + FMADD f3, f17, f25, f3 + + FMADD f4, f18, f26, f4 + FMADD f5, f19, f26, f5 + FMADD f6, f18, f27, f6 + FMADD f7, f19, f27, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + dcbt AO, PREA + bdnz LL(52) + .align 4 + +LL(55): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(58) + .align 4 + +LL(56): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f16, f21, f2 + FMADD f3, f17, f21, f3 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(56) + .align 4 + +LL(58): + FADD f0, f4, f0 + FADD f1, f5, f1 + FADD f2, f6, f2 + FADD f3, f7, f3 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f2, f17, f2 + FSUB f1, f20, f1 + FSUB f3, f21, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f20, f2 + FSUB f3, f21, f3 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FMUL f3, f19, f3 + + FNMSUB f0, f20, f1, f0 + FNMSUB f2, f20, f3, f2 + + FMUL f0, f21, f0 + FMUL f2, f21, f2 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f2, f16, f2 + FNMSUB f1, f17, f0, f1 + FNMSUB f3, f17, f2, f3 + + LFD f17, 3 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f3, f17, f3 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + + FNMSUB f2, f17, f0, f2 + FNMSUB f3, f17, f1, f3 + FMUL f2, f18, f2 + FMUL f3, f18, f3 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f2, f19, f2 + FMUL f3, f19, f3 + FNMSUB f0, f20, f2, f0 + FNMSUB f1, f20, f3, f1 + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f2, 1 * SIZE(BO) + STFD f1, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +LL(60): + andi. I, M, 1 + ble LL(69) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. 
r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(65) + .align 5 + +LL(62): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f17, f22, f2 + FMADD f3, f17, f23, f3 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f18, f24, f0 + FMADD f1, f18, f25, f1 + FMADD f2, f19, f26, f2 + FMADD f3, f19, f27, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(62) + .align 4 + +LL(65): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(68) + .align 4 + +LL(66): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + + LFD f16, 1 * SIZE(AO) + + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(66) + .align 4 + +LL(68): + FADD f0, f2, f0 + FADD f1, f3, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f20, 1 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + FMUL f1, f18, f1 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 0 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE + addi CO2, CO2, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +LL(69): +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + lfs f0, FZERO + .align 4 + +LL(09): + srawi. 
J, N, 2 + ble LL(999) + .align 4 + +LL(10): + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 2 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + srawi. I, M, 2 + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO4, LDC +#endif + ble LL(20) + .align 4 + +LL(11): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbtst CO1, PREC + dcbtst CO2, PREC + dcbtst CO3, PREC + dcbtst CO4, PREC + + srawi. r0, KK, 3 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + dcbtst CO1, PREC + dcbtst CO2, PREC + dcbtst CO3, PREC + dcbtst CO4, PREC + + srawi. r0, TEMP, 3 + mtspr CTR, r0 +#endif + ble LL(15) + .align 4 + +LL(12): + dcbt AO, PREA + dcbtst BO, PREA + + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f24, 4 * SIZE(AO) + LFD f28, 4 * SIZE(BO) + LFD f25, 5 * SIZE(AO) + LFD f29, 5 * SIZE(BO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + LFD f26, 6 * SIZE(AO) + LFD f30, 6 * SIZE(BO) + LFD f27, 7 * SIZE(AO) + LFD f31, 7 * SIZE(BO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + + LFD f16, 8 * SIZE(AO) + LFD f20, 8 * SIZE(BO) + LFD f17, 9 * SIZE(AO) + LFD f21, 9 * SIZE(BO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + + FMADD f2, f26, f28, f2 + FMADD f6, f26, f29, f6 + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + + LFD f18, 10 * SIZE(AO) + LFD f22, 10 * SIZE(BO) + LFD f19, 11 * SIZE(AO) + LFD f23, 11 * SIZE(BO) + + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f24, 12 * SIZE(AO) + LFD f28, 12 * SIZE(BO) + LFD f25, 13 * SIZE(AO) + LFD f29, 13 * SIZE(BO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + LFD f26, 14 * SIZE(AO) + LFD f30, 14 * SIZE(BO) + LFD f27, 15 * SIZE(AO) + LFD f31, 15 * SIZE(BO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + + LFD f16, 16 
* SIZE(AO) + LFD f20, 16 * SIZE(BO) + LFD f17, 17 * SIZE(AO) + LFD f21, 17 * SIZE(BO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + + FMADD f2, f26, f28, f2 + FMADD f6, f26, f29, f6 + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + + LFD f18, 18 * SIZE(AO) + LFD f22, 18 * SIZE(BO) + LFD f19, 19 * SIZE(AO) + LFD f23, 19 * SIZE(BO) + + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f24, 20 * SIZE(AO) + LFD f28, 20 * SIZE(BO) + LFD f25, 21 * SIZE(AO) + LFD f29, 21 * SIZE(BO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + LFD f26, 22 * SIZE(AO) + LFD f30, 22 * SIZE(BO) + LFD f27, 23 * SIZE(AO) + LFD f31, 23 * SIZE(BO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + + LFD f16, 24 * SIZE(AO) + LFD f20, 24 * SIZE(BO) + LFD f17, 25 * SIZE(AO) + LFD f21, 25 * SIZE(BO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + + FMADD f2, f26, f28, f2 + FMADD f6, f26, f29, f6 + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + + LFD f18, 26 * SIZE(AO) + LFD f22, 26 * SIZE(BO) + LFD f19, 27 * SIZE(AO) + LFD f23, 27 * SIZE(BO) + + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f24, 28 * SIZE(AO) + LFD f28, 28 * SIZE(BO) + LFD f25, 29 * SIZE(AO) + LFD f29, 29 * SIZE(BO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + LFD f26, 30 * SIZE(AO) + LFD f30, 30 * SIZE(BO) + LFD f27, 31 * SIZE(AO) + LFD f31, 31 * SIZE(BO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + + LFD f16, 32 * SIZE(AO) + LFD f20, 32 * SIZE(BO) + LFD f17, 33 * SIZE(AO) + LFD f21, 33 * SIZE(BO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + + FMADD f2, f26, f28, f2 + FMADD f6, f26, f29, f6 + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + + LFD f18, 34 * SIZE(AO) + LFD f22, 34 * SIZE(BO) + LFD f19, 35 * SIZE(AO) + LFD f23, 35 * SIZE(BO) + + addi AO, AO, 32 * SIZE + addi BO, BO, 32 * SIZE + + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + + bdnz LL(12) + .align 4 + +LL(15): +#if defined(LT) || defined(RN) + andi. r0, KK, 7 +#else + andi. 
r0, TEMP, 7 +#endif + mtspr CTR, r0 + ble+ LL(18) + .align 4 + +LL(16): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(16) + .align 4 + +LL(18): +#if defined(LN) || defined(RT) + subi r0, KK, 4 + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + LFD f24, 8 * SIZE(BO) + LFD f25, 9 * SIZE(BO) + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 + + FSUB f1, f20, f1 + FSUB f5, f21, f5 + FSUB f9, f22, f9 + FSUB f13, f23, f13 + + FSUB f2, f24, f2 + FSUB f6, f25, f6 + FSUB f10, f26, f10 + FSUB f14, f27, f14 + + FSUB f3, f28, f3 + FSUB f7, f29, f7 + FSUB f11, f30, f11 + FSUB f15, f31, f15 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + LFD f24, 8 * SIZE(AO) + LFD f25, 9 * SIZE(AO) + LFD f26, 10 * SIZE(AO) + LFD f27, 11 * SIZE(AO) + + LFD f28, 12 * SIZE(AO) + LFD f29, 13 * SIZE(AO) + LFD f30, 14 * SIZE(AO) + LFD f31, 15 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f4, f20, f4 + FSUB f5, f21, f5 + FSUB f6, f22, f6 + FSUB f7, f23, f7 + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f10, f26, f10 + FSUB f11, f27, f11 + + FSUB f12, f28, f12 + FSUB f13, f29, f13 + FSUB f14, f30, f14 + FSUB f15, f31, f15 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FMUL f7, f16, f7 + FMUL f11, f16, f11 + FMUL f15, f16, f15 + + FNMSUB f2, f17, f3, f2 + FNMSUB f6, f17, f7, f6 + FNMSUB f10, f17, f11, f10 + FNMSUB f14, f17, f15, f14 + + FNMSUB f1, f18, f3, f1 + FNMSUB f5, f18, f7, f5 + FNMSUB f9, f18, f11, f9 + FNMSUB f13, f18, f15, f13 + + FNMSUB f0, f19, f3, f0 + FNMSUB f4, f19, f7, f4 + FNMSUB f8, f19, f11, f8 + FNMSUB f12, f19, f15, f12 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + FMUL f2, f16, f2 + FMUL f6, f16, f6 + FMUL f10, f16, f10 + FMUL f14, f16, f14 + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FNMSUB f1, f17, f2, f1 + FNMSUB f5, f17, f6, f5 + FNMSUB f9, f17, f10, f9 + FNMSUB f13, f17, f14, f13 + + FNMSUB f0, f18, f2, f0 + FNMSUB f4, f18, f6, f4 + FNMSUB f8, f18, f10, f8 + FNMSUB f12, f18, f14, f12 + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FMUL f9, f19, f9 + FMUL f13, f19, f13 + + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FNMSUB f8, f20, f9, f8 + FNMSUB f12, f20, f13, f12 + + FMUL f0, f21, f0 + FMUL 
f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 + + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + FNMSUB f9, f17, f8, f9 + FNMSUB f13, f17, f12, f13 + + FNMSUB f2, f18, f0, f2 + FNMSUB f6, f18, f4, f6 + FNMSUB f10, f18, f8, f10 + FNMSUB f14, f18, f12, f14 + + FNMSUB f3, f19, f0, f3 + FNMSUB f7, f19, f4, f7 + FNMSUB f11, f19, f8, f11 + FNMSUB f15, f19, f12, f15 + + LFD f16, 5 * SIZE(AO) + LFD f17, 6 * SIZE(AO) + LFD f18, 7 * SIZE(AO) + LFD f19, 10 * SIZE(AO) + + FMUL f1, f16, f1 + FMUL f5, f16, f5 + FMUL f9, f16, f9 + FMUL f13, f16, f13 + + LFD f20, 11 * SIZE(AO) + LFD f21, 15 * SIZE(AO) + + FNMSUB f2, f17, f1, f2 + FNMSUB f6, f17, f5, f6 + FNMSUB f10, f17, f9, f10 + FNMSUB f14, f17, f13, f14 + + FNMSUB f3, f18, f1, f3 + FNMSUB f7, f18, f5, f7 + FNMSUB f11, f18, f9, f11 + FNMSUB f15, f18, f13, f15 + + FMUL f2, f19, f2 + FMUL f6, f19, f6 + FMUL f10, f19, f10 + FMUL f14, f19, f14 + + FNMSUB f3, f20, f2, f3 + FNMSUB f7, f20, f6, f7 + FNMSUB f11, f20, f10, f11 + FNMSUB f15, f20, f14, f15 + + FMUL f3, f21, f3 + FMUL f7, f21, f7 + FMUL f11, f21, f11 + FMUL f15, f21, f15 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 + + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f6, f17, f2, f6 + FNMSUB f7, f17, f3, f7 + + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f10, f18, f2, f10 + FNMSUB f11, f18, f3, f11 + + FNMSUB f12, f19, f0, f12 + FNMSUB f13, f19, f1, f13 + FNMSUB f14, f19, f2, f14 + FNMSUB f15, f19, f3, f15 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + FMUL f4, f16, f4 + FMUL f5, f16, f5 + FMUL f6, f16, f6 + FMUL f7, f16, f7 + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FNMSUB f8, f17, f4, f8 + FNMSUB f9, f17, f5, f9 + FNMSUB f10, f17, f6, f10 + FNMSUB f11, f17, f7, f11 + + FNMSUB f12, f18, f4, f12 + FNMSUB f13, f18, f5, f13 + FNMSUB f14, f18, f6, f14 + FNMSUB f15, f18, f7, f15 + + FMUL f8, f19, f8 + FMUL f9, f19, f9 + FMUL f10, f19, f10 + FMUL f11, f19, f11 + + FNMSUB f12, f20, f8, f12 + FNMSUB f13, f20, f9, f13 + FNMSUB f14, f20, f10, f14 + FNMSUB f15, f20, f11, f15 + + FMUL f12, f21, f12 + FMUL f13, f21, f13 + FMUL f14, f21, f14 + FMUL f15, f21, f15 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FMUL f13, f16, f13 + FMUL f14, f16, f14 + FMUL f15, f16, f15 + + FNMSUB f8, f17, f12, f8 + FNMSUB f9, f17, f13, f9 + FNMSUB f10, f17, f14, f10 + FNMSUB f11, f17, f15, f11 + + FNMSUB f4, f18, f12, f4 + FNMSUB f5, f18, f13, f5 + FNMSUB f6, f18, f14, f6 + FNMSUB f7, f18, f15, f7 + + FNMSUB f0, f19, f12, f0 + FNMSUB f1, f19, f13, f1 + FNMSUB f2, f19, f14, f2 + FNMSUB f3, f19, f15, f3 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + + FMUL f8, f16, f8 + FMUL f9, f16, f9 + FMUL f10, f16, f10 + FMUL f11, f16, f11 + + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FNMSUB f4, f17, f8, f4 + FNMSUB f5, f17, f9, f5 + FNMSUB f6, f17, f10, f6 + FNMSUB f7, f17, f11, f7 + + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + FNMSUB f2, f18, f10, f2 + FNMSUB f3, f18, f11, f3 + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FMUL f6, f19, f6 + 
FMUL f7, f19, f7 + + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + FNMSUB f2, f20, f6, f2 + FNMSUB f3, f20, f7, f3 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE + subi CO3, CO3, 4 * SIZE + subi CO4, CO4, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) + + STFD f1, 4 * SIZE(BO) + STFD f5, 5 * SIZE(BO) + STFD f9, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) + + STFD f2, 8 * SIZE(BO) + STFD f6, 9 * SIZE(BO) + STFD f10, 10 * SIZE(BO) + STFD f14, 11 * SIZE(BO) + + STFD f3, 12 * SIZE(BO) + STFD f7, 13 * SIZE(BO) + STFD f11, 14 * SIZE(BO) + STFD f15, 15 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f4, 4 * SIZE(AO) + STFD f5, 5 * SIZE(AO) + STFD f6, 6 * SIZE(AO) + STFD f7, 7 * SIZE(AO) + + STFD f8, 8 * SIZE(AO) + STFD f9, 9 * SIZE(AO) + STFD f10, 10 * SIZE(AO) + STFD f11, 11 * SIZE(AO) + + STFD f12, 12 * SIZE(AO) + STFD f13, 13 * SIZE(AO) + STFD f14, 14 * SIZE(AO) + STFD f15, 15 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f10, 2 * SIZE(CO3) + STFD f11, 3 * SIZE(CO3) + + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + STFD f14, 2 * SIZE(CO4) + STFD f15, 3 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + addi CO3, CO3, 4 * SIZE + addi CO4, CO4, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ LL(11) + .align 4 + +LL(20): + andi. I, M, 2 + ble LL(30) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(25) + .align 5 + +LL(22): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f2, f18, f24, f2 + FMADD f3, f19, f24, f3 + FMADD f6, f18, f25, f6 + FMADD f7, f19, f25, f7 + + FMADD f10, f18, f26, f10 + FMADD f11, f19, f26, f11 + FMADD f14, f18, f27, f14 + FMADD f15, f19, f27, f15 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f2, f18, f24, f2 + FMADD f3, f19, f24, f3 + FMADD f6, f18, f25, f6 + FMADD f7, f19, f25, f7 + + FMADD f10, f18, f26, f10 + FMADD f11, f19, f26, f11 + FMADD f14, f18, f27, f14 + FMADD f15, f19, f27, f15 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 16 * SIZE + dcbtst AO, PREA + bdnz LL(22) + + fadd f0, f2, f0 + fadd f1, f3, f1 + fadd f4, f6, f4 + fadd f5, f7, f5 + fadd f8, f10, f8 + fadd f9, f11, f9 + fadd f12, f14, f12 + fadd f13, f15, f13 + .align 4 + +LL(25): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(28) + .align 4 + +LL(26): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f4, f16, f21, f4 + FMADD f5, f17, f21, f5 + + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 2 * SIZE + bdnz LL(26) + .align 4 + +LL(28): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 + + FSUB f1, f20, f1 + FSUB f5, f21, f5 + FSUB f9, f22, f9 + FSUB f13, f23, f13 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f28, 6 * SIZE(AO) + LFD f29, 7 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f4, f20, f4 + FSUB f5, f21, f5 + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f12, f28, f12 + FSUB f13, f29, f13 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FMUL f9, f19, f9 + FMUL f13, f19, f13 + + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FNMSUB f8, f20, f9, f8 + FNMSUB f12, f20, f13, f12 + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 + + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + FNMSUB f9, f17, f8, f9 + FNMSUB f13, f17, f12, f13 + + LFD f17, 3 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f5, f17, f5 + FMUL f9, f17, f9 + FMUL f13, f17, f13 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f12, f19, f0, f12 + FNMSUB f13, f19, f1, f13 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FMUL f4, f16, f4 + FMUL f5, f16, f5 + FNMSUB f8, f17, f4, f8 + FNMSUB f9, f17, f5, f9 + FNMSUB f12, f18, f4, f12 + FNMSUB f13, f18, f5, f13 + + FMUL f8, f19, f8 + FMUL f9, f19, f9 + FNMSUB f12, f20, f8, f12 + FNMSUB f13, f20, f9, f13 + FMUL f12, f21, f12 + FMUL f13, f21, f13 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FMUL f13, f16, f13 + FNMSUB f8, f17, f12, f8 + FNMSUB f9, f17, f13, f9 + FNMSUB f4, f18, f12, f4 + FNMSUB f5, f18, f13, f5 + FNMSUB f0, f19, f12, f0 + FNMSUB f1, f19, f13, f1 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f8, f16, f8 + FMUL f9, f16, f9 + FNMSUB f4, f17, f8, f4 + FNMSUB f5, f17, f9, f5 + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + + FMUL f4, f19, f4 + 
FMUL f5, f19, f5 + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE + subi CO3, CO3, 2 * SIZE + subi CO4, CO4, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) + + STFD f1, 4 * SIZE(BO) + STFD f5, 5 * SIZE(BO) + STFD f9, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f4, 2 * SIZE(AO) + STFD f5, 3 * SIZE(AO) + + STFD f8, 4 * SIZE(AO) + STFD f9, 5 * SIZE(AO) + STFD f12, 6 * SIZE(AO) + STFD f13, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + addi CO3, CO3, 2 * SIZE + addi CO4, CO4, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +LL(30): + andi. I, M, 1 + ble LL(39) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(35) + .align 5 + +LL(32): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f1, f17, f24, f1 + FMADD f5, f17, f25, f5 + FMADD f9, f17, f26, f9 + FMADD f13, f17, f27, f13 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMADD f0, f18, f20, f0 + FMADD f4, f18, f21, f4 + FMADD f8, f18, f22, f8 + FMADD f12, f18, f23, f12 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f1, f19, f24, f1 + FMADD f5, f19, f25, f5 + FMADD f9, f19, f26, f9 + FMADD f13, f19, f27, f13 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 16 * SIZE + dcbtst AO, PREA + bdnz LL(32) + + fadd f0, f1, f0 + fadd f4, f5, f4 + fadd f8, f9, f8 + fadd f12, f13, f12 + .align 4 + +LL(35): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ LL(38) + .align 4 + +LL(36): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f16, 1 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 1 * SIZE + bdnz LL(36) + .align 4 + +LL(38): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 +#else + LFD f16, 0 * SIZE(AO) + LFD f20, 1 * SIZE(AO) + LFD f24, 2 * SIZE(AO) + LFD f28, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f4, f20, f4 + FSUB f8, f24, f8 + FSUB f12, f28, f12 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FNMSUB f4, f17, f0, f4 + FNMSUB f8, f18, f0, f8 + FNMSUB f12, f19, f0, f12 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FMUL f4, f16, f4 + FNMSUB f8, f17, f4, f8 + FNMSUB f12, f18, f4, f12 + FMUL f8, f19, f8 + FNMSUB f12, f20, f8, f12 + FMUL f12, f21, f12 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FNMSUB f8, f17, f12, f8 + FNMSUB f4, f18, f12, f4 + FNMSUB f0, f19, f12, f0 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + + FMUL f8, f16, f8 + + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FNMSUB f4, f17, f8, f4 + FNMSUB f0, f18, f8, f0 + + FMUL f4, f19, f4 + FNMSUB f0, f20, f4, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE + 
subi CO3, CO3, 1 * SIZE + subi CO4, CO4, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f4, 1 * SIZE(AO) + STFD f8, 2 * SIZE(AO) + STFD f12, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f8, 0 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f12, f0 + fmr f13, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE + addi CO2, CO2, 1 * SIZE + addi CO3, CO3, 1 * SIZE + addi CO4, CO4, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +LL(39): +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 4 +#endif + +#ifdef RT + subi KK, KK, 4 +#endif + + addic. J, J, -1 + lfs f0, FZERO + bgt LL(10) + .align 4 + +LL(999): + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/trsm_kernel_ppc440_LN.S b/kernel/power/trsm_kernel_ppc440_LN.S new file mode 100644 index 0000000000..43354c690b --- /dev/null +++ b/kernel/power/trsm_kernel_ppc440_LN.S @@ -0,0 +1,3487 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA 296(SP) +#define FZERO 304(SP) +#else +#define STACKSIZE 240 +#define ALPHA 224(SP) +#define FZERO 232(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#define OFFSET r6 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#define AORIG r18 +#define TEMP r19 +#define KK r20 +#define I r21 +#define J r22 +#define AO r23 +#define BO r24 +#define CO1 r25 +#define CO2 r26 +#define CO3 r27 +#define CO4 r28 + +#define A1 f16 +#define A2 f17 +#define A3 f18 +#define A4 f19 +#define A5 f20 +#define A6 f21 +#define B1 f22 +#define B2 f23 +#define B3 f24 +#define B4 f25 +#define B5 f26 +#define B6 f27 +#define B7 f28 +#define B8 f29 +#define B9 f30 +#define B10 f31 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) +#endif + + stw r0, FZERO + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif + + slwi LDC, LDC, BASE_SHIFT + +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 112 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + 
lwz OFFSET, 60 + STACKSIZE(SP) +#else + lwz OFFSET, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#ifdef LN + mullw r0, M, K + slwi r0, r0, BASE_SHIFT + add A, A, r0 + + slwi r0, M, BASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, BASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + cmpwi cr0, M, 0 + ble .L999 + cmpwi cr0, N, 0 + ble .L999 + cmpwi cr0, K, 0 + ble .L999 + + lfs f0, FZERO + + srawi. J, N, 2 + ble .L40 + .align 4 + +.L10: + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 2 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO4, LDC +#endif + +.L30: + andi. I, M, 1 + ble .L20 + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L35 + .align 5 + +.L32: + FMADD f0, f16, f20, f0 + LFD f20, 8 * SIZE(BO) + FMADD f4, f16, f21, f4 + LFD f21, 9 * SIZE(BO) + FMADD f8, f16, f22, f8 + LFD f22, 10 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFD f23, 11 * SIZE(BO) + LFDU f16, 4 * SIZE(AO) + + FMADD f1, f17, f24, f1 + LFD f24, 12 * SIZE(BO) + FMADD f5, f17, f25, f5 + LFD f25, 13 * SIZE(BO) + FMADD f9, f17, f26, f9 + LFD f26, 14 * SIZE(BO) + FMADD f13, f17, f27, f13 + LFD f27, 15 * SIZE(BO) + LFD f17, 1 * SIZE(AO) + + FMADD f0, f18, f20, f0 + LFDU f20, 16 * SIZE(BO) + FMADD f4, f18, f21, f4 + LFD f21, 1 * SIZE(BO) + FMADD f8, f18, f22, f8 + LFD f22, 2 * SIZE(BO) + FMADD f12, f18, f23, f12 + LFD f23, 3 * SIZE(BO) + LFD f18, 2 * SIZE(AO) + + FMADD f1, f19, f24, f1 + LFD f24, 4 * SIZE(BO) + FMADD f5, f19, f25, f5 + LFD f25, 5 * SIZE(BO) + FMADD f9, f19, f26, f9 + LFD f26, 6 * SIZE(BO) + FMADD f13, f19, f27, f13 + LFD f27, 7 * SIZE(BO) + LFD f19, 3 * SIZE(AO) + bdnz .L32 + + fadd f0, f1, f0 + fadd f4, f5, f4 + fadd f8, f9, f8 + fadd f12, f13, f12 + .align 4 + +.L35: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ .L38 + .align 4 + +.L36: + FMADD f0, f16, f20, f0 + LFDU f20, 4 * SIZE(BO) + FMADD f4, f16, f21, f4 + LFD f21, 1 * SIZE(BO) + FMADD f8, f16, f22, f8 + LFD f22, 2 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFDU f16, 1 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + bdnz .L36 + .align 4 + +.L38: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 +#else + LFD f16, 0 * SIZE(AO) + LFD f20, 1 * SIZE(AO) + LFD f24, 2 * SIZE(AO) + LFD f28, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f4, f20, f4 + FSUB f8, f24, f8 + FSUB f12, f28, f12 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FNMSUB f4, f17, f0, f4 + FNMSUB f8, f18, f0, f8 + FNMSUB f12, f19, f0, f12 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FMUL f4, f16, f4 + FNMSUB f8, f17, f4, f8 + FNMSUB f12, f18, f4, f12 + FMUL f8, f19, f8 + FNMSUB f12, f20, f8, f12 + FMUL f12, f21, f12 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FNMSUB f8, f17, f12, f8 + FNMSUB f4, f18, f12, f4 + FNMSUB f0, f19, f12, f0 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + + FMUL f8, f16, f8 + + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FNMSUB f4, f17, f8, f4 + FNMSUB f0, f18, f8, f0 + + FMUL f4, f19, f4 + FNMSUB f0, f20, f4, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE + subi CO3, CO3, 1 * SIZE + subi CO4, CO4, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f4, 1 * SIZE(AO) + STFD f8, 2 * SIZE(AO) + STFD f12, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f8, 0 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f12, f0 + fmr f13, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE + addi CO2, CO2, 1 * SIZE + addi CO3, CO3, 1 * SIZE + addi CO4, CO4, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +.L20: + andi. 
I, M, 2 + ble .L09 + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L25 + .align 5 + +.L22: + FMADD f0, f16, f20, f0 + nop + FMADD f1, f17, f20, f1 + LFD f20, 8 * SIZE(BO) + FMADD f4, f16, f21, f4 + nop + FMADD f5, f17, f21, f5 + LFD f21, 9 * SIZE(BO) + + FMADD f8, f16, f22, f8 + nop + FMADD f9, f17, f22, f9 + LFD f22, 10 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFD f16, 4 * SIZE(AO) + FMADD f13, f17, f23, f13 + LFD f23, 11 * SIZE(BO) + + FMADD f2, f18, f24, f2 + LFD f17, 5 * SIZE(AO) + FMADD f3, f19, f24, f3 + LFD f24, 12 * SIZE(BO) + FMADD f6, f18, f25, f6 + nop + FMADD f7, f19, f25, f7 + LFD f25, 13 * SIZE(BO) + + FMADD f10, f18, f26, f10 + nop + FMADD f11, f19, f26, f11 + LFD f26, 14 * SIZE(BO) + FMADD f14, f18, f27, f14 + LFD f18, 6 * SIZE(AO) + FMADD f15, f19, f27, f15 + LFD f27, 15 * SIZE(BO) + + FMADD f0, f16, f20, f0 + LFD f19, 7 * SIZE(AO) + FMADD f1, f17, f20, f1 + LFDU f20, 16 * SIZE(BO) + FMADD f4, f16, f21, f4 + nop + FMADD f5, f17, f21, f5 + LFD f21, 1 * SIZE(BO) + + FMADD f8, f16, f22, f8 + nop + FMADD f9, f17, f22, f9 + LFD f22, 2 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFDU f16, 8 * SIZE(AO) + FMADD f13, f17, f23, f13 + LFD f23, 3 * SIZE(BO) + + FMADD f2, f18, f24, f2 + LFD f17, 1 * SIZE(AO) + FMADD f3, f19, f24, f3 + LFD f24, 4 * SIZE(BO) + FMADD f6, f18, f25, f6 + nop + FMADD f7, f19, f25, f7 + LFD f25, 5 * SIZE(BO) + + FMADD f10, f18, f26, f10 + nop + FMADD f11, f19, f26, f11 + LFD f26, 6 * SIZE(BO) + FMADD f14, f18, f27, f14 + LFD f18, 2 * SIZE(AO) + FMADD f15, f19, f27, f15 + LFD f19, 3 * SIZE(AO) + LFD f27, 7 * SIZE(BO) + bdnz .L22 + + fadd f0, f2, f0 + fadd f1, f3, f1 + fadd f4, f6, f4 + fadd f5, f7, f5 + fadd f8, f10, f8 + fadd f9, f11, f9 + fadd f12, f14, f12 + fadd f13, f15, f13 + .align 4 + +.L25: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ .L28 + .align 4 + +.L26: + FMADD f0, f16, f20, f0 + nop + FMADD f1, f17, f20, f1 + LFDU f20, 4 * SIZE(BO) + FMADD f4, f16, f21, f4 + nop + FMADD f5, f17, f21, f5 + LFD f21, 1 * SIZE(BO) + + FMADD f8, f16, f22, f8 + nop + FMADD f9, f17, f22, f9 + LFD f22, 2 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFDU f16, 2 * SIZE(AO) + FMADD f13, f17, f23, f13 + LFD f17, 1 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + bdnz .L26 + .align 4 + +.L28: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 + + FSUB f1, f20, f1 + FSUB f5, f21, f5 + FSUB f9, f22, f9 + FSUB f13, f23, f13 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f28, 6 * SIZE(AO) + LFD f29, 7 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f4, f20, f4 + FSUB f5, f21, f5 + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f12, f28, f12 + FSUB f13, f29, f13 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FMUL f9, f19, f9 + FMUL f13, f19, f13 + + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FNMSUB f8, f20, f9, f8 + FNMSUB f12, f20, f13, f12 + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 + + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + FNMSUB f9, f17, f8, f9 + FNMSUB f13, f17, f12, f13 + + LFD f17, 3 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f5, f17, f5 + FMUL f9, f17, f9 + FMUL f13, f17, f13 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f12, f19, f0, f12 + FNMSUB f13, f19, f1, f13 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FMUL f4, f16, f4 + FMUL f5, f16, f5 + FNMSUB f8, f17, f4, f8 + FNMSUB f9, f17, f5, f9 + FNMSUB f12, f18, f4, f12 + FNMSUB f13, f18, f5, f13 + + FMUL f8, f19, f8 + FMUL f9, f19, f9 + FNMSUB f12, f20, f8, f12 + FNMSUB f13, f20, f9, f13 + FMUL f12, f21, f12 + FMUL f13, f21, f13 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FMUL f13, f16, f13 + FNMSUB f8, f17, f12, f8 + FNMSUB f9, f17, f13, f9 + FNMSUB f4, f18, f12, f4 + FNMSUB f5, f18, f13, f5 + FNMSUB f0, f19, f12, f0 + FNMSUB f1, f19, f13, f1 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f8, f16, f8 + FMUL f9, f16, f9 + FNMSUB f4, f17, f8, f4 + FNMSUB f5, f17, f9, f5 + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FNMSUB f0, f20, f4, f0 + 
FNMSUB f1, f20, f5, f1 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE + subi CO3, CO3, 2 * SIZE + subi CO4, CO4, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) + + STFD f1, 4 * SIZE(BO) + STFD f5, 5 * SIZE(BO) + STFD f9, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f4, 2 * SIZE(AO) + STFD f5, 3 * SIZE(AO) + + STFD f8, 4 * SIZE(AO) + STFD f9, 5 * SIZE(AO) + STFD f12, 6 * SIZE(AO) + STFD f13, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + addi CO3, CO3, 2 * SIZE + addi CO4, CO4, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +.L09: + srawi. I, M, 2 + ble .L39 + .align 4 + +.L11: +#if defined(LT) || defined(RN) + LFD A1, 0 * SIZE(AO) + LFD A2, 1 * SIZE(AO) + LFD A4, 4 * SIZE(AO) + LFD A5, 8 * SIZE(AO) + + LFD B1, 0 * SIZE(B) + LFD B2, 1 * SIZE(B) + LFD B3, 2 * SIZE(B) + LFD B4, 3 * SIZE(B) + LFD B5, 4 * SIZE(B) + LFD B6, 8 * SIZE(B) + LFD B7, 12 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + + sub TEMP, K, KK + + LFD A1, 0 * SIZE(AO) + LFD A2, 1 * SIZE(AO) + LFD A4, 4 * SIZE(AO) + LFD A5, 8 * SIZE(AO) + + LFD B1, 0 * SIZE(BO) + LFD B2, 1 * SIZE(BO) + LFD B3, 2 * SIZE(BO) + LFD B4, 3 * SIZE(BO) + LFD B5, 4 * SIZE(BO) + LFD B6, 8 * SIZE(BO) + LFD B7, 12 * SIZE(BO) + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L15 + .align 4 + +.L12: + FMADD f0, A1, B1, f0 + LFD A3, 2 * SIZE(AO) + FMADD f4, A1, B2, f4 + LFD A6, 12 * SIZE(AO) + FMADD f8, A1, B3, f8 + nop + FMADD f12, A1, B4, f12 + nop + + FMADD f1, A2, B1, f1 + LFD A1, 3 * SIZE(AO) + FMADD f5, A2, B2, f5 + nop + FMADD f9, A2, B3, f9 + nop + FMADD f13, A2, B4, f13 + nop + + FMADD f2, A3, B1, f2 + nop + FMADD f6, A3, B2, f6 + LFD B8, 5 * SIZE(BO) + FMADD f10, A3, B3, f10 + LFD B9, 6 * SIZE(BO) + FMADD f14, A3, B4, f14 + LFD B10, 7 * SIZE(BO) + + FMADD f3, A1, B1, f3 + LFD A2, 5 * SIZE(AO) + FMADD f7, A1, B2, f7 + LFD B1, 16 * SIZE(BO) + FMADD f11, A1, B3, f11 + nop + FMADD f15, A1, B4, f15 + nop + + FMADD f0, A4, B5, f0 + LFD A3, 6 * SIZE(AO) + FMADD f4, A4, B8, f4 + LFD A1, 16 * SIZE(AO) + FMADD f8, A4, B9, f8 + nop + FMADD f12, A4, B10, f12 + nop + + FMADD f1, A2, B5, f1 + LFD A4, 7 * SIZE(AO) + FMADD f5, A2, B8, f5 + nop + FMADD f9, A2, B9, f9 + nop + FMADD f13, A2, B10, f13 + nop + + FMADD f2, A3, B5, f2 + nop + FMADD f6, A3, B8, f6 + LFD B2, 9 * SIZE(BO) + FMADD f10, A3, B9, f10 + LFD B3, 10 * SIZE(BO) + FMADD f14, A3, B10, f14 + LFD B4, 11 * SIZE(BO) + + FMADD f3, A4, B5, f3 + LFD A2, 9 * SIZE(AO) + FMADD f7, A4, B8, f7 + LFD B5, 20 * SIZE(BO) + FMADD f11, A4, B9, f11 + nop + FMADD f15, A4, B10, f15 + nop + + FMADD f0, A5, B6, f0 + LFD A3, 10 * SIZE(AO) + FMADD f4, A5, B2, f4 + LFD A4, 20 * SIZE(AO) + FMADD f8, A5, B3, f8 + nop + FMADD f12, A5, B4, f12 + nop + + FMADD f1, A2, B6, f1 + LFD A5, 11 * SIZE(AO) + FMADD f5, A2, B2, f5 + nop + FMADD f9, A2, B3, f9 + nop + FMADD f13, A2, B4, f13 + nop + + FMADD f2, A3, B6, f2 + nop + FMADD f6, A3, B2, f6 + LFD B8, 13 * SIZE(BO) + FMADD f10, A3, B3, f10 + LFD B9, 14 * SIZE(BO) + FMADD f14, A3, B4, f14 + LFD B10,15 * SIZE(BO) + + FMADD f3, A5, B6, f3 + LFD A2, 13 * SIZE(AO) + FMADD f7, A5, B2, f7 + LFD B6, 24 * SIZE(BO) + FMADD f11, A5, B3, f11 + nop + FMADD f15, A5, B4, f15 + nop + + + FMADD f0, A6, B7, f0 + LFD A3, 14 * SIZE(AO) + FMADD f4, A6, B8, f4 + LFD A5, 24 * SIZE(AO) + FMADD f8, A6, B9, f8 + nop + FMADD f12, A6, B10, f12 + nop + + FMADD f1, A2, B7, f1 + LFD A6, 15 * SIZE(AO) + FMADD f5, A2, B8, f5 + nop + FMADD f9, A2, B9, f9 + nop + FMADD f13, A2, B10, f13 + nop + + FMADD f2, A3, B7, f2 + addi AO, AO, 16 * SIZE + FMADD f6, A3, B8, f6 + LFD B2, 17 * SIZE(BO) + FMADD f10, A3, B9, f10 + LFD B3, 18 * SIZE(BO) + FMADD f14, A3, B10, f14 + LFD B4, 19 * SIZE(BO) + + FMADD f3, A6, B7, f3 + LFD A2, 1 * SIZE(AO) + FMADD f7, A6, B8, f7 + LFD B7, 28 * SIZE(BO) + FMADD f11, A6, B9, f11 + addi BO, BO, 16 * SIZE + FMADD f15, A6, B10, f15 + bdnz .L12 + .align 4 + +.L15: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ .L18 + .align 4 + +.L16: + FMADD f0, A1, B1, f0 + LFD A3, 2 * SIZE(AO) + FMADD f4, A1, B2, f4 + FMADD f8, A1, B3, f8 + FMADD f12, A1, B4, f12 + LFD A4, 3 * SIZE(AO) + + FMADD f1, A2, B1, f1 + FMADD f5, A2, B2, f5 + FMADD f9, A2, B3, f9 + FMADD f13, A2, B4, f13 + LFDU A1, 4 * SIZE(AO) + + FMADD f2, A3, B1, f2 + FMADD f6, A3, B2, f6 + FMADD f10, A3, B3, f10 + FMADD f14, A3, B4, f14 + LFD A2, 1 * SIZE(AO) + + FMADD f3, A4, B1, f3 + LFDU B1, 4 * SIZE(BO) + FMADD f7, A4, B2, f7 + LFD B2, 1 * SIZE(BO) + FMADD f11, A4, B3, f11 + LFD B3, 2 * SIZE(BO) + FMADD f15, A4, B4, f15 + LFD B4, 3 * SIZE(BO) + bdnz .L16 + .align 4 + +.L18: +#if defined(LN) || defined(RT) + subi r0, KK, 4 + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + LFD f24, 8 * SIZE(BO) + LFD f25, 9 * SIZE(BO) + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 + + FSUB f1, f20, f1 + FSUB f5, f21, f5 + FSUB f9, f22, f9 + FSUB f13, f23, f13 + + FSUB f2, f24, f2 + FSUB f6, f25, f6 + FSUB f10, f26, f10 + FSUB f14, f27, f14 + + FSUB f3, f28, f3 + FSUB f7, f29, f7 + FSUB f11, f30, f11 + FSUB f15, f31, f15 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + LFD f24, 8 * SIZE(AO) + LFD f25, 9 * SIZE(AO) + LFD f26, 10 * SIZE(AO) + LFD f27, 11 * SIZE(AO) + + LFD f28, 12 * SIZE(AO) + LFD f29, 13 * SIZE(AO) + LFD f30, 14 * SIZE(AO) + LFD f31, 15 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f4, f20, f4 + FSUB f5, f21, f5 + FSUB f6, f22, f6 + FSUB f7, f23, f7 + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f10, f26, f10 + FSUB f11, f27, f11 + + FSUB f12, f28, f12 + FSUB f13, f29, f13 + FSUB f14, f30, f14 + FSUB f15, f31, f15 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FMUL f7, f16, f7 + FMUL f11, f16, f11 + FMUL f15, f16, f15 + + FNMSUB f2, f17, f3, f2 + FNMSUB f6, f17, f7, f6 + FNMSUB f10, f17, f11, f10 + FNMSUB f14, f17, f15, f14 + + FNMSUB f1, f18, f3, f1 + FNMSUB f5, f18, f7, f5 + FNMSUB f9, f18, f11, f9 + FNMSUB f13, f18, f15, f13 + + FNMSUB f0, f19, f3, f0 + FNMSUB f4, f19, f7, f4 + FNMSUB f8, f19, f11, f8 + FNMSUB f12, f19, f15, f12 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + FMUL f2, f16, f2 + FMUL f6, f16, f6 + FMUL f10, f16, f10 + FMUL f14, f16, f14 + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FNMSUB f1, f17, f2, f1 + FNMSUB f5, f17, f6, f5 + FNMSUB f9, f17, f10, f9 + FNMSUB f13, f17, f14, f13 + + FNMSUB f0, f18, f2, f0 + FNMSUB f4, f18, f6, f4 + FNMSUB f8, f18, f10, f8 + FNMSUB f12, f18, f14, f12 + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FMUL f9, f19, f9 + FMUL f13, f19, f13 + + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FNMSUB f8, f20, f9, f8 + FNMSUB f12, f20, f13, f12 + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + 
LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 + + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + FNMSUB f9, f17, f8, f9 + FNMSUB f13, f17, f12, f13 + + FNMSUB f2, f18, f0, f2 + FNMSUB f6, f18, f4, f6 + FNMSUB f10, f18, f8, f10 + FNMSUB f14, f18, f12, f14 + + FNMSUB f3, f19, f0, f3 + FNMSUB f7, f19, f4, f7 + FNMSUB f11, f19, f8, f11 + FNMSUB f15, f19, f12, f15 + + LFD f16, 5 * SIZE(AO) + LFD f17, 6 * SIZE(AO) + LFD f18, 7 * SIZE(AO) + LFD f19, 10 * SIZE(AO) + + FMUL f1, f16, f1 + FMUL f5, f16, f5 + FMUL f9, f16, f9 + FMUL f13, f16, f13 + + LFD f20, 11 * SIZE(AO) + LFD f21, 15 * SIZE(AO) + + FNMSUB f2, f17, f1, f2 + FNMSUB f6, f17, f5, f6 + FNMSUB f10, f17, f9, f10 + FNMSUB f14, f17, f13, f14 + + FNMSUB f3, f18, f1, f3 + FNMSUB f7, f18, f5, f7 + FNMSUB f11, f18, f9, f11 + FNMSUB f15, f18, f13, f15 + + FMUL f2, f19, f2 + FMUL f6, f19, f6 + FMUL f10, f19, f10 + FMUL f14, f19, f14 + + FNMSUB f3, f20, f2, f3 + FNMSUB f7, f20, f6, f7 + FNMSUB f11, f20, f10, f11 + FNMSUB f15, f20, f14, f15 + + FMUL f3, f21, f3 + FMUL f7, f21, f7 + FMUL f11, f21, f11 + FMUL f15, f21, f15 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 + + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f6, f17, f2, f6 + FNMSUB f7, f17, f3, f7 + + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f10, f18, f2, f10 + FNMSUB f11, f18, f3, f11 + + FNMSUB f12, f19, f0, f12 + FNMSUB f13, f19, f1, f13 + FNMSUB f14, f19, f2, f14 + FNMSUB f15, f19, f3, f15 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + FMUL f4, f16, f4 + FMUL f5, f16, f5 + FMUL f6, f16, f6 + FMUL f7, f16, f7 + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FNMSUB f8, f17, f4, f8 + FNMSUB f9, f17, f5, f9 + FNMSUB f10, f17, f6, f10 + FNMSUB f11, f17, f7, f11 + + FNMSUB f12, f18, f4, f12 + FNMSUB f13, f18, f5, f13 + FNMSUB f14, f18, f6, f14 + FNMSUB f15, f18, f7, f15 + + FMUL f8, f19, f8 + FMUL f9, f19, f9 + FMUL f10, f19, f10 + FMUL f11, f19, f11 + + FNMSUB f12, f20, f8, f12 + FNMSUB f13, f20, f9, f13 + FNMSUB f14, f20, f10, f14 + FNMSUB f15, f20, f11, f15 + + FMUL f12, f21, f12 + FMUL f13, f21, f13 + FMUL f14, f21, f14 + FMUL f15, f21, f15 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FMUL f13, f16, f13 + FMUL f14, f16, f14 + FMUL f15, f16, f15 + + FNMSUB f8, f17, f12, f8 + FNMSUB f9, f17, f13, f9 + FNMSUB f10, f17, f14, f10 + FNMSUB f11, f17, f15, f11 + + FNMSUB f4, f18, f12, f4 + FNMSUB f5, f18, f13, f5 + FNMSUB f6, f18, f14, f6 + FNMSUB f7, f18, f15, f7 + + FNMSUB f0, f19, f12, f0 + FNMSUB f1, f19, f13, f1 + FNMSUB f2, f19, f14, f2 + FNMSUB f3, f19, f15, f3 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + + FMUL f8, f16, f8 + FMUL f9, f16, f9 + FMUL f10, f16, f10 + FMUL f11, f16, f11 + + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FNMSUB f4, f17, f8, f4 + FNMSUB f5, f17, f9, f5 + FNMSUB f6, f17, f10, f6 + FNMSUB f7, f17, f11, f7 + + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + FNMSUB f2, f18, f10, f2 + FNMSUB f3, f18, f11, f3 + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FMUL f6, f19, f6 + FMUL f7, f19, f7 + + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + FNMSUB f2, f20, f6, f2 + 
FNMSUB f3, f20, f7, f3 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE + subi CO3, CO3, 4 * SIZE + subi CO4, CO4, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) + + STFD f1, 4 * SIZE(BO) + STFD f5, 5 * SIZE(BO) + STFD f9, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) + + STFD f2, 8 * SIZE(BO) + STFD f6, 9 * SIZE(BO) + STFD f10, 10 * SIZE(BO) + STFD f14, 11 * SIZE(BO) + + STFD f3, 12 * SIZE(BO) + STFD f7, 13 * SIZE(BO) + STFD f11, 14 * SIZE(BO) + STFD f15, 15 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f4, 4 * SIZE(AO) + STFD f5, 5 * SIZE(AO) + STFD f6, 6 * SIZE(AO) + STFD f7, 7 * SIZE(AO) + + STFD f8, 8 * SIZE(AO) + STFD f9, 9 * SIZE(AO) + STFD f10, 10 * SIZE(AO) + STFD f11, 11 * SIZE(AO) + + STFD f12, 12 * SIZE(AO) + STFD f13, 13 * SIZE(AO) + STFD f14, 14 * SIZE(AO) + STFD f15, 15 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f10, 2 * SIZE(CO3) + STFD f11, 3 * SIZE(CO3) + + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + STFD f14, 2 * SIZE(CO4) + STFD f15, 3 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + addi CO3, CO3, 4 * SIZE + addi CO4, CO4, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ .L11 + .align 4 + + +.L39: +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 4 +#endif + +#ifdef RT + subi KK, KK, 4 +#endif + + addic. J, J, -1 + lfs f0, FZERO + bgt .L10 + .align 4 + +.L40: + andi. J, N, 2 + ble .L70 + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO2, LDC +#endif + +.L60: + andi. I, M, 1 + ble .L50 + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. 
r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L65 + .align 5 + +.L62: + FMADD f0, f16, f20, f0 + LFDU f20, 8 * SIZE(BO) + FMADD f1, f16, f21, f1 + LFDU f16, 4 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + FMADD f2, f17, f22, f2 + LFD f22, 2 * SIZE(BO) + FMADD f3, f17, f23, f3 + LFD f17, 1 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + + FMADD f0, f18, f24, f0 + LFD f24, 4 * SIZE(BO) + FMADD f1, f18, f25, f1 + LFD f18, 2 * SIZE(AO) + LFD f25, 5 * SIZE(BO) + FMADD f2, f19, f26, f2 + LFD f26, 6 * SIZE(BO) + FMADD f3, f19, f27, f3 + LFD f19, 3 * SIZE(AO) + LFD f27, 7 * SIZE(BO) + bdnz .L62 + .align 4 + +.L65: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ .L68 + .align 4 + +.L66: + FMADD f0, f16, f20, f0 + LFDU f20, 2 * SIZE(BO) + FMADD f1, f16, f21, f1 + LFDU f16, 1 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + bdnz .L66 + .align 4 + +.L68: + FADD f0, f2, f0 + FADD f1, f3, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f20, 1 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + FMUL f1, f18, f1 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 0 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE + addi CO2, CO2, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +.L50: + andi. I, M, 2 + ble .L41 + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. 
r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L55 + .align 5 + +.L52: + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + LFDU f20, 8 * SIZE(BO) + FMADD f2, f16, f21, f2 + LFD f16, 4 * SIZE(AO) + FMADD f3, f17, f21, f3 + LFD f17, 5 * SIZE(AO) + + FMADD f4, f18, f22, f4 + LFD f21, 1 * SIZE(BO) + FMADD f5, f19, f22, f5 + LFD f22, 2 * SIZE(BO) + FMADD f6, f18, f23, f6 + LFD f18, 6 * SIZE(AO) + FMADD f7, f19, f23, f7 + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f24, f0 + LFD f23, 3 * SIZE(BO) + FMADD f1, f17, f24, f1 + LFD f24, 4 * SIZE(BO) + FMADD f2, f16, f25, f2 + LFDU f16, 8 * SIZE(AO) + FMADD f3, f17, f25, f3 + LFD f17, 1 * SIZE(AO) + + FMADD f4, f18, f26, f4 + LFD f25, 5 * SIZE(BO) + FMADD f5, f19, f26, f5 + LFD f26, 6 * SIZE(BO) + FMADD f6, f18, f27, f6 + LFD f18, 2 * SIZE(AO) + FMADD f7, f19, f27, f7 + LFD f19, 3 * SIZE(AO) + + LFD f27, 7 * SIZE(BO) + bdnz .L52 + .align 4 + +.L55: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ .L58 + .align 4 + +.L56: + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + LFDU f20, 2 * SIZE(BO) + FMADD f2, f16, f21, f2 + LFDU f16, 2 * SIZE(AO) + FMADD f3, f17, f21, f3 + LFD f17, 1 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + bdnz .L56 + .align 4 + +.L58: + FADD f0, f4, f0 + FADD f1, f5, f1 + FADD f2, f6, f2 + FADD f3, f7, f3 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f2, f17, f2 + FSUB f1, f20, f1 + FSUB f3, f21, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f20, f2 + FSUB f3, f21, f3 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FMUL f3, f19, f3 + + FNMSUB f0, f20, f1, f0 + FNMSUB f2, f20, f3, f2 + + FMUL f0, f21, f0 + FMUL f2, f21, f2 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f2, f16, f2 + FNMSUB f1, f17, f0, f1 + FNMSUB f3, f17, f2, f3 + + LFD f17, 3 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f3, f17, f3 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + + FNMSUB f2, f17, f0, f2 + FNMSUB f3, f17, f1, f3 + FMUL f2, f18, f2 + FMUL f3, f18, f3 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f2, f19, f2 + FMUL f3, f19, f3 + FNMSUB f0, f20, f2, f0 + FNMSUB f1, f20, f3, f1 + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f2, 1 * SIZE(BO) + STFD f1, 2 * SIZE(BO) + STFD f3, 
3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +.L41: + srawi. I, M, 2 + ble .L69 + .align 4 + +.L42: +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L45 + .align 5 + +.L43: + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + LFD f20, 4 * SIZE(BO) + + FMADD f4, f16, f21, f4 + LFD f16, 4 * SIZE(AO) + FMADD f5, f17, f21, f5 + LFD f17, 5 * SIZE(AO) + FMADD f6, f18, f21, f6 + LFD f18, 6 * SIZE(AO) + FMADD f7, f19, f21, f7 + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + LFD f21, 5 * SIZE(BO) + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + LFD f22, 6 * SIZE(BO) + + FMADD f4, f16, f23, f4 + LFD f16, 8 * SIZE(AO) + FMADD f5, f17, f23, f5 + LFD f17, 9 * SIZE(AO) + FMADD f6, f18, f23, f6 + LFD f18, 10 * SIZE(AO) + FMADD f7, f19, f23, f7 + LFD f19, 11 * SIZE(AO) + + FMADD f0, f16, f20, f0 + LFD f23, 7 * SIZE(BO) + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + LFDU f20, 8 * SIZE(BO) + + FMADD f4, f16, f21, f4 + LFD f16, 12 * SIZE(AO) + FMADD f5, f17, f21, f5 + LFD f17, 13 * SIZE(AO) + FMADD f6, f18, f21, f6 + LFD f18, 14 * SIZE(AO) + FMADD f7, f19, f21, f7 + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f22, f0 + LFD f21, 1 * SIZE(BO) + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + LFD f22, 2 * SIZE(BO) + + FMADD f4, f16, f23, f4 + LFDU f16, 16 * SIZE(AO) + FMADD f5, f17, f23, f5 + LFD f17, 1 * SIZE(AO) + FMADD f6, f18, f23, f6 + LFD f18, 2 * SIZE(AO) + FMADD f7, f19, f23, f7 + LFD f19, 3 * SIZE(AO) + + LFD f23, 3 * SIZE(BO) + bdnz .L43 + .align 4 + +.L45: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ .L48 + .align 4 + +.L46: + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + LFDU f20, 2 * SIZE(BO) + + FMADD f4, f16, f21, f4 + LFDU f16, 4 * SIZE(AO) + FMADD f5, f17, f21, f5 + LFD f17, 1 * SIZE(AO) + FMADD f6, f18, f21, f6 + LFD f18, 2 * SIZE(AO) + FMADD f7, f19, f21, f7 + LFD f19, 3 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + bdnz .L46 + .align 4 + +.L48: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f28, 6 * SIZE(BO) + LFD f29, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f1, f20, f1 + FSUB f5, f21, f5 + + FSUB f2, f24, f2 + FSUB f6, f25, f6 + FSUB f3, f28, f3 + FSUB f7, f29, f7 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f4, f20, f4 + FSUB f5, f21, f5 + FSUB f6, f22, f6 + FSUB f7, f23, f7 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FMUL f7, f16, f7 + FNMSUB f2, f17, f3, f2 + FNMSUB f6, f17, f7, f6 + FNMSUB f1, f18, f3, f1 + FNMSUB f5, f18, f7, f5 + FNMSUB f0, f19, f3, f0 + FNMSUB f4, f19, f7, f4 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f2, f16, f2 + FMUL f6, f16, f6 + FNMSUB f1, f17, f2, f1 + FNMSUB f5, f17, f6, f5 + FNMSUB f0, f18, f2, f0 + FNMSUB f4, f18, f6, f4 + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FMUL f0, f21, f0 + FMUL f4, f21, f4 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + + FNMSUB f2, f18, f0, f2 + FNMSUB f6, f18, f4, f6 + FNMSUB f3, f19, f0, f3 + FNMSUB f7, f19, f4, f7 + + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f5, f17, f5 + + FNMSUB f2, f18, f1, f2 + FNMSUB f6, f18, f5, f6 + + FNMSUB f3, f19, f1, f3 + FNMSUB f7, f19, f5, f7 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMUL f2, f18, f2 + FMUL f6, f18, f6 + + FNMSUB f3, f19, f2, f3 + FNMSUB f7, f19, f6, f7 + + LFD f19, 15 * SIZE(AO) + + FMUL f3, f19, f3 + FMUL f7, f19, f7 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 + + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f6, f17, f2, f6 + FNMSUB f7, f17, f3, f7 + + FMUL f4, f18, f4 + FMUL f5, f18, f5 + FMUL f6, f18, f6 + FMUL f7, f18, f7 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FMUL f6, f19, f6 + FMUL f7, f19, f7 + + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + FNMSUB f2, f20, f6, f2 + FNMSUB f3, f20, f7, f3 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, 
f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f1, 2 * SIZE(BO) + STFD f5, 3 * SIZE(BO) + + STFD f2, 4 * SIZE(BO) + STFD f6, 5 * SIZE(BO) + STFD f3, 6 * SIZE(BO) + STFD f7, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f4, 4 * SIZE(AO) + STFD f5, 5 * SIZE(AO) + STFD f6, 6 * SIZE(AO) + STFD f7, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ .L42 + .align 4 + +.L69: +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + lfs f0, FZERO + .align 4 + +.L70: + andi. J, N, 1 + ble .L999 + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO1, LDC +#endif + .align 4 + +.L90: + andi. I, M, 1 + ble .L80 + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, KK, 3 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 3 + mtspr CTR, r0 +#endif + ble .L95 + .align 5 + +.L92: + FMADD f0, f16, f20, f0 + LFD f16, 4 * SIZE(AO) + LFD f20, 4 * SIZE(BO) + FMADD f1, f17, f21, f1 + LFD f17, 5 * SIZE(AO) + LFD f21, 5 * SIZE(BO) + FMADD f2, f18, f22, f2 + LFD f18, 6 * SIZE(AO) + LFD f22, 6 * SIZE(BO) + FMADD f3, f19, f23, f3 + LFD f19, 7 * SIZE(AO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + LFDU f16, 8 * SIZE(AO) + LFDU f20, 8 * SIZE(BO) + FMADD f1, f17, f21, f1 + LFD f17, 1 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + FMADD f2, f18, f22, f2 + LFD f18, 2 * SIZE(AO) + LFD f22, 2 * SIZE(BO) + FMADD f3, f19, f23, f3 + LFD f19, 3 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + bdnz .L92 + .align 4 + +.L95: +#if defined(LT) || defined(RN) + andi. r0, KK, 7 +#else + andi. 
r0, TEMP, 7 +#endif + mtspr CTR, r0 + ble+ .L98 + .align 4 + +.L96: + FMADD f0, f16, f20, f0 + LFDU f16, 1 * SIZE(AO) + LFDU f20, 1 * SIZE(BO) + bdnz .L96 + .align 4 + +.L98: + FADD f0, f1, f0 + FADD f2, f3, f2 + FADD f0, f2, f0 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + FSUB f0, f16, f0 +#else + LFD f16, 0 * SIZE(AO) + FSUB f0, f16, f0 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + FMUL f0, f16, f0 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + FMUL f0, f16, f0 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +.L80: + andi. I, M, 2 + ble .L71 + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L85 + .align 5 + +.L82: + FMADD f0, f16, f20, f0 + LFD f16, 4 * SIZE(AO) + FMADD f1, f17, f20, f1 + LFDU f20, 4 * SIZE(BO) + LFD f17, 5 * SIZE(AO) + FMADD f2, f18, f21, f2 + LFD f18, 6 * SIZE(AO) + FMADD f3, f19, f21, f3 + LFD f21, 1 * SIZE(BO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + LFDU f16, 8 * SIZE(AO) + FMADD f1, f17, f22, f1 + LFD f22, 2 * SIZE(BO) + LFD f17, 1 * SIZE(AO) + FMADD f2, f18, f23, f2 + LFD f18, 2 * SIZE(AO) + FMADD f3, f19, f23, f3 + LFD f23, 3 * SIZE(BO) + LFD f19, 3 * SIZE(AO) + bdnz .L82 + .align 4 + +.L85: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ .L88 + .align 4 + +.L86: + FMADD f0, f16, f20, f0 + LFDU f16, 2 * SIZE(AO) + FMADD f1, f17, f20, f1 + LFDU f20, 1 * SIZE(BO) + LFD f17, 1 * SIZE(AO) + bdnz .L86 + .align 4 + +.L88: + FADD f0, f2, f0 + FADD f1, f3, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f20, 1 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + + LFD f17, 3 * SIZE(AO) + FMUL f1, f17, f1 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +.L71: + srawi. I, M, 2 + ble .L999 + .align 4 + +.L72: +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L75 + .align 5 + +.L73: + FMADD f0, f16, f20, f0 + LFD f16, 4 * SIZE(AO) + FMADD f1, f17, f20, f1 + LFD f17, 5 * SIZE(AO) + FMADD f2, f18, f20, f2 + LFD f18, 6 * SIZE(AO) + FMADD f3, f19, f20, f3 + LFD f19, 7 * SIZE(AO) + LFDU f20, 4 * SIZE(BO) + + FMADD f0, f16, f21, f0 + LFD f16, 8 * SIZE(AO) + FMADD f1, f17, f21, f1 + LFD f17, 9 * SIZE(AO) + FMADD f2, f18, f21, f2 + LFD f18, 10 * SIZE(AO) + FMADD f3, f19, f21, f3 + LFD f19, 11 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + + FMADD f0, f16, f22, f0 + LFD f16, 12 * SIZE(AO) + FMADD f1, f17, f22, f1 + LFD f17, 13 * SIZE(AO) + FMADD f2, f18, f22, f2 + LFD f18, 14 * SIZE(AO) + FMADD f3, f19, f22, f3 + LFD f19, 15 * SIZE(AO) + LFD f22, 2 * SIZE(BO) + + FMADD f0, f16, f23, f0 + LFDU f16, 16 * SIZE(AO) + FMADD f1, f17, f23, f1 + LFD f17, 1 * SIZE(AO) + FMADD f2, f18, f23, f2 + LFD f18, 2 * SIZE(AO) + FMADD f3, f19, f23, f3 + LFD f19, 3 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + bdnz .L73 + .align 4 + +.L75: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ .L78 + .align 4 + +.L76: + FMADD f0, f16, f20, f0 + LFDU f16, 4 * SIZE(AO) + FMADD f1, f17, f20, f1 + LFD f17, 1 * SIZE(AO) + FMADD f2, f18, f20, f2 + LFD f18, 2 * SIZE(AO) + FMADD f3, f19, f20, f3 + LFDU f20, 1 * SIZE(BO) + LFD f19, 3 * SIZE(AO) + bdnz .L76 + .align 4 + +.L78: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f20, 1 * SIZE(BO) + LFD f24, 2 * SIZE(BO) + LFD f28, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 + FSUB f2, f24, f2 + FSUB f3, f28, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FNMSUB f2, f17, f3, f2 + FNMSUB f1, f18, f3, f1 + FNMSUB f0, f19, f3, f0 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f2, f16, f2 + FNMSUB f1, f17, f2, f1 + FNMSUB f0, f18, f2, f0 + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f19, f0, f3 + + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMUL f1, f17, f1 + FNMSUB f2, f18, f1, f2 + FNMSUB f3, f19, f1, f3 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMUL f2, f18, f2 + FNMSUB f3, f19, f2, f3 + + LFD f19, 15 * SIZE(AO) + + FMUL f3, f19, f3 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD 
f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ .L72 + .align 4 + +.L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE diff --git a/kernel/power/trsm_kernel_ppc440_LT.S b/kernel/power/trsm_kernel_ppc440_LT.S new file mode 100644 index 0000000000..eb0d4e413a --- /dev/null +++ b/kernel/power/trsm_kernel_ppc440_LT.S @@ -0,0 +1,3477 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA 296(SP) +#define FZERO 304(SP) +#else +#define STACKSIZE 240 +#define ALPHA 224(SP) +#define FZERO 232(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#define OFFSET r6 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#define AORIG r18 +#define TEMP r19 +#define KK r20 +#define I r21 +#define J r22 +#define AO r23 +#define BO r24 +#define CO1 r25 +#define CO2 r26 +#define CO3 r27 +#define CO4 r28 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) +#endif + + stw r0, FZERO + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif + + slwi LDC, LDC, BASE_SHIFT + +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 112 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 60 + STACKSIZE(SP) +#else + lwz OFFSET, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#define A1 f16 +#define A2 f17 +#define A3 f18 +#define A4 f19 +#define A5 f20 +#define A6 f21 +#define B1 f22 +#define B2 f23 +#define B3 f24 +#define B4 f25 +#define B5 f26 +#define B6 f27 +#define B7 f28 +#define B8 f29 +#define B9 f30 +#define B10 f31 + +#ifdef LN + mullw r0, M, K + slwi r0, r0, BASE_SHIFT + add A, A, r0 + + slwi r0, M, BASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, BASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + cmpwi cr0, M, 0 + ble .L999 + cmpwi cr0, N, 0 + ble .L999 + cmpwi cr0, K, 0 + ble .L999 + + lfs f0, FZERO + + srawi. 
J, N, 2 + ble .L40 + .align 4 + +.L10: + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 2 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + srawi. I, M, 2 + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO4, LDC +#endif + ble .L20 + .align 4 + +.L11: +#if defined(LT) || defined(RN) + LFD A1, 0 * SIZE(AO) + LFD A2, 1 * SIZE(AO) + LFD A4, 4 * SIZE(AO) + LFD A5, 8 * SIZE(AO) + + LFD B1, 0 * SIZE(B) + LFD B2, 1 * SIZE(B) + LFD B3, 2 * SIZE(B) + LFD B4, 3 * SIZE(B) + LFD B5, 4 * SIZE(B) + LFD B6, 8 * SIZE(B) + LFD B7, 12 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + + sub TEMP, K, KK + + LFD A1, 0 * SIZE(AO) + LFD A2, 1 * SIZE(AO) + LFD A4, 4 * SIZE(AO) + LFD A5, 8 * SIZE(AO) + + LFD B1, 0 * SIZE(BO) + LFD B2, 1 * SIZE(BO) + LFD B3, 2 * SIZE(BO) + LFD B4, 3 * SIZE(BO) + LFD B5, 4 * SIZE(BO) + LFD B6, 8 * SIZE(BO) + LFD B7, 12 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L15 + .align 4 + +.L12: + FMADD f0, A1, B1, f0 + LFD A3, 2 * SIZE(AO) + FMADD f4, A1, B2, f4 + LFD A6, 12 * SIZE(AO) + FMADD f8, A1, B3, f8 + nop + FMADD f12, A1, B4, f12 + nop + + FMADD f1, A2, B1, f1 + LFD A1, 3 * SIZE(AO) + FMADD f5, A2, B2, f5 + nop + FMADD f9, A2, B3, f9 + nop + FMADD f13, A2, B4, f13 + nop + + FMADD f2, A3, B1, f2 + nop + FMADD f6, A3, B2, f6 + LFD B8, 5 * SIZE(BO) + FMADD f10, A3, B3, f10 + LFD B9, 6 * SIZE(BO) + FMADD f14, A3, B4, f14 + LFD B10, 7 * SIZE(BO) + + FMADD f3, A1, B1, f3 + LFD A2, 5 * SIZE(AO) + FMADD f7, A1, B2, f7 + LFD B1, 16 * SIZE(BO) + FMADD f11, A1, B3, f11 + nop + FMADD f15, A1, B4, f15 + nop + + FMADD f0, A4, B5, f0 + LFD A3, 6 * SIZE(AO) + FMADD f4, A4, B8, f4 + LFD A1, 16 * SIZE(AO) + FMADD f8, A4, B9, f8 + nop + FMADD f12, A4, B10, f12 + nop + + FMADD f1, A2, B5, f1 + LFD A4, 7 * SIZE(AO) + FMADD f5, A2, B8, f5 + nop + FMADD f9, A2, B9, f9 + nop + FMADD f13, A2, B10, f13 + nop + + FMADD f2, A3, B5, f2 + nop + FMADD f6, A3, B8, f6 + LFD B2, 9 * SIZE(BO) + FMADD f10, A3, B9, f10 + LFD B3, 10 * SIZE(BO) + FMADD f14, A3, B10, f14 + LFD B4, 11 * SIZE(BO) + + FMADD f3, A4, B5, f3 + LFD A2, 9 * SIZE(AO) + FMADD f7, A4, B8, f7 + LFD B5, 20 * SIZE(BO) + FMADD f11, A4, B9, f11 + nop + FMADD f15, A4, B10, f15 + nop + + FMADD f0, A5, B6, f0 + LFD A3, 10 * SIZE(AO) + FMADD f4, A5, B2, f4 + LFD A4, 20 * SIZE(AO) + FMADD f8, A5, B3, f8 + nop + FMADD f12, A5, B4, f12 + nop + + FMADD f1, A2, B6, f1 + LFD A5, 11 * SIZE(AO) + FMADD f5, A2, B2, f5 + nop + FMADD f9, A2, B3, f9 + nop + FMADD f13, A2, B4, f13 + nop + + FMADD f2, A3, B6, f2 + nop + FMADD f6, A3, B2, f6 + LFD B8, 13 * SIZE(BO) + FMADD f10, A3, B3, f10 + LFD B9, 14 * SIZE(BO) + FMADD f14, A3, B4, f14 + LFD B10,15 * SIZE(BO) + + FMADD f3, A5, B6, f3 + LFD A2, 13 * SIZE(AO) + FMADD f7, A5, B2, f7 + LFD B6, 24 * SIZE(BO) + FMADD f11, A5, B3, f11 + nop + FMADD f15, A5, B4, f15 + nop + + FMADD f0, A6, B7, f0 + LFD A3, 14 * SIZE(AO) + FMADD f4, A6, B8, f4 + LFD A5, 24 * SIZE(AO) + FMADD f8, A6, B9, f8 + nop + FMADD f12, A6, B10, f12 + nop + + FMADD f1, A2, B7, f1 + 
LFD A6, 15 * SIZE(AO) + FMADD f5, A2, B8, f5 + nop + FMADD f9, A2, B9, f9 + nop + FMADD f13, A2, B10, f13 + nop + + FMADD f2, A3, B7, f2 + addi AO, AO, 16 * SIZE + FMADD f6, A3, B8, f6 + LFD B2, 17 * SIZE(BO) + FMADD f10, A3, B9, f10 + LFD B3, 18 * SIZE(BO) + FMADD f14, A3, B10, f14 + LFD B4, 19 * SIZE(BO) + + FMADD f3, A6, B7, f3 + LFD A2, 1 * SIZE(AO) + FMADD f7, A6, B8, f7 + LFD B7, 28 * SIZE(BO) + FMADD f11, A6, B9, f11 + addi BO, BO, 16 * SIZE + FMADD f15, A6, B10, f15 + bdnz .L12 + .align 4 + +.L15: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ .L18 + .align 4 + +.L16: + FMADD f0, A1, B1, f0 + LFD A3, 2 * SIZE(AO) + FMADD f4, A1, B2, f4 + FMADD f8, A1, B3, f8 + FMADD f12, A1, B4, f12 + LFD A4, 3 * SIZE(AO) + + FMADD f1, A2, B1, f1 + FMADD f5, A2, B2, f5 + FMADD f9, A2, B3, f9 + FMADD f13, A2, B4, f13 + LFDU A1, 4 * SIZE(AO) + + FMADD f2, A3, B1, f2 + FMADD f6, A3, B2, f6 + FMADD f10, A3, B3, f10 + FMADD f14, A3, B4, f14 + LFD A2, 1 * SIZE(AO) + + FMADD f3, A4, B1, f3 + LFDU B1, 4 * SIZE(BO) + FMADD f7, A4, B2, f7 + LFD B2, 1 * SIZE(BO) + FMADD f11, A4, B3, f11 + LFD B3, 2 * SIZE(BO) + FMADD f15, A4, B4, f15 + LFD B4, 3 * SIZE(BO) + bdnz .L16 + .align 4 + +.L18: +#if defined(LN) || defined(RT) + subi r0, KK, 4 + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + LFD f24, 8 * SIZE(BO) + LFD f25, 9 * SIZE(BO) + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 + + FSUB f1, f20, f1 + FSUB f5, f21, f5 + FSUB f9, f22, f9 + FSUB f13, f23, f13 + + FSUB f2, f24, f2 + FSUB f6, f25, f6 + FSUB f10, f26, f10 + FSUB f14, f27, f14 + + FSUB f3, f28, f3 + FSUB f7, f29, f7 + FSUB f11, f30, f11 + FSUB f15, f31, f15 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + LFD f24, 8 * SIZE(AO) + LFD f25, 9 * SIZE(AO) + LFD f26, 10 * SIZE(AO) + LFD f27, 11 * SIZE(AO) + + LFD f28, 12 * SIZE(AO) + LFD f29, 13 * SIZE(AO) + LFD f30, 14 * SIZE(AO) + LFD f31, 15 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f4, f20, f4 + FSUB f5, f21, f5 + FSUB f6, f22, f6 + FSUB f7, f23, f7 + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f10, f26, f10 + FSUB f11, f27, f11 + + FSUB f12, f28, f12 + FSUB f13, f29, f13 + FSUB f14, f30, f14 + FSUB f15, f31, f15 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FMUL f7, f16, f7 + FMUL f11, f16, f11 + FMUL f15, f16, f15 + + FNMSUB f2, f17, f3, f2 + FNMSUB f6, f17, f7, f6 + FNMSUB f10, f17, f11, f10 + FNMSUB f14, f17, f15, f14 + + FNMSUB f1, f18, f3, f1 + FNMSUB f5, f18, f7, f5 + FNMSUB f9, f18, f11, f9 + FNMSUB f13, f18, f15, f13 + + FNMSUB f0, f19, f3, f0 + FNMSUB f4, f19, f7, f4 + FNMSUB f8, f19, f11, f8 + FNMSUB f12, f19, f15, f12 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + FMUL f2, f16, f2 + FMUL f6, f16, f6 + FMUL f10, f16, f10 + FMUL f14, f16, 
f14 + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FNMSUB f1, f17, f2, f1 + FNMSUB f5, f17, f6, f5 + FNMSUB f9, f17, f10, f9 + FNMSUB f13, f17, f14, f13 + + FNMSUB f0, f18, f2, f0 + FNMSUB f4, f18, f6, f4 + FNMSUB f8, f18, f10, f8 + FNMSUB f12, f18, f14, f12 + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FMUL f9, f19, f9 + FMUL f13, f19, f13 + + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FNMSUB f8, f20, f9, f8 + FNMSUB f12, f20, f13, f12 + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 + + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + FNMSUB f9, f17, f8, f9 + FNMSUB f13, f17, f12, f13 + + FNMSUB f2, f18, f0, f2 + FNMSUB f6, f18, f4, f6 + FNMSUB f10, f18, f8, f10 + FNMSUB f14, f18, f12, f14 + + FNMSUB f3, f19, f0, f3 + FNMSUB f7, f19, f4, f7 + FNMSUB f11, f19, f8, f11 + FNMSUB f15, f19, f12, f15 + + LFD f16, 5 * SIZE(AO) + LFD f17, 6 * SIZE(AO) + LFD f18, 7 * SIZE(AO) + LFD f19, 10 * SIZE(AO) + + FMUL f1, f16, f1 + FMUL f5, f16, f5 + FMUL f9, f16, f9 + FMUL f13, f16, f13 + + LFD f20, 11 * SIZE(AO) + LFD f21, 15 * SIZE(AO) + + FNMSUB f2, f17, f1, f2 + FNMSUB f6, f17, f5, f6 + FNMSUB f10, f17, f9, f10 + FNMSUB f14, f17, f13, f14 + + FNMSUB f3, f18, f1, f3 + FNMSUB f7, f18, f5, f7 + FNMSUB f11, f18, f9, f11 + FNMSUB f15, f18, f13, f15 + + FMUL f2, f19, f2 + FMUL f6, f19, f6 + FMUL f10, f19, f10 + FMUL f14, f19, f14 + + FNMSUB f3, f20, f2, f3 + FNMSUB f7, f20, f6, f7 + FNMSUB f11, f20, f10, f11 + FNMSUB f15, f20, f14, f15 + + FMUL f3, f21, f3 + FMUL f7, f21, f7 + FMUL f11, f21, f11 + FMUL f15, f21, f15 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 + + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f6, f17, f2, f6 + FNMSUB f7, f17, f3, f7 + + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f10, f18, f2, f10 + FNMSUB f11, f18, f3, f11 + + FNMSUB f12, f19, f0, f12 + FNMSUB f13, f19, f1, f13 + FNMSUB f14, f19, f2, f14 + FNMSUB f15, f19, f3, f15 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + FMUL f4, f16, f4 + FMUL f5, f16, f5 + FMUL f6, f16, f6 + FMUL f7, f16, f7 + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FNMSUB f8, f17, f4, f8 + FNMSUB f9, f17, f5, f9 + FNMSUB f10, f17, f6, f10 + FNMSUB f11, f17, f7, f11 + + FNMSUB f12, f18, f4, f12 + FNMSUB f13, f18, f5, f13 + FNMSUB f14, f18, f6, f14 + FNMSUB f15, f18, f7, f15 + + FMUL f8, f19, f8 + FMUL f9, f19, f9 + FMUL f10, f19, f10 + FMUL f11, f19, f11 + + FNMSUB f12, f20, f8, f12 + FNMSUB f13, f20, f9, f13 + FNMSUB f14, f20, f10, f14 + FNMSUB f15, f20, f11, f15 + + FMUL f12, f21, f12 + FMUL f13, f21, f13 + FMUL f14, f21, f14 + FMUL f15, f21, f15 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FMUL f13, f16, f13 + FMUL f14, f16, f14 + FMUL f15, f16, f15 + + FNMSUB f8, f17, f12, f8 + FNMSUB f9, f17, f13, f9 + FNMSUB f10, f17, f14, f10 + FNMSUB f11, f17, f15, f11 + + FNMSUB f4, f18, f12, f4 + FNMSUB f5, f18, f13, f5 + FNMSUB f6, f18, f14, f6 + FNMSUB f7, f18, f15, f7 + + FNMSUB f0, f19, f12, f0 + FNMSUB f1, f19, f13, f1 + FNMSUB f2, f19, f14, f2 + FNMSUB f3, f19, f15, f3 + + LFD f16, 10 * 
SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + + FMUL f8, f16, f8 + FMUL f9, f16, f9 + FMUL f10, f16, f10 + FMUL f11, f16, f11 + + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FNMSUB f4, f17, f8, f4 + FNMSUB f5, f17, f9, f5 + FNMSUB f6, f17, f10, f6 + FNMSUB f7, f17, f11, f7 + + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + FNMSUB f2, f18, f10, f2 + FNMSUB f3, f18, f11, f3 + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FMUL f6, f19, f6 + FMUL f7, f19, f7 + + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + FNMSUB f2, f20, f6, f2 + FNMSUB f3, f20, f7, f3 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE + subi CO3, CO3, 4 * SIZE + subi CO4, CO4, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) + + STFD f1, 4 * SIZE(BO) + STFD f5, 5 * SIZE(BO) + STFD f9, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) + + STFD f2, 8 * SIZE(BO) + STFD f6, 9 * SIZE(BO) + STFD f10, 10 * SIZE(BO) + STFD f14, 11 * SIZE(BO) + + STFD f3, 12 * SIZE(BO) + STFD f7, 13 * SIZE(BO) + STFD f11, 14 * SIZE(BO) + STFD f15, 15 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f4, 4 * SIZE(AO) + STFD f5, 5 * SIZE(AO) + STFD f6, 6 * SIZE(AO) + STFD f7, 7 * SIZE(AO) + + STFD f8, 8 * SIZE(AO) + STFD f9, 9 * SIZE(AO) + STFD f10, 10 * SIZE(AO) + STFD f11, 11 * SIZE(AO) + + STFD f12, 12 * SIZE(AO) + STFD f13, 13 * SIZE(AO) + STFD f14, 14 * SIZE(AO) + STFD f15, 15 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f10, 2 * SIZE(CO3) + STFD f11, 3 * SIZE(CO3) + + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + STFD f14, 2 * SIZE(CO4) + STFD f15, 3 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + addi CO3, CO3, 4 * SIZE + addi CO4, CO4, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ .L11 + .align 4 + +.L20: + andi. I, M, 2 + ble .L30 + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. 
r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L25 + .align 5 + +.L22: + FMADD f0, f16, f20, f0 + nop + FMADD f1, f17, f20, f1 + LFD f20, 8 * SIZE(BO) + FMADD f4, f16, f21, f4 + nop + FMADD f5, f17, f21, f5 + LFD f21, 9 * SIZE(BO) + + FMADD f8, f16, f22, f8 + nop + FMADD f9, f17, f22, f9 + LFD f22, 10 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFD f16, 4 * SIZE(AO) + FMADD f13, f17, f23, f13 + LFD f23, 11 * SIZE(BO) + + FMADD f2, f18, f24, f2 + LFD f17, 5 * SIZE(AO) + FMADD f3, f19, f24, f3 + LFD f24, 12 * SIZE(BO) + FMADD f6, f18, f25, f6 + nop + FMADD f7, f19, f25, f7 + LFD f25, 13 * SIZE(BO) + + FMADD f10, f18, f26, f10 + nop + FMADD f11, f19, f26, f11 + LFD f26, 14 * SIZE(BO) + FMADD f14, f18, f27, f14 + LFD f18, 6 * SIZE(AO) + FMADD f15, f19, f27, f15 + LFD f27, 15 * SIZE(BO) + + FMADD f0, f16, f20, f0 + LFD f19, 7 * SIZE(AO) + FMADD f1, f17, f20, f1 + LFDU f20, 16 * SIZE(BO) + FMADD f4, f16, f21, f4 + nop + FMADD f5, f17, f21, f5 + LFD f21, 1 * SIZE(BO) + + FMADD f8, f16, f22, f8 + nop + FMADD f9, f17, f22, f9 + LFD f22, 2 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFDU f16, 8 * SIZE(AO) + FMADD f13, f17, f23, f13 + LFD f23, 3 * SIZE(BO) + + FMADD f2, f18, f24, f2 + LFD f17, 1 * SIZE(AO) + FMADD f3, f19, f24, f3 + LFD f24, 4 * SIZE(BO) + FMADD f6, f18, f25, f6 + nop + FMADD f7, f19, f25, f7 + LFD f25, 5 * SIZE(BO) + + FMADD f10, f18, f26, f10 + nop + FMADD f11, f19, f26, f11 + LFD f26, 6 * SIZE(BO) + FMADD f14, f18, f27, f14 + LFD f18, 2 * SIZE(AO) + FMADD f15, f19, f27, f15 + LFD f19, 3 * SIZE(AO) + LFD f27, 7 * SIZE(BO) + bdnz .L22 + + fadd f0, f2, f0 + fadd f1, f3, f1 + fadd f4, f6, f4 + fadd f5, f7, f5 + fadd f8, f10, f8 + fadd f9, f11, f9 + fadd f12, f14, f12 + fadd f13, f15, f13 + .align 4 + +.L25: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ .L28 + .align 4 + +.L26: + FMADD f0, f16, f20, f0 + nop + FMADD f1, f17, f20, f1 + LFDU f20, 4 * SIZE(BO) + FMADD f4, f16, f21, f4 + nop + FMADD f5, f17, f21, f5 + LFD f21, 1 * SIZE(BO) + + FMADD f8, f16, f22, f8 + nop + FMADD f9, f17, f22, f9 + LFD f22, 2 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFDU f16, 2 * SIZE(AO) + FMADD f13, f17, f23, f13 + LFD f17, 1 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + bdnz .L26 + .align 4 + +.L28: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 + + FSUB f1, f20, f1 + FSUB f5, f21, f5 + FSUB f9, f22, f9 + FSUB f13, f23, f13 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f28, 6 * SIZE(AO) + LFD f29, 7 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f4, f20, f4 + FSUB f5, f21, f5 + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f12, f28, f12 + FSUB f13, f29, f13 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FMUL f9, f19, f9 + FMUL f13, f19, f13 + + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FNMSUB f8, f20, f9, f8 + FNMSUB f12, f20, f13, f12 + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 + + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + FNMSUB f9, f17, f8, f9 + FNMSUB f13, f17, f12, f13 + + LFD f17, 3 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f5, f17, f5 + FMUL f9, f17, f9 + FMUL f13, f17, f13 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f12, f19, f0, f12 + FNMSUB f13, f19, f1, f13 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FMUL f4, f16, f4 + FMUL f5, f16, f5 + FNMSUB f8, f17, f4, f8 + FNMSUB f9, f17, f5, f9 + FNMSUB f12, f18, f4, f12 + FNMSUB f13, f18, f5, f13 + + FMUL f8, f19, f8 + FMUL f9, f19, f9 + FNMSUB f12, f20, f8, f12 + FNMSUB f13, f20, f9, f13 + FMUL f12, f21, f12 + FMUL f13, f21, f13 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FMUL f13, f16, f13 + FNMSUB f8, f17, f12, f8 + FNMSUB f9, f17, f13, f9 + FNMSUB f4, f18, f12, f4 + FNMSUB f5, f18, f13, f5 + FNMSUB f0, f19, f12, f0 + FNMSUB f1, f19, f13, f1 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f8, f16, f8 + FMUL f9, f16, f9 + FNMSUB f4, f17, f8, f4 + FNMSUB f5, f17, f9, f5 + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FNMSUB f0, f20, f4, f0 + 
FNMSUB f1, f20, f5, f1 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE + subi CO3, CO3, 2 * SIZE + subi CO4, CO4, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) + + STFD f1, 4 * SIZE(BO) + STFD f5, 5 * SIZE(BO) + STFD f9, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f4, 2 * SIZE(AO) + STFD f5, 3 * SIZE(AO) + + STFD f8, 4 * SIZE(AO) + STFD f9, 5 * SIZE(AO) + STFD f12, 6 * SIZE(AO) + STFD f13, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + addi CO3, CO3, 2 * SIZE + addi CO4, CO4, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +.L30: + andi. I, M, 1 + ble .L39 + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L35 + .align 5 + +.L32: + FMADD f0, f16, f20, f0 + LFD f20, 8 * SIZE(BO) + FMADD f4, f16, f21, f4 + LFD f21, 9 * SIZE(BO) + FMADD f8, f16, f22, f8 + LFD f22, 10 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFD f23, 11 * SIZE(BO) + LFDU f16, 4 * SIZE(AO) + + FMADD f1, f17, f24, f1 + LFD f24, 12 * SIZE(BO) + FMADD f5, f17, f25, f5 + LFD f25, 13 * SIZE(BO) + FMADD f9, f17, f26, f9 + LFD f26, 14 * SIZE(BO) + FMADD f13, f17, f27, f13 + LFD f27, 15 * SIZE(BO) + LFD f17, 1 * SIZE(AO) + + FMADD f0, f18, f20, f0 + LFDU f20, 16 * SIZE(BO) + FMADD f4, f18, f21, f4 + LFD f21, 1 * SIZE(BO) + FMADD f8, f18, f22, f8 + LFD f22, 2 * SIZE(BO) + FMADD f12, f18, f23, f12 + LFD f23, 3 * SIZE(BO) + LFD f18, 2 * SIZE(AO) + + FMADD f1, f19, f24, f1 + LFD f24, 4 * SIZE(BO) + FMADD f5, f19, f25, f5 + LFD f25, 5 * SIZE(BO) + FMADD f9, f19, f26, f9 + LFD f26, 6 * SIZE(BO) + FMADD f13, f19, f27, f13 + LFD f27, 7 * SIZE(BO) + LFD f19, 3 * SIZE(AO) + bdnz .L32 + + fadd f0, f1, f0 + fadd f4, f5, f4 + fadd f8, f9, f8 + fadd f12, f13, f12 + .align 4 + +.L35: +#if defined(LT) || defined(RN) + andi. 
r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ .L38 + .align 4 + +.L36: + FMADD f0, f16, f20, f0 + LFDU f20, 4 * SIZE(BO) + FMADD f4, f16, f21, f4 + LFD f21, 1 * SIZE(BO) + FMADD f8, f16, f22, f8 + LFD f22, 2 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFDU f16, 1 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + bdnz .L36 + .align 4 + +.L38: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 +#else + LFD f16, 0 * SIZE(AO) + LFD f20, 1 * SIZE(AO) + LFD f24, 2 * SIZE(AO) + LFD f28, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f4, f20, f4 + FSUB f8, f24, f8 + FSUB f12, f28, f12 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FNMSUB f4, f17, f0, f4 + FNMSUB f8, f18, f0, f8 + FNMSUB f12, f19, f0, f12 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FMUL f4, f16, f4 + FNMSUB f8, f17, f4, f8 + FNMSUB f12, f18, f4, f12 + FMUL f8, f19, f8 + FNMSUB f12, f20, f8, f12 + FMUL f12, f21, f12 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FNMSUB f8, f17, f12, f8 + FNMSUB f4, f18, f12, f4 + FNMSUB f0, f19, f12, f0 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + + FMUL f8, f16, f8 + + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FNMSUB f4, f17, f8, f4 + FNMSUB f0, f18, f8, f0 + + FMUL f4, f19, f4 + FNMSUB f0, f20, f4, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE + subi CO3, CO3, 1 * SIZE + subi CO4, CO4, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f4, 1 * SIZE(AO) + STFD f8, 2 * SIZE(AO) + STFD f12, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f8, 0 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f12, f0 + fmr f13, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE + addi CO2, CO2, 1 * SIZE + addi CO3, CO3, 1 * SIZE + addi CO4, CO4, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +.L39: +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 4 +#endif + +#ifdef RT + subi KK, KK, 4 +#endif + + addic. J, J, -1 + lfs f0, FZERO + bgt .L10 + .align 4 + +.L40: + andi. 
J, N, 2 + ble .L70 + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. I, M, 2 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO2, LDC +#endif + ble .L50 + .align 4 + +.L41: +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L45 + .align 5 + +.L42: + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + LFD f20, 4 * SIZE(BO) + + FMADD f4, f16, f21, f4 + LFD f16, 4 * SIZE(AO) + FMADD f5, f17, f21, f5 + LFD f17, 5 * SIZE(AO) + FMADD f6, f18, f21, f6 + LFD f18, 6 * SIZE(AO) + FMADD f7, f19, f21, f7 + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + LFD f21, 5 * SIZE(BO) + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + LFD f22, 6 * SIZE(BO) + + FMADD f4, f16, f23, f4 + LFD f16, 8 * SIZE(AO) + FMADD f5, f17, f23, f5 + LFD f17, 9 * SIZE(AO) + FMADD f6, f18, f23, f6 + LFD f18, 10 * SIZE(AO) + FMADD f7, f19, f23, f7 + LFD f19, 11 * SIZE(AO) + + FMADD f0, f16, f20, f0 + LFD f23, 7 * SIZE(BO) + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + LFDU f20, 8 * SIZE(BO) + + FMADD f4, f16, f21, f4 + LFD f16, 12 * SIZE(AO) + FMADD f5, f17, f21, f5 + LFD f17, 13 * SIZE(AO) + FMADD f6, f18, f21, f6 + LFD f18, 14 * SIZE(AO) + FMADD f7, f19, f21, f7 + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f22, f0 + LFD f21, 1 * SIZE(BO) + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + LFD f22, 2 * SIZE(BO) + + FMADD f4, f16, f23, f4 + LFDU f16, 16 * SIZE(AO) + FMADD f5, f17, f23, f5 + LFD f17, 1 * SIZE(AO) + FMADD f6, f18, f23, f6 + LFD f18, 2 * SIZE(AO) + FMADD f7, f19, f23, f7 + LFD f19, 3 * SIZE(AO) + + LFD f23, 3 * SIZE(BO) + bdnz .L42 + .align 4 + +.L45: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ .L48 + .align 4 + +.L46: + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + LFDU f20, 2 * SIZE(BO) + + FMADD f4, f16, f21, f4 + LFDU f16, 4 * SIZE(AO) + FMADD f5, f17, f21, f5 + LFD f17, 1 * SIZE(AO) + FMADD f6, f18, f21, f6 + LFD f18, 2 * SIZE(AO) + FMADD f7, f19, f21, f7 + LFD f19, 3 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + bdnz .L46 + .align 4 + +.L48: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f28, 6 * SIZE(BO) + LFD f29, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f1, f20, f1 + FSUB f5, f21, f5 + + FSUB f2, f24, f2 + FSUB f6, f25, f6 + FSUB f3, f28, f3 + FSUB f7, f29, f7 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f4, f20, f4 + FSUB f5, f21, f5 + FSUB f6, f22, f6 + FSUB f7, f23, f7 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FMUL f7, f16, f7 + FNMSUB f2, f17, f3, f2 + FNMSUB f6, f17, f7, f6 + FNMSUB f1, f18, f3, f1 + FNMSUB f5, f18, f7, f5 + FNMSUB f0, f19, f3, f0 + FNMSUB f4, f19, f7, f4 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f2, f16, f2 + FMUL f6, f16, f6 + FNMSUB f1, f17, f2, f1 + FNMSUB f5, f17, f6, f5 + FNMSUB f0, f18, f2, f0 + FNMSUB f4, f18, f6, f4 + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FMUL f0, f21, f0 + FMUL f4, f21, f4 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + + FNMSUB f2, f18, f0, f2 + FNMSUB f6, f18, f4, f6 + FNMSUB f3, f19, f0, f3 + FNMSUB f7, f19, f4, f7 + + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f5, f17, f5 + + FNMSUB f2, f18, f1, f2 + FNMSUB f6, f18, f5, f6 + + FNMSUB f3, f19, f1, f3 + FNMSUB f7, f19, f5, f7 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMUL f2, f18, f2 + FMUL f6, f18, f6 + + FNMSUB f3, f19, f2, f3 + FNMSUB f7, f19, f6, f7 + + LFD f19, 15 * SIZE(AO) + + FMUL f3, f19, f3 + FMUL f7, f19, f7 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 + + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f6, f17, f2, f6 + FNMSUB f7, f17, f3, f7 + + FMUL f4, f18, f4 + FMUL f5, f18, f5 + FMUL f6, f18, f6 + FMUL f7, f18, f7 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FMUL f6, f19, f6 + FMUL f7, f19, f7 + + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + FNMSUB f2, f20, f6, f2 + FNMSUB f3, f20, f7, f3 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, 
f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f1, 2 * SIZE(BO) + STFD f5, 3 * SIZE(BO) + + STFD f2, 4 * SIZE(BO) + STFD f6, 5 * SIZE(BO) + STFD f3, 6 * SIZE(BO) + STFD f7, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f4, 4 * SIZE(AO) + STFD f5, 5 * SIZE(AO) + STFD f6, 6 * SIZE(AO) + STFD f7, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ .L41 + .align 4 + +.L50: + andi. I, M, 2 + ble .L60 + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L55 + .align 5 + +.L52: + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + LFDU f20, 8 * SIZE(BO) + FMADD f2, f16, f21, f2 + LFD f16, 4 * SIZE(AO) + FMADD f3, f17, f21, f3 + LFD f17, 5 * SIZE(AO) + + FMADD f4, f18, f22, f4 + LFD f21, 1 * SIZE(BO) + FMADD f5, f19, f22, f5 + LFD f22, 2 * SIZE(BO) + FMADD f6, f18, f23, f6 + LFD f18, 6 * SIZE(AO) + FMADD f7, f19, f23, f7 + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f24, f0 + LFD f23, 3 * SIZE(BO) + FMADD f1, f17, f24, f1 + LFD f24, 4 * SIZE(BO) + FMADD f2, f16, f25, f2 + LFDU f16, 8 * SIZE(AO) + FMADD f3, f17, f25, f3 + LFD f17, 1 * SIZE(AO) + + FMADD f4, f18, f26, f4 + LFD f25, 5 * SIZE(BO) + FMADD f5, f19, f26, f5 + LFD f26, 6 * SIZE(BO) + FMADD f6, f18, f27, f6 + LFD f18, 2 * SIZE(AO) + FMADD f7, f19, f27, f7 + LFD f19, 3 * SIZE(AO) + + LFD f27, 7 * SIZE(BO) + bdnz .L52 + .align 4 + +.L55: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ .L58 + .align 4 + +.L56: + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + LFDU f20, 2 * SIZE(BO) + FMADD f2, f16, f21, f2 + LFDU f16, 2 * SIZE(AO) + FMADD f3, f17, f21, f3 + LFD f17, 1 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + bdnz .L56 + .align 4 + +.L58: + FADD f0, f4, f0 + FADD f1, f5, f1 + FADD f2, f6, f2 + FADD f3, f7, f3 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f2, f17, f2 + FSUB f1, f20, f1 + FSUB f3, f21, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f20, f2 + FSUB f3, f21, f3 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FMUL f3, f19, f3 + + FNMSUB f0, f20, f1, f0 + FNMSUB f2, f20, f3, f2 + + FMUL f0, f21, f0 + FMUL f2, f21, f2 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f2, f16, f2 + FNMSUB f1, f17, f0, f1 + FNMSUB f3, f17, f2, f3 + + LFD f17, 3 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f3, f17, f3 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + + FNMSUB f2, f17, f0, f2 + FNMSUB f3, f17, f1, f3 + FMUL f2, f18, f2 + FMUL f3, f18, f3 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f2, f19, f2 + FMUL f3, f19, f3 + FNMSUB f0, f20, f2, f0 + FNMSUB f1, f20, f3, f1 + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f2, 1 * SIZE(BO) + STFD f1, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +.L60: + andi. I, M, 1 + ble .L69 + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. 
r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L65 + .align 5 + +.L62: + FMADD f0, f16, f20, f0 + LFDU f20, 8 * SIZE(BO) + FMADD f1, f16, f21, f1 + LFDU f16, 4 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + FMADD f2, f17, f22, f2 + LFD f22, 2 * SIZE(BO) + FMADD f3, f17, f23, f3 + LFD f17, 1 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + + FMADD f0, f18, f24, f0 + LFD f24, 4 * SIZE(BO) + FMADD f1, f18, f25, f1 + LFD f18, 2 * SIZE(AO) + LFD f25, 5 * SIZE(BO) + FMADD f2, f19, f26, f2 + LFD f26, 6 * SIZE(BO) + FMADD f3, f19, f27, f3 + LFD f19, 3 * SIZE(AO) + LFD f27, 7 * SIZE(BO) + bdnz .L62 + .align 4 + +.L65: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ .L68 + .align 4 + +.L66: + FMADD f0, f16, f20, f0 + LFDU f20, 2 * SIZE(BO) + FMADD f1, f16, f21, f1 + LFDU f16, 1 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + bdnz .L66 + .align 4 + +.L68: + FADD f0, f2, f0 + FADD f1, f3, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f20, 1 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + FMUL f1, f18, f1 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 0 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE + addi CO2, CO2, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +.L69: +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + lfs f0, FZERO + .align 4 + +.L70: + andi. 
J, N, 1 + ble .L999 + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + srawi. I, M, 2 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO1, LDC +#endif + ble .L80 + .align 4 + +.L71: +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L75 + .align 5 + +.L72: + FMADD f0, f16, f20, f0 + LFD f16, 4 * SIZE(AO) + FMADD f1, f17, f20, f1 + LFD f17, 5 * SIZE(AO) + FMADD f2, f18, f20, f2 + LFD f18, 6 * SIZE(AO) + FMADD f3, f19, f20, f3 + LFD f19, 7 * SIZE(AO) + LFDU f20, 4 * SIZE(BO) + + FMADD f0, f16, f21, f0 + LFD f16, 8 * SIZE(AO) + FMADD f1, f17, f21, f1 + LFD f17, 9 * SIZE(AO) + FMADD f2, f18, f21, f2 + LFD f18, 10 * SIZE(AO) + FMADD f3, f19, f21, f3 + LFD f19, 11 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + + FMADD f0, f16, f22, f0 + LFD f16, 12 * SIZE(AO) + FMADD f1, f17, f22, f1 + LFD f17, 13 * SIZE(AO) + FMADD f2, f18, f22, f2 + LFD f18, 14 * SIZE(AO) + FMADD f3, f19, f22, f3 + LFD f19, 15 * SIZE(AO) + LFD f22, 2 * SIZE(BO) + + FMADD f0, f16, f23, f0 + LFDU f16, 16 * SIZE(AO) + FMADD f1, f17, f23, f1 + LFD f17, 1 * SIZE(AO) + FMADD f2, f18, f23, f2 + LFD f18, 2 * SIZE(AO) + FMADD f3, f19, f23, f3 + LFD f19, 3 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + bdnz .L72 + .align 4 + +.L75: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ .L78 + .align 4 + +.L76: + FMADD f0, f16, f20, f0 + LFDU f16, 4 * SIZE(AO) + FMADD f1, f17, f20, f1 + LFD f17, 1 * SIZE(AO) + FMADD f2, f18, f20, f2 + LFD f18, 2 * SIZE(AO) + FMADD f3, f19, f20, f3 + LFDU f20, 1 * SIZE(BO) + LFD f19, 3 * SIZE(AO) + bdnz .L76 + .align 4 + +.L78: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f20, 1 * SIZE(BO) + LFD f24, 2 * SIZE(BO) + LFD f28, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 + FSUB f2, f24, f2 + FSUB f3, f28, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FNMSUB f2, f17, f3, f2 + FNMSUB f1, f18, f3, f1 + FNMSUB f0, f19, f3, f0 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f2, f16, f2 + FNMSUB f1, f17, f2, f1 + FNMSUB f0, f18, f2, f0 + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f19, f0, f3 + + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMUL f1, f17, f1 + FNMSUB f2, f18, f1, f2 + FNMSUB f3, f19, f1, f3 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMUL f2, f18, f2 + FNMSUB f3, f19, f2, f3 + + LFD f19, 15 * SIZE(AO) + + FMUL f3, f19, f3 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ .L71 + .align 4 + +.L80: + andi. I, M, 2 + ble .L90 + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. 
r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L85 + .align 5 + +.L82: + FMADD f0, f16, f20, f0 + LFD f16, 4 * SIZE(AO) + FMADD f1, f17, f20, f1 + LFDU f20, 4 * SIZE(BO) + LFD f17, 5 * SIZE(AO) + FMADD f2, f18, f21, f2 + LFD f18, 6 * SIZE(AO) + FMADD f3, f19, f21, f3 + LFD f21, 1 * SIZE(BO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + LFDU f16, 8 * SIZE(AO) + FMADD f1, f17, f22, f1 + LFD f22, 2 * SIZE(BO) + LFD f17, 1 * SIZE(AO) + FMADD f2, f18, f23, f2 + LFD f18, 2 * SIZE(AO) + FMADD f3, f19, f23, f3 + LFD f23, 3 * SIZE(BO) + LFD f19, 3 * SIZE(AO) + bdnz .L82 + .align 4 + +.L85: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ .L88 + .align 4 + +.L86: + FMADD f0, f16, f20, f0 + LFDU f16, 2 * SIZE(AO) + FMADD f1, f17, f20, f1 + LFDU f20, 1 * SIZE(BO) + LFD f17, 1 * SIZE(AO) + bdnz .L86 + .align 4 + +.L88: + FADD f0, f2, f0 + FADD f1, f3, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f20, 1 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + + LFD f17, 3 * SIZE(AO) + FMUL f1, f17, f1 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +.L90: + andi. I, M, 1 + ble .L999 + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. 
r0, KK, 3 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 3 + mtspr CTR, r0 +#endif + ble .L95 + .align 5 + +.L92: + FMADD f0, f16, f20, f0 + LFD f16, 4 * SIZE(AO) + LFD f20, 4 * SIZE(BO) + FMADD f1, f17, f21, f1 + LFD f17, 5 * SIZE(AO) + LFD f21, 5 * SIZE(BO) + FMADD f2, f18, f22, f2 + LFD f18, 6 * SIZE(AO) + LFD f22, 6 * SIZE(BO) + FMADD f3, f19, f23, f3 + LFD f19, 7 * SIZE(AO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + LFDU f16, 8 * SIZE(AO) + LFDU f20, 8 * SIZE(BO) + FMADD f1, f17, f21, f1 + LFD f17, 1 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + FMADD f2, f18, f22, f2 + LFD f18, 2 * SIZE(AO) + LFD f22, 2 * SIZE(BO) + FMADD f3, f19, f23, f3 + LFD f19, 3 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + bdnz .L92 + .align 4 + +.L95: +#if defined(LT) || defined(RN) + andi. r0, KK, 7 +#else + andi. r0, TEMP, 7 +#endif + mtspr CTR, r0 + ble+ .L98 + .align 4 + +.L96: + FMADD f0, f16, f20, f0 + LFDU f16, 1 * SIZE(AO) + LFDU f20, 1 * SIZE(BO) + bdnz .L96 + .align 4 + +.L98: + FADD f0, f1, f0 + FADD f2, f3, f2 + FADD f0, f2, f0 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + FSUB f0, f16, f0 +#else + LFD f16, 0 * SIZE(AO) + FSUB f0, f16, f0 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + FMUL f0, f16, f0 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + FMUL f0, f16, f0 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +.L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git 
a/kernel/power/trsm_kernel_ppc440_RT.S b/kernel/power/trsm_kernel_ppc440_RT.S new file mode 100644 index 0000000000..54c59c2697 --- /dev/null +++ b/kernel/power/trsm_kernel_ppc440_RT.S @@ -0,0 +1,3496 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA 296(SP) +#define FZERO 304(SP) +#else +#define STACKSIZE 240 +#define ALPHA 224(SP) +#define FZERO 232(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#define OFFSET r6 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#define AORIG r18 +#define TEMP r19 +#define KK r20 +#define I r21 +#define J r22 +#define AO r23 +#define BO r24 +#define CO1 r25 +#define CO2 r26 +#define CO3 r27 +#define CO4 r28 + +#define A1 f16 +#define A2 f17 +#define A3 f18 +#define A4 f19 +#define A5 f20 +#define A6 f21 +#define B1 f22 +#define B2 f23 +#define B3 f24 +#define B4 f25 +#define B5 f26 +#define B6 f27 +#define B7 f28 +#define B8 f29 +#define B9 f30 +#define B10 f31 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) +#endif + + stw r0, FZERO + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif + + slwi LDC, LDC, BASE_SHIFT + +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 112 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 60 + STACKSIZE(SP) +#else + lwz OFFSET, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#ifdef LN + mullw r0, M, K + slwi r0, r0, BASE_SHIFT + add A, A, r0 + + slwi r0, M, BASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, BASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + cmpwi cr0, M, 0 + ble .L999 + cmpwi cr0, N, 0 + ble .L999 + cmpwi cr0, K, 0 + ble .L999 + + lfs f0, FZERO + +.L70: + andi. J, N, 1 + ble .L40 + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + srawi. 
I, M, 2 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO1, LDC +#endif + ble .L80 + .align 4 + +.L71: +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L75 + .align 5 + +.L72: + FMADD f0, f16, f20, f0 + LFD f16, 4 * SIZE(AO) + FMADD f1, f17, f20, f1 + LFD f17, 5 * SIZE(AO) + FMADD f2, f18, f20, f2 + LFD f18, 6 * SIZE(AO) + FMADD f3, f19, f20, f3 + LFD f19, 7 * SIZE(AO) + LFDU f20, 4 * SIZE(BO) + + FMADD f0, f16, f21, f0 + LFD f16, 8 * SIZE(AO) + FMADD f1, f17, f21, f1 + LFD f17, 9 * SIZE(AO) + FMADD f2, f18, f21, f2 + LFD f18, 10 * SIZE(AO) + FMADD f3, f19, f21, f3 + LFD f19, 11 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + + FMADD f0, f16, f22, f0 + LFD f16, 12 * SIZE(AO) + FMADD f1, f17, f22, f1 + LFD f17, 13 * SIZE(AO) + FMADD f2, f18, f22, f2 + LFD f18, 14 * SIZE(AO) + FMADD f3, f19, f22, f3 + LFD f19, 15 * SIZE(AO) + LFD f22, 2 * SIZE(BO) + + FMADD f0, f16, f23, f0 + LFDU f16, 16 * SIZE(AO) + FMADD f1, f17, f23, f1 + LFD f17, 1 * SIZE(AO) + FMADD f2, f18, f23, f2 + LFD f18, 2 * SIZE(AO) + FMADD f3, f19, f23, f3 + LFD f19, 3 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + bdnz .L72 + .align 4 + +.L75: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ .L78 + .align 4 + +.L76: + FMADD f0, f16, f20, f0 + LFDU f16, 4 * SIZE(AO) + FMADD f1, f17, f20, f1 + LFD f17, 1 * SIZE(AO) + FMADD f2, f18, f20, f2 + LFD f18, 2 * SIZE(AO) + FMADD f3, f19, f20, f3 + LFDU f20, 1 * SIZE(BO) + LFD f19, 3 * SIZE(AO) + bdnz .L76 + .align 4 + +.L78: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f20, 1 * SIZE(BO) + LFD f24, 2 * SIZE(BO) + LFD f28, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 + FSUB f2, f24, f2 + FSUB f3, f28, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FNMSUB f2, f17, f3, f2 + FNMSUB f1, f18, f3, f1 + FNMSUB f0, f19, f3, f0 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f2, f16, f2 + FNMSUB f1, f17, f2, f1 + FNMSUB f0, f18, f2, f0 + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f19, f0, f3 + + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMUL f1, f17, f1 + FNMSUB f2, f18, f1, f2 + FNMSUB f3, f19, f1, f3 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMUL f2, f18, f2 + FNMSUB f3, f19, f2, f3 + + LFD f19, 15 * SIZE(AO) + + FMUL f3, f19, f3 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ .L71 + .align 4 + +.L80: + andi. I, M, 2 + ble .L90 + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. 
r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L85 + .align 5 + +.L82: + FMADD f0, f16, f20, f0 + LFD f16, 4 * SIZE(AO) + FMADD f1, f17, f20, f1 + LFDU f20, 4 * SIZE(BO) + LFD f17, 5 * SIZE(AO) + FMADD f2, f18, f21, f2 + LFD f18, 6 * SIZE(AO) + FMADD f3, f19, f21, f3 + LFD f21, 1 * SIZE(BO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + LFDU f16, 8 * SIZE(AO) + FMADD f1, f17, f22, f1 + LFD f22, 2 * SIZE(BO) + LFD f17, 1 * SIZE(AO) + FMADD f2, f18, f23, f2 + LFD f18, 2 * SIZE(AO) + FMADD f3, f19, f23, f3 + LFD f23, 3 * SIZE(BO) + LFD f19, 3 * SIZE(AO) + bdnz .L82 + .align 4 + +.L85: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ .L88 + .align 4 + +.L86: + FMADD f0, f16, f20, f0 + LFDU f16, 2 * SIZE(AO) + FMADD f1, f17, f20, f1 + LFDU f20, 1 * SIZE(BO) + LFD f17, 1 * SIZE(AO) + bdnz .L86 + .align 4 + +.L88: + FADD f0, f2, f0 + FADD f1, f3, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f20, 1 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + + LFD f17, 3 * SIZE(AO) + FMUL f1, f17, f1 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +.L90: + andi. I, M, 1 + ble .L99 + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. 
r0, KK, 3 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 0 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 3 + mtspr CTR, r0 +#endif + ble .L95 + .align 5 + +.L92: + FMADD f0, f16, f20, f0 + LFD f16, 4 * SIZE(AO) + LFD f20, 4 * SIZE(BO) + FMADD f1, f17, f21, f1 + LFD f17, 5 * SIZE(AO) + LFD f21, 5 * SIZE(BO) + FMADD f2, f18, f22, f2 + LFD f18, 6 * SIZE(AO) + LFD f22, 6 * SIZE(BO) + FMADD f3, f19, f23, f3 + LFD f19, 7 * SIZE(AO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + LFDU f16, 8 * SIZE(AO) + LFDU f20, 8 * SIZE(BO) + FMADD f1, f17, f21, f1 + LFD f17, 1 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + FMADD f2, f18, f22, f2 + LFD f18, 2 * SIZE(AO) + LFD f22, 2 * SIZE(BO) + FMADD f3, f19, f23, f3 + LFD f19, 3 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + bdnz .L92 + .align 4 + +.L95: +#if defined(LT) || defined(RN) + andi. r0, KK, 7 +#else + andi. r0, TEMP, 7 +#endif + mtspr CTR, r0 + ble+ .L98 + .align 4 + +.L96: + FMADD f0, f16, f20, f0 + LFDU f16, 1 * SIZE(AO) + LFDU f20, 1 * SIZE(BO) + bdnz .L96 + .align 4 + +.L98: + FADD f0, f1, f0 + FADD f2, f3, f2 + FADD f0, f2, f0 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 0 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + FSUB f0, f16, f0 +#else + LFD f16, 0 * SIZE(AO) + FSUB f0, f16, f0 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + FMUL f0, f21, f0 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + FMUL f0, f16, f0 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + FMUL f0, f16, f0 +#endif + +#ifdef RT + LFD f21, 0 * SIZE(BO) + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + + lfs f0, FZERO + +#ifndef LN + addi CO1, CO1, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 0 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +.L99: +#ifdef LN + slwi r0, K, 0 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 1 +#endif + +#ifdef RT + subi KK, KK, 1 +#endif + .align 4 + +.L40: + andi. J, N, 2 + ble .L09 + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. I, M, 2 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO2, LDC +#endif + ble .L50 + .align 4 + +.L41: +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. 
r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 2 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L45 + .align 5 + +.L42: + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + LFD f20, 4 * SIZE(BO) + + FMADD f4, f16, f21, f4 + LFD f16, 4 * SIZE(AO) + FMADD f5, f17, f21, f5 + LFD f17, 5 * SIZE(AO) + FMADD f6, f18, f21, f6 + LFD f18, 6 * SIZE(AO) + FMADD f7, f19, f21, f7 + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + LFD f21, 5 * SIZE(BO) + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + LFD f22, 6 * SIZE(BO) + + FMADD f4, f16, f23, f4 + LFD f16, 8 * SIZE(AO) + FMADD f5, f17, f23, f5 + LFD f17, 9 * SIZE(AO) + FMADD f6, f18, f23, f6 + LFD f18, 10 * SIZE(AO) + FMADD f7, f19, f23, f7 + LFD f19, 11 * SIZE(AO) + + FMADD f0, f16, f20, f0 + LFD f23, 7 * SIZE(BO) + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + LFDU f20, 8 * SIZE(BO) + + FMADD f4, f16, f21, f4 + LFD f16, 12 * SIZE(AO) + FMADD f5, f17, f21, f5 + LFD f17, 13 * SIZE(AO) + FMADD f6, f18, f21, f6 + LFD f18, 14 * SIZE(AO) + FMADD f7, f19, f21, f7 + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f22, f0 + LFD f21, 1 * SIZE(BO) + FMADD f1, f17, f22, f1 + FMADD f2, f18, f22, f2 + FMADD f3, f19, f22, f3 + LFD f22, 2 * SIZE(BO) + + FMADD f4, f16, f23, f4 + LFDU f16, 16 * SIZE(AO) + FMADD f5, f17, f23, f5 + LFD f17, 1 * SIZE(AO) + FMADD f6, f18, f23, f6 + LFD f18, 2 * SIZE(AO) + FMADD f7, f19, f23, f7 + LFD f19, 3 * SIZE(AO) + + LFD f23, 3 * SIZE(BO) + bdnz .L42 + .align 4 + +.L45: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ .L48 + .align 4 + +.L46: + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + LFDU f20, 2 * SIZE(BO) + + FMADD f4, f16, f21, f4 + LFDU f16, 4 * SIZE(AO) + FMADD f5, f17, f21, f5 + LFD f17, 1 * SIZE(AO) + FMADD f6, f18, f21, f6 + LFD f18, 2 * SIZE(AO) + FMADD f7, f19, f21, f7 + LFD f19, 3 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + bdnz .L46 + .align 4 + +.L48: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 2 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f28, 6 * SIZE(BO) + LFD f29, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f1, f20, f1 + FSUB f5, f21, f5 + + FSUB f2, f24, f2 + FSUB f6, f25, f6 + FSUB f3, f28, f3 + FSUB f7, f29, f7 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f4, f20, f4 + FSUB f5, f21, f5 + FSUB f6, f22, f6 + FSUB f7, f23, f7 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FMUL f7, f16, f7 + FNMSUB f2, f17, f3, f2 + FNMSUB f6, f17, f7, f6 + FNMSUB f1, f18, f3, f1 + FNMSUB f5, f18, f7, f5 + FNMSUB f0, f19, f3, f0 + FNMSUB f4, f19, f7, f4 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f2, f16, f2 + FMUL f6, f16, f6 + FNMSUB f1, f17, f2, f1 + FNMSUB f5, f17, f6, f5 + FNMSUB f0, f18, f2, f0 + FNMSUB f4, f18, f6, f4 + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FMUL f0, f21, f0 + FMUL f4, f21, f4 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + + FNMSUB f2, f18, f0, f2 + FNMSUB f6, f18, f4, f6 + FNMSUB f3, f19, f0, f3 + FNMSUB f7, f19, f4, f7 + + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f5, f17, f5 + + FNMSUB f2, f18, f1, f2 + FNMSUB f6, f18, f5, f6 + + FNMSUB f3, f19, f1, f3 + FNMSUB f7, f19, f5, f7 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMUL f2, f18, f2 + FMUL f6, f18, f6 + + FNMSUB f3, f19, f2, f3 + FNMSUB f7, f19, f6, f7 + + LFD f19, 15 * SIZE(AO) + + FMUL f3, f19, f3 + FMUL f7, f19, f7 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 + + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f6, f17, f2, f6 + FNMSUB f7, f17, f3, f7 + + FMUL f4, f18, f4 + FMUL f5, f18, f5 + FMUL f6, f18, f6 + FMUL f7, f18, f7 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FMUL f6, f19, f6 + FMUL f7, f19, f7 + + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + FNMSUB f2, f20, f6, f2 + FNMSUB f3, f20, f7, f3 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, 
f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f1, 2 * SIZE(BO) + STFD f5, 3 * SIZE(BO) + + STFD f2, 4 * SIZE(BO) + STFD f6, 5 * SIZE(BO) + STFD f3, 6 * SIZE(BO) + STFD f7, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f4, 4 * SIZE(AO) + STFD f5, 5 * SIZE(AO) + STFD f6, 6 * SIZE(AO) + STFD f7, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ .L41 + .align 4 + +.L50: + andi. I, M, 2 + ble .L60 + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L55 + .align 5 + +.L52: + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + LFDU f20, 8 * SIZE(BO) + FMADD f2, f16, f21, f2 + LFD f16, 4 * SIZE(AO) + FMADD f3, f17, f21, f3 + LFD f17, 5 * SIZE(AO) + + FMADD f4, f18, f22, f4 + LFD f21, 1 * SIZE(BO) + FMADD f5, f19, f22, f5 + LFD f22, 2 * SIZE(BO) + FMADD f6, f18, f23, f6 + LFD f18, 6 * SIZE(AO) + FMADD f7, f19, f23, f7 + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f24, f0 + LFD f23, 3 * SIZE(BO) + FMADD f1, f17, f24, f1 + LFD f24, 4 * SIZE(BO) + FMADD f2, f16, f25, f2 + LFDU f16, 8 * SIZE(AO) + FMADD f3, f17, f25, f3 + LFD f17, 1 * SIZE(AO) + + FMADD f4, f18, f26, f4 + LFD f25, 5 * SIZE(BO) + FMADD f5, f19, f26, f5 + LFD f26, 6 * SIZE(BO) + FMADD f6, f18, f27, f6 + LFD f18, 2 * SIZE(AO) + FMADD f7, f19, f27, f7 + LFD f19, 3 * SIZE(AO) + + LFD f27, 7 * SIZE(BO) + bdnz .L52 + .align 4 + +.L55: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ .L58 + .align 4 + +.L56: + FMADD f0, f16, f20, f0 + FMADD f1, f17, f20, f1 + LFDU f20, 2 * SIZE(BO) + FMADD f2, f16, f21, f2 + LFDU f16, 2 * SIZE(AO) + FMADD f3, f17, f21, f3 + LFD f17, 1 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + bdnz .L56 + .align 4 + +.L58: + FADD f0, f4, f0 + FADD f1, f5, f1 + FADD f2, f6, f2 + FADD f3, f7, f3 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f2, f17, f2 + FSUB f1, f20, f1 + FSUB f3, f21, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f20, f2 + FSUB f3, f21, f3 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FMUL f3, f19, f3 + + FNMSUB f0, f20, f1, f0 + FNMSUB f2, f20, f3, f2 + + FMUL f0, f21, f0 + FMUL f2, f21, f2 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f2, f16, f2 + FNMSUB f1, f17, f0, f1 + FNMSUB f3, f17, f2, f3 + + LFD f17, 3 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f3, f17, f3 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + + FNMSUB f2, f17, f0, f2 + FNMSUB f3, f17, f1, f3 + FMUL f2, f18, f2 + FMUL f3, f18, f3 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f2, f19, f2 + FMUL f3, f19, f3 + FNMSUB f0, f20, f2, f0 + FNMSUB f1, f20, f3, f1 + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f2, 1 * SIZE(BO) + STFD f1, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +.L60: + andi. I, M, 1 + ble .L69 + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. 
r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 1 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L65 + .align 5 + +.L62: + FMADD f0, f16, f20, f0 + LFDU f20, 8 * SIZE(BO) + FMADD f1, f16, f21, f1 + LFDU f16, 4 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + FMADD f2, f17, f22, f2 + LFD f22, 2 * SIZE(BO) + FMADD f3, f17, f23, f3 + LFD f17, 1 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + + FMADD f0, f18, f24, f0 + LFD f24, 4 * SIZE(BO) + FMADD f1, f18, f25, f1 + LFD f18, 2 * SIZE(AO) + LFD f25, 5 * SIZE(BO) + FMADD f2, f19, f26, f2 + LFD f26, 6 * SIZE(BO) + FMADD f3, f19, f27, f3 + LFD f19, 3 * SIZE(AO) + LFD f27, 7 * SIZE(BO) + bdnz .L62 + .align 4 + +.L65: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ .L68 + .align 4 + +.L66: + FMADD f0, f16, f20, f0 + LFDU f20, 2 * SIZE(BO) + FMADD f1, f16, f21, f1 + LFDU f16, 1 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + bdnz .L66 + .align 4 + +.L68: + FADD f0, f2, f0 + FADD f1, f3, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 1 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f20, 1 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f20, f1 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FNMSUB f1, f17, f0, f1 + FMUL f1, f18, f1 +#endif + +#ifdef RT + LFD f19, 3 * SIZE(BO) + LFD f20, 2 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f1, f19, f1 + FNMSUB f0, f20, f1, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 0 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE + addi CO2, CO2, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 1 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +.L69: +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + lfs f0, FZERO + .align 4 + +.L09: + srawi. 
J, N, 2 + ble .L999 + .align 4 + +.L10: + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 2 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + srawi. I, M, 2 + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO4, LDC +#endif + ble .L20 + .align 4 + +.L11: +#if defined(LT) || defined(RN) + LFD A1, 0 * SIZE(AO) + LFD A2, 1 * SIZE(AO) + LFD A4, 4 * SIZE(AO) + LFD A5, 8 * SIZE(AO) + + LFD B1, 0 * SIZE(B) + LFD B2, 1 * SIZE(B) + LFD B3, 2 * SIZE(B) + LFD B4, 3 * SIZE(B) + LFD B5, 4 * SIZE(B) + LFD B6, 8 * SIZE(B) + LFD B7, 12 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + + sub TEMP, K, KK + + LFD A1, 0 * SIZE(AO) + LFD A2, 1 * SIZE(AO) + LFD A4, 4 * SIZE(AO) + LFD A5, 8 * SIZE(AO) + + LFD B1, 0 * SIZE(BO) + LFD B2, 1 * SIZE(BO) + LFD B3, 2 * SIZE(BO) + LFD B4, 3 * SIZE(BO) + LFD B5, 4 * SIZE(BO) + LFD B6, 8 * SIZE(BO) + LFD B7, 12 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L15 + .align 4 + +.L12: + FMADD f0, A1, B1, f0 + LFD A3, 2 * SIZE(AO) + FMADD f4, A1, B2, f4 + LFD A6, 12 * SIZE(AO) + FMADD f8, A1, B3, f8 + nop + FMADD f12, A1, B4, f12 + nop + + FMADD f1, A2, B1, f1 + LFD A1, 3 * SIZE(AO) + FMADD f5, A2, B2, f5 + nop + FMADD f9, A2, B3, f9 + nop + FMADD f13, A2, B4, f13 + nop + + FMADD f2, A3, B1, f2 + nop + FMADD f6, A3, B2, f6 + LFD B8, 5 * SIZE(BO) + FMADD f10, A3, B3, f10 + LFD B9, 6 * SIZE(BO) + FMADD f14, A3, B4, f14 + LFD B10, 7 * SIZE(BO) + + FMADD f3, A1, B1, f3 + LFD A2, 5 * SIZE(AO) + FMADD f7, A1, B2, f7 + LFD B1, 16 * SIZE(BO) + FMADD f11, A1, B3, f11 + nop + FMADD f15, A1, B4, f15 + nop + + FMADD f0, A4, B5, f0 + LFD A3, 6 * SIZE(AO) + FMADD f4, A4, B8, f4 + LFD A1, 16 * SIZE(AO) + FMADD f8, A4, B9, f8 + nop + FMADD f12, A4, B10, f12 + nop + + FMADD f1, A2, B5, f1 + LFD A4, 7 * SIZE(AO) + FMADD f5, A2, B8, f5 + nop + FMADD f9, A2, B9, f9 + nop + FMADD f13, A2, B10, f13 + nop + + FMADD f2, A3, B5, f2 + nop + FMADD f6, A3, B8, f6 + LFD B2, 9 * SIZE(BO) + FMADD f10, A3, B9, f10 + LFD B3, 10 * SIZE(BO) + FMADD f14, A3, B10, f14 + LFD B4, 11 * SIZE(BO) + + FMADD f3, A4, B5, f3 + LFD A2, 9 * SIZE(AO) + FMADD f7, A4, B8, f7 + LFD B5, 20 * SIZE(BO) + FMADD f11, A4, B9, f11 + nop + FMADD f15, A4, B10, f15 + nop + + FMADD f0, A5, B6, f0 + LFD A3, 10 * SIZE(AO) + FMADD f4, A5, B2, f4 + LFD A4, 20 * SIZE(AO) + FMADD f8, A5, B3, f8 + nop + FMADD f12, A5, B4, f12 + nop + + FMADD f1, A2, B6, f1 + LFD A5, 11 * SIZE(AO) + FMADD f5, A2, B2, f5 + nop + FMADD f9, A2, B3, f9 + nop + FMADD f13, A2, B4, f13 + nop + + FMADD f2, A3, B6, f2 + nop + FMADD f6, A3, B2, f6 + LFD B8, 13 * SIZE(BO) + FMADD f10, A3, B3, f10 + LFD B9, 14 * SIZE(BO) + FMADD f14, A3, B4, f14 + LFD B10,15 * SIZE(BO) + + FMADD f3, A5, B6, f3 + LFD A2, 13 * SIZE(AO) + FMADD f7, A5, B2, f7 + LFD B6, 24 * SIZE(BO) + FMADD f11, A5, B3, f11 + nop + FMADD f15, A5, B4, f15 + nop + + FMADD f0, A6, B7, f0 + LFD A3, 14 * SIZE(AO) + FMADD f4, A6, B8, f4 + LFD A5, 24 * SIZE(AO) + FMADD f8, A6, B9, f8 + nop + FMADD f12, A6, B10, f12 + nop + + FMADD f1, A2, B7, f1 + 
LFD A6, 15 * SIZE(AO) + FMADD f5, A2, B8, f5 + nop + FMADD f9, A2, B9, f9 + nop + FMADD f13, A2, B10, f13 + nop + + FMADD f2, A3, B7, f2 + addi AO, AO, 16 * SIZE + FMADD f6, A3, B8, f6 + LFD B2, 17 * SIZE(BO) + FMADD f10, A3, B9, f10 + LFD B3, 18 * SIZE(BO) + FMADD f14, A3, B10, f14 + LFD B4, 19 * SIZE(BO) + + FMADD f3, A6, B7, f3 + LFD A2, 1 * SIZE(AO) + FMADD f7, A6, B8, f7 + LFD B7, 28 * SIZE(BO) + FMADD f11, A6, B9, f11 + addi BO, BO, 16 * SIZE + FMADD f15, A6, B10, f15 + bdnz .L12 + .align 4 + +.L15: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ .L18 + .align 4 + +.L16: + FMADD f0, A1, B1, f0 + LFD A3, 2 * SIZE(AO) + FMADD f4, A1, B2, f4 + FMADD f8, A1, B3, f8 + FMADD f12, A1, B4, f12 + LFD A4, 3 * SIZE(AO) + + FMADD f1, A2, B1, f1 + FMADD f5, A2, B2, f5 + FMADD f9, A2, B3, f9 + FMADD f13, A2, B4, f13 + LFDU A1, 4 * SIZE(AO) + + FMADD f2, A3, B1, f2 + FMADD f6, A3, B2, f6 + FMADD f10, A3, B3, f10 + FMADD f14, A3, B4, f14 + LFD A2, 1 * SIZE(AO) + + FMADD f3, A4, B1, f3 + LFDU B1, 4 * SIZE(BO) + FMADD f7, A4, B2, f7 + LFD B2, 1 * SIZE(BO) + FMADD f11, A4, B3, f11 + LFD B3, 2 * SIZE(BO) + FMADD f15, A4, B4, f15 + LFD B4, 3 * SIZE(BO) + bdnz .L16 + .align 4 + +.L18: +#if defined(LN) || defined(RT) + subi r0, KK, 4 + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + LFD f24, 8 * SIZE(BO) + LFD f25, 9 * SIZE(BO) + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 + + FSUB f1, f20, f1 + FSUB f5, f21, f5 + FSUB f9, f22, f9 + FSUB f13, f23, f13 + + FSUB f2, f24, f2 + FSUB f6, f25, f6 + FSUB f10, f26, f10 + FSUB f14, f27, f14 + + FSUB f3, f28, f3 + FSUB f7, f29, f7 + FSUB f11, f30, f11 + FSUB f15, f31, f15 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + LFD f24, 8 * SIZE(AO) + LFD f25, 9 * SIZE(AO) + LFD f26, 10 * SIZE(AO) + LFD f27, 11 * SIZE(AO) + + LFD f28, 12 * SIZE(AO) + LFD f29, 13 * SIZE(AO) + LFD f30, 14 * SIZE(AO) + LFD f31, 15 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f4, f20, f4 + FSUB f5, f21, f5 + FSUB f6, f22, f6 + FSUB f7, f23, f7 + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f10, f26, f10 + FSUB f11, f27, f11 + + FSUB f12, f28, f12 + FSUB f13, f29, f13 + FSUB f14, f30, f14 + FSUB f15, f31, f15 +#endif + +#ifdef LN + LFD f16, 15 * SIZE(AO) + LFD f17, 14 * SIZE(AO) + LFD f18, 13 * SIZE(AO) + LFD f19, 12 * SIZE(AO) + + FMUL f3, f16, f3 + FMUL f7, f16, f7 + FMUL f11, f16, f11 + FMUL f15, f16, f15 + + FNMSUB f2, f17, f3, f2 + FNMSUB f6, f17, f7, f6 + FNMSUB f10, f17, f11, f10 + FNMSUB f14, f17, f15, f14 + + FNMSUB f1, f18, f3, f1 + FNMSUB f5, f18, f7, f5 + FNMSUB f9, f18, f11, f9 + FNMSUB f13, f18, f15, f13 + + FNMSUB f0, f19, f3, f0 + FNMSUB f4, f19, f7, f4 + FNMSUB f8, f19, f11, f8 + FNMSUB f12, f19, f15, f12 + + LFD f16, 10 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 8 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + + FMUL f2, f16, f2 + FMUL f6, f16, f6 + FMUL f10, f16, f10 + FMUL f14, f16, 
f14 + + LFD f20, 4 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FNMSUB f1, f17, f2, f1 + FNMSUB f5, f17, f6, f5 + FNMSUB f9, f17, f10, f9 + FNMSUB f13, f17, f14, f13 + + FNMSUB f0, f18, f2, f0 + FNMSUB f4, f18, f6, f4 + FNMSUB f8, f18, f10, f8 + FNMSUB f12, f18, f14, f12 + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FMUL f9, f19, f9 + FMUL f13, f19, f13 + + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FNMSUB f8, f20, f9, f8 + FNMSUB f12, f20, f13, f12 + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 + + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + FNMSUB f9, f17, f8, f9 + FNMSUB f13, f17, f12, f13 + + FNMSUB f2, f18, f0, f2 + FNMSUB f6, f18, f4, f6 + FNMSUB f10, f18, f8, f10 + FNMSUB f14, f18, f12, f14 + + FNMSUB f3, f19, f0, f3 + FNMSUB f7, f19, f4, f7 + FNMSUB f11, f19, f8, f11 + FNMSUB f15, f19, f12, f15 + + LFD f16, 5 * SIZE(AO) + LFD f17, 6 * SIZE(AO) + LFD f18, 7 * SIZE(AO) + LFD f19, 10 * SIZE(AO) + + FMUL f1, f16, f1 + FMUL f5, f16, f5 + FMUL f9, f16, f9 + FMUL f13, f16, f13 + + LFD f20, 11 * SIZE(AO) + LFD f21, 15 * SIZE(AO) + + FNMSUB f2, f17, f1, f2 + FNMSUB f6, f17, f5, f6 + FNMSUB f10, f17, f9, f10 + FNMSUB f14, f17, f13, f14 + + FNMSUB f3, f18, f1, f3 + FNMSUB f7, f18, f5, f7 + FNMSUB f11, f18, f9, f11 + FNMSUB f15, f18, f13, f15 + + FMUL f2, f19, f2 + FMUL f6, f19, f6 + FMUL f10, f19, f10 + FMUL f14, f19, f14 + + FNMSUB f3, f20, f2, f3 + FNMSUB f7, f20, f6, f7 + FNMSUB f11, f20, f10, f11 + FNMSUB f15, f20, f14, f15 + + FMUL f3, f21, f3 + FMUL f7, f21, f7 + FMUL f11, f21, f11 + FMUL f15, f21, f15 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FMUL f2, f16, f2 + FMUL f3, f16, f3 + + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f6, f17, f2, f6 + FNMSUB f7, f17, f3, f7 + + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f10, f18, f2, f10 + FNMSUB f11, f18, f3, f11 + + FNMSUB f12, f19, f0, f12 + FNMSUB f13, f19, f1, f13 + FNMSUB f14, f19, f2, f14 + FNMSUB f15, f19, f3, f15 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + FMUL f4, f16, f4 + FMUL f5, f16, f5 + FMUL f6, f16, f6 + FMUL f7, f16, f7 + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FNMSUB f8, f17, f4, f8 + FNMSUB f9, f17, f5, f9 + FNMSUB f10, f17, f6, f10 + FNMSUB f11, f17, f7, f11 + + FNMSUB f12, f18, f4, f12 + FNMSUB f13, f18, f5, f13 + FNMSUB f14, f18, f6, f14 + FNMSUB f15, f18, f7, f15 + + FMUL f8, f19, f8 + FMUL f9, f19, f9 + FMUL f10, f19, f10 + FMUL f11, f19, f11 + + FNMSUB f12, f20, f8, f12 + FNMSUB f13, f20, f9, f13 + FNMSUB f14, f20, f10, f14 + FNMSUB f15, f20, f11, f15 + + FMUL f12, f21, f12 + FMUL f13, f21, f13 + FMUL f14, f21, f14 + FMUL f15, f21, f15 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FMUL f13, f16, f13 + FMUL f14, f16, f14 + FMUL f15, f16, f15 + + FNMSUB f8, f17, f12, f8 + FNMSUB f9, f17, f13, f9 + FNMSUB f10, f17, f14, f10 + FNMSUB f11, f17, f15, f11 + + FNMSUB f4, f18, f12, f4 + FNMSUB f5, f18, f13, f5 + FNMSUB f6, f18, f14, f6 + FNMSUB f7, f18, f15, f7 + + FNMSUB f0, f19, f12, f0 + FNMSUB f1, f19, f13, f1 + FNMSUB f2, f19, f14, f2 + FNMSUB f3, f19, f15, f3 + + LFD f16, 10 * 
SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + + FMUL f8, f16, f8 + FMUL f9, f16, f9 + FMUL f10, f16, f10 + FMUL f11, f16, f11 + + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FNMSUB f4, f17, f8, f4 + FNMSUB f5, f17, f9, f5 + FNMSUB f6, f17, f10, f6 + FNMSUB f7, f17, f11, f7 + + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + FNMSUB f2, f18, f10, f2 + FNMSUB f3, f18, f11, f3 + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FMUL f6, f19, f6 + FMUL f7, f19, f7 + + FNMSUB f0, f20, f4, f0 + FNMSUB f1, f20, f5, f1 + FNMSUB f2, f20, f6, f2 + FNMSUB f3, f20, f7, f3 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 + FMUL f2, f21, f2 + FMUL f3, f21, f3 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE + subi CO3, CO3, 4 * SIZE + subi CO4, CO4, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) + + STFD f1, 4 * SIZE(BO) + STFD f5, 5 * SIZE(BO) + STFD f9, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) + + STFD f2, 8 * SIZE(BO) + STFD f6, 9 * SIZE(BO) + STFD f10, 10 * SIZE(BO) + STFD f14, 11 * SIZE(BO) + + STFD f3, 12 * SIZE(BO) + STFD f7, 13 * SIZE(BO) + STFD f11, 14 * SIZE(BO) + STFD f15, 15 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f4, 4 * SIZE(AO) + STFD f5, 5 * SIZE(AO) + STFD f6, 6 * SIZE(AO) + STFD f7, 7 * SIZE(AO) + + STFD f8, 8 * SIZE(AO) + STFD f9, 9 * SIZE(AO) + STFD f10, 10 * SIZE(AO) + STFD f11, 11 * SIZE(AO) + + STFD f12, 12 * SIZE(AO) + STFD f13, 13 * SIZE(AO) + STFD f14, 14 * SIZE(AO) + STFD f15, 15 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f10, 2 * SIZE(CO3) + STFD f11, 3 * SIZE(CO3) + + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + STFD f14, 2 * SIZE(CO4) + STFD f15, 3 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + addi CO3, CO3, 4 * SIZE + addi CO4, CO4, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + + addic. I, I, -1 + bgt+ .L11 + .align 4 + +.L20: + andi. I, M, 2 + ble .L30 + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. 
r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L25 + .align 5 + +.L22: + FMADD f0, f16, f20, f0 + nop + FMADD f1, f17, f20, f1 + LFD f20, 8 * SIZE(BO) + FMADD f4, f16, f21, f4 + nop + FMADD f5, f17, f21, f5 + LFD f21, 9 * SIZE(BO) + + FMADD f8, f16, f22, f8 + nop + FMADD f9, f17, f22, f9 + LFD f22, 10 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFD f16, 4 * SIZE(AO) + FMADD f13, f17, f23, f13 + LFD f23, 11 * SIZE(BO) + + FMADD f2, f18, f24, f2 + LFD f17, 5 * SIZE(AO) + FMADD f3, f19, f24, f3 + LFD f24, 12 * SIZE(BO) + FMADD f6, f18, f25, f6 + nop + FMADD f7, f19, f25, f7 + LFD f25, 13 * SIZE(BO) + + FMADD f10, f18, f26, f10 + nop + FMADD f11, f19, f26, f11 + LFD f26, 14 * SIZE(BO) + FMADD f14, f18, f27, f14 + LFD f18, 6 * SIZE(AO) + FMADD f15, f19, f27, f15 + LFD f27, 15 * SIZE(BO) + + FMADD f0, f16, f20, f0 + LFD f19, 7 * SIZE(AO) + FMADD f1, f17, f20, f1 + LFDU f20, 16 * SIZE(BO) + FMADD f4, f16, f21, f4 + nop + FMADD f5, f17, f21, f5 + LFD f21, 1 * SIZE(BO) + + FMADD f8, f16, f22, f8 + nop + FMADD f9, f17, f22, f9 + LFD f22, 2 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFDU f16, 8 * SIZE(AO) + FMADD f13, f17, f23, f13 + LFD f23, 3 * SIZE(BO) + + FMADD f2, f18, f24, f2 + LFD f17, 1 * SIZE(AO) + FMADD f3, f19, f24, f3 + LFD f24, 4 * SIZE(BO) + FMADD f6, f18, f25, f6 + nop + FMADD f7, f19, f25, f7 + LFD f25, 5 * SIZE(BO) + + FMADD f10, f18, f26, f10 + nop + FMADD f11, f19, f26, f11 + LFD f26, 6 * SIZE(BO) + FMADD f14, f18, f27, f14 + LFD f18, 2 * SIZE(AO) + FMADD f15, f19, f27, f15 + LFD f19, 3 * SIZE(AO) + LFD f27, 7 * SIZE(BO) + bdnz .L22 + + fadd f0, f2, f0 + fadd f1, f3, f1 + fadd f4, f6, f4 + fadd f5, f7, f5 + fadd f8, f10, f8 + fadd f9, f11, f9 + fadd f12, f14, f12 + fadd f13, f15, f13 + .align 4 + +.L25: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ .L28 + .align 4 + +.L26: + FMADD f0, f16, f20, f0 + nop + FMADD f1, f17, f20, f1 + LFDU f20, 4 * SIZE(BO) + FMADD f4, f16, f21, f4 + nop + FMADD f5, f17, f21, f5 + LFD f21, 1 * SIZE(BO) + + FMADD f8, f16, f22, f8 + nop + FMADD f9, f17, f22, f9 + LFD f22, 2 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFDU f16, 2 * SIZE(AO) + FMADD f13, f17, f23, f13 + LFD f17, 1 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + bdnz .L26 + .align 4 + +.L28: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 1 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 + + FSUB f1, f20, f1 + FSUB f5, f21, f5 + FSUB f9, f22, f9 + FSUB f13, f23, f13 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f28, 6 * SIZE(AO) + LFD f29, 7 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f4, f20, f4 + FSUB f5, f21, f5 + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f12, f28, f12 + FSUB f13, f29, f13 +#endif + +#ifdef LN + LFD f19, 3 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 0 * SIZE(AO) + + FMUL f1, f19, f1 + FMUL f5, f19, f5 + FMUL f9, f19, f9 + FMUL f13, f19, f13 + + FNMSUB f0, f20, f1, f0 + FNMSUB f4, f20, f5, f4 + FNMSUB f8, f20, f9, f8 + FNMSUB f12, f20, f13, f12 + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 + + FNMSUB f1, f17, f0, f1 + FNMSUB f5, f17, f4, f5 + FNMSUB f9, f17, f8, f9 + FNMSUB f13, f17, f12, f13 + + LFD f17, 3 * SIZE(AO) + + FMUL f1, f17, f1 + FMUL f5, f17, f5 + FMUL f9, f17, f9 + FMUL f13, f17, f13 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FMUL f1, f16, f1 + FNMSUB f4, f17, f0, f4 + FNMSUB f5, f17, f1, f5 + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f12, f19, f0, f12 + FNMSUB f13, f19, f1, f13 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FMUL f4, f16, f4 + FMUL f5, f16, f5 + FNMSUB f8, f17, f4, f8 + FNMSUB f9, f17, f5, f9 + FNMSUB f12, f18, f4, f12 + FNMSUB f13, f18, f5, f13 + + FMUL f8, f19, f8 + FMUL f9, f19, f9 + FNMSUB f12, f20, f8, f12 + FNMSUB f13, f20, f9, f13 + FMUL f12, f21, f12 + FMUL f13, f21, f13 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FMUL f13, f16, f13 + FNMSUB f8, f17, f12, f8 + FNMSUB f9, f17, f13, f9 + FNMSUB f4, f18, f12, f4 + FNMSUB f5, f18, f13, f5 + FNMSUB f0, f19, f12, f0 + FNMSUB f1, f19, f13, f1 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FMUL f8, f16, f8 + FMUL f9, f16, f9 + FNMSUB f4, f17, f8, f4 + FNMSUB f5, f17, f9, f5 + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + + FMUL f4, f19, f4 + FMUL f5, f19, f5 + FNMSUB f0, f20, f4, f0 + 
FNMSUB f1, f20, f5, f1 + + FMUL f0, f21, f0 + FMUL f1, f21, f1 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE + subi CO3, CO3, 2 * SIZE + subi CO4, CO4, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) + + STFD f1, 4 * SIZE(BO) + STFD f5, 5 * SIZE(BO) + STFD f9, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f4, 2 * SIZE(AO) + STFD f5, 3 * SIZE(AO) + + STFD f8, 4 * SIZE(AO) + STFD f9, 5 * SIZE(AO) + STFD f12, 6 * SIZE(AO) + STFD f13, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + addi CO3, CO3, 2 * SIZE + addi CO4, CO4, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + .align 4 + +.L30: + andi. I, M, 1 + ble .L39 + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, BASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + BASE_SHIFT + slwi TEMP, KK, 2 + BASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L35 + .align 5 + +.L32: + FMADD f0, f16, f20, f0 + LFD f20, 8 * SIZE(BO) + FMADD f4, f16, f21, f4 + LFD f21, 9 * SIZE(BO) + FMADD f8, f16, f22, f8 + LFD f22, 10 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFD f23, 11 * SIZE(BO) + LFDU f16, 4 * SIZE(AO) + + FMADD f1, f17, f24, f1 + LFD f24, 12 * SIZE(BO) + FMADD f5, f17, f25, f5 + LFD f25, 13 * SIZE(BO) + FMADD f9, f17, f26, f9 + LFD f26, 14 * SIZE(BO) + FMADD f13, f17, f27, f13 + LFD f27, 15 * SIZE(BO) + LFD f17, 1 * SIZE(AO) + + FMADD f0, f18, f20, f0 + LFDU f20, 16 * SIZE(BO) + FMADD f4, f18, f21, f4 + LFD f21, 1 * SIZE(BO) + FMADD f8, f18, f22, f8 + LFD f22, 2 * SIZE(BO) + FMADD f12, f18, f23, f12 + LFD f23, 3 * SIZE(BO) + LFD f18, 2 * SIZE(AO) + + FMADD f1, f19, f24, f1 + LFD f24, 4 * SIZE(BO) + FMADD f5, f19, f25, f5 + LFD f25, 5 * SIZE(BO) + FMADD f9, f19, f26, f9 + LFD f26, 6 * SIZE(BO) + FMADD f13, f19, f27, f13 + LFD f27, 7 * SIZE(BO) + LFD f19, 3 * SIZE(AO) + bdnz .L32 + + fadd f0, f1, f0 + fadd f4, f5, f4 + fadd f8, f9, f8 + fadd f12, f13, f12 + .align 4 + +.L35: +#if defined(LT) || defined(RN) + andi. 
r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble+ .L38 + .align 4 + +.L36: + FMADD f0, f16, f20, f0 + LFDU f20, 4 * SIZE(BO) + FMADD f4, f16, f21, f4 + LFD f21, 1 * SIZE(BO) + FMADD f8, f16, f22, f8 + LFD f22, 2 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFDU f16, 1 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + bdnz .L36 + .align 4 + +.L38: +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 0 + BASE_SHIFT + slwi r0, r0, 2 + BASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f4, f17, f4 + FSUB f8, f18, f8 + FSUB f12, f19, f12 +#else + LFD f16, 0 * SIZE(AO) + LFD f20, 1 * SIZE(AO) + LFD f24, 2 * SIZE(AO) + LFD f28, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f4, f20, f4 + FSUB f8, f24, f8 + FSUB f12, f28, f12 +#endif + +#ifdef LN + LFD f21, 0 * SIZE(AO) + + FMUL f0, f21, f0 + FMUL f4, f21, f4 + FMUL f8, f21, f8 + FMUL f12, f21, f12 +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + + FMUL f0, f16, f0 + FMUL f4, f16, f4 + FMUL f8, f16, f8 + FMUL f12, f16, f12 +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FMUL f0, f16, f0 + FNMSUB f4, f17, f0, f4 + FNMSUB f8, f18, f0, f8 + FNMSUB f12, f19, f0, f12 + + LFD f16, 5 * SIZE(BO) + LFD f17, 6 * SIZE(BO) + LFD f18, 7 * SIZE(BO) + LFD f19, 10 * SIZE(BO) + + LFD f20, 11 * SIZE(BO) + LFD f21, 15 * SIZE(BO) + + FMUL f4, f16, f4 + FNMSUB f8, f17, f4, f8 + FNMSUB f12, f18, f4, f12 + FMUL f8, f19, f8 + FNMSUB f12, f20, f8, f12 + FMUL f12, f21, f12 +#endif + +#ifdef RT + LFD f16, 15 * SIZE(BO) + LFD f17, 14 * SIZE(BO) + LFD f18, 13 * SIZE(BO) + LFD f19, 12 * SIZE(BO) + + FMUL f12, f16, f12 + FNMSUB f8, f17, f12, f8 + FNMSUB f4, f18, f12, f4 + FNMSUB f0, f19, f12, f0 + + LFD f16, 10 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 8 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + + FMUL f8, f16, f8 + + LFD f20, 4 * SIZE(BO) + LFD f21, 0 * SIZE(BO) + + FNMSUB f4, f17, f8, f4 + FNMSUB f0, f18, f8, f0 + + FMUL f4, f19, f4 + FNMSUB f0, f20, f4, f0 + FMUL f0, f21, f0 +#endif + +#ifdef LN + subi CO1, CO1, 1 * SIZE + subi CO2, CO2, 1 * SIZE + subi CO3, CO3, 1 * SIZE + subi CO4, CO4, 1 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f4, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f12, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f4, 1 * SIZE(AO) + STFD f8, 2 * SIZE(AO) + STFD f12, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f8, 0 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + + lfs f0, FZERO + fmr f1, f0 + fmr f4, f0 + fmr f5, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f12, f0 + fmr f13, f0 + +#ifndef LN + addi CO1, CO1, 1 * SIZE + addi CO2, CO2, 1 * SIZE + addi CO3, CO3, 1 * SIZE + addi CO4, CO4, 1 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + BASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + BASE_SHIFT + slwi TEMP, TEMP, 2 + BASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + .align 4 + +.L39: +#ifdef LN + slwi r0, K, 2 + BASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 4 +#endif + +#ifdef RT + subi KK, KK, 4 +#endif + + addic. 
J, J, -1 + lfs f0, FZERO + bgt .L10 + .align 4 + +.L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE diff --git a/kernel/power/zamax.S b/kernel/power/zamax.S new file mode 100644 index 0000000000..6acd96dcc8 --- /dev/null +++ b/kernel/power/zamax.S @@ -0,0 +1,505 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define PREA r8 +#define INCXM1 r9 + +#define FZERO f1 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, ZBASE_SHIFT + subi INCXM1, INCX, SIZE + + li PREA, L1_PREFETCHSIZE + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFD f1, 0 * SIZE(X) + LFD f2, 1 * SIZE(X) + add X, X, INCX + + fabs f1, f1 + fabs f2, f2 + fadd f1, f1, f2 + + fmr f0, f1 + fmr f2, f1 + fmr f3, f1 + + subi N, N, 1 + + cmpwi cr0, INCX, 2 * SIZE + bne- cr0, LL(100) + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- cr0, LL(50) + .align 4 + + LFD f24, 0 * SIZE(X) + LFD f25, 1 * SIZE(X) + LFD f26, 2 * SIZE(X) + LFD f27, 3 * SIZE(X) + LFD f28, 4 * SIZE(X) + LFD f29, 5 * SIZE(X) + LFD f30, 6 * SIZE(X) + LFD f31, 7 * SIZE(X) + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFD f24, 8 * SIZE(X) + LFD f25, 9 * SIZE(X) + LFD f26, 10 * SIZE(X) + LFD f27, 11 * SIZE(X) + + LFD f28, 12 * SIZE(X) + LFD f29, 13 * SIZE(X) + LFD f30, 14 * SIZE(X) + LFD f31, 15 * SIZE(X) + bdz LL(20) + .align 4 + +LL(10): + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFD f24, 16 * SIZE(X) + LFD f25, 17 * SIZE(X) + LFD f26, 18 * SIZE(X) + LFD f27, 19 * SIZE(X) + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFD f28, 20 * SIZE(X) + LFD f29, 21 * SIZE(X) + LFD f30, 22 * SIZE(X) + LFD f31, 23 * SIZE(X) + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFD f24, 24 * SIZE(X) + LFD f25, 25 * SIZE(X) + LFD f26, 26 * SIZE(X) + LFD f27, 27 * SIZE(X) + + fsel f0, f16, f0, f4 + fsel f1, f17, f1, f5 + fsel f2, f18, f2, f6 + fsel f3, f19, f3, f7 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFD f28, 28 * SIZE(X) + LFD f29, 29 * SIZE(X) + LFD f30, 30 * SIZE(X) + LFD f31, 31 * SIZE(X) + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f0, f20 + fsel f1, f17, f1, f21 + fsel f2, f18, f2, f22 + fsel f3, f19, f3, f23 + +#ifndef POWER6 + L1_PREFETCH X, PREA +#endif + addi X, X, 16 * SIZE +#ifdef POWER6 + L1_PREFETCH X, PREA +#endif + + bdnz LL(10) + .align 4 + +LL(20): + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fsel f0, f16, f0, f4 + fsel f1, f17, f1, f5 + fsel f2, f18, f2, f6 + fsel f3, f19, f3, f7 + + 
fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f0, f20 + fsel f1, f17, f1, f21 + fsel f2, f18, f2, f22 + fsel f3, f19, f3, f23 + + addi X, X, 16 * SIZE + .align 4 + +LL(50): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + addi X, X, 2 * SIZE + + fabs f8, f8 + fabs f9, f9 + fadd f8, f8, f9 + fsub f16, f1, f8 + fsel f1, f16, f1, f8 + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): + sub X, X, INCXM1 + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(150) + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + bdz LL(120) + .align 4 + +LL(110): + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + + fsel f0, f16, f0, f4 + fsel f1, f17, f1, f5 + fsel f2, f18, f2, f6 + fsel f3, f19, f3, f7 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f0, f20 + fsel f1, f17, f1, f21 + fsel f2, f18, f2, f22 + fsel f3, f19, f3, f23 + bdnz LL(110) + .align 4 + +LL(120): + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fsel f0, f16, f0, f4 + fsel f1, f17, f1, f5 + fsel f2, f18, f2, f6 + fsel f3, f19, f3, f7 + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f0, f20 + fsel f1, f17, f1, f21 + fsel f2, f18, f2, f22 + fsel f3, f19, f3, f23 + .align 4 + +LL(150): + andi. 
r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDX f8, X, INCXM1 + LFDUX f9, X, INCX + + fabs f8, f8 + fabs f9, f9 + fadd f8, f8, f9 + fsub f16, f1, f8 + fsel f1, f16, f1, f8 + bdnz LL(160) + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + + fsel f0, f8, f0, f1 + fsel f2, f9, f2, f3 + fsub f8, f0, f2 + fsel f1, f8, f0, f2 + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/zamax_cell.S b/kernel/power/zamax_cell.S new file mode 100644 index 0000000000..2af3d24116 --- /dev/null +++ b/kernel/power/zamax_cell.S @@ -0,0 +1,495 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define PREA r8 +#define INCXM1 r9 + +#define FZERO f1 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, ZBASE_SHIFT + subi INCXM1, INCX, SIZE + + li PREA, 10 * 16 * SIZE + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFD f1, 0 * SIZE(X) + LFD f2, 1 * SIZE(X) + add X, X, INCX + + fabs f1, f1 + fabs f2, f2 + fadd f1, f1, f2 + + fmr f0, f1 + fmr f2, f1 + fmr f3, f1 + + subi N, N, 1 + + cmpwi cr0, INCX, 2 * SIZE + bne- cr0, LL(100) + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- cr0, LL(50) + .align 4 + + LFD f24, 0 * SIZE(X) + LFD f25, 1 * SIZE(X) + + fabs f8, f24 + LFD f26, 2 * SIZE(X) + fabs f9, f25 + LFD f27, 3 * SIZE(X) + fabs f10, f26 + LFD f28, 4 * SIZE(X) + fabs f11, f27 + LFD f29, 5 * SIZE(X) + fabs f12, f28 + LFD f30, 6 * SIZE(X) + fabs f13, f29 + LFD f31, 7 * SIZE(X) + fabs f14, f30 + nop + fabs f15, f31 + bdz LL(20) + .align 4 + +LL(10): + fadd f4, f8, f9 + dcbt X, PREA + fadd f5, f10, f11 + nop + fadd f6, f12, f13 + LFD f24, 8 * SIZE(X) + fadd f7, f14, f15 + LFD f25, 9 * SIZE(X) + + fabs f8, f24 + LFD f26, 10 * SIZE(X) + fabs f9, f25 + LFD f27, 11 * SIZE(X) + fabs f10, f26 + fabs f11, f27 + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + LFD f28, 12 * SIZE(X) + fsub f19, f3, f7 + LFD f29, 13 * SIZE(X) + + fabs f12, f28 + LFD f30, 14 * SIZE(X) + fabs f13, f29 + LFD f31, 15 * SIZE(X) + fabs f14, f30 + fabs f15, f31 + + fsel f0, f16, f0, f4 + fsel f1, f17, f1, f5 + fsel f2, f18, f2, f6 + fsel f3, f19, f3, f7 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + LFD f24, 16 * SIZE(X) + fadd f23, f14, f15 + LFD f25, 17 * SIZE(X) + + fabs f8, f24 + LFD f26, 18 * SIZE(X) + fabs f9, f25 + LFD f27, 19 * SIZE(X) + fabs f10, f26 + fabs f11, f27 + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + LFD f28, 20 * SIZE(X) + fsub f19, f3, f23 + LFD f29, 21 * SIZE(X) + + fabs f12, f28 + LFD f30, 22 * SIZE(X) + fabs f13, f29 + LFD f31, 23 * SIZE(X) + fabs f14, f30 + addi X, X, 16 * SIZE + fabs f15, f31 + + fsel f0, f16, f0, f20 + fsel f1, f17, f1, f21 + fsel f2, f18, f2, f22 + fsel f3, f19, f3, f23 + + bdnz LL(10) + .align 4 + +LL(20): + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + LFD f24, 8 * SIZE(X) + fadd f7, f14, f15 + LFD f25, 9 * SIZE(X) + + fabs f8, f24 + LFD f26, 10 * SIZE(X) + fabs f9, f25 + LFD f27, 11 * SIZE(X) + fabs f10, f26 + fabs f11, f27 + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + LFD f28, 12 * SIZE(X) + fsub f19, f3, f7 + LFD f29, 13 * SIZE(X) + + fabs f12, f28 + LFD f30, 14 * SIZE(X) + fabs f13, f29 + LFD f31, 15 * SIZE(X) + fabs f14, f30 + fabs f15, f31 + + fsel f0, f16, f0, f4 + fsel f1, f17, f1, f5 + fsel f2, f18, f2, f6 + fsel f3, f19, f3, f7 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + 
fsel f0, f16, f0, f20 + fsel f1, f17, f1, f21 + fsel f2, f18, f2, f22 + fsel f3, f19, f3, f23 + addi X, X, 16 * SIZE + + .align 4 + +LL(50): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + addi X, X, 2 * SIZE + + fabs f8, f8 + fabs f9, f9 + fadd f8, f8, f9 + fsub f16, f1, f8 + fsel f1, f16, f1, f8 + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): + sub X, X, INCXM1 + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(150) + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + bdz LL(120) + .align 4 + +LL(110): + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + + fsel f0, f16, f0, f4 + fsel f1, f17, f1, f5 + fsel f2, f18, f2, f6 + fsel f3, f19, f3, f7 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f0, f20 + fsel f1, f17, f1, f21 + fsel f2, f18, f2, f22 + fsel f3, f19, f3, f23 + bdnz LL(110) + .align 4 + +LL(120): + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fsel f0, f16, f0, f4 + fsel f1, f17, f1, f5 + fsel f2, f18, f2, f6 + fsel f3, f19, f3, f7 + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f0, f20 + fsel f1, f17, f1, f21 + fsel f2, f18, f2, f22 + fsel f3, f19, f3, f23 + .align 4 + +LL(150): + andi. 
r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDX f8, X, INCXM1 + LFDUX f9, X, INCX + + fabs f8, f8 + fabs f9, f9 + fadd f8, f8, f9 + fsub f16, f1, f8 + fsel f1, f16, f1, f8 + bdnz LL(160) + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + + fsel f0, f8, f0, f1 + fsel f2, f9, f2, f3 + fsub f8, f0, f2 + fsel f1, f8, f0, f2 + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/zamax_hummer.S b/kernel/power/zamax_hummer.S new file mode 100644 index 0000000000..84312395cf --- /dev/null +++ b/kernel/power/zamax_hummer.S @@ -0,0 +1,347 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define INCX2 r6 + +#define C1 f1 +#define C2 f0 +#define C3 f2 +#define C4 f3 + +#define A1 f4 +#define A2 f5 +#define A3 f6 +#define A4 f7 +#define A5 f8 +#define A6 f9 +#define A7 f10 +#define A8 f11 + +#define F1 f12 +#define F2 f13 +#define F3 f14 +#define F4 f15 + +#define T1 f16 +#define T2 f17 +#define T3 f18 +#define T4 f19 + +#define B1 f20 +#define B2 f21 +#define B3 f22 +#define B4 f23 +#define B5 f24 +#define B6 f25 +#define B7 f26 +#define B8 f27 + + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + + stfpdux f16, SP, r10 + stfpdux f17, SP, r10 + stfpdux f18, SP, r10 + stfpdux f19, SP, r10 + + stfpdux f20, SP, r10 + stfpdux f21, SP, r10 + stfpdux f22, SP, r10 + stfpdux f23, SP, r10 + + stfpdux f24, SP, r10 + stfpdux f25, SP, r10 + stfpdux f26, SP, r10 + stfpdux f27, SP, r10 + + li r10, 0 + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + lfpdx C1, SP, r10 # Zero clear + + slwi INCX, INCX, BASE_SHIFT + add INCX2, INCX, INCX + + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, INCX, 0 + ble LL(999) + + LFD A1, 0 * SIZE(X) + LFD A2, 1 * SIZE(X) + add X, X, INCX2 + + fabs A1, A1 + fabs A2, A2 + + addi N, N, -1 + cmpwi cr0, N, 0 + fadd C1, A1, A2 + ble LL(999) + + subi INCX2, INCX2, SIZE + fsmfp C1, C1 + li INCX, SIZE + fpmr C2, C1 + sub X, X, INCX2 + fpmr C3, C1 + srawi. r0, N, 3 + fpmr C4, C1 + mtspr CTR, r0 + beq- LL(105) + + LFDUX A1, X, INCX2 + LFDUX A2, X, INCX + LFDUX A3, X, INCX2 + LFDUX A4, X, INCX + + LFSDUX A1, X, INCX2 + LFSDUX A2, X, INCX + LFSDUX A3, X, INCX2 + LFSDUX A4, X, INCX + + LFDUX A5, X, INCX2 + LFDUX A6, X, INCX + LFDUX A7, X, INCX2 + LFDUX A8, X, INCX + + LFSDUX A5, X, INCX2 + LFSDUX A6, X, INCX + LFSDUX A7, X, INCX2 + LFSDUX A8, X, INCX + bdz LL(103) + .align 4 + +LL(102): + fpabs B1, A1 + LFDUX A1, X, INCX2 + fpabs B2, A2 + LFDUX A2, X, INCX + fpabs B3, A3 + LFDUX A3, X, INCX2 + fpabs B4, A4 + LFDUX A4, X, INCX + + fpabs B5, A5 + LFSDUX A1, X, INCX2 + fpabs B6, A6 + LFSDUX A2, X, INCX + fpabs B7, A7 + LFSDUX A3, X, INCX2 + fpabs B8, A8 + LFSDUX A4, X, INCX + + fpadd T1, B1, B2 + LFDUX A5, X, INCX2 + fpadd T2, B3, B4 + LFDUX A6, X, INCX + fpadd T3, B5, B6 + LFDUX A7, X, INCX2 + fpadd T4, B7, B8 + LFDUX A8, X, INCX + + fpsub F1, C1, T1 + LFSDUX A5, X, INCX2 + fpsub F2, C2, T2 + LFSDUX A6, X, INCX + fpsub F3, C3, T3 + LFSDUX A7, X, INCX2 + fpsub F4, C4, T4 + LFSDUX A8, X, INCX + + fpsel C1, F1, C1, T1 + fpsel C2, F2, C2, T2 + fpsel C3, F3, C3, T3 + fpsel C4, F4, C4, T4 + bdnz LL(102) + .align 4 + +LL(103): + fpabs B1, A1 + fpabs B2, A2 + fpabs B3, A3 + fpabs B4, A4 + + fpabs B5, A5 + fpabs B6, A6 + fpabs B7, A7 + fpabs B8, A8 + + fpadd T1, B1, B2 + fpadd T2, B3, B4 + fpadd T3, B5, B6 + fpadd T4, B7, B8 + + fpsub F1, C1, T1 + fpsub F2, C2, T2 + fpsub F3, C3, T3 + fpsub F4, C4, T4 + + fpsel C1, F1, C1, T1 + fpsel C2, F2, C2, T2 + fpsel C3, F3, C3, T3 + fpsel C4, F4, C4, T4 + .align 4 + +LL(105): + andi. r0, N, 7 + beq LL(998) + + andi. 
r0, N, 4 + beq LL(106) + + LFDUX A1, X, INCX2 + LFDUX A2, X, INCX + LFDUX A3, X, INCX2 + LFDUX A4, X, INCX + + LFSDUX A1, X, INCX2 + LFSDUX A2, X, INCX + LFSDUX A3, X, INCX2 + LFSDUX A4, X, INCX + + fpabs A1, A1 + fpabs A2, A2 + fpabs A3, A3 + fpabs A4, A4 + + fpadd A1, A1, A2 + fpadd A3, A3, A4 + + fpsub F1, C1, A1 + fpsub F2, C2, A3 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A3 + .align 4 + +LL(106): + andi. r0, N, 2 + beq LL(107) + + LFDUX A1, X, INCX2 + LFDUX A2, X, INCX + LFSDUX A1, X, INCX2 + LFSDUX A2, X, INCX + + fpabs A1, A1 + fpabs A2, A2 + + fpadd A1, A1, A2 + + fpsub F1, C1, A1 + fpsel C1, F1, C1, A1 + .align 4 + +LL(107): + andi. r0, N, 1 + beq LL(998) + + LFDUX A1, X, INCX2 + LFDUX A2, X, INCX + + fabs A1, A1 + fabs A2, A2 + + fadd A1, A1, A2 + + fsub F1, C1, A1 + fsel C1, F1, C1, A1 + .align 4 + +LL(998): + fpsub F1, C1, C2 + fpsub F2, C3, C4 + + fpsel C1, F1, C1, C2 + fpsel C3, F2, C3, C4 + + fpsub F1, C1, C3 + fpsel C1, F1, C1, C3 + + fsmtp C2, C1 + + fsub F1, C1, C2 + fsel C1, F1, C1, C2 + .align 4 + +LL(999): + li r10, 16 + + lfpdux f27, SP, r10 + lfpdux f26, SP, r10 + lfpdux f25, SP, r10 + lfpdux f24, SP, r10 + + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + lfpdux f20, SP, r10 + + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + blr + + EPILOGUE diff --git a/kernel/power/zamax_ppc440.S b/kernel/power/zamax_ppc440.S new file mode 100644 index 0000000000..17372bbbab --- /dev/null +++ b/kernel/power/zamax_ppc440.S @@ -0,0 +1,319 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define PREX r8 +#define INC1 r9 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, ZBASE_SHIFT + + sub X, X, INCX + li INC1, SIZE + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFDUX f1, X, INCX + LFDX f2, X, INC1 + + fabs f1, f1 + li PREX, 4 * 8 * SIZE + fabs f2, f2 + fadd f1, f1, f2 + + fmr f0, f1 + fmr f2, f1 + fmr f3, f1 + + subi N, N, 1 + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(150) + + LFDUX f24, X, INCX + LFDX f25, X, INC1 + LFDUX f26, X, INCX + LFDX f27, X, INC1 + LFDUX f28, X, INCX + LFDX f29, X, INC1 + LFDUX f30, X, INCX + LFDX f31, X, INC1 + + fabs f8, f24 + LFDUX f24, X, INCX + fabs f9, f25 + LFDX f25, X, INC1 + fabs f10, f26 + LFDUX f26, X, INCX + fabs f11, f27 + LFDX f27, X, INC1 + fabs f12, f28 + LFDUX f28, X, INCX + fabs f13, f29 + LFDX f29, X, INC1 + fabs f14, f30 + LFDUX f30, X, INCX + fabs f15, f31 + LFDX f31, X, INC1 + bdz LL(120) + .align 4 + +LL(110): + fadd f4, f8, f9 +#ifdef PPCG4 + dcbt X, PREX +#endif + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + LFDUX f24, X, INCX + fabs f9, f25 + LFDX f25, X, INC1 + fabs f10, f26 + LFDUX f26, X, INCX + fabs f11, f27 + LFDX f27, X, INC1 + + fabs f12, f28 +#ifdef PPCG4 + dcbt X, PREX +#endif + fabs f13, f29 + LFDUX f28, X, INCX + fabs f14, f30 + LFDX f29, X, INC1 + fabs f15, f31 + LFDUX f30, X, INCX + + fsub f16, f0, f4 + LFDX f31, X, INC1 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 +#ifdef PPCG4 + dcbt X, PREX +#endif + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fabs f8, f24 + LFDUX f24, X, INCX + fabs f9, f25 + LFDX f25, X, INC1 + fabs f10, f26 + LFDUX f26, X, INCX + fabs f11, f27 + LFDX f27, X, INC1 + + fsel f0, f16, f0, f4 +#ifdef PPCG4 + dcbt X, PREX +#endif + fsel f1, f17, f1, f5 + fsel f2, f18, f2, f6 + fsel f3, f19, f3, f7 + + fabs f12, f28 + LFDUX f28, X, INCX + fabs f13, f29 + LFDX f29, X, INC1 + fabs f14, f30 + LFDUX f30, X, INCX + fabs f15, f31 + LFDX f31, X, INC1 + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f0, f20 + fsel f1, f17, f1, f21 + fsel f2, f18, f2, f22 + fsel f3, f19, f3, f23 + bdnz LL(110) + .align 4 + +LL(120): + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fsel f0, f16, f0, f4 + fsel f1, f17, f1, f5 + fsel f2, f18, f2, f6 + fsel f3, f19, f3, f7 + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f0, f20 + fsel f1, f17, f1, f21 + fsel f2, f18, f2, f22 + fsel f3, f19, f3, f23 + .align 4 + +LL(150): + andi. 
r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + LFDX f9, X, INC1 + + fabs f8, f8 + fabs f9, f9 + fadd f8, f8, f9 + fsub f16, f1, f8 + fsel f1, f16, f1, f8 + bdnz LL(160) + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + + fsel f0, f8, f0, f1 + fsel f2, f9, f2, f3 + fsub f8, f0, f2 + fsel f1, f8, f0, f2 + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/zamin.S b/kernel/power/zamin.S new file mode 100644 index 0000000000..1ab8b6b390 --- /dev/null +++ b/kernel/power/zamin.S @@ -0,0 +1,505 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define PREA r8 +#define INCXM1 r9 + +#define FZERO f1 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, ZBASE_SHIFT + subi INCXM1, INCX, SIZE + + li PREA, L1_PREFETCHSIZE + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFD f1, 0 * SIZE(X) + LFD f2, 1 * SIZE(X) + add X, X, INCX + + fabs f1, f1 + fabs f2, f2 + fadd f1, f1, f2 + + fmr f0, f1 + fmr f2, f1 + fmr f3, f1 + + subi N, N, 1 + + cmpwi cr0, INCX, 2 * SIZE + bne- cr0, LL(100) + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- cr0, LL(50) + .align 4 + + LFD f24, 0 * SIZE(X) + LFD f25, 1 * SIZE(X) + LFD f26, 2 * SIZE(X) + LFD f27, 3 * SIZE(X) + LFD f28, 4 * SIZE(X) + LFD f29, 5 * SIZE(X) + LFD f30, 6 * SIZE(X) + LFD f31, 7 * SIZE(X) + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFD f24, 8 * SIZE(X) + LFD f25, 9 * SIZE(X) + LFD f26, 10 * SIZE(X) + LFD f27, 11 * SIZE(X) + + LFD f28, 12 * SIZE(X) + LFD f29, 13 * SIZE(X) + LFD f30, 14 * SIZE(X) + LFD f31, 15 * SIZE(X) + bdz LL(20) + .align 4 + +LL(10): + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFD f24, 16 * SIZE(X) + LFD f25, 17 * SIZE(X) + LFD f26, 18 * SIZE(X) + LFD f27, 19 * SIZE(X) + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFD f28, 20 * SIZE(X) + LFD f29, 21 * SIZE(X) + LFD f30, 22 * SIZE(X) + LFD f31, 23 * SIZE(X) + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFD f24, 24 * SIZE(X) + LFD f25, 25 * SIZE(X) + LFD f26, 26 * SIZE(X) + LFD f27, 27 * SIZE(X) + + fsel f0, f16, f4, f0 + fsel f1, f17, f5, f1 + fsel f2, f18, f6, f2 + fsel f3, f19, f7, f3 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFD f28, 28 * SIZE(X) + LFD f29, 29 * SIZE(X) + LFD f30, 30 * SIZE(X) + LFD f31, 31 * SIZE(X) + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f20, f0 + fsel f1, f17, f21, f1 + fsel f2, f18, f22, f2 + fsel f3, f19, f23, f3 + +#ifndef POWER6 + L1_PREFETCH X, PREA +#endif + addi X, X, 16 * SIZE +#ifdef POWER6 + L1_PREFETCH X, PREA +#endif + + bdnz LL(10) + .align 4 + +LL(20): + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fsel f0, f16, f4, f0 + fsel f1, f17, f5, f1 + fsel f2, f18, f6, f2 + fsel f3, f19, f7, f3 + + 
fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f20, f0 + fsel f1, f17, f21, f1 + fsel f2, f18, f22, f2 + fsel f3, f19, f23, f3 + + addi X, X, 16 * SIZE + .align 4 + +LL(50): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + addi X, X, 2 * SIZE + + fabs f8, f8 + fabs f9, f9 + fadd f8, f8, f9 + fsub f16, f1, f8 + fsel f1, f16, f8, f1 + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): + sub X, X, INCXM1 + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(150) + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + bdz LL(120) + .align 4 + +LL(110): + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + + fsel f0, f16, f4, f0 + fsel f1, f17, f5, f1 + fsel f2, f18, f6, f2 + fsel f3, f19, f7, f3 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f20, f0 + fsel f1, f17, f21, f1 + fsel f2, f18, f22, f2 + fsel f3, f19, f23, f3 + bdnz LL(110) + .align 4 + +LL(120): + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fsel f0, f16, f4, f0 + fsel f1, f17, f5, f1 + fsel f2, f18, f6, f2 + fsel f3, f19, f7, f3 + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f20, f0 + fsel f1, f17, f21, f1 + fsel f2, f18, f22, f2 + fsel f3, f19, f23, f3 + .align 4 + +LL(150): + andi. 
r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDX f8, X, INCXM1 + LFDUX f9, X, INCX + + fabs f8, f8 + fabs f9, f9 + fadd f8, f8, f9 + fsub f16, f1, f8 + fsel f1, f16, f8, f1 + bdnz LL(160) + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + + fsel f0, f8, f1, f0 + fsel f2, f9, f3, f2 + fsub f8, f0, f2 + fsel f1, f8, f2, f0 + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/zamin_cell.S b/kernel/power/zamin_cell.S new file mode 100644 index 0000000000..6d32f60c83 --- /dev/null +++ b/kernel/power/zamin_cell.S @@ -0,0 +1,495 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define PREA r8 +#define INCXM1 r9 + +#define FZERO f1 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, ZBASE_SHIFT + subi INCXM1, INCX, SIZE + + li PREA, 10 * 16 * SIZE + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFD f1, 0 * SIZE(X) + LFD f2, 1 * SIZE(X) + add X, X, INCX + + fabs f1, f1 + fabs f2, f2 + fadd f1, f1, f2 + + fmr f0, f1 + fmr f2, f1 + fmr f3, f1 + + subi N, N, 1 + + cmpwi cr0, INCX, 2 * SIZE + bne- cr0, LL(100) + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- cr0, LL(50) + .align 4 + + LFD f24, 0 * SIZE(X) + LFD f25, 1 * SIZE(X) + + fabs f8, f24 + LFD f26, 2 * SIZE(X) + fabs f9, f25 + LFD f27, 3 * SIZE(X) + fabs f10, f26 + LFD f28, 4 * SIZE(X) + fabs f11, f27 + LFD f29, 5 * SIZE(X) + fabs f12, f28 + LFD f30, 6 * SIZE(X) + fabs f13, f29 + LFD f31, 7 * SIZE(X) + fabs f14, f30 + nop + fabs f15, f31 + bdz LL(20) + .align 4 + +LL(10): + fadd f4, f8, f9 + dcbt X, PREA + fadd f5, f10, f11 + nop + fadd f6, f12, f13 + LFD f24, 8 * SIZE(X) + fadd f7, f14, f15 + LFD f25, 9 * SIZE(X) + + fabs f8, f24 + LFD f26, 10 * SIZE(X) + fabs f9, f25 + LFD f27, 11 * SIZE(X) + fabs f10, f26 + fabs f11, f27 + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + LFD f28, 12 * SIZE(X) + fsub f19, f3, f7 + LFD f29, 13 * SIZE(X) + + fabs f12, f28 + LFD f30, 14 * SIZE(X) + fabs f13, f29 + LFD f31, 15 * SIZE(X) + fabs f14, f30 + fabs f15, f31 + + fsel f0, f16, f4, f0 + fsel f1, f17, f5, f1 + fsel f2, f18, f6, f2 + fsel f3, f19, f7, f3 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + LFD f24, 16 * SIZE(X) + fadd f23, f14, f15 + LFD f25, 17 * SIZE(X) + + fabs f8, f24 + LFD f26, 18 * SIZE(X) + fabs f9, f25 + LFD f27, 19 * SIZE(X) + fabs f10, f26 + fabs f11, f27 + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + LFD f28, 20 * SIZE(X) + fsub f19, f3, f23 + LFD f29, 21 * SIZE(X) + + fabs f12, f28 + LFD f30, 22 * SIZE(X) + fabs f13, f29 + LFD f31, 23 * SIZE(X) + fabs f14, f30 + addi X, X, 16 * SIZE + fabs f15, f31 + + fsel f0, f16, f20, f0 + fsel f1, f17, f21, f1 + fsel f2, f18, f22, f2 + fsel f3, f19, f23, f3 + + bdnz LL(10) + .align 4 + +LL(20): + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + LFD f24, 8 * SIZE(X) + fadd f7, f14, f15 + LFD f25, 9 * SIZE(X) + + fabs f8, f24 + LFD f26, 10 * SIZE(X) + fabs f9, f25 + LFD f27, 11 * SIZE(X) + fabs f10, f26 + fabs f11, f27 + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + LFD f28, 12 * SIZE(X) + fsub f19, f3, f7 + LFD f29, 13 * SIZE(X) + + fabs f12, f28 + LFD f30, 14 * SIZE(X) + fabs f13, f29 + LFD f31, 15 * SIZE(X) + fabs f14, f30 + fabs f15, f31 + + fsel f0, f16, f4, f0 + fsel f1, f17, f5, f1 + fsel f2, f18, f6, f2 + fsel f3, f19, f7, f3 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + 
fsel f0, f16, f20, f0 + fsel f1, f17, f21, f1 + fsel f2, f18, f22, f2 + fsel f3, f19, f23, f3 + addi X, X, 16 * SIZE + + .align 4 + +LL(50): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + addi X, X, 2 * SIZE + + fabs f8, f8 + fabs f9, f9 + fadd f8, f8, f9 + fsub f16, f1, f8 + fsel f1, f16, f8, f1 + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): + sub X, X, INCXM1 + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(150) + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + bdz LL(120) + .align 4 + +LL(110): + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + + fsel f0, f16, f4, f0 + fsel f1, f17, f5, f1 + fsel f2, f18, f6, f2 + fsel f3, f19, f7, f3 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f20, f0 + fsel f1, f17, f21, f1 + fsel f2, f18, f22, f2 + fsel f3, f19, f23, f3 + bdnz LL(110) + .align 4 + +LL(120): + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fsel f0, f16, f4, f0 + fsel f1, f17, f5, f1 + fsel f2, f18, f6, f2 + fsel f3, f19, f7, f3 + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f20, f0 + fsel f1, f17, f21, f1 + fsel f2, f18, f22, f2 + fsel f3, f19, f23, f3 + .align 4 + +LL(150): + andi. 
r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDX f8, X, INCXM1 + LFDUX f9, X, INCX + + fabs f8, f8 + fabs f9, f9 + fadd f8, f8, f9 + fsub f16, f1, f8 + fsel f1, f16, f8, f1 + bdnz LL(160) + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + + fsel f0, f8, f1, f0 + fsel f2, f9, f3, f2 + fsub f8, f0, f2 + fsel f1, f8, f2, f0 + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/zamin_hummer.S b/kernel/power/zamin_hummer.S new file mode 100644 index 0000000000..5ac1b89600 --- /dev/null +++ b/kernel/power/zamin_hummer.S @@ -0,0 +1,347 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define INCX2 r6 + +#define C1 f1 +#define C2 f0 +#define C3 f2 +#define C4 f3 + +#define A1 f4 +#define A2 f5 +#define A3 f6 +#define A4 f7 +#define A5 f8 +#define A6 f9 +#define A7 f10 +#define A8 f11 + +#define F1 f12 +#define F2 f13 +#define F3 f14 +#define F4 f15 + +#define T1 f16 +#define T2 f17 +#define T3 f18 +#define T4 f19 + +#define B1 f20 +#define B2 f21 +#define B3 f22 +#define B4 f23 +#define B5 f24 +#define B6 f25 +#define B7 f26 +#define B8 f27 + + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + + stfpdux f16, SP, r10 + stfpdux f17, SP, r10 + stfpdux f18, SP, r10 + stfpdux f19, SP, r10 + + stfpdux f20, SP, r10 + stfpdux f21, SP, r10 + stfpdux f22, SP, r10 + stfpdux f23, SP, r10 + + stfpdux f24, SP, r10 + stfpdux f25, SP, r10 + stfpdux f26, SP, r10 + stfpdux f27, SP, r10 + + li r10, 0 + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + lfpdx C1, SP, r10 # Zero clear + + slwi INCX, INCX, BASE_SHIFT + add INCX2, INCX, INCX + + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, INCX, 0 + ble LL(999) + + LFD A1, 0 * SIZE(X) + LFD A2, 1 * SIZE(X) + add X, X, INCX2 + + fabs A1, A1 + fabs A2, A2 + + addi N, N, -1 + cmpwi cr0, N, 0 + fadd C1, A1, A2 + ble LL(999) + + subi INCX2, INCX2, SIZE + fsmfp C1, C1 + li INCX, SIZE + fpmr C2, C1 + sub X, X, INCX2 + fpmr C3, C1 + srawi. r0, N, 3 + fpmr C4, C1 + mtspr CTR, r0 + beq- LL(105) + + LFDUX A1, X, INCX2 + LFDUX A2, X, INCX + LFDUX A3, X, INCX2 + LFDUX A4, X, INCX + + LFSDUX A1, X, INCX2 + LFSDUX A2, X, INCX + LFSDUX A3, X, INCX2 + LFSDUX A4, X, INCX + + LFDUX A5, X, INCX2 + LFDUX A6, X, INCX + LFDUX A7, X, INCX2 + LFDUX A8, X, INCX + + LFSDUX A5, X, INCX2 + LFSDUX A6, X, INCX + LFSDUX A7, X, INCX2 + LFSDUX A8, X, INCX + bdz LL(103) + .align 4 + +LL(102): + fpabs B1, A1 + LFDUX A1, X, INCX2 + fpabs B2, A2 + LFDUX A2, X, INCX + fpabs B3, A3 + LFDUX A3, X, INCX2 + fpabs B4, A4 + LFDUX A4, X, INCX + + fpabs B5, A5 + LFSDUX A1, X, INCX2 + fpabs B6, A6 + LFSDUX A2, X, INCX + fpabs B7, A7 + LFSDUX A3, X, INCX2 + fpabs B8, A8 + LFSDUX A4, X, INCX + + fpadd T1, B1, B2 + LFDUX A5, X, INCX2 + fpadd T2, B3, B4 + LFDUX A6, X, INCX + fpadd T3, B5, B6 + LFDUX A7, X, INCX2 + fpadd T4, B7, B8 + LFDUX A8, X, INCX + + fpsub F1, T1, C1 + LFSDUX A5, X, INCX2 + fpsub F2, T2, C2 + LFSDUX A6, X, INCX + fpsub F3, T3, C3 + LFSDUX A7, X, INCX2 + fpsub F4, T4, C4 + LFSDUX A8, X, INCX + + fpsel C1, F1, C1, T1 + fpsel C2, F2, C2, T2 + fpsel C3, F3, C3, T3 + fpsel C4, F4, C4, T4 + bdnz LL(102) + .align 4 + +LL(103): + fpabs B1, A1 + fpabs B2, A2 + fpabs B3, A3 + fpabs B4, A4 + + fpabs B5, A5 + fpabs B6, A6 + fpabs B7, A7 + fpabs B8, A8 + + fpadd T1, B1, B2 + fpadd T2, B3, B4 + fpadd T3, B5, B6 + fpadd T4, B7, B8 + + fpsub F1, T1, C1 + fpsub F2, T2, C2 + fpsub F3, T3, C3 + fpsub F4, T4, C4 + + fpsel C1, F1, C1, T1 + fpsel C2, F2, C2, T2 + fpsel C3, F3, C3, T3 + fpsel C4, F4, C4, T4 + .align 4 + +LL(105): + andi. r0, N, 7 + beq LL(998) + + andi. 
r0, N, 4 + beq LL(106) + + LFDUX A1, X, INCX2 + LFDUX A2, X, INCX + LFDUX A3, X, INCX2 + LFDUX A4, X, INCX + + LFSDUX A1, X, INCX2 + LFSDUX A2, X, INCX + LFSDUX A3, X, INCX2 + LFSDUX A4, X, INCX + + fpabs A1, A1 + fpabs A2, A2 + fpabs A3, A3 + fpabs A4, A4 + + fpadd A1, A1, A2 + fpadd A3, A3, A4 + + fpsub F1, A1, C1 + fpsub F2, A3, C2 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A3 + .align 4 + +LL(106): + andi. r0, N, 2 + beq LL(107) + + LFDUX A1, X, INCX2 + LFDUX A2, X, INCX + LFSDUX A1, X, INCX2 + LFSDUX A2, X, INCX + + fpabs A1, A1 + fpabs A2, A2 + + fpadd A1, A1, A2 + + fpsub F1, A1, C1 + fpsel C1, F1, C1, A1 + .align 4 + +LL(107): + andi. r0, N, 1 + beq LL(998) + + LFDUX A1, X, INCX2 + LFDUX A2, X, INCX + + fabs A1, A1 + fabs A2, A2 + + fadd A1, A1, A2 + + fsub F1, A1, C1 + fsel C1, F1, C1, A1 + .align 4 + +LL(998): + fpsub F1, C2, C1 + fpsub F2, C4, C3 + + fpsel C1, F1, C1, C2 + fpsel C3, F2, C3, C4 + + fpsub F1, C3, C1 + fpsel C1, F1, C1, C3 + + fsmtp C2, C1 + + fsub F1, C2, C1 + fsel C1, F1, C1, C2 + .align 4 + +LL(999): + li r10, 16 + + lfpdux f27, SP, r10 + lfpdux f26, SP, r10 + lfpdux f25, SP, r10 + lfpdux f24, SP, r10 + + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + lfpdux f20, SP, r10 + + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + blr + + EPILOGUE diff --git a/kernel/power/zamin_ppc440.S b/kernel/power/zamin_ppc440.S new file mode 100644 index 0000000000..9d70f76082 --- /dev/null +++ b/kernel/power/zamin_ppc440.S @@ -0,0 +1,317 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define PREX r8 +#define INC1 r9 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, ZBASE_SHIFT + + sub X, X, INCX + li INC1, SIZE + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + LFDUX f1, X, INCX + LFDX f2, X, INC1 + + fabs f1, f1 + li PREX, 4 * 8 * SIZE + fabs f2, f2 + subi N, N, 1 + fadd f1, f1, f2 + + fmr f0, f1 + srawi. r0, N, 3 + fmr f2, f1 + mtspr CTR, r0 + fmr f3, f1 + beq- LL(150) + + LFDUX f24, X, INCX + LFDX f25, X, INC1 + LFDUX f26, X, INCX + LFDX f27, X, INC1 + LFDUX f28, X, INCX + LFDX f29, X, INC1 + LFDUX f30, X, INCX + LFDX f31, X, INC1 + + fabs f8, f24 + LFDUX f24, X, INCX + fabs f9, f25 + LFDX f25, X, INC1 + fabs f10, f26 + LFDUX f26, X, INCX + fabs f11, f27 + LFDX f27, X, INC1 + fabs f12, f28 + LFDUX f28, X, INCX + fabs f13, f29 + LFDX f29, X, INC1 + fabs f14, f30 + LFDUX f30, X, INCX + fabs f15, f31 + LFDX f31, X, INC1 + bdz LL(120) + .align 4 + +LL(110): + fadd f4, f8, f9 +#ifdef PPCG4 + dcbt X, PREX +#endif + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + LFDUX f24, X, INCX + fabs f9, f25 + LFDX f25, X, INC1 + fabs f10, f26 + LFDUX f26, X, INCX + fabs f11, f27 + LFDX f27, X, INC1 + + fabs f12, f28 +#ifdef PPCG4 + dcbt X, PREX +#endif + fabs f13, f29 + LFDUX f28, X, INCX + fabs f14, f30 + LFDX f29, X, INC1 + fabs f15, f31 + LFDUX f30, X, INCX + + fsub f16, f0, f4 + LFDX f31, X, INC1 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 +#ifdef PPCG4 + dcbt X, PREX +#endif + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fabs f8, f24 + LFDUX f24, X, INCX + fabs f9, f25 + LFDX f25, X, INC1 + fabs f10, f26 + LFDUX f26, X, INCX + fabs f11, f27 + LFDX f27, X, INC1 + + fsel f0, f16, f4, f0 +#ifdef PPCG4 + dcbt X, PREX +#endif + fsel f1, f17, f5, f1 + fsel f2, f18, f6, f2 + fsel f3, f19, f7, f3 + + fabs f12, f28 + LFDUX f28, X, INCX + fabs f13, f29 + LFDX f29, X, INC1 + fabs f14, f30 + LFDUX f30, X, INCX + fabs f15, f31 + LFDX f31, X, INC1 + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f20, f0 + fsel f1, f17, f21, f1 + fsel f2, f18, f22, f2 + fsel f3, f19, f23, f3 + bdnz LL(110) + .align 4 + +LL(120): + fadd f4, f8, f9 + fadd f5, f10, f11 + fadd f6, f12, f13 + fadd f7, f14, f15 + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + fsub f16, f0, f4 + fsub f17, f1, f5 + fsub f18, f2, f6 + fsub f19, f3, f7 + + fadd f20, f8, f9 + fadd f21, f10, f11 + fadd f22, f12, f13 + fadd f23, f14, f15 + + fsel f0, f16, f4, f0 + fsel f1, f17, f5, f1 + fsel f2, f18, f6, f2 + fsel f3, f19, f7, f3 + + fsub f16, f0, f20 + fsub f17, f1, f21 + fsub f18, f2, f22 + fsub f19, f3, f23 + + fsel f0, f16, f20, f0 + fsel f1, f17, f21, f1 + fsel f2, f18, f22, f2 + fsel f3, f19, f23, f3 + .align 4 + +LL(150): + andi. 
r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDUX f8, X, INCX + LFDX f9, X, INC1 + + fabs f8, f8 + fabs f9, f9 + fadd f8, f8, f9 + fsub f16, f1, f8 + fsel f1, f16, f8, f1 + bdnz LL(160) + .align 4 + +LL(999): + fsub f8, f0, f1 + fsub f9, f2, f3 + + fsel f0, f8, f1, f0 + fsel f2, f9, f3, f2 + fsub f8, f0, f2 + fsel f1, f8, f2, f0 + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/zasum.S b/kernel/power/zasum.S new file mode 100644 index 0000000000..14b58ce1ad --- /dev/null +++ b/kernel/power/zasum.S @@ -0,0 +1,456 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define INCXM1 r9 +#define PREA r8 + +#define FZERO f0 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, ZBASE_SHIFT + subi INCXM1, INCX, SIZE + + fmr f1, FZERO + fmr f2, FZERO + fmr f3, FZERO + fmr f4, FZERO + fmr f5, FZERO + fmr f6, FZERO + fmr f7, FZERO + + li PREA, L1_PREFETCHSIZE + + cmpwi cr0, N, 0 + ble- LL(999) + + cmpwi cr0, INCX, 0 + ble- LL(999) + + cmpwi cr0, INCX, 2 * SIZE + bne- cr0, LL(100) + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- cr0, LL(50) + .align 4 + + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + LFD f10, 2 * SIZE(X) + LFD f11, 3 * SIZE(X) + LFD f12, 4 * SIZE(X) + LFD f13, 5 * SIZE(X) + LFD f14, 6 * SIZE(X) + LFD f15, 7 * SIZE(X) + + LFD f24, 8 * SIZE(X) + LFD f25, 9 * SIZE(X) + LFD f26, 10 * SIZE(X) + LFD f27, 11 * SIZE(X) + LFD f28, 12 * SIZE(X) + LFD f29, 13 * SIZE(X) + LFD f30, 14 * SIZE(X) + LFD f31, 15 * SIZE(X) + + fabs f16, f8 + fabs f17, f9 + fabs f18, f10 + fabs f19, f11 + + fabs f20, f12 + fabs f21, f13 + fabs f22, f14 + fabs f23, f15 + bdz LL(20) + .align 4 + +LL(10): + FADD f0, f0, f16 + fabs f16, f24 + FADD f1, f1, f17 + fabs f17, f25 + + FADD f2, f2, f18 + fabs f18, f26 + FADD f3, f3, f19 + fabs f19, f27 + + LFD f8, 16 * SIZE(X) + LFD f9, 17 * SIZE(X) + LFD f10, 18 * SIZE(X) + LFD f11, 19 * SIZE(X) + + FADD f4, f4, f20 + fabs f20, f28 + FADD f5, f5, f21 + fabs f21, f29 + + FADD f6, f6, f22 + fabs f22, f30 + FADD f7, f7, f23 + fabs f23, f31 + + LFD f12, 20 * SIZE(X) + LFD f13, 21 * SIZE(X) + LFD f14, 22 * SIZE(X) + LFD f15, 23 * SIZE(X) + + FADD f0, f0, f16 + fabs f16, f8 + FADD f1, f1, f17 + fabs f17, f9 + + FADD f2, f2, f18 + fabs f18, f10 + FADD f3, f3, f19 + fabs f19, f11 + + LFD f24, 24 * SIZE(X) + LFD f25, 25 * SIZE(X) + LFD f26, 26 * SIZE(X) + LFD f27, 27 * SIZE(X) + + FADD f4, f4, f20 + fabs f20, f12 + FADD f5, f5, f21 + fabs f21, f13 + + FADD f6, f6, f22 + fabs f22, f14 + FADD f7, f7, f23 + fabs f23, f15 + + LFD f28, 28 * SIZE(X) + LFD f29, 29 * SIZE(X) + LFD f30, 30 * SIZE(X) + LFD f31, 31 * SIZE(X) + +#ifndef POWER6 + L1_PREFETCH X, PREA +#endif + addi X, X, 16 * SIZE +#ifdef POWER6 + L1_PREFETCH X, PREA +#endif + + bdnz LL(10) + .align 4 + +LL(20): + FADD f0, f0, f16 + fabs f16, f24 + FADD f1, f1, f17 + fabs f17, f25 + + FADD f2, f2, f18 + fabs f18, f26 + FADD f3, f3, f19 + fabs f19, f27 + + FADD f4, f4, f20 + fabs f20, f28 + FADD f5, f5, f21 + fabs f21, f29 + + FADD f6, f6, f22 + fabs f22, f30 + FADD f7, f7, f23 + fabs f23, f31 + + FADD f0, f0, f16 + FADD f1, f1, f17 + FADD f2, f2, f18 + FADD f3, f3, f19 + + FADD f4, f4, f20 + FADD f5, f5, f21 + FADD f6, f6, f22 + FADD f7, f7, f23 + addi X, X, 16 * SIZE + .align 4 + +LL(50): + andi. 
r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + addi X, X, 2 * SIZE + + fabs f8, f8 + fabs f9, f9 + FADD f0, f0, f8 + FADD f1, f1, f9 + + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): + sub X, X, INCXM1 + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(150) + + LFDX f8, X, INCXM1 + LFDUX f9, X, INCX + LFDX f10, X, INCXM1 + LFDUX f11, X, INCX + LFDX f12, X, INCXM1 + LFDUX f13, X, INCX + LFDX f14, X, INCXM1 + LFDUX f15, X, INCX + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + fabs f16, f8 + fabs f17, f9 + fabs f18, f10 + fabs f19, f11 + + fabs f20, f12 + fabs f21, f13 + fabs f22, f14 + fabs f23, f15 + bdz LL(120) + .align 4 + +LL(110): + FADD f0, f0, f16 + fabs f16, f24 + FADD f1, f1, f17 + fabs f17, f25 + + FADD f2, f2, f18 + fabs f18, f26 + FADD f3, f3, f19 + fabs f19, f27 + + LFDX f8, X, INCXM1 + LFDUX f9, X, INCX + LFDX f10, X, INCXM1 + LFDUX f11, X, INCX + + FADD f4, f4, f20 + fabs f20, f28 + FADD f5, f5, f21 + fabs f21, f29 + + FADD f6, f6, f22 + fabs f22, f30 + FADD f7, f7, f23 + fabs f23, f31 + + LFDX f12, X, INCXM1 + LFDUX f13, X, INCX + LFDX f14, X, INCXM1 + LFDUX f15, X, INCX + + FADD f0, f0, f16 + fabs f16, f8 + FADD f1, f1, f17 + fabs f17, f9 + + FADD f2, f2, f18 + fabs f18, f10 + FADD f3, f3, f19 + fabs f19, f11 + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + + FADD f4, f4, f20 + fabs f20, f12 + FADD f5, f5, f21 + fabs f21, f13 + + FADD f6, f6, f22 + fabs f22, f14 + FADD f7, f7, f23 + fabs f23, f15 + + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + bdnz LL(110) + .align 4 + +LL(120): + FADD f0, f0, f16 + fabs f16, f24 + FADD f1, f1, f17 + fabs f17, f25 + + FADD f2, f2, f18 + fabs f18, f26 + FADD f3, f3, f19 + fabs f19, f27 + + FADD f4, f4, f20 + fabs f20, f28 + FADD f5, f5, f21 + fabs f21, f29 + + FADD f6, f6, f22 + fabs f22, f30 + FADD f7, f7, f23 + fabs f23, f31 + + FADD f0, f0, f16 + FADD f1, f1, f17 + FADD f2, f2, f18 + FADD f3, f3, f19 + + FADD f4, f4, f20 + FADD f5, f5, f21 + FADD f6, f6, f22 + FADD f7, f7, f23 + .align 4 + +LL(150): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDX f8, X, INCXM1 + LFDUX f9, X, INCX + fabs f8, f8 + fabs f9, f9 + FADD f0, f0, f8 + FADD f1, f1, f9 + bdnz LL(160) + .align 4 + +LL(999): + FADD f0, f0, f1 + FADD f2, f2, f3 + FADD f4, f4, f5 + FADD f6, f6, f7 + + FADD f0, f0, f2 + FADD f4, f4, f6 + FADD f1, f0, f4 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/zasum_cell.S b/kernel/power/zasum_cell.S new file mode 100644 index 0000000000..7389468f3d --- /dev/null +++ b/kernel/power/zasum_cell.S @@ -0,0 +1,581 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define PREA r8 +#define INCXM1 r9 + +#define FZERO f0 + +#define STACKSIZE 16 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stw r0, 0(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + lfs FZERO, 0(SP) + + slwi INCX, INCX, ZBASE_SHIFT + fmr f1, FZERO + li PREA, 8 * 16 * SIZE + fmr f2, FZERO + subi INCXM1, INCX, SIZE + + cmpwi cr0, N, 0 + fmr f3, FZERO + ble- LL(999) + + cmpwi cr0, INCX, 0 + ble- LL(999) + + cmpwi cr0, INCX, SIZE * 2 + bne- cr0, LL(20) + + srawi. 
r0, N, 3 + mtspr CTR, r0 + beq- cr0, LL(15) + .align 4 + + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + + fabs f4, f8 + LFD f10, 2 * SIZE(X) + fabs f5, f9 + LFD f11, 3 * SIZE(X) + fabs f6, f10 + LFD f8, 4 * SIZE(X) + fabs f7, f11 + bdz LL(13) + .align 4 + +LL(12): + FADD f0, f0, f4 + dcbt X, PREA + fabs f4, f8 + LFD f9, 5 * SIZE(X) + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFD f10, 6 * SIZE(X) + + FADD f2, f2, f6 + nop + fabs f6, f10 + LFD f11, 7 * SIZE(X) + + FADD f3, f3, f7 + nop + fabs f7, f11 + LFD f8, 8 * SIZE(X) + + FADD f0, f0, f4 + nop + fabs f4, f8 + LFD f9, 9 * SIZE(X) + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFD f10, 10 * SIZE(X) + + FADD f2, f2, f6 + nop + fabs f6, f10 + LFD f11, 11 * SIZE(X) + + FADD f3, f3, f7 + nop + fabs f7, f11 + LFD f8, 12 * SIZE(X) + + FADD f0, f0, f4 + nop + fabs f4, f8 + LFD f9, 13 * SIZE(X) + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFD f10, 14 * SIZE(X) + + FADD f2, f2, f6 + nop + fabs f6, f10 + LFD f11, 15 * SIZE(X) + + FADD f3, f3, f7 + nop + fabs f7, f11 + LFD f8, 16 * SIZE(X) + + FADD f0, f0, f4 + nop + fabs f4, f8 + LFD f9, 17 * SIZE(X) + + FADD f1, f1, f5 + addi X, X, 16 * SIZE + fabs f5, f9 + LFD f10, 2 * SIZE(X) + + FADD f2, f2, f6 + nop + fabs f6, f10 + LFD f11, 3 * SIZE(X) + + FADD f3, f3, f7 + LFD f8, 4 * SIZE(X) + fabs f7, f11 + bdnz LL(12) + .align 4 + +LL(13): + FADD f0, f0, f4 + nop + fabs f4, f8 + LFD f9, 5 * SIZE(X) + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFD f10, 6 * SIZE(X) + + FADD f2, f2, f6 + nop + fabs f6, f10 + LFD f11, 7 * SIZE(X) + + FADD f3, f3, f7 + nop + fabs f7, f11 + LFD f8, 8 * SIZE(X) + + FADD f0, f0, f4 + nop + fabs f4, f8 + LFD f9, 9 * SIZE(X) + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFD f10, 10 * SIZE(X) + + FADD f2, f2, f6 + nop + fabs f6, f10 + LFD f11, 11 * SIZE(X) + + FADD f3, f3, f7 + nop + fabs f7, f11 + LFD f8, 12 * SIZE(X) + + FADD f0, f0, f4 + nop + fabs f4, f8 + LFD f9, 13 * SIZE(X) + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFD f10, 14 * SIZE(X) + + FADD f2, f2, f6 + addi X, X, 16 * SIZE + fabs f6, f10 + LFD f11, -1 * SIZE(X) + + FADD f3, f3, f7 + fabs f7, f11 + + FADD f0, f0, f4 + FADD f1, f1, f5 + FADD f2, f2, f6 + FADD f3, f3, f7 + .align 4 + +LL(15): + andi. r0, N, 7 + beq LL(999) + + andi. r0, N, 4 + beq LL(16) + + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + + fabs f4, f8 + LFD f10, 2 * SIZE(X) + fabs f5, f9 + LFD f11, 3 * SIZE(X) + fabs f6, f10 + LFD f8, 4 * SIZE(X) + fabs f7, f11 + + FADD f0, f0, f4 + nop + fabs f4, f8 + LFD f9, 5 * SIZE(X) + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFD f10, 6 * SIZE(X) + + FADD f2, f2, f6 + addi X, X, 8 * SIZE + fabs f6, f10 + LFD f11, -1 * SIZE(X) + + FADD f3, f3, f7 + fabs f7, f11 + + FADD f0, f0, f4 + FADD f1, f1, f5 + FADD f2, f2, f6 + FADD f3, f3, f7 + .align 4 + +LL(16): + andi. r0, N, 2 + beq LL(17) + + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + + fabs f4, f8 + LFD f10, 2 * SIZE(X) + fabs f5, f9 + LFD f11, 3 * SIZE(X) + fabs f6, f10 + addi X, X, 4 * SIZE + fabs f7, f11 + nop + + FADD f0, f0, f4 + FADD f1, f1, f5 + FADD f2, f2, f6 + FADD f3, f3, f7 + .align 4 + +LL(17): + andi. r0, N, 1 + beq LL(999) + + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + + fabs f4, f8 + fabs f5, f9 + + FADD f0, f0, f4 + addi X, X, 2 * SIZE + FADD f1, f1, f5 + b LL(999) + .align 4 + +LL(20): + sub X, X, INCXM1 + + srawi. 
r0, N, 3 + mtspr CTR, r0 + beq- cr0, LL(25) + + LFDX f8, X, INCXM1 + LFDUX f9, X, INCX + + fabs f4, f8 + LFDX f10, X, INCXM1 + fabs f5, f9 + LFDUX f11, X, INCX + fabs f6, f10 + LFDX f8, X, INCXM1 + fabs f7, f11 + bdz LL(23) + .align 4 + +LL(22): + FADD f0, f0, f4 + dcbt X, PREA + fabs f4, f8 + LFDUX f9, X, INCX + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFDX f10, X, INCXM1 + + FADD f2, f2, f6 + nop + fabs f6, f10 + LFDUX f11, X, INCX + + FADD f3, f3, f7 + nop + fabs f7, f11 + LFDX f8, X, INCXM1 + + FADD f0, f0, f4 + nop + fabs f4, f8 + LFDUX f9, X, INCX + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFDX f10, X, INCXM1 + + FADD f2, f2, f6 + nop + fabs f6, f10 + LFDUX f11, X, INCX + + FADD f3, f3, f7 + nop + fabs f7, f11 + LFDX f8, X, INCXM1 + + FADD f0, f0, f4 + nop + fabs f4, f8 + LFDUX f9, X, INCX + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFDX f10, X, INCXM1 + + FADD f2, f2, f6 + nop + fabs f6, f10 + LFDUX f11, X, INCX + + FADD f3, f3, f7 + nop + fabs f7, f11 + LFDX f8, X, INCXM1 + + FADD f0, f0, f4 + nop + fabs f4, f8 + LFDUX f9, X, INCX + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFDX f10, X, INCXM1 + + FADD f2, f2, f6 + nop + fabs f6, f10 + LFDUX f11, X, INCX + + FADD f3, f3, f7 + LFDX f8, X, INCXM1 + fabs f7, f11 + bdnz LL(22) + .align 4 + +LL(23): + FADD f0, f0, f4 + nop + fabs f4, f8 + LFDUX f9, X, INCX + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFDX f10, X, INCXM1 + + FADD f2, f2, f6 + nop + fabs f6, f10 + LFDUX f11, X, INCX + + FADD f3, f3, f7 + nop + fabs f7, f11 + LFDX f8, X, INCXM1 + + FADD f0, f0, f4 + nop + fabs f4, f8 + LFDUX f9, X, INCX + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFDX f10, X, INCXM1 + + FADD f2, f2, f6 + nop + fabs f6, f10 + LFDUX f11, X, INCX + + FADD f3, f3, f7 + nop + fabs f7, f11 + LFDX f8, X, INCXM1 + + FADD f0, f0, f4 + nop + fabs f4, f8 + LFDUX f9, X, INCX + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFDX f10, X, INCXM1 + + FADD f2, f2, f6 + nop + fabs f6, f10 + LFDUX f11, X, INCX + + FADD f3, f3, f7 + fabs f7, f11 + + FADD f0, f0, f4 + FADD f1, f1, f5 + FADD f2, f2, f6 + FADD f3, f3, f7 + .align 4 + +LL(25): + andi. r0, N, 7 + beq LL(999) + + andi. r0, N, 4 + beq LL(26) + + LFDX f8, X, INCXM1 + LFDUX f9, X, INCX + + fabs f4, f8 + LFDX f10, X, INCXM1 + fabs f5, f9 + LFDUX f11, X, INCX + fabs f6, f10 + LFDX f8, X, INCXM1 + fabs f7, f11 + + FADD f0, f0, f4 + nop + fabs f4, f8 + LFDUX f9, X, INCX + + FADD f1, f1, f5 + nop + fabs f5, f9 + LFDX f10, X, INCXM1 + + FADD f2, f2, f6 + fabs f6, f10 + LFDUX f11, X, INCX + + FADD f3, f3, f7 + fabs f7, f11 + + FADD f0, f0, f4 + FADD f1, f1, f5 + FADD f2, f2, f6 + FADD f3, f3, f7 + .align 4 + +LL(26): + andi. r0, N, 2 + beq LL(27) + + LFDX f8, X, INCXM1 + LFDUX f9, X, INCX + + fabs f4, f8 + LFDX f10, X, INCXM1 + fabs f5, f9 + LFDUX f11, X, INCX + + fabs f6, f10 + fabs f7, f11 + + FADD f0, f0, f4 + FADD f1, f1, f5 + FADD f2, f2, f6 + FADD f3, f3, f7 + .align 4 + +LL(27): + andi. r0, N, 1 + beq LL(999) + + LFDX f8, X, INCXM1 + LFDUX f9, X, INCX + + fabs f4, f8 + fabs f5, f9 + + FADD f0, f0, f4 + FADD f1, f1, f5 + .align 4 + +LL(999): + FADD f0, f0, f1 + FADD f2, f2, f3 + + FADD f1, f0, f2 + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/zasum_hummer.S b/kernel/power/zasum_hummer.S new file mode 100644 index 0000000000..f090e69f4d --- /dev/null +++ b/kernel/power/zasum_hummer.S @@ -0,0 +1,583 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define INCX2 r6 +#define X2 r7 +#define FLAG r8 + +#define C1 f1 +#define C2 f0 +#define C3 f2 +#define C4 f3 + +#define A1 f4 +#define A2 f5 +#define A3 f6 +#define A4 f7 +#define A5 f8 +#define A6 f9 +#define A7 f10 +#define A8 f11 + +#define T1 f12 +#define T2 f13 +#define T3 f14 +#define T4 f15 + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + + li r10, 0 + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + lfpdx C1, SP, r10 # Zero clear + + slwi INCX, INCX, BASE_SHIFT + add INCX2, INCX, INCX + + fpmr C2, C1 + fpmr C3, C1 + li FLAG, 0 + fpmr C4, C1 + + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, INCX, 0 + ble LL(999) + + sub X, X, INCX2 + + cmpwi cr0, INCX, SIZE + bne LL(100) + + andi. r0, X, 2 * SIZE - 1 + beq LL(05) + + LFD C1, 2 * SIZE(X) + li FLAG, 1 + addi X, X, 1 * SIZE + addi N, N, -1 + cmpwi cr0, N, 0 + fabs C1, C1 + ble LL(99) + .align 4 + +LL(05): + srawi. 
r0, N, 3 + mtspr CTR, r0 + beq- LL(15) + + LFPDUX A1, X, INCX2 + fpmr T1, C2 + LFPDUX A2, X, INCX2 + fpmr T2, C2 + LFPDUX A3, X, INCX2 + fpmr T3, C2 + LFPDUX A4, X, INCX2 + fpmr T4, C2 + LFPDUX A5, X, INCX2 + LFPDUX A6, X, INCX2 + LFPDUX A7, X, INCX2 + LFPDUX A8, X, INCX2 + bdz LL(13) + .align 4 + +LL(12): + fpadd C1, C1, T1 + nop + fpabs T1, A1 + LFPDUX A1, X, INCX2 + + fpadd C2, C2, T2 + nop + fpabs T2, A2 + LFPDUX A2, X, INCX2 + + fpadd C3, C3, T3 + nop + fpabs T3, A3 + LFPDUX A3, X, INCX2 + + fpadd C4, C4, T4 + nop + fpabs T4, A4 + LFPDUX A4, X, INCX2 + + fpadd C1, C1, T1 + nop + fpabs T1, A5 + LFPDUX A5, X, INCX2 + + fpadd C2, C2, T2 + nop + fpabs T2, A6 + LFPDUX A6, X, INCX2 + + fpadd C3, C3, T3 + nop + fpabs T3, A7 + LFPDUX A7, X, INCX2 + + fpadd C4, C4, T4 + fpabs T4, A8 + LFPDUX A8, X, INCX2 + bdnz LL(12) + .align 4 + +LL(13): + fpadd C1, C1, T1 + fpabs T1, A1 + fpadd C2, C2, T2 + fpabs T2, A2 + fpadd C3, C3, T3 + fpabs T3, A3 + fpadd C4, C4, T4 + fpabs T4, A4 + + fpadd C1, C1, T1 + fpabs T1, A5 + fpadd C2, C2, T2 + fpabs T2, A6 + fpadd C3, C3, T3 + fpabs T3, A7 + fpadd C4, C4, T4 + fpabs T4, A8 + + fpadd C1, C1, T1 + fpadd C2, C2, T2 + fpadd C3, C3, T3 + fpadd C4, C4, T4 + .align 4 + +LL(15): + andi. r0, N, 7 + beq LL(99) + andi. r0, N, 4 + beq LL(16) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + fpabs T1, A1 + fpabs T2, A2 + fpabs T3, A3 + fpabs T4, A4 + + fpadd C1, C1, T1 + fpadd C2, C2, T2 + fpadd C3, C3, T3 + fpadd C4, C4, T4 + .align 4 + +LL(16): + andi. r0, N, 2 + beq LL(17) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + fpabs T1, A1 + fpabs T2, A2 + + fpadd C1, C1, T1 + fpadd C2, C2, T2 + .align 4 + +LL(17): + andi. r0, N, 1 + beq LL(99) + + LFPDUX A1, X, INCX2 + fpabs T1, A1 + fpadd C1, C1, T1 + .align 4 + +LL(99): + cmpwi cr0, FLAG, 0 + beq LL(999) + + LFD A1, 2 * SIZE(X) + fabs T1, A1 + fadd C2, C2, T1 + b LL(999) + .align 4 + +LL(100): + addi X2, X, SIZE + andi. r0, X, 2 * SIZE - 1 + bne LL(200) + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(115) + + LFPDUX A1, X, INCX2 + fpmr T1, C2 + LFPDUX A2, X, INCX2 + fpmr T2, C2 + LFPDUX A3, X, INCX2 + fpmr T3, C2 + LFPDUX A4, X, INCX2 + fpmr T4, C2 + LFPDUX A5, X, INCX2 + LFPDUX A6, X, INCX2 + LFPDUX A7, X, INCX2 + LFPDUX A8, X, INCX2 + bdz LL(113) + .align 4 + +LL(112): + fpadd C1, C1, T1 + nop + fpabs T1, A1 + LFPDUX A1, X, INCX2 + + fpadd C2, C2, T2 + nop + fpabs T2, A2 + LFPDUX A2, X, INCX2 + + fpadd C3, C3, T3 + nop + fpabs T3, A3 + LFPDUX A3, X, INCX2 + + fpadd C4, C4, T4 + nop + fpabs T4, A4 + LFPDUX A4, X, INCX2 + + fpadd C1, C1, T1 + nop + fpabs T1, A5 + LFPDUX A5, X, INCX2 + + fpadd C2, C2, T2 + nop + fpabs T2, A6 + LFPDUX A6, X, INCX2 + + fpadd C3, C3, T3 + nop + fpabs T3, A7 + LFPDUX A7, X, INCX2 + + fpadd C4, C4, T4 + fpabs T4, A8 + LFPDUX A8, X, INCX2 + bdnz LL(112) + .align 4 + +LL(113): + fpadd C1, C1, T1 + fpabs T1, A1 + fpadd C2, C2, T2 + fpabs T2, A2 + fpadd C3, C3, T3 + fpabs T3, A3 + fpadd C4, C4, T4 + fpabs T4, A4 + + fpadd C1, C1, T1 + fpabs T1, A5 + fpadd C2, C2, T2 + fpabs T2, A6 + fpadd C3, C3, T3 + fpabs T3, A7 + fpadd C4, C4, T4 + fpabs T4, A8 + + fpadd C1, C1, T1 + fpadd C2, C2, T2 + fpadd C3, C3, T3 + fpadd C4, C4, T4 + .align 4 + +LL(115): + andi. r0, N, 7 + beq LL(999) + andi. 
r0, N, 4 + beq LL(116) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + fpabs T1, A1 + fpabs T2, A2 + fpabs T3, A3 + fpabs T4, A4 + + fpadd C1, C1, T1 + fpadd C2, C2, T2 + fpadd C3, C3, T3 + fpadd C4, C4, T4 + .align 4 + +LL(116): + andi. r0, N, 2 + beq LL(117) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + fpabs T1, A1 + fpabs T2, A2 + + fpadd C1, C1, T1 + fpadd C2, C2, T2 + .align 4 + +LL(117): + andi. r0, N, 1 + beq LL(999) + + LFPDUX A1, X, INCX2 + fpabs T1, A1 + fpadd C1, C1, T1 + b LL(999) + .align 4 + +LL(200): + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(215) + + + LFDUX A1, X, INCX2 + fpmr T1, C2 + LFDUX A2, X, INCX2 + fpmr T2, C2 + LFDUX A3, X, INCX2 + fpmr T3, C2 + LFDUX A4, X, INCX2 + fpmr T4, C2 + + LFDUX A5, X, INCX2 + LFSDUX A1, X2, INCX2 + + LFDUX A6, X, INCX2 + LFSDUX A2, X2, INCX2 + + LFDUX A7, X, INCX2 + LFSDUX A3, X2, INCX2 + + LFDUX A8, X, INCX2 + LFSDUX A4, X2, INCX2 + bdz LL(213) + .align 4 + +LL(212): + fpadd C1, C1, T1 + LFSDUX A5, X2, INCX2 + fpabs T1, A1 + LFDUX A1, X, INCX2 + + fpadd C2, C2, T2 + LFSDUX A6, X2, INCX2 + fpabs T2, A2 + LFDUX A2, X, INCX2 + + fpadd C3, C3, T3 + LFSDUX A7, X2, INCX2 + fpabs T3, A3 + LFDUX A3, X, INCX2 + + fpadd C4, C4, T4 + LFSDUX A8, X2, INCX2 + fpabs T4, A4 + LFDUX A4, X, INCX2 + + fpadd C1, C1, T1 + LFSDUX A1, X2, INCX2 + fpabs T1, A5 + LFDUX A5, X, INCX2 + fpadd C2, C2, T2 + LFSDUX A2, X2, INCX2 + fpabs T2, A6 + LFDUX A6, X, INCX2 + + fpadd C3, C3, T3 + LFSDUX A3, X2, INCX2 + fpabs T3, A7 + LFDUX A7, X, INCX2 + fpadd C4, C4, T4 + LFSDUX A4, X2, INCX2 + fpabs T4, A8 + LFDUX A8, X, INCX2 + + bdnz LL(212) + .align 4 + +LL(213): + fpadd C1, C1, T1 + nop + fpabs T1, A1 + LFSDUX A5, X2, INCX2 + fpadd C2, C2, T2 + nop + fpabs T2, A2 + LFSDUX A6, X2, INCX2 + fpadd C3, C3, T3 + + nop + fpabs T3, A3 + LFSDUX A7, X2, INCX2 + fpadd C4, C4, T4 + nop + fpabs T4, A4 + LFSDUX A8, X2, INCX2 + + fpadd C1, C1, T1 + fpabs T1, A5 + fpadd C2, C2, T2 + fpabs T2, A6 + fpadd C3, C3, T3 + fpabs T3, A7 + fpadd C4, C4, T4 + fpabs T4, A8 + + fpadd C1, C1, T1 + fpadd C2, C2, T2 + fpadd C3, C3, T3 + fpadd C4, C4, T4 + .align 4 + +LL(215): + andi. r0, N, 7 + beq LL(999) + andi. r0, N, 4 + beq LL(216) + + LFDUX A1, X, INCX2 + LFDUX A2, X2, INCX2 + LFDUX A3, X, INCX2 + LFDUX A4, X2, INCX2 + + fabs T1, A1 + LFDUX A5, X, INCX2 + fabs T2, A2 + LFDUX A6, X2, INCX2 + fabs T3, A3 + LFDUX A7, X, INCX2 + fabs T4, A4 + LFDUX A8, X2, INCX2 + + fadd C1, C1, T1 + fabs T1, A5 + fadd C2, C2, T2 + fabs T2, A6 + + fadd C3, C3, T3 + fabs T3, A7 + fadd C4, C4, T4 + fabs T4, A8 + + fadd C1, C1, T1 + fadd C2, C2, T2 + fadd C3, C3, T3 + fadd C4, C4, T4 + .align 4 + +LL(216): + andi. r0, N, 2 + beq LL(217) + + LFDUX A1, X, INCX2 + LFDUX A2, X2, INCX2 + LFDUX A3, X, INCX2 + LFDUX A4, X2, INCX2 + + fabs T1, A1 + fabs T2, A2 + fabs T3, A3 + fabs T4, A4 + + fadd C1, C1, T1 + fadd C2, C2, T2 + fadd C3, C3, T3 + fadd C4, C4, T4 + .align 4 + +LL(217): + andi. 
r0, N, 1 + beq LL(999) + + LFDUX A1, X, INCX2 + LFDUX A2, X2, INCX2 + + fabs T1, A1 + fabs T2, A2 + fadd C1, C1, T1 + fadd C2, C2, T2 + .align 4 + +LL(999): + fpadd C1, C1, C2 + li r10, 16 + fpadd C3, C3, C4 + fpadd C1, C1, C3 + lfpdux f15, SP, r10 + fsmtp C2, C1 + lfpdux f14, SP, r10 + addi SP, SP, 16 + fadd C1, C2, C1 + blr + + EPILOGUE diff --git a/kernel/power/zasum_ppc440.S b/kernel/power/zasum_ppc440.S new file mode 100644 index 0000000000..213c837bb0 --- /dev/null +++ b/kernel/power/zasum_ppc440.S @@ -0,0 +1,321 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define INCXM1 r9 +#define PREX r8 + +#define FZERO f0 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + fmr f1, FZERO + slwi INCX, INCX, ZBASE_SHIFT + fmr f2, FZERO + fmr f3, FZERO + subi INCXM1, INCX, SIZE + fmr f4, FZERO + sub X, X, INCXM1 + fmr f5, FZERO + li PREX, 3 * 16 * SIZE + fmr f6, FZERO + cmpwi cr0, N, 0 + fmr f7, FZERO + ble- LL(999) + + cmpwi cr0, INCX, 0 + ble- LL(999) + + srawi. 
r0, N, 3 + mtspr CTR, r0 + beq- LL(150) + + LFDX f8, X, INCXM1 + LFDUX f9, X, INCX + LFDX f10, X, INCXM1 + LFDUX f11, X, INCX + LFDX f12, X, INCXM1 + LFDUX f13, X, INCX + LFDX f14, X, INCXM1 + LFDUX f15, X, INCX + fabs f16, f8 + + LFDX f24, X, INCXM1 + fabs f17, f9 + LFDUX f25, X, INCX + fabs f18, f10 + LFDX f26, X, INCXM1 + fabs f19, f11 + LFDUX f27, X, INCX + fabs f20, f12 + LFDX f28, X, INCXM1 + fabs f21, f13 + LFDUX f29, X, INCX + fabs f22, f14 + LFDX f30, X, INCXM1 + fabs f23, f15 + LFDUX f31, X, INCX + bdz LL(120) + .align 4 + +LL(110): + LFDX f8, X, INCXM1 + FADD f0, f0, f16 +#ifdef PPCG4 + dcbt X, PREX +#else + nop +#endif + fabs f16, f24 + + LFDUX f9, X, INCX + FADD f1, f1, f17 + nop + fabs f17, f25 + + LFDX f10, X, INCXM1 + FADD f2, f2, f18 + nop + fabs f18, f26 + + LFDUX f11, X, INCX + FADD f3, f3, f19 + nop + fabs f19, f27 + + LFDX f12, X, INCXM1 + FADD f4, f4, f20 +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PREX +#else + nop +#endif + fabs f20, f28 + + LFDUX f13, X, INCX + FADD f5, f5, f21 + nop + fabs f21, f29 + + LFDX f14, X, INCXM1 + FADD f6, f6, f22 + nop + fabs f22, f30 + + LFDUX f15, X, INCX + FADD f7, f7, f23 + nop + fabs f23, f31 + + LFDX f24, X, INCXM1 + FADD f0, f0, f16 +#ifdef PPCG4 + dcbt X, PREX +#else + nop +#endif + fabs f16, f8 + + LFDUX f25, X, INCX + FADD f1, f1, f17 + nop + fabs f17, f9 + + LFDX f26, X, INCXM1 + FADD f2, f2, f18 + nop + fabs f18, f10 + + LFDUX f27, X, INCX + FADD f3, f3, f19 + nop + fabs f19, f11 + + LFDX f28, X, INCXM1 + FADD f4, f4, f20 +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PREX +#else + nop +#endif + fabs f20, f12 + + LFDUX f29, X, INCX + FADD f5, f5, f21 + nop + fabs f21, f13 + + LFDX f30, X, INCXM1 + FADD f6, f6, f22 + nop + fabs f22, f14 + + LFDUX f31, X, INCX + FADD f7, f7, f23 + fabs f23, f15 + bdnz LL(110) + .align 4 + +LL(120): + FADD f0, f0, f16 + fabs f16, f24 + FADD f1, f1, f17 + fabs f17, f25 + + FADD f2, f2, f18 + fabs f18, f26 + FADD f3, f3, f19 + fabs f19, f27 + + FADD f4, f4, f20 + fabs f20, f28 + FADD f5, f5, f21 + fabs f21, f29 + + FADD f6, f6, f22 + fabs f22, f30 + FADD f7, f7, f23 + fabs f23, f31 + + FADD f0, f0, f16 + FADD f1, f1, f17 + FADD f2, f2, f18 + FADD f3, f3, f19 + + FADD f4, f4, f20 + FADD f5, f5, f21 + FADD f6, f6, f22 + FADD f7, f7, f23 + .align 4 + +LL(150): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDX f8, X, INCXM1 + LFDUX f9, X, INCX + fabs f8, f8 + fabs f9, f9 + FADD f0, f0, f8 + FADD f1, f1, f9 + bdnz LL(160) + .align 4 + +LL(999): + FADD f0, f0, f1 + FADD f2, f2, f3 + FADD f4, f4, f5 + FADD f6, f6, f7 + + FADD f0, f0, f2 + FADD f4, f4, f6 + FADD f1, f0, f4 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/zaxpy.S b/kernel/power/zaxpy.S new file mode 100644 index 0000000000..7eb591d1bc --- /dev/null +++ b/kernel/power/zaxpy.S @@ -0,0 +1,683 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef linux +#ifndef __64BIT__ +#define N r3 +#define X r6 +#define INCX r7 +#define Y r8 +#define INCY r9 +#define INCXM1 r4 +#define INCYM1 r5 +#define PREA r10 +#define YY r11 +#else +#define N r3 +#define X r8 +#define INCX r9 +#define Y r10 +#define INCY r4 +#define INCXM1 r5 +#define INCYM1 r6 +#define PREA r7 +#define YY r11 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define N r3 +#define X r10 +#define INCX r4 +#define Y r5 +#define INCY r6 +#define INCXM1 r7 +#define INCYM1 r8 +#define PREA r9 +#define YY r11 +#else +#define N r3 +#define X r8 +#define INCX r9 +#define Y r10 +#define INCY r4 +#define INCXM1 r5 +#define INCYM1 r6 +#define PREA r7 +#define YY r11 +#endif +#endif + +#define ALPHA_R f24 +#define ALPHA_I f25 + +#ifndef CONJ +#define ADD1 FNMSUB +#define ADD2 FMADD +#else +#define ADD1 FMADD +#define ADD2 FNMSUB +#endif + +#ifndef NEEDPARAM + +#define STACKSIZE 96 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + +#if defined(linux) && defined(__64BIT__) + ld INCY, 112 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld INCY, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz INCX, 56 + STACKSIZE(SP) + lwz Y, 60 + STACKSIZE(SP) + lwz INCY, 64 + STACKSIZE(SP) +#else + lwz INCY, 56 + STACKSIZE(SP) +#endif +#endif +#endif + + fmr ALPHA_R, f1 + fmr ALPHA_I, f2 + + slwi INCX, INCX, ZBASE_SHIFT + slwi INCY, INCY, ZBASE_SHIFT + subi INCXM1, INCX, SIZE + subi INCYM1, INCY, SIZE + +#ifdef L1_DUALFETCH + li PREA, (L1_PREFETCHSIZE) / 2 +#else + li PREA, (L1_PREFETCHSIZE) +#endif + + cmpwi cr0, N, 0 + ble- LL(999) + + cmpwi cr0, INCX, 2 * 
SIZE + bne- cr0, LL(100) + cmpwi cr0, INCY, 2 * SIZE + bne- cr0, LL(100) + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- cr0, LL(50) + .align 4 + + LFD f0, 0 * SIZE(X) + LFD f1, 1 * SIZE(X) + LFD f2, 2 * SIZE(X) + LFD f3, 3 * SIZE(X) + + LFD f8, 0 * SIZE(Y) + LFD f9, 1 * SIZE(Y) + LFD f10, 2 * SIZE(Y) + LFD f11, 3 * SIZE(Y) + + LFD f4, 4 * SIZE(X) + LFD f5, 5 * SIZE(X) + LFD f6, 6 * SIZE(X) + LFD f7, 7 * SIZE(X) + + LFD f12, 4 * SIZE(Y) + LFD f13, 5 * SIZE(Y) + LFD f14, 6 * SIZE(Y) + LFD f15, 7 * SIZE(Y) + bdz LL(20) + .align 4 + +LL(10): + FMADD f16, ALPHA_R, f0, f8 + FMADD f17, ALPHA_I, f0, f9 + FMADD f18, ALPHA_R, f2, f10 + FMADD f19, ALPHA_I, f2, f11 + + ADD1 f16, ALPHA_I, f1, f16 + ADD2 f17, ALPHA_R, f1, f17 + ADD1 f18, ALPHA_I, f3, f18 + ADD2 f19, ALPHA_R, f3, f19 + + LFD f0, 8 * SIZE(X) + LFD f1, 9 * SIZE(X) + LFD f2, 10 * SIZE(X) + LFD f3, 11 * SIZE(X) + + LFD f8, 8 * SIZE(Y) + LFD f9, 9 * SIZE(Y) + LFD f10, 10 * SIZE(Y) + LFD f11, 11 * SIZE(Y) + + STFD f16, 0 * SIZE(Y) + STFD f17, 1 * SIZE(Y) + STFD f18, 2 * SIZE(Y) + STFD f19, 3 * SIZE(Y) + + FMADD f20, ALPHA_R, f4, f12 + FMADD f21, ALPHA_I, f4, f13 + FMADD f22, ALPHA_R, f6, f14 + FMADD f23, ALPHA_I, f6, f15 + + ADD1 f20, ALPHA_I, f5, f20 + ADD2 f21, ALPHA_R, f5, f21 + ADD1 f22, ALPHA_I, f7, f22 + ADD2 f23, ALPHA_R, f7, f23 + + LFD f4, 12 * SIZE(X) + LFD f5, 13 * SIZE(X) + LFD f6, 14 * SIZE(X) + LFD f7, 15 * SIZE(X) + + LFD f12, 12 * SIZE(Y) + LFD f13, 13 * SIZE(Y) + LFD f14, 14 * SIZE(Y) + LFD f15, 15 * SIZE(Y) + + STFD f20, 4 * SIZE(Y) + STFD f21, 5 * SIZE(Y) + STFD f22, 6 * SIZE(Y) + STFD f23, 7 * SIZE(Y) + + FMADD f16, ALPHA_R, f0, f8 + FMADD f17, ALPHA_I, f0, f9 + FMADD f18, ALPHA_R, f2, f10 + FMADD f19, ALPHA_I, f2, f11 + + ADD1 f16, ALPHA_I, f1, f16 + ADD2 f17, ALPHA_R, f1, f17 + ADD1 f18, ALPHA_I, f3, f18 + ADD2 f19, ALPHA_R, f3, f19 + + LFD f0, 16 * SIZE(X) + LFD f1, 17 * SIZE(X) + LFD f2, 18 * SIZE(X) + LFD f3, 19 * SIZE(X) + + LFD f8, 16 * SIZE(Y) + LFD f9, 17 * SIZE(Y) + LFD f10, 18 * SIZE(Y) + LFD f11, 19 * SIZE(Y) + + STFD f16, 8 * SIZE(Y) + STFD f17, 9 * SIZE(Y) + STFD f18, 10 * SIZE(Y) + STFD f19, 11 * SIZE(Y) + + FMADD f20, ALPHA_R, f4, f12 + FMADD f21, ALPHA_I, f4, f13 + FMADD f22, ALPHA_R, f6, f14 + FMADD f23, ALPHA_I, f6, f15 + + ADD1 f20, ALPHA_I, f5, f20 + ADD2 f21, ALPHA_R, f5, f21 + ADD1 f22, ALPHA_I, f7, f22 + ADD2 f23, ALPHA_R, f7, f23 + + LFD f4, 20 * SIZE(X) + LFD f5, 21 * SIZE(X) + LFD f6, 22 * SIZE(X) + LFD f7, 23 * SIZE(X) + + LFD f12, 20 * SIZE(Y) + LFD f13, 21 * SIZE(Y) + LFD f14, 22 * SIZE(Y) + LFD f15, 23 * SIZE(Y) + + STFD f20, 12 * SIZE(Y) + STFD f21, 13 * SIZE(Y) + STFD f22, 14 * SIZE(Y) + STFD f23, 15 * SIZE(Y) + +#ifndef POWER6 + dcbtst Y, PREA +#ifdef L1_DUALFETCH + dcbt X, PREA +#endif +#endif + addi X, X, 16 * SIZE + addi Y, Y, 16 * SIZE + +#ifdef POWER6 + dcbtst Y, PREA + L1_PREFETCH X, PREA +#endif + + bdnz LL(10) + .align 4 + +LL(20): + FMADD f16, ALPHA_R, f0, f8 + FMADD f17, ALPHA_I, f0, f9 + FMADD f18, ALPHA_R, f2, f10 + FMADD f19, ALPHA_I, f2, f11 + + ADD1 f16, ALPHA_I, f1, f16 + ADD2 f17, ALPHA_R, f1, f17 + ADD1 f18, ALPHA_I, f3, f18 + ADD2 f19, ALPHA_R, f3, f19 + + LFD f0, 8 * SIZE(X) + LFD f1, 9 * SIZE(X) + LFD f2, 10 * SIZE(X) + LFD f3, 11 * SIZE(X) + + LFD f8, 8 * SIZE(Y) + LFD f9, 9 * SIZE(Y) + LFD f10, 10 * SIZE(Y) + LFD f11, 11 * SIZE(Y) + + FMADD f20, ALPHA_R, f4, f12 + FMADD f21, ALPHA_I, f4, f13 + FMADD f22, ALPHA_R, f6, f14 + FMADD f23, ALPHA_I, f6, f15 + + ADD1 f20, ALPHA_I, f5, f20 + ADD2 f21, ALPHA_R, f5, f21 + ADD1 f22, ALPHA_I, f7, f22 + ADD2 f23, ALPHA_R, f7, f23 + + 
LFD f4, 12 * SIZE(X) + LFD f5, 13 * SIZE(X) + LFD f6, 14 * SIZE(X) + LFD f7, 15 * SIZE(X) + + LFD f12, 12 * SIZE(Y) + LFD f13, 13 * SIZE(Y) + LFD f14, 14 * SIZE(Y) + LFD f15, 15 * SIZE(Y) + + STFD f16, 0 * SIZE(Y) + STFD f17, 1 * SIZE(Y) + STFD f18, 2 * SIZE(Y) + STFD f19, 3 * SIZE(Y) + + FMADD f16, ALPHA_R, f0, f8 + FMADD f17, ALPHA_I, f0, f9 + FMADD f18, ALPHA_R, f2, f10 + FMADD f19, ALPHA_I, f2, f11 + + ADD1 f16, ALPHA_I, f1, f16 + ADD2 f17, ALPHA_R, f1, f17 + ADD1 f18, ALPHA_I, f3, f18 + ADD2 f19, ALPHA_R, f3, f19 + + STFD f20, 4 * SIZE(Y) + STFD f21, 5 * SIZE(Y) + STFD f22, 6 * SIZE(Y) + STFD f23, 7 * SIZE(Y) + + FMADD f20, ALPHA_R, f4, f12 + FMADD f21, ALPHA_I, f4, f13 + FMADD f22, ALPHA_R, f6, f14 + FMADD f23, ALPHA_I, f6, f15 + + ADD1 f20, ALPHA_I, f5, f20 + ADD2 f21, ALPHA_R, f5, f21 + ADD1 f22, ALPHA_I, f7, f22 + ADD2 f23, ALPHA_R, f7, f23 + + STFD f16, 8 * SIZE(Y) + STFD f17, 9 * SIZE(Y) + STFD f18, 10 * SIZE(Y) + STFD f19, 11 * SIZE(Y) + + STFD f20, 12 * SIZE(Y) + STFD f21, 13 * SIZE(Y) + STFD f22, 14 * SIZE(Y) + STFD f23, 15 * SIZE(Y) + + addi X, X, 16 * SIZE + addi Y, Y, 16 * SIZE + .align 4 + +LL(50): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f0, 0 * SIZE(X) + LFD f1, 1 * SIZE(X) + LFD f8, 0 * SIZE(Y) + LFD f9, 1 * SIZE(Y) + + FMADD f16, ALPHA_R, f0, f8 + FMADD f17, ALPHA_I, f0, f9 + + ADD1 f16, ALPHA_I, f1, f16 + ADD2 f17, ALPHA_R, f1, f17 + + STFD f16, 0 * SIZE(Y) + STFD f17, 1 * SIZE(Y) + addi X, X, 2 * SIZE + addi Y, Y, 2 * SIZE + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): + sub X, X, INCXM1 + sub Y, Y, INCYM1 + mr YY, Y + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(150) + .align 4 + + LFDX f0, X, INCXM1 + LFDUX f1, X, INCX + LFDX f2, X, INCXM1 + LFDUX f3, X, INCX + + LFDX f8, Y, INCYM1 + LFDUX f9, Y, INCY + LFDX f10, Y, INCYM1 + LFDUX f11, Y, INCY + + LFDX f4, X, INCXM1 + LFDUX f5, X, INCX + LFDX f6, X, INCXM1 + LFDUX f7, X, INCX + + LFDX f12, Y, INCYM1 + LFDUX f13, Y, INCY + LFDX f14, Y, INCYM1 + LFDUX f15, Y, INCY + bdz LL(120) + .align 4 + +LL(110): + FMADD f16, ALPHA_R, f0, f8 + FMADD f17, ALPHA_I, f0, f9 + FMADD f18, ALPHA_R, f2, f10 + FMADD f19, ALPHA_I, f2, f11 + + ADD1 f16, ALPHA_I, f1, f16 + ADD2 f17, ALPHA_R, f1, f17 + ADD1 f18, ALPHA_I, f3, f18 + ADD2 f19, ALPHA_R, f3, f19 + + LFDX f0, X, INCXM1 + LFDUX f1, X, INCX + LFDX f2, X, INCXM1 + LFDUX f3, X, INCX + + LFDX f8, Y, INCYM1 + LFDUX f9, Y, INCY + LFDX f10, Y, INCYM1 + LFDUX f11, Y, INCY + + FMADD f20, ALPHA_R, f4, f12 + FMADD f21, ALPHA_I, f4, f13 + FMADD f22, ALPHA_R, f6, f14 + FMADD f23, ALPHA_I, f6, f15 + + ADD1 f20, ALPHA_I, f5, f20 + ADD2 f21, ALPHA_R, f5, f21 + ADD1 f22, ALPHA_I, f7, f22 + ADD2 f23, ALPHA_R, f7, f23 + + LFDX f4, X, INCXM1 + LFDUX f5, X, INCX + LFDX f6, X, INCXM1 + LFDUX f7, X, INCX + + LFDX f12, Y, INCYM1 + LFDUX f13, Y, INCY + LFDX f14, Y, INCYM1 + LFDUX f15, Y, INCY + + STFDX f16, YY, INCYM1 + STFDUX f17, YY, INCY + STFDX f18, YY, INCYM1 + STFDUX f19, YY, INCY + + FMADD f16, ALPHA_R, f0, f8 + FMADD f17, ALPHA_I, f0, f9 + FMADD f18, ALPHA_R, f2, f10 + FMADD f19, ALPHA_I, f2, f11 + + ADD1 f16, ALPHA_I, f1, f16 + ADD2 f17, ALPHA_R, f1, f17 + ADD1 f18, ALPHA_I, f3, f18 + ADD2 f19, ALPHA_R, f3, f19 + + LFDX f0, X, INCXM1 + LFDUX f1, X, INCX + LFDX f2, X, INCXM1 + LFDUX f3, X, INCX + + LFDX f8, Y, INCYM1 + LFDUX f9, Y, INCY + LFDX f10, Y, INCYM1 + LFDUX f11, Y, INCY + + STFDX f20, YY, INCYM1 + STFDUX f21, YY, INCY + STFDX f22, YY, INCYM1 + STFDUX f23, YY, INCY + + FMADD f20, ALPHA_R, f4, f12 + FMADD f21, ALPHA_I, f4, f13 + FMADD f22, ALPHA_R, f6, f14 
+ FMADD f23, ALPHA_I, f6, f15 + + ADD1 f20, ALPHA_I, f5, f20 + ADD2 f21, ALPHA_R, f5, f21 + ADD1 f22, ALPHA_I, f7, f22 + ADD2 f23, ALPHA_R, f7, f23 + + LFDX f4, X, INCXM1 + LFDUX f5, X, INCX + LFDX f6, X, INCXM1 + LFDUX f7, X, INCX + + LFDX f12, Y, INCYM1 + LFDUX f13, Y, INCY + LFDX f14, Y, INCYM1 + LFDUX f15, Y, INCY + + STFDX f16, YY, INCYM1 + STFDUX f17, YY, INCY + STFDX f18, YY, INCYM1 + STFDUX f19, YY, INCY + + STFDX f20, YY, INCYM1 + STFDUX f21, YY, INCY + STFDX f22, YY, INCYM1 + STFDUX f23, YY, INCY + bdnz LL(110) + .align 4 + +LL(120): + FMADD f16, ALPHA_R, f0, f8 + FMADD f17, ALPHA_I, f0, f9 + FMADD f18, ALPHA_R, f2, f10 + FMADD f19, ALPHA_I, f2, f11 + + ADD1 f16, ALPHA_I, f1, f16 + ADD2 f17, ALPHA_R, f1, f17 + ADD1 f18, ALPHA_I, f3, f18 + ADD2 f19, ALPHA_R, f3, f19 + + LFDX f0, X, INCXM1 + LFDUX f1, X, INCX + LFDX f2, X, INCXM1 + LFDUX f3, X, INCX + + LFDX f8, Y, INCYM1 + LFDUX f9, Y, INCY + LFDX f10, Y, INCYM1 + LFDUX f11, Y, INCY + + FMADD f20, ALPHA_R, f4, f12 + FMADD f21, ALPHA_I, f4, f13 + FMADD f22, ALPHA_R, f6, f14 + FMADD f23, ALPHA_I, f6, f15 + + ADD1 f20, ALPHA_I, f5, f20 + ADD2 f21, ALPHA_R, f5, f21 + ADD1 f22, ALPHA_I, f7, f22 + ADD2 f23, ALPHA_R, f7, f23 + + LFDX f4, X, INCXM1 + LFDUX f5, X, INCX + LFDX f6, X, INCXM1 + LFDUX f7, X, INCX + + LFDX f12, Y, INCYM1 + LFDUX f13, Y, INCY + LFDX f14, Y, INCYM1 + LFDUX f15, Y, INCY + + STFDX f16, YY, INCYM1 + STFDUX f17, YY, INCY + STFDX f18, YY, INCYM1 + STFDUX f19, YY, INCY + + FMADD f16, ALPHA_R, f0, f8 + FMADD f17, ALPHA_I, f0, f9 + FMADD f18, ALPHA_R, f2, f10 + FMADD f19, ALPHA_I, f2, f11 + + ADD1 f16, ALPHA_I, f1, f16 + ADD2 f17, ALPHA_R, f1, f17 + ADD1 f18, ALPHA_I, f3, f18 + ADD2 f19, ALPHA_R, f3, f19 + + STFDX f20, YY, INCYM1 + STFDUX f21, YY, INCY + STFDX f22, YY, INCYM1 + STFDUX f23, YY, INCY + + FMADD f20, ALPHA_R, f4, f12 + FMADD f21, ALPHA_I, f4, f13 + FMADD f22, ALPHA_R, f6, f14 + FMADD f23, ALPHA_I, f6, f15 + + ADD1 f20, ALPHA_I, f5, f20 + ADD2 f21, ALPHA_R, f5, f21 + ADD1 f22, ALPHA_I, f7, f22 + ADD2 f23, ALPHA_R, f7, f23 + + STFDX f16, YY, INCYM1 + STFDUX f17, YY, INCY + STFDX f18, YY, INCYM1 + STFDUX f19, YY, INCY + + STFDX f20, YY, INCYM1 + STFDUX f21, YY, INCY + STFDX f22, YY, INCYM1 + STFDUX f23, YY, INCY + .align 4 + +LL(150): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDX f0, X, INCXM1 + LFDUX f1, X, INCX + LFDX f8, Y, INCYM1 + LFDUX f9, Y, INCY + + FMADD f16, ALPHA_R, f0, f8 + FMADD f17, ALPHA_I, f0, f9 + + ADD1 f16, ALPHA_I, f1, f16 + ADD2 f17, ALPHA_R, f1, f17 + + STFDX f16, YY, INCYM1 + STFDUX f17, YY, INCY + bdnz LL(160) + .align 4 + +LL(999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE + +#endif diff --git a/kernel/power/zaxpy_hummer.S b/kernel/power/zaxpy_hummer.S new file mode 100644 index 0000000000..41b34954ed --- /dev/null +++ b/kernel/power/zaxpy_hummer.S @@ -0,0 +1,503 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r6 +#define INCX r7 +#define Y r8 +#define INCY r9 + +#define YY r4 +#define INCX2 r5 +#define INCY2 r10 +#define X1 r11 +#define Y1 INCX +#define YY1 INCY + +#define ALPHA f1 + +#define A1 f0 +#define A2 f8 +#define A3 f2 +#define A4 f3 +#define A5 f4 +#define A6 f5 +#define A7 f6 +#define A8 f7 +#define A9 f25 + +#define B1 f9 +#define B2 f10 +#define B3 f11 +#define B4 f12 +#define B5 f13 +#define B6 f14 +#define B7 f15 +#define B8 f16 + +#define C1 f17 +#define C2 f18 +#define C3 f19 +#define C4 f20 +#define C5 f21 +#define C6 f22 +#define C7 f23 +#define C8 f24 + +#define ALPHA_R ALPHA +#define ALPHA_I A9 + +#ifndef CONJ +#define ADD1 FNMSUB +#define ADD2 FMADD +#else +#define ADD1 FMADD +#define ADD2 FNMSUB +#endif + +#ifndef CONJ +#define FXMADD1 fxcpmadd +#define FXMADD2 fxcxnpma +#else +#define FXMADD1 fxcpnsma +#define FXMADD2 fxcxma +#endif + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + stfpdux f16, SP, r10 + stfpdux f17, SP, r10 + + stfpdux f18, SP, r10 + stfpdux f19, SP, r10 + stfpdux f20, SP, r10 + stfpdux f21, SP, r10 + + stfpdux f22, SP, r10 + stfpdux f23, SP, r10 + stfpdux f24, SP, r10 + stfpdux f25, SP, r10 + + fsmfp ALPHA, f2 + + slwi INCX, INCX, BASE_SHIFT + slwi INCY, INCY, BASE_SHIFT + + add INCX2, INCX, INCX + add INCY2, INCY, INCY + + cmpwi cr0, N, 0 + ble LL(999) + + andi. r0, X, 2 * SIZE - 1 + bne LL(100) + andi. r0, Y, 2 * SIZE - 1 + bne LL(100) + + sub X, X, INCX2 + sub Y, Y, INCY2 + mr YY, Y + + srawi. 
r0, N, 3 + mtspr CTR, r0 + beq- LL(15) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + LFPDUX A2, X, INCX2 + LFPDUX B2, Y, INCY2 + LFPDUX A3, X, INCX2 + LFPDUX B3, Y, INCY2 + LFPDUX A4, X, INCX2 + LFPDUX B4, Y, INCY2 + + LFPDUX A5, X, INCX2 + LFPDUX B5, Y, INCY2 + LFPDUX A6, X, INCX2 + LFPDUX B6, Y, INCY2 + LFPDUX A7, X, INCX2 + LFPDUX B7, Y, INCY2 + LFPDUX A8, X, INCX2 + LFPDUX B8, Y, INCY2 + + bdz LL(13) + .align 4 + +LL(12): + FXMADD1 C1, ALPHA, A1, B1 + LFPDUX B1, Y, INCY2 + FXMADD1 C2, ALPHA, A2, B2 + LFPDUX B2, Y, INCY2 + FXMADD1 C3, ALPHA, A3, B3 + LFPDUX B3, Y, INCY2 + FXMADD1 C4, ALPHA, A4, B4 + LFPDUX B4, Y, INCY2 + + FXMADD1 C5, ALPHA, A5, B5 + LFPDUX B5, Y, INCY2 + FXMADD1 C6, ALPHA, A6, B6 + LFPDUX B6, Y, INCY2 + FXMADD1 C7, ALPHA, A7, B7 + LFPDUX B7, Y, INCY2 + FXMADD1 C8, ALPHA, A8, B8 + LFPDUX B8, Y, INCY2 + + FXMADD2 C1, ALPHA, A1, C1 + LFPDUX A1, X, INCX2 + FXMADD2 C2, ALPHA, A2, C2 + LFPDUX A2, X, INCX2 + FXMADD2 C3, ALPHA, A3, C3 + LFPDUX A3, X, INCX2 + FXMADD2 C4, ALPHA, A4, C4 + LFPDUX A4, X, INCX2 + + FXMADD2 C5, ALPHA, A5, C5 + LFPDUX A5, X, INCX2 + FXMADD2 C6, ALPHA, A6, C6 + LFPDUX A6, X, INCX2 + FXMADD2 C7, ALPHA, A7, C7 + LFPDUX A7, X, INCX2 + FXMADD2 C8, ALPHA, A8, C8 + LFPDUX A8, X, INCX2 + + STFPDUX C1, YY, INCY2 + STFPDUX C2, YY, INCY2 + STFPDUX C3, YY, INCY2 + STFPDUX C4, YY, INCY2 + + STFPDUX C5, YY, INCY2 + STFPDUX C6, YY, INCY2 + STFPDUX C7, YY, INCY2 + STFPDUX C8, YY, INCY2 + bdnz LL(12) + .align 4 + +LL(13): + FXMADD1 C1, ALPHA, A1, B1 + FXMADD1 C2, ALPHA, A2, B2 + FXMADD1 C3, ALPHA, A3, B3 + FXMADD1 C4, ALPHA, A4, B4 + + FXMADD1 C5, ALPHA, A5, B5 + FXMADD1 C6, ALPHA, A6, B6 + FXMADD1 C7, ALPHA, A7, B7 + FXMADD1 C8, ALPHA, A8, B8 + + FXMADD2 C1, ALPHA, A1, C1 + FXMADD2 C2, ALPHA, A2, C2 + FXMADD2 C3, ALPHA, A3, C3 + FXMADD2 C4, ALPHA, A4, C4 + + FXMADD2 C5, ALPHA, A5, C5 + FXMADD2 C6, ALPHA, A6, C6 + STFPDUX C1, YY, INCY2 + FXMADD2 C7, ALPHA, A7, C7 + STFPDUX C2, YY, INCY2 + FXMADD2 C8, ALPHA, A8, C8 + STFPDUX C3, YY, INCY2 + STFPDUX C4, YY, INCY2 + + STFPDUX C5, YY, INCY2 + STFPDUX C6, YY, INCY2 + STFPDUX C7, YY, INCY2 + STFPDUX C8, YY, INCY2 + .align 4 + +LL(15): + andi. r0, N, 7 + beq LL(999) + + andi. r0, N, 4 + beq LL(16) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + LFPDUX A2, X, INCX2 + LFPDUX B2, Y, INCY2 + LFPDUX A3, X, INCX2 + LFPDUX B3, Y, INCY2 + LFPDUX A4, X, INCX2 + LFPDUX B4, Y, INCY2 + + FXMADD1 C1, ALPHA, A1, B1 + FXMADD1 C2, ALPHA, A2, B2 + FXMADD1 C3, ALPHA, A3, B3 + FXMADD1 C4, ALPHA, A4, B4 + + FXMADD2 C1, ALPHA, A1, C1 + FXMADD2 C2, ALPHA, A2, C2 + FXMADD2 C3, ALPHA, A3, C3 + FXMADD2 C4, ALPHA, A4, C4 + + STFPDUX C1, YY, INCY2 + STFPDUX C2, YY, INCY2 + STFPDUX C3, YY, INCY2 + STFPDUX C4, YY, INCY2 + .align 4 + +LL(16): + andi. r0, N, 2 + beq LL(17) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + LFPDUX A2, X, INCX2 + LFPDUX B2, Y, INCY2 + + FXMADD1 C1, ALPHA, A1, B1 + FXMADD1 C2, ALPHA, A2, B2 + FXMADD2 C1, ALPHA, A1, C1 + FXMADD2 C2, ALPHA, A2, C2 + + STFPDUX C1, YY, INCY2 + STFPDUX C2, YY, INCY2 + .align 4 + +LL(17): + andi. r0, N, 1 + beq LL(999) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + + FXMADD1 C1, ALPHA, A1, B1 + FXMADD2 C1, ALPHA, A1, C1 + + STFPDUX C1, YY, INCY2 + b LL(999) + .align 4 + +LL(100): + fsmtp ALPHA_I, ALPHA_R + + sub X, X, INCX2 + sub Y, Y, INCY2 + + addi X1, X, SIZE + addi Y1, Y, SIZE + + mr YY, Y + mr YY1, Y1 + + srawi. 
r0, N, 2 + mtspr CTR, r0 + beq- LL(115) + + LFDUX A1, X, INCX2 + LFDUX A2, X1, INCX2 + LFDUX B1, Y, INCY2 + LFDUX B2, Y1, INCY2 + + LFDUX A3, X, INCX2 + LFDUX A4, X1, INCX2 + LFDUX B3, Y, INCY2 + LFDUX B4, Y1, INCY2 + + LFDUX A5, X, INCX2 + LFDUX A6, X1, INCX2 + LFDUX B5, Y, INCY2 + LFDUX B6, Y1, INCY2 + + LFDUX A7, X, INCX2 + LFDUX A8, X1, INCX2 + LFDUX B7, Y, INCY2 + LFDUX B8, Y1, INCY2 + bdz LL(113) + .align 4 + +LL(112): + FMADD C1, ALPHA_R, A1, B1 + LFDUX B1, Y, INCY2 + FMADD C2, ALPHA_I, A1, B2 + LFDUX A1, X, INCX2 + FMADD C3, ALPHA_R, A3, B3 + LFDUX B3, Y, INCY2 + FMADD C4, ALPHA_I, A3, B4 + LFDUX A3, X, INCX2 + + FMADD C5, ALPHA_R, A5, B5 + LFDUX B5, Y, INCY2 + FMADD C6, ALPHA_I, A5, B6 + LFDUX A5, X, INCX2 + FMADD C7, ALPHA_R, A7, B7 + LFDUX B7, Y, INCY2 + FMADD C8, ALPHA_I, A7, B8 + LFDUX A7, X, INCX2 + + ADD1 C1, ALPHA_I, A2, C1 + LFDUX B2, Y1, INCY2 + ADD2 C2, ALPHA_R, A2, C2 + LFDUX A2, X1, INCX2 + ADD1 C3, ALPHA_I, A4, C3 + LFDUX B4, Y1, INCY2 + ADD2 C4, ALPHA_R, A4, C4 + LFDUX A4, X1, INCX2 + + ADD1 C5, ALPHA_I, A6, C5 + LFDUX B6, Y1, INCY2 + ADD2 C6, ALPHA_R, A6, C6 + LFDUX A6, X1, INCX2 + ADD1 C7, ALPHA_I, A8, C7 + LFDUX B8, Y1, INCY2 + ADD2 C8, ALPHA_R, A8, C8 + LFDUX A8, X1, INCX2 + + STFDUX C1, YY, INCY2 + STFDUX C2, YY1, INCY2 + STFDUX C3, YY, INCY2 + STFDUX C4, YY1, INCY2 + + STFDUX C5, YY, INCY2 + STFDUX C6, YY1, INCY2 + STFDUX C7, YY, INCY2 + STFDUX C8, YY1, INCY2 + bdnz LL(112) + .align 4 + +LL(113): + FMADD C1, ALPHA_R, A1, B1 + FMADD C2, ALPHA_I, A1, B2 + FMADD C3, ALPHA_R, A3, B3 + FMADD C4, ALPHA_I, A3, B4 + + FMADD C5, ALPHA_R, A5, B5 + FMADD C6, ALPHA_I, A5, B6 + FMADD C7, ALPHA_R, A7, B7 + FMADD C8, ALPHA_I, A7, B8 + + ADD1 C1, ALPHA_I, A2, C1 + ADD2 C2, ALPHA_R, A2, C2 + ADD1 C3, ALPHA_I, A4, C3 + ADD2 C4, ALPHA_R, A4, C4 + + ADD1 C5, ALPHA_I, A6, C5 + ADD2 C6, ALPHA_R, A6, C6 + STFDUX C1, YY, INCY2 + ADD1 C7, ALPHA_I, A8, C7 + STFDUX C2, YY1, INCY2 + ADD2 C8, ALPHA_R, A8, C8 + STFDUX C3, YY, INCY2 + STFDUX C4, YY1, INCY2 + + STFDUX C5, YY, INCY2 + STFDUX C6, YY1, INCY2 + STFDUX C7, YY, INCY2 + STFDUX C8, YY1, INCY2 + .align 4 + +LL(115): + andi. r0, N, 3 + beq LL(999) + + andi. r0, N, 2 + beq LL(117) + + LFDUX A1, X, INCX2 + LFDUX A2, X1, INCX2 + LFDUX B1, Y, INCY2 + LFDUX B2, Y1, INCY2 + + LFDUX A3, X, INCX2 + FMADD C1, ALPHA_R, A1, B1 + LFDUX A4, X1, INCX2 + FMADD C2, ALPHA_I, A1, B2 + LFDUX B3, Y, INCY2 + FMADD C3, ALPHA_R, A3, B3 + LFDUX B4, Y1, INCY2 + FMADD C4, ALPHA_I, A3, B4 + + ADD1 C1, ALPHA_I, A2, C1 + ADD2 C2, ALPHA_R, A2, C2 + STFDUX C1, YY, INCY2 + ADD1 C3, ALPHA_I, A4, C3 + STFDUX C2, YY1, INCY2 + ADD2 C4, ALPHA_R, A4, C4 + STFDUX C3, YY, INCY2 + STFDUX C4, YY1, INCY2 + .align 4 + +LL(117): + andi. 
r0, N, 1 + beq LL(999) + + LFDUX A1, X, INCX2 + LFDUX A2, X1, INCX2 + LFDUX B1, Y, INCY2 + LFDUX B2, Y1, INCY2 + + FMADD C1, ALPHA_R, A1, B1 + FMADD C2, ALPHA_I, A1, B2 + + ADD1 C1, ALPHA_I, A2, C1 + ADD2 C2, ALPHA_R, A2, C2 + + STFDUX C1, YY, INCY2 + STFDUX C2, YY1, INCY2 + .align 4 + +LL(999): + li r10, 16 + subi SP, SP, 16 + + lfpdux f25, SP, r10 + lfpdux f24, SP, r10 + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + + lfpdux f21, SP, r10 + lfpdux f20, SP, r10 + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + + addi SP, SP, 16 + blr + + EPILOGUE diff --git a/kernel/power/zaxpy_ppc440.S b/kernel/power/zaxpy_ppc440.S new file mode 100644 index 0000000000..5100e94423 --- /dev/null +++ b/kernel/power/zaxpy_ppc440.S @@ -0,0 +1,413 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef linux +#ifndef __64BIT__ +#define N r3 +#define X r6 +#define INCX r7 +#define Y r8 +#define INCY r9 +#define YY r4 +#define PRE r5 +#else +#define N r3 +#define X r8 +#define INCX r9 +#define Y r5 +#define INCY r4 +#define YY r6 +#define PRE r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define N r3 +#define X r10 +#define INCX r4 +#define Y r5 +#define INCY r6 +#define YY r7 +#define PRE r8 +#else +#define N r3 +#define X r8 +#define INCX r9 +#define Y r10 +#define INCY r4 +#define YY r5 +#define PRE r6 +#endif +#endif + +#define ALPHA_R f24 +#define ALPHA_I f25 + +#ifndef CONJ +#define ADD1 FNMSUB +#define ADD2 FMADD +#else +#define ADD1 FMADD +#define ADD2 FNMSUB +#endif + +#define STACKSIZE 96 + + PROLOGUE + PROFCODE + + subi SP, SP, STACKSIZE + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + +#if defined(linux) && defined(__64BIT__) + ld INCY, 112 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld INCY, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz INCX, 56 + STACKSIZE(SP) + lwz Y, 60 + STACKSIZE(SP) + lwz INCY, 64 + STACKSIZE(SP) +#else + lwz INCY, 56 + STACKSIZE(SP) +#endif +#endif +#endif + + fmr ALPHA_R, f1 + slwi INCX, INCX, ZBASE_SHIFT + fmr ALPHA_I, f2 + slwi INCY, INCY, ZBASE_SHIFT + + subi INCX, INCX, SIZE + subi INCY, INCY, SIZE + + li PRE, 2 * 16 * SIZE + + cmpwi cr0, N, 0 + ble- LL(999) + + sub X, X, INCX + sub Y, Y, INCY + mr YY, Y + + srawi. 
r0, N, 3 + mtspr CTR, r0 + ble- LL(150) + .align 4 + + LFDUX f0, X, INCX + LFDU f1, 1 * SIZE(X) + LFDUX f2, X, INCX + LFDU f3, 1 * SIZE(X) + + LFDUX f8, Y, INCY + LFDU f9, 1 * SIZE(Y) + LFDUX f10, Y, INCY + LFDU f11, 1 * SIZE(Y) + + LFDUX f4, X, INCX + LFDU f5, 1 * SIZE(X) + LFDUX f6, X, INCX + LFDU f7, 1 * SIZE(X) + + LFDUX f12, Y, INCY + LFDU f13, 1 * SIZE(Y) + LFDUX f14, Y, INCY + LFDU f15, 1 * SIZE(Y) + bdz LL(120) + .align 4 + +LL(110): + FMADD f16, ALPHA_R, f0, f8 + LFDUX f8, Y, INCY + FMADD f17, ALPHA_I, f0, f9 + LFDU f9, 1 * SIZE(Y) + FMADD f18, ALPHA_R, f2, f10 + LFDUX f10, Y, INCY + FMADD f19, ALPHA_I, f2, f11 + LFDU f11, 1 * SIZE(Y) +#ifdef PPCG4 + dcbt X, PRE +#endif + + ADD1 f16, ALPHA_I, f1, f16 + LFDUX f0, X, INCX + ADD2 f17, ALPHA_R, f1, f17 + LFDU f1, 1 * SIZE(X) + ADD1 f18, ALPHA_I, f3, f18 + LFDUX f2, X, INCX + ADD2 f19, ALPHA_R, f3, f19 + LFDU f3, 1 * SIZE(X) +#ifdef PPCG4 + dcbtst Y, PRE +#endif + + FMADD f20, ALPHA_R, f4, f12 + LFDUX f12, Y, INCY + FMADD f21, ALPHA_I, f4, f13 + LFDU f13, 1 * SIZE(Y) + FMADD f22, ALPHA_R, f6, f14 + LFDUX f14, Y, INCY + FMADD f23, ALPHA_I, f6, f15 + LFDU f15, 1 * SIZE(Y) +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PRE +#endif + + ADD1 f20, ALPHA_I, f5, f20 + LFDUX f4, X, INCX + ADD2 f21, ALPHA_R, f5, f21 + LFDU f5, 1 * SIZE(X) + ADD1 f22, ALPHA_I, f7, f22 + LFDUX f6, X, INCX + ADD2 f23, ALPHA_R, f7, f23 + LFDU f7, 1 * SIZE(X) +#if defined(PPCG4) && defined(DOUBLE) + dcbtst Y, PRE +#endif + + STFDUX f16, YY, INCY + STFDU f17, 1 * SIZE(YY) + STFDUX f18, YY, INCY + STFDU f19, 1 * SIZE(YY) + + FMADD f16, ALPHA_R, f0, f8 + LFDUX f8, Y, INCY + FMADD f17, ALPHA_I, f0, f9 + LFDU f9, 1 * SIZE(Y) + FMADD f18, ALPHA_R, f2, f10 + LFDUX f10, Y, INCY + FMADD f19, ALPHA_I, f2, f11 + LFDU f11, 1 * SIZE(Y) +#ifdef PPCG4 + dcbt X, PRE +#endif + + ADD1 f16, ALPHA_I, f1, f16 + LFDUX f0, X, INCX + ADD2 f17, ALPHA_R, f1, f17 + LFDU f1, 1 * SIZE(X) + ADD1 f18, ALPHA_I, f3, f18 + LFDUX f2, X, INCX + ADD2 f19, ALPHA_R, f3, f19 + LFDU f3, 1 * SIZE(X) +#ifdef PPCG4 + dcbtst Y, PRE +#endif + + STFDUX f20, YY, INCY + STFDU f21, 1 * SIZE(YY) + STFDUX f22, YY, INCY + STFDU f23, 1 * SIZE(YY) + + FMADD f20, ALPHA_R, f4, f12 + LFDUX f12, Y, INCY + FMADD f21, ALPHA_I, f4, f13 + LFDU f13, 1 * SIZE(Y) + FMADD f22, ALPHA_R, f6, f14 + LFDUX f14, Y, INCY + FMADD f23, ALPHA_I, f6, f15 + LFDU f15, 1 * SIZE(Y) +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PRE +#endif + + ADD1 f20, ALPHA_I, f5, f20 + LFDUX f4, X, INCX + ADD2 f21, ALPHA_R, f5, f21 + LFDU f5, 1 * SIZE(X) + ADD1 f22, ALPHA_I, f7, f22 + LFDUX f6, X, INCX + ADD2 f23, ALPHA_R, f7, f23 + LFDU f7, 1 * SIZE(X) +#if defined(PPCG4) && defined(DOUBLE) + dcbtst Y, PRE +#endif + + STFDUX f16, YY, INCY + STFDU f17, 1 * SIZE(YY) + STFDUX f18, YY, INCY + STFDU f19, 1 * SIZE(YY) + + STFDUX f20, YY, INCY + STFDU f21, 1 * SIZE(YY) + STFDUX f22, YY, INCY + STFDU f23, 1 * SIZE(YY) + bdnz LL(110) + .align 4 + +LL(120): + FMADD f16, ALPHA_R, f0, f8 + LFDUX f8, Y, INCY + FMADD f17, ALPHA_I, f0, f9 + LFDU f9, 1 * SIZE(Y) + FMADD f18, ALPHA_R, f2, f10 + LFDUX f10, Y, INCY + FMADD f19, ALPHA_I, f2, f11 + LFDU f11, 1 * SIZE(Y) + + ADD1 f16, ALPHA_I, f1, f16 + LFDUX f0, X, INCX + ADD2 f17, ALPHA_R, f1, f17 + LFDU f1, 1 * SIZE(X) + ADD1 f18, ALPHA_I, f3, f18 + LFDUX f2, X, INCX + ADD2 f19, ALPHA_R, f3, f19 + LFDU f3, 1 * SIZE(X) + + FMADD f20, ALPHA_R, f4, f12 + LFDUX f12, Y, INCY + FMADD f21, ALPHA_I, f4, f13 + LFDU f13, 1 * SIZE(Y) + FMADD f22, ALPHA_R, f6, f14 + LFDUX f14, Y, INCY + FMADD f23, ALPHA_I, f6, f15 + LFDU f15, 1 * 
SIZE(Y) + + ADD1 f20, ALPHA_I, f5, f20 + LFDUX f4, X, INCX + ADD2 f21, ALPHA_R, f5, f21 + LFDU f5, 1 * SIZE(X) + ADD1 f22, ALPHA_I, f7, f22 + LFDUX f6, X, INCX + ADD2 f23, ALPHA_R, f7, f23 + LFDU f7, 1 * SIZE(X) + + STFDUX f16, YY, INCY + FMADD f16, ALPHA_R, f0, f8 + STFDU f17, 1 * SIZE(YY) + FMADD f17, ALPHA_I, f0, f9 + STFDUX f18, YY, INCY + FMADD f18, ALPHA_R, f2, f10 + STFDU f19, 1 * SIZE(YY) + FMADD f19, ALPHA_I, f2, f11 + + ADD1 f16, ALPHA_I, f1, f16 + ADD2 f17, ALPHA_R, f1, f17 + ADD1 f18, ALPHA_I, f3, f18 + ADD2 f19, ALPHA_R, f3, f19 + + STFDUX f20, YY, INCY + FMADD f20, ALPHA_R, f4, f12 + STFDU f21, 1 * SIZE(YY) + FMADD f21, ALPHA_I, f4, f13 + STFDUX f22, YY, INCY + FMADD f22, ALPHA_R, f6, f14 + STFDU f23, 1 * SIZE(YY) + FMADD f23, ALPHA_I, f6, f15 + + ADD1 f20, ALPHA_I, f5, f20 + STFDUX f16, YY, INCY + ADD2 f21, ALPHA_R, f5, f21 + STFDU f17, 1 * SIZE(YY) + ADD1 f22, ALPHA_I, f7, f22 + STFDUX f18, YY, INCY + ADD2 f23, ALPHA_R, f7, f23 + STFDU f19, 1 * SIZE(YY) + + STFDUX f20, YY, INCY + STFDU f21, 1 * SIZE(YY) + STFDUX f22, YY, INCY + STFDU f23, 1 * SIZE(YY) + .align 4 + +LL(150): + andi. r0, N, 7 + mtspr CTR, r0 + ble LL(999) + .align 4 + +LL(160): + LFDUX f0, X, INCX + LFDU f1, 1 * SIZE(X) + LFDUX f8, Y, INCY + LFDU f9, 1 * SIZE(Y) + + FMADD f16, ALPHA_R, f0, f8 + FMADD f17, ALPHA_I, f0, f9 + + ADD1 f16, ALPHA_I, f1, f16 + ADD2 f17, ALPHA_R, f1, f17 + + STFDUX f16, YY, INCY + STFDU f17, 1 * SIZE(YY) + bdnz LL(160) + .align 4 + +LL(999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + addi SP, SP, STACKSIZE + li r0, 0 + blr + EPILOGUE diff --git a/kernel/power/zcopy.S b/kernel/power/zcopy.S new file mode 100644 index 0000000000..f5ed2f99d8 --- /dev/null +++ b/kernel/power/zcopy.S @@ -0,0 +1,237 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
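The zcopy.S kernel that this license header introduces performs a plain complex copy: the unit-stride path LL(10) moves eight complex (sixteen double) elements per iteration with dcbt/dcbtst prefetching, LL(60) mops up the remainder, and LL(100)/LL(110) handle general strides with indexed loads and stores. Functionally it reduces to the sketch below (illustrative names, strides counted in complex elements):

/* Reference behaviour of the zcopy kernels (sketch only). */
static void zcopy_ref(long n, const double *x, long incx,
                      double *y, long incy)
{
    for (long i = 0; i < n; i++) {
        y[2 * i * incy]     = x[2 * i * incx];      /* real part      */
        y[2 * i * incy + 1] = x[2 * i * incx + 1];  /* imaginary part */
    }
}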
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 +#define Y r6 +#define INCY r7 +#define PREA r8 + +#define INCXM1 r9 +#define INCYM1 r10 + +#define STACKSIZE 16 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + + slwi INCX, INCX, ZBASE_SHIFT + slwi INCY, INCY, ZBASE_SHIFT + + subi INCXM1, INCX, SIZE + subi INCYM1, INCY, SIZE + +#ifdef L1_DUALFETCH + li PREA, (L1_PREFETCHSIZE) / 2 +#else + li PREA, (L1_PREFETCHSIZE) +#endif + + cmpwi cr0, N, 0 + ble- LL(999) + + cmpwi cr0, INCX, 2 * SIZE + bne- cr0, LL(100) + cmpwi cr0, INCY, 2 * SIZE + bne- cr0, LL(100) + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- cr0, LL(50) + .align 4 + +LL(10): + LFD f0, 0 * SIZE(X) + LFD f1, 1 * SIZE(X) + LFD f2, 2 * SIZE(X) + LFD f3, 3 * SIZE(X) + + STFD f0, 0 * SIZE(Y) + STFD f1, 1 * SIZE(Y) + STFD f2, 2 * SIZE(Y) + STFD f3, 3 * SIZE(Y) + + LFD f4, 4 * SIZE(X) + LFD f5, 5 * SIZE(X) + LFD f6, 6 * SIZE(X) + LFD f7, 7 * SIZE(X) + + STFD f4, 4 * SIZE(Y) + STFD f5, 5 * SIZE(Y) + STFD f6, 6 * SIZE(Y) + STFD f7, 7 * SIZE(Y) + + LFD f8, 8 * SIZE(X) + LFD f9, 9 * SIZE(X) + LFD f10, 10 * SIZE(X) + LFD f11, 11 * SIZE(X) + + STFD f8, 8 * SIZE(Y) + STFD f9, 9 * SIZE(Y) + STFD f10, 10 * SIZE(Y) + STFD f11, 11 * SIZE(Y) + + LFD f12, 12 * SIZE(X) + LFD f13, 13 * SIZE(X) + LFD f14, 14 * SIZE(X) + LFD f15, 15 * SIZE(X) + + STFD f12, 12 * SIZE(Y) + STFD f13, 13 * SIZE(Y) + STFD f14, 14 * SIZE(Y) + STFD f15, 15 * SIZE(Y) + +#ifndef POWER6 + dcbtst Y, PREA +#ifdef L1_DUALFETCH + dcbt X, PREA +#endif +#endif + addi X, X, 16 * SIZE + addi Y, Y, 16 * SIZE + +#ifdef POWER6 + dcbtst Y, PREA + L1_PREFETCH X, PREA +#endif + + bdnz LL(10) + .align 4 + +LL(50): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + addi X, X, 2 * SIZE + + STFD f8, 0 * SIZE(Y) + STFD f9, 1 * SIZE(Y) + addi Y, Y, 2 * SIZE + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): + sub X, X, INCXM1 + sub Y, Y, INCYM1 + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(150) + .align 4 + +LL(110): + LFDX f0, X, INCXM1 + LFDUX f1, X, INCX + LFDX f2, X, INCXM1 + LFDUX f3, X, INCX + + LFDX f4, X, INCXM1 + LFDUX f5, X, INCX + LFDX f6, X, INCXM1 + LFDUX f7, X, INCX + + LFDX f8, X, INCXM1 + LFDUX f9, X, INCX + LFDX f10, X, INCXM1 + LFDUX f11, X, INCX + + LFDX f12, X, INCXM1 + LFDUX f13, X, INCX + LFDX f14, X, INCXM1 + LFDUX f15, X, INCX + + STFDX f0, Y, INCYM1 + STFDUX f1, Y, INCY + STFDX f2, Y, INCYM1 + STFDUX f3, Y, INCY + + STFDX f4, Y, INCYM1 + STFDUX f5, Y, INCY + STFDX f6, Y, INCYM1 + STFDUX f7, Y, INCY + + STFDX f8, Y, INCYM1 + STFDUX f9, Y, INCY + STFDX f10, Y, INCYM1 + STFDUX f11, Y, INCY + + STFDX f12, Y, INCYM1 + STFDUX f13, Y, INCY + STFDX f14, Y, INCYM1 + STFDUX f15, Y, INCY + bdnz LL(110) + .align 4 + +LL(150): + andi. 
r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDX f8, X, INCXM1 + LFDUX f9, X, INCX + + STFDX f8, Y, INCYM1 + STFDUX f9, Y, INCY + bdnz LL(160) + .align 4 + +LL(999): + lfd f14, 0(SP) + lfd f15, 8(SP) + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/zcopy_hummer.S b/kernel/power/zcopy_hummer.S new file mode 100644 index 0000000000..825b440167 --- /dev/null +++ b/kernel/power/zcopy_hummer.S @@ -0,0 +1,652 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 +#define Y r6 +#define INCY r7 + +#define INCX2 r8 +#define INCY2 r9 +#define X2 r10 +#define Y2 r11 + +#define A1 f0 +#define A2 f1 +#define A3 f2 +#define A4 f3 +#define A5 f4 +#define A6 f5 +#define A7 f6 +#define A8 f7 +#define A9 f8 + +#define T1 f9 +#define T2 f10 +#define T3 f11 +#define T4 f12 +#define T5 f13 +#define T6 f14 +#define T7 f15 + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + + slwi INCX, INCX, BASE_SHIFT + slwi INCY, INCY, BASE_SHIFT + add INCX2, INCX, INCX + add INCY2, INCY, INCY + + cmpwi cr0, N, 0 + ble LL(999) + + sub X, X, INCX2 + sub Y, Y, INCY2 + + cmpwi cr0, INCX, SIZE + bne LL(100) + cmpwi cr0, INCY, SIZE + bne LL(100) + + andi. r0, X, 2 * SIZE - 1 + bne LL(30) + andi. r0, Y, 2 * SIZE - 1 + bne LL(20) + .align 4 + +LL(10): /* X ): aligned Y ): aligned */ + srawi. 
r0, N, 3 + mtspr CTR, r0 + beq- LL(15) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + LFPDUX A5, X, INCX2 + LFPDUX A6, X, INCX2 + LFPDUX A7, X, INCX2 + LFPDUX A8, X, INCX2 + bdz LL(13) + .align 4 + +LL(12): + STFPDUX A1, Y, INCY2 + LFPDUX A1, X, INCX2 + STFPDUX A2, Y, INCY2 + LFPDUX A2, X, INCX2 + STFPDUX A3, Y, INCY2 + LFPDUX A3, X, INCX2 + STFPDUX A4, Y, INCY2 + LFPDUX A4, X, INCX2 + + STFPDUX A5, Y, INCY2 + LFPDUX A5, X, INCX2 + STFPDUX A6, Y, INCY2 + LFPDUX A6, X, INCX2 + STFPDUX A7, Y, INCY2 + LFPDUX A7, X, INCX2 + STFPDUX A8, Y, INCY2 + LFPDUX A8, X, INCX2 + bdnz LL(12) + .align 4 + +LL(13): + STFPDUX A1, Y, INCY2 + STFPDUX A2, Y, INCY2 + STFPDUX A3, Y, INCY2 + STFPDUX A4, Y, INCY2 + STFPDUX A5, Y, INCY2 + STFPDUX A6, Y, INCY2 + STFPDUX A7, Y, INCY2 + STFPDUX A8, Y, INCY2 + .align 4 + +LL(15): + andi. r0, N, 7 + beq LL(999) + + andi. r0, N, 4 + beq LL(16) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + STFPDUX A1, Y, INCY2 + STFPDUX A2, Y, INCY2 + STFPDUX A3, Y, INCY2 + STFPDUX A4, Y, INCY2 + .align 4 + +LL(16): + andi. r0, N, 2 + beq LL(17) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + + STFPDUX A1, Y, INCY2 + STFPDUX A2, Y, INCY2 + .align 4 + +LL(17): + andi. r0, N, 1 + beq LL(999) + + LFPDUX A1, X, INCX2 + STFPDUX A1, Y, INCY2 + b LL(999) + .align 4 + +LL(20): /* X : aligned Y : unaligned */ + + LFXDUX A1, X, INCX2 + addi N, N, -1 + cmpwi cr0, N, 0 + STFSDX A1, Y, INCY2 + add Y, Y, INCY + ble LL(29) + .align 4 + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(25) + + LFXDUX T1, X, INCX2 + LFXDUX T2, X, INCX2 + LFXDUX T3, X, INCX2 + LFXDUX T4, X, INCX2 + + LFPDUX A6, X, INCX2 + fsmr A1, T1 + LFPDUX A7, X, INCX2 + fsmr T1, T2 + LFPDUX A8, X, INCX2 + fsmr T2, T3 + LFPDUX A9, X, INCX2 + fsmr T3, T4 + bdz LL(23) + .align 4 + +LL(22): + STFPDUX A1, Y, INCY2 + fxmr T5, A6 + STFPDUX T1, Y, INCY2 + fxmr T6, A7 + STFPDUX T2, Y, INCY2 + fxmr T7, A8 + STFPDUX T3, Y, INCY2 + fxmr A1, A9 + + fsmr T4, T5 + LFPDUX A2, X, INCX2 + fsmr T5, T6 + LFPDUX A3, X, INCX2 + fsmr T6, T7 + LFPDUX A4, X, INCX2 + fsmr T7, A1 + LFPDUX A5, X, INCX2 + + STFPDUX T4, Y, INCY2 + fxmr T1, A2 + STFPDUX T5, Y, INCY2 + fxmr T2, A3 + STFPDUX T6, Y, INCY2 + fxmr T3, A4 + STFPDUX T7, Y, INCY2 + fxmr T4, A5 + + LFPDUX A6, X, INCX2 + fsmr A1, T1 + LFPDUX A7, X, INCX2 + fsmr T1, T2 + LFPDUX A8, X, INCX2 + fsmr T2, T3 + LFPDUX A9, X, INCX2 + fsmr T3, T4 + bdnz LL(22) + .align 4 + +LL(23): + STFPDUX A1, Y, INCY2 + fxmr T5, A6 + STFPDUX T1, Y, INCY2 + fxmr T6, A7 + STFPDUX T2, Y, INCY2 + fxmr T7, A8 + STFPDUX T3, Y, INCY2 + fxmr A1, A9 + + fsmr T4, T5 + fsmr T5, T6 + fsmr T6, T7 + fsmr T7, A1 + + STFPDUX T4, Y, INCY2 + STFPDUX T5, Y, INCY2 + STFPDUX T6, Y, INCY2 + STFPDUX T7, Y, INCY2 + .align 4 + +LL(25): + andi. r0, N, 7 + beq LL(29) + + andi. r0, N, 4 + beq LL(26) + + LFXDUX A2, X, INCX2 + LFXDUX A3, X, INCX2 + LFXDUX A4, X, INCX2 + LFXDUX A5, X, INCX2 + + fsmr A1, A2 + fsmr A2, A3 + fsmr A3, A4 + fsmr A4, A5 + + STFPDUX A1, Y, INCY2 + STFPDUX A2, Y, INCY2 + STFPDUX A3, Y, INCY2 + STFPDUX A4, Y, INCY2 + fpmr A1, A5 + .align 4 + +LL(26): + andi. r0, N, 2 + beq LL(27) + + LFXDUX A2, X, INCX2 + LFXDUX A3, X, INCX2 + fsmr A1, A2 + fsmr A2, A3 + STFPDUX A1, Y, INCY2 + STFPDUX A2, Y, INCY2 + fpmr A1, A3 + .align 4 + +LL(27): + andi. 
r0, N, 1 + beq LL(29) + + LFXDUX A2, X, INCX2 + fsmr A1, A2 + STFPDUX A1, Y, INCY2 + fpmr A1, A2 + .align 4 + +LL(29): + STFDUX A1, Y, INCY2 + b LL(999) + .align 4 + +LL(30): /* X ): unaligned Y ): aligned */ + andi. r0, Y, 2 * SIZE - 1 + bne LL(40) + + LFDX A1, X, INCX2 + add X, X, INCX + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(35) + + LFXDUX T1, X, INCX2 + LFXDUX T2, X, INCX2 + LFXDUX T3, X, INCX2 + LFXDUX T4, X, INCX2 + + LFPDUX A6, X, INCX2 + fsmr A1, T1 + LFPDUX A7, X, INCX2 + fsmr T1, T2 + LFPDUX A8, X, INCX2 + fsmr T2, T3 + LFPDUX A9, X, INCX2 + fsmr T3, T4 + bdz LL(33) + .align 4 + +LL(32): + fxmr T5, A6 + STFPDUX A1, Y, INCY2 + fxmr T6, A7 + STFPDUX T1, Y, INCY2 + fxmr T7, A8 + STFPDUX T2, Y, INCY2 + fxmr A1, A9 + STFPDUX T3, Y, INCY2 + + LFPDUX A2, X, INCX2 + fsmr T4, T5 + LFPDUX A3, X, INCX2 + fsmr T5, T6 + LFPDUX A4, X, INCX2 + fsmr T6, T7 + LFPDUX A5, X, INCX2 + fsmr T7, A1 + + fxmr T1, A2 + STFPDUX T4, Y, INCY2 + fxmr T2, A3 + STFPDUX T5, Y, INCY2 + fxmr T3, A4 + STFPDUX T6, Y, INCY2 + fxmr T4, A5 + STFPDUX T7, Y, INCY2 + + fsmr A1, T1 + LFPDUX A6, X, INCX2 + fsmr T1, T2 + LFPDUX A7, X, INCX2 + fsmr T2, T3 + LFPDUX A8, X, INCX2 + fsmr T3, T4 + LFPDUX A9, X, INCX2 + bdnz LL(32) + .align 4 + +LL(33): + STFPDUX A1, Y, INCY2 + fxmr T5, A6 + STFPDUX T1, Y, INCY2 + fxmr T6, A7 + STFPDUX T2, Y, INCY2 + fxmr T7, A8 + STFPDUX T3, Y, INCY2 + fxmr A1, A9 + + fsmr T4, T5 + fsmr T5, T6 + fsmr T6, T7 + fsmr T7, A1 + + STFPDUX T4, Y, INCY2 + STFPDUX T5, Y, INCY2 + STFPDUX T6, Y, INCY2 + STFPDUX T7, Y, INCY2 + .align 4 + +LL(35): + andi. r0, N, 7 + beq LL(999) + + andi. r0, N, 4 + beq LL(36) + + LFXDUX A2, X, INCX2 + LFXDUX A3, X, INCX2 + LFXDUX A4, X, INCX2 + LFXDUX A5, X, INCX2 + + fsmr A1, A2 + fsmr A2, A3 + fsmr A3, A4 + fsmr A4, A5 + + STFPDUX A1, Y, INCY2 + STFPDUX A2, Y, INCY2 + STFPDUX A3, Y, INCY2 + STFPDUX A4, Y, INCY2 + fpmr A1, A5 + .align 4 + +LL(36): + andi. r0, N, 2 + beq LL(37) + + LFXDUX A2, X, INCX2 + LFXDUX A3, X, INCX2 + fsmr A1, A2 + fsmr A2, A3 + STFPDUX A1, Y, INCY2 + STFPDUX A2, Y, INCY2 + fpmr A1, A3 + .align 4 + +LL(37): + andi. r0, N, 1 + beq LL(999) + + LFXDUX A2, X, INCX2 + fsmr A1, A2 + STFPDUX A1, Y, INCY2 + b LL(999) + .align 4 + +LL(40): /* X : unaligned Y : unaligned */ + + LFDX A1, X, INCX2 + add X, X, INCX + + addi N, N, -1 + cmpwi cr0, N, 0 + STFDX A1, Y, INCY2 + add Y, Y, INCY + ble LL(49) + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(45) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + LFPDUX A5, X, INCX2 + LFPDUX A6, X, INCX2 + LFPDUX A7, X, INCX2 + LFPDUX A8, X, INCX2 + bdz LL(43) + .align 4 + +LL(42): + STFPDUX A1, Y, INCY2 + LFPDUX A1, X, INCX2 + STFPDUX A2, Y, INCY2 + LFPDUX A2, X, INCX2 + STFPDUX A3, Y, INCY2 + LFPDUX A3, X, INCX2 + STFPDUX A4, Y, INCY2 + LFPDUX A4, X, INCX2 + + STFPDUX A5, Y, INCY2 + LFPDUX A5, X, INCX2 + STFPDUX A6, Y, INCY2 + LFPDUX A6, X, INCX2 + STFPDUX A7, Y, INCY2 + LFPDUX A7, X, INCX2 + STFPDUX A8, Y, INCY2 + LFPDUX A8, X, INCX2 + bdnz LL(42) + .align 4 + +LL(43): + STFPDUX A1, Y, INCY2 + STFPDUX A2, Y, INCY2 + STFPDUX A3, Y, INCY2 + STFPDUX A4, Y, INCY2 + STFPDUX A5, Y, INCY2 + STFPDUX A6, Y, INCY2 + STFPDUX A7, Y, INCY2 + STFPDUX A8, Y, INCY2 + .align 4 + +LL(45): + andi. r0, N, 7 + beq LL(49) + + andi. r0, N, 4 + beq LL(46) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + STFPDUX A1, Y, INCY2 + STFPDUX A2, Y, INCY2 + STFPDUX A3, Y, INCY2 + STFPDUX A4, Y, INCY2 + .align 4 + +LL(46): + andi. 
r0, N, 2 + beq LL(47) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + + STFPDUX A1, Y, INCY2 + STFPDUX A2, Y, INCY2 + .align 4 + +LL(47): + andi. r0, N, 1 + beq LL(49) + + LFPDUX A1, X, INCX2 + STFPDUX A1, Y, INCY2 + +LL(49): + LFDUX A1, X, INCX2 + STFDUX A1, Y, INCY2 + b LL(999) + .align 4 + +LL(100): + addi X2, X, SIZE + addi Y2, Y, SIZE + + srawi. r0, N, 2 + mtspr CTR, r0 + beq- LL(115) + + LFDUX A1, X, INCX2 + LFDUX A2, X2, INCX2 + LFDUX A3, X, INCX2 + LFDUX A4, X2, INCX2 + LFDUX A5, X, INCX2 + LFDUX A6, X2, INCX2 + LFDUX A7, X, INCX2 + LFDUX A8, X2, INCX2 + bdz LL(113) + .align 4 + +LL(112): + STFDUX A1, Y, INCY2 + LFDUX A1, X, INCX2 + STFDUX A2, Y2, INCY2 + LFDUX A2, X2, INCX2 + STFDUX A3, Y, INCY2 + LFDUX A3, X, INCX2 + STFDUX A4, Y2, INCY2 + LFDUX A4, X2, INCX2 + + STFDUX A5, Y, INCY2 + LFDUX A5, X, INCX2 + STFDUX A6, Y2, INCY2 + LFDUX A6, X2, INCX2 + STFDUX A7, Y, INCY2 + LFDUX A7, X, INCX2 + STFDUX A8, Y2, INCY2 + LFDUX A8, X2, INCX2 + bdnz LL(112) + .align 4 + +LL(113): + STFDUX A1, Y, INCY2 + STFDUX A2, Y2, INCY2 + STFDUX A3, Y, INCY2 + STFDUX A4, Y2, INCY2 + STFDUX A5, Y, INCY2 + STFDUX A6, Y2, INCY2 + STFDUX A7, Y, INCY2 + STFDUX A8, Y2, INCY2 + .align 4 + +LL(115): + andi. r0, N, 3 + beq LL(999) + andi. r0, N, 2 + beq LL(117) + + LFDUX A1, X, INCX2 + LFDUX A2, X2, INCX2 + LFDUX A3, X, INCX2 + LFDUX A4, X2, INCX2 + + STFDUX A1, Y, INCY2 + STFDUX A2, Y2, INCY2 + STFDUX A3, Y, INCY2 + STFDUX A4, Y2, INCY2 + .align 4 + +LL(117): + andi. r0, N, 1 + beq LL(999) + + LFDUX A1, X, INCX2 + LFDUX A2, X2, INCX2 + + STFDUX A1, Y, INCY2 + STFDUX A2, Y2, INCY2 + .align 4 + +LL(999): + li r10, 16 + addi SP, SP, -16 + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + + addi SP, SP, 16 + blr + + EPILOGUE diff --git a/kernel/power/zdot.S b/kernel/power/zdot.S new file mode 100644 index 0000000000..dab7eaa49a --- /dev/null +++ b/kernel/power/zdot.S @@ -0,0 +1,654 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
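The ZDOT kernels that follow (zdot.S here, then the Cell, Blue Gene and PPC440 variants) keep four groups of partial products (re*re, im*im, im*re, re*im) in separate registers while streaming through X and Y, and only combine them after the loop, where the CONJ switch selects the unconjugated or conjugated result. A hedged C sketch of that reduction, with illustrative names and strides counted in complex elements:

/* Reference reduction performed by the ZDOT kernels (sketch only):
 * computes x . y, or conj(x) . y when conj is non-zero. */
static void zdot_ref(long n, const double *x, long incx,
                     const double *y, long incy, int conj,
                     double *res_r, double *res_i)
{
    double rr = 0.0, ii = 0.0, ir = 0.0, ri = 0.0;
    for (long i = 0; i < n; i++) {
        double xr = x[2 * i * incx], xi = x[2 * i * incx + 1];
        double yr = y[2 * i * incy], yi = y[2 * i * incy + 1];
        rr += xr * yr;          /* mirrors accumulator f0 in zdot.S */
        ii += xi * yi;          /* f1 */
        ir += xi * yr;          /* f2 */
        ri += xr * yi;          /* f3 */
    }
    if (!conj) { *res_r = rr - ii; *res_i = ir + ri; }  /* x  . y  */
    else       { *res_r = rr + ii; *res_i = ri - ir; }  /* x^H . y */
}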
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if defined(F_INTERFACE) && defined(F_INTERFACE_F2C) +#define RESULT r3 +#define N r4 +#define X r5 +#define INCX r6 +#define Y r7 +#define INCY r8 +#define PREA r9 +#else +#define N r3 +#define X r4 +#define INCX r5 +#define Y r6 +#define INCY r7 +#define PREA r8 +#endif + +#define INCXM1 r10 +#define INCYM1 r11 + +#define FZERO f0 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) + LDINT INCY, 0(INCY) +#endif + + slwi INCX, INCX, ZBASE_SHIFT + slwi INCY, INCY, ZBASE_SHIFT + + subi INCXM1, INCX, SIZE + subi INCYM1, INCY, SIZE + + fmr f1, FZERO + fmr f2, FZERO + fmr f3, FZERO + fmr f4, FZERO + fmr f5, FZERO + fmr f6, FZERO + fmr f7, FZERO + + fmr f24, FZERO + fmr f25, FZERO + fmr f26, FZERO + fmr f27, FZERO + fmr f28, FZERO + fmr f29, FZERO + fmr f30, FZERO + fmr f31, FZERO + +#ifdef L1_DUALFETCH + li PREA, (L1_PREFETCHSIZE) / 2 +#else + li PREA, (L1_PREFETCHSIZE) +#endif + + cmpwi cr0, N, 0 + ble- LL(999) + + cmpwi cr0, INCX, 2 * SIZE + bne- cr0, LL(100) + cmpwi cr0, INCY, 2 * SIZE + bne- cr0, LL(100) + + srawi. 
r0, N, 3 + mtspr CTR, r0 + beq- cr0, LL(50) + .align 4 + + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + LFD f10, 2 * SIZE(X) + LFD f11, 3 * SIZE(X) + + LFD f16, 0 * SIZE(Y) + LFD f17, 1 * SIZE(Y) + LFD f18, 2 * SIZE(Y) + LFD f19, 3 * SIZE(Y) + + LFD f12, 4 * SIZE(X) + LFD f13, 5 * SIZE(X) + LFD f14, 6 * SIZE(X) + LFD f15, 7 * SIZE(X) + + LFD f20, 4 * SIZE(Y) + LFD f21, 5 * SIZE(Y) + LFD f22, 6 * SIZE(Y) + LFD f23, 7 * SIZE(Y) + bdz LL(20) + .align 4 + +LL(10): + FMADD f0, f8, f16, f0 + FMADD f1, f9, f17, f1 + FMADD f2, f9, f16, f2 + FMADD f3, f8, f17, f3 + + FMADD f4, f10, f18, f4 + FMADD f5, f11, f19, f5 + FMADD f6, f11, f18, f6 + FMADD f7, f10, f19, f7 + + LFD f8, 8 * SIZE(X) + LFD f9, 9 * SIZE(X) + LFD f10, 10 * SIZE(X) + LFD f11, 11 * SIZE(X) + + LFD f16, 8 * SIZE(Y) + LFD f17, 9 * SIZE(Y) + LFD f18, 10 * SIZE(Y) + LFD f19, 11 * SIZE(Y) + + FMADD f24, f12, f20, f24 + FMADD f25, f13, f21, f25 + FMADD f26, f13, f20, f26 + FMADD f27, f12, f21, f27 + + FMADD f28, f14, f22, f28 + FMADD f29, f15, f23, f29 + FMADD f30, f15, f22, f30 + FMADD f31, f14, f23, f31 + + LFD f12, 12 * SIZE(X) + LFD f13, 13 * SIZE(X) + LFD f14, 14 * SIZE(X) + LFD f15, 15 * SIZE(X) + + LFD f20, 12 * SIZE(Y) + LFD f21, 13 * SIZE(Y) + LFD f22, 14 * SIZE(Y) + LFD f23, 15 * SIZE(Y) + + FMADD f0, f8, f16, f0 + FMADD f1, f9, f17, f1 + FMADD f2, f9, f16, f2 + FMADD f3, f8, f17, f3 + + FMADD f4, f10, f18, f4 + FMADD f5, f11, f19, f5 + FMADD f6, f11, f18, f6 + FMADD f7, f10, f19, f7 + + LFD f8, 16 * SIZE(X) + LFD f9, 17 * SIZE(X) + LFD f10, 18 * SIZE(X) + LFD f11, 19 * SIZE(X) + + LFD f16, 16 * SIZE(Y) + LFD f17, 17 * SIZE(Y) + LFD f18, 18 * SIZE(Y) + LFD f19, 19 * SIZE(Y) + + FMADD f24, f12, f20, f24 + FMADD f25, f13, f21, f25 + FMADD f26, f13, f20, f26 + FMADD f27, f12, f21, f27 + + FMADD f28, f14, f22, f28 + FMADD f29, f15, f23, f29 + FMADD f30, f15, f22, f30 + FMADD f31, f14, f23, f31 + + LFD f12, 20 * SIZE(X) + LFD f13, 21 * SIZE(X) + LFD f14, 22 * SIZE(X) + LFD f15, 23 * SIZE(X) + + LFD f20, 20 * SIZE(Y) + LFD f21, 21 * SIZE(Y) + LFD f22, 22 * SIZE(Y) + LFD f23, 23 * SIZE(Y) + +#ifndef POWER6 + L1_PREFETCH X, PREA +#ifdef L1_DUALFETCH + L1_PREFETCH Y, PREA +#endif +#endif + addi X, X, 16 * SIZE + addi Y, Y, 16 * SIZE + +#ifdef POWER6 + L1_PREFETCH X, PREA +#ifdef L1_DUALFETCH + L1_PREFETCH Y, PREA +#endif +#endif + bdnz LL(10) + .align 4 + +LL(20): + FMADD f0, f8, f16, f0 + FMADD f1, f9, f17, f1 + FMADD f2, f9, f16, f2 + FMADD f3, f8, f17, f3 + + FMADD f4, f10, f18, f4 + FMADD f5, f11, f19, f5 + FMADD f6, f11, f18, f6 + FMADD f7, f10, f19, f7 + + LFD f8, 8 * SIZE(X) + LFD f9, 9 * SIZE(X) + LFD f10, 10 * SIZE(X) + LFD f11, 11 * SIZE(X) + + LFD f16, 8 * SIZE(Y) + LFD f17, 9 * SIZE(Y) + LFD f18, 10 * SIZE(Y) + LFD f19, 11 * SIZE(Y) + + FMADD f24, f12, f20, f24 + FMADD f25, f13, f21, f25 + FMADD f26, f13, f20, f26 + FMADD f27, f12, f21, f27 + + FMADD f28, f14, f22, f28 + FMADD f29, f15, f23, f29 + FMADD f30, f15, f22, f30 + FMADD f31, f14, f23, f31 + + LFD f12, 12 * SIZE(X) + LFD f13, 13 * SIZE(X) + LFD f14, 14 * SIZE(X) + LFD f15, 15 * SIZE(X) + + LFD f20, 12 * SIZE(Y) + LFD f21, 13 * SIZE(Y) + LFD f22, 14 * SIZE(Y) + LFD f23, 15 * SIZE(Y) + + FMADD f0, f8, f16, f0 + FMADD f1, f9, f17, f1 + FMADD f2, f9, f16, f2 + FMADD f3, f8, f17, f3 + + FMADD f4, f10, f18, f4 + FMADD f5, f11, f19, f5 + FMADD f6, f11, f18, f6 + FMADD f7, f10, f19, f7 + + FMADD f24, f12, f20, f24 + FMADD f25, f13, f21, f25 + FMADD f26, f13, f20, f26 + FMADD f27, f12, f21, f27 + + FMADD f28, f14, f22, f28 + FMADD f29, f15, f23, f29 + FMADD f30, f15, f22, f30 
+ FMADD f31, f14, f23, f31 + + addi X, X, 16 * SIZE + addi Y, Y, 16 * SIZE + .align 4 + +LL(50): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + LFD f16, 0 * SIZE(Y) + LFD f17, 1 * SIZE(Y) + + addi X, X, 2 * SIZE + addi Y, Y, 2 * SIZE + + FMADD f0, f8, f16, f0 + FMADD f1, f9, f17, f1 + FMADD f2, f9, f16, f2 + FMADD f3, f8, f17, f3 + + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): +#ifdef F_INTERFACE + cmpwi cr0, INCX, 0 + bge+ LL(102) + + subi r0, N, 1 + mullw r0, r0, INCX + sub X, X, r0 + .align 4 + +LL(102): + cmpwi cr0, INCY, 0 + bge+ LL(104) + + subi r0, N, 1 + mullw r0, r0, INCY + sub Y, Y, r0 + .align 4 + +LL(104): +#endif + sub X, X, INCXM1 + sub Y, Y, INCYM1 + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(150) + + LFDX f8, X, INCXM1 + LFDX f16, Y, INCYM1 + LFDUX f9, X, INCX + LFDUX f17, Y, INCY + + LFDX f10, X, INCXM1 + LFDX f18, Y, INCYM1 + LFDUX f11, X, INCX + LFDUX f19, Y, INCY + + LFDX f12, X, INCXM1 + LFDX f20, Y, INCYM1 + LFDUX f13, X, INCX + LFDUX f21, Y, INCY + + LFDX f14, X, INCXM1 + LFDX f22, Y, INCYM1 + LFDUX f15, X, INCX + LFDUX f23, Y, INCY + bdz LL(120) + .align 4 + +LL(110): + FMADD f0, f8, f16, f0 + FMADD f1, f9, f17, f1 + FMADD f2, f9, f16, f2 + FMADD f3, f8, f17, f3 + + FMADD f4, f10, f18, f4 + FMADD f5, f11, f19, f5 + FMADD f6, f11, f18, f6 + FMADD f7, f10, f19, f7 + + LFDX f8, X, INCXM1 + LFDX f16, Y, INCYM1 + LFDUX f9, X, INCX + LFDUX f17, Y, INCY + + LFDX f10, X, INCXM1 + LFDX f18, Y, INCYM1 + LFDUX f11, X, INCX + LFDUX f19, Y, INCY + + FMADD f24, f12, f20, f24 + FMADD f25, f13, f21, f25 + FMADD f26, f13, f20, f26 + FMADD f27, f12, f21, f27 + + FMADD f28, f14, f22, f28 + FMADD f29, f15, f23, f29 + FMADD f30, f15, f22, f30 + FMADD f31, f14, f23, f31 + + LFDX f12, X, INCXM1 + LFDX f20, Y, INCYM1 + LFDUX f13, X, INCX + LFDUX f21, Y, INCY + + LFDX f14, X, INCXM1 + LFDX f22, Y, INCYM1 + LFDUX f15, X, INCX + LFDUX f23, Y, INCY + + FMADD f0, f8, f16, f0 + FMADD f1, f9, f17, f1 + FMADD f2, f9, f16, f2 + FMADD f3, f8, f17, f3 + + FMADD f4, f10, f18, f4 + FMADD f5, f11, f19, f5 + FMADD f6, f11, f18, f6 + FMADD f7, f10, f19, f7 + + LFDX f8, X, INCXM1 + LFDX f16, Y, INCYM1 + LFDUX f9, X, INCX + LFDUX f17, Y, INCY + + LFDX f10, X, INCXM1 + LFDX f18, Y, INCYM1 + LFDUX f11, X, INCX + LFDUX f19, Y, INCY + + FMADD f24, f12, f20, f24 + FMADD f25, f13, f21, f25 + FMADD f26, f13, f20, f26 + FMADD f27, f12, f21, f27 + + FMADD f28, f14, f22, f28 + FMADD f29, f15, f23, f29 + FMADD f30, f15, f22, f30 + FMADD f31, f14, f23, f31 + + LFDX f12, X, INCXM1 + LFDX f20, Y, INCYM1 + LFDUX f13, X, INCX + LFDUX f21, Y, INCY + + LFDX f14, X, INCXM1 + LFDX f22, Y, INCYM1 + LFDUX f15, X, INCX + LFDUX f23, Y, INCY + bdnz LL(110) + .align 4 + +LL(120): + FMADD f0, f8, f16, f0 + FMADD f1, f9, f17, f1 + FMADD f2, f9, f16, f2 + FMADD f3, f8, f17, f3 + + FMADD f4, f10, f18, f4 + FMADD f5, f11, f19, f5 + FMADD f6, f11, f18, f6 + FMADD f7, f10, f19, f7 + + LFDX f8, X, INCXM1 + LFDX f16, Y, INCYM1 + LFDUX f9, X, INCX + LFDUX f17, Y, INCY + + LFDX f10, X, INCXM1 + LFDX f18, Y, INCYM1 + LFDUX f11, X, INCX + LFDUX f19, Y, INCY + + FMADD f24, f12, f20, f24 + FMADD f25, f13, f21, f25 + FMADD f26, f13, f20, f26 + FMADD f27, f12, f21, f27 + + FMADD f28, f14, f22, f28 + FMADD f29, f15, f23, f29 + FMADD f30, f15, f22, f30 + FMADD f31, f14, f23, f31 + + LFDX f12, X, INCXM1 + LFDX f20, Y, INCYM1 + LFDUX f13, X, INCX + LFDUX f21, Y, INCY + + LFDX f14, X, INCXM1 + LFDX f22, Y, INCYM1 + LFDUX f15, X, INCX + LFDUX f23, Y, INCY + + FMADD f0, f8, f16, f0 + 
FMADD f1, f9, f17, f1 + FMADD f2, f9, f16, f2 + FMADD f3, f8, f17, f3 + + FMADD f4, f10, f18, f4 + FMADD f5, f11, f19, f5 + FMADD f6, f11, f18, f6 + FMADD f7, f10, f19, f7 + + FMADD f24, f12, f20, f24 + FMADD f25, f13, f21, f25 + FMADD f26, f13, f20, f26 + FMADD f27, f12, f21, f27 + + FMADD f28, f14, f22, f28 + FMADD f29, f15, f23, f29 + FMADD f30, f15, f22, f30 + FMADD f31, f14, f23, f31 + .align 4 + +LL(150): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDX f8, X, INCXM1 + LFDUX f9, X, INCX + LFDX f16, Y, INCYM1 + LFDUX f17, Y, INCY + + FMADD f0, f8, f16, f0 + FMADD f1, f9, f17, f1 + FMADD f2, f9, f16, f2 + FMADD f3, f8, f17, f3 + bdnz LL(160) + .align 4 + +LL(999): + FADD f0, f0, f4 + FADD f1, f1, f5 + FADD f2, f2, f6 + FADD f3, f3, f7 + + FADD f24, f28, f24 + FADD f25, f29, f25 + FADD f26, f30, f26 + FADD f27, f31, f27 + + FADD f0, f0, f24 + FADD f1, f1, f25 + FADD f2, f2, f26 + FADD f3, f3, f27 + +#ifndef CONJ + FSUB f1, f0, f1 + FADD f2, f2, f3 +#else + FADD f1, f0, f1 + FSUB f2, f3, f2 +#endif + +#if defined(F_INTERFACE) && defined(F_INTERFACE_F2C) + STFD f1, 0 * SIZE(RESULT) + STFD f2, 1 * SIZE(RESULT) +#endif + +#if defined(F_INTERFACE) && defined(F_INTERFACE_GFORT) +#ifndef __64BIT__ +#ifndef DOUBLE + stfs f1, 144(SP) + stfs f2, 148(SP) + lwz r3, 144(SP) + lwz r4, 148(SP) +#else + stfd f1, 144(SP) + stfd f2, 152(SP) + lwz r3, 144(SP) + lwz r4, 148(SP) + lwz r5, 152(SP) + lwz r6, 156(SP) +#endif +#else +#ifndef DOUBLE + stfs f1, 144(SP) + stfs f2, 148(SP) + ld r3, 144(SP) +#else + stfd f1, 144(SP) + stfd f2, 152(SP) + ld r3, 144(SP) + ld r4, 152(SP) +#endif +#endif +#endif + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/zdot_cell.S b/kernel/power/zdot_cell.S new file mode 100644 index 0000000000..66b7dfa097 --- /dev/null +++ b/kernel/power/zdot_cell.S @@ -0,0 +1,617 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
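The LL(999) epilogue of zdot.S above also shows why the main loops carry sixteen accumulators (f0 through f7 plus f24 through f31): each FMADD targets its own register, so successive iterations never stall on one another's results, and the independent partial sums are folded together only once, after the loop. The same latency-hiding idea in scalar C, as a rough sketch with illustrative names:

/* Rough sketch of the accumulator splitting used by the dot kernels:
 * independent partial sums break the FMA dependency chain and are
 * combined only after the loop (compare LL(999) above). */
static double dot_split4(long n, const double *a, const double *b)
{
    double s0 = 0.0, s1 = 0.0, s2 = 0.0, s3 = 0.0;
    long i = 0;
    for (; i + 4 <= n; i += 4) {         /* four independent chains */
        s0 += a[i]     * b[i];
        s1 += a[i + 1] * b[i + 1];
        s2 += a[i + 2] * b[i + 2];
        s3 += a[i + 3] * b[i + 3];
    }
    for (; i < n; i++)                   /* remainder, as in LL(160) */
        s0 += a[i] * b[i];
    return (s0 + s1) + (s2 + s3);        /* final fold */
}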
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if defined(F_INTERFACE) && defined(F_INTERFACE_F2C) +#define RESULT r3 +#define N r4 +#define X r5 +#define INCX r6 +#define Y r7 +#define INCY r8 +#define PREA r9 +#else +#define N r3 +#define X r4 +#define INCX r5 +#define Y r6 +#define INCY r7 +#define PREA r8 +#endif + +#define INCXM1 r10 +#define INCYM1 r11 + +#define FZERO f0 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r0, 144(SP) + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) + LDINT INCY, 0(INCY) +#endif + + slwi INCX, INCX, ZBASE_SHIFT + slwi INCY, INCY, ZBASE_SHIFT + + subi INCXM1, INCX, SIZE + subi INCYM1, INCY, SIZE + + fmr f1, FZERO + fmr f2, FZERO + fmr f3, FZERO + fmr f4, FZERO + fmr f5, FZERO + fmr f6, FZERO + fmr f7, FZERO + + fmr f24, FZERO + fmr f25, FZERO + fmr f26, FZERO + fmr f27, FZERO + fmr f28, FZERO + fmr f29, FZERO + fmr f30, FZERO + fmr f31, FZERO + + li PREA, 16 * 10 * SIZE + + cmpwi cr0, N, 0 + ble- LL(999) + + cmpwi cr0, INCX, 2 * SIZE + bne- cr0, LL(100) + cmpwi cr0, INCY, 2 * SIZE + bne- cr0, LL(100) + + srawi. 
r0, N, 3 + mtspr CTR, r0 + beq- cr0, LL(50) + .align 4 + + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + LFD f10, 2 * SIZE(X) + LFD f11, 3 * SIZE(X) + + LFD f16, 0 * SIZE(Y) + LFD f17, 1 * SIZE(Y) + LFD f18, 2 * SIZE(Y) + LFD f19, 3 * SIZE(Y) + + LFD f12, 4 * SIZE(X) + LFD f13, 5 * SIZE(X) + LFD f14, 6 * SIZE(X) + LFD f15, 7 * SIZE(X) + + LFD f20, 4 * SIZE(Y) + LFD f21, 5 * SIZE(Y) + LFD f23, 7 * SIZE(Y) + bdz LL(20) + .align 4 + +LL(10): + FMADD f0, f8, f16, f0 + LFD f22, 6 * SIZE(Y) + FMADD f3, f8, f17, f3 + LFD f8, 8 * SIZE(X) + FMADD f1, f9, f17, f1 + LFD f17, 9 * SIZE(Y) + FMADD f2, f9, f16, f2 + LFD f9, 9 * SIZE(X) + + FMADD f4, f10, f18, f4 + LFD f16, 8 * SIZE(Y) + FMADD f7, f10, f19, f7 + LFD f10, 10 * SIZE(X) + FMADD f5, f11, f19, f5 + LFD f19, 11 * SIZE(Y) + FMADD f6, f11, f18, f6 + LFD f11, 11 * SIZE(X) + + + FMADD f24, f12, f20, f24 + LFD f18, 10 * SIZE(Y) + FMADD f27, f12, f21, f27 + LFD f12, 12 * SIZE(X) + FMADD f25, f13, f21, f25 + LFD f21, 13 * SIZE(Y) + FMADD f26, f13, f20, f26 + LFD f13, 13 * SIZE(X) + + FMADD f28, f14, f22, f28 + LFD f20, 12 * SIZE(Y) + FMADD f31, f14, f23, f31 + LFD f14, 14 * SIZE(X) + FMADD f29, f15, f23, f29 + LFD f23, 15 * SIZE(Y) + FMADD f30, f15, f22, f30 + LFD f15, 15 * SIZE(X) + + FMADD f0, f8, f16, f0 + LFD f22, 14 * SIZE(Y) + FMADD f3, f8, f17, f3 + LFD f8, 16 * SIZE(X) + FMADD f1, f9, f17, f1 + LFD f17, 17 * SIZE(Y) + FMADD f2, f9, f16, f2 + LFD f9, 17 * SIZE(X) + + FMADD f4, f10, f18, f4 + LFD f16, 16 * SIZE(Y) + FMADD f7, f10, f19, f7 + LFD f10, 18 * SIZE(X) + FMADD f5, f11, f19, f5 + LFD f19, 19 * SIZE(Y) + FMADD f6, f11, f18, f6 + LFD f11, 19 * SIZE(X) + + FMADD f24, f12, f20, f24 + LFD f18, 18 * SIZE(Y) + FMADD f27, f12, f21, f27 + LFD f12, 20 * SIZE(X) + FMADD f25, f13, f21, f25 + LFD f21, 21 * SIZE(Y) + FMADD f26, f13, f20, f26 + LFD f13, 21 * SIZE(X) + + FMADD f28, f14, f22, f28 + LFD f20, 20 * SIZE(Y) + FMADD f31, f14, f23, f31 + LFD f14, 22 * SIZE(X) + FMADD f29, f15, f23, f29 + LFD f23, 23 * SIZE(Y) + FMADD f30, f15, f22, f30 + LFD f15, 23 * SIZE(X) + + dcbt X, PREA + addi X, X, 16 * SIZE + dcbt Y, PREA + addi Y, Y, 16 * SIZE + bdnz LL(10) + .align 4 + +LL(20): + FMADD f0, f8, f16, f0 + LFD f22, 6 * SIZE(Y) + FMADD f3, f8, f17, f3 + LFD f8, 8 * SIZE(X) + FMADD f1, f9, f17, f1 + LFD f17, 9 * SIZE(Y) + FMADD f2, f9, f16, f2 + LFD f9, 9 * SIZE(X) + + FMADD f4, f10, f18, f4 + LFD f16, 8 * SIZE(Y) + FMADD f7, f10, f19, f7 + LFD f10, 10 * SIZE(X) + FMADD f5, f11, f19, f5 + LFD f19, 11 * SIZE(Y) + FMADD f6, f11, f18, f6 + LFD f11, 11 * SIZE(X) + + FMADD f24, f12, f20, f24 + LFD f18, 10 * SIZE(Y) + FMADD f27, f12, f21, f27 + LFD f12, 12 * SIZE(X) + FMADD f25, f13, f21, f25 + LFD f21, 13 * SIZE(Y) + FMADD f26, f13, f20, f26 + LFD f13, 13 * SIZE(X) + + FMADD f28, f14, f22, f28 + LFD f20, 12 * SIZE(Y) + FMADD f31, f14, f23, f31 + LFD f14, 14 * SIZE(X) + FMADD f29, f15, f23, f29 + LFD f23, 15 * SIZE(Y) + FMADD f30, f15, f22, f30 + LFD f15, 15 * SIZE(X) + + FMADD f0, f8, f16, f0 + LFD f22, 14 * SIZE(Y) + FMADD f3, f8, f17, f3 + addi X, X, 16 * SIZE + FMADD f1, f9, f17, f1 + addi Y, Y, 16 * SIZE + FMADD f2, f9, f16, f2 + nop + + FMADD f4, f10, f18, f4 + FMADD f7, f10, f19, f7 + FMADD f5, f11, f19, f5 + FMADD f6, f11, f18, f6 + + FMADD f24, f12, f20, f24 + FMADD f27, f12, f21, f27 + FMADD f25, f13, f21, f25 + FMADD f26, f13, f20, f26 + + FMADD f28, f14, f22, f28 + FMADD f31, f14, f23, f31 + FMADD f29, f15, f23, f29 + FMADD f30, f15, f22, f30 + .align 4 + +LL(50): + andi. 
r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + LFD f16, 0 * SIZE(Y) + LFD f17, 1 * SIZE(Y) + + addi X, X, 2 * SIZE + addi Y, Y, 2 * SIZE + + FMADD f0, f8, f16, f0 + FMADD f3, f8, f17, f3 + FMADD f1, f9, f17, f1 + FMADD f2, f9, f16, f2 + + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): +#ifdef F_INTERFACE + cmpwi cr0, INCX, 0 + bge+ LL(102) + + subi r0, N, 1 + mullw r0, r0, INCX + sub X, X, r0 + .align 4 + +LL(102): + cmpwi cr0, INCY, 0 + bge+ LL(104) + + subi r0, N, 1 + mullw r0, r0, INCY + sub Y, Y, r0 + .align 4 + +LL(104): +#endif + sub X, X, INCXM1 + sub Y, Y, INCYM1 + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(150) + + LFDX f8, X, INCXM1 + LFDX f16, Y, INCYM1 + LFDUX f9, X, INCX + LFDUX f17, Y, INCY + + LFDX f10, X, INCXM1 + LFDX f18, Y, INCYM1 + LFDUX f11, X, INCX + LFDUX f19, Y, INCY + + LFDX f12, X, INCXM1 + LFDX f20, Y, INCYM1 + LFDUX f13, X, INCX + LFDUX f21, Y, INCY + + LFDX f14, X, INCXM1 + LFDUX f15, X, INCX + bdz LL(120) + .align 4 + +LL(110): + FMADD f0, f8, f16, f0 + LFDX f22, Y, INCYM1 + FMADD f3, f8, f17, f3 + LFDX f8, X, INCXM1 + FMADD f1, f9, f17, f1 + LFDUX f23, Y, INCY + FMADD f2, f9, f16, f2 + LFDUX f9, X, INCX + + FMADD f4, f10, f18, f4 + LFDX f16, Y, INCYM1 + FMADD f7, f10, f19, f7 + LFDX f10, X, INCXM1 + FMADD f5, f11, f19, f5 + LFDUX f17, Y, INCY + FMADD f6, f11, f18, f6 + LFDUX f11, X, INCX + + FMADD f24, f12, f20, f24 + LFDX f18, Y, INCYM1 + FMADD f27, f12, f21, f27 + LFDX f12, X, INCXM1 + FMADD f25, f13, f21, f25 + LFDUX f19, Y, INCY + FMADD f26, f13, f20, f26 + LFDUX f13, X, INCX + + FMADD f28, f14, f22, f28 + LFDX f20, Y, INCYM1 + FMADD f31, f14, f23, f31 + LFDX f14, X, INCXM1 + FMADD f29, f15, f23, f29 + LFDUX f21, Y, INCY + FMADD f30, f15, f22, f30 + LFDUX f15, X, INCX + + FMADD f0, f8, f16, f0 + LFDX f22, Y, INCYM1 + FMADD f3, f8, f17, f3 + LFDX f8, X, INCXM1 + FMADD f1, f9, f17, f1 + LFDUX f23, Y, INCY + FMADD f2, f9, f16, f2 + LFDUX f9, X, INCX + + FMADD f4, f10, f18, f4 + LFDX f16, Y, INCYM1 + FMADD f7, f10, f19, f7 + LFDX f10, X, INCXM1 + FMADD f5, f11, f19, f5 + LFDUX f17, Y, INCY + FMADD f6, f11, f18, f6 + LFDUX f11, X, INCX + + FMADD f24, f12, f20, f24 + LFDX f18, Y, INCYM1 + FMADD f27, f12, f21, f27 + LFDX f12, X, INCXM1 + FMADD f25, f13, f21, f25 + LFDUX f19, Y, INCY + FMADD f26, f13, f20, f26 + LFDUX f13, X, INCX + + FMADD f28, f14, f22, f28 + LFDX f20, Y, INCYM1 + FMADD f31, f14, f23, f31 + LFDX f14, X, INCXM1 + FMADD f29, f15, f23, f29 + LFDUX f21, Y, INCY + FMADD f30, f15, f22, f30 + LFDUX f15, X, INCX + + bdnz LL(110) + .align 4 + +LL(120): + FMADD f0, f8, f16, f0 + LFDX f22, Y, INCYM1 + FMADD f3, f8, f17, f3 + LFDX f8, X, INCXM1 + FMADD f1, f9, f17, f1 + LFDUX f23, Y, INCY + FMADD f2, f9, f16, f2 + LFDUX f9, X, INCX + + FMADD f4, f10, f18, f4 + LFDX f16, Y, INCYM1 + FMADD f7, f10, f19, f7 + LFDX f10, X, INCXM1 + FMADD f5, f11, f19, f5 + LFDUX f17, Y, INCY + FMADD f6, f11, f18, f6 + LFDUX f11, X, INCX + + FMADD f24, f12, f20, f24 + LFDX f18, Y, INCYM1 + FMADD f27, f12, f21, f27 + LFDX f12, X, INCXM1 + FMADD f25, f13, f21, f25 + LFDUX f19, Y, INCY + FMADD f26, f13, f20, f26 + LFDUX f13, X, INCX + + FMADD f28, f14, f22, f28 + LFDX f20, Y, INCYM1 + FMADD f31, f14, f23, f31 + LFDX f14, X, INCXM1 + FMADD f29, f15, f23, f29 + LFDUX f21, Y, INCY + FMADD f30, f15, f22, f30 + LFDUX f15, X, INCX + + FMADD f0, f8, f16, f0 + LFDX f22, Y, INCYM1 + FMADD f3, f8, f17, f3 + LFDUX f23, Y, INCY + FMADD f1, f9, f17, f1 + FMADD f2, f9, f16, f2 + + FMADD f4, f10, f18, f4 + FMADD f7, f10, f19, f7 + 
FMADD f5, f11, f19, f5 + FMADD f6, f11, f18, f6 + + FMADD f24, f12, f20, f24 + FMADD f27, f12, f21, f27 + FMADD f25, f13, f21, f25 + FMADD f26, f13, f20, f26 + + FMADD f28, f14, f22, f28 + FMADD f31, f14, f23, f31 + FMADD f29, f15, f23, f29 + FMADD f30, f15, f22, f30 + .align 4 + +LL(150): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDX f8, X, INCXM1 + LFDUX f9, X, INCX + LFDX f16, Y, INCYM1 + LFDUX f17, Y, INCY + + FMADD f0, f8, f16, f0 + FMADD f3, f8, f17, f3 + FMADD f1, f9, f17, f1 + FMADD f2, f9, f16, f2 + bdnz LL(160) + .align 4 + +LL(999): + FADD f0, f0, f4 + FADD f1, f1, f5 + FADD f2, f2, f6 + FADD f3, f3, f7 + + FADD f24, f28, f24 + FADD f25, f29, f25 + FADD f26, f30, f26 + FADD f27, f31, f27 + + FADD f0, f0, f24 + FADD f1, f1, f25 + FADD f2, f2, f26 + FADD f3, f3, f27 + +#ifndef CONJ + FSUB f1, f0, f1 + FADD f2, f2, f3 +#else + FADD f1, f0, f1 + FSUB f2, f3, f2 +#endif + +#if defined(F_INTERFACE) && defined(F_INTERFACE_F2C) + STFD f1, 0 * SIZE(RESULT) + STFD f2, 1 * SIZE(RESULT) +#endif + +#if defined(F_INTERFACE) && defined(F_INTERFACE_GFORT) +#ifndef __64BIT__ +#ifndef DOUBLE + stfs f1, 144(SP) + stfs f2, 148(SP) + lwz r3, 144(SP) + lwz r4, 148(SP) +#else + stfd f1, 144(SP) + stfd f2, 152(SP) + lwz r3, 144(SP) + lwz r4, 148(SP) + lwz r5, 152(SP) + lwz r6, 156(SP) +#endif +#else +#ifndef DOUBLE + stfs f1, 144(SP) + stfs f2, 148(SP) + ld r3, 144(SP) +#else + stfd f1, 144(SP) + stfd f2, 152(SP) + ld r3, 144(SP) + ld r4, 152(SP) +#endif +#endif +#endif + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/zdot_hummer.S b/kernel/power/zdot_hummer.S new file mode 100644 index 0000000000..83027cfd6c --- /dev/null +++ b/kernel/power/zdot_hummer.S @@ -0,0 +1,529 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
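zdot_hummer.S, whose header begins here, and the earlier zcopy_hummer.S rely on the Blue Gene paired loads (LFPDUX), which need 16-byte-aligned addresses; zdot_hummer therefore takes the paired path only when both X and Y pass the andi. r0, X, 2 * SIZE - 1 test, while zcopy_hummer also services mixed alignment by re-pairing halves with fsmr/fxmr. That test is equivalent to the following check (a sketch, assuming double precision so 2 * SIZE is 16 bytes):

#include <stdint.h>

/* Sketch of the alignment dispatch in the "hummer" kernels: the paired
 * 16-byte load path is used only when both addresses are 16-byte aligned. */
static int both_16b_aligned(const double *x, const double *y)
{
    const uintptr_t mask = 2 * sizeof(double) - 1;   /* 0xf */
    return (((uintptr_t)x | (uintptr_t)y) & mask) == 0;
}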
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if defined(F_INTERFACE) && defined(F_INTERFACE_F2C) +#define RESULT r3 +#define N r4 +#define X r5 +#define INCX r6 +#define Y r7 +#define INCY r8 +#else +#define N r3 +#define X r4 +#define INCX r5 +#define Y r6 +#define INCY r7 +#endif + +#define INCX2 r9 +#define INCY2 r10 + +#define C1 f1 +#define C2 f2 +#define C3 f0 +#define C4 f3 +#define C5 f4 +#define C6 f5 +#define C7 f6 +#define C8 f7 + +#define A1 f8 +#define A2 f9 +#define A3 f10 +#define A4 f11 +#define A5 f12 +#define A6 f13 +#define A7 f14 +#define A8 f15 + +#define B1 f16 +#define B2 f17 +#define B3 f18 +#define B4 f19 +#define B5 f20 +#define B6 f21 +#define B7 f22 +#define B8 f23 + +#ifndef CONJ +#define FXCXNPMA fxcxnpma +#else +#define FXCXNPMA fxcxnsma +#endif + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + + stfpdux f16, SP, r10 + stfpdux f17, SP, r10 + stfpdux f18, SP, r10 + stfpdux f19, SP, r10 + + stfpdux f20, SP, r10 + stfpdux f21, SP, r10 + stfpdux f22, SP, r10 + stfpdux f23, SP, r10 + + li r10, 0 + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) + LDINT INCY, 0(INCY) +#endif + + lfpdx C1, SP, r10 # Zero clear + + slwi INCX, INCX, BASE_SHIFT + add INCX2, INCX, INCX + fpmr C2, C1 + + slwi INCY, INCY, BASE_SHIFT + fpmr C3, C1 + add INCY2, INCY, INCY + fpmr C4, C1 + + fpmr C5, C1 + fpmr C6, C1 + fpmr C7, C1 + fpmr C8, C1 + + cmpwi cr0, N, 0 + ble LL(99) + +#ifdef F_INTERFACE + cmpwi cr0, INCX, 0 + bge+ LL(05) + + subi r0, N, 1 + mullw r0, r0, INCX2 + sub X, X, r0 + .align 4 + +LL(05): + cmpwi cr0, INCY, 0 + bge+ LL(06) + + subi r0, N, 1 + mullw r0, r0, INCY2 + sub Y, Y, r0 + .align 4 + +LL(06): +#endif + + andi. r0, X, 2 * SIZE - 1 + bne LL(100) + andi. r0, Y, 2 * SIZE - 1 + bne LL(100) + +/* X is aligned, Y is aligned */ +LL(10): + sub X, X, INCX2 + sub Y, Y, INCY2 + + srawi. 
r0, N, 3 + mtspr CTR, r0 + beq- LL(15) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + LFPDUX A2, X, INCX2 + LFPDUX B2, Y, INCY2 + + LFPDUX A3, X, INCX2 + LFPDUX B3, Y, INCY2 + LFPDUX A4, X, INCX2 + LFPDUX B4, Y, INCY2 + + LFPDUX A5, X, INCX2 + LFPDUX B5, Y, INCY2 + LFPDUX A6, X, INCX2 + LFPDUX B6, Y, INCY2 + + LFPDUX A7, X, INCX2 + LFPDUX B7, Y, INCY2 + LFPDUX A8, X, INCX2 + bdz LL(14) + .align 4 + +LL(13): + fxcpmadd C1, A1, B1, C1 + LFPDUX B8, Y, INCY2 + FXCXNPMA C2, A1, B1, C2 + LFPDUX A1, X, INCX2 + fxcpmadd C3, A2, B2, C3 + LFPDUX B1, Y, INCY2 + FXCXNPMA C4, A2, B2, C4 + LFPDUX A2, X, INCX2 + + fxcpmadd C5, A3, B3, C5 + LFPDUX B2, Y, INCY2 + FXCXNPMA C6, A3, B3, C6 + LFPDUX A3, X, INCX2 + fxcpmadd C7, A4, B4, C7 + LFPDUX B3, Y, INCY2 + FXCXNPMA C8, A4, B4, C8 + LFPDUX A4, X, INCX2 + + fxcpmadd C1, A5, B5, C1 + LFPDUX B4, Y, INCY2 + FXCXNPMA C2, A5, B5, C2 + LFPDUX A5, X, INCX2 + fxcpmadd C3, A6, B6, C3 + LFPDUX B5, Y, INCY2 + FXCXNPMA C4, A6, B6, C4 + LFPDUX A6, X, INCX2 + + fxcpmadd C5, A7, B7, C5 + LFPDUX B6, Y, INCY2 + FXCXNPMA C6, A7, B7, C6 + LFPDUX A7, X, INCX2 + fxcpmadd C7, A8, B8, C7 + LFPDUX B7, Y, INCY2 + FXCXNPMA C8, A8, B8, C8 + LFPDUX A8, X, INCX2 + bdnz LL(13) + .align 4 + +LL(14): + LFPDUX B8, Y, INCY2 + fxcpmadd C1, A1, B1, C1 + FXCXNPMA C2, A1, B1, C2 + fxcpmadd C3, A2, B2, C3 + FXCXNPMA C4, A2, B2, C4 + + fxcpmadd C5, A3, B3, C5 + FXCXNPMA C6, A3, B3, C6 + fxcpmadd C7, A4, B4, C7 + FXCXNPMA C8, A4, B4, C8 + + fxcpmadd C1, A5, B5, C1 + FXCXNPMA C2, A5, B5, C2 + fxcpmadd C3, A6, B6, C3 + FXCXNPMA C4, A6, B6, C4 + + fxcpmadd C5, A7, B7, C5 + FXCXNPMA C6, A7, B7, C6 + fxcpmadd C7, A8, B8, C7 + FXCXNPMA C8, A8, B8, C8 + .align 4 + +LL(15): + andi. r0, N, 7 + beq LL(99) + + andi. r0, N, 4 + beq LL(16) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + LFPDUX A2, X, INCX2 + LFPDUX B2, Y, INCY2 + LFPDUX A3, X, INCX2 + LFPDUX B3, Y, INCY2 + LFPDUX A4, X, INCX2 + LFPDUX B4, Y, INCY2 + + fxcpmadd C1, A1, B1, C1 + FXCXNPMA C2, A1, B1, C2 + fxcpmadd C3, A2, B2, C3 + FXCXNPMA C4, A2, B2, C4 + + fxcpmadd C5, A3, B3, C5 + FXCXNPMA C6, A3, B3, C6 + fxcpmadd C7, A4, B4, C7 + FXCXNPMA C8, A4, B4, C8 + .align 4 + +LL(16): + andi. r0, N, 2 + beq LL(17) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + LFPDUX A2, X, INCX2 + LFPDUX B2, Y, INCY2 + + fxcpmadd C1, A1, B1, C1 + FXCXNPMA C2, A1, B1, C2 + fxcpmadd C3, A2, B2, C3 + FXCXNPMA C4, A2, B2, C4 + .align 4 + +LL(17): + andi. r0, N, 1 + beq LL(99) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + + fxcpmadd C1, A1, B1, C1 + FXCXNPMA C2, A1, B1, C2 + .align 4 + +LL(99): + li r10, 16 + + fpadd C1, C1, C5 + lfpdux f23, SP, r10 + fpadd C2, C2, C6 + lfpdux f22, SP, r10 + fpadd C3, C3, C7 + lfpdux f21, SP, r10 + fpadd C4, C4, C8 + lfpdux f20, SP, r10 + + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + fpadd C1, C1, C3 + lfpdux f17, SP, r10 + fpadd C2, C2, C4 + lfpdux f16, SP, r10 + + fpadd C1, C1, C2 + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + fsmtp C2, C1 + +#if defined(F_INTERFACE) && defined(F_INTERFACE_F2C) + STFD C1, 0 * SIZE(RESULT) + STFD C2, 1 * SIZE(RESULT) +#endif + addi SP, SP, 16 + blr + .align 4 + + +/* X is aligned, Y is NOT aligned */ + +LL(100): + subi INCX2, INCX2, SIZE + subi INCY2, INCY2, SIZE + + li INCX, SIZE + li INCY, SIZE + + sub X, X, INCX2 + sub Y, Y, INCY2 + + srawi. 
r0, N, 2 + mtspr CTR, r0 + beq- LL(105) + + LFDUX A1, X, INCX2 + LFDUX B1, Y, INCY2 + LFDUX A2, X, INCX + LFDUX B2, Y, INCY + + LFDUX A3, X, INCX2 + LFDUX B3, Y, INCY2 + LFDUX A4, X, INCX + LFDUX B4, Y, INCY + + LFDUX A5, X, INCX2 + LFDUX B5, Y, INCY2 + LFDUX A6, X, INCX + LFDUX B6, Y, INCY + + LFDUX A7, X, INCX2 + LFDUX B7, Y, INCY2 + LFDUX A8, X, INCX + bdz LL(104) + .align 4 + +LL(103): + fmadd C1, A1, B1, C1 + LFDUX B8, Y, INCY + fmadd C2, A1, B2, C2 + LFDUX A1, X, INCX2 + + fmadd C3, A2, B1, C3 + LFDUX B1, Y, INCY2 + fmadd C4, A2, B2, C4 + LFDUX A2, X, INCX + + fmadd C5, A3, B3, C5 + LFDUX B2, Y, INCY + fmadd C6, A3, B4, C6 + LFDUX A3, X, INCX2 + + fmadd C7, A4, B3, C7 + LFDUX B3, Y, INCY2 + fmadd C8, A4, B4, C8 + LFDUX A4, X, INCX + + fmadd C1, A5, B5, C1 + LFDUX B4, Y, INCY + fmadd C2, A5, B6, C2 + LFDUX A5, X, INCX2 + + fmadd C3, A6, B5, C3 + LFDUX B5, Y, INCY2 + fmadd C4, A6, B6, C4 + LFDUX A6, X, INCX + + fmadd C5, A7, B7, C5 + LFDUX B6, Y, INCY + fmadd C6, A7, B8, C6 + LFDUX A7, X, INCX2 + + fmadd C7, A8, B7, C7 + LFDUX B7, Y, INCY2 + fmadd C8, A8, B8, C8 + LFDUX A8, X, INCX + + bdnz LL(103) + .align 4 + +LL(104): + LFDUX B8, Y, INCY + fmadd C1, A1, B1, C1 + fmadd C2, A1, B2, C2 + fmadd C3, A2, B1, C3 + fmadd C4, A2, B2, C4 + + fmadd C5, A3, B3, C5 + fmadd C6, A3, B4, C6 + fmadd C7, A4, B3, C7 + fmadd C8, A4, B4, C8 + + fmadd C1, A5, B5, C1 + fmadd C2, A5, B6, C2 + fmadd C3, A6, B5, C3 + fmadd C4, A6, B6, C4 + + fmadd C5, A7, B7, C5 + fmadd C6, A7, B8, C6 + fmadd C7, A8, B7, C7 + fmadd C8, A8, B8, C8 + .align 4 + +LL(105): + andi. r0, N, 3 + beq LL(999) + + andi. r0, N, 2 + beq LL(107) + + LFDUX A1, X, INCX2 + LFDUX B1, Y, INCY2 + LFDUX A2, X, INCX + LFDUX B2, Y, INCY + + LFDUX A3, X, INCX2 + LFDUX B3, Y, INCY2 + LFDUX A4, X, INCX + LFDUX B4, Y, INCY + + fmadd C1, A1, B1, C1 + fmadd C2, A1, B2, C2 + fmadd C3, A2, B1, C3 + fmadd C4, A2, B2, C4 + + fmadd C5, A3, B3, C5 + fmadd C6, A3, B4, C6 + fmadd C7, A4, B3, C7 + fmadd C8, A4, B4, C8 + .align 4 + +LL(107): + andi. r0, N, 1 + beq LL(999) + + LFDUX A1, X, INCX2 + LFDUX B1, Y, INCY2 + + LFDUX A2, X, INCX + LFDUX B2, Y, INCY + + fmadd C1, A1, B1, C1 + fmadd C2, A1, B2, C2 + fmadd C3, A2, B1, C3 + fmadd C4, A2, B2, C4 + .align 4 + +LL(999): + li r10, 16 + + fadd C1, C1, C5 + lfpdux f23, SP, r10 + fadd C2, C2, C6 + lfpdux f22, SP, r10 + fadd C3, C3, C7 + lfpdux f21, SP, r10 + fadd C4, C4, C8 + lfpdux f20, SP, r10 + + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + +#ifndef CONJ + FSUB C1, C1, C4 + FADD C2, C2, C3 +#else + FADD C1, C1, C4 + FSUB C2, C2, C3 +#endif + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + +#if defined(F_INTERFACE) && defined(F_INTERFACE_F2C) + STFD C1, 0 * SIZE(RESULT) + STFD C2, 1 * SIZE(RESULT) +#endif + addi SP, SP, 16 + blr + + EPILOGUE diff --git a/kernel/power/zdot_ppc440.S b/kernel/power/zdot_ppc440.S new file mode 100644 index 0000000000..3340e65967 --- /dev/null +++ b/kernel/power/zdot_ppc440.S @@ -0,0 +1,441 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if defined(F_INTERFACE) && defined(F_INTERFACE_F2C) +#define RESULT r3 +#define N r4 +#define X r5 +#define INCX r6 +#define Y r7 +#define INCY r8 +#define PRE r9 +#else +#define N r3 +#define X r4 +#define INCX r5 +#define Y r6 +#define INCY r7 +#define PRE r8 +#endif + +#define INCXM1 r10 +#define INCYM1 r11 + +#define FZERO f0 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stw r0, 144(SP) + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + lfs FZERO,144(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) + LDINT INCY, 0(INCY) +#endif + + slwi INCX, INCX, ZBASE_SHIFT + slwi INCY, INCY, ZBASE_SHIFT + + subi INCXM1, INCX, SIZE + subi INCYM1, INCY, SIZE + + fmr f1, FZERO + fmr f2, FZERO + fmr f3, FZERO + fmr f4, FZERO + fmr f5, FZERO + fmr f6, FZERO + fmr f7, FZERO + + fmr f24, FZERO + fmr f25, FZERO + fmr f26, FZERO + fmr f27, FZERO + fmr f28, FZERO + fmr f29, FZERO + fmr f30, FZERO + fmr f31, FZERO + + li PRE, 3 * 16 * SIZE + + cmpwi cr0, N, 0 + ble- LL(999) + +#ifdef F_INTERFACE + cmpwi cr0, INCX, 0 + bge+ LL(102) + + subi r0, N, 1 + mullw r0, r0, INCX + sub X, X, r0 + .align 4 + +LL(102): + cmpwi cr0, INCY, 0 + bge+ LL(104) + + subi r0, N, 1 + mullw r0, r0, INCY + sub Y, Y, r0 + .align 4 + +LL(104): +#endif + sub X, X, INCXM1 + sub Y, Y, INCYM1 + + srawi. 
r0, N, 3 + mtspr CTR, r0 + beq- LL(150) + + LFDX f8, X, INCXM1 + LFDX f16, Y, INCYM1 + LFDUX f9, X, INCX + LFDUX f17, Y, INCY + + LFDX f10, X, INCXM1 + LFDX f18, Y, INCYM1 + LFDUX f11, X, INCX + LFDUX f19, Y, INCY + + LFDX f12, X, INCXM1 + LFDX f20, Y, INCYM1 + LFDUX f13, X, INCX + LFDUX f21, Y, INCY + + LFDX f14, X, INCXM1 + LFDUX f15, X, INCX + bdz LL(120) + .align 4 + +LL(110): + FMADD f0, f8, f16, f0 + LFDX f22, Y, INCYM1 +#ifdef PPCG4 + dcbt X, PRE +#endif + FMADD f3, f8, f17, f3 + LFDX f8, X, INCXM1 + FMADD f1, f9, f17, f1 + LFDUX f23, Y, INCY + FMADD f2, f9, f16, f2 + LFDUX f9, X, INCX + + FMADD f4, f10, f18, f4 + LFDX f16, Y, INCYM1 +#ifdef PPCG4 + dcbt Y, PRE +#endif + FMADD f7, f10, f19, f7 + LFDX f10, X, INCXM1 + FMADD f5, f11, f19, f5 + LFDUX f17, Y, INCY + FMADD f6, f11, f18, f6 + LFDUX f11, X, INCX + + FMADD f24, f12, f20, f24 + LFDX f18, Y, INCYM1 +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PRE +#endif + FMADD f27, f12, f21, f27 + LFDX f12, X, INCXM1 + FMADD f25, f13, f21, f25 + LFDUX f19, Y, INCY + FMADD f26, f13, f20, f26 + LFDUX f13, X, INCX + + FMADD f28, f14, f22, f28 + LFDX f20, Y, INCYM1 +#if defined(PPCG4) && defined(DOUBLE) + dcbt Y, PRE +#endif + FMADD f31, f14, f23, f31 + LFDX f14, X, INCXM1 + FMADD f29, f15, f23, f29 + LFDUX f21, Y, INCY + FMADD f30, f15, f22, f30 + LFDUX f15, X, INCX + + FMADD f0, f8, f16, f0 + LFDX f22, Y, INCYM1 +#ifdef PPCG4 + dcbt X, PRE +#endif + FMADD f3, f8, f17, f3 + LFDX f8, X, INCXM1 + FMADD f1, f9, f17, f1 + LFDUX f23, Y, INCY + FMADD f2, f9, f16, f2 + LFDUX f9, X, INCX + + FMADD f4, f10, f18, f4 + LFDX f16, Y, INCYM1 +#ifdef PPCG4 + dcbt Y, PRE +#endif + FMADD f7, f10, f19, f7 + LFDX f10, X, INCXM1 + FMADD f5, f11, f19, f5 + LFDUX f17, Y, INCY + FMADD f6, f11, f18, f6 + LFDUX f11, X, INCX + + FMADD f24, f12, f20, f24 + LFDX f18, Y, INCYM1 +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PRE +#endif + FMADD f27, f12, f21, f27 + LFDX f12, X, INCXM1 + FMADD f25, f13, f21, f25 + LFDUX f19, Y, INCY + FMADD f26, f13, f20, f26 + LFDUX f13, X, INCX + + FMADD f28, f14, f22, f28 + LFDX f20, Y, INCYM1 +#if defined(PPCG4) && defined(DOUBLE) + dcbt Y, PRE +#endif + FMADD f31, f14, f23, f31 + LFDX f14, X, INCXM1 + FMADD f29, f15, f23, f29 + LFDUX f21, Y, INCY + FMADD f30, f15, f22, f30 + LFDUX f15, X, INCX + bdnz LL(110) + .align 4 + +LL(120): + FMADD f0, f8, f16, f0 + LFDX f22, Y, INCYM1 + FMADD f3, f8, f17, f3 + LFDX f8, X, INCXM1 + FMADD f1, f9, f17, f1 + LFDUX f23, Y, INCY + FMADD f2, f9, f16, f2 + LFDUX f9, X, INCX + + FMADD f4, f10, f18, f4 + LFDX f16, Y, INCYM1 + FMADD f7, f10, f19, f7 + LFDX f10, X, INCXM1 + FMADD f5, f11, f19, f5 + LFDUX f17, Y, INCY + FMADD f6, f11, f18, f6 + LFDUX f11, X, INCX + + FMADD f24, f12, f20, f24 + LFDX f18, Y, INCYM1 + FMADD f27, f12, f21, f27 + LFDX f12, X, INCXM1 + FMADD f25, f13, f21, f25 + LFDUX f19, Y, INCY + FMADD f26, f13, f20, f26 + LFDUX f13, X, INCX + + FMADD f28, f14, f22, f28 + LFDX f20, Y, INCYM1 + FMADD f31, f14, f23, f31 + LFDX f14, X, INCXM1 + FMADD f29, f15, f23, f29 + LFDUX f21, Y, INCY + FMADD f30, f15, f22, f30 + LFDUX f15, X, INCX + + LFDX f22, Y, INCYM1 + FMADD f0, f8, f16, f0 + LFDUX f23, Y, INCY + FMADD f3, f8, f17, f3 + FMADD f1, f9, f17, f1 + FMADD f2, f9, f16, f2 + + FMADD f4, f10, f18, f4 + FMADD f7, f10, f19, f7 + FMADD f5, f11, f19, f5 + FMADD f6, f11, f18, f6 + + FMADD f24, f12, f20, f24 + FMADD f27, f12, f21, f27 + FMADD f25, f13, f21, f25 + FMADD f26, f13, f20, f26 + + FMADD f28, f14, f22, f28 + FMADD f31, f14, f23, f31 + FMADD f29, f15, f23, f29 + FMADD f30, f15, f22, f30 
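+
+/* LL(150)/LL(160): scalar tail loop.  The unrolled loop above consumes   */
+/* eight complex elements per iteration, so the remaining (N mod 8)       */
+/* elements are handled one at a time here, accumulating the four partial */
+/* products xr*yr, xi*yi, xi*yr and xr*yi into f0-f3.  LL(999) then       */
+/* reduces all accumulators and combines them into the real/imaginary     */
+/* parts of the result, conjugated (CONJ, ZDOTC) or not (ZDOTU).          */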
+ .align 4 + +LL(150): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDX f8, X, INCXM1 + LFDUX f9, X, INCX + LFDX f16, Y, INCYM1 + LFDUX f17, Y, INCY + + FMADD f0, f8, f16, f0 + FMADD f1, f9, f17, f1 + FMADD f2, f9, f16, f2 + FMADD f3, f8, f17, f3 + bdnz LL(160) + .align 4 + +LL(999): + FADD f0, f0, f4 + FADD f1, f1, f5 + FADD f2, f2, f6 + FADD f3, f3, f7 + + FADD f24, f28, f24 + FADD f25, f29, f25 + FADD f26, f30, f26 + FADD f27, f31, f27 + + FADD f0, f0, f24 + FADD f1, f1, f25 + FADD f2, f2, f26 + FADD f3, f3, f27 + +#ifndef CONJ + FSUB f1, f0, f1 + FADD f2, f2, f3 +#else + FADD f1, f0, f1 + FSUB f2, f3, f2 +#endif + +#if defined(F_INTERFACE) && defined(F_INTERFACE_F2C) + STFD f1, 0 * SIZE(RESULT) + STFD f2, 1 * SIZE(RESULT) +#endif + +#if defined(F_INTERFACE) && defined(F_INTERFACE_GFORT) +#ifndef __64BIT__ +#ifndef DOUBLE + stfs f1, 144(SP) + stfs f2, 148(SP) + lwz r3, 144(SP) + lwz r4, 148(SP) +#else + stfd f1, 144(SP) + stfd f2, 152(SP) + lwz r3, 144(SP) + lwz r4, 148(SP) + lwz r5, 152(SP) + lwz r6, 156(SP) +#endif +#else +#ifndef DOUBLE + stfs f1, 144(SP) + stfs f2, 148(SP) + ld r3, 144(SP) +#else + stfd f1, 144(SP) + stfd f2, 152(SP) + ld r3, 144(SP) + ld r4, 152(SP) +#endif +#endif +#endif + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/zgemm_beta.S b/kernel/power/zgemm_beta.S new file mode 100644 index 0000000000..c936a3d43e --- /dev/null +++ b/kernel/power/zgemm_beta.S @@ -0,0 +1,249 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M r3 +#define N r4 +#define C r10 +#define LDC r11 +#define J r5 +#define PRE r6 +#define CO1 r7 + +#define ALPHA_R f30 +#define ALPHA_I f31 + +#define STACKSIZE 32 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f30, 0(SP) + stfd f31, 8(SP) + stw r0, 16(SP) + +#ifdef linux +#ifndef __64BIT__ + lwz LDC, 8 + STACKSIZE(SP) +#else + ld C, 120 + STACKSIZE(SP) + ld LDC, 128 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld C, 120 + STACKSIZE(SP) + ld LDC, 128 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz C, 68 + STACKSIZE(SP) + lwz LDC, 72 + STACKSIZE(SP) +#else + lwz C, 60 + STACKSIZE(SP) + lwz LDC, 64 + STACKSIZE(SP) +#endif +#endif +#endif + + + slwi LDC, LDC, ZBASE_SHIFT + + lfs f0, 16(SP) + + fmr ALPHA_R, f1 + fmr ALPHA_I, f2 + + cmpwi cr0, M, 0 + ble- LL(999) + cmpwi cr0, N, 0 + ble- LL(999) + + mr J, N + fcmpu cr7, f1, f0 + bne cr7, LL(20) + fcmpu cr7, f2, f0 + bne cr7, LL(20) + .align 4 + +LL(10): + mr CO1, C + add C, C, LDC + addi PRE, 0, 32 * SIZE + + srawi. r0, M, 3 + mtspr CTR, r0 + ble LL(15) + .align 4 + +LL(12): + STFD f0, 0 * SIZE(CO1) + STFD f0, 1 * SIZE(CO1) + STFD f0, 2 * SIZE(CO1) + STFD f0, 3 * SIZE(CO1) + STFD f0, 4 * SIZE(CO1) + STFD f0, 5 * SIZE(CO1) + STFD f0, 6 * SIZE(CO1) + STFD f0, 7 * SIZE(CO1) + STFD f0, 8 * SIZE(CO1) + STFD f0, 9 * SIZE(CO1) + STFD f0, 10 * SIZE(CO1) + STFD f0, 11 * SIZE(CO1) + STFD f0, 12 * SIZE(CO1) + STFD f0, 13 * SIZE(CO1) + STFD f0, 14 * SIZE(CO1) + STFD f0, 15 * SIZE(CO1) + + dcbst PRE, CO1 + addi CO1, CO1, 16 * SIZE + bdnz LL(12) + .align 4 + +LL(15): + andi. r0, M, 7 + mtspr CTR, r0 + beq LL(19) + .align 4 + +LL(16): + STFD f0, 0 * SIZE(CO1) + STFD f0, 1 * SIZE(CO1) + addi CO1, CO1, 2 * SIZE + bdnz LL(16) + .align 4 + +LL(19): + addic. J, J, -1 + bgt LL(10) + b LL(999) + .align 4 + +LL(20): + mr CO1, C + add C, C, LDC + addi PRE, 0, 16 * SIZE + + srawi. r0, M, 2 + mtspr CTR, r0 + ble LL(25) + .align 4 + +LL(22): + LFD f3, 0 * SIZE(CO1) + LFD f4, 1 * SIZE(CO1) + LFD f5, 2 * SIZE(CO1) + LFD f6, 3 * SIZE(CO1) + LFD f7, 4 * SIZE(CO1) + LFD f8, 5 * SIZE(CO1) + LFD f9, 6 * SIZE(CO1) + LFD f10, 7 * SIZE(CO1) + + FMUL f0, ALPHA_I, f4 + FMUL f4, ALPHA_R, f4 + FMUL f11, ALPHA_I, f6 + FMUL f6, ALPHA_R, f6 + + FMUL f12, ALPHA_I, f8 + FMUL f8, ALPHA_R, f8 + FMUL f13, ALPHA_I, f10 + FMUL f10, ALPHA_R, f10 + + FMADD f4, ALPHA_I, f3, f4 + FMSUB f3, ALPHA_R, f3, f0 + FMADD f6, ALPHA_I, f5, f6 + FMSUB f5, ALPHA_R, f5, f11 + + FMADD f8, ALPHA_I, f7, f8 + FMSUB f7, ALPHA_R, f7, f12 + FMADD f10, ALPHA_I, f9, f10 + FMSUB f9, ALPHA_R, f9, f13 + + STFD f3, 0 * SIZE(CO1) + STFD f4, 1 * SIZE(CO1) + STFD f5, 2 * SIZE(CO1) + STFD f6, 3 * SIZE(CO1) + STFD f7, 4 * SIZE(CO1) + STFD f8, 5 * SIZE(CO1) + STFD f9, 6 * SIZE(CO1) + STFD f10, 7 * SIZE(CO1) + + addi CO1, CO1, 8 * SIZE + dcbtst PRE, CO1 + bdnz LL(22) + .align 4 + +LL(25): + andi. 
r0, M, 3 + mtspr CTR, r0 + ble LL(29) + .align 4 + +LL(26): + LFD f0, 0 * SIZE(CO1) + LFD f1, 1 * SIZE(CO1) + + FMUL f5, ALPHA_I, f1 + FMUL f1, ALPHA_R, f1 + FMADD f1, ALPHA_I, f0, f1 + FMSUB f0, ALPHA_R, f0, f5 + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + + addi CO1, CO1, 2 * SIZE + bdnz LL(26) + .align 4 + +LL(29): + addic. J, J, -1 + bgt LL(20) + .align 4 + +LL(999): + li r3, 0 + lfd f30, 0(SP) + lfd f31, 8(SP) + addi SP, SP, STACKSIZE + + blr + EPILOGUE diff --git a/kernel/power/zgemm_kernel.S b/kernel/power/zgemm_kernel.S new file mode 100644 index 0000000000..5fef0da3db --- /dev/null +++ b/kernel/power/zgemm_kernel.S @@ -0,0 +1,1837 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA_R 296(SP) +#define ALPHA_I 304(SP) +#define FZERO 312(SP) +#else +#define STACKSIZE 256 +#define ALPHA_R 224(SP) +#define ALPHA_I 232(SP) +#define FZERO 240(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#define OFFSET r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#define TEMP r22 +#define KK r23 +#define I r24 +#define J r25 +#define AO r26 +#define BO r27 +#define CO1 r28 +#define CO2 r29 + +#define PREA r30 +#define PREC r31 +#define PREB PREA + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) +#ifdef TRMMKERNEL + std r23, 208(SP) + std r22, 216(SP) +#endif +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) +#ifdef TRMMKERNEL + stw r23, 176(SP) + stw r22, 180(SP) +#endif +#endif + + stfd f1, ALPHA_R + stfd f2, ALPHA_I + stw r0, FZERO + +#ifdef linux +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, 56 + STACKSIZE(SP) + lwz C, 60 + STACKSIZE(SP) + lwz LDC, 64 + STACKSIZE(SP) +#else + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#ifdef TRMMKERNEL +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 120 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 120 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 68 + STACKSIZE(SP) +#else + lwz OFFSET, 60 + STACKSIZE(SP) +#endif +#endif +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif +#endif + + slwi LDC, LDC, ZBASE_SHIFT + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + +#ifndef PREFETCHTEST +#ifdef PPC970 + li PREC, 4 * SIZE +#endif +#ifdef POWER4 + li PREC, 4 * SIZE /* is 12 best? */ +#endif +#ifdef POWER5 + li PREC, 4 * SIZE /* is 12 best? 
*/ +#endif +#else + +#ifdef linux +#ifndef __64BIT__ + lwz PREA, 16 + STACKSIZE(SP) + lwz PREC, 20 + STACKSIZE(SP) +#else + ld PREA, 136 + STACKSIZE(SP) + ld PREC, 144 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld PREA, 136 + STACKSIZE(SP) + ld PREC, 144 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz PREA, 72 + STACKSIZE(SP) + lwz PREC, 76 + STACKSIZE(SP) +#else + lwz PREA, 68 + STACKSIZE(SP) + lwz PREC, 72 + STACKSIZE(SP) +#endif +#endif +#endif + +#endif + +#ifndef PREFETCHTEST +#ifdef PPC970 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 1 * SIZE) + li PREB, (16 * 5 * SIZE) +#else + li PREA, (16 * 15 * SIZE) + li PREB, (16 * 8 * SIZE) +#endif +#endif +#ifdef POWER4 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 1 * SIZE) + li PREB, (16 * 1 * SIZE) +#else + li PREA, (16 * 2 * SIZE) + li PREB, (16 * 2 * SIZE) +#endif +#endif +#ifdef POWER5 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 7 * SIZE) + li PREB, (16 * 7 * SIZE) +#else + li PREA, (16 * 12 * SIZE) + li PREB, (16 * 6 * SIZE) +#endif +#endif +#endif + + lfs f0, FZERO + + srawi. J, N, 1 + ble LL(30) + .align 4 + +LL(10): + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + mr CO1, C + add CO2, C, LDC + add C, CO2, LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + srawi. I, M, 1 + mr AO, A + ble LL(20) + .align 4 + +LL(11): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + +#ifdef POWER5 + LFD f28, 4 * SIZE(B) + LFD f29, 5 * SIZE(B) + LFD f30, 6 * SIZE(B) + LFD f31, 7 * SIZE(B) +#endif + + DCBTST(CO1, PREC) + nop + nop + DCBTST(CO2, PREC) + + srawi. r0, K, 2 + mr BO, B + mtspr CTR, r0 + ble LL(15) +#else +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + +#ifdef POWER5 + LFD f28, 4 * SIZE(B) + LFD f29, 5 * SIZE(B) + LFD f30, 6 * SIZE(B) + LFD f31, 7 * SIZE(B) +#endif + mr BO, B +#else + slwi r0, KK, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, B, r0 + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + +#ifdef POWER5 + LFD f28, 4 * SIZE(BO) + LFD f29, 5 * SIZE(BO) + LFD f30, 6 * SIZE(BO) + LFD f31, 7 * SIZE(BO) +#endif +#endif + + DCBTST(CO1, PREC) + nop + nop + DCBTST(CO2, PREC) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + srawi. 
TEMP, TEMP, 2 + mtspr CTR, TEMP + ble LL(15) +#endif + .align 4 + +LL(12): + FMADD f0, f16, f20, f0 + FMADD f5, f17, f21, f5 + FMADD f10, f18, f22, f10 + FMADD f15, f19, f23, f15 + +#if defined(ALLOC_HUGETLB) && !defined(POWER5) + LFD f28, 4 * SIZE(BO) + LFD f29, 5 * SIZE(BO) + LFD f30, 6 * SIZE(BO) + LFD f31, 7 * SIZE(BO) +#endif + + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + FMADD f4, f16, f21, f4 + +#if !defined(ALLOC_HUGETLB) && !defined(POWER5) + LFD f28, 4 * SIZE(BO) + LFD f29, 5 * SIZE(BO) + LFD f30, 6 * SIZE(BO) + LFD f31, 7 * SIZE(BO) +#endif + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + + FMADD f11, f19, f22, f11 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + FMADD f14, f18, f23, f14 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f24, f28, f0 + FMADD f5, f25, f29, f5 + FMADD f10, f26, f30, f10 + FMADD f15, f27, f31, f15 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMADD f1, f25, f28, f1 + FMADD f2, f26, f28, f2 + FMADD f3, f27, f28, f3 + FMADD f4, f24, f29, f4 + + FMADD f6, f26, f29, f6 + FMADD f7, f27, f29, f7 + FMADD f8, f24, f30, f8 + FMADD f9, f25, f30, f9 + + FMADD f11, f27, f30, f11 + FMADD f12, f24, f31, f12 + FMADD f13, f25, f31, f13 + FMADD f14, f26, f31, f14 + + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f5, f17, f21, f5 + FMADD f10, f18, f22, f10 + FMADD f15, f19, f23, f15 + + LFD f24, 12 * SIZE(AO) + LFD f25, 13 * SIZE(AO) + LFD f26, 14 * SIZE(AO) + LFD f27, 15 * SIZE(AO) + + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + FMADD f4, f16, f21, f4 + + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + + FMADD f11, f19, f22, f11 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + FMADD f14, f18, f23, f14 + +#ifndef POWER5 + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) +#else + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) +#endif + + FMADD f0, f24, f28, f0 + FMADD f5, f25, f29, f5 + FMADD f10, f26, f30, f10 + FMADD f15, f27, f31, f15 + +#ifndef POWER5 + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) +#else + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) +#endif + + FMADD f1, f25, f28, f1 + FMADD f2, f26, f28, f2 + FMADD f3, f27, f28, f3 + FMADD f4, f24, f29, f4 + + FMADD f6, f26, f29, f6 + FMADD f7, f27, f29, f7 + FMADD f8, f24, f30, f8 + FMADD f9, f25, f30, f9 + + FMADD f11, f27, f30, f11 + FMADD f12, f24, f31, f12 + FMADD f13, f25, f31, f13 + FMADD f14, f26, f31, f14 + +#ifdef POWER5 + LFD f28, 20 * SIZE(BO) + LFD f29, 21 * SIZE(BO) + LFD f30, 22 * SIZE(BO) + LFD f31, 23 * SIZE(BO) +#endif + + addi AO, AO, 16 * SIZE + addi BO, BO, 16 * SIZE + +#ifdef PPC970 +#ifndef ALLOC_HUGETLB + DCBT(AO, PREA) +#endif + DCBT(BO, PREB) +#endif + +#ifdef POWER4 +#ifndef ALLOC_HUGETLB + DCBT(AO, PREA) +#endif + DCBT(BO, PREB) +#endif + +#ifdef POWER5 +#ifndef ALLOC_HUGETLB + DCBT(BO, PREB) + DCBT(AO, PREA) +#endif +#endif + bdnz LL(12) + .align 4 + +LL(15): +#ifndef TRMMKERNEL + andi. 
r0, K, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR, r0 + ble LL(KERNEL_MainFinish) +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR, TEMP + ble LL(KERNEL_MainFinish) +#endif + .align 4 + +LL(16): + fmadd f0, f16, f20, f0 + fmadd f5, f17, f21, f5 + fmadd f10, f18, f22, f10 + fmadd f15, f19, f23, f15 + + fmadd f1, f17, f20, f1 + fmadd f2, f18, f20, f2 + fmadd f3, f19, f20, f3 + fmadd f4, f16, f21, f4 + + fmadd f6, f18, f21, f6 + fmadd f7, f19, f21, f7 + fmadd f8, f16, f22, f8 + fmadd f9, f17, f22, f9 + + fmadd f11, f19, f22, f11 + fmadd f12, f16, f23, f12 + fmadd f13, f17, f23, f13 + fmadd f14, f18, f23, f14 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(16) + .align 4 + +LL(KERNEL_MainFinish): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(CC) || defined(CR) || defined(RC) || defined(RR) + + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 + +#ifndef TRMMKERNEL + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) +#endif + + FSUB f8, f8, f13 + FADD f9, f9, f12 + FSUB f10, f10, f15 + FADD f11, f11, f14 + +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + + FADD f0, f0, f5 + FSUB f1, f1, f4 + FADD f2, f2, f7 + FSUB f3, f3, f6 + +#ifndef TRMMKERNEL + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) +#endif + + FADD f8, f8, f13 + FSUB f9, f9, f12 + FADD f10, f10, f15 + FSUB f11, f11, f14 + +#else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */ + + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 + +#ifndef TRMMKERNEL + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) +#endif + + FADD f8, f8, f13 + FSUB f9, f12, f9 + FADD f10, f10, f15 + FSUB f11, f14, f11 + +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FMADD f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FMADD f19, f30, f3, f19 + + FMADD f20, f30, f8, f20 + FMADD f21, f30, f9, f21 + FMADD f22, f30, f10, f22 + FMADD f23, f30, f11, f23 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f1 + FMUL f18, f30, f2 + FMUL f19, f30, f3 + + FMUL f20, f30, f8 + FMUL f21, f30, f9 + FMUL f22, f30, f10 + FMUL f23, f30, f11 +#endif + + FNMSUB f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FNMSUB f18, f31, f3, f18 + FMADD f19, f31, f2, f19 + + FNMSUB f20, f31, f9, f20 + FMADD f21, f31, f8, f21 + FNMSUB f22, f31, f11, f22 + FMADD f23, f31, f10, f23 + +#else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ + /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ + /* defined(RC)|| defined(RR) */ + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FNMSUB f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FNMSUB f19, f30, f3, f19 + + FMADD f20, f30, f8, f20 + FNMSUB f21, f30, f9, f21 + FMADD f22, f30, f10, f22 + FNMSUB f23, f30, f11, f23 + + FMADD f16, f31, f1, f16 + FMADD f17, f31, f0, f17 
+ FMADD f18, f31, f3, f18 + FMADD f19, f31, f2, f19 + + FMADD f20, f31, f9, f20 + FMADD f21, f31, f8, f21 + FMADD f22, f31, f11, f22 + FMADD f23, f31, f10, f23 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f1 + FMUL f18, f30, f2 + FMUL f19, f30, f3 + + FMUL f20, f30, f8 + FMUL f21, f30, f9 + FMUL f22, f30, f10 + FMUL f23, f30, f11 + + FMADD f16, f31, f1, f16 + FNMADD f17, f31, f0, f17 + FMADD f18, f31, f3, f18 + FNMADD f19, f31, f2, f19 + + FMADD f20, f31, f9, f20 + FNMADD f21, f31, f8, f21 + FMADD f22, f31, f11, f22 + FNMADD f23, f31, f10, f23 +#endif +#endif + + STFD f16, 0 * SIZE(CO1) + STFD f17, 1 * SIZE(CO1) + STFD f18, 2 * SIZE(CO1) + STFD f19, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + STFD f20, 0 * SIZE(CO2) + STFD f21, 1 * SIZE(CO2) + STFD f22, 2 * SIZE(CO2) + STFD f23, 3 * SIZE(CO2) + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -2 +#endif + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + + addic. I, I, -1 + bgt LL(11) + .align 4 + +LL(20): + andi. I, M, 1 + ble LL(29) + +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, K, 2 + mr BO, B + mtspr CTR, r0 + ble LL(25) +#else +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + srawi. 
TEMP, TEMP, 2 + mtspr CTR, TEMP + ble LL(25) +#endif + .align 4 + +LL(22): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi BO, BO, 16 * SIZE + addi AO, AO, 8 * SIZE + bdnz LL(22) + .align 4 + +LL(25): +#ifndef TRMMKERNEL + andi. r0, K, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR, r0 + ble LL(27) +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + andi. 
TEMP, TEMP, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR, TEMP + ble LL(27) +#endif + .align 4 + +LL(26): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + addi AO, AO, 2 * SIZE + addi BO, BO, 4 * SIZE + bdnz LL(26) + .align 4 + +LL(27): +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(CC) || defined(CR) || defined(RC) || defined(RR) + + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 + +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 + +#else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */ + + FADD f0, f0, f5 + FSUB f1, f1, f4 + FADD f2, f2, f7 + FSUB f3, f3, f6 + +#endif + +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + + LFD f18, 0 * SIZE(CO2) + LFD f19, 1 * SIZE(CO2) +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FMADD f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FMADD f19, f30, f3, f19 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f1 + FMUL f18, f30, f2 + FMUL f19, f30, f3 +#endif + + FNMSUB f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FNMSUB f18, f31, f3, f18 + FMADD f19, f31, f2, f19 + + +#else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ + /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ + /* defined(RC)|| defined(RR) */ + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FNMSUB f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FNMSUB f19, f30, f3, f19 + + FMADD f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FMADD f18, f31, f3, f18 + FMADD f19, f31, f2, f19 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f1 + FMUL f18, f30, f2 + FMUL f19, f30, f3 + + FMADD f16, f31, f1, f16 + FNMADD f17, f31, f0, f17 + FMADD f18, f31, f3, f18 + FNMADD f19, f31, f2, f19 +#endif +#endif + + STFD f16, 0 * SIZE(CO1) + STFD f17, 1 * SIZE(CO1) + STFD f18, 0 * SIZE(CO2) + STFD f19, 1 * SIZE(CO2) + + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -1 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 0 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 1 +#endif +#endif + .align 4 + +LL(29): +#if defined(TRMMKERNEL) && !defined(LEFT) + addi KK, KK, 2 +#endif + + mr B, BO + addic. J, J, -1 + lfs f0, FZERO + bgt LL(10) + .align 4 + +LL(30): + andi. J, N, 1 + ble LL(999) + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + srawi. I, M, 1 + mr CO1, C + add C, C, LDC + mr AO, A + ble LL(40) + .align 4 + +LL(31): +#ifndef TRMMKERNEL + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(B) + LFD f17, 1 * SIZE(B) + LFD f18, 2 * SIZE(B) + LFD f19, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. 
r0, K, 2 + mr BO, B + mtspr CTR, r0 + ble LL(35) +#else +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(B) + LFD f17, 1 * SIZE(B) + LFD f18, 2 * SIZE(B) + LFD f19, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + mr BO, B +#else + slwi r0, KK, 1 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 +#endif + + DCBTST(CO1, PREC) + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + ble LL(35) +#endif + .align 4 + +LL(32): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 8 * SIZE(AO) + LFD f21, 9 * SIZE(AO) + LFD f22, 10 * SIZE(AO) + LFD f23, 11 * SIZE(AO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f24, 12 * SIZE(AO) + LFD f25, 13 * SIZE(AO) + LFD f26, 14 * SIZE(AO) + LFD f27, 15 * SIZE(AO) + + LFD f16, 4 * SIZE(BO) + LFD f17, 5 * SIZE(BO) + LFD f18, 6 * SIZE(BO) + LFD f19, 7 * SIZE(BO) + + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 16 * SIZE(AO) + LFD f21, 17 * SIZE(AO) + LFD f22, 18 * SIZE(AO) + LFD f23, 19 * SIZE(AO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f24, 20 * SIZE(AO) + LFD f25, 21 * SIZE(AO) + LFD f26, 22 * SIZE(AO) + LFD f27, 23 * SIZE(AO) + + LFD f16, 8 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 10 * SIZE(BO) + LFD f19, 11 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 8 * SIZE + DCBT(AO, PREA) + DCBT(BO, PREB) + bdnz LL(32) + .align 4 + +LL(35): +#ifndef TRMMKERNEL + andi. r0, K, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR, r0 + ble LL(37) +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + andi. 
TEMP, TEMP, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR, TEMP + ble LL(37) +#endif + .align 4 + +LL(36): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + LFD f16, 2 * SIZE(BO) + LFD f17, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(36) + .align 4 + +LL(37): +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(CC) || defined(CR) || defined(RC) || defined(RR) + + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 + +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + + FADD f0, f0, f5 + FSUB f1, f1, f4 + FADD f2, f2, f7 + FSUB f3, f3, f6 + +#else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */ + + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 + +#endif + +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FMADD f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FMADD f19, f30, f3, f19 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f1 + FMUL f18, f30, f2 + FMUL f19, f30, f3 +#endif + + FNMSUB f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FNMSUB f18, f31, f3, f18 + FMADD f19, f31, f2, f19 + +#else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ + /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ + /* defined(RC)|| defined(RR) */ + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FNMSUB f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FNMSUB f19, f30, f3, f19 + + FMADD f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FMADD f18, f31, f3, f18 + FMADD f19, f31, f2, f19 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f1 + FMUL f18, f30, f2 + FMUL f19, f30, f3 + + FMADD f16, f31, f1, f16 + FNMADD f17, f31, f0, f17 + FMADD f18, f31, f3, f18 + FNMADD f19, f31, f2, f19 +#endif + +#endif + + STFD f16, 0 * SIZE(CO1) + STFD f17, 1 * SIZE(CO1) + STFD f18, 2 * SIZE(CO1) + STFD f19, 3 * SIZE(CO1) + + addi CO1, CO1, 4 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -1 +#endif + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + + addic. I, I, -1 + bgt LL(31) + .align 4 + +LL(40): + andi. I, M, 1 + ble LL(999) + +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. 
r0, K, 2 + mr BO, B + mtspr CTR, r0 + ble LL(45) +#else +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + mr BO, B +#else + slwi r0, KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + ble LL(45) +#endif + .align 4 + +LL(42): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f17, f20, f2 + fmadd f3, f16, f21, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + + fmadd f4, f18, f22, f4 + fmadd f5, f19, f23, f5 + fmadd f6, f19, f22, f6 + fmadd f7, f18, f23, f7 + + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f17, f20, f2 + fmadd f3, f16, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + + fmadd f4, f18, f22, f4 + fmadd f5, f19, f23, f5 + fmadd f6, f19, f22, f6 + fmadd f7, f18, f23, f7 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(42) + .align 4 + +LL(45): + fadd f0, f0, f4 + fadd f1, f1, f5 + fadd f2, f2, f6 + fadd f3, f3, f7 + +#ifndef TRMMKERNEL + andi. r0, K, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR,r0 + ble LL(47) +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + andi. 
TEMP, TEMP, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR,TEMP + ble LL(47) +#endif + .align 4 + +LL(46): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f17, f20, f2 + fmadd f3, f16, f21, f3 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 2 * SIZE + + bdnz LL(46) + .align 4 + +LL(47): +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(CC) || defined(CR) || defined(RC) || defined(RR) + fsub f0, f0, f1 + fadd f2, f2, f3 +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + fadd f0, f0, f1 + fsub f2, f2, f3 +#else + fadd f0, f0, f1 + fsub f2, f3, f2 +#endif + +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FMADD f17, f30, f2, f17 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f2 +#endif + + FNMSUB f16, f31, f2, f16 + FMADD f17, f31, f0, f17 + +#else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ + /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ + /* defined(RC) || defined(RR) */ + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FNMSUB f17, f30, f2, f17 + + FMADD f16, f31, f2, f16 + FMADD f17, f31, f0, f17 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f2 + + FMADD f16, f31, f2, f16 + FNMADD f17, f31, f0, f17 +#endif + +#endif + STFD f16, 0 * SIZE(CO1) + STFD f17, 1 * SIZE(CO1) + .align 4 + +LL(999): + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) +#ifdef TRMMKERNEL + ld r23, 208(SP) + ld r22, 216(SP) +#endif +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) +#ifdef TRMMKERNEL + lwz r23, 176(SP) + lwz r22, 180(SP) +#endif +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/zgemm_kernel_altivec.S b/kernel/power/zgemm_kernel_altivec.S new file mode 100644 index 0000000000..b55300ef64 --- /dev/null +++ b/kernel/power/zgemm_kernel_altivec.S @@ -0,0 +1,1703 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 360 +#else +#define STACKSIZE 272 +#endif + +#define ALIGN_SIZE 0xffff +#define SWAP 0 +#define NEG 16 +#define ALPHA_R 32 +#define ALPHA_I 48 +#define FZERO 64 + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#endif +#endif + +#define STACK r11 + +#define I r21 +#define J r22 +#define AO r23 +#define BO r24 +#define CO1 r25 +#define CO2 r26 + +#define PREA r29 +#define PREB r29 +#define PREC r30 +#define VREG r31 + +#define LOAD_A lvx +#define LOAD_B lvx + +#define OFFSET_0 0 +#define OFFSET_1 r14 +#define OFFSET_2 r15 +#define OFFSET_3 r16 +#define OFFSET_4 r17 +#define OFFSET_5 r18 +#define OFFSET_6 r19 +#define OFFSET_7 r20 + +#define c01 v0 +#define c02 v1 +#define c03 v2 +#define c04 v3 +#define c05 v4 +#define c06 v5 +#define c07 v6 +#define c08 v7 +#define c09 v8 +#define c10 v9 +#define c11 v10 +#define c12 v11 +#define c13 v12 +#define c14 v13 +#define c15 v14 +#define c16 v15 + +#define a1 v16 +#define a2 v17 +#define a3 v18 +#define a4 v19 +#define a5 v20 +#define a6 v21 +#define a7 v22 +#define a8 v23 + +#define b1 v24 +#define b2 v25 +#define bp1 v26 +#define bp2 v27 + +#define C1 v16 +#define C2 v17 +#define C3 v18 +#define C4 v19 +#define C5 v20 + +#define c00 v24 + +#define VZERO v25 +#define PERMRSHIFT1 v26 +#define PERMRSHIFT2 v27 + +#define swap v28 +#define neg v29 +#define alpha_r v30 +#define alpha_i v31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + mr STACK, SP + + li r0, 0 * 16 + stvx v20, SP, r0 + li r0, 1 * 16 + stvx v21, SP, r0 + li r0, 2 * 16 + stvx v22, SP, r0 + li r0, 3 * 16 + stvx v23, SP, r0 + li r0, 4 * 16 + stvx v24, SP, r0 + li r0, 5 * 16 + stvx v25, SP, r0 + li r0, 6 * 16 + stvx v26, SP, r0 + li r0, 7 * 16 + stvx v27, SP, r0 + li r0, 8 * 16 + stvx v28, SP, r0 + li r0, 9 * 16 + stvx v29, SP, r0 + li r0, 10 * 16 + 
stvx v30, SP, r0 + li r0, 11 * 16 + stvx v31, SP, r0 + +#ifdef __64BIT__ + std r31, 192(SP) + std r30, 200(SP) + std r29, 208(SP) + std r28, 216(SP) + std r27, 224(SP) + std r26, 232(SP) + std r25, 240(SP) + std r24, 248(SP) + std r23, 256(SP) + std r22, 264(SP) + std r21, 272(SP) + std r20, 280(SP) + std r19, 288(SP) + std r18, 296(SP) + std r17, 304(SP) + std r16, 312(SP) + std r15, 320(SP) + std r14, 328(SP) +#else + stw r31, 192(SP) + stw r30, 196(SP) + stw r29, 200(SP) + stw r28, 204(SP) + stw r27, 208(SP) + stw r26, 212(SP) + stw r25, 216(SP) + stw r24, 220(SP) + stw r23, 224(SP) + stw r22, 228(SP) + stw r21, 232(SP) + stw r20, 236(SP) + stw r19, 240(SP) + stw r18, 244(SP) + stw r17, 248(SP) + stw r16, 252(SP) + stw r15, 256(SP) + stw r14, 260(SP) +#endif + + +#ifdef linux +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, 56 + STACKSIZE(SP) + lwz C, 60 + STACKSIZE(SP) + lwz LDC, 64 + STACKSIZE(SP) +#else + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#ifndef PREFETCHTEST +#ifdef PPC970 + li PREC, 16 * SIZE +#endif +#else + +#ifdef linux +#ifndef __64BIT__ + lwz PREB, 16 + STACKSIZE(SP) + lwz PREC, 20 + STACKSIZE(SP) +#else + ld PREB, 136 + STACKSIZE(SP) + ld PREC, 144 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld PREB, 136 + STACKSIZE(SP) + ld PREC, 144 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz PREB, 72 + STACKSIZE(SP) + lwz PREC, 76 + STACKSIZE(SP) +#else + lwz PREB, 68 + STACKSIZE(SP) + lwz PREC, 72 + STACKSIZE(SP) +#endif +#endif +#endif + +#endif + +#ifndef PREFETCHTEST +#ifdef CELL + li PREB, (3 * 32 * SIZE) +#else + li PREB, (5 * 32 * SIZE) +#endif +#endif + + li r0, -1 + mfspr VREG, VRsave + + mtspr VRsave, r0 + + addi SP, SP, -128 + li r0, -8192 + + and SP, SP, r0 + + fneg f3, f1 + fneg f4, f2 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NC) || defined(TC) || defined(NR) || defined(TR) + stfs f1, ALPHA_R + 0(SP) + stfs f1, ALPHA_R + 4(SP) + stfs f1, ALPHA_R + 8(SP) + stfs f1, ALPHA_R + 12(SP) + + stfs f4, ALPHA_I + 0(SP) + stfs f2, ALPHA_I + 4(SP) + stfs f4, ALPHA_I + 8(SP) + stfs f2, ALPHA_I + 12(SP) +#else + stfs f1, ALPHA_R + 0(SP) + stfs f3, ALPHA_R + 4(SP) + stfs f1, ALPHA_R + 8(SP) + stfs f3, ALPHA_R + 12(SP) + + stfs f2, ALPHA_I + 0(SP) + stfs f2, ALPHA_I + 4(SP) + stfs f2, ALPHA_I + 8(SP) + stfs f2, ALPHA_I + 12(SP) +#endif + + li I, Address_L(0x04050607) + addis I, I, Address_H(0x04050607) + stw I, SWAP + 0(SP) + li I, Address_L(0x00010203) + addis I, I, Address_H(0x00010203) + stw I, SWAP + 4(SP) + li I, Address_L(0x0c0d0e0f) + addis I, I, Address_H(0x0c0d0e0f) + stw I, SWAP + 8(SP) + li I, Address_L(0x08090a0b) + addis I, I, Address_H(0x08090a0b) + stw I, SWAP + 12(SP) + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + lis I, 0x8000 + stw I, NEG + 0(SP) + stw I, NEG + 8(SP) + li I, 0 + stw I, NEG + 4(SP) + stw I, NEG + 12(SP) +#else + li I, 0 + stw I, NEG + 0(SP) + stw I, NEG + 8(SP) + lis I, 0x8000 + stw I, NEG + 4(SP) + stw I, NEG + 12(SP) +#endif + + li r0, 0 + stw r0, FZERO(SP) + + slwi LDC, LDC, ZBASE_SHIFT + + li OFFSET_1, 4 * SIZE + li OFFSET_2, 8 * SIZE + li OFFSET_3, 12 * SIZE + li OFFSET_4, 16 * SIZE + li OFFSET_5, 20 * SIZE + li OFFSET_6, 24 * SIZE + li OFFSET_7, 28 * SIZE + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi 
cr0, K, 0 + ble LL(999) + + srawi. J, N, 1 + ble LL(50) + .align 4 + +LL(01): + mr CO1, C + add CO2, C, LDC + add C, CO2, LDC + + mr AO, A + srawi. I, M, 3 + ble LL(20) + .align 4 + +LL(11): + vxor c01, c01, c01 + LOAD_B b1, OFFSET_0, B + vxor c02, c02, c02 + LOAD_B b2, OFFSET_1, B + vxor c03, c03, c03 + LOAD_A a1, OFFSET_0, AO + vxor c04, c04, c04 + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + LOAD_A a3, OFFSET_2, AO + vxor c06, c06, c06 + LOAD_A a4, OFFSET_3, AO + vxor c07, c07, c07 + LOAD_A a5, OFFSET_4, AO + vxor c08, c08, c08 + + vxor c09, c09, c09 + dcbtst CO1, PREC + vxor c10, c10, c10 + dcbtst CO2, PREC + vxor c11, c11, c11 + vxor c12, c12, c12 + vxor c13, c13, c13 + mr BO, B + vxor c14, c14, c14 + srawi. r0, K, 1 + vxor c15, c15, c15 + mtspr CTR, r0 + vxor c16, c16, c16 + vspltw bp1, b1, 0 + ble LL(15) + .align 4 + +LL(12): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + DCBT(BO, PREB) + vmaddfp c03, a3, bp1, c03 + nop + vmaddfp c04, a4, bp1, c04 + LOAD_A a6, OFFSET_5, AO + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 +#ifdef CELL + DCBT(AO, PREA) +#else + nop +#endif + vmaddfp c07, a3, bp2, c07 + nop + vmaddfp c08, a4, bp2, c08 + LOAD_A a7, OFFSET_6, AO + + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c10, a2, bp1, c10 + LOAD_B b1, OFFSET_2, BO + vmaddfp c11, a3, bp1, c11 + nop + vmaddfp c12, a4, bp1, c12 + LOAD_A a8, OFFSET_7, AO + + vmaddfp c13, a1, bp2, c13 + vspltw bp1, b2, 0 + vmaddfp c14, a2, bp2, c14 + addi AO, AO, 32 * SIZE + vmaddfp c15, a3, bp2, c15 + nop + vmaddfp c16, a4, bp2, c16 + LOAD_A a1, OFFSET_0, AO + + vmaddfp c01, a5, bp1, c01 + vspltw bp2, b2, 1 + vmaddfp c02, a6, bp1, c02 + nop + vmaddfp c03, a7, bp1, c03 + nop + vmaddfp c04, a8, bp1, c04 + LOAD_A a2, OFFSET_1, AO + + vmaddfp c05, a5, bp2, c05 + vspltw bp1, b2, 2 + vmaddfp c06, a6, bp2, c06 + nop + vmaddfp c07, a7, bp2, c07 + nop + vmaddfp c08, a8, bp2, c08 + LOAD_A a3, OFFSET_2, AO + + vmaddfp c09, a5, bp1, c09 + vspltw bp2, b2, 3 + vmaddfp c10, a6, bp1, c10 + LOAD_B b2, OFFSET_3, BO + vmaddfp c11, a7, bp1, c11 + nop + vmaddfp c12, a8, bp1, c12 + LOAD_A a4, OFFSET_3, AO + + vmaddfp c13, a5, bp2, c13 + vspltw bp1, b1, 0 + vmaddfp c14, a6, bp2, c14 + addi BO, BO, 8 * SIZE + vmaddfp c15, a7, bp2, c15 + LOAD_A a5, OFFSET_4, AO + vmaddfp c16, a8, bp2, c16 + bdnz+ LL(12) + .align 4 + +LL(15): + lvx swap, OFFSET_0, SP + lvx neg, OFFSET_1, SP + lvx alpha_r, OFFSET_2, SP + lvx alpha_i, OFFSET_3, SP + + andi. 
r0, K, 1 + ble+ LL(18) + .align 4 + +LL(16): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + nop + vmaddfp c03, a3, bp1, c03 + nop + vmaddfp c04, a4, bp1, c04 + nop + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + nop + vmaddfp c07, a3, bp2, c07 + nop + vmaddfp c08, a4, bp2, c08 + nop + + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c10, a2, bp1, c10 + addi AO, AO, 16 * SIZE + vmaddfp c11, a3, bp1, c11 + addi BO, BO, 4 * SIZE + vmaddfp c12, a4, bp1, c12 + nop + + vmaddfp c13, a1, bp2, c13 + vmaddfp c14, a2, bp2, c14 + vmaddfp c15, a3, bp2, c15 + vmaddfp c16, a4, bp2, c16 + .align 4 + +LL(18): + vxor VZERO, VZERO, VZERO + + vperm c05, c05, c05, swap + vperm c06, c06, c06, swap + vperm c07, c07, c07, swap + vperm c08, c08, c08, swap + + vperm c13, c13, c13, swap + vperm c14, c14, c14, swap + vperm c15, c15, c15, swap + vperm c16, c16, c16, swap + + vxor c05, c05, neg + vxor c06, c06, neg + vxor c07, c07, neg + vxor c08, c08, neg + + vxor c13, c13, neg + vxor c14, c14, neg + vxor c15, c15, neg + vxor c16, c16, neg + + vaddfp c01, c01, c05 + vaddfp c02, c02, c06 + vaddfp c03, c03, c07 + vaddfp c04, c04, c08 + + vaddfp c09, c09, c13 + vaddfp c10, c10, c14 + vaddfp c11, c11, c15 + vaddfp c12, c12, c16 + + vperm c05, c01, c01, swap + vperm c06, c02, c02, swap + vperm c07, c03, c03, swap + vperm c08, c04, c04, swap + + vperm c13, c09, c09, swap + vperm c14, c10, c10, swap + vperm c15, c11, c11, swap + vperm c16, c12, c12, swap + + vmaddfp c01, alpha_r, c01, VZERO + vmaddfp c02, alpha_r, c02, VZERO + vmaddfp c03, alpha_r, c03, VZERO + vmaddfp c04, alpha_r, c04, VZERO + + vmaddfp c01, alpha_i, c05, c01 + vmaddfp c02, alpha_i, c06, c02 + vmaddfp c03, alpha_i, c07, c03 + vmaddfp c04, alpha_i, c08, c04 + + vmaddfp c09, alpha_r, c09, VZERO + vmaddfp c10, alpha_r, c10, VZERO + vmaddfp c11, alpha_r, c11, VZERO + vmaddfp c12, alpha_r, c12, VZERO + + vmaddfp c09, alpha_i, c13, c09 + vmaddfp c10, alpha_i, c14, c10 + vmaddfp c11, alpha_i, c15, c11 + vmaddfp c12, alpha_i, c16, c12 + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + lvx C3, OFFSET_2, CO1 + lvx C4, OFFSET_3, CO1 + lvx C5, OFFSET_4, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + lvsr PERMRSHIFT2, 0, CO2 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, c03, PERMRSHIFT1 + vperm c03, c03, c04, PERMRSHIFT1 + vperm c04, c04, VZERO, PERMRSHIFT1 + + vaddfp c00, c00, C1 + vaddfp c01, c01, C2 + vaddfp c02, c02, C3 + vaddfp c03, c03, C4 + vaddfp c04, c04, C5 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + stvx c03, OFFSET_3, CO1 + stvx c04, OFFSET_4, CO1 + + lvx C1, OFFSET_0, CO2 + lvx C2, OFFSET_1, CO2 + lvx C3, OFFSET_2, CO2 + lvx C4, OFFSET_3, CO2 + lvx C5, OFFSET_4, CO2 + + vperm c00, VZERO, c09, PERMRSHIFT2 + vperm c09, c09, c10, PERMRSHIFT2 + vperm c10, c10, c11, PERMRSHIFT2 + vperm c11, c11, c12, PERMRSHIFT2 + vperm c12, c12, VZERO, PERMRSHIFT2 + + vaddfp c00, c00, C1 + vaddfp c09, c09, C2 + vaddfp c10, c10, C3 + vaddfp c11, c11, C4 + vaddfp c12, c12, C5 + + stvx c00, OFFSET_0, CO2 + stvx c09, OFFSET_1, CO2 + stvx c10, OFFSET_2, CO2 + stvx c11, OFFSET_3, CO2 + stvx c12, OFFSET_4, CO2 + + addi CO1, CO1, 16 * SIZE + addi CO2, CO2, 16 * SIZE + addic. I, I, -1 + bgt+ LL(11) + .align 4 + +LL(20): + andi. 
I, M, 4 + ble LL(30) + + vxor c01, c01, c01 + LOAD_A a1, OFFSET_0, AO + vxor c02, c02, c02 + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + LOAD_A a3, OFFSET_2, AO + vxor c06, c06, c06 + LOAD_A a4, OFFSET_3, AO + vxor c09, c09, c09 + LOAD_B b1, OFFSET_0, B + vxor c10, c10, c10 + LOAD_B b2, OFFSET_1, B + vxor c13, c13, c13 + vxor c14, c14, c14 + mr BO, B + vspltw bp1, b1, 0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(25) + .align 4 + +LL(22): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + addi AO, AO, 16 * SIZE + vmaddfp c02, a2, bp1, c02 + addi BO, BO, 8 * SIZE + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + LOAD_B b1, OFFSET_0, BO + vmaddfp c10, a2, bp1, c10 + + vmaddfp c13, a1, bp2, c13 + LOAD_A a1, OFFSET_0, AO + vspltw bp1, b2, 0 + vmaddfp c14, a2, bp2, c14 + LOAD_A a2, OFFSET_1, AO + + vmaddfp c01, a3, bp1, c01 + vspltw bp2, b2, 1 + vmaddfp c02, a4, bp1, c02 + + vmaddfp c05, a3, bp2, c05 + vspltw bp1, b2, 2 + vmaddfp c06, a4, bp2, c06 + + vmaddfp c09, a3, bp1, c09 + vspltw bp2, b2, 3 + LOAD_B b2, OFFSET_1, BO + vmaddfp c10, a4, bp1, c10 + + vmaddfp c13, a3, bp2, c13 + LOAD_A a3, OFFSET_2, AO + vmaddfp c14, a4, bp2, c14 + LOAD_A a4, OFFSET_3, AO + vspltw bp1, b1, 0 + bdnz LL(22) + .align 4 + +LL(25): + andi. r0, K, 1 + ble+ LL(28) + .align 4 + +LL(26): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + nop + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + nop + + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c10, a2, bp1, c10 + addi AO, AO, 8 * SIZE + + vmaddfp c13, a1, bp2, c13 + addi BO, BO, 4 * SIZE + vmaddfp c14, a2, bp2, c14 + nop + .align 4 + +LL(28): + vxor VZERO, VZERO, VZERO + + lvx swap, OFFSET_0, SP + lvx neg, OFFSET_1, SP + lvx alpha_r, OFFSET_2, SP + lvx alpha_i, OFFSET_3, SP + + vperm c05, c05, c05, swap + vperm c06, c06, c06, swap + vperm c13, c13, c13, swap + vperm c14, c14, c14, swap + + vxor c05, c05, neg + vxor c06, c06, neg + vxor c13, c13, neg + vxor c14, c14, neg + + vaddfp c01, c01, c05 + vaddfp c02, c02, c06 + vaddfp c09, c09, c13 + vaddfp c10, c10, c14 + + vperm c05, c01, c01, swap + vperm c06, c02, c02, swap + vperm c13, c09, c09, swap + vperm c14, c10, c10, swap + + vmaddfp c01, alpha_r, c01, VZERO + vmaddfp c02, alpha_r, c02, VZERO + vmaddfp c01, alpha_i, c05, c01 + vmaddfp c02, alpha_i, c06, c02 + + vmaddfp c09, alpha_r, c09, VZERO + vmaddfp c10, alpha_r, c10, VZERO + vmaddfp c09, alpha_i, c13, c09 + vmaddfp c10, alpha_i, c14, c10 + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + lvx C3, OFFSET_2, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + lvsr PERMRSHIFT2, 0, CO2 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, VZERO, PERMRSHIFT1 + + vaddfp c00, c00, C1 + vaddfp c01, c01, C2 + vaddfp c02, c02, C3 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + + lvx C1, OFFSET_0, CO2 + lvx C2, OFFSET_1, CO2 + lvx C3, OFFSET_2, CO2 + + vperm c00, VZERO, c09, PERMRSHIFT2 + vperm c09, c09, c10, PERMRSHIFT2 + vperm c10, c10, VZERO, PERMRSHIFT2 + + vaddfp c00, c00, C1 + vaddfp c09, c09, C2 + vaddfp c10, c10, C3 + + stvx c00, OFFSET_0, CO2 + stvx c09, OFFSET_1, CO2 + stvx c10, OFFSET_2, CO2 + + addi CO1, CO1, 8 * SIZE + addi CO2, CO2, 8 * SIZE + .align 4 + +LL(30): + andi. 
I, M, 2 + ble LL(40) + + vxor c01, c01, c01 + LOAD_A a1, OFFSET_0, AO + vxor c02, c02, c02 + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + LOAD_B b1, OFFSET_0, B + vxor c06, c06, c06 + LOAD_B b2, OFFSET_1, B + vxor c09, c09, c09 + vxor c10, c10, c10 + vxor c13, c13, c13 + vxor c14, c14, c14 + + vspltw bp1, b1, 0 + mr BO, B + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(35) + .align 4 + +LL(32): + vmaddfp c01, a1, bp1, c01 + addi AO, AO, 8 * SIZE + vspltw bp2, b1, 1 + vmaddfp c05, a1, bp2, c05 + addi BO, BO, 8 * SIZE + vspltw bp1, b1, 2 + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c13, a1, bp2, c13 + LOAD_A a1, OFFSET_0, AO + vspltw bp1, b2, 0 + LOAD_B b1, OFFSET_0, BO + + vmaddfp c02, a2, bp1, c02 + vspltw bp2, b2, 1 + vmaddfp c06, a2, bp2, c06 + vspltw bp1, b2, 2 + vmaddfp c10, a2, bp1, c10 + vspltw bp2, b2, 3 + LOAD_B b2, OFFSET_1, BO + vmaddfp c14, a2, bp2, c14 + LOAD_A a2, OFFSET_1, AO + + vspltw bp1, b1, 0 + bdnz LL(32) + .align 4 + +LL(35): + andi. r0, K, 1 + ble+ LL(38) + .align 4 + +LL(36): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c13, a1, bp2, c13 + addi AO, AO, 4 * SIZE + addi BO, BO, 4 * SIZE + .align 4 + +LL(38): + vaddfp c01, c01, c02 + vaddfp c05, c05, c06 + vaddfp c09, c09, c10 + vaddfp c13, c13, c14 + + vxor VZERO, VZERO, VZERO + + lvx swap, OFFSET_0, SP + lvx neg, OFFSET_1, SP + lvx alpha_r, OFFSET_2, SP + lvx alpha_i, OFFSET_3, SP + + vperm c05, c05, c05, swap + vperm c13, c13, c13, swap + + vxor c05, c05, neg + vxor c13, c13, neg + + vaddfp c01, c01, c05 + vaddfp c09, c09, c13 + + vperm c05, c01, c01, swap + vperm c13, c09, c09, swap + + vmaddfp c01, alpha_r, c01, VZERO + vmaddfp c01, alpha_i, c05, c01 + + vmaddfp c09, alpha_r, c09, VZERO + vmaddfp c09, alpha_i, c13, c09 + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + lvsr PERMRSHIFT2, 0, CO2 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, VZERO, PERMRSHIFT1 + + vaddfp c00, c00, C1 + vaddfp c01, c01, C2 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + + lvx C1, OFFSET_0, CO2 + lvx C2, OFFSET_1, CO2 + + vperm c00, VZERO, c09, PERMRSHIFT2 + vperm c09, c09, VZERO, PERMRSHIFT2 + + vaddfp c00, c00, C1 + vaddfp c09, c09, C2 + + stvx c00, OFFSET_0, CO2 + stvx c09, OFFSET_1, CO2 + + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + .align 4 + +LL(40): + andi. I, M, 1 + ble LL(49) + + mr BO, B + + LFD f8, 0 * SIZE(AO) + LFD f9, 1 * SIZE(AO) + + LFD f10, 0 * SIZE(BO) + LFD f11, 1 * SIZE(BO) + LFD f12, 2 * SIZE(BO) + LFD f13, 3 * SIZE(BO) + + lfs f0, FZERO(SP) + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. 
r0, K, 1 + mtspr CTR, r0 + ble LL(45) + .align 4 + +LL(42): + fmadd f0, f8, f10, f0 + fmadd f2, f8, f11, f2 + fmadd f4, f8, f12, f4 + fmadd f6, f8, f13, f6 + + fmadd f1, f9, f10, f1 + fmadd f3, f9, f11, f3 + fmadd f5, f9, f12, f5 + fmadd f7, f9, f13, f7 + + LFD f8, 2 * SIZE(AO) + LFD f9, 3 * SIZE(AO) + + LFD f10, 4 * SIZE(BO) + LFD f11, 5 * SIZE(BO) + LFD f12, 6 * SIZE(BO) + LFD f13, 7 * SIZE(BO) + + fmadd f0, f8, f10, f0 + fmadd f2, f8, f11, f2 + fmadd f4, f8, f12, f4 + fmadd f6, f8, f13, f6 + + fmadd f1, f9, f10, f1 + fmadd f3, f9, f11, f3 + fmadd f5, f9, f12, f5 + fmadd f7, f9, f13, f7 + + LFD f8, 4 * SIZE(AO) + LFD f9, 5 * SIZE(AO) + + LFD f10, 8 * SIZE(BO) + LFD f11, 9 * SIZE(BO) + LFD f12, 10 * SIZE(BO) + LFD f13, 11 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(42) + .align 4 + +LL(45): + andi. r0, K, 1 + ble LL(48) + .align 4 + +LL(46): + fmadd f0, f8, f10, f0 + fmadd f2, f8, f11, f2 + fmadd f4, f8, f12, f4 + fmadd f6, f8, f13, f6 + + fmadd f1, f9, f10, f1 + fmadd f3, f9, f11, f3 + fmadd f5, f9, f12, f5 + fmadd f7, f9, f13, f7 + + addi AO, AO, 2 * SIZE + addi BO, BO, 4 * SIZE + .align 4 + +LL(48): +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + fsub f0, f0, f3 + fadd f1, f1, f2 + fsub f4, f4, f7 + fadd f5, f5, f6 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + fadd f0, f0, f3 + fsub f1, f1, f2 + fadd f4, f4, f7 + fsub f5, f5, f6 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + fadd f0, f0, f3 + fsub f1, f2, f1 + fadd f4, f4, f7 + fsub f5, f6, f5 +#else /* RR, RC, CR, CC */ + fsub f0, f0, f3 + fadd f1, f1, f2 + fsub f4, f4, f7 + fadd f5, f5, f6 +#endif + + LFD f8, 0 * SIZE(CO1) + LFD f9, 1 * SIZE(CO1) + LFD f10, 0 * SIZE(CO2) + LFD f11, 1 * SIZE(CO2) + + lfs f12, ALPHA_R + 0(SP) + lfs f13, ALPHA_I + 4(SP) + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + fmadd f8, f12, f0, f8 + fnmsub f9, f12, f1, f9 + fmadd f10, f12, f4, f10 + fnmsub f11, f12, f5, f11 + + fmadd f8, f13, f1, f8 + fmadd f9, f13, f0, f9 + fmadd f10, f13, f5, f10 + fmadd f11, f13, f4, f11 +#else + fmadd f8, f12, f0, f8 + fmadd f9, f12, f1, f9 + fmadd f10, f12, f4, f10 + fmadd f11, f12, f5, f11 + + fnmsub f8, f13, f1, f8 + fmadd f9, f13, f0, f9 + fnmsub f10, f13, f5, f10 + fmadd f11, f13, f4, f11 +#endif + + STFD f8, 0 * SIZE(CO1) + STFD f9, 1 * SIZE(CO1) + STFD f10, 0 * SIZE(CO2) + STFD f11, 1 * SIZE(CO2) + +LL(49): + mr B, BO + + addic. J, J, -1 + bgt LL(01) + .align 4 + +LL(50): + andi. J, N, 1 + ble LL(999) + + mr CO1, C + mr AO, A + + srawi. I, M, 3 + ble LL(70) + .align 4 + +LL(61): + vxor c01, c01, c01 + LOAD_B b1, OFFSET_0, B + vxor c02, c02, c02 + vxor c03, c03, c03 + LOAD_A a1, OFFSET_0, AO + vxor c04, c04, c04 + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + LOAD_A a3, OFFSET_2, AO + vxor c06, c06, c06 + LOAD_A a4, OFFSET_3, AO + vxor c07, c07, c07 + vxor c08, c08, c08 + + mr BO, B + dcbtst CO1, PREC + dcbtst CO2, PREC + + vspltw bp1, b1, 0 + + srawi. 
r0, K, 1 + mtspr CTR, r0 + ble LL(65) + .align 4 + +LL(62): + LOAD_A a5, OFFSET_4, AO + LOAD_A a6, OFFSET_5, AO + LOAD_A a7, OFFSET_6, AO + LOAD_A a8, OFFSET_7, AO + + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + vmaddfp c03, a3, bp1, c03 + vmaddfp c04, a4, bp1, c04 + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + vmaddfp c07, a3, bp2, c07 + vmaddfp c08, a4, bp2, c08 + + vmaddfp c01, a5, bp1, c01 + vspltw bp2, b1, 3 + vmaddfp c02, a6, bp1, c02 + vmaddfp c03, a7, bp1, c03 + vmaddfp c04, a8, bp1, c04 + + LOAD_B b1, OFFSET_1, BO + vspltw bp1, b1, 0 + + vmaddfp c05, a5, bp2, c05 + vmaddfp c06, a6, bp2, c06 + vmaddfp c07, a7, bp2, c07 + vmaddfp c08, a8, bp2, c08 + + addi AO, AO, 32 * SIZE + addi BO, BO, 4 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + bdnz LL(62) + .align 4 + +LL(65): + andi. r0, K, 1 + ble+ LL(68) + .align 4 + +LL(66): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + addi AO, AO, 16 * SIZE + vmaddfp c03, a3, bp1, c03 + addi BO, BO, 2 * SIZE + vmaddfp c04, a4, bp1, c04 + nop + + vmaddfp c05, a1, bp2, c05 + vmaddfp c06, a2, bp2, c06 + vmaddfp c07, a3, bp2, c07 + vmaddfp c08, a4, bp2, c08 + .align 4 + +LL(68): + vxor VZERO, VZERO, VZERO + + lvx swap, OFFSET_0, SP + lvx neg, OFFSET_1, SP + lvx alpha_r, OFFSET_2, SP + lvx alpha_i, OFFSET_3, SP + + vperm c05, c05, c05, swap + vperm c06, c06, c06, swap + vperm c07, c07, c07, swap + vperm c08, c08, c08, swap + + vxor c05, c05, neg + vxor c06, c06, neg + vxor c07, c07, neg + vxor c08, c08, neg + + vaddfp c01, c01, c05 + vaddfp c02, c02, c06 + vaddfp c03, c03, c07 + vaddfp c04, c04, c08 + + vperm c05, c01, c01, swap + vperm c06, c02, c02, swap + vperm c07, c03, c03, swap + vperm c08, c04, c04, swap + + vmaddfp c01, alpha_r, c01, VZERO + vmaddfp c02, alpha_r, c02, VZERO + vmaddfp c03, alpha_r, c03, VZERO + vmaddfp c04, alpha_r, c04, VZERO + + vmaddfp c01, alpha_i, c05, c01 + vmaddfp c02, alpha_i, c06, c02 + vmaddfp c03, alpha_i, c07, c03 + vmaddfp c04, alpha_i, c08, c04 + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + lvx C3, OFFSET_2, CO1 + lvx C4, OFFSET_3, CO1 + lvx C5, OFFSET_4, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, c03, PERMRSHIFT1 + vperm c03, c03, c04, PERMRSHIFT1 + vperm c04, c04, VZERO, PERMRSHIFT1 + + vaddfp c00, c00, C1 + vaddfp c01, c01, C2 + vaddfp c02, c02, C3 + vaddfp c03, c03, C4 + vaddfp c04, c04, C5 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + stvx c03, OFFSET_3, CO1 + stvx c04, OFFSET_4, CO1 + + addi CO1, CO1, 16 * SIZE + addic. I, I, -1 + bgt+ LL(61) + .align 4 + +LL(70): + andi. I, M, 4 + ble LL(80) + + vxor c01, c01, c01 + LOAD_B b1, OFFSET_0, B + vxor c02, c02, c02 + vxor c03, c03, c03 + LOAD_A a1, OFFSET_0, AO + vxor c04, c04, c04 + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + LOAD_A a3, OFFSET_2, AO + vxor c06, c06, c06 + LOAD_A a4, OFFSET_3, AO + vxor c07, c07, c07 + vxor c08, c08, c08 + + mr BO, B + + vspltw bp1, b1, 0 + srawi. 
r0, K, 1 + mtspr CTR, r0 + ble LL(75) + .align 4 + +LL(72): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + + vmaddfp c03, a3, bp1, c03 + vspltw bp2, b1, 3 + vmaddfp c04, a4, bp1, c04 + + LOAD_B b1, OFFSET_1, BO + vspltw bp1, b1, 0 + + vmaddfp c07, a3, bp2, c07 + vmaddfp c08, a4, bp2, c08 + + addi AO, AO, 16 * SIZE + addi BO, BO, 4 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + bdnz LL(72) + .align 4 + +LL(75): + andi. r0, K, 1 + ble+ LL(78) + .align 4 + +LL(76): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + addi AO, AO, 8 * SIZE + vmaddfp c05, a1, bp2, c05 + addi BO, BO, 2 * SIZE + vmaddfp c06, a2, bp2, c06 + .align 4 + +LL(78): + vaddfp c01, c01, c03 + vaddfp c02, c02, c04 + vaddfp c05, c05, c07 + vaddfp c06, c06, c08 + + vxor VZERO, VZERO, VZERO + + lvx swap, OFFSET_0, SP + lvx neg, OFFSET_1, SP + lvx alpha_r, OFFSET_2, SP + lvx alpha_i, OFFSET_3, SP + + vperm c05, c05, c05, swap + vperm c06, c06, c06, swap + + vxor c05, c05, neg + vxor c06, c06, neg + + vaddfp c01, c01, c05 + vaddfp c02, c02, c06 + + vperm c05, c01, c01, swap + vperm c06, c02, c02, swap + + vmaddfp c01, alpha_r, c01, VZERO + vmaddfp c02, alpha_r, c02, VZERO + vmaddfp c01, alpha_i, c05, c01 + vmaddfp c02, alpha_i, c06, c02 + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + lvx C3, OFFSET_2, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, VZERO, PERMRSHIFT1 + + vaddfp c00, c00, C1 + vaddfp c01, c01, C2 + vaddfp c02, c02, C3 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + + addi CO1, CO1, 8 * SIZE + .align 4 + +LL(80): + andi. I, M, 2 + ble LL(90) + + vxor c01, c01, c01 + LOAD_B b1, OFFSET_0, B + vxor c02, c02, c02 + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + vxor c06, c06, c06 + + mr BO, B + + vspltw bp1, b1, 0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(85) + .align 4 + +LL(82): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + + vmaddfp c02, a2, bp1, c02 + vspltw bp2, b1, 3 + + LOAD_B b1, OFFSET_1, BO + vspltw bp1, b1, 0 + + vmaddfp c06, a2, bp2, c06 + + addi AO, AO, 8 * SIZE + addi BO, BO, 4 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + bdnz LL(82) + .align 4 + +LL(85): + andi. r0, K, 1 + ble+ LL(88) + .align 4 + +LL(86): + vspltw bp2, b1, 1 + vmaddfp c01, a1, bp1, c01 + vmaddfp c05, a1, bp2, c05 + addi AO, AO, 4 * SIZE + addi BO, BO, 2 * SIZE + .align 4 + +LL(88): + vaddfp c01, c01, c02 + vaddfp c05, c05, c06 + vaddfp c09, c09, c10 + vaddfp c13, c13, c14 + + vxor VZERO, VZERO, VZERO + + lvx swap, OFFSET_0, SP + lvx neg, OFFSET_1, SP + lvx alpha_r, OFFSET_2, SP + lvx alpha_i, OFFSET_3, SP + + vperm c05, c05, c05, swap + + vxor c05, c05, neg + + vaddfp c01, c01, c05 + + vperm c05, c01, c01, swap + + vmaddfp c01, alpha_r, c01, VZERO + vmaddfp c01, alpha_i, c05, c01 + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, VZERO, PERMRSHIFT1 + + vaddfp c00, c00, C1 + vaddfp c01, c01, C2 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + + addi CO1, CO1, 4 * SIZE + .align 4 + +LL(90): + andi. 
I, M, 1 + ble LL(999) + + mr BO, B + + LFD f8, 0 * SIZE(AO) + LFD f9, 1 * SIZE(AO) + + LFD f10, 0 * SIZE(BO) + LFD f11, 1 * SIZE(BO) + LFD f12, 2 * SIZE(BO) + LFD f13, 3 * SIZE(BO) + + lfs f0, FZERO(SP) + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(95) + .align 4 + +LL(92): + fmadd f0, f8, f10, f0 + fmadd f2, f8, f11, f2 + fmadd f1, f9, f10, f1 + fmadd f3, f9, f11, f3 + + LFD f8, 2 * SIZE(AO) + LFD f9, 3 * SIZE(AO) + LFD f10, 4 * SIZE(BO) + LFD f11, 5 * SIZE(BO) + + fmadd f0, f8, f12, f0 + fmadd f2, f8, f13, f2 + fmadd f1, f9, f12, f1 + fmadd f3, f9, f13, f3 + + LFD f8, 4 * SIZE(AO) + LFD f9, 5 * SIZE(AO) + LFD f12, 6 * SIZE(BO) + LFD f13, 7 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 4 * SIZE + bdnz LL(92) + .align 4 + +LL(95): + andi. r0, K, 1 + ble LL(98) + .align 4 + +LL(96): + fmadd f0, f8, f10, f0 + fmadd f2, f8, f11, f2 + fmadd f1, f9, f10, f1 + fmadd f3, f9, f11, f3 + .align 4 + +LL(98): +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + fsub f0, f0, f3 + fadd f1, f1, f2 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + fadd f0, f0, f3 + fsub f1, f1, f2 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + fadd f0, f0, f3 + fsub f1, f2, f1 +#else /* RR, RC, CR, CC */ + fsub f0, f0, f3 + fadd f1, f1, f2 +#endif + + LFD f8, 0 * SIZE(CO1) + LFD f9, 1 * SIZE(CO1) + + lfs f12, ALPHA_R + 0(SP) + lfs f13, ALPHA_I + 4(SP) + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + fmadd f8, f12, f0, f8 + fnmsub f9, f12, f1, f9 + + fmadd f8, f13, f1, f8 + fmadd f9, f13, f0, f9 +#else + fmadd f8, f12, f0, f8 + fmadd f9, f12, f1, f9 + + fnmsub f8, f13, f1, f8 + fmadd f9, f13, f0, f9 +#endif + + STFD f8, 0 * SIZE(CO1) + STFD f9, 1 * SIZE(CO1) + .align 4 + +LL(999): + mr SP, STACK + + li r0, 0 * 16 + lvx v20, SP, r0 + li r0, 1 * 16 + lvx v21, SP, r0 + li r0, 2 * 16 + lvx v22, SP, r0 + li r0, 3 * 16 + lvx v23, SP, r0 + li r0, 4 * 16 + lvx v24, SP, r0 + li r0, 5 * 16 + lvx v25, SP, r0 + li r0, 6 * 16 + lvx v26, SP, r0 + li r0, 7 * 16 + lvx v27, SP, r0 + li r0, 8 * 16 + lvx v28, SP, r0 + li r0, 9 * 16 + lvx v29, SP, r0 + li r0, 10 * 16 + lvx v30, SP, r0 + li r0, 11 * 16 + lvx v31, SP, r0 + + mtspr VRsave, VREG + +#ifdef __64BIT__ + ld r31, 192(SP) + ld r30, 200(SP) + ld r29, 208(SP) + ld r28, 216(SP) + ld r27, 224(SP) + ld r26, 232(SP) + ld r25, 240(SP) + ld r24, 248(SP) + ld r23, 256(SP) + ld r22, 264(SP) + ld r21, 272(SP) + ld r20, 280(SP) + ld r19, 288(SP) + ld r18, 296(SP) + ld r17, 304(SP) + ld r16, 312(SP) + ld r15, 320(SP) + ld r14, 328(SP) +#else + lwz r31, 192(SP) + lwz r30, 196(SP) + lwz r29, 200(SP) + lwz r28, 204(SP) + lwz r27, 208(SP) + lwz r26, 212(SP) + lwz r25, 216(SP) + lwz r24, 220(SP) + lwz r23, 224(SP) + lwz r22, 228(SP) + lwz r21, 232(SP) + lwz r20, 236(SP) + lwz r19, 240(SP) + lwz r18, 244(SP) + lwz r17, 248(SP) + lwz r16, 252(SP) + lwz r15, 256(SP) + lwz r14, 260(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/zgemm_kernel_altivec_cell.S b/kernel/power/zgemm_kernel_altivec_cell.S new file mode 100644 index 0000000000..7b80e66013 --- /dev/null +++ b/kernel/power/zgemm_kernel_altivec_cell.S @@ -0,0 +1,1858 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 360 +#else +#define STACKSIZE 272 +#endif + +#define ALIGN_SIZE 0xffff +#define SWAP 0 +#define NEG 16 +#define ALPHA_R 32 +#define ALPHA_I 48 +#define FZERO 64 + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#endif +#endif + +#define STACK r11 + +#define I r21 +#define J r22 +#define AO r23 +#define BO r24 +#define CO1 r25 +#define CO2 r26 + +#define PREA r29 +#define PREB r29 +#define PREC r30 +#define VREG r31 + +#define LOAD_A lvx +#define LOAD_B lvx + +#define OFFSET_0 0 +#define OFFSET_1 r14 +#define OFFSET_2 r15 +#define OFFSET_3 r16 +#define OFFSET_4 r17 +#define OFFSET_5 r18 +#define OFFSET_6 r19 +#define OFFSET_7 r20 + +#define c01 v0 +#define c02 v1 +#define c03 v2 +#define c04 v3 +#define c05 v4 +#define c06 v5 +#define c07 v6 +#define c08 v7 +#define c09 v8 +#define c10 v9 +#define c11 v10 +#define c12 v11 +#define c13 v12 +#define c14 v13 +#define c15 v14 +#define c16 v15 + +#define a1 v16 +#define a2 v17 +#define a3 v18 +#define a4 v19 +#define a5 v20 +#define a6 v21 +#define a7 v22 +#define a8 v23 + +#define b1 v24 +#define b2 v25 +#define bp1 v26 +#define bp2 v27 + +#define C1 v16 +#define C2 v17 +#define C3 v18 +#define C4 v19 +#define C5 v20 + +#define c00 v24 + +#define VZERO v25 +#define PERMRSHIFT1 
v26 +#define PERMRSHIFT2 v27 + +#define swap v28 +#define neg v29 +#define alpha_r v30 +#define alpha_i v31 + +#ifndef NEEDPARAM + +#ifndef DOUBLE +#include "../cparam.h" +#else +#include "../zparam.h" +#endif + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + mr STACK, SP + + li r0, 0 * 16 + stvx v20, SP, r0 + li r0, 1 * 16 + stvx v21, SP, r0 + li r0, 2 * 16 + stvx v22, SP, r0 + li r0, 3 * 16 + stvx v23, SP, r0 + li r0, 4 * 16 + stvx v24, SP, r0 + li r0, 5 * 16 + stvx v25, SP, r0 + li r0, 6 * 16 + stvx v26, SP, r0 + li r0, 7 * 16 + stvx v27, SP, r0 + li r0, 8 * 16 + stvx v28, SP, r0 + li r0, 9 * 16 + stvx v29, SP, r0 + li r0, 10 * 16 + stvx v30, SP, r0 + li r0, 11 * 16 + stvx v31, SP, r0 + +#ifdef __64BIT__ + std r31, 192(SP) + std r30, 200(SP) + std r29, 208(SP) + std r28, 216(SP) + std r27, 224(SP) + std r26, 232(SP) + std r25, 240(SP) + std r24, 248(SP) + std r23, 256(SP) + std r22, 264(SP) + std r21, 272(SP) + std r20, 280(SP) + std r19, 288(SP) + std r18, 296(SP) + std r17, 304(SP) + std r16, 312(SP) + std r15, 320(SP) + std r14, 328(SP) +#else + stw r31, 192(SP) + stw r30, 196(SP) + stw r29, 200(SP) + stw r28, 204(SP) + stw r27, 208(SP) + stw r26, 212(SP) + stw r25, 216(SP) + stw r24, 220(SP) + stw r23, 224(SP) + stw r22, 228(SP) + stw r21, 232(SP) + stw r20, 236(SP) + stw r19, 240(SP) + stw r18, 244(SP) + stw r17, 248(SP) + stw r16, 252(SP) + stw r15, 256(SP) + stw r14, 260(SP) +#endif + + +#ifdef linux +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, 56 + STACKSIZE(SP) + lwz C, 60 + STACKSIZE(SP) + lwz LDC, 64 + STACKSIZE(SP) +#else + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#ifndef PREFETCHTEST +#ifdef PPC970 + li PREC, 16 * SIZE +#endif +#else + +#ifdef linux +#ifndef __64BIT__ + lwz PREB, 16 + STACKSIZE(SP) + lwz PREC, 20 + STACKSIZE(SP) +#else + ld PREB, 136 + STACKSIZE(SP) + ld PREC, 144 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld PREB, 136 + STACKSIZE(SP) + ld PREC, 144 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz PREB, 72 + STACKSIZE(SP) + lwz PREC, 76 + STACKSIZE(SP) +#else + lwz PREB, 68 + STACKSIZE(SP) + lwz PREC, 72 + STACKSIZE(SP) +#endif +#endif +#endif + +#endif + +#ifndef PREFETCHTEST +#ifdef CELL + li PREB, (3 * 32 * SIZE) +#else + li PREB, (5 * 32 * SIZE) +#endif +#endif + + li r0, -1 + mfspr VREG, VRsave + + mtspr VRsave, r0 + + addi SP, SP, -128 + li r0, -8192 + + and SP, SP, r0 + + fneg f3, f1 + fneg f4, f2 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NC) || defined(TC) || defined(NR) || defined(TR) + stfs f1, ALPHA_R + 0(SP) + stfs f1, ALPHA_R + 4(SP) + stfs f1, ALPHA_R + 8(SP) + stfs f1, ALPHA_R + 12(SP) + + stfs f4, ALPHA_I + 0(SP) + stfs f2, ALPHA_I + 4(SP) + stfs f4, ALPHA_I + 8(SP) + stfs f2, ALPHA_I + 12(SP) +#else + stfs f1, ALPHA_R + 0(SP) + stfs f3, ALPHA_R + 4(SP) + stfs f1, ALPHA_R + 8(SP) + stfs f3, ALPHA_R + 12(SP) + + stfs f2, ALPHA_I + 0(SP) + stfs f2, ALPHA_I + 4(SP) + stfs f2, ALPHA_I + 8(SP) + stfs f2, ALPHA_I + 12(SP) +#endif + + li I, Address_L(0x04050607) + addis I, I, Address_H(0x04050607) + stw I, SWAP + 0(SP) + li I, Address_L(0x00010203) + addis I, I, Address_H(0x00010203) + stw I, SWAP + 4(SP) + li I, Address_L(0x0c0d0e0f) + addis I, I, Address_H(0x0c0d0e0f) + stw I, SWAP + 8(SP) + li I, Address_L(0x08090a0b) + addis I, I, Address_H(0x08090a0b) + stw I, SWAP + 12(SP) + +#if defined(NN) || 
defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + lis I, 0x8000 + stw I, NEG + 0(SP) + stw I, NEG + 8(SP) + li I, 0 + stw I, NEG + 4(SP) + stw I, NEG + 12(SP) +#else + li I, 0 + stw I, NEG + 0(SP) + stw I, NEG + 8(SP) + lis I, 0x8000 + stw I, NEG + 4(SP) + stw I, NEG + 12(SP) +#endif + + li r0, 0 + stw r0, FZERO(SP) + + slwi LDC, LDC, ZBASE_SHIFT + + li OFFSET_1, 4 * SIZE + li OFFSET_2, 8 * SIZE + li OFFSET_3, 12 * SIZE + li OFFSET_4, 16 * SIZE + li OFFSET_5, 20 * SIZE + li OFFSET_6, 24 * SIZE + li OFFSET_7, 28 * SIZE + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + + srawi. J, N, 1 + ble LL(50) + .align 4 + +LL(01): + mr CO1, C + add CO2, C, LDC + add C, CO2, LDC + + mr AO, A + srawi. I, M, 3 + ble LL(20) + .align 4 + +LL(11): + vxor c01, c01, c01 + LOAD_B b1, OFFSET_0, B + vxor c02, c02, c02 + LOAD_A a1, OFFSET_0, AO + vxor c03, c03, c03 + LOAD_A a2, OFFSET_1, AO + vxor c04, c04, c04 + LOAD_A a3, OFFSET_2, AO + + vxor c04, c04, c04 + vxor c05, c05, c05 + vxor c06, c06, c06 + vxor c07, c07, c07 + vxor c08, c08, c08 + + vxor c09, c09, c09 + dcbtst CO1, PREC + vxor c10, c10, c10 + dcbtst CO2, PREC + vxor c11, c11, c11 + vxor c12, c12, c12 + vxor c13, c13, c13 + mr BO, B + vxor c14, c14, c14 + srawi. r0, K, 2 + vxor c15, c15, c15 + mtspr CTR, r0 + vxor c16, c16, c16 + vspltw bp1, b1, 0 + ble LL(13) + .align 4 + +#define NOP1 mr r3, r3 +#define NOP2 mr r4, r4 + +LL(12): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + LOAD_A a4, OFFSET_3, AO + vmaddfp c03, a3, bp1, c03 + dcbt AO, PREA + vmaddfp c04, a4, bp1, c04 + NOP2 + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + NOP2 + vmaddfp c07, a3, bp2, c07 + NOP1 + vmaddfp c08, a4, bp2, c08 + dcbt BO, PREB + + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c10, a2, bp1, c10 + LOAD_B b2, OFFSET_1, BO + vmaddfp c11, a3, bp1, c11 + addi BO, BO, 8 * SIZE + vmaddfp c12, a4, bp1, c12 + NOP1 + + vmaddfp c13, a1, bp2, c13 + vspltw bp1, b2, 0 + vmaddfp c14, a2, bp2, c14 + LOAD_A a5, OFFSET_4, AO + vmaddfp c15, a3, bp2, c15 + LOAD_A a6, OFFSET_5, AO + vmaddfp c16, a4, bp2, c16 + vspltw bp2, b2, 1 + + vmaddfp c01, a5, bp1, c01 + LOAD_A a7, OFFSET_6, AO + vmaddfp c02, a6, bp1, c02 + LOAD_A a8, OFFSET_7, AO + vmaddfp c03, a7, bp1, c03 + NOP1 + vmaddfp c04, a8, bp1, c04 + NOP2 + + vmaddfp c05, a5, bp2, c05 + vspltw bp1, b2, 2 + vmaddfp c06, a6, bp2, c06 + addi AO, AO, 32 * SIZE + vmaddfp c07, a7, bp2, c07 + LOAD_B b1, OFFSET_0, BO + vmaddfp c08, a8, bp2, c08 + NOP1 + + vmaddfp c09, a5, bp1, c09 + vspltw bp2, b2, 3 + vmaddfp c10, a6, bp1, c10 + NOP2 + vmaddfp c11, a7, bp1, c11 + NOP1 + vmaddfp c12, a8, bp1, c12 + dcbt AO, PREA + + vmaddfp c13, a5, bp2, c13 + vspltw bp1, b1, 0 + vmaddfp c14, a6, bp2, c14 + LOAD_A a1, OFFSET_0, AO // + vmaddfp c15, a7, bp2, c15 + LOAD_A a2, OFFSET_1, AO + vmaddfp c16, a8, bp2, c16 + vspltw bp2, b1, 1 + + vmaddfp c01, a1, bp1, c01 + LOAD_A a3, OFFSET_2, AO + vmaddfp c02, a2, bp1, c02 + LOAD_A a4, OFFSET_3, AO + vmaddfp c03, a3, bp1, c03 + NOP1 + vmaddfp c04, a4, bp1, c04 + NOP2 + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + NOP2 + vmaddfp c07, a3, bp2, c07 + NOP1 + vmaddfp c08, a4, bp2, c08 + LOAD_B b2, OFFSET_1, BO + + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c10, a2, bp1, c10 + NOP2 + vmaddfp c11, a3, bp1, c11 + NOP1 + vmaddfp c12, a4, bp1, c12 + addi BO, BO, 8 * SIZE + + vmaddfp c13, a1, bp2, c13 + vspltw 
bp1, b2, 0 + vmaddfp c14, a2, bp2, c14 + LOAD_A a5, OFFSET_4, AO + vmaddfp c15, a3, bp2, c15 + LOAD_A a6, OFFSET_5, AO + vmaddfp c16, a4, bp2, c16 + vspltw bp2, b2, 1 + + vmaddfp c01, a5, bp1, c01 + LOAD_A a7, OFFSET_6, AO + vmaddfp c02, a6, bp1, c02 + LOAD_A a8, OFFSET_7, AO + vmaddfp c03, a7, bp1, c03 + addi AO, AO, 32 * SIZE + vmaddfp c04, a8, bp1, c04 + NOP2 + + vmaddfp c05, a5, bp2, c05 + vspltw bp1, b2, 2 + vmaddfp c06, a6, bp2, c06 + NOP2 + vmaddfp c07, a7, bp2, c07 + NOP1 + vmaddfp c08, a8, bp2, c08 + LOAD_B b1, OFFSET_0, BO + + vmaddfp c09, a5, bp1, c09 + vspltw bp2, b2, 3 + vmaddfp c10, a6, bp1, c10 + LOAD_A a1, OFFSET_0, AO // + vmaddfp c11, a7, bp1, c11 + NOP2 + vmaddfp c12, a8, bp1, c12 + vspltw bp1, b1, 0 + + vmaddfp c13, a5, bp2, c13 + LOAD_A a2, OFFSET_1, AO + vmaddfp c14, a6, bp2, c14 + LOAD_A a3, OFFSET_2, AO + vmaddfp c15, a7, bp2, c15 + NOP1 + vmaddfp c16, a8, bp2, c16 + bdnz+ LL(12) + .align 4 + +LL(13): + andi. r0, K, 2 + nop + nop + ble+ LL(15) + .align 4 + + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + LOAD_A a4, OFFSET_3, AO + vmaddfp c03, a3, bp1, c03 + NOP1 + vmaddfp c04, a4, bp1, c04 + NOP2 + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + NOP2 + vmaddfp c07, a3, bp2, c07 + NOP1 + vmaddfp c08, a4, bp2, c08 + LOAD_B b2, OFFSET_1, BO + + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c10, a2, bp1, c10 + LOAD_A a5, OFFSET_4, AO + vmaddfp c11, a3, bp1, c11 + LOAD_A a6, OFFSET_5, AO + vmaddfp c12, a4, bp1, c12 + addi BO, BO, 8 * SIZE + + vmaddfp c13, a1, bp2, c13 + vspltw bp1, b2, 0 + vmaddfp c14, a2, bp2, c14 + LOAD_A a7, OFFSET_6, AO + vmaddfp c15, a3, bp2, c15 + LOAD_A a8, OFFSET_7, AO + vmaddfp c16, a4, bp2, c16 + addi AO, AO, 32 * SIZE + + vmaddfp c01, a5, bp1, c01 + vspltw bp2, b2, 1 + vmaddfp c02, a6, bp1, c02 + NOP2 + vmaddfp c03, a7, bp1, c03 + NOP1 + vmaddfp c04, a8, bp1, c04 + NOP2 + + vmaddfp c05, a5, bp2, c05 + vspltw bp1, b2, 2 + vmaddfp c06, a6, bp2, c06 + NOP2 + vmaddfp c07, a7, bp2, c07 + NOP1 + vmaddfp c08, a8, bp2, c08 + LOAD_B b1, OFFSET_0, BO + + vmaddfp c09, a5, bp1, c09 + vspltw bp2, b2, 3 + vmaddfp c10, a6, bp1, c10 + LOAD_A a1, OFFSET_0, AO + vmaddfp c11, a7, bp1, c11 + LOAD_A a2, OFFSET_1, AO + vmaddfp c12, a8, bp1, c12 + NOP2 + + vmaddfp c13, a5, bp2, c13 + vspltw bp1, b1, 0 + vmaddfp c14, a6, bp2, c14 + LOAD_A a3, OFFSET_2, AO + vmaddfp c15, a7, bp2, c15 + vmaddfp c16, a8, bp2, c16 + .align 4 + + +LL(15): + andi. 
r0, K, 1 + vxor VZERO, VZERO, VZERO + ble+ LL(18) + .align 4 + + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + LOAD_A a4, OFFSET_3, AO + vmaddfp c03, a3, bp1, c03 + nop + vmaddfp c04, a4, bp1, c04 + nop + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + nop + vmaddfp c07, a3, bp2, c07 + nop + vmaddfp c08, a4, bp2, c08 + nop + + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c10, a2, bp1, c10 + addi AO, AO, 16 * SIZE + vmaddfp c11, a3, bp1, c11 + addi BO, BO, 4 * SIZE + vmaddfp c12, a4, bp1, c12 + nop + + vmaddfp c13, a1, bp2, c13 + vmaddfp c14, a2, bp2, c14 + vmaddfp c15, a3, bp2, c15 + vmaddfp c16, a4, bp2, c16 + .align 4 + +LL(18): + lvx swap, OFFSET_0, SP + lvx neg, OFFSET_1, SP + lvx alpha_r, OFFSET_2, SP + lvx alpha_i, OFFSET_3, SP + + vxor VZERO, VZERO, VZERO + + vperm c05, c05, c05, swap + vperm c06, c06, c06, swap + vperm c07, c07, c07, swap + vperm c08, c08, c08, swap + + vperm c13, c13, c13, swap + vperm c14, c14, c14, swap + vperm c15, c15, c15, swap + vperm c16, c16, c16, swap + + vxor c05, c05, neg + vxor c06, c06, neg + vxor c07, c07, neg + vxor c08, c08, neg + + vxor c13, c13, neg + vxor c14, c14, neg + vxor c15, c15, neg + vxor c16, c16, neg + + vaddfp c01, c01, c05 + vaddfp c02, c02, c06 + vaddfp c03, c03, c07 + vaddfp c04, c04, c08 + + vaddfp c09, c09, c13 + vaddfp c10, c10, c14 + vaddfp c11, c11, c15 + vaddfp c12, c12, c16 + + vperm c05, c01, c01, swap + vperm c06, c02, c02, swap + vperm c07, c03, c03, swap + vperm c08, c04, c04, swap + + vperm c13, c09, c09, swap + vperm c14, c10, c10, swap + vperm c15, c11, c11, swap + vperm c16, c12, c12, swap + + vmaddfp c01, alpha_r, c01, VZERO + vmaddfp c02, alpha_r, c02, VZERO + vmaddfp c03, alpha_r, c03, VZERO + vmaddfp c04, alpha_r, c04, VZERO + + vmaddfp c01, alpha_i, c05, c01 + vmaddfp c02, alpha_i, c06, c02 + vmaddfp c03, alpha_i, c07, c03 + vmaddfp c04, alpha_i, c08, c04 + + vmaddfp c09, alpha_r, c09, VZERO + vmaddfp c10, alpha_r, c10, VZERO + vmaddfp c11, alpha_r, c11, VZERO + vmaddfp c12, alpha_r, c12, VZERO + + vmaddfp c09, alpha_i, c13, c09 + vmaddfp c10, alpha_i, c14, c10 + vmaddfp c11, alpha_i, c15, c11 + vmaddfp c12, alpha_i, c16, c12 + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + lvx C3, OFFSET_2, CO1 + lvx C4, OFFSET_3, CO1 + lvx C5, OFFSET_4, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + lvsr PERMRSHIFT2, 0, CO2 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, c03, PERMRSHIFT1 + vperm c03, c03, c04, PERMRSHIFT1 + vperm c04, c04, VZERO, PERMRSHIFT1 + + vaddfp c00, c00, C1 + vaddfp c01, c01, C2 + vaddfp c02, c02, C3 + vaddfp c03, c03, C4 + vaddfp c04, c04, C5 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + stvx c03, OFFSET_3, CO1 + stvx c04, OFFSET_4, CO1 + + lvx C1, OFFSET_0, CO2 + lvx C2, OFFSET_1, CO2 + lvx C3, OFFSET_2, CO2 + lvx C4, OFFSET_3, CO2 + lvx C5, OFFSET_4, CO2 + + vperm c00, VZERO, c09, PERMRSHIFT2 + vperm c09, c09, c10, PERMRSHIFT2 + vperm c10, c10, c11, PERMRSHIFT2 + vperm c11, c11, c12, PERMRSHIFT2 + vperm c12, c12, VZERO, PERMRSHIFT2 + + vaddfp c00, c00, C1 + vaddfp c09, c09, C2 + vaddfp c10, c10, C3 + vaddfp c11, c11, C4 + vaddfp c12, c12, C5 + + stvx c00, OFFSET_0, CO2 + stvx c09, OFFSET_1, CO2 + stvx c10, OFFSET_2, CO2 + stvx c11, OFFSET_3, CO2 + stvx c12, OFFSET_4, CO2 + + addi CO1, CO1, 16 * SIZE + addi CO2, CO2, 16 * SIZE + addic. I, I, -1 + bgt+ LL(11) + .align 4 + +LL(20): + andi. 
I, M, 4 + ble LL(30) + + vxor c01, c01, c01 + LOAD_A a1, OFFSET_0, AO + vxor c02, c02, c02 + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + LOAD_A a3, OFFSET_2, AO + vxor c06, c06, c06 + LOAD_A a4, OFFSET_3, AO + vxor c09, c09, c09 + LOAD_B b1, OFFSET_0, B + vxor c10, c10, c10 + LOAD_B b2, OFFSET_1, B + vxor c13, c13, c13 + vxor c14, c14, c14 + mr BO, B + vspltw bp1, b1, 0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(25) + .align 4 + +LL(22): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + addi AO, AO, 16 * SIZE + vmaddfp c02, a2, bp1, c02 + addi BO, BO, 8 * SIZE + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + LOAD_B b1, OFFSET_0, BO + vmaddfp c10, a2, bp1, c10 + + vmaddfp c13, a1, bp2, c13 + LOAD_A a1, OFFSET_0, AO + vspltw bp1, b2, 0 + vmaddfp c14, a2, bp2, c14 + LOAD_A a2, OFFSET_1, AO + + vmaddfp c01, a3, bp1, c01 + vspltw bp2, b2, 1 + vmaddfp c02, a4, bp1, c02 + + vmaddfp c05, a3, bp2, c05 + vspltw bp1, b2, 2 + vmaddfp c06, a4, bp2, c06 + + vmaddfp c09, a3, bp1, c09 + vspltw bp2, b2, 3 + LOAD_B b2, OFFSET_1, BO + vmaddfp c10, a4, bp1, c10 + + vmaddfp c13, a3, bp2, c13 + LOAD_A a3, OFFSET_2, AO + vmaddfp c14, a4, bp2, c14 + LOAD_A a4, OFFSET_3, AO + vspltw bp1, b1, 0 + bdnz LL(22) + .align 4 + +LL(25): + andi. r0, K, 1 + ble+ LL(28) + .align 4 + +LL(26): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + nop + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + nop + + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c10, a2, bp1, c10 + addi AO, AO, 8 * SIZE + + vmaddfp c13, a1, bp2, c13 + addi BO, BO, 4 * SIZE + vmaddfp c14, a2, bp2, c14 + nop + .align 4 + +LL(28): + vxor VZERO, VZERO, VZERO + + lvx swap, OFFSET_0, SP + lvx neg, OFFSET_1, SP + lvx alpha_r, OFFSET_2, SP + lvx alpha_i, OFFSET_3, SP + + vperm c05, c05, c05, swap + vperm c06, c06, c06, swap + vperm c13, c13, c13, swap + vperm c14, c14, c14, swap + + vxor c05, c05, neg + vxor c06, c06, neg + vxor c13, c13, neg + vxor c14, c14, neg + + vaddfp c01, c01, c05 + vaddfp c02, c02, c06 + vaddfp c09, c09, c13 + vaddfp c10, c10, c14 + + vperm c05, c01, c01, swap + vperm c06, c02, c02, swap + vperm c13, c09, c09, swap + vperm c14, c10, c10, swap + + vmaddfp c01, alpha_r, c01, VZERO + vmaddfp c02, alpha_r, c02, VZERO + vmaddfp c01, alpha_i, c05, c01 + vmaddfp c02, alpha_i, c06, c02 + + vmaddfp c09, alpha_r, c09, VZERO + vmaddfp c10, alpha_r, c10, VZERO + vmaddfp c09, alpha_i, c13, c09 + vmaddfp c10, alpha_i, c14, c10 + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + lvx C3, OFFSET_2, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + lvsr PERMRSHIFT2, 0, CO2 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, VZERO, PERMRSHIFT1 + + vaddfp c00, c00, C1 + vaddfp c01, c01, C2 + vaddfp c02, c02, C3 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + + lvx C1, OFFSET_0, CO2 + lvx C2, OFFSET_1, CO2 + lvx C3, OFFSET_2, CO2 + + vperm c00, VZERO, c09, PERMRSHIFT2 + vperm c09, c09, c10, PERMRSHIFT2 + vperm c10, c10, VZERO, PERMRSHIFT2 + + vaddfp c00, c00, C1 + vaddfp c09, c09, C2 + vaddfp c10, c10, C3 + + stvx c00, OFFSET_0, CO2 + stvx c09, OFFSET_1, CO2 + stvx c10, OFFSET_2, CO2 + + addi CO1, CO1, 8 * SIZE + addi CO2, CO2, 8 * SIZE + .align 4 + +LL(30): + andi. 
I, M, 2 + ble LL(40) + + vxor c01, c01, c01 + LOAD_A a1, OFFSET_0, AO + vxor c02, c02, c02 + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + LOAD_B b1, OFFSET_0, B + vxor c06, c06, c06 + LOAD_B b2, OFFSET_1, B + vxor c09, c09, c09 + vxor c10, c10, c10 + vxor c13, c13, c13 + vxor c14, c14, c14 + + vspltw bp1, b1, 0 + mr BO, B + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(35) + .align 4 + +LL(32): + vmaddfp c01, a1, bp1, c01 + addi AO, AO, 8 * SIZE + vspltw bp2, b1, 1 + vmaddfp c05, a1, bp2, c05 + addi BO, BO, 8 * SIZE + vspltw bp1, b1, 2 + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c13, a1, bp2, c13 + LOAD_A a1, OFFSET_0, AO + vspltw bp1, b2, 0 + LOAD_B b1, OFFSET_0, BO + + vmaddfp c02, a2, bp1, c02 + vspltw bp2, b2, 1 + vmaddfp c06, a2, bp2, c06 + vspltw bp1, b2, 2 + vmaddfp c10, a2, bp1, c10 + vspltw bp2, b2, 3 + LOAD_B b2, OFFSET_1, BO + vmaddfp c14, a2, bp2, c14 + LOAD_A a2, OFFSET_1, AO + + vspltw bp1, b1, 0 + bdnz LL(32) + .align 4 + +LL(35): + andi. r0, K, 1 + ble+ LL(38) + .align 4 + +LL(36): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c13, a1, bp2, c13 + addi AO, AO, 4 * SIZE + addi BO, BO, 4 * SIZE + .align 4 + +LL(38): + vaddfp c01, c01, c02 + vaddfp c05, c05, c06 + vaddfp c09, c09, c10 + vaddfp c13, c13, c14 + + vxor VZERO, VZERO, VZERO + + lvx swap, OFFSET_0, SP + lvx neg, OFFSET_1, SP + lvx alpha_r, OFFSET_2, SP + lvx alpha_i, OFFSET_3, SP + + vperm c05, c05, c05, swap + vperm c13, c13, c13, swap + + vxor c05, c05, neg + vxor c13, c13, neg + + vaddfp c01, c01, c05 + vaddfp c09, c09, c13 + + vperm c05, c01, c01, swap + vperm c13, c09, c09, swap + + vmaddfp c01, alpha_r, c01, VZERO + vmaddfp c01, alpha_i, c05, c01 + + vmaddfp c09, alpha_r, c09, VZERO + vmaddfp c09, alpha_i, c13, c09 + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + lvsr PERMRSHIFT2, 0, CO2 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, VZERO, PERMRSHIFT1 + + vaddfp c00, c00, C1 + vaddfp c01, c01, C2 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + + lvx C1, OFFSET_0, CO2 + lvx C2, OFFSET_1, CO2 + + vperm c00, VZERO, c09, PERMRSHIFT2 + vperm c09, c09, VZERO, PERMRSHIFT2 + + vaddfp c00, c00, C1 + vaddfp c09, c09, C2 + + stvx c00, OFFSET_0, CO2 + stvx c09, OFFSET_1, CO2 + + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + .align 4 + +LL(40): + andi. I, M, 1 + ble LL(49) + + mr BO, B + + LFD f8, 0 * SIZE(AO) + LFD f9, 1 * SIZE(AO) + + LFD f10, 0 * SIZE(BO) + LFD f11, 1 * SIZE(BO) + LFD f12, 2 * SIZE(BO) + LFD f13, 3 * SIZE(BO) + + lfs f0, FZERO(SP) + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. 
r0, K, 1 + mtspr CTR, r0 + ble LL(45) + .align 4 + +LL(42): + fmadd f0, f8, f10, f0 + fmadd f2, f8, f11, f2 + fmadd f4, f8, f12, f4 + fmadd f6, f8, f13, f6 + + fmadd f1, f9, f10, f1 + fmadd f3, f9, f11, f3 + fmadd f5, f9, f12, f5 + fmadd f7, f9, f13, f7 + + LFD f8, 2 * SIZE(AO) + LFD f9, 3 * SIZE(AO) + + LFD f10, 4 * SIZE(BO) + LFD f11, 5 * SIZE(BO) + LFD f12, 6 * SIZE(BO) + LFD f13, 7 * SIZE(BO) + + fmadd f0, f8, f10, f0 + fmadd f2, f8, f11, f2 + fmadd f4, f8, f12, f4 + fmadd f6, f8, f13, f6 + + fmadd f1, f9, f10, f1 + fmadd f3, f9, f11, f3 + fmadd f5, f9, f12, f5 + fmadd f7, f9, f13, f7 + + LFD f8, 4 * SIZE(AO) + LFD f9, 5 * SIZE(AO) + + LFD f10, 8 * SIZE(BO) + LFD f11, 9 * SIZE(BO) + LFD f12, 10 * SIZE(BO) + LFD f13, 11 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(42) + .align 4 + +LL(45): + andi. r0, K, 1 + ble LL(48) + .align 4 + +LL(46): + fmadd f0, f8, f10, f0 + fmadd f2, f8, f11, f2 + fmadd f4, f8, f12, f4 + fmadd f6, f8, f13, f6 + + fmadd f1, f9, f10, f1 + fmadd f3, f9, f11, f3 + fmadd f5, f9, f12, f5 + fmadd f7, f9, f13, f7 + + addi AO, AO, 2 * SIZE + addi BO, BO, 4 * SIZE + .align 4 + +LL(48): +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + fsub f0, f0, f3 + fadd f1, f1, f2 + fsub f4, f4, f7 + fadd f5, f5, f6 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + fadd f0, f0, f3 + fsub f1, f1, f2 + fadd f4, f4, f7 + fsub f5, f5, f6 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + fadd f0, f0, f3 + fsub f1, f2, f1 + fadd f4, f4, f7 + fsub f5, f6, f5 +#else /* RR, RC, CR, CC */ + fsub f0, f0, f3 + fadd f1, f1, f2 + fsub f4, f4, f7 + fadd f5, f5, f6 +#endif + + LFD f8, 0 * SIZE(CO1) + LFD f9, 1 * SIZE(CO1) + LFD f10, 0 * SIZE(CO2) + LFD f11, 1 * SIZE(CO2) + + lfs f12, ALPHA_R + 0(SP) + lfs f13, ALPHA_I + 4(SP) + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + fmadd f8, f12, f0, f8 + fnmsub f9, f12, f1, f9 + fmadd f10, f12, f4, f10 + fnmsub f11, f12, f5, f11 + + fmadd f8, f13, f1, f8 + fmadd f9, f13, f0, f9 + fmadd f10, f13, f5, f10 + fmadd f11, f13, f4, f11 +#else + fmadd f8, f12, f0, f8 + fmadd f9, f12, f1, f9 + fmadd f10, f12, f4, f10 + fmadd f11, f12, f5, f11 + + fnmsub f8, f13, f1, f8 + fmadd f9, f13, f0, f9 + fnmsub f10, f13, f5, f10 + fmadd f11, f13, f4, f11 +#endif + + STFD f8, 0 * SIZE(CO1) + STFD f9, 1 * SIZE(CO1) + STFD f10, 0 * SIZE(CO2) + STFD f11, 1 * SIZE(CO2) + +LL(49): + mr B, BO + + addic. J, J, -1 + bgt LL(01) + .align 4 + +LL(50): + andi. J, N, 1 + ble LL(999) + + mr CO1, C + mr AO, A + + srawi. I, M, 3 + ble LL(70) + .align 4 + +LL(61): + vxor c01, c01, c01 + LOAD_B b1, OFFSET_0, B + vxor c02, c02, c02 + vxor c03, c03, c03 + LOAD_A a1, OFFSET_0, AO + vxor c04, c04, c04 + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + LOAD_A a3, OFFSET_2, AO + vxor c06, c06, c06 + LOAD_A a4, OFFSET_3, AO + vxor c07, c07, c07 + vxor c08, c08, c08 + + mr BO, B + dcbtst CO1, PREC + dcbtst CO2, PREC + + vspltw bp1, b1, 0 + + srawi. 
r0, K, 1 + mtspr CTR, r0 + ble LL(65) + .align 4 + +LL(62): + LOAD_A a5, OFFSET_4, AO + LOAD_A a6, OFFSET_5, AO + LOAD_A a7, OFFSET_6, AO + LOAD_A a8, OFFSET_7, AO + + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + vmaddfp c03, a3, bp1, c03 + vmaddfp c04, a4, bp1, c04 + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + vmaddfp c07, a3, bp2, c07 + vmaddfp c08, a4, bp2, c08 + + vmaddfp c01, a5, bp1, c01 + vspltw bp2, b1, 3 + vmaddfp c02, a6, bp1, c02 + vmaddfp c03, a7, bp1, c03 + vmaddfp c04, a8, bp1, c04 + + LOAD_B b1, OFFSET_1, BO + vspltw bp1, b1, 0 + + vmaddfp c05, a5, bp2, c05 + vmaddfp c06, a6, bp2, c06 + vmaddfp c07, a7, bp2, c07 + vmaddfp c08, a8, bp2, c08 + + addi AO, AO, 32 * SIZE + addi BO, BO, 4 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + bdnz LL(62) + .align 4 + +LL(65): + andi. r0, K, 1 + ble+ LL(68) + .align 4 + +LL(66): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + addi AO, AO, 16 * SIZE + vmaddfp c03, a3, bp1, c03 + addi BO, BO, 2 * SIZE + vmaddfp c04, a4, bp1, c04 + nop + + vmaddfp c05, a1, bp2, c05 + vmaddfp c06, a2, bp2, c06 + vmaddfp c07, a3, bp2, c07 + vmaddfp c08, a4, bp2, c08 + .align 4 + +LL(68): + vxor VZERO, VZERO, VZERO + + lvx swap, OFFSET_0, SP + lvx neg, OFFSET_1, SP + lvx alpha_r, OFFSET_2, SP + lvx alpha_i, OFFSET_3, SP + + vperm c05, c05, c05, swap + vperm c06, c06, c06, swap + vperm c07, c07, c07, swap + vperm c08, c08, c08, swap + + vxor c05, c05, neg + vxor c06, c06, neg + vxor c07, c07, neg + vxor c08, c08, neg + + vaddfp c01, c01, c05 + vaddfp c02, c02, c06 + vaddfp c03, c03, c07 + vaddfp c04, c04, c08 + + vperm c05, c01, c01, swap + vperm c06, c02, c02, swap + vperm c07, c03, c03, swap + vperm c08, c04, c04, swap + + vmaddfp c01, alpha_r, c01, VZERO + vmaddfp c02, alpha_r, c02, VZERO + vmaddfp c03, alpha_r, c03, VZERO + vmaddfp c04, alpha_r, c04, VZERO + + vmaddfp c01, alpha_i, c05, c01 + vmaddfp c02, alpha_i, c06, c02 + vmaddfp c03, alpha_i, c07, c03 + vmaddfp c04, alpha_i, c08, c04 + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + lvx C3, OFFSET_2, CO1 + lvx C4, OFFSET_3, CO1 + lvx C5, OFFSET_4, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, c03, PERMRSHIFT1 + vperm c03, c03, c04, PERMRSHIFT1 + vperm c04, c04, VZERO, PERMRSHIFT1 + + vaddfp c00, c00, C1 + vaddfp c01, c01, C2 + vaddfp c02, c02, C3 + vaddfp c03, c03, C4 + vaddfp c04, c04, C5 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + stvx c03, OFFSET_3, CO1 + stvx c04, OFFSET_4, CO1 + + addi CO1, CO1, 16 * SIZE + addic. I, I, -1 + bgt+ LL(61) + .align 4 + +LL(70): + andi. I, M, 4 + ble LL(80) + + vxor c01, c01, c01 + LOAD_B b1, OFFSET_0, B + vxor c02, c02, c02 + vxor c03, c03, c03 + LOAD_A a1, OFFSET_0, AO + vxor c04, c04, c04 + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + LOAD_A a3, OFFSET_2, AO + vxor c06, c06, c06 + LOAD_A a4, OFFSET_3, AO + vxor c07, c07, c07 + vxor c08, c08, c08 + + mr BO, B + + vspltw bp1, b1, 0 + srawi. 
r0, K, 1 + mtspr CTR, r0 + ble LL(75) + .align 4 + +LL(72): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + + vmaddfp c03, a3, bp1, c03 + vspltw bp2, b1, 3 + vmaddfp c04, a4, bp1, c04 + + LOAD_B b1, OFFSET_1, BO + vspltw bp1, b1, 0 + + vmaddfp c07, a3, bp2, c07 + vmaddfp c08, a4, bp2, c08 + + addi AO, AO, 16 * SIZE + addi BO, BO, 4 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + bdnz LL(72) + .align 4 + +LL(75): + andi. r0, K, 1 + ble+ LL(78) + .align 4 + +LL(76): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + addi AO, AO, 8 * SIZE + vmaddfp c05, a1, bp2, c05 + addi BO, BO, 2 * SIZE + vmaddfp c06, a2, bp2, c06 + .align 4 + +LL(78): + vaddfp c01, c01, c03 + vaddfp c02, c02, c04 + vaddfp c05, c05, c07 + vaddfp c06, c06, c08 + + vxor VZERO, VZERO, VZERO + + lvx swap, OFFSET_0, SP + lvx neg, OFFSET_1, SP + lvx alpha_r, OFFSET_2, SP + lvx alpha_i, OFFSET_3, SP + + vperm c05, c05, c05, swap + vperm c06, c06, c06, swap + + vxor c05, c05, neg + vxor c06, c06, neg + + vaddfp c01, c01, c05 + vaddfp c02, c02, c06 + + vperm c05, c01, c01, swap + vperm c06, c02, c02, swap + + vmaddfp c01, alpha_r, c01, VZERO + vmaddfp c02, alpha_r, c02, VZERO + vmaddfp c01, alpha_i, c05, c01 + vmaddfp c02, alpha_i, c06, c02 + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + lvx C3, OFFSET_2, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, VZERO, PERMRSHIFT1 + + vaddfp c00, c00, C1 + vaddfp c01, c01, C2 + vaddfp c02, c02, C3 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + + addi CO1, CO1, 8 * SIZE + .align 4 + +LL(80): + andi. I, M, 2 + ble LL(90) + + vxor c01, c01, c01 + LOAD_B b1, OFFSET_0, B + vxor c02, c02, c02 + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + vxor c06, c06, c06 + + mr BO, B + + vspltw bp1, b1, 0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(85) + .align 4 + +LL(82): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + + vmaddfp c02, a2, bp1, c02 + vspltw bp2, b1, 3 + + LOAD_B b1, OFFSET_1, BO + vspltw bp1, b1, 0 + + vmaddfp c06, a2, bp2, c06 + + addi AO, AO, 8 * SIZE + addi BO, BO, 4 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + bdnz LL(82) + .align 4 + +LL(85): + andi. r0, K, 1 + ble+ LL(88) + .align 4 + +LL(86): + vspltw bp2, b1, 1 + vmaddfp c01, a1, bp1, c01 + vmaddfp c05, a1, bp2, c05 + addi AO, AO, 4 * SIZE + addi BO, BO, 2 * SIZE + .align 4 + +LL(88): + vaddfp c01, c01, c02 + vaddfp c05, c05, c06 + vaddfp c09, c09, c10 + vaddfp c13, c13, c14 + + vxor VZERO, VZERO, VZERO + + lvx swap, OFFSET_0, SP + lvx neg, OFFSET_1, SP + lvx alpha_r, OFFSET_2, SP + lvx alpha_i, OFFSET_3, SP + + vperm c05, c05, c05, swap + + vxor c05, c05, neg + + vaddfp c01, c01, c05 + + vperm c05, c01, c01, swap + + vmaddfp c01, alpha_r, c01, VZERO + vmaddfp c01, alpha_i, c05, c01 + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, VZERO, PERMRSHIFT1 + + vaddfp c00, c00, C1 + vaddfp c01, c01, C2 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + + addi CO1, CO1, 4 * SIZE + .align 4 + +LL(90): + andi. 
I, M, 1 + ble LL(999) + + mr BO, B + + LFD f8, 0 * SIZE(AO) + LFD f9, 1 * SIZE(AO) + + LFD f10, 0 * SIZE(BO) + LFD f11, 1 * SIZE(BO) + LFD f12, 2 * SIZE(BO) + LFD f13, 3 * SIZE(BO) + + lfs f0, FZERO(SP) + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(95) + .align 4 + +LL(92): + fmadd f0, f8, f10, f0 + fmadd f2, f8, f11, f2 + fmadd f1, f9, f10, f1 + fmadd f3, f9, f11, f3 + + LFD f8, 2 * SIZE(AO) + LFD f9, 3 * SIZE(AO) + LFD f10, 4 * SIZE(BO) + LFD f11, 5 * SIZE(BO) + + fmadd f0, f8, f12, f0 + fmadd f2, f8, f13, f2 + fmadd f1, f9, f12, f1 + fmadd f3, f9, f13, f3 + + LFD f8, 4 * SIZE(AO) + LFD f9, 5 * SIZE(AO) + LFD f12, 6 * SIZE(BO) + LFD f13, 7 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 4 * SIZE + bdnz LL(92) + .align 4 + +LL(95): + andi. r0, K, 1 + ble LL(98) + .align 4 + +LL(96): + fmadd f0, f8, f10, f0 + fmadd f2, f8, f11, f2 + fmadd f1, f9, f10, f1 + fmadd f3, f9, f11, f3 + .align 4 + +LL(98): +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + fsub f0, f0, f3 + fadd f1, f1, f2 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + fadd f0, f0, f3 + fsub f1, f1, f2 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + fadd f0, f0, f3 + fsub f1, f2, f1 +#else /* RR, RC, CR, CC */ + fsub f0, f0, f3 + fadd f1, f1, f2 +#endif + + LFD f8, 0 * SIZE(CO1) + LFD f9, 1 * SIZE(CO1) + + lfs f12, ALPHA_R + 0(SP) + lfs f13, ALPHA_I + 4(SP) + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + fmadd f8, f12, f0, f8 + fnmsub f9, f12, f1, f9 + + fmadd f8, f13, f1, f8 + fmadd f9, f13, f0, f9 +#else + fmadd f8, f12, f0, f8 + fmadd f9, f12, f1, f9 + + fnmsub f8, f13, f1, f8 + fmadd f9, f13, f0, f9 +#endif + + STFD f8, 0 * SIZE(CO1) + STFD f9, 1 * SIZE(CO1) + .align 4 + +LL(999): + mr SP, STACK + + li r0, 0 * 16 + lvx v20, SP, r0 + li r0, 1 * 16 + lvx v21, SP, r0 + li r0, 2 * 16 + lvx v22, SP, r0 + li r0, 3 * 16 + lvx v23, SP, r0 + li r0, 4 * 16 + lvx v24, SP, r0 + li r0, 5 * 16 + lvx v25, SP, r0 + li r0, 6 * 16 + lvx v26, SP, r0 + li r0, 7 * 16 + lvx v27, SP, r0 + li r0, 8 * 16 + lvx v28, SP, r0 + li r0, 9 * 16 + lvx v29, SP, r0 + li r0, 10 * 16 + lvx v30, SP, r0 + li r0, 11 * 16 + lvx v31, SP, r0 + + mtspr VRsave, VREG + +#ifdef __64BIT__ + ld r31, 192(SP) + ld r30, 200(SP) + ld r29, 208(SP) + ld r28, 216(SP) + ld r27, 224(SP) + ld r26, 232(SP) + ld r25, 240(SP) + ld r24, 248(SP) + ld r23, 256(SP) + ld r22, 264(SP) + ld r21, 272(SP) + ld r20, 280(SP) + ld r19, 288(SP) + ld r18, 296(SP) + ld r17, 304(SP) + ld r16, 312(SP) + ld r15, 320(SP) + ld r14, 328(SP) +#else + lwz r31, 192(SP) + lwz r30, 196(SP) + lwz r29, 200(SP) + lwz r28, 204(SP) + lwz r27, 208(SP) + lwz r26, 212(SP) + lwz r25, 216(SP) + lwz r24, 220(SP) + lwz r23, 224(SP) + lwz r22, 228(SP) + lwz r21, 232(SP) + lwz r20, 236(SP) + lwz r19, 240(SP) + lwz r18, 244(SP) + lwz r17, 248(SP) + lwz r16, 252(SP) + lwz r15, 256(SP) + lwz r14, 260(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/zgemm_kernel_altivec_g4.S b/kernel/power/zgemm_kernel_altivec_g4.S new file mode 100644 index 0000000000..f827348333 --- /dev/null +++ b/kernel/power/zgemm_kernel_altivec_g4.S @@ -0,0 +1,1757 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 360 +#else +#define STACKSIZE 272 +#endif + +#define ALIGN_SIZE 0xffff +#define SWAP 0 +#define NEG 16 +#define ALPHA_R 32 +#define ALPHA_I 48 +#define FZERO 64 + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#endif +#endif + +#define STACK r11 + +#define I r21 +#define J r22 +#define AO r23 +#define BO r24 +#define CO1 r25 +#define CO2 r26 + +#define PREA r29 +#define PREB r29 +#define PREC r30 +#define VREG r31 + +#define LOAD_A lvx +#define LOAD_B lvx + +#define OFFSET_0 0 +#define OFFSET_1 r14 +#define OFFSET_2 r15 +#define OFFSET_3 r16 +#define OFFSET_4 r17 +#define OFFSET_5 r18 +#define OFFSET_6 r19 +#define OFFSET_7 r20 + +#define c01 v0 +#define c02 v1 +#define c03 v2 +#define c04 v3 +#define c05 v4 +#define c06 v5 +#define c07 v6 +#define c08 v7 +#define c09 v8 +#define c10 v9 +#define c11 v10 +#define c12 v11 +#define c13 v12 +#define c14 v13 +#define c15 v14 +#define c16 v15 + +#define a1 v16 +#define a2 v17 +#define a3 v18 +#define a4 v19 +#define a5 v20 +#define a6 v21 +#define a7 v22 +#define a8 v23 + +#define b1 v24 +#define b2 v25 +#define bp1 v26 +#define bp2 v27 + +#define C1 v16 +#define C2 v17 +#define C3 v18 +#define C4 v19 +#define C5 v20 + +#define c00 v24 + +#define VZERO v25 +#define PERMRSHIFT1 
v26 +#define PERMRSHIFT2 v27 + +#define swap v28 +#define neg v29 +#define alpha_r v30 +#define alpha_i v31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + mr STACK, SP + + li r0, 0 * 16 + stvx v20, SP, r0 + li r0, 1 * 16 + stvx v21, SP, r0 + li r0, 2 * 16 + stvx v22, SP, r0 + li r0, 3 * 16 + stvx v23, SP, r0 + li r0, 4 * 16 + stvx v24, SP, r0 + li r0, 5 * 16 + stvx v25, SP, r0 + li r0, 6 * 16 + stvx v26, SP, r0 + li r0, 7 * 16 + stvx v27, SP, r0 + li r0, 8 * 16 + stvx v28, SP, r0 + li r0, 9 * 16 + stvx v29, SP, r0 + li r0, 10 * 16 + stvx v30, SP, r0 + li r0, 11 * 16 + stvx v31, SP, r0 + +#ifdef __64BIT__ + std r31, 192(SP) + std r30, 200(SP) + std r29, 208(SP) + std r28, 216(SP) + std r27, 224(SP) + std r26, 232(SP) + std r25, 240(SP) + std r24, 248(SP) + std r23, 256(SP) + std r22, 264(SP) + std r21, 272(SP) + std r20, 280(SP) + std r19, 288(SP) + std r18, 296(SP) + std r17, 304(SP) + std r16, 312(SP) + std r15, 320(SP) + std r14, 328(SP) +#else + stw r31, 192(SP) + stw r30, 196(SP) + stw r29, 200(SP) + stw r28, 204(SP) + stw r27, 208(SP) + stw r26, 212(SP) + stw r25, 216(SP) + stw r24, 220(SP) + stw r23, 224(SP) + stw r22, 228(SP) + stw r21, 232(SP) + stw r20, 236(SP) + stw r19, 240(SP) + stw r18, 244(SP) + stw r17, 248(SP) + stw r16, 252(SP) + stw r15, 256(SP) + stw r14, 260(SP) +#endif + + +#ifdef linux +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, 56 + STACKSIZE(SP) + lwz C, 60 + STACKSIZE(SP) + lwz LDC, 64 + STACKSIZE(SP) +#else + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif +#endif + + li r0, -1 + mfspr VREG, VRsave + + mtspr VRsave, r0 + + addi SP, SP, -128 + li r0, -8192 + + and SP, SP, r0 + + fneg f3, f1 + fneg f4, f2 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NC) || defined(TC) || defined(NR) || defined(TR) + stfs f1, ALPHA_R + 0(SP) + stfs f1, ALPHA_R + 4(SP) + stfs f1, ALPHA_R + 8(SP) + stfs f1, ALPHA_R + 12(SP) + + stfs f4, ALPHA_I + 0(SP) + stfs f2, ALPHA_I + 4(SP) + stfs f4, ALPHA_I + 8(SP) + stfs f2, ALPHA_I + 12(SP) +#else + stfs f1, ALPHA_R + 0(SP) + stfs f3, ALPHA_R + 4(SP) + stfs f1, ALPHA_R + 8(SP) + stfs f3, ALPHA_R + 12(SP) + + stfs f2, ALPHA_I + 0(SP) + stfs f2, ALPHA_I + 4(SP) + stfs f2, ALPHA_I + 8(SP) + stfs f2, ALPHA_I + 12(SP) +#endif + + li I, Address_L(0x04050607) + addis I, I, Address_H(0x04050607) + stw I, SWAP + 0(SP) + li I, Address_L(0x00010203) + addis I, I, Address_H(0x00010203) + stw I, SWAP + 4(SP) + li I, Address_L(0x0c0d0e0f) + addis I, I, Address_H(0x0c0d0e0f) + stw I, SWAP + 8(SP) + li I, Address_L(0x08090a0b) + addis I, I, Address_H(0x08090a0b) + stw I, SWAP + 12(SP) + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + lis I, 0x8000 + stw I, NEG + 0(SP) + stw I, NEG + 8(SP) + li I, 0 + stw I, NEG + 4(SP) + stw I, NEG + 12(SP) +#else + li I, 0 + stw I, NEG + 0(SP) + stw I, NEG + 8(SP) + lis I, 0x8000 + stw I, NEG + 4(SP) + stw I, NEG + 12(SP) +#endif + + li r0, 0 + stw r0, FZERO(SP) + + slwi LDC, LDC, ZBASE_SHIFT + + li PREC, (15 * SIZE) + li PREB, (25 * 8 * SIZE) + + li OFFSET_1, 4 * SIZE + li OFFSET_2, 8 * SIZE + li OFFSET_3, 12 * SIZE + li OFFSET_4, 16 * SIZE + li OFFSET_5, 20 * SIZE + li OFFSET_6, 24 * SIZE + li OFFSET_7, 28 * SIZE + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + + srawi. 
J, N, 1 + ble LL(50) + .align 4 + +LL(01): + mr CO1, C + add CO2, C, LDC + add C, CO2, LDC + + mr AO, A + srawi. I, M, 3 + ble LL(20) + .align 4 + +LL(11): + vxor c01, c01, c01 + LOAD_B b1, OFFSET_0, B + vxor c02, c02, c02 + LOAD_A a1, OFFSET_0, AO + vxor c03, c03, c03 + LOAD_A a2, OFFSET_1, AO + vxor c04, c04, c04 + LOAD_A a3, OFFSET_2, AO + vxor c05, c05, c05 + LOAD_A a4, OFFSET_3, AO + vxor c06, c06, c06 + LOAD_B b2, OFFSET_2, B + vxor c07, c07, c07 + LOAD_A a5, OFFSET_4, AO + vxor c08, c08, c08 + LOAD_A a6, OFFSET_5, AO + vxor c09, c09, c09 + dcbtst CO1, PREC + vxor c10, c10, c10 + dcbtst CO2, PREC + vxor c11, c11, c11 + vxor c12, c12, c12 + vxor c13, c13, c13 + mr BO, B + vxor c14, c14, c14 + srawi. r0, K, 2 + vxor c15, c15, c15 + mtspr CTR, r0 + vxor c16, c16, c16 + vspltw bp1, b1, 0 + ble LL(15) + .align 4 + +LL(12): +/* 1 */ + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + addi AO, AO, 8 * SIZE + vmaddfp c03, a3, bp1, c03 + LOAD_A a7, OFFSET_4, AO + vmaddfp c04, a4, bp1, c04 + LOAD_A a8, OFFSET_5, AO + +/* 2 */ + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + dcbt BO, PREB + vmaddfp c07, a3, bp2, c07 + dcbt AO, PREB + vmaddfp c08, a4, bp2, c08 + addi AO, AO, 8 * SIZE + +/* 3 */ + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c10, a2, bp1, c10 + LOAD_B b1, OFFSET_1, BO + vmaddfp c11, a3, bp1, c11 + dcbt AO, PREB + vmaddfp c12, a4, bp1, c12 + addi AO, AO, 8 * SIZE + +/* 4 */ + vmaddfp c13, a1, bp2, c13 + vspltw bp1, b1, 0 + vmaddfp c14, a2, bp2, c14 + LOAD_A a1, OFFSET_2, AO + vmaddfp c15, a3, bp2, c15 + dcbt AO, PREB + vmaddfp c16, a4, bp2, c16 + addi AO, AO, 8 * SIZE + +/* 5 */ + vmaddfp c01, a5, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a6, bp1, c02 + LOAD_A a2, OFFSET_1, AO + vmaddfp c03, a7, bp1, c03 + LOAD_A a3, OFFSET_2, AO + vmaddfp c04, a8, bp1, c04 + LOAD_A a4, OFFSET_3, AO + +/* 6 */ + vmaddfp c05, a5, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a6, bp2, c06 + nop + vmaddfp c07, a7, bp2, c07 + dcbt AO, PREA + vmaddfp c08, a8, bp2, c08 + addi AO, AO, 8 * SIZE + +/* 7 */ + vmaddfp c09, a5, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c10, a6, bp1, c10 + LOAD_B b1, OFFSET_4, BO + vmaddfp c11, a7, bp1, c11 + nop + vmaddfp c12, a8, bp1, c12 + nop + +/* 8 */ + vmaddfp c13, a5, bp2, c13 + vspltw bp1, b2, 0 + vmaddfp c14, a6, bp2, c14 + LOAD_A a5, OFFSET_2, AO + vmaddfp c15, a7, bp2, c15 + LOAD_A a6, OFFSET_3, AO + vmaddfp c16, a8, bp2, c16 + LOAD_A a7, OFFSET_4, AO + +/* 9 */ + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b2, 1 + vmaddfp c02, a2, bp1, c02 + LOAD_A a8, OFFSET_5, AO + vmaddfp c03, a3, bp1, c03 + addi BO, BO, 8 * SIZE + vmaddfp c04, a4, bp1, c04 + nop + +/* 10 */ + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b2, 2 + vmaddfp c06, a2, bp2, c06 + nop + vmaddfp c07, a3, bp2, c07 + nop + vmaddfp c08, a4, bp2, c08 + nop + +/* 11 */ + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b2, 3 + vmaddfp c10, a2, bp1, c10 + LOAD_B b2, OFFSET_1, BO + vmaddfp c11, a3, bp1, c11 + dcbt AO, PREA + vmaddfp c12, a4, bp1, c12 + addi AO, AO, 8 * SIZE + +/* 12 */ + vmaddfp c13, a1, bp2, c13 + vspltw bp1, b2, 0 + vmaddfp c14, a2, bp2, c14 + LOAD_A a1, OFFSET_4, AO + vmaddfp c15, a3, bp2, c15 + LOAD_A a2, OFFSET_5, AO + vmaddfp c16, a4, bp2, c16 + LOAD_A a3, OFFSET_6, AO + +/* 13 */ + vmaddfp c01, a5, bp1, c01 + vspltw bp2, b2, 1 + vmaddfp c02, a6, bp1, c02 + LOAD_A a4, OFFSET_7, AO + vmaddfp c03, a7, bp1, c03 + dcbt AO, PREA + vmaddfp c04, a8, bp1, c04 + addi AO, AO, 8 * SIZE + +/* 14 */ + vmaddfp c05, a5, bp2, c05 + vspltw bp1, b2, 2 + vmaddfp 
c06, a6, bp2, c06 + nop + vmaddfp c07, a7, bp2, c07 + dcbt AO, PREA + vmaddfp c08, a8, bp2, c08 + addi AO, AO, 8 * SIZE + +/* 15 */ + vmaddfp c09, a5, bp1, c09 + vspltw bp2, b2, 3 + vmaddfp c10, a6, bp1, c10 + LOAD_B b2, OFFSET_4, BO + vmaddfp c11, a7, bp1, c11 + dcbt AO, PREA + vmaddfp c12, a8, bp1, c12 + addi BO, BO, 8 * SIZE + +/* 16 */ + vmaddfp c13, a5, bp2, c13 + vspltw bp1, b1, 0 + vmaddfp c14, a6, bp2, c14 + LOAD_A a5, OFFSET_4, AO + vmaddfp c15, a7, bp2, c15 + LOAD_A a6, OFFSET_5, AO + vmaddfp c16, a8, bp2, c16 + bdnz+ LL(12) + .align 4 + +LL(15): + lvx swap, OFFSET_0, SP + lvx neg, OFFSET_1, SP + lvx alpha_r, OFFSET_2, SP + lvx alpha_i, OFFSET_3, SP + + andi. r0, K, 3 + mtspr CTR, r0 + ble+ LL(18) + .align 4 + +LL(16): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + nop + vmaddfp c03, a3, bp1, c03 + nop + vmaddfp c04, a4, bp1, c04 + nop + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + nop + vmaddfp c07, a3, bp2, c07 + nop + vmaddfp c08, a4, bp2, c08 + nop + + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c10, a2, bp1, c10 + LOAD_B b1, OFFSET_1, BO + vmaddfp c11, a3, bp1, c11 + addi AO, AO, 16 * SIZE + vmaddfp c12, a4, bp1, c12 + addi BO, BO, 4 * SIZE + + vmaddfp c13, a1, bp2, c13 + vspltw bp1, b1, 0 + vmaddfp c14, a2, bp2, c14 + LOAD_A a1, OFFSET_0, AO + vmaddfp c15, a3, bp2, c15 + LOAD_A a2, OFFSET_1, AO + vmaddfp c16, a4, bp2, c16 + LOAD_A a3, OFFSET_2, AO + + LOAD_A a4, OFFSET_3, AO + bdnz+ LL(16) + .align 4 + +LL(18): + vxor VZERO, VZERO, VZERO + + vperm c05, c05, c05, swap + vperm c06, c06, c06, swap + vperm c07, c07, c07, swap + vperm c08, c08, c08, swap + + vperm c13, c13, c13, swap + vperm c14, c14, c14, swap + vperm c15, c15, c15, swap + vperm c16, c16, c16, swap + + vxor c05, c05, neg + vxor c06, c06, neg + vxor c07, c07, neg + vxor c08, c08, neg + + vxor c13, c13, neg + vxor c14, c14, neg + vxor c15, c15, neg + vxor c16, c16, neg + + vaddfp c01, c01, c05 + vaddfp c02, c02, c06 + vaddfp c03, c03, c07 + vaddfp c04, c04, c08 + + vaddfp c09, c09, c13 + vaddfp c10, c10, c14 + vaddfp c11, c11, c15 + vaddfp c12, c12, c16 + + vperm c05, c01, c01, swap + vperm c06, c02, c02, swap + vperm c07, c03, c03, swap + vperm c08, c04, c04, swap + + vperm c13, c09, c09, swap + vperm c14, c10, c10, swap + vperm c15, c11, c11, swap + vperm c16, c12, c12, swap + + vmaddfp c01, alpha_r, c01, VZERO + vmaddfp c02, alpha_r, c02, VZERO + vmaddfp c03, alpha_r, c03, VZERO + vmaddfp c04, alpha_r, c04, VZERO + + vmaddfp c01, alpha_i, c05, c01 + vmaddfp c02, alpha_i, c06, c02 + vmaddfp c03, alpha_i, c07, c03 + vmaddfp c04, alpha_i, c08, c04 + + vmaddfp c09, alpha_r, c09, VZERO + vmaddfp c10, alpha_r, c10, VZERO + vmaddfp c11, alpha_r, c11, VZERO + vmaddfp c12, alpha_r, c12, VZERO + + vmaddfp c09, alpha_i, c13, c09 + vmaddfp c10, alpha_i, c14, c10 + vmaddfp c11, alpha_i, c15, c11 + vmaddfp c12, alpha_i, c16, c12 + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + lvx C3, OFFSET_2, CO1 + lvx C4, OFFSET_3, CO1 + lvx C5, OFFSET_4, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + lvsr PERMRSHIFT2, 0, CO2 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, c03, PERMRSHIFT1 + vperm c03, c03, c04, PERMRSHIFT1 + vperm c04, c04, VZERO, PERMRSHIFT1 + + vaddfp c00, c00, C1 + vaddfp c01, c01, C2 + vaddfp c02, c02, C3 + vaddfp c03, c03, C4 + vaddfp c04, c04, C5 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + stvx c03, OFFSET_3, CO1 + stvx c04, OFFSET_4, CO1 + + lvx C1, OFFSET_0, CO2 + 
lvx C2, OFFSET_1, CO2 + lvx C3, OFFSET_2, CO2 + lvx C4, OFFSET_3, CO2 + lvx C5, OFFSET_4, CO2 + + vperm c00, VZERO, c09, PERMRSHIFT2 + vperm c09, c09, c10, PERMRSHIFT2 + vperm c10, c10, c11, PERMRSHIFT2 + vperm c11, c11, c12, PERMRSHIFT2 + vperm c12, c12, VZERO, PERMRSHIFT2 + + vaddfp c00, c00, C1 + vaddfp c09, c09, C2 + vaddfp c10, c10, C3 + vaddfp c11, c11, C4 + vaddfp c12, c12, C5 + + stvx c00, OFFSET_0, CO2 + stvx c09, OFFSET_1, CO2 + stvx c10, OFFSET_2, CO2 + stvx c11, OFFSET_3, CO2 + stvx c12, OFFSET_4, CO2 + + addi CO1, CO1, 16 * SIZE + addi CO2, CO2, 16 * SIZE + addic. I, I, -1 + bgt+ LL(11) + .align 4 + +LL(20): + andi. I, M, 4 + ble LL(30) + + vxor c01, c01, c01 + LOAD_A a1, OFFSET_0, AO + vxor c02, c02, c02 + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + LOAD_A a3, OFFSET_2, AO + vxor c06, c06, c06 + LOAD_A a4, OFFSET_3, AO + vxor c09, c09, c09 + LOAD_B b1, OFFSET_0, B + vxor c10, c10, c10 + LOAD_B b2, OFFSET_1, B + vxor c13, c13, c13 + vxor c14, c14, c14 + mr BO, B + vspltw bp1, b1, 0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(25) + .align 4 + +LL(22): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + addi AO, AO, 16 * SIZE + vmaddfp c02, a2, bp1, c02 + addi BO, BO, 8 * SIZE + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + LOAD_B b1, OFFSET_0, BO + vmaddfp c10, a2, bp1, c10 + + vmaddfp c13, a1, bp2, c13 + LOAD_A a1, OFFSET_0, AO + vspltw bp1, b2, 0 + vmaddfp c14, a2, bp2, c14 + LOAD_A a2, OFFSET_1, AO + + vmaddfp c01, a3, bp1, c01 + vspltw bp2, b2, 1 + vmaddfp c02, a4, bp1, c02 + + vmaddfp c05, a3, bp2, c05 + vspltw bp1, b2, 2 + vmaddfp c06, a4, bp2, c06 + + vmaddfp c09, a3, bp1, c09 + vspltw bp2, b2, 3 + LOAD_B b2, OFFSET_1, BO + vmaddfp c10, a4, bp1, c10 + + vmaddfp c13, a3, bp2, c13 + LOAD_A a3, OFFSET_2, AO + vmaddfp c14, a4, bp2, c14 + LOAD_A a4, OFFSET_3, AO + vspltw bp1, b1, 0 + bdnz LL(22) + .align 4 + +LL(25): + andi. 
r0, K, 1 + ble+ LL(28) + .align 4 + +LL(26): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + nop + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + nop + + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c10, a2, bp1, c10 + addi AO, AO, 8 * SIZE + + vmaddfp c13, a1, bp2, c13 + addi BO, BO, 4 * SIZE + vmaddfp c14, a2, bp2, c14 + nop + .align 4 + +LL(28): + vxor VZERO, VZERO, VZERO + + lvx swap, OFFSET_0, SP + lvx neg, OFFSET_1, SP + lvx alpha_r, OFFSET_2, SP + lvx alpha_i, OFFSET_3, SP + + vperm c05, c05, c05, swap + vperm c06, c06, c06, swap + vperm c13, c13, c13, swap + vperm c14, c14, c14, swap + + vxor c05, c05, neg + vxor c06, c06, neg + vxor c13, c13, neg + vxor c14, c14, neg + + vaddfp c01, c01, c05 + vaddfp c02, c02, c06 + vaddfp c09, c09, c13 + vaddfp c10, c10, c14 + + vperm c05, c01, c01, swap + vperm c06, c02, c02, swap + vperm c13, c09, c09, swap + vperm c14, c10, c10, swap + + vmaddfp c01, alpha_r, c01, VZERO + vmaddfp c02, alpha_r, c02, VZERO + vmaddfp c01, alpha_i, c05, c01 + vmaddfp c02, alpha_i, c06, c02 + + vmaddfp c09, alpha_r, c09, VZERO + vmaddfp c10, alpha_r, c10, VZERO + vmaddfp c09, alpha_i, c13, c09 + vmaddfp c10, alpha_i, c14, c10 + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + lvx C3, OFFSET_2, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + lvsr PERMRSHIFT2, 0, CO2 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, VZERO, PERMRSHIFT1 + + vaddfp c00, c00, C1 + vaddfp c01, c01, C2 + vaddfp c02, c02, C3 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + + lvx C1, OFFSET_0, CO2 + lvx C2, OFFSET_1, CO2 + lvx C3, OFFSET_2, CO2 + + vperm c00, VZERO, c09, PERMRSHIFT2 + vperm c09, c09, c10, PERMRSHIFT2 + vperm c10, c10, VZERO, PERMRSHIFT2 + + vaddfp c00, c00, C1 + vaddfp c09, c09, C2 + vaddfp c10, c10, C3 + + stvx c00, OFFSET_0, CO2 + stvx c09, OFFSET_1, CO2 + stvx c10, OFFSET_2, CO2 + + addi CO1, CO1, 8 * SIZE + addi CO2, CO2, 8 * SIZE + .align 4 + +LL(30): + andi. I, M, 2 + ble LL(40) + + vxor c01, c01, c01 + LOAD_A a1, OFFSET_0, AO + vxor c02, c02, c02 + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + LOAD_B b1, OFFSET_0, B + vxor c06, c06, c06 + LOAD_B b2, OFFSET_1, B + vxor c09, c09, c09 + vxor c10, c10, c10 + vxor c13, c13, c13 + vxor c14, c14, c14 + + vspltw bp1, b1, 0 + mr BO, B + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(35) + .align 4 + +LL(32): + vmaddfp c01, a1, bp1, c01 + addi AO, AO, 8 * SIZE + vspltw bp2, b1, 1 + vmaddfp c05, a1, bp2, c05 + addi BO, BO, 8 * SIZE + vspltw bp1, b1, 2 + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c13, a1, bp2, c13 + LOAD_A a1, OFFSET_0, AO + vspltw bp1, b2, 0 + LOAD_B b1, OFFSET_0, BO + + vmaddfp c02, a2, bp1, c02 + vspltw bp2, b2, 1 + vmaddfp c06, a2, bp2, c06 + vspltw bp1, b2, 2 + vmaddfp c10, a2, bp1, c10 + vspltw bp2, b2, 3 + LOAD_B b2, OFFSET_1, BO + vmaddfp c14, a2, bp2, c14 + LOAD_A a2, OFFSET_1, AO + + vspltw bp1, b1, 0 + bdnz LL(32) + .align 4 + +LL(35): + andi. 
r0, K, 1 + ble+ LL(38) + .align 4 + +LL(36): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c09, a1, bp1, c09 + vspltw bp2, b1, 3 + vmaddfp c13, a1, bp2, c13 + addi AO, AO, 4 * SIZE + addi BO, BO, 4 * SIZE + .align 4 + +LL(38): + vaddfp c01, c01, c02 + vaddfp c05, c05, c06 + vaddfp c09, c09, c10 + vaddfp c13, c13, c14 + + vxor VZERO, VZERO, VZERO + + lvx swap, OFFSET_0, SP + lvx neg, OFFSET_1, SP + lvx alpha_r, OFFSET_2, SP + lvx alpha_i, OFFSET_3, SP + + vperm c05, c05, c05, swap + vperm c13, c13, c13, swap + + vxor c05, c05, neg + vxor c13, c13, neg + + vaddfp c01, c01, c05 + vaddfp c09, c09, c13 + + vperm c05, c01, c01, swap + vperm c13, c09, c09, swap + + vmaddfp c01, alpha_r, c01, VZERO + vmaddfp c01, alpha_i, c05, c01 + + vmaddfp c09, alpha_r, c09, VZERO + vmaddfp c09, alpha_i, c13, c09 + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + lvsr PERMRSHIFT2, 0, CO2 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, VZERO, PERMRSHIFT1 + + vaddfp c00, c00, C1 + vaddfp c01, c01, C2 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + + lvx C1, OFFSET_0, CO2 + lvx C2, OFFSET_1, CO2 + + vperm c00, VZERO, c09, PERMRSHIFT2 + vperm c09, c09, VZERO, PERMRSHIFT2 + + vaddfp c00, c00, C1 + vaddfp c09, c09, C2 + + stvx c00, OFFSET_0, CO2 + stvx c09, OFFSET_1, CO2 + + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + .align 4 + +LL(40): + andi. I, M, 1 + ble LL(49) + + mr BO, B + + LFD f8, 0 * SIZE(AO) + LFD f9, 1 * SIZE(AO) + + LFD f10, 0 * SIZE(BO) + LFD f11, 1 * SIZE(BO) + LFD f12, 2 * SIZE(BO) + LFD f13, 3 * SIZE(BO) + + lfs f0, FZERO(SP) + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(45) + .align 4 + +LL(42): + fmadd f0, f8, f10, f0 + fmadd f2, f8, f11, f2 + fmadd f4, f8, f12, f4 + fmadd f6, f8, f13, f6 + + fmadd f1, f9, f10, f1 + fmadd f3, f9, f11, f3 + fmadd f5, f9, f12, f5 + fmadd f7, f9, f13, f7 + + LFD f8, 2 * SIZE(AO) + LFD f9, 3 * SIZE(AO) + + LFD f10, 4 * SIZE(BO) + LFD f11, 5 * SIZE(BO) + LFD f12, 6 * SIZE(BO) + LFD f13, 7 * SIZE(BO) + + fmadd f0, f8, f10, f0 + fmadd f2, f8, f11, f2 + fmadd f4, f8, f12, f4 + fmadd f6, f8, f13, f6 + + fmadd f1, f9, f10, f1 + fmadd f3, f9, f11, f3 + fmadd f5, f9, f12, f5 + fmadd f7, f9, f13, f7 + + LFD f8, 4 * SIZE(AO) + LFD f9, 5 * SIZE(AO) + + LFD f10, 8 * SIZE(BO) + LFD f11, 9 * SIZE(BO) + LFD f12, 10 * SIZE(BO) + LFD f13, 11 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(42) + .align 4 + +LL(45): + andi. 
r0, K, 1 + ble LL(48) + .align 4 + +LL(46): + fmadd f0, f8, f10, f0 + fmadd f2, f8, f11, f2 + fmadd f4, f8, f12, f4 + fmadd f6, f8, f13, f6 + + fmadd f1, f9, f10, f1 + fmadd f3, f9, f11, f3 + fmadd f5, f9, f12, f5 + fmadd f7, f9, f13, f7 + + addi AO, AO, 2 * SIZE + addi BO, BO, 4 * SIZE + .align 4 + +LL(48): +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + fsub f0, f0, f3 + fadd f1, f1, f2 + fsub f4, f4, f7 + fadd f5, f5, f6 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + fadd f0, f0, f3 + fsub f1, f1, f2 + fadd f4, f4, f7 + fsub f5, f5, f6 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + fadd f0, f0, f3 + fsub f1, f2, f1 + fadd f4, f4, f7 + fsub f5, f6, f5 +#else /* RR, RC, CR, CC */ + fsub f0, f0, f3 + fadd f1, f1, f2 + fsub f4, f4, f7 + fadd f5, f5, f6 +#endif + + LFD f8, 0 * SIZE(CO1) + LFD f9, 1 * SIZE(CO1) + LFD f10, 0 * SIZE(CO2) + LFD f11, 1 * SIZE(CO2) + + lfs f12, ALPHA_R + 0(SP) + lfs f13, ALPHA_I + 4(SP) + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + fmadd f8, f12, f0, f8 + fnmsub f9, f12, f1, f9 + fmadd f10, f12, f4, f10 + fnmsub f11, f12, f5, f11 + + fmadd f8, f13, f1, f8 + fmadd f9, f13, f0, f9 + fmadd f10, f13, f5, f10 + fmadd f11, f13, f4, f11 +#else + fmadd f8, f12, f0, f8 + fmadd f9, f12, f1, f9 + fmadd f10, f12, f4, f10 + fmadd f11, f12, f5, f11 + + fnmsub f8, f13, f1, f8 + fmadd f9, f13, f0, f9 + fnmsub f10, f13, f5, f10 + fmadd f11, f13, f4, f11 +#endif + + STFD f8, 0 * SIZE(CO1) + STFD f9, 1 * SIZE(CO1) + STFD f10, 0 * SIZE(CO2) + STFD f11, 1 * SIZE(CO2) + +LL(49): + mr B, BO + + addic. J, J, -1 + bgt LL(01) + .align 4 + +LL(50): + andi. J, N, 1 + ble LL(999) + + mr CO1, C + mr AO, A + + srawi. I, M, 3 + ble LL(70) + .align 4 + +LL(61): + vxor c01, c01, c01 + LOAD_B b1, OFFSET_0, B + vxor c02, c02, c02 + vxor c03, c03, c03 + LOAD_A a1, OFFSET_0, AO + vxor c04, c04, c04 + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + LOAD_A a3, OFFSET_2, AO + vxor c06, c06, c06 + LOAD_A a4, OFFSET_3, AO + vxor c07, c07, c07 + vxor c08, c08, c08 + + mr BO, B + dcbtst CO1, PREC + dcbtst CO2, PREC + + vspltw bp1, b1, 0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(65) + .align 4 + +LL(62): + LOAD_A a5, OFFSET_4, AO + LOAD_A a6, OFFSET_5, AO + LOAD_A a7, OFFSET_6, AO + LOAD_A a8, OFFSET_7, AO + + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + vmaddfp c03, a3, bp1, c03 + vmaddfp c04, a4, bp1, c04 + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + vmaddfp c07, a3, bp2, c07 + vmaddfp c08, a4, bp2, c08 + + vmaddfp c01, a5, bp1, c01 + vspltw bp2, b1, 3 + vmaddfp c02, a6, bp1, c02 + vmaddfp c03, a7, bp1, c03 + vmaddfp c04, a8, bp1, c04 + + LOAD_B b1, OFFSET_1, BO + vspltw bp1, b1, 0 + + vmaddfp c05, a5, bp2, c05 + vmaddfp c06, a6, bp2, c06 + vmaddfp c07, a7, bp2, c07 + vmaddfp c08, a8, bp2, c08 + + addi AO, AO, 32 * SIZE + addi BO, BO, 4 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + bdnz LL(62) + .align 4 + +LL(65): + andi. 
r0, K, 1 + ble+ LL(68) + .align 4 + +LL(66): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + addi AO, AO, 16 * SIZE + vmaddfp c03, a3, bp1, c03 + addi BO, BO, 2 * SIZE + vmaddfp c04, a4, bp1, c04 + nop + + vmaddfp c05, a1, bp2, c05 + vmaddfp c06, a2, bp2, c06 + vmaddfp c07, a3, bp2, c07 + vmaddfp c08, a4, bp2, c08 + .align 4 + +LL(68): + vxor VZERO, VZERO, VZERO + + lvx swap, OFFSET_0, SP + lvx neg, OFFSET_1, SP + lvx alpha_r, OFFSET_2, SP + lvx alpha_i, OFFSET_3, SP + + vperm c05, c05, c05, swap + vperm c06, c06, c06, swap + vperm c07, c07, c07, swap + vperm c08, c08, c08, swap + + vxor c05, c05, neg + vxor c06, c06, neg + vxor c07, c07, neg + vxor c08, c08, neg + + vaddfp c01, c01, c05 + vaddfp c02, c02, c06 + vaddfp c03, c03, c07 + vaddfp c04, c04, c08 + + vperm c05, c01, c01, swap + vperm c06, c02, c02, swap + vperm c07, c03, c03, swap + vperm c08, c04, c04, swap + + vmaddfp c01, alpha_r, c01, VZERO + vmaddfp c02, alpha_r, c02, VZERO + vmaddfp c03, alpha_r, c03, VZERO + vmaddfp c04, alpha_r, c04, VZERO + + vmaddfp c01, alpha_i, c05, c01 + vmaddfp c02, alpha_i, c06, c02 + vmaddfp c03, alpha_i, c07, c03 + vmaddfp c04, alpha_i, c08, c04 + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + lvx C3, OFFSET_2, CO1 + lvx C4, OFFSET_3, CO1 + lvx C5, OFFSET_4, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, c03, PERMRSHIFT1 + vperm c03, c03, c04, PERMRSHIFT1 + vperm c04, c04, VZERO, PERMRSHIFT1 + + vaddfp c00, c00, C1 + vaddfp c01, c01, C2 + vaddfp c02, c02, C3 + vaddfp c03, c03, C4 + vaddfp c04, c04, C5 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + stvx c03, OFFSET_3, CO1 + stvx c04, OFFSET_4, CO1 + + addi CO1, CO1, 16 * SIZE + addic. I, I, -1 + bgt+ LL(61) + .align 4 + +LL(70): + andi. I, M, 4 + ble LL(80) + + vxor c01, c01, c01 + LOAD_B b1, OFFSET_0, B + vxor c02, c02, c02 + vxor c03, c03, c03 + LOAD_A a1, OFFSET_0, AO + vxor c04, c04, c04 + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + LOAD_A a3, OFFSET_2, AO + vxor c06, c06, c06 + LOAD_A a4, OFFSET_3, AO + vxor c07, c07, c07 + vxor c08, c08, c08 + + mr BO, B + + vspltw bp1, b1, 0 + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(75) + .align 4 + +LL(72): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + vmaddfp c06, a2, bp2, c06 + + vmaddfp c03, a3, bp1, c03 + vspltw bp2, b1, 3 + vmaddfp c04, a4, bp1, c04 + + LOAD_B b1, OFFSET_1, BO + vspltw bp1, b1, 0 + + vmaddfp c07, a3, bp2, c07 + vmaddfp c08, a4, bp2, c08 + + addi AO, AO, 16 * SIZE + addi BO, BO, 4 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + LOAD_A a3, OFFSET_2, AO + LOAD_A a4, OFFSET_3, AO + bdnz LL(72) + .align 4 + +LL(75): + andi. 
r0, K, 1 + ble+ LL(78) + .align 4 + +LL(76): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + vmaddfp c02, a2, bp1, c02 + addi AO, AO, 8 * SIZE + vmaddfp c05, a1, bp2, c05 + addi BO, BO, 2 * SIZE + vmaddfp c06, a2, bp2, c06 + .align 4 + +LL(78): + vaddfp c01, c01, c03 + vaddfp c02, c02, c04 + vaddfp c05, c05, c07 + vaddfp c06, c06, c08 + + vxor VZERO, VZERO, VZERO + + lvx swap, OFFSET_0, SP + lvx neg, OFFSET_1, SP + lvx alpha_r, OFFSET_2, SP + lvx alpha_i, OFFSET_3, SP + + vperm c05, c05, c05, swap + vperm c06, c06, c06, swap + + vxor c05, c05, neg + vxor c06, c06, neg + + vaddfp c01, c01, c05 + vaddfp c02, c02, c06 + + vperm c05, c01, c01, swap + vperm c06, c02, c02, swap + + vmaddfp c01, alpha_r, c01, VZERO + vmaddfp c02, alpha_r, c02, VZERO + vmaddfp c01, alpha_i, c05, c01 + vmaddfp c02, alpha_i, c06, c02 + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + lvx C3, OFFSET_2, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, c02, PERMRSHIFT1 + vperm c02, c02, VZERO, PERMRSHIFT1 + + vaddfp c00, c00, C1 + vaddfp c01, c01, C2 + vaddfp c02, c02, C3 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + stvx c02, OFFSET_2, CO1 + + addi CO1, CO1, 8 * SIZE + .align 4 + +LL(80): + andi. I, M, 2 + ble LL(90) + + vxor c01, c01, c01 + LOAD_B b1, OFFSET_0, B + vxor c02, c02, c02 + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + vxor c05, c05, c05 + vxor c06, c06, c06 + + mr BO, B + + vspltw bp1, b1, 0 + + srawi. r0, K, 1 + mtspr CTR, r0 + ble LL(85) + .align 4 + +LL(82): + vmaddfp c01, a1, bp1, c01 + vspltw bp2, b1, 1 + + vmaddfp c05, a1, bp2, c05 + vspltw bp1, b1, 2 + + vmaddfp c02, a2, bp1, c02 + vspltw bp2, b1, 3 + + LOAD_B b1, OFFSET_1, BO + vspltw bp1, b1, 0 + + vmaddfp c06, a2, bp2, c06 + + addi AO, AO, 8 * SIZE + addi BO, BO, 4 * SIZE + + LOAD_A a1, OFFSET_0, AO + LOAD_A a2, OFFSET_1, AO + bdnz LL(82) + .align 4 + +LL(85): + andi. r0, K, 1 + ble+ LL(88) + .align 4 + +LL(86): + vspltw bp2, b1, 1 + vmaddfp c01, a1, bp1, c01 + vmaddfp c05, a1, bp2, c05 + addi AO, AO, 4 * SIZE + addi BO, BO, 2 * SIZE + .align 4 + +LL(88): + vaddfp c01, c01, c02 + vaddfp c05, c05, c06 + vaddfp c09, c09, c10 + vaddfp c13, c13, c14 + + vxor VZERO, VZERO, VZERO + + lvx swap, OFFSET_0, SP + lvx neg, OFFSET_1, SP + lvx alpha_r, OFFSET_2, SP + lvx alpha_i, OFFSET_3, SP + + vperm c05, c05, c05, swap + + vxor c05, c05, neg + + vaddfp c01, c01, c05 + + vperm c05, c01, c01, swap + + vmaddfp c01, alpha_r, c01, VZERO + vmaddfp c01, alpha_i, c05, c01 + + lvx C1, OFFSET_0, CO1 + lvx C2, OFFSET_1, CO1 + + lvsr PERMRSHIFT1, 0, CO1 + + vperm c00, VZERO, c01, PERMRSHIFT1 + vperm c01, c01, VZERO, PERMRSHIFT1 + + vaddfp c00, c00, C1 + vaddfp c01, c01, C2 + + stvx c00, OFFSET_0, CO1 + stvx c01, OFFSET_1, CO1 + + addi CO1, CO1, 4 * SIZE + .align 4 + +LL(90): + andi. I, M, 1 + ble LL(999) + + mr BO, B + + LFD f8, 0 * SIZE(AO) + LFD f9, 1 * SIZE(AO) + + LFD f10, 0 * SIZE(BO) + LFD f11, 1 * SIZE(BO) + LFD f12, 2 * SIZE(BO) + LFD f13, 3 * SIZE(BO) + + lfs f0, FZERO(SP) + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + srawi. 
r0, K, 1 + mtspr CTR, r0 + ble LL(95) + .align 4 + +LL(92): + fmadd f0, f8, f10, f0 + fmadd f2, f8, f11, f2 + fmadd f1, f9, f10, f1 + fmadd f3, f9, f11, f3 + + LFD f8, 2 * SIZE(AO) + LFD f9, 3 * SIZE(AO) + LFD f10, 4 * SIZE(BO) + LFD f11, 5 * SIZE(BO) + + fmadd f0, f8, f12, f0 + fmadd f2, f8, f13, f2 + fmadd f1, f9, f12, f1 + fmadd f3, f9, f13, f3 + + LFD f8, 4 * SIZE(AO) + LFD f9, 5 * SIZE(AO) + LFD f12, 6 * SIZE(BO) + LFD f13, 7 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 4 * SIZE + bdnz LL(92) + .align 4 + +LL(95): + andi. r0, K, 1 + ble LL(98) + .align 4 + +LL(96): + fmadd f0, f8, f10, f0 + fmadd f2, f8, f11, f2 + fmadd f1, f9, f10, f1 + fmadd f3, f9, f11, f3 + .align 4 + +LL(98): +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + fsub f0, f0, f3 + fadd f1, f1, f2 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + fadd f0, f0, f3 + fsub f1, f1, f2 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + fadd f0, f0, f3 + fsub f1, f2, f1 +#else /* RR, RC, CR, CC */ + fsub f0, f0, f3 + fadd f1, f1, f2 +#endif + + LFD f8, 0 * SIZE(CO1) + LFD f9, 1 * SIZE(CO1) + + lfs f12, ALPHA_R + 0(SP) + lfs f13, ALPHA_I + 4(SP) + +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + fmadd f8, f12, f0, f8 + fnmsub f9, f12, f1, f9 + + fmadd f8, f13, f1, f8 + fmadd f9, f13, f0, f9 +#else + fmadd f8, f12, f0, f8 + fmadd f9, f12, f1, f9 + + fnmsub f8, f13, f1, f8 + fmadd f9, f13, f0, f9 +#endif + + STFD f8, 0 * SIZE(CO1) + STFD f9, 1 * SIZE(CO1) + .align 4 + +LL(999): + mr SP, STACK + + li r0, 0 * 16 + lvx v20, SP, r0 + li r0, 1 * 16 + lvx v21, SP, r0 + li r0, 2 * 16 + lvx v22, SP, r0 + li r0, 3 * 16 + lvx v23, SP, r0 + li r0, 4 * 16 + lvx v24, SP, r0 + li r0, 5 * 16 + lvx v25, SP, r0 + li r0, 6 * 16 + lvx v26, SP, r0 + li r0, 7 * 16 + lvx v27, SP, r0 + li r0, 8 * 16 + lvx v28, SP, r0 + li r0, 9 * 16 + lvx v29, SP, r0 + li r0, 10 * 16 + lvx v30, SP, r0 + li r0, 11 * 16 + lvx v31, SP, r0 + + mtspr VRsave, VREG + +#ifdef __64BIT__ + ld r31, 192(SP) + ld r30, 200(SP) + ld r29, 208(SP) + ld r28, 216(SP) + ld r27, 224(SP) + ld r26, 232(SP) + ld r25, 240(SP) + ld r24, 248(SP) + ld r23, 256(SP) + ld r22, 264(SP) + ld r21, 272(SP) + ld r20, 280(SP) + ld r19, 288(SP) + ld r18, 296(SP) + ld r17, 304(SP) + ld r16, 312(SP) + ld r15, 320(SP) + ld r14, 328(SP) +#else + lwz r31, 192(SP) + lwz r30, 196(SP) + lwz r29, 200(SP) + lwz r28, 204(SP) + lwz r27, 208(SP) + lwz r26, 212(SP) + lwz r25, 216(SP) + lwz r24, 220(SP) + lwz r23, 224(SP) + lwz r22, 228(SP) + lwz r21, 232(SP) + lwz r20, 236(SP) + lwz r19, 240(SP) + lwz r18, 244(SP) + lwz r17, 248(SP) + lwz r16, 252(SP) + lwz r15, 256(SP) + lwz r14, 260(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/zgemm_kernel_cell.S b/kernel/power/zgemm_kernel_cell.S new file mode 100644 index 0000000000..f0d32048bf --- /dev/null +++ b/kernel/power/zgemm_kernel_cell.S @@ -0,0 +1,1784 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA_R 296(SP) +#define ALPHA_I 304(SP) +#define FZERO 312(SP) +#else +#define STACKSIZE 256 +#define ALPHA_R 224(SP) +#define ALPHA_I 232(SP) +#define FZERO 240(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#define OFFSET r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#define TEMP r22 +#define KK r23 +#define I r24 +#define J r25 +#define AO r26 +#define BO r27 +#define CO1 r28 +#define CO2 r29 + +#define PREA r30 +#define PREC r31 +#define PREB PREA + +#ifndef NEEDPARAM + +#ifndef DOUBLE +#include "../cparam.h" +#else +#include "../zparam.h" +#endif + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) +#ifdef TRMMKERNEL + std r23, 208(SP) + std r22, 216(SP) +#endif +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) +#ifdef TRMMKERNEL + stw r23, 176(SP) + stw r22, 180(SP) +#endif +#endif + + stfd f1, ALPHA_R + stfd f2, ALPHA_I + stw r0, FZERO + +#ifdef linux +#ifdef 
__64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, 56 + STACKSIZE(SP) + lwz C, 60 + STACKSIZE(SP) + lwz LDC, 64 + STACKSIZE(SP) +#else + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#ifdef TRMMKERNEL +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 120 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 120 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 68 + STACKSIZE(SP) +#else + lwz OFFSET, 60 + STACKSIZE(SP) +#endif +#endif +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif +#endif + + slwi LDC, LDC, ZBASE_SHIFT + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + +#ifndef PREFETCHTEST + li PREC, 3 * SIZE + li PREA, 16 * 12 * SIZE +#else + +#ifdef linux +#ifndef __64BIT__ + lwz PREA, 16 + STACKSIZE(SP) + lwz PREC, 20 + STACKSIZE(SP) +#else + ld PREA, 136 + STACKSIZE(SP) + ld PREC, 144 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld PREA, 136 + STACKSIZE(SP) + ld PREC, 144 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz PREA, 72 + STACKSIZE(SP) + lwz PREC, 76 + STACKSIZE(SP) +#else + lwz PREA, 68 + STACKSIZE(SP) + lwz PREC, 72 + STACKSIZE(SP) +#endif +#endif +#endif + +#endif + + + lfs f0, FZERO + + srawi. J, N, 1 + ble LL(30) + .align 4 + +LL(10): + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + mr CO1, C + add CO2, C, LDC + add C, CO2, LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + srawi. I, M, 1 + mr AO, A + ble LL(20) + .align 4 + +LL(11): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + + LFD f28, 4 * SIZE(B) + LFD f29, 5 * SIZE(B) + LFD f30, 6 * SIZE(B) + + PREFETCH_C1 + nop + nop + PREFETCH_C2 + + srawi. r0, K, 2 + mr BO, B + mtspr CTR, r0 + ble LL(15) +#else + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + + LFD f28, 4 * SIZE(B) + LFD f29, 5 * SIZE(B) + LFD f30, 6 * SIZE(B) + mr BO, B +#else + slwi r0, KK, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, B, r0 + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + + LFD f28, 4 * SIZE(BO) + LFD f29, 5 * SIZE(BO) + LFD f30, 6 * SIZE(BO) +#endif + + PREFETCH_C1 + PREFETCH_C2 + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + srawi. 
TEMP, TEMP, 2 + mtspr CTR, TEMP + ble LL(15) +#endif + .align 4 + +#define NOP1 mr r18, r18 +#define NOP2 mr r19, r19 + +LL(12): + FMADD f0, f16, f20, f0 + dcbt AO, PREA + FMADD f4, f16, f21, f4 + dcbt BO, PREB + FMADD f8, f16, f22, f8 + LFD f31, 7 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFD f27, 7 * SIZE(AO) + + FMADD f1, f17, f20, f1 + LFD f16, 8 * SIZE(AO) + FMADD f5, f17, f21, f5 + NOP2 + FMADD f9, f17, f22, f9 + NOP1 + FMADD f13, f17, f23, f13 + LFD f17, 9 * SIZE(AO) + + FMADD f2, f18, f20, f2 + NOP1 + FMADD f6, f18, f21, f6 + NOP2 + FMADD f10, f18, f22, f10 + NOP1 + FMADD f14, f18, f23, f14 + LFD f18, 10 * SIZE(AO) + + FMADD f3, f19, f20, f3 + LFD f20, 8 * SIZE(BO) + FMADD f7, f19, f21, f7 + LFD f21, 9 * SIZE(BO) + FMADD f11, f19, f22, f11 + LFD f22, 10 * SIZE(BO) + FMADD f15, f19, f23, f15 + LFD f19, 11 * SIZE(AO) + + FMADD f0, f24, f28, f0 + LFD f23, 11 * SIZE(BO) + FMADD f4, f24, f29, f4 + NOP2 + FMADD f8, f24, f30, f8 + NOP1 + FMADD f12, f24, f31, f12 + LFD f24, 12 * SIZE(AO) + + FMADD f1, f25, f28, f1 + NOP1 + FMADD f5, f25, f29, f5 + NOP2 + FMADD f9, f25, f30, f9 + NOP1 + FMADD f13, f25, f31, f13 + LFD f25, 13 * SIZE(AO) + + FMADD f2, f26, f28, f2 + NOP1 + FMADD f6, f26, f29, f6 + NOP2 + FMADD f10, f26, f30, f10 + NOP1 + FMADD f14, f26, f31, f14 + LFD f26, 14 * SIZE(AO) + + FMADD f3, f27, f28, f3 + LFD f28, 12 * SIZE(BO) + FMADD f7, f27, f29, f7 + LFD f29, 13 * SIZE(BO) + FMADD f11, f27, f30, f11 + LFD f30, 14 * SIZE(BO) + FMADD f15, f27, f31, f15 + LFD f27, 15 * SIZE(AO) + + FMADD f0, f16, f20, f0 + LFD f31, 15 * SIZE(BO) + FMADD f4, f16, f21, f4 + NOP2 + FMADD f8, f16, f22, f8 + NOP1 + FMADD f12, f16, f23, f12 + LFD f16, 16 * SIZE(AO) + + FMADD f1, f17, f20, f1 + NOP1 + FMADD f5, f17, f21, f5 + NOP2 + FMADD f9, f17, f22, f9 + NOP1 + FMADD f13, f17, f23, f13 + LFD f17, 17 * SIZE(AO) + + FMADD f2, f18, f20, f2 + NOP1 + FMADD f6, f18, f21, f6 + NOP2 + FMADD f10, f18, f22, f10 + NOP1 + FMADD f14, f18, f23, f14 + LFD f18, 18 * SIZE(AO) + + FMADD f3, f19, f20, f3 + LFD f20, 16 * SIZE(BO) + FMADD f7, f19, f21, f7 + LFD f21, 17 * SIZE(BO) + FMADD f11, f19, f22, f11 + LFD f22, 18 * SIZE(BO) + FMADD f15, f19, f23, f15 + LFD f19, 19 * SIZE(AO) + + FMADD f0, f24, f28, f0 + LFD f23, 19 * SIZE(BO) + FMADD f4, f24, f29, f4 + NOP2 + FMADD f8, f24, f30, f8 + NOP1 + FMADD f12, f24, f31, f12 + LFD f24, 20 * SIZE(AO) + + FMADD f1, f25, f28, f1 + NOP1 + FMADD f5, f25, f29, f5 + NOP2 + FMADD f9, f25, f30, f9 + NOP1 + FMADD f13, f25, f31, f13 + LFD f25, 21 * SIZE(AO) + + FMADD f2, f26, f28, f2 + NOP1 + FMADD f6, f26, f29, f6 + NOP2 + FMADD f10, f26, f30, f10 + NOP1 + FMADD f14, f26, f31, f14 + LFD f26, 22 * SIZE(AO) + + FMADD f3, f27, f28, f3 + LFD f28, 20 * SIZE(BO) + FMADD f7, f27, f29, f7 + LFD f29, 21 * SIZE(BO) + FMADD f11, f27, f30, f11 + LFD f30, 22 * SIZE(BO) + FMADD f15, f27, f31, f15 + addi AO, AO, 16 * SIZE + + addi BO, BO, 16 * SIZE + bdnz LL(12) + .align 4 + +LL(15): +#ifndef TRMMKERNEL + andi. r0, K, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR, r0 + ble LL(KERNEL_MainFinish) +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + andi. 
TEMP, TEMP, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR, TEMP + ble LL(KERNEL_MainFinish) +#endif + .align 4 + +LL(16): + FMADD f0, f16, f20, f0 + FMADD f5, f17, f21, f5 + FMADD f10, f18, f22, f10 + FMADD f15, f19, f23, f15 + + FMADD f1, f17, f20, f1 + FMADD f2, f18, f20, f2 + FMADD f3, f19, f20, f3 + FMADD f4, f16, f21, f4 + + FMADD f6, f18, f21, f6 + FMADD f7, f19, f21, f7 + FMADD f8, f16, f22, f8 + FMADD f9, f17, f22, f9 + + FMADD f11, f19, f22, f11 + FMADD f12, f16, f23, f12 + FMADD f13, f17, f23, f13 + FMADD f14, f18, f23, f14 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(16) + .align 4 + +LL(KERNEL_MainFinish): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(CC) || defined(CR) || defined(RC) || defined(RR) + + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 + +#ifndef TRMMKERNEL + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) +#endif + + FSUB f8, f8, f13 + FADD f9, f9, f12 + FSUB f10, f10, f15 + FADD f11, f11, f14 + +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + + FADD f0, f0, f5 + FSUB f1, f1, f4 + FADD f2, f2, f7 + FSUB f3, f3, f6 + +#ifndef TRMMKERNEL + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) +#endif + + FADD f8, f8, f13 + FSUB f9, f9, f12 + FADD f10, f10, f15 + FSUB f11, f11, f14 + +#else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */ + + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 + +#ifndef TRMMKERNEL + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) +#endif + + FADD f8, f8, f13 + FSUB f9, f12, f9 + FADD f10, f10, f15 + FSUB f11, f14, f11 + +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FMADD f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FMADD f19, f30, f3, f19 + + FMADD f20, f30, f8, f20 + FMADD f21, f30, f9, f21 + FMADD f22, f30, f10, f22 + FMADD f23, f30, f11, f23 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f1 + FMUL f18, f30, f2 + FMUL f19, f30, f3 + + FMUL f20, f30, f8 + FMUL f21, f30, f9 + FMUL f22, f30, f10 + FMUL f23, f30, f11 +#endif + + FNMSUB f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FNMSUB f18, f31, f3, f18 + FMADD f19, f31, f2, f19 + + FNMSUB f20, f31, f9, f20 + FMADD f21, f31, f8, f21 + FNMSUB f22, f31, f11, f22 + FMADD f23, f31, f10, f23 + +#else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ + /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ + /* defined(RC)|| defined(RR) */ + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FNMSUB f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FNMSUB f19, f30, f3, f19 + + FMADD f20, f30, f8, f20 + FNMSUB f21, f30, f9, f21 + FMADD f22, f30, f10, f22 + FNMSUB f23, f30, f11, f23 + + FMADD f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FMADD f18, f31, f3, f18 + FMADD f19, f31, f2, f19 + + FMADD f20, f31, f9, f20 + FMADD f21, f31, f8, f21 + FMADD f22, f31, f11, f22 + FMADD f23, f31, f10, f23 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f1 + FMUL f18, f30, f2 + FMUL f19, f30, f3 + + FMUL f20, f30, f8 + FMUL 
f21, f30, f9 + FMUL f22, f30, f10 + FMUL f23, f30, f11 + + FMADD f16, f31, f1, f16 + FNMADD f17, f31, f0, f17 + FMADD f18, f31, f3, f18 + FNMADD f19, f31, f2, f19 + + FMADD f20, f31, f9, f20 + FNMADD f21, f31, f8, f21 + FMADD f22, f31, f11, f22 + FNMADD f23, f31, f10, f23 +#endif +#endif + + STFD f16, 0 * SIZE(CO1) + STFD f17, 1 * SIZE(CO1) + STFD f18, 2 * SIZE(CO1) + STFD f19, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + STFD f20, 0 * SIZE(CO2) + STFD f21, 1 * SIZE(CO2) + STFD f22, 2 * SIZE(CO2) + STFD f23, 3 * SIZE(CO2) + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -2 +#endif + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + + addic. I, I, -1 + bgt LL(11) + .align 4 + +LL(20): + andi. I, M, 1 + ble LL(29) + +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, K, 2 + mr BO, B + mtspr CTR, r0 + ble LL(25) +#else +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + srawi. 
TEMP, TEMP, 2 + mtspr CTR, TEMP + ble LL(25) +#endif + .align 4 + +LL(22): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi BO, BO, 16 * SIZE + addi AO, AO, 8 * SIZE + bdnz LL(22) + .align 4 + +LL(25): +#ifndef TRMMKERNEL + andi. r0, K, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR, r0 + ble LL(27) +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + andi. 
TEMP, TEMP, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR, TEMP + ble LL(27) +#endif + .align 4 + +LL(26): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + addi AO, AO, 2 * SIZE + addi BO, BO, 4 * SIZE + bdnz LL(26) + .align 4 + +LL(27): +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(CC) || defined(CR) || defined(RC) || defined(RR) + + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 + +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 + +#else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */ + + FADD f0, f0, f5 + FSUB f1, f1, f4 + FADD f2, f2, f7 + FSUB f3, f3, f6 + +#endif + +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + + LFD f18, 0 * SIZE(CO2) + LFD f19, 1 * SIZE(CO2) +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FMADD f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FMADD f19, f30, f3, f19 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f1 + FMUL f18, f30, f2 + FMUL f19, f30, f3 +#endif + + FNMSUB f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FNMSUB f18, f31, f3, f18 + FMADD f19, f31, f2, f19 + + +#else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ + /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ + /* defined(RC)|| defined(RR) */ + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FNMSUB f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FNMSUB f19, f30, f3, f19 + + FMADD f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FMADD f18, f31, f3, f18 + FMADD f19, f31, f2, f19 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f1 + FMUL f18, f30, f2 + FMUL f19, f30, f3 + + FMADD f16, f31, f1, f16 + FNMADD f17, f31, f0, f17 + FMADD f18, f31, f3, f18 + FNMADD f19, f31, f2, f19 +#endif +#endif + + STFD f16, 0 * SIZE(CO1) + STFD f17, 1 * SIZE(CO1) + STFD f18, 0 * SIZE(CO2) + STFD f19, 1 * SIZE(CO2) + + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -1 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 0 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 1 +#endif +#endif + .align 4 + +LL(29): +#if defined(TRMMKERNEL) && !defined(LEFT) + addi KK, KK, 2 +#endif + + mr B, BO + addic. J, J, -1 + lfs f0, FZERO + bgt LL(10) + .align 4 + +LL(30): + andi. J, N, 1 + ble LL(999) + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + srawi. I, M, 1 + mr CO1, C + add C, C, LDC + mr AO, A + ble LL(40) + .align 4 + +LL(31): +#ifndef TRMMKERNEL + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(B) + LFD f17, 1 * SIZE(B) + LFD f18, 2 * SIZE(B) + LFD f19, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. 
r0, K, 2 + mr BO, B + mtspr CTR, r0 + ble LL(35) +#else +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(B) + LFD f17, 1 * SIZE(B) + LFD f18, 2 * SIZE(B) + LFD f19, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + mr BO, B +#else + slwi r0, KK, 1 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 +#endif + + PREFETCH_C1 + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + ble LL(35) +#endif + .align 4 + +LL(32): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 8 * SIZE(AO) + LFD f21, 9 * SIZE(AO) + LFD f22, 10 * SIZE(AO) + LFD f23, 11 * SIZE(AO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f24, 12 * SIZE(AO) + LFD f25, 13 * SIZE(AO) + LFD f26, 14 * SIZE(AO) + LFD f27, 15 * SIZE(AO) + + LFD f16, 4 * SIZE(BO) + LFD f17, 5 * SIZE(BO) + LFD f18, 6 * SIZE(BO) + LFD f19, 7 * SIZE(BO) + + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 16 * SIZE(AO) + LFD f21, 17 * SIZE(AO) + LFD f22, 18 * SIZE(AO) + LFD f23, 19 * SIZE(AO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f24, 20 * SIZE(AO) + LFD f25, 21 * SIZE(AO) + LFD f26, 22 * SIZE(AO) + LFD f27, 23 * SIZE(AO) + + LFD f16, 8 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 10 * SIZE(BO) + LFD f19, 11 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 8 * SIZE + dcbt PREA, AO + dcbt PREA, BO + bdnz LL(32) + .align 4 + +LL(35): +#ifndef TRMMKERNEL + andi. r0, K, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR, r0 + ble LL(37) +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + andi. 
TEMP, TEMP, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR, TEMP + ble LL(37) +#endif + .align 4 + +LL(36): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + LFD f16, 2 * SIZE(BO) + LFD f17, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(36) + .align 4 + +LL(37): +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(CC) || defined(CR) || defined(RC) || defined(RR) + + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 + +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + + FADD f0, f0, f5 + FSUB f1, f1, f4 + FADD f2, f2, f7 + FSUB f3, f3, f6 + +#else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */ + + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 + +#endif + +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FMADD f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FMADD f19, f30, f3, f19 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f1 + FMUL f18, f30, f2 + FMUL f19, f30, f3 +#endif + + FNMSUB f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FNMSUB f18, f31, f3, f18 + FMADD f19, f31, f2, f19 + +#else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ + /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ + /* defined(RC)|| defined(RR) */ + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FNMSUB f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FNMSUB f19, f30, f3, f19 + + FMADD f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FMADD f18, f31, f3, f18 + FMADD f19, f31, f2, f19 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f1 + FMUL f18, f30, f2 + FMUL f19, f30, f3 + + FMADD f16, f31, f1, f16 + FNMADD f17, f31, f0, f17 + FMADD f18, f31, f3, f18 + FNMADD f19, f31, f2, f19 +#endif + +#endif + + STFD f16, 0 * SIZE(CO1) + STFD f17, 1 * SIZE(CO1) + STFD f18, 2 * SIZE(CO1) + STFD f19, 3 * SIZE(CO1) + + addi CO1, CO1, 4 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -1 +#endif + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + + addic. I, I, -1 + bgt LL(31) + .align 4 + +LL(40): + andi. I, M, 1 + ble LL(999) + +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. 
r0, K, 2 + mr BO, B + mtspr CTR, r0 + ble LL(45) +#else +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + mr BO, B +#else + slwi r0, KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + ble LL(45) +#endif + .align 4 + +LL(42): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f17, f20, f2 + fmadd f3, f16, f21, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + + fmadd f4, f18, f22, f4 + fmadd f5, f19, f23, f5 + fmadd f6, f19, f22, f6 + fmadd f7, f18, f23, f7 + + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f17, f20, f2 + fmadd f3, f16, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + + fmadd f4, f18, f22, f4 + fmadd f5, f19, f23, f5 + fmadd f6, f19, f22, f6 + fmadd f7, f18, f23, f7 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(42) + .align 4 + +LL(45): + fadd f0, f0, f4 + fadd f1, f1, f5 + fadd f2, f2, f6 + fadd f3, f3, f7 + +#ifndef TRMMKERNEL + andi. r0, K, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR,r0 + ble LL(47) +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + andi. 
TEMP, TEMP, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR,TEMP + ble LL(47) +#endif + .align 4 + +LL(46): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f17, f20, f2 + fmadd f3, f16, f21, f3 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 2 * SIZE + + bdnz LL(46) + .align 4 + +LL(47): +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(CC) || defined(CR) || defined(RC) || defined(RR) + fsub f0, f0, f1 + fadd f2, f2, f3 +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + fadd f0, f0, f1 + fsub f2, f2, f3 +#else + fadd f0, f0, f1 + fsub f2, f3, f2 +#endif + +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FMADD f17, f30, f2, f17 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f2 +#endif + + FNMSUB f16, f31, f2, f16 + FMADD f17, f31, f0, f17 + +#else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ + /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ + /* defined(RC) || defined(RR) */ + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FNMSUB f17, f30, f2, f17 + + FMADD f16, f31, f2, f16 + FMADD f17, f31, f0, f17 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f2 + + FMADD f16, f31, f2, f16 + FNMADD f17, f31, f0, f17 +#endif + +#endif + STFD f16, 0 * SIZE(CO1) + STFD f17, 1 * SIZE(CO1) + .align 4 + +LL(999): + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) +#ifdef TRMMKERNEL + ld r23, 208(SP) + ld r22, 216(SP) +#endif +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) +#ifdef TRMMKERNEL + lwz r23, 176(SP) + lwz r22, 180(SP) +#endif +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/zgemm_kernel_g4.S b/kernel/power/zgemm_kernel_g4.S new file mode 100644 index 0000000000..c652adf8a5 --- /dev/null +++ b/kernel/power/zgemm_kernel_g4.S @@ -0,0 +1,1637 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA_R 296(SP) +#define ALPHA_I 304(SP) +#define FZERO 312(SP) +#else +#define STACKSIZE 256 +#define ALPHA_R 224(SP) +#define ALPHA_I 232(SP) +#define FZERO 240(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#define OFFSET r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#define TEMP r22 +#define KK r23 +#define I r24 +#define J r25 +#define AO r26 +#define BO r27 +#define CO1 r28 +#define CO2 r29 + +#define PREA r30 +#define PREC r31 + +#define A1 f16 +#define A2 f17 +#define A3 f18 +#define A4 f19 +#define A5 f20 +#define A6 f21 +#define B1 f22 +#define B2 f23 +#define B3 f24 +#define B4 f25 +#define B5 f26 +#define B6 f27 +#define B7 f28 +#define B8 f29 +#define B9 f30 +#define B10 f31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) +#ifdef TRMMKERNEL + std r23, 208(SP) + std r22, 216(SP) +#endif +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) +#ifdef TRMMKERNEL + stw r23, 176(SP) + stw r22, 180(SP) +#endif +#endif + + stfd f1, ALPHA_R + stfd f2, ALPHA_I + stw r0, FZERO + +#ifdef linux +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#endif +#endif + +#if 
defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, 56 + STACKSIZE(SP) + lwz C, 60 + STACKSIZE(SP) + lwz LDC, 64 + STACKSIZE(SP) +#else + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#ifdef TRMMKERNEL +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 120 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 120 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 68 + STACKSIZE(SP) +#else + lwz OFFSET, 60 + STACKSIZE(SP) +#endif +#endif +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif +#endif + + slwi LDC, LDC, ZBASE_SHIFT + li PREA, 8 * 8 * SIZE + li PREC, 3 * SIZE + + cmpwi cr0, M, 0 + ble .L999 + cmpwi cr0, N, 0 + ble .L999 + cmpwi cr0, K, 0 + ble .L999 + + lfs f0, FZERO + + srawi. J, N, 1 + ble .L30 + .align 4 + +.L10: + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + mr CO1, C + add CO2, C, LDC + add C, CO2, LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + srawi. I, M, 1 + mr AO, A + ble .L20 + .align 4 + +.L11: +#ifndef TRMMKERNEL + LFD A1, 0 * SIZE(AO) + LFD A2, 1 * SIZE(AO) + LFD A3, 2 * SIZE(AO) + LFDU A5, 4 * SIZE(AO) + + LFD B1, 0 * SIZE(B) + LFD B2, 1 * SIZE(B) + LFD B3, 2 * SIZE(B) + LFD B4, 3 * SIZE(B) + + dcbtst CO1, PREC + dcbtst CO2, PREC + + srawi. r0, K, 1 + mr BO, B + mtspr CTR, r0 + ble .L15 +#else +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + LFD A1, 0 * SIZE(AO) + LFD A2, 1 * SIZE(AO) + LFD A3, 2 * SIZE(AO) + LFDU A5, 4 * SIZE(AO) + + LFD B1, 0 * SIZE(B) + LFD B2, 1 * SIZE(B) + LFD B3, 2 * SIZE(B) + LFD B4, 3 * SIZE(B) + mr BO, B + +#else + slwi r0, KK, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, B, r0 + + LFD A1, 0 * SIZE(AO) + LFD A2, 1 * SIZE(AO) + LFD A3, 2 * SIZE(AO) + LFDU A5, 4 * SIZE(AO) + + LFD B1, 0 * SIZE(BO) + LFD B2, 1 * SIZE(BO) + LFD B3, 2 * SIZE(BO) + LFD B4, 3 * SIZE(BO) + +#endif + + dcbtst CO1, PREC + dcbtst CO2, PREC + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + srawi. 
TEMP, TEMP, 1 + mtspr CTR, TEMP + ble .L15 +#endif + .align 4 + +.L12: + FMADD f0, A1, B1, f0 + dcbt AO, PREA + FMADD f4, A1, B2, f4 + LFDU B5, 4 * SIZE(BO) + + FMADD f8, A1, B3, f8 + dcbt BO, PREA + FMADD f12, A1, B4, f12 + LFD A4, -1 * SIZE(AO) + + FMADD f1, A2, B1, f1 + nop + FMADD f5, A2, B2, f5 + LFD B6, 1 * SIZE(BO) + + FMADD f9, A2, B3, f9 + LFDU A1, 4 * SIZE(AO) + FMADD f13, A2, B4, f13 + nop + + FMADD f2, A3, B1, f2 + nop + FMADD f6, A3, B2, f6 + LFD B7, 2 * SIZE(BO) + + FMADD f10, A3, B3, f10 + LFD A2, -3 * SIZE(AO) + FMADD f14, A3, B4, f14 + nop + + FMADD f3, A4, B1, f3 + nop + FMADD f7, A4, B2, f7 + LFD B8, 3 * SIZE(BO) + + FMADD f11, A4, B3, f11 + LFD A3, -2 * SIZE(AO) + FMADD f15, A4, B4, f15 + nop + + FMADD f0, A5, B5, f0 +#ifdef DOUBLE + dcbt AO, PREA +#else + nop +#endif + FMADD f4, A5, B6, f4 + LFDU B1, 4 * SIZE(BO) + + FMADD f8, A5, B7, f8 +#ifdef DOUBLE + dcbt BO, PREA +#else + nop +#endif + FMADD f12, A5, B8, f12 + LFD A4, -1 * SIZE(AO) + + FMADD f1, A2, B5, f1 + nop + FMADD f5, A2, B6, f5 + LFD B2, 1 * SIZE(BO) + + FMADD f9, A2, B7, f9 + LFDU A5, 4 * SIZE(AO) + FMADD f13, A2, B8, f13 + nop + + FMADD f2, A3, B5, f2 + nop + FMADD f6, A3, B6, f6 + LFD B3, 2 * SIZE(BO) + + FMADD f10, A3, B7, f10 + LFD A2, -3 * SIZE(AO) + FMADD f14, A3, B8, f14 + nop + + FMADD f3, A4, B5, f3 + nop + FMADD f7, A4, B6, f7 + LFD B4, 3 * SIZE(BO) + + FMADD f11, A4, B7, f11 + LFD A3, -2 * SIZE(AO) + FMADD f15, A4, B8, f15 + bdnz .L12 + .align 4 + .align 4 + +.L15: + addi AO, AO, -4 * SIZE + +#ifndef TRMMKERNEL + andi. r0, K, 1 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + ble .LKERNEL_MainFinish +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + andi. 
TEMP, TEMP, 1 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + ble .LKERNEL_MainFinish +#endif + .align 4 + +.L16: + FMADD f0, A1, B1, f0 + LFD A4, 3 * SIZE(AO) + FMADD f4, A1, B2, f4 + FMADD f8, A1, B3, f8 + FMADD f12, A1, B4, f12 + + FMADD f1, A2, B1, f1 + FMADD f5, A2, B2, f5 + FMADD f9, A2, B3, f9 + FMADD f13, A2, B4, f13 + + FMADD f2, A3, B1, f2 + FMADD f6, A3, B2, f6 + FMADD f10, A3, B3, f10 + FMADD f14, A3, B4, f14 + + FMADD f3, A4, B1, f3 + FMADD f7, A4, B2, f7 + FMADD f11, A4, B3, f11 + addi AO, AO, 4 * SIZE + FMADD f15, A4, B4, f15 + addi BO, BO, 4 * SIZE + .align 4 + +.LKERNEL_MainFinish: +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(CC) || defined(CR) || defined(RC) || defined(RR) + + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 + +#ifndef TRMMKERNEL + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) +#endif + + FSUB f8, f8, f13 + FADD f9, f9, f12 + FSUB f10, f10, f15 + FADD f11, f11, f14 + +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + + FADD f0, f0, f5 + FSUB f1, f1, f4 + FADD f2, f2, f7 + FSUB f3, f3, f6 + +#ifndef TRMMKERNEL + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) +#endif + + FADD f8, f8, f13 + FSUB f9, f9, f12 + FADD f10, f10, f15 + FSUB f11, f11, f14 + +#else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */ + + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 + +#ifndef TRMMKERNEL + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) +#endif + + FADD f8, f8, f13 + FSUB f9, f12, f9 + FADD f10, f10, f15 + FSUB f11, f14, f11 + +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FMADD f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FMADD f19, f30, f3, f19 + + FMADD f20, f30, f8, f20 + FMADD f21, f30, f9, f21 + FMADD f22, f30, f10, f22 + FMADD f23, f30, f11, f23 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f1 + FMUL f18, f30, f2 + FMUL f19, f30, f3 + + FMUL f20, f30, f8 + FMUL f21, f30, f9 + FMUL f22, f30, f10 + FMUL f23, f30, f11 +#endif + + FNMSUB f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FNMSUB f18, f31, f3, f18 + FMADD f19, f31, f2, f19 + + FNMSUB f20, f31, f9, f20 + FMADD f21, f31, f8, f21 + FNMSUB f22, f31, f11, f22 + FMADD f23, f31, f10, f23 + +#else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ + /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ + /* defined(RC)|| defined(RR) */ + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FNMSUB f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FNMSUB f19, f30, f3, f19 + + FMADD f20, f30, f8, f20 + FNMSUB f21, f30, f9, f21 + FMADD f22, f30, f10, f22 + FNMSUB f23, f30, f11, f23 + + FMADD f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FMADD f18, f31, f3, f18 + FMADD f19, f31, f2, f19 + + FMADD f20, f31, f9, f20 + FMADD f21, f31, f8, f21 + FMADD f22, f31, f11, f22 + FMADD f23, f31, f10, f23 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f1 + FMUL f18, f30, f2 + FMUL f19, f30, f3 + + FMUL f20, f30, f8 + FMUL f21, f30, f9 + FMUL f22, f30, f10 + FMUL f23, f30, f11 + + FMADD f16, f31, f1, f16 + FNMADD f17, f31, f0, f17 + FMADD f18, f31, f3, f18 + FNMADD f19, f31, f2, f19 + + FMADD f20, f31, f9, f20 + FNMADD f21, f31, f8, f21 + FMADD f22, f31, f11, f22 + 
FNMADD f23, f31, f10, f23 +#endif +#endif + + STFD f16, 0 * SIZE(CO1) + STFD f17, 1 * SIZE(CO1) + STFD f18, 2 * SIZE(CO1) + STFD f19, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + STFD f20, 0 * SIZE(CO2) + STFD f21, 1 * SIZE(CO2) + STFD f22, 2 * SIZE(CO2) + STFD f23, 3 * SIZE(CO2) + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -2 +#endif + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + + addic. I, I, -1 + bgt .L11 + .align 4 + +.L20: + andi. I, M, 1 + ble .L29 + +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, K, 2 + mr BO, B + mtspr CTR, r0 + ble .L25 +#else +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + srawi. 
TEMP, TEMP, 2 + mtspr CTR, TEMP + ble .L25 +#endif + .align 4 + +.L22: + fmadd f0, f16, f20, f0 + LFD f27, 7 * SIZE(BO) + fmadd f1, f16, f21, f1 + LFD f19, 3 * SIZE(AO) + fmadd f2, f16, f22, f2 + nop + fmadd f3, f16, f23, f3 + LFD f16, 4 * SIZE(AO) + + fmadd f4, f17, f20, f4 + LFD f20, 8 * SIZE(BO) + fmadd f5, f17, f21, f5 + LFD f21, 9 * SIZE(BO) + fmadd f6, f17, f22, f6 + LFD f22, 10 * SIZE(BO) + fmadd f7, f17, f23, f7 + LFD f23, 11 * SIZE(BO) + + fmadd f0, f18, f24, f0 + LFD f17, 5 * SIZE(AO) + fmadd f1, f18, f25, f1 + nop + fmadd f2, f18, f26, f2 + nop + fmadd f3, f18, f27, f3 + LFD f18, 6 * SIZE(AO) + + fmadd f4, f19, f24, f4 + LFD f24, 12 * SIZE(BO) + fmadd f5, f19, f25, f5 + LFD f25, 13 * SIZE(BO) + fmadd f6, f19, f26, f6 + LFD f26, 14 * SIZE(BO) + fmadd f7, f19, f27, f7 + LFD f27, 15 * SIZE(BO) + + fmadd f0, f16, f20, f0 + LFD f19, 7 * SIZE(AO) + fmadd f1, f16, f21, f1 + nop + fmadd f2, f16, f22, f2 + nop + fmadd f3, f16, f23, f3 + LFDU f16, 8 * SIZE(AO) + + fmadd f4, f17, f20, f4 + LFDU f20, 16 * SIZE(BO) + fmadd f5, f17, f21, f5 + LFD f21, 1 * SIZE(BO) + fmadd f6, f17, f22, f6 + LFD f22, 2 * SIZE(BO) + fmadd f7, f17, f23, f7 + LFD f23, 3 * SIZE(BO) + + fmadd f0, f18, f24, f0 + LFD f17, 1 * SIZE(AO) + fmadd f1, f18, f25, f1 + nop + fmadd f2, f18, f26, f2 + nop + fmadd f3, f18, f27, f3 + LFD f18, 2 * SIZE(AO) + + fmadd f4, f19, f24, f4 + LFD f24, 4 * SIZE(BO) + fmadd f5, f19, f25, f5 + LFD f25, 5 * SIZE(BO) + fmadd f6, f19, f26, f6 + LFD f26, 6 * SIZE(BO) + fmadd f7, f19, f27, f7 + bdnz .L22 + .align 4 + +.L25: +#ifndef TRMMKERNEL + andi. r0, K, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR, r0 + ble .L27 +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + andi. 
TEMP, TEMP, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR, TEMP + ble .L27 +#endif + .align 4 + +.L26: + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + LFDU f16, 2 * SIZE(AO) + + fmadd f4, f17, f20, f4 + LFDU f20, 4 * SIZE(BO) + fmadd f5, f17, f21, f5 + LFD f21, 1 * SIZE(BO) + fmadd f6, f17, f22, f6 + LFD f22, 2 * SIZE(BO) + fmadd f7, f17, f23, f7 + LFD f23, 3 * SIZE(BO) + LFD f17, 1 * SIZE(AO) + bdnz .L26 + .align 4 + +.L27: +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(CC) || defined(CR) || defined(RC) || defined(RR) + + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 + +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 + +#else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */ + + FADD f0, f0, f5 + FSUB f1, f1, f4 + FADD f2, f2, f7 + FSUB f3, f3, f6 + +#endif + +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + + LFD f18, 0 * SIZE(CO2) + LFD f19, 1 * SIZE(CO2) +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FMADD f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FMADD f19, f30, f3, f19 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f1 + FMUL f18, f30, f2 + FMUL f19, f30, f3 +#endif + + FNMSUB f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FNMSUB f18, f31, f3, f18 + FMADD f19, f31, f2, f19 + + +#else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ + /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ + /* defined(RC)|| defined(RR) */ + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FNMSUB f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FNMSUB f19, f30, f3, f19 + + FMADD f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FMADD f18, f31, f3, f18 + FMADD f19, f31, f2, f19 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f1 + FMUL f18, f30, f2 + FMUL f19, f30, f3 + + FMADD f16, f31, f1, f16 + FNMADD f17, f31, f0, f17 + FMADD f18, f31, f3, f18 + FNMADD f19, f31, f2, f19 +#endif +#endif + + STFD f16, 0 * SIZE(CO1) + STFD f17, 1 * SIZE(CO1) + STFD f18, 0 * SIZE(CO2) + STFD f19, 1 * SIZE(CO2) + + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -1 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 0 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 1 +#endif +#endif + .align 4 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi KK, KK, 2 +#endif + + mr B, BO + addic. J, J, -1 + lfs f0, FZERO + bgt .L10 + .align 4 + +.L30: + andi. J, N, 1 + ble .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + srawi. I, M, 1 + mr CO1, C + add C, C, LDC + mr AO, A + ble .L40 + .align 4 + +.L31: +#ifndef TRMMKERNEL + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(B) + LFD f17, 1 * SIZE(B) + LFD f18, 2 * SIZE(B) + LFD f19, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. 
r0, K, 2 + mr BO, B + mtspr CTR, r0 + ble .L35 +#else +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(B) + LFD f17, 1 * SIZE(B) + LFD f18, 2 * SIZE(B) + LFD f19, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + mr BO, B +#else + slwi r0, KK, 1 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + ble .L35 +#endif + .align 4 + +.L32: + fmadd f0, f16, f20, f0 + LFD f27, 7 * SIZE(AO) + fmadd f1, f16, f21, f1 + LFD f19, 3 * SIZE(BO) + fmadd f2, f16, f22, f2 + nop + fmadd f3, f16, f23, f3 + LFD f16, 4 * SIZE(BO) + + fmadd f4, f17, f20, f4 + LFD f20, 8 * SIZE(AO) + fmadd f5, f17, f21, f5 + LFD f21, 9 * SIZE(AO) + fmadd f6, f17, f22, f6 + LFD f22, 10 * SIZE(AO) + fmadd f7, f17, f23, f7 + LFD f23, 11 * SIZE(AO) + + fmadd f0, f18, f24, f0 + LFD f17, 5 * SIZE(BO) + fmadd f1, f18, f25, f1 + nop + fmadd f2, f18, f26, f2 + nop + fmadd f3, f18, f27, f3 + LFD f18, 6 * SIZE(BO) + + fmadd f4, f19, f24, f4 + LFD f24, 12 * SIZE(AO) + fmadd f5, f19, f25, f5 + LFD f25, 13 * SIZE(AO) + fmadd f6, f19, f26, f6 + LFD f26, 14 * SIZE(AO) + fmadd f7, f19, f27, f7 + LFD f27, 15 * SIZE(AO) + + fmadd f0, f16, f20, f0 + LFD f19, 7 * SIZE(BO) + fmadd f1, f16, f21, f1 + nop + fmadd f2, f16, f22, f2 + nop + fmadd f3, f16, f23, f3 + LFDU f16, 8 * SIZE(BO) + + fmadd f4, f17, f20, f4 + LFDU f20, 16 * SIZE(AO) + fmadd f5, f17, f21, f5 + LFD f21, 1 * SIZE(AO) + fmadd f6, f17, f22, f6 + LFD f22, 2 * SIZE(AO) + fmadd f7, f17, f23, f7 + LFD f23, 3 * SIZE(AO) + + fmadd f0, f18, f24, f0 + LFD f17, 1 * SIZE(BO) + fmadd f1, f18, f25, f1 + nop + fmadd f2, f18, f26, f2 + nop + fmadd f3, f18, f27, f3 + LFD f18, 2 * SIZE(BO) + + fmadd f4, f19, f24, f4 + LFD f24, 4 * SIZE(AO) + fmadd f5, f19, f25, f5 + LFD f25, 5 * SIZE(AO) + fmadd f6, f19, f26, f6 + LFD f26, 6 * SIZE(AO) + fmadd f7, f19, f27, f7 + bdnz .L32 + .align 4 + +.L35: +#ifndef TRMMKERNEL + andi. r0, K, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR, r0 + ble .L37 +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + andi. 
TEMP, TEMP, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR, TEMP + ble .L37 +#endif + .align 4 + +.L36: + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + LFDU f16, 2 * SIZE(BO) + + fmadd f4, f17, f20, f4 + LFDU f20, 4 * SIZE(AO) + fmadd f5, f17, f21, f5 + LFD f21, 1 * SIZE(AO) + fmadd f6, f17, f22, f6 + LFD f22, 2 * SIZE(AO) + fmadd f7, f17, f23, f7 + LFD f23, 3 * SIZE(AO) + LFD f17, 1 * SIZE(BO) + bdnz .L36 + .align 4 + +.L37: +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(CC) || defined(CR) || defined(RC) || defined(RR) + + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 + +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + + FADD f0, f0, f5 + FSUB f1, f1, f4 + FADD f2, f2, f7 + FSUB f3, f3, f6 + +#else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */ + + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 + +#endif + +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FMADD f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FMADD f19, f30, f3, f19 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f1 + FMUL f18, f30, f2 + FMUL f19, f30, f3 +#endif + + FNMSUB f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FNMSUB f18, f31, f3, f18 + FMADD f19, f31, f2, f19 + +#else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ + /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ + /* defined(RC)|| defined(RR) */ + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FNMSUB f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FNMSUB f19, f30, f3, f19 + + FMADD f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FMADD f18, f31, f3, f18 + FMADD f19, f31, f2, f19 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f1 + FMUL f18, f30, f2 + FMUL f19, f30, f3 + + FMADD f16, f31, f1, f16 + FNMADD f17, f31, f0, f17 + FMADD f18, f31, f3, f18 + FNMADD f19, f31, f2, f19 +#endif + +#endif + + STFD f16, 0 * SIZE(CO1) + STFD f17, 1 * SIZE(CO1) + STFD f18, 2 * SIZE(CO1) + STFD f19, 3 * SIZE(CO1) + + addi CO1, CO1, 4 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -1 +#endif + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + + addic. I, I, -1 + bgt .L31 + .align 4 + +.L40: + andi. I, M, 1 + ble .L999 + +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. 
r0, K, 2 + mr BO, B + mtspr CTR, r0 + ble .L45 +#else +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + mr BO, B +#else + slwi r0, KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + ble .L45 +#endif + .align 4 + +.L42: + fmadd f0, f16, f20, f0 + LFD f23, 3 * SIZE(BO) + fmadd f3, f16, f21, f3 + LFD f16, 4 * SIZE(AO) + fmadd f2, f17, f20, f2 + LFD f20, 4 * SIZE(BO) + fmadd f1, f17, f21, f1 + LFD f17, 5 * SIZE(AO) + + fmadd f4, f18, f22, f4 + LFD f21, 5 * SIZE(BO) + fmadd f7, f18, f23, f7 + LFD f18, 6 * SIZE(AO) + fmadd f6, f19, f22, f6 + LFD f22, 6 * SIZE(BO) + fmadd f5, f19, f23, f5 + LFD f19, 7 * SIZE(AO) + + fmadd f0, f16, f20, f0 + LFD f23, 7 * SIZE(BO) + fmadd f3, f16, f21, f3 + LFDU f16, 8 * SIZE(AO) + fmadd f2, f17, f20, f2 + LFDU f20, 8 * SIZE(BO) + fmadd f1, f17, f21, f1 + LFD f17, 1 * SIZE(AO) + + fmadd f4, f18, f22, f4 + LFD f21, 1 * SIZE(BO) + fmadd f7, f18, f23, f7 + LFD f18, 2 * SIZE(AO) + fmadd f6, f19, f22, f6 + LFD f22, 2 * SIZE(BO) + fmadd f5, f19, f23, f5 + LFD f19, 3 * SIZE(AO) + bdnz .L42 + .align 4 + +.L45: + fadd f0, f0, f4 + fadd f1, f1, f5 + fadd f2, f2, f6 + fadd f3, f3, f7 + +#ifndef TRMMKERNEL + andi. r0, K, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR,r0 + ble .L47 +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + andi. 
TEMP, TEMP, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR,TEMP + ble .L47 +#endif + .align 4 + +.L46: + fmadd f0, f16, f20, f0 + fmadd f3, f16, f21, f3 + LFDU f16, 2 * SIZE(AO) + fmadd f2, f17, f20, f2 + LFDU f20, 2 * SIZE(BO) + fmadd f1, f17, f21, f1 + LFD f17, 1 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + bdnz .L46 + .align 4 + +.L47: +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(CC) || defined(CR) || defined(RC) || defined(RR) + fsub f0, f0, f1 + fadd f2, f2, f3 +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + fadd f0, f0, f1 + fsub f2, f2, f3 +#else + fadd f0, f0, f1 + fsub f2, f3, f2 +#endif + +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FMADD f17, f30, f2, f17 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f2 +#endif + + FNMSUB f16, f31, f2, f16 + FMADD f17, f31, f0, f17 + +#else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ + /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ + /* defined(RC) || defined(RR) */ + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FNMSUB f17, f30, f2, f17 + + FMADD f16, f31, f2, f16 + FMADD f17, f31, f0, f17 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f2 + + FMADD f16, f31, f2, f16 + FNMADD f17, f31, f0, f17 +#endif + +#endif + STFD f16, 0 * SIZE(CO1) + STFD f17, 1 * SIZE(CO1) + .align 4 + +.L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) +#ifdef TRMMKERNEL + ld r23, 208(SP) + ld r22, 216(SP) +#endif +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) +#ifdef TRMMKERNEL + lwz r23, 176(SP) + lwz r22, 180(SP) +#endif +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/zgemm_kernel_hummer.S b/kernel/power/zgemm_kernel_hummer.S new file mode 100644 index 0000000000..7378950e87 --- /dev/null +++ b/kernel/power/zgemm_kernel_hummer.S @@ -0,0 +1,4428 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#undef ZERO + +#define ALPHA 0 +#define FZERO 16 + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#endif + +#define TEMP r11 +#define KK r14 +#define INCM1 r15 +#define INCM3 r16 +#define INCM5 r17 +#define INCM7 r18 +#define INC2 r19 +#define INC r20 +#define INC4 r21 + +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define AO2 r26 +#define BO2 r27 + +#define CO1 r28 +#define CO2 r29 +#define ZERO r31 + +#ifndef NEEDPARAM + +#define A1 f16 +#define A2 f17 +#define A3 f18 +#define A4 f19 +#define A5 f20 +#define A6 f21 +#define A7 f22 +#define A8 f23 +#define A9 f24 +#define A10 f25 + +#define B1 f26 +#define B2 f27 +#define B3 f28 +#define B4 f29 +#define B5 f30 +#define B6 f31 + +#define AP B6 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define FXCPMADD fxcpmadd +#define FXCSMADD fxcxnpma +#else +#define FXCPMADD fxcpnsma +#define FXCSMADD fxcxma +#endif + + PROLOGUE + PROFCODE + + li r0, -16 + + stfpdux f14, SP, r0 + stfpdux f15, SP, r0 + stfpdux f16, SP, r0 + stfpdux f17, SP, r0 + stfpdux f18, SP, r0 + stfpdux f19, SP, r0 + stfpdux f20, SP, r0 + stfpdux f21, SP, r0 + stfpdux f22, SP, r0 + stfpdux f23, SP, r0 + stfpdux f24, SP, r0 + stfpdux f25, SP, r0 + stfpdux f26, SP, r0 + stfpdux f27, SP, r0 + stfpdux f28, SP, r0 + stfpdux f29, SP, r0 + stfpdux f30, SP, r0 + stfpdux f31, SP, r0 + + stwu r31, -4(SP) + stwu r30, -4(SP) + stwu r29, -4(SP) + stwu r28, -4(SP) + + stwu r27, -4(SP) + stwu r26, -4(SP) + stwu r25, -4(SP) + stwu r24, -4(SP) + + stwu r23, -4(SP) + stwu r22, -4(SP) + stwu r21, -4(SP) + stwu r20, -4(SP) + + stwu r19, -4(SP) + stwu r18, -4(SP) + stwu r17, -4(SP) + stwu r16, -4(SP) + + stwu r15, -4(SP) + stwu r14, -4(SP) + + li r0, 0 + stwu r0, -4(SP) + stwu r0, -4(SP) + + stfdu f2, -8(SP) + stfdu f1, -8(SP) + + slwi LDC, LDC, ZBASE_SHIFT + + cmpwi cr0, M, 0 + ble .L999 + cmpwi cr0, N, 0 + ble .L999 + cmpwi cr0, K, 0 + ble .L999 + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif + + andi. 
r0, C, 2 * SIZE - 1 + bne .L1000 + + li INC, 1 * SIZE + li INC2, 2 * SIZE + li INC4, 4 * SIZE + li INCM1, -1 * SIZE + li INCM3, -2 * SIZE + li INCM5, -4 * SIZE + li INCM7, -6 * SIZE + + addi C, C, - 2 * SIZE + srawi. J, N, 1 + ble .L50 + .align 4 + +.L10: + mr CO1, C + add CO2, C, LDC + add C, CO2, LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + addi AO, A, -4 * SIZE + + li r0, FZERO + lfpsx f0, SP, r0 + + srawi. I, M, 2 + ble .L20 + .align 4 + +.L11: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 +#else + slwi TEMP, KK, 2 + ZBASE_SHIFT + slwi r0, KK, 1 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, BO, - 4 * SIZE + fpmr f8, f0 + addi BO2, BO, 2 * SIZE + fpmr f12, f0 +#endif +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 2 +#endif + srawi. r0, TEMP, 2 + fpmr f1, f0 + mtspr CTR, r0 + ble .L14 +#else + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 + + srawi. r0, K, 2 + fpmr f1, f0 + mtspr CTR, r0 + ble .L14 +#endif + + LFPDUX A1, AO, INC4 + fpmr f5, f0 + LFPDUX A3, AO, INC4 + fpmr f9, f0 + LFPDUX B1, BO, INC4 + fpmr f13, f0 + + LFPDUX A5, AO, INC4 + fpmr f2, f0 + LFPDUX A6, AO, INC4 + fpmr f6, f0 + LFPDUX B3, BO, INC4 + fpmr f10, f0 + LFPDUX A7, AO, INC4 + fpmr f14, f0 + + LFPDUX A8, AO, INC4 + fpmr f3, f0 + LFPDUX B5, BO, INC4 + fpmr f7, f0 + LFPDUX A9, AO, INC4 + fpmr f11, f0 + LFPDUX A2, AO2, INC4 + fpmr f15, f0 + LFPDUX B2, BO2, INC4 + bdz- .L13 + .align 4 + +.L12: + +## 1 ## + FXCPMADD f0, B1, A1, f0 + nop + FXCSMADD f4, B1, A1, f4 + nop + FXCPMADD f8, B2, A1, f8 + LFPDUX B4, BO2, INC4 + FXCSMADD f12, B2, A1, f12 + LFPDUX B6, BO, INC4 + + FXCPMADD f1, B1, A2, f1 + nop + FXCSMADD f5, B1, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B2, A2, f9 + LFPDUX A10, AO, INC4 + FXCSMADD f13, B2, A2, f13 + nop + + FXCPMADD f2, B1, A3, f2 + nop + FXCSMADD f6, B1, A3, f6 + nop + FXCPMADD f10, B2, A3, f10 + nop + FXCSMADD f14, B2, A3, f14 + nop + + FXCPMADD f3, B1, A4, f3 + nop + FXCSMADD f7, B1, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B2, A4, f11 + LFPDUX A1, AO, INC4 + FXCSMADD f15, B2, A4, f15 + nop + +## 2 ## + + FXCPMADD f0, B3, A5, f0 + nop + FXCSMADD f4, B3, A5, f4 + nop + FXCPMADD f8, B4, A5, f8 + LFPDUX B2, BO2, INC4 + FXCSMADD f12, B4, A5, f12 + LFPDUX B1, BO, INC4 + + FXCPMADD f1, B3, A2, f1 + nop + FXCSMADD f5, B3, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B4, A2, f9 + LFPDUX A3, AO, INC4 + FXCSMADD f13, B4, A2, f13 + nop + + FXCPMADD f2, B3, A6, f2 + nop + FXCSMADD f6, B3, A6, f6 + nop + FXCPMADD f10, B4, A6, f10 + nop + FXCSMADD f14, B4, A6, f14 + nop + + FXCPMADD f3, B3, A4, f3 + nop + FXCSMADD f7, B3, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B4, A4, f11 + LFPDUX A5, AO, INC4 + FXCSMADD f15, B4, A4, f15 + nop + +## 3 ## + + FXCPMADD f0, B5, A7, f0 + nop + FXCSMADD f4, B5, A7, f4 + nop + FXCPMADD f8, B2, A7, f8 + LFPDUX B4, BO2, INC4 + FXCSMADD f12, B2, A7, f12 + LFPDUX B3, BO, INC4 + + FXCPMADD f1, B5, A2, f1 + nop + FXCSMADD f5, B5, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B2, A2, f9 + LFPDUX A6, AO, INC4 + FXCSMADD f13, B2, A2, f13 + nop + + FXCPMADD f2, B5, A8, f2 + nop + FXCSMADD f6, B5, A8, f6 + nop + FXCPMADD f10, B2, A8, f10 + 
nop + FXCSMADD f14, B2, A8, f14 + nop + + FXCPMADD f3, B5, A4, f3 + nop + FXCSMADD f7, B5, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B2, A4, f11 + LFPDUX A7, AO, INC4 + FXCSMADD f15, B2, A4, f15 + nop + +## 4 ## + FXCPMADD f0, B6, A9, f0 + nop + FXCSMADD f4, B6, A9, f4 + nop + FXCPMADD f8, B4, A9, f8 + LFPDUX B2, BO2, INC4 + FXCSMADD f12, B4, A9, f12 + LFPDUX B5, BO, INC4 + + FXCPMADD f1, B6, A2, f1 + nop + FXCSMADD f5, B6, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B4, A2, f9 + LFPDUX A8, AO, INC4 + FXCSMADD f13, B4, A2, f13 + nop + + FXCPMADD f2, B6, A10, f2 + nop + FXCSMADD f6, B6, A10, f6 + nop + FXCPMADD f10, B4, A10, f10 + nop + FXCSMADD f14, B4, A10, f14 + nop + + FXCPMADD f3, B6, A4, f3 + LFPDUX A2, AO2, INC4 + FXCSMADD f7, B6, A4, f7 + LFPDUX A9, AO, INC4 + FXCPMADD f11, B4, A4, f11 + nop + FXCSMADD f15, B4, A4, f15 + bdnz+ .L12 + .align 4 + +.L13: +## 1 ## + + FXCPMADD f0, B1, A1, f0 + nop + FXCSMADD f4, B1, A1, f4 + nop + FXCPMADD f8, B2, A1, f8 + LFPDUX B4, BO2, INC4 + FXCSMADD f12, B2, A1, f12 + LFPDUX B6, BO, INC4 + + FXCPMADD f1, B1, A2, f1 + nop + FXCSMADD f5, B1, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B2, A2, f9 + LFPDUX A10, AO, INC4 + FXCSMADD f13, B2, A2, f13 + nop + + FXCPMADD f2, B1, A3, f2 + nop + FXCSMADD f6, B1, A3, f6 + nop + FXCPMADD f10, B2, A3, f10 + nop + FXCSMADD f14, B2, A3, f14 + nop + + FXCPMADD f3, B1, A4, f3 + nop + FXCSMADD f7, B1, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B2, A4, f11 +#ifndef TRMMKERNEL + LFPDUX A1, CO1, INC2 +#else + nop +#endif + FXCSMADD f15, B2, A4, f15 + nop + +## 2 ## + + FXCPMADD f0, B3, A5, f0 + nop + FXCSMADD f4, B3, A5, f4 + nop + FXCPMADD f8, B4, A5, f8 + LFPDUX B2, BO2, INC4 + FXCSMADD f12, B4, A5, f12 +#ifndef TRMMKERNEL + LFPDUX B1, CO1, INC2 +#else + nop +#endif + + FXCPMADD f1, B3, A2, f1 + nop + FXCSMADD f5, B3, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B4, A2, f9 +#ifndef TRMMKERNEL + LFPDUX A3, CO1, INC2 +#else + nop +#endif + FXCSMADD f13, B4, A2, f13 + nop + + FXCPMADD f2, B3, A6, f2 + nop + FXCSMADD f6, B3, A6, f6 + nop + FXCPMADD f10, B4, A6, f10 + nop + FXCSMADD f14, B4, A6, f14 + nop + + FXCPMADD f3, B3, A4, f3 + nop + FXCSMADD f7, B3, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B4, A4, f11 +#ifndef TRMMKERNEL + LFPDUX A5, CO1, INC2 +#else + nop +#endif + FXCSMADD f15, B4, A4, f15 + nop + +## 3 ## + + FXCPMADD f0, B5, A7, f0 + nop + FXCSMADD f4, B5, A7, f4 + nop + FXCPMADD f8, B2, A7, f8 + LFPDUX B4, BO2, INC4 + FXCSMADD f12, B2, A7, f12 +#ifndef TRMMKERNEL + LFPDUX B3, CO2, INC2 +#else + nop +#endif + + FXCPMADD f1, B5, A2, f1 + nop + FXCSMADD f5, B5, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B2, A2, f9 +#ifndef TRMMKERNEL + LFPDUX A6, CO2, INC2 +#else + nop +#endif + FXCSMADD f13, B2, A2, f13 + + FXCPMADD f2, B5, A8, f2 + nop + FXCSMADD f6, B5, A8, f6 + nop + FXCPMADD f10, B2, A8, f10 + nop + FXCSMADD f14, B2, A8, f14 + nop + + FXCPMADD f3, B5, A4, f3 + nop + FXCSMADD f7, B5, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B2, A4, f11 +#ifndef TRMMKERNEL + LFPDUX A7, CO2, INC2 +#else + nop +#endif + FXCSMADD f15, B2, A4, f15 + nop + +## 4 ## + + FXCPMADD f0, B6, A9, f0 + nop + FXCSMADD f4, B6, A9, f4 + nop + FXCPMADD f8, B4, A9, f8 +#ifndef TRMMKERNEL + LFPDUX B2, CO2, INC2 +#else + nop +#endif + + FXCSMADD f12, B4, A9, f12 + + FXCPMADD f1, B6, A2, f1 + nop + FXCSMADD f5, B6, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B4, A2, f9 + nop + FXCSMADD f13, B4, A2, f13 + nop + + FXCPMADD f2, B6, A10, f2 + FXCSMADD f6, B6, A10, f6 + FXCPMADD f10, B4, A10, f10 + FXCSMADD f14, B4, A10, f14 + + 
FXCPMADD f3, B6, A4, f3 + FXCSMADD f7, B6, A4, f7 + FXCPMADD f11, B4, A4, f11 + FXCSMADD f15, B4, A4, f15 + .align 4 + +.L14: + li r0, ALPHA + lfpdx AP, SP, r0 +#ifdef TRMMKERNEL + li r0, FZERO + lfpsx f30, SP, r0 +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 2 +#endif + andi. r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L18 + + cmpwi cr0, TEMP, 3 + bgt+ .L15 +#else + andi. r0, K, 3 + mtspr CTR, r0 + ble+ .L18 + + cmpwi cr0, K, 3 + bgt+ .L15 +#endif + +#ifndef TRMMKERNEL + LFPDUX A1, CO1, INC2 + fpmr f5, f0 + LFPDUX B1, CO1, INC2 + fpmr f9, f0 + LFPDUX A3, CO1, INC2 + fpmr f13, f0 + LFPDUX A5, CO1, INC2 + fpmr f2, f0 + + LFPDUX B3, CO2, INC2 + fpmr f6, f0 + LFPDUX A6, CO2, INC2 + fpmr f10, f0 + LFPDUX A7, CO2, INC2 + fpmr f14, f0 + LFPDUX B2, CO2, INC2 + fpmr f3, f0 +#else + fpmr f5, f0 + fpmr f9, f0 + fpmr f13, f0 + fpmr f2, f0 + + fpmr f6, f0 + fpmr f10, f0 + fpmr f14, f0 + fpmr f3, f0 +#endif + + fpmr f7, f0 + fpmr f11, f0 + fpmr f15, f0 + .align 4 + +.L15: + LFPDUX A2, AO, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A10, BO, INC4 + LFPDUX B4, BO2, INC4 + bdz- .L17 + .align 4 + +.L16: + FXCPMADD f0, A10, A2, f0 + FXCSMADD f4, A10, A2, f4 + FXCPMADD f8, B4, A2, f8 + FXCSMADD f12, B4, A2, f12 + LFPDUX A2, AO, INC4 + + FXCPMADD f1, A10, A4, f1 + FXCSMADD f5, A10, A4, f5 + FXCPMADD f9, B4, A4, f9 + FXCSMADD f13, B4, A4, f13 + LFPDUX A4, AO2, INC4 + + FXCPMADD f2, A10, A2, f2 + FXCSMADD f6, A10, A2, f6 + FXCPMADD f10, B4, A2, f10 + FXCSMADD f14, B4, A2, f14 + LFPDUX A2, AO, INC4 + + FXCPMADD f3, A10, A4, f3 + FXCSMADD f7, A10, A4, f7 + LFPDUX A10, BO, INC4 + FXCPMADD f11, B4, A4, f11 + FXCSMADD f15, B4, A4, f15 + LFPDUX A4, AO2, INC4 + LFPDUX B4, BO2, INC4 + bdnz+ .L16 + .align 4 + +.L17: + FXCPMADD f0, A10, A2, f0 + FXCSMADD f4, A10, A2, f4 + FXCPMADD f8, B4, A2, f8 + FXCSMADD f12, B4, A2, f12 + LFPDUX A2, AO, INC4 + + FXCPMADD f1, A10, A4, f1 + FXCSMADD f5, A10, A4, f5 + FXCPMADD f9, B4, A4, f9 + FXCSMADD f13, B4, A4, f13 + LFPDUX A4, AO2, INC4 + + FXCPMADD f2, A10, A2, f2 + FXCSMADD f6, A10, A2, f6 + FXCPMADD f10, B4, A2, f10 + FXCSMADD f14, B4, A2, f14 + + FXCPMADD f3, A10, A4, f3 + FXCSMADD f7, A10, A4, f7 + FXCPMADD f11, B4, A4, f11 + FXCSMADD f15, B4, A4, f15 + .align 4 + +.L18: +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RN) || defined(RT) || defined(CN) || defined(CT) + fpadd f0, f0, f4 + fpadd f8, f8, f12 + fpadd f1, f1, f5 + fpadd f9, f9, f13 + + fpadd f2, f2, f6 + fpadd f10, f10, f14 + fpadd f3, f3, f7 + fpadd f11, f11, f15 +#else + fpsub f0, f0, f4 + fpsub f8, f8, f12 + fpsub f1, f1, f5 + fpsub f9, f9, f13 + + fpsub f2, f2, f6 + fpsub f10, f10, f14 + fpsub f3, f3, f7 + fpsub f11, f11, f15 +#endif + +#ifndef TRMMKERNEL + fxcpmadd A1, f0, AP, A1 + fxcpmadd B1, f1, AP, B1 + fxcpmadd A3, f2, AP, A3 + fxcpmadd A5, f3, AP, A5 + + fxcxnpma f0, f0, AP, A1 + fxcpmadd B3, f8, AP, B3 + fxcxnpma f1, f1, AP, B1 + fxcpmadd A6, f9, AP, A6 + fxcxnpma f2, f2, AP, A3 + fxcpmadd A7, f10, AP, A7 + + fxcxnpma f3, f3, AP, A5 + fxcpmadd B2, f11, AP, B2 + fxcxnpma f8, f8, AP, B3 + STFPDUX f0, CO1, INCM7 + fxcxnpma f9, f9, AP, A6 + STFPDUX f1, CO1, INC2 + fxcxnpma f10, f10, AP, A7 + STFPDUX f2, CO1, INC2 + + fxcxnpma f11, f11, AP, B2 + STFPDUX f3, CO1, INC2 + STFPDUX f8, CO2, INCM7 + STFPDUX f9, CO2, INC2 + STFPDUX f10, CO2, INC2 + STFPDUX f11, CO2, INC2 +#else + fxcpmadd f12, f0, AP, f30 + fxcpmadd f13, f1, AP, f30 + fxcpmadd f14, f2, 
AP, f30 + fxcpmadd f15, f3, AP, f30 + + fxcxnpma f0, f0, AP, f12 + fxcxnpma f1, f1, AP, f13 + fxcxnpma f2, f2, AP, f14 + fxcxnpma f3, f3, AP, f15 + + fxcpmadd f16, f8, AP, f30 + fxcpmadd f17, f9, AP, f30 + fxcpmadd f18, f10, AP, f30 + fxcpmadd f19, f11, AP, f30 + + fxcxnpma f8, f8, AP, f16 + fxcxnpma f9, f9, AP, f17 + fxcxnpma f10, f10, AP, f18 + fxcxnpma f11, f11, AP, f19 + + STFPDUX f0, CO1, INC2 + STFPDUX f1, CO1, INC2 + STFPDUX f2, CO1, INC2 + STFPDUX f3, CO1, INC2 + + STFPDUX f8, CO2, INC2 + STFPDUX f9, CO2, INC2 + STFPDUX f10, CO2, INC2 + STFPDUX f11, CO2, INC2 + +#endif + + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -4 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 2 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 4 +#endif +#endif + + addic. I, I, -1 + li r0, FZERO + + lfpsx f0, SP, r0 + bgt+ .L11 + .align 4 + +.L20: + andi. I, M, 2 + beq .L30 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 +#else + slwi TEMP, KK, 1 + ZBASE_SHIFT + slwi r0, KK, 1 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, BO, - 4 * SIZE + fpmr f8, f0 + addi BO2, BO, 2 * SIZE + fpmr f12, f0 + +#endif +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + srawi. r0, TEMP, 2 + fpmr f1, f0 + fpmr f5, f0 + fpmr f9, f0 + mtspr CTR, r0 + fpmr f13, f0 + ble .L24 +#else + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 + + srawi. 
r0, K, 2 + fpmr f1, f0 + fpmr f5, f0 + fpmr f9, f0 + mtspr CTR, r0 + fpmr f13, f0 + ble .L24 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B2, BO2, INC4 + LFPDUX A3, AO, INC4 + LFPDUX B3, BO, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX B4, BO2, INC4 + + LFPDUX A5, AO, INC4 + LFPDUX B5, BO, INC4 + LFPDUX A6, AO2, INC4 + LFPDUX B6, BO2, INC4 + LFPDUX A7, AO, INC4 + LFPDUX A9, BO, INC4 + LFPDUX A10, BO2, INC4 + bdz- .L23 + .align 4 + +.L22: + FXCPMADD f0, B1, A1, f0 + nop + FXCSMADD f4, B1, A1, f4 + LFPDUX A8, AO2, INC4 + FXCPMADD f8, B2, A1, f8 + nop + FXCSMADD f12, B2, A1, f12 + LFPDUX A1, AO, INC4 + + FXCPMADD f1, B1, A2, f1 + nop + FXCSMADD f5, B1, A2, f5 + LFPDUX B1, BO, INC4 + FXCPMADD f9, B2, A2, f9 + nop + FXCSMADD f13, B2, A2, f13 + LFPDUX B2, BO2, INC4 + + FXCPMADD f0, B3, A3, f0 + nop + FXCSMADD f4, B3, A3, f4 + LFPDUX A2, AO2, INC4 + FXCPMADD f8, B4, A3, f8 + nop + FXCSMADD f12, B4, A3, f12 + LFPDUX A3, AO, INC4 + + FXCPMADD f1, B3, A4, f1 + nop + FXCSMADD f5, B3, A4, f5 + LFPDUX B3, BO, INC4 + FXCPMADD f9, B4, A4, f9 + nop + FXCSMADD f13, B4, A4, f13 + LFPDUX B4, BO2, INC4 + + FXCPMADD f0, B5, A5, f0 + nop + FXCSMADD f4, B5, A5, f4 + LFPDUX A4, AO2, INC4 + FXCPMADD f8, B6, A5, f8 + nop + FXCSMADD f12, B6, A5, f12 + LFPDUX A5, AO, INC4 + + FXCPMADD f1, B5, A6, f1 + nop + FXCSMADD f5, B5, A6, f5 + LFPDUX B5, BO, INC4 + FXCPMADD f9, B6, A6, f9 + nop + FXCSMADD f13, B6, A6, f13 + LFPDUX B6, BO2, INC4 + + FXCPMADD f0, A9, A7, f0 + nop + FXCSMADD f4, A9, A7, f4 + LFPDUX A6, AO2, INC4 + FXCPMADD f8, A10, A7, f8 + nop + FXCSMADD f12, A10, A7, f12 + LFPDUX A7, AO, INC4 + + FXCPMADD f1, A9, A8, f1 + nop + FXCSMADD f5, A9, A8, f5 + LFPDUX A9, BO, INC4 + FXCPMADD f9, A10, A8, f9 + nop + FXCSMADD f13, A10, A8, f13 + LFPDUX A10, BO2, INC4 + bdnz+ .L22 + .align 4 + +.L23: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f4, B1, A1, f4 + LFPDUX A8, AO2, INC4 + FXCPMADD f8, B2, A1, f8 + FXCSMADD f12, B2, A1, f12 + + FXCPMADD f1, B1, A2, f1 + FXCSMADD f5, B1, A2, f5 + FXCPMADD f9, B2, A2, f9 + FXCSMADD f13, B2, A2, f13 + + FXCPMADD f0, B3, A3, f0 + FXCSMADD f4, B3, A3, f4 + FXCPMADD f8, B4, A3, f8 + FXCSMADD f12, B4, A3, f12 + + FXCPMADD f1, B3, A4, f1 + FXCSMADD f5, B3, A4, f5 + FXCPMADD f9, B4, A4, f9 + FXCSMADD f13, B4, A4, f13 + + FXCPMADD f0, B5, A5, f0 + FXCSMADD f4, B5, A5, f4 + FXCPMADD f8, B6, A5, f8 + FXCSMADD f12, B6, A5, f12 + + FXCPMADD f1, B5, A6, f1 + FXCSMADD f5, B5, A6, f5 + FXCPMADD f9, B6, A6, f9 + FXCSMADD f13, B6, A6, f13 + + FXCPMADD f0, A9, A7, f0 + FXCSMADD f4, A9, A7, f4 + FXCPMADD f8, A10, A7, f8 + FXCSMADD f12, A10, A7, f12 + + FXCPMADD f1, A9, A8, f1 + FXCSMADD f5, A9, A8, f5 + FXCPMADD f9, A10, A8, f9 + FXCSMADD f13, A10, A8, f13 + .align 4 + +.L24: + li r0, ALPHA + lfpdx AP, SP, r0 +#ifdef TRMMKERNEL + li r0, FZERO + lfpsx f30, SP, r0 +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + andi. r0, TEMP, 3 + mtspr CTR, r0 +#else + andi. 
r0, K, 3 + mtspr CTR, r0 +#endif + ble+ .L28 + + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + bdz- .L27 + .align 4 + +.L26: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f4, B1, A1, f4 + FXCPMADD f8, B2, A1, f8 + FXCSMADD f12, B2, A1, f12 + LFPDUX A1, AO, INC4 + + FXCPMADD f1, B1, A2, f1 + FXCSMADD f5, B1, A2, f5 + LFPDUX B1, BO, INC4 + FXCPMADD f9, B2, A2, f9 + FXCSMADD f13, B2, A2, f13 + LFPDUX A2, AO2, INC4 + LFPDUX B2, BO2, INC4 + bdnz+ .L26 + .align 4 + +.L27: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f4, B1, A1, f4 + FXCPMADD f8, B2, A1, f8 + FXCSMADD f12, B2, A1, f12 + + FXCPMADD f1, B1, A2, f1 + FXCSMADD f5, B1, A2, f5 + FXCPMADD f9, B2, A2, f9 + FXCSMADD f13, B2, A2, f13 + .align 4 + +.L28: +#ifndef TRMMKERNEL + LFPDUX A1, CO1, INC2 + LFPDUX A2, CO1, INC2 + LFPDUX A3, CO2, INC2 + LFPDUX A4, CO2, INC2 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RN) || defined(RT) || defined(CN) || defined(CT) + fpadd f0, f0, f4 + fpadd f8, f8, f12 + fpadd f1, f1, f5 + fpadd f9, f9, f13 +#else + fpsub f0, f0, f4 + fpsub f8, f8, f12 + fpsub f1, f1, f5 + fpsub f9, f9, f13 +#endif + +#ifndef TRMMKERNEL + fxcpmadd A1, f0, AP, A1 + fxcpmadd A2, f1, AP, A2 + fxcpmadd A3, f8, AP, A3 + fxcpmadd A4, f9, AP, A4 + + fxcxnpma f0, f0, AP, A1 + fxcxnpma f1, f1, AP, A2 + fxcxnpma f8, f8, AP, A3 + fxcxnpma f9, f9, AP, A4 + + STFPDUX f0, CO1, INCM3 + STFPDUX f1, CO1, INC2 + + STFPDUX f8, CO2, INCM3 + STFPDUX f9, CO2, INC2 +#else + fxcpmadd f12, f0, AP, f30 + fxcpmadd f13, f1, AP, f30 + fxcpmadd f14, f8, AP, f30 + fxcpmadd f15, f9, AP, f30 + + fxcxnpma f0, f0, AP, f12 + fxcxnpma f1, f1, AP, f13 + fxcxnpma f8, f8, AP, f14 + fxcxnpma f9, f9, AP, f15 + + STFPDUX f0, CO1, INC2 + STFPDUX f1, CO1, INC2 + + STFPDUX f8, CO2, INC2 + STFPDUX f9, CO2, INC2 +#endif + + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L30: + andi. I, M, 1 + beq .L49 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, B, - 4 * SIZE + fpmr f2, f0 + addi BO2, B, - 2 * SIZE + fpmr f3, f0 +#else + slwi TEMP, KK, 0 + ZBASE_SHIFT + slwi r0, KK, 1 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, BO, - 4 * SIZE + fpmr f2, f0 + addi BO2, BO, 2 * SIZE + fpmr f3, f0 +#endif +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + srawi. r0, TEMP, 2 + mtspr CTR, r0 + ble .L34 +#else + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, B, - 4 * SIZE + fpmr f2, f0 + addi BO2, B, - 2 * SIZE + fpmr f3, f0 + + srawi. 
r0, K, 2 + mtspr CTR, r0 + ble .L34 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B3, BO, INC4 + LFPDUX B4, BO2, INC4 + + LFPDUX A3, AO, INC4 + LFPDUX A5, BO, INC4 + LFPDUX A6, BO2, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A7, BO, INC4 + LFPDUX A8, BO2, INC4 + bdz- .L33 + .align 4 + +.L32: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + LFPDUX B1, BO, INC4 + FXCPMADD f2, B2, A1, f2 + FXCSMADD f3, B2, A1, f3 + LFPDUX B2, BO2, INC4 + LFPDUX A1, AO, INC4 + + FXCPMADD f0, B3, A2, f0 + FXCSMADD f1, B3, A2, f1 + LFPDUX B3, BO, INC4 + FXCPMADD f2, B4, A2, f2 + FXCSMADD f3, B4, A2, f3 + LFPDUX B4, BO2, INC4 + LFPDUX A2, AO2, INC4 + + FXCPMADD f0, A5, A3, f0 + FXCSMADD f1, A5, A3, f1 + LFPDUX A5, BO, INC4 + FXCPMADD f2, A6, A3, f2 + FXCSMADD f3, A6, A3, f3 + LFPDUX A6, BO2, INC4 + LFPDUX A3, AO, INC4 + + FXCPMADD f0, A7, A4, f0 + FXCSMADD f1, A7, A4, f1 + LFPDUX A7, BO, INC4 + FXCPMADD f2, A8, A4, f2 + FXCSMADD f3, A8, A4, f3 + LFPDUX A8, BO2, INC4 + LFPDUX A4, AO2, INC4 + bdnz+ .L32 + .align 4 + +.L33: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + FXCPMADD f2, B2, A1, f2 + FXCSMADD f3, B2, A1, f3 + + FXCPMADD f0, B3, A2, f0 + FXCSMADD f1, B3, A2, f1 + FXCPMADD f2, B4, A2, f2 + FXCSMADD f3, B4, A2, f3 + + FXCPMADD f0, A5, A3, f0 + FXCSMADD f1, A5, A3, f1 + FXCPMADD f2, A6, A3, f2 + FXCSMADD f3, A6, A3, f3 + + FXCPMADD f0, A7, A4, f0 + FXCSMADD f1, A7, A4, f1 + FXCPMADD f2, A8, A4, f2 + FXCSMADD f3, A8, A4, f3 + .align 4 + +.L34: + li r0, ALPHA + lfpdx AP, SP, r0 +#ifdef TRMMKERNEL + li r0, FZERO + lfpsx f30, SP, r0 +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + andi. r0, TEMP, 3 + mtspr CTR, r0 +#else + andi. r0, K, 3 + mtspr CTR, r0 +#endif + ble+ .L38 + + LFPDX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC2 + bdz- .L37 + .align 4 + +.L36: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + LFPDUX B1, BO, INC4 + FXCPMADD f2, B2, A1, f2 + FXCSMADD f3, B2, A1, f3 + LFPDX A1, AO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC2 + bdnz+ .L36 + .align 4 + +.L37: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + FXCPMADD f2, B2, A1, f2 + FXCSMADD f3, B2, A1, f3 + .align 4 + +.L38: +#ifndef TRMMKERNEL + LFPDX A1, CO1, INC2 + LFPDX A2, CO2, INC2 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RN) || defined(RT) || defined(CN) || defined(CT) + fpadd f0, f0, f1 + fpadd f2, f2, f3 +#else + fpsub f0, f0, f1 + fpsub f2, f2, f3 +#endif + +#ifndef TRMMKERNEL + fxcpmadd A1, f0, AP, A1 + fxcpmadd A2, f2, AP, A2 + fxcxnpma f0, f0, AP, A1 + fxcxnpma f2, f2, AP, A2 +#else + fxcpmadd f12, f0, AP, f30 + fxcpmadd f13, f2, AP, f30 + fxcxnpma f0, f0, AP, f12 + fxcxnpma f2, f2, AP, f13 +#endif + + STFPDUX f0, CO1, INC2 + STFPDUX f2, CO2, INC2 + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -1 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 0 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 1 +#endif +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L49: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi KK, KK, 2 +#endif + + addi B, BO, 4 * SIZE + + addic. J, J, -1 + bgt+ .L10 + .align 4 + +.L50: + andi. 
J, N, 1 + beq .L999 + + mr CO1, C + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + addi AO, A, -2 * SIZE + + li r0, FZERO + lfpsx f0, SP, r0 + + srawi. I, M, 2 + ble .L60 + .align 4 + +.L51: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + fpmr f4, f0 + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f5, f0 + fpmr f2, f0 + fpmr f6, f0 +#else + slwi TEMP, KK, 2 + ZBASE_SHIFT + slwi r0, KK, 0 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + fpmr f4, f0 + addi BO, BO, - 2 * SIZE + fpmr f1, f0 + fpmr f5, f0 + fpmr f2, f0 + fpmr f6, f0 +#endif +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 1 +#endif + srawi. r0, TEMP, 2 + fpmr f3, f0 + mtspr CTR, r0 + fpmr f7, f0 + ble .L54 +#else + srawi. r0, K, 2 + fpmr f4, f0 + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f5, f0 + fpmr f2, f0 + fpmr f6, f0 + fpmr f3, f0 + mtspr CTR, r0 + fpmr f7, f0 + ble .L54 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX B3, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + bdz- .L53 + .align 4 + +.L52: + FXCPMADD f0, B1, A1, f0 + LFPDUX B4, BO, INC2 + FXCSMADD f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B1, A2, f1 + nop + FXCSMADD f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + FXCPMADD f2, B1, A3, f2 + nop + FXCSMADD f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + FXCPMADD f3, B1, A4, f3 + nop + FXCSMADD f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + + FXCPMADD f0, B2, A5, f0 + LFPDUX B1, BO, INC2 + FXCSMADD f4, B2, A5, f4 + LFPDUX A5, AO, INC2 + FXCPMADD f1, B2, A6, f1 + nop + FXCSMADD f5, B2, A6, f5 + LFPDUX A6, AO, INC2 + + FXCPMADD f2, B2, A7, f2 + nop + FXCSMADD f6, B2, A7, f6 + LFPDUX A7, AO, INC2 + FXCPMADD f3, B2, A8, f3 + nop + FXCSMADD f7, B2, A8, f7 + LFPDUX A8, AO, INC2 + + FXCPMADD f0, B3, A1, f0 + LFPDUX B2, BO, INC2 + FXCSMADD f4, B3, A1, f4 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B3, A2, f1 + nop + FXCSMADD f5, B3, A2, f5 + LFPDUX A2, AO, INC2 + + FXCPMADD f2, B3, A3, f2 + nop + FXCSMADD f6, B3, A3, f6 + LFPDUX A3, AO, INC2 + FXCPMADD f3, B3, A4, f3 + nop + FXCSMADD f7, B3, A4, f7 + LFPDUX A4, AO, INC2 + + FXCPMADD f0, B4, A5, f0 + LFPDUX B3, BO, INC2 + FXCSMADD f4, B4, A5, f4 + LFPDUX A5, AO, INC2 + FXCPMADD f1, B4, A6, f1 + nop + FXCSMADD f5, B4, A6, f5 + LFPDUX A6, AO, INC2 + + FXCPMADD f2, B4, A7, f2 + nop + FXCSMADD f6, B4, A7, f6 + LFPDUX A7, AO, INC2 + FXCPMADD f3, B4, A8, f3 + nop + FXCSMADD f7, B4, A8, f7 + LFPDUX A8, AO, INC2 + bdnz+ .L52 + .align 4 + +.L53: + FXCPMADD f0, B1, A1, f0 + LFPDUX B4, BO, INC2 + FXCSMADD f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B1, A2, f1 + nop + FXCSMADD f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + FXCPMADD f2, B1, A3, f2 + nop + FXCSMADD f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + FXCPMADD f3, B1, A4, f3 + nop + FXCSMADD f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + + FXCPMADD f0, B2, A5, f0 + nop + FXCSMADD f4, B2, A5, f4 + LFPDUX A5, AO, INC2 + FXCPMADD f1, B2, A6, f1 + nop + FXCSMADD f5, B2, A6, f5 + LFPDUX A6, AO, INC2 + + FXCPMADD f2, B2, A7, f2 + nop + FXCSMADD f6, B2, A7, f6 + LFPDUX A7, AO, INC2 + FXCPMADD f3, B2, A8, f3 + nop + FXCSMADD f7, B2, A8, f7 + LFPDUX A8, AO, INC2 + + FXCPMADD f0, B3, A1, f0 + FXCSMADD f4, B3, A1, f4 + FXCPMADD f1, B3, A2, f1 + FXCSMADD f5, B3, A2, f5 + + FXCPMADD f2, B3, A3, f2 + FXCSMADD f6, B3, A3, f6 + 
FXCPMADD f3, B3, A4, f3 + FXCSMADD f7, B3, A4, f7 + + FXCPMADD f0, B4, A5, f0 + FXCSMADD f4, B4, A5, f4 + FXCPMADD f1, B4, A6, f1 + FXCSMADD f5, B4, A6, f5 + + FXCPMADD f2, B4, A7, f2 + FXCSMADD f6, B4, A7, f6 + FXCPMADD f3, B4, A8, f3 + FXCSMADD f7, B4, A8, f7 + .align 4 + +.L54: + li r0, ALPHA + lfpdx AP, SP, r0 +#ifdef TRMMKERNEL + li r0, FZERO + lfpsx f30, SP, r0 +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 1 +#endif + andi. r0, TEMP, 3 + mtspr CTR, r0 +#else + andi. r0, K, 3 + mtspr CTR, r0 +#endif + ble+ .L58 + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + bdz- .L57 + .align 4 + +.L56: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + FXCPMADD f2, B1, A3, f2 + FXCSMADD f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + FXCPMADD f3, B1, A4, f3 + FXCSMADD f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + LFPDUX B1, BO, INC2 + bdnz+ .L56 + .align 4 + +.L57: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f4, B1, A1, f4 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f5, B1, A2, f5 + + FXCPMADD f2, B1, A3, f2 + FXCSMADD f6, B1, A3, f6 + FXCPMADD f3, B1, A4, f3 + FXCSMADD f7, B1, A4, f7 + .align 4 + +.L58: +#ifndef TRMMKERNEL + LFPDUX A1, CO1, INC2 + LFPDUX A2, CO1, INC2 + LFPDUX A3, CO1, INC2 + LFPDUX A4, CO1, INC2 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RN) || defined(RT) || defined(CN) || defined(CT) + fpadd f0, f0, f4 + fpadd f1, f1, f5 + fpadd f2, f2, f6 + fpadd f3, f3, f7 +#else + fpsub f0, f0, f4 + fpsub f1, f1, f5 + fpsub f2, f2, f6 + fpsub f3, f3, f7 +#endif + +#ifndef TRMMKERNEL + fxcpmadd A1, f0, AP, A1 + fxcpmadd A2, f1, AP, A2 + fxcpmadd A3, f2, AP, A3 + fxcpmadd A4, f3, AP, A4 + + fxcxnpma f0, f0, AP, A1 + fxcxnpma f1, f1, AP, A2 + fxcxnpma f2, f2, AP, A3 + fxcxnpma f3, f3, AP, A4 + + STFPDUX f0, CO1, INCM7 + STFPDUX f1, CO1, INC2 + STFPDUX f2, CO1, INC2 + STFPDUX f3, CO1, INC2 +#else + fxcpmadd f12, f0, AP, f30 + fxcpmadd f13, f1, AP, f30 + fxcpmadd f14, f2, AP, f30 + fxcpmadd f15, f3, AP, f30 + + fxcxnpma f0, f0, AP, f12 + fxcxnpma f1, f1, AP, f13 + fxcxnpma f2, f2, AP, f14 + fxcxnpma f3, f3, AP, f15 + + STFPDUX f0, CO1, INC2 + STFPDUX f1, CO1, INC2 + STFPDUX f2, CO1, INC2 + STFPDUX f3, CO1, INC2 +#endif + + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -4 +#else + addi TEMP, TEMP, -1 +#endif + slwi r0, TEMP, 2 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 4 +#endif +#endif + + addic. I, I, -1 + li r0, FZERO + + lfpsx f0, SP, r0 + bgt+ .L51 + .align 4 + +.L60: + andi. I, M, 2 + beq .L70 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi BO, B, - 2 * SIZE + fpmr f1, f0 +#else + slwi TEMP, KK, 1 + ZBASE_SHIFT + slwi r0, KK, 0 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + addi BO, BO, - 2 * SIZE + fpmr f1, f0 +#endif +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + + srawi. r0, TEMP, 2 + fpmr f2, f0 + mtspr CTR, r0 + fpmr f3, f0 + ble .L64 + +#else + srawi. 
r0, K, 2 + fpmr f1, f0 + addi BO, B, - 2 * SIZE + fpmr f2, f0 + mtspr CTR, r0 + fpmr f3, f0 + ble .L64 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX B3, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX B4, BO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + bdz- .L63 + .align 4 + +.L62: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f2, B1, A1, f2 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f3, B1, A2, f3 + LFPDUX A2, AO, INC2 + LFPDUX B1, BO, INC2 + + FXCPMADD f0, B2, A3, f0 + FXCSMADD f2, B2, A3, f2 + LFPDUX A3, AO, INC2 + FXCPMADD f1, B2, A4, f1 + FXCSMADD f3, B2, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + + FXCPMADD f0, B3, A5, f0 + FXCSMADD f2, B3, A5, f2 + LFPDUX A5, AO, INC2 + FXCPMADD f1, B3, A6, f1 + FXCSMADD f3, B3, A6, f3 + LFPDUX A6, AO, INC2 + LFPDUX B3, BO, INC2 + + FXCPMADD f0, B4, A7, f0 + FXCSMADD f2, B4, A7, f2 + LFPDUX A7, AO, INC2 + FXCPMADD f1, B4, A8, f1 + FXCSMADD f3, B4, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B4, BO, INC2 + bdnz+ .L62 + .align 4 + +.L63: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f2, B1, A1, f2 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f3, B1, A2, f3 + + FXCPMADD f0, B2, A3, f0 + FXCSMADD f2, B2, A3, f2 + FXCPMADD f1, B2, A4, f1 + FXCSMADD f3, B2, A4, f3 + + FXCPMADD f0, B3, A5, f0 + FXCSMADD f2, B3, A5, f2 + FXCPMADD f1, B3, A6, f1 + FXCSMADD f3, B3, A6, f3 + + FXCPMADD f0, B4, A7, f0 + FXCSMADD f2, B4, A7, f2 + FXCPMADD f1, B4, A8, f1 + FXCSMADD f3, B4, A8, f3 + .align 4 + +.L64: + li r0, ALPHA + lfpdx AP, SP, r0 +#ifdef TRMMKERNEL + li r0, FZERO + lfpsx f30, SP, r0 +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + andi. r0, TEMP, 3 + mtspr CTR, r0 +#else + andi. r0, K, 3 + mtspr CTR, r0 +#endif + ble+ .L68 + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + bdz- .L67 + .align 4 + +.L66: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f2, B1, A1, f2 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f3, B1, A2, f3 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + bdnz+ .L66 + .align 4 + +.L67: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f2, B1, A1, f2 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f3, B1, A2, f3 + .align 4 + +.L68: +#ifndef TRMMKERNEL + LFPDUX A1, CO1, INC2 + LFPDUX A2, CO1, INC2 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RN) || defined(RT) || defined(CN) || defined(CT) + fpadd f0, f0, f2 + fpadd f1, f1, f3 +#else + fpsub f0, f0, f2 + fpsub f1, f1, f3 +#endif + +#ifndef TRMMKERNEL + fxcpmadd A1, f0, AP, A1 + fxcpmadd A2, f1, AP, A2 + fxcxnpma f0, f0, AP, A1 + fxcxnpma f1, f1, AP, A2 + + STFPDUX f0, CO1, INCM3 + STFPDUX f1, CO1, INC2 +#else + fxcpmadd f12, f0, AP, f30 + fxcpmadd f13, f1, AP, f30 + fxcxnpma f0, f0, AP, f12 + fxcxnpma f1, f1, AP, f13 + + STFPDUX f0, CO1, INC2 + STFPDUX f1, CO1, INC2 +#endif + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -1 +#endif + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L70: + andi. 
I, M, 1 + beq .L89 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi BO, B, - 2 * SIZE + fpmr f1, f0 +#else + slwi TEMP, KK, 0 + ZBASE_SHIFT + slwi r0, KK, 0 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + addi BO, BO, - 2 * SIZE + fpmr f1, f0 +#endif +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + srawi. r0, TEMP, 3 + fpmr f2, f0 + mtspr CTR, r0 + fpmr f3, f0 + ble .L74 +#else + addi BO, B, - 2 * SIZE + fpmr f1, f0 + srawi. r0, K, 3 + fpmr f2, f0 + mtspr CTR, r0 + fpmr f3, f0 + ble .L74 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + + LFPDUX A5, AO, INC2 + LFPDUX B5, BO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX B6, BO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A9, BO, INC2 + LFPDUX A8, AO, INC2 + LFPDUX A10, BO, INC2 + bdz- .L73 + .align 4 + +.L72: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + FXCPMADD f2, B2, A2, f2 + FXCSMADD f3, B2, A2, f3 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + + FXCPMADD f0, B3, A3, f0 + FXCSMADD f1, B3, A3, f1 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + FXCPMADD f2, B4, A4, f2 + FXCSMADD f3, B4, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + + FXCPMADD f0, B5, A5, f0 + FXCSMADD f1, B5, A5, f1 + LFPDUX A5, AO, INC2 + LFPDUX B5, BO, INC2 + FXCPMADD f2, B6, A6, f2 + FXCSMADD f3, B6, A6, f3 + LFPDUX A6, AO, INC2 + LFPDUX B6, BO, INC2 + + FXCPMADD f0, A9, A7, f0 + FXCSMADD f1, A9, A7, f1 + LFPDUX A7, AO, INC2 + LFPDUX A9, BO, INC2 + FXCPMADD f2, A10, A8, f2 + FXCSMADD f3, A10, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX A10, BO, INC2 + + bdnz+ .L72 + .align 4 + +.L73: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + FXCPMADD f2, B2, A2, f2 + FXCSMADD f3, B2, A2, f3 + + FXCPMADD f0, B3, A3, f0 + FXCSMADD f1, B3, A3, f1 + FXCPMADD f2, B4, A4, f2 + FXCSMADD f3, B4, A4, f3 + + FXCPMADD f0, B5, A5, f0 + FXCSMADD f1, B5, A5, f1 + FXCPMADD f2, B6, A6, f2 + FXCSMADD f3, B6, A6, f3 + + FXCPMADD f0, A9, A7, f0 + FXCSMADD f1, A9, A7, f1 + FXCPMADD f2, A10, A8, f2 + FXCSMADD f3, A10, A8, f3 + .align 4 + +.L74: + li r0, ALPHA + lfpdx AP, SP, r0 +#ifdef TRMMKERNEL + li r0, FZERO + lfpsx f30, SP, r0 +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + andi. r0, TEMP, 7 + mtspr CTR, r0 +#else + andi. 
r0, K, 7 + mtspr CTR, r0 +#endif + ble+ .L78 + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + bdz- .L77 + .align 4 + +.L76: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + bdnz+ .L76 + .align 4 + +.L77: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + .align 4 + +.L78: +#ifndef TRMMKERNEL + LFPDX A1, CO1, INC2 +#endif + + fpadd f0, f0, f2 + fpadd f1, f1, f3 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RN) || defined(RT) || defined(CN) || defined(CT) + fpadd f0, f0, f1 +#else + fpsub f0, f0, f1 +#endif + +#ifndef TRMMKERNEL + fxcpmadd A1, f0, AP, A1 + fxcxnpma f0, f0, AP, A1 +#else + fxcpmadd f12, f0, AP, f30 + fxcxnpma f0, f0, AP, f12 +#endif + + STFPDUX f0, CO1, INC2 + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L89: + addi B, BO, 2 * SIZE + .align 4 + +.L999: + addi SP, SP, 20 + + lwzu r14, 4(SP) + lwzu r15, 4(SP) + + lwzu r16, 4(SP) + lwzu r17, 4(SP) + lwzu r18, 4(SP) + lwzu r19, 4(SP) + + lwzu r20, 4(SP) + lwzu r21, 4(SP) + lwzu r22, 4(SP) + lwzu r23, 4(SP) + + lwzu r24, 4(SP) + lwzu r25, 4(SP) + lwzu r26, 4(SP) + lwzu r27, 4(SP) + + lwzu r28, 4(SP) + lwzu r29, 4(SP) + lwzu r30, 4(SP) + lwzu r31, 4(SP) + + subi SP, SP, 12 + li r0, 16 + + lfpdux f31, SP, r0 + lfpdux f30, SP, r0 + lfpdux f29, SP, r0 + lfpdux f28, SP, r0 + lfpdux f27, SP, r0 + lfpdux f26, SP, r0 + lfpdux f25, SP, r0 + lfpdux f24, SP, r0 + lfpdux f23, SP, r0 + lfpdux f22, SP, r0 + lfpdux f21, SP, r0 + lfpdux f20, SP, r0 + lfpdux f19, SP, r0 + lfpdux f18, SP, r0 + lfpdux f17, SP, r0 + lfpdux f16, SP, r0 + lfpdux f15, SP, r0 + lfpdux f14, SP, r0 + addi SP, SP, 16 + blr + .align 4 + + +.L1000: + li INC, 1 * SIZE + li INC2, 2 * SIZE + li INC4, 4 * SIZE + li INCM1, -1 * SIZE + li INCM3, -3 * SIZE + li INCM5, -5 * SIZE + li INCM7, -7 * SIZE + + addi C, C, - 1 * SIZE + srawi. J, N, 1 + ble .L1050 + .align 4 + +.L1010: + mr CO1, C + add CO2, C, LDC + add C, CO2, LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + addi AO, A, -4 * SIZE + + li r0, FZERO + lfpsx f0, SP, r0 + + srawi. I, M, 2 + ble .L1020 + .align 4 + +.L1011: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 +#else + slwi TEMP, KK, 2 + ZBASE_SHIFT + slwi r0, KK, 1 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, BO, - 4 * SIZE + fpmr f8, f0 + addi BO2, BO, 2 * SIZE + fpmr f12, f0 +#endif +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 2 +#endif + srawi. r0, TEMP, 2 + fpmr f1, f0 + mtspr CTR, r0 + ble .L1014 +#else + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 + + srawi. 
r0, K, 2 + fpmr f1, f0 + mtspr CTR, r0 + ble .L1014 +#endif + + LFPDUX A1, AO, INC4 + fpmr f5, f0 + LFPDUX A3, AO, INC4 + fpmr f9, f0 + LFPDUX B1, BO, INC4 + fpmr f13, f0 + + LFPDUX A5, AO, INC4 + fpmr f2, f0 + LFPDUX A6, AO, INC4 + fpmr f6, f0 + LFPDUX B3, BO, INC4 + fpmr f10, f0 + LFPDUX A7, AO, INC4 + fpmr f14, f0 + + LFPDUX A8, AO, INC4 + fpmr f3, f0 + LFPDUX B5, BO, INC4 + fpmr f7, f0 + LFPDUX A9, AO, INC4 + fpmr f11, f0 + LFPDUX A2, AO2, INC4 + fpmr f15, f0 + LFPDUX B2, BO2, INC4 + bdz- .L1013 + .align 4 + +.L1012: + +## 1 ## + FXCPMADD f0, B1, A1, f0 + nop + FXCSMADD f4, B1, A1, f4 + nop + FXCPMADD f8, B2, A1, f8 + LFPDUX B4, BO2, INC4 + FXCSMADD f12, B2, A1, f12 + LFPDUX B6, BO, INC4 + + FXCPMADD f1, B1, A2, f1 + nop + FXCSMADD f5, B1, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B2, A2, f9 + LFPDUX A10, AO, INC4 + FXCSMADD f13, B2, A2, f13 + nop + + FXCPMADD f2, B1, A3, f2 + nop + FXCSMADD f6, B1, A3, f6 + nop + FXCPMADD f10, B2, A3, f10 + nop + FXCSMADD f14, B2, A3, f14 + nop + + FXCPMADD f3, B1, A4, f3 + nop + FXCSMADD f7, B1, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B2, A4, f11 + LFPDUX A1, AO, INC4 + FXCSMADD f15, B2, A4, f15 + nop + +## 2 ## + + FXCPMADD f0, B3, A5, f0 + nop + FXCSMADD f4, B3, A5, f4 + nop + FXCPMADD f8, B4, A5, f8 + LFPDUX B2, BO2, INC4 + FXCSMADD f12, B4, A5, f12 + LFPDUX B1, BO, INC4 + + FXCPMADD f1, B3, A2, f1 + nop + FXCSMADD f5, B3, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B4, A2, f9 + LFPDUX A3, AO, INC4 + FXCSMADD f13, B4, A2, f13 + nop + + FXCPMADD f2, B3, A6, f2 + nop + FXCSMADD f6, B3, A6, f6 + nop + FXCPMADD f10, B4, A6, f10 + nop + FXCSMADD f14, B4, A6, f14 + nop + + FXCPMADD f3, B3, A4, f3 + nop + FXCSMADD f7, B3, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B4, A4, f11 + LFPDUX A5, AO, INC4 + FXCSMADD f15, B4, A4, f15 + nop + +## 3 ## + + FXCPMADD f0, B5, A7, f0 + nop + FXCSMADD f4, B5, A7, f4 + nop + FXCPMADD f8, B2, A7, f8 + LFPDUX B4, BO2, INC4 + FXCSMADD f12, B2, A7, f12 + LFPDUX B3, BO, INC4 + + FXCPMADD f1, B5, A2, f1 + nop + FXCSMADD f5, B5, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B2, A2, f9 + LFPDUX A6, AO, INC4 + FXCSMADD f13, B2, A2, f13 + nop + + FXCPMADD f2, B5, A8, f2 + nop + FXCSMADD f6, B5, A8, f6 + nop + FXCPMADD f10, B2, A8, f10 + nop + FXCSMADD f14, B2, A8, f14 + nop + + FXCPMADD f3, B5, A4, f3 + nop + FXCSMADD f7, B5, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B2, A4, f11 + LFPDUX A7, AO, INC4 + FXCSMADD f15, B2, A4, f15 + nop + +## 4 ## + FXCPMADD f0, B6, A9, f0 + nop + FXCSMADD f4, B6, A9, f4 + nop + FXCPMADD f8, B4, A9, f8 + LFPDUX B2, BO2, INC4 + FXCSMADD f12, B4, A9, f12 + LFPDUX B5, BO, INC4 + + FXCPMADD f1, B6, A2, f1 + nop + FXCSMADD f5, B6, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B4, A2, f9 + LFPDUX A8, AO, INC4 + FXCSMADD f13, B4, A2, f13 + nop + + FXCPMADD f2, B6, A10, f2 + nop + FXCSMADD f6, B6, A10, f6 + nop + FXCPMADD f10, B4, A10, f10 + nop + FXCSMADD f14, B4, A10, f14 + nop + + FXCPMADD f3, B6, A4, f3 + LFPDUX A2, AO2, INC4 + FXCSMADD f7, B6, A4, f7 + LFPDUX A9, AO, INC4 + FXCPMADD f11, B4, A4, f11 + nop + FXCSMADD f15, B4, A4, f15 + bdnz+ .L1012 + .align 4 + +.L1013: +## 1 ## + + FXCPMADD f0, B1, A1, f0 + nop + FXCSMADD f4, B1, A1, f4 + nop + FXCPMADD f8, B2, A1, f8 + LFPDUX B4, BO2, INC4 + FXCSMADD f12, B2, A1, f12 + LFPDUX B6, BO, INC4 + + FXCPMADD f1, B1, A2, f1 + nop + FXCSMADD f5, B1, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B2, A2, f9 + LFPDUX A10, AO, INC4 + FXCSMADD f13, B2, A2, f13 + nop + + FXCPMADD f2, B1, A3, f2 + nop + FXCSMADD f6, B1, A3, f6 + nop + FXCPMADD f10, B2, 
A3, f10 + nop + FXCSMADD f14, B2, A3, f14 + nop + + FXCPMADD f3, B1, A4, f3 + nop + FXCSMADD f7, B1, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B2, A4, f11 +#ifndef TRMMKERNEL + LFDUX A1, CO1, INC +#else + nop +#endif + FXCSMADD f15, B2, A4, f15 + nop + +## 2 ## + + FXCPMADD f0, B3, A5, f0 + nop + FXCSMADD f4, B3, A5, f4 + nop + FXCPMADD f8, B4, A5, f8 + LFPDUX B2, BO2, INC4 + FXCSMADD f12, B4, A5, f12 +#ifndef TRMMKERNEL + LFDUX B1, CO1, INC2 +#else + nop +#endif + + FXCPMADD f1, B3, A2, f1 + nop + FXCSMADD f5, B3, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B4, A2, f9 +#ifndef TRMMKERNEL + LFDUX A3, CO1, INC2 +#else + nop +#endif + FXCSMADD f13, B4, A2, f13 + nop + + FXCPMADD f2, B3, A6, f2 + nop + FXCSMADD f6, B3, A6, f6 + nop + FXCPMADD f10, B4, A6, f10 + nop + FXCSMADD f14, B4, A6, f14 + nop + + FXCPMADD f3, B3, A4, f3 + nop + FXCSMADD f7, B3, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B4, A4, f11 +#ifndef TRMMKERNEL + LFDUX A5, CO1, INC2 +#else + nop +#endif + FXCSMADD f15, B4, A4, f15 + nop + +## 3 ## + + FXCPMADD f0, B5, A7, f0 + nop + FXCSMADD f4, B5, A7, f4 + nop + FXCPMADD f8, B2, A7, f8 + LFPDUX B4, BO2, INC4 + FXCSMADD f12, B2, A7, f12 +#ifndef TRMMKERNEL + LFSDUX A1, CO1, INCM5 +#else + nop +#endif + + FXCPMADD f1, B5, A2, f1 + nop + FXCSMADD f5, B5, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B2, A2, f9 +#ifndef TRMMKERNEL + LFSDUX B1, CO1, INC2 +#else + nop +#endif + FXCSMADD f13, B2, A2, f13 + nop + + FXCPMADD f2, B5, A8, f2 + nop + FXCSMADD f6, B5, A8, f6 + nop + FXCPMADD f10, B2, A8, f10 + nop + FXCSMADD f14, B2, A8, f14 + nop + + FXCPMADD f3, B5, A4, f3 + nop + FXCSMADD f7, B5, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B2, A4, f11 +#ifndef TRMMKERNEL + LFSDUX A3, CO1, INC2 +#else + nop +#endif + FXCSMADD f15, B2, A4, f15 + nop + +## 4 ## + + FXCPMADD f0, B6, A9, f0 + nop + FXCSMADD f4, B6, A9, f4 + nop + FXCPMADD f8, B4, A9, f8 +#ifndef TRMMKERNEL + LFSDUX A5, CO1, INC2 +#else + nop +#endif + FXCSMADD f12, B4, A9, f12 +#ifndef TRMMKERNEL + LFDUX B3, CO2, INC +#else + nop +#endif + + FXCPMADD f1, B6, A2, f1 + nop + FXCSMADD f5, B6, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B4, A2, f9 +#ifndef TRMMKERNEL + LFDUX A6, CO2, INC2 +#else + nop +#endif + FXCSMADD f13, B4, A2, f13 + nop + + FXCPMADD f2, B6, A10, f2 + nop + FXCSMADD f6, B6, A10, f6 + nop + FXCPMADD f10, B4, A10, f10 + nop + FXCSMADD f14, B4, A10, f14 +#ifndef TRMMKERNEL + LFDUX A7, CO2, INC2 +#else + nop +#endif + + FXCPMADD f3, B6, A4, f3 + nop + FXCSMADD f7, B6, A4, f7 + nop + FXCPMADD f11, B4, A4, f11 + nop + FXCSMADD f15, B4, A4, f15 +#ifndef TRMMKERNEL + LFDUX B2, CO2, INC2 +#else + nop +#endif + .align 4 + +.L1014: + li r0, ALPHA + lfpdx AP, SP, r0 +#ifdef TRMMKERNEL + li r0, FZERO + lfpsx f30, SP, r0 +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 2 +#endif + andi. r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L1018 + + cmpwi cr0, TEMP, 3 + bgt+ .L1015 +#else + andi. 
r0, K, 3 + mtspr CTR, r0 + ble+ .L1018 + + cmpwi cr0, K, 3 + bgt+ .L1015 +#endif + +#ifndef TRMMKERNEL + LFDUX A1, CO1, INC + fpmr f5, f0 + LFDUX B1, CO1, INC2 + fpmr f9, f0 + LFDUX A3, CO1, INC2 + fpmr f13, f0 + LFDUX A5, CO1, INC2 + fpmr f2, f0 + + LFSDUX A1, CO1, INCM5 + fpmr f6, f0 + LFSDUX B1, CO1, INC2 + fpmr f10, f0 + LFSDUX A3, CO1, INC2 + fpmr f14, f0 + LFSDUX A5, CO1, INC2 + fpmr f3, f0 + + LFDUX B3, CO2, INC + fpmr f7, f0 + LFDUX A6, CO2, INC2 + fpmr f11, f0 + LFDUX A7, CO2, INC2 + fpmr f15, f0 + LFDUX B2, CO2, INC2 +#else + fpmr f5, f0 + fpmr f9, f0 + fpmr f13, f0 + fpmr f2, f0 + + fpmr f6, f0 + fpmr f10, f0 + fpmr f14, f0 + fpmr f3, f0 + + fpmr f7, f0 + fpmr f11, f0 + fpmr f15, f0 +#endif + .align 4 + +.L1015: + LFPDUX A2, AO, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A10, BO, INC4 + LFPDUX B4, BO2, INC4 + bdz- .L1017 + .align 4 + +.L1016: + FXCPMADD f0, A10, A2, f0 + FXCSMADD f4, A10, A2, f4 + FXCPMADD f8, B4, A2, f8 + FXCSMADD f12, B4, A2, f12 + LFPDUX A2, AO, INC4 + + FXCPMADD f1, A10, A4, f1 + FXCSMADD f5, A10, A4, f5 + FXCPMADD f9, B4, A4, f9 + FXCSMADD f13, B4, A4, f13 + LFPDUX A4, AO2, INC4 + + FXCPMADD f2, A10, A2, f2 + FXCSMADD f6, A10, A2, f6 + FXCPMADD f10, B4, A2, f10 + FXCSMADD f14, B4, A2, f14 + LFPDUX A2, AO, INC4 + + FXCPMADD f3, A10, A4, f3 + FXCSMADD f7, A10, A4, f7 + LFPDUX A10, BO, INC4 + FXCPMADD f11, B4, A4, f11 + FXCSMADD f15, B4, A4, f15 + LFPDUX A4, AO2, INC4 + LFPDUX B4, BO2, INC4 + bdnz+ .L1016 + .align 4 + +.L1017: + FXCPMADD f0, A10, A2, f0 + FXCSMADD f4, A10, A2, f4 + FXCPMADD f8, B4, A2, f8 + FXCSMADD f12, B4, A2, f12 + LFPDUX A2, AO, INC4 + + FXCPMADD f1, A10, A4, f1 + FXCSMADD f5, A10, A4, f5 + FXCPMADD f9, B4, A4, f9 + FXCSMADD f13, B4, A4, f13 + LFPDUX A4, AO2, INC4 + + FXCPMADD f2, A10, A2, f2 + FXCSMADD f6, A10, A2, f6 + FXCPMADD f10, B4, A2, f10 + FXCSMADD f14, B4, A2, f14 + + FXCPMADD f3, A10, A4, f3 + FXCSMADD f7, A10, A4, f7 + FXCPMADD f11, B4, A4, f11 + FXCSMADD f15, B4, A4, f15 + .align 4 + +.L1018: +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RN) || defined(RT) || defined(CN) || defined(CT) + fpadd f0, f0, f4 + fpadd f8, f8, f12 + fpadd f1, f1, f5 + fpadd f9, f9, f13 + + fpadd f2, f2, f6 + fpadd f10, f10, f14 + fpadd f3, f3, f7 + fpadd f11, f11, f15 +#else + fpsub f0, f0, f4 + fpsub f8, f8, f12 + fpsub f1, f1, f5 + fpsub f9, f9, f13 + + fpsub f2, f2, f6 + fpsub f10, f10, f14 + fpsub f3, f3, f7 + fpsub f11, f11, f15 +#endif + +#ifndef TRMMKERNEL + fxcpmadd A1, f0, AP, A1 + LFSDUX B3, CO2, INCM5 + fxcpmadd B1, f1, AP, B1 + LFSDUX A6, CO2, INC2 + fxcpmadd A3, f2, AP, A3 + LFSDUX A7, CO2, INC2 + fxcpmadd A5, f3, AP, A5 + LFSDUX B2, CO2, INC2 + + fxcxnpma f0, f0, AP, A1 + fxcpmadd B3, f8, AP, B3 + fxcxnpma f1, f1, AP, B1 + fxcpmadd A6, f9, AP, A6 + fxcxnpma f2, f2, AP, A3 + fxcpmadd A7, f10, AP, A7 + + fxcxnpma f3, f3, AP, A5 + STFDUX f0, CO1, INCM7 + fxcpmadd B2, f11, AP, B2 + STFSDUX f0, CO1, INC + fxcxnpma f8, f8, AP, B3 + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + fxcxnpma f9, f9, AP, A6 + STFDUX f2, CO1, INC + STFSDUX f2, CO1, INC + fxcxnpma f10, f10, AP, A7 + STFDUX f3, CO1, INC + STFSDUX f3, CO1, INC + + fxcxnpma f11, f11, AP, B2 + STFDUX f8, CO2, INCM7 +#else + fxcpmadd f12, f0, AP, f30 + fxcpmadd f13, f1, AP, f30 + fxcpmadd f14, f2, AP, f30 + fxcpmadd f15, f3, AP, f30 + + fxcxnpma f0, f0, AP, f12 + fxcxnpma f1, f1, AP, f13 + fxcxnpma f2, f2, AP, f14 + fxcxnpma f3, f3, AP, f15 + + fxcpmadd f16, f8, AP, f30 + fxcpmadd f17, f9, AP, f30 + fxcpmadd f18, f10, AP, f30 + fxcpmadd f19, f11, AP, f30 + + 
fxcxnpma f8, f8, AP, f16 + fxcxnpma f9, f9, AP, f17 + fxcxnpma f10, f10, AP, f18 + fxcxnpma f11, f11, AP, f19 + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + STFDUX f2, CO1, INC + STFSDUX f2, CO1, INC + STFDUX f3, CO1, INC + STFSDUX f3, CO1, INC + STFDUX f8, CO2, INC +#endif + STFSDUX f8, CO2, INC + STFDUX f9, CO2, INC + STFSDUX f9, CO2, INC + STFDUX f10, CO2, INC + STFSDUX f10, CO2, INC + + STFDUX f11, CO2, INC + STFSDUX f11, CO2, INC + + + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -4 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 2 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 4 +#endif +#endif + + addic. I, I, -1 + li r0, FZERO + + lfpsx f0, SP, r0 + bgt+ .L1011 + .align 4 + +.L1020: + andi. I, M, 2 + beq .L1030 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 +#else + slwi TEMP, KK, 1 + ZBASE_SHIFT + slwi r0, KK, 1 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, BO, - 4 * SIZE + fpmr f8, f0 + addi BO2, BO, 2 * SIZE + fpmr f12, f0 + +#endif +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + srawi. r0, TEMP, 2 + fpmr f1, f0 + fpmr f5, f0 + fpmr f9, f0 + mtspr CTR, r0 + fpmr f13, f0 + ble .L1024 +#else + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 + + srawi. 
r0, K, 2 + fpmr f1, f0 + fpmr f5, f0 + fpmr f9, f0 + mtspr CTR, r0 + fpmr f13, f0 + ble .L1024 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B2, BO2, INC4 + LFPDUX A3, AO, INC4 + LFPDUX B3, BO, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX B4, BO2, INC4 + + LFPDUX A5, AO, INC4 + LFPDUX B5, BO, INC4 + LFPDUX A6, AO2, INC4 + LFPDUX B6, BO2, INC4 + LFPDUX A7, AO, INC4 + LFPDUX A9, BO, INC4 + LFPDUX A10, BO2, INC4 + bdz- .L1023 + .align 4 + +.L1022: + FXCPMADD f0, B1, A1, f0 + nop + FXCSMADD f4, B1, A1, f4 + LFPDUX A8, AO2, INC4 + FXCPMADD f8, B2, A1, f8 + nop + FXCSMADD f12, B2, A1, f12 + LFPDUX A1, AO, INC4 + + FXCPMADD f1, B1, A2, f1 + nop + FXCSMADD f5, B1, A2, f5 + LFPDUX B1, BO, INC4 + FXCPMADD f9, B2, A2, f9 + nop + FXCSMADD f13, B2, A2, f13 + LFPDUX B2, BO2, INC4 + + FXCPMADD f0, B3, A3, f0 + nop + FXCSMADD f4, B3, A3, f4 + LFPDUX A2, AO2, INC4 + FXCPMADD f8, B4, A3, f8 + nop + FXCSMADD f12, B4, A3, f12 + LFPDUX A3, AO, INC4 + + FXCPMADD f1, B3, A4, f1 + nop + FXCSMADD f5, B3, A4, f5 + LFPDUX B3, BO, INC4 + FXCPMADD f9, B4, A4, f9 + nop + FXCSMADD f13, B4, A4, f13 + LFPDUX B4, BO2, INC4 + + FXCPMADD f0, B5, A5, f0 + nop + FXCSMADD f4, B5, A5, f4 + LFPDUX A4, AO2, INC4 + FXCPMADD f8, B6, A5, f8 + nop + FXCSMADD f12, B6, A5, f12 + LFPDUX A5, AO, INC4 + + FXCPMADD f1, B5, A6, f1 + nop + FXCSMADD f5, B5, A6, f5 + LFPDUX B5, BO, INC4 + FXCPMADD f9, B6, A6, f9 + nop + FXCSMADD f13, B6, A6, f13 + LFPDUX B6, BO2, INC4 + + FXCPMADD f0, A9, A7, f0 + nop + FXCSMADD f4, A9, A7, f4 + LFPDUX A6, AO2, INC4 + FXCPMADD f8, A10, A7, f8 + nop + FXCSMADD f12, A10, A7, f12 + LFPDUX A7, AO, INC4 + + FXCPMADD f1, A9, A8, f1 + nop + FXCSMADD f5, A9, A8, f5 + LFPDUX A9, BO, INC4 + FXCPMADD f9, A10, A8, f9 + nop + FXCSMADD f13, A10, A8, f13 + LFPDUX A10, BO2, INC4 + bdnz+ .L1022 + .align 4 + +.L1023: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f4, B1, A1, f4 + LFPDUX A8, AO2, INC4 + FXCPMADD f8, B2, A1, f8 + FXCSMADD f12, B2, A1, f12 + + FXCPMADD f1, B1, A2, f1 + FXCSMADD f5, B1, A2, f5 + FXCPMADD f9, B2, A2, f9 + FXCSMADD f13, B2, A2, f13 + + FXCPMADD f0, B3, A3, f0 + FXCSMADD f4, B3, A3, f4 + FXCPMADD f8, B4, A3, f8 + FXCSMADD f12, B4, A3, f12 + + FXCPMADD f1, B3, A4, f1 + FXCSMADD f5, B3, A4, f5 + FXCPMADD f9, B4, A4, f9 + FXCSMADD f13, B4, A4, f13 + + FXCPMADD f0, B5, A5, f0 + FXCSMADD f4, B5, A5, f4 + FXCPMADD f8, B6, A5, f8 + FXCSMADD f12, B6, A5, f12 + + FXCPMADD f1, B5, A6, f1 + FXCSMADD f5, B5, A6, f5 + FXCPMADD f9, B6, A6, f9 + FXCSMADD f13, B6, A6, f13 + + FXCPMADD f0, A9, A7, f0 + FXCSMADD f4, A9, A7, f4 + FXCPMADD f8, A10, A7, f8 + FXCSMADD f12, A10, A7, f12 + + FXCPMADD f1, A9, A8, f1 + FXCSMADD f5, A9, A8, f5 + FXCPMADD f9, A10, A8, f9 + FXCSMADD f13, A10, A8, f13 + .align 4 + +.L1024: + li r0, ALPHA + lfpdx AP, SP, r0 +#ifdef TRMMKERNEL + li r0, FZERO + lfpsx f30, SP, r0 +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + andi. r0, TEMP, 3 + mtspr CTR, r0 +#else + andi. 
r0, K, 3 + mtspr CTR, r0 +#endif + ble+ .L1028 + + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + bdz- .L1027 + .align 4 + +.L1026: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f4, B1, A1, f4 + FXCPMADD f8, B2, A1, f8 + FXCSMADD f12, B2, A1, f12 + LFPDUX A1, AO, INC4 + + FXCPMADD f1, B1, A2, f1 + FXCSMADD f5, B1, A2, f5 + LFPDUX B1, BO, INC4 + FXCPMADD f9, B2, A2, f9 + FXCSMADD f13, B2, A2, f13 + LFPDUX A2, AO2, INC4 + LFPDUX B2, BO2, INC4 + bdnz+ .L1026 + .align 4 + +.L1027: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f4, B1, A1, f4 + FXCPMADD f8, B2, A1, f8 + FXCSMADD f12, B2, A1, f12 + + FXCPMADD f1, B1, A2, f1 + FXCSMADD f5, B1, A2, f5 + FXCPMADD f9, B2, A2, f9 + FXCSMADD f13, B2, A2, f13 + .align 4 + +.L1028: +#ifndef TRMMKERNEL + LFDUX A1, CO1, INC + LFDUX A2, CO1, INC2 + LFDUX A3, CO2, INC + LFDUX A4, CO2, INC2 + + LFSDUX A1, CO1, INCM1 + LFSDUX A2, CO1, INC2 + LFSDUX A3, CO2, INCM1 + LFSDUX A4, CO2, INC2 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RN) || defined(RT) || defined(CN) || defined(CT) + fpadd f0, f0, f4 + fpadd f8, f8, f12 + fpadd f1, f1, f5 + fpadd f9, f9, f13 +#else + fpsub f0, f0, f4 + fpsub f8, f8, f12 + fpsub f1, f1, f5 + fpsub f9, f9, f13 +#endif + +#ifndef TRMMKERNEL + fxcpmadd A1, f0, AP, A1 + fxcpmadd A2, f1, AP, A2 + fxcpmadd A3, f8, AP, A3 + fxcpmadd A4, f9, AP, A4 + + fxcxnpma f0, f0, AP, A1 + fxcxnpma f1, f1, AP, A2 + fxcxnpma f8, f8, AP, A3 + fxcxnpma f9, f9, AP, A4 + + STFDUX f0, CO1, INCM3 + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + + STFDUX f8, CO2, INCM3 + STFSDUX f8, CO2, INC + STFDUX f9, CO2, INC + STFSDUX f9, CO2, INC +#else + fxcpmadd f12, f0, AP, f30 + fxcpmadd f13, f1, AP, f30 + fxcpmadd f14, f8, AP, f30 + fxcpmadd f15, f9, AP, f30 + + fxcxnpma f0, f0, AP, f12 + fxcxnpma f1, f1, AP, f13 + fxcxnpma f8, f8, AP, f14 + fxcxnpma f9, f9, AP, f15 + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + + STFDUX f8, CO2, INC + STFSDUX f8, CO2, INC + STFDUX f9, CO2, INC + STFSDUX f9, CO2, INC +#endif + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L1030: + andi. I, M, 1 + beq .L1049 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, B, - 4 * SIZE + fpmr f2, f0 + addi BO2, B, - 2 * SIZE + fpmr f3, f0 +#else + slwi TEMP, KK, 0 + ZBASE_SHIFT + slwi r0, KK, 1 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, BO, - 4 * SIZE + fpmr f2, f0 + addi BO2, BO, 2 * SIZE + fpmr f3, f0 +#endif +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + srawi. r0, TEMP, 2 + mtspr CTR, r0 + ble .L1034 +#else + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, B, - 4 * SIZE + fpmr f2, f0 + addi BO2, B, - 2 * SIZE + fpmr f3, f0 + + srawi. 
r0, K, 2 + mtspr CTR, r0 + ble .L1034 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B3, BO, INC4 + LFPDUX B4, BO2, INC4 + + LFPDUX A3, AO, INC4 + LFPDUX A5, BO, INC4 + LFPDUX A6, BO2, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A7, BO, INC4 + LFPDUX A8, BO2, INC4 + bdz- .L1033 + .align 4 + +.L1032: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + LFPDUX B1, BO, INC4 + FXCPMADD f2, B2, A1, f2 + FXCSMADD f3, B2, A1, f3 + LFPDUX B2, BO2, INC4 + LFPDUX A1, AO, INC4 + + FXCPMADD f0, B3, A2, f0 + FXCSMADD f1, B3, A2, f1 + LFPDUX B3, BO, INC4 + FXCPMADD f2, B4, A2, f2 + FXCSMADD f3, B4, A2, f3 + LFPDUX B4, BO2, INC4 + LFPDUX A2, AO2, INC4 + + FXCPMADD f0, A5, A3, f0 + FXCSMADD f1, A5, A3, f1 + LFPDUX A5, BO, INC4 + FXCPMADD f2, A6, A3, f2 + FXCSMADD f3, A6, A3, f3 + LFPDUX A6, BO2, INC4 + LFPDUX A3, AO, INC4 + + FXCPMADD f0, A7, A4, f0 + FXCSMADD f1, A7, A4, f1 + LFPDUX A7, BO, INC4 + FXCPMADD f2, A8, A4, f2 + FXCSMADD f3, A8, A4, f3 + LFPDUX A8, BO2, INC4 + LFPDUX A4, AO2, INC4 + bdnz+ .L1032 + .align 4 + +.L1033: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + FXCPMADD f2, B2, A1, f2 + FXCSMADD f3, B2, A1, f3 + + FXCPMADD f0, B3, A2, f0 + FXCSMADD f1, B3, A2, f1 + FXCPMADD f2, B4, A2, f2 + FXCSMADD f3, B4, A2, f3 + + FXCPMADD f0, A5, A3, f0 + FXCSMADD f1, A5, A3, f1 + FXCPMADD f2, A6, A3, f2 + FXCSMADD f3, A6, A3, f3 + + FXCPMADD f0, A7, A4, f0 + FXCSMADD f1, A7, A4, f1 + FXCPMADD f2, A8, A4, f2 + FXCSMADD f3, A8, A4, f3 + .align 4 + +.L1034: + li r0, ALPHA + lfpdx AP, SP, r0 +#ifdef TRMMKERNEL + li r0, FZERO + lfpsx f30, SP, r0 +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + andi. r0, TEMP, 3 + mtspr CTR, r0 +#else + andi. 
r0, K, 3 + mtspr CTR, r0 +#endif + ble+ .L1038 + + LFPDX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC2 + bdz- .L1037 + .align 4 + +.L1036: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + LFPDUX B1, BO, INC4 + FXCPMADD f2, B2, A1, f2 + FXCSMADD f3, B2, A1, f3 + LFPDX A1, AO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC2 + bdnz+ .L1036 + .align 4 + +.L1037: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + FXCPMADD f2, B2, A1, f2 + FXCSMADD f3, B2, A1, f3 + .align 4 + +.L1038: +#ifndef TRMMKERNEL + LFDUX A1, CO1, INC + LFDUX A2, CO2, INC + LFSDUX A1, CO1, INC + LFSDUX A2, CO2, INC +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RN) || defined(RT) || defined(CN) || defined(CT) + fpadd f0, f0, f1 + fpadd f2, f2, f3 +#else + fpsub f0, f0, f1 + fpsub f2, f2, f3 +#endif + +#ifndef TRMMKERNEL + fxcpmadd A1, f0, AP, A1 + fxcpmadd A2, f2, AP, A2 + fxcxnpma f0, f0, AP, A1 + fxcxnpma f2, f2, AP, A2 + + STFDUX f0, CO1, INCM1 + STFSDUX f0, CO1, INC + + STFDUX f2, CO2, INCM1 + STFSDUX f2, CO2, INC +#else + fxcpmadd f12, f0, AP, f30 + fxcpmadd f13, f2, AP, f30 + fxcxnpma f0, f0, AP, f12 + fxcxnpma f2, f2, AP, f13 + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + + STFDUX f2, CO2, INC + STFSDUX f2, CO2, INC +#endif + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -1 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 0 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 1 +#endif +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L1049: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi KK, KK, 2 +#endif + + addi B, BO, 4 * SIZE + + addic. J, J, -1 + bgt+ .L1010 + .align 4 + +.L1050: + andi. J, N, 1 + beq .L10999 + + mr CO1, C + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + addi AO, A, -2 * SIZE + + li r0, FZERO + lfpsx f0, SP, r0 + + srawi. I, M, 2 + ble .L1060 + .align 4 + +.L1051: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + fpmr f4, f0 + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f5, f0 + fpmr f2, f0 + fpmr f6, f0 +#else + slwi TEMP, KK, 2 + ZBASE_SHIFT + slwi r0, KK, 0 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + fpmr f4, f0 + addi BO, BO, - 2 * SIZE + fpmr f1, f0 + fpmr f5, f0 + fpmr f2, f0 + fpmr f6, f0 +#endif +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 1 +#endif + srawi. r0, TEMP, 2 + fpmr f3, f0 + mtspr CTR, r0 + fpmr f7, f0 + ble .L1054 +#else + srawi. 
r0, K, 2 + fpmr f4, f0 + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f5, f0 + fpmr f2, f0 + fpmr f6, f0 + fpmr f3, f0 + mtspr CTR, r0 + fpmr f7, f0 + ble .L1054 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX B3, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + bdz- .L1053 + .align 4 + +.L1052: + FXCPMADD f0, B1, A1, f0 + LFPDUX B4, BO, INC2 + FXCSMADD f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B1, A2, f1 + nop + FXCSMADD f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + FXCPMADD f2, B1, A3, f2 + nop + FXCSMADD f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + FXCPMADD f3, B1, A4, f3 + nop + FXCSMADD f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + + FXCPMADD f0, B2, A5, f0 + LFPDUX B1, BO, INC2 + FXCSMADD f4, B2, A5, f4 + LFPDUX A5, AO, INC2 + FXCPMADD f1, B2, A6, f1 + nop + FXCSMADD f5, B2, A6, f5 + LFPDUX A6, AO, INC2 + + FXCPMADD f2, B2, A7, f2 + nop + FXCSMADD f6, B2, A7, f6 + LFPDUX A7, AO, INC2 + FXCPMADD f3, B2, A8, f3 + nop + FXCSMADD f7, B2, A8, f7 + LFPDUX A8, AO, INC2 + + FXCPMADD f0, B3, A1, f0 + LFPDUX B2, BO, INC2 + FXCSMADD f4, B3, A1, f4 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B3, A2, f1 + nop + FXCSMADD f5, B3, A2, f5 + LFPDUX A2, AO, INC2 + + FXCPMADD f2, B3, A3, f2 + nop + FXCSMADD f6, B3, A3, f6 + LFPDUX A3, AO, INC2 + FXCPMADD f3, B3, A4, f3 + nop + FXCSMADD f7, B3, A4, f7 + LFPDUX A4, AO, INC2 + + FXCPMADD f0, B4, A5, f0 + LFPDUX B3, BO, INC2 + FXCSMADD f4, B4, A5, f4 + LFPDUX A5, AO, INC2 + FXCPMADD f1, B4, A6, f1 + nop + FXCSMADD f5, B4, A6, f5 + LFPDUX A6, AO, INC2 + + FXCPMADD f2, B4, A7, f2 + nop + FXCSMADD f6, B4, A7, f6 + LFPDUX A7, AO, INC2 + FXCPMADD f3, B4, A8, f3 + nop + FXCSMADD f7, B4, A8, f7 + LFPDUX A8, AO, INC2 + bdnz+ .L1052 + .align 4 + +.L1053: + FXCPMADD f0, B1, A1, f0 + LFPDUX B4, BO, INC2 + FXCSMADD f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B1, A2, f1 + nop + FXCSMADD f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + FXCPMADD f2, B1, A3, f2 + nop + FXCSMADD f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + FXCPMADD f3, B1, A4, f3 + nop + FXCSMADD f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + + FXCPMADD f0, B2, A5, f0 + nop + FXCSMADD f4, B2, A5, f4 + LFPDUX A5, AO, INC2 + FXCPMADD f1, B2, A6, f1 + nop + FXCSMADD f5, B2, A6, f5 + LFPDUX A6, AO, INC2 + + FXCPMADD f2, B2, A7, f2 + nop + FXCSMADD f6, B2, A7, f6 + LFPDUX A7, AO, INC2 + FXCPMADD f3, B2, A8, f3 + nop + FXCSMADD f7, B2, A8, f7 + LFPDUX A8, AO, INC2 + + FXCPMADD f0, B3, A1, f0 + FXCSMADD f4, B3, A1, f4 + FXCPMADD f1, B3, A2, f1 + FXCSMADD f5, B3, A2, f5 + + FXCPMADD f2, B3, A3, f2 + FXCSMADD f6, B3, A3, f6 + FXCPMADD f3, B3, A4, f3 + FXCSMADD f7, B3, A4, f7 + + FXCPMADD f0, B4, A5, f0 + FXCSMADD f4, B4, A5, f4 + FXCPMADD f1, B4, A6, f1 + FXCSMADD f5, B4, A6, f5 + + FXCPMADD f2, B4, A7, f2 + FXCSMADD f6, B4, A7, f6 + FXCPMADD f3, B4, A8, f3 + FXCSMADD f7, B4, A8, f7 + .align 4 + +.L1054: + li r0, ALPHA + lfpdx AP, SP, r0 +#ifdef TRMMKERNEL + li r0, FZERO + lfpsx f30, SP, r0 +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 4 +#else + addi TEMP, KK, 1 +#endif + andi. r0, TEMP, 3 + mtspr CTR, r0 +#else + andi. 
r0, K, 3 + mtspr CTR, r0 +#endif + ble+ .L1058 + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + bdz- .L1057 + .align 4 + +.L1056: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + FXCPMADD f2, B1, A3, f2 + FXCSMADD f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + FXCPMADD f3, B1, A4, f3 + FXCSMADD f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + LFPDUX B1, BO, INC2 + bdnz+ .L1056 + .align 4 + +.L1057: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f4, B1, A1, f4 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f5, B1, A2, f5 + + FXCPMADD f2, B1, A3, f2 + FXCSMADD f6, B1, A3, f6 + FXCPMADD f3, B1, A4, f3 + FXCSMADD f7, B1, A4, f7 + .align 4 + +.L1058: +#ifndef TRMMKERNEL + LFDUX A1, CO1, INC + LFDUX A2, CO1, INC2 + LFDUX A3, CO1, INC2 + LFDUX A4, CO1, INC2 + + LFSDUX A1, CO1, INCM5 + LFSDUX A2, CO1, INC2 + LFSDUX A3, CO1, INC2 + LFSDUX A4, CO1, INC2 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RN) || defined(RT) || defined(CN) || defined(CT) + fpadd f0, f0, f4 + fpadd f1, f1, f5 + fpadd f2, f2, f6 + fpadd f3, f3, f7 +#else + fpsub f0, f0, f4 + fpsub f1, f1, f5 + fpsub f2, f2, f6 + fpsub f3, f3, f7 +#endif + +#ifndef TRMMKERNEL + fxcpmadd A1, f0, AP, A1 + fxcpmadd A2, f1, AP, A2 + fxcpmadd A3, f2, AP, A3 + fxcpmadd A4, f3, AP, A4 + + fxcxnpma f0, f0, AP, A1 + fxcxnpma f1, f1, AP, A2 + fxcxnpma f2, f2, AP, A3 + fxcxnpma f3, f3, AP, A4 + + STFDUX f0, CO1, INCM7 + STFSDUX f0, CO1, INC + + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + + STFDUX f2, CO1, INC + STFSDUX f2, CO1, INC + + STFDUX f3, CO1, INC + STFSDUX f3, CO1, INC +#else + fxcpmadd f12, f0, AP, f30 + fxcpmadd f13, f1, AP, f30 + fxcpmadd f14, f2, AP, f30 + fxcpmadd f15, f3, AP, f30 + + fxcxnpma f0, f0, AP, f12 + fxcxnpma f1, f1, AP, f13 + fxcxnpma f2, f2, AP, f14 + fxcxnpma f3, f3, AP, f15 + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + + STFDUX f2, CO1, INC + STFSDUX f2, CO1, INC + + STFDUX f3, CO1, INC + STFSDUX f3, CO1, INC +#endif + + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -4 +#else + addi TEMP, TEMP, -1 +#endif + slwi r0, TEMP, 2 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 4 +#endif +#endif + + addic. I, I, -1 + li r0, FZERO + + lfpsx f0, SP, r0 + bgt+ .L1051 + .align 4 + +.L1060: + andi. I, M, 2 + beq .L1070 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi BO, B, - 2 * SIZE + fpmr f1, f0 +#else + slwi TEMP, KK, 1 + ZBASE_SHIFT + slwi r0, KK, 0 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + addi BO, BO, - 2 * SIZE + fpmr f1, f0 +#endif +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + + srawi. r0, TEMP, 2 + fpmr f2, f0 + mtspr CTR, r0 + fpmr f3, f0 + ble .L1064 + +#else + srawi. 
r0, K, 2 + fpmr f1, f0 + addi BO, B, - 2 * SIZE + fpmr f2, f0 + mtspr CTR, r0 + fpmr f3, f0 + ble .L1064 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX B3, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX B4, BO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + bdz- .L1063 + .align 4 + +.L1062: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f2, B1, A1, f2 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f3, B1, A2, f3 + LFPDUX A2, AO, INC2 + LFPDUX B1, BO, INC2 + + FXCPMADD f0, B2, A3, f0 + FXCSMADD f2, B2, A3, f2 + LFPDUX A3, AO, INC2 + FXCPMADD f1, B2, A4, f1 + FXCSMADD f3, B2, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + + FXCPMADD f0, B3, A5, f0 + FXCSMADD f2, B3, A5, f2 + LFPDUX A5, AO, INC2 + FXCPMADD f1, B3, A6, f1 + FXCSMADD f3, B3, A6, f3 + LFPDUX A6, AO, INC2 + LFPDUX B3, BO, INC2 + + FXCPMADD f0, B4, A7, f0 + FXCSMADD f2, B4, A7, f2 + LFPDUX A7, AO, INC2 + FXCPMADD f1, B4, A8, f1 + FXCSMADD f3, B4, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B4, BO, INC2 + bdnz+ .L1062 + .align 4 + +.L1063: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f2, B1, A1, f2 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f3, B1, A2, f3 + + FXCPMADD f0, B2, A3, f0 + FXCSMADD f2, B2, A3, f2 + FXCPMADD f1, B2, A4, f1 + FXCSMADD f3, B2, A4, f3 + + FXCPMADD f0, B3, A5, f0 + FXCSMADD f2, B3, A5, f2 + FXCPMADD f1, B3, A6, f1 + FXCSMADD f3, B3, A6, f3 + + FXCPMADD f0, B4, A7, f0 + FXCSMADD f2, B4, A7, f2 + FXCPMADD f1, B4, A8, f1 + FXCSMADD f3, B4, A8, f3 + .align 4 + +.L1064: + li r0, ALPHA + lfpdx AP, SP, r0 +#ifdef TRMMKERNEL + li r0, FZERO + lfpsx f30, SP, r0 +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + andi. r0, TEMP, 3 + mtspr CTR, r0 +#else + andi. 
r0, K, 3 + mtspr CTR, r0 +#endif + ble+ .L1068 + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + bdz- .L1067 + .align 4 + +.L1066: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f2, B1, A1, f2 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f3, B1, A2, f3 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + bdnz+ .L1066 + .align 4 + +.L1067: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f2, B1, A1, f2 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f3, B1, A2, f3 + .align 4 + +.L1068: +#ifndef TRMMKERNEL + LFDUX A1, CO1, INC + LFDUX A2, CO1, INC2 + LFSDUX A1, CO1, INCM1 + LFSDUX A2, CO1, INC2 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RN) || defined(RT) || defined(CN) || defined(CT) + fpadd f0, f0, f2 + fpadd f1, f1, f3 +#else + fpsub f0, f0, f2 + fpsub f1, f1, f3 +#endif + +#ifndef TRMMKERNEL + fxcpmadd A1, f0, AP, A1 + fxcpmadd A2, f1, AP, A2 + fxcxnpma f0, f0, AP, A1 + fxcxnpma f1, f1, AP, A2 + + STFDUX f0, CO1, INCM3 + STFSDUX f0, CO1, INC + + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC +#else + fxcpmadd f12, f0, AP, f30 + fxcpmadd f13, f1, AP, f30 + fxcxnpma f0, f0, AP, f12 + fxcxnpma f1, f1, AP, f13 + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC +#endif + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -1 +#endif + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L1070: + andi. I, M, 1 + beq .L1089 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + addi BO, B, - 2 * SIZE + fpmr f1, f0 +#else + slwi TEMP, KK, 0 + ZBASE_SHIFT + slwi r0, KK, 0 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, B, r0 + + addi BO, BO, - 2 * SIZE + fpmr f1, f0 +#endif +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + srawi. r0, TEMP, 3 + fpmr f2, f0 + mtspr CTR, r0 + fpmr f3, f0 + ble .L1074 +#else + addi BO, B, - 2 * SIZE + fpmr f1, f0 + srawi. 
r0, K, 3 + fpmr f2, f0 + mtspr CTR, r0 + fpmr f3, f0 + ble .L1074 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + + LFPDUX A5, AO, INC2 + LFPDUX B5, BO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX B6, BO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A9, BO, INC2 + LFPDUX A8, AO, INC2 + LFPDUX A10, BO, INC2 + bdz- .L1073 + .align 4 + +.L1072: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + FXCPMADD f2, B2, A2, f2 + FXCSMADD f3, B2, A2, f3 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + + FXCPMADD f0, B3, A3, f0 + FXCSMADD f1, B3, A3, f1 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + FXCPMADD f2, B4, A4, f2 + FXCSMADD f3, B4, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + + FXCPMADD f0, B5, A5, f0 + FXCSMADD f1, B5, A5, f1 + LFPDUX A5, AO, INC2 + LFPDUX B5, BO, INC2 + FXCPMADD f2, B6, A6, f2 + FXCSMADD f3, B6, A6, f3 + LFPDUX A6, AO, INC2 + LFPDUX B6, BO, INC2 + + FXCPMADD f0, A9, A7, f0 + FXCSMADD f1, A9, A7, f1 + LFPDUX A7, AO, INC2 + LFPDUX A9, BO, INC2 + FXCPMADD f2, A10, A8, f2 + FXCSMADD f3, A10, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX A10, BO, INC2 + + bdnz+ .L1072 + .align 4 + +.L1073: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + FXCPMADD f2, B2, A2, f2 + FXCSMADD f3, B2, A2, f3 + + FXCPMADD f0, B3, A3, f0 + FXCSMADD f1, B3, A3, f1 + FXCPMADD f2, B4, A4, f2 + FXCSMADD f3, B4, A4, f3 + + FXCPMADD f0, B5, A5, f0 + FXCSMADD f1, B5, A5, f1 + FXCPMADD f2, B6, A6, f2 + FXCSMADD f3, B6, A6, f3 + + FXCPMADD f0, A9, A7, f0 + FXCSMADD f1, A9, A7, f1 + FXCPMADD f2, A10, A8, f2 + FXCSMADD f3, A10, A8, f3 + .align 4 + +.L1074: + li r0, ALPHA + lfpdx AP, SP, r0 +#ifdef TRMMKERNEL + li r0, FZERO + lfpsx f30, SP, r0 +#endif + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + andi. r0, TEMP, 7 + mtspr CTR, r0 +#else + andi. 
r0, K, 7 + mtspr CTR, r0 +#endif + ble+ .L1078 + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + bdz- .L1077 + .align 4 + +.L1076: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + bdnz+ .L1076 + .align 4 + +.L1077: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + .align 4 + +.L1078: +#ifndef TRMMKERNEL + LFDUX A1, CO1, INC + LFDUX A2, CO1, INC +#endif + + fpadd f0, f0, f2 + fpadd f1, f1, f3 + + fsmfp A1, A2 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RN) || defined(RT) || defined(CN) || defined(CT) + fpadd f0, f0, f1 +#else + fpsub f0, f0, f1 +#endif + +#ifndef TRMMKERNEL + fxcpmadd A1, f0, AP, A1 + fxcxnpma f0, f0, AP, A1 + + STFDUX f0, CO1, INCM1 + STFSDUX f0, CO1, INC +#else + fxcpmadd f12, f0, AP, f30 + fxcxnpma f0, f0, AP, f12 + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L1089: + addi B, BO, 2 * SIZE + .align 4 + +.L10999: + addi SP, SP, 20 + + lwzu r14, 4(SP) + lwzu r15, 4(SP) + + lwzu r16, 4(SP) + lwzu r17, 4(SP) + lwzu r18, 4(SP) + lwzu r19, 4(SP) + + lwzu r20, 4(SP) + lwzu r21, 4(SP) + lwzu r22, 4(SP) + lwzu r23, 4(SP) + + lwzu r24, 4(SP) + lwzu r25, 4(SP) + lwzu r26, 4(SP) + lwzu r27, 4(SP) + + lwzu r28, 4(SP) + lwzu r29, 4(SP) + lwzu r30, 4(SP) + lwzu r31, 4(SP) + + subi SP, SP, 12 + li r0, 16 + + lfpdux f31, SP, r0 + lfpdux f30, SP, r0 + lfpdux f29, SP, r0 + lfpdux f28, SP, r0 + lfpdux f27, SP, r0 + lfpdux f26, SP, r0 + lfpdux f25, SP, r0 + lfpdux f24, SP, r0 + lfpdux f23, SP, r0 + lfpdux f22, SP, r0 + lfpdux f21, SP, r0 + lfpdux f20, SP, r0 + lfpdux f19, SP, r0 + lfpdux f18, SP, r0 + lfpdux f17, SP, r0 + lfpdux f16, SP, r0 + lfpdux f15, SP, r0 + lfpdux f14, SP, r0 + addi SP, SP, 16 + blr + .align 4 + + + EPILOGUE +#endif diff --git a/kernel/power/zgemm_kernel_power3.S b/kernel/power/zgemm_kernel_power3.S new file mode 100644 index 0000000000..716fa885f4 --- /dev/null +++ b/kernel/power/zgemm_kernel_power3.S @@ -0,0 +1,1260 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA_R 296(SP) +#define ALPHA_I 304(SP) +#define FZERO 312(SP) +#else +#define STACKSIZE 256 +#define ALPHA_R 224(SP) +#define ALPHA_I 232(SP) +#define FZERO 240(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#endif +#endif + +#define I r24 +#define J r25 +#define AO r26 +#define BO r27 +#define CO1 r28 +#define CO2 r29 + +#define PREA r30 +#define PREC r31 +#define PREB PREA + +#ifndef NEEDPARAM + +#ifndef DOUBLE +#include "../cparam.h" +#else +#include "../zparam.h" +#endif + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) +#endif + + stfd f1, ALPHA_R + stfd f2, ALPHA_I + stw r0, FZERO + +#ifdef linux +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, 56 + STACKSIZE(SP) + lwz C, 60 + STACKSIZE(SP) + lwz LDC, 64 + STACKSIZE(SP) +#else + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif +#endif + + slwi LDC, LDC, ZBASE_SHIFT + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + +#ifndef PREFETCHTEST +#ifdef PPC970 + li PREC, 4 * SIZE +#endif +#ifdef POWER4 + li PREC, 4 * SIZE /* is 12 best? */ +#endif +#ifdef POWER5 + li PREC, 4 * SIZE /* is 12 best? 
*/ +#endif +#else + +#ifdef linux +#ifndef __64BIT__ + lwz PREA, 16 + STACKSIZE(SP) + lwz PREC, 20 + STACKSIZE(SP) +#else + ld PREA, 136 + STACKSIZE(SP) + ld PREC, 144 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld PREA, 136 + STACKSIZE(SP) + ld PREC, 144 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz PREA, 72 + STACKSIZE(SP) + lwz PREC, 76 + STACKSIZE(SP) +#else + lwz PREA, 68 + STACKSIZE(SP) + lwz PREC, 72 + STACKSIZE(SP) +#endif +#endif +#endif + +#endif + +#ifndef PREFETCHTEST +#ifdef PPC970 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 5 * SIZE + 16) +#else + li PREA, (16 * 9 * SIZE + 16) +#endif +#endif +#ifdef POWER4 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 1 * SIZE + 16) +#else + li PREA, (16 * 2 * SIZE + 16) +#endif +#endif +#ifdef POWER5 + li PREA, 16 * 9 * SIZE +#endif +#endif + + lfs f0, FZERO + + srawi. J, N, 1 + ble LL(KERNEL_N_AND_3_HEAD) + .align 4 + +LL(KERNEL_MainHead): + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + mr CO1, C + add CO2, C, LDC + add C, CO2, LDC + + srawi. I, M, 1 + mr AO, A + ble LL(KERNEL_M_AND_3) + .align 4 + +LL(KERNEL_MainSubHead): + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + srawi. r0, K, 2 + mr BO, B + mtspr CTR, r0 + ble LL(KERNEL_K_AND_7) + .align 4 + +LL(KERNEL_MainLoop): + fmadd f0, f16, f20, f0 + fmadd f4, f16, f21, f4 + LFD f28, 4 * SIZE(BO) + fmadd f8, f16, f22, f8 + fmadd f12, f16, f23, f12 + LFD f16, 8 * SIZE(AO) + + fmadd f1, f17, f20, f1 + fmadd f5, f17, f21, f5 + LFD f29, 5 * SIZE(BO) + fmadd f9, f17, f22, f9 + fmadd f13, f17, f23, f13 + LFD f17, 9 * SIZE(AO) + + fmadd f2, f18, f20, f2 + fmadd f6, f18, f21, f6 + LFD f30, 6 * SIZE(BO) + fmadd f10, f18, f22, f10 + fmadd f14, f18, f23, f14 + LFD f18, 10 * SIZE(AO) + + fmadd f3, f19, f20, f3 + fmadd f7, f19, f21, f7 + LFD f31, 7 * SIZE(BO) + fmadd f11, f19, f22, f11 + fmadd f15, f19, f23, f15 + LFD f19, 11 * SIZE(AO) + + fmadd f0, f24, f28, f0 + fmadd f4, f24, f29, f4 + LFD f20, 8 * SIZE(BO) + fmadd f8, f24, f30, f8 + fmadd f12, f24, f31, f12 + LFD f24, 12 * SIZE(AO) + + fmadd f1, f25, f28, f1 + fmadd f5, f25, f29, f5 + LFD f21, 9 * SIZE(BO) + fmadd f9, f25, f30, f9 + fmadd f13, f25, f31, f13 + LFD f25, 13 * SIZE(AO) + + fmadd f2, f26, f28, f2 + fmadd f6, f26, f29, f6 + LFD f22, 10 * SIZE(BO) + fmadd f10, f26, f30, f10 + fmadd f14, f26, f31, f14 + LFD f26, 14 * SIZE(AO) + + fmadd f3, f27, f28, f3 + fmadd f7, f27, f29, f7 + LFD f23, 11 * SIZE(BO) + fmadd f11, f27, f30, f11 + fmadd f15, f27, f31, f15 + LFD f27, 15 * SIZE(AO) + + fmadd f0, f16, f20, f0 + fmadd f4, f16, f21, f4 + LFD f28, 12 * SIZE(BO) + fmadd f8, f16, f22, f8 + fmadd f12, f16, f23, f12 + LFDU f16, 16 * SIZE(AO) + + fmadd f1, f17, f20, f1 + fmadd f5, f17, f21, f5 + LFD f29, 13 * SIZE(BO) + fmadd f9, f17, f22, f9 + fmadd f13, f17, f23, f13 + LFD f17, 1 * SIZE(AO) + + fmadd f2, f18, f20, f2 + fmadd f6, f18, f21, f6 + LFD f30, 14 * SIZE(BO) + fmadd f10, f18, f22, f10 + fmadd f14, f18, f23, f14 + LFD f18, 2 * SIZE(AO) + + fmadd f3, f19, f20, f3 + fmadd f7, f19, f21, f7 + LFD f31, 15 * SIZE(BO) + fmadd f11, f19, f22, f11 + fmadd f15, f19, f23, f15 + LFD f19, 3 * SIZE(AO) + + fmadd f0, f24, f28, f0 
+ fmadd f4, f24, f29, f4 + LFDU f20, 16 * SIZE(BO) + fmadd f8, f24, f30, f8 + fmadd f12, f24, f31, f12 + LFD f24, 4 * SIZE(AO) + + fmadd f1, f25, f28, f1 + fmadd f5, f25, f29, f5 + LFD f21, 1 * SIZE(BO) + fmadd f9, f25, f30, f9 + fmadd f13, f25, f31, f13 + LFD f25, 5 * SIZE(AO) + + fmadd f2, f26, f28, f2 + fmadd f6, f26, f29, f6 + LFD f22, 2 * SIZE(BO) + fmadd f10, f26, f30, f10 + fmadd f14, f26, f31, f14 + LFD f26, 6 * SIZE(AO) + + fmadd f3, f27, f28, f3 + fmadd f7, f27, f29, f7 + LFD f23, 3 * SIZE(BO) + + fmadd f11, f27, f30, f11 + fmadd f15, f27, f31, f15 + LFD f27, 7 * SIZE(AO) + bdnz LL(KERNEL_MainLoop) + .align 4 + +LL(KERNEL_K_AND_7): + andi. r0, K, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR, r0 + ble LL(KERNEL_MainFinish) + .align 4 + +LL(KERNEL_SubLoop): + fmadd f0, f16, f20, f0 + fmadd f4, f16, f21, f4 + fmadd f8, f16, f22, f8 + fmadd f12, f16, f23, f12 + LFD f16, 4 * SIZE(AO) + + fmadd f1, f17, f20, f1 + fmadd f5, f17, f21, f5 + fmadd f9, f17, f22, f9 + fmadd f13, f17, f23, f13 + LFD f17, 5 * SIZE(AO) + + fmadd f2, f18, f20, f2 + fmadd f6, f18, f21, f6 + fmadd f10, f18, f22, f10 + fmadd f14, f18, f23, f14 + LFD f18, 6 * SIZE(AO) + + fmadd f3, f19, f20, f3 + LFD f20, 4 * SIZE(BO) + fmadd f7, f19, f21, f7 + LFD f21, 5 * SIZE(BO) + fmadd f11, f19, f22, f11 + LFD f22, 6 * SIZE(BO) + fmadd f15, f19, f23, f15 + LFD f19, 7 * SIZE(AO) + + LFD f23, 7 * SIZE(BO) + addi BO, BO, 4 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(KERNEL_SubLoop) + .align 4 + +LL(KERNEL_MainFinish): + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(CC) || defined(CR) || defined(RC) || defined(RR) + + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 + + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) + + FSUB f8, f8, f13 + FADD f9, f9, f12 + FSUB f10, f10, f15 + FADD f11, f11, f14 + +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + + FADD f0, f0, f5 + FSUB f1, f1, f4 + FADD f2, f2, f7 + FSUB f3, f3, f6 + + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) + + FADD f8, f8, f13 + FSUB f9, f9, f12 + FADD f10, f10, f15 + FSUB f11, f11, f14 + +#else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */ + + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 + + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) + + FADD f8, f8, f13 + FSUB f9, f12, f9 + FADD f10, f10, f15 + FSUB f11, f14, f11 + +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + FMADD f16, f30, f0, f16 + FMADD f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FMADD f19, f30, f3, f19 + + FMADD f20, f30, f8, f20 + FMADD f21, f30, f9, f21 + FMADD f22, f30, f10, f22 + FMADD f23, f30, f11, f23 + + FNMSUB f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FNMSUB f18, f31, f3, f18 + FMADD f19, f31, f2, f19 + + FNMSUB f20, f31, f9, f20 + FMADD f21, f31, f8, f21 + FNMSUB f22, f31, f11, f22 + FMADD f23, f31, f10, f23 + +#else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ + /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ + /* defined(RC)|| defined(RR) */ + + FMADD f16, f30, f0, f16 + FNMSUB f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FNMSUB f19, f30, f3, f19 + + FMADD f20, f30, f8, f20 + FNMSUB f21, f30, f9, f21 + FMADD f22, f30, f10, f22 + FNMSUB f23, f30, f11, f23 
+ + FMADD f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FMADD f18, f31, f3, f18 + FMADD f19, f31, f2, f19 + + FMADD f20, f31, f9, f20 + FMADD f21, f31, f8, f21 + FMADD f22, f31, f11, f22 + FMADD f23, f31, f10, f23 + +#endif + + STFD f16, 0 * SIZE(CO1) + STFD f17, 1 * SIZE(CO1) + STFD f18, 2 * SIZE(CO1) + STFD f19, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + STFD f20, 0 * SIZE(CO2) + STFD f21, 1 * SIZE(CO2) + STFD f22, 2 * SIZE(CO2) + STFD f23, 3 * SIZE(CO2) + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + + addic. I, I, -1 + bgt LL(KERNEL_MainSubHead) + .align 4 + +LL(KERNEL_M_AND_3): + andi. I, M, 1 + ble LL(KERNEL_MainTail) + .align 4 + +LL(KERNEL_M_AND_3_SubHead): + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, K, 2 + mr BO, B + mtspr CTR, r0 + ble LL(KERNEL_M_AND_3_K_AND_3) + .align 4 + +LL(KERNEL_M_AND_3_MainLoop): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi BO, BO, 16 * SIZE + addi AO, AO, 8 * SIZE + bdnz LL(KERNEL_M_AND_3_MainLoop) + .align 4 + +LL(KERNEL_M_AND_3_K_AND_3): + andi. 
r0, K, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR, r0 + ble LL(KERNEL_M_AND3_Finish) + .align 4 + +LL(KERNEL_M_AND_3_SubLoop): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + addi AO, AO, 2 * SIZE + addi BO, BO, 4 * SIZE + bdnz LL(KERNEL_M_AND_3_SubLoop) + .align 4 + +LL(KERNEL_M_AND3_Finish): +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(CC) || defined(CR) || defined(RC) || defined(RR) + + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 + +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 + +#else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */ + + FADD f0, f0, f5 + FSUB f1, f1, f4 + FADD f2, f2, f7 + FSUB f3, f3, f6 + +#endif + + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + + LFD f18, 0 * SIZE(CO2) + LFD f19, 1 * SIZE(CO2) + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + FMADD f16, f30, f0, f16 + FMADD f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FMADD f19, f30, f3, f19 + FNMSUB f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FNMSUB f18, f31, f3, f18 + FMADD f19, f31, f2, f19 + +#else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ + /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ + /* defined(RC)|| defined(RR) */ + + FMADD f16, f30, f0, f16 + FNMSUB f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FNMSUB f19, f30, f3, f19 + + FMADD f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FMADD f18, f31, f3, f18 + FMADD f19, f31, f2, f19 +#endif + + STFD f16, 0 * SIZE(CO1) + STFD f17, 1 * SIZE(CO1) + STFD f18, 0 * SIZE(CO2) + STFD f19, 1 * SIZE(CO2) + + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + + addic. I, I, -1 + bgt LL(KERNEL_M_AND_3_SubHead) + .align 4 + +LL(KERNEL_MainTail): + mr B, BO + addic. J, J, -1 + lfs f0, FZERO + bgt LL(KERNEL_MainHead) + .align 4 + +LL(KERNEL_N_AND_3_HEAD): + andi. J, N, 1 + ble LL(999) + .align 4 + +LL(KERNEL_N_AND_3_MainHead): + srawi. I, M, 1 + mr CO1, C + add C, C, LDC + mr AO, A + + ble LL(KERNEL_MN_AND_3_Head) + .align 4 + +LL(KERNEL_N_AND_3_SubHead): + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(B) + LFD f17, 1 * SIZE(B) + LFD f18, 2 * SIZE(B) + LFD f19, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. 
r0, K, 2 + mr BO, B + mtspr CTR, r0 + ble LL(KERNEL_N_AND_3_K_AND_3) + .align 4 + +LL(KERNEL_N_AND_3_MainLoop): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 8 * SIZE(AO) + LFD f21, 9 * SIZE(AO) + LFD f22, 10 * SIZE(AO) + LFD f23, 11 * SIZE(AO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f24, 12 * SIZE(AO) + LFD f25, 13 * SIZE(AO) + LFD f26, 14 * SIZE(AO) + LFD f27, 15 * SIZE(AO) + + LFD f16, 4 * SIZE(BO) + LFD f17, 5 * SIZE(BO) + LFD f18, 6 * SIZE(BO) + LFD f19, 7 * SIZE(BO) + + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 16 * SIZE(AO) + LFD f21, 17 * SIZE(AO) + LFD f22, 18 * SIZE(AO) + LFD f23, 19 * SIZE(AO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f24, 20 * SIZE(AO) + LFD f25, 21 * SIZE(AO) + LFD f26, 22 * SIZE(AO) + LFD f27, 23 * SIZE(AO) + + LFD f16, 8 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 10 * SIZE(BO) + LFD f19, 11 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 8 * SIZE + dcbt PREA, AO + dcbt PREA, BO + bdnz LL(KERNEL_N_AND_3_MainLoop) + .align 4 + +LL(KERNEL_N_AND_3_K_AND_3): + andi. r0, K, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR, r0 + ble LL(KERNEL_N_AND_3_Finish) + .align 4 + +LL(KERNEL_N_AND_3_SubLoop): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + LFD f16, 2 * SIZE(BO) + LFD f17, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(KERNEL_N_AND_3_SubLoop) + .align 4 + +LL(KERNEL_N_AND_3_Finish): +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(CC) || defined(CR) || defined(RC) || defined(RR) + + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 + +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + + FADD f0, f0, f5 + FSUB f1, f1, f4 + FADD f2, f2, f7 + FSUB f3, f3, f6 + +#else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */ + + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 + +#endif + + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + FMADD f16, f30, f0, f16 + FMADD f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FMADD f19, f30, f3, f19 + FNMSUB f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FNMSUB f18, f31, f3, f18 + FMADD f19, f31, f2, f19 + +#else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ + /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ + /* defined(RC)|| defined(RR) */ + + FMADD f16, f30, f0, f16 + FNMSUB f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FNMSUB f19, f30, f3, f19 + + FMADD f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FMADD f18, f31, f3, 
f18 + FMADD f19, f31, f2, f19 +#endif + + STFD f16, 0 * SIZE(CO1) + STFD f17, 1 * SIZE(CO1) + STFD f18, 2 * SIZE(CO1) + STFD f19, 3 * SIZE(CO1) + + addi CO1, CO1, 4 * SIZE + + addic. I, I, -1 + bgt LL(KERNEL_N_AND_3_SubHead) + .align 4 + +LL(KERNEL_MN_AND_3_Head): + andi. I, M, 1 + ble LL(KERNEL_SubEnd) + .align 4 + +LL(KERNEL_MN_AND_3_SubHead): + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, K, 2 + mr BO, B + mtspr CTR, r0 + ble LL(KERNEL_MN_AND_3_K_AND_3) + .align 4 + +LL(KERNEL_MN_AND_3_MainLoop): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f17, f20, f2 + fmadd f3, f16, f21, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + + fmadd f4, f18, f22, f4 + fmadd f5, f19, f23, f5 + fmadd f6, f19, f22, f6 + fmadd f7, f18, f23, f7 + + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f17, f20, f2 + fmadd f3, f16, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + + fmadd f4, f18, f22, f4 + fmadd f5, f19, f23, f5 + fmadd f6, f19, f22, f6 + fmadd f7, f18, f23, f7 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(KERNEL_MN_AND_3_MainLoop) + .align 4 + +LL(KERNEL_MN_AND_3_K_AND_3): + fadd f0, f0, f4 + fadd f1, f1, f5 + fadd f2, f2, f6 + fadd f3, f3, f7 + + andi. r0, K, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR,r0 + ble LL(KERNEL_MN_AND_3_Finish) + .align 4 + +LL(KERNEL_MN_AND_3_SubLoop): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f17, f20, f2 + fmadd f3, f16, f21, f3 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 2 * SIZE + + bdnz LL(KERNEL_MN_AND_3_SubLoop) + .align 4 + +LL(KERNEL_MN_AND_3_Finish): +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(CC) || defined(CR) || defined(RC) || defined(RR) + fsub f0, f0, f1 + fadd f2, f2, f3 +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + fadd f0, f0, f1 + fsub f2, f2, f3 +#else + fadd f0, f0, f1 + fsub f2, f3, f2 +#endif + + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + FMADD f16, f30, f0, f16 + FMADD f17, f30, f2, f17 + + FNMSUB f16, f31, f2, f16 + FMADD f17, f31, f0, f17 + +#else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ + /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ + /* defined(RC) || defined(RR) */ + + FMADD f16, f30, f0, f16 + FNMSUB f17, f30, f2, f17 + + FMADD f16, f31, f2, f16 + FMADD f17, f31, f0, f17 + +#endif + + + STFD f16, 0 * SIZE(CO1) + STFD f17, 1 * SIZE(CO1) + + addi CO1, CO1, 2 * SIZE + addic. I, I, -1 + bgt LL(KERNEL_MN_AND_3_SubHead) + .align 4 + +LL(KERNEL_SubEnd): + mr B, BO + addic. 
J, J, -1 + bgt LL(KERNEL_N_AND_3_MainHead) + .align 4 + +LL(999): + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/zgemm_kernel_power6.S b/kernel/power/zgemm_kernel_power6.S new file mode 100644 index 0000000000..7f677dfecd --- /dev/null +++ b/kernel/power/zgemm_kernel_power6.S @@ -0,0 +1,2937 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA_R 296(SP) +#define ALPHA_I 304(SP) +#define FZERO 312(SP) +#else +#define STACKSIZE 256 +#define ALPHA_R 224(SP) +#define ALPHA_I 232(SP) +#define FZERO 240(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#define OFFSET r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#define TEMP r19 +#define KK r20 +#define BB r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO1 r26 +#define CO2 r27 +#define CO3 r28 +#define CO4 r29 + +#define PREA r30 +#define PREC r31 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define FMA1 FMADD +#define FMA2 FMADD +#define FMA3 FNMSUB +#define FMA4 FMADD +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define FMA1 FMADD +#define FMA2 FNMSUB +#define FMA3 FMADD +#define FMA4 FMADD +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define FMA1 FMADD +#define FMA2 FMADD +#define FMA3 FMADD +#define FMA4 FNMSUB +#else +#define FMA1 FMADD +#define FMA2 FNMSUB +#define FMA3 FNMSUB +#define FMA4 FNMSUB +#endif + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) +#ifdef TRMMKERNEL + std r20, 232(SP) + std r19, 240(SP) +#endif +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) +#ifdef TRMMKERNEL + stw r20, 188(SP) + stw r19, 192(SP) +#endif +#endif + + stfd f1, ALPHA_R + stfd f2, ALPHA_I + stw r0, FZERO + +#ifdef linux +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, 56 + STACKSIZE(SP) + lwz C, 60 + STACKSIZE(SP) + lwz LDC, 64 + STACKSIZE(SP) +#else + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#ifdef TRMMKERNEL +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 120 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 120 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 68 + STACKSIZE(SP) +#else + lwz OFFSET, 60 + STACKSIZE(SP) +#endif +#endif +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif +#endif + + slwi LDC, LDC, ZBASE_SHIFT + li PREA, (16 * 3) 
* SIZE + li PREC, 3 * SIZE + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + + srawi. J, N, 2 + ble LL(30) + .align 4 + +LL(10): + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + add C, CO4, LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + slwi BB, K, ZBASE_SHIFT + 2 + mr AO, A + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + srawi. I, M, 1 + ble LL(20) + .align 4 + +LL(11): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(AO) + LFD f20, 0 * SIZE(B) + LFD f17, 1 * SIZE(AO) + LFD f21, 1 * SIZE(B) + LFD f18, 2 * SIZE(AO) + LFD f22, 2 * SIZE(B) + LFD f19, 3 * SIZE(AO) + LFD f23, 3 * SIZE(B) + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + dcbtst CO1, PREC + dcbtst CO2, PREC + dcbtst CO3, PREC + dcbtst CO4, PREC + + srawi. r0, K, 3 + mr BO, B + mtspr CTR, r0 + ble LL(15) +#else +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f20, 0 * SIZE(B) + LFD f17, 1 * SIZE(AO) + LFD f21, 1 * SIZE(B) + LFD f18, 2 * SIZE(AO) + LFD f22, 2 * SIZE(B) + LFD f19, 3 * SIZE(AO) + LFD f23, 3 * SIZE(B) + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 1 + ZBASE_SHIFT + slwi TEMP, KK, 2 + ZBASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f20, 0 * SIZE(BO) + LFD f17, 1 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + LFD f18, 2 * SIZE(AO) + LFD f22, 2 * SIZE(BO) + LFD f19, 3 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) +#endif + + dcbtst CO1, PREC + dcbtst CO2, PREC + dcbtst CO3, PREC + dcbtst CO4, PREC + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 4 +#endif + srawi. 
TEMP, TEMP, 3 + mtspr CTR, TEMP + ble LL(15) +#endif + .align 4 + +LL(12): + dcbt AO, PREA + FMA1 f0, f16, f20, f0 + nop + FMA1 f2, f18, f20, f2 + + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + LFD f28, 4 * SIZE(AO) + LFD f29, 5 * SIZE(AO) + + LFD f30, 6 * SIZE(AO) + LFD f31, 7 * SIZE(AO) + FMA1 f4, f16, f22, f4 + FMA1 f6, f18, f22, f6 + + FMA2 f5, f16, f23, f5 + FMA2 f7, f18, f23, f7 + + FMA1 f8, f16, f24, f8 + FMA1 f10, f18, f24, f10 + FMA2 f9, f16, f25, f9 + FMA2 f11, f18, f25, f11 + + FMA1 f12, f16, f26, f12 + FMA1 f14, f18, f26, f14 + FMA2 f13, f16, f27, f13 + FMA2 f15, f18, f27, f15 + + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 + + FMA4 f5, f17, f22, f5 + FMA4 f7, f19, f22, f7 + FMA3 f4, f17, f23, f4 + FMA3 f6, f19, f23, f6 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMA4 f9, f17, f24, f9 + FMA4 f11, f19, f24, f11 + FMA3 f8, f17, f25, f8 + FMA3 f10, f19, f25, f10 + + FMA4 f13, f17, f26, f13 + FMA4 f15, f19, f26, f15 + FMA3 f12, f17, f27, f12 + FMA3 f14, f19, f27, f14 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMA1 f0, f28, f20, f0 + FMA1 f2, f30, f20, f2 + FMA2 f1, f28, f21, f1 + FMA2 f3, f30, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMA1 f4, f28, f22, f4 + FMA1 f6, f30, f22, f6 + FMA2 f5, f28, f23, f5 + FMA2 f7, f30, f23, f7 + + FMA1 f8, f28, f24, f8 + FMA1 f10, f30, f24, f10 + FMA2 f9, f28, f25, f9 + FMA2 f11, f30, f25, f11 + + FMA1 f12, f28, f26, f12 + FMA1 f14, f30, f26, f14 + FMA2 f13, f28, f27, f13 + FMA2 f15, f30, f27, f15 + + FMA4 f1, f29, f20, f1 + FMA4 f3, f31, f20, f3 + FMA3 f0, f29, f21, f0 + FMA3 f2, f31, f21, f2 + + FMA4 f5, f29, f22, f5 + FMA4 f7, f31, f22, f7 + FMA3 f4, f29, f23, f4 + FMA3 f6, f31, f23, f6 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMA4 f9, f29, f24, f9 + FMA4 f11, f31, f24, f11 + FMA3 f8, f29, f25, f8 + FMA3 f10, f31, f25, f10 + + FMA4 f13, f29, f26, f13 + FMA4 f15, f31, f26, f15 + FMA3 f12, f29, f27, f12 + FMA3 f14, f31, f27, f14 + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + LFD f28, 12 * SIZE(AO) + LFD f29, 13 * SIZE(AO) + LFD f30, 14 * SIZE(AO) + LFD f31, 15 * SIZE(AO) + + FMA1 f4, f16, f22, f4 + FMA1 f6, f18, f22, f6 + FMA2 f5, f16, f23, f5 + FMA2 f7, f18, f23, f7 + + FMA1 f8, f16, f24, f8 + FMA1 f10, f18, f24, f10 + FMA2 f9, f16, f25, f9 + FMA2 f11, f18, f25, f11 + + FMA1 f12, f16, f26, f12 + FMA1 f14, f18, f26, f14 + FMA2 f13, f16, f27, f13 + FMA2 f15, f18, f27, f15 + + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 + + FMA4 f5, f17, f22, f5 + FMA4 f7, f19, f22, f7 + FMA3 f4, f17, f23, f4 + FMA3 f6, f19, f23, f6 + + LFD f20, 24 * SIZE(BO) + LFD f21, 25 * SIZE(BO) + LFD f22, 26 * SIZE(BO) + LFD f23, 27 * SIZE(BO) + + FMA4 f9, f17, f24, f9 + FMA4 f11, f19, f24, f11 + FMA3 f8, f17, f25, f8 + FMA3 f10, f19, f25, f10 + + FMA4 f13, f17, f26, f13 + FMA4 f15, f19, f26, f15 + FMA3 f12, f17, f27, f12 + FMA3 f14, f19, f27, f14 + + LFD f24, 28 * SIZE(BO) + LFD f25, 29 * SIZE(BO) + LFD f26, 30 * SIZE(BO) + LFD f27, 31 * SIZE(BO) + + FMA1 f0, f28, f20, f0 + FMA1 f2, f30, f20, f2 + FMA2 f1, f28, f21, f1 + FMA2 f3, f30, f21, f3 + + LFD f16, 16 * 
SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + FMA1 f4, f28, f22, f4 + FMA1 f6, f30, f22, f6 + FMA2 f5, f28, f23, f5 + FMA2 f7, f30, f23, f7 + + FMA1 f8, f28, f24, f8 + FMA1 f10, f30, f24, f10 + FMA2 f9, f28, f25, f9 + FMA2 f11, f30, f25, f11 + + FMA1 f12, f28, f26, f12 + FMA1 f14, f30, f26, f14 + FMA2 f13, f28, f27, f13 + FMA2 f15, f30, f27, f15 + + FMA4 f1, f29, f20, f1 + FMA4 f3, f31, f20, f3 + FMA3 f0, f29, f21, f0 + FMA3 f2, f31, f21, f2 + + FMA4 f5, f29, f22, f5 + FMA4 f7, f31, f22, f7 + FMA3 f4, f29, f23, f4 + FMA3 f6, f31, f23, f6 + + LFD f20, 32 * SIZE(BO) + LFD f21, 33 * SIZE(BO) + LFD f22, 34 * SIZE(BO) + LFD f23, 35 * SIZE(BO) + + FMA4 f9, f29, f24, f9 + FMA4 f11, f31, f24, f11 + FMA3 f8, f29, f25, f8 + FMA3 f10, f31, f25, f10 + + FMA4 f13, f29, f26, f13 + FMA4 f15, f31, f26, f15 + FMA3 f12, f29, f27, f12 + FMA3 f14, f31, f27, f14 + + LFD f24, 36 * SIZE(BO) + LFD f25, 37 * SIZE(BO) + LFD f26, 38 * SIZE(BO) + LFD f27, 39 * SIZE(BO) + + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + LFD f28, 20 * SIZE(AO) + LFD f29, 21 * SIZE(AO) + LFD f30, 22 * SIZE(AO) + LFD f31, 23 * SIZE(AO) + + FMA1 f4, f16, f22, f4 + FMA1 f6, f18, f22, f6 + FMA2 f5, f16, f23, f5 + FMA2 f7, f18, f23, f7 + + FMA1 f8, f16, f24, f8 + FMA1 f10, f18, f24, f10 + FMA2 f9, f16, f25, f9 + FMA2 f11, f18, f25, f11 + + FMA1 f12, f16, f26, f12 + FMA1 f14, f18, f26, f14 + FMA2 f13, f16, f27, f13 + FMA2 f15, f18, f27, f15 + + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 + + FMA4 f5, f17, f22, f5 + FMA4 f7, f19, f22, f7 + FMA3 f4, f17, f23, f4 + FMA3 f6, f19, f23, f6 + + LFD f20, 40 * SIZE(BO) + LFD f21, 41 * SIZE(BO) + LFD f22, 42 * SIZE(BO) + LFD f23, 43 * SIZE(BO) + + FMA4 f9, f17, f24, f9 + FMA4 f11, f19, f24, f11 + FMA3 f8, f17, f25, f8 + FMA3 f10, f19, f25, f10 + + FMA4 f13, f17, f26, f13 + FMA4 f15, f19, f26, f15 + FMA3 f12, f17, f27, f12 + FMA3 f14, f19, f27, f14 + + LFD f24, 44 * SIZE(BO) + LFD f25, 45 * SIZE(BO) + LFD f26, 46 * SIZE(BO) + LFD f27, 47 * SIZE(BO) + + FMA1 f0, f28, f20, f0 + FMA1 f2, f30, f20, f2 + FMA2 f1, f28, f21, f1 + FMA2 f3, f30, f21, f3 + + LFD f16, 24 * SIZE(AO) + LFD f17, 25 * SIZE(AO) + LFD f18, 26 * SIZE(AO) + LFD f19, 27 * SIZE(AO) + + FMA1 f4, f28, f22, f4 + FMA1 f6, f30, f22, f6 + FMA2 f5, f28, f23, f5 + FMA2 f7, f30, f23, f7 + + FMA1 f8, f28, f24, f8 + FMA1 f10, f30, f24, f10 + FMA2 f9, f28, f25, f9 + FMA2 f11, f30, f25, f11 + + FMA1 f12, f28, f26, f12 + FMA1 f14, f30, f26, f14 + FMA2 f13, f28, f27, f13 + FMA2 f15, f30, f27, f15 + + FMA4 f1, f29, f20, f1 + FMA4 f3, f31, f20, f3 + FMA3 f0, f29, f21, f0 + FMA3 f2, f31, f21, f2 + + FMA4 f5, f29, f22, f5 + FMA4 f7, f31, f22, f7 + FMA3 f4, f29, f23, f4 + FMA3 f6, f31, f23, f6 + + LFD f20, 48 * SIZE(BO) + LFD f21, 49 * SIZE(BO) + LFD f22, 50 * SIZE(BO) + LFD f23, 51 * SIZE(BO) + + FMA4 f9, f29, f24, f9 + FMA4 f11, f31, f24, f11 + FMA3 f8, f29, f25, f8 + FMA3 f10, f31, f25, f10 + + FMA4 f13, f29, f26, f13 + FMA4 f15, f31, f26, f15 + FMA3 f12, f29, f27, f12 + FMA3 f14, f31, f27, f14 + + LFD f24, 52 * SIZE(BO) + LFD f25, 53 * SIZE(BO) + LFD f26, 54 * SIZE(BO) + LFD f27, 55 * SIZE(BO) + + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + LFD f28, 28 * SIZE(AO) + LFD f29, 29 * SIZE(AO) + LFD f30, 30 * SIZE(AO) + LFD f31, 31 * SIZE(AO) + + FMA1 f4, f16, f22, f4 + FMA1 f6, f18, f22, f6 + FMA2 f5, f16, f23, f5 + FMA2 f7, f18, f23, f7 + + FMA1 f8, f16, f24, f8 + 
FMA1 f10, f18, f24, f10 + FMA2 f9, f16, f25, f9 + FMA2 f11, f18, f25, f11 + + FMA1 f12, f16, f26, f12 + FMA1 f14, f18, f26, f14 + FMA2 f13, f16, f27, f13 + FMA2 f15, f18, f27, f15 + + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 + + FMA4 f5, f17, f22, f5 + FMA4 f7, f19, f22, f7 + FMA3 f4, f17, f23, f4 + FMA3 f6, f19, f23, f6 + + LFD f20, 56 * SIZE(BO) + LFD f21, 57 * SIZE(BO) + LFD f22, 58 * SIZE(BO) + LFD f23, 59 * SIZE(BO) + + FMA4 f9, f17, f24, f9 + FMA4 f11, f19, f24, f11 + FMA3 f8, f17, f25, f8 + FMA3 f10, f19, f25, f10 + + FMA4 f13, f17, f26, f13 + FMA4 f15, f19, f26, f15 + FMA3 f12, f17, f27, f12 + FMA3 f14, f19, f27, f14 + + LFD f24, 60 * SIZE(BO) + LFD f25, 61 * SIZE(BO) + LFD f26, 62 * SIZE(BO) + LFD f27, 63 * SIZE(BO) + + FMA1 f0, f28, f20, f0 + FMA1 f2, f30, f20, f2 + FMA2 f1, f28, f21, f1 + FMA2 f3, f30, f21, f3 + + LFD f16, 32 * SIZE(AO) + LFD f17, 33 * SIZE(AO) + LFD f18, 34 * SIZE(AO) + LFD f19, 35 * SIZE(AO) + + FMA1 f4, f28, f22, f4 + FMA1 f6, f30, f22, f6 + FMA2 f5, f28, f23, f5 + FMA2 f7, f30, f23, f7 + + FMA1 f8, f28, f24, f8 + FMA1 f10, f30, f24, f10 + FMA2 f9, f28, f25, f9 + FMA2 f11, f30, f25, f11 + + FMA1 f12, f28, f26, f12 + FMA1 f14, f30, f26, f14 + FMA2 f13, f28, f27, f13 + FMA2 f15, f30, f27, f15 + + FMA4 f1, f29, f20, f1 + FMA4 f3, f31, f20, f3 + FMA3 f0, f29, f21, f0 + FMA3 f2, f31, f21, f2 + + FMA4 f5, f29, f22, f5 + FMA4 f7, f31, f22, f7 + FMA3 f4, f29, f23, f4 + FMA3 f6, f31, f23, f6 + + LFD f20, 64 * SIZE(BO) + LFD f21, 65 * SIZE(BO) + LFD f22, 66 * SIZE(BO) + LFD f23, 67 * SIZE(BO) + + FMA4 f9, f29, f24, f9 + FMA4 f11, f31, f24, f11 + FMA3 f8, f29, f25, f8 + FMA3 f10, f31, f25, f10 + + FMA4 f13, f29, f26, f13 + FMA4 f15, f31, f26, f15 + FMA3 f12, f29, f27, f12 + FMA3 f14, f31, f27, f14 + + LFD f24, 68 * SIZE(BO) + LFD f25, 69 * SIZE(BO) + LFD f26, 70 * SIZE(BO) + LFD f27, 71 * SIZE(BO) + + addi AO, AO, 32 * SIZE + addi BO, BO, 64 * SIZE + bdnz LL(12) + .align 4 + +LL(15): + lfd f30, ALPHA_R + lfd f31, ALPHA_I + + dcbtst B, BB + addi BB, BB, 16 * SIZE + dcbtst B, BB + addi BB, BB, 16 * SIZE + +#ifndef TRMMKERNEL + andi. r0, K, 7 + mtspr CTR, r0 + ble LL(18) +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 4 +#endif + andi. 
TEMP, TEMP, 7 + mtspr CTR, TEMP + ble LL(18) +#endif + .align 4 + +LL(16): + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + FMA1 f4, f16, f22, f4 + FMA1 f6, f18, f22, f6 + FMA2 f5, f16, f23, f5 + FMA2 f7, f18, f23, f7 + + FMA1 f8, f16, f24, f8 + FMA1 f10, f18, f24, f10 + FMA2 f9, f16, f25, f9 + FMA2 f11, f18, f25, f11 + + FMA1 f12, f16, f26, f12 + FMA1 f14, f18, f26, f14 + FMA2 f13, f16, f27, f13 + FMA2 f15, f18, f27, f15 + + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 + + FMA4 f5, f17, f22, f5 + FMA4 f7, f19, f22, f7 + FMA3 f4, f17, f23, f4 + FMA3 f6, f19, f23, f6 + + FMA4 f9, f17, f24, f9 + FMA4 f11, f19, f24, f11 + FMA3 f8, f17, f25, f8 + FMA3 f10, f19, f25, f10 + + FMA4 f13, f17, f26, f13 + FMA4 f15, f19, f26, f15 + FMA3 f12, f17, f27, f12 + FMA3 f14, f19, f27, f14 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(16) + .align 4 + +LL(18): +#ifndef TRMMKERNEL + + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) + + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) + + FNMSUB f24, f31, f1, f16 + FMADD f25, f31, f0, f17 + FNMSUB f26, f31, f3, f18 + FMADD f27, f31, f2, f19 + + LFD f16, 0 * SIZE(CO3) + LFD f17, 1 * SIZE(CO3) + LFD f18, 2 * SIZE(CO3) + LFD f19, 3 * SIZE(CO3) + + FMADD f0, f30, f0, f24 + FMADD f1, f30, f1, f25 + FMADD f2, f30, f2, f26 + FMADD f3, f30, f3, f27 + + FNMSUB f24, f31, f5, f20 + FMADD f25, f31, f4, f21 + FNMSUB f26, f31, f7, f22 + FMADD f27, f31, f6, f23 + + LFD f20, 0 * SIZE(CO4) + LFD f21, 1 * SIZE(CO4) + LFD f22, 2 * SIZE(CO4) + LFD f23, 3 * SIZE(CO4) + + FMADD f4, f30, f4, f24 + FMADD f5, f30, f5, f25 + FMADD f6, f30, f6, f26 + FMADD f7, f30, f7, f27 + + FNMSUB f24, f31, f9, f16 + FMADD f25, f31, f8, f17 + FNMSUB f26, f31, f11, f18 + FMADD f27, f31, f10, f19 + + FMADD f8, f30, f8, f24 + FMADD f9, f30, f9, f25 + FMADD f10, f30, f10, f26 + FMADD f11, f30, f11, f27 + + FNMSUB f24, f31, f13, f20 + FMADD f25, f31, f12, f21 + FNMSUB f26, f31, f15, f22 + FMADD f27, f31, f14, f23 + + FMADD f12, f30, f12, f24 + FMADD f13, f30, f13, f25 + FMADD f14, f30, f14, f26 + FMADD f15, f30, f15, f27 + +#else + + FMUL f16, f31, f1 + FMUL f17, f31, f0 + FMUL f18, f31, f3 + FMUL f19, f31, f2 + + FMUL f20, f31, f5 + FMUL f21, f31, f4 + FMUL f22, f31, f7 + FMUL f23, f31, f6 + + FMSUB f0, f30, f0, f16 + FMADD f1, f30, f1, f17 + FMADD f2, f30, f2, f18 + FMADD f3, f30, f3, f19 + + FMSUB f4, f30, f4, f20 + FMADD f5, f30, f5, f21 + FMADD f6, f30, f6, f22 + FMADD f7, f30, f7, f23 + + FMUL f16, f31, f9 + FMUL f17, f31, f8 + FMUL f18, f31, f11 + FMUL f19, f31, f10 + + FMUL f20, f31, f13 + FMUL f21, f31, f12 + FMUL f22, f31, f15 + FMUL f23, f31, f14 + + FMSUB f8, f30, f8, f16 + FMADD f9, f30, f9, f17 + FMADD f10, f30, f10, f18 + FMADD f11, f30, f11, f19 + + FMSUB f12, f30, f12, f20 + FMADD f13, f30, f13, f21 + FMADD f14, f30, f14, f22 + FMADD f15, f30, f15, f23 +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * 
SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f10, 2 * SIZE(CO3) + STFD f11, 3 * SIZE(CO3) + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + STFD f14, 2 * SIZE(CO4) + STFD f15, 3 * SIZE(CO4) + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + addi CO3, CO3, 4 * SIZE + addi CO4, CO4, 4 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -4 +#endif + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 2 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + + addic. I, I, -1 + bgt LL(11) + .align 4 + +LL(20): + andi. I, M, 1 + ble LL(29) + +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, K, 2 + mr BO, B + mtspr CTR, r0 + ble LL(25) +#else +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 2 + ZBASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 4 +#endif + srawi. 
TEMP, TEMP, 2 + mtspr CTR, TEMP + ble LL(25) +#endif + .align 4 + +LL(22): + FMA1 f0, f16, f20, f0 + FMA4 f3, f17, f20, f3 + FMA2 f1, f16, f21, f1 + FMA3 f2, f17, f21, f2 + + LFD f28, 4 * SIZE(AO) + LFD f29, 5 * SIZE(AO) + LFD f30, 6 * SIZE(AO) + LFD f31, 7 * SIZE(AO) + + FMA1 f4, f16, f22, f4 + FMA4 f7, f17, f22, f7 + FMA2 f5, f16, f23, f5 + FMA3 f6, f17, f23, f6 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMA1 f8, f16, f24, f8 + FMA4 f11, f17, f24, f11 + FMA2 f9, f16, f25, f9 + FMA3 f10, f17, f25, f10 + + FMA1 f12, f16, f26, f12 + FMA4 f15, f17, f26, f15 + FMA2 f13, f16, f27, f13 + FMA3 f14, f17, f27, f14 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMA1 f0, f18, f20, f0 + FMA4 f3, f19, f20, f3 + FMA2 f1, f18, f21, f1 + FMA3 f2, f19, f21, f2 + + FMA1 f4, f18, f22, f4 + FMA4 f7, f19, f22, f7 + FMA2 f5, f18, f23, f5 + FMA3 f6, f19, f23, f6 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMA1 f8, f18, f24, f8 + FMA4 f11, f19, f24, f11 + FMA2 f9, f18, f25, f9 + FMA3 f10, f19, f25, f10 + + FMA1 f12, f18, f26, f12 + FMA4 f15, f19, f26, f15 + FMA2 f13, f18, f27, f13 + FMA3 f14, f19, f27, f14 + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + FMA1 f0, f28, f20, f0 + FMA4 f3, f29, f20, f3 + FMA2 f1, f28, f21, f1 + FMA3 f2, f29, f21, f2 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMA1 f4, f28, f22, f4 + FMA4 f7, f29, f22, f7 + FMA2 f5, f28, f23, f5 + FMA3 f6, f29, f23, f6 + + LFD f20, 24 * SIZE(BO) + LFD f21, 25 * SIZE(BO) + LFD f22, 26 * SIZE(BO) + LFD f23, 27 * SIZE(BO) + + FMA1 f8, f28, f24, f8 + FMA4 f11, f29, f24, f11 + FMA2 f9, f28, f25, f9 + FMA3 f10, f29, f25, f10 + + FMA1 f12, f28, f26, f12 + FMA4 f15, f29, f26, f15 + FMA2 f13, f28, f27, f13 + FMA3 f14, f29, f27, f14 + + LFD f24, 28 * SIZE(BO) + LFD f25, 29 * SIZE(BO) + LFD f26, 30 * SIZE(BO) + LFD f27, 31 * SIZE(BO) + + FMA1 f0, f30, f20, f0 + FMA4 f3, f31, f20, f3 + FMA2 f1, f30, f21, f1 + FMA3 f2, f31, f21, f2 + + FMA1 f4, f30, f22, f4 + FMA4 f7, f31, f22, f7 + FMA2 f5, f30, f23, f5 + FMA3 f6, f31, f23, f6 + + LFD f20, 32 * SIZE(BO) + LFD f21, 33 * SIZE(BO) + LFD f22, 34 * SIZE(BO) + LFD f23, 35 * SIZE(BO) + + FMA1 f8, f30, f24, f8 + FMA4 f11, f31, f24, f11 + FMA2 f9, f30, f25, f9 + FMA3 f10, f31, f25, f10 + + FMA1 f12, f30, f26, f12 + FMA4 f15, f31, f26, f15 + FMA2 f13, f30, f27, f13 + FMA3 f14, f31, f27, f14 + + LFD f24, 36 * SIZE(BO) + LFD f25, 37 * SIZE(BO) + LFD f26, 38 * SIZE(BO) + LFD f27, 39 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 32 * SIZE + + bdnz LL(22) + .align 4 + +LL(25): + lfd f30, ALPHA_R + lfd f31, ALPHA_I + +#ifndef TRMMKERNEL + andi. r0, K, 3 + mtspr CTR, r0 + ble LL(28) +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 4 +#endif + andi. 
TEMP, TEMP, 3 + mtspr CTR, TEMP + ble LL(28) +#endif + .align 4 + +LL(26): + FMA1 f0, f16, f20, f0 + FMA4 f3, f17, f20, f3 + FMA2 f1, f16, f21, f1 + FMA3 f2, f17, f21, f2 + + FMA1 f4, f16, f22, f4 + FMA4 f7, f17, f22, f7 + FMA2 f5, f16, f23, f5 + FMA3 f6, f17, f23, f6 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMA1 f8, f16, f24, f8 + FMA4 f11, f17, f24, f11 + FMA2 f9, f16, f25, f9 + FMA3 f10, f17, f25, f10 + + FMA1 f12, f16, f26, f12 + FMA4 f15, f17, f26, f15 + FMA2 f13, f16, f27, f13 + FMA3 f14, f17, f27, f14 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(26) + .align 4 + +LL(28): +#ifndef TRMMKERNEL + + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 0 * SIZE(CO2) + LFD f19, 1 * SIZE(CO2) + + FADD f0, f0, f2 + FADD f1, f1, f3 + FADD f4, f4, f6 + FADD f5, f5, f7 + + LFD f20, 0 * SIZE(CO3) + LFD f21, 1 * SIZE(CO3) + LFD f22, 0 * SIZE(CO4) + LFD f23, 1 * SIZE(CO4) + + FADD f8, f8, f10 + FADD f9, f9, f11 + FADD f12, f12, f14 + FADD f13, f13, f15 + + FNMSUB f24, f31, f1, f16 + FMADD f25, f31, f0, f17 + FNMSUB f26, f31, f5, f18 + FMADD f27, f31, f4, f19 + + FMADD f0, f30, f0, f24 + FMADD f1, f30, f1, f25 + FMADD f4, f30, f4, f26 + FMADD f5, f30, f5, f27 + + FNMSUB f24, f31, f9, f20 + FMADD f25, f31, f8, f21 + FNMSUB f26, f31, f13, f22 + FMADD f27, f31, f12, f23 + + FMADD f8, f30, f8, f24 + FMADD f9, f30, f9, f25 + FMADD f12, f30, f12, f26 + FMADD f13, f30, f13, f27 + +#else + FADD f0, f0, f2 + FADD f1, f1, f3 + FADD f4, f4, f6 + FADD f5, f5, f7 + + FMUL f16, f31, f1 + FMUL f17, f31, f0 + FMUL f18, f31, f5 + FMUL f19, f31, f4 + + FMSUB f0, f30, f0, f16 + FMADD f1, f30, f1, f17 + FMSUB f4, f30, f4, f18 + FMADD f5, f30, f5, f19 + + FADD f8, f8, f10 + FADD f9, f9, f11 + FADD f12, f12, f14 + FADD f13, f13, f15 + + FMUL f20, f31, f9 + FMUL f21, f31, f8 + FMUL f22, f31, f13 + FMUL f23, f31, f12 + + FMSUB f8, f30, f8, f20 + FMADD f9, f30, f9, f21 + FMSUB f12, f30, f12, f22 + FMADD f13, f30, f13, f23 + +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + addi CO3, CO3, 2 * SIZE + addi CO4, CO4, 2 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -1 +#else + addi TEMP, TEMP, -4 +#endif + slwi r0, TEMP, 0 + ZBASE_SHIFT + slwi TEMP, TEMP, 2 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 1 +#endif +#endif + .align 4 + +LL(29): +#if defined(TRMMKERNEL) && !defined(LEFT) + addi KK, KK, 4 +#endif + + mr B, BO + + addic. J, J, -1 + bgt LL(10) + .align 4 + +LL(30): + andi. J, N, 2 + ble LL(50) + + mr CO1, C + add CO2, C, LDC + add C, CO2, LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + slwi BB, K, ZBASE_SHIFT + 1 + mr AO, A + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + srawi. 
I, M, 1 + ble LL(40) + .align 4 + +LL(31): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + dcbtst CO1, PREC + dcbtst CO2, PREC + + srawi. r0, K, 3 + mr BO, B + mtspr CTR, r0 + ble LL(35) +#else +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 1 + ZBASE_SHIFT + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + +#endif + + dcbtst CO1, PREC + dcbtst CO2, PREC + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + srawi. TEMP, TEMP, 3 + mtspr CTR, TEMP + ble LL(35) +#endif + .align 4 + +LL(32): + dcbt AO, PREA + dcbtst BO, PREA + + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + LFD f28, 4 * SIZE(AO) + LFD f29, 5 * SIZE(AO) + LFD f30, 6 * SIZE(AO) + LFD f31, 7 * SIZE(AO) + + FMA1 f4, f16, f22, f4 + FMA1 f6, f18, f22, f6 + FMA2 f5, f16, f23, f5 + FMA2 f7, f18, f23, f7 + + FMA4 f9, f17, f20, f9 + FMA4 f11, f19, f20, f11 + FMA3 f8, f17, f21, f8 + FMA3 f10, f19, f21, f10 + + FMA4 f13, f17, f22, f13 + FMA4 f15, f19, f22, f15 + FMA3 f12, f17, f23, f12 + FMA3 f14, f19, f23, f14 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMA1 f0, f28, f24, f0 + FMA1 f2, f30, f24, f2 + FMA2 f1, f28, f25, f1 + FMA2 f3, f30, f25, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMA1 f4, f28, f26, f4 + FMA1 f6, f30, f26, f6 + FMA2 f5, f28, f27, f5 + FMA2 f7, f30, f27, f7 + + FMA4 f9, f29, f24, f9 + FMA4 f11, f31, f24, f11 + FMA3 f8, f29, f25, f8 + FMA3 f10, f31, f25, f10 + + FMA4 f13, f29, f26, f13 + FMA4 f15, f31, f26, f15 + FMA3 f12, f29, f27, f12 + FMA3 f14, f31, f27, f14 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + LFD f28, 12 * SIZE(AO) + LFD f29, 13 * SIZE(AO) + LFD f30, 14 * SIZE(AO) + LFD f31, 15 * SIZE(AO) + + FMA1 f4, f16, f22, f4 + FMA1 f6, f18, f22, f6 + FMA2 f5, f16, f23, f5 + FMA2 f7, f18, f23, f7 + + FMA4 f9, f17, f20, f9 + FMA4 f11, f19, f20, f11 + FMA3 f8, f17, f21, f8 + FMA3 f10, f19, f21, f10 + + FMA4 f13, f17, f22, f13 + FMA4 f15, f19, f22, f15 + FMA3 f12, f17, f23, f12 + FMA3 f14, f19, f23, f14 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMA1 f0, f28, f24, f0 + FMA1 f2, f30, f24, f2 + FMA2 f1, f28, f25, f1 + FMA2 f3, f30, f25, f3 + + LFD f16, 16 
* SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + FMA1 f4, f28, f26, f4 + FMA1 f6, f30, f26, f6 + FMA2 f5, f28, f27, f5 + FMA2 f7, f30, f27, f7 + + FMA4 f9, f29, f24, f9 + FMA4 f11, f31, f24, f11 + FMA3 f8, f29, f25, f8 + FMA3 f10, f31, f25, f10 + + FMA4 f13, f29, f26, f13 + FMA4 f15, f31, f26, f15 + FMA3 f12, f29, f27, f12 + FMA3 f14, f31, f27, f14 + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + LFD f28, 20 * SIZE(AO) + LFD f29, 21 * SIZE(AO) + LFD f30, 22 * SIZE(AO) + LFD f31, 23 * SIZE(AO) + + FMA1 f4, f16, f22, f4 + FMA1 f6, f18, f22, f6 + FMA2 f5, f16, f23, f5 + FMA2 f7, f18, f23, f7 + + FMA4 f9, f17, f20, f9 + FMA4 f11, f19, f20, f11 + FMA3 f8, f17, f21, f8 + FMA3 f10, f19, f21, f10 + + FMA4 f13, f17, f22, f13 + FMA4 f15, f19, f22, f15 + FMA3 f12, f17, f23, f12 + FMA3 f14, f19, f23, f14 + + LFD f20, 24 * SIZE(BO) + LFD f21, 25 * SIZE(BO) + LFD f22, 26 * SIZE(BO) + LFD f23, 27 * SIZE(BO) + + FMA1 f0, f28, f24, f0 + FMA1 f2, f30, f24, f2 + FMA2 f1, f28, f25, f1 + FMA2 f3, f30, f25, f3 + + LFD f16, 24 * SIZE(AO) + LFD f17, 25 * SIZE(AO) + LFD f18, 26 * SIZE(AO) + LFD f19, 27 * SIZE(AO) + + FMA1 f4, f28, f26, f4 + FMA1 f6, f30, f26, f6 + FMA2 f5, f28, f27, f5 + FMA2 f7, f30, f27, f7 + + FMA4 f9, f29, f24, f9 + FMA4 f11, f31, f24, f11 + FMA3 f8, f29, f25, f8 + FMA3 f10, f31, f25, f10 + + FMA4 f13, f29, f26, f13 + FMA4 f15, f31, f26, f15 + FMA3 f12, f29, f27, f12 + FMA3 f14, f31, f27, f14 + + LFD f24, 28 * SIZE(BO) + LFD f25, 29 * SIZE(BO) + LFD f26, 30 * SIZE(BO) + LFD f27, 31 * SIZE(BO) + + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + LFD f28, 28 * SIZE(AO) + LFD f29, 29 * SIZE(AO) + LFD f30, 30 * SIZE(AO) + LFD f31, 31 * SIZE(AO) + + FMA1 f4, f16, f22, f4 + FMA1 f6, f18, f22, f6 + FMA2 f5, f16, f23, f5 + FMA2 f7, f18, f23, f7 + + FMA4 f9, f17, f20, f9 + FMA4 f11, f19, f20, f11 + FMA3 f8, f17, f21, f8 + FMA3 f10, f19, f21, f10 + + FMA4 f13, f17, f22, f13 + FMA4 f15, f19, f22, f15 + FMA3 f12, f17, f23, f12 + FMA3 f14, f19, f23, f14 + + LFD f20, 32 * SIZE(BO) + LFD f21, 33 * SIZE(BO) + LFD f22, 34 * SIZE(BO) + LFD f23, 35 * SIZE(BO) + + FMA1 f0, f28, f24, f0 + FMA1 f2, f30, f24, f2 + FMA2 f1, f28, f25, f1 + FMA2 f3, f30, f25, f3 + + LFD f16, 32 * SIZE(AO) + LFD f17, 33 * SIZE(AO) + LFD f18, 34 * SIZE(AO) + LFD f19, 35 * SIZE(AO) + + FMA1 f4, f28, f26, f4 + FMA1 f6, f30, f26, f6 + FMA2 f5, f28, f27, f5 + FMA2 f7, f30, f27, f7 + + FMA4 f9, f29, f24, f9 + FMA4 f11, f31, f24, f11 + FMA3 f8, f29, f25, f8 + FMA3 f10, f31, f25, f10 + + FMA4 f13, f29, f26, f13 + FMA4 f15, f31, f26, f15 + FMA3 f12, f29, f27, f12 + FMA3 f14, f31, f27, f14 + + LFD f24, 36 * SIZE(BO) + LFD f25, 37 * SIZE(BO) + LFD f26, 38 * SIZE(BO) + LFD f27, 39 * SIZE(BO) + + addi AO, AO, 32 * SIZE + addi BO, BO, 32 * SIZE + + bdnz LL(32) + .align 4 + +LL(35): + lfd f30, ALPHA_R + lfd f31, ALPHA_I + + dcbtst B, BB + addi BB, BB, 16 * SIZE + +#ifndef TRMMKERNEL + andi. r0, K, 7 + mtspr CTR, r0 + ble LL(38) +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + andi. 
TEMP, TEMP, 7 + mtspr CTR, TEMP + ble LL(38) +#endif + .align 4 + +LL(36): + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + FMA1 f4, f16, f22, f4 + FMA1 f6, f18, f22, f6 + FMA2 f5, f16, f23, f5 + FMA2 f7, f18, f23, f7 + + FMA4 f9, f17, f20, f9 + FMA4 f11, f19, f20, f11 + FMA3 f8, f17, f21, f8 + FMA3 f10, f19, f21, f10 + + LFD f16, 4 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + + FMA4 f13, f17, f22, f13 + FMA4 f15, f19, f22, f15 + FMA3 f12, f17, f23, f12 + FMA3 f14, f19, f23, f14 + + LFD f17, 5 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 4 * SIZE + bdnz LL(36) + .align 4 + +LL(38): +#ifndef TRMMKERNEL + + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) + + FADD f0, f0, f8 + FADD f1, f1, f9 + FADD f2, f2, f10 + FADD f3, f3, f11 + + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) + + FADD f4, f4, f12 + FADD f5, f5, f13 + FADD f6, f6, f14 + FADD f7, f7, f15 + + FNMSUB f24, f31, f1, f16 + FMADD f25, f31, f0, f17 + FNMSUB f26, f31, f3, f18 + FMADD f27, f31, f2, f19 + + FMADD f0, f30, f0, f24 + FMADD f1, f30, f1, f25 + FMADD f2, f30, f2, f26 + FMADD f3, f30, f3, f27 + + FNMSUB f24, f31, f5, f20 + FMADD f25, f31, f4, f21 + FNMSUB f26, f31, f7, f22 + FMADD f27, f31, f6, f23 + + FMADD f4, f30, f4, f24 + FMADD f5, f30, f5, f25 + FMADD f6, f30, f6, f26 + FMADD f7, f30, f7, f27 + +#else + FADD f0, f0, f8 + FADD f1, f1, f9 + FADD f2, f2, f10 + FADD f3, f3, f11 + + FADD f4, f4, f12 + FADD f5, f5, f13 + FADD f6, f6, f14 + FADD f7, f7, f15 + + FMUL f16, f31, f1 + FMUL f17, f31, f0 + FMUL f18, f31, f3 + FMUL f19, f31, f2 + + FMUL f20, f31, f5 + FMUL f21, f31, f4 + FMUL f22, f31, f7 + FMUL f23, f31, f6 + + FMSUB f0, f30, f0, f16 + FMADD f1, f30, f1, f17 + FMADD f2, f30, f2, f18 + FMADD f3, f30, f3, f19 + + FMSUB f4, f30, f4, f20 + FMADD f5, f30, f5, f21 + FMADD f6, f30, f6, f22 + FMADD f7, f30, f7, f23 + +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + + addic. I, I, -1 + bgt LL(31) + .align 4 + +LL(40): + andi. I, M, 1 + ble LL(49) + +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. 
r0, K, 2 + mr BO, B + mtspr CTR, r0 + ble LL(45) +#else +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + ble LL(45) +#endif + .align 4 + +LL(42): + FMA1 f0, f16, f20, f0 + FMA4 f3, f17, f20, f3 + FMA2 f1, f16, f21, f1 + FMA3 f2, f17, f21, f2 + + FMA1 f4, f16, f22, f4 + FMA4 f7, f17, f22, f7 + FMA2 f5, f16, f23, f5 + FMA3 f6, f17, f23, f6 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMA1 f0, f16, f20, f0 + FMA4 f3, f17, f20, f3 + FMA2 f1, f16, f21, f1 + FMA3 f2, f17, f21, f2 + + FMA1 f4, f16, f22, f4 + FMA4 f7, f17, f22, f7 + FMA2 f5, f16, f23, f5 + FMA3 f6, f17, f23, f6 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMA1 f0, f16, f20, f0 + FMA4 f3, f17, f20, f3 + FMA2 f1, f16, f21, f1 + FMA3 f2, f17, f21, f2 + + FMA1 f4, f16, f22, f4 + FMA4 f7, f17, f22, f7 + FMA2 f5, f16, f23, f5 + FMA3 f6, f17, f23, f6 + + LFD f16, 6 * SIZE(AO) + LFD f17, 7 * SIZE(AO) + + LFD f20, 12 * SIZE(BO) + LFD f21, 13 * SIZE(BO) + LFD f22, 14 * SIZE(BO) + LFD f23, 15 * SIZE(BO) + + FMA1 f0, f16, f20, f0 + FMA4 f3, f17, f20, f3 + FMA2 f1, f16, f21, f1 + FMA3 f2, f17, f21, f2 + + FMA1 f4, f16, f22, f4 + FMA4 f7, f17, f22, f7 + FMA2 f5, f16, f23, f5 + FMA3 f6, f17, f23, f6 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 16 * SIZE + + bdnz LL(42) + .align 4 + +LL(45): + lfd f30, ALPHA_R + lfd f31, ALPHA_I + +#ifndef TRMMKERNEL + andi. r0, K, 3 + mtspr CTR, r0 + ble LL(48) +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + andi. 
TEMP, TEMP, 3 + mtspr CTR, TEMP + ble LL(48) +#endif + .align 4 + +LL(46): + FMA1 f0, f16, f20, f0 + FMA4 f3, f17, f20, f3 + FMA2 f1, f16, f21, f1 + FMA3 f2, f17, f21, f2 + + FMA1 f4, f16, f22, f4 + FMA4 f7, f17, f22, f7 + FMA2 f5, f16, f23, f5 + FMA3 f6, f17, f23, f6 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 4 * SIZE + bdnz LL(46) + .align 4 + +LL(48): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + + FADD f0, f0, f2 + FADD f1, f1, f3 + FADD f4, f4, f6 + FADD f5, f5, f7 + + FNMSUB f24, f31, f1, f16 + FMADD f25, f31, f0, f17 + FNMSUB f26, f31, f5, f20 + FMADD f27, f31, f4, f21 + + FMADD f0, f30, f0, f24 + FMADD f1, f30, f1, f25 + FMADD f4, f30, f4, f26 + FMADD f5, f30, f5, f27 + +#else + FADD f0, f0, f2 + FADD f1, f1, f3 + FADD f4, f4, f6 + FADD f5, f5, f7 + + FMUL f16, f31, f1 + FMUL f17, f31, f0 + FMUL f18, f31, f5 + FMUL f19, f31, f4 + + FMSUB f0, f30, f0, f16 + FMADD f1, f30, f1, f17 + FMSUB f4, f30, f4, f18 + FMADD f5, f30, f5, f19 + +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -1 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 0 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 1 +#endif +#endif + .align 4 + +LL(49): +#if defined(TRMMKERNEL) && !defined(LEFT) + addi KK, KK, 2 +#endif + + mr B, BO + .align 4 + +LL(50): + andi. J, N, 1 + ble LL(999) + + mr CO1, C + add C, CO1, LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + mr AO, A + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + srawi. I, M, 1 + ble LL(60) + .align 4 + +LL(51): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + dcbtst CO1, PREC + + srawi. r0, K, 3 + mr BO, B + mtspr CTR, r0 + ble LL(55) +#else +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 1 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) +#endif + + dcbtst CO1, PREC + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + srawi. 
TEMP, TEMP, 3 + mtspr CTR, TEMP + ble LL(55) +#endif + .align 4 + +LL(52): + dcbt AO, PREA + dcbtst BO, PREA + + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + FMA4 f9, f17, f20, f9 + FMA4 f11, f19, f20, f11 + FMA3 f8, f17, f21, f8 + FMA3 f10, f19, f21, f10 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMA1 f0, f16, f22, f0 + FMA1 f2, f18, f22, f2 + FMA2 f1, f16, f23, f1 + FMA2 f3, f18, f23, f3 + + FMA4 f9, f17, f22, f9 + FMA4 f11, f19, f22, f11 + FMA3 f8, f17, f23, f8 + FMA3 f10, f19, f23, f10 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + FMA4 f9, f17, f20, f9 + FMA4 f11, f19, f20, f11 + FMA3 f8, f17, f21, f8 + FMA3 f10, f19, f21, f10 + + LFD f16, 12 * SIZE(AO) + LFD f17, 13 * SIZE(AO) + LFD f18, 14 * SIZE(AO) + LFD f19, 15 * SIZE(AO) + + FMA1 f0, f16, f22, f0 + FMA1 f2, f18, f22, f2 + FMA2 f1, f16, f23, f1 + FMA2 f3, f18, f23, f3 + + FMA4 f9, f17, f22, f9 + FMA4 f11, f19, f22, f11 + FMA3 f8, f17, f23, f8 + FMA3 f10, f19, f23, f10 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + FMA4 f9, f17, f20, f9 + FMA4 f11, f19, f20, f11 + FMA3 f8, f17, f21, f8 + FMA3 f10, f19, f21, f10 + + LFD f16, 20 * SIZE(AO) + LFD f17, 21 * SIZE(AO) + LFD f18, 22 * SIZE(AO) + LFD f19, 23 * SIZE(AO) + + FMA1 f0, f16, f22, f0 + FMA1 f2, f18, f22, f2 + FMA2 f1, f16, f23, f1 + FMA2 f3, f18, f23, f3 + + FMA4 f9, f17, f22, f9 + FMA4 f11, f19, f22, f11 + FMA3 f8, f17, f23, f8 + FMA3 f10, f19, f23, f10 + + LFD f16, 24 * SIZE(AO) + LFD f17, 25 * SIZE(AO) + LFD f18, 26 * SIZE(AO) + LFD f19, 27 * SIZE(AO) + + LFD f20, 12 * SIZE(BO) + LFD f21, 13 * SIZE(BO) + LFD f22, 14 * SIZE(BO) + LFD f23, 15 * SIZE(BO) + + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + FMA4 f9, f17, f20, f9 + FMA4 f11, f19, f20, f11 + FMA3 f8, f17, f21, f8 + FMA3 f10, f19, f21, f10 + + LFD f16, 28 * SIZE(AO) + LFD f17, 29 * SIZE(AO) + LFD f18, 30 * SIZE(AO) + LFD f19, 31 * SIZE(AO) + + FMA1 f0, f16, f22, f0 + FMA1 f2, f18, f22, f2 + FMA2 f1, f16, f23, f1 + FMA2 f3, f18, f23, f3 + + FMA4 f9, f17, f22, f9 + FMA4 f11, f19, f22, f11 + FMA3 f8, f17, f23, f8 + FMA3 f10, f19, f23, f10 + + LFD f16, 32 * SIZE(AO) + LFD f17, 33 * SIZE(AO) + LFD f18, 34 * SIZE(AO) + LFD f19, 35 * SIZE(AO) + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + addi AO, AO, 32 * SIZE + addi BO, BO, 16 * SIZE + + bdnz LL(52) + .align 4 + +LL(55): + lfd f30, ALPHA_R + lfd f31, ALPHA_I + +#ifndef TRMMKERNEL + andi. r0, K, 7 + mtspr CTR, r0 + ble LL(58) +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + andi. 
TEMP, TEMP, 7 + mtspr CTR, TEMP + ble LL(58) +#endif + .align 4 + +LL(56): + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + LFD f16, 4 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + + FMA4 f9, f17, f20, f9 + FMA4 f11, f19, f20, f11 + FMA3 f8, f17, f21, f8 + FMA3 f10, f19, f21, f10 + + LFD f17, 5 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 2 * SIZE + bdnz LL(56) + .align 4 + +LL(58): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) + + FADD f0, f0, f8 + FADD f1, f1, f9 + FADD f2, f2, f10 + FADD f3, f3, f11 + + FNMSUB f24, f31, f1, f16 + FMADD f25, f31, f0, f17 + FNMSUB f26, f31, f3, f18 + FMADD f27, f31, f2, f19 + + FMADD f0, f30, f0, f24 + FMADD f1, f30, f1, f25 + FMADD f2, f30, f2, f26 + FMADD f3, f30, f3, f27 + +#else + FADD f0, f0, f8 + FADD f1, f1, f9 + FADD f2, f2, f10 + FADD f3, f3, f11 + + FMUL f16, f31, f1 + FMUL f17, f31, f0 + FMUL f18, f31, f3 + FMUL f19, f31, f2 + + FMSUB f0, f30, f0, f16 + FMADD f1, f30, f1, f17 + FMADD f2, f30, f2, f18 + FMADD f3, f30, f3, f19 + +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + addi CO1, CO1, 4 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -1 +#endif + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + + addic. I, I, -1 + bgt LL(51) + .align 4 + +LL(60): + andi. I, M, 1 + ble LL(999) + +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + srawi. r0, K, 2 + mr BO, B + mtspr CTR, r0 + ble LL(65) +#else +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + srawi. 
TEMP, TEMP, 2 + mtspr CTR, TEMP + ble LL(65) +#endif + .align 4 + +LL(62): + FMA1 f0, f16, f20, f0 + FMA4 f3, f17, f20, f3 + FMA2 f1, f16, f21, f1 + FMA3 f2, f17, f21, f2 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + + FMA1 f0, f18, f22, f0 + FMA4 f3, f19, f22, f3 + FMA2 f1, f18, f23, f1 + FMA3 f2, f19, f23, f2 + + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMA1 f0, f16, f20, f0 + FMA4 f3, f17, f20, f3 + FMA2 f1, f16, f21, f1 + FMA3 f2, f17, f21, f2 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + + FMA1 f0, f18, f22, f0 + FMA4 f3, f19, f22, f3 + FMA2 f1, f18, f23, f1 + FMA3 f2, f19, f23, f2 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(62) + .align 4 + +LL(65): + lfd f30, ALPHA_R + lfd f31, ALPHA_I + +#ifndef TRMMKERNEL + andi. r0, K, 3 + mtspr CTR, r0 + ble LL(68) +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + andi. TEMP, TEMP, 3 + mtspr CTR, TEMP + ble LL(68) +#endif + .align 4 + +LL(66): + FMA1 f0, f16, f20, f0 + FMA4 f3, f17, f20, f3 + LFD f20, 2 * SIZE(BO) + FMA2 f1, f16, f21, f1 + LFD f16, 2 * SIZE(AO) + FMA3 f2, f17, f21, f2 + LFD f17, 3 * SIZE(AO) + + LFD f21, 3 * SIZE(BO) + addi AO, AO, 2 * SIZE + addi BO, BO, 2 * SIZE + bdnz LL(66) + .align 4 + +LL(68): +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + + FADD f0, f0, f2 + FADD f1, f1, f3 + + FNMSUB f24, f31, f1, f16 + FMADD f25, f31, f0, f17 + + FMADD f0, f30, f0, f24 + FMADD f1, f30, f1, f25 + +#else + + FADD f0, f0, f2 + FADD f1, f1, f3 + + FMUL f16, f31, f1 + FMUL f17, f31, f0 + + FMSUB f0, f30, f0, f16 + FMADD f1, f30, f1, f17 + +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + + addi CO1, CO1, 2 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -1 +#else + addi TEMP, TEMP, -1 +#endif + slwi r0, TEMP, 0 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 1 +#endif +#endif + .align 4 + +LL(999): + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) +#ifdef TRMMKERNEL + ld r20, 232(SP) + ld r19, 240(SP) +#endif +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) +#ifdef TRMMKERNEL + lwz r20, 188(SP) + lwz r19, 192(SP) +#endif +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/zgemm_kernel_ppc440.S b/kernel/power/zgemm_kernel_ppc440.S new file mode 100644 index 0000000000..2a80c97f88 --- /dev/null +++ 
b/kernel/power/zgemm_kernel_ppc440.S @@ -0,0 +1,1700 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA_R 296(SP) +#define ALPHA_I 304(SP) +#define FZERO 312(SP) +#else +#define STACKSIZE 256 +#define ALPHA_R 224(SP) +#define ALPHA_I 232(SP) +#define FZERO 240(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#define OFFSET r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#define TEMP r22 +#define KK r23 +#define I r24 +#define J r25 +#define AO r26 +#define BO r27 +#define CO1 r28 +#define CO2 r29 + +#define A1 f16 +#define A2 f17 +#define A3 f18 +#define A4 f19 +#define A5 f20 +#define A6 f21 +#define B1 f22 +#define B2 f23 +#define B3 f24 +#define B4 f25 +#define B5 f26 +#define B6 f27 +#define B7 f28 +#define B8 f29 +#define B9 f30 +#define B10 f31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) +#ifdef TRMMKERNEL + std r23, 208(SP) + std r22, 216(SP) +#endif +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) +#ifdef TRMMKERNEL + stw r23, 176(SP) + stw r22, 180(SP) +#endif +#endif + + stfd f1, ALPHA_R + stfd f2, ALPHA_I + stw r0, FZERO + +#ifdef linux +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, 56 + STACKSIZE(SP) + lwz C, 60 + STACKSIZE(SP) + lwz LDC, 64 + STACKSIZE(SP) +#else + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#ifdef TRMMKERNEL +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 120 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 120 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 68 + STACKSIZE(SP) +#else + lwz OFFSET, 60 + STACKSIZE(SP) +#endif +#endif +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif +#endif + + slwi LDC, LDC, ZBASE_SHIFT + + cmpwi cr0, M, 0 + ble .L999 + cmpwi cr0, N, 0 + ble .L999 + cmpwi cr0, K, 0 + ble .L999 + + lfs f0, FZERO + + srawi. J, N, 1 + ble .L30 + .align 4 + +.L10: + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + mr CO1, C + add CO2, C, LDC + add C, CO2, LDC + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + srawi. 
I, M, 1 + mr AO, A + ble .L20 + .align 4 + +.L11: +#ifndef TRMMKERNEL + LFD A1, 0 * SIZE(AO) ### + LFD A2, 1 * SIZE(AO) + LFD A4, 4 * SIZE(AO) ### + LFD A5, 8 * SIZE(AO) ### + + LFD B1, 0 * SIZE(B) ### + LFD B2, 1 * SIZE(B) + LFD B3, 2 * SIZE(B) + LFD B4, 3 * SIZE(B) + LFD B5, 4 * SIZE(B) ### + LFD B6, 8 * SIZE(B) ### + LFD B7, 12 * SIZE(B) ### + + srawi. r0, K, 2 + mr BO, B + mtspr CTR, r0 + ble .L15 +#else +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + LFD A1, 0 * SIZE(AO) ### + LFD A2, 1 * SIZE(AO) + LFD A4, 4 * SIZE(AO) ### + LFD A5, 8 * SIZE(AO) ### + + LFD B1, 0 * SIZE(B) ### + LFD B2, 1 * SIZE(B) + LFD B3, 2 * SIZE(B) + LFD B4, 3 * SIZE(B) + LFD B5, 4 * SIZE(B) ### + LFD B6, 8 * SIZE(B) ### + LFD B7, 12 * SIZE(B) ### + mr BO, B +#else + slwi r0, KK, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, B, r0 + + LFD A1, 0 * SIZE(AO) ### + LFD A2, 1 * SIZE(AO) + LFD A4, 4 * SIZE(AO) ### + LFD A5, 8 * SIZE(AO) ### + + LFD B1, 0 * SIZE(BO) ### + LFD B2, 1 * SIZE(BO) + LFD B3, 2 * SIZE(BO) + LFD B4, 3 * SIZE(BO) + LFD B5, 4 * SIZE(BO) ### + LFD B6, 8 * SIZE(BO) ### + LFD B7, 12 * SIZE(BO) ### +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + ble .L15 +#endif + .align 4 + +.L12: + FMADD f0, A1, B1, f0 + LFD A3, 2 * SIZE(AO) + FMADD f4, A1, B2, f4 + LFD A6, 12 * SIZE(AO) ### + FMADD f8, A1, B3, f8 + nop + FMADD f12, A1, B4, f12 + nop + + FMADD f1, A2, B1, f1 + LFD A1, 3 * SIZE(AO) + FMADD f5, A2, B2, f5 + nop + FMADD f9, A2, B3, f9 + nop + FMADD f13, A2, B4, f13 + nop + + FMADD f2, A3, B1, f2 + nop + FMADD f6, A3, B2, f6 + LFD B8, 5 * SIZE(BO) + FMADD f10, A3, B3, f10 + LFD B9, 6 * SIZE(BO) + FMADD f14, A3, B4, f14 + LFD B10, 7 * SIZE(BO) + + FMADD f3, A1, B1, f3 + LFD A2, 5 * SIZE(AO) + FMADD f7, A1, B2, f7 + LFD B1, 16 * SIZE(BO) ### + FMADD f11, A1, B3, f11 + nop + FMADD f15, A1, B4, f15 + nop + +############ + + FMADD f0, A4, B5, f0 + LFD A3, 6 * SIZE(AO) + FMADD f4, A4, B8, f4 + LFD A1, 16 * SIZE(AO) ### + FMADD f8, A4, B9, f8 + nop + FMADD f12, A4, B10, f12 + nop + + FMADD f1, A2, B5, f1 + LFD A4, 7 * SIZE(AO) + FMADD f5, A2, B8, f5 + nop + FMADD f9, A2, B9, f9 + nop + FMADD f13, A2, B10, f13 + nop + + FMADD f2, A3, B5, f2 + nop + FMADD f6, A3, B8, f6 + LFD B2, 9 * SIZE(BO) + FMADD f10, A3, B9, f10 + LFD B3, 10 * SIZE(BO) + FMADD f14, A3, B10, f14 + LFD B4, 11 * SIZE(BO) + + FMADD f3, A4, B5, f3 + LFD A2, 9 * SIZE(AO) + FMADD f7, A4, B8, f7 + LFD B5, 20 * SIZE(BO) ### + FMADD f11, A4, B9, f11 + nop + FMADD f15, A4, B10, f15 + nop + +############ + + FMADD f0, A5, B6, f0 + LFD A3, 10 * SIZE(AO) + FMADD f4, A5, B2, f4 + LFD A4, 20 * SIZE(AO) ### + FMADD f8, A5, B3, f8 + nop + FMADD f12, A5, B4, f12 + nop + + FMADD f1, A2, B6, f1 + LFD A5, 11 * SIZE(AO) + FMADD f5, A2, B2, f5 + nop + FMADD f9, A2, B3, f9 + nop + FMADD f13, A2, B4, f13 + nop + + FMADD f2, A3, B6, f2 + nop + FMADD f6, A3, B2, f6 + LFD B8, 13 * SIZE(BO) + FMADD f10, A3, B3, f10 + LFD B9, 14 * SIZE(BO) + FMADD f14, A3, B4, f14 + LFD B10,15 * SIZE(BO) + + FMADD f3, A5, B6, f3 + LFD A2, 13 * SIZE(AO) + FMADD f7, A5, B2, f7 + LFD B6, 24 * SIZE(BO) ### + FMADD f11, A5, B3, f11 + nop + FMADD f15, A5, B4, f15 + nop + +############ + + FMADD f0, A6, B7, f0 + LFD A3, 14 * SIZE(AO) + FMADD f4, A6, B8, f4 + LFD A5, 24 * SIZE(AO) ### + FMADD f8, A6, B9, f8 + nop + FMADD f12, A6, B10, f12 
+ nop + + FMADD f1, A2, B7, f1 + LFD A6, 15 * SIZE(AO) + FMADD f5, A2, B8, f5 + nop + FMADD f9, A2, B9, f9 + nop + FMADD f13, A2, B10, f13 + nop + + FMADD f2, A3, B7, f2 + addi AO, AO, 16 * SIZE + FMADD f6, A3, B8, f6 + LFD B2, 17 * SIZE(BO) + FMADD f10, A3, B9, f10 + LFD B3, 18 * SIZE(BO) + FMADD f14, A3, B10, f14 + LFD B4, 19 * SIZE(BO) + + FMADD f3, A6, B7, f3 + LFD A2, 1 * SIZE(AO) + FMADD f7, A6, B8, f7 + LFD B7, 28 * SIZE(BO) ### + FMADD f11, A6, B9, f11 + addi BO, BO, 16 * SIZE + FMADD f15, A6, B10, f15 + bdnz .L12 + .align 4 + +.L15: +#ifndef TRMMKERNEL + andi. r0, K, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR, r0 + ble .LKERNEL_MainFinish +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 2 +#endif + andi. TEMP, TEMP, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR, TEMP + ble .LKERNEL_MainFinish +#endif + .align 4 + +.L16: + FMADD f0, A1, B1, f0 + LFD A3, 2 * SIZE(AO) + FMADD f4, A1, B2, f4 + FMADD f8, A1, B3, f8 + FMADD f12, A1, B4, f12 + LFD A4, 3 * SIZE(AO) + + FMADD f1, A2, B1, f1 + FMADD f5, A2, B2, f5 + FMADD f9, A2, B3, f9 + FMADD f13, A2, B4, f13 + LFDU A1, 4 * SIZE(AO) + + FMADD f2, A3, B1, f2 + FMADD f6, A3, B2, f6 + FMADD f10, A3, B3, f10 + FMADD f14, A3, B4, f14 + LFD A2, 1 * SIZE(AO) + + FMADD f3, A4, B1, f3 + LFDU B1, 4 * SIZE(BO) + FMADD f7, A4, B2, f7 + LFD B2, 1 * SIZE(BO) + FMADD f11, A4, B3, f11 + LFD B3, 2 * SIZE(BO) + FMADD f15, A4, B4, f15 + LFD B4, 3 * SIZE(BO) + bdnz .L16 + .align 4 + +.LKERNEL_MainFinish: +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(CC) || defined(CR) || defined(RC) || defined(RR) + + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 + +#ifndef TRMMKERNEL + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) +#endif + + FSUB f8, f8, f13 + FADD f9, f9, f12 + FSUB f10, f10, f15 + FADD f11, f11, f14 + +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + + FADD f0, f0, f5 + FSUB f1, f1, f4 + FADD f2, f2, f7 + FSUB f3, f3, f6 + +#ifndef TRMMKERNEL + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) +#endif + + FADD f8, f8, f13 + FSUB f9, f9, f12 + FADD f10, f10, f15 + FSUB f11, f11, f14 + +#else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */ + + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 + +#ifndef TRMMKERNEL + LFD f20, 0 * SIZE(CO2) + LFD f21, 1 * SIZE(CO2) + LFD f22, 2 * SIZE(CO2) + LFD f23, 3 * SIZE(CO2) +#endif + + FADD f8, f8, f13 + FSUB f9, f12, f9 + FADD f10, f10, f15 + FSUB f11, f14, f11 + +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FMADD f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FMADD f19, f30, f3, f19 + + FMADD f20, f30, f8, f20 + FMADD f21, f30, f9, f21 + FMADD f22, f30, f10, f22 + FMADD f23, f30, f11, f23 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f1 + FMUL f18, f30, f2 + FMUL f19, f30, f3 + + FMUL f20, f30, f8 + FMUL f21, f30, f9 + FMUL f22, f30, f10 + FMUL f23, f30, f11 +#endif + + FNMSUB f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FNMSUB f18, f31, f3, f18 + FMADD f19, f31, f2, f19 + + FNMSUB f20, f31, f9, f20 + FMADD f21, f31, f8, f21 + FNMSUB f22, f31, f11, f22 + FMADD f23, f31, f10, f23 + 
+#else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ + /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ + /* defined(RC)|| defined(RR) */ + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FNMSUB f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FNMSUB f19, f30, f3, f19 + + FMADD f20, f30, f8, f20 + FNMSUB f21, f30, f9, f21 + FMADD f22, f30, f10, f22 + FNMSUB f23, f30, f11, f23 + + FMADD f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FMADD f18, f31, f3, f18 + FMADD f19, f31, f2, f19 + + FMADD f20, f31, f9, f20 + FMADD f21, f31, f8, f21 + FMADD f22, f31, f11, f22 + FMADD f23, f31, f10, f23 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f1 + FMUL f18, f30, f2 + FMUL f19, f30, f3 + + FMUL f20, f30, f8 + FMUL f21, f30, f9 + FMUL f22, f30, f10 + FMUL f23, f30, f11 + + FMADD f16, f31, f1, f16 + FNMADD f17, f31, f0, f17 + FMADD f18, f31, f3, f18 + FNMADD f19, f31, f2, f19 + + FMADD f20, f31, f9, f20 + FNMADD f21, f31, f8, f21 + FMADD f22, f31, f11, f22 + FNMADD f23, f31, f10, f23 +#endif +#endif + + STFD f16, 0 * SIZE(CO1) + STFD f17, 1 * SIZE(CO1) + STFD f18, 2 * SIZE(CO1) + STFD f19, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + STFD f20, 0 * SIZE(CO2) + STFD f21, 1 * SIZE(CO2) + STFD f22, 2 * SIZE(CO2) + STFD f23, 3 * SIZE(CO2) + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -2 +#endif + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + + addic. I, I, -1 + bgt .L11 + .align 4 + +.L20: + andi. I, M, 1 + ble .L29 + +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, K, 2 + mr BO, B + mtspr CTR, r0 + ble .L25 +#else +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + mr BO, B +#else + slwi r0, KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + srawi. 
TEMP, TEMP, 2 + mtspr CTR, TEMP + ble .L25 +#endif + .align 4 + +.L22: + fmadd f0, f16, f20, f0 + LFD f27, 7 * SIZE(BO) + fmadd f1, f16, f21, f1 + LFD f19, 3 * SIZE(AO) + fmadd f2, f16, f22, f2 + nop + fmadd f3, f16, f23, f3 + LFD f16, 4 * SIZE(AO) + + fmadd f4, f17, f20, f4 + LFD f20, 8 * SIZE(BO) + fmadd f5, f17, f21, f5 + LFD f21, 9 * SIZE(BO) + fmadd f6, f17, f22, f6 + LFD f22, 10 * SIZE(BO) + fmadd f7, f17, f23, f7 + LFD f23, 11 * SIZE(BO) + + fmadd f0, f18, f24, f0 + LFD f17, 5 * SIZE(AO) + fmadd f1, f18, f25, f1 + nop + fmadd f2, f18, f26, f2 + nop + fmadd f3, f18, f27, f3 + LFD f18, 6 * SIZE(AO) + + fmadd f4, f19, f24, f4 + LFD f24, 12 * SIZE(BO) + fmadd f5, f19, f25, f5 + LFD f25, 13 * SIZE(BO) + fmadd f6, f19, f26, f6 + LFD f26, 14 * SIZE(BO) + fmadd f7, f19, f27, f7 + LFD f27, 15 * SIZE(BO) + + fmadd f0, f16, f20, f0 + LFD f19, 7 * SIZE(AO) + fmadd f1, f16, f21, f1 + nop + fmadd f2, f16, f22, f2 + nop + fmadd f3, f16, f23, f3 + LFDU f16, 8 * SIZE(AO) + + fmadd f4, f17, f20, f4 + LFDU f20, 16 * SIZE(BO) + fmadd f5, f17, f21, f5 + LFD f21, 1 * SIZE(BO) + fmadd f6, f17, f22, f6 + LFD f22, 2 * SIZE(BO) + fmadd f7, f17, f23, f7 + LFD f23, 3 * SIZE(BO) + + fmadd f0, f18, f24, f0 + LFD f17, 1 * SIZE(AO) + fmadd f1, f18, f25, f1 + nop + fmadd f2, f18, f26, f2 + nop + fmadd f3, f18, f27, f3 + LFD f18, 2 * SIZE(AO) + + fmadd f4, f19, f24, f4 + LFD f24, 4 * SIZE(BO) + fmadd f5, f19, f25, f5 + LFD f25, 5 * SIZE(BO) + fmadd f6, f19, f26, f6 + LFD f26, 6 * SIZE(BO) + fmadd f7, f19, f27, f7 + bdnz .L22 + .align 4 + +.L25: +#ifndef TRMMKERNEL + andi. r0, K, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR, r0 + ble .L27 +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 2 +#endif + andi. 
TEMP, TEMP, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR, TEMP + ble .L27 +#endif + .align 4 + +.L26: + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + LFDU f16, 2 * SIZE(AO) + + fmadd f4, f17, f20, f4 + LFDU f20, 4 * SIZE(BO) + fmadd f5, f17, f21, f5 + LFD f21, 1 * SIZE(BO) + fmadd f6, f17, f22, f6 + LFD f22, 2 * SIZE(BO) + fmadd f7, f17, f23, f7 + LFD f23, 3 * SIZE(BO) + LFD f17, 1 * SIZE(AO) + bdnz .L26 + .align 4 + +.L27: +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(CC) || defined(CR) || defined(RC) || defined(RR) + + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 + +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 + +#else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */ + + FADD f0, f0, f5 + FSUB f1, f1, f4 + FADD f2, f2, f7 + FSUB f3, f3, f6 + +#endif + +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + + LFD f18, 0 * SIZE(CO2) + LFD f19, 1 * SIZE(CO2) +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FMADD f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FMADD f19, f30, f3, f19 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f1 + FMUL f18, f30, f2 + FMUL f19, f30, f3 +#endif + + FNMSUB f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FNMSUB f18, f31, f3, f18 + FMADD f19, f31, f2, f19 + + +#else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ + /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ + /* defined(RC)|| defined(RR) */ + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FNMSUB f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FNMSUB f19, f30, f3, f19 + + FMADD f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FMADD f18, f31, f3, f18 + FMADD f19, f31, f2, f19 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f1 + FMUL f18, f30, f2 + FMUL f19, f30, f3 + + FMADD f16, f31, f1, f16 + FNMADD f17, f31, f0, f17 + FMADD f18, f31, f3, f18 + FNMADD f19, f31, f2, f19 +#endif +#endif + + STFD f16, 0 * SIZE(CO1) + STFD f17, 1 * SIZE(CO1) + STFD f18, 0 * SIZE(CO2) + STFD f19, 1 * SIZE(CO2) + + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -1 +#else + addi TEMP, TEMP, -2 +#endif + slwi r0, TEMP, 0 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 1 +#endif +#endif + .align 4 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addi KK, KK, 2 +#endif + + mr B, BO + addic. J, J, -1 + lfs f0, FZERO + bgt .L10 + .align 4 + +.L30: + andi. J, N, 1 + ble .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + mr KK, OFFSET +#endif + + srawi. I, M, 1 + mr CO1, C + add C, C, LDC + mr AO, A + ble .L40 + .align 4 + +.L31: +#ifndef TRMMKERNEL + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(B) + LFD f17, 1 * SIZE(B) + LFD f18, 2 * SIZE(B) + LFD f19, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. 
r0, K, 2 + mr BO, B + mtspr CTR, r0 + ble .L35 +#else +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(B) + LFD f17, 1 * SIZE(B) + LFD f18, 2 * SIZE(B) + LFD f19, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + mr BO, B +#else + slwi r0, KK, 1 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + ble .L35 +#endif + .align 4 + +.L32: + fmadd f0, f16, f20, f0 + LFD f27, 7 * SIZE(AO) + fmadd f1, f16, f21, f1 + LFD f19, 3 * SIZE(BO) + fmadd f2, f16, f22, f2 + nop + fmadd f3, f16, f23, f3 + LFD f16, 4 * SIZE(BO) + + fmadd f4, f17, f20, f4 + LFD f20, 8 * SIZE(AO) + fmadd f5, f17, f21, f5 + LFD f21, 9 * SIZE(AO) + fmadd f6, f17, f22, f6 + LFD f22, 10 * SIZE(AO) + fmadd f7, f17, f23, f7 + LFD f23, 11 * SIZE(AO) + + fmadd f0, f18, f24, f0 + LFD f17, 5 * SIZE(BO) + fmadd f1, f18, f25, f1 + nop + fmadd f2, f18, f26, f2 + nop + fmadd f3, f18, f27, f3 + LFD f18, 6 * SIZE(BO) + + fmadd f4, f19, f24, f4 + LFD f24, 12 * SIZE(AO) + fmadd f5, f19, f25, f5 + LFD f25, 13 * SIZE(AO) + fmadd f6, f19, f26, f6 + LFD f26, 14 * SIZE(AO) + fmadd f7, f19, f27, f7 + LFD f27, 15 * SIZE(AO) + + fmadd f0, f16, f20, f0 + LFD f19, 7 * SIZE(BO) + fmadd f1, f16, f21, f1 + nop + fmadd f2, f16, f22, f2 + nop + fmadd f3, f16, f23, f3 + LFDU f16, 8 * SIZE(BO) + + fmadd f4, f17, f20, f4 + LFDU f20, 16 * SIZE(AO) + fmadd f5, f17, f21, f5 + LFD f21, 1 * SIZE(AO) + fmadd f6, f17, f22, f6 + LFD f22, 2 * SIZE(AO) + fmadd f7, f17, f23, f7 + LFD f23, 3 * SIZE(AO) + + fmadd f0, f18, f24, f0 + LFD f17, 1 * SIZE(BO) + fmadd f1, f18, f25, f1 + nop + fmadd f2, f18, f26, f2 + nop + fmadd f3, f18, f27, f3 + LFD f18, 2 * SIZE(BO) + + fmadd f4, f19, f24, f4 + LFD f24, 4 * SIZE(AO) + fmadd f5, f19, f25, f5 + LFD f25, 5 * SIZE(AO) + fmadd f6, f19, f26, f6 + LFD f26, 6 * SIZE(AO) + fmadd f7, f19, f27, f7 + bdnz .L32 + .align 4 + +.L35: +#ifndef TRMMKERNEL + andi. r0, K, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR, r0 + ble .L37 +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 2 +#else + addi TEMP, KK, 1 +#endif + andi. 
TEMP, TEMP, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR, TEMP + ble .L37 +#endif + .align 4 + +.L36: + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + LFDU f16, 2 * SIZE(BO) + + fmadd f4, f17, f20, f4 + LFDU f20, 4 * SIZE(AO) + fmadd f5, f17, f21, f5 + LFD f21, 1 * SIZE(AO) + fmadd f6, f17, f22, f6 + LFD f22, 2 * SIZE(AO) + fmadd f7, f17, f23, f7 + LFD f23, 3 * SIZE(AO) + LFD f17, 1 * SIZE(BO) + bdnz .L36 + .align 4 + +.L37: +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(CC) || defined(CR) || defined(RC) || defined(RR) + + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 + +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + + FADD f0, f0, f5 + FSUB f1, f1, f4 + FADD f2, f2, f7 + FSUB f3, f3, f6 + +#else /* defined(NC) || defined(TC) || defined(NR) || defined(TR) */ + + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 + +#endif + +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) + LFD f18, 2 * SIZE(CO1) + LFD f19, 3 * SIZE(CO1) +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FMADD f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FMADD f19, f30, f3, f19 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f1 + FMUL f18, f30, f2 + FMUL f19, f30, f3 +#endif + + FNMSUB f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FNMSUB f18, f31, f3, f18 + FMADD f19, f31, f2, f19 + +#else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ + /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ + /* defined(RC)|| defined(RR) */ + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FNMSUB f17, f30, f1, f17 + FMADD f18, f30, f2, f18 + FNMSUB f19, f30, f3, f19 + + FMADD f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FMADD f18, f31, f3, f18 + FMADD f19, f31, f2, f19 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f1 + FMUL f18, f30, f2 + FMUL f19, f30, f3 + + FMADD f16, f31, f1, f16 + FNMADD f17, f31, f0, f17 + FMADD f18, f31, f3, f18 + FNMADD f19, f31, f2, f19 +#endif + +#endif + + STFD f16, 0 * SIZE(CO1) + STFD f17, 1 * SIZE(CO1) + STFD f18, 2 * SIZE(CO1) + STFD f19, 3 * SIZE(CO1) + + addi CO1, CO1, 4 * SIZE + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub TEMP, K, KK +#ifdef LEFT + addi TEMP, TEMP, -2 +#else + addi TEMP, TEMP, -1 +#endif + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LEFT + addi KK, KK, 2 +#endif +#endif + + addic. I, I, -1 + bgt .L31 + .align 4 + +.L40: + andi. I, M, 1 + ble .L999 + +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. 
r0, K, 2 + mr BO, B + mtspr CTR, r0 + ble .L45 +#else +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + mr BO, B +#else + slwi r0, KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, B, TEMP + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + srawi. TEMP, TEMP, 2 + mtspr CTR, TEMP + ble .L45 +#endif + .align 4 + +.L42: + fmadd f0, f16, f20, f0 + LFD f23, 3 * SIZE(BO) + fmadd f3, f16, f21, f3 + LFD f16, 4 * SIZE(AO) + fmadd f2, f17, f20, f2 + LFD f20, 4 * SIZE(BO) + fmadd f1, f17, f21, f1 + LFD f17, 5 * SIZE(AO) + + fmadd f4, f18, f22, f4 + LFD f21, 5 * SIZE(BO) + fmadd f7, f18, f23, f7 + LFD f18, 6 * SIZE(AO) + fmadd f6, f19, f22, f6 + LFD f22, 6 * SIZE(BO) + fmadd f5, f19, f23, f5 + LFD f19, 7 * SIZE(AO) + + fmadd f0, f16, f20, f0 + LFD f23, 7 * SIZE(BO) + fmadd f3, f16, f21, f3 + LFDU f16, 8 * SIZE(AO) + fmadd f2, f17, f20, f2 + LFDU f20, 8 * SIZE(BO) + fmadd f1, f17, f21, f1 + LFD f17, 1 * SIZE(AO) + + fmadd f4, f18, f22, f4 + LFD f21, 1 * SIZE(BO) + fmadd f7, f18, f23, f7 + LFD f18, 2 * SIZE(AO) + fmadd f6, f19, f22, f6 + LFD f22, 2 * SIZE(BO) + fmadd f5, f19, f23, f5 + LFD f19, 3 * SIZE(AO) + bdnz .L42 + .align 4 + +.L45: + fadd f0, f0, f4 + fadd f1, f1, f5 + fadd f2, f2, f6 + fadd f3, f3, f7 + +#ifndef TRMMKERNEL + andi. r0, K, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR,r0 + ble .L47 +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub TEMP, K, KK +#elif defined(LEFT) + addi TEMP, KK, 1 +#else + addi TEMP, KK, 1 +#endif + andi. 
TEMP, TEMP, 3 + lfd f30, ALPHA_R + lfd f31, ALPHA_I + mtspr CTR,TEMP + ble .L47 +#endif + .align 4 + +.L46: + fmadd f0, f16, f20, f0 + fmadd f3, f16, f21, f3 + LFDU f16, 2 * SIZE(AO) + fmadd f2, f17, f20, f2 + LFDU f20, 2 * SIZE(BO) + fmadd f1, f17, f21, f1 + LFD f17, 1 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + bdnz .L46 + .align 4 + +.L47: +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(CC) || defined(CR) || defined(RC) || defined(RR) + fsub f0, f0, f1 + fadd f2, f2, f3 +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + fadd f0, f0, f1 + fsub f2, f2, f3 +#else + fadd f0, f0, f1 + fsub f2, f3, f2 +#endif + +#ifndef TRMMKERNEL + LFD f16, 0 * SIZE(CO1) + LFD f17, 1 * SIZE(CO1) +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FMADD f17, f30, f2, f17 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f2 +#endif + + FNMSUB f16, f31, f2, f16 + FMADD f17, f31, f0, f17 + +#else /* defined(CN)||defined(CT)||defined(NC)||defined(TC)||defined(CC) */ + /* defined(RN)||defined(RT)||defined(NR)||defined(TR)||defined(CR) */ + /* defined(RC) || defined(RR) */ + +#ifndef TRMMKERNEL + FMADD f16, f30, f0, f16 + FNMSUB f17, f30, f2, f17 + + FMADD f16, f31, f2, f16 + FMADD f17, f31, f0, f17 +#else + FMUL f16, f30, f0 + FMUL f17, f30, f2 + + FMADD f16, f31, f2, f16 + FNMADD f17, f31, f0, f17 +#endif + +#endif + STFD f16, 0 * SIZE(CO1) + STFD f17, 1 * SIZE(CO1) + .align 4 + +.L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) +#ifdef TRMMKERNEL + ld r23, 208(SP) + ld r22, 216(SP) +#endif +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) +#ifdef TRMMKERNEL + lwz r23, 176(SP) + lwz r22, 180(SP) +#endif +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/zgemm_ncopy_hummer_2.S b/kernel/power/zgemm_ncopy_hummer_2.S new file mode 100644 index 0000000000..9a6f802695 --- /dev/null +++ b/kernel/power/zgemm_ncopy_hummer_2.S @@ -0,0 +1,451 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M r3 +#define N r4 +#define A r5 +#define LDA r6 +#define B r7 + +#define AO1 r8 +#define AO2 r9 + +#define J r12 + +#define INC r30 +#define INC2 r31 + +#define c01 f0 +#define c02 f1 +#define c03 f2 +#define c04 f3 +#define c05 f4 +#define c06 f5 +#define c07 f6 +#define c08 f7 +#define c09 f8 +#define c10 f9 +#define c11 f10 +#define c12 f11 +#define c13 f12 +#define c14 f13 +#define c15 f14 +#define c16 f15 + + PROLOGUE + PROFCODE + + li r0, -16 + + stfpdux f14, SP, r0 + stfpdux f15, SP, r0 + + stwu r31, -4(SP) + stwu r30, -4(SP) + + slwi LDA, LDA, ZBASE_SHIFT + + cmpwi cr0, M, 0 + ble- LL(99) + cmpwi cr0, N, 0 + ble- LL(99) + + li INC, 1 * SIZE + li INC2, 2 * SIZE + subi B, B, 2 * SIZE + + andi. r0, A, 2 * SIZE - 1 + bne LL(100) + + subi A, A, 2 * SIZE + srawi. J, N, 1 + ble LL(20) + .align 4 +LL(11): + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + srawi. r0, M, 3 + mtspr CTR, r0 + ble LL(15) + .align 4 + +LL(12): + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO2, INC2 + LFPDUX c03, AO1, INC2 + LFPDUX c04, AO2, INC2 + LFPDUX c05, AO1, INC2 + LFPDUX c06, AO2, INC2 + LFPDUX c07, AO1, INC2 + LFPDUX c08, AO2, INC2 + + LFPDUX c09, AO1, INC2 + LFPDUX c10, AO2, INC2 + LFPDUX c11, AO1, INC2 + LFPDUX c12, AO2, INC2 + LFPDUX c13, AO1, INC2 + LFPDUX c14, AO2, INC2 + LFPDUX c15, AO1, INC2 + LFPDUX c16, AO2, INC2 + + STFPDUX c01, B, INC2 + STFPDUX c02, B, INC2 + STFPDUX c03, B, INC2 + STFPDUX c04, B, INC2 + STFPDUX c05, B, INC2 + STFPDUX c06, B, INC2 + STFPDUX c07, B, INC2 + STFPDUX c08, B, INC2 + STFPDUX c09, B, INC2 + STFPDUX c10, B, INC2 + STFPDUX c11, B, INC2 + STFPDUX c12, B, INC2 + STFPDUX c13, B, INC2 + STFPDUX c14, B, INC2 + STFPDUX c15, B, INC2 + STFPDUX c16, B, INC2 + bdnz LL(12) + .align 4 + +LL(15): + andi. r0, M, 7 + ble LL(19) + + andi. r0, M, 4 + beq LL(16) + + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO2, INC2 + LFPDUX c03, AO1, INC2 + LFPDUX c04, AO2, INC2 + LFPDUX c05, AO1, INC2 + LFPDUX c06, AO2, INC2 + LFPDUX c07, AO1, INC2 + LFPDUX c08, AO2, INC2 + + STFPDUX c01, B, INC2 + STFPDUX c02, B, INC2 + STFPDUX c03, B, INC2 + STFPDUX c04, B, INC2 + STFPDUX c05, B, INC2 + STFPDUX c06, B, INC2 + STFPDUX c07, B, INC2 + STFPDUX c08, B, INC2 + .align 4 + +LL(16): + andi. 
r0, M, 2 + beq LL(17) + + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO2, INC2 + LFPDUX c03, AO1, INC2 + LFPDUX c04, AO2, INC2 + + STFPDUX c01, B, INC2 + STFPDUX c02, B, INC2 + STFPDUX c03, B, INC2 + STFPDUX c04, B, INC2 + .align 4 + +LL(17): + andi. r0, M, 1 + beq LL(19) + + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO2, INC2 + + STFPDUX c01, B, INC2 + STFPDUX c02, B, INC2 + .align 4 + +LL(19): + addic. J, J, -1 + bgt LL(11) + .align 4 + +LL(20): + andi. J, N, 1 + ble LL(99) + + mr AO1, A + + srawi. r0, M, 2 + mtspr CTR, r0 + ble LL(25) + .align 4 + +LL(22): + LFPDUX c01, AO1, INC2 + LFPDUX c03, AO1, INC2 + LFPDUX c05, AO1, INC2 + LFPDUX c07, AO1, INC2 + + STFPDUX c01, B, INC2 + STFPDUX c03, B, INC2 + STFPDUX c05, B, INC2 + STFPDUX c07, B, INC2 + bdnz LL(22) + .align 4 + +LL(25): + andi. r0, M, 3 + ble LL(99) + + andi. r0, M, 2 + beq LL(27) + + LFPDUX c01, AO1, INC2 + LFPDUX c03, AO1, INC2 + + STFPDUX c01, B, INC2 + STFPDUX c03, B, INC2 + .align 4 + +LL(27): + andi. r0, M, 1 + beq LL(99) + + LFPDUX c01, AO1, INC2 + + STFPDUX c01, B, INC2 + .align 4 + +LL(99): + addi SP, SP, -4 + + lwzu r30, 4(SP) + lwzu r31, 4(SP) + + subi SP, SP, 12 + li r0, 16 + + lfpdux f15, SP, r0 + lfpdux f14, SP, r0 + addi SP, SP, 16 + blr + .align 4 + +LL(100): + subi A, A, 1 * SIZE + srawi. J, N, 1 + ble LL(120) + .align 4 +LL(111): + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + srawi. r0, M, 2 + mtspr CTR, r0 + ble LL(115) + .align 4 + +LL(112): + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO2, INC + LFDUX c04, AO2, INC + + LFDUX c05, AO1, INC + LFDUX c06, AO1, INC + LFDUX c07, AO2, INC + LFDUX c08, AO2, INC + + LFDUX c09, AO1, INC + LFDUX c10, AO1, INC + LFDUX c11, AO2, INC + LFDUX c12, AO2, INC + fsmfp c01, c02 + + LFDUX c13, AO1, INC + fsmfp c03, c04 + LFDUX c14, AO1, INC + fsmfp c05, c06 + LFDUX c15, AO2, INC + fsmfp c07, c08 + LFDUX c16, AO2, INC + fsmfp c09, c10 + + STFPDUX c01, B, INC2 + fsmfp c11, c12 + STFPDUX c03, B, INC2 + fsmfp c13, c14 + STFPDUX c05, B, INC2 + fsmfp c15, c16 + STFPDUX c07, B, INC2 + + STFPDUX c09, B, INC2 + STFPDUX c11, B, INC2 + STFPDUX c13, B, INC2 + STFPDUX c15, B, INC2 + bdnz LL(112) + .align 4 + +LL(115): + andi. r0, M, 3 + ble LL(119) + + andi. r0, M, 2 + beq LL(117) + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO2, INC + LFDUX c04, AO2, INC + + LFDUX c05, AO1, INC + LFDUX c06, AO1, INC + LFDUX c07, AO2, INC + LFDUX c08, AO2, INC + + fsmfp c01, c02 + fsmfp c03, c04 + fsmfp c05, c06 + fsmfp c07, c08 + + STFPDUX c01, B, INC2 + STFPDUX c03, B, INC2 + STFPDUX c05, B, INC2 + STFPDUX c07, B, INC2 + .align 4 + +LL(117): + andi. r0, M, 1 + beq LL(119) + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO2, INC + LFDUX c04, AO2, INC + + fsmfp c01, c02 + fsmfp c03, c04 + + STFPDUX c01, B, INC2 + STFPDUX c03, B, INC2 + .align 4 + +LL(119): + addic. J, J, -1 + bgt LL(111) + .align 4 + +LL(120): + andi. J, N, 1 + ble LL(999) + + mr AO1, A + + srawi. r0, M, 2 + mtspr CTR, r0 + ble LL(125) + .align 4 + +LL(122): + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + + LFDUX c05, AO1, INC + LFDUX c06, AO1, INC + LFDUX c07, AO1, INC + LFDUX c08, AO1, INC + + fsmfp c01, c02 + fsmfp c03, c04 + fsmfp c05, c06 + fsmfp c07, c08 + + STFPDUX c01, B, INC2 + STFPDUX c03, B, INC2 + STFPDUX c05, B, INC2 + STFPDUX c07, B, INC2 + bdnz LL(122) + .align 4 + +LL(125): + andi. r0, M, 3 + ble LL(999) + + andi. 
r0, M, 2 + beq LL(127) + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + + fsmfp c01, c02 + fsmfp c03, c04 + + STFPDUX c01, B, INC2 + STFPDUX c03, B, INC2 + .align 4 + +LL(127): + andi. r0, M, 1 + beq LL(999) + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + + fsmfp c01, c02 + STFPDUX c01, B, INC2 + .align 4 + +LL(999): + addi SP, SP, -4 + + lwzu r30, 4(SP) + lwzu r31, 4(SP) + + subi SP, SP, 12 + li r0, 16 + + lfpdux f15, SP, r0 + lfpdux f14, SP, r0 + addi SP, SP, 16 + blr + EPILOGUE diff --git a/kernel/power/zgemm_ncopy_hummer_4.S b/kernel/power/zgemm_ncopy_hummer_4.S new file mode 100644 index 0000000000..0a64d0d05f --- /dev/null +++ b/kernel/power/zgemm_ncopy_hummer_4.S @@ -0,0 +1,666 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M r3 +#define N r4 +#define A r5 +#define LDA r6 +#define B r7 + +#define AO1 r8 +#define AO2 r9 +#define AO3 r10 +#define AO4 r11 + +#define J r12 + +#define INC r30 +#define INC2 r31 + +#define c01 f0 +#define c02 f1 +#define c03 f2 +#define c04 f3 +#define c05 f4 +#define c06 f5 +#define c07 f6 +#define c08 f7 +#define c09 f8 +#define c10 f9 +#define c11 f10 +#define c12 f11 +#define c13 f12 +#define c14 f13 +#define c15 f14 +#define c16 f15 + + PROLOGUE + PROFCODE + + li r0, -16 + + stfpdux f14, SP, r0 + stfpdux f15, SP, r0 + + stwu r31, -4(SP) + stwu r30, -4(SP) + + slwi LDA, LDA, ZBASE_SHIFT + + cmpwi cr0, M, 0 + ble- LL(99) + cmpwi cr0, N, 0 + ble- LL(99) + + li INC, 1 * SIZE + li INC2, 2 * SIZE + + subi B, B, 2 * SIZE + + andi. r0, A, 2 * SIZE - 1 + bne LL(100) + + subi A, A, 2 * SIZE + srawi. 
J, N, 2 + ble LL(20) + .align 4 +LL(11): + mr AO1, A + add AO2, A, LDA + add AO3, AO2, LDA + add AO4, AO3, LDA + add A, AO4, LDA + + srawi. r0, M, 2 + mtspr CTR, r0 + ble LL(15) + .align 4 + +LL(12): + LFPDUX c01, AO1, INC2 + LFPDUX c05, AO2, INC2 + LFPDUX c09, AO3, INC2 + LFPDUX c13, AO4, INC2 + + LFPDUX c02, AO1, INC2 + LFPDUX c06, AO2, INC2 + LFPDUX c10, AO3, INC2 + LFPDUX c14, AO4, INC2 + + LFPDUX c03, AO1, INC2 + LFPDUX c07, AO2, INC2 + LFPDUX c11, AO3, INC2 + LFPDUX c15, AO4, INC2 + + LFPDUX c04, AO1, INC2 + LFPDUX c08, AO2, INC2 + LFPDUX c12, AO3, INC2 + LFPDUX c16, AO4, INC2 + + STFPDUX c01, B, INC2 + STFPDUX c05, B, INC2 + STFPDUX c09, B, INC2 + STFPDUX c13, B, INC2 + STFPDUX c02, B, INC2 + STFPDUX c06, B, INC2 + STFPDUX c10, B, INC2 + STFPDUX c14, B, INC2 + + STFPDUX c03, B, INC2 + STFPDUX c07, B, INC2 + STFPDUX c11, B, INC2 + STFPDUX c15, B, INC2 + STFPDUX c04, B, INC2 + STFPDUX c08, B, INC2 + STFPDUX c12, B, INC2 + STFPDUX c16, B, INC2 + bdnz LL(12) + .align 4 + +LL(15): + andi. r0, M, 3 + ble LL(19) + + andi. r0, M, 2 + beq LL(17) + + LFPDUX c01, AO1, INC2 + LFPDUX c05, AO2, INC2 + LFPDUX c09, AO3, INC2 + LFPDUX c13, AO4, INC2 + + LFPDUX c02, AO1, INC2 + LFPDUX c06, AO2, INC2 + LFPDUX c10, AO3, INC2 + LFPDUX c14, AO4, INC2 + + STFPDUX c01, B, INC2 + STFPDUX c05, B, INC2 + STFPDUX c09, B, INC2 + STFPDUX c13, B, INC2 + STFPDUX c02, B, INC2 + STFPDUX c06, B, INC2 + STFPDUX c10, B, INC2 + STFPDUX c14, B, INC2 + .align 4 + +LL(17): + andi. r0, M, 1 + beq LL(19) + + LFPDUX c01, AO1, INC2 + LFPDUX c05, AO2, INC2 + LFPDUX c09, AO3, INC2 + LFPDUX c13, AO4, INC2 + + STFPDUX c01, B, INC2 + STFPDUX c05, B, INC2 + STFPDUX c09, B, INC2 + STFPDUX c13, B, INC2 + .align 4 + +LL(19): + addic. J, J, -1 + bgt LL(11) + .align 4 + +LL(20): + andi. J, N, 2 + ble LL(30) + + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + srawi. r0, M, 2 + mtspr CTR, r0 + ble LL(25) + .align 4 + +LL(22): + LFPDUX c01, AO1, INC2 + LFPDUX c05, AO2, INC2 + LFPDUX c02, AO1, INC2 + LFPDUX c06, AO2, INC2 + + LFPDUX c03, AO1, INC2 + LFPDUX c07, AO2, INC2 + LFPDUX c04, AO1, INC2 + LFPDUX c08, AO2, INC2 + + STFPDUX c01, B, INC2 + STFPDUX c05, B, INC2 + STFPDUX c02, B, INC2 + STFPDUX c06, B, INC2 + + STFPDUX c03, B, INC2 + STFPDUX c07, B, INC2 + STFPDUX c04, B, INC2 + STFPDUX c08, B, INC2 + bdnz LL(22) + .align 4 + +LL(25): + andi. r0, M, 3 + ble LL(30) + + andi. r0, M, 2 + beq LL(27) + + LFPDUX c01, AO1, INC2 + LFPDUX c05, AO2, INC2 + LFPDUX c02, AO1, INC2 + LFPDUX c06, AO2, INC2 + + STFPDUX c01, B, INC2 + STFPDUX c05, B, INC2 + STFPDUX c02, B, INC2 + STFPDUX c06, B, INC2 + .align 4 + +LL(27): + andi. r0, M, 1 + beq LL(30) + + LFPDUX c01, AO1, INC2 + LFPDUX c05, AO2, INC2 + + STFPDUX c01, B, INC2 + STFPDUX c05, B, INC2 + .align 4 + +LL(30): + andi. J, N, 1 + ble LL(99) + + mr AO1, A + + srawi. r0, M, 2 + mtspr CTR, r0 + ble LL(35) + .align 4 + +LL(32): + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO1, INC2 + LFPDUX c03, AO1, INC2 + LFPDUX c04, AO1, INC2 + + STFPDUX c01, B, INC2 + STFPDUX c02, B, INC2 + STFPDUX c03, B, INC2 + STFPDUX c04, B, INC2 + bdnz LL(32) + .align 4 + +LL(35): + andi. r0, M, 3 + ble LL(99) + + andi. r0, M, 2 + beq LL(37) + + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO1, INC2 + + STFPDUX c01, B, INC2 + STFPDUX c02, B, INC2 + .align 4 + +LL(37): + andi. 
r0, M, 1 + beq LL(99) + + LFPDUX c01, AO1, INC2 + + STFPDUX c01, B, INC2 + .align 4 + +LL(99): + addi SP, SP, -4 + + lwzu r30, 4(SP) + lwzu r31, 4(SP) + + subi SP, SP, 12 + li r0, 16 + + lfpdux f15, SP, r0 + lfpdux f14, SP, r0 + addi SP, SP, 16 + blr + .align 4 + +LL(100): + subi A, A, 1 * SIZE + srawi. J, N, 2 + ble LL(120) + .align 4 +LL(111): + mr AO1, A + add AO2, A, LDA + add AO3, AO2, LDA + add AO4, AO3, LDA + add A, AO4, LDA + + srawi. r0, M, 2 + mtspr CTR, r0 + ble LL(115) + .align 4 + +LL(112): + LFDUX c01, AO1, INC + LFDUX c05, AO2, INC + LFDUX c09, AO3, INC + LFDUX c13, AO4, INC + + LFSDUX c01, AO1, INC + LFSDUX c05, AO2, INC + LFSDUX c09, AO3, INC + LFSDUX c13, AO4, INC + + LFDUX c02, AO1, INC + LFDUX c06, AO2, INC + LFDUX c10, AO3, INC + LFDUX c14, AO4, INC + + LFSDUX c02, AO1, INC + LFSDUX c06, AO2, INC + LFSDUX c10, AO3, INC + LFSDUX c14, AO4, INC + + LFDUX c03, AO1, INC + LFDUX c07, AO2, INC + LFDUX c11, AO3, INC + LFDUX c15, AO4, INC + + LFSDUX c03, AO1, INC + LFSDUX c07, AO2, INC + LFSDUX c11, AO3, INC + LFSDUX c15, AO4, INC + + LFDUX c04, AO1, INC + LFDUX c08, AO2, INC + LFDUX c12, AO3, INC + LFDUX c16, AO4, INC + + LFSDUX c04, AO1, INC + LFSDUX c08, AO2, INC + LFSDUX c12, AO3, INC + LFSDUX c16, AO4, INC + + STFPDUX c01, B, INC2 + STFPDUX c05, B, INC2 + STFPDUX c09, B, INC2 + STFPDUX c13, B, INC2 + STFPDUX c02, B, INC2 + STFPDUX c06, B, INC2 + STFPDUX c10, B, INC2 + STFPDUX c14, B, INC2 + + STFPDUX c03, B, INC2 + STFPDUX c07, B, INC2 + STFPDUX c11, B, INC2 + STFPDUX c15, B, INC2 + STFPDUX c04, B, INC2 + STFPDUX c08, B, INC2 + STFPDUX c12, B, INC2 + STFPDUX c16, B, INC2 + bdnz LL(112) + .align 4 + +LL(115): + andi. r0, M, 3 + ble LL(119) + + andi. r0, M, 2 + beq LL(117) + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c05, AO2, INC + LFDUX c06, AO2, INC + + LFDUX c09, AO3, INC + LFDUX c10, AO3, INC + LFDUX c13, AO4, INC + LFDUX c14, AO4, INC + + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + LFDUX c07, AO2, INC + LFDUX c08, AO2, INC + fsmfp c01, c02 + + LFDUX c11, AO3, INC + fsmfp c05, c06 + LFDUX c12, AO3, INC + fsmfp c09, c10 + LFDUX c15, AO4, INC + fsmfp c13, c14 + LFDUX c16, AO4, INC + fsmfp c03, c04 + + STFPDUX c01, B, INC2 + fsmfp c07, c08 + STFPDUX c05, B, INC2 + fsmfp c11, c12 + STFPDUX c09, B, INC2 + fsmfp c15, c16 + STFPDUX c13, B, INC2 + + STFPDUX c03, B, INC2 + STFPDUX c07, B, INC2 + STFPDUX c11, B, INC2 + STFPDUX c15, B, INC2 + .align 4 + +LL(117): + andi. r0, M, 1 + beq LL(119) + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO2, INC + LFDUX c04, AO2, INC + + LFDUX c05, AO3, INC + LFDUX c06, AO3, INC + LFDUX c07, AO4, INC + LFDUX c08, AO4, INC + + fsmfp c01, c02 + fsmfp c03, c04 + fsmfp c05, c06 + fsmfp c07, c08 + + STFPDUX c01, B, INC2 + STFPDUX c03, B, INC2 + STFPDUX c05, B, INC2 + STFPDUX c07, B, INC2 + .align 4 + +LL(119): + addic. J, J, -1 + bgt LL(111) + .align 4 + +LL(120): + andi. J, N, 2 + ble LL(130) + + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + srawi. 
r0, M, 2 + mtspr CTR, r0 + ble LL(125) + .align 4 + +LL(122): + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c09, AO2, INC + LFDUX c10, AO2, INC + + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + LFDUX c11, AO2, INC + LFDUX c12, AO2, INC + + LFDUX c05, AO1, INC + LFDUX c06, AO1, INC + LFDUX c13, AO2, INC + LFDUX c14, AO2, INC + fsmfp c01, c02 + + LFDUX c07, AO1, INC + fsmfp c09, c10 + LFDUX c08, AO1, INC + fsmfp c03, c04 + LFDUX c15, AO2, INC + fsmfp c11, c12 + LFDUX c16, AO2, INC + fsmfp c05, c06 + + STFPDUX c01, B, INC2 + fsmfp c13, c14 + STFPDUX c09, B, INC2 + fsmfp c07, c08 + STFPDUX c03, B, INC2 + fsmfp c15, c16 + STFPDUX c11, B, INC2 + + STFPDUX c05, B, INC2 + STFPDUX c13, B, INC2 + STFPDUX c07, B, INC2 + STFPDUX c15, B, INC2 + bdnz LL(122) + .align 4 + +LL(125): + andi. r0, M, 3 + ble LL(130) + + andi. r0, M, 2 + beq LL(127) + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO2, INC + LFDUX c04, AO2, INC + + LFDUX c05, AO1, INC + LFDUX c06, AO1, INC + LFDUX c07, AO2, INC + LFDUX c08, AO2, INC + + fsmfp c01, c02 + fsmfp c03, c04 + fsmfp c05, c06 + fsmfp c07, c08 + + STFPDUX c01, B, INC2 + STFPDUX c03, B, INC2 + STFPDUX c05, B, INC2 + STFPDUX c07, B, INC2 + .align 4 + +LL(127): + andi. r0, M, 1 + beq LL(130) + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO2, INC + LFDUX c04, AO2, INC + + fsmfp c01, c02 + fsmfp c03, c04 + + STFPDUX c01, B, INC2 + STFPDUX c03, B, INC2 + .align 4 + +LL(130): + andi. J, N, 1 + ble LL(999) + + mr AO1, A + + srawi. r0, M, 2 + mtspr CTR, r0 + ble LL(135) + .align 4 + +LL(132): + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + + LFDUX c05, AO1, INC + LFDUX c06, AO1, INC + LFDUX c07, AO1, INC + LFDUX c08, AO1, INC + + fsmfp c01, c02 + fsmfp c03, c04 + fsmfp c05, c06 + fsmfp c07, c08 + + STFPDUX c01, B, INC2 + STFPDUX c03, B, INC2 + STFPDUX c05, B, INC2 + STFPDUX c07, B, INC2 + bdnz LL(132) + .align 4 + +LL(135): + andi. r0, M, 3 + ble LL(999) + + andi. r0, M, 2 + beq LL(137) + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + + fsmfp c01, c02 + fsmfp c03, c04 + + STFPDUX c01, B, INC2 + STFPDUX c03, B, INC2 + .align 4 + +LL(137): + andi. r0, M, 1 + beq LL(999) + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + + fsmfp c01, c02 + STFPDUX c01, B, INC2 + .align 4 + +LL(999): + addi SP, SP, -4 + + lwzu r30, 4(SP) + lwzu r31, 4(SP) + + subi SP, SP, 12 + li r0, 16 + + lfpdux f15, SP, r0 + lfpdux f14, SP, r0 + addi SP, SP, 16 + blr + .align 4 + + + + + EPILOGUE diff --git a/kernel/power/zgemm_tcopy_hummer_2.S b/kernel/power/zgemm_tcopy_hummer_2.S new file mode 100644 index 0000000000..bc2a083cfb --- /dev/null +++ b/kernel/power/zgemm_tcopy_hummer_2.S @@ -0,0 +1,308 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M r3 +#define N r4 +#define A r5 +#define LDA r6 +#define B r7 + +#define AO1 r8 +#define AO2 r9 + +#define J r10 +#define B1 r11 + +#define B2 r28 +#define M4 r29 +#define INC r30 +#define INC2 r31 + +#define c01 f0 +#define c02 f1 +#define c03 f2 +#define c04 f3 +#define c05 f4 +#define c06 f5 +#define c07 f6 +#define c08 f7 + + PROLOGUE + PROFCODE + + stwu r31, -4(SP) + stwu r30, -4(SP) + stwu r29, -4(SP) + stwu r28, -4(SP) + + slwi LDA, LDA, ZBASE_SHIFT + slwi M4, M, 1 + ZBASE_SHIFT + + li r9, -2 + + and B2, N, r9 + + mullw B2, B2, M + + slwi B2, B2, ZBASE_SHIFT + + add B2, B2, B + + cmpwi cr0, M, 0 + ble- LL(99) + cmpwi cr0, N, 0 + ble- LL(99) + + subi B2, B2, 2 * SIZE + subi M4, M4, 6 * SIZE + + li INC, 1 * SIZE + li INC2, 2 * SIZE + + andi. r0, A, 2 * SIZE - 1 + bne LL(100) + + subi A, A, 2 * SIZE + srawi. J, M, 1 + ble LL(20) + .align 4 + +LL(10): + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + sub B1, B, M4 + addi B, B, 8 * SIZE + + srawi. r0, N, 1 + mtspr CTR, r0 + ble LL(15) + .align 4 + +LL(12): + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO1, INC2 + LFPDUX c03, AO2, INC2 + LFPDUX c04, AO2, INC2 + + STFPDUX c01, B1, M4 + STFPDUX c02, B1, INC2 + STFPDUX c03, B1, INC2 + STFPDUX c04, B1, INC2 + bdnz LL(12) + .align 4 + +LL(15): + andi. r0, N, 1 + ble LL(19) + + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO2, INC2 + + STFPDUX c01, B2, INC2 + STFPDUX c02, B2, INC2 + .align 4 + +LL(19): + addic. J, J, -1 + bgt LL(10) + .align 4 + +LL(20): + andi. J, M, 1 + addi M4, M4, 4 * SIZE + ble LL(99) + + mr AO1, A + sub B1, B, M4 + + srawi. r0, N, 1 + mtspr CTR, r0 + ble LL(23) + .align 4 + +LL(22): + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO1, INC2 + + STFPDUX c01, B1, M4 + STFPDUX c02, B1, INC2 + bdnz LL(22) + .align 4 + +LL(23): + andi. r0, N, 1 + ble LL(99) + + LFPDUX c01, AO1, INC2 + + STFPDUX c01, B2, INC2 + .align 4 + +LL(99): + addi SP, SP, -4 + + lwzu r28, 4(SP) + lwzu r29, 4(SP) + lwzu r30, 4(SP) + lwzu r31, 4(SP) + + addi SP, SP, 4 + blr + .align 4 + +LL(100): + subi A, A, SIZE + srawi. J, M, 1 + ble LL(120) + .align 4 + +LL(110): + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + sub B1, B, M4 + addi B, B, 8 * SIZE + + srawi. 
r0, N, 1 + mtspr CTR, r0 + ble LL(115) + .align 4 + +LL(112): + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + + LFDUX c05, AO2, INC + fsmfp c01, c02 + LFDUX c06, AO2, INC + fsmfp c03, c04 + LFDUX c07, AO2, INC + fsmfp c05, c06 + LFDUX c08, AO2, INC + fsmfp c07, c08 + + STFPDUX c01, B1, M4 + STFPDUX c03, B1, INC2 + STFPDUX c05, B1, INC2 + STFPDUX c07, B1, INC2 + bdnz LL(112) + .align 4 + +LL(115): + andi. r0, N, 1 + ble LL(119) + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO2, INC + LFDUX c04, AO2, INC + + fsmfp c01, c02 + fsmfp c03, c04 + + STFPDUX c01, B2, INC2 + STFPDUX c03, B2, INC2 + .align 4 + +LL(119): + addic. J, J, -1 + bgt LL(110) + .align 4 + +LL(120): + andi. J, M, 1 + addi M4, M4, 4 * SIZE + ble LL(999) + + mr AO1, A + sub B1, B, M4 + + srawi. r0, N, 1 + mtspr CTR, r0 + ble LL(123) + .align 4 + +LL(122): + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + + fsmfp c01, c02 + fsmfp c03, c04 + + STFPDUX c01, B1, M4 + STFPDUX c03, B1, INC2 + bdnz LL(122) + .align 4 + +LL(123): + andi. r0, N, 1 + ble LL(999) + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + + fsmfp c01, c02 + + STFPDUX c01, B2, INC2 + .align 4 + +LL(999): + addi SP, SP, -4 + + lwzu r28, 4(SP) + lwzu r29, 4(SP) + lwzu r30, 4(SP) + lwzu r31, 4(SP) + + addi SP, SP, 4 + blr + + + + + EPILOGUE diff --git a/kernel/power/zgemm_tcopy_hummer_4.S b/kernel/power/zgemm_tcopy_hummer_4.S new file mode 100644 index 0000000000..7011dc2d85 --- /dev/null +++ b/kernel/power/zgemm_tcopy_hummer_4.S @@ -0,0 +1,705 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M r3 +#define N r4 +#define A r5 +#define LDA r6 +#define B r7 + +#define AO1 r8 +#define AO2 r9 +#define AO3 r10 +#define AO4 r11 + +#define J r25 +#define B1 r26 +#define B2 r27 +#define B3 r28 +#define M4 r29 +#define INC r30 +#define INC2 r31 + +#define c01 f0 +#define c02 f1 +#define c03 f2 +#define c04 f3 +#define c05 f4 +#define c06 f5 +#define c07 f6 +#define c08 f7 +#define c09 f8 +#define c10 f9 +#define c11 f10 +#define c12 f11 +#define c13 f12 +#define c14 f13 +#define c15 f14 +#define c16 f15 + + PROLOGUE + PROFCODE + + li r0, -16 + + stfpdux f14, SP, r0 + stfpdux f15, SP, r0 + + stwu r31, -4(SP) + stwu r30, -4(SP) + stwu r29, -4(SP) + stwu r28, -4(SP) + + stwu r27, -4(SP) + stwu r26, -4(SP) + stwu r25, -4(SP) + + slwi LDA, LDA, ZBASE_SHIFT + slwi M4, M, 2 + ZBASE_SHIFT + + li r8, -4 + li r9, -2 + + and B2, N, r8 + and B3, N, r9 + + mullw B2, B2, M + mullw B3, B3, M + + slwi B2, B2, ZBASE_SHIFT + slwi B3, B3, ZBASE_SHIFT + + add B2, B2, B + add B3, B3, B + + cmpwi cr0, M, 0 + ble- LL(99) + cmpwi cr0, N, 0 + ble- LL(99) + + subi B2, B2, 2 * SIZE + subi B3, B3, 2 * SIZE + subi M4, M4, 30 * SIZE + + li INC, 1 * SIZE + li INC2, 2 * SIZE + + andi. r0, A, 2 * SIZE - 1 + bne LL(100) + + subi A, A, 2 * SIZE + srawi. J, M, 2 + ble LL(20) + .align 4 + +LL(10): + mr AO1, A + add AO2, A, LDA + add AO3, AO2, LDA + add AO4, AO3, LDA + add A, AO4, LDA + + sub B1, B, M4 + addi B, B, 32 * SIZE + + srawi. r0, N, 2 + mtspr CTR, r0 + ble LL(15) + .align 4 + +LL(12): + LFPDUX c01, AO1, INC2 + LFPDUX c05, AO2, INC2 + LFPDUX c09, AO3, INC2 + LFPDUX c13, AO4, INC2 + + LFPDUX c02, AO1, INC2 + LFPDUX c06, AO2, INC2 + LFPDUX c10, AO3, INC2 + LFPDUX c14, AO4, INC2 + + LFPDUX c03, AO1, INC2 + LFPDUX c07, AO2, INC2 + LFPDUX c11, AO3, INC2 + LFPDUX c15, AO4, INC2 + + LFPDUX c04, AO1, INC2 + LFPDUX c08, AO2, INC2 + LFPDUX c12, AO3, INC2 + LFPDUX c16, AO4, INC2 + + STFPDUX c01, B1, M4 + STFPDUX c02, B1, INC2 + STFPDUX c03, B1, INC2 + STFPDUX c04, B1, INC2 + STFPDUX c05, B1, INC2 + STFPDUX c06, B1, INC2 + STFPDUX c07, B1, INC2 + STFPDUX c08, B1, INC2 + STFPDUX c09, B1, INC2 + STFPDUX c10, B1, INC2 + STFPDUX c11, B1, INC2 + STFPDUX c12, B1, INC2 + STFPDUX c13, B1, INC2 + STFPDUX c14, B1, INC2 + STFPDUX c15, B1, INC2 + STFPDUX c16, B1, INC2 + bdnz LL(12) + .align 4 + +LL(15): + andi. r0, N, 3 + ble LL(19) + + andi. r0, N, 2 + ble LL(17) + + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO1, INC2 + LFPDUX c03, AO2, INC2 + LFPDUX c04, AO2, INC2 + + LFPDUX c05, AO3, INC2 + LFPDUX c06, AO3, INC2 + LFPDUX c07, AO4, INC2 + LFPDUX c08, AO4, INC2 + + STFPDUX c01, B2, INC2 + STFPDUX c02, B2, INC2 + STFPDUX c03, B2, INC2 + STFPDUX c04, B2, INC2 + STFPDUX c05, B2, INC2 + STFPDUX c06, B2, INC2 + STFPDUX c07, B2, INC2 + STFPDUX c08, B2, INC2 + .align 4 + +LL(17): + andi. r0, N, 1 + ble LL(19) + + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO2, INC2 + LFPDUX c03, AO3, INC2 + LFPDUX c04, AO4, INC2 + + STFPDUX c01, B3, INC2 + STFPDUX c02, B3, INC2 + STFPDUX c03, B3, INC2 + STFPDUX c04, B3, INC2 + .align 4 + +LL(19): + addic. J, J, -1 + bgt LL(10) + .align 4 + +LL(20): + andi. J, M, 2 + addi M4, M4, 16 * SIZE + + ble LL(30) + + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + sub B1, B, M4 + addi B, B, 16 * SIZE + + srawi. 
r0, N, 2 + mtspr CTR, r0 + ble LL(23) + .align 4 + +LL(22): + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO1, INC2 + LFPDUX c03, AO1, INC2 + LFPDUX c04, AO1, INC2 + + LFPDUX c05, AO2, INC2 + LFPDUX c06, AO2, INC2 + LFPDUX c07, AO2, INC2 + LFPDUX c08, AO2, INC2 + + STFPDUX c01, B1, M4 + STFPDUX c02, B1, INC2 + STFPDUX c03, B1, INC2 + STFPDUX c04, B1, INC2 + STFPDUX c05, B1, INC2 + STFPDUX c06, B1, INC2 + STFPDUX c07, B1, INC2 + STFPDUX c08, B1, INC2 + bdnz LL(22) + .align 4 + +LL(23): + andi. r0, N, 2 + ble LL(24) + + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO1, INC2 + LFPDUX c03, AO2, INC2 + LFPDUX c04, AO2, INC2 + + STFPDUX c01, B2, INC2 + STFPDUX c02, B2, INC2 + STFPDUX c03, B2, INC2 + STFPDUX c04, B2, INC2 + .align 4 + +LL(24): + andi. r0, N, 1 + ble LL(30) + + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO2, INC2 + + STFPDUX c01, B3, INC2 + STFPDUX c02, B3, INC2 + .align 4 + +LL(30): + andi. J, M, 1 + addi M4, M4, 8 * SIZE + ble LL(99) + + mr AO1, A + sub B1, B, M4 + + srawi. r0, N, 2 + mtspr CTR, r0 + ble LL(33) + .align 4 + +LL(32): + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO1, INC2 + LFPDUX c03, AO1, INC2 + LFPDUX c04, AO1, INC2 + + STFPDUX c01, B1, M4 + STFPDUX c02, B1, INC2 + STFPDUX c03, B1, INC2 + STFPDUX c04, B1, INC2 + bdnz LL(32) + .align 4 + +LL(33): + andi. r0, N, 2 + ble LL(34) + + LFPDUX c01, AO1, INC2 + LFPDUX c02, AO1, INC2 + + + STFPDUX c01, B2, INC2 + STFPDUX c02, B2, INC2 + .align 4 + +LL(34): + andi. r0, N, 1 + ble LL(99) + + LFPDUX c01, AO1, INC2 + + STFPDX c01, B3, INC2 + .align 4 + +LL(99): + addi SP, SP, -4 + + lwzu r25, 4(SP) + lwzu r26, 4(SP) + lwzu r27, 4(SP) + + lwzu r28, 4(SP) + lwzu r29, 4(SP) + lwzu r30, 4(SP) + lwzu r31, 4(SP) + + subi SP, SP, 12 + li r0, 16 + + lfpdux f15, SP, r0 + lfpdux f14, SP, r0 + + addi SP, SP, 16 + blr + .align 4 + +LL(100): + subi A, A, SIZE + srawi. J, M, 2 + ble LL(120) + .align 4 + +LL(110): + mr AO1, A + add AO2, A, LDA + add AO3, AO2, LDA + add AO4, AO3, LDA + add A, AO4, LDA + + sub B1, B, M4 + addi B, B, 32 * SIZE + + srawi. r0, N, 2 + mtspr CTR, r0 + ble LL(115) + .align 4 + +LL(112): + LFDUX c01, AO1, INC + LFDUX c05, AO2, INC + LFDUX c09, AO3, INC + LFDUX c13, AO4, INC + + LFSDUX c01, AO1, INC + LFSDUX c05, AO2, INC + LFSDUX c09, AO3, INC + LFSDUX c13, AO4, INC + + LFDUX c02, AO1, INC + LFDUX c06, AO2, INC + LFDUX c10, AO3, INC + LFDUX c14, AO4, INC + + LFSDUX c02, AO1, INC + LFSDUX c06, AO2, INC + LFSDUX c10, AO3, INC + LFSDUX c14, AO4, INC + + LFDUX c03, AO1, INC + LFDUX c07, AO2, INC + LFDUX c11, AO3, INC + LFDUX c15, AO4, INC + + LFSDUX c03, AO1, INC + LFSDUX c07, AO2, INC + LFSDUX c11, AO3, INC + LFSDUX c15, AO4, INC + + LFDUX c04, AO1, INC + LFDUX c08, AO2, INC + LFDUX c12, AO3, INC + LFDUX c16, AO4, INC + + LFSDUX c04, AO1, INC + LFSDUX c08, AO2, INC + LFSDUX c12, AO3, INC + LFSDUX c16, AO4, INC + + STFPDUX c01, B1, M4 + STFPDUX c02, B1, INC2 + STFPDUX c03, B1, INC2 + STFPDUX c04, B1, INC2 + STFPDUX c05, B1, INC2 + STFPDUX c06, B1, INC2 + STFPDUX c07, B1, INC2 + STFPDUX c08, B1, INC2 + STFPDUX c09, B1, INC2 + STFPDUX c10, B1, INC2 + STFPDUX c11, B1, INC2 + STFPDUX c12, B1, INC2 + STFPDUX c13, B1, INC2 + STFPDUX c14, B1, INC2 + STFPDUX c15, B1, INC2 + STFPDUX c16, B1, INC2 + bdnz LL(112) + .align 4 + +LL(115): + andi. r0, N, 3 + ble LL(119) + + andi. 
r0, N, 2 + ble LL(117) + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + + LFDUX c05, AO2, INC + LFDUX c06, AO2, INC + LFDUX c07, AO2, INC + LFDUX c08, AO2, INC + + LFDUX c09, AO3, INC + LFDUX c10, AO3, INC + LFDUX c11, AO3, INC + LFDUX c12, AO3, INC + fsmfp c01, c02 + + LFDUX c13, AO4, INC + fsmfp c03, c04 + LFDUX c14, AO4, INC + fsmfp c05, c06 + LFDUX c15, AO4, INC + fsmfp c07, c08 + LFDUX c16, AO4, INC + fsmfp c09, c10 + + STFPDUX c01, B2, INC2 + fsmfp c11, c12 + STFPDUX c03, B2, INC2 + fsmfp c13, c14 + STFPDUX c05, B2, INC2 + fsmfp c15, c16 + STFPDUX c07, B2, INC2 + STFPDUX c09, B2, INC2 + STFPDUX c11, B2, INC2 + STFPDUX c13, B2, INC2 + STFPDUX c15, B2, INC2 + .align 4 + +LL(117): + andi. r0, N, 1 + ble LL(119) + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO2, INC + LFDUX c04, AO2, INC + LFDUX c05, AO3, INC + fsmfp c01, c02 + LFDUX c06, AO3, INC + fsmfp c03, c04 + LFDUX c07, AO4, INC + fsmfp c05, c06 + LFDUX c08, AO4, INC + fsmfp c07, c08 + + STFPDUX c01, B3, INC2 + STFPDUX c03, B3, INC2 + STFPDUX c05, B3, INC2 + STFPDUX c07, B3, INC2 + .align 4 + +LL(119): + addic. J, J, -1 + bgt LL(110) + .align 4 + +LL(120): + andi. J, M, 2 + addi M4, M4, 16 * SIZE + + ble LL(130) + + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + sub B1, B, M4 + addi B, B, 16 * SIZE + + srawi. r0, N, 2 + mtspr CTR, r0 + ble LL(123) + .align 4 + +LL(122): + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + LFDUX c05, AO1, INC + LFDUX c06, AO1, INC + LFDUX c07, AO1, INC + LFDUX c08, AO1, INC + + LFDUX c09, AO2, INC + LFDUX c10, AO2, INC + LFDUX c11, AO2, INC + LFDUX c12, AO2, INC + fsmfp c01, c02 + LFDUX c13, AO2, INC + fsmfp c03, c04 + LFDUX c14, AO2, INC + fsmfp c05, c06 + LFDUX c15, AO2, INC + fsmfp c07, c08 + LFDUX c16, AO2, INC + fsmfp c09, c10 + + STFPDUX c01, B1, M4 + fsmfp c11, c12 + STFPDUX c03, B1, INC2 + fsmfp c13, c14 + STFPDUX c05, B1, INC2 + fsmfp c15, c16 + STFPDUX c07, B1, INC2 + STFPDUX c09, B1, INC2 + STFPDUX c11, B1, INC2 + STFPDUX c13, B1, INC2 + STFPDUX c15, B1, INC2 + bdnz LL(122) + .align 4 + +LL(123): + andi. r0, N, 2 + ble LL(124) + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + + LFDUX c05, AO2, INC + fsmfp c01, c02 + LFDUX c06, AO2, INC + fsmfp c03, c04 + LFDUX c07, AO2, INC + fsmfp c05, c06 + LFDUX c08, AO2, INC + fsmfp c07, c08 + + STFPDUX c01, B2, INC2 + STFPDUX c03, B2, INC2 + STFPDUX c05, B2, INC2 + STFPDUX c07, B2, INC2 + .align 4 + +LL(124): + andi. r0, N, 1 + ble LL(130) + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + + LFDUX c03, AO2, INC + LFDUX c04, AO2, INC + + fsmfp c01, c02 + fsmfp c03, c04 + + STFPDUX c01, B3, INC2 + STFPDUX c03, B3, INC2 + .align 4 + +LL(130): + andi. J, M, 1 + addi M4, M4, 8 * SIZE + ble LL(999) + + mr AO1, A + sub B1, B, M4 + + srawi. r0, N, 2 + mtspr CTR, r0 + ble LL(133) + .align 4 + +LL(132): + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + LFDUX c05, AO1, INC + fsmfp c01, c02 + LFDUX c06, AO1, INC + fsmfp c03, c04 + LFDUX c07, AO1, INC + fsmfp c05, c06 + LFDUX c08, AO1, INC + fsmfp c07, c08 + + STFPDUX c01, B1, M4 + STFPDUX c03, B1, INC2 + STFPDUX c05, B1, INC2 + STFPDUX c07, B1, INC2 + bdnz LL(132) + .align 4 + +LL(133): + andi. r0, N, 2 + ble LL(134) + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + LFDUX c03, AO1, INC + LFDUX c04, AO1, INC + + fsmfp c01, c02 + fsmfp c03, c04 + + STFPDUX c01, B2, INC2 + STFPDUX c03, B2, INC2 + .align 4 + +LL(134): + andi. 
r0, N, 1 + ble LL(999) + + LFDUX c01, AO1, INC + LFDUX c02, AO1, INC + + fsmfp c01, c02 + STFPDX c01, B3, INC2 + .align 4 + +LL(999): + addi SP, SP, -4 + + lwzu r25, 4(SP) + lwzu r26, 4(SP) + lwzu r27, 4(SP) + + lwzu r28, 4(SP) + lwzu r29, 4(SP) + lwzu r30, 4(SP) + lwzu r31, 4(SP) + + subi SP, SP, 12 + li r0, 16 + + lfpdux f15, SP, r0 + lfpdux f14, SP, r0 + + addi SP, SP, 16 + blr + + + + EPILOGUE diff --git a/kernel/power/zgemv_n.S b/kernel/power/zgemv_n.S new file mode 100644 index 0000000000..00ba966ac8 --- /dev/null +++ b/kernel/power/zgemv_n.S @@ -0,0 +1,4290 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef linux +#ifndef __64BIT__ +#define M r3 +#define N r4 +#define A r6 +#define LDA r7 +#define X r8 +#define INCX r9 +#define Y r10 +#define INCY r5 +#else +#define M r3 +#define N r4 +#define A r8 +#define LDA r9 +#define X r10 +#define INCX r5 +#define Y r6 +#define INCY r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define M r3 +#define N r4 +#define A r10 +#define LDA r5 +#define X r6 +#define INCX r7 +#define Y r8 +#define INCY r9 +#else +#define M r3 +#define N r4 +#define A r8 +#define LDA r9 +#define X r10 +#define INCX r5 +#define Y r6 +#define INCY r7 +#endif +#endif + +#define I r11 +#define J r12 + +#define AO1 r14 +#define AO2 r15 +#define AO3 r16 +#define AO4 r17 +#define LDA4 r18 + +#define Y1 r19 +#define Y2 r20 +#define PREA r21 +#define PREC r22 + +#define y01 f0 +#define y02 f1 +#define y03 f2 +#define y04 f3 +#define y05 f4 +#define y06 f5 +#define y07 f6 +#define y08 f7 +#define y09 f8 +#define y10 f9 +#define y11 f10 +#define y12 f11 +#define y13 f12 +#define y14 f13 +#define y15 f14 +#define y16 f15 + +#define alpha1r f16 +#define alpha1i f17 +#define alpha2r f18 +#define alpha2i f19 +#define alpha3r f20 +#define alpha3i f21 +#define alpha4r f22 +#define alpha4i f23 + +#define a1 f24 +#define a2 f25 +#define a3 f26 +#define a4 f27 +#define a5 f28 +#define a6 f29 +#define a7 f30 +#define a8 f31 + +#define alpha_r f14 +#define alpha_i f15 + +#if defined(PPCG4) +#define PREFETCHSIZE_A 34 +#define PREFETCHSIZE_C 16 +#endif + +#if defined(PPC440) || defined(PPC440FP2) +#define PREFETCHSIZE_A 34 +#define PREFETCHSIZE_C 16 +#endif + +#ifdef PPC970 +#define PREFETCHSIZE_A 56 +#define PREFETCHSIZE_C 16 +#endif + +#ifdef CELL +#define PREFETCHSIZE_A 56 +#define PREFETCHSIZE_C 16 +#endif + +#ifdef POWER4 +#define PREFETCHSIZE_A 34 +#define PREFETCHSIZE_C 16 +#endif + +#ifdef POWER5 +#define PREFETCHSIZE_A 40 +#define PREFETCHSIZE_C 24 +#endif + +#ifdef POWER6 +#define PREFETCHSIZE_A 24 +#define PREFETCHSIZE_C 24 +#endif + +#ifndef XCONJ +#define FMADDR FMADD +#define FMSUBR FNMSUB +#else +#define FMADDR FNMSUB +#define FMSUBR FMADD +#endif + +#ifndef CONJ +#define FMADDX FMADD +#define FMSUBX FNMSUB +#else +#define FMADDX FNMSUB +#define FMSUBX FMADD +#endif + +#ifndef NEEDPARAM + +#ifndef __64BIT__ +#define STACKSIZE 224 +#define ALPHA_R 208(SP) +#define ALPHA_I 216(SP) +#else +#define STACKSIZE 280 +#define ALPHA_R 256(SP) +#define ALPHA_I 264(SP) +#endif + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r14, 144(SP) + std r15, 152(SP) + std r16, 160(SP) + std r17, 168(SP) + std r18, 176(SP) + std r19, 184(SP) + std r20, 192(SP) + std r21, 200(SP) + std r22, 208(SP) +#else + stw r14, 144(SP) + stw r15, 148(SP) + stw r16, 152(SP) + stw r17, 156(SP) + stw r18, 160(SP) + stw r19, 164(SP) + stw r20, 168(SP) + stw r21, 172(SP) + stw r22, 176(SP) +#endif + +#ifdef linux +#ifndef __64BIT__ + lwz INCY, 8 + STACKSIZE(SP) +#else + ld INCX, 112 + STACKSIZE(SP) + ld Y, 120 + STACKSIZE(SP) + ld INCY, 128 + STACKSIZE(SP) +#endif 
+#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifndef __64BIT__ +#ifdef DOUBLE + lwz LDA, 56 + STACKSIZE(SP) + lwz X, 60 + STACKSIZE(SP) + lwz INCX, 64 + STACKSIZE(SP) + lwz Y, 68 + STACKSIZE(SP) + lwz INCY, 72 + STACKSIZE(SP) +#else + lwz INCX, 56 + STACKSIZE(SP) + lwz Y, 60 + STACKSIZE(SP) + lwz INCY, 64 + STACKSIZE(SP) +#endif +#else + ld INCX, 112 + STACKSIZE(SP) + ld Y, 120 + STACKSIZE(SP) + ld INCY, 128 + STACKSIZE(SP) +#endif +#endif + + stfd f1, ALPHA_R + stfd f2, ALPHA_I + + slwi LDA4, LDA, ZBASE_SHIFT + 2 + slwi LDA, LDA, ZBASE_SHIFT + slwi INCX, INCX, ZBASE_SHIFT + slwi INCY, INCY, ZBASE_SHIFT + + li PREA, PREFETCHSIZE_A * SIZE + li PREC, PREFETCHSIZE_C * SIZE + + cmpwi cr0, M, 0 + ble- LL(999) + + cmpwi cr0, N, 0 + ble- LL(999) + + cmpi cr0, 0, INCY, 2 * SIZE + bne LL(100) + + srawi. J, N, 2 + ble LL(20) + .align 4 + +LL(11): + lfd alpha_r, ALPHA_R + lfd alpha_i, ALPHA_I + + LFD a1, 0 * SIZE(X) + LFD a2, 1 * SIZE(X) + add X, X, INCX + LFD a3, 0 * SIZE(X) + LFD a4, 1 * SIZE(X) + add X, X, INCX + LFD a5, 0 * SIZE(X) + LFD a6, 1 * SIZE(X) + add X, X, INCX + LFD a7, 0 * SIZE(X) + LFD a8, 1 * SIZE(X) + add X, X, INCX + + FMUL alpha1r, alpha_r, a1 + FMUL alpha1i, alpha_i, a1 + FMUL alpha2r, alpha_r, a3 + FMUL alpha2i, alpha_i, a3 + FMUL alpha3r, alpha_r, a5 + FMUL alpha3i, alpha_i, a5 + FMUL alpha4r, alpha_r, a7 + FMUL alpha4i, alpha_i, a7 + + FMSUBR alpha1r, alpha_i, a2, alpha1r + FMADDR alpha1i, alpha_r, a2, alpha1i + FMSUBR alpha2r, alpha_i, a4, alpha2r + FMADDR alpha2i, alpha_r, a4, alpha2i + FMSUBR alpha3r, alpha_i, a6, alpha3r + FMADDR alpha3i, alpha_r, a6, alpha3i + FMSUBR alpha4r, alpha_i, a8, alpha4r + FMADDR alpha4i, alpha_r, a8, alpha4i + + mr AO1, A + add AO2, A, LDA + add AO3, AO2, LDA + add AO4, AO3, LDA + add A, AO4, LDA + + mr Y1, Y + mr Y2, Y + + srawi. 
r0, M, 3 + mtspr CTR, r0 + ble LL(15) + .align 4 + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + LFD y03, 2 * SIZE(Y1) + LFD y04, 3 * SIZE(Y1) + + LFD a5, 4 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + LFD y05, 4 * SIZE(Y1) + LFD y06, 5 * SIZE(Y1) + LFD y07, 6 * SIZE(Y1) + LFD y08, 7 * SIZE(Y1) + LFD y09, 8 * SIZE(Y1) + LFD y10, 9 * SIZE(Y1) + LFD y11, 10 * SIZE(Y1) + LFD y12, 11 * SIZE(Y1) + LFD y13, 12 * SIZE(Y1) + LFD y14, 13 * SIZE(Y1) + LFD y15, 14 * SIZE(Y1) + LFD y16, 15 * SIZE(Y1) + addi Y1, Y1, 16 * SIZE + + bdz LL(13) + .align 4 + +LL(12): + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMADD y03, alpha1r, a3, y03 + FMADD y04, alpha1i, a3, y04 + + FMADD y05, alpha1r, a5, y05 + FMADD y06, alpha1i, a5, y06 + FMADD y07, alpha1r, a7, y07 + FMADD y08, alpha1i, a7, y08 + + LFD a1, 8 * SIZE(AO1) + LFD a3, 10 * SIZE(AO1) + LFD a5, 12 * SIZE(AO1) + LFD a7, 14 * SIZE(AO1) + + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + FMSUBX y03, alpha1i, a4, y03 + FMADDX y04, alpha1r, a4, y04 + + FMSUBX y05, alpha1i, a6, y05 + FMADDX y06, alpha1r, a6, y06 + FMSUBX y07, alpha1i, a8, y07 + FMADDX y08, alpha1r, a8, y08 + + LFD a2, 9 * SIZE(AO1) + LFD a4, 11 * SIZE(AO1) + LFD a6, 13 * SIZE(AO1) + LFD a8, 15 * SIZE(AO1) + + addi AO1, AO1, 16 * SIZE + nop + DCBT(AO1, PREA) + nop + + FMADD y09, alpha1r, a1, y09 + FMADD y10, alpha1i, a1, y10 + FMADD y11, alpha1r, a3, y11 + FMADD y12, alpha1i, a3, y12 + + FMADD y13, alpha1r, a5, y13 + FMADD y14, alpha1i, a5, y14 + FMADD y15, alpha1r, a7, y15 + FMADD y16, alpha1i, a7, y16 + + LFD a1, 0 * SIZE(AO2) + LFD a3, 2 * SIZE(AO2) + LFD a5, 4 * SIZE(AO2) + LFD a7, 6 * SIZE(AO2) + + FMSUBX y09, alpha1i, a2, y09 + FMADDX y10, alpha1r, a2, y10 + FMSUBX y11, alpha1i, a4, y11 + FMADDX y12, alpha1r, a4, y12 + + FMSUBX y13, alpha1i, a6, y13 + FMADDX y14, alpha1r, a6, y14 + FMSUBX y15, alpha1i, a8, y15 + FMADDX y16, alpha1r, a8, y16 + + LFD a2, 1 * SIZE(AO2) + LFD a4, 3 * SIZE(AO2) + LFD a6, 5 * SIZE(AO2) + LFD a8, 7 * SIZE(AO2) + + FMADD y01, alpha2r, a1, y01 + FMADD y02, alpha2i, a1, y02 + FMADD y03, alpha2r, a3, y03 + FMADD y04, alpha2i, a3, y04 + + FMADD y05, alpha2r, a5, y05 + FMADD y06, alpha2i, a5, y06 + FMADD y07, alpha2r, a7, y07 + FMADD y08, alpha2i, a7, y08 + + LFD a1, 8 * SIZE(AO2) + LFD a3, 10 * SIZE(AO2) + LFD a5, 12 * SIZE(AO2) + LFD a7, 14 * SIZE(AO2) + + FMSUBX y01, alpha2i, a2, y01 + FMADDX y02, alpha2r, a2, y02 + FMSUBX y03, alpha2i, a4, y03 + FMADDX y04, alpha2r, a4, y04 + + FMSUBX y05, alpha2i, a6, y05 + FMADDX y06, alpha2r, a6, y06 + FMSUBX y07, alpha2i, a8, y07 + FMADDX y08, alpha2r, a8, y08 + + LFD a2, 9 * SIZE(AO2) + LFD a4, 11 * SIZE(AO2) + LFD a6, 13 * SIZE(AO2) + LFD a8, 15 * SIZE(AO2) + + addi AO2, AO2, 16 * SIZE + nop + DCBT(AO2, PREA) + nop + + FMADD y09, alpha2r, a1, y09 + FMADD y10, alpha2i, a1, y10 + FMADD y11, alpha2r, a3, y11 + FMADD y12, alpha2i, a3, y12 + + FMADD y13, alpha2r, a5, y13 + FMADD y14, alpha2i, a5, y14 + FMADD y15, alpha2r, a7, y15 + FMADD y16, alpha2i, a7, y16 + + LFD a1, 0 * SIZE(AO3) + LFD a3, 2 * SIZE(AO3) + LFD a5, 4 * SIZE(AO3) + LFD a7, 6 * SIZE(AO3) + + FMSUBX y09, alpha2i, a2, y09 + FMADDX y10, alpha2r, a2, y10 + FMSUBX y11, alpha2i, a4, y11 + FMADDX y12, alpha2r, a4, y12 + + FMSUBX y13, alpha2i, a6, y13 + FMADDX y14, alpha2r, a6, y14 + FMSUBX y15, alpha2i, a8, y15 + FMADDX y16, alpha2r, a8, y16 + + LFD a2, 1 * SIZE(AO3) + LFD a4, 3 * SIZE(AO3) + LFD 
a6, 5 * SIZE(AO3) + LFD a8, 7 * SIZE(AO3) + + FMADD y01, alpha3r, a1, y01 + FMADD y02, alpha3i, a1, y02 + FMADD y03, alpha3r, a3, y03 + FMADD y04, alpha3i, a3, y04 + + FMADD y05, alpha3r, a5, y05 + FMADD y06, alpha3i, a5, y06 + FMADD y07, alpha3r, a7, y07 + FMADD y08, alpha3i, a7, y08 + + LFD a1, 8 * SIZE(AO3) + LFD a3, 10 * SIZE(AO3) + LFD a5, 12 * SIZE(AO3) + LFD a7, 14 * SIZE(AO3) + + FMSUBX y01, alpha3i, a2, y01 + FMADDX y02, alpha3r, a2, y02 + FMSUBX y03, alpha3i, a4, y03 + FMADDX y04, alpha3r, a4, y04 + + FMSUBX y05, alpha3i, a6, y05 + FMADDX y06, alpha3r, a6, y06 + FMSUBX y07, alpha3i, a8, y07 + FMADDX y08, alpha3r, a8, y08 + + LFD a2, 9 * SIZE(AO3) + LFD a4, 11 * SIZE(AO3) + LFD a6, 13 * SIZE(AO3) + LFD a8, 15 * SIZE(AO3) + + addi AO3, AO3, 16 * SIZE + nop + DCBT(AO3, PREA) + nop + + FMADD y09, alpha3r, a1, y09 + FMADD y10, alpha3i, a1, y10 + FMADD y11, alpha3r, a3, y11 + FMADD y12, alpha3i, a3, y12 + + FMADD y13, alpha3r, a5, y13 + FMADD y14, alpha3i, a5, y14 + FMADD y15, alpha3r, a7, y15 + FMADD y16, alpha3i, a7, y16 + + LFD a1, 0 * SIZE(AO4) + LFD a3, 2 * SIZE(AO4) + LFD a5, 4 * SIZE(AO4) + LFD a7, 6 * SIZE(AO4) + + FMSUBX y09, alpha3i, a2, y09 + FMADDX y10, alpha3r, a2, y10 + FMSUBX y11, alpha3i, a4, y11 + FMADDX y12, alpha3r, a4, y12 + + FMSUBX y13, alpha3i, a6, y13 + FMADDX y14, alpha3r, a6, y14 + FMSUBX y15, alpha3i, a8, y15 + FMADDX y16, alpha3r, a8, y16 + + LFD a2, 1 * SIZE(AO4) + LFD a4, 3 * SIZE(AO4) + LFD a6, 5 * SIZE(AO4) + LFD a8, 7 * SIZE(AO4) + + FMADD y01, alpha4r, a1, y01 + FMADD y02, alpha4i, a1, y02 + FMADD y03, alpha4r, a3, y03 + FMADD y04, alpha4i, a3, y04 + + FMADD y05, alpha4r, a5, y05 + FMADD y06, alpha4i, a5, y06 + FMADD y07, alpha4r, a7, y07 + FMADD y08, alpha4i, a7, y08 + + LFD a1, 8 * SIZE(AO4) + LFD a3, 10 * SIZE(AO4) + LFD a5, 12 * SIZE(AO4) + LFD a7, 14 * SIZE(AO4) + + FMSUBX y01, alpha4i, a2, y01 + FMADDX y02, alpha4r, a2, y02 + FMSUBX y03, alpha4i, a4, y03 + FMADDX y04, alpha4r, a4, y04 + + STFD y01, 0 * SIZE(Y2) + STFD y02, 1 * SIZE(Y2) + STFD y03, 2 * SIZE(Y2) + STFD y04, 3 * SIZE(Y2) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + LFD y03, 2 * SIZE(Y1) + LFD y04, 3 * SIZE(Y1) + + FMSUBX y05, alpha4i, a6, y05 + FMADDX y06, alpha4r, a6, y06 + FMSUBX y07, alpha4i, a8, y07 + FMADDX y08, alpha4r, a8, y08 + + LFD a2, 9 * SIZE(AO4) + LFD a4, 11 * SIZE(AO4) + LFD a6, 13 * SIZE(AO4) + LFD a8, 15 * SIZE(AO4) + + addi AO4, AO4, 16 * SIZE + nop + DCBT(AO4, PREA) + nop + + STFD y05, 4 * SIZE(Y2) + STFD y06, 5 * SIZE(Y2) + STFD y07, 6 * SIZE(Y2) + STFD y08, 7 * SIZE(Y2) + + LFD y05, 4 * SIZE(Y1) + LFD y06, 5 * SIZE(Y1) + LFD y07, 6 * SIZE(Y1) + LFD y08, 7 * SIZE(Y1) + + FMADD y09, alpha4r, a1, y09 + FMADD y10, alpha4i, a1, y10 + FMADD y11, alpha4r, a3, y11 + FMADD y12, alpha4i, a3, y12 + + FMADD y13, alpha4r, a5, y13 + FMADD y14, alpha4i, a5, y14 + FMADD y15, alpha4r, a7, y15 + FMADD y16, alpha4i, a7, y16 + + LFD a1, 0 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + + FMSUBX y09, alpha4i, a2, y09 + FMADDX y10, alpha4r, a2, y10 + FMSUBX y11, alpha4i, a4, y11 + FMADDX y12, alpha4r, a4, y12 + + STFD y09, 8 * SIZE(Y2) + STFD y10, 9 * SIZE(Y2) + STFD y11, 10 * SIZE(Y2) + STFD y12, 11 * SIZE(Y2) + + LFD y09, 8 * SIZE(Y1) + LFD y10, 9 * SIZE(Y1) + LFD y11, 10 * SIZE(Y1) + LFD y12, 11 * SIZE(Y1) + + FMSUBX y13, alpha4i, a6, y13 + FMADDX y14, alpha4r, a6, y14 + FMSUBX y15, alpha4i, a8, y15 + FMADDX y16, alpha4r, a8, y16 + + LFD a2, 1 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + STFD 
y13, 12 * SIZE(Y2) + STFD y14, 13 * SIZE(Y2) + STFD y15, 14 * SIZE(Y2) + STFD y16, 15 * SIZE(Y2) + + LFD y13, 12 * SIZE(Y1) + LFD y14, 13 * SIZE(Y1) + LFD y15, 14 * SIZE(Y1) + LFD y16, 15 * SIZE(Y1) + + addi Y2, Y2, 16 * SIZE + addi Y1, Y1, 16 * SIZE + DCBT(Y1, PREC) + bdnz LL(12) + .align 4 + +LL(13): + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMADD y03, alpha1r, a3, y03 + FMADD y04, alpha1i, a3, y04 + + FMADD y05, alpha1r, a5, y05 + FMADD y06, alpha1i, a5, y06 + FMADD y07, alpha1r, a7, y07 + FMADD y08, alpha1i, a7, y08 + + LFD a1, 8 * SIZE(AO1) + LFD a3, 10 * SIZE(AO1) + LFD a5, 12 * SIZE(AO1) + LFD a7, 14 * SIZE(AO1) + + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + FMSUBX y03, alpha1i, a4, y03 + FMADDX y04, alpha1r, a4, y04 + + FMSUBX y05, alpha1i, a6, y05 + FMADDX y06, alpha1r, a6, y06 + FMSUBX y07, alpha1i, a8, y07 + FMADDX y08, alpha1r, a8, y08 + + LFD a2, 9 * SIZE(AO1) + LFD a4, 11 * SIZE(AO1) + LFD a6, 13 * SIZE(AO1) + LFD a8, 15 * SIZE(AO1) + + FMADD y09, alpha1r, a1, y09 + FMADD y10, alpha1i, a1, y10 + FMADD y11, alpha1r, a3, y11 + FMADD y12, alpha1i, a3, y12 + + FMADD y13, alpha1r, a5, y13 + FMADD y14, alpha1i, a5, y14 + FMADD y15, alpha1r, a7, y15 + FMADD y16, alpha1i, a7, y16 + + LFD a1, 0 * SIZE(AO2) + LFD a3, 2 * SIZE(AO2) + LFD a5, 4 * SIZE(AO2) + LFD a7, 6 * SIZE(AO2) + + FMSUBX y09, alpha1i, a2, y09 + FMADDX y10, alpha1r, a2, y10 + FMSUBX y11, alpha1i, a4, y11 + FMADDX y12, alpha1r, a4, y12 + + FMSUBX y13, alpha1i, a6, y13 + FMADDX y14, alpha1r, a6, y14 + FMSUBX y15, alpha1i, a8, y15 + FMADDX y16, alpha1r, a8, y16 + + LFD a2, 1 * SIZE(AO2) + LFD a4, 3 * SIZE(AO2) + LFD a6, 5 * SIZE(AO2) + LFD a8, 7 * SIZE(AO2) + + FMADD y01, alpha2r, a1, y01 + FMADD y02, alpha2i, a1, y02 + FMADD y03, alpha2r, a3, y03 + FMADD y04, alpha2i, a3, y04 + + FMADD y05, alpha2r, a5, y05 + FMADD y06, alpha2i, a5, y06 + FMADD y07, alpha2r, a7, y07 + FMADD y08, alpha2i, a7, y08 + + LFD a1, 8 * SIZE(AO2) + LFD a3, 10 * SIZE(AO2) + LFD a5, 12 * SIZE(AO2) + LFD a7, 14 * SIZE(AO2) + + FMSUBX y01, alpha2i, a2, y01 + FMADDX y02, alpha2r, a2, y02 + FMSUBX y03, alpha2i, a4, y03 + FMADDX y04, alpha2r, a4, y04 + + FMSUBX y05, alpha2i, a6, y05 + FMADDX y06, alpha2r, a6, y06 + FMSUBX y07, alpha2i, a8, y07 + FMADDX y08, alpha2r, a8, y08 + + LFD a2, 9 * SIZE(AO2) + LFD a4, 11 * SIZE(AO2) + LFD a6, 13 * SIZE(AO2) + LFD a8, 15 * SIZE(AO2) + + FMADD y09, alpha2r, a1, y09 + FMADD y10, alpha2i, a1, y10 + FMADD y11, alpha2r, a3, y11 + FMADD y12, alpha2i, a3, y12 + + FMADD y13, alpha2r, a5, y13 + FMADD y14, alpha2i, a5, y14 + FMADD y15, alpha2r, a7, y15 + FMADD y16, alpha2i, a7, y16 + + LFD a1, 0 * SIZE(AO3) + LFD a3, 2 * SIZE(AO3) + LFD a5, 4 * SIZE(AO3) + LFD a7, 6 * SIZE(AO3) + + FMSUBX y09, alpha2i, a2, y09 + FMADDX y10, alpha2r, a2, y10 + FMSUBX y11, alpha2i, a4, y11 + FMADDX y12, alpha2r, a4, y12 + + FMSUBX y13, alpha2i, a6, y13 + FMADDX y14, alpha2r, a6, y14 + FMSUBX y15, alpha2i, a8, y15 + FMADDX y16, alpha2r, a8, y16 + + LFD a2, 1 * SIZE(AO3) + LFD a4, 3 * SIZE(AO3) + LFD a6, 5 * SIZE(AO3) + LFD a8, 7 * SIZE(AO3) + + FMADD y01, alpha3r, a1, y01 + FMADD y02, alpha3i, a1, y02 + FMADD y03, alpha3r, a3, y03 + FMADD y04, alpha3i, a3, y04 + + FMADD y05, alpha3r, a5, y05 + FMADD y06, alpha3i, a5, y06 + FMADD y07, alpha3r, a7, y07 + FMADD y08, alpha3i, a7, y08 + + LFD a1, 8 * SIZE(AO3) + LFD a3, 10 * SIZE(AO3) + LFD a5, 12 * SIZE(AO3) + LFD a7, 14 * SIZE(AO3) + + FMSUBX y01, alpha3i, a2, y01 + FMADDX y02, alpha3r, a2, y02 + FMSUBX y03, alpha3i, a4, y03 + FMADDX y04, alpha3r, a4, y04 + 
+ FMSUBX y05, alpha3i, a6, y05 + FMADDX y06, alpha3r, a6, y06 + FMSUBX y07, alpha3i, a8, y07 + FMADDX y08, alpha3r, a8, y08 + + LFD a2, 9 * SIZE(AO3) + LFD a4, 11 * SIZE(AO3) + LFD a6, 13 * SIZE(AO3) + LFD a8, 15 * SIZE(AO3) + + FMADD y09, alpha3r, a1, y09 + FMADD y10, alpha3i, a1, y10 + FMADD y11, alpha3r, a3, y11 + FMADD y12, alpha3i, a3, y12 + + FMADD y13, alpha3r, a5, y13 + FMADD y14, alpha3i, a5, y14 + FMADD y15, alpha3r, a7, y15 + FMADD y16, alpha3i, a7, y16 + + LFD a1, 0 * SIZE(AO4) + LFD a3, 2 * SIZE(AO4) + LFD a5, 4 * SIZE(AO4) + LFD a7, 6 * SIZE(AO4) + + FMSUBX y09, alpha3i, a2, y09 + FMADDX y10, alpha3r, a2, y10 + FMSUBX y11, alpha3i, a4, y11 + FMADDX y12, alpha3r, a4, y12 + + FMSUBX y13, alpha3i, a6, y13 + FMADDX y14, alpha3r, a6, y14 + FMSUBX y15, alpha3i, a8, y15 + FMADDX y16, alpha3r, a8, y16 + + LFD a2, 1 * SIZE(AO4) + LFD a4, 3 * SIZE(AO4) + LFD a6, 5 * SIZE(AO4) + LFD a8, 7 * SIZE(AO4) + + FMADD y01, alpha4r, a1, y01 + FMADD y02, alpha4i, a1, y02 + FMADD y03, alpha4r, a3, y03 + FMADD y04, alpha4i, a3, y04 + + FMADD y05, alpha4r, a5, y05 + FMADD y06, alpha4i, a5, y06 + FMADD y07, alpha4r, a7, y07 + FMADD y08, alpha4i, a7, y08 + + LFD a1, 8 * SIZE(AO4) + LFD a3, 10 * SIZE(AO4) + LFD a5, 12 * SIZE(AO4) + LFD a7, 14 * SIZE(AO4) + + FMSUBX y01, alpha4i, a2, y01 + FMADDX y02, alpha4r, a2, y02 + FMSUBX y03, alpha4i, a4, y03 + FMADDX y04, alpha4r, a4, y04 + + FMSUBX y05, alpha4i, a6, y05 + FMADDX y06, alpha4r, a6, y06 + FMSUBX y07, alpha4i, a8, y07 + FMADDX y08, alpha4r, a8, y08 + + LFD a2, 9 * SIZE(AO4) + LFD a4, 11 * SIZE(AO4) + LFD a6, 13 * SIZE(AO4) + LFD a8, 15 * SIZE(AO4) + + FMADD y09, alpha4r, a1, y09 + FMADD y10, alpha4i, a1, y10 + FMADD y11, alpha4r, a3, y11 + FMADD y12, alpha4i, a3, y12 + + FMADD y13, alpha4r, a5, y13 + FMADD y14, alpha4i, a5, y14 + FMADD y15, alpha4r, a7, y15 + FMADD y16, alpha4i, a7, y16 + + LFD a1, 16 * SIZE(AO1) + LFD a3, 18 * SIZE(AO1) + LFD a5, 20 * SIZE(AO1) + LFD a7, 22 * SIZE(AO1) + + FMSUBX y09, alpha4i, a2, y09 + FMADDX y10, alpha4r, a2, y10 + FMSUBX y11, alpha4i, a4, y11 + FMADDX y12, alpha4r, a4, y12 + + FMSUBX y13, alpha4i, a6, y13 + FMADDX y14, alpha4r, a6, y14 + FMSUBX y15, alpha4i, a8, y15 + FMADDX y16, alpha4r, a8, y16 + + LFD a2, 17 * SIZE(AO1) + LFD a4, 19 * SIZE(AO1) + LFD a6, 21 * SIZE(AO1) + LFD a8, 23 * SIZE(AO1) + + addi AO1, AO1, 16 * SIZE + addi AO2, AO2, 16 * SIZE + addi AO3, AO3, 16 * SIZE + addi AO4, AO4, 16 * SIZE + + STFD y01, 0 * SIZE(Y2) + STFD y02, 1 * SIZE(Y2) + STFD y03, 2 * SIZE(Y2) + STFD y04, 3 * SIZE(Y2) + STFD y05, 4 * SIZE(Y2) + STFD y06, 5 * SIZE(Y2) + STFD y07, 6 * SIZE(Y2) + STFD y08, 7 * SIZE(Y2) + STFD y09, 8 * SIZE(Y2) + STFD y10, 9 * SIZE(Y2) + STFD y11, 10 * SIZE(Y2) + STFD y12, 11 * SIZE(Y2) + STFD y13, 12 * SIZE(Y2) + STFD y14, 13 * SIZE(Y2) + STFD y15, 14 * SIZE(Y2) + STFD y16, 15 * SIZE(Y2) + addi Y2, Y2, 16 * SIZE + .align 4 + +LL(15): + andi. r0, M, 7 + ble LL(19) + andi. 
r0, M, 4 + ble LL(16) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + LFD y03, 2 * SIZE(Y1) + LFD y04, 3 * SIZE(Y1) + + LFD a1, 0 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + + LFD y05, 4 * SIZE(Y1) + LFD y06, 5 * SIZE(Y1) + LFD y07, 6 * SIZE(Y1) + LFD y08, 7 * SIZE(Y1) + + LFD a2, 1 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMADD y03, alpha1r, a3, y03 + FMADD y04, alpha1i, a3, y04 + + FMADD y05, alpha1r, a5, y05 + FMADD y06, alpha1i, a5, y06 + FMADD y07, alpha1r, a7, y07 + FMADD y08, alpha1i, a7, y08 + + LFD a1, 0 * SIZE(AO2) + LFD a3, 2 * SIZE(AO2) + LFD a5, 4 * SIZE(AO2) + LFD a7, 6 * SIZE(AO2) + + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + FMSUBX y03, alpha1i, a4, y03 + FMADDX y04, alpha1r, a4, y04 + + FMSUBX y05, alpha1i, a6, y05 + FMADDX y06, alpha1r, a6, y06 + FMSUBX y07, alpha1i, a8, y07 + FMADDX y08, alpha1r, a8, y08 + + LFD a2, 1 * SIZE(AO2) + LFD a4, 3 * SIZE(AO2) + LFD a6, 5 * SIZE(AO2) + LFD a8, 7 * SIZE(AO2) + + FMADD y01, alpha2r, a1, y01 + FMADD y02, alpha2i, a1, y02 + FMADD y03, alpha2r, a3, y03 + FMADD y04, alpha2i, a3, y04 + + FMADD y05, alpha2r, a5, y05 + FMADD y06, alpha2i, a5, y06 + FMADD y07, alpha2r, a7, y07 + FMADD y08, alpha2i, a7, y08 + + LFD a1, 0 * SIZE(AO3) + LFD a3, 2 * SIZE(AO3) + LFD a5, 4 * SIZE(AO3) + LFD a7, 6 * SIZE(AO3) + + FMSUBX y01, alpha2i, a2, y01 + FMADDX y02, alpha2r, a2, y02 + FMSUBX y03, alpha2i, a4, y03 + FMADDX y04, alpha2r, a4, y04 + + FMSUBX y05, alpha2i, a6, y05 + FMADDX y06, alpha2r, a6, y06 + FMSUBX y07, alpha2i, a8, y07 + FMADDX y08, alpha2r, a8, y08 + + LFD a2, 1 * SIZE(AO3) + LFD a4, 3 * SIZE(AO3) + LFD a6, 5 * SIZE(AO3) + LFD a8, 7 * SIZE(AO3) + + FMADD y01, alpha3r, a1, y01 + FMADD y02, alpha3i, a1, y02 + FMADD y03, alpha3r, a3, y03 + FMADD y04, alpha3i, a3, y04 + + FMADD y05, alpha3r, a5, y05 + FMADD y06, alpha3i, a5, y06 + FMADD y07, alpha3r, a7, y07 + FMADD y08, alpha3i, a7, y08 + + LFD a1, 0 * SIZE(AO4) + LFD a3, 2 * SIZE(AO4) + LFD a5, 4 * SIZE(AO4) + LFD a7, 6 * SIZE(AO4) + + FMSUBX y01, alpha3i, a2, y01 + FMADDX y02, alpha3r, a2, y02 + FMSUBX y03, alpha3i, a4, y03 + FMADDX y04, alpha3r, a4, y04 + + FMSUBX y05, alpha3i, a6, y05 + FMADDX y06, alpha3r, a6, y06 + FMSUBX y07, alpha3i, a8, y07 + FMADDX y08, alpha3r, a8, y08 + + LFD a2, 1 * SIZE(AO4) + LFD a4, 3 * SIZE(AO4) + LFD a6, 5 * SIZE(AO4) + LFD a8, 7 * SIZE(AO4) + + FMADD y01, alpha4r, a1, y01 + FMADD y02, alpha4i, a1, y02 + FMADD y03, alpha4r, a3, y03 + FMADD y04, alpha4i, a3, y04 + + FMADD y05, alpha4r, a5, y05 + FMADD y06, alpha4i, a5, y06 + FMADD y07, alpha4r, a7, y07 + FMADD y08, alpha4i, a7, y08 + + FMSUBX y01, alpha4i, a2, y01 + FMADDX y02, alpha4r, a2, y02 + FMSUBX y03, alpha4i, a4, y03 + FMADDX y04, alpha4r, a4, y04 + + STFD y01, 0 * SIZE(Y2) + STFD y02, 1 * SIZE(Y2) + STFD y03, 2 * SIZE(Y2) + STFD y04, 3 * SIZE(Y2) + + FMSUBX y05, alpha4i, a6, y05 + FMADDX y06, alpha4r, a6, y06 + FMSUBX y07, alpha4i, a8, y07 + FMADDX y08, alpha4r, a8, y08 + + STFD y05, 4 * SIZE(Y2) + STFD y06, 5 * SIZE(Y2) + STFD y07, 6 * SIZE(Y2) + STFD y08, 7 * SIZE(Y2) + + addi AO1, AO1, 8 * SIZE + addi AO2, AO2, 8 * SIZE + addi AO3, AO3, 8 * SIZE + addi AO4, AO4, 8 * SIZE + + addi Y1, Y1, 8 * SIZE + addi Y2, Y2, 8 * SIZE + .align 4 + +LL(16): + andi. 
r0, M, 2 + nop + nop + ble LL(17) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + LFD y03, 2 * SIZE(Y1) + LFD y04, 3 * SIZE(Y1) + + LFD a5, 0 * SIZE(AO2) + LFD a6, 1 * SIZE(AO2) + LFD a7, 2 * SIZE(AO2) + LFD a8, 3 * SIZE(AO2) + + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMADD y03, alpha1r, a3, y03 + FMADD y04, alpha1i, a3, y04 + + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + FMSUBX y03, alpha1i, a4, y03 + FMADDX y04, alpha1r, a4, y04 + + LFD a1, 0 * SIZE(AO3) + LFD a2, 1 * SIZE(AO3) + LFD a3, 2 * SIZE(AO3) + LFD a4, 3 * SIZE(AO3) + + FMADD y01, alpha2r, a5, y01 + FMADD y02, alpha2i, a5, y02 + FMADD y03, alpha2r, a7, y03 + FMADD y04, alpha2i, a7, y04 + + FMSUBX y01, alpha2i, a6, y01 + FMADDX y02, alpha2r, a6, y02 + FMSUBX y03, alpha2i, a8, y03 + FMADDX y04, alpha2r, a8, y04 + + LFD a5, 0 * SIZE(AO4) + LFD a6, 1 * SIZE(AO4) + LFD a7, 2 * SIZE(AO4) + LFD a8, 3 * SIZE(AO4) + + FMADD y01, alpha3r, a1, y01 + FMADD y02, alpha3i, a1, y02 + FMADD y03, alpha3r, a3, y03 + FMADD y04, alpha3i, a3, y04 + + FMSUBX y01, alpha3i, a2, y01 + FMADDX y02, alpha3r, a2, y02 + FMSUBX y03, alpha3i, a4, y03 + FMADDX y04, alpha3r, a4, y04 + + FMADD y01, alpha4r, a5, y01 + FMADD y02, alpha4i, a5, y02 + FMADD y03, alpha4r, a7, y03 + FMADD y04, alpha4i, a7, y04 + + FMSUBX y01, alpha4i, a6, y01 + FMADDX y02, alpha4r, a6, y02 + FMSUBX y03, alpha4i, a8, y03 + FMADDX y04, alpha4r, a8, y04 + + STFD y01, 0 * SIZE(Y2) + STFD y02, 1 * SIZE(Y2) + STFD y03, 2 * SIZE(Y2) + STFD y04, 3 * SIZE(Y2) + + addi AO1, AO1, 4 * SIZE + addi AO2, AO2, 4 * SIZE + addi AO3, AO3, 4 * SIZE + addi AO4, AO4, 4 * SIZE + + addi Y1, Y1, 4 * SIZE + addi Y2, Y2, 4 * SIZE + .align 4 + +LL(17): + andi. r0, M, 1 + ble LL(19) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 0 * SIZE(AO2) + LFD a4, 1 * SIZE(AO2) + + LFD a5, 0 * SIZE(AO3) + LFD a6, 1 * SIZE(AO3) + LFD a7, 0 * SIZE(AO4) + LFD a8, 1 * SIZE(AO4) + + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + + FMADD y01, alpha2r, a3, y01 + FMADD y02, alpha2i, a3, y02 + FMSUBX y01, alpha2i, a4, y01 + FMADDX y02, alpha2r, a4, y02 + + FMADD y01, alpha3r, a5, y01 + FMADD y02, alpha3i, a5, y02 + FMSUBX y01, alpha3i, a6, y01 + FMADDX y02, alpha3r, a6, y02 + + FMADD y01, alpha4r, a7, y01 + FMADD y02, alpha4i, a7, y02 + FMSUBX y01, alpha4i, a8, y01 + FMADDX y02, alpha4r, a8, y02 + + STFD y01, 0 * SIZE(Y2) + STFD y02, 1 * SIZE(Y2) + + add Y1, Y1, INCY + add Y2, Y2, INCY + .align 4 + +LL(19): + addi J, J, -1 + cmpi cr0, 0, J, 0 + bgt LL(11) + .align 4 + +LL(20): + andi. J, N, 2 + ble LL(30) + .align 4 + +LL(21): + lfd alpha_r, ALPHA_R + lfd alpha_i, ALPHA_I + + LFD a1, 0 * SIZE(X) + LFD a2, 1 * SIZE(X) + add X, X, INCX + LFD a3, 0 * SIZE(X) + LFD a4, 1 * SIZE(X) + add X, X, INCX + + FMUL alpha1r, alpha_r, a1 + FMUL alpha1i, alpha_i, a1 + FMUL alpha2r, alpha_r, a3 + FMUL alpha2i, alpha_i, a3 + + FMSUBR alpha1r, alpha_i, a2, alpha1r + FMADDR alpha1i, alpha_r, a2, alpha1i + FMSUBR alpha2r, alpha_i, a4, alpha2r + FMADDR alpha2i, alpha_r, a4, alpha2i + + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + mr Y1, Y + mr Y2, Y + + srawi. 
r0, M, 3 + mtspr CTR, r0 + ble LL(25) + .align 4 + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + LFD y03, 2 * SIZE(Y1) + LFD y04, 3 * SIZE(Y1) + + LFD a5, 4 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + LFD y05, 4 * SIZE(Y1) + LFD y06, 5 * SIZE(Y1) + LFD y07, 6 * SIZE(Y1) + LFD y08, 7 * SIZE(Y1) + LFD y09, 8 * SIZE(Y1) + LFD y10, 9 * SIZE(Y1) + LFD y11, 10 * SIZE(Y1) + LFD y12, 11 * SIZE(Y1) + LFD y13, 12 * SIZE(Y1) + LFD y14, 13 * SIZE(Y1) + LFD y15, 14 * SIZE(Y1) + LFD y16, 15 * SIZE(Y1) + addi Y1, Y1, 16 * SIZE + bdz LL(23) + .align 4 + +LL(22): + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMADD y03, alpha1r, a3, y03 + FMADD y04, alpha1i, a3, y04 + + FMADD y05, alpha1r, a5, y05 + FMADD y06, alpha1i, a5, y06 + FMADD y07, alpha1r, a7, y07 + FMADD y08, alpha1i, a7, y08 + + LFD a1, 8 * SIZE(AO1) + LFD a3, 10 * SIZE(AO1) + LFD a5, 12 * SIZE(AO1) + LFD a7, 14 * SIZE(AO1) + + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + FMSUBX y03, alpha1i, a4, y03 + FMADDX y04, alpha1r, a4, y04 + + FMSUBX y05, alpha1i, a6, y05 + FMADDX y06, alpha1r, a6, y06 + FMSUBX y07, alpha1i, a8, y07 + FMADDX y08, alpha1r, a8, y08 + + LFD a2, 9 * SIZE(AO1) + LFD a4, 11 * SIZE(AO1) + LFD a6, 13 * SIZE(AO1) + LFD a8, 15 * SIZE(AO1) + + addi AO1, AO1, 16 * SIZE + nop + DCBT(AO1, PREA) + nop + + FMADD y09, alpha1r, a1, y09 + FMADD y10, alpha1i, a1, y10 + FMADD y11, alpha1r, a3, y11 + FMADD y12, alpha1i, a3, y12 + + FMADD y13, alpha1r, a5, y13 + FMADD y14, alpha1i, a5, y14 + FMADD y15, alpha1r, a7, y15 + FMADD y16, alpha1i, a7, y16 + + LFD a1, 0 * SIZE(AO2) + LFD a3, 2 * SIZE(AO2) + LFD a5, 4 * SIZE(AO2) + LFD a7, 6 * SIZE(AO2) + + FMSUBX y09, alpha1i, a2, y09 + FMADDX y10, alpha1r, a2, y10 + FMSUBX y11, alpha1i, a4, y11 + FMADDX y12, alpha1r, a4, y12 + + FMSUBX y13, alpha1i, a6, y13 + FMADDX y14, alpha1r, a6, y14 + FMSUBX y15, alpha1i, a8, y15 + FMADDX y16, alpha1r, a8, y16 + + LFD a2, 1 * SIZE(AO2) + LFD a4, 3 * SIZE(AO2) + LFD a6, 5 * SIZE(AO2) + LFD a8, 7 * SIZE(AO2) + + FMADD y01, alpha2r, a1, y01 + FMADD y02, alpha2i, a1, y02 + FMADD y03, alpha2r, a3, y03 + FMADD y04, alpha2i, a3, y04 + + FMADD y05, alpha2r, a5, y05 + FMADD y06, alpha2i, a5, y06 + FMADD y07, alpha2r, a7, y07 + FMADD y08, alpha2i, a7, y08 + + LFD a1, 8 * SIZE(AO2) + LFD a3, 10 * SIZE(AO2) + LFD a5, 12 * SIZE(AO2) + LFD a7, 14 * SIZE(AO2) + + FMSUBX y01, alpha2i, a2, y01 + FMADDX y02, alpha2r, a2, y02 + FMSUBX y03, alpha2i, a4, y03 + FMADDX y04, alpha2r, a4, y04 + + STFD y01, 0 * SIZE(Y2) + STFD y02, 1 * SIZE(Y2) + STFD y03, 2 * SIZE(Y2) + STFD y04, 3 * SIZE(Y2) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + LFD y03, 2 * SIZE(Y1) + LFD y04, 3 * SIZE(Y1) + + FMSUBX y05, alpha2i, a6, y05 + FMADDX y06, alpha2r, a6, y06 + FMSUBX y07, alpha2i, a8, y07 + FMADDX y08, alpha2r, a8, y08 + + LFD a2, 9 * SIZE(AO2) + LFD a4, 11 * SIZE(AO2) + LFD a6, 13 * SIZE(AO2) + LFD a8, 15 * SIZE(AO2) + + STFD y05, 4 * SIZE(Y2) + STFD y06, 5 * SIZE(Y2) + STFD y07, 6 * SIZE(Y2) + STFD y08, 7 * SIZE(Y2) + + LFD y05, 4 * SIZE(Y1) + LFD y06, 5 * SIZE(Y1) + LFD y07, 6 * SIZE(Y1) + LFD y08, 7 * SIZE(Y1) + + addi AO2, AO2, 16 * SIZE + nop + DCBT(AO2, PREA) + nop + + FMADD y09, alpha2r, a1, y09 + FMADD y10, alpha2i, a1, y10 + FMADD y11, alpha2r, a3, y11 + FMADD y12, alpha2i, a3, y12 + + FMADD y13, alpha2r, a5, y13 + FMADD y14, alpha2i, a5, y14 + FMADD y15, alpha2r, a7, y15 + FMADD y16, alpha2i, a7, y16 + + LFD 
a1, 0 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + + FMSUBX y09, alpha2i, a2, y09 + FMADDX y10, alpha2r, a2, y10 + FMSUBX y11, alpha2i, a4, y11 + FMADDX y12, alpha2r, a4, y12 + + STFD y09, 8 * SIZE(Y2) + STFD y10, 9 * SIZE(Y2) + STFD y11, 10 * SIZE(Y2) + STFD y12, 11 * SIZE(Y2) + + LFD y09, 8 * SIZE(Y1) + LFD y10, 9 * SIZE(Y1) + LFD y11, 10 * SIZE(Y1) + LFD y12, 11 * SIZE(Y1) + + FMSUBX y13, alpha2i, a6, y13 + FMADDX y14, alpha2r, a6, y14 + FMSUBX y15, alpha2i, a8, y15 + FMADDX y16, alpha2r, a8, y16 + + LFD a2, 1 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + STFD y13, 12 * SIZE(Y2) + STFD y14, 13 * SIZE(Y2) + STFD y15, 14 * SIZE(Y2) + STFD y16, 15 * SIZE(Y2) + + LFD y13, 12 * SIZE(Y1) + LFD y14, 13 * SIZE(Y1) + LFD y15, 14 * SIZE(Y1) + LFD y16, 15 * SIZE(Y1) + + addi Y2, Y2, 16 * SIZE + addi Y1, Y1, 16 * SIZE + DCBT(Y1, PREC) + bdnz LL(22) + .align 4 + +LL(23): + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMADD y03, alpha1r, a3, y03 + FMADD y04, alpha1i, a3, y04 + + FMADD y05, alpha1r, a5, y05 + FMADD y06, alpha1i, a5, y06 + FMADD y07, alpha1r, a7, y07 + FMADD y08, alpha1i, a7, y08 + + LFD a1, 8 * SIZE(AO1) + LFD a3, 10 * SIZE(AO1) + LFD a5, 12 * SIZE(AO1) + LFD a7, 14 * SIZE(AO1) + + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + FMSUBX y03, alpha1i, a4, y03 + FMADDX y04, alpha1r, a4, y04 + + FMSUBX y05, alpha1i, a6, y05 + FMADDX y06, alpha1r, a6, y06 + FMSUBX y07, alpha1i, a8, y07 + FMADDX y08, alpha1r, a8, y08 + + LFD a2, 9 * SIZE(AO1) + LFD a4, 11 * SIZE(AO1) + LFD a6, 13 * SIZE(AO1) + LFD a8, 15 * SIZE(AO1) + + FMADD y09, alpha1r, a1, y09 + FMADD y10, alpha1i, a1, y10 + FMADD y11, alpha1r, a3, y11 + FMADD y12, alpha1i, a3, y12 + + FMADD y13, alpha1r, a5, y13 + FMADD y14, alpha1i, a5, y14 + FMADD y15, alpha1r, a7, y15 + FMADD y16, alpha1i, a7, y16 + + LFD a1, 0 * SIZE(AO2) + LFD a3, 2 * SIZE(AO2) + LFD a5, 4 * SIZE(AO2) + LFD a7, 6 * SIZE(AO2) + + FMSUBX y09, alpha1i, a2, y09 + FMADDX y10, alpha1r, a2, y10 + FMSUBX y11, alpha1i, a4, y11 + FMADDX y12, alpha1r, a4, y12 + + FMSUBX y13, alpha1i, a6, y13 + FMADDX y14, alpha1r, a6, y14 + FMSUBX y15, alpha1i, a8, y15 + FMADDX y16, alpha1r, a8, y16 + + LFD a2, 1 * SIZE(AO2) + LFD a4, 3 * SIZE(AO2) + LFD a6, 5 * SIZE(AO2) + LFD a8, 7 * SIZE(AO2) + + FMADD y01, alpha2r, a1, y01 + FMADD y02, alpha2i, a1, y02 + FMADD y03, alpha2r, a3, y03 + FMADD y04, alpha2i, a3, y04 + + FMADD y05, alpha2r, a5, y05 + FMADD y06, alpha2i, a5, y06 + FMADD y07, alpha2r, a7, y07 + FMADD y08, alpha2i, a7, y08 + + LFD a1, 8 * SIZE(AO2) + LFD a3, 10 * SIZE(AO2) + LFD a5, 12 * SIZE(AO2) + LFD a7, 14 * SIZE(AO2) + + FMSUBX y01, alpha2i, a2, y01 + FMADDX y02, alpha2r, a2, y02 + FMSUBX y03, alpha2i, a4, y03 + FMADDX y04, alpha2r, a4, y04 + + STFD y01, 0 * SIZE(Y2) + STFD y02, 1 * SIZE(Y2) + STFD y03, 2 * SIZE(Y2) + STFD y04, 3 * SIZE(Y2) + + FMSUBX y05, alpha2i, a6, y05 + FMADDX y06, alpha2r, a6, y06 + FMSUBX y07, alpha2i, a8, y07 + FMADDX y08, alpha2r, a8, y08 + + LFD a2, 9 * SIZE(AO2) + LFD a4, 11 * SIZE(AO2) + LFD a6, 13 * SIZE(AO2) + LFD a8, 15 * SIZE(AO2) + + STFD y05, 4 * SIZE(Y2) + STFD y06, 5 * SIZE(Y2) + STFD y07, 6 * SIZE(Y2) + STFD y08, 7 * SIZE(Y2) + + FMADD y09, alpha2r, a1, y09 + FMADD y10, alpha2i, a1, y10 + FMADD y11, alpha2r, a3, y11 + FMADD y12, alpha2i, a3, y12 + + FMADD y13, alpha2r, a5, y13 + FMADD y14, alpha2i, a5, y14 + FMADD y15, alpha2r, a7, y15 + FMADD y16, alpha2i, a7, y16 + + FMSUBX y09, alpha2i, a2, y09 + FMADDX y10, alpha2r, a2, y10 
+ FMSUBX y11, alpha2i, a4, y11 + FMADDX y12, alpha2r, a4, y12 + + FMSUBX y13, alpha2i, a6, y13 + FMADDX y14, alpha2r, a6, y14 + FMSUBX y15, alpha2i, a8, y15 + FMADDX y16, alpha2r, a8, y16 + + STFD y09, 8 * SIZE(Y2) + STFD y10, 9 * SIZE(Y2) + STFD y11, 10 * SIZE(Y2) + STFD y12, 11 * SIZE(Y2) + + STFD y13, 12 * SIZE(Y2) + STFD y14, 13 * SIZE(Y2) + STFD y15, 14 * SIZE(Y2) + STFD y16, 15 * SIZE(Y2) + + addi AO1, AO1, 16 * SIZE + addi AO2, AO2, 16 * SIZE + addi Y2, Y2, 16 * SIZE + .align 4 + +LL(25): + andi. r0, M, 7 + ble LL(30) + andi. r0, M, 4 + ble LL(26) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + LFD y03, 2 * SIZE(Y1) + LFD y04, 3 * SIZE(Y1) + + LFD a1, 0 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + + LFD y05, 4 * SIZE(Y1) + LFD y06, 5 * SIZE(Y1) + LFD y07, 6 * SIZE(Y1) + LFD y08, 7 * SIZE(Y1) + + LFD a2, 1 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMADD y03, alpha1r, a3, y03 + FMADD y04, alpha1i, a3, y04 + + FMADD y05, alpha1r, a5, y05 + FMADD y06, alpha1i, a5, y06 + FMADD y07, alpha1r, a7, y07 + FMADD y08, alpha1i, a7, y08 + + LFD a1, 0 * SIZE(AO2) + LFD a3, 2 * SIZE(AO2) + LFD a5, 4 * SIZE(AO2) + LFD a7, 6 * SIZE(AO2) + + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + FMSUBX y03, alpha1i, a4, y03 + FMADDX y04, alpha1r, a4, y04 + + FMSUBX y05, alpha1i, a6, y05 + FMADDX y06, alpha1r, a6, y06 + FMSUBX y07, alpha1i, a8, y07 + FMADDX y08, alpha1r, a8, y08 + + LFD a2, 1 * SIZE(AO2) + LFD a4, 3 * SIZE(AO2) + LFD a6, 5 * SIZE(AO2) + LFD a8, 7 * SIZE(AO2) + + FMADD y01, alpha2r, a1, y01 + FMADD y02, alpha2i, a1, y02 + FMADD y03, alpha2r, a3, y03 + FMADD y04, alpha2i, a3, y04 + + FMADD y05, alpha2r, a5, y05 + FMADD y06, alpha2i, a5, y06 + FMADD y07, alpha2r, a7, y07 + FMADD y08, alpha2i, a7, y08 + + FMSUBX y01, alpha2i, a2, y01 + FMADDX y02, alpha2r, a2, y02 + FMSUBX y03, alpha2i, a4, y03 + FMADDX y04, alpha2r, a4, y04 + + STFD y01, 0 * SIZE(Y2) + STFD y02, 1 * SIZE(Y2) + STFD y03, 2 * SIZE(Y2) + STFD y04, 3 * SIZE(Y2) + + FMSUBX y05, alpha2i, a6, y05 + FMADDX y06, alpha2r, a6, y06 + FMSUBX y07, alpha2i, a8, y07 + FMADDX y08, alpha2r, a8, y08 + + STFD y05, 4 * SIZE(Y2) + STFD y06, 5 * SIZE(Y2) + STFD y07, 6 * SIZE(Y2) + STFD y08, 7 * SIZE(Y2) + + addi AO1, AO1, 8 * SIZE + addi AO2, AO2, 8 * SIZE + addi Y1, Y1, 8 * SIZE + addi Y2, Y2, 8 * SIZE + .align 4 + +LL(26): + andi. r0, M, 2 + ble LL(27) + + LFD a1, 0 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a5, 0 * SIZE(AO2) + LFD a7, 2 * SIZE(AO2) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + LFD y03, 2 * SIZE(Y1) + LFD y04, 3 * SIZE(Y1) + + LFD a2, 1 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a6, 1 * SIZE(AO2) + LFD a8, 3 * SIZE(AO2) + + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMADD y03, alpha1r, a3, y03 + FMADD y04, alpha1i, a3, y04 + + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + FMSUBX y03, alpha1i, a4, y03 + FMADDX y04, alpha1r, a4, y04 + + FMADD y01, alpha2r, a5, y01 + FMADD y02, alpha2i, a5, y02 + FMADD y03, alpha2r, a7, y03 + FMADD y04, alpha2i, a7, y04 + + FMSUBX y01, alpha2i, a6, y01 + FMADDX y02, alpha2r, a6, y02 + FMSUBX y03, alpha2i, a8, y03 + FMADDX y04, alpha2r, a8, y04 + + STFD y01, 0 * SIZE(Y2) + STFD y02, 1 * SIZE(Y2) + STFD y03, 2 * SIZE(Y2) + STFD y04, 3 * SIZE(Y2) + + addi AO1, AO1, 4 * SIZE + addi AO2, AO2, 4 * SIZE + addi Y1, Y1, 4 * SIZE + addi Y2, Y2, 4 * SIZE + .align 4 + +LL(27): + andi. 
r0, M, 1 + ble LL(30) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 0 * SIZE(AO2) + LFD a4, 1 * SIZE(AO2) + + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + + FMADD y01, alpha2r, a3, y01 + FMADD y02, alpha2i, a3, y02 + FMSUBX y01, alpha2i, a4, y01 + FMADDX y02, alpha2r, a4, y02 + + STFD y01, 0 * SIZE(Y2) + STFD y02, 1 * SIZE(Y2) + add Y1, Y1, INCY + add Y2, Y2, INCY + .align 4 + +LL(30): + andi. J, N, 1 + ble LL(999) + .align 4 + +LL(31): + lfd alpha_r, ALPHA_R + lfd alpha_i, ALPHA_I + + LFD a1, 0 * SIZE(X) + LFD a2, 1 * SIZE(X) + add X, X, INCX + + FMUL alpha1r, alpha_r, a1 + FMUL alpha1i, alpha_i, a1 + + FMSUBR alpha1r, alpha_i, a2, alpha1r + FMADDR alpha1i, alpha_r, a2, alpha1i + + mr AO1, A + add A, AO1, LDA + + mr Y1, Y + mr Y2, Y + + srawi. r0, M, 3 + mtspr CTR, r0 + ble LL(35) + .align 4 + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + LFD y03, 2 * SIZE(Y1) + LFD y04, 3 * SIZE(Y1) + LFD y05, 4 * SIZE(Y1) + LFD y06, 5 * SIZE(Y1) + LFD y07, 6 * SIZE(Y1) + LFD y08, 7 * SIZE(Y1) + LFD y09, 8 * SIZE(Y1) + LFD y10, 9 * SIZE(Y1) + LFD y11, 10 * SIZE(Y1) + LFD y12, 11 * SIZE(Y1) + LFD y13, 12 * SIZE(Y1) + LFD y14, 13 * SIZE(Y1) + LFD y15, 14 * SIZE(Y1) + LFD y16, 15 * SIZE(Y1) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + addi Y1, Y1, 16 * SIZE + bdz LL(33) + .align 4 + +LL(32): + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMADD y03, alpha1r, a3, y03 + FMADD y04, alpha1i, a3, y04 + + FMADD y05, alpha1r, a5, y05 + FMADD y06, alpha1i, a5, y06 + FMADD y07, alpha1r, a7, y07 + FMADD y08, alpha1i, a7, y08 + + LFD a1, 8 * SIZE(AO1) + LFD a3, 10 * SIZE(AO1) + LFD a5, 12 * SIZE(AO1) + LFD a7, 14 * SIZE(AO1) + + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + FMSUBX y03, alpha1i, a4, y03 + FMADDX y04, alpha1r, a4, y04 + + STFD y01, 0 * SIZE(Y2) + STFD y02, 1 * SIZE(Y2) + STFD y03, 2 * SIZE(Y2) + STFD y04, 3 * SIZE(Y2) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + LFD y03, 2 * SIZE(Y1) + LFD y04, 3 * SIZE(Y1) + + FMSUBX y05, alpha1i, a6, y05 + FMADDX y06, alpha1r, a6, y06 + FMSUBX y07, alpha1i, a8, y07 + FMADDX y08, alpha1r, a8, y08 + + LFD a2, 9 * SIZE(AO1) + LFD a4, 11 * SIZE(AO1) + LFD a6, 13 * SIZE(AO1) + LFD a8, 15 * SIZE(AO1) + + addi AO1, AO1, 16 * SIZE + nop + DCBT(AO1, PREA) + nop + + STFD y05, 4 * SIZE(Y2) + STFD y06, 5 * SIZE(Y2) + STFD y07, 6 * SIZE(Y2) + STFD y08, 7 * SIZE(Y2) + + LFD y05, 4 * SIZE(Y1) + LFD y06, 5 * SIZE(Y1) + LFD y07, 6 * SIZE(Y1) + LFD y08, 7 * SIZE(Y1) + + FMADD y09, alpha1r, a1, y09 + FMADD y10, alpha1i, a1, y10 + FMADD y11, alpha1r, a3, y11 + FMADD y12, alpha1i, a3, y12 + + FMADD y13, alpha1r, a5, y13 + FMADD y14, alpha1i, a5, y14 + FMADD y15, alpha1r, a7, y15 + FMADD y16, alpha1i, a7, y16 + + LFD a1, 0 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + + FMSUBX y09, alpha1i, a2, y09 + FMADDX y10, alpha1r, a2, y10 + FMSUBX y11, alpha1i, a4, y11 + FMADDX y12, alpha1r, a4, y12 + + STFD y09, 8 * SIZE(Y2) + STFD y10, 9 * SIZE(Y2) + STFD y11, 10 * SIZE(Y2) + STFD y12, 11 * SIZE(Y2) + + LFD y09, 8 * SIZE(Y1) + LFD y10, 9 * SIZE(Y1) + LFD y11, 10 * SIZE(Y1) + LFD y12, 11 * SIZE(Y1) + + FMSUBX y13, alpha1i, a6, y13 + FMADDX y14, alpha1r, a6, y14 + FMSUBX y15, alpha1i, a8, y15 + FMADDX y16, alpha1r, a8, y16 + + LFD 
a2, 1 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + STFD y13, 12 * SIZE(Y2) + STFD y14, 13 * SIZE(Y2) + STFD y15, 14 * SIZE(Y2) + STFD y16, 15 * SIZE(Y2) + + LFD y13, 12 * SIZE(Y1) + LFD y14, 13 * SIZE(Y1) + LFD y15, 14 * SIZE(Y1) + LFD y16, 15 * SIZE(Y1) + + addi Y1, Y1, 16 * SIZE + addi Y2, Y2, 16 * SIZE + DCBT(Y1, PREC) + bdnz LL(32) + .align 4 + +LL(33): + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMADD y03, alpha1r, a3, y03 + FMADD y04, alpha1i, a3, y04 + + FMADD y05, alpha1r, a5, y05 + FMADD y06, alpha1i, a5, y06 + FMADD y07, alpha1r, a7, y07 + FMADD y08, alpha1i, a7, y08 + + LFD a1, 8 * SIZE(AO1) + LFD a3, 10 * SIZE(AO1) + LFD a5, 12 * SIZE(AO1) + LFD a7, 14 * SIZE(AO1) + + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + FMSUBX y03, alpha1i, a4, y03 + FMADDX y04, alpha1r, a4, y04 + + STFD y01, 0 * SIZE(Y2) + STFD y02, 1 * SIZE(Y2) + STFD y03, 2 * SIZE(Y2) + STFD y04, 3 * SIZE(Y2) + + FMSUBX y05, alpha1i, a6, y05 + FMADDX y06, alpha1r, a6, y06 + FMSUBX y07, alpha1i, a8, y07 + FMADDX y08, alpha1r, a8, y08 + + LFD a2, 9 * SIZE(AO1) + LFD a4, 11 * SIZE(AO1) + LFD a6, 13 * SIZE(AO1) + LFD a8, 15 * SIZE(AO1) + + STFD y05, 4 * SIZE(Y2) + STFD y06, 5 * SIZE(Y2) + STFD y07, 6 * SIZE(Y2) + STFD y08, 7 * SIZE(Y2) + + FMADD y09, alpha1r, a1, y09 + FMADD y10, alpha1i, a1, y10 + FMADD y11, alpha1r, a3, y11 + FMADD y12, alpha1i, a3, y12 + + FMADD y13, alpha1r, a5, y13 + FMADD y14, alpha1i, a5, y14 + FMADD y15, alpha1r, a7, y15 + FMADD y16, alpha1i, a7, y16 + + FMSUBX y09, alpha1i, a2, y09 + FMADDX y10, alpha1r, a2, y10 + FMSUBX y11, alpha1i, a4, y11 + FMADDX y12, alpha1r, a4, y12 + + STFD y09, 8 * SIZE(Y2) + STFD y10, 9 * SIZE(Y2) + STFD y11, 10 * SIZE(Y2) + STFD y12, 11 * SIZE(Y2) + + FMSUBX y13, alpha1i, a6, y13 + FMADDX y14, alpha1r, a6, y14 + FMSUBX y15, alpha1i, a8, y15 + FMADDX y16, alpha1r, a8, y16 + + STFD y13, 12 * SIZE(Y2) + STFD y14, 13 * SIZE(Y2) + STFD y15, 14 * SIZE(Y2) + STFD y16, 15 * SIZE(Y2) + + addi AO1, AO1, 16 * SIZE + addi Y2, Y2, 16 * SIZE + .align 4 + +LL(35): + andi. r0, M, 7 + ble LL(999) + andi. r0, M, 4 + ble LL(36) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + LFD y03, 2 * SIZE(Y1) + LFD y04, 3 * SIZE(Y1) + + LFD a1, 0 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + + LFD y05, 4 * SIZE(Y1) + LFD y06, 5 * SIZE(Y1) + LFD y07, 6 * SIZE(Y1) + LFD y08, 7 * SIZE(Y1) + + LFD a2, 1 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMADD y03, alpha1r, a3, y03 + FMADD y04, alpha1i, a3, y04 + + FMADD y05, alpha1r, a5, y05 + FMADD y06, alpha1i, a5, y06 + FMADD y07, alpha1r, a7, y07 + FMADD y08, alpha1i, a7, y08 + + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + FMSUBX y03, alpha1i, a4, y03 + FMADDX y04, alpha1r, a4, y04 + + FMSUBX y05, alpha1i, a6, y05 + FMADDX y06, alpha1r, a6, y06 + FMSUBX y07, alpha1i, a8, y07 + FMADDX y08, alpha1r, a8, y08 + + STFD y01, 0 * SIZE(Y2) + STFD y02, 1 * SIZE(Y2) + STFD y03, 2 * SIZE(Y2) + STFD y04, 3 * SIZE(Y2) + STFD y05, 4 * SIZE(Y2) + STFD y06, 5 * SIZE(Y2) + STFD y07, 6 * SIZE(Y2) + STFD y08, 7 * SIZE(Y2) + + addi AO1, AO1, 8 * SIZE + addi Y1, Y1, 8 * SIZE + addi Y2, Y2, 8 * SIZE + .align 4 + +LL(36): + andi. 
r0, M, 2 + ble LL(37) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + LFD y03, 2 * SIZE(Y1) + LFD y04, 3 * SIZE(Y1) + + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMADD y03, alpha1r, a3, y03 + FMADD y04, alpha1i, a3, y04 + + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + FMSUBX y03, alpha1i, a4, y03 + FMADDX y04, alpha1r, a4, y04 + + STFD y01, 0 * SIZE(Y2) + STFD y02, 1 * SIZE(Y2) + STFD y03, 2 * SIZE(Y2) + STFD y04, 3 * SIZE(Y2) + + addi AO1, AO1, 4 * SIZE + addi Y1, Y1, 4 * SIZE + addi Y2, Y2, 4 * SIZE + .align 4 + +LL(37): + andi. r0, M, 1 + ble LL(999) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + + STFD y01, 0 * SIZE(Y2) + STFD y02, 1 * SIZE(Y2) + add Y1, Y1, INCY + add Y2, Y2, INCY + b LL(999) + .align 4 + +LL(100): + srawi. J, N, 2 + ble LL(120) + .align 4 + +LL(111): + lfd alpha_r, ALPHA_R + lfd alpha_i, ALPHA_I + + LFD a1, 0 * SIZE(X) + LFD a2, 1 * SIZE(X) + add X, X, INCX + LFD a3, 0 * SIZE(X) + LFD a4, 1 * SIZE(X) + add X, X, INCX + LFD a5, 0 * SIZE(X) + LFD a6, 1 * SIZE(X) + add X, X, INCX + LFD a7, 0 * SIZE(X) + LFD a8, 1 * SIZE(X) + add X, X, INCX + + FMUL alpha1r, alpha_r, a1 + FMUL alpha1i, alpha_i, a1 + FMUL alpha2r, alpha_r, a3 + FMUL alpha2i, alpha_i, a3 + FMUL alpha3r, alpha_r, a5 + FMUL alpha3i, alpha_i, a5 + FMUL alpha4r, alpha_r, a7 + FMUL alpha4i, alpha_i, a7 + + FMSUBR alpha1r, alpha_i, a2, alpha1r + FMADDR alpha1i, alpha_r, a2, alpha1i + FMSUBR alpha2r, alpha_i, a4, alpha2r + FMADDR alpha2i, alpha_r, a4, alpha2i + FMSUBR alpha3r, alpha_i, a6, alpha3r + FMADDR alpha3i, alpha_r, a6, alpha3i + FMSUBR alpha4r, alpha_i, a8, alpha4r + FMADDR alpha4i, alpha_r, a8, alpha4i + + mr AO1, A + add AO2, A, LDA + add AO3, AO2, LDA + add AO4, AO3, LDA + add A, AO4, LDA + + mr Y1, Y + mr Y2, Y + + srawi. 
r0, M, 3 + mtspr CTR, r0 + ble LL(115) + .align 4 + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + add Y1, Y1, INCY + LFD y03, 0 * SIZE(Y1) + LFD y04, 1 * SIZE(Y1) + add Y1, Y1, INCY + LFD y05, 0 * SIZE(Y1) + LFD y06, 1 * SIZE(Y1) + add Y1, Y1, INCY + LFD y07, 0 * SIZE(Y1) + LFD y08, 1 * SIZE(Y1) + add Y1, Y1, INCY + LFD y09, 0 * SIZE(Y1) + LFD y10, 1 * SIZE(Y1) + add Y1, Y1, INCY + LFD y11, 0 * SIZE(Y1) + LFD y12, 1 * SIZE(Y1) + add Y1, Y1, INCY + LFD y13, 0 * SIZE(Y1) + LFD y14, 1 * SIZE(Y1) + add Y1, Y1, INCY + LFD y15, 0 * SIZE(Y1) + LFD y16, 1 * SIZE(Y1) + add Y1, Y1, INCY + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + bdz LL(113) + .align 4 + +LL(112): + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMADD y03, alpha1r, a3, y03 + FMADD y04, alpha1i, a3, y04 + + FMADD y05, alpha1r, a5, y05 + FMADD y06, alpha1i, a5, y06 + FMADD y07, alpha1r, a7, y07 + FMADD y08, alpha1i, a7, y08 + + LFD a1, 8 * SIZE(AO1) + LFD a3, 10 * SIZE(AO1) + LFD a5, 12 * SIZE(AO1) + LFD a7, 14 * SIZE(AO1) + + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + FMSUBX y03, alpha1i, a4, y03 + FMADDX y04, alpha1r, a4, y04 + + FMSUBX y05, alpha1i, a6, y05 + FMADDX y06, alpha1r, a6, y06 + FMSUBX y07, alpha1i, a8, y07 + FMADDX y08, alpha1r, a8, y08 + + LFD a2, 9 * SIZE(AO1) + LFD a4, 11 * SIZE(AO1) + LFD a6, 13 * SIZE(AO1) + LFD a8, 15 * SIZE(AO1) + + addi AO1, AO1, 16 * SIZE + nop + DCBT(AO1, PREA) + nop + + FMADD y09, alpha1r, a1, y09 + FMADD y10, alpha1i, a1, y10 + FMADD y11, alpha1r, a3, y11 + FMADD y12, alpha1i, a3, y12 + + FMADD y13, alpha1r, a5, y13 + FMADD y14, alpha1i, a5, y14 + FMADD y15, alpha1r, a7, y15 + FMADD y16, alpha1i, a7, y16 + + LFD a1, 0 * SIZE(AO2) + LFD a3, 2 * SIZE(AO2) + LFD a5, 4 * SIZE(AO2) + LFD a7, 6 * SIZE(AO2) + + FMSUBX y09, alpha1i, a2, y09 + FMADDX y10, alpha1r, a2, y10 + FMSUBX y11, alpha1i, a4, y11 + FMADDX y12, alpha1r, a4, y12 + + FMSUBX y13, alpha1i, a6, y13 + FMADDX y14, alpha1r, a6, y14 + FMSUBX y15, alpha1i, a8, y15 + FMADDX y16, alpha1r, a8, y16 + + LFD a2, 1 * SIZE(AO2) + LFD a4, 3 * SIZE(AO2) + LFD a6, 5 * SIZE(AO2) + LFD a8, 7 * SIZE(AO2) + + FMADD y01, alpha2r, a1, y01 + FMADD y02, alpha2i, a1, y02 + FMADD y03, alpha2r, a3, y03 + FMADD y04, alpha2i, a3, y04 + + FMADD y05, alpha2r, a5, y05 + FMADD y06, alpha2i, a5, y06 + FMADD y07, alpha2r, a7, y07 + FMADD y08, alpha2i, a7, y08 + + LFD a1, 8 * SIZE(AO2) + LFD a3, 10 * SIZE(AO2) + LFD a5, 12 * SIZE(AO2) + LFD a7, 14 * SIZE(AO2) + + FMSUBX y01, alpha2i, a2, y01 + FMADDX y02, alpha2r, a2, y02 + FMSUBX y03, alpha2i, a4, y03 + FMADDX y04, alpha2r, a4, y04 + + FMSUBX y05, alpha2i, a6, y05 + FMADDX y06, alpha2r, a6, y06 + FMSUBX y07, alpha2i, a8, y07 + FMADDX y08, alpha2r, a8, y08 + + LFD a2, 9 * SIZE(AO2) + LFD a4, 11 * SIZE(AO2) + LFD a6, 13 * SIZE(AO2) + LFD a8, 15 * SIZE(AO2) + + addi AO2, AO2, 16 * SIZE + nop + DCBT(AO2, PREA) + nop + + FMADD y09, alpha2r, a1, y09 + FMADD y10, alpha2i, a1, y10 + FMADD y11, alpha2r, a3, y11 + FMADD y12, alpha2i, a3, y12 + + FMADD y13, alpha2r, a5, y13 + FMADD y14, alpha2i, a5, y14 + FMADD y15, alpha2r, a7, y15 + FMADD y16, alpha2i, a7, y16 + + LFD a1, 0 * SIZE(AO3) + LFD a3, 2 * SIZE(AO3) + LFD a5, 4 * SIZE(AO3) + LFD a7, 6 * SIZE(AO3) + + FMSUBX y09, alpha2i, a2, y09 + FMADDX y10, alpha2r, a2, y10 + FMSUBX y11, alpha2i, a4, y11 + FMADDX y12, alpha2r, a4, y12 + + FMSUBX y13, alpha2i, a6, y13 + FMADDX y14, alpha2r, a6, y14 
+ FMSUBX y15, alpha2i, a8, y15 + FMADDX y16, alpha2r, a8, y16 + + LFD a2, 1 * SIZE(AO3) + LFD a4, 3 * SIZE(AO3) + LFD a6, 5 * SIZE(AO3) + LFD a8, 7 * SIZE(AO3) + + FMADD y01, alpha3r, a1, y01 + FMADD y02, alpha3i, a1, y02 + FMADD y03, alpha3r, a3, y03 + FMADD y04, alpha3i, a3, y04 + + FMADD y05, alpha3r, a5, y05 + FMADD y06, alpha3i, a5, y06 + FMADD y07, alpha3r, a7, y07 + FMADD y08, alpha3i, a7, y08 + + LFD a1, 8 * SIZE(AO3) + LFD a3, 10 * SIZE(AO3) + LFD a5, 12 * SIZE(AO3) + LFD a7, 14 * SIZE(AO3) + + FMSUBX y01, alpha3i, a2, y01 + FMADDX y02, alpha3r, a2, y02 + FMSUBX y03, alpha3i, a4, y03 + FMADDX y04, alpha3r, a4, y04 + + FMSUBX y05, alpha3i, a6, y05 + FMADDX y06, alpha3r, a6, y06 + FMSUBX y07, alpha3i, a8, y07 + FMADDX y08, alpha3r, a8, y08 + + LFD a2, 9 * SIZE(AO3) + LFD a4, 11 * SIZE(AO3) + LFD a6, 13 * SIZE(AO3) + LFD a8, 15 * SIZE(AO3) + + addi AO3, AO3, 16 * SIZE + nop + DCBT(AO3, PREA) + nop + + FMADD y09, alpha3r, a1, y09 + FMADD y10, alpha3i, a1, y10 + FMADD y11, alpha3r, a3, y11 + FMADD y12, alpha3i, a3, y12 + + FMADD y13, alpha3r, a5, y13 + FMADD y14, alpha3i, a5, y14 + FMADD y15, alpha3r, a7, y15 + FMADD y16, alpha3i, a7, y16 + + LFD a1, 0 * SIZE(AO4) + LFD a3, 2 * SIZE(AO4) + LFD a5, 4 * SIZE(AO4) + LFD a7, 6 * SIZE(AO4) + + FMSUBX y09, alpha3i, a2, y09 + FMADDX y10, alpha3r, a2, y10 + FMSUBX y11, alpha3i, a4, y11 + FMADDX y12, alpha3r, a4, y12 + + FMSUBX y13, alpha3i, a6, y13 + FMADDX y14, alpha3r, a6, y14 + FMSUBX y15, alpha3i, a8, y15 + FMADDX y16, alpha3r, a8, y16 + + LFD a2, 1 * SIZE(AO4) + LFD a4, 3 * SIZE(AO4) + LFD a6, 5 * SIZE(AO4) + LFD a8, 7 * SIZE(AO4) + + FMADD y01, alpha4r, a1, y01 + FMADD y02, alpha4i, a1, y02 + FMADD y03, alpha4r, a3, y03 + FMADD y04, alpha4i, a3, y04 + + FMADD y05, alpha4r, a5, y05 + FMADD y06, alpha4i, a5, y06 + FMADD y07, alpha4r, a7, y07 + FMADD y08, alpha4i, a7, y08 + + LFD a1, 8 * SIZE(AO4) + LFD a3, 10 * SIZE(AO4) + LFD a5, 12 * SIZE(AO4) + LFD a7, 14 * SIZE(AO4) + + FMSUBX y01, alpha4i, a2, y01 + FMADDX y02, alpha4r, a2, y02 + FMSUBX y03, alpha4i, a4, y03 + FMADDX y04, alpha4r, a4, y04 + + STFD y01, 0 * SIZE(Y2) + nop + STFD y02, 1 * SIZE(Y2) + add Y2, Y2, INCY + + LFD y01, 0 * SIZE(Y1) + nop + LFD y02, 1 * SIZE(Y1) + add Y1, Y1, INCY + + STFD y03, 0 * SIZE(Y2) + nop + STFD y04, 1 * SIZE(Y2) + add Y2, Y2, INCY + + LFD y03, 0 * SIZE(Y1) + nop + LFD y04, 1 * SIZE(Y1) + add Y1, Y1, INCY + + FMSUBX y05, alpha4i, a6, y05 + FMADDX y06, alpha4r, a6, y06 + FMSUBX y07, alpha4i, a8, y07 + FMADDX y08, alpha4r, a8, y08 + + LFD a2, 9 * SIZE(AO4) + LFD a4, 11 * SIZE(AO4) + LFD a6, 13 * SIZE(AO4) + LFD a8, 15 * SIZE(AO4) + + addi AO4, AO4, 16 * SIZE + nop + DCBT(AO4, PREA) + nop + + STFD y05, 0 * SIZE(Y2) + nop + STFD y06, 1 * SIZE(Y2) + add Y2, Y2, INCY + + LFD y05, 0 * SIZE(Y1) + nop + LFD y06, 1 * SIZE(Y1) + add Y1, Y1, INCY + + STFD y07, 0 * SIZE(Y2) + nop + STFD y08, 1 * SIZE(Y2) + add Y2, Y2, INCY + + LFD y07, 0 * SIZE(Y1) + nop + LFD y08, 1 * SIZE(Y1) + add Y1, Y1, INCY + + FMADD y09, alpha4r, a1, y09 + FMADD y10, alpha4i, a1, y10 + FMADD y11, alpha4r, a3, y11 + FMADD y12, alpha4i, a3, y12 + + FMADD y13, alpha4r, a5, y13 + FMADD y14, alpha4i, a5, y14 + FMADD y15, alpha4r, a7, y15 + FMADD y16, alpha4i, a7, y16 + + LFD a1, 0 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + + FMSUBX y09, alpha4i, a2, y09 + FMADDX y10, alpha4r, a2, y10 + FMSUBX y11, alpha4i, a4, y11 + FMADDX y12, alpha4r, a4, y12 + + STFD y09, 0 * SIZE(Y2) + nop + STFD y10, 1 * SIZE(Y2) + add Y2, Y2, INCY + + LFD y09, 0 * SIZE(Y1) + nop + 
LFD y10, 1 * SIZE(Y1) + add Y1, Y1, INCY + + STFD y11, 0 * SIZE(Y2) + nop + STFD y12, 1 * SIZE(Y2) + add Y2, Y2, INCY + + LFD y11, 0 * SIZE(Y1) + nop + LFD y12, 1 * SIZE(Y1) + add Y1, Y1, INCY + + FMSUBX y13, alpha4i, a6, y13 + FMADDX y14, alpha4r, a6, y14 + FMSUBX y15, alpha4i, a8, y15 + FMADDX y16, alpha4r, a8, y16 + + LFD a2, 1 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + STFD y13, 0 * SIZE(Y2) + nop + STFD y14, 1 * SIZE(Y2) + add Y2, Y2, INCY + + LFD y13, 0 * SIZE(Y1) + nop + LFD y14, 1 * SIZE(Y1) + add Y1, Y1, INCY + + STFD y15, 0 * SIZE(Y2) + nop + STFD y16, 1 * SIZE(Y2) + add Y2, Y2, INCY + + LFD y15, 0 * SIZE(Y1) + nop + LFD y16, 1 * SIZE(Y1) + add Y1, Y1, INCY + + DCBT(Y1, PREC) + bdnz LL(112) + .align 4 + +LL(113): + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMADD y03, alpha1r, a3, y03 + FMADD y04, alpha1i, a3, y04 + + FMADD y05, alpha1r, a5, y05 + FMADD y06, alpha1i, a5, y06 + FMADD y07, alpha1r, a7, y07 + FMADD y08, alpha1i, a7, y08 + + LFD a1, 8 * SIZE(AO1) + LFD a3, 10 * SIZE(AO1) + LFD a5, 12 * SIZE(AO1) + LFD a7, 14 * SIZE(AO1) + + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + FMSUBX y03, alpha1i, a4, y03 + FMADDX y04, alpha1r, a4, y04 + + FMSUBX y05, alpha1i, a6, y05 + FMADDX y06, alpha1r, a6, y06 + FMSUBX y07, alpha1i, a8, y07 + FMADDX y08, alpha1r, a8, y08 + + LFD a2, 9 * SIZE(AO1) + LFD a4, 11 * SIZE(AO1) + LFD a6, 13 * SIZE(AO1) + LFD a8, 15 * SIZE(AO1) + + FMADD y09, alpha1r, a1, y09 + FMADD y10, alpha1i, a1, y10 + FMADD y11, alpha1r, a3, y11 + FMADD y12, alpha1i, a3, y12 + + FMADD y13, alpha1r, a5, y13 + FMADD y14, alpha1i, a5, y14 + FMADD y15, alpha1r, a7, y15 + FMADD y16, alpha1i, a7, y16 + + LFD a1, 0 * SIZE(AO2) + LFD a3, 2 * SIZE(AO2) + LFD a5, 4 * SIZE(AO2) + LFD a7, 6 * SIZE(AO2) + + FMSUBX y09, alpha1i, a2, y09 + FMADDX y10, alpha1r, a2, y10 + FMSUBX y11, alpha1i, a4, y11 + FMADDX y12, alpha1r, a4, y12 + + FMSUBX y13, alpha1i, a6, y13 + FMADDX y14, alpha1r, a6, y14 + FMSUBX y15, alpha1i, a8, y15 + FMADDX y16, alpha1r, a8, y16 + + LFD a2, 1 * SIZE(AO2) + LFD a4, 3 * SIZE(AO2) + LFD a6, 5 * SIZE(AO2) + LFD a8, 7 * SIZE(AO2) + + FMADD y01, alpha2r, a1, y01 + FMADD y02, alpha2i, a1, y02 + FMADD y03, alpha2r, a3, y03 + FMADD y04, alpha2i, a3, y04 + + FMADD y05, alpha2r, a5, y05 + FMADD y06, alpha2i, a5, y06 + FMADD y07, alpha2r, a7, y07 + FMADD y08, alpha2i, a7, y08 + + LFD a1, 8 * SIZE(AO2) + LFD a3, 10 * SIZE(AO2) + LFD a5, 12 * SIZE(AO2) + LFD a7, 14 * SIZE(AO2) + + FMSUBX y01, alpha2i, a2, y01 + FMADDX y02, alpha2r, a2, y02 + FMSUBX y03, alpha2i, a4, y03 + FMADDX y04, alpha2r, a4, y04 + + FMSUBX y05, alpha2i, a6, y05 + FMADDX y06, alpha2r, a6, y06 + FMSUBX y07, alpha2i, a8, y07 + FMADDX y08, alpha2r, a8, y08 + + LFD a2, 9 * SIZE(AO2) + LFD a4, 11 * SIZE(AO2) + LFD a6, 13 * SIZE(AO2) + LFD a8, 15 * SIZE(AO2) + + FMADD y09, alpha2r, a1, y09 + FMADD y10, alpha2i, a1, y10 + FMADD y11, alpha2r, a3, y11 + FMADD y12, alpha2i, a3, y12 + + FMADD y13, alpha2r, a5, y13 + FMADD y14, alpha2i, a5, y14 + FMADD y15, alpha2r, a7, y15 + FMADD y16, alpha2i, a7, y16 + + LFD a1, 0 * SIZE(AO3) + LFD a3, 2 * SIZE(AO3) + LFD a5, 4 * SIZE(AO3) + LFD a7, 6 * SIZE(AO3) + + FMSUBX y09, alpha2i, a2, y09 + FMADDX y10, alpha2r, a2, y10 + FMSUBX y11, alpha2i, a4, y11 + FMADDX y12, alpha2r, a4, y12 + + FMSUBX y13, alpha2i, a6, y13 + FMADDX y14, alpha2r, a6, y14 + FMSUBX y15, alpha2i, a8, y15 + FMADDX y16, alpha2r, a8, y16 + + LFD a2, 1 * SIZE(AO3) + LFD a4, 3 * SIZE(AO3) + LFD a6, 5 * SIZE(AO3) + LFD a8, 7 * 
SIZE(AO3) + + FMADD y01, alpha3r, a1, y01 + FMADD y02, alpha3i, a1, y02 + FMADD y03, alpha3r, a3, y03 + FMADD y04, alpha3i, a3, y04 + + FMADD y05, alpha3r, a5, y05 + FMADD y06, alpha3i, a5, y06 + FMADD y07, alpha3r, a7, y07 + FMADD y08, alpha3i, a7, y08 + + LFD a1, 8 * SIZE(AO3) + LFD a3, 10 * SIZE(AO3) + LFD a5, 12 * SIZE(AO3) + LFD a7, 14 * SIZE(AO3) + + FMSUBX y01, alpha3i, a2, y01 + FMADDX y02, alpha3r, a2, y02 + FMSUBX y03, alpha3i, a4, y03 + FMADDX y04, alpha3r, a4, y04 + + FMSUBX y05, alpha3i, a6, y05 + FMADDX y06, alpha3r, a6, y06 + FMSUBX y07, alpha3i, a8, y07 + FMADDX y08, alpha3r, a8, y08 + + LFD a2, 9 * SIZE(AO3) + LFD a4, 11 * SIZE(AO3) + LFD a6, 13 * SIZE(AO3) + LFD a8, 15 * SIZE(AO3) + + FMADD y09, alpha3r, a1, y09 + FMADD y10, alpha3i, a1, y10 + FMADD y11, alpha3r, a3, y11 + FMADD y12, alpha3i, a3, y12 + + FMADD y13, alpha3r, a5, y13 + FMADD y14, alpha3i, a5, y14 + FMADD y15, alpha3r, a7, y15 + FMADD y16, alpha3i, a7, y16 + + LFD a1, 0 * SIZE(AO4) + LFD a3, 2 * SIZE(AO4) + LFD a5, 4 * SIZE(AO4) + LFD a7, 6 * SIZE(AO4) + + FMSUBX y09, alpha3i, a2, y09 + FMADDX y10, alpha3r, a2, y10 + FMSUBX y11, alpha3i, a4, y11 + FMADDX y12, alpha3r, a4, y12 + + FMSUBX y13, alpha3i, a6, y13 + FMADDX y14, alpha3r, a6, y14 + FMSUBX y15, alpha3i, a8, y15 + FMADDX y16, alpha3r, a8, y16 + + LFD a2, 1 * SIZE(AO4) + LFD a4, 3 * SIZE(AO4) + LFD a6, 5 * SIZE(AO4) + LFD a8, 7 * SIZE(AO4) + + FMADD y01, alpha4r, a1, y01 + FMADD y02, alpha4i, a1, y02 + FMADD y03, alpha4r, a3, y03 + FMADD y04, alpha4i, a3, y04 + + FMADD y05, alpha4r, a5, y05 + FMADD y06, alpha4i, a5, y06 + FMADD y07, alpha4r, a7, y07 + FMADD y08, alpha4i, a7, y08 + + LFD a1, 8 * SIZE(AO4) + LFD a3, 10 * SIZE(AO4) + LFD a5, 12 * SIZE(AO4) + LFD a7, 14 * SIZE(AO4) + + FMSUBX y01, alpha4i, a2, y01 + FMADDX y02, alpha4r, a2, y02 + FMSUBX y03, alpha4i, a4, y03 + FMADDX y04, alpha4r, a4, y04 + + STFD y01, 0 * SIZE(Y2) + nop + STFD y02, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y03, 0 * SIZE(Y2) + nop + STFD y04, 1 * SIZE(Y2) + add Y2, Y2, INCY + + FMSUBX y05, alpha4i, a6, y05 + FMADDX y06, alpha4r, a6, y06 + FMSUBX y07, alpha4i, a8, y07 + FMADDX y08, alpha4r, a8, y08 + + LFD a2, 9 * SIZE(AO4) + LFD a4, 11 * SIZE(AO4) + LFD a6, 13 * SIZE(AO4) + LFD a8, 15 * SIZE(AO4) + + STFD y05, 0 * SIZE(Y2) + nop + STFD y06, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y07, 0 * SIZE(Y2) + nop + STFD y08, 1 * SIZE(Y2) + add Y2, Y2, INCY + + FMADD y09, alpha4r, a1, y09 + FMADD y10, alpha4i, a1, y10 + FMADD y11, alpha4r, a3, y11 + FMADD y12, alpha4i, a3, y12 + + FMADD y13, alpha4r, a5, y13 + FMADD y14, alpha4i, a5, y14 + FMADD y15, alpha4r, a7, y15 + FMADD y16, alpha4i, a7, y16 + + FMSUBX y09, alpha4i, a2, y09 + FMADDX y10, alpha4r, a2, y10 + FMSUBX y11, alpha4i, a4, y11 + FMADDX y12, alpha4r, a4, y12 + + STFD y09, 0 * SIZE(Y2) + nop + STFD y10, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y11, 0 * SIZE(Y2) + nop + STFD y12, 1 * SIZE(Y2) + add Y2, Y2, INCY + + FMSUBX y13, alpha4i, a6, y13 + FMADDX y14, alpha4r, a6, y14 + FMSUBX y15, alpha4i, a8, y15 + FMADDX y16, alpha4r, a8, y16 + + STFD y13, 0 * SIZE(Y2) + nop + STFD y14, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y15, 0 * SIZE(Y2) + nop + STFD y16, 1 * SIZE(Y2) + add Y2, Y2, INCY + + addi AO1, AO1, 16 * SIZE + addi AO2, AO2, 16 * SIZE + addi AO3, AO3, 16 * SIZE + addi AO4, AO4, 16 * SIZE + .align 4 + +LL(115): + andi. r0, M, 7 + ble LL(119) + andi. 
r0, M, 4 + ble LL(116) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + add Y1, Y1, INCY + LFD y03, 0 * SIZE(Y1) + LFD y04, 1 * SIZE(Y1) + add Y1, Y1, INCY + + LFD a1, 0 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + + LFD y05, 0 * SIZE(Y1) + LFD y06, 1 * SIZE(Y1) + add Y1, Y1, INCY + LFD y07, 0 * SIZE(Y1) + LFD y08, 1 * SIZE(Y1) + add Y1, Y1, INCY + + LFD a2, 1 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMADD y03, alpha1r, a3, y03 + FMADD y04, alpha1i, a3, y04 + + FMADD y05, alpha1r, a5, y05 + FMADD y06, alpha1i, a5, y06 + FMADD y07, alpha1r, a7, y07 + FMADD y08, alpha1i, a7, y08 + + LFD a1, 0 * SIZE(AO2) + LFD a3, 2 * SIZE(AO2) + LFD a5, 4 * SIZE(AO2) + LFD a7, 6 * SIZE(AO2) + + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + FMSUBX y03, alpha1i, a4, y03 + FMADDX y04, alpha1r, a4, y04 + + FMSUBX y05, alpha1i, a6, y05 + FMADDX y06, alpha1r, a6, y06 + FMSUBX y07, alpha1i, a8, y07 + FMADDX y08, alpha1r, a8, y08 + + LFD a2, 1 * SIZE(AO2) + LFD a4, 3 * SIZE(AO2) + LFD a6, 5 * SIZE(AO2) + LFD a8, 7 * SIZE(AO2) + + FMADD y01, alpha2r, a1, y01 + FMADD y02, alpha2i, a1, y02 + FMADD y03, alpha2r, a3, y03 + FMADD y04, alpha2i, a3, y04 + + FMADD y05, alpha2r, a5, y05 + FMADD y06, alpha2i, a5, y06 + FMADD y07, alpha2r, a7, y07 + FMADD y08, alpha2i, a7, y08 + + LFD a1, 0 * SIZE(AO3) + LFD a3, 2 * SIZE(AO3) + LFD a5, 4 * SIZE(AO3) + LFD a7, 6 * SIZE(AO3) + + FMSUBX y01, alpha2i, a2, y01 + FMADDX y02, alpha2r, a2, y02 + FMSUBX y03, alpha2i, a4, y03 + FMADDX y04, alpha2r, a4, y04 + + FMSUBX y05, alpha2i, a6, y05 + FMADDX y06, alpha2r, a6, y06 + FMSUBX y07, alpha2i, a8, y07 + FMADDX y08, alpha2r, a8, y08 + + LFD a2, 1 * SIZE(AO3) + LFD a4, 3 * SIZE(AO3) + LFD a6, 5 * SIZE(AO3) + LFD a8, 7 * SIZE(AO3) + + FMADD y01, alpha3r, a1, y01 + FMADD y02, alpha3i, a1, y02 + FMADD y03, alpha3r, a3, y03 + FMADD y04, alpha3i, a3, y04 + + FMADD y05, alpha3r, a5, y05 + FMADD y06, alpha3i, a5, y06 + FMADD y07, alpha3r, a7, y07 + FMADD y08, alpha3i, a7, y08 + + LFD a1, 0 * SIZE(AO4) + LFD a3, 2 * SIZE(AO4) + LFD a5, 4 * SIZE(AO4) + LFD a7, 6 * SIZE(AO4) + + FMSUBX y01, alpha3i, a2, y01 + FMADDX y02, alpha3r, a2, y02 + FMSUBX y03, alpha3i, a4, y03 + FMADDX y04, alpha3r, a4, y04 + + FMSUBX y05, alpha3i, a6, y05 + FMADDX y06, alpha3r, a6, y06 + FMSUBX y07, alpha3i, a8, y07 + FMADDX y08, alpha3r, a8, y08 + + LFD a2, 1 * SIZE(AO4) + LFD a4, 3 * SIZE(AO4) + LFD a6, 5 * SIZE(AO4) + LFD a8, 7 * SIZE(AO4) + + FMADD y01, alpha4r, a1, y01 + FMADD y02, alpha4i, a1, y02 + FMADD y03, alpha4r, a3, y03 + FMADD y04, alpha4i, a3, y04 + + FMADD y05, alpha4r, a5, y05 + FMADD y06, alpha4i, a5, y06 + FMADD y07, alpha4r, a7, y07 + FMADD y08, alpha4i, a7, y08 + + FMSUBX y01, alpha4i, a2, y01 + FMADDX y02, alpha4r, a2, y02 + FMSUBX y03, alpha4i, a4, y03 + FMADDX y04, alpha4r, a4, y04 + + FMSUBX y05, alpha4i, a6, y05 + FMADDX y06, alpha4r, a6, y06 + FMSUBX y07, alpha4i, a8, y07 + FMADDX y08, alpha4r, a8, y08 + + STFD y01, 0 * SIZE(Y2) + addi AO1, AO1, 8 * SIZE + STFD y02, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y03, 0 * SIZE(Y2) + addi AO2, AO2, 8 * SIZE + STFD y04, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y05, 0 * SIZE(Y2) + addi AO3, AO3, 8 * SIZE + STFD y06, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y07, 0 * SIZE(Y2) + addi AO4, AO4, 8 * SIZE + STFD y08, 1 * SIZE(Y2) + add Y2, Y2, INCY + .align 4 + +LL(116): + andi. 
r0, M, 2 + ble LL(117) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + add Y1, Y1, INCY + LFD y03, 0 * SIZE(Y1) + LFD y04, 1 * SIZE(Y1) + add Y1, Y1, INCY + + LFD a5, 0 * SIZE(AO2) + LFD a6, 1 * SIZE(AO2) + LFD a7, 2 * SIZE(AO2) + LFD a8, 3 * SIZE(AO2) + + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMADD y03, alpha1r, a3, y03 + FMADD y04, alpha1i, a3, y04 + + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + FMSUBX y03, alpha1i, a4, y03 + FMADDX y04, alpha1r, a4, y04 + + LFD a1, 0 * SIZE(AO3) + LFD a2, 1 * SIZE(AO3) + LFD a3, 2 * SIZE(AO3) + LFD a4, 3 * SIZE(AO3) + + FMADD y01, alpha2r, a5, y01 + FMADD y02, alpha2i, a5, y02 + FMADD y03, alpha2r, a7, y03 + FMADD y04, alpha2i, a7, y04 + + FMSUBX y01, alpha2i, a6, y01 + FMADDX y02, alpha2r, a6, y02 + FMSUBX y03, alpha2i, a8, y03 + FMADDX y04, alpha2r, a8, y04 + + LFD a5, 0 * SIZE(AO4) + LFD a6, 1 * SIZE(AO4) + LFD a7, 2 * SIZE(AO4) + LFD a8, 3 * SIZE(AO4) + + FMADD y01, alpha3r, a1, y01 + FMADD y02, alpha3i, a1, y02 + FMADD y03, alpha3r, a3, y03 + FMADD y04, alpha3i, a3, y04 + + FMSUBX y01, alpha3i, a2, y01 + FMADDX y02, alpha3r, a2, y02 + FMSUBX y03, alpha3i, a4, y03 + FMADDX y04, alpha3r, a4, y04 + + FMADD y01, alpha4r, a5, y01 + FMADD y02, alpha4i, a5, y02 + FMADD y03, alpha4r, a7, y03 + FMADD y04, alpha4i, a7, y04 + + FMSUBX y01, alpha4i, a6, y01 + FMADDX y02, alpha4r, a6, y02 + FMSUBX y03, alpha4i, a8, y03 + FMADDX y04, alpha4r, a8, y04 + + STFD y01, 0 * SIZE(Y2) + addi AO1, AO1, 4 * SIZE + STFD y02, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y03, 0 * SIZE(Y2) + addi AO2, AO2, 4 * SIZE + STFD y04, 1 * SIZE(Y2) + add Y2, Y2, INCY + + addi AO3, AO3, 4 * SIZE + addi AO4, AO4, 4 * SIZE + .align 4 + +LL(117): + andi. r0, M, 1 + ble LL(119) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + add Y1, Y1, INCY + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 0 * SIZE(AO2) + LFD a4, 1 * SIZE(AO2) + LFD a5, 0 * SIZE(AO3) + LFD a6, 1 * SIZE(AO3) + LFD a7, 0 * SIZE(AO4) + LFD a8, 1 * SIZE(AO4) + + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + + FMADD y01, alpha2r, a3, y01 + FMADD y02, alpha2i, a3, y02 + FMSUBX y01, alpha2i, a4, y01 + FMADDX y02, alpha2r, a4, y02 + + FMADD y01, alpha3r, a5, y01 + FMADD y02, alpha3i, a5, y02 + FMSUBX y01, alpha3i, a6, y01 + FMADDX y02, alpha3r, a6, y02 + + FMADD y01, alpha4r, a7, y01 + FMADD y02, alpha4i, a7, y02 + FMSUBX y01, alpha4i, a8, y01 + FMADDX y02, alpha4r, a8, y02 + + STFD y01, 0 * SIZE(Y2) + STFD y02, 1 * SIZE(Y2) + add Y2, Y2, INCY + .align 4 + +LL(119): + addi J, J, -1 + cmpi cr0, 0, J, 0 + bgt LL(111) + .align 4 + +LL(120): + andi. J, N, 2 + ble LL(130) + .align 4 + +LL(121): + lfd alpha_r, ALPHA_R + lfd alpha_i, ALPHA_I + + LFD a1, 0 * SIZE(X) + LFD a2, 1 * SIZE(X) + add X, X, INCX + LFD a3, 0 * SIZE(X) + LFD a4, 1 * SIZE(X) + add X, X, INCX + + FMUL alpha1r, alpha_r, a1 + FMUL alpha1i, alpha_i, a1 + FMUL alpha2r, alpha_r, a3 + FMUL alpha2i, alpha_i, a3 + + FMSUBR alpha1r, alpha_i, a2, alpha1r + FMADDR alpha1i, alpha_r, a2, alpha1i + FMSUBR alpha2r, alpha_i, a4, alpha2r + FMADDR alpha2i, alpha_r, a4, alpha2i + + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + mr Y1, Y + mr Y2, Y + + srawi. 
r0, M, 3 + mtspr CTR, r0 + ble LL(125) + .align 4 + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + add Y1, Y1, INCY + LFD y03, 0 * SIZE(Y1) + LFD y04, 1 * SIZE(Y1) + add Y1, Y1, INCY + + LFD a1, 0 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + + LFD y05, 0 * SIZE(Y1) + LFD y06, 1 * SIZE(Y1) + add Y1, Y1, INCY + LFD y07, 0 * SIZE(Y1) + LFD y08, 1 * SIZE(Y1) + add Y1, Y1, INCY + + LFD a2, 1 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + LFD y09, 0 * SIZE(Y1) + LFD y10, 1 * SIZE(Y1) + add Y1, Y1, INCY + LFD y11, 0 * SIZE(Y1) + LFD y12, 1 * SIZE(Y1) + add Y1, Y1, INCY + LFD y13, 0 * SIZE(Y1) + LFD y14, 1 * SIZE(Y1) + add Y1, Y1, INCY + LFD y15, 0 * SIZE(Y1) + LFD y16, 1 * SIZE(Y1) + add Y1, Y1, INCY + + bdz LL(123) + .align 4 + +LL(122): + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMADD y03, alpha1r, a3, y03 + FMADD y04, alpha1i, a3, y04 + + FMADD y05, alpha1r, a5, y05 + FMADD y06, alpha1i, a5, y06 + FMADD y07, alpha1r, a7, y07 + FMADD y08, alpha1i, a7, y08 + + LFD a1, 8 * SIZE(AO1) + LFD a3, 10 * SIZE(AO1) + LFD a5, 12 * SIZE(AO1) + LFD a7, 14 * SIZE(AO1) + + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + FMSUBX y03, alpha1i, a4, y03 + FMADDX y04, alpha1r, a4, y04 + + FMSUBX y05, alpha1i, a6, y05 + FMADDX y06, alpha1r, a6, y06 + FMSUBX y07, alpha1i, a8, y07 + FMADDX y08, alpha1r, a8, y08 + + LFD a2, 9 * SIZE(AO1) + LFD a4, 11 * SIZE(AO1) + LFD a6, 13 * SIZE(AO1) + LFD a8, 15 * SIZE(AO1) + + addi AO1, AO1, 16 * SIZE + nop + DCBT(AO1, PREA) + nop + + FMADD y09, alpha1r, a1, y09 + FMADD y10, alpha1i, a1, y10 + FMADD y11, alpha1r, a3, y11 + FMADD y12, alpha1i, a3, y12 + + FMADD y13, alpha1r, a5, y13 + FMADD y14, alpha1i, a5, y14 + FMADD y15, alpha1r, a7, y15 + FMADD y16, alpha1i, a7, y16 + + LFD a1, 0 * SIZE(AO2) + LFD a3, 2 * SIZE(AO2) + LFD a5, 4 * SIZE(AO2) + LFD a7, 6 * SIZE(AO2) + + FMSUBX y09, alpha1i, a2, y09 + FMADDX y10, alpha1r, a2, y10 + FMSUBX y11, alpha1i, a4, y11 + FMADDX y12, alpha1r, a4, y12 + + FMSUBX y13, alpha1i, a6, y13 + FMADDX y14, alpha1r, a6, y14 + FMSUBX y15, alpha1i, a8, y15 + FMADDX y16, alpha1r, a8, y16 + + LFD a2, 1 * SIZE(AO2) + LFD a4, 3 * SIZE(AO2) + LFD a6, 5 * SIZE(AO2) + LFD a8, 7 * SIZE(AO2) + + FMADD y01, alpha2r, a1, y01 + FMADD y02, alpha2i, a1, y02 + FMADD y03, alpha2r, a3, y03 + FMADD y04, alpha2i, a3, y04 + + FMADD y05, alpha2r, a5, y05 + FMADD y06, alpha2i, a5, y06 + FMADD y07, alpha2r, a7, y07 + FMADD y08, alpha2i, a7, y08 + + LFD a1, 8 * SIZE(AO2) + LFD a3, 10 * SIZE(AO2) + LFD a5, 12 * SIZE(AO2) + LFD a7, 14 * SIZE(AO2) + + FMSUBX y01, alpha2i, a2, y01 + FMADDX y02, alpha2r, a2, y02 + FMSUBX y03, alpha2i, a4, y03 + FMADDX y04, alpha2r, a4, y04 + + STFD y01, 0 * SIZE(Y2) + nop + STFD y02, 1 * SIZE(Y2) + add Y2, Y2, INCY + + LFD y01, 0 * SIZE(Y1) + nop + LFD y02, 1 * SIZE(Y1) + add Y1, Y1, INCY + + STFD y03, 0 * SIZE(Y2) + nop + STFD y04, 1 * SIZE(Y2) + add Y2, Y2, INCY + + LFD y03, 0 * SIZE(Y1) + nop + LFD y04, 1 * SIZE(Y1) + add Y1, Y1, INCY + + FMSUBX y05, alpha2i, a6, y05 + FMADDX y06, alpha2r, a6, y06 + FMSUBX y07, alpha2i, a8, y07 + FMADDX y08, alpha2r, a8, y08 + + LFD a2, 9 * SIZE(AO2) + LFD a4, 11 * SIZE(AO2) + LFD a6, 13 * SIZE(AO2) + LFD a8, 15 * SIZE(AO2) + + addi AO2, AO2, 16 * SIZE + nop + DCBT(AO2, PREA) + nop + + STFD y05, 0 * SIZE(Y2) + nop + STFD y06, 1 * SIZE(Y2) + add Y2, Y2, INCY + + LFD y05, 0 * SIZE(Y1) + nop + LFD y06, 1 * SIZE(Y1) + add Y1, Y1, INCY + + STFD y07, 0 * SIZE(Y2) + nop + STFD y08, 1 * SIZE(Y2) + add Y2, 
Y2, INCY + + LFD y07, 0 * SIZE(Y1) + nop + LFD y08, 1 * SIZE(Y1) + add Y1, Y1, INCY + + FMADD y09, alpha2r, a1, y09 + FMADD y10, alpha2i, a1, y10 + FMADD y11, alpha2r, a3, y11 + FMADD y12, alpha2i, a3, y12 + + FMADD y13, alpha2r, a5, y13 + FMADD y14, alpha2i, a5, y14 + FMADD y15, alpha2r, a7, y15 + FMADD y16, alpha2i, a7, y16 + + LFD a1, 0 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + + FMSUBX y09, alpha2i, a2, y09 + FMADDX y10, alpha2r, a2, y10 + FMSUBX y11, alpha2i, a4, y11 + FMADDX y12, alpha2r, a4, y12 + + STFD y09, 0 * SIZE(Y2) + nop + STFD y10, 1 * SIZE(Y2) + add Y2, Y2, INCY + + LFD y09, 0 * SIZE(Y1) + nop + LFD y10, 1 * SIZE(Y1) + add Y1, Y1, INCY + + STFD y11, 0 * SIZE(Y2) + nop + STFD y12, 1 * SIZE(Y2) + add Y2, Y2, INCY + + LFD y11, 0 * SIZE(Y1) + nop + LFD y12, 1 * SIZE(Y1) + add Y1, Y1, INCY + + FMSUBX y13, alpha2i, a6, y13 + FMADDX y14, alpha2r, a6, y14 + FMSUBX y15, alpha2i, a8, y15 + FMADDX y16, alpha2r, a8, y16 + + LFD a2, 1 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + STFD y13, 0 * SIZE(Y2) + nop + STFD y14, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y15, 0 * SIZE(Y2) + nop + STFD y16, 1 * SIZE(Y2) + add Y2, Y2, INCY + + LFD y13, 0 * SIZE(Y1) + nop + LFD y14, 1 * SIZE(Y1) + add Y1, Y1, INCY + + LFD y15, 0 * SIZE(Y1) + nop + LFD y16, 1 * SIZE(Y1) + add Y1, Y1, INCY + + DCBT(Y1, PREC) + bdnz LL(122) + .align 4 + +LL(123): + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMADD y03, alpha1r, a3, y03 + FMADD y04, alpha1i, a3, y04 + + FMADD y05, alpha1r, a5, y05 + FMADD y06, alpha1i, a5, y06 + FMADD y07, alpha1r, a7, y07 + FMADD y08, alpha1i, a7, y08 + + LFD a1, 8 * SIZE(AO1) + LFD a3, 10 * SIZE(AO1) + LFD a5, 12 * SIZE(AO1) + LFD a7, 14 * SIZE(AO1) + + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + FMSUBX y03, alpha1i, a4, y03 + FMADDX y04, alpha1r, a4, y04 + + FMSUBX y05, alpha1i, a6, y05 + FMADDX y06, alpha1r, a6, y06 + FMSUBX y07, alpha1i, a8, y07 + FMADDX y08, alpha1r, a8, y08 + + LFD a2, 9 * SIZE(AO1) + LFD a4, 11 * SIZE(AO1) + LFD a6, 13 * SIZE(AO1) + LFD a8, 15 * SIZE(AO1) + + FMADD y09, alpha1r, a1, y09 + FMADD y10, alpha1i, a1, y10 + FMADD y11, alpha1r, a3, y11 + FMADD y12, alpha1i, a3, y12 + + FMADD y13, alpha1r, a5, y13 + FMADD y14, alpha1i, a5, y14 + FMADD y15, alpha1r, a7, y15 + FMADD y16, alpha1i, a7, y16 + + LFD a1, 0 * SIZE(AO2) + LFD a3, 2 * SIZE(AO2) + LFD a5, 4 * SIZE(AO2) + LFD a7, 6 * SIZE(AO2) + + FMSUBX y09, alpha1i, a2, y09 + FMADDX y10, alpha1r, a2, y10 + FMSUBX y11, alpha1i, a4, y11 + FMADDX y12, alpha1r, a4, y12 + + FMSUBX y13, alpha1i, a6, y13 + FMADDX y14, alpha1r, a6, y14 + FMSUBX y15, alpha1i, a8, y15 + FMADDX y16, alpha1r, a8, y16 + + LFD a2, 1 * SIZE(AO2) + LFD a4, 3 * SIZE(AO2) + LFD a6, 5 * SIZE(AO2) + LFD a8, 7 * SIZE(AO2) + + FMADD y01, alpha2r, a1, y01 + FMADD y02, alpha2i, a1, y02 + FMADD y03, alpha2r, a3, y03 + FMADD y04, alpha2i, a3, y04 + + FMADD y05, alpha2r, a5, y05 + FMADD y06, alpha2i, a5, y06 + FMADD y07, alpha2r, a7, y07 + FMADD y08, alpha2i, a7, y08 + + LFD a1, 8 * SIZE(AO2) + LFD a3, 10 * SIZE(AO2) + LFD a5, 12 * SIZE(AO2) + LFD a7, 14 * SIZE(AO2) + + FMSUBX y01, alpha2i, a2, y01 + FMADDX y02, alpha2r, a2, y02 + FMSUBX y03, alpha2i, a4, y03 + FMADDX y04, alpha2r, a4, y04 + + STFD y01, 0 * SIZE(Y2) + addi AO1, AO1, 16 * SIZE + STFD y02, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y03, 0 * SIZE(Y2) + nop + STFD y04, 1 * SIZE(Y2) + add Y2, Y2, INCY + + FMSUBX y05, alpha2i, a6, y05 + FMADDX y06, alpha2r, a6, y06 + FMSUBX y07, 
alpha2i, a8, y07 + FMADDX y08, alpha2r, a8, y08 + + LFD a2, 9 * SIZE(AO2) + LFD a4, 11 * SIZE(AO2) + LFD a6, 13 * SIZE(AO2) + LFD a8, 15 * SIZE(AO2) + + STFD y05, 0 * SIZE(Y2) + addi AO2, AO2, 16 * SIZE + STFD y06, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y07, 0 * SIZE(Y2) + nop + STFD y08, 1 * SIZE(Y2) + add Y2, Y2, INCY + + FMADD y09, alpha2r, a1, y09 + FMADD y10, alpha2i, a1, y10 + FMADD y11, alpha2r, a3, y11 + FMADD y12, alpha2i, a3, y12 + + FMADD y13, alpha2r, a5, y13 + FMADD y14, alpha2i, a5, y14 + FMADD y15, alpha2r, a7, y15 + FMADD y16, alpha2i, a7, y16 + + FMSUBX y09, alpha2i, a2, y09 + FMADDX y10, alpha2r, a2, y10 + FMSUBX y11, alpha2i, a4, y11 + FMADDX y12, alpha2r, a4, y12 + + STFD y09, 0 * SIZE(Y2) + nop + STFD y10, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y11, 0 * SIZE(Y2) + nop + STFD y12, 1 * SIZE(Y2) + add Y2, Y2, INCY + + FMSUBX y13, alpha2i, a6, y13 + FMADDX y14, alpha2r, a6, y14 + FMSUBX y15, alpha2i, a8, y15 + FMADDX y16, alpha2r, a8, y16 + + STFD y13, 0 * SIZE(Y2) + nop + STFD y14, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y15, 0 * SIZE(Y2) + nop + STFD y16, 1 * SIZE(Y2) + add Y2, Y2, INCY + .align 4 + +LL(125): + andi. r0, M, 7 + ble LL(130) + andi. r0, M, 4 + ble LL(126) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + add Y1, Y1, INCY + LFD y03, 0 * SIZE(Y1) + LFD y04, 1 * SIZE(Y1) + add Y1, Y1, INCY + + LFD a1, 0 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + + LFD y05, 0 * SIZE(Y1) + LFD y06, 1 * SIZE(Y1) + add Y1, Y1, INCY + LFD y07, 0 * SIZE(Y1) + LFD y08, 1 * SIZE(Y1) + add Y1, Y1, INCY + + LFD a2, 1 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMADD y03, alpha1r, a3, y03 + FMADD y04, alpha1i, a3, y04 + + FMADD y05, alpha1r, a5, y05 + FMADD y06, alpha1i, a5, y06 + FMADD y07, alpha1r, a7, y07 + FMADD y08, alpha1i, a7, y08 + + LFD a1, 0 * SIZE(AO2) + LFD a3, 2 * SIZE(AO2) + LFD a5, 4 * SIZE(AO2) + LFD a7, 6 * SIZE(AO2) + + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + FMSUBX y03, alpha1i, a4, y03 + FMADDX y04, alpha1r, a4, y04 + + FMSUBX y05, alpha1i, a6, y05 + FMADDX y06, alpha1r, a6, y06 + FMSUBX y07, alpha1i, a8, y07 + FMADDX y08, alpha1r, a8, y08 + + LFD a2, 1 * SIZE(AO2) + LFD a4, 3 * SIZE(AO2) + LFD a6, 5 * SIZE(AO2) + LFD a8, 7 * SIZE(AO2) + + FMADD y01, alpha2r, a1, y01 + FMADD y02, alpha2i, a1, y02 + FMADD y03, alpha2r, a3, y03 + FMADD y04, alpha2i, a3, y04 + + FMADD y05, alpha2r, a5, y05 + FMADD y06, alpha2i, a5, y06 + FMADD y07, alpha2r, a7, y07 + FMADD y08, alpha2i, a7, y08 + + FMSUBX y01, alpha2i, a2, y01 + FMADDX y02, alpha2r, a2, y02 + FMSUBX y03, alpha2i, a4, y03 + FMADDX y04, alpha2r, a4, y04 + + STFD y01, 0 * SIZE(Y2) + addi AO1, AO1, 8 * SIZE + STFD y02, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y03, 0 * SIZE(Y2) + addi AO2, AO2, 8 * SIZE + STFD y04, 1 * SIZE(Y2) + add Y2, Y2, INCY + + FMSUBX y05, alpha2i, a6, y05 + FMADDX y06, alpha2r, a6, y06 + FMSUBX y07, alpha2i, a8, y07 + FMADDX y08, alpha2r, a8, y08 + + STFD y05, 0 * SIZE(Y2) + nop + STFD y06, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y07, 0 * SIZE(Y2) + nop + STFD y08, 1 * SIZE(Y2) + add Y2, Y2, INCY + .align 4 + +LL(126): + andi. 
r0, M, 2 + ble LL(127) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + add Y1, Y1, INCY + LFD y03, 0 * SIZE(Y1) + LFD y04, 1 * SIZE(Y1) + add Y1, Y1, INCY + + LFD a5, 0 * SIZE(AO2) + LFD a6, 1 * SIZE(AO2) + LFD a7, 2 * SIZE(AO2) + LFD a8, 3 * SIZE(AO2) + + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMADD y03, alpha1r, a3, y03 + FMADD y04, alpha1i, a3, y04 + + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + FMSUBX y03, alpha1i, a4, y03 + FMADDX y04, alpha1r, a4, y04 + + FMADD y01, alpha2r, a5, y01 + FMADD y02, alpha2i, a5, y02 + FMADD y03, alpha2r, a7, y03 + FMADD y04, alpha2i, a7, y04 + + FMSUBX y01, alpha2i, a6, y01 + FMADDX y02, alpha2r, a6, y02 + FMSUBX y03, alpha2i, a8, y03 + FMADDX y04, alpha2r, a8, y04 + + STFD y01, 0 * SIZE(Y2) + addi AO1, AO1, 4 * SIZE + STFD y02, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y03, 0 * SIZE(Y2) + addi AO2, AO2, 4 * SIZE + STFD y04, 1 * SIZE(Y2) + add Y2, Y2, INCY + .align 4 + +LL(127): + andi. r0, M, 1 + ble LL(130) + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + add Y1, Y1, INCY + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 0 * SIZE(AO2) + LFD a4, 1 * SIZE(AO2) + + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + + FMADD y01, alpha2r, a3, y01 + FMADD y02, alpha2i, a3, y02 + FMSUBX y01, alpha2i, a4, y01 + FMADDX y02, alpha2r, a4, y02 + + STFD y01, 0 * SIZE(Y2) + STFD y02, 1 * SIZE(Y2) + add Y2, Y2, INCY + .align 4 + +LL(130): + andi. J, N, 1 + ble LL(999) + .align 4 + +LL(131): + lfd alpha_r, ALPHA_R + lfd alpha_i, ALPHA_I + + LFD a1, 0 * SIZE(X) + LFD a2, 1 * SIZE(X) + add X, X, INCX + + FMUL alpha1r, alpha_r, a1 + FMUL alpha1i, alpha_i, a1 + + FMSUBR alpha1r, alpha_i, a2, alpha1r + FMADDR alpha1i, alpha_r, a2, alpha1i + + mr AO1, A + add A, AO1, LDA + + mr Y1, Y + mr Y2, Y + + srawi. 
r0, M, 3 + mtspr CTR, r0 + ble LL(135) + .align 4 + + LFD y01, 0 * SIZE(Y1) + LFD y02, 1 * SIZE(Y1) + add Y1, Y1, INCY + LFD y03, 0 * SIZE(Y1) + LFD y04, 1 * SIZE(Y1) + add Y1, Y1, INCY + + LFD a1, 0 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + + LFD y05, 0 * SIZE(Y1) + LFD y06, 1 * SIZE(Y1) + add Y1, Y1, INCY + LFD y07, 0 * SIZE(Y1) + LFD y08, 1 * SIZE(Y1) + add Y1, Y1, INCY + + LFD a2, 1 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + LFD y09, 0 * SIZE(Y1) + LFD y10, 1 * SIZE(Y1) + add Y1, Y1, INCY + LFD y11, 0 * SIZE(Y1) + LFD y12, 1 * SIZE(Y1) + add Y1, Y1, INCY + LFD y13, 0 * SIZE(Y1) + LFD y14, 1 * SIZE(Y1) + add Y1, Y1, INCY + LFD y15, 0 * SIZE(Y1) + LFD y16, 1 * SIZE(Y1) + add Y1, Y1, INCY + + bdz LL(133) + .align 4 + +LL(132): + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMADD y03, alpha1r, a3, y03 + FMADD y04, alpha1i, a3, y04 + + FMADD y05, alpha1r, a5, y05 + FMADD y06, alpha1i, a5, y06 + FMADD y07, alpha1r, a7, y07 + FMADD y08, alpha1i, a7, y08 + + LFD a1, 8 * SIZE(AO1) + LFD a3, 10 * SIZE(AO1) + LFD a5, 12 * SIZE(AO1) + LFD a7, 14 * SIZE(AO1) + + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + FMSUBX y03, alpha1i, a4, y03 + FMADDX y04, alpha1r, a4, y04 + + STFD y01, 0 * SIZE(Y2) + nop + STFD y02, 1 * SIZE(Y2) + add Y2, Y2, INCY + + LFD y01, 0 * SIZE(Y1) + nop + LFD y02, 1 * SIZE(Y1) + add Y1, Y1, INCY + + STFD y03, 0 * SIZE(Y2) + nop + STFD y04, 1 * SIZE(Y2) + add Y2, Y2, INCY + + LFD y03, 0 * SIZE(Y1) + nop + LFD y04, 1 * SIZE(Y1) + add Y1, Y1, INCY + + FMSUBX y05, alpha1i, a6, y05 + FMADDX y06, alpha1r, a6, y06 + FMSUBX y07, alpha1i, a8, y07 + FMADDX y08, alpha1r, a8, y08 + + LFD a2, 9 * SIZE(AO1) + LFD a4, 11 * SIZE(AO1) + LFD a6, 13 * SIZE(AO1) + LFD a8, 15 * SIZE(AO1) + + addi AO1, AO1, 16 * SIZE + nop + DCBT(AO1, PREA) + nop + + STFD y05, 0 * SIZE(Y2) + nop + STFD y06, 1 * SIZE(Y2) + add Y2, Y2, INCY + + LFD y05, 0 * SIZE(Y1) + nop + LFD y06, 1 * SIZE(Y1) + add Y1, Y1, INCY + + STFD y07, 0 * SIZE(Y2) + nop + STFD y08, 1 * SIZE(Y2) + add Y2, Y2, INCY + + LFD y07, 0 * SIZE(Y1) + nop + LFD y08, 1 * SIZE(Y1) + add Y1, Y1, INCY + + FMADD y09, alpha1r, a1, y09 + FMADD y10, alpha1i, a1, y10 + FMADD y11, alpha1r, a3, y11 + FMADD y12, alpha1i, a3, y12 + + FMADD y13, alpha1r, a5, y13 + FMADD y14, alpha1i, a5, y14 + FMADD y15, alpha1r, a7, y15 + FMADD y16, alpha1i, a7, y16 + + LFD a1, 0 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + + FMSUBX y09, alpha1i, a2, y09 + FMADDX y10, alpha1r, a2, y10 + FMSUBX y11, alpha1i, a4, y11 + FMADDX y12, alpha1r, a4, y12 + + STFD y09, 0 * SIZE(Y2) + nop + STFD y10, 1 * SIZE(Y2) + add Y2, Y2, INCY + + LFD y09, 0 * SIZE(Y1) + nop + LFD y10, 1 * SIZE(Y1) + add Y1, Y1, INCY + + STFD y11, 0 * SIZE(Y2) + nop + STFD y12, 1 * SIZE(Y2) + add Y2, Y2, INCY + + LFD y11, 0 * SIZE(Y1) + nop + LFD y12, 1 * SIZE(Y1) + add Y1, Y1, INCY + + FMSUBX y13, alpha1i, a6, y13 + FMADDX y14, alpha1r, a6, y14 + FMSUBX y15, alpha1i, a8, y15 + FMADDX y16, alpha1r, a8, y16 + + LFD a2, 1 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + STFD y13, 0 * SIZE(Y2) + nop + STFD y14, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y15, 0 * SIZE(Y2) + nop + STFD y16, 1 * SIZE(Y2) + add Y2, Y2, INCY + + LFD y13, 0 * SIZE(Y1) + nop + LFD y14, 1 * SIZE(Y1) + add Y1, Y1, INCY + + LFD y15, 0 * SIZE(Y1) + nop + LFD y16, 1 * SIZE(Y1) + add Y1, Y1, INCY + + DCBT(Y1, PREC) + bdnz LL(132) + .align 4 + +LL(133): + FMADD 
y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMADD y03, alpha1r, a3, y03 + FMADD y04, alpha1i, a3, y04 + + FMADD y05, alpha1r, a5, y05 + FMADD y06, alpha1i, a5, y06 + FMADD y07, alpha1r, a7, y07 + FMADD y08, alpha1i, a7, y08 + + LFD a1, 8 * SIZE(AO1) + LFD a3, 10 * SIZE(AO1) + LFD a5, 12 * SIZE(AO1) + LFD a7, 14 * SIZE(AO1) + + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + FMSUBX y03, alpha1i, a4, y03 + FMADDX y04, alpha1r, a4, y04 + + FMSUBX y05, alpha1i, a6, y05 + FMADDX y06, alpha1r, a6, y06 + FMSUBX y07, alpha1i, a8, y07 + FMADDX y08, alpha1r, a8, y08 + + LFD a2, 9 * SIZE(AO1) + LFD a4, 11 * SIZE(AO1) + LFD a6, 13 * SIZE(AO1) + LFD a8, 15 * SIZE(AO1) + + FMADD y09, alpha1r, a1, y09 + FMADD y10, alpha1i, a1, y10 + FMADD y11, alpha1r, a3, y11 + FMADD y12, alpha1i, a3, y12 + + FMADD y13, alpha1r, a5, y13 + FMADD y14, alpha1i, a5, y14 + FMADD y15, alpha1r, a7, y15 + FMADD y16, alpha1i, a7, y16 + + FMSUBX y09, alpha1i, a2, y09 + FMADDX y10, alpha1r, a2, y10 + FMSUBX y11, alpha1i, a4, y11 + FMADDX y12, alpha1r, a4, y12 + + FMSUBX y13, alpha1i, a6, y13 + FMADDX y14, alpha1r, a6, y14 + FMSUBX y15, alpha1i, a8, y15 + FMADDX y16, alpha1r, a8, y16 + + STFD y01, 0 * SIZE(Y2) + addi AO1, AO1, 16 * SIZE + STFD y02, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y03, 0 * SIZE(Y2) + nop + STFD y04, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y05, 0 * SIZE(Y2) + nop + STFD y06, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y07, 0 * SIZE(Y2) + nop + STFD y08, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y09, 0 * SIZE(Y2) + nop + STFD y10, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y11, 0 * SIZE(Y2) + nop + STFD y12, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y13, 0 * SIZE(Y2) + nop + STFD y14, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y15, 0 * SIZE(Y2) + nop + STFD y16, 1 * SIZE(Y2) + add Y2, Y2, INCY + .align 4 + +LL(135): + andi. r0, M, 7 + ble LL(999) + andi. r0, M, 4 + ble LL(136) + + LFD y01, 0 * SIZE(Y1) + nop + LFD y02, 1 * SIZE(Y1) + add Y1, Y1, INCY + + LFD y03, 0 * SIZE(Y1) + nop + LFD y04, 1 * SIZE(Y1) + add Y1, Y1, INCY + + LFD y05, 0 * SIZE(Y1) + nop + LFD y06, 1 * SIZE(Y1) + add Y1, Y1, INCY + + LFD y07, 0 * SIZE(Y1) + nop + LFD y08, 1 * SIZE(Y1) + add Y1, Y1, INCY + + LFD a1, 0 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + + LFD a2, 1 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMADD y03, alpha1r, a3, y03 + FMADD y04, alpha1i, a3, y04 + + FMADD y05, alpha1r, a5, y05 + FMADD y06, alpha1i, a5, y06 + FMADD y07, alpha1r, a7, y07 + FMADD y08, alpha1i, a7, y08 + + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + FMSUBX y03, alpha1i, a4, y03 + FMADDX y04, alpha1r, a4, y04 + + FMSUBX y05, alpha1i, a6, y05 + FMADDX y06, alpha1r, a6, y06 + FMSUBX y07, alpha1i, a8, y07 + FMADDX y08, alpha1r, a8, y08 + + STFD y01, 0 * SIZE(Y2) + addi AO1, AO1, 8 * SIZE + STFD y02, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y03, 0 * SIZE(Y2) + nop + STFD y04, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y05, 0 * SIZE(Y2) + nop + STFD y06, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y07, 0 * SIZE(Y2) + nop + STFD y08, 1 * SIZE(Y2) + add Y2, Y2, INCY + .align 4 + +LL(136): + andi. 
r0, M, 2 + ble LL(137) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + + LFD y01, 0 * SIZE(Y1) + nop + LFD y02, 1 * SIZE(Y1) + add Y1, Y1, INCY + LFD y03, 0 * SIZE(Y1) + nop + LFD y04, 1 * SIZE(Y1) + add Y1, Y1, INCY + + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMADD y03, alpha1r, a3, y03 + FMADD y04, alpha1i, a3, y04 + + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + FMSUBX y03, alpha1i, a4, y03 + FMADDX y04, alpha1r, a4, y04 + + STFD y01, 0 * SIZE(Y2) + addi AO1, AO1, 4 * SIZE + STFD y02, 1 * SIZE(Y2) + add Y2, Y2, INCY + + STFD y03, 0 * SIZE(Y2) + nop + STFD y04, 1 * SIZE(Y2) + add Y2, Y2, INCY + .align 4 + +LL(137): + andi. r0, M, 1 + ble LL(999) + + LFD y01, 0 * SIZE(Y1) + nop + LFD y02, 1 * SIZE(Y1) + add Y1, Y1, INCY + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + + STFD y01, 0 * SIZE(Y2) + nop + STFD y02, 1 * SIZE(Y2) + add Y2, Y2, INCY + .align 4 + +LL(999): + li r3, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r14, 144(SP) + ld r15, 152(SP) + ld r16, 160(SP) + ld r17, 168(SP) + ld r18, 176(SP) + ld r19, 184(SP) + ld r20, 192(SP) + ld r21, 200(SP) + ld r22, 208(SP) +#else + lwz r14, 144(SP) + lwz r15, 148(SP) + lwz r16, 152(SP) + lwz r17, 156(SP) + lwz r18, 160(SP) + lwz r19, 164(SP) + lwz r20, 168(SP) + lwz r21, 172(SP) + lwz r22, 176(SP) +#endif + + addi SP, SP, STACKSIZE + blr + + EPILOGUE +#endif diff --git a/kernel/power/zgemv_n_ppc440.S b/kernel/power/zgemv_n_ppc440.S new file mode 100644 index 0000000000..690eb0d463 --- /dev/null +++ b/kernel/power/zgemv_n_ppc440.S @@ -0,0 +1,1386 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef linux +#ifndef __64BIT__ +#define M r3 +#define N r4 +#define A r6 +#define LDA r7 +#define X r8 +#define INCX r9 +#define Y r10 +#define INCY r5 +#else +#define M r3 +#define N r4 +#define A r8 +#define LDA r9 +#define X r10 +#define INCX r5 +#define Y r6 +#define INCY r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define M r3 +#define N r4 +#define A r10 +#define LDA r5 +#define X r6 +#define INCX r7 +#define Y r8 +#define INCY r9 +#else +#define M r3 +#define N r4 +#define A r8 +#define LDA r9 +#define X r10 +#define INCX r5 +#define Y r6 +#define INCY r7 +#endif +#endif + +#define I r11 +#define J r12 + +#define AO1 r14 +#define AO2 r15 +#define AO3 r16 +#define AO4 r17 + +#define Y1 r18 +#define Y2 r19 +#define PREA r20 +#define YY r21 +#define BUFFER r22 + +#define y01 f0 +#define y02 f1 +#define y03 f2 +#define y04 f3 +#define y05 f4 +#define y06 f5 +#define y07 f6 +#define y08 f7 +#define y09 f8 +#define y10 f9 +#define y11 f10 +#define y12 f11 +#define y13 f12 +#define y14 f13 +#define y15 f14 +#define y16 f15 + +#define alpha1r f16 +#define alpha1i f17 +#define alpha2r f18 +#define alpha2i f19 +#define alpha3r f20 +#define alpha3i f21 +#define alpha4r f22 +#define alpha4i f23 + +#define a1 f24 +#define a2 f25 +#define a3 f26 +#define a4 f27 +#define a5 f28 +#define a6 f29 +#define a7 f30 +#define a8 f31 + +#define alpha_r f14 +#define alpha_i f15 + +#if defined(PPCG4) +#define PREFETCHSIZE_A (3 * 4) +#endif + +#if defined(POWER6) +#define PREFETCHSIZE_A (3 * 4) +#endif + +#ifndef XCONJ +#define FMADDR FMADD +#define FMSUBR FNMSUB +#else +#define FMADDR FNMSUB +#define FMSUBR FMADD +#endif + +#ifndef CONJ +#define FMADDX FMADD +#define FMSUBX FNMSUB +#else +#define FMADDX FNMSUB +#define FMSUBX FMADD +#endif + +#ifndef NEEDPARAM + +#ifndef __64BIT__ +#define STACKSIZE 232 +#define ALPHA_R 208(SP) +#define ALPHA_I 216(SP) +#define FZERO 224(SP) +#else +#define STACKSIZE 280 +#define ALPHA_R 256(SP) +#define ALPHA_I 264(SP) +#define FZERO 272(SP) +#endif + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r0, FZERO + std r14, 144(SP) + std r15, 152(SP) + std r16, 160(SP) + std r17, 
168(SP) + std r18, 176(SP) + std r19, 184(SP) + std r20, 192(SP) + std r21, 200(SP) + std r22, 208(SP) +#else + stw r0, 0 + FZERO + stw r0, 4 + FZERO + stw r14, 144(SP) + stw r15, 148(SP) + stw r16, 152(SP) + stw r17, 156(SP) + stw r18, 160(SP) + stw r19, 164(SP) + stw r20, 168(SP) + stw r21, 172(SP) + stw r22, 176(SP) +#endif + +#ifdef linux +#ifndef __64BIT__ + lwz INCY, 8 + STACKSIZE(SP) + lwz BUFFER, 12 + STACKSIZE(SP) +#else + ld INCX, 112 + STACKSIZE(SP) + ld Y, 120 + STACKSIZE(SP) + ld INCY, 128 + STACKSIZE(SP) + ld BUFFER, 136 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifndef __64BIT__ +#ifdef DOUBLE + lwz LDA, 56 + STACKSIZE(SP) + lwz X, 60 + STACKSIZE(SP) + lwz INCX, 64 + STACKSIZE(SP) + lwz Y, 68 + STACKSIZE(SP) + lwz INCY, 72 + STACKSIZE(SP) + lwz BUFFER, 76 + STACKSIZE(SP) +#else + lwz INCX, 56 + STACKSIZE(SP) + lwz Y, 60 + STACKSIZE(SP) + lwz INCY, 64 + STACKSIZE(SP) + lwz BUFFER, 68 + STACKSIZE(SP) +#endif +#else + ld INCX, 112 + STACKSIZE(SP) + ld Y, 120 + STACKSIZE(SP) + ld INCY, 128 + STACKSIZE(SP) + ld BUFFER, 136 + STACKSIZE(SP) +#endif +#endif + + stfd f1, ALPHA_R + stfd f2, ALPHA_I + + slwi LDA, LDA, ZBASE_SHIFT + slwi INCX, INCX, ZBASE_SHIFT + slwi INCY, INCY, ZBASE_SHIFT + + addi INCX, INCX, -SIZE + addi INCY, INCY, -SIZE + addi A, A, -SIZE + + cmpwi cr0, M, 0 + ble- LL(999) + + sub X, X, INCX + cmpwi cr0, N, 0 + sub Y, Y, INCY + ble- LL(999) + + li PREA, PREFETCHSIZE_A * SIZE + + mr YY, Y + lfd f0, FZERO + + cmpi cr0, 0, INCY, SIZE + beq LL(10) + + addi YY, BUFFER, -SIZE + addi Y1, BUFFER, -SIZE + + addi r0, M, 3 + srawi. r0, r0, 2 + mtspr CTR, r0 + .align 4 + +LL(02): + STFDU f0, 1 * SIZE(Y1) + STFDU f0, 1 * SIZE(Y1) + STFDU f0, 1 * SIZE(Y1) + STFDU f0, 1 * SIZE(Y1) + STFDU f0, 1 * SIZE(Y1) + STFDU f0, 1 * SIZE(Y1) + STFDU f0, 1 * SIZE(Y1) + STFDU f0, 1 * SIZE(Y1) + bdnz LL(02) + .align 4 + +LL(10): + srawi. J, N, 2 + ble LL(20) + .align 4 + +LL(11): + lfd alpha_r, ALPHA_R + lfd alpha_i, ALPHA_I + + LFDUX a1, X, INCX + LFDU a2, 1 * SIZE(X) + LFDUX a3, X, INCX + LFDU a4, 1 * SIZE(X) + LFDUX a5, X, INCX + LFDU a6, 1 * SIZE(X) + LFDUX a7, X, INCX + LFDU a8, 1 * SIZE(X) + + FMUL alpha1r, alpha_r, a1 + FMUL alpha1i, alpha_i, a1 + FMUL alpha2r, alpha_r, a3 + FMUL alpha2i, alpha_i, a3 + + FMUL alpha3r, alpha_r, a5 + mr Y1, YY + FMUL alpha3i, alpha_i, a5 + mr Y2, YY + FMUL alpha4r, alpha_r, a7 + mr AO1, A + FMUL alpha4i, alpha_i, a7 + add AO2, A, LDA + + FMSUBR alpha1r, alpha_i, a2, alpha1r + add AO3, AO2, LDA + FMADDR alpha1i, alpha_r, a2, alpha1i + add AO4, AO3, LDA + FMSUBR alpha2r, alpha_i, a4, alpha2r + add A, AO4, LDA + FMADDR alpha2i, alpha_r, a4, alpha2i + + FMSUBR alpha3r, alpha_i, a6, alpha3r + srawi. 
r0, M, 2 + FMADDR alpha3i, alpha_r, a6, alpha3i + FMSUBR alpha4r, alpha_i, a8, alpha4r + mtspr CTR, r0 + FMADDR alpha4i, alpha_r, a8, alpha4i + ble LL(15) + .align 4 + + LFDU a1, 1 * SIZE(AO1) + LFDU y01, 1 * SIZE(Y1) + LFDU a2, 1 * SIZE(AO1) + LFDU y02, 1 * SIZE(Y1) + LFDU a3, 1 * SIZE(AO1) + LFDU y03, 1 * SIZE(Y1) + LFDU a4, 1 * SIZE(AO1) + LFDU y04, 1 * SIZE(Y1) + + LFDU a5, 1 * SIZE(AO1) + LFDU y05, 1 * SIZE(Y1) + LFDU a6, 1 * SIZE(AO1) + LFDU y06, 1 * SIZE(Y1) + LFDU a7, 1 * SIZE(AO1) + LFDU y07, 1 * SIZE(Y1) + LFDU a8, 1 * SIZE(AO1) + LFDU y08, 1 * SIZE(Y1) + + FMADD y09, alpha1r, a1, y01 + FMADD y10, alpha1i, a1, y02 + FMADD y11, alpha1r, a3, y03 + FMADD y12, alpha1i, a3, y04 + + FMADD y13, alpha1r, a5, y05 + FMADD y14, alpha1i, a5, y06 + FMADD y15, alpha1r, a7, y07 + FMADD y16, alpha1i, a7, y08 + + bdz LL(13) + .align 4 + +LL(12): + FMSUBX y09, alpha1i, a2, y09 + LFDU a1, 1 * SIZE(AO2) + FMADDX y10, alpha1r, a2, y10 + LFDU a2, 1 * SIZE(AO2) + FMSUBX y11, alpha1i, a4, y11 + LFDU a3, 1 * SIZE(AO2) + FMADDX y12, alpha1r, a4, y12 + LFDU a4, 1 * SIZE(AO2) +#ifdef PPCG4 + dcbt AO2, PREA +#endif + + FMSUBX y13, alpha1i, a6, y13 + LFDU a5, 1 * SIZE(AO2) + FMADDX y14, alpha1r, a6, y14 + LFDU a6, 1 * SIZE(AO2) + FMSUBX y15, alpha1i, a8, y15 + LFDU a7, 1 * SIZE(AO2) + FMADDX y16, alpha1r, a8, y16 + LFDU a8, 1 * SIZE(AO2) +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO2, PREA +#endif + + FMADD y09, alpha2r, a1, y09 + LFDU y01, 1 * SIZE(Y1) + FMADD y10, alpha2i, a1, y10 + LFDU y02, 1 * SIZE(Y1) + FMADD y11, alpha2r, a3, y11 + LFDU y03, 1 * SIZE(Y1) + FMADD y12, alpha2i, a3, y12 + LFDU y04, 1 * SIZE(Y1) + +#ifdef PPCG4 + dcbtst Y1, PREA +#endif + + FMADD y13, alpha2r, a5, y13 + FMADD y14, alpha2i, a5, y14 + FMADD y15, alpha2r, a7, y15 + FMADD y16, alpha2i, a7, y16 + + FMSUBX y09, alpha2i, a2, y09 + LFDU a1, 1 * SIZE(AO3) + FMADDX y10, alpha2r, a2, y10 + LFDU a2, 1 * SIZE(AO3) + FMSUBX y11, alpha2i, a4, y11 + LFDU a3, 1 * SIZE(AO3) + FMADDX y12, alpha2r, a4, y12 + LFDU a4, 1 * SIZE(AO3) + +#ifdef PPCG4 + dcbt AO3, PREA +#endif + + FMSUBX y13, alpha2i, a6, y13 + LFDU a5, 1 * SIZE(AO3) + FMADDX y14, alpha2r, a6, y14 + LFDU a6, 1 * SIZE(AO3) + FMSUBX y15, alpha2i, a8, y15 + LFDU a7, 1 * SIZE(AO3) + FMADDX y16, alpha2r, a8, y16 + LFDU a8, 1 * SIZE(AO3) + +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO3, PREA +#endif + + FMADD y09, alpha3r, a1, y09 + LFDU y05, 1 * SIZE(Y1) + FMADD y10, alpha3i, a1, y10 + LFDU y06, 1 * SIZE(Y1) + FMADD y11, alpha3r, a3, y11 + LFDU y07, 1 * SIZE(Y1) + FMADD y12, alpha3i, a3, y12 + LFDU y08, 1 * SIZE(Y1) + +#if defined(PPCG4) && defined(DOUBLE) + dcbtst Y1, PREA +#endif + + FMADD y13, alpha3r, a5, y13 + FMADD y14, alpha3i, a5, y14 + FMADD y15, alpha3r, a7, y15 + FMADD y16, alpha3i, a7, y16 + + FMSUBX y09, alpha3i, a2, y09 + LFDU a1, 1 * SIZE(AO4) + FMADDX y10, alpha3r, a2, y10 + LFDU a2, 1 * SIZE(AO4) + FMSUBX y11, alpha3i, a4, y11 + LFDU a3, 1 * SIZE(AO4) + FMADDX y12, alpha3r, a4, y12 + LFDU a4, 1 * SIZE(AO4) + +#ifdef PPCG4 + dcbt AO4, PREA +#endif + + FMSUBX y13, alpha3i, a6, y13 + LFDU a5, 1 * SIZE(AO4) + FMADDX y14, alpha3r, a6, y14 + LFDU a6, 1 * SIZE(AO4) + FMSUBX y15, alpha3i, a8, y15 + LFDU a7, 1 * SIZE(AO4) + FMADDX y16, alpha3r, a8, y16 + LFDU a8, 1 * SIZE(AO4) + +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO4, PREA +#endif + + FMADD y09, alpha4r, a1, y09 + FMADD y10, alpha4i, a1, y10 + FMADD y11, alpha4r, a3, y11 + FMADD y12, alpha4i, a3, y12 + + FMADD y13, alpha4r, a5, y13 + FMADD y14, alpha4i, a5, y14 + FMADD y15, alpha4r, a7, y15 + FMADD y16, alpha4i, 
a7, y16 + + FMSUBX y09, alpha4i, a2, y09 + LFDU a1, 1 * SIZE(AO1) + FMADDX y10, alpha4r, a2, y10 + LFDU a2, 1 * SIZE(AO1) + FMSUBX y11, alpha4i, a4, y11 + LFDU a3, 1 * SIZE(AO1) + FMADDX y12, alpha4r, a4, y12 + LFDU a4, 1 * SIZE(AO1) + +#ifdef PPCG4 + dcbt AO1, PREA +#endif + + FMSUBX y13, alpha4i, a6, y13 + LFDU a5, 1 * SIZE(AO1) + FMADDX y14, alpha4r, a6, y14 + LFDU a6, 1 * SIZE(AO1) + FMSUBX y15, alpha4i, a8, y15 + LFDU a7, 1 * SIZE(AO1) + FMADDX y16, alpha4r, a8, y16 + LFDU a8, 1 * SIZE(AO1) + +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO1, PREA +#endif + + STFDU y09, 1 * SIZE(Y2) + FMADD y09, alpha1r, a1, y01 + STFDU y10, 1 * SIZE(Y2) + FMADD y10, alpha1i, a1, y02 + STFDU y11, 1 * SIZE(Y2) + FMADD y11, alpha1r, a3, y03 + STFDU y12, 1 * SIZE(Y2) + FMADD y12, alpha1i, a3, y04 + + STFDU y13, 1 * SIZE(Y2) + FMADD y13, alpha1r, a5, y05 + STFDU y14, 1 * SIZE(Y2) + FMADD y14, alpha1i, a5, y06 + STFDU y15, 1 * SIZE(Y2) + FMADD y15, alpha1r, a7, y07 + STFDU y16, 1 * SIZE(Y2) + FMADD y16, alpha1i, a7, y08 + bdnz LL(12) + .align 4 + +LL(13): + FMSUBX y09, alpha1i, a2, y09 + LFDU a1, 1 * SIZE(AO2) + FMADDX y10, alpha1r, a2, y10 + LFDU a2, 1 * SIZE(AO2) + FMSUBX y11, alpha1i, a4, y11 + LFDU a3, 1 * SIZE(AO2) + FMADDX y12, alpha1r, a4, y12 + LFDU a4, 1 * SIZE(AO2) + + FMSUBX y13, alpha1i, a6, y13 + LFDU a5, 1 * SIZE(AO2) + FMADDX y14, alpha1r, a6, y14 + LFDU a6, 1 * SIZE(AO2) + FMSUBX y15, alpha1i, a8, y15 + LFDU a7, 1 * SIZE(AO2) + FMADDX y16, alpha1r, a8, y16 + LFDU a8, 1 * SIZE(AO2) + + FMADD y09, alpha2r, a1, y09 + FMADD y10, alpha2i, a1, y10 + FMADD y11, alpha2r, a3, y11 + FMADD y12, alpha2i, a3, y12 + + FMADD y13, alpha2r, a5, y13 + FMADD y14, alpha2i, a5, y14 + FMADD y15, alpha2r, a7, y15 + FMADD y16, alpha2i, a7, y16 + + FMSUBX y09, alpha2i, a2, y09 + LFDU a1, 1 * SIZE(AO3) + FMADDX y10, alpha2r, a2, y10 + LFDU a2, 1 * SIZE(AO3) + FMSUBX y11, alpha2i, a4, y11 + LFDU a3, 1 * SIZE(AO3) + FMADDX y12, alpha2r, a4, y12 + LFDU a4, 1 * SIZE(AO3) + + FMSUBX y13, alpha2i, a6, y13 + LFDU a5, 1 * SIZE(AO3) + FMADDX y14, alpha2r, a6, y14 + LFDU a6, 1 * SIZE(AO3) + FMSUBX y15, alpha2i, a8, y15 + LFDU a7, 1 * SIZE(AO3) + FMADDX y16, alpha2r, a8, y16 + LFDU a8, 1 * SIZE(AO3) + + FMADD y09, alpha3r, a1, y09 + FMADD y10, alpha3i, a1, y10 + FMADD y11, alpha3r, a3, y11 + FMADD y12, alpha3i, a3, y12 + + FMADD y13, alpha3r, a5, y13 + FMADD y14, alpha3i, a5, y14 + FMADD y15, alpha3r, a7, y15 + FMADD y16, alpha3i, a7, y16 + + FMSUBX y09, alpha3i, a2, y09 + LFDU a1, 1 * SIZE(AO4) + FMADDX y10, alpha3r, a2, y10 + LFDU a2, 1 * SIZE(AO4) + FMSUBX y11, alpha3i, a4, y11 + LFDU a3, 1 * SIZE(AO4) + FMADDX y12, alpha3r, a4, y12 + LFDU a4, 1 * SIZE(AO4) + + FMSUBX y13, alpha3i, a6, y13 + LFDU a5, 1 * SIZE(AO4) + FMADDX y14, alpha3r, a6, y14 + LFDU a6, 1 * SIZE(AO4) + FMSUBX y15, alpha3i, a8, y15 + LFDU a7, 1 * SIZE(AO4) + FMADDX y16, alpha3r, a8, y16 + LFDU a8, 1 * SIZE(AO4) + + FMADD y09, alpha4r, a1, y09 + FMADD y10, alpha4i, a1, y10 + FMADD y11, alpha4r, a3, y11 + FMADD y12, alpha4i, a3, y12 + + FMADD y13, alpha4r, a5, y13 + FMADD y14, alpha4i, a5, y14 + FMADD y15, alpha4r, a7, y15 + FMADD y16, alpha4i, a7, y16 + + FMSUBX y09, alpha4i, a2, y09 + FMADDX y10, alpha4r, a2, y10 + FMSUBX y11, alpha4i, a4, y11 + FMADDX y12, alpha4r, a4, y12 + + FMSUBX y13, alpha4i, a6, y13 + STFDU y09, 1 * SIZE(Y2) + FMADDX y14, alpha4r, a6, y14 + STFDU y10, 1 * SIZE(Y2) + FMSUBX y15, alpha4i, a8, y15 + STFDU y11, 1 * SIZE(Y2) + FMADDX y16, alpha4r, a8, y16 + STFDU y12, 1 * SIZE(Y2) + + STFDU y13, 1 * SIZE(Y2) + STFDU y14, 1 * SIZE(Y2) + 
STFDU y15, 1 * SIZE(Y2) + STFDU y16, 1 * SIZE(Y2) + .align 4 + +LL(15): + andi. r0, M, 2 + ble LL(17) + + LFDU a1, 1 * SIZE(AO1) + LFDU y01, 1 * SIZE(Y1) + LFDU a2, 1 * SIZE(AO1) + LFDU y02, 1 * SIZE(Y1) + LFDU a3, 1 * SIZE(AO1) + LFDU y03, 1 * SIZE(Y1) + LFDU a4, 1 * SIZE(AO1) + LFDU y04, 1 * SIZE(Y1) + + FMADD y01, alpha1r, a1, y01 + LFDU a5, 1 * SIZE(AO2) + FMADD y02, alpha1i, a1, y02 + LFDU a6, 1 * SIZE(AO2) + FMADD y03, alpha1r, a3, y03 + LFDU a7, 1 * SIZE(AO2) + FMADD y04, alpha1i, a3, y04 + LFDU a8, 1 * SIZE(AO2) + + FMSUBX y01, alpha1i, a2, y01 + LFDU a1, 1 * SIZE(AO3) + FMADDX y02, alpha1r, a2, y02 + LFDU a2, 1 * SIZE(AO3) + FMSUBX y03, alpha1i, a4, y03 + LFDU a3, 1 * SIZE(AO3) + FMADDX y04, alpha1r, a4, y04 + LFDU a4, 1 * SIZE(AO3) + + FMADD y01, alpha2r, a5, y01 + FMADD y02, alpha2i, a5, y02 + FMADD y03, alpha2r, a7, y03 + FMADD y04, alpha2i, a7, y04 + + FMSUBX y01, alpha2i, a6, y01 + LFDU a5, 1 * SIZE(AO4) + FMADDX y02, alpha2r, a6, y02 + LFDU a6, 1 * SIZE(AO4) + FMSUBX y03, alpha2i, a8, y03 + LFDU a7, 1 * SIZE(AO4) + FMADDX y04, alpha2r, a8, y04 + LFDU a8, 1 * SIZE(AO4) + + FMADD y01, alpha3r, a1, y01 + FMADD y02, alpha3i, a1, y02 + FMADD y03, alpha3r, a3, y03 + FMADD y04, alpha3i, a3, y04 + + FMSUBX y01, alpha3i, a2, y01 + FMADDX y02, alpha3r, a2, y02 + FMSUBX y03, alpha3i, a4, y03 + FMADDX y04, alpha3r, a4, y04 + + FMADD y01, alpha4r, a5, y01 + FMADD y02, alpha4i, a5, y02 + FMADD y03, alpha4r, a7, y03 + FMADD y04, alpha4i, a7, y04 + + FMSUBX y01, alpha4i, a6, y01 + FMADDX y02, alpha4r, a6, y02 + FMSUBX y03, alpha4i, a8, y03 + FMADDX y04, alpha4r, a8, y04 + + STFDU y01, 1 * SIZE(Y2) + STFDU y02, 1 * SIZE(Y2) + STFDU y03, 1 * SIZE(Y2) + STFDU y04, 1 * SIZE(Y2) + .align 4 + +LL(17): + andi. r0, M, 1 + ble LL(19) + + LFDU y01, 1 * SIZE(Y1) + LFDU y02, 1 * SIZE(Y1) + + LFDU a1, 1 * SIZE(AO1) + LFDU a2, 1 * SIZE(AO1) + LFDU a3, 1 * SIZE(AO2) + LFDU a4, 1 * SIZE(AO2) + + FMADD y01, alpha1r, a1, y01 + LFDU a5, 1 * SIZE(AO3) + FMADD y02, alpha1i, a1, y02 + LFDU a6, 1 * SIZE(AO3) + FMSUBX y01, alpha1i, a2, y01 + LFDU a7, 1 * SIZE(AO4) + FMADDX y02, alpha1r, a2, y02 + LFDU a8, 1 * SIZE(AO4) + + FMADD y01, alpha2r, a3, y01 + FMADD y02, alpha2i, a3, y02 + FMSUBX y01, alpha2i, a4, y01 + FMADDX y02, alpha2r, a4, y02 + + FMADD y01, alpha3r, a5, y01 + FMADD y02, alpha3i, a5, y02 + FMSUBX y01, alpha3i, a6, y01 + FMADDX y02, alpha3r, a6, y02 + + FMADD y01, alpha4r, a7, y01 + FMADD y02, alpha4i, a7, y02 + FMSUBX y01, alpha4i, a8, y01 + FMADDX y02, alpha4r, a8, y02 + + STFDU y01, 1 * SIZE(Y2) + STFDU y02, 1 * SIZE(Y2) + .align 4 + +LL(19): + addi J, J, -1 + cmpi cr0, 0, J, 0 + bgt LL(11) + .align 4 + +LL(20): + andi. J, N, 2 + ble LL(30) + + lfd alpha_r, ALPHA_R + lfd alpha_i, ALPHA_I + + LFDUX a1, X, INCX + LFDU a2, 1 * SIZE(X) + LFDUX a3, X, INCX + LFDU a4, 1 * SIZE(X) + + FMUL alpha1r, alpha_r, a1 + mr Y1, YY + FMUL alpha1i, alpha_i, a1 + mr Y2, YY + FMUL alpha2r, alpha_r, a3 + mr AO1, A + FMUL alpha2i, alpha_i, a3 + add AO2, A, LDA + + FMSUBR alpha1r, alpha_i, a2, alpha1r + add A, AO2, LDA + FMADDR alpha1i, alpha_r, a2, alpha1i + srawi. 
r0, M, 2 + FMSUBR alpha2r, alpha_i, a4, alpha2r + mtspr CTR, r0 + FMADDR alpha2i, alpha_r, a4, alpha2i + ble LL(25) + .align 4 + + LFDU a1, 1 * SIZE(AO1) + LFDU y01, 1 * SIZE(Y1) + LFDU a2, 1 * SIZE(AO1) + LFDU y02, 1 * SIZE(Y1) + LFDU a3, 1 * SIZE(AO1) + LFDU y03, 1 * SIZE(Y1) + LFDU a4, 1 * SIZE(AO1) + LFDU y04, 1 * SIZE(Y1) + + LFDU a5, 1 * SIZE(AO1) + LFDU y05, 1 * SIZE(Y1) + LFDU a6, 1 * SIZE(AO1) + LFDU y06, 1 * SIZE(Y1) + LFDU a7, 1 * SIZE(AO1) + LFDU y07, 1 * SIZE(Y1) + LFDU a8, 1 * SIZE(AO1) + LFDU y08, 1 * SIZE(Y1) + + FMADD y09, alpha1r, a1, y01 + FMADD y10, alpha1i, a1, y02 + FMADD y11, alpha1r, a3, y03 + FMADD y12, alpha1i, a3, y04 + + FMADD y13, alpha1r, a5, y05 + FMADD y14, alpha1i, a5, y06 + FMADD y15, alpha1r, a7, y07 + FMADD y16, alpha1i, a7, y08 + + bdz LL(23) + .align 4 + +LL(22): + FMSUBX y09, alpha1i, a2, y09 + LFDU a1, 1 * SIZE(AO2) + FMADDX y10, alpha1r, a2, y10 + LFDU a2, 1 * SIZE(AO2) + FMSUBX y11, alpha1i, a4, y11 + LFDU a3, 1 * SIZE(AO2) + FMADDX y12, alpha1r, a4, y12 + LFDU a4, 1 * SIZE(AO2) +#ifdef PPCG4 + dcbt AO2, PREA +#endif + + FMSUBX y13, alpha1i, a6, y13 + LFDU a5, 1 * SIZE(AO2) + FMADDX y14, alpha1r, a6, y14 + LFDU a6, 1 * SIZE(AO2) + FMSUBX y15, alpha1i, a8, y15 + LFDU a7, 1 * SIZE(AO2) + FMADDX y16, alpha1r, a8, y16 + LFDU a8, 1 * SIZE(AO2) +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO2, PREA +#endif + + FMADD y09, alpha2r, a1, y09 + LFDU y01, 1 * SIZE(Y1) + FMADD y10, alpha2i, a1, y10 + LFDU y02, 1 * SIZE(Y1) + FMADD y11, alpha2r, a3, y11 + LFDU y03, 1 * SIZE(Y1) + FMADD y12, alpha2i, a3, y12 + LFDU y04, 1 * SIZE(Y1) + +#ifdef PPCG4 + dcbtst Y1, PREA +#endif + + FMADD y13, alpha2r, a5, y13 + LFDU y05, 1 * SIZE(Y1) + FMADD y14, alpha2i, a5, y14 + LFDU y06, 1 * SIZE(Y1) + FMADD y15, alpha2r, a7, y15 + LFDU y07, 1 * SIZE(Y1) + FMADD y16, alpha2i, a7, y16 + LFDU y08, 1 * SIZE(Y1) + +#if defined(PPCG4) && defined(DOUBLE) + dcbtst Y1, PREA +#endif + + FMSUBX y09, alpha2i, a2, y09 + LFDU a1, 1 * SIZE(AO1) + FMADDX y10, alpha2r, a2, y10 + LFDU a2, 1 * SIZE(AO1) + FMSUBX y11, alpha2i, a4, y11 + LFDU a3, 1 * SIZE(AO1) + FMADDX y12, alpha2r, a4, y12 + LFDU a4, 1 * SIZE(AO1) + +#ifdef PPCG4 + dcbt AO1, PREA +#endif + + FMSUBX y13, alpha2i, a6, y13 + LFDU a5, 1 * SIZE(AO1) + FMADDX y14, alpha2r, a6, y14 + LFDU a6, 1 * SIZE(AO1) + FMSUBX y15, alpha2i, a8, y15 + LFDU a7, 1 * SIZE(AO1) + FMADDX y16, alpha2r, a8, y16 + LFDU a8, 1 * SIZE(AO1) + +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO1, PREA +#endif + + STFDU y09, 1 * SIZE(Y2) + FMADD y09, alpha1r, a1, y01 + STFDU y10, 1 * SIZE(Y2) + FMADD y10, alpha1i, a1, y02 + STFDU y11, 1 * SIZE(Y2) + FMADD y11, alpha1r, a3, y03 + STFDU y12, 1 * SIZE(Y2) + FMADD y12, alpha1i, a3, y04 + + STFDU y13, 1 * SIZE(Y2) + FMADD y13, alpha1r, a5, y05 + STFDU y14, 1 * SIZE(Y2) + FMADD y14, alpha1i, a5, y06 + STFDU y15, 1 * SIZE(Y2) + FMADD y15, alpha1r, a7, y07 + STFDU y16, 1 * SIZE(Y2) + FMADD y16, alpha1i, a7, y08 + bdnz LL(22) + .align 4 + +LL(23): + FMSUBX y09, alpha1i, a2, y09 + LFDU a1, 1 * SIZE(AO2) + FMADDX y10, alpha1r, a2, y10 + LFDU a2, 1 * SIZE(AO2) + FMSUBX y11, alpha1i, a4, y11 + LFDU a3, 1 * SIZE(AO2) + FMADDX y12, alpha1r, a4, y12 + LFDU a4, 1 * SIZE(AO2) + + FMSUBX y13, alpha1i, a6, y13 + LFDU a5, 1 * SIZE(AO2) + FMADDX y14, alpha1r, a6, y14 + LFDU a6, 1 * SIZE(AO2) + FMSUBX y15, alpha1i, a8, y15 + LFDU a7, 1 * SIZE(AO2) + FMADDX y16, alpha1r, a8, y16 + LFDU a8, 1 * SIZE(AO2) + + FMADD y09, alpha2r, a1, y09 + FMADD y10, alpha2i, a1, y10 + FMADD y11, alpha2r, a3, y11 + FMADD y12, alpha2i, a3, y12 + + FMADD y13, 
alpha2r, a5, y13 + FMADD y14, alpha2i, a5, y14 + FMADD y15, alpha2r, a7, y15 + FMADD y16, alpha2i, a7, y16 + + FMSUBX y09, alpha2i, a2, y09 + FMADDX y10, alpha2r, a2, y10 + FMSUBX y11, alpha2i, a4, y11 + FMADDX y12, alpha2r, a4, y12 + + FMSUBX y13, alpha2i, a6, y13 + STFDU y09, 1 * SIZE(Y2) + FMADDX y14, alpha2r, a6, y14 + STFDU y10, 1 * SIZE(Y2) + FMSUBX y15, alpha2i, a8, y15 + STFDU y11, 1 * SIZE(Y2) + FMADDX y16, alpha2r, a8, y16 + STFDU y12, 1 * SIZE(Y2) + + STFDU y13, 1 * SIZE(Y2) + STFDU y14, 1 * SIZE(Y2) + STFDU y15, 1 * SIZE(Y2) + STFDU y16, 1 * SIZE(Y2) + .align 4 + +LL(25): + andi. r0, M, 2 + ble LL(27) + + LFDU a1, 1 * SIZE(AO1) + LFDU y01, 1 * SIZE(Y1) + LFDU a2, 1 * SIZE(AO1) + LFDU y02, 1 * SIZE(Y1) + LFDU a3, 1 * SIZE(AO1) + LFDU y03, 1 * SIZE(Y1) + LFDU a4, 1 * SIZE(AO1) + LFDU y04, 1 * SIZE(Y1) + + FMADD y01, alpha1r, a1, y01 + LFDU a5, 1 * SIZE(AO2) + FMADD y02, alpha1i, a1, y02 + LFDU a6, 1 * SIZE(AO2) + FMADD y03, alpha1r, a3, y03 + LFDU a7, 1 * SIZE(AO2) + FMADD y04, alpha1i, a3, y04 + LFDU a8, 1 * SIZE(AO2) + + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + FMSUBX y03, alpha1i, a4, y03 + FMADDX y04, alpha1r, a4, y04 + + FMADD y01, alpha2r, a5, y01 + FMADD y02, alpha2i, a5, y02 + FMADD y03, alpha2r, a7, y03 + FMADD y04, alpha2i, a7, y04 + + FMSUBX y01, alpha2i, a6, y01 + FMADDX y02, alpha2r, a6, y02 + FMSUBX y03, alpha2i, a8, y03 + FMADDX y04, alpha2r, a8, y04 + + STFDU y01, 1 * SIZE(Y2) + STFDU y02, 1 * SIZE(Y2) + STFDU y03, 1 * SIZE(Y2) + STFDU y04, 1 * SIZE(Y2) + .align 4 + +LL(27): + andi. r0, M, 1 + ble LL(30) + + LFDU y01, 1 * SIZE(Y1) + LFDU y02, 1 * SIZE(Y1) + + LFDU a1, 1 * SIZE(AO1) + LFDU a2, 1 * SIZE(AO1) + LFDU a3, 1 * SIZE(AO2) + LFDU a4, 1 * SIZE(AO2) + + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + + FMADD y01, alpha2r, a3, y01 + FMADD y02, alpha2i, a3, y02 + FMSUBX y01, alpha2i, a4, y01 + FMADDX y02, alpha2r, a4, y02 + + STFDU y01, 1 * SIZE(Y2) + STFDU y02, 1 * SIZE(Y2) + .align 4 + +LL(30): + andi. J, N, 1 + ble LL(990) + .align 4 + + lfd alpha_r, ALPHA_R + lfd alpha_i, ALPHA_I + + LFDUX a1, X, INCX + LFDU a2, 1 * SIZE(X) + + FMUL alpha1r, alpha_r, a1 + mr Y1, YY + mr Y2, YY + FMUL alpha1i, alpha_i, a1 + mr AO1, A + add A, A, LDA + + FMSUBR alpha1r, alpha_i, a2, alpha1r + srawi. 
r0, M, 2 + mtspr CTR, r0 + FMADDR alpha1i, alpha_r, a2, alpha1i + ble LL(35) + .align 4 + + LFDU a1, 1 * SIZE(AO1) + LFDU y01, 1 * SIZE(Y1) + LFDU a2, 1 * SIZE(AO1) + LFDU y02, 1 * SIZE(Y1) + LFDU a3, 1 * SIZE(AO1) + LFDU y03, 1 * SIZE(Y1) + LFDU a4, 1 * SIZE(AO1) + LFDU y04, 1 * SIZE(Y1) + + LFDU a5, 1 * SIZE(AO1) + LFDU y05, 1 * SIZE(Y1) + LFDU a6, 1 * SIZE(AO1) + LFDU y06, 1 * SIZE(Y1) + LFDU a7, 1 * SIZE(AO1) + LFDU y07, 1 * SIZE(Y1) + LFDU a8, 1 * SIZE(AO1) + LFDU y08, 1 * SIZE(Y1) + + FMADD y09, alpha1r, a1, y01 + FMADD y10, alpha1i, a1, y02 + FMADD y11, alpha1r, a3, y03 + FMADD y12, alpha1i, a3, y04 + + FMADD y13, alpha1r, a5, y05 + FMADD y14, alpha1i, a5, y06 + FMADD y15, alpha1r, a7, y07 + FMADD y16, alpha1i, a7, y08 + + bdz LL(33) + .align 4 + +LL(32): + FMSUBX y09, alpha1i, a2, y09 + LFDU a1, 1 * SIZE(AO1) + FMADDX y10, alpha1r, a2, y10 + LFDU a2, 1 * SIZE(AO1) + FMSUBX y11, alpha1i, a4, y11 + LFDU a3, 1 * SIZE(AO1) + FMADDX y12, alpha1r, a4, y12 + LFDU a4, 1 * SIZE(AO1) + +#ifdef PPCG4 + dcbt AO1, PREA +#endif + + LFDU y01, 1 * SIZE(Y1) + LFDU y02, 1 * SIZE(Y1) + LFDU y03, 1 * SIZE(Y1) + LFDU y04, 1 * SIZE(Y1) + +#ifdef PPCG4 + dcbtst Y1, PREA +#endif + + FMSUBX y13, alpha1i, a6, y13 + LFDU a5, 1 * SIZE(AO1) + FMADDX y14, alpha1r, a6, y14 + LFDU a6, 1 * SIZE(AO1) + FMSUBX y15, alpha1i, a8, y15 + LFDU a7, 1 * SIZE(AO1) + FMADDX y16, alpha1r, a8, y16 + LFDU a8, 1 * SIZE(AO1) + +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO1, PREA +#endif + + LFDU y05, 1 * SIZE(Y1) + LFDU y06, 1 * SIZE(Y1) + LFDU y07, 1 * SIZE(Y1) + LFDU y08, 1 * SIZE(Y1) + +#if defined(PPCG4) && defined(DOUBLE) + dcbtst Y1, PREA +#endif + + STFDU y09, 1 * SIZE(Y2) + FMADD y09, alpha1r, a1, y01 + STFDU y10, 1 * SIZE(Y2) + FMADD y10, alpha1i, a1, y02 + STFDU y11, 1 * SIZE(Y2) + FMADD y11, alpha1r, a3, y03 + STFDU y12, 1 * SIZE(Y2) + FMADD y12, alpha1i, a3, y04 + + STFDU y13, 1 * SIZE(Y2) + FMADD y13, alpha1r, a5, y05 + STFDU y14, 1 * SIZE(Y2) + FMADD y14, alpha1i, a5, y06 + STFDU y15, 1 * SIZE(Y2) + FMADD y15, alpha1r, a7, y07 + STFDU y16, 1 * SIZE(Y2) + FMADD y16, alpha1i, a7, y08 + bdnz LL(32) + .align 4 + +LL(33): + FMSUBX y09, alpha1i, a2, y09 + FMADDX y10, alpha1r, a2, y10 + FMSUBX y11, alpha1i, a4, y11 + FMADDX y12, alpha1r, a4, y12 + + FMSUBX y13, alpha1i, a6, y13 + STFDU y09, 1 * SIZE(Y2) + FMADDX y14, alpha1r, a6, y14 + STFDU y10, 1 * SIZE(Y2) + FMSUBX y15, alpha1i, a8, y15 + STFDU y11, 1 * SIZE(Y2) + FMADDX y16, alpha1r, a8, y16 + STFDU y12, 1 * SIZE(Y2) + + STFDU y13, 1 * SIZE(Y2) + STFDU y14, 1 * SIZE(Y2) + STFDU y15, 1 * SIZE(Y2) + STFDU y16, 1 * SIZE(Y2) + .align 4 + +LL(35): + andi. r0, M, 2 + ble LL(37) + + LFDU a1, 1 * SIZE(AO1) + LFDU y01, 1 * SIZE(Y1) + LFDU a2, 1 * SIZE(AO1) + LFDU y02, 1 * SIZE(Y1) + LFDU a3, 1 * SIZE(AO1) + LFDU y03, 1 * SIZE(Y1) + LFDU a4, 1 * SIZE(AO1) + LFDU y04, 1 * SIZE(Y1) + + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMADD y03, alpha1r, a3, y03 + FMADD y04, alpha1i, a3, y04 + + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + FMSUBX y03, alpha1i, a4, y03 + FMADDX y04, alpha1r, a4, y04 + + STFDU y01, 1 * SIZE(Y2) + STFDU y02, 1 * SIZE(Y2) + STFDU y03, 1 * SIZE(Y2) + STFDU y04, 1 * SIZE(Y2) + .align 4 + +LL(37): + andi. 
r0, M, 1 + ble LL(990) + + LFDU y01, 1 * SIZE(Y1) + LFDU a1, 1 * SIZE(AO1) + LFDU y02, 1 * SIZE(Y1) + LFDU a2, 1 * SIZE(AO1) + + FMADD y01, alpha1r, a1, y01 + FMADD y02, alpha1i, a1, y02 + FMSUBX y01, alpha1i, a2, y01 + FMADDX y02, alpha1r, a2, y02 + + STFDU y01, 1 * SIZE(Y2) + STFDU y02, 1 * SIZE(Y2) + .align 4 + +LL(990): + cmpi cr0, 0, INCY, SIZE + beq LL(999) + + addi YY, BUFFER, -SIZE + mr Y1, Y + + srawi. r0, M, 2 + mtspr CTR, r0 + ble LL(995) + .align 4 + +LL(991): + LFDUX f0, Y, INCY + LFDU f1, 1 * SIZE(Y) + LFDUX f2, Y, INCY + LFDU f3, 1 * SIZE(Y) + LFDUX f4, Y, INCY + LFDU f5, 1 * SIZE(Y) + LFDUX f6, Y, INCY + LFDU f7, 1 * SIZE(Y) + + LFDU f8, 1 * SIZE(YY) + LFDU f9, 1 * SIZE(YY) + LFDU f10, 1 * SIZE(YY) + LFDU f11, 1 * SIZE(YY) + LFDU f12, 1 * SIZE(YY) + LFDU f13, 1 * SIZE(YY) + LFDU f14, 1 * SIZE(YY) + LFDU f15, 1 * SIZE(YY) + + FADD f8, f8, f0 + FADD f9, f9, f1 + FADD f10, f10, f2 + FADD f11, f11, f3 + FADD f12, f12, f4 + FADD f13, f13, f5 + FADD f14, f14, f6 + FADD f15, f15, f7 + + STFDUX f8, Y1, INCY + STFDU f9, 1 * SIZE(Y1) + STFDUX f10, Y1, INCY + STFDU f11, 1 * SIZE(Y1) + STFDUX f12, Y1, INCY + STFDU f13, 1 * SIZE(Y1) + STFDUX f14, Y1, INCY + STFDU f15, 1 * SIZE(Y1) + bdnz LL(991) + .align 4 + +LL(995): + andi. J, M, 2 + ble LL(996) + + LFDUX f0, Y, INCY + LFDU f1, 1 * SIZE(Y) + LFDUX f2, Y, INCY + LFDU f3, 1 * SIZE(Y) + + LFDU f8, 1 * SIZE(YY) + LFDU f9, 1 * SIZE(YY) + LFDU f10, 1 * SIZE(YY) + LFDU f11, 1 * SIZE(YY) + + FADD f8, f8, f0 + FADD f9, f9, f1 + FADD f10, f10, f2 + FADD f11, f11, f3 + + STFDUX f8, Y1, INCY + STFDU f9, 1 * SIZE(Y1) + STFDUX f10, Y1, INCY + STFDU f11, 1 * SIZE(Y1) + .align 4 + +LL(996): + andi. J, M, 1 + ble LL(999) + + LFDUX f0, Y, INCY + LFDU f1, 1 * SIZE(Y) + + LFDU f8, 1 * SIZE(YY) + LFDU f9, 1 * SIZE(YY) + + FADD f8, f8, f0 + FADD f9, f9, f1 + + STFDUX f8, Y1, INCY + STFDU f9, 1 * SIZE(Y1) + .align 4 + +LL(999): + li r3, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r14, 144(SP) + ld r15, 152(SP) + ld r16, 160(SP) + ld r17, 168(SP) + ld r18, 176(SP) + ld r19, 184(SP) + ld r20, 192(SP) + ld r21, 200(SP) + ld r22, 208(SP) +#else + lwz r14, 144(SP) + lwz r15, 148(SP) + lwz r16, 152(SP) + lwz r17, 156(SP) + lwz r18, 160(SP) + lwz r19, 164(SP) + lwz r20, 168(SP) + lwz r21, 172(SP) + lwz r22, 176(SP) +#endif + + addi SP, SP, STACKSIZE + blr + + EPILOGUE +#endif diff --git a/kernel/power/zgemv_t.S b/kernel/power/zgemv_t.S new file mode 100644 index 0000000000..057c04d620 --- /dev/null +++ b/kernel/power/zgemv_t.S @@ -0,0 +1,1522 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define P 2048 + +#ifndef __64BIT__ +#define STACKSIZE 224 +#else +#define STACKSIZE 304 +#endif + +#ifdef linux +#ifndef __64BIT__ +#define M r3 +#define N r4 +#define A r6 +#define LDA r7 +#define X r8 +#define INCX r9 +#define Y r10 +#define INCY r5 +#else +#define M r3 +#define N r4 +#define A r8 +#define LDA r9 +#define X r10 +#define INCX r5 +#define Y r6 +#define INCY r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define M r3 +#define N r4 +#define A r10 +#define LDA r5 +#define X r6 +#define INCX r7 +#define Y r8 +#define INCY r9 +#else +#define M r3 +#define N r4 +#define A r8 +#define LDA r9 +#define X r10 +#define INCX r5 +#define Y r6 +#define INCY r7 +#endif +#endif + +#define BUFFER r11 +#define XP r12 +#define MIN_N r14 +#define J r15 +#define CO r16 +#define BO r17 +#define PLDA_M r18 +#define AO1 r19 +#define AO2 r20 +#define AO3 r21 +#define AO4 r22 +#define IS r23 +#define PREA r24 +#define PREC r25 + +#define Y1 r23 /* dummy; should be same as gemv_n.S */ +#define Y2 r24 /* dummy; should be same as gemv_n.S */ + +#if defined(PPCG4) +#define PREFETCHSIZE_A 34 +#define PREFETCHSIZE_C 16 +#endif + +#if defined(PPC440) || defined(PPC440FP2) +#define PREFETCHSIZE_A 34 +#define PREFETCHSIZE_C 16 +#endif + +#ifdef PPC970 +#define PREFETCHSIZE_A 56 +#define PREFETCHSIZE_C 16 +#endif + +#ifdef CELL +#define PREFETCHSIZE_A 56 +#define PREFETCHSIZE_C 16 +#endif + +#ifdef POWER4 +#define PREFETCHSIZE_A 34 +#define PREFETCHSIZE_C 16 +#endif + +#ifdef POWER5 +#define PREFETCHSIZE_A 40 +#define PREFETCHSIZE_C 8 +#endif + +#ifdef POWER6 +#define PREFETCHSIZE_A 24 +#define PREFETCHSIZE_C 8 +#endif + +#if !(defined(CONJ) && defined(XCONJ)) +#define FMADDR FMADD +#define FMSUBR FNMSUB +#else +#define FMADDR FNMSUB +#define FMSUBR FMADD +#endif + +#ifndef NEEDPARAM + +#ifndef __64BIT__ +#define FZERO 200(SP) +#define ALPHA_R 208(SP) +#define ALPHA_I 216(SP) +#else +#define FZERO 256(SP) +#define ALPHA_R 264(SP) +#define ALPHA_I 272(SP) +#endif + + PROLOGUE + PROFCODE + + addi SP, 
SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r14, 144(SP) + std r15, 152(SP) + std r16, 160(SP) + std r17, 168(SP) + std r18, 176(SP) + std r19, 184(SP) + std r20, 192(SP) + std r21, 200(SP) + std r22, 208(SP) + std r23, 216(SP) + std r24, 224(SP) + std r25, 232(SP) + std r0, FZERO +#else + stw r14, 144(SP) + stw r15, 148(SP) + stw r16, 152(SP) + stw r17, 156(SP) + stw r18, 160(SP) + stw r19, 164(SP) + stw r20, 168(SP) + stw r21, 172(SP) + stw r22, 176(SP) + stw r23, 180(SP) + stw r24, 184(SP) + stw r25, 188(SP) + stw r0, FZERO + stw r0, 4 + FZERO +#endif + +#ifdef linux +#ifndef __64BIT__ + lwz INCY, 8 + STACKSIZE(SP) + lwz BUFFER, 12 + STACKSIZE(SP) +#else + ld INCX, 112 + STACKSIZE(SP) + ld Y, 120 + STACKSIZE(SP) + ld INCY, 128 + STACKSIZE(SP) + ld BUFFER, 136 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifndef __64BIT__ +#ifdef DOUBLE + lwz LDA, 56 + STACKSIZE(SP) + lwz X, 60 + STACKSIZE(SP) + lwz INCX, 64 + STACKSIZE(SP) + lwz Y, 68 + STACKSIZE(SP) + lwz INCY, 72 + STACKSIZE(SP) + lwz BUFFER, 76 + STACKSIZE(SP) +#else + lwz INCX, 56 + STACKSIZE(SP) + lwz Y, 60 + STACKSIZE(SP) + lwz INCY, 64 + STACKSIZE(SP) + lwz BUFFER, 68 + STACKSIZE(SP) +#endif +#else + ld INCX, 112 + STACKSIZE(SP) + ld Y, 120 + STACKSIZE(SP) + ld INCY, 128 + STACKSIZE(SP) + ld BUFFER, 136 + STACKSIZE(SP) +#endif +#endif + + stfd f1, ALPHA_R + stfd f2, ALPHA_I + + mullw PLDA_M, LDA, N + li XP, P + subf PLDA_M, XP, PLDA_M + slwi PLDA_M, PLDA_M, ZBASE_SHIFT + + slwi LDA, LDA, ZBASE_SHIFT + slwi INCX, INCX, ZBASE_SHIFT + slwi INCY, INCY, ZBASE_SHIFT + + li IS, 0 + + li PREA, PREFETCHSIZE_A * SIZE + li PREC, PREFETCHSIZE_C * SIZE + + cmpwi cr0, M, 0 + ble LL(End) + cmpwi cr0, N, 0 + ble LL(End) + .align 4 + +LL(ISLoop): + subf MIN_N, IS, M + slwi r0, IS, ZBASE_SHIFT + cmpi cr0, 0, MIN_N, P + ble+ LL(min_nP) + li MIN_N, P +LL(min_nP): + add XP, X, r0 + cmpwi cr0, INCX, 2 * SIZE + beq LL(Main) + + mr XP, BUFFER + addi CO, BUFFER, -SIZE + + srawi. r0, MIN_N, 2 + mtspr CTR, r0 + ble LL(CopyRemain) + .align 4 + +LL(CopyKernel): + LFD f0, 0 * SIZE(X) + LFD f1, 1 * SIZE(X) + add X, X, INCX + LFD f2, 0 * SIZE(X) + LFD f3, 1 * SIZE(X) + add X, X, INCX + LFD f4, 0 * SIZE(X) + LFD f5, 1 * SIZE(X) + add X, X, INCX + LFD f6, 0 * SIZE(X) + LFD f7, 1 * SIZE(X) + add X, X, INCX + + STFD f0, 1 * SIZE(CO) + STFD f1, 2 * SIZE(CO) + STFD f2, 3 * SIZE(CO) + STFD f3, 4 * SIZE(CO) + STFD f4, 5 * SIZE(CO) + STFD f5, 6 * SIZE(CO) + STFD f6, 7 * SIZE(CO) + STFDU f7, 8 * SIZE(CO) + bdnz LL(CopyKernel) + .align 4 + +LL(CopyRemain): + andi. r0, MIN_N, 3 + mtspr CTR, r0 + ble LL(Main) + .align 4 + +LL(CopySub): + LFD f0, 0 * SIZE(X) + LFD f1, 1 * SIZE(X) + add X, X, INCX + STFD f0, 1 * SIZE(CO) + STFDU f1, 2 * SIZE(CO) + bdnz LL(CopySub) + .align 4 + +LL(Main): + mr CO, Y + addi XP, XP, -SIZE + srawi. 
J, N, 2 + ble LL(Remain) + .align 4 + +LL(MainHead): + mr AO1, A + add AO2, A, LDA + add AO3, AO2, LDA + add AO4, AO3, LDA + add A, AO4, LDA + + mr BO, XP + + lfd f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + dcbtst PREC, CO + srawi. r0, MIN_N, 3 + mtspr CTR, r0 + ble LL(MainN3) + + LFD f16, 0 * SIZE(AO1) + LFD f17, 1 * SIZE(AO1) + LFD f18, 0 * SIZE(AO2) + LFD f19, 1 * SIZE(AO2) + LFD f20, 0 * SIZE(AO3) + LFD f21, 1 * SIZE(AO3) + LFD f22, 0 * SIZE(AO4) + LFD f23, 1 * SIZE(AO4) + + LFD f24, 1 * SIZE(BO) + LFD f25, 2 * SIZE(BO) + LFD f26, 3 * SIZE(BO) + LFD f27, 4 * SIZE(BO) + LFD f28, 5 * SIZE(BO) + LFD f29, 6 * SIZE(BO) + LFD f30, 7 * SIZE(BO) + LFD f31, 8 * SIZE(BO) + + bdz LL(MainKernelSkip) + .align 5 + +LL(MainKernel): + FMADD f0, f16, f24, f0 + FMADD f1, f16, f25, f1 + FMADD f2, f17, f24, f2 + FMADD f3, f17, f25, f3 + + FMADD f4, f18, f24, f4 + FMADD f5, f18, f25, f5 + FMADD f6, f19, f24, f6 + FMADD f7, f19, f25, f7 + + LFD f16, 2 * SIZE(AO1) + LFD f17, 3 * SIZE(AO1) + LFD f18, 2 * SIZE(AO2) + LFD f19, 3 * SIZE(AO2) + + FMADD f8, f20, f24, f8 + FMADD f9, f20, f25, f9 + FMADD f10, f21, f24, f10 + FMADD f11, f21, f25, f11 + + FMADD f12, f22, f24, f12 + FMADD f13, f22, f25, f13 + FMADD f14, f23, f24, f14 + FMADD f15, f23, f25, f15 + + LFD f20, 2 * SIZE(AO3) + LFD f21, 3 * SIZE(AO3) + LFD f22, 2 * SIZE(AO4) + LFD f23, 3 * SIZE(AO4) + + FMADD f0, f16, f26, f0 + FMADD f1, f16, f27, f1 + FMADD f2, f17, f26, f2 + FMADD f3, f17, f27, f3 + + FMADD f4, f18, f26, f4 + FMADD f5, f18, f27, f5 + FMADD f6, f19, f26, f6 + FMADD f7, f19, f27, f7 + + LFD f16, 4 * SIZE(AO1) + LFD f17, 5 * SIZE(AO1) + LFD f18, 4 * SIZE(AO2) + LFD f19, 5 * SIZE(AO2) + + FMADD f8, f20, f26, f8 + FMADD f9, f20, f27, f9 + FMADD f10, f21, f26, f10 + FMADD f11, f21, f27, f11 + + FMADD f12, f22, f26, f12 + FMADD f13, f22, f27, f13 + FMADD f14, f23, f26, f14 + FMADD f15, f23, f27, f15 + + LFD f20, 4 * SIZE(AO3) + LFD f21, 5 * SIZE(AO3) + LFD f22, 4 * SIZE(AO4) + LFD f23, 5 * SIZE(AO4) + + LFD f24, 9 * SIZE(BO) + LFD f25, 10 * SIZE(BO) + LFD f26, 11 * SIZE(BO) + LFD f27, 12 * SIZE(BO) + + FMADD f0, f16, f28, f0 + FMADD f1, f16, f29, f1 + FMADD f2, f17, f28, f2 + FMADD f3, f17, f29, f3 + + FMADD f4, f18, f28, f4 + FMADD f5, f18, f29, f5 + FMADD f6, f19, f28, f6 + FMADD f7, f19, f29, f7 + + LFD f16, 6 * SIZE(AO1) + LFD f17, 7 * SIZE(AO1) + LFD f18, 6 * SIZE(AO2) + LFD f19, 7 * SIZE(AO2) + + FMADD f8, f20, f28, f8 + FMADD f9, f20, f29, f9 + FMADD f10, f21, f28, f10 + FMADD f11, f21, f29, f11 + + FMADD f12, f22, f28, f12 + FMADD f13, f22, f29, f13 + FMADD f14, f23, f28, f14 + FMADD f15, f23, f29, f15 + + LFD f20, 6 * SIZE(AO3) + LFD f21, 7 * SIZE(AO3) + LFD f22, 6 * SIZE(AO4) + LFD f23, 7 * SIZE(AO4) + + FMADD f0, f16, f30, f0 + FMADD f1, f16, f31, f1 + FMADD f2, f17, f30, f2 + FMADD f3, f17, f31, f3 + + FMADD f4, f18, f30, f4 + FMADD f5, f18, f31, f5 + FMADD f6, f19, f30, f6 + FMADD f7, f19, f31, f7 + + LFD f16, 8 * SIZE(AO1) + LFD f17, 9 * SIZE(AO1) + LFD f18, 8 * SIZE(AO2) + LFD f19, 9 * SIZE(AO2) + + FMADD f8, f20, f30, f8 + FMADD f9, f20, f31, f9 + FMADD f10, f21, f30, f10 + FMADD f11, f21, f31, f11 + + FMADD f12, f22, f30, f12 + FMADD f13, f22, f31, f13 + FMADD f14, f23, f30, f14 + FMADD f15, f23, f31, f15 + + LFD f20, 8 * SIZE(AO3) + LFD f21, 9 * SIZE(AO3) + LFD f22, 8 * SIZE(AO4) + LFD f23, 9 * SIZE(AO4) + + LFD f28, 13 * SIZE(BO) + LFD f29, 14 * SIZE(BO) + LFD 
f30, 15 * SIZE(BO) + LFD f31, 16 * SIZE(BO) + + FMADD f0, f16, f24, f0 + FMADD f1, f16, f25, f1 + FMADD f2, f17, f24, f2 + FMADD f3, f17, f25, f3 + + FMADD f4, f18, f24, f4 + FMADD f5, f18, f25, f5 + FMADD f6, f19, f24, f6 + FMADD f7, f19, f25, f7 + + LFD f16, 10 * SIZE(AO1) + LFD f17, 11 * SIZE(AO1) + LFD f18, 10 * SIZE(AO2) + LFD f19, 11 * SIZE(AO2) + + FMADD f8, f20, f24, f8 + FMADD f9, f20, f25, f9 + FMADD f10, f21, f24, f10 + FMADD f11, f21, f25, f11 + + FMADD f12, f22, f24, f12 + FMADD f13, f22, f25, f13 + FMADD f14, f23, f24, f14 + FMADD f15, f23, f25, f15 + + LFD f20, 10 * SIZE(AO3) + LFD f21, 11 * SIZE(AO3) + LFD f22, 10 * SIZE(AO4) + LFD f23, 11 * SIZE(AO4) + + FMADD f0, f16, f26, f0 + FMADD f1, f16, f27, f1 + FMADD f2, f17, f26, f2 + FMADD f3, f17, f27, f3 + + FMADD f4, f18, f26, f4 + FMADD f5, f18, f27, f5 + FMADD f6, f19, f26, f6 + FMADD f7, f19, f27, f7 + + LFD f16, 12 * SIZE(AO1) + LFD f17, 13 * SIZE(AO1) + LFD f18, 12 * SIZE(AO2) + LFD f19, 13 * SIZE(AO2) + + FMADD f8, f20, f26, f8 + FMADD f9, f20, f27, f9 + FMADD f10, f21, f26, f10 + FMADD f11, f21, f27, f11 + + FMADD f12, f22, f26, f12 + FMADD f13, f22, f27, f13 + FMADD f14, f23, f26, f14 + FMADD f15, f23, f27, f15 + + LFD f20, 12 * SIZE(AO3) + LFD f21, 13 * SIZE(AO3) + LFD f22, 12 * SIZE(AO4) + LFD f23, 13 * SIZE(AO4) + + LFD f24, 17 * SIZE(BO) + LFD f25, 18 * SIZE(BO) + LFD f26, 19 * SIZE(BO) + LFD f27, 20 * SIZE(BO) + + FMADD f0, f16, f28, f0 + FMADD f1, f16, f29, f1 + FMADD f2, f17, f28, f2 + FMADD f3, f17, f29, f3 + + FMADD f4, f18, f28, f4 + FMADD f5, f18, f29, f5 + FMADD f6, f19, f28, f6 + FMADD f7, f19, f29, f7 + + LFD f16, 14 * SIZE(AO1) + LFD f17, 15 * SIZE(AO1) + LFD f18, 14 * SIZE(AO2) + LFD f19, 15 * SIZE(AO2) + + FMADD f8, f20, f28, f8 + FMADD f9, f20, f29, f9 + FMADD f10, f21, f28, f10 + FMADD f11, f21, f29, f11 + + FMADD f12, f22, f28, f12 + FMADD f13, f22, f29, f13 + FMADD f14, f23, f28, f14 + FMADD f15, f23, f29, f15 + + LFD f20, 14 * SIZE(AO3) + LFD f21, 15 * SIZE(AO3) + LFD f22, 14 * SIZE(AO4) + LFD f23, 15 * SIZE(AO4) + + FMADD f0, f16, f30, f0 + FMADD f1, f16, f31, f1 + FMADD f2, f17, f30, f2 + FMADD f3, f17, f31, f3 + + FMADD f4, f18, f30, f4 + FMADD f5, f18, f31, f5 + FMADD f6, f19, f30, f6 + FMADD f7, f19, f31, f7 + + LFD f16, 16 * SIZE(AO1) + LFD f17, 17 * SIZE(AO1) + LFD f18, 16 * SIZE(AO2) + LFD f19, 17 * SIZE(AO2) + + addi AO1, AO1, 16 * SIZE + addi AO2, AO2, 16 * SIZE + DCBT(AO1, PREA) + DCBT(AO2, PREA) + + FMADD f8, f20, f30, f8 + FMADD f9, f20, f31, f9 + FMADD f10, f21, f30, f10 + FMADD f11, f21, f31, f11 + + FMADD f12, f22, f30, f12 + FMADD f13, f22, f31, f13 + FMADD f14, f23, f30, f14 + FMADD f15, f23, f31, f15 + + LFD f20, 16 * SIZE(AO3) + LFD f21, 17 * SIZE(AO3) + LFD f22, 16 * SIZE(AO4) + LFD f23, 17 * SIZE(AO4) + + LFD f28, 21 * SIZE(BO) + LFD f29, 22 * SIZE(BO) + LFD f30, 23 * SIZE(BO) + LFD f31, 24 * SIZE(BO) + + addi AO3, AO3, 16 * SIZE + addi AO4, AO4, 16 * SIZE + DCBT(AO3, PREA) + DCBT(AO4, PREA) + + addi BO, BO, 16 * SIZE + bdnz LL(MainKernel) + .align 4 + +LL(MainKernelSkip): + FMADD f0, f16, f24, f0 + FMADD f1, f16, f25, f1 + FMADD f2, f17, f24, f2 + FMADD f3, f17, f25, f3 + + FMADD f4, f18, f24, f4 + FMADD f5, f18, f25, f5 + FMADD f6, f19, f24, f6 + FMADD f7, f19, f25, f7 + + LFD f16, 2 * SIZE(AO1) + LFD f17, 3 * SIZE(AO1) + LFD f18, 2 * SIZE(AO2) + LFD f19, 3 * SIZE(AO2) + + FMADD f8, f20, f24, f8 + FMADD f9, f20, f25, f9 + FMADD f10, f21, f24, f10 + FMADD f11, f21, f25, f11 + + FMADD f12, f22, f24, f12 + FMADD f13, f22, f25, f13 + FMADD f14, f23, f24, f14 + FMADD f15, f23, 
f25, f15 + + LFD f20, 2 * SIZE(AO3) + LFD f21, 3 * SIZE(AO3) + LFD f22, 2 * SIZE(AO4) + LFD f23, 3 * SIZE(AO4) + + FMADD f0, f16, f26, f0 + FMADD f1, f16, f27, f1 + FMADD f2, f17, f26, f2 + FMADD f3, f17, f27, f3 + + FMADD f4, f18, f26, f4 + FMADD f5, f18, f27, f5 + FMADD f6, f19, f26, f6 + FMADD f7, f19, f27, f7 + + LFD f16, 4 * SIZE(AO1) + LFD f17, 5 * SIZE(AO1) + LFD f18, 4 * SIZE(AO2) + LFD f19, 5 * SIZE(AO2) + + FMADD f8, f20, f26, f8 + FMADD f9, f20, f27, f9 + FMADD f10, f21, f26, f10 + FMADD f11, f21, f27, f11 + + FMADD f12, f22, f26, f12 + FMADD f13, f22, f27, f13 + FMADD f14, f23, f26, f14 + FMADD f15, f23, f27, f15 + + LFD f20, 4 * SIZE(AO3) + LFD f21, 5 * SIZE(AO3) + LFD f22, 4 * SIZE(AO4) + LFD f23, 5 * SIZE(AO4) + + FMADD f0, f16, f28, f0 + FMADD f1, f16, f29, f1 + FMADD f2, f17, f28, f2 + FMADD f3, f17, f29, f3 + + FMADD f4, f18, f28, f4 + FMADD f5, f18, f29, f5 + FMADD f6, f19, f28, f6 + FMADD f7, f19, f29, f7 + + LFD f16, 6 * SIZE(AO1) + LFD f17, 7 * SIZE(AO1) + LFD f18, 6 * SIZE(AO2) + LFD f19, 7 * SIZE(AO2) + + FMADD f8, f20, f28, f8 + FMADD f9, f20, f29, f9 + FMADD f10, f21, f28, f10 + FMADD f11, f21, f29, f11 + + FMADD f12, f22, f28, f12 + FMADD f13, f22, f29, f13 + FMADD f14, f23, f28, f14 + FMADD f15, f23, f29, f15 + + LFD f20, 6 * SIZE(AO3) + LFD f21, 7 * SIZE(AO3) + LFD f22, 6 * SIZE(AO4) + LFD f23, 7 * SIZE(AO4) + + FMADD f0, f16, f30, f0 + FMADD f1, f16, f31, f1 + FMADD f2, f17, f30, f2 + FMADD f3, f17, f31, f3 + + FMADD f4, f18, f30, f4 + FMADD f5, f18, f31, f5 + FMADD f6, f19, f30, f6 + FMADD f7, f19, f31, f7 + + LFD f16, 8 * SIZE(AO1) + LFD f17, 9 * SIZE(AO1) + LFD f18, 8 * SIZE(AO2) + LFD f19, 9 * SIZE(AO2) + + FMADD f8, f20, f30, f8 + FMADD f9, f20, f31, f9 + FMADD f10, f21, f30, f10 + FMADD f11, f21, f31, f11 + + FMADD f12, f22, f30, f12 + FMADD f13, f22, f31, f13 + FMADD f14, f23, f30, f14 + FMADD f15, f23, f31, f15 + + LFD f20, 8 * SIZE(AO3) + LFD f21, 9 * SIZE(AO3) + LFD f22, 8 * SIZE(AO4) + LFD f23, 9 * SIZE(AO4) + + LFD f24, 9 * SIZE(BO) + LFD f25, 10 * SIZE(BO) + LFD f26, 11 * SIZE(BO) + LFD f27, 12 * SIZE(BO) + + LFD f28, 13 * SIZE(BO) + LFD f29, 14 * SIZE(BO) + LFD f30, 15 * SIZE(BO) + LFDU f31, 16 * SIZE(BO) + + FMADD f0, f16, f24, f0 + FMADD f1, f16, f25, f1 + FMADD f2, f17, f24, f2 + FMADD f3, f17, f25, f3 + + FMADD f4, f18, f24, f4 + FMADD f5, f18, f25, f5 + FMADD f6, f19, f24, f6 + FMADD f7, f19, f25, f7 + + LFD f16, 10 * SIZE(AO1) + LFD f17, 11 * SIZE(AO1) + LFD f18, 10 * SIZE(AO2) + LFD f19, 11 * SIZE(AO2) + + FMADD f8, f20, f24, f8 + FMADD f9, f20, f25, f9 + FMADD f10, f21, f24, f10 + FMADD f11, f21, f25, f11 + + FMADD f12, f22, f24, f12 + FMADD f13, f22, f25, f13 + FMADD f14, f23, f24, f14 + FMADD f15, f23, f25, f15 + + LFD f20, 10 * SIZE(AO3) + LFD f21, 11 * SIZE(AO3) + LFD f22, 10 * SIZE(AO4) + LFD f23, 11 * SIZE(AO4) + + FMADD f0, f16, f26, f0 + FMADD f1, f16, f27, f1 + FMADD f2, f17, f26, f2 + FMADD f3, f17, f27, f3 + + FMADD f4, f18, f26, f4 + FMADD f5, f18, f27, f5 + FMADD f6, f19, f26, f6 + FMADD f7, f19, f27, f7 + + LFD f16, 12 * SIZE(AO1) + LFD f17, 13 * SIZE(AO1) + LFD f18, 12 * SIZE(AO2) + LFD f19, 13 * SIZE(AO2) + + FMADD f8, f20, f26, f8 + FMADD f9, f20, f27, f9 + FMADD f10, f21, f26, f10 + FMADD f11, f21, f27, f11 + + FMADD f12, f22, f26, f12 + FMADD f13, f22, f27, f13 + FMADD f14, f23, f26, f14 + FMADD f15, f23, f27, f15 + + LFD f20, 12 * SIZE(AO3) + LFD f21, 13 * SIZE(AO3) + LFD f22, 12 * SIZE(AO4) + LFD f23, 13 * SIZE(AO4) + + FMADD f0, f16, f28, f0 + FMADD f1, f16, f29, f1 + FMADD f2, f17, f28, f2 + FMADD f3, f17, f29, f3 + 
+ FMADD f4, f18, f28, f4 + FMADD f5, f18, f29, f5 + FMADD f6, f19, f28, f6 + FMADD f7, f19, f29, f7 + + LFD f16, 14 * SIZE(AO1) + LFD f17, 15 * SIZE(AO1) + LFD f18, 14 * SIZE(AO2) + LFD f19, 15 * SIZE(AO2) + + FMADD f8, f20, f28, f8 + FMADD f9, f20, f29, f9 + FMADD f10, f21, f28, f10 + FMADD f11, f21, f29, f11 + + FMADD f12, f22, f28, f12 + FMADD f13, f22, f29, f13 + FMADD f14, f23, f28, f14 + FMADD f15, f23, f29, f15 + + LFD f20, 14 * SIZE(AO3) + LFD f21, 15 * SIZE(AO3) + LFD f22, 14 * SIZE(AO4) + LFD f23, 15 * SIZE(AO4) + + addi AO1, AO1, 16 * SIZE + addi AO2, AO2, 16 * SIZE + addi AO3, AO3, 16 * SIZE + addi AO4, AO4, 16 * SIZE + + FMADD f0, f16, f30, f0 + FMADD f1, f16, f31, f1 + FMADD f2, f17, f30, f2 + FMADD f3, f17, f31, f3 + + FMADD f4, f18, f30, f4 + FMADD f5, f18, f31, f5 + FMADD f6, f19, f30, f6 + FMADD f7, f19, f31, f7 + + FMADD f8, f20, f30, f8 + FMADD f9, f20, f31, f9 + FMADD f10, f21, f30, f10 + FMADD f11, f21, f31, f11 + + FMADD f12, f22, f30, f12 + FMADD f13, f22, f31, f13 + FMADD f14, f23, f30, f14 + FMADD f15, f23, f31, f15 + .align 4 + +LL(MainN3): + andi. r0, MIN_N, 7 + mtspr CTR, r0 + ble LL(MainFinish) + .align 4 + + LFD f16, 0 * SIZE(AO1) + LFD f17, 1 * SIZE(AO1) + LFD f18, 0 * SIZE(AO2) + LFD f19, 1 * SIZE(AO2) + LFD f20, 0 * SIZE(AO3) + LFD f21, 1 * SIZE(AO3) + LFD f22, 0 * SIZE(AO4) + LFD f23, 1 * SIZE(AO4) + + LFD f24, 1 * SIZE(BO) + LFDU f25, 2 * SIZE(BO) + + addi AO1, AO1, 2 * SIZE + addi AO2, AO2, 2 * SIZE + addi AO3, AO3, 2 * SIZE + addi AO4, AO4, 2 * SIZE + + bdz LL(MainN3KernelSkip) + .align 4 + +LL(MainN3Kernel): + FMADD f0, f16, f24, f0 + FMADD f1, f16, f25, f1 + FMADD f2, f17, f24, f2 + FMADD f3, f17, f25, f3 + + FMADD f4, f18, f24, f4 + FMADD f5, f18, f25, f5 + FMADD f6, f19, f24, f6 + FMADD f7, f19, f25, f7 + + LFD f16, 0 * SIZE(AO1) + LFD f17, 1 * SIZE(AO1) + LFD f18, 0 * SIZE(AO2) + LFD f19, 1 * SIZE(AO2) + + FMADD f8, f20, f24, f8 + FMADD f9, f20, f25, f9 + FMADD f10, f21, f24, f10 + FMADD f11, f21, f25, f11 + + FMADD f12, f22, f24, f12 + FMADD f13, f22, f25, f13 + FMADD f14, f23, f24, f14 + FMADD f15, f23, f25, f15 + + LFD f20, 0 * SIZE(AO3) + LFD f21, 1 * SIZE(AO3) + LFD f22, 0 * SIZE(AO4) + LFD f23, 1 * SIZE(AO4) + + LFD f24, 1 * SIZE(BO) + LFDU f25, 2 * SIZE(BO) + + addi AO1, AO1, 2 * SIZE + addi AO2, AO2, 2 * SIZE + addi AO3, AO3, 2 * SIZE + addi AO4, AO4, 2 * SIZE + + bdnz LL(MainN3Kernel) + .align 4 + +LL(MainN3KernelSkip): + FMADD f0, f16, f24, f0 + FMADD f1, f16, f25, f1 + FMADD f2, f17, f24, f2 + FMADD f3, f17, f25, f3 + + FMADD f4, f18, f24, f4 + FMADD f5, f18, f25, f5 + FMADD f6, f19, f24, f6 + FMADD f7, f19, f25, f7 + + FMADD f8, f20, f24, f8 + FMADD f9, f20, f25, f9 + FMADD f10, f21, f24, f10 + FMADD f11, f21, f25, f11 + + FMADD f12, f22, f24, f12 + FMADD f13, f22, f25, f13 + FMADD f14, f23, f24, f14 + FMADD f15, f23, f25, f15 + .align 4 + +LL(MainFinish): + lfd f30, ALPHA_R + lfd f31, ALPHA_I + +#ifndef XCONJ +#ifndef CONJ + FSUB f0, f0, f3 + FADD f1, f1, f2 + FSUB f4, f4, f7 + FADD f5, f5, f6 + FSUB f8, f8, f11 + FADD f9, f9, f10 + FSUB f12, f12, f15 + FADD f13, f13, f14 +#else + FADD f0, f0, f3 + FSUB f1, f1, f2 + FADD f4, f4, f7 + FSUB f5, f5, f6 + FADD f8, f8, f11 + FSUB f9, f9, f10 + FADD f12, f12, f15 + FSUB f13, f13, f14 +#endif +#else +#ifndef CONJ + FADD f0, f0, f3 + FSUB f1, f2, f1 + FADD f4, f4, f7 + FSUB f5, f6, f5 + FADD f8, f8, f11 + FSUB f9, f10, f9 + FADD f12, f12, f15 + FSUB f13, f14, f13 +#else + FSUB f0, f0, f3 + FADD f1, f1, f2 + FSUB f4, f4, f7 + FADD f5, f5, f6 + FSUB f8, f8, f11 + FADD f9, f9, f10 + FSUB f12, 
f12, f15 + FADD f13, f13, f14 +#endif +#endif + + mr BO, CO + cmpwi cr0, INCY, 2 * SIZE + bne LL(FinishN1) + + LFD f16, 0 * SIZE(CO) + LFD f17, 1 * SIZE(CO) + LFD f18, 2 * SIZE(CO) + LFD f19, 3 * SIZE(CO) + LFD f20, 4 * SIZE(CO) + LFD f21, 5 * SIZE(CO) + LFD f22, 6 * SIZE(CO) + LFD f23, 7 * SIZE(CO) + + FMADD f16, f30, f0, f16 + FMADDR f17, f30, f1, f17 + FMADD f18, f30, f4, f18 + FMADDR f19, f30, f5, f19 + + FMADD f20, f30, f8, f20 + FMADDR f21, f30, f9, f21 + FMADD f22, f30, f12, f22 + FMADDR f23, f30, f13, f23 + + FMSUBR f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FMSUBR f18, f31, f5, f18 + FMADD f19, f31, f4, f19 + + FMSUBR f20, f31, f9, f20 + FMADD f21, f31, f8, f21 + FMSUBR f22, f31, f13, f22 + FMADD f23, f31, f12, f23 + + STFD f16, 0 * SIZE(CO) + STFD f17, 1 * SIZE(CO) + STFD f18, 2 * SIZE(CO) + STFD f19, 3 * SIZE(CO) + + STFD f20, 4 * SIZE(CO) + STFD f21, 5 * SIZE(CO) + STFD f22, 6 * SIZE(CO) + STFD f23, 7 * SIZE(CO) + + addi CO, CO, 8 * SIZE + + addi J, J, -1 + cmpwi cr0, J, 0 + bgt LL(MainHead) + b LL(Remain) + .align 4 + +LL(FinishN1): + LFD f16, 0 * SIZE(CO) + LFD f17, 1 * SIZE(CO) + add CO, CO, INCY + + LFD f18, 0 * SIZE(CO) + LFD f19, 1 * SIZE(CO) + add CO, CO, INCY + + LFD f20, 0 * SIZE(CO) + LFD f21, 1 * SIZE(CO) + add CO, CO, INCY + + LFD f22, 0 * SIZE(CO) + LFD f23, 1 * SIZE(CO) + add CO, CO, INCY + + FMADD f16, f30, f0, f16 + FMADDR f17, f30, f1, f17 + FMADD f18, f30, f4, f18 + FMADDR f19, f30, f5, f19 + + FMADD f20, f30, f8, f20 + FMADDR f21, f30, f9, f21 + FMADD f22, f30, f12, f22 + FMADDR f23, f30, f13, f23 + + FMSUBR f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + FMSUBR f18, f31, f5, f18 + FMADD f19, f31, f4, f19 + + FMSUBR f20, f31, f9, f20 + FMADD f21, f31, f8, f21 + FMSUBR f22, f31, f13, f22 + FMADD f23, f31, f12, f23 + + STFD f16, 0 * SIZE(BO) + STFD f17, 1 * SIZE(BO) + add BO, BO, INCY + STFD f18, 0 * SIZE(BO) + STFD f19, 1 * SIZE(BO) + add BO, BO, INCY + + STFD f20, 0 * SIZE(BO) + STFD f21, 1 * SIZE(BO) + add BO, BO, INCY + STFD f22, 0 * SIZE(BO) + STFD f23, 1 * SIZE(BO) + + addi J, J, -1 + cmpwi cr0, J, 0 + bgt LL(MainHead) + .align 4 + +LL(Remain): + andi. J, N, 3 + ble LL(ISEnd) + .align 4 + +LL(RemainHead): + mr AO1, A + add A, A, LDA + mr BO, XP + lfd f0, FZERO + + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + srawi. 
r0 , MIN_N, 3 + mtspr CTR, r0 + ble LL(RemainN3) + + LFD f16, 0 * SIZE(AO1) + LFD f17, 1 * SIZE(AO1) + LFD f18, 2 * SIZE(AO1) + LFD f19, 3 * SIZE(AO1) + + LFD f20, 4 * SIZE(AO1) + LFD f21, 5 * SIZE(AO1) + LFD f22, 6 * SIZE(AO1) + LFD f23, 7 * SIZE(AO1) + + LFD f24, 1 * SIZE(BO) + LFD f25, 2 * SIZE(BO) + LFD f26, 3 * SIZE(BO) + LFD f27, 4 * SIZE(BO) + + LFD f28, 5 * SIZE(BO) + LFD f29, 6 * SIZE(BO) + LFD f30, 7 * SIZE(BO) + LFD f31, 8 * SIZE(BO) + + bdz LL(RemainKernelSkip) + .align 4 + +LL(RemainKernel): + FMADD f0, f16, f24, f0 + FMADD f1, f16, f25, f1 + FMADD f2, f17, f24, f2 + FMADD f3, f17, f25, f3 + + FMADD f4, f18, f26, f4 + FMADD f5, f18, f27, f5 + FMADD f6, f19, f26, f6 + FMADD f7, f19, f27, f7 + + LFD f16, 8 * SIZE(AO1) + LFD f17, 9 * SIZE(AO1) + LFD f18, 10 * SIZE(AO1) + LFD f19, 11 * SIZE(AO1) + + LFD f24, 9 * SIZE(BO) + LFD f25, 10 * SIZE(BO) + LFD f26, 11 * SIZE(BO) + LFD f27, 12 * SIZE(BO) + + FMADD f8, f20, f28, f8 + FMADD f9, f20, f29, f9 + FMADD f10, f21, f28, f10 + FMADD f11, f21, f29, f11 + + FMADD f12, f22, f30, f12 + FMADD f13, f22, f31, f13 + FMADD f14, f23, f30, f14 + FMADD f15, f23, f31, f15 + + LFD f20, 12 * SIZE(AO1) + LFD f21, 13 * SIZE(AO1) + LFD f22, 14 * SIZE(AO1) + LFD f23, 15 * SIZE(AO1) + + LFD f28, 13 * SIZE(BO) + LFD f29, 14 * SIZE(BO) + LFD f30, 15 * SIZE(BO) + LFD f31, 16 * SIZE(BO) + + FMADD f0, f16, f24, f0 + FMADD f1, f16, f25, f1 + FMADD f2, f17, f24, f2 + FMADD f3, f17, f25, f3 + + FMADD f4, f18, f26, f4 + FMADD f5, f18, f27, f5 + FMADD f6, f19, f26, f6 + FMADD f7, f19, f27, f7 + + LFD f16, 16 * SIZE(AO1) + LFD f17, 17 * SIZE(AO1) + LFD f18, 18 * SIZE(AO1) + LFD f19, 19 * SIZE(AO1) + + LFD f24, 17 * SIZE(BO) + LFD f25, 18 * SIZE(BO) + LFD f26, 19 * SIZE(BO) + LFD f27, 20 * SIZE(BO) + + FMADD f8, f20, f28, f8 + FMADD f9, f20, f29, f9 + FMADD f10, f21, f28, f10 + FMADD f11, f21, f29, f11 + + FMADD f12, f22, f30, f12 + FMADD f13, f22, f31, f13 + FMADD f14, f23, f30, f14 + FMADD f15, f23, f31, f15 + + LFD f20, 20 * SIZE(AO1) + LFD f21, 21 * SIZE(AO1) + LFD f22, 22 * SIZE(AO1) + LFD f23, 23 * SIZE(AO1) + + LFD f28, 21 * SIZE(BO) + LFD f29, 22 * SIZE(BO) + LFD f30, 23 * SIZE(BO) + LFD f31, 24 * SIZE(BO) + + addi AO1, AO1, 16 * SIZE + addi BO, BO, 16 * SIZE + + DCBT(AO1, PREA) + + bdnz LL(RemainKernel) + .align 4 + +LL(RemainKernelSkip): + FMADD f0, f16, f24, f0 + FMADD f1, f16, f25, f1 + FMADD f2, f17, f24, f2 + FMADD f3, f17, f25, f3 + + FMADD f4, f18, f26, f4 + FMADD f5, f18, f27, f5 + FMADD f6, f19, f26, f6 + FMADD f7, f19, f27, f7 + + LFD f16, 8 * SIZE(AO1) + LFD f17, 9 * SIZE(AO1) + LFD f18, 10 * SIZE(AO1) + LFD f19, 11 * SIZE(AO1) + + LFD f24, 9 * SIZE(BO) + LFD f25, 10 * SIZE(BO) + LFD f26, 11 * SIZE(BO) + LFD f27, 12 * SIZE(BO) + + FMADD f8, f20, f28, f8 + FMADD f9, f20, f29, f9 + FMADD f10, f21, f28, f10 + FMADD f11, f21, f29, f11 + + FMADD f12, f22, f30, f12 + FMADD f13, f22, f31, f13 + FMADD f14, f23, f30, f14 + FMADD f15, f23, f31, f15 + + LFD f20, 12 * SIZE(AO1) + LFD f21, 13 * SIZE(AO1) + LFD f22, 14 * SIZE(AO1) + LFD f23, 15 * SIZE(AO1) + + LFD f28, 13 * SIZE(BO) + LFD f29, 14 * SIZE(BO) + LFD f30, 15 * SIZE(BO) + LFDU f31, 16 * SIZE(BO) + + FMADD f0, f16, f24, f0 + FMADD f1, f16, f25, f1 + FMADD f2, f17, f24, f2 + FMADD f3, f17, f25, f3 + + FMADD f4, f18, f26, f4 + FMADD f5, f18, f27, f5 + FMADD f6, f19, f26, f6 + FMADD f7, f19, f27, f7 + + FMADD f8, f20, f28, f8 + FMADD f9, f20, f29, f9 + FMADD f10, f21, f28, f10 + FMADD f11, f21, f29, f11 + + FMADD f12, f22, f30, f12 + FMADD f13, f22, f31, f13 + FMADD f14, f23, f30, f14 + FMADD f15, 
f23, f31, f15 + + addi AO1, AO1, 16 * SIZE + .align 4 + +LL(RemainN3): + andi. r0, MIN_N, 7 + mtspr CTR, r0 + ble LL(RemainFinish) + .align 4 + + LFD f16, 0 * SIZE(AO1) + LFD f17, 1 * SIZE(AO1) + LFD f24, 1 * SIZE(BO) + LFDU f25, 2 * SIZE(BO) + addi AO1, AO1, 2 * SIZE + bdz LL(RemainN3KernelSkip) + .align 4 + +LL(RemainN3Kernel): + FMADD f0, f16, f24, f0 + FMADD f1, f16, f25, f1 + FMADD f2, f17, f24, f2 + FMADD f3, f17, f25, f3 + + LFD f16, 0 * SIZE(AO1) + LFD f17, 1 * SIZE(AO1) + LFD f24, 1 * SIZE(BO) + LFDU f25, 2 * SIZE(BO) + addi AO1, AO1, 2 * SIZE + bdnz LL(RemainN3Kernel) + .align 4 + +LL(RemainN3KernelSkip): + FMADD f0, f16, f24, f0 + FMADD f1, f16, f25, f1 + FMADD f2, f17, f24, f2 + FMADD f3, f17, f25, f3 + .align 4 + +LL(RemainFinish): + lfd f30, ALPHA_R + lfd f31, ALPHA_I + LFD f16, 0 * SIZE(CO) + LFD f17, 1 * SIZE(CO) + + FADD f0, f0, f4 + FADD f1, f1, f5 + FADD f2, f2, f6 + FADD f3, f3, f7 + + FADD f8, f8, f12 + FADD f9, f9, f13 + FADD f10, f10, f14 + FADD f11, f11, f15 + + FADD f0, f0, f8 + FADD f1, f1, f9 + FADD f2, f2, f10 + FADD f3, f3, f11 + +#ifndef XCONJ +#ifndef CONJ + FSUB f0, f0, f3 + FADD f1, f1, f2 +#else + FADD f0, f0, f3 + FSUB f1, f1, f2 +#endif +#else +#ifndef CONJ + FADD f0, f0, f3 + FSUB f1, f2, f1 +#else + FSUB f0, f0, f3 + FADD f1, f1, f2 +#endif +#endif + + FMADD f16, f30, f0, f16 + FMADDR f17, f30, f1, f17 + FMSUBR f16, f31, f1, f16 + FMADD f17, f31, f0, f17 + + STFD f16, 0 * SIZE(CO) + STFD f17, 1 * SIZE(CO) + add CO, CO, INCY + + addi J, J, -1 + cmpi cr0, 0, J, 0 + bgt LL(RemainHead) + .align 4 + +LL(ISEnd): + subf A, PLDA_M, A + addi IS, IS, P + + cmp cr0, 0, IS, M + blt LL(ISLoop) + .align 4 + +LL(End): + li r3, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r14, 144(SP) + ld r15, 152(SP) + ld r16, 160(SP) + ld r17, 168(SP) + ld r18, 176(SP) + ld r19, 184(SP) + ld r20, 192(SP) + ld r21, 200(SP) + ld r22, 208(SP) + ld r23, 216(SP) + ld r24, 224(SP) + ld r25, 232(SP) +#else + lwz r14, 144(SP) + lwz r15, 148(SP) + lwz r16, 152(SP) + lwz r17, 156(SP) + lwz r18, 160(SP) + lwz r19, 164(SP) + lwz r20, 168(SP) + lwz r21, 172(SP) + lwz r22, 176(SP) + lwz r23, 180(SP) + lwz r24, 184(SP) + lwz r25, 188(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE + +#endif diff --git a/kernel/power/zgemv_t_ppc440.S b/kernel/power/zgemv_t_ppc440.S new file mode 100644 index 0000000000..edb5183fc2 --- /dev/null +++ b/kernel/power/zgemv_t_ppc440.S @@ -0,0 +1,1294 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define P 1024 + +#ifndef __64BIT__ +#define STACKSIZE 224 +#else +#define STACKSIZE 304 +#endif + +#ifdef linux +#ifndef __64BIT__ +#define M r3 +#define N r4 +#define A r6 +#define LDA r7 +#define X r8 +#define INCX r9 +#define Y r10 +#define INCY r5 +#else +#define M r3 +#define N r4 +#define A r8 +#define LDA r9 +#define X r10 +#define INCX r5 +#define Y r6 +#define INCY r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define M r3 +#define N r4 +#define A r10 +#define LDA r5 +#define X r6 +#define INCX r7 +#define Y r8 +#define INCY r9 +#else +#define M r3 +#define N r4 +#define A r8 +#define LDA r9 +#define X r10 +#define INCX r5 +#define Y r6 +#define INCY r7 +#endif +#endif + +#define BUFFER r11 +#define XP r12 +#define X1 r14 +#define J r15 +#define AO1 r16 +#define AO2 r17 +#define AO3 r18 +#define AO4 r19 +#define PREA r20 +#define PREC r21 +#define YY r22 + +#if defined(PPCG4) +#define PREFETCHSIZE_A (3 * 8) +#define PREFETCHSIZE_C 7 +#endif + +#if defined(POWER6) +#define PREFETCHSIZE_A (3 * 8) +#define PREFETCHSIZE_C 7 +#endif + +#if !(defined(CONJ) && defined(XCONJ)) +#define FMADDR FMADD +#define FMSUBR FNMSUB +#else +#define FMADDR FNMSUB +#define FMSUBR FMADD +#endif + +#ifndef NEEDPARAM + +#ifndef __64BIT__ +#define FZERO 200(SP) +#else +#define FZERO 256(SP) +#endif + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r14, 144(SP) + std r15, 152(SP) + std r16, 160(SP) + std r17, 168(SP) + std r18, 176(SP) + std r19, 184(SP) + std r20, 192(SP) + std r21, 200(SP) + std r22, 208(SP) + std r0, FZERO +#else + stw r14, 144(SP) + stw r15, 148(SP) + stw r16, 152(SP) + stw r17, 156(SP) + stw r18, 160(SP) + stw r19, 164(SP) + stw r20, 168(SP) + stw r21, 172(SP) + stw r22, 176(SP) + stw r0, FZERO + stw r0, 4 + FZERO +#endif + +#ifdef linux +#ifndef __64BIT__ + lwz INCY, 8 + STACKSIZE(SP) + lwz BUFFER, 12 + 
STACKSIZE(SP) +#else + ld INCX, 112 + STACKSIZE(SP) + ld Y, 120 + STACKSIZE(SP) + ld INCY, 128 + STACKSIZE(SP) + ld BUFFER, 136 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifndef __64BIT__ +#ifdef DOUBLE + lwz LDA, 56 + STACKSIZE(SP) + lwz X, 60 + STACKSIZE(SP) + lwz INCX, 64 + STACKSIZE(SP) + lwz Y, 68 + STACKSIZE(SP) + lwz INCY, 72 + STACKSIZE(SP) + lwz BUFFER, 76 + STACKSIZE(SP) +#else + lwz INCX, 56 + STACKSIZE(SP) + lwz Y, 60 + STACKSIZE(SP) + lwz INCY, 64 + STACKSIZE(SP) + lwz BUFFER, 68 + STACKSIZE(SP) +#endif +#else + ld INCX, 112 + STACKSIZE(SP) + ld Y, 120 + STACKSIZE(SP) + ld INCY, 128 + STACKSIZE(SP) + ld BUFFER, 136 + STACKSIZE(SP) +#endif +#endif + +#ifndef XCONJ +#ifndef CONJ +#define FMADD1 FMADD +#define FMADD2 FMADD +#define FMADD3 FNMSUB +#define FMADD4 FMADD +#else +#define FMADD1 FMADD +#define FMADD2 FMADD +#define FMADD3 FMADD +#define FMADD4 FNMSUB +#endif +#else +#ifndef CONJ +#define FMADD1 FMADD +#define FMADD2 FNMSUB +#define FMADD3 FMADD +#define FMADD4 FMADD +#else +#define FMADD1 FMADD +#define FMADD2 FMADD +#define FMADD3 FNMSUB +#define FMADD4 FMADD +#endif +#endif + +#define y1 f0 +#define y2 f1 +#define y3 f2 +#define y4 f3 +#define y5 f4 +#define y6 f5 +#define y7 f6 +#define y8 f7 + +#define a1 f8 +#define a2 f9 +#define a3 f10 +#define a4 f11 +#define a5 f12 +#define a6 f13 +#define a7 f14 +#define a8 f15 + +#define b1 f16 +#define b2 f17 +#define b3 f18 +#define b4 f19 +#define b5 f20 +#define b6 f21 +#define b7 f22 +#define b8 f23 + +#define alpha_r f24 +#define alpha_i f25 + + fmr alpha_r, f1 + fmr alpha_i, f2 + + slwi LDA, LDA, ZBASE_SHIFT + slwi INCX, INCX, ZBASE_SHIFT + slwi INCY, INCY, ZBASE_SHIFT + + li PREA, PREFETCHSIZE_A * SIZE + li PREC, PREFETCHSIZE_C * SIZE + + addi A, A, -SIZE + addi INCX, INCX, -SIZE + addi INCY, INCY, -SIZE + + sub X, X, INCX + sub Y, Y, INCY + + mr YY, Y + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + + mr XP, X + cmpwi cr0, INCX, SIZE + beq LL(10) + + addi XP, BUFFER, -SIZE + addi X1, BUFFER, -SIZE + + srawi. r0, M, 2 + mtspr CTR, r0 + ble LL(05) + .align 4 + +LL(02): + LFDUX f0, X, INCX + LFDU f1, 1 * SIZE(X) + LFDUX f2, X, INCX + LFDU f3, 1 * SIZE(X) + LFDUX f4, X, INCX + LFDU f5, 1 * SIZE(X) + LFDUX f6, X, INCX + LFDU f7, 1 * SIZE(X) + + STFDU f0, 1 * SIZE(X1) + STFDU f1, 1 * SIZE(X1) + STFDU f2, 1 * SIZE(X1) + STFDU f3, 1 * SIZE(X1) + STFDU f4, 1 * SIZE(X1) + STFDU f5, 1 * SIZE(X1) + STFDU f6, 1 * SIZE(X1) + STFDU f7, 1 * SIZE(X1) + bdnz LL(02) + .align 4 + +LL(05): + andi. r0, M, 3 + mtspr CTR, r0 + ble LL(10) + .align 4 + +LL(06): + LFDUX f0, X, INCX + LFDU f1, 1 * SIZE(X) + STFDU f0, 1 * SIZE(X1) + STFDU f1, 1 * SIZE(X1) + bdnz LL(06) + .align 4 + +LL(10): + srawi. J, N, 2 + ble LL(20) + .align 4 + +LL(11): + lfd y1, FZERO + mr AO1, A + fmr y2, y1 + mr X1, XP + fmr y3, y1 + add AO2, A, LDA + fmr y4, y1 + add AO3, AO2, LDA + fmr y5, y1 + add AO4, AO3, LDA + fmr y6, y1 + add A, AO4, LDA + fmr y7, y1 + + dcbtst PREC, Y + fmr y8, y1 + + srawi. 
r0, M, 2 + mtspr CTR, r0 + ble LL(15) + + LFDU a1, 1 * SIZE(AO1) + LFDU b1, 1 * SIZE(X1) + LFDU a2, 1 * SIZE(AO1) + LFDU b2, 1 * SIZE(X1) + LFDU a3, 1 * SIZE(AO2) + LFDU a4, 1 * SIZE(AO2) + LFDU a5, 1 * SIZE(AO3) + LFDU a6, 1 * SIZE(AO3) + LFDU a7, 1 * SIZE(AO4) + bdz LL(13) + .align 5 + +LL(12): + FMADD1 y1, a1, b1, y1 + LFDU a8, 1 * SIZE(AO4) + FMADD2 y2, a1, b2, y2 + LFDU b3, 1 * SIZE(X1) + FMADD1 y3, a3, b1, y3 + LFDU b4, 1 * SIZE(X1) + FMADD2 y4, a3, b2, y4 + +#ifdef PPCG4 + dcbt AO1, PREA +#endif + + FMADD3 y1, a2, b2, y1 + LFDU a1, 1 * SIZE(AO1) + FMADD4 y2, a2, b1, y2 + LFDU a2, 1 * SIZE(AO1) + FMADD3 y3, a4, b2, y3 + LFDU a3, 1 * SIZE(AO2) + FMADD4 y4, a4, b1, y4 + LFDU a4, 1 * SIZE(AO2) + +#ifdef PPCG4 + dcbt X1, PREA +#endif + + FMADD1 y5, a5, b1, y5 + FMADD2 y6, a5, b2, y6 + FMADD1 y7, a7, b1, y7 + FMADD2 y8, a7, b2, y8 + +#ifdef PPCG4 + dcbt AO2, PREA +#endif + + FMADD3 y5, a6, b2, y5 + LFDU a5, 1 * SIZE(AO3) + FMADD4 y6, a6, b1, y6 + LFDU a6, 1 * SIZE(AO3) + FMADD3 y7, a8, b2, y7 + LFDU a7, 1 * SIZE(AO4) + FMADD4 y8, a8, b1, y8 + LFDU a8, 1 * SIZE(AO4) + + + FMADD1 y1, a1, b3, y1 + LFDU b1, 1 * SIZE(X1) + FMADD2 y2, a1, b4, y2 + LFDU b2, 1 * SIZE(X1) + FMADD1 y3, a3, b3, y3 + FMADD2 y4, a3, b4, y4 + +#ifdef PPCG4 + dcbt AO3, PREA +#endif + + FMADD3 y1, a2, b4, y1 + LFDU a1, 1 * SIZE(AO1) + FMADD4 y2, a2, b3, y2 + LFDU a2, 1 * SIZE(AO1) + FMADD3 y3, a4, b4, y3 + LFDU a3, 1 * SIZE(AO2) + FMADD4 y4, a4, b3, y4 + LFDU a4, 1 * SIZE(AO2) + + FMADD1 y5, a5, b3, y5 + FMADD2 y6, a5, b4, y6 + FMADD1 y7, a7, b3, y7 + FMADD2 y8, a7, b4, y8 + +#ifdef PPCG4 + dcbt AO4, PREA +#endif + + FMADD3 y5, a6, b4, y5 + LFDU a5, 1 * SIZE(AO3) + FMADD4 y6, a6, b3, y6 + LFDU a6, 1 * SIZE(AO3) + FMADD3 y7, a8, b4, y7 + LFDU a7, 1 * SIZE(AO4) + FMADD4 y8, a8, b3, y8 + LFDU a8, 1 * SIZE(AO4) + + FMADD1 y1, a1, b1, y1 + LFDU b3, 1 * SIZE(X1) + FMADD2 y2, a1, b2, y2 + LFDU b4, 1 * SIZE(X1) + FMADD1 y3, a3, b1, y3 + FMADD2 y4, a3, b2, y4 + +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO1, PREA +#endif + + FMADD3 y1, a2, b2, y1 + LFDU a1, 1 * SIZE(AO1) + FMADD4 y2, a2, b1, y2 + LFDU a2, 1 * SIZE(AO1) + FMADD3 y3, a4, b2, y3 + LFDU a3, 1 * SIZE(AO2) + FMADD4 y4, a4, b1, y4 + LFDU a4, 1 * SIZE(AO2) + +#if defined(PPCG4) && defined(DOUBLE) + dcbt X1, PREA +#endif + + FMADD1 y5, a5, b1, y5 + FMADD2 y6, a5, b2, y6 + FMADD1 y7, a7, b1, y7 + FMADD2 y8, a7, b2, y8 + +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO2, PREA +#endif + + FMADD3 y5, a6, b2, y5 + LFDU a5, 1 * SIZE(AO3) + FMADD4 y6, a6, b1, y6 + LFDU a6, 1 * SIZE(AO3) + FMADD3 y7, a8, b2, y7 + LFDU a7, 1 * SIZE(AO4) + FMADD4 y8, a8, b1, y8 + LFDU a8, 1 * SIZE(AO4) + + FMADD1 y1, a1, b3, y1 + FMADD2 y2, a1, b4, y2 + FMADD1 y3, a3, b3, y3 + FMADD2 y4, a3, b4, y4 + +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO3, PREA +#endif + + FMADD3 y1, a2, b4, y1 + LFDU a1, 1 * SIZE(AO1) + FMADD4 y2, a2, b3, y2 + LFDU a2, 1 * SIZE(AO1) + FMADD3 y3, a4, b4, y3 + LFDU a3, 1 * SIZE(AO2) + FMADD4 y4, a4, b3, y4 + LFDU a4, 1 * SIZE(AO2) + + FMADD1 y5, a5, b3, y5 + LFDU b1, 1 * SIZE(X1) + FMADD2 y6, a5, b4, y6 + LFDU b2, 1 * SIZE(X1) + FMADD1 y7, a7, b3, y7 + FMADD2 y8, a7, b4, y8 + +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO4, PREA +#endif + + FMADD3 y5, a6, b4, y5 + LFDU a5, 1 * SIZE(AO3) + FMADD4 y6, a6, b3, y6 + LFDU a6, 1 * SIZE(AO3) + FMADD3 y7, a8, b4, y7 + LFDU a7, 1 * SIZE(AO4) + FMADD4 y8, a8, b3, y8 + bdnz LL(12) + .align 4 + +LL(13): + FMADD1 y1, a1, b1, y1 + LFDU a8, 1 * SIZE(AO4) + FMADD2 y2, a1, b2, y2 + LFDU b3, 1 * SIZE(X1) + FMADD1 y3, a3, b1, y3 + LFDU 
b4, 1 * SIZE(X1) + FMADD2 y4, a3, b2, y4 + + FMADD3 y1, a2, b2, y1 + LFDU a1, 1 * SIZE(AO1) + FMADD4 y2, a2, b1, y2 + LFDU a2, 1 * SIZE(AO1) + FMADD3 y3, a4, b2, y3 + LFDU a3, 1 * SIZE(AO2) + FMADD4 y4, a4, b1, y4 + LFDU a4, 1 * SIZE(AO2) + + FMADD1 y5, a5, b1, y5 + FMADD2 y6, a5, b2, y6 + FMADD1 y7, a7, b1, y7 + FMADD2 y8, a7, b2, y8 + + FMADD3 y5, a6, b2, y5 + LFDU a5, 1 * SIZE(AO3) + FMADD4 y6, a6, b1, y6 + LFDU a6, 1 * SIZE(AO3) + FMADD3 y7, a8, b2, y7 + LFDU a7, 1 * SIZE(AO4) + FMADD4 y8, a8, b1, y8 + LFDU a8, 1 * SIZE(AO4) + + FMADD1 y1, a1, b3, y1 + LFDU b1, 1 * SIZE(X1) + FMADD2 y2, a1, b4, y2 + LFDU b2, 1 * SIZE(X1) + FMADD1 y3, a3, b3, y3 + FMADD2 y4, a3, b4, y4 + + FMADD3 y1, a2, b4, y1 + LFDU a1, 1 * SIZE(AO1) + FMADD4 y2, a2, b3, y2 + LFDU a2, 1 * SIZE(AO1) + FMADD3 y3, a4, b4, y3 + LFDU a3, 1 * SIZE(AO2) + FMADD4 y4, a4, b3, y4 + LFDU a4, 1 * SIZE(AO2) + + FMADD1 y5, a5, b3, y5 + FMADD2 y6, a5, b4, y6 + FMADD1 y7, a7, b3, y7 + FMADD2 y8, a7, b4, y8 + + FMADD3 y5, a6, b4, y5 + LFDU a5, 1 * SIZE(AO3) + FMADD4 y6, a6, b3, y6 + LFDU a6, 1 * SIZE(AO3) + FMADD3 y7, a8, b4, y7 + LFDU a7, 1 * SIZE(AO4) + FMADD4 y8, a8, b3, y8 + LFDU a8, 1 * SIZE(AO4) + + FMADD1 y1, a1, b1, y1 + LFDU b3, 1 * SIZE(X1) + FMADD2 y2, a1, b2, y2 + LFDU b4, 1 * SIZE(X1) + FMADD1 y3, a3, b1, y3 + FMADD2 y4, a3, b2, y4 + + FMADD3 y1, a2, b2, y1 + LFDU a1, 1 * SIZE(AO1) + FMADD4 y2, a2, b1, y2 + LFDU a2, 1 * SIZE(AO1) + FMADD3 y3, a4, b2, y3 + LFDU a3, 1 * SIZE(AO2) + FMADD4 y4, a4, b1, y4 + LFDU a4, 1 * SIZE(AO2) + + FMADD1 y5, a5, b1, y5 + FMADD2 y6, a5, b2, y6 + FMADD1 y7, a7, b1, y7 + FMADD2 y8, a7, b2, y8 + + FMADD3 y5, a6, b2, y5 + LFDU a5, 1 * SIZE(AO3) + FMADD4 y6, a6, b1, y6 + LFDU a6, 1 * SIZE(AO3) + FMADD3 y7, a8, b2, y7 + LFDU a7, 1 * SIZE(AO4) + FMADD4 y8, a8, b1, y8 + LFDU a8, 1 * SIZE(AO4) + + FMADD1 y1, a1, b3, y1 + FMADD2 y2, a1, b4, y2 + FMADD1 y3, a3, b3, y3 + FMADD2 y4, a3, b4, y4 + + FMADD3 y1, a2, b4, y1 + FMADD4 y2, a2, b3, y2 + FMADD3 y3, a4, b4, y3 + FMADD4 y4, a4, b3, y4 + + FMADD1 y5, a5, b3, y5 + FMADD2 y6, a5, b4, y6 + FMADD1 y7, a7, b3, y7 + FMADD2 y8, a7, b4, y8 + + FMADD3 y5, a6, b4, y5 + FMADD4 y6, a6, b3, y6 + FMADD3 y7, a8, b4, y7 + FMADD4 y8, a8, b3, y8 + .align 4 + +LL(15): + andi. 
r0, M, 2 + ble LL(17) + + LFDU a1, 1 * SIZE(AO1) + LFDU b1, 1 * SIZE(X1) + LFDU a2, 1 * SIZE(AO1) + LFDU b2, 1 * SIZE(X1) + LFDU a3, 1 * SIZE(AO2) + LFDU b3, 1 * SIZE(X1) + LFDU a4, 1 * SIZE(AO2) + LFDU b4, 1 * SIZE(X1) + + FMADD1 y1, a1, b1, y1 + LFDU a5, 1 * SIZE(AO3) + FMADD2 y2, a1, b2, y2 + LFDU a6, 1 * SIZE(AO3) + FMADD1 y3, a3, b1, y3 + LFDU a7, 1 * SIZE(AO4) + FMADD2 y4, a3, b2, y4 + LFDU a8, 1 * SIZE(AO4) + + FMADD3 y1, a2, b2, y1 + LFDU a1, 1 * SIZE(AO1) + FMADD4 y2, a2, b1, y2 + LFDU a2, 1 * SIZE(AO1) + FMADD3 y3, a4, b2, y3 + LFDU a3, 1 * SIZE(AO2) + FMADD4 y4, a4, b1, y4 + LFDU a4, 1 * SIZE(AO2) + + FMADD1 y5, a5, b1, y5 + FMADD2 y6, a5, b2, y6 + FMADD1 y7, a7, b1, y7 + FMADD2 y8, a7, b2, y8 + + FMADD3 y5, a6, b2, y5 + LFDU a5, 1 * SIZE(AO3) + FMADD4 y6, a6, b1, y6 + LFDU a6, 1 * SIZE(AO3) + FMADD3 y7, a8, b2, y7 + LFDU a7, 1 * SIZE(AO4) + FMADD4 y8, a8, b1, y8 + LFDU a8, 1 * SIZE(AO4) + + FMADD1 y1, a1, b3, y1 + FMADD2 y2, a1, b4, y2 + FMADD1 y3, a3, b3, y3 + FMADD2 y4, a3, b4, y4 + + FMADD3 y1, a2, b4, y1 + FMADD4 y2, a2, b3, y2 + FMADD3 y3, a4, b4, y3 + FMADD4 y4, a4, b3, y4 + + FMADD1 y5, a5, b3, y5 + FMADD2 y6, a5, b4, y6 + FMADD1 y7, a7, b3, y7 + FMADD2 y8, a7, b4, y8 + + FMADD3 y5, a6, b4, y5 + FMADD4 y6, a6, b3, y6 + FMADD3 y7, a8, b4, y7 + FMADD4 y8, a8, b3, y8 + .align 4 + +LL(17): + andi. r0, M, 1 + ble LL(19) + + LFDU a1, 1 * SIZE(AO1) + LFDU a2, 1 * SIZE(AO1) + LFDU a3, 1 * SIZE(AO2) + LFDU a4, 1 * SIZE(AO2) + LFDU a5, 1 * SIZE(AO3) + LFDU a6, 1 * SIZE(AO3) + LFDU a7, 1 * SIZE(AO4) + LFDU a8, 1 * SIZE(AO4) + + LFDU b1, 1 * SIZE(X1) + LFDU b2, 1 * SIZE(X1) + + FMADD1 y1, a1, b1, y1 + FMADD2 y2, a1, b2, y2 + FMADD1 y3, a3, b1, y3 + FMADD2 y4, a3, b2, y4 + + FMADD3 y1, a2, b2, y1 + FMADD4 y2, a2, b1, y2 + FMADD3 y3, a4, b2, y3 + FMADD4 y4, a4, b1, y4 + + FMADD1 y5, a5, b1, y5 + FMADD2 y6, a5, b2, y6 + FMADD1 y7, a7, b1, y7 + FMADD2 y8, a7, b2, y8 + + FMADD3 y5, a6, b2, y5 + FMADD4 y6, a6, b1, y6 + FMADD3 y7, a8, b2, y7 + FMADD4 y8, a8, b1, y8 + .align 4 + +LL(19): + LFDUX b1, Y, INCY + LFDU b2, 1 * SIZE(Y) + LFDUX b3, Y, INCY + LFDU b4, 1 * SIZE(Y) + LFDUX b5, Y, INCY + LFDU b6, 1 * SIZE(Y) + LFDUX b7, Y, INCY + LFDU b8, 1 * SIZE(Y) + + FMADD b1, alpha_r, y1, b1 + FMADDR b2, alpha_r, y2, b2 + FMADD b3, alpha_r, y3, b3 + FMADDR b4, alpha_r, y4, b4 + + FMADD b5, alpha_r, y5, b5 + FMADDR b6, alpha_r, y6, b6 + FMADD b7, alpha_r, y7, b7 + FMADDR b8, alpha_r, y8, b8 + + FMSUBR b1, alpha_i, y2, b1 + FMADD b2, alpha_i, y1, b2 + FMSUBR b3, alpha_i, y4, b3 + FMADD b4, alpha_i, y3, b4 + + FMSUBR b5, alpha_i, y6, b5 + FMADD b6, alpha_i, y5, b6 + FMSUBR b7, alpha_i, y8, b7 + FMADD b8, alpha_i, y7, b8 + + STFDUX b1, YY, INCY + STFDU b2, 1 * SIZE(YY) + STFDUX b3, YY, INCY + STFDU b4, 1 * SIZE(YY) + + STFDUX b5, YY, INCY + STFDU b6, 1 * SIZE(YY) + STFDUX b7, YY, INCY + STFDU b8, 1 * SIZE(YY) + + addi J, J, -1 + cmpwi cr0, J, 0 + bgt LL(11) + .align 4 + +LL(20): + andi. J, N, 2 + ble LL(30) + + lfd y1, FZERO + mr AO1, A + fmr y2, y1 + mr X1, XP + fmr y3, y1 + add AO2, A, LDA + fmr y4, y1 + add A, AO2, LDA + + srawi. 
r0, M, 2 + mtspr CTR, r0 + ble LL(25) + + LFDU a1, 1 * SIZE(AO1) + LFDU b1, 1 * SIZE(X1) + LFDU a2, 1 * SIZE(AO1) + LFDU b2, 1 * SIZE(X1) + LFDU a3, 1 * SIZE(AO2) + bdz LL(23) + .align 5 + +LL(22): + FMADD1 y1, a1, b1, y1 + LFDU a4, 1 * SIZE(AO2) + FMADD2 y2, a1, b2, y2 + LFDU b3, 1 * SIZE(X1) + FMADD1 y3, a3, b1, y3 + LFDU b4, 1 * SIZE(X1) + FMADD2 y4, a3, b2, y4 + +#ifdef PPCG4 + dcbt AO1, PREA +#endif + + FMADD3 y1, a2, b2, y1 + LFDU a1, 1 * SIZE(AO1) + FMADD4 y2, a2, b1, y2 + LFDU a2, 1 * SIZE(AO1) + FMADD3 y3, a4, b2, y3 + LFDU a3, 1 * SIZE(AO2) + FMADD4 y4, a4, b1, y4 + LFDU a4, 1 * SIZE(AO2) + +#ifdef PPCG4 + dcbt AO2, PREA +#endif + + FMADD1 y1, a1, b3, y1 + LFDU b1, 1 * SIZE(X1) + FMADD2 y2, a1, b4, y2 + LFDU b2, 1 * SIZE(X1) + FMADD1 y3, a3, b3, y3 + FMADD2 y4, a3, b4, y4 + +#ifdef PPCG4 + dcbt X1, PREA +#endif + + FMADD3 y1, a2, b4, y1 + LFDU a1, 1 * SIZE(AO1) + FMADD4 y2, a2, b3, y2 + LFDU a2, 1 * SIZE(AO1) + FMADD3 y3, a4, b4, y3 + LFDU a3, 1 * SIZE(AO2) + FMADD4 y4, a4, b3, y4 + LFDU a4, 1 * SIZE(AO2) + + FMADD1 y1, a1, b1, y1 + LFDU b3, 1 * SIZE(X1) + FMADD2 y2, a1, b2, y2 + LFDU b4, 1 * SIZE(X1) + FMADD1 y3, a3, b1, y3 + FMADD2 y4, a3, b2, y4 + +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO1, PREA +#endif + + FMADD3 y1, a2, b2, y1 + LFDU a1, 1 * SIZE(AO1) + FMADD4 y2, a2, b1, y2 + LFDU a2, 1 * SIZE(AO1) + FMADD3 y3, a4, b2, y3 + LFDU a3, 1 * SIZE(AO2) + FMADD4 y4, a4, b1, y4 + LFDU a4, 1 * SIZE(AO2) + +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO2, PREA +#endif + + FMADD1 y1, a1, b3, y1 + LFDU b1, 1 * SIZE(X1) + FMADD2 y2, a1, b4, y2 + LFDU b2, 1 * SIZE(X1) + FMADD1 y3, a3, b3, y3 + FMADD2 y4, a3, b4, y4 + +#if defined(PPCG4) && defined(DOUBLE) + dcbt X1, PREA +#endif + + FMADD3 y1, a2, b4, y1 + LFDU a1, 1 * SIZE(AO1) + FMADD4 y2, a2, b3, y2 + LFDU a2, 1 * SIZE(AO1) + FMADD3 y3, a4, b4, y3 + LFDU a3, 1 * SIZE(AO2) + FMADD4 y4, a4, b3, y4 + + bdnz LL(22) + .align 4 + +LL(23): + FMADD1 y1, a1, b1, y1 + LFDU a4, 1 * SIZE(AO2) + FMADD2 y2, a1, b2, y2 + LFDU b3, 1 * SIZE(X1) + FMADD1 y3, a3, b1, y3 + LFDU b4, 1 * SIZE(X1) + FMADD2 y4, a3, b2, y4 + + FMADD3 y1, a2, b2, y1 + LFDU a1, 1 * SIZE(AO1) + FMADD4 y2, a2, b1, y2 + LFDU a2, 1 * SIZE(AO1) + FMADD3 y3, a4, b2, y3 + LFDU a3, 1 * SIZE(AO2) + FMADD4 y4, a4, b1, y4 + LFDU a4, 1 * SIZE(AO2) + + FMADD1 y1, a1, b3, y1 + LFDU b1, 1 * SIZE(X1) + FMADD2 y2, a1, b4, y2 + LFDU b2, 1 * SIZE(X1) + FMADD1 y3, a3, b3, y3 + FMADD2 y4, a3, b4, y4 + + FMADD3 y1, a2, b4, y1 + LFDU a1, 1 * SIZE(AO1) + FMADD4 y2, a2, b3, y2 + LFDU a2, 1 * SIZE(AO1) + FMADD3 y3, a4, b4, y3 + LFDU a3, 1 * SIZE(AO2) + FMADD4 y4, a4, b3, y4 + LFDU a4, 1 * SIZE(AO2) + + FMADD1 y1, a1, b1, y1 + LFDU b3, 1 * SIZE(X1) + FMADD2 y2, a1, b2, y2 + LFDU b4, 1 * SIZE(X1) + FMADD1 y3, a3, b1, y3 + FMADD2 y4, a3, b2, y4 + + FMADD3 y1, a2, b2, y1 + LFDU a1, 1 * SIZE(AO1) + FMADD4 y2, a2, b1, y2 + LFDU a2, 1 * SIZE(AO1) + FMADD3 y3, a4, b2, y3 + LFDU a3, 1 * SIZE(AO2) + FMADD4 y4, a4, b1, y4 + LFDU a4, 1 * SIZE(AO2) + + FMADD1 y1, a1, b3, y1 + FMADD2 y2, a1, b4, y2 + FMADD1 y3, a3, b3, y3 + FMADD2 y4, a3, b4, y4 + + FMADD3 y1, a2, b4, y1 + FMADD4 y2, a2, b3, y2 + FMADD3 y3, a4, b4, y3 + FMADD4 y4, a4, b3, y4 + .align 4 + +LL(25): + andi. 
r0, M, 2 + ble LL(27) + + LFDU a1, 1 * SIZE(AO1) + LFDU b1, 1 * SIZE(X1) + LFDU a2, 1 * SIZE(AO1) + LFDU b2, 1 * SIZE(X1) + LFDU a3, 1 * SIZE(AO2) + LFDU b3, 1 * SIZE(X1) + LFDU a4, 1 * SIZE(AO2) + LFDU b4, 1 * SIZE(X1) + + FMADD1 y1, a1, b1, y1 + FMADD2 y2, a1, b2, y2 + FMADD1 y3, a3, b1, y3 + FMADD2 y4, a3, b2, y4 + + FMADD3 y1, a2, b2, y1 + LFDU a1, 1 * SIZE(AO1) + FMADD4 y2, a2, b1, y2 + LFDU a2, 1 * SIZE(AO1) + FMADD3 y3, a4, b2, y3 + LFDU a3, 1 * SIZE(AO2) + FMADD4 y4, a4, b1, y4 + LFDU a4, 1 * SIZE(AO2) + + FMADD1 y1, a1, b3, y1 + FMADD2 y2, a1, b4, y2 + FMADD1 y3, a3, b3, y3 + FMADD2 y4, a3, b4, y4 + + FMADD3 y1, a2, b4, y1 + FMADD4 y2, a2, b3, y2 + FMADD3 y3, a4, b4, y3 + FMADD4 y4, a4, b3, y4 + .align 4 + +LL(27): + andi. r0, M, 1 + ble LL(29) + + LFDU a1, 1 * SIZE(AO1) + LFDU a2, 1 * SIZE(AO1) + LFDU a3, 1 * SIZE(AO2) + LFDU a4, 1 * SIZE(AO2) + + LFDU b1, 1 * SIZE(X1) + LFDU b2, 1 * SIZE(X1) + + FMADD1 y1, a1, b1, y1 + FMADD2 y2, a1, b2, y2 + FMADD1 y3, a3, b1, y3 + FMADD2 y4, a3, b2, y4 + + FMADD3 y1, a2, b2, y1 + FMADD4 y2, a2, b1, y2 + FMADD3 y3, a4, b2, y3 + FMADD4 y4, a4, b1, y4 + .align 4 + +LL(29): + LFDUX b1, Y, INCY + LFDU b2, 1 * SIZE(Y) + LFDUX b3, Y, INCY + LFDU b4, 1 * SIZE(Y) + + FMADD b1, alpha_r, y1, b1 + FMADDR b2, alpha_r, y2, b2 + FMADD b3, alpha_r, y3, b3 + FMADDR b4, alpha_r, y4, b4 + + FMSUBR b1, alpha_i, y2, b1 + FMADD b2, alpha_i, y1, b2 + FMSUBR b3, alpha_i, y4, b3 + FMADD b4, alpha_i, y3, b4 + + STFDUX b1, YY, INCY + STFDU b2, 1 * SIZE(YY) + STFDUX b3, YY, INCY + STFDU b4, 1 * SIZE(YY) + .align 4 + +LL(30): + andi. J, N, 1 + ble LL(999) + + lfd y1, FZERO + mr AO1, A + fmr y2, y1 + mr X1, XP + fmr y3, y1 + fmr y4, y1 + add A, A, LDA + + srawi. r0, M, 2 + mtspr CTR, r0 + ble LL(35) + + LFDU a1, 1 * SIZE(AO1) + LFDU b1, 1 * SIZE(X1) + LFDU a2, 1 * SIZE(AO1) + LFDU b2, 1 * SIZE(X1) + bdz LL(33) + .align 5 + +LL(32): + FMADD1 y1, a1, b1, y1 + LFDU b3, 1 * SIZE(X1) + FMADD2 y2, a1, b2, y2 + LFDU b4, 1 * SIZE(X1) + +#ifdef PPCG4 + dcbt AO1, PREA +#endif + + FMADD3 y3, a2, b2, y3 + LFDU a1, 1 * SIZE(AO1) + FMADD4 y4, a2, b1, y4 + LFDU a2, 1 * SIZE(AO1) + + FMADD1 y1, a1, b3, y1 + LFDU b1, 1 * SIZE(X1) + FMADD2 y2, a1, b4, y2 + LFDU b2, 1 * SIZE(X1) + +#ifdef PPCG4 + dcbt X1, PREA +#endif + + FMADD3 y3, a2, b4, y3 + LFDU a1, 1 * SIZE(AO1) + FMADD4 y4, a2, b3, y4 + LFDU a2, 1 * SIZE(AO1) + + FMADD1 y1, a1, b1, y1 + LFDU b3, 1 * SIZE(X1) + FMADD2 y2, a1, b2, y2 + LFDU b4, 1 * SIZE(X1) + +#if defined(PPCG4) && defined(DOUBLE) + dcbt AO1, PREA +#endif + + FMADD3 y3, a2, b2, y3 + LFDU a1, 1 * SIZE(AO1) + FMADD4 y4, a2, b1, y4 + LFDU a2, 1 * SIZE(AO1) + + FMADD1 y1, a1, b3, y1 + LFDU b1, 1 * SIZE(X1) + FMADD2 y2, a1, b4, y2 + LFDU b2, 1 * SIZE(X1) + +#if defined(PPCG4) && defined(DOUBLE) + dcbt X1, PREA +#endif + + FMADD3 y3, a2, b4, y3 + LFDU a1, 1 * SIZE(AO1) + FMADD4 y4, a2, b3, y4 + LFDU a2, 1 * SIZE(AO1) + + bdnz LL(32) + .align 4 + +LL(33): + FMADD1 y1, a1, b1, y1 + LFDU b3, 1 * SIZE(X1) + FMADD2 y2, a1, b2, y2 + LFDU b4, 1 * SIZE(X1) + + FMADD3 y3, a2, b2, y3 + LFDU a1, 1 * SIZE(AO1) + FMADD4 y4, a2, b1, y4 + LFDU a2, 1 * SIZE(AO1) + + FMADD1 y1, a1, b3, y1 + LFDU b1, 1 * SIZE(X1) + FMADD2 y2, a1, b4, y2 + LFDU b2, 1 * SIZE(X1) + + FMADD3 y3, a2, b4, y3 + LFDU a1, 1 * SIZE(AO1) + FMADD4 y4, a2, b3, y4 + LFDU a2, 1 * SIZE(AO1) + + FMADD1 y1, a1, b1, y1 + LFDU b3, 1 * SIZE(X1) + FMADD2 y2, a1, b2, y2 + LFDU b4, 1 * SIZE(X1) + + FMADD3 y3, a2, b2, y3 + LFDU a1, 1 * SIZE(AO1) + FMADD4 y4, a2, b1, y4 + LFDU a2, 1 * SIZE(AO1) + + FMADD1 y1, a1, b3, y1 + FMADD2 y2, 
a1, b4, y2 + FMADD3 y3, a2, b4, y3 + FMADD4 y4, a2, b3, y4 + .align 4 + +LL(35): + andi. r0, M, 2 + ble LL(37) + + LFDU a1, 1 * SIZE(AO1) + LFDU b1, 1 * SIZE(X1) + LFDU a2, 1 * SIZE(AO1) + LFDU b2, 1 * SIZE(X1) + + FMADD1 y1, a1, b1, y1 + LFDU b3, 1 * SIZE(X1) + FMADD2 y2, a1, b2, y2 + LFDU a3, 1 * SIZE(AO1) + FMADD3 y3, a2, b2, y3 + LFDU b4, 1 * SIZE(X1) + FMADD4 y4, a2, b1, y4 + LFDU a4, 1 * SIZE(AO1) + + FMADD1 y1, a3, b3, y1 + FMADD2 y2, a3, b4, y2 + FMADD3 y3, a4, b4, y3 + FMADD4 y4, a4, b3, y4 + .align 4 + +LL(37): + andi. r0, M, 1 + ble LL(39) + + LFDU a1, 1 * SIZE(AO1) + LFDU b1, 1 * SIZE(X1) + LFDU a2, 1 * SIZE(AO1) + LFDU b2, 1 * SIZE(X1) + + FMADD1 y1, a1, b1, y1 + FMADD2 y2, a1, b2, y2 + FMADD3 y3, a2, b2, y3 + FMADD4 y4, a2, b1, y4 + .align 4 + +LL(39): + LFDUX b1, Y, INCY + LFDU b2, 1 * SIZE(Y) + + FADD y1, y1, y3 + FADD y2, y2, y4 + + FMADD b1, alpha_r, y1, b1 + FMADDR b2, alpha_r, y2, b2 + FMSUBR b1, alpha_i, y2, b1 + FMADD b2, alpha_i, y1, b2 + + STFDUX b1, YY, INCY + STFDU b2, 1 * SIZE(YY) + .align 4 + +LL(999): + li r3, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r14, 144(SP) + ld r15, 152(SP) + ld r16, 160(SP) + ld r17, 168(SP) + ld r18, 176(SP) + ld r19, 184(SP) + ld r20, 192(SP) + ld r21, 200(SP) + ld r22, 208(SP) +#else + lwz r14, 144(SP) + lwz r15, 148(SP) + lwz r16, 152(SP) + lwz r17, 156(SP) + lwz r18, 160(SP) + lwz r19, 164(SP) + lwz r20, 168(SP) + lwz r21, 172(SP) + lwz r22, 176(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE + +#endif diff --git a/kernel/power/zger.S b/kernel/power/zger.S new file mode 100644 index 0000000000..03d0bca7be --- /dev/null +++ b/kernel/power/zger.S @@ -0,0 +1,1357 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef NEEDPARAM +#ifndef DOUBLE +#include "cparam.h" +#else +#include "zparam.h" +#endif +#endif + +#ifdef linux +#ifndef __64BIT__ +#define M r3 +#define N r4 +#define X r6 +#define INCX r7 +#define Y r8 +#define INCY r9 +#define A r10 +#define LDA r5 +#else +#define M r3 +#define N r4 +#define X r8 +#define INCX r9 +#define Y r10 +#define INCY r5 +#define A r6 +#define LDA r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define M r3 +#define N r4 +#define X r10 +#define INCX r5 +#define Y r6 +#define INCY r7 +#define A r8 +#define LDA r9 +#else +#define M r3 +#define N r4 +#define X r8 +#define INCX r9 +#define Y r10 +#define INCY r5 +#define A r6 +#define LDA r7 +#endif +#endif + +#define I r11 +#define J r12 + +#define AO1 r14 +#define AO2 r15 +#define AO3 r16 +#define AO4 r17 +#define AO5 r18 +#define AO6 r19 +#define AO7 r20 +#define AO8 r21 + +#define X1 r22 +#define PREA r23 +#define PREC r24 +#define XX r25 +#define BUFFER r26 + +#define y01 f0 +#define y02 f1 +#define y03 f2 +#define y04 f3 +#define y05 f4 +#define y06 f5 +#define y07 f6 +#define y08 f7 + +#define alpha1_r f8 +#define alpha1_i f9 +#define alpha2_r f10 +#define alpha2_i f11 + +#define a1 f12 +#define a2 f13 +#define a3 f14 +#define a4 f15 +#define a5 f16 +#define a6 f17 +#define a7 f18 +#define a8 f19 +#define a9 f20 +#define a10 f21 +#define a11 f22 +#define a12 f23 +#define a13 f24 +#define a14 f25 +#define a15 f26 +#define a16 f27 + +#define alpha_r f30 +#define alpha_i f31 + +#ifndef CONJ +#define FMA1 FNMSUB +#define FMA2 FMADD +#else +#define FMA1 FMADD +#define FMA2 FNMSUB +#endif + +#if defined(PPC440) || defined(PPC440FP2) +#define PREFETCHSIZE_A 24 +#define PREFETCHSIZE_C 16 +#endif + +#ifdef PPC970 +#define PREFETCHSIZE_A 16 +#define PREFETCHSIZE_C 16 +#endif + +#ifdef POWER4 +#define PREFETCHSIZE_A 16 +#define PREFETCHSIZE_C 16 +#endif + +#ifdef POWER5 +#define PREFETCHSIZE_A 16 +#define PREFETCHSIZE_C 16 +#endif + +#ifndef NEEDPARAM + +#ifndef __64BIT__ +#define STACKSIZE 224 +#else +#define STACKSIZE 280 +#endif + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r14, 144(SP) + std r15, 152(SP) + std r16, 160(SP) + std 
r17, 168(SP) + std r18, 176(SP) + std r19, 184(SP) + std r20, 192(SP) + std r21, 200(SP) + std r22, 208(SP) + std r23, 216(SP) + std r24, 224(SP) + std r25, 232(SP) + std r26, 240(SP) + std r27, 248(SP) +#else + stw r14, 144(SP) + stw r15, 148(SP) + stw r16, 152(SP) + stw r17, 156(SP) + stw r18, 160(SP) + stw r19, 164(SP) + stw r20, 168(SP) + stw r21, 172(SP) + stw r22, 176(SP) + stw r23, 180(SP) + stw r24, 184(SP) + stw r25, 188(SP) + stw r26, 192(SP) + stw r27, 196(SP) +#endif + +#ifdef linux +#ifndef __64BIT__ + lwz LDA, 8 + STACKSIZE(SP) + lwz BUFFER, 12 + STACKSIZE(SP) +#else + ld INCY, 112 + STACKSIZE(SP) + ld A, 120 + STACKSIZE(SP) + ld LDA, 128 + STACKSIZE(SP) + ld BUFFER, 136 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifndef __64BIT__ +#ifdef DOUBLE + lwz INCX, 56 + STACKSIZE(SP) + lwz Y, 60 + STACKSIZE(SP) + lwz INCY, 64 + STACKSIZE(SP) + lwz A, 68 + STACKSIZE(SP) + lwz LDA, 72 + STACKSIZE(SP) + lwz BUFFER, 76 + STACKSIZE(SP) +#else + lwz INCY, 56 + STACKSIZE(SP) + lwz A, 60 + STACKSIZE(SP) + lwz LDA, 64 + STACKSIZE(SP) + lwz BUFFER, 68 + STACKSIZE(SP) +#endif +#else + ld INCY, 112 + STACKSIZE(SP) + ld A, 120 + STACKSIZE(SP) + ld LDA, 128 + STACKSIZE(SP) + ld BUFFER, 136 + STACKSIZE(SP) +#endif +#endif + + fmr alpha_r, f1 + fmr alpha_i, f2 + + slwi LDA, LDA, ZBASE_SHIFT + slwi INCX, INCX, ZBASE_SHIFT + slwi INCY, INCY, ZBASE_SHIFT + + li PREA, PREFETCHSIZE_A * SIZE + li PREC, PREFETCHSIZE_C * SIZE + + cmpwi cr0, M, 0 + ble- LL(999) + + cmpwi cr0, N, 0 + ble- LL(999) + + mr XX, X + + cmpi cr0, 0, INCX, 2 * SIZE + beq LL(10) + + mr XX, BUFFER + mr X1, BUFFER + + srawi. r0, M, 2 + mtspr CTR, r0 + ble LL(05) + .align 4 + +LL(01): + LFD a1, 0 * SIZE(X) + LFD a2, 1 * SIZE(X) + add X, X, INCX + LFD a3, 0 * SIZE(X) + LFD a4, 1 * SIZE(X) + add X, X, INCX + LFD a5, 0 * SIZE(X) + LFD a6, 1 * SIZE(X) + add X, X, INCX + LFD a7, 0 * SIZE(X) + LFD a8, 1 * SIZE(X) + add X, X, INCX + + STFD a1, 0 * SIZE(X1) + STFD a2, 1 * SIZE(X1) + STFD a3, 2 * SIZE(X1) + STFD a4, 3 * SIZE(X1) + STFD a5, 4 * SIZE(X1) + STFD a6, 5 * SIZE(X1) + STFD a7, 6 * SIZE(X1) + STFD a8, 7 * SIZE(X1) + + addi X1, X1, 8 * SIZE + bdnz+ LL(01) + .align 4 + +LL(05): + andi. r0, M, 7 + mtspr CTR, r0 + ble LL(10) + .align 4 + +LL(06): + LFD a1, 0 * SIZE(X) + LFD a2, 1 * SIZE(X) + STFD a1, 0 * SIZE(X1) + STFD a2, 1 * SIZE(X1) + + add X, X, INCX + addi X1, X1, 2 * SIZE + bdnz+ LL(06) + .align 4 + +LL(10): + srawi. J, N, 1 + ble LL(20) + .align 4 + +LL(11): + LFD alpha1_r, 0 * SIZE(Y) + LFD alpha1_i, 1 * SIZE(Y) + add Y, Y, INCY + LFD alpha2_r, 0 * SIZE(Y) + LFD alpha2_i, 1 * SIZE(Y) + add Y, Y, INCY + + FMUL a1, alpha_r, alpha1_r + FMUL a2, alpha_i, alpha1_r + FMUL a3, alpha_r, alpha2_r + FMUL a4, alpha_i, alpha2_r + + FMA1 alpha1_r, alpha_i, alpha1_i, a1 + FMA2 alpha1_i, alpha_r, alpha1_i, a2 + FMA1 alpha2_r, alpha_i, alpha2_i, a3 + FMA2 alpha2_i, alpha_r, alpha2_i, a4 + + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + mr X1, XX + + srawi. 
r0, M, 3 + mtspr CTR, r0 + ble LL(15) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + LFD y01, 0 * SIZE(X1) + LFD y02, 1 * SIZE(X1) + LFD y03, 2 * SIZE(X1) + LFD y04, 3 * SIZE(X1) + LFD y05, 4 * SIZE(X1) + LFD y06, 5 * SIZE(X1) + LFD y07, 6 * SIZE(X1) + LFD y08, 7 * SIZE(X1) + + LFD a9, 0 * SIZE(AO2) + LFD a10, 1 * SIZE(AO2) + LFD a11, 2 * SIZE(AO2) + LFD a12, 3 * SIZE(AO2) + LFD a13, 4 * SIZE(AO2) + LFD a14, 5 * SIZE(AO2) + LFD a15, 6 * SIZE(AO2) + LFD a16, 7 * SIZE(AO2) + + bdz LL(13) + .align 4 + +LL(12): + FMADD a1, alpha1_r, y01, a1 + FMADD a2, alpha1_r, y02, a2 + FMADD a3, alpha1_r, y03, a3 + FMADD a4, alpha1_r, y04, a4 + + FMADD a5, alpha1_r, y05, a5 + FMADD a6, alpha1_r, y06, a6 + FMADD a7, alpha1_r, y07, a7 + FMADD a8, alpha1_r, y08, a8 + + FMADD a9, alpha2_r, y01, a9 + FMADD a10, alpha2_r, y02, a10 + FMADD a11, alpha2_r, y03, a11 + FMADD a12, alpha2_r, y04, a12 + + FMADD a13, alpha2_r, y05, a13 + FMADD a14, alpha2_r, y06, a14 + FMADD a15, alpha2_r, y07, a15 + FMADD a16, alpha2_r, y08, a16 + + FNMSUB a1, alpha1_i, y02, a1 + FMADD a2, alpha1_i, y01, a2 + FNMSUB a3, alpha1_i, y04, a3 + FMADD a4, alpha1_i, y03, a4 + + STFD a1, 0 * SIZE(AO1) + STFD a2, 1 * SIZE(AO1) + STFD a3, 2 * SIZE(AO1) + STFD a4, 3 * SIZE(AO1) + + LFD a1, 8 * SIZE(AO1) + LFD a2, 9 * SIZE(AO1) + LFD a3, 10 * SIZE(AO1) + LFD a4, 11 * SIZE(AO1) + + FNMSUB a5, alpha1_i, y06, a5 + FMADD a6, alpha1_i, y05, a6 + FNMSUB a7, alpha1_i, y08, a7 + FMADD a8, alpha1_i, y07, a8 + + STFD a5, 4 * SIZE(AO1) + STFD a6, 5 * SIZE(AO1) + STFD a7, 6 * SIZE(AO1) + STFD a8, 7 * SIZE(AO1) + + LFD a5, 12 * SIZE(AO1) + LFD a6, 13 * SIZE(AO1) + LFD a7, 14 * SIZE(AO1) + LFD a8, 15 * SIZE(AO1) + + FNMSUB a9, alpha2_i, y02, a9 + FMADD a10, alpha2_i, y01, a10 + FNMSUB a11, alpha2_i, y04, a11 + FMADD a12, alpha2_i, y03, a12 + + LFD y01, 8 * SIZE(X1) + LFD y02, 9 * SIZE(X1) + LFD y03, 10 * SIZE(X1) + LFD y04, 11 * SIZE(X1) + + STFD a9, 0 * SIZE(AO2) + STFD a10, 1 * SIZE(AO2) + STFD a11, 2 * SIZE(AO2) + STFD a12, 3 * SIZE(AO2) + + LFD a9, 8 * SIZE(AO2) + LFD a10, 9 * SIZE(AO2) + LFD a11, 10 * SIZE(AO2) + LFD a12, 11 * SIZE(AO2) + + FNMSUB a13, alpha2_i, y06, a13 + FMADD a14, alpha2_i, y05, a14 + FNMSUB a15, alpha2_i, y08, a15 + FMADD a16, alpha2_i, y07, a16 + + LFD y05, 12 * SIZE(X1) + LFD y06, 13 * SIZE(X1) + LFD y07, 14 * SIZE(X1) + LFD y08, 15 * SIZE(X1) + + STFD a13, 4 * SIZE(AO2) + STFD a14, 5 * SIZE(AO2) + STFD a15, 6 * SIZE(AO2) + STFD a16, 7 * SIZE(AO2) + + LFD a13, 12 * SIZE(AO2) + LFD a14, 13 * SIZE(AO2) + LFD a15, 14 * SIZE(AO2) + LFD a16, 15 * SIZE(AO2) + + FMADD a1, alpha1_r, y01, a1 + FMADD a2, alpha1_r, y02, a2 + FMADD a3, alpha1_r, y03, a3 + FMADD a4, alpha1_r, y04, a4 + + FMADD a5, alpha1_r, y05, a5 + FMADD a6, alpha1_r, y06, a6 + FMADD a7, alpha1_r, y07, a7 + FMADD a8, alpha1_r, y08, a8 + + FMADD a9, alpha2_r, y01, a9 + FMADD a10, alpha2_r, y02, a10 + FMADD a11, alpha2_r, y03, a11 + FMADD a12, alpha2_r, y04, a12 + + FMADD a13, alpha2_r, y05, a13 + FMADD a14, alpha2_r, y06, a14 + FMADD a15, alpha2_r, y07, a15 + FMADD a16, alpha2_r, y08, a16 + + FNMSUB a1, alpha1_i, y02, a1 + FMADD a2, alpha1_i, y01, a2 + FNMSUB a3, alpha1_i, y04, a3 + FMADD a4, alpha1_i, y03, a4 + + STFD a1, 8 * SIZE(AO1) + STFD a2, 9 * SIZE(AO1) + STFD a3, 10 * SIZE(AO1) + STFD a4, 11 * SIZE(AO1) + + LFD a1, 16 * SIZE(AO1) + LFD a2, 17 * SIZE(AO1) + LFD a3, 18 * SIZE(AO1) + LFD a4, 19 * SIZE(AO1) + + FNMSUB a5, 
alpha1_i, y06, a5 + FMADD a6, alpha1_i, y05, a6 + FNMSUB a7, alpha1_i, y08, a7 + FMADD a8, alpha1_i, y07, a8 + + STFD a5, 12 * SIZE(AO1) + STFD a6, 13 * SIZE(AO1) + STFD a7, 14 * SIZE(AO1) + STFD a8, 15 * SIZE(AO1) + + LFD a5, 20 * SIZE(AO1) + LFD a6, 21 * SIZE(AO1) + LFD a7, 22 * SIZE(AO1) + LFD a8, 23 * SIZE(AO1) + + FNMSUB a9, alpha2_i, y02, a9 + FMADD a10, alpha2_i, y01, a10 + FNMSUB a11, alpha2_i, y04, a11 + FMADD a12, alpha2_i, y03, a12 + + LFD y01, 16 * SIZE(X1) + LFD y02, 17 * SIZE(X1) + LFD y03, 18 * SIZE(X1) + LFD y04, 19 * SIZE(X1) + + STFD a9, 8 * SIZE(AO2) + STFD a10, 9 * SIZE(AO2) + STFD a11, 10 * SIZE(AO2) + STFD a12, 11 * SIZE(AO2) + + LFD a9, 16 * SIZE(AO2) + LFD a10, 17 * SIZE(AO2) + LFD a11, 18 * SIZE(AO2) + LFD a12, 19 * SIZE(AO2) + + FNMSUB a13, alpha2_i, y06, a13 + FMADD a14, alpha2_i, y05, a14 + FNMSUB a15, alpha2_i, y08, a15 + FMADD a16, alpha2_i, y07, a16 + + LFD y05, 20 * SIZE(X1) + LFD y06, 21 * SIZE(X1) + LFD y07, 22 * SIZE(X1) + LFD y08, 23 * SIZE(X1) + + STFD a13, 12 * SIZE(AO2) + STFD a14, 13 * SIZE(AO2) + STFD a15, 14 * SIZE(AO2) + STFD a16, 15 * SIZE(AO2) + + LFD a13, 20 * SIZE(AO2) + LFD a14, 21 * SIZE(AO2) + LFD a15, 22 * SIZE(AO2) + LFD a16, 23 * SIZE(AO2) + + addi AO1, AO1, 16 * SIZE + addi AO2, AO2, 16 * SIZE + addi X1, X1, 16 * SIZE + + DCBT(AO1, PREA) + DCBT(AO2, PREA) + DCBT(Y1, PREY) + + bdnz+ LL(12) + .align 4 + +LL(13): + FMADD a1, alpha1_r, y01, a1 + FMADD a2, alpha1_r, y02, a2 + FMADD a3, alpha1_r, y03, a3 + FMADD a4, alpha1_r, y04, a4 + + FMADD a5, alpha1_r, y05, a5 + FMADD a6, alpha1_r, y06, a6 + FMADD a7, alpha1_r, y07, a7 + FMADD a8, alpha1_r, y08, a8 + + FMADD a9, alpha2_r, y01, a9 + FMADD a10, alpha2_r, y02, a10 + FMADD a11, alpha2_r, y03, a11 + FMADD a12, alpha2_r, y04, a12 + + FMADD a13, alpha2_r, y05, a13 + FMADD a14, alpha2_r, y06, a14 + FMADD a15, alpha2_r, y07, a15 + FMADD a16, alpha2_r, y08, a16 + + FNMSUB a1, alpha1_i, y02, a1 + FMADD a2, alpha1_i, y01, a2 + FNMSUB a3, alpha1_i, y04, a3 + FMADD a4, alpha1_i, y03, a4 + + STFD a1, 0 * SIZE(AO1) + STFD a2, 1 * SIZE(AO1) + STFD a3, 2 * SIZE(AO1) + STFD a4, 3 * SIZE(AO1) + + LFD a1, 8 * SIZE(AO1) + LFD a2, 9 * SIZE(AO1) + LFD a3, 10 * SIZE(AO1) + LFD a4, 11 * SIZE(AO1) + + FNMSUB a5, alpha1_i, y06, a5 + FMADD a6, alpha1_i, y05, a6 + FNMSUB a7, alpha1_i, y08, a7 + FMADD a8, alpha1_i, y07, a8 + + STFD a5, 4 * SIZE(AO1) + STFD a6, 5 * SIZE(AO1) + STFD a7, 6 * SIZE(AO1) + STFD a8, 7 * SIZE(AO1) + + LFD a5, 12 * SIZE(AO1) + LFD a6, 13 * SIZE(AO1) + LFD a7, 14 * SIZE(AO1) + LFD a8, 15 * SIZE(AO1) + + FNMSUB a9, alpha2_i, y02, a9 + FMADD a10, alpha2_i, y01, a10 + FNMSUB a11, alpha2_i, y04, a11 + FMADD a12, alpha2_i, y03, a12 + + LFD y01, 8 * SIZE(X1) + LFD y02, 9 * SIZE(X1) + LFD y03, 10 * SIZE(X1) + LFD y04, 11 * SIZE(X1) + + STFD a9, 0 * SIZE(AO2) + STFD a10, 1 * SIZE(AO2) + STFD a11, 2 * SIZE(AO2) + STFD a12, 3 * SIZE(AO2) + + LFD a9, 8 * SIZE(AO2) + LFD a10, 9 * SIZE(AO2) + LFD a11, 10 * SIZE(AO2) + LFD a12, 11 * SIZE(AO2) + + FNMSUB a13, alpha2_i, y06, a13 + FMADD a14, alpha2_i, y05, a14 + FNMSUB a15, alpha2_i, y08, a15 + FMADD a16, alpha2_i, y07, a16 + + LFD y05, 12 * SIZE(X1) + LFD y06, 13 * SIZE(X1) + LFD y07, 14 * SIZE(X1) + LFD y08, 15 * SIZE(X1) + + STFD a13, 4 * SIZE(AO2) + STFD a14, 5 * SIZE(AO2) + STFD a15, 6 * SIZE(AO2) + STFD a16, 7 * SIZE(AO2) + + LFD a13, 12 * SIZE(AO2) + LFD a14, 13 * SIZE(AO2) + LFD a15, 14 * SIZE(AO2) + LFD a16, 15 * SIZE(AO2) + + FMADD a1, alpha1_r, y01, a1 + FMADD a2, alpha1_r, y02, a2 + FMADD a3, alpha1_r, y03, a3 + FMADD a4, alpha1_r, y04, a4 + + 
FMADD a5, alpha1_r, y05, a5 + FMADD a6, alpha1_r, y06, a6 + FMADD a7, alpha1_r, y07, a7 + FMADD a8, alpha1_r, y08, a8 + + FMADD a9, alpha2_r, y01, a9 + FMADD a10, alpha2_r, y02, a10 + FMADD a11, alpha2_r, y03, a11 + FMADD a12, alpha2_r, y04, a12 + + FMADD a13, alpha2_r, y05, a13 + FMADD a14, alpha2_r, y06, a14 + FMADD a15, alpha2_r, y07, a15 + FMADD a16, alpha2_r, y08, a16 + + FNMSUB a1, alpha1_i, y02, a1 + FMADD a2, alpha1_i, y01, a2 + FNMSUB a3, alpha1_i, y04, a3 + FMADD a4, alpha1_i, y03, a4 + + STFD a1, 8 * SIZE(AO1) + STFD a2, 9 * SIZE(AO1) + STFD a3, 10 * SIZE(AO1) + STFD a4, 11 * SIZE(AO1) + + FNMSUB a5, alpha1_i, y06, a5 + FMADD a6, alpha1_i, y05, a6 + FNMSUB a7, alpha1_i, y08, a7 + FMADD a8, alpha1_i, y07, a8 + + STFD a5, 12 * SIZE(AO1) + STFD a6, 13 * SIZE(AO1) + STFD a7, 14 * SIZE(AO1) + STFD a8, 15 * SIZE(AO1) + + FNMSUB a9, alpha2_i, y02, a9 + FMADD a10, alpha2_i, y01, a10 + FNMSUB a11, alpha2_i, y04, a11 + FMADD a12, alpha2_i, y03, a12 + + STFD a9, 8 * SIZE(AO2) + STFD a10, 9 * SIZE(AO2) + STFD a11, 10 * SIZE(AO2) + STFD a12, 11 * SIZE(AO2) + + FNMSUB a13, alpha2_i, y06, a13 + FMADD a14, alpha2_i, y05, a14 + FNMSUB a15, alpha2_i, y08, a15 + FMADD a16, alpha2_i, y07, a16 + + STFD a13, 12 * SIZE(AO2) + STFD a14, 13 * SIZE(AO2) + STFD a15, 14 * SIZE(AO2) + STFD a16, 15 * SIZE(AO2) + + addi AO1, AO1, 16 * SIZE + addi AO2, AO2, 16 * SIZE + addi X1, X1, 16 * SIZE + .align 4 + +LL(15): + andi. r0, M, 7 + ble LL(19) + + andi. r0, M, 4 + ble LL(17) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + LFD y01, 0 * SIZE(X1) + LFD y02, 1 * SIZE(X1) + LFD y03, 2 * SIZE(X1) + LFD y04, 3 * SIZE(X1) + LFD y05, 4 * SIZE(X1) + LFD y06, 5 * SIZE(X1) + LFD y07, 6 * SIZE(X1) + LFD y08, 7 * SIZE(X1) + + LFD a9, 0 * SIZE(AO2) + LFD a10, 1 * SIZE(AO2) + LFD a11, 2 * SIZE(AO2) + LFD a12, 3 * SIZE(AO2) + LFD a13, 4 * SIZE(AO2) + LFD a14, 5 * SIZE(AO2) + LFD a15, 6 * SIZE(AO2) + LFD a16, 7 * SIZE(AO2) + + FMADD a1, alpha1_r, y01, a1 + FMADD a2, alpha1_r, y02, a2 + FMADD a3, alpha1_r, y03, a3 + FMADD a4, alpha1_r, y04, a4 + + FMADD a5, alpha1_r, y05, a5 + FMADD a6, alpha1_r, y06, a6 + FMADD a7, alpha1_r, y07, a7 + FMADD a8, alpha1_r, y08, a8 + + FMADD a9, alpha2_r, y01, a9 + FMADD a10, alpha2_r, y02, a10 + FMADD a11, alpha2_r, y03, a11 + FMADD a12, alpha2_r, y04, a12 + + FMADD a13, alpha2_r, y05, a13 + FMADD a14, alpha2_r, y06, a14 + FMADD a15, alpha2_r, y07, a15 + FMADD a16, alpha2_r, y08, a16 + + FNMSUB a1, alpha1_i, y02, a1 + FMADD a2, alpha1_i, y01, a2 + FNMSUB a3, alpha1_i, y04, a3 + FMADD a4, alpha1_i, y03, a4 + + FNMSUB a5, alpha1_i, y06, a5 + FMADD a6, alpha1_i, y05, a6 + FNMSUB a7, alpha1_i, y08, a7 + FMADD a8, alpha1_i, y07, a8 + + FNMSUB a9, alpha2_i, y02, a9 + FMADD a10, alpha2_i, y01, a10 + FNMSUB a11, alpha2_i, y04, a11 + FMADD a12, alpha2_i, y03, a12 + + FNMSUB a13, alpha2_i, y06, a13 + FMADD a14, alpha2_i, y05, a14 + FNMSUB a15, alpha2_i, y08, a15 + FMADD a16, alpha2_i, y07, a16 + + STFD a1, 0 * SIZE(AO1) + STFD a2, 1 * SIZE(AO1) + STFD a3, 2 * SIZE(AO1) + STFD a4, 3 * SIZE(AO1) + STFD a5, 4 * SIZE(AO1) + STFD a6, 5 * SIZE(AO1) + STFD a7, 6 * SIZE(AO1) + STFD a8, 7 * SIZE(AO1) + + STFD a9, 0 * SIZE(AO2) + STFD a10, 1 * SIZE(AO2) + STFD a11, 2 * SIZE(AO2) + STFD a12, 3 * SIZE(AO2) + STFD a13, 4 * SIZE(AO2) + STFD a14, 5 * SIZE(AO2) + STFD a15, 6 * SIZE(AO2) + STFD a16, 7 * SIZE(AO2) + + addi AO1, AO1, 8 * SIZE + addi AO2, AO2, 8 * SIZE + addi X1, 
X1, 8 * SIZE + .align 4 + +LL(17): + andi. r0, M, 2 + ble LL(18) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + + LFD y01, 0 * SIZE(X1) + LFD y02, 1 * SIZE(X1) + LFD y03, 2 * SIZE(X1) + LFD y04, 3 * SIZE(X1) + + LFD a5, 0 * SIZE(AO2) + LFD a6, 1 * SIZE(AO2) + LFD a7, 2 * SIZE(AO2) + LFD a8, 3 * SIZE(AO2) + + FMADD a1, alpha1_r, y01, a1 + FMADD a2, alpha1_r, y02, a2 + FMADD a3, alpha1_r, y03, a3 + FMADD a4, alpha1_r, y04, a4 + + FMADD a5, alpha2_r, y01, a5 + FMADD a6, alpha2_r, y02, a6 + FMADD a7, alpha2_r, y03, a7 + FMADD a8, alpha2_r, y04, a8 + + FNMSUB a1, alpha1_i, y02, a1 + FMADD a2, alpha1_i, y01, a2 + FNMSUB a3, alpha1_i, y04, a3 + FMADD a4, alpha1_i, y03, a4 + + FNMSUB a5, alpha2_i, y02, a5 + FMADD a6, alpha2_i, y01, a6 + FNMSUB a7, alpha2_i, y04, a7 + FMADD a8, alpha2_i, y03, a8 + + STFD a1, 0 * SIZE(AO1) + STFD a2, 1 * SIZE(AO1) + STFD a3, 2 * SIZE(AO1) + STFD a4, 3 * SIZE(AO1) + STFD a5, 0 * SIZE(AO2) + STFD a6, 1 * SIZE(AO2) + STFD a7, 2 * SIZE(AO2) + STFD a8, 3 * SIZE(AO2) + + addi AO1, AO1, 4 * SIZE + addi AO2, AO2, 4 * SIZE + addi X1, X1, 4 * SIZE + .align 4 + +LL(18): + andi. r0, M, 1 + ble LL(19) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 0 * SIZE(AO2) + LFD a4, 1 * SIZE(AO2) + + LFD y01, 0 * SIZE(X1) + LFD y02, 1 * SIZE(X1) + + FMADD a1, alpha1_r, y01, a1 + FMADD a2, alpha1_r, y02, a2 + FMADD a3, alpha2_r, y01, a3 + FMADD a4, alpha2_r, y02, a4 + + FNMSUB a1, alpha1_i, y02, a1 + FMADD a2, alpha1_i, y01, a2 + FNMSUB a3, alpha2_i, y02, a3 + FMADD a4, alpha2_i, y01, a4 + + STFD a1, 0 * SIZE(AO1) + STFD a2, 1 * SIZE(AO1) + STFD a3, 0 * SIZE(AO2) + STFD a4, 1 * SIZE(AO2) + .align 4 + +LL(19): + addi J, J, -1 + cmpi cr0, 0, J, 0 + bgt LL(11) + .align 4 + +LL(20): + andi. J, N, 1 + ble LL(999) + + LFD alpha1_r, 0 * SIZE(Y) + LFD alpha1_i, 1 * SIZE(Y) + + FMUL a1, alpha_r, alpha1_r + FMUL a2, alpha_i, alpha1_r + + FMA1 alpha1_r, alpha_i, alpha1_i, a1 + FMA2 alpha1_i, alpha_r, alpha1_i, a2 + + mr AO1, A + + mr X1, XX + + srawi. 
r0, M, 3 + mtspr CTR, r0 + ble LL(25) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + LFD y01, 0 * SIZE(X1) + LFD y02, 1 * SIZE(X1) + LFD y03, 2 * SIZE(X1) + LFD y04, 3 * SIZE(X1) + LFD y05, 4 * SIZE(X1) + LFD y06, 5 * SIZE(X1) + LFD y07, 6 * SIZE(X1) + LFD y08, 7 * SIZE(X1) + + bdz LL(23) + .align 4 + +LL(22): + FMADD a1, alpha1_r, y01, a1 + FMADD a2, alpha1_r, y02, a2 + FMADD a3, alpha1_r, y03, a3 + FMADD a4, alpha1_r, y04, a4 + + FMADD a5, alpha1_r, y05, a5 + FMADD a6, alpha1_r, y06, a6 + FMADD a7, alpha1_r, y07, a7 + FMADD a8, alpha1_r, y08, a8 + + FNMSUB a1, alpha1_i, y02, a1 + FMADD a2, alpha1_i, y01, a2 + FNMSUB a3, alpha1_i, y04, a3 + FMADD a4, alpha1_i, y03, a4 + + STFD a1, 0 * SIZE(AO1) + STFD a2, 1 * SIZE(AO1) + STFD a3, 2 * SIZE(AO1) + STFD a4, 3 * SIZE(AO1) + + LFD a1, 8 * SIZE(AO1) + LFD a2, 9 * SIZE(AO1) + LFD a3, 10 * SIZE(AO1) + LFD a4, 11 * SIZE(AO1) + + FNMSUB a5, alpha1_i, y06, a5 + FMADD a6, alpha1_i, y05, a6 + FNMSUB a7, alpha1_i, y08, a7 + FMADD a8, alpha1_i, y07, a8 + + STFD a5, 4 * SIZE(AO1) + STFD a6, 5 * SIZE(AO1) + STFD a7, 6 * SIZE(AO1) + STFD a8, 7 * SIZE(AO1) + + LFD a5, 12 * SIZE(AO1) + LFD a6, 13 * SIZE(AO1) + LFD a7, 14 * SIZE(AO1) + LFD a8, 15 * SIZE(AO1) + + LFD y01, 8 * SIZE(X1) + LFD y02, 9 * SIZE(X1) + LFD y03, 10 * SIZE(X1) + LFD y04, 11 * SIZE(X1) + + LFD y05, 12 * SIZE(X1) + LFD y06, 13 * SIZE(X1) + LFD y07, 14 * SIZE(X1) + LFD y08, 15 * SIZE(X1) + + FMADD a1, alpha1_r, y01, a1 + FMADD a2, alpha1_r, y02, a2 + FMADD a3, alpha1_r, y03, a3 + FMADD a4, alpha1_r, y04, a4 + + FMADD a5, alpha1_r, y05, a5 + FMADD a6, alpha1_r, y06, a6 + FMADD a7, alpha1_r, y07, a7 + FMADD a8, alpha1_r, y08, a8 + + FMADD a9, alpha2_r, y01, a9 + FMADD a10, alpha2_r, y02, a10 + FMADD a11, alpha2_r, y03, a11 + FMADD a12, alpha2_r, y04, a12 + + FMADD a13, alpha2_r, y05, a13 + FMADD a14, alpha2_r, y06, a14 + FMADD a15, alpha2_r, y07, a15 + FMADD a16, alpha2_r, y08, a16 + + FNMSUB a1, alpha1_i, y02, a1 + FMADD a2, alpha1_i, y01, a2 + FNMSUB a3, alpha1_i, y04, a3 + FMADD a4, alpha1_i, y03, a4 + + STFD a1, 8 * SIZE(AO1) + STFD a2, 9 * SIZE(AO1) + STFD a3, 10 * SIZE(AO1) + STFD a4, 11 * SIZE(AO1) + + LFD a1, 16 * SIZE(AO1) + LFD a2, 17 * SIZE(AO1) + LFD a3, 18 * SIZE(AO1) + LFD a4, 19 * SIZE(AO1) + + FNMSUB a5, alpha1_i, y06, a5 + FMADD a6, alpha1_i, y05, a6 + FNMSUB a7, alpha1_i, y08, a7 + FMADD a8, alpha1_i, y07, a8 + + STFD a5, 12 * SIZE(AO1) + STFD a6, 13 * SIZE(AO1) + STFD a7, 14 * SIZE(AO1) + STFD a8, 15 * SIZE(AO1) + + LFD a5, 20 * SIZE(AO1) + LFD a6, 21 * SIZE(AO1) + LFD a7, 22 * SIZE(AO1) + LFD a8, 23 * SIZE(AO1) + + LFD y01, 16 * SIZE(X1) + LFD y02, 17 * SIZE(X1) + LFD y03, 18 * SIZE(X1) + LFD y04, 19 * SIZE(X1) + + LFD y05, 20 * SIZE(X1) + LFD y06, 21 * SIZE(X1) + LFD y07, 22 * SIZE(X1) + LFD y08, 23 * SIZE(X1) + + addi AO1, AO1, 16 * SIZE + addi X1, X1, 16 * SIZE + + DCBT(AO1, PREA) + DCBT(Y1, PREY) + + bdnz+ LL(22) + .align 4 + +LL(23): + FMADD a1, alpha1_r, y01, a1 + FMADD a2, alpha1_r, y02, a2 + FMADD a3, alpha1_r, y03, a3 + FMADD a4, alpha1_r, y04, a4 + + FMADD a5, alpha1_r, y05, a5 + FMADD a6, alpha1_r, y06, a6 + FMADD a7, alpha1_r, y07, a7 + FMADD a8, alpha1_r, y08, a8 + + FNMSUB a1, alpha1_i, y02, a1 + FMADD a2, alpha1_i, y01, a2 + FNMSUB a3, alpha1_i, y04, a3 + FMADD a4, alpha1_i, y03, a4 + + STFD a1, 0 * SIZE(AO1) + STFD a2, 1 * SIZE(AO1) + STFD a3, 2 * SIZE(AO1) + STFD a4, 3 * SIZE(AO1) + + LFD a1, 8 * SIZE(AO1) + 
LFD a2, 9 * SIZE(AO1) + LFD a3, 10 * SIZE(AO1) + LFD a4, 11 * SIZE(AO1) + + FNMSUB a5, alpha1_i, y06, a5 + FMADD a6, alpha1_i, y05, a6 + FNMSUB a7, alpha1_i, y08, a7 + FMADD a8, alpha1_i, y07, a8 + + STFD a5, 4 * SIZE(AO1) + STFD a6, 5 * SIZE(AO1) + STFD a7, 6 * SIZE(AO1) + STFD a8, 7 * SIZE(AO1) + + LFD a5, 12 * SIZE(AO1) + LFD a6, 13 * SIZE(AO1) + LFD a7, 14 * SIZE(AO1) + LFD a8, 15 * SIZE(AO1) + + LFD y01, 8 * SIZE(X1) + LFD y02, 9 * SIZE(X1) + LFD y03, 10 * SIZE(X1) + LFD y04, 11 * SIZE(X1) + + LFD y05, 12 * SIZE(X1) + LFD y06, 13 * SIZE(X1) + LFD y07, 14 * SIZE(X1) + LFD y08, 15 * SIZE(X1) + + FMADD a1, alpha1_r, y01, a1 + FMADD a2, alpha1_r, y02, a2 + FMADD a3, alpha1_r, y03, a3 + FMADD a4, alpha1_r, y04, a4 + + FMADD a5, alpha1_r, y05, a5 + FMADD a6, alpha1_r, y06, a6 + FMADD a7, alpha1_r, y07, a7 + FMADD a8, alpha1_r, y08, a8 + + FNMSUB a1, alpha1_i, y02, a1 + FMADD a2, alpha1_i, y01, a2 + FNMSUB a3, alpha1_i, y04, a3 + FMADD a4, alpha1_i, y03, a4 + + STFD a1, 8 * SIZE(AO1) + STFD a2, 9 * SIZE(AO1) + STFD a3, 10 * SIZE(AO1) + STFD a4, 11 * SIZE(AO1) + + FNMSUB a5, alpha1_i, y06, a5 + FMADD a6, alpha1_i, y05, a6 + FNMSUB a7, alpha1_i, y08, a7 + FMADD a8, alpha1_i, y07, a8 + + STFD a5, 12 * SIZE(AO1) + STFD a6, 13 * SIZE(AO1) + STFD a7, 14 * SIZE(AO1) + STFD a8, 15 * SIZE(AO1) + + addi AO1, AO1, 16 * SIZE + addi X1, X1, 16 * SIZE + .align 4 + +LL(25): + andi. r0, M, 7 + ble LL(999) + + andi. r0, M, 4 + ble LL(27) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + LFD a5, 4 * SIZE(AO1) + LFD a6, 5 * SIZE(AO1) + LFD a7, 6 * SIZE(AO1) + LFD a8, 7 * SIZE(AO1) + + LFD y01, 0 * SIZE(X1) + LFD y02, 1 * SIZE(X1) + LFD y03, 2 * SIZE(X1) + LFD y04, 3 * SIZE(X1) + LFD y05, 4 * SIZE(X1) + LFD y06, 5 * SIZE(X1) + LFD y07, 6 * SIZE(X1) + LFD y08, 7 * SIZE(X1) + + FMADD a1, alpha1_r, y01, a1 + FMADD a2, alpha1_r, y02, a2 + FMADD a3, alpha1_r, y03, a3 + FMADD a4, alpha1_r, y04, a4 + + FMADD a5, alpha1_r, y05, a5 + FMADD a6, alpha1_r, y06, a6 + FMADD a7, alpha1_r, y07, a7 + FMADD a8, alpha1_r, y08, a8 + + FNMSUB a1, alpha1_i, y02, a1 + FMADD a2, alpha1_i, y01, a2 + FNMSUB a3, alpha1_i, y04, a3 + FMADD a4, alpha1_i, y03, a4 + + FNMSUB a5, alpha1_i, y06, a5 + FMADD a6, alpha1_i, y05, a6 + FNMSUB a7, alpha1_i, y08, a7 + FMADD a8, alpha1_i, y07, a8 + + STFD a1, 0 * SIZE(AO1) + STFD a2, 1 * SIZE(AO1) + STFD a3, 2 * SIZE(AO1) + STFD a4, 3 * SIZE(AO1) + STFD a5, 4 * SIZE(AO1) + STFD a6, 5 * SIZE(AO1) + STFD a7, 6 * SIZE(AO1) + STFD a8, 7 * SIZE(AO1) + + addi AO1, AO1, 8 * SIZE + addi X1, X1, 8 * SIZE + .align 4 + +LL(27): + andi. r0, M, 2 + ble LL(28) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + + LFD y01, 0 * SIZE(X1) + LFD y02, 1 * SIZE(X1) + LFD y03, 2 * SIZE(X1) + LFD y04, 3 * SIZE(X1) + + FMADD a1, alpha1_r, y01, a1 + FMADD a2, alpha1_r, y02, a2 + FMADD a3, alpha1_r, y03, a3 + FMADD a4, alpha1_r, y04, a4 + + FNMSUB a1, alpha1_i, y02, a1 + FMADD a2, alpha1_i, y01, a2 + FNMSUB a3, alpha1_i, y04, a3 + FMADD a4, alpha1_i, y03, a4 + + STFD a1, 0 * SIZE(AO1) + STFD a2, 1 * SIZE(AO1) + STFD a3, 2 * SIZE(AO1) + STFD a4, 3 * SIZE(AO1) + + addi AO1, AO1, 4 * SIZE + addi X1, X1, 4 * SIZE + .align 4 + +LL(28): + andi. 
r0, M, 1 + ble LL(999) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + + LFD y01, 0 * SIZE(X1) + LFD y02, 1 * SIZE(X1) + + FMADD a1, alpha1_r, y01, a1 + FMADD a2, alpha1_r, y02, a2 + + FNMSUB a1, alpha1_i, y02, a1 + FMADD a2, alpha1_i, y01, a2 + + STFD a1, 0 * SIZE(AO1) + STFD a2, 1 * SIZE(AO1) + .align 4 + +LL(999): + li r3, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r14, 144(SP) + ld r15, 152(SP) + ld r16, 160(SP) + ld r17, 168(SP) + ld r18, 176(SP) + ld r19, 184(SP) + ld r20, 192(SP) + ld r21, 200(SP) + ld r22, 208(SP) + ld r23, 216(SP) + ld r24, 224(SP) + ld r25, 232(SP) + ld r26, 240(SP) + ld r27, 248(SP) +#else + lwz r14, 144(SP) + lwz r15, 148(SP) + lwz r16, 152(SP) + lwz r17, 156(SP) + lwz r18, 160(SP) + lwz r19, 164(SP) + lwz r20, 168(SP) + lwz r21, 172(SP) + lwz r22, 176(SP) + lwz r23, 180(SP) + lwz r24, 184(SP) + lwz r25, 188(SP) + lwz r26, 192(SP) + lwz r27, 196(SP) +#endif + + addi SP, SP, STACKSIZE + blr + + EPILOGUE +#endif diff --git a/kernel/power/znrm2.S b/kernel/power/znrm2.S new file mode 100644 index 0000000000..ded25fdd1f --- /dev/null +++ b/kernel/power/znrm2.S @@ -0,0 +1,924 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define NN r6 +#define XX r7 +#define PREA r8 +#define INCXM1 r9 + +#define FZERO 144(SP) +#define FONE 148(SP) +#define FMAX 152(SP) + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r10, 0 + lis r11, 0x3f80 + lis r12, 0x5fe0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r10, FZERO + stw r11, FONE + stw r12, FMAX + stw r10, 4 + FMAX + + lfs f1, FZERO + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, ZBASE_SHIFT + subi INCXM1, INCX, SIZE + + li PREA, L1_PREFETCHSIZE + + cmpwi cr0, N, 0 + ble- LL(9999) + cmpwi cr0, INCX, 0 + ble- LL(9999) + + mr NN, N + mr XX, X + + LFD f0, 0 * SIZE(X) + LFD f1, 1 * SIZE(X) + + add X, X, INCX + + fabs f2, f0 + fabs f3, f1 + fabs f4, f0 + fabs f5, f1 + fabs f6, f0 + fabs f7, f1 + fabs f0, f0 + fabs f1, f1 + + subi N, N, 1 + + cmpwi cr0, INCX, 2 * SIZE + bne- cr0, LL(1000) + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- cr0, LL(50) + + LFD f24, 0 * SIZE(X) + LFD f25, 1 * SIZE(X) + LFD f26, 2 * SIZE(X) + LFD f27, 3 * SIZE(X) + LFD f28, 4 * SIZE(X) + LFD f29, 5 * SIZE(X) + LFD f30, 6 * SIZE(X) + LFD f31, 7 * SIZE(X) + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFD f24, 8 * SIZE(X) + LFD f25, 9 * SIZE(X) + LFD f26, 10 * SIZE(X) + LFD f27, 11 * SIZE(X) + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFD f28, 12 * SIZE(X) + LFD f29, 13 * SIZE(X) + LFD f30, 14 * SIZE(X) + LFD f31, 15 * SIZE(X) + bdz LL(20) + .align 4 + +LL(10): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + LFD f24, 16 * SIZE(X) + LFD f25, 17 * SIZE(X) + LFD f26, 18 * SIZE(X) + LFD f27, 19 * SIZE(X) + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + LFD f28, 20 * SIZE(X) + LFD f29, 21 * SIZE(X) + LFD f30, 22 * SIZE(X) + LFD f31, 23 * SIZE(X) + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + LFD f24, 24 * SIZE(X) + LFD f25, 25 * SIZE(X) + LFD f26, 26 * SIZE(X) + LFD f27, 27 * SIZE(X) + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + LFD f28, 28 * SIZE(X) + LFD f29, 29 * SIZE(X) + LFD f30, 30 * SIZE(X) + LFD f31, 31 * SIZE(X) + +#ifndef POWER6 + L1_PREFETCH X, PREA +#endif + addi X, X, 16 * SIZE +#ifdef POWER6 + L1_PREFETCH X, PREA +#endif + + bdnz LL(10) + .align 4 + +LL(20): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + 
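+/* The fsub/fsel pairs in this wind-down form a branch-free maximum:  */
+/* fsel keeps the running value when (old - |x|) >= 0 and takes the   */
+/* new |x| otherwise, so f0..f7 end up holding the largest absolute   */
+/* value seen over the real and imaginary parts.                      */
+/* Overall this kernel computes the complex 2-norm in two passes;     */
+/* roughly, as a C sketch (not part of the original source):          */
+/*   amax = max over k of max(|Re x[k]|, |Im x[k]|);                  */
+/*   if (amax == 0.0) return 0.0;                                     */
+/*   ssq = 0.0;                                                       */
+/*   for (k = 0; k < n; k++) {                                        */
+/*     tr = Re(x[k]) / amax;  ti = Im(x[k]) / amax;                   */
+/*     ssq += tr * tr + ti * ti;                                      */
+/*   }                                                                */
+/*   return amax * sqrt(ssq);                                         */
+/* Scaling by 1/amax before squaring avoids overflow and underflow.   */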
fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + fsel f2, f18, f2, f10 + fsel f3, f19, f3, f11 + fsel f4, f20, f4, f12 + fsel f5, f21, f5, f13 + fsel f6, f22, f6, f14 + fsel f7, f23, f7, f15 + addi X, X, 16 * SIZE + .align 4 + +LL(50): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(100) + .align 4 + +LL(60): + LFD f8, 0 * SIZE(X) + LFD f9, 1 * SIZE(X) + addi X, X, 2 * SIZE + fabs f8, f8 + fabs f9, f9 + fsub f16, f0, f8 + fsub f17, f1, f9 + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + bdnz LL(60) + .align 4 + +LL(100): + fsub f8, f0, f1 + fsub f9, f2, f3 + fsub f10, f4, f5 + fsub f11, f6, f7 + + fsel f0, f8, f0, f1 + fsel f2, f9, f2, f3 + fsel f4, f10, f4, f5 + fsel f6, f11, f6, f7 + + fsub f8, f0, f2 + fsub f9, f4, f6 + fsel f0, f8, f0, f2 + fsel f4, f9, f4, f6 + + fsub f8, f0, f4 + fsel f31, f8, f0, f4 + + lfs f1, FZERO + lfs f0, FONE + + fcmpu cr0, f1, f31 + beq- cr0, LL(9999) + + fdiv f30, f0, f31 + + fmr f0, f1 + fmr f2, f1 + fmr f3, f1 + fmr f4, f1 + fmr f5, f1 + fmr f6, f1 + fmr f7, f1 + + srawi. r0, NN, 3 + mtspr CTR, r0 + beq- cr0, LL(150) + + LFD f8, 0 * SIZE(XX) + LFD f9, 1 * SIZE(XX) + LFD f10, 2 * SIZE(XX) + LFD f11, 3 * SIZE(XX) + LFD f12, 4 * SIZE(XX) + LFD f13, 5 * SIZE(XX) + LFD f14, 6 * SIZE(XX) + LFD f15, 7 * SIZE(XX) + + fmul f16, f30, f8 + fmul f17, f30, f9 + fmul f18, f30, f10 + fmul f19, f30, f11 + + LFD f8, 8 * SIZE(XX) + LFD f9, 9 * SIZE(XX) + LFD f10, 10 * SIZE(XX) + LFD f11, 11 * SIZE(XX) + + fmul f20, f30, f12 + fmul f21, f30, f13 + fmul f22, f30, f14 + fmul f23, f30, f15 + + LFD f12, 12 * SIZE(XX) + LFD f13, 13 * SIZE(XX) + LFD f14, 14 * SIZE(XX) + LFD f15, 15 * SIZE(XX) + bdz LL(120) + .align 4 + +LL(110): + fmadd f0, f16, f16, f0 + fmul f16, f30, f8 + fmadd f1, f17, f17, f1 + fmul f17, f30, f9 + fmadd f2, f18, f18, f2 + fmul f18, f30, f10 + fmadd f3, f19, f19, f3 + fmul f19, f30, f11 + + LFD f8, 16 * SIZE(XX) + LFD f9, 17 * SIZE(XX) + LFD f10, 18 * SIZE(XX) + LFD f11, 19 * SIZE(XX) + + fmadd f4, f20, f20, f4 + fmul f20, f30, f12 + fmadd f5, f21, f21, f5 + fmul f21, f30, f13 + fmadd f6, f22, f22, f6 + fmul f22, f30, f14 + fmadd f7, f23, f23, f7 + fmul f23, f30, f15 + + LFD f12, 20 * SIZE(XX) + LFD f13, 21 * SIZE(XX) + LFD f14, 22 * SIZE(XX) + LFD f15, 23 * SIZE(XX) + + fmadd f0, f16, f16, f0 + fmul f16, f30, f8 + fmadd f1, f17, f17, f1 + fmul f17, f30, f9 + fmadd f2, f18, f18, f2 + fmul f18, f30, f10 + fmadd f3, f19, f19, f3 + fmul f19, f30, f11 + + LFD f8, 24 * SIZE(XX) + LFD f9, 25 * SIZE(XX) + LFD f10, 26 * SIZE(XX) + LFD f11, 27 * SIZE(XX) + + fmadd f4, f20, f20, f4 + fmul f20, f30, f12 + fmadd f5, f21, f21, f5 + fmul f21, f30, f13 + fmadd f6, f22, f22, f6 + fmul f22, f30, f14 + fmadd f7, f23, f23, f7 + fmul f23, f30, f15 + + LFD f12, 28 * SIZE(XX) + LFD f13, 29 * SIZE(XX) + LFD f14, 30 * SIZE(XX) + LFD f15, 31 * SIZE(XX) + +#ifndef POWER6 + L1_PREFETCH XX, PREA +#endif + addi XX, XX, 16 * SIZE +#ifdef POWER6 + L1_PREFETCH XX, PREA +#endif + + bdnz LL(110) + .align 4 + +LL(120): + fmadd f0, f16, f16, f0 + fmul 
f16, f30, f8 + fmadd f1, f17, f17, f1 + fmul f17, f30, f9 + fmadd f2, f18, f18, f2 + fmul f18, f30, f10 + fmadd f3, f19, f19, f3 + fmul f19, f30, f11 + + fmadd f4, f20, f20, f4 + fmul f20, f30, f12 + fmadd f5, f21, f21, f5 + fmul f21, f30, f13 + fmadd f6, f22, f22, f6 + fmul f22, f30, f14 + fmadd f7, f23, f23, f7 + fmul f23, f30, f15 + + fmadd f0, f16, f16, f0 + fmadd f1, f17, f17, f1 + fmadd f2, f18, f18, f2 + fmadd f3, f19, f19, f3 + fmadd f4, f20, f20, f4 + fmadd f5, f21, f21, f5 + fmadd f6, f22, f22, f6 + fmadd f7, f23, f23, f7 + + addi XX, XX, 16 * SIZE + .align 4 + +LL(150): + andi. r0, NN, 7 + mtspr CTR, r0 + beq- cr0, LL(170) + .align 4 + +LL(160): + LFD f8, 0 * SIZE(XX) + LFD f9, 1 * SIZE(XX) + addi XX, XX, 2 * SIZE + + fmul f16, f30, f8 + fmul f17, f30, f9 + fmadd f0, f16, f16, f0 + fmadd f1, f17, f17, f1 + bdnz LL(160) + .align 4 + +LL(170): + fadd f0, f0, f1 + fadd f2, f2, f3 + fadd f4, f4, f5 + fadd f6, f6, f7 + + fadd f0, f0, f2 + fadd f4, f4, f6 + + fadd f0, f0, f4 + + fsqrt f0, f0 + fmul f1, f31, f0 + b LL(9999) + .align 4 + +LL(1000): + sub X, X, INCXM1 + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(1050) + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + fabs f8, f24 + fabs f9, f25 + fabs f10, f26 + fabs f11, f27 + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + + fabs f12, f28 + fabs f13, f29 + fabs f14, f30 + fabs f15, f31 + + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + bdz LL(1020) + .align 4 + +LL(1010): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + LFDX f24, X, INCXM1 + LFDUX f25, X, INCX + LFDX f26, X, INCXM1 + LFDUX f27, X, INCX + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + LFDX f28, X, INCXM1 + LFDUX f29, X, INCX + LFDX f30, X, INCXM1 + LFDUX f31, X, INCX + bdnz LL(1010) + .align 4 + +LL(1020): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub 
f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + fsel f2, f18, f2, f10 + fsel f3, f19, f3, f11 + fsel f4, f20, f4, f12 + fsel f5, f21, f5, f13 + fsel f6, f22, f6, f14 + fsel f7, f23, f7, f15 + .align 4 + +LL(1050): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(1999) + .align 4 + +LL(1060): + LFDX f8, X, INCXM1 + LFDUX f9, X, INCX + + fabs f8, f8 + fabs f9, f9 + fsub f16, f0, f8 + fsub f17, f1, f9 + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + bdnz LL(1060) + .align 4 + +LL(1999): + fsub f8, f0, f1 + fsub f9, f2, f3 + fsub f10, f4, f5 + fsub f11, f6, f7 + + fsel f0, f8, f0, f1 + fsel f2, f9, f2, f3 + fsel f4, f10, f4, f5 + fsel f6, f11, f6, f7 + + fsub f8, f0, f2 + fsub f9, f4, f6 + fsel f0, f8, f0, f2 + fsel f4, f9, f4, f6 + + fsub f8, f0, f4 + fsel f31, f8, f0, f4 + + lfs f1, FZERO + lfs f0, FONE + + fcmpu cr0, f1, f31 + beq- cr0, LL(9999) + + fdiv f30, f0, f31 + + fmr f0, f1 + fmr f2, f1 + fmr f3, f1 + fmr f4, f1 + fmr f5, f1 + fmr f6, f1 + fmr f7, f1 + + sub XX, XX, INCXM1 + + srawi. r0, NN, 3 + mtspr CTR, r0 + beq- cr0, LL(1150) + + LFDX f8, XX, INCXM1 + LFDUX f9, XX, INCX + LFDX f10, XX, INCXM1 + LFDUX f11, XX, INCX + LFDX f12, XX, INCXM1 + LFDUX f13, XX, INCX + LFDX f14, XX, INCXM1 + LFDUX f15, XX, INCX + + fmul f16, f30, f8 + fmul f17, f30, f9 + fmul f18, f30, f10 + fmul f19, f30, f11 + + LFDX f8, XX, INCXM1 + LFDUX f9, XX, INCX + LFDX f10, XX, INCXM1 + LFDUX f11, XX, INCX + + fmul f20, f30, f12 + fmul f21, f30, f13 + fmul f22, f30, f14 + fmul f23, f30, f15 + + LFDX f12, XX, INCXM1 + LFDUX f13, XX, INCX + LFDX f14, XX, INCXM1 + LFDUX f15, XX, INCX + bdz LL(1120) + .align 4 + +LL(1110): + fmadd f0, f16, f16, f0 + fmul f16, f30, f8 + fmadd f1, f17, f17, f1 + fmul f17, f30, f9 + fmadd f2, f18, f18, f2 + fmul f18, f30, f10 + fmadd f3, f19, f19, f3 + fmul f19, f30, f11 + + LFDX f8, XX, INCXM1 + LFDUX f9, XX, INCX + LFDX f10, XX, INCXM1 + LFDUX f11, XX, INCX + + fmadd f4, f20, f20, f4 + fmul f20, f30, f12 + fmadd f5, f21, f21, f5 + fmul f21, f30, f13 + fmadd f6, f22, f22, f6 + fmul f22, f30, f14 + fmadd f7, f23, f23, f7 + fmul f23, f30, f15 + + LFDX f12, XX, INCXM1 + LFDUX f13, XX, INCX + LFDX f14, XX, INCXM1 + LFDUX f15, XX, INCX + + fmadd f0, f16, f16, f0 + fmul f16, f30, f8 + fmadd f1, f17, f17, f1 + fmul f17, f30, f9 + fmadd f2, f18, f18, f2 + fmul f18, f30, f10 + fmadd f3, f19, f19, f3 + fmul f19, f30, f11 + + LFDX f8, XX, INCXM1 + LFDUX f9, XX, INCX + LFDX f10, XX, INCXM1 + LFDUX f11, XX, INCX + + fmadd f4, f20, f20, f4 + fmul f20, f30, f12 + fmadd f5, f21, f21, f5 + fmul f21, f30, f13 + fmadd f6, f22, f22, f6 + fmul f22, f30, f14 + fmadd f7, f23, f23, f7 + fmul f23, f30, f15 + + LFDX f12, XX, INCXM1 + LFDUX f13, XX, INCX + LFDX f14, XX, INCXM1 + LFDUX f15, XX, INCX + + bdnz LL(1110) + .align 4 + +LL(1120): + fmadd f0, f16, f16, f0 + fmul f16, f30, f8 + fmadd f1, f17, f17, f1 + fmul f17, f30, f9 + fmadd f2, f18, f18, f2 + fmul f18, f30, f10 + fmadd f3, f19, f19, f3 + fmul f19, f30, f11 + + fmadd f4, f20, f20, f4 + fmul f20, f30, f12 + fmadd f5, f21, f21, f5 + fmul f21, f30, f13 + fmadd f6, f22, f22, f6 + fmul f22, f30, f14 + fmadd f7, f23, f23, f7 + fmul f23, f30, f15 + + fmadd f0, f16, f16, f0 + fmadd f1, f17, f17, f1 + fmadd f2, f18, f18, f2 + fmadd f3, f19, f19, f3 + fmadd f4, f20, f20, f4 + fmadd f5, f21, f21, f5 + fmadd f6, f22, f22, f6 + fmadd f7, f23, f23, f7 + .align 4 + +LL(1150): + andi. 
r0, NN, 7 + mtspr CTR, r0 + beq- cr0, LL(1170) + .align 4 + +LL(1160): + LFDX f8, XX, INCXM1 + LFDUX f9, XX, INCX + + fmul f16, f30, f8 + fmul f17, f30, f9 + fmadd f0, f16, f16, f0 + fmadd f1, f17, f17, f1 + bdnz LL(1160) + .align 4 + +LL(1170): + fadd f0, f0, f1 + fadd f2, f2, f3 + fadd f4, f4, f5 + fadd f6, f6, f7 + + fadd f0, f0, f2 + fadd f4, f4, f6 + + fadd f0, f0, f4 + + fsqrt f0, f0 + fmul f1, f31, f0 + .align 4 + +LL(9999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/znrm2_hummer.S b/kernel/power/znrm2_hummer.S new file mode 100644 index 0000000000..b6deb94470 --- /dev/null +++ b/kernel/power/znrm2_hummer.S @@ -0,0 +1,1018 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define INCX2 r6 +#define X2 r7 + +#define XX r8 + +#define C1 f1 +#define C2 f0 +#define C3 f2 +#define C4 f3 + +#define ALPHA f4 +#define ALPHA_R f5 + +#define A1 f6 +#define A2 f7 +#define A3 f8 +#define A4 f9 +#define A5 f10 +#define A6 f11 +#define A7 f12 +#define A8 f13 + +#define F1 f14 +#define F2 f15 +#define F3 f16 +#define F4 f17 +#define F5 f18 +#define F6 f19 +#define F7 f20 +#define F8 f21 + +#define T1 f22 +#define T2 f23 +#define T3 f24 +#define T4 f25 +#define T5 f26 +#define T6 f27 +#define T7 f28 +#define T8 f29 + + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + + stfpdux f16, SP, r10 + stfpdux f17, SP, r10 + stfpdux f18, SP, r10 + stfpdux f19, SP, r10 + + stfpdux f20, SP, r10 + stfpdux f21, SP, r10 + stfpdux f22, SP, r10 + stfpdux f23, SP, r10 + + stfpdux f24, SP, r10 + stfpdux f25, SP, r10 + stfpdux f26, SP, r10 + stfpdux f27, SP, r10 + + stfpdux f28, SP, r10 + stfpdux f29, SP, r10 + + li r10, 0 + lis r11, 0x3f80 + stwu r11, -4(SP) + stwu r11, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + lfpsx C1, SP, r10 # Zero clear + + slwi INCX, INCX, BASE_SHIFT + add INCX2, INCX, INCX + + fpmr C2, C1 + fpmr C3, C1 + fpmr C4, C1 + + cmpwi cr0, N, 0 + ble LL(99) + cmpwi cr0, INCX, 0 + ble LL(99) + + mr XX, X + + andi. r0, X, 2 * SIZE - 1 + bne LL(100) + +/* aligned */ + + sub X, X, INCX2 + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(15) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + LFPDUX A5, X, INCX2 + fpabs T1, A1 + LFPDUX A6, X, INCX2 + fpabs T2, A2 + LFPDUX A7, X, INCX2 + fpabs T3, A3 + LFPDUX A8, X, INCX2 + fpabs T4, A4 + bdz LL(13) + .align 4 + +LL(12): + fpsub F1, C1, T1 + LFPDUX A1, X, INCX2 + fpsub F2, C2, T2 + LFPDUX A2, X, INCX2 + fpsub F3, C3, T3 + LFPDUX A3, X, INCX2 + fpsub F4, C4, T4 + LFPDUX A4, X, INCX2 + + fpabs T5, A5 + fpabs T6, A6 + fpabs T7, A7 + fpabs T8, A8 + + fpsel C1, F1, C1, T1 + LFPDUX A5, X, INCX2 + fpsel C2, F2, C2, T2 + LFPDUX A6, X, INCX2 + fpsel C3, F3, C3, T3 + LFPDUX A7, X, INCX2 + fpsel C4, F4, C4, T4 + LFPDUX A8, X, INCX2 + + fpsub F5, C1, T5 + fpsub F6, C2, T6 + fpsub F7, C3, T7 + fpsub F8, C4, T8 + + fpabs T1, A1 + fpabs T2, A2 + fpabs T3, A3 + fpabs T4, A4 + + fpsel C1, F5, C1, T5 + fpsel C2, F6, C2, T6 + fpsel C3, F7, C3, T7 + fpsel C4, F8, C4, T8 + bdnz LL(12) + .align 4 + +LL(13): + fpabs T5, A5 + fpabs T6, A6 + fpabs T7, A7 + fpabs T8, A8 + + fpsub F1, C1, T1 + fpsub F2, C2, T2 + fpsub F3, C3, T3 + fpsub F4, C4, T4 + + fpsel C1, F1, C1, T1 + fpsel C2, F2, C2, T2 + fpsel C3, F3, C3, T3 + fpsel C4, F4, C4, T4 + + fpsub F5, C1, T5 + fpsub F6, C2, T6 + fpsub F7, C3, T7 + fpsub F8, C4, T8 + + fpsel C1, F5, C1, T5 + fpsel C2, F6, C2, T6 + fpsel C3, F7, C3, T7 + fpsel C4, F8, C4, T8 + .align 4 + +LL(15): + andi. r0, N, 7 + beq LL(20) + + andi. r0, N, 4 + beq LL(16) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + fpabs A1, A1 + fpabs A2, A2 + fpabs A3, A3 + fpabs A4, A4 + + fpsub F1, C1, A1 + fpsub F2, C2, A2 + fpsub F3, C3, A3 + fpsub F4, C4, A4 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + .align 4 + +LL(16): + andi. 
r0, N, 2 + beq LL(17) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + + fpabs A1, A1 + fpabs A2, A2 + + fpsub F1, C1, A1 + fpsub F2, C2, A2 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + .align 4 + +LL(17): + andi. r0, N, 1 + beq LL(20) + + LFPDUX A1, X, INCX2 + fpabs A1, A1 + fpsub F1, C1, A1 + fpsel C1, F1, C1, A1 + .align 4 + +LL(20): + fpsub F1, C1, C2 + fpsub F2, C3, C4 + + fpsel C1, F1, C1, C2 + fpsel C3, F2, C3, C4 + + fpsub F1, C1, C3 + fpsel C1, F1, C1, C3 + + fsmtp C2, C1 + + fsub F1, C1, C2 + fsel ALPHA, F1, C1, C2 + + li r10, 0 + + lfs ALPHA_R, 8(SP) # load 1.0 + fdiv ALPHA_R, ALPHA_R, ALPHA + + lfpsx C1, SP, r10 # Zero clear + + fpmr C2, C1 + fpmr C3, C1 + fpmr C4, C1 + fsmfp ALPHA_R, ALPHA_R + + andi. r0, XX, 2 * SIZE - 1 + beq LL(21) + + LFD C1, 0 * SIZE(XX) + add XX, XX, INCX + + cmpwi cr0, N, 0 + fmul C1, ALPHA_R, C1 + fmul C1, C1, C1 + ble LL(98) + .align 4 + +LL(21): + sub XX, XX, INCX2 + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(25) + + LFPDUX A1, XX, INCX2 + LFPDUX A2, XX, INCX2 + LFPDUX A3, XX, INCX2 + LFPDUX A4, XX, INCX2 + + LFPDUX A5, XX, INCX2 + LFPDUX A6, XX, INCX2 + LFPDUX A7, XX, INCX2 + LFPDUX A8, XX, INCX2 + + fpmul T1, ALPHA_R, A1 + fpmul T2, ALPHA_R, A2 + fpmul T3, ALPHA_R, A3 + fpmul T4, ALPHA_R, A4 + + bdz LL(23) + .align 4 + +LL(22): + fpmadd C1, T1, T1, C1 + LFPDUX A1, XX, INCX2 + fpmul T1, ALPHA_R, A5 + LFPDUX A2, XX, INCX2 + + fpmadd C2, T2, T2, C2 + LFPDUX A3, XX, INCX2 + fpmul T2, ALPHA_R, A6 + LFPDUX A4, XX, INCX2 + + fpmadd C3, T3, T3, C3 + fpmul T3, ALPHA_R, A7 + fpmadd C4, T4, T4, C4 + fpmul T4, ALPHA_R, A8 + + fpmadd C1, T1, T1, C1 + LFPDUX A5, XX, INCX2 + fpmul T1, ALPHA_R, A1 + LFPDUX A6, XX, INCX2 + + fpmadd C2, T2, T2, C2 + LFPDUX A7, XX, INCX2 + fpmul T2, ALPHA_R, A2 + LFPDUX A8, XX, INCX2 + + fpmadd C3, T3, T3, C3 + fpmul T3, ALPHA_R, A3 + fpmadd C4, T4, T4, C4 + fpmul T4, ALPHA_R, A4 + bdnz LL(22) + .align 4 + +LL(23): + fpmadd C1, T1, T1, C1 + fpmul T1, ALPHA_R, A5 + fpmadd C2, T2, T2, C2 + fpmul T2, ALPHA_R, A6 + + fpmadd C3, T3, T3, C3 + fpmul T3, ALPHA_R, A7 + fpmadd C4, T4, T4, C4 + fpmul T4, ALPHA_R, A8 + + fpmadd C1, T1, T1, C1 + fpmadd C2, T2, T2, C2 + fpmadd C3, T3, T3, C3 + fpmadd C4, T4, T4, C4 + .align 4 + +LL(25): + andi. r0, N, 7 + beq LL(98) + + andi. r0, N, 4 + beq LL(26) + + LFPDUX A1, XX, INCX2 + LFPDUX A2, XX, INCX2 + LFPDUX A3, XX, INCX2 + LFPDUX A4, XX, INCX2 + + fpmul A1, ALPHA_R, A1 + fpmul A2, ALPHA_R, A2 + fpmul A3, ALPHA_R, A3 + fpmul A4, ALPHA_R, A4 + + fpmadd C1, A1, A1, C1 + fpmadd C2, A2, A2, C2 + fpmadd C3, A3, A3, C3 + fpmadd C4, A4, A4, C4 + .align 4 + +LL(26): + andi. r0, N, 2 + beq LL(27) + + LFPDUX A1, XX, INCX2 + LFPDUX A2, XX, INCX2 + fpmul A1, ALPHA_R, A1 + fpmul A2, ALPHA_R, A2 + + fpmadd C1, A1, A1, C1 + fpmadd C2, A2, A2, C2 + .align 4 + +LL(27): + andi. 
r0, N, 1 + beq LL(98) + + LFPDUX A1, XX, INCX2 + fpmul A1, ALPHA_R, A1 + fpmadd C1, A1, A1, C1 + .align 4 + +LL(98): + fpadd C1, C1, C2 + lis r3, 0x3f00 + fpadd C3, C3, C4 + lis r4, 0x4040 + + stw r3, 4(SP) + stw r4, 8(SP) + + fpadd C1, C1, C3 + lfs f10, 0(SP) + + fsmtp C2, C1 + lfs f11, 4(SP) + fadd C1, C2, C1 + lfs f12, 8(SP) + + fcmpu cr0, f10, C1 + beq cr0, LL(99) + +#ifndef HUMMER_EMULATOR + frsqrte f9, C1 + li r10, 16 + + fmul f2, f1, f9 + lfpdux f29, SP, r10 + fmul f3, f9, f11 + lfpdux f28, SP, r10 + fnmsub f7, f2, f9, f12 + lfpdux f27, SP, r10 + fmul f9, f3, f7 + lfpdux f26, SP, r10 + fadd f13, f11, f11 + lfpdux f25, SP, r10 + fmul f12, f1, f9 + lfpdux f24, SP, r10 + fmul f11, f12, f11 + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + fnmsub f1, f12, f9, f13 + + lfpdux f20, SP, r10 + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + fmadd f1, f11, f1, f12 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + fmul C1, ALPHA, C1 + blr +#else + fsqrt C1, C1 + + li r10, 16 + lfpdux f29, SP, r10 + lfpdux f28, SP, r10 + lfpdux f27, SP, r10 + lfpdux f26, SP, r10 + lfpdux f25, SP, r10 + lfpdux f24, SP, r10 + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + lfpdux f20, SP, r10 + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + + fmul C1, ALPHA, C1 + addi SP, SP, 16 + blr +#endif + .align 4 + +LL(99): + li r10, 16 + + lfpdux f29, SP, r10 + lfpdux f28, SP, r10 + lfpdux f27, SP, r10 + lfpdux f26, SP, r10 + lfpdux f25, SP, r10 + lfpdux f24, SP, r10 + + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + lfpdux f20, SP, r10 + + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + blr + .align 4 + +LL(100): + sub X, X, INCX2 + addi X2, X, SIZE + + srawi. 
r0, N, 3 + mtspr CTR, r0 + beq- LL(105) + + LFDUX A1, X, INCX2 + LFDUX A2, X2, INCX2 + LFDUX A3, X, INCX2 + LFDUX A4, X2, INCX2 + LFSDUX A1, X, INCX2 + LFSDUX A2, X2, INCX2 + LFSDUX A3, X, INCX2 + LFSDUX A4, X2, INCX2 + + LFDUX A5, X, INCX2 + LFDUX A6, X2, INCX2 + LFDUX A7, X, INCX2 + LFDUX A8, X2, INCX2 + LFSDUX A5, X, INCX2 + fpabs T1, A1 + LFSDUX A6, X2, INCX2 + fpabs T2, A2 + LFSDUX A7, X, INCX2 + fpabs T3, A3 + LFSDUX A8, X2, INCX2 + fpabs T4, A4 + bdz LL(103) + .align 4 + +LL(102): + fpsub F1, C1, T1 + LFDUX A1, X, INCX2 + fpsub F2, C2, T2 + LFDUX A2, X2, INCX2 + fpsub F3, C3, T3 + LFDUX A3, X, INCX2 + fpsub F4, C4, T4 + LFDUX A4, X2, INCX2 + + fpabs T5, A5 + LFSDUX A1, X, INCX2 + fpabs T6, A6 + LFSDUX A2, X2, INCX2 + fpabs T7, A7 + LFSDUX A3, X, INCX2 + fpabs T8, A8 + LFSDUX A4, X2, INCX2 + + fpsel C1, F1, C1, T1 + LFDUX A5, X, INCX2 + fpsel C2, F2, C2, T2 + LFDUX A6, X2, INCX2 + fpsel C3, F3, C3, T3 + LFDUX A7, X, INCX2 + fpsel C4, F4, C4, T4 + LFDUX A8, X2, INCX2 + + fpsub F5, C1, T5 + LFSDUX A5, X, INCX2 + fpsub F6, C2, T6 + LFSDUX A6, X2, INCX2 + fpsub F7, C3, T7 + LFSDUX A7, X, INCX2 + fpsub F8, C4, T8 + LFSDUX A8, X2, INCX2 + + fpabs T1, A1 + fpabs T2, A2 + fpabs T3, A3 + fpabs T4, A4 + + fpsel C1, F5, C1, T5 + fpsel C2, F6, C2, T6 + fpsel C3, F7, C3, T7 + fpsel C4, F8, C4, T8 + bdnz LL(102) + .align 4 + +LL(103): + fpabs T5, A5 + fpabs T6, A6 + fpabs T7, A7 + fpabs T8, A8 + + fpsub F1, C1, T1 + fpsub F2, C2, T2 + fpsub F3, C3, T3 + fpsub F4, C4, T4 + + fpsel C1, F1, C1, T1 + fpsel C2, F2, C2, T2 + fpsel C3, F3, C3, T3 + fpsel C4, F4, C4, T4 + + fpsub F5, C1, T5 + fpsub F6, C2, T6 + fpsub F7, C3, T7 + fpsub F8, C4, T8 + + fpsel C1, F5, C1, T5 + fpsel C2, F6, C2, T6 + fpsel C3, F7, C3, T7 + fpsel C4, F8, C4, T8 + .align 4 + +LL(105): + andi. r0, N, 7 + beq LL(120) + + andi. r0, N, 4 + beq LL(106) + + LFDUX A1, X, INCX2 + LFDUX A2, X2, INCX2 + LFDUX A3, X, INCX2 + LFDUX A4, X2, INCX2 + LFSDUX A1, X, INCX2 + LFSDUX A2, X2, INCX2 + LFSDUX A3, X, INCX2 + LFSDUX A4, X2, INCX2 + + fpabs A1, A1 + fpabs A2, A2 + fpabs A3, A3 + fpabs A4, A4 + + fpsub F1, C1, A1 + fpsub F2, C2, A2 + fpsub F3, C3, A3 + fpsub F4, C4, A4 + + fpsel C1, F1, C1, A1 + fpsel C2, F2, C2, A2 + fpsel C3, F3, C3, A3 + fpsel C4, F4, C4, A4 + .align 4 + +LL(106): + andi. r0, N, 2 + beq LL(107) + + LFDUX A1, X, INCX2 + LFDUX A2, X2, INCX2 + LFDUX A3, X, INCX2 + LFDUX A4, X2, INCX2 + + fabs A1, A1 + fabs A2, A2 + fabs A3, A3 + fabs A4, A4 + + fsub F1, C1, A1 + fsub F2, C2, A2 + fsub F3, C3, A3 + fsub F4, C4, A4 + + fsel C1, F1, C1, A1 + fsel C2, F2, C2, A2 + fsel C3, F3, C3, A3 + fsel C4, F4, C4, A4 + .align 4 + +LL(107): + andi. r0, N, 1 + beq LL(120) + + LFDUX A1, X, INCX2 + LFDUX A2, X2, INCX2 + fabs A1, A1 + fabs A2, A2 + fsub F1, C1, A1 + fsub F2, C2, A2 + fsel C1, F1, C1, A1 + fsel C2, F2, C2, A2 + .align 4 + +LL(120): + fpsub F1, C1, C2 + fpsub F2, C3, C4 + + fpsel C1, F1, C1, C2 + fpsel C3, F2, C3, C4 + + fpsub F1, C1, C3 + fpsel C1, F1, C1, C3 + + fsmtp C2, C1 + + fsub F1, C1, C2 + fsel ALPHA, F1, C1, C2 + + li r10, 0 + + lfs ALPHA_R, 8(SP) # load 1.0 + fdiv ALPHA_R, ALPHA_R, ALPHA + + lfpsx C1, SP, r10 # Zero clear + + fpmr C2, C1 + fpmr C3, C1 + fpmr C4, C1 + fsmfp ALPHA_R, ALPHA_R + + sub XX, XX, INCX2 + addi X2, XX, SIZE + + srawi. 
r0, N, 3 + mtspr CTR, r0 + beq- LL(125) + + LFDUX A1, XX, INCX2 + LFDUX A2, X2, INCX2 + LFDUX A3, XX, INCX2 + LFDUX A4, X2, INCX2 + LFSDUX A1, XX, INCX2 + LFSDUX A2, X2, INCX2 + LFSDUX A3, XX, INCX2 + LFSDUX A4, X2, INCX2 + + LFDUX A5, XX, INCX2 + LFDUX A6, X2, INCX2 + LFDUX A7, XX, INCX2 + LFDUX A8, X2, INCX2 + LFSDUX A5, XX, INCX2 + fpmul T1, ALPHA_R, A1 + LFSDUX A6, X2, INCX2 + fpmul T2, ALPHA_R, A2 + LFSDUX A7, XX, INCX2 + fpmul T3, ALPHA_R, A3 + LFSDUX A8, X2, INCX2 + fpmul T4, ALPHA_R, A4 + bdz LL(123) + .align 4 + +LL(122): + fpmadd C1, T1, T1, C1 + LFDUX A1, XX, INCX2 + fpmul T1, ALPHA_R, A5 + LFDUX A2, X2, INCX2 + + fpmadd C2, T2, T2, C2 + LFDUX A3, XX, INCX2 + fpmul T2, ALPHA_R, A6 + LFDUX A4, X2, INCX2 + + fpmadd C3, T3, T3, C3 + LFSDUX A1, XX, INCX2 + fpmul T3, ALPHA_R, A7 + LFSDUX A2, X2, INCX2 + + fpmadd C4, T4, T4, C4 + LFSDUX A3, XX, INCX2 + fpmul T4, ALPHA_R, A8 + LFSDUX A4, X2, INCX2 + + fpmadd C1, T1, T1, C1 + LFDUX A5, XX, INCX2 + fpmul T1, ALPHA_R, A1 + LFDUX A6, X2, INCX2 + + fpmadd C2, T2, T2, C2 + LFDUX A7, XX, INCX2 + fpmul T2, ALPHA_R, A2 + LFDUX A8, X2, INCX2 + + fpmadd C3, T3, T3, C3 + LFSDUX A5, XX, INCX2 + fpmul T3, ALPHA_R, A3 + LFSDUX A6, X2, INCX2 + fpmadd C4, T4, T4, C4 + LFSDUX A7, XX, INCX2 + fpmul T4, ALPHA_R, A4 + LFSDUX A8, X2, INCX2 + bdnz LL(122) + .align 4 + +LL(123): + fpmadd C1, T1, T1, C1 + fpmul T1, ALPHA_R, A5 + fpmadd C2, T2, T2, C2 + fpmul T2, ALPHA_R, A6 + fpmadd C3, T3, T3, C3 + fpmul T3, ALPHA_R, A7 + fpmadd C4, T4, T4, C4 + fpmul T4, ALPHA_R, A8 + + fpmadd C1, T1, T1, C1 + fpmadd C2, T2, T2, C2 + fpmadd C3, T3, T3, C3 + fpmadd C4, T4, T4, C4 + .align 4 + +LL(125): + andi. r0, N, 7 + beq LL(998) + + andi. r0, N, 4 + beq LL(126) + + LFDUX A1, XX, INCX2 + LFDUX A2, X2, INCX2 + LFDUX A3, XX, INCX2 + LFDUX A4, X2, INCX2 + LFSDUX A1, XX, INCX2 + LFSDUX A2, X2, INCX2 + LFSDUX A3, XX, INCX2 + LFSDUX A4, X2, INCX2 + + fpmul A1, ALPHA_R, A1 + fpmul A2, ALPHA_R, A2 + fpmul A3, ALPHA_R, A3 + fpmul A4, ALPHA_R, A4 + + fpmadd C1, A1, A1, C1 + fpmadd C2, A2, A2, C2 + fpmadd C3, A3, A3, C3 + fpmadd C4, A4, A4, C4 + .align 4 + +LL(126): + andi. r0, N, 2 + beq LL(127) + + LFDUX A1, XX, INCX2 + LFDUX A2, X2, INCX2 + LFDUX A3, XX, INCX2 + LFDUX A4, X2, INCX2 + + fmul A1, ALPHA_R, A1 + fmul A2, ALPHA_R, A2 + fmul A3, ALPHA_R, A3 + fmul A4, ALPHA_R, A4 + + fmadd C1, A1, A1, C1 + fmadd C2, A2, A2, C2 + fmadd C3, A3, A3, C3 + fmadd C4, A4, A4, C4 + .align 4 + +LL(127): + andi. 
r0, N, 1 + beq LL(998) + + LFDUX A1, XX, INCX2 + LFDUX A2, X2, INCX2 + + fmul A1, ALPHA_R, A1 + fmul A2, ALPHA_R, A2 + fmadd C1, A1, A1, C1 + fmadd C2, A2, A2, C2 + .align 4 + +LL(998): + fpadd C1, C1, C2 + lis r3, 0x3f00 + fpadd C3, C3, C4 + lis r4, 0x4040 + + stw r3, 4(SP) + stw r4, 8(SP) + + fpadd C1, C1, C3 + lfs f10, 0(SP) + fsmtp C2, C1 + lfs f11, 4(SP) + fadd C1, C2, C1 + lfs f12, 8(SP) + + fcmpu cr0, f10, C1 + beq cr0, LL(99) + +#ifndef HUMMER_EMULATOR + frsqrte f9, C1 + li r10, 16 + + fmul f2, f1, f9 + lfpdux f29, SP, r10 + fmul f3, f9, f11 + lfpdux f28, SP, r10 + fnmsub f7, f2, f9, f12 + lfpdux f27, SP, r10 + fmul f9, f3, f7 + lfpdux f26, SP, r10 + fadd f13, f11, f11 + lfpdux f25, SP, r10 + fmul f12, f1, f9 + lfpdux f24, SP, r10 + fmul f11, f12, f11 + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + fnmsub f1, f12, f9, f13 + lfpdux f21, SP, r10 + + lfpdux f20, SP, r10 + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + fmadd f1, f11, f1, f12 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + fmul C1, ALPHA, C1 + blr +#else + fsqrt C1, C1 + li r10, 16 + + lfpdux f29, SP, r10 + lfpdux f28, SP, r10 + lfpdux f27, SP, r10 + lfpdux f26, SP, r10 + lfpdux f25, SP, r10 + lfpdux f24, SP, r10 + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + + lfpdux f20, SP, r10 + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + fmul C1, ALPHA, C1 + addi SP, SP, 16 + blr +#endif + .align 4 + +LL(999): + li r10, 16 + + lfpdux f29, SP, r10 + lfpdux f28, SP, r10 + lfpdux f27, SP, r10 + lfpdux f26, SP, r10 + lfpdux f25, SP, r10 + lfpdux f24, SP, r10 + + lfpdux f23, SP, r10 + lfpdux f22, SP, r10 + lfpdux f21, SP, r10 + lfpdux f20, SP, r10 + + lfpdux f19, SP, r10 + lfpdux f18, SP, r10 + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + addi SP, SP, 16 + blr + EPILOGUE diff --git a/kernel/power/znrm2_ppc440.S b/kernel/power/znrm2_ppc440.S new file mode 100644 index 0000000000..354227917e --- /dev/null +++ b/kernel/power/znrm2_ppc440.S @@ -0,0 +1,564 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 + +#define NN r6 +#define XX r7 +#define INC1 r9 +#define PRE r10 + +#define FZERO 144(SP) +#define FONE 148(SP) +#define FMAX 152(SP) +#define C1 156(SP) +#define C2 160(SP) + +#define STACKSIZE 168 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r10, 0 + lis r11, 0x3f80 + lis r12, 0x5fe0 + lis r6, 0x3f00 + lis r7, 0x4040 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + stw r10, FZERO + stw r11, FONE + stw r12, FMAX + stw r10, 4 + FMAX + stw r6, C1 + stw r7, C2 + + lfs f1, FZERO + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) +#endif + + slwi INCX, INCX, ZBASE_SHIFT + sub X, X, INCX + li INC1, SIZE + + li PRE, 3 * 16 * SIZE + + cmpwi cr0, N, 0 + ble- LL(999) + cmpwi cr0, INCX, 0 + ble- LL(999) + + mr NN, N + mr XX, X + + LFDUX f0, X, INCX + LFDX f1, X, INC1 + + fabs f2, f0 + fabs f3, f1 + fabs f4, f0 + fabs f5, f1 + fabs f6, f0 + fabs f7, f1 + fabs f0, f0 + fabs f1, f1 + + subi N, N, 1 + + srawi. 
r0, N, 3 + mtspr CTR, r0 + beq- LL(50) + + LFDUX f24, X, INCX + LFDX f25, X, INC1 + LFDUX f26, X, INCX + LFDX f27, X, INC1 + LFDUX f28, X, INCX + LFDX f29, X, INC1 + LFDUX f30, X, INCX + LFDX f31, X, INC1 + + fabs f8, f24 + LFDUX f24, X, INCX + fabs f9, f25 + LFDX f25, X, INC1 + fabs f10, f26 + LFDUX f26, X, INCX + fabs f11, f27 + LFDX f27, X, INC1 + + fabs f12, f28 + LFDUX f28, X, INCX + fabs f13, f29 + LFDX f29, X, INC1 + fabs f14, f30 + LFDUX f30, X, INCX + fabs f15, f31 + LFDX f31, X, INC1 + bdz LL(20) + .align 4 + +LL(10): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 +#ifdef PPCG4 + dcbt X, PRE +#endif + fabs f8, f24 + LFDUX f24, X, INCX + fsel f1, f17, f1, f9 + fabs f9, f25 + LFDX f25, X, INC1 + fsel f2, f18, f2, f10 + fabs f10, f26 + LFDUX f26, X, INCX + fsel f3, f19, f3, f11 + fabs f11, f27 + LFDX f27, X, INC1 + + fsel f4, f20, f4, f12 +#ifdef PPCG4 + dcbt X, PRE +#endif + fabs f12, f28 + LFDUX f28, X, INCX + fsel f5, f21, f5, f13 + fabs f13, f29 + LFDX f29, X, INC1 + fsel f6, f22, f6, f14 + fabs f14, f30 + LFDUX f30, X, INCX + fsel f7, f23, f7, f15 + fabs f15, f31 + LFDX f31, X, INC1 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 +#ifdef PPCG4 + dcbt X, PRE +#endif + fabs f8, f24 + LFDUX f24, X, INCX + fsel f1, f17, f1, f9 + fabs f9, f25 + LFDX f25, X, INC1 + fsel f2, f18, f2, f10 + fabs f10, f26 + LFDUX f26, X, INCX + fsel f3, f19, f3, f11 + fabs f11, f27 + LFDX f27, X, INC1 + + fsel f4, f20, f4, f12 +#ifdef PPCG4 + dcbt X, PRE +#endif + fabs f12, f28 + LFDUX f28, X, INCX + fsel f5, f21, f5, f13 + fabs f13, f29 + LFDX f29, X, INC1 + fsel f6, f22, f6, f14 + fabs f14, f30 + LFDUX f30, X, INCX + fsel f7, f23, f7, f15 + fabs f15, f31 + LFDX f31, X, INC1 + bdnz LL(10) + .align 4 + +LL(20): + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fabs f8, f24 + fsel f1, f17, f1, f9 + fabs f9, f25 + fsel f2, f18, f2, f10 + fabs f10, f26 + fsel f3, f19, f3, f11 + fabs f11, f27 + + fsel f4, f20, f4, f12 + fabs f12, f28 + fsel f5, f21, f5, f13 + fabs f13, f29 + fsel f6, f22, f6, f14 + fabs f14, f30 + fsel f7, f23, f7, f15 + fabs f15, f31 + + fsub f16, f0, f8 + fsub f17, f1, f9 + fsub f18, f2, f10 + fsub f19, f3, f11 + fsub f20, f4, f12 + fsub f21, f5, f13 + fsub f22, f6, f14 + fsub f23, f7, f15 + + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + fsel f2, f18, f2, f10 + fsel f3, f19, f3, f11 + fsel f4, f20, f4, f12 + fsel f5, f21, f5, f13 + fsel f6, f22, f6, f14 + fsel f7, f23, f7, f15 + .align 4 + +LL(50): + andi. 
r0, N, 7 + mtspr CTR, r0 + beq LL(99) + .align 4 + +LL(60): + LFDUX f8, X, INCX + LFDX f9, X, INC1 + + fabs f8, f8 + fabs f9, f9 + fsub f16, f0, f8 + fsub f17, f1, f9 + fsel f0, f16, f0, f8 + fsel f1, f17, f1, f9 + bdnz LL(60) + .align 4 + +LL(99): + fsub f8, f0, f1 + fsub f9, f2, f3 + fsub f10, f4, f5 + fsub f11, f6, f7 + + fsel f0, f8, f0, f1 + fsel f2, f9, f2, f3 + fsel f4, f10, f4, f5 + fsel f6, f11, f6, f7 + + fsub f8, f0, f2 + fsub f9, f4, f6 + fsel f0, f8, f0, f2 + fsel f4, f9, f4, f6 + + fsub f8, f0, f4 + fsel f31, f8, f0, f4 + + lfs f1, FZERO + lfs f0, FONE + + fcmpu cr0, f1, f31 + beq- cr0, LL(999) + + fdiv f30, f0, f31 + + fmr f0, f1 + fmr f2, f1 + fmr f3, f1 + fmr f4, f1 + fmr f5, f1 + fmr f6, f1 + fmr f7, f1 + + srawi. r0, NN, 3 + mtspr CTR, r0 + beq- cr0, LL(150) + + LFDUX f8, XX, INCX + LFDX f9, XX, INC1 + LFDUX f10, XX, INCX + LFDX f11, XX, INC1 + LFDUX f12, XX, INCX + LFDX f13, XX, INC1 + LFDUX f14, XX, INCX + LFDX f15, XX, INC1 + + fmul f16, f30, f8 + LFDUX f8, XX, INCX + fmul f17, f30, f9 + LFDX f9, XX, INC1 + fmul f18, f30, f10 + LFDUX f10, XX, INCX + fmul f19, f30, f11 + LFDX f11, XX, INC1 + + fmul f20, f30, f12 + LFDUX f12, XX, INCX + fmul f21, f30, f13 + LFDX f13, XX, INC1 + fmul f22, f30, f14 + LFDUX f14, XX, INCX + fmul f23, f30, f15 + LFDX f15, XX, INC1 + bdz LL(120) + .align 4 + +LL(110): + fmadd f0, f16, f16, f0 +#ifdef PPCG4 + dcbt XX, PRE +#endif + fmul f16, f30, f8 + LFDUX f8, XX, INCX + fmadd f1, f17, f17, f1 + fmul f17, f30, f9 + LFDX f9, XX, INC1 + fmadd f2, f18, f18, f2 + fmul f18, f30, f10 + LFDUX f10, XX, INCX + fmadd f3, f19, f19, f3 + fmul f19, f30, f11 + LFDX f11, XX, INC1 + + fmadd f4, f20, f20, f4 +#ifdef PPCG4 + dcbt XX, PRE +#endif + fmul f20, f30, f12 + LFDUX f12, XX, INCX + fmadd f5, f21, f21, f5 + fmul f21, f30, f13 + LFDX f13, XX, INC1 + fmadd f6, f22, f22, f6 + fmul f22, f30, f14 + LFDUX f14, XX, INCX + fmadd f7, f23, f23, f7 + fmul f23, f30, f15 + LFDX f15, XX, INC1 + + fmadd f0, f16, f16, f0 +#ifdef PPCG4 + dcbt XX, PRE +#endif + fmul f16, f30, f8 + LFDUX f8, XX, INCX + fmadd f1, f17, f17, f1 + fmul f17, f30, f9 + LFDX f9, XX, INC1 + fmadd f2, f18, f18, f2 + fmul f18, f30, f10 + LFDUX f10, XX, INCX + fmadd f3, f19, f19, f3 + fmul f19, f30, f11 + LFDX f11, XX, INC1 + + fmadd f4, f20, f20, f4 +#ifdef PPCG4 + dcbt XX, PRE +#endif + fmul f20, f30, f12 + LFDUX f12, XX, INCX + fmadd f5, f21, f21, f5 + fmul f21, f30, f13 + LFDX f13, XX, INC1 + fmadd f6, f22, f22, f6 + fmul f22, f30, f14 + LFDUX f14, XX, INCX + fmadd f7, f23, f23, f7 + fmul f23, f30, f15 + LFDX f15, XX, INC1 + bdnz LL(110) + .align 4 + +LL(120): + fmadd f0, f16, f16, f0 + fmul f16, f30, f8 + fmadd f1, f17, f17, f1 + fmul f17, f30, f9 + fmadd f2, f18, f18, f2 + fmul f18, f30, f10 + fmadd f3, f19, f19, f3 + fmul f19, f30, f11 + + fmadd f4, f20, f20, f4 + fmul f20, f30, f12 + fmadd f5, f21, f21, f5 + fmul f21, f30, f13 + fmadd f6, f22, f22, f6 + fmul f22, f30, f14 + fmadd f7, f23, f23, f7 + fmul f23, f30, f15 + + fmadd f0, f16, f16, f0 + fmadd f1, f17, f17, f1 + fmadd f2, f18, f18, f2 + fmadd f3, f19, f19, f3 + fmadd f4, f20, f20, f4 + fmadd f5, f21, f21, f5 + fmadd f6, f22, f22, f6 + fmadd f7, f23, f23, f7 + .align 4 + +LL(150): + andi. 
r0, NN, 7 + mtspr CTR, r0 + beq- cr0, LL(170) + .align 4 + +LL(160): + LFDUX f8, XX, INCX + LFDX f9, XX, INC1 + + fmul f16, f30, f8 + fmul f17, f30, f9 + fmadd f0, f16, f16, f0 + fmadd f1, f17, f17, f1 + bdnz LL(160) + .align 4 + +LL(170): + fadd f0, f0, f1 + fadd f2, f2, f3 + fadd f4, f4, f5 + fadd f6, f6, f7 + + fadd f0, f0, f2 + fadd f4, f4, f6 + + fadd f1, f0, f4 + + frsqrte f0, f1 + lfs f8, C1 + lfs f9, C2 + + fmul f2, f1, f0 + fadd f7, f8, f8 + fmul f3, f0, f8 + fnmsub f4, f2, f0, f9 + fmul f0, f3, f4 + + fmul f2, f1, f0 + fmul f3, f0, f8 + fnmsub f4, f2, f0, f9 + fmul f0, f3, f4 + + fmul f2, f1, f0 + fmul f3, f0, f8 + fnmsub f4, f2, f0, f9 + fmul f0, f3, f4 + + fmul f5, f1, f0 + fmul f2, f5, f8 + fnmsub f3, f5, f0, f7 + fmadd f1, f2, f3, f5 + fmul f1, f31, f1 + .align 4 + +LL(999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/zrot.S b/kernel/power/zrot.S new file mode 100644 index 0000000000..aad28af058 --- /dev/null +++ b/kernel/power/zrot.S @@ -0,0 +1,595 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 +#define Y r6 +#define INCY r7 +#define PREA r8 +#define XX r9 +#define YY r10 + +#define INCXM1 r11 +#define INCYM1 r12 + +#define C f1 +#define S f2 + +#define STACKSIZE 32 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + slwi INCX, INCX, ZBASE_SHIFT + slwi INCY, INCY, ZBASE_SHIFT + + subi INCXM1, INCX, SIZE + subi INCYM1, INCY, SIZE + + li PREA, L1_PREFETCHSIZE + + cmpwi cr0, N, 0 + ble- LL(999) + + cmpwi cr0, INCX, 2 * SIZE + bne- cr0, LL(100) + cmpwi cr0, INCY, 2 * SIZE + bne- cr0, LL(100) + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- cr0, LL(50) + + LFD f0, 0 * SIZE(X) + LFD f4, 1 * SIZE(X) + LFD f6, 2 * SIZE(X) + LFD f8, 3 * SIZE(X) + + LFD f3, 0 * SIZE(Y) + LFD f5, 1 * SIZE(Y) + LFD f7, 2 * SIZE(Y) + LFD f9, 3 * SIZE(Y) + bdz LL(12) + .align 4 + +LL(10): + FMUL f10, C, f0 + FMUL f11, C, f3 + FMUL f12, C, f4 + FMUL f13, C, f5 + + FMUL f14, C, f6 + FMUL f15, C, f7 + FMUL f16, C, f8 + FMUL f17, C, f9 + + FMADD f10, S, f3, f10 + FNMSUB f11, S, f0, f11 + FMADD f12, S, f5, f12 + FNMSUB f13, S, f4, f13 + + FMADD f14, S, f7, f14 + FNMSUB f15, S, f6, f15 + FMADD f16, S, f9, f16 + FNMSUB f17, S, f8, f17 + + LFD f0, 4 * SIZE(X) + LFD f4, 5 * SIZE(X) + LFD f6, 6 * SIZE(X) + LFD f8, 7 * SIZE(X) + + LFD f3, 4 * SIZE(Y) + LFD f5, 5 * SIZE(Y) + LFD f7, 6 * SIZE(Y) + LFD f9, 7 * SIZE(Y) + + STFD f10, 0 * SIZE(X) + STFD f12, 1 * SIZE(X) + STFD f14, 2 * SIZE(X) + STFD f16, 3 * SIZE(X) + + STFD f11, 0 * SIZE(Y) + STFD f13, 1 * SIZE(Y) + STFD f15, 2 * SIZE(Y) + STFD f17, 3 * SIZE(Y) + + FMUL f10, C, f0 + FMUL f11, C, f3 + FMUL f12, C, f4 + FMUL f13, C, f5 + + FMUL f14, C, f6 + FMUL f15, C, f7 + FMUL f16, C, f8 + FMUL f17, C, f9 + + FMADD f10, S, f3, f10 + FNMSUB f11, S, f0, f11 + FMADD f12, S, f5, f12 + FNMSUB f13, S, f4, f13 + + FMADD f14, S, f7, f14 + FNMSUB f15, S, f6, f15 + FMADD f16, S, f9, f16 + FNMSUB f17, S, f8, f17 + + LFD f0, 8 * SIZE(X) + LFD f4, 9 * SIZE(X) + LFD f6, 10 * SIZE(X) + LFD f8, 11 * SIZE(X) + + LFD f3, 8 * SIZE(Y) + LFD f5, 9 * SIZE(Y) + LFD f7, 10 * SIZE(Y) + LFD f9, 11 * SIZE(Y) + + STFD f10, 4 * SIZE(X) + STFD f12, 5 * SIZE(X) + STFD f14, 6 * SIZE(X) + STFD f16, 7 * SIZE(X) + + STFD f11, 4 * SIZE(Y) + STFD f13, 5 * SIZE(Y) + STFD f15, 6 * SIZE(Y) + STFD f17, 7 * SIZE(Y) + + FMUL f10, C, f0 + FMUL f11, C, f3 + FMUL f12, C, f4 + FMUL f13, C, f5 + + FMUL f14, C, f6 + FMUL f15, C, f7 + FMUL f16, C, f8 + FMUL f17, C, f9 + + FMADD f10, S, f3, f10 + FNMSUB f11, S, f0, f11 + FMADD f12, S, f5, f12 + FNMSUB f13, S, f4, f13 + + FMADD f14, S, f7, f14 + FNMSUB f15, S, f6, f15 + FMADD f16, S, f9, f16 + FNMSUB f17, S, f8, f17 + + LFD f0, 12 * SIZE(X) + LFD f4, 13 * SIZE(X) + LFD f6, 14 * SIZE(X) + LFD f8, 15 * SIZE(X) + + LFD f3, 12 * SIZE(Y) + LFD f5, 13 * SIZE(Y) + LFD f7, 14 * SIZE(Y) + LFD f9, 15 * SIZE(Y) + + STFD f10, 8 * SIZE(X) + STFD f12, 9 * SIZE(X) + STFD f14, 10 * SIZE(X) + STFD f16, 11 * SIZE(X) + + STFD f11, 8 * SIZE(Y) + STFD f13, 9 * SIZE(Y) + STFD f15, 10 * SIZE(Y) + STFD f17, 11 * SIZE(Y) + + FMUL f10, C, f0 + FMUL f11, C, f3 + FMUL f12, C, f4 + FMUL f13, C, f5 + + FMUL f14, C, f6 + FMUL f15, C, f7 + FMUL f16, C, f8 + FMUL f17, C, f9 + + FMADD f10, S, f3, f10 + FNMSUB f11, S, f0, f11 + FMADD f12, S, f5, f12 + FNMSUB f13, S, f4, f13 + + FMADD f14, S, f7, f14 + FNMSUB f15, S, f6, f15 + FMADD f16, S, f9, 
f16 + FNMSUB f17, S, f8, f17 + + LFD f0, 16 * SIZE(X) + LFD f4, 17 * SIZE(X) + LFD f6, 18 * SIZE(X) + LFD f8, 19 * SIZE(X) + + LFD f3, 16 * SIZE(Y) + LFD f5, 17 * SIZE(Y) + LFD f7, 18 * SIZE(Y) + LFD f9, 19 * SIZE(Y) + + STFD f10, 12 * SIZE(X) + STFD f12, 13 * SIZE(X) + STFD f14, 14 * SIZE(X) + STFD f16, 15 * SIZE(X) + + STFD f11, 12 * SIZE(Y) + STFD f13, 13 * SIZE(Y) + STFD f15, 14 * SIZE(Y) + STFD f17, 15 * SIZE(Y) + +#ifndef POWER6 + dcbtst X, PREA +#endif + addi X, X, 16 * SIZE + addi Y, Y, 16 * SIZE + +#ifdef POWER6 + dcbtst X, PREA + dcbtst X, PREA +#endif + bdnz LL(10) + .align 4 + +LL(12): + FMUL f10, C, f0 + FMUL f11, C, f3 + FMUL f12, C, f4 + FMUL f13, C, f5 + + FMUL f14, C, f6 + FMUL f15, C, f7 + FMUL f16, C, f8 + FMUL f17, C, f9 + + FMADD f10, S, f3, f10 + FNMSUB f11, S, f0, f11 + FMADD f12, S, f5, f12 + FNMSUB f13, S, f4, f13 + + FMADD f14, S, f7, f14 + FNMSUB f15, S, f6, f15 + FMADD f16, S, f9, f16 + FNMSUB f17, S, f8, f17 + + STFD f10, 0 * SIZE(X) + STFD f12, 1 * SIZE(X) + STFD f14, 2 * SIZE(X) + STFD f16, 3 * SIZE(X) + + STFD f11, 0 * SIZE(Y) + STFD f13, 1 * SIZE(Y) + STFD f15, 2 * SIZE(Y) + STFD f17, 3 * SIZE(Y) + + LFD f0, 4 * SIZE(X) + LFD f4, 5 * SIZE(X) + LFD f6, 6 * SIZE(X) + LFD f8, 7 * SIZE(X) + + LFD f3, 4 * SIZE(Y) + LFD f5, 5 * SIZE(Y) + LFD f7, 6 * SIZE(Y) + LFD f9, 7 * SIZE(Y) + + FMUL f10, C, f0 + FMUL f11, C, f3 + FMUL f12, C, f4 + FMUL f13, C, f5 + + FMUL f14, C, f6 + FMUL f15, C, f7 + FMUL f16, C, f8 + FMUL f17, C, f9 + + FMADD f10, S, f3, f10 + FNMSUB f11, S, f0, f11 + FMADD f12, S, f5, f12 + FNMSUB f13, S, f4, f13 + + FMADD f14, S, f7, f14 + FNMSUB f15, S, f6, f15 + FMADD f16, S, f9, f16 + FNMSUB f17, S, f8, f17 + + STFD f10, 4 * SIZE(X) + STFD f12, 5 * SIZE(X) + STFD f14, 6 * SIZE(X) + STFD f16, 7 * SIZE(X) + + STFD f11, 4 * SIZE(Y) + STFD f13, 5 * SIZE(Y) + STFD f15, 6 * SIZE(Y) + STFD f17, 7 * SIZE(Y) + + LFD f0, 8 * SIZE(X) + LFD f4, 9 * SIZE(X) + LFD f6, 10 * SIZE(X) + LFD f8, 11 * SIZE(X) + + LFD f3, 8 * SIZE(Y) + LFD f5, 9 * SIZE(Y) + LFD f7, 10 * SIZE(Y) + LFD f9, 11 * SIZE(Y) + + FMUL f10, C, f0 + FMUL f11, C, f3 + FMUL f12, C, f4 + FMUL f13, C, f5 + + FMUL f14, C, f6 + FMUL f15, C, f7 + FMUL f16, C, f8 + FMUL f17, C, f9 + + FMADD f10, S, f3, f10 + FNMSUB f11, S, f0, f11 + FMADD f12, S, f5, f12 + FNMSUB f13, S, f4, f13 + + FMADD f14, S, f7, f14 + FNMSUB f15, S, f6, f15 + FMADD f16, S, f9, f16 + FNMSUB f17, S, f8, f17 + + STFD f10, 8 * SIZE(X) + STFD f12, 9 * SIZE(X) + STFD f14, 10 * SIZE(X) + STFD f16, 11 * SIZE(X) + + STFD f11, 8 * SIZE(Y) + STFD f13, 9 * SIZE(Y) + STFD f15, 10 * SIZE(Y) + STFD f17, 11 * SIZE(Y) + + LFD f0, 12 * SIZE(X) + LFD f4, 13 * SIZE(X) + LFD f6, 14 * SIZE(X) + LFD f8, 15 * SIZE(X) + + LFD f3, 12 * SIZE(Y) + LFD f5, 13 * SIZE(Y) + LFD f7, 14 * SIZE(Y) + LFD f9, 15 * SIZE(Y) + + FMUL f10, C, f0 + FMUL f11, C, f3 + FMUL f12, C, f4 + FMUL f13, C, f5 + + FMUL f14, C, f6 + FMUL f15, C, f7 + FMUL f16, C, f8 + FMUL f17, C, f9 + + FMADD f10, S, f3, f10 + FNMSUB f11, S, f0, f11 + FMADD f12, S, f5, f12 + FNMSUB f13, S, f4, f13 + + FMADD f14, S, f7, f14 + FNMSUB f15, S, f6, f15 + FMADD f16, S, f9, f16 + FNMSUB f17, S, f8, f17 + + STFD f10, 12 * SIZE(X) + STFD f12, 13 * SIZE(X) + STFD f14, 14 * SIZE(X) + STFD f16, 15 * SIZE(X) + + STFD f11, 12 * SIZE(Y) + STFD f13, 13 * SIZE(Y) + STFD f15, 14 * SIZE(Y) + STFD f17, 15 * SIZE(Y) + + addi X, X, 16 * SIZE + addi Y, Y, 16 * SIZE + .align 4 + +LL(50): + andi. 
r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f3, 0 * SIZE(X) + LFD f4, 0 * SIZE(Y) + LFD f5, 1 * SIZE(X) + LFD f6, 1 * SIZE(Y) + + FMUL f10, C, f3 + FMUL f11, C, f4 + FMUL f12, C, f5 + FMUL f13, C, f6 + + FMADD f10, S, f4, f10 + FNMSUB f11, S, f3, f11 + FMADD f12, S, f6, f12 + FNMSUB f13, S, f5, f13 + + STFD f10, 0 * SIZE(X) + STFD f11, 0 * SIZE(Y) + STFD f12, 1 * SIZE(X) + STFD f13, 1 * SIZE(Y) + + addi X, X, 2 * SIZE + addi Y, Y, 2 * SIZE + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): + sub X, X, INCXM1 + sub Y, Y, INCYM1 + + mr XX, X + mr YY, Y + + srawi. r0, N, 2 + mtspr CTR, r0 + beq- LL(150) + .align 4 + +LL(110): + LFDX f0, X, INCXM1 + LFDX f3, Y, INCYM1 + LFDUX f4, X, INCX + LFDUX f5, Y, INCY + LFDX f6, X, INCXM1 + LFDX f7, Y, INCYM1 + LFDUX f8, X, INCX + LFDUX f9, Y, INCY + + FMUL f10, C, f0 + FMUL f11, C, f3 + FMUL f12, C, f4 + FMUL f13, C, f5 + FMUL f14, C, f6 + FMUL f15, C, f7 + FMUL f16, C, f8 + FMUL f17, C, f9 + + FMADD f10, S, f3, f10 + FNMSUB f11, S, f0, f11 + FMADD f12, S, f5, f12 + FNMSUB f13, S, f4, f13 + FMADD f14, S, f7, f14 + FNMSUB f15, S, f6, f15 + FMADD f16, S, f9, f16 + FNMSUB f17, S, f8, f17 + + STFDX f10, XX, INCXM1 + STFDX f11, YY, INCYM1 + STFDUX f12, XX, INCX + STFDUX f13, YY, INCY + STFDX f14, XX, INCXM1 + STFDX f15, YY, INCYM1 + STFDUX f16, XX, INCX + STFDUX f17, YY, INCY + + LFDX f0, X, INCXM1 + LFDX f3, Y, INCYM1 + LFDUX f4, X, INCX + LFDUX f5, Y, INCY + LFDX f6, X, INCXM1 + LFDX f7, Y, INCYM1 + LFDUX f8, X, INCX + LFDUX f9, Y, INCY + + FMUL f10, C, f0 + FMUL f11, C, f3 + FMUL f12, C, f4 + FMUL f13, C, f5 + FMUL f14, C, f6 + FMUL f15, C, f7 + FMUL f16, C, f8 + FMUL f17, C, f9 + + FMADD f10, S, f3, f10 + FNMSUB f11, S, f0, f11 + FMADD f12, S, f5, f12 + FNMSUB f13, S, f4, f13 + FMADD f14, S, f7, f14 + FNMSUB f15, S, f6, f15 + FMADD f16, S, f9, f16 + FNMSUB f17, S, f8, f17 + + STFDX f10, XX, INCXM1 + STFDX f11, YY, INCYM1 + STFDUX f12, XX, INCX + STFDUX f13, YY, INCY + STFDX f14, XX, INCXM1 + STFDX f15, YY, INCYM1 + STFDUX f16, XX, INCX + STFDUX f17, YY, INCY + + bdnz LL(110) + .align 4 + +LL(150): + andi. r0, N, 3 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDX f0, X, INCXM1 + LFDX f3, Y, INCYM1 + LFDUX f4, X, INCX + LFDUX f5, Y, INCY + + FMUL f10, C, f0 + FMUL f11, C, f3 + FMUL f12, C, f4 + FMUL f13, C, f5 + + FMADD f10, S, f3, f10 + FNMSUB f11, S, f0, f11 + FMADD f12, S, f5, f12 + FNMSUB f13, S, f4, f13 + + STFDX f10, XX, INCXM1 + STFDX f11, YY, INCYM1 + STFDUX f12, XX, INCX + STFDUX f13, YY, INCY + + bdnz LL(160) + .align 4 + +LL(999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE + +#endif diff --git a/kernel/power/zrot_ppc440.S b/kernel/power/zrot_ppc440.S new file mode 100644 index 0000000000..fe1a99dc59 --- /dev/null +++ b/kernel/power/zrot_ppc440.S @@ -0,0 +1,301 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r4 +#define INCX r5 +#define Y r6 +#define INCY r7 +#define PRE r8 +#define XX r9 +#define YY r10 + +#define INCXM1 r11 +#define INCYM1 r12 + +#define C f1 +#define S f2 + +#define STACKSIZE 32 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + slwi INCX, INCX, ZBASE_SHIFT + slwi INCY, INCY, ZBASE_SHIFT + + subi INCXM1, INCX, SIZE + subi INCYM1, INCY, SIZE + + li PRE, 2 * 16 * SIZE + + cmpwi cr0, N, 0 + ble- LL(999) + + sub X, X, INCXM1 + sub Y, Y, INCYM1 + + mr XX, X + mr YY, Y + + srawi. 
r0, N, 2 + mtspr CTR, r0 + beq- LL(150) + + LFDX f0, X, INCXM1 + LFDX f3, Y, INCYM1 + LFDUX f4, X, INCX + + FMUL f10, C, f0 + LFDUX f5, Y, INCY + FMUL f11, C, f3 + LFDX f6, X, INCXM1 + FMUL f12, C, f4 + LFDX f7, Y, INCYM1 + FMUL f13, C, f5 + LFDUX f8, X, INCX + + FMADD f10, S, f3, f10 + LFDUX f9, Y, INCY + FNMSUB f11, S, f0, f11 + LFDX f0, X, INCXM1 + FMADD f12, S, f5, f12 + LFDX f3, Y, INCYM1 + FNMSUB f13, S, f4, f13 + LFDUX f4, X, INCX + bdz LL(111) + .align 4 + +LL(110): + FMUL f14, C, f6 + LFDUX f5, Y, INCY + FMUL f15, C, f7 + STFDX f10, XX, INCXM1 + FMUL f16, C, f8 + STFDX f11, YY, INCYM1 + FMUL f17, C, f9 + STFDUX f12, XX, INCX + +#ifdef PPCG4 + dcbtst X, PRE +#endif + + FMADD f14, S, f7, f14 + STFDUX f13, YY, INCY + FNMSUB f15, S, f6, f15 + LFDX f6, X, INCXM1 + FMADD f16, S, f9, f16 + LFDX f7, Y, INCYM1 + FNMSUB f17, S, f8, f17 + LFDUX f8, X, INCX + + FMUL f10, C, f0 + LFDUX f9, Y, INCY + FMUL f11, C, f3 + STFDX f14, XX, INCXM1 + FMUL f12, C, f4 + STFDX f15, YY, INCYM1 + FMUL f13, C, f5 + STFDUX f16, XX, INCX + +#ifdef PPCG4 + dcbtst Y, PRE +#endif + + FMADD f10, S, f3, f10 + STFDUX f17, YY, INCY + FNMSUB f11, S, f0, f11 + LFDX f0, X, INCXM1 + FMADD f12, S, f5, f12 + LFDX f3, Y, INCYM1 + FNMSUB f13, S, f4, f13 + LFDUX f4, X, INCX + + FMUL f14, C, f6 + LFDUX f5, Y, INCY + FMUL f15, C, f7 + STFDX f10, XX, INCXM1 + FMUL f16, C, f8 + STFDX f11, YY, INCYM1 + FMUL f17, C, f9 + STFDUX f12, XX, INCX + +#if defined(PPCG4) && defined(DOUBLE) + dcbt X, PRE +#endif + + FMADD f14, S, f7, f14 + STFDUX f13, YY, INCY + FNMSUB f15, S, f6, f15 + LFDX f6, X, INCXM1 + FMADD f16, S, f9, f16 + LFDX f7, Y, INCYM1 + FNMSUB f17, S, f8, f17 + LFDUX f8, X, INCX + + FMUL f10, C, f0 + STFDX f14, XX, INCXM1 + FMUL f11, C, f3 + STFDX f15, YY, INCYM1 + FMUL f12, C, f4 + STFDUX f16, XX, INCX + FMUL f13, C, f5 + STFDUX f17, YY, INCY + +#if defined(PPCG4) && defined(DOUBLE) + dcbtst Y, PRE +#endif + + FMADD f10, S, f3, f10 + LFDUX f9, Y, INCY + FNMSUB f11, S, f0, f11 + LFDX f0, X, INCXM1 + FMADD f12, S, f5, f12 + LFDX f3, Y, INCYM1 + FNMSUB f13, S, f4, f13 + LFDUX f4, X, INCX + bdnz LL(110) + .align 4 + + +LL(111): + FMUL f14, C, f6 + LFDUX f5, Y, INCY + FMUL f15, C, f7 + STFDX f10, XX, INCXM1 + FMUL f16, C, f8 + STFDX f11, YY, INCYM1 + FMUL f17, C, f9 + STFDUX f12, XX, INCX + + FMADD f14, S, f7, f14 + STFDUX f13, YY, INCY + FNMSUB f15, S, f6, f15 + LFDX f6, X, INCXM1 + FMADD f16, S, f9, f16 + LFDX f7, Y, INCYM1 + FNMSUB f17, S, f8, f17 + LFDUX f8, X, INCX + + FMUL f10, C, f0 + LFDUX f9, Y, INCY + FMUL f11, C, f3 + STFDX f14, XX, INCXM1 + FMUL f12, C, f4 + STFDX f15, YY, INCYM1 + FMUL f13, C, f5 + STFDUX f16, XX, INCX + + FMADD f10, S, f3, f10 + STFDUX f17, YY, INCY + FNMSUB f11, S, f0, f11 + FMADD f12, S, f5, f12 + FNMSUB f13, S, f4, f13 + + FMUL f14, C, f6 + STFDX f10, XX, INCXM1 + FMUL f15, C, f7 + STFDX f11, YY, INCYM1 + FMUL f16, C, f8 + STFDUX f12, XX, INCX + FMUL f17, C, f9 + STFDUX f13, YY, INCY + + FMADD f14, S, f7, f14 + FNMSUB f15, S, f6, f15 + FMADD f16, S, f9, f16 + FNMSUB f17, S, f8, f17 + + STFDX f14, XX, INCXM1 + STFDX f15, YY, INCYM1 + STFDUX f16, XX, INCX + STFDUX f17, YY, INCY + .align 4 + + +LL(150): + andi. 
r0, N, 3 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDX f0, X, INCXM1 + LFDX f3, Y, INCYM1 + LFDUX f4, X, INCX + LFDUX f5, Y, INCY + + FMUL f10, C, f0 + FMUL f11, C, f3 + FMUL f12, C, f4 + FMUL f13, C, f5 + + FMADD f10, S, f3, f10 + FNMSUB f11, S, f0, f11 + FMADD f12, S, f5, f12 + FNMSUB f13, S, f4, f13 + + STFDX f10, XX, INCXM1 + STFDX f11, YY, INCYM1 + STFDUX f12, XX, INCX + STFDUX f13, YY, INCY + + bdnz LL(160) + .align 4 + +LL(999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/zscal.S b/kernel/power/zscal.S new file mode 100644 index 0000000000..7ffa80f199 --- /dev/null +++ b/kernel/power/zscal.S @@ -0,0 +1,385 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define XX r4 +#define PREA r5 + +#ifdef linux +#ifndef __64BIT__ +#define X r6 +#define INCX r7 +#else +#define X r8 +#define INCX r9 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define X r10 +#define INCX r8 +#else +#define X r8 +#define INCX r9 +#endif +#endif + +#define FZERO f0 +#define ALPHA_R f1 +#define ALPHA_I f2 + + PROLOGUE + PROFCODE + + addi SP, SP, -8 + li r0, 0 + + stw r0, 0(SP) + lfs FZERO, 0(SP) + addi SP, SP, 8 + +#if (defined(_AIX) || defined(__APPLE__)) && !defined(__64BIT__) && defined(DOUBLE) + lwz INCX, 56(SP) +#endif + + slwi INCX, INCX, ZBASE_SHIFT + + li PREA, L1_PREFETCHSIZE + + cmpwi cr0, N, 0 + blelr- cr0 + + fcmpu cr0, FZERO, ALPHA_R + bne- cr0, LL(A1I1) + + fcmpu cr0, FZERO, ALPHA_I + bne- cr0, LL(A1I1) + + cmpwi cr0, INCX, 2 * SIZE + bne- cr0, LL(A0IN) + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- cr0, LL(A0I1_Remain) + .align 4 + +LL(A0I1_kernel): + STFD FZERO, 0 * SIZE(X) + STFD FZERO, 1 * SIZE(X) + STFD FZERO, 2 * SIZE(X) + STFD FZERO, 3 * SIZE(X) + STFD FZERO, 4 * SIZE(X) + STFD FZERO, 5 * SIZE(X) + STFD FZERO, 6 * SIZE(X) + STFD FZERO, 7 * SIZE(X) + + STFD FZERO, 8 * SIZE(X) + STFD FZERO, 9 * SIZE(X) + STFD FZERO, 10 * SIZE(X) + STFD FZERO, 11 * SIZE(X) + STFD FZERO, 12 * SIZE(X) + STFD FZERO, 13 * SIZE(X) + STFD FZERO, 14 * SIZE(X) + STFD FZERO, 15 * SIZE(X) + + addi X, X, 16 * SIZE + bdnz LL(A0I1_kernel) + .align 4 + +LL(A0I1_Remain): + andi. r0, N, 7 + mtspr CTR, r0 + beqlr+ + .align 4 + +LL(A0I1_RemainKernel): + STFD FZERO, 0 * SIZE(X) + STFD FZERO, 1 * SIZE(X) + addi X, X, 2 * SIZE + bdnz LL(A0I1_RemainKernel) + blr + .align 4 + +LL(A0IN): + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(A0IN_Remain) + .align 4 + +LL(A0IN_Kernel): + dcbtst X, PREA + STFD FZERO, 0 * SIZE(X) + STFD FZERO, 1 * SIZE(X) + add X, X, INCX + STFD FZERO, 0 * SIZE(X) + STFD FZERO, 1 * SIZE(X) + add X, X, INCX + STFD FZERO, 0 * SIZE(X) + STFD FZERO, 1 * SIZE(X) + add X, X, INCX + STFD FZERO, 0 * SIZE(X) + STFD FZERO, 1 * SIZE(X) + add X, X, INCX + STFD FZERO, 0 * SIZE(X) + STFD FZERO, 1 * SIZE(X) + add X, X, INCX + STFD FZERO, 0 * SIZE(X) + STFD FZERO, 1 * SIZE(X) + add X, X, INCX + STFD FZERO, 0 * SIZE(X) + STFD FZERO, 1 * SIZE(X) + add X, X, INCX + STFD FZERO, 0 * SIZE(X) + STFD FZERO, 1 * SIZE(X) + add X, X, INCX + bdnz LL(A0IN_Kernel) + .align 4 + +LL(A0IN_Remain): + andi. r0, N, 7 + mtspr CTR, r0 + beqlr+ + .align 4 + +LL(A0IN_RemainKernel): + STFD FZERO, 0 * SIZE(X) + STFD FZERO, 1 * SIZE(X) + add X, X, INCX + bdnz LL(A0IN_RemainKernel) + blr + .align 4 + +LL(A1I1): + cmpwi cr0, INCX, 2 * SIZE + bne- LL(A1IN) + + mr XX, X + srawi. 
r0, N, 3 + mtspr CTR, r0 + beq+ LL(A1I1_Remain) + .align 4 + +LL(A1I1_kernel): + LFD f3, 0 * SIZE(X) + LFD f4, 1 * SIZE(X) + LFD f5, 2 * SIZE(X) + LFD f6, 3 * SIZE(X) + LFD f7, 4 * SIZE(X) + LFD f8, 5 * SIZE(X) + LFD f9, 6 * SIZE(X) + LFD f10, 7 * SIZE(X) + + FMUL f0, ALPHA_I, f4 + FMUL f4, ALPHA_R, f4 + FMUL f11, ALPHA_I, f6 + FMUL f6, ALPHA_R, f6 + + FMUL f12, ALPHA_I, f8 + FMUL f8, ALPHA_R, f8 + FMUL f13, ALPHA_I, f10 + FMUL f10, ALPHA_R, f10 + + FMADD f4, ALPHA_I, f3, f4 + FMSUB f3, ALPHA_R, f3, f0 + FMADD f6, ALPHA_I, f5, f6 + FMSUB f5, ALPHA_R, f5, f11 + + FMADD f8, ALPHA_I, f7, f8 + FMSUB f7, ALPHA_R, f7, f12 + FMADD f10, ALPHA_I, f9, f10 + FMSUB f9, ALPHA_R, f9, f13 + + STFD f3, 0 * SIZE(X) + STFD f4, 1 * SIZE(X) + STFD f5, 2 * SIZE(X) + STFD f6, 3 * SIZE(X) + STFD f7, 4 * SIZE(X) + STFD f8, 5 * SIZE(X) + STFD f9, 6 * SIZE(X) + STFD f10, 7 * SIZE(X) + + LFD f3, 8 * SIZE(X) + LFD f4, 9 * SIZE(X) + LFD f5, 10 * SIZE(X) + LFD f6, 11 * SIZE(X) + LFD f7, 12 * SIZE(X) + LFD f8, 13 * SIZE(X) + LFD f9, 14 * SIZE(X) + LFD f10,15 * SIZE(X) + + FMUL f0, ALPHA_I, f4 + FMUL f4, ALPHA_R, f4 + FMUL f11, ALPHA_I, f6 + FMUL f6, ALPHA_R, f6 + + FMUL f12, ALPHA_I, f8 + FMUL f8, ALPHA_R, f8 + FMUL f13, ALPHA_I, f10 + FMUL f10, ALPHA_R, f10 + + FMADD f4, ALPHA_I, f3, f4 + FMSUB f3, ALPHA_R, f3, f0 + FMADD f6, ALPHA_I, f5, f6 + FMSUB f5, ALPHA_R, f5, f11 + + FMADD f8, ALPHA_I, f7, f8 + FMSUB f7, ALPHA_R, f7, f12 + FMADD f10, ALPHA_I, f9, f10 + FMSUB f9, ALPHA_R, f9, f13 + + STFD f3, 8 * SIZE(X) + STFD f4, 9 * SIZE(X) + STFD f5, 10 * SIZE(X) + STFD f6, 11 * SIZE(X) + STFD f7, 12 * SIZE(X) + STFD f8, 13 * SIZE(X) + STFD f9, 14 * SIZE(X) + STFD f10,15 * SIZE(X) + + addi X, X, 16 * SIZE + dcbtst X, PREA + bdnz LL(A1I1_kernel) + .align 4 + +LL(A1I1_Remain): + andi. r0, N, 7 + mtspr CTR, r0 + beqlr+ + .align 4 + +LL(A1I1_RemainKernel): + LFD f3, 0 * SIZE(X) + LFD f4, 1 * SIZE(X) + + FMUL f5, ALPHA_I, f4 + FMUL f4, ALPHA_R, f4 + FMADD f4, ALPHA_I, f3, f4 + FMSUB f3, ALPHA_R, f3, f5 + + STFD f3, 0 * SIZE(X) + STFD f4, 1 * SIZE(X) + addi X, X, 2 * SIZE + bdnz LL(A1I1_RemainKernel) + blr + .align 4 + +LL(A1IN): + mr XX, X + + srawi. r0, N, 2 + mtspr CTR, r0 + beq- LL(A1IN_Remain) + .align 4 + +LL(A1IN_Kernel): + LFD f3, 0 * SIZE(XX) + LFD f4, 1 * SIZE(XX) + add XX, XX, INCX + LFD f5, 0 * SIZE(XX) + LFD f6, 1 * SIZE(XX) + add XX, XX, INCX + LFD f7, 0 * SIZE(XX) + LFD f8, 1 * SIZE(XX) + add XX, XX, INCX + LFD f9, 0 * SIZE(XX) + LFD f10, 1 * SIZE(XX) + add XX, XX, INCX + + FMUL f0, ALPHA_I, f4 + FMUL f4, ALPHA_R, f4 + FMUL f11, ALPHA_I, f6 + FMUL f6, ALPHA_R, f6 + + FMUL f12, ALPHA_I, f8 + FMUL f8, ALPHA_R, f8 + FMUL f13, ALPHA_I, f10 + FMUL f10, ALPHA_R, f10 + + FMADD f4, ALPHA_I, f3, f4 + FMSUB f3, ALPHA_R, f3, f0 + FMADD f6, ALPHA_I, f5, f6 + FMSUB f5, ALPHA_R, f5, f11 + + FMADD f8, ALPHA_I, f7, f8 + FMSUB f7, ALPHA_R, f7, f12 + FMADD f10, ALPHA_I, f9, f10 + FMSUB f9, ALPHA_R, f9, f13 + + STFD f3, 0 * SIZE(X) + STFD f4, 1 * SIZE(X) + add X, X, INCX + STFD f5, 0 * SIZE(X) + STFD f6, 1 * SIZE(X) + add X, X, INCX + STFD f7, 0 * SIZE(X) + STFD f8, 1 * SIZE(X) + add X, X, INCX + STFD f9, 0 * SIZE(X) + STFD f10, 1 * SIZE(X) + add X, X, INCX + dcbtst X, PREA + bdnz LL(A1IN_Kernel) + .align 4 + +LL(A1IN_Remain): + andi. 
r0, N, 3 + mtspr CTR, r0 + beqlr+ + .align 4 + +LL(A1IN_RemainKernel): + LFD f3, 0 * SIZE(XX) + LFD f4, 1 * SIZE(XX) + add XX, XX, INCX + + FMUL f5, ALPHA_I, f4 + FMUL f4, ALPHA_R, f4 + FMADD f4, ALPHA_I, f3, f4 + FMSUB f3, ALPHA_R, f3, f5 + + STFD f3, 0 * SIZE(X) + STFD f4, 1 * SIZE(X) + add X, X, INCX + bdnz LL(A1IN_RemainKernel) + blr + + EPILOGUE diff --git a/kernel/power/zscal_hummer.S b/kernel/power/zscal_hummer.S new file mode 100644 index 0000000000..6c559f3f21 --- /dev/null +++ b/kernel/power/zscal_hummer.S @@ -0,0 +1,871 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r6 +#define INCX r7 + +#define INCX2 r4 +#define XX r5 +#define Y r8 +#define YY r9 + +#define ALPHA f1 +#define ALPHA_I f2 + +#define A1 f0 +#define A2 f16 +#define A3 f17 +#define A4 f3 +#define A5 f4 +#define A6 f5 +#define A7 f6 +#define A8 f7 + +#define B1 f8 +#define B2 f9 +#define B3 f10 +#define B4 f11 +#define B5 f12 +#define B6 f13 +#define B7 f14 +#define B8 f15 + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + stfpdux f16, SP, r10 + stfpdux f17, SP, r10 + + li r10, 0 + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + stwu r10, -4(SP) + + lfpdx A1, SP, r10 # Zero clear + fsmfp ALPHA, ALPHA_I + + slwi INCX, INCX, BASE_SHIFT + add INCX2, INCX, INCX + + cmpwi cr0, N, 0 + ble LL(999) + + cmpwi cr0, INCX, SIZE + bne LL(100) + + fcmpu cr7, ALPHA, A1 + bne cr7, LL(50) + + fscmp cr7, ALPHA, A1 + bne cr7, LL(50) + + andi. r0, X, 2 * SIZE - 1 + bne LL(20) + + sub X, X, INCX2 + + srawi. 
r0, N, 2 + mtspr CTR, r0 + beq- LL(15) + .align 4 + +LL(12): + STFPDUX A1, X, INCX2 + STFPDUX A1, X, INCX2 + STFPDUX A1, X, INCX2 + STFPDUX A1, X, INCX2 + bdnz LL(12) + .align 4 + +LL(15): + andi. r0, N, 3 + beq LL(999) + andi. r0, N, 2 + beq LL(17) + + STFPDUX A1, X, INCX2 + STFPDUX A1, X, INCX2 + .align 4 + +LL(17): + andi. r0, N, 1 + beq LL(999) + + STFPDUX A1, X, INCX2 + b LL(999) + .align 4 + +LL(20): + sub X, X, INCX2 + + STFDX A1, X, INCX2 + addi X, X, SIZE + addi N, N, -1 + cmpwi cr0, N, 0 + ble LL(29) + + srawi. r0, N, 2 + mtspr CTR, r0 + beq- LL(25) + .align 4 + +LL(22): + STFPDUX A1, X, INCX2 + STFPDUX A1, X, INCX2 + STFPDUX A1, X, INCX2 + STFPDUX A1, X, INCX2 + bdnz LL(22) + .align 4 + +LL(25): + andi. r0, N, 3 + beq LL(29) + andi. r0, N, 2 + beq LL(27) + + STFPDUX A1, X, INCX2 + STFPDUX A1, X, INCX2 + .align 4 + +LL(27): + andi. r0, N, 1 + beq LL(29) + + STFPDUX A1, X, INCX2 + .align 4 + +LL(29): + STFDX A1, X, INCX2 + b LL(999) + .align 4 + +LL(50): + sub Y, X, INCX2 + sub X, X, INCX2 + + andi. r0, X, 2 * SIZE - 1 + bne LL(60) + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(55) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + LFPDUX A5, X, INCX2 + fxpmul B1, ALPHA, A1 + LFPDUX A6, X, INCX2 + fxpmul B2, ALPHA, A2 + LFPDUX A7, X, INCX2 + fxpmul B3, ALPHA, A3 + LFPDUX A8, X, INCX2 + fxpmul B4, ALPHA, A4 + fxpmul B5, ALPHA, A5 + + fxcxnpma B1, ALPHA, A1, B1 + fxcxnpma B2, ALPHA, A2, B2 + bdz LL(53) + .align 4 + +LL(52): + fxcxnpma B3, ALPHA, A3, B3 + LFPDUX A1, X, INCX2 + fxpmul B6, ALPHA, A6 + STFPDUX B1, Y, INCX2 + + fxcxnpma B4, ALPHA, A4, B4 + LFPDUX A2, X, INCX2 + fxpmul B7, ALPHA, A7 + STFPDUX B2, Y, INCX2 + + fxcxnpma B5, ALPHA, A5, B5 + LFPDUX A3, X, INCX2 + fxpmul B8, ALPHA, A8 + STFPDUX B3, Y, INCX2 + + fxcxnpma B6, ALPHA, A6, B6 + LFPDUX A4, X, INCX2 + fxpmul B1, ALPHA, A1 + STFPDUX B4, Y, INCX2 + + fxcxnpma B7, ALPHA, A7, B7 + LFPDUX A5, X, INCX2 + fxpmul B2, ALPHA, A2 + STFPDUX B5, Y, INCX2 + + fxcxnpma B8, ALPHA, A8, B8 + LFPDUX A6, X, INCX2 + fxpmul B3, ALPHA, A3 + STFPDUX B6, Y, INCX2 + + fxcxnpma B1, ALPHA, A1, B1 + LFPDUX A7, X, INCX2 + fxpmul B4, ALPHA, A4 + STFPDUX B7, Y, INCX2 + + fxcxnpma B2, ALPHA, A2, B2 + LFPDUX A8, X, INCX2 + fxpmul B5, ALPHA, A5 + STFPDUX B8, Y, INCX2 + bdnz LL(52) + .align 4 + +LL(53): + fxcxnpma B3, ALPHA, A3, B3 + fxpmul B6, ALPHA, A6 + STFPDUX B1, Y, INCX2 + + fxcxnpma B4, ALPHA, A4, B4 + fxpmul B7, ALPHA, A7 + STFPDUX B2, Y, INCX2 + + fxcxnpma B5, ALPHA, A5, B5 + fxpmul B8, ALPHA, A8 + STFPDUX B3, Y, INCX2 + + fxcxnpma B6, ALPHA, A6, B6 + STFPDUX B4, Y, INCX2 + fxcxnpma B7, ALPHA, A7, B7 + STFPDUX B5, Y, INCX2 + fxcxnpma B8, ALPHA, A8, B8 + STFPDUX B6, Y, INCX2 + STFPDUX B7, Y, INCX2 + STFPDUX B8, Y, INCX2 + .align 4 + +LL(55): + andi. r0, N, 7 + beq LL(999) + + andi. r0, N, 4 + beq LL(56) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + fxpmul B1, ALPHA, A1 + fxpmul B2, ALPHA, A2 + fxpmul B3, ALPHA, A3 + fxpmul B4, ALPHA, A4 + + fxcxnpma B1, ALPHA, A1, B1 + fxcxnpma B2, ALPHA, A2, B2 + fxcxnpma B3, ALPHA, A3, B3 + fxcxnpma B4, ALPHA, A4, B4 + + STFPDUX B1, Y, INCX2 + STFPDUX B2, Y, INCX2 + STFPDUX B3, Y, INCX2 + STFPDUX B4, Y, INCX2 + .align 4 + +LL(56): + andi. r0, N, 2 + beq LL(57) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + + fxpmul B1, ALPHA, A1 + fxpmul B2, ALPHA, A2 + + fxcxnpma B1, ALPHA, A1, B1 + fxcxnpma B2, ALPHA, A2, B2 + + STFPDUX B1, Y, INCX2 + STFPDUX B2, Y, INCX2 + .align 4 + +LL(57): + andi. 
r0, N, 1 + beq LL(999) + + LFPDUX A1, X, INCX2 + + fxpmul B1, ALPHA, A1 + fxcxnpma B1, ALPHA, A1, B1 + + STFPDUX B1, Y, INCX2 + b LL(999) + .align 4 + +LL(60): + addi XX, X, SIZE + addi YY, Y, SIZE + + srawi. r0, N, 2 + mtspr CTR, r0 + beq- LL(65) + + LFDUX A1, X, INCX2 + LFDUX A2, XX, INCX2 + LFDUX A3, X, INCX2 + LFDUX A4, XX, INCX2 + + LFDUX A5, X, INCX2 + fmul B1, ALPHA, A1 + LFDUX A6, XX, INCX2 + fmul B2, ALPHA_I, A1 + LFDUX A7, X, INCX2 + fmul B3, ALPHA, A3 + LFDUX A8, XX, INCX2 + fmul B4, ALPHA_I, A3 + + fmul B5, ALPHA, A5 + fnmsub B1, ALPHA_I, A2, B1 + fmadd B2, ALPHA , A2, B2 + bdz LL(63) + .align 4 + +LL(62): + fnmsub B3, ALPHA_I, A4, B3 + LFDUX A1, X, INCX2 + fmul B6, ALPHA_I, A5 + STFDUX B1, Y, INCX2 + + fmadd B4, ALPHA , A4, B4 + LFDUX A2, XX, INCX2 + fmul B7, ALPHA, A7 + STFDUX B2, YY, INCX2 + + fnmsub B5, ALPHA_I, A6, B5 + LFDUX A3, X, INCX2 + fmul B8, ALPHA_I, A7 + STFDUX B3, Y, INCX2 + + fmadd B6, ALPHA , A6, B6 + LFDUX A4, XX, INCX2 + fmul B1, ALPHA, A1 + STFDUX B4, YY, INCX2 + + fnmsub B7, ALPHA_I, A8, B7 + LFDUX A5, X, INCX2 + fmul B2, ALPHA_I, A1 + STFDUX B5, Y, INCX2 + + fmadd B8, ALPHA , A8, B8 + LFDUX A6, XX, INCX2 + fmul B3, ALPHA, A3 + STFDUX B6, YY, INCX2 + + fnmsub B1, ALPHA_I, A2, B1 + LFDUX A7, X, INCX2 + fmul B4, ALPHA_I, A3 + STFDUX B7, Y, INCX2 + + fmadd B2, ALPHA , A2, B2 + LFDUX A8, XX, INCX2 + fmul B5, ALPHA, A5 + STFDUX B8, YY, INCX2 + bdnz LL(62) + .align 4 + +LL(63): + fnmsub B3, ALPHA_I, A4, B3 + fmul B6, ALPHA_I, A5 + STFDUX B1, Y, INCX2 + + fmadd B4, ALPHA , A4, B4 + fmul B7, ALPHA, A7 + STFDUX B2, YY, INCX2 + + fnmsub B5, ALPHA_I, A6, B5 + fmul B8, ALPHA_I, A7 + STFDUX B3, Y, INCX2 + + fmadd B6, ALPHA , A6, B6 + STFDUX B4, YY, INCX2 + fnmsub B7, ALPHA_I, A8, B7 + STFDUX B5, Y, INCX2 + fmadd B8, ALPHA , A8, B8 + STFDUX B6, YY, INCX2 + STFDUX B7, Y, INCX2 + STFDUX B8, YY, INCX2 + .align 4 + +LL(65): + andi. r0, N, 3 + beq LL(999) + andi. r0, N, 2 + beq LL(67) + + LFDUX A1, X, INCX2 + LFDUX A2, XX, INCX2 + LFDUX A3, X, INCX2 + LFDUX A4, XX, INCX2 + + fmul B1, ALPHA, A1 + fmul B2, ALPHA, A2 + fmul B3, ALPHA, A3 + fmul B4, ALPHA, A4 + + fnmsub B1, ALPHA_I, A2, B1 + fmadd B2, ALPHA_I, A1, B2 + fnmsub B3, ALPHA_I, A4, B3 + fmadd B4, ALPHA_I, A3, B4 + + STFDUX B1, Y, INCX2 + STFDUX B2, YY, INCX2 + STFDUX B3, Y, INCX2 + STFDUX B4, YY, INCX2 + .align 4 + +LL(67): + andi. r0, N, 1 + beq LL(999) + + LFDUX A1, X, INCX2 + LFDUX A2, XX, INCX2 + + fmul B1, ALPHA, A1 + fmul B2, ALPHA, A2 + fnmsub B1, ALPHA_I, A2, B1 + fmadd B2, ALPHA_I, A1, B2 + + STFDUX B1, Y, INCX2 + STFDUX B2, YY, INCX2 + b LL(999) + .align 4 + + +LL(100): + fcmpu cr7, ALPHA, A1 + bne cr7, LL(150) + + fscmp cr7, ALPHA, A1 + bne cr7, LL(150) + + andi. r0, X, 2 * SIZE - 1 + bne LL(120) + + sub X, X, INCX2 + + srawi. r0, N, 2 + mtspr CTR, r0 + beq- LL(115) + .align 4 + +LL(112): + STFPDUX A1, X, INCX2 + STFPDUX A1, X, INCX2 + STFPDUX A1, X, INCX2 + STFPDUX A1, X, INCX2 + bdnz LL(112) + .align 4 + +LL(115): + andi. r0, N, 3 + beq LL(999) + andi. r0, N, 2 + beq LL(117) + + STFPDUX A1, X, INCX2 + STFPDUX A1, X, INCX2 + .align 4 + +LL(117): + andi. r0, N, 1 + beq LL(999) + + STFPDUX A1, X, INCX2 + b LL(999) + .align 4 + +LL(120): + subi INCX2, INCX2, SIZE + li INCX, SIZE + + sub X, X, INCX2 + + srawi. r0, N, 2 + mtspr CTR, r0 + beq- LL(125) + .align 4 + +LL(122): + STFDUX A1, X, INCX2 + STFDUX A1, X, INCX + STFDUX A1, X, INCX2 + STFDUX A1, X, INCX + STFDUX A1, X, INCX2 + STFDUX A1, X, INCX + STFDUX A1, X, INCX2 + STFDUX A1, X, INCX + bdnz LL(122) + .align 4 + +LL(125): + andi. 
r0, N, 3 + beq LL(999) + andi. r0, N, 2 + beq LL(127) + + STFDUX A1, X, INCX2 + STFDUX A1, X, INCX + STFDUX A1, X, INCX2 + STFDUX A1, X, INCX + .align 4 + +LL(127): + andi. r0, N, 1 + beq LL(999) + + STFDUX A1, X, INCX2 + STFDUX A1, X, INCX + b LL(999) + .align 4 + +LL(150): + sub Y, X, INCX2 + sub X, X, INCX2 + + andi. r0, X, 2 * SIZE - 1 + bne LL(160) + + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(155) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + LFPDUX A5, X, INCX2 + fxpmul B1, ALPHA, A1 + LFPDUX A6, X, INCX2 + fxpmul B2, ALPHA, A2 + LFPDUX A7, X, INCX2 + fxpmul B3, ALPHA, A3 + LFPDUX A8, X, INCX2 + fxpmul B4, ALPHA, A4 + fxpmul B5, ALPHA, A5 + + fxcxnpma B1, ALPHA, A1, B1 + fxcxnpma B2, ALPHA, A2, B2 + bdz LL(153) + .align 4 + +LL(152): + fxcxnpma B3, ALPHA, A3, B3 + LFPDUX A1, X, INCX2 + fxpmul B6, ALPHA, A6 + STFPDUX B1, Y, INCX2 + + fxcxnpma B4, ALPHA, A4, B4 + LFPDUX A2, X, INCX2 + fxpmul B7, ALPHA, A7 + STFPDUX B2, Y, INCX2 + + fxcxnpma B5, ALPHA, A5, B5 + LFPDUX A3, X, INCX2 + fxpmul B8, ALPHA, A8 + STFPDUX B3, Y, INCX2 + + fxcxnpma B6, ALPHA, A6, B6 + LFPDUX A4, X, INCX2 + fxpmul B1, ALPHA, A1 + STFPDUX B4, Y, INCX2 + + fxcxnpma B7, ALPHA, A7, B7 + LFPDUX A5, X, INCX2 + fxpmul B2, ALPHA, A2 + STFPDUX B5, Y, INCX2 + + fxcxnpma B8, ALPHA, A8, B8 + LFPDUX A6, X, INCX2 + fxpmul B3, ALPHA, A3 + STFPDUX B6, Y, INCX2 + + fxcxnpma B1, ALPHA, A1, B1 + LFPDUX A7, X, INCX2 + fxpmul B4, ALPHA, A4 + STFPDUX B7, Y, INCX2 + + fxcxnpma B2, ALPHA, A2, B2 + LFPDUX A8, X, INCX2 + fxpmul B5, ALPHA, A5 + STFPDUX B8, Y, INCX2 + bdnz LL(152) + .align 4 + +LL(153): + fxcxnpma B3, ALPHA, A3, B3 + fxpmul B6, ALPHA, A6 + STFPDUX B1, Y, INCX2 + + fxcxnpma B4, ALPHA, A4, B4 + fxpmul B7, ALPHA, A7 + STFPDUX B2, Y, INCX2 + + fxcxnpma B5, ALPHA, A5, B5 + fxpmul B8, ALPHA, A8 + STFPDUX B3, Y, INCX2 + + fxcxnpma B6, ALPHA, A6, B6 + STFPDUX B4, Y, INCX2 + fxcxnpma B7, ALPHA, A7, B7 + STFPDUX B5, Y, INCX2 + fxcxnpma B8, ALPHA, A8, B8 + STFPDUX B6, Y, INCX2 + STFPDUX B7, Y, INCX2 + STFPDUX B8, Y, INCX2 + .align 4 + +LL(155): + andi. r0, N, 7 + beq LL(999) + + andi. r0, N, 4 + beq LL(156) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + LFPDUX A3, X, INCX2 + LFPDUX A4, X, INCX2 + + fxpmul B1, ALPHA, A1 + fxpmul B2, ALPHA, A2 + fxpmul B3, ALPHA, A3 + fxpmul B4, ALPHA, A4 + + fxcxnpma B1, ALPHA, A1, B1 + fxcxnpma B2, ALPHA, A2, B2 + fxcxnpma B3, ALPHA, A3, B3 + fxcxnpma B4, ALPHA, A4, B4 + + STFPDUX B1, Y, INCX2 + STFPDUX B2, Y, INCX2 + STFPDUX B3, Y, INCX2 + STFPDUX B4, Y, INCX2 + .align 4 + +LL(156): + andi. r0, N, 2 + beq LL(157) + + LFPDUX A1, X, INCX2 + LFPDUX A2, X, INCX2 + + fxpmul B1, ALPHA, A1 + fxpmul B2, ALPHA, A2 + + fxcxnpma B1, ALPHA, A1, B1 + fxcxnpma B2, ALPHA, A2, B2 + + STFPDUX B1, Y, INCX2 + STFPDUX B2, Y, INCX2 + .align 4 + +LL(157): + andi. r0, N, 1 + beq LL(999) + + LFPDUX A1, X, INCX2 + + fxpmul B1, ALPHA, A1 + fxcxnpma B1, ALPHA, A1, B1 + + STFPDUX B1, Y, INCX2 + b LL(999) + .align 4 + +LL(160): + addi XX, X, SIZE + addi YY, Y, SIZE + + srawi. 
r0, N, 2 + mtspr CTR, r0 + beq- LL(165) + + LFDUX A1, X, INCX2 + LFDUX A2, XX, INCX2 + LFDUX A3, X, INCX2 + LFDUX A4, XX, INCX2 + + LFDUX A5, X, INCX2 + fmul B1, ALPHA, A1 + LFDUX A6, XX, INCX2 + fmul B2, ALPHA_I, A1 + LFDUX A7, X, INCX2 + fmul B3, ALPHA, A3 + LFDUX A8, XX, INCX2 + fmul B4, ALPHA_I, A3 + + fmul B5, ALPHA, A5 + fnmsub B1, ALPHA_I, A2, B1 + fmadd B2, ALPHA , A2, B2 + bdz LL(163) + + .align 4 + +LL(162): + fnmsub B3, ALPHA_I, A4, B3 + LFDUX A1, X, INCX2 + fmul B6, ALPHA_I, A5 + STFDUX B1, Y, INCX2 + + fmadd B4, ALPHA , A4, B4 + LFDUX A2, XX, INCX2 + fmul B7, ALPHA, A7 + STFDUX B2, YY, INCX2 + + fnmsub B5, ALPHA_I, A6, B5 + LFDUX A3, X, INCX2 + fmul B8, ALPHA_I, A7 + STFDUX B3, Y, INCX2 + + fmadd B6, ALPHA , A6, B6 + LFDUX A4, XX, INCX2 + fmul B1, ALPHA, A1 + STFDUX B4, YY, INCX2 + + fnmsub B7, ALPHA_I, A8, B7 + LFDUX A5, X, INCX2 + fmul B2, ALPHA_I, A1 + STFDUX B5, Y, INCX2 + + fmadd B8, ALPHA , A8, B8 + LFDUX A6, XX, INCX2 + fmul B3, ALPHA, A3 + STFDUX B6, YY, INCX2 + + fnmsub B1, ALPHA_I, A2, B1 + LFDUX A7, X, INCX2 + fmul B4, ALPHA_I, A3 + STFDUX B7, Y, INCX2 + + fmadd B2, ALPHA , A2, B2 + LFDUX A8, XX, INCX2 + fmul B5, ALPHA, A5 + STFDUX B8, YY, INCX2 + bdnz LL(162) + .align 4 + +LL(163): + fnmsub B3, ALPHA_I, A4, B3 + fmul B6, ALPHA_I, A5 + STFDUX B1, Y, INCX2 + + fmadd B4, ALPHA , A4, B4 + fmul B7, ALPHA, A7 + STFDUX B2, YY, INCX2 + + fnmsub B5, ALPHA_I, A6, B5 + fmul B8, ALPHA_I, A7 + STFDUX B3, Y, INCX2 + + fmadd B6, ALPHA , A6, B6 + STFDUX B4, YY, INCX2 + fnmsub B7, ALPHA_I, A8, B7 + STFDUX B5, Y, INCX2 + fmadd B8, ALPHA , A8, B8 + STFDUX B6, YY, INCX2 + STFDUX B7, Y, INCX2 + STFDUX B8, YY, INCX2 + .align 4 + +LL(165): + andi. r0, N, 3 + beq LL(999) + andi. r0, N, 2 + beq LL(167) + + LFDUX A1, X, INCX2 + LFDUX A2, XX, INCX2 + LFDUX A3, X, INCX2 + LFDUX A4, XX, INCX2 + + fmul B1, ALPHA, A1 + fmul B2, ALPHA, A2 + fmul B3, ALPHA, A3 + fmul B4, ALPHA, A4 + + fnmsub B1, ALPHA_I, A2, B1 + fmadd B2, ALPHA_I, A1, B2 + fnmsub B3, ALPHA_I, A4, B3 + fmadd B4, ALPHA_I, A3, B4 + + STFDUX B1, Y, INCX2 + STFDUX B2, YY, INCX2 + STFDUX B3, Y, INCX2 + STFDUX B4, YY, INCX2 + .align 4 + +LL(167): + andi. r0, N, 1 + beq LL(999) + + LFDUX A1, X, INCX2 + LFDUX A2, XX, INCX2 + + fmul B1, ALPHA, A1 + fmul B2, ALPHA, A2 + fnmsub B1, ALPHA_I, A2, B1 + fmadd B2, ALPHA_I, A1, B2 + + STFDUX B1, Y, INCX2 + STFDUX B2, YY, INCX2 + .align 4 + +LL(999): + li r10, 16 + + lfpdux f17, SP, r10 + lfpdux f16, SP, r10 + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + + addi SP, SP, 16 + blr + + EPILOGUE diff --git a/kernel/power/zscal_ppc440.S b/kernel/power/zscal_ppc440.S new file mode 100644 index 0000000000..9f120acfad --- /dev/null +++ b/kernel/power/zscal_ppc440.S @@ -0,0 +1,276 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define XX r4 +#define PRE r5 + +#ifdef linux +#ifndef __64BIT__ +#define X r6 +#define INCX r7 +#else +#define X r8 +#define INCX r9 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define X r10 +#define INCX r8 +#else +#define X r8 +#define INCX r9 +#endif +#endif + +#define INC1 r11 + +#define FZERO f0 +#define ALPHA_R f1 +#define ALPHA_I f2 + + PROLOGUE + PROFCODE + + addi SP, SP, -8 + li r0, 0 + + stw r0, 0(SP) + lfs FZERO, 0(SP) + addi SP, SP, 8 + +#if (defined(_AIX) || defined(__APPLE__)) && !defined(__64BIT__) && defined(DOUBLE) + lwz INCX, 56(SP) +#endif + + slwi INCX, INCX, ZBASE_SHIFT + li INC1, SIZE + sub X, X, INCX + li PRE, 3 * 16 * SIZE + + cmpwi cr0, N, 0 + blelr- cr0 + + fcmpu cr0, FZERO, ALPHA_R + bne- cr0, LL(A1I1) + + fcmpu cr0, FZERO, ALPHA_I + bne- cr0, LL(A1I1) + +LL(A0IN): + srawi. r0, N, 3 + mtspr CTR, r0 + beq- LL(A0IN_Remain) + .align 4 + +LL(A0IN_Kernel): +#ifdef PPCG4 + dcbtst X, PRE +#endif + STFDUX FZERO, X, INCX + STFDX FZERO, X, INC1 + STFDUX FZERO, X, INCX + STFDX FZERO, X, INC1 +#if defined(PPCG4) && defined(DOUBLE) + dcbtst X, PRE +#endif + STFDUX FZERO, X, INCX + STFDX FZERO, X, INC1 + STFDUX FZERO, X, INCX + STFDX FZERO, X, INC1 +#ifdef PPCG4 + dcbtst X, PRE +#endif + STFDUX FZERO, X, INCX + STFDX FZERO, X, INC1 + STFDUX FZERO, X, INCX + STFDX FZERO, X, INC1 +#if defined(PPCG4) && defined(DOUBLE) + dcbtst X, PRE +#endif + STFDUX FZERO, X, INCX + STFDX FZERO, X, INC1 + STFDUX FZERO, X, INCX + STFDX FZERO, X, INC1 + bdnz LL(A0IN_Kernel) + .align 4 + +LL(A0IN_Remain): + andi. r0, N, 7 + mtspr CTR, r0 + beqlr+ + .align 4 + +LL(A0IN_RemainKernel): + STFDUX FZERO, X, INCX + STFDX FZERO, X, INC1 + bdnz LL(A0IN_RemainKernel) + blr + .align 4 + +LL(A1I1): + mr XX, X + + srawi. 
r0, N, 2 + mtspr CTR, r0 + beq- LL(15) + + LFDUX f0, X, INCX + LFDX f3, X, INC1 + LFDUX f4, X, INCX + LFDX f5, X, INC1 + + LFDUX f6, X, INCX + FMUL f10, ALPHA_R, f0 + LFDX f7, X, INC1 + FMUL f11, ALPHA_R, f3 + LFDUX f8, X, INCX + FMUL f12, ALPHA_R, f4 + FMUL f13, ALPHA_R, f5 + bdz LL(13) + .align 4 + +LL(12): +#ifdef PPCG4 + dcbtst X, PRE +#endif + + FNMSUB f10, ALPHA_I, f3, f10 + LFDX f9, X, INC1 + FMADD f11, ALPHA_I, f0, f11 + LFDUX f0, X, INCX + FNMSUB f12, ALPHA_I, f5, f12 + LFDX f3, X, INC1 + FMADD f13, ALPHA_I, f4, f13 + LFDUX f4, X, INCX + +#if defined(PPCG4) && defined(DOUBLE) + dcbtst X, PRE +#endif + + STFDUX f10, XX, INCX + FMUL f10, ALPHA_R, f6 + STFDX f11, XX, INC1 + FMUL f11, ALPHA_R, f7 + STFDUX f12, XX, INCX + FMUL f12, ALPHA_R, f8 + STFDX f13, XX, INC1 + FMUL f13, ALPHA_R, f9 + +#ifdef PPCG4 + dcbtst X, PRE +#endif + + FNMSUB f10, ALPHA_I, f7, f10 + LFDX f5, X, INC1 + FMADD f11, ALPHA_I, f6, f11 + LFDUX f6, X, INCX + FNMSUB f12, ALPHA_I, f9, f12 + LFDX f7, X, INC1 + FMADD f13, ALPHA_I, f8, f13 + LFDUX f8, X, INCX + +#if defined(PPCG4) && defined(DOUBLE) + dcbtst X, PRE +#endif + + STFDUX f10, XX, INCX + FMUL f10, ALPHA_R, f0 + STFDX f11, XX, INC1 + FMUL f11, ALPHA_R, f3 + STFDUX f12, XX, INCX + FMUL f12, ALPHA_R, f4 + STFDX f13, XX, INC1 + FMUL f13, ALPHA_R, f5 + bdnz LL(12) + .align 4 + +LL(13): + FNMSUB f10, ALPHA_I, f3, f10 + LFDX f9, X, INC1 + FMADD f11, ALPHA_I, f0, f11 + FNMSUB f12, ALPHA_I, f5, f12 + FMADD f13, ALPHA_I, f4, f13 + + STFDUX f10, XX, INCX + FMUL f10, ALPHA_R, f6 + STFDX f11, XX, INC1 + FMUL f11, ALPHA_R, f7 + STFDUX f12, XX, INCX + FMUL f12, ALPHA_R, f8 + STFDX f13, XX, INC1 + FMUL f13, ALPHA_R, f9 + + FNMSUB f10, ALPHA_I, f7, f10 + FMADD f11, ALPHA_I, f6, f11 + FNMSUB f12, ALPHA_I, f9, f12 + FMADD f13, ALPHA_I, f8, f13 + + STFDUX f10, XX, INCX + STFDX f11, XX, INC1 + STFDUX f12, XX, INCX + STFDX f13, XX, INC1 + .align 4 + +LL(15): + andi. r0, N, 3 + mtspr CTR, r0 + beqlr+ + .align 4 + +LL(A1IN_RemainKernel): + LFDUX f3, X, INCX + LFDX f4, X, INC1 + + FMUL f5, ALPHA_R, f3 + FMUL f6, ALPHA_R, f4 + + FNMSUB f5, ALPHA_I, f4, f5 + FMADD f6, ALPHA_I, f3, f6 + + STFDUX f5, XX, INCX + STFDX f6, XX, INC1 + bdnz LL(A1IN_RemainKernel) + blr + + EPILOGUE diff --git a/kernel/power/zswap.S b/kernel/power/zswap.S new file mode 100644 index 0000000000..4c23c1d5e1 --- /dev/null +++ b/kernel/power/zswap.S @@ -0,0 +1,414 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef linux +#ifndef __64BIT__ +#define N r3 +#define X r6 +#define INCX r7 +#define Y r8 +#define INCY r9 +#define PREA r4 +#define XX r5 +#define YY r10 +#else +#define N r3 +#define X r8 +#define INCX r9 +#define Y r10 +#define INCY r4 +#define PREA r5 +#define XX r6 +#define YY r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define N r3 +#define X r10 +#define INCX r4 +#define Y r5 +#define INCY r6 +#define PREA r7 +#define XX r8 +#define YY r9 +#else +#define N r3 +#define X r8 +#define INCX r9 +#define Y r10 +#define INCY r4 +#define PREA r5 +#define XX r6 +#define YY r7 +#endif +#endif + +#define INCXM1 r11 +#define INCYM1 r12 + +#define STACKSIZE 160 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#if defined(linux) && defined(__64BIT__) + ld INCY, 112 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld INCY, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz INCX, 56 + STACKSIZE(SP) + lwz Y, 60 + STACKSIZE(SP) + lwz INCY, 64 + STACKSIZE(SP) +#else + lwz INCY, 56 + STACKSIZE(SP) +#endif +#endif +#endif + + slwi INCX, INCX, ZBASE_SHIFT + slwi INCY, INCY, ZBASE_SHIFT + subi INCXM1, INCX, SIZE + subi INCYM1, INCY, SIZE + +#ifdef L1_DUALFETCH + li PREA, (L1_PREFETCHSIZE) / 2 +#else + li PREA, (L1_PREFETCHSIZE) +#endif + + cmpwi cr0, N, 0 + ble- LL(999) + + cmpwi cr0, INCX, 2 * SIZE + bne- cr0, LL(100) + cmpwi cr0, INCY, 2 * SIZE + bne- cr0, LL(100) + + srawi. 
r0, N, 3 + mtspr CTR, r0 + beq- cr0, LL(50) + .align 4 + +LL(10): + LFD f0, 0 * SIZE(X) + LFD f1, 1 * SIZE(X) + LFD f2, 2 * SIZE(X) + LFD f3, 3 * SIZE(X) + + LFD f16, 0 * SIZE(Y) + LFD f17, 1 * SIZE(Y) + LFD f18, 2 * SIZE(Y) + LFD f19, 3 * SIZE(Y) + + LFD f4, 4 * SIZE(X) + LFD f5, 5 * SIZE(X) + LFD f6, 6 * SIZE(X) + LFD f7, 7 * SIZE(X) + + LFD f20, 4 * SIZE(Y) + LFD f21, 5 * SIZE(Y) + LFD f22, 6 * SIZE(Y) + LFD f23, 7 * SIZE(Y) + + LFD f8, 8 * SIZE(X) + LFD f9, 9 * SIZE(X) + LFD f10, 10 * SIZE(X) + LFD f11, 11 * SIZE(X) + + LFD f24, 8 * SIZE(Y) + LFD f25, 9 * SIZE(Y) + LFD f26, 10 * SIZE(Y) + LFD f27, 11 * SIZE(Y) + + LFD f12, 12 * SIZE(X) + LFD f13, 13 * SIZE(X) + LFD f14, 14 * SIZE(X) + LFD f15, 15 * SIZE(X) + + LFD f28, 12 * SIZE(Y) + LFD f29, 13 * SIZE(Y) + LFD f30, 14 * SIZE(Y) + LFD f31, 15 * SIZE(Y) + + STFD f16, 0 * SIZE(X) + STFD f17, 1 * SIZE(X) + STFD f18, 2 * SIZE(X) + STFD f19, 3 * SIZE(X) + + STFD f0, 0 * SIZE(Y) + STFD f1, 1 * SIZE(Y) + STFD f2, 2 * SIZE(Y) + STFD f3, 3 * SIZE(Y) + + STFD f20, 4 * SIZE(X) + STFD f21, 5 * SIZE(X) + STFD f22, 6 * SIZE(X) + STFD f23, 7 * SIZE(X) + + STFD f4, 4 * SIZE(Y) + STFD f5, 5 * SIZE(Y) + STFD f6, 6 * SIZE(Y) + STFD f7, 7 * SIZE(Y) + + STFD f24, 8 * SIZE(X) + STFD f25, 9 * SIZE(X) + STFD f26, 10 * SIZE(X) + STFD f27, 11 * SIZE(X) + + STFD f8, 8 * SIZE(Y) + STFD f9, 9 * SIZE(Y) + STFD f10, 10 * SIZE(Y) + STFD f11, 11 * SIZE(Y) + + STFD f28, 12 * SIZE(X) + STFD f29, 13 * SIZE(X) + STFD f30, 14 * SIZE(X) + STFD f31, 15 * SIZE(X) + + STFD f12, 12 * SIZE(Y) + STFD f13, 13 * SIZE(Y) + STFD f14, 14 * SIZE(Y) + STFD f15, 15 * SIZE(Y) + + addi X, X, 16 * SIZE + addi Y, Y, 16 * SIZE + dcbtst X, PREA +#ifdef L1_DUALFETCH + dcbtst Y, PREA +#endif + bdnz LL(10) + .align 4 + +LL(50): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(60): + LFD f0, 0 * SIZE(X) + LFD f1, 1 * SIZE(X) + LFD f2, 0 * SIZE(Y) + LFD f3, 1 * SIZE(Y) + + STFD f2, 0 * SIZE(X) + STFD f3, 1 * SIZE(X) + STFD f0, 0 * SIZE(Y) + STFD f1, 1 * SIZE(Y) + + addi X, X, 2 * SIZE + addi Y, Y, 2 * SIZE + bdnz LL(60) + b LL(999) + .align 4 + +LL(100): + sub X, X, INCXM1 + sub Y, Y, INCYM1 + + mr XX, X + mr YY, Y + + srawi. 
r0, N, 3 + mtspr CTR, r0 + beq- LL(150) + .align 4 + +LL(110): + LFDX f0, X, INCXM1 + LFDUX f1, X, INCX + LFDX f2, X, INCXM1 + LFDUX f3, X, INCX + + LFDX f16, Y, INCYM1 + LFDUX f17, Y, INCY + LFDX f18, Y, INCYM1 + LFDUX f19, Y, INCY + + LFDX f4, X, INCXM1 + LFDUX f5, X, INCX + LFDX f6, X, INCXM1 + LFDUX f7, X, INCX + + LFDX f20, Y, INCYM1 + LFDUX f21, Y, INCY + LFDX f22, Y, INCYM1 + LFDUX f23, Y, INCY + + LFDX f8, X, INCXM1 + LFDUX f9, X, INCX + LFDX f10, X, INCXM1 + LFDUX f11, X, INCX + + LFDX f24, Y, INCYM1 + LFDUX f25, Y, INCY + LFDX f26, Y, INCYM1 + LFDUX f27, Y, INCY + + LFDX f12, X, INCXM1 + LFDUX f13, X, INCX + LFDX f14, X, INCXM1 + LFDUX f15, X, INCX + + LFDX f28, Y, INCYM1 + LFDUX f29, Y, INCY + LFDX f30, Y, INCYM1 + LFDUX f31, Y, INCY + + STFDX f16, XX, INCXM1 + STFDUX f17, XX, INCX + STFDX f18, XX, INCXM1 + STFDUX f19, XX, INCX + + STFDX f0, YY, INCYM1 + STFDUX f1, YY, INCY + STFDX f2, YY, INCYM1 + STFDUX f3, YY, INCY + + STFDX f20, XX, INCXM1 + STFDUX f21, XX, INCX + STFDX f22, XX, INCXM1 + STFDUX f23, XX, INCX + + STFDX f4, YY, INCYM1 + STFDUX f5, YY, INCY + STFDX f6, YY, INCYM1 + STFDUX f7, YY, INCY + + STFDX f24, XX, INCXM1 + STFDUX f25, XX, INCX + STFDX f26, XX, INCXM1 + STFDUX f27, XX, INCX + + STFDX f8, YY, INCYM1 + STFDUX f9, YY, INCY + STFDX f10, YY, INCYM1 + STFDUX f11, YY, INCY + + STFDX f28, XX, INCXM1 + STFDUX f29, XX, INCX + STFDX f30, XX, INCXM1 + STFDUX f31, XX, INCX + + STFDX f12, YY, INCYM1 + STFDUX f13, YY, INCY + STFDX f14, YY, INCYM1 + STFDUX f15, YY, INCY + bdnz LL(110) + .align 4 + +LL(150): + andi. r0, N, 7 + mtspr CTR, r0 + beq LL(999) + .align 4 + +LL(160): + LFDX f0, X, INCXM1 + LFDUX f1, X, INCX + LFDX f2, Y, INCYM1 + LFDUX f3, Y, INCY + STFDX f2, XX, INCXM1 + STFDUX f3, XX, INCX + STFDX f0, YY, INCYM1 + STFDUX f1, YY, INCY + bdnz LL(160) + .align 4 + +LL(999): + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + addi SP, SP, STACKSIZE + blr + + EPILOGUE diff --git a/kernel/power/zswap_hummer.S b/kernel/power/zswap_hummer.S new file mode 100644 index 0000000000..335eaa11c0 --- /dev/null +++ b/kernel/power/zswap_hummer.S @@ -0,0 +1,665 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N r3 +#define X r6 +#define INCX r7 +#define Y r8 +#define INCY r9 + +#define INCX2 r4 +#define INCY2 r5 +#define X2 r10 +#define Y2 r11 + +#define A1 f0 +#define A2 f1 +#define A3 f2 +#define A4 f3 +#define A5 f4 + +#define B1 f5 +#define B2 f6 +#define B3 f7 +#define B4 f8 +#define B5 f9 + +#define T1 f10 +#define T2 f11 +#define T3 f12 +#define T4 f13 +#define T5 f14 +#define T6 f15 +#define T7 f16 + + PROLOGUE + PROFCODE + + li r10, -16 + + stfpdux f14, SP, r10 + stfpdux f15, SP, r10 + stfpdux f16, SP, r10 + + slwi INCX, INCX, BASE_SHIFT + slwi INCY, INCY, BASE_SHIFT + add INCX2, INCX, INCX + add INCY2, INCY, INCY + + cmpwi cr0, N, 0 + ble LL(999) + + cmpwi cr0, INCX, SIZE + bne LL(100) + cmpwi cr0, INCY, SIZE + bne LL(100) + + sub X, X, INCX2 + sub Y, Y, INCY2 + + mr X2, X + mr Y2, Y + + andi. r0, X, 2 * SIZE - 1 + bne LL(30) + andi. r0, Y, 2 * SIZE - 1 + bne LL(20) + .align 4 + +LL(10): /* X : aligned Y : aligned */ + + srawi. r0, N, 2 + mtspr CTR, r0 + beq- LL(15) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + LFPDUX A2, X, INCX2 + LFPDUX B2, Y, INCY2 + LFPDUX A3, X, INCX2 + LFPDUX B3, Y, INCY2 + LFPDUX A4, X, INCX2 + LFPDUX B4, Y, INCY2 + bdz LL(13) + .align 4 + +LL(12): + STFPDUX B1, X2, INCY2 + LFPDUX B1, Y, INCY2 + STFPDUX A1, Y2, INCY2 + LFPDUX A1, X, INCX2 + + STFPDUX B2, X2, INCY2 + LFPDUX B2, Y, INCY2 + STFPDUX A2, Y2, INCY2 + LFPDUX A2, X, INCX2 + + STFPDUX B3, X2, INCY2 + LFPDUX B3, Y, INCY2 + STFPDUX A3, Y2, INCY2 + LFPDUX A3, X, INCX2 + + STFPDUX B4, X2, INCY2 + LFPDUX B4, Y, INCY2 + STFPDUX A4, Y2, INCY2 + LFPDUX A4, X, INCX2 + bdnz LL(12) + .align 4 + +LL(13): + STFPDUX B1, X2, INCY2 + STFPDUX A1, Y2, INCY2 + STFPDUX B2, X2, INCY2 + STFPDUX A2, Y2, INCY2 + STFPDUX B3, X2, INCY2 + STFPDUX A3, Y2, INCY2 + STFPDUX B4, X2, INCY2 + STFPDUX A4, Y2, INCY2 + .align 4 + +LL(15): + andi. r0, N, 3 + beq LL(999) + + andi. r0, N, 2 + beq LL(16) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + LFPDUX A2, X, INCX2 + LFPDUX B2, Y, INCY2 + + STFPDUX B1, X2, INCY2 + STFPDUX A1, Y2, INCY2 + STFPDUX B2, X2, INCY2 + STFPDUX A2, Y2, INCY2 + .align 4 + +LL(16): + andi. r0, N, 1 + beq LL(999) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + + STFPDUX B1, X2, INCY2 + STFPDUX A1, Y2, INCY2 + b LL(999) + .align 4 + +LL(20): /* X : aligned Y : unaligned */ + + LFXDUX A1, X, INCX2 + LFDX B1, Y, INCY2 + + STFSDX A1, Y2, INCY2 + + add Y, Y, INCY + add Y2, Y2, INCY + + addi N, N, -1 + cmpwi cr0, N, 0 + ble LL(29) + .align 4 + + srawi. 
r0, N, 2 + mtspr CTR, r0 + beq- LL(25) + + LFXDUX T1, X, INCX2 + LFXDUX T2, Y, INCY2 + LFXDUX T3, X, INCX2 + LFXDUX T4, Y, INCY2 + + LFPDUX A4, X, INCX2 + fsmr A1, T1 + LFPDUX B4, Y, INCY2 + fsmr B1, T2 + LFPDUX A5, X, INCX2 + fsmr T1, T3 + LFPDUX B5, Y, INCY2 + fsmr T2, T4 + bdz LL(23) + .align 4 + +LL(22): + fxmr T5, A4 + STFPDUX A1, Y2, INCY2 + fxmr T6, B4 + STFPDUX B1, X2, INCX2 + fxmr A1, A5 + STFPDUX T1, Y2, INCY2 + fxmr B1, B5 + STFPDUX T2, X2, INCX2 + + fsmr T3, T5 + LFPDUX A2, X, INCX2 + fsmr T4, T6 + LFPDUX B2, Y, INCY2 + fsmr T5, A1 + LFPDUX A3, X, INCX2 + fsmr T6, B1 + LFPDUX B3, Y, INCY2 + + fxmr T1, A2 + STFPDUX T3, Y2, INCY2 + fxmr T2, B2 + STFPDUX T4, X2, INCX2 + fxmr T3, A3 + STFPDUX T5, Y2, INCY2 + fxmr T4, B3 + STFPDUX T6, X2, INCX2 + + fsmr A1, T1 + LFPDUX A4, X, INCX2 + fsmr B1, T2 + LFPDUX B4, Y, INCY2 + fsmr T1, T3 + LFPDUX A5, X, INCX2 + fsmr T2, T4 + LFPDUX B5, Y, INCY2 + bdnz LL(22) + .align 4 + +LL(23): + fxmr T5, A4 + STFPDUX A1, Y2, INCY2 + fxmr T6, B4 + STFPDUX B1, X2, INCX2 + fxmr A1, A5 + STFPDUX T1, Y2, INCY2 + fxmr B1, B5 + STFPDUX T2, X2, INCX2 + + fsmr T3, T5 + fsmr T4, T6 + fsmr T5, A1 + fsmr T6, B1 + + STFPDUX T3, Y2, INCY2 + STFPDUX T4, X2, INCX2 + STFPDUX T5, Y2, INCY2 + STFPDUX T6, X2, INCX2 + .align 4 + +LL(25): + andi. r0, N, 3 + beq LL(29) + + andi. r0, N, 2 + beq LL(27) + + LFXDUX A2, X, INCX2 + LFXDUX B2, Y, INCY2 + LFXDUX A3, X, INCX2 + LFXDUX B3, Y, INCY2 + + fsmr A1, A2 + fsmr B1, B2 + fsmr A2, A3 + fsmr B2, B3 + + STFPDUX A1, Y2, INCY2 + STFPDUX B1, X2, INCX2 + STFPDUX A2, Y2, INCY2 + fpmr A1, A3 + STFPDUX B2, X2, INCX2 + fpmr B1, B3 + .align 4 + +LL(27): + andi. r0, N, 1 + beq LL(29) + + LFXDUX A2, X, INCX2 + LFXDUX B2, Y, INCY2 + fsmr A1, A2 + fsmr B1, B2 + STFPDUX A1, Y2, INCY2 + fpmr A1, A2 + STFPDUX B1, X2, INCX2 + fpmr B1, B2 + .align 4 + +LL(29): + LFSDX B1, Y, INCY2 + STFDX A1, Y2, INCY2 + STFPDX B1, X2, INCX2 + b LL(999) + .align 4 + + +LL(30): /* X : unaligned Y : aligned */ + + andi. r0, Y, 2 * SIZE - 1 + bne LL(40) + + LFXDUX A1, Y, INCY2 + LFDX B1, X, INCX2 + + STFSDX A1, X2, INCX2 + + add X, X, INCX + add X2, X2, INCX + + addi N, N, -1 + cmpwi cr0, N, 0 + ble LL(39) + .align 4 + + srawi. r0, N, 2 + mtspr CTR, r0 + beq- LL(35) + + LFXDUX T1, Y, INCY2 + LFXDUX T2, X, INCX2 + LFXDUX T3, Y, INCY2 + LFXDUX T4, X, INCX2 + + LFPDUX A4, Y, INCY2 + fsmr A1, T1 + LFPDUX B4, X, INCX2 + fsmr B1, T2 + LFPDUX A5, Y, INCY2 + fsmr T1, T3 + LFPDUX B5, X, INCX2 + fsmr T2, T4 + bdz LL(33) + .align 4 + +LL(32): + fxmr T5, A4 + STFPDUX A1, X2, INCX2 + fxmr T6, B4 + STFPDUX B1, Y2, INCY2 + fxmr A1, A5 + STFPDUX T1, X2, INCX2 + fxmr B1, B5 + STFPDUX T2, Y2, INCY2 + + fsmr T3, T5 + LFPDUX A2, Y, INCY2 + fsmr T4, T6 + LFPDUX B2, X, INCX2 + fsmr T5, A1 + LFPDUX A3, Y, INCY2 + fsmr T6, B1 + LFPDUX B3, X, INCX2 + + fxmr T1, A2 + STFPDUX T3, X2, INCX2 + fxmr T2, B2 + STFPDUX T4, Y2, INCY2 + fxmr T3, A3 + STFPDUX T5, X2, INCX2 + fxmr T4, B3 + STFPDUX T6, Y2, INCY2 + + fsmr A1, T1 + LFPDUX A4, Y, INCY2 + fsmr B1, T2 + LFPDUX B4, X, INCX2 + fsmr T1, T3 + LFPDUX A5, Y, INCY2 + fsmr T2, T4 + LFPDUX B5, X, INCX2 + bdnz LL(32) + .align 4 + +LL(33): + fxmr T5, A4 + STFPDUX A1, X2, INCX2 + fxmr T6, B4 + STFPDUX B1, Y2, INCY2 + fxmr A1, A5 + STFPDUX T1, X2, INCX2 + fxmr B1, B5 + STFPDUX T2, Y2, INCY2 + + fsmr T3, T5 + fsmr T4, T6 + fsmr T5, A1 + fsmr T6, B1 + + STFPDUX T3, X2, INCX2 + STFPDUX T4, Y2, INCY2 + STFPDUX T5, X2, INCX2 + STFPDUX T6, Y2, INCY2 + .align 4 + +LL(35): + andi. r0, N, 3 + beq LL(39) + + andi. 
r0, N, 2 + beq LL(37) + + LFXDUX A2, Y, INCY2 + LFXDUX B2, X, INCX2 + LFXDUX A3, Y, INCY2 + LFXDUX B3, X, INCX2 + + fsmr A1, A2 + fsmr B1, B2 + fsmr A2, A3 + fsmr B2, B3 + + STFPDUX A1, X2, INCX2 + STFPDUX B1, Y2, INCY2 + STFPDUX A2, X2, INCX2 + fpmr A1, A3 + STFPDUX B2, Y2, INCY2 + fpmr B1, B3 + .align 4 + +LL(37): + andi. r0, N, 1 + beq LL(39) + + LFXDUX A2, Y, INCY2 + LFXDUX B2, X, INCX2 + fsmr A1, A2 + fsmr B1, B2 + STFPDUX A1, X2, INCX2 + fpmr A1, A2 + STFPDUX B1, Y2, INCY2 + fpmr B1, B2 + .align 4 + +LL(39): + LFSDX B1, X, INCX2 + STFDX A1, X2, INCX2 + STFPDX B1, Y2, INCY2 + b LL(999) + .align 4 + +LL(40): /* X : unaligned Y : unaligned */ + + LFDX A1, Y, INCY2 + LFDX B1, X, INCX2 + add X, X, INCX + add Y, Y, INCY + + addi N, N, -1 + cmpwi cr0, N, 0 + + STFDX A1, X2, INCX2 + STFDX B1, Y2, INCY2 + add X2, X2, INCX + add Y2, Y2, INCY + ble LL(49) + + srawi. r0, N, 2 + mtspr CTR, r0 + beq- LL(45) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + LFPDUX A2, X, INCX2 + LFPDUX B2, Y, INCY2 + LFPDUX A3, X, INCX2 + LFPDUX B3, Y, INCY2 + LFPDUX A4, X, INCX2 + LFPDUX B4, Y, INCY2 + bdz LL(43) + .align 4 + +LL(42): + STFPDUX B1, X2, INCY2 + LFPDUX B1, Y, INCY2 + STFPDUX A1, Y2, INCY2 + LFPDUX A1, X, INCX2 + + STFPDUX B2, X2, INCY2 + LFPDUX B2, Y, INCY2 + STFPDUX A2, Y2, INCY2 + LFPDUX A2, X, INCX2 + + STFPDUX B3, X2, INCY2 + LFPDUX B3, Y, INCY2 + STFPDUX A3, Y2, INCY2 + LFPDUX A3, X, INCX2 + + STFPDUX B4, X2, INCY2 + LFPDUX B4, Y, INCY2 + STFPDUX A4, Y2, INCY2 + LFPDUX A4, X, INCX2 + bdnz LL(42) + .align 4 + +LL(43): + STFPDUX B1, X2, INCY2 + STFPDUX A1, Y2, INCY2 + STFPDUX B2, X2, INCY2 + STFPDUX A2, Y2, INCY2 + STFPDUX B3, X2, INCY2 + STFPDUX A3, Y2, INCY2 + STFPDUX B4, X2, INCY2 + STFPDUX A4, Y2, INCY2 + .align 4 + +LL(45): + andi. r0, N, 3 + beq LL(49) + + andi. r0, N, 2 + beq LL(46) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + LFPDUX A2, X, INCX2 + LFPDUX B2, Y, INCY2 + + STFPDUX B1, X2, INCY2 + STFPDUX A1, Y2, INCY2 + STFPDUX B2, X2, INCY2 + STFPDUX A2, Y2, INCY2 + .align 4 + +LL(46): + andi. r0, N, 1 + beq LL(49) + + LFPDUX A1, X, INCX2 + LFPDUX B1, Y, INCY2 + + STFPDUX B1, X2, INCY2 + STFPDUX A1, Y2, INCY2 + .align 4 + +LL(49): + LFDX A1, Y, INCY2 + LFDX B1, X, INCX2 + STFDX A1, X2, INCX2 + STFDX B1, Y2, INCY2 + b LL(999) + .align 4 + +LL(100): + subi INCX2, INCX2, SIZE + subi INCY2, INCY2, SIZE + + li INCX, SIZE + li INCY, SIZE + + sub X, X, INCX2 + sub Y, Y, INCY2 + + mr X2, X + mr Y2, Y + + srawi. r0, N, 1 + mtspr CTR, r0 + beq- LL(115) + + LFDUX A1, X, INCX2 + LFDUX B1, Y, INCY2 + LFDUX A2, X, INCX + LFDUX B2, Y, INCY + LFDUX A3, X, INCX2 + LFDUX B3, Y, INCY2 + LFDUX A4, X, INCX + LFDUX B4, Y, INCY + bdz LL(113) + .align 4 + +LL(112): + STFDUX B1, X2, INCX2 + LFDUX B1, Y, INCY2 + STFDUX A1, Y2, INCY2 + LFDUX A1, X, INCX2 + + STFDUX B2, X2, INCX + LFDUX B2, Y, INCY + STFDUX A2, Y2, INCY + LFDUX A2, X, INCX + + STFDUX B3, X2, INCX2 + LFDUX B3, Y, INCY2 + STFDUX A3, Y2, INCY2 + LFDUX A3, X, INCX2 + + STFDUX B4, X2, INCX + LFDUX B4, Y, INCY + STFDUX A4, Y2, INCY + LFDUX A4, X, INCX + bdnz LL(112) + .align 4 + +LL(113): + STFDUX B1, X2, INCX2 + STFDUX A1, Y2, INCY2 + STFDUX B2, X2, INCX + STFDUX A2, Y2, INCY + + STFDUX B3, X2, INCX2 + STFDUX A3, Y2, INCY2 + STFDUX B4, X2, INCX + STFDUX A4, Y2, INCY + .align 4 + +LL(115): + andi. 
r0, N, 1 + beq LL(999) + + LFDUX A1, X, INCX2 + LFDUX A2, X, INCX + LFDUX B1, Y, INCY2 + LFDUX B2, Y, INCY + + STFDUX B1, X2, INCX2 + STFDUX B2, X2, INCX + STFDUX A1, Y2, INCY2 + STFDUX A2, Y2, INCY + .align 4 + +LL(999): + li r10, 16 + addi SP, SP, -16 + + lfpdux f16, SP, r10 + lfpdux f15, SP, r10 + lfpdux f14, SP, r10 + + addi SP, SP, 16 + blr + + EPILOGUE diff --git a/kernel/power/zsymv_L.S b/kernel/power/zsymv_L.S new file mode 100644 index 0000000000..0dca84d51c --- /dev/null +++ b/kernel/power/zsymv_L.S @@ -0,0 +1,1673 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef linux +#ifndef __64BIT__ +#define M r3 +#define N r4 +#define A r5 +#define LDA r6 +#define X r7 +#define INCX r8 +#define Y r9 +#define INCY r10 +#define BUFFER r14 +#else +#define M r3 +#define N r4 +#define A r7 +#define LDA r8 +#define X r9 +#define INCX r10 +#define Y r5 +#define INCY r6 +#define BUFFER r14 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define M r3 +#define N r4 +#define A r9 +#define LDA r10 +#define X r5 +#define INCX r6 +#define Y r7 +#define INCY r8 +#define BUFFER r14 +#else +#define M r3 +#define N r4 +#define A r7 +#define LDA r8 +#define X r9 +#define INCX r10 +#define Y r5 +#define INCY r6 +#define BUFFER r14 +#endif +#endif + +#define I r11 +#define J r12 + +#define AO1 r15 +#define AO2 r16 +#define AO3 r17 +#define AO4 r18 +#define XX r19 +#define YY r20 +#define NEW_Y r21 +#define TEMP r22 +#define PREA r24 +#define IS r25 + +#define y01 f0 +#define y02 f1 +#define y03 f2 +#define y04 f3 +#define y05 f4 +#define y06 f5 +#define y07 f6 +#define y08 f7 + +#define xtemp1 f8 +#define xtemp2 f9 +#define xtemp3 f10 +#define xtemp4 f11 +#define xtemp5 f12 +#define xtemp6 f13 +#define xtemp7 f14 +#define xtemp8 f15 + +#define atemp1 f16 +#define atemp2 f17 +#define atemp3 f18 +#define atemp4 f19 + +#define xsum1 f20 +#define xsum2 f21 +#define xsum3 f22 +#define xsum4 f23 + +#define a1 f24 +#define a2 f25 +#define a3 f26 +#define a4 f27 +#define a5 f28 +#define a6 f29 +#define a7 f30 +#define a8 f31 + +#define alpha_r f1 +#define alpha_i f2 + +#if defined(PPCG4) +#define PREFETCHSIZE_A 24 +#endif + +#if defined(PPC440) || defined(PPC440FP2) +#define PREFETCHSIZE_A 24 +#endif + +#ifdef PPC970 +#define PREFETCHSIZE_A 32 +#endif + +#ifdef CELL +#define PREFETCHSIZE_A 72 +#endif + +#ifdef POWER4 +#define PREFETCHSIZE_A 16 +#endif + +#ifdef POWER5 +#define PREFETCHSIZE_A 96 +#endif + +#ifdef POWER6 +#define PREFETCHSIZE_A 112 +#endif + +#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) +#define NOP1 +#define NOP2 +#else +#define NOP1 mr LDA, LDA +#define NOP2 mr INCX, INCX +#endif + +#ifndef NEEDPARAM + +#ifndef __64BIT__ +#define STACKSIZE 224 +#define ALPHA_R 200(SP) +#define ALPHA_I 208(SP) +#define FZERO 216(SP) +#else +#define STACKSIZE 280 +#define ALPHA_R 256(SP) +#define ALPHA_I 264(SP) +#define FZERO 272(SP) +#endif + +#ifndef HEMV +#define FMADD1 FNMSUB +#define FMADD2 FMADD +#else +#define FMADD1 FMADD +#define FMADD2 FNMSUB +#endif + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r0, FZERO + std r14, 144(SP) + std r15, 152(SP) + std r16, 160(SP) + std r17, 168(SP) + std r18, 176(SP) + std r19, 184(SP) + std r20, 192(SP) + std r21, 200(SP) + std r22, 208(SP) + std r23, 216(SP) + std r24, 224(SP) + std r25, 232(SP) + std r26, 240(SP) + std r27, 248(SP) +#else + stw r0, 0 + FZERO + stw r0, 4 + FZERO + stw r14, 144(SP) + stw r15, 148(SP) + stw r16, 152(SP) + stw r17, 156(SP) + stw r18, 160(SP) + stw r19, 164(SP) + stw r20, 168(SP) + stw r21, 172(SP) + stw r22, 176(SP) + 
stw r23, 180(SP) + stw r24, 184(SP) + stw r25, 188(SP) + stw r26, 192(SP) + stw r27, 196(SP) +#endif + +#ifdef linux +#ifndef __64BIT__ + lwz BUFFER, 56 + STACKSIZE(SP) +#else + ld Y, 112 + STACKSIZE(SP) + ld INCY, 120 + STACKSIZE(SP) + ld BUFFER, 128 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifndef __64BIT__ +#ifdef DOUBLE + lwz X, 56 + STACKSIZE(SP) + lwz INCX, 60 + STACKSIZE(SP) + lwz Y, 64 + STACKSIZE(SP) + lwz INCY, 68 + STACKSIZE(SP) + lwz BUFFER, 72 + STACKSIZE(SP) +#else + lwz Y, 56 + STACKSIZE(SP) + lwz INCY, 60 + STACKSIZE(SP) + lwz BUFFER, 64 + STACKSIZE(SP) +#endif +#else + ld Y, 112 + STACKSIZE(SP) + ld INCY, 120 + STACKSIZE(SP) + ld BUFFER, 128 + STACKSIZE(SP) +#endif +#endif + + STFD alpha_r, ALPHA_R + STFD alpha_i, ALPHA_I + + slwi LDA, LDA, ZBASE_SHIFT + slwi INCX, INCX, ZBASE_SHIFT + slwi INCY, INCY, ZBASE_SHIFT + + li PREA, PREFETCHSIZE_A * SIZE + + cmpwi cr0, M, 0 + ble- LL(999) + + cmpwi cr0, INCX, 2 * SIZE + beq LL(05) + + mr XX, X + mr X, BUFFER + + srawi. r0, M, 2 + mtspr CTR, r0 + ble LL(03) + .align 4 + +LL(01): + LFD a1, 0 * SIZE(XX) + LFD a2, 1 * SIZE(XX) + add XX, XX, INCX + LFD a3, 0 * SIZE(XX) + LFD a4, 1 * SIZE(XX) + add XX, XX, INCX + LFD a5, 0 * SIZE(XX) + LFD a6, 1 * SIZE(XX) + add XX, XX, INCX + LFD a7, 0 * SIZE(XX) + LFD a8, 1 * SIZE(XX) + add XX, XX, INCX + + dcbt XX, PREA + dcbtst BUFFER, PREA + + STFD a1, 0 * SIZE(BUFFER) + STFD a2, 1 * SIZE(BUFFER) + STFD a3, 2 * SIZE(BUFFER) + STFD a4, 3 * SIZE(BUFFER) + STFD a5, 4 * SIZE(BUFFER) + STFD a6, 5 * SIZE(BUFFER) + STFD a7, 6 * SIZE(BUFFER) + STFD a8, 7 * SIZE(BUFFER) + + addi BUFFER, BUFFER, 8 * SIZE + bdnz LL(01) + .align 4 + +LL(03): + andi. r0, M, 3 + mtspr CTR, r0 + ble LL(05) + .align 4 + +LL(04): + LFD a1, 0 * SIZE(XX) + LFD a2, 1 * SIZE(XX) + add XX, XX, INCX + + STFD a1, 0 * SIZE(BUFFER) + STFD a2, 1 * SIZE(BUFFER) + + addi BUFFER, BUFFER, 2 * SIZE + bdnz LL(04) + .align 4 + +LL(05): + mr NEW_Y, Y + lfd f0, FZERO + + cmpwi cr0, INCY, 2 * SIZE + beq LL(10) + + mr NEW_Y, BUFFER + + addi r0, M, 3 + srawi. 
r0, r0, 2 + mtspr CTR, r0 + .align 4 + +LL(06): + STFD f0, 0 * SIZE(BUFFER) + STFD f0, 1 * SIZE(BUFFER) + STFD f0, 2 * SIZE(BUFFER) + STFD f0, 3 * SIZE(BUFFER) + STFD f0, 4 * SIZE(BUFFER) + STFD f0, 5 * SIZE(BUFFER) + STFD f0, 6 * SIZE(BUFFER) + STFD f0, 7 * SIZE(BUFFER) + addi BUFFER, BUFFER, 8 * SIZE + bdnz LL(06) + .align 4 + +LL(10): + li IS, 0 + + cmpwi cr0, N, 2 + blt LL(20) + .align 4 + +LL(11): + mr AO1, A + slwi TEMP, IS, ZBASE_SHIFT + add AO2, A, LDA + add XX, X, TEMP + + add A, AO2, LDA + add YY, NEW_Y, TEMP + addi A, A, 4 * SIZE + NOP2 + + LFD y05, ALPHA_R + LFD y06, ALPHA_I + + LFD atemp1, 0 * SIZE(XX) + LFD atemp2, 1 * SIZE(XX) + LFD atemp3, 2 * SIZE(XX) + LFD atemp4, 3 * SIZE(XX) + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + + LFD a7, 2 * SIZE(AO2) + LFD a8, 3 * SIZE(AO2) + + FMUL xsum1, atemp1, a1 + addi AO2, AO2, 4 * SIZE + FMUL xsum2, atemp2, a1 + LFD a1, 4 * SIZE(AO1) + FMUL xsum3, atemp1, a3 + addi AO1, AO1, 4 * SIZE + FMUL xsum4, atemp2, a3 + LFD a5, 0 * SIZE(AO2) + +#ifndef HEMV + FNMSUB xsum1, atemp2, a2, xsum1 +#endif + addi XX, XX, 4 * SIZE +#ifndef HEMV + FMADD xsum2, atemp1, a2, xsum2 +#endif + LFD a2, 1 * SIZE(AO1) + FNMSUB xsum3, atemp2, a4, xsum3 + addi YY, YY, 4 * SIZE + FMADD xsum4, atemp1, a4, xsum4 + LFD a6, 1 * SIZE(AO2) + + FMADD xsum1, atemp3, a3, xsum1 + sub TEMP, M, IS + FMADD xsum2, atemp4, a3, xsum2 + LFD a3, 2 * SIZE(AO1) + FMADD xsum3, atemp3, a7, xsum3 + addi TEMP, TEMP, -2 + FMADD xsum4, atemp4, a7, xsum4 + LFD a7, 2 * SIZE(AO2) + + FMADD1 xsum1, atemp4, a4, xsum1 + srawi. r0, TEMP, 3 + FMADD2 xsum2, atemp3, a4, xsum2 + LFD a4, 3 * SIZE(AO1) +#ifndef HEMV + FMADD1 xsum3, atemp4, a8, xsum3 +#endif + mtspr CTR, r0 +#ifndef HEMV + FMADD2 xsum4, atemp3, a8, xsum4 +#endif + LFD a8, 3 * SIZE(AO2) + + FMUL xtemp1, y05, atemp1 + LFD y01, 0 * SIZE(YY) + FMUL xtemp2, y06, atemp1 + LFD y02, 1 * SIZE(YY) + FMUL xtemp3, y05, atemp3 + LFD y03, 2 * SIZE(YY) + FMUL xtemp4, y06, atemp3 + LFD y04, 3 * SIZE(YY) + + FNMSUB atemp1, y06, atemp2, xtemp1 + LFD xtemp1, 0 * SIZE(XX) + FMADD atemp2, y05, atemp2, xtemp2 + LFD xtemp2, 1 * SIZE(XX) + FNMSUB atemp3, y06, atemp4, xtemp3 + LFD xtemp3, 2 * SIZE(XX) + FMADD atemp4, y05, atemp4, xtemp4 + LFD xtemp4, 3 * SIZE(XX) + + NOP1 + ble LL(15) + + FMADD xsum1, xtemp1, a1, xsum1 + DCBT(AO1, PREA) + FMADD y01, atemp1, a1, y01 + NOP2 + + FMADD xsum2, xtemp2, a1, xsum2 + NOP1 + FMADD y02, atemp2, a1, y02 + LFD a1, 4 * SIZE(AO1) + + FMADD xsum3, xtemp1, a5, xsum3 + NOP1 + FMADD y03, atemp1, a3, y03 + NOP2 + + FMADD xsum4, xtemp2, a5, xsum4 + NOP1 + FMADD y04, atemp2, a3, y04 + NOP2 + + FMADD1 xsum1, xtemp2, a2, xsum1 + LFD y05, 4 * SIZE(YY) + FNMSUB y01, atemp2, a2, y01 + NOP2 + + FMADD2 xsum2, xtemp1, a2, xsum2 + LFD y06, 5 * SIZE(YY) + FMADD y02, atemp1, a2, y02 + LFD a2, 5 * SIZE(AO1) + + FMADD1 xsum3, xtemp2, a6, xsum3 + LFD xtemp2, 5 * SIZE(XX) + FNMSUB y03, atemp2, a4, y03 + NOP2 + + FMADD2 xsum4, xtemp1, a6, xsum4 + LFD xtemp1, 4 * SIZE(XX) + FMADD y04, atemp1, a4, y04 + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD y07, 6 * SIZE(YY) + FMADD y01, atemp3, a5, y01 + NOP2 + + FMADD xsum2, xtemp4, a3, xsum2 + LFD a3, 6 * SIZE(AO1) + FMADD y02, atemp4, a5, y02 + LFD a5, 4 * SIZE(AO2) + + FMADD xsum3, xtemp3, a7, xsum3 + LFD y08, 7 * SIZE(YY) + FMADD y03, atemp3, a7, y03 + NOP2 + + FMADD xsum4, xtemp4, a7, xsum4 + NOP1 + FMADD y04, atemp4, a7, y04 + LFD a7, 6 * SIZE(AO2) + + FMADD1 xsum1, xtemp4, a4, xsum1 + NOP1 + FNMSUB y01, atemp4, a6, y01 +# DCBT(X, PREX) + NOP2 + + 
FMADD2 xsum2, xtemp3, a4, xsum2 + LFD a4, 7 * SIZE(AO1) + FMADD y02, atemp3, a6, y02 + LFD a6, 5 * SIZE(AO2) + + FMADD1 xsum3, xtemp4, a8, xsum3 + LFD xtemp4, 7 * SIZE(XX) + FNMSUB y03, atemp4, a8, y03 + NOP2 + + FMADD2 xsum4, xtemp3, a8, xsum4 + LFD xtemp3, 6 * SIZE(XX) + FMADD y04, atemp3, a8, y04 + LFD a8, 7 * SIZE(AO2) + + FMADD xsum1, xtemp1, a1, xsum1 + STFD y01, 0 * SIZE(YY) + FMADD y05, atemp1, a1, y05 + NOP2 + + FMADD xsum2, xtemp2, a1, xsum2 + STFD y02, 1 * SIZE(YY) + FMADD y06, atemp2, a1, y06 + LFD a1, 8 * SIZE(AO1) + + FMADD xsum3, xtemp1, a5, xsum3 + STFD y03, 2 * SIZE(YY) + FMADD y07, atemp1, a3, y07 + NOP2 + + FMADD xsum4, xtemp2, a5, xsum4 + STFD y04, 3 * SIZE(YY) + FMADD y08, atemp2, a3, y08 + NOP2 + + FMADD1 xsum1, xtemp2, a2, xsum1 + LFD y01, 8 * SIZE(YY) + FNMSUB y05, atemp2, a2, y05 + NOP2 + + FMADD2 xsum2, xtemp1, a2, xsum2 + LFD y02, 9 * SIZE(YY) + FMADD y06, atemp1, a2, y06 + LFD a2, 9 * SIZE(AO1) + + FMADD1 xsum3, xtemp2, a6, xsum3 + LFD xtemp2, 9 * SIZE(XX) + FNMSUB y07, atemp2, a4, y07 + NOP2 + + FMADD2 xsum4, xtemp1, a6, xsum4 + LFD xtemp1, 8 * SIZE(XX) + FMADD y08, atemp1, a4, y08 + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD y03, 10 * SIZE(YY) + FMADD y05, atemp3, a5, y05 + NOP2 + + FMADD xsum2, xtemp4, a3, xsum2 + LFD a3, 10 * SIZE(AO1) + FMADD y06, atemp4, a5, y06 + LFD a5, 8 * SIZE(AO2) + + FMADD xsum3, xtemp3, a7, xsum3 + LFD y04, 11 * SIZE(YY) + FMADD y07, atemp3, a7, y07 + NOP2 + + FMADD xsum4, xtemp4, a7, xsum4 + NOP1 + FMADD y08, atemp4, a7, y08 + LFD a7, 10 * SIZE(AO2) + + FMADD1 xsum1, xtemp4, a4, xsum1 + NOP1 + FNMSUB y05, atemp4, a6, y05 + NOP2 + + FMADD2 xsum2, xtemp3, a4, xsum2 + LFD a4, 11 * SIZE(AO1) + FMADD y06, atemp3, a6, y06 + LFD a6, 9 * SIZE(AO2) + + FMADD1 xsum3, xtemp4, a8, xsum3 + LFD xtemp4, 11 * SIZE(XX) + FNMSUB y07, atemp4, a8, y07 + bdz LL(13) + .align 4 + +LL(12): + FMADD2 xsum4, xtemp3, a8, xsum4 + LFD xtemp3, 10 * SIZE(XX) + FMADD y08, atemp3, a8, y08 + LFD a8, 11 * SIZE(AO2) + + FMADD xsum1, xtemp1, a1, xsum1 + STFD y05, 4 * SIZE(YY) + FMADD y01, atemp1, a1, y01 + DCBT(AO2, PREA) + + FMADD xsum2, xtemp2, a1, xsum2 + STFD y06, 5 * SIZE(YY) + FMADD y02, atemp2, a1, y02 + LFD a1, 12 * SIZE(AO1) + + FMADD xsum3, xtemp1, a5, xsum3 + STFD y07, 6 * SIZE(YY) + FMADD y03, atemp1, a3, y03 + NOP2 + + FMADD xsum4, xtemp2, a5, xsum4 + STFD y08, 7 * SIZE(YY) + FMADD y04, atemp2, a3, y04 + NOP2 + + FMADD1 xsum1, xtemp2, a2, xsum1 + LFD y05, 12 * SIZE(YY) + FNMSUB y01, atemp2, a2, y01 + NOP2 + + FMADD2 xsum2, xtemp1, a2, xsum2 + LFD y06, 13 * SIZE(YY) + FMADD y02, atemp1, a2, y02 + LFD a2, 13 * SIZE(AO1) + + FMADD1 xsum3, xtemp2, a6, xsum3 + LFD xtemp2, 13 * SIZE(XX) + FNMSUB y03, atemp2, a4, y03 + NOP2 + + FMADD2 xsum4, xtemp1, a6, xsum4 + LFD xtemp1, 12 * SIZE(XX) + FMADD y04, atemp1, a4, y04 + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD y07, 14 * SIZE(YY) + FMADD y01, atemp3, a5, y01 + NOP2 + + FMADD xsum2, xtemp4, a3, xsum2 + LFD a3, 14 * SIZE(AO1) + FMADD y02, atemp4, a5, y02 + LFD a5, 12 * SIZE(AO2) + + FMADD xsum3, xtemp3, a7, xsum3 + LFD y08, 15 * SIZE(YY) + FMADD y03, atemp3, a7, y03 + NOP2 + + FMADD xsum4, xtemp4, a7, xsum4 + NOP1 + FMADD y04, atemp4, a7, y04 + LFD a7, 14 * SIZE(AO2) + + FMADD1 xsum1, xtemp4, a4, xsum1 + NOP1 + FNMSUB y01, atemp4, a6, y01 +# DCBT(Y1, PREY) + NOP2 + + FMADD2 xsum2, xtemp3, a4, xsum2 + LFD a4, 15 * SIZE(AO1) + FMADD y02, atemp3, a6, y02 + LFD a6, 13 * SIZE(AO2) + + FMADD1 xsum3, xtemp4, a8, xsum3 + LFD xtemp4, 15 * SIZE(XX) + FNMSUB y03, atemp4, a8, y03 + NOP2 + + FMADD2 xsum4, xtemp3, a8, xsum4 
+ LFD xtemp3, 14 * SIZE(XX) + FMADD y04, atemp3, a8, y04 + LFD a8, 15 * SIZE(AO2) + + FMADD xsum1, xtemp1, a1, xsum1 + STFD y01, 8 * SIZE(YY) + FMADD y05, atemp1, a1, y05 + NOP2 + + FMADD xsum2, xtemp2, a1, xsum2 + STFD y02, 9 * SIZE(YY) + FMADD y06, atemp2, a1, y06 + LFD a1, 16 * SIZE(AO1) + + FMADD xsum3, xtemp1, a5, xsum3 + STFD y03, 10 * SIZE(YY) + FMADD y07, atemp1, a3, y07 + NOP2 + + FMADD xsum4, xtemp2, a5, xsum4 + STFD y04, 11 * SIZE(YY) + FMADD y08, atemp2, a3, y08 + NOP2 + + FMADD1 xsum1, xtemp2, a2, xsum1 + LFD y01, 16 * SIZE(YY) + FNMSUB y05, atemp2, a2, y05 + NOP2 + + FMADD2 xsum2, xtemp1, a2, xsum2 + LFD y02, 17 * SIZE(YY) + FMADD y06, atemp1, a2, y06 + LFD a2, 17 * SIZE(AO1) + + FMADD1 xsum3, xtemp2, a6, xsum3 + LFD xtemp2, 17 * SIZE(XX) + FNMSUB y07, atemp2, a4, y07 + NOP2 + + FMADD2 xsum4, xtemp1, a6, xsum4 + LFD xtemp1, 16 * SIZE(XX) + FMADD y08, atemp1, a4, y08 + addi AO2, AO2, 16 * SIZE + + FMADD xsum1, xtemp3, a3, xsum1 + LFD y03, 18 * SIZE(YY) + FMADD y05, atemp3, a5, y05 + addi XX, XX, 16 * SIZE + + FMADD xsum2, xtemp4, a3, xsum2 + LFD a3, 18 * SIZE(AO1) + FMADD y06, atemp4, a5, y06 + LFD a5, 0 * SIZE(AO2) + + FMADD xsum3, xtemp3, a7, xsum3 + LFD y04, 19 * SIZE(YY) + FMADD y07, atemp3, a7, y07 + NOP2 + + FMADD xsum4, xtemp4, a7, xsum4 + addi AO1, AO1, 16 * SIZE + FMADD y08, atemp4, a7, y08 + LFD a7, 2 * SIZE(AO2) + + FMADD1 xsum1, xtemp4, a4, xsum1 + addi YY, YY, 16 * SIZE + FNMSUB y05, atemp4, a6, y05 + NOP2 + + FMADD2 xsum2, xtemp3, a4, xsum2 + LFD a4, 3 * SIZE(AO1) + FMADD y06, atemp3, a6, y06 + LFD a6, 1 * SIZE(AO2) + + FMADD1 xsum3, xtemp4, a8, xsum3 + LFD xtemp4, 3 * SIZE(XX) + FNMSUB y07, atemp4, a8, y07 + NOP2 + + FMADD2 xsum4, xtemp3, a8, xsum4 + LFD xtemp3, 2 * SIZE(XX) + FMADD y08, atemp3, a8, y08 + LFD a8, 3 * SIZE(AO2) + + FMADD xsum1, xtemp1, a1, xsum1 + STFD y05, -4 * SIZE(YY) + FMADD y01, atemp1, a1, y01 + DCBT(AO1, PREA) + + FMADD xsum2, xtemp2, a1, xsum2 + STFD y06, -3 * SIZE(YY) + FMADD y02, atemp2, a1, y02 + LFD a1, 4 * SIZE(AO1) + + FMADD xsum3, xtemp1, a5, xsum3 + STFD y07, -2 * SIZE(YY) + FMADD y03, atemp1, a3, y03 + NOP2 + + FMADD xsum4, xtemp2, a5, xsum4 + STFD y08, -1 * SIZE(YY) + FMADD y04, atemp2, a3, y04 + NOP2 + + FMADD1 xsum1, xtemp2, a2, xsum1 + LFD y05, 4 * SIZE(YY) + FNMSUB y01, atemp2, a2, y01 + NOP2 + + FMADD2 xsum2, xtemp1, a2, xsum2 + LFD y06, 5 * SIZE(YY) + FMADD y02, atemp1, a2, y02 + LFD a2, 5 * SIZE(AO1) + + FMADD1 xsum3, xtemp2, a6, xsum3 + LFD xtemp2, 5 * SIZE(XX) + FNMSUB y03, atemp2, a4, y03 + NOP2 + + FMADD2 xsum4, xtemp1, a6, xsum4 + LFD xtemp1, 4 * SIZE(XX) + FMADD y04, atemp1, a4, y04 + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD y07, 6 * SIZE(YY) + FMADD y01, atemp3, a5, y01 + NOP2 + + FMADD xsum2, xtemp4, a3, xsum2 + LFD a3, 6 * SIZE(AO1) + FMADD y02, atemp4, a5, y02 + LFD a5, 4 * SIZE(AO2) + + FMADD xsum3, xtemp3, a7, xsum3 + LFD y08, 7 * SIZE(YY) + FMADD y03, atemp3, a7, y03 + NOP2 + + FMADD xsum4, xtemp4, a7, xsum4 + NOP1 + FMADD y04, atemp4, a7, y04 + LFD a7, 6 * SIZE(AO2) + + FMADD1 xsum1, xtemp4, a4, xsum1 + NOP1 + FNMSUB y01, atemp4, a6, y01 +# DCBT(X, PREX) + NOP2 + + FMADD2 xsum2, xtemp3, a4, xsum2 + LFD a4, 7 * SIZE(AO1) + FMADD y02, atemp3, a6, y02 + LFD a6, 5 * SIZE(AO2) + + FMADD1 xsum3, xtemp4, a8, xsum3 + LFD xtemp4, 7 * SIZE(XX) + FNMSUB y03, atemp4, a8, y03 + NOP2 + + FMADD2 xsum4, xtemp3, a8, xsum4 + LFD xtemp3, 6 * SIZE(XX) + FMADD y04, atemp3, a8, y04 + LFD a8, 7 * SIZE(AO2) + + FMADD xsum1, xtemp1, a1, xsum1 + STFD y01, 0 * SIZE(YY) + FMADD y05, atemp1, a1, y05 + NOP2 + + FMADD xsum2, xtemp2, 
a1, xsum2 + STFD y02, 1 * SIZE(YY) + FMADD y06, atemp2, a1, y06 + LFD a1, 8 * SIZE(AO1) + + FMADD xsum3, xtemp1, a5, xsum3 + STFD y03, 2 * SIZE(YY) + FMADD y07, atemp1, a3, y07 + NOP2 + + FMADD xsum4, xtemp2, a5, xsum4 + STFD y04, 3 * SIZE(YY) + FMADD y08, atemp2, a3, y08 + NOP2 + + FMADD1 xsum1, xtemp2, a2, xsum1 + LFD y01, 8 * SIZE(YY) + FNMSUB y05, atemp2, a2, y05 + NOP2 + + FMADD2 xsum2, xtemp1, a2, xsum2 + LFD y02, 9 * SIZE(YY) + FMADD y06, atemp1, a2, y06 + LFD a2, 9 * SIZE(AO1) + + FMADD1 xsum3, xtemp2, a6, xsum3 + LFD xtemp2, 9 * SIZE(XX) + FNMSUB y07, atemp2, a4, y07 + NOP2 + + FMADD2 xsum4, xtemp1, a6, xsum4 + LFD xtemp1, 8 * SIZE(XX) + FMADD y08, atemp1, a4, y08 + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD y03, 10 * SIZE(YY) + FMADD y05, atemp3, a5, y05 + NOP2 + + FMADD xsum2, xtemp4, a3, xsum2 + LFD a3, 10 * SIZE(AO1) + FMADD y06, atemp4, a5, y06 + LFD a5, 8 * SIZE(AO2) + + FMADD xsum3, xtemp3, a7, xsum3 + LFD y04, 11 * SIZE(YY) + FMADD y07, atemp3, a7, y07 + NOP2 + + FMADD xsum4, xtemp4, a7, xsum4 + NOP1 + FMADD y08, atemp4, a7, y08 + LFD a7, 10 * SIZE(AO2) + + FMADD1 xsum1, xtemp4, a4, xsum1 + NOP1 + FNMSUB y05, atemp4, a6, y05 + NOP2 + + FMADD2 xsum2, xtemp3, a4, xsum2 + LFD a4, 11 * SIZE(AO1) + FMADD y06, atemp3, a6, y06 + LFD a6, 9 * SIZE(AO2) + + FMADD1 xsum3, xtemp4, a8, xsum3 + LFD xtemp4, 11 * SIZE(XX) + FNMSUB y07, atemp4, a8, y07 + bdnz LL(12) + .align 4 + +LL(13): + FMADD2 xsum4, xtemp3, a8, xsum4 + LFD xtemp3, 10 * SIZE(XX) + FMADD y08, atemp3, a8, y08 + LFD a8, 11 * SIZE(AO2) + + FMADD xsum1, xtemp1, a1, xsum1 + STFD y05, 4 * SIZE(YY) + FMADD y01, atemp1, a1, y01 + NOP2 + + FMADD xsum2, xtemp2, a1, xsum2 + STFD y06, 5 * SIZE(YY) + FMADD y02, atemp2, a1, y02 + LFD a1, 12 * SIZE(AO1) + + FMADD xsum3, xtemp1, a5, xsum3 + STFD y07, 6 * SIZE(YY) + FMADD y03, atemp1, a3, y03 + NOP2 + + FMADD xsum4, xtemp2, a5, xsum4 + STFD y08, 7 * SIZE(YY) + FMADD y04, atemp2, a3, y04 + NOP2 + + FMADD1 xsum1, xtemp2, a2, xsum1 + LFD y05, 12 * SIZE(YY) + FNMSUB y01, atemp2, a2, y01 + NOP2 + + FMADD2 xsum2, xtemp1, a2, xsum2 + LFD y06, 13 * SIZE(YY) + FMADD y02, atemp1, a2, y02 + LFD a2, 13 * SIZE(AO1) + + FMADD1 xsum3, xtemp2, a6, xsum3 + LFD xtemp2, 13 * SIZE(XX) + FNMSUB y03, atemp2, a4, y03 + NOP2 + + FMADD2 xsum4, xtemp1, a6, xsum4 + LFD xtemp1, 12 * SIZE(XX) + FMADD y04, atemp1, a4, y04 + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD y07, 14 * SIZE(YY) + FMADD y01, atemp3, a5, y01 + NOP2 + + FMADD xsum2, xtemp4, a3, xsum2 + LFD a3, 14 * SIZE(AO1) + FMADD y02, atemp4, a5, y02 + LFD a5, 12 * SIZE(AO2) + + FMADD xsum3, xtemp3, a7, xsum3 + LFD y08, 15 * SIZE(YY) + FMADD y03, atemp3, a7, y03 + NOP2 + + FMADD xsum4, xtemp4, a7, xsum4 + NOP1 + FMADD y04, atemp4, a7, y04 + LFD a7, 14 * SIZE(AO2) + + FMADD1 xsum1, xtemp4, a4, xsum1 + NOP1 + FNMSUB y01, atemp4, a6, y01 + NOP2 + + FMADD2 xsum2, xtemp3, a4, xsum2 + LFD a4, 15 * SIZE(AO1) + FMADD y02, atemp3, a6, y02 + LFD a6, 13 * SIZE(AO2) + + FMADD1 xsum3, xtemp4, a8, xsum3 + LFD xtemp4, 15 * SIZE(XX) + FNMSUB y03, atemp4, a8, y03 + NOP2 + + FMADD2 xsum4, xtemp3, a8, xsum4 + LFD xtemp3, 14 * SIZE(XX) + FMADD y04, atemp3, a8, y04 + LFD a8, 15 * SIZE(AO2) + + FMADD xsum1, xtemp1, a1, xsum1 + STFD y01, 8 * SIZE(YY) + FMADD y05, atemp1, a1, y05 + NOP2 + + FMADD xsum2, xtemp2, a1, xsum2 + STFD y02, 9 * SIZE(YY) + FMADD y06, atemp2, a1, y06 + LFD a1, 16 * SIZE(AO1) + + FMADD xsum3, xtemp1, a5, xsum3 + STFD y03, 10 * SIZE(YY) + FMADD y07, atemp1, a3, y07 + NOP2 + + FMADD xsum4, xtemp2, a5, xsum4 + STFD y04, 11 * SIZE(YY) + FMADD y08, atemp2, a3, 
y08 + NOP2 + + FMADD1 xsum1, xtemp2, a2, xsum1 + LFD y01, 16 * SIZE(YY) + FNMSUB y05, atemp2, a2, y05 + NOP2 + + FMADD2 xsum2, xtemp1, a2, xsum2 + LFD y02, 17 * SIZE(YY) + FMADD y06, atemp1, a2, y06 + LFD a2, 17 * SIZE(AO1) + + FMADD1 xsum3, xtemp2, a6, xsum3 + LFD xtemp2, 17 * SIZE(XX) + FNMSUB y07, atemp2, a4, y07 + NOP2 + + FMADD2 xsum4, xtemp1, a6, xsum4 + LFD xtemp1, 16 * SIZE(XX) + FMADD y08, atemp1, a4, y08 + addi AO2, AO2, 16 * SIZE + + FMADD xsum1, xtemp3, a3, xsum1 + LFD y03, 18 * SIZE(YY) + FMADD y05, atemp3, a5, y05 + addi XX, XX, 16 * SIZE + + FMADD xsum2, xtemp4, a3, xsum2 + LFD a3, 18 * SIZE(AO1) + FMADD y06, atemp4, a5, y06 + LFD a5, 0 * SIZE(AO2) + + FMADD xsum3, xtemp3, a7, xsum3 + LFD y04, 19 * SIZE(YY) + FMADD y07, atemp3, a7, y07 + NOP2 + + FMADD xsum4, xtemp4, a7, xsum4 + addi AO1, AO1, 16 * SIZE + FMADD y08, atemp4, a7, y08 + LFD a7, 2 * SIZE(AO2) + + FMADD1 xsum1, xtemp4, a4, xsum1 + addi YY, YY, 16 * SIZE + FNMSUB y05, atemp4, a6, y05 + NOP2 + + FMADD2 xsum2, xtemp3, a4, xsum2 + LFD a4, 3 * SIZE(AO1) + FMADD y06, atemp3, a6, y06 + LFD a6, 1 * SIZE(AO2) + + FMADD1 xsum3, xtemp4, a8, xsum3 + LFD xtemp4, 3 * SIZE(XX) + FNMSUB y07, atemp4, a8, y07 + NOP2 + + FMADD2 xsum4, xtemp3, a8, xsum4 + LFD xtemp3, 2 * SIZE(XX) + FMADD y08, atemp3, a8, y08 + LFD a8, 3 * SIZE(AO2) + + STFD y05, -4 * SIZE(YY) + STFD y06, -3 * SIZE(YY) + STFD y07, -2 * SIZE(YY) + STFD y08, -1 * SIZE(YY) + .align 4 + +LL(15): + andi. r0, TEMP, 4 + ble LL(16) + + FMADD xsum1, xtemp1, a1, xsum1 + NOP1 + FMADD y01, atemp1, a1, y01 + NOP2 + + FMADD xsum2, xtemp2, a1, xsum2 + NOP1 + FMADD y02, atemp2, a1, y02 + LFD a1, 4 * SIZE(AO1) + + FMADD xsum3, xtemp1, a5, xsum3 + NOP1 + FMADD y03, atemp1, a3, y03 + NOP2 + + FMADD xsum4, xtemp2, a5, xsum4 + NOP1 + FMADD y04, atemp2, a3, y04 + NOP2 + + FMADD1 xsum1, xtemp2, a2, xsum1 + LFD y05, 4 * SIZE(YY) + FNMSUB y01, atemp2, a2, y01 + NOP2 + + FMADD2 xsum2, xtemp1, a2, xsum2 + LFD y06, 5 * SIZE(YY) + FMADD y02, atemp1, a2, y02 + LFD a2, 5 * SIZE(AO1) + + FMADD1 xsum3, xtemp2, a6, xsum3 + LFD xtemp2, 5 * SIZE(XX) + FNMSUB y03, atemp2, a4, y03 + NOP2 + + FMADD2 xsum4, xtemp1, a6, xsum4 + LFD xtemp1, 4 * SIZE(XX) + FMADD y04, atemp1, a4, y04 + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD y07, 6 * SIZE(YY) + FMADD y01, atemp3, a5, y01 + NOP2 + + FMADD xsum2, xtemp4, a3, xsum2 + LFD a3, 6 * SIZE(AO1) + FMADD y02, atemp4, a5, y02 + LFD a5, 4 * SIZE(AO2) + + FMADD xsum3, xtemp3, a7, xsum3 + LFD y08, 7 * SIZE(YY) + FMADD y03, atemp3, a7, y03 + NOP2 + + FMADD xsum4, xtemp4, a7, xsum4 + NOP1 + FMADD y04, atemp4, a7, y04 + LFD a7, 6 * SIZE(AO2) + + FMADD1 xsum1, xtemp4, a4, xsum1 + NOP1 + FNMSUB y01, atemp4, a6, y01 + NOP2 + + FMADD2 xsum2, xtemp3, a4, xsum2 + LFD a4, 7 * SIZE(AO1) + FMADD y02, atemp3, a6, y02 + LFD a6, 5 * SIZE(AO2) + + FMADD1 xsum3, xtemp4, a8, xsum3 + LFD xtemp4, 7 * SIZE(XX) + FNMSUB y03, atemp4, a8, y03 + NOP2 + + FMADD2 xsum4, xtemp3, a8, xsum4 + LFD xtemp3, 6 * SIZE(XX) + FMADD y04, atemp3, a8, y04 + LFD a8, 7 * SIZE(AO2) + + FMADD xsum1, xtemp1, a1, xsum1 + STFD y01, 0 * SIZE(YY) + FMADD y05, atemp1, a1, y05 + NOP2 + + FMADD xsum2, xtemp2, a1, xsum2 + STFD y02, 1 * SIZE(YY) + FMADD y06, atemp2, a1, y06 + LFD a1, 8 * SIZE(AO1) + + FMADD xsum3, xtemp1, a5, xsum3 + STFD y03, 2 * SIZE(YY) + FMADD y07, atemp1, a3, y07 + NOP2 + + FMADD xsum4, xtemp2, a5, xsum4 + STFD y04, 3 * SIZE(YY) + FMADD y08, atemp2, a3, y08 + NOP2 + + FMADD1 xsum1, xtemp2, a2, xsum1 + LFD y01, 8 * SIZE(YY) + FNMSUB y05, atemp2, a2, y05 + NOP2 + + FMADD2 xsum2, xtemp1, a2, xsum2 + LFD 
y02, 9 * SIZE(YY) + FMADD y06, atemp1, a2, y06 + LFD a2, 9 * SIZE(AO1) + + FMADD1 xsum3, xtemp2, a6, xsum3 + LFD xtemp2, 9 * SIZE(XX) + FNMSUB y07, atemp2, a4, y07 + NOP2 + + FMADD2 xsum4, xtemp1, a6, xsum4 + LFD xtemp1, 8 * SIZE(XX) + FMADD y08, atemp1, a4, y08 + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD y03, 10 * SIZE(YY) + FMADD y05, atemp3, a5, y05 + NOP2 + + FMADD xsum2, xtemp4, a3, xsum2 + LFD a3, 10 * SIZE(AO1) + FMADD y06, atemp4, a5, y06 + LFD a5, 8 * SIZE(AO2) + + FMADD xsum3, xtemp3, a7, xsum3 + LFD y04, 11 * SIZE(YY) + FMADD y07, atemp3, a7, y07 + NOP2 + + FMADD xsum4, xtemp4, a7, xsum4 + addi YY, YY, 8 * SIZE + FMADD y08, atemp4, a7, y08 + LFD a7, 10 * SIZE(AO2) + + FMADD1 xsum1, xtemp4, a4, xsum1 + addi AO2, AO2, 8 * SIZE + FNMSUB y05, atemp4, a6, y05 + addi XX, XX, 8 * SIZE + + FMADD2 xsum2, xtemp3, a4, xsum2 + LFD a4, 11 * SIZE(AO1) + FMADD y06, atemp3, a6, y06 + LFD a6, 1 * SIZE(AO2) + + FMADD1 xsum3, xtemp4, a8, xsum3 + LFD xtemp4, 3 * SIZE(XX) + FNMSUB y07, atemp4, a8, y07 + addi AO1, AO1, 8 * SIZE + + FMADD2 xsum4, xtemp3, a8, xsum4 + LFD xtemp3, 2 * SIZE(XX) + FMADD y08, atemp3, a8, y08 + LFD a8, 3 * SIZE(AO2) + + STFD y05, -4 * SIZE(YY) + STFD y06, -3 * SIZE(YY) + STFD y07, -2 * SIZE(YY) + STFD y08, -1 * SIZE(YY) + .align 4 + +LL(16): + andi. r0, TEMP, 2 + ble LL(17) + + FMADD xsum1, xtemp1, a1, xsum1 + NOP1 + FMADD y01, atemp1, a1, y01 + NOP2 + + FMADD xsum2, xtemp2, a1, xsum2 + NOP1 + FMADD y02, atemp2, a1, y02 + LFD a1, 4 * SIZE(AO1) + + FMADD xsum3, xtemp1, a5, xsum3 + FMADD y03, atemp1, a3, y03 + FMADD xsum4, xtemp2, a5, xsum4 + FMADD y04, atemp2, a3, y04 + + FMADD1 xsum1, xtemp2, a2, xsum1 + NOP1 + FNMSUB y01, atemp2, a2, y01 + NOP2 + + FMADD2 xsum2, xtemp1, a2, xsum2 + NOP1 + FMADD y02, atemp1, a2, y02 + LFD a2, 5 * SIZE(AO1) + + FMADD1 xsum3, xtemp2, a6, xsum3 + LFD xtemp2, 5 * SIZE(XX) + FNMSUB y03, atemp2, a4, y03 + NOP2 + + FMADD2 xsum4, xtemp1, a6, xsum4 + LFD xtemp1, 4 * SIZE(XX) + FMADD y04, atemp1, a4, y04 + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + NOP1 + FMADD y01, atemp3, a5, y01 + NOP2 + + FMADD xsum2, xtemp4, a3, xsum2 + NOP1 + FMADD y02, atemp4, a5, y02 + LFD a5, 4 * SIZE(AO2) + + FMADD xsum3, xtemp3, a7, xsum3 + FMADD y03, atemp3, a7, y03 + FMADD xsum4, xtemp4, a7, xsum4 + FMADD y04, atemp4, a7, y04 + + FMADD1 xsum1, xtemp4, a4, xsum1 + NOP1 + FNMSUB y01, atemp4, a6, y01 + NOP2 + + FMADD2 xsum2, xtemp3, a4, xsum2 + NOP1 + FMADD y02, atemp3, a6, y02 + LFD a6, 5 * SIZE(AO2) + + FMADD1 xsum3, xtemp4, a8, xsum3 + addi AO1, AO1, 4 * SIZE + FNMSUB y03, atemp4, a8, y03 + addi AO2, AO2, 4 * SIZE + FMADD2 xsum4, xtemp3, a8, xsum4 + addi YY, YY, 4 * SIZE + FMADD y04, atemp3, a8, y04 + NOP2 + + STFD y01, -4 * SIZE(YY) + LFD y01, 0 * SIZE(YY) + STFD y02, -3 * SIZE(YY) + LFD y02, 1 * SIZE(YY) + + STFD y03, -2 * SIZE(YY) + STFD y04, -1 * SIZE(YY) + .align 4 + +LL(17): + andi. 
r0, M, 1 + ble LL(18) + + FMADD xsum1, xtemp1, a1, xsum1 + FMADD y01, atemp1, a1, y01 + FMADD xsum2, xtemp2, a1, xsum2 + FMADD y02, atemp2, a1, y02 + FMADD xsum3, xtemp1, a5, xsum3 + FNMSUB y01, atemp2, a2, y01 + FMADD xsum4, xtemp2, a5, xsum4 + FMADD y02, atemp1, a2, y02 + + FMADD1 xsum1, xtemp2, a2, xsum1 + FMADD y01, atemp3, a5, y01 + FMADD2 xsum2, xtemp1, a2, xsum2 + FMADD y02, atemp4, a5, y02 + FMADD1 xsum3, xtemp2, a6, xsum3 + FNMSUB y01, atemp4, a6, y01 + FMADD2 xsum4, xtemp1, a6, xsum4 + FMADD y02, atemp3, a6, y02 + + STFD y01, 0 * SIZE(YY) + STFD y02, 1 * SIZE(YY) + STFD y03, 2 * SIZE(YY) + STFD y04, 3 * SIZE(YY) + .align 4 + +LL(18): + LFD y05, ALPHA_R + LFD y06, ALPHA_I + + slwi TEMP, IS, ZBASE_SHIFT + add YY, NEW_Y, TEMP + + LFD y01, 0 * SIZE(YY) + LFD y02, 1 * SIZE(YY) + LFD y03, 2 * SIZE(YY) + LFD y04, 3 * SIZE(YY) + + FMUL xtemp1, y05, xsum1 + FMUL xtemp2, y06, xsum1 + FMUL xtemp3, y05, xsum3 + FMUL xtemp4, y06, xsum3 + + FNMSUB xsum1, y06, xsum2, xtemp1 + FMADD xsum2, y05, xsum2, xtemp2 + FNMSUB xsum3, y06, xsum4, xtemp3 + FMADD xsum4, y05, xsum4, xtemp4 + + FADD y01, y01, xsum1 + FADD y02, y02, xsum2 + FADD y03, y03, xsum3 + FADD y04, y04, xsum4 + + STFD y01, 0 * SIZE(YY) + addi TEMP, IS, 4 + STFD y02, 1 * SIZE(YY) + addi IS, IS, 2 + STFD y03, 2 * SIZE(YY) + cmpw cr0, TEMP, N + STFD y04, 3 * SIZE(YY) + ble LL(11) + .align 4 + +LL(20): + andi. TEMP, N, 1 + ble LL(990) + + slwi TEMP, IS, ZBASE_SHIFT + add XX, X, TEMP + add YY, NEW_Y, TEMP + + LFD y05, ALPHA_R + LFD y06, ALPHA_I + + LFD atemp1, 0 * SIZE(XX) + LFD atemp2, 1 * SIZE(XX) + + LFD a1, 0 * SIZE(A) + LFD a2, 1 * SIZE(A) + + FMUL xsum1, atemp1, a1 + FMUL xsum2, atemp2, a1 + +#ifndef HEMV + FNMSUB xsum1, atemp2, a2, xsum1 + FMADD xsum2, atemp1, a2, xsum2 +#endif + + FMUL xtemp1, y05, atemp1 + FMUL xtemp2, y06, atemp1 + + FNMSUB atemp1, y06, atemp2, xtemp1 + FMADD atemp2, y05, atemp2, xtemp2 + + LFD y05, ALPHA_R + LFD y06, ALPHA_I + + LFD y01, 0 * SIZE(YY) + LFD y02, 1 * SIZE(YY) + + FMUL xtemp1, y05, xsum1 + FMUL xtemp2, y06, xsum1 + + FNMSUB xsum1, y06, xsum2, xtemp1 + FMADD xsum2, y05, xsum2, xtemp2 + + FADD y01, y01, xsum1 + FADD y02, y02, xsum2 + + STFD y01, 0 * SIZE(YY) + STFD y02, 1 * SIZE(YY) + .align 4 + +LL(990): + cmpwi cr0, INCY, 2 * SIZE + beq LL(999) + + mr YY, Y + + srawi. r0, M, 2 + mtspr CTR, r0 + ble LL(995) + .align 4 + +LL(991): + LFD f0, 0 * SIZE(Y) + LFD f1, 1 * SIZE(Y) + add Y, Y, INCY + LFD f2, 0 * SIZE(Y) + LFD f3, 1 * SIZE(Y) + add Y, Y, INCY + LFD f4, 0 * SIZE(Y) + LFD f5, 1 * SIZE(Y) + add Y, Y, INCY + LFD f6, 0 * SIZE(Y) + LFD f7, 1 * SIZE(Y) + add Y, Y, INCY + + LFD f8, 0 * SIZE(NEW_Y) + LFD f9, 1 * SIZE(NEW_Y) + LFD f10, 2 * SIZE(NEW_Y) + LFD f11, 3 * SIZE(NEW_Y) + LFD f12, 4 * SIZE(NEW_Y) + LFD f13, 5 * SIZE(NEW_Y) + LFD f14, 6 * SIZE(NEW_Y) + LFD f15, 7 * SIZE(NEW_Y) + addi NEW_Y, NEW_Y, 8 * SIZE + + FADD f8, f8, f0 + FADD f9, f9, f1 + FADD f10, f10, f2 + FADD f11, f11, f3 + FADD f12, f12, f4 + FADD f13, f13, f5 + FADD f14, f14, f6 + FADD f15, f15, f7 + + STFD f8, 0 * SIZE(YY) + STFD f9, 1 * SIZE(YY) + add YY, YY, INCY + STFD f10, 0 * SIZE(YY) + STFD f11, 1 * SIZE(YY) + add YY, YY, INCY + STFD f12, 0 * SIZE(YY) + STFD f13, 1 * SIZE(YY) + add YY, YY, INCY + STFD f14, 0 * SIZE(YY) + STFD f15, 1 * SIZE(YY) + add YY, YY, INCY + bdnz LL(991) + .align 4 + +LL(995): + andi. 
J, M, 2 + ble LL(996) + + LFD f0, 0 * SIZE(Y) + LFD f1, 1 * SIZE(Y) + add Y, Y, INCY + LFD f2, 0 * SIZE(Y) + LFD f3, 1 * SIZE(Y) + add Y, Y, INCY + + LFD f8, 0 * SIZE(NEW_Y) + LFD f9, 1 * SIZE(NEW_Y) + LFD f10, 2 * SIZE(NEW_Y) + LFD f11, 3 * SIZE(NEW_Y) + addi NEW_Y, NEW_Y, 4 * SIZE + + FADD f8, f8, f0 + FADD f9, f9, f1 + FADD f10, f10, f2 + FADD f11, f11, f3 + + STFD f8, 0 * SIZE(YY) + STFD f9, 1 * SIZE(YY) + add YY, YY, INCY + STFD f10, 0 * SIZE(YY) + STFD f11, 1 * SIZE(YY) + add YY, YY, INCY + .align 4 + +LL(996): + andi. J, M, 1 + ble LL(999) + + LFD f0, 0 * SIZE(Y) + LFD f1, 1 * SIZE(Y) + + LFD f8, 0 * SIZE(NEW_Y) + LFD f9, 1 * SIZE(NEW_Y) + + FADD f8, f8, f0 + FADD f9, f9, f1 + + STFD f8, 0 * SIZE(YY) + STFD f9, 1 * SIZE(YY) + .align 4 + +LL(999): + li r3, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r14, 144(SP) + ld r15, 152(SP) + ld r16, 160(SP) + ld r17, 168(SP) + ld r18, 176(SP) + ld r19, 184(SP) + ld r20, 192(SP) + ld r21, 200(SP) + ld r22, 208(SP) + ld r23, 216(SP) + ld r24, 224(SP) + ld r25, 232(SP) + ld r26, 240(SP) + ld r27, 248(SP) +#else + lwz r14, 144(SP) + lwz r15, 148(SP) + lwz r16, 152(SP) + lwz r17, 156(SP) + lwz r18, 160(SP) + lwz r19, 164(SP) + lwz r20, 168(SP) + lwz r21, 172(SP) + lwz r22, 176(SP) + lwz r23, 180(SP) + lwz r24, 184(SP) + lwz r25, 188(SP) + lwz r26, 192(SP) + lwz r27, 196(SP) +#endif + + addi SP, SP, STACKSIZE + blr + + EPILOGUE +#endif diff --git a/kernel/power/zsymv_U.S b/kernel/power/zsymv_U.S new file mode 100644 index 0000000000..dbf6ebb1d0 --- /dev/null +++ b/kernel/power/zsymv_U.S @@ -0,0 +1,1653 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef linux +#ifndef __64BIT__ +#define M r3 +#define IS r4 +#define A r5 +#define LDA r6 +#define X r7 +#define INCX r8 +#define Y r9 +#define INCY r10 +#define BUFFER r14 +#else +#define M r3 +#define IS r4 +#define A r7 +#define LDA r8 +#define X r9 +#define INCX r10 +#define Y r5 +#define INCY r6 +#define BUFFER r14 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define M r3 +#define IS r4 +#define A r9 +#define LDA r10 +#define X r5 +#define INCX r6 +#define Y r7 +#define INCY r8 +#define BUFFER r14 +#else +#define M r3 +#define IS r4 +#define A r7 +#define LDA r8 +#define X r9 +#define INCX r10 +#define Y r5 +#define INCY r6 +#define BUFFER r14 +#endif +#endif + +#define I r11 +#define J r12 + +#define AO1 r15 +#define AO2 r16 +#define XX r19 +#define YY r20 +#define NEW_Y r21 +#define TEMP r22 +#define PREA r24 + +#define y01 f0 +#define y02 f1 +#define y03 f2 +#define y04 f3 +#define y05 f4 +#define y06 f5 +#define y07 f6 +#define y08 f7 + +#define xtemp1 f8 +#define xtemp2 f9 +#define xtemp3 f10 +#define xtemp4 f11 +#define xtemp5 f12 +#define xtemp6 f13 +#define xtemp7 f14 +#define xtemp8 f15 + +#define atemp1 f16 +#define atemp2 f17 +#define atemp3 f18 +#define atemp4 f19 + +#define xsum1 f20 +#define xsum2 f21 +#define xsum3 f22 +#define xsum4 f23 + +#define a1 f24 +#define a2 f25 +#define a3 f26 +#define a4 f27 +#define a5 f28 +#define a6 f29 +#define a7 f30 +#define a8 f31 + +#define alpha_r f1 +#define alpha_i f2 + +#if defined(PPCG4) +#define PREFETCHSIZE_A 24 +#endif + +#if defined(PPC440) || defined(PPC440FP2) +#define PREFETCHSIZE_A 24 +#endif + +#ifdef PPC970 +#define PREFETCHSIZE_A 32 +#endif + +#ifdef CELL +#define PREFETCHSIZE_A 72 +#endif + +#ifdef POWER4 +#define PREFETCHSIZE_A 16 +#endif + +#ifdef POWER5 +#define PREFETCHSIZE_A 96 +#endif + +#ifdef POWER6 +#define PREFETCHSIZE_A 112 +#endif + +#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) +#define NOP1 +#define NOP2 +#else +#define NOP1 mr LDA, LDA +#define NOP2 mr INCX, INCX +#endif + +#ifndef NEEDPARAM + +#ifndef __64BIT__ +#define STACKSIZE 224 +#define ALPHA_R 200(SP) +#define ALPHA_I 208(SP) +#define FZERO 216(SP) +#else +#define STACKSIZE 280 +#define ALPHA_R 256(SP) +#define ALPHA_I 264(SP) +#define FZERO 272(SP) +#endif + +#ifndef HEMV +#define FMADD1 FNMSUB +#define FMADD2 FMADD +#else +#define FMADD1 FMADD +#define FMADD2 FNMSUB +#endif + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r0, FZERO + std r14, 144(SP) + std r15, 152(SP) + std r16, 160(SP) + std r17, 168(SP) + std r18, 176(SP) + std r19, 184(SP) + std r20, 192(SP) + std r21, 200(SP) + std r22, 208(SP) + std r23, 216(SP) + std r24, 224(SP) + std r25, 232(SP) + std r26, 240(SP) + std r27, 248(SP) +#else 
+ stw r0, 0 + FZERO + stw r0, 4 + FZERO + stw r14, 144(SP) + stw r15, 148(SP) + stw r16, 152(SP) + stw r17, 156(SP) + stw r18, 160(SP) + stw r19, 164(SP) + stw r20, 168(SP) + stw r21, 172(SP) + stw r22, 176(SP) + stw r23, 180(SP) + stw r24, 184(SP) + stw r25, 188(SP) + stw r26, 192(SP) + stw r27, 196(SP) +#endif + +#ifdef linux +#ifndef __64BIT__ + lwz BUFFER, 56 + STACKSIZE(SP) +#else + ld Y, 112 + STACKSIZE(SP) + ld INCY, 120 + STACKSIZE(SP) + ld BUFFER, 128 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifndef __64BIT__ +#ifdef DOUBLE + lwz X, 56 + STACKSIZE(SP) + lwz INCX, 60 + STACKSIZE(SP) + lwz Y, 64 + STACKSIZE(SP) + lwz INCY, 68 + STACKSIZE(SP) + lwz BUFFER, 72 + STACKSIZE(SP) +#else + lwz Y, 56 + STACKSIZE(SP) + lwz INCY, 60 + STACKSIZE(SP) + lwz BUFFER, 64 + STACKSIZE(SP) +#endif +#else + ld Y, 112 + STACKSIZE(SP) + ld INCY, 120 + STACKSIZE(SP) + ld BUFFER, 128 + STACKSIZE(SP) +#endif +#endif + + STFD alpha_r, ALPHA_R + STFD alpha_i, ALPHA_I + + slwi LDA, LDA, ZBASE_SHIFT + slwi INCX, INCX, ZBASE_SHIFT + slwi INCY, INCY, ZBASE_SHIFT + + li PREA, PREFETCHSIZE_A * SIZE + sub IS, M, IS + + cmpwi cr0, M, 0 + ble- LL(999) + + mullw TEMP, IS, LDA + add A, A, TEMP + + cmpwi cr0, INCX, 2 * SIZE + beq LL(05) + + mr XX, X + mr X, BUFFER + + srawi. r0, M, 2 + mtspr CTR, r0 + ble LL(03) + .align 4 + +LL(01): + LFD a1, 0 * SIZE(XX) + LFD a2, 1 * SIZE(XX) + add XX, XX, INCX + LFD a3, 0 * SIZE(XX) + LFD a4, 1 * SIZE(XX) + add XX, XX, INCX + LFD a5, 0 * SIZE(XX) + LFD a6, 1 * SIZE(XX) + add XX, XX, INCX + LFD a7, 0 * SIZE(XX) + LFD a8, 1 * SIZE(XX) + add XX, XX, INCX + + dcbt XX, PREA + dcbtst BUFFER, PREA + + STFD a1, 0 * SIZE(BUFFER) + STFD a2, 1 * SIZE(BUFFER) + STFD a3, 2 * SIZE(BUFFER) + STFD a4, 3 * SIZE(BUFFER) + STFD a5, 4 * SIZE(BUFFER) + STFD a6, 5 * SIZE(BUFFER) + STFD a7, 6 * SIZE(BUFFER) + STFD a8, 7 * SIZE(BUFFER) + + addi BUFFER, BUFFER, 8 * SIZE + bdnz LL(01) + .align 4 + +LL(03): + andi. r0, M, 3 + mtspr CTR, r0 + ble LL(05) + .align 4 + +LL(04): + LFD a1, 0 * SIZE(XX) + LFD a2, 1 * SIZE(XX) + add XX, XX, INCX + + STFD a1, 0 * SIZE(BUFFER) + STFD a2, 1 * SIZE(BUFFER) + + addi BUFFER, BUFFER, 2 * SIZE + bdnz LL(04) + .align 4 + +LL(05): + mr NEW_Y, Y + lfd f0, FZERO + + cmpwi cr0, INCY, 2 * SIZE + beq LL(10) + + mr NEW_Y, BUFFER + + addi r0, M, 3 + srawi. 
r0, r0, 2 + mtspr CTR, r0 + .align 4 + +LL(06): + STFD f0, 0 * SIZE(BUFFER) + STFD f0, 1 * SIZE(BUFFER) + STFD f0, 2 * SIZE(BUFFER) + STFD f0, 3 * SIZE(BUFFER) + STFD f0, 4 * SIZE(BUFFER) + STFD f0, 5 * SIZE(BUFFER) + STFD f0, 6 * SIZE(BUFFER) + STFD f0, 7 * SIZE(BUFFER) + addi BUFFER, BUFFER, 8 * SIZE + bdnz LL(06) + .align 4 + +LL(10): + addi TEMP, IS, 2 + cmpw cr0, TEMP, M + bgt LL(20) + .align 4 + +LL(11): + mr AO1, A + add AO2, A, LDA + add A, AO2, LDA + + slwi TEMP, IS, ZBASE_SHIFT + add TEMP, X, TEMP + + LFD y05, ALPHA_R + LFD y06, ALPHA_I + + LFD xtemp1, 0 * SIZE(TEMP) + LFD xtemp2, 1 * SIZE(TEMP) + LFD xtemp3, 2 * SIZE(TEMP) + LFD xtemp4, 3 * SIZE(TEMP) + + FMUL atemp1, y05, xtemp1 + FMUL atemp2, y06, xtemp1 + FMUL atemp3, y05, xtemp3 + FMUL atemp4, y06, xtemp3 + + FNMSUB atemp1, y06, xtemp2, atemp1 + FMADD atemp2, y05, xtemp2, atemp2 + FNMSUB atemp3, y06, xtemp4, atemp3 + FMADD atemp4, y05, xtemp4, atemp4 + + lfd xsum1, FZERO + fmr xsum2, xsum1 + fmr xsum3, xsum1 + fmr xsum4, xsum1 + + mr XX, X + mr YY, NEW_Y + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + LFD a3, 2 * SIZE(AO1) + LFD a4, 3 * SIZE(AO1) + + LFD a5, 0 * SIZE(AO2) + LFD a6, 1 * SIZE(AO2) + LFD a7, 2 * SIZE(AO2) + LFD a8, 3 * SIZE(AO2) + + LFD xtemp1, 0 * SIZE(XX) + LFD xtemp2, 1 * SIZE(XX) + LFD xtemp3, 2 * SIZE(XX) + LFD xtemp4, 3 * SIZE(XX) + + LFD y01, 0 * SIZE(YY) + LFD y02, 1 * SIZE(YY) + LFD y03, 2 * SIZE(YY) + LFD y04, 3 * SIZE(YY) + + srawi. r0, IS, 3 + mtspr CTR, r0 + ble LL(15) + + FMADD xsum1, xtemp1, a1, xsum1 + DCBT(AO1, PREA) + FMADD y01, atemp1, a1, y01 + NOP2 + + FMADD xsum2, xtemp2, a1, xsum2 + NOP1 + FMADD y02, atemp2, a1, y02 + LFD a1, 4 * SIZE(AO1) + + FMADD xsum3, xtemp1, a5, xsum3 + NOP1 + FMADD y03, atemp1, a3, y03 + NOP2 + + FMADD xsum4, xtemp2, a5, xsum4 + NOP1 + FMADD y04, atemp2, a3, y04 + NOP2 + + FMADD1 xsum1, xtemp2, a2, xsum1 + LFD y05, 4 * SIZE(YY) + FNMSUB y01, atemp2, a2, y01 + NOP2 + + FMADD2 xsum2, xtemp1, a2, xsum2 + LFD y06, 5 * SIZE(YY) + FMADD y02, atemp1, a2, y02 + LFD a2, 5 * SIZE(AO1) + + FMADD1 xsum3, xtemp2, a6, xsum3 + LFD xtemp2, 5 * SIZE(XX) + FNMSUB y03, atemp2, a4, y03 + NOP2 + + FMADD2 xsum4, xtemp1, a6, xsum4 + LFD xtemp1, 4 * SIZE(XX) + FMADD y04, atemp1, a4, y04 + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD y07, 6 * SIZE(YY) + FMADD y01, atemp3, a5, y01 + NOP2 + + FMADD xsum2, xtemp4, a3, xsum2 + LFD a3, 6 * SIZE(AO1) + FMADD y02, atemp4, a5, y02 + LFD a5, 4 * SIZE(AO2) + + FMADD xsum3, xtemp3, a7, xsum3 + LFD y08, 7 * SIZE(YY) + FMADD y03, atemp3, a7, y03 + NOP2 + + FMADD xsum4, xtemp4, a7, xsum4 + NOP1 + FMADD y04, atemp4, a7, y04 + LFD a7, 6 * SIZE(AO2) + + FMADD1 xsum1, xtemp4, a4, xsum1 + NOP1 + FNMSUB y01, atemp4, a6, y01 +# DCBT(X, PREX) + NOP2 + + FMADD2 xsum2, xtemp3, a4, xsum2 + LFD a4, 7 * SIZE(AO1) + FMADD y02, atemp3, a6, y02 + LFD a6, 5 * SIZE(AO2) + + FMADD1 xsum3, xtemp4, a8, xsum3 + LFD xtemp4, 7 * SIZE(XX) + FNMSUB y03, atemp4, a8, y03 + NOP2 + + FMADD2 xsum4, xtemp3, a8, xsum4 + LFD xtemp3, 6 * SIZE(XX) + FMADD y04, atemp3, a8, y04 + LFD a8, 7 * SIZE(AO2) + + FMADD xsum1, xtemp1, a1, xsum1 + STFD y01, 0 * SIZE(YY) + FMADD y05, atemp1, a1, y05 + NOP2 + + FMADD xsum2, xtemp2, a1, xsum2 + STFD y02, 1 * SIZE(YY) + FMADD y06, atemp2, a1, y06 + LFD a1, 8 * SIZE(AO1) + + FMADD xsum3, xtemp1, a5, xsum3 + STFD y03, 2 * SIZE(YY) + FMADD y07, atemp1, a3, y07 + NOP2 + + FMADD xsum4, xtemp2, a5, xsum4 + STFD y04, 3 * SIZE(YY) + FMADD y08, atemp2, a3, y08 + NOP2 + + FMADD1 xsum1, xtemp2, a2, xsum1 + LFD y01, 8 * SIZE(YY) + FNMSUB y05, atemp2, a2, y05 + 
NOP2 + + FMADD2 xsum2, xtemp1, a2, xsum2 + LFD y02, 9 * SIZE(YY) + FMADD y06, atemp1, a2, y06 + LFD a2, 9 * SIZE(AO1) + + FMADD1 xsum3, xtemp2, a6, xsum3 + LFD xtemp2, 9 * SIZE(XX) + FNMSUB y07, atemp2, a4, y07 + NOP2 + + FMADD2 xsum4, xtemp1, a6, xsum4 + LFD xtemp1, 8 * SIZE(XX) + FMADD y08, atemp1, a4, y08 + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD y03, 10 * SIZE(YY) + FMADD y05, atemp3, a5, y05 + NOP2 + + FMADD xsum2, xtemp4, a3, xsum2 + LFD a3, 10 * SIZE(AO1) + FMADD y06, atemp4, a5, y06 + LFD a5, 8 * SIZE(AO2) + + FMADD xsum3, xtemp3, a7, xsum3 + LFD y04, 11 * SIZE(YY) + FMADD y07, atemp3, a7, y07 + NOP2 + + FMADD xsum4, xtemp4, a7, xsum4 + NOP1 + FMADD y08, atemp4, a7, y08 + LFD a7, 10 * SIZE(AO2) + + FMADD1 xsum1, xtemp4, a4, xsum1 + NOP1 + FNMSUB y05, atemp4, a6, y05 + NOP2 + + FMADD2 xsum2, xtemp3, a4, xsum2 + LFD a4, 11 * SIZE(AO1) + FMADD y06, atemp3, a6, y06 + LFD a6, 9 * SIZE(AO2) + + FMADD1 xsum3, xtemp4, a8, xsum3 + LFD xtemp4, 11 * SIZE(XX) + FNMSUB y07, atemp4, a8, y07 + bdz LL(13) + .align 4 + +LL(12): + FMADD2 xsum4, xtemp3, a8, xsum4 + LFD xtemp3, 10 * SIZE(XX) + FMADD y08, atemp3, a8, y08 + LFD a8, 11 * SIZE(AO2) + + FMADD xsum1, xtemp1, a1, xsum1 + STFD y05, 4 * SIZE(YY) + FMADD y01, atemp1, a1, y01 + DCBT(AO2, PREA) + + FMADD xsum2, xtemp2, a1, xsum2 + STFD y06, 5 * SIZE(YY) + FMADD y02, atemp2, a1, y02 + LFD a1, 12 * SIZE(AO1) + + FMADD xsum3, xtemp1, a5, xsum3 + STFD y07, 6 * SIZE(YY) + FMADD y03, atemp1, a3, y03 + NOP2 + + FMADD xsum4, xtemp2, a5, xsum4 + STFD y08, 7 * SIZE(YY) + FMADD y04, atemp2, a3, y04 + NOP2 + + FMADD1 xsum1, xtemp2, a2, xsum1 + LFD y05, 12 * SIZE(YY) + FNMSUB y01, atemp2, a2, y01 + NOP2 + + FMADD2 xsum2, xtemp1, a2, xsum2 + LFD y06, 13 * SIZE(YY) + FMADD y02, atemp1, a2, y02 + LFD a2, 13 * SIZE(AO1) + + FMADD1 xsum3, xtemp2, a6, xsum3 + LFD xtemp2, 13 * SIZE(XX) + FNMSUB y03, atemp2, a4, y03 + NOP2 + + FMADD2 xsum4, xtemp1, a6, xsum4 + LFD xtemp1, 12 * SIZE(XX) + FMADD y04, atemp1, a4, y04 + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD y07, 14 * SIZE(YY) + FMADD y01, atemp3, a5, y01 + NOP2 + + FMADD xsum2, xtemp4, a3, xsum2 + LFD a3, 14 * SIZE(AO1) + FMADD y02, atemp4, a5, y02 + LFD a5, 12 * SIZE(AO2) + + FMADD xsum3, xtemp3, a7, xsum3 + LFD y08, 15 * SIZE(YY) + FMADD y03, atemp3, a7, y03 + NOP2 + + FMADD xsum4, xtemp4, a7, xsum4 + NOP1 + FMADD y04, atemp4, a7, y04 + LFD a7, 14 * SIZE(AO2) + + FMADD1 xsum1, xtemp4, a4, xsum1 + NOP1 + FNMSUB y01, atemp4, a6, y01 +# DCBT(Y1, PREY) + NOP2 + + FMADD2 xsum2, xtemp3, a4, xsum2 + LFD a4, 15 * SIZE(AO1) + FMADD y02, atemp3, a6, y02 + LFD a6, 13 * SIZE(AO2) + + FMADD1 xsum3, xtemp4, a8, xsum3 + LFD xtemp4, 15 * SIZE(XX) + FNMSUB y03, atemp4, a8, y03 + NOP2 + + FMADD2 xsum4, xtemp3, a8, xsum4 + LFD xtemp3, 14 * SIZE(XX) + FMADD y04, atemp3, a8, y04 + LFD a8, 15 * SIZE(AO2) + + FMADD xsum1, xtemp1, a1, xsum1 + STFD y01, 8 * SIZE(YY) + FMADD y05, atemp1, a1, y05 + NOP2 + + FMADD xsum2, xtemp2, a1, xsum2 + STFD y02, 9 * SIZE(YY) + FMADD y06, atemp2, a1, y06 + LFD a1, 16 * SIZE(AO1) + + FMADD xsum3, xtemp1, a5, xsum3 + STFD y03, 10 * SIZE(YY) + FMADD y07, atemp1, a3, y07 + NOP2 + + FMADD xsum4, xtemp2, a5, xsum4 + STFD y04, 11 * SIZE(YY) + FMADD y08, atemp2, a3, y08 + NOP2 + + FMADD1 xsum1, xtemp2, a2, xsum1 + LFD y01, 16 * SIZE(YY) + FNMSUB y05, atemp2, a2, y05 + NOP2 + + FMADD2 xsum2, xtemp1, a2, xsum2 + LFD y02, 17 * SIZE(YY) + FMADD y06, atemp1, a2, y06 + LFD a2, 17 * SIZE(AO1) + + FMADD1 xsum3, xtemp2, a6, xsum3 + LFD xtemp2, 17 * SIZE(XX) + FNMSUB y07, atemp2, a4, y07 + NOP2 + + FMADD2 xsum4, 
xtemp1, a6, xsum4 + LFD xtemp1, 16 * SIZE(XX) + FMADD y08, atemp1, a4, y08 + addi AO2, AO2, 16 * SIZE + + FMADD xsum1, xtemp3, a3, xsum1 + LFD y03, 18 * SIZE(YY) + FMADD y05, atemp3, a5, y05 + addi XX, XX, 16 * SIZE + + FMADD xsum2, xtemp4, a3, xsum2 + LFD a3, 18 * SIZE(AO1) + FMADD y06, atemp4, a5, y06 + LFD a5, 0 * SIZE(AO2) + + FMADD xsum3, xtemp3, a7, xsum3 + LFD y04, 19 * SIZE(YY) + FMADD y07, atemp3, a7, y07 + NOP2 + + FMADD xsum4, xtemp4, a7, xsum4 + addi AO1, AO1, 16 * SIZE + FMADD y08, atemp4, a7, y08 + LFD a7, 2 * SIZE(AO2) + + FMADD1 xsum1, xtemp4, a4, xsum1 + addi YY, YY, 16 * SIZE + FNMSUB y05, atemp4, a6, y05 + NOP2 + + FMADD2 xsum2, xtemp3, a4, xsum2 + LFD a4, 3 * SIZE(AO1) + FMADD y06, atemp3, a6, y06 + LFD a6, 1 * SIZE(AO2) + + FMADD1 xsum3, xtemp4, a8, xsum3 + LFD xtemp4, 3 * SIZE(XX) + FNMSUB y07, atemp4, a8, y07 + NOP2 + + FMADD2 xsum4, xtemp3, a8, xsum4 + LFD xtemp3, 2 * SIZE(XX) + FMADD y08, atemp3, a8, y08 + LFD a8, 3 * SIZE(AO2) + + FMADD xsum1, xtemp1, a1, xsum1 + STFD y05, -4 * SIZE(YY) + FMADD y01, atemp1, a1, y01 + DCBT(AO1, PREA) + + FMADD xsum2, xtemp2, a1, xsum2 + STFD y06, -3 * SIZE(YY) + FMADD y02, atemp2, a1, y02 + LFD a1, 4 * SIZE(AO1) + + FMADD xsum3, xtemp1, a5, xsum3 + STFD y07, -2 * SIZE(YY) + FMADD y03, atemp1, a3, y03 + NOP2 + + FMADD xsum4, xtemp2, a5, xsum4 + STFD y08, -1 * SIZE(YY) + FMADD y04, atemp2, a3, y04 + NOP2 + + FMADD1 xsum1, xtemp2, a2, xsum1 + LFD y05, 4 * SIZE(YY) + FNMSUB y01, atemp2, a2, y01 + NOP2 + + FMADD2 xsum2, xtemp1, a2, xsum2 + LFD y06, 5 * SIZE(YY) + FMADD y02, atemp1, a2, y02 + LFD a2, 5 * SIZE(AO1) + + FMADD1 xsum3, xtemp2, a6, xsum3 + LFD xtemp2, 5 * SIZE(XX) + FNMSUB y03, atemp2, a4, y03 + NOP2 + + FMADD2 xsum4, xtemp1, a6, xsum4 + LFD xtemp1, 4 * SIZE(XX) + FMADD y04, atemp1, a4, y04 + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD y07, 6 * SIZE(YY) + FMADD y01, atemp3, a5, y01 + NOP2 + + FMADD xsum2, xtemp4, a3, xsum2 + LFD a3, 6 * SIZE(AO1) + FMADD y02, atemp4, a5, y02 + LFD a5, 4 * SIZE(AO2) + + FMADD xsum3, xtemp3, a7, xsum3 + LFD y08, 7 * SIZE(YY) + FMADD y03, atemp3, a7, y03 + NOP2 + + FMADD xsum4, xtemp4, a7, xsum4 + NOP1 + FMADD y04, atemp4, a7, y04 + LFD a7, 6 * SIZE(AO2) + + FMADD1 xsum1, xtemp4, a4, xsum1 + NOP1 + FNMSUB y01, atemp4, a6, y01 +# DCBT(X, PREX) + NOP2 + + FMADD2 xsum2, xtemp3, a4, xsum2 + LFD a4, 7 * SIZE(AO1) + FMADD y02, atemp3, a6, y02 + LFD a6, 5 * SIZE(AO2) + + FMADD1 xsum3, xtemp4, a8, xsum3 + LFD xtemp4, 7 * SIZE(XX) + FNMSUB y03, atemp4, a8, y03 + NOP2 + + FMADD2 xsum4, xtemp3, a8, xsum4 + LFD xtemp3, 6 * SIZE(XX) + FMADD y04, atemp3, a8, y04 + LFD a8, 7 * SIZE(AO2) + + FMADD xsum1, xtemp1, a1, xsum1 + STFD y01, 0 * SIZE(YY) + FMADD y05, atemp1, a1, y05 + NOP2 + + FMADD xsum2, xtemp2, a1, xsum2 + STFD y02, 1 * SIZE(YY) + FMADD y06, atemp2, a1, y06 + LFD a1, 8 * SIZE(AO1) + + FMADD xsum3, xtemp1, a5, xsum3 + STFD y03, 2 * SIZE(YY) + FMADD y07, atemp1, a3, y07 + NOP2 + + FMADD xsum4, xtemp2, a5, xsum4 + STFD y04, 3 * SIZE(YY) + FMADD y08, atemp2, a3, y08 + NOP2 + + FMADD1 xsum1, xtemp2, a2, xsum1 + LFD y01, 8 * SIZE(YY) + FNMSUB y05, atemp2, a2, y05 + NOP2 + + FMADD2 xsum2, xtemp1, a2, xsum2 + LFD y02, 9 * SIZE(YY) + FMADD y06, atemp1, a2, y06 + LFD a2, 9 * SIZE(AO1) + + FMADD1 xsum3, xtemp2, a6, xsum3 + LFD xtemp2, 9 * SIZE(XX) + FNMSUB y07, atemp2, a4, y07 + NOP2 + + FMADD2 xsum4, xtemp1, a6, xsum4 + LFD xtemp1, 8 * SIZE(XX) + FMADD y08, atemp1, a4, y08 + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD y03, 10 * SIZE(YY) + FMADD y05, atemp3, a5, y05 + NOP2 + + FMADD xsum2, xtemp4, a3, 
xsum2 + LFD a3, 10 * SIZE(AO1) + FMADD y06, atemp4, a5, y06 + LFD a5, 8 * SIZE(AO2) + + FMADD xsum3, xtemp3, a7, xsum3 + LFD y04, 11 * SIZE(YY) + FMADD y07, atemp3, a7, y07 + NOP2 + + FMADD xsum4, xtemp4, a7, xsum4 + NOP1 + FMADD y08, atemp4, a7, y08 + LFD a7, 10 * SIZE(AO2) + + FMADD1 xsum1, xtemp4, a4, xsum1 + NOP1 + FNMSUB y05, atemp4, a6, y05 + NOP2 + + FMADD2 xsum2, xtemp3, a4, xsum2 + LFD a4, 11 * SIZE(AO1) + FMADD y06, atemp3, a6, y06 + LFD a6, 9 * SIZE(AO2) + + FMADD1 xsum3, xtemp4, a8, xsum3 + LFD xtemp4, 11 * SIZE(XX) + FNMSUB y07, atemp4, a8, y07 + bdnz LL(12) + .align 4 + +LL(13): + FMADD2 xsum4, xtemp3, a8, xsum4 + LFD xtemp3, 10 * SIZE(XX) + FMADD y08, atemp3, a8, y08 + LFD a8, 11 * SIZE(AO2) + + FMADD xsum1, xtemp1, a1, xsum1 + STFD y05, 4 * SIZE(YY) + FMADD y01, atemp1, a1, y01 + NOP2 + + FMADD xsum2, xtemp2, a1, xsum2 + STFD y06, 5 * SIZE(YY) + FMADD y02, atemp2, a1, y02 + LFD a1, 12 * SIZE(AO1) + + FMADD xsum3, xtemp1, a5, xsum3 + STFD y07, 6 * SIZE(YY) + FMADD y03, atemp1, a3, y03 + NOP2 + + FMADD xsum4, xtemp2, a5, xsum4 + STFD y08, 7 * SIZE(YY) + FMADD y04, atemp2, a3, y04 + NOP2 + + FMADD1 xsum1, xtemp2, a2, xsum1 + LFD y05, 12 * SIZE(YY) + FNMSUB y01, atemp2, a2, y01 + NOP2 + + FMADD2 xsum2, xtemp1, a2, xsum2 + LFD y06, 13 * SIZE(YY) + FMADD y02, atemp1, a2, y02 + LFD a2, 13 * SIZE(AO1) + + FMADD1 xsum3, xtemp2, a6, xsum3 + LFD xtemp2, 13 * SIZE(XX) + FNMSUB y03, atemp2, a4, y03 + NOP2 + + FMADD2 xsum4, xtemp1, a6, xsum4 + LFD xtemp1, 12 * SIZE(XX) + FMADD y04, atemp1, a4, y04 + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD y07, 14 * SIZE(YY) + FMADD y01, atemp3, a5, y01 + NOP2 + + FMADD xsum2, xtemp4, a3, xsum2 + LFD a3, 14 * SIZE(AO1) + FMADD y02, atemp4, a5, y02 + LFD a5, 12 * SIZE(AO2) + + FMADD xsum3, xtemp3, a7, xsum3 + LFD y08, 15 * SIZE(YY) + FMADD y03, atemp3, a7, y03 + NOP2 + + FMADD xsum4, xtemp4, a7, xsum4 + NOP1 + FMADD y04, atemp4, a7, y04 + LFD a7, 14 * SIZE(AO2) + + FMADD1 xsum1, xtemp4, a4, xsum1 + NOP1 + FNMSUB y01, atemp4, a6, y01 + NOP2 + + FMADD2 xsum2, xtemp3, a4, xsum2 + LFD a4, 15 * SIZE(AO1) + FMADD y02, atemp3, a6, y02 + LFD a6, 13 * SIZE(AO2) + + FMADD1 xsum3, xtemp4, a8, xsum3 + LFD xtemp4, 15 * SIZE(XX) + FNMSUB y03, atemp4, a8, y03 + NOP2 + + FMADD2 xsum4, xtemp3, a8, xsum4 + LFD xtemp3, 14 * SIZE(XX) + FMADD y04, atemp3, a8, y04 + LFD a8, 15 * SIZE(AO2) + + FMADD xsum1, xtemp1, a1, xsum1 + STFD y01, 8 * SIZE(YY) + FMADD y05, atemp1, a1, y05 + NOP2 + + FMADD xsum2, xtemp2, a1, xsum2 + STFD y02, 9 * SIZE(YY) + FMADD y06, atemp2, a1, y06 + LFD a1, 16 * SIZE(AO1) + + FMADD xsum3, xtemp1, a5, xsum3 + STFD y03, 10 * SIZE(YY) + FMADD y07, atemp1, a3, y07 + NOP2 + + FMADD xsum4, xtemp2, a5, xsum4 + STFD y04, 11 * SIZE(YY) + FMADD y08, atemp2, a3, y08 + NOP2 + + FMADD1 xsum1, xtemp2, a2, xsum1 + LFD y01, 16 * SIZE(YY) + FNMSUB y05, atemp2, a2, y05 + NOP2 + + FMADD2 xsum2, xtemp1, a2, xsum2 + LFD y02, 17 * SIZE(YY) + FMADD y06, atemp1, a2, y06 + LFD a2, 17 * SIZE(AO1) + + FMADD1 xsum3, xtemp2, a6, xsum3 + LFD xtemp2, 17 * SIZE(XX) + FNMSUB y07, atemp2, a4, y07 + NOP2 + + FMADD2 xsum4, xtemp1, a6, xsum4 + LFD xtemp1, 16 * SIZE(XX) + FMADD y08, atemp1, a4, y08 + addi AO2, AO2, 16 * SIZE + + FMADD xsum1, xtemp3, a3, xsum1 + LFD y03, 18 * SIZE(YY) + FMADD y05, atemp3, a5, y05 + addi XX, XX, 16 * SIZE + + FMADD xsum2, xtemp4, a3, xsum2 + LFD a3, 18 * SIZE(AO1) + FMADD y06, atemp4, a5, y06 + LFD a5, 0 * SIZE(AO2) + + FMADD xsum3, xtemp3, a7, xsum3 + LFD y04, 19 * SIZE(YY) + FMADD y07, atemp3, a7, y07 + NOP2 + + FMADD xsum4, xtemp4, a7, xsum4 + addi 
AO1, AO1, 16 * SIZE + FMADD y08, atemp4, a7, y08 + LFD a7, 2 * SIZE(AO2) + + FMADD1 xsum1, xtemp4, a4, xsum1 + addi YY, YY, 16 * SIZE + FNMSUB y05, atemp4, a6, y05 + NOP2 + + FMADD2 xsum2, xtemp3, a4, xsum2 + LFD a4, 3 * SIZE(AO1) + FMADD y06, atemp3, a6, y06 + LFD a6, 1 * SIZE(AO2) + + FMADD1 xsum3, xtemp4, a8, xsum3 + LFD xtemp4, 3 * SIZE(XX) + FNMSUB y07, atemp4, a8, y07 + NOP2 + + FMADD2 xsum4, xtemp3, a8, xsum4 + LFD xtemp3, 2 * SIZE(XX) + FMADD y08, atemp3, a8, y08 + LFD a8, 3 * SIZE(AO2) + + STFD y05, -4 * SIZE(YY) + STFD y06, -3 * SIZE(YY) + STFD y07, -2 * SIZE(YY) + STFD y08, -1 * SIZE(YY) + .align 4 + +LL(15): + andi. r0, IS, 4 + ble LL(16) + + FMADD xsum1, xtemp1, a1, xsum1 + NOP1 + FMADD y01, atemp1, a1, y01 + NOP2 + + FMADD xsum2, xtemp2, a1, xsum2 + NOP1 + FMADD y02, atemp2, a1, y02 + LFD a1, 4 * SIZE(AO1) + + FMADD xsum3, xtemp1, a5, xsum3 + NOP1 + FMADD y03, atemp1, a3, y03 + NOP2 + + FMADD xsum4, xtemp2, a5, xsum4 + NOP1 + FMADD y04, atemp2, a3, y04 + NOP2 + + FMADD1 xsum1, xtemp2, a2, xsum1 + LFD y05, 4 * SIZE(YY) + FNMSUB y01, atemp2, a2, y01 + NOP2 + + FMADD2 xsum2, xtemp1, a2, xsum2 + LFD y06, 5 * SIZE(YY) + FMADD y02, atemp1, a2, y02 + LFD a2, 5 * SIZE(AO1) + + FMADD1 xsum3, xtemp2, a6, xsum3 + LFD xtemp2, 5 * SIZE(XX) + FNMSUB y03, atemp2, a4, y03 + NOP2 + + FMADD2 xsum4, xtemp1, a6, xsum4 + LFD xtemp1, 4 * SIZE(XX) + FMADD y04, atemp1, a4, y04 + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD y07, 6 * SIZE(YY) + FMADD y01, atemp3, a5, y01 + NOP2 + + FMADD xsum2, xtemp4, a3, xsum2 + LFD a3, 6 * SIZE(AO1) + FMADD y02, atemp4, a5, y02 + LFD a5, 4 * SIZE(AO2) + + FMADD xsum3, xtemp3, a7, xsum3 + LFD y08, 7 * SIZE(YY) + FMADD y03, atemp3, a7, y03 + NOP2 + + FMADD xsum4, xtemp4, a7, xsum4 + NOP1 + FMADD y04, atemp4, a7, y04 + LFD a7, 6 * SIZE(AO2) + + FMADD1 xsum1, xtemp4, a4, xsum1 + NOP1 + FNMSUB y01, atemp4, a6, y01 + NOP2 + + FMADD2 xsum2, xtemp3, a4, xsum2 + LFD a4, 7 * SIZE(AO1) + FMADD y02, atemp3, a6, y02 + LFD a6, 5 * SIZE(AO2) + + FMADD1 xsum3, xtemp4, a8, xsum3 + LFD xtemp4, 7 * SIZE(XX) + FNMSUB y03, atemp4, a8, y03 + NOP2 + + FMADD2 xsum4, xtemp3, a8, xsum4 + LFD xtemp3, 6 * SIZE(XX) + FMADD y04, atemp3, a8, y04 + LFD a8, 7 * SIZE(AO2) + + FMADD xsum1, xtemp1, a1, xsum1 + STFD y01, 0 * SIZE(YY) + FMADD y05, atemp1, a1, y05 + NOP2 + + FMADD xsum2, xtemp2, a1, xsum2 + STFD y02, 1 * SIZE(YY) + FMADD y06, atemp2, a1, y06 + LFD a1, 8 * SIZE(AO1) + + FMADD xsum3, xtemp1, a5, xsum3 + STFD y03, 2 * SIZE(YY) + FMADD y07, atemp1, a3, y07 + NOP2 + + FMADD xsum4, xtemp2, a5, xsum4 + STFD y04, 3 * SIZE(YY) + FMADD y08, atemp2, a3, y08 + NOP2 + + FMADD1 xsum1, xtemp2, a2, xsum1 + LFD y01, 8 * SIZE(YY) + FNMSUB y05, atemp2, a2, y05 + NOP2 + + FMADD2 xsum2, xtemp1, a2, xsum2 + LFD y02, 9 * SIZE(YY) + FMADD y06, atemp1, a2, y06 + LFD a2, 9 * SIZE(AO1) + + FMADD1 xsum3, xtemp2, a6, xsum3 + LFD xtemp2, 9 * SIZE(XX) + FNMSUB y07, atemp2, a4, y07 + NOP2 + + FMADD2 xsum4, xtemp1, a6, xsum4 + LFD xtemp1, 8 * SIZE(XX) + FMADD y08, atemp1, a4, y08 + NOP2 + + FMADD xsum1, xtemp3, a3, xsum1 + LFD y03, 10 * SIZE(YY) + FMADD y05, atemp3, a5, y05 + NOP2 + + FMADD xsum2, xtemp4, a3, xsum2 + LFD a3, 10 * SIZE(AO1) + FMADD y06, atemp4, a5, y06 + LFD a5, 8 * SIZE(AO2) + + FMADD xsum3, xtemp3, a7, xsum3 + LFD y04, 11 * SIZE(YY) + FMADD y07, atemp3, a7, y07 + NOP2 + + FMADD xsum4, xtemp4, a7, xsum4 + NOP1 + FMADD y08, atemp4, a7, y08 + LFD a7, 10 * SIZE(AO2) + + FMADD1 xsum1, xtemp4, a4, xsum1 + NOP1 + FNMSUB y05, atemp4, a6, y05 + NOP2 + + FMADD2 xsum2, xtemp3, a4, xsum2 + LFD a4, 11 * SIZE(AO1) 
+ FMADD y06, atemp3, a6, y06 + LFD a6, 9 * SIZE(AO2) + + FMADD1 xsum3, xtemp4, a8, xsum3 + LFD xtemp4, 11 * SIZE(XX) + FNMSUB y07, atemp4, a8, y07 + + FMADD2 xsum4, xtemp3, a8, xsum4 + LFD xtemp3, 10 * SIZE(XX) + FMADD y08, atemp3, a8, y08 + LFD a8, 11 * SIZE(AO2) + + STFD y05, 4 * SIZE(YY) + STFD y06, 5 * SIZE(YY) + STFD y07, 6 * SIZE(YY) + STFD y08, 7 * SIZE(YY) + + addi AO1, AO1, 8 * SIZE + addi AO2, AO2, 8 * SIZE + + addi XX, XX, 8 * SIZE + addi YY, YY, 8 * SIZE + .align 4 + +LL(16): + andi. r0, IS, 2 + ble LL(18) + + FMADD xsum1, xtemp1, a1, xsum1 + FMADD y01, atemp1, a1, y01 + FMADD xsum2, xtemp2, a1, xsum2 + FMADD y02, atemp2, a1, y02 + FMADD xsum3, xtemp1, a5, xsum3 + FMADD y03, atemp1, a3, y03 + FMADD xsum4, xtemp2, a5, xsum4 + FMADD y04, atemp2, a3, y04 + + FMADD1 xsum1, xtemp2, a2, xsum1 + FNMSUB y01, atemp2, a2, y01 + FMADD2 xsum2, xtemp1, a2, xsum2 + FMADD y02, atemp1, a2, y02 + FMADD1 xsum3, xtemp2, a6, xsum3 + FNMSUB y03, atemp2, a4, y03 + FMADD2 xsum4, xtemp1, a6, xsum4 + FMADD y04, atemp1, a4, y04 + + FMADD xsum1, xtemp3, a3, xsum1 + FMADD y01, atemp3, a5, y01 + FMADD xsum2, xtemp4, a3, xsum2 + FMADD y02, atemp4, a5, y02 + FMADD xsum3, xtemp3, a7, xsum3 + FMADD y03, atemp3, a7, y03 + FMADD xsum4, xtemp4, a7, xsum4 + FMADD y04, atemp4, a7, y04 + + FMADD1 xsum1, xtemp4, a4, xsum1 + FNMSUB y01, atemp4, a6, y01 + FMADD2 xsum2, xtemp3, a4, xsum2 + FMADD y02, atemp3, a6, y02 + FMADD1 xsum3, xtemp4, a8, xsum3 + FNMSUB y03, atemp4, a8, y03 + FMADD2 xsum4, xtemp3, a8, xsum4 + FMADD y04, atemp3, a8, y04 + + STFD y01, 0 * SIZE(YY) + STFD y02, 1 * SIZE(YY) + STFD y03, 2 * SIZE(YY) + STFD y04, 3 * SIZE(YY) + + LFD a1, 4 * SIZE(AO1) + LFD a2, 5 * SIZE(AO1) + + LFD a5, 4 * SIZE(AO2) + LFD a6, 5 * SIZE(AO2) + LFD a7, 6 * SIZE(AO2) + LFD a8, 7 * SIZE(AO2) + + LFD y01, 4 * SIZE(YY) + LFD y02, 5 * SIZE(YY) + LFD y03, 6 * SIZE(YY) + LFD y04, 7 * SIZE(YY) + + addi YY, YY, 4 * SIZE + .align 4 + +LL(18): + LFD y05, ALPHA_R + LFD y06, ALPHA_I + + FMUL xtemp1, y05, xsum1 + FMUL xtemp2, y06, xsum1 + FMUL xtemp3, y05, xsum3 + FMUL xtemp4, y06, xsum3 + + FNMSUB xsum1, y06, xsum2, xtemp1 + FMADD xsum2, y05, xsum2, xtemp2 + FNMSUB xsum3, y06, xsum4, xtemp3 + FMADD xsum4, y05, xsum4, xtemp4 + + FMADD xsum1, atemp1, a1, xsum1 + FMADD xsum2, atemp2, a1, xsum2 + FMADD xsum3, atemp1, a5, xsum3 + FMADD xsum4, atemp2, a5, xsum4 + +#ifndef HEMV + FMADD1 xsum1, atemp2, a2, xsum1 + FMADD2 xsum2, atemp1, a2, xsum2 +#endif + FMADD1 xsum3, atemp2, a6, xsum3 + FMADD2 xsum4, atemp1, a6, xsum4 + + FMADD xsum1, atemp3, a5, xsum1 + FMADD xsum2, atemp4, a5, xsum2 + FMADD xsum3, atemp3, a7, xsum3 + FMADD xsum4, atemp4, a7, xsum4 + + FNMSUB xsum1, atemp4, a6, xsum1 + FMADD xsum2, atemp3, a6, xsum2 +#ifndef HEMV + FNMSUB xsum3, atemp4, a8, xsum3 + FMADD xsum4, atemp3, a8, xsum4 +#endif + + FADD y01, y01, xsum1 + FADD y02, y02, xsum2 + FADD y03, y03, xsum3 + FADD y04, y04, xsum4 + + STFD y01, 0 * SIZE(YY) + addi TEMP, IS, 4 + STFD y02, 1 * SIZE(YY) + addi IS, IS, 2 + STFD y03, 2 * SIZE(YY) + cmpw cr0, TEMP, M + STFD y04, 3 * SIZE(YY) + ble LL(11) + .align 4 + +LL(20): + andi. 
TEMP, M, 1 + ble LL(990) + + mr AO1, A + + slwi TEMP, IS, ZBASE_SHIFT + add TEMP, X, TEMP + + LFD y05, ALPHA_R + LFD y06, ALPHA_I + + LFD xtemp1, 0 * SIZE(TEMP) + LFD xtemp2, 1 * SIZE(TEMP) + + FMUL atemp1, y05, xtemp1 + FMUL atemp2, y06, xtemp1 + + FNMSUB atemp1, y06, xtemp2, atemp1 + FMADD atemp2, y05, xtemp2, atemp2 + + lfd xsum1, FZERO + fmr xsum2, xsum1 + + mr XX, X + mr YY, NEW_Y + + LFD a1, 0 * SIZE(AO1) + LFD a2, 1 * SIZE(AO1) + + LFD xtemp1, 0 * SIZE(XX) + LFD xtemp2, 1 * SIZE(XX) + + LFD y01, 0 * SIZE(YY) + LFD y02, 1 * SIZE(YY) + + mtspr CTR, IS + cmpwi cr0, IS, 0 + ble LL(28) + .align 4 + +LL(22): + FMADD xsum1, xtemp1, a1, xsum1 + FMADD y01, atemp1, a1, y01 + FMADD xsum2, xtemp2, a1, xsum2 + FMADD y02, atemp2, a1, y02 + LFD a1, 2 * SIZE(AO1) + + FMADD1 xsum1, xtemp2, a2, xsum1 + LFD xtemp2, 3 * SIZE(XX) + FNMSUB y01, atemp2, a2, y01 + FMADD2 xsum2, xtemp1, a2, xsum2 + LFD xtemp1, 2 * SIZE(XX) + FMADD y02, atemp1, a2, y02 + LFD a2, 3 * SIZE(AO1) + + addi AO1, AO1, 2 * SIZE + addi XX, XX, 2 * SIZE + addi YY, YY, 2 * SIZE + + STFD y01, -2 * SIZE(YY) + LFD y01, 0 * SIZE(YY) + STFD y02, -1 * SIZE(YY) + LFD y02, 1 * SIZE(YY) + bdnz LL(22) + .align 4 + +LL(28): + LFD y05, ALPHA_R + LFD y06, ALPHA_I + + FMUL xtemp1, y05, xsum1 + FMUL xtemp2, y06, xsum1 + + FNMSUB xsum1, y06, xsum2, xtemp1 + FMADD xsum2, y05, xsum2, xtemp2 + + FMADD xsum1, atemp1, a1, xsum1 + FMADD xsum2, atemp2, a1, xsum2 + +#ifndef HEMV + FNMSUB xsum1, atemp2, a2, xsum1 + FMADD xsum2, atemp1, a2, xsum2 +#endif + + FADD y01, y01, xsum1 + FADD y02, y02, xsum2 + + STFD y01, 0 * SIZE(YY) + STFD y02, 1 * SIZE(YY) + .align 4 + +LL(990): + cmpwi cr0, INCY, 2 * SIZE + beq LL(999) + + mr YY, Y + + srawi. r0, M, 2 + mtspr CTR, r0 + ble LL(995) + .align 4 + +LL(991): + LFD f0, 0 * SIZE(Y) + LFD f1, 1 * SIZE(Y) + add Y, Y, INCY + LFD f2, 0 * SIZE(Y) + LFD f3, 1 * SIZE(Y) + add Y, Y, INCY + LFD f4, 0 * SIZE(Y) + LFD f5, 1 * SIZE(Y) + add Y, Y, INCY + LFD f6, 0 * SIZE(Y) + LFD f7, 1 * SIZE(Y) + add Y, Y, INCY + + LFD f8, 0 * SIZE(NEW_Y) + LFD f9, 1 * SIZE(NEW_Y) + LFD f10, 2 * SIZE(NEW_Y) + LFD f11, 3 * SIZE(NEW_Y) + LFD f12, 4 * SIZE(NEW_Y) + LFD f13, 5 * SIZE(NEW_Y) + LFD f14, 6 * SIZE(NEW_Y) + LFD f15, 7 * SIZE(NEW_Y) + addi NEW_Y, NEW_Y, 8 * SIZE + + FADD f8, f8, f0 + FADD f9, f9, f1 + FADD f10, f10, f2 + FADD f11, f11, f3 + FADD f12, f12, f4 + FADD f13, f13, f5 + FADD f14, f14, f6 + FADD f15, f15, f7 + + STFD f8, 0 * SIZE(YY) + STFD f9, 1 * SIZE(YY) + add YY, YY, INCY + STFD f10, 0 * SIZE(YY) + STFD f11, 1 * SIZE(YY) + add YY, YY, INCY + STFD f12, 0 * SIZE(YY) + STFD f13, 1 * SIZE(YY) + add YY, YY, INCY + STFD f14, 0 * SIZE(YY) + STFD f15, 1 * SIZE(YY) + add YY, YY, INCY + bdnz LL(991) + .align 4 + +LL(995): + andi. J, M, 2 + ble LL(996) + + LFD f0, 0 * SIZE(Y) + LFD f1, 1 * SIZE(Y) + add Y, Y, INCY + LFD f2, 0 * SIZE(Y) + LFD f3, 1 * SIZE(Y) + add Y, Y, INCY + + LFD f8, 0 * SIZE(NEW_Y) + LFD f9, 1 * SIZE(NEW_Y) + LFD f10, 2 * SIZE(NEW_Y) + LFD f11, 3 * SIZE(NEW_Y) + addi NEW_Y, NEW_Y, 4 * SIZE + + FADD f8, f8, f0 + FADD f9, f9, f1 + FADD f10, f10, f2 + FADD f11, f11, f3 + + STFD f8, 0 * SIZE(YY) + STFD f9, 1 * SIZE(YY) + add YY, YY, INCY + STFD f10, 0 * SIZE(YY) + STFD f11, 1 * SIZE(YY) + add YY, YY, INCY + .align 4 + +LL(996): + andi. 
J, M, 1 + ble LL(999) + + LFD f0, 0 * SIZE(Y) + LFD f1, 1 * SIZE(Y) + + LFD f8, 0 * SIZE(NEW_Y) + LFD f9, 1 * SIZE(NEW_Y) + + FADD f8, f8, f0 + FADD f9, f9, f1 + + STFD f8, 0 * SIZE(YY) + STFD f9, 1 * SIZE(YY) + .align 4 + +LL(999): + li r3, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r14, 144(SP) + ld r15, 152(SP) + ld r16, 160(SP) + ld r17, 168(SP) + ld r18, 176(SP) + ld r19, 184(SP) + ld r20, 192(SP) + ld r21, 200(SP) + ld r22, 208(SP) + ld r23, 216(SP) + ld r24, 224(SP) + ld r25, 232(SP) + ld r26, 240(SP) + ld r27, 248(SP) +#else + lwz r14, 144(SP) + lwz r15, 148(SP) + lwz r16, 152(SP) + lwz r17, 156(SP) + lwz r18, 160(SP) + lwz r19, 164(SP) + lwz r20, 168(SP) + lwz r21, 172(SP) + lwz r22, 176(SP) + lwz r23, 180(SP) + lwz r24, 184(SP) + lwz r25, 188(SP) + lwz r26, 192(SP) + lwz r27, 196(SP) +#endif + + addi SP, SP, STACKSIZE + blr + + EPILOGUE +#endif diff --git a/kernel/power/ztrsm_kernel_LN.S b/kernel/power/ztrsm_kernel_LN.S new file mode 100644 index 0000000000..e31a887bc1 --- /dev/null +++ b/kernel/power/ztrsm_kernel_LN.S @@ -0,0 +1,2288 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA_R 296(SP) +#define ALPHA_I 304(SP) +#define FZERO 312(SP) +#else +#define STACKSIZE 256 +#define ALPHA_R 224(SP) +#define ALPHA_I 232(SP) +#define FZERO 240(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#define OFFSET r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#define AORIG r21 +#define TEMP r22 +#define KK r23 +#define I r24 +#define J r25 +#define AO r26 +#define BO r27 +#define CO1 r28 +#define CO2 r29 + +#define PREA r30 +#define PREC r31 +#define PREB PREA + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) +#endif + + stw r0, FZERO + +#ifdef linux +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, 56 + STACKSIZE(SP) + lwz C, 60 + STACKSIZE(SP) + lwz LDC, 64 + STACKSIZE(SP) +#else + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 120 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 120 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 68 + STACKSIZE(SP) +#else + lwz OFFSET, 60 + STACKSIZE(SP) +#endif +#endif +#endif + + slwi LDC, LDC, ZBASE_SHIFT + +#ifdef LN + mullw r0, M, K + slwi r0, r0, ZBASE_SHIFT + add A, A, r0 + + slwi r0, M, ZBASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, ZBASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + + +#ifndef PREFETCHTEST +#ifdef LN + li PREC, -4 * SIZE +#else + li PREC, 4 * SIZE +#endif +#else + +#ifdef linux +#ifndef __64BIT__ + lwz PREA, 16 + STACKSIZE(SP) + lwz PREC, 20 + STACKSIZE(SP) +#else + ld PREA, 136 + STACKSIZE(SP) + ld PREC, 144 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld PREA, 136 + STACKSIZE(SP) + ld PREC, 144 + STACKSIZE(SP) +#else +#ifdef DOUBLE + 
lwz PREA, 72 + STACKSIZE(SP) + lwz PREC, 76 + STACKSIZE(SP) +#else + lwz PREA, 68 + STACKSIZE(SP) + lwz PREC, 72 + STACKSIZE(SP) +#endif +#endif +#endif + +#endif + +#ifndef PREFETCHTEST +#ifdef PPC970 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 5 * SIZE + 16) +#else + li PREA, (16 * 9 * SIZE + 16) +#endif +#endif +#ifdef POWER4 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 1 * SIZE + 16) +#else + li PREA, (16 * 2 * SIZE + 16) +#endif +#endif +#ifdef POWER5 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 7 * SIZE | 1) + li PREB, (16 * 7 * SIZE | 3) +#else + li PREA, (16 * 12 * SIZE | 1) + li PREB, (16 * 6 * SIZE | 3) +#endif +#endif +#endif + + srawi. J, N, 1 + ble LL(30) + .align 4 + +LL(10): +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO2, LDC +#endif + +LL(20): + andi. I, M, 1 + ble LL(09) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(25) + .align 4 + +LL(22): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi BO, BO, 16 * SIZE + addi AO, AO, 8 * SIZE + bdnz LL(22) + .align 4 + +LL(25): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble LL(27) + .align 4 + +LL(26): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + addi AO, AO, 2 * SIZE + addi BO, BO, 4 * SIZE + bdnz LL(26) + .align 4 + +LL(27): +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 +#endif + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + ZBASE_SHIFT + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f20, f2 + FSUB f3, f21, f3 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 + FSUB f2, f20, f2 + FADD f3, f21, f3 +#endif +#endif + +#ifdef LN + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f3 + FMUL f13, f21, f2 + +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f12 + FMADD f3, f20, f3, f13 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f12 + FMSUB f3, f20, f3, f13 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f12, f17, f3 + FMUL f13, f17, f2 + +#ifndef 
CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f12 + FMADD f3, f16, f3, f13 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f12 + FMSUB f3, f16, f3, f13 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + LFD f20, 6 * SIZE(BO) + LFD f21, 7 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 +#endif +#endif + +#ifdef RT + LFD f16, 6 * SIZE(BO) + LFD f17, 7 * SIZE(BO) + LFD f18, 4 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f12, f17, f9 + FMUL f13, f17, f8 + +#ifndef CONJ + FMSUB f2, f16, f2, f12 + FMADD f3, f16, f3, f13 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f2, f16, f2, f12 + FMSUB f3, f16, f3, f13 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +LL(09): + srawi. I, M, 1 + ble LL(29) + .align 4 + +LL(11): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + dcbt CO1, PREC + dcbt CO2, PREC + + srawi. 
r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + dcbt CO1, PREC + dcbt CO2, PREC + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(15) + .align 4 + +LL(12): + fmadd f0, f16, f20, f0 + fmadd f5, f17, f21, f5 + fmadd f10, f18, f22, f10 + fmadd f15, f19, f23, f15 + + LFD f28, 4 * SIZE(BO) + LFD f29, 5 * SIZE(BO) + LFD f30, 6 * SIZE(BO) + LFD f31, 7 * SIZE(BO) + + fmadd f1, f17, f20, f1 + fmadd f2, f18, f20, f2 + fmadd f3, f19, f20, f3 + fmadd f4, f16, f21, f4 + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + fmadd f6, f18, f21, f6 + fmadd f7, f19, f21, f7 + fmadd f8, f16, f22, f8 + fmadd f9, f17, f22, f9 + + fmadd f11, f19, f22, f11 + fmadd f12, f16, f23, f12 + fmadd f13, f17, f23, f13 + fmadd f14, f18, f23, f14 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + fmadd f0, f24, f28, f0 + fmadd f5, f25, f29, f5 + fmadd f10, f26, f30, f10 + fmadd f15, f27, f31, f15 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + fmadd f1, f25, f28, f1 + fmadd f2, f26, f28, f2 + fmadd f3, f27, f28, f3 + fmadd f4, f24, f29, f4 + + fmadd f6, f26, f29, f6 + fmadd f7, f27, f29, f7 + fmadd f8, f24, f30, f8 + fmadd f9, f25, f30, f9 + + fmadd f11, f27, f30, f11 + fmadd f12, f24, f31, f12 + fmadd f13, f25, f31, f13 + fmadd f14, f26, f31, f14 + + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + fmadd f0, f16, f20, f0 + fmadd f5, f17, f21, f5 + fmadd f10, f18, f22, f10 + fmadd f15, f19, f23, f15 + + LFD f24, 12 * SIZE(AO) + LFD f25, 13 * SIZE(AO) + LFD f26, 14 * SIZE(AO) + LFD f27, 15 * SIZE(AO) + + fmadd f1, f17, f20, f1 + fmadd f2, f18, f20, f2 + fmadd f3, f19, f20, f3 + fmadd f4, f16, f21, f4 + + fmadd f6, f18, f21, f6 + fmadd f7, f19, f21, f7 + fmadd f8, f16, f22, f8 + fmadd f9, f17, f22, f9 + + fmadd f11, f19, f22, f11 + fmadd f12, f16, f23, f12 + fmadd f13, f17, f23, f13 + fmadd f14, f18, f23, f14 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + fmadd f0, f24, f28, f0 + fmadd f5, f25, f29, f5 + fmadd f10, f26, f30, f10 + fmadd f15, f27, f31, f15 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + fmadd f1, f25, f28, f1 + fmadd f2, f26, f28, f2 + fmadd f3, f27, f28, f3 + fmadd f4, f24, f29, f4 + + fmadd f6, f26, f29, f6 + fmadd f7, f27, f29, f7 + fmadd f8, f24, f30, f8 + fmadd f9, f25, f30, f9 + + fmadd f11, f27, f30, f11 + fmadd f12, f24, f31, f12 + fmadd f13, f25, f31, f13 + fmadd f14, f26, f31, f14 + + addi AO, AO, 16 * SIZE + addi BO, BO, 16 * SIZE +#ifdef PPC970 +#ifndef ALLOC_HUGETLB + DCBT(AO, PREA) +#endif + DCBT(BO, PREB) +#endif + +#ifdef POWER4 +#ifndef ALLOC_HUGETLB + DCBT(AO, PREA) +#endif + DCBT(BO, PREB) +#endif + +#ifdef POWER5 + DCBT(AO, PREA) + DCBT(BO, PREB) +#endif + bdnz LL(12) + .align 4 + 
+LL(15): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble LL(KERNEL_MainFinish) + .align 4 + +LL(16): + fmadd f0, f16, f20, f0 + fmadd f5, f17, f21, f5 + fmadd f10, f18, f22, f10 + fmadd f15, f19, f23, f15 + + fmadd f1, f17, f20, f1 + fmadd f2, f18, f20, f2 + fmadd f3, f19, f20, f3 + fmadd f4, f16, f21, f4 + + fmadd f6, f18, f21, f6 + fmadd f7, f19, f21, f7 + fmadd f8, f16, f22, f8 + fmadd f9, f17, f22, f9 + + fmadd f11, f19, f22, f11 + fmadd f12, f16, f23, f12 + fmadd f13, f17, f23, f13 + fmadd f14, f18, f23, f14 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(16) + .align 4 + +LL(KERNEL_MainFinish): +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 + + FSUB f8, f8, f13 + FADD f9, f9, f12 + FSUB f10, f10, f15 + FADD f11, f11, f14 + +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 + + FADD f8, f8, f13 + FSUB f9, f12, f9 + FADD f10, f10, f15 + FSUB f11, f14, f11 +#endif + +#if defined(LN) || defined(RT) + subi r0, KK, 2 + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f8, f18, f8 + FSUB f9, f19, f9 + + FSUB f2, f20, f2 + FSUB f3, f21, f3 + FSUB f10, f22, f10 + FSUB f11, f23, f11 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f8, f20, f8 + FSUB f9, f21, f9 + FSUB f10, f22, f10 + FSUB f11, f23, f11 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 + FSUB f2, f18, f2 + FADD f3, f19, f3 + + FSUB f8, f20, f8 + FADD f9, f21, f9 + FSUB f10, f22, f10 + FADD f11, f23, f11 +#endif +#endif + +#ifdef LN + LFD f16, 6 * SIZE(AO) + LFD f17, 7 * SIZE(AO) + LFD f18, 4 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f6, f17, f3 + FMUL f7, f17, f2 + FMUL f14, f17, f11 + FMUL f15, f17, f10 + +#ifndef CONJ + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + FMSUB f10, f16, f10, f14 + FMADD f11, f16, f11, f15 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + FMADD f8, f19, f11, f8 + FNMSUB f9, f19, f10, f9 + + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + FNMSUB f8, f18, f10, f8 + FNMSUB f9, f18, f11, f9 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f9 + FMUL f13, f21, f8 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f8, f20, f8, f12 + FMADD f9, f20, f9, f13 + +#else + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + FMADD f10, f16, f10, f14 + FMSUB f11, f16, f11, f15 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + FMSUB f8, f19, f11, f8 + FNMADD f9, f19, f10, f9 + + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + FNMADD f8, f18, f10, f8 + FNMADD f9, f18, f11, f9 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f9 + FMUL f13, f21, f8 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f8, f20, f8, f12 + FMSUB f9, f20, f9, f13 +#endif +#endif + +#ifdef LT + 
LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + LFD f20, 6 * SIZE(AO) + LFD f21, 7 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f12, f17, f9 + FMUL f13, f17, f8 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f8, f16, f8, f12 + FMADD f9, f16, f9, f13 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + FMADD f10, f19, f9, f10 + FNMSUB f11, f19, f8, f11 + + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + FNMSUB f10, f18, f8, f10 + FNMSUB f11, f18, f9, f11 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMUL f12, f21, f11 + FMUL f13, f21, f10 + + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 + FMSUB f10, f20, f10, f12 + FMADD f11, f20, f11, f13 + +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f8, f16, f8, f12 + FMSUB f9, f16, f9, f13 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + FMSUB f10, f19, f9, f10 + FNMADD f11, f19, f8, f11 + + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + FNMADD f10, f18, f8, f10 + FNMADD f11, f18, f9, f11 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMUL f12, f21, f11 + FMUL f13, f21, f10 + + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 + FMADD f10, f20, f10, f12 + FMSUB f11, f20, f11, f13 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + LFD f20, 6 * SIZE(BO) + LFD f21, 7 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + + FMADD f8, f19, f1, f8 + FNMSUB f9, f19, f0, f9 + FMADD f10, f19, f3, f10 + FNMSUB f11, f19, f2, f11 + + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f10, f18, f2, f10 + FNMSUB f11, f18, f3, f11 + + FMUL f4, f21, f9 + FMUL f5, f21, f8 + FMUL f6, f21, f11 + FMUL f7, f21, f10 + + FMSUB f8, f20, f8, f4 + FMADD f9, f20, f9, f5 + FMSUB f10, f20, f10, f6 + FMADD f11, f20, f11, f7 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + + FMSUB f8, f19, f1, f8 + FNMADD f9, f19, f0, f9 + FMSUB f10, f19, f3, f10 + FNMADD f11, f19, f2, f11 + + FNMADD f8, f18, f0, f8 + FNMADD f9, f18, f1, f9 + FNMADD f10, f18, f2, f10 + FNMADD f11, f18, f3, f11 + + FMUL f4, f21, f9 + FMUL f5, f21, f8 + FMUL f6, f21, f11 + FMUL f7, f21, f10 + + FMADD f8, f20, f8, f4 + FMSUB f9, f20, f9, f5 + FMADD f10, f20, f10, f6 + FMSUB f11, f20, f11, f7 +#endif +#endif + +#ifdef RT + LFD f16, 6 * SIZE(BO) + LFD f17, 7 * SIZE(BO) + LFD f18, 4 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f12, f17, f9 + FMUL f13, f17, f8 + FMUL f14, f17, f11 + FMUL f15, f17, f10 + +#ifndef CONJ + FMSUB f8, f16, f8, f12 + FMADD f9, f16, f9, f13 + FMSUB f10, f16, f10, f14 + FMADD f11, f16, f11, f15 + + FMADD f0, f19, f9, f0 + FNMSUB f1, f19, f8, f1 + FMADD f2, f19, f11, f2 + FNMSUB f3, f19, f10, f3 + + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + FNMSUB f2, f18, f10, f2 + FNMSUB f3, f18, f11, f3 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f6 + FMADD f3, f20, f3, f7 + +#else + FMADD f8, f16, f8, f12 + FMSUB f9, f16, f9, f13 + FMADD f10, f16, f10, f14 + FMSUB f11, f16, f11, f15 + + FMSUB f0, f19, f9, f0 + FNMADD f1, f19, f8, f1 + FMSUB f2, f19, f11, f2 + FNMADD f3, f19, f10, f3 + + FNMADD f0, f18, f8, f0 + FNMADD f1, f18, f9, f1 + FNMADD f2, f18, f10, f2 + FNMADD f3, f18, 
f11, f3 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f6 + FMSUB f3, f20, f3, f7 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f9, 3 * SIZE(BO) + + STFD f2, 4 * SIZE(BO) + STFD f3, 5 * SIZE(BO) + STFD f10, 6 * SIZE(BO) + STFD f11, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f8, 4 * SIZE(AO) + STFD f9, 5 * SIZE(AO) + STFD f10, 6 * SIZE(AO) + STFD f11, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f8, 0 * SIZE(CO2) + STFD f9, 1 * SIZE(CO2) + STFD f10, 2 * SIZE(CO2) + STFD f11, 3 * SIZE(CO2) + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + addic. I, I, -1 + bgt LL(11) + .align 4 + +LL(29): +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + + addic. J, J, -1 + bgt LL(10) + .align 4 + +LL(30): + andi. J, N, 1 + ble LL(999) + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, C, LDC +#endif + + andi. I, M, 1 + ble LL(40) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, r0 + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(45) + .align 4 + +LL(42): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f17, f20, f2 + fmadd f3, f16, f21, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + + fmadd f4, f18, f22, f4 + fmadd f5, f19, f23, f5 + fmadd f6, f19, f22, f6 + fmadd f7, f18, f23, f7 + + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f17, f20, f2 + fmadd f3, f16, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + + fmadd f4, f18, f22, f4 + fmadd f5, f19, f23, f5 + fmadd f6, f19, f22, f6 + fmadd f7, f18, f23, f7 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(42) + .align 4 + +LL(45): + fadd f0, f0, f4 + fadd f1, f1, f5 + fadd f2, f2, f6 + fadd f3, f3, f7 + +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR,r0 + ble LL(47) + .align 4 + +LL(46): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f17, f20, f2 + fmadd f3, f16, f21, f3 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 2 * SIZE + + bdnz LL(46) + .align 4 + +LL(47): +#ifndef CONJ + FSUB f0, f0, f1 + FADD f1, f2, f3 +#else + FADD f0, f0, f1 + FSUB f1, f3, f2 +#endif + +#if defined(LN) || defined(RT) + subi r0, KK, 1 + slwi r0, r0, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 +#endif +#endif + +#ifdef LN + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 +#endif +#endif + +#ifdef RT + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +LL(40): + srawi. 
I, M, 1 + ble LL(49) + .align 4 + +LL(31): +#if defined(LT) || defined(RN) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(B) + LFD f17, 1 * SIZE(B) + LFD f18, 2 * SIZE(B) + LFD f19, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + dcbt CO1, PREC + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(35) + .align 4 + +LL(32): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 8 * SIZE(AO) + LFD f21, 9 * SIZE(AO) + LFD f22, 10 * SIZE(AO) + LFD f23, 11 * SIZE(AO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f24, 12 * SIZE(AO) + LFD f25, 13 * SIZE(AO) + LFD f26, 14 * SIZE(AO) + LFD f27, 15 * SIZE(AO) + + LFD f16, 4 * SIZE(BO) + LFD f17, 5 * SIZE(BO) + LFD f18, 6 * SIZE(BO) + LFD f19, 7 * SIZE(BO) + + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 16 * SIZE(AO) + LFD f21, 17 * SIZE(AO) + LFD f22, 18 * SIZE(AO) + LFD f23, 19 * SIZE(AO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f24, 20 * SIZE(AO) + LFD f25, 21 * SIZE(AO) + LFD f26, 22 * SIZE(AO) + LFD f27, 23 * SIZE(AO) + + LFD f16, 8 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 10 * SIZE(BO) + LFD f19, 11 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 8 * SIZE + dcbt PREA, AO + dcbt PREA, BO + bdnz LL(32) + .align 4 + +LL(35): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble LL(37) + .align 4 + +LL(36): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + LFD f16, 2 * SIZE(BO) + LFD f17, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(36) + .align 4 + +LL(37): +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 +#endif + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + ZBASE_SHIFT + slwi r0, r0, 0 + ZBASE_SHIFT + + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 + FSUB f2, f18, f2 + FADD f3, f19, f3 +#endif +#endif + +#ifdef LN + LFD f16, 6 * SIZE(AO) + LFD f17, 7 * SIZE(AO) + LFD f18, 4 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + LFD f20, 6 * SIZE(AO) + LFD f21, 7 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 +#endif +#endif + +#ifdef RT + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f6 + FMADD f3, f20, f3, f7 + +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f6 + FMSUB f3, f20, f3, f7 +#endif +#endif + 
+#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + addic. I, I, -1 + bgt LL(31) + .align 4 + +LL(49): +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 1 +#endif + +#ifdef RT + subi KK, KK, 1 +#endif + .align 4 + +LL(999): + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/ztrsm_kernel_LT.S b/kernel/power/ztrsm_kernel_LT.S new file mode 100644 index 0000000000..f7153b789d --- /dev/null +++ b/kernel/power/ztrsm_kernel_LT.S @@ -0,0 +1,2288 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA_R 296(SP) +#define ALPHA_I 304(SP) +#define FZERO 312(SP) +#else +#define STACKSIZE 256 +#define ALPHA_R 224(SP) +#define ALPHA_I 232(SP) +#define FZERO 240(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#define OFFSET r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#define AORIG r21 +#define TEMP r22 +#define KK r23 +#define I r24 +#define J r25 +#define AO r26 +#define BO r27 +#define CO1 r28 +#define CO2 r29 + +#define PREA r30 +#define PREC r31 +#define PREB PREA + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) +#endif + + stw r0, FZERO + +#ifdef linux +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, 56 + STACKSIZE(SP) + lwz C, 60 + STACKSIZE(SP) + lwz LDC, 64 + STACKSIZE(SP) +#else + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 120 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 120 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 68 + STACKSIZE(SP) +#else + lwz OFFSET, 60 + STACKSIZE(SP) +#endif +#endif +#endif + + 
slwi LDC, LDC, ZBASE_SHIFT + +#ifdef LN + mullw r0, M, K + slwi r0, r0, ZBASE_SHIFT + add A, A, r0 + + slwi r0, M, ZBASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, ZBASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + +#ifndef PREFETCHTEST +#ifdef PPC970 + li PREC, 4 * SIZE +#endif +#ifdef POWER4 + li PREC, 4 * SIZE /* is 12 best? */ +#endif +#ifdef POWER5 + li PREC, 4 * SIZE /* is 12 best? */ +#endif +#else + +#ifdef linux +#ifndef __64BIT__ + lwz PREA, 16 + STACKSIZE(SP) + lwz PREC, 20 + STACKSIZE(SP) +#else + ld PREA, 136 + STACKSIZE(SP) + ld PREC, 144 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld PREA, 136 + STACKSIZE(SP) + ld PREC, 144 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz PREA, 72 + STACKSIZE(SP) + lwz PREC, 76 + STACKSIZE(SP) +#else + lwz PREA, 68 + STACKSIZE(SP) + lwz PREC, 72 + STACKSIZE(SP) +#endif +#endif +#endif + +#endif + +#ifndef PREFETCHTEST +#ifdef PPC970 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 5 * SIZE + 16) +#else + li PREA, (16 * 9 * SIZE + 16) +#endif +#endif +#ifdef POWER4 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 1 * SIZE + 16) +#else + li PREA, (16 * 2 * SIZE + 16) +#endif +#endif +#ifdef POWER5 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 7 * SIZE | 1) + li PREB, (16 * 7 * SIZE | 3) +#else + li PREA, (16 * 12 * SIZE | 1) + li PREB, (16 * 6 * SIZE | 3) +#endif +#endif +#endif + + srawi. J, N, 1 + ble LL(30) + .align 4 + +LL(10): +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + srawi. I, M, 1 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO2, LDC +#endif + ble LL(20) + .align 4 + +LL(11): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + dcbt CO1, PREC + dcbt CO2, PREC + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + dcbt CO1, PREC + dcbt CO2, PREC + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(15) + .align 4 + +LL(12): + fmadd f0, f16, f20, f0 + fmadd f5, f17, f21, f5 + fmadd f10, f18, f22, f10 + fmadd f15, f19, f23, f15 + + LFD f28, 4 * SIZE(BO) + LFD f29, 5 * SIZE(BO) + LFD f30, 6 * SIZE(BO) + LFD f31, 7 * SIZE(BO) + + fmadd f1, f17, f20, f1 + fmadd f2, f18, f20, f2 + fmadd f3, f19, f20, f3 + fmadd f4, f16, f21, f4 + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + fmadd f6, f18, f21, f6 + fmadd f7, f19, f21, f7 + fmadd f8, f16, f22, f8 + fmadd f9, f17, f22, f9 + + fmadd f11, f19, f22, f11 + fmadd f12, f16, f23, f12 + fmadd f13, f17, f23, f13 + fmadd f14, f18, f23, f14 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + fmadd f0, f24, f28, f0 + fmadd f5, f25, f29, f5 + fmadd f10, f26, f30, f10 + fmadd f15, f27, f31, f15 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + fmadd f1, f25, f28, f1 + fmadd f2, f26, f28, f2 + fmadd f3, f27, f28, f3 + fmadd f4, f24, f29, f4 + + fmadd f6, f26, f29, f6 + fmadd f7, f27, f29, f7 + fmadd f8, f24, f30, f8 + fmadd f9, f25, f30, f9 + + fmadd f11, f27, f30, f11 + fmadd f12, f24, f31, f12 + fmadd f13, f25, f31, f13 + fmadd f14, f26, f31, f14 + + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + fmadd f0, f16, f20, f0 + fmadd f5, f17, f21, f5 + fmadd f10, f18, f22, f10 + fmadd f15, f19, f23, f15 + + LFD f24, 12 * SIZE(AO) + LFD f25, 13 * SIZE(AO) + LFD f26, 14 * SIZE(AO) + LFD f27, 15 * SIZE(AO) + + fmadd f1, f17, f20, f1 + fmadd f2, f18, f20, f2 + fmadd f3, f19, f20, f3 + fmadd f4, f16, f21, f4 + + fmadd f6, f18, f21, f6 + fmadd f7, f19, f21, f7 + fmadd f8, f16, f22, f8 + fmadd f9, f17, f22, f9 + + fmadd f11, f19, f22, f11 + fmadd f12, f16, f23, f12 + fmadd f13, f17, f23, f13 + fmadd f14, f18, f23, f14 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + fmadd f0, f24, f28, f0 + fmadd f5, f25, f29, f5 + fmadd f10, f26, f30, f10 + fmadd f15, f27, f31, f15 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + fmadd f1, f25, f28, f1 + fmadd f2, f26, f28, f2 + fmadd f3, f27, f28, f3 + fmadd f4, f24, f29, f4 + + fmadd f6, f26, f29, f6 + fmadd f7, f27, f29, f7 + fmadd f8, f24, f30, f8 + fmadd f9, f25, f30, f9 + + fmadd f11, f27, f30, f11 + fmadd f12, f24, f31, f12 + fmadd f13, f25, f31, f13 + fmadd f14, f26, f31, f14 + + addi AO, AO, 16 * SIZE + addi BO, BO, 16 * SIZE +#ifdef PPC970 +#ifndef ALLOC_HUGETLB + DCBT(AO, PREA) +#endif + DCBT(BO, PREB) +#endif + +#ifdef POWER4 +#ifndef ALLOC_HUGETLB + DCBT(AO, PREA) +#endif + DCBT(BO, PREB) +#endif + +#ifdef POWER5 + DCBT(AO, PREA) + DCBT(BO, PREB) +#endif + bdnz LL(12) + .align 4 + +LL(15): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble LL(KERNEL_MainFinish) + .align 4 + +LL(16): + fmadd f0, f16, f20, f0 + fmadd f5, f17, f21, f5 + fmadd f10, f18, f22, f10 + fmadd f15, f19, f23, f15 + + fmadd f1, f17, f20, f1 + fmadd f2, f18, f20, f2 + fmadd f3, f19, f20, f3 + fmadd f4, f16, f21, f4 + + fmadd f6, f18, f21, f6 + fmadd f7, f19, f21, f7 + fmadd f8, f16, f22, f8 + fmadd f9, f17, f22, f9 + + fmadd f11, f19, f22, f11 + fmadd f12, f16, f23, f12 + fmadd f13, f17, f23, f13 + fmadd f14, f18, f23, f14 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(16) + .align 4 + +LL(KERNEL_MainFinish): +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 + + FSUB f8, f8, f13 + FADD f9, f9, f12 + FSUB f10, f10, f15 + FADD f11, f11, f14 + +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 + + FADD f8, f8, f13 + FSUB f9, f12, f9 + FADD f10, f10, f15 + FSUB f11, f14, f11 +#endif + +#if defined(LN) || defined(RT) + subi r0, KK, 2 + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f8, f18, f8 + FSUB f9, f19, f9 + + FSUB f2, f20, f2 + FSUB f3, f21, f3 + FSUB f10, f22, f10 + FSUB f11, f23, f11 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f8, f20, f8 + FSUB f9, f21, f9 + FSUB f10, f22, f10 + FSUB f11, f23, f11 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 + FSUB f2, f18, f2 + FADD f3, f19, f3 + + FSUB f8, f20, f8 + FADD f9, f21, f9 + FSUB f10, f22, f10 + FADD f11, f23, f11 +#endif +#endif + +#ifdef LN + LFD f16, 6 * SIZE(AO) + LFD f17, 7 * SIZE(AO) + LFD f18, 4 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f6, f17, f3 + FMUL f7, f17, f2 + FMUL f14, f17, f11 + FMUL f15, f17, f10 + +#ifndef CONJ + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + FMSUB f10, f16, f10, f14 + FMADD f11, f16, f11, f15 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + FMADD f8, f19, f11, f8 + FNMSUB f9, f19, f10, f9 + + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + FNMSUB f8, f18, f10, f8 + FNMSUB f9, f18, f11, f9 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f9 + FMUL f13, f21, f8 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f8, f20, f8, f12 + FMADD f9, f20, f9, f13 + +#else + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + FMADD f10, f16, f10, f14 + FMSUB f11, f16, f11, f15 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + FMSUB f8, f19, f11, f8 + FNMADD f9, f19, f10, f9 + + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + FNMADD f8, f18, f10, f8 + FNMADD f9, f18, f11, f9 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f9 + FMUL f13, f21, f8 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f8, f20, f8, f12 + FMSUB f9, f20, f9, f13 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + 
LFD f19, 3 * SIZE(AO) + LFD f20, 6 * SIZE(AO) + LFD f21, 7 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f12, f17, f9 + FMUL f13, f17, f8 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f8, f16, f8, f12 + FMADD f9, f16, f9, f13 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + FMADD f10, f19, f9, f10 + FNMSUB f11, f19, f8, f11 + + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + FNMSUB f10, f18, f8, f10 + FNMSUB f11, f18, f9, f11 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMUL f12, f21, f11 + FMUL f13, f21, f10 + + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 + FMSUB f10, f20, f10, f12 + FMADD f11, f20, f11, f13 + +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f8, f16, f8, f12 + FMSUB f9, f16, f9, f13 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + FMSUB f10, f19, f9, f10 + FNMADD f11, f19, f8, f11 + + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + FNMADD f10, f18, f8, f10 + FNMADD f11, f18, f9, f11 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMUL f12, f21, f11 + FMUL f13, f21, f10 + + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 + FMADD f10, f20, f10, f12 + FMSUB f11, f20, f11, f13 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + LFD f20, 6 * SIZE(BO) + LFD f21, 7 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + + FMADD f8, f19, f1, f8 + FNMSUB f9, f19, f0, f9 + FMADD f10, f19, f3, f10 + FNMSUB f11, f19, f2, f11 + + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f10, f18, f2, f10 + FNMSUB f11, f18, f3, f11 + + FMUL f4, f21, f9 + FMUL f5, f21, f8 + FMUL f6, f21, f11 + FMUL f7, f21, f10 + + FMSUB f8, f20, f8, f4 + FMADD f9, f20, f9, f5 + FMSUB f10, f20, f10, f6 + FMADD f11, f20, f11, f7 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + + FMSUB f8, f19, f1, f8 + FNMADD f9, f19, f0, f9 + FMSUB f10, f19, f3, f10 + FNMADD f11, f19, f2, f11 + + FNMADD f8, f18, f0, f8 + FNMADD f9, f18, f1, f9 + FNMADD f10, f18, f2, f10 + FNMADD f11, f18, f3, f11 + + FMUL f4, f21, f9 + FMUL f5, f21, f8 + FMUL f6, f21, f11 + FMUL f7, f21, f10 + + FMADD f8, f20, f8, f4 + FMSUB f9, f20, f9, f5 + FMADD f10, f20, f10, f6 + FMSUB f11, f20, f11, f7 +#endif +#endif + +#ifdef RT + LFD f16, 6 * SIZE(BO) + LFD f17, 7 * SIZE(BO) + LFD f18, 4 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f12, f17, f9 + FMUL f13, f17, f8 + FMUL f14, f17, f11 + FMUL f15, f17, f10 + +#ifndef CONJ + FMSUB f8, f16, f8, f12 + FMADD f9, f16, f9, f13 + FMSUB f10, f16, f10, f14 + FMADD f11, f16, f11, f15 + + FMADD f0, f19, f9, f0 + FNMSUB f1, f19, f8, f1 + FMADD f2, f19, f11, f2 + FNMSUB f3, f19, f10, f3 + + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + FNMSUB f2, f18, f10, f2 + FNMSUB f3, f18, f11, f3 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f6 + FMADD f3, f20, f3, f7 + +#else + FMADD f8, f16, f8, f12 + FMSUB f9, f16, f9, f13 + FMADD f10, f16, f10, f14 + FMSUB f11, f16, f11, f15 + + FMSUB f0, f19, f9, f0 + FNMADD f1, f19, f8, f1 + FMSUB f2, f19, f11, f2 + FNMADD f3, f19, f10, f3 + + FNMADD f0, f18, f8, f0 + FNMADD f1, f18, f9, f1 + FNMADD f2, f18, f10, f2 + FNMADD f3, f18, f11, f3 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + 
FMUL f7, f21, f2 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f6 + FMSUB f3, f20, f3, f7 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f9, 3 * SIZE(BO) + + STFD f2, 4 * SIZE(BO) + STFD f3, 5 * SIZE(BO) + STFD f10, 6 * SIZE(BO) + STFD f11, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f8, 4 * SIZE(AO) + STFD f9, 5 * SIZE(AO) + STFD f10, 6 * SIZE(AO) + STFD f11, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f8, 0 * SIZE(CO2) + STFD f9, 1 * SIZE(CO2) + STFD f10, 2 * SIZE(CO2) + STFD f11, 3 * SIZE(CO2) + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + addic. I, I, -1 + bgt LL(11) + .align 4 + +LL(20): + andi. I, M, 1 + ble LL(29) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(25) + .align 4 + +LL(22): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi BO, BO, 16 * SIZE + addi AO, AO, 8 * SIZE + bdnz LL(22) + .align 4 + +LL(25): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble LL(27) + .align 4 + +LL(26): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + addi AO, AO, 2 * SIZE + addi BO, BO, 4 * SIZE + bdnz LL(26) + .align 4 + +LL(27): +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 +#else +#if defined(LN) || defined(LT) + FADD f0, f0, f5 + FSUB f1, f1, f4 + FADD f2, f2, f7 + FSUB f3, f3, f6 +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 +#endif +#endif + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + ZBASE_SHIFT + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f20, f2 + FSUB f3, f21, f3 +#endif + +#ifdef LN + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f3 + FMUL f13, f21, f2 + +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f12 + FMADD f3, f20, f3, f13 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f12 + FMSUB f3, f20, f3, f13 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f12, f17, f3 + FMUL f13, f17, 
f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f12 + FMADD f3, f16, f3, f13 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f12 + FMSUB f3, f16, f3, f13 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + LFD f20, 6 * SIZE(BO) + LFD f21, 7 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 +#endif +#endif + +#ifdef RT + LFD f16, 6 * SIZE(BO) + LFD f17, 7 * SIZE(BO) + LFD f18, 4 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f12, f17, f3 + FMUL f13, f17, f2 + +#ifndef CONJ + FMSUB f2, f16, f2, f12 + FMADD f3, f16, f3, f13 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f2, f16, f2, f12 + FMSUB f3, f16, f3, f13 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +LL(29): +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + + addic. J, J, -1 + bgt LL(10) + .align 4 + +LL(30): + andi. J, N, 1 + ble LL(999) + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + srawi. 
I, M, 1 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, C, LDC +#endif + ble LL(40) + .align 4 + +LL(31): +#if defined(LT) || defined(RN) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(B) + LFD f17, 1 * SIZE(B) + LFD f18, 2 * SIZE(B) + LFD f19, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + dcbt CO1, PREC + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(35) + .align 4 + +LL(32): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 8 * SIZE(AO) + LFD f21, 9 * SIZE(AO) + LFD f22, 10 * SIZE(AO) + LFD f23, 11 * SIZE(AO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f24, 12 * SIZE(AO) + LFD f25, 13 * SIZE(AO) + LFD f26, 14 * SIZE(AO) + LFD f27, 15 * SIZE(AO) + + LFD f16, 4 * SIZE(BO) + LFD f17, 5 * SIZE(BO) + LFD f18, 6 * SIZE(BO) + LFD f19, 7 * SIZE(BO) + + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 16 * SIZE(AO) + LFD f21, 17 * SIZE(AO) + LFD f22, 18 * SIZE(AO) + LFD f23, 19 * SIZE(AO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f24, 20 * SIZE(AO) + LFD f25, 21 * SIZE(AO) + LFD f26, 22 * SIZE(AO) + LFD f27, 23 * SIZE(AO) + + LFD f16, 8 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 10 * SIZE(BO) + LFD f19, 11 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 8 * SIZE + dcbt PREA, AO + dcbt PREA, BO + bdnz LL(32) + .align 4 + +LL(35): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble LL(37) + .align 4 + +LL(36): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + LFD f16, 2 * SIZE(BO) + LFD f17, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(36) + .align 4 + +LL(37): +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 +#endif + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + ZBASE_SHIFT + slwi r0, r0, 0 + ZBASE_SHIFT + + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 + FSUB f2, f18, f2 + FADD f3, f19, f3 +#endif +#endif + +#ifdef LN + LFD f16, 6 * SIZE(AO) + LFD f17, 7 * SIZE(AO) + LFD f18, 4 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + LFD f20, 6 * SIZE(AO) + LFD f21, 7 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 +#endif +#endif + +#ifdef RT + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f6 + FMADD f3, f20, f3, f7 + +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f6 + FMSUB f3, f20, f3, f7 +#endif +#endif + 
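For orientation: each of the LN/LT/RN/RT branches above ends with the same FMUL/FMSUB/FMADD group. It multiplies the partially reduced right-hand-side entry by the matching diagonal entry of the triangular factor, and the CONJ variant multiplies by the conjugate instead. Since only multiplies appear, the packed diagonal entries are presumably stored already inverted by the corresponding copy routines; that reciprocal-diagonal convention, and the helper name below, are assumptions for illustration and are not part of the imported source. A minimal C sketch of the step (register numbers vary per branch; the comments reference the LT/RN form):

/*
 * One diagonal step of the complex back-substitution, mirroring the
 * FMUL/FMSUB/FMADD pattern in the kernel.  dr/di hold the (assumed
 * pre-inverted) diagonal entry, xr/xi the entry being solved for.
 */
void zsolve_diag_step(double dr, double di,
                      double *xr, double *xi, int conj_flag)
{
    double t_im = di * (*xi);           /* FMUL  f4, f17, f1   (d_im * x_im) */
    double t_re = di * (*xr);           /* FMUL  f5, f17, f0   (d_im * x_re) */

    if (!conj_flag) {                   /* #ifndef CONJ:  x = d * x          */
        double nr = dr * (*xr) - t_im;  /* FMSUB f0, f16, f0, f4 */
        double ni = dr * (*xi) + t_re;  /* FMADD f1, f16, f1, f5 */
        *xr = nr;
        *xi = ni;
    } else {                            /* CONJ:          x = conj(d) * x    */
        double nr = dr * (*xr) + t_im;  /* FMADD f0, f16, f0, f4 */
        double ni = dr * (*xi) - t_re;  /* FMSUB f1, f16, f1, f5 */
        *xr = nr;
        *xi = ni;
    }
}

Storing 1/d at pack time appears to be the design choice that keeps complex division out of this inner solve loop; the reciprocal would then be paid for once per diagonal element in the packing routine rather than once per solved entry here.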
+#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + addic. I, I, -1 + bgt LL(31) + .align 4 + +LL(40): + andi. I, M, 1 + ble LL(49) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, r0 + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(45) + .align 4 + +LL(42): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f17, f20, f2 + fmadd f3, f16, f21, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + + fmadd f4, f18, f22, f4 + fmadd f5, f19, f23, f5 + fmadd f6, f19, f22, f6 + fmadd f7, f18, f23, f7 + + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f17, f20, f2 + fmadd f3, f16, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + + fmadd f4, f18, f22, f4 + fmadd f5, f19, f23, f5 + fmadd f6, f19, f22, f6 + fmadd f7, f18, f23, f7 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(42) + .align 4 + +LL(45): + fadd f0, f0, f4 + fadd f1, f1, f5 + fadd f2, f2, f6 + fadd f3, f3, f7 + +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR,r0 + ble LL(47) + .align 4 + +LL(46): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f17, f20, f2 + fmadd f3, f16, f21, f3 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 2 * SIZE + + bdnz LL(46) + .align 4 + +LL(47): +#ifndef CONJ + FSUB f0, f0, f1 + FADD f1, f2, f3 +#else + FADD f0, f0, f1 + FSUB f1, f3, f2 +#endif + +#if defined(LN) || defined(RT) + subi r0, KK, 1 + slwi r0, r0, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 +#endif +#endif + +#ifdef LN + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 +#endif +#endif + +#ifdef RT + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +LL(49): +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 1 +#endif + +#ifdef RT + subi KK, KK, 1 +#endif + .align 4 + +LL(999): + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/ztrsm_kernel_RT.S b/kernel/power/ztrsm_kernel_RT.S 
new file mode 100644 index 0000000000..55bc29b1dd --- /dev/null +++ b/kernel/power/ztrsm_kernel_RT.S @@ -0,0 +1,2289 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA_R 296(SP) +#define ALPHA_I 304(SP) +#define FZERO 312(SP) +#else +#define STACKSIZE 256 +#define ALPHA_R 224(SP) +#define ALPHA_I 232(SP) +#define FZERO 240(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#define OFFSET r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#define AORIG r21 +#define TEMP r22 +#define KK r23 +#define I r24 +#define J r25 +#define AO r26 +#define BO r27 +#define CO1 r28 +#define CO2 r29 + +#define PREA r30 +#define PREC r31 +#define PREB PREA + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) +#endif + + stw r0, FZERO + +#ifdef linux +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, 56 + STACKSIZE(SP) + lwz C, 60 + STACKSIZE(SP) + lwz LDC, 64 + STACKSIZE(SP) +#else + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 120 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 120 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 68 + STACKSIZE(SP) +#else + lwz OFFSET, 60 + STACKSIZE(SP) +#endif +#endif +#endif + + slwi LDC, LDC, ZBASE_SHIFT + +#ifdef LN + mullw r0, M, K + slwi r0, r0, ZBASE_SHIFT + add A, A, r0 + + slwi r0, M, ZBASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, ZBASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + +#ifndef PREFETCHTEST +#ifdef PPC970 + li PREC, 4 * SIZE +#endif +#ifdef POWER4 + li PREC, 4 * SIZE /* is 12 best? */ +#endif +#ifdef POWER5 + li PREC, 4 * SIZE /* is 12 best? 
*/ +#endif +#else + +#ifdef linux +#ifndef __64BIT__ + lwz PREA, 16 + STACKSIZE(SP) + lwz PREC, 20 + STACKSIZE(SP) +#else + ld PREA, 136 + STACKSIZE(SP) + ld PREC, 144 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld PREA, 136 + STACKSIZE(SP) + ld PREC, 144 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz PREA, 72 + STACKSIZE(SP) + lwz PREC, 76 + STACKSIZE(SP) +#else + lwz PREA, 68 + STACKSIZE(SP) + lwz PREC, 72 + STACKSIZE(SP) +#endif +#endif +#endif + +#endif + +#ifndef PREFETCHTEST +#ifdef PPC970 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 5 * SIZE + 16) +#else + li PREA, (16 * 9 * SIZE + 16) +#endif +#endif +#ifdef POWER4 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 1 * SIZE + 16) +#else + li PREA, (16 * 2 * SIZE + 16) +#endif +#endif +#ifdef POWER5 +#ifdef ALLOC_HUGETLB + li PREA, (16 * 7 * SIZE | 1) + li PREB, (16 * 7 * SIZE | 3) +#else + li PREA, (16 * 12 * SIZE | 1) + li PREB, (16 * 6 * SIZE | 3) +#endif +#endif +#endif + + andi. J, N, 1 + ble LL(30) + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + srawi. I, M, 1 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, C, LDC +#endif + ble LL(40) + .align 4 + +LL(31): +#if defined(LT) || defined(RN) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(B) + LFD f17, 1 * SIZE(B) + LFD f18, 2 * SIZE(B) + LFD f19, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + dcbt CO1, PREC + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(35) + .align 4 + +LL(32): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 8 * SIZE(AO) + LFD f21, 9 * SIZE(AO) + LFD f22, 10 * SIZE(AO) + LFD f23, 11 * SIZE(AO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f24, 12 * SIZE(AO) + LFD f25, 13 * SIZE(AO) + LFD f26, 14 * SIZE(AO) + LFD f27, 15 * SIZE(AO) + + LFD f16, 4 * SIZE(BO) + LFD f17, 5 * SIZE(BO) + LFD f18, 6 * SIZE(BO) + LFD f19, 7 * SIZE(BO) + + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 16 * SIZE(AO) + LFD f21, 17 * SIZE(AO) + LFD f22, 18 * SIZE(AO) + LFD f23, 19 * SIZE(AO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f24, 20 * SIZE(AO) + LFD f25, 21 * SIZE(AO) + LFD f26, 22 * SIZE(AO) + LFD f27, 23 * SIZE(AO) + + LFD f16, 8 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 10 * SIZE(BO) + LFD f19, 11 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 8 * SIZE + dcbt PREA, AO + dcbt PREA, BO + bdnz LL(32) + .align 4 + +LL(35): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble LL(37) + .align 4 + +LL(36): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + LFD f16, 2 * SIZE(BO) + LFD f17, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(36) + .align 4 + +LL(37): +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 +#endif + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + ZBASE_SHIFT + slwi r0, r0, 0 + ZBASE_SHIFT + + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 + FSUB f2, f18, f2 + FADD f3, f19, f3 +#endif +#endif + +#ifdef LN + LFD f16, 6 * SIZE(AO) + LFD f17, 7 * SIZE(AO) + LFD f18, 4 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f2, f16, f2, f6 + FMSUB f3, 
f16, f3, f7 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + LFD f20, 6 * SIZE(AO) + LFD f21, 7 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 +#endif +#endif + +#ifdef RT + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f6 + FMADD f3, f20, f3, f7 + +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f6 + FMSUB f3, f20, f3, f7 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + addic. I, I, -1 + bgt LL(31) + .align 4 + +LL(40): + andi. I, M, 1 + ble LL(49) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, r0 + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(45) + .align 4 + +LL(42): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f17, f20, f2 + fmadd f3, f16, f21, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + + fmadd f4, f18, f22, f4 + fmadd f5, f19, f23, f5 + fmadd f6, f19, f22, f6 + fmadd f7, f18, f23, f7 + + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f17, f20, f2 + fmadd f3, f16, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + + fmadd f4, f18, f22, f4 + fmadd f5, f19, f23, f5 + fmadd f6, f19, f22, f6 + fmadd f7, f18, f23, f7 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(42) + .align 4 + +LL(45): + fadd f0, f0, f4 + fadd f1, f1, f5 + fadd f2, f2, f6 + fadd f3, f3, f7 + +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR,r0 + ble LL(47) + .align 4 + +LL(46): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f17, f20, f2 + fmadd f3, f16, f21, f3 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 2 * SIZE + + bdnz LL(46) + .align 4 + +LL(47): +#ifndef CONJ + FSUB f0, f0, f1 + FADD f1, f2, f3 +#else + FADD f0, f0, f1 + FSUB f1, f3, f2 +#endif + +#if defined(LN) || defined(RT) + subi r0, KK, 1 + slwi r0, r0, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 +#endif +#endif + +#ifdef LN + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 +#endif +#endif + +#ifdef RT + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +LL(49): +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + add 
B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 1 +#endif + +#ifdef RT + subi KK, KK, 1 +#endif + .align 4 + + +LL(30): + srawi. J, N, 1 + ble LL(999) + .align 4 + +LL(10): +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + srawi. I, M, 1 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO2, LDC +#endif + ble LL(20) + .align 4 + +LL(11): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + dcbt CO1, PREC + dcbt CO2, PREC + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + dcbt CO1, PREC + dcbt CO2, PREC + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(15) + .align 4 + +LL(12): + fmadd f0, f16, f20, f0 + fmadd f5, f17, f21, f5 + fmadd f10, f18, f22, f10 + fmadd f15, f19, f23, f15 + + LFD f28, 4 * SIZE(BO) + LFD f29, 5 * SIZE(BO) + LFD f30, 6 * SIZE(BO) + LFD f31, 7 * SIZE(BO) + + fmadd f1, f17, f20, f1 + fmadd f2, f18, f20, f2 + fmadd f3, f19, f20, f3 + fmadd f4, f16, f21, f4 + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + fmadd f6, f18, f21, f6 + fmadd f7, f19, f21, f7 + fmadd f8, f16, f22, f8 + fmadd f9, f17, f22, f9 + + fmadd f11, f19, f22, f11 + fmadd f12, f16, f23, f12 + fmadd f13, f17, f23, f13 + fmadd f14, f18, f23, f14 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + fmadd f0, f24, f28, f0 + fmadd f5, f25, f29, f5 + fmadd f10, f26, f30, f10 + fmadd f15, f27, f31, f15 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + fmadd f1, f25, f28, f1 + fmadd f2, f26, f28, f2 + fmadd f3, f27, f28, f3 + fmadd f4, f24, f29, f4 + + fmadd f6, f26, f29, f6 + fmadd f7, f27, f29, f7 + fmadd f8, f24, f30, f8 + fmadd f9, f25, f30, f9 + + fmadd f11, f27, f30, f11 + fmadd f12, f24, f31, f12 + fmadd f13, f25, f31, f13 + fmadd f14, f26, f31, f14 + + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + fmadd f0, f16, f20, f0 + fmadd f5, f17, f21, f5 + fmadd f10, f18, f22, f10 + fmadd f15, f19, f23, f15 + + LFD f24, 12 * SIZE(AO) + LFD f25, 13 * SIZE(AO) + LFD f26, 14 * SIZE(AO) + LFD f27, 15 * SIZE(AO) + + fmadd f1, f17, f20, f1 + fmadd f2, f18, f20, f2 + fmadd f3, f19, f20, f3 + fmadd f4, f16, f21, f4 + + fmadd f6, f18, f21, f6 + fmadd f7, f19, f21, f7 + fmadd f8, 
f16, f22, f8 + fmadd f9, f17, f22, f9 + + fmadd f11, f19, f22, f11 + fmadd f12, f16, f23, f12 + fmadd f13, f17, f23, f13 + fmadd f14, f18, f23, f14 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + fmadd f0, f24, f28, f0 + fmadd f5, f25, f29, f5 + fmadd f10, f26, f30, f10 + fmadd f15, f27, f31, f15 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + fmadd f1, f25, f28, f1 + fmadd f2, f26, f28, f2 + fmadd f3, f27, f28, f3 + fmadd f4, f24, f29, f4 + + fmadd f6, f26, f29, f6 + fmadd f7, f27, f29, f7 + fmadd f8, f24, f30, f8 + fmadd f9, f25, f30, f9 + + fmadd f11, f27, f30, f11 + fmadd f12, f24, f31, f12 + fmadd f13, f25, f31, f13 + fmadd f14, f26, f31, f14 + + addi AO, AO, 16 * SIZE + addi BO, BO, 16 * SIZE +#ifdef PPC970 +#ifndef ALLOC_HUGETLB + DCBT(AO, PREA) +#endif + DCBT(BO, PREB) +#endif + +#ifdef POWER4 +#ifndef ALLOC_HUGETLB + DCBT(AO, PREA) +#endif + DCBT(BO, PREB) +#endif + +#ifdef POWER5 + DCBT(AO, PREA) + DCBT(BO, PREB) +#endif + bdnz LL(12) + .align 4 + +LL(15): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble LL(KERNEL_MainFinish) + .align 4 + +LL(16): + fmadd f0, f16, f20, f0 + fmadd f5, f17, f21, f5 + fmadd f10, f18, f22, f10 + fmadd f15, f19, f23, f15 + + fmadd f1, f17, f20, f1 + fmadd f2, f18, f20, f2 + fmadd f3, f19, f20, f3 + fmadd f4, f16, f21, f4 + + fmadd f6, f18, f21, f6 + fmadd f7, f19, f21, f7 + fmadd f8, f16, f22, f8 + fmadd f9, f17, f22, f9 + + fmadd f11, f19, f22, f11 + fmadd f12, f16, f23, f12 + fmadd f13, f17, f23, f13 + fmadd f14, f18, f23, f14 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(16) + .align 4 + +LL(KERNEL_MainFinish): +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 + + FSUB f8, f8, f13 + FADD f9, f9, f12 + FSUB f10, f10, f15 + FADD f11, f11, f14 + +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 + + FADD f8, f8, f13 + FSUB f9, f12, f9 + FADD f10, f10, f15 + FSUB f11, f14, f11 +#endif + +#if defined(LN) || defined(RT) + subi r0, KK, 2 + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f8, f18, f8 + FSUB f9, f19, f9 + + FSUB f2, f20, f2 + FSUB f3, f21, f3 + FSUB f10, f22, f10 + FSUB f11, f23, f11 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f8, f20, f8 + FSUB f9, f21, f9 + FSUB f10, f22, f10 + FSUB f11, f23, f11 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 + FSUB f2, f18, f2 + FADD f3, f19, f3 + + FSUB f8, f20, f8 + FADD f9, f21, f9 + FSUB f10, f22, f10 + FADD f11, f23, f11 +#endif +#endif + +#ifdef LN + LFD f16, 6 * SIZE(AO) + LFD f17, 7 * SIZE(AO) + LFD f18, 4 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f6, f17, f3 + 
FMUL f7, f17, f2 + FMUL f14, f17, f11 + FMUL f15, f17, f10 + +#ifndef CONJ + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + FMSUB f10, f16, f10, f14 + FMADD f11, f16, f11, f15 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + FMADD f8, f19, f11, f8 + FNMSUB f9, f19, f10, f9 + + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + FNMSUB f8, f18, f10, f8 + FNMSUB f9, f18, f11, f9 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f9 + FMUL f13, f21, f8 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f8, f20, f8, f12 + FMADD f9, f20, f9, f13 + +#else + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + FMADD f10, f16, f10, f14 + FMSUB f11, f16, f11, f15 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + FMSUB f8, f19, f11, f8 + FNMADD f9, f19, f10, f9 + + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + FNMADD f8, f18, f10, f8 + FNMADD f9, f18, f11, f9 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f9 + FMUL f13, f21, f8 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f8, f20, f8, f12 + FMSUB f9, f20, f9, f13 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + LFD f20, 6 * SIZE(AO) + LFD f21, 7 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f12, f17, f9 + FMUL f13, f17, f8 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f8, f16, f8, f12 + FMADD f9, f16, f9, f13 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + FMADD f10, f19, f9, f10 + FNMSUB f11, f19, f8, f11 + + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + FNMSUB f10, f18, f8, f10 + FNMSUB f11, f18, f9, f11 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMUL f12, f21, f11 + FMUL f13, f21, f10 + + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 + FMSUB f10, f20, f10, f12 + FMADD f11, f20, f11, f13 + +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f8, f16, f8, f12 + FMSUB f9, f16, f9, f13 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + FMSUB f10, f19, f9, f10 + FNMADD f11, f19, f8, f11 + + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + FNMADD f10, f18, f8, f10 + FNMADD f11, f18, f9, f11 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMUL f12, f21, f11 + FMUL f13, f21, f10 + + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 + FMADD f10, f20, f10, f12 + FMSUB f11, f20, f11, f13 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + LFD f20, 6 * SIZE(BO) + LFD f21, 7 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + + FMADD f8, f19, f1, f8 + FNMSUB f9, f19, f0, f9 + FMADD f10, f19, f3, f10 + FNMSUB f11, f19, f2, f11 + + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f10, f18, f2, f10 + FNMSUB f11, f18, f3, f11 + + FMUL f4, f21, f9 + FMUL f5, f21, f8 + FMUL f6, f21, f11 + FMUL f7, f21, f10 + + FMSUB f8, f20, f8, f4 + FMADD f9, f20, f9, f5 + FMSUB f10, f20, f10, f6 + FMADD f11, f20, f11, f7 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + + FMSUB f8, f19, f1, f8 + FNMADD f9, f19, f0, f9 + FMSUB f10, f19, f3, f10 + FNMADD f11, f19, f2, f11 + + FNMADD f8, f18, f0, f8 + FNMADD f9, f18, f1, f9 + FNMADD f10, f18, f2, f10 + FNMADD f11, f18, f3, f11 + + FMUL f4, f21, f9 + FMUL f5, f21, f8 + FMUL f6, f21, f11 + FMUL f7, f21, f10 + + FMADD f8, f20, f8, f4 + FMSUB f9, f20, f9, f5 + FMADD f10, f20, f10, f6 + 
FMSUB f11, f20, f11, f7 +#endif +#endif + +#ifdef RT + LFD f16, 6 * SIZE(BO) + LFD f17, 7 * SIZE(BO) + LFD f18, 4 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f12, f17, f9 + FMUL f13, f17, f8 + FMUL f14, f17, f11 + FMUL f15, f17, f10 + +#ifndef CONJ + FMSUB f8, f16, f8, f12 + FMADD f9, f16, f9, f13 + FMSUB f10, f16, f10, f14 + FMADD f11, f16, f11, f15 + + FMADD f0, f19, f9, f0 + FNMSUB f1, f19, f8, f1 + FMADD f2, f19, f11, f2 + FNMSUB f3, f19, f10, f3 + + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + FNMSUB f2, f18, f10, f2 + FNMSUB f3, f18, f11, f3 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f6 + FMADD f3, f20, f3, f7 + +#else + FMADD f8, f16, f8, f12 + FMSUB f9, f16, f9, f13 + FMADD f10, f16, f10, f14 + FMSUB f11, f16, f11, f15 + + FMSUB f0, f19, f9, f0 + FNMADD f1, f19, f8, f1 + FMSUB f2, f19, f11, f2 + FNMADD f3, f19, f10, f3 + + FNMADD f0, f18, f8, f0 + FNMADD f1, f18, f9, f1 + FNMADD f2, f18, f10, f2 + FNMADD f3, f18, f11, f3 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f6 + FMSUB f3, f20, f3, f7 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f9, 3 * SIZE(BO) + + STFD f2, 4 * SIZE(BO) + STFD f3, 5 * SIZE(BO) + STFD f10, 6 * SIZE(BO) + STFD f11, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f8, 4 * SIZE(AO) + STFD f9, 5 * SIZE(AO) + STFD f10, 6 * SIZE(AO) + STFD f11, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f8, 0 * SIZE(CO2) + STFD f9, 1 * SIZE(CO2) + STFD f10, 2 * SIZE(CO2) + STFD f11, 3 * SIZE(CO2) + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + addic. I, I, -1 + bgt LL(11) + .align 4 + +LL(20): + andi. I, M, 1 + ble LL(29) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(25) + .align 4 + +LL(22): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi BO, BO, 16 * SIZE + addi AO, AO, 8 * SIZE + bdnz LL(22) + .align 4 + +LL(25): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble LL(27) + .align 4 + +LL(26): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + addi AO, AO, 2 * SIZE + addi BO, BO, 4 * SIZE + bdnz LL(26) + .align 4 + +LL(27): +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 +#else +#if defined(LN) || defined(LT) + FADD f0, f0, f5 + FSUB f1, f1, f4 + FADD f2, f2, f7 + FSUB f3, f3, f6 +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 +#endif +#endif + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + ZBASE_SHIFT + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f20, f2 + FSUB f3, f21, f3 +#endif + +#ifdef LN + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f3 + FMUL f13, f21, f2 + +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f12 + FMADD f3, f20, f3, f13 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f12 + FMSUB f3, f20, f3, f13 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f12, f17, f3 + FMUL f13, f17, 
f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f12 + FMADD f3, f16, f3, f13 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f12 + FMSUB f3, f16, f3, f13 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + LFD f20, 6 * SIZE(BO) + LFD f21, 7 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 +#endif +#endif + +#ifdef RT + LFD f16, 6 * SIZE(BO) + LFD f17, 7 * SIZE(BO) + LFD f18, 4 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f12, f17, f3 + FMUL f13, f17, f2 + +#ifndef CONJ + FMSUB f2, f16, f2, f12 + FMADD f3, f16, f3, f13 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f2, f16, f2, f12 + FMSUB f3, f16, f3, f13 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +LL(29): +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + + addic. 
J, J, -1 + bgt LL(10) + .align 4 + +LL(999): + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/ztrsm_kernel_cell_LN.S b/kernel/power/ztrsm_kernel_cell_LN.S new file mode 100644 index 0000000000..c284a0ed7c --- /dev/null +++ b/kernel/power/ztrsm_kernel_cell_LN.S @@ -0,0 +1,2252 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA_R 296(SP) +#define ALPHA_I 304(SP) +#define FZERO 312(SP) +#else +#define STACKSIZE 256 +#define ALPHA_R 224(SP) +#define ALPHA_I 232(SP) +#define FZERO 240(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#define OFFSET r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#define AORIG r21 +#define TEMP r22 +#define KK r23 +#define I r24 +#define J r25 +#define AO r26 +#define BO r27 +#define CO1 r28 +#define CO2 r29 + +#define PREA r30 +#define PREC r31 +#define PREB PREA + +#ifndef NEEDPARAM + +#ifndef DOUBLE +#include "cparam.h" +#else +#include "zparam.h" +#endif + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) +#endif + + stw r0, FZERO + +#ifdef linux +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, 56 + STACKSIZE(SP) + lwz C, 60 + STACKSIZE(SP) + lwz LDC, 64 + STACKSIZE(SP) +#else + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 120 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 120 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 68 + STACKSIZE(SP) +#else + lwz OFFSET, 60 + STACKSIZE(SP) +#endif +#endif +#endif + + slwi LDC, LDC, ZBASE_SHIFT + +#ifdef LN + mullw r0, M, K + slwi r0, r0, ZBASE_SHIFT + add A, A, r0 + + slwi r0, M, ZBASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, ZBASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + + + li PREC, -4 * SIZE + li PREA, 16 * 12 * SIZE + + srawi. 
J, N, 1 + ble LL(30) + .align 4 + +LL(10): +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO2, LDC +#endif + +LL(20): + andi. I, M, 1 + ble LL(09) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(25) + .align 4 + +LL(22): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi BO, BO, 16 * SIZE + addi AO, AO, 8 * SIZE + bdnz LL(22) + .align 4 + +LL(25): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble LL(27) + .align 4 + +LL(26): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + addi AO, AO, 2 * SIZE + addi BO, BO, 4 * SIZE + bdnz LL(26) + .align 4 + +LL(27): +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 +#endif + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + ZBASE_SHIFT + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f20, f2 + FSUB f3, f21, f3 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 + FSUB f2, f20, f2 + FADD f3, f21, f3 +#endif +#endif + +#ifdef LN + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f3 + FMUL f13, f21, f2 + +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f12 + FMADD f3, f20, f3, f13 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f12 + FMSUB f3, f20, f3, f13 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f12, f17, f3 + FMUL f13, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f12 + FMADD f3, f16, f3, f13 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f12 + FMSUB f3, f16, f3, f13 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + LFD f20, 6 * SIZE(BO) + LFD f21, 7 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 +#endif +#endif + +#ifdef RT + LFD f16, 6 * SIZE(BO) + LFD f17, 7 * SIZE(BO) + LFD f18, 4 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f12, f17, f9 + FMUL f13, f17, f8 + +#ifndef CONJ + FMSUB f2, f16, f2, f12 + FMADD f3, f16, f3, f13 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f2, f16, f2, f12 + FMSUB f3, f16, f3, f13 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + + FMADD f0, f20, f0, f4 + FMSUB f1, 
f20, f1, f5 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +LL(09): + srawi. I, M, 1 + ble LL(29) + .align 4 + +LL(11): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + dcbtst CO1, PREC + dcbtst CO2, PREC + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + + LFD f28, 4 * SIZE(BO) + LFD f29, 5 * SIZE(BO) + LFD f30, 6 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + dcbtst CO1, PREC + dcbtst CO2, PREC + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(15) + .align 4 + +#define NOP1 mr r18, r18 +#define NOP2 mr r19, r19 + +LL(12): + FMADD f0, f16, f20, f0 + dcbt AO, PREA + FMADD f4, f16, f21, f4 + dcbt BO, PREB + FMADD f8, f16, f22, f8 + LFD f31, 7 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFD f27, 7 * SIZE(AO) + + FMADD f1, f17, f20, f1 + LFD f16, 8 * SIZE(AO) + FMADD f5, f17, f21, f5 + NOP2 + FMADD f9, f17, f22, f9 + NOP1 + FMADD f13, f17, f23, f13 + LFD f17, 9 * SIZE(AO) + + FMADD f2, f18, f20, f2 + NOP1 + FMADD f6, f18, f21, f6 + NOP2 + FMADD f10, f18, f22, f10 + NOP1 + FMADD f14, f18, f23, f14 + LFD f18, 10 * SIZE(AO) + + FMADD f3, f19, f20, f3 + LFD f20, 8 * SIZE(BO) + FMADD f7, f19, f21, f7 + LFD f21, 9 * SIZE(BO) + FMADD f11, f19, f22, f11 + LFD f22, 10 * SIZE(BO) + FMADD f15, f19, f23, f15 + LFD f19, 11 * SIZE(AO) + + FMADD f0, f24, f28, f0 + LFD f23, 11 * SIZE(BO) + FMADD f4, f24, f29, f4 + NOP2 + FMADD f8, f24, f30, f8 + NOP1 + FMADD f12, f24, f31, f12 + LFD f24, 12 * SIZE(AO) + + FMADD f1, f25, f28, f1 + NOP1 + FMADD f5, f25, f29, f5 + NOP2 + FMADD f9, f25, f30, f9 + NOP1 + FMADD f13, f25, f31, f13 + LFD f25, 13 * SIZE(AO) + + FMADD f2, f26, f28, f2 + NOP1 + FMADD f6, f26, f29, f6 + NOP2 + FMADD f10, f26, f30, f10 + NOP1 + FMADD f14, f26, f31, f14 + LFD f26, 14 * SIZE(AO) + + FMADD f3, f27, f28, f3 + LFD f28, 12 * SIZE(BO) + FMADD f7, f27, f29, f7 + LFD f29, 13 * SIZE(BO) + FMADD f11, f27, f30, f11 + LFD f30, 14 * SIZE(BO) + FMADD f15, f27, f31, f15 + LFD f27, 15 * SIZE(AO) + + FMADD f0, f16, f20, f0 + LFD f31, 15 * SIZE(BO) + FMADD f4, f16, f21, f4 + NOP2 + FMADD f8, f16, f22, f8 + NOP1 + FMADD f12, f16, f23, f12 + LFD f16, 16 * SIZE(AO) + + FMADD f1, f17, f20, f1 + NOP1 + FMADD f5, f17, f21, f5 + NOP2 + FMADD f9, f17, f22, f9 + NOP1 + FMADD f13, f17, f23, f13 + LFD f17, 17 * SIZE(AO) + + FMADD f2, f18, f20, f2 + NOP1 + FMADD f6, f18, f21, f6 + NOP2 + FMADD f10, f18, f22, f10 + NOP1 + FMADD f14, f18, f23, f14 + LFD f18, 18 * SIZE(AO) + + FMADD f3, f19, f20, f3 + LFD f20, 16 * SIZE(BO) + FMADD f7, f19, f21, f7 + LFD f21, 17 * SIZE(BO) + FMADD f11, f19, f22, f11 + LFD f22, 18 * SIZE(BO) + FMADD f15, f19, f23, f15 + LFD f19, 19 * SIZE(AO) + + FMADD f0, f24, f28, f0 + LFD f23, 19 * SIZE(BO) + FMADD f4, f24, f29, f4 + NOP2 + FMADD f8, f24, f30, f8 + NOP1 + FMADD f12, f24, f31, f12 + LFD f24, 20 * SIZE(AO) + + FMADD f1, f25, f28, f1 + NOP1 + FMADD f5, f25, f29, f5 + NOP2 + FMADD f9, f25, f30, f9 + NOP1 + FMADD f13, f25, f31, f13 + LFD f25, 21 * SIZE(AO) + + FMADD f2, f26, f28, f2 + NOP1 + FMADD f6, f26, f29, f6 + NOP2 + FMADD f10, f26, f30, f10 + NOP1 + FMADD f14, f26, f31, f14 + LFD f26, 22 * SIZE(AO) + + FMADD f3, f27, f28, f3 + LFD f28, 20 * SIZE(BO) + FMADD f7, f27, f29, f7 + LFD f29, 21 * SIZE(BO) + FMADD f11, f27, f30, f11 + LFD f30, 22 * SIZE(BO) + FMADD f15, f27, f31, f15 + addi AO, AO, 16 * SIZE + + addi BO, BO, 16 * SIZE + bdnz LL(12) + .align 4 + +LL(15): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble LL(KERNEL_MainFinish) + .align 4 + +LL(16): + fmadd f0, f16, f20, f0 + fmadd f5, f17, f21, f5 + fmadd f10, f18, f22, f10 + fmadd f15, f19, f23, f15 + + fmadd f1, f17, f20, f1 + fmadd f2, f18, f20, f2 + fmadd f3, f19, f20, f3 + fmadd f4, f16, f21, f4 + + fmadd f6, f18, f21, f6 + fmadd f7, f19, f21, f7 + fmadd f8, f16, f22, f8 + fmadd f9, f17, f22, f9 + + fmadd f11, f19, f22, f11 + fmadd f12, f16, f23, f12 + fmadd f13, f17, f23, f13 + fmadd f14, f18, f23, f14 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(16) + .align 4 + +LL(KERNEL_MainFinish): +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 + + FSUB f8, f8, f13 + FADD f9, f9, f12 + FSUB f10, f10, f15 + FADD f11, f11, f14 + +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 + + FADD f8, f8, f13 + FSUB f9, f12, f9 + FADD f10, f10, f15 + FSUB f11, f14, f11 +#endif + +#if defined(LN) || defined(RT) + subi r0, KK, 2 + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f8, f18, f8 + FSUB f9, f19, f9 + + FSUB f2, f20, f2 + FSUB f3, f21, f3 + FSUB f10, f22, f10 + FSUB f11, f23, f11 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f8, f20, f8 + FSUB f9, f21, f9 + FSUB f10, f22, f10 + FSUB f11, f23, f11 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 + FSUB f2, f18, f2 + FADD f3, f19, f3 + + FSUB f8, f20, f8 + FADD f9, f21, f9 + FSUB f10, f22, f10 + FADD f11, f23, f11 +#endif +#endif + +#ifdef LN + LFD f16, 6 * SIZE(AO) + LFD f17, 7 * SIZE(AO) + LFD f18, 4 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f6, f17, f3 + FMUL f7, f17, f2 + FMUL f14, f17, f11 + FMUL f15, f17, f10 + +#ifndef CONJ + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + FMSUB f10, f16, f10, f14 + FMADD f11, f16, f11, f15 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + FMADD f8, f19, f11, f8 + FNMSUB f9, f19, f10, f9 + + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + FNMSUB f8, f18, f10, f8 + FNMSUB f9, f18, f11, f9 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f9 + FMUL f13, f21, f8 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f8, f20, f8, f12 + FMADD f9, f20, f9, f13 + +#else + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + FMADD f10, f16, f10, f14 + FMSUB f11, f16, f11, f15 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + FMSUB f8, f19, f11, f8 + FNMADD f9, f19, f10, f9 + + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + FNMADD f8, f18, f10, f8 + FNMADD f9, f18, f11, f9 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f9 + FMUL f13, f21, f8 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f8, f20, f8, f12 + FMSUB f9, f20, f9, f13 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + 
LFD f19, 3 * SIZE(AO) + LFD f20, 6 * SIZE(AO) + LFD f21, 7 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f12, f17, f9 + FMUL f13, f17, f8 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f8, f16, f8, f12 + FMADD f9, f16, f9, f13 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + FMADD f10, f19, f9, f10 + FNMSUB f11, f19, f8, f11 + + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + FNMSUB f10, f18, f8, f10 + FNMSUB f11, f18, f9, f11 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMUL f12, f21, f11 + FMUL f13, f21, f10 + + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 + FMSUB f10, f20, f10, f12 + FMADD f11, f20, f11, f13 + +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f8, f16, f8, f12 + FMSUB f9, f16, f9, f13 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + FMSUB f10, f19, f9, f10 + FNMADD f11, f19, f8, f11 + + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + FNMADD f10, f18, f8, f10 + FNMADD f11, f18, f9, f11 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMUL f12, f21, f11 + FMUL f13, f21, f10 + + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 + FMADD f10, f20, f10, f12 + FMSUB f11, f20, f11, f13 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + LFD f20, 6 * SIZE(BO) + LFD f21, 7 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + + FMADD f8, f19, f1, f8 + FNMSUB f9, f19, f0, f9 + FMADD f10, f19, f3, f10 + FNMSUB f11, f19, f2, f11 + + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f10, f18, f2, f10 + FNMSUB f11, f18, f3, f11 + + FMUL f4, f21, f9 + FMUL f5, f21, f8 + FMUL f6, f21, f11 + FMUL f7, f21, f10 + + FMSUB f8, f20, f8, f4 + FMADD f9, f20, f9, f5 + FMSUB f10, f20, f10, f6 + FMADD f11, f20, f11, f7 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + + FMSUB f8, f19, f1, f8 + FNMADD f9, f19, f0, f9 + FMSUB f10, f19, f3, f10 + FNMADD f11, f19, f2, f11 + + FNMADD f8, f18, f0, f8 + FNMADD f9, f18, f1, f9 + FNMADD f10, f18, f2, f10 + FNMADD f11, f18, f3, f11 + + FMUL f4, f21, f9 + FMUL f5, f21, f8 + FMUL f6, f21, f11 + FMUL f7, f21, f10 + + FMADD f8, f20, f8, f4 + FMSUB f9, f20, f9, f5 + FMADD f10, f20, f10, f6 + FMSUB f11, f20, f11, f7 +#endif +#endif + +#ifdef RT + LFD f16, 6 * SIZE(BO) + LFD f17, 7 * SIZE(BO) + LFD f18, 4 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f12, f17, f9 + FMUL f13, f17, f8 + FMUL f14, f17, f11 + FMUL f15, f17, f10 + +#ifndef CONJ + FMSUB f8, f16, f8, f12 + FMADD f9, f16, f9, f13 + FMSUB f10, f16, f10, f14 + FMADD f11, f16, f11, f15 + + FMADD f0, f19, f9, f0 + FNMSUB f1, f19, f8, f1 + FMADD f2, f19, f11, f2 + FNMSUB f3, f19, f10, f3 + + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + FNMSUB f2, f18, f10, f2 + FNMSUB f3, f18, f11, f3 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f6 + FMADD f3, f20, f3, f7 + +#else + FMADD f8, f16, f8, f12 + FMSUB f9, f16, f9, f13 + FMADD f10, f16, f10, f14 + FMSUB f11, f16, f11, f15 + + FMSUB f0, f19, f9, f0 + FNMADD f1, f19, f8, f1 + FMSUB f2, f19, f11, f2 + FNMADD f3, f19, f10, f3 + + FNMADD f0, f18, f8, f0 + FNMADD f1, f18, f9, f1 + FNMADD f2, f18, f10, f2 + FNMADD f3, f18, f11, f3 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + 
FMUL f7, f21, f2 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f6 + FMSUB f3, f20, f3, f7 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f9, 3 * SIZE(BO) + + STFD f2, 4 * SIZE(BO) + STFD f3, 5 * SIZE(BO) + STFD f10, 6 * SIZE(BO) + STFD f11, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f8, 4 * SIZE(AO) + STFD f9, 5 * SIZE(AO) + STFD f10, 6 * SIZE(AO) + STFD f11, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f8, 0 * SIZE(CO2) + STFD f9, 1 * SIZE(CO2) + STFD f10, 2 * SIZE(CO2) + STFD f11, 3 * SIZE(CO2) + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + addic. I, I, -1 + bgt LL(11) + .align 4 + +LL(29): +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + + addic. J, J, -1 + bgt LL(10) + .align 4 + +LL(30): + andi. J, N, 1 + ble LL(999) + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, C, LDC +#endif + + andi. I, M, 1 + ble LL(40) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, r0 + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(45) + .align 4 + +LL(42): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f17, f20, f2 + fmadd f3, f16, f21, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + + fmadd f4, f18, f22, f4 + fmadd f5, f19, f23, f5 + fmadd f6, f19, f22, f6 + fmadd f7, f18, f23, f7 + + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f17, f20, f2 + fmadd f3, f16, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + + fmadd f4, f18, f22, f4 + fmadd f5, f19, f23, f5 + fmadd f6, f19, f22, f6 + fmadd f7, f18, f23, f7 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(42) + .align 4 + +LL(45): + fadd f0, f0, f4 + fadd f1, f1, f5 + fadd f2, f2, f6 + fadd f3, f3, f7 + +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR,r0 + ble LL(47) + .align 4 + +LL(46): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f17, f20, f2 + fmadd f3, f16, f21, f3 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 2 * SIZE + + bdnz LL(46) + .align 4 + +LL(47): +#ifndef CONJ + FSUB f0, f0, f1 + FADD f1, f2, f3 +#else + FADD f0, f0, f1 + FSUB f1, f3, f2 +#endif + +#if defined(LN) || defined(RT) + subi r0, KK, 1 + slwi r0, r0, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 +#endif +#endif + +#ifdef LN + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 +#endif +#endif + +#ifdef RT + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +LL(40): + srawi. 
I, M, 1 + ble LL(49) + .align 4 + +LL(31): +#if defined(LT) || defined(RN) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(B) + LFD f17, 1 * SIZE(B) + LFD f18, 2 * SIZE(B) + LFD f19, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + dcbtst CO1, PREC + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(35) + .align 4 + +LL(32): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 8 * SIZE(AO) + LFD f21, 9 * SIZE(AO) + LFD f22, 10 * SIZE(AO) + LFD f23, 11 * SIZE(AO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f24, 12 * SIZE(AO) + LFD f25, 13 * SIZE(AO) + LFD f26, 14 * SIZE(AO) + LFD f27, 15 * SIZE(AO) + + LFD f16, 4 * SIZE(BO) + LFD f17, 5 * SIZE(BO) + LFD f18, 6 * SIZE(BO) + LFD f19, 7 * SIZE(BO) + + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 16 * SIZE(AO) + LFD f21, 17 * SIZE(AO) + LFD f22, 18 * SIZE(AO) + LFD f23, 19 * SIZE(AO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f24, 20 * SIZE(AO) + LFD f25, 21 * SIZE(AO) + LFD f26, 22 * SIZE(AO) + LFD f27, 23 * SIZE(AO) + + LFD f16, 8 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 10 * SIZE(BO) + LFD f19, 11 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 8 * SIZE + dcbt PREA, AO + dcbt PREA, BO + bdnz LL(32) + .align 4 + +LL(35): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble LL(37) + .align 4 + +LL(36): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + LFD f16, 2 * SIZE(BO) + LFD f17, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(36) + .align 4 + +LL(37): +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 +#endif + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + ZBASE_SHIFT + slwi r0, r0, 0 + ZBASE_SHIFT + + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 + FSUB f2, f18, f2 + FADD f3, f19, f3 +#endif +#endif + +#ifdef LN + LFD f16, 6 * SIZE(AO) + LFD f17, 7 * SIZE(AO) + LFD f18, 4 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + LFD f20, 6 * SIZE(AO) + LFD f21, 7 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 +#endif +#endif + +#ifdef RT + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f6 + FMADD f3, f20, f3, f7 + +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f6 + FMSUB f3, f20, f3, f7 +#endif +#endif + 
+#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + addic. I, I, -1 + bgt LL(31) + .align 4 + +LL(49): +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 1 +#endif + +#ifdef RT + subi KK, KK, 1 +#endif + .align 4 + +LL(999): + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/ztrsm_kernel_cell_LT.S b/kernel/power/ztrsm_kernel_cell_LT.S new file mode 100644 index 0000000000..ca80100913 --- /dev/null +++ b/kernel/power/ztrsm_kernel_cell_LT.S @@ -0,0 +1,2277 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA_R 296(SP) +#define ALPHA_I 304(SP) +#define FZERO 312(SP) +#else +#define STACKSIZE 256 +#define ALPHA_R 224(SP) +#define ALPHA_I 232(SP) +#define FZERO 240(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#define OFFSET r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#define AORIG r21 +#define TEMP r22 +#define KK r23 +#define I r24 +#define J r25 +#define AO r26 +#define BO r27 +#define CO1 r28 +#define CO2 r29 + +#define PREA r30 +#define PREC r31 +#define PREB PREA + +#ifndef NEEDPARAM + +#ifndef DOUBLE +#include "cparam.h" +#else +#include "zparam.h" +#endif + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) +#endif + + stw r0, FZERO + +#ifdef linux +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, 56 + STACKSIZE(SP) + lwz C, 60 + STACKSIZE(SP) + lwz LDC, 64 + STACKSIZE(SP) +#else + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 120 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 120 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 68 + 
STACKSIZE(SP) +#else + lwz OFFSET, 60 + STACKSIZE(SP) +#endif +#endif +#endif + + slwi LDC, LDC, ZBASE_SHIFT + +#ifdef LN + mullw r0, M, K + slwi r0, r0, ZBASE_SHIFT + add A, A, r0 + + slwi r0, M, ZBASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, ZBASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + +#ifndef PREFETCHTEST + li PREC, 3 * SIZE + li PREA, 16 * 12 * SIZE +#else + +#ifdef linux +#ifndef __64BIT__ + lwz PREA, 16 + STACKSIZE(SP) + lwz PREC, 20 + STACKSIZE(SP) +#else + ld PREA, 136 + STACKSIZE(SP) + ld PREC, 144 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld PREA, 136 + STACKSIZE(SP) + ld PREC, 144 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz PREA, 72 + STACKSIZE(SP) + lwz PREC, 76 + STACKSIZE(SP) +#else + lwz PREA, 68 + STACKSIZE(SP) + lwz PREC, 72 + STACKSIZE(SP) +#endif +#endif +#endif + +#endif + + srawi. J, N, 1 + ble LL(30) + .align 4 + +LL(10): +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + srawi. I, M, 1 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO2, LDC +#endif + ble LL(20) + .align 4 + +LL(11): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + + LFD f28, 4 * SIZE(B) + LFD f29, 5 * SIZE(B) + LFD f30, 6 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + dcbt CO1, PREC + dcbt CO2, PREC + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + dcbt CO1, PREC + dcbt CO2, PREC + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(15) + .align 4 + +#define NOP1 mr r18, r18 +#define NOP2 mr r19, r19 + +LL(12): + FMADD f0, f16, f20, f0 + dcbt AO, PREA + FMADD f4, f16, f21, f4 + dcbt BO, PREB + FMADD f8, f16, f22, f8 + LFD f31, 7 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFD f27, 7 * SIZE(AO) + + FMADD f1, f17, f20, f1 + LFD f16, 8 * SIZE(AO) + FMADD f5, f17, f21, f5 + NOP2 + FMADD f9, f17, f22, f9 + NOP1 + FMADD f13, f17, f23, f13 + LFD f17, 9 * SIZE(AO) + + FMADD f2, f18, f20, f2 + NOP1 + FMADD f6, f18, f21, f6 + NOP2 + FMADD f10, f18, f22, f10 + NOP1 + FMADD f14, f18, f23, f14 + LFD f18, 10 * SIZE(AO) + + FMADD f3, f19, f20, f3 + LFD f20, 8 * SIZE(BO) + FMADD f7, f19, f21, f7 + LFD f21, 9 * SIZE(BO) + FMADD f11, f19, f22, f11 + LFD f22, 10 * SIZE(BO) + FMADD f15, f19, f23, f15 + LFD f19, 11 * SIZE(AO) + + FMADD f0, f24, f28, f0 + LFD f23, 11 * SIZE(BO) + FMADD f4, f24, f29, f4 + NOP2 + FMADD f8, f24, f30, f8 + NOP1 + FMADD f12, f24, f31, f12 + LFD f24, 12 * SIZE(AO) + + FMADD f1, f25, f28, f1 + NOP1 + FMADD f5, f25, f29, f5 + NOP2 + FMADD f9, f25, f30, f9 + NOP1 + FMADD f13, f25, f31, f13 + LFD f25, 13 * SIZE(AO) + + FMADD f2, f26, f28, f2 + NOP1 + FMADD f6, f26, f29, f6 + NOP2 + FMADD f10, f26, f30, f10 + NOP1 + FMADD f14, f26, f31, f14 + LFD f26, 14 * SIZE(AO) + + FMADD f3, f27, f28, f3 + LFD f28, 12 * SIZE(BO) + FMADD f7, f27, f29, f7 + LFD f29, 13 * SIZE(BO) + FMADD f11, f27, f30, f11 + LFD f30, 14 * SIZE(BO) + FMADD f15, f27, f31, f15 + LFD f27, 15 * SIZE(AO) + + FMADD f0, f16, f20, f0 + LFD f31, 15 * SIZE(BO) + FMADD f4, f16, f21, f4 + NOP2 + FMADD f8, f16, f22, f8 + NOP1 + FMADD f12, f16, f23, f12 + LFD f16, 16 * SIZE(AO) + + FMADD f1, f17, f20, f1 + NOP1 + FMADD f5, f17, f21, f5 + NOP2 + FMADD f9, f17, f22, f9 + NOP1 + FMADD f13, f17, f23, f13 + LFD f17, 17 * SIZE(AO) + + FMADD f2, f18, f20, f2 + NOP1 + FMADD f6, f18, f21, f6 + NOP2 + FMADD f10, f18, f22, f10 + NOP1 + FMADD f14, f18, f23, f14 + LFD f18, 18 * SIZE(AO) + + FMADD f3, f19, f20, f3 + LFD f20, 16 * SIZE(BO) + FMADD f7, f19, f21, f7 + LFD f21, 17 * SIZE(BO) + FMADD f11, f19, f22, f11 + LFD f22, 18 * SIZE(BO) + FMADD f15, f19, f23, f15 + LFD f19, 19 * SIZE(AO) + + FMADD f0, f24, f28, f0 + LFD f23, 19 * SIZE(BO) + FMADD f4, f24, f29, f4 + NOP2 + FMADD f8, f24, f30, f8 + NOP1 + FMADD f12, f24, f31, f12 + LFD f24, 20 * SIZE(AO) + + FMADD f1, f25, f28, f1 + NOP1 + FMADD f5, f25, f29, f5 + NOP2 + FMADD f9, f25, f30, f9 + NOP1 + FMADD f13, f25, f31, f13 + LFD f25, 21 * SIZE(AO) + + FMADD f2, f26, f28, f2 + NOP1 + FMADD f6, f26, f29, f6 + NOP2 + FMADD f10, f26, f30, f10 + NOP1 + FMADD f14, f26, f31, f14 + LFD f26, 22 * SIZE(AO) + + FMADD f3, f27, f28, f3 + LFD f28, 20 * SIZE(BO) + FMADD f7, f27, f29, f7 + LFD f29, 21 * SIZE(BO) + FMADD f11, f27, f30, f11 + LFD f30, 22 * SIZE(BO) + FMADD f15, f27, f31, f15 + addi AO, AO, 16 * SIZE + + addi BO, BO, 16 * SIZE + bdnz LL(12) + .align 4 + +LL(15): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble LL(KERNEL_MainFinish) + .align 4 + +LL(16): + fmadd f0, f16, f20, f0 + fmadd f5, f17, f21, f5 + fmadd f10, f18, f22, f10 + fmadd f15, f19, f23, f15 + + fmadd f1, f17, f20, f1 + fmadd f2, f18, f20, f2 + fmadd f3, f19, f20, f3 + fmadd f4, f16, f21, f4 + + fmadd f6, f18, f21, f6 + fmadd f7, f19, f21, f7 + fmadd f8, f16, f22, f8 + fmadd f9, f17, f22, f9 + + fmadd f11, f19, f22, f11 + fmadd f12, f16, f23, f12 + fmadd f13, f17, f23, f13 + fmadd f14, f18, f23, f14 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(16) + .align 4 + +LL(KERNEL_MainFinish): +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 + + FSUB f8, f8, f13 + FADD f9, f9, f12 + FSUB f10, f10, f15 + FADD f11, f11, f14 + +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 + + FADD f8, f8, f13 + FSUB f9, f12, f9 + FADD f10, f10, f15 + FSUB f11, f14, f11 +#endif + +#if defined(LN) || defined(RT) + subi r0, KK, 2 + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f8, f18, f8 + FSUB f9, f19, f9 + + FSUB f2, f20, f2 + FSUB f3, f21, f3 + FSUB f10, f22, f10 + FSUB f11, f23, f11 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f8, f20, f8 + FSUB f9, f21, f9 + FSUB f10, f22, f10 + FSUB f11, f23, f11 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 + FSUB f2, f18, f2 + FADD f3, f19, f3 + + FSUB f8, f20, f8 + FADD f9, f21, f9 + FSUB f10, f22, f10 + FADD f11, f23, f11 +#endif +#endif + +#ifdef LN + LFD f16, 6 * SIZE(AO) + LFD f17, 7 * SIZE(AO) + LFD f18, 4 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f6, f17, f3 + FMUL f7, f17, f2 + FMUL f14, f17, f11 + FMUL f15, f17, f10 + +#ifndef CONJ + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + FMSUB f10, f16, f10, f14 + FMADD f11, f16, f11, f15 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + FMADD f8, f19, f11, f8 + FNMSUB f9, f19, f10, f9 + + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + FNMSUB f8, f18, f10, f8 + FNMSUB f9, f18, f11, f9 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f9 + FMUL f13, f21, f8 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f8, f20, f8, f12 + FMADD f9, f20, f9, f13 + +#else + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + FMADD f10, f16, f10, f14 + FMSUB f11, f16, f11, f15 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + FMSUB f8, f19, f11, f8 + FNMADD f9, f19, f10, f9 + + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + FNMADD f8, f18, f10, f8 + FNMADD f9, f18, f11, f9 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f9 + FMUL f13, f21, f8 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f8, f20, f8, f12 + FMSUB f9, f20, f9, f13 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + 
LFD f19, 3 * SIZE(AO) + LFD f20, 6 * SIZE(AO) + LFD f21, 7 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f12, f17, f9 + FMUL f13, f17, f8 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f8, f16, f8, f12 + FMADD f9, f16, f9, f13 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + FMADD f10, f19, f9, f10 + FNMSUB f11, f19, f8, f11 + + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + FNMSUB f10, f18, f8, f10 + FNMSUB f11, f18, f9, f11 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMUL f12, f21, f11 + FMUL f13, f21, f10 + + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 + FMSUB f10, f20, f10, f12 + FMADD f11, f20, f11, f13 + +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f8, f16, f8, f12 + FMSUB f9, f16, f9, f13 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + FMSUB f10, f19, f9, f10 + FNMADD f11, f19, f8, f11 + + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + FNMADD f10, f18, f8, f10 + FNMADD f11, f18, f9, f11 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMUL f12, f21, f11 + FMUL f13, f21, f10 + + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 + FMADD f10, f20, f10, f12 + FMSUB f11, f20, f11, f13 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + LFD f20, 6 * SIZE(BO) + LFD f21, 7 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + + FMADD f8, f19, f1, f8 + FNMSUB f9, f19, f0, f9 + FMADD f10, f19, f3, f10 + FNMSUB f11, f19, f2, f11 + + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f10, f18, f2, f10 + FNMSUB f11, f18, f3, f11 + + FMUL f4, f21, f9 + FMUL f5, f21, f8 + FMUL f6, f21, f11 + FMUL f7, f21, f10 + + FMSUB f8, f20, f8, f4 + FMADD f9, f20, f9, f5 + FMSUB f10, f20, f10, f6 + FMADD f11, f20, f11, f7 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + + FMSUB f8, f19, f1, f8 + FNMADD f9, f19, f0, f9 + FMSUB f10, f19, f3, f10 + FNMADD f11, f19, f2, f11 + + FNMADD f8, f18, f0, f8 + FNMADD f9, f18, f1, f9 + FNMADD f10, f18, f2, f10 + FNMADD f11, f18, f3, f11 + + FMUL f4, f21, f9 + FMUL f5, f21, f8 + FMUL f6, f21, f11 + FMUL f7, f21, f10 + + FMADD f8, f20, f8, f4 + FMSUB f9, f20, f9, f5 + FMADD f10, f20, f10, f6 + FMSUB f11, f20, f11, f7 +#endif +#endif + +#ifdef RT + LFD f16, 6 * SIZE(BO) + LFD f17, 7 * SIZE(BO) + LFD f18, 4 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f12, f17, f9 + FMUL f13, f17, f8 + FMUL f14, f17, f11 + FMUL f15, f17, f10 + +#ifndef CONJ + FMSUB f8, f16, f8, f12 + FMADD f9, f16, f9, f13 + FMSUB f10, f16, f10, f14 + FMADD f11, f16, f11, f15 + + FMADD f0, f19, f9, f0 + FNMSUB f1, f19, f8, f1 + FMADD f2, f19, f11, f2 + FNMSUB f3, f19, f10, f3 + + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + FNMSUB f2, f18, f10, f2 + FNMSUB f3, f18, f11, f3 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f6 + FMADD f3, f20, f3, f7 + +#else + FMADD f8, f16, f8, f12 + FMSUB f9, f16, f9, f13 + FMADD f10, f16, f10, f14 + FMSUB f11, f16, f11, f15 + + FMSUB f0, f19, f9, f0 + FNMADD f1, f19, f8, f1 + FMSUB f2, f19, f11, f2 + FNMADD f3, f19, f10, f3 + + FNMADD f0, f18, f8, f0 + FNMADD f1, f18, f9, f1 + FNMADD f2, f18, f10, f2 + FNMADD f3, f18, f11, f3 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + 
FMUL f7, f21, f2 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f6 + FMSUB f3, f20, f3, f7 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f9, 3 * SIZE(BO) + + STFD f2, 4 * SIZE(BO) + STFD f3, 5 * SIZE(BO) + STFD f10, 6 * SIZE(BO) + STFD f11, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f8, 4 * SIZE(AO) + STFD f9, 5 * SIZE(AO) + STFD f10, 6 * SIZE(AO) + STFD f11, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f8, 0 * SIZE(CO2) + STFD f9, 1 * SIZE(CO2) + STFD f10, 2 * SIZE(CO2) + STFD f11, 3 * SIZE(CO2) + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + addic. I, I, -1 + bgt LL(11) + .align 4 + +LL(20): + andi. I, M, 1 + ble LL(29) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(25) + .align 4 + +LL(22): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi BO, BO, 16 * SIZE + addi AO, AO, 8 * SIZE + bdnz LL(22) + .align 4 + +LL(25): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble LL(27) + .align 4 + +LL(26): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + addi AO, AO, 2 * SIZE + addi BO, BO, 4 * SIZE + bdnz LL(26) + .align 4 + +LL(27): +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 +#else +#if defined(LN) || defined(LT) + FADD f0, f0, f5 + FSUB f1, f1, f4 + FADD f2, f2, f7 + FSUB f3, f3, f6 +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 +#endif +#endif + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + ZBASE_SHIFT + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f20, f2 + FSUB f3, f21, f3 +#endif + +#ifdef LN + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f3 + FMUL f13, f21, f2 + +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f12 + FMADD f3, f20, f3, f13 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f12 + FMSUB f3, f20, f3, f13 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f12, f17, f3 + FMUL f13, f17, 
f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f12 + FMADD f3, f16, f3, f13 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f12 + FMSUB f3, f16, f3, f13 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + LFD f20, 6 * SIZE(BO) + LFD f21, 7 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 +#endif +#endif + +#ifdef RT + LFD f16, 6 * SIZE(BO) + LFD f17, 7 * SIZE(BO) + LFD f18, 4 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f12, f17, f3 + FMUL f13, f17, f2 + +#ifndef CONJ + FMSUB f2, f16, f2, f12 + FMADD f3, f16, f3, f13 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f2, f16, f2, f12 + FMSUB f3, f16, f3, f13 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +LL(29): +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + + addic. J, J, -1 + bgt LL(10) + .align 4 + +LL(30): + andi. J, N, 1 + ble LL(999) + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + srawi. 
I, M, 1 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, C, LDC +#endif + ble LL(40) + .align 4 + +LL(31): +#if defined(LT) || defined(RN) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(B) + LFD f17, 1 * SIZE(B) + LFD f18, 2 * SIZE(B) + LFD f19, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + dcbt CO1, PREC + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(35) + .align 4 + +LL(32): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 8 * SIZE(AO) + LFD f21, 9 * SIZE(AO) + LFD f22, 10 * SIZE(AO) + LFD f23, 11 * SIZE(AO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f24, 12 * SIZE(AO) + LFD f25, 13 * SIZE(AO) + LFD f26, 14 * SIZE(AO) + LFD f27, 15 * SIZE(AO) + + LFD f16, 4 * SIZE(BO) + LFD f17, 5 * SIZE(BO) + LFD f18, 6 * SIZE(BO) + LFD f19, 7 * SIZE(BO) + + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 16 * SIZE(AO) + LFD f21, 17 * SIZE(AO) + LFD f22, 18 * SIZE(AO) + LFD f23, 19 * SIZE(AO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f24, 20 * SIZE(AO) + LFD f25, 21 * SIZE(AO) + LFD f26, 22 * SIZE(AO) + LFD f27, 23 * SIZE(AO) + + LFD f16, 8 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 10 * SIZE(BO) + LFD f19, 11 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 8 * SIZE + dcbt PREA, AO + dcbt PREA, BO + bdnz LL(32) + .align 4 + +LL(35): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble LL(37) + .align 4 + +LL(36): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + LFD f16, 2 * SIZE(BO) + LFD f17, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(36) + .align 4 + +LL(37): +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 +#endif + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + ZBASE_SHIFT + slwi r0, r0, 0 + ZBASE_SHIFT + + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 + FSUB f2, f18, f2 + FADD f3, f19, f3 +#endif +#endif + +#ifdef LN + LFD f16, 6 * SIZE(AO) + LFD f17, 7 * SIZE(AO) + LFD f18, 4 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + LFD f20, 6 * SIZE(AO) + LFD f21, 7 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 +#endif +#endif + +#ifdef RT + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f6 + FMADD f3, f20, f3, f7 + +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f6 + FMSUB f3, f20, f3, f7 +#endif +#endif + 
+#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + addic. I, I, -1 + bgt LL(31) + .align 4 + +LL(40): + andi. I, M, 1 + ble LL(49) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, r0 + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(45) + .align 4 + +LL(42): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f17, f20, f2 + fmadd f3, f16, f21, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + + fmadd f4, f18, f22, f4 + fmadd f5, f19, f23, f5 + fmadd f6, f19, f22, f6 + fmadd f7, f18, f23, f7 + + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f17, f20, f2 + fmadd f3, f16, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + + fmadd f4, f18, f22, f4 + fmadd f5, f19, f23, f5 + fmadd f6, f19, f22, f6 + fmadd f7, f18, f23, f7 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(42) + .align 4 + +LL(45): + fadd f0, f0, f4 + fadd f1, f1, f5 + fadd f2, f2, f6 + fadd f3, f3, f7 + +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR,r0 + ble LL(47) + .align 4 + +LL(46): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f17, f20, f2 + fmadd f3, f16, f21, f3 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 2 * SIZE + + bdnz LL(46) + .align 4 + +LL(47): +#ifndef CONJ + FSUB f0, f0, f1 + FADD f1, f2, f3 +#else + FADD f0, f0, f1 + FSUB f1, f3, f2 +#endif + +#if defined(LN) || defined(RT) + subi r0, KK, 1 + slwi r0, r0, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 +#endif +#endif + +#ifdef LN + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 +#endif +#endif + +#ifdef RT + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +LL(49): +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 1 +#endif + +#ifdef RT + subi KK, KK, 1 +#endif + .align 4 + +LL(999): + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/ztrsm_kernel_cell_RT.S 
b/kernel/power/ztrsm_kernel_cell_RT.S new file mode 100644 index 0000000000..f1139fd343 --- /dev/null +++ b/kernel/power/ztrsm_kernel_cell_RT.S @@ -0,0 +1,2249 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA_R 296(SP) +#define ALPHA_I 304(SP) +#define FZERO 312(SP) +#else +#define STACKSIZE 256 +#define ALPHA_R 224(SP) +#define ALPHA_I 232(SP) +#define FZERO 240(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#define OFFSET r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#define AORIG r21 +#define TEMP r22 +#define KK r23 +#define I r24 +#define J r25 +#define AO r26 +#define BO r27 +#define CO1 r28 +#define CO2 r29 + +#define PREA r30 +#define PREC r31 +#define PREB PREA + +#ifndef NEEDPARAM + +#ifndef DOUBLE +#include "cparam.h" +#else +#include "zparam.h" +#endif + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) +#endif + + stw r0, FZERO + +#ifdef linux +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, 56 + STACKSIZE(SP) + lwz C, 60 + STACKSIZE(SP) + lwz LDC, 64 + STACKSIZE(SP) +#else + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 120 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 120 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 68 + STACKSIZE(SP) +#else + lwz OFFSET, 60 + STACKSIZE(SP) +#endif +#endif +#endif + + slwi LDC, LDC, ZBASE_SHIFT + +#ifdef LN + mullw r0, M, K + slwi r0, r0, ZBASE_SHIFT + add A, A, r0 + + slwi r0, M, ZBASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, ZBASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + + li PREC, 3 * SIZE + li PREA, 16 * 12 * SIZE + + andi. J, N, 1 + ble LL(30) + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + srawi. 
I, M, 1 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, C, LDC +#endif + ble LL(40) + .align 4 + +LL(31): +#if defined(LT) || defined(RN) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(B) + LFD f17, 1 * SIZE(B) + LFD f18, 2 * SIZE(B) + LFD f19, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + dcbt CO1, PREC + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(35) + .align 4 + +LL(32): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 8 * SIZE(AO) + LFD f21, 9 * SIZE(AO) + LFD f22, 10 * SIZE(AO) + LFD f23, 11 * SIZE(AO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f24, 12 * SIZE(AO) + LFD f25, 13 * SIZE(AO) + LFD f26, 14 * SIZE(AO) + LFD f27, 15 * SIZE(AO) + + LFD f16, 4 * SIZE(BO) + LFD f17, 5 * SIZE(BO) + LFD f18, 6 * SIZE(BO) + LFD f19, 7 * SIZE(BO) + + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 16 * SIZE(AO) + LFD f21, 17 * SIZE(AO) + LFD f22, 18 * SIZE(AO) + LFD f23, 19 * SIZE(AO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f24, 20 * SIZE(AO) + LFD f25, 21 * SIZE(AO) + LFD f26, 22 * SIZE(AO) + LFD f27, 23 * SIZE(AO) + + LFD f16, 8 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 10 * SIZE(BO) + LFD f19, 11 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 8 * SIZE + dcbt PREA, AO + dcbt PREA, BO + bdnz LL(32) + .align 4 + +LL(35): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble LL(37) + .align 4 + +LL(36): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + LFD f16, 2 * SIZE(BO) + LFD f17, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(36) + .align 4 + +LL(37): +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 +#endif + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + ZBASE_SHIFT + slwi r0, r0, 0 + ZBASE_SHIFT + + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 + FSUB f2, f18, f2 + FADD f3, f19, f3 +#endif +#endif + +#ifdef LN + LFD f16, 6 * SIZE(AO) + LFD f17, 7 * SIZE(AO) + LFD f18, 4 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + LFD f20, 6 * SIZE(AO) + LFD f21, 7 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 +#endif +#endif + +#ifdef RT + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f6 + FMADD f3, f20, f3, f7 + +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f6 + FMSUB f3, f20, f3, f7 +#endif +#endif + 
+#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + addic. I, I, -1 + bgt LL(31) + .align 4 + +LL(40): + andi. I, M, 1 + ble LL(49) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, r0 + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(45) + .align 4 + +LL(42): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f17, f20, f2 + fmadd f3, f16, f21, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + + fmadd f4, f18, f22, f4 + fmadd f5, f19, f23, f5 + fmadd f6, f19, f22, f6 + fmadd f7, f18, f23, f7 + + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f17, f20, f2 + fmadd f3, f16, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + + fmadd f4, f18, f22, f4 + fmadd f5, f19, f23, f5 + fmadd f6, f19, f22, f6 + fmadd f7, f18, f23, f7 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(42) + .align 4 + +LL(45): + fadd f0, f0, f4 + fadd f1, f1, f5 + fadd f2, f2, f6 + fadd f3, f3, f7 + +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR,r0 + ble LL(47) + .align 4 + +LL(46): + fmadd f0, f16, f20, f0 + fmadd f1, f17, f21, f1 + fmadd f2, f17, f20, f2 + fmadd f3, f16, f21, f3 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 2 * SIZE + + bdnz LL(46) + .align 4 + +LL(47): +#ifndef CONJ + FSUB f0, f0, f1 + FADD f1, f2, f3 +#else + FADD f0, f0, f1 + FSUB f1, f3, f2 +#endif + +#if defined(LN) || defined(RT) + subi r0, KK, 1 + slwi r0, r0, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 +#endif +#endif + +#ifdef LN + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 +#endif +#endif + +#ifdef RT + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +LL(49): +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 1 +#endif + +#ifdef RT + subi KK, KK, 1 +#endif + .align 4 + + +LL(30): + srawi. J, N, 1 + ble LL(999) + .align 4 + +LL(10): +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + srawi. I, M, 1 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO2, LDC +#endif + ble LL(20) + .align 4 + +LL(11): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + dcbt CO1, PREC + dcbt CO2, PREC + + srawi. 
r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + + LFD f28, 4 * SIZE(BO) + LFD f29, 5 * SIZE(BO) + LFD f30, 6 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + dcbt CO1, PREC + dcbt CO2, PREC + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(15) + .align 4 + +#define NOP1 mr r18, r18 +#define NOP2 mr r19, r19 + +LL(12): + FMADD f0, f16, f20, f0 + dcbt AO, PREA + FMADD f4, f16, f21, f4 + dcbt BO, PREB + FMADD f8, f16, f22, f8 + LFD f31, 7 * SIZE(BO) + FMADD f12, f16, f23, f12 + LFD f27, 7 * SIZE(AO) + + FMADD f1, f17, f20, f1 + LFD f16, 8 * SIZE(AO) + FMADD f5, f17, f21, f5 + NOP2 + FMADD f9, f17, f22, f9 + NOP1 + FMADD f13, f17, f23, f13 + LFD f17, 9 * SIZE(AO) + + FMADD f2, f18, f20, f2 + NOP1 + FMADD f6, f18, f21, f6 + NOP2 + FMADD f10, f18, f22, f10 + NOP1 + FMADD f14, f18, f23, f14 + LFD f18, 10 * SIZE(AO) + + FMADD f3, f19, f20, f3 + LFD f20, 8 * SIZE(BO) + FMADD f7, f19, f21, f7 + LFD f21, 9 * SIZE(BO) + FMADD f11, f19, f22, f11 + LFD f22, 10 * SIZE(BO) + FMADD f15, f19, f23, f15 + LFD f19, 11 * SIZE(AO) + + FMADD f0, f24, f28, f0 + LFD f23, 11 * SIZE(BO) + FMADD f4, f24, f29, f4 + NOP2 + FMADD f8, f24, f30, f8 + NOP1 + FMADD f12, f24, f31, f12 + LFD f24, 12 * SIZE(AO) + + FMADD f1, f25, f28, f1 + NOP1 + FMADD f5, f25, f29, f5 + NOP2 + FMADD f9, f25, f30, f9 + NOP1 + FMADD f13, f25, f31, f13 + LFD f25, 13 * SIZE(AO) + + FMADD f2, f26, f28, f2 + NOP1 + FMADD f6, f26, f29, f6 + NOP2 + FMADD f10, f26, f30, f10 + NOP1 + FMADD f14, f26, f31, f14 + LFD f26, 14 * SIZE(AO) + + FMADD f3, f27, f28, f3 + LFD f28, 12 * SIZE(BO) + FMADD f7, f27, f29, f7 + LFD f29, 13 * SIZE(BO) + FMADD f11, f27, f30, f11 + LFD f30, 14 * SIZE(BO) + FMADD f15, f27, f31, f15 + LFD f27, 15 * SIZE(AO) + + FMADD f0, f16, f20, f0 + LFD f31, 15 * SIZE(BO) + FMADD f4, f16, f21, f4 + NOP2 + FMADD f8, f16, f22, f8 + NOP1 + FMADD f12, f16, f23, f12 + LFD f16, 16 * SIZE(AO) + + FMADD f1, f17, f20, f1 + NOP1 + FMADD f5, f17, f21, f5 + NOP2 + FMADD f9, f17, f22, f9 + NOP1 + FMADD f13, f17, f23, f13 + LFD f17, 17 * SIZE(AO) + + FMADD f2, f18, f20, f2 + NOP1 + FMADD f6, f18, f21, f6 + NOP2 + FMADD f10, f18, f22, f10 + NOP1 + FMADD f14, f18, f23, f14 + LFD f18, 18 * SIZE(AO) + + FMADD f3, f19, f20, f3 + LFD f20, 16 * SIZE(BO) + FMADD f7, f19, f21, f7 + LFD f21, 17 * SIZE(BO) + FMADD f11, f19, f22, f11 + LFD f22, 18 * SIZE(BO) + FMADD f15, f19, f23, f15 + LFD f19, 19 * SIZE(AO) + + FMADD f0, f24, f28, f0 + LFD f23, 19 * SIZE(BO) + FMADD f4, f24, f29, f4 + NOP2 + FMADD f8, f24, f30, f8 + NOP1 + FMADD f12, f24, f31, f12 + LFD f24, 20 * SIZE(AO) + + FMADD f1, f25, f28, f1 + NOP1 + FMADD f5, f25, f29, f5 + NOP2 + FMADD f9, f25, f30, f9 + NOP1 + FMADD f13, f25, f31, f13 + LFD f25, 21 * SIZE(AO) + + FMADD f2, f26, f28, f2 + NOP1 + FMADD f6, f26, f29, f6 + NOP2 + FMADD f10, f26, f30, f10 + NOP1 + FMADD f14, f26, f31, f14 + LFD f26, 22 * SIZE(AO) + + FMADD f3, f27, f28, f3 + LFD f28, 20 * SIZE(BO) + FMADD f7, f27, f29, 
f7 + LFD f29, 21 * SIZE(BO) + FMADD f11, f27, f30, f11 + LFD f30, 22 * SIZE(BO) + FMADD f15, f27, f31, f15 + addi AO, AO, 16 * SIZE + + addi BO, BO, 16 * SIZE + bdnz LL(12) + .align 4 + +LL(15): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble LL(KERNEL_MainFinish) + .align 4 + +LL(16): + fmadd f0, f16, f20, f0 + fmadd f5, f17, f21, f5 + fmadd f10, f18, f22, f10 + fmadd f15, f19, f23, f15 + + fmadd f1, f17, f20, f1 + fmadd f2, f18, f20, f2 + fmadd f3, f19, f20, f3 + fmadd f4, f16, f21, f4 + + fmadd f6, f18, f21, f6 + fmadd f7, f19, f21, f7 + fmadd f8, f16, f22, f8 + fmadd f9, f17, f22, f9 + + fmadd f11, f19, f22, f11 + fmadd f12, f16, f23, f12 + fmadd f13, f17, f23, f13 + fmadd f14, f18, f23, f14 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(16) + .align 4 + +LL(KERNEL_MainFinish): +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 + + FSUB f8, f8, f13 + FADD f9, f9, f12 + FSUB f10, f10, f15 + FADD f11, f11, f14 + +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 + + FADD f8, f8, f13 + FSUB f9, f12, f9 + FADD f10, f10, f15 + FSUB f11, f14, f11 +#endif + +#if defined(LN) || defined(RT) + subi r0, KK, 2 + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f8, f18, f8 + FSUB f9, f19, f9 + + FSUB f2, f20, f2 + FSUB f3, f21, f3 + FSUB f10, f22, f10 + FSUB f11, f23, f11 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f8, f20, f8 + FSUB f9, f21, f9 + FSUB f10, f22, f10 + FSUB f11, f23, f11 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 + FSUB f2, f18, f2 + FADD f3, f19, f3 + + FSUB f8, f20, f8 + FADD f9, f21, f9 + FSUB f10, f22, f10 + FADD f11, f23, f11 +#endif +#endif + +#ifdef LN + LFD f16, 6 * SIZE(AO) + LFD f17, 7 * SIZE(AO) + LFD f18, 4 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f6, f17, f3 + FMUL f7, f17, f2 + FMUL f14, f17, f11 + FMUL f15, f17, f10 + +#ifndef CONJ + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + FMSUB f10, f16, f10, f14 + FMADD f11, f16, f11, f15 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + FMADD f8, f19, f11, f8 + FNMSUB f9, f19, f10, f9 + + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + FNMSUB f8, f18, f10, f8 + FNMSUB f9, f18, f11, f9 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f9 + FMUL f13, f21, f8 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f8, f20, f8, f12 + FMADD f9, f20, f9, f13 + +#else + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + FMADD f10, f16, f10, f14 + FMSUB f11, f16, f11, f15 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + FMSUB f8, f19, f11, f8 + FNMADD f9, f19, f10, f9 + + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + FNMADD f8, f18, f10, f8 + FNMADD f9, f18, f11, f9 + + FMUL f4, f21, f1 + 
FMUL f5, f21, f0 + FMUL f12, f21, f9 + FMUL f13, f21, f8 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f8, f20, f8, f12 + FMSUB f9, f20, f9, f13 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + LFD f20, 6 * SIZE(AO) + LFD f21, 7 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f12, f17, f9 + FMUL f13, f17, f8 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f8, f16, f8, f12 + FMADD f9, f16, f9, f13 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + FMADD f10, f19, f9, f10 + FNMSUB f11, f19, f8, f11 + + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + FNMSUB f10, f18, f8, f10 + FNMSUB f11, f18, f9, f11 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMUL f12, f21, f11 + FMUL f13, f21, f10 + + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 + FMSUB f10, f20, f10, f12 + FMADD f11, f20, f11, f13 + +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f8, f16, f8, f12 + FMSUB f9, f16, f9, f13 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + FMSUB f10, f19, f9, f10 + FNMADD f11, f19, f8, f11 + + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + FNMADD f10, f18, f8, f10 + FNMADD f11, f18, f9, f11 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMUL f12, f21, f11 + FMUL f13, f21, f10 + + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 + FMADD f10, f20, f10, f12 + FMSUB f11, f20, f11, f13 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + LFD f20, 6 * SIZE(BO) + LFD f21, 7 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + + FMADD f8, f19, f1, f8 + FNMSUB f9, f19, f0, f9 + FMADD f10, f19, f3, f10 + FNMSUB f11, f19, f2, f11 + + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f10, f18, f2, f10 + FNMSUB f11, f18, f3, f11 + + FMUL f4, f21, f9 + FMUL f5, f21, f8 + FMUL f6, f21, f11 + FMUL f7, f21, f10 + + FMSUB f8, f20, f8, f4 + FMADD f9, f20, f9, f5 + FMSUB f10, f20, f10, f6 + FMADD f11, f20, f11, f7 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + + FMSUB f8, f19, f1, f8 + FNMADD f9, f19, f0, f9 + FMSUB f10, f19, f3, f10 + FNMADD f11, f19, f2, f11 + + FNMADD f8, f18, f0, f8 + FNMADD f9, f18, f1, f9 + FNMADD f10, f18, f2, f10 + FNMADD f11, f18, f3, f11 + + FMUL f4, f21, f9 + FMUL f5, f21, f8 + FMUL f6, f21, f11 + FMUL f7, f21, f10 + + FMADD f8, f20, f8, f4 + FMSUB f9, f20, f9, f5 + FMADD f10, f20, f10, f6 + FMSUB f11, f20, f11, f7 +#endif +#endif + +#ifdef RT + LFD f16, 6 * SIZE(BO) + LFD f17, 7 * SIZE(BO) + LFD f18, 4 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f12, f17, f9 + FMUL f13, f17, f8 + FMUL f14, f17, f11 + FMUL f15, f17, f10 + +#ifndef CONJ + FMSUB f8, f16, f8, f12 + FMADD f9, f16, f9, f13 + FMSUB f10, f16, f10, f14 + FMADD f11, f16, f11, f15 + + FMADD f0, f19, f9, f0 + FNMSUB f1, f19, f8, f1 + FMADD f2, f19, f11, f2 + FNMSUB f3, f19, f10, f3 + + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + FNMSUB f2, f18, f10, f2 + FNMSUB f3, f18, f11, f3 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f6 + FMADD f3, f20, f3, f7 + +#else + FMADD f8, f16, f8, f12 + FMSUB f9, f16, f9, f13 + FMADD f10, f16, f10, f14 + FMSUB f11, f16, f11, f15 + + FMSUB 
f0, f19, f9, f0 + FNMADD f1, f19, f8, f1 + FMSUB f2, f19, f11, f2 + FNMADD f3, f19, f10, f3 + + FNMADD f0, f18, f8, f0 + FNMADD f1, f18, f9, f1 + FNMADD f2, f18, f10, f2 + FNMADD f3, f18, f11, f3 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f6 + FMSUB f3, f20, f3, f7 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f9, 3 * SIZE(BO) + + STFD f2, 4 * SIZE(BO) + STFD f3, 5 * SIZE(BO) + STFD f10, 6 * SIZE(BO) + STFD f11, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f8, 4 * SIZE(AO) + STFD f9, 5 * SIZE(AO) + STFD f10, 6 * SIZE(AO) + STFD f11, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f8, 0 * SIZE(CO2) + STFD f9, 1 * SIZE(CO2) + STFD f10, 2 * SIZE(CO2) + STFD f11, 3 * SIZE(CO2) + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + addic. I, I, -1 + bgt LL(11) + .align 4 + +LL(20): + andi. I, M, 1 + ble LL(29) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(25) + .align 4 + +LL(22): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + fmadd f0, f18, f24, f0 + fmadd f1, f18, f25, f1 + fmadd f2, f18, f26, f2 + fmadd f3, f18, f27, f3 + + fmadd f4, f19, f24, f4 + fmadd f5, f19, f25, f5 + fmadd f6, f19, f26, f6 + fmadd f7, f19, f27, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi BO, BO, 16 * SIZE + addi AO, AO, 8 * SIZE + bdnz LL(22) + .align 4 + +LL(25): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble LL(27) + .align 4 + +LL(26): + fmadd f0, f16, f20, f0 + fmadd f1, f16, f21, f1 + fmadd f2, f16, f22, f2 + fmadd f3, f16, f23, f3 + + fmadd f4, f17, f20, f4 + fmadd f5, f17, f21, f5 + fmadd f6, f17, f22, f6 + fmadd f7, f17, f23, f7 + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + addi AO, AO, 2 * SIZE + addi BO, BO, 4 * SIZE + bdnz LL(26) + .align 4 + +LL(27): +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 +#else +#if defined(LN) || defined(LT) + FADD f0, f0, f5 + FSUB f1, f1, f4 + FADD f2, f2, f7 + FSUB f3, f3, f6 +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 +#endif +#endif + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + ZBASE_SHIFT + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f20, f2 + FSUB f3, f21, f3 +#endif + +#ifdef LN + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f3 + FMUL f13, f21, f2 + +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f12 + FMADD f3, f20, f3, f13 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f12 + FMSUB f3, f20, f3, f13 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f12, f17, f3 + FMUL f13, f17, 
f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f12 + FMADD f3, f16, f3, f13 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f12 + FMSUB f3, f16, f3, f13 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + LFD f20, 6 * SIZE(BO) + LFD f21, 7 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 +#endif +#endif + +#ifdef RT + LFD f16, 6 * SIZE(BO) + LFD f17, 7 * SIZE(BO) + LFD f18, 4 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f12, f17, f3 + FMUL f13, f17, f2 + +#ifndef CONJ + FMSUB f2, f16, f2, f12 + FMADD f3, f16, f3, f13 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f2, f16, f2, f12 + FMSUB f3, f16, f3, f13 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +LL(29): +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + + addic. 
J, J, -1 + bgt LL(10) + .align 4 + +LL(999): + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/ztrsm_kernel_hummer_LN.S b/kernel/power/ztrsm_kernel_hummer_LN.S new file mode 100644 index 0000000000..9e9697dacd --- /dev/null +++ b/kernel/power/ztrsm_kernel_hummer_LN.S @@ -0,0 +1,2963 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#undef ZERO + +#define ALPHA 0 +#define FZERO 16 + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#endif + +#define TEMP r11 +#define AORIG r12 +#define KK r14 +#define INCM1 r15 +#define INCM3 r16 +#define INCM5 r17 +#define INCM7 r18 +#define INC2 r19 +#define INC r20 +#define INC4 r21 + +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define AO2 r26 +#define BO2 r27 + +#define CO1 r28 +#define CO2 r29 +#define ZERO r31 + +#ifndef NEEDPARAM + +#define A1 f16 +#define A2 f17 +#define A3 f18 +#define A4 f19 +#define A5 f20 +#define A6 f21 +#define A7 f22 +#define A8 f23 +#define A9 f24 +#define A10 f25 + +#define B1 f26 +#define B2 f27 +#define B3 f28 +#define B4 f29 +#define B5 f30 +#define B6 f31 + +#define AP B6 + +#ifndef CONJ +#define FXCPMADD fxcpmadd +#define FXCSMADD fxcxnpma +#else +#if defined(LN) || defined(LT) +#define FXCPMADD fxcpnsma +#define FXCSMADD fxcxma +#else +#define FXCPMADD fxcpmadd +#define FXCSMADD fxcxnsma +#endif +#endif + +#ifndef CONJ +#define FXCXNPMA fxcxnpma +#define FXCXNSMA fxcxnsma +#else +#define FXCXNPMA fxcxnsma +#define FXCXNSMA fxcxnpma +#endif + + + PROLOGUE + PROFCODE + + li r0, -16 + + stfpdux f14, SP, r0 + stfpdux f15, SP, r0 + stfpdux f16, SP, r0 + stfpdux f17, SP, r0 + stfpdux f18, SP, r0 + stfpdux f19, SP, r0 + stfpdux f20, SP, r0 + stfpdux f21, SP, r0 + stfpdux f22, SP, r0 + stfpdux f23, SP, r0 + stfpdux f24, SP, r0 + stfpdux f25, SP, r0 + stfpdux f26, SP, r0 + stfpdux f27, SP, r0 + stfpdux f28, SP, r0 + stfpdux f29, SP, r0 + stfpdux f30, SP, r0 + stfpdux f31, SP, r0 + + stwu r31, -4(SP) + stwu r30, -4(SP) + stwu r29, -4(SP) + stwu r28, -4(SP) + + stwu r27, -4(SP) + stwu r26, -4(SP) + stwu r25, -4(SP) + stwu r24, -4(SP) + + stwu r23, -4(SP) + stwu r22, -4(SP) + stwu r21, -4(SP) + stwu r20, -4(SP) + + stwu r19, -4(SP) + stwu r18, -4(SP) + stwu r17, -4(SP) + stwu r16, -4(SP) + + stwu r15, -4(SP) + stwu r14, -4(SP) + + li r0, 0 + stwu r0, -4(SP) + stwu r0, -4(SP) + + stfdu f2, -8(SP) + stfdu f1, -8(SP) + + slwi LDC, LDC, ZBASE_SHIFT + + cmpwi cr0, M, 0 + ble .L999 + cmpwi cr0, N, 0 + ble .L999 + cmpwi cr0, K, 0 + ble .L999 + + li INC, 1 * SIZE + li INC2, 2 * SIZE + li INC4, 4 * SIZE + li INCM1, -1 * SIZE + li INCM3, -3 * SIZE + li INCM5, -5 * SIZE + li INCM7, -7 * SIZE + + addi C, C, - 1 * SIZE + +#ifdef LN + mullw r0, M, K + slwi r0, r0, ZBASE_SHIFT + add A, A, r0 + + slwi r0, M, ZBASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, ZBASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + srawi. J, N, 1 + ble .L50 + .align 4 + +.L10: +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + addi AORIG, A, -4 * SIZE +#else + addi AO, A, -4 * SIZE +#endif +#ifndef RT + add C, CO2, LDC +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + + + andi. I, M, 1 + beq .L20 + +#if defined(LT) || defined(RN) + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, B, - 4 * SIZE + fpmr f2, f0 + addi BO2, B, - 2 * SIZE + fpmr f3, f0 + + srawi. 
r0, KK, 2 + mtspr CTR, r0 + ble .L34 +#else +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, BO, - 4 * SIZE + fpmr f2, f0 + addi BO2, BO, 2 * SIZE + fpmr f3, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 + ble .L34 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B3, BO, INC4 + LFPDUX B4, BO2, INC4 + + LFPDUX A3, AO, INC4 + LFPDUX A5, BO, INC4 + LFPDUX A6, BO2, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A7, BO, INC4 + LFPDUX A8, BO2, INC4 + bdz- .L33 + .align 4 + +.L32: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + LFPDUX B1, BO, INC4 + FXCPMADD f2, B2, A1, f2 + FXCSMADD f3, B2, A1, f3 + LFPDUX B2, BO2, INC4 + LFPDUX A1, AO, INC4 + + FXCPMADD f0, B3, A2, f0 + FXCSMADD f1, B3, A2, f1 + LFPDUX B3, BO, INC4 + FXCPMADD f2, B4, A2, f2 + FXCSMADD f3, B4, A2, f3 + LFPDUX B4, BO2, INC4 + LFPDUX A2, AO2, INC4 + + FXCPMADD f0, A5, A3, f0 + FXCSMADD f1, A5, A3, f1 + LFPDUX A5, BO, INC4 + FXCPMADD f2, A6, A3, f2 + FXCSMADD f3, A6, A3, f3 + LFPDUX A6, BO2, INC4 + LFPDUX A3, AO, INC4 + + FXCPMADD f0, A7, A4, f0 + FXCSMADD f1, A7, A4, f1 + LFPDUX A7, BO, INC4 + FXCPMADD f2, A8, A4, f2 + FXCSMADD f3, A8, A4, f3 + LFPDUX A8, BO2, INC4 + LFPDUX A4, AO2, INC4 + bdnz+ .L32 + .align 4 + +.L33: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + FXCPMADD f2, B2, A1, f2 + FXCSMADD f3, B2, A1, f3 + + FXCPMADD f0, B3, A2, f0 + FXCSMADD f1, B3, A2, f1 + FXCPMADD f2, B4, A2, f2 + FXCSMADD f3, B4, A2, f3 + + FXCPMADD f0, A5, A3, f0 + FXCSMADD f1, A5, A3, f1 + FXCPMADD f2, A6, A3, f2 + FXCSMADD f3, A6, A3, f3 + + FXCPMADD f0, A7, A4, f0 + FXCSMADD f1, A7, A4, f1 + FXCPMADD f2, A8, A4, f2 + FXCSMADD f3, A8, A4, f3 + .align 4 + +.L34: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L38 +#else + andi. 
r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L38 +#endif + + LFPDX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC2 + bdz- .L37 + .align 4 + +.L36: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + LFPDUX B1, BO, INC4 + FXCPMADD f2, B2, A1, f2 + FXCSMADD f3, B2, A1, f3 + LFPDX A1, AO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC2 + bdnz+ .L36 + .align 4 + +.L37: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + FXCPMADD f2, B2, A1, f2 + FXCSMADD f3, B2, A1, f3 + .align 4 + +.L38: + fpadd f0, f0, f1 + fpadd f2, f2, f3 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + ZBASE_SHIFT + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 4 * SIZE +#endif + + addi AO2, AO, 2 * SIZE + addi BO2, BO, 2 * SIZE + +#if defined(LN) || defined(LT) + LFPDX f16, BO, INC4 + LFPDX f17, BO2, INC4 +#else + LFPDX f16, AO, INC4 + LFPDX f17, AO2, INC4 +#endif + + fpsub f0, f16, f0 + fpsub f2, f17, f2 + +#ifdef LN + LFPDX A1, AO, INC4 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f2 + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f2, A1, f2, f5 +#endif + +#ifdef LT + LFPDX A1, AO, INC4 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f2 + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f2, A1, f2, f5 +#endif + +#ifdef RN + LFPDUX A1, BO, INC4 + LFPDUX A2, BO2, INC4 + add BO, BO, INC4 + LFPDUX A3, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 + + fxcpnmsub f2, A2, f0, f2 + FXCXNSMA f2, A2, f0, f2 + + fxpmul f4, A3, f2 + FXCXNPMA f2, A3, f2, f4 +#endif + +#ifdef RT + LFPDUX A1, BO, INC4 + add BO2, BO2, INC4 + LFPDUX A2, BO, INC4 + LFPDUX A3, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE + + fxpmul f4, A3, f2 + FXCXNPMA f2, A3, f2, f4 + + fxcpnmsub f0, A2, f2, f0 + FXCXNSMA f0, A2, f2, f0 + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDX f0, BO, INC4 + STFPDX f2, BO2, INC4 +#else + STFPDX f0, AO, INC4 + STFPDX f2, AO2, INC4 +#endif + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f2, CO2, INC + STFSDUX f2, CO2, INC + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L20: + andi. I, M, 2 + beq .L30 + +#if defined(LT) || defined(RN) + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 + + srawi. r0, KK, 2 + fpmr f1, f0 + fpmr f5, f0 + fpmr f9, f0 + mtspr CTR, r0 + fpmr f13, f0 + ble .L24 +#else +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 + + sub TEMP, K, KK + + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, BO, - 4 * SIZE + fpmr f8, f0 + addi BO2, BO, 2 * SIZE + fpmr f12, f0 + + fpmr f1, f0 + fpmr f5, f0 + fpmr f9, f0 + fpmr f13, f0 + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 + ble .L24 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B2, BO2, INC4 + LFPDUX A3, AO, INC4 + LFPDUX B3, BO, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX B4, BO2, INC4 + + LFPDUX A5, AO, INC4 + LFPDUX B5, BO, INC4 + LFPDUX A6, AO2, INC4 + LFPDUX B6, BO2, INC4 + LFPDUX A7, AO, INC4 + LFPDUX A9, BO, INC4 + LFPDUX A10, BO2, INC4 + bdz- .L23 + .align 4 + +.L22: + FXCPMADD f0, B1, A1, f0 + nop + FXCSMADD f4, B1, A1, f4 + LFPDUX A8, AO2, INC4 + FXCPMADD f8, B2, A1, f8 + nop + FXCSMADD f12, B2, A1, f12 + LFPDUX A1, AO, INC4 + + FXCPMADD f1, B1, A2, f1 + nop + FXCSMADD f5, B1, A2, f5 + LFPDUX B1, BO, INC4 + FXCPMADD f9, B2, A2, f9 + nop + FXCSMADD f13, B2, A2, f13 + LFPDUX B2, BO2, INC4 + + FXCPMADD f0, B3, A3, f0 + nop + FXCSMADD f4, B3, A3, f4 + LFPDUX A2, AO2, INC4 + FXCPMADD f8, B4, A3, f8 + nop + FXCSMADD f12, B4, A3, f12 + LFPDUX A3, AO, INC4 + + FXCPMADD f1, B3, A4, f1 + nop + FXCSMADD f5, B3, A4, f5 + LFPDUX B3, BO, INC4 + FXCPMADD f9, B4, A4, f9 + nop + FXCSMADD f13, B4, A4, f13 + LFPDUX B4, BO2, INC4 + + FXCPMADD f0, B5, A5, f0 + nop + FXCSMADD f4, B5, A5, f4 + LFPDUX A4, AO2, INC4 + FXCPMADD f8, B6, A5, f8 + nop + FXCSMADD f12, B6, A5, f12 + LFPDUX A5, AO, INC4 + + FXCPMADD f1, B5, A6, f1 + nop + FXCSMADD f5, B5, A6, f5 + LFPDUX B5, BO, INC4 + FXCPMADD f9, B6, A6, f9 + nop + FXCSMADD f13, B6, A6, f13 + LFPDUX B6, BO2, INC4 + + FXCPMADD f0, A9, A7, f0 + nop + FXCSMADD f4, A9, A7, f4 + LFPDUX A6, AO2, INC4 + FXCPMADD f8, A10, A7, f8 + nop + FXCSMADD f12, A10, A7, f12 + LFPDUX A7, AO, INC4 + + FXCPMADD f1, A9, A8, f1 + nop + FXCSMADD f5, A9, A8, f5 + LFPDUX A9, BO, INC4 + FXCPMADD f9, A10, A8, f9 + nop + FXCSMADD f13, A10, A8, f13 + LFPDUX A10, BO2, INC4 + bdnz+ .L22 + .align 4 + +.L23: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f4, B1, A1, f4 + LFPDUX A8, AO2, INC4 + FXCPMADD f8, B2, A1, f8 + FXCSMADD f12, B2, A1, f12 + + FXCPMADD f1, B1, A2, f1 + FXCSMADD f5, B1, A2, f5 + FXCPMADD f9, B2, A2, f9 + FXCSMADD f13, B2, A2, f13 + + FXCPMADD f0, B3, A3, f0 + FXCSMADD f4, B3, A3, f4 + FXCPMADD f8, B4, A3, f8 + FXCSMADD f12, B4, A3, f12 + + FXCPMADD f1, B3, A4, f1 + FXCSMADD f5, B3, A4, f5 + FXCPMADD f9, B4, A4, f9 + FXCSMADD f13, B4, A4, f13 + + FXCPMADD f0, B5, A5, f0 + FXCSMADD f4, B5, A5, f4 + FXCPMADD f8, B6, A5, f8 + FXCSMADD f12, B6, A5, f12 + + FXCPMADD f1, B5, A6, f1 + FXCSMADD f5, B5, A6, f5 + FXCPMADD f9, B6, A6, f9 + FXCSMADD f13, B6, A6, f13 + + FXCPMADD f0, A9, A7, f0 + FXCSMADD f4, A9, A7, f4 + FXCPMADD f8, A10, A7, f8 + FXCSMADD f12, A10, A7, f12 + + FXCPMADD f1, A9, A8, f1 + FXCSMADD f5, A9, A8, f5 + FXCPMADD f9, A10, A8, f9 + FXCSMADD f13, A10, A8, f13 + .align 4 + +.L24: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L28 +#else + andi. 
r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L28 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + bdz- .L27 + .align 4 + +.L26: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f4, B1, A1, f4 + FXCPMADD f8, B2, A1, f8 + FXCSMADD f12, B2, A1, f12 + LFPDUX A1, AO, INC4 + + FXCPMADD f1, B1, A2, f1 + FXCSMADD f5, B1, A2, f5 + LFPDUX B1, BO, INC4 + FXCPMADD f9, B2, A2, f9 + FXCSMADD f13, B2, A2, f13 + LFPDUX A2, AO2, INC4 + LFPDUX B2, BO2, INC4 + bdnz+ .L26 + .align 4 + +.L27: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f4, B1, A1, f4 + FXCPMADD f8, B2, A1, f8 + FXCSMADD f12, B2, A1, f12 + + FXCPMADD f1, B1, A2, f1 + FXCSMADD f5, B1, A2, f5 + FXCPMADD f9, B2, A2, f9 + FXCSMADD f13, B2, A2, f13 + .align 4 + +.L28: + fpadd f0, f0, f4 + fpadd f8, f8, f12 + fpadd f1, f1, f5 + fpadd f9, f9, f13 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 2 +#endif + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 + addi AO2, AO, 2 * SIZE + addi BO, BO, - 4 * SIZE + addi BO2, BO, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDUX f16, BO, INC4 + LFPDUX f18, BO2, INC4 + LFPDUX f17, BO, INC4 + LFPDUX f19, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE +#else + LFPDUX f16, AO, INC4 + LFPDUX f17, AO2, INC4 + LFPDUX f18, AO, INC4 + LFPDUX f19, AO2, INC4 + + subi AO, AO, 8 * SIZE + subi AO2, AO2, 8 * SIZE +#endif + + fpsub f0, f16, f0 + fpsub f1, f17, f1 + fpsub f8, f18, f8 + fpsub f9, f19, f9 + +#ifdef LN + LFPDUX A1, AO, INC4 + add AO2, AO2, INC4 + LFPDUX A2, AO, INC4 + LFPDUX A3, AO2, INC4 + + subi AO, AO, 8 * SIZE + subi AO2, AO2, 8 * SIZE + + fxpmul f4, A3, f1 + fxpmul f5, A3, f9 + FXCXNPMA f1, A3, f1, f4 + FXCXNPMA f9, A3, f9, f5 + + fxcpnmsub f0, A2, f1, f0 + fxcpnmsub f8, A2, f9, f8 + FXCXNSMA f0, A2, f1, f0 + FXCXNSMA f8, A2, f9, f8 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f8 + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f8, A1, f8, f5 +#endif + +#ifdef LT + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + add AO, AO, INC4 + LFPDUX A3, AO2, INC4 + + subi AO, AO, 8 * SIZE + subi AO2, AO2, 8 * SIZE + + fxpmul f4, A1, f0 + fxpmul f5, A1, f8 + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f8, A1, f8, f5 + + fxcpnmsub f1, A2, f0, f1 + fxcpnmsub f9, A2, f8, f9 + FXCXNSMA f1, A2, f0, f1 + FXCXNSMA f9, A2, f8, f9 + + fxpmul f6, A3, f1 + fxpmul f7, A3, f9 + FXCXNPMA f1, A3, f1, f6 + FXCXNPMA f9, A3, f9, f7 +#endif + +#ifdef RN + LFPDUX A1, BO, INC4 + LFPDUX A2, BO2, INC4 + add BO, BO, INC4 + LFPDUX A3, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE + + fxpmul f4, A1, f0 + fxpmul f5, A1, f1 + + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f1, A1, f1, f5 + + fxcpnmsub f8, A2, f0, f8 + fxcpnmsub f9, A2, f1, f9 + + FXCXNSMA f8, A2, f0, f8 + FXCXNSMA f9, A2, f1, f9 + + fxpmul f4, A3, f8 + fxpmul f5, A3, f9 + + FXCXNPMA f8, A3, f8, f4 + FXCXNPMA f9, A3, f9, f5 +#endif + +#ifdef RT + LFPDUX A1, BO, INC4 + add BO2, BO2, INC4 + LFPDUX A2, BO, INC4 + LFPDUX A3, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE + + fxpmul f4, A3, f8 + fxpmul f5, A3, f9 + + FXCXNPMA f8, A3, f8, f4 + FXCXNPMA f9, A3, f9, f5 + + fxcpnmsub f0, A2, f8, f0 + fxcpnmsub f1, A2, f9, f1 + + FXCXNSMA f0, A2, f8, f0 + FXCXNSMA f1, A2, f9, f1 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f1 + + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f1, A1, f1, f5 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC4 + STFPDUX f8, BO2, INC4 + STFPDUX f1, BO, INC4 + STFPDUX f9, BO2, INC4 + + subi BO, BO, 8 * SIZE 
+ subi BO2, BO2, 8 * SIZE +#else + STFPDUX f0, AO, INC4 + STFPDUX f1, AO2, INC4 + STFPDUX f8, AO, INC4 + STFPDUX f9, AO2, INC4 + + subi AO, AO, 8 * SIZE + subi AO2, AO2, 8 * SIZE +#endif + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + + STFDUX f8, CO2, INC + STFSDUX f8, CO2, INC + STFDUX f9, CO2, INC + STFSDUX f9, CO2, INC + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, r0 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L30: + srawi. I, M, 2 + ble .L49 + .align 4 + +.L11: +#if defined(LT) || defined(RN) + + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 + + fpmr f5, f0 + fpmr f9, f0 + fpmr f13, f0 + fpmr f2, f0 + + fpmr f6, f0 + fpmr f10, f0 + fpmr f14, f0 + fpmr f3, f0 + + fpmr f7, f0 + fpmr f11, f0 + fpmr f15, f0 + + srawi. r0, KK, 2 + fpmr f1, f0 + mtspr CTR, r0 + ble .L14 +#else + +#ifdef LN + slwi r0, K, 2 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 2 + ZBASE_SHIFT + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + fpmr f5, f0 + fpmr f9, f0 + fpmr f13, f0 + fpmr f2, f0 + + fpmr f6, f0 + fpmr f10, f0 + fpmr f14, f0 + fpmr f3, f0 + + fpmr f7, f0 + fpmr f11, f0 + fpmr f15, f0 + + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, BO, - 4 * SIZE + fpmr f8, f0 + addi BO2, BO, 2 * SIZE + fpmr f12, f0 + + srawi. r0, TEMP, 2 + fpmr f1, f0 + mtspr CTR, r0 + ble .L14 +#endif + + LFPDUX A1, AO, INC4 + fpmr f5, f0 + LFPDUX A3, AO, INC4 + fpmr f9, f0 + LFPDUX B1, BO, INC4 + fpmr f13, f0 + + LFPDUX A5, AO, INC4 + fpmr f2, f0 + LFPDUX A6, AO, INC4 + fpmr f6, f0 + LFPDUX B3, BO, INC4 + fpmr f10, f0 + LFPDUX A7, AO, INC4 + fpmr f14, f0 + + LFPDUX A8, AO, INC4 + fpmr f3, f0 + LFPDUX B5, BO, INC4 + fpmr f7, f0 + LFPDUX A9, AO, INC4 + fpmr f11, f0 + LFPDUX A2, AO2, INC4 + fpmr f15, f0 + LFPDUX B2, BO2, INC4 + bdz- .L13 + .align 4 + +.L12: + +## 1 ## + FXCPMADD f0, B1, A1, f0 + nop + FXCSMADD f4, B1, A1, f4 + nop + FXCPMADD f8, B2, A1, f8 + LFPDUX B4, BO2, INC4 + FXCSMADD f12, B2, A1, f12 + LFPDUX B6, BO, INC4 + + FXCPMADD f1, B1, A2, f1 + nop + FXCSMADD f5, B1, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B2, A2, f9 + LFPDUX A10, AO, INC4 + FXCSMADD f13, B2, A2, f13 + nop + + FXCPMADD f2, B1, A3, f2 + nop + FXCSMADD f6, B1, A3, f6 + nop + FXCPMADD f10, B2, A3, f10 + nop + FXCSMADD f14, B2, A3, f14 + nop + + FXCPMADD f3, B1, A4, f3 + nop + FXCSMADD f7, B1, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B2, A4, f11 + LFPDUX A1, AO, INC4 + FXCSMADD f15, B2, A4, f15 + nop + +## 2 ## + + FXCPMADD f0, B3, A5, f0 + nop + FXCSMADD f4, B3, A5, f4 + nop + FXCPMADD f8, B4, A5, f8 + LFPDUX B2, BO2, INC4 + FXCSMADD f12, B4, A5, f12 + LFPDUX B1, BO, INC4 + + FXCPMADD f1, B3, A2, f1 + nop + FXCSMADD f5, B3, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B4, A2, f9 + LFPDUX A3, AO, INC4 + FXCSMADD f13, B4, A2, f13 + nop + + FXCPMADD f2, B3, A6, f2 + nop + FXCSMADD f6, B3, A6, f6 + nop + FXCPMADD f10, B4, A6, f10 + nop + FXCSMADD f14, B4, A6, f14 + nop + + FXCPMADD f3, B3, A4, f3 + nop + FXCSMADD f7, B3, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B4, A4, f11 + LFPDUX A5, AO, INC4 + FXCSMADD f15, B4, A4, f15 + nop + +## 3 ## + + FXCPMADD f0, B5, A7, f0 
+ nop + FXCSMADD f4, B5, A7, f4 + nop + FXCPMADD f8, B2, A7, f8 + LFPDUX B4, BO2, INC4 + FXCSMADD f12, B2, A7, f12 + LFPDUX B3, BO, INC4 + + FXCPMADD f1, B5, A2, f1 + nop + FXCSMADD f5, B5, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B2, A2, f9 + LFPDUX A6, AO, INC4 + FXCSMADD f13, B2, A2, f13 + nop + + FXCPMADD f2, B5, A8, f2 + nop + FXCSMADD f6, B5, A8, f6 + nop + FXCPMADD f10, B2, A8, f10 + nop + FXCSMADD f14, B2, A8, f14 + nop + + FXCPMADD f3, B5, A4, f3 + nop + FXCSMADD f7, B5, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B2, A4, f11 + LFPDUX A7, AO, INC4 + FXCSMADD f15, B2, A4, f15 + nop + +## 4 ## + FXCPMADD f0, B6, A9, f0 + nop + FXCSMADD f4, B6, A9, f4 + nop + FXCPMADD f8, B4, A9, f8 + LFPDUX B2, BO2, INC4 + FXCSMADD f12, B4, A9, f12 + LFPDUX B5, BO, INC4 + + FXCPMADD f1, B6, A2, f1 + nop + FXCSMADD f5, B6, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B4, A2, f9 + LFPDUX A8, AO, INC4 + FXCSMADD f13, B4, A2, f13 + nop + + FXCPMADD f2, B6, A10, f2 + nop + FXCSMADD f6, B6, A10, f6 + nop + FXCPMADD f10, B4, A10, f10 + nop + FXCSMADD f14, B4, A10, f14 + nop + + FXCPMADD f3, B6, A4, f3 + LFPDUX A2, AO2, INC4 + FXCSMADD f7, B6, A4, f7 + LFPDUX A9, AO, INC4 + FXCPMADD f11, B4, A4, f11 + nop + FXCSMADD f15, B4, A4, f15 + bdnz+ .L12 + .align 4 + +.L13: +## 1 ## + + FXCPMADD f0, B1, A1, f0 + nop + FXCSMADD f4, B1, A1, f4 + nop + FXCPMADD f8, B2, A1, f8 + LFPDUX B4, BO2, INC4 + FXCSMADD f12, B2, A1, f12 + LFPDUX B6, BO, INC4 + + FXCPMADD f1, B1, A2, f1 + nop + FXCSMADD f5, B1, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B2, A2, f9 + LFPDUX A10, AO, INC4 + FXCSMADD f13, B2, A2, f13 + nop + + FXCPMADD f2, B1, A3, f2 + nop + FXCSMADD f6, B1, A3, f6 + nop + FXCPMADD f10, B2, A3, f10 + nop + FXCSMADD f14, B2, A3, f14 + nop + + FXCPMADD f3, B1, A4, f3 + nop + FXCSMADD f7, B1, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B2, A4, f11 + nop + FXCSMADD f15, B2, A4, f15 + nop + +## 2 ## + + FXCPMADD f0, B3, A5, f0 + nop + FXCSMADD f4, B3, A5, f4 + nop + FXCPMADD f8, B4, A5, f8 + LFPDUX B2, BO2, INC4 + FXCSMADD f12, B4, A5, f12 + nop + + FXCPMADD f1, B3, A2, f1 + nop + FXCSMADD f5, B3, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B4, A2, f9 + nop + FXCSMADD f13, B4, A2, f13 + nop + + FXCPMADD f2, B3, A6, f2 + nop + FXCSMADD f6, B3, A6, f6 + nop + FXCPMADD f10, B4, A6, f10 + nop + FXCSMADD f14, B4, A6, f14 + nop + + FXCPMADD f3, B3, A4, f3 + nop + FXCSMADD f7, B3, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B4, A4, f11 + nop + FXCSMADD f15, B4, A4, f15 + nop + +## 3 ## + + FXCPMADD f0, B5, A7, f0 + nop + FXCSMADD f4, B5, A7, f4 + nop + FXCPMADD f8, B2, A7, f8 + LFPDUX B4, BO2, INC4 + FXCSMADD f12, B2, A7, f12 + nop + + FXCPMADD f1, B5, A2, f1 + nop + FXCSMADD f5, B5, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B2, A2, f9 + nop + FXCSMADD f13, B2, A2, f13 + nop + + FXCPMADD f2, B5, A8, f2 + nop + FXCSMADD f6, B5, A8, f6 + nop + FXCPMADD f10, B2, A8, f10 + nop + FXCSMADD f14, B2, A8, f14 + nop + + FXCPMADD f3, B5, A4, f3 + nop + FXCSMADD f7, B5, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B2, A4, f11 + nop + FXCSMADD f15, B2, A4, f15 + nop + +## 4 ## + + FXCPMADD f0, B6, A9, f0 + nop + FXCSMADD f4, B6, A9, f4 + nop + FXCPMADD f8, B4, A9, f8 + nop + FXCSMADD f12, B4, A9, f12 + nop + + FXCPMADD f1, B6, A2, f1 + nop + FXCSMADD f5, B6, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B4, A2, f9 + nop + FXCSMADD f13, B4, A2, f13 + nop + + FXCPMADD f2, B6, A10, f2 + nop + FXCSMADD f6, B6, A10, f6 + nop + FXCPMADD f10, B4, A10, f10 + nop + FXCSMADD f14, B4, A10, f14 + nop + + FXCPMADD f3, B6, A4, f3 + nop + 
FXCSMADD f7, B6, A4, f7 + nop + FXCPMADD f11, B4, A4, f11 + nop + FXCSMADD f15, B4, A4, f15 + nop + .align 4 + +.L14: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L18 +#else + andi. r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L18 +#endif + +.L15: + LFPDUX A2, AO, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A10, BO, INC4 + LFPDUX B4, BO2, INC4 + bdz- .L17 + .align 4 + +.L16: + FXCPMADD f0, A10, A2, f0 + FXCSMADD f4, A10, A2, f4 + FXCPMADD f8, B4, A2, f8 + FXCSMADD f12, B4, A2, f12 + LFPDUX A2, AO, INC4 + + FXCPMADD f1, A10, A4, f1 + FXCSMADD f5, A10, A4, f5 + FXCPMADD f9, B4, A4, f9 + FXCSMADD f13, B4, A4, f13 + LFPDUX A4, AO2, INC4 + + FXCPMADD f2, A10, A2, f2 + FXCSMADD f6, A10, A2, f6 + FXCPMADD f10, B4, A2, f10 + FXCSMADD f14, B4, A2, f14 + LFPDUX A2, AO, INC4 + + FXCPMADD f3, A10, A4, f3 + FXCSMADD f7, A10, A4, f7 + LFPDUX A10, BO, INC4 + FXCPMADD f11, B4, A4, f11 + FXCSMADD f15, B4, A4, f15 + LFPDUX A4, AO2, INC4 + LFPDUX B4, BO2, INC4 + bdnz+ .L16 + .align 4 + +.L17: + FXCPMADD f0, A10, A2, f0 + FXCSMADD f4, A10, A2, f4 + FXCPMADD f8, B4, A2, f8 + FXCSMADD f12, B4, A2, f12 + LFPDUX A2, AO, INC4 + + FXCPMADD f1, A10, A4, f1 + FXCSMADD f5, A10, A4, f5 + FXCPMADD f9, B4, A4, f9 + FXCSMADD f13, B4, A4, f13 + LFPDUX A4, AO2, INC4 + + FXCPMADD f2, A10, A2, f2 + FXCSMADD f6, A10, A2, f6 + FXCPMADD f10, B4, A2, f10 + FXCSMADD f14, B4, A2, f14 + + FXCPMADD f3, A10, A4, f3 + FXCSMADD f7, A10, A4, f7 + FXCPMADD f11, B4, A4, f11 + FXCSMADD f15, B4, A4, f15 + .align 4 + +.L18: + fpadd f0, f0, f4 + fpadd f8, f8, f12 + fpadd f1, f1, f5 + fpadd f9, f9, f13 + + fpadd f2, f2, f6 + fpadd f10, f10, f14 + fpadd f3, f3, f7 + fpadd f11, f11, f15 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 2 + ZBASE_SHIFT + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi AO2, AO, 2 * SIZE + addi BO, BO, - 4 * SIZE + addi BO2, BO, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDUX f16, BO, INC4 + LFPDUX f20, BO2, INC4 + LFPDUX f17, BO, INC4 + LFPDUX f21, BO2, INC4 + LFPDUX f18, BO, INC4 + LFPDUX f22, BO2, INC4 + LFPDUX f19, BO, INC4 + LFPDUX f23, BO2, INC4 + + subi BO, BO, 16 * SIZE + subi BO2, BO2, 16 * SIZE +#else + LFPDUX f16, AO, INC4 + LFPDUX f17, AO2, INC4 + LFPDUX f18, AO, INC4 + LFPDUX f19, AO2, INC4 + LFPDUX f20, AO, INC4 + LFPDUX f21, AO2, INC4 + LFPDUX f22, AO, INC4 + LFPDUX f23, AO2, INC4 + + subi AO, AO, 16 * SIZE + subi AO2, AO2, 16 * SIZE +#endif + + fpsub f0, f16, f0 + fpsub f1, f17, f1 + fpsub f2, f18, f2 + fpsub f3, f19, f3 + + fpsub f8, f20, f8 + fpsub f9, f21, f9 + fpsub f10, f22, f10 + fpsub f11, f23, f11 + +#ifdef LN + LFPDUX A1, AO, INC4 + add AO2, AO2, INC4 + add AO, AO, INC4 + add AO2, AO2, INC4 + + LFPDUX A2, AO, INC4 + LFPDUX A3, AO2, INC4 + add AO, AO, INC4 + add AO2, AO2, INC4 + + LFPDUX A4, AO, INC4 + LFPDUX A5, AO2, INC4 + LFPDUX A6, AO, INC4 + add AO2, AO2, INC4 + + LFPDUX A7, AO, INC4 + LFPDUX A8, AO2, INC4 + LFPDUX A9, AO, INC4 + LFPDUX A10, AO2, INC4 + + subi AO, AO, 32 * SIZE + subi AO2, AO2, 32 * SIZE + + fxpmul f4, A10, f3 + fxpmul f5, A10, f11 + FXCXNPMA f3, A10, f3, f4 + FXCXNPMA f11, A10, f11, f5 + + fxcpnmsub f2, A9, f3, f2 + fxcpnmsub f10, A9, f11, f10 + FXCXNSMA f2, A9, f3, f2 + FXCXNSMA f10, A9, f11, f10 + + fxcpnmsub f1, A8, f3, f1 + fxcpnmsub f9, A8, f11, f9 + FXCXNSMA f1, A8, f3, f1 + FXCXNSMA f9, A8, f11, f9 + + fxcpnmsub f0, A7, f3, f0 + fxcpnmsub f8, A7, f11, f8 + FXCXNSMA f0, A7, f3, f0 + FXCXNSMA f8, A7, f11, f8 + + fxpmul f4, A6, f2 + fxpmul f5, A6, f10 + 
FXCXNPMA f2, A6, f2, f4 + FXCXNPMA f10, A6, f10, f5 + + fxcpnmsub f1, A5, f2, f1 + fxcpnmsub f9, A5, f10, f9 + FXCXNSMA f1, A5, f2, f1 + FXCXNSMA f9, A5, f10, f9 + + fxcpnmsub f0, A4, f2, f0 + fxcpnmsub f8, A4, f10, f8 + FXCXNSMA f0, A4, f2, f0 + FXCXNSMA f8, A4, f10, f8 + + fxpmul f4, A3, f1 + fxpmul f5, A3, f9 + FXCXNPMA f1, A3, f1, f4 + FXCXNPMA f9, A3, f9, f5 + + fxcpnmsub f0, A2, f1, f0 + fxcpnmsub f8, A2, f9, f8 + FXCXNSMA f0, A2, f1, f0 + FXCXNSMA f8, A2, f9, f8 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f8 + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f8, A1, f8, f5 +#endif + +#ifdef LT + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX A3, AO, INC4 + LFPDUX A4, AO2, INC4 + + add AO, AO, INC4 + LFPDUX A5, AO2, INC4 + LFPDUX A6, AO, INC4 + LFPDUX A7, AO2, INC4 + + add AO, AO, INC4 + add AO2, AO2, INC4 + LFPDUX A8, AO, INC4 + LFPDUX A9, AO2, INC4 + + add AO, AO, INC4 + add AO2, AO2, INC4 + add AO, AO, INC4 + LFPDUX A10, AO2, INC4 + + subi AO, AO, 32 * SIZE + subi AO2, AO2, 32 * SIZE + + fxpmul f4, A1, f0 + fxpmul f5, A1, f8 + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f8, A1, f8, f5 + + fxcpnmsub f1, A2, f0, f1 + fxcpnmsub f9, A2, f8, f9 + FXCXNSMA f1, A2, f0, f1 + FXCXNSMA f9, A2, f8, f9 + + fxcpnmsub f2, A3, f0, f2 + fxcpnmsub f10, A3, f8, f10 + FXCXNSMA f2, A3, f0, f2 + FXCXNSMA f10, A3, f8, f10 + + fxcpnmsub f3, A4, f0, f3 + fxcpnmsub f11, A4, f8, f11 + FXCXNSMA f3, A4, f0, f3 + FXCXNSMA f11, A4, f8, f11 + + fxpmul f6, A5, f1 + fxpmul f7, A5, f9 + FXCXNPMA f1, A5, f1, f6 + FXCXNPMA f9, A5, f9, f7 + + fxcpnmsub f2, A6, f1, f2 + fxcpnmsub f10, A6, f9, f10 + FXCXNSMA f2, A6, f1, f2 + FXCXNSMA f10, A6, f9, f10 + + fxcpnmsub f3, A7, f1, f3 + fxcpnmsub f11, A7, f9, f11 + FXCXNSMA f3, A7, f1, f3 + FXCXNSMA f11, A7, f9, f11 + + fxpmul f4, A8, f2 + fxpmul f5, A8, f10 + FXCXNPMA f2, A8, f2, f4 + FXCXNPMA f10, A8, f10, f5 + + fxcpnmsub f3, A9, f2, f3 + fxcpnmsub f11, A9, f10, f11 + FXCXNSMA f3, A9, f2, f3 + FXCXNSMA f11, A9, f10, f11 + + fxpmul f6, A10, f3 + fxpmul f7, A10, f11 + FXCXNPMA f3, A10, f3, f6 + FXCXNPMA f11, A10, f11, f7 +#endif + +#ifdef RN + LFPDUX A1, BO, INC4 + LFPDUX A2, BO2, INC4 + add BO, BO, INC4 + LFPDUX A3, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE + + fxpmul f4, A1, f0 + fxpmul f5, A1, f1 + fxpmul f6, A1, f2 + fxpmul f7, A1, f3 + + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f1, A1, f1, f5 + FXCXNPMA f2, A1, f2, f6 + FXCXNPMA f3, A1, f3, f7 + + fxcpnmsub f8, A2, f0, f8 + fxcpnmsub f9, A2, f1, f9 + fxcpnmsub f10, A2, f2, f10 + fxcpnmsub f11, A2, f3, f11 + + FXCXNSMA f8, A2, f0, f8 + FXCXNSMA f9, A2, f1, f9 + FXCXNSMA f10, A2, f2, f10 + FXCXNSMA f11, A2, f3, f11 + + fxpmul f4, A3, f8 + fxpmul f5, A3, f9 + fxpmul f6, A3, f10 + fxpmul f7, A3, f11 + + FXCXNPMA f8, A3, f8, f4 + FXCXNPMA f9, A3, f9, f5 + FXCXNPMA f10, A3, f10, f6 + FXCXNPMA f11, A3, f11, f7 +#endif + +#ifdef RT + LFPDUX A1, BO, INC4 + add BO2, BO2, INC4 + LFPDUX A2, BO, INC4 + LFPDUX A3, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE + + fxpmul f4, A3, f8 + fxpmul f5, A3, f9 + fxpmul f6, A3, f10 + fxpmul f7, A3, f11 + + FXCXNPMA f8, A3, f8, f4 + FXCXNPMA f9, A3, f9, f5 + FXCXNPMA f10, A3, f10, f6 + FXCXNPMA f11, A3, f11, f7 + + fxcpnmsub f0, A2, f8, f0 + fxcpnmsub f1, A2, f9, f1 + fxcpnmsub f2, A2, f10, f2 + fxcpnmsub f3, A2, f11, f3 + + FXCXNSMA f0, A2, f8, f0 + FXCXNSMA f1, A2, f9, f1 + FXCXNSMA f2, A2, f10, f2 + FXCXNSMA f3, A2, f11, f3 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f1 + fxpmul f6, A1, f2 + fxpmul f7, A1, f3 + + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f1, A1, f1, f5 + FXCXNPMA f2, A1, f2, f6 + 
FXCXNPMA f3, A1, f3, f7 +#endif + +#ifdef LN + subi CO1, CO1, 8 * SIZE + subi CO2, CO2, 8 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC4 + STFPDUX f8, BO2, INC4 + STFPDUX f1, BO, INC4 + STFPDUX f9, BO2, INC4 + STFPDUX f2, BO, INC4 + STFPDUX f10, BO2, INC4 + STFPDUX f3, BO, INC4 + STFPDUX f11, BO2, INC4 + + subi BO, BO, 16 * SIZE + subi BO2, BO2, 16 * SIZE +#else + STFPDUX f0, AO, INC4 + STFPDUX f1, AO2, INC4 + STFPDUX f2, AO, INC4 + STFPDUX f3, AO2, INC4 + STFPDUX f8, AO, INC4 + STFPDUX f9, AO2, INC4 + STFPDUX f10, AO, INC4 + STFPDUX f11, AO2, INC4 + + subi AO, AO, 16 * SIZE + subi AO2, AO2, 16 * SIZE +#endif + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + STFDUX f2, CO1, INC + STFSDUX f2, CO1, INC + STFDUX f3, CO1, INC + STFSDUX f3, CO1, INC + + STFDUX f8, CO2, INC + STFSDUX f8, CO2, INC + STFDUX f9, CO2, INC + STFSDUX f9, CO2, INC + STFDUX f10, CO2, INC + STFSDUX f10, CO2, INC + STFDUX f11, CO2, INC + STFSDUX f11, CO2, INC + +#ifdef LN + subi CO1, CO1, 8 * SIZE + subi CO2, CO2, 8 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + + addic. I, I, -1 + li r0, FZERO + + lfpsx f0, SP, r0 + bgt+ .L11 + .align 4 + +.L49: +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + addi B, BO, 4 * SIZE +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + + addic. J, J, -1 + bgt+ .L10 + .align 4 + +.L50: + andi. J, N, 1 + beq .L999 + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + addi AORIG, A, -2 * SIZE +#else + addi AO, A, -2 * SIZE +#endif +#ifndef RT + add C, CO2, LDC +#endif + li r0, FZERO + lfpsx f0, SP, r0 + + andi. I, M, 1 + beq .L60 + +#if defined(LT) || defined(RN) + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + srawi. r0, KK, 3 + mtspr CTR, r0 + ble .L74 +#else +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi TEMP, KK, 0 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + + sub TEMP, K, KK + + addi BO, BO, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + srawi. 
r0, TEMP, 3 + mtspr CTR, r0 + ble .L74 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + + LFPDUX A5, AO, INC2 + LFPDUX B5, BO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX B6, BO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A9, BO, INC2 + LFPDUX A8, AO, INC2 + LFPDUX A10, BO, INC2 + bdz- .L73 + .align 4 + +.L72: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + FXCPMADD f2, B2, A2, f2 + FXCSMADD f3, B2, A2, f3 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + + FXCPMADD f0, B3, A3, f0 + FXCSMADD f1, B3, A3, f1 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + FXCPMADD f2, B4, A4, f2 + FXCSMADD f3, B4, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + + FXCPMADD f0, B5, A5, f0 + FXCSMADD f1, B5, A5, f1 + LFPDUX A5, AO, INC2 + LFPDUX B5, BO, INC2 + FXCPMADD f2, B6, A6, f2 + FXCSMADD f3, B6, A6, f3 + LFPDUX A6, AO, INC2 + LFPDUX B6, BO, INC2 + + FXCPMADD f0, A9, A7, f0 + FXCSMADD f1, A9, A7, f1 + LFPDUX A7, AO, INC2 + LFPDUX A9, BO, INC2 + FXCPMADD f2, A10, A8, f2 + FXCSMADD f3, A10, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX A10, BO, INC2 + + bdnz+ .L72 + .align 4 + +.L73: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + FXCPMADD f2, B2, A2, f2 + FXCSMADD f3, B2, A2, f3 + + FXCPMADD f0, B3, A3, f0 + FXCSMADD f1, B3, A3, f1 + FXCPMADD f2, B4, A4, f2 + FXCSMADD f3, B4, A4, f3 + + FXCPMADD f0, B5, A5, f0 + FXCSMADD f1, B5, A5, f1 + FXCPMADD f2, B6, A6, f2 + FXCSMADD f3, B6, A6, f3 + + FXCPMADD f0, A9, A7, f0 + FXCSMADD f1, A9, A7, f1 + FXCPMADD f2, A10, A8, f2 + FXCSMADD f3, A10, A8, f3 + .align 4 + +.L74: +#if defined(LT) || defined(RN) + andi. r0, KK, 7 + mtspr CTR, r0 + ble+ .L78 +#else + andi. r0, TEMP, 7 + mtspr CTR, r0 + ble+ .L78 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + bdz- .L77 + .align 4 + +.L76: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + bdnz+ .L76 + .align 4 + +.L77: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + .align 4 + +.L78: + fpadd f0, f0, f2 + fpadd f1, f1, f3 + + fpadd f0, f0, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 0 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDX f16, BO, INC2 +#else + LFPDX f16, AO, INC2 +#endif + + fpsub f0, f16, f0 + +#ifdef LN + LFPDX A1, AO, INC2 + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 +#endif + +#ifdef LT + LFPDX A1, AO, INC2 + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 +#endif + +#ifdef RN + LFPDX A1, BO, INC2 + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 +#endif + +#ifdef RT + LFPDX A1, BO, INC2 + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDX f0, BO, INC2 +#else + STFPDX f0, AO, INC2 +#endif + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L60: + andi. 
I, M, 2 + beq .L70 + +#if defined(LT) || defined(RN) + fpmr f1, f0 + addi BO, B, - 2 * SIZE + fpmr f2, f0 + fpmr f3, f0 + srawi. r0, KK, 2 + mtspr CTR, r0 + ble .L64 +#else +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 1 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + fpmr f1, f0 + addi BO, BO, - 2 * SIZE + fpmr f2, f0 + fpmr f3, f0 + srawi. r0, TEMP, 2 + mtspr CTR, r0 + ble .L64 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX B3, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX B4, BO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + bdz- .L63 + .align 4 + +.L62: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f2, B1, A1, f2 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f3, B1, A2, f3 + LFPDUX A2, AO, INC2 + LFPDUX B1, BO, INC2 + + FXCPMADD f0, B2, A3, f0 + FXCSMADD f2, B2, A3, f2 + LFPDUX A3, AO, INC2 + FXCPMADD f1, B2, A4, f1 + FXCSMADD f3, B2, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + + FXCPMADD f0, B3, A5, f0 + FXCSMADD f2, B3, A5, f2 + LFPDUX A5, AO, INC2 + FXCPMADD f1, B3, A6, f1 + FXCSMADD f3, B3, A6, f3 + LFPDUX A6, AO, INC2 + LFPDUX B3, BO, INC2 + + FXCPMADD f0, B4, A7, f0 + FXCSMADD f2, B4, A7, f2 + LFPDUX A7, AO, INC2 + FXCPMADD f1, B4, A8, f1 + FXCSMADD f3, B4, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B4, BO, INC2 + bdnz+ .L62 + .align 4 + +.L63: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f2, B1, A1, f2 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f3, B1, A2, f3 + + FXCPMADD f0, B2, A3, f0 + FXCSMADD f2, B2, A3, f2 + FXCPMADD f1, B2, A4, f1 + FXCSMADD f3, B2, A4, f3 + + FXCPMADD f0, B3, A5, f0 + FXCSMADD f2, B3, A5, f2 + FXCPMADD f1, B3, A6, f1 + FXCSMADD f3, B3, A6, f3 + + FXCPMADD f0, B4, A7, f0 + FXCSMADD f2, B4, A7, f2 + FXCPMADD f1, B4, A8, f1 + FXCSMADD f3, B4, A8, f3 + .align 4 + +.L64: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L68 +#else + andi. 
r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L68 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + bdz- .L67 + .align 4 + +.L66: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f2, B1, A1, f2 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f3, B1, A2, f3 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + bdnz+ .L66 + .align 4 + +.L67: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f2, B1, A1, f2 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f3, B1, A2, f3 + .align 4 + +.L68: + fpadd f0, f0, f2 + fpadd f1, f1, f3 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + ZBASE_SHIFT + slwi r0, r0, 0 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDUX f16, BO, INC2 + LFPDUX f17, BO, INC2 + + subi BO, BO, 4 * SIZE +#else + LFPDUX f16, AO, INC2 + LFPDUX f17, AO, INC2 + + subi AO, AO, 4 * SIZE +#endif + + fpsub f0, f16, f0 + fpsub f1, f17, f1 + +#ifdef LN + LFPDUX A1, AO, INC2 + add AO, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + + subi AO, AO, 8 * SIZE + + fxpmul f4, A3, f1 + FXCXNPMA f1, A3, f1, f4 + + fxcpnmsub f0, A2, f1, f0 + FXCXNSMA f0, A2, f1, f0 + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 +#endif + +#ifdef LT + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + add AO, AO, INC2 + LFPDUX A3, AO, INC2 + + subi AO, AO, 8 * SIZE + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 + + fxcpnmsub f1, A2, f0, f1 + FXCXNSMA f1, A2, f0, f1 + + fxpmul f6, A3, f1 + FXCXNPMA f1, A3, f1, f6 +#endif + +#ifdef RN + LFPDX A1, BO, INC2 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f1 + + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f1, A1, f1, f5 +#endif + +#ifdef RT + LFPDX A1, BO, INC2 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f1 + + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f1, A1, f1, f5 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC2 + STFPDUX f1, BO, INC2 + + subi BO, BO, 4 * SIZE +#else + STFPDUX f0, AO, INC2 + STFPDUX f1, AO, INC2 + + subi AO, AO, 4 * SIZE +#endif + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L70: + srawi. I, M, 2 + ble .L89 + .align 4 + +.L51: +#if defined(LT) || defined(RN) + fpmr f4, f0 + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f5, f0 + fpmr f2, f0 + fpmr f6, f0 + fpmr f3, f0 + fpmr f7, f0 + srawi. r0, KK, 2 + mtspr CTR, r0 + ble .L54 +#else + +#ifdef LN + slwi r0, K, 2 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 2 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + fpmr f4, f0 + addi BO, BO, - 2 * SIZE + fpmr f1, f0 + fpmr f5, f0 + fpmr f2, f0 + fpmr f6, f0 + fpmr f3, f0 + fpmr f7, f0 + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 + ble .L54 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX B3, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + bdz- .L53 + .align 4 + +.L52: + FXCPMADD f0, B1, A1, f0 + LFPDUX B4, BO, INC2 + FXCSMADD f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B1, A2, f1 + nop + FXCSMADD f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + FXCPMADD f2, B1, A3, f2 + nop + FXCSMADD f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + FXCPMADD f3, B1, A4, f3 + nop + FXCSMADD f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + + FXCPMADD f0, B2, A5, f0 + LFPDUX B1, BO, INC2 + FXCSMADD f4, B2, A5, f4 + LFPDUX A5, AO, INC2 + FXCPMADD f1, B2, A6, f1 + nop + FXCSMADD f5, B2, A6, f5 + LFPDUX A6, AO, INC2 + + FXCPMADD f2, B2, A7, f2 + nop + FXCSMADD f6, B2, A7, f6 + LFPDUX A7, AO, INC2 + FXCPMADD f3, B2, A8, f3 + nop + FXCSMADD f7, B2, A8, f7 + LFPDUX A8, AO, INC2 + + FXCPMADD f0, B3, A1, f0 + LFPDUX B2, BO, INC2 + FXCSMADD f4, B3, A1, f4 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B3, A2, f1 + nop + FXCSMADD f5, B3, A2, f5 + LFPDUX A2, AO, INC2 + + FXCPMADD f2, B3, A3, f2 + nop + FXCSMADD f6, B3, A3, f6 + LFPDUX A3, AO, INC2 + FXCPMADD f3, B3, A4, f3 + nop + FXCSMADD f7, B3, A4, f7 + LFPDUX A4, AO, INC2 + + FXCPMADD f0, B4, A5, f0 + LFPDUX B3, BO, INC2 + FXCSMADD f4, B4, A5, f4 + LFPDUX A5, AO, INC2 + FXCPMADD f1, B4, A6, f1 + nop + FXCSMADD f5, B4, A6, f5 + LFPDUX A6, AO, INC2 + + FXCPMADD f2, B4, A7, f2 + nop + FXCSMADD f6, B4, A7, f6 + LFPDUX A7, AO, INC2 + FXCPMADD f3, B4, A8, f3 + nop + FXCSMADD f7, B4, A8, f7 + LFPDUX A8, AO, INC2 + bdnz+ .L52 + .align 4 + +.L53: + FXCPMADD f0, B1, A1, f0 + LFPDUX B4, BO, INC2 + FXCSMADD f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B1, A2, f1 + nop + FXCSMADD f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + FXCPMADD f2, B1, A3, f2 + nop + FXCSMADD f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + FXCPMADD f3, B1, A4, f3 + nop + FXCSMADD f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + + FXCPMADD f0, B2, A5, f0 + nop + FXCSMADD f4, B2, A5, f4 + LFPDUX A5, AO, INC2 + FXCPMADD f1, B2, A6, f1 + nop + FXCSMADD f5, B2, A6, f5 + LFPDUX A6, AO, INC2 + + FXCPMADD f2, B2, A7, f2 + nop + FXCSMADD f6, B2, A7, f6 + LFPDUX A7, AO, INC2 + FXCPMADD f3, B2, A8, f3 + nop + FXCSMADD f7, B2, A8, f7 + LFPDUX A8, AO, INC2 + + FXCPMADD f0, B3, A1, f0 + FXCSMADD f4, B3, A1, f4 + FXCPMADD f1, B3, A2, f1 + FXCSMADD f5, B3, A2, f5 + + FXCPMADD f2, B3, A3, f2 + FXCSMADD f6, B3, A3, f6 + FXCPMADD f3, B3, A4, f3 + FXCSMADD f7, B3, A4, f7 + + FXCPMADD f0, B4, A5, f0 + FXCSMADD f4, B4, A5, f4 + FXCPMADD f1, B4, A6, f1 + FXCSMADD f5, B4, A6, f5 + + FXCPMADD f2, B4, A7, f2 + FXCSMADD f6, B4, A7, f6 + FXCPMADD f3, B4, A8, f3 + FXCSMADD f7, B4, A8, f7 + .align 4 + +.L54: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L58 +#else + andi. 
r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L58 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + bdz- .L57 + .align 4 + +.L56: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + FXCPMADD f2, B1, A3, f2 + FXCSMADD f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + FXCPMADD f3, B1, A4, f3 + FXCSMADD f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + LFPDUX B1, BO, INC2 + bdnz+ .L56 + .align 4 + +.L57: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f4, B1, A1, f4 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f5, B1, A2, f5 + + FXCPMADD f2, B1, A3, f2 + FXCSMADD f6, B1, A3, f6 + FXCPMADD f3, B1, A4, f3 + FXCSMADD f7, B1, A4, f7 + .align 4 + +.L58: + fpadd f0, f0, f4 + fpadd f1, f1, f5 + fpadd f2, f2, f6 + fpadd f3, f3, f7 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 2 + ZBASE_SHIFT + slwi r0, r0, 0 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDUX f16, BO, INC2 + LFPDUX f17, BO, INC2 + LFPDUX f18, BO, INC2 + LFPDUX f19, BO, INC2 + + subi BO, BO, 8 * SIZE +#else + LFPDUX f16, AO, INC2 + LFPDUX f17, AO, INC2 + LFPDUX f18, AO, INC2 + LFPDUX f19, AO, INC2 + + subi AO, AO, 8 * SIZE +#endif + + fpsub f0, f16, f0 + fpsub f1, f17, f1 + fpsub f2, f18, f2 + fpsub f3, f19, f3 + +#ifdef LN + LFPDUX A1, AO, INC2 + add AO, AO, INC2 + add AO, AO, INC2 + add AO, AO, INC2 + + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + add AO, AO, INC2 + add AO, AO, INC2 + + LFPDUX A4, AO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + add AO, AO, INC2 + + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + LFPDUX A9, AO, INC2 + LFPDUX A10, AO, INC2 + + subi AO, AO, 32 * SIZE + + fxpmul f4, A10, f3 + FXCXNPMA f3, A10, f3, f4 + + fxcpnmsub f2, A9, f3, f2 + FXCXNSMA f2, A9, f3, f2 + + fxcpnmsub f1, A8, f3, f1 + FXCXNSMA f1, A8, f3, f1 + + fxcpnmsub f0, A7, f3, f0 + FXCXNSMA f0, A7, f3, f0 + + fxpmul f4, A6, f2 + FXCXNPMA f2, A6, f2, f4 + + fxcpnmsub f1, A5, f2, f1 + FXCXNSMA f1, A5, f2, f1 + + fxcpnmsub f0, A4, f2, f0 + FXCXNSMA f0, A4, f2, f0 + + fxpmul f4, A3, f1 + FXCXNPMA f1, A3, f1, f4 + + fxcpnmsub f0, A2, f1, f0 + FXCXNSMA f0, A2, f1, f0 + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 +#endif + +#ifdef LT + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + add AO, AO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX A7, AO, INC2 + + add AO, AO, INC2 + add AO, AO, INC2 + LFPDUX A8, AO, INC2 + LFPDUX A9, AO, INC2 + + add AO, AO, INC2 + add AO, AO, INC2 + add AO, AO, INC2 + LFPDUX A10, AO, INC2 + + subi AO, AO, 32 * SIZE + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 + + fxcpnmsub f1, A2, f0, f1 + FXCXNSMA f1, A2, f0, f1 + + fxcpnmsub f2, A3, f0, f2 + FXCXNSMA f2, A3, f0, f2 + + fxcpnmsub f3, A4, f0, f3 + FXCXNSMA f3, A4, f0, f3 + + fxpmul f6, A5, f1 + FXCXNPMA f1, A5, f1, f6 + + fxcpnmsub f2, A6, f1, f2 + FXCXNSMA f2, A6, f1, f2 + + fxcpnmsub f3, A7, f1, f3 + FXCXNSMA f3, A7, f1, f3 + + fxpmul f4, A8, f2 + FXCXNPMA f2, A8, f2, f4 + + fxcpnmsub f3, A9, f2, f3 + FXCXNSMA f3, A9, f2, f3 + + fxpmul f6, A10, f3 + FXCXNPMA f3, A10, f3, f6 +#endif + +#ifdef RN + LFPDX A1, BO, INC2 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f1 + fxpmul f6, A1, f2 + fxpmul f7, A1, f3 + + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f1, A1, f1, f5 + FXCXNPMA f2, A1, f2, f6 + FXCXNPMA f3, A1, f3, f7 +#endif + +#ifdef RT + LFPDX A1, BO, INC2 + + fxpmul f4, 
A1, f0 + fxpmul f5, A1, f1 + fxpmul f6, A1, f2 + fxpmul f7, A1, f3 + + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f1, A1, f1, f5 + FXCXNPMA f2, A1, f2, f6 + FXCXNPMA f3, A1, f3, f7 +#endif + +#ifdef LN + subi CO1, CO1, 8 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC2 + STFPDUX f1, BO, INC2 + STFPDUX f2, BO, INC2 + STFPDUX f3, BO, INC2 + + subi BO, BO, 8 * SIZE +#else + STFPDUX f0, AO, INC2 + STFPDUX f1, AO, INC2 + STFPDUX f2, AO, INC2 + STFPDUX f3, AO, INC2 + + subi AO, AO, 8 * SIZE +#endif + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + STFDUX f2, CO1, INC + STFSDUX f2, CO1, INC + STFDUX f3, CO1, INC + STFSDUX f3, CO1, INC + +#ifdef LN + subi CO1, CO1, 8 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + + addic. I, I, -1 + li r0, FZERO + + lfpsx f0, SP, r0 + bgt+ .L51 + .align 4 + +.L89: +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + addi B, BO, 2 * SIZE +#endif + +#ifdef RN + addi KK, KK, 1 +#endif + +#ifdef RT + subi KK, KK, 1 +#endif + .align 4 + +.L999: + addi SP, SP, 20 + + lwzu r14, 4(SP) + lwzu r15, 4(SP) + + lwzu r16, 4(SP) + lwzu r17, 4(SP) + lwzu r18, 4(SP) + lwzu r19, 4(SP) + + lwzu r20, 4(SP) + lwzu r21, 4(SP) + lwzu r22, 4(SP) + lwzu r23, 4(SP) + + lwzu r24, 4(SP) + lwzu r25, 4(SP) + lwzu r26, 4(SP) + lwzu r27, 4(SP) + + lwzu r28, 4(SP) + lwzu r29, 4(SP) + lwzu r30, 4(SP) + lwzu r31, 4(SP) + + subi SP, SP, 12 + li r0, 16 + + lfpdux f31, SP, r0 + lfpdux f30, SP, r0 + lfpdux f29, SP, r0 + lfpdux f28, SP, r0 + lfpdux f27, SP, r0 + lfpdux f26, SP, r0 + lfpdux f25, SP, r0 + lfpdux f24, SP, r0 + lfpdux f23, SP, r0 + lfpdux f22, SP, r0 + lfpdux f21, SP, r0 + lfpdux f20, SP, r0 + lfpdux f19, SP, r0 + lfpdux f18, SP, r0 + lfpdux f17, SP, r0 + lfpdux f16, SP, r0 + lfpdux f15, SP, r0 + lfpdux f14, SP, r0 + addi SP, SP, 16 + blr + .align 4 + + + EPILOGUE +#endif diff --git a/kernel/power/ztrsm_kernel_hummer_LT.S b/kernel/power/ztrsm_kernel_hummer_LT.S new file mode 100644 index 0000000000..6da6c72ada --- /dev/null +++ b/kernel/power/ztrsm_kernel_hummer_LT.S @@ -0,0 +1,2962 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#undef ZERO + +#define ALPHA 0 +#define FZERO 16 + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#endif + +#define TEMP r11 +#define AORIG r12 +#define KK r14 +#define INCM1 r15 +#define INCM3 r16 +#define INCM5 r17 +#define INCM7 r18 +#define INC2 r19 +#define INC r20 +#define INC4 r21 + +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define AO2 r26 +#define BO2 r27 + +#define CO1 r28 +#define CO2 r29 +#define ZERO r31 + +#ifndef NEEDPARAM + +#define A1 f16 +#define A2 f17 +#define A3 f18 +#define A4 f19 +#define A5 f20 +#define A6 f21 +#define A7 f22 +#define A8 f23 +#define A9 f24 +#define A10 f25 + +#define B1 f26 +#define B2 f27 +#define B3 f28 +#define B4 f29 +#define B5 f30 +#define B6 f31 + +#define AP B6 + +#ifndef CONJ +#define FXCPMADD fxcpmadd +#define FXCSMADD fxcxnpma +#else +#if defined(LN) || defined(LT) +#define FXCPMADD fxcpnsma +#define FXCSMADD fxcxma +#else +#define FXCPMADD fxcpmadd +#define FXCSMADD fxcxnsma +#endif +#endif + +#ifndef CONJ +#define FXCXNPMA fxcxnpma +#define FXCXNSMA fxcxnsma +#else +#define FXCXNPMA fxcxnsma +#define FXCXNSMA fxcxnpma +#endif + + + PROLOGUE + PROFCODE + + li r0, -16 + + stfpdux f14, SP, r0 + stfpdux f15, SP, r0 + stfpdux f16, SP, r0 + stfpdux f17, SP, r0 + stfpdux f18, SP, r0 + stfpdux f19, SP, r0 + stfpdux f20, SP, r0 + stfpdux f21, SP, r0 + stfpdux f22, SP, r0 + stfpdux f23, SP, r0 + stfpdux f24, SP, r0 + stfpdux f25, SP, r0 + stfpdux f26, SP, r0 + stfpdux f27, SP, r0 + stfpdux f28, SP, r0 + stfpdux f29, SP, r0 + stfpdux f30, SP, r0 + stfpdux f31, SP, r0 + + stwu r31, -4(SP) + stwu r30, -4(SP) + stwu r29, -4(SP) + stwu r28, -4(SP) + + stwu r27, -4(SP) + stwu r26, -4(SP) + stwu r25, -4(SP) + stwu r24, -4(SP) + + stwu r23, -4(SP) + stwu r22, -4(SP) + stwu r21, -4(SP) + stwu r20, -4(SP) + + stwu r19, -4(SP) + stwu r18, -4(SP) + stwu r17, -4(SP) + stwu r16, -4(SP) + + stwu r15, -4(SP) + stwu r14, -4(SP) + + li r0, 0 + stwu r0, -4(SP) + stwu r0, -4(SP) + + stfdu f2, -8(SP) + stfdu f1, -8(SP) + + slwi LDC, LDC, ZBASE_SHIFT + + cmpwi cr0, M, 0 + ble .L999 + cmpwi cr0, N, 0 + ble .L999 + cmpwi cr0, K, 0 + ble .L999 + + li INC, 1 * SIZE + li INC2, 2 * SIZE + li INC4, 4 * SIZE + li INCM1, -1 * SIZE + li INCM3, -3 * SIZE + li INCM5, -5 * SIZE + li INCM7, -7 * SIZE + + addi C, C, - 1 * SIZE + +#ifdef LN + mullw r0, M, K + slwi r0, r0, ZBASE_SHIFT + add A, A, r0 + + slwi r0, M, ZBASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + 
slwi r0, r0, ZBASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + srawi. J, N, 1 + ble .L50 + .align 4 + +.L10: +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + addi AORIG, A, -4 * SIZE +#else + addi AO, A, -4 * SIZE +#endif +#ifndef RT + add C, CO2, LDC +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + + srawi. I, M, 2 + ble .L20 + .align 4 + +.L11: +#if defined(LT) || defined(RN) + + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 + + fpmr f5, f0 + fpmr f9, f0 + fpmr f13, f0 + fpmr f2, f0 + + fpmr f6, f0 + fpmr f10, f0 + fpmr f14, f0 + fpmr f3, f0 + + fpmr f7, f0 + fpmr f11, f0 + fpmr f15, f0 + + srawi. r0, KK, 2 + fpmr f1, f0 + mtspr CTR, r0 + ble .L14 +#else + +#ifdef LN + slwi r0, K, 2 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 2 + ZBASE_SHIFT + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + fpmr f5, f0 + fpmr f9, f0 + fpmr f13, f0 + fpmr f2, f0 + + fpmr f6, f0 + fpmr f10, f0 + fpmr f14, f0 + fpmr f3, f0 + + fpmr f7, f0 + fpmr f11, f0 + fpmr f15, f0 + + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, BO, - 4 * SIZE + fpmr f8, f0 + addi BO2, BO, 2 * SIZE + fpmr f12, f0 + + srawi. r0, TEMP, 2 + fpmr f1, f0 + mtspr CTR, r0 + ble .L14 +#endif + + LFPDUX A1, AO, INC4 + fpmr f5, f0 + LFPDUX A3, AO, INC4 + fpmr f9, f0 + LFPDUX B1, BO, INC4 + fpmr f13, f0 + + LFPDUX A5, AO, INC4 + fpmr f2, f0 + LFPDUX A6, AO, INC4 + fpmr f6, f0 + LFPDUX B3, BO, INC4 + fpmr f10, f0 + LFPDUX A7, AO, INC4 + fpmr f14, f0 + + LFPDUX A8, AO, INC4 + fpmr f3, f0 + LFPDUX B5, BO, INC4 + fpmr f7, f0 + LFPDUX A9, AO, INC4 + fpmr f11, f0 + LFPDUX A2, AO2, INC4 + fpmr f15, f0 + LFPDUX B2, BO2, INC4 + bdz- .L13 + .align 4 + +.L12: + +## 1 ## + FXCPMADD f0, B1, A1, f0 + nop + FXCSMADD f4, B1, A1, f4 + nop + FXCPMADD f8, B2, A1, f8 + LFPDUX B4, BO2, INC4 + FXCSMADD f12, B2, A1, f12 + LFPDUX B6, BO, INC4 + + FXCPMADD f1, B1, A2, f1 + nop + FXCSMADD f5, B1, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B2, A2, f9 + LFPDUX A10, AO, INC4 + FXCSMADD f13, B2, A2, f13 + nop + + FXCPMADD f2, B1, A3, f2 + nop + FXCSMADD f6, B1, A3, f6 + nop + FXCPMADD f10, B2, A3, f10 + nop + FXCSMADD f14, B2, A3, f14 + nop + + FXCPMADD f3, B1, A4, f3 + nop + FXCSMADD f7, B1, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B2, A4, f11 + LFPDUX A1, AO, INC4 + FXCSMADD f15, B2, A4, f15 + nop + +## 2 ## + + FXCPMADD f0, B3, A5, f0 + nop + FXCSMADD f4, B3, A5, f4 + nop + FXCPMADD f8, B4, A5, f8 + LFPDUX B2, BO2, INC4 + FXCSMADD f12, B4, A5, f12 + LFPDUX B1, BO, INC4 + + FXCPMADD f1, B3, A2, f1 + nop + FXCSMADD f5, B3, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B4, A2, f9 + LFPDUX A3, AO, INC4 + FXCSMADD f13, B4, A2, f13 + nop + + FXCPMADD f2, B3, A6, f2 + nop + FXCSMADD f6, B3, A6, f6 + nop + FXCPMADD f10, B4, A6, f10 + nop + FXCSMADD f14, B4, A6, f14 + nop + + FXCPMADD f3, B3, A4, f3 + nop + FXCSMADD f7, B3, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B4, A4, f11 + LFPDUX A5, AO, INC4 + FXCSMADD f15, B4, A4, f15 + nop + +## 3 ## + + FXCPMADD f0, B5, A7, f0 + nop + FXCSMADD f4, B5, A7, f4 + nop + FXCPMADD f8, B2, A7, f8 + LFPDUX B4, BO2, INC4 + FXCSMADD f12, B2, A7, f12 + LFPDUX B3, BO, INC4 + + FXCPMADD f1, B5, A2, f1 + nop + FXCSMADD f5, B5, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, 
B2, A2, f9 + LFPDUX A6, AO, INC4 + FXCSMADD f13, B2, A2, f13 + nop + + FXCPMADD f2, B5, A8, f2 + nop + FXCSMADD f6, B5, A8, f6 + nop + FXCPMADD f10, B2, A8, f10 + nop + FXCSMADD f14, B2, A8, f14 + nop + + FXCPMADD f3, B5, A4, f3 + nop + FXCSMADD f7, B5, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B2, A4, f11 + LFPDUX A7, AO, INC4 + FXCSMADD f15, B2, A4, f15 + nop + +## 4 ## + FXCPMADD f0, B6, A9, f0 + nop + FXCSMADD f4, B6, A9, f4 + nop + FXCPMADD f8, B4, A9, f8 + LFPDUX B2, BO2, INC4 + FXCSMADD f12, B4, A9, f12 + LFPDUX B5, BO, INC4 + + FXCPMADD f1, B6, A2, f1 + nop + FXCSMADD f5, B6, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B4, A2, f9 + LFPDUX A8, AO, INC4 + FXCSMADD f13, B4, A2, f13 + nop + + FXCPMADD f2, B6, A10, f2 + nop + FXCSMADD f6, B6, A10, f6 + nop + FXCPMADD f10, B4, A10, f10 + nop + FXCSMADD f14, B4, A10, f14 + nop + + FXCPMADD f3, B6, A4, f3 + LFPDUX A2, AO2, INC4 + FXCSMADD f7, B6, A4, f7 + LFPDUX A9, AO, INC4 + FXCPMADD f11, B4, A4, f11 + nop + FXCSMADD f15, B4, A4, f15 + bdnz+ .L12 + .align 4 + +.L13: +## 1 ## + + FXCPMADD f0, B1, A1, f0 + nop + FXCSMADD f4, B1, A1, f4 + nop + FXCPMADD f8, B2, A1, f8 + LFPDUX B4, BO2, INC4 + FXCSMADD f12, B2, A1, f12 + LFPDUX B6, BO, INC4 + + FXCPMADD f1, B1, A2, f1 + nop + FXCSMADD f5, B1, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B2, A2, f9 + LFPDUX A10, AO, INC4 + FXCSMADD f13, B2, A2, f13 + nop + + FXCPMADD f2, B1, A3, f2 + nop + FXCSMADD f6, B1, A3, f6 + nop + FXCPMADD f10, B2, A3, f10 + nop + FXCSMADD f14, B2, A3, f14 + nop + + FXCPMADD f3, B1, A4, f3 + nop + FXCSMADD f7, B1, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B2, A4, f11 + nop + FXCSMADD f15, B2, A4, f15 + nop + +## 2 ## + + FXCPMADD f0, B3, A5, f0 + nop + FXCSMADD f4, B3, A5, f4 + nop + FXCPMADD f8, B4, A5, f8 + LFPDUX B2, BO2, INC4 + FXCSMADD f12, B4, A5, f12 + nop + + FXCPMADD f1, B3, A2, f1 + nop + FXCSMADD f5, B3, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B4, A2, f9 + nop + FXCSMADD f13, B4, A2, f13 + nop + + FXCPMADD f2, B3, A6, f2 + nop + FXCSMADD f6, B3, A6, f6 + nop + FXCPMADD f10, B4, A6, f10 + nop + FXCSMADD f14, B4, A6, f14 + nop + + FXCPMADD f3, B3, A4, f3 + nop + FXCSMADD f7, B3, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B4, A4, f11 + nop + FXCSMADD f15, B4, A4, f15 + nop + +## 3 ## + + FXCPMADD f0, B5, A7, f0 + nop + FXCSMADD f4, B5, A7, f4 + nop + FXCPMADD f8, B2, A7, f8 + LFPDUX B4, BO2, INC4 + FXCSMADD f12, B2, A7, f12 + nop + + FXCPMADD f1, B5, A2, f1 + nop + FXCSMADD f5, B5, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B2, A2, f9 + nop + FXCSMADD f13, B2, A2, f13 + nop + + FXCPMADD f2, B5, A8, f2 + nop + FXCSMADD f6, B5, A8, f6 + nop + FXCPMADD f10, B2, A8, f10 + nop + FXCSMADD f14, B2, A8, f14 + nop + + FXCPMADD f3, B5, A4, f3 + nop + FXCSMADD f7, B5, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B2, A4, f11 + nop + FXCSMADD f15, B2, A4, f15 + nop + +## 4 ## + + FXCPMADD f0, B6, A9, f0 + nop + FXCSMADD f4, B6, A9, f4 + nop + FXCPMADD f8, B4, A9, f8 + nop + FXCSMADD f12, B4, A9, f12 + nop + + FXCPMADD f1, B6, A2, f1 + nop + FXCSMADD f5, B6, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B4, A2, f9 + nop + FXCSMADD f13, B4, A2, f13 + nop + + FXCPMADD f2, B6, A10, f2 + nop + FXCSMADD f6, B6, A10, f6 + nop + FXCPMADD f10, B4, A10, f10 + nop + FXCSMADD f14, B4, A10, f14 + nop + + FXCPMADD f3, B6, A4, f3 + nop + FXCSMADD f7, B6, A4, f7 + nop + FXCPMADD f11, B4, A4, f11 + nop + FXCSMADD f15, B4, A4, f15 + nop + .align 4 + +.L14: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L18 +#else + andi. 
r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L18 +#endif + +.L15: + LFPDUX A2, AO, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A10, BO, INC4 + LFPDUX B4, BO2, INC4 + bdz- .L17 + .align 4 + +.L16: + FXCPMADD f0, A10, A2, f0 + FXCSMADD f4, A10, A2, f4 + FXCPMADD f8, B4, A2, f8 + FXCSMADD f12, B4, A2, f12 + LFPDUX A2, AO, INC4 + + FXCPMADD f1, A10, A4, f1 + FXCSMADD f5, A10, A4, f5 + FXCPMADD f9, B4, A4, f9 + FXCSMADD f13, B4, A4, f13 + LFPDUX A4, AO2, INC4 + + FXCPMADD f2, A10, A2, f2 + FXCSMADD f6, A10, A2, f6 + FXCPMADD f10, B4, A2, f10 + FXCSMADD f14, B4, A2, f14 + LFPDUX A2, AO, INC4 + + FXCPMADD f3, A10, A4, f3 + FXCSMADD f7, A10, A4, f7 + LFPDUX A10, BO, INC4 + FXCPMADD f11, B4, A4, f11 + FXCSMADD f15, B4, A4, f15 + LFPDUX A4, AO2, INC4 + LFPDUX B4, BO2, INC4 + bdnz+ .L16 + .align 4 + +.L17: + FXCPMADD f0, A10, A2, f0 + FXCSMADD f4, A10, A2, f4 + FXCPMADD f8, B4, A2, f8 + FXCSMADD f12, B4, A2, f12 + LFPDUX A2, AO, INC4 + + FXCPMADD f1, A10, A4, f1 + FXCSMADD f5, A10, A4, f5 + FXCPMADD f9, B4, A4, f9 + FXCSMADD f13, B4, A4, f13 + LFPDUX A4, AO2, INC4 + + FXCPMADD f2, A10, A2, f2 + FXCSMADD f6, A10, A2, f6 + FXCPMADD f10, B4, A2, f10 + FXCSMADD f14, B4, A2, f14 + + FXCPMADD f3, A10, A4, f3 + FXCSMADD f7, A10, A4, f7 + FXCPMADD f11, B4, A4, f11 + FXCSMADD f15, B4, A4, f15 + .align 4 + +.L18: + fpadd f0, f0, f4 + fpadd f8, f8, f12 + fpadd f1, f1, f5 + fpadd f9, f9, f13 + + fpadd f2, f2, f6 + fpadd f10, f10, f14 + fpadd f3, f3, f7 + fpadd f11, f11, f15 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 2 + ZBASE_SHIFT + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi AO2, AO, 2 * SIZE + addi BO, BO, - 4 * SIZE + addi BO2, BO, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDUX f16, BO, INC4 + LFPDUX f20, BO2, INC4 + LFPDUX f17, BO, INC4 + LFPDUX f21, BO2, INC4 + LFPDUX f18, BO, INC4 + LFPDUX f22, BO2, INC4 + LFPDUX f19, BO, INC4 + LFPDUX f23, BO2, INC4 + + subi BO, BO, 16 * SIZE + subi BO2, BO2, 16 * SIZE +#else + LFPDUX f16, AO, INC4 + LFPDUX f17, AO2, INC4 + LFPDUX f18, AO, INC4 + LFPDUX f19, AO2, INC4 + LFPDUX f20, AO, INC4 + LFPDUX f21, AO2, INC4 + LFPDUX f22, AO, INC4 + LFPDUX f23, AO2, INC4 + + subi AO, AO, 16 * SIZE + subi AO2, AO2, 16 * SIZE +#endif + + fpsub f0, f16, f0 + fpsub f1, f17, f1 + fpsub f2, f18, f2 + fpsub f3, f19, f3 + + fpsub f8, f20, f8 + fpsub f9, f21, f9 + fpsub f10, f22, f10 + fpsub f11, f23, f11 + +#ifdef LN + LFPDUX A1, AO, INC4 + add AO2, AO2, INC4 + add AO, AO, INC4 + add AO2, AO2, INC4 + + LFPDUX A2, AO, INC4 + LFPDUX A3, AO2, INC4 + add AO, AO, INC4 + add AO2, AO2, INC4 + + LFPDUX A4, AO, INC4 + LFPDUX A5, AO2, INC4 + LFPDUX A6, AO, INC4 + add AO2, AO2, INC4 + + LFPDUX A7, AO, INC4 + LFPDUX A8, AO2, INC4 + LFPDUX A9, AO, INC4 + LFPDUX A10, AO2, INC4 + + subi AO, AO, 32 * SIZE + subi AO2, AO2, 32 * SIZE + + fxpmul f4, A10, f3 + fxpmul f5, A10, f11 + FXCXNPMA f3, A10, f3, f4 + FXCXNPMA f11, A10, f11, f5 + + fxcpnmsub f2, A9, f3, f2 + fxcpnmsub f10, A9, f11, f10 + FXCXNSMA f2, A9, f3, f2 + FXCXNSMA f10, A9, f11, f10 + + fxcpnmsub f1, A8, f3, f1 + fxcpnmsub f9, A8, f11, f9 + FXCXNSMA f1, A8, f3, f1 + FXCXNSMA f9, A8, f11, f9 + + fxcpnmsub f0, A7, f3, f0 + fxcpnmsub f8, A7, f11, f8 + FXCXNSMA f0, A7, f3, f0 + FXCXNSMA f8, A7, f11, f8 + + fxpmul f4, A6, f2 + fxpmul f5, A6, f10 + FXCXNPMA f2, A6, f2, f4 + FXCXNPMA f10, A6, f10, f5 + + fxcpnmsub f1, A5, f2, f1 + fxcpnmsub f9, A5, f10, f9 + FXCXNSMA f1, A5, f2, f1 + FXCXNSMA f9, A5, f10, f9 + + fxcpnmsub f0, A4, f2, f0 + fxcpnmsub f8, A4, 
f10, f8 + FXCXNSMA f0, A4, f2, f0 + FXCXNSMA f8, A4, f10, f8 + + fxpmul f4, A3, f1 + fxpmul f5, A3, f9 + FXCXNPMA f1, A3, f1, f4 + FXCXNPMA f9, A3, f9, f5 + + fxcpnmsub f0, A2, f1, f0 + fxcpnmsub f8, A2, f9, f8 + FXCXNSMA f0, A2, f1, f0 + FXCXNSMA f8, A2, f9, f8 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f8 + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f8, A1, f8, f5 +#endif + +#ifdef LT + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX A3, AO, INC4 + LFPDUX A4, AO2, INC4 + + add AO, AO, INC4 + LFPDUX A5, AO2, INC4 + LFPDUX A6, AO, INC4 + LFPDUX A7, AO2, INC4 + + add AO, AO, INC4 + add AO2, AO2, INC4 + LFPDUX A8, AO, INC4 + LFPDUX A9, AO2, INC4 + + add AO, AO, INC4 + add AO2, AO2, INC4 + add AO, AO, INC4 + LFPDUX A10, AO2, INC4 + + subi AO, AO, 32 * SIZE + subi AO2, AO2, 32 * SIZE + + fxpmul f4, A1, f0 + fxpmul f5, A1, f8 + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f8, A1, f8, f5 + + fxcpnmsub f1, A2, f0, f1 + fxcpnmsub f9, A2, f8, f9 + FXCXNSMA f1, A2, f0, f1 + FXCXNSMA f9, A2, f8, f9 + + fxcpnmsub f2, A3, f0, f2 + fxcpnmsub f10, A3, f8, f10 + FXCXNSMA f2, A3, f0, f2 + FXCXNSMA f10, A3, f8, f10 + + fxcpnmsub f3, A4, f0, f3 + fxcpnmsub f11, A4, f8, f11 + FXCXNSMA f3, A4, f0, f3 + FXCXNSMA f11, A4, f8, f11 + + fxpmul f6, A5, f1 + fxpmul f7, A5, f9 + FXCXNPMA f1, A5, f1, f6 + FXCXNPMA f9, A5, f9, f7 + + fxcpnmsub f2, A6, f1, f2 + fxcpnmsub f10, A6, f9, f10 + FXCXNSMA f2, A6, f1, f2 + FXCXNSMA f10, A6, f9, f10 + + fxcpnmsub f3, A7, f1, f3 + fxcpnmsub f11, A7, f9, f11 + FXCXNSMA f3, A7, f1, f3 + FXCXNSMA f11, A7, f9, f11 + + fxpmul f4, A8, f2 + fxpmul f5, A8, f10 + FXCXNPMA f2, A8, f2, f4 + FXCXNPMA f10, A8, f10, f5 + + fxcpnmsub f3, A9, f2, f3 + fxcpnmsub f11, A9, f10, f11 + FXCXNSMA f3, A9, f2, f3 + FXCXNSMA f11, A9, f10, f11 + + fxpmul f6, A10, f3 + fxpmul f7, A10, f11 + FXCXNPMA f3, A10, f3, f6 + FXCXNPMA f11, A10, f11, f7 +#endif + +#ifdef RN + LFPDUX A1, BO, INC4 + LFPDUX A2, BO2, INC4 + add BO, BO, INC4 + LFPDUX A3, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE + + fxpmul f4, A1, f0 + fxpmul f5, A1, f1 + fxpmul f6, A1, f2 + fxpmul f7, A1, f3 + + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f1, A1, f1, f5 + FXCXNPMA f2, A1, f2, f6 + FXCXNPMA f3, A1, f3, f7 + + fxcpnmsub f8, A2, f0, f8 + fxcpnmsub f9, A2, f1, f9 + fxcpnmsub f10, A2, f2, f10 + fxcpnmsub f11, A2, f3, f11 + + FXCXNSMA f8, A2, f0, f8 + FXCXNSMA f9, A2, f1, f9 + FXCXNSMA f10, A2, f2, f10 + FXCXNSMA f11, A2, f3, f11 + + fxpmul f4, A3, f8 + fxpmul f5, A3, f9 + fxpmul f6, A3, f10 + fxpmul f7, A3, f11 + + FXCXNPMA f8, A3, f8, f4 + FXCXNPMA f9, A3, f9, f5 + FXCXNPMA f10, A3, f10, f6 + FXCXNPMA f11, A3, f11, f7 +#endif + +#ifdef RT + LFPDUX A1, BO, INC4 + add BO2, BO2, INC4 + LFPDUX A2, BO, INC4 + LFPDUX A3, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE + + fxpmul f4, A3, f8 + fxpmul f5, A3, f9 + fxpmul f6, A3, f10 + fxpmul f7, A3, f11 + + FXCXNPMA f8, A3, f8, f4 + FXCXNPMA f9, A3, f9, f5 + FXCXNPMA f10, A3, f10, f6 + FXCXNPMA f11, A3, f11, f7 + + fxcpnmsub f0, A2, f8, f0 + fxcpnmsub f1, A2, f9, f1 + fxcpnmsub f2, A2, f10, f2 + fxcpnmsub f3, A2, f11, f3 + + FXCXNSMA f0, A2, f8, f0 + FXCXNSMA f1, A2, f9, f1 + FXCXNSMA f2, A2, f10, f2 + FXCXNSMA f3, A2, f11, f3 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f1 + fxpmul f6, A1, f2 + fxpmul f7, A1, f3 + + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f1, A1, f1, f5 + FXCXNPMA f2, A1, f2, f6 + FXCXNPMA f3, A1, f3, f7 +#endif + +#ifdef LN + subi CO1, CO1, 8 * SIZE + subi CO2, CO2, 8 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC4 + STFPDUX f8, BO2, INC4 + STFPDUX f1, BO, INC4 + 
STFPDUX f9, BO2, INC4 + STFPDUX f2, BO, INC4 + STFPDUX f10, BO2, INC4 + STFPDUX f3, BO, INC4 + STFPDUX f11, BO2, INC4 + + subi BO, BO, 16 * SIZE + subi BO2, BO2, 16 * SIZE +#else + STFPDUX f0, AO, INC4 + STFPDUX f1, AO2, INC4 + STFPDUX f2, AO, INC4 + STFPDUX f3, AO2, INC4 + STFPDUX f8, AO, INC4 + STFPDUX f9, AO2, INC4 + STFPDUX f10, AO, INC4 + STFPDUX f11, AO2, INC4 + + subi AO, AO, 16 * SIZE + subi AO2, AO2, 16 * SIZE +#endif + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + STFDUX f2, CO1, INC + STFSDUX f2, CO1, INC + STFDUX f3, CO1, INC + STFSDUX f3, CO1, INC + + STFDUX f8, CO2, INC + STFSDUX f8, CO2, INC + STFDUX f9, CO2, INC + STFSDUX f9, CO2, INC + STFDUX f10, CO2, INC + STFSDUX f10, CO2, INC + STFDUX f11, CO2, INC + STFSDUX f11, CO2, INC + +#ifdef LN + subi CO1, CO1, 8 * SIZE + subi CO2, CO2, 8 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + + addic. I, I, -1 + li r0, FZERO + + lfpsx f0, SP, r0 + bgt+ .L11 + .align 4 + +.L20: + andi. I, M, 2 + beq .L30 + +#if defined(LT) || defined(RN) + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 + + srawi. r0, KK, 2 + fpmr f1, f0 + fpmr f5, f0 + fpmr f9, f0 + mtspr CTR, r0 + fpmr f13, f0 + ble .L24 +#else +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 + + sub TEMP, K, KK + + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, BO, - 4 * SIZE + fpmr f8, f0 + addi BO2, BO, 2 * SIZE + fpmr f12, f0 + + fpmr f1, f0 + fpmr f5, f0 + fpmr f9, f0 + fpmr f13, f0 + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 + ble .L24 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B2, BO2, INC4 + LFPDUX A3, AO, INC4 + LFPDUX B3, BO, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX B4, BO2, INC4 + + LFPDUX A5, AO, INC4 + LFPDUX B5, BO, INC4 + LFPDUX A6, AO2, INC4 + LFPDUX B6, BO2, INC4 + LFPDUX A7, AO, INC4 + LFPDUX A9, BO, INC4 + LFPDUX A10, BO2, INC4 + bdz- .L23 + .align 4 + +.L22: + FXCPMADD f0, B1, A1, f0 + nop + FXCSMADD f4, B1, A1, f4 + LFPDUX A8, AO2, INC4 + FXCPMADD f8, B2, A1, f8 + nop + FXCSMADD f12, B2, A1, f12 + LFPDUX A1, AO, INC4 + + FXCPMADD f1, B1, A2, f1 + nop + FXCSMADD f5, B1, A2, f5 + LFPDUX B1, BO, INC4 + FXCPMADD f9, B2, A2, f9 + nop + FXCSMADD f13, B2, A2, f13 + LFPDUX B2, BO2, INC4 + + FXCPMADD f0, B3, A3, f0 + nop + FXCSMADD f4, B3, A3, f4 + LFPDUX A2, AO2, INC4 + FXCPMADD f8, B4, A3, f8 + nop + FXCSMADD f12, B4, A3, f12 + LFPDUX A3, AO, INC4 + + FXCPMADD f1, B3, A4, f1 + nop + FXCSMADD f5, B3, A4, f5 + LFPDUX B3, BO, INC4 + FXCPMADD f9, B4, A4, f9 + nop + FXCSMADD f13, B4, A4, f13 + LFPDUX B4, BO2, INC4 + + FXCPMADD f0, B5, A5, f0 + nop + FXCSMADD f4, B5, A5, f4 + LFPDUX A4, AO2, INC4 + FXCPMADD f8, B6, A5, f8 + nop + FXCSMADD f12, B6, A5, f12 + LFPDUX A5, AO, INC4 + + FXCPMADD f1, B5, A6, f1 + nop + FXCSMADD f5, B5, A6, f5 + LFPDUX B5, BO, INC4 + FXCPMADD f9, B6, A6, f9 + nop + FXCSMADD f13, B6, A6, f13 + LFPDUX B6, BO2, INC4 + + FXCPMADD f0, A9, A7, f0 + nop + FXCSMADD f4, A9, A7, f4 + LFPDUX A6, AO2, INC4 + FXCPMADD f8, A10, A7, f8 + nop + FXCSMADD f12, A10, A7, f12 + LFPDUX A7, AO, INC4 + + FXCPMADD f1, A9, A8, f1 + nop + FXCSMADD f5, A9, A8, f5 + LFPDUX A9, BO, INC4 + FXCPMADD f9, A10, A8, f9 + nop + FXCSMADD f13, A10, A8, f13 + LFPDUX A10, BO2, INC4 + bdnz+ .L22 + .align 4 + +.L23: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f4, B1, A1, f4 + LFPDUX A8, AO2, INC4 + FXCPMADD f8, B2, A1, f8 + FXCSMADD f12, B2, A1, f12 + + FXCPMADD f1, B1, A2, f1 + FXCSMADD f5, B1, A2, f5 + FXCPMADD f9, B2, A2, f9 + FXCSMADD f13, B2, A2, f13 + + FXCPMADD f0, B3, A3, f0 + FXCSMADD f4, B3, A3, f4 + FXCPMADD f8, B4, A3, f8 + FXCSMADD f12, B4, A3, f12 + + FXCPMADD f1, B3, A4, f1 + FXCSMADD f5, B3, A4, f5 + FXCPMADD f9, B4, A4, f9 + FXCSMADD f13, B4, A4, f13 + + FXCPMADD f0, B5, A5, f0 + FXCSMADD f4, B5, A5, f4 + FXCPMADD f8, B6, A5, f8 + FXCSMADD f12, B6, A5, f12 + + FXCPMADD f1, B5, A6, f1 + FXCSMADD f5, B5, A6, f5 + FXCPMADD f9, B6, A6, f9 + FXCSMADD f13, B6, A6, f13 + + FXCPMADD f0, A9, A7, f0 + FXCSMADD f4, A9, A7, f4 + FXCPMADD f8, A10, A7, f8 + FXCSMADD f12, A10, A7, f12 + + FXCPMADD f1, A9, A8, f1 + FXCSMADD f5, A9, A8, f5 + FXCPMADD f9, A10, A8, f9 + FXCSMADD f13, A10, A8, f13 + .align 4 + +.L24: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L28 +#else + andi. 
r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L28 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + bdz- .L27 + .align 4 + +.L26: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f4, B1, A1, f4 + FXCPMADD f8, B2, A1, f8 + FXCSMADD f12, B2, A1, f12 + LFPDUX A1, AO, INC4 + + FXCPMADD f1, B1, A2, f1 + FXCSMADD f5, B1, A2, f5 + LFPDUX B1, BO, INC4 + FXCPMADD f9, B2, A2, f9 + FXCSMADD f13, B2, A2, f13 + LFPDUX A2, AO2, INC4 + LFPDUX B2, BO2, INC4 + bdnz+ .L26 + .align 4 + +.L27: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f4, B1, A1, f4 + FXCPMADD f8, B2, A1, f8 + FXCSMADD f12, B2, A1, f12 + + FXCPMADD f1, B1, A2, f1 + FXCSMADD f5, B1, A2, f5 + FXCPMADD f9, B2, A2, f9 + FXCSMADD f13, B2, A2, f13 + .align 4 + +.L28: + fpadd f0, f0, f4 + fpadd f8, f8, f12 + fpadd f1, f1, f5 + fpadd f9, f9, f13 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 2 +#endif + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 + addi AO2, AO, 2 * SIZE + addi BO, BO, - 4 * SIZE + addi BO2, BO, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDUX f16, BO, INC4 + LFPDUX f18, BO2, INC4 + LFPDUX f17, BO, INC4 + LFPDUX f19, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE +#else + LFPDUX f16, AO, INC4 + LFPDUX f17, AO2, INC4 + LFPDUX f18, AO, INC4 + LFPDUX f19, AO2, INC4 + + subi AO, AO, 8 * SIZE + subi AO2, AO2, 8 * SIZE +#endif + + fpsub f0, f16, f0 + fpsub f1, f17, f1 + fpsub f8, f18, f8 + fpsub f9, f19, f9 + +#ifdef LN + LFPDUX A1, AO, INC4 + add AO2, AO2, INC4 + LFPDUX A2, AO, INC4 + LFPDUX A3, AO2, INC4 + + subi AO, AO, 8 * SIZE + subi AO2, AO2, 8 * SIZE + + fxpmul f4, A3, f1 + fxpmul f5, A3, f9 + FXCXNPMA f1, A3, f1, f4 + FXCXNPMA f9, A3, f9, f5 + + fxcpnmsub f0, A2, f1, f0 + fxcpnmsub f8, A2, f9, f8 + FXCXNSMA f0, A2, f1, f0 + FXCXNSMA f8, A2, f9, f8 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f8 + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f8, A1, f8, f5 +#endif + +#ifdef LT + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + add AO, AO, INC4 + LFPDUX A3, AO2, INC4 + + subi AO, AO, 8 * SIZE + subi AO2, AO2, 8 * SIZE + + fxpmul f4, A1, f0 + fxpmul f5, A1, f8 + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f8, A1, f8, f5 + + fxcpnmsub f1, A2, f0, f1 + fxcpnmsub f9, A2, f8, f9 + FXCXNSMA f1, A2, f0, f1 + FXCXNSMA f9, A2, f8, f9 + + fxpmul f6, A3, f1 + fxpmul f7, A3, f9 + FXCXNPMA f1, A3, f1, f6 + FXCXNPMA f9, A3, f9, f7 +#endif + +#ifdef RN + LFPDUX A1, BO, INC4 + LFPDUX A2, BO2, INC4 + add BO, BO, INC4 + LFPDUX A3, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE + + fxpmul f4, A1, f0 + fxpmul f5, A1, f1 + + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f1, A1, f1, f5 + + fxcpnmsub f8, A2, f0, f8 + fxcpnmsub f9, A2, f1, f9 + + FXCXNSMA f8, A2, f0, f8 + FXCXNSMA f9, A2, f1, f9 + + fxpmul f4, A3, f8 + fxpmul f5, A3, f9 + + FXCXNPMA f8, A3, f8, f4 + FXCXNPMA f9, A3, f9, f5 +#endif + +#ifdef RT + LFPDUX A1, BO, INC4 + add BO2, BO2, INC4 + LFPDUX A2, BO, INC4 + LFPDUX A3, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE + + fxpmul f4, A3, f8 + fxpmul f5, A3, f9 + + FXCXNPMA f8, A3, f8, f4 + FXCXNPMA f9, A3, f9, f5 + + fxcpnmsub f0, A2, f8, f0 + fxcpnmsub f1, A2, f9, f1 + + FXCXNSMA f0, A2, f8, f0 + FXCXNSMA f1, A2, f9, f1 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f1 + + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f1, A1, f1, f5 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC4 + STFPDUX f8, BO2, INC4 + STFPDUX f1, BO, INC4 + STFPDUX f9, BO2, INC4 + + subi BO, BO, 8 * SIZE 
+ subi BO2, BO2, 8 * SIZE +#else + STFPDUX f0, AO, INC4 + STFPDUX f1, AO2, INC4 + STFPDUX f8, AO, INC4 + STFPDUX f9, AO2, INC4 + + subi AO, AO, 8 * SIZE + subi AO2, AO2, 8 * SIZE +#endif + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + + STFDUX f8, CO2, INC + STFSDUX f8, CO2, INC + STFDUX f9, CO2, INC + STFSDUX f9, CO2, INC + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, r0 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L30: + andi. I, M, 1 + beq .L49 + +#if defined(LT) || defined(RN) + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, B, - 4 * SIZE + fpmr f2, f0 + addi BO2, B, - 2 * SIZE + fpmr f3, f0 + + srawi. r0, KK, 2 + mtspr CTR, r0 + ble .L34 +#else +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, BO, - 4 * SIZE + fpmr f2, f0 + addi BO2, BO, 2 * SIZE + fpmr f3, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 + ble .L34 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B3, BO, INC4 + LFPDUX B4, BO2, INC4 + + LFPDUX A3, AO, INC4 + LFPDUX A5, BO, INC4 + LFPDUX A6, BO2, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A7, BO, INC4 + LFPDUX A8, BO2, INC4 + bdz- .L33 + .align 4 + +.L32: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + LFPDUX B1, BO, INC4 + FXCPMADD f2, B2, A1, f2 + FXCSMADD f3, B2, A1, f3 + LFPDUX B2, BO2, INC4 + LFPDUX A1, AO, INC4 + + FXCPMADD f0, B3, A2, f0 + FXCSMADD f1, B3, A2, f1 + LFPDUX B3, BO, INC4 + FXCPMADD f2, B4, A2, f2 + FXCSMADD f3, B4, A2, f3 + LFPDUX B4, BO2, INC4 + LFPDUX A2, AO2, INC4 + + FXCPMADD f0, A5, A3, f0 + FXCSMADD f1, A5, A3, f1 + LFPDUX A5, BO, INC4 + FXCPMADD f2, A6, A3, f2 + FXCSMADD f3, A6, A3, f3 + LFPDUX A6, BO2, INC4 + LFPDUX A3, AO, INC4 + + FXCPMADD f0, A7, A4, f0 + FXCSMADD f1, A7, A4, f1 + LFPDUX A7, BO, INC4 + FXCPMADD f2, A8, A4, f2 + FXCSMADD f3, A8, A4, f3 + LFPDUX A8, BO2, INC4 + LFPDUX A4, AO2, INC4 + bdnz+ .L32 + .align 4 + +.L33: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + FXCPMADD f2, B2, A1, f2 + FXCSMADD f3, B2, A1, f3 + + FXCPMADD f0, B3, A2, f0 + FXCSMADD f1, B3, A2, f1 + FXCPMADD f2, B4, A2, f2 + FXCSMADD f3, B4, A2, f3 + + FXCPMADD f0, A5, A3, f0 + FXCSMADD f1, A5, A3, f1 + FXCPMADD f2, A6, A3, f2 + FXCSMADD f3, A6, A3, f3 + + FXCPMADD f0, A7, A4, f0 + FXCSMADD f1, A7, A4, f1 + FXCPMADD f2, A8, A4, f2 + FXCSMADD f3, A8, A4, f3 + .align 4 + +.L34: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L38 +#else + andi. 
r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L38 +#endif + + LFPDX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC2 + bdz- .L37 + .align 4 + +.L36: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + LFPDUX B1, BO, INC4 + FXCPMADD f2, B2, A1, f2 + FXCSMADD f3, B2, A1, f3 + LFPDX A1, AO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC2 + bdnz+ .L36 + .align 4 + +.L37: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + FXCPMADD f2, B2, A1, f2 + FXCSMADD f3, B2, A1, f3 + .align 4 + +.L38: + fpadd f0, f0, f1 + fpadd f2, f2, f3 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + ZBASE_SHIFT + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 4 * SIZE +#endif + + addi AO2, AO, 2 * SIZE + addi BO2, BO, 2 * SIZE + +#if defined(LN) || defined(LT) + LFPDX f16, BO, INC4 + LFPDX f17, BO2, INC4 +#else + LFPDX f16, AO, INC4 + LFPDX f17, AO2, INC4 +#endif + + fpsub f0, f16, f0 + fpsub f2, f17, f2 + +#ifdef LN + LFPDX A1, AO, INC4 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f2 + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f2, A1, f2, f5 +#endif + +#ifdef LT + LFPDX A1, AO, INC4 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f2 + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f2, A1, f2, f5 +#endif + +#ifdef RN + LFPDUX A1, BO, INC4 + LFPDUX A2, BO2, INC4 + add BO, BO, INC4 + LFPDUX A3, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 + + fxcpnmsub f2, A2, f0, f2 + FXCXNSMA f2, A2, f0, f2 + + fxpmul f4, A3, f2 + FXCXNPMA f2, A3, f2, f4 +#endif + +#ifdef RT + LFPDUX A1, BO, INC4 + add BO2, BO2, INC4 + LFPDUX A2, BO, INC4 + LFPDUX A3, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE + + fxpmul f4, A3, f2 + FXCXNPMA f2, A3, f2, f4 + + fxcpnmsub f0, A2, f2, f0 + FXCXNSMA f0, A2, f2, f0 + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDX f0, BO, INC4 + STFPDX f2, BO2, INC4 +#else + STFPDX f0, AO, INC4 + STFPDX f2, AO2, INC4 +#endif + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f2, CO2, INC + STFSDUX f2, CO2, INC + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L49: +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + addi B, BO, 4 * SIZE +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + + addic. J, J, -1 + bgt+ .L10 + .align 4 + +.L50: + andi. J, N, 1 + beq .L999 + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + addi AORIG, A, -2 * SIZE +#else + addi AO, A, -2 * SIZE +#endif +#ifndef RT + add C, CO2, LDC +#endif + li r0, FZERO + lfpsx f0, SP, r0 + + srawi. I, M, 2 + ble .L60 + .align 4 + +.L51: +#if defined(LT) || defined(RN) + fpmr f4, f0 + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f5, f0 + fpmr f2, f0 + fpmr f6, f0 + fpmr f3, f0 + fpmr f7, f0 + srawi. 
r0, KK, 2 + mtspr CTR, r0 + ble .L54 +#else + +#ifdef LN + slwi r0, K, 2 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 2 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + fpmr f4, f0 + addi BO, BO, - 2 * SIZE + fpmr f1, f0 + fpmr f5, f0 + fpmr f2, f0 + fpmr f6, f0 + fpmr f3, f0 + fpmr f7, f0 + srawi. r0, TEMP, 2 + mtspr CTR, r0 + ble .L54 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX B3, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + bdz- .L53 + .align 4 + +.L52: + FXCPMADD f0, B1, A1, f0 + LFPDUX B4, BO, INC2 + FXCSMADD f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B1, A2, f1 + nop + FXCSMADD f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + FXCPMADD f2, B1, A3, f2 + nop + FXCSMADD f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + FXCPMADD f3, B1, A4, f3 + nop + FXCSMADD f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + + FXCPMADD f0, B2, A5, f0 + LFPDUX B1, BO, INC2 + FXCSMADD f4, B2, A5, f4 + LFPDUX A5, AO, INC2 + FXCPMADD f1, B2, A6, f1 + nop + FXCSMADD f5, B2, A6, f5 + LFPDUX A6, AO, INC2 + + FXCPMADD f2, B2, A7, f2 + nop + FXCSMADD f6, B2, A7, f6 + LFPDUX A7, AO, INC2 + FXCPMADD f3, B2, A8, f3 + nop + FXCSMADD f7, B2, A8, f7 + LFPDUX A8, AO, INC2 + + FXCPMADD f0, B3, A1, f0 + LFPDUX B2, BO, INC2 + FXCSMADD f4, B3, A1, f4 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B3, A2, f1 + nop + FXCSMADD f5, B3, A2, f5 + LFPDUX A2, AO, INC2 + + FXCPMADD f2, B3, A3, f2 + nop + FXCSMADD f6, B3, A3, f6 + LFPDUX A3, AO, INC2 + FXCPMADD f3, B3, A4, f3 + nop + FXCSMADD f7, B3, A4, f7 + LFPDUX A4, AO, INC2 + + FXCPMADD f0, B4, A5, f0 + LFPDUX B3, BO, INC2 + FXCSMADD f4, B4, A5, f4 + LFPDUX A5, AO, INC2 + FXCPMADD f1, B4, A6, f1 + nop + FXCSMADD f5, B4, A6, f5 + LFPDUX A6, AO, INC2 + + FXCPMADD f2, B4, A7, f2 + nop + FXCSMADD f6, B4, A7, f6 + LFPDUX A7, AO, INC2 + FXCPMADD f3, B4, A8, f3 + nop + FXCSMADD f7, B4, A8, f7 + LFPDUX A8, AO, INC2 + bdnz+ .L52 + .align 4 + +.L53: + FXCPMADD f0, B1, A1, f0 + LFPDUX B4, BO, INC2 + FXCSMADD f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B1, A2, f1 + nop + FXCSMADD f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + FXCPMADD f2, B1, A3, f2 + nop + FXCSMADD f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + FXCPMADD f3, B1, A4, f3 + nop + FXCSMADD f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + + FXCPMADD f0, B2, A5, f0 + nop + FXCSMADD f4, B2, A5, f4 + LFPDUX A5, AO, INC2 + FXCPMADD f1, B2, A6, f1 + nop + FXCSMADD f5, B2, A6, f5 + LFPDUX A6, AO, INC2 + + FXCPMADD f2, B2, A7, f2 + nop + FXCSMADD f6, B2, A7, f6 + LFPDUX A7, AO, INC2 + FXCPMADD f3, B2, A8, f3 + nop + FXCSMADD f7, B2, A8, f7 + LFPDUX A8, AO, INC2 + + FXCPMADD f0, B3, A1, f0 + FXCSMADD f4, B3, A1, f4 + FXCPMADD f1, B3, A2, f1 + FXCSMADD f5, B3, A2, f5 + + FXCPMADD f2, B3, A3, f2 + FXCSMADD f6, B3, A3, f6 + FXCPMADD f3, B3, A4, f3 + FXCSMADD f7, B3, A4, f7 + + FXCPMADD f0, B4, A5, f0 + FXCSMADD f4, B4, A5, f4 + FXCPMADD f1, B4, A6, f1 + FXCSMADD f5, B4, A6, f5 + + FXCPMADD f2, B4, A7, f2 + FXCSMADD f6, B4, A7, f6 + FXCPMADD f3, B4, A8, f3 + FXCSMADD f7, B4, A8, f7 + .align 4 + +.L54: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L58 +#else + andi. 
r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L58 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + bdz- .L57 + .align 4 + +.L56: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + FXCPMADD f2, B1, A3, f2 + FXCSMADD f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + FXCPMADD f3, B1, A4, f3 + FXCSMADD f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + LFPDUX B1, BO, INC2 + bdnz+ .L56 + .align 4 + +.L57: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f4, B1, A1, f4 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f5, B1, A2, f5 + + FXCPMADD f2, B1, A3, f2 + FXCSMADD f6, B1, A3, f6 + FXCPMADD f3, B1, A4, f3 + FXCSMADD f7, B1, A4, f7 + .align 4 + +.L58: + fpadd f0, f0, f4 + fpadd f1, f1, f5 + fpadd f2, f2, f6 + fpadd f3, f3, f7 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 2 + ZBASE_SHIFT + slwi r0, r0, 0 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDUX f16, BO, INC2 + LFPDUX f17, BO, INC2 + LFPDUX f18, BO, INC2 + LFPDUX f19, BO, INC2 + + subi BO, BO, 8 * SIZE +#else + LFPDUX f16, AO, INC2 + LFPDUX f17, AO, INC2 + LFPDUX f18, AO, INC2 + LFPDUX f19, AO, INC2 + + subi AO, AO, 8 * SIZE +#endif + + fpsub f0, f16, f0 + fpsub f1, f17, f1 + fpsub f2, f18, f2 + fpsub f3, f19, f3 + +#ifdef LN + LFPDUX A1, AO, INC2 + add AO, AO, INC2 + add AO, AO, INC2 + add AO, AO, INC2 + + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + add AO, AO, INC2 + add AO, AO, INC2 + + LFPDUX A4, AO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + add AO, AO, INC2 + + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + LFPDUX A9, AO, INC2 + LFPDUX A10, AO, INC2 + + subi AO, AO, 32 * SIZE + + fxpmul f4, A10, f3 + FXCXNPMA f3, A10, f3, f4 + + fxcpnmsub f2, A9, f3, f2 + FXCXNSMA f2, A9, f3, f2 + + fxcpnmsub f1, A8, f3, f1 + FXCXNSMA f1, A8, f3, f1 + + fxcpnmsub f0, A7, f3, f0 + FXCXNSMA f0, A7, f3, f0 + + fxpmul f4, A6, f2 + FXCXNPMA f2, A6, f2, f4 + + fxcpnmsub f1, A5, f2, f1 + FXCXNSMA f1, A5, f2, f1 + + fxcpnmsub f0, A4, f2, f0 + FXCXNSMA f0, A4, f2, f0 + + fxpmul f4, A3, f1 + FXCXNPMA f1, A3, f1, f4 + + fxcpnmsub f0, A2, f1, f0 + FXCXNSMA f0, A2, f1, f0 + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 +#endif + +#ifdef LT + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + add AO, AO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX A7, AO, INC2 + + add AO, AO, INC2 + add AO, AO, INC2 + LFPDUX A8, AO, INC2 + LFPDUX A9, AO, INC2 + + add AO, AO, INC2 + add AO, AO, INC2 + add AO, AO, INC2 + LFPDUX A10, AO, INC2 + + subi AO, AO, 32 * SIZE + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 + + fxcpnmsub f1, A2, f0, f1 + FXCXNSMA f1, A2, f0, f1 + + fxcpnmsub f2, A3, f0, f2 + FXCXNSMA f2, A3, f0, f2 + + fxcpnmsub f3, A4, f0, f3 + FXCXNSMA f3, A4, f0, f3 + + fxpmul f6, A5, f1 + FXCXNPMA f1, A5, f1, f6 + + fxcpnmsub f2, A6, f1, f2 + FXCXNSMA f2, A6, f1, f2 + + fxcpnmsub f3, A7, f1, f3 + FXCXNSMA f3, A7, f1, f3 + + fxpmul f4, A8, f2 + FXCXNPMA f2, A8, f2, f4 + + fxcpnmsub f3, A9, f2, f3 + FXCXNSMA f3, A9, f2, f3 + + fxpmul f6, A10, f3 + FXCXNPMA f3, A10, f3, f6 +#endif + +#ifdef RN + LFPDX A1, BO, INC2 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f1 + fxpmul f6, A1, f2 + fxpmul f7, A1, f3 + + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f1, A1, f1, f5 + FXCXNPMA f2, A1, f2, f6 + FXCXNPMA f3, A1, f3, f7 +#endif + +#ifdef RT + LFPDX A1, BO, INC2 + + fxpmul f4, 
A1, f0 + fxpmul f5, A1, f1 + fxpmul f6, A1, f2 + fxpmul f7, A1, f3 + + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f1, A1, f1, f5 + FXCXNPMA f2, A1, f2, f6 + FXCXNPMA f3, A1, f3, f7 +#endif + +#ifdef LN + subi CO1, CO1, 8 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC2 + STFPDUX f1, BO, INC2 + STFPDUX f2, BO, INC2 + STFPDUX f3, BO, INC2 + + subi BO, BO, 8 * SIZE +#else + STFPDUX f0, AO, INC2 + STFPDUX f1, AO, INC2 + STFPDUX f2, AO, INC2 + STFPDUX f3, AO, INC2 + + subi AO, AO, 8 * SIZE +#endif + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + STFDUX f2, CO1, INC + STFSDUX f2, CO1, INC + STFDUX f3, CO1, INC + STFSDUX f3, CO1, INC + +#ifdef LN + subi CO1, CO1, 8 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + + addic. I, I, -1 + li r0, FZERO + + lfpsx f0, SP, r0 + bgt+ .L51 + .align 4 + +.L60: + andi. I, M, 2 + beq .L70 + +#if defined(LT) || defined(RN) + fpmr f1, f0 + addi BO, B, - 2 * SIZE + fpmr f2, f0 + fpmr f3, f0 + srawi. r0, KK, 2 + mtspr CTR, r0 + ble .L64 +#else +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 1 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + fpmr f1, f0 + addi BO, BO, - 2 * SIZE + fpmr f2, f0 + fpmr f3, f0 + srawi. r0, TEMP, 2 + mtspr CTR, r0 + ble .L64 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX B3, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX B4, BO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + bdz- .L63 + .align 4 + +.L62: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f2, B1, A1, f2 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f3, B1, A2, f3 + LFPDUX A2, AO, INC2 + LFPDUX B1, BO, INC2 + + FXCPMADD f0, B2, A3, f0 + FXCSMADD f2, B2, A3, f2 + LFPDUX A3, AO, INC2 + FXCPMADD f1, B2, A4, f1 + FXCSMADD f3, B2, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + + FXCPMADD f0, B3, A5, f0 + FXCSMADD f2, B3, A5, f2 + LFPDUX A5, AO, INC2 + FXCPMADD f1, B3, A6, f1 + FXCSMADD f3, B3, A6, f3 + LFPDUX A6, AO, INC2 + LFPDUX B3, BO, INC2 + + FXCPMADD f0, B4, A7, f0 + FXCSMADD f2, B4, A7, f2 + LFPDUX A7, AO, INC2 + FXCPMADD f1, B4, A8, f1 + FXCSMADD f3, B4, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B4, BO, INC2 + bdnz+ .L62 + .align 4 + +.L63: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f2, B1, A1, f2 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f3, B1, A2, f3 + + FXCPMADD f0, B2, A3, f0 + FXCSMADD f2, B2, A3, f2 + FXCPMADD f1, B2, A4, f1 + FXCSMADD f3, B2, A4, f3 + + FXCPMADD f0, B3, A5, f0 + FXCSMADD f2, B3, A5, f2 + FXCPMADD f1, B3, A6, f1 + FXCSMADD f3, B3, A6, f3 + + FXCPMADD f0, B4, A7, f0 + FXCSMADD f2, B4, A7, f2 + FXCPMADD f1, B4, A8, f1 + FXCSMADD f3, B4, A8, f3 + .align 4 + +.L64: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L68 +#else + andi. 
r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L68 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + bdz- .L67 + .align 4 + +.L66: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f2, B1, A1, f2 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f3, B1, A2, f3 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + bdnz+ .L66 + .align 4 + +.L67: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f2, B1, A1, f2 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f3, B1, A2, f3 + .align 4 + +.L68: + fpadd f0, f0, f2 + fpadd f1, f1, f3 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + ZBASE_SHIFT + slwi r0, r0, 0 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDUX f16, BO, INC2 + LFPDUX f17, BO, INC2 + + subi BO, BO, 4 * SIZE +#else + LFPDUX f16, AO, INC2 + LFPDUX f17, AO, INC2 + + subi AO, AO, 4 * SIZE +#endif + + fpsub f0, f16, f0 + fpsub f1, f17, f1 + +#ifdef LN + LFPDUX A1, AO, INC2 + add AO, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + + subi AO, AO, 8 * SIZE + + fxpmul f4, A3, f1 + FXCXNPMA f1, A3, f1, f4 + + fxcpnmsub f0, A2, f1, f0 + FXCXNSMA f0, A2, f1, f0 + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 +#endif + +#ifdef LT + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + add AO, AO, INC2 + LFPDUX A3, AO, INC2 + + subi AO, AO, 8 * SIZE + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 + + fxcpnmsub f1, A2, f0, f1 + FXCXNSMA f1, A2, f0, f1 + + fxpmul f6, A3, f1 + FXCXNPMA f1, A3, f1, f6 +#endif + +#ifdef RN + LFPDX A1, BO, INC2 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f1 + + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f1, A1, f1, f5 +#endif + +#ifdef RT + LFPDX A1, BO, INC2 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f1 + + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f1, A1, f1, f5 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC2 + STFPDUX f1, BO, INC2 + + subi BO, BO, 4 * SIZE +#else + STFPDUX f0, AO, INC2 + STFPDUX f1, AO, INC2 + + subi AO, AO, 4 * SIZE +#endif + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L70: + andi. I, M, 1 + beq .L89 + +#if defined(LT) || defined(RN) + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + srawi. r0, KK, 3 + mtspr CTR, r0 + ble .L74 +#else +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi TEMP, KK, 0 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + + sub TEMP, K, KK + + addi BO, BO, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + srawi. 
r0, TEMP, 3 + mtspr CTR, r0 + ble .L74 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + + LFPDUX A5, AO, INC2 + LFPDUX B5, BO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX B6, BO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A9, BO, INC2 + LFPDUX A8, AO, INC2 + LFPDUX A10, BO, INC2 + bdz- .L73 + .align 4 + +.L72: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + FXCPMADD f2, B2, A2, f2 + FXCSMADD f3, B2, A2, f3 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + + FXCPMADD f0, B3, A3, f0 + FXCSMADD f1, B3, A3, f1 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + FXCPMADD f2, B4, A4, f2 + FXCSMADD f3, B4, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + + FXCPMADD f0, B5, A5, f0 + FXCSMADD f1, B5, A5, f1 + LFPDUX A5, AO, INC2 + LFPDUX B5, BO, INC2 + FXCPMADD f2, B6, A6, f2 + FXCSMADD f3, B6, A6, f3 + LFPDUX A6, AO, INC2 + LFPDUX B6, BO, INC2 + + FXCPMADD f0, A9, A7, f0 + FXCSMADD f1, A9, A7, f1 + LFPDUX A7, AO, INC2 + LFPDUX A9, BO, INC2 + FXCPMADD f2, A10, A8, f2 + FXCSMADD f3, A10, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX A10, BO, INC2 + + bdnz+ .L72 + .align 4 + +.L73: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + FXCPMADD f2, B2, A2, f2 + FXCSMADD f3, B2, A2, f3 + + FXCPMADD f0, B3, A3, f0 + FXCSMADD f1, B3, A3, f1 + FXCPMADD f2, B4, A4, f2 + FXCSMADD f3, B4, A4, f3 + + FXCPMADD f0, B5, A5, f0 + FXCSMADD f1, B5, A5, f1 + FXCPMADD f2, B6, A6, f2 + FXCSMADD f3, B6, A6, f3 + + FXCPMADD f0, A9, A7, f0 + FXCSMADD f1, A9, A7, f1 + FXCPMADD f2, A10, A8, f2 + FXCSMADD f3, A10, A8, f3 + .align 4 + +.L74: +#if defined(LT) || defined(RN) + andi. r0, KK, 7 + mtspr CTR, r0 + ble+ .L78 +#else + andi. 
r0, TEMP, 7 + mtspr CTR, r0 + ble+ .L78 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + bdz- .L77 + .align 4 + +.L76: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + bdnz+ .L76 + .align 4 + +.L77: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + .align 4 + +.L78: + fpadd f0, f0, f2 + fpadd f1, f1, f3 + + fpadd f0, f0, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 0 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDX f16, BO, INC2 +#else + LFPDX f16, AO, INC2 +#endif + + fpsub f0, f16, f0 + +#ifdef LN + LFPDX A1, AO, INC2 + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 +#endif + +#ifdef LT + LFPDX A1, AO, INC2 + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 +#endif + +#ifdef RN + LFPDX A1, BO, INC2 + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 +#endif + +#ifdef RT + LFPDX A1, BO, INC2 + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDX f0, BO, INC2 +#else + STFPDX f0, AO, INC2 +#endif + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L89: +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + addi B, BO, 2 * SIZE +#endif + +#ifdef RN + addi KK, KK, 1 +#endif + +#ifdef RT + subi KK, KK, 1 +#endif + .align 4 + +.L999: + addi SP, SP, 20 + + lwzu r14, 4(SP) + lwzu r15, 4(SP) + + lwzu r16, 4(SP) + lwzu r17, 4(SP) + lwzu r18, 4(SP) + lwzu r19, 4(SP) + + lwzu r20, 4(SP) + lwzu r21, 4(SP) + lwzu r22, 4(SP) + lwzu r23, 4(SP) + + lwzu r24, 4(SP) + lwzu r25, 4(SP) + lwzu r26, 4(SP) + lwzu r27, 4(SP) + + lwzu r28, 4(SP) + lwzu r29, 4(SP) + lwzu r30, 4(SP) + lwzu r31, 4(SP) + + subi SP, SP, 12 + li r0, 16 + + lfpdux f31, SP, r0 + lfpdux f30, SP, r0 + lfpdux f29, SP, r0 + lfpdux f28, SP, r0 + lfpdux f27, SP, r0 + lfpdux f26, SP, r0 + lfpdux f25, SP, r0 + lfpdux f24, SP, r0 + lfpdux f23, SP, r0 + lfpdux f22, SP, r0 + lfpdux f21, SP, r0 + lfpdux f20, SP, r0 + lfpdux f19, SP, r0 + lfpdux f18, SP, r0 + lfpdux f17, SP, r0 + lfpdux f16, SP, r0 + lfpdux f15, SP, r0 + lfpdux f14, SP, r0 + addi SP, SP, 16 + blr + .align 4 + + + EPILOGUE +#endif diff --git a/kernel/power/ztrsm_kernel_hummer_RT.S b/kernel/power/ztrsm_kernel_hummer_RT.S new file mode 100644 index 0000000000..8670ceac58 --- /dev/null +++ b/kernel/power/ztrsm_kernel_hummer_RT.S @@ -0,0 +1,2962 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#undef ZERO + +#define ALPHA 0 +#define FZERO 16 + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#endif + +#define TEMP r11 +#define AORIG r12 +#define KK r14 +#define INCM1 r15 +#define INCM3 r16 +#define INCM5 r17 +#define INCM7 r18 +#define INC2 r19 +#define INC r20 +#define INC4 r21 + +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define AO2 r26 +#define BO2 r27 + +#define CO1 r28 +#define CO2 r29 +#define ZERO r31 + +#ifndef NEEDPARAM + +#define A1 f16 +#define A2 f17 +#define A3 f18 +#define A4 f19 +#define A5 f20 +#define A6 f21 +#define A7 f22 +#define A8 f23 +#define A9 f24 +#define A10 f25 + +#define B1 f26 +#define B2 f27 +#define B3 f28 +#define B4 f29 +#define B5 f30 +#define B6 f31 + +#define AP B6 + +#ifndef CONJ +#define FXCPMADD fxcpmadd +#define FXCSMADD fxcxnpma +#else +#if defined(LN) || defined(LT) +#define FXCPMADD fxcpnsma +#define FXCSMADD fxcxma +#else +#define FXCPMADD fxcpmadd +#define FXCSMADD fxcxnsma +#endif +#endif + +#ifndef CONJ +#define FXCXNPMA fxcxnpma +#define FXCXNSMA fxcxnsma +#else +#define FXCXNPMA fxcxnsma +#define FXCXNSMA fxcxnpma +#endif + + + PROLOGUE + PROFCODE + + li r0, -16 + + stfpdux f14, SP, r0 + stfpdux f15, SP, r0 + stfpdux f16, SP, r0 + stfpdux f17, SP, r0 + stfpdux f18, SP, r0 + stfpdux f19, SP, r0 + stfpdux f20, SP, r0 + stfpdux f21, SP, r0 + stfpdux f22, SP, r0 + stfpdux f23, SP, r0 + stfpdux f24, SP, r0 + stfpdux f25, SP, r0 + stfpdux f26, SP, r0 + stfpdux f27, SP, r0 + stfpdux f28, SP, r0 + stfpdux f29, SP, r0 + stfpdux f30, SP, r0 + stfpdux f31, SP, r0 + + stwu r31, -4(SP) + stwu r30, -4(SP) + stwu r29, -4(SP) + stwu r28, -4(SP) + + stwu r27, -4(SP) + stwu r26, -4(SP) + stwu r25, -4(SP) + stwu r24, -4(SP) + + stwu r23, -4(SP) + stwu r22, -4(SP) + stwu r21, -4(SP) + stwu r20, -4(SP) + + stwu r19, -4(SP) + stwu r18, -4(SP) + stwu r17, -4(SP) + stwu r16, -4(SP) + + stwu r15, -4(SP) + stwu r14, -4(SP) + + li r0, 0 + stwu r0, -4(SP) + stwu r0, -4(SP) + + stfdu f2, -8(SP) + stfdu 
f1, -8(SP) + + slwi LDC, LDC, ZBASE_SHIFT + + cmpwi cr0, M, 0 + ble .L999 + cmpwi cr0, N, 0 + ble .L999 + cmpwi cr0, K, 0 + ble .L999 + + li INC, 1 * SIZE + li INC2, 2 * SIZE + li INC4, 4 * SIZE + li INCM1, -1 * SIZE + li INCM3, -3 * SIZE + li INCM5, -5 * SIZE + li INCM7, -7 * SIZE + + addi C, C, - 1 * SIZE + +#ifdef LN + mullw r0, M, K + slwi r0, r0, ZBASE_SHIFT + add A, A, r0 + + slwi r0, M, ZBASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, ZBASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + andi. J, N, 1 + beq .L50 + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + addi AORIG, A, -2 * SIZE +#else + addi AO, A, -2 * SIZE +#endif +#ifndef RT + add C, CO2, LDC +#endif + li r0, FZERO + lfpsx f0, SP, r0 + + srawi. I, M, 2 + ble .L60 + .align 4 + +.L51: +#if defined(LT) || defined(RN) + fpmr f4, f0 + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f5, f0 + fpmr f2, f0 + fpmr f6, f0 + fpmr f3, f0 + fpmr f7, f0 + srawi. r0, KK, 2 + mtspr CTR, r0 + ble .L54 +#else + +#ifdef LN + slwi r0, K, 2 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 2 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + fpmr f4, f0 + addi BO, BO, - 2 * SIZE + fpmr f1, f0 + fpmr f5, f0 + fpmr f2, f0 + fpmr f6, f0 + fpmr f3, f0 + fpmr f7, f0 + srawi. r0, TEMP, 2 + mtspr CTR, r0 + ble .L54 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX B3, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + bdz- .L53 + .align 4 + +.L52: + FXCPMADD f0, B1, A1, f0 + LFPDUX B4, BO, INC2 + FXCSMADD f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B1, A2, f1 + nop + FXCSMADD f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + FXCPMADD f2, B1, A3, f2 + nop + FXCSMADD f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + FXCPMADD f3, B1, A4, f3 + nop + FXCSMADD f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + + FXCPMADD f0, B2, A5, f0 + LFPDUX B1, BO, INC2 + FXCSMADD f4, B2, A5, f4 + LFPDUX A5, AO, INC2 + FXCPMADD f1, B2, A6, f1 + nop + FXCSMADD f5, B2, A6, f5 + LFPDUX A6, AO, INC2 + + FXCPMADD f2, B2, A7, f2 + nop + FXCSMADD f6, B2, A7, f6 + LFPDUX A7, AO, INC2 + FXCPMADD f3, B2, A8, f3 + nop + FXCSMADD f7, B2, A8, f7 + LFPDUX A8, AO, INC2 + + FXCPMADD f0, B3, A1, f0 + LFPDUX B2, BO, INC2 + FXCSMADD f4, B3, A1, f4 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B3, A2, f1 + nop + FXCSMADD f5, B3, A2, f5 + LFPDUX A2, AO, INC2 + + FXCPMADD f2, B3, A3, f2 + nop + FXCSMADD f6, B3, A3, f6 + LFPDUX A3, AO, INC2 + FXCPMADD f3, B3, A4, f3 + nop + FXCSMADD f7, B3, A4, f7 + LFPDUX A4, AO, INC2 + + FXCPMADD f0, B4, A5, f0 + LFPDUX B3, BO, INC2 + FXCSMADD f4, B4, A5, f4 + LFPDUX A5, AO, INC2 + FXCPMADD f1, B4, A6, f1 + nop + FXCSMADD f5, B4, A6, f5 + LFPDUX A6, AO, INC2 + + FXCPMADD f2, B4, A7, f2 + nop + FXCSMADD f6, B4, A7, f6 + LFPDUX A7, AO, INC2 + FXCPMADD f3, B4, A8, f3 + nop + FXCSMADD f7, B4, A8, f7 + LFPDUX A8, AO, INC2 + bdnz+ .L52 + .align 4 + +.L53: + FXCPMADD f0, B1, A1, f0 + LFPDUX B4, BO, INC2 + FXCSMADD f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B1, A2, f1 + nop + FXCSMADD f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + FXCPMADD f2, B1, A3, f2 + nop + FXCSMADD f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + 
FXCPMADD f3, B1, A4, f3 + nop + FXCSMADD f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + + FXCPMADD f0, B2, A5, f0 + nop + FXCSMADD f4, B2, A5, f4 + LFPDUX A5, AO, INC2 + FXCPMADD f1, B2, A6, f1 + nop + FXCSMADD f5, B2, A6, f5 + LFPDUX A6, AO, INC2 + + FXCPMADD f2, B2, A7, f2 + nop + FXCSMADD f6, B2, A7, f6 + LFPDUX A7, AO, INC2 + FXCPMADD f3, B2, A8, f3 + nop + FXCSMADD f7, B2, A8, f7 + LFPDUX A8, AO, INC2 + + FXCPMADD f0, B3, A1, f0 + FXCSMADD f4, B3, A1, f4 + FXCPMADD f1, B3, A2, f1 + FXCSMADD f5, B3, A2, f5 + + FXCPMADD f2, B3, A3, f2 + FXCSMADD f6, B3, A3, f6 + FXCPMADD f3, B3, A4, f3 + FXCSMADD f7, B3, A4, f7 + + FXCPMADD f0, B4, A5, f0 + FXCSMADD f4, B4, A5, f4 + FXCPMADD f1, B4, A6, f1 + FXCSMADD f5, B4, A6, f5 + + FXCPMADD f2, B4, A7, f2 + FXCSMADD f6, B4, A7, f6 + FXCPMADD f3, B4, A8, f3 + FXCSMADD f7, B4, A8, f7 + .align 4 + +.L54: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L58 +#else + andi. r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L58 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + bdz- .L57 + .align 4 + +.L56: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f4, B1, A1, f4 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f5, B1, A2, f5 + LFPDUX A2, AO, INC2 + + FXCPMADD f2, B1, A3, f2 + FXCSMADD f6, B1, A3, f6 + LFPDUX A3, AO, INC2 + FXCPMADD f3, B1, A4, f3 + FXCSMADD f7, B1, A4, f7 + LFPDUX A4, AO, INC2 + LFPDUX B1, BO, INC2 + bdnz+ .L56 + .align 4 + +.L57: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f4, B1, A1, f4 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f5, B1, A2, f5 + + FXCPMADD f2, B1, A3, f2 + FXCSMADD f6, B1, A3, f6 + FXCPMADD f3, B1, A4, f3 + FXCSMADD f7, B1, A4, f7 + .align 4 + +.L58: + fpadd f0, f0, f4 + fpadd f1, f1, f5 + fpadd f2, f2, f6 + fpadd f3, f3, f7 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 2 + ZBASE_SHIFT + slwi r0, r0, 0 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDUX f16, BO, INC2 + LFPDUX f17, BO, INC2 + LFPDUX f18, BO, INC2 + LFPDUX f19, BO, INC2 + + subi BO, BO, 8 * SIZE +#else + LFPDUX f16, AO, INC2 + LFPDUX f17, AO, INC2 + LFPDUX f18, AO, INC2 + LFPDUX f19, AO, INC2 + + subi AO, AO, 8 * SIZE +#endif + + fpsub f0, f16, f0 + fpsub f1, f17, f1 + fpsub f2, f18, f2 + fpsub f3, f19, f3 + +#ifdef LN + LFPDUX A1, AO, INC2 + add AO, AO, INC2 + add AO, AO, INC2 + add AO, AO, INC2 + + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + add AO, AO, INC2 + add AO, AO, INC2 + + LFPDUX A4, AO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + add AO, AO, INC2 + + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + LFPDUX A9, AO, INC2 + LFPDUX A10, AO, INC2 + + subi AO, AO, 32 * SIZE + + fxpmul f4, A10, f3 + FXCXNPMA f3, A10, f3, f4 + + fxcpnmsub f2, A9, f3, f2 + FXCXNSMA f2, A9, f3, f2 + + fxcpnmsub f1, A8, f3, f1 + FXCXNSMA f1, A8, f3, f1 + + fxcpnmsub f0, A7, f3, f0 + FXCXNSMA f0, A7, f3, f0 + + fxpmul f4, A6, f2 + FXCXNPMA f2, A6, f2, f4 + + fxcpnmsub f1, A5, f2, f1 + FXCXNSMA f1, A5, f2, f1 + + fxcpnmsub f0, A4, f2, f0 + FXCXNSMA f0, A4, f2, f0 + + fxpmul f4, A3, f1 + FXCXNPMA f1, A3, f1, f4 + + fxcpnmsub f0, A2, f1, f0 + FXCXNSMA f0, A2, f1, f0 + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 +#endif + +#ifdef LT + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + add AO, AO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX A7, AO, INC2 + + add AO, AO, INC2 + add AO, AO, INC2 + LFPDUX A8, AO, INC2 + 
LFPDUX A9, AO, INC2 + + add AO, AO, INC2 + add AO, AO, INC2 + add AO, AO, INC2 + LFPDUX A10, AO, INC2 + + subi AO, AO, 32 * SIZE + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 + + fxcpnmsub f1, A2, f0, f1 + FXCXNSMA f1, A2, f0, f1 + + fxcpnmsub f2, A3, f0, f2 + FXCXNSMA f2, A3, f0, f2 + + fxcpnmsub f3, A4, f0, f3 + FXCXNSMA f3, A4, f0, f3 + + fxpmul f6, A5, f1 + FXCXNPMA f1, A5, f1, f6 + + fxcpnmsub f2, A6, f1, f2 + FXCXNSMA f2, A6, f1, f2 + + fxcpnmsub f3, A7, f1, f3 + FXCXNSMA f3, A7, f1, f3 + + fxpmul f4, A8, f2 + FXCXNPMA f2, A8, f2, f4 + + fxcpnmsub f3, A9, f2, f3 + FXCXNSMA f3, A9, f2, f3 + + fxpmul f6, A10, f3 + FXCXNPMA f3, A10, f3, f6 +#endif + +#ifdef RN + LFPDX A1, BO, INC2 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f1 + fxpmul f6, A1, f2 + fxpmul f7, A1, f3 + + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f1, A1, f1, f5 + FXCXNPMA f2, A1, f2, f6 + FXCXNPMA f3, A1, f3, f7 +#endif + +#ifdef RT + LFPDX A1, BO, INC2 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f1 + fxpmul f6, A1, f2 + fxpmul f7, A1, f3 + + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f1, A1, f1, f5 + FXCXNPMA f2, A1, f2, f6 + FXCXNPMA f3, A1, f3, f7 +#endif + +#ifdef LN + subi CO1, CO1, 8 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC2 + STFPDUX f1, BO, INC2 + STFPDUX f2, BO, INC2 + STFPDUX f3, BO, INC2 + + subi BO, BO, 8 * SIZE +#else + STFPDUX f0, AO, INC2 + STFPDUX f1, AO, INC2 + STFPDUX f2, AO, INC2 + STFPDUX f3, AO, INC2 + + subi AO, AO, 8 * SIZE +#endif + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + STFDUX f2, CO1, INC + STFSDUX f2, CO1, INC + STFDUX f3, CO1, INC + STFSDUX f3, CO1, INC + +#ifdef LN + subi CO1, CO1, 8 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + + addic. I, I, -1 + li r0, FZERO + + lfpsx f0, SP, r0 + bgt+ .L51 + .align 4 + +.L60: + andi. I, M, 2 + beq .L70 + +#if defined(LT) || defined(RN) + fpmr f1, f0 + addi BO, B, - 2 * SIZE + fpmr f2, f0 + fpmr f3, f0 + srawi. r0, KK, 2 + mtspr CTR, r0 + ble .L64 +#else +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 1 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + fpmr f1, f0 + addi BO, BO, - 2 * SIZE + fpmr f2, f0 + fpmr f3, f0 + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 + ble .L64 +#endif + + LFPDUX B1, BO, INC2 + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX A4, AO, INC2 + + LFPDUX B3, BO, INC2 + LFPDUX A5, AO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX B4, BO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A8, AO, INC2 + bdz- .L63 + .align 4 + +.L62: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f2, B1, A1, f2 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f3, B1, A2, f3 + LFPDUX A2, AO, INC2 + LFPDUX B1, BO, INC2 + + FXCPMADD f0, B2, A3, f0 + FXCSMADD f2, B2, A3, f2 + LFPDUX A3, AO, INC2 + FXCPMADD f1, B2, A4, f1 + FXCSMADD f3, B2, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B2, BO, INC2 + + FXCPMADD f0, B3, A5, f0 + FXCSMADD f2, B3, A5, f2 + LFPDUX A5, AO, INC2 + FXCPMADD f1, B3, A6, f1 + FXCSMADD f3, B3, A6, f3 + LFPDUX A6, AO, INC2 + LFPDUX B3, BO, INC2 + + FXCPMADD f0, B4, A7, f0 + FXCSMADD f2, B4, A7, f2 + LFPDUX A7, AO, INC2 + FXCPMADD f1, B4, A8, f1 + FXCSMADD f3, B4, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX B4, BO, INC2 + bdnz+ .L62 + .align 4 + +.L63: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f2, B1, A1, f2 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f3, B1, A2, f3 + + FXCPMADD f0, B2, A3, f0 + FXCSMADD f2, B2, A3, f2 + FXCPMADD f1, B2, A4, f1 + FXCSMADD f3, B2, A4, f3 + + FXCPMADD f0, B3, A5, f0 + FXCSMADD f2, B3, A5, f2 + FXCPMADD f1, B3, A6, f1 + FXCSMADD f3, B3, A6, f3 + + FXCPMADD f0, B4, A7, f0 + FXCSMADD f2, B4, A7, f2 + FXCPMADD f1, B4, A8, f1 + FXCSMADD f3, B4, A8, f3 + .align 4 + +.L64: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L68 +#else + andi. r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L68 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + bdz- .L67 + .align 4 + +.L66: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f2, B1, A1, f2 + LFPDUX A1, AO, INC2 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f3, B1, A2, f3 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + bdnz+ .L66 + .align 4 + +.L67: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f2, B1, A1, f2 + FXCPMADD f1, B1, A2, f1 + FXCSMADD f3, B1, A2, f3 + .align 4 + +.L68: + fpadd f0, f0, f2 + fpadd f1, f1, f3 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + ZBASE_SHIFT + slwi r0, r0, 0 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDUX f16, BO, INC2 + LFPDUX f17, BO, INC2 + + subi BO, BO, 4 * SIZE +#else + LFPDUX f16, AO, INC2 + LFPDUX f17, AO, INC2 + + subi AO, AO, 4 * SIZE +#endif + + fpsub f0, f16, f0 + fpsub f1, f17, f1 + +#ifdef LN + LFPDUX A1, AO, INC2 + add AO, AO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX A3, AO, INC2 + + subi AO, AO, 8 * SIZE + + fxpmul f4, A3, f1 + FXCXNPMA f1, A3, f1, f4 + + fxcpnmsub f0, A2, f1, f0 + FXCXNSMA f0, A2, f1, f0 + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 +#endif + +#ifdef LT + LFPDUX A1, AO, INC2 + LFPDUX A2, AO, INC2 + add AO, AO, INC2 + LFPDUX A3, AO, INC2 + + subi AO, AO, 8 * SIZE + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 + + fxcpnmsub f1, A2, f0, f1 + FXCXNSMA f1, A2, f0, f1 + + fxpmul f6, A3, f1 + FXCXNPMA f1, A3, f1, f6 +#endif + +#ifdef RN + LFPDX A1, BO, INC2 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f1 + + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f1, A1, f1, f5 +#endif + +#ifdef RT + LFPDX A1, BO, INC2 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f1 + + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f1, A1, f1, f5 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC2 + STFPDUX f1, BO, INC2 + + 
subi BO, BO, 4 * SIZE +#else + STFPDUX f0, AO, INC2 + STFPDUX f1, AO, INC2 + + subi AO, AO, 4 * SIZE +#endif + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L70: + andi. I, M, 1 + beq .L89 + +#if defined(LT) || defined(RN) + addi BO, B, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + srawi. r0, KK, 3 + mtspr CTR, r0 + ble .L74 +#else +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi TEMP, KK, 0 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + + sub TEMP, K, KK + + addi BO, BO, - 2 * SIZE + fpmr f1, f0 + fpmr f2, f0 + fpmr f3, f0 + srawi. r0, TEMP, 3 + mtspr CTR, r0 + ble .L74 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + + LFPDUX A5, AO, INC2 + LFPDUX B5, BO, INC2 + LFPDUX A6, AO, INC2 + LFPDUX B6, BO, INC2 + LFPDUX A7, AO, INC2 + LFPDUX A9, BO, INC2 + LFPDUX A8, AO, INC2 + LFPDUX A10, BO, INC2 + bdz- .L73 + .align 4 + +.L72: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + FXCPMADD f2, B2, A2, f2 + FXCSMADD f3, B2, A2, f3 + LFPDUX A2, AO, INC2 + LFPDUX B2, BO, INC2 + + FXCPMADD f0, B3, A3, f0 + FXCSMADD f1, B3, A3, f1 + LFPDUX A3, AO, INC2 + LFPDUX B3, BO, INC2 + FXCPMADD f2, B4, A4, f2 + FXCSMADD f3, B4, A4, f3 + LFPDUX A4, AO, INC2 + LFPDUX B4, BO, INC2 + + FXCPMADD f0, B5, A5, f0 + FXCSMADD f1, B5, A5, f1 + LFPDUX A5, AO, INC2 + LFPDUX B5, BO, INC2 + FXCPMADD f2, B6, A6, f2 + FXCSMADD f3, B6, A6, f3 + LFPDUX A6, AO, INC2 + LFPDUX B6, BO, INC2 + + FXCPMADD f0, A9, A7, f0 + FXCSMADD f1, A9, A7, f1 + LFPDUX A7, AO, INC2 + LFPDUX A9, BO, INC2 + FXCPMADD f2, A10, A8, f2 + FXCSMADD f3, A10, A8, f3 + LFPDUX A8, AO, INC2 + LFPDUX A10, BO, INC2 + + bdnz+ .L72 + .align 4 + +.L73: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + FXCPMADD f2, B2, A2, f2 + FXCSMADD f3, B2, A2, f3 + + FXCPMADD f0, B3, A3, f0 + FXCSMADD f1, B3, A3, f1 + FXCPMADD f2, B4, A4, f2 + FXCSMADD f3, B4, A4, f3 + + FXCPMADD f0, B5, A5, f0 + FXCSMADD f1, B5, A5, f1 + FXCPMADD f2, B6, A6, f2 + FXCSMADD f3, B6, A6, f3 + + FXCPMADD f0, A9, A7, f0 + FXCSMADD f1, A9, A7, f1 + FXCPMADD f2, A10, A8, f2 + FXCSMADD f3, A10, A8, f3 + .align 4 + +.L74: +#if defined(LT) || defined(RN) + andi. r0, KK, 7 + mtspr CTR, r0 + ble+ .L78 +#else + andi. 
r0, TEMP, 7 + mtspr CTR, r0 + ble+ .L78 +#endif + + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + bdz- .L77 + .align 4 + +.L76: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + LFPDUX A1, AO, INC2 + LFPDUX B1, BO, INC2 + bdnz+ .L76 + .align 4 + +.L77: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + .align 4 + +.L78: + fpadd f0, f0, f2 + fpadd f1, f1, f3 + + fpadd f0, f0, f1 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 0 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + addi BO, BO, - 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDX f16, BO, INC2 +#else + LFPDX f16, AO, INC2 +#endif + + fpsub f0, f16, f0 + +#ifdef LN + LFPDX A1, AO, INC2 + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 +#endif + +#ifdef LT + LFPDX A1, AO, INC2 + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 +#endif + +#ifdef RN + LFPDX A1, BO, INC2 + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 +#endif + +#ifdef RT + LFPDX A1, BO, INC2 + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDX f0, BO, INC2 +#else + STFPDX f0, AO, INC2 +#endif + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L89: +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + addi B, BO, 2 * SIZE +#endif + +#ifdef RN + addi KK, KK, 1 +#endif + +#ifdef RT + subi KK, KK, 1 +#endif + .align 4 + +.L50: + srawi. J, N, 1 + ble .L999 + .align 4 + +.L10: +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + addi AORIG, A, -4 * SIZE +#else + addi AO, A, -4 * SIZE +#endif +#ifndef RT + add C, CO2, LDC +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + + srawi. I, M, 2 + ble .L20 + .align 4 + +.L11: +#if defined(LT) || defined(RN) + + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 + + fpmr f5, f0 + fpmr f9, f0 + fpmr f13, f0 + fpmr f2, f0 + + fpmr f6, f0 + fpmr f10, f0 + fpmr f14, f0 + fpmr f3, f0 + + fpmr f7, f0 + fpmr f11, f0 + fpmr f15, f0 + + srawi. r0, KK, 2 + fpmr f1, f0 + mtspr CTR, r0 + ble .L14 +#else + +#ifdef LN + slwi r0, K, 2 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 2 + ZBASE_SHIFT + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + fpmr f5, f0 + fpmr f9, f0 + fpmr f13, f0 + fpmr f2, f0 + + fpmr f6, f0 + fpmr f10, f0 + fpmr f14, f0 + fpmr f3, f0 + + fpmr f7, f0 + fpmr f11, f0 + fpmr f15, f0 + + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, BO, - 4 * SIZE + fpmr f8, f0 + addi BO2, BO, 2 * SIZE + fpmr f12, f0 + + srawi. 
r0, TEMP, 2 + fpmr f1, f0 + mtspr CTR, r0 + ble .L14 +#endif + + LFPDUX A1, AO, INC4 + fpmr f5, f0 + LFPDUX A3, AO, INC4 + fpmr f9, f0 + LFPDUX B1, BO, INC4 + fpmr f13, f0 + + LFPDUX A5, AO, INC4 + fpmr f2, f0 + LFPDUX A6, AO, INC4 + fpmr f6, f0 + LFPDUX B3, BO, INC4 + fpmr f10, f0 + LFPDUX A7, AO, INC4 + fpmr f14, f0 + + LFPDUX A8, AO, INC4 + fpmr f3, f0 + LFPDUX B5, BO, INC4 + fpmr f7, f0 + LFPDUX A9, AO, INC4 + fpmr f11, f0 + LFPDUX A2, AO2, INC4 + fpmr f15, f0 + LFPDUX B2, BO2, INC4 + bdz- .L13 + .align 4 + +.L12: + +## 1 ## + FXCPMADD f0, B1, A1, f0 + nop + FXCSMADD f4, B1, A1, f4 + nop + FXCPMADD f8, B2, A1, f8 + LFPDUX B4, BO2, INC4 + FXCSMADD f12, B2, A1, f12 + LFPDUX B6, BO, INC4 + + FXCPMADD f1, B1, A2, f1 + nop + FXCSMADD f5, B1, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B2, A2, f9 + LFPDUX A10, AO, INC4 + FXCSMADD f13, B2, A2, f13 + nop + + FXCPMADD f2, B1, A3, f2 + nop + FXCSMADD f6, B1, A3, f6 + nop + FXCPMADD f10, B2, A3, f10 + nop + FXCSMADD f14, B2, A3, f14 + nop + + FXCPMADD f3, B1, A4, f3 + nop + FXCSMADD f7, B1, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B2, A4, f11 + LFPDUX A1, AO, INC4 + FXCSMADD f15, B2, A4, f15 + nop + +## 2 ## + + FXCPMADD f0, B3, A5, f0 + nop + FXCSMADD f4, B3, A5, f4 + nop + FXCPMADD f8, B4, A5, f8 + LFPDUX B2, BO2, INC4 + FXCSMADD f12, B4, A5, f12 + LFPDUX B1, BO, INC4 + + FXCPMADD f1, B3, A2, f1 + nop + FXCSMADD f5, B3, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B4, A2, f9 + LFPDUX A3, AO, INC4 + FXCSMADD f13, B4, A2, f13 + nop + + FXCPMADD f2, B3, A6, f2 + nop + FXCSMADD f6, B3, A6, f6 + nop + FXCPMADD f10, B4, A6, f10 + nop + FXCSMADD f14, B4, A6, f14 + nop + + FXCPMADD f3, B3, A4, f3 + nop + FXCSMADD f7, B3, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B4, A4, f11 + LFPDUX A5, AO, INC4 + FXCSMADD f15, B4, A4, f15 + nop + +## 3 ## + + FXCPMADD f0, B5, A7, f0 + nop + FXCSMADD f4, B5, A7, f4 + nop + FXCPMADD f8, B2, A7, f8 + LFPDUX B4, BO2, INC4 + FXCSMADD f12, B2, A7, f12 + LFPDUX B3, BO, INC4 + + FXCPMADD f1, B5, A2, f1 + nop + FXCSMADD f5, B5, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B2, A2, f9 + LFPDUX A6, AO, INC4 + FXCSMADD f13, B2, A2, f13 + nop + + FXCPMADD f2, B5, A8, f2 + nop + FXCSMADD f6, B5, A8, f6 + nop + FXCPMADD f10, B2, A8, f10 + nop + FXCSMADD f14, B2, A8, f14 + nop + + FXCPMADD f3, B5, A4, f3 + nop + FXCSMADD f7, B5, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B2, A4, f11 + LFPDUX A7, AO, INC4 + FXCSMADD f15, B2, A4, f15 + nop + +## 4 ## + FXCPMADD f0, B6, A9, f0 + nop + FXCSMADD f4, B6, A9, f4 + nop + FXCPMADD f8, B4, A9, f8 + LFPDUX B2, BO2, INC4 + FXCSMADD f12, B4, A9, f12 + LFPDUX B5, BO, INC4 + + FXCPMADD f1, B6, A2, f1 + nop + FXCSMADD f5, B6, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B4, A2, f9 + LFPDUX A8, AO, INC4 + FXCSMADD f13, B4, A2, f13 + nop + + FXCPMADD f2, B6, A10, f2 + nop + FXCSMADD f6, B6, A10, f6 + nop + FXCPMADD f10, B4, A10, f10 + nop + FXCSMADD f14, B4, A10, f14 + nop + + FXCPMADD f3, B6, A4, f3 + LFPDUX A2, AO2, INC4 + FXCSMADD f7, B6, A4, f7 + LFPDUX A9, AO, INC4 + FXCPMADD f11, B4, A4, f11 + nop + FXCSMADD f15, B4, A4, f15 + bdnz+ .L12 + .align 4 + +.L13: +## 1 ## + + FXCPMADD f0, B1, A1, f0 + nop + FXCSMADD f4, B1, A1, f4 + nop + FXCPMADD f8, B2, A1, f8 + LFPDUX B4, BO2, INC4 + FXCSMADD f12, B2, A1, f12 + LFPDUX B6, BO, INC4 + + FXCPMADD f1, B1, A2, f1 + nop + FXCSMADD f5, B1, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B2, A2, f9 + LFPDUX A10, AO, INC4 + FXCSMADD f13, B2, A2, f13 + nop + + FXCPMADD f2, B1, A3, f2 + nop + FXCSMADD f6, B1, A3, f6 + nop + FXCPMADD f10, B2, A3, f10 
+ nop + FXCSMADD f14, B2, A3, f14 + nop + + FXCPMADD f3, B1, A4, f3 + nop + FXCSMADD f7, B1, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B2, A4, f11 + nop + FXCSMADD f15, B2, A4, f15 + nop + +## 2 ## + + FXCPMADD f0, B3, A5, f0 + nop + FXCSMADD f4, B3, A5, f4 + nop + FXCPMADD f8, B4, A5, f8 + LFPDUX B2, BO2, INC4 + FXCSMADD f12, B4, A5, f12 + nop + + FXCPMADD f1, B3, A2, f1 + nop + FXCSMADD f5, B3, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B4, A2, f9 + nop + FXCSMADD f13, B4, A2, f13 + nop + + FXCPMADD f2, B3, A6, f2 + nop + FXCSMADD f6, B3, A6, f6 + nop + FXCPMADD f10, B4, A6, f10 + nop + FXCSMADD f14, B4, A6, f14 + nop + + FXCPMADD f3, B3, A4, f3 + nop + FXCSMADD f7, B3, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B4, A4, f11 + nop + FXCSMADD f15, B4, A4, f15 + nop + +## 3 ## + + FXCPMADD f0, B5, A7, f0 + nop + FXCSMADD f4, B5, A7, f4 + nop + FXCPMADD f8, B2, A7, f8 + LFPDUX B4, BO2, INC4 + FXCSMADD f12, B2, A7, f12 + nop + + FXCPMADD f1, B5, A2, f1 + nop + FXCSMADD f5, B5, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B2, A2, f9 + nop + FXCSMADD f13, B2, A2, f13 + nop + + FXCPMADD f2, B5, A8, f2 + nop + FXCSMADD f6, B5, A8, f6 + nop + FXCPMADD f10, B2, A8, f10 + nop + FXCSMADD f14, B2, A8, f14 + nop + + FXCPMADD f3, B5, A4, f3 + nop + FXCSMADD f7, B5, A4, f7 + LFPDUX A2, AO2, INC4 + FXCPMADD f11, B2, A4, f11 + nop + FXCSMADD f15, B2, A4, f15 + nop + +## 4 ## + + FXCPMADD f0, B6, A9, f0 + nop + FXCSMADD f4, B6, A9, f4 + nop + FXCPMADD f8, B4, A9, f8 + nop + FXCSMADD f12, B4, A9, f12 + nop + + FXCPMADD f1, B6, A2, f1 + nop + FXCSMADD f5, B6, A2, f5 + LFPDUX A4, AO2, INC4 + FXCPMADD f9, B4, A2, f9 + nop + FXCSMADD f13, B4, A2, f13 + nop + + FXCPMADD f2, B6, A10, f2 + nop + FXCSMADD f6, B6, A10, f6 + nop + FXCPMADD f10, B4, A10, f10 + nop + FXCSMADD f14, B4, A10, f14 + nop + + FXCPMADD f3, B6, A4, f3 + nop + FXCSMADD f7, B6, A4, f7 + nop + FXCPMADD f11, B4, A4, f11 + nop + FXCSMADD f15, B4, A4, f15 + nop + .align 4 + +.L14: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L18 +#else + andi. 
r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L18 +#endif + +.L15: + LFPDUX A2, AO, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A10, BO, INC4 + LFPDUX B4, BO2, INC4 + bdz- .L17 + .align 4 + +.L16: + FXCPMADD f0, A10, A2, f0 + FXCSMADD f4, A10, A2, f4 + FXCPMADD f8, B4, A2, f8 + FXCSMADD f12, B4, A2, f12 + LFPDUX A2, AO, INC4 + + FXCPMADD f1, A10, A4, f1 + FXCSMADD f5, A10, A4, f5 + FXCPMADD f9, B4, A4, f9 + FXCSMADD f13, B4, A4, f13 + LFPDUX A4, AO2, INC4 + + FXCPMADD f2, A10, A2, f2 + FXCSMADD f6, A10, A2, f6 + FXCPMADD f10, B4, A2, f10 + FXCSMADD f14, B4, A2, f14 + LFPDUX A2, AO, INC4 + + FXCPMADD f3, A10, A4, f3 + FXCSMADD f7, A10, A4, f7 + LFPDUX A10, BO, INC4 + FXCPMADD f11, B4, A4, f11 + FXCSMADD f15, B4, A4, f15 + LFPDUX A4, AO2, INC4 + LFPDUX B4, BO2, INC4 + bdnz+ .L16 + .align 4 + +.L17: + FXCPMADD f0, A10, A2, f0 + FXCSMADD f4, A10, A2, f4 + FXCPMADD f8, B4, A2, f8 + FXCSMADD f12, B4, A2, f12 + LFPDUX A2, AO, INC4 + + FXCPMADD f1, A10, A4, f1 + FXCSMADD f5, A10, A4, f5 + FXCPMADD f9, B4, A4, f9 + FXCSMADD f13, B4, A4, f13 + LFPDUX A4, AO2, INC4 + + FXCPMADD f2, A10, A2, f2 + FXCSMADD f6, A10, A2, f6 + FXCPMADD f10, B4, A2, f10 + FXCSMADD f14, B4, A2, f14 + + FXCPMADD f3, A10, A4, f3 + FXCSMADD f7, A10, A4, f7 + FXCPMADD f11, B4, A4, f11 + FXCSMADD f15, B4, A4, f15 + .align 4 + +.L18: + fpadd f0, f0, f4 + fpadd f8, f8, f12 + fpadd f1, f1, f5 + fpadd f9, f9, f13 + + fpadd f2, f2, f6 + fpadd f10, f10, f14 + fpadd f3, f3, f7 + fpadd f11, f11, f15 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 4 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 2 + ZBASE_SHIFT + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi AO2, AO, 2 * SIZE + addi BO, BO, - 4 * SIZE + addi BO2, BO, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDUX f16, BO, INC4 + LFPDUX f20, BO2, INC4 + LFPDUX f17, BO, INC4 + LFPDUX f21, BO2, INC4 + LFPDUX f18, BO, INC4 + LFPDUX f22, BO2, INC4 + LFPDUX f19, BO, INC4 + LFPDUX f23, BO2, INC4 + + subi BO, BO, 16 * SIZE + subi BO2, BO2, 16 * SIZE +#else + LFPDUX f16, AO, INC4 + LFPDUX f17, AO2, INC4 + LFPDUX f18, AO, INC4 + LFPDUX f19, AO2, INC4 + LFPDUX f20, AO, INC4 + LFPDUX f21, AO2, INC4 + LFPDUX f22, AO, INC4 + LFPDUX f23, AO2, INC4 + + subi AO, AO, 16 * SIZE + subi AO2, AO2, 16 * SIZE +#endif + + fpsub f0, f16, f0 + fpsub f1, f17, f1 + fpsub f2, f18, f2 + fpsub f3, f19, f3 + + fpsub f8, f20, f8 + fpsub f9, f21, f9 + fpsub f10, f22, f10 + fpsub f11, f23, f11 + +#ifdef LN + LFPDUX A1, AO, INC4 + add AO2, AO2, INC4 + add AO, AO, INC4 + add AO2, AO2, INC4 + + LFPDUX A2, AO, INC4 + LFPDUX A3, AO2, INC4 + add AO, AO, INC4 + add AO2, AO2, INC4 + + LFPDUX A4, AO, INC4 + LFPDUX A5, AO2, INC4 + LFPDUX A6, AO, INC4 + add AO2, AO2, INC4 + + LFPDUX A7, AO, INC4 + LFPDUX A8, AO2, INC4 + LFPDUX A9, AO, INC4 + LFPDUX A10, AO2, INC4 + + subi AO, AO, 32 * SIZE + subi AO2, AO2, 32 * SIZE + + fxpmul f4, A10, f3 + fxpmul f5, A10, f11 + FXCXNPMA f3, A10, f3, f4 + FXCXNPMA f11, A10, f11, f5 + + fxcpnmsub f2, A9, f3, f2 + fxcpnmsub f10, A9, f11, f10 + FXCXNSMA f2, A9, f3, f2 + FXCXNSMA f10, A9, f11, f10 + + fxcpnmsub f1, A8, f3, f1 + fxcpnmsub f9, A8, f11, f9 + FXCXNSMA f1, A8, f3, f1 + FXCXNSMA f9, A8, f11, f9 + + fxcpnmsub f0, A7, f3, f0 + fxcpnmsub f8, A7, f11, f8 + FXCXNSMA f0, A7, f3, f0 + FXCXNSMA f8, A7, f11, f8 + + fxpmul f4, A6, f2 + fxpmul f5, A6, f10 + FXCXNPMA f2, A6, f2, f4 + FXCXNPMA f10, A6, f10, f5 + + fxcpnmsub f1, A5, f2, f1 + fxcpnmsub f9, A5, f10, f9 + FXCXNSMA f1, A5, f2, f1 + FXCXNSMA f9, A5, f10, f9 + + fxcpnmsub f0, A4, f2, f0 + fxcpnmsub f8, A4, 
f10, f8 + FXCXNSMA f0, A4, f2, f0 + FXCXNSMA f8, A4, f10, f8 + + fxpmul f4, A3, f1 + fxpmul f5, A3, f9 + FXCXNPMA f1, A3, f1, f4 + FXCXNPMA f9, A3, f9, f5 + + fxcpnmsub f0, A2, f1, f0 + fxcpnmsub f8, A2, f9, f8 + FXCXNSMA f0, A2, f1, f0 + FXCXNSMA f8, A2, f9, f8 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f8 + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f8, A1, f8, f5 +#endif + +#ifdef LT + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX A3, AO, INC4 + LFPDUX A4, AO2, INC4 + + add AO, AO, INC4 + LFPDUX A5, AO2, INC4 + LFPDUX A6, AO, INC4 + LFPDUX A7, AO2, INC4 + + add AO, AO, INC4 + add AO2, AO2, INC4 + LFPDUX A8, AO, INC4 + LFPDUX A9, AO2, INC4 + + add AO, AO, INC4 + add AO2, AO2, INC4 + add AO, AO, INC4 + LFPDUX A10, AO2, INC4 + + subi AO, AO, 32 * SIZE + subi AO2, AO2, 32 * SIZE + + fxpmul f4, A1, f0 + fxpmul f5, A1, f8 + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f8, A1, f8, f5 + + fxcpnmsub f1, A2, f0, f1 + fxcpnmsub f9, A2, f8, f9 + FXCXNSMA f1, A2, f0, f1 + FXCXNSMA f9, A2, f8, f9 + + fxcpnmsub f2, A3, f0, f2 + fxcpnmsub f10, A3, f8, f10 + FXCXNSMA f2, A3, f0, f2 + FXCXNSMA f10, A3, f8, f10 + + fxcpnmsub f3, A4, f0, f3 + fxcpnmsub f11, A4, f8, f11 + FXCXNSMA f3, A4, f0, f3 + FXCXNSMA f11, A4, f8, f11 + + fxpmul f6, A5, f1 + fxpmul f7, A5, f9 + FXCXNPMA f1, A5, f1, f6 + FXCXNPMA f9, A5, f9, f7 + + fxcpnmsub f2, A6, f1, f2 + fxcpnmsub f10, A6, f9, f10 + FXCXNSMA f2, A6, f1, f2 + FXCXNSMA f10, A6, f9, f10 + + fxcpnmsub f3, A7, f1, f3 + fxcpnmsub f11, A7, f9, f11 + FXCXNSMA f3, A7, f1, f3 + FXCXNSMA f11, A7, f9, f11 + + fxpmul f4, A8, f2 + fxpmul f5, A8, f10 + FXCXNPMA f2, A8, f2, f4 + FXCXNPMA f10, A8, f10, f5 + + fxcpnmsub f3, A9, f2, f3 + fxcpnmsub f11, A9, f10, f11 + FXCXNSMA f3, A9, f2, f3 + FXCXNSMA f11, A9, f10, f11 + + fxpmul f6, A10, f3 + fxpmul f7, A10, f11 + FXCXNPMA f3, A10, f3, f6 + FXCXNPMA f11, A10, f11, f7 +#endif + +#ifdef RN + LFPDUX A1, BO, INC4 + LFPDUX A2, BO2, INC4 + add BO, BO, INC4 + LFPDUX A3, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE + + fxpmul f4, A1, f0 + fxpmul f5, A1, f1 + fxpmul f6, A1, f2 + fxpmul f7, A1, f3 + + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f1, A1, f1, f5 + FXCXNPMA f2, A1, f2, f6 + FXCXNPMA f3, A1, f3, f7 + + fxcpnmsub f8, A2, f0, f8 + fxcpnmsub f9, A2, f1, f9 + fxcpnmsub f10, A2, f2, f10 + fxcpnmsub f11, A2, f3, f11 + + FXCXNSMA f8, A2, f0, f8 + FXCXNSMA f9, A2, f1, f9 + FXCXNSMA f10, A2, f2, f10 + FXCXNSMA f11, A2, f3, f11 + + fxpmul f4, A3, f8 + fxpmul f5, A3, f9 + fxpmul f6, A3, f10 + fxpmul f7, A3, f11 + + FXCXNPMA f8, A3, f8, f4 + FXCXNPMA f9, A3, f9, f5 + FXCXNPMA f10, A3, f10, f6 + FXCXNPMA f11, A3, f11, f7 +#endif + +#ifdef RT + LFPDUX A1, BO, INC4 + add BO2, BO2, INC4 + LFPDUX A2, BO, INC4 + LFPDUX A3, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE + + fxpmul f4, A3, f8 + fxpmul f5, A3, f9 + fxpmul f6, A3, f10 + fxpmul f7, A3, f11 + + FXCXNPMA f8, A3, f8, f4 + FXCXNPMA f9, A3, f9, f5 + FXCXNPMA f10, A3, f10, f6 + FXCXNPMA f11, A3, f11, f7 + + fxcpnmsub f0, A2, f8, f0 + fxcpnmsub f1, A2, f9, f1 + fxcpnmsub f2, A2, f10, f2 + fxcpnmsub f3, A2, f11, f3 + + FXCXNSMA f0, A2, f8, f0 + FXCXNSMA f1, A2, f9, f1 + FXCXNSMA f2, A2, f10, f2 + FXCXNSMA f3, A2, f11, f3 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f1 + fxpmul f6, A1, f2 + fxpmul f7, A1, f3 + + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f1, A1, f1, f5 + FXCXNPMA f2, A1, f2, f6 + FXCXNPMA f3, A1, f3, f7 +#endif + +#ifdef LN + subi CO1, CO1, 8 * SIZE + subi CO2, CO2, 8 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC4 + STFPDUX f8, BO2, INC4 + STFPDUX f1, BO, INC4 + 
STFPDUX f9, BO2, INC4 + STFPDUX f2, BO, INC4 + STFPDUX f10, BO2, INC4 + STFPDUX f3, BO, INC4 + STFPDUX f11, BO2, INC4 + + subi BO, BO, 16 * SIZE + subi BO2, BO2, 16 * SIZE +#else + STFPDUX f0, AO, INC4 + STFPDUX f1, AO2, INC4 + STFPDUX f2, AO, INC4 + STFPDUX f3, AO2, INC4 + STFPDUX f8, AO, INC4 + STFPDUX f9, AO2, INC4 + STFPDUX f10, AO, INC4 + STFPDUX f11, AO2, INC4 + + subi AO, AO, 16 * SIZE + subi AO2, AO2, 16 * SIZE +#endif + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + STFDUX f2, CO1, INC + STFSDUX f2, CO1, INC + STFDUX f3, CO1, INC + STFSDUX f3, CO1, INC + + STFDUX f8, CO2, INC + STFSDUX f8, CO2, INC + STFDUX f9, CO2, INC + STFSDUX f9, CO2, INC + STFDUX f10, CO2, INC + STFSDUX f10, CO2, INC + STFDUX f11, CO2, INC + STFSDUX f11, CO2, INC + +#ifdef LN + subi CO1, CO1, 8 * SIZE + subi CO2, CO2, 8 * SIZE +#endif + +#ifdef RT + slwi r0, K, 2 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 2 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 4 +#endif + +#ifdef LN + subi KK, KK, 4 +#endif + + addic. I, I, -1 + li r0, FZERO + + lfpsx f0, SP, r0 + bgt+ .L11 + .align 4 + +.L20: + andi. I, M, 2 + beq .L30 + +#if defined(LT) || defined(RN) + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, B, - 4 * SIZE + fpmr f8, f0 + addi BO2, B, - 2 * SIZE + fpmr f12, f0 + + srawi. r0, KK, 2 + fpmr f1, f0 + fpmr f5, f0 + fpmr f9, f0 + mtspr CTR, r0 + fpmr f13, f0 + ble .L24 +#else +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 + + sub TEMP, K, KK + + addi AO2, AO, 2 * SIZE + fpmr f4, f0 + addi BO, BO, - 4 * SIZE + fpmr f8, f0 + addi BO2, BO, 2 * SIZE + fpmr f12, f0 + + fpmr f1, f0 + fpmr f5, f0 + fpmr f9, f0 + fpmr f13, f0 + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 + ble .L24 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B2, BO2, INC4 + LFPDUX A3, AO, INC4 + LFPDUX B3, BO, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX B4, BO2, INC4 + + LFPDUX A5, AO, INC4 + LFPDUX B5, BO, INC4 + LFPDUX A6, AO2, INC4 + LFPDUX B6, BO2, INC4 + LFPDUX A7, AO, INC4 + LFPDUX A9, BO, INC4 + LFPDUX A10, BO2, INC4 + bdz- .L23 + .align 4 + +.L22: + FXCPMADD f0, B1, A1, f0 + nop + FXCSMADD f4, B1, A1, f4 + LFPDUX A8, AO2, INC4 + FXCPMADD f8, B2, A1, f8 + nop + FXCSMADD f12, B2, A1, f12 + LFPDUX A1, AO, INC4 + + FXCPMADD f1, B1, A2, f1 + nop + FXCSMADD f5, B1, A2, f5 + LFPDUX B1, BO, INC4 + FXCPMADD f9, B2, A2, f9 + nop + FXCSMADD f13, B2, A2, f13 + LFPDUX B2, BO2, INC4 + + FXCPMADD f0, B3, A3, f0 + nop + FXCSMADD f4, B3, A3, f4 + LFPDUX A2, AO2, INC4 + FXCPMADD f8, B4, A3, f8 + nop + FXCSMADD f12, B4, A3, f12 + LFPDUX A3, AO, INC4 + + FXCPMADD f1, B3, A4, f1 + nop + FXCSMADD f5, B3, A4, f5 + LFPDUX B3, BO, INC4 + FXCPMADD f9, B4, A4, f9 + nop + FXCSMADD f13, B4, A4, f13 + LFPDUX B4, BO2, INC4 + + FXCPMADD f0, B5, A5, f0 + nop + FXCSMADD f4, B5, A5, f4 + LFPDUX A4, AO2, INC4 + FXCPMADD f8, B6, A5, f8 + nop + FXCSMADD f12, B6, A5, f12 + LFPDUX A5, AO, INC4 + + FXCPMADD f1, B5, A6, f1 + nop + FXCSMADD f5, B5, A6, f5 + LFPDUX B5, BO, INC4 + FXCPMADD f9, B6, A6, f9 + nop + FXCSMADD f13, B6, A6, f13 + LFPDUX B6, BO2, INC4 + + FXCPMADD f0, A9, A7, f0 + nop + FXCSMADD f4, A9, A7, f4 + LFPDUX A6, AO2, INC4 + FXCPMADD f8, A10, A7, f8 + nop + FXCSMADD f12, A10, A7, f12 + LFPDUX A7, AO, INC4 + + FXCPMADD f1, A9, A8, f1 + nop + FXCSMADD f5, A9, A8, f5 + LFPDUX A9, BO, INC4 + FXCPMADD f9, A10, A8, f9 + nop + FXCSMADD f13, A10, A8, f13 + LFPDUX A10, BO2, INC4 + bdnz+ .L22 + .align 4 + +.L23: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f4, B1, A1, f4 + LFPDUX A8, AO2, INC4 + FXCPMADD f8, B2, A1, f8 + FXCSMADD f12, B2, A1, f12 + + FXCPMADD f1, B1, A2, f1 + FXCSMADD f5, B1, A2, f5 + FXCPMADD f9, B2, A2, f9 + FXCSMADD f13, B2, A2, f13 + + FXCPMADD f0, B3, A3, f0 + FXCSMADD f4, B3, A3, f4 + FXCPMADD f8, B4, A3, f8 + FXCSMADD f12, B4, A3, f12 + + FXCPMADD f1, B3, A4, f1 + FXCSMADD f5, B3, A4, f5 + FXCPMADD f9, B4, A4, f9 + FXCSMADD f13, B4, A4, f13 + + FXCPMADD f0, B5, A5, f0 + FXCSMADD f4, B5, A5, f4 + FXCPMADD f8, B6, A5, f8 + FXCSMADD f12, B6, A5, f12 + + FXCPMADD f1, B5, A6, f1 + FXCSMADD f5, B5, A6, f5 + FXCPMADD f9, B6, A6, f9 + FXCSMADD f13, B6, A6, f13 + + FXCPMADD f0, A9, A7, f0 + FXCSMADD f4, A9, A7, f4 + FXCPMADD f8, A10, A7, f8 + FXCSMADD f12, A10, A7, f12 + + FXCPMADD f1, A9, A8, f1 + FXCSMADD f5, A9, A8, f5 + FXCPMADD f9, A10, A8, f9 + FXCSMADD f13, A10, A8, f13 + .align 4 + +.L24: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L28 +#else + andi. 
r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L28 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + bdz- .L27 + .align 4 + +.L26: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f4, B1, A1, f4 + FXCPMADD f8, B2, A1, f8 + FXCSMADD f12, B2, A1, f12 + LFPDUX A1, AO, INC4 + + FXCPMADD f1, B1, A2, f1 + FXCSMADD f5, B1, A2, f5 + LFPDUX B1, BO, INC4 + FXCPMADD f9, B2, A2, f9 + FXCSMADD f13, B2, A2, f13 + LFPDUX A2, AO2, INC4 + LFPDUX B2, BO2, INC4 + bdnz+ .L26 + .align 4 + +.L27: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f4, B1, A1, f4 + FXCPMADD f8, B2, A1, f8 + FXCSMADD f12, B2, A1, f12 + + FXCPMADD f1, B1, A2, f1 + FXCSMADD f5, B1, A2, f5 + FXCPMADD f9, B2, A2, f9 + FXCSMADD f13, B2, A2, f13 + .align 4 + +.L28: + fpadd f0, f0, f4 + fpadd f8, f8, f12 + fpadd f1, f1, f5 + fpadd f9, f9, f13 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 2 +#endif + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 + addi AO2, AO, 2 * SIZE + addi BO, BO, - 4 * SIZE + addi BO2, BO, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + LFPDUX f16, BO, INC4 + LFPDUX f18, BO2, INC4 + LFPDUX f17, BO, INC4 + LFPDUX f19, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE +#else + LFPDUX f16, AO, INC4 + LFPDUX f17, AO2, INC4 + LFPDUX f18, AO, INC4 + LFPDUX f19, AO2, INC4 + + subi AO, AO, 8 * SIZE + subi AO2, AO2, 8 * SIZE +#endif + + fpsub f0, f16, f0 + fpsub f1, f17, f1 + fpsub f8, f18, f8 + fpsub f9, f19, f9 + +#ifdef LN + LFPDUX A1, AO, INC4 + add AO2, AO2, INC4 + LFPDUX A2, AO, INC4 + LFPDUX A3, AO2, INC4 + + subi AO, AO, 8 * SIZE + subi AO2, AO2, 8 * SIZE + + fxpmul f4, A3, f1 + fxpmul f5, A3, f9 + FXCXNPMA f1, A3, f1, f4 + FXCXNPMA f9, A3, f9, f5 + + fxcpnmsub f0, A2, f1, f0 + fxcpnmsub f8, A2, f9, f8 + FXCXNSMA f0, A2, f1, f0 + FXCXNSMA f8, A2, f9, f8 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f8 + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f8, A1, f8, f5 +#endif + +#ifdef LT + LFPDUX A1, AO, INC4 + LFPDUX A2, AO2, INC4 + add AO, AO, INC4 + LFPDUX A3, AO2, INC4 + + subi AO, AO, 8 * SIZE + subi AO2, AO2, 8 * SIZE + + fxpmul f4, A1, f0 + fxpmul f5, A1, f8 + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f8, A1, f8, f5 + + fxcpnmsub f1, A2, f0, f1 + fxcpnmsub f9, A2, f8, f9 + FXCXNSMA f1, A2, f0, f1 + FXCXNSMA f9, A2, f8, f9 + + fxpmul f6, A3, f1 + fxpmul f7, A3, f9 + FXCXNPMA f1, A3, f1, f6 + FXCXNPMA f9, A3, f9, f7 +#endif + +#ifdef RN + LFPDUX A1, BO, INC4 + LFPDUX A2, BO2, INC4 + add BO, BO, INC4 + LFPDUX A3, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE + + fxpmul f4, A1, f0 + fxpmul f5, A1, f1 + + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f1, A1, f1, f5 + + fxcpnmsub f8, A2, f0, f8 + fxcpnmsub f9, A2, f1, f9 + + FXCXNSMA f8, A2, f0, f8 + FXCXNSMA f9, A2, f1, f9 + + fxpmul f4, A3, f8 + fxpmul f5, A3, f9 + + FXCXNPMA f8, A3, f8, f4 + FXCXNPMA f9, A3, f9, f5 +#endif + +#ifdef RT + LFPDUX A1, BO, INC4 + add BO2, BO2, INC4 + LFPDUX A2, BO, INC4 + LFPDUX A3, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE + + fxpmul f4, A3, f8 + fxpmul f5, A3, f9 + + FXCXNPMA f8, A3, f8, f4 + FXCXNPMA f9, A3, f9, f5 + + fxcpnmsub f0, A2, f8, f0 + fxcpnmsub f1, A2, f9, f1 + + FXCXNSMA f0, A2, f8, f0 + FXCXNSMA f1, A2, f9, f1 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f1 + + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f1, A1, f1, f5 +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDUX f0, BO, INC4 + STFPDUX f8, BO2, INC4 + STFPDUX f1, BO, INC4 + STFPDUX f9, BO2, INC4 + + subi BO, BO, 8 * SIZE 
+ subi BO2, BO2, 8 * SIZE +#else + STFPDUX f0, AO, INC4 + STFPDUX f1, AO2, INC4 + STFPDUX f8, AO, INC4 + STFPDUX f9, AO2, INC4 + + subi AO, AO, 8 * SIZE + subi AO2, AO2, 8 * SIZE +#endif + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f1, CO1, INC + STFSDUX f1, CO1, INC + + STFDUX f8, CO2, INC + STFSDUX f8, CO2, INC + STFDUX f9, CO2, INC + STFSDUX f9, CO2, INC + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, r0 +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L30: + andi. I, M, 1 + beq .L49 + +#if defined(LT) || defined(RN) + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, B, - 4 * SIZE + fpmr f2, f0 + addi BO2, B, - 2 * SIZE + fpmr f3, f0 + + srawi. r0, KK, 2 + mtspr CTR, r0 + ble .L34 +#else +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0 , KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + addi AO2, AO, 2 * SIZE + fpmr f1, f0 + addi BO, BO, - 4 * SIZE + fpmr f2, f0 + addi BO2, BO, 2 * SIZE + fpmr f3, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 + ble .L34 +#endif + + LFPDUX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + LFPDUX A2, AO2, INC4 + LFPDUX B3, BO, INC4 + LFPDUX B4, BO2, INC4 + + LFPDUX A3, AO, INC4 + LFPDUX A5, BO, INC4 + LFPDUX A6, BO2, INC4 + LFPDUX A4, AO2, INC4 + LFPDUX A7, BO, INC4 + LFPDUX A8, BO2, INC4 + bdz- .L33 + .align 4 + +.L32: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + LFPDUX B1, BO, INC4 + FXCPMADD f2, B2, A1, f2 + FXCSMADD f3, B2, A1, f3 + LFPDUX B2, BO2, INC4 + LFPDUX A1, AO, INC4 + + FXCPMADD f0, B3, A2, f0 + FXCSMADD f1, B3, A2, f1 + LFPDUX B3, BO, INC4 + FXCPMADD f2, B4, A2, f2 + FXCSMADD f3, B4, A2, f3 + LFPDUX B4, BO2, INC4 + LFPDUX A2, AO2, INC4 + + FXCPMADD f0, A5, A3, f0 + FXCSMADD f1, A5, A3, f1 + LFPDUX A5, BO, INC4 + FXCPMADD f2, A6, A3, f2 + FXCSMADD f3, A6, A3, f3 + LFPDUX A6, BO2, INC4 + LFPDUX A3, AO, INC4 + + FXCPMADD f0, A7, A4, f0 + FXCSMADD f1, A7, A4, f1 + LFPDUX A7, BO, INC4 + FXCPMADD f2, A8, A4, f2 + FXCSMADD f3, A8, A4, f3 + LFPDUX A8, BO2, INC4 + LFPDUX A4, AO2, INC4 + bdnz+ .L32 + .align 4 + +.L33: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + FXCPMADD f2, B2, A1, f2 + FXCSMADD f3, B2, A1, f3 + + FXCPMADD f0, B3, A2, f0 + FXCSMADD f1, B3, A2, f1 + FXCPMADD f2, B4, A2, f2 + FXCSMADD f3, B4, A2, f3 + + FXCPMADD f0, A5, A3, f0 + FXCSMADD f1, A5, A3, f1 + FXCPMADD f2, A6, A3, f2 + FXCSMADD f3, A6, A3, f3 + + FXCPMADD f0, A7, A4, f0 + FXCSMADD f1, A7, A4, f1 + FXCPMADD f2, A8, A4, f2 + FXCSMADD f3, A8, A4, f3 + .align 4 + +.L34: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 + mtspr CTR, r0 + ble+ .L38 +#else + andi. 
r0, TEMP, 3 + mtspr CTR, r0 + ble+ .L38 +#endif + + LFPDX A1, AO, INC4 + LFPDUX B1, BO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC2 + bdz- .L37 + .align 4 + +.L36: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + LFPDUX B1, BO, INC4 + FXCPMADD f2, B2, A1, f2 + FXCSMADD f3, B2, A1, f3 + LFPDX A1, AO, INC4 + LFPDUX B2, BO2, INC4 + add AO, AO, INC2 + bdnz+ .L36 + .align 4 + +.L37: + FXCPMADD f0, B1, A1, f0 + FXCSMADD f1, B1, A1, f1 + FXCPMADD f2, B2, A1, f2 + FXCSMADD f3, B2, A1, f3 + .align 4 + +.L38: + fpadd f0, f0, f1 + fpadd f2, f2, f3 + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + ZBASE_SHIFT + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 + addi BO, BO, - 4 * SIZE +#endif + + addi AO2, AO, 2 * SIZE + addi BO2, BO, 2 * SIZE + +#if defined(LN) || defined(LT) + LFPDX f16, BO, INC4 + LFPDX f17, BO2, INC4 +#else + LFPDX f16, AO, INC4 + LFPDX f17, AO2, INC4 +#endif + + fpsub f0, f16, f0 + fpsub f2, f17, f2 + +#ifdef LN + LFPDX A1, AO, INC4 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f2 + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f2, A1, f2, f5 +#endif + +#ifdef LT + LFPDX A1, AO, INC4 + + fxpmul f4, A1, f0 + fxpmul f5, A1, f2 + FXCXNPMA f0, A1, f0, f4 + FXCXNPMA f2, A1, f2, f5 +#endif + +#ifdef RN + LFPDUX A1, BO, INC4 + LFPDUX A2, BO2, INC4 + add BO, BO, INC4 + LFPDUX A3, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 + + fxcpnmsub f2, A2, f0, f2 + FXCXNSMA f2, A2, f0, f2 + + fxpmul f4, A3, f2 + FXCXNPMA f2, A3, f2, f4 +#endif + +#ifdef RT + LFPDUX A1, BO, INC4 + add BO2, BO2, INC4 + LFPDUX A2, BO, INC4 + LFPDUX A3, BO2, INC4 + + subi BO, BO, 8 * SIZE + subi BO2, BO2, 8 * SIZE + + fxpmul f4, A3, f2 + FXCXNPMA f2, A3, f2, f4 + + fxcpnmsub f0, A2, f2, f0 + FXCXNSMA f0, A2, f2, f0 + + fxpmul f4, A1, f0 + FXCXNPMA f0, A1, f0, f4 +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFPDX f0, BO, INC4 + STFPDX f2, BO2, INC4 +#else + STFPDX f0, AO, INC4 + STFPDX f2, AO2, INC4 +#endif + + STFDUX f0, CO1, INC + STFSDUX f0, CO1, INC + STFDUX f2, CO2, INC + STFSDUX f2, CO2, INC + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + + li r0, FZERO + lfpsx f0, SP, r0 + .align 4 + +.L49: +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + addi B, BO, 4 * SIZE +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + + addic. 
J, J, -1 + bgt+ .L10 + .align 4 + +.L999: + addi SP, SP, 20 + + lwzu r14, 4(SP) + lwzu r15, 4(SP) + + lwzu r16, 4(SP) + lwzu r17, 4(SP) + lwzu r18, 4(SP) + lwzu r19, 4(SP) + + lwzu r20, 4(SP) + lwzu r21, 4(SP) + lwzu r22, 4(SP) + lwzu r23, 4(SP) + + lwzu r24, 4(SP) + lwzu r25, 4(SP) + lwzu r26, 4(SP) + lwzu r27, 4(SP) + + lwzu r28, 4(SP) + lwzu r29, 4(SP) + lwzu r30, 4(SP) + lwzu r31, 4(SP) + + subi SP, SP, 12 + li r0, 16 + + lfpdux f31, SP, r0 + lfpdux f30, SP, r0 + lfpdux f29, SP, r0 + lfpdux f28, SP, r0 + lfpdux f27, SP, r0 + lfpdux f26, SP, r0 + lfpdux f25, SP, r0 + lfpdux f24, SP, r0 + lfpdux f23, SP, r0 + lfpdux f22, SP, r0 + lfpdux f21, SP, r0 + lfpdux f20, SP, r0 + lfpdux f19, SP, r0 + lfpdux f18, SP, r0 + lfpdux f17, SP, r0 + lfpdux f16, SP, r0 + lfpdux f15, SP, r0 + lfpdux f14, SP, r0 + addi SP, SP, 16 + blr + .align 4 + + + EPILOGUE +#endif diff --git a/kernel/power/ztrsm_kernel_power6_LN.S b/kernel/power/ztrsm_kernel_power6_LN.S new file mode 100644 index 0000000000..7a3b286366 --- /dev/null +++ b/kernel/power/ztrsm_kernel_power6_LN.S @@ -0,0 +1,4720 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define FZERO 312(SP) +#else +#define STACKSIZE 256 +#define FZERO 240(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#define OFFSET r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#define AORIG r19 +#define TEMP r20 +#define KK r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO1 r26 +#define CO2 r27 +#define CO3 r28 +#define CO4 r29 + +#define PREA r30 +#define PREC r31 + +#ifndef CONJ +#define FMA1 FMADD +#define FMA2 FMADD +#define FMA3 FNMSUB +#define FMA4 FMADD +#elif defined(LN) || defined(LT) +#define FMA1 FMADD +#define FMA2 FMADD +#define FMA3 FMADD +#define FMA4 FNMSUB +#else +#define FMA1 FMADD +#define FMA2 FNMSUB +#define FMA3 FMADD +#define FMA4 FMADD +#endif + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) +#endif + + stw r0, FZERO + +#ifdef linux +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, 56 + STACKSIZE(SP) + lwz C, 60 + STACKSIZE(SP) + lwz LDC, 64 + STACKSIZE(SP) +#else + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 120 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 120 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 68 + STACKSIZE(SP) +#else + lwz OFFSET, 60 + STACKSIZE(SP) +#endif +#endif +#endif + + slwi LDC, LDC, ZBASE_SHIFT + +#ifdef LN + mullw r0, M, K + slwi r0, r0, ZBASE_SHIFT + add A, A, r0 + + slwi r0, M, ZBASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, ZBASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + + li PREA, 48 * SIZE + li PREC, -4 * SIZE + + srawi. 
J, N, 2 + ble LL(30) + .align 4 + +LL(10): +#ifdef RT + slwi r0, K, 2 + ZBASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 2 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO4, LDC +#endif + + andi. I, M, 1 + ble LL(20) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 2 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(25) + .align 4 + +LL(22): + FMA1 f0, f16, f20, f0 + FMA4 f3, f17, f20, f3 + FMA2 f1, f16, f21, f1 + FMA3 f2, f17, f21, f2 + + LFD f28, 4 * SIZE(AO) + LFD f29, 5 * SIZE(AO) + LFD f30, 6 * SIZE(AO) + LFD f31, 7 * SIZE(AO) + + FMA1 f4, f16, f22, f4 + FMA4 f7, f17, f22, f7 + FMA2 f5, f16, f23, f5 + FMA3 f6, f17, f23, f6 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMA1 f8, f16, f24, f8 + FMA4 f11, f17, f24, f11 + FMA2 f9, f16, f25, f9 + FMA3 f10, f17, f25, f10 + + FMA1 f12, f16, f26, f12 + FMA4 f15, f17, f26, f15 + FMA2 f13, f16, f27, f13 + FMA3 f14, f17, f27, f14 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMA1 f0, f18, f20, f0 + FMA4 f3, f19, f20, f3 + FMA2 f1, f18, f21, f1 + FMA3 f2, f19, f21, f2 + + FMA1 f4, f18, f22, f4 + FMA4 f7, f19, f22, f7 + FMA2 f5, f18, f23, f5 + FMA3 f6, f19, f23, f6 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMA1 f8, f18, f24, f8 + FMA4 f11, f19, f24, f11 + FMA2 f9, f18, f25, f9 + FMA3 f10, f19, f25, f10 + + FMA1 f12, f18, f26, f12 + FMA4 f15, f19, f26, f15 + FMA2 f13, f18, f27, f13 + FMA3 f14, f19, f27, f14 + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + FMA1 f0, f28, f20, f0 + FMA4 f3, f29, f20, f3 + FMA2 f1, f28, f21, f1 + FMA3 f2, f29, f21, f2 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMA1 f4, f28, f22, f4 + FMA4 f7, f29, f22, f7 + FMA2 f5, f28, f23, f5 + FMA3 f6, f29, f23, f6 + + LFD f20, 24 * SIZE(BO) + LFD f21, 25 * SIZE(BO) + LFD f22, 26 * SIZE(BO) + LFD f23, 27 * SIZE(BO) + + FMA1 f8, f28, f24, f8 + FMA4 f11, f29, f24, f11 + FMA2 f9, f28, f25, f9 + FMA3 f10, f29, f25, f10 + + FMA1 f12, f28, f26, f12 + FMA4 f15, f29, f26, f15 + FMA2 f13, f28, f27, f13 + FMA3 f14, f29, f27, f14 + + LFD f24, 28 * SIZE(BO) + LFD 
f25, 29 * SIZE(BO) + LFD f26, 30 * SIZE(BO) + LFD f27, 31 * SIZE(BO) + + FMA1 f0, f30, f20, f0 + FMA4 f3, f31, f20, f3 + FMA2 f1, f30, f21, f1 + FMA3 f2, f31, f21, f2 + + FMA1 f4, f30, f22, f4 + FMA4 f7, f31, f22, f7 + FMA2 f5, f30, f23, f5 + FMA3 f6, f31, f23, f6 + + LFD f20, 32 * SIZE(BO) + LFD f21, 33 * SIZE(BO) + LFD f22, 34 * SIZE(BO) + LFD f23, 35 * SIZE(BO) + + FMA1 f8, f30, f24, f8 + FMA4 f11, f31, f24, f11 + FMA2 f9, f30, f25, f9 + FMA3 f10, f31, f25, f10 + + FMA1 f12, f30, f26, f12 + FMA4 f15, f31, f26, f15 + FMA2 f13, f30, f27, f13 + FMA3 f14, f31, f27, f14 + + LFD f24, 36 * SIZE(BO) + LFD f25, 37 * SIZE(BO) + LFD f26, 38 * SIZE(BO) + LFD f27, 39 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 32 * SIZE + bdnz LL(22) + .align 4 + +LL(25): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble LL(27) + .align 4 + +LL(26): + FMA1 f0, f16, f20, f0 + FMA4 f3, f17, f20, f3 + FMA2 f1, f16, f21, f1 + FMA3 f2, f17, f21, f2 + + FMA1 f4, f16, f22, f4 + FMA4 f7, f17, f22, f7 + FMA2 f5, f16, f23, f5 + FMA3 f6, f17, f23, f6 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMA1 f8, f16, f24, f8 + FMA4 f11, f17, f24, f11 + FMA2 f9, f16, f25, f9 + FMA3 f10, f17, f25, f10 + + FMA1 f12, f16, f26, f12 + FMA4 f15, f17, f26, f15 + FMA2 f13, f16, f27, f13 + FMA3 f14, f17, f27, f14 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(26) + .align 4 + +LL(27): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 0 + ZBASE_SHIFT + slwi r0, r0, 2 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + + FADD f0, f0, f2 + FADD f1, f1, f3 + FADD f4, f4, f6 + FADD f5, f5, f7 + + FADD f8, f8, f10 + FADD f9, f9, f11 + FADD f12, f12, f14 + FADD f13, f13, f15 + + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f4, f18, f4 + FSUB f5, f19, f5 + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f8, f20, f8 + FSUB f9, f21, f9 + FSUB f12, f22, f12 + FSUB f13, f23, f13 + +#else + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f4, f20, f4 + FSUB f5, f21, f5 + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f28, 6 * SIZE(AO) + LFD f29, 7 * SIZE(AO) + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f12, f28, f12 + FSUB f13, f29, f13 +#endif + +#ifdef LN + LFD f28, 0 * SIZE(AO) + LFD f29, 1 * SIZE(AO) + + FMUL f16, f29, f1 + FMUL f17, f29, f0 + FMUL f18, f29, f5 + FMUL f19, f29, f4 + + FMUL f20, f29, f9 + FMUL f21, f29, f8 + FMUL f22, f29, f13 + FMUL f23, f29, f12 + +#ifndef CONJ + FMSUB f0, f28, f0, f16 + FMADD f1, f28, f1, f17 + FMSUB f4, f28, f4, f18 + FMADD f5, f28, f5, f19 + + FMSUB f8, f28, f8, f20 + FMADD f9, f28, f9, f21 + FMSUB f12, f28, f12, f22 + FMADD f13, f28, f13, f23 +#else + + FMADD f0, f28, f0, f16 + FMSUB f1, f28, f1, f17 + FMADD f4, f28, f4, f18 + FMSUB f5, f28, f5, f19 + + FMADD f8, f28, f8, f20 + FMSUB f9, f28, f9, f21 + FMADD f12, f28, f12, f22 + FMSUB f13, f28, f13, f23 +#endif +#endif + +#ifdef LT + LFD f24, 0 * SIZE(AO) + LFD f25, 1 * SIZE(AO) + + FMUL f16, f25, f1 + FMUL f17, f25, f0 + FMUL 
f18, f25, f5 + FMUL f19, f25, f4 + + FMUL f20, f25, f9 + FMUL f21, f25, f8 + FMUL f22, f25, f13 + FMUL f23, f25, f12 + +#ifndef CONJ + + FMSUB f0, f24, f0, f16 + FMADD f1, f24, f1, f17 + FMSUB f4, f24, f4, f18 + FMADD f5, f24, f5, f19 + + FMSUB f8, f24, f8, f20 + FMADD f9, f24, f9, f21 + FMSUB f12, f24, f12, f22 + FMADD f13, f24, f13, f23 + +#else + + + FMADD f0, f24, f0, f16 + FMSUB f1, f24, f1, f17 + FMADD f4, f24, f4, f18 + FMSUB f5, f24, f5, f19 + + FMADD f8, f24, f8, f20 + FMSUB f9, f24, f9, f21 + FMADD f12, f24, f12, f22 + FMSUB f13, f24, f13, f23 + +#endif +#endif + +#ifdef RN + LFD f24, 0 * SIZE(BO) + LFD f25, 1 * SIZE(BO) + LFD f26, 2 * SIZE(BO) + LFD f27, 3 * SIZE(BO) + LFD f28, 4 * SIZE(BO) + LFD f29, 5 * SIZE(BO) + LFD f30, 6 * SIZE(BO) + LFD f31, 7 * SIZE(BO) + + FMUL f16, f25, f1 + FMUL f17, f25, f0 + +#ifndef CONJ + + FMSUB f0, f24, f0, f16 + FMADD f1, f24, f1, f17 + + FMADD f4, f27, f1, f4 + FNMSUB f5, f27, f0, f5 + FNMSUB f4, f26, f0, f4 + FNMSUB f5, f26, f1, f5 + + FMADD f8, f29, f1, f8 + FNMSUB f9, f29, f0, f9 + FNMSUB f8, f28, f0, f8 + FNMSUB f9, f28, f1, f9 + + FMADD f12, f31, f1, f12 + FNMSUB f13, f31, f0, f13 + FNMSUB f12, f30, f0, f12 + FNMSUB f13, f30, f1, f13 + + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FMUL f16, f27, f5 + FMUL f17, f27, f4 + FMSUB f4, f26, f4, f16 + FMADD f5, f26, f5, f17 + + FMADD f8, f29, f5, f8 + FNMSUB f9, f29, f4, f9 + FNMSUB f8, f28, f4, f8 + FNMSUB f9, f28, f5, f9 + + FMADD f12, f31, f5, f12 + FNMSUB f13, f31, f4, f13 + FNMSUB f12, f30, f4, f12 + FNMSUB f13, f30, f5, f13 + + LFD f26, 20 * SIZE(BO) + LFD f27, 21 * SIZE(BO) + LFD f28, 22 * SIZE(BO) + LFD f29, 23 * SIZE(BO) + LFD f30, 30 * SIZE(BO) + LFD f31, 31 * SIZE(BO) + + FMUL f16, f27, f9 + FMUL f17, f27, f8 + FMSUB f8, f26, f8, f16 + FMADD f9, f26, f9, f17 + + FMADD f12, f29, f9, f12 + FNMSUB f13, f29, f8, f13 + FNMSUB f12, f28, f8, f12 + FNMSUB f13, f28, f9, f13 + + FMUL f16, f31, f13 + FMUL f17, f31, f12 + FMSUB f12, f30, f12, f16 + FMADD f13, f30, f13, f17 + +#else + + FMADD f0, f24, f0, f16 + FMSUB f1, f24, f1, f17 + + FMSUB f4, f27, f1, f4 + FNMADD f5, f27, f0, f5 + FNMADD f4, f26, f0, f4 + FNMADD f5, f26, f1, f5 + + FMSUB f8, f29, f1, f8 + FNMADD f9, f29, f0, f9 + FNMADD f8, f28, f0, f8 + FNMADD f9, f28, f1, f9 + + FMSUB f12, f31, f1, f12 + FNMADD f13, f31, f0, f13 + FNMADD f12, f30, f0, f12 + FNMADD f13, f30, f1, f13 + + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FMUL f16, f27, f5 + FMUL f17, f27, f4 + FMADD f4, f26, f4, f16 + FMSUB f5, f26, f5, f17 + + FMSUB f8, f29, f5, f8 + FNMADD f9, f29, f4, f9 + FNMADD f8, f28, f4, f8 + FNMADD f9, f28, f5, f9 + + FMSUB f12, f31, f5, f12 + FNMADD f13, f31, f4, f13 + FNMADD f12, f30, f4, f12 + FNMADD f13, f30, f5, f13 + + LFD f26, 20 * SIZE(BO) + LFD f27, 21 * SIZE(BO) + LFD f28, 22 * SIZE(BO) + LFD f29, 23 * SIZE(BO) + LFD f30, 30 * SIZE(BO) + LFD f31, 31 * SIZE(BO) + + FMUL f16, f27, f9 + FMUL f17, f27, f8 + FMADD f8, f26, f8, f16 + FMSUB f9, f26, f9, f17 + + FMSUB f12, f29, f9, f12 + FNMADD f13, f29, f8, f13 + FNMADD f12, f28, f8, f12 + FNMADD f13, f28, f9, f13 + + FMUL f16, f31, f13 + FMUL f17, f31, f12 + FMADD f12, f30, f12, f16 + FMSUB f13, f30, f13, f17 +#endif + +#endif + +#ifdef RT + LFD f24, 30 * SIZE(BO) + LFD f25, 31 * SIZE(BO) + LFD f26, 28 * SIZE(BO) + LFD f27, 29 * SIZE(BO) + LFD f28, 26 * SIZE(BO) + LFD f29, 
27 * SIZE(BO) + LFD f30, 24 * SIZE(BO) + LFD f31, 25 * SIZE(BO) + + FMUL f16, f25, f13 + FMUL f17, f25, f12 + +#ifndef CONJ + + FMSUB f12, f24, f12, f16 + FMADD f13, f24, f13, f17 + + FMADD f8, f27, f13, f8 + FNMSUB f9, f27, f12, f9 + FNMSUB f8, f26, f12, f8 + FNMSUB f9, f26, f13, f9 + + FMADD f4, f29, f13, f4 + FNMSUB f5, f29, f12, f5 + FNMSUB f4, f28, f12, f4 + FNMSUB f5, f28, f13, f5 + + FMADD f0, f31, f13, f0 + FNMSUB f1, f31, f12, f1 + FNMSUB f0, f30, f12, f0 + FNMSUB f1, f30, f13, f1 + + LFD f26, 20 * SIZE(BO) + LFD f27, 21 * SIZE(BO) + LFD f28, 18 * SIZE(BO) + LFD f29, 19 * SIZE(BO) + LFD f30, 16 * SIZE(BO) + LFD f31, 17 * SIZE(BO) + + FMUL f16, f27, f9 + FMUL f17, f27, f8 + FMSUB f8, f26, f8, f16 + FMADD f9, f26, f9, f17 + + FMADD f4, f29, f9, f4 + FNMSUB f5, f29, f8, f5 + FNMSUB f4, f28, f8, f4 + FNMSUB f5, f28, f9, f5 + + FMADD f0, f31, f9, f0 + FNMSUB f1, f31, f8, f1 + FNMSUB f0, f30, f8, f0 + FNMSUB f1, f30, f9, f1 + + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + LFD f28, 8 * SIZE(BO) + LFD f29, 9 * SIZE(BO) + LFD f30, 0 * SIZE(BO) + LFD f31, 1 * SIZE(BO) + + FMUL f16, f27, f5 + FMUL f17, f27, f4 + FMSUB f4, f26, f4, f16 + FMADD f5, f26, f5, f17 + + FMADD f0, f29, f5, f0 + FNMSUB f1, f29, f4, f1 + FNMSUB f0, f28, f4, f0 + FNMSUB f1, f28, f5, f1 + + FMUL f16, f31, f1 + FMUL f17, f31, f0 + FMSUB f0, f30, f0, f16 + FMADD f1, f30, f1, f17 + +#else + FMADD f12, f24, f12, f16 + FMSUB f13, f24, f13, f17 + + FMSUB f8, f27, f13, f8 + FNMADD f9, f27, f12, f9 + FNMADD f8, f26, f12, f8 + FNMADD f9, f26, f13, f9 + + FMSUB f4, f29, f13, f4 + FNMADD f5, f29, f12, f5 + FNMADD f4, f28, f12, f4 + FNMADD f5, f28, f13, f5 + + FMSUB f0, f31, f13, f0 + FNMADD f1, f31, f12, f1 + FNMADD f0, f30, f12, f0 + FNMADD f1, f30, f13, f1 + + LFD f26, 20 * SIZE(BO) + LFD f27, 21 * SIZE(BO) + LFD f28, 18 * SIZE(BO) + LFD f29, 19 * SIZE(BO) + LFD f30, 16 * SIZE(BO) + LFD f31, 17 * SIZE(BO) + + FMUL f16, f27, f9 + FMUL f17, f27, f8 + FMADD f8, f26, f8, f16 + FMSUB f9, f26, f9, f17 + + FMSUB f4, f29, f9, f4 + FNMADD f5, f29, f8, f5 + FNMADD f4, f28, f8, f4 + FNMADD f5, f28, f9, f5 + + FMSUB f0, f31, f9, f0 + FNMADD f1, f31, f8, f1 + FNMADD f0, f30, f8, f0 + FNMADD f1, f30, f9, f1 + + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + LFD f28, 8 * SIZE(BO) + LFD f29, 9 * SIZE(BO) + LFD f30, 0 * SIZE(BO) + LFD f31, 1 * SIZE(BO) + + FMUL f16, f27, f5 + FMUL f17, f27, f4 + FMADD f4, f26, f4, f16 + FMSUB f5, f26, f5, f17 + + FMSUB f0, f29, f5, f0 + FNMADD f1, f29, f4, f1 + FNMADD f0, f28, f4, f0 + FNMADD f1, f28, f5, f1 + + FMUL f16, f31, f1 + FMUL f17, f31, f0 + FMADD f0, f30, f0, f16 + FMSUB f1, f30, f1, f17 + +#endif +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE + subi CO3, CO3, 2 * SIZE + subi CO4, CO4, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f4, 2 * SIZE(BO) + STFD f5, 3 * SIZE(BO) + STFD f8, 4 * SIZE(BO) + STFD f9, 5 * SIZE(BO) + STFD f12, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f4, 2 * SIZE(AO) + STFD f5, 3 * SIZE(AO) + STFD f8, 4 * SIZE(AO) + STFD f9, 5 * SIZE(AO) + STFD f12, 6 * SIZE(AO) + STFD f13, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + + fmr f8, f0 + fmr f9, 
f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + addi CO3, CO3, 2 * SIZE + addi CO4, CO4, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + ZBASE_SHIFT + slwi TEMP, TEMP, 2 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +LL(20): + srawi. I, M, 1 + ble LL(29) + .align 4 + +LL(11): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f20, 0 * SIZE(B) + LFD f17, 1 * SIZE(AO) + LFD f21, 1 * SIZE(B) + LFD f18, 2 * SIZE(AO) + LFD f22, 2 * SIZE(B) + LFD f19, 3 * SIZE(AO) + LFD f23, 3 * SIZE(B) + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + dcbtst CO1, PREC + dcbtst CO2, PREC + dcbtst CO3, PREC + dcbtst CO4, PREC + + srawi. r0, KK, 3 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + ZBASE_SHIFT + slwi TEMP, KK, 2 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f20, 0 * SIZE(BO) + LFD f17, 1 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + LFD f18, 2 * SIZE(AO) + LFD f22, 2 * SIZE(BO) + LFD f19, 3 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + dcbtst CO1, PREC + dcbtst CO2, PREC + dcbtst CO3, PREC + dcbtst CO4, PREC + + srawi. r0, TEMP, 3 + mtspr CTR, r0 +#endif + ble LL(15) + .align 4 + +LL(12): + dcbt AO, PREA + dcbtst BO, PREA + + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + LFD f28, 4 * SIZE(AO) + LFD f29, 5 * SIZE(AO) + LFD f30, 6 * SIZE(AO) + LFD f31, 7 * SIZE(AO) + + FMA1 f4, f16, f22, f4 + FMA1 f6, f18, f22, f6 + FMA2 f5, f16, f23, f5 + FMA2 f7, f18, f23, f7 + + FMA1 f8, f16, f24, f8 + FMA1 f10, f18, f24, f10 + FMA2 f9, f16, f25, f9 + FMA2 f11, f18, f25, f11 + + FMA1 f12, f16, f26, f12 + FMA1 f14, f18, f26, f14 + FMA2 f13, f16, f27, f13 + FMA2 f15, f18, f27, f15 + + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 + + FMA4 f5, f17, f22, f5 + FMA4 f7, f19, f22, f7 + FMA3 f4, f17, f23, f4 + FMA3 f6, f19, f23, f6 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMA4 f9, f17, f24, f9 + FMA4 f11, f19, f24, f11 + FMA3 f8, f17, f25, f8 + FMA3 f10, f19, f25, f10 + + FMA4 f13, f17, f26, f13 + FMA4 f15, f19, f26, f15 + FMA3 f12, f17, f27, f12 + FMA3 f14, f19, f27, f14 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMA1 f0, f28, f20, f0 + FMA1 f2, f30, f20, f2 + FMA2 f1, f28, f21, f1 + FMA2 f3, f30, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMA1 f4, f28, f22, f4 + FMA1 f6, f30, f22, f6 + FMA2 f5, f28, f23, f5 + FMA2 f7, f30, f23, f7 + + FMA1 f8, f28, f24, f8 + FMA1 f10, f30, f24, f10 + FMA2 f9, f28, f25, f9 + FMA2 f11, f30, f25, f11 + + FMA1 f12, f28, f26, f12 + FMA1 f14, f30, f26, f14 + FMA2 f13, f28, f27, f13 + FMA2 f15, f30, f27, f15 + + FMA4 f1, f29, f20, f1 + FMA4 f3, f31, f20, f3 + FMA3 f0, f29, f21, f0 + FMA3 f2, f31, f21, f2 + + FMA4 f5, f29, f22, f5 + FMA4 f7, f31, f22, f7 + FMA3 f4, f29, f23, f4 + FMA3 f6, f31, f23, f6 + + LFD f20, 16 
* SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMA4 f9, f29, f24, f9 + FMA4 f11, f31, f24, f11 + FMA3 f8, f29, f25, f8 + FMA3 f10, f31, f25, f10 + + FMA4 f13, f29, f26, f13 + FMA4 f15, f31, f26, f15 + FMA3 f12, f29, f27, f12 + FMA3 f14, f31, f27, f14 + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + LFD f28, 12 * SIZE(AO) + LFD f29, 13 * SIZE(AO) + LFD f30, 14 * SIZE(AO) + LFD f31, 15 * SIZE(AO) + + FMA1 f4, f16, f22, f4 + FMA1 f6, f18, f22, f6 + FMA2 f5, f16, f23, f5 + FMA2 f7, f18, f23, f7 + + FMA1 f8, f16, f24, f8 + FMA1 f10, f18, f24, f10 + FMA2 f9, f16, f25, f9 + FMA2 f11, f18, f25, f11 + + FMA1 f12, f16, f26, f12 + FMA1 f14, f18, f26, f14 + FMA2 f13, f16, f27, f13 + FMA2 f15, f18, f27, f15 + + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 + + FMA4 f5, f17, f22, f5 + FMA4 f7, f19, f22, f7 + FMA3 f4, f17, f23, f4 + FMA3 f6, f19, f23, f6 + + LFD f20, 24 * SIZE(BO) + LFD f21, 25 * SIZE(BO) + LFD f22, 26 * SIZE(BO) + LFD f23, 27 * SIZE(BO) + + FMA4 f9, f17, f24, f9 + FMA4 f11, f19, f24, f11 + FMA3 f8, f17, f25, f8 + FMA3 f10, f19, f25, f10 + + FMA4 f13, f17, f26, f13 + FMA4 f15, f19, f26, f15 + FMA3 f12, f17, f27, f12 + FMA3 f14, f19, f27, f14 + + LFD f24, 28 * SIZE(BO) + LFD f25, 29 * SIZE(BO) + LFD f26, 30 * SIZE(BO) + LFD f27, 31 * SIZE(BO) + + FMA1 f0, f28, f20, f0 + FMA1 f2, f30, f20, f2 + FMA2 f1, f28, f21, f1 + FMA2 f3, f30, f21, f3 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + FMA1 f4, f28, f22, f4 + FMA1 f6, f30, f22, f6 + FMA2 f5, f28, f23, f5 + FMA2 f7, f30, f23, f7 + + FMA1 f8, f28, f24, f8 + FMA1 f10, f30, f24, f10 + FMA2 f9, f28, f25, f9 + FMA2 f11, f30, f25, f11 + + FMA1 f12, f28, f26, f12 + FMA1 f14, f30, f26, f14 + FMA2 f13, f28, f27, f13 + FMA2 f15, f30, f27, f15 + + FMA4 f1, f29, f20, f1 + FMA4 f3, f31, f20, f3 + FMA3 f0, f29, f21, f0 + FMA3 f2, f31, f21, f2 + + FMA4 f5, f29, f22, f5 + FMA4 f7, f31, f22, f7 + FMA3 f4, f29, f23, f4 + FMA3 f6, f31, f23, f6 + + LFD f20, 32 * SIZE(BO) + LFD f21, 33 * SIZE(BO) + LFD f22, 34 * SIZE(BO) + LFD f23, 35 * SIZE(BO) + + FMA4 f9, f29, f24, f9 + FMA4 f11, f31, f24, f11 + FMA3 f8, f29, f25, f8 + FMA3 f10, f31, f25, f10 + + FMA4 f13, f29, f26, f13 + FMA4 f15, f31, f26, f15 + FMA3 f12, f29, f27, f12 + FMA3 f14, f31, f27, f14 + + LFD f24, 36 * SIZE(BO) + LFD f25, 37 * SIZE(BO) + LFD f26, 38 * SIZE(BO) + LFD f27, 39 * SIZE(BO) + + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + LFD f28, 20 * SIZE(AO) + LFD f29, 21 * SIZE(AO) + LFD f30, 22 * SIZE(AO) + LFD f31, 23 * SIZE(AO) + + FMA1 f4, f16, f22, f4 + FMA1 f6, f18, f22, f6 + FMA2 f5, f16, f23, f5 + FMA2 f7, f18, f23, f7 + + FMA1 f8, f16, f24, f8 + FMA1 f10, f18, f24, f10 + FMA2 f9, f16, f25, f9 + FMA2 f11, f18, f25, f11 + + FMA1 f12, f16, f26, f12 + FMA1 f14, f18, f26, f14 + FMA2 f13, f16, f27, f13 + FMA2 f15, f18, f27, f15 + + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 + + FMA4 f5, f17, f22, f5 + FMA4 f7, f19, f22, f7 + FMA3 f4, f17, f23, f4 + FMA3 f6, f19, f23, f6 + + LFD f20, 40 * SIZE(BO) + LFD f21, 41 * SIZE(BO) + LFD f22, 42 * SIZE(BO) + LFD f23, 43 * SIZE(BO) + + FMA4 f9, f17, f24, f9 + FMA4 f11, f19, f24, f11 + FMA3 f8, f17, f25, f8 + FMA3 f10, f19, f25, f10 + + FMA4 f13, f17, f26, 
f13 + FMA4 f15, f19, f26, f15 + FMA3 f12, f17, f27, f12 + FMA3 f14, f19, f27, f14 + + LFD f24, 44 * SIZE(BO) + LFD f25, 45 * SIZE(BO) + LFD f26, 46 * SIZE(BO) + LFD f27, 47 * SIZE(BO) + + FMA1 f0, f28, f20, f0 + FMA1 f2, f30, f20, f2 + FMA2 f1, f28, f21, f1 + FMA2 f3, f30, f21, f3 + + LFD f16, 24 * SIZE(AO) + LFD f17, 25 * SIZE(AO) + LFD f18, 26 * SIZE(AO) + LFD f19, 27 * SIZE(AO) + + FMA1 f4, f28, f22, f4 + FMA1 f6, f30, f22, f6 + FMA2 f5, f28, f23, f5 + FMA2 f7, f30, f23, f7 + + FMA1 f8, f28, f24, f8 + FMA1 f10, f30, f24, f10 + FMA2 f9, f28, f25, f9 + FMA2 f11, f30, f25, f11 + + FMA1 f12, f28, f26, f12 + FMA1 f14, f30, f26, f14 + FMA2 f13, f28, f27, f13 + FMA2 f15, f30, f27, f15 + + FMA4 f1, f29, f20, f1 + FMA4 f3, f31, f20, f3 + FMA3 f0, f29, f21, f0 + FMA3 f2, f31, f21, f2 + + FMA4 f5, f29, f22, f5 + FMA4 f7, f31, f22, f7 + FMA3 f4, f29, f23, f4 + FMA3 f6, f31, f23, f6 + + LFD f20, 48 * SIZE(BO) + LFD f21, 49 * SIZE(BO) + LFD f22, 50 * SIZE(BO) + LFD f23, 51 * SIZE(BO) + + FMA4 f9, f29, f24, f9 + FMA4 f11, f31, f24, f11 + FMA3 f8, f29, f25, f8 + FMA3 f10, f31, f25, f10 + + FMA4 f13, f29, f26, f13 + FMA4 f15, f31, f26, f15 + FMA3 f12, f29, f27, f12 + FMA3 f14, f31, f27, f14 + + LFD f24, 52 * SIZE(BO) + LFD f25, 53 * SIZE(BO) + LFD f26, 54 * SIZE(BO) + LFD f27, 55 * SIZE(BO) + + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + LFD f28, 28 * SIZE(AO) + LFD f29, 29 * SIZE(AO) + LFD f30, 30 * SIZE(AO) + LFD f31, 31 * SIZE(AO) + + FMA1 f4, f16, f22, f4 + FMA1 f6, f18, f22, f6 + FMA2 f5, f16, f23, f5 + FMA2 f7, f18, f23, f7 + + FMA1 f8, f16, f24, f8 + FMA1 f10, f18, f24, f10 + FMA2 f9, f16, f25, f9 + FMA2 f11, f18, f25, f11 + + FMA1 f12, f16, f26, f12 + FMA1 f14, f18, f26, f14 + FMA2 f13, f16, f27, f13 + FMA2 f15, f18, f27, f15 + + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 + + FMA4 f5, f17, f22, f5 + FMA4 f7, f19, f22, f7 + FMA3 f4, f17, f23, f4 + FMA3 f6, f19, f23, f6 + + LFD f20, 56 * SIZE(BO) + LFD f21, 57 * SIZE(BO) + LFD f22, 58 * SIZE(BO) + LFD f23, 59 * SIZE(BO) + + FMA4 f9, f17, f24, f9 + FMA4 f11, f19, f24, f11 + FMA3 f8, f17, f25, f8 + FMA3 f10, f19, f25, f10 + + FMA4 f13, f17, f26, f13 + FMA4 f15, f19, f26, f15 + FMA3 f12, f17, f27, f12 + FMA3 f14, f19, f27, f14 + + LFD f24, 60 * SIZE(BO) + LFD f25, 61 * SIZE(BO) + LFD f26, 62 * SIZE(BO) + LFD f27, 63 * SIZE(BO) + + FMA1 f0, f28, f20, f0 + FMA1 f2, f30, f20, f2 + FMA2 f1, f28, f21, f1 + FMA2 f3, f30, f21, f3 + + LFD f16, 32 * SIZE(AO) + LFD f17, 33 * SIZE(AO) + LFD f18, 34 * SIZE(AO) + LFD f19, 35 * SIZE(AO) + + FMA1 f4, f28, f22, f4 + FMA1 f6, f30, f22, f6 + FMA2 f5, f28, f23, f5 + FMA2 f7, f30, f23, f7 + + FMA1 f8, f28, f24, f8 + FMA1 f10, f30, f24, f10 + FMA2 f9, f28, f25, f9 + FMA2 f11, f30, f25, f11 + + FMA1 f12, f28, f26, f12 + FMA1 f14, f30, f26, f14 + FMA2 f13, f28, f27, f13 + FMA2 f15, f30, f27, f15 + + FMA4 f1, f29, f20, f1 + FMA4 f3, f31, f20, f3 + FMA3 f0, f29, f21, f0 + FMA3 f2, f31, f21, f2 + + FMA4 f5, f29, f22, f5 + FMA4 f7, f31, f22, f7 + FMA3 f4, f29, f23, f4 + FMA3 f6, f31, f23, f6 + + LFD f20, 64 * SIZE(BO) + LFD f21, 65 * SIZE(BO) + LFD f22, 66 * SIZE(BO) + LFD f23, 67 * SIZE(BO) + + FMA4 f9, f29, f24, f9 + FMA4 f11, f31, f24, f11 + FMA3 f8, f29, f25, f8 + FMA3 f10, f31, f25, f10 + + FMA4 f13, f29, f26, f13 + FMA4 f15, f31, f26, f15 + FMA3 f12, f29, f27, f12 + FMA3 f14, f31, f27, f14 + + LFD f24, 68 * SIZE(BO) + LFD f25, 69 * SIZE(BO) + LFD f26, 70 * SIZE(BO) + LFD f27, 71 * SIZE(BO) + + addi AO, AO, 32 * SIZE + 
addi BO, BO, 64 * SIZE + + bdnz LL(12) + .align 4 + +LL(15): +#if defined(LT) || defined(RN) + andi. r0, KK, 7 +#else + andi. r0, TEMP, 7 +#endif + mtspr CTR, r0 + ble LL(18) + .align 4 + +LL(16): + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + FMA1 f4, f16, f22, f4 + FMA1 f6, f18, f22, f6 + FMA2 f5, f16, f23, f5 + FMA2 f7, f18, f23, f7 + + FMA1 f8, f16, f24, f8 + FMA1 f10, f18, f24, f10 + FMA2 f9, f16, f25, f9 + FMA2 f11, f18, f25, f11 + + FMA1 f12, f16, f26, f12 + FMA1 f14, f18, f26, f14 + FMA2 f13, f16, f27, f13 + FMA2 f15, f18, f27, f15 + + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 + + FMA4 f5, f17, f22, f5 + FMA4 f7, f19, f22, f7 + FMA3 f4, f17, f23, f4 + FMA3 f6, f19, f23, f6 + + FMA4 f9, f17, f24, f9 + FMA4 f11, f19, f24, f11 + FMA3 f8, f17, f25, f8 + FMA3 f10, f19, f25, f10 + + FMA4 f13, f17, f26, f13 + FMA4 f15, f19, f26, f15 + FMA3 f12, f17, f27, f12 + FMA3 f14, f19, f27, f14 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 8 * SIZE + + bdnz LL(16) + .align 4 + +LL(18): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 1 + ZBASE_SHIFT + slwi r0, r0, 2 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f4, f18, f4 + FSUB f5, f19, f5 + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f8, f20, f8 + FSUB f9, f21, f9 + FSUB f12, f22, f12 + FSUB f13, f23, f13 + + LFD f24, 8 * SIZE(BO) + LFD f25, 9 * SIZE(BO) + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + + FSUB f2, f24, f2 + FSUB f3, f25, f3 + FSUB f6, f26, f6 + FSUB f7, f27, f7 + + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FSUB f10, f28, f10 + FSUB f11, f29, f11 + FSUB f14, f30, f14 + FSUB f15, f31, f15 + +#else + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + FSUB f4, f20, f4 + FSUB f5, f21, f5 + FSUB f6, f22, f6 + FSUB f7, f23, f7 + + LFD f24, 8 * SIZE(AO) + LFD f25, 9 * SIZE(AO) + LFD f26, 10 * SIZE(AO) + LFD f27, 11 * SIZE(AO) + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f10, f26, f10 + FSUB f11, f27, f11 + + LFD f28, 12 * SIZE(AO) + LFD f29, 13 * SIZE(AO) + LFD f30, 14 * SIZE(AO) + LFD f31, 15 * SIZE(AO) + + FSUB f12, f28, f12 + FSUB f13, f29, f13 + FSUB f14, f30, f14 + FSUB f15, f31, f15 +#endif + +#ifdef LN + LFD f24, 6 * SIZE(AO) + LFD f25, 7 * SIZE(AO) + LFD f26, 4 * SIZE(AO) + LFD f27, 5 * SIZE(AO) + LFD f28, 0 * SIZE(AO) + LFD f29, 1 * SIZE(AO) + + FMUL f16, f25, f3 + FMUL f17, f25, f2 + FMUL f18, f25, f7 + FMUL f19, f25, f6 + + FMUL f20, f25, f11 + FMUL f21, f25, f10 + FMUL f22, f25, f15 + FMUL f23, f25, f14 + +#ifndef CONJ + + FMSUB f2, f24, f2, f16 + FMADD f3, f24, f3, f17 + FMSUB f6, f24, f6, f18 + FMADD f7, f24, f7, f19 + + FMSUB f10, f24, f10, 
f20 + FMADD f11, f24, f11, f21 + FMSUB f14, f24, f14, f22 + FMADD f15, f24, f15, f23 + + FMADD f0, f27, f3, f0 + FNMSUB f1, f27, f2, f1 + FMADD f4, f27, f7, f4 + FNMSUB f5, f27, f6, f5 + + FMADD f8, f27, f11, f8 + FNMSUB f9, f27, f10, f9 + FMADD f12, f27, f15, f12 + FNMSUB f13, f27, f14, f13 + + FNMSUB f0, f26, f2, f0 + FNMSUB f1, f26, f3, f1 + FNMSUB f4, f26, f6, f4 + FNMSUB f5, f26, f7, f5 + + FNMSUB f8, f26, f10, f8 + FNMSUB f9, f26, f11, f9 + FNMSUB f12, f26, f14, f12 + FNMSUB f13, f26, f15, f13 + + FMUL f16, f29, f1 + FMUL f17, f29, f0 + FMUL f18, f29, f5 + FMUL f19, f29, f4 + + FMUL f20, f29, f9 + FMUL f21, f29, f8 + FMUL f22, f29, f13 + FMUL f23, f29, f12 + + FMSUB f0, f28, f0, f16 + FMADD f1, f28, f1, f17 + FMSUB f4, f28, f4, f18 + FMADD f5, f28, f5, f19 + + FMSUB f8, f28, f8, f20 + FMADD f9, f28, f9, f21 + FMSUB f12, f28, f12, f22 + FMADD f13, f28, f13, f23 +#else + + FMADD f2, f24, f2, f16 + FMSUB f3, f24, f3, f17 + FMADD f6, f24, f6, f18 + FMSUB f7, f24, f7, f19 + + FMADD f10, f24, f10, f20 + FMSUB f11, f24, f11, f21 + FMADD f14, f24, f14, f22 + FMSUB f15, f24, f15, f23 + + FMSUB f0, f27, f3, f0 + FNMADD f1, f27, f2, f1 + FMSUB f4, f27, f7, f4 + FNMADD f5, f27, f6, f5 + + FMSUB f8, f27, f11, f8 + FNMADD f9, f27, f10, f9 + FMSUB f12, f27, f15, f12 + FNMADD f13, f27, f14, f13 + + FNMADD f0, f26, f2, f0 + FNMADD f1, f26, f3, f1 + FNMADD f4, f26, f6, f4 + FNMADD f5, f26, f7, f5 + + FNMADD f8, f26, f10, f8 + FNMADD f9, f26, f11, f9 + FNMADD f12, f26, f14, f12 + FNMADD f13, f26, f15, f13 + + FMUL f16, f29, f1 + FMUL f17, f29, f0 + FMUL f18, f29, f5 + FMUL f19, f29, f4 + + FMUL f20, f29, f9 + FMUL f21, f29, f8 + FMUL f22, f29, f13 + FMUL f23, f29, f12 + + FMADD f0, f28, f0, f16 + FMSUB f1, f28, f1, f17 + FMADD f4, f28, f4, f18 + FMSUB f5, f28, f5, f19 + + FMADD f8, f28, f8, f20 + FMSUB f9, f28, f9, f21 + FMADD f12, f28, f12, f22 + FMSUB f13, f28, f13, f23 +#endif +#endif + +#ifdef LT + LFD f24, 0 * SIZE(AO) + LFD f25, 1 * SIZE(AO) + LFD f26, 2 * SIZE(AO) + LFD f27, 3 * SIZE(AO) + LFD f28, 6 * SIZE(AO) + LFD f29, 7 * SIZE(AO) + + FMUL f16, f25, f1 + FMUL f17, f25, f0 + FMUL f18, f25, f5 + FMUL f19, f25, f4 + + FMUL f20, f25, f9 + FMUL f21, f25, f8 + FMUL f22, f25, f13 + FMUL f23, f25, f12 + +#ifndef CONJ + FMSUB f0, f24, f0, f16 + FMADD f1, f24, f1, f17 + FMSUB f4, f24, f4, f18 + FMADD f5, f24, f5, f19 + + FMSUB f8, f24, f8, f20 + FMADD f9, f24, f9, f21 + FMSUB f12, f24, f12, f22 + FMADD f13, f24, f13, f23 + + FMADD f2, f27, f1, f2 + FNMSUB f3, f27, f0, f3 + FMADD f6, f27, f5, f6 + FNMSUB f7, f27, f4, f7 + + FMADD f10, f27, f9, f10 + FNMSUB f11, f27, f8, f11 + FMADD f14, f27, f13, f14 + FNMSUB f15, f27, f12, f15 + + FNMSUB f2, f26, f0, f2 + FNMSUB f3, f26, f1, f3 + FNMSUB f6, f26, f4, f6 + FNMSUB f7, f26, f5, f7 + + FNMSUB f10, f26, f8, f10 + FNMSUB f11, f26, f9, f11 + FNMSUB f14, f26, f12, f14 + FNMSUB f15, f26, f13, f15 + + FMUL f16, f29, f3 + FMUL f17, f29, f2 + FMUL f18, f29, f7 + FMUL f19, f29, f6 + + FMUL f20, f29, f11 + FMUL f21, f29, f10 + FMUL f22, f29, f15 + FMUL f23, f29, f14 + + FMSUB f2, f28, f2, f16 + FMADD f3, f28, f3, f17 + FMSUB f6, f28, f6, f18 + FMADD f7, f28, f7, f19 + + FMSUB f10, f28, f10, f20 + FMADD f11, f28, f11, f21 + FMSUB f14, f28, f14, f22 + FMADD f15, f28, f15, f23 +#else + + FMADD f0, f24, f0, f16 + FMSUB f1, f24, f1, f17 + FMADD f4, f24, f4, f18 + FMSUB f5, f24, f5, f19 + + FMADD f8, f24, f8, f20 + FMSUB f9, f24, f9, f21 + FMADD f12, f24, f12, f22 + FMSUB f13, f24, f13, f23 + + FMSUB f2, f27, f1, f2 + FNMADD f3, f27, f0, f3 + FMSUB f6, f27, f5, f6 + 
FNMADD f7, f27, f4, f7 + + FMSUB f10, f27, f9, f10 + FNMADD f11, f27, f8, f11 + FMSUB f14, f27, f13, f14 + FNMADD f15, f27, f12, f15 + + FNMADD f2, f26, f0, f2 + FNMADD f3, f26, f1, f3 + FNMADD f6, f26, f4, f6 + FNMADD f7, f26, f5, f7 + + FNMADD f10, f26, f8, f10 + FNMADD f11, f26, f9, f11 + FNMADD f14, f26, f12, f14 + FNMADD f15, f26, f13, f15 + + FMUL f16, f29, f3 + FMUL f17, f29, f2 + FMUL f18, f29, f7 + FMUL f19, f29, f6 + + FMUL f20, f29, f11 + FMUL f21, f29, f10 + FMUL f22, f29, f15 + FMUL f23, f29, f14 + + FMADD f2, f28, f2, f16 + FMSUB f3, f28, f3, f17 + FMADD f6, f28, f6, f18 + FMSUB f7, f28, f7, f19 + + FMADD f10, f28, f10, f20 + FMSUB f11, f28, f11, f21 + FMADD f14, f28, f14, f22 + FMSUB f15, f28, f15, f23 +#endif +#endif + +#ifdef RN + LFD f24, 0 * SIZE(BO) + LFD f25, 1 * SIZE(BO) + LFD f26, 2 * SIZE(BO) + LFD f27, 3 * SIZE(BO) + LFD f28, 4 * SIZE(BO) + LFD f29, 5 * SIZE(BO) + LFD f30, 6 * SIZE(BO) + LFD f31, 7 * SIZE(BO) + + FMUL f16, f25, f1 + FMUL f17, f25, f0 + FMUL f18, f25, f3 + FMUL f19, f25, f2 + +#ifndef CONJ + + FMSUB f0, f24, f0, f16 + FMADD f1, f24, f1, f17 + FMSUB f2, f24, f2, f18 + FMADD f3, f24, f3, f19 + + FMADD f4, f27, f1, f4 + FNMSUB f5, f27, f0, f5 + FMADD f6, f27, f3, f6 + FNMSUB f7, f27, f2, f7 + + FNMSUB f4, f26, f0, f4 + FNMSUB f5, f26, f1, f5 + FNMSUB f6, f26, f2, f6 + FNMSUB f7, f26, f3, f7 + + FMADD f8, f29, f1, f8 + FNMSUB f9, f29, f0, f9 + FMADD f10, f29, f3, f10 + FNMSUB f11, f29, f2, f11 + + FNMSUB f8, f28, f0, f8 + FNMSUB f9, f28, f1, f9 + FNMSUB f10, f28, f2, f10 + FNMSUB f11, f28, f3, f11 + + FMADD f12, f31, f1, f12 + FNMSUB f13, f31, f0, f13 + FMADD f14, f31, f3, f14 + FNMSUB f15, f31, f2, f15 + + FNMSUB f12, f30, f0, f12 + FNMSUB f13, f30, f1, f13 + FNMSUB f14, f30, f2, f14 + FNMSUB f15, f30, f3, f15 + + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FMUL f16, f27, f5 + FMUL f17, f27, f4 + FMUL f18, f27, f7 + FMUL f19, f27, f6 + + FMSUB f4, f26, f4, f16 + FMADD f5, f26, f5, f17 + FMSUB f6, f26, f6, f18 + FMADD f7, f26, f7, f19 + + FMADD f8, f29, f5, f8 + FNMSUB f9, f29, f4, f9 + FMADD f10, f29, f7, f10 + FNMSUB f11, f29, f6, f11 + + FNMSUB f8, f28, f4, f8 + FNMSUB f9, f28, f5, f9 + FNMSUB f10, f28, f6, f10 + FNMSUB f11, f28, f7, f11 + + FMADD f12, f31, f5, f12 + FNMSUB f13, f31, f4, f13 + FMADD f14, f31, f7, f14 + FNMSUB f15, f31, f6, f15 + + FNMSUB f12, f30, f4, f12 + FNMSUB f13, f30, f5, f13 + FNMSUB f14, f30, f6, f14 + FNMSUB f15, f30, f7, f15 + + LFD f26, 20 * SIZE(BO) + LFD f27, 21 * SIZE(BO) + LFD f28, 22 * SIZE(BO) + LFD f29, 23 * SIZE(BO) + LFD f30, 30 * SIZE(BO) + LFD f31, 31 * SIZE(BO) + + FMUL f16, f27, f9 + FMUL f17, f27, f8 + FMUL f18, f27, f11 + FMUL f19, f27, f10 + + FMSUB f8, f26, f8, f16 + FMADD f9, f26, f9, f17 + FMSUB f10, f26, f10, f18 + FMADD f11, f26, f11, f19 + + FMADD f12, f29, f9, f12 + FNMSUB f13, f29, f8, f13 + FMADD f14, f29, f11, f14 + FNMSUB f15, f29, f10, f15 + + FNMSUB f12, f28, f8, f12 + FNMSUB f13, f28, f9, f13 + FNMSUB f14, f28, f10, f14 + FNMSUB f15, f28, f11, f15 + + FMUL f16, f31, f13 + FMUL f17, f31, f12 + FMUL f18, f31, f15 + FMUL f19, f31, f14 + + FMSUB f12, f30, f12, f16 + FMADD f13, f30, f13, f17 + FMSUB f14, f30, f14, f18 + FMADD f15, f30, f15, f19 + +#else + + FMADD f0, f24, f0, f16 + FMSUB f1, f24, f1, f17 + FMADD f2, f24, f2, f18 + FMSUB f3, f24, f3, f19 + + FMSUB f4, f27, f1, f4 + FNMADD f5, f27, f0, f5 + FMSUB f6, f27, f3, f6 + FNMADD f7, f27, f2, f7 + + FNMADD f4, f26, f0, f4 + FNMADD f5, 
f26, f1, f5 + FNMADD f6, f26, f2, f6 + FNMADD f7, f26, f3, f7 + + FMSUB f8, f29, f1, f8 + FNMADD f9, f29, f0, f9 + FMSUB f10, f29, f3, f10 + FNMADD f11, f29, f2, f11 + + FNMADD f8, f28, f0, f8 + FNMADD f9, f28, f1, f9 + FNMADD f10, f28, f2, f10 + FNMADD f11, f28, f3, f11 + + FMSUB f12, f31, f1, f12 + FNMADD f13, f31, f0, f13 + FMSUB f14, f31, f3, f14 + FNMADD f15, f31, f2, f15 + + FNMADD f12, f30, f0, f12 + FNMADD f13, f30, f1, f13 + FNMADD f14, f30, f2, f14 + FNMADD f15, f30, f3, f15 + + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FMUL f16, f27, f5 + FMUL f17, f27, f4 + FMUL f18, f27, f7 + FMUL f19, f27, f6 + + FMADD f4, f26, f4, f16 + FMSUB f5, f26, f5, f17 + FMADD f6, f26, f6, f18 + FMSUB f7, f26, f7, f19 + + FMSUB f8, f29, f5, f8 + FNMADD f9, f29, f4, f9 + FMSUB f10, f29, f7, f10 + FNMADD f11, f29, f6, f11 + + FNMADD f8, f28, f4, f8 + FNMADD f9, f28, f5, f9 + FNMADD f10, f28, f6, f10 + FNMADD f11, f28, f7, f11 + + FMSUB f12, f31, f5, f12 + FNMADD f13, f31, f4, f13 + FMSUB f14, f31, f7, f14 + FNMADD f15, f31, f6, f15 + + FNMADD f12, f30, f4, f12 + FNMADD f13, f30, f5, f13 + FNMADD f14, f30, f6, f14 + FNMADD f15, f30, f7, f15 + + LFD f26, 20 * SIZE(BO) + LFD f27, 21 * SIZE(BO) + LFD f28, 22 * SIZE(BO) + LFD f29, 23 * SIZE(BO) + LFD f30, 30 * SIZE(BO) + LFD f31, 31 * SIZE(BO) + + FMUL f16, f27, f9 + FMUL f17, f27, f8 + FMUL f18, f27, f11 + FMUL f19, f27, f10 + + FMADD f8, f26, f8, f16 + FMSUB f9, f26, f9, f17 + FMADD f10, f26, f10, f18 + FMSUB f11, f26, f11, f19 + + FMSUB f12, f29, f9, f12 + FNMADD f13, f29, f8, f13 + FMSUB f14, f29, f11, f14 + FNMADD f15, f29, f10, f15 + + FNMADD f12, f28, f8, f12 + FNMADD f13, f28, f9, f13 + FNMADD f14, f28, f10, f14 + FNMADD f15, f28, f11, f15 + + FMUL f16, f31, f13 + FMUL f17, f31, f12 + FMUL f18, f31, f15 + FMUL f19, f31, f14 + + FMADD f12, f30, f12, f16 + FMSUB f13, f30, f13, f17 + FMADD f14, f30, f14, f18 + FMSUB f15, f30, f15, f19 +#endif + +#endif + +#ifdef RT + LFD f24, 30 * SIZE(BO) + LFD f25, 31 * SIZE(BO) + LFD f26, 28 * SIZE(BO) + LFD f27, 29 * SIZE(BO) + LFD f28, 26 * SIZE(BO) + LFD f29, 27 * SIZE(BO) + LFD f30, 24 * SIZE(BO) + LFD f31, 25 * SIZE(BO) + + FMUL f16, f25, f13 + FMUL f17, f25, f12 + FMUL f18, f25, f15 + FMUL f19, f25, f14 + +#ifndef CONJ + + FMSUB f12, f24, f12, f16 + FMADD f13, f24, f13, f17 + FMSUB f14, f24, f14, f18 + FMADD f15, f24, f15, f19 + + FMADD f8, f27, f13, f8 + FNMSUB f9, f27, f12, f9 + FMADD f10, f27, f15, f10 + FNMSUB f11, f27, f14, f11 + + FNMSUB f8, f26, f12, f8 + FNMSUB f9, f26, f13, f9 + FNMSUB f10, f26, f14, f10 + FNMSUB f11, f26, f15, f11 + + FMADD f4, f29, f13, f4 + FNMSUB f5, f29, f12, f5 + FMADD f6, f29, f15, f6 + FNMSUB f7, f29, f14, f7 + + FNMSUB f4, f28, f12, f4 + FNMSUB f5, f28, f13, f5 + FNMSUB f6, f28, f14, f6 + FNMSUB f7, f28, f15, f7 + + FMADD f0, f31, f13, f0 + FNMSUB f1, f31, f12, f1 + FMADD f2, f31, f15, f2 + FNMSUB f3, f31, f14, f3 + + FNMSUB f0, f30, f12, f0 + FNMSUB f1, f30, f13, f1 + FNMSUB f2, f30, f14, f2 + FNMSUB f3, f30, f15, f3 + + LFD f26, 20 * SIZE(BO) + LFD f27, 21 * SIZE(BO) + LFD f28, 18 * SIZE(BO) + LFD f29, 19 * SIZE(BO) + LFD f30, 16 * SIZE(BO) + LFD f31, 17 * SIZE(BO) + + FMUL f16, f27, f9 + FMUL f17, f27, f8 + FMUL f18, f27, f11 + FMUL f19, f27, f10 + + FMSUB f8, f26, f8, f16 + FMADD f9, f26, f9, f17 + FMSUB f10, f26, f10, f18 + FMADD f11, f26, f11, f19 + + FMADD f4, f29, f9, f4 + FNMSUB f5, f29, f8, f5 + FMADD f6, f29, f11, f6 + FNMSUB f7, f29, f10, f7 + + FNMSUB f4, 
f28, f8, f4 + FNMSUB f5, f28, f9, f5 + FNMSUB f6, f28, f10, f6 + FNMSUB f7, f28, f11, f7 + + FMADD f0, f31, f9, f0 + FNMSUB f1, f31, f8, f1 + FMADD f2, f31, f11, f2 + FNMSUB f3, f31, f10, f3 + + FNMSUB f0, f30, f8, f0 + FNMSUB f1, f30, f9, f1 + FNMSUB f2, f30, f10, f2 + FNMSUB f3, f30, f11, f3 + + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + LFD f28, 8 * SIZE(BO) + LFD f29, 9 * SIZE(BO) + LFD f30, 0 * SIZE(BO) + LFD f31, 1 * SIZE(BO) + + FMUL f16, f27, f5 + FMUL f17, f27, f4 + FMUL f18, f27, f7 + FMUL f19, f27, f6 + + FMSUB f4, f26, f4, f16 + FMADD f5, f26, f5, f17 + FMSUB f6, f26, f6, f18 + FMADD f7, f26, f7, f19 + + FMADD f0, f29, f5, f0 + FNMSUB f1, f29, f4, f1 + FMADD f2, f29, f7, f2 + FNMSUB f3, f29, f6, f3 + + FNMSUB f0, f28, f4, f0 + FNMSUB f1, f28, f5, f1 + FNMSUB f2, f28, f6, f2 + FNMSUB f3, f28, f7, f3 + + FMUL f16, f31, f1 + FMUL f17, f31, f0 + FMUL f18, f31, f3 + FMUL f19, f31, f2 + + FMSUB f0, f30, f0, f16 + FMADD f1, f30, f1, f17 + FMSUB f2, f30, f2, f18 + FMADD f3, f30, f3, f19 + +#else + + FMADD f12, f24, f12, f16 + FMSUB f13, f24, f13, f17 + FMADD f14, f24, f14, f18 + FMSUB f15, f24, f15, f19 + + FMSUB f8, f27, f13, f8 + FNMADD f9, f27, f12, f9 + FMSUB f10, f27, f15, f10 + FNMADD f11, f27, f14, f11 + + FNMADD f8, f26, f12, f8 + FNMADD f9, f26, f13, f9 + FNMADD f10, f26, f14, f10 + FNMADD f11, f26, f15, f11 + + FMSUB f4, f29, f13, f4 + FNMADD f5, f29, f12, f5 + FMSUB f6, f29, f15, f6 + FNMADD f7, f29, f14, f7 + + FNMADD f4, f28, f12, f4 + FNMADD f5, f28, f13, f5 + FNMADD f6, f28, f14, f6 + FNMADD f7, f28, f15, f7 + + FMSUB f0, f31, f13, f0 + FNMADD f1, f31, f12, f1 + FMSUB f2, f31, f15, f2 + FNMADD f3, f31, f14, f3 + + FNMADD f0, f30, f12, f0 + FNMADD f1, f30, f13, f1 + FNMADD f2, f30, f14, f2 + FNMADD f3, f30, f15, f3 + + LFD f26, 20 * SIZE(BO) + LFD f27, 21 * SIZE(BO) + LFD f28, 18 * SIZE(BO) + LFD f29, 19 * SIZE(BO) + LFD f30, 16 * SIZE(BO) + LFD f31, 17 * SIZE(BO) + + FMUL f16, f27, f9 + FMUL f17, f27, f8 + FMUL f18, f27, f11 + FMUL f19, f27, f10 + + FMADD f8, f26, f8, f16 + FMSUB f9, f26, f9, f17 + FMADD f10, f26, f10, f18 + FMSUB f11, f26, f11, f19 + + FMSUB f4, f29, f9, f4 + FNMADD f5, f29, f8, f5 + FMSUB f6, f29, f11, f6 + FNMADD f7, f29, f10, f7 + + FNMADD f4, f28, f8, f4 + FNMADD f5, f28, f9, f5 + FNMADD f6, f28, f10, f6 + FNMADD f7, f28, f11, f7 + + FMSUB f0, f31, f9, f0 + FNMADD f1, f31, f8, f1 + FMSUB f2, f31, f11, f2 + FNMADD f3, f31, f10, f3 + + FNMADD f0, f30, f8, f0 + FNMADD f1, f30, f9, f1 + FNMADD f2, f30, f10, f2 + FNMADD f3, f30, f11, f3 + + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + LFD f28, 8 * SIZE(BO) + LFD f29, 9 * SIZE(BO) + LFD f30, 0 * SIZE(BO) + LFD f31, 1 * SIZE(BO) + + FMUL f16, f27, f5 + FMUL f17, f27, f4 + FMUL f18, f27, f7 + FMUL f19, f27, f6 + + FMADD f4, f26, f4, f16 + FMSUB f5, f26, f5, f17 + FMADD f6, f26, f6, f18 + FMSUB f7, f26, f7, f19 + + FMSUB f0, f29, f5, f0 + FNMADD f1, f29, f4, f1 + FMSUB f2, f29, f7, f2 + FNMADD f3, f29, f6, f3 + + FNMADD f0, f28, f4, f0 + FNMADD f1, f28, f5, f1 + FNMADD f2, f28, f6, f2 + FNMADD f3, f28, f7, f3 + + FMUL f16, f31, f1 + FMUL f17, f31, f0 + FMUL f18, f31, f3 + FMUL f19, f31, f2 + + FMADD f0, f30, f0, f16 + FMSUB f1, f30, f1, f17 + FMADD f2, f30, f2, f18 + FMSUB f3, f30, f3, f19 + +#endif +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE + subi CO3, CO3, 4 * SIZE + subi CO4, CO4, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f4, 2 * SIZE(BO) + STFD f5, 3 * SIZE(BO) + STFD f8, 4 * SIZE(BO) + STFD f9, 5 * 
SIZE(BO) + STFD f12, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) + + STFD f2, 8 * SIZE(BO) + STFD f3, 9 * SIZE(BO) + STFD f6, 10 * SIZE(BO) + STFD f7, 11 * SIZE(BO) + STFD f10, 12 * SIZE(BO) + STFD f11, 13 * SIZE(BO) + STFD f14, 14 * SIZE(BO) + STFD f15, 15 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + STFD f4, 4 * SIZE(AO) + STFD f5, 5 * SIZE(AO) + STFD f6, 6 * SIZE(AO) + STFD f7, 7 * SIZE(AO) + + STFD f8, 8 * SIZE(AO) + STFD f9, 9 * SIZE(AO) + STFD f10, 10 * SIZE(AO) + STFD f11, 11 * SIZE(AO) + STFD f12, 12 * SIZE(AO) + STFD f13, 13 * SIZE(AO) + STFD f14, 14 * SIZE(AO) + STFD f15, 15 * SIZE(AO) + +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f10, 2 * SIZE(CO3) + STFD f11, 3 * SIZE(CO3) + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + STFD f14, 2 * SIZE(CO4) + STFD f15, 3 * SIZE(CO4) + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + addi CO3, CO3, 4 * SIZE + addi CO4, CO4, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 2 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + addic. I, I, -1 + bgt LL(11) + .align 4 + +LL(29): +#ifdef LN + slwi r0, K, 2 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 4 +#endif + +#ifdef RT + subi KK, KK, 4 +#endif + + addic. J, J, -1 + bgt LL(10) + .align 4 + +LL(30): + andi. J, N, 2 + ble LL(50) + .align 4 + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO2, LDC +#endif + + andi. I, M, 1 + ble LL(40) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(45) + .align 4 + +LL(42): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f16, f22, f2 + FMADD f3, f16, f23, f3 + + FMADD f4, f17, f20, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f17, f22, f6 + FMADD f7, f17, f23, f7 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f18, f24, f0 + FMADD f1, f18, f25, f1 + FMADD f2, f18, f26, f2 + FMADD f3, f18, f27, f3 + + FMADD f4, f19, f24, f4 + FMADD f5, f19, f25, f5 + FMADD f6, f19, f26, f6 + FMADD f7, f19, f27, f7 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f16, f22, f2 + FMADD f3, f16, f23, f3 + + FMADD f4, f17, f20, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f17, f22, f6 + FMADD f7, f17, f23, f7 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f0, f18, f24, f0 + FMADD f1, f18, f25, f1 + FMADD f2, f18, f26, f2 + FMADD f3, f18, f27, f3 + + FMADD f4, f19, f24, f4 + FMADD f5, f19, f25, f5 + FMADD f6, f19, f26, f6 + FMADD f7, f19, f27, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi BO, BO, 16 * SIZE + addi AO, AO, 8 * SIZE + bdnz LL(42) + .align 4 + +LL(45): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble LL(47) + .align 4 + +LL(46): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f16, f22, f2 + FMADD f3, f16, f23, f3 + + FMADD f4, f17, f20, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f17, f22, f6 + FMADD f7, f17, f23, f7 + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + addi AO, AO, 2 * SIZE + addi BO, BO, 4 * SIZE + bdnz LL(46) + .align 4 + +LL(47): +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 +#else +#if defined(LN) || defined(LT) + FADD f0, f0, f5 + FSUB f1, f1, f4 + FADD f2, f2, f7 + FSUB f3, f3, f6 +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 +#endif +#endif + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + ZBASE_SHIFT + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f20, f2 + FSUB f3, f21, f3 +#endif + +#ifdef LN + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f3 + FMUL f13, f21, f2 + +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f12 + FMADD f3, f20, f3, f13 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f12 + FMSUB f3, f20, f3, f13 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f12, f17, f3 + FMUL f13, f17, 
f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f12 + FMADD f3, f16, f3, f13 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f12 + FMSUB f3, f16, f3, f13 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + LFD f20, 6 * SIZE(BO) + LFD f21, 7 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 +#endif +#endif + +#ifdef RT + LFD f16, 6 * SIZE(BO) + LFD f17, 7 * SIZE(BO) + LFD f18, 4 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f12, f17, f3 + FMUL f13, f17, f2 + +#ifndef CONJ + FMSUB f2, f16, f2, f12 + FMADD f3, f16, f3, f13 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f2, f16, f2, f12 + FMSUB f3, f16, f3, f13 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +LL(40): + srawi. I, M, 1 + ble LL(49) + .align 4 + +LL(31): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + dcbtst CO1, PREC + dcbtst CO2, PREC + + srawi. 
r0, KK, 3 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + dcbtst CO1, PREC + dcbtst CO2, PREC + + srawi. r0, TEMP, 3 + mtspr CTR, r0 +#endif + ble LL(35) + .align 4 + +LL(32): + dcbt AO, PREA + dcbtst BO, PREA + + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f24, 4 * SIZE(AO) + LFD f28, 4 * SIZE(BO) + LFD f25, 5 * SIZE(AO) + LFD f29, 5 * SIZE(BO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + LFD f26, 6 * SIZE(AO) + LFD f30, 6 * SIZE(BO) + LFD f27, 7 * SIZE(AO) + LFD f31, 7 * SIZE(BO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + + LFD f16, 8 * SIZE(AO) + LFD f20, 8 * SIZE(BO) + LFD f17, 9 * SIZE(AO) + LFD f21, 9 * SIZE(BO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + + FMADD f2, f26, f28, f2 + FMADD f6, f26, f29, f6 + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + + LFD f18, 10 * SIZE(AO) + LFD f22, 10 * SIZE(BO) + LFD f19, 11 * SIZE(AO) + LFD f23, 11 * SIZE(BO) + + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f24, 12 * SIZE(AO) + LFD f28, 12 * SIZE(BO) + LFD f25, 13 * SIZE(AO) + LFD f29, 13 * SIZE(BO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + LFD f26, 14 * SIZE(AO) + LFD f30, 14 * SIZE(BO) + LFD f27, 15 * SIZE(AO) + LFD f31, 15 * SIZE(BO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + + LFD f16, 16 * SIZE(AO) + LFD f20, 16 * SIZE(BO) + LFD f17, 17 * SIZE(AO) + LFD f21, 17 * SIZE(BO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + + FMADD f2, f26, f28, f2 + FMADD f6, f26, f29, f6 + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + + LFD f18, 18 * SIZE(AO) + LFD f22, 18 * SIZE(BO) + LFD f19, 19 * SIZE(AO) + LFD f23, 19 * SIZE(BO) + + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f24, 20 * SIZE(AO) + LFD f28, 20 * SIZE(BO) + LFD f25, 21 * SIZE(AO) + LFD f29, 21 * SIZE(BO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, 
f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + LFD f26, 22 * SIZE(AO) + LFD f30, 22 * SIZE(BO) + LFD f27, 23 * SIZE(AO) + LFD f31, 23 * SIZE(BO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + + LFD f16, 24 * SIZE(AO) + LFD f20, 24 * SIZE(BO) + LFD f17, 25 * SIZE(AO) + LFD f21, 25 * SIZE(BO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + + FMADD f2, f26, f28, f2 + FMADD f6, f26, f29, f6 + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + + LFD f18, 26 * SIZE(AO) + LFD f22, 26 * SIZE(BO) + LFD f19, 27 * SIZE(AO) + LFD f23, 27 * SIZE(BO) + + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f24, 28 * SIZE(AO) + LFD f28, 28 * SIZE(BO) + LFD f25, 29 * SIZE(AO) + LFD f29, 29 * SIZE(BO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + LFD f26, 30 * SIZE(AO) + LFD f30, 30 * SIZE(BO) + LFD f27, 31 * SIZE(AO) + LFD f31, 31 * SIZE(BO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + + LFD f16, 32 * SIZE(AO) + LFD f20, 32 * SIZE(BO) + LFD f17, 33 * SIZE(AO) + LFD f21, 33 * SIZE(BO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + + FMADD f2, f26, f28, f2 + FMADD f6, f26, f29, f6 + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + + LFD f18, 34 * SIZE(AO) + LFD f22, 34 * SIZE(BO) + LFD f19, 35 * SIZE(AO) + LFD f23, 35 * SIZE(BO) + + addi AO, AO, 32 * SIZE + addi BO, BO, 32 * SIZE + + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + + bdnz LL(32) + .align 4 + +LL(35): +#if defined(LT) || defined(RN) + andi. r0, KK, 7 +#else + andi. 
r0, TEMP, 7 +#endif + mtspr CTR, r0 + ble LL(38) + .align 4 + +LL(36): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 4 * SIZE + + bdnz LL(36) + .align 4 + +LL(38): +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 + + FSUB f8, f8, f13 + FADD f9, f9, f12 + FSUB f10, f10, f15 + FADD f11, f11, f14 + +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 + + FADD f8, f8, f13 + FSUB f9, f12, f9 + FADD f10, f10, f15 + FSUB f11, f14, f11 +#endif + +#if defined(LN) || defined(RT) + subi r0, KK, 2 + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f8, f18, f8 + FSUB f9, f19, f9 + + FSUB f2, f20, f2 + FSUB f3, f21, f3 + FSUB f10, f22, f10 + FSUB f11, f23, f11 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f8, f20, f8 + FSUB f9, f21, f9 + FSUB f10, f22, f10 + FSUB f11, f23, f11 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 + FSUB f2, f18, f2 + FADD f3, f19, f3 + + FSUB f8, f20, f8 + FADD f9, f21, f9 + FSUB f10, f22, f10 + FADD f11, f23, f11 +#endif +#endif + +#ifdef LN + LFD f16, 6 * SIZE(AO) + LFD f17, 7 * SIZE(AO) + LFD f18, 4 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f6, f17, f3 + FMUL f7, f17, f2 + FMUL f14, f17, f11 + FMUL f15, f17, f10 + +#ifndef CONJ + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + FMSUB f10, f16, f10, f14 + FMADD f11, f16, f11, f15 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + FMADD f8, f19, f11, f8 + FNMSUB f9, f19, f10, f9 + + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + FNMSUB f8, f18, f10, f8 + FNMSUB f9, f18, f11, f9 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f9 + FMUL f13, f21, f8 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f8, f20, f8, f12 + FMADD f9, f20, f9, f13 + +#else + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + FMADD f10, f16, f10, f14 + FMSUB f11, f16, f11, f15 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + FMSUB f8, f19, f11, f8 + FNMADD f9, f19, f10, f9 + + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + FNMADD f8, f18, f10, f8 + FNMADD f9, f18, f11, f9 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f9 + FMUL f13, f21, f8 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f8, f20, f8, f12 + FMSUB f9, f20, f9, f13 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + LFD 
f20, 6 * SIZE(AO) + LFD f21, 7 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f12, f17, f9 + FMUL f13, f17, f8 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f8, f16, f8, f12 + FMADD f9, f16, f9, f13 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + FMADD f10, f19, f9, f10 + FNMSUB f11, f19, f8, f11 + + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + FNMSUB f10, f18, f8, f10 + FNMSUB f11, f18, f9, f11 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMUL f12, f21, f11 + FMUL f13, f21, f10 + + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 + FMSUB f10, f20, f10, f12 + FMADD f11, f20, f11, f13 + +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f8, f16, f8, f12 + FMSUB f9, f16, f9, f13 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + FMSUB f10, f19, f9, f10 + FNMADD f11, f19, f8, f11 + + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + FNMADD f10, f18, f8, f10 + FNMADD f11, f18, f9, f11 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMUL f12, f21, f11 + FMUL f13, f21, f10 + + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 + FMADD f10, f20, f10, f12 + FMSUB f11, f20, f11, f13 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + LFD f20, 6 * SIZE(BO) + LFD f21, 7 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + + FMADD f8, f19, f1, f8 + FNMSUB f9, f19, f0, f9 + FMADD f10, f19, f3, f10 + FNMSUB f11, f19, f2, f11 + + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f10, f18, f2, f10 + FNMSUB f11, f18, f3, f11 + + FMUL f4, f21, f9 + FMUL f5, f21, f8 + FMUL f6, f21, f11 + FMUL f7, f21, f10 + + FMSUB f8, f20, f8, f4 + FMADD f9, f20, f9, f5 + FMSUB f10, f20, f10, f6 + FMADD f11, f20, f11, f7 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + + FMSUB f8, f19, f1, f8 + FNMADD f9, f19, f0, f9 + FMSUB f10, f19, f3, f10 + FNMADD f11, f19, f2, f11 + + FNMADD f8, f18, f0, f8 + FNMADD f9, f18, f1, f9 + FNMADD f10, f18, f2, f10 + FNMADD f11, f18, f3, f11 + + FMUL f4, f21, f9 + FMUL f5, f21, f8 + FMUL f6, f21, f11 + FMUL f7, f21, f10 + + FMADD f8, f20, f8, f4 + FMSUB f9, f20, f9, f5 + FMADD f10, f20, f10, f6 + FMSUB f11, f20, f11, f7 +#endif +#endif + +#ifdef RT + LFD f16, 6 * SIZE(BO) + LFD f17, 7 * SIZE(BO) + LFD f18, 4 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f12, f17, f9 + FMUL f13, f17, f8 + FMUL f14, f17, f11 + FMUL f15, f17, f10 + +#ifndef CONJ + FMSUB f8, f16, f8, f12 + FMADD f9, f16, f9, f13 + FMSUB f10, f16, f10, f14 + FMADD f11, f16, f11, f15 + + FMADD f0, f19, f9, f0 + FNMSUB f1, f19, f8, f1 + FMADD f2, f19, f11, f2 + FNMSUB f3, f19, f10, f3 + + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + FNMSUB f2, f18, f10, f2 + FNMSUB f3, f18, f11, f3 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f6 + FMADD f3, f20, f3, f7 + +#else + FMADD f8, f16, f8, f12 + FMSUB f9, f16, f9, f13 + FMADD f10, f16, f10, f14 + FMSUB f11, f16, f11, f15 + + FMSUB f0, f19, f9, f0 + FNMADD f1, f19, f8, f1 + FMSUB f2, f19, f11, f2 + FNMADD f3, f19, f10, f3 + + FNMADD f0, f18, f8, f0 + FNMADD f1, f18, f9, f1 + FNMADD f2, f18, f10, f2 + FNMADD f3, f18, f11, f3 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + + FMADD f0, 
f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f6 + FMSUB f3, f20, f3, f7 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f9, 3 * SIZE(BO) + + STFD f2, 4 * SIZE(BO) + STFD f3, 5 * SIZE(BO) + STFD f10, 6 * SIZE(BO) + STFD f11, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f8, 4 * SIZE(AO) + STFD f9, 5 * SIZE(AO) + STFD f10, 6 * SIZE(AO) + STFD f11, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f8, 0 * SIZE(CO2) + STFD f9, 1 * SIZE(CO2) + STFD f10, 2 * SIZE(CO2) + STFD f11, 3 * SIZE(CO2) + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + addic. I, I, -1 + bgt LL(31) + .align 4 + +LL(49): +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + .align 4 + +LL(50): + andi. J, N, 1 + ble LL(999) + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, C, LDC +#endif + + andi. I, M, 1 + ble LL(60) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, r0 + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(65) + .align 4 + +LL(62): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f17, f20, f2 + FMADD f3, f16, f21, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + + FMADD f4, f18, f22, f4 + FMADD f5, f19, f23, f5 + FMADD f6, f19, f22, f6 + FMADD f7, f18, f23, f7 + + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f17, f20, f2 + FMADD f3, f16, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + + FMADD f4, f18, f22, f4 + FMADD f5, f19, f23, f5 + FMADD f6, f19, f22, f6 + FMADD f7, f18, f23, f7 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(62) + .align 4 + +LL(65): + fadd f0, f0, f4 + fadd f1, f1, f5 + fadd f2, f2, f6 + fadd f3, f3, f7 + +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR,r0 + ble LL(67) + .align 4 + +LL(66): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f17, f20, f2 + FMADD f3, f16, f21, f3 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 2 * SIZE + + bdnz LL(66) + .align 4 + +LL(67): +#ifndef CONJ + FSUB f0, f0, f1 + FADD f1, f2, f3 +#else + FADD f0, f0, f1 + FSUB f1, f3, f2 +#endif + +#if defined(LN) || defined(RT) + subi r0, KK, 1 + slwi r0, r0, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 +#endif +#endif + +#ifdef LN + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 +#endif +#endif + +#ifdef RT + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +LL(60): + srawi. 
I, M, 1 + ble LL(69) + .align 4 + +LL(51): +#if defined(LT) || defined(RN) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(B) + LFD f17, 1 * SIZE(B) + LFD f18, 2 * SIZE(B) + LFD f19, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + dcbt CO1, PREC + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(55) + .align 4 + +LL(52): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f16, f22, f2 + FMADD f3, f16, f23, f3 + + FMADD f4, f17, f20, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f17, f22, f6 + FMADD f7, f17, f23, f7 + + LFD f20, 8 * SIZE(AO) + LFD f21, 9 * SIZE(AO) + LFD f22, 10 * SIZE(AO) + LFD f23, 11 * SIZE(AO) + + FMADD f0, f18, f24, f0 + FMADD f1, f18, f25, f1 + FMADD f2, f18, f26, f2 + FMADD f3, f18, f27, f3 + + FMADD f4, f19, f24, f4 + FMADD f5, f19, f25, f5 + FMADD f6, f19, f26, f6 + FMADD f7, f19, f27, f7 + + LFD f24, 12 * SIZE(AO) + LFD f25, 13 * SIZE(AO) + LFD f26, 14 * SIZE(AO) + LFD f27, 15 * SIZE(AO) + + LFD f16, 4 * SIZE(BO) + LFD f17, 5 * SIZE(BO) + LFD f18, 6 * SIZE(BO) + LFD f19, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f16, f22, f2 + FMADD f3, f16, f23, f3 + + FMADD f4, f17, f20, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f17, f22, f6 + FMADD f7, f17, f23, f7 + + LFD f20, 16 * SIZE(AO) + LFD f21, 17 * SIZE(AO) + LFD f22, 18 * SIZE(AO) + LFD f23, 19 * SIZE(AO) + + FMADD f0, f18, f24, f0 + FMADD f1, f18, f25, f1 + FMADD f2, f18, f26, f2 + FMADD f3, f18, f27, f3 + + FMADD f4, f19, f24, f4 + FMADD f5, f19, f25, f5 + FMADD f6, f19, f26, f6 + FMADD f7, f19, f27, f7 + + LFD f24, 20 * SIZE(AO) + LFD f25, 21 * SIZE(AO) + LFD f26, 22 * SIZE(AO) + LFD f27, 23 * SIZE(AO) + + LFD f16, 8 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 10 * SIZE(BO) + LFD f19, 11 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 8 * SIZE + dcbt PREA, AO + dcbt PREA, BO + bdnz LL(52) + .align 4 + +LL(55): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble LL(57) + .align 4 + +LL(56): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f16, f22, f2 + FMADD f3, f16, f23, f3 + + FMADD f4, f17, f20, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f17, f22, f6 + FMADD f7, f17, f23, f7 + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + LFD f16, 2 * SIZE(BO) + LFD f17, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(56) + .align 4 + +LL(57): +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 +#endif + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + ZBASE_SHIFT + slwi r0, r0, 0 + ZBASE_SHIFT + + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 + FSUB f2, f18, f2 + FADD f3, f19, f3 +#endif +#endif + +#ifdef LN + LFD f16, 6 * SIZE(AO) + LFD f17, 7 * SIZE(AO) + LFD f18, 4 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + LFD f20, 6 * SIZE(AO) + LFD f21, 7 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 +#endif +#endif + +#ifdef RT + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f6 + FMADD f3, f20, f3, f7 + +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f6 + FMSUB f3, f20, f3, f7 +#endif +#endif + 
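For orientation while reading the imported kernel: the FMUL/FMSUB/FMADD groups in the LN/LT/RN/RT blocks above apply one complex diagonal entry of the triangular factor to the partially solved right-hand sides, and the CONJ branch flips the sign of the cross terms, i.e. it multiplies by the conjugated diagonal. The diagonal appears to be used directly as a multiplier, which is consistent with GotoBLAS2's TRSM packing pre-inverting the diagonal, but that is an assumption about the surrounding packing code, not something visible in this hunk. A minimal C sketch of the same per-entry update, under that assumption (the function and variable names are illustrative only and are not part of the patch):

#include <complex.h>

/* One back-substitution step of the complex TRSM micro-kernel:
   multiply the partially solved entry x by the (assumed pre-inverted)
   diagonal element d, conjugating d when CONJ-style handling applies.
   This mirrors the FMUL/FMSUB/FMADD groups in the assembly above. */
double _Complex solve_diag(double _Complex x, double _Complex d, int conj_d)
{
    double xr = creal(x), xi = cimag(x);
    double dr = creal(d);
    double di = conj_d ? -cimag(d) : cimag(d);

    double yr = dr * xr - di * xi;   /* real part of x * op(d) */
    double yi = dr * xi + di * xr;   /* imaginary part of x * op(d) */
    return yr + yi * I;
}

With conj_d == 0 this expands to the FMSUB/FMADD pair used in the #ifndef CONJ branches; with conj_d == 1 the folded signs match the FMADD/FMSUB pair in the #else branches.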
+#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + addic. I, I, -1 + bgt LL(51) + .align 4 + +LL(69): +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 1 +#endif + +#ifdef RT + subi KK, KK, 1 +#endif + .align 4 + +LL(999): + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/ztrsm_kernel_power6_LT.S b/kernel/power/ztrsm_kernel_power6_LT.S new file mode 100644 index 0000000000..b7c34419b9 --- /dev/null +++ b/kernel/power/ztrsm_kernel_power6_LT.S @@ -0,0 +1,4697 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define FZERO 312(SP) +#else +#define STACKSIZE 256 +#define FZERO 240(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#define OFFSET r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#define AORIG r19 +#define TEMP r20 +#define KK r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO1 r26 +#define CO2 r27 +#define CO3 r28 +#define CO4 r29 + +#define PREA r30 +#define PREC r31 + +#ifndef CONJ +#define FMA1 FMADD +#define FMA2 FMADD +#define FMA3 FNMSUB +#define FMA4 FMADD +#elif defined(LN) || defined(LT) +#define FMA1 FMADD +#define FMA2 FMADD +#define FMA3 FMADD +#define FMA4 FNMSUB +#else +#define FMA1 FMADD +#define FMA2 FNMSUB +#define FMA3 FMADD +#define FMA4 FMADD +#endif + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) +#endif + + stw r0, FZERO + +#ifdef linux +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, 56 + STACKSIZE(SP) + lwz C, 60 + STACKSIZE(SP) + lwz LDC, 64 + STACKSIZE(SP) +#else + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif +#endif + 
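For readers skimming the new ztrsm_kernel_power6_LT.S file: the FMA1..FMA4 macros defined above select the sign pattern of the four scalar fused multiply-adds that make up one complex multiply-accumulate, with CONJ and the LN/LT versus RN/RT cases deciding which operand is effectively conjugated. A small C sketch of the same update follows; the struct, function, and parameter names are illustrative only and are not part of GotoBLAS2.

/* Illustrative C equivalent of the FMA1..FMA4 selection:
   acc += op(a) * op(b), where op() conjugates a (LN/LT + CONJ)
   or b (RN/RT + CONJ), or neither (no CONJ). */
typedef struct { double re, im; } zdouble;

void cmadd(zdouble *acc, zdouble a, zdouble b, int conj_a, int conj_b)
{
    double ai = conj_a ? -a.im : a.im;
    double bi = conj_b ? -b.im : b.im;

    acc->re += a.re * b.re;   /* FMA1 */
    acc->im += a.re * bi;     /* FMA2 (FNMSUB when b is conjugated) */
    acc->re -= ai * bi;       /* FMA3 (FMADD in either CONJ case)   */
    acc->im += ai * b.re;     /* FMA4 (FNMSUB when a is conjugated) */
}

In the inner loops (LL(12), LL(16), LL(22), ...) each FMA1/FMA2/FMA4/FMA3 quartet on a register pair corresponds to one such complex multiply-accumulate into one accumulator pair; the accumulated products are later subtracted from the packed right-hand sides in the FSUB blocks before the triangular solve.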
+#if defined(linux) && defined(__64BIT__) + ld OFFSET, 120 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 120 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 68 + STACKSIZE(SP) +#else + lwz OFFSET, 60 + STACKSIZE(SP) +#endif +#endif +#endif + + slwi LDC, LDC, ZBASE_SHIFT + +#ifdef LN + mullw r0, M, K + slwi r0, r0, ZBASE_SHIFT + add A, A, r0 + + slwi r0, M, ZBASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, ZBASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + + li PREA, 48 * SIZE + li PREC, 4 * SIZE + + srawi. J, N, 2 + ble LL(30) + .align 4 + +LL(10): +#ifdef RT + slwi r0, K, 2 + ZBASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 2 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + srawi. I, M, 1 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO4, LDC +#endif + ble LL(20) + .align 4 + +LL(11): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f20, 0 * SIZE(B) + LFD f17, 1 * SIZE(AO) + LFD f21, 1 * SIZE(B) + LFD f18, 2 * SIZE(AO) + LFD f22, 2 * SIZE(B) + LFD f19, 3 * SIZE(AO) + LFD f23, 3 * SIZE(B) + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + dcbtst CO1, PREC + dcbtst CO2, PREC + dcbtst CO3, PREC + dcbtst CO4, PREC + + srawi. r0, KK, 3 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + ZBASE_SHIFT + slwi TEMP, KK, 2 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f20, 0 * SIZE(BO) + LFD f17, 1 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + LFD f18, 2 * SIZE(AO) + LFD f22, 2 * SIZE(BO) + LFD f19, 3 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + dcbtst CO1, PREC + dcbtst CO2, PREC + dcbtst CO3, PREC + dcbtst CO4, PREC + + srawi. 
r0, TEMP, 3 + mtspr CTR, r0 +#endif + ble LL(15) + .align 4 + +LL(12): + dcbt AO, PREA + dcbtst BO, PREA + + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + LFD f28, 4 * SIZE(AO) + LFD f29, 5 * SIZE(AO) + LFD f30, 6 * SIZE(AO) + LFD f31, 7 * SIZE(AO) + + FMA1 f4, f16, f22, f4 + FMA1 f6, f18, f22, f6 + FMA2 f5, f16, f23, f5 + FMA2 f7, f18, f23, f7 + + FMA1 f8, f16, f24, f8 + FMA1 f10, f18, f24, f10 + FMA2 f9, f16, f25, f9 + FMA2 f11, f18, f25, f11 + + FMA1 f12, f16, f26, f12 + FMA1 f14, f18, f26, f14 + FMA2 f13, f16, f27, f13 + FMA2 f15, f18, f27, f15 + + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 + + FMA4 f5, f17, f22, f5 + FMA4 f7, f19, f22, f7 + FMA3 f4, f17, f23, f4 + FMA3 f6, f19, f23, f6 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMA4 f9, f17, f24, f9 + FMA4 f11, f19, f24, f11 + FMA3 f8, f17, f25, f8 + FMA3 f10, f19, f25, f10 + + FMA4 f13, f17, f26, f13 + FMA4 f15, f19, f26, f15 + FMA3 f12, f17, f27, f12 + FMA3 f14, f19, f27, f14 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMA1 f0, f28, f20, f0 + FMA1 f2, f30, f20, f2 + FMA2 f1, f28, f21, f1 + FMA2 f3, f30, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMA1 f4, f28, f22, f4 + FMA1 f6, f30, f22, f6 + FMA2 f5, f28, f23, f5 + FMA2 f7, f30, f23, f7 + + FMA1 f8, f28, f24, f8 + FMA1 f10, f30, f24, f10 + FMA2 f9, f28, f25, f9 + FMA2 f11, f30, f25, f11 + + FMA1 f12, f28, f26, f12 + FMA1 f14, f30, f26, f14 + FMA2 f13, f28, f27, f13 + FMA2 f15, f30, f27, f15 + + FMA4 f1, f29, f20, f1 + FMA4 f3, f31, f20, f3 + FMA3 f0, f29, f21, f0 + FMA3 f2, f31, f21, f2 + + FMA4 f5, f29, f22, f5 + FMA4 f7, f31, f22, f7 + FMA3 f4, f29, f23, f4 + FMA3 f6, f31, f23, f6 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMA4 f9, f29, f24, f9 + FMA4 f11, f31, f24, f11 + FMA3 f8, f29, f25, f8 + FMA3 f10, f31, f25, f10 + + FMA4 f13, f29, f26, f13 + FMA4 f15, f31, f26, f15 + FMA3 f12, f29, f27, f12 + FMA3 f14, f31, f27, f14 + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + LFD f28, 12 * SIZE(AO) + LFD f29, 13 * SIZE(AO) + LFD f30, 14 * SIZE(AO) + LFD f31, 15 * SIZE(AO) + + FMA1 f4, f16, f22, f4 + FMA1 f6, f18, f22, f6 + FMA2 f5, f16, f23, f5 + FMA2 f7, f18, f23, f7 + + FMA1 f8, f16, f24, f8 + FMA1 f10, f18, f24, f10 + FMA2 f9, f16, f25, f9 + FMA2 f11, f18, f25, f11 + + FMA1 f12, f16, f26, f12 + FMA1 f14, f18, f26, f14 + FMA2 f13, f16, f27, f13 + FMA2 f15, f18, f27, f15 + + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 + + FMA4 f5, f17, f22, f5 + FMA4 f7, f19, f22, f7 + FMA3 f4, f17, f23, f4 + FMA3 f6, f19, f23, f6 + + LFD f20, 24 * SIZE(BO) + LFD f21, 25 * SIZE(BO) + LFD f22, 26 * SIZE(BO) + LFD f23, 27 * SIZE(BO) + + FMA4 f9, f17, f24, f9 + FMA4 f11, f19, f24, f11 + FMA3 f8, f17, f25, f8 + FMA3 f10, f19, f25, f10 + + FMA4 f13, f17, f26, f13 + FMA4 f15, f19, f26, f15 + FMA3 f12, f17, f27, f12 + FMA3 f14, f19, f27, f14 + + LFD f24, 28 * SIZE(BO) + LFD f25, 29 * SIZE(BO) + LFD f26, 30 * SIZE(BO) + LFD f27, 31 * SIZE(BO) + + FMA1 f0, f28, f20, f0 + FMA1 f2, f30, f20, f2 + FMA2 f1, f28, f21, f1 + FMA2 f3, f30, f21, f3 + + LFD f16, 16 * 
SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + FMA1 f4, f28, f22, f4 + FMA1 f6, f30, f22, f6 + FMA2 f5, f28, f23, f5 + FMA2 f7, f30, f23, f7 + + FMA1 f8, f28, f24, f8 + FMA1 f10, f30, f24, f10 + FMA2 f9, f28, f25, f9 + FMA2 f11, f30, f25, f11 + + FMA1 f12, f28, f26, f12 + FMA1 f14, f30, f26, f14 + FMA2 f13, f28, f27, f13 + FMA2 f15, f30, f27, f15 + + FMA4 f1, f29, f20, f1 + FMA4 f3, f31, f20, f3 + FMA3 f0, f29, f21, f0 + FMA3 f2, f31, f21, f2 + + FMA4 f5, f29, f22, f5 + FMA4 f7, f31, f22, f7 + FMA3 f4, f29, f23, f4 + FMA3 f6, f31, f23, f6 + + LFD f20, 32 * SIZE(BO) + LFD f21, 33 * SIZE(BO) + LFD f22, 34 * SIZE(BO) + LFD f23, 35 * SIZE(BO) + + FMA4 f9, f29, f24, f9 + FMA4 f11, f31, f24, f11 + FMA3 f8, f29, f25, f8 + FMA3 f10, f31, f25, f10 + + FMA4 f13, f29, f26, f13 + FMA4 f15, f31, f26, f15 + FMA3 f12, f29, f27, f12 + FMA3 f14, f31, f27, f14 + + LFD f24, 36 * SIZE(BO) + LFD f25, 37 * SIZE(BO) + LFD f26, 38 * SIZE(BO) + LFD f27, 39 * SIZE(BO) + + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + LFD f28, 20 * SIZE(AO) + LFD f29, 21 * SIZE(AO) + LFD f30, 22 * SIZE(AO) + LFD f31, 23 * SIZE(AO) + + FMA1 f4, f16, f22, f4 + FMA1 f6, f18, f22, f6 + FMA2 f5, f16, f23, f5 + FMA2 f7, f18, f23, f7 + + FMA1 f8, f16, f24, f8 + FMA1 f10, f18, f24, f10 + FMA2 f9, f16, f25, f9 + FMA2 f11, f18, f25, f11 + + FMA1 f12, f16, f26, f12 + FMA1 f14, f18, f26, f14 + FMA2 f13, f16, f27, f13 + FMA2 f15, f18, f27, f15 + + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 + + FMA4 f5, f17, f22, f5 + FMA4 f7, f19, f22, f7 + FMA3 f4, f17, f23, f4 + FMA3 f6, f19, f23, f6 + + LFD f20, 40 * SIZE(BO) + LFD f21, 41 * SIZE(BO) + LFD f22, 42 * SIZE(BO) + LFD f23, 43 * SIZE(BO) + + FMA4 f9, f17, f24, f9 + FMA4 f11, f19, f24, f11 + FMA3 f8, f17, f25, f8 + FMA3 f10, f19, f25, f10 + + FMA4 f13, f17, f26, f13 + FMA4 f15, f19, f26, f15 + FMA3 f12, f17, f27, f12 + FMA3 f14, f19, f27, f14 + + LFD f24, 44 * SIZE(BO) + LFD f25, 45 * SIZE(BO) + LFD f26, 46 * SIZE(BO) + LFD f27, 47 * SIZE(BO) + + FMA1 f0, f28, f20, f0 + FMA1 f2, f30, f20, f2 + FMA2 f1, f28, f21, f1 + FMA2 f3, f30, f21, f3 + + LFD f16, 24 * SIZE(AO) + LFD f17, 25 * SIZE(AO) + LFD f18, 26 * SIZE(AO) + LFD f19, 27 * SIZE(AO) + + FMA1 f4, f28, f22, f4 + FMA1 f6, f30, f22, f6 + FMA2 f5, f28, f23, f5 + FMA2 f7, f30, f23, f7 + + FMA1 f8, f28, f24, f8 + FMA1 f10, f30, f24, f10 + FMA2 f9, f28, f25, f9 + FMA2 f11, f30, f25, f11 + + FMA1 f12, f28, f26, f12 + FMA1 f14, f30, f26, f14 + FMA2 f13, f28, f27, f13 + FMA2 f15, f30, f27, f15 + + FMA4 f1, f29, f20, f1 + FMA4 f3, f31, f20, f3 + FMA3 f0, f29, f21, f0 + FMA3 f2, f31, f21, f2 + + FMA4 f5, f29, f22, f5 + FMA4 f7, f31, f22, f7 + FMA3 f4, f29, f23, f4 + FMA3 f6, f31, f23, f6 + + LFD f20, 48 * SIZE(BO) + LFD f21, 49 * SIZE(BO) + LFD f22, 50 * SIZE(BO) + LFD f23, 51 * SIZE(BO) + + FMA4 f9, f29, f24, f9 + FMA4 f11, f31, f24, f11 + FMA3 f8, f29, f25, f8 + FMA3 f10, f31, f25, f10 + + FMA4 f13, f29, f26, f13 + FMA4 f15, f31, f26, f15 + FMA3 f12, f29, f27, f12 + FMA3 f14, f31, f27, f14 + + LFD f24, 52 * SIZE(BO) + LFD f25, 53 * SIZE(BO) + LFD f26, 54 * SIZE(BO) + LFD f27, 55 * SIZE(BO) + + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + LFD f28, 28 * SIZE(AO) + LFD f29, 29 * SIZE(AO) + LFD f30, 30 * SIZE(AO) + LFD f31, 31 * SIZE(AO) + + FMA1 f4, f16, f22, f4 + FMA1 f6, f18, f22, f6 + FMA2 f5, f16, f23, f5 + FMA2 f7, f18, f23, f7 + + FMA1 f8, f16, f24, f8 + 
FMA1 f10, f18, f24, f10 + FMA2 f9, f16, f25, f9 + FMA2 f11, f18, f25, f11 + + FMA1 f12, f16, f26, f12 + FMA1 f14, f18, f26, f14 + FMA2 f13, f16, f27, f13 + FMA2 f15, f18, f27, f15 + + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 + + FMA4 f5, f17, f22, f5 + FMA4 f7, f19, f22, f7 + FMA3 f4, f17, f23, f4 + FMA3 f6, f19, f23, f6 + + LFD f20, 56 * SIZE(BO) + LFD f21, 57 * SIZE(BO) + LFD f22, 58 * SIZE(BO) + LFD f23, 59 * SIZE(BO) + + FMA4 f9, f17, f24, f9 + FMA4 f11, f19, f24, f11 + FMA3 f8, f17, f25, f8 + FMA3 f10, f19, f25, f10 + + FMA4 f13, f17, f26, f13 + FMA4 f15, f19, f26, f15 + FMA3 f12, f17, f27, f12 + FMA3 f14, f19, f27, f14 + + LFD f24, 60 * SIZE(BO) + LFD f25, 61 * SIZE(BO) + LFD f26, 62 * SIZE(BO) + LFD f27, 63 * SIZE(BO) + + FMA1 f0, f28, f20, f0 + FMA1 f2, f30, f20, f2 + FMA2 f1, f28, f21, f1 + FMA2 f3, f30, f21, f3 + + LFD f16, 32 * SIZE(AO) + LFD f17, 33 * SIZE(AO) + LFD f18, 34 * SIZE(AO) + LFD f19, 35 * SIZE(AO) + + FMA1 f4, f28, f22, f4 + FMA1 f6, f30, f22, f6 + FMA2 f5, f28, f23, f5 + FMA2 f7, f30, f23, f7 + + FMA1 f8, f28, f24, f8 + FMA1 f10, f30, f24, f10 + FMA2 f9, f28, f25, f9 + FMA2 f11, f30, f25, f11 + + FMA1 f12, f28, f26, f12 + FMA1 f14, f30, f26, f14 + FMA2 f13, f28, f27, f13 + FMA2 f15, f30, f27, f15 + + FMA4 f1, f29, f20, f1 + FMA4 f3, f31, f20, f3 + FMA3 f0, f29, f21, f0 + FMA3 f2, f31, f21, f2 + + FMA4 f5, f29, f22, f5 + FMA4 f7, f31, f22, f7 + FMA3 f4, f29, f23, f4 + FMA3 f6, f31, f23, f6 + + LFD f20, 64 * SIZE(BO) + LFD f21, 65 * SIZE(BO) + LFD f22, 66 * SIZE(BO) + LFD f23, 67 * SIZE(BO) + + FMA4 f9, f29, f24, f9 + FMA4 f11, f31, f24, f11 + FMA3 f8, f29, f25, f8 + FMA3 f10, f31, f25, f10 + + FMA4 f13, f29, f26, f13 + FMA4 f15, f31, f26, f15 + FMA3 f12, f29, f27, f12 + FMA3 f14, f31, f27, f14 + + LFD f24, 68 * SIZE(BO) + LFD f25, 69 * SIZE(BO) + LFD f26, 70 * SIZE(BO) + LFD f27, 71 * SIZE(BO) + + addi AO, AO, 32 * SIZE + addi BO, BO, 64 * SIZE + + bdnz LL(12) + .align 4 + +LL(15): +#if defined(LT) || defined(RN) + andi. r0, KK, 7 +#else + andi. 
r0, TEMP, 7 +#endif + mtspr CTR, r0 + ble LL(18) + .align 4 + +LL(16): + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + FMA1 f4, f16, f22, f4 + FMA1 f6, f18, f22, f6 + FMA2 f5, f16, f23, f5 + FMA2 f7, f18, f23, f7 + + FMA1 f8, f16, f24, f8 + FMA1 f10, f18, f24, f10 + FMA2 f9, f16, f25, f9 + FMA2 f11, f18, f25, f11 + + FMA1 f12, f16, f26, f12 + FMA1 f14, f18, f26, f14 + FMA2 f13, f16, f27, f13 + FMA2 f15, f18, f27, f15 + + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 + + FMA4 f5, f17, f22, f5 + FMA4 f7, f19, f22, f7 + FMA3 f4, f17, f23, f4 + FMA3 f6, f19, f23, f6 + + FMA4 f9, f17, f24, f9 + FMA4 f11, f19, f24, f11 + FMA3 f8, f17, f25, f8 + FMA3 f10, f19, f25, f10 + + FMA4 f13, f17, f26, f13 + FMA4 f15, f19, f26, f15 + FMA3 f12, f17, f27, f12 + FMA3 f14, f19, f27, f14 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 8 * SIZE + + bdnz LL(16) + .align 4 + +LL(18): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 1 + ZBASE_SHIFT + slwi r0, r0, 2 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f4, f18, f4 + FSUB f5, f19, f5 + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f8, f20, f8 + FSUB f9, f21, f9 + FSUB f12, f22, f12 + FSUB f13, f23, f13 + + LFD f24, 8 * SIZE(BO) + LFD f25, 9 * SIZE(BO) + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + + FSUB f2, f24, f2 + FSUB f3, f25, f3 + FSUB f6, f26, f6 + FSUB f7, f27, f7 + + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FSUB f10, f28, f10 + FSUB f11, f29, f11 + FSUB f14, f30, f14 + FSUB f15, f31, f15 + +#else + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + FSUB f4, f20, f4 + FSUB f5, f21, f5 + FSUB f6, f22, f6 + FSUB f7, f23, f7 + + LFD f24, 8 * SIZE(AO) + LFD f25, 9 * SIZE(AO) + LFD f26, 10 * SIZE(AO) + LFD f27, 11 * SIZE(AO) + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f10, f26, f10 + FSUB f11, f27, f11 + + LFD f28, 12 * SIZE(AO) + LFD f29, 13 * SIZE(AO) + LFD f30, 14 * SIZE(AO) + LFD f31, 15 * SIZE(AO) + + FSUB f12, f28, f12 + FSUB f13, f29, f13 + FSUB f14, f30, f14 + FSUB f15, f31, f15 +#endif + +#ifdef LN + LFD f24, 6 * SIZE(AO) + LFD f25, 7 * SIZE(AO) + LFD f26, 4 * SIZE(AO) + LFD f27, 5 * SIZE(AO) + LFD f28, 0 * SIZE(AO) + LFD f29, 1 * SIZE(AO) + + FMUL f16, f25, f3 + FMUL f17, f25, f2 + FMUL f18, f25, f7 + FMUL f19, f25, f6 + + FMUL f20, f25, f11 + FMUL f21, f25, f10 + FMUL f22, f25, f15 + FMUL f23, f25, f14 + +#ifndef CONJ + + FMSUB f2, f24, f2, f16 + FMADD f3, f24, f3, f17 + FMSUB f6, f24, f6, f18 + FMADD f7, f24, f7, f19 + + FMSUB f10, f24, f10, f20 + FMADD f11, f24, f11, f21 + FMSUB f14, f24, f14, f22 + FMADD f15, f24, f15, f23 + + FMADD f0, f27, f3, f0 + FNMSUB f1, f27, 
f2, f1 + FMADD f4, f27, f7, f4 + FNMSUB f5, f27, f6, f5 + + FMADD f8, f27, f11, f8 + FNMSUB f9, f27, f10, f9 + FMADD f12, f27, f15, f12 + FNMSUB f13, f27, f14, f13 + + FNMSUB f0, f26, f2, f0 + FNMSUB f1, f26, f3, f1 + FNMSUB f4, f26, f6, f4 + FNMSUB f5, f26, f7, f5 + + FNMSUB f8, f26, f10, f8 + FNMSUB f9, f26, f11, f9 + FNMSUB f12, f26, f14, f12 + FNMSUB f13, f26, f15, f13 + + FMUL f16, f29, f1 + FMUL f17, f29, f0 + FMUL f18, f29, f5 + FMUL f19, f29, f4 + + FMUL f20, f29, f9 + FMUL f21, f29, f8 + FMUL f22, f29, f13 + FMUL f23, f29, f12 + + FMSUB f0, f28, f0, f16 + FMADD f1, f28, f1, f17 + FMSUB f4, f28, f4, f18 + FMADD f5, f28, f5, f19 + + FMSUB f8, f28, f8, f20 + FMADD f9, f28, f9, f21 + FMSUB f12, f28, f12, f22 + FMADD f13, f28, f13, f23 +#else + + FMADD f2, f24, f2, f16 + FMSUB f3, f24, f3, f17 + FMADD f6, f24, f6, f18 + FMSUB f7, f24, f7, f19 + + FMADD f10, f24, f10, f20 + FMSUB f11, f24, f11, f21 + FMADD f14, f24, f14, f22 + FMSUB f15, f24, f15, f23 + + FMSUB f0, f27, f3, f0 + FNMADD f1, f27, f2, f1 + FMSUB f4, f27, f7, f4 + FNMADD f5, f27, f6, f5 + + FMSUB f8, f27, f11, f8 + FNMADD f9, f27, f10, f9 + FMSUB f12, f27, f15, f12 + FNMADD f13, f27, f14, f13 + + FNMADD f0, f26, f2, f0 + FNMADD f1, f26, f3, f1 + FNMADD f4, f26, f6, f4 + FNMADD f5, f26, f7, f5 + + FNMADD f8, f26, f10, f8 + FNMADD f9, f26, f11, f9 + FNMADD f12, f26, f14, f12 + FNMADD f13, f26, f15, f13 + + FMUL f16, f29, f1 + FMUL f17, f29, f0 + FMUL f18, f29, f5 + FMUL f19, f29, f4 + + FMUL f20, f29, f9 + FMUL f21, f29, f8 + FMUL f22, f29, f13 + FMUL f23, f29, f12 + + FMADD f0, f28, f0, f16 + FMSUB f1, f28, f1, f17 + FMADD f4, f28, f4, f18 + FMSUB f5, f28, f5, f19 + + FMADD f8, f28, f8, f20 + FMSUB f9, f28, f9, f21 + FMADD f12, f28, f12, f22 + FMSUB f13, f28, f13, f23 +#endif +#endif + +#ifdef LT + LFD f24, 0 * SIZE(AO) + LFD f25, 1 * SIZE(AO) + LFD f26, 2 * SIZE(AO) + LFD f27, 3 * SIZE(AO) + LFD f28, 6 * SIZE(AO) + LFD f29, 7 * SIZE(AO) + + FMUL f16, f25, f1 + FMUL f17, f25, f0 + FMUL f18, f25, f5 + FMUL f19, f25, f4 + + FMUL f20, f25, f9 + FMUL f21, f25, f8 + FMUL f22, f25, f13 + FMUL f23, f25, f12 + +#ifndef CONJ + FMSUB f0, f24, f0, f16 + FMADD f1, f24, f1, f17 + FMSUB f4, f24, f4, f18 + FMADD f5, f24, f5, f19 + + FMSUB f8, f24, f8, f20 + FMADD f9, f24, f9, f21 + FMSUB f12, f24, f12, f22 + FMADD f13, f24, f13, f23 + + FMADD f2, f27, f1, f2 + FNMSUB f3, f27, f0, f3 + FMADD f6, f27, f5, f6 + FNMSUB f7, f27, f4, f7 + + FMADD f10, f27, f9, f10 + FNMSUB f11, f27, f8, f11 + FMADD f14, f27, f13, f14 + FNMSUB f15, f27, f12, f15 + + FNMSUB f2, f26, f0, f2 + FNMSUB f3, f26, f1, f3 + FNMSUB f6, f26, f4, f6 + FNMSUB f7, f26, f5, f7 + + FNMSUB f10, f26, f8, f10 + FNMSUB f11, f26, f9, f11 + FNMSUB f14, f26, f12, f14 + FNMSUB f15, f26, f13, f15 + + FMUL f16, f29, f3 + FMUL f17, f29, f2 + FMUL f18, f29, f7 + FMUL f19, f29, f6 + + FMUL f20, f29, f11 + FMUL f21, f29, f10 + FMUL f22, f29, f15 + FMUL f23, f29, f14 + + FMSUB f2, f28, f2, f16 + FMADD f3, f28, f3, f17 + FMSUB f6, f28, f6, f18 + FMADD f7, f28, f7, f19 + + FMSUB f10, f28, f10, f20 + FMADD f11, f28, f11, f21 + FMSUB f14, f28, f14, f22 + FMADD f15, f28, f15, f23 +#else + + FMADD f0, f24, f0, f16 + FMSUB f1, f24, f1, f17 + FMADD f4, f24, f4, f18 + FMSUB f5, f24, f5, f19 + + FMADD f8, f24, f8, f20 + FMSUB f9, f24, f9, f21 + FMADD f12, f24, f12, f22 + FMSUB f13, f24, f13, f23 + + FMSUB f2, f27, f1, f2 + FNMADD f3, f27, f0, f3 + FMSUB f6, f27, f5, f6 + FNMADD f7, f27, f4, f7 + + FMSUB f10, f27, f9, f10 + FNMADD f11, f27, f8, f11 + FMSUB f14, f27, f13, f14 + FNMADD f15, f27, f12, f15 + 
+ FNMADD f2, f26, f0, f2 + FNMADD f3, f26, f1, f3 + FNMADD f6, f26, f4, f6 + FNMADD f7, f26, f5, f7 + + FNMADD f10, f26, f8, f10 + FNMADD f11, f26, f9, f11 + FNMADD f14, f26, f12, f14 + FNMADD f15, f26, f13, f15 + + FMUL f16, f29, f3 + FMUL f17, f29, f2 + FMUL f18, f29, f7 + FMUL f19, f29, f6 + + FMUL f20, f29, f11 + FMUL f21, f29, f10 + FMUL f22, f29, f15 + FMUL f23, f29, f14 + + FMADD f2, f28, f2, f16 + FMSUB f3, f28, f3, f17 + FMADD f6, f28, f6, f18 + FMSUB f7, f28, f7, f19 + + FMADD f10, f28, f10, f20 + FMSUB f11, f28, f11, f21 + FMADD f14, f28, f14, f22 + FMSUB f15, f28, f15, f23 +#endif +#endif + +#ifdef RN + LFD f24, 0 * SIZE(BO) + LFD f25, 1 * SIZE(BO) + LFD f26, 2 * SIZE(BO) + LFD f27, 3 * SIZE(BO) + LFD f28, 4 * SIZE(BO) + LFD f29, 5 * SIZE(BO) + LFD f30, 6 * SIZE(BO) + LFD f31, 7 * SIZE(BO) + + FMUL f16, f25, f1 + FMUL f17, f25, f0 + FMUL f18, f25, f3 + FMUL f19, f25, f2 + +#ifndef CONJ + + FMSUB f0, f24, f0, f16 + FMADD f1, f24, f1, f17 + FMSUB f2, f24, f2, f18 + FMADD f3, f24, f3, f19 + + FMADD f4, f27, f1, f4 + FNMSUB f5, f27, f0, f5 + FMADD f6, f27, f3, f6 + FNMSUB f7, f27, f2, f7 + + FNMSUB f4, f26, f0, f4 + FNMSUB f5, f26, f1, f5 + FNMSUB f6, f26, f2, f6 + FNMSUB f7, f26, f3, f7 + + FMADD f8, f29, f1, f8 + FNMSUB f9, f29, f0, f9 + FMADD f10, f29, f3, f10 + FNMSUB f11, f29, f2, f11 + + FNMSUB f8, f28, f0, f8 + FNMSUB f9, f28, f1, f9 + FNMSUB f10, f28, f2, f10 + FNMSUB f11, f28, f3, f11 + + FMADD f12, f31, f1, f12 + FNMSUB f13, f31, f0, f13 + FMADD f14, f31, f3, f14 + FNMSUB f15, f31, f2, f15 + + FNMSUB f12, f30, f0, f12 + FNMSUB f13, f30, f1, f13 + FNMSUB f14, f30, f2, f14 + FNMSUB f15, f30, f3, f15 + + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FMUL f16, f27, f5 + FMUL f17, f27, f4 + FMUL f18, f27, f7 + FMUL f19, f27, f6 + + FMSUB f4, f26, f4, f16 + FMADD f5, f26, f5, f17 + FMSUB f6, f26, f6, f18 + FMADD f7, f26, f7, f19 + + FMADD f8, f29, f5, f8 + FNMSUB f9, f29, f4, f9 + FMADD f10, f29, f7, f10 + FNMSUB f11, f29, f6, f11 + + FNMSUB f8, f28, f4, f8 + FNMSUB f9, f28, f5, f9 + FNMSUB f10, f28, f6, f10 + FNMSUB f11, f28, f7, f11 + + FMADD f12, f31, f5, f12 + FNMSUB f13, f31, f4, f13 + FMADD f14, f31, f7, f14 + FNMSUB f15, f31, f6, f15 + + FNMSUB f12, f30, f4, f12 + FNMSUB f13, f30, f5, f13 + FNMSUB f14, f30, f6, f14 + FNMSUB f15, f30, f7, f15 + + LFD f26, 20 * SIZE(BO) + LFD f27, 21 * SIZE(BO) + LFD f28, 22 * SIZE(BO) + LFD f29, 23 * SIZE(BO) + LFD f30, 30 * SIZE(BO) + LFD f31, 31 * SIZE(BO) + + FMUL f16, f27, f9 + FMUL f17, f27, f8 + FMUL f18, f27, f11 + FMUL f19, f27, f10 + + FMSUB f8, f26, f8, f16 + FMADD f9, f26, f9, f17 + FMSUB f10, f26, f10, f18 + FMADD f11, f26, f11, f19 + + FMADD f12, f29, f9, f12 + FNMSUB f13, f29, f8, f13 + FMADD f14, f29, f11, f14 + FNMSUB f15, f29, f10, f15 + + FNMSUB f12, f28, f8, f12 + FNMSUB f13, f28, f9, f13 + FNMSUB f14, f28, f10, f14 + FNMSUB f15, f28, f11, f15 + + FMUL f16, f31, f13 + FMUL f17, f31, f12 + FMUL f18, f31, f15 + FMUL f19, f31, f14 + + FMSUB f12, f30, f12, f16 + FMADD f13, f30, f13, f17 + FMSUB f14, f30, f14, f18 + FMADD f15, f30, f15, f19 + +#else + + FMADD f0, f24, f0, f16 + FMSUB f1, f24, f1, f17 + FMADD f2, f24, f2, f18 + FMSUB f3, f24, f3, f19 + + FMSUB f4, f27, f1, f4 + FNMADD f5, f27, f0, f5 + FMSUB f6, f27, f3, f6 + FNMADD f7, f27, f2, f7 + + FNMADD f4, f26, f0, f4 + FNMADD f5, f26, f1, f5 + FNMADD f6, f26, f2, f6 + FNMADD f7, f26, f3, f7 + + FMSUB f8, f29, f1, f8 + FNMADD f9, f29, f0, f9 + FMSUB f10, f29, f3, 
f10 + FNMADD f11, f29, f2, f11 + + FNMADD f8, f28, f0, f8 + FNMADD f9, f28, f1, f9 + FNMADD f10, f28, f2, f10 + FNMADD f11, f28, f3, f11 + + FMSUB f12, f31, f1, f12 + FNMADD f13, f31, f0, f13 + FMSUB f14, f31, f3, f14 + FNMADD f15, f31, f2, f15 + + FNMADD f12, f30, f0, f12 + FNMADD f13, f30, f1, f13 + FNMADD f14, f30, f2, f14 + FNMADD f15, f30, f3, f15 + + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FMUL f16, f27, f5 + FMUL f17, f27, f4 + FMUL f18, f27, f7 + FMUL f19, f27, f6 + + FMADD f4, f26, f4, f16 + FMSUB f5, f26, f5, f17 + FMADD f6, f26, f6, f18 + FMSUB f7, f26, f7, f19 + + FMSUB f8, f29, f5, f8 + FNMADD f9, f29, f4, f9 + FMSUB f10, f29, f7, f10 + FNMADD f11, f29, f6, f11 + + FNMADD f8, f28, f4, f8 + FNMADD f9, f28, f5, f9 + FNMADD f10, f28, f6, f10 + FNMADD f11, f28, f7, f11 + + FMSUB f12, f31, f5, f12 + FNMADD f13, f31, f4, f13 + FMSUB f14, f31, f7, f14 + FNMADD f15, f31, f6, f15 + + FNMADD f12, f30, f4, f12 + FNMADD f13, f30, f5, f13 + FNMADD f14, f30, f6, f14 + FNMADD f15, f30, f7, f15 + + LFD f26, 20 * SIZE(BO) + LFD f27, 21 * SIZE(BO) + LFD f28, 22 * SIZE(BO) + LFD f29, 23 * SIZE(BO) + LFD f30, 30 * SIZE(BO) + LFD f31, 31 * SIZE(BO) + + FMUL f16, f27, f9 + FMUL f17, f27, f8 + FMUL f18, f27, f11 + FMUL f19, f27, f10 + + FMADD f8, f26, f8, f16 + FMSUB f9, f26, f9, f17 + FMADD f10, f26, f10, f18 + FMSUB f11, f26, f11, f19 + + FMSUB f12, f29, f9, f12 + FNMADD f13, f29, f8, f13 + FMSUB f14, f29, f11, f14 + FNMADD f15, f29, f10, f15 + + FNMADD f12, f28, f8, f12 + FNMADD f13, f28, f9, f13 + FNMADD f14, f28, f10, f14 + FNMADD f15, f28, f11, f15 + + FMUL f16, f31, f13 + FMUL f17, f31, f12 + FMUL f18, f31, f15 + FMUL f19, f31, f14 + + FMADD f12, f30, f12, f16 + FMSUB f13, f30, f13, f17 + FMADD f14, f30, f14, f18 + FMSUB f15, f30, f15, f19 +#endif + +#endif + +#ifdef RT + LFD f24, 30 * SIZE(BO) + LFD f25, 31 * SIZE(BO) + LFD f26, 28 * SIZE(BO) + LFD f27, 29 * SIZE(BO) + LFD f28, 26 * SIZE(BO) + LFD f29, 27 * SIZE(BO) + LFD f30, 24 * SIZE(BO) + LFD f31, 25 * SIZE(BO) + + FMUL f16, f25, f13 + FMUL f17, f25, f12 + FMUL f18, f25, f15 + FMUL f19, f25, f14 + +#ifndef CONJ + + FMSUB f12, f24, f12, f16 + FMADD f13, f24, f13, f17 + FMSUB f14, f24, f14, f18 + FMADD f15, f24, f15, f19 + + FMADD f8, f27, f13, f8 + FNMSUB f9, f27, f12, f9 + FMADD f10, f27, f15, f10 + FNMSUB f11, f27, f14, f11 + + FNMSUB f8, f26, f12, f8 + FNMSUB f9, f26, f13, f9 + FNMSUB f10, f26, f14, f10 + FNMSUB f11, f26, f15, f11 + + FMADD f4, f29, f13, f4 + FNMSUB f5, f29, f12, f5 + FMADD f6, f29, f15, f6 + FNMSUB f7, f29, f14, f7 + + FNMSUB f4, f28, f12, f4 + FNMSUB f5, f28, f13, f5 + FNMSUB f6, f28, f14, f6 + FNMSUB f7, f28, f15, f7 + + FMADD f0, f31, f13, f0 + FNMSUB f1, f31, f12, f1 + FMADD f2, f31, f15, f2 + FNMSUB f3, f31, f14, f3 + + FNMSUB f0, f30, f12, f0 + FNMSUB f1, f30, f13, f1 + FNMSUB f2, f30, f14, f2 + FNMSUB f3, f30, f15, f3 + + LFD f26, 20 * SIZE(BO) + LFD f27, 21 * SIZE(BO) + LFD f28, 18 * SIZE(BO) + LFD f29, 19 * SIZE(BO) + LFD f30, 16 * SIZE(BO) + LFD f31, 17 * SIZE(BO) + + FMUL f16, f27, f9 + FMUL f17, f27, f8 + FMUL f18, f27, f11 + FMUL f19, f27, f10 + + FMSUB f8, f26, f8, f16 + FMADD f9, f26, f9, f17 + FMSUB f10, f26, f10, f18 + FMADD f11, f26, f11, f19 + + FMADD f4, f29, f9, f4 + FNMSUB f5, f29, f8, f5 + FMADD f6, f29, f11, f6 + FNMSUB f7, f29, f10, f7 + + FNMSUB f4, f28, f8, f4 + FNMSUB f5, f28, f9, f5 + FNMSUB f6, f28, f10, f6 + FNMSUB f7, f28, f11, f7 + + FMADD f0, f31, f9, f0 + FNMSUB f1, f31, 
f8, f1 + FMADD f2, f31, f11, f2 + FNMSUB f3, f31, f10, f3 + + FNMSUB f0, f30, f8, f0 + FNMSUB f1, f30, f9, f1 + FNMSUB f2, f30, f10, f2 + FNMSUB f3, f30, f11, f3 + + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + LFD f28, 8 * SIZE(BO) + LFD f29, 9 * SIZE(BO) + LFD f30, 0 * SIZE(BO) + LFD f31, 1 * SIZE(BO) + + FMUL f16, f27, f5 + FMUL f17, f27, f4 + FMUL f18, f27, f7 + FMUL f19, f27, f6 + + FMSUB f4, f26, f4, f16 + FMADD f5, f26, f5, f17 + FMSUB f6, f26, f6, f18 + FMADD f7, f26, f7, f19 + + FMADD f0, f29, f5, f0 + FNMSUB f1, f29, f4, f1 + FMADD f2, f29, f7, f2 + FNMSUB f3, f29, f6, f3 + + FNMSUB f0, f28, f4, f0 + FNMSUB f1, f28, f5, f1 + FNMSUB f2, f28, f6, f2 + FNMSUB f3, f28, f7, f3 + + FMUL f16, f31, f1 + FMUL f17, f31, f0 + FMUL f18, f31, f3 + FMUL f19, f31, f2 + + FMSUB f0, f30, f0, f16 + FMADD f1, f30, f1, f17 + FMSUB f2, f30, f2, f18 + FMADD f3, f30, f3, f19 + +#else + + FMADD f12, f24, f12, f16 + FMSUB f13, f24, f13, f17 + FMADD f14, f24, f14, f18 + FMSUB f15, f24, f15, f19 + + FMSUB f8, f27, f13, f8 + FNMADD f9, f27, f12, f9 + FMSUB f10, f27, f15, f10 + FNMADD f11, f27, f14, f11 + + FNMADD f8, f26, f12, f8 + FNMADD f9, f26, f13, f9 + FNMADD f10, f26, f14, f10 + FNMADD f11, f26, f15, f11 + + FMSUB f4, f29, f13, f4 + FNMADD f5, f29, f12, f5 + FMSUB f6, f29, f15, f6 + FNMADD f7, f29, f14, f7 + + FNMADD f4, f28, f12, f4 + FNMADD f5, f28, f13, f5 + FNMADD f6, f28, f14, f6 + FNMADD f7, f28, f15, f7 + + FMSUB f0, f31, f13, f0 + FNMADD f1, f31, f12, f1 + FMSUB f2, f31, f15, f2 + FNMADD f3, f31, f14, f3 + + FNMADD f0, f30, f12, f0 + FNMADD f1, f30, f13, f1 + FNMADD f2, f30, f14, f2 + FNMADD f3, f30, f15, f3 + + LFD f26, 20 * SIZE(BO) + LFD f27, 21 * SIZE(BO) + LFD f28, 18 * SIZE(BO) + LFD f29, 19 * SIZE(BO) + LFD f30, 16 * SIZE(BO) + LFD f31, 17 * SIZE(BO) + + FMUL f16, f27, f9 + FMUL f17, f27, f8 + FMUL f18, f27, f11 + FMUL f19, f27, f10 + + FMADD f8, f26, f8, f16 + FMSUB f9, f26, f9, f17 + FMADD f10, f26, f10, f18 + FMSUB f11, f26, f11, f19 + + FMSUB f4, f29, f9, f4 + FNMADD f5, f29, f8, f5 + FMSUB f6, f29, f11, f6 + FNMADD f7, f29, f10, f7 + + FNMADD f4, f28, f8, f4 + FNMADD f5, f28, f9, f5 + FNMADD f6, f28, f10, f6 + FNMADD f7, f28, f11, f7 + + FMSUB f0, f31, f9, f0 + FNMADD f1, f31, f8, f1 + FMSUB f2, f31, f11, f2 + FNMADD f3, f31, f10, f3 + + FNMADD f0, f30, f8, f0 + FNMADD f1, f30, f9, f1 + FNMADD f2, f30, f10, f2 + FNMADD f3, f30, f11, f3 + + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + LFD f28, 8 * SIZE(BO) + LFD f29, 9 * SIZE(BO) + LFD f30, 0 * SIZE(BO) + LFD f31, 1 * SIZE(BO) + + FMUL f16, f27, f5 + FMUL f17, f27, f4 + FMUL f18, f27, f7 + FMUL f19, f27, f6 + + FMADD f4, f26, f4, f16 + FMSUB f5, f26, f5, f17 + FMADD f6, f26, f6, f18 + FMSUB f7, f26, f7, f19 + + FMSUB f0, f29, f5, f0 + FNMADD f1, f29, f4, f1 + FMSUB f2, f29, f7, f2 + FNMADD f3, f29, f6, f3 + + FNMADD f0, f28, f4, f0 + FNMADD f1, f28, f5, f1 + FNMADD f2, f28, f6, f2 + FNMADD f3, f28, f7, f3 + + FMUL f16, f31, f1 + FMUL f17, f31, f0 + FMUL f18, f31, f3 + FMUL f19, f31, f2 + + FMADD f0, f30, f0, f16 + FMSUB f1, f30, f1, f17 + FMADD f2, f30, f2, f18 + FMSUB f3, f30, f3, f19 + +#endif +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE + subi CO3, CO3, 4 * SIZE + subi CO4, CO4, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f4, 2 * SIZE(BO) + STFD f5, 3 * SIZE(BO) + STFD f8, 4 * SIZE(BO) + STFD f9, 5 * SIZE(BO) + STFD f12, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) + + STFD f2, 8 * SIZE(BO) + STFD f3, 9 * SIZE(BO) + STFD f6, 10 * SIZE(BO) 
+ STFD f7, 11 * SIZE(BO) + STFD f10, 12 * SIZE(BO) + STFD f11, 13 * SIZE(BO) + STFD f14, 14 * SIZE(BO) + STFD f15, 15 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + STFD f4, 4 * SIZE(AO) + STFD f5, 5 * SIZE(AO) + STFD f6, 6 * SIZE(AO) + STFD f7, 7 * SIZE(AO) + + STFD f8, 8 * SIZE(AO) + STFD f9, 9 * SIZE(AO) + STFD f10, 10 * SIZE(AO) + STFD f11, 11 * SIZE(AO) + STFD f12, 12 * SIZE(AO) + STFD f13, 13 * SIZE(AO) + STFD f14, 14 * SIZE(AO) + STFD f15, 15 * SIZE(AO) + +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f10, 2 * SIZE(CO3) + STFD f11, 3 * SIZE(CO3) + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + STFD f14, 2 * SIZE(CO4) + STFD f15, 3 * SIZE(CO4) + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + addi CO3, CO3, 4 * SIZE + addi CO4, CO4, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 2 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + addic. I, I, -1 + bgt LL(11) + .align 4 + +LL(20): + andi. I, M, 1 + ble LL(29) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 2 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(25) + .align 4 + +LL(22): + FMA1 f0, f16, f20, f0 + FMA4 f3, f17, f20, f3 + FMA2 f1, f16, f21, f1 + FMA3 f2, f17, f21, f2 + + LFD f28, 4 * SIZE(AO) + LFD f29, 5 * SIZE(AO) + LFD f30, 6 * SIZE(AO) + LFD f31, 7 * SIZE(AO) + + FMA1 f4, f16, f22, f4 + FMA4 f7, f17, f22, f7 + FMA2 f5, f16, f23, f5 + FMA3 f6, f17, f23, f6 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMA1 f8, f16, f24, f8 + FMA4 f11, f17, f24, f11 + FMA2 f9, f16, f25, f9 + FMA3 f10, f17, f25, f10 + + FMA1 f12, f16, f26, f12 + FMA4 f15, f17, f26, f15 + FMA2 f13, f16, f27, f13 + FMA3 f14, f17, f27, f14 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMA1 f0, f18, f20, f0 + FMA4 f3, f19, f20, f3 + FMA2 f1, f18, f21, f1 + FMA3 f2, f19, f21, f2 + + FMA1 f4, f18, f22, f4 + FMA4 f7, f19, f22, f7 + FMA2 f5, f18, f23, f5 + FMA3 f6, f19, f23, f6 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMA1 f8, f18, f24, f8 + FMA4 f11, f19, f24, f11 + FMA2 f9, f18, f25, f9 + FMA3 f10, f19, f25, f10 + + FMA1 f12, f18, f26, f12 + FMA4 f15, f19, f26, f15 + FMA2 f13, f18, f27, f13 + FMA3 f14, f19, f27, f14 + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + FMA1 f0, f28, f20, f0 + FMA4 f3, f29, f20, f3 + FMA2 f1, f28, f21, f1 + FMA3 f2, f29, f21, f2 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMA1 f4, f28, f22, f4 + FMA4 f7, f29, f22, f7 + FMA2 f5, f28, f23, f5 + FMA3 f6, f29, f23, f6 + + LFD f20, 24 * SIZE(BO) + LFD f21, 25 * SIZE(BO) + LFD f22, 26 * SIZE(BO) + LFD f23, 27 * SIZE(BO) + + FMA1 f8, f28, f24, f8 + FMA4 f11, f29, f24, f11 + FMA2 f9, f28, f25, f9 + FMA3 f10, f29, f25, f10 + + FMA1 f12, f28, f26, f12 + FMA4 f15, f29, f26, f15 + FMA2 f13, f28, f27, f13 + FMA3 f14, f29, f27, f14 + + LFD f24, 28 * SIZE(BO) + LFD f25, 29 * SIZE(BO) + LFD f26, 30 * SIZE(BO) + LFD f27, 31 * SIZE(BO) + + FMA1 f0, f30, f20, f0 + FMA4 f3, f31, f20, f3 + FMA2 f1, f30, f21, f1 + FMA3 f2, f31, f21, f2 + + FMA1 f4, f30, f22, f4 + FMA4 f7, f31, f22, f7 + FMA2 f5, f30, f23, f5 + FMA3 f6, f31, f23, f6 + + LFD f20, 32 * SIZE(BO) + LFD f21, 33 * SIZE(BO) + LFD f22, 34 * SIZE(BO) + LFD f23, 35 * SIZE(BO) + + FMA1 f8, f30, f24, f8 + FMA4 f11, f31, f24, f11 + FMA2 f9, f30, f25, f9 + FMA3 f10, f31, f25, f10 + + FMA1 f12, f30, f26, f12 + FMA4 f15, f31, f26, f15 + FMA2 f13, f30, f27, f13 + FMA3 f14, f31, f27, f14 + + LFD f24, 36 * SIZE(BO) + LFD f25, 37 * SIZE(BO) + LFD f26, 38 * SIZE(BO) + LFD f27, 39 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 32 * SIZE + bdnz LL(22) + .align 4 + +LL(25): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble LL(27) + .align 4 + +LL(26): + FMA1 f0, f16, f20, f0 + FMA4 f3, f17, f20, f3 + FMA2 f1, f16, f21, f1 + FMA3 f2, f17, f21, f2 + + FMA1 f4, f16, f22, f4 + FMA4 f7, f17, f22, f7 + FMA2 f5, f16, f23, f5 + FMA3 f6, f17, f23, f6 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMA1 f8, f16, f24, f8 + FMA4 f11, f17, f24, f11 + FMA2 f9, f16, f25, f9 + FMA3 f10, f17, f25, f10 + + FMA1 f12, f16, f26, f12 + FMA4 f15, f17, f26, f15 + FMA2 f13, f16, f27, f13 + FMA3 f14, f17, f27, f14 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(26) + .align 4 + +LL(27): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 0 + ZBASE_SHIFT + slwi r0, r0, 2 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + + FADD f0, f0, f2 + FADD f1, f1, f3 + FADD f4, f4, f6 + FADD f5, f5, f7 + + FADD f8, f8, f10 + FADD f9, f9, f11 + FADD f12, f12, f14 + FADD f13, f13, f15 + + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f4, f18, f4 + FSUB f5, f19, f5 + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f8, f20, f8 + FSUB f9, f21, f9 + FSUB f12, f22, f12 + FSUB f13, f23, f13 + +#else + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f4, f20, f4 + FSUB f5, f21, f5 + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f28, 6 * SIZE(AO) + LFD f29, 7 * SIZE(AO) + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f12, f28, f12 + FSUB f13, f29, f13 +#endif + +#ifdef LN + LFD f28, 0 * SIZE(AO) + LFD f29, 1 * SIZE(AO) + + FMUL f16, f29, f1 + FMUL f17, f29, f0 + FMUL f18, f29, f5 + FMUL f19, f29, f4 + + FMUL f20, f29, f9 + FMUL f21, f29, f8 + FMUL f22, f29, f13 + FMUL f23, f29, f12 + +#ifndef CONJ + FMSUB f0, f28, f0, f16 + FMADD f1, f28, f1, f17 + FMSUB f4, f28, f4, f18 + FMADD f5, f28, f5, f19 + + FMSUB f8, f28, f8, f20 + FMADD f9, f28, f9, f21 + FMSUB f12, f28, f12, f22 + FMADD f13, f28, f13, f23 +#else + + FMADD f0, f28, f0, f16 + FMSUB f1, f28, f1, f17 + FMADD f4, f28, f4, f18 + FMSUB f5, f28, f5, f19 + + FMADD f8, f28, f8, f20 + FMSUB f9, f28, f9, f21 + FMADD f12, f28, f12, f22 + FMSUB f13, f28, f13, f23 +#endif +#endif + +#ifdef LT + LFD f24, 0 * SIZE(AO) + LFD f25, 1 * SIZE(AO) + + FMUL f16, f25, f1 + FMUL f17, f25, f0 + FMUL f18, f25, f5 + FMUL f19, f25, f4 + + FMUL f20, f25, f9 + FMUL f21, f25, f8 + FMUL f22, f25, f13 + FMUL f23, f25, f12 + +#ifndef CONJ + + FMSUB f0, f24, f0, f16 + FMADD f1, f24, f1, f17 + FMSUB f4, f24, f4, f18 + FMADD f5, f24, f5, f19 + + FMSUB f8, f24, f8, f20 + FMADD f9, f24, f9, f21 + FMSUB f12, f24, f12, f22 + FMADD f13, f24, f13, f23 + +#else + + + FMADD f0, f24, f0, f16 + FMSUB f1, f24, f1, f17 + FMADD f4, f24, f4, f18 + FMSUB f5, f24, f5, f19 + + FMADD f8, f24, f8, f20 + FMSUB f9, f24, f9, f21 + FMADD f12, f24, f12, f22 + FMSUB f13, f24, f13, f23 + +#endif +#endif + +#ifdef RN + LFD f24, 0 * SIZE(BO) + LFD f25, 1 * SIZE(BO) + LFD f26, 2 * SIZE(BO) + LFD f27, 3 * SIZE(BO) + LFD f28, 4 * SIZE(BO) + LFD f29, 5 * SIZE(BO) + LFD f30, 6 * SIZE(BO) + LFD f31, 7 * SIZE(BO) + + FMUL f16, f25, f1 + FMUL f17, f25, f0 + 
+#ifndef CONJ + + FMSUB f0, f24, f0, f16 + FMADD f1, f24, f1, f17 + + FMADD f4, f27, f1, f4 + FNMSUB f5, f27, f0, f5 + FNMSUB f4, f26, f0, f4 + FNMSUB f5, f26, f1, f5 + + FMADD f8, f29, f1, f8 + FNMSUB f9, f29, f0, f9 + FNMSUB f8, f28, f0, f8 + FNMSUB f9, f28, f1, f9 + + FMADD f12, f31, f1, f12 + FNMSUB f13, f31, f0, f13 + FNMSUB f12, f30, f0, f12 + FNMSUB f13, f30, f1, f13 + + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FMUL f16, f27, f5 + FMUL f17, f27, f4 + FMSUB f4, f26, f4, f16 + FMADD f5, f26, f5, f17 + + FMADD f8, f29, f5, f8 + FNMSUB f9, f29, f4, f9 + FNMSUB f8, f28, f4, f8 + FNMSUB f9, f28, f5, f9 + + FMADD f12, f31, f5, f12 + FNMSUB f13, f31, f4, f13 + FNMSUB f12, f30, f4, f12 + FNMSUB f13, f30, f5, f13 + + LFD f26, 20 * SIZE(BO) + LFD f27, 21 * SIZE(BO) + LFD f28, 22 * SIZE(BO) + LFD f29, 23 * SIZE(BO) + LFD f30, 30 * SIZE(BO) + LFD f31, 31 * SIZE(BO) + + FMUL f16, f27, f9 + FMUL f17, f27, f8 + FMSUB f8, f26, f8, f16 + FMADD f9, f26, f9, f17 + + FMADD f12, f29, f9, f12 + FNMSUB f13, f29, f8, f13 + FNMSUB f12, f28, f8, f12 + FNMSUB f13, f28, f9, f13 + + FMUL f16, f31, f13 + FMUL f17, f31, f12 + FMSUB f12, f30, f12, f16 + FMADD f13, f30, f13, f17 + +#else + + FMADD f0, f24, f0, f16 + FMSUB f1, f24, f1, f17 + + FMSUB f4, f27, f1, f4 + FNMADD f5, f27, f0, f5 + FNMADD f4, f26, f0, f4 + FNMADD f5, f26, f1, f5 + + FMSUB f8, f29, f1, f8 + FNMADD f9, f29, f0, f9 + FNMADD f8, f28, f0, f8 + FNMADD f9, f28, f1, f9 + + FMSUB f12, f31, f1, f12 + FNMADD f13, f31, f0, f13 + FNMADD f12, f30, f0, f12 + FNMADD f13, f30, f1, f13 + + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FMUL f16, f27, f5 + FMUL f17, f27, f4 + FMADD f4, f26, f4, f16 + FMSUB f5, f26, f5, f17 + + FMSUB f8, f29, f5, f8 + FNMADD f9, f29, f4, f9 + FNMADD f8, f28, f4, f8 + FNMADD f9, f28, f5, f9 + + FMSUB f12, f31, f5, f12 + FNMADD f13, f31, f4, f13 + FNMADD f12, f30, f4, f12 + FNMADD f13, f30, f5, f13 + + LFD f26, 20 * SIZE(BO) + LFD f27, 21 * SIZE(BO) + LFD f28, 22 * SIZE(BO) + LFD f29, 23 * SIZE(BO) + LFD f30, 30 * SIZE(BO) + LFD f31, 31 * SIZE(BO) + + FMUL f16, f27, f9 + FMUL f17, f27, f8 + FMADD f8, f26, f8, f16 + FMSUB f9, f26, f9, f17 + + FMSUB f12, f29, f9, f12 + FNMADD f13, f29, f8, f13 + FNMADD f12, f28, f8, f12 + FNMADD f13, f28, f9, f13 + + FMUL f16, f31, f13 + FMUL f17, f31, f12 + FMADD f12, f30, f12, f16 + FMSUB f13, f30, f13, f17 +#endif + +#endif + +#ifdef RT + LFD f24, 30 * SIZE(BO) + LFD f25, 31 * SIZE(BO) + LFD f26, 28 * SIZE(BO) + LFD f27, 29 * SIZE(BO) + LFD f28, 26 * SIZE(BO) + LFD f29, 27 * SIZE(BO) + LFD f30, 24 * SIZE(BO) + LFD f31, 25 * SIZE(BO) + + FMUL f16, f25, f13 + FMUL f17, f25, f12 + +#ifndef CONJ + + FMSUB f12, f24, f12, f16 + FMADD f13, f24, f13, f17 + + FMADD f8, f27, f13, f8 + FNMSUB f9, f27, f12, f9 + FNMSUB f8, f26, f12, f8 + FNMSUB f9, f26, f13, f9 + + FMADD f4, f29, f13, f4 + FNMSUB f5, f29, f12, f5 + FNMSUB f4, f28, f12, f4 + FNMSUB f5, f28, f13, f5 + + FMADD f0, f31, f13, f0 + FNMSUB f1, f31, f12, f1 + FNMSUB f0, f30, f12, f0 + FNMSUB f1, f30, f13, f1 + + LFD f26, 20 * SIZE(BO) + LFD f27, 21 * SIZE(BO) + LFD f28, 18 * SIZE(BO) + LFD f29, 19 * SIZE(BO) + LFD f30, 16 * SIZE(BO) + LFD f31, 17 * SIZE(BO) + + FMUL f16, f27, f9 + FMUL f17, f27, f8 + FMSUB f8, f26, f8, f16 + FMADD f9, f26, f9, f17 + + FMADD f4, f29, f9, f4 + FNMSUB f5, f29, f8, f5 + FNMSUB f4, f28, f8, f4 + FNMSUB f5, 
f28, f9, f5 + + FMADD f0, f31, f9, f0 + FNMSUB f1, f31, f8, f1 + FNMSUB f0, f30, f8, f0 + FNMSUB f1, f30, f9, f1 + + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + LFD f28, 8 * SIZE(BO) + LFD f29, 9 * SIZE(BO) + LFD f30, 0 * SIZE(BO) + LFD f31, 1 * SIZE(BO) + + FMUL f16, f27, f5 + FMUL f17, f27, f4 + FMSUB f4, f26, f4, f16 + FMADD f5, f26, f5, f17 + + FMADD f0, f29, f5, f0 + FNMSUB f1, f29, f4, f1 + FNMSUB f0, f28, f4, f0 + FNMSUB f1, f28, f5, f1 + + FMUL f16, f31, f1 + FMUL f17, f31, f0 + FMSUB f0, f30, f0, f16 + FMADD f1, f30, f1, f17 + +#else + FMADD f12, f24, f12, f16 + FMSUB f13, f24, f13, f17 + + FMSUB f8, f27, f13, f8 + FNMADD f9, f27, f12, f9 + FNMADD f8, f26, f12, f8 + FNMADD f9, f26, f13, f9 + + FMSUB f4, f29, f13, f4 + FNMADD f5, f29, f12, f5 + FNMADD f4, f28, f12, f4 + FNMADD f5, f28, f13, f5 + + FMSUB f0, f31, f13, f0 + FNMADD f1, f31, f12, f1 + FNMADD f0, f30, f12, f0 + FNMADD f1, f30, f13, f1 + + LFD f26, 20 * SIZE(BO) + LFD f27, 21 * SIZE(BO) + LFD f28, 18 * SIZE(BO) + LFD f29, 19 * SIZE(BO) + LFD f30, 16 * SIZE(BO) + LFD f31, 17 * SIZE(BO) + + FMUL f16, f27, f9 + FMUL f17, f27, f8 + FMADD f8, f26, f8, f16 + FMSUB f9, f26, f9, f17 + + FMSUB f4, f29, f9, f4 + FNMADD f5, f29, f8, f5 + FNMADD f4, f28, f8, f4 + FNMADD f5, f28, f9, f5 + + FMSUB f0, f31, f9, f0 + FNMADD f1, f31, f8, f1 + FNMADD f0, f30, f8, f0 + FNMADD f1, f30, f9, f1 + + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + LFD f28, 8 * SIZE(BO) + LFD f29, 9 * SIZE(BO) + LFD f30, 0 * SIZE(BO) + LFD f31, 1 * SIZE(BO) + + FMUL f16, f27, f5 + FMUL f17, f27, f4 + FMADD f4, f26, f4, f16 + FMSUB f5, f26, f5, f17 + + FMSUB f0, f29, f5, f0 + FNMADD f1, f29, f4, f1 + FNMADD f0, f28, f4, f0 + FNMADD f1, f28, f5, f1 + + FMUL f16, f31, f1 + FMUL f17, f31, f0 + FMADD f0, f30, f0, f16 + FMSUB f1, f30, f1, f17 + +#endif +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE + subi CO3, CO3, 2 * SIZE + subi CO4, CO4, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f4, 2 * SIZE(BO) + STFD f5, 3 * SIZE(BO) + STFD f8, 4 * SIZE(BO) + STFD f9, 5 * SIZE(BO) + STFD f12, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f4, 2 * SIZE(AO) + STFD f5, 3 * SIZE(AO) + STFD f8, 4 * SIZE(AO) + STFD f9, 5 * SIZE(AO) + STFD f12, 6 * SIZE(AO) + STFD f13, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + addi CO3, CO3, 2 * SIZE + addi CO4, CO4, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + ZBASE_SHIFT + slwi TEMP, TEMP, 2 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +LL(29): +#ifdef LN + slwi r0, K, 2 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 4 +#endif + +#ifdef RT + subi KK, KK, 4 +#endif + + addic. J, J, -1 + bgt LL(10) + .align 4 + +LL(30): + andi. 
J, N, 2 + ble LL(50) + .align 4 + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + srawi. I, M, 1 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO2, LDC +#endif + ble LL(40) + .align 4 + +LL(31): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + dcbtst CO1, PREC + dcbtst CO2, PREC + + srawi. r0, KK, 3 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + dcbtst CO1, PREC + dcbtst CO2, PREC + + srawi. r0, TEMP, 3 + mtspr CTR, r0 +#endif + ble LL(35) + .align 4 + +LL(32): + dcbt AO, PREA + dcbtst BO, PREA + + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f24, 4 * SIZE(AO) + LFD f28, 4 * SIZE(BO) + LFD f25, 5 * SIZE(AO) + LFD f29, 5 * SIZE(BO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + LFD f26, 6 * SIZE(AO) + LFD f30, 6 * SIZE(BO) + LFD f27, 7 * SIZE(AO) + LFD f31, 7 * SIZE(BO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + + LFD f16, 8 * SIZE(AO) + LFD f20, 8 * SIZE(BO) + LFD f17, 9 * SIZE(AO) + LFD f21, 9 * SIZE(BO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + + FMADD f2, f26, f28, f2 + FMADD f6, f26, f29, f6 + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + + LFD f18, 10 * SIZE(AO) + LFD f22, 10 * SIZE(BO) + LFD f19, 11 * SIZE(AO) + LFD f23, 11 * SIZE(BO) + + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f24, 12 * SIZE(AO) + LFD f28, 12 * SIZE(BO) + LFD f25, 13 * SIZE(AO) + LFD f29, 13 * SIZE(BO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + LFD f26, 14 * SIZE(AO) + LFD f30, 14 * SIZE(BO) + LFD f27, 15 * SIZE(AO) + LFD f31, 15 * SIZE(BO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 
+ + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + + LFD f16, 16 * SIZE(AO) + LFD f20, 16 * SIZE(BO) + LFD f17, 17 * SIZE(AO) + LFD f21, 17 * SIZE(BO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + + FMADD f2, f26, f28, f2 + FMADD f6, f26, f29, f6 + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + + LFD f18, 18 * SIZE(AO) + LFD f22, 18 * SIZE(BO) + LFD f19, 19 * SIZE(AO) + LFD f23, 19 * SIZE(BO) + + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f24, 20 * SIZE(AO) + LFD f28, 20 * SIZE(BO) + LFD f25, 21 * SIZE(AO) + LFD f29, 21 * SIZE(BO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + LFD f26, 22 * SIZE(AO) + LFD f30, 22 * SIZE(BO) + LFD f27, 23 * SIZE(AO) + LFD f31, 23 * SIZE(BO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + + LFD f16, 24 * SIZE(AO) + LFD f20, 24 * SIZE(BO) + LFD f17, 25 * SIZE(AO) + LFD f21, 25 * SIZE(BO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + + FMADD f2, f26, f28, f2 + FMADD f6, f26, f29, f6 + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + + LFD f18, 26 * SIZE(AO) + LFD f22, 26 * SIZE(BO) + LFD f19, 27 * SIZE(AO) + LFD f23, 27 * SIZE(BO) + + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f24, 28 * SIZE(AO) + LFD f28, 28 * SIZE(BO) + LFD f25, 29 * SIZE(AO) + LFD f29, 29 * SIZE(BO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + LFD f26, 30 * SIZE(AO) + LFD f30, 30 * SIZE(BO) + LFD f27, 31 * SIZE(AO) + LFD f31, 31 * SIZE(BO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + + LFD f16, 32 * SIZE(AO) + LFD f20, 32 * SIZE(BO) + LFD f17, 33 * SIZE(AO) + LFD f21, 33 * SIZE(BO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + + FMADD f2, f26, f28, f2 + FMADD f6, f26, f29, f6 + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + + LFD f18, 34 * SIZE(AO) + LFD f22, 34 * SIZE(BO) + LFD f19, 35 * SIZE(AO) + LFD f23, 35 * SIZE(BO) + + addi AO, AO, 32 * SIZE + addi BO, BO, 32 * SIZE + + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + + bdnz LL(32) + .align 4 + +LL(35): +#if defined(LT) || defined(RN) + andi. r0, KK, 7 +#else + andi. 
r0, TEMP, 7 +#endif + mtspr CTR, r0 + ble LL(38) + .align 4 + +LL(36): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 4 * SIZE + + bdnz LL(36) + .align 4 + +LL(38): +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 + + FSUB f8, f8, f13 + FADD f9, f9, f12 + FSUB f10, f10, f15 + FADD f11, f11, f14 + +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 + + FADD f8, f8, f13 + FSUB f9, f12, f9 + FADD f10, f10, f15 + FSUB f11, f14, f11 +#endif + +#if defined(LN) || defined(RT) + subi r0, KK, 2 + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f8, f18, f8 + FSUB f9, f19, f9 + + FSUB f2, f20, f2 + FSUB f3, f21, f3 + FSUB f10, f22, f10 + FSUB f11, f23, f11 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f8, f20, f8 + FSUB f9, f21, f9 + FSUB f10, f22, f10 + FSUB f11, f23, f11 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 + FSUB f2, f18, f2 + FADD f3, f19, f3 + + FSUB f8, f20, f8 + FADD f9, f21, f9 + FSUB f10, f22, f10 + FADD f11, f23, f11 +#endif +#endif + +#ifdef LN + LFD f16, 6 * SIZE(AO) + LFD f17, 7 * SIZE(AO) + LFD f18, 4 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f6, f17, f3 + FMUL f7, f17, f2 + FMUL f14, f17, f11 + FMUL f15, f17, f10 + +#ifndef CONJ + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + FMSUB f10, f16, f10, f14 + FMADD f11, f16, f11, f15 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + FMADD f8, f19, f11, f8 + FNMSUB f9, f19, f10, f9 + + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + FNMSUB f8, f18, f10, f8 + FNMSUB f9, f18, f11, f9 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f9 + FMUL f13, f21, f8 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f8, f20, f8, f12 + FMADD f9, f20, f9, f13 + +#else + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + FMADD f10, f16, f10, f14 + FMSUB f11, f16, f11, f15 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + FMSUB f8, f19, f11, f8 + FNMADD f9, f19, f10, f9 + + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + FNMADD f8, f18, f10, f8 + FNMADD f9, f18, f11, f9 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f9 + FMUL f13, f21, f8 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f8, f20, f8, f12 + FMSUB f9, f20, f9, f13 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + LFD 
f20, 6 * SIZE(AO) + LFD f21, 7 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f12, f17, f9 + FMUL f13, f17, f8 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f8, f16, f8, f12 + FMADD f9, f16, f9, f13 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + FMADD f10, f19, f9, f10 + FNMSUB f11, f19, f8, f11 + + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + FNMSUB f10, f18, f8, f10 + FNMSUB f11, f18, f9, f11 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMUL f12, f21, f11 + FMUL f13, f21, f10 + + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 + FMSUB f10, f20, f10, f12 + FMADD f11, f20, f11, f13 + +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f8, f16, f8, f12 + FMSUB f9, f16, f9, f13 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + FMSUB f10, f19, f9, f10 + FNMADD f11, f19, f8, f11 + + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + FNMADD f10, f18, f8, f10 + FNMADD f11, f18, f9, f11 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMUL f12, f21, f11 + FMUL f13, f21, f10 + + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 + FMADD f10, f20, f10, f12 + FMSUB f11, f20, f11, f13 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + LFD f20, 6 * SIZE(BO) + LFD f21, 7 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + + FMADD f8, f19, f1, f8 + FNMSUB f9, f19, f0, f9 + FMADD f10, f19, f3, f10 + FNMSUB f11, f19, f2, f11 + + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f10, f18, f2, f10 + FNMSUB f11, f18, f3, f11 + + FMUL f4, f21, f9 + FMUL f5, f21, f8 + FMUL f6, f21, f11 + FMUL f7, f21, f10 + + FMSUB f8, f20, f8, f4 + FMADD f9, f20, f9, f5 + FMSUB f10, f20, f10, f6 + FMADD f11, f20, f11, f7 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + + FMSUB f8, f19, f1, f8 + FNMADD f9, f19, f0, f9 + FMSUB f10, f19, f3, f10 + FNMADD f11, f19, f2, f11 + + FNMADD f8, f18, f0, f8 + FNMADD f9, f18, f1, f9 + FNMADD f10, f18, f2, f10 + FNMADD f11, f18, f3, f11 + + FMUL f4, f21, f9 + FMUL f5, f21, f8 + FMUL f6, f21, f11 + FMUL f7, f21, f10 + + FMADD f8, f20, f8, f4 + FMSUB f9, f20, f9, f5 + FMADD f10, f20, f10, f6 + FMSUB f11, f20, f11, f7 +#endif +#endif + +#ifdef RT + LFD f16, 6 * SIZE(BO) + LFD f17, 7 * SIZE(BO) + LFD f18, 4 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f12, f17, f9 + FMUL f13, f17, f8 + FMUL f14, f17, f11 + FMUL f15, f17, f10 + +#ifndef CONJ + FMSUB f8, f16, f8, f12 + FMADD f9, f16, f9, f13 + FMSUB f10, f16, f10, f14 + FMADD f11, f16, f11, f15 + + FMADD f0, f19, f9, f0 + FNMSUB f1, f19, f8, f1 + FMADD f2, f19, f11, f2 + FNMSUB f3, f19, f10, f3 + + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + FNMSUB f2, f18, f10, f2 + FNMSUB f3, f18, f11, f3 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f6 + FMADD f3, f20, f3, f7 + +#else + FMADD f8, f16, f8, f12 + FMSUB f9, f16, f9, f13 + FMADD f10, f16, f10, f14 + FMSUB f11, f16, f11, f15 + + FMSUB f0, f19, f9, f0 + FNMADD f1, f19, f8, f1 + FMSUB f2, f19, f11, f2 + FNMADD f3, f19, f10, f3 + + FNMADD f0, f18, f8, f0 + FNMADD f1, f18, f9, f1 + FNMADD f2, f18, f10, f2 + FNMADD f3, f18, f11, f3 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + + FMADD f0, 
f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f6 + FMSUB f3, f20, f3, f7 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f9, 3 * SIZE(BO) + + STFD f2, 4 * SIZE(BO) + STFD f3, 5 * SIZE(BO) + STFD f10, 6 * SIZE(BO) + STFD f11, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f8, 4 * SIZE(AO) + STFD f9, 5 * SIZE(AO) + STFD f10, 6 * SIZE(AO) + STFD f11, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f8, 0 * SIZE(CO2) + STFD f9, 1 * SIZE(CO2) + STFD f10, 2 * SIZE(CO2) + STFD f11, 3 * SIZE(CO2) + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + addic. I, I, -1 + bgt LL(31) + .align 4 + +LL(40): + andi. I, M, 1 + ble LL(49) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(45) + .align 4 + +LL(42): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f16, f22, f2 + FMADD f3, f16, f23, f3 + + FMADD f4, f17, f20, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f17, f22, f6 + FMADD f7, f17, f23, f7 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f18, f24, f0 + FMADD f1, f18, f25, f1 + FMADD f2, f18, f26, f2 + FMADD f3, f18, f27, f3 + + FMADD f4, f19, f24, f4 + FMADD f5, f19, f25, f5 + FMADD f6, f19, f26, f6 + FMADD f7, f19, f27, f7 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f16, f22, f2 + FMADD f3, f16, f23, f3 + + FMADD f4, f17, f20, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f17, f22, f6 + FMADD f7, f17, f23, f7 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f0, f18, f24, f0 + FMADD f1, f18, f25, f1 + FMADD f2, f18, f26, f2 + FMADD f3, f18, f27, f3 + + FMADD f4, f19, f24, f4 + FMADD f5, f19, f25, f5 + FMADD f6, f19, f26, f6 + FMADD f7, f19, f27, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi BO, BO, 16 * SIZE + addi AO, AO, 8 * SIZE + bdnz LL(42) + .align 4 + +LL(45): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble LL(47) + .align 4 + +LL(46): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f16, f22, f2 + FMADD f3, f16, f23, f3 + + FMADD f4, f17, f20, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f17, f22, f6 + FMADD f7, f17, f23, f7 + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + addi AO, AO, 2 * SIZE + addi BO, BO, 4 * SIZE + bdnz LL(46) + .align 4 + +LL(47): +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 +#else +#if defined(LN) || defined(LT) + FADD f0, f0, f5 + FSUB f1, f1, f4 + FADD f2, f2, f7 + FSUB f3, f3, f6 +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 +#endif +#endif + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + ZBASE_SHIFT + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f20, f2 + FSUB f3, f21, f3 +#endif + +#ifdef LN + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f3 + FMUL f13, f21, f2 + +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f12 + FMADD f3, f20, f3, f13 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f12 + FMSUB f3, f20, f3, f13 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f12, f17, f3 + FMUL f13, f17, 
f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f12 + FMADD f3, f16, f3, f13 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f12 + FMSUB f3, f16, f3, f13 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + LFD f20, 6 * SIZE(BO) + LFD f21, 7 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 +#endif +#endif + +#ifdef RT + LFD f16, 6 * SIZE(BO) + LFD f17, 7 * SIZE(BO) + LFD f18, 4 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f12, f17, f3 + FMUL f13, f17, f2 + +#ifndef CONJ + FMSUB f2, f16, f2, f12 + FMADD f3, f16, f3, f13 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f2, f16, f2, f12 + FMSUB f3, f16, f3, f13 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +LL(49): +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + .align 4 + +LL(50): + andi. J, N, 1 + ble LL(999) + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + srawi. I, M, 1 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, C, LDC +#endif + ble LL(60) + .align 4 + +LL(51): +#if defined(LT) || defined(RN) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(B) + LFD f17, 1 * SIZE(B) + LFD f18, 2 * SIZE(B) + LFD f19, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + dcbt CO1, PREC + + srawi. 
r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(55) + .align 4 + +LL(52): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f16, f22, f2 + FMADD f3, f16, f23, f3 + + FMADD f4, f17, f20, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f17, f22, f6 + FMADD f7, f17, f23, f7 + + LFD f20, 8 * SIZE(AO) + LFD f21, 9 * SIZE(AO) + LFD f22, 10 * SIZE(AO) + LFD f23, 11 * SIZE(AO) + + FMADD f0, f18, f24, f0 + FMADD f1, f18, f25, f1 + FMADD f2, f18, f26, f2 + FMADD f3, f18, f27, f3 + + FMADD f4, f19, f24, f4 + FMADD f5, f19, f25, f5 + FMADD f6, f19, f26, f6 + FMADD f7, f19, f27, f7 + + LFD f24, 12 * SIZE(AO) + LFD f25, 13 * SIZE(AO) + LFD f26, 14 * SIZE(AO) + LFD f27, 15 * SIZE(AO) + + LFD f16, 4 * SIZE(BO) + LFD f17, 5 * SIZE(BO) + LFD f18, 6 * SIZE(BO) + LFD f19, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f16, f22, f2 + FMADD f3, f16, f23, f3 + + FMADD f4, f17, f20, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f17, f22, f6 + FMADD f7, f17, f23, f7 + + LFD f20, 16 * SIZE(AO) + LFD f21, 17 * SIZE(AO) + LFD f22, 18 * SIZE(AO) + LFD f23, 19 * SIZE(AO) + + FMADD f0, f18, f24, f0 + FMADD f1, f18, f25, f1 + FMADD f2, f18, f26, f2 + FMADD f3, f18, f27, f3 + + FMADD f4, f19, f24, f4 + FMADD f5, f19, f25, f5 + FMADD f6, f19, f26, f6 + FMADD f7, f19, f27, f7 + + LFD f24, 20 * SIZE(AO) + LFD f25, 21 * SIZE(AO) + LFD f26, 22 * SIZE(AO) + LFD f27, 23 * SIZE(AO) + + LFD f16, 8 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 10 * SIZE(BO) + LFD f19, 11 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 8 * SIZE + dcbt PREA, AO + dcbt PREA, BO + bdnz LL(52) + .align 4 + +LL(55): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble LL(57) + .align 4 + +LL(56): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f16, f22, f2 + FMADD f3, f16, f23, f3 + + FMADD f4, f17, f20, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f17, f22, f6 + FMADD f7, f17, f23, f7 + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + LFD f16, 2 * SIZE(BO) + LFD f17, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(56) + .align 4 + +LL(57): +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 +#endif + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + ZBASE_SHIFT + slwi r0, r0, 0 + ZBASE_SHIFT + + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 + FSUB f2, f18, f2 + FADD f3, f19, f3 +#endif +#endif + +#ifdef LN + LFD f16, 6 * SIZE(AO) + LFD f17, 7 * SIZE(AO) + LFD f18, 4 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + LFD f20, 6 * SIZE(AO) + LFD f21, 7 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 +#endif +#endif + +#ifdef RT + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f6 + FMADD f3, f20, f3, f7 + +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f6 + FMSUB f3, f20, f3, f7 +#endif +#endif + 
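
One way to read the solve blocks above is as complex fused multiply-adds: each FMADD/FMSUB/FNMADD/FNMSUB group updates one complex accumulator with the product of two complex operands, and the CONJ branches only flip the signs applied to the imaginary parts (which operand is effectively conjugated depends on the LN/LT/RN/RT case). Below is a minimal scalar sketch of that identity; cplx, cmadd and cmadd_conj are illustrative names, not taken from the kernel, and the sketch models the arithmetic only, not the fused rounding or register scheduling.

    #include <stdio.h>

    /* Illustrative scalar model (not from the kernel): one complex
     * multiply-accumulate  acc += a*b, and a conjugated variant
     * acc += conj(a)*b, each written as four real operations,
     * mirroring the FMADD/FMSUB/FNMADD/FNMSUB groups above. */
    typedef struct { double re, im; } cplx;

    static cplx cmadd(cplx acc, cplx a, cplx b) {
        /* acc += (a.re*b.re - a.im*b.im) + i*(a.re*b.im + a.im*b.re) */
        acc.re = acc.re + a.re * b.re;
        acc.re = acc.re - a.im * b.im;
        acc.im = acc.im + a.re * b.im;
        acc.im = acc.im + a.im * b.re;
        return acc;
    }

    static cplx cmadd_conj(cplx acc, cplx a, cplx b) {
        /* Same update with a conjugated: only the signs on the
         * a.im terms flip, which is what the CONJ branches change. */
        acc.re = acc.re + a.re * b.re;
        acc.re = acc.re + a.im * b.im;
        acc.im = acc.im + a.re * b.im;
        acc.im = acc.im - a.im * b.re;
        return acc;
    }

    int main(void) {
        cplx acc = {0.0, 0.0}, a = {1.0, 2.0}, b = {3.0, -1.0};
        acc = cmadd(acc, a, b);       /* (1+2i)(3-i) = 5+5i */
        acc = cmadd_conj(acc, a, b);  /* (1-2i)(3-i) = 1-7i */
        printf("%g %+gi\n", acc.re, acc.im);   /* prints 6 -2i */
        return 0;
    }
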
+#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + addic. I, I, -1 + bgt LL(51) + .align 4 + +LL(60): + andi. I, M, 1 + ble LL(69) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, r0 + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(65) + .align 4 + +LL(62): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f17, f20, f2 + FMADD f3, f16, f21, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + + FMADD f4, f18, f22, f4 + FMADD f5, f19, f23, f5 + FMADD f6, f19, f22, f6 + FMADD f7, f18, f23, f7 + + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f17, f20, f2 + FMADD f3, f16, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + + FMADD f4, f18, f22, f4 + FMADD f5, f19, f23, f5 + FMADD f6, f19, f22, f6 + FMADD f7, f18, f23, f7 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(62) + .align 4 + +LL(65): + fadd f0, f0, f4 + fadd f1, f1, f5 + fadd f2, f2, f6 + fadd f3, f3, f7 + +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR,r0 + ble LL(67) + .align 4 + +LL(66): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f17, f20, f2 + FMADD f3, f16, f21, f3 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 2 * SIZE + + bdnz LL(66) + .align 4 + +LL(67): +#ifndef CONJ + FSUB f0, f0, f1 + FADD f1, f2, f3 +#else + FADD f0, f0, f1 + FSUB f1, f3, f2 +#endif + +#if defined(LN) || defined(RT) + subi r0, KK, 1 + slwi r0, r0, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 +#endif +#endif + +#ifdef LN + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 +#endif +#endif + +#ifdef RT + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +LL(69): +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 1 +#endif + +#ifdef RT + subi KK, KK, 1 +#endif + .align 4 + +LL(999): + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff 
--git a/kernel/power/ztrsm_kernel_power6_RT.S b/kernel/power/ztrsm_kernel_power6_RT.S new file mode 100644 index 0000000000..069a73c219 --- /dev/null +++ b/kernel/power/ztrsm_kernel_power6_RT.S @@ -0,0 +1,4696 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define FZERO 312(SP) +#else +#define STACKSIZE 256 +#define FZERO 240(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#define OFFSET r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#define AORIG r19 +#define TEMP r20 +#define KK r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO1 r26 +#define CO2 r27 +#define CO3 r28 +#define CO4 r29 + +#define PREA r30 +#define PREC r31 + +#ifndef CONJ +#define FMA1 FMADD +#define FMA2 FMADD +#define FMA3 FNMSUB +#define FMA4 FMADD +#elif defined(LN) || defined(LT) +#define FMA1 FMADD +#define FMA2 FMADD +#define FMA3 FMADD +#define FMA4 FNMSUB +#else +#define FMA1 FMADD +#define FMA2 FNMSUB +#define FMA3 FMADD +#define FMA4 FMADD +#endif + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) +#endif + + stw r0, FZERO + +#ifdef linux +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, 56 + STACKSIZE(SP) + lwz C, 60 + STACKSIZE(SP) + lwz LDC, 64 + STACKSIZE(SP) +#else + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 120 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 120 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 68 + STACKSIZE(SP) +#else + lwz OFFSET, 60 + STACKSIZE(SP) +#endif +#endif +#endif + + slwi LDC, LDC, ZBASE_SHIFT + +#ifdef LN + mullw r0, M, K + slwi r0, r0, ZBASE_SHIFT + add A, A, r0 + + slwi r0, M, ZBASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, ZBASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + cmpwi cr0, M, 0 + ble LL(999) + cmpwi cr0, N, 0 + ble LL(999) + cmpwi cr0, K, 0 + ble LL(999) + + li PREA, 48 * SIZE + li PREC, 4 * SIZE + + andi. 
J, N, 1 + ble LL(30) + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + srawi. I, M, 1 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, C, LDC +#endif + ble LL(60) + .align 4 + +LL(51): +#if defined(LT) || defined(RN) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(B) + LFD f17, 1 * SIZE(B) + LFD f18, 2 * SIZE(B) + LFD f19, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + dcbt CO1, PREC + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(55) + .align 4 + +LL(52): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f16, f22, f2 + FMADD f3, f16, f23, f3 + + FMADD f4, f17, f20, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f17, f22, f6 + FMADD f7, f17, f23, f7 + + LFD f20, 8 * SIZE(AO) + LFD f21, 9 * SIZE(AO) + LFD f22, 10 * SIZE(AO) + LFD f23, 11 * SIZE(AO) + + FMADD f0, f18, f24, f0 + FMADD f1, f18, f25, f1 + FMADD f2, f18, f26, f2 + FMADD f3, f18, f27, f3 + + FMADD f4, f19, f24, f4 + FMADD f5, f19, f25, f5 + FMADD f6, f19, f26, f6 + FMADD f7, f19, f27, f7 + + LFD f24, 12 * SIZE(AO) + LFD f25, 13 * SIZE(AO) + LFD f26, 14 * SIZE(AO) + LFD f27, 15 * SIZE(AO) + + LFD f16, 4 * SIZE(BO) + LFD f17, 5 * SIZE(BO) + LFD f18, 6 * SIZE(BO) + LFD f19, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f16, f22, f2 + FMADD f3, f16, f23, f3 + + FMADD f4, f17, f20, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f17, f22, f6 + FMADD f7, f17, f23, f7 + + LFD f20, 16 * SIZE(AO) + LFD f21, 17 * SIZE(AO) + LFD f22, 18 * SIZE(AO) + LFD f23, 19 * SIZE(AO) + + FMADD f0, f18, f24, f0 + FMADD f1, f18, f25, f1 + FMADD f2, f18, f26, f2 + FMADD f3, f18, f27, f3 + + FMADD f4, f19, f24, f4 + FMADD f5, f19, f25, f5 + FMADD f6, f19, f26, f6 + FMADD f7, f19, f27, f7 + + LFD f24, 20 * SIZE(AO) + LFD f25, 21 * SIZE(AO) + LFD f26, 22 * SIZE(AO) + LFD f27, 23 * SIZE(AO) + + LFD f16, 8 * SIZE(BO) + LFD f17, 9 * SIZE(BO) + LFD f18, 10 * SIZE(BO) + LFD f19, 11 * SIZE(BO) + + addi AO, AO, 16 * SIZE + addi BO, BO, 8 * SIZE + dcbt PREA, AO + dcbt PREA, BO + bdnz LL(52) + .align 4 + +LL(55): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble LL(57) + .align 4 + +LL(56): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f16, f22, f2 + FMADD f3, f16, f23, f3 + + FMADD f4, f17, f20, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f17, f22, f6 + FMADD f7, f17, f23, f7 + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + LFD f16, 2 * SIZE(BO) + LFD f17, 3 * SIZE(BO) + + addi BO, BO, 2 * SIZE + addi AO, AO, 4 * SIZE + bdnz LL(56) + .align 4 + +LL(57): +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 +#endif + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + ZBASE_SHIFT + slwi r0, r0, 0 + ZBASE_SHIFT + + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 + FSUB f2, f18, f2 + FADD f3, f19, f3 +#endif +#endif + +#ifdef LN + LFD f16, 6 * SIZE(AO) + LFD f17, 7 * SIZE(AO) + LFD f18, 4 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + LFD f20, 6 * SIZE(AO) + LFD f21, 7 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 +#endif +#endif + +#ifdef RT + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f6 + FMADD f3, f20, f3, f7 + +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f6 + FMSUB f3, f20, f3, f7 +#endif +#endif + 
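
In this RT kernel the same pattern closes the solve: after the accumulated products have been subtracted from the packed right-hand-side values, each unknown is recovered by a complex multiply with the diagonal entry loaded from the packed panel. No divide instruction appears anywhere in the solve, which is consistent with the packing stage storing the reciprocal of each diagonal element; treat that convention, and all names in the sketch below, as assumptions of the illustration rather than something stated in the patch. The sketch shows the scalar recurrence for a small lower-triangular solve; the backward-running cases walk the same recurrence from the other end.

    #include <complex.h>
    #include <stdio.h>

    /* Illustrative forward substitution L*x = b for a small complex
     * lower-triangular L, written the way the kernel does it: the
     * "division" by the diagonal is a complex multiply by a
     * reciprocal assumed to be precomputed when the panel is packed. */
    enum { N = 3 };

    static void solve_lower(const double complex L[N][N], double complex x[N]) {
        for (int i = 0; i < N; i++) {
            double complex s = x[i];
            for (int j = 0; j < i; j++)
                s -= L[i][j] * x[j];              /* the FMADD/FNMSUB update groups */
            double complex inv_d = 1.0 / L[i][i]; /* assumed done at packing time */
            x[i] = s * inv_d;                     /* the FMUL/FMSUB/FMADD "divide" */
        }
    }

    int main(void) {
        double complex L[N][N] = {
            { 2.0 + 0.0*I, 0, 0 },
            { 1.0 - 1.0*I, 1.0 + 1.0*I, 0 },
            { 0.5 + 0.0*I, 2.0 + 0.0*I, 3.0 - 1.0*I },
        };
        double complex x[N] = { 4.0 + 0.0*I, 2.0 + 2.0*I, 1.0 + 0.0*I };
        solve_lower(L, x);
        for (int i = 0; i < N; i++)
            printf("x[%d] = %g %+gi\n", i, creal(x[i]), cimag(x[i]));
        return 0;
    }
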
+#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + addic. I, I, -1 + bgt LL(51) + .align 4 + +LL(60): + andi. I, M, 1 + ble LL(69) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, r0 + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(65) + .align 4 + +LL(62): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f17, f20, f2 + FMADD f3, f16, f21, f3 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + + FMADD f4, f18, f22, f4 + FMADD f5, f19, f23, f5 + FMADD f6, f19, f22, f6 + FMADD f7, f18, f23, f7 + + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f17, f20, f2 + FMADD f3, f16, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + + FMADD f4, f18, f22, f4 + FMADD f5, f19, f23, f5 + FMADD f6, f19, f22, f6 + FMADD f7, f18, f23, f7 + + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(62) + .align 4 + +LL(65): + fadd f0, f0, f4 + fadd f1, f1, f5 + fadd f2, f2, f6 + fadd f3, f3, f7 + +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR,r0 + ble LL(67) + .align 4 + +LL(66): + FMADD f0, f16, f20, f0 + FMADD f1, f17, f21, f1 + FMADD f2, f17, f20, f2 + FMADD f3, f16, f21, f3 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + LFD f20, 2 * SIZE(BO) + LFD f21, 3 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 2 * SIZE + + bdnz LL(66) + .align 4 + +LL(67): +#ifndef CONJ + FSUB f0, f0, f1 + FADD f1, f2, f3 +#else + FADD f0, f0, f1 + FSUB f1, f3, f2 +#endif + +#if defined(LN) || defined(RT) + subi r0, KK, 1 + slwi r0, r0, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 +#endif +#endif + +#ifdef LN + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 +#endif +#endif + +#ifdef RT + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +LL(69): +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 1 +#endif + +#ifdef RT + subi KK, KK, 1 +#endif + .align 4 + +LL(30): + andi. J, N, 2 + ble LL(50) + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + srawi. I, M, 1 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO2, LDC +#endif + ble LL(40) + .align 4 + +LL(31): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + dcbtst CO1, PREC + dcbtst CO2, PREC + + srawi. 
r0, KK, 3 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + dcbtst CO1, PREC + dcbtst CO2, PREC + + srawi. r0, TEMP, 3 + mtspr CTR, r0 +#endif + ble LL(35) + .align 4 + +LL(32): + dcbt AO, PREA + dcbtst BO, PREA + + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f24, 4 * SIZE(AO) + LFD f28, 4 * SIZE(BO) + LFD f25, 5 * SIZE(AO) + LFD f29, 5 * SIZE(BO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + LFD f26, 6 * SIZE(AO) + LFD f30, 6 * SIZE(BO) + LFD f27, 7 * SIZE(AO) + LFD f31, 7 * SIZE(BO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + + LFD f16, 8 * SIZE(AO) + LFD f20, 8 * SIZE(BO) + LFD f17, 9 * SIZE(AO) + LFD f21, 9 * SIZE(BO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + + FMADD f2, f26, f28, f2 + FMADD f6, f26, f29, f6 + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + + LFD f18, 10 * SIZE(AO) + LFD f22, 10 * SIZE(BO) + LFD f19, 11 * SIZE(AO) + LFD f23, 11 * SIZE(BO) + + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f24, 12 * SIZE(AO) + LFD f28, 12 * SIZE(BO) + LFD f25, 13 * SIZE(AO) + LFD f29, 13 * SIZE(BO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + LFD f26, 14 * SIZE(AO) + LFD f30, 14 * SIZE(BO) + LFD f27, 15 * SIZE(AO) + LFD f31, 15 * SIZE(BO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + + LFD f16, 16 * SIZE(AO) + LFD f20, 16 * SIZE(BO) + LFD f17, 17 * SIZE(AO) + LFD f21, 17 * SIZE(BO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + + FMADD f2, f26, f28, f2 + FMADD f6, f26, f29, f6 + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + + LFD f18, 18 * SIZE(AO) + LFD f22, 18 * SIZE(BO) + LFD f19, 19 * SIZE(AO) + LFD f23, 19 * SIZE(BO) + + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f24, 20 * SIZE(AO) + LFD f28, 20 * SIZE(BO) + LFD f25, 21 * SIZE(AO) + LFD f29, 21 * SIZE(BO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, 
f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + LFD f26, 22 * SIZE(AO) + LFD f30, 22 * SIZE(BO) + LFD f27, 23 * SIZE(AO) + LFD f31, 23 * SIZE(BO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + + LFD f16, 24 * SIZE(AO) + LFD f20, 24 * SIZE(BO) + LFD f17, 25 * SIZE(AO) + LFD f21, 25 * SIZE(BO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + + FMADD f2, f26, f28, f2 + FMADD f6, f26, f29, f6 + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + + LFD f18, 26 * SIZE(AO) + LFD f22, 26 * SIZE(BO) + LFD f19, 27 * SIZE(AO) + LFD f23, 27 * SIZE(BO) + + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + LFD f24, 28 * SIZE(AO) + LFD f28, 28 * SIZE(BO) + LFD f25, 29 * SIZE(AO) + LFD f29, 29 * SIZE(BO) + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + LFD f26, 30 * SIZE(AO) + LFD f30, 30 * SIZE(BO) + LFD f27, 31 * SIZE(AO) + LFD f31, 31 * SIZE(BO) + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + FMADD f0, f24, f28, f0 + FMADD f4, f24, f29, f4 + FMADD f8, f24, f30, f8 + FMADD f12, f24, f31, f12 + + LFD f16, 32 * SIZE(AO) + LFD f20, 32 * SIZE(BO) + LFD f17, 33 * SIZE(AO) + LFD f21, 33 * SIZE(BO) + + FMADD f1, f25, f28, f1 + FMADD f5, f25, f29, f5 + FMADD f9, f25, f30, f9 + FMADD f13, f25, f31, f13 + + FMADD f2, f26, f28, f2 + FMADD f6, f26, f29, f6 + FMADD f10, f26, f30, f10 + FMADD f14, f26, f31, f14 + + LFD f18, 34 * SIZE(AO) + LFD f22, 34 * SIZE(BO) + LFD f19, 35 * SIZE(AO) + LFD f23, 35 * SIZE(BO) + + addi AO, AO, 32 * SIZE + addi BO, BO, 32 * SIZE + + FMADD f3, f27, f28, f3 + FMADD f7, f27, f29, f7 + FMADD f11, f27, f30, f11 + FMADD f15, f27, f31, f15 + + bdnz LL(32) + .align 4 + +LL(35): +#if defined(LT) || defined(RN) + andi. r0, KK, 7 +#else + andi. 
r0, TEMP, 7 +#endif + mtspr CTR, r0 + ble LL(38) + .align 4 + +LL(36): + FMADD f0, f16, f20, f0 + FMADD f4, f16, f21, f4 + FMADD f8, f16, f22, f8 + FMADD f12, f16, f23, f12 + + FMADD f1, f17, f20, f1 + FMADD f5, f17, f21, f5 + FMADD f9, f17, f22, f9 + FMADD f13, f17, f23, f13 + + FMADD f2, f18, f20, f2 + FMADD f6, f18, f21, f6 + FMADD f10, f18, f22, f10 + FMADD f14, f18, f23, f14 + + FMADD f3, f19, f20, f3 + FMADD f7, f19, f21, f7 + FMADD f11, f19, f22, f11 + FMADD f15, f19, f23, f15 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + addi BO, BO, 4 * SIZE + addi AO, AO, 4 * SIZE + + bdnz LL(36) + .align 4 + +LL(38): +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 + + FSUB f8, f8, f13 + FADD f9, f9, f12 + FSUB f10, f10, f15 + FADD f11, f11, f14 + +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 + + FADD f8, f8, f13 + FSUB f9, f12, f9 + FADD f10, f10, f15 + FSUB f11, f14, f11 +#endif + +#if defined(LN) || defined(RT) + subi r0, KK, 2 + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f8, f18, f8 + FSUB f9, f19, f9 + + FSUB f2, f20, f2 + FSUB f3, f21, f3 + FSUB f10, f22, f10 + FSUB f11, f23, f11 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f8, f20, f8 + FSUB f9, f21, f9 + FSUB f10, f22, f10 + FSUB f11, f23, f11 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 + FSUB f2, f18, f2 + FADD f3, f19, f3 + + FSUB f8, f20, f8 + FADD f9, f21, f9 + FSUB f10, f22, f10 + FADD f11, f23, f11 +#endif +#endif + +#ifdef LN + LFD f16, 6 * SIZE(AO) + LFD f17, 7 * SIZE(AO) + LFD f18, 4 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f6, f17, f3 + FMUL f7, f17, f2 + FMUL f14, f17, f11 + FMUL f15, f17, f10 + +#ifndef CONJ + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + FMSUB f10, f16, f10, f14 + FMADD f11, f16, f11, f15 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + FMADD f8, f19, f11, f8 + FNMSUB f9, f19, f10, f9 + + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + FNMSUB f8, f18, f10, f8 + FNMSUB f9, f18, f11, f9 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f9 + FMUL f13, f21, f8 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f8, f20, f8, f12 + FMADD f9, f20, f9, f13 + +#else + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + FMADD f10, f16, f10, f14 + FMSUB f11, f16, f11, f15 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + FMSUB f8, f19, f11, f8 + FNMADD f9, f19, f10, f9 + + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + FNMADD f8, f18, f10, f8 + FNMADD f9, f18, f11, f9 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f9 + FMUL f13, f21, f8 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f8, f20, f8, f12 + FMSUB f9, f20, f9, f13 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + LFD 
f20, 6 * SIZE(AO) + LFD f21, 7 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f12, f17, f9 + FMUL f13, f17, f8 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f8, f16, f8, f12 + FMADD f9, f16, f9, f13 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + FMADD f10, f19, f9, f10 + FNMSUB f11, f19, f8, f11 + + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + FNMSUB f10, f18, f8, f10 + FNMSUB f11, f18, f9, f11 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMUL f12, f21, f11 + FMUL f13, f21, f10 + + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 + FMSUB f10, f20, f10, f12 + FMADD f11, f20, f11, f13 + +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f8, f16, f8, f12 + FMSUB f9, f16, f9, f13 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + FMSUB f10, f19, f9, f10 + FNMADD f11, f19, f8, f11 + + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + FNMADD f10, f18, f8, f10 + FNMADD f11, f18, f9, f11 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMUL f12, f21, f11 + FMUL f13, f21, f10 + + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 + FMADD f10, f20, f10, f12 + FMSUB f11, f20, f11, f13 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + LFD f20, 6 * SIZE(BO) + LFD f21, 7 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + + FMADD f8, f19, f1, f8 + FNMSUB f9, f19, f0, f9 + FMADD f10, f19, f3, f10 + FNMSUB f11, f19, f2, f11 + + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f10, f18, f2, f10 + FNMSUB f11, f18, f3, f11 + + FMUL f4, f21, f9 + FMUL f5, f21, f8 + FMUL f6, f21, f11 + FMUL f7, f21, f10 + + FMSUB f8, f20, f8, f4 + FMADD f9, f20, f9, f5 + FMSUB f10, f20, f10, f6 + FMADD f11, f20, f11, f7 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + + FMSUB f8, f19, f1, f8 + FNMADD f9, f19, f0, f9 + FMSUB f10, f19, f3, f10 + FNMADD f11, f19, f2, f11 + + FNMADD f8, f18, f0, f8 + FNMADD f9, f18, f1, f9 + FNMADD f10, f18, f2, f10 + FNMADD f11, f18, f3, f11 + + FMUL f4, f21, f9 + FMUL f5, f21, f8 + FMUL f6, f21, f11 + FMUL f7, f21, f10 + + FMADD f8, f20, f8, f4 + FMSUB f9, f20, f9, f5 + FMADD f10, f20, f10, f6 + FMSUB f11, f20, f11, f7 +#endif +#endif + +#ifdef RT + LFD f16, 6 * SIZE(BO) + LFD f17, 7 * SIZE(BO) + LFD f18, 4 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f12, f17, f9 + FMUL f13, f17, f8 + FMUL f14, f17, f11 + FMUL f15, f17, f10 + +#ifndef CONJ + FMSUB f8, f16, f8, f12 + FMADD f9, f16, f9, f13 + FMSUB f10, f16, f10, f14 + FMADD f11, f16, f11, f15 + + FMADD f0, f19, f9, f0 + FNMSUB f1, f19, f8, f1 + FMADD f2, f19, f11, f2 + FNMSUB f3, f19, f10, f3 + + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + FNMSUB f2, f18, f10, f2 + FNMSUB f3, f18, f11, f3 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f6 + FMADD f3, f20, f3, f7 + +#else + FMADD f8, f16, f8, f12 + FMSUB f9, f16, f9, f13 + FMADD f10, f16, f10, f14 + FMSUB f11, f16, f11, f15 + + FMSUB f0, f19, f9, f0 + FNMADD f1, f19, f8, f1 + FMSUB f2, f19, f11, f2 + FNMADD f3, f19, f10, f3 + + FNMADD f0, f18, f8, f0 + FNMADD f1, f18, f9, f1 + FNMADD f2, f18, f10, f2 + FNMADD f3, f18, f11, f3 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + + FMADD f0, 
f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f6 + FMSUB f3, f20, f3, f7 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f9, 3 * SIZE(BO) + + STFD f2, 4 * SIZE(BO) + STFD f3, 5 * SIZE(BO) + STFD f10, 6 * SIZE(BO) + STFD f11, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f8, 4 * SIZE(AO) + STFD f9, 5 * SIZE(AO) + STFD f10, 6 * SIZE(AO) + STFD f11, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f8, 0 * SIZE(CO2) + STFD f9, 1 * SIZE(CO2) + STFD f10, 2 * SIZE(CO2) + STFD f11, 3 * SIZE(CO2) + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + addic. I, I, -1 + bgt LL(31) + .align 4 + +LL(40): + andi. I, M, 1 + ble LL(49) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(45) + .align 4 + +LL(42): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f16, f22, f2 + FMADD f3, f16, f23, f3 + + FMADD f4, f17, f20, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f17, f22, f6 + FMADD f7, f17, f23, f7 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMADD f0, f18, f24, f0 + FMADD f1, f18, f25, f1 + FMADD f2, f18, f26, f2 + FMADD f3, f18, f27, f3 + + FMADD f4, f19, f24, f4 + FMADD f5, f19, f25, f5 + FMADD f6, f19, f26, f6 + FMADD f7, f19, f27, f7 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f16, f22, f2 + FMADD f3, f16, f23, f3 + + FMADD f4, f17, f20, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f17, f22, f6 + FMADD f7, f17, f23, f7 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMADD f0, f18, f24, f0 + FMADD f1, f18, f25, f1 + FMADD f2, f18, f26, f2 + FMADD f3, f18, f27, f3 + + FMADD f4, f19, f24, f4 + FMADD f5, f19, f25, f5 + FMADD f6, f19, f26, f6 + FMADD f7, f19, f27, f7 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + addi BO, BO, 16 * SIZE + addi AO, AO, 8 * SIZE + bdnz LL(42) + .align 4 + +LL(45): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble LL(47) + .align 4 + +LL(46): + FMADD f0, f16, f20, f0 + FMADD f1, f16, f21, f1 + FMADD f2, f16, f22, f2 + FMADD f3, f16, f23, f3 + + FMADD f4, f17, f20, f4 + FMADD f5, f17, f21, f5 + FMADD f6, f17, f22, f6 + FMADD f7, f17, f23, f7 + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + addi AO, AO, 2 * SIZE + addi BO, BO, 4 * SIZE + bdnz LL(46) + .align 4 + +LL(47): +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 +#else +#if defined(LN) || defined(LT) + FADD f0, f0, f5 + FSUB f1, f1, f4 + FADD f2, f2, f7 + FSUB f3, f3, f6 +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 +#endif +#endif + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + ZBASE_SHIFT + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f20, f2 + FSUB f3, f21, f3 +#endif + +#ifdef LN + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f3 + FMUL f13, f21, f2 + +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f12 + FMADD f3, f20, f3, f13 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f12 + FMSUB f3, f20, f3, f13 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f12, f17, f3 + FMUL f13, f17, 
f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f12 + FMADD f3, f16, f3, f13 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f12 + FMSUB f3, f16, f3, f13 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + LFD f20, 6 * SIZE(BO) + LFD f21, 7 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 +#endif +#endif + +#ifdef RT + LFD f16, 6 * SIZE(BO) + LFD f17, 7 * SIZE(BO) + LFD f18, 4 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f12, f17, f3 + FMUL f13, f17, f2 + +#ifndef CONJ + FMSUB f2, f16, f2, f12 + FMADD f3, f16, f3, f13 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f2, f16, f2, f12 + FMSUB f3, f16, f3, f13 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +LL(49): +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + .align 4 + +LL(50): + srawi. J, N, 2 + ble LL(999) + .align 4 + +LL(10): +#ifdef RT + slwi r0, K, 2 + ZBASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 2 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + add CO3, CO2, LDC + add CO4, CO3, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + srawi. 
I, M, 1 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO4, LDC +#endif + ble LL(20) + .align 4 + +LL(11): +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f20, 0 * SIZE(B) + LFD f17, 1 * SIZE(AO) + LFD f21, 1 * SIZE(B) + LFD f18, 2 * SIZE(AO) + LFD f22, 2 * SIZE(B) + LFD f19, 3 * SIZE(AO) + LFD f23, 3 * SIZE(B) + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + dcbtst CO1, PREC + dcbtst CO2, PREC + dcbtst CO3, PREC + dcbtst CO4, PREC + + srawi. r0, KK, 3 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + ZBASE_SHIFT + slwi TEMP, KK, 2 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f20, 0 * SIZE(BO) + LFD f17, 1 * SIZE(AO) + LFD f21, 1 * SIZE(BO) + LFD f18, 2 * SIZE(AO) + LFD f22, 2 * SIZE(BO) + LFD f19, 3 * SIZE(AO) + LFD f23, 3 * SIZE(BO) + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + dcbtst CO1, PREC + dcbtst CO2, PREC + dcbtst CO3, PREC + dcbtst CO4, PREC + + srawi. r0, TEMP, 3 + mtspr CTR, r0 +#endif + ble LL(15) + .align 4 + +LL(12): + dcbt AO, PREA + dcbtst BO, PREA + + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + LFD f28, 4 * SIZE(AO) + LFD f29, 5 * SIZE(AO) + LFD f30, 6 * SIZE(AO) + LFD f31, 7 * SIZE(AO) + + FMA1 f4, f16, f22, f4 + FMA1 f6, f18, f22, f6 + FMA2 f5, f16, f23, f5 + FMA2 f7, f18, f23, f7 + + FMA1 f8, f16, f24, f8 + FMA1 f10, f18, f24, f10 + FMA2 f9, f16, f25, f9 + FMA2 f11, f18, f25, f11 + + FMA1 f12, f16, f26, f12 + FMA1 f14, f18, f26, f14 + FMA2 f13, f16, f27, f13 + FMA2 f15, f18, f27, f15 + + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 + + FMA4 f5, f17, f22, f5 + FMA4 f7, f19, f22, f7 + FMA3 f4, f17, f23, f4 + FMA3 f6, f19, f23, f6 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMA4 f9, f17, f24, f9 + FMA4 f11, f19, f24, f11 + FMA3 f8, f17, f25, f8 + FMA3 f10, f19, f25, f10 + + FMA4 f13, f17, f26, f13 + FMA4 f15, f19, f26, f15 + FMA3 f12, f17, f27, f12 + FMA3 f14, f19, f27, f14 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMA1 f0, f28, f20, f0 + FMA1 f2, f30, f20, f2 + FMA2 f1, f28, f21, f1 + FMA2 f3, f30, f21, f3 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMA1 f4, f28, f22, f4 + FMA1 f6, f30, f22, f6 + FMA2 f5, f28, f23, f5 + FMA2 f7, f30, f23, f7 + + FMA1 f8, f28, f24, f8 + FMA1 f10, f30, f24, f10 + FMA2 f9, f28, f25, f9 + FMA2 f11, f30, f25, f11 + + FMA1 f12, f28, f26, f12 + FMA1 f14, f30, f26, f14 + FMA2 f13, f28, f27, f13 + FMA2 f15, f30, f27, f15 + + FMA4 f1, f29, f20, f1 + FMA4 f3, f31, f20, f3 + FMA3 f0, f29, f21, f0 + FMA3 f2, f31, f21, f2 + + FMA4 f5, f29, f22, f5 + FMA4 f7, f31, f22, f7 + FMA3 f4, f29, f23, f4 + FMA3 f6, f31, f23, f6 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMA4 f9, f29, f24, f9 + FMA4 f11, f31, f24, f11 + FMA3 f8, f29, f25, f8 + FMA3 f10, f31, f25, f10 + + FMA4 f13, f29, f26, f13 + FMA4 f15, f31, f26, f15 + FMA3 f12, f29, f27, f12 + FMA3 f14, f31, f27, f14 + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + 
FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + LFD f28, 12 * SIZE(AO) + LFD f29, 13 * SIZE(AO) + LFD f30, 14 * SIZE(AO) + LFD f31, 15 * SIZE(AO) + + FMA1 f4, f16, f22, f4 + FMA1 f6, f18, f22, f6 + FMA2 f5, f16, f23, f5 + FMA2 f7, f18, f23, f7 + + FMA1 f8, f16, f24, f8 + FMA1 f10, f18, f24, f10 + FMA2 f9, f16, f25, f9 + FMA2 f11, f18, f25, f11 + + FMA1 f12, f16, f26, f12 + FMA1 f14, f18, f26, f14 + FMA2 f13, f16, f27, f13 + FMA2 f15, f18, f27, f15 + + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 + + FMA4 f5, f17, f22, f5 + FMA4 f7, f19, f22, f7 + FMA3 f4, f17, f23, f4 + FMA3 f6, f19, f23, f6 + + LFD f20, 24 * SIZE(BO) + LFD f21, 25 * SIZE(BO) + LFD f22, 26 * SIZE(BO) + LFD f23, 27 * SIZE(BO) + + FMA4 f9, f17, f24, f9 + FMA4 f11, f19, f24, f11 + FMA3 f8, f17, f25, f8 + FMA3 f10, f19, f25, f10 + + FMA4 f13, f17, f26, f13 + FMA4 f15, f19, f26, f15 + FMA3 f12, f17, f27, f12 + FMA3 f14, f19, f27, f14 + + LFD f24, 28 * SIZE(BO) + LFD f25, 29 * SIZE(BO) + LFD f26, 30 * SIZE(BO) + LFD f27, 31 * SIZE(BO) + + FMA1 f0, f28, f20, f0 + FMA1 f2, f30, f20, f2 + FMA2 f1, f28, f21, f1 + FMA2 f3, f30, f21, f3 + + LFD f16, 16 * SIZE(AO) + LFD f17, 17 * SIZE(AO) + LFD f18, 18 * SIZE(AO) + LFD f19, 19 * SIZE(AO) + + FMA1 f4, f28, f22, f4 + FMA1 f6, f30, f22, f6 + FMA2 f5, f28, f23, f5 + FMA2 f7, f30, f23, f7 + + FMA1 f8, f28, f24, f8 + FMA1 f10, f30, f24, f10 + FMA2 f9, f28, f25, f9 + FMA2 f11, f30, f25, f11 + + FMA1 f12, f28, f26, f12 + FMA1 f14, f30, f26, f14 + FMA2 f13, f28, f27, f13 + FMA2 f15, f30, f27, f15 + + FMA4 f1, f29, f20, f1 + FMA4 f3, f31, f20, f3 + FMA3 f0, f29, f21, f0 + FMA3 f2, f31, f21, f2 + + FMA4 f5, f29, f22, f5 + FMA4 f7, f31, f22, f7 + FMA3 f4, f29, f23, f4 + FMA3 f6, f31, f23, f6 + + LFD f20, 32 * SIZE(BO) + LFD f21, 33 * SIZE(BO) + LFD f22, 34 * SIZE(BO) + LFD f23, 35 * SIZE(BO) + + FMA4 f9, f29, f24, f9 + FMA4 f11, f31, f24, f11 + FMA3 f8, f29, f25, f8 + FMA3 f10, f31, f25, f10 + + FMA4 f13, f29, f26, f13 + FMA4 f15, f31, f26, f15 + FMA3 f12, f29, f27, f12 + FMA3 f14, f31, f27, f14 + + LFD f24, 36 * SIZE(BO) + LFD f25, 37 * SIZE(BO) + LFD f26, 38 * SIZE(BO) + LFD f27, 39 * SIZE(BO) + + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + LFD f28, 20 * SIZE(AO) + LFD f29, 21 * SIZE(AO) + LFD f30, 22 * SIZE(AO) + LFD f31, 23 * SIZE(AO) + + FMA1 f4, f16, f22, f4 + FMA1 f6, f18, f22, f6 + FMA2 f5, f16, f23, f5 + FMA2 f7, f18, f23, f7 + + FMA1 f8, f16, f24, f8 + FMA1 f10, f18, f24, f10 + FMA2 f9, f16, f25, f9 + FMA2 f11, f18, f25, f11 + + FMA1 f12, f16, f26, f12 + FMA1 f14, f18, f26, f14 + FMA2 f13, f16, f27, f13 + FMA2 f15, f18, f27, f15 + + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 + + FMA4 f5, f17, f22, f5 + FMA4 f7, f19, f22, f7 + FMA3 f4, f17, f23, f4 + FMA3 f6, f19, f23, f6 + + LFD f20, 40 * SIZE(BO) + LFD f21, 41 * SIZE(BO) + LFD f22, 42 * SIZE(BO) + LFD f23, 43 * SIZE(BO) + + FMA4 f9, f17, f24, f9 + FMA4 f11, f19, f24, f11 + FMA3 f8, f17, f25, f8 + FMA3 f10, f19, f25, f10 + + FMA4 f13, f17, f26, f13 + FMA4 f15, f19, f26, f15 + FMA3 f12, f17, f27, f12 + FMA3 f14, f19, f27, f14 + + LFD f24, 44 * SIZE(BO) + LFD f25, 45 * SIZE(BO) + LFD f26, 46 * SIZE(BO) + LFD f27, 47 * SIZE(BO) + + FMA1 f0, f28, f20, f0 + FMA1 f2, f30, f20, f2 + FMA2 f1, f28, f21, f1 + FMA2 f3, f30, f21, f3 + + LFD f16, 24 * SIZE(AO) + LFD f17, 25 * SIZE(AO) + LFD f18, 26 * SIZE(AO) + LFD f19, 27 * SIZE(AO) + + FMA1 f4, f28, f22, f4 + FMA1 f6, f30, f22, f6 + FMA2 f5, 
f28, f23, f5 + FMA2 f7, f30, f23, f7 + + FMA1 f8, f28, f24, f8 + FMA1 f10, f30, f24, f10 + FMA2 f9, f28, f25, f9 + FMA2 f11, f30, f25, f11 + + FMA1 f12, f28, f26, f12 + FMA1 f14, f30, f26, f14 + FMA2 f13, f28, f27, f13 + FMA2 f15, f30, f27, f15 + + FMA4 f1, f29, f20, f1 + FMA4 f3, f31, f20, f3 + FMA3 f0, f29, f21, f0 + FMA3 f2, f31, f21, f2 + + FMA4 f5, f29, f22, f5 + FMA4 f7, f31, f22, f7 + FMA3 f4, f29, f23, f4 + FMA3 f6, f31, f23, f6 + + LFD f20, 48 * SIZE(BO) + LFD f21, 49 * SIZE(BO) + LFD f22, 50 * SIZE(BO) + LFD f23, 51 * SIZE(BO) + + FMA4 f9, f29, f24, f9 + FMA4 f11, f31, f24, f11 + FMA3 f8, f29, f25, f8 + FMA3 f10, f31, f25, f10 + + FMA4 f13, f29, f26, f13 + FMA4 f15, f31, f26, f15 + FMA3 f12, f29, f27, f12 + FMA3 f14, f31, f27, f14 + + LFD f24, 52 * SIZE(BO) + LFD f25, 53 * SIZE(BO) + LFD f26, 54 * SIZE(BO) + LFD f27, 55 * SIZE(BO) + + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + LFD f28, 28 * SIZE(AO) + LFD f29, 29 * SIZE(AO) + LFD f30, 30 * SIZE(AO) + LFD f31, 31 * SIZE(AO) + + FMA1 f4, f16, f22, f4 + FMA1 f6, f18, f22, f6 + FMA2 f5, f16, f23, f5 + FMA2 f7, f18, f23, f7 + + FMA1 f8, f16, f24, f8 + FMA1 f10, f18, f24, f10 + FMA2 f9, f16, f25, f9 + FMA2 f11, f18, f25, f11 + + FMA1 f12, f16, f26, f12 + FMA1 f14, f18, f26, f14 + FMA2 f13, f16, f27, f13 + FMA2 f15, f18, f27, f15 + + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 + + FMA4 f5, f17, f22, f5 + FMA4 f7, f19, f22, f7 + FMA3 f4, f17, f23, f4 + FMA3 f6, f19, f23, f6 + + LFD f20, 56 * SIZE(BO) + LFD f21, 57 * SIZE(BO) + LFD f22, 58 * SIZE(BO) + LFD f23, 59 * SIZE(BO) + + FMA4 f9, f17, f24, f9 + FMA4 f11, f19, f24, f11 + FMA3 f8, f17, f25, f8 + FMA3 f10, f19, f25, f10 + + FMA4 f13, f17, f26, f13 + FMA4 f15, f19, f26, f15 + FMA3 f12, f17, f27, f12 + FMA3 f14, f19, f27, f14 + + LFD f24, 60 * SIZE(BO) + LFD f25, 61 * SIZE(BO) + LFD f26, 62 * SIZE(BO) + LFD f27, 63 * SIZE(BO) + + FMA1 f0, f28, f20, f0 + FMA1 f2, f30, f20, f2 + FMA2 f1, f28, f21, f1 + FMA2 f3, f30, f21, f3 + + LFD f16, 32 * SIZE(AO) + LFD f17, 33 * SIZE(AO) + LFD f18, 34 * SIZE(AO) + LFD f19, 35 * SIZE(AO) + + FMA1 f4, f28, f22, f4 + FMA1 f6, f30, f22, f6 + FMA2 f5, f28, f23, f5 + FMA2 f7, f30, f23, f7 + + FMA1 f8, f28, f24, f8 + FMA1 f10, f30, f24, f10 + FMA2 f9, f28, f25, f9 + FMA2 f11, f30, f25, f11 + + FMA1 f12, f28, f26, f12 + FMA1 f14, f30, f26, f14 + FMA2 f13, f28, f27, f13 + FMA2 f15, f30, f27, f15 + + FMA4 f1, f29, f20, f1 + FMA4 f3, f31, f20, f3 + FMA3 f0, f29, f21, f0 + FMA3 f2, f31, f21, f2 + + FMA4 f5, f29, f22, f5 + FMA4 f7, f31, f22, f7 + FMA3 f4, f29, f23, f4 + FMA3 f6, f31, f23, f6 + + LFD f20, 64 * SIZE(BO) + LFD f21, 65 * SIZE(BO) + LFD f22, 66 * SIZE(BO) + LFD f23, 67 * SIZE(BO) + + FMA4 f9, f29, f24, f9 + FMA4 f11, f31, f24, f11 + FMA3 f8, f29, f25, f8 + FMA3 f10, f31, f25, f10 + + FMA4 f13, f29, f26, f13 + FMA4 f15, f31, f26, f15 + FMA3 f12, f29, f27, f12 + FMA3 f14, f31, f27, f14 + + LFD f24, 68 * SIZE(BO) + LFD f25, 69 * SIZE(BO) + LFD f26, 70 * SIZE(BO) + LFD f27, 71 * SIZE(BO) + + addi AO, AO, 32 * SIZE + addi BO, BO, 64 * SIZE + + bdnz LL(12) + .align 4 + +LL(15): +#if defined(LT) || defined(RN) + andi. r0, KK, 7 +#else + andi. 
r0, TEMP, 7 +#endif + mtspr CTR, r0 + ble LL(18) + .align 4 + +LL(16): + FMA1 f0, f16, f20, f0 + FMA1 f2, f18, f20, f2 + FMA2 f1, f16, f21, f1 + FMA2 f3, f18, f21, f3 + + FMA1 f4, f16, f22, f4 + FMA1 f6, f18, f22, f6 + FMA2 f5, f16, f23, f5 + FMA2 f7, f18, f23, f7 + + FMA1 f8, f16, f24, f8 + FMA1 f10, f18, f24, f10 + FMA2 f9, f16, f25, f9 + FMA2 f11, f18, f25, f11 + + FMA1 f12, f16, f26, f12 + FMA1 f14, f18, f26, f14 + FMA2 f13, f16, f27, f13 + FMA2 f15, f18, f27, f15 + + FMA4 f1, f17, f20, f1 + FMA4 f3, f19, f20, f3 + FMA3 f0, f17, f21, f0 + FMA3 f2, f19, f21, f2 + + FMA4 f5, f17, f22, f5 + FMA4 f7, f19, f22, f7 + FMA3 f4, f17, f23, f4 + FMA3 f6, f19, f23, f6 + + FMA4 f9, f17, f24, f9 + FMA4 f11, f19, f24, f11 + FMA3 f8, f17, f25, f8 + FMA3 f10, f19, f25, f10 + + FMA4 f13, f17, f26, f13 + FMA4 f15, f19, f26, f15 + FMA3 f12, f17, f27, f12 + FMA3 f14, f19, f27, f14 + + LFD f16, 4 * SIZE(AO) + LFD f17, 5 * SIZE(AO) + LFD f18, 6 * SIZE(AO) + LFD f19, 7 * SIZE(AO) + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 4 * SIZE + addi BO, BO, 8 * SIZE + + bdnz LL(16) + .align 4 + +LL(18): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 1 + ZBASE_SHIFT + slwi r0, r0, 2 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f4, f18, f4 + FSUB f5, f19, f5 + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f8, f20, f8 + FSUB f9, f21, f9 + FSUB f12, f22, f12 + FSUB f13, f23, f13 + + LFD f24, 8 * SIZE(BO) + LFD f25, 9 * SIZE(BO) + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + + FSUB f2, f24, f2 + FSUB f3, f25, f3 + FSUB f6, f26, f6 + FSUB f7, f27, f7 + + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FSUB f10, f28, f10 + FSUB f11, f29, f11 + FSUB f14, f30, f14 + FSUB f15, f31, f15 + +#else + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + + FSUB f4, f20, f4 + FSUB f5, f21, f5 + FSUB f6, f22, f6 + FSUB f7, f23, f7 + + LFD f24, 8 * SIZE(AO) + LFD f25, 9 * SIZE(AO) + LFD f26, 10 * SIZE(AO) + LFD f27, 11 * SIZE(AO) + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f10, f26, f10 + FSUB f11, f27, f11 + + LFD f28, 12 * SIZE(AO) + LFD f29, 13 * SIZE(AO) + LFD f30, 14 * SIZE(AO) + LFD f31, 15 * SIZE(AO) + + FSUB f12, f28, f12 + FSUB f13, f29, f13 + FSUB f14, f30, f14 + FSUB f15, f31, f15 +#endif + +#ifdef LN + LFD f24, 6 * SIZE(AO) + LFD f25, 7 * SIZE(AO) + LFD f26, 4 * SIZE(AO) + LFD f27, 5 * SIZE(AO) + LFD f28, 0 * SIZE(AO) + LFD f29, 1 * SIZE(AO) + + FMUL f16, f25, f3 + FMUL f17, f25, f2 + FMUL f18, f25, f7 + FMUL f19, f25, f6 + + FMUL f20, f25, f11 + FMUL f21, f25, f10 + FMUL f22, f25, f15 + FMUL f23, f25, f14 + +#ifndef CONJ + + FMSUB f2, f24, f2, f16 + FMADD f3, f24, f3, f17 + FMSUB f6, f24, f6, f18 + FMADD f7, f24, f7, f19 + + FMSUB f10, f24, f10, f20 + FMADD f11, f24, f11, f21 + FMSUB f14, f24, f14, f22 + FMADD f15, f24, f15, f23 + + FMADD f0, f27, f3, f0 + FNMSUB f1, f27, 
f2, f1 + FMADD f4, f27, f7, f4 + FNMSUB f5, f27, f6, f5 + + FMADD f8, f27, f11, f8 + FNMSUB f9, f27, f10, f9 + FMADD f12, f27, f15, f12 + FNMSUB f13, f27, f14, f13 + + FNMSUB f0, f26, f2, f0 + FNMSUB f1, f26, f3, f1 + FNMSUB f4, f26, f6, f4 + FNMSUB f5, f26, f7, f5 + + FNMSUB f8, f26, f10, f8 + FNMSUB f9, f26, f11, f9 + FNMSUB f12, f26, f14, f12 + FNMSUB f13, f26, f15, f13 + + FMUL f16, f29, f1 + FMUL f17, f29, f0 + FMUL f18, f29, f5 + FMUL f19, f29, f4 + + FMUL f20, f29, f9 + FMUL f21, f29, f8 + FMUL f22, f29, f13 + FMUL f23, f29, f12 + + FMSUB f0, f28, f0, f16 + FMADD f1, f28, f1, f17 + FMSUB f4, f28, f4, f18 + FMADD f5, f28, f5, f19 + + FMSUB f8, f28, f8, f20 + FMADD f9, f28, f9, f21 + FMSUB f12, f28, f12, f22 + FMADD f13, f28, f13, f23 +#else + + FMADD f2, f24, f2, f16 + FMSUB f3, f24, f3, f17 + FMADD f6, f24, f6, f18 + FMSUB f7, f24, f7, f19 + + FMADD f10, f24, f10, f20 + FMSUB f11, f24, f11, f21 + FMADD f14, f24, f14, f22 + FMSUB f15, f24, f15, f23 + + FMSUB f0, f27, f3, f0 + FNMADD f1, f27, f2, f1 + FMSUB f4, f27, f7, f4 + FNMADD f5, f27, f6, f5 + + FMSUB f8, f27, f11, f8 + FNMADD f9, f27, f10, f9 + FMSUB f12, f27, f15, f12 + FNMADD f13, f27, f14, f13 + + FNMADD f0, f26, f2, f0 + FNMADD f1, f26, f3, f1 + FNMADD f4, f26, f6, f4 + FNMADD f5, f26, f7, f5 + + FNMADD f8, f26, f10, f8 + FNMADD f9, f26, f11, f9 + FNMADD f12, f26, f14, f12 + FNMADD f13, f26, f15, f13 + + FMUL f16, f29, f1 + FMUL f17, f29, f0 + FMUL f18, f29, f5 + FMUL f19, f29, f4 + + FMUL f20, f29, f9 + FMUL f21, f29, f8 + FMUL f22, f29, f13 + FMUL f23, f29, f12 + + FMADD f0, f28, f0, f16 + FMSUB f1, f28, f1, f17 + FMADD f4, f28, f4, f18 + FMSUB f5, f28, f5, f19 + + FMADD f8, f28, f8, f20 + FMSUB f9, f28, f9, f21 + FMADD f12, f28, f12, f22 + FMSUB f13, f28, f13, f23 +#endif +#endif + +#ifdef LT + LFD f24, 0 * SIZE(AO) + LFD f25, 1 * SIZE(AO) + LFD f26, 2 * SIZE(AO) + LFD f27, 3 * SIZE(AO) + LFD f28, 6 * SIZE(AO) + LFD f29, 7 * SIZE(AO) + + FMUL f16, f25, f1 + FMUL f17, f25, f0 + FMUL f18, f25, f5 + FMUL f19, f25, f4 + + FMUL f20, f25, f9 + FMUL f21, f25, f8 + FMUL f22, f25, f13 + FMUL f23, f25, f12 + +#ifndef CONJ + FMSUB f0, f24, f0, f16 + FMADD f1, f24, f1, f17 + FMSUB f4, f24, f4, f18 + FMADD f5, f24, f5, f19 + + FMSUB f8, f24, f8, f20 + FMADD f9, f24, f9, f21 + FMSUB f12, f24, f12, f22 + FMADD f13, f24, f13, f23 + + FMADD f2, f27, f1, f2 + FNMSUB f3, f27, f0, f3 + FMADD f6, f27, f5, f6 + FNMSUB f7, f27, f4, f7 + + FMADD f10, f27, f9, f10 + FNMSUB f11, f27, f8, f11 + FMADD f14, f27, f13, f14 + FNMSUB f15, f27, f12, f15 + + FNMSUB f2, f26, f0, f2 + FNMSUB f3, f26, f1, f3 + FNMSUB f6, f26, f4, f6 + FNMSUB f7, f26, f5, f7 + + FNMSUB f10, f26, f8, f10 + FNMSUB f11, f26, f9, f11 + FNMSUB f14, f26, f12, f14 + FNMSUB f15, f26, f13, f15 + + FMUL f16, f29, f3 + FMUL f17, f29, f2 + FMUL f18, f29, f7 + FMUL f19, f29, f6 + + FMUL f20, f29, f11 + FMUL f21, f29, f10 + FMUL f22, f29, f15 + FMUL f23, f29, f14 + + FMSUB f2, f28, f2, f16 + FMADD f3, f28, f3, f17 + FMSUB f6, f28, f6, f18 + FMADD f7, f28, f7, f19 + + FMSUB f10, f28, f10, f20 + FMADD f11, f28, f11, f21 + FMSUB f14, f28, f14, f22 + FMADD f15, f28, f15, f23 +#else + + FMADD f0, f24, f0, f16 + FMSUB f1, f24, f1, f17 + FMADD f4, f24, f4, f18 + FMSUB f5, f24, f5, f19 + + FMADD f8, f24, f8, f20 + FMSUB f9, f24, f9, f21 + FMADD f12, f24, f12, f22 + FMSUB f13, f24, f13, f23 + + FMSUB f2, f27, f1, f2 + FNMADD f3, f27, f0, f3 + FMSUB f6, f27, f5, f6 + FNMADD f7, f27, f4, f7 + + FMSUB f10, f27, f9, f10 + FNMADD f11, f27, f8, f11 + FMSUB f14, f27, f13, f14 + FNMADD f15, f27, f12, f15 + 
+ FNMADD f2, f26, f0, f2 + FNMADD f3, f26, f1, f3 + FNMADD f6, f26, f4, f6 + FNMADD f7, f26, f5, f7 + + FNMADD f10, f26, f8, f10 + FNMADD f11, f26, f9, f11 + FNMADD f14, f26, f12, f14 + FNMADD f15, f26, f13, f15 + + FMUL f16, f29, f3 + FMUL f17, f29, f2 + FMUL f18, f29, f7 + FMUL f19, f29, f6 + + FMUL f20, f29, f11 + FMUL f21, f29, f10 + FMUL f22, f29, f15 + FMUL f23, f29, f14 + + FMADD f2, f28, f2, f16 + FMSUB f3, f28, f3, f17 + FMADD f6, f28, f6, f18 + FMSUB f7, f28, f7, f19 + + FMADD f10, f28, f10, f20 + FMSUB f11, f28, f11, f21 + FMADD f14, f28, f14, f22 + FMSUB f15, f28, f15, f23 +#endif +#endif + +#ifdef RN + LFD f24, 0 * SIZE(BO) + LFD f25, 1 * SIZE(BO) + LFD f26, 2 * SIZE(BO) + LFD f27, 3 * SIZE(BO) + LFD f28, 4 * SIZE(BO) + LFD f29, 5 * SIZE(BO) + LFD f30, 6 * SIZE(BO) + LFD f31, 7 * SIZE(BO) + + FMUL f16, f25, f1 + FMUL f17, f25, f0 + FMUL f18, f25, f3 + FMUL f19, f25, f2 + +#ifndef CONJ + + FMSUB f0, f24, f0, f16 + FMADD f1, f24, f1, f17 + FMSUB f2, f24, f2, f18 + FMADD f3, f24, f3, f19 + + FMADD f4, f27, f1, f4 + FNMSUB f5, f27, f0, f5 + FMADD f6, f27, f3, f6 + FNMSUB f7, f27, f2, f7 + + FNMSUB f4, f26, f0, f4 + FNMSUB f5, f26, f1, f5 + FNMSUB f6, f26, f2, f6 + FNMSUB f7, f26, f3, f7 + + FMADD f8, f29, f1, f8 + FNMSUB f9, f29, f0, f9 + FMADD f10, f29, f3, f10 + FNMSUB f11, f29, f2, f11 + + FNMSUB f8, f28, f0, f8 + FNMSUB f9, f28, f1, f9 + FNMSUB f10, f28, f2, f10 + FNMSUB f11, f28, f3, f11 + + FMADD f12, f31, f1, f12 + FNMSUB f13, f31, f0, f13 + FMADD f14, f31, f3, f14 + FNMSUB f15, f31, f2, f15 + + FNMSUB f12, f30, f0, f12 + FNMSUB f13, f30, f1, f13 + FNMSUB f14, f30, f2, f14 + FNMSUB f15, f30, f3, f15 + + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FMUL f16, f27, f5 + FMUL f17, f27, f4 + FMUL f18, f27, f7 + FMUL f19, f27, f6 + + FMSUB f4, f26, f4, f16 + FMADD f5, f26, f5, f17 + FMSUB f6, f26, f6, f18 + FMADD f7, f26, f7, f19 + + FMADD f8, f29, f5, f8 + FNMSUB f9, f29, f4, f9 + FMADD f10, f29, f7, f10 + FNMSUB f11, f29, f6, f11 + + FNMSUB f8, f28, f4, f8 + FNMSUB f9, f28, f5, f9 + FNMSUB f10, f28, f6, f10 + FNMSUB f11, f28, f7, f11 + + FMADD f12, f31, f5, f12 + FNMSUB f13, f31, f4, f13 + FMADD f14, f31, f7, f14 + FNMSUB f15, f31, f6, f15 + + FNMSUB f12, f30, f4, f12 + FNMSUB f13, f30, f5, f13 + FNMSUB f14, f30, f6, f14 + FNMSUB f15, f30, f7, f15 + + LFD f26, 20 * SIZE(BO) + LFD f27, 21 * SIZE(BO) + LFD f28, 22 * SIZE(BO) + LFD f29, 23 * SIZE(BO) + LFD f30, 30 * SIZE(BO) + LFD f31, 31 * SIZE(BO) + + FMUL f16, f27, f9 + FMUL f17, f27, f8 + FMUL f18, f27, f11 + FMUL f19, f27, f10 + + FMSUB f8, f26, f8, f16 + FMADD f9, f26, f9, f17 + FMSUB f10, f26, f10, f18 + FMADD f11, f26, f11, f19 + + FMADD f12, f29, f9, f12 + FNMSUB f13, f29, f8, f13 + FMADD f14, f29, f11, f14 + FNMSUB f15, f29, f10, f15 + + FNMSUB f12, f28, f8, f12 + FNMSUB f13, f28, f9, f13 + FNMSUB f14, f28, f10, f14 + FNMSUB f15, f28, f11, f15 + + FMUL f16, f31, f13 + FMUL f17, f31, f12 + FMUL f18, f31, f15 + FMUL f19, f31, f14 + + FMSUB f12, f30, f12, f16 + FMADD f13, f30, f13, f17 + FMSUB f14, f30, f14, f18 + FMADD f15, f30, f15, f19 + +#else + + FMADD f0, f24, f0, f16 + FMSUB f1, f24, f1, f17 + FMADD f2, f24, f2, f18 + FMSUB f3, f24, f3, f19 + + FMSUB f4, f27, f1, f4 + FNMADD f5, f27, f0, f5 + FMSUB f6, f27, f3, f6 + FNMADD f7, f27, f2, f7 + + FNMADD f4, f26, f0, f4 + FNMADD f5, f26, f1, f5 + FNMADD f6, f26, f2, f6 + FNMADD f7, f26, f3, f7 + + FMSUB f8, f29, f1, f8 + FNMADD f9, f29, f0, f9 + FMSUB f10, f29, f3, 
f10 + FNMADD f11, f29, f2, f11 + + FNMADD f8, f28, f0, f8 + FNMADD f9, f28, f1, f9 + FNMADD f10, f28, f2, f10 + FNMADD f11, f28, f3, f11 + + FMSUB f12, f31, f1, f12 + FNMADD f13, f31, f0, f13 + FMSUB f14, f31, f3, f14 + FNMADD f15, f31, f2, f15 + + FNMADD f12, f30, f0, f12 + FNMADD f13, f30, f1, f13 + FNMADD f14, f30, f2, f14 + FNMADD f15, f30, f3, f15 + + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FMUL f16, f27, f5 + FMUL f17, f27, f4 + FMUL f18, f27, f7 + FMUL f19, f27, f6 + + FMADD f4, f26, f4, f16 + FMSUB f5, f26, f5, f17 + FMADD f6, f26, f6, f18 + FMSUB f7, f26, f7, f19 + + FMSUB f8, f29, f5, f8 + FNMADD f9, f29, f4, f9 + FMSUB f10, f29, f7, f10 + FNMADD f11, f29, f6, f11 + + FNMADD f8, f28, f4, f8 + FNMADD f9, f28, f5, f9 + FNMADD f10, f28, f6, f10 + FNMADD f11, f28, f7, f11 + + FMSUB f12, f31, f5, f12 + FNMADD f13, f31, f4, f13 + FMSUB f14, f31, f7, f14 + FNMADD f15, f31, f6, f15 + + FNMADD f12, f30, f4, f12 + FNMADD f13, f30, f5, f13 + FNMADD f14, f30, f6, f14 + FNMADD f15, f30, f7, f15 + + LFD f26, 20 * SIZE(BO) + LFD f27, 21 * SIZE(BO) + LFD f28, 22 * SIZE(BO) + LFD f29, 23 * SIZE(BO) + LFD f30, 30 * SIZE(BO) + LFD f31, 31 * SIZE(BO) + + FMUL f16, f27, f9 + FMUL f17, f27, f8 + FMUL f18, f27, f11 + FMUL f19, f27, f10 + + FMADD f8, f26, f8, f16 + FMSUB f9, f26, f9, f17 + FMADD f10, f26, f10, f18 + FMSUB f11, f26, f11, f19 + + FMSUB f12, f29, f9, f12 + FNMADD f13, f29, f8, f13 + FMSUB f14, f29, f11, f14 + FNMADD f15, f29, f10, f15 + + FNMADD f12, f28, f8, f12 + FNMADD f13, f28, f9, f13 + FNMADD f14, f28, f10, f14 + FNMADD f15, f28, f11, f15 + + FMUL f16, f31, f13 + FMUL f17, f31, f12 + FMUL f18, f31, f15 + FMUL f19, f31, f14 + + FMADD f12, f30, f12, f16 + FMSUB f13, f30, f13, f17 + FMADD f14, f30, f14, f18 + FMSUB f15, f30, f15, f19 +#endif + +#endif + +#ifdef RT + LFD f24, 30 * SIZE(BO) + LFD f25, 31 * SIZE(BO) + LFD f26, 28 * SIZE(BO) + LFD f27, 29 * SIZE(BO) + LFD f28, 26 * SIZE(BO) + LFD f29, 27 * SIZE(BO) + LFD f30, 24 * SIZE(BO) + LFD f31, 25 * SIZE(BO) + + FMUL f16, f25, f13 + FMUL f17, f25, f12 + FMUL f18, f25, f15 + FMUL f19, f25, f14 + +#ifndef CONJ + + FMSUB f12, f24, f12, f16 + FMADD f13, f24, f13, f17 + FMSUB f14, f24, f14, f18 + FMADD f15, f24, f15, f19 + + FMADD f8, f27, f13, f8 + FNMSUB f9, f27, f12, f9 + FMADD f10, f27, f15, f10 + FNMSUB f11, f27, f14, f11 + + FNMSUB f8, f26, f12, f8 + FNMSUB f9, f26, f13, f9 + FNMSUB f10, f26, f14, f10 + FNMSUB f11, f26, f15, f11 + + FMADD f4, f29, f13, f4 + FNMSUB f5, f29, f12, f5 + FMADD f6, f29, f15, f6 + FNMSUB f7, f29, f14, f7 + + FNMSUB f4, f28, f12, f4 + FNMSUB f5, f28, f13, f5 + FNMSUB f6, f28, f14, f6 + FNMSUB f7, f28, f15, f7 + + FMADD f0, f31, f13, f0 + FNMSUB f1, f31, f12, f1 + FMADD f2, f31, f15, f2 + FNMSUB f3, f31, f14, f3 + + FNMSUB f0, f30, f12, f0 + FNMSUB f1, f30, f13, f1 + FNMSUB f2, f30, f14, f2 + FNMSUB f3, f30, f15, f3 + + LFD f26, 20 * SIZE(BO) + LFD f27, 21 * SIZE(BO) + LFD f28, 18 * SIZE(BO) + LFD f29, 19 * SIZE(BO) + LFD f30, 16 * SIZE(BO) + LFD f31, 17 * SIZE(BO) + + FMUL f16, f27, f9 + FMUL f17, f27, f8 + FMUL f18, f27, f11 + FMUL f19, f27, f10 + + FMSUB f8, f26, f8, f16 + FMADD f9, f26, f9, f17 + FMSUB f10, f26, f10, f18 + FMADD f11, f26, f11, f19 + + FMADD f4, f29, f9, f4 + FNMSUB f5, f29, f8, f5 + FMADD f6, f29, f11, f6 + FNMSUB f7, f29, f10, f7 + + FNMSUB f4, f28, f8, f4 + FNMSUB f5, f28, f9, f5 + FNMSUB f6, f28, f10, f6 + FNMSUB f7, f28, f11, f7 + + FMADD f0, f31, f9, f0 + FNMSUB f1, f31, 
f8, f1 + FMADD f2, f31, f11, f2 + FNMSUB f3, f31, f10, f3 + + FNMSUB f0, f30, f8, f0 + FNMSUB f1, f30, f9, f1 + FNMSUB f2, f30, f10, f2 + FNMSUB f3, f30, f11, f3 + + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + LFD f28, 8 * SIZE(BO) + LFD f29, 9 * SIZE(BO) + LFD f30, 0 * SIZE(BO) + LFD f31, 1 * SIZE(BO) + + FMUL f16, f27, f5 + FMUL f17, f27, f4 + FMUL f18, f27, f7 + FMUL f19, f27, f6 + + FMSUB f4, f26, f4, f16 + FMADD f5, f26, f5, f17 + FMSUB f6, f26, f6, f18 + FMADD f7, f26, f7, f19 + + FMADD f0, f29, f5, f0 + FNMSUB f1, f29, f4, f1 + FMADD f2, f29, f7, f2 + FNMSUB f3, f29, f6, f3 + + FNMSUB f0, f28, f4, f0 + FNMSUB f1, f28, f5, f1 + FNMSUB f2, f28, f6, f2 + FNMSUB f3, f28, f7, f3 + + FMUL f16, f31, f1 + FMUL f17, f31, f0 + FMUL f18, f31, f3 + FMUL f19, f31, f2 + + FMSUB f0, f30, f0, f16 + FMADD f1, f30, f1, f17 + FMSUB f2, f30, f2, f18 + FMADD f3, f30, f3, f19 + +#else + + FMADD f12, f24, f12, f16 + FMSUB f13, f24, f13, f17 + FMADD f14, f24, f14, f18 + FMSUB f15, f24, f15, f19 + + FMSUB f8, f27, f13, f8 + FNMADD f9, f27, f12, f9 + FMSUB f10, f27, f15, f10 + FNMADD f11, f27, f14, f11 + + FNMADD f8, f26, f12, f8 + FNMADD f9, f26, f13, f9 + FNMADD f10, f26, f14, f10 + FNMADD f11, f26, f15, f11 + + FMSUB f4, f29, f13, f4 + FNMADD f5, f29, f12, f5 + FMSUB f6, f29, f15, f6 + FNMADD f7, f29, f14, f7 + + FNMADD f4, f28, f12, f4 + FNMADD f5, f28, f13, f5 + FNMADD f6, f28, f14, f6 + FNMADD f7, f28, f15, f7 + + FMSUB f0, f31, f13, f0 + FNMADD f1, f31, f12, f1 + FMSUB f2, f31, f15, f2 + FNMADD f3, f31, f14, f3 + + FNMADD f0, f30, f12, f0 + FNMADD f1, f30, f13, f1 + FNMADD f2, f30, f14, f2 + FNMADD f3, f30, f15, f3 + + LFD f26, 20 * SIZE(BO) + LFD f27, 21 * SIZE(BO) + LFD f28, 18 * SIZE(BO) + LFD f29, 19 * SIZE(BO) + LFD f30, 16 * SIZE(BO) + LFD f31, 17 * SIZE(BO) + + FMUL f16, f27, f9 + FMUL f17, f27, f8 + FMUL f18, f27, f11 + FMUL f19, f27, f10 + + FMADD f8, f26, f8, f16 + FMSUB f9, f26, f9, f17 + FMADD f10, f26, f10, f18 + FMSUB f11, f26, f11, f19 + + FMSUB f4, f29, f9, f4 + FNMADD f5, f29, f8, f5 + FMSUB f6, f29, f11, f6 + FNMADD f7, f29, f10, f7 + + FNMADD f4, f28, f8, f4 + FNMADD f5, f28, f9, f5 + FNMADD f6, f28, f10, f6 + FNMADD f7, f28, f11, f7 + + FMSUB f0, f31, f9, f0 + FNMADD f1, f31, f8, f1 + FMSUB f2, f31, f11, f2 + FNMADD f3, f31, f10, f3 + + FNMADD f0, f30, f8, f0 + FNMADD f1, f30, f9, f1 + FNMADD f2, f30, f10, f2 + FNMADD f3, f30, f11, f3 + + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + LFD f28, 8 * SIZE(BO) + LFD f29, 9 * SIZE(BO) + LFD f30, 0 * SIZE(BO) + LFD f31, 1 * SIZE(BO) + + FMUL f16, f27, f5 + FMUL f17, f27, f4 + FMUL f18, f27, f7 + FMUL f19, f27, f6 + + FMADD f4, f26, f4, f16 + FMSUB f5, f26, f5, f17 + FMADD f6, f26, f6, f18 + FMSUB f7, f26, f7, f19 + + FMSUB f0, f29, f5, f0 + FNMADD f1, f29, f4, f1 + FMSUB f2, f29, f7, f2 + FNMADD f3, f29, f6, f3 + + FNMADD f0, f28, f4, f0 + FNMADD f1, f28, f5, f1 + FNMADD f2, f28, f6, f2 + FNMADD f3, f28, f7, f3 + + FMUL f16, f31, f1 + FMUL f17, f31, f0 + FMUL f18, f31, f3 + FMUL f19, f31, f2 + + FMADD f0, f30, f0, f16 + FMSUB f1, f30, f1, f17 + FMADD f2, f30, f2, f18 + FMSUB f3, f30, f3, f19 + +#endif +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE + subi CO3, CO3, 4 * SIZE + subi CO4, CO4, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f4, 2 * SIZE(BO) + STFD f5, 3 * SIZE(BO) + STFD f8, 4 * SIZE(BO) + STFD f9, 5 * SIZE(BO) + STFD f12, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) + + STFD f2, 8 * SIZE(BO) + STFD f3, 9 * SIZE(BO) + STFD f6, 10 * SIZE(BO) 
+ STFD f7, 11 * SIZE(BO) + STFD f10, 12 * SIZE(BO) + STFD f11, 13 * SIZE(BO) + STFD f14, 14 * SIZE(BO) + STFD f15, 15 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + STFD f4, 4 * SIZE(AO) + STFD f5, 5 * SIZE(AO) + STFD f6, 6 * SIZE(AO) + STFD f7, 7 * SIZE(AO) + + STFD f8, 8 * SIZE(AO) + STFD f9, 9 * SIZE(AO) + STFD f10, 10 * SIZE(AO) + STFD f11, 11 * SIZE(AO) + STFD f12, 12 * SIZE(AO) + STFD f13, 13 * SIZE(AO) + STFD f14, 14 * SIZE(AO) + STFD f15, 15 * SIZE(AO) + +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + STFD f6, 2 * SIZE(CO2) + STFD f7, 3 * SIZE(CO2) + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f10, 2 * SIZE(CO3) + STFD f11, 3 * SIZE(CO3) + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + STFD f14, 2 * SIZE(CO4) + STFD f15, 3 * SIZE(CO4) + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE + addi CO3, CO3, 4 * SIZE + addi CO4, CO4, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 2 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + addic. I, I, -1 + bgt LL(11) + .align 4 + +LL(20): + andi. I, M, 1 + ble LL(29) + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 2 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble LL(25) + .align 4 + +LL(22): + FMA1 f0, f16, f20, f0 + FMA4 f3, f17, f20, f3 + FMA2 f1, f16, f21, f1 + FMA3 f2, f17, f21, f2 + + LFD f28, 4 * SIZE(AO) + LFD f29, 5 * SIZE(AO) + LFD f30, 6 * SIZE(AO) + LFD f31, 7 * SIZE(AO) + + FMA1 f4, f16, f22, f4 + FMA4 f7, f17, f22, f7 + FMA2 f5, f16, f23, f5 + FMA3 f6, f17, f23, f6 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMA1 f8, f16, f24, f8 + FMA4 f11, f17, f24, f11 + FMA2 f9, f16, f25, f9 + FMA3 f10, f17, f25, f10 + + FMA1 f12, f16, f26, f12 + FMA4 f15, f17, f26, f15 + FMA2 f13, f16, f27, f13 + FMA3 f14, f17, f27, f14 + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + FMA1 f0, f18, f20, f0 + FMA4 f3, f19, f20, f3 + FMA2 f1, f18, f21, f1 + FMA3 f2, f19, f21, f2 + + FMA1 f4, f18, f22, f4 + FMA4 f7, f19, f22, f7 + FMA2 f5, f18, f23, f5 + FMA3 f6, f19, f23, f6 + + LFD f20, 16 * SIZE(BO) + LFD f21, 17 * SIZE(BO) + LFD f22, 18 * SIZE(BO) + LFD f23, 19 * SIZE(BO) + + FMA1 f8, f18, f24, f8 + FMA4 f11, f19, f24, f11 + FMA2 f9, f18, f25, f9 + FMA3 f10, f19, f25, f10 + + FMA1 f12, f18, f26, f12 + FMA4 f15, f19, f26, f15 + FMA2 f13, f18, f27, f13 + FMA3 f14, f19, f27, f14 + + LFD f24, 20 * SIZE(BO) + LFD f25, 21 * SIZE(BO) + LFD f26, 22 * SIZE(BO) + LFD f27, 23 * SIZE(BO) + + FMA1 f0, f28, f20, f0 + FMA4 f3, f29, f20, f3 + FMA2 f1, f28, f21, f1 + FMA3 f2, f29, f21, f2 + + LFD f16, 8 * SIZE(AO) + LFD f17, 9 * SIZE(AO) + LFD f18, 10 * SIZE(AO) + LFD f19, 11 * SIZE(AO) + + FMA1 f4, f28, f22, f4 + FMA4 f7, f29, f22, f7 + FMA2 f5, f28, f23, f5 + FMA3 f6, f29, f23, f6 + + LFD f20, 24 * SIZE(BO) + LFD f21, 25 * SIZE(BO) + LFD f22, 26 * SIZE(BO) + LFD f23, 27 * SIZE(BO) + + FMA1 f8, f28, f24, f8 + FMA4 f11, f29, f24, f11 + FMA2 f9, f28, f25, f9 + FMA3 f10, f29, f25, f10 + + FMA1 f12, f28, f26, f12 + FMA4 f15, f29, f26, f15 + FMA2 f13, f28, f27, f13 + FMA3 f14, f29, f27, f14 + + LFD f24, 28 * SIZE(BO) + LFD f25, 29 * SIZE(BO) + LFD f26, 30 * SIZE(BO) + LFD f27, 31 * SIZE(BO) + + FMA1 f0, f30, f20, f0 + FMA4 f3, f31, f20, f3 + FMA2 f1, f30, f21, f1 + FMA3 f2, f31, f21, f2 + + FMA1 f4, f30, f22, f4 + FMA4 f7, f31, f22, f7 + FMA2 f5, f30, f23, f5 + FMA3 f6, f31, f23, f6 + + LFD f20, 32 * SIZE(BO) + LFD f21, 33 * SIZE(BO) + LFD f22, 34 * SIZE(BO) + LFD f23, 35 * SIZE(BO) + + FMA1 f8, f30, f24, f8 + FMA4 f11, f31, f24, f11 + FMA2 f9, f30, f25, f9 + FMA3 f10, f31, f25, f10 + + FMA1 f12, f30, f26, f12 + FMA4 f15, f31, f26, f15 + FMA2 f13, f30, f27, f13 + FMA3 f14, f31, f27, f14 + + LFD f24, 36 * SIZE(BO) + LFD f25, 37 * SIZE(BO) + LFD f26, 38 * SIZE(BO) + LFD f27, 39 * SIZE(BO) + + addi AO, AO, 8 * SIZE + addi BO, BO, 32 * SIZE + bdnz LL(22) + .align 4 + +LL(25): +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble LL(27) + .align 4 + +LL(26): + FMA1 f0, f16, f20, f0 + FMA4 f3, f17, f20, f3 + FMA2 f1, f16, f21, f1 + FMA3 f2, f17, f21, f2 + + FMA1 f4, f16, f22, f4 + FMA4 f7, f17, f22, f7 + FMA2 f5, f16, f23, f5 + FMA3 f6, f17, f23, f6 + + LFD f20, 8 * SIZE(BO) + LFD f21, 9 * SIZE(BO) + LFD f22, 10 * SIZE(BO) + LFD f23, 11 * SIZE(BO) + + FMA1 f8, f16, f24, f8 + FMA4 f11, f17, f24, f11 + FMA2 f9, f16, f25, f9 + FMA3 f10, f17, f25, f10 + + FMA1 f12, f16, f26, f12 + FMA4 f15, f17, f26, f15 + FMA2 f13, f16, f27, f13 + FMA3 f14, f17, f27, f14 + + LFD f16, 2 * SIZE(AO) + LFD f17, 3 * SIZE(AO) + + LFD f24, 12 * SIZE(BO) + LFD f25, 13 * SIZE(BO) + LFD f26, 14 * SIZE(BO) + LFD f27, 15 * SIZE(BO) + + addi AO, AO, 2 * SIZE + addi BO, BO, 8 * SIZE + bdnz LL(26) + .align 4 + +LL(27): +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 4 +#endif + slwi TEMP, r0, 0 + ZBASE_SHIFT + slwi r0, r0, 2 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + + FADD f0, f0, f2 + FADD f1, f1, f3 + FADD f4, f4, f6 + FADD f5, f5, f7 + + FADD f8, f8, f10 + FADD f9, f9, f11 + FADD f12, f12, f14 + FADD f13, f13, f15 + + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f4, f18, f4 + FSUB f5, f19, f5 + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f8, f20, f8 + FSUB f9, f21, f9 + FSUB f12, f22, f12 + FSUB f13, f23, f13 + +#else + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f4, f20, f4 + FSUB f5, f21, f5 + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f28, 6 * SIZE(AO) + LFD f29, 7 * SIZE(AO) + + FSUB f8, f24, f8 + FSUB f9, f25, f9 + FSUB f12, f28, f12 + FSUB f13, f29, f13 +#endif + +#ifdef LN + LFD f28, 0 * SIZE(AO) + LFD f29, 1 * SIZE(AO) + + FMUL f16, f29, f1 + FMUL f17, f29, f0 + FMUL f18, f29, f5 + FMUL f19, f29, f4 + + FMUL f20, f29, f9 + FMUL f21, f29, f8 + FMUL f22, f29, f13 + FMUL f23, f29, f12 + +#ifndef CONJ + FMSUB f0, f28, f0, f16 + FMADD f1, f28, f1, f17 + FMSUB f4, f28, f4, f18 + FMADD f5, f28, f5, f19 + + FMSUB f8, f28, f8, f20 + FMADD f9, f28, f9, f21 + FMSUB f12, f28, f12, f22 + FMADD f13, f28, f13, f23 +#else + + FMADD f0, f28, f0, f16 + FMSUB f1, f28, f1, f17 + FMADD f4, f28, f4, f18 + FMSUB f5, f28, f5, f19 + + FMADD f8, f28, f8, f20 + FMSUB f9, f28, f9, f21 + FMADD f12, f28, f12, f22 + FMSUB f13, f28, f13, f23 +#endif +#endif + +#ifdef LT + LFD f24, 0 * SIZE(AO) + LFD f25, 1 * SIZE(AO) + + FMUL f16, f25, f1 + FMUL f17, f25, f0 + FMUL f18, f25, f5 + FMUL f19, f25, f4 + + FMUL f20, f25, f9 + FMUL f21, f25, f8 + FMUL f22, f25, f13 + FMUL f23, f25, f12 + +#ifndef CONJ + + FMSUB f0, f24, f0, f16 + FMADD f1, f24, f1, f17 + FMSUB f4, f24, f4, f18 + FMADD f5, f24, f5, f19 + + FMSUB f8, f24, f8, f20 + FMADD f9, f24, f9, f21 + FMSUB f12, f24, f12, f22 + FMADD f13, f24, f13, f23 + +#else + + + FMADD f0, f24, f0, f16 + FMSUB f1, f24, f1, f17 + FMADD f4, f24, f4, f18 + FMSUB f5, f24, f5, f19 + + FMADD f8, f24, f8, f20 + FMSUB f9, f24, f9, f21 + FMADD f12, f24, f12, f22 + FMSUB f13, f24, f13, f23 + +#endif +#endif + +#ifdef RN + LFD f24, 0 * SIZE(BO) + LFD f25, 1 * SIZE(BO) + LFD f26, 2 * SIZE(BO) + LFD f27, 3 * SIZE(BO) + LFD f28, 4 * SIZE(BO) + LFD f29, 5 * SIZE(BO) + LFD f30, 6 * SIZE(BO) + LFD f31, 7 * SIZE(BO) + + FMUL f16, f25, f1 + FMUL f17, f25, f0 + 
+#ifndef CONJ + + FMSUB f0, f24, f0, f16 + FMADD f1, f24, f1, f17 + + FMADD f4, f27, f1, f4 + FNMSUB f5, f27, f0, f5 + FNMSUB f4, f26, f0, f4 + FNMSUB f5, f26, f1, f5 + + FMADD f8, f29, f1, f8 + FNMSUB f9, f29, f0, f9 + FNMSUB f8, f28, f0, f8 + FNMSUB f9, f28, f1, f9 + + FMADD f12, f31, f1, f12 + FNMSUB f13, f31, f0, f13 + FNMSUB f12, f30, f0, f12 + FNMSUB f13, f30, f1, f13 + + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FMUL f16, f27, f5 + FMUL f17, f27, f4 + FMSUB f4, f26, f4, f16 + FMADD f5, f26, f5, f17 + + FMADD f8, f29, f5, f8 + FNMSUB f9, f29, f4, f9 + FNMSUB f8, f28, f4, f8 + FNMSUB f9, f28, f5, f9 + + FMADD f12, f31, f5, f12 + FNMSUB f13, f31, f4, f13 + FNMSUB f12, f30, f4, f12 + FNMSUB f13, f30, f5, f13 + + LFD f26, 20 * SIZE(BO) + LFD f27, 21 * SIZE(BO) + LFD f28, 22 * SIZE(BO) + LFD f29, 23 * SIZE(BO) + LFD f30, 30 * SIZE(BO) + LFD f31, 31 * SIZE(BO) + + FMUL f16, f27, f9 + FMUL f17, f27, f8 + FMSUB f8, f26, f8, f16 + FMADD f9, f26, f9, f17 + + FMADD f12, f29, f9, f12 + FNMSUB f13, f29, f8, f13 + FNMSUB f12, f28, f8, f12 + FNMSUB f13, f28, f9, f13 + + FMUL f16, f31, f13 + FMUL f17, f31, f12 + FMSUB f12, f30, f12, f16 + FMADD f13, f30, f13, f17 + +#else + + FMADD f0, f24, f0, f16 + FMSUB f1, f24, f1, f17 + + FMSUB f4, f27, f1, f4 + FNMADD f5, f27, f0, f5 + FNMADD f4, f26, f0, f4 + FNMADD f5, f26, f1, f5 + + FMSUB f8, f29, f1, f8 + FNMADD f9, f29, f0, f9 + FNMADD f8, f28, f0, f8 + FNMADD f9, f28, f1, f9 + + FMSUB f12, f31, f1, f12 + FNMADD f13, f31, f0, f13 + FNMADD f12, f30, f0, f12 + FNMADD f13, f30, f1, f13 + + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + LFD f28, 12 * SIZE(BO) + LFD f29, 13 * SIZE(BO) + LFD f30, 14 * SIZE(BO) + LFD f31, 15 * SIZE(BO) + + FMUL f16, f27, f5 + FMUL f17, f27, f4 + FMADD f4, f26, f4, f16 + FMSUB f5, f26, f5, f17 + + FMSUB f8, f29, f5, f8 + FNMADD f9, f29, f4, f9 + FNMADD f8, f28, f4, f8 + FNMADD f9, f28, f5, f9 + + FMSUB f12, f31, f5, f12 + FNMADD f13, f31, f4, f13 + FNMADD f12, f30, f4, f12 + FNMADD f13, f30, f5, f13 + + LFD f26, 20 * SIZE(BO) + LFD f27, 21 * SIZE(BO) + LFD f28, 22 * SIZE(BO) + LFD f29, 23 * SIZE(BO) + LFD f30, 30 * SIZE(BO) + LFD f31, 31 * SIZE(BO) + + FMUL f16, f27, f9 + FMUL f17, f27, f8 + FMADD f8, f26, f8, f16 + FMSUB f9, f26, f9, f17 + + FMSUB f12, f29, f9, f12 + FNMADD f13, f29, f8, f13 + FNMADD f12, f28, f8, f12 + FNMADD f13, f28, f9, f13 + + FMUL f16, f31, f13 + FMUL f17, f31, f12 + FMADD f12, f30, f12, f16 + FMSUB f13, f30, f13, f17 +#endif + +#endif + +#ifdef RT + LFD f24, 30 * SIZE(BO) + LFD f25, 31 * SIZE(BO) + LFD f26, 28 * SIZE(BO) + LFD f27, 29 * SIZE(BO) + LFD f28, 26 * SIZE(BO) + LFD f29, 27 * SIZE(BO) + LFD f30, 24 * SIZE(BO) + LFD f31, 25 * SIZE(BO) + + FMUL f16, f25, f13 + FMUL f17, f25, f12 + +#ifndef CONJ + + FMSUB f12, f24, f12, f16 + FMADD f13, f24, f13, f17 + + FMADD f8, f27, f13, f8 + FNMSUB f9, f27, f12, f9 + FNMSUB f8, f26, f12, f8 + FNMSUB f9, f26, f13, f9 + + FMADD f4, f29, f13, f4 + FNMSUB f5, f29, f12, f5 + FNMSUB f4, f28, f12, f4 + FNMSUB f5, f28, f13, f5 + + FMADD f0, f31, f13, f0 + FNMSUB f1, f31, f12, f1 + FNMSUB f0, f30, f12, f0 + FNMSUB f1, f30, f13, f1 + + LFD f26, 20 * SIZE(BO) + LFD f27, 21 * SIZE(BO) + LFD f28, 18 * SIZE(BO) + LFD f29, 19 * SIZE(BO) + LFD f30, 16 * SIZE(BO) + LFD f31, 17 * SIZE(BO) + + FMUL f16, f27, f9 + FMUL f17, f27, f8 + FMSUB f8, f26, f8, f16 + FMADD f9, f26, f9, f17 + + FMADD f4, f29, f9, f4 + FNMSUB f5, f29, f8, f5 + FNMSUB f4, f28, f8, f4 + FNMSUB f5, 
f28, f9, f5 + + FMADD f0, f31, f9, f0 + FNMSUB f1, f31, f8, f1 + FNMSUB f0, f30, f8, f0 + FNMSUB f1, f30, f9, f1 + + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + LFD f28, 8 * SIZE(BO) + LFD f29, 9 * SIZE(BO) + LFD f30, 0 * SIZE(BO) + LFD f31, 1 * SIZE(BO) + + FMUL f16, f27, f5 + FMUL f17, f27, f4 + FMSUB f4, f26, f4, f16 + FMADD f5, f26, f5, f17 + + FMADD f0, f29, f5, f0 + FNMSUB f1, f29, f4, f1 + FNMSUB f0, f28, f4, f0 + FNMSUB f1, f28, f5, f1 + + FMUL f16, f31, f1 + FMUL f17, f31, f0 + FMSUB f0, f30, f0, f16 + FMADD f1, f30, f1, f17 + +#else + FMADD f12, f24, f12, f16 + FMSUB f13, f24, f13, f17 + + FMSUB f8, f27, f13, f8 + FNMADD f9, f27, f12, f9 + FNMADD f8, f26, f12, f8 + FNMADD f9, f26, f13, f9 + + FMSUB f4, f29, f13, f4 + FNMADD f5, f29, f12, f5 + FNMADD f4, f28, f12, f4 + FNMADD f5, f28, f13, f5 + + FMSUB f0, f31, f13, f0 + FNMADD f1, f31, f12, f1 + FNMADD f0, f30, f12, f0 + FNMADD f1, f30, f13, f1 + + LFD f26, 20 * SIZE(BO) + LFD f27, 21 * SIZE(BO) + LFD f28, 18 * SIZE(BO) + LFD f29, 19 * SIZE(BO) + LFD f30, 16 * SIZE(BO) + LFD f31, 17 * SIZE(BO) + + FMUL f16, f27, f9 + FMUL f17, f27, f8 + FMADD f8, f26, f8, f16 + FMSUB f9, f26, f9, f17 + + FMSUB f4, f29, f9, f4 + FNMADD f5, f29, f8, f5 + FNMADD f4, f28, f8, f4 + FNMADD f5, f28, f9, f5 + + FMSUB f0, f31, f9, f0 + FNMADD f1, f31, f8, f1 + FNMADD f0, f30, f8, f0 + FNMADD f1, f30, f9, f1 + + LFD f26, 10 * SIZE(BO) + LFD f27, 11 * SIZE(BO) + LFD f28, 8 * SIZE(BO) + LFD f29, 9 * SIZE(BO) + LFD f30, 0 * SIZE(BO) + LFD f31, 1 * SIZE(BO) + + FMUL f16, f27, f5 + FMUL f17, f27, f4 + FMADD f4, f26, f4, f16 + FMSUB f5, f26, f5, f17 + + FMSUB f0, f29, f5, f0 + FNMADD f1, f29, f4, f1 + FNMADD f0, f28, f4, f0 + FNMADD f1, f28, f5, f1 + + FMUL f16, f31, f1 + FMUL f17, f31, f0 + FMADD f0, f30, f0, f16 + FMSUB f1, f30, f1, f17 + +#endif +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE + subi CO3, CO3, 2 * SIZE + subi CO4, CO4, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f4, 2 * SIZE(BO) + STFD f5, 3 * SIZE(BO) + STFD f8, 4 * SIZE(BO) + STFD f9, 5 * SIZE(BO) + STFD f12, 6 * SIZE(BO) + STFD f13, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f4, 2 * SIZE(AO) + STFD f5, 3 * SIZE(AO) + STFD f8, 4 * SIZE(AO) + STFD f9, 5 * SIZE(AO) + STFD f12, 6 * SIZE(AO) + STFD f13, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f4, 0 * SIZE(CO2) + STFD f5, 1 * SIZE(CO2) + + STFD f8, 0 * SIZE(CO3) + STFD f9, 1 * SIZE(CO3) + STFD f12, 0 * SIZE(CO4) + STFD f13, 1 * SIZE(CO4) + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE + addi CO3, CO3, 2 * SIZE + addi CO4, CO4, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + ZBASE_SHIFT + slwi TEMP, TEMP, 2 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +LL(29): +#ifdef LN + slwi r0, K, 2 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 4 +#endif + +#ifdef RT + subi KK, KK, 4 +#endif + + addic. 
J, J, -1 + bgt LL(10) + .align 4 + +LL(999): + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/ztrsm_kernel_ppc440_LN.S b/kernel/power/ztrsm_kernel_ppc440_LN.S new file mode 100644 index 0000000000..fdcf5beb05 --- /dev/null +++ b/kernel/power/ztrsm_kernel_ppc440_LN.S @@ -0,0 +1,2256 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA_R 296(SP) +#define ALPHA_I 304(SP) +#define FZERO 312(SP) +#else +#define STACKSIZE 256 +#define ALPHA_R 224(SP) +#define ALPHA_I 232(SP) +#define FZERO 240(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#define OFFSET r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#define AORIG r21 +#define TEMP r22 +#define KK r23 +#define I r24 +#define J r25 +#define AO r26 +#define BO r27 +#define CO1 r28 +#define CO2 r29 + +#define A1 f16 +#define A2 f17 +#define A3 f18 +#define A4 f19 +#define A5 f20 +#define A6 f21 +#define B1 f22 +#define B2 f23 +#define B3 f24 +#define B4 f25 +#define B5 f26 +#define B6 f27 +#define B7 f28 +#define B8 f29 +#define B9 f30 +#define B10 f31 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) +#endif + + stw r0, FZERO + +#ifdef linux +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, 56 + STACKSIZE(SP) + lwz C, 60 + STACKSIZE(SP) + lwz LDC, 64 + STACKSIZE(SP) +#else + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 120 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 120 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 68 + STACKSIZE(SP) +#else + lwz OFFSET, 60 + STACKSIZE(SP) +#endif +#endif +#endif + + slwi LDC, LDC, ZBASE_SHIFT + +#ifdef LN + mullw r0, M, K + slwi r0, r0, ZBASE_SHIFT + add A, A, r0 + + slwi r0, M, ZBASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, ZBASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + cmpwi cr0, M, 0 + ble .L999 + cmpwi cr0, N, 0 + ble .L999 + cmpwi cr0, K, 0 + ble .L999 + + srawi. 
J, N, 1 + ble .L30 + .align 4 + +.L10: +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO2, LDC +#endif + +.L20: + andi. I, M, 1 + ble .L09 + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L25 + .align 4 + +.L22: + fmadd f0, f16, f20, f0 + LFD f19, 3 * SIZE(AO) + fmadd f1, f16, f21, f1 + nop + fmadd f2, f16, f22, f2 + nop + fmadd f3, f16, f23, f3 + LFD f16, 4 * SIZE(AO) + + fmadd f4, f17, f20, f4 + LFD f20, 8 * SIZE(BO) + fmadd f5, f17, f21, f5 + LFD f21, 9 * SIZE(BO) + fmadd f6, f17, f22, f6 + LFD f22, 10 * SIZE(BO) + fmadd f7, f17, f23, f7 + LFD f23, 11 * SIZE(BO) + + fmadd f0, f18, f24, f0 + LFD f17, 5 * SIZE(AO) + fmadd f1, f18, f25, f1 + nop + fmadd f2, f18, f26, f2 + nop + fmadd f3, f18, f27, f3 + LFD f18, 6 * SIZE(AO) + + fmadd f4, f19, f24, f4 + LFD f24, 12 * SIZE(BO) + fmadd f5, f19, f25, f5 + LFD f25, 13 * SIZE(BO) + fmadd f6, f19, f26, f6 + LFD f26, 14 * SIZE(BO) + fmadd f7, f19, f27, f7 + LFD f27, 15 * SIZE(BO) + + fmadd f0, f16, f20, f0 + LFD f19, 7 * SIZE(AO) + fmadd f1, f16, f21, f1 + nop + fmadd f2, f16, f22, f2 + nop + fmadd f3, f16, f23, f3 + LFDU f16, 8 * SIZE(AO) + + fmadd f4, f17, f20, f4 + LFDU f20, 16 * SIZE(BO) + fmadd f5, f17, f21, f5 + LFD f21, 1 * SIZE(BO) + fmadd f6, f17, f22, f6 + LFD f22, 2 * SIZE(BO) + fmadd f7, f17, f23, f7 + LFD f23, 3 * SIZE(BO) + + fmadd f0, f18, f24, f0 + LFD f17, 1 * SIZE(AO) + fmadd f1, f18, f25, f1 + nop + fmadd f2, f18, f26, f2 + nop + fmadd f3, f18, f27, f3 + LFD f18, 2 * SIZE(AO) + + fmadd f4, f19, f24, f4 + LFD f24, 4 * SIZE(BO) + fmadd f5, f19, f25, f5 + LFD f25, 5 * SIZE(BO) + fmadd f6, f19, f26, f6 + LFD f26, 6 * SIZE(BO) + fmadd f7, f19, f27, f7 + LFD f27, 7 * SIZE(BO) + bdnz .L22 + .align 4 + +.L25: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble .L27 + .align 4 + +.L26: + fmadd f0, f16, f20, f0 + LFD f17, 1 * SIZE(AO) + fmadd f1, f16, f21, f1 + nop + fmadd f2, f16, f22, f2 + nop + fmadd f3, f16, f23, f3 + LFDU f16, 2 * SIZE(AO) + + fmadd f4, f17, f20, f4 + LFDU f20, 4 * SIZE(BO) + fmadd f5, f17, f21, f5 + LFD f21, 1 * SIZE(BO) + fmadd f6, f17, f22, f6 + LFD f22, 2 * SIZE(BO) + fmadd f7, f17, f23, f7 + LFD f23, 3 * SIZE(BO) + bdnz .L26 + .align 4 + +.L27: +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 +#endif + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + ZBASE_SHIFT + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f20, f2 + FSUB f3, f21, f3 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 + FSUB f2, f20, f2 + FADD f3, f21, f3 +#endif +#endif + +#ifdef LN + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f3 + FMUL f13, f21, f2 + +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f12 + FMADD f3, f20, f3, f13 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f12 + FMSUB f3, f20, f3, f13 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f12, f17, f3 + FMUL f13, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f12 + FMADD f3, f16, f3, f13 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f12 + FMSUB f3, f16, f3, f13 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + LFD f20, 6 * SIZE(BO) + LFD f21, 7 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 +#endif +#endif + +#ifdef RT + LFD f16, 6 * SIZE(BO) + LFD f17, 7 * SIZE(BO) + LFD f18, 4 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f12, f17, f9 + FMUL f13, f17, f8 + +#ifndef CONJ + FMSUB f2, f16, f2, f12 + FMADD f3, f16, f3, f13 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f2, f16, f2, f12 + FMSUB f3, f16, f3, f13 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LN + subi 
CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +.L09: + srawi. I, M, 1 + ble .L29 + .align 4 + +.L11: +#if defined(LT) || defined(RN) + LFD A1, 0 * SIZE(AO) + LFD A2, 1 * SIZE(AO) + LFD A4, 4 * SIZE(AO) + LFD A5, 8 * SIZE(AO) + + LFD B1, 0 * SIZE(B) + LFD B2, 1 * SIZE(B) + LFD B3, 2 * SIZE(B) + LFD B4, 3 * SIZE(B) + LFD B5, 4 * SIZE(B) + LFD B6, 8 * SIZE(B) + LFD B7, 12 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + + sub TEMP, K, KK + + LFD A1, 0 * SIZE(AO) + LFD A2, 1 * SIZE(AO) + LFD A4, 4 * SIZE(AO) + LFD A5, 8 * SIZE(AO) + + LFD B1, 0 * SIZE(BO) + LFD B2, 1 * SIZE(BO) + LFD B3, 2 * SIZE(BO) + LFD B4, 3 * SIZE(BO) + LFD B5, 4 * SIZE(BO) + LFD B6, 8 * SIZE(BO) + LFD B7, 12 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L15 + .align 4 + +.L12: + FMADD f0, A1, B1, f0 + LFD A3, 2 * SIZE(AO) + FMADD f4, A1, B2, f4 + LFD A6, 12 * SIZE(AO) + FMADD f8, A1, B3, f8 + nop + FMADD f12, A1, B4, f12 + nop + + FMADD f1, A2, B1, f1 + LFD A1, 3 * SIZE(AO) + FMADD f5, A2, B2, f5 + nop + FMADD f9, A2, B3, f9 + nop + FMADD f13, A2, B4, f13 + nop + + FMADD f2, A3, B1, f2 + nop + FMADD f6, A3, B2, f6 + LFD B8, 5 * SIZE(BO) + FMADD f10, A3, B3, f10 + LFD B9, 6 * SIZE(BO) + FMADD f14, A3, B4, f14 + LFD B10, 7 * SIZE(BO) + + FMADD f3, A1, B1, f3 + LFD A2, 5 * SIZE(AO) + FMADD f7, A1, B2, f7 + LFD B1, 16 * SIZE(BO) + FMADD f11, A1, B3, f11 + nop + FMADD f15, A1, B4, f15 + nop + + FMADD f0, A4, B5, f0 + LFD A3, 6 * SIZE(AO) + FMADD f4, A4, B8, f4 + LFD A1, 16 * SIZE(AO) + FMADD f8, A4, B9, f8 + nop + FMADD f12, A4, B10, f12 + nop + + FMADD f1, A2, B5, f1 + LFD A4, 7 * SIZE(AO) + FMADD f5, A2, B8, f5 + nop + FMADD f9, A2, B9, f9 + nop + FMADD f13, A2, B10, f13 + nop + + FMADD f2, A3, B5, f2 + nop + FMADD f6, A3, B8, f6 + LFD B2, 9 * SIZE(BO) + FMADD f10, A3, B9, f10 + LFD B3, 10 * SIZE(BO) + FMADD f14, A3, B10, f14 + LFD B4, 11 * SIZE(BO) + + FMADD f3, A4, B5, f3 + LFD A2, 9 * SIZE(AO) + FMADD f7, A4, B8, f7 + LFD B5, 20 * SIZE(BO) + FMADD f11, A4, B9, f11 + nop + FMADD f15, A4, B10, f15 + nop + + FMADD f0, A5, B6, f0 + LFD A3, 10 * SIZE(AO) + FMADD f4, A5, B2, f4 + LFD A4, 20 * SIZE(AO) + FMADD f8, A5, B3, f8 + nop + FMADD f12, A5, B4, f12 + nop + + FMADD f1, A2, B6, f1 + LFD A5, 11 * SIZE(AO) + FMADD f5, A2, B2, f5 + nop + FMADD f9, A2, B3, f9 + nop + FMADD f13, A2, B4, f13 + nop + + FMADD f2, A3, B6, f2 + nop + FMADD f6, A3, B2, f6 + LFD B8, 13 * SIZE(BO) + FMADD f10, A3, B3, f10 + LFD B9, 14 * SIZE(BO) + FMADD f14, A3, B4, f14 + LFD B10,15 * SIZE(BO) + + FMADD f3, A5, B6, f3 + LFD A2, 13 * SIZE(AO) + FMADD f7, A5, B2, f7 + LFD B6, 24 * SIZE(BO) + FMADD f11, A5, B3, f11 + nop + FMADD f15, A5, B4, f15 + nop + + FMADD f0, A6, B7, f0 + LFD A3, 14 * SIZE(AO) + FMADD f4, A6, B8, f4 + LFD A5, 24 * SIZE(AO) + FMADD f8, A6, B9, f8 + nop + FMADD f12, A6, B10, f12 + nop + + FMADD f1, A2, B7, f1 + LFD A6, 15 * SIZE(AO) + FMADD f5, A2, B8, f5 + nop + FMADD f9, A2, B9, f9 + nop + FMADD f13, A2, B10, f13 + nop + + FMADD f2, A3, B7, f2 + addi AO, AO, 16 * SIZE + FMADD f6, A3, B8, f6 + LFD B2, 17 * SIZE(BO) + FMADD f10, A3, B9, f10 + LFD B3, 18 * SIZE(BO) + FMADD f14, A3, B10, f14 + LFD B4, 19 * SIZE(BO) + + FMADD f3, A6, B7, f3 + LFD A2, 1 * SIZE(AO) + FMADD f7, A6, B8, f7 + LFD B7, 28 * SIZE(BO) + FMADD f11, A6, B9, f11 + addi BO, BO, 16 * SIZE + FMADD f15, A6, B10, f15 + bdnz .L12 + .align 4 + +.L15: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble .LKERNEL_MainFinish + .align 4 + +.L16: + FMADD f0, A1, B1, f0 + LFD A3, 2 * SIZE(AO) + FMADD f4, A1, B2, f4 + FMADD f8, A1, B3, f8 + FMADD f12, A1, B4, f12 + LFD A4, 3 * SIZE(AO) + + FMADD f1, A2, B1, f1 + FMADD f5, A2, B2, f5 + FMADD f9, A2, B3, f9 + FMADD f13, A2, B4, f13 + LFDU A1, 4 * SIZE(AO) + + FMADD f2, A3, B1, f2 + FMADD f6, A3, B2, f6 + FMADD f10, A3, B3, f10 + FMADD f14, A3, B4, f14 + LFD A2, 1 * SIZE(AO) + + FMADD f3, A4, B1, f3 + LFDU B1, 4 * SIZE(BO) + FMADD f7, A4, B2, f7 + LFD B2, 1 * SIZE(BO) + FMADD f11, A4, B3, f11 + LFD B3, 2 * SIZE(BO) + FMADD f15, A4, B4, f15 + LFD B4, 3 * SIZE(BO) + bdnz .L16 + .align 4 + +.LKERNEL_MainFinish: +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 + + FSUB f8, f8, f13 + FADD f9, f9, f12 + FSUB f10, f10, f15 + FADD f11, f11, f14 + +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 + + FADD f8, f8, f13 + FSUB f9, f12, f9 + FADD f10, f10, f15 + FSUB f11, f14, f11 +#endif + +#if defined(LN) || defined(RT) + subi r0, KK, 2 + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f8, f18, f8 + FSUB f9, f19, f9 + + FSUB f2, f20, f2 + FSUB f3, f21, f3 + FSUB f10, f22, f10 + FSUB f11, f23, f11 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f8, f20, f8 + FSUB f9, f21, f9 + FSUB f10, f22, f10 + FSUB f11, f23, f11 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 + FSUB f2, f18, f2 + FADD f3, f19, f3 + + FSUB f8, f20, f8 + FADD f9, f21, f9 + FSUB f10, f22, f10 + FADD f11, f23, f11 +#endif +#endif + +#ifdef LN + LFD f16, 6 * SIZE(AO) + LFD f17, 7 * SIZE(AO) + LFD f18, 4 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f6, f17, f3 + FMUL f7, f17, f2 + FMUL f14, f17, f11 + FMUL f15, f17, f10 + +#ifndef CONJ + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + FMSUB f10, f16, f10, f14 + FMADD f11, f16, f11, f15 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + FMADD f8, f19, f11, f8 + FNMSUB f9, f19, f10, f9 + + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + FNMSUB f8, f18, f10, f8 + FNMSUB f9, f18, f11, f9 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f9 + FMUL f13, f21, f8 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f8, f20, f8, f12 + FMADD f9, f20, f9, f13 + +#else + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + FMADD f10, f16, f10, f14 + FMSUB f11, f16, f11, f15 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + FMSUB f8, f19, f11, f8 + FNMADD f9, f19, f10, f9 + + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + FNMADD f8, f18, f10, f8 + FNMADD f9, f18, f11, f9 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f9 + FMUL f13, f21, f8 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f8, f20, f8, f12 + FMSUB f9, f20, f9, f13 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + LFD f20, 6 * SIZE(AO) + LFD f21, 7 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, 
f17, f0 + FMUL f12, f17, f9 + FMUL f13, f17, f8 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f8, f16, f8, f12 + FMADD f9, f16, f9, f13 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + FMADD f10, f19, f9, f10 + FNMSUB f11, f19, f8, f11 + + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + FNMSUB f10, f18, f8, f10 + FNMSUB f11, f18, f9, f11 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMUL f12, f21, f11 + FMUL f13, f21, f10 + + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 + FMSUB f10, f20, f10, f12 + FMADD f11, f20, f11, f13 + +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f8, f16, f8, f12 + FMSUB f9, f16, f9, f13 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + FMSUB f10, f19, f9, f10 + FNMADD f11, f19, f8, f11 + + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + FNMADD f10, f18, f8, f10 + FNMADD f11, f18, f9, f11 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMUL f12, f21, f11 + FMUL f13, f21, f10 + + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 + FMADD f10, f20, f10, f12 + FMSUB f11, f20, f11, f13 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + LFD f20, 6 * SIZE(BO) + LFD f21, 7 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + + FMADD f8, f19, f1, f8 + FNMSUB f9, f19, f0, f9 + FMADD f10, f19, f3, f10 + FNMSUB f11, f19, f2, f11 + + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f10, f18, f2, f10 + FNMSUB f11, f18, f3, f11 + + FMUL f4, f21, f9 + FMUL f5, f21, f8 + FMUL f6, f21, f11 + FMUL f7, f21, f10 + + FMSUB f8, f20, f8, f4 + FMADD f9, f20, f9, f5 + FMSUB f10, f20, f10, f6 + FMADD f11, f20, f11, f7 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + + FMSUB f8, f19, f1, f8 + FNMADD f9, f19, f0, f9 + FMSUB f10, f19, f3, f10 + FNMADD f11, f19, f2, f11 + + FNMADD f8, f18, f0, f8 + FNMADD f9, f18, f1, f9 + FNMADD f10, f18, f2, f10 + FNMADD f11, f18, f3, f11 + + FMUL f4, f21, f9 + FMUL f5, f21, f8 + FMUL f6, f21, f11 + FMUL f7, f21, f10 + + FMADD f8, f20, f8, f4 + FMSUB f9, f20, f9, f5 + FMADD f10, f20, f10, f6 + FMSUB f11, f20, f11, f7 +#endif +#endif + +#ifdef RT + LFD f16, 6 * SIZE(BO) + LFD f17, 7 * SIZE(BO) + LFD f18, 4 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f12, f17, f9 + FMUL f13, f17, f8 + FMUL f14, f17, f11 + FMUL f15, f17, f10 + +#ifndef CONJ + FMSUB f8, f16, f8, f12 + FMADD f9, f16, f9, f13 + FMSUB f10, f16, f10, f14 + FMADD f11, f16, f11, f15 + + FMADD f0, f19, f9, f0 + FNMSUB f1, f19, f8, f1 + FMADD f2, f19, f11, f2 + FNMSUB f3, f19, f10, f3 + + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + FNMSUB f2, f18, f10, f2 + FNMSUB f3, f18, f11, f3 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f6 + FMADD f3, f20, f3, f7 + +#else + FMADD f8, f16, f8, f12 + FMSUB f9, f16, f9, f13 + FMADD f10, f16, f10, f14 + FMSUB f11, f16, f11, f15 + + FMSUB f0, f19, f9, f0 + FNMADD f1, f19, f8, f1 + FMSUB f2, f19, f11, f2 + FNMADD f3, f19, f10, f3 + + FNMADD f0, f18, f8, f0 + FNMADD f1, f18, f9, f1 + FNMADD f2, f18, f10, f2 + FNMADD f3, f18, f11, f3 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f6 + FMSUB f3, 
f20, f3, f7 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f9, 3 * SIZE(BO) + + STFD f2, 4 * SIZE(BO) + STFD f3, 5 * SIZE(BO) + STFD f10, 6 * SIZE(BO) + STFD f11, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f8, 4 * SIZE(AO) + STFD f9, 5 * SIZE(AO) + STFD f10, 6 * SIZE(AO) + STFD f11, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f8, 0 * SIZE(CO2) + STFD f9, 1 * SIZE(CO2) + STFD f10, 2 * SIZE(CO2) + STFD f11, 3 * SIZE(CO2) + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + addic. I, I, -1 + bgt .L11 + .align 4 + +.L29: +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + + addic. J, J, -1 + bgt .L10 + .align 4 + +.L30: + andi. J, N, 1 + ble .L999 + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, C, LDC +#endif + + andi. I, M, 1 + ble .L40 + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, r0 + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L45 + .align 4 + +.L42: + FMADD f0, f16, f20, f0 + LFD f23, 3 * SIZE(BO) + FMADD f1, f17, f20, f1 + nop + FMADD f2, f18, f20, f2 + nop + FMADD f3, f19, f20, f3 + LFD f20, 4 * SIZE(BO) + + FMADD f4, f16, f21, f4 + LFD f16, 4 * SIZE(AO) + FMADD f5, f17, f21, f5 + LFD f17, 5 * SIZE(AO) + FMADD f6, f18, f21, f6 + LFD f18, 6 * SIZE(AO) + FMADD f7, f19, f21, f7 + LFD f19, 7 * SIZE(AO) + + FMADD f0, f16, f22, f0 + LFD f21, 5 * SIZE(BO) + FMADD f1, f17, f22, f1 + nop + FMADD f2, f18, f22, f2 + nop + FMADD f3, f19, f22, f3 + LFD f22, 6 * SIZE(BO) + + FMADD f4, f16, f23, f4 + LFD f16, 8 * SIZE(AO) + FMADD f5, f17, f23, f5 + LFD f17, 9 * SIZE(AO) + FMADD f6, f18, f23, f6 + LFD f18, 10 * SIZE(AO) + FMADD f7, f19, f23, f7 + LFD f19, 11 * SIZE(AO) + + FMADD f0, f16, f20, f0 + LFD f23, 7 * SIZE(BO) + FMADD f1, f17, f20, f1 + nop + FMADD f2, f18, f20, f2 + nop + FMADD f3, f19, f20, f3 + LFDU f20, 8 * SIZE(BO) + + FMADD f4, f16, f21, f4 + LFD f16, 12 * SIZE(AO) + FMADD f5, f17, f21, f5 + LFD f17, 13 * SIZE(AO) + FMADD f6, f18, f21, f6 + LFD f18, 14 * SIZE(AO) + FMADD f7, f19, f21, f7 + LFD f19, 15 * SIZE(AO) + + FMADD f0, f16, f22, f0 + LFD f21, 1 * SIZE(BO) + FMADD f1, f17, f22, f1 + nop + FMADD f2, f18, f22, f2 + nop + FMADD f3, f19, f22, f3 + LFD f22, 2 * SIZE(BO) + + FMADD f4, f16, f23, f4 + LFDU f16, 16 * SIZE(AO) + FMADD f5, f17, f23, f5 + LFD f17, 1 * SIZE(AO) + FMADD f6, f18, f23, f6 + LFD f18, 2 * SIZE(AO) + FMADD f7, f19, f23, f7 + LFD f19, 3 * SIZE(AO) + bdnz .L42 + .align 4 + +.L45: + fadd f0, f0, f4 + fadd f1, f1, f5 + fadd f2, f2, f6 + fadd f3, f3, f7 + +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR,r0 + ble .L47 + .align 4 + +.L46: + FMADD f0, f16, f20, f0 + LFD f21, 1 * SIZE(BO) + FMADD f1, f17, f20, f1 + nop + FMADD f2, f18, f20, f2 + nop + FMADD f3, f19, f20, f3 + LFDU f20, 2 * SIZE(BO) + + FMADD f4, f16, f21, f4 + LFDU f16, 4 * SIZE(AO) + FMADD f5, f17, f21, f5 + LFD f17, 1 * SIZE(AO) + FMADD f6, f18, f21, f6 + LFD f18, 2 * SIZE(AO) + FMADD f7, f19, f21, f7 + LFD f19, 3 * SIZE(AO) + bdnz .L46 + .align 4 + +.L47: +#ifndef CONJ + FSUB f0, f0, f1 + FADD f1, f2, f3 +#else + FADD f0, f0, f1 + FSUB f1, f3, f2 +#endif + +#if defined(LN) || defined(RT) + subi r0, KK, 1 + slwi r0, r0, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 +#endif +#endif + +#ifdef LN + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 +#endif +#endif + +#ifdef RT + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f0, f20, f0, 
f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +.L40: + srawi. I, M, 1 + ble .L49 + .align 4 + +.L31: +#if defined(LT) || defined(RN) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(B) + LFD f17, 1 * SIZE(B) + LFD f18, 2 * SIZE(B) + LFD f19, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L35 + .align 4 + +.L32: + fmadd f0, f16, f20, f0 + LFD f19, 3 * SIZE(BO) + fmadd f1, f16, f21, f1 + nop + fmadd f2, f16, f22, f2 + nop + fmadd f3, f16, f23, f3 + LFD f16, 4 * SIZE(BO) + + fmadd f4, f17, f20, f4 + LFD f20, 8 * SIZE(AO) + fmadd f5, f17, f21, f5 + LFD f21, 9 * SIZE(AO) + fmadd f6, f17, f22, f6 + LFD f22, 10 * SIZE(AO) + fmadd f7, f17, f23, f7 + LFD f23, 11 * SIZE(AO) + + fmadd f0, f18, f24, f0 + LFD f17, 5 * SIZE(BO) + fmadd f1, f18, f25, f1 + nop + fmadd f2, f18, f26, f2 + nop + fmadd f3, f18, f27, f3 + LFD f18, 6 * SIZE(BO) + + fmadd f4, f19, f24, f4 + LFD f24, 12 * SIZE(AO) + fmadd f5, f19, f25, f5 + LFD f25, 13 * SIZE(AO) + fmadd f6, f19, f26, f6 + LFD f26, 14 * SIZE(AO) + fmadd f7, f19, f27, f7 + LFD f27, 15 * SIZE(AO) + + fmadd f0, f16, f20, f0 + LFD f19, 7 * SIZE(BO) + fmadd f1, f16, f21, f1 + nop + fmadd f2, f16, f22, f2 + nop + fmadd f3, f16, f23, f3 + LFDU f16, 8 * SIZE(BO) + + fmadd f4, f17, f20, f4 + LFDU f20, 16 * SIZE(AO) + fmadd f5, f17, f21, f5 + LFD f21, 1 * SIZE(AO) + fmadd f6, f17, f22, f6 + LFD f22, 2 * SIZE(AO) + fmadd f7, f17, f23, f7 + LFD f23, 3 * SIZE(AO) + + fmadd f0, f18, f24, f0 + LFD f17, 1 * SIZE(BO) + fmadd f1, f18, f25, f1 + nop + fmadd f2, f18, f26, f2 + nop + fmadd f3, f18, f27, f3 + LFD f18, 2 * SIZE(BO) + + fmadd f4, f19, f24, f4 + LFD f24, 4 * SIZE(AO) + fmadd f5, f19, f25, f5 + LFD f25, 5 * SIZE(AO) + fmadd f6, f19, f26, f6 + LFD f26, 6 * SIZE(AO) + fmadd f7, f19, f27, f7 + LFD f27, 7 * SIZE(AO) + bdnz .L32 + .align 4 + +.L35: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble .L37 + .align 4 + +.L36: + fmadd f0, f16, f20, f0 + LFD f17, 1 * SIZE(BO) + fmadd f1, f16, f21, f1 + nop + fmadd f2, f16, f22, f2 + nop + fmadd f3, f16, f23, f3 + LFDU f16, 2 * SIZE(BO) + + fmadd f4, f17, f20, f4 + LFDU f20, 4 * SIZE(AO) + fmadd f5, f17, f21, f5 + LFD f21, 1 * SIZE(AO) + fmadd f6, f17, f22, f6 + LFD f22, 2 * SIZE(AO) + fmadd f7, f17, f23, f7 + LFD f23, 3 * SIZE(AO) + bdnz .L36 + .align 4 + +.L37: +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 +#endif + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + ZBASE_SHIFT + slwi r0, r0, 0 + ZBASE_SHIFT + + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 + FSUB f2, f18, f2 + FADD f3, f19, f3 +#endif +#endif + +#ifdef LN + LFD f16, 6 * SIZE(AO) + LFD f17, 7 * SIZE(AO) + LFD f18, 4 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + LFD f20, 6 * SIZE(AO) + LFD f21, 7 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 +#endif +#endif + +#ifdef RT + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f6 + FMADD f3, f20, f3, f7 + +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f6 + FMSUB f3, f20, f3, f7 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + 
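+/* Write the solved 2x1 complex block back: into the packed panel    */
+/* (BO for the LN/LT cases, AO otherwise) so the following update    */
+/* steps can reuse the solved values, and into C through CO1.        */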
+#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + addic. I, I, -1 + bgt .L31 + .align 4 + +.L49: +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 1 +#endif + +#ifdef RT + subi KK, KK, 1 +#endif + .align 4 + + +.L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE diff --git a/kernel/power/ztrsm_kernel_ppc440_LT.S b/kernel/power/ztrsm_kernel_ppc440_LT.S new file mode 100644 index 0000000000..a9c98dd309 --- /dev/null +++ b/kernel/power/ztrsm_kernel_ppc440_LT.S @@ -0,0 +1,2208 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA_R 296(SP) +#define ALPHA_I 304(SP) +#define FZERO 312(SP) +#else +#define STACKSIZE 256 +#define ALPHA_R 224(SP) +#define ALPHA_I 232(SP) +#define FZERO 240(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#define OFFSET r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#define AORIG r21 +#define TEMP r22 +#define KK r23 +#define I r24 +#define J r25 +#define AO r26 +#define BO r27 +#define CO1 r28 +#define CO2 r29 + +#define A1 f16 +#define A2 f17 +#define A3 f18 +#define A4 f19 +#define A5 f20 +#define A6 f21 +#define B1 f22 +#define B2 f23 +#define B3 f24 +#define B4 f25 +#define B5 f26 +#define B6 f27 +#define B7 f28 +#define B8 f29 +#define B9 f30 +#define B10 f31 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) +#endif + + stw r0, FZERO + +#ifdef linux +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, 56 + STACKSIZE(SP) + lwz C, 60 + STACKSIZE(SP) + lwz LDC, 64 + STACKSIZE(SP) +#else + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 120 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || 
defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 120 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 68 + STACKSIZE(SP) +#else + lwz OFFSET, 60 + STACKSIZE(SP) +#endif +#endif +#endif + + slwi LDC, LDC, ZBASE_SHIFT + +#ifdef LN + mullw r0, M, K + slwi r0, r0, ZBASE_SHIFT + add A, A, r0 + + slwi r0, M, ZBASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, ZBASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + cmpwi cr0, M, 0 + ble .L999 + cmpwi cr0, N, 0 + ble .L999 + cmpwi cr0, K, 0 + ble .L999 + + srawi. J, N, 1 + ble .L30 + .align 4 + +.L10: +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + srawi. I, M, 1 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO2, LDC +#endif + ble .L20 + .align 4 + +.L11: +#if defined(LT) || defined(RN) + LFD A1, 0 * SIZE(AO) + LFD A2, 1 * SIZE(AO) + LFD A4, 4 * SIZE(AO) + LFD A5, 8 * SIZE(AO) + + LFD B1, 0 * SIZE(B) + LFD B2, 1 * SIZE(B) + LFD B3, 2 * SIZE(B) + LFD B4, 3 * SIZE(B) + LFD B5, 4 * SIZE(B) + LFD B6, 8 * SIZE(B) + LFD B7, 12 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + srawi. r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + + sub TEMP, K, KK + + LFD A1, 0 * SIZE(AO) + LFD A2, 1 * SIZE(AO) + LFD A4, 4 * SIZE(AO) + LFD A5, 8 * SIZE(AO) + + LFD B1, 0 * SIZE(BO) + LFD B2, 1 * SIZE(BO) + LFD B3, 2 * SIZE(BO) + LFD B4, 3 * SIZE(BO) + LFD B5, 4 * SIZE(BO) + LFD B6, 8 * SIZE(BO) + LFD B7, 12 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L15 + .align 4 + +.L12: + FMADD f0, A1, B1, f0 + LFD A3, 2 * SIZE(AO) + FMADD f4, A1, B2, f4 + LFD A6, 12 * SIZE(AO) + FMADD f8, A1, B3, f8 + nop + FMADD f12, A1, B4, f12 + nop + + FMADD f1, A2, B1, f1 + LFD A1, 3 * SIZE(AO) + FMADD f5, A2, B2, f5 + nop + FMADD f9, A2, B3, f9 + nop + FMADD f13, A2, B4, f13 + nop + + FMADD f2, A3, B1, f2 + nop + FMADD f6, A3, B2, f6 + LFD B8, 5 * SIZE(BO) + FMADD f10, A3, B3, f10 + LFD B9, 6 * SIZE(BO) + FMADD f14, A3, B4, f14 + LFD B10, 7 * SIZE(BO) + + FMADD f3, A1, B1, f3 + LFD A2, 5 * SIZE(AO) + FMADD f7, A1, B2, f7 + LFD B1, 16 * SIZE(BO) + FMADD f11, A1, B3, f11 + nop + FMADD f15, A1, B4, f15 + nop + + FMADD f0, A4, B5, f0 + LFD A3, 6 * SIZE(AO) + FMADD f4, A4, B8, f4 + LFD A1, 16 * SIZE(AO) + FMADD f8, A4, B9, f8 + nop + FMADD f12, A4, B10, f12 + nop + + FMADD f1, A2, B5, f1 + LFD A4, 7 * SIZE(AO) + FMADD f5, A2, B8, f5 + nop + FMADD f9, A2, B9, f9 + nop + FMADD f13, A2, B10, f13 + nop + + FMADD f2, A3, B5, f2 + nop + FMADD f6, A3, B8, f6 + LFD B2, 9 * SIZE(BO) + FMADD f10, A3, B9, f10 + LFD B3, 10 * SIZE(BO) + FMADD f14, A3, B10, f14 + LFD B4, 11 * SIZE(BO) + + FMADD f3, A4, B5, f3 + LFD A2, 9 * SIZE(AO) + FMADD f7, A4, B8, f7 + LFD B5, 20 * SIZE(BO) + FMADD f11, A4, B9, f11 + nop + FMADD f15, A4, B10, f15 + nop + + FMADD f0, A5, B6, f0 + LFD A3, 10 * SIZE(AO) + FMADD f4, A5, B2, f4 + LFD A4, 20 * SIZE(AO) + FMADD f8, A5, B3, f8 + nop + FMADD f12, A5, B4, f12 + nop + + FMADD f1, A2, B6, f1 + LFD A5, 11 * SIZE(AO) + FMADD f5, A2, B2, f5 + nop + FMADD f9, A2, B3, f9 + nop + FMADD f13, A2, B4, f13 + nop + + FMADD f2, A3, B6, f2 + nop + FMADD f6, A3, B2, f6 + LFD B8, 13 * SIZE(BO) + FMADD f10, A3, B3, f10 + LFD B9, 14 * SIZE(BO) + FMADD f14, A3, B4, f14 + LFD B10,15 * SIZE(BO) + + FMADD f3, A5, B6, f3 + LFD A2, 13 * SIZE(AO) + FMADD f7, A5, B2, f7 + LFD B6, 24 * SIZE(BO) + FMADD f11, A5, B3, f11 + nop + FMADD f15, A5, B4, f15 + nop + + FMADD f0, A6, B7, f0 + LFD A3, 14 * SIZE(AO) + FMADD f4, A6, B8, f4 + LFD A5, 24 * SIZE(AO) + FMADD f8, A6, B9, f8 + nop + FMADD f12, A6, B10, f12 + nop + + FMADD f1, A2, B7, f1 + LFD A6, 15 * SIZE(AO) + FMADD f5, A2, B8, f5 + nop + FMADD f9, A2, B9, f9 + nop + FMADD f13, A2, B10, f13 + nop + + FMADD f2, A3, B7, f2 + addi AO, AO, 16 * SIZE + FMADD f6, A3, B8, f6 + LFD B2, 17 * SIZE(BO) + FMADD f10, A3, B9, f10 + LFD B3, 18 * SIZE(BO) + FMADD f14, A3, B10, f14 + LFD B4, 19 * SIZE(BO) + + FMADD f3, A6, B7, f3 + LFD A2, 1 * SIZE(AO) + FMADD f7, A6, B8, f7 + LFD B7, 28 * SIZE(BO) + FMADD f11, A6, B9, f11 + addi BO, BO, 16 * SIZE + FMADD f15, A6, B10, f15 + bdnz .L12 + .align 4 + +.L15: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble .LKERNEL_MainFinish + .align 4 + +.L16: + FMADD f0, A1, B1, f0 + LFD A3, 2 * SIZE(AO) + FMADD f4, A1, B2, f4 + FMADD f8, A1, B3, f8 + FMADD f12, A1, B4, f12 + LFD A4, 3 * SIZE(AO) + + FMADD f1, A2, B1, f1 + FMADD f5, A2, B2, f5 + FMADD f9, A2, B3, f9 + FMADD f13, A2, B4, f13 + LFDU A1, 4 * SIZE(AO) + + FMADD f2, A3, B1, f2 + FMADD f6, A3, B2, f6 + FMADD f10, A3, B3, f10 + FMADD f14, A3, B4, f14 + LFD A2, 1 * SIZE(AO) + + FMADD f3, A4, B1, f3 + LFDU B1, 4 * SIZE(BO) + FMADD f7, A4, B2, f7 + LFD B2, 1 * SIZE(BO) + FMADD f11, A4, B3, f11 + LFD B3, 2 * SIZE(BO) + FMADD f15, A4, B4, f15 + LFD B4, 3 * SIZE(BO) + bdnz .L16 + .align 4 + +.LKERNEL_MainFinish: +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 + + FSUB f8, f8, f13 + FADD f9, f9, f12 + FSUB f10, f10, f15 + FADD f11, f11, f14 + +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 + + FADD f8, f8, f13 + FSUB f9, f12, f9 + FADD f10, f10, f15 + FSUB f11, f14, f11 +#endif + +#if defined(LN) || defined(RT) + subi r0, KK, 2 + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f8, f18, f8 + FSUB f9, f19, f9 + + FSUB f2, f20, f2 + FSUB f3, f21, f3 + FSUB f10, f22, f10 + FSUB f11, f23, f11 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f8, f20, f8 + FSUB f9, f21, f9 + FSUB f10, f22, f10 + FSUB f11, f23, f11 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 + FSUB f2, f18, f2 + FADD f3, f19, f3 + + FSUB f8, f20, f8 + FADD f9, f21, f9 + FSUB f10, f22, f10 + FADD f11, f23, f11 +#endif +#endif + +#ifdef LN + LFD f16, 6 * SIZE(AO) + LFD f17, 7 * SIZE(AO) + LFD f18, 4 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f6, f17, f3 + FMUL f7, f17, f2 + FMUL f14, f17, f11 + FMUL f15, f17, f10 + +#ifndef CONJ + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + FMSUB f10, f16, f10, f14 + FMADD f11, f16, f11, f15 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + FMADD f8, f19, f11, f8 + FNMSUB f9, f19, f10, f9 + + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + FNMSUB f8, f18, f10, f8 + FNMSUB f9, f18, f11, f9 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f9 + FMUL f13, f21, f8 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f8, f20, f8, f12 + FMADD f9, f20, f9, f13 + +#else + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + FMADD f10, f16, f10, f14 + FMSUB f11, f16, f11, f15 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + FMSUB f8, f19, f11, f8 + FNMADD f9, f19, f10, f9 + + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + FNMADD f8, f18, f10, f8 + FNMADD f9, f18, f11, f9 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f9 + FMUL f13, f21, f8 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f8, f20, f8, f12 + FMSUB f9, f20, f9, f13 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + LFD f20, 6 * SIZE(AO) + LFD f21, 7 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, 
f17, f0 + FMUL f12, f17, f9 + FMUL f13, f17, f8 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f8, f16, f8, f12 + FMADD f9, f16, f9, f13 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + FMADD f10, f19, f9, f10 + FNMSUB f11, f19, f8, f11 + + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + FNMSUB f10, f18, f8, f10 + FNMSUB f11, f18, f9, f11 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMUL f12, f21, f11 + FMUL f13, f21, f10 + + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 + FMSUB f10, f20, f10, f12 + FMADD f11, f20, f11, f13 + +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f8, f16, f8, f12 + FMSUB f9, f16, f9, f13 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + FMSUB f10, f19, f9, f10 + FNMADD f11, f19, f8, f11 + + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + FNMADD f10, f18, f8, f10 + FNMADD f11, f18, f9, f11 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMUL f12, f21, f11 + FMUL f13, f21, f10 + + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 + FMADD f10, f20, f10, f12 + FMSUB f11, f20, f11, f13 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + LFD f20, 6 * SIZE(BO) + LFD f21, 7 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + + FMADD f8, f19, f1, f8 + FNMSUB f9, f19, f0, f9 + FMADD f10, f19, f3, f10 + FNMSUB f11, f19, f2, f11 + + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f10, f18, f2, f10 + FNMSUB f11, f18, f3, f11 + + FMUL f4, f21, f9 + FMUL f5, f21, f8 + FMUL f6, f21, f11 + FMUL f7, f21, f10 + + FMSUB f8, f20, f8, f4 + FMADD f9, f20, f9, f5 + FMSUB f10, f20, f10, f6 + FMADD f11, f20, f11, f7 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + + FMSUB f8, f19, f1, f8 + FNMADD f9, f19, f0, f9 + FMSUB f10, f19, f3, f10 + FNMADD f11, f19, f2, f11 + + FNMADD f8, f18, f0, f8 + FNMADD f9, f18, f1, f9 + FNMADD f10, f18, f2, f10 + FNMADD f11, f18, f3, f11 + + FMUL f4, f21, f9 + FMUL f5, f21, f8 + FMUL f6, f21, f11 + FMUL f7, f21, f10 + + FMADD f8, f20, f8, f4 + FMSUB f9, f20, f9, f5 + FMADD f10, f20, f10, f6 + FMSUB f11, f20, f11, f7 +#endif +#endif + +#ifdef RT + LFD f16, 6 * SIZE(BO) + LFD f17, 7 * SIZE(BO) + LFD f18, 4 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f12, f17, f9 + FMUL f13, f17, f8 + FMUL f14, f17, f11 + FMUL f15, f17, f10 + +#ifndef CONJ + FMSUB f8, f16, f8, f12 + FMADD f9, f16, f9, f13 + FMSUB f10, f16, f10, f14 + FMADD f11, f16, f11, f15 + + FMADD f0, f19, f9, f0 + FNMSUB f1, f19, f8, f1 + FMADD f2, f19, f11, f2 + FNMSUB f3, f19, f10, f3 + + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + FNMSUB f2, f18, f10, f2 + FNMSUB f3, f18, f11, f3 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f6 + FMADD f3, f20, f3, f7 + +#else + FMADD f8, f16, f8, f12 + FMSUB f9, f16, f9, f13 + FMADD f10, f16, f10, f14 + FMSUB f11, f16, f11, f15 + + FMSUB f0, f19, f9, f0 + FNMADD f1, f19, f8, f1 + FMSUB f2, f19, f11, f2 + FNMADD f3, f19, f10, f3 + + FNMADD f0, f18, f8, f0 + FNMADD f1, f18, f9, f1 + FNMADD f2, f18, f10, f2 + FNMADD f3, f18, f11, f3 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f6 + FMSUB f3, 
f20, f3, f7 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f9, 3 * SIZE(BO) + + STFD f2, 4 * SIZE(BO) + STFD f3, 5 * SIZE(BO) + STFD f10, 6 * SIZE(BO) + STFD f11, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f8, 4 * SIZE(AO) + STFD f9, 5 * SIZE(AO) + STFD f10, 6 * SIZE(AO) + STFD f11, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f8, 0 * SIZE(CO2) + STFD f9, 1 * SIZE(CO2) + STFD f10, 2 * SIZE(CO2) + STFD f11, 3 * SIZE(CO2) + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + addic. I, I, -1 + bgt .L11 + .align 4 + +.L20: + andi. I, M, 1 + ble .L29 + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L25 + .align 4 + +.L22: + fmadd f0, f16, f20, f0 + LFD f19, 3 * SIZE(AO) + fmadd f1, f16, f21, f1 + nop + fmadd f2, f16, f22, f2 + nop + fmadd f3, f16, f23, f3 + LFD f16, 4 * SIZE(AO) + + fmadd f4, f17, f20, f4 + LFD f20, 8 * SIZE(BO) + fmadd f5, f17, f21, f5 + LFD f21, 9 * SIZE(BO) + fmadd f6, f17, f22, f6 + LFD f22, 10 * SIZE(BO) + fmadd f7, f17, f23, f7 + LFD f23, 11 * SIZE(BO) + + fmadd f0, f18, f24, f0 + LFD f17, 5 * SIZE(AO) + fmadd f1, f18, f25, f1 + nop + fmadd f2, f18, f26, f2 + nop + fmadd f3, f18, f27, f3 + LFD f18, 6 * SIZE(AO) + + fmadd f4, f19, f24, f4 + LFD f24, 12 * SIZE(BO) + fmadd f5, f19, f25, f5 + LFD f25, 13 * SIZE(BO) + fmadd f6, f19, f26, f6 + LFD f26, 14 * SIZE(BO) + fmadd f7, f19, f27, f7 + LFD f27, 15 * SIZE(BO) + + fmadd f0, f16, f20, f0 + LFD f19, 7 * SIZE(AO) + fmadd f1, f16, f21, f1 + nop + fmadd f2, f16, f22, f2 + nop + fmadd f3, f16, f23, f3 + LFDU f16, 8 * SIZE(AO) + + fmadd f4, f17, f20, f4 + LFDU f20, 16 * SIZE(BO) + fmadd f5, f17, f21, f5 + LFD f21, 1 * SIZE(BO) + fmadd f6, f17, f22, f6 + LFD f22, 2 * SIZE(BO) + fmadd f7, f17, f23, f7 + LFD f23, 3 * SIZE(BO) + + fmadd f0, f18, f24, f0 + LFD f17, 1 * SIZE(AO) + fmadd f1, f18, f25, f1 + nop + fmadd f2, f18, f26, f2 + nop + fmadd f3, f18, f27, f3 + LFD f18, 2 * SIZE(AO) + + fmadd f4, f19, f24, f4 + LFD f24, 4 * SIZE(BO) + fmadd f5, f19, f25, f5 + LFD f25, 5 * SIZE(BO) + fmadd f6, f19, f26, f6 + LFD f26, 6 * SIZE(BO) + fmadd f7, f19, f27, f7 + LFD f27, 7 * SIZE(BO) + bdnz .L22 + .align 4 + +.L25: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble .L27 + .align 4 + +.L26: + fmadd f0, f16, f20, f0 + LFD f17, 1 * SIZE(AO) + fmadd f1, f16, f21, f1 + nop + fmadd f2, f16, f22, f2 + nop + fmadd f3, f16, f23, f3 + LFDU f16, 2 * SIZE(AO) + + fmadd f4, f17, f20, f4 + LFDU f20, 4 * SIZE(BO) + fmadd f5, f17, f21, f5 + LFD f21, 1 * SIZE(BO) + fmadd f6, f17, f22, f6 + LFD f22, 2 * SIZE(BO) + fmadd f7, f17, f23, f7 + LFD f23, 3 * SIZE(BO) + bdnz .L26 + .align 4 + +.L27: +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 +#else +#if defined(LN) || defined(LT) + FADD f0, f0, f5 + FSUB f1, f1, f4 + FADD f2, f2, f7 + FSUB f3, f3, f6 +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 +#endif +#endif + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + ZBASE_SHIFT + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f20, f2 + FSUB f3, f21, f3 +#endif + +#ifdef LN + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f3 + FMUL f13, f21, f2 + +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f12 + FMADD f3, f20, f3, f13 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f12 + FMSUB f3, f20, f3, f13 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f12, f17, f3 + FMUL f13, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB 
f2, f16, f2, f12 + FMADD f3, f16, f3, f13 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f12 + FMSUB f3, f16, f3, f13 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + LFD f20, 6 * SIZE(BO) + LFD f21, 7 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 +#endif +#endif + +#ifdef RT + LFD f16, 6 * SIZE(BO) + LFD f17, 7 * SIZE(BO) + LFD f18, 4 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f12, f17, f3 + FMUL f13, f17, f2 + +#ifndef CONJ + FMSUB f2, f16, f2, f12 + FMADD f3, f16, f3, f13 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f2, f16, f2, f12 + FMSUB f3, f16, f3, f13 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +.L29: +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + + addic. J, J, -1 + bgt .L10 + .align 4 + +.L30: + andi. J, N, 1 + ble .L999 + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + srawi. I, M, 1 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, C, LDC +#endif + ble .L40 + .align 4 + +.L31: +#if defined(LT) || defined(RN) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(B) + LFD f17, 1 * SIZE(B) + LFD f18, 2 * SIZE(B) + LFD f19, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. 
r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L35 + .align 4 + +.L32: + fmadd f0, f16, f20, f0 + LFD f19, 3 * SIZE(BO) + fmadd f1, f16, f21, f1 + nop + fmadd f2, f16, f22, f2 + nop + fmadd f3, f16, f23, f3 + LFD f16, 4 * SIZE(BO) + + fmadd f4, f17, f20, f4 + LFD f20, 8 * SIZE(AO) + fmadd f5, f17, f21, f5 + LFD f21, 9 * SIZE(AO) + fmadd f6, f17, f22, f6 + LFD f22, 10 * SIZE(AO) + fmadd f7, f17, f23, f7 + LFD f23, 11 * SIZE(AO) + + fmadd f0, f18, f24, f0 + LFD f17, 5 * SIZE(BO) + fmadd f1, f18, f25, f1 + nop + fmadd f2, f18, f26, f2 + nop + fmadd f3, f18, f27, f3 + LFD f18, 6 * SIZE(BO) + + fmadd f4, f19, f24, f4 + LFD f24, 12 * SIZE(AO) + fmadd f5, f19, f25, f5 + LFD f25, 13 * SIZE(AO) + fmadd f6, f19, f26, f6 + LFD f26, 14 * SIZE(AO) + fmadd f7, f19, f27, f7 + LFD f27, 15 * SIZE(AO) + + fmadd f0, f16, f20, f0 + LFD f19, 7 * SIZE(BO) + fmadd f1, f16, f21, f1 + nop + fmadd f2, f16, f22, f2 + nop + fmadd f3, f16, f23, f3 + LFDU f16, 8 * SIZE(BO) + + fmadd f4, f17, f20, f4 + LFDU f20, 16 * SIZE(AO) + fmadd f5, f17, f21, f5 + LFD f21, 1 * SIZE(AO) + fmadd f6, f17, f22, f6 + LFD f22, 2 * SIZE(AO) + fmadd f7, f17, f23, f7 + LFD f23, 3 * SIZE(AO) + + fmadd f0, f18, f24, f0 + LFD f17, 1 * SIZE(BO) + fmadd f1, f18, f25, f1 + nop + fmadd f2, f18, f26, f2 + nop + fmadd f3, f18, f27, f3 + LFD f18, 2 * SIZE(BO) + + fmadd f4, f19, f24, f4 + LFD f24, 4 * SIZE(AO) + fmadd f5, f19, f25, f5 + LFD f25, 5 * SIZE(AO) + fmadd f6, f19, f26, f6 + LFD f26, 6 * SIZE(AO) + fmadd f7, f19, f27, f7 + LFD f27, 7 * SIZE(AO) + bdnz .L32 + .align 4 + +.L35: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble .L37 + .align 4 + +.L36: + fmadd f0, f16, f20, f0 + LFD f17, 1 * SIZE(BO) + fmadd f1, f16, f21, f1 + nop + fmadd f2, f16, f22, f2 + nop + fmadd f3, f16, f23, f3 + LFDU f16, 2 * SIZE(BO) + + fmadd f4, f17, f20, f4 + LFDU f20, 4 * SIZE(AO) + fmadd f5, f17, f21, f5 + LFD f21, 1 * SIZE(AO) + fmadd f6, f17, f22, f6 + LFD f22, 2 * SIZE(AO) + fmadd f7, f17, f23, f7 + LFD f23, 3 * SIZE(AO) + bdnz .L36 + .align 4 + +.L37: +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 +#endif + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + ZBASE_SHIFT + slwi r0, r0, 0 + ZBASE_SHIFT + + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 + FSUB f2, f18, f2 + FADD f3, f19, f3 +#endif +#endif + +#ifdef LN + LFD f16, 6 * SIZE(AO) + LFD f17, 7 * SIZE(AO) + LFD f18, 4 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + LFD f20, 6 * SIZE(AO) + LFD f21, 7 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 +#endif +#endif + +#ifdef RT + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f6 + FMADD f3, f20, f3, f7 + +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f6 + FMSUB f3, f20, f3, f7 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + 
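+/* The FMUL pairs followed by FMSUB/FMADD above (FMADD/FMSUB when CONJ is set)   */
+/* form complex products, re = dr*xr - di*xi and im = dr*xi + di*xr; this is how */
+/* the solve scales each element by the diagonal entry (presumably stored        */
+/* pre-inverted by the packing routines), while the FNMSUB/FNMADD terms fold in  */
+/* the off-diagonal coupling.  The stores below write the solved 2x1 complex     */
+/* block back to the packed buffer (BO or AO) and to the C tile at CO1.          */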
+#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + addic. I, I, -1 + bgt .L31 + .align 4 + +.L40: + andi. I, M, 1 + ble .L49 + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, r0 + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L45 + .align 4 + +.L42: + fmadd f0, f16, f20, f0 + LFD f23, 3 * SIZE(BO) + fmadd f3, f16, f21, f3 + LFD f16, 4 * SIZE(AO) + fmadd f2, f17, f20, f2 + LFD f20, 4 * SIZE(BO) + fmadd f1, f17, f21, f1 + LFD f17, 5 * SIZE(AO) + + fmadd f4, f18, f22, f4 + LFD f21, 5 * SIZE(BO) + fmadd f7, f18, f23, f7 + LFD f18, 6 * SIZE(AO) + fmadd f6, f19, f22, f6 + LFD f22, 6 * SIZE(BO) + fmadd f5, f19, f23, f5 + LFD f19, 7 * SIZE(AO) + + fmadd f0, f16, f20, f0 + LFD f23, 7 * SIZE(BO) + fmadd f3, f16, f21, f3 + LFDU f16, 8 * SIZE(AO) + fmadd f2, f17, f20, f2 + LFDU f20, 8 * SIZE(BO) + fmadd f1, f17, f21, f1 + LFD f17, 1 * SIZE(AO) + + fmadd f4, f18, f22, f4 + LFD f21, 1 * SIZE(BO) + fmadd f7, f18, f23, f7 + LFD f18, 2 * SIZE(AO) + fmadd f6, f19, f22, f6 + LFD f22, 2 * SIZE(BO) + fmadd f5, f19, f23, f5 + LFD f19, 3 * SIZE(AO) + bdnz .L42 + .align 4 + +.L45: + fadd f0, f0, f4 + fadd f1, f1, f5 + fadd f2, f2, f6 + fadd f3, f3, f7 + +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR,r0 + ble .L47 + .align 4 + +.L46: + fmadd f0, f16, f20, f0 + LFD f21, 1 * SIZE(BO) + fmadd f3, f16, f21, f3 + LFDU f16, 2 * SIZE(AO) + fmadd f2, f17, f20, f2 + LFDU f20, 2 * SIZE(BO) + fmadd f1, f17, f21, f1 + LFD f17, 1 * SIZE(AO) + bdnz .L46 + .align 4 + +.L47: +#ifndef CONJ + FSUB f0, f0, f1 + FADD f1, f2, f3 +#else + FADD f0, f0, f1 + FSUB f1, f3, f2 +#endif + +#if defined(LN) || defined(RT) + subi r0, KK, 1 + slwi r0, r0, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 +#endif +#endif + +#ifdef LN + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 +#endif +#endif + +#ifdef RT + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +.L49: +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 1 +#endif + +#ifdef RT + subi KK, KK, 1 +#endif + .align 4 + + +.L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE diff --git a/kernel/power/ztrsm_kernel_ppc440_RT.S b/kernel/power/ztrsm_kernel_ppc440_RT.S new file mode 100644 index 0000000000..c9b794ef41 --- 
/dev/null +++ b/kernel/power/ztrsm_kernel_ppc440_RT.S @@ -0,0 +1,2209 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA_R 296(SP) +#define ALPHA_I 304(SP) +#define FZERO 312(SP) +#else +#define STACKSIZE 256 +#define ALPHA_R 224(SP) +#define ALPHA_I 232(SP) +#define FZERO 240(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#define OFFSET r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#define AORIG r21 +#define TEMP r22 +#define KK r23 +#define I r24 +#define J r25 +#define AO r26 +#define BO r27 +#define CO1 r28 +#define CO2 r29 + +#define A1 f16 +#define A2 f17 +#define A3 f18 +#define A4 f19 +#define A5 f20 +#define A6 f21 +#define B1 f22 +#define B2 f23 +#define B3 f24 +#define B4 f25 +#define B5 f26 +#define B6 f27 +#define B7 f28 +#define B8 f29 +#define B9 f30 +#define B10 f31 + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) +#endif + + stw r0, FZERO + +#ifdef linux +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, 112 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, 56 + STACKSIZE(SP) + lwz C, 60 + STACKSIZE(SP) + lwz LDC, 64 + STACKSIZE(SP) +#else + lwz LDC, 56 + STACKSIZE(SP) +#endif +#endif +#endif + +#if defined(linux) && defined(__64BIT__) + ld OFFSET, 120 + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, 120 + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, 68 + STACKSIZE(SP) +#else + lwz OFFSET, 60 + STACKSIZE(SP) +#endif +#endif +#endif + + slwi LDC, LDC, ZBASE_SHIFT + +#ifdef LN + mullw r0, M, K + slwi r0, r0, ZBASE_SHIFT + add A, A, r0 + + slwi r0, M, ZBASE_SHIFT + add C, C, r0 +#endif + +#ifdef RN + neg KK, OFFSET +#endif + +#ifdef RT + mullw r0, N, K + slwi r0, r0, ZBASE_SHIFT + add B, B, r0 + + mullw r0, N, LDC + add C, C, r0 + + sub KK, N, OFFSET +#endif + + cmpwi cr0, M, 0 + ble .L999 + cmpwi cr0, N, 0 + ble .L999 + cmpwi cr0, K, 0 + ble .L999 + + andi. J, N, 1 + ble .L30 + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + sub B, B, r0 + + sub C, C, LDC +#endif + + mr CO1, C + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + srawi. 
I, M, 1 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, C, LDC +#endif + ble .L40 + .align 4 + +.L31: +#if defined(LT) || defined(RN) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(B) + LFD f17, 1 * SIZE(B) + LFD f18, 2 * SIZE(B) + LFD f19, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 1 + ZBASE_SHIFT + slwi TEMP, KK, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + LFD f22, 2 * SIZE(AO) + LFD f23, 3 * SIZE(AO) + + LFD f24, 4 * SIZE(AO) + LFD f25, 5 * SIZE(AO) + LFD f26, 6 * SIZE(AO) + LFD f27, 7 * SIZE(AO) + + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L35 + .align 4 + +.L32: + fmadd f0, f16, f20, f0 + LFD f19, 3 * SIZE(BO) + fmadd f1, f16, f21, f1 + nop + fmadd f2, f16, f22, f2 + nop + fmadd f3, f16, f23, f3 + LFD f16, 4 * SIZE(BO) + + fmadd f4, f17, f20, f4 + LFD f20, 8 * SIZE(AO) + fmadd f5, f17, f21, f5 + LFD f21, 9 * SIZE(AO) + fmadd f6, f17, f22, f6 + LFD f22, 10 * SIZE(AO) + fmadd f7, f17, f23, f7 + LFD f23, 11 * SIZE(AO) + + fmadd f0, f18, f24, f0 + LFD f17, 5 * SIZE(BO) + fmadd f1, f18, f25, f1 + nop + fmadd f2, f18, f26, f2 + nop + fmadd f3, f18, f27, f3 + LFD f18, 6 * SIZE(BO) + + fmadd f4, f19, f24, f4 + LFD f24, 12 * SIZE(AO) + fmadd f5, f19, f25, f5 + LFD f25, 13 * SIZE(AO) + fmadd f6, f19, f26, f6 + LFD f26, 14 * SIZE(AO) + fmadd f7, f19, f27, f7 + LFD f27, 15 * SIZE(AO) + + fmadd f0, f16, f20, f0 + LFD f19, 7 * SIZE(BO) + fmadd f1, f16, f21, f1 + nop + fmadd f2, f16, f22, f2 + nop + fmadd f3, f16, f23, f3 + LFDU f16, 8 * SIZE(BO) + + fmadd f4, f17, f20, f4 + LFDU f20, 16 * SIZE(AO) + fmadd f5, f17, f21, f5 + LFD f21, 1 * SIZE(AO) + fmadd f6, f17, f22, f6 + LFD f22, 2 * SIZE(AO) + fmadd f7, f17, f23, f7 + LFD f23, 3 * SIZE(AO) + + fmadd f0, f18, f24, f0 + LFD f17, 1 * SIZE(BO) + fmadd f1, f18, f25, f1 + nop + fmadd f2, f18, f26, f2 + nop + fmadd f3, f18, f27, f3 + LFD f18, 2 * SIZE(BO) + + fmadd f4, f19, f24, f4 + LFD f24, 4 * SIZE(AO) + fmadd f5, f19, f25, f5 + LFD f25, 5 * SIZE(AO) + fmadd f6, f19, f26, f6 + LFD f26, 6 * SIZE(AO) + fmadd f7, f19, f27, f7 + LFD f27, 7 * SIZE(AO) + bdnz .L32 + .align 4 + +.L35: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble .L37 + .align 4 + +.L36: + fmadd f0, f16, f20, f0 + LFD f17, 1 * SIZE(BO) + fmadd f1, f16, f21, f1 + nop + fmadd f2, f16, f22, f2 + nop + fmadd f3, f16, f23, f3 + LFDU f16, 2 * SIZE(BO) + + fmadd f4, f17, f20, f4 + LFDU f20, 4 * SIZE(AO) + fmadd f5, f17, f21, f5 + LFD f21, 1 * SIZE(AO) + fmadd f6, f17, f22, f6 + LFD f22, 2 * SIZE(AO) + fmadd f7, f17, f23, f7 + LFD f23, 3 * SIZE(AO) + bdnz .L36 + .align 4 + +.L37: +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 +#endif + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 2 +#else + subi r0, KK, 1 +#endif + slwi TEMP, r0, 1 + ZBASE_SHIFT + slwi r0, r0, 0 + ZBASE_SHIFT + + add AO, AORIG, TEMP + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 + FSUB f2, f18, f2 + FADD f3, f19, f3 +#endif +#endif + +#ifdef LN + LFD f16, 6 * SIZE(AO) + LFD f17, 7 * SIZE(AO) + LFD f18, 4 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + LFD f20, 6 * SIZE(AO) + LFD f21, 7 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 +#endif +#endif + +#ifdef RT + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f6 + FMADD f3, f20, f3, f7 + +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f6 + FMSUB f3, f20, f3, f7 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE +#endif + 
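For reference, here is a minimal C sketch of the per-element complex scaling that the FMUL/FMSUB/FMADD (non-CONJ) and FMUL/FMADD/FMSUB (CONJ) sequences above implement. It is illustrative only: the function and variable names are not from this patch, and it assumes the diagonal factor d has already been inverted, as these trsm kernels appear to expect from the packing step.

#include <complex.h>

/* Multiply x by d (or by conj(d) when conj_d is nonzero), using the same
   real/imaginary pairing as the kernel's FMSUB/FMADD instructions:
   re = dr*xr - di*xi, im = dr*xi + di*xr. */
static double _Complex zdiag_scale(double _Complex d, double _Complex x, int conj_d)
{
    double dr = creal(d);
    double di = conj_d ? -cimag(d) : cimag(d);
    double xr = creal(x), xi = cimag(x);

    return (dr * xr - di * xi) + (dr * xi + di * xr) * I;
}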
+#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 1 + ZBASE_SHIFT + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + addic. I, I, -1 + bgt .L31 + .align 4 + +.L40: + andi. I, M, 1 + ble .L49 + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, r0 + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L45 + .align 4 + +.L42: + fmadd f0, f16, f20, f0 + LFD f23, 3 * SIZE(BO) + fmadd f3, f16, f21, f3 + LFD f16, 4 * SIZE(AO) + fmadd f2, f17, f20, f2 + LFD f20, 4 * SIZE(BO) + fmadd f1, f17, f21, f1 + LFD f17, 5 * SIZE(AO) + + fmadd f4, f18, f22, f4 + LFD f21, 5 * SIZE(BO) + fmadd f7, f18, f23, f7 + LFD f18, 6 * SIZE(AO) + fmadd f6, f19, f22, f6 + LFD f22, 6 * SIZE(BO) + fmadd f5, f19, f23, f5 + LFD f19, 7 * SIZE(AO) + + fmadd f0, f16, f20, f0 + LFD f23, 7 * SIZE(BO) + fmadd f3, f16, f21, f3 + LFDU f16, 8 * SIZE(AO) + fmadd f2, f17, f20, f2 + LFDU f20, 8 * SIZE(BO) + fmadd f1, f17, f21, f1 + LFD f17, 1 * SIZE(AO) + + fmadd f4, f18, f22, f4 + LFD f21, 1 * SIZE(BO) + fmadd f7, f18, f23, f7 + LFD f18, 2 * SIZE(AO) + fmadd f6, f19, f22, f6 + LFD f22, 2 * SIZE(BO) + fmadd f5, f19, f23, f5 + LFD f19, 3 * SIZE(AO) + bdnz .L42 + .align 4 + +.L45: + fadd f0, f0, f4 + fadd f1, f1, f5 + fadd f2, f2, f6 + fadd f3, f3, f7 + +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR,r0 + ble .L47 + .align 4 + +.L46: + fmadd f0, f16, f20, f0 + LFD f21, 1 * SIZE(BO) + fmadd f3, f16, f21, f3 + LFDU f16, 2 * SIZE(AO) + fmadd f2, f17, f20, f2 + LFDU f20, 2 * SIZE(BO) + fmadd f1, f17, f21, f1 + LFD f17, 1 * SIZE(AO) + bdnz .L46 + .align 4 + +.L47: +#ifndef CONJ + FSUB f0, f0, f1 + FADD f1, f2, f3 +#else + FADD f0, f0, f1 + FSUB f1, f3, f2 +#endif + +#if defined(LN) || defined(RT) + subi r0, KK, 1 + slwi r0, r0, 0 + ZBASE_SHIFT + + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 +#endif +#endif + +#ifdef LN + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 +#endif +#endif + +#ifdef RT + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + +#ifndef LN + addi CO1, CO1, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 0 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +.L49: +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 1 +#endif + +#ifdef RT + subi KK, KK, 1 +#endif + .align 4 + + +.L30: + srawi. J, N, 1 + ble .L999 + .align 4 + +.L10: +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + sub B, B, r0 + + slwi r0, LDC, 1 + sub C, C, r0 +#endif + + mr CO1, C + add CO2, C, LDC + +#ifdef LN + add KK, M, OFFSET +#endif + +#ifdef LT + mr KK, OFFSET +#endif + + srawi. I, M, 1 +#if defined(LN) || defined(RT) + mr AORIG, A +#else + mr AO, A +#endif +#ifndef RT + add C, CO2, LDC +#endif + ble .L20 + .align 4 + +.L11: +#if defined(LT) || defined(RN) + LFD A1, 0 * SIZE(AO) + LFD A2, 1 * SIZE(AO) + LFD A4, 4 * SIZE(AO) + LFD A5, 8 * SIZE(AO) + + LFD B1, 0 * SIZE(B) + LFD B2, 1 * SIZE(B) + LFD B3, 2 * SIZE(B) + LFD B4, 3 * SIZE(B) + LFD B5, 4 * SIZE(B) + LFD B6, 8 * SIZE(B) + LFD B7, 12 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + srawi. 
r0, KK, 2 + mtspr CTR, r0 + mr BO, B +#else + +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, TEMP + + sub TEMP, K, KK + + LFD A1, 0 * SIZE(AO) + LFD A2, 1 * SIZE(AO) + LFD A4, 4 * SIZE(AO) + LFD A5, 8 * SIZE(AO) + + LFD B1, 0 * SIZE(BO) + LFD B2, 1 * SIZE(BO) + LFD B3, 2 * SIZE(BO) + LFD B4, 3 * SIZE(BO) + LFD B5, 4 * SIZE(BO) + LFD B6, 8 * SIZE(BO) + LFD B7, 12 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + fmr f8, f0 + fmr f9, f0 + fmr f10, f0 + fmr f11, f0 + + fmr f12, f0 + fmr f13, f0 + fmr f14, f0 + fmr f15, f0 + + srawi. r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L15 + .align 4 + +.L12: + FMADD f0, A1, B1, f0 + LFD A3, 2 * SIZE(AO) + FMADD f4, A1, B2, f4 + LFD A6, 12 * SIZE(AO) + FMADD f8, A1, B3, f8 + nop + FMADD f12, A1, B4, f12 + nop + + FMADD f1, A2, B1, f1 + LFD A1, 3 * SIZE(AO) + FMADD f5, A2, B2, f5 + nop + FMADD f9, A2, B3, f9 + nop + FMADD f13, A2, B4, f13 + nop + + FMADD f2, A3, B1, f2 + nop + FMADD f6, A3, B2, f6 + LFD B8, 5 * SIZE(BO) + FMADD f10, A3, B3, f10 + LFD B9, 6 * SIZE(BO) + FMADD f14, A3, B4, f14 + LFD B10, 7 * SIZE(BO) + + FMADD f3, A1, B1, f3 + LFD A2, 5 * SIZE(AO) + FMADD f7, A1, B2, f7 + LFD B1, 16 * SIZE(BO) + FMADD f11, A1, B3, f11 + nop + FMADD f15, A1, B4, f15 + nop + + FMADD f0, A4, B5, f0 + LFD A3, 6 * SIZE(AO) + FMADD f4, A4, B8, f4 + LFD A1, 16 * SIZE(AO) + FMADD f8, A4, B9, f8 + nop + FMADD f12, A4, B10, f12 + nop + + FMADD f1, A2, B5, f1 + LFD A4, 7 * SIZE(AO) + FMADD f5, A2, B8, f5 + nop + FMADD f9, A2, B9, f9 + nop + FMADD f13, A2, B10, f13 + nop + + FMADD f2, A3, B5, f2 + nop + FMADD f6, A3, B8, f6 + LFD B2, 9 * SIZE(BO) + FMADD f10, A3, B9, f10 + LFD B3, 10 * SIZE(BO) + FMADD f14, A3, B10, f14 + LFD B4, 11 * SIZE(BO) + + FMADD f3, A4, B5, f3 + LFD A2, 9 * SIZE(AO) + FMADD f7, A4, B8, f7 + LFD B5, 20 * SIZE(BO) + FMADD f11, A4, B9, f11 + nop + FMADD f15, A4, B10, f15 + nop + + FMADD f0, A5, B6, f0 + LFD A3, 10 * SIZE(AO) + FMADD f4, A5, B2, f4 + LFD A4, 20 * SIZE(AO) + FMADD f8, A5, B3, f8 + nop + FMADD f12, A5, B4, f12 + nop + + FMADD f1, A2, B6, f1 + LFD A5, 11 * SIZE(AO) + FMADD f5, A2, B2, f5 + nop + FMADD f9, A2, B3, f9 + nop + FMADD f13, A2, B4, f13 + nop + + FMADD f2, A3, B6, f2 + nop + FMADD f6, A3, B2, f6 + LFD B8, 13 * SIZE(BO) + FMADD f10, A3, B3, f10 + LFD B9, 14 * SIZE(BO) + FMADD f14, A3, B4, f14 + LFD B10,15 * SIZE(BO) + + FMADD f3, A5, B6, f3 + LFD A2, 13 * SIZE(AO) + FMADD f7, A5, B2, f7 + LFD B6, 24 * SIZE(BO) + FMADD f11, A5, B3, f11 + nop + FMADD f15, A5, B4, f15 + nop + + FMADD f0, A6, B7, f0 + LFD A3, 14 * SIZE(AO) + FMADD f4, A6, B8, f4 + LFD A5, 24 * SIZE(AO) + FMADD f8, A6, B9, f8 + nop + FMADD f12, A6, B10, f12 + nop + + FMADD f1, A2, B7, f1 + LFD A6, 15 * SIZE(AO) + FMADD f5, A2, B8, f5 + nop + FMADD f9, A2, B9, f9 + nop + FMADD f13, A2, B10, f13 + nop + + FMADD f2, A3, B7, f2 + addi AO, AO, 16 * SIZE + FMADD f6, A3, B8, f6 + LFD B2, 17 * SIZE(BO) + FMADD f10, A3, B9, f10 + LFD B3, 18 * SIZE(BO) + FMADD f14, A3, B10, f14 + LFD B4, 19 * SIZE(BO) + + FMADD f3, A6, B7, f3 + LFD A2, 1 * SIZE(AO) + FMADD f7, A6, B8, f7 + LFD B7, 28 * SIZE(BO) + FMADD f11, A6, B9, f11 + addi BO, BO, 16 * SIZE + FMADD f15, A6, B10, f15 + bdnz .L12 + .align 4 + +.L15: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. 
r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble .LKERNEL_MainFinish + .align 4 + +.L16: + FMADD f0, A1, B1, f0 + LFD A3, 2 * SIZE(AO) + FMADD f4, A1, B2, f4 + FMADD f8, A1, B3, f8 + FMADD f12, A1, B4, f12 + LFD A4, 3 * SIZE(AO) + + FMADD f1, A2, B1, f1 + FMADD f5, A2, B2, f5 + FMADD f9, A2, B3, f9 + FMADD f13, A2, B4, f13 + LFDU A1, 4 * SIZE(AO) + + FMADD f2, A3, B1, f2 + FMADD f6, A3, B2, f6 + FMADD f10, A3, B3, f10 + FMADD f14, A3, B4, f14 + LFD A2, 1 * SIZE(AO) + + FMADD f3, A4, B1, f3 + LFDU B1, 4 * SIZE(BO) + FMADD f7, A4, B2, f7 + LFD B2, 1 * SIZE(BO) + FMADD f11, A4, B3, f11 + LFD B3, 2 * SIZE(BO) + FMADD f15, A4, B4, f15 + LFD B4, 3 * SIZE(BO) + bdnz .L16 + .align 4 + +.LKERNEL_MainFinish: +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 + + FSUB f8, f8, f13 + FADD f9, f9, f12 + FSUB f10, f10, f15 + FADD f11, f11, f14 + +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 + + FADD f8, f8, f13 + FSUB f9, f12, f9 + FADD f10, f10, f15 + FSUB f11, f14, f11 +#endif + +#if defined(LN) || defined(RT) + subi r0, KK, 2 + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, r0 +#endif + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + LFD f20, 4 * SIZE(BO) + LFD f21, 5 * SIZE(BO) + LFD f22, 6 * SIZE(BO) + LFD f23, 7 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f8, f18, f8 + FSUB f9, f19, f9 + + FSUB f2, f20, f2 + FSUB f3, f21, f3 + FSUB f10, f22, f10 + FSUB f11, f23, f11 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 4 * SIZE(AO) + LFD f21, 5 * SIZE(AO) + LFD f22, 6 * SIZE(AO) + LFD f23, 7 * SIZE(AO) + +#ifndef CONJ + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 + + FSUB f8, f20, f8 + FSUB f9, f21, f9 + FSUB f10, f22, f10 + FSUB f11, f23, f11 +#else + FSUB f0, f16, f0 + FADD f1, f17, f1 + FSUB f2, f18, f2 + FADD f3, f19, f3 + + FSUB f8, f20, f8 + FADD f9, f21, f9 + FSUB f10, f22, f10 + FADD f11, f23, f11 +#endif +#endif + +#ifdef LN + LFD f16, 6 * SIZE(AO) + LFD f17, 7 * SIZE(AO) + LFD f18, 4 * SIZE(AO) + LFD f19, 5 * SIZE(AO) + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f6, f17, f3 + FMUL f7, f17, f2 + FMUL f14, f17, f11 + FMUL f15, f17, f10 + +#ifndef CONJ + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + FMSUB f10, f16, f10, f14 + FMADD f11, f16, f11, f15 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + FMADD f8, f19, f11, f8 + FNMSUB f9, f19, f10, f9 + + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + FNMSUB f8, f18, f10, f8 + FNMSUB f9, f18, f11, f9 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f9 + FMUL f13, f21, f8 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f8, f20, f8, f12 + FMADD f9, f20, f9, f13 + +#else + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + FMADD f10, f16, f10, f14 + FMSUB f11, f16, f11, f15 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + FMSUB f8, f19, f11, f8 + FNMADD f9, f19, f10, f9 + + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + FNMADD f8, f18, f10, f8 + FNMADD f9, f18, f11, f9 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f9 + FMUL f13, f21, f8 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f8, f20, f8, f12 + FMSUB f9, f20, f9, f13 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + LFD f20, 6 * SIZE(AO) + LFD f21, 7 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, 
f17, f0 + FMUL f12, f17, f9 + FMUL f13, f17, f8 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f8, f16, f8, f12 + FMADD f9, f16, f9, f13 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + FMADD f10, f19, f9, f10 + FNMSUB f11, f19, f8, f11 + + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + FNMSUB f10, f18, f8, f10 + FNMSUB f11, f18, f9, f11 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMUL f12, f21, f11 + FMUL f13, f21, f10 + + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 + FMSUB f10, f20, f10, f12 + FMADD f11, f20, f11, f13 + +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f8, f16, f8, f12 + FMSUB f9, f16, f9, f13 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + FMSUB f10, f19, f9, f10 + FNMADD f11, f19, f8, f11 + + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + FNMADD f10, f18, f8, f10 + FNMADD f11, f18, f9, f11 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + FMUL f12, f21, f11 + FMUL f13, f21, f10 + + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 + FMADD f10, f20, f10, f12 + FMSUB f11, f20, f11, f13 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + LFD f20, 6 * SIZE(BO) + LFD f21, 7 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f6, f17, f3 + FMUL f7, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB f2, f16, f2, f6 + FMADD f3, f16, f3, f7 + + FMADD f8, f19, f1, f8 + FNMSUB f9, f19, f0, f9 + FMADD f10, f19, f3, f10 + FNMSUB f11, f19, f2, f11 + + FNMSUB f8, f18, f0, f8 + FNMSUB f9, f18, f1, f9 + FNMSUB f10, f18, f2, f10 + FNMSUB f11, f18, f3, f11 + + FMUL f4, f21, f9 + FMUL f5, f21, f8 + FMUL f6, f21, f11 + FMUL f7, f21, f10 + + FMSUB f8, f20, f8, f4 + FMADD f9, f20, f9, f5 + FMSUB f10, f20, f10, f6 + FMADD f11, f20, f11, f7 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f6 + FMSUB f3, f16, f3, f7 + + FMSUB f8, f19, f1, f8 + FNMADD f9, f19, f0, f9 + FMSUB f10, f19, f3, f10 + FNMADD f11, f19, f2, f11 + + FNMADD f8, f18, f0, f8 + FNMADD f9, f18, f1, f9 + FNMADD f10, f18, f2, f10 + FNMADD f11, f18, f3, f11 + + FMUL f4, f21, f9 + FMUL f5, f21, f8 + FMUL f6, f21, f11 + FMUL f7, f21, f10 + + FMADD f8, f20, f8, f4 + FMSUB f9, f20, f9, f5 + FMADD f10, f20, f10, f6 + FMSUB f11, f20, f11, f7 +#endif +#endif + +#ifdef RT + LFD f16, 6 * SIZE(BO) + LFD f17, 7 * SIZE(BO) + LFD f18, 4 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f12, f17, f9 + FMUL f13, f17, f8 + FMUL f14, f17, f11 + FMUL f15, f17, f10 + +#ifndef CONJ + FMSUB f8, f16, f8, f12 + FMADD f9, f16, f9, f13 + FMSUB f10, f16, f10, f14 + FMADD f11, f16, f11, f15 + + FMADD f0, f19, f9, f0 + FNMSUB f1, f19, f8, f1 + FMADD f2, f19, f11, f2 + FNMSUB f3, f19, f10, f3 + + FNMSUB f0, f18, f8, f0 + FNMSUB f1, f18, f9, f1 + FNMSUB f2, f18, f10, f2 + FNMSUB f3, f18, f11, f3 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f6 + FMADD f3, f20, f3, f7 + +#else + FMADD f8, f16, f8, f12 + FMSUB f9, f16, f9, f13 + FMADD f10, f16, f10, f14 + FMSUB f11, f16, f11, f15 + + FMSUB f0, f19, f9, f0 + FNMADD f1, f19, f8, f1 + FMSUB f2, f19, f11, f2 + FNMADD f3, f19, f10, f3 + + FNMADD f0, f18, f8, f0 + FNMADD f1, f18, f9, f1 + FNMADD f2, f18, f10, f2 + FNMADD f3, f18, f11, f3 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f6, f21, f3 + FMUL f7, f21, f2 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f6 + FMSUB f3, 
f20, f3, f7 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 4 * SIZE + subi CO2, CO2, 4 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f8, 2 * SIZE(BO) + STFD f9, 3 * SIZE(BO) + + STFD f2, 4 * SIZE(BO) + STFD f3, 5 * SIZE(BO) + STFD f10, 6 * SIZE(BO) + STFD f11, 7 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) + + STFD f8, 4 * SIZE(AO) + STFD f9, 5 * SIZE(AO) + STFD f10, 6 * SIZE(AO) + STFD f11, 7 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 2 * SIZE(CO1) + STFD f3, 3 * SIZE(CO1) + + STFD f8, 0 * SIZE(CO2) + STFD f9, 1 * SIZE(CO2) + STFD f10, 2 * SIZE(CO2) + STFD f11, 3 * SIZE(CO2) + +#ifndef LN + addi CO1, CO1, 4 * SIZE + addi CO2, CO2, 4 * SIZE +#endif + +#ifdef RT + slwi r0, K, 1 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, TEMP + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 2 +#endif + +#ifdef LN + subi KK, KK, 2 +#endif + + addic. I, I, -1 + bgt .L11 + .align 4 + +.L20: + andi. I, M, 1 + ble .L29 + +#if defined(LT) || defined(RN) + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(B) + LFD f21, 1 * SIZE(B) + LFD f22, 2 * SIZE(B) + LFD f23, 3 * SIZE(B) + + LFD f24, 4 * SIZE(B) + LFD f25, 5 * SIZE(B) + LFD f26, 6 * SIZE(B) + LFD f27, 7 * SIZE(B) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. r0, KK, 2 + mr BO, B + mtspr CTR, r0 +#else + +#ifdef LN + slwi r0, K, 0 + ZBASE_SHIFT + sub AORIG, AORIG, r0 +#endif + + slwi r0, KK, 0 + ZBASE_SHIFT + slwi TEMP, KK, 1 + ZBASE_SHIFT + add AO, AORIG, r0 + add BO, B, TEMP + + sub TEMP, K, KK + + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f18, 2 * SIZE(AO) + LFD f19, 3 * SIZE(AO) + + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + LFD f22, 2 * SIZE(BO) + LFD f23, 3 * SIZE(BO) + + LFD f24, 4 * SIZE(BO) + LFD f25, 5 * SIZE(BO) + LFD f26, 6 * SIZE(BO) + LFD f27, 7 * SIZE(BO) + + lfs f0, FZERO + fmr f1, f0 + fmr f2, f0 + fmr f3, f0 + + fmr f4, f0 + fmr f5, f0 + fmr f6, f0 + fmr f7, f0 + + srawi. 
r0, TEMP, 2 + mtspr CTR, r0 +#endif + ble .L25 + .align 4 + +.L22: + fmadd f0, f16, f20, f0 + LFD f19, 3 * SIZE(AO) + fmadd f1, f16, f21, f1 + nop + fmadd f2, f16, f22, f2 + nop + fmadd f3, f16, f23, f3 + LFD f16, 4 * SIZE(AO) + + fmadd f4, f17, f20, f4 + LFD f20, 8 * SIZE(BO) + fmadd f5, f17, f21, f5 + LFD f21, 9 * SIZE(BO) + fmadd f6, f17, f22, f6 + LFD f22, 10 * SIZE(BO) + fmadd f7, f17, f23, f7 + LFD f23, 11 * SIZE(BO) + + fmadd f0, f18, f24, f0 + LFD f17, 5 * SIZE(AO) + fmadd f1, f18, f25, f1 + nop + fmadd f2, f18, f26, f2 + nop + fmadd f3, f18, f27, f3 + LFD f18, 6 * SIZE(AO) + + fmadd f4, f19, f24, f4 + LFD f24, 12 * SIZE(BO) + fmadd f5, f19, f25, f5 + LFD f25, 13 * SIZE(BO) + fmadd f6, f19, f26, f6 + LFD f26, 14 * SIZE(BO) + fmadd f7, f19, f27, f7 + LFD f27, 15 * SIZE(BO) + + fmadd f0, f16, f20, f0 + LFD f19, 7 * SIZE(AO) + fmadd f1, f16, f21, f1 + nop + fmadd f2, f16, f22, f2 + nop + fmadd f3, f16, f23, f3 + LFDU f16, 8 * SIZE(AO) + + fmadd f4, f17, f20, f4 + LFDU f20, 16 * SIZE(BO) + fmadd f5, f17, f21, f5 + LFD f21, 1 * SIZE(BO) + fmadd f6, f17, f22, f6 + LFD f22, 2 * SIZE(BO) + fmadd f7, f17, f23, f7 + LFD f23, 3 * SIZE(BO) + + fmadd f0, f18, f24, f0 + LFD f17, 1 * SIZE(AO) + fmadd f1, f18, f25, f1 + nop + fmadd f2, f18, f26, f2 + nop + fmadd f3, f18, f27, f3 + LFD f18, 2 * SIZE(AO) + + fmadd f4, f19, f24, f4 + LFD f24, 4 * SIZE(BO) + fmadd f5, f19, f25, f5 + LFD f25, 5 * SIZE(BO) + fmadd f6, f19, f26, f6 + LFD f26, 6 * SIZE(BO) + fmadd f7, f19, f27, f7 + LFD f27, 7 * SIZE(BO) + bdnz .L22 + .align 4 + +.L25: +#if defined(LT) || defined(RN) + andi. r0, KK, 3 +#else + andi. r0, TEMP, 3 +#endif + mtspr CTR, r0 + ble .L27 + .align 4 + +.L26: + fmadd f0, f16, f20, f0 + LFD f17, 1 * SIZE(AO) + fmadd f1, f16, f21, f1 + nop + fmadd f2, f16, f22, f2 + nop + fmadd f3, f16, f23, f3 + LFDU f16, 2 * SIZE(AO) + + fmadd f4, f17, f20, f4 + LFDU f20, 4 * SIZE(BO) + fmadd f5, f17, f21, f5 + LFD f21, 1 * SIZE(BO) + fmadd f6, f17, f22, f6 + LFD f22, 2 * SIZE(BO) + fmadd f7, f17, f23, f7 + LFD f23, 3 * SIZE(BO) + bdnz .L26 + .align 4 + +.L27: +#ifndef CONJ + FSUB f0, f0, f5 + FADD f1, f1, f4 + FSUB f2, f2, f7 + FADD f3, f3, f6 +#else +#if defined(LN) || defined(LT) + FADD f0, f0, f5 + FSUB f1, f1, f4 + FADD f2, f2, f7 + FSUB f3, f3, f6 +#else + FADD f0, f0, f5 + FSUB f1, f4, f1 + FADD f2, f2, f7 + FSUB f3, f6, f3 +#endif +#endif + +#if defined(LN) || defined(RT) +#ifdef LN + subi r0, KK, 1 +#else + subi r0, KK, 2 +#endif + slwi TEMP, r0, 0 + ZBASE_SHIFT + slwi r0, r0, 1 + ZBASE_SHIFT + add AO, AORIG, TEMP + add BO, B, r0 +#endif + + +#if defined(LN) || defined(LT) + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f18, f2 + FSUB f3, f19, f3 +#else + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + LFD f20, 2 * SIZE(AO) + LFD f21, 3 * SIZE(AO) + + FSUB f0, f16, f0 + FSUB f1, f17, f1 + FSUB f2, f20, f2 + FSUB f3, f21, f3 +#endif + +#ifdef LN + LFD f20, 0 * SIZE(AO) + LFD f21, 1 * SIZE(AO) + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + FMUL f12, f21, f3 + FMUL f13, f21, f2 + +#ifndef CONJ + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 + FMSUB f2, f20, f2, f12 + FMADD f3, f20, f3, f13 +#else + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 + FMADD f2, f20, f2, f12 + FMSUB f3, f20, f3, f13 +#endif +#endif + +#ifdef LT + LFD f16, 0 * SIZE(AO) + LFD f17, 1 * SIZE(AO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + FMUL f12, f17, f3 + FMUL f13, f17, f2 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + FMSUB 
f2, f16, f2, f12 + FMADD f3, f16, f3, f13 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + FMADD f2, f16, f2, f12 + FMSUB f3, f16, f3, f13 +#endif +#endif + +#ifdef RN + LFD f16, 0 * SIZE(BO) + LFD f17, 1 * SIZE(BO) + LFD f18, 2 * SIZE(BO) + LFD f19, 3 * SIZE(BO) + LFD f20, 6 * SIZE(BO) + LFD f21, 7 * SIZE(BO) + + FMUL f4, f17, f1 + FMUL f5, f17, f0 + +#ifndef CONJ + FMSUB f0, f16, f0, f4 + FMADD f1, f16, f1, f5 + + FMADD f2, f19, f1, f2 + FNMSUB f3, f19, f0, f3 + + FNMSUB f2, f18, f0, f2 + FNMSUB f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + + FMSUB f2, f20, f2, f4 + FMADD f3, f20, f3, f5 +#else + FMADD f0, f16, f0, f4 + FMSUB f1, f16, f1, f5 + + FMSUB f2, f19, f1, f2 + FNMADD f3, f19, f0, f3 + + FNMADD f2, f18, f0, f2 + FNMADD f3, f18, f1, f3 + + FMUL f4, f21, f3 + FMUL f5, f21, f2 + + FMADD f2, f20, f2, f4 + FMSUB f3, f20, f3, f5 +#endif +#endif + +#ifdef RT + LFD f16, 6 * SIZE(BO) + LFD f17, 7 * SIZE(BO) + LFD f18, 4 * SIZE(BO) + LFD f19, 5 * SIZE(BO) + LFD f20, 0 * SIZE(BO) + LFD f21, 1 * SIZE(BO) + + FMUL f12, f17, f3 + FMUL f13, f17, f2 + +#ifndef CONJ + FMSUB f2, f16, f2, f12 + FMADD f3, f16, f3, f13 + + FMADD f0, f19, f3, f0 + FNMSUB f1, f19, f2, f1 + + FNMSUB f0, f18, f2, f0 + FNMSUB f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + + FMSUB f0, f20, f0, f4 + FMADD f1, f20, f1, f5 +#else + FMADD f2, f16, f2, f12 + FMSUB f3, f16, f3, f13 + + FMSUB f0, f19, f3, f0 + FNMADD f1, f19, f2, f1 + + FNMADD f0, f18, f2, f0 + FNMADD f1, f18, f3, f1 + + FMUL f4, f21, f1 + FMUL f5, f21, f0 + + FMADD f0, f20, f0, f4 + FMSUB f1, f20, f1, f5 +#endif +#endif + +#ifdef LN + subi CO1, CO1, 2 * SIZE + subi CO2, CO2, 2 * SIZE +#endif + +#if defined(LN) || defined(LT) + STFD f0, 0 * SIZE(BO) + STFD f1, 1 * SIZE(BO) + STFD f2, 2 * SIZE(BO) + STFD f3, 3 * SIZE(BO) +#else + STFD f0, 0 * SIZE(AO) + STFD f1, 1 * SIZE(AO) + STFD f2, 2 * SIZE(AO) + STFD f3, 3 * SIZE(AO) +#endif + + STFD f0, 0 * SIZE(CO1) + STFD f1, 1 * SIZE(CO1) + STFD f2, 0 * SIZE(CO2) + STFD f3, 1 * SIZE(CO2) + +#ifndef LN + addi CO1, CO1, 2 * SIZE + addi CO2, CO2, 2 * SIZE +#endif + +#ifdef RT + slwi r0, K, 0 + ZBASE_SHIFT + add AORIG, AORIG, r0 +#endif + +#if defined(LT) || defined(RN) + sub TEMP, K, KK + slwi r0, TEMP, 0 + ZBASE_SHIFT + slwi TEMP, TEMP, 1 + ZBASE_SHIFT + add AO, AO, r0 + add BO, BO, TEMP +#endif + +#ifdef LT + addi KK, KK, 1 +#endif + +#ifdef LN + subi KK, KK, 1 +#endif + .align 4 + +.L29: +#ifdef LN + slwi r0, K, 1 + ZBASE_SHIFT + add B, B, r0 +#endif + +#if defined(LT) || defined(RN) + mr B, BO +#endif + +#ifdef RN + addi KK, KK, 2 +#endif + +#ifdef RT + subi KK, KK, 2 +#endif + + addic. 
J, J, -1 + bgt .L10 + .align 4 + + +.L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c new file mode 100644 index 0000000000..0ab57f3b3d --- /dev/null +++ b/kernel/setparam-ref.c @@ -0,0 +1,819 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include +#include "common.h" + +#ifdef BUILD_KERNEL +#include "kernelTS.h" +#endif + +#undef DEBUG + +static void init_parameter(void); + +gotoblas_t TABLE_NAME = { + GEMM_DEFAULT_OFFSET_A, GEMM_DEFAULT_OFFSET_B, GEMM_DEFAULT_ALIGN, + + 0, 0, 0, + SGEMM_DEFAULT_UNROLL_M, SGEMM_DEFAULT_UNROLL_N, MAX(SGEMM_DEFAULT_UNROLL_M, SGEMM_DEFAULT_UNROLL_N), +#ifdef HAVE_EXCLUSIVE_CACHE + 1, +#else + 0, +#endif + + samax_kTS, samin_kTS, smax_kTS, smin_kTS, + isamax_kTS, isamin_kTS, ismax_kTS, ismin_kTS, + snrm2_kTS, sasum_kTS, scopy_kTS, sdot_kTS, + dsdot_kTS, + srot_kTS, saxpy_kTS, sscal_kTS, sswap_kTS, + sgemv_nTS, sgemv_tTS, sger_kTS, + ssymv_LTS, ssymv_UTS, + + sgemm_kernelTS, sgemm_betaTS, +#if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N + sgemm_incopyTS, sgemm_itcopyTS, +#else + sgemm_oncopyTS, sgemm_otcopyTS, +#endif + sgemm_oncopyTS, sgemm_otcopyTS, + strsm_kernel_LNTS, strsm_kernel_LTTS, strsm_kernel_RNTS, strsm_kernel_RTTS, +#if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N + strsm_iunucopyTS, strsm_iunncopyTS, strsm_iutucopyTS, strsm_iutncopyTS, + strsm_ilnucopyTS, strsm_ilnncopyTS, strsm_iltucopyTS, strsm_iltncopyTS, +#else + strsm_ounucopyTS, strsm_ounncopyTS, strsm_outucopyTS, strsm_outncopyTS, + strsm_olnucopyTS, strsm_olnncopyTS, strsm_oltucopyTS, strsm_oltncopyTS, +#endif + strsm_ounucopyTS, strsm_ounncopyTS, strsm_outucopyTS, strsm_outncopyTS, + strsm_olnucopyTS, strsm_olnncopyTS, strsm_oltucopyTS, strsm_oltncopyTS, + strmm_kernel_RNTS, strmm_kernel_RTTS, strmm_kernel_LNTS, strmm_kernel_LTTS, +#if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N + strmm_iunucopyTS, strmm_iunncopyTS, strmm_iutucopyTS, strmm_iutncopyTS, + strmm_ilnucopyTS, strmm_ilnncopyTS, strmm_iltucopyTS, strmm_iltncopyTS, +#else + strmm_ounucopyTS, strmm_ounncopyTS, strmm_outucopyTS, strmm_outncopyTS, + strmm_olnucopyTS, strmm_olnncopyTS, strmm_oltucopyTS, strmm_oltncopyTS, +#endif + strmm_ounucopyTS, strmm_ounncopyTS, strmm_outucopyTS, strmm_outncopyTS, + strmm_olnucopyTS, strmm_olnncopyTS, strmm_oltucopyTS, strmm_oltncopyTS, +#if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N + ssymm_iutcopyTS, ssymm_iltcopyTS, +#else + ssymm_outcopyTS, ssymm_oltcopyTS, +#endif + ssymm_outcopyTS, ssymm_oltcopyTS, + + sneg_tcopyTS, slaswp_ncopyTS, + + 0, 0, 0, + DGEMM_DEFAULT_UNROLL_M, DGEMM_DEFAULT_UNROLL_N, MAX(DGEMM_DEFAULT_UNROLL_M, DGEMM_DEFAULT_UNROLL_N), + + damax_kTS, damin_kTS, dmax_kTS, dmin_kTS, + idamax_kTS, idamin_kTS, idmax_kTS, idmin_kTS, + dnrm2_kTS, dasum_kTS, dcopy_kTS, ddot_kTS, + drot_kTS, daxpy_kTS, dscal_kTS, dswap_kTS, + dgemv_nTS, dgemv_tTS, dger_kTS, + dsymv_LTS, dsymv_UTS, + + dgemm_kernelTS, dgemm_betaTS, +#if DGEMM_DEFAULT_UNROLL_M != DGEMM_DEFAULT_UNROLL_N + dgemm_incopyTS, dgemm_itcopyTS, +#else + dgemm_oncopyTS, dgemm_otcopyTS, +#endif + dgemm_oncopyTS, dgemm_otcopyTS, + dtrsm_kernel_LNTS, dtrsm_kernel_LTTS, dtrsm_kernel_RNTS, dtrsm_kernel_RTTS, +#if DGEMM_DEFAULT_UNROLL_M != DGEMM_DEFAULT_UNROLL_N + dtrsm_iunucopyTS, dtrsm_iunncopyTS, dtrsm_iutucopyTS, dtrsm_iutncopyTS, + dtrsm_ilnucopyTS, dtrsm_ilnncopyTS, dtrsm_iltucopyTS, dtrsm_iltncopyTS, +#else + dtrsm_ounucopyTS, dtrsm_ounncopyTS, dtrsm_outucopyTS, dtrsm_outncopyTS, + dtrsm_olnucopyTS, dtrsm_olnncopyTS, dtrsm_oltucopyTS, dtrsm_oltncopyTS, +#endif + dtrsm_ounucopyTS, dtrsm_ounncopyTS, dtrsm_outucopyTS, dtrsm_outncopyTS, + dtrsm_olnucopyTS, dtrsm_olnncopyTS, dtrsm_oltucopyTS, dtrsm_oltncopyTS, + dtrmm_kernel_RNTS, dtrmm_kernel_RTTS, 
dtrmm_kernel_LNTS, dtrmm_kernel_LTTS, +#if DGEMM_DEFAULT_UNROLL_M != DGEMM_DEFAULT_UNROLL_N + dtrmm_iunucopyTS, dtrmm_iunncopyTS, dtrmm_iutucopyTS, dtrmm_iutncopyTS, + dtrmm_ilnucopyTS, dtrmm_ilnncopyTS, dtrmm_iltucopyTS, dtrmm_iltncopyTS, +#else + dtrmm_ounucopyTS, dtrmm_ounncopyTS, dtrmm_outucopyTS, dtrmm_outncopyTS, + dtrmm_olnucopyTS, dtrmm_olnncopyTS, dtrmm_oltucopyTS, dtrmm_oltncopyTS, +#endif + dtrmm_ounucopyTS, dtrmm_ounncopyTS, dtrmm_outucopyTS, dtrmm_outncopyTS, + dtrmm_olnucopyTS, dtrmm_olnncopyTS, dtrmm_oltucopyTS, dtrmm_oltncopyTS, +#if DGEMM_DEFAULT_UNROLL_M != DGEMM_DEFAULT_UNROLL_N + dsymm_iutcopyTS, dsymm_iltcopyTS, +#else + dsymm_outcopyTS, dsymm_oltcopyTS, +#endif + dsymm_outcopyTS, dsymm_oltcopyTS, + + dneg_tcopyTS, dlaswp_ncopyTS, + +#ifdef EXPRECISION + + 0, 0, 0, + QGEMM_DEFAULT_UNROLL_M, QGEMM_DEFAULT_UNROLL_N, MAX(QGEMM_DEFAULT_UNROLL_M, QGEMM_DEFAULT_UNROLL_N), + + qamax_kTS, qamin_kTS, qmax_kTS, qmin_kTS, + iqamax_kTS, iqamin_kTS, iqmax_kTS, iqmin_kTS, + qnrm2_kTS, qasum_kTS, qcopy_kTS, qdot_kTS, + qrot_kTS, qaxpy_kTS, qscal_kTS, qswap_kTS, + qgemv_nTS, qgemv_tTS, qger_kTS, + qsymv_LTS, qsymv_UTS, + + qgemm_kernelTS, qgemm_betaTS, +#if QGEMM_DEFAULT_UNROLL_M != QGEMM_DEFAULT_UNROLL_N + qgemm_incopyTS, qgemm_itcopyTS, +#else + qgemm_oncopyTS, qgemm_otcopyTS, +#endif + qgemm_oncopyTS, qgemm_otcopyTS, + qtrsm_kernel_LNTS, qtrsm_kernel_LTTS, qtrsm_kernel_RNTS, qtrsm_kernel_RTTS, +#if QGEMM_DEFAULT_UNROLL_M != QGEMM_DEFAULT_UNROLL_N + qtrsm_iunucopyTS, qtrsm_iunncopyTS, qtrsm_iutucopyTS, qtrsm_iutncopyTS, + qtrsm_ilnucopyTS, qtrsm_ilnncopyTS, qtrsm_iltucopyTS, qtrsm_iltncopyTS, +#else + qtrsm_ounucopyTS, qtrsm_ounncopyTS, qtrsm_outucopyTS, qtrsm_outncopyTS, + qtrsm_olnucopyTS, qtrsm_olnncopyTS, qtrsm_oltucopyTS, qtrsm_oltncopyTS, +#endif + qtrsm_ounucopyTS, qtrsm_ounncopyTS, qtrsm_outucopyTS, qtrsm_outncopyTS, + qtrsm_olnucopyTS, qtrsm_olnncopyTS, qtrsm_oltucopyTS, qtrsm_oltncopyTS, + qtrmm_kernel_RNTS, qtrmm_kernel_RTTS, qtrmm_kernel_LNTS, qtrmm_kernel_LTTS, +#if QGEMM_DEFAULT_UNROLL_M != QGEMM_DEFAULT_UNROLL_N + qtrmm_iunucopyTS, qtrmm_iunncopyTS, qtrmm_iutucopyTS, qtrmm_iutncopyTS, + qtrmm_ilnucopyTS, qtrmm_ilnncopyTS, qtrmm_iltucopyTS, qtrmm_iltncopyTS, +#else + qtrmm_ounucopyTS, qtrmm_ounncopyTS, qtrmm_outucopyTS, qtrmm_outncopyTS, + qtrmm_olnucopyTS, qtrmm_olnncopyTS, qtrmm_oltucopyTS, qtrmm_oltncopyTS, +#endif + qtrmm_ounucopyTS, qtrmm_ounncopyTS, qtrmm_outucopyTS, qtrmm_outncopyTS, + qtrmm_olnucopyTS, qtrmm_olnncopyTS, qtrmm_oltucopyTS, qtrmm_oltncopyTS, +#if QGEMM_DEFAULT_UNROLL_M != QGEMM_DEFAULT_UNROLL_N + qsymm_iutcopyTS, qsymm_iltcopyTS, +#else + qsymm_outcopyTS, qsymm_oltcopyTS, +#endif + qsymm_outcopyTS, qsymm_oltcopyTS, + + qneg_tcopyTS, qlaswp_ncopyTS, + +#endif + + 0, 0, 0, + CGEMM_DEFAULT_UNROLL_M, CGEMM_DEFAULT_UNROLL_N, MAX(CGEMM_DEFAULT_UNROLL_M, CGEMM_DEFAULT_UNROLL_N), + + camax_kTS, camin_kTS, icamax_kTS, icamin_kTS, + cnrm2_kTS, casum_kTS, ccopy_kTS, + cdotu_kTS, cdotc_kTS, csrot_kTS, + caxpy_kTS, caxpyc_kTS, cscal_kTS, cswap_kTS, + + cgemv_nTS, cgemv_tTS, cgemv_rTS, cgemv_cTS, + cgemv_oTS, cgemv_uTS, cgemv_sTS, cgemv_dTS, + cgeru_kTS, cgerc_kTS, cgerv_kTS, cgerd_kTS, + csymv_LTS, csymv_UTS, + chemv_LTS, chemv_UTS, chemv_MTS, chemv_VTS, + + cgemm_kernel_nTS, cgemm_kernel_lTS, cgemm_kernel_rTS, cgemm_kernel_bTS, + cgemm_betaTS, + +#if CGEMM_DEFAULT_UNROLL_M != CGEMM_DEFAULT_UNROLL_N + cgemm_incopyTS, cgemm_itcopyTS, +#else + cgemm_oncopyTS, cgemm_otcopyTS, +#endif + cgemm_oncopyTS, cgemm_otcopyTS, + + ctrsm_kernel_LNTS, ctrsm_kernel_LTTS, 
ctrsm_kernel_LRTS, ctrsm_kernel_LCTS, + ctrsm_kernel_RNTS, ctrsm_kernel_RTTS, ctrsm_kernel_RRTS, ctrsm_kernel_RCTS, + +#if CGEMM_DEFAULT_UNROLL_M != CGEMM_DEFAULT_UNROLL_N + ctrsm_iunucopyTS, ctrsm_iunncopyTS, ctrsm_iutucopyTS, ctrsm_iutncopyTS, + ctrsm_ilnucopyTS, ctrsm_ilnncopyTS, ctrsm_iltucopyTS, ctrsm_iltncopyTS, +#else + ctrsm_ounucopyTS, ctrsm_ounncopyTS, ctrsm_outucopyTS, ctrsm_outncopyTS, + ctrsm_olnucopyTS, ctrsm_olnncopyTS, ctrsm_oltucopyTS, ctrsm_oltncopyTS, +#endif + ctrsm_ounucopyTS, ctrsm_ounncopyTS, ctrsm_outucopyTS, ctrsm_outncopyTS, + ctrsm_olnucopyTS, ctrsm_olnncopyTS, ctrsm_oltucopyTS, ctrsm_oltncopyTS, + + ctrmm_kernel_RNTS, ctrmm_kernel_RTTS, ctrmm_kernel_RRTS, ctrmm_kernel_RCTS, + ctrmm_kernel_LNTS, ctrmm_kernel_LTTS, ctrmm_kernel_LRTS, ctrmm_kernel_LCTS, + +#if CGEMM_DEFAULT_UNROLL_M != CGEMM_DEFAULT_UNROLL_N + ctrmm_iunucopyTS, ctrmm_iunncopyTS, ctrmm_iutucopyTS, ctrmm_iutncopyTS, + ctrmm_ilnucopyTS, ctrmm_ilnncopyTS, ctrmm_iltucopyTS, ctrmm_iltncopyTS, +#else + ctrmm_ounucopyTS, ctrmm_ounncopyTS, ctrmm_outucopyTS, ctrmm_outncopyTS, + ctrmm_olnucopyTS, ctrmm_olnncopyTS, ctrmm_oltucopyTS, ctrmm_oltncopyTS, +#endif + ctrmm_ounucopyTS, ctrmm_ounncopyTS, ctrmm_outucopyTS, ctrmm_outncopyTS, + ctrmm_olnucopyTS, ctrmm_olnncopyTS, ctrmm_oltucopyTS, ctrmm_oltncopyTS, + +#if CGEMM_DEFAULT_UNROLL_M != CGEMM_DEFAULT_UNROLL_N + csymm_iutcopyTS, csymm_iltcopyTS, +#else + csymm_outcopyTS, csymm_oltcopyTS, +#endif + csymm_outcopyTS, csymm_oltcopyTS, +#if CGEMM_DEFAULT_UNROLL_M != CGEMM_DEFAULT_UNROLL_N + chemm_iutcopyTS, chemm_iltcopyTS, +#else + chemm_outcopyTS, chemm_oltcopyTS, +#endif + chemm_outcopyTS, chemm_oltcopyTS, + + cgemm3m_kernelTS, + + cgemm3m_incopybTS, cgemm3m_incopyrTS, + cgemm3m_incopyiTS, cgemm3m_itcopybTS, + cgemm3m_itcopyrTS, cgemm3m_itcopyiTS, + cgemm3m_oncopybTS, cgemm3m_oncopyrTS, + cgemm3m_oncopyiTS, cgemm3m_otcopybTS, + cgemm3m_otcopyrTS, cgemm3m_otcopyiTS, + + csymm3m_iucopybTS, csymm3m_ilcopybTS, + csymm3m_iucopyrTS, csymm3m_ilcopyrTS, + csymm3m_iucopyiTS, csymm3m_ilcopyiTS, + csymm3m_oucopybTS, csymm3m_olcopybTS, + csymm3m_oucopyrTS, csymm3m_olcopyrTS, + csymm3m_oucopyiTS, csymm3m_olcopyiTS, + + chemm3m_iucopybTS, chemm3m_ilcopybTS, + chemm3m_iucopyrTS, chemm3m_ilcopyrTS, + chemm3m_iucopyiTS, chemm3m_ilcopyiTS, + + chemm3m_oucopybTS, chemm3m_olcopybTS, + chemm3m_oucopyrTS, chemm3m_olcopyrTS, + chemm3m_oucopyiTS, chemm3m_olcopyiTS, + + cneg_tcopyTS, claswp_ncopyTS, + + 0, 0, 0, + ZGEMM_DEFAULT_UNROLL_M, ZGEMM_DEFAULT_UNROLL_N, MAX(ZGEMM_DEFAULT_UNROLL_M, ZGEMM_DEFAULT_UNROLL_N), + + zamax_kTS, zamin_kTS, izamax_kTS, izamin_kTS, + znrm2_kTS, zasum_kTS, zcopy_kTS, + zdotu_kTS, zdotc_kTS, zdrot_kTS, + zaxpy_kTS, zaxpyc_kTS, zscal_kTS, zswap_kTS, + + zgemv_nTS, zgemv_tTS, zgemv_rTS, zgemv_cTS, + zgemv_oTS, zgemv_uTS, zgemv_sTS, zgemv_dTS, + zgeru_kTS, zgerc_kTS, zgerv_kTS, zgerd_kTS, + zsymv_LTS, zsymv_UTS, + zhemv_LTS, zhemv_UTS, zhemv_MTS, zhemv_VTS, + + zgemm_kernel_nTS, zgemm_kernel_lTS, zgemm_kernel_rTS, zgemm_kernel_bTS, + zgemm_betaTS, + +#if ZGEMM_DEFAULT_UNROLL_M != ZGEMM_DEFAULT_UNROLL_N + zgemm_incopyTS, zgemm_itcopyTS, +#else + zgemm_oncopyTS, zgemm_otcopyTS, +#endif + zgemm_oncopyTS, zgemm_otcopyTS, + + ztrsm_kernel_LNTS, ztrsm_kernel_LTTS, ztrsm_kernel_LRTS, ztrsm_kernel_LCTS, + ztrsm_kernel_RNTS, ztrsm_kernel_RTTS, ztrsm_kernel_RRTS, ztrsm_kernel_RCTS, + +#if ZGEMM_DEFAULT_UNROLL_M != ZGEMM_DEFAULT_UNROLL_N + ztrsm_iunucopyTS, ztrsm_iunncopyTS, ztrsm_iutucopyTS, ztrsm_iutncopyTS, + ztrsm_ilnucopyTS, ztrsm_ilnncopyTS, ztrsm_iltucopyTS, 
ztrsm_iltncopyTS, +#else + ztrsm_ounucopyTS, ztrsm_ounncopyTS, ztrsm_outucopyTS, ztrsm_outncopyTS, + ztrsm_olnucopyTS, ztrsm_olnncopyTS, ztrsm_oltucopyTS, ztrsm_oltncopyTS, +#endif + ztrsm_ounucopyTS, ztrsm_ounncopyTS, ztrsm_outucopyTS, ztrsm_outncopyTS, + ztrsm_olnucopyTS, ztrsm_olnncopyTS, ztrsm_oltucopyTS, ztrsm_oltncopyTS, + + ztrmm_kernel_RNTS, ztrmm_kernel_RTTS, ztrmm_kernel_RRTS, ztrmm_kernel_RCTS, + ztrmm_kernel_LNTS, ztrmm_kernel_LTTS, ztrmm_kernel_LRTS, ztrmm_kernel_LCTS, + +#if ZGEMM_DEFAULT_UNROLL_M != ZGEMM_DEFAULT_UNROLL_N + ztrmm_iunucopyTS, ztrmm_iunncopyTS, ztrmm_iutucopyTS, ztrmm_iutncopyTS, + ztrmm_ilnucopyTS, ztrmm_ilnncopyTS, ztrmm_iltucopyTS, ztrmm_iltncopyTS, +#else + ztrmm_ounucopyTS, ztrmm_ounncopyTS, ztrmm_outucopyTS, ztrmm_outncopyTS, + ztrmm_olnucopyTS, ztrmm_olnncopyTS, ztrmm_oltucopyTS, ztrmm_oltncopyTS, +#endif + ztrmm_ounucopyTS, ztrmm_ounncopyTS, ztrmm_outucopyTS, ztrmm_outncopyTS, + ztrmm_olnucopyTS, ztrmm_olnncopyTS, ztrmm_oltucopyTS, ztrmm_oltncopyTS, + +#if ZGEMM_DEFAULT_UNROLL_M != ZGEMM_DEFAULT_UNROLL_N + zsymm_iutcopyTS, zsymm_iltcopyTS, +#else + zsymm_outcopyTS, zsymm_oltcopyTS, +#endif + zsymm_outcopyTS, zsymm_oltcopyTS, +#if ZGEMM_DEFAULT_UNROLL_M != ZGEMM_DEFAULT_UNROLL_N + zhemm_iutcopyTS, zhemm_iltcopyTS, +#else + zhemm_outcopyTS, zhemm_oltcopyTS, +#endif + zhemm_outcopyTS, zhemm_oltcopyTS, + + zgemm3m_kernelTS, + + zgemm3m_incopybTS, zgemm3m_incopyrTS, + zgemm3m_incopyiTS, zgemm3m_itcopybTS, + zgemm3m_itcopyrTS, zgemm3m_itcopyiTS, + zgemm3m_oncopybTS, zgemm3m_oncopyrTS, + zgemm3m_oncopyiTS, zgemm3m_otcopybTS, + zgemm3m_otcopyrTS, zgemm3m_otcopyiTS, + + zsymm3m_iucopybTS, zsymm3m_ilcopybTS, + zsymm3m_iucopyrTS, zsymm3m_ilcopyrTS, + zsymm3m_iucopyiTS, zsymm3m_ilcopyiTS, + zsymm3m_oucopybTS, zsymm3m_olcopybTS, + zsymm3m_oucopyrTS, zsymm3m_olcopyrTS, + zsymm3m_oucopyiTS, zsymm3m_olcopyiTS, + + zhemm3m_iucopybTS, zhemm3m_ilcopybTS, + zhemm3m_iucopyrTS, zhemm3m_ilcopyrTS, + zhemm3m_iucopyiTS, zhemm3m_ilcopyiTS, + + zhemm3m_oucopybTS, zhemm3m_olcopybTS, + zhemm3m_oucopyrTS, zhemm3m_olcopyrTS, + zhemm3m_oucopyiTS, zhemm3m_olcopyiTS, + + zneg_tcopyTS, zlaswp_ncopyTS, + +#ifdef EXPRECISION + + 0, 0, 0, + XGEMM_DEFAULT_UNROLL_M, XGEMM_DEFAULT_UNROLL_N, MAX(XGEMM_DEFAULT_UNROLL_M, XGEMM_DEFAULT_UNROLL_N), + + xamax_kTS, xamin_kTS, ixamax_kTS, ixamin_kTS, + xnrm2_kTS, xasum_kTS, xcopy_kTS, + xdotu_kTS, xdotc_kTS, xqrot_kTS, + xaxpy_kTS, xaxpyc_kTS, xscal_kTS, xswap_kTS, + + xgemv_nTS, xgemv_tTS, xgemv_rTS, xgemv_cTS, + xgemv_oTS, xgemv_uTS, xgemv_sTS, xgemv_dTS, + xgeru_kTS, xgerc_kTS, xgerv_kTS, xgerd_kTS, + xsymv_LTS, xsymv_UTS, + xhemv_LTS, xhemv_UTS, xhemv_MTS, xhemv_VTS, + + xgemm_kernel_nTS, xgemm_kernel_lTS, xgemm_kernel_rTS, xgemm_kernel_bTS, + xgemm_betaTS, + +#if XGEMM_DEFAULT_UNROLL_M != XGEMM_DEFAULT_UNROLL_N + xgemm_incopyTS, xgemm_itcopyTS, +#else + xgemm_oncopyTS, xgemm_otcopyTS, +#endif + xgemm_oncopyTS, xgemm_otcopyTS, + + xtrsm_kernel_LNTS, xtrsm_kernel_LTTS, xtrsm_kernel_LRTS, xtrsm_kernel_LCTS, + xtrsm_kernel_RNTS, xtrsm_kernel_RTTS, xtrsm_kernel_RRTS, xtrsm_kernel_RCTS, + +#if XGEMM_DEFAULT_UNROLL_M != XGEMM_DEFAULT_UNROLL_N + xtrsm_iunucopyTS, xtrsm_iunncopyTS, xtrsm_iutucopyTS, xtrsm_iutncopyTS, + xtrsm_ilnucopyTS, xtrsm_ilnncopyTS, xtrsm_iltucopyTS, xtrsm_iltncopyTS, +#else + xtrsm_ounucopyTS, xtrsm_ounncopyTS, xtrsm_outucopyTS, xtrsm_outncopyTS, + xtrsm_olnucopyTS, xtrsm_olnncopyTS, xtrsm_oltucopyTS, xtrsm_oltncopyTS, +#endif + xtrsm_ounucopyTS, xtrsm_ounncopyTS, xtrsm_outucopyTS, xtrsm_outncopyTS, + xtrsm_olnucopyTS, 
xtrsm_olnncopyTS, xtrsm_oltucopyTS, xtrsm_oltncopyTS, + + xtrmm_kernel_RNTS, xtrmm_kernel_RTTS, xtrmm_kernel_RRTS, xtrmm_kernel_RCTS, + xtrmm_kernel_LNTS, xtrmm_kernel_LTTS, xtrmm_kernel_LRTS, xtrmm_kernel_LCTS, + +#if XGEMM_DEFAULT_UNROLL_M != XGEMM_DEFAULT_UNROLL_N + xtrmm_iunucopyTS, xtrmm_iunncopyTS, xtrmm_iutucopyTS, xtrmm_iutncopyTS, + xtrmm_ilnucopyTS, xtrmm_ilnncopyTS, xtrmm_iltucopyTS, xtrmm_iltncopyTS, +#else + xtrmm_ounucopyTS, xtrmm_ounncopyTS, xtrmm_outucopyTS, xtrmm_outncopyTS, + xtrmm_olnucopyTS, xtrmm_olnncopyTS, xtrmm_oltucopyTS, xtrmm_oltncopyTS, +#endif + xtrmm_ounucopyTS, xtrmm_ounncopyTS, xtrmm_outucopyTS, xtrmm_outncopyTS, + xtrmm_olnucopyTS, xtrmm_olnncopyTS, xtrmm_oltucopyTS, xtrmm_oltncopyTS, + +#if XGEMM_DEFAULT_UNROLL_M != XGEMM_DEFAULT_UNROLL_N + xsymm_iutcopyTS, xsymm_iltcopyTS, +#else + xsymm_outcopyTS, xsymm_oltcopyTS, +#endif + xsymm_outcopyTS, xsymm_oltcopyTS, +#if XGEMM_DEFAULT_UNROLL_M != XGEMM_DEFAULT_UNROLL_N + xhemm_iutcopyTS, xhemm_iltcopyTS, +#else + xhemm_outcopyTS, xhemm_oltcopyTS, +#endif + xhemm_outcopyTS, xhemm_oltcopyTS, + + xgemm3m_kernelTS, + + xgemm3m_incopybTS, xgemm3m_incopyrTS, + xgemm3m_incopyiTS, xgemm3m_itcopybTS, + xgemm3m_itcopyrTS, xgemm3m_itcopyiTS, + xgemm3m_oncopybTS, xgemm3m_oncopyrTS, + xgemm3m_oncopyiTS, xgemm3m_otcopybTS, + xgemm3m_otcopyrTS, xgemm3m_otcopyiTS, + + xsymm3m_iucopybTS, xsymm3m_ilcopybTS, + xsymm3m_iucopyrTS, xsymm3m_ilcopyrTS, + xsymm3m_iucopyiTS, xsymm3m_ilcopyiTS, + xsymm3m_oucopybTS, xsymm3m_olcopybTS, + xsymm3m_oucopyrTS, xsymm3m_olcopyrTS, + xsymm3m_oucopyiTS, xsymm3m_olcopyiTS, + + xhemm3m_iucopybTS, xhemm3m_ilcopybTS, + xhemm3m_iucopyrTS, xhemm3m_ilcopyrTS, + xhemm3m_iucopyiTS, xhemm3m_ilcopyiTS, + + xhemm3m_oucopybTS, xhemm3m_olcopybTS, + xhemm3m_oucopyrTS, xhemm3m_olcopyrTS, + xhemm3m_oucopyiTS, xhemm3m_olcopyiTS, + + xneg_tcopyTS, xlaswp_ncopyTS, + +#endif + + init_parameter, + + SNUMOPT, DNUMOPT, QNUMOPT, + +}; + +#ifdef ARCH_X86 +static int get_l2_size_old(void){ + int i, eax, ebx, ecx, edx, cpuid_level; + int info[15]; + + cpuid(2, &eax, &ebx, &ecx, &edx); + + info[ 0] = BITMASK(eax, 8, 0xff); + info[ 1] = BITMASK(eax, 16, 0xff); + info[ 2] = BITMASK(eax, 24, 0xff); + + info[ 3] = BITMASK(ebx, 0, 0xff); + info[ 4] = BITMASK(ebx, 8, 0xff); + info[ 5] = BITMASK(ebx, 16, 0xff); + info[ 6] = BITMASK(ebx, 24, 0xff); + + info[ 7] = BITMASK(ecx, 0, 0xff); + info[ 8] = BITMASK(ecx, 8, 0xff); + info[ 9] = BITMASK(ecx, 16, 0xff); + info[10] = BITMASK(ecx, 24, 0xff); + + info[11] = BITMASK(edx, 0, 0xff); + info[12] = BITMASK(edx, 8, 0xff); + info[13] = BITMASK(edx, 16, 0xff); + info[14] = BITMASK(edx, 24, 0xff); + + for (i = 0; i < 15; i++){ + + switch (info[i]){ + + /* This table is from http://www.sandpile.org/ia32/cpuid.htm */ + + case 0x1a : + return 96; + + case 0x39 : + case 0x3b : + case 0x41 : + case 0x79 : + case 0x81 : + return 128; + + case 0x3a : + return 192; + + case 0x21 : + case 0x3c : + case 0x42 : + case 0x7a : + case 0x7e : + case 0x82 : + return 256; + + case 0x3d : + return 384; + + case 0x3e : + case 0x43 : + case 0x7b : + case 0x7f : + case 0x83 : + case 0x86 : + return 512; + + case 0x44 : + case 0x78 : + case 0x7c : + case 0x84 : + case 0x87 : + return 1024; + + case 0x45 : + case 0x7d : + case 0x85 : + return 2048; + + case 0x48 : + return 3184; + + case 0x49 : + return 4096; + + case 0x4e : + return 6144; + } + } + return 0; +} +#endif + +static __inline__ int get_l2_size(void){ + + int eax, ebx, ecx, edx, l2; + + cpuid(0x80000006, &eax, &ebx, &ecx, &edx); + + l2 = BITMASK(ecx, 16, 
0xffff); + +#ifndef ARCH_X86 + return l2; + +#else + + if (l2 > 0) return l2; + + return get_l2_size_old(); +#endif +} + +static __inline__ int get_l3_size(void){ + + int eax, ebx, ecx, edx; + + cpuid(0x80000006, &eax, &ebx, &ecx, &edx); + + return BITMASK(edx, 18, 0x3fff) * 512; +} + + +static void init_parameter(void) { + + int l2 = get_l2_size(); + + TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; + TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; + TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; + TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q; +#ifdef EXPRECISION + TABLE_NAME.qgemm_q = QGEMM_DEFAULT_Q; + TABLE_NAME.xgemm_q = XGEMM_DEFAULT_Q; +#endif + +#if defined(CORE_KATMAI) || defined(CORE_COPPERMINE) || defined(CORE_BANIAS) || defined(CORE_YONAH) + +#ifdef DEBUG + fprintf(stderr, "Katmai, Coppermine, Banias\n"); +#endif + + TABLE_NAME.sgemm_p = 64 * (l2 >> 7); + TABLE_NAME.dgemm_p = 32 * (l2 >> 7); + TABLE_NAME.cgemm_p = 32 * (l2 >> 7); + TABLE_NAME.zgemm_p = 16 * (l2 >> 7); +#ifdef EXPRECISION + TABLE_NAME.qgemm_p = 16 * (l2 >> 7); + TABLE_NAME.xgemm_p = 8 * (l2 >> 7); +#endif +#endif + +#ifdef CORE_NORTHWOOD + +#ifdef DEBUG + fprintf(stderr, "Northwood\n"); +#endif + + TABLE_NAME.sgemm_p = 96 * (l2 >> 7); + TABLE_NAME.dgemm_p = 48 * (l2 >> 7); + TABLE_NAME.cgemm_p = 48 * (l2 >> 7); + TABLE_NAME.zgemm_p = 24 * (l2 >> 7); +#ifdef EXPRECISION + TABLE_NAME.qgemm_p = 24 * (l2 >> 7); + TABLE_NAME.xgemm_p = 12 * (l2 >> 7); +#endif +#endif + +#ifdef ATOM + +#ifdef DEBUG + fprintf(stderr, "Atom\n"); +#endif + + TABLE_NAME.sgemm_p = 256; + TABLE_NAME.dgemm_p = 128; + TABLE_NAME.cgemm_p = 128; + TABLE_NAME.zgemm_p = 64; +#ifdef EXPRECISION + TABLE_NAME.qgemm_p = 64; + TABLE_NAME.xgemm_p = 32; +#endif +#endif + +#ifdef CORE_PRESCOTT + +#ifdef DEBUG + fprintf(stderr, "Prescott\n"); +#endif + + TABLE_NAME.sgemm_p = 56 * (l2 >> 7); + TABLE_NAME.dgemm_p = 28 * (l2 >> 7); + TABLE_NAME.cgemm_p = 28 * (l2 >> 7); + TABLE_NAME.zgemm_p = 14 * (l2 >> 7); +#ifdef EXPRECISION + TABLE_NAME.qgemm_p = 14 * (l2 >> 7); + TABLE_NAME.xgemm_p = 7 * (l2 >> 7); +#endif +#endif + +#ifdef CORE2 + +#ifdef DEBUG + fprintf(stderr, "Core2\n"); +#endif + + TABLE_NAME.sgemm_p = 92 * (l2 >> 9); + TABLE_NAME.dgemm_p = 46 * (l2 >> 9); + TABLE_NAME.cgemm_p = 46 * (l2 >> 9); + TABLE_NAME.zgemm_p = 23 * (l2 >> 9); +#ifdef EXPRECISION + TABLE_NAME.qgemm_p = 92 * (l2 >> 9); + TABLE_NAME.xgemm_p = 46 * (l2 >> 9); +#endif +#endif + +#ifdef PENRYN + +#ifdef DEBUG + fprintf(stderr, "Penryn\n"); +#endif + + TABLE_NAME.sgemm_p = 42 * (l2 >> 9) + 8; + TABLE_NAME.dgemm_p = 42 * (l2 >> 9) + 8; + TABLE_NAME.cgemm_p = 21 * (l2 >> 9) + 4; + TABLE_NAME.zgemm_p = 21 * (l2 >> 9) + 4; +#ifdef EXPRECISION + TABLE_NAME.qgemm_p = 42 * (l2 >> 9) + 8; + TABLE_NAME.xgemm_p = 21 * (l2 >> 9) + 4; +#endif +#endif + +#ifdef NEHALEM + +#ifdef DEBUG + fprintf(stderr, "Nehalem\n"); +#endif + + TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; + TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; + TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; + TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#ifdef EXPRECISION + TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; + TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; +#endif +#endif + +#ifdef OPTERON + +#ifdef DEBUG + fprintf(stderr, "Opteron\n"); +#endif + + TABLE_NAME.sgemm_p = 224 + 56 * (l2 >> 7); + TABLE_NAME.dgemm_p = 112 + 28 * (l2 >> 7); + TABLE_NAME.cgemm_p = 112 + 28 * (l2 >> 7); + TABLE_NAME.zgemm_p = 56 + 14 * (l2 >> 7); +#ifdef EXPRECISION + TABLE_NAME.qgemm_p = 56 + 14 * (l2 >> 7); + TABLE_NAME.xgemm_p = 28 + 7 * (l2 >> 7); +#endif +#endif + +#ifdef BARCELONA + +#ifdef DEBUG + fprintf(stderr, 
"Barcelona\n"); +#endif + + TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; + TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; + TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; + TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#ifdef EXPRECISION + TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; + TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; +#endif +#endif + +#ifdef NANO + +#ifdef DEBUG + fprintf(stderr, "NANO\n"); +#endif + + TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; + TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; + TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; + TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#ifdef EXPRECISION + TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; + TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; +#endif +#endif + + + TABLE_NAME.sgemm_p = (TABLE_NAME.sgemm_p + SGEMM_DEFAULT_UNROLL_M - 1) & ~(SGEMM_DEFAULT_UNROLL_M - 1); + TABLE_NAME.dgemm_p = (TABLE_NAME.dgemm_p + DGEMM_DEFAULT_UNROLL_M - 1) & ~(DGEMM_DEFAULT_UNROLL_M - 1); + TABLE_NAME.cgemm_p = (TABLE_NAME.cgemm_p + CGEMM_DEFAULT_UNROLL_M - 1) & ~(CGEMM_DEFAULT_UNROLL_M - 1); + TABLE_NAME.zgemm_p = (TABLE_NAME.zgemm_p + ZGEMM_DEFAULT_UNROLL_M - 1) & ~(ZGEMM_DEFAULT_UNROLL_M - 1); +#ifdef QUAD_PRECISION + TABLE_NAME.qgemm_p = (TABLE_NAME.qgemm_p + QGEMM_DEFAULT_UNROLL_M - 1) & ~(QGEMM_DEFAULT_UNROLL_M - 1); + TABLE_NAME.xgemm_p = (TABLE_NAME.xgemm_p + XGEMM_DEFAULT_UNROLL_M - 1) & ~(XGEMM_DEFAULT_UNROLL_M - 1); +#endif + +#ifdef DEBUG + fprintf(stderr, "L2 = %8d DGEMM_P .. %d\n", l2, TABLE_NAME.dgemm_p); +#endif + + TABLE_NAME.sgemm_r = (((BUFFER_SIZE - + ((TABLE_NAME.sgemm_p * TABLE_NAME.sgemm_q * 4 + TABLE_NAME.offsetA + + TABLE_NAME.align) & ~TABLE_NAME.align) + ) / (TABLE_NAME.sgemm_q * 4) - 15) & ~15); + + TABLE_NAME.dgemm_r = (((BUFFER_SIZE - + ((TABLE_NAME.dgemm_p * TABLE_NAME.dgemm_q * 8 + TABLE_NAME.offsetA + + TABLE_NAME.align) & ~TABLE_NAME.align) + ) / (TABLE_NAME.dgemm_q * 8) - 15) & ~15); + +#ifdef EXPRECISION + TABLE_NAME.qgemm_r = (((BUFFER_SIZE - + ((TABLE_NAME.qgemm_p * TABLE_NAME.qgemm_q * 16 + TABLE_NAME.offsetA + + TABLE_NAME.align) & ~TABLE_NAME.align) + ) / (TABLE_NAME.qgemm_q * 16) - 15) & ~15); +#endif + + TABLE_NAME.cgemm_r = (((BUFFER_SIZE - + ((TABLE_NAME.cgemm_p * TABLE_NAME.cgemm_q * 8 + TABLE_NAME.offsetA + + TABLE_NAME.align) & ~TABLE_NAME.align) + ) / (TABLE_NAME.cgemm_q * 8) - 15) & ~15); + + TABLE_NAME.zgemm_r = (((BUFFER_SIZE - + ((TABLE_NAME.zgemm_p * TABLE_NAME.zgemm_q * 16 + TABLE_NAME.offsetA + + TABLE_NAME.align) & ~TABLE_NAME.align) + ) / (TABLE_NAME.zgemm_q * 16) - 15) & ~15); + +#ifdef EXPRECISION + TABLE_NAME.xgemm_r = (((BUFFER_SIZE - + ((TABLE_NAME.xgemm_p * TABLE_NAME.xgemm_q * 32 + TABLE_NAME.offsetA + + TABLE_NAME.align) & ~TABLE_NAME.align) + ) / (TABLE_NAME.xgemm_q * 32) - 15) & ~15); +#endif + +} diff --git a/kernel/sparc/KERNEL b/kernel/sparc/KERNEL new file mode 100644 index 0000000000..594fd05e52 --- /dev/null +++ b/kernel/sparc/KERNEL @@ -0,0 +1,69 @@ +ifndef SAMINKERNEL +SAMINKERNEL = amax.S +endif + +ifndef DAMINKERNEL +DAMINKERNEL = amax.S +endif + +ifndef CAMINKERNEL +CAMINKERNEL = zamax.S +endif + +ifndef ZAMINKERNEL +ZAMINKERNEL = zamax.S +endif + +ifndef SMINKERNEL +SMINKERNEL = max.S +endif + +ifndef DMINKERNEL +DMINKERNEL = max.S +endif + +ifndef ISAMINKERNEL +ISAMINKERNEL = iamax.S +endif + +ifndef IDAMINKERNEL +IDAMINKERNEL = iamax.S +endif + +ifndef ICAMINKERNEL +ICAMINKERNEL = izamax.S +endif + +ifndef IZAMINKERNEL +IZAMINKERNEL = izamax.S +endif + +ifndef ISMINKERNEL +ISMINKERNEL = iamax.S +endif + +ifndef IDMINKERNEL +IDMINKERNEL = iamax.S +endif + +ifndef SNRM2KERNEL +SNRM2KERNEL = snrm2.S +endif + +ifndef DNRM2KERNEL +DNRM2KERNEL = 
dnrm2.S +endif + +ifndef CNRM2KERNEL +CNRM2KERNEL = cnrm2.S +endif + +ifndef ZNRM2KERNEL +ZNRM2KERNEL = znrm2.S +endif + +SGEMM_BETA = ../generic/gemm_beta.c +DGEMM_BETA = ../generic/gemm_beta.c +CGEMM_BETA = ../generic/zgemm_beta.c +ZGEMM_BETA = ../generic/zgemm_beta.c + diff --git a/kernel/sparc/KERNEL.sparc b/kernel/sparc/KERNEL.sparc new file mode 100644 index 0000000000..fb6cc2b753 --- /dev/null +++ b/kernel/sparc/KERNEL.sparc @@ -0,0 +1,56 @@ +SGEMMKERNEL = gemm_kernel.S +SGEMMINCOPY = +SGEMMITCOPY = +SGEMMONCOPY = gemm_ncopy.S +SGEMMOTCOPY = gemm_tcopy.S +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = +SGEMMONCOPYOBJ = sgemm_oncopy.$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy.$(SUFFIX) +DGEMMKERNEL = gemm_kernel.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = gemm_ncopy.S +DGEMMOTCOPY = gemm_tcopy.S +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = +DGEMMONCOPYOBJ = dgemm_oncopy.$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy.$(SUFFIX) +CGEMMKERNEL = zgemm_kernel.S +CGEMMINCOPY = +CGEMMITCOPY = +CGEMMONCOPY = zgemm_ncopy.S +CGEMMOTCOPY = zgemm_tcopy.S +CGEMMINCOPYOBJ = +CGEMMITCOPYOBJ = +CGEMMONCOPYOBJ = cgemm_oncopy.$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy.$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel.S +ZGEMMINCOPY = +ZGEMMITCOPY = +ZGEMMONCOPY = zgemm_ncopy.S +ZGEMMOTCOPY = zgemm_tcopy.S +ZGEMMINCOPYOBJ = +ZGEMMITCOPYOBJ = +ZGEMMONCOPYOBJ = zgemm_oncopy.$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy.$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN.S +STRSMKERNEL_LT = trsm_kernel_LT.S +STRSMKERNEL_RN = trsm_kernel_LT.S +STRSMKERNEL_RT = trsm_kernel_RT.S + +DTRSMKERNEL_LN = trsm_kernel_LN.S +DTRSMKERNEL_LT = trsm_kernel_LT.S +DTRSMKERNEL_RN = trsm_kernel_LT.S +DTRSMKERNEL_RT = trsm_kernel_RT.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN.S +CTRSMKERNEL_LT = ztrsm_kernel_LT.S +CTRSMKERNEL_RN = ztrsm_kernel_LT.S +CTRSMKERNEL_RT = ztrsm_kernel_RT.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LN.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT.S diff --git a/kernel/sparc/KERNEL.sparcv7 b/kernel/sparc/KERNEL.sparcv7 new file mode 100644 index 0000000000..dfda684e28 --- /dev/null +++ b/kernel/sparc/KERNEL.sparcv7 @@ -0,0 +1,59 @@ +SGEMMKERNEL = gemm_kernel_2x8.S +SGEMMINCOPY = gemm_ncopy_2.S +SGEMMITCOPY = gemm_tcopy_2.S +SGEMMONCOPY = gemm_ncopy_8.S +SGEMMOTCOPY = ../generic/gemm_tcopy_8.c +SGEMMINCOPYOBJ = sgemm_incopy.$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy.$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy.$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy.$(SUFFIX) + +DGEMMKERNEL = gemm_kernel_2x8.S +DGEMMINCOPY = gemm_ncopy_2.S +DGEMMITCOPY = gemm_tcopy_2.S +DGEMMONCOPY = gemm_ncopy_8.S +DGEMMOTCOPY = ../generic/gemm_tcopy_8.c +DGEMMINCOPYOBJ = dgemm_incopy.$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy.$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy.$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy.$(SUFFIX) + +CGEMMKERNEL = zgemm_kernel_1x4.S +CGEMMINCOPY = ../generic/zgemm_ncopy_1.c +CGEMMITCOPY = ../generic/zgemm_tcopy_1.c +CGEMMONCOPY = ../generic/zgemm_ncopy_4.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +CGEMMINCOPYOBJ = cgemm_incopy.$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy.$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy.$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy.$(SUFFIX) + +ZGEMMKERNEL = zgemm_kernel_1x4.S +ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c +ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +ZGEMMINCOPYOBJ = zgemm_incopy.$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy.$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy.$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy.$(SUFFIX) + 
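A minimal sketch of how these per-core selections compose, assuming the usual GotoBLAS arrangement in which the core-specific KERNEL.* file is read before the generic kernel/sparc/KERNEL shown earlier: any variable a core file leaves unset falls through to the generic `ifndef` default there (amax.S, max.S, snrm2.S, and so on), so a core file only needs to name the kernels it actually changes. A hypothetical fragment (placeholder file name, not part of this import):

# override just the single-precision GEMM kernel; everything else keeps
# the defaults supplied by kernel/sparc/KERNEL
SGEMMKERNEL = gemm_kernel_custom.S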
+STRSMKERNEL_LN = trsm_kernel_LN_2x8.S +STRSMKERNEL_LT = trsm_kernel_LT_2x8.S +STRSMKERNEL_RN = trsm_kernel_LT_2x8.S +STRSMKERNEL_RT = trsm_kernel_RT_2x8.S + +DTRSMKERNEL_LN = trsm_kernel_LN_2x8.S +DTRSMKERNEL_LT = trsm_kernel_LT_2x8.S +DTRSMKERNEL_RN = trsm_kernel_LT_2x8.S +DTRSMKERNEL_RT = trsm_kernel_RT_2x8.S + +CTRSMKERNEL_LN = ztrsm_kernel_LT_1x4.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_1x4.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_1x4.S +CTRSMKERNEL_RT = ztrsm_kernel_RT_1x4.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x4.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x4.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x4.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x4.S diff --git a/kernel/sparc/Makefile b/kernel/sparc/Makefile new file mode 100644 index 0000000000..efae70d7b7 --- /dev/null +++ b/kernel/sparc/Makefile @@ -0,0 +1,2 @@ +clean :: + diff --git a/kernel/sparc/amax.S b/kernel/sparc/amax.S new file mode 100644 index 0000000000..7729e5cb6f --- /dev/null +++ b/kernel/sparc/amax.S @@ -0,0 +1,380 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N %i0 +#define X %i1 +#define INCX %i2 +#define I %i3 + +#ifdef DOUBLE +#define c1 %f0 +#define c2 %f2 +#define c3 %f4 +#define c4 %f6 +#define t1 %f8 +#define t2 %f10 +#define t3 %f12 +#define t4 %f14 + +#define a1 %f16 +#define a2 %f18 +#define a3 %f20 +#define a4 %f22 +#define a5 %f24 +#define a6 %f26 +#define a7 %f28 +#define a8 %f30 +#else +#define c1 %f0 +#define c2 %f1 +#define c3 %f2 +#define c4 %f3 +#define t1 %f4 +#define t2 %f5 +#define t3 %f6 +#define t4 %f7 + +#define a1 %f8 +#define a2 %f9 +#define a3 %f10 +#define a4 %f11 +#define a5 %f12 +#define a6 %f13 +#define a7 %f14 +#define a8 %f15 +#endif + +#ifndef USE_MIN +#define FCMOV FMOVG +#else +#define FCMOV FMOVL +#endif + + PROLOGUE + SAVESP + + FCLR(0) + + cmp N, 0 + ble .LL20 + nop + + cmp INCX, 0 + ble .LL20 + sll INCX, BASE_SHIFT, INCX + + add N, -1, N + LDF [X], c4 + add X, INCX, X + cmp N, 0 + ble .LL20 + FABS c4, c1 + + FABS c4, c2 + FABS c4, c3 + FABS c4, c4 + + cmp INCX, SIZE + bne .LL50 + nop + + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + LDF [X + 2 * SIZE], a3 + LDF [X + 3 * SIZE], a4 + + LDF [X + 4 * SIZE], a5 + add I, -1, I + LDF [X + 5 * SIZE], a6 + cmp I, 0 + LDF [X + 6 * SIZE], a7 + LDF [X + 7 * SIZE], a8 + + ble,pt %icc, .LL12 + add X, 8 * SIZE, X + +#define PREFETCHSIZE 40 + +.LL11: + prefetch [X + PREFETCHSIZE * SIZE], 0 + FABS a1, t1 + LDF [X + 0 * SIZE], a1 + FABS a2, t2 + LDF [X + 1 * SIZE], a2 + FABS a3, t3 + LDF [X + 2 * SIZE], a3 + FABS a4, t4 + LDF [X + 3 * SIZE], a4 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FCMOV %fcc0, t1, c1 + FCMOV %fcc1, t2, c2 + FCMOV %fcc2, t3, c3 + FCMOV %fcc3, t4, c4 + + FABS a5, t1 + LDF [X + 4 * SIZE], a5 + FABS a6, t2 + LDF [X + 5 * SIZE], a6 + FABS a7, t3 + LDF [X + 6 * SIZE], a7 + FABS a8, t4 + LDF [X + 7 * SIZE], a8 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FCMOV %fcc0, t1, c1 + add I, -1, I + FCMOV %fcc1, t2, c2 + cmp I, 0 + FCMOV %fcc2, t3, c3 + FCMOV %fcc3, t4, c4 + + bg,pt %icc, .LL11 + add X, 8 * SIZE, X + +.LL12: + FABS a1, t1 + FABS a2, t2 + FABS a3, t3 + FABS a4, t4 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FCMOV %fcc0, t1, c1 + FCMOV %fcc1, t2, c2 + FCMOV %fcc2, t3, c3 + FCMOV %fcc3, t4, c4 + + FABS a5, t1 + FABS a6, t2 + FABS a7, t3 + FABS a8, t4 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FCMOV %fcc0, t1, c1 + FCMOV %fcc1, t2, c2 + FCMOV %fcc2, t3, c3 + FCMOV %fcc3, t4, c4 + +.LL15: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL19 + nop + +.LL16: + LDF [X + 0 * SIZE], a1 + FABS a1, t1 + FCMP %fcc0, t1, c1 + FCMOV %fcc0, t1, c1 + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL16 + add X, 1 * SIZE, X + +.LL19: + FCMP %fcc0, c2, c1 + FCMP %fcc1, c4, c3 + FCMOV %fcc0, c2, c1 + FCMOV %fcc1, c4, c3 + FCMP %fcc0, c3, c1 + FCMOV %fcc0, c3, c1 + +.LL20: + return %i7 + 8 + clr %g0 + +.LL50: + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL55 + nop + + LDF [X + 0 * SIZE], a1 + add X, INCX, X + LDF [X + 0 * SIZE], a2 + add X, INCX, X + LDF [X + 0 * SIZE], a3 + add X, INCX, X + LDF [X + 0 * SIZE], a4 + add X, INCX, X + LDF [X + 0 * SIZE], a5 + add X, INCX, X + LDF [X + 0 * SIZE], a6 + add X, INCX, X + add I, -1, I + LDF [X + 0 * SIZE], a7 + cmp I, 0 + add X, INCX, X + LDF [X + 0 * SIZE], a8 + ble,pt %icc, 
.LL52 + add X, INCX, X + +.LL51: + FABS a1, t1 + LDF [X + 0 * SIZE], a1 + add X, INCX, X + FABS a2, t2 + LDF [X + 0 * SIZE], a2 + add X, INCX, X + FABS a3, t3 + LDF [X + 0 * SIZE], a3 + add X, INCX, X + FABS a4, t4 + LDF [X + 0 * SIZE], a4 + add X, INCX, X + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FCMOV %fcc0, t1, c1 + FCMOV %fcc1, t2, c2 + FCMOV %fcc2, t3, c3 + FCMOV %fcc3, t4, c4 + + FABS a5, t1 + LDF [X + 0 * SIZE], a5 + add X, INCX, X + FABS a6, t2 + LDF [X + 0 * SIZE], a6 + add X, INCX, X + FABS a7, t3 + LDF [X + 0 * SIZE], a7 + add X, INCX, X + FABS a8, t4 + LDF [X + 0 * SIZE], a8 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FCMOV %fcc0, t1, c1 + add I, -1, I + FCMOV %fcc1, t2, c2 + cmp I, 0 + FCMOV %fcc2, t3, c3 + FCMOV %fcc3, t4, c4 + + bg,pt %icc, .LL51 + add X, INCX, X + +.LL52: + FABS a1, t1 + FABS a2, t2 + FABS a3, t3 + FABS a4, t4 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FCMOV %fcc0, t1, c1 + FCMOV %fcc1, t2, c2 + FCMOV %fcc2, t3, c3 + FCMOV %fcc3, t4, c4 + + FABS a5, t1 + FABS a6, t2 + FABS a7, t3 + FABS a8, t4 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FCMOV %fcc0, t1, c1 + FCMOV %fcc1, t2, c2 + FCMOV %fcc2, t3, c3 + FCMOV %fcc3, t4, c4 + +.LL55: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + LDF [X + 0 * SIZE], a1 + FABS a1, t1 + FCMP %fcc0, t1, c1 + FCMOV %fcc0, t1, c1 + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL56 + add X, INCX, X + +.LL59: + FCMP %fcc0, c2, c1 + FCMP %fcc1, c4, c3 + FCMOV %fcc0, c2, c1 + FCMOV %fcc1, c4, c3 + FCMP %fcc0, c3, c1 + FCMOV %fcc0, c3, c1 + + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/asum.S b/kernel/sparc/asum.S new file mode 100644 index 0000000000..7205fa60f7 --- /dev/null +++ b/kernel/sparc/asum.S @@ -0,0 +1,325 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N %i0 +#define X %i1 +#define INCX %i2 +#define I %i3 + +#ifdef DOUBLE +#define c1 %f0 +#define c2 %f2 +#define t1 %f8 +#define t2 %f10 +#define t3 %f12 +#define t4 %f14 + +#define a1 %f16 +#define a2 %f18 +#define a3 %f20 +#define a4 %f22 +#define a5 %f24 +#define a6 %f26 +#define a7 %f28 +#define a8 %f30 +#else +#define c1 %f0 +#define c2 %f1 +#define t1 %f4 +#define t2 %f5 +#define t3 %f6 +#define t4 %f7 + +#define a1 %f8 +#define a2 %f9 +#define a3 %f10 +#define a4 %f11 +#define a5 %f12 +#define a6 %f13 +#define a7 %f14 +#define a8 %f15 +#endif + + PROLOGUE + SAVESP + + FCLR(0) + + sll INCX, BASE_SHIFT, INCX + + FMOV c1, c2 + FMOV c1, t1 + FMOV c1, t2 + FMOV c1, t3 + FMOV c1, t4 + + cmp INCX, 0 + ble .LL19 + cmp INCX, SIZE + bne .LL50 + + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + + LDF [X + 0 * SIZE], a1 + add I, -1, I + LDF [X + 1 * SIZE], a2 + cmp I, 0 + LDF [X + 2 * SIZE], a3 + LDF [X + 3 * SIZE], a4 + LDF [X + 4 * SIZE], a5 + LDF [X + 5 * SIZE], a6 + LDF [X + 6 * SIZE], a7 + LDF [X + 7 * SIZE], a8 + + ble,pt %icc, .LL12 + add X, 8 * SIZE, X + +#define PREFETCHSIZE 128 + +.LL11: + FADD c1, t1, c1 + prefetch [X + PREFETCHSIZE * SIZE], 0 + FABS a1, t1 + LDF [X + 0 * SIZE], a1 + + FADD c2, t2, c2 + add I, -1, I + FABS a2, t2 + LDF [X + 1 * SIZE], a2 + + FADD c1, t3, c1 + cmp I, 0 + FABS a3, t3 + LDF [X + 2 * SIZE], a3 + + FADD c2, t4, c2 + nop + FABS a4, t4 + LDF [X + 3 * SIZE], a4 + + FADD c1, t1, c1 + nop + FABS a5, t1 + LDF [X + 4 * SIZE], a5 + + FADD c2, t2, c2 + nop + FABS a6, t2 + LDF [X + 5 * SIZE], a6 + + FADD c1, t3, c1 + FABS a7, t3 + LDF [X + 6 * SIZE], a7 + add X, 8 * SIZE, X + + FADD c2, t4, c2 + FABS a8, t4 + bg,pt %icc, .LL11 + LDF [X - 1 * SIZE], a8 + +.LL12: + FADD c1, t1, c1 + FABS a1, t1 + FADD c2, t2, c2 + FABS a2, t2 + + FADD c1, t3, c1 + FABS a3, t3 + FADD c2, t4, c2 + FABS a4, t4 + + FADD c1, t1, c1 + FABS a5, t1 + FADD c2, t2, c2 + FABS a6, t2 + + FADD c1, t3, c1 + FABS a7, t3 + FADD c2, t4, c2 + FABS a8, t4 + +.LL15: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL19 + nop + +.LL16: + LDF [X + 0 * SIZE], a1 + add I, -1, I + cmp I, 0 + FADD c1, t1, c1 + FABS a1, t1 + bg,pt %icc, .LL16 + add X, 1 * SIZE, X + +.LL19: + FADD c1, t1, c1 + FADD c2, t2, c2 + FADD c1, t3, c1 + FADD c2, t4, c2 + + FADD c1, c2, c1 + return %i7 + 8 + clr %g0 + +.LL50: + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL55 + nop + + LDF [X + 0 * SIZE], a1 + add X, INCX, X + LDF [X + 0 * SIZE], a2 + add X, INCX, X + LDF [X + 0 * SIZE], a3 + add X, INCX, X + LDF [X + 0 * SIZE], a4 + add X, INCX, X + LDF [X + 0 * SIZE], a5 + add X, INCX, X + LDF [X + 0 * SIZE], a6 + add X, INCX, X + add I, -1, I + LDF [X + 0 * SIZE], a7 + cmp I, 0 + add X, INCX, X + LDF [X + 0 * SIZE], a8 + + ble,pt %icc, .LL52 + add X, INCX, X + +.LL51: + FADD c1, t1, c1 + add I, -1, I + FABS a1, t1 + LDF [X + 0 * SIZE], a1 + add X, INCX, X + + FADD c2, t2, c2 + cmp I, 0 + FABS a2, t2 + LDF [X + 0 * SIZE], a2 + add X, INCX, X + + FADD c1, t3, c1 + FABS a3, t3 + LDF [X + 0 * SIZE], a3 + add X, INCX, X + + FADD c2, t4, c2 + FABS a4, t4 + LDF [X + 0 * SIZE], a4 + add X, INCX, X + + FADD c1, t1, c1 + FABS a5, t1 + LDF [X + 0 * SIZE], a5 + add X, 
INCX, X + + FADD c2, t2, c2 + FABS a6, t2 + LDF [X + 0 * SIZE], a6 + add X, INCX, X + + FADD c1, t3, c1 + FABS a7, t3 + LDF [X + 0 * SIZE], a7 + add X, INCX, X + + FADD c2, t4, c2 + FABS a8, t4 + LDF [X + 0 * SIZE], a8 + + bg,pt %icc, .LL51 + add X, INCX, X + +.LL52: + FADD c1, t1, c1 + FABS a1, t1 + FADD c2, t2, c2 + FABS a2, t2 + + FADD c1, t3, c1 + FABS a3, t3 + FADD c2, t4, c2 + FABS a4, t4 + + FADD c1, t1, c1 + FABS a5, t1 + FADD c2, t2, c2 + FABS a6, t2 + + FADD c1, t3, c1 + FABS a7, t3 + FADD c2, t4, c2 + FABS a8, t4 + +.LL55: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + LDF [X + 0 * SIZE], a1 + FADD c1, t1, c1 + add I, -1, I + FABS a1, t1 + cmp I, 0 + bg,pt %icc, .LL56 + add X, INCX, X + +.LL59: + FADD c1, t1, c1 + FADD c2, t2, c2 + FADD c1, t3, c1 + FADD c2, t4, c2 + + FADD c1, c2, c1 + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/axpy.S b/kernel/sparc/axpy.S new file mode 100644 index 0000000000..997f9e0998 --- /dev/null +++ b/kernel/sparc/axpy.S @@ -0,0 +1,503 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if defined(DOUBLE) && !defined(__64BIT__) +#define N %i0 +#define X %i5 +#define INCX %i1 +#define Y %i2 +#define INCY %i3 +#define I %i4 +#else +#define N %i0 +#define X %i4 +#define INCX %i5 +#define Y %i1 +#define INCY %i2 +#define I %i3 +#endif + +#define YY %l1 + +#ifdef DOUBLE +#define a1 %f0 +#define a2 %f2 +#define a3 %f4 +#define a4 %f6 +#define a5 %f8 +#define a6 %f10 +#define a7 %f12 +#define a8 %f14 +#define b1 %f16 +#define b2 %f18 +#define b3 %f20 +#define b4 %f22 +#define b5 %f24 +#define b6 %f26 +#define b7 %f28 +#define b8 %f30 + +#define t1 %f32 +#define t2 %f34 +#define t3 %f36 +#define t4 %f38 +#define c1 %f40 +#define c2 %f42 +#define c3 %f44 +#define c4 %f46 + +#define c5 %f48 +#define c6 %f50 +#define c7 %f52 +#define c8 %f54 + +#define ALPHA %f62 +#else +#define a1 %f0 +#define a2 %f1 +#define a3 %f2 +#define a4 %f3 +#define a5 %f4 +#define a6 %f5 +#define a7 %f6 +#define a8 %f7 +#define b1 %f8 +#define b2 %f9 +#define b3 %f10 +#define b4 %f11 +#define b5 %f12 +#define b6 %f13 +#define b7 %f14 +#define b8 %f15 + +#define t1 %f16 +#define t2 %f17 +#define t3 %f18 +#define t4 %f19 +#define c1 %f20 +#define c2 %f21 +#define c3 %f22 +#define c4 %f23 + +#define c5 %f24 +#define c6 %f25 +#define c7 %f26 +#define c8 %f27 + +#define ALPHA %f31 +#endif + + PROLOGUE + SAVESP + +#ifndef __64BIT__ + +#ifdef DOUBLE + st %i3, [%sp + STACK_START + 16] + st %i4, [%sp + STACK_START + 20] + + ld [%sp + STACK_START + 28], INCX + ld [%sp + STACK_START + 32], Y + ld [%sp + STACK_START + 36], INCY +#else + st %i3, [%sp + STACK_START + 16] + ld [%sp + STACK_START + 28], Y + ld [%sp + STACK_START + 32], INCY +#endif + LDF [%sp + STACK_START + 16], ALPHA +#else + ldx [%sp + STACK_START + 56], Y + ldx [%sp + STACK_START + 64], INCY +#ifdef DOUBLE + FMOV %f6, ALPHA +#else + FMOV %f7, ALPHA +#endif +#endif + + sll INCX, BASE_SHIFT, INCX + sll INCY, BASE_SHIFT, INCY + + cmp INCX, SIZE + bne .LL50 + nop + cmp INCY, SIZE + bne .LL50 + nop + + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + + LDF [X + 0 * SIZE], a1 + LDF [Y + 0 * SIZE], b1 + LDF [X + 1 * SIZE], a2 + LDF [Y + 1 * SIZE], b2 + LDF [X + 2 * SIZE], a3 + LDF [Y + 2 * SIZE], b3 + LDF [X + 3 * SIZE], a4 + LDF [Y + 3 * SIZE], b4 + LDF [X + 4 * SIZE], a5 + LDF [Y + 4 * SIZE], b5 + LDF [X + 5 * SIZE], a6 + LDF [Y + 5 * SIZE], b6 + LDF [X + 6 * SIZE], a7 + LDF [Y + 6 * SIZE], b7 + LDF [X + 7 * SIZE], a8 + LDF [Y + 7 * SIZE], b8 + + FMUL ALPHA, a1, t1 + FMUL ALPHA, a2, t2 + FMUL ALPHA, a3, t3 + FMUL ALPHA, a4, t4 + + FADD b1, t1, c1 + FMUL ALPHA, a5, t1 + FADD b2, t2, c2 + FMUL ALPHA, a6, t2 + + add I, -1, I + cmp I, 0 + ble,pt %icc, .LL12 + nop + +#ifdef DOUBLE +#define PREFETCHSIZE 54 +#else +#define PREFETCHSIZE 108 +#endif + +.LL11: + prefetch [Y + PREFETCHSIZE * SIZE], 0 + + LDF [X + 8 * SIZE], a1 + LDF [X + 9 * SIZE], a2 + LDF [X + 10 * SIZE], a3 + LDF [X + 11 * SIZE], a4 + + FADD b3, t3, c3 + STF c1, [Y + 0 * SIZE] + FMUL ALPHA, a7, t3 + + FADD b4, t4, c4 + STF c2, [Y + 1 * SIZE] + FMUL ALPHA, a8, t4 + + LDF [Y + 8 * SIZE], b1 + LDF [Y + 9 * SIZE], b2 + LDF [Y + 10 * SIZE], b3 + LDF [Y + 11 * SIZE], b4 + + FADD b5, t1, c5 + STF c3, [Y + 2 * SIZE] + FMUL ALPHA, a1, t1 + + FADD b6, t2, c6 + STF c4, [Y + 3 * SIZE] + FMUL ALPHA, a2, t2 + + prefetch [X + PREFETCHSIZE * SIZE], 0 + + LDF [X + 12 * SIZE], a5 + LDF [X + 13 * SIZE], a6 + LDF [X + 14 * SIZE], a7 + LDF [X + 15 * SIZE], a8 + + FADD b7, t3, c7 + STF c5, [Y + 4 
* SIZE] + FMUL ALPHA, a3, t3 + + FADD b8, t4, c8 + STF c6, [Y + 5 * SIZE] + FMUL ALPHA, a4, t4 + + LDF [Y + 12 * SIZE], b5 + LDF [Y + 13 * SIZE], b6 + LDF [Y + 14 * SIZE], b7 + LDF [Y + 15 * SIZE], b8 + + FADD b1, t1, c1 + STF c7, [Y + 6 * SIZE] + FMUL ALPHA, a5, t1 + deccc I + + FADD b2, t2, c2 + STF c8, [Y + 7 * SIZE] + FMUL ALPHA, a6, t2 + add Y, 8 * SIZE, Y + + bg,pt %icc, .LL11 + add X, 8 * SIZE, X + +.LL12: + FADD b3, t3, c3 + FMUL ALPHA, a7, t3 + FADD b4, t4, c4 + FMUL ALPHA, a8, t4 + + FADD b5, t1, c5 + FADD b6, t2, c6 + FADD b7, t3, c7 + FADD b8, t4, c8 + + STF c1, [Y + 0 * SIZE] + STF c2, [Y + 1 * SIZE] + STF c3, [Y + 2 * SIZE] + STF c4, [Y + 3 * SIZE] + + STF c5, [Y + 4 * SIZE] + STF c6, [Y + 5 * SIZE] + STF c7, [Y + 6 * SIZE] + STF c8, [Y + 7 * SIZE] + + add Y, 8 * SIZE, Y + add X, 8 * SIZE, X + + +.LL15: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL19 + nop + +.LL16: + LDF [X + 0 * SIZE], a1 + LDF [Y + 0 * SIZE], b1 + + FMUL ALPHA, a1, t1 + FADD b1, t1, c1 + + add I, -1, I + cmp I, 0 + STF c1, [Y + 0 * SIZE] + add Y, 1 * SIZE, Y + bg,pt %icc, .LL16 + add X, 1 * SIZE, X + +.LL19: + return %i7 + 8 + clr %g0 + +.LL50: + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL55 + mov Y, YY + + LDF [X + 0 * SIZE], a1 + add I, -1, I + add X, INCX, X + LDF [Y + 0 * SIZE], b1 + cmp I, 0 + add Y, INCY, Y + LDF [X + 0 * SIZE], a2 + add X, INCX, X + LDF [Y + 0 * SIZE], b2 + add Y, INCY, Y + LDF [X + 0 * SIZE], a3 + add X, INCX, X + LDF [Y + 0 * SIZE], b3 + add Y, INCY, Y + LDF [X + 0 * SIZE], a4 + add X, INCX, X + LDF [Y + 0 * SIZE], b4 + add Y, INCY, Y + LDF [X + 0 * SIZE], a5 + add X, INCX, X + LDF [Y + 0 * SIZE], b5 + add Y, INCY, Y + LDF [X + 0 * SIZE], a6 + add X, INCX, X + LDF [Y + 0 * SIZE], b6 + add Y, INCY, Y + LDF [X + 0 * SIZE], a7 + add X, INCX, X + LDF [Y + 0 * SIZE], b7 + add Y, INCY, Y + LDF [X + 0 * SIZE], a8 + add X, INCX, X + LDF [Y + 0 * SIZE], b8 + ble,pt %icc, .LL52 + add Y, INCY, Y + + +.LL51: + FMUL ALPHA, a1, t1 + LDF [X + 0 * SIZE], a1 + add X, INCX, X + + FMUL ALPHA, a2, t2 + LDF [X + 0 * SIZE], a2 + add X, INCX, X + + FMUL ALPHA, a3, t3 + LDF [X + 0 * SIZE], a3 + add X, INCX, X + FMUL ALPHA, a4, t4 + LDF [X + 0 * SIZE], a4 + add X, INCX, X + + FADD b1, t1, c1 + LDF [Y + 0 * SIZE], b1 + add Y, INCY, Y + + FMUL ALPHA, a5, t1 + LDF [X + 0 * SIZE], a5 + add X, INCX, X + FADD b2, t2, c2 + LDF [Y + 0 * SIZE], b2 + add Y, INCY, Y + + FMUL ALPHA, a6, t2 + LDF [X + 0 * SIZE], a6 + add X, INCX, X + FADD b3, t3, c3 + LDF [Y + 0 * SIZE], b3 + add Y, INCY, Y + + FMUL ALPHA, a7, t3 + LDF [X + 0 * SIZE], a7 + add X, INCX, X + FADD b4, t4, c4 + LDF [Y + 0 * SIZE], b4 + add Y, INCY, Y + FMUL ALPHA, a8, t4 + LDF [X + 0 * SIZE], a8 + add X, INCX, X + + STF c1, [YY + 0 * SIZE] + add YY, INCY, YY + FADD b5, t1, c1 + STF c2, [YY + 0 * SIZE] + add YY, INCY, YY + FADD b6, t2, c2 + STF c3, [YY + 0 * SIZE] + add YY, INCY, YY + FADD b7, t3, c3 + STF c4, [YY + 0 * SIZE] + add YY, INCY, YY + FADD b8, t4, c4 + + LDF [Y + 0 * SIZE], b5 + add I, -1, I + add Y, INCY, Y + LDF [Y + 0 * SIZE], b6 + cmp I, 0 + add Y, INCY, Y + LDF [Y + 0 * SIZE], b7 + add Y, INCY, Y + LDF [Y + 0 * SIZE], b8 + add Y, INCY, Y + + STF c1, [YY + 0 * SIZE] + add YY, INCY, YY + STF c2, [YY + 0 * SIZE] + add YY, INCY, YY + STF c3, [YY + 0 * SIZE] + add YY, INCY, YY + STF c4, [YY + 0 * SIZE] + + bg,pt %icc, .LL51 + add YY, INCY, YY + +.LL52: + FMUL ALPHA, a1, t1 + FMUL ALPHA, a2, t2 + FMUL ALPHA, a3, t3 + FMUL ALPHA, a4, t4 + + FADD b1, t1, c1 + FMUL ALPHA, a5, t1 + FADD b2, t2, c2 + FMUL ALPHA, a6, t2 + FADD b3, t3, c3 + FMUL ALPHA, 
a7, t3 + FADD b4, t4, c4 + FMUL ALPHA, a8, t4 + + STF c1, [YY + 0 * SIZE] + add YY, INCY, YY + FADD b5, t1, c1 + STF c2, [YY + 0 * SIZE] + add YY, INCY, YY + FADD b6, t2, c2 + STF c3, [YY + 0 * SIZE] + add YY, INCY, YY + FADD b7, t3, c3 + STF c4, [YY + 0 * SIZE] + add YY, INCY, YY + FADD b8, t4, c4 + + STF c1, [YY + 0 * SIZE] + add YY, INCY, YY + STF c2, [YY + 0 * SIZE] + add YY, INCY, YY + STF c3, [YY + 0 * SIZE] + add YY, INCY, YY + STF c4, [YY + 0 * SIZE] + add YY, INCY, YY + +.LL55: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + LDF [X + 0 * SIZE], a1 + LDF [Y + 0 * SIZE], b1 + + FMUL ALPHA, a1, t1 + FADD b1, t1, c1 + + add I, -1, I + cmp I, 0 + STF c1, [Y + 0 * SIZE] + add Y, INCY, Y + bg,pt %icc, .LL56 + add X, INCX, X + +.LL59: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/cabs.S b/kernel/sparc/cabs.S new file mode 100644 index 0000000000..119293e982 --- /dev/null +++ b/kernel/sparc/cabs.S @@ -0,0 +1,58 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + + PROLOGUE + + add %sp, -128, %sp + + LDF [%o0 + 0 * SIZE], %f0 + LDF [%o0 + 1 * SIZE], %f8 + FABS %f8, %f8 + FABS %f0, %f0 + FADD %f0, %f8, %f0 +#if !defined(DOUBLE) && defined(F2CCONV) + fstod %f0, %f0 +#endif + retl + sub %sp, -128, %sp + + EPILOGUE + diff --git a/kernel/sparc/cnrm2.S b/kernel/sparc/cnrm2.S new file mode 100644 index 0000000000..8dc4b56b67 --- /dev/null +++ b/kernel/sparc/cnrm2.S @@ -0,0 +1,329 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
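For orientation, the kernel that ends above is the alpha*x plus y update: a unit-stride path unrolled by eight with software prefetch, and a generic strided path from .LL50 on, both storing back into Y. The short kernel/sparc/cabs.S above returns |Re(z)| + |Im(z)| rather than the Euclidean modulus. A minimal C sketch of the axpy semantics, offered only as an illustration of what the tuned assembly computes and not as part of the imported sources:

/* Illustrative reference for the unrolled SPARC axpy kernel above:
 * y[i*incy] += alpha * x[i*incx] for i = 0 .. n-1.                 */
static void axpy_ref(long n, double alpha,
                     const double *x, long incx,
                     double *y, long incy)
{
    for (long i = 0; i < n; i++)
        y[i * incy] += alpha * x[i * incx];
}

The assembly earns its speed by keeping eight x/y pairs in flight per iteration and overlapping loads, multiplies, and adds; arithmetically it does nothing beyond this loop.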
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N %i0 +#define X %i1 +#define INCX %i2 +#define I %i3 + +#define c1 %f0 +#define c2 %f2 +#define c3 %f4 +#define c4 %f6 +#define t1 %f8 +#define t2 %f10 +#define t3 %f12 +#define t4 %f14 + +#define a1 %f16 +#define a2 %f18 +#define a3 %f20 +#define a4 %f22 +#define a5 %f24 +#define a6 %f26 +#define a7 %f28 +#define a8 %f30 + + PROLOGUE + SAVESP + + FCLR(0) + + FMOV c1, c2 + FMOV c1, c3 + FMOV c1, c4 + FMOV c1, t1 + FMOV c1, t2 + FMOV c1, t3 + FMOV c1, t4 + + cmp INCX, 0 + ble .LL20 + sll INCX, ZBASE_SHIFT, INCX + + cmp N, 0 + ble .LL20 + nop + + cmp INCX, 2 * SIZE + bne .LL50 + nop + + sra N, 2, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + + ld [X + 0 * SIZE], a1 + add I, -1, I + ld [X + 1 * SIZE], a2 + cmp I, 0 + ld [X + 2 * SIZE], a3 + ld [X + 3 * SIZE], a4 + ld [X + 4 * SIZE], a5 + ld [X + 5 * SIZE], a6 + ld [X + 6 * SIZE], a7 + ld [X + 7 * SIZE], a8 + + ble,pt %icc, .LL12 + add X, 8 * SIZE, X + +#define PREFETCHSIZE 40 + +.LL11: + faddd c1, t1, c1 + fsmuld a1, a1, t1 + prefetch [X + PREFETCHSIZE * SIZE], 0 + + faddd c2, t2, c2 + add I, -1, I + fsmuld a2, a2, t2 + ld [X + 0 * SIZE], a1 + + faddd c3, t3, c3 + cmp I, 0 + fsmuld a3, a3, t3 + ld [X + 1 * SIZE], a2 + + faddd c4, t4, c4 + fsmuld a4, a4, t4 + ld [X + 2 * SIZE], a3 + + faddd c1, t1, c1 + fsmuld a5, a5, t1 + ld [X + 3 * SIZE], a4 + + faddd c2, t2, c2 + fsmuld a6, a6, t2 + ld [X + 4 * SIZE], a5 + + faddd c3, t3, c3 + fsmuld a7, a7, t3 + ld [X + 5 * SIZE], a6 + + faddd c4, t4, c4 + ld [X + 6 * SIZE], a7 + fsmuld a8, a8, t4 + add X, 8 * SIZE, X + + bg,pt %icc, .LL11 + ld [X - 1 * SIZE], a8 + +.LL12: + faddd c1, t1, c1 + fsmuld a1, a1, t1 + faddd c2, t2, c2 + fsmuld a2, a2, t2 + + faddd c3, t3, c3 + fsmuld a3, a3, t3 + faddd c4, t4, c4 + 
fsmuld a4, a4, t4 + + faddd c1, t1, c1 + fsmuld a5, a5, t1 + faddd c2, t2, c2 + fsmuld a6, a6, t2 + + faddd c3, t3, c3 + fsmuld a7, a7, t3 + faddd c4, t4, c4 + fsmuld a8, a8, t4 + +.LL15: + and N, 3, I + cmp I, 0 + ble,a,pn %icc, .LL19 + nop + +.LL16: + ld [X + 0 * SIZE], a1 + add I, -1, I + ld [X + 1 * SIZE], a2 + cmp I, 0 + faddd c1, t1, c1 + faddd c2, t2, c2 + fsmuld a1, a1, t1 + fsmuld a2, a2, t2 + bg,pt %icc, .LL16 + add X, 2 * SIZE, X + +.LL19: + faddd c1, t1, c1 + faddd c2, t2, c2 + faddd c3, t3, c3 + faddd c4, t4, c4 + + faddd c1, c2, c1 + faddd c3, c4, c3 + faddd c1, c3, c1 + + fsqrtd c1, c1 + +#if !defined(NEED_F2CCONV) || !defined(F_INTERFACE_F2C) + fdtos c1, c1 +#endif +.LL20: + return %i7 + 8 + clr %g0 + +.LL50: + sra N, 2, I + cmp I, 0 + ble,pn %icc, .LL55 + nop + + ld [X + 0 * SIZE], a1 + ld [X + 1 * SIZE], a2 + add X, INCX, X + ld [X + 0 * SIZE], a3 + ld [X + 1 * SIZE], a4 + add X, INCX, X + ld [X + 0 * SIZE], a5 + ld [X + 1 * SIZE], a6 + add X, INCX, X + add I, -1, I + ld [X + 0 * SIZE], a7 + cmp I, 0 + ld [X + 1 * SIZE], a8 + + ble,pt %icc, .LL52 + add X, INCX, X + +.LL51: + faddd c1, t1, c1 + add I, -1, I + fsmuld a1, a1, t1 + ld [X + 0 * SIZE], a1 + + faddd c2, t2, c2 + cmp I, 0 + fsmuld a2, a2, t2 + ld [X + 1 * SIZE], a2 + add X, INCX, X + + faddd c3, t3, c3 + fsmuld a3, a3, t3 + ld [X + 0 * SIZE], a3 + + faddd c4, t4, c4 + fsmuld a4, a4, t4 + ld [X + 1 * SIZE], a4 + add X, INCX, X + + faddd c1, t1, c1 + fsmuld a5, a5, t1 + ld [X + 0 * SIZE], a5 + + faddd c2, t2, c2 + fsmuld a6, a6, t2 + ld [X + 1 * SIZE], a6 + add X, INCX, X + + faddd c3, t3, c3 + fsmuld a7, a7, t3 + ld [X + 0 * SIZE], a7 + + faddd c4, t4, c4 + fsmuld a8, a8, t4 + ld [X + 1 * SIZE], a8 + bg,pt %icc, .LL51 + add X, INCX, X + +.LL52: + faddd c1, t1, c1 + fsmuld a1, a1, t1 + faddd c2, t2, c2 + fsmuld a2, a2, t2 + + faddd c3, t3, c3 + fsmuld a3, a3, t3 + faddd c4, t4, c4 + fsmuld a4, a4, t4 + + faddd c1, t1, c1 + fsmuld a5, a5, t1 + faddd c2, t2, c2 + fsmuld a6, a6, t2 + + faddd c3, t3, c3 + fsmuld a7, a7, t3 + faddd c4, t4, c4 + fsmuld a8, a8, t4 + +.LL55: + and N, 3, I + cmp I, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + ld [X + 0 * SIZE], a1 + add I, -1, I + ld [X + 1 * SIZE], a2 + cmp I, 0 + faddd c1, t1, c1 + faddd c2, t2, c2 + fsmuld a1, a1, t1 + fsmuld a2, a2, t2 + bg,pt %icc, .LL56 + add X, INCX, X + +.LL59: + faddd c1, t1, c1 + faddd c2, t2, c2 + faddd c3, t3, c3 + faddd c4, t4, c4 + + faddd c1, c2, c1 + faddd c3, c4, c3 + faddd c1, c3, c1 + + fsqrtd c1, c1 + +#if !defined(NEED_F2CCONV) || !defined(F_INTERFACE_F2C) + fdtos c1, c1 +#endif + + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/copy.S b/kernel/sparc/copy.S new file mode 100644 index 0000000000..959d2ff2af --- /dev/null +++ b/kernel/sparc/copy.S @@ -0,0 +1,218 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
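kernel/sparc/cnrm2.S above avoids the usual overflow-safe scaling pass by squaring each single-precision component straight into double precision (fsmuld) and keeping four partial accumulators to hide FPU latency; the sums are reduced, square-rooted in double, and narrowed back to single unless the f2c calling convention asks for a double return. A hedged C sketch of that idea, using two accumulators instead of four for brevity:

/* Sketch of the cnrm2.S strategy: accumulate squares of the float
 * components in double precision so no explicit scaling is needed,
 * then reduce the partial sums and take the square root.          */
#include <math.h>

static float cnrm2_sketch(long n, const float *x, long incx)
{
    double c1 = 0.0, c2 = 0.0;              /* partial accumulators */
    for (long i = 0; i < n; i++) {
        const float *p = x + 2 * i * incx;  /* complex element i    */
        c1 += (double)p[0] * (double)p[0];  /* real part squared    */
        c2 += (double)p[1] * (double)p[1];  /* imag part squared    */
    }
    return (float)sqrt(c1 + c2);
}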
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N %i0 +#define X %i1 +#define INCX %i2 +#define Y %i3 +#define INCY %i4 +#define I %i5 + +#ifdef DOUBLE +#define a1 %f0 +#define a2 %f2 +#define a3 %f4 +#define a4 %f6 +#define a5 %f8 +#define a6 %f10 +#define a7 %f12 +#define a8 %f14 +#define a9 %f16 +#define a10 %f18 +#define a11 %f20 +#define a12 %f22 +#define a13 %f24 +#define a14 %f26 +#define a15 %f28 +#define a16 %f30 +#else +#define a1 %f0 +#define a2 %f1 +#define a3 %f2 +#define a4 %f3 +#define a5 %f4 +#define a6 %f5 +#define a7 %f6 +#define a8 %f7 +#define a9 %f8 +#define a10 %f9 +#define a11 %f10 +#define a12 %f11 +#define a13 %f12 +#define a14 %f13 +#define a15 %f14 +#define a16 %f15 +#endif + + PROLOGUE + SAVESP + + sll INCX, BASE_SHIFT, INCX + sll INCY, BASE_SHIFT, INCY + + cmp INCX, SIZE + bne .LL50 + nop + cmp INCY, SIZE + bne .LL50 + nop + + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + +#define PREFETCHSIZE 32 + +.LL11: + LDF [X + 0 * SIZE], a1 + prefetch [X + PREFETCHSIZE * SIZE], 0 + LDF [X + 1 * SIZE], a2 + LDF [X + 2 * SIZE], a3 + LDF [X + 3 * SIZE], a4 + LDF [X + 4 * SIZE], a5 + LDF [X + 5 * SIZE], a6 + LDF [X + 6 * SIZE], a7 + LDF [X + 7 * SIZE], a8 + + STF a1, [Y + 0 * SIZE] + prefetch [Y + PREFETCHSIZE * SIZE], 0 + STF a2, [Y + 1 * SIZE] + STF a3, [Y + 2 * SIZE] + STF a4, [Y + 3 * SIZE] + STF a5, [Y + 4 * SIZE] + STF a6, [Y + 5 * SIZE] + STF a7, [Y + 6 * SIZE] + STF a8, [Y + 7 * SIZE] + + add I, -1, I + cmp I, 0 + add Y, 8 * SIZE, Y + add X, 8 * SIZE, X + + bg,pt %icc, .LL11 + nop + + +.LL15: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL19 + nop + +.LL16: + LDF [X + 0 * SIZE], a1 + add I, -1, I + cmp I, 0 + add X, 1 * SIZE, X + STF a1, [Y + 0 * SIZE] + bg,pt %icc, .LL16 + add Y, 1 * SIZE, Y + +.LL19: + return %i7 + 8 + clr %g0 + +.LL50: + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL55 + nop + +.LL51: + LDF [X + 0 * SIZE], a1 + add X, INCX, X + LDF [X + 0 * SIZE], a2 + add X, INCX, X + LDF [X + 0 * SIZE], a3 + add X, INCX, X + LDF [X + 0 * SIZE], a4 + add X, INCX, X + LDF [X + 0 * SIZE], a5 + add X, INCX, X + LDF [X + 0 * SIZE], a6 + add X, INCX, X + LDF [X + 0 * SIZE], a7 + add X, INCX, X + LDF [X + 0 * SIZE], a8 + add X, INCX, X + + STF a1, [Y + 0 * SIZE] + add Y, INCY, Y + add I, -1, I + STF a2, [Y + 0 * SIZE] + add Y, INCY, Y + cmp I, 0 + STF a3, [Y + 0 * SIZE] + add Y, 
INCY, Y + STF a4, [Y + 0 * SIZE] + add Y, INCY, Y + STF a5, [Y + 0 * SIZE] + add Y, INCY, Y + STF a6, [Y + 0 * SIZE] + add Y, INCY, Y + STF a7, [Y + 0 * SIZE] + add Y, INCY, Y + STF a8, [Y + 0 * SIZE] + + bg,pt %icc, .LL51 + add Y, INCY, Y + +.LL55: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + LDF [X + 0 * SIZE], a1 + add I, -1, I + cmp I, 0 + add X, INCX, X + STF a1, [Y + 0 * SIZE] + bg,pt %icc, .LL56 + add Y, INCY, Y + +.LL59: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/dnrm2.S b/kernel/sparc/dnrm2.S new file mode 100644 index 0000000000..8063e23dac --- /dev/null +++ b/kernel/sparc/dnrm2.S @@ -0,0 +1,675 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
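kernel/sparc/copy.S above is a plain block copy, unrolled by eight on unit strides and element by element otherwise. kernel/sparc/dnrm2.S, which follows, works in the data's own precision, so it makes two passes: the first reduces max|x_i| with FCMP/FMOVG, the second accumulates (x_i * (1/max))^2, and the result is max * sqrt(sum), which keeps the intermediate sum in range. A short C sketch of that scheme, shown for the double-precision case as an illustration only:

/* Two-pass nrm2 in the style of dnrm2.S: scale by the largest
 * magnitude before squaring so the sum of squares stays in range. */
#include <math.h>

static double dnrm2_sketch(long n, const double *x, long incx)
{
    double xmax = 0.0;
    for (long i = 0; i < n; i++) {            /* pass 1: max |x[i]|     */
        double a = fabs(x[i * incx]);
        if (a > xmax) xmax = a;
    }
    if (xmax == 0.0) return 0.0;              /* all-zero input         */

    double scale = 1.0 / xmax, sum = 0.0;
    for (long i = 0; i < n; i++) {            /* pass 2: sum of squares */
        double t = x[i * incx] * scale;
        sum += t * t;
    }
    return xmax * sqrt(sum);
}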
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N %i0 +#define X %i1 +#define INCX %i2 +#define I %i3 +#define XX %i4 + +#ifdef DOUBLE +#define c1 %f0 +#define c2 %f2 +#define c3 %f4 +#define c4 %f6 +#define t1 %f8 +#define t2 %f10 +#define t3 %f12 +#define t4 %f14 + +#define a1 %f16 +#define a2 %f18 +#define a3 %f20 +#define a4 %f22 +#define a5 %f24 +#define a6 %f26 +#define a7 %f28 +#define a8 %f30 +#define fmax %f32 +#define fzero %f34 +#define fone %f36 +#else +#define c1 %f0 +#define c2 %f1 +#define c3 %f2 +#define c4 %f3 +#define t1 %f4 +#define t2 %f5 +#define t3 %f6 +#define t4 %f7 + +#define a1 %f8 +#define a2 %f9 +#define a3 %f10 +#define a4 %f11 +#define a5 %f12 +#define a6 %f13 +#define a7 %f14 +#define a8 %f15 +#define fmax %f16 +#define fzero %f17 +#define fone %f18 +#endif + + PROLOGUE + SAVESP + +#ifdef DOUBLE + FCLR(3) +#else + FCLR(17) +#endif + + mov X, XX + mov 0x3ff, %g1 + sll %g1, 20, %g1 + + cmp N, 0 + ble .LL99 + FMOV fzero, c1 + + cmp INCX, 0 + ble .LL99 + sll INCX, BASE_SHIFT, INCX + + add %sp, -8, %sp + st %g1, [%sp + STACK_START + 0] + st %g0, [%sp + STACK_START + 4] + + add N, -1, N + LDF [X], c4 + add X, INCX, X + + LDF [%sp + STACK_START], fone + add %sp, 8, %sp + + FABS c4, c1 + FABS c4, c2 + FABS c4, c3 + FABS c4, c4 + + cmp INCX, SIZE + bne .LL100 + nop + + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + LDF [X + 2 * SIZE], a3 + LDF [X + 3 * SIZE], a4 + + LDF [X + 4 * SIZE], a5 + add I, -1, I + LDF [X + 5 * SIZE], a6 + cmp I, 0 + LDF [X + 6 * SIZE], a7 + LDF [X + 7 * SIZE], a8 + + ble,pt %icc, .LL12 + add X, 8 * SIZE, X + +#define PREFETCHSIZE 40 + +.LL11: + FABS a1, t1 + prefetch [X + PREFETCHSIZE * SIZE], 0 + FABS a2, t2 + LDF [X + 0 * SIZE], a1 + FABS a3, t3 + LDF [X + 1 * SIZE], a2 + FABS a4, t4 + LDF [X + 2 * SIZE], a3 + + FCMP %fcc0, t1, c1 + LDF [X + 3 * SIZE], a4 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FMOVG %fcc0, t1, c1 + FMOVG %fcc1, t2, c2 + FMOVG %fcc2, t3, c3 + FMOVG %fcc3, t4, c4 + + FABS a5, t1 + LDF [X + 4 * SIZE], a5 + FABS a6, t2 + LDF [X + 5 * SIZE], a6 + FABS a7, t3 + LDF [X + 6 * SIZE], a7 + FABS a8, t4 + LDF [X + 7 * SIZE], a8 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FMOVG %fcc0, t1, c1 + add I, -1, I + FMOVG %fcc1, t2, c2 + cmp I, 0 + FMOVG %fcc2, t3, c3 + FMOVG %fcc3, t4, c4 + + bg,pt %icc, .LL11 + add X, 8 * SIZE, X + +.LL12: + FABS a1, t1 + FABS a2, t2 + FABS a3, t3 + FABS a4, t4 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FMOVG %fcc0, t1, c1 + FMOVG %fcc1, t2, c2 + FMOVG %fcc2, t3, c3 + FMOVG %fcc3, t4, c4 + + FABS a5, t1 + FABS a6, t2 + FABS a7, t3 + FABS a8, t4 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FMOVG %fcc0, t1, c1 + FMOVG %fcc1, t2, c2 + FMOVG %fcc2, t3, c3 + FMOVG %fcc3, t4, c4 + +.LL15: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL19 + nop + +.LL16: + LDF [X + 0 * SIZE], a1 + FABS a1, t1 + + FCMP %fcc0, t1, c1 + FMOVG %fcc0, t1, c1 + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL16 + add X, 1 * SIZE, X + +.LL19: + + FCMP %fcc0, c2, c1 + FCMP %fcc1, c4, c3 + mov XX, X + FMOVG %fcc0, c2, c1 + FMOVG %fcc1, c4, c3 + FCMP %fcc0, c3, c1 + FMOVG %fcc0, c3, c1 + + FCMP c1, fzero + fbe .LL99 + nop + + FMOV c1, fmax + add N, 1, N + FDIV fone, c1, fone + + FMOV fzero, c1 + FMOV fzero, c2 + FMOV fzero, c3 + FMOV fzero, c4 + + sra N, 
3, I + cmp I, 0 + ble,pn %icc, .LL35 + nop + + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + LDF [X + 2 * SIZE], a3 + LDF [X + 3 * SIZE], a4 + + LDF [X + 4 * SIZE], a5 + add I, -1, I + LDF [X + 5 * SIZE], a6 + cmp I, 0 + LDF [X + 6 * SIZE], a7 + LDF [X + 7 * SIZE], a8 + + ble,pt %icc, .LL32 + add X, 8 * SIZE, X + +.LL31: + FMUL fone, a1, t1 + prefetch [X + PREFETCHSIZE * SIZE], 0 + FMUL fone, a2, t2 + LDF [X + 0 * SIZE], a1 + FMUL fone, a3, t3 + LDF [X + 1 * SIZE], a2 + FMUL fone, a4, t4 + LDF [X + 2 * SIZE], a3 + + FMUL t1, t1, t1 + LDF [X + 3 * SIZE], a4 + FMUL t2, t2, t2 + FMUL t3, t3, t3 + FMUL t4, t4, t4 + + FADD c1, t1, c1 + FMUL fone, a5, t1 + LDF [X + 4 * SIZE], a5 + FADD c2, t2, c2 + FMUL fone, a6, t2 + LDF [X + 5 * SIZE], a6 + FADD c3, t3, c3 + FMUL fone, a7, t3 + LDF [X + 6 * SIZE], a7 + FADD c4, t4, c4 + FMUL fone, a8, t4 + LDF [X + 7 * SIZE], a8 + + FMUL t1, t1, t1 + FMUL t2, t2, t2 + FMUL t3, t3, t3 + FMUL t4, t4, t4 + + FADD c1, t1, c1 + add I, -1, I + FADD c2, t2, c2 + cmp I, 0 + FADD c3, t3, c3 + FADD c4, t4, c4 + + bg,pt %icc, .LL31 + add X, 8 * SIZE, X + +.LL32: + FMUL fone, a1, t1 + FMUL fone, a2, t2 + FMUL fone, a3, t3 + FMUL fone, a4, t4 + + FMUL t1, t1, t1 + FMUL t2, t2, t2 + FMUL t3, t3, t3 + FMUL t4, t4, t4 + + FADD c1, t1, c1 + FMUL fone, a5, t1 + FADD c2, t2, c2 + FMUL fone, a6, t2 + FADD c3, t3, c3 + FMUL fone, a7, t3 + FADD c4, t4, c4 + FMUL fone, a8, t4 + + FMUL t1, t1, t1 + FMUL t2, t2, t2 + FMUL t3, t3, t3 + FMUL t4, t4, t4 + + FADD c1, t1, c1 + FADD c2, t2, c2 + FADD c3, t3, c3 + FADD c4, t4, c4 + +.LL35: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL39 + nop + +.LL36: + LDF [X + 0 * SIZE], a1 + FMUL fone, a1, t1 + FMUL t1, t1, t1 + FADD c1, t1, c1 + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL36 + add X, 1 * SIZE, X + +.LL39: + FADD c1, c2, c1 + FADD c3, c4, c3 + FADD c1, c3, c1 + + FSQRT c1, c1 + FMUL fmax, c1, c1 + +.LL99: + return %i7 + 8 + clr %g0 + +.LL100: + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL105 + nop + + LDF [X + 0 * SIZE], a1 + add X, INCX, X + LDF [X + 0 * SIZE], a2 + add X, INCX, X + LDF [X + 0 * SIZE], a3 + add X, INCX, X + LDF [X + 0 * SIZE], a4 + add X, INCX, X + LDF [X + 0 * SIZE], a5 + add X, INCX, X + LDF [X + 0 * SIZE], a6 + add X, INCX, X + add I, -1, I + LDF [X + 0 * SIZE], a7 + cmp I, 0 + add X, INCX, X + LDF [X + 0 * SIZE], a8 + ble,pt %icc, .LL102 + add X, INCX, X + +.LL101: + FABS a1, t1 + LDF [X + 0 * SIZE], a1 + add X, INCX, X + FABS a2, t2 + LDF [X + 0 * SIZE], a2 + add X, INCX, X + FABS a3, t3 + LDF [X + 0 * SIZE], a3 + add X, INCX, X + FABS a4, t4 + LDF [X + 0 * SIZE], a4 + add X, INCX, X + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FMOVG %fcc0, t1, c1 + FMOVG %fcc1, t2, c2 + FMOVG %fcc2, t3, c3 + FMOVG %fcc3, t4, c4 + + FABS a5, t1 + LDF [X + 0 * SIZE], a5 + add X, INCX, X + FABS a6, t2 + LDF [X + 0 * SIZE], a6 + add X, INCX, X + FABS a7, t3 + LDF [X + 0 * SIZE], a7 + add X, INCX, X + FABS a8, t4 + LDF [X + 0 * SIZE], a8 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FMOVG %fcc0, t1, c1 + add I, -1, I + FMOVG %fcc1, t2, c2 + cmp I, 0 + FMOVG %fcc2, t3, c3 + FMOVG %fcc3, t4, c4 + + bg,pt %icc, .LL101 + add X, INCX, X + +.LL102: + FABS a1, t1 + FABS a2, t2 + FABS a3, t3 + FABS a4, t4 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FMOVG %fcc0, t1, c1 + FMOVG %fcc1, t2, c2 + FMOVG %fcc2, t3, c3 + FMOVG %fcc3, t4, c4 + + FABS a5, t1 + FABS a6, t2 + FABS a7, t3 + FABS a8, t4 + + FCMP %fcc0, t1, c1 
+ FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FMOVG %fcc0, t1, c1 + FMOVG %fcc1, t2, c2 + FMOVG %fcc2, t3, c3 + FMOVG %fcc3, t4, c4 + +.LL105: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL109 + nop + +.LL106: + LDF [X + 0 * SIZE], a1 + FABS a1, t1 + FCMP %fcc0, t1, c1 + FMOVG %fcc0, t1, c1 + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL106 + add X, INCX, X + +.LL109: + FCMP %fcc0, c2, c1 + FCMP %fcc1, c4, c3 + mov XX, X + FMOVG %fcc0, c2, c1 + FMOVG %fcc1, c4, c3 + FCMP %fcc0, c3, c1 + FMOVG %fcc0, c3, c1 + + FCMP c1, fzero + fbe .LL99 + nop + + FMOV c1, fmax + FDIV fone, c1, fone + + FMOV fzero, c1 + FMOV fzero, c2 + FMOV fzero, c3 + FMOV fzero, c4 + + add N, 1, N + + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL135 + nop + + LDF [X + 0 * SIZE], a1 + add X, INCX, X + LDF [X + 0 * SIZE], a2 + add X, INCX, X + LDF [X + 0 * SIZE], a3 + add X, INCX, X + LDF [X + 0 * SIZE], a4 + add X, INCX, X + + LDF [X + 0 * SIZE], a5 + add X, INCX, X + add I, -1, I + LDF [X + 0 * SIZE], a6 + add X, INCX, X + cmp I, 0 + LDF [X + 0 * SIZE], a7 + add X, INCX, X + LDF [X + 0 * SIZE], a8 + + ble,pt %icc, .LL132 + add X, INCX, X + +.LL131: + FMUL fone, a1, t1 + prefetch [X + PREFETCHSIZE * SIZE], 0 + FMUL fone, a2, t2 + LDF [X + 0 * SIZE], a1 + add X, INCX, X + FMUL fone, a3, t3 + LDF [X + 0 * SIZE], a2 + add X, INCX, X + FMUL fone, a4, t4 + LDF [X + 0 * SIZE], a3 + add X, INCX, X + + FMUL t1, t1, t1 + LDF [X + 0 * SIZE], a4 + add X, INCX, X + FMUL t2, t2, t2 + FMUL t3, t3, t3 + FMUL t4, t4, t4 + + FADD c1, t1, c1 + FMUL fone, a5, t1 + LDF [X + 0 * SIZE], a5 + add X, INCX, X + FADD c2, t2, c2 + FMUL fone, a6, t2 + LDF [X + 0 * SIZE], a6 + add X, INCX, X + FADD c3, t3, c3 + FMUL fone, a7, t3 + LDF [X + 0 * SIZE], a7 + add X, INCX, X + FADD c4, t4, c4 + FMUL fone, a8, t4 + LDF [X + 0 * SIZE], a8 + + FMUL t1, t1, t1 + FMUL t2, t2, t2 + FMUL t3, t3, t3 + FMUL t4, t4, t4 + + FADD c1, t1, c1 + add I, -1, I + FADD c2, t2, c2 + cmp I, 0 + FADD c3, t3, c3 + FADD c4, t4, c4 + + bg,pt %icc, .LL131 + add X, INCX, X + +.LL132: + FMUL fone, a1, t1 + FMUL fone, a2, t2 + FMUL fone, a3, t3 + FMUL fone, a4, t4 + + FMUL t1, t1, t1 + FMUL t2, t2, t2 + FMUL t3, t3, t3 + FMUL t4, t4, t4 + + FADD c1, t1, c1 + FMUL fone, a5, t1 + FADD c2, t2, c2 + FMUL fone, a6, t2 + FADD c3, t3, c3 + FMUL fone, a7, t3 + FADD c4, t4, c4 + FMUL fone, a8, t4 + + FMUL t1, t1, t1 + FMUL t2, t2, t2 + FMUL t3, t3, t3 + FMUL t4, t4, t4 + + FADD c1, t1, c1 + FADD c2, t2, c2 + FADD c3, t3, c3 + FADD c4, t4, c4 + +.LL135: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL139 + nop + +.LL136: + LDF [X + 0 * SIZE], a1 + FMUL fone, a1, t1 + FMUL t1, t1, t1 + FADD c1, t1, c1 + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL136 + add X, INCX, X + +.LL139: + FADD c1, c2, c1 + FADD c3, c4, c3 + FADD c1, c3, c1 + + FSQRT c1, c1 + FMUL fmax, c1, c1 + + return %i7 + 8 + clr %g0 + + EPILOGUE diff --git a/kernel/sparc/dot.S b/kernel/sparc/dot.S new file mode 100644 index 0000000000..f89d5f95e3 --- /dev/null +++ b/kernel/sparc/dot.S @@ -0,0 +1,423 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N %i0 +#define X %i1 +#define INCX %i2 +#define Y %i3 +#define INCY %i4 +#define I %i5 + +#ifdef DOUBLE +#define c1 %f0 +#define c2 %f2 +#define t1 %f4 +#define t2 %f6 + +#define a1 %f16 +#define a2 %f18 +#define a3 %f20 +#define a4 %f22 +#define a5 %f24 +#define a6 %f26 +#define a7 %f28 +#define a8 %f30 + +#define b1 %f32 +#define b2 %f34 +#define b3 %f36 +#define b4 %f38 +#define b5 %f40 +#define b6 %f42 +#define b7 %f44 +#define b8 %f46 +#else +#define c1 %f0 +#define c2 %f1 +#define t1 %f4 +#define t2 %f5 + +#define a1 %f8 +#define a2 %f9 +#define a3 %f10 +#define a4 %f11 +#define a5 %f12 +#define a6 %f13 +#define a7 %f14 +#define a8 %f15 + +#define b1 %f16 +#define b2 %f17 +#define b3 %f18 +#define b4 %f19 +#define b5 %f20 +#define b6 %f21 +#define b7 %f22 +#define b8 %f23 +#endif + + PROLOGUE + SAVESP + +#ifdef DOUBLE + FCLR(0) + FCLR(2) + FCLR(4) + FCLR(6) +#else + FCLR(0) + FCLR(1) + FCLR(4) + FCLR(5) +#endif + + cmp N, 0 + ble .LL19 + nop + + sll INCX, BASE_SHIFT, INCX + sll INCY, BASE_SHIFT, INCY + + cmp INCX, SIZE + bne .LL50 + nop + + cmp INCY, SIZE + bne .LL50 + nop + + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + + LDF [X + 0 * SIZE], a1 + LDF [Y + 0 * SIZE], b1 + + LDF [X + 1 * SIZE], a2 + LDF [Y + 1 * SIZE], b2 + + LDF [X + 2 * SIZE], a3 + LDF [Y + 2 * SIZE], b3 + + LDF [X + 3 * SIZE], a4 + LDF [Y + 3 * SIZE], b4 + + LDF [X + 4 * SIZE], a5 + LDF [Y + 4 * SIZE], b5 + + LDF [X + 5 * SIZE], a6 + LDF [Y + 5 * SIZE], b6 + + LDF [X + 6 * SIZE], a7 + add I, -1, I + LDF [Y + 6 * SIZE], b7 + cmp I, 0 + + LDF [X + 7 * SIZE], a8 + add X, 8 * SIZE, X + LDF [Y + 7 * SIZE], b8 + add Y, 8 * SIZE, Y + + ble,pt %icc, .LL12 + nop + +#define PREFETCHSIZE 40 + +.LL11: + prefetch [X + PREFETCHSIZE * SIZE], 0 + FADD c1, t1, c1 + prefetch [Y + PREFETCHSIZE * SIZE], 0 + FMUL a1, b1, t1 + + LDF [X + 0 * SIZE], a1 + FADD c2, t2, c2 + FMUL a2, b2, t2 + LDF [Y + 0 * SIZE], b1 + add I, -1, I + + LDF [X + 1 * SIZE], a2 + FADD c1, t1, c1 + FMUL a3, b3, t1 + LDF [Y + 1 * SIZE], b2 + cmp I, 0 + + LDF [X + 2 * SIZE], a3 + FADD c2, t2, c2 + 
FMUL a4, b4, t2 + LDF [Y + 2 * SIZE], b3 + + LDF [X + 3 * SIZE], a4 + FADD c1, t1, c1 + FMUL a5, b5, t1 + LDF [Y + 3 * SIZE], b4 + + LDF [X + 4 * SIZE], a5 + FADD c2, t2, c2 + FMUL a6, b6, t2 + LDF [Y + 4 * SIZE], b5 + + LDF [X + 5 * SIZE], a6 + FADD c1, t1, c1 + FMUL a7, b7, t1 + LDF [Y + 5 * SIZE], b6 + + LDF [X + 6 * SIZE], a7 + FADD c2, t2, c2 + FMUL a8, b8, t2 + LDF [Y + 6 * SIZE], b7 + add Y, 8 * SIZE, Y + + LDF [X + 7 * SIZE], a8 + add X, 8 * SIZE, X + bg,pt %icc, .LL11 + LDF [Y - 1 * SIZE], b8 + +.LL12: + FADD c1, t1, c1 + FMUL a1, b1, t1 + + FADD c2, t2, c2 + FMUL a2, b2, t2 + + FADD c1, t1, c1 + FMUL a3, b3, t1 + FADD c2, t2, c2 + FMUL a4, b4, t2 + + FADD c1, t1, c1 + FMUL a5, b5, t1 + FADD c2, t2, c2 + FMUL a6, b6, t2 + + FADD c1, t1, c1 + FMUL a7, b7, t1 + FADD c2, t2, c2 + FMUL a8, b8, t2 + +.LL15: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL19 + nop + +.LL16: + LDF [X + 0 * SIZE], a1 + add I, -1, I + LDF [Y + 0 * SIZE], b1 + cmp I, 0 + add X, 1 * SIZE, X + FADD c1, t1, c1 + FMUL a1, b1, t1 + bg,pt %icc, .LL16 + add Y, 1 * SIZE, Y + +.LL19: + FADD c1, t1, c1 + FADD c2, t2, c2 + FADD c1, c2, c1 + + return %i7 + 8 + nop + +.LL50: + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL55 + nop + + LDF [X + 0 * SIZE], a1 + add X, INCX, X + LDF [Y + 0 * SIZE], b1 + add Y, INCY, Y + + LDF [X + 0 * SIZE], a2 + add X, INCX, X + LDF [Y + 0 * SIZE], b2 + add Y, INCY, Y + + LDF [X + 0 * SIZE], a3 + add X, INCX, X + LDF [Y + 0 * SIZE], b3 + add Y, INCY, Y + + LDF [X + 0 * SIZE], a4 + add X, INCX, X + LDF [Y + 0 * SIZE], b4 + add Y, INCY, Y + + LDF [X + 0 * SIZE], a5 + add X, INCX, X + LDF [Y + 0 * SIZE], b5 + add Y, INCY, Y + + LDF [X + 0 * SIZE], a6 + add X, INCX, X + LDF [Y + 0 * SIZE], b6 + add Y, INCY, Y + + LDF [X + 0 * SIZE], a7 + add X, INCX, X + LDF [Y + 0 * SIZE], b7 + add Y, INCY, Y + + LDF [X + 0 * SIZE], a8 + add X, INCX, X + LDF [Y + 0 * SIZE], b8 + add Y, INCY, Y + + add I, -1, I + cmp I, 0 + ble,pt %icc, .LL52 + nop + +.LL51: + FADD c1, t1, c1 + FMUL a1, b1, t1 + + LDF [X + 0 * SIZE], a1 + FADD c2, t2, c2 + add X, INCX, X + FMUL a2, b2, t2 + LDF [Y + 0 * SIZE], b1 + add Y, INCY, Y + + LDF [X + 0 * SIZE], a2 + FADD c1, t1, c1 + add X, INCX, X + FMUL a3, b3, t1 + LDF [Y + 0 * SIZE], b2 + add Y, INCY, Y + add I, -1, I + + LDF [X + 0 * SIZE], a3 + add X, INCX, X + FADD c2, t2, c2 + FMUL a4, b4, t2 + LDF [Y + 0 * SIZE], b3 + add Y, INCY, Y + cmp I, 0 + + LDF [X + 0 * SIZE], a4 + add X, INCX, X + FADD c1, t1, c1 + FMUL a5, b5, t1 + LDF [Y + 0 * SIZE], b4 + add Y, INCY, Y + + LDF [X + 0 * SIZE], a5 + add X, INCX, X + FADD c2, t2, c2 + FMUL a6, b6, t2 + LDF [Y + 0 * SIZE], b5 + add Y, INCY, Y + + LDF [X + 0 * SIZE], a6 + add X, INCX, X + FADD c1, t1, c1 + FMUL a7, b7, t1 + LDF [Y + 0 * SIZE], b6 + add Y, INCY, Y + + LDF [X + 0 * SIZE], a7 + add X, INCX, X + FADD c2, t2, c2 + FMUL a8, b8, t2 + LDF [Y + 0 * SIZE], b7 + add Y, INCY, Y + + LDF [X + 0 * SIZE], a8 + add X, INCX, X + LDF [Y + 0 * SIZE], b8 + bg,pt %icc, .LL51 + add Y, INCY, Y + +.LL52: + FADD c1, t1, c1 + FMUL a1, b1, t1 + FADD c2, t2, c2 + FMUL a2, b2, t2 + + FADD c1, t1, c1 + FMUL a3, b3, t1 + FADD c2, t2, c2 + FMUL a4, b4, t2 + + FADD c1, t1, c1 + FMUL a5, b5, t1 + FADD c2, t2, c2 + FMUL a6, b6, t2 + + FADD c1, t1, c1 + FMUL a7, b7, t1 + FADD c2, t2, c2 + FMUL a8, b8, t2 + +.LL55: + and N, 7, I + cmp I, 0 + ble %icc, .LL59 + nop + +.LL56: + LDF [X + 0 * SIZE], a1 + LDF [Y + 0 * SIZE], b1 + add X, INCX, X + add Y, INCY, Y + + FADD c1, t1, c1 + FMUL a1, b1, t1 + + addcc I, -1, I + bg %icc, .LL56 + nop + + +.LL59: + FADD c1, t1, c1 
+ FADD c2, t2, c2 + + FADD c1, c2, c1 + + return %i7 + 8 + nop + + EPILOGUE diff --git a/kernel/sparc/gemm_kernel.S b/kernel/sparc/gemm_kernel.S new file mode 100644 index 0000000000..b6632439cb --- /dev/null +++ b/kernel/sparc/gemm_kernel.S @@ -0,0 +1,3054 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
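kernel/sparc/dot.S above interleaves two accumulator/temporary pairs (c1/t1 and c2/t2) so each FADD consumes the product from the previous step while the next FMUL is already in flight, and the two partial sums are combined at the end; the strided path mirrors the unit-stride one. Its plain C meaning, sketched for the double case as a reference only:

/* Reference semantics of dot.S: sum of x[i]*y[i] kept in two
 * partial sums, mirroring the c1/c2 accumulators in the assembly. */
static double dot_sketch(long n, const double *x, long incx,
                         const double *y, long incy)
{
    double c1 = 0.0, c2 = 0.0;
    long i = 0;
    for (; i + 1 < n; i += 2) {          /* two independent chains */
        c1 += x[i * incx]       * y[i * incy];
        c2 += x[(i + 1) * incx] * y[(i + 1) * incy];
    }
    if (i < n)                           /* odd tail element       */
        c1 += x[i * incx] * y[i * incy];
    return c1 + c2;
}

The gemm_kernel.S that begins above applies the same discipline at level 3: a 4x4 block of C is held in registers (c01 through c16), the K loop is unrolled by four with a1..a5 and b1..b5 staging the next operands, and separate edge paths handle the remaining M & 2, M & 1, and N & 2 cases, with the TRMMKERNEL variants selected by the preprocessor.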
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %i0 +#define N %i1 +#define K %i2 + +#if defined(DOUBLE) && !defined(__64BIT__) +#define A %i5 +#define B %i4 +#else +#define A %i4 +#define B %i5 +#endif + +#define C %o4 +#define LDC %o5 + +#define AO %l0 +#define BO %l1 +#define I %l2 +#define J %l3 +#define L %l4 + +#define C1 %o0 +#define C2 %o1 +#define C3 %o2 +#define C4 %o3 + +#define OFFSET %l5 +#define KK %l6 +#define TEMP1 %l7 +#define TEMP2 %i3 + +#ifdef DOUBLE +#define c01 %f0 +#define c02 %f2 +#define c03 %f4 +#define c04 %f6 +#define c05 %f8 +#define c06 %f10 +#define c07 %f12 +#define c08 %f14 +#define c09 %f16 +#define c10 %f18 +#define c11 %f20 +#define c12 %f22 +#define c13 %f24 +#define c14 %f26 +#define c15 %f28 +#define c16 %f30 + +#define t1 %f32 +#define t2 %f34 +#define t3 %f36 +#define t4 %f38 + +#define a1 %f40 +#define a2 %f42 +#define a3 %f44 +#define a4 %f46 +#define a5 %f58 + +#define b1 %f48 +#define b2 %f50 +#define b3 %f52 +#define b4 %f54 +#define b5 %f56 + +#define FZERO %f60 +#define ALPHA %f62 +#else +#define c01 %f0 +#define c02 %f1 +#define c03 %f2 +#define c04 %f3 +#define c05 %f4 +#define c06 %f5 +#define c07 %f6 +#define c08 %f7 +#define c09 %f8 +#define c10 %f9 +#define c11 %f10 +#define c12 %f11 +#define c13 %f12 +#define c14 %f13 +#define c15 %f14 +#define c16 %f15 + +#define t1 %f16 +#define t2 %f17 +#define t3 %f18 +#define t4 %f19 + +#define a1 %f20 +#define a2 %f21 +#define a3 %f22 +#define a4 %f23 +#define a5 %f31 + +#define b1 %f24 +#define b2 %f25 +#define b3 %f26 +#define b4 %f27 +#define b5 %f28 + +#define FZERO %f29 +#define ALPHA %f30 +#endif + + PROLOGUE + SAVESP + nop + +#ifndef __64BIT__ + +#ifdef DOUBLE + st %i3, [%sp + STACK_START + 16] /* ALPHA */ + st %i4, [%sp + STACK_START + 20] + + ld [%sp + STACK_START + 28], B + ld [%sp + STACK_START + 32], C + ld [%sp + STACK_START + 36], LDC +#ifdef TRMMKERNEL + ld [%sp + STACK_START + 40], OFFSET +#endif +#else + st %i3, [%sp + STACK_START + 16] /* ALPHA */ + + ld [%sp + STACK_START + 28], C + ld [%sp + STACK_START + 32], LDC +#ifdef TRMMKERNEL + ld [%sp + STACK_START + 36], OFFSET +#endif +#endif + LDF [%sp + STACK_START + 16], ALPHA +#else + ldx [%sp+ STACK_START + 56], C + ldx [%sp+ STACK_START + 64], LDC +#ifdef TRMMKERNEL + ldx [%sp+ STACK_START + 72], OFFSET +#endif +#ifdef DOUBLE + FMOV %f6, ALPHA +#else + FMOV %f7, ALPHA +#endif +#endif + + FCLR(29) + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg OFFSET, KK +#endif + + sra N, 2, J + cmp J, 0 + ble,pn %icc, .LL100 + sll LDC, BASE_SHIFT, LDC + +.LL11: + add C, LDC, C2 + FMOV FZERO, t1 + nop + mov C, C1 + + add C2, LDC, C3 + FMOV FZERO, t2 + sra K, 2, L + mov A, AO + + sra M, 2, I + add C3, LDC, C4 + FMOV FZERO, t3 + +#if defined(TRMMKERNEL) && defined(LEFT) + mov OFFSET, KK +#endif + + cmp I, 0 + add C4, LDC, C + FMOV FZERO, t4 + + ble,pn %icc, .LL50 + FMOV FZERO, c01 + +.LL21: +#if !defined(TRMMKERNEL) + FMOV FZERO, c02 + mov B, BO + + FMOV FZERO, c03 + cmp L, 0 +#else + FMOV FZERO, c02 + FMOV FZERO, c03 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov B, BO +#else + sll KK, 2 + BASE_SHIFT, TEMP1 + + add AO, TEMP1, AO + add B, TEMP1, BO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 4, L +#else + add KK, 4, L +#endif + sra L, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c04 + LDF [BO + 0 * SIZE], b1 + 
FMOV FZERO, c05 + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c06 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, c07 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c08 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, c09 + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c10 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, c11 + LDF [BO + 4 * SIZE], b5 /* ***** */ + + LDF [AO + 4 * SIZE], a5 /* ***** */ + + prefetch [C1 + 3 * SIZE], 3 + FMOV FZERO, c12 + prefetch [C2 + 3 * SIZE], 3 + FMOV FZERO, c13 + prefetch [C3 + 3 * SIZE], 3 + FMOV FZERO, c14 + prefetch [C4 + 3 * SIZE], 3 + FMOV FZERO, c15 + + ble,pn %icc, .LL25 + FMOV FZERO, c16 + + +#define APREFETCHSIZE 40 +#define BPREFETCHSIZE 40 + +#define APREFETCH_CATEGORY 0 +#define BPREFETCH_CATEGORY 0 + +.LL22: + FADD c04, t1, c04 + prefetch [AO + APREFETCHSIZE * SIZE], APREFETCH_CATEGORY + FMUL a1, b1, t1 + nop + + FADD c08, t2, c08 + prefetch [BO + BPREFETCHSIZE * SIZE], BPREFETCH_CATEGORY + FMUL a1, b2, t2 + add AO, 16 * SIZE, AO + + FADD c12, t3, c12 + LDF [AO - 13 * SIZE], a4 + FMUL a1, b3, t3 + add BO, 16 * SIZE, BO + + FADD c16, t4, c16 + nop + FMUL a1, b4, t4 + LDF [AO - 8 * SIZE], a1 + + FADD c01, t1, c01 + nop + FMUL a2, b1, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD c13, t4, c13 + add L, -1, L + FMUL a2, b4, t4 + LDF [AO - 11 * SIZE], a2 + + FADD c02, t1, c02 + nop + FMUL a3, b1, t1 + nop + + FADD c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO - 10 * SIZE], a3 + + FADD c03, t1, c03 + nop + FMUL a4, b1, t1 + LDF [BO - 8 * SIZE], b1 + + FADD c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO - 11 * SIZE], b2 + + FADD c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO - 10 * SIZE], b3 + + FADD c15, t4, c15 + nop + FMUL a4, b4, t4 + LDF [BO - 9 * SIZE], b4 + + FADD c04, t1, c04 + nop + FMUL a5, b5, t1 + LDF [AO - 9 * SIZE], a4 + + FADD c08, t2, c08 + nop + FMUL a5, b2, t2 + nop + + FADD c12, t3, c12 + nop + FMUL a5, b3, t3 + nop + + FADD c16, t4, c16 + nop + FMUL a5, b4, t4 + LDF [AO - 4 * SIZE], a5 + + FADD c01, t1, c01 + nop + FMUL a2, b5, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD c13, t4, c13 + nop + FMUL a2, b4, t4 + LDF [AO - 7 * SIZE], a2 + + FADD c02, t1, c02 + nop + FMUL a3, b5, t1 + nop + + FADD c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO - 6 * SIZE], a3 + + FADD c03, t1, c03 + nop + FMUL a4, b5, t1 + LDF [BO - 4 * SIZE], b5 + + FADD c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO - 7 * SIZE], b2 + + FADD c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO - 6 * SIZE], b3 + + FADD c15, t4, c15 + nop + FMUL a4, b4, t4 + LDF [BO - 5 * SIZE], b4 + + FADD c04, t1, c04 + nop + FMUL a1, b1, t1 + LDF [AO - 5 * SIZE], a4 + + FADD c08, t2, c08 + nop + FMUL a1, b2, t2 + nop + + FADD c12, t3, c12 + nop + FMUL a1, b3, t3 + nop + + FADD c16, t4, c16 + nop + FMUL a1, b4, t4 + LDF [AO - 0 * SIZE], a1 + + FADD c01, t1, c01 + nop + FMUL a2, b1, t1 + nop + +#ifdef DOUBLE + prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY +#else + nop +#endif + FADD c05, t2, c05 + nop + FMUL a2, b2, t2 + + FADD c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD c13, t4, c13 + nop + FMUL a2, b4, t4 + nop + + FADD c02, t1, c02 + nop + FMUL a3, b1, t1 + LDF [AO - 3 * SIZE], a2 + + FADD c06, t2, c06 +#ifdef DOUBLE + prefetch [BO + (BPREFETCHSIZE + 
8) * SIZE], BPREFETCH_CATEGORY +#else + nop +#endif + FMUL a3, b2, t2 + nop + + FADD c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO - 2 * SIZE], a3 + + FADD c03, t1, c03 + nop + FMUL a4, b1, t1 + LDF [BO - 0 * SIZE], b1 + + FADD c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO - 3 * SIZE], b2 + + FADD c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO - 2 * SIZE], b3 + + FADD c15, t4, c15 + nop + FMUL a4, b4, t4 + LDF [BO - 1 * SIZE], b4 + + FADD c04, t1, c04 + nop + FMUL a5, b5, t1 + LDF [AO - 1 * SIZE], a4 + + FADD c08, t2, c08 + FMUL a5, b2, t2 + FADD c12, t3, c12 + FMUL a5, b3, t3 + + FADD c16, t4, c16 + nop + FMUL a5, b4, t4 + LDF [AO + 4 * SIZE], a5 + + FADD c01, t1, c01 + nop + FMUL a2, b5, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD c13, t4, c13 + nop + FMUL a2, b4, t4 + LDF [AO + 1 * SIZE], a2 + + FADD c02, t1, c02 + nop + FMUL a3, b5, t1 + nop + + FADD c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO + 2 * SIZE], a3 + + FADD c03, t1, c03 + cmp L, 0 + FMUL a4, b5, t1 + LDF [BO + 4 * SIZE], b5 + + FADD c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD c15, t4, c15 + FMUL a4, b4, t4 + bg,pt %icc, .LL22 + LDF [BO + 3 * SIZE], b4 + +.LL25: +#ifndef TRMMKERNEL + and K, 3, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 4, L +#else + add KK, 4, L +#endif + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL29 + nop + +.LL26: + FADD c04, t1, c04 + LDF [AO + 3 * SIZE], a4 + FMUL a1, b1, t1 + add AO, 4 * SIZE, AO + + FADD c08, t2, c08 + add BO, 4 * SIZE, BO + FMUL a1, b2, t2 + add L, -1, L + + FADD c12, t3, c12 + nop + FMUL a1, b3, t3 + cmp L, 0 + + FADD c16, t4, c16 + nop + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + FADD c01, t1, c01 + nop + FMUL a2, b1, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD c13, t4, c13 + nop + FMUL a2, b4, t4 + LDF [AO + 1 * SIZE], a2 + + FADD c02, t1, c02 + nop + FMUL a3, b1, t1 + nop + + FADD c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO + 2 * SIZE], a3 + + FADD c03, t1, c03 + nop + FMUL a4, b1, t1 + LDF [BO + 0 * SIZE], b1 + + FADD c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD c15, t4, c15 + FMUL a4, b4, t4 + bg,pt %icc, .LL26 + LDF [BO + 3 * SIZE], b4 + +.LL29: +#ifndef TRMMKERNEL + FADD c04, t1, c04 + add I, -1, I + FMUL c01, ALPHA, c01 + LDF [C1 + 0 * SIZE], a1 + + FADD c08, t2, c08 + cmp I, 0 + FMUL c02, ALPHA, c02 + LDF [C1 + 1 * SIZE], a2 + + FADD c12, t3, c12 + nop + FMUL c03, ALPHA, c03 + LDF [C1 + 2 * SIZE], a3 + + FADD c16, t4, c16 + nop + FMUL c04, ALPHA, c04 + LDF [C1 + 3 * SIZE], a4 + + FMUL c05, ALPHA, c05 + LDF [C2 + 0 * SIZE], b1 + FMUL c06, ALPHA, c06 + LDF [C2 + 1 * SIZE], b2 + + FMUL c07, ALPHA, c07 + LDF [C2 + 2 * SIZE], b3 + FMUL c08, ALPHA, c08 + LDF [C2 + 3 * SIZE], b4 + + FMUL c09, ALPHA, c09 + LDF [C3 + 0 * SIZE], t1 + FMUL c10, ALPHA, c10 + LDF [C3 + 1 * SIZE], t2 + + FMUL c11, ALPHA, c11 + LDF [C3 + 2 * SIZE], t3 + FMUL c12, ALPHA, c12 + LDF [C3 + 3 * 
SIZE], t4 + + FMUL c13, ALPHA, c13 + add C1, 4 * SIZE, C1 + FADD c01, a1, c01 + LDF [C4 + 0 * SIZE], a1 + + FMUL c14, ALPHA, c14 + add C2, 4 * SIZE, C2 + FADD c02, a2, c02 + LDF [C4 + 1 * SIZE], a2 + + FMUL c15, ALPHA, c15 + add C3, 4 * SIZE, C3 + FADD c03, a3, c03 + LDF [C4 + 2 * SIZE], a3 + + FMUL c16, ALPHA, c16 + nop + FADD c04, a4, c04 + LDF [C4 + 3 * SIZE], a4 + + STF c01, [C1 - 4 * SIZE] + FADD c05, b1, c05 + STF c02, [C1 - 3 * SIZE] + FADD c06, b2, c06 + + STF c03, [C1 - 2 * SIZE] + FADD c07, b3, c07 + STF c04, [C1 - 1 * SIZE] + FADD c08, b4, c08 + + STF c05, [C2 - 4 * SIZE] + FADD c09, t1, c09 + STF c06, [C2 - 3 * SIZE] + FADD c10, t2, c10 + + STF c07, [C2 - 2 * SIZE] + FADD c11, t3, c11 + STF c08, [C2 - 1 * SIZE] + FADD c12, t4, c12 + + STF c09, [C3 - 4 * SIZE] + FADD c13, a1, c13 + STF c10, [C3 - 3 * SIZE] + FADD c14, a2, c14 + + STF c11, [C3 - 2 * SIZE] + FADD c15, a3, c15 + STF c12, [C3 - 1 * SIZE] + FADD c16, a4, c16 + + STF c13, [C4 + 0 * SIZE] + FMOV FZERO, t1 + STF c14, [C4 + 1 * SIZE] + FMOV FZERO, t2 + + STF c15, [C4 + 2 * SIZE] + FMOV FZERO, t3 + STF c16, [C4 + 3 * SIZE] + FMOV FZERO, t4 + + add C4, 4 * SIZE, C4 +#else + + FADD c04, t1, c04 + FMUL c01, ALPHA, c01 + FADD c08, t2, c08 + FMUL c02, ALPHA, c02 + FADD c12, t3, c12 + FMUL c03, ALPHA, c03 + FADD c16, t4, c16 + FMUL c04, ALPHA, c04 + + STF c01, [C1 + 0 * SIZE] + FMUL c05, ALPHA, c05 + STF c02, [C1 + 1 * SIZE] + FMUL c06, ALPHA, c06 + STF c03, [C1 + 2 * SIZE] + FMUL c07, ALPHA, c07 + STF c04, [C1 + 3 * SIZE] + FMUL c08, ALPHA, c08 + + STF c05, [C2 + 0 * SIZE] + FMUL c09, ALPHA, c09 + STF c06, [C2 + 1 * SIZE] + FMUL c10, ALPHA, c10 + STF c07, [C2 + 2 * SIZE] + FMUL c11, ALPHA, c11 + STF c08, [C2 + 3 * SIZE] + FMUL c12, ALPHA, c12 + + STF c09, [C3 + 0 * SIZE] + FMUL c13, ALPHA, c13 + STF c10, [C3 + 1 * SIZE] + FMUL c14, ALPHA, c14 + STF c11, [C3 + 2 * SIZE] + FMUL c15, ALPHA, c15 + STF c12, [C3 + 3 * SIZE] + FMUL c16, ALPHA, c16 + + STF c13, [C4 + 0 * SIZE] + STF c14, [C4 + 1 * SIZE] + STF c15, [C4 + 2 * SIZE] + STF c16, [C4 + 3 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + + add C1, 4 * SIZE, C1 + add C2, 4 * SIZE, C2 + add C3, 4 * SIZE, C3 + add C4, 4 * SIZE, C4 + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub K, KK, TEMP1 +#ifdef LEFT + add TEMP1, -4, TEMP1 +#else + add TEMP1, -4, TEMP1 +#endif + sll TEMP1, 2 + BASE_SHIFT, TEMP1 + + add AO, TEMP1, AO + add BO, TEMP1, BO +#endif + +#ifdef LEFT + add KK, 4, KK +#endif + + add I, -1, I + cmp I, 0 + +#endif + + sra K, 2, L + bg,pt %icc, .LL21 + FMOV FZERO, c01 + +.LL50: + and M, 2, I + FMOV FZERO, c02 + cmp I, 0 + + FMOV FZERO, t1 + ble,pn %icc, .LL70 + FMOV FZERO, c04 + +#if !defined(TRMMKERNEL) + LDF [AO + 0 * SIZE], a1 + sra K, 2, L + FMOV FZERO, t2 + LDF [B + 0 * SIZE], b1 + mov B, BO + FMOV FZERO, c06 + LDF [AO + 1 * SIZE], a2 + cmp L, 0 + FMOV FZERO, t3 + LDF [B + 1 * SIZE], b2 + FMOV FZERO, c08 + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, t4 + LDF [B + 2 * SIZE], b3 + FMOV FZERO, c01 + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c03 + LDF [B + 3 * SIZE], b4 + FMOV FZERO, c05 +#else + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov B, BO +#else + sll KK, 1 + BASE_SHIFT, TEMP1 + sll KK, 2 + BASE_SHIFT, TEMP2 + + add AO, TEMP1, AO + add B, TEMP2, BO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 2, L +#else + add KK, 4, L +#endif + sra L, 2, L + cmp L, 0 + + LDF [AO + 0 * 
SIZE], a1 + FMOV FZERO, t2 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, c06 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, t3 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, c08 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, t4 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, c01 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c03 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, c05 + +#endif + ble,pn %icc, .LL55 + FMOV FZERO, c07 + +.LL52: + FADD c02, t1, c02 + add AO, 8 * SIZE, AO + prefetch [AO + APREFETCHSIZE * SIZE], 0 + + FMUL a1, b1, t1 + add BO, 16 * SIZE, BO + + FADD c04, t2, c04 + add L, -1, L + FMUL a1, b2, t2 + + FADD c06, t3, c06 + cmp L, 0 + FMUL a1, b3, t3 + + FADD c08, t4, c08 + FMUL a1, b4, t4 + LDF [AO - 4 * SIZE], a1 + + FADD c01, t1, c01 + FMUL a2, b1, t1 + LDF [BO - 12 * SIZE], b1 + FADD c03, t2, c03 + FMUL a2, b2, t2 + LDF [BO - 11 * SIZE], b2 + + FADD c05, t3, c05 + FMUL a2, b3, t3 + LDF [BO - 10 * SIZE], b3 + FADD c07, t4, c07 + FMUL a2, b4, t4 + LDF [BO - 9 * SIZE], b4 + + FADD c02, t1, c02 + FMUL a3, b1, t1 + LDF [AO - 3 * SIZE], a2 + FADD c04, t2, c04 + FMUL a3, b2, t2 + + FADD c06, t3, c06 + FMUL a3, b3, t3 + FADD c08, t4, c08 + FMUL a3, b4, t4 + LDF [AO - 2 * SIZE], a3 + + FADD c01, t1, c01 + FMUL a4, b1, t1 + LDF [BO - 8 * SIZE], b1 + FADD c03, t2, c03 + FMUL a4, b2, t2 + LDF [BO - 7 * SIZE], b2 + + FADD c05, t3, c05 + FMUL a4, b3, t3 + LDF [BO - 6 * SIZE], b3 + FADD c07, t4, c07 + FMUL a4, b4, t4 + LDF [BO - 5 * SIZE], b4 + + FADD c02, t1, c02 + FMUL a1, b1, t1 + LDF [AO - 1 * SIZE], a4 + FADD c04, t2, c04 + FMUL a1, b2, t2 + + FADD c06, t3, c06 + FMUL a1, b3, t3 + FADD c08, t4, c08 + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + FADD c01, t1, c01 + FMUL a2, b1, t1 + LDF [BO - 4 * SIZE], b1 + + FADD c03, t2, c03 + FMUL a2, b2, t2 + LDF [BO - 3 * SIZE], b2 + + FADD c05, t3, c05 + FMUL a2, b3, t3 + LDF [BO - 2 * SIZE], b3 + FADD c07, t4, c07 + FMUL a2, b4, t4 + LDF [BO - 1 * SIZE], b4 + + FADD c02, t1, c02 + FMUL a3, b1, t1 + LDF [AO + 1 * SIZE], a2 + FADD c04, t2, c04 + FMUL a3, b2, t2 + + FADD c06, t3, c06 + FMUL a3, b3, t3 + FADD c08, t4, c08 + FMUL a3, b4, t4 + LDF [AO + 2 * SIZE], a3 + + FADD c01, t1, c01 + FMUL a4, b1, t1 + LDF [BO + 0 * SIZE], b1 + FADD c03, t2, c03 + FMUL a4, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD c05, t3, c05 + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + FADD c07, t4, c07 + FMUL a4, b4, t4 + LDF [BO + 3 * SIZE], b4 + + bg,pt %icc, .LL52 + LDF [AO + 3 * SIZE], a4 + +.LL55: +#ifndef TRMMKERNEL + and K, 3, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 2, L +#else + add KK, 4, L +#endif + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + FADD c02, t1, c02 + add AO, 2 * SIZE, AO + FMUL a1, b1, t1 + add L, -1, L + + add BO, 4 * SIZE, BO + FADD c04, t2, c04 + cmp L, 0 + FMUL a1, b2, t2 + + FADD c06, t3, c06 + FMUL a1, b3, t3 + FADD c08, t4, c08 + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + FADD c01, t1, c01 + FMUL a2, b1, t1 + LDF [BO + 0 * SIZE], b1 + FADD c03, t2, c03 + FMUL a2, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD c05, t3, c05 + FMUL a2, b3, t3 + LDF [BO + 2 * SIZE], b3 + FADD c07, t4, c07 + FMUL a2, b4, t4 + LDF [BO + 3 * SIZE], b4 + + bg,pt %icc, .LL56 + LDF [AO + 1 * SIZE], a2 + +.LL59: +#ifndef TRMMKERNEL + FADD c02, t1, c02 + FMUL c01, ALPHA, c01 + LDF [C1 + 0 * SIZE], a1 + FADD c04, t2, c04 + FMUL c03, ALPHA, c03 + LDF [C1 + 1 * SIZE], a2 + FADD c06, t3, c06 + FMUL c05, ALPHA, c05 + LDF [C2 + 0 * SIZE], a3 + FADD c08, t4, c08 + FMUL c07, ALPHA, c07 + LDF [C2 + 
1 * SIZE], a4 + + FMUL c02, ALPHA, c02 + FADD c01, a1, c01 + LDF [C3 + 0 * SIZE], b1 + + FMUL c04, ALPHA, c04 + FADD c02, a2, c02 + LDF [C3 + 1 * SIZE], b2 + + FMUL c06, ALPHA, c06 + FADD c03, a3, c03 + LDF [C4 + 0 * SIZE], b3 + + FMUL c08, ALPHA, c08 + FADD c04, a4, c04 + LDF [C4 + 1 * SIZE], b4 + + STF c01, [C1 + 0 * SIZE] + FADD c05, b1, c05 + STF c02, [C1 + 1 * SIZE] + FADD c06, b2, c06 + add C1, 2 * SIZE, C1 + + STF c03, [C2 + 0 * SIZE] + FADD c07, b3, c07 + STF c04, [C2 + 1 * SIZE] + FADD c08, b4, c08 + add C2, 2 * SIZE, C2 + + STF c05, [C3 + 0 * SIZE] + STF c06, [C3 + 1 * SIZE] + add C3, 2 * SIZE, C3 + + STF c07, [C4 + 0 * SIZE] + STF c08, [C4 + 1 * SIZE] + add C4, 2 * SIZE, C4 +#else + + FADD c02, t1, c02 + FADD c04, t2, c04 + FADD c06, t3, c06 + FADD c08, t4, c08 + + FMUL c01, ALPHA, c01 + FMUL c03, ALPHA, c03 + FMUL c05, ALPHA, c05 + FMUL c07, ALPHA, c07 + + FMUL c02, ALPHA, c02 + FMUL c04, ALPHA, c04 + FMUL c06, ALPHA, c06 + FMUL c08, ALPHA, c08 + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + + STF c03, [C2 + 0 * SIZE] + STF c04, [C2 + 1 * SIZE] + + STF c05, [C3 + 0 * SIZE] + STF c06, [C3 + 1 * SIZE] + + STF c07, [C4 + 0 * SIZE] + STF c08, [C4 + 1 * SIZE] + + add C1, 2 * SIZE, C1 + add C2, 2 * SIZE, C2 + add C3, 2 * SIZE, C3 + add C4, 2 * SIZE, C4 + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub K, KK, TEMP1 +#ifdef LEFT + add TEMP1, -2, TEMP1 +#else + add TEMP1, -4, TEMP1 +#endif + sll TEMP1, 1 + BASE_SHIFT, TEMP2 + sll TEMP1, 2 + BASE_SHIFT, TEMP1 + + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LEFT + add KK, 2, KK +#endif +#endif + +.LL70: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL99 + nop + +.LL71: +#if !defined(TRMMKERNEL) + LDF [AO + 0 * SIZE], a1 + sra K, 2, L + FMOV FZERO, c01 + LDF [B + 0 * SIZE], b1 + mov B, BO + FMOV FZERO, t1 + LDF [AO + 1 * SIZE], a2 + cmp L, 0 + FMOV FZERO, c02 + LDF [B + 1 * SIZE], b2 + FMOV FZERO, t2 + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + LDF [B + 2 * SIZE], b3 + FMOV FZERO, t3 + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [B + 3 * SIZE], b4 + FMOV FZERO, t4 +#else + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov B, BO +#else + sll KK, 0 + BASE_SHIFT, TEMP1 + sll KK, 2 + BASE_SHIFT, TEMP2 + + add AO, TEMP1, AO + add B, TEMP2, BO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 1, L +#else + add KK, 4, L +#endif + sra L, 2, L + cmp L, 0 + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c01 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t1 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c02 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t2 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, t3 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, t4 +#endif + + ble,pn %icc, .LL75 + nop + +.LL72: + FADD c01, t1, c01 + add L, -1, L + FMUL a1, b1, t1 + LDF [BO + 4 * SIZE], b1 + + FADD c02, t2, c02 + cmp L, 0 + FMUL a1, b2, t2 + LDF [BO + 5 * SIZE], b2 + + FADD c03, t3, c03 + FMUL a1, b3, t3 + LDF [BO + 6 * SIZE], b3 + + FADD c04, t4, c04 + FMUL a1, b4, t4 + LDF [BO + 7 * SIZE], b4 + LDF [AO + 4 * SIZE], a1 + + FADD c01, t1, c01 + add AO, 4 * SIZE, AO + FMUL a2, b1, t1 + LDF [BO + 8 * SIZE], b1 + + FADD c02, t2, c02 + FMUL a2, b2, t2 + LDF [BO + 9 * SIZE], b2 + + FADD c03, t3, c03 + FMUL a2, b3, t3 + LDF [BO + 10 * SIZE], b3 + + FADD c04, t4, c04 + FMUL a2, b4, t4 + LDF [BO + 11 * SIZE], b4 + LDF [AO + 1 * SIZE], 
a2 + + FADD c01, t1, c01 + FMUL a3, b1, t1 + LDF [BO + 12 * SIZE], b1 + + FADD c02, t2, c02 + FMUL a3, b2, t2 + LDF [BO + 13 * SIZE], b2 + + FADD c03, t3, c03 + FMUL a3, b3, t3 + LDF [BO + 14 * SIZE], b3 + + FADD c04, t4, c04 + FMUL a3, b4, t4 + LDF [BO + 15 * SIZE], b4 + LDF [AO + 2 * SIZE], a3 + + FADD c01, t1, c01 + FMUL a4, b1, t1 + LDF [BO + 16 * SIZE], b1 + + FADD c02, t2, c02 + FMUL a4, b2, t2 + LDF [BO + 17 * SIZE], b2 + + FADD c03, t3, c03 + FMUL a4, b3, t3 + LDF [BO + 18 * SIZE], b3 + + FADD c04, t4, c04 + FMUL a4, b4, t4 + LDF [BO + 19 * SIZE], b4 + + add BO, 16 * SIZE, BO + bg,pt %icc, .LL72 + LDF [AO + 3 * SIZE], a4 + +.LL75: +#ifndef TRMMKERNEL + and K, 3, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 1, L +#else + add KK, 4, L +#endif + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL79 + nop + +.LL76: + FADD c01, t1, c01 + add AO, 1 * SIZE, AO + FMUL a1, b1, t1 + LDF [BO + 4 * SIZE], b1 + + FADD c02, t2, c02 + add L, -1, L + FMUL a1, b2, t2 + LDF [BO + 5 * SIZE], b2 + + FADD c03, t3, c03 + cmp L, 0 + FMUL a1, b3, t3 + LDF [BO + 6 * SIZE], b3 + + FADD c04, t4, c04 + add BO, 4 * SIZE, BO + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + bg,pt %icc, .LL76 + LDF [BO + 3 * SIZE], b4 + + +.LL79: +#ifndef TRMMKERNEL + FADD c01, t1, c01 + LDF [C1 + 0 * SIZE], a1 + FADD c02, t2, c02 + LDF [C2 + 0 * SIZE], a2 + FADD c03, t3, c03 + LDF [C3 + 0 * SIZE], a3 + FADD c04, t4, c04 + LDF [C4 + 0 * SIZE], a4 + + FMUL c01, ALPHA, c01 + FMUL c02, ALPHA, c02 + FMUL c03, ALPHA, c03 + FMUL c04, ALPHA, c04 + + FADD c01, a1, c01 + FADD c02, a2, c02 + FADD c03, a3, c03 + FADD c04, a4, c04 + + STF c01, [C1 + 0 * SIZE] + STF c02, [C2 + 0 * SIZE] + STF c03, [C3 + 0 * SIZE] + STF c04, [C4 + 0 * SIZE] +#else + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c03, t3, c03 + FADD c04, t4, c04 + + FMUL c01, ALPHA, c01 + FMUL c02, ALPHA, c02 + FMUL c03, ALPHA, c03 + FMUL c04, ALPHA, c04 + + STF c01, [C1 + 0 * SIZE] + STF c02, [C2 + 0 * SIZE] + STF c03, [C3 + 0 * SIZE] + STF c04, [C4 + 0 * SIZE] + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub K, KK, TEMP1 +#ifdef LEFT + add TEMP1, -1, TEMP1 +#else + add TEMP1, -4, TEMP1 +#endif + sll TEMP1, 0 + BASE_SHIFT, TEMP2 + sll TEMP1, 2 + BASE_SHIFT, TEMP1 + + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LEFT + add KK, 1, KK +#endif +#endif + +.LL99: + add J, -1, J + mov BO, B + cmp J, 0 + bg,pt %icc, .LL11 +#if defined(TRMMKERNEL) && !defined(LEFT) + add KK, 4, KK +#else + nop +#endif + +.LL100: /* n & 2 */ + sra M, 2, I + and N, 2, J + + cmp J, 0 + add C, LDC, C2 + ble,pn %icc, .LL200 + mov A, AO + +#if defined(TRMMKERNEL) && defined(LEFT) + mov OFFSET, KK +#endif + + mov C, C1 + add C2, LDC, C + + cmp I, 0 + ble,pn %icc, .LL150 + FMOV FZERO, c03 + +.LL121: +#if !defined(TRMMKERNEL) + LDF [AO + 0 * SIZE], a1 + sra K, 2, L + FMOV FZERO, t1 + LDF [B + 0 * SIZE], b1 + mov B, BO + FMOV FZERO, c07 + + LDF [AO + 1 * SIZE], a2 + cmp L, 0 + FMOV FZERO, t2 + LDF [B + 1 * SIZE], b2 + FMOV FZERO, c04 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, t3 + LDF [B + 2 * SIZE], b3 + FMOV FZERO, c08 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, t4 + LDF [B + 3 * SIZE], b4 + FMOV FZERO, c01 + + prefetch [C1 + 3 * SIZE], 2 + FMOV FZERO, c05 + prefetch [C2 + 3 * SIZE], 2 + FMOV FZERO, c02 +#else + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov B, BO +#else + sll KK, 2 + BASE_SHIFT, TEMP1 + sll KK, 1 + BASE_SHIFT, TEMP2 + 
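+ /* TRMM case: TEMP1 = KK*4 and TEMP2 = KK*2 elements (scaled to bytes by
+    BASE_SHIFT); the adds below advance the A and B panel pointers past the
+    first KK entries before this 4x2 tile is computed. */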
+ add AO, TEMP1, AO + add B, TEMP2, BO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 4, L +#else + add KK, 2, L +#endif + sra L, 2, L + cmp L, 0 + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, t1 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, c07 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, t2 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, c04 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, t3 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, c08 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, t4 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, c01 + + prefetch [C1 + 3 * SIZE], 2 + FMOV FZERO, c05 + prefetch [C2 + 3 * SIZE], 2 + FMOV FZERO, c02 +#endif + + ble,pn %icc, .LL125 + FMOV FZERO, c06 + +.LL122: + FADD c03, t1, c03 + add L, -1, L + FMUL a1, b1, t1 + prefetch [AO + APREFETCHSIZE * SIZE], 0 + + FADD c07, t2, c07 + add BO, 8 * SIZE, BO + FMUL a1, b2, t2 + LDF [AO + 4 * SIZE], a1 + + FADD c04, t3, c04 + add AO, 16 * SIZE, AO + FMUL a2, b1, t3 + cmp L, 0 + + FADD c08, t4, c08 + nop + FMUL a2, b2, t4 + LDF [AO - 11 * SIZE], a2 + + FADD c01, t1, c01 + nop + FMUL a3, b1, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a3, b2, t2 + LDF [AO - 10 * SIZE], a3 + + FADD c02, t3, c02 + nop + FMUL a4, b1, t3 + LDF [BO - 4 * SIZE], b1 + + FADD c06, t4, c06 + nop + FMUL a4, b2, t4 + LDF [BO - 3 * SIZE], b2 + + FADD c03, t1, c03 + nop + FMUL a1, b3, t1 + LDF [AO - 9 * SIZE], a4 + + FADD c07, t2, c07 + nop + FMUL a1, b4, t2 + LDF [AO - 8 * SIZE], a1 + + FADD c04, t3, c04 + nop + FMUL a2, b3, t3 + nop + + FADD c08, t4, c08 + nop + FMUL a2, b4, t4 + LDF [AO - 7 * SIZE], a2 + + FADD c01, t1, c01 + nop + FMUL a3, b3, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a3, b4, t2 + LDF [AO - 6 * SIZE], a3 + + FADD c02, t3, c02 + nop + FMUL a4, b3, t3 + LDF [BO - 2 * SIZE], b3 + + FADD c06, t4, c06 + nop + FMUL a4, b4, t4 + LDF [BO - 1 * SIZE], b4 + + FADD c03, t1, c03 + nop + FMUL a1, b1, t1 + LDF [AO - 5 * SIZE], a4 + + FADD c07, t2, c07 + nop + FMUL a1, b2, t2 + LDF [AO - 4 * SIZE], a1 + + FADD c04, t3, c04 + nop + FMUL a2, b1, t3 + nop + + FADD c08, t4, c08 + nop + FMUL a2, b2, t4 + LDF [AO - 3 * SIZE], a2 + + FADD c01, t1, c01 + nop + FMUL a3, b1, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a3, b2, t2 + LDF [AO - 2 * SIZE], a3 + + FADD c02, t3, c02 + nop + FMUL a4, b1, t3 + LDF [BO + 0 * SIZE], b1 + + FADD c06, t4, c06 + nop + FMUL a4, b2, t4 + LDF [BO + 1 * SIZE], b2 + + FADD c03, t1, c03 + nop + FMUL a1, b3, t1 + LDF [AO - 1 * SIZE], a4 + + FADD c07, t2, c07 + nop + FMUL a1, b4, t2 + LDF [AO + 0 * SIZE], a1 + + FADD c04, t3, c04 + nop + FMUL a2, b3, t3 + nop + + FADD c08, t4, c08 + nop + FMUL a2, b4, t4 + LDF [AO + 1 * SIZE], a2 + + FADD c01, t1, c01 + nop + FMUL a3, b3, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a3, b4, t2 + LDF [AO + 2 * SIZE], a3 + + FADD c02, t3, c02 + nop + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD c06, t4, c06 + FMUL a4, b4, t4 + LDF [AO + 3 * SIZE], a4 + + bg,pt %icc, .LL122 + LDF [BO + 3 * SIZE], b4 + +.LL125: +#ifndef TRMMKERNEL + and K, 3, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 4, L +#else + add KK, 2, L +#endif + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL129 + nop + +.LL126: + FADD c03, t1, c03 + add AO, 4 * SIZE, AO + FMUL a1, b1, t1 + add BO, 2 * SIZE, BO + + FADD c07, t2, c07 + add L, -1, L + FMUL a1, b2, t2 + LDF [AO + 0 * SIZE], a1 + + FADD c04, t3, c04 + cmp L, 0 + FMUL a2, b1, t3 + + FADD c08, t4, c08 + FMUL a2, b2, t4 + LDF [AO 
+ 1 * SIZE], a2 + + FADD c01, t1, c01 + FMUL a3, b1, t1 + FADD c05, t2, c05 + FMUL a3, b2, t2 + LDF [AO + 2 * SIZE], a3 + + FADD c02, t3, c02 + FMUL a4, b1, t3 + LDF [BO + 0 * SIZE], b1 + FADD c06, t4, c06 + FMUL a4, b2, t4 + LDF [BO + 1 * SIZE], b2 + bg,pt %icc, .LL126 + LDF [AO + 3 * SIZE], a4 + +.LL129: +#ifndef TRMMKERNEL + FADD c03, t1, c03 + add I, -1, I + LDF [C1 + 0 * SIZE], a1 + FADD c07, t2, c07 + cmp I, 0 + LDF [C1 + 1 * SIZE], a2 + FADD c04, t3, c04 + LDF [C1 + 2 * SIZE], a3 + FADD c08, t4, c08 + LDF [C1 + 3 * SIZE], a4 + + LDF [C2 + 0 * SIZE], b1 + FMUL c01, ALPHA, c01 + LDF [C2 + 1 * SIZE], b2 + FMUL c02, ALPHA, c02 + LDF [C2 + 2 * SIZE], b3 + FMUL c03, ALPHA, c03 + LDF [C2 + 3 * SIZE], b4 + FMUL c04, ALPHA, c04 + + FMUL c05, ALPHA, c05 + FADD c01, a1, c01 + FMUL c06, ALPHA, c06 + FADD c02, a2, c02 + FMUL c07, ALPHA, c07 + FADD c03, a3, c03 + FMUL c08, ALPHA, c08 + FADD c04, a4, c04 + + STF c01, [C1 + 0 * SIZE] + FADD c05, b1, c05 + STF c02, [C1 + 1 * SIZE] + FADD c06, b2, c06 + STF c03, [C1 + 2 * SIZE] + FADD c07, b3, c07 + STF c04, [C1 + 3 * SIZE] + add C1, 4 * SIZE, C1 + FADD c08, b4, c08 + + STF c05, [C2 + 0 * SIZE] + STF c06, [C2 + 1 * SIZE] + STF c07, [C2 + 2 * SIZE] + STF c08, [C2 + 3 * SIZE] + add C2, 4 * SIZE, C2 +#else + FADD c03, t1, c03 + FADD c07, t2, c07 + FADD c04, t3, c04 + FADD c08, t4, c08 + + FMUL c01, ALPHA, c01 + FMUL c02, ALPHA, c02 + FMUL c03, ALPHA, c03 + FMUL c04, ALPHA, c04 + + FMUL c05, ALPHA, c05 + FMUL c06, ALPHA, c06 + FMUL c07, ALPHA, c07 + FMUL c08, ALPHA, c08 + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C1 + 2 * SIZE] + STF c04, [C1 + 3 * SIZE] + + STF c05, [C2 + 0 * SIZE] + STF c06, [C2 + 1 * SIZE] + STF c07, [C2 + 2 * SIZE] + STF c08, [C2 + 3 * SIZE] + add C1, 4 * SIZE, C1 + add C2, 4 * SIZE, C2 + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub K, KK, TEMP1 +#ifdef LEFT + add TEMP1, -4, TEMP1 +#else + add TEMP1, -2, TEMP1 +#endif + sll TEMP1, 2 + BASE_SHIFT, TEMP2 + sll TEMP1, 1 + BASE_SHIFT, TEMP1 + + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LEFT + add KK, 4, KK +#endif + + add I, -1, I + cmp I, 0 +#endif + + bg,pt %icc, .LL121 + FMOV FZERO, c03 + +.LL150: + and M, 2, I + cmp I, 0 + ble,pn %icc, .LL170 + nop + +.LL151: +#if !defined(TRMMKERNEL) + LDF [AO + 0 * SIZE], a1 + sra K, 2, L + FMOV FZERO, c01 + + LDF [B + 0 * SIZE], b1 + mov B, BO + FMOV FZERO, t1 + + LDF [AO + 1 * SIZE], a2 + cmp L, 0 + FMOV FZERO, c02 + LDF [B + 1 * SIZE], b2 + FMOV FZERO, t2 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + LDF [B + 2 * SIZE], b3 + FMOV FZERO, t3 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [B + 3 * SIZE], b4 + FMOV FZERO, t4 +#else + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov B, BO +#else + sll KK, 1 + BASE_SHIFT, TEMP1 + sll KK, 1 + BASE_SHIFT, TEMP2 + + add AO, TEMP1, AO + add B, TEMP2, BO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 2, L +#else + add KK, 2, L +#endif + sra L, 2, L + cmp L, 0 + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c01 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t1 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c02 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t2 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, t3 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, t4 +#endif + + ble,pn %icc, .LL155 + nop + +.LL152: + FADD c01, t1, c01 + add L, -1, L + 
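+ /* .LL152: K loop unrolled by four for this 2x2 tile.  Each pass folds the
+    previous FMUL results t1..t4 into c01..c04 while the operands for the next
+    step are already being loaded; L counts the remaining groups of four. */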
FMUL a1, b1, t1 + prefetch [AO + APREFETCHSIZE * SIZE], 0 + + FADD c02, t2, c02 + add BO, 8 * SIZE, BO + FMUL a1, b2, t2 + LDF [AO + 4 * SIZE], a1 + + FADD c03, t3, c03 + cmp L, 0 + FMUL a2, b1, t3 + LDF [BO - 4 * SIZE], b1 + + FADD c04, t4, c04 + nop + FMUL a2, b2, t4 + LDF [AO + 5 * SIZE], a2 + + FADD c01, t1, c01 + nop + FMUL a3, b3, t1 + LDF [BO - 3 * SIZE], b2 + + FADD c02, t2, c02 + nop + FMUL a3, b4, t2 + LDF [AO + 6 * SIZE], a3 + + FADD c03, t3, c03 + nop + FMUL a4, b3, t3 + LDF [BO - 2 * SIZE], b3 + + FADD c04, t4, c04 + nop + FMUL a4, b4, t4 + LDF [AO + 7 * SIZE], a4 + + FADD c01, t1, c01 + nop + FMUL a1, b1, t1 + LDF [BO - 1 * SIZE], b4 + + FADD c02, t2, c02 + FMUL a1, b2, t2 + LDF [AO + 8 * SIZE], a1 + + FADD c03, t3, c03 + FMUL a2, b1, t3 + LDF [BO + 0 * SIZE], b1 + + FADD c04, t4, c04 + FMUL a2, b2, t4 + LDF [AO + 9 * SIZE], a2 + + FADD c01, t1, c01 + FMUL a3, b3, t1 + LDF [BO + 1 * SIZE], b2 + + FADD c02, t2, c02 + FMUL a3, b4, t2 + LDF [AO + 10 * SIZE], a3 + + FADD c03, t3, c03 + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD c04, t4, c04 + FMUL a4, b4, t4 + LDF [AO + 11 * SIZE], a4 + + add AO, 8 * SIZE, AO + bg,pt %icc, .LL152 + LDF [BO + 3 * SIZE], b4 + +.LL155: +#ifndef TRMMKERNEL + and K, 3, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 2, L +#else + add KK, 2, L +#endif + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL159 + nop + +.LL156: + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c03, t3, c03 + FADD c04, t4, c04 + + FMUL a1, b1, t1 + FMUL a1, b2, t2 + FMUL a2, b1, t3 + FMUL a2, b2, t4 + + add AO, 2 * SIZE, AO + add BO, 2 * SIZE, BO + + add L, -1, L + cmp L, 0 + bg,pt %icc, .LL156 + nop + +.LL159: +#ifndef TRMMKERNEL + LDF [C1 + 0 * SIZE], a1 + LDF [C2 + 0 * SIZE], a2 + LDF [C1 + 1 * SIZE], a3 + LDF [C2 + 1 * SIZE], a4 + + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c03, t3, c03 + FADD c04, t4, c04 + + FMUL c01, ALPHA, c01 + FMUL c02, ALPHA, c02 + FMUL c03, ALPHA, c03 + FMUL c04, ALPHA, c04 + + FADD c01, a1, c01 + FADD c02, a2, c02 + FADD c03, a3, c03 + FADD c04, a4, c04 + + STF c01, [C1 + 0 * SIZE] + STF c02, [C2 + 0 * SIZE] + STF c03, [C1 + 1 * SIZE] + add C1, 2 * SIZE, C1 + STF c04, [C2 + 1 * SIZE] + add C2, 2 * SIZE, C2 +#else + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c03, t3, c03 + FADD c04, t4, c04 + + FMUL c01, ALPHA, c01 + FMUL c02, ALPHA, c02 + FMUL c03, ALPHA, c03 + FMUL c04, ALPHA, c04 + + STF c01, [C1 + 0 * SIZE] + STF c02, [C2 + 0 * SIZE] + STF c03, [C1 + 1 * SIZE] + STF c04, [C2 + 1 * SIZE] + add C1, 2 * SIZE, C1 + add C2, 2 * SIZE, C2 + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub K, KK, TEMP1 +#ifdef LEFT + add TEMP1, -2, TEMP1 +#else + add TEMP1, -2, TEMP1 +#endif + sll TEMP1, 1 + BASE_SHIFT, TEMP2 + sll TEMP1, 1 + BASE_SHIFT, TEMP1 + + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LEFT + add KK, 2, KK +#endif +#endif + +.LL170: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL199 + nop + +.LL171: +#if !defined(TRMMKERNEL) + LDF [AO + 0 * SIZE], a1 + sra K, 2, L + FMOV FZERO, c01 + LDF [B + 0 * SIZE], b1 + mov B, BO + FMOV FZERO, t1 + + LDF [AO + 1 * SIZE], a2 + cmp L, 0 + FMOV FZERO, c02 + LDF [B + 1 * SIZE], b2 + FMOV FZERO, t2 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + + LDF [B + 2 * SIZE], b3 + FMOV FZERO, t3 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [B + 3 * SIZE], b4 + 
FMOV FZERO, t4 +#else +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov B, BO +#else + sll KK, 0 + BASE_SHIFT, TEMP1 + sll KK, 1 + BASE_SHIFT, TEMP2 + + add AO, TEMP1, AO + add B, TEMP2, BO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 1, L +#else + add KK, 2, L +#endif + sra L, 2, L + cmp L, 0 + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c01 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t1 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c02 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t2 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, t3 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, t4 +#endif + + ble,pn %icc, .LL175 + nop + +.LL172: + FADD c01, t1, c01 + add AO, 4 * SIZE, AO + FMUL a1, b1, t1 + LDF [BO + 4 * SIZE], b1 + + FADD c02, t2, c02 + FMUL a1, b2, t2 + LDF [BO + 5 * SIZE], b2 + + add L, -1, L + LDF [AO + 0 * SIZE], a1 + + FADD c03, t3, c03 + cmp L, 0 + FMUL a2, b3, t3 + LDF [BO + 6 * SIZE], b3 + + FADD c04, t4, c04 + FMUL a2, b4, t4 + LDF [BO + 7 * SIZE], b4 + LDF [AO + 1 * SIZE], a2 + + FADD c01, t1, c01 + FMUL a3, b1, t1 + LDF [BO + 8 * SIZE], b1 + + FADD c02, t2, c02 + FMUL a3, b2, t2 + LDF [BO + 9 * SIZE], b2 + LDF [AO + 2 * SIZE], a3 + + FADD c03, t3, c03 + FMUL a4, b3, t3 + LDF [BO + 10 * SIZE], b3 + FADD c04, t4, c04 + FMUL a4, b4, t4 + LDF [BO + 11 * SIZE], b4 + add BO, 8 * SIZE, BO + + bg,pt %icc, .LL172 + LDF [AO + 3 * SIZE], a4 + +.LL175: +#ifndef TRMMKERNEL + and K, 3, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 1, L +#else + add KK, 2, L +#endif + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL179 + nop + +.LL176: + FADD c01, t1, c01 + add L, -1, L + FMUL a1, b1, t1 + add AO, 1 * SIZE, AO + LDF [BO + 2 * SIZE], b1 + FADD c02, t2, c02 + cmp L, 0 + FMUL a1, b2, t2 + LDF [BO + 3 * SIZE], b2 + + add BO, 2 * SIZE, BO + bg,pt %icc, .LL176 + LDF [AO + 0 * SIZE], a1 + +.LL179: +#ifndef TRMMKERNEL + FADD c01, t1, c01 + LDF [C1 + 0 * SIZE], a1 + FADD c02, t2, c02 + LDF [C2 + 0 * SIZE], a2 + FADD c03, t3, c03 + FADD c04, t4, c04 + + FADD c01, c03, c01 + FADD c02, c04, c02 + + FMUL c01, ALPHA, c01 + FMUL c02, ALPHA, c02 + + FADD c01, a1, c01 + FADD c02, a2, c02 + + STF c01, [C1 + 0 * SIZE] + STF c02, [C2 + 0 * SIZE] +#else + + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c03, t3, c03 + FADD c04, t4, c04 + + FADD c01, c03, c01 + FADD c02, c04, c02 + + FMUL c01, ALPHA, c01 + FMUL c02, ALPHA, c02 + + STF c01, [C1 + 0 * SIZE] + STF c02, [C2 + 0 * SIZE] + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub K, KK, TEMP1 +#ifdef LEFT + add TEMP1, -1, TEMP1 +#else + add TEMP1, -2, TEMP1 +#endif + sll TEMP1, 0 + BASE_SHIFT, TEMP2 + sll TEMP1, 1 + BASE_SHIFT, TEMP1 + + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LEFT + add KK, 1, KK +#endif +#endif + +.LL199: + mov BO, B +#if defined(TRMMKERNEL) && !defined(LEFT) + add KK, 2, KK +#else + nop +#endif + +.LL200: + and N, 1, J + sra M, 2, I + + cmp J, 0 + ble,pn %icc, .LL999 + mov A, AO + +#if defined(TRMMKERNEL) && defined(LEFT) + mov OFFSET, KK +#endif + + cmp I, 0 + ble,pn %icc, .LL250 + mov C, C1 + +.LL221: +#if !defined(TRMMKERNEL) + LDF [AO + 0 * SIZE], a1 + sra K, 2, L + FMOV FZERO, c01 + LDF [B + 0 * SIZE], b1 + mov B, BO + FMOV FZERO, t1 + + LDF [AO + 1 * SIZE], a2 + cmp L, 0 + FMOV FZERO, c02 + LDF [B + 1 * 
SIZE], b2 + FMOV FZERO, t2 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + LDF [B + 2 * SIZE], b3 + FMOV FZERO, t3 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [B + 3 * SIZE], b4 + FMOV FZERO, t4 +#else +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov B, BO +#else + sll KK, 2 + BASE_SHIFT, TEMP1 + sll KK, 0 + BASE_SHIFT, TEMP2 + + add AO, TEMP1, AO + add B, TEMP2, BO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 4, L +#else + add KK, 1, L +#endif + sra L, 2, L + cmp L, 0 + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c01 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t1 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c02 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t2 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, t3 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, t4 +#endif + + ble,pn %icc, .LL225 + prefetch [C1 + 4 * SIZE], 2 + +.LL222: + FADD c01, t1, c01 + add BO, 4 * SIZE, BO + FMUL a1, b1, t1 + LDF [AO + 4 * SIZE], a1 + + FADD c02, t2, c02 + FMUL a2, b1, t2 + LDF [AO + 5 * SIZE], a2 + + FADD c03, t3, c03 + add L, -1, L + FMUL a3, b1, t3 + LDF [AO + 6 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b1, t4 + LDF [AO + 7 * SIZE], a4 + LDF [BO + 0 * SIZE], b1 + + FADD c01, t1, c01 + cmp L, 0 + FMUL a1, b2, t1 + LDF [AO + 8 * SIZE], a1 + + FADD c02, t2, c02 + FMUL a2, b2, t2 + LDF [AO + 9 * SIZE], a2 + + FADD c03, t3, c03 + FMUL a3, b2, t3 + LDF [AO + 10 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b2, t4 + LDF [AO + 11 * SIZE], a4 + LDF [BO + 1 * SIZE], b2 + + FADD c01, t1, c01 + FMUL a1, b3, t1 + LDF [AO + 12 * SIZE], a1 + + FADD c02, t2, c02 + FMUL a2, b3, t2 + LDF [AO + 13 * SIZE], a2 + + FADD c03, t3, c03 + FMUL a3, b3, t3 + LDF [AO + 14 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b3, t4 + LDF [AO + 15 * SIZE], a4 + LDF [BO + 2 * SIZE], b3 + + FADD c01, t1, c01 + FMUL a1, b4, t1 + LDF [AO + 16 * SIZE], a1 + + FADD c02, t2, c02 + FMUL a2, b4, t2 + LDF [AO + 17 * SIZE], a2 + + FADD c03, t3, c03 + FMUL a3, b4, t3 + LDF [AO + 18 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b4, t4 + LDF [AO + 19 * SIZE], a4 + add AO, 16 * SIZE, AO + + bg,pt %icc, .LL222 + LDF [BO + 3 * SIZE], b4 + +.LL225: +#ifndef TRMMKERNEL + and K, 3, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 4, L +#else + add KK, 1, L +#endif + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL229 + nop + +.LL226: + FADD c01, t1, c01 + add BO, 1 * SIZE, BO + FMUL a1, b1, t1 + LDF [AO + 4 * SIZE], a1 + + FADD c02, t2, c02 + add L, -1, L + FMUL a2, b1, t2 + LDF [AO + 5 * SIZE], a2 + + FADD c03, t3, c03 + cmp L, 0 + FMUL a3, b1, t3 + LDF [AO + 6 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b1, t4 + LDF [AO + 7 * SIZE], a4 + add AO, 4 * SIZE, AO + + bg,pt %icc, .LL226 + LDF [BO + 0 * SIZE], b1 + +.LL229: +#ifndef TRMMKERNEL + FADD c01, t1, c01 + add I, -1, I + FADD c02, t2, c02 + cmp I, 0 + FADD c03, t3, c03 + FADD c04, t4, c04 + + FMUL c01, ALPHA, c01 + FMUL c02, ALPHA, c02 + FMUL c03, ALPHA, c03 + FMUL c04, ALPHA, c04 + + LDF [C1 + 0 * SIZE], a1 + LDF [C1 + 1 * SIZE], a2 + LDF [C1 + 2 * SIZE], a3 + LDF [C1 + 3 * SIZE], a4 + + FADD c01, a1, c01 + FADD c02, a2, c02 + FADD c03, a3, c03 + FADD c04, a4, c04 + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C1 + 2 * SIZE] + STF c04, [C1 + 3 * SIZE] + add C1, 4 * SIZE, C1 +#else + FADD c01, t1, c01 + 
FADD c02, t2, c02 + FADD c03, t3, c03 + FADD c04, t4, c04 + + FMUL c01, ALPHA, c01 + FMUL c02, ALPHA, c02 + FMUL c03, ALPHA, c03 + FMUL c04, ALPHA, c04 + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C1 + 2 * SIZE] + STF c04, [C1 + 3 * SIZE] + add C1, 4 * SIZE, C1 + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub K, KK, TEMP1 +#ifdef LEFT + add TEMP1, -4, TEMP1 +#else + add TEMP1, -1, TEMP1 +#endif + sll TEMP1, 2 + BASE_SHIFT, TEMP2 + sll TEMP1, 0 + BASE_SHIFT, TEMP1 + + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LEFT + add KK, 4, KK +#endif + + add I, -1, I + cmp I, 0 +#endif + + bg,pt %icc, .LL221 + nop + +.LL250: + and M, 2, I + cmp I, 0 + ble,pn %icc, .LL270 + nop + +.LL251: +#if !defined(TRMMKERNEL) + LDF [AO + 0 * SIZE], a1 + sra K, 2, L + FMOV FZERO, c01 + LDF [B + 0 * SIZE], b1 + mov B, BO + FMOV FZERO, t1 + + LDF [AO + 1 * SIZE], a2 + cmp L, 0 + FMOV FZERO, c02 + LDF [B + 1 * SIZE], b2 + FMOV FZERO, t2 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + LDF [B + 2 * SIZE], b3 + FMOV FZERO, t3 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [B + 3 * SIZE], b4 + FMOV FZERO, t4 +#else +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov B, BO +#else + sll KK, 1 + BASE_SHIFT, TEMP1 + sll KK, 0 + BASE_SHIFT, TEMP2 + + add AO, TEMP1, AO + add B, TEMP2, BO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 2, L +#else + add KK, 1, L +#endif + sra L, 2, L + cmp L, 0 + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c01 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t1 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c02 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t2 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, t3 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, t4 +#endif + + ble,pn %icc, .LL255 + nop + +.LL252: + FADD c01, t1, c01 + add L, -1, L + FMUL a1, b1, t1 + LDF [AO + 4 * SIZE], a1 + + FADD c02, t2, c02 + FMUL a2, b1, t2 + LDF [AO + 5 * SIZE], a2 + LDF [BO + 4 * SIZE], b1 + + FADD c03, t3, c03 + cmp L, 0 + FMUL a3, b2, t3 + LDF [AO + 6 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b2, t4 + LDF [AO + 7 * SIZE], a4 + LDF [BO + 5 * SIZE], b2 + + FADD c01, t1, c01 + FMUL a1, b3, t1 + LDF [AO + 8 * SIZE], a1 + + FADD c02, t2, c02 + FMUL a2, b3, t2 + LDF [AO + 9 * SIZE], a2 + LDF [BO + 6 * SIZE], b3 + + FADD c03, t3, c03 + FMUL a3, b4, t3 + LDF [AO + 10 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b4, t4 + LDF [AO + 11 * SIZE], a4 + add AO, 8 * SIZE, AO + + LDF [BO + 7 * SIZE], b4 + bg,pt %icc, .LL252 + add BO, 4 * SIZE, BO + +.LL255: +#ifndef TRMMKERNEL + and K, 3, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 2, L +#else + add KK, 1, L +#endif + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL259 + nop + +.LL256: + + FADD c01, t1, c01 + add L, -1, L + FMUL a1, b1, t1 + LDF [AO + 2 * SIZE], a1 + + FADD c02, t2, c02 + cmp L, 0 + FMUL a2, b1, t2 + LDF [AO + 3 * SIZE], a2 + + LDF [BO + 1 * SIZE], b1 + add AO, 2 * SIZE, AO + + bg,pt %icc, .LL256 + add BO, 1 * SIZE, BO + +.LL259: +#ifndef TRMMKERNEL + FADD c01, t1, c01 + LDF [C1 + 0 * SIZE], a1 + FADD c02, t2, c02 + LDF [C1 + 1 * SIZE], a2 + FADD c03, t3, c03 + FADD c04, t4, c04 + + FADD c01, c03, c01 + FADD c02, c04, c02 + FMUL c01, ALPHA, c01 + FMUL c02, ALPHA, c02 + FADD c01, a1, c01 + FADD c02, a2, c02 + + 
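+ /* c01/c02 now hold alpha*(A*B) plus the old C values for the two rows of this
+    single column; store them and step C1 forward below. */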
STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + add C1, 2 * SIZE, C1 +#else + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c03, t3, c03 + FADD c04, t4, c04 + + FADD c01, c03, c01 + FADD c02, c04, c02 + FMUL c01, ALPHA, c01 + FMUL c02, ALPHA, c02 + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + add C1, 2 * SIZE, C1 + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub K, KK, TEMP1 +#ifdef LEFT + add TEMP1, -2, TEMP1 +#else + add TEMP1, -1, TEMP1 +#endif + sll TEMP1, 1 + BASE_SHIFT, TEMP2 + sll TEMP1, 0 + BASE_SHIFT, TEMP1 + + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LEFT + add KK, 2, KK +#endif +#endif + +.LL270: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL999 + nop + +.LL271: +#if !defined(TRMMKERNEL) + LDF [AO + 0 * SIZE], a1 + sra K, 2, L + FMOV FZERO, t1 + + LDF [AO + 1 * SIZE], a2 + mov B, BO + FMOV FZERO, c01 + + LDF [AO + 2 * SIZE], a3 + cmp L, 0 + FMOV FZERO, t2 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c02 + + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t3 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t4 + LDF [BO + 2 * SIZE], b3 +#else +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov B, BO +#else + sll KK, 0 + BASE_SHIFT, TEMP1 + sll KK, 0 + BASE_SHIFT, TEMP2 + + add AO, TEMP1, AO + add B, TEMP2, BO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 1, L +#else + add KK, 1, L +#endif + sra L, 2, L + cmp L, 0 + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, t1 + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c01 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, t2 + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c02 + + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t3 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t4 + LDF [BO + 2 * SIZE], b3 +#endif + + ble,pn %icc, .LL275 + LDF [BO + 3 * SIZE], b4 + +.LL272: + FADD c01, t1, c01 + add L, -1, L + add AO, 4 * SIZE, AO + + FMUL a1, b1, t1 + add BO, 4 * SIZE, BO + LDF [AO + 0 * SIZE], a1 + + FADD c02, t2, c02 + cmp L, 0 + LDF [BO + 0 * SIZE], b1 + FMUL a2, b2, t2 + + LDF [AO + 1 * SIZE], a2 + FADD c01, t3, c01 + LDF [BO + 1 * SIZE], b2 + FMUL a3, b3, t3 + + LDF [AO + 2 * SIZE], a3 + FADD c02, t4, c02 + LDF [BO + 2 * SIZE], b3 + FMUL a4, b4, t4 + LDF [AO + 3 * SIZE], a4 + + bg,pt %icc, .LL272 + LDF [BO + 3 * SIZE], b4 + +.LL275: +#ifndef TRMMKERNEL + and K, 3, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 1, L +#else + add KK, 1, L +#endif + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL279 + nop + +.LL276: + FADD c01, t1, c01 + add L, -1, L + FMUL a1, b1, t1 + LDF [AO + 1 * SIZE], a1 + + LDF [BO + 1 * SIZE], b1 + add BO, 1 * SIZE, BO + cmp L, 0 + bg,pt %icc, .LL276 + add AO, 1 * SIZE, AO + +.LL279: +#ifndef TRMMKERNEL + FADD c01, t1, c01 + + LDF [C1 + 0 * SIZE], a1 + FADD c02, t2, c02 + FADD c01, t3, c01 + FADD c02, t4, c02 + FADD c01, c02, c01 + + FMUL c01, ALPHA, c01 + FADD c01, a1, c01 + STF c01, [C1 + 0 * SIZE] +#else + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c01, t3, c01 + FADD c02, t4, c02 + FADD c01, c02, c01 + + FMUL c01, ALPHA, c01 + STF c01, [C1 + 0 * SIZE] + +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub K, KK, TEMP1 +#ifdef LEFT + add TEMP1, -1, TEMP1 +#else + add TEMP1, -1, TEMP1 +#endif + sll TEMP1, 0 + BASE_SHIFT, TEMP2 + sll TEMP1, 0 + BASE_SHIFT, TEMP1 + + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LEFT + add KK, 1, KK +#endif +#endif + 
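+ /* Every N/M remainder case has been handled; fall through to the common exit,
+    which clears %o0 and returns to the caller. */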
+.LL999: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/gemm_kernel_2x8.S b/kernel/sparc/gemm_kernel_2x8.S new file mode 100644 index 0000000000..c0d257aa02 --- /dev/null +++ b/kernel/sparc/gemm_kernel_2x8.S @@ -0,0 +1,2561 @@ +/*********************************************************************/ +/* Copyright 2005-2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define APREFETCHSIZE 24 +#define APREFETCH_CATEGORY 0 + +#define M %i0 +#define N %i1 +#define K %i2 + +#if defined(DOUBLE) && !defined(__64BIT__) +#define A %i5 +#define B %i4 +#else +#define A %i4 +#define B %i5 +#endif + +#define C %o4 +#define LDC %o5 + +#define AO %l0 +#define BO %l1 +#define I %l2 +#define J %l3 +#define L %l4 + +#define BB %o7 + +#define C1 %o0 +#define C2 %o1 +#define C3 %o2 +#define C4 %o3 + +#define C5 %l5 +#define C6 %l6 +#define C7 %l7 +#define C8 %i3 + +#define OFFSET %g1 +#define KK %g2 +#define TEMP1 %g3 +#define TEMP2 %g4 + +#ifdef DOUBLE +#define c01 %f0 +#define c02 %f2 +#define c03 %f4 +#define c04 %f6 +#define c05 %f8 +#define c06 %f10 +#define c07 %f12 +#define c08 %f14 +#define c09 %f16 +#define c10 %f18 +#define c11 %f20 +#define c12 %f22 +#define c13 %f24 +#define c14 %f26 +#define c15 %f28 +#define c16 %f30 + +#define a1 %f32 +#define a2 %f34 +#define a3 %f36 +#define a4 %f38 +#define a5 %f40 + +#define b1 %f42 +#define b2 %f44 +#define b3 %f46 +#define b4 %f48 +#define b5 %f50 +#define b6 %f52 +#define b7 %f54 +#define b8 %f56 +#define b9 %f58 + +#define ALPHA %f62 + +#define cc01 0 +#define cc02 2 +#define cc03 4 +#define cc04 6 +#define cc05 8 +#define cc06 10 +#define cc07 12 +#define cc08 14 +#define cc09 16 +#define cc10 18 +#define cc11 20 +#define cc12 22 +#define cc13 24 +#define cc14 26 +#define cc15 28 +#define cc16 30 + +#define aa1 1 +#define aa2 3 +#define aa3 5 +#define aa4 7 +#define aa5 9 + +#define bb1 11 +#define bb2 13 +#define bb3 15 +#define bb4 17 +#define bb5 19 +#define bb6 21 +#define bb7 23 +#define bb8 25 +#define bb9 27 + +#define alpha 31 +#else +#define c01 %f0 +#define c02 %f1 +#define c03 %f2 +#define c04 %f3 +#define c05 %f4 +#define c06 %f5 +#define c07 %f6 +#define c08 %f7 +#define c09 %f8 +#define c10 %f9 +#define c11 %f10 +#define c12 %f11 +#define c13 %f12 +#define c14 %f13 +#define c15 %f14 +#define c16 %f15 + +#define a1 %f16 +#define a2 %f17 +#define a3 %f18 +#define a4 %f19 +#define a5 %f20 + +#define b1 %f21 +#define b2 %f22 +#define b3 %f23 +#define b4 %f24 +#define b5 %f25 +#define b6 %f26 +#define b7 %f27 +#define b8 %f28 +#define b9 %f29 + +#define ALPHA %f31 + +#define cc01 0 +#define cc02 1 +#define cc03 2 +#define cc04 3 +#define cc05 4 +#define cc06 5 +#define cc07 6 +#define cc08 7 +#define cc09 8 +#define cc10 9 +#define cc11 10 +#define cc12 11 +#define cc13 12 +#define cc14 13 +#define cc15 14 +#define cc16 15 + +#define aa1 16 +#define aa2 17 +#define aa3 18 +#define aa4 19 +#define aa5 20 + +#define bb1 21 +#define bb2 22 +#define bb3 23 +#define bb4 24 +#define bb5 25 +#define bb6 26 +#define bb7 27 +#define bb8 28 +#define bb9 29 + +#define alpha 31 + +#endif + + .register %g2, #scratch + .register %g3, #scratch + + PROLOGUE + SAVESP + nop + +#ifndef __64BIT__ + +#ifdef DOUBLE + st %i3, [%sp + STACK_START + 16] + st %i4, [%sp + STACK_START + 20] + + ld [%sp + STACK_START + 28], B + ld [%sp + STACK_START + 32], C + ld [%sp + STACK_START + 36], LDC +#ifdef TRMMKERNEL + ld [%sp + STACK_START + 40], OFFSET +#endif +#else + st %i3, [%sp + STACK_START + 16] + + ld [%sp + STACK_START + 28], C + ld [%sp + STACK_START + 32], LDC +#ifdef TRMMKERNEL + ld [%sp + STACK_START + 36], OFFSET +#endif +#endif + LDF [%sp + STACK_START + 16], ALPHA +#ifdef TRMMKERNEL + st %g1, [%sp + STACK_START + 8] + st %g2, [%sp + STACK_START + 12] + st %g3, [%sp + STACK_START + 16] + st %g4, [%sp + 
STACK_START + 20] +#endif +#else + + ldx [%sp+ STACK_START + 56], C + ldx [%sp+ STACK_START + 64], LDC +#ifdef TRMMKERNEL + ldx [%sp+ STACK_START + 72], OFFSET +#endif + +#ifdef DOUBLE + FMOV %f6, ALPHA +#else + FMOV %f7, ALPHA +#endif + +#ifdef TRMMKERNEL + stx %g1, [%sp + STACK_START + 32] + stx %g2, [%sp + STACK_START + 40] + stx %g3, [%sp + STACK_START + 48] + stx %g4, [%sp + STACK_START + 56] +#endif + +#endif + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg OFFSET, KK +#endif + + sra N, 3, J + cmp J, 0 + ble,pn %icc, .LL30 + sll LDC, BASE_SHIFT, LDC + +.LL11: + mov C, C1 + add C, LDC, C2 + add C2, LDC, C3 + add C3, LDC, C4 + add C4, LDC, C5 + add C5, LDC, C6 + add C6, LDC, C7 + add C7, LDC, C8 + add C8, LDC, C + + sll K, BASE_SHIFT + 3, BB + +#if defined(TRMMKERNEL) && defined(LEFT) + mov OFFSET, KK +#endif + + mov A, AO + + sra M, 1, I + cmp I, 0 + ble,pn %icc, .LL20 + add B, BB, BB + .align 4 + +.LL12: + prefetch [BB + 0 * SIZE], 1 + +#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))) + mov B, BO +#else + sll KK, BASE_SHIFT + 1, TEMP1 + sll KK, BASE_SHIFT + 3, TEMP2 + + add AO, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 8 * SIZE], a5 + + LDF [BO + 0 * SIZE], b1 + + LDF [BO + 1 * SIZE], b2 + FCLR (cc01) + LDF [BO + 2 * SIZE], b3 + FCLR (cc05) + LDF [BO + 3 * SIZE], b4 + FCLR (cc09) + LDF [BO + 4 * SIZE], b5 + FCLR (cc13) + + LDF [BO + 5 * SIZE], b6 + FCLR (cc02) + LDF [BO + 6 * SIZE], b7 + FCLR (cc06) + LDF [BO + 7 * SIZE], b8 + FCLR (cc10) + LDF [BO + 8 * SIZE], b9 + FCLR (cc14) + + prefetch [C1 + 1 * SIZE], 3 + FCLR (cc03) + prefetch [C2 + 2 * SIZE], 3 + FCLR (cc07) + prefetch [C3 + 1 * SIZE], 3 + FCLR (cc11) + prefetch [C4 + 2 * SIZE], 3 + FCLR (cc15) + + prefetch [C5 + 1 * SIZE], 3 + FCLR (cc04) + prefetch [C6 + 2 * SIZE], 3 + FCLR (cc08) + prefetch [C7 + 1 * SIZE], 3 + FCLR (cc12) + prefetch [C8 + 2 * SIZE], 3 + FCLR (cc16) + +#ifndef TRMMKERNEL + sra K, 3, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 2, L +#else + add KK, 8, L +#endif + sra L, 3, L +#endif + cmp L, 0 + ble,pn %icc, .LL15 + add BB, 32 * SIZE, BB + .align 4 + +.LL13: + FMADD (aa1, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa1, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 16 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 9 * SIZE], b2 + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + LDF [AO + 2 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 3 * SIZE], a4 + + FMADD (aa1, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + FMADD (aa2, bb6, cc12, cc12) + nop + + FMADD (aa1, bb7, cc13, cc13) + LDF [BO + 12 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa1, bb8, cc15, cc15) + LDF [BO + 14 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO + 15 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 24 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 17 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 18 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 19 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + 
LDF [AO + 4 * SIZE], a1 + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 5 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + add L, -1, L + FMADD (aa4, bb6, cc12, cc12) + nop + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 20 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 21 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 22 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + LDF [BO + 23 * SIZE], b8 + + FMADD (aa1, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa1, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 32 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 25 * SIZE], b2 + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 26 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 27 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + LDF [AO + 6 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 7 * SIZE], a4 + + FMADD (aa1, bb6, cc11, cc11) + nop + FMADD (aa2, bb6, cc12, cc12) + nop + + FMADD (aa1, bb7, cc13, cc13) + LDF [BO + 28 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO + 29 * SIZE], b6 + + FMADD (aa1, bb8, cc15, cc15) + LDF [BO + 30 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO + 31 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 40 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 33 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 34 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 35 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 16 * SIZE], a1 /****/ + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 9 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + nop + FMADD (aa4, bb6, cc12, cc12) + nop + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 36 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 37 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 38 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + LDF [BO + 39 * SIZE], b8 + + FMADD (aa5, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa5, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa5, bb3, cc05, cc05) + LDF [BO + 48 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 41 * SIZE], b2 + + FMADD (aa5, bb4, cc07, cc07) + LDF [BO + 42 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 43 * SIZE], b4 + + FMADD (aa5, bb5, cc09, cc09) + LDF [AO + 10 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 11 * SIZE], a4 + + FMADD (aa5, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY + FMADD (aa2, bb6, cc12, cc12) + nop + + FMADD (aa5, bb7, cc13, cc13) + LDF [BO + 44 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO + 45 * SIZE], b6 + + FMADD (aa5, bb8, cc15, cc15) + LDF [BO + 46 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO + 47 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 56 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 49 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 50 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 51 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 12 * SIZE], a5 + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 13 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + cmp L, 0 + FMADD (aa4, bb6, cc12, cc12) + nop + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 52 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 53 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + 
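+ /* Still the .LL13 body: this kernel keeps the full 2x8 tile of C in
+    cc01..cc16 and uses the fused FMADD macro, so operand loads for upcoming
+    k-steps are interleaved directly with the multiply-adds. */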
LDF [BO + 54 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + LDF [BO + 55 * SIZE], b8 + + FMADD (aa5, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa5, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa5, bb3, cc05, cc05) + LDF [BO + 64 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 57 * SIZE], b2 + + FMADD (aa5, bb4, cc07, cc07) + LDF [BO + 58 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 59 * SIZE], b4 + + FMADD (aa5, bb5, cc09, cc09) + LDF [AO + 14 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 15 * SIZE], a4 + + FMADD (aa5, bb6, cc11, cc11) + add BO, 64 * SIZE, BO + FMADD (aa2, bb6, cc12, cc12) + add AO, 16 * SIZE, AO + + FMADD (aa5, bb7, cc13, cc13) + LDF [BO - 4 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO - 3 * SIZE], b6 + + FMADD (aa5, bb8, cc15, cc15) + LDF [BO - 2 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO - 1 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 8 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 1 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 2 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 3 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 8 * SIZE], a5 /****/ + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 1 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + FMADD (aa4, bb6, cc12, cc12) + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 4 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 5 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 6 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + ble,pn %icc, .LL15 + LDF [BO + 7 * SIZE], b8 + + FMADD (aa1, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa1, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 16 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 9 * SIZE], b2 + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + LDF [AO + 2 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 3 * SIZE], a4 + + FMADD (aa1, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + FMADD (aa2, bb6, cc12, cc12) + nop + + FMADD (aa1, bb7, cc13, cc13) + LDF [BO + 12 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa1, bb8, cc15, cc15) + LDF [BO + 14 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO + 15 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 24 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 17 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 18 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 19 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 4 * SIZE], a1 + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 5 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + add L, -1, L + FMADD (aa4, bb6, cc12, cc12) + nop + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 20 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 21 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 22 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + LDF [BO + 23 * SIZE], b8 + + FMADD (aa1, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa1, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 32 * SIZE], b1 + FMADD (aa2, 
bb3, cc06, cc06) + LDF [BO + 25 * SIZE], b2 + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 26 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 27 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + LDF [AO + 6 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 7 * SIZE], a4 + + FMADD (aa1, bb6, cc11, cc11) + nop + FMADD (aa2, bb6, cc12, cc12) + nop + + FMADD (aa1, bb7, cc13, cc13) + LDF [BO + 28 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO + 29 * SIZE], b6 + + FMADD (aa1, bb8, cc15, cc15) + LDF [BO + 30 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO + 31 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 40 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 33 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 34 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 35 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 16 * SIZE], a1 /****/ + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 9 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + nop + FMADD (aa4, bb6, cc12, cc12) + nop + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 36 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 37 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 38 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + LDF [BO + 39 * SIZE], b8 + + FMADD (aa5, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa5, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa5, bb3, cc05, cc05) + LDF [BO + 48 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 41 * SIZE], b2 + + FMADD (aa5, bb4, cc07, cc07) + LDF [BO + 42 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 43 * SIZE], b4 + + FMADD (aa5, bb5, cc09, cc09) + LDF [AO + 10 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 11 * SIZE], a4 + + FMADD (aa5, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY + FMADD (aa2, bb6, cc12, cc12) + nop + + FMADD (aa5, bb7, cc13, cc13) + LDF [BO + 44 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO + 45 * SIZE], b6 + + FMADD (aa5, bb8, cc15, cc15) + LDF [BO + 46 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO + 47 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 56 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 49 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 50 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 51 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 12 * SIZE], a5 + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 13 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + cmp L, 0 + FMADD (aa4, bb6, cc12, cc12) + nop + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 52 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 53 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 54 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + LDF [BO + 55 * SIZE], b8 + + FMADD (aa5, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa5, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa5, bb3, cc05, cc05) + LDF [BO + 64 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 57 * SIZE], b2 + + FMADD (aa5, bb4, cc07, cc07) + LDF [BO + 58 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 59 * SIZE], b4 + + FMADD (aa5, bb5, cc09, cc09) + LDF [AO + 14 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 15 * SIZE], a4 + + FMADD (aa5, bb6, cc11, cc11) + add 
BO, 64 * SIZE, BO + FMADD (aa2, bb6, cc12, cc12) + add AO, 16 * SIZE, AO + + FMADD (aa5, bb7, cc13, cc13) + LDF [BO - 4 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO - 3 * SIZE], b6 + + FMADD (aa5, bb8, cc15, cc15) + LDF [BO - 2 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO - 1 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 8 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 1 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 2 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 3 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 8 * SIZE], a5 /****/ + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 1 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + FMADD (aa4, bb6, cc12, cc12) + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 4 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 5 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 6 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + bg,pt %icc, .LL13 + LDF [BO + 7 * SIZE], b8 + .align 4 + +.LL15: +#ifndef TRMMKERNEL + and K, 7, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 2, L +#else + add KK, 8, L +#endif + and L, 7, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL18 + nop + .align 4 + +.LL17: + FMADD (aa1, bb1, cc01, cc01) + add L, -1, L + FMADD (aa2, bb1, cc02, cc02) + nop + + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 8 * SIZE], b1 + FMADD (aa2, bb2, cc04, cc04) + LDF [BO + 9 * SIZE], b2 + + FMADD (aa1, bb3, cc05, cc05) + cmp L, 0 + FMADD (aa2, bb3, cc06, cc06) + nop + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + nop + FMADD (aa2, bb5, cc10, cc10) + nop + + FMADD (aa1, bb6, cc11, cc11) + LDF [BO + 12 * SIZE], b5 + FMADD (aa2, bb6, cc12, cc12) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa1, bb7, cc13, cc13) + add AO, 2 * SIZE, AO + FMADD (aa2, bb7, cc14, cc14) + add BO, 8 * SIZE, BO + + FMADD (aa1, bb8, cc15, cc15) + LDF [AO + 0 * SIZE], a1 + FMADD (aa2, bb8, cc16, cc16) + LDF [AO + 1 * SIZE], a2 + + LDF [BO + 6 * SIZE], b7 + bg,pt %icc, .LL17 + LDF [BO + 7 * SIZE], b8 + nop + .align 4 + +.LL18: +#ifndef TRMMKERNEL + LDF [C1 + 0 * SIZE], a1 + LDF [C1 + 1 * SIZE], a2 + LDF [C2 + 0 * SIZE], a3 + LDF [C2 + 1 * SIZE], a4 + + LDF [C3 + 0 * SIZE], b1 + LDF [C3 + 1 * SIZE], b2 + LDF [C4 + 0 * SIZE], b3 + LDF [C4 + 1 * SIZE], b4 + + FMADD (alpha, cc01, aa1, cc01) + LDF [C5 + 0 * SIZE], a1 + FMADD (alpha, cc02, aa2, cc02) + LDF [C5 + 1 * SIZE], a2 + FMADD (alpha, cc03, aa3, cc03) + LDF [C6 + 0 * SIZE], a3 + FMADD (alpha, cc04, aa4, cc04) + LDF [C6 + 1 * SIZE], a4 + + FMADD (alpha, cc05, bb1, cc05) + LDF [C7 + 0 * SIZE], b1 + FMADD (alpha, cc06, bb2, cc06) + LDF [C7 + 1 * SIZE], b2 + FMADD (alpha, cc07, bb3, cc07) + LDF [C8 + 0 * SIZE], b3 + FMADD (alpha, cc08, bb4, cc08) + LDF [C8 + 1 * SIZE], b4 + + FMADD (alpha, cc09, aa1, cc09) + STF c01, [C1 + 0 * SIZE] + FMADD (alpha, cc10, aa2, cc10) + STF c02, [C1 + 1 * SIZE] + FMADD (alpha, cc11, aa3, cc11) + STF c03, [C2 + 0 * SIZE] + FMADD (alpha, cc12, aa4, cc12) + STF c04, [C2 + 1 * SIZE] + + FMADD (alpha, cc13, bb1, cc13) + STF c05, [C3 + 0 * SIZE] + FMADD (alpha, cc14, bb2, cc14) + STF c06, [C3 + 1 * SIZE] + FMADD (alpha, cc15, bb3, cc15) + STF c07, [C4 + 0 * SIZE] + FMADD (alpha, cc16, bb4, cc16) + STF c08, [C4 + 1 * SIZE] + +#else + FMUL ALPHA, c01, c01 + 
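+ /* TRMM store path: the 2x8 tile is written out as alpha*(A*B) only; the
+    previous contents of C are not loaded or accumulated here. */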
FMUL ALPHA, c02, c02 + FMUL ALPHA, c03, c03 + FMUL ALPHA, c04, c04 + + FMUL ALPHA, c05, c05 + FMUL ALPHA, c06, c06 + FMUL ALPHA, c07, c07 + FMUL ALPHA, c08, c08 + + FMUL ALPHA, c09, c09 + STF c01, [C1 + 0 * SIZE] + FMUL ALPHA, c10, c10 + STF c02, [C1 + 1 * SIZE] + FMUL ALPHA, c11, c11 + STF c03, [C2 + 0 * SIZE] + FMUL ALPHA, c12, c12 + STF c04, [C2 + 1 * SIZE] + + FMUL ALPHA, c13, c13 + STF c05, [C3 + 0 * SIZE] + FMUL ALPHA, c14, c14 + STF c06, [C3 + 1 * SIZE] + FMUL ALPHA, c15, c15 + STF c07, [C4 + 0 * SIZE] + FMUL ALPHA, c16, c16 + STF c08, [C4 + 1 * SIZE] +#endif + + STF c09, [C5 + 0 * SIZE] + add C1, 2 * SIZE, C1 + STF c10, [C5 + 1 * SIZE] + add C2, 2 * SIZE, C2 + STF c11, [C6 + 0 * SIZE] + add C3, 2 * SIZE, C3 + STF c12, [C6 + 1 * SIZE] + add C4, 2 * SIZE, C4 + + STF c13, [C7 + 0 * SIZE] + add C5, 2 * SIZE, C5 + STF c14, [C7 + 1 * SIZE] + add C6, 2 * SIZE, C6 + STF c15, [C8 + 0 * SIZE] + add C7, 2 * SIZE, C7 + STF c16, [C8 + 1 * SIZE] + add C8, 2 * SIZE, C8 + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub K, KK, TEMP1 +#ifdef LEFT + add TEMP1, -2, TEMP1 +#else + add TEMP1, -8, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 3, TEMP1 + + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LEFT + add KK, 2, KK +#endif +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL12 + nop + .align 4 + +.LL20: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL29 + nop + +#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))) + mov B, BO +#else + sll KK, BASE_SHIFT + 0, TEMP1 + sll KK, BASE_SHIFT + 3, TEMP2 + + add AO, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [BO + 0 * SIZE], b1 + FCLR (cc01) + LDF [BO + 1 * SIZE], b2 + FCLR (cc03) + LDF [BO + 2 * SIZE], b3 + FCLR (cc05) + LDF [BO + 3 * SIZE], b4 + FCLR (cc07) + LDF [BO + 4 * SIZE], b5 + FCLR (cc09) + LDF [BO + 5 * SIZE], b6 + FCLR (cc11) + LDF [BO + 6 * SIZE], b7 + FCLR (cc13) + LDF [BO + 7 * SIZE], b8 + FCLR (cc15) + +#ifndef TRMMKERNEL + sra K, 2, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 1, L +#else + add KK, 8, L +#endif + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL25 + LDF [BO + 8 * SIZE], b9 + .align 4 + +.LL23: + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + add L, -1, L + + FMADD (aa1, bb1, cc01, cc01) + LDF [BO + 16 * SIZE], b1 + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 9 * SIZE], b2 + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 10 * SIZE], b3 + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 11 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + LDF [BO + 12 * SIZE], b5 + FMADD (aa1, bb6, cc11, cc11) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa1, bb7, cc13, cc13) + LDF [BO + 14 * SIZE], b7 + FMADD (aa1, bb8, cc15, cc15) + LDF [BO + 15 * SIZE], b8 + + FMADD (aa2, bb9, cc01, cc01) + LDF [BO + 24 * SIZE], b9 + FMADD (aa2, bb2, cc03, cc03) + LDF [BO + 17 * SIZE], b2 + + FMADD (aa2, bb3, cc05, cc05) + LDF [BO + 18 * SIZE], b3 + FMADD (aa2, bb4, cc07, cc07) + LDF [BO + 19 * SIZE], b4 + + FMADD (aa2, bb5, cc09, cc09) + LDF [BO + 20 * SIZE], b5 + FMADD (aa2, bb6, cc11, cc11) + LDF [BO + 21 * SIZE], b6 + + FMADD (aa2, bb7, cc13, cc13) + LDF [BO + 22 * SIZE], b7 + FMADD (aa2, bb8, cc15, cc15) + LDF [BO + 23 * SIZE], b8 + + LDF [AO + 4 * SIZE], a1 + LDF [AO + 5 * SIZE], a2 + + FMADD 
(aa3, bb1, cc01, cc01) + LDF [BO + 32 * SIZE], b1 + FMADD (aa3, bb2, cc03, cc03) + LDF [BO + 25 * SIZE], b2 + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 26 * SIZE], b3 + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 27 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [BO + 28 * SIZE], b5 + FMADD (aa3, bb6, cc11, cc11) + LDF [BO + 29 * SIZE], b6 + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 30 * SIZE], b7 + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 31 * SIZE], b8 + + FMADD (aa4, bb9, cc01, cc01) + LDF [BO + 40 * SIZE], b9 + FMADD (aa4, bb2, cc03, cc03) + LDF [BO + 33 * SIZE], b2 + + FMADD (aa4, bb3, cc05, cc05) + LDF [BO + 34 * SIZE], b3 + FMADD (aa4, bb4, cc07, cc07) + LDF [BO + 35 * SIZE], b4 + + FMADD (aa4, bb5, cc09, cc09) + LDF [BO + 36 * SIZE], b5 + FMADD (aa4, bb6, cc11, cc11) + LDF [BO + 37 * SIZE], b6 + + FMADD (aa4, bb7, cc13, cc13) + LDF [BO + 38 * SIZE], b7 + FMADD (aa4, bb8, cc15, cc15) + LDF [BO + 39 * SIZE], b8 + + LDF [AO + 6 * SIZE], a3 + LDF [AO + 7 * SIZE], a4 + + add AO, 4 * SIZE, AO + cmp L, 0 + bg,pt %icc, .LL23 + add BO, 32 * SIZE, BO + .align 4 + +.LL25: +#ifndef TRMMKERNEL + and K, 3, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 1, L +#else + add KK, 8, L +#endif + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL28 + nop + .align 4 + +.LL27: + FMADD (aa1, bb1, cc01, cc01) + LDF [BO + 8 * SIZE], b1 + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 9 * SIZE], b2 + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 10 * SIZE], b3 + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 11 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + LDF [BO + 12 * SIZE], b5 + FMADD (aa1, bb6, cc11, cc11) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa1, bb7, cc13, cc13) + LDF [BO + 14 * SIZE], b7 + FMADD (aa1, bb8, cc15, cc15) + LDF [BO + 15 * SIZE], b8 + + LDF [AO + 1 * SIZE], a1 + add AO, 1 * SIZE, AO + + add L, -1, L + cmp L, 0 + bg,pt %icc, .LL27 + add BO, 8 * SIZE, BO + .align 4 + +.LL28: +#ifndef TRMMKERNEL + LDF [C1 + 0 * SIZE], a1 + LDF [C2 + 0 * SIZE], a2 + LDF [C3 + 0 * SIZE], a3 + LDF [C4 + 0 * SIZE], a4 + + FMADD (alpha, cc01, aa1, cc01) + LDF [C5 + 0 * SIZE], b1 + FMADD (alpha, cc03, aa2, cc03) + LDF [C6 + 0 * SIZE], b2 + + FMADD (alpha, cc05, aa3, cc05) + LDF [C7 + 0 * SIZE], b3 + FMADD (alpha, cc07, aa4, cc07) + LDF [C8 + 0 * SIZE], b4 + + FMADD (alpha, cc09, bb1, cc09) + STF c01, [C1 + 0 * SIZE] + FMADD (alpha, cc11, bb2, cc11) + STF c03, [C2 + 0 * SIZE] + FMADD (alpha, cc13, bb3, cc13) + STF c05, [C3 + 0 * SIZE] + FMADD (alpha, cc15, bb4, cc15) + STF c07, [C4 + 0 * SIZE] +#else + FMUL ALPHA, c01, c01 + FMUL ALPHA, c03, c03 + FMUL ALPHA, c05, c05 + FMUL ALPHA, c07, c07 + + FMUL ALPHA, c09, c09 + STF c01, [C1 + 0 * SIZE] + FMUL ALPHA, c11, c11 + STF c03, [C2 + 0 * SIZE] + + FMUL ALPHA, c13, c13 + STF c05, [C3 + 0 * SIZE] + FMUL ALPHA, c15, c15 + STF c07, [C4 + 0 * SIZE] +#endif + + STF c09, [C5 + 0 * SIZE] + STF c11, [C6 + 0 * SIZE] + STF c13, [C7 + 0 * SIZE] + STF c15, [C8 + 0 * SIZE] + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub K, KK, TEMP1 +#ifdef LEFT + add TEMP1, -1, TEMP1 +#else + add TEMP1, -8, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 0, TEMP2 + sll TEMP1, BASE_SHIFT + 3, TEMP1 + + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LEFT + add KK, 1, KK +#endif +#endif + .align 4 + +.LL29: +#if defined(TRMMKERNEL) && !defined(LEFT) + add KK, 8, KK +#endif + + add J, -1, J + cmp J, 0 + bg,pt %icc, .LL11 + mov BO, B + .align 4 + +.LL30: + and 
N, 4, J + cmp J, 0 + ble,pn %icc, .LL50 + mov C, C1 + + add C, LDC, C2 + add C2, LDC, C3 + add C3, LDC, C4 + add C4, LDC, C + +#if defined(TRMMKERNEL) && defined(LEFT) + mov OFFSET, KK +#endif + + sra M, 1, I + cmp I, 0 + ble,pn %icc, .LL40 + mov A, AO + .align 4 + +.LL32: +#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))) + mov B, BO +#else + sll KK, BASE_SHIFT + 1, TEMP1 + sll KK, BASE_SHIFT + 2, TEMP2 + + add AO, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + LDF [BO + 3 * SIZE], b4 + LDF [BO + 4 * SIZE], b5 + + LDF [BO + 5 * SIZE], b6 + FCLR (cc01) + LDF [BO + 6 * SIZE], b7 + FCLR (cc02) + LDF [BO + 7 * SIZE], b8 + FCLR (cc03) + LDF [BO + 8 * SIZE], b9 + FCLR (cc04) + + prefetch [C1 + 2 * SIZE], 3 + FCLR (cc05) + prefetch [C2 + 2 * SIZE], 3 + FCLR (cc06) + prefetch [C3 + 2 * SIZE], 3 + FCLR (cc07) + prefetch [C4 + 2 * SIZE], 3 + FCLR (cc08) + +#ifndef TRMMKERNEL + sra K, 2, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 2, L +#else + add KK, 4, L +#endif + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL35 + nop + .align 4 + +.LL33: + FMADD (aa1, bb1, cc01, cc01) + LDF [AO + 2 * SIZE], a3 + FMADD (aa2, bb1, cc02, cc02) + LDF [AO + 3 * SIZE], a4 + + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 16 * SIZE], b1 + FMADD (aa2, bb2, cc04, cc04) + LDF [BO + 9 * SIZE], b2 + + FMADD (aa1, bb3, cc05, cc05) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + FMADD (aa2, bb3, cc06, cc06) + add L, -1, L + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD (aa3, bb5, cc01, cc01) + LDF [AO + 4 * SIZE], a1 + FMADD (aa4, bb5, cc02, cc02) + LDF [AO + 5 * SIZE], a2 + + FMADD (aa3, bb6, cc03, cc03) + LDF [BO + 12 * SIZE], b5 + FMADD (aa4, bb6, cc04, cc04) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa3, bb7, cc05, cc05) + cmp L, 0 + FMADD (aa4, bb7, cc06, cc06) + add AO, 8 * SIZE, AO + + FMADD (aa3, bb8, cc07, cc07) + LDF [BO + 14 * SIZE], b7 + FMADD (aa4, bb8, cc08, cc08) + LDF [BO + 15 * SIZE], b8 + + FMADD (aa1, bb9, cc01, cc01) + LDF [AO - 2 * SIZE], a3 + FMADD (aa2, bb9, cc02, cc02) + LDF [AO - 1 * SIZE], a4 + + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 24 * SIZE], b9 + FMADD (aa2, bb2, cc04, cc04) + LDF [BO + 17 * SIZE], b2 + + FMADD (aa1, bb3, cc05, cc05) + add BO, 16 * SIZE, BO + FMADD (aa2, bb3, cc06, cc06) + nop + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 2 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 3 * SIZE], b4 + + FMADD (aa3, bb5, cc01, cc01) + LDF [AO + 0 * SIZE], a1 + FMADD (aa4, bb5, cc02, cc02) + LDF [AO + 1 * SIZE], a2 + FMADD (aa3, bb6, cc03, cc03) + LDF [BO + 4 * SIZE], b5 + FMADD (aa4, bb6, cc04, cc04) + LDF [BO + 5 * SIZE], b6 + + FMADD (aa3, bb7, cc05, cc05) + nop + FMADD (aa4, bb7, cc06, cc06) + LDF [BO + 6 * SIZE], b7 + + FMADD (aa3, bb8, cc07, cc07) + FMADD (aa4, bb8, cc08, cc08) + bg,pt %icc, .LL33 + LDF [BO + 7 * SIZE], b8 + .align 4 + +.LL35: +#ifndef TRMMKERNEL + and K, 3, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 2, L +#else + add KK, 4, L +#endif + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL38 + nop + .align 4 + +.LL37: + + FMADD (aa1, bb1, cc01, cc01) + add L, -1, L + FMADD (aa2, bb1, cc02, 
cc02) + LDF [BO + 4 * SIZE], b1 + + FMADD (aa1, bb2, cc03, cc03) + add AO, 2 * SIZE, AO + FMADD (aa2, bb2, cc04, cc04) + LDF [BO + 5 * SIZE], b2 + + FMADD (aa1, bb3, cc05, cc05) + cmp L, 0 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 6 * SIZE], b3 + + FMADD (aa1, bb4, cc07, cc07) + LDF [AO + 0 * SIZE], a1 + FMADD (aa2, bb4, cc08, cc08) + LDF [AO + 1 * SIZE], a2 + + LDF [BO + 7 * SIZE], b4 + bg,pt %icc, .LL37 + add BO, 4 * SIZE, BO + .align 4 + +.LL38: +#ifndef TRMMKERNEL + LDF [C1 + 0 * SIZE], a1 + LDF [C1 + 1 * SIZE], a2 + LDF [C2 + 0 * SIZE], a3 + LDF [C2 + 1 * SIZE], a4 + + FMADD (alpha, cc01, aa1, cc01) + LDF [C3 + 0 * SIZE], b1 + FMADD (alpha, cc02, aa2, cc02) + LDF [C3 + 1 * SIZE], b2 + FMADD (alpha, cc03, aa3, cc03) + LDF [C4 + 0 * SIZE], b3 + FMADD (alpha, cc04, aa4, cc04) + LDF [C4 + 1 * SIZE], b4 + + FMADD (alpha, cc05, bb1, cc05) + STF c01, [C1 + 0 * SIZE] + FMADD (alpha, cc06, bb2, cc06) + STF c02, [C1 + 1 * SIZE] + FMADD (alpha, cc07, bb3, cc07) + STF c03, [C2 + 0 * SIZE] + FMADD (alpha, cc08, bb4, cc08) + STF c04, [C2 + 1 * SIZE] +#else + + FMUL ALPHA, c01, c01 + FMUL ALPHA, c02, c02 + FMUL ALPHA, c03, c03 + FMUL ALPHA, c04, c04 + + FMUL ALPHA, c05, c05 + STF c01, [C1 + 0 * SIZE] + FMUL ALPHA, c06, c06 + STF c02, [C1 + 1 * SIZE] + FMUL ALPHA, c07, c07 + STF c03, [C2 + 0 * SIZE] + FMUL ALPHA, c08, c08 + STF c04, [C2 + 1 * SIZE] +#endif + + STF c05, [C3 + 0 * SIZE] + add C1, 2 * SIZE, C1 + STF c06, [C3 + 1 * SIZE] + add C2, 2 * SIZE, C2 + STF c07, [C4 + 0 * SIZE] + add C3, 2 * SIZE, C3 + STF c08, [C4 + 1 * SIZE] + add C4, 2 * SIZE, C4 + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub K, KK, TEMP1 +#ifdef LEFT + add TEMP1, -2, TEMP1 +#else + add TEMP1, -4, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 2, TEMP1 + + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LEFT + add KK, 2, KK +#endif +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL32 + nop + +.LL40: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL49 + nop + +#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))) + mov B, BO +#else + sll KK, BASE_SHIFT + 0, TEMP1 + sll KK, BASE_SHIFT + 2, TEMP2 + + add AO, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + LDF [BO + 3 * SIZE], b4 + LDF [BO + 4 * SIZE], b5 + LDF [BO + 5 * SIZE], b6 + FCLR (cc01) + LDF [BO + 6 * SIZE], b7 + FCLR (cc03) + LDF [BO + 7 * SIZE], b8 + FCLR (cc05) + LDF [BO + 8 * SIZE], b9 + FCLR (cc07) + +#ifndef TRMMKERNEL + sra K, 2, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 1, L +#else + add KK, 4, L +#endif + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL45 + nop + +.LL43: + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + add L, -1, L + + FMADD (aa1, bb1, cc01, cc01) + LDF [BO + 16 * SIZE], b1 + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 9 * SIZE], b2 + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 10 * SIZE], b3 + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 11 * SIZE], b4 + + LDF [AO + 4 * SIZE], a1 + cmp L, 0 + + FMADD (aa2, bb5, cc01, cc01) + LDF [BO + 12 * SIZE], b5 + FMADD (aa2, bb6, cc03, cc03) + LDF [BO + 13 * SIZE], b6 + FMADD (aa2, bb7, cc05, cc05) + LDF [BO + 14 * SIZE], b7 + FMADD (aa2, bb8, cc07, cc07) + LDF 
[BO + 15 * SIZE], b8 + + LDF [AO + 5 * SIZE], a2 + add AO, 4 * SIZE, AO + + FMADD (aa3, bb9, cc01, cc01) + LDF [BO + 24 * SIZE], b9 + FMADD (aa3, bb2, cc03, cc03) + LDF [BO + 17 * SIZE], b2 + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 18 * SIZE], b3 + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 19 * SIZE], b4 + + LDF [AO + 2 * SIZE], a3 + add BO, 16 * SIZE, BO + + FMADD (aa4, bb5, cc01, cc01) + LDF [BO + 4 * SIZE], b5 + FMADD (aa4, bb6, cc03, cc03) + LDF [BO + 5 * SIZE], b6 + FMADD (aa4, bb7, cc05, cc05) + LDF [BO + 6 * SIZE], b7 + FMADD (aa4, bb8, cc07, cc07) + LDF [BO + 7 * SIZE], b8 + + bg,pt %icc, .LL43 + LDF [AO + 3 * SIZE], a4 + .align 4 + +.LL45: +#ifndef TRMMKERNEL + and K, 3, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 1, L +#else + add KK, 4, L +#endif + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL48 + nop + .align 4 + +.LL47: + FMADD (aa1, bb1, cc01, cc01) + LDF [BO + 4 * SIZE], b1 + add L, -1, L + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 5 * SIZE], b2 + add AO, 1 * SIZE, AO + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 6 * SIZE], b3 + cmp L, 0 + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 7 * SIZE], b4 + add BO, 4 * SIZE, BO + + bg,pt %icc, .LL47 + LDF [AO + 0 * SIZE], a1 + .align 4 + +.LL48: +#ifndef TRMMKERNEL + LDF [C1 + 0 * SIZE], a1 + LDF [C2 + 0 * SIZE], a2 + LDF [C3 + 0 * SIZE], a3 + LDF [C4 + 0 * SIZE], a4 + + FMADD (alpha, cc01, aa1, cc01) + FMADD (alpha, cc03, aa2, cc03) + FMADD (alpha, cc05, aa3, cc05) + FMADD (alpha, cc07, aa4, cc07) +#else + FMUL ALPHA, c01, c01 + FMUL ALPHA, c03, c03 + FMUL ALPHA, c05, c05 + FMUL ALPHA, c07, c07 +#endif + + STF c01, [C1 + 0 * SIZE] + STF c03, [C2 + 0 * SIZE] + STF c05, [C3 + 0 * SIZE] + STF c07, [C4 + 0 * SIZE] + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub K, KK, TEMP1 +#ifdef LEFT + add TEMP1, -1, TEMP1 +#else + add TEMP1, -4, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 0, TEMP2 + sll TEMP1, BASE_SHIFT + 2, TEMP1 + + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LEFT + add KK, 1, KK +#endif +#endif + .align 4 + +.LL49: +#if defined(TRMMKERNEL) && !defined(LEFT) + add KK, 4, KK +#endif + mov BO, B + .align 4 + +.LL50: + and N, 2, J + cmp J, 0 + ble,pn %icc, .LL70 + mov C, C1 + + add C, LDC, C2 + add C2, LDC, C + +#if defined(TRMMKERNEL) && defined(LEFT) + mov OFFSET, KK +#endif + + sra M, 1, I + cmp I, 0 + ble,pn %icc, .LL60 + mov A, AO + .align 4 + +.LL52: +#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))) + mov B, BO +#else + sll KK, BASE_SHIFT + 1, TEMP1 + sll KK, BASE_SHIFT + 1, TEMP2 + + add AO, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + FCLR (cc01) + LDF [BO + 3 * SIZE], b4 + FCLR (cc02) + + LDF [BO + 4 * SIZE], b5 + FCLR (cc03) + LDF [BO + 5 * SIZE], b6 + FCLR (cc04) + LDF [BO + 6 * SIZE], b7 + FCLR (cc05) + LDF [BO + 7 * SIZE], b8 + FCLR (cc06) + + prefetch [C1 + 2 * SIZE], 3 + FCLR (cc07) + prefetch [C2 + 2 * SIZE], 3 + FCLR (cc08) + +#ifndef TRMMKERNEL + sra K, 2, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 2, L +#else + add KK, 2, L +#endif + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL55 + nop + .align 4 + 
+.LL53: + FMADD (aa1, bb1, cc01, cc01) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + FMADD (aa2, bb1, cc02, cc02) + LDF [BO + 8 * SIZE], b1 + + FMADD (aa1, bb2, cc03, cc03) + LDF [AO + 4 * SIZE], a1 + FMADD (aa2, bb2, cc04, cc04) + LDF [AO + 5 * SIZE], a2 + + FMADD (aa3, bb3, cc01, cc01) + LDF [BO + 9 * SIZE], b2 + FMADD (aa4, bb3, cc02, cc02) + LDF [BO + 10 * SIZE], b3 + + FMADD (aa3, bb4, cc03, cc03) + LDF [AO + 6 * SIZE], a3 + FMADD (aa4, bb4, cc04, cc04) + LDF [AO + 7 * SIZE], a4 + + FMADD (aa1, bb5, cc01, cc01) + LDF [BO + 11 * SIZE], b4 + FMADD (aa2, bb5, cc02, cc02) + LDF [BO + 12 * SIZE], b5 + + FMADD (aa1, bb6, cc03, cc03) + LDF [AO + 8 * SIZE], a1 + FMADD (aa2, bb6, cc04, cc04) + LDF [AO + 9 * SIZE], a2 + + FMADD (aa3, bb7, cc01, cc01) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa4, bb7, cc02, cc02) + LDF [BO + 14 * SIZE], b7 + + FMADD (aa3, bb8, cc03, cc03) + LDF [AO + 10 * SIZE], a3 + FMADD (aa4, bb8, cc04, cc04) + LDF [AO + 11 * SIZE], a4 + + add AO, 8 * SIZE, AO + add L, -1, L + add BO, 8 * SIZE, BO + cmp L, 0 + + bg,pt %icc, .LL53 + LDF [BO + 7 * SIZE], b8 + .align 4 + +.LL55: +#ifndef TRMMKERNEL + and K, 3, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 2, L +#else + add KK, 2, L +#endif + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL58 + nop + .align 4 + +.LL57: + FMADD (aa1, bb1, cc01, cc01) + add L, -1, L + FMADD (aa2, bb1, cc02, cc02) + LDF [BO + 2 * SIZE], b1 + + FMADD (aa1, bb2, cc03, cc03) + LDF [AO + 2 * SIZE], a1 + FMADD (aa2, bb2, cc04, cc04) + LDF [AO + 3 * SIZE], a2 + + add AO, 2 * SIZE, AO + cmp L, 0 + add BO, 2 * SIZE, BO + bg,pt %icc, .LL57 + LDF [BO + 1 * SIZE], b2 + .align 4 + +.LL58: +#ifndef TRMMKERNEL + LDF [C1 + 0 * SIZE], a1 + LDF [C1 + 1 * SIZE], a2 + LDF [C2 + 0 * SIZE], a3 + LDF [C2 + 1 * SIZE], a4 + + FMADD (alpha, cc01, aa1, cc01) + FMADD (alpha, cc02, aa2, cc02) + FMADD (alpha, cc03, aa3, cc03) + FMADD (alpha, cc04, aa4, cc04) +#else + FMUL ALPHA, c01, c01 + FMUL ALPHA, c02, c02 + FMUL ALPHA, c03, c03 + FMUL ALPHA, c04, c04 +#endif + + STF c01, [C1 + 0 * SIZE] + add I, -1, I + STF c02, [C1 + 1 * SIZE] + add C1, 2 * SIZE, C1 + + STF c03, [C2 + 0 * SIZE] + cmp I, 0 + STF c04, [C2 + 1 * SIZE] + add C2, 2 * SIZE, C2 + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub K, KK, TEMP1 +#ifdef LEFT + add TEMP1, -2, TEMP1 +#else + add TEMP1, -2, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 1, TEMP1 + + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LEFT + add KK, 2, KK +#endif +#endif + + bg,pt %icc, .LL52 + nop + .align 4 + +.LL60: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL69 + nop + +#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))) + mov B, BO +#else + sll KK, BASE_SHIFT + 0, TEMP1 + sll KK, BASE_SHIFT + 1, TEMP2 + + add AO, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + LDF [BO + 3 * SIZE], b4 + LDF [BO + 4 * SIZE], b5 + LDF [BO + 5 * SIZE], b6 + LDF [BO + 6 * SIZE], b7 + FCLR (cc01) + LDF [BO + 7 * SIZE], b8 + FCLR (cc03) + +#ifndef TRMMKERNEL + sra K, 2, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 1, L 
+#else + add KK, 2, L +#endif + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL65 + nop + .align 4 + +.LL63: + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + add L, -1, L + + FMADD (aa1, bb1, cc01, cc01) + LDF [BO + 8 * SIZE], b1 + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 9 * SIZE], b2 + + LDF [AO + 4 * SIZE], a1 + cmp L, 0 + + FMADD (aa2, bb3, cc01, cc01) + LDF [BO + 10 * SIZE], b3 + FMADD (aa2, bb4, cc03, cc03) + LDF [BO + 11 * SIZE], b4 + + LDF [AO + 5 * SIZE], a2 + add AO, 4 * SIZE, AO + + FMADD (aa3, bb5, cc01, cc01) + LDF [BO + 12 * SIZE], b5 + FMADD (aa3, bb6, cc03, cc03) + LDF [BO + 13 * SIZE], b6 + + LDF [AO + 2 * SIZE], a3 + add BO, 8 * SIZE, BO + + FMADD (aa4, bb7, cc01, cc01) + LDF [BO + 6 * SIZE], b7 + FMADD (aa4, bb8, cc03, cc03) + LDF [BO + 7 * SIZE], b8 + + bg,pt %icc, .LL63 + LDF [AO + 3 * SIZE], a4 + .align 4 + +.LL65: +#ifndef TRMMKERNEL + and K, 3, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 1, L +#else + add KK, 2, L +#endif + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL68 + nop + .align 4 + +.LL67: + FMADD (aa1, bb1, cc01, cc01) + LDF [BO + 2 * SIZE], b1 + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 3 * SIZE], b2 + + LDF [AO + 1 * SIZE], a1 + add L, -1, L + add AO, 1 * SIZE, AO + cmp L, 0 + + bg,pt %icc, .LL67 + add BO, 2 * SIZE, BO + .align 4 + +.LL68: +#ifndef TRMMKERNEL + LDF [C1 + 0 * SIZE], a1 + LDF [C2 + 0 * SIZE], a2 + + FMADD (alpha, cc01, aa1, cc01) + FMADD (alpha, cc03, aa2, cc03) +#else + FMUL ALPHA, c01, c01 + FMUL ALPHA, c03, c03 +#endif + + STF c01, [C1 + 0 * SIZE] + STF c03, [C2 + 0 * SIZE] + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub K, KK, TEMP1 +#ifdef LEFT + add TEMP1, -1, TEMP1 +#else + add TEMP1, -2, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 0, TEMP2 + sll TEMP1, BASE_SHIFT + 1, TEMP1 + + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LEFT + add KK, 1, KK +#endif +#endif + .align 4 + +.LL69: +#if defined(TRMMKERNEL) && !defined(LEFT) + add KK, 2, KK +#endif + mov BO, B + .align 4 + +.LL70: + and N, 1, J + cmp J, 0 + ble,pn %icc, .LL999 + mov C, C1 + +#if defined(TRMMKERNEL) && defined(LEFT) + mov OFFSET, KK +#endif + + sra M, 1, I + cmp I, 0 + ble,pn %icc, .LL80 + mov A, AO + .align 4 + +.LL72: +#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))) + mov B, BO +#else + sll KK, BASE_SHIFT + 1, TEMP1 + sll KK, BASE_SHIFT + 0, TEMP2 + + add AO, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + FCLR (cc01) + LDF [BO + 3 * SIZE], b4 + FCLR (cc02) + + prefetch [C1 + 2 * SIZE], 3 + +#ifndef TRMMKERNEL + sra K, 2, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 2, L +#else + add KK, 1, L +#endif + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL75 + nop + +.LL73: + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + add L, -1, L + + FMADD (aa1, bb1, cc01, cc01) + LDF [AO + 4 * SIZE], a1 + FMADD (aa2, bb1, cc02, cc02) + LDF [AO + 5 * SIZE], a2 + + LDF [BO + 4 * SIZE], b1 + cmp L, 0 + + FMADD (aa3, bb2, cc01, cc01) + LDF [AO + 6 * SIZE], a3 + FMADD (aa4, bb2, cc02, cc02) + LDF [AO + 7 * SIZE], a4 + + LDF [BO + 5 * SIZE], b2 + 
add BO, 4 * SIZE, BO + + FMADD (aa1, bb3, cc01, cc01) + LDF [AO + 8 * SIZE], a1 + FMADD (aa2, bb3, cc02, cc02) + LDF [AO + 9 * SIZE], a2 + + LDF [BO + 2 * SIZE], b3 + add AO, 8 * SIZE, AO + + FMADD (aa3, bb4, cc01, cc01) + LDF [AO + 2 * SIZE], a3 + FMADD (aa4, bb4, cc02, cc02) + LDF [AO + 3 * SIZE], a4 + + bg,pt %icc, .LL73 + LDF [BO + 3 * SIZE], b4 + .align 4 + +.LL75: +#ifndef TRMMKERNEL + and K, 3, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 2, L +#else + add KK, 1, L +#endif + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL78 + nop + .align 4 + +.LL77: + FMADD (aa1, bb1, cc01, cc01) + LDF [AO + 2 * SIZE], a1 + FMADD (aa2, bb1, cc02, cc02) + LDF [AO + 3 * SIZE], a2 + + LDF [BO + 1 * SIZE], b1 + add L, -1, L + add AO, 2 * SIZE, AO + cmp L, 0 + bg,pt %icc, .LL77 + add BO, 1 * SIZE, BO + .align 4 + +.LL78: +#ifndef TRMMKERNEL + LDF [C1 + 0 * SIZE], a1 + LDF [C1 + 1 * SIZE], a2 + + FMADD (alpha, cc01, aa1, cc01) + FMADD (alpha, cc02, aa2, cc02) +#else + FMUL ALPHA, c01, c01 + FMUL ALPHA, c02, c02 +#endif + + STF c01, [C1 + 0 * SIZE] + add I, -1, I + STF c02, [C1 + 1 * SIZE] + cmp I, 0 + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub K, KK, TEMP1 +#ifdef LEFT + add TEMP1, -2, TEMP1 +#else + add TEMP1, -1, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 0, TEMP1 + + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LEFT + add KK, 2, KK +#endif +#endif + + bg,pt %icc, .LL72 + add C1, 2 * SIZE, C1 + .align 4 + +.LL80: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL999 + nop + +#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))) + mov B, BO +#else + sll KK, BASE_SHIFT + 0, TEMP1 + sll KK, BASE_SHIFT + 0, TEMP2 + + add AO, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [BO + 0 * SIZE], b1 + LDF [AO + 1 * SIZE], a2 + LDF [BO + 1 * SIZE], b2 + LDF [AO + 2 * SIZE], a3 + LDF [BO + 2 * SIZE], b3 + LDF [AO + 3 * SIZE], a4 + LDF [BO + 3 * SIZE], b4 + +#ifndef TRMMKERNEL + sra K, 2, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 1, L +#else + add KK, 1, L +#endif + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL85 + FCLR (cc01) + .align 4 + +.LL83: + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + add L, -1, L + + FMADD (aa1, bb1, cc01, cc01) + LDF [AO + 4 * SIZE], a1 + LDF [BO + 4 * SIZE], b1 + + FMADD (aa2, bb2, cc01, cc01) + LDF [AO + 5 * SIZE], a2 + LDF [BO + 5 * SIZE], b2 + + FMADD (aa3, bb3, cc01, cc01) + LDF [AO + 6 * SIZE], a3 + LDF [BO + 6 * SIZE], b3 + + FMADD (aa4, bb4, cc01, cc01) + LDF [AO + 7 * SIZE], a4 + LDF [BO + 7 * SIZE], b4 + + add AO, 4 * SIZE, AO + cmp L, 0 + + bg,pt %icc, .LL83 + add BO, 4 * SIZE, BO + .align 4 + +.LL85: +#ifndef TRMMKERNEL + and K, 3, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 1, L +#else + add KK, 1, L +#endif + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL88 + nop + .align 4 + +.LL87: + FMADD (aa1, bb1, cc01, cc01) + LDF [AO + 1 * SIZE], a1 + LDF [BO + 1 * SIZE], b1 + + add AO, 1 * SIZE, AO + add L, -1, L + cmp L, 0 + bg,pt %icc, .LL87 + add BO, 1 * SIZE, BO + .align 4 + +.LL88: +#ifndef TRMMKERNEL + LDF [C1 + 0 * SIZE], a1 + + FMADD (alpha, cc01, aa1, cc01) +#else + FMUL 
ALPHA, c01, c01 +#endif + + STF c01, [C1 + 0 * SIZE] + .align 4 + +.LL999: +#ifdef TRMMKERNEL +#ifndef __64BIT__ + ld [%sp + STACK_START + 8], %g1 + ld [%sp + STACK_START + 12], %g2 + ld [%sp + STACK_START + 16], %g3 + ld [%sp + STACK_START + 20], %g4 +#else + ldx [%sp + STACK_START + 32], %g1 + ldx [%sp + STACK_START + 40], %g2 + ldx [%sp + STACK_START + 48], %g3 + ldx [%sp + STACK_START + 56], %g4 +#endif +#endif + + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/gemm_ncopy.S b/kernel/sparc/gemm_ncopy.S new file mode 100644 index 0000000000..880d39cba1 --- /dev/null +++ b/kernel/sparc/gemm_ncopy.S @@ -0,0 +1,309 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %i0 +#define N %i1 +#define A %i2 +#define LDA %i3 +#define B %i4 + +#define A1 %l0 +#define A2 %l1 +#define A3 %l2 +#define A4 %l3 + +#define I %l4 +#define J %l5 + +#ifdef DOUBLE +#define c01 %f0 +#define c02 %f2 +#define c03 %f4 +#define c04 %f6 +#define c05 %f8 +#define c06 %f10 +#define c07 %f12 +#define c08 %f14 +#define c09 %f16 +#define c10 %f18 +#define c11 %f20 +#define c12 %f22 +#define c13 %f24 +#define c14 %f26 +#define c15 %f28 +#define c16 %f30 +#else +#define c01 %f0 +#define c02 %f1 +#define c03 %f2 +#define c04 %f3 +#define c05 %f4 +#define c06 %f5 +#define c07 %f6 +#define c08 %f7 +#define c09 %f8 +#define c10 %f9 +#define c11 %f10 +#define c12 %f11 +#define c13 %f12 +#define c14 %f13 +#define c15 %f14 +#define c16 %f15 +#endif + + PROLOGUE + SAVESP + + sra N, 2, J + cmp J, 0 + ble,pn %icc, .LL100 + sll LDA, BASE_SHIFT, LDA + +.LL11: + add A, LDA, A2 + mov A, A1 + add A2, LDA, A3 + sra M, 2, I + add A3, LDA, A4 + cmp I, 0 + + ble,pn %icc, .LL15 + add A4, LDA, A + +#define PREFETCHSIZE 36 +#define WPREFETCHSIZE 20 + +.LL12: + prefetch [A1 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 0 * SIZE], c01 + LDF [A2 + 0 * SIZE], c05 + LDF [A3 + 0 * SIZE], c09 + LDF [A4 + 0 * SIZE], c13 + + prefetch [A2 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 1 * SIZE], c02 + LDF [A2 + 1 * SIZE], c06 + LDF [A3 + 1 * SIZE], c10 + LDF [A4 + 1 * SIZE], c14 + + prefetch [A3 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 2 * SIZE], c03 + LDF [A2 + 2 * SIZE], c07 + LDF [A3 + 2 * SIZE], c11 + LDF [A4 + 2 * SIZE], c15 + + prefetch [A4 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 3 * SIZE], c04 + LDF [A2 + 3 * SIZE], c08 + LDF [A3 + 3 * SIZE], c12 + LDF [A4 + 3 * SIZE], c16 + + prefetch [B + (WPREFETCHSIZE + 0) * SIZE], 2 + STF c01, [B + 0 * SIZE] + add A1, 4 * SIZE, A1 + STF c05, [B + 1 * SIZE] + add A2, 4 * SIZE, A2 + STF c09, [B + 2 * SIZE] + add A3, 4 * SIZE, A3 + STF c13, [B + 3 * SIZE] + add A4, 4 * SIZE, A4 + STF c02, [B + 4 * SIZE] + add I, -1, I + STF c06, [B + 5 * SIZE] + cmp I, 0 + STF c10, [B + 6 * SIZE] + STF c14, [B + 7 * SIZE] +#ifdef DOUBLE + prefetch [B + (WPREFETCHSIZE + 8) * SIZE], 2 +#endif + STF c03, [B + 8 * SIZE] + STF c07, [B + 9 * SIZE] + STF c11, [B + 10 * SIZE] + STF c15, [B + 11 * SIZE] + STF c04, [B + 12 * SIZE] + STF c08, [B + 13 * SIZE] + STF c12, [B + 14 * SIZE] + STF c16, [B + 15 * SIZE] + bg,pt %icc, .LL12 + add B, 16 * SIZE, B + +.LL15: + and M, 3, I + cmp I, 0 + ble,pn %icc, .LL99 + nop + +.LL16: + LDF [A1 + 0 * SIZE], c01 + add A1, 1 * SIZE, A1 + LDF [A2 + 0 * SIZE], c05 + add A2, 1 * SIZE, A2 + LDF [A3 + 0 * SIZE], c09 + add A3, 1 * SIZE, A3 + LDF [A4 + 0 * SIZE], c13 + add A4, 1 * SIZE, A4 + + STF c01, [B + 0 * SIZE] + add I, -1, I + STF c05, [B + 1 * SIZE] + cmp I, 0 + STF c09, [B + 2 * SIZE] + STF c13, [B + 3 * SIZE] + bg,pt %icc, .LL16 + add B, 4 * SIZE, B + +.LL99: + add J, -1, J + cmp J, 0 + bg,pt %icc, .LL11 + nop + +.LL100: + and N, 2, J + cmp J, 0 + ble,pn %icc, .LL200 + nop + +.LL111: + sra M, 2, I + add A, LDA, A2 + cmp I, 0 + mov A, A1 + + ble,pn %icc, .LL115 + add A2, LDA, A + +.LL112: + LDF [A1 + 0 * SIZE], c01 + LDF [A2 + 0 * SIZE], c05 + LDF [A1 + 1 * SIZE], c02 + LDF [A2 + 1 * SIZE], c06 + + LDF [A1 + 2 * SIZE], c03 + LDF [A2 + 2 * SIZE], c07 + LDF [A1 + 3 * SIZE], c04 + LDF [A2 + 3 * SIZE], c08 + + STF c01, [B + 0 * SIZE] + add A1, 4 * SIZE, A1 + STF c05, [B + 1 * SIZE] + add A2, 4 * SIZE, A2 + STF c02, [B + 2 * SIZE] + add 
I, -1, I + STF c06, [B + 3 * SIZE] + cmp I, 0 + STF c03, [B + 4 * SIZE] + STF c07, [B + 5 * SIZE] + STF c04, [B + 6 * SIZE] + STF c08, [B + 7 * SIZE] + + bg,pt %icc, .LL112 + add B, 8 * SIZE, B + +.LL115: + and M, 3, I + cmp I, 0 + ble,pn %icc, .LL200 + nop + +.LL116: + LDF [A1 + 0 * SIZE], c01 + add A1, 1 * SIZE, A1 + add I, -1, I + LDF [A2 + 0 * SIZE], c05 + add A2, 1 * SIZE, A2 + cmp I, 0 + + STF c01, [B + 0 * SIZE] + STF c05, [B + 1 * SIZE] + bg,pt %icc, .LL116 + add B, 2 * SIZE, B + +.LL200: + and N, 1, J + cmp J, 0 + ble,pn %icc, .LL999 + nop + +.LL211: + sra M, 2, I + cmp I, 0 + ble,pn %icc, .LL215 + mov A, A1 + +.LL212: + LDF [A1 + 0 * SIZE], c01 + LDF [A1 + 1 * SIZE], c02 + LDF [A1 + 2 * SIZE], c03 + LDF [A1 + 3 * SIZE], c04 + + STF c01, [B + 0 * SIZE] + add I, -1, I + STF c02, [B + 1 * SIZE] + cmp I, 0 + STF c03, [B + 2 * SIZE] + add A1, 4 * SIZE, A1 + STF c04, [B + 3 * SIZE] + + bg,pt %icc, .LL212 + add B, 4 * SIZE, B + +.LL215: + and M, 3, I + cmp I, 0 + ble,pn %icc, .LL999 + nop + +.LL216: + LDF [A1 + 0 * SIZE], c01 + add A1, 1 * SIZE, A1 + add I, -1, I + cmp I, 0 + + STF c01, [B + 0 * SIZE] + bg,pt %icc, .LL216 + add B, 1 * SIZE, B + +.LL999: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/gemm_ncopy_2.S b/kernel/sparc/gemm_ncopy_2.S new file mode 100644 index 0000000000..b52e71d6a9 --- /dev/null +++ b/kernel/sparc/gemm_ncopy_2.S @@ -0,0 +1,235 @@ +/*********************************************************************/ +/* Copyright 2005-2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define PREFETCHSIZE 72 +#define WPREFETCHSIZE 20 + +#define M %i0 +#define N %i1 +#define A %i2 +#define LDA %i3 +#define B %i4 + +#define A1 %l0 +#define A2 %l1 +#define A3 %l2 +#define A4 %l3 + +#define I %l4 +#define J %l5 + +#ifdef DOUBLE +#define c01 %f0 +#define c02 %f2 +#define c03 %f4 +#define c04 %f6 +#define c05 %f8 +#define c06 %f10 +#define c07 %f12 +#define c08 %f14 +#define c09 %f16 +#define c10 %f18 +#define c11 %f20 +#define c12 %f22 +#define c13 %f24 +#define c14 %f26 +#define c15 %f28 +#define c16 %f30 +#else +#define c01 %f0 +#define c02 %f1 +#define c03 %f2 +#define c04 %f3 +#define c05 %f4 +#define c06 %f5 +#define c07 %f6 +#define c08 %f7 +#define c09 %f8 +#define c10 %f9 +#define c11 %f10 +#define c12 %f11 +#define c13 %f12 +#define c14 %f13 +#define c15 %f14 +#define c16 %f15 +#endif + + PROLOGUE + SAVESP + + sra N, 1, J + cmp J, 0 + ble,pn %icc, .LL100 + sll LDA, BASE_SHIFT, LDA + +.LL11: + add A, LDA, A2 + mov A, A1 + sra M, 3, I + cmp I, 0 + + ble,pn %icc, .LL15 + add A2, LDA, A + +.LL12: + prefetch [A1 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 0 * SIZE], c01 + LDF [A2 + 0 * SIZE], c02 + LDF [A1 + 1 * SIZE], c03 + LDF [A2 + 1 * SIZE], c04 + LDF [A1 + 2 * SIZE], c05 + LDF [A2 + 2 * SIZE], c06 + LDF [A1 + 3 * SIZE], c07 + LDF [A2 + 3 * SIZE], c08 + + prefetch [A2 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 4 * SIZE], c09 + LDF [A2 + 4 * SIZE], c10 + LDF [A1 + 5 * SIZE], c11 + LDF [A2 + 5 * SIZE], c12 + LDF [A1 + 6 * SIZE], c13 + LDF [A2 + 6 * SIZE], c14 + LDF [A1 + 7 * SIZE], c15 + LDF [A2 + 7 * SIZE], c16 + + add A1, 8 * SIZE, A1 + add I, -1, I + add A2, 8 * SIZE, A2 + cmp I, 0 + + prefetch [B + (WPREFETCHSIZE + 0) * SIZE], 2 + STF c01, [B + 0 * SIZE] + STF c02, [B + 1 * SIZE] + STF c03, [B + 2 * SIZE] + STF c04, [B + 3 * SIZE] + STF c05, [B + 4 * SIZE] + STF c06, [B + 5 * SIZE] + STF c07, [B + 6 * SIZE] + STF c08, [B + 7 * SIZE] + + prefetch [B + (WPREFETCHSIZE + 8) * SIZE], 2 + STF c09, [B + 8 * SIZE] + STF c10, [B + 9 * SIZE] + STF c11, [B + 10 * SIZE] + STF c12, [B + 11 * SIZE] + STF c13, [B + 12 * SIZE] + STF c14, [B + 13 * SIZE] + STF c15, [B + 14 * SIZE] + STF c16, [B + 15 * SIZE] + + bg,pt %icc, .LL12 + add B, 16 * SIZE, B + +.LL15: + and M, 7, I + cmp I, 0 + ble,pn %icc, .LL99 + nop + +.LL16: + LDF [A1 + 0 * SIZE], c01 + add A1, 1 * SIZE, A1 + LDF [A2 + 0 * SIZE], c02 + add A2, 1 * SIZE, A2 + + STF c01, [B + 0 * SIZE] + add I, -1, I + STF c02, [B + 1 * SIZE] + cmp I, 0 + bg,pt %icc, .LL16 + add B, 2 * SIZE, B + +.LL99: + add J, -1, J + cmp J, 0 + bg,pt %icc, .LL11 + nop + +.LL100: + and N, 1, J + cmp J, 0 + ble,pn %icc, .LL999 + nop + +.LL111: + sra M, 2, I + cmp I, 0 + ble,pn %icc, .LL115 + mov A, A1 + +.LL112: + LDF [A1 + 0 * SIZE], c01 + LDF [A1 + 1 * SIZE], c02 + LDF [A1 + 2 * SIZE], c03 + LDF [A1 + 3 * SIZE], c04 + + STF c01, [B + 0 * SIZE] + add I, -1, I + STF c02, [B + 1 * SIZE] + cmp I, 0 + STF c03, [B + 2 * SIZE] + add A1, 4 * SIZE, A1 + STF c04, [B + 3 * SIZE] + + bg,pt %icc, .LL112 + add B, 4 * SIZE, B + +.LL115: + and M, 3, I + cmp I, 0 + ble,pn %icc, .LL999 + nop + +.LL116: + LDF [A1 + 0 * SIZE], c01 + add A1, 1 * SIZE, A1 + add I, -1, I + cmp I, 0 + + STF c01, [B + 0 * SIZE] + bg,pt %icc, .LL116 + add B, 1 * SIZE, B + +.LL999: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/gemm_ncopy_8.S b/kernel/sparc/gemm_ncopy_8.S new file mode 100644 index 0000000000..f55195f483 --- /dev/null +++ 
b/kernel/sparc/gemm_ncopy_8.S @@ -0,0 +1,921 @@ +/*********************************************************************/ +/* Copyright 2005-2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define PREFETCHSIZE 42 +#define WPREFETCHSIZE 20 + +#define M %i0 +#define N %i1 +#define A %i2 +#define LDA %i3 +#define B %i4 + +#define A1 %l0 +#define A2 %l1 +#define A3 %l2 +#define A4 %l3 +#define A5 %o0 +#define A6 %o1 +#define A7 %o2 +#define A8 %o3 + +#define I %l4 +#define J %l5 + +#ifdef DOUBLE +#define c01 %f0 +#define c02 %f2 +#define c03 %f4 +#define c04 %f6 +#define c05 %f8 +#define c06 %f10 +#define c07 %f12 +#define c08 %f14 +#define c09 %f16 +#define c10 %f18 +#define c11 %f20 +#define c12 %f22 +#define c13 %f24 +#define c14 %f26 +#define c15 %f28 +#define c16 %f30 +#else +#define c01 %f0 +#define c02 %f1 +#define c03 %f2 +#define c04 %f3 +#define c05 %f4 +#define c06 %f5 +#define c07 %f6 +#define c08 %f7 +#define c09 %f8 +#define c10 %f9 +#define c11 %f10 +#define c12 %f11 +#define c13 %f12 +#define c14 %f13 +#define c15 %f14 +#define c16 %f15 +#endif + + PROLOGUE + SAVESP + + sra N, 3, J + cmp J, 0 + ble,pn %icc, .LL20 + sll LDA, BASE_SHIFT, LDA + +.LL11: + add A, LDA, A2 + mov A, A1 + add A2, LDA, A3 + sra M, 3, I + add A3, LDA, A4 + cmp I, 0 + + add A4, LDA, A5 + add A5, LDA, A6 + add A6, LDA, A7 + add A7, LDA, A8 + + ble,pn %icc, .LL13 + add A8, LDA, A + .align 4 + +.LL12: + prefetch [A1 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 0 * SIZE], c01 + LDF [A2 + 0 * SIZE], c02 + LDF [A3 + 0 * SIZE], c03 + LDF [A4 + 0 * SIZE], c04 + LDF [A5 + 0 * SIZE], c05 + LDF [A6 + 0 * SIZE], c06 + LDF [A7 + 0 * SIZE], c07 + LDF [A8 + 0 * SIZE], c08 + + prefetch [A2 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 1 * SIZE], c09 + LDF [A2 + 1 * SIZE], c10 + LDF [A3 + 1 * SIZE], c11 + LDF [A4 + 1 * SIZE], c12 + LDF [A5 + 1 * SIZE], c13 + LDF [A6 + 1 * SIZE], c14 + LDF [A7 + 1 * SIZE], c15 + LDF [A8 + 1 * SIZE], c16 + + prefetch [B + (WPREFETCHSIZE + 0) * SIZE], 2 + STF c01, [B + 0 * SIZE] + STF c02, [B + 1 * SIZE] + STF c03, [B + 2 * SIZE] + STF c04, [B + 3 * SIZE] + STF c05, [B + 4 * SIZE] + STF c06, [B + 5 * SIZE] + STF c07, [B + 6 * SIZE] + STF c08, [B + 7 * SIZE] + + prefetch [B + (WPREFETCHSIZE + 8) * SIZE], 2 + STF c09, [B + 8 * SIZE] + STF c10, [B + 9 * SIZE] + STF c11, [B + 10 * SIZE] + STF c12, [B + 11 * SIZE] + STF c13, [B + 12 * SIZE] + STF c14, [B + 13 * SIZE] + STF c15, [B + 14 * SIZE] + STF c16, [B + 15 * SIZE] + + prefetch [A3 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 2 * SIZE], c01 + LDF [A2 + 2 * SIZE], c02 + LDF [A3 + 2 * SIZE], c03 + LDF [A4 + 2 * SIZE], c04 + LDF [A5 + 2 * SIZE], c05 + LDF [A6 + 2 * SIZE], c06 + LDF [A7 + 2 * SIZE], c07 + LDF [A8 + 2 * SIZE], c08 + + prefetch [A4 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 3 * SIZE], c09 + LDF [A2 + 3 * SIZE], c10 + LDF [A3 + 3 * SIZE], c11 + LDF [A4 + 3 * SIZE], c12 + LDF [A5 + 3 * SIZE], c13 + LDF [A6 + 3 * SIZE], c14 + LDF [A7 + 3 * SIZE], c15 + LDF [A8 + 3 * SIZE], c16 + + prefetch [B + (WPREFETCHSIZE + 16) * SIZE], 2 + STF c01, [B + 16 * SIZE] + STF c02, [B + 17 * SIZE] + STF c03, [B + 18 * SIZE] + STF c04, [B + 19 * SIZE] + STF c05, [B + 20 * SIZE] + STF c06, [B + 21 * SIZE] + STF c07, [B + 22 * SIZE] + STF c08, [B + 23 * SIZE] + + prefetch [B + (WPREFETCHSIZE + 24) * SIZE], 2 + STF c09, [B + 24 * SIZE] + STF c10, [B + 25 * SIZE] + STF c11, [B + 26 * SIZE] + STF c12, [B + 27 * SIZE] + STF c13, [B + 28 * SIZE] + STF c14, [B + 29 * SIZE] + STF c15, [B + 30 * SIZE] + STF c16, [B + 31 * SIZE] + + prefetch [A5 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 4 * SIZE], c01 + LDF [A2 + 4 * SIZE], c02 + 
LDF [A3 + 4 * SIZE], c03 + LDF [A4 + 4 * SIZE], c04 + LDF [A5 + 4 * SIZE], c05 + LDF [A6 + 4 * SIZE], c06 + LDF [A7 + 4 * SIZE], c07 + LDF [A8 + 4 * SIZE], c08 + + prefetch [A6 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 5 * SIZE], c09 + LDF [A2 + 5 * SIZE], c10 + LDF [A3 + 5 * SIZE], c11 + LDF [A4 + 5 * SIZE], c12 + LDF [A5 + 5 * SIZE], c13 + LDF [A6 + 5 * SIZE], c14 + LDF [A7 + 5 * SIZE], c15 + LDF [A8 + 5 * SIZE], c16 + + prefetch [B + (WPREFETCHSIZE + 32) * SIZE], 2 + STF c01, [B + 32 * SIZE] + STF c02, [B + 33 * SIZE] + STF c03, [B + 34 * SIZE] + STF c04, [B + 35 * SIZE] + STF c05, [B + 36 * SIZE] + STF c06, [B + 37 * SIZE] + STF c07, [B + 38 * SIZE] + STF c08, [B + 39 * SIZE] + + prefetch [B + (WPREFETCHSIZE + 40) * SIZE], 2 + STF c09, [B + 40 * SIZE] + STF c10, [B + 41 * SIZE] + STF c11, [B + 42 * SIZE] + STF c12, [B + 43 * SIZE] + STF c13, [B + 44 * SIZE] + STF c14, [B + 45 * SIZE] + STF c15, [B + 46 * SIZE] + STF c16, [B + 47 * SIZE] + + prefetch [A7 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 6 * SIZE], c01 + LDF [A2 + 6 * SIZE], c02 + LDF [A3 + 6 * SIZE], c03 + LDF [A4 + 6 * SIZE], c04 + LDF [A5 + 6 * SIZE], c05 + LDF [A6 + 6 * SIZE], c06 + LDF [A7 + 6 * SIZE], c07 + LDF [A8 + 6 * SIZE], c08 + + prefetch [A8 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 7 * SIZE], c09 + LDF [A2 + 7 * SIZE], c10 + LDF [A3 + 7 * SIZE], c11 + LDF [A4 + 7 * SIZE], c12 + LDF [A5 + 7 * SIZE], c13 + LDF [A6 + 7 * SIZE], c14 + LDF [A7 + 7 * SIZE], c15 + LDF [A8 + 7 * SIZE], c16 + + add A1, 8 * SIZE, A1 + add A2, 8 * SIZE, A2 + add A3, 8 * SIZE, A3 + add A4, 8 * SIZE, A4 + + prefetch [B + (WPREFETCHSIZE + 48) * SIZE], 2 + STF c01, [B + 48 * SIZE] + STF c02, [B + 49 * SIZE] + STF c03, [B + 50 * SIZE] + STF c04, [B + 51 * SIZE] + STF c05, [B + 52 * SIZE] + STF c06, [B + 53 * SIZE] + STF c07, [B + 54 * SIZE] + STF c08, [B + 55 * SIZE] + + add A5, 8 * SIZE, A5 + add A6, 8 * SIZE, A6 + add A7, 8 * SIZE, A7 + add A8, 8 * SIZE, A8 + + prefetch [B + (WPREFETCHSIZE + 56) * SIZE], 2 + STF c09, [B + 56 * SIZE] + STF c10, [B + 57 * SIZE] + STF c11, [B + 58 * SIZE] + STF c12, [B + 59 * SIZE] + STF c13, [B + 60 * SIZE] + STF c14, [B + 61 * SIZE] + STF c15, [B + 62 * SIZE] + STF c16, [B + 63 * SIZE] + + add I, -1, I + cmp I, 0 + + bg,pt %icc, .LL12 + add B, 64 * SIZE, B + .align 4 + +.LL13: + and M, 4, I + cmp I, 0 + ble,pn %icc, .LL14 + nop + + LDF [A1 + 0 * SIZE], c01 + LDF [A2 + 0 * SIZE], c02 + LDF [A3 + 0 * SIZE], c03 + LDF [A4 + 0 * SIZE], c04 + LDF [A5 + 0 * SIZE], c05 + LDF [A6 + 0 * SIZE], c06 + LDF [A7 + 0 * SIZE], c07 + LDF [A8 + 0 * SIZE], c08 + + LDF [A1 + 1 * SIZE], c09 + LDF [A2 + 1 * SIZE], c10 + LDF [A3 + 1 * SIZE], c11 + LDF [A4 + 1 * SIZE], c12 + LDF [A5 + 1 * SIZE], c13 + LDF [A6 + 1 * SIZE], c14 + LDF [A7 + 1 * SIZE], c15 + LDF [A8 + 1 * SIZE], c16 + + STF c01, [B + 0 * SIZE] + STF c02, [B + 1 * SIZE] + STF c03, [B + 2 * SIZE] + STF c04, [B + 3 * SIZE] + STF c05, [B + 4 * SIZE] + STF c06, [B + 5 * SIZE] + STF c07, [B + 6 * SIZE] + STF c08, [B + 7 * SIZE] + + STF c09, [B + 8 * SIZE] + STF c10, [B + 9 * SIZE] + STF c11, [B + 10 * SIZE] + STF c12, [B + 11 * SIZE] + STF c13, [B + 12 * SIZE] + STF c14, [B + 13 * SIZE] + STF c15, [B + 14 * SIZE] + STF c16, [B + 15 * SIZE] + + LDF [A1 + 2 * SIZE], c01 + LDF [A2 + 2 * SIZE], c02 + LDF [A3 + 2 * SIZE], c03 + LDF [A4 + 2 * SIZE], c04 + LDF [A5 + 2 * SIZE], c05 + LDF [A6 + 2 * SIZE], c06 + LDF [A7 + 2 * SIZE], c07 + LDF [A8 + 2 * SIZE], c08 + + LDF [A1 + 3 * SIZE], c09 + LDF [A2 + 3 * SIZE], c10 + LDF [A3 + 3 * SIZE], c11 + LDF [A4 + 3 * SIZE], c12 + LDF [A5 + 
3 * SIZE], c13 + LDF [A6 + 3 * SIZE], c14 + LDF [A7 + 3 * SIZE], c15 + LDF [A8 + 3 * SIZE], c16 + + STF c01, [B + 16 * SIZE] + STF c02, [B + 17 * SIZE] + STF c03, [B + 18 * SIZE] + STF c04, [B + 19 * SIZE] + STF c05, [B + 20 * SIZE] + STF c06, [B + 21 * SIZE] + STF c07, [B + 22 * SIZE] + STF c08, [B + 23 * SIZE] + + STF c09, [B + 24 * SIZE] + STF c10, [B + 25 * SIZE] + STF c11, [B + 26 * SIZE] + STF c12, [B + 27 * SIZE] + STF c13, [B + 28 * SIZE] + STF c14, [B + 29 * SIZE] + STF c15, [B + 30 * SIZE] + STF c16, [B + 31 * SIZE] + + add A1, 4 * SIZE, A1 + add A2, 4 * SIZE, A2 + add A3, 4 * SIZE, A3 + add A4, 4 * SIZE, A4 + + add A5, 4 * SIZE, A5 + add A6, 4 * SIZE, A6 + add A7, 4 * SIZE, A7 + add A8, 4 * SIZE, A8 + + add B, 32 * SIZE, B + .align 4 + +.LL14: + and M, 2, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + + LDF [A1 + 0 * SIZE], c01 + LDF [A2 + 0 * SIZE], c02 + LDF [A3 + 0 * SIZE], c03 + LDF [A4 + 0 * SIZE], c04 + LDF [A5 + 0 * SIZE], c05 + LDF [A6 + 0 * SIZE], c06 + LDF [A7 + 0 * SIZE], c07 + LDF [A8 + 0 * SIZE], c08 + + LDF [A1 + 1 * SIZE], c09 + LDF [A2 + 1 * SIZE], c10 + LDF [A3 + 1 * SIZE], c11 + LDF [A4 + 1 * SIZE], c12 + LDF [A5 + 1 * SIZE], c13 + LDF [A6 + 1 * SIZE], c14 + LDF [A7 + 1 * SIZE], c15 + LDF [A8 + 1 * SIZE], c16 + + STF c01, [B + 0 * SIZE] + STF c02, [B + 1 * SIZE] + STF c03, [B + 2 * SIZE] + STF c04, [B + 3 * SIZE] + STF c05, [B + 4 * SIZE] + STF c06, [B + 5 * SIZE] + STF c07, [B + 6 * SIZE] + STF c08, [B + 7 * SIZE] + + STF c09, [B + 8 * SIZE] + STF c10, [B + 9 * SIZE] + STF c11, [B + 10 * SIZE] + STF c12, [B + 11 * SIZE] + STF c13, [B + 12 * SIZE] + STF c14, [B + 13 * SIZE] + STF c15, [B + 14 * SIZE] + STF c16, [B + 15 * SIZE] + + add A1, 2 * SIZE, A1 + add A2, 2 * SIZE, A2 + add A3, 2 * SIZE, A3 + add A4, 2 * SIZE, A4 + + add A5, 2 * SIZE, A5 + add A6, 2 * SIZE, A6 + add A7, 2 * SIZE, A7 + add A8, 2 * SIZE, A8 + + add B, 16 * SIZE, B + .align 4 + +.LL15: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL19 + nop + + LDF [A1 + 0 * SIZE], c01 + LDF [A2 + 0 * SIZE], c02 + LDF [A3 + 0 * SIZE], c03 + LDF [A4 + 0 * SIZE], c04 + LDF [A5 + 0 * SIZE], c05 + LDF [A6 + 0 * SIZE], c06 + LDF [A7 + 0 * SIZE], c07 + LDF [A8 + 0 * SIZE], c08 + + STF c01, [B + 0 * SIZE] + STF c02, [B + 1 * SIZE] + STF c03, [B + 2 * SIZE] + STF c04, [B + 3 * SIZE] + STF c05, [B + 4 * SIZE] + STF c06, [B + 5 * SIZE] + STF c07, [B + 6 * SIZE] + STF c08, [B + 7 * SIZE] + + add B, 8 * SIZE, B + .align 4 + +.LL19: + add J, -1, J + cmp J, 0 + bg,pt %icc, .LL11 + nop + .align 4 + +.LL20: + and N, 4, J + cmp J, 0 + ble,pn %icc, .LL30 + nop + + add A, LDA, A2 + mov A, A1 + add A2, LDA, A3 + sra M, 3, I + add A3, LDA, A4 + cmp I, 0 + + ble,pn %icc, .LL23 + add A4, LDA, A + .align 4 + +.LL22: + prefetch [A1 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 0 * SIZE], c01 + LDF [A2 + 0 * SIZE], c02 + LDF [A3 + 0 * SIZE], c03 + LDF [A4 + 0 * SIZE], c04 + LDF [A1 + 1 * SIZE], c05 + LDF [A2 + 1 * SIZE], c06 + LDF [A3 + 1 * SIZE], c07 + LDF [A4 + 1 * SIZE], c08 + + prefetch [A2 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 2 * SIZE], c09 + LDF [A2 + 2 * SIZE], c10 + LDF [A3 + 2 * SIZE], c11 + LDF [A4 + 2 * SIZE], c12 + LDF [A1 + 3 * SIZE], c13 + LDF [A2 + 3 * SIZE], c14 + LDF [A3 + 3 * SIZE], c15 + LDF [A4 + 3 * SIZE], c16 + + prefetch [B + (WPREFETCHSIZE + 0) * SIZE], 2 + STF c01, [B + 0 * SIZE] + STF c02, [B + 1 * SIZE] + STF c03, [B + 2 * SIZE] + STF c04, [B + 3 * SIZE] + STF c05, [B + 4 * SIZE] + STF c06, [B + 5 * SIZE] + STF c07, [B + 6 * SIZE] + STF c08, [B + 7 * SIZE] + + prefetch [B + (WPREFETCHSIZE + 8) * SIZE], 2 + STF 
c09, [B + 8 * SIZE] + STF c10, [B + 9 * SIZE] + STF c11, [B + 10 * SIZE] + STF c12, [B + 11 * SIZE] + STF c13, [B + 12 * SIZE] + STF c14, [B + 13 * SIZE] + STF c15, [B + 14 * SIZE] + STF c16, [B + 15 * SIZE] + + prefetch [A3 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 4 * SIZE], c01 + LDF [A2 + 4 * SIZE], c02 + LDF [A3 + 4 * SIZE], c03 + LDF [A4 + 4 * SIZE], c04 + LDF [A1 + 5 * SIZE], c05 + LDF [A2 + 5 * SIZE], c06 + LDF [A3 + 5 * SIZE], c07 + LDF [A4 + 5 * SIZE], c08 + + prefetch [A4 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 6 * SIZE], c09 + LDF [A2 + 6 * SIZE], c10 + LDF [A3 + 6 * SIZE], c11 + LDF [A4 + 6 * SIZE], c12 + LDF [A1 + 7 * SIZE], c13 + LDF [A2 + 7 * SIZE], c14 + LDF [A3 + 7 * SIZE], c15 + LDF [A4 + 7 * SIZE], c16 + + prefetch [B + (WPREFETCHSIZE + 16) * SIZE], 2 + STF c01, [B + 16 * SIZE] + STF c02, [B + 17 * SIZE] + STF c03, [B + 18 * SIZE] + STF c04, [B + 19 * SIZE] + STF c05, [B + 20 * SIZE] + STF c06, [B + 21 * SIZE] + STF c07, [B + 22 * SIZE] + STF c08, [B + 23 * SIZE] + + prefetch [B + (WPREFETCHSIZE + 24) * SIZE], 2 + STF c09, [B + 24 * SIZE] + STF c10, [B + 25 * SIZE] + STF c11, [B + 26 * SIZE] + STF c12, [B + 27 * SIZE] + STF c13, [B + 28 * SIZE] + STF c14, [B + 29 * SIZE] + STF c15, [B + 30 * SIZE] + STF c16, [B + 31 * SIZE] + + add A1, 8 * SIZE, A1 + add A2, 8 * SIZE, A2 + add A3, 8 * SIZE, A3 + add A4, 8 * SIZE, A4 + + add I, -1, I + cmp I, 0 + + bg,pt %icc, .LL22 + add B, 32 * SIZE, B + .align 4 + +.LL23: + and M, 4, I + cmp I, 0 + ble,pn %icc, .LL24 + nop + + LDF [A1 + 0 * SIZE], c01 + LDF [A2 + 0 * SIZE], c02 + LDF [A3 + 0 * SIZE], c03 + LDF [A4 + 0 * SIZE], c04 + LDF [A1 + 1 * SIZE], c05 + LDF [A2 + 1 * SIZE], c06 + LDF [A3 + 1 * SIZE], c07 + LDF [A4 + 1 * SIZE], c08 + + LDF [A1 + 2 * SIZE], c09 + LDF [A2 + 2 * SIZE], c10 + LDF [A3 + 2 * SIZE], c11 + LDF [A4 + 2 * SIZE], c12 + LDF [A1 + 3 * SIZE], c13 + LDF [A2 + 3 * SIZE], c14 + LDF [A3 + 3 * SIZE], c15 + LDF [A4 + 3 * SIZE], c16 + + STF c01, [B + 0 * SIZE] + STF c02, [B + 1 * SIZE] + STF c03, [B + 2 * SIZE] + STF c04, [B + 3 * SIZE] + STF c05, [B + 4 * SIZE] + STF c06, [B + 5 * SIZE] + STF c07, [B + 6 * SIZE] + STF c08, [B + 7 * SIZE] + + STF c09, [B + 8 * SIZE] + STF c10, [B + 9 * SIZE] + STF c11, [B + 10 * SIZE] + STF c12, [B + 11 * SIZE] + STF c13, [B + 12 * SIZE] + STF c14, [B + 13 * SIZE] + STF c15, [B + 14 * SIZE] + STF c16, [B + 15 * SIZE] + + add A1, 4 * SIZE, A1 + add A2, 4 * SIZE, A2 + add A3, 4 * SIZE, A3 + add A4, 4 * SIZE, A4 + + add B, 16 * SIZE, B + .align 4 + +.LL24: + and M, 2, I + cmp I, 0 + ble,pn %icc, .LL25 + nop + + LDF [A1 + 0 * SIZE], c01 + LDF [A2 + 0 * SIZE], c02 + LDF [A3 + 0 * SIZE], c03 + LDF [A4 + 0 * SIZE], c04 + LDF [A1 + 1 * SIZE], c05 + LDF [A2 + 1 * SIZE], c06 + LDF [A3 + 1 * SIZE], c07 + LDF [A4 + 1 * SIZE], c08 + + STF c01, [B + 0 * SIZE] + STF c02, [B + 1 * SIZE] + STF c03, [B + 2 * SIZE] + STF c04, [B + 3 * SIZE] + STF c05, [B + 4 * SIZE] + STF c06, [B + 5 * SIZE] + STF c07, [B + 6 * SIZE] + STF c08, [B + 7 * SIZE] + + add A1, 2 * SIZE, A1 + add A2, 2 * SIZE, A2 + add A3, 2 * SIZE, A3 + add A4, 2 * SIZE, A4 + + add B, 8 * SIZE, B + .align 4 + +.LL25: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL30 + nop + + LDF [A1 + 0 * SIZE], c01 + LDF [A2 + 0 * SIZE], c02 + LDF [A3 + 0 * SIZE], c03 + LDF [A4 + 0 * SIZE], c04 + + STF c01, [B + 0 * SIZE] + STF c02, [B + 1 * SIZE] + STF c03, [B + 2 * SIZE] + STF c04, [B + 3 * SIZE] + + add B, 4 * SIZE, B + .align 4 + +.LL30: + and N, 2, J + cmp J, 0 + ble,pn %icc, .LL40 + nop + + add A, LDA, A2 + mov A, A1 + sra M, 3, I + cmp I, 0 + + 
ble,pn %icc, .LL33 + add A2, LDA, A + .align 4 + +.LL32: + prefetch [A1 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 0 * SIZE], c01 + LDF [A2 + 0 * SIZE], c02 + LDF [A1 + 1 * SIZE], c03 + LDF [A2 + 1 * SIZE], c04 + LDF [A1 + 2 * SIZE], c05 + LDF [A2 + 2 * SIZE], c06 + LDF [A1 + 3 * SIZE], c07 + LDF [A2 + 3 * SIZE], c08 + + prefetch [A2 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 4 * SIZE], c09 + LDF [A2 + 4 * SIZE], c10 + LDF [A1 + 5 * SIZE], c11 + LDF [A2 + 5 * SIZE], c12 + LDF [A1 + 6 * SIZE], c13 + LDF [A2 + 6 * SIZE], c14 + LDF [A1 + 7 * SIZE], c15 + LDF [A2 + 7 * SIZE], c16 + + prefetch [B + (WPREFETCHSIZE + 0) * SIZE], 2 + STF c01, [B + 0 * SIZE] + STF c02, [B + 1 * SIZE] + STF c03, [B + 2 * SIZE] + STF c04, [B + 3 * SIZE] + STF c05, [B + 4 * SIZE] + STF c06, [B + 5 * SIZE] + STF c07, [B + 6 * SIZE] + STF c08, [B + 7 * SIZE] + + prefetch [B + (WPREFETCHSIZE + 8) * SIZE], 2 + STF c09, [B + 8 * SIZE] + STF c10, [B + 9 * SIZE] + STF c11, [B + 10 * SIZE] + STF c12, [B + 11 * SIZE] + STF c13, [B + 12 * SIZE] + STF c14, [B + 13 * SIZE] + STF c15, [B + 14 * SIZE] + STF c16, [B + 15 * SIZE] + + add A1, 8 * SIZE, A1 + add A2, 8 * SIZE, A2 + + add I, -1, I + cmp I, 0 + + bg,pt %icc, .LL32 + add B, 16 * SIZE, B + .align 4 + +.LL33: + and M, 4, I + cmp I, 0 + ble,pn %icc, .LL34 + nop + + LDF [A1 + 0 * SIZE], c01 + LDF [A2 + 0 * SIZE], c02 + LDF [A1 + 1 * SIZE], c03 + LDF [A2 + 1 * SIZE], c04 + LDF [A1 + 2 * SIZE], c05 + LDF [A2 + 2 * SIZE], c06 + LDF [A1 + 3 * SIZE], c07 + LDF [A2 + 3 * SIZE], c08 + + STF c01, [B + 0 * SIZE] + STF c02, [B + 1 * SIZE] + STF c03, [B + 2 * SIZE] + STF c04, [B + 3 * SIZE] + STF c05, [B + 4 * SIZE] + STF c06, [B + 5 * SIZE] + STF c07, [B + 6 * SIZE] + STF c08, [B + 7 * SIZE] + + add A1, 4 * SIZE, A1 + add A2, 4 * SIZE, A2 + + add B, 8 * SIZE, B + .align 4 + +.LL34: + and M, 2, I + cmp I, 0 + ble,pn %icc, .LL35 + nop + + LDF [A1 + 0 * SIZE], c01 + LDF [A2 + 0 * SIZE], c02 + LDF [A1 + 1 * SIZE], c03 + LDF [A2 + 1 * SIZE], c04 + + STF c01, [B + 0 * SIZE] + STF c02, [B + 1 * SIZE] + STF c03, [B + 2 * SIZE] + STF c04, [B + 3 * SIZE] + + add A1, 2 * SIZE, A1 + add A2, 2 * SIZE, A2 + + add B, 4 * SIZE, B + .align 4 + +.LL35: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL40 + nop + + LDF [A1 + 0 * SIZE], c01 + LDF [A2 + 0 * SIZE], c02 + + STF c01, [B + 0 * SIZE] + STF c02, [B + 1 * SIZE] + + add B, 2 * SIZE, B + .align 4 + +.LL40: + and N, 1, J + cmp J, 0 + ble,pn %icc, .LL999 + nop + + sra M, 3, I + cmp I, 0 + + ble,pn %icc, .LL43 + mov A, A1 + .align 4 + +.LL42: + prefetch [A1 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 0 * SIZE], c01 + LDF [A1 + 1 * SIZE], c02 + LDF [A1 + 2 * SIZE], c03 + LDF [A1 + 3 * SIZE], c04 + LDF [A1 + 4 * SIZE], c05 + LDF [A1 + 5 * SIZE], c06 + LDF [A1 + 6 * SIZE], c07 + LDF [A1 + 7 * SIZE], c08 + + prefetch [B + (WPREFETCHSIZE + 0) * SIZE], 2 + STF c01, [B + 0 * SIZE] + STF c02, [B + 1 * SIZE] + STF c03, [B + 2 * SIZE] + STF c04, [B + 3 * SIZE] + STF c05, [B + 4 * SIZE] + STF c06, [B + 5 * SIZE] + STF c07, [B + 6 * SIZE] + STF c08, [B + 7 * SIZE] + + add A1, 8 * SIZE, A1 + + add I, -1, I + cmp I, 0 + + bg,pt %icc, .LL42 + add B, 8 * SIZE, B + .align 4 + +.LL43: + and M, 4, I + cmp I, 0 + ble,pn %icc, .LL44 + nop + + LDF [A1 + 0 * SIZE], c01 + LDF [A1 + 1 * SIZE], c02 + LDF [A1 + 2 * SIZE], c03 + LDF [A1 + 3 * SIZE], c04 + + STF c01, [B + 0 * SIZE] + STF c02, [B + 1 * SIZE] + STF c03, [B + 2 * SIZE] + STF c04, [B + 3 * SIZE] + + add A1, 4 * SIZE, A1 + + add B, 4 * SIZE, B + .align 4 + +.LL44: + and M, 2, I + cmp I, 0 + ble,pn %icc, .LL45 + nop + + LDF 
[A1 + 0 * SIZE], c01 + LDF [A1 + 1 * SIZE], c02 + + STF c01, [B + 0 * SIZE] + STF c02, [B + 1 * SIZE] + + add A1, 2 * SIZE, A1 + + add B, 2 * SIZE, B + .align 4 + +.LL45: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL999 + nop + + LDF [A1 + 0 * SIZE], c01 + STF c01, [B + 0 * SIZE] + .align 4 + +.LL999: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/gemm_tcopy.S b/kernel/sparc/gemm_tcopy.S new file mode 100644 index 0000000000..9838a5336b --- /dev/null +++ b/kernel/sparc/gemm_tcopy.S @@ -0,0 +1,376 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %i0 +#define N %i1 +#define A %i2 +#define LDA %i3 +#define B %i4 + +#define A1 %l0 +#define A2 %l1 +#define A3 %l2 +#define A4 %l3 + +#define I %l4 +#define J %l5 + +#define B1 %o0 +#define B2 %o1 +#define B3 %o3 +#define M4 %o4 + +#ifdef DOUBLE +#define c01 %f0 +#define c02 %f2 +#define c03 %f4 +#define c04 %f6 +#define c05 %f8 +#define c06 %f10 +#define c07 %f12 +#define c08 %f14 +#define c09 %f16 +#define c10 %f18 +#define c11 %f20 +#define c12 %f22 +#define c13 %f24 +#define c14 %f26 +#define c15 %f28 +#define c16 %f30 +#else +#define c01 %f0 +#define c02 %f1 +#define c03 %f2 +#define c04 %f3 +#define c05 %f4 +#define c06 %f5 +#define c07 %f6 +#define c08 %f7 +#define c09 %f8 +#define c10 %f9 +#define c11 %f10 +#define c12 %f11 +#define c13 %f12 +#define c14 %f13 +#define c15 %f14 +#define c16 %f15 +#endif + + PROLOGUE + SAVESP + + sll M, BASE_SHIFT + 2, M4 + + and N, -4, B2 + and N, -2, B3 + sll M, BASE_SHIFT, B1 + smul B1, B2, B2 + smul B1, B3, B3 + add B, B2, B2 + add B, B3, B3 + + sra M, 2, J + cmp J, 0 + ble,pn %icc, .LL100 + sll LDA, BASE_SHIFT, LDA + +.LL11: + add A, LDA, A2 + mov A, A1 + add A2, LDA, A3 + sra N, 2, I + add A3, LDA, A4 + cmp I, 0 + + mov B, B1 + add B, 16 * SIZE, B + + ble,pn %icc, .LL15 + add A4, LDA, A + +#define PREFETCHSIZE 8 + +.LL12: + prefetch [A1 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 0 * SIZE], c01 + LDF [A1 + 1 * SIZE], c02 + LDF [A1 + 2 * SIZE], c03 + LDF [A1 + 3 * SIZE], c04 + + prefetch [A2 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A2 + 0 * SIZE], c05 + LDF [A2 + 1 * SIZE], c06 + LDF [A2 + 2 * SIZE], c07 + LDF [A2 + 3 * SIZE], c08 + + prefetch [A3 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A3 + 0 * SIZE], c09 + LDF [A3 + 1 * SIZE], c10 + LDF [A3 + 2 * SIZE], c11 + LDF [A3 + 3 * SIZE], c12 + + prefetch [A4 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A4 + 0 * SIZE], c13 + LDF [A4 + 1 * SIZE], c14 + LDF [A4 + 2 * SIZE], c15 + LDF [A4 + 3 * SIZE], c16 + + prefetch [B1 + (PREFETCHSIZE + 0) * SIZE], 0 + STF c01, [B1 + 0 * SIZE] + add A1, 4 * SIZE, A1 + STF c02, [B1 + 1 * SIZE] + add A2, 4 * SIZE, A2 + STF c03, [B1 + 2 * SIZE] + add A3, 4 * SIZE, A3 + STF c04, [B1 + 3 * SIZE] + add A4, 4 * SIZE, A4 + STF c05, [B1 + 4 * SIZE] + add I, -1, I + STF c06, [B1 + 5 * SIZE] + cmp I, 0 + STF c07, [B1 + 6 * SIZE] + STF c08, [B1 + 7 * SIZE] + +#ifdef DOUBLE + prefetch [B1 + (PREFETCHSIZE + 8) * SIZE], 0 +#endif + STF c09, [B1 + 8 * SIZE] + STF c10, [B1 + 9 * SIZE] + STF c11, [B1 + 10 * SIZE] + STF c12, [B1 + 11 * SIZE] + STF c13, [B1 + 12 * SIZE] + STF c14, [B1 + 13 * SIZE] + STF c15, [B1 + 14 * SIZE] + STF c16, [B1 + 15 * SIZE] + bg,pt %icc, .LL12 + add B1, M4, B1 + +.LL15: + and N, 2, I + cmp I, 0 + ble,pn %icc, .LL17 + nop + + LDF [A1 + 0 * SIZE], c01 + LDF [A1 + 1 * SIZE], c02 + LDF [A2 + 0 * SIZE], c03 + LDF [A2 + 1 * SIZE], c04 + + LDF [A3 + 0 * SIZE], c05 + LDF [A3 + 1 * SIZE], c06 + LDF [A4 + 0 * SIZE], c07 + LDF [A4 + 1 * SIZE], c08 + + STF c01, [B2 + 0 * SIZE] + add A1, 2 * SIZE, A1 + STF c02, [B2 + 1 * SIZE] + add A2, 2 * SIZE, A2 + STF c03, [B2 + 2 * SIZE] + add A3, 2 * SIZE, A3 + STF c04, [B2 + 3 * SIZE] + add A4, 2 * SIZE, A4 + STF c05, [B2 + 4 * SIZE] + STF c06, [B2 + 5 * SIZE] + STF c07, [B2 + 6 * SIZE] + STF c08, [B2 + 7 * SIZE] + add B2, 8 * SIZE, B2 + +.LL17: + and N, 1, I + cmp I, 0 + ble,pn %icc, .LL99 + nop + + LDF [A1 + 0 * SIZE], c01 + LDF [A2 + 0 * SIZE], c02 + LDF [A3 + 0 * SIZE], c03 + LDF [A4 + 0 * SIZE], c04 + + STF 
c01, [B3 + 0 * SIZE] + STF c02, [B3 + 1 * SIZE] + STF c03, [B3 + 2 * SIZE] + STF c04, [B3 + 3 * SIZE] + add B3, 4 * SIZE, B3 + +.LL99: + add J, -1, J + cmp J, 0 + bg,pt %icc, .LL11 + nop + +.LL100: + and M, 2, J + cmp J, 0 + ble,pn %icc, .LL200 + nop + +.LL111: + sra N, 2, I + add A, LDA, A2 + cmp I, 0 + mov A, A1 + + mov B, B1 + add B, 8 * SIZE, B + + ble,pn %icc, .LL115 + add A2, LDA, A + +.LL112: + LDF [A1 + 0 * SIZE], c01 + LDF [A1 + 1 * SIZE], c02 + LDF [A1 + 2 * SIZE], c03 + LDF [A1 + 3 * SIZE], c04 + + LDF [A2 + 0 * SIZE], c05 + LDF [A2 + 1 * SIZE], c06 + LDF [A2 + 2 * SIZE], c07 + LDF [A2 + 3 * SIZE], c08 + + STF c01, [B1 + 0 * SIZE] + add A1, 4 * SIZE, A1 + STF c02, [B1 + 1 * SIZE] + add A2, 4 * SIZE, A2 + STF c03, [B1 + 2 * SIZE] + add I, -1, I + STF c04, [B1 + 3 * SIZE] + cmp I, 0 + STF c05, [B1 + 4 * SIZE] + STF c06, [B1 + 5 * SIZE] + STF c07, [B1 + 6 * SIZE] + STF c08, [B1 + 7 * SIZE] + + bg,pt %icc, .LL112 + add B1, M4, B1 + +.LL115: + and N, 2, I + cmp I, 0 + ble,pn %icc, .LL117 + nop + + LDF [A1 + 0 * SIZE], c01 + LDF [A1 + 1 * SIZE], c02 + LDF [A2 + 0 * SIZE], c03 + LDF [A2 + 1 * SIZE], c04 + + STF c01, [B2 + 0 * SIZE] + add A1, 2 * SIZE, A1 + STF c02, [B2 + 1 * SIZE] + add A2, 2 * SIZE, A2 + STF c03, [B2 + 2 * SIZE] + add I, -1, I + STF c04, [B2 + 3 * SIZE] + cmp I, 0 + add B2, 4 * SIZE, B2 + +.LL117: + and N, 1, I + cmp I, 0 + ble,pn %icc, .LL200 + nop + + LDF [A1 + 0 * SIZE], c01 + LDF [A2 + 0 * SIZE], c02 + + STF c01, [B3 + 0 * SIZE] + STF c02, [B3 + 1 * SIZE] + add B3, 2 * SIZE, B3 + +.LL200: + and M, 1, J + cmp J, 0 + ble,pn %icc, .LL999 + nop + +.LL211: + sra N, 2, I + cmp I, 0 + + mov B, B1 + + ble,pn %icc, .LL215 + mov A, A1 + +.LL212: + LDF [A1 + 0 * SIZE], c01 + LDF [A1 + 1 * SIZE], c02 + LDF [A1 + 2 * SIZE], c03 + LDF [A1 + 3 * SIZE], c04 + + STF c01, [B + 0 * SIZE] + add I, -1, I + STF c02, [B + 1 * SIZE] + cmp I, 0 + STF c03, [B + 2 * SIZE] + add A1, 4 * SIZE, A1 + STF c04, [B + 3 * SIZE] + + bg,pt %icc, .LL212 + add B, M4, B + +.LL215: + and N, 2, I + cmp I, 0 + ble,pn %icc, .LL217 + nop + + LDF [A1 + 0 * SIZE], c01 + LDF [A1 + 1 * SIZE], c02 + + STF c01, [B2 + 0 * SIZE] + STF c02, [B2 + 1 * SIZE] + add A1, 2 * SIZE, A1 + +.LL217: + and N, 1, I + cmp I, 0 + ble,pn %icc, .LL999 + nop + + LDF [A1 + 0 * SIZE], c01 + STF c01, [B3 + 0 * SIZE] + +.LL999: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/gemm_tcopy_2.S b/kernel/sparc/gemm_tcopy_2.S new file mode 100644 index 0000000000..aed95f93d2 --- /dev/null +++ b/kernel/sparc/gemm_tcopy_2.S @@ -0,0 +1,298 @@ +/*********************************************************************/ +/* Copyright 2005-2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define PREFETCHSIZE 72 +#define WPREFETCHSIZE 16 + +#define M %i0 +#define N %i1 +#define A %i2 +#define LDA %i3 +#define B %i4 + +#define A1 %l0 +#define A2 %l1 +#define A3 %l2 +#define A4 %l3 + +#define I %l4 +#define J %l5 + +#define B1 %o0 +#define B2 %o1 +#define B3 %o3 +#define M2 %o4 + +#ifdef DOUBLE +#define c01 %f0 +#define c02 %f2 +#define c03 %f4 +#define c04 %f6 +#define c05 %f8 +#define c06 %f10 +#define c07 %f12 +#define c08 %f14 +#define c09 %f16 +#define c10 %f18 +#define c11 %f20 +#define c12 %f22 +#define c13 %f24 +#define c14 %f26 +#define c15 %f28 +#define c16 %f30 +#else +#define c01 %f0 +#define c02 %f1 +#define c03 %f2 +#define c04 %f3 +#define c05 %f4 +#define c06 %f5 +#define c07 %f6 +#define c08 %f7 +#define c09 %f8 +#define c10 %f9 +#define c11 %f10 +#define c12 %f11 +#define c13 %f12 +#define c14 %f13 +#define c15 %f14 +#define c16 %f15 +#endif + + PROLOGUE + SAVESP + + sll M, BASE_SHIFT + 1, M2 + + and N, -2, B2 + sll M, BASE_SHIFT, B1 + smul B1, B2, B2 + add B, B2, B2 + + sra M, 1, J + cmp J, 0 + ble,pn %icc, .LL100 + sll LDA, BASE_SHIFT, LDA + +.LL11: + add A, LDA, A2 + mov A, A1 + sra N, 3, I + cmp I, 0 + + mov B, B1 + add B, 4 * SIZE, B + + ble,pn %icc, .LL13 + add A2, LDA, A + .align 4 + +.LL12: + prefetch [A1 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 0 * SIZE], c01 + LDF [A1 + 1 * SIZE], c02 + LDF [A2 + 0 * SIZE], c09 + LDF [A2 + 1 * SIZE], c10 + LDF [A1 + 2 * SIZE], c03 + LDF [A1 + 3 * SIZE], c04 + LDF [A2 + 2 * SIZE], c11 + LDF [A2 + 3 * SIZE], c12 + + prefetch [A2 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 4 * SIZE], c05 + LDF [A1 + 5 * SIZE], c06 + LDF [A2 + 4 * SIZE], c13 + LDF [A2 + 5 * SIZE], c14 + LDF [A1 + 6 * SIZE], c07 + LDF [A1 + 7 * SIZE], c08 + LDF [A2 + 6 * SIZE], c15 + LDF [A2 + 7 * SIZE], c16 + + add A1, 8 * SIZE, A1 + add A2, 8 * SIZE, A2 + add I, -1, I + cmp I, 0 + + prefetch [B1 + (WPREFETCHSIZE + 0) * SIZE], 0 + STF c01, [B1 + 0 * SIZE] + STF c02, [B1 + 1 * SIZE] + STF c09, [B1 + 2 * SIZE] + STF c10, [B1 + 3 * SIZE] + add B1, M2, B1 + + prefetch [B1 + (WPREFETCHSIZE + 0) * SIZE], 0 + STF c03, [B1 + 0 * SIZE] + STF c04, [B1 + 1 * SIZE] + STF c11, [B1 + 2 * SIZE] + STF c12, [B1 + 3 * SIZE] + add B1, M2, B1 + + prefetch [B1 + (WPREFETCHSIZE + 0) * SIZE], 0 + STF c05, [B1 + 0 * SIZE] + STF c06, [B1 + 1 * 
SIZE] + STF c13, [B1 + 2 * SIZE] + STF c14, [B1 + 3 * SIZE] + add B1, M2, B1 + + prefetch [B1 + (WPREFETCHSIZE + 0) * SIZE], 0 + STF c07, [B1 + 0 * SIZE] + STF c08, [B1 + 1 * SIZE] + STF c15, [B1 + 2 * SIZE] + STF c16, [B1 + 3 * SIZE] + + bg,pt %icc, .LL12 + add B1, M2, B1 + +.LL13: + and N, 4, I + cmp I, 0 + ble,pn %icc, .LL14 + nop + + LDF [A1 + 0 * SIZE], c01 + LDF [A1 + 1 * SIZE], c02 + LDF [A2 + 0 * SIZE], c03 + LDF [A2 + 1 * SIZE], c04 + + LDF [A1 + 2 * SIZE], c05 + LDF [A1 + 3 * SIZE], c06 + LDF [A2 + 2 * SIZE], c07 + LDF [A2 + 3 * SIZE], c08 + + add A1, 4 * SIZE, A1 + add A2, 4 * SIZE, A2 + + STF c01, [B1 + 0 * SIZE] + STF c02, [B1 + 1 * SIZE] + STF c03, [B1 + 2 * SIZE] + STF c04, [B1 + 3 * SIZE] + add B1, M2, B1 + STF c05, [B1 + 0 * SIZE] + STF c06, [B1 + 1 * SIZE] + STF c07, [B1 + 2 * SIZE] + STF c08, [B1 + 3 * SIZE] + add B1, M2, B1 + .align 4 + +.LL14: + and N, 2, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + + LDF [A1 + 0 * SIZE], c01 + LDF [A1 + 1 * SIZE], c02 + + LDF [A2 + 0 * SIZE], c03 + LDF [A2 + 1 * SIZE], c04 + + add A1, 2 * SIZE, A1 + add A2, 2 * SIZE, A2 + + STF c01, [B1 + 0 * SIZE] + STF c02, [B1 + 1 * SIZE] + STF c03, [B1 + 2 * SIZE] + STF c04, [B1 + 3 * SIZE] + add B1, M2, B1 + .align 4 + +.LL15: + and N, 1, I + cmp I, 0 + ble,pn %icc, .LL99 + nop + + LDF [A1 + 0 * SIZE], c01 + LDF [A2 + 0 * SIZE], c02 + + STF c01, [B2 + 0 * SIZE] + STF c02, [B2 + 1 * SIZE] + add B2, 2 * SIZE, B2 + +.LL99: + add J, -1, J + cmp J, 0 + bg,pt %icc, .LL11 + nop + +.LL100: + and M, 1, J + cmp J, 0 + ble,pn %icc, .LL999 + nop + +.LL211: + sra N, 1, I + cmp I, 0 + + mov B, B1 + + ble,pn %icc, .LL215 + mov A, A1 + +.LL212: + LDF [A1 + 0 * SIZE], c01 + LDF [A1 + 1 * SIZE], c02 + + add A1, 2 * SIZE, A1 + add I, -1, I + cmp I, 0 + + STF c01, [B + 0 * SIZE] + STF c02, [B + 1 * SIZE] + + bg,pt %icc, .LL212 + add B, M2, B + +.LL215: + and N, 1, I + cmp I, 0 + ble,pn %icc, .LL999 + nop + + LDF [A1 + 0 * SIZE], c01 + STF c01, [B2 + 0 * SIZE] + +.LL999: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/gemv_n.S b/kernel/sparc/gemv_n.S new file mode 100644 index 0000000000..649ef1617d --- /dev/null +++ b/kernel/sparc/gemv_n.S @@ -0,0 +1,1400 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %i0 +#define N %i1 + +#if defined(DOUBLE) && !defined(__64BIT__) +#define A %i5 +#define LDA %i2 +#define X %i3 +#define INCX %i4 +#else +#define A %i4 +#define LDA %i5 +#define X %i2 +#define INCX %i3 +#endif + +#define Y %l0 +#define INCY %l1 +#define BUFFER %l2 + +#define I %l3 +#define J %l5 + +#define A1 %o0 +#define A2 %o1 +#define A3 %o2 +#define A4 %o3 + +#define Y1 %l4 +#define YY %l6 + +#ifdef DOUBLE +#define t1 %f0 +#define t2 %f2 +#define t3 %f4 +#define t4 %f6 + +#define y1 %f8 +#define y2 %f10 +#define y3 %f12 +#define y4 %f14 +#define y5 %f16 +#define y6 %f18 +#define y7 %f20 +#define y8 %f22 + +#define a1 %f24 +#define a2 %f26 +#define a3 %f28 +#define a4 %f30 +#define a5 %f32 +#define a6 %f34 +#define a7 %f36 +#define a8 %f38 + +#define a9 %f40 +#define a10 %f42 +#define a11 %f44 +#define a12 %f46 +#define a13 %f48 +#define a14 %f50 +#define a15 %f52 +#define a16 %f54 + +#define x1 %f56 +#define x2 %f58 +#define x3 %f60 +#define x4 %f62 + +#define FZERO %f52 +#define ALPHA %f54 +#else +#define t1 %f0 +#define t2 %f1 +#define t3 %f2 +#define t4 %f3 + +#define y1 %f4 +#define y2 %f5 +#define y3 %f6 +#define y4 %f7 +#define y5 %f8 +#define y6 %f9 +#define y7 %f10 +#define y8 %f11 + +#define a1 %f12 +#define a2 %f13 +#define a3 %f14 +#define a4 %f15 +#define a5 %f16 +#define a6 %f17 +#define a7 %f18 +#define a8 %f19 + +#define a9 %f20 +#define a10 %f21 +#define a11 %f22 +#define a12 %f23 +#define a13 %f24 +#define a14 %f25 +#define a15 %f26 +#define a16 %f27 + +#define x1 %f28 +#define x2 %f29 +#define x3 %f30 +#define x4 %f31 + +#define FZERO %f26 +#define ALPHA %f27 +#endif + +#ifndef __64BIT__ +#define STACK_FZERO [%sp + STACK_START + 8] +#define STACK_ALPHA [%sp + STACK_START + 16] +#else +#define STACK_FZERO [%sp + STACK_START + 32] +#define STACK_ALPHA [%sp + STACK_START + 40] +#endif + + PROLOGUE + SAVESP + +#ifndef __64BIT__ +#ifdef DOUBLE + st %i3, [%sp + STACK_START + 16] /* ALPHA */ + st %i4, [%sp + STACK_START + 20] + + ld [%sp + STACK_START + 28], LDA + ld [%sp + STACK_START + 32], X + ld [%sp + STACK_START + 36], INCX + ld [%sp + STACK_START + 40], Y + ld [%sp + STACK_START + 44], INCY + ld [%sp + STACK_START + 48], BUFFER +#else + st %i3, [%sp + STACK_START + 16] /* ALPHA */ + + ld [%sp + STACK_START + 28], X + ld [%sp + STACK_START + 32], INCX + ld [%sp + STACK_START + 36], Y + ld [%sp + STACK_START + 40], INCY + ld [%sp + STACK_START + 44], BUFFER +#endif + LDF [%sp + STACK_START + 16], ALPHA +#else + ldx [%sp + STACK_START + 56], X + ldx [%sp + STACK_START + 64], INCX + ldx [%sp + STACK_START + 
72], Y + ldx [%sp + STACK_START + 80], INCY + ldx [%sp + STACK_START + 88], BUFFER +#ifdef DOUBLE + FMOV %f6, ALPHA + STF %f6, STACK_ALPHA +#else + FMOV %f7, ALPHA + STF %f7, STACK_ALPHA +#endif +#endif + + sll LDA, BASE_SHIFT, LDA + + cmp M, 0 + ble %icc, .LL999 + sll INCX, BASE_SHIFT, INCX + cmp N, 0 + ble %icc, .LL999 + sll INCY, BASE_SHIFT, INCY + +#ifdef DOUBLE + FCLR(21) +#else + FCLR(26) +#endif + + cmp INCY, SIZE + be %icc, .LL10 + mov Y, YY + + add M, 7, J + sra J, 3, J + mov BUFFER, YY + mov BUFFER, Y1 + +.LL01: + STF FZERO, [Y1 + 0 * SIZE] + STF FZERO, [Y1 + 1 * SIZE] + STF FZERO, [Y1 + 2 * SIZE] + STF FZERO, [Y1 + 3 * SIZE] + STF FZERO, [Y1 + 4 * SIZE] + STF FZERO, [Y1 + 5 * SIZE] + STF FZERO, [Y1 + 6 * SIZE] + deccc J + STF FZERO, [Y1 + 7 * SIZE] + bg,pn %icc, .LL01 + add Y1, 8 * SIZE, Y1 + +.LL10: + sra N, 2, J + cmp J, 0 + ble,pn %icc, .LL20 + nop + +.LL11: + mov YY, Y1 + + mov A, A1 + add A, LDA, A2 + add A2, LDA, A3 + add A3, LDA, A4 + add A4, LDA, A + + LDF STACK_ALPHA, ALPHA + + LDF [X], x1 + add X, INCX, X + LDF [X], x2 + add X, INCX, X + LDF [X], x3 + add X, INCX, X + LDF [X], x4 + add X, INCX, X + + FMUL ALPHA, x1, x1 + FMUL ALPHA, x2, x2 + FMUL ALPHA, x3, x3 + FMUL ALPHA, x4, x4 + + sra M, 3, I + cmp I, 0 + ble,pn %icc, .LL16 + nop + + LDF [A1 + 0 * SIZE], a1 + LDF [A1 + 1 * SIZE], a2 + LDF [A1 + 2 * SIZE], a3 + LDF [A1 + 3 * SIZE], a4 + LDF [A1 + 4 * SIZE], a5 + LDF [A1 + 5 * SIZE], a6 + LDF [A1 + 6 * SIZE], a7 + LDF [A1 + 7 * SIZE], a8 + + LDF [A2 + 0 * SIZE], a9 + LDF [A2 + 1 * SIZE], a10 + LDF [A2 + 2 * SIZE], a11 + LDF [A2 + 3 * SIZE], a12 + LDF [A2 + 4 * SIZE], a13 + LDF [A2 + 5 * SIZE], a14 + LDF [A2 + 6 * SIZE], a15 + LDF [A2 + 7 * SIZE], a16 + + FMUL a1, x1, t1 + LDF [A3 + 0 * SIZE], a1 + FMUL a2, x1, t2 + LDF [A3 + 1 * SIZE], a2 + FMUL a3, x1, t3 + LDF [A3 + 2 * SIZE], a3 + FMUL a4, x1, t4 + LDF [A3 + 3 * SIZE], a4 + + deccc I + ble,pn %icc, .LL13 + nop + nop + nop + nop + +#ifdef DOUBLE +#define PREFETCHSIZE 20 +#else +#define PREFETCHSIZE 40 +#endif + +.LL12: + LDF [Y1 + 0 * SIZE], y1 + LDF [Y1 + 1 * SIZE], y2 + LDF [Y1 + 2 * SIZE], y3 + LDF [Y1 + 3 * SIZE], y4 + LDF [Y1 + 4 * SIZE], y5 + LDF [Y1 + 5 * SIZE], y6 + LDF [Y1 + 6 * SIZE], y7 + LDF [Y1 + 7 * SIZE], y8 + + FADD y1, t1, y1 + prefetch [A1 + PREFETCHSIZE * SIZE], 1 + FMUL a5, x1, t1 + LDF [A3 + 4 * SIZE], a5 + + FADD y2, t2, y2 + nop + FMUL a6, x1, t2 + LDF [A3 + 5 * SIZE], a6 + + FADD y3, t3, y3 + nop + FMUL a7, x1, t3 + LDF [A3 + 6 * SIZE], a7 + + FADD y4, t4, y4 + nop + FMUL a8, x1, t4 + LDF [A3 + 7 * SIZE], a8 + + FADD y5, t1, y5 + nop + FMUL a9, x2, t1 + LDF [A4 + 0 * SIZE], a9 + + FADD y6, t2, y6 + nop + FMUL a10, x2, t2 + LDF [A4 + 1 * SIZE], a10 + + FADD y7, t3, y7 + nop + FMUL a11, x2, t3 + LDF [A4 + 2 * SIZE], a11 + + FADD y8, t4, y8 + nop + FMUL a12, x2, t4 + LDF [A4 + 3 * SIZE], a12 + + FADD y1, t1, y1 + prefetch [A2 + PREFETCHSIZE * SIZE], 1 + FMUL a13, x2, t1 + LDF [A4 + 4 * SIZE], a13 + + FADD y2, t2, y2 + nop + FMUL a14, x2, t2 + LDF [A4 + 5 * SIZE], a14 + + FADD y3, t3, y3 + nop + FMUL a15, x2, t3 + LDF [A4 + 6 * SIZE], a15 + + FADD y4, t4, y4 + nop + FMUL a16, x2, t4 + LDF [A4 + 7 * SIZE], a16 + + FADD y5, t1, y5 + nop + FMUL a1, x3, t1 + LDF [A1 + 8 * SIZE], a1 + + FADD y6, t2, y6 + nop + FMUL a2, x3, t2 + LDF [A1 + 9 * SIZE], a2 + + FADD y7, t3, y7 + nop + FMUL a3, x3, t3 + LDF [A1 + 10 * SIZE], a3 + + FADD y8, t4, y8 + nop + FMUL a4, x3, t4 + LDF [A1 + 11 * SIZE], a4 + + FADD y1, t1, y1 + prefetch [A3 + PREFETCHSIZE * SIZE], 1 + FMUL a5, x3, t1 + LDF [A1 + 12 * SIZE], a5 + + 
FADD y2, t2, y2 + nop + FMUL a6, x3, t2 + LDF [A1 + 13 * SIZE], a6 + + FADD y3, t3, y3 + nop + FMUL a7, x3, t3 + LDF [A1 + 14 * SIZE], a7 + + FADD y4, t4, y4 + nop + FMUL a8, x3, t4 + LDF [A1 + 15 * SIZE], a8 + + FADD y5, t1, y5 + nop + FMUL a9, x4, t1 + LDF [A2 + 8 * SIZE], a9 + + FADD y6, t2, y6 + nop + FMUL a10, x4, t2 + LDF [A2 + 9 * SIZE], a10 + + FADD y7, t3, y7 + nop + FMUL a11, x4, t3 + LDF [A2 + 10 * SIZE], a11 + + FADD y8, t4, y8 + nop + FMUL a12, x4, t4 + LDF [A2 + 11 * SIZE], a12 + + FADD y1, t1, y1 + prefetch [A4 + PREFETCHSIZE * SIZE], 1 + FMUL a13, x4, t1 + LDF [A2 + 12 * SIZE], a13 + + FADD y2, t2, y2 + add A3, 8 * SIZE, A3 + FMUL a14, x4, t2 + LDF [A2 + 13 * SIZE], a14 + + FADD y3, t3, y3 + add Y1, 8 * SIZE, Y1 + FMUL a15, x4, t3 + LDF [A2 + 14 * SIZE], a15 + + FADD y4, t4, y4 + deccc I + FMUL a16, x4, t4 + LDF [A2 + 15 * SIZE], a16 + + FADD y5, t1, y5 + add A1, 8 * SIZE, A1 + FMUL a1, x1, t1 + LDF [A3 + 0 * SIZE], a1 + + FADD y6, t2, y6 + add A2, 8 * SIZE, A2 + FMUL a2, x1, t2 + LDF [A3 + 1 * SIZE], a2 + + FADD y7, t3, y7 + add A4, 8 * SIZE, A4 + FMUL a3, x1, t3 + LDF [A3 + 2 * SIZE], a3 + + FADD y8, t4, y8 + nop + FMUL a4, x1, t4 + LDF [A3 + 3 * SIZE], a4 + + STF y1, [Y1 - 8 * SIZE] + STF y2, [Y1 - 7 * SIZE] + STF y3, [Y1 - 6 * SIZE] + STF y4, [Y1 - 5 * SIZE] + + STF y5, [Y1 - 4 * SIZE] + STF y6, [Y1 - 3 * SIZE] + STF y7, [Y1 - 2 * SIZE] + + bg,pn %icc, .LL12 + STF y8, [Y1 - 1 * SIZE] + +.LL13: + LDF [Y1 + 0 * SIZE], y1 + LDF [Y1 + 1 * SIZE], y2 + LDF [Y1 + 2 * SIZE], y3 + LDF [Y1 + 3 * SIZE], y4 + LDF [Y1 + 4 * SIZE], y5 + LDF [Y1 + 5 * SIZE], y6 + LDF [Y1 + 6 * SIZE], y7 + LDF [Y1 + 7 * SIZE], y8 + + FADD y1, t1, y1 + FMUL a5, x1, t1 + LDF [A3 + 0 * SIZE], a1 + FADD y2, t2, y2 + FMUL a6, x1, t2 + LDF [A3 + 1 * SIZE], a2 + + FADD y3, t3, y3 + FMUL a7, x1, t3 + LDF [A3 + 2 * SIZE], a3 + FADD y4, t4, y4 + FMUL a8, x1, t4 + LDF [A3 + 3 * SIZE], a4 + + FADD y5, t1, y5 + FMUL a9, x2, t1 + LDF [A3 + 4 * SIZE], a5 + FADD y6, t2, y6 + FMUL a10, x2, t2 + LDF [A3 + 5 * SIZE], a6 + + FADD y7, t3, y7 + FMUL a11, x2, t3 + LDF [A3 + 6 * SIZE], a7 + FADD y8, t4, y8 + FMUL a12, x2, t4 + LDF [A3 + 7 * SIZE], a8 + + FADD y1, t1, y1 + FMUL a13, x2, t1 + LDF [A4 + 0 * SIZE], a9 + FADD y2, t2, y2 + FMUL a14, x2, t2 + LDF [A4 + 1 * SIZE], a10 + + FADD y3, t3, y3 + FMUL a15, x2, t3 + LDF [A4 + 2 * SIZE], a11 + FADD y4, t4, y4 + FMUL a16, x2, t4 + LDF [A4 + 3 * SIZE], a12 + + FADD y5, t1, y5 + FMUL a1, x3, t1 + LDF [A4 + 4 * SIZE], a13 + FADD y6, t2, y6 + FMUL a2, x3, t2 + LDF [A4 + 5 * SIZE], a14 + + FADD y7, t3, y7 + FMUL a3, x3, t3 + LDF [A4 + 6 * SIZE], a15 + FADD y8, t4, y8 + FMUL a4, x3, t4 + LDF [A4 + 7 * SIZE], a16 + + FADD y1, t1, y1 + FMUL a5, x3, t1 + FADD y2, t2, y2 + FMUL a6, x3, t2 + + FADD y3, t3, y3 + FMUL a7, x3, t3 + FADD y4, t4, y4 + FMUL a8, x3, t4 + + FADD y5, t1, y5 + FMUL a9, x4, t1 + FADD y6, t2, y6 + FMUL a10, x4, t2 + + FADD y7, t3, y7 + FMUL a11, x4, t3 + FADD y8, t4, y8 + FMUL a12, x4, t4 + + FADD y1, t1, y1 + FMUL a13, x4, t1 + FADD y2, t2, y2 + FMUL a14, x4, t2 + + FADD y3, t3, y3 + FMUL a15, x4, t3 + FADD y4, t4, y4 + FMUL a16, x4, t4 + add A4, 8 * SIZE, A4 + + STF y1, [Y1 + 0 * SIZE] + FADD y5, t1, y5 + STF y2, [Y1 + 1 * SIZE] + FADD y6, t2, y6 + STF y3, [Y1 + 2 * SIZE] + FADD y7, t3, y7 + STF y4, [Y1 + 3 * SIZE] + FADD y8, t4, y8 + + STF y5, [Y1 + 4 * SIZE] + add A1, 8 * SIZE, A1 + STF y6, [Y1 + 5 * SIZE] + add A2, 8 * SIZE, A2 + STF y7, [Y1 + 6 * SIZE] + add A3, 8 * SIZE, A3 + STF y8, [Y1 + 7 * SIZE] + add Y1, 8 * SIZE, Y1 + +.LL16: + andcc M, 4, I + ble,pn 
%icc, .LL17 + nop + + LDF [A1 + 0 * SIZE], a1 + LDF [A1 + 1 * SIZE], a2 + LDF [A1 + 2 * SIZE], a3 + LDF [A1 + 3 * SIZE], a4 + + LDF [A2 + 0 * SIZE], a5 + LDF [A2 + 1 * SIZE], a6 + LDF [A2 + 2 * SIZE], a7 + LDF [A2 + 3 * SIZE], a8 + + LDF [A3 + 0 * SIZE], a9 + LDF [A3 + 1 * SIZE], a10 + LDF [A3 + 2 * SIZE], a11 + LDF [A3 + 3 * SIZE], a12 + + LDF [A4 + 0 * SIZE], a13 + LDF [A4 + 1 * SIZE], a14 + LDF [A4 + 2 * SIZE], a15 + LDF [A4 + 3 * SIZE], a16 + + LDF [Y1 + 0 * SIZE], y1 + add A1, 4 * SIZE, A1 + LDF [Y1 + 1 * SIZE], y2 + add A2, 4 * SIZE, A2 + LDF [Y1 + 2 * SIZE], y3 + add A3, 4 * SIZE, A3 + LDF [Y1 + 3 * SIZE], y4 + add A4, 4 * SIZE, A4 + + FMUL a1, x1, t1 + FMUL a2, x1, t2 + FMUL a3, x1, t3 + FMUL a4, x1, t4 + + FADD y1, t1, y1 + FMUL a5, x2, t1 + FADD y2, t2, y2 + FMUL a6, x2, t2 + FADD y3, t3, y3 + FMUL a7, x2, t3 + FADD y4, t4, y4 + FMUL a8, x2, t4 + + FADD y1, t1, y1 + FMUL a9, x3, t1 + FADD y2, t2, y2 + FMUL a10, x3, t2 + + FADD y3, t3, y3 + FMUL a11, x3, t3 + FADD y4, t4, y4 + FMUL a12, x3, t4 + + FADD y1, t1, y1 + FMUL a13, x4, t1 + FADD y2, t2, y2 + FMUL a14, x4, t2 + + FADD y3, t3, y3 + FMUL a15, x4, t3 + FADD y4, t4, y4 + FMUL a16, x4, t4 + + FADD y1, t1, y1 + FADD y2, t2, y2 + FADD y3, t3, y3 + FADD y4, t4, y4 + + STF y1, [Y1 + 0 * SIZE] + STF y2, [Y1 + 1 * SIZE] + STF y3, [Y1 + 2 * SIZE] + STF y4, [Y1 + 3 * SIZE] + + add Y1, 4 * SIZE, Y1 + +.LL17: + andcc M, 2, I + ble,pn %icc, .LL18 + nop + + LDF [A1 + 0 * SIZE], a1 + LDF [A2 + 0 * SIZE], a2 + LDF [A3 + 0 * SIZE], a3 + LDF [A4 + 0 * SIZE], a4 + LDF [Y1 + 0 * SIZE], y1 + + LDF [A1 + 1 * SIZE], a5 + LDF [A2 + 1 * SIZE], a6 + LDF [A3 + 1 * SIZE], a7 + LDF [A4 + 1 * SIZE], a8 + LDF [Y1 + 1 * SIZE], y2 + + add A1, 2 * SIZE, A1 + add A2, 2 * SIZE, A2 + add A3, 2 * SIZE, A3 + add A4, 2 * SIZE, A4 + + FMUL a1, x1, t1 + FMUL a2, x2, t2 + FMUL a3, x3, t3 + FMUL a4, x4, t4 + + FADD y1, t1, y1 + FMUL a5, x1, t1 + FADD y1, t2, y1 + FMUL a6, x2, t2 + FADD y1, t3, y1 + FMUL a7, x3, t3 + FADD y1, t4, y1 + FMUL a8, x4, t4 + + FADD y2, t1, y2 + FADD y2, t2, y2 + FADD y2, t3, y2 + FADD y2, t4, y2 + + STF y1, [Y1 + 0 * SIZE] + STF y2, [Y1 + 1 * SIZE] + add Y1, 2 * SIZE, Y1 + +.LL18: + andcc M, 1, I + ble,pn %icc, .LL19 + nop + + LDF [A1 + 0 * SIZE], a1 + LDF [A2 + 0 * SIZE], a2 + LDF [A3 + 0 * SIZE], a3 + LDF [A4 + 0 * SIZE], a4 + LDF [Y1 + 0 * SIZE], y1 + + FMUL a1, x1, t1 + FMUL a2, x2, t2 + FMUL a3, x3, t3 + FMUL a4, x4, t4 + + FADD y1, t1, y1 + FADD y1, t2, y1 + FADD y1, t3, y1 + FADD y1, t4, y1 + + STF y1, [Y1] + +.LL19: + deccc J + bg %icc, .LL11 + nop + +.LL20: + andcc N, 2, J + ble,pn %icc, .LL30 + nop + +.LL21: + mov YY, Y1 + + mov A, A1 + add A, LDA, A2 + add A2, LDA, A + + LDF STACK_ALPHA, ALPHA + + LDF [X], x1 + add X, INCX, X + LDF [X], x2 + add X, INCX, X + + FMUL ALPHA, x1, x1 + FMUL ALPHA, x2, x2 + + sra M, 3, I + cmp I, 0 + ble,pn %icc, .LL26 + nop + + LDF [Y1 + 0 * SIZE], y1 + LDF [Y1 + 1 * SIZE], y2 + LDF [Y1 + 2 * SIZE], y3 + LDF [Y1 + 3 * SIZE], y4 + LDF [Y1 + 4 * SIZE], y5 + LDF [Y1 + 5 * SIZE], y6 + LDF [Y1 + 6 * SIZE], y7 + LDF [Y1 + 7 * SIZE], y8 + + LDF [A1 + 0 * SIZE], a1 + LDF [A1 + 1 * SIZE], a2 + LDF [A1 + 2 * SIZE], a3 + LDF [A1 + 3 * SIZE], a4 + LDF [A1 + 4 * SIZE], a5 + LDF [A1 + 5 * SIZE], a6 + LDF [A1 + 6 * SIZE], a7 + LDF [A1 + 7 * SIZE], a8 + + LDF [A2 + 0 * SIZE], a9 + LDF [A2 + 1 * SIZE], a10 + LDF [A2 + 2 * SIZE], a11 + LDF [A2 + 3 * SIZE], a12 + LDF [A2 + 4 * SIZE], a13 + LDF [A2 + 5 * SIZE], a14 + LDF [A2 + 6 * SIZE], a15 + LDF [A2 + 7 * SIZE], a16 + + FMUL a1, x1, t1 + deccc I + LDF [A1 + 8 * SIZE], 
a1 + FMUL a2, x1, t2 + LDF [A1 + 9 * SIZE], a2 + FMUL a3, x1, t3 + LDF [A1 + 10 * SIZE], a3 + FMUL a4, x1, t4 + ble,pn %icc, .LL23 + LDF [A1 + 11 * SIZE], a4 + +.LL22: + FADD y1, t1, y1 + prefetch [A1 + PREFETCHSIZE * SIZE], 1 + FMUL a5, x1, t1 + LDF [A1 + 12 * SIZE], a5 + FADD y2, t2, y2 + FMUL a6, x1, t2 + LDF [A1 + 13 * SIZE], a6 + + FADD y3, t3, y3 + FMUL a7, x1, t3 + LDF [A1 + 14 * SIZE], a7 + FADD y4, t4, y4 + FMUL a8, x1, t4 + LDF [A1 + 15 * SIZE], a8 + + FADD y5, t1, y5 + FMUL a9, x2, t1 + LDF [A2 + 8 * SIZE], a9 + FADD y6, t2, y6 + FMUL a10, x2, t2 + LDF [A2 + 9 * SIZE], a10 + + FADD y7, t3, y7 + FMUL a11, x2, t3 + LDF [A2 + 10 * SIZE], a11 + FADD y8, t4, y8 + FMUL a12, x2, t4 + LDF [A2 + 11 * SIZE], a12 + + FADD y1, t1, y1 + prefetch [A2 + PREFETCHSIZE * SIZE], 1 + FMUL a13, x2, t1 + LDF [A2 + 12 * SIZE], a13 + FADD y2, t2, y2 + FMUL a14, x2, t2 + LDF [A2 + 13 * SIZE], a14 + + FADD y3, t3, y3 + FMUL a15, x2, t3 + LDF [A2 + 14 * SIZE], a15 + FADD y4, t4, y4 + FMUL a16, x2, t4 + LDF [A2 + 15 * SIZE], a16 + + FADD y5, t1, y5 + FMUL a1, x1, t1 + LDF [A1 + 16 * SIZE], a1 + FADD y6, t2, y6 + FMUL a2, x1, t2 + LDF [A1 + 17 * SIZE], a2 + + FADD y7, t3, y7 + FMUL a3, x1, t3 + LDF [A1 + 18 * SIZE], a3 + FADD y8, t4, y8 + FMUL a4, x1, t4 + LDF [A1 + 19 * SIZE], a4 + + STF y1, [Y1 + 0 * SIZE] + STF y2, [Y1 + 1 * SIZE] + STF y3, [Y1 + 2 * SIZE] + STF y4, [Y1 + 3 * SIZE] + STF y5, [Y1 + 4 * SIZE] + STF y6, [Y1 + 5 * SIZE] + STF y7, [Y1 + 6 * SIZE] + STF y8, [Y1 + 7 * SIZE] + + LDF [Y1 + 8 * SIZE], y1 + add A1, 8 * SIZE, A1 + LDF [Y1 + 9 * SIZE], y2 + add A2, 8 * SIZE, A2 + LDF [Y1 + 10 * SIZE], y3 + deccc I + LDF [Y1 + 11 * SIZE], y4 + LDF [Y1 + 12 * SIZE], y5 + LDF [Y1 + 13 * SIZE], y6 + LDF [Y1 + 14 * SIZE], y7 + LDF [Y1 + 15 * SIZE], y8 + bg,pn %icc, .LL22 + add Y1, 8 * SIZE, Y1 + +.LL23: + FADD y1, t1, y1 + FMUL a5, x1, t1 + FADD y2, t2, y2 + FMUL a6, x1, t2 + + FADD y3, t3, y3 + FMUL a7, x1, t3 + FADD y4, t4, y4 + FMUL a8, x1, t4 + + FADD y5, t1, y5 + FMUL a9, x2, t1 + FADD y6, t2, y6 + FMUL a10, x2, t2 + + FADD y7, t3, y7 + FMUL a11, x2, t3 + FADD y8, t4, y8 + FMUL a12, x2, t4 + + FADD y1, t1, y1 + FMUL a13, x2, t1 + FADD y2, t2, y2 + FMUL a14, x2, t2 + + FADD y3, t3, y3 + FMUL a15, x2, t3 + FADD y4, t4, y4 + FMUL a16, x2, t4 + + STF y1, [Y1 + 0 * SIZE] + FADD y5, t1, y5 + STF y2, [Y1 + 1 * SIZE] + FADD y6, t2, y6 + STF y3, [Y1 + 2 * SIZE] + FADD y7, t3, y7 + STF y4, [Y1 + 3 * SIZE] + FADD y8, t4, y8 + + STF y5, [Y1 + 4 * SIZE] + add A1, 8 * SIZE, A1 + STF y6, [Y1 + 5 * SIZE] + add A2, 8 * SIZE, A2 + STF y7, [Y1 + 6 * SIZE] + nop + STF y8, [Y1 + 7 * SIZE] + add Y1, 8 * SIZE, Y1 + +.LL26: + andcc M, 4, I + ble,pn %icc, .LL27 + nop + + LDF [A1 + 0 * SIZE], a1 + LDF [A1 + 1 * SIZE], a2 + LDF [A1 + 2 * SIZE], a3 + LDF [A1 + 3 * SIZE], a4 + + LDF [A2 + 0 * SIZE], a5 + LDF [A2 + 1 * SIZE], a6 + LDF [A2 + 2 * SIZE], a7 + LDF [A2 + 3 * SIZE], a8 + + LDF [Y1 + 0 * SIZE], y1 + add A1, 4 * SIZE, A1 + LDF [Y1 + 1 * SIZE], y2 + add A2, 4 * SIZE, A2 + LDF [Y1 + 2 * SIZE], y3 + LDF [Y1 + 3 * SIZE], y4 + + FMUL a1, x1, t1 + FMUL a2, x1, t2 + FMUL a3, x1, t3 + FMUL a4, x1, t4 + + FADD y1, t1, y1 + FMUL a5, x2, t1 + FADD y2, t2, y2 + FMUL a6, x2, t2 + FADD y3, t3, y3 + FMUL a7, x2, t3 + FADD y4, t4, y4 + FMUL a8, x2, t4 + + FADD y1, t1, y1 + FADD y2, t2, y2 + FADD y3, t3, y3 + FADD y4, t4, y4 + + STF y1, [Y1 + 0 * SIZE] + STF y2, [Y1 + 1 * SIZE] + STF y3, [Y1 + 2 * SIZE] + STF y4, [Y1 + 3 * SIZE] + + add Y1, 4 * SIZE, Y1 + +.LL27: + andcc M, 2, I + ble,pn %icc, .LL28 + nop + + LDF [A1 + 0 * SIZE], a1 + 
LDF [A2 + 0 * SIZE], a2 + LDF [Y1 + 0 * SIZE], y1 + LDF [A1 + 1 * SIZE], a5 + LDF [A2 + 1 * SIZE], a6 + add A1, 2 * SIZE, A1 + LDF [Y1 + 1 * SIZE], y2 + add A2, 2 * SIZE, A2 + + FMUL a1, x1, t1 + FMUL a2, x2, t2 + + FADD y1, t1, y1 + FMUL a5, x1, t1 + FADD y1, t2, y1 + FMUL a6, x2, t2 + + FADD y2, t1, y2 + FADD y2, t2, y2 + + STF y1, [Y1 + 0 * SIZE] + STF y2, [Y1 + 1 * SIZE] + add Y1, 2 * SIZE, Y1 + +.LL28: + andcc M, 1, I + ble,pn %icc, .LL30 + nop + + LDF [A1 + 0 * SIZE], a1 + LDF [A2 + 0 * SIZE], a2 + LDF [Y1 + 0 * SIZE], y1 + + FMUL a1, x1, t1 + FMUL a2, x2, t2 + + FADD y1, t1, y1 + FADD y1, t2, y1 + + STF y1, [Y1] + +.LL30: + andcc N, 1, J + ble,pn %icc, .LL990 + nop + +.LL31: + mov YY, Y1 + mov A, A1 + + LDF STACK_ALPHA, ALPHA + + LDF [X], x1 + add X, INCX, X + + FMUL ALPHA, x1, x1 + + sra M, 3, I + cmp I, 0 + ble,pn %icc, .LL36 + nop + + LDF [Y1 + 0 * SIZE], y1 + LDF [Y1 + 1 * SIZE], y2 + LDF [Y1 + 2 * SIZE], y3 + LDF [Y1 + 3 * SIZE], y4 + LDF [Y1 + 4 * SIZE], y5 + LDF [Y1 + 5 * SIZE], y6 + LDF [Y1 + 6 * SIZE], y7 + LDF [Y1 + 7 * SIZE], y8 + + LDF [A1 + 0 * SIZE], a1 + LDF [A1 + 1 * SIZE], a2 + LDF [A1 + 2 * SIZE], a3 + LDF [A1 + 3 * SIZE], a4 + LDF [A1 + 4 * SIZE], a5 + LDF [A1 + 5 * SIZE], a6 + LDF [A1 + 6 * SIZE], a7 + LDF [A1 + 7 * SIZE], a8 + + FMUL a1, x1, t1 + deccc I + LDF [A1 + 8 * SIZE], a1 + FMUL a2, x1, t2 + LDF [A1 + 9 * SIZE], a2 + FMUL a3, x1, t3 + LDF [A1 + 10 * SIZE], a3 + FMUL a4, x1, t4 + ble,pn %icc, .LL33 + LDF [A1 + 11 * SIZE], a4 + +.LL32: + FADD y1, t1, y1 + prefetch [A1 + PREFETCHSIZE * SIZE], 1 + FMUL a5, x1, t1 + LDF [A1 + 12 * SIZE], a5 + FADD y2, t2, y2 + FMUL a6, x1, t2 + LDF [A1 + 13 * SIZE], a6 + + FADD y3, t3, y3 + FMUL a7, x1, t3 + LDF [A1 + 14 * SIZE], a7 + FADD y4, t4, y4 + FMUL a8, x1, t4 + LDF [A1 + 15 * SIZE], a8 + + FADD y5, t1, y5 + FMUL a1, x1, t1 + LDF [A1 + 16 * SIZE], a1 + FADD y6, t2, y6 + FMUL a2, x1, t2 + LDF [A1 + 17 * SIZE], a2 + + FADD y7, t3, y7 + FMUL a3, x1, t3 + LDF [A1 + 18 * SIZE], a3 + FADD y8, t4, y8 + FMUL a4, x1, t4 + LDF [A1 + 19 * SIZE], a4 + + STF y1, [Y1 + 0 * SIZE] + STF y2, [Y1 + 1 * SIZE] + STF y3, [Y1 + 2 * SIZE] + STF y4, [Y1 + 3 * SIZE] + STF y5, [Y1 + 4 * SIZE] + STF y6, [Y1 + 5 * SIZE] + STF y7, [Y1 + 6 * SIZE] + STF y8, [Y1 + 7 * SIZE] + + LDF [Y1 + 8 * SIZE], y1 + LDF [Y1 + 9 * SIZE], y2 + LDF [Y1 + 10 * SIZE], y3 + LDF [Y1 + 11 * SIZE], y4 + LDF [Y1 + 12 * SIZE], y5 + deccc I + LDF [Y1 + 13 * SIZE], y6 + add A1, 8 * SIZE, A1 + LDF [Y1 + 14 * SIZE], y7 + add Y1, 8 * SIZE, Y1 + bg,pn %icc, .LL32 + LDF [Y1 + 7 * SIZE], y8 + +.LL33: + FADD y1, t1, y1 + FMUL a5, x1, t1 + FADD y2, t2, y2 + FMUL a6, x1, t2 + + FADD y3, t3, y3 + FMUL a7, x1, t3 + FADD y4, t4, y4 + FMUL a8, x1, t4 + + STF y1, [Y1 + 0 * SIZE] + FADD y5, t1, y5 + STF y2, [Y1 + 1 * SIZE] + FADD y6, t2, y6 + STF y3, [Y1 + 2 * SIZE] + FADD y7, t3, y7 + STF y4, [Y1 + 3 * SIZE] + FADD y8, t4, y8 + + STF y5, [Y1 + 4 * SIZE] + STF y6, [Y1 + 5 * SIZE] + STF y7, [Y1 + 6 * SIZE] + add A1, 8 * SIZE, A1 + STF y8, [Y1 + 7 * SIZE] + add Y1, 8 * SIZE, Y1 + +.LL36: + andcc M, 4, I + ble,pn %icc, .LL37 + nop + + LDF [A1 + 0 * SIZE], a1 + LDF [A1 + 1 * SIZE], a2 + LDF [A1 + 2 * SIZE], a3 + LDF [A1 + 3 * SIZE], a4 + + LDF [Y1 + 0 * SIZE], y1 + add A1, 4 * SIZE, A1 + LDF [Y1 + 1 * SIZE], y2 + LDF [Y1 + 2 * SIZE], y3 + LDF [Y1 + 3 * SIZE], y4 + + FMUL a1, x1, t1 + FMUL a2, x1, t2 + FMUL a3, x1, t3 + FMUL a4, x1, t4 + + FADD y1, t1, y1 + FADD y2, t2, y2 + FADD y3, t3, y3 + FADD y4, t4, y4 + + STF y1, [Y1 + 0 * SIZE] + STF y2, [Y1 + 1 * SIZE] + STF y3, [Y1 + 2 * SIZE] + STF 
y4, [Y1 + 3 * SIZE] + + add Y1, 4 * SIZE, Y1 + +.LL37: + andcc M, 2, I + ble,pn %icc, .LL38 + nop + + LDF [A1 + 0 * SIZE], a1 + LDF [Y1 + 0 * SIZE], y1 + + LDF [A1 + 1 * SIZE], a5 + LDF [Y1 + 1 * SIZE], y2 + add A1, 2 * SIZE, A1 + + FMUL a1, x1, t1 + FADD y1, t1, y1 + FMUL a5, x1, t1 + FADD y2, t1, y2 + + STF y1, [Y1 + 0 * SIZE] + STF y2, [Y1 + 1 * SIZE] + add Y1, 2 * SIZE, Y1 + +.LL38: + andcc M, 1, I + ble,pn %icc, .LL990 + nop + + LDF [A1 + 0 * SIZE], a1 + LDF [Y1 + 0 * SIZE], y1 + + FMUL a1, x1, t1 + FADD y1, t1, y1 + + STF y1, [Y1] + +.LL990: + cmp INCY, SIZE + be %icc, .LL999 + mov Y, Y1 + + sra M, 3, I + cmp I, 0 + ble,pn %icc, .LL995 + nop + +.LL991: + LDF [BUFFER + 0 * SIZE], a1 + LDF [Y], y1 + add Y, INCY, Y + + LDF [BUFFER + 1 * SIZE], a2 + LDF [Y], y2 + add Y, INCY, Y + + LDF [BUFFER + 2 * SIZE], a3 + LDF [Y], y3 + add Y, INCY, Y + + LDF [BUFFER + 3 * SIZE], a4 + LDF [Y], y4 + add Y, INCY, Y + + LDF [BUFFER + 4 * SIZE], a5 + FADD y1, a1, y1 + LDF [Y], y5 + add Y, INCY, Y + + LDF [BUFFER + 5 * SIZE], a6 + FADD y2, a2, y2 + LDF [Y], y6 + add Y, INCY, Y + + LDF [BUFFER + 6 * SIZE], a7 + FADD y3, a3, y3 + LDF [Y], y7 + add Y, INCY, Y + + LDF [BUFFER + 7 * SIZE], a8 + FADD y4, a4, y4 + LDF [Y], y8 + add Y, INCY, Y + + STF y1, [Y1] + FADD y5, a5, y5 + add Y1, INCY, Y1 + STF y2, [Y1] + FADD y6, a6, y6 + add Y1, INCY, Y1 + STF y3, [Y1] + FADD y7, a7, y7 + add Y1, INCY, Y1 + STF y4, [Y1] + FADD y8, a8, y8 + add Y1, INCY, Y1 + STF y5, [Y1] + add Y1, INCY, Y1 + STF y6, [Y1] + add Y1, INCY, Y1 + STF y7, [Y1] + add Y1, INCY, Y1 + STF y8, [Y1] + add Y1, INCY, Y1 + + deccc I + bg,pn %icc, .LL991 + add BUFFER, 8 * SIZE, BUFFER + +.LL995: + andcc M, 7, I + ble,pn %icc, .LL999 + nop + + andcc M, 4, I + ble,pn %icc, .LL996 + nop + + LDF [BUFFER + 0 * SIZE], a1 + LDF [BUFFER + 1 * SIZE], a2 + LDF [BUFFER + 2 * SIZE], a3 + LDF [BUFFER + 3 * SIZE], a4 + add BUFFER, 4 * SIZE, BUFFER + + LDF [Y], y1 + add Y, INCY, Y + LDF [Y], y2 + add Y, INCY, Y + LDF [Y], y3 + add Y, INCY, Y + LDF [Y], y4 + add Y, INCY, Y + + FADD y1, a1, y1 + FADD y2, a2, y2 + FADD y3, a3, y3 + FADD y4, a4, y4 + + STF y1, [Y1] + add Y1, INCY, Y1 + STF y2, [Y1] + add Y1, INCY, Y1 + STF y3, [Y1] + add Y1, INCY, Y1 + STF y4, [Y1] + add Y1, INCY, Y1 + +.LL996: + andcc M, 2, I + ble,pn %icc, .LL997 + nop + + LDF [BUFFER + 0 * SIZE], a1 + LDF [BUFFER + 1 * SIZE], a2 + add BUFFER, 2 * SIZE, BUFFER + + LDF [Y], y1 + add Y, INCY, Y + LDF [Y], y2 + add Y, INCY, Y + + FADD y1, a1, y1 + FADD y2, a2, y2 + + STF y1, [Y1] + add Y1, INCY, Y1 + STF y2, [Y1] + add Y1, INCY, Y1 + +.LL997: + andcc M, 1, I + ble,pn %icc, .LL999 + nop + + LDF [BUFFER + 0 * SIZE], a1 + + LDF [Y], y1 + + FADD y1, a1, y1 + + STF y1, [Y1] + + +.LL999: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/gemv_t.S b/kernel/sparc/gemv_t.S new file mode 100644 index 0000000000..fad006ade9 --- /dev/null +++ b/kernel/sparc/gemv_t.S @@ -0,0 +1,705 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define P 1020 + +#define M %i0 +#define N %i1 + +#if defined(DOUBLE) && !defined(__64BIT__) +#define A %i5 +#define LDA %i2 +#define X %i3 +#define INCX %i4 +#else +#define A %i4 +#define LDA %i5 +#define X %i2 +#define INCX %i3 +#endif + +#define Y %l0 +#define INCY %l1 +#define BUFFER %l2 + +#define I %l3 +#define IS %l4 +#define J %l5 +#define MIN_M %l6 +#define XP %l7 + +#define A1 %o0 +#define A2 %o1 +#define A3 %o2 +#define A4 %o3 +#define X1 %o4 +#define Y1 %o5 +#define PNLDA %g1 +#define Y2 %o7 /* Danger? 
*/ + +#ifdef DOUBLE +#define t1 %f0 +#define t2 %f2 +#define t3 %f4 +#define t4 %f6 + +#define c1 %f8 +#define c2 %f10 +#define c3 %f12 +#define c4 %f14 + +#define a1 %f16 +#define a2 %f18 +#define a3 %f20 +#define a4 %f22 +#define a5 %f24 +#define a6 %f26 +#define a7 %f28 +#define a8 %f30 + +#define a9 %f32 +#define a10 %f34 +#define a11 %f36 +#define a12 %f38 +#define a13 %f40 +#define a14 %f42 +#define a15 %f44 +#define a16 %f46 + +#define b1 %f48 +#define b2 %f50 +#define b3 %f52 +#define b4 %f54 +#define b5 %f56 +#define b6 %f58 +#define b7 %f60 +#define b8 %f62 + +#define FZERO %f60 +#define ALPHA %f62 + +#else +#define t1 %f0 +#define t2 %f1 +#define t3 %f2 +#define t4 %f3 + +#define c1 %f4 +#define c2 %f5 +#define c3 %f6 +#define c4 %f7 + +#define a1 %f8 +#define a2 %f9 +#define a3 %f10 +#define a4 %f11 +#define a5 %f12 +#define a6 %f13 +#define a7 %f14 +#define a8 %f15 + +#define a9 %f16 +#define a10 %f17 +#define a11 %f18 +#define a12 %f19 +#define a13 %f20 +#define a14 %f21 +#define a15 %f22 +#define a16 %f23 + +#define b1 %f24 +#define b2 %f25 +#define b3 %f26 +#define b4 %f27 +#define b5 %f28 +#define b6 %f29 +#define b7 %f30 +#define b8 %f31 + +#define FZERO %f30 +#define ALPHA %f31 +#endif + +#ifndef __64BIT__ +#define STACK_FZERO [%sp + STACK_START + 8] +#define STACK_ALPHA [%sp + STACK_START + 16] +#else +#define STACK_FZERO [%sp + STACK_START + 32] +#define STACK_ALPHA [%sp + STACK_START + 40] +#endif + +#ifdef DOUBLE +#define PREFETCHSIZE 36 +#else +#define PREFETCHSIZE 72 +#endif + + PROLOGUE + SAVESP + nop + +#ifndef __64BIT__ + +#ifdef DOUBLE + st %i3, [%sp + STACK_START + 16] /* ALPHA */ + st %i4, [%sp + STACK_START + 20] + + ld [%sp + STACK_START + 28], LDA + ld [%sp + STACK_START + 32], X + ld [%sp + STACK_START + 36], INCX + ld [%sp + STACK_START + 40], Y + ld [%sp + STACK_START + 44], INCY + ld [%sp + STACK_START + 48], BUFFER +#else + st %i3, [%sp + STACK_START + 16] /* ALPHA */ + + ld [%sp + STACK_START + 28], X + ld [%sp + STACK_START + 32], INCX + ld [%sp + STACK_START + 36], Y + ld [%sp + STACK_START + 40], INCY + ld [%sp + STACK_START + 44], BUFFER +#endif + LDF [%sp + STACK_START + 16], ALPHA +#else + ldx [%sp+ STACK_START + 56], X + ldx [%sp+ STACK_START + 64], INCX + ldx [%sp+ STACK_START + 72], Y + ldx [%sp+ STACK_START + 80], INCY + ldx [%sp+ STACK_START + 88], BUFFER +#ifdef DOUBLE + FMOV %f6, ALPHA + STF %f6, STACK_ALPHA +#else + FMOV %f7, ALPHA + STF %f7, STACK_ALPHA +#endif +#endif + +#ifdef DOUBLE + FCLR(29) +#else + FCLR(30) +#endif + + clr IS + mov P, I + sll LDA, BASE_SHIFT, LDA + sll I, BASE_SHIFT, I + smul LDA, N, PNLDA + sll INCX, BASE_SHIFT, INCX + sll INCY, BASE_SHIFT, INCY + sub I, PNLDA, PNLDA + +.LL10: + sll IS, BASE_SHIFT, I + sub M, IS, MIN_M + cmp MIN_M, P + nop + movg %icc, P, MIN_M + nop + cmp INCX, SIZE + beq .LL100 + add X, I, XP + + sra MIN_M, 2, I + mov BUFFER, XP + cmp I, 0 + ble,pn %icc, .LL15 + mov BUFFER, Y1 + +.LL11: + LDF [X], a1 + add X, INCX, X + LDF [X], a2 + add X, INCX, X + LDF [X], a3 + add X, INCX, X + LDF [X], a4 + add X, INCX, X + + STF a1, [Y1 + 0 * SIZE] + add I, -1, I + STF a2, [Y1 + 1 * SIZE] + cmp I, 0 + STF a3, [Y1 + 2 * SIZE] + STF a4, [Y1 + 3 * SIZE] + bg,pn %icc, .LL11 + add Y1, 4 * SIZE, Y1 + +.LL15: + and MIN_M, 3, I + cmp I, 0 + ble,pn %icc, .LL100 + nop + +.LL16: + LDF [X], a1 + add X, INCX, X + add I, -1, I + cmp I, 0 + nop + STF a1, [Y1] + bg,pn %icc, .LL16 + add Y1, 1 * SIZE, Y1 + +.LL100: + sra N, 1, J + cmp J, 0 + ble %icc, .LL200 + mov Y, Y1 + +.LL110: +#ifdef DOUBLE + FCLR(29) +#else + 
FCLR(30) +#endif + + FMOV FZERO, c1 + FMOV FZERO, c2 + FMOV FZERO, c3 + FMOV FZERO, c4 + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + + mov A, A1 + add A, LDA, A2 + add A2, LDA, A + + mov XP, X1 + + sra MIN_M, 3, I + cmp I, 0 + ble %icc, .LL115 + prefetch [Y1 + 2 * SIZE], 0 + + LDF [A1 + 0 * SIZE], a1 + deccc I + LDF [A1 + 1 * SIZE], a2 + LDF [A1 + 2 * SIZE], a3 + LDF [A1 + 3 * SIZE], a4 + LDF [A1 + 4 * SIZE], a5 + LDF [A1 + 5 * SIZE], a6 + LDF [A1 + 6 * SIZE], a7 + LDF [A1 + 7 * SIZE], a8 + + LDF [A2 + 0 * SIZE], a9 + LDF [A2 + 1 * SIZE], a10 + LDF [A2 + 2 * SIZE], a11 + LDF [A2 + 3 * SIZE], a12 + LDF [A2 + 4 * SIZE], a13 + LDF [A2 + 5 * SIZE], a14 + LDF [A2 + 6 * SIZE], a15 + LDF [A2 + 7 * SIZE], a16 + + LDF [X1 + 0 * SIZE], b1 + LDF [X1 + 1 * SIZE], b2 + LDF [X1 + 2 * SIZE], b3 + LDF [X1 + 3 * SIZE], b4 + LDF [X1 + 4 * SIZE], b5 + LDF [X1 + 5 * SIZE], b6 + + ble %icc, .LL112 + LDF [X1 + 6 * SIZE], b7 + +.LL111: + FADD c1, t1, c1 + prefetch [A1 + PREFETCHSIZE * SIZE], 1 + FMUL a1, b1, t1 + LDF [A1 + 8 * SIZE], a1 + + FADD c2, t2, c2 + LDF [X1 + 7 * SIZE], b8 + FMUL a9, b1, t2 + LDF [A2 + 8 * SIZE], a9 + + FADD c3, t3, c3 + LDF [X1 + 8 * SIZE], b1 + FMUL a2, b2, t3 + LDF [A1 + 9 * SIZE], a2 + + FADD c4, t4, c4 + deccc I + FMUL a10, b2, t4 + LDF [A2 + 9 * SIZE], a10 + + FADD c1, t1, c1 + LDF [X1 + 9 * SIZE], b2 + FMUL a3, b3, t1 + LDF [A1 + 10 * SIZE], a3 + + FADD c2, t2, c2 + nop + FMUL a11, b3, t2 + LDF [A2 + 10 * SIZE], a11 + + FADD c3, t3, c3 + LDF [X1 + 10 * SIZE], b3 + FMUL a4, b4, t3 + LDF [A1 + 11 * SIZE], a4 + + FADD c4, t4, c4 + nop + FMUL a12, b4, t4 + LDF [A2 + 11 * SIZE], a12 + + FADD c1, t1, c1 + LDF [X1 + 11 * SIZE], b4 + FMUL a5, b5, t1 + LDF [A1 + 12 * SIZE], a5 + + FADD c2, t2, c2 + prefetch [A2 + (PREFETCHSIZE + 4) * SIZE], 1 + FMUL a13, b5, t2 + LDF [A2 + 12 * SIZE], a13 + + FADD c3, t3, c3 + LDF [X1 + 12 * SIZE], b5 + FMUL a6, b6, t3 + LDF [A1 + 13 * SIZE], a6 + + FADD c4, t4, c4 + FMUL a14, b6, t4 + LDF [A2 + 13 * SIZE], a14 + + FADD c1, t1, c1 + LDF [X1 + 13 * SIZE], b6 + FMUL a7, b7, t1 + LDF [A1 + 14 * SIZE], a7 + + FADD c2, t2, c2 + add X1, 8 * SIZE, X1 + FMUL a15, b7, t2 + LDF [A2 + 14 * SIZE], a15 + + FADD c3, t3, c3 + LDF [X1 + 6 * SIZE], b7 + FMUL a8, b8, t3 + LDF [A1 + 15 * SIZE], a8 + + FADD c4, t4, c4 + add A1, 8 * SIZE, A1 + FMUL a16, b8, t4 + LDF [A2 + 15 * SIZE], a16 + + bg,pn %icc, .LL111 + add A2, 8 * SIZE, A2 + +.LL112: + FADD c1, t1, c1 + LDF [X1 + 7 * SIZE], b8 + FMUL a1, b1, t1 + add A1, 8 * SIZE, A1 + + FADD c2, t2, c2 + add A2, 8 * SIZE, A2 + FMUL a9, b1, t2 + add X1, 8 * SIZE, X1 + + FADD c3, t3, c3 + FMUL a2, b2, t3 + FADD c4, t4, c4 + FMUL a10, b2, t4 + + FADD c1, t1, c1 + FMUL a3, b3, t1 + FADD c2, t2, c2 + FMUL a11, b3, t2 + + FADD c3, t3, c3 + FMUL a4, b4, t3 + FADD c4, t4, c4 + FMUL a12, b4, t4 + + FADD c1, t1, c1 + FMUL a5, b5, t1 + FADD c2, t2, c2 + FMUL a13, b5, t2 + + FADD c3, t3, c3 + FMUL a6, b6, t3 + FADD c4, t4, c4 + FMUL a14, b6, t4 + + FADD c1, t1, c1 + FMUL a7, b7, t1 + FADD c2, t2, c2 + FMUL a15, b7, t2 + + FADD c3, t3, c3 + FMUL a8, b8, t3 + FADD c4, t4, c4 + FMUL a16, b8, t4 + +.LL115: + andcc MIN_M, 7, I + ble %icc, .LL119 + mov Y1, Y2 + + LDF [X1 + 0 * SIZE], b1 + deccc I + LDF [A1 + 0 * SIZE], a1 + ble %icc, .LL117 + LDF [A2 + 0 * SIZE], a2 + +.LL116: + FADD c1, t1, c1 + add X1, 1 * SIZE, X1 + FMUL a1, b1, t1 + LDF [A1 + 1 * SIZE], a1 + + FADD c2, t2, c2 + add A1, 1 * SIZE, A1 + FMUL a2, b1, t2 + LDF [X1 + 0 * SIZE], b1 + + add A2, 1 * SIZE, A2 + deccc I + bg,pn %icc, .LL116 + LDF [A2 + 0 * SIZE], 
a2 + +.LL117: + FADD c1, t1, c1 + add X1, 1 * SIZE, X1 + FADD c2, t2, c2 + add A1, 1 * SIZE, A1 + + FMUL a1, b1, t1 + add A2, 1 * SIZE, A2 + FMUL a2, b1, t2 + nop + +.LL119: + FADD c1, t1, c1 + FADD c2, t2, c2 + FADD c3, t3, c3 + FADD c4, t4, c4 + + FADD c1, c3, c1 + FADD c2, c4, c2 + + + LDF [Y1], a1 + LDF [Y1 + INCY], a2 + + add Y1, INCY, Y1 + add Y1, INCY, Y1 + + LDF STACK_ALPHA, ALPHA + + FMUL ALPHA, c1, c1 + FMUL ALPHA, c2, c2 + FADD a1, c1, a1 + FADD a2, c2, a2 + + STF a1, [Y2] + STF a2, [Y2 + INCY] + + deccc J + bg %icc, .LL110 +#ifdef DOUBLE + FCLR(29) +#else + FCLR(30) +#endif + +.LL200: + andcc N, 1, J + nop + ble %icc, .LL400 + FMOV FZERO, c1 + +.LL310: + FMOV FZERO, t1 + sra MIN_M, 3, I + FMOV FZERO, c2 + mov A, A1 + FMOV FZERO, t2 + add A, LDA, A + FMOV FZERO, t3 + cmp I, 0 + FMOV FZERO, t4 + ble %icc, .LL315 + mov XP, X1 + + LDF [A1 + 0 * SIZE], a1 + LDF [A1 + 1 * SIZE], a2 + LDF [A1 + 2 * SIZE], a3 + LDF [A1 + 3 * SIZE], a4 + LDF [A1 + 4 * SIZE], a5 + LDF [A1 + 5 * SIZE], a6 + LDF [A1 + 6 * SIZE], a7 + LDF [A1 + 7 * SIZE], a8 + add A1, 8 * SIZE, A1 + + LDF [X1 + 0 * SIZE], a9 + add I, -1, I + LDF [X1 + 1 * SIZE], a10 + cmp I, 0 + LDF [X1 + 2 * SIZE], a11 + LDF [X1 + 3 * SIZE], a12 + LDF [X1 + 4 * SIZE], a13 + LDF [X1 + 5 * SIZE], a14 + LDF [X1 + 6 * SIZE], a15 + LDF [X1 + 7 * SIZE], a16 + ble %icc, .LL312 + add X1, 8 * SIZE, X1 + +.LL311: + prefetch [A1 + PREFETCHSIZE * SIZE], 1 + + FADD c1, t1, c1 + FMUL a1, a9, t1 + LDF [A1 + 0 * SIZE], a1 + LDF [X1 + 0 * SIZE], a9 + + FADD c2, t2, c2 + FMUL a2, a10, t2 + LDF [A1 + 1 * SIZE], a2 + LDF [X1 + 1 * SIZE], a10 + + FADD c1, t3, c1 + add I, -1, I + FMUL a3, a11, t3 + LDF [A1 + 2 * SIZE], a3 + LDF [X1 + 2 * SIZE], a11 + + FADD c2, t4, c2 + cmp I, 0 + FMUL a4, a12, t4 + LDF [A1 + 3 * SIZE], a4 + LDF [X1 + 3 * SIZE], a12 + + FADD c1, t1, c1 + nop + FMUL a5, a13, t1 + LDF [A1 + 4 * SIZE], a5 + LDF [X1 + 4 * SIZE], a13 + + FADD c2, t2, c2 + nop + FMUL a6, a14, t2 + LDF [A1 + 5 * SIZE], a6 + LDF [X1 + 5 * SIZE], a14 + + FADD c1, t3, c1 + FMUL a7, a15, t3 + LDF [A1 + 6 * SIZE], a7 + LDF [X1 + 6 * SIZE], a15 + + FADD c2, t4, c2 + add X1, 8 * SIZE, X1 + FMUL a8, a16, t4 + LDF [A1 + 7 * SIZE], a8 + add A1, 8 * SIZE, A1 + bg,pn %icc, .LL311 + LDF [X1 - 1 * SIZE], a16 + +.LL312: + FADD c1, t1, c1 + FMUL a1, a9, t1 + FADD c2, t2, c2 + FMUL a2, a10, t2 + FADD c1, t3, c1 + FMUL a3, a11, t3 + FADD c2, t4, c2 + FMUL a4, a12, t4 + + FADD c1, t1, c1 + FMUL a5, a13, t1 + FADD c2, t2, c2 + FMUL a6, a14, t2 + FADD c1, t3, c1 + FMUL a7, a15, t3 + FADD c2, t4, c2 + FMUL a8, a16, t4 + +.LL315: + and MIN_M, 7, I + cmp I, 0 + ble %icc, .LL319 + nop + +.LL316: + LDF [A1 + 0 * SIZE], a1 + add A1, 1 * SIZE, A1 + LDF [X1 + 0 * SIZE], b1 + nop + + FADD c1, t1, c1 + nop + add I, -1, I + FMUL a1, b1, t1 + nop + cmp I, 0 + bg,pn %icc, .LL316 + add X1, 1 * SIZE, X1 + +.LL319: + FADD c1, t1, c1 + nop + FADD c2, t2, c2 + nop + FADD c1, t3, c1 + FADD c2, t4, c2 + + FADD c1, c2, c1 + + FMUL ALPHA, c1, c1 + LDF [Y1 + 0 * SIZE], a1 + FADD a1, c1, a1 + STF a1, [Y1 + 0 * SIZE] + add Y1, INCY, Y1 + +.LL400: + add IS, P, IS + cmp IS, M + bl %icc, .LL10 + add A, PNLDA, A + +.LL999: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/ger.S b/kernel/sparc/ger.S new file mode 100644 index 0000000000..84cd525c48 --- /dev/null +++ b/kernel/sparc/ger.S @@ -0,0 +1,464 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %i0 +#define N %i1 + +#if defined(DOUBLE) && !defined(__64BIT__) +#define X %i5 +#define INCX %i2 +#define Y %i3 +#define INCY %i4 +#else +#define X %i4 +#define INCX %i5 +#define Y %i2 +#define INCY %i3 +#endif + +#define A %l0 +#define LDA %l1 +#define BUFFER %l2 + +#define I %l3 +#define J %l4 + +#define A1 %o0 +#define X1 %o2 +#define XX %o3 + +#ifdef DOUBLE +#define t1 %f0 +#define t2 %f2 +#define t3 %f4 +#define t4 %f6 + +#define x1 %f8 +#define x2 %f10 +#define x3 %f12 +#define x4 %f14 +#define x5 %f16 +#define x6 %f18 +#define x7 %f20 +#define x8 %f22 + +#define a1 %f24 +#define a2 %f26 +#define a3 %f28 +#define a4 %f30 +#define a5 %f32 +#define a6 %f34 +#define a7 %f36 +#define a8 %f38 + +#define a9 %f40 +#define a10 %f42 +#define a11 %f44 +#define a12 %f46 +#define a13 %f48 +#define a14 %f50 +#define a15 %f52 +#define a16 %f54 + +#define y1 %f56 +#define y2 %f58 + +#define ALPHA %f60 + +#else +#define t1 %f0 +#define t2 %f1 +#define t3 %f2 +#define t4 %f3 + +#define x1 %f4 +#define x2 %f5 +#define x3 %f6 +#define x4 %f7 +#define x5 %f8 +#define x6 %f9 +#define x7 %f10 +#define x8 %f11 + +#define a1 %f12 +#define a2 %f13 +#define a3 %f14 +#define a4 %f15 +#define a5 %f16 +#define a6 %f17 +#define a7 %f18 +#define a8 %f19 + +#define a9 %f20 +#define a10 %f21 +#define a11 %f22 +#define a12 %f23 +#define a13 %f24 +#define a14 %f25 +#define a15 %f26 +#define a16 %f27 + +#define y1 %f28 +#define y2 %f29 +#define ALPHA %f30 +#endif + +#define PREFETCHSIZE 60 + + PROLOGUE + SAVESP + nop + +#ifndef __64BIT__ + +#ifdef DOUBLE + st %i3, [%sp + STACK_START + 16] + st %i4, [%sp + STACK_START + 20] + + ld [%sp + STACK_START + 28], INCX + ld [%sp + STACK_START + 32], Y + ld [%sp + STACK_START + 
36], INCY + ld [%sp + STACK_START + 40], A + ld [%sp + STACK_START + 44], LDA + ld [%sp + STACK_START + 48], BUFFER +#else + st %i3, [%sp + STACK_START + 16] + + ld [%sp + STACK_START + 28], Y + ld [%sp + STACK_START + 32], INCY + ld [%sp + STACK_START + 36], A + ld [%sp + STACK_START + 40], LDA + ld [%sp + STACK_START + 44], BUFFER +#endif + LDF [%sp + STACK_START + 16], ALPHA +#else + ldx [%sp + STACK_START + 56], Y + ldx [%sp + STACK_START + 64], INCY + ldx [%sp + STACK_START + 72], A + ldx [%sp + STACK_START + 80], LDA + ldx [%sp + STACK_START + 88], BUFFER +#ifdef DOUBLE + FMOV %f6, ALPHA +#else + FMOV %f7, ALPHA +#endif +#endif + + sll LDA, BASE_SHIFT, LDA + + cmp M, 0 + ble %icc, .LL999 + sll INCX, BASE_SHIFT, INCX + cmp N, 0 + ble %icc, .LL999 + sll INCY, BASE_SHIFT, INCY + + cmp INCX, SIZE + be %icc, .LL10 + mov X, XX + + mov BUFFER, XX + mov BUFFER, X1 + + sra M, 3, J + cmp J, 0 + ble,pn %icc, .LL05 + nop + +.LL01: + LDF [X], a1 + add X, INCX, X + LDF [X], a2 + add X, INCX, X + LDF [X], a3 + add X, INCX, X + LDF [X], a4 + add X, INCX, X + LDF [X], a5 + add X, INCX, X + LDF [X], a6 + add X, INCX, X + LDF [X], a7 + add X, INCX, X + LDF [X], a8 + add X, INCX, X + + STF a1, [X1 + 0 * SIZE] + STF a2, [X1 + 1 * SIZE] + STF a3, [X1 + 2 * SIZE] + STF a4, [X1 + 3 * SIZE] + STF a5, [X1 + 4 * SIZE] + STF a6, [X1 + 5 * SIZE] + STF a7, [X1 + 6 * SIZE] + STF a8, [X1 + 7 * SIZE] + + add X1, 8 * SIZE, X1 + + deccc J + bg,pn %icc, .LL01 + nop + +.LL05: + andcc M, 7, J + ble,pn %icc, .LL10 + nop + +.LL06: + LDF [X], a1 + add X, INCX, X + + STF a1, [X1 + 0 * SIZE] + add X1, 1 * SIZE, X1 + + deccc J + bg,pn %icc, .LL06 + nop + +.LL10: + mov N, J + cmp N, 0 + ble,pn %icc, .LL999 + nop + +.LL11: + mov XX, X1 + + mov A, A1 + add A, LDA, A + + LDF [Y], y1 + add Y, INCY, Y + + FMUL ALPHA, y1, y1 + + sra M, 3, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + + LDF [X1 + 0 * SIZE], x1 + LDF [A1 + 0 * SIZE], a1 + LDF [X1 + 1 * SIZE], x2 + LDF [A1 + 1 * SIZE], a2 + LDF [X1 + 2 * SIZE], x3 + LDF [A1 + 2 * SIZE], a3 + LDF [X1 + 3 * SIZE], x4 + LDF [A1 + 3 * SIZE], a4 + + LDF [X1 + 4 * SIZE], x5 + LDF [A1 + 4 * SIZE], a5 + LDF [X1 + 5 * SIZE], x6 + LDF [A1 + 5 * SIZE], a6 + LDF [X1 + 6 * SIZE], x7 + LDF [A1 + 6 * SIZE], a7 + LDF [X1 + 7 * SIZE], x8 + LDF [A1 + 7 * SIZE], a8 + + FMUL x1, y1, t1 + FMUL x2, y1, t2 + FMUL x3, y1, t3 + FMUL x4, y1, t4 + + FADD a1, t1, a1 + FMUL x5, y1, t1 + FADD a2, t2, a2 + FMUL x6, y1, t2 + + deccc I + ble,pn %icc, .LL13 + nop + +.LL12: + prefetch [A1 + PREFETCHSIZE * SIZE], 0 + + FADD a3, t3, a3 + LDF [X1 + 8 * SIZE], x1 + FMUL x7, y1, t3 + LDF [X1 + 9 * SIZE], x2 + FADD a4, t4, a4 + LDF [X1 + 10 * SIZE], x3 + FMUL x8, y1, t4 + LDF [X1 + 11 * SIZE], x4 + + FADD a5, t1, a5 + STF a1, [A1 + 0 * SIZE] + LDF [A1 + 8 * SIZE], a1 + FMUL x1, y1, t1 + STF a2, [A1 + 1 * SIZE] + LDF [A1 + 9 * SIZE], a2 + + FADD a6, t2, a6 + STF a3, [A1 + 2 * SIZE] + LDF [A1 + 10 * SIZE], a3 + FMUL x2, y1, t2 + STF a4, [A1 + 3 * SIZE] + LDF [A1 + 11 * SIZE], a4 + + FADD a7, t3, a7 + LDF [X1 + 12 * SIZE], x5 + FMUL x3, y1, t3 + LDF [X1 + 13 * SIZE], x6 + FADD a8, t4, a8 + LDF [X1 + 14 * SIZE], x7 + FMUL x4, y1, t4 + LDF [X1 + 15 * SIZE], x8 + + FADD a1, t1, a1 + STF a5, [A1 + 4 * SIZE] + deccc I + LDF [A1 + 12 * SIZE], a5 + FMUL x5, y1, t1 + STF a6, [A1 + 5 * SIZE] + LDF [A1 + 13 * SIZE], a6 + FADD a2, t2, a2 + STF a7, [A1 + 6 * SIZE] + LDF [A1 + 14 * SIZE], a7 + FMUL x6, y1, t2 + STF a8, [A1 + 7 * SIZE] + LDF [A1 + 15 * SIZE], a8 + add A1, 8 * SIZE, A1 + + bg,pn %icc, .LL12 + add X1, 8 * SIZE, X1 + +.LL13: + FADD 
a3, t3, a3 + FMUL x7, y1, t3 + FADD a4, t4, a4 + FMUL x8, y1, t4 + + FADD a5, t1, a5 + FADD a6, t2, a6 + FADD a7, t3, a7 + FADD a8, t4, a8 + + STF a1, [A1 + 0 * SIZE] + STF a2, [A1 + 1 * SIZE] + STF a3, [A1 + 2 * SIZE] + STF a4, [A1 + 3 * SIZE] + + STF a5, [A1 + 4 * SIZE] + STF a6, [A1 + 5 * SIZE] + STF a7, [A1 + 6 * SIZE] + STF a8, [A1 + 7 * SIZE] + + add A1, 8 * SIZE, A1 + add X1, 8 * SIZE, X1 + +.LL15: + andcc M, 4, I + ble,pn %icc, .LL16 + nop + + LDF [X1 + 0 * SIZE], x1 + LDF [A1 + 0 * SIZE], a1 + LDF [X1 + 1 * SIZE], x2 + LDF [A1 + 1 * SIZE], a2 + + LDF [X1 + 2 * SIZE], x3 + LDF [A1 + 2 * SIZE], a3 + LDF [X1 + 3 * SIZE], x4 + LDF [A1 + 3 * SIZE], a4 + + FMUL x1, y1, t1 + FMUL x2, y1, t2 + FMUL x3, y1, t3 + FMUL x4, y1, t4 + + FADD a1, t1, a1 + FADD a2, t2, a2 + FADD a3, t3, a3 + FADD a4, t4, a4 + + STF a1, [A1 + 0 * SIZE] + STF a2, [A1 + 1 * SIZE] + STF a3, [A1 + 2 * SIZE] + add X1, 4 * SIZE, X1 + STF a4, [A1 + 3 * SIZE] + add A1, 4 * SIZE, A1 + +.LL16: + andcc M, 2, I + ble,pn %icc, .LL17 + nop + + LDF [X1 + 0 * SIZE], x1 + LDF [X1 + 1 * SIZE], x2 + LDF [A1 + 0 * SIZE], a1 + LDF [A1 + 1 * SIZE], a2 + + FMUL x1, y1, t1 + FMUL x2, y1, t2 + + FADD a1, t1, a1 + FADD a2, t2, a2 + + STF a1, [A1 + 0 * SIZE] + add X1, 2 * SIZE, X1 + STF a2, [A1 + 1 * SIZE] + add A1, 2 * SIZE, A1 + +.LL17: + andcc M, 1, I + ble,pn %icc, .LL19 + nop + + LDF [X1 + 0 * SIZE], x1 + add X1, 1 * SIZE, X1 + + LDF [A1 + 0 * SIZE], a1 + + FMUL x1, y1, t1 + FADD a1, t1, a1 + + STF a1, [A1 + 0 * SIZE] + add A1, 1 * SIZE, A1 + +.LL19: + deccc J + bg %icc, .LL11 + nop + +.LL999: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/iamax.S b/kernel/sparc/iamax.S new file mode 100644 index 0000000000..eb4a1313dd --- /dev/null +++ b/kernel/sparc/iamax.S @@ -0,0 +1,456 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N %i0 +#define X %i1 +#define INCX %i2 +#define I %i3 + +#define v1 %o0 +#define v2 %o1 +#define v3 %o2 +#define v4 %o3 +#define count %o4 + +#ifdef DOUBLE +#define c1 %f0 +#define c2 %f2 +#define c3 %f4 +#define c4 %f6 +#define t1 %f8 +#define t2 %f10 +#define t3 %f12 +#define t4 %f14 + +#define a1 %f16 +#define a2 %f18 +#define a3 %f20 +#define a4 %f22 +#define a5 %f24 +#define a6 %f26 +#define a7 %f28 +#define a8 %f30 +#else +#define c1 %f0 +#define c2 %f1 +#define c3 %f2 +#define c4 %f3 +#define t1 %f4 +#define t2 %f5 +#define t3 %f6 +#define t4 %f7 + +#define a1 %f8 +#define a2 %f9 +#define a3 %f10 +#define a4 %f11 +#define a5 %f12 +#define a6 %f13 +#define a7 %f14 +#define a8 %f15 +#endif + +#ifndef USE_MIN +#define FCMOV FMOVG +#define CMOV movg +#else +#define FCMOV FMOVL +#define CMOV movl +#endif + + PROLOGUE + SAVESP + + FCLR(0) + + cmp N, 0 + ble .LL20 + clr v1 + + cmp INCX, 0 + ble .LL20 + sll INCX, BASE_SHIFT, INCX + + mov 1, v1 + + add N, -1, N + LDF [X], c4 + add X, INCX, X + cmp N, 0 + ble .LL20 + FABS c4, c1 + + FABS c4, c2 + mov 1, v2 + FABS c4, c3 + mov 1, v3 + FABS c4, c4 + mov 1, v4 + mov 2, count + + cmp INCX, SIZE + bne .LL50 + nop + + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + LDF [X + 2 * SIZE], a3 + LDF [X + 3 * SIZE], a4 + + LDF [X + 4 * SIZE], a5 + add I, -1, I + LDF [X + 5 * SIZE], a6 + cmp I, 0 + LDF [X + 6 * SIZE], a7 + LDF [X + 7 * SIZE], a8 + + ble,pt %icc, .LL12 + add X, 8 * SIZE, X + +#define PREFETCHSIZE 40 + +.LL11: + FABS a1, t1 + prefetch [X + PREFETCHSIZE * SIZE], 0 + FABS a2, t2 + LDF [X + 0 * SIZE], a1 + FABS a3, t3 + LDF [X + 1 * SIZE], a2 + FABS a4, t4 + LDF [X + 2 * SIZE], a3 + + FCMP %fcc0, t1, c1 + LDF [X + 3 * SIZE], a4 + FCMP %fcc1, t2, c2 + nop + + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FCMOV %fcc0, t1, c1 + CMOV %fcc0, count, v1 + FCMOV %fcc1, t2, c2 + CMOV %fcc1, count, v2 + FCMOV %fcc2, t3, c3 + CMOV %fcc2, count, v3 + FCMOV %fcc3, t4, c4 + CMOV %fcc3, count, v4 + add count, 4, count + + FABS a5, t1 + LDF [X + 4 * SIZE], a5 + FABS a6, t2 + LDF [X + 5 * SIZE], a6 + FABS a7, t3 + LDF [X + 6 * SIZE], a7 + FABS a8, t4 + LDF [X + 7 * SIZE], a8 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FCMOV %fcc0, t1, c1 + nop + CMOV %fcc0, count, v1 + add I, -1, I + + FCMOV %fcc1, t2, c2 + cmp I, 0 + CMOV %fcc1, count, v2 + add X, 8 * SIZE, X + + FCMOV %fcc2, t3, c3 + CMOV %fcc2, count, v3 + FCMOV %fcc3, t4, c4 + CMOV %fcc3, count, v4 + bg,pt %icc, .LL11 + add count, 4, count + +.LL12: + FABS a1, t1 + FABS a2, t2 + FABS a3, t3 + FABS a4, t4 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FCMOV %fcc0, t1, c1 + CMOV %fcc0, count, v1 + FCMOV %fcc1, t2, c2 + CMOV %fcc1, count, v2 + FCMOV %fcc2, t3, c3 + CMOV %fcc2, count, v3 + FCMOV %fcc3, t4, c4 + CMOV %fcc3, count, v4 + add count, 4, count + + FABS a5, t1 + FABS a6, t2 + FABS a7, t3 + FABS a8, t4 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FCMOV %fcc0, t1, c1 + CMOV %fcc0, count, v1 + FCMOV %fcc1, t2, c2 + CMOV %fcc1, count, v2 
+ FCMOV %fcc2, t3, c3 + CMOV %fcc2, count, v3 + FCMOV %fcc3, t4, c4 + CMOV %fcc3, count, v4 + add count, 4, count + +.LL15: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL19 + nop + +.LL16: + LDF [X + 0 * SIZE], a1 + FABS a1, t1 + FCMP %fcc0, t1, c1 + FCMOV %fcc0, t1, c1 + CMOV %fcc0, count, v1 + add I, -1, I + add count, 1, count + cmp I, 0 + bg,pt %icc, .LL16 + add X, 1 * SIZE, X + +.LL19: + FCMP %fcc0, c2, c1 + add v2, 1, v2 + FCMP %fcc1, c4, c3 + add v3, 2, v3 + add v4, 3, v4 + + FCMOV %fcc0, c2, c1 + CMOV %fcc0, v2, v1 + FCMOV %fcc1, c4, c3 + CMOV %fcc1, v4, v3 + FCMP %fcc0, c3, c1 + CMOV %fcc0, v3, v1 + +.LL20: + mov v1, %i0 + return %i7 + 8 + nop + +.LL50: + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL55 + nop + + LDF [X + 0 * SIZE], a1 + add X, INCX, X + LDF [X + 0 * SIZE], a2 + add X, INCX, X + LDF [X + 0 * SIZE], a3 + add X, INCX, X + LDF [X + 0 * SIZE], a4 + add X, INCX, X + LDF [X + 0 * SIZE], a5 + add X, INCX, X + LDF [X + 0 * SIZE], a6 + add X, INCX, X + add I, -1, I + LDF [X + 0 * SIZE], a7 + cmp I, 0 + add X, INCX, X + LDF [X + 0 * SIZE], a8 + ble,pt %icc, .LL52 + add X, INCX, X + +.LL51: + FABS a1, t1 + LDF [X + 0 * SIZE], a1 + add X, INCX, X + FABS a2, t2 + LDF [X + 0 * SIZE], a2 + add X, INCX, X + FABS a3, t3 + LDF [X + 0 * SIZE], a3 + add X, INCX, X + FABS a4, t4 + LDF [X + 0 * SIZE], a4 + add X, INCX, X + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FCMOV %fcc0, t1, c1 + CMOV %fcc0, count, v1 + FCMOV %fcc1, t2, c2 + CMOV %fcc1, count, v2 + FCMOV %fcc2, t3, c3 + CMOV %fcc2, count, v3 + FCMOV %fcc3, t4, c4 + CMOV %fcc3, count, v4 + add count, 4, count + + FABS a5, t1 + LDF [X + 0 * SIZE], a5 + add X, INCX, X + FABS a6, t2 + LDF [X + 0 * SIZE], a6 + add X, INCX, X + FABS a7, t3 + LDF [X + 0 * SIZE], a7 + add X, INCX, X + FABS a8, t4 + LDF [X + 0 * SIZE], a8 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FCMOV %fcc0, t1, c1 + CMOV %fcc0, count, v1 + add I, -1, I + FCMOV %fcc1, t2, c2 + CMOV %fcc1, count, v2 + cmp I, 0 + FCMOV %fcc2, t3, c3 + CMOV %fcc2, count, v3 + FCMOV %fcc3, t4, c4 + CMOV %fcc3, count, v4 + add count, 4, count + + bg,pt %icc, .LL51 + add X, INCX, X + +.LL52: + FABS a1, t1 + FABS a2, t2 + FABS a3, t3 + FABS a4, t4 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FCMOV %fcc0, t1, c1 + CMOV %fcc0, count, v1 + FCMOV %fcc1, t2, c2 + CMOV %fcc1, count, v2 + FCMOV %fcc2, t3, c3 + CMOV %fcc2, count, v3 + FCMOV %fcc3, t4, c4 + CMOV %fcc3, count, v4 + add count, 4, count + + FABS a5, t1 + FABS a6, t2 + FABS a7, t3 + FABS a8, t4 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FCMOV %fcc0, t1, c1 + CMOV %fcc0, count, v1 + FCMOV %fcc1, t2, c2 + CMOV %fcc1, count, v2 + FCMOV %fcc2, t3, c3 + CMOV %fcc2, count, v3 + FCMOV %fcc3, t4, c4 + CMOV %fcc3, count, v4 + add count, 4, count + +.LL55: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + LDF [X + 0 * SIZE], a1 + FABS a1, t1 + FCMP %fcc0, t1, c1 + FCMOV %fcc0, t1, c1 + CMOV %fcc0, count, v1 + add I, -1, I + add count, 1, count + cmp I, 0 + bg,pt %icc, .LL56 + add X, INCX, X + +.LL59: + FCMP %fcc0, c2, c1 + add v2, 1, v2 + FCMP %fcc1, c4, c3 + add v3, 2, v3 + add v4, 3, v4 + + FCMOV %fcc0, c2, c1 + CMOV %fcc0, v2, v1 + FCMOV %fcc1, c4, c3 + CMOV %fcc1, v4, v3 + FCMP %fcc0, c3, c1 + CMOV %fcc0, v3, v1 + + mov v1, %i0 + return %i7 + 8 + nop + + EPILOGUE diff --git a/kernel/sparc/imax.S b/kernel/sparc/imax.S new file mode 100644 index 
0000000000..c24e18252a --- /dev/null +++ b/kernel/sparc/imax.S @@ -0,0 +1,419 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N %i0 +#define X %i1 +#define INCX %i2 +#define I %i3 + +#define v1 %o0 +#define v2 %o1 +#define v3 %o2 +#define v4 %o3 +#define count %o4 + +#ifdef DOUBLE +#define c1 %f0 +#define c2 %f2 +#define c3 %f4 +#define c4 %f6 +#define t1 %f8 +#define t2 %f10 +#define t3 %f12 +#define t4 %f14 + +#define a1 %f16 +#define a2 %f18 +#define a3 %f20 +#define a4 %f22 +#define a5 %f24 +#define a6 %f26 +#define a7 %f28 +#define a8 %f30 +#else +#define c1 %f0 +#define c2 %f1 +#define c3 %f2 +#define c4 %f3 +#define t1 %f4 +#define t2 %f5 +#define t3 %f6 +#define t4 %f7 + +#define a1 %f8 +#define a2 %f9 +#define a3 %f10 +#define a4 %f11 +#define a5 %f12 +#define a6 %f13 +#define a7 %f14 +#define a8 %f15 +#endif + +#ifndef USE_MIN +#define FCMOV FMOVG +#define CMOV movg +#else +#define FCMOV FMOVL +#define CMOV movl +#endif + + PROLOGUE + SAVESP + + FCLR(0) + + cmp N, 0 + ble .LL20 + clr v1 + + cmp INCX, 0 + ble .LL20 + sll INCX, BASE_SHIFT, INCX + + mov 1, v1 + + add N, -1, N + LDF [X], c1 + add X, INCX, X + cmp N, 0 + ble .LL20 + nop + + FMOV c1, c2 + mov 1, v2 + FMOV c1, c3 + mov 1, v3 + FMOV c1, c4 + mov 1, v4 + mov 2, count + + cmp INCX, SIZE + bne .LL50 + nop + + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + LDF [X + 2 * SIZE], a3 + LDF [X + 3 * SIZE], a4 + + LDF [X + 4 * SIZE], a5 + LDF [X + 5 * SIZE], a6 + LDF [X + 6 * SIZE], a7 + LDF [X + 7 * SIZE], a8 + add X, 8 * SIZE, X + + add I, -1, I + cmp I, 0 + ble,pt %icc, .LL12 + nop + +#define PREFETCHSIZE 40 + +.LL11: + FCMP %fcc0, a1, c1 + FCMP %fcc1, a2, c2 + FCMP %fcc2, a3, c3 + FCMP %fcc3, a4, c4 + + FCMOV %fcc0, a1, c1 + CMOV %fcc0, count, v1 + LDF [X + 0 * SIZE], a1 + FCMOV %fcc1, a2, c2 + CMOV %fcc1, count, v2 + LDF [X + 1 * SIZE], a2 + FCMOV %fcc2, a3, c3 + CMOV %fcc2, count, v3 + LDF [X + 2 * SIZE], a3 + FCMOV %fcc3, a4, c4 + CMOV %fcc3, count, v4 + LDF [X + 3 * SIZE], a4 + add count, 4, count + + FCMP %fcc0, a5, c1 + FCMP %fcc1, a6, c2 + FCMP %fcc2, a7, c3 + FCMP %fcc3, a8, c4 + + FCMOV %fcc0, a5, c1 + CMOV %fcc0, count, v1 + LDF [X + 4 * SIZE], a5 + add I, -1, I + FCMOV %fcc1, a6, c2 + CMOV %fcc1, count, v2 + LDF [X + 5 * SIZE], a6 + cmp I, 0 + FCMOV %fcc2, a7, c3 + CMOV %fcc2, count, v3 + LDF [X + 6 * SIZE], a7 + FCMOV %fcc3, a8, c4 + CMOV %fcc3, count, v4 + LDF [X + 7 * SIZE], a8 + add count, 4, count + + bg,pt %icc, .LL11 + add X, 8 * SIZE, X + +.LL12: + FCMP %fcc0, a1, c1 + FCMP %fcc1, a2, c2 + FCMP %fcc2, a3, c3 + FCMP %fcc3, a4, c4 + + FCMOV %fcc0, a1, c1 + CMOV %fcc0, count, v1 + FCMOV %fcc1, a2, c2 + CMOV %fcc1, count, v2 + FCMOV %fcc2, a3, c3 + CMOV %fcc2, count, v3 + FCMOV %fcc3, a4, c4 + CMOV %fcc3, count, v4 + add count, 4, count + + FCMP %fcc0, a5, c1 + FCMP %fcc1, a6, c2 + FCMP %fcc2, a7, c3 + FCMP %fcc3, a8, c4 + + FCMOV %fcc0, a5, c1 + CMOV %fcc0, count, v1 + FCMOV %fcc1, a6, c2 + CMOV %fcc1, count, v2 + FCMOV %fcc2, a7, c3 + CMOV %fcc2, count, v3 + FCMOV %fcc3, a8, c4 + CMOV %fcc3, count, v4 + add count, 4, count + +.LL15: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL19 + nop + +.LL16: + LDF [X + 0 * SIZE], a1 + FCMP %fcc0, a1, c1 + FCMOV %fcc0, a1, c1 + CMOV %fcc0, count, v1 + add I, -1, I + cmp I, 0 + add count, 1, count + bg,pt %icc, .LL16 + add X, 1 * SIZE, X + +.LL19: + FCMP %fcc0, c2, c1 + add v2, 1, v2 + FCMP %fcc1, c4, c3 + add v3, 2, v3 + add v4, 3, v4 + + FCMOV %fcc0, c2, c1 + CMOV %fcc0, v2, v1 + FCMOV %fcc1, c4, c3 + CMOV %fcc1, 
v4, v3 + FCMP %fcc0, c3, c1 + CMOV %fcc0, v3, v1 + +.LL20: + mov v1, %i0 + return %i7 + 8 + nop + + +.LL50: + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL55 + nop + + LDF [X + 0 * SIZE], a1 + add X, INCX, X + LDF [X + 0 * SIZE], a2 + add X, INCX, X + LDF [X + 0 * SIZE], a3 + add X, INCX, X + LDF [X + 0 * SIZE], a4 + add X, INCX, X + LDF [X + 0 * SIZE], a5 + add X, INCX, X + LDF [X + 0 * SIZE], a6 + add X, INCX, X + add I, -1, I + LDF [X + 0 * SIZE], a7 + cmp I, 0 + add X, INCX, X + LDF [X + 0 * SIZE], a8 + + ble,pt %icc, .LL52 + add X, INCX, X + +.LL51: + FCMP %fcc0, a1, c1 + FCMP %fcc1, a2, c2 + FCMP %fcc2, a3, c3 + FCMP %fcc3, a4, c4 + + FCMOV %fcc0, a1, c1 + CMOV %fcc0, count, v1 + LDF [X + 0 * SIZE], a1 + add X, INCX, X + + FCMOV %fcc1, a2, c2 + CMOV %fcc1, count, v2 + LDF [X + 0 * SIZE], a2 + add X, INCX, X + + FCMOV %fcc2, a3, c3 + CMOV %fcc2, count, v3 + LDF [X + 0 * SIZE], a3 + add X, INCX, X + + FCMOV %fcc3, a4, c4 + CMOV %fcc3, count, v4 + LDF [X + 0 * SIZE], a4 + add X, INCX, X + add count, 4, count + + FCMP %fcc0, a5, c1 + FCMP %fcc1, a6, c2 + FCMP %fcc2, a7, c3 + FCMP %fcc3, a8, c4 + + FCMOV %fcc0, a5, c1 + CMOV %fcc0, count, v1 + LDF [X + 0 * SIZE], a5 + add X, INCX, X + + FCMOV %fcc1, a6, c2 + add I, -1, I + CMOV %fcc1, count, v2 + LDF [X + 0 * SIZE], a6 + add X, INCX, X + + FCMOV %fcc2, a7, c3 + CMOV %fcc2, count, v3 + LDF [X + 0 * SIZE], a7 + add X, INCX, X + + cmp I, 0 + FCMOV %fcc3, a8, c4 + CMOV %fcc3, count, v4 + LDF [X + 0 * SIZE], a8 + add count, 4, count + + bg,pt %icc, .LL51 + add X, INCX, X + +.LL52: + FCMP %fcc0, a1, c1 + FCMP %fcc1, a2, c2 + FCMP %fcc2, a3, c3 + FCMP %fcc3, a4, c4 + + FCMOV %fcc0, a1, c1 + CMOV %fcc0, count, v1 + FCMOV %fcc1, a2, c2 + CMOV %fcc1, count, v2 + FCMOV %fcc2, a3, c3 + CMOV %fcc2, count, v3 + FCMOV %fcc3, a4, c4 + CMOV %fcc3, count, v4 + add count, 4, count + + FCMP %fcc0, a5, c1 + FCMP %fcc1, a6, c2 + FCMP %fcc2, a7, c3 + FCMP %fcc3, a8, c4 + + FCMOV %fcc0, a5, c1 + CMOV %fcc0, count, v1 + FCMOV %fcc1, a6, c2 + CMOV %fcc1, count, v2 + FCMOV %fcc2, a7, c3 + CMOV %fcc2, count, v3 + FCMOV %fcc3, a8, c4 + CMOV %fcc3, count, v4 + add count, 4, count + +.LL55: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + LDF [X + 0 * SIZE], a1 + FCMP %fcc0, a1, c1 + FCMOV %fcc0, a1, c1 + CMOV %fcc0, count, v1 + add I, -1, I + add count, 1, count + cmp I, 0 + bg,pt %icc, .LL56 + add X, INCX, X + +.LL59: + FCMP %fcc0, c2, c1 + add v2, 1, v2 + FCMP %fcc1, c4, c3 + add v3, 2, v3 + add v4, 3, v4 + + FCMOV %fcc0, c2, c1 + CMOV %fcc0, v2, v1 + FCMOV %fcc1, c4, c3 + CMOV %fcc1, v4, v3 + FCMP %fcc0, c3, c1 + CMOV %fcc0, v3, v1 + + mov v1, %i0 + return %i7 + 8 + nop + + + EPILOGUE diff --git a/kernel/sparc/izamax.S b/kernel/sparc/izamax.S new file mode 100644 index 0000000000..3d0a48e84c --- /dev/null +++ b/kernel/sparc/izamax.S @@ -0,0 +1,425 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N %i0 +#define X %i1 +#define INCX %i2 +#define I %i3 + +#define v1 %o0 +#define v2 %o1 +#define v3 %o2 +#define v4 %o3 +#define count %o4 + +#ifdef DOUBLE +#define c1 %f0 +#define c2 %f2 +#define c3 %f4 +#define c4 %f6 +#define t1 %f8 +#define t2 %f10 +#define t3 %f12 +#define t4 %f14 +#define t5 %f16 +#define t6 %f18 +#define t7 %f20 +#define t8 %f22 + +#define a1 %f24 +#define a2 %f26 +#define a3 %f28 +#define a4 %f30 +#define a5 %f32 +#define a6 %f34 +#define a7 %f36 +#define a8 %f38 +#else +#define c1 %f0 +#define c2 %f1 +#define c3 %f2 +#define c4 %f3 +#define t1 %f4 +#define t2 %f5 +#define t3 %f6 +#define t4 %f7 +#define t5 %f8 +#define t6 %f9 +#define t7 %f10 +#define t8 %f11 + +#define a1 %f12 +#define a2 %f13 +#define a3 %f14 +#define a4 %f15 +#define a5 %f16 +#define a6 %f17 +#define a7 %f18 +#define a8 %f19 +#endif + +#ifndef USE_MIN +#define FCMOV FMOVG +#define CMOV movg +#else +#define FCMOV FMOVL +#define CMOV movl +#endif + + + PROLOGUE + SAVESP + + FCLR(0) + + cmp N, 0 + ble .LL20 + clr v1 + + cmp INCX, 0 + ble .LL20 + sll INCX, ZBASE_SHIFT, INCX + + mov 1, v1 + + LDF [X + 0 * SIZE], c1 + LDF [X + 1 * SIZE], c2 + add N, -1, N + FABS c1, c1 + add X, INCX, X + FABS c2, c2 + cmp N, 0 + ble .LL20 + FADD c1, c2, c1 + + FMOV c1, c2 + mov 1, v2 + FMOV c1, c3 + mov 1, v3 + FMOV c1, c4 + mov 1, v4 + mov 2, count + + cmp INCX, 2 * SIZE + bne .LL50 + nop + + sra N, 2, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + LDF [X + 2 * SIZE], a3 + LDF [X + 3 * SIZE], a4 + + LDF [X + 4 * SIZE], a5 + add I, -1, I + LDF [X + 5 * SIZE], a6 + cmp I, 0 + LDF [X + 6 * SIZE], a7 + LDF [X + 7 * SIZE], a8 + + ble,pt %icc, .LL12 + add X, 8 * SIZE, X + +#define PREFETCHSIZE 32 + +.LL11: + prefetch [X + PREFETCHSIZE * SIZE], 0 + + FABS a1, t1 + LDF [X + 0 * SIZE], a1 + FABS a2, t2 + LDF [X + 1 * SIZE], a2 + FABS a3, t3 + LDF [X + 2 * SIZE], a3 + FABS a4, t4 + LDF [X + 3 * SIZE], a4 + + FABS a5, t5 + LDF [X + 4 * SIZE], a5 + FABS a6, t6 + LDF [X + 5 * SIZE], a6 + FABS a7, t7 + LDF [X + 6 * SIZE], a7 + 
FABS a8, t8 + LDF [X + 7 * SIZE], a8 + + FADD t1, t2, t1 + FADD t3, t4, t3 + FADD t5, t6, t5 + FADD t7, t8, t7 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t3, c2 + FCMP %fcc2, t5, c3 + FCMP %fcc3, t7, c4 + + FCMOV %fcc0, t1, c1 + CMOV %fcc0, count, v1 + add I, -1, I + FCMOV %fcc1, t3, c2 + CMOV %fcc1, count, v2 + cmp I, 0 + FCMOV %fcc2, t5, c3 + CMOV %fcc2, count, v3 + FCMOV %fcc3, t7, c4 + CMOV %fcc3, count, v4 + add count, 4, count + + bg,pt %icc, .LL11 + add X, 8 * SIZE, X + +.LL12: + FABS a1, t1 + FABS a2, t2 + FABS a3, t3 + FABS a4, t4 + + FABS a5, t5 + FABS a6, t6 + FABS a7, t7 + FABS a8, t8 + + FADD t1, t2, t1 + FADD t3, t4, t3 + FADD t5, t6, t5 + FADD t7, t8, t7 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t3, c2 + FCMP %fcc2, t5, c3 + FCMP %fcc3, t7, c4 + + FCMOV %fcc0, t1, c1 + CMOV %fcc0, count, v1 + FCMOV %fcc1, t3, c2 + CMOV %fcc1, count, v2 + FCMOV %fcc2, t5, c3 + CMOV %fcc2, count, v3 + FCMOV %fcc3, t7, c4 + CMOV %fcc3, count, v4 + add count, 4, count + +.LL15: + and N, 3, I + cmp I, 0 + ble,a,pn %icc, .LL19 + nop + +.LL16: + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + + FABS a1, t1 + FABS a2, t2 + FADD t1, t2, t1 + FCMP %fcc0, t1, c1 + FCMOV %fcc0, t1, c1 + CMOV %fcc0, count, v1 + add count, 1, count + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL16 + add X, 2 * SIZE, X + +.LL19: + FCMP %fcc0, c2, c1 + add v2, 1, v2 + FCMP %fcc1, c4, c3 + add v3, 2, v3 + add v4, 3, v4 + + FCMOV %fcc0, c2, c1 + CMOV %fcc0, v2, v1 + FCMOV %fcc1, c4, c3 + CMOV %fcc1, v4, v3 + FCMP %fcc0, c3, c1 + CMOV %fcc0, v3, v1 + +.LL20: + mov v1, %i0 + return %i7 + 8 + nop + +.LL50: + sra N, 2, I + cmp I, 0 + ble,pn %icc, .LL55 + nop + + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + add X, INCX, X + LDF [X + 0 * SIZE], a3 + LDF [X + 1 * SIZE], a4 + add X, INCX, X + LDF [X + 0 * SIZE], a5 + LDF [X + 1 * SIZE], a6 + add X, INCX, X + add I, -1, I + LDF [X + 0 * SIZE], a7 + cmp I, 0 + LDF [X + 1 * SIZE], a8 + ble,pt %icc, .LL52 + add X, INCX, X + +.LL51: + FABS a1, t1 + LDF [X + 0 * SIZE], a1 + FABS a2, t2 + LDF [X + 1 * SIZE], a2 + add X, INCX, X + FABS a3, t3 + LDF [X + 0 * SIZE], a3 + FABS a4, t4 + LDF [X + 1 * SIZE], a4 + add X, INCX, X + + FABS a5, t5 + LDF [X + 0 * SIZE], a5 + FABS a6, t6 + LDF [X + 1 * SIZE], a6 + add X, INCX, X + FABS a7, t7 + LDF [X + 0 * SIZE], a7 + FABS a8, t8 + LDF [X + 1 * SIZE], a8 + + FADD t1, t2, t1 + FADD t3, t4, t3 + FADD t5, t6, t5 + FADD t7, t8, t7 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t3, c2 + FCMP %fcc2, t5, c3 + FCMP %fcc3, t7, c4 + + FCMOV %fcc0, t1, c1 + CMOV %fcc0, count, v1 + add I, -1, I + FCMOV %fcc1, t3, c2 + CMOV %fcc1, count, v2 + cmp I, 0 + FCMOV %fcc2, t5, c3 + CMOV %fcc2, count, v3 + FCMOV %fcc3, t7, c4 + CMOV %fcc3, count, v4 + add count, 4, count + + bg,pt %icc, .LL51 + add X, INCX, X + +.LL52: + FABS a1, t1 + FABS a2, t2 + FABS a3, t3 + FABS a4, t4 + + FABS a5, t5 + FABS a6, t6 + FABS a7, t7 + FABS a8, t8 + + FADD t1, t2, t1 + FADD t3, t4, t3 + FADD t5, t6, t5 + FADD t7, t8, t7 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t3, c2 + FCMP %fcc2, t5, c3 + FCMP %fcc3, t7, c4 + + FCMOV %fcc0, t1, c1 + CMOV %fcc0, count, v1 + FCMOV %fcc1, t3, c2 + CMOV %fcc1, count, v2 + FCMOV %fcc2, t5, c3 + CMOV %fcc2, count, v3 + FCMOV %fcc3, t7, c4 + CMOV %fcc3, count, v4 + add count, 4, count + +.LL55: + and N, 3, I + cmp I, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + + FABS a1, t1 + add I, -1, I + FABS a2, t2 + cmp I, 0 + FADD t1, t2, t1 + FCMP %fcc0, t1, c1 + FCMOV %fcc0, t1, c1 + CMOV %fcc0, count, v1 + add count, 1, count + bg,pt %icc, 
.LL56 + add X, INCX, X + +.LL59: + FCMP %fcc0, c2, c1 + add v2, 1, v2 + FCMP %fcc1, c4, c3 + add v3, 2, v3 + add v4, 3, v4 + + FCMOV %fcc0, c2, c1 + CMOV %fcc0, v2, v1 + FCMOV %fcc1, c4, c3 + CMOV %fcc1, v4, v3 + FCMP %fcc0, c3, c1 + CMOV %fcc0, v3, v1 + + mov v1, %i0 + return %i7 + 8 + nop + + EPILOGUE diff --git a/kernel/sparc/lsame.S b/kernel/sparc/lsame.S new file mode 100644 index 0000000000..778301fab8 --- /dev/null +++ b/kernel/sparc/lsame.S @@ -0,0 +1,66 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define A %o0 +#define B %o1 +#define AA %o4 +#define BB %o3 + + PROLOGUE + + ldub [A], A + ldub [B], B + add A, -32, AA + add B, -32, BB + + cmp A, 96 + movge %icc, AA, A + + cmp B, 96 + movge %icc, BB, B + + clr %g1 + cmp A, B + move %icc, 1, %g1 + retl + mov %g1, %o0 + + EPILOGUE diff --git a/kernel/sparc/max.S b/kernel/sparc/max.S new file mode 100644 index 0000000000..1a4bc44632 --- /dev/null +++ b/kernel/sparc/max.S @@ -0,0 +1,339 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N %i0 +#define X %i1 +#define INCX %i2 +#define I %i3 + +#ifdef DOUBLE +#define c1 %f0 +#define c2 %f2 +#define c3 %f4 +#define c4 %f6 +#define t1 %f8 +#define t2 %f10 +#define t3 %f12 +#define t4 %f14 + +#define a1 %f16 +#define a2 %f18 +#define a3 %f20 +#define a4 %f22 +#define a5 %f24 +#define a6 %f26 +#define a7 %f28 +#define a8 %f30 +#else +#define c1 %f0 +#define c2 %f1 +#define c3 %f2 +#define c4 %f3 +#define t1 %f4 +#define t2 %f5 +#define t3 %f6 +#define t4 %f7 + +#define a1 %f8 +#define a2 %f9 +#define a3 %f10 +#define a4 %f11 +#define a5 %f12 +#define a6 %f13 +#define a7 %f14 +#define a8 %f15 +#endif + +#ifndef USE_MIN +#define FCMOV FMOVG +#else +#define FCMOV FMOVL +#endif + + PROLOGUE + SAVESP + + FCLR(0) + + cmp N, 0 + ble .LL20 + nop + + cmp INCX, 0 + ble .LL20 + sll INCX, BASE_SHIFT, INCX + + add N, -1, N + LDF [X], c1 + add X, INCX, X + cmp N, 0 + ble .LL20 + nop + + FMOV c1, c2 + FMOV c1, c3 + FMOV c1, c4 + + cmp INCX, SIZE + bne .LL50 + nop + + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + LDF [X + 2 * SIZE], a3 + LDF [X + 3 * SIZE], a4 + + LDF [X + 4 * SIZE], a5 + LDF [X + 5 * SIZE], a6 + LDF [X + 6 * SIZE], a7 + LDF [X + 7 * SIZE], a8 + add X, 8 * SIZE, X + + add I, -1, I + cmp I, 0 + ble,pt %icc, .LL12 + nop + +#define PREFETCHSIZE 40 + +.LL11: + FCMP %fcc0, a1, c1 + FCMP %fcc1, a2, c2 + FCMP %fcc2, a3, c3 + FCMP %fcc3, a4, c4 + + FCMOV %fcc0, a1, c1 + LDF [X + 0 * SIZE], a1 + FCMOV %fcc1, a2, c2 + LDF [X + 1 * SIZE], a2 + FCMOV %fcc2, a3, c3 + LDF [X + 2 * SIZE], a3 + FCMOV %fcc3, a4, c4 + LDF [X + 3 * SIZE], a4 + + FCMP %fcc0, a5, c1 + FCMP %fcc1, a6, c2 + FCMP %fcc2, a7, c3 + FCMP %fcc3, a8, c4 + + FCMOV %fcc0, a5, c1 + LDF [X + 4 * SIZE], a5 + add I, -1, I + FCMOV %fcc1, a6, c2 + LDF [X + 5 * SIZE], a6 + cmp I, 0 + FCMOV %fcc2, a7, c3 + LDF [X + 6 * SIZE], a7 + FCMOV %fcc3, a8, c4 + LDF [X + 7 * SIZE], a8 + + bg,pt %icc, .LL11 + add X, 8 * SIZE, X + +.LL12: + FCMP %fcc0, a1, c1 + FCMP %fcc1, a2, c2 + FCMP %fcc2, a3, c3 + FCMP %fcc3, a4, c4 + + FCMOV 
%fcc0, a1, c1 + FCMOV %fcc1, a2, c2 + FCMOV %fcc2, a3, c3 + FCMOV %fcc3, a4, c4 + + FCMP %fcc0, a5, c1 + FCMP %fcc1, a6, c2 + FCMP %fcc2, a7, c3 + FCMP %fcc3, a8, c4 + + FCMOV %fcc0, a5, c1 + FCMOV %fcc1, a6, c2 + FCMOV %fcc2, a7, c3 + FCMOV %fcc3, a8, c4 + +.LL15: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL19 + nop + +.LL16: + LDF [X + 0 * SIZE], a1 + FCMP %fcc0, a1, c1 + FCMOV %fcc0, a1, c1 + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL16 + add X, 1 * SIZE, X + +.LL19: + FCMP %fcc0, c2, c1 + FCMP %fcc1, c4, c3 + FCMOV %fcc0, c2, c1 + FCMOV %fcc1, c4, c3 + FCMP %fcc0, c3, c1 + FCMOV %fcc0, c3, c1 + +.LL20: + return %i7 + 8 + clr %g0 + +.LL50: + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL55 + nop + + LDF [X + 0 * SIZE], a1 + add X, INCX, X + LDF [X + 0 * SIZE], a2 + add X, INCX, X + LDF [X + 0 * SIZE], a3 + add X, INCX, X + LDF [X + 0 * SIZE], a4 + add X, INCX, X + LDF [X + 0 * SIZE], a5 + add X, INCX, X + LDF [X + 0 * SIZE], a6 + add X, INCX, X + add I, -1, I + LDF [X + 0 * SIZE], a7 + cmp I, 0 + add X, INCX, X + LDF [X + 0 * SIZE], a8 + + ble,pt %icc, .LL52 + add X, INCX, X + +.LL51: + FCMP %fcc0, a1, c1 + FCMP %fcc1, a2, c2 + FCMP %fcc2, a3, c3 + FCMP %fcc3, a4, c4 + + FCMOV %fcc0, a1, c1 + LDF [X + 0 * SIZE], a1 + add X, INCX, X + FCMOV %fcc1, a2, c2 + LDF [X + 0 * SIZE], a2 + add X, INCX, X + FCMOV %fcc2, a3, c3 + LDF [X + 0 * SIZE], a3 + add X, INCX, X + FCMOV %fcc3, a4, c4 + LDF [X + 0 * SIZE], a4 + add X, INCX, X + + FCMP %fcc0, a5, c1 + add I, -1, I + FCMP %fcc1, a6, c2 + cmp I, 0 + FCMP %fcc2, a7, c3 + FCMP %fcc3, a8, c4 + + FCMOV %fcc0, a5, c1 + LDF [X + 0 * SIZE], a5 + add X, INCX, X + FCMOV %fcc1, a6, c2 + LDF [X + 0 * SIZE], a6 + add X, INCX, X + FCMOV %fcc2, a7, c3 + LDF [X + 0 * SIZE], a7 + add X, INCX, X + FCMOV %fcc3, a8, c4 + LDF [X + 0 * SIZE], a8 + + bg,pt %icc, .LL51 + add X, INCX, X + +.LL52: + FCMP %fcc0, a1, c1 + FCMP %fcc1, a2, c2 + FCMP %fcc2, a3, c3 + FCMP %fcc3, a4, c4 + + FCMOV %fcc0, a1, c1 + FCMOV %fcc1, a2, c2 + FCMOV %fcc2, a3, c3 + FCMOV %fcc3, a4, c4 + + FCMP %fcc0, a5, c1 + FCMP %fcc1, a6, c2 + FCMP %fcc2, a7, c3 + FCMP %fcc3, a8, c4 + + FCMOV %fcc0, a5, c1 + FCMOV %fcc1, a6, c2 + FCMOV %fcc2, a7, c3 + FCMOV %fcc3, a8, c4 + +.LL55: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + LDF [X + 0 * SIZE], a1 + FCMP %fcc0, a1, c1 + FCMOV %fcc0, a1, c1 + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL56 + add X, INCX, X + +.LL59: + FCMP %fcc0, c2, c1 + FCMP %fcc1, c4, c3 + FCMOV %fcc0, c2, c1 + FCMOV %fcc1, c4, c3 + FCMP %fcc0, c3, c1 + FCMOV %fcc0, c3, c1 + + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/rot.S b/kernel/sparc/rot.S new file mode 100644 index 0000000000..f5c5770470 --- /dev/null +++ b/kernel/sparc/rot.S @@ -0,0 +1,668 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
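The max.S kernel that finishes just above returns the plain maximum of a strided vector (or the minimum when USE_MIN is defined, which switches FCMOV from FMOVG to FMOVL); unlike the amax variants it takes no absolute value. A small C sketch of that behaviour, with illustrative names and types:

    /* Plain max (or min under USE_MIN) of n strided elements; the kernel
     * returns 0 for n <= 0 or incx <= 0, matching its early branch to .LL20. */
    static double max_sketch(int n, const double *x, int incx)
    {
        if (n <= 0 || incx <= 0) return 0.0;
        double m = x[0];
        for (int i = 1; i < n; i++) {
            double v = x[i * incx];
            if (v > m) m = v;              /* becomes '<' in the USE_MIN build */
        }
        return m;
    }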
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N %i0 +#define X %i1 +#define INCX %i2 +#define Y %i3 +#define INCY %i4 +#define I %i5 + +#define XX %l0 +#define YY %l1 + +#ifdef DOUBLE +#define a1 %f4 +#define a2 %f6 +#define a3 %f8 +#define a4 %f10 +#define a5 %f12 +#define a6 %f14 +#define a7 %f16 +#define a8 %f18 +#define b1 %f20 +#define b2 %f22 +#define b3 %f24 +#define b4 %f26 +#define b5 %f28 +#define b6 %f30 +#define b7 %f32 +#define b8 %f34 + +#define c1 %f36 +#define c2 %f38 +#define c3 %f40 +#define c4 %f42 +#define c5 %f44 +#define c6 %f46 +#define c7 %f48 +#define c8 %f50 + +#define t1 %f52 +#define t2 %f54 +#define t3 %f56 +#define t4 %f58 +#else +#define a1 %f2 +#define a2 %f3 +#define a3 %f4 +#define a4 %f5 +#define a5 %f6 +#define a6 %f7 +#define a7 %f8 +#define a8 %f9 +#define b1 %f10 +#define b2 %f11 +#define b3 %f12 +#define b4 %f13 +#define b5 %f14 +#define b6 %f15 +#define b7 %f16 +#define b8 %f17 + +#define c1 %f18 +#define c2 %f19 +#define c3 %f20 +#define c4 %f21 +#define c5 %f22 +#define c6 %f23 +#define c7 %f24 +#define c8 %f25 + +#define t1 %f26 +#define t2 %f27 +#define t3 %f28 +#define t4 %f29 +#endif + +#ifdef DOUBLE +#define C %f0 +#define S %f2 +#else +#define C %f0 +#define S %f1 +#endif + + PROLOGUE + SAVESP + +#ifndef __64BIT__ + +#ifdef DOUBLE + st %i5, [%sp + STACK_START + 24] + + LDF [%sp + STACK_START + 24], C + LDF [%sp + STACK_START + 32], S +#else + st %i5, [%sp + STACK_START + 24] + + LDF [%sp + STACK_START + 24], C + LDF [%sp + STACK_START + 28], S +#endif +#else +#ifdef DOUBLE + FMOV %f10, C + FMOV %f12, S +#else + FMOV %f11, C + FMOV %f13, S +#endif +#endif + + cmp N, 0 + ble .LL19 + nop + + sll INCX, BASE_SHIFT, INCX + sll INCY, BASE_SHIFT, INCY + + cmp INCX, SIZE + bne .LL50 + nop + + cmp INCY, SIZE + bne .LL50 + nop + + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + + + LDF [X + 0 * SIZE], a1 + LDF [Y + 0 * SIZE], b1 + LDF [X + 1 * SIZE], a2 + LDF [Y + 1 * SIZE], b2 + LDF [X + 2 * SIZE], a3 + LDF [Y + 2 * SIZE], b3 + LDF [X + 3 * SIZE], a4 + LDF [Y + 3 * SIZE], b4 + + LDF [X + 4 * SIZE], a5 + LDF [Y + 4 * SIZE], b5 + LDF [X + 5 * SIZE], a6 + LDF [Y + 5 * SIZE], b6 + LDF [X + 6 * SIZE], a7 + LDF [Y + 6 * SIZE], b7 + LDF [X + 7 * SIZE], a8 + LDF [Y + 7 * SIZE], b8 + + FMUL C, a1, c1 + FMUL S, b1, c2 + FMUL C, b1, c3 + LDF [Y + 8 * SIZE], b1 + 
FMUL S, a1, c4 + LDF [X + 8 * SIZE], a1 + + FMUL C, a2, c5 + FMUL S, b2, c6 + FADD c1, c2, t1 + + FMUL C, b2, c7 + LDF [Y + 9 * SIZE], b2 + FMUL S, a2, c8 + LDF [X + 9 * SIZE], a2 + FSUB c3, c4, t2 + + addcc I, -1, I + ble,pt %icc, .LL12 + nop + +#define PREFETCHSIZE 64 + +.LL11: + FMUL C, a3, c1 + nop + prefetch [Y + PREFETCHSIZE * SIZE], 1 + nop + + FMUL S, b3, c2 + STF t1, [X + 0 * SIZE] + FADD c5, c6, t3 + nop + + FMUL C, b3, c3 + LDF [Y + 10 * SIZE], b3 + nop + nop + + FMUL S, a3, c4 + STF t2, [Y + 0 * SIZE] + FSUB c7, c8, t4 + nop + + FMUL C, a4, c5 + LDF [X + 10 * SIZE], a3 + nop + nop + + FMUL S, b4, c6 + STF t3, [X + 1 * SIZE] + FADD c1, c2, t1 + nop + + FMUL C, b4, c7 + LDF [Y + 11 * SIZE], b4 + nop + nop + + FMUL S, a4, c8 + STF t4, [Y + 1 * SIZE] + FSUB c3, c4, t2 + nop + + FMUL C, a5, c1 + LDF [X + 11 * SIZE], a4 + nop + nop + + FMUL S, b5, c2 + STF t1, [X + 2 * SIZE] + FADD c5, c6, t3 + nop + + FMUL C, b5, c3 + LDF [Y + 12 * SIZE], b5 + nop + nop + + FMUL S, a5, c4 + STF t2, [Y + 2 * SIZE] + FSUB c7, c8, t4 + nop + + FMUL C, a6, c5 + LDF [X + 12 * SIZE], a5 + nop + nop + + FMUL S, b6, c6 + STF t3, [X + 3 * SIZE] + FADD c1, c2, t1 + nop + + FMUL C, b6, c7 + LDF [Y + 13 * SIZE], b6 + nop + nop + + FMUL S, a6, c8 + STF t4, [Y + 3 * SIZE] + FSUB c3, c4, t2 + nop + + FMUL C, a7, c1 + LDF [X + 13 * SIZE], a6 + nop + nop + + FMUL S, b7, c2 + STF t1, [X + 4 * SIZE] + FADD c5, c6, t3 + nop + + FMUL C, b7, c3 + LDF [Y + 14 * SIZE], b7 + nop + nop + + FMUL S, a7, c4 + STF t2, [Y + 4 * SIZE] + FSUB c7, c8, t4 + nop + + FMUL C, a8, c5 + LDF [X + 14 * SIZE], a7 + nop + nop + + FMUL S, b8, c6 + STF t3, [X + 5 * SIZE] + FADD c1, c2, t1 + nop + + FMUL C, b8, c7 + LDF [Y + 15 * SIZE], b8 + nop + nop + + FMUL S, a8, c8 + STF t4, [Y + 5 * SIZE] + FSUB c3, c4, t2 + nop + + FMUL C, a1, c1 + LDF [X + 15 * SIZE], a8 + addcc I, -1, I + nop + + FMUL S, b1, c2 + STF t1, [X + 6 * SIZE] + FADD c5, c6, t3 + nop + + FMUL C, b1, c3 + LDF [Y + 16 * SIZE], b1 + nop + nop + + FMUL S, a1, c4 + STF t2, [Y + 6 * SIZE] + FSUB c7, c8, t4 + nop + + FMUL C, a2, c5 + LDF [X + 16 * SIZE], a1 + add Y, 8 * SIZE, Y + nop + + FMUL S, b2, c6 + STF t3, [X + 7 * SIZE] + FADD c1, c2, t1 + nop + + FMUL C, b2, c7 + LDF [Y + 9 * SIZE], b2 + add X, 8 * SIZE, X + nop + + FMUL S, a2, c8 + STF t4, [Y - 1 * SIZE] + FSUB c3, c4, t2 + nop + + bg,pt %icc, .LL11 + LDF [X + 9 * SIZE], a2 + + +.LL12: + FMUL C, a3, c1 + FMUL S, b3, c2 + STF t1, [X + 0 * SIZE] + FADD c5, c6, t3 + + FMUL C, b3, c3 + FMUL S, a3, c4 + STF t2, [Y + 0 * SIZE] + FSUB c7, c8, t4 + + + FMUL C, a4, c5 + FMUL S, b4, c6 + STF t3, [X + 1 * SIZE] + FADD c1, c2, t1 + + FMUL C, b4, c7 + FMUL S, a4, c8 + STF t4, [Y + 1 * SIZE] + FSUB c3, c4, t2 + + + FMUL C, a5, c1 + FMUL S, b5, c2 + STF t1, [X + 2 * SIZE] + FADD c5, c6, t3 + + FMUL C, b5, c3 + FMUL S, a5, c4 + STF t2, [Y + 2 * SIZE] + FSUB c7, c8, t4 + + FMUL C, a6, c5 + FMUL S, b6, c6 + STF t3, [X + 3 * SIZE] + FADD c1, c2, t1 + + FMUL C, b6, c7 + FMUL S, a6, c8 + STF t4, [Y + 3 * SIZE] + FSUB c3, c4, t2 + + FMUL C, a7, c1 + FMUL S, b7, c2 + STF t1, [X + 4 * SIZE] + FADD c5, c6, t3 + + FMUL C, b7, c3 + FMUL S, a7, c4 + STF t2, [Y + 4 * SIZE] + FSUB c7, c8, t4 + + FMUL C, a8, c5 + FMUL S, b8, c6 + STF t3, [X + 5 * SIZE] + FADD c1, c2, t1 + + FMUL C, b8, c7 + FMUL S, a8, c8 + STF t4, [Y + 5 * SIZE] + FSUB c3, c4, t2 + + FADD c5, c6, t3 + STF t1, [X + 6 * SIZE] + + FSUB c7, c8, t4 + STF t2, [Y + 6 * SIZE] + + STF t3, [X + 7 * SIZE] + STF t4, [Y + 7 * SIZE] + + add X, 8 * SIZE, X + add Y, 8 * SIZE, Y + + +.LL15: + andcc N, 7, 
I + nop + ble,a,pn %icc, .LL19 + nop + +.LL16: + LDF [X + 0 * SIZE], a1 + add X, 1 * SIZE, X + LDF [Y + 0 * SIZE], b1 + add Y, 1 * SIZE, Y + + FMUL C, a1, c1 + FMUL S, b1, c2 + FMUL C, b1, c3 + FMUL S, a1, c4 + + FADD c1, c2, c2 + addcc I, -1, I + FSUB c3, c4, c4 + nop + + STF c2, [X - 1 * SIZE] + STF c4, [Y - 1 * SIZE] + bg,pt %icc, .LL16 + nop + +.LL19: + return %i7 + 8 + nop + +.LL50: + mov X, XX + mov Y, YY + + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL55 + nop + +.LL51: + LDF [X + 0 * SIZE], a1 + add X, INCX, X + LDF [Y + 0 * SIZE], b1 + add Y, INCY, Y + LDF [X + 0 * SIZE], a2 + add X, INCX, X + LDF [Y + 0 * SIZE], b2 + add Y, INCY, Y + LDF [X + 0 * SIZE], a3 + add X, INCX, X + LDF [Y + 0 * SIZE], b3 + add Y, INCY, Y + LDF [X + 0 * SIZE], a4 + add X, INCX, X + LDF [Y + 0 * SIZE], b4 + add Y, INCY, Y + + LDF [X + 0 * SIZE], a5 + add X, INCX, X + LDF [Y + 0 * SIZE], b5 + add Y, INCY, Y + LDF [X + 0 * SIZE], a6 + add X, INCX, X + LDF [Y + 0 * SIZE], b6 + add Y, INCY, Y + LDF [X + 0 * SIZE], a7 + add X, INCX, X + LDF [Y + 0 * SIZE], b7 + add Y, INCY, Y + LDF [X + 0 * SIZE], a8 + add X, INCX, X + LDF [Y + 0 * SIZE], b8 + add Y, INCY, Y + + FMUL C, a1, c1 + FMUL S, b1, c2 + FMUL C, b1, c3 + FMUL S, a1, c4 + + FADD c1, c2, t1 + FSUB c3, c4, t2 + + STF t1, [XX + 0 * SIZE] + add XX, INCX, XX + STF t2, [YY + 0 * SIZE] + add YY, INCY, YY + + FMUL C, a2, c5 + FMUL S, b2, c6 + FMUL C, b2, c7 + FMUL S, a2, c8 + + FADD c5, c6, t3 + FSUB c7, c8, t4 + + STF t3, [XX + 0 * SIZE] + add XX, INCX, XX + STF t4, [YY + 0 * SIZE] + add YY, INCY, YY + + FMUL C, a3, c1 + FMUL S, b3, c2 + FMUL C, b3, c3 + FMUL S, a3, c4 + + FADD c1, c2, t1 + FSUB c3, c4, t2 + + STF t1, [XX + 0 * SIZE] + add XX, INCX, XX + STF t2, [YY + 0 * SIZE] + add YY, INCY, YY + + FMUL C, a4, c5 + FMUL S, b4, c6 + FMUL C, b4, c7 + FMUL S, a4, c8 + + FADD c5, c6, t3 + FSUB c7, c8, t4 + + STF t3, [XX + 0 * SIZE] + add XX, INCX, XX + STF t4, [YY + 0 * SIZE] + add YY, INCY, YY + + FMUL C, a5, c1 + FMUL S, b5, c2 + FMUL C, b5, c3 + FMUL S, a5, c4 + + FADD c1, c2, t1 + FSUB c3, c4, t2 + + STF t1, [XX + 0 * SIZE] + add XX, INCX, XX + STF t2, [YY + 0 * SIZE] + add YY, INCY, YY + + FMUL C, a6, c5 + FMUL S, b6, c6 + FMUL C, b6, c7 + FMUL S, a6, c8 + + FADD c5, c6, t3 + FSUB c7, c8, t4 + + STF t3, [XX + 0 * SIZE] + add XX, INCX, XX + STF t4, [YY + 0 * SIZE] + add YY, INCY, YY + + FMUL C, a7, c1 + FMUL S, b7, c2 + FMUL C, b7, c3 + FMUL S, a7, c4 + + FADD c1, c2, t1 + FSUB c3, c4, t2 + + STF t1, [XX + 0 * SIZE] + add XX, INCX, XX + STF t2, [YY + 0 * SIZE] + add YY, INCY, YY + + FMUL C, a8, c5 + FMUL S, b8, c6 + FMUL C, b8, c7 + FMUL S, a8, c8 + + FADD c5, c6, t3 + FSUB c7, c8, t4 + + STF t3, [XX + 0 * SIZE] + add XX, INCX, XX + STF t4, [YY + 0 * SIZE] + add YY, INCY, YY + + addcc I, -1, I + bg,pt %icc, .LL51 + nop + + +.LL55: + andcc N, 7, I + nop + ble %icc, .LL59 + nop + +.LL56: + LDF [X + 0 * SIZE], a1 + LDF [Y + 0 * SIZE], b1 + + FMUL C, a1, c1 + FMUL S, b1, c2 + FMUL C, b1, c3 + FMUL S, a1, c4 + + FADD c1, c2, c2 + FSUB c3, c4, c4 + + STF c2, [X + 0 * SIZE] + add X, INCX, X + STF c4, [Y + 0 * SIZE] + addcc I, -1, I + + bg %icc, .LL56 + add Y, INCY, Y + + +.LL59: + return %i7 + 8 + nop + + EPILOGUE diff --git a/kernel/sparc/scal.S b/kernel/sparc/scal.S new file mode 100644 index 0000000000..1414a09304 --- /dev/null +++ b/kernel/sparc/scal.S @@ -0,0 +1,398 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
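rot.S, which ends just above, applies a plane (Givens) rotation: every pair (x[i], y[i]) is replaced by (c*x[i] + s*y[i], c*y[i] - s*x[i]), with C and S read either from the stack or from the floating-point argument registers depending on the ABI. A compact C sketch of the operation, names illustrative:

    /* Plane rotation: x'[i] = c*x[i] + s*y[i], y'[i] = c*y[i] - s*x[i]. */
    static void rot_sketch(int n, double *x, int incx,
                           double *y, int incy, double c, double s)
    {
        for (int i = 0; i < n; i++) {
            double xi = x[i * incx], yi = y[i * incy];
            x[i * incx] = c * xi + s * yi;   /* FADD of the two FMUL products */
            y[i * incy] = c * yi - s * xi;   /* FSUB path in the kernel */
        }
    }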
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N %i0 +#if defined(DOUBLE) && !defined(__64BIT__) +#define X %i5 +#define INCX %i1 +#else +#define X %i4 +#define INCX %i5 +#endif + +#define I %i2 +#define XX %i3 + +#ifdef DOUBLE +#define c1 %f0 +#define c2 %f2 +#define c3 %f4 +#define c4 %f6 +#define c5 %f8 +#define c6 %f10 +#define c7 %f12 +#define c8 %f14 + +#define t1 %f16 +#define t2 %f18 +#define t3 %f20 +#define t4 %f22 +#define t5 %f24 +#define t6 %f26 +#define t7 %f28 +#define t8 %f30 + +#define FZERO %f60 +#define ALPHA %f62 +#else +#define c1 %f0 +#define c2 %f1 +#define c3 %f2 +#define c4 %f3 +#define c5 %f4 +#define c6 %f5 +#define c7 %f6 +#define c8 %f7 + +#define t1 %f8 +#define t2 %f9 +#define t3 %f10 +#define t4 %f11 +#define t5 %f12 +#define t6 %f13 +#define t7 %f14 +#define t8 %f15 + +#define FZERO %f29 +#define ALPHA %f30 +#endif + +#define PREFETCHSIZE 168 + + PROLOGUE + SAVESP + +#ifndef __64BIT__ +#ifdef DOUBLE + st %i3, [%sp + STACK_START + 16] + st %i4, [%sp + STACK_START + 20] + ld [%sp + STACK_START + 28], INCX +#else + st %i3, [%sp + STACK_START + 16] +#endif + + LDF [%sp + STACK_START + 16], ALPHA +#else +#ifdef DOUBLE + FMOV %f6, ALPHA +#else + FMOV %f7, ALPHA +#endif +#endif + + FCLR(29) + + FCMP ALPHA, FZERO + fbne .LL100 + sll INCX, BASE_SHIFT, INCX + + cmp INCX, SIZE + bne .LL50 + nop + + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + +.LL11: + prefetch [X + PREFETCHSIZE * SIZE], 0 + + STF FZERO, [X + 0 * SIZE] + add I, -1, I + STF FZERO, [X + 1 * SIZE] + cmp I, 0 + STF FZERO, [X + 2 * SIZE] + STF FZERO, [X + 3 * SIZE] + STF FZERO, [X + 4 * SIZE] + STF FZERO, [X + 5 * SIZE] + add X, 8 * SIZE, X + STF FZERO, [X - 2 * SIZE] + bg,pt %icc, .LL11 + STF FZERO, [X - 1 * SIZE] + +.LL15: + and N, 7, I + cmp I, 0 + 
ble,a,pn %icc, .LL19 + nop + +.LL16: + STF FZERO, [X + 0 * SIZE] + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL16 + add X, 1 * SIZE, X + +.LL19: + return %i7 + 8 + clr %o0 + +.LL50: + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL55 + nop + +.LL51: + STF FZERO, [X + 0 * SIZE] + add X, INCX, X + add I, -1, I + STF FZERO, [X + 0 * SIZE] + add X, INCX, X + cmp I, 0 + STF FZERO, [X + 0 * SIZE] + add X, INCX, X + STF FZERO, [X + 0 * SIZE] + add X, INCX, X + STF FZERO, [X + 0 * SIZE] + add X, INCX, X + STF FZERO, [X + 0 * SIZE] + add X, INCX, X + STF FZERO, [X + 0 * SIZE] + add X, INCX, X + STF FZERO, [X + 0 * SIZE] + bg,pt %icc, .LL51 + add X, INCX, X + +.LL55: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + STF FZERO, [X + 0 * SIZE] + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL56 + add X, INCX, X + +.LL59: + return %i7 + 8 + clr %o0 + +.LL100: + cmp INCX, SIZE + bne .LL150 + sra N, 3, I + + cmp I, 0 + ble,pn %icc, .LL115 + nop + + LDF [X + 0 * SIZE], c1 + LDF [X + 1 * SIZE], c2 + LDF [X + 2 * SIZE], c3 + LDF [X + 3 * SIZE], c4 + LDF [X + 4 * SIZE], c5 + LDF [X + 5 * SIZE], c6 + LDF [X + 6 * SIZE], c7 + LDF [X + 7 * SIZE], c8 + FMUL ALPHA, c1, t1 + LDF [X + 8 * SIZE], c1 + FMUL ALPHA, c2, t2 + LDF [X + 9 * SIZE], c2 + + deccc I + ble,pt %icc, .LL112 + nop + +.LL111: + prefetch [X + PREFETCHSIZE * SIZE], 0 + deccc I + + FMUL ALPHA, c3, t3 + LDF [X + 10 * SIZE], c3 + nop + STF t1, [X + 0 * SIZE] + + FMUL ALPHA, c4, t4 + LDF [X + 11 * SIZE], c4 + nop + STF t2, [X + 1 * SIZE] + + FMUL ALPHA, c5, t5 + LDF [X + 12 * SIZE], c5 + nop + STF t3, [X + 2 * SIZE] + + FMUL ALPHA, c6, t6 + LDF [X + 13 * SIZE], c6 + nop + STF t4, [X + 3 * SIZE] + + FMUL ALPHA, c7, t7 + LDF [X + 14 * SIZE], c7 + nop + STF t5, [X + 4 * SIZE] + + FMUL ALPHA, c8, t8 + LDF [X + 15 * SIZE], c8 + nop + STF t6, [X + 5 * SIZE] + + FMUL ALPHA, c1, t1 + STF t7, [X + 6 * SIZE] + nop + LDF [X + 16 * SIZE], c1 + + FMUL ALPHA, c2, t2 + STF t8, [X + 7 * SIZE] + nop + LDF [X + 17 * SIZE], c2 + + bg,pt %icc, .LL111 + add X, 8 * SIZE, X + +.LL112: + FMUL ALPHA, c3, t3 + STF t1, [X + 0 * SIZE] + FMUL ALPHA, c4, t4 + STF t2, [X + 1 * SIZE] + + FMUL ALPHA, c5, t5 + STF t3, [X + 2 * SIZE] + FMUL ALPHA, c6, t6 + STF t4, [X + 3 * SIZE] + + FMUL ALPHA, c7, t7 + STF t5, [X + 4 * SIZE] + FMUL ALPHA, c8, t8 + STF t6, [X + 5 * SIZE] + STF t7, [X + 6 * SIZE] + STF t8, [X + 7 * SIZE] + + add X, 8 * SIZE, X + +.LL115: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL119 + nop + +.LL116: + LDF [X + 0 * SIZE], c1 + add I, -1, I + FMUL ALPHA, c1, c1 + cmp I, 0 + STF c1, [X + 0 * SIZE] + bg,pt %icc, .LL116 + add X, 1 * SIZE, X + +.LL119: + return %i7 + 8 + clr %o0 + +.LL150: + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL155 + mov X, XX + +.LL151: + LDF [X + 0 * SIZE], c1 + add X, INCX, X + LDF [X + 0 * SIZE], c2 + add X, INCX, X + LDF [X + 0 * SIZE], c3 + add X, INCX, X + LDF [X + 0 * SIZE], c4 + add X, INCX, X + LDF [X + 0 * SIZE], c5 + FMUL ALPHA, c1, c1 + add X, INCX, X + LDF [X + 0 * SIZE], c6 + FMUL ALPHA, c2, c2 + add X, INCX, X + LDF [X + 0 * SIZE], c7 + FMUL ALPHA, c3, c3 + add X, INCX, X + LDF [X + 0 * SIZE], c8 + FMUL ALPHA, c4, c4 + add X, INCX, X + + STF c1, [XX + 0 * SIZE] + FMUL ALPHA, c5, c5 + add XX, INCX, XX + STF c2, [XX + 0 * SIZE] + FMUL ALPHA, c6, c6 + add XX, INCX, XX + STF c3, [XX + 0 * SIZE] + FMUL ALPHA, c7, c7 + add XX, INCX, XX + STF c4, [XX + 0 * SIZE] + FMUL ALPHA, c8, c8 + add XX, INCX, XX + STF c5, [XX + 0 * SIZE] + add XX, INCX, XX + add I, -1, I + STF c6, [XX + 0 * SIZE] + add XX, INCX, XX + cmp I, 0 + STF c7, [XX + 0 * SIZE] + 
add XX, INCX, XX + STF c8, [XX + 0 * SIZE] + + bg,pt %icc, .LL151 + add XX, INCX, XX + +.LL155: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL159 + nop + +.LL156: + LDF [X + 0 * SIZE], c1 + add I, -1, I + FMUL ALPHA, c1, c1 + cmp I, 0 + STF c1, [X + 0 * SIZE] + bg,pt %icc, .LL156 + add X, INCX, X + +.LL159: + return %i7 + 8 + clr %o0 + + + EPILOGUE diff --git a/kernel/sparc/snrm2.S b/kernel/sparc/snrm2.S new file mode 100644 index 0000000000..a802472594 --- /dev/null +++ b/kernel/sparc/snrm2.S @@ -0,0 +1,334 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
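The scal.S kernel that ends just above computes x := alpha * x over a strided vector, with a dedicated fast path when ALPHA compares equal to zero: that branch (the .LL11/.LL51 loops) simply stores zeros, so NaN or Inf entries of x are overwritten rather than propagated, while the general case starting at .LL100 multiplies in place. A minimal C sketch of both paths, names illustrative:

    /* x := alpha * x; alpha == 0 is handled by storing zeros directly. */
    static void scal_sketch(int n, double alpha, double *x, int incx)
    {
        if (alpha == 0.0) {
            for (int i = 0; i < n; i++) x[i * incx] = 0.0;    /* zero-store path (.LL11/.LL51) */
        } else {
            for (int i = 0; i < n; i++) x[i * incx] *= alpha; /* scaling path (.LL100 onward) */
        }
    }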
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N %i0 +#define X %i1 +#define INCX %i2 +#define I %i3 + +#define c1 %f0 +#define c2 %f2 +#define c3 %f4 +#define c4 %f6 +#define t1 %f8 +#define t2 %f10 +#define t3 %f12 +#define t4 %f14 + +#define a1 %f16 +#define a2 %f18 +#define a3 %f20 +#define a4 %f22 +#define a5 %f24 +#define a6 %f26 +#define a7 %f28 +#define a8 %f30 + + PROLOGUE + SAVESP + + FCLR(0) + + FMOV c1, c2 + FMOV c1, c3 + FMOV c1, c4 + FMOV c1, t1 + FMOV c1, t2 + FMOV c1, t3 + FMOV c1, t4 + + cmp INCX, 0 + ble .LL20 + sll INCX, BASE_SHIFT, INCX + + cmp N, 0 + ble .LL20 + nop + + cmp INCX, SIZE + bne .LL50 + nop + + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + + ld [X + 0 * SIZE], a1 + add I, -1, I + ld [X + 1 * SIZE], a2 + cmp I, 0 + ld [X + 2 * SIZE], a3 + ld [X + 3 * SIZE], a4 + ld [X + 4 * SIZE], a5 + ld [X + 5 * SIZE], a6 + ld [X + 6 * SIZE], a7 + ld [X + 7 * SIZE], a8 + + ble,pt %icc, .LL12 + add X, 8 * SIZE, X + +#define PREFETCHSIZE 40 + +.LL11: + faddd c1, t1, c1 + fsmuld a1, a1, t1 + prefetch [X + PREFETCHSIZE * SIZE], 0 + + faddd c2, t2, c2 + add I, -1, I + fsmuld a2, a2, t2 + ld [X + 0 * SIZE], a1 + + faddd c3, t3, c3 + cmp I, 0 + fsmuld a3, a3, t3 + ld [X + 1 * SIZE], a2 + + faddd c4, t4, c4 + fsmuld a4, a4, t4 + ld [X + 2 * SIZE], a3 + + faddd c1, t1, c1 + fsmuld a5, a5, t1 + ld [X + 3 * SIZE], a4 + + faddd c2, t2, c2 + fsmuld a6, a6, t2 + ld [X + 4 * SIZE], a5 + + faddd c3, t3, c3 + fsmuld a7, a7, t3 + ld [X + 5 * SIZE], a6 + + faddd c4, t4, c4 + ld [X + 6 * SIZE], a7 + fsmuld a8, a8, t4 + add X, 8 * SIZE, X + + bg,pt %icc, .LL11 + ld [X - 1 * SIZE], a8 + +.LL12: + faddd c1, t1, c1 + fsmuld a1, a1, t1 + faddd c2, t2, c2 + fsmuld a2, a2, t2 + + faddd c3, t3, c3 + fsmuld a3, a3, t3 + faddd c4, t4, c4 + fsmuld a4, a4, t4 + + faddd c1, t1, c1 + fsmuld a5, a5, t1 + faddd c2, t2, c2 + fsmuld a6, a6, t2 + + faddd c3, t3, c3 + fsmuld a7, a7, t3 + faddd c4, t4, c4 + fsmuld a8, a8, t4 + +.LL15: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL19 + nop + +.LL16: + ld [X + 0 * SIZE], a1 + + add I, -1, I + cmp I, 0 + faddd c1, t1, c1 + fsmuld a1, a1, t1 + + bg,pt %icc, .LL16 + add X, 1 * SIZE, X + +.LL19: + faddd c1, t1, c1 + faddd c2, t2, c2 + faddd c3, t3, c3 + faddd c4, t4, c4 + + faddd c1, c2, c1 + faddd c3, c4, c3 + faddd c1, c3, c1 + + fsqrtd c1, c1 + +#if !defined(NEED_F2CCONV) || !defined(F_INTERFACE_F2C) + fdtos c1, c1 +#endif +.LL20: + + return %i7 + 8 + clr %g0 + +.LL50: + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL55 + nop + + ld [X + 0 * SIZE], a1 + add X, INCX, X + ld [X + 0 * SIZE], a2 + add X, INCX, X + ld [X + 0 * SIZE], a3 + add X, INCX, X + ld [X + 0 * SIZE], a4 + add X, INCX, X + ld [X + 0 * SIZE], a5 + add X, INCX, X + ld [X + 0 * SIZE], a6 + add X, INCX, X + add I, -1, I + ld [X + 0 * SIZE], a7 + cmp I, 0 + add X, INCX, X + ld [X + 0 * SIZE], a8 + + ble,pt %icc, .LL52 + add X, INCX, X + +.LL51: + faddd c1, t1, c1 + add I, -1, I + fsmuld a1, a1, t1 + ld [X + 0 * SIZE], a1 + add X, INCX, X + + faddd c2, t2, c2 + cmp I, 0 + fsmuld a2, a2, t2 + ld [X + 0 * SIZE], a2 + add X, INCX, X + + faddd c3, t3, c3 + fsmuld a3, a3, t3 + ld [X + 0 * SIZE], a3 + add X, INCX, X + + faddd c4, t4, c4 + fsmuld a4, a4, t4 + ld [X + 0 * SIZE], a4 + add X, INCX, X + + faddd c1, t1, c1 + fsmuld a5, a5, t1 + ld [X + 0 * SIZE], a5 + add X, INCX, X + + faddd c2, t2, c2 + fsmuld a6, a6, t2 + ld [X + 0 * SIZE], a6 + add X, INCX, X + + faddd c3, t3, c3 + fsmuld a7, a7, t3 + ld [X + 0 * SIZE], a7 + add X, 
INCX, X + + faddd c4, t4, c4 + fsmuld a8, a8, t4 + ld [X + 0 * SIZE], a8 + bg,pt %icc, .LL51 + add X, INCX, X + +.LL52: + faddd c1, t1, c1 + fsmuld a1, a1, t1 + faddd c2, t2, c2 + fsmuld a2, a2, t2 + + faddd c3, t3, c3 + fsmuld a3, a3, t3 + faddd c4, t4, c4 + fsmuld a4, a4, t4 + + faddd c1, t1, c1 + fsmuld a5, a5, t1 + faddd c2, t2, c2 + fsmuld a6, a6, t2 + + faddd c3, t3, c3 + fsmuld a7, a7, t3 + faddd c4, t4, c4 + fsmuld a8, a8, t4 + +.LL55: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + ld [X + 0 * SIZE], a1 + add I, -1, I + cmp I, 0 + faddd c1, t1, c1 + fsmuld a1, a1, t1 + bg,pt %icc, .LL56 + add X, INCX, X + +.LL59: + faddd c1, t1, c1 + faddd c2, t2, c2 + faddd c3, t3, c3 + faddd c4, t4, c4 + + faddd c1, c2, c1 + faddd c3, c4, c3 + faddd c1, c3, c1 + + fsqrtd c1, c1 + +#if !defined(NEED_F2CCONV) || !defined(F_INTERFACE_F2C) + fdtos c1, c1 +#endif + + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/staticbuffer.S b/kernel/sparc/staticbuffer.S new file mode 100644 index 0000000000..679ad56fff --- /dev/null +++ b/kernel/sparc/staticbuffer.S @@ -0,0 +1,45 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef ALLOC_STATIC + .align 256 + .comm alloc_area, (NUM_BUFFERS * BUFFER_SIZE), 4096 +#endif diff --git a/kernel/sparc/swap.S b/kernel/sparc/swap.S new file mode 100644 index 0000000000..1d7950cd8f --- /dev/null +++ b/kernel/sparc/swap.S @@ -0,0 +1,346 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if defined(DOUBLE) && !defined(__64BIT__) +#define N %i0 +#define X %i5 +#define INCX %i1 +#define Y %i2 +#define INCY %i3 +#define I %i4 +#else +#define N %i0 +#define X %i4 +#define INCX %i5 +#define Y %i1 +#define INCY %i2 +#define I %i3 +#endif + +#define XX %l0 +#define YY %l1 + +#ifdef DOUBLE +#define a1 %f0 +#define a2 %f2 +#define a3 %f4 +#define a4 %f6 +#define a5 %f8 +#define a6 %f10 +#define a7 %f12 +#define a8 %f14 +#define b1 %f16 +#define b2 %f18 +#define b3 %f20 +#define b4 %f22 +#define b5 %f24 +#define b6 %f26 +#define b7 %f28 +#define b8 %f30 +#else +#define a1 %f0 +#define a2 %f1 +#define a3 %f2 +#define a4 %f3 +#define a5 %f4 +#define a6 %f5 +#define a7 %f6 +#define a8 %f7 +#define b1 %f8 +#define b2 %f9 +#define b3 %f10 +#define b4 %f11 +#define b5 %f12 +#define b6 %f13 +#define b7 %f14 +#define b8 %f15 +#endif + +#ifdef DOUBLE +#define PREFETCHSIZE 128 +#else +#define PREFETCHSIZE 256 +#endif + + PROLOGUE + SAVESP + +#ifndef __64BIT__ +#ifdef DOUBLE + ld [%sp + STACK_START + 28], INCX + ld [%sp + STACK_START + 32], Y + ld [%sp + STACK_START + 36], INCY +#else + ld [%sp+ STACK_START + 28], Y + ld [%sp+ STACK_START + 32], INCY +#endif +#else + ldx [%sp+ STACK_START + 56], Y + ldx [%sp+ STACK_START + 64], INCY +#endif + + sll INCX, BASE_SHIFT, INCX + sll INCY, BASE_SHIFT, INCY + + cmp INCX, SIZE + bne .LL50 + nop + cmp INCY, SIZE + bne .LL50 + nop + + sra N, 3, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + + LDF [X + 0 * SIZE], a1 + LDF [Y + 0 * SIZE], b1 + LDF [X + 1 * SIZE], a2 + LDF [Y + 1 * SIZE], b2 + LDF [X + 2 * SIZE], a3 + LDF [Y + 2 * SIZE], b3 + LDF [X + 3 * SIZE], a4 + LDF [Y + 3 * SIZE], b4 + LDF [X + 4 * SIZE], a5 + LDF [Y + 4 * SIZE], b5 + LDF [X + 5 * SIZE], a6 + LDF [Y + 
5 * SIZE], b6 + LDF [X + 6 * SIZE], a7 + LDF [Y + 6 * SIZE], b7 + LDF [X + 7 * SIZE], a8 + LDF [Y + 7 * SIZE], b8 + + deccc I + ble,pn %icc, .LL12 + nop + +.LL11: + prefetch [X + PREFETCHSIZE * SIZE], 0 + deccc I + + STF a1, [Y + 0 * SIZE] + LDF [X + 8 * SIZE], a1 + STF b1, [X + 0 * SIZE] + LDF [Y + 8 * SIZE], b1 + + STF a2, [Y + 1 * SIZE] + LDF [X + 9 * SIZE], a2 + STF b2, [X + 1 * SIZE] + LDF [Y + 9 * SIZE], b2 + + STF a3, [Y + 2 * SIZE] + LDF [X + 10 * SIZE], a3 + STF b3, [X + 2 * SIZE] + LDF [Y + 10 * SIZE], b3 + + STF a4, [Y + 3 * SIZE] + LDF [X + 11 * SIZE], a4 + STF b4, [X + 3 * SIZE] + LDF [Y + 11 * SIZE], b4 + + prefetch [Y + PREFETCHSIZE * SIZE], 0 + add X, 8 * SIZE, X + + STF a5, [Y + 4 * SIZE] + LDF [X + 4 * SIZE], a5 + STF b5, [X - 4 * SIZE] + LDF [Y + 12 * SIZE], b5 + + STF a6, [Y + 5 * SIZE] + LDF [X + 5 * SIZE], a6 + STF b6, [X - 3 * SIZE] + LDF [Y + 13 * SIZE], b6 + + STF a7, [Y + 6 * SIZE] + LDF [X + 6 * SIZE], a7 + STF b7, [X - 2 * SIZE] + LDF [Y + 14 * SIZE], b7 + + STF a8, [Y + 7 * SIZE] + LDF [X + 7 * SIZE], a8 + STF b8, [X - 1 * SIZE] + LDF [Y + 15 * SIZE], b8 + + bg,pt %icc, .LL11 + add Y, 8 * SIZE, Y + +.LL12: + STF a1, [Y + 0 * SIZE] + STF b1, [X + 0 * SIZE] + STF a2, [Y + 1 * SIZE] + STF b2, [X + 1 * SIZE] + STF a3, [Y + 2 * SIZE] + STF b3, [X + 2 * SIZE] + STF a4, [Y + 3 * SIZE] + STF b4, [X + 3 * SIZE] + STF a5, [Y + 4 * SIZE] + STF b5, [X + 4 * SIZE] + STF a6, [Y + 5 * SIZE] + STF b6, [X + 5 * SIZE] + STF a7, [Y + 6 * SIZE] + STF b7, [X + 6 * SIZE] + STF a8, [Y + 7 * SIZE] + STF b8, [X + 7 * SIZE] + add X, 8 * SIZE, X + add Y, 8 * SIZE, Y + +.LL15: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL19 + nop + +.LL16: + LDF [X + 0 * SIZE], a1 + add I, -1, I + LDF [Y + 0 * SIZE], b1 + cmp I, 0 + STF a1, [Y + 0 * SIZE] + add Y, 1 * SIZE, Y + STF b1, [X + 0 * SIZE] + bg,pt %icc, .LL16 + add X, 1 * SIZE, X + +.LL19: + return %i7 + 8 + clr %g0 + +.LL50: + sra N, 3, I + mov X, XX + cmp I, 0 + ble,pn %icc, .LL55 + mov Y, YY + +.LL51: + LDF [X + 0 * SIZE], a1 + add X, INCX, X + LDF [Y + 0 * SIZE], b1 + add Y, INCY, Y + LDF [X + 0 * SIZE], a2 + add X, INCX, X + LDF [Y + 0 * SIZE], b2 + add Y, INCY, Y + LDF [X + 0 * SIZE], a3 + add X, INCX, X + LDF [Y + 0 * SIZE], b3 + add Y, INCY, Y + LDF [X + 0 * SIZE], a4 + add X, INCX, X + LDF [Y + 0 * SIZE], b4 + add Y, INCY, Y + LDF [X + 0 * SIZE], a5 + add X, INCX, X + LDF [Y + 0 * SIZE], b5 + add Y, INCY, Y + LDF [X + 0 * SIZE], a6 + add X, INCX, X + LDF [Y + 0 * SIZE], b6 + add Y, INCY, Y + LDF [X + 0 * SIZE], a7 + add X, INCX, X + LDF [Y + 0 * SIZE], b7 + add Y, INCY, Y + LDF [X + 0 * SIZE], a8 + add X, INCX, X + LDF [Y + 0 * SIZE], b8 + add Y, INCY, Y + + STF a1, [YY + 0 * SIZE] + add I, -1, I + add YY, INCY, YY + STF b1, [XX + 0 * SIZE] + cmp I, 0 + add XX, INCX, XX + STF a2, [YY + 0 * SIZE] + add YY, INCY, YY + STF b2, [XX + 0 * SIZE] + add XX, INCX, XX + STF a3, [YY + 0 * SIZE] + add YY, INCY, YY + STF b3, [XX + 0 * SIZE] + add XX, INCX, XX + STF a4, [YY + 0 * SIZE] + add YY, INCY, YY + STF b4, [XX + 0 * SIZE] + add XX, INCX, XX + STF a5, [YY + 0 * SIZE] + add YY, INCY, YY + STF b5, [XX + 0 * SIZE] + add XX, INCX, XX + STF a6, [YY + 0 * SIZE] + add YY, INCY, YY + STF b6, [XX + 0 * SIZE] + add XX, INCX, XX + STF a7, [YY + 0 * SIZE] + add YY, INCY, YY + STF b7, [XX + 0 * SIZE] + add XX, INCX, XX + STF a8, [YY + 0 * SIZE] + add YY, INCY, YY + STF b8, [XX + 0 * SIZE] + + bg,pt %icc, .LL51 + add XX, INCX, XX + +.LL55: + and N, 7, I + cmp I, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + LDF [X + 0 * SIZE], a1 + LDF [Y + 0 * SIZE], b1 + 
add I, -1, I + cmp I, 0 + STF b1, [X + 0 * SIZE] + add X, INCX, X + STF a1, [Y + 0 * SIZE] + bg,pt %icc, .LL56 + add Y, INCY, Y + +.LL59: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/trsm_kernel_LN.S b/kernel/sparc/trsm_kernel_LN.S new file mode 100644 index 0000000000..4577a30415 --- /dev/null +++ b/kernel/sparc/trsm_kernel_LN.S @@ -0,0 +1,4254 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %i0 +#define N %i1 +#define K %i2 + +#if defined(DOUBLE) && !defined(__64BIT__) +#define A %i5 +#define B %i4 +#else +#define A %i4 +#define B %i5 +#endif + +#define C %o4 +#define LDC %o5 + +#define AO %l0 +#define BO %l1 +#define I %l2 +#define J %l3 +#define L %l4 + +#define C1 %o0 +#define C2 %o1 +#define C3 %o2 +#define C4 %o3 + +#define OFFSET %l5 +#define KK %l6 +#define TEMP1 %l7 +#define TEMP2 %i3 +#define AORIG %g1 + +#ifdef DOUBLE +#define c01 %f0 +#define c02 %f2 +#define c03 %f4 +#define c04 %f6 +#define c05 %f8 +#define c06 %f10 +#define c07 %f12 +#define c08 %f14 +#define c09 %f16 +#define c10 %f18 +#define c11 %f20 +#define c12 %f22 +#define c13 %f24 +#define c14 %f26 +#define c15 %f28 +#define c16 %f30 + +#define t1 %f32 +#define t2 %f34 +#define t3 %f36 +#define t4 %f38 + +#define a1 %f40 +#define a2 %f42 +#define a3 %f44 +#define a4 %f46 +#define a5 %f58 + +#define b1 %f48 +#define b2 %f50 +#define b3 %f52 +#define b4 %f54 +#define b5 %f56 + +#define FZERO %f60 +#define ALPHA %f62 +#else +#define c01 %f0 +#define c02 %f1 +#define c03 %f2 +#define c04 %f3 +#define c05 %f4 +#define c06 %f5 +#define c07 %f6 +#define c08 %f7 +#define c09 %f8 +#define c10 %f9 +#define c11 %f10 +#define c12 %f11 +#define c13 %f12 +#define c14 %f13 +#define c15 %f14 +#define c16 %f15 + +#define t1 %f16 +#define t2 %f17 +#define t3 %f18 +#define t4 %f19 + +#define a1 %f20 +#define a2 %f21 +#define a3 %f22 +#define a4 %f23 +#define a5 %f31 + +#define b1 %f24 +#define b2 %f25 +#define b3 %f26 +#define b4 %f27 +#define b5 %f28 + +#define FZERO %f29 +#define ALPHA %f30 +#endif + +#define APREFETCHSIZE 40 +#define BPREFETCHSIZE 40 + +#define APREFETCH_CATEGORY 0 +#define BPREFETCH_CATEGORY 0 + + PROLOGUE + SAVESP + nop + +#ifndef __64BIT__ +#ifdef DOUBLE + ld [%sp + STACK_START + 28], B + ld [%sp + STACK_START + 32], C + ld [%sp + STACK_START + 36], LDC + ld [%sp + STACK_START + 40], OFFSET +#else + ld [%sp + STACK_START + 28], C + ld [%sp + STACK_START + 32], LDC + ld [%sp + STACK_START + 36], OFFSET +#endif +#else + ldx [%sp+ STACK_START + 56], C + ldx [%sp+ STACK_START + 64], LDC + ldx [%sp+ STACK_START + 72], OFFSET +#endif + + FCLR(29) + + sll LDC, BASE_SHIFT, LDC + +#ifdef LN + smul M, K, TEMP1 + sll TEMP1, BASE_SHIFT, TEMP1 + add A, TEMP1, A + + sll M, BASE_SHIFT, TEMP1 + add C, TEMP1, C +#endif + +#ifdef RN + neg OFFSET, KK +#endif + +#ifdef RT + smul N, K, TEMP1 + sll TEMP1, BASE_SHIFT, TEMP1 + add B, TEMP1, B + + smul N, LDC, TEMP1 + add C, TEMP1, C + + sub N, OFFSET, KK +#endif + + sra N, 2, J + cmp J, 0 + ble,pn %icc, .LL100 + nop + +.LL11: +#ifdef RT + sll K, 2 + BASE_SHIFT, TEMP1 + sub B, TEMP1, B + + sll LDC, 2, TEMP1 + sub C, TEMP1, C +#endif + + mov C, C1 + add C, LDC, C2 + add C2, LDC, C3 + add C3, LDC, C4 + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + +#ifndef RT + add C4, LDC, C +#endif + + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL50 + nop + +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 0 + BASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 0 + BASE_SHIFT, TEMP1 + sll KK, 2 + BASE_SHIFT, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO + + sub K, KK, TEMP1 + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c01 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, 
t1 + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c02 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t2 + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, t3 + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, t4 + + ble,pn %icc, .LL75 + nop + +.LL72: + FADD c01, t1, c01 + add L, -1, L + FMUL a1, b1, t1 + LDF [BO + 4 * SIZE], b1 + + FADD c02, t2, c02 + cmp L, 0 + FMUL a1, b2, t2 + LDF [BO + 5 * SIZE], b2 + + FADD c03, t3, c03 + FMUL a1, b3, t3 + LDF [BO + 6 * SIZE], b3 + + FADD c04, t4, c04 + FMUL a1, b4, t4 + LDF [BO + 7 * SIZE], b4 + LDF [AO + 4 * SIZE], a1 + + FADD c01, t1, c01 + add AO, 4 * SIZE, AO + FMUL a2, b1, t1 + LDF [BO + 8 * SIZE], b1 + + FADD c02, t2, c02 + FMUL a2, b2, t2 + LDF [BO + 9 * SIZE], b2 + + FADD c03, t3, c03 + FMUL a2, b3, t3 + LDF [BO + 10 * SIZE], b3 + + FADD c04, t4, c04 + FMUL a2, b4, t4 + LDF [BO + 11 * SIZE], b4 + LDF [AO + 1 * SIZE], a2 + + FADD c01, t1, c01 + FMUL a3, b1, t1 + LDF [BO + 12 * SIZE], b1 + + FADD c02, t2, c02 + FMUL a3, b2, t2 + LDF [BO + 13 * SIZE], b2 + + FADD c03, t3, c03 + FMUL a3, b3, t3 + LDF [BO + 14 * SIZE], b3 + + FADD c04, t4, c04 + FMUL a3, b4, t4 + LDF [BO + 15 * SIZE], b4 + LDF [AO + 2 * SIZE], a3 + + FADD c01, t1, c01 + FMUL a4, b1, t1 + LDF [BO + 16 * SIZE], b1 + + FADD c02, t2, c02 + FMUL a4, b2, t2 + LDF [BO + 17 * SIZE], b2 + + FADD c03, t3, c03 + FMUL a4, b3, t3 + LDF [BO + 18 * SIZE], b3 + + FADD c04, t4, c04 + FMUL a4, b4, t4 + LDF [BO + 19 * SIZE], b4 + + add BO, 16 * SIZE, BO + bg,pt %icc, .LL72 + LDF [AO + 3 * SIZE], a4 + +.LL75: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL79 + nop + +.LL76: + FADD c01, t1, c01 + add AO, 1 * SIZE, AO + FMUL a1, b1, t1 + LDF [BO + 4 * SIZE], b1 + + FADD c02, t2, c02 + add L, -1, L + FMUL a1, b2, t2 + LDF [BO + 5 * SIZE], b2 + + FADD c03, t3, c03 + cmp L, 0 + FMUL a1, b3, t3 + LDF [BO + 6 * SIZE], b3 + + FADD c04, t4, c04 + add BO, 4 * SIZE, BO + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + bg,pt %icc, .LL76 + LDF [BO + 3 * SIZE], b4 + + +.LL79: + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c03, t3, c03 + FADD c04, t4, c04 + +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 1, TEMP1 +#else + sub KK, 4, TEMP1 +#endif + sll TEMP1, 0 + BASE_SHIFT, TEMP2 + sll TEMP1, 2 + BASE_SHIFT, TEMP1 + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 +#endif + +#ifdef LN + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + FMUL a1, c03, c03 + FMUL a1, c04, c04 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + FMUL a1, c03, c03 + FMUL a1, c04, c04 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FMUL a1, c01, c01 + FMUL a2, c01, t1 + FSUB c02, t1, c02 + FMUL a3, c01, t1 + FSUB c03, t1, c03 + FMUL a4, c01, t1 + FSUB c04, t1, c04 + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 6 * SIZE], a2 + LDF [BO + 7 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a2, c02, t1 + FSUB c03, t1, c03 + FMUL a3, c02, t1 + FSUB c04, t1, c04 + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 
11 * SIZE], a2 + + FMUL a1, c03, c03 + FMUL a2, c03, t1 + FSUB c04, t1, c04 + + LDF [BO + 15 * SIZE], a1 + + FMUL a1, c04, c04 +#endif + +#ifdef RT + LDF [BO + 15 * SIZE], a1 + LDF [BO + 14 * SIZE], a2 + LDF [BO + 13 * SIZE], a3 + LDF [BO + 12 * SIZE], a4 + + FMUL a1, c04, c04 + FMUL a2, c04, t1 + FSUB c03, t1, c03 + FMUL a3, c04, t1 + FSUB c02, t1, c02 + FMUL a4, c04, t1 + FSUB c01, t1, c01 + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 9 * SIZE], a2 + LDF [BO + 8 * SIZE], a3 + + FMUL a1, c03, c03 + FMUL a2, c03, t1 + FSUB c02, t1, c02 + FMUL a3, c03, t1 + FSUB c01, t1, c01 + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 4 * SIZE], a2 + + FMUL a1, c02, c02 + FMUL a2, c02, t1 + FSUB c01, t1, c01 + + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 +#endif + +#ifdef LN + add C1, -1 * SIZE, C1 + add C2, -1 * SIZE, C2 + add C3, -1 * SIZE, C3 + add C4, -1 * SIZE, C4 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] + STF c03, [BO + 2 * SIZE] + STF c04, [BO + 3 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C2 + 0 * SIZE] + STF c03, [C3 + 0 * SIZE] + STF c04, [C4 + 0 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 1 * SIZE, C1 + add C2, 1 * SIZE, C2 + add C3, 1 * SIZE, C3 + add C4, 1 * SIZE, C4 +#endif + +#ifdef RT + sll K, 0 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 0 + BASE_SHIFT, TEMP2 + sll TEMP1, 2 + BASE_SHIFT, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + +.LL50: + and M, 2, I + cmp I, 0 + ble,pn %icc, .LL70 + nop + +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 1 + BASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 1 + BASE_SHIFT, TEMP1 + sll KK, 2 + BASE_SHIFT, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO + + sub K, KK, TEMP1 + sra TEMP1, 2, L + cmp L, 0 +#endif + + FMOV FZERO, c02 + FMOV FZERO, t1 + FMOV FZERO, c04 + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, t2 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, c06 + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, t3 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, c08 + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, t4 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, c01 + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c03 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, c05 + + ble,pn %icc, .LL55 + FMOV FZERO, c07 + +.LL52: + FADD c02, t1, c02 + add AO, 8 * SIZE, AO + prefetch [AO + APREFETCHSIZE * SIZE], 0 + + FMUL a1, b1, t1 + add BO, 16 * SIZE, BO + + FADD c04, t2, c04 + add L, -1, L + FMUL a1, b2, t2 + + FADD c06, t3, c06 + cmp L, 0 + FMUL a1, b3, t3 + + FADD c08, t4, c08 + FMUL a1, b4, t4 + LDF [AO - 4 * SIZE], a1 + + FADD c01, t1, c01 + FMUL a2, b1, t1 + LDF [BO - 12 * SIZE], b1 + FADD c03, t2, c03 + FMUL a2, b2, t2 + LDF [BO - 11 * SIZE], b2 + + FADD c05, t3, c05 + FMUL a2, b3, t3 + LDF [BO - 10 * SIZE], b3 + FADD c07, t4, c07 + FMUL a2, b4, t4 + LDF [BO - 9 * SIZE], b4 + + FADD c02, t1, c02 + FMUL a3, b1, t1 + LDF [AO - 3 * SIZE], a2 + FADD c04, t2, c04 + FMUL a3, b2, t2 + + FADD c06, t3, c06 + FMUL a3, b3, t3 + FADD c08, t4, c08 + FMUL a3, b4, t4 + LDF [AO - 2 * SIZE], a3 + + FADD c01, t1, c01 + FMUL a4, b1, t1 + LDF [BO - 8 * SIZE], b1 + FADD c03, t2, c03 + FMUL a4, b2, t2 + LDF [BO - 7 * SIZE], b2 + + FADD c05, t3, c05 + FMUL a4, b3, t3 + LDF [BO - 6 * 
SIZE], b3 + FADD c07, t4, c07 + FMUL a4, b4, t4 + LDF [BO - 5 * SIZE], b4 + + FADD c02, t1, c02 + FMUL a1, b1, t1 + LDF [AO - 1 * SIZE], a4 + FADD c04, t2, c04 + FMUL a1, b2, t2 + + FADD c06, t3, c06 + FMUL a1, b3, t3 + FADD c08, t4, c08 + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + FADD c01, t1, c01 + FMUL a2, b1, t1 + LDF [BO - 4 * SIZE], b1 + + FADD c03, t2, c03 + FMUL a2, b2, t2 + LDF [BO - 3 * SIZE], b2 + + FADD c05, t3, c05 + FMUL a2, b3, t3 + LDF [BO - 2 * SIZE], b3 + FADD c07, t4, c07 + FMUL a2, b4, t4 + LDF [BO - 1 * SIZE], b4 + + FADD c02, t1, c02 + FMUL a3, b1, t1 + LDF [AO + 1 * SIZE], a2 + FADD c04, t2, c04 + FMUL a3, b2, t2 + + FADD c06, t3, c06 + FMUL a3, b3, t3 + FADD c08, t4, c08 + FMUL a3, b4, t4 + LDF [AO + 2 * SIZE], a3 + + FADD c01, t1, c01 + FMUL a4, b1, t1 + LDF [BO + 0 * SIZE], b1 + FADD c03, t2, c03 + FMUL a4, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD c05, t3, c05 + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + FADD c07, t4, c07 + FMUL a4, b4, t4 + LDF [BO + 3 * SIZE], b4 + + bg,pt %icc, .LL52 + LDF [AO + 3 * SIZE], a4 + +.LL55: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + FADD c02, t1, c02 + add AO, 2 * SIZE, AO + FMUL a1, b1, t1 + add L, -1, L + + add BO, 4 * SIZE, BO + FADD c04, t2, c04 + cmp L, 0 + FMUL a1, b2, t2 + + FADD c06, t3, c06 + FMUL a1, b3, t3 + FADD c08, t4, c08 + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + FADD c01, t1, c01 + FMUL a2, b1, t1 + LDF [BO + 0 * SIZE], b1 + FADD c03, t2, c03 + FMUL a2, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD c05, t3, c05 + FMUL a2, b3, t3 + LDF [BO + 2 * SIZE], b3 + FADD c07, t4, c07 + FMUL a2, b4, t4 + LDF [BO + 3 * SIZE], b4 + + bg,pt %icc, .LL56 + LDF [AO + 1 * SIZE], a2 + +.LL59: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 2, TEMP1 +#else + sub KK, 4, TEMP1 +#endif + sll TEMP1, 1 + BASE_SHIFT, TEMP2 + sll TEMP1, 2 + BASE_SHIFT, TEMP1 + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + + FADD c02, t1, c02 + FADD c04, t2, c04 + FADD c06, t3, c06 + FADD c08, t4, c08 + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 + FSUB a3, c05, c05 + FSUB a4, c07, c07 + + FSUB b1, c02, c02 + FSUB b2, c04, c04 + FSUB b3, c06, c06 + FSUB b4, c08, c08 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [AO + 4 * SIZE], b1 + LDF [AO + 5 * SIZE], b2 + LDF [AO + 6 * SIZE], b3 + LDF [AO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 + + FSUB b1, c05, c05 + FSUB b2, c06, c06 + FSUB b3, c07, c07 + FSUB b4, c08, c08 +#endif + +#ifdef LN + LDF [AO + 3 * SIZE], a1 + LDF [AO + 2 * SIZE], a2 + LDF [AO + 0 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a1, c04, c04 + FMUL a1, c06, c06 + FMUL a1, c08, c08 + + FMUL a2, c02, t1 + FMUL a2, c04, t2 + FMUL a2, c06, t3 + FMUL a2, c08, t4 + + FSUB c01, t1, c01 + FSUB c03, t2, c03 + FSUB c05, t3, c05 + FSUB c07, t4, c07 + + FMUL a3, c01, c01 + FMUL a3, c03, c03 + FMUL a3, c05, c05 + FMUL a3, c07, c07 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + FMUL a1, c03, c03 + FMUL a1, c05, c05 + FMUL a1, c07, c07 + + FMUL a2, c01, t1 + FMUL a2, c03, t2 + FMUL a2, c05, t3 + FMUL a2, c07, t4 + + FSUB c02, 
t1, c02 + FSUB c04, t2, c04 + FSUB c06, t3, c06 + FSUB c08, t4, c08 + + FMUL a3, c02, c02 + FMUL a3, c04, c04 + FMUL a3, c06, c06 + FMUL a3, c08, c08 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + + FMUL a2, c01, t1 + FMUL a2, c02, t2 + + FSUB c03, t1, c03 + FSUB c04, t2, c04 + + FMUL a3, c01, t1 + FMUL a3, c02, t2 + + FSUB c05, t1, c05 + FSUB c06, t2, c06 + + FMUL a4, c01, t1 + FMUL a4, c02, t2 + + FSUB c07, t1, c07 + FSUB c08, t2, c08 + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 6 * SIZE], a2 + LDF [BO + 7 * SIZE], a3 + + FMUL a1, c03, c03 + FMUL a1, c04, c04 + + FMUL a2, c03, t1 + FMUL a2, c04, t2 + + FSUB c05, t1, c05 + FSUB c06, t2, c06 + + FMUL a3, c03, t1 + FMUL a3, c04, t2 + + FSUB c07, t1, c07 + FSUB c08, t2, c08 + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 11 * SIZE], a2 + + FMUL a1, c05, c05 + FMUL a1, c06, c06 + + FMUL a2, c05, t1 + FMUL a2, c06, t2 + + FSUB c07, t1, c07 + FSUB c08, t2, c08 + + LDF [BO + 15 * SIZE], a1 + + FMUL a1, c07, c07 + FMUL a1, c08, c08 +#endif + +#ifdef RT + LDF [BO + 15 * SIZE], a1 + LDF [BO + 14 * SIZE], a2 + LDF [BO + 13 * SIZE], a3 + LDF [BO + 12 * SIZE], a4 + + FMUL a1, c07, c07 + FMUL a1, c08, c08 + + FMUL a2, c07, t1 + FMUL a2, c08, t2 + + FSUB c05, t1, c05 + FSUB c06, t2, c06 + + FMUL a3, c07, t1 + FMUL a3, c08, t2 + + FSUB c03, t1, c03 + FSUB c04, t2, c04 + + FMUL a4, c07, t1 + FMUL a4, c08, t2 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 9 * SIZE], a2 + LDF [BO + 8 * SIZE], a3 + + FMUL a1, c05, c05 + FMUL a1, c06, c06 + + FMUL a2, c05, t1 + FMUL a2, c06, t2 + + FSUB c03, t1, c03 + FSUB c04, t2, c04 + + FMUL a3, c05, t1 + FMUL a3, c06, t2 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 4 * SIZE], a2 + + FMUL a1, c03, c03 + FMUL a1, c04, c04 + + FMUL a2, c03, t1 + FMUL a2, c04, t2 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 + add C2, -2 * SIZE, C2 + add C3, -2 * SIZE, C3 + add C4, -2 * SIZE, C4 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c03, [BO + 1 * SIZE] + STF c05, [BO + 2 * SIZE] + STF c07, [BO + 3 * SIZE] + + STF c02, [BO + 4 * SIZE] + STF c04, [BO + 5 * SIZE] + STF c06, [BO + 6 * SIZE] + STF c08, [BO + 7 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] + + STF c05, [AO + 4 * SIZE] + STF c06, [AO + 5 * SIZE] + STF c07, [AO + 6 * SIZE] + STF c08, [AO + 7 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C2 + 0 * SIZE] + STF c04, [C2 + 1 * SIZE] + + STF c05, [C3 + 0 * SIZE] + STF c06, [C3 + 1 * SIZE] + STF c07, [C4 + 0 * SIZE] + STF c08, [C4 + 1 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 2 * SIZE, C1 + add C2, 2 * SIZE, C2 + add C3, 2 * SIZE, C3 + add C4, 2 * SIZE, C4 +#endif + +#ifdef RT + sll K, 1 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 1 + BASE_SHIFT, TEMP2 + sll TEMP1, 2 + BASE_SHIFT, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 2, KK +#endif + +#ifdef LN + sub KK, 2, KK +#endif + +.LL70: + sra M, 2, I + cmp I, 0 + ble,pn %icc, .LL99 + nop + +.LL21: + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + + FMOV FZERO, c01 + 
FMOV FZERO, c02 + FMOV FZERO, c03 + +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 2 + BASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 2 + BASE_SHIFT, TEMP1 + + add AORIG, TEMP1, AO + add B, TEMP1, BO + + sub K, KK, TEMP1 + + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c04 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, c05 + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c06 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, c07 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c08 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, c09 + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c10 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, c11 + LDF [BO + 4 * SIZE], b5 /* ***** */ + + LDF [AO + 4 * SIZE], a5 /* ***** */ + +#ifdef LN + prefetch [C1 + 3 * SIZE], 3 + FMOV FZERO, c12 + prefetch [C2 + 3 * SIZE], 3 + FMOV FZERO, c13 + prefetch [C3 + 3 * SIZE], 3 + FMOV FZERO, c14 + prefetch [C4 + 3 * SIZE], 3 + FMOV FZERO, c15 +#else + prefetch [C1 - 3 * SIZE], 3 + FMOV FZERO, c12 + prefetch [C2 - 3 * SIZE], 3 + FMOV FZERO, c13 + prefetch [C3 - 3 * SIZE], 3 + FMOV FZERO, c14 + prefetch [C4 - 3 * SIZE], 3 + FMOV FZERO, c15 +#endif + + ble,pn %icc, .LL25 + FMOV FZERO, c16 + +.LL22: + FADD c04, t1, c04 + prefetch [AO + APREFETCHSIZE * SIZE], APREFETCH_CATEGORY + FMUL a1, b1, t1 + nop + + FADD c08, t2, c08 + prefetch [BO + BPREFETCHSIZE * SIZE], BPREFETCH_CATEGORY + FMUL a1, b2, t2 + add AO, 16 * SIZE, AO + + FADD c12, t3, c12 + LDF [AO - 13 * SIZE], a4 + FMUL a1, b3, t3 + add BO, 16 * SIZE, BO + + FADD c16, t4, c16 + nop + FMUL a1, b4, t4 + LDF [AO - 8 * SIZE], a1 + + FADD c01, t1, c01 + nop + FMUL a2, b1, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD c13, t4, c13 + add L, -1, L + FMUL a2, b4, t4 + LDF [AO - 11 * SIZE], a2 + + FADD c02, t1, c02 + nop + FMUL a3, b1, t1 + nop + + FADD c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO - 10 * SIZE], a3 + + FADD c03, t1, c03 + nop + FMUL a4, b1, t1 + LDF [BO - 8 * SIZE], b1 + + FADD c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO - 11 * SIZE], b2 + + FADD c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO - 10 * SIZE], b3 + + FADD c15, t4, c15 + nop + FMUL a4, b4, t4 + LDF [BO - 9 * SIZE], b4 + + FADD c04, t1, c04 + nop + FMUL a5, b5, t1 + LDF [AO - 9 * SIZE], a4 + + FADD c08, t2, c08 + nop + FMUL a5, b2, t2 + nop + + FADD c12, t3, c12 + nop + FMUL a5, b3, t3 + nop + + FADD c16, t4, c16 + nop + FMUL a5, b4, t4 + LDF [AO - 4 * SIZE], a5 + + FADD c01, t1, c01 + nop + FMUL a2, b5, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD c13, t4, c13 + nop + FMUL a2, b4, t4 + LDF [AO - 7 * SIZE], a2 + + FADD c02, t1, c02 + nop + FMUL a3, b5, t1 + nop + + FADD c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO - 6 * SIZE], a3 + + FADD c03, t1, c03 + nop + FMUL a4, b5, t1 + LDF [BO - 4 * SIZE], b5 + + FADD c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO - 7 * SIZE], b2 + + FADD c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO - 6 * SIZE], b3 + + FADD c15, t4, c15 + nop + FMUL a4, b4, t4 + LDF [BO - 5 * SIZE], b4 + + FADD c04, t1, c04 + nop + FMUL a1, b1, t1 + LDF [AO - 5 * SIZE], a4 + + FADD c08, t2, c08 + nop + FMUL a1, b2, t2 + nop + + FADD c12, t3, c12 + nop + FMUL a1, b3, t3 + nop + + FADD 
c16, t4, c16 + nop + FMUL a1, b4, t4 + LDF [AO - 0 * SIZE], a1 + + FADD c01, t1, c01 + nop + FMUL a2, b1, t1 + nop + +#ifdef DOUBLE + prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY +#else + nop +#endif + FADD c05, t2, c05 + nop + FMUL a2, b2, t2 + + FADD c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD c13, t4, c13 + nop + FMUL a2, b4, t4 + nop + + FADD c02, t1, c02 + nop + FMUL a3, b1, t1 + LDF [AO - 3 * SIZE], a2 + + FADD c06, t2, c06 +#ifdef DOUBLE + prefetch [BO + (BPREFETCHSIZE + 8) * SIZE], BPREFETCH_CATEGORY +#else + nop +#endif + FMUL a3, b2, t2 + nop + + FADD c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO - 2 * SIZE], a3 + + FADD c03, t1, c03 + nop + FMUL a4, b1, t1 + LDF [BO - 0 * SIZE], b1 + + FADD c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO - 3 * SIZE], b2 + + FADD c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO - 2 * SIZE], b3 + + FADD c15, t4, c15 + nop + FMUL a4, b4, t4 + LDF [BO - 1 * SIZE], b4 + + FADD c04, t1, c04 + nop + FMUL a5, b5, t1 + LDF [AO - 1 * SIZE], a4 + + FADD c08, t2, c08 + FMUL a5, b2, t2 + FADD c12, t3, c12 + FMUL a5, b3, t3 + + FADD c16, t4, c16 + nop + FMUL a5, b4, t4 + LDF [AO + 4 * SIZE], a5 + + FADD c01, t1, c01 + nop + FMUL a2, b5, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD c13, t4, c13 + nop + FMUL a2, b4, t4 + LDF [AO + 1 * SIZE], a2 + + FADD c02, t1, c02 + nop + FMUL a3, b5, t1 + nop + + FADD c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO + 2 * SIZE], a3 + + FADD c03, t1, c03 + cmp L, 0 + FMUL a4, b5, t1 + LDF [BO + 4 * SIZE], b5 + + FADD c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD c15, t4, c15 + FMUL a4, b4, t4 + bg,pt %icc, .LL22 + LDF [BO + 3 * SIZE], b4 + +.LL25: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL29 + nop + +.LL26: + FADD c04, t1, c04 + LDF [AO + 3 * SIZE], a4 + FMUL a1, b1, t1 + add AO, 4 * SIZE, AO + + FADD c08, t2, c08 + add BO, 4 * SIZE, BO + FMUL a1, b2, t2 + add L, -1, L + + FADD c12, t3, c12 + nop + FMUL a1, b3, t3 + cmp L, 0 + + FADD c16, t4, c16 + nop + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + FADD c01, t1, c01 + nop + FMUL a2, b1, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD c13, t4, c13 + nop + FMUL a2, b4, t4 + LDF [AO + 1 * SIZE], a2 + + FADD c02, t1, c02 + nop + FMUL a3, b1, t1 + nop + + FADD c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO + 2 * SIZE], a3 + + FADD c03, t1, c03 + nop + FMUL a4, b1, t1 + LDF [BO + 0 * SIZE], b1 + + FADD c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD c15, t4, c15 + FMUL a4, b4, t4 + bg,pt %icc, .LL26 + LDF [BO + 3 * SIZE], b4 + +.LL29: +#if defined(LN) || defined(RT) + sub KK, 4, TEMP1 + sll TEMP1, 2 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AO + add B, TEMP1, BO +#endif + + FADD c04, t1, c04 + FADD c08, t2, c08 + FADD c12, t3, c12 + FADD c16, t4, c16 + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + LDF [BO + 
4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c05, c05 + FSUB a3, c09, c09 + FSUB a4, c13, c13 + + FSUB b1, c02, c02 + FSUB b2, c06, c06 + FSUB b3, c10, c10 + FSUB b4, c14, c14 + + LDF [BO + 8 * SIZE], a1 + LDF [BO + 9 * SIZE], a2 + LDF [BO + 10 * SIZE], a3 + LDF [BO + 11 * SIZE], a4 + + LDF [BO + 12 * SIZE], b1 + LDF [BO + 13 * SIZE], b2 + LDF [BO + 14 * SIZE], b3 + LDF [BO + 15 * SIZE], b4 + + FSUB a1, c03, c03 + FSUB a2, c07, c07 + FSUB a3, c11, c11 + FSUB a4, c15, c15 + + FSUB b1, c04, c04 + FSUB b2, c08, c08 + FSUB b3, c12, c12 + FSUB b4, c16, c16 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [AO + 4 * SIZE], b1 + LDF [AO + 5 * SIZE], b2 + LDF [AO + 6 * SIZE], b3 + LDF [AO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 + + FSUB b1, c05, c05 + FSUB b2, c06, c06 + FSUB b3, c07, c07 + FSUB b4, c08, c08 + + LDF [AO + 8 * SIZE], a1 + LDF [AO + 9 * SIZE], a2 + LDF [AO + 10 * SIZE], a3 + LDF [AO + 11 * SIZE], a4 + + LDF [AO + 12 * SIZE], b1 + LDF [AO + 13 * SIZE], b2 + LDF [AO + 14 * SIZE], b3 + LDF [AO + 15 * SIZE], b4 + + FSUB a1, c09, c09 + FSUB a2, c10, c10 + FSUB a3, c11, c11 + FSUB a4, c12, c12 + + FSUB b1, c13, c13 + FSUB b2, c14, c14 + FSUB b3, c15, c15 + FSUB b4, c16, c16 +#endif + +#ifdef LN + LDF [AO + 15 * SIZE], a1 + LDF [AO + 14 * SIZE], a2 + LDF [AO + 13 * SIZE], a3 + LDF [AO + 12 * SIZE], a4 + + FMUL a1, c04, c04 + FMUL a1, c08, c08 + FMUL a1, c12, c12 + FMUL a1, c16, c16 + + FMUL a2, c04, t1 + FMUL a2, c08, t2 + FMUL a2, c12, t3 + FMUL a2, c16, t4 + + FSUB c03, t1, c03 + FSUB c07, t2, c07 + FSUB c11, t3, c11 + FSUB c15, t4, c15 + + FMUL a3, c04, t1 + FMUL a3, c08, t2 + FMUL a3, c12, t3 + FMUL a3, c16, t4 + + FSUB c02, t1, c02 + FSUB c06, t2, c06 + FSUB c10, t3, c10 + FSUB c14, t4, c14 + + FMUL a4, c04, t1 + FMUL a4, c08, t2 + FMUL a4, c12, t3 + FMUL a4, c16, t4 + + FSUB c01, t1, c01 + FSUB c05, t2, c05 + FSUB c09, t3, c09 + FSUB c13, t4, c13 + + LDF [AO + 10 * SIZE], a1 + LDF [AO + 9 * SIZE], a2 + LDF [AO + 8 * SIZE], a3 + + FMUL a1, c03, c03 + FMUL a1, c07, c07 + FMUL a1, c11, c11 + FMUL a1, c15, c15 + + FMUL a2, c03, t1 + FMUL a2, c07, t2 + FMUL a2, c11, t3 + FMUL a2, c15, t4 + + FSUB c02, t1, c02 + FSUB c06, t2, c06 + FSUB c10, t3, c10 + FSUB c14, t4, c14 + + FMUL a3, c03, t1 + FMUL a3, c07, t2 + FMUL a3, c11, t3 + FMUL a3, c15, t4 + + FSUB c01, t1, c01 + FSUB c05, t2, c05 + FSUB c09, t3, c09 + FSUB c13, t4, c13 + + LDF [AO + 5 * SIZE], a1 + LDF [AO + 4 * SIZE], a2 + + FMUL a1, c02, c02 + FMUL a1, c06, c06 + FMUL a1, c10, c10 + FMUL a1, c14, c14 + + FMUL a2, c02, t1 + FMUL a2, c06, t2 + FMUL a2, c10, t3 + FMUL a2, c14, t4 + + FSUB c01, t1, c01 + FSUB c05, t2, c05 + FSUB c09, t3, c09 + FSUB c13, t4, c13 + + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c05, c05 + FMUL a1, c09, c09 + FMUL a1, c13, c13 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FMUL a1, c01, c01 + FMUL a1, c05, c05 + FMUL a1, c09, c09 + FMUL a1, c13, c13 + + FMUL a2, c01, t1 + FMUL a2, c05, t2 + FMUL a2, c09, t3 + FMUL a2, c13, t4 + + FSUB c02, t1, c02 + FSUB c06, t2, c06 + FSUB c10, t3, c10 + FSUB c14, t4, c14 + + FMUL a3, c01, t1 + FMUL a3, c05, t2 + FMUL a3, c09, t3 + FMUL a3, c13, t4 + + FSUB c03, t1, c03 + FSUB c07, t2, c07 + FSUB c11, t3, c11 + FSUB c15, t4, c15 + + FMUL a4, c01, t1 + FMUL a4, c05, t2 + FMUL 
a4, c09, t3 + FMUL a4, c13, t4 + + FSUB c04, t1, c04 + FSUB c08, t2, c08 + FSUB c12, t3, c12 + FSUB c16, t4, c16 + + LDF [AO + 5 * SIZE], a1 + LDF [AO + 6 * SIZE], a2 + LDF [AO + 7 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a1, c06, c06 + FMUL a1, c10, c10 + FMUL a1, c14, c14 + + FMUL a2, c02, t1 + FMUL a2, c06, t2 + FMUL a2, c10, t3 + FMUL a2, c14, t4 + + FSUB c03, t1, c03 + FSUB c07, t2, c07 + FSUB c11, t3, c11 + FSUB c15, t4, c15 + + FMUL a3, c02, t1 + FMUL a3, c06, t2 + FMUL a3, c10, t3 + FMUL a3, c14, t4 + + FSUB c04, t1, c04 + FSUB c08, t2, c08 + FSUB c12, t3, c12 + FSUB c16, t4, c16 + + LDF [AO + 10 * SIZE], a1 + LDF [AO + 11 * SIZE], a2 + + FMUL a1, c03, c03 + FMUL a1, c07, c07 + FMUL a1, c11, c11 + FMUL a1, c15, c15 + + FMUL a2, c03, t1 + FMUL a2, c07, t2 + FMUL a2, c11, t3 + FMUL a2, c15, t4 + + FSUB c04, t1, c04 + FSUB c08, t2, c08 + FSUB c12, t3, c12 + FSUB c16, t4, c16 + + LDF [AO + 15 * SIZE], a1 + + FMUL a1, c04, c04 + FMUL a1, c08, c08 + FMUL a1, c12, c12 + FMUL a1, c16, c16 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + FMUL a1, c03, c03 + FMUL a1, c04, c04 + + FMUL a2, c01, t1 + FMUL a2, c02, t2 + FMUL a2, c03, t3 + FMUL a2, c04, t4 + + FSUB c05, t1, c05 + FSUB c06, t2, c06 + FSUB c07, t3, c07 + FSUB c08, t4, c08 + + FMUL a3, c01, t1 + FMUL a3, c02, t2 + FMUL a3, c03, t3 + FMUL a3, c04, t4 + + FSUB c09, t1, c09 + FSUB c10, t2, c10 + FSUB c11, t3, c11 + FSUB c12, t4, c12 + + FMUL a4, c01, t1 + FMUL a4, c02, t2 + FMUL a4, c03, t3 + FMUL a4, c04, t4 + + FSUB c13, t1, c13 + FSUB c14, t2, c14 + FSUB c15, t3, c15 + FSUB c16, t4, c16 + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 6 * SIZE], a2 + LDF [BO + 7 * SIZE], a3 + + FMUL a1, c05, c05 + FMUL a1, c06, c06 + FMUL a1, c07, c07 + FMUL a1, c08, c08 + + FMUL a2, c05, t1 + FMUL a2, c06, t2 + FMUL a2, c07, t3 + FMUL a2, c08, t4 + + FSUB c09, t1, c09 + FSUB c10, t2, c10 + FSUB c11, t3, c11 + FSUB c12, t4, c12 + + FMUL a3, c05, t1 + FMUL a3, c06, t2 + FMUL a3, c07, t3 + FMUL a3, c08, t4 + + FSUB c13, t1, c13 + FSUB c14, t2, c14 + FSUB c15, t3, c15 + FSUB c16, t4, c16 + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 11 * SIZE], a2 + + FMUL a1, c09, c09 + FMUL a1, c10, c10 + FMUL a1, c11, c11 + FMUL a1, c12, c12 + + FMUL a2, c09, t1 + FMUL a2, c10, t2 + FMUL a2, c11, t3 + FMUL a2, c12, t4 + + FSUB c13, t1, c13 + FSUB c14, t2, c14 + FSUB c15, t3, c15 + FSUB c16, t4, c16 + + LDF [BO + 15 * SIZE], a1 + + FMUL a1, c13, c13 + FMUL a1, c14, c14 + FMUL a1, c15, c15 + FMUL a1, c16, c16 +#endif + +#ifdef RT + LDF [BO + 15 * SIZE], a1 + LDF [BO + 14 * SIZE], a2 + LDF [BO + 13 * SIZE], a3 + LDF [BO + 12 * SIZE], a4 + + FMUL a1, c13, c13 + FMUL a1, c14, c14 + FMUL a1, c15, c15 + FMUL a1, c16, c16 + + FMUL a2, c13, t1 + FMUL a2, c14, t2 + FMUL a2, c15, t3 + FMUL a2, c16, t4 + + FSUB c09, t1, c09 + FSUB c10, t2, c10 + FSUB c11, t3, c11 + FSUB c12, t4, c12 + + FMUL a3, c13, t1 + FMUL a3, c14, t2 + FMUL a3, c15, t3 + FMUL a3, c16, t4 + + FSUB c05, t1, c05 + FSUB c06, t2, c06 + FSUB c07, t3, c07 + FSUB c08, t4, c08 + + FMUL a4, c13, t1 + FMUL a4, c14, t2 + FMUL a4, c15, t3 + FMUL a4, c16, t4 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + FSUB c03, t3, c03 + FSUB c04, t4, c04 + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 9 * SIZE], a2 + LDF [BO + 8 * SIZE], a3 + + FMUL a1, c09, c09 + FMUL a1, c10, c10 + FMUL a1, c11, c11 + FMUL a1, c12, c12 + + FMUL a2, c09, t1 + FMUL a2, c10, t2 + FMUL a2, c11, t3 + FMUL a2, c12, t4 + + FSUB c05, t1, c05 + FSUB c06, t2, 
c06 + FSUB c07, t3, c07 + FSUB c08, t4, c08 + + FMUL a3, c09, t1 + FMUL a3, c10, t2 + FMUL a3, c11, t3 + FMUL a3, c12, t4 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + FSUB c03, t3, c03 + FSUB c04, t4, c04 + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 4 * SIZE], a2 + + FMUL a1, c05, c05 + FMUL a1, c06, c06 + FMUL a1, c07, c07 + FMUL a1, c08, c08 + + FMUL a2, c05, t1 + FMUL a2, c06, t2 + FMUL a2, c07, t3 + FMUL a2, c08, t4 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + FSUB c03, t3, c03 + FSUB c04, t4, c04 + + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + FMUL a1, c03, c03 + FMUL a1, c04, c04 +#endif + +#ifdef LN + add C1, -4 * SIZE, C1 + add C2, -4 * SIZE, C2 + add C3, -4 * SIZE, C3 + add C4, -4 * SIZE, C4 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c05, [BO + 1 * SIZE] + STF c09, [BO + 2 * SIZE] + STF c13, [BO + 3 * SIZE] + + STF c02, [BO + 4 * SIZE] + STF c06, [BO + 5 * SIZE] + STF c10, [BO + 6 * SIZE] + STF c14, [BO + 7 * SIZE] + + STF c03, [BO + 8 * SIZE] + STF c07, [BO + 9 * SIZE] + STF c11, [BO + 10 * SIZE] + STF c15, [BO + 11 * SIZE] + + STF c04, [BO + 12 * SIZE] + STF c08, [BO + 13 * SIZE] + STF c12, [BO + 14 * SIZE] + STF c16, [BO + 15 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] + + STF c05, [AO + 4 * SIZE] + STF c06, [AO + 5 * SIZE] + STF c07, [AO + 6 * SIZE] + STF c08, [AO + 7 * SIZE] + + STF c09, [AO + 8 * SIZE] + STF c10, [AO + 9 * SIZE] + STF c11, [AO + 10 * SIZE] + STF c12, [AO + 11 * SIZE] + + STF c13, [AO + 12 * SIZE] + STF c14, [AO + 13 * SIZE] + STF c15, [AO + 14 * SIZE] + STF c16, [AO + 15 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C1 + 2 * SIZE] + STF c04, [C1 + 3 * SIZE] + + STF c05, [C2 + 0 * SIZE] + STF c06, [C2 + 1 * SIZE] + STF c07, [C2 + 2 * SIZE] + STF c08, [C2 + 3 * SIZE] + + STF c09, [C3 + 0 * SIZE] + STF c10, [C3 + 1 * SIZE] + STF c11, [C3 + 2 * SIZE] + STF c12, [C3 + 3 * SIZE] + + STF c13, [C4 + 0 * SIZE] + STF c14, [C4 + 1 * SIZE] + STF c15, [C4 + 2 * SIZE] + STF c16, [C4 + 3 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 4 * SIZE, C1 + add C2, 4 * SIZE, C2 + add C3, 4 * SIZE, C3 + add C4, 4 * SIZE, C4 +#endif + +#ifdef RT + sll K, 2 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 2 + BASE_SHIFT, TEMP1 + add AO, TEMP1, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 4, KK +#endif + +#ifdef LN + sub KK, 4, KK +#endif + + add I, -1, I + cmp I, 0 + + sra K, 2, L + bg,pt %icc, .LL21 + FMOV FZERO, c01 + + + + + + + +.LL99: +#ifdef LN + sll K, 2 + BASE_SHIFT, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 4, KK +#endif + +#ifdef RT + sub KK, 4, KK +#endif + + add J, -1, J + cmp J, 0 + bg,pt %icc, .LL11 + nop + +.LL100: /* n & 2 */ + and N, 2, J + cmp J, 0 + ble,pn %icc, .LL200 + nop + +#ifdef RT + sll K, 1 + BASE_SHIFT, TEMP1 + sub B, TEMP1, B + + sll LDC, 1, TEMP1 + sub C, TEMP1, C +#endif + + mov C, C1 + add C, LDC, C2 + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + +#ifndef RT + add C2, LDC, C +#endif + + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL150 + nop + +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 0 + BASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG 
+#endif + + sll KK, 0 + BASE_SHIFT, TEMP1 + sll KK, 1 + BASE_SHIFT, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO + + sub K, KK, TEMP1 + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c01 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t1 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c02 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t2 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, t3 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, t4 + + ble,pn %icc, .LL175 + nop + +.LL172: + FADD c01, t1, c01 + add AO, 4 * SIZE, AO + FMUL a1, b1, t1 + LDF [BO + 4 * SIZE], b1 + + FADD c02, t2, c02 + FMUL a1, b2, t2 + LDF [BO + 5 * SIZE], b2 + + add L, -1, L + LDF [AO + 0 * SIZE], a1 + + FADD c03, t3, c03 + cmp L, 0 + FMUL a2, b3, t3 + LDF [BO + 6 * SIZE], b3 + + FADD c04, t4, c04 + FMUL a2, b4, t4 + LDF [BO + 7 * SIZE], b4 + LDF [AO + 1 * SIZE], a2 + + FADD c01, t1, c01 + FMUL a3, b1, t1 + LDF [BO + 8 * SIZE], b1 + + FADD c02, t2, c02 + FMUL a3, b2, t2 + LDF [BO + 9 * SIZE], b2 + LDF [AO + 2 * SIZE], a3 + + FADD c03, t3, c03 + FMUL a4, b3, t3 + LDF [BO + 10 * SIZE], b3 + FADD c04, t4, c04 + FMUL a4, b4, t4 + LDF [BO + 11 * SIZE], b4 + add BO, 8 * SIZE, BO + + bg,pt %icc, .LL172 + LDF [AO + 3 * SIZE], a4 + +.LL175: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL179 + nop + +.LL176: + FADD c01, t1, c01 + add L, -1, L + FMUL a1, b1, t1 + add AO, 1 * SIZE, AO + LDF [BO + 2 * SIZE], b1 + FADD c02, t2, c02 + cmp L, 0 + FMUL a1, b2, t2 + LDF [BO + 3 * SIZE], b2 + + add BO, 2 * SIZE, BO + bg,pt %icc, .LL176 + LDF [AO + 0 * SIZE], a1 + +.LL179: + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c03, t3, c03 + FADD c04, t4, c04 + + FADD c01, c03, c01 + FADD c02, c04, c02 + + +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 1, TEMP1 +#else + sub KK, 2, TEMP1 +#endif + sll TEMP1, 0 + BASE_SHIFT, TEMP2 + sll TEMP1, 1 + BASE_SHIFT, TEMP1 + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 +#endif + +#ifdef LN + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + FMUL a2, c01, t1 + FSUB c02, t1, c02 + FMUL a3, c02, c02 +#endif + +#ifdef RT + LDF [BO + 3 * SIZE], a1 + LDF [BO + 2 * SIZE], a2 + LDF [BO + 0 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a2, c02, t1 + FSUB c01, t1, c01 + FMUL a3, c01, c01 +#endif + +#ifdef LN + add C1, -1 * SIZE, C1 + add C2, -1 * SIZE, C2 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C2 + 0 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 1 * SIZE, C1 + add C2, 1 * SIZE, C2 +#endif + +#ifdef RT + sll K, 0 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 0 + BASE_SHIFT, TEMP2 + sll TEMP1, 1 + BASE_SHIFT, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 
1, KK +#endif + +.LL150: + and M, 2, I + cmp I, 0 + ble,pn %icc, .LL170 + nop + +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 1 + BASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 1 + BASE_SHIFT, TEMP1 + sll KK, 1 + BASE_SHIFT, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO + + sub K, KK, TEMP1 + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c01 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t1 + + LDF [AO + 1 * SIZE], a2 + cmp L, 0 + FMOV FZERO, c02 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t2 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, t3 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, t4 + ble,pn %icc, .LL155 + nop + +.LL152: + FADD c01, t1, c01 + add L, -1, L + FMUL a1, b1, t1 + prefetch [AO + APREFETCHSIZE * SIZE], 0 + + FADD c02, t2, c02 + add BO, 8 * SIZE, BO + FMUL a1, b2, t2 + LDF [AO + 4 * SIZE], a1 + + FADD c03, t3, c03 + cmp L, 0 + FMUL a2, b1, t3 + LDF [BO - 4 * SIZE], b1 + + FADD c04, t4, c04 + nop + FMUL a2, b2, t4 + LDF [AO + 5 * SIZE], a2 + + FADD c01, t1, c01 + nop + FMUL a3, b3, t1 + LDF [BO - 3 * SIZE], b2 + + FADD c02, t2, c02 + nop + FMUL a3, b4, t2 + LDF [AO + 6 * SIZE], a3 + + FADD c03, t3, c03 + nop + FMUL a4, b3, t3 + LDF [BO - 2 * SIZE], b3 + + FADD c04, t4, c04 + nop + FMUL a4, b4, t4 + LDF [AO + 7 * SIZE], a4 + + FADD c01, t1, c01 + nop + FMUL a1, b1, t1 + LDF [BO - 1 * SIZE], b4 + + FADD c02, t2, c02 + FMUL a1, b2, t2 + LDF [AO + 8 * SIZE], a1 + + FADD c03, t3, c03 + FMUL a2, b1, t3 + LDF [BO + 0 * SIZE], b1 + + FADD c04, t4, c04 + FMUL a2, b2, t4 + LDF [AO + 9 * SIZE], a2 + + FADD c01, t1, c01 + FMUL a3, b3, t1 + LDF [BO + 1 * SIZE], b2 + + FADD c02, t2, c02 + FMUL a3, b4, t2 + LDF [AO + 10 * SIZE], a3 + + FADD c03, t3, c03 + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD c04, t4, c04 + FMUL a4, b4, t4 + LDF [AO + 11 * SIZE], a4 + + add AO, 8 * SIZE, AO + bg,pt %icc, .LL152 + LDF [BO + 3 * SIZE], b4 + +.LL155: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL159 + nop + +.LL156: + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c03, t3, c03 + FADD c04, t4, c04 + + FMUL a1, b1, t1 + FMUL a1, b2, t2 + FMUL a2, b1, t3 + FMUL a2, b2, t4 + + add AO, 2 * SIZE, AO + add BO, 2 * SIZE, BO + + add L, -1, L + cmp L, 0 + bg,pt %icc, .LL156 + nop + +.LL159: + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c03, t3, c03 + FADD c04, t4, c04 + +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 2, TEMP1 +#else + sub KK, 2, TEMP1 +#endif + sll TEMP1, 1 + BASE_SHIFT, TEMP2 + sll TEMP1, 1 + BASE_SHIFT, TEMP1 + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 + FSUB a3, c02, c02 + FSUB a4, c04, c04 +#endif + +#ifdef LN + LDF [AO + 3 * SIZE], a1 + LDF [AO + 2 * SIZE], a2 + LDF [AO + 0 * SIZE], a3 + + FMUL a1, c03, c03 + FMUL a1, c04, c04 + FMUL a2, c03, t1 + FMUL a2, c04, t2 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + FMUL a3, c01, c01 + FMUL a3, c02, c02 +#endif + +#ifdef LT + 
LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + + FMUL a2, c01, t1 + FMUL a2, c02, t2 + + FSUB c03, t1, c03 + FSUB c04, t2, c04 + + FMUL a3, c03, c03 + FMUL a3, c04, c04 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + FMUL a1, c03, c03 + FMUL a2, c01, t1 + FMUL a2, c03, t2 + + FSUB c02, t1, c02 + FSUB c04, t2, c04 + FMUL a3, c02, c02 + FMUL a3, c04, c04 +#endif + +#ifdef RT + LDF [BO + 3 * SIZE], a1 + LDF [BO + 2 * SIZE], a2 + LDF [BO + 0 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a1, c04, c04 + + FMUL a2, c02, t1 + FMUL a2, c04, t2 + FSUB c01, t1, c01 + FSUB c03, t2, c03 + + FMUL a3, c01, c01 + FMUL a3, c03, c03 +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 + add C2, -2 * SIZE, C2 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] + STF c03, [BO + 2 * SIZE] + STF c04, [BO + 3 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c03, [AO + 1 * SIZE] + STF c02, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c03, [C1 + 1 * SIZE] + STF c02, [C2 + 0 * SIZE] + STF c04, [C2 + 1 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 2 * SIZE, C1 + add C2, 2 * SIZE, C2 +#endif + +#ifdef RT + sll K, 1 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 1 + BASE_SHIFT, TEMP2 + sll TEMP1, 1 + BASE_SHIFT, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 2, KK +#endif + +#ifdef LN + sub KK, 2, KK +#endif + +.LL170: + sra M, 2, I + cmp I, 0 + ble,pn %icc, .LL199 + FMOV FZERO, c03 + +.LL121: +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 2 + BASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 2 + BASE_SHIFT, TEMP1 + sll KK, 1 + BASE_SHIFT, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO + + sub K, KK, TEMP1 + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, t1 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, c07 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, t2 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, c04 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, t3 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, c08 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, t4 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, c01 + +#ifdef LN + prefetch [C1 - 3 * SIZE], 2 + FMOV FZERO, c05 + prefetch [C2 - 3 * SIZE], 2 + FMOV FZERO, c02 +#else + prefetch [C1 + 3 * SIZE], 2 + FMOV FZERO, c05 + prefetch [C2 + 3 * SIZE], 2 + FMOV FZERO, c02 +#endif + + ble,pn %icc, .LL125 + FMOV FZERO, c06 + +.LL122: + FADD c03, t1, c03 + add L, -1, L + FMUL a1, b1, t1 + prefetch [AO + APREFETCHSIZE * SIZE], 0 + + FADD c07, t2, c07 + add BO, 8 * SIZE, BO + FMUL a1, b2, t2 + LDF [AO + 4 * SIZE], a1 + + FADD c04, t3, c04 + add AO, 16 * SIZE, AO + FMUL a2, b1, t3 + cmp L, 0 + + FADD c08, t4, c08 + nop + FMUL a2, b2, t4 + LDF [AO - 11 * SIZE], a2 + + FADD c01, t1, c01 + nop + FMUL a3, b1, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a3, b2, t2 + LDF [AO - 10 * SIZE], a3 + + FADD c02, t3, c02 + nop + FMUL a4, b1, t3 + LDF [BO - 4 * SIZE], b1 + + FADD c06, t4, c06 + nop + FMUL a4, b2, t4 + LDF [BO - 3 * SIZE], b2 + + FADD c03, t1, c03 + nop + FMUL a1, b3, t1 + LDF [AO - 9 * SIZE], a4 + + FADD c07, t2, c07 + nop + FMUL a1, b4, t2 + LDF [AO - 8 * SIZE], a1 + + FADD c04, t3, c04 + nop + FMUL a2, b3, t3 + nop + + FADD c08, t4, c08 + nop + FMUL 
a2, b4, t4 + LDF [AO - 7 * SIZE], a2 + + FADD c01, t1, c01 + nop + FMUL a3, b3, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a3, b4, t2 + LDF [AO - 6 * SIZE], a3 + + FADD c02, t3, c02 + nop + FMUL a4, b3, t3 + LDF [BO - 2 * SIZE], b3 + + FADD c06, t4, c06 + nop + FMUL a4, b4, t4 + LDF [BO - 1 * SIZE], b4 + + FADD c03, t1, c03 + nop + FMUL a1, b1, t1 + LDF [AO - 5 * SIZE], a4 + + FADD c07, t2, c07 + nop + FMUL a1, b2, t2 + LDF [AO - 4 * SIZE], a1 + + FADD c04, t3, c04 + nop + FMUL a2, b1, t3 + nop + + FADD c08, t4, c08 + nop + FMUL a2, b2, t4 + LDF [AO - 3 * SIZE], a2 + + FADD c01, t1, c01 + nop + FMUL a3, b1, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a3, b2, t2 + LDF [AO - 2 * SIZE], a3 + + FADD c02, t3, c02 + nop + FMUL a4, b1, t3 + LDF [BO + 0 * SIZE], b1 + + FADD c06, t4, c06 + nop + FMUL a4, b2, t4 + LDF [BO + 1 * SIZE], b2 + + FADD c03, t1, c03 + nop + FMUL a1, b3, t1 + LDF [AO - 1 * SIZE], a4 + + FADD c07, t2, c07 + nop + FMUL a1, b4, t2 + LDF [AO + 0 * SIZE], a1 + + FADD c04, t3, c04 + nop + FMUL a2, b3, t3 + nop + + FADD c08, t4, c08 + nop + FMUL a2, b4, t4 + LDF [AO + 1 * SIZE], a2 + + FADD c01, t1, c01 + nop + FMUL a3, b3, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a3, b4, t2 + LDF [AO + 2 * SIZE], a3 + + FADD c02, t3, c02 + nop + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD c06, t4, c06 + FMUL a4, b4, t4 + LDF [AO + 3 * SIZE], a4 + + bg,pt %icc, .LL122 + LDF [BO + 3 * SIZE], b4 + +.LL125: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL129 + nop + +.LL126: + FADD c03, t1, c03 + add AO, 4 * SIZE, AO + FMUL a1, b1, t1 + add BO, 2 * SIZE, BO + + FADD c07, t2, c07 + add L, -1, L + FMUL a1, b2, t2 + LDF [AO + 0 * SIZE], a1 + + FADD c04, t3, c04 + cmp L, 0 + FMUL a2, b1, t3 + + FADD c08, t4, c08 + FMUL a2, b2, t4 + LDF [AO + 1 * SIZE], a2 + + FADD c01, t1, c01 + FMUL a3, b1, t1 + FADD c05, t2, c05 + FMUL a3, b2, t2 + LDF [AO + 2 * SIZE], a3 + + FADD c02, t3, c02 + FMUL a4, b1, t3 + LDF [BO + 0 * SIZE], b1 + FADD c06, t4, c06 + FMUL a4, b2, t4 + LDF [BO + 1 * SIZE], b2 + bg,pt %icc, .LL126 + LDF [AO + 3 * SIZE], a4 + +.LL129: + FADD c03, t1, c03 + FADD c07, t2, c07 + FADD c04, t3, c04 + FADD c08, t4, c08 + +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 4, TEMP1 +#else + sub KK, 2, TEMP1 +#endif + sll TEMP1, 2 + BASE_SHIFT, TEMP2 + sll TEMP1, 1 + BASE_SHIFT, TEMP1 + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c05, c05 + FSUB a3, c02, c02 + FSUB a4, c06, c06 + + FSUB b1, c03, c03 + FSUB b2, c07, c07 + FSUB b3, c04, c04 + FSUB b4, c08, c08 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [AO + 4 * SIZE], b1 + LDF [AO + 5 * SIZE], b2 + LDF [AO + 6 * SIZE], b3 + LDF [AO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 + + FSUB b1, c05, c05 + FSUB b2, c06, c06 + FSUB b3, c07, c07 + FSUB b4, c08, c08 +#endif + +#ifdef LN + LDF [AO + 15 * SIZE], a1 + LDF [AO + 14 * SIZE], a2 + LDF [AO + 13 * SIZE], a3 + LDF [AO + 12 * SIZE], a4 + + FMUL a1, c04, c04 + FMUL a1, c08, c08 + FMUL a2, c04, t1 + FMUL a2, c08, t2 + + FSUB c03, t1, c03 + FSUB c07, t2, c07 + FMUL a3, c04, t1 + FMUL a3, c08, t2 + + FSUB c02, t1, c02 + FSUB c06, t2, c06 
+ FMUL a4, c04, t1 + FMUL a4, c08, t2 + + FSUB c01, t1, c01 + FSUB c05, t2, c05 + + LDF [AO + 10 * SIZE], a1 + LDF [AO + 9 * SIZE], a2 + LDF [AO + 8 * SIZE], a3 + + FMUL a1, c03, c03 + FMUL a1, c07, c07 + FMUL a2, c03, t1 + FMUL a2, c07, t2 + + FSUB c02, t1, c02 + FSUB c06, t2, c06 + FMUL a3, c03, t1 + FMUL a3, c07, t2 + + FSUB c01, t1, c01 + FSUB c05, t2, c05 + + LDF [AO + 5 * SIZE], a1 + LDF [AO + 4 * SIZE], a2 + + FMUL a1, c02, c02 + FMUL a1, c06, c06 + FMUL a2, c02, t1 + FMUL a2, c06, t2 + + FSUB c01, t1, c01 + FSUB c05, t2, c05 + + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c05, c05 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FMUL a1, c01, c01 + FMUL a1, c05, c05 + FMUL a2, c01, t1 + FMUL a2, c05, t2 + + FSUB c02, t1, c02 + FSUB c06, t2, c06 + FMUL a3, c01, t1 + FMUL a3, c05, t2 + + FSUB c03, t1, c03 + FSUB c07, t2, c07 + FMUL a4, c01, t1 + FMUL a4, c05, t2 + + FSUB c04, t1, c04 + FSUB c08, t2, c08 + + LDF [AO + 5 * SIZE], a1 + LDF [AO + 6 * SIZE], a2 + LDF [AO + 7 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a1, c06, c06 + FMUL a2, c02, t1 + FMUL a2, c06, t2 + + FSUB c03, t1, c03 + FSUB c07, t2, c07 + FMUL a3, c02, t1 + FMUL a3, c06, t2 + FSUB c04, t1, c04 + FSUB c08, t2, c08 + + LDF [AO + 10 * SIZE], a1 + LDF [AO + 11 * SIZE], a2 + + FMUL a1, c03, c03 + FMUL a1, c07, c07 + FMUL a2, c03, t1 + FMUL a2, c07, t2 + + FSUB c04, t1, c04 + FSUB c08, t2, c08 + + LDF [AO + 15 * SIZE], a1 + + FMUL a1, c04, c04 + FMUL a1, c08, c08 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + FMUL a1, c03, c03 + FMUL a1, c04, c04 + + FMUL a2, c01, t1 + FMUL a2, c02, t2 + FMUL a2, c03, t3 + FMUL a2, c04, t4 + + FSUB c05, t1, c05 + FSUB c06, t2, c06 + FSUB c07, t3, c07 + FSUB c08, t4, c08 + + FMUL a3, c05, c05 + FMUL a3, c06, c06 + FMUL a3, c07, c07 + FMUL a3, c08, c08 +#endif + +#ifdef RT + LDF [BO + 3 * SIZE], a1 + LDF [BO + 2 * SIZE], a2 + LDF [BO + 0 * SIZE], a3 + + FMUL a1, c05, c05 + FMUL a1, c06, c06 + FMUL a1, c07, c07 + FMUL a1, c08, c08 + + FMUL a2, c05, t1 + FMUL a2, c06, t2 + FMUL a2, c07, t3 + FMUL a2, c08, t4 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + FSUB c03, t3, c03 + FSUB c04, t4, c04 + + FMUL a3, c01, c01 + FMUL a3, c02, c02 + FMUL a3, c03, c03 + FMUL a3, c04, c04 +#endif + +#ifdef LN + add C1, -4 * SIZE, C1 + add C2, -4 * SIZE, C2 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c05, [BO + 1 * SIZE] + STF c02, [BO + 2 * SIZE] + STF c06, [BO + 3 * SIZE] + + STF c03, [BO + 4 * SIZE] + STF c07, [BO + 5 * SIZE] + STF c04, [BO + 6 * SIZE] + STF c08, [BO + 7 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] + + STF c05, [AO + 4 * SIZE] + STF c06, [AO + 5 * SIZE] + STF c07, [AO + 6 * SIZE] + STF c08, [AO + 7 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C1 + 2 * SIZE] + STF c04, [C1 + 3 * SIZE] + + STF c05, [C2 + 0 * SIZE] + STF c06, [C2 + 1 * SIZE] + STF c07, [C2 + 2 * SIZE] + STF c08, [C2 + 3 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 4 * SIZE, C1 + add C2, 4 * SIZE, C2 +#endif + +#ifdef RT + sll K, 2 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 2 + BASE_SHIFT, TEMP2 + sll TEMP1, 1 + BASE_SHIFT, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO 
+#endif + +#ifdef LT + add KK, 4, KK +#endif + +#ifdef LN + sub KK, 4, KK +#endif + + add I, -1, I + cmp I, 0 + + bg,pt %icc, .LL121 + FMOV FZERO, c03 + +.LL199: +#ifdef LN + sll K, 1 + BASE_SHIFT, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 2, KK +#endif + +#ifdef RT + sub KK, 2, KK +#endif + +.LL200: + and N, 1, J + + cmp J, 0 + ble,pn %icc, .LL999 + nop + +#ifdef RT + sll K, 0 + BASE_SHIFT, TEMP1 + sub B, TEMP1, B + + sub C, LDC, C +#endif + + mov C, C1 + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + +#ifndef RT + add C, LDC, C +#endif + + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL250 + nop + +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 0 + BASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 0 + BASE_SHIFT, TEMP1 + + add AORIG, TEMP1, AO + add B, TEMP1, BO + + sub K, KK, TEMP1 + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, t1 + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c01 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, t2 + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c02 + + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t3 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t4 + LDF [BO + 2 * SIZE], b3 + + ble,pn %icc, .LL275 + LDF [BO + 3 * SIZE], b4 + +.LL272: + FADD c01, t1, c01 + add L, -1, L + add AO, 4 * SIZE, AO + + FMUL a1, b1, t1 + add BO, 4 * SIZE, BO + LDF [AO + 0 * SIZE], a1 + + FADD c02, t2, c02 + cmp L, 0 + LDF [BO + 0 * SIZE], b1 + FMUL a2, b2, t2 + + LDF [AO + 1 * SIZE], a2 + FADD c01, t3, c01 + LDF [BO + 1 * SIZE], b2 + FMUL a3, b3, t3 + + LDF [AO + 2 * SIZE], a3 + FADD c02, t4, c02 + LDF [BO + 2 * SIZE], b3 + FMUL a4, b4, t4 + LDF [AO + 3 * SIZE], a4 + + bg,pt %icc, .LL272 + LDF [BO + 3 * SIZE], b4 + +.LL275: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL279 + nop + +.LL276: + FADD c01, t1, c01 + add L, -1, L + FMUL a1, b1, t1 + LDF [AO + 1 * SIZE], a1 + + LDF [BO + 1 * SIZE], b1 + add BO, 1 * SIZE, BO + cmp L, 0 + bg,pt %icc, .LL276 + add AO, 1 * SIZE, AO + +.LL279: + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c01, t3, c01 + FADD c02, t4, c02 + + FADD c01, c02, c01 + +#if defined(LN) || defined(RT) + sub KK, 1, TEMP1 + sll TEMP1, 0 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + FSUB a1, c01, c01 +#else + LDF [AO + 0 * SIZE], a1 + FSUB a1, c01, c01 +#endif + +#ifdef LN + LDF [AO + 0 * SIZE], a1 + FMUL a1, c01, c01 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + FMUL a1, c01, c01 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + FMUL a1, c01, c01 +#endif + +#ifdef RT + LDF [BO + 0 * SIZE], a1 + FMUL a1, c01, c01 +#endif + +#ifdef LN + add C1, -1 * SIZE, C1 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] +#else + STF c01, [AO + 0 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 1 * SIZE, C1 +#endif + +#ifdef RT + sll K, 0 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 0 + BASE_SHIFT, TEMP1 + add AO, TEMP1, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + +.LL250: + and M, 2, I + cmp I, 0 + ble,pn %icc, .LL270 + nop + +#if defined(LT) || defined(RN) + 
sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 1 + BASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 1 + BASE_SHIFT, TEMP1 + sll KK, 0 + BASE_SHIFT, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO + + sub K, KK, TEMP1 + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c01 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t1 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c02 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t2 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, t3 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, t4 + + ble,pn %icc, .LL255 + nop + +.LL252: + FADD c01, t1, c01 + add L, -1, L + FMUL a1, b1, t1 + LDF [AO + 4 * SIZE], a1 + + FADD c02, t2, c02 + FMUL a2, b1, t2 + LDF [AO + 5 * SIZE], a2 + LDF [BO + 4 * SIZE], b1 + + FADD c03, t3, c03 + cmp L, 0 + FMUL a3, b2, t3 + LDF [AO + 6 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b2, t4 + LDF [AO + 7 * SIZE], a4 + LDF [BO + 5 * SIZE], b2 + + FADD c01, t1, c01 + FMUL a1, b3, t1 + LDF [AO + 8 * SIZE], a1 + + FADD c02, t2, c02 + FMUL a2, b3, t2 + LDF [AO + 9 * SIZE], a2 + LDF [BO + 6 * SIZE], b3 + + FADD c03, t3, c03 + FMUL a3, b4, t3 + LDF [AO + 10 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b4, t4 + LDF [AO + 11 * SIZE], a4 + add AO, 8 * SIZE, AO + + LDF [BO + 7 * SIZE], b4 + bg,pt %icc, .LL252 + add BO, 4 * SIZE, BO + +.LL255: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + + cmp L, 0 + ble,a,pn %icc, .LL259 + nop + +.LL256: + FADD c01, t1, c01 + add L, -1, L + FMUL a1, b1, t1 + LDF [AO + 2 * SIZE], a1 + + FADD c02, t2, c02 + cmp L, 0 + FMUL a2, b1, t2 + LDF [AO + 3 * SIZE], a2 + + LDF [BO + 1 * SIZE], b1 + add AO, 2 * SIZE, AO + + bg,pt %icc, .LL256 + add BO, 1 * SIZE, BO + +.LL259: + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c03, t3, c03 + FADD c04, t4, c04 + + FADD c01, c03, c01 + FADD c02, c04, c02 + +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 2, TEMP1 +#else + sub KK, 1, TEMP1 +#endif + sll TEMP1, 1 + BASE_SHIFT, TEMP2 + sll TEMP1, 0 + BASE_SHIFT, TEMP1 + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 +#endif + +#ifdef LN + LDF [AO + 3 * SIZE], a1 + LDF [AO + 2 * SIZE], a2 + LDF [AO + 0 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a2, c02, t1 + FSUB c01, t1, c01 + FMUL a3, c01, c01 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + FMUL a2, c01, t1 + FSUB c02, t1, c02 + FMUL a3, c02, c02 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 +#endif + +#ifdef RT + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 2 * SIZE, C1 +#endif + +#ifdef RT + sll K, 1 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 1 + BASE_SHIFT, TEMP2 + sll TEMP1, 0 + BASE_SHIFT, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO 
+#endif + +#ifdef LT + add KK, 2, KK +#endif + +#ifdef LN + sub KK, 2, KK +#endif + +.LL270: + sra M, 2, I + cmp I, 0 + ble,pn %icc, .LL299 + nop + +.LL221: +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 2 + BASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 2 + BASE_SHIFT, TEMP1 + sll KK, 0 + BASE_SHIFT, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO + + sub K, KK, TEMP1 + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c01 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t1 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c02 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t2 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, t3 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, t4 + +#ifdef LN + prefetch [C1 - 3 * SIZE], 2 +#else + prefetch [C1 + 3 * SIZE], 2 +#endif + + ble,pn %icc, .LL225 + prefetch [C1 + 4 * SIZE], 2 + +.LL222: + FADD c01, t1, c01 + add BO, 4 * SIZE, BO + FMUL a1, b1, t1 + LDF [AO + 4 * SIZE], a1 + + FADD c02, t2, c02 + FMUL a2, b1, t2 + LDF [AO + 5 * SIZE], a2 + + FADD c03, t3, c03 + add L, -1, L + FMUL a3, b1, t3 + LDF [AO + 6 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b1, t4 + LDF [AO + 7 * SIZE], a4 + LDF [BO + 0 * SIZE], b1 + + FADD c01, t1, c01 + cmp L, 0 + FMUL a1, b2, t1 + LDF [AO + 8 * SIZE], a1 + + FADD c02, t2, c02 + FMUL a2, b2, t2 + LDF [AO + 9 * SIZE], a2 + + FADD c03, t3, c03 + FMUL a3, b2, t3 + LDF [AO + 10 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b2, t4 + LDF [AO + 11 * SIZE], a4 + LDF [BO + 1 * SIZE], b2 + + FADD c01, t1, c01 + FMUL a1, b3, t1 + LDF [AO + 12 * SIZE], a1 + + FADD c02, t2, c02 + FMUL a2, b3, t2 + LDF [AO + 13 * SIZE], a2 + + FADD c03, t3, c03 + FMUL a3, b3, t3 + LDF [AO + 14 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b3, t4 + LDF [AO + 15 * SIZE], a4 + LDF [BO + 2 * SIZE], b3 + + FADD c01, t1, c01 + FMUL a1, b4, t1 + LDF [AO + 16 * SIZE], a1 + + FADD c02, t2, c02 + FMUL a2, b4, t2 + LDF [AO + 17 * SIZE], a2 + + FADD c03, t3, c03 + FMUL a3, b4, t3 + LDF [AO + 18 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b4, t4 + LDF [AO + 19 * SIZE], a4 + add AO, 16 * SIZE, AO + + bg,pt %icc, .LL222 + LDF [BO + 3 * SIZE], b4 + +.LL225: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL229 + nop + +.LL226: + FADD c01, t1, c01 + add BO, 1 * SIZE, BO + FMUL a1, b1, t1 + LDF [AO + 4 * SIZE], a1 + + FADD c02, t2, c02 + add L, -1, L + FMUL a2, b1, t2 + LDF [AO + 5 * SIZE], a2 + + FADD c03, t3, c03 + cmp L, 0 + FMUL a3, b1, t3 + LDF [AO + 6 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b1, t4 + LDF [AO + 7 * SIZE], a4 + add AO, 4 * SIZE, AO + + bg,pt %icc, .LL226 + LDF [BO + 0 * SIZE], b1 + +.LL229: + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c03, t3, c03 + FADD c04, t4, c04 + +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 4, TEMP1 +#else + sub KK, 1, TEMP1 +#endif + sll TEMP1, 2 + BASE_SHIFT, TEMP2 + sll TEMP1, 0 + BASE_SHIFT, TEMP1 + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 +#endif + +#ifdef LN + LDF [AO + 15 * SIZE], a1 + 
LDF [AO + 14 * SIZE], a2 + LDF [AO + 13 * SIZE], a3 + LDF [AO + 12 * SIZE], a4 + + FMUL a1, c04, c04 + FMUL a2, c04, t1 + + FSUB c03, t1, c03 + FMUL a3, c04, t1 + + FSUB c02, t1, c02 + FMUL a4, c04, t1 + + FSUB c01, t1, c01 + + LDF [AO + 10 * SIZE], a1 + LDF [AO + 9 * SIZE], a2 + LDF [AO + 8 * SIZE], a3 + + FMUL a1, c03, c03 + FMUL a2, c03, t1 + + FSUB c02, t1, c02 + FMUL a3, c03, t1 + FSUB c01, t1, c01 + + LDF [AO + 5 * SIZE], a1 + LDF [AO + 4 * SIZE], a2 + + FMUL a1, c02, c02 + FMUL a2, c02, t1 + FSUB c01, t1, c01 + + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FMUL a1, c01, c01 + FMUL a2, c01, t1 + FSUB c02, t1, c02 + FMUL a3, c01, t1 + FSUB c03, t1, c03 + FMUL a4, c01, t1 + FSUB c04, t1, c04 + + LDF [AO + 5 * SIZE], a1 + LDF [AO + 6 * SIZE], a2 + LDF [AO + 7 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a2, c02, t1 + FSUB c03, t1, c03 + FMUL a3, c02, t1 + FSUB c04, t1, c04 + + LDF [AO + 10 * SIZE], a1 + LDF [AO + 11 * SIZE], a2 + + FMUL a1, c03, c03 + FMUL a2, c03, t1 + + FSUB c04, t1, c04 + + LDF [AO + 15 * SIZE], a1 + + FMUL a1, c04, c04 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + FMUL a1, c03, c03 + FMUL a1, c04, c04 +#endif + +#ifdef RT + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + FMUL a1, c03, c03 + FMUL a1, c04, c04 +#endif + +#ifdef LN + add C1, -4 * SIZE, C1 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] + STF c03, [BO + 2 * SIZE] + STF c04, [BO + 3 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C1 + 2 * SIZE] + STF c04, [C1 + 3 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 4 * SIZE, C1 +#endif + +#ifdef RT + sll K, 2 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 2 + BASE_SHIFT, TEMP2 + sll TEMP1, 0 + BASE_SHIFT, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 4, KK +#endif + +#ifdef LN + sub KK, 4, KK +#endif + + add I, -1, I + cmp I, 0 + + bg,pt %icc, .LL221 + nop + + + +.LL299: +#ifdef LN + sll K, 0 + BASE_SHIFT, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 1, KK +#endif + +#ifdef RT + sub KK, 1, KK +#endif + + +.LL999: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/trsm_kernel_LN_2x8.S b/kernel/sparc/trsm_kernel_LN_2x8.S new file mode 100644 index 0000000000..a70f0e4284 --- /dev/null +++ b/kernel/sparc/trsm_kernel_LN_2x8.S @@ -0,0 +1,3897 @@ +/*********************************************************************/ +/* Copyright 2005-2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define APREFETCHSIZE 24 +#define APREFETCH_CATEGORY 0 + +#define M %i0 +#define N %i1 +#define K %i2 + +#if defined(DOUBLE) && !defined(__64BIT__) +#define A %i5 +#define B %i4 +#else +#define A %i4 +#define B %i5 +#endif + +#define C %o4 +#define LDC %o5 + +#define AO %l0 +#define BO %l1 +#define I %l2 +#define J %l3 +#define L %l4 + +#define C1 %o0 +#define C2 %o1 +#define C3 %o2 +#define C4 %o3 + +#define C5 %l5 +#define C6 %l6 +#define C7 %l7 +#define C8 %i3 + +#define OFFSET %g1 +#define KK %g2 +#define TEMP1 %g3 +#define TEMP2 %g4 +#define AORIG %o7 + +#ifdef DOUBLE +#define c01 %f0 +#define c02 %f2 +#define c03 %f4 +#define c04 %f6 +#define c05 %f8 +#define c06 %f10 +#define c07 %f12 +#define c08 %f14 +#define c09 %f16 +#define c10 %f18 +#define c11 %f20 +#define c12 %f22 +#define c13 %f24 +#define c14 %f26 +#define c15 %f28 +#define c16 %f30 + +#define a1 %f32 +#define a2 %f34 +#define a3 %f36 +#define a4 %f38 +#define a5 %f40 + +#define b1 %f42 +#define b2 %f44 +#define b3 %f46 +#define b4 %f48 +#define b5 %f50 +#define b6 %f52 +#define b7 %f54 +#define b8 %f56 +#define b9 %f58 + +#define cc01 0 +#define cc02 2 +#define cc03 4 +#define cc04 6 +#define cc05 8 +#define cc06 10 +#define cc07 12 +#define cc08 14 +#define cc09 16 +#define cc10 18 +#define cc11 20 +#define cc12 22 +#define cc13 24 +#define cc14 26 +#define cc15 28 +#define cc16 30 + +#define aa1 1 +#define aa2 3 +#define aa3 5 +#define aa4 7 +#define aa5 9 + +#define bb1 11 +#define bb2 13 +#define bb3 15 +#define bb4 17 +#define bb5 19 +#define bb6 21 +#define bb7 23 +#define bb8 25 +#define bb9 27 + +#else +#define c01 %f0 +#define c02 %f1 +#define c03 %f2 +#define c04 %f3 +#define c05 %f4 +#define c06 %f5 +#define c07 %f6 +#define c08 %f7 +#define c09 %f8 +#define c10 %f9 +#define c11 %f10 +#define c12 %f11 +#define c13 %f12 +#define c14 %f13 +#define c15 %f14 +#define c16 %f15 + +#define a1 %f16 +#define a2 %f17 +#define a3 %f18 +#define a4 %f19 +#define a5 %f20 + +#define b1 %f21 +#define b2 %f22 +#define b3 %f23 +#define b4 %f24 +#define b5 %f25 +#define b6 %f26 
+#define b7 %f27 +#define b8 %f28 +#define b9 %f29 + +#define cc01 0 +#define cc02 1 +#define cc03 2 +#define cc04 3 +#define cc05 4 +#define cc06 5 +#define cc07 6 +#define cc08 7 +#define cc09 8 +#define cc10 9 +#define cc11 10 +#define cc12 11 +#define cc13 12 +#define cc14 13 +#define cc15 14 +#define cc16 15 + +#define aa1 16 +#define aa2 17 +#define aa3 18 +#define aa4 19 +#define aa5 20 + +#define bb1 21 +#define bb2 22 +#define bb3 23 +#define bb4 24 +#define bb5 25 +#define bb6 26 +#define bb7 27 +#define bb8 28 +#define bb9 29 + +#endif + + .register %g2, #scratch + .register %g3, #scratch + + PROLOGUE + SAVESP + nop + +#ifndef __64BIT__ + +#ifdef DOUBLE + ld [%sp + STACK_START + 28], B + ld [%sp + STACK_START + 32], C + ld [%sp + STACK_START + 36], LDC + ld [%sp + STACK_START + 40], OFFSET +#else + ld [%sp + STACK_START + 28], C + ld [%sp + STACK_START + 32], LDC + ld [%sp + STACK_START + 36], OFFSET +#endif + + st %g1, [%sp + STACK_START + 8] + st %g2, [%sp + STACK_START + 12] + st %g3, [%sp + STACK_START + 16] + st %g4, [%sp + STACK_START + 20] +#else + + ldx [%sp+ STACK_START + 56], C + ldx [%sp+ STACK_START + 64], LDC + ldx [%sp+ STACK_START + 72], OFFSET + + stx %g1, [%sp + STACK_START + 32] + stx %g2, [%sp + STACK_START + 40] + stx %g3, [%sp + STACK_START + 48] + stx %g4, [%sp + STACK_START + 56] +#endif + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg OFFSET, KK +#endif + + sll LDC, BASE_SHIFT, LDC + +#ifdef LN + smul M, K, TEMP1 + sll TEMP1, BASE_SHIFT, TEMP1 + add A, TEMP1, A + + sll M, BASE_SHIFT, TEMP1 + add C, TEMP1, C +#endif + +#ifdef RN + neg OFFSET, KK +#endif + +#ifdef RT + smul N, K, TEMP1 + sll TEMP1, BASE_SHIFT, TEMP1 + add B, TEMP1, B + + smul N, LDC, TEMP1 + add C, TEMP1, C + + sub N, OFFSET, KK +#endif + + sra N, 3, J + cmp J, 0 + ble,pn %icc, .LL30 + nop + .align 4 + +.LL11: +#ifdef RT + sll K, BASE_SHIFT + 3, TEMP1 + sub B, TEMP1, B +#endif + +#ifndef RT + mov C, C1 + add C, LDC, C2 + add C2, LDC, C3 + add C3, LDC, C4 + add C4, LDC, C5 + add C5, LDC, C6 + add C6, LDC, C7 + add C7, LDC, C8 + add C8, LDC, C +#else + sub C, LDC, C8 + sub C8, LDC, C7 + sub C7, LDC, C6 + sub C6, LDC, C5 + sub C5, LDC, C4 + sub C4, LDC, C3 + sub C3, LDC, C2 + sub C2, LDC, C1 + sub C2, LDC, C +#endif + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL20 + nop + +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TEMP1 + sll KK, BASE_SHIFT + 3, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [BO + 0 * SIZE], b1 + FCLR (cc01) + LDF [BO + 1 * SIZE], b2 + FCLR (cc03) + LDF [BO + 2 * SIZE], b3 + FCLR (cc05) + LDF [BO + 3 * SIZE], b4 + FCLR (cc07) + LDF [BO + 4 * SIZE], b5 + FCLR (cc09) + LDF [BO + 5 * SIZE], b6 + FCLR (cc11) + LDF [BO + 6 * SIZE], b7 + FCLR (cc13) + LDF [BO + 7 * SIZE], b8 + FCLR (cc15) + +#if defined(LT) || defined(RN) + sra KK, 2, L +#else + sub K, KK, L + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL25 + LDF [BO + 8 * SIZE], b9 + .align 4 + +.LL23: + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + add L, -1, L + + FMADD (aa1, bb1, cc01, cc01) + LDF [BO + 16 * SIZE], b1 + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 9 * SIZE], b2 + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 10 
* SIZE], b3 + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 11 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + LDF [BO + 12 * SIZE], b5 + FMADD (aa1, bb6, cc11, cc11) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa1, bb7, cc13, cc13) + LDF [BO + 14 * SIZE], b7 + FMADD (aa1, bb8, cc15, cc15) + LDF [BO + 15 * SIZE], b8 + + FMADD (aa2, bb9, cc01, cc01) + LDF [BO + 24 * SIZE], b9 + FMADD (aa2, bb2, cc03, cc03) + LDF [BO + 17 * SIZE], b2 + + FMADD (aa2, bb3, cc05, cc05) + LDF [BO + 18 * SIZE], b3 + FMADD (aa2, bb4, cc07, cc07) + LDF [BO + 19 * SIZE], b4 + + FMADD (aa2, bb5, cc09, cc09) + LDF [BO + 20 * SIZE], b5 + FMADD (aa2, bb6, cc11, cc11) + LDF [BO + 21 * SIZE], b6 + + FMADD (aa2, bb7, cc13, cc13) + LDF [BO + 22 * SIZE], b7 + FMADD (aa2, bb8, cc15, cc15) + LDF [BO + 23 * SIZE], b8 + + LDF [AO + 4 * SIZE], a1 + LDF [AO + 5 * SIZE], a2 + + FMADD (aa3, bb1, cc01, cc01) + LDF [BO + 32 * SIZE], b1 + FMADD (aa3, bb2, cc03, cc03) + LDF [BO + 25 * SIZE], b2 + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 26 * SIZE], b3 + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 27 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [BO + 28 * SIZE], b5 + FMADD (aa3, bb6, cc11, cc11) + LDF [BO + 29 * SIZE], b6 + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 30 * SIZE], b7 + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 31 * SIZE], b8 + + FMADD (aa4, bb9, cc01, cc01) + LDF [BO + 40 * SIZE], b9 + FMADD (aa4, bb2, cc03, cc03) + LDF [BO + 33 * SIZE], b2 + + FMADD (aa4, bb3, cc05, cc05) + LDF [BO + 34 * SIZE], b3 + FMADD (aa4, bb4, cc07, cc07) + LDF [BO + 35 * SIZE], b4 + + FMADD (aa4, bb5, cc09, cc09) + LDF [BO + 36 * SIZE], b5 + FMADD (aa4, bb6, cc11, cc11) + LDF [BO + 37 * SIZE], b6 + + FMADD (aa4, bb7, cc13, cc13) + LDF [BO + 38 * SIZE], b7 + FMADD (aa4, bb8, cc15, cc15) + LDF [BO + 39 * SIZE], b8 + + LDF [AO + 6 * SIZE], a3 + LDF [AO + 7 * SIZE], a4 + + add AO, 4 * SIZE, AO + cmp L, 0 + bg,pt %icc, .LL23 + add BO, 32 * SIZE, BO + .align 4 + +.LL25: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + sub K, KK, L + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL28 + nop + .align 4 + +.LL27: + FMADD (aa1, bb1, cc01, cc01) + LDF [BO + 8 * SIZE], b1 + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 9 * SIZE], b2 + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 10 * SIZE], b3 + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 11 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + LDF [BO + 12 * SIZE], b5 + FMADD (aa1, bb6, cc11, cc11) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa1, bb7, cc13, cc13) + LDF [BO + 14 * SIZE], b7 + FMADD (aa1, bb8, cc15, cc15) + LDF [BO + 15 * SIZE], b8 + + LDF [AO + 1 * SIZE], a1 + add AO, 1 * SIZE, AO + + add L, -1, L + cmp L, 0 + bg,pt %icc, .LL27 + add BO, 8 * SIZE, BO + .align 4 + +.LL28: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 1, TEMP1 +#else + sub KK, 8, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 0, TEMP2 + sll TEMP1, BASE_SHIFT + 3, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 + FSUB a3, c05, c05 + FSUB a4, c07, c07 + + FSUB b1, c09, c09 + FSUB b2, c11, c11 + FSUB b3, c13, c13 + FSUB b4, c15, c15 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [AO + 4 * SIZE], b1 + LDF [AO + 5 * SIZE], b2 + LDF [AO + 6 * SIZE], b3 + LDF [AO + 7 * SIZE], b4 + + FSUB a1, c01, 
c01 + FSUB a2, c03, c03 + FSUB a3, c05, c05 + FSUB a4, c07, c07 + + FSUB b1, c09, c09 + FSUB b2, c11, c11 + FSUB b3, c13, c13 + FSUB b4, c15, c15 +#endif + +#if defined(LN) || defined(LT) + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c03, c03 + FMUL a1, c05, c05 + FMUL a1, c07, c07 + FMUL a1, c09, c09 + FMUL a1, c11, c11 + FMUL a1, c13, c13 + FMUL a1, c15, c15 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 + + FMUL a1, c01, c01 + + FNMSUB (aa2, cc01, cc03, cc03) + FNMSUB (aa3, cc01, cc05, cc05) + FNMSUB (aa4, cc01, cc07, cc07) + FNMSUB (bb1, cc01, cc09, cc09) + FNMSUB (bb2, cc01, cc11, cc11) + FNMSUB (bb3, cc01, cc13, cc13) + FNMSUB (bb4, cc01, cc15, cc15) + + LDF [BO + 9 * SIZE], a1 + LDF [BO + 10 * SIZE], a2 + LDF [BO + 11 * SIZE], a3 + LDF [BO + 12 * SIZE], a4 + LDF [BO + 13 * SIZE], b1 + LDF [BO + 14 * SIZE], b2 + LDF [BO + 15 * SIZE], b3 + + FMUL a1, c03, c03 + + FNMSUB (aa2, cc03, cc05, cc05) + FNMSUB (aa3, cc03, cc07, cc07) + FNMSUB (aa4, cc03, cc09, cc09) + FNMSUB (bb1, cc03, cc11, cc11) + FNMSUB (bb2, cc03, cc13, cc13) + FNMSUB (bb3, cc03, cc15, cc15) + + LDF [BO + 18 * SIZE], a1 + LDF [BO + 19 * SIZE], a2 + LDF [BO + 20 * SIZE], a3 + LDF [BO + 21 * SIZE], a4 + LDF [BO + 22 * SIZE], b1 + LDF [BO + 23 * SIZE], b2 + + FMUL a1, c05, c05 + + FNMSUB (aa2, cc05, cc07, cc07) + FNMSUB (aa3, cc05, cc09, cc09) + FNMSUB (aa4, cc05, cc11, cc11) + FNMSUB (bb1, cc05, cc13, cc13) + FNMSUB (bb2, cc05, cc15, cc15) + + LDF [BO + 27 * SIZE], a1 + LDF [BO + 28 * SIZE], a2 + LDF [BO + 29 * SIZE], a3 + LDF [BO + 30 * SIZE], a4 + LDF [BO + 31 * SIZE], b1 + + FMUL a1, c07, c07 + + FNMSUB (aa2, cc07, cc09, cc09) + FNMSUB (aa3, cc07, cc11, cc11) + FNMSUB (aa4, cc07, cc13, cc13) + FNMSUB (bb1, cc07, cc15, cc15) + + LDF [BO + 36 * SIZE], a1 + LDF [BO + 37 * SIZE], a2 + LDF [BO + 38 * SIZE], a3 + LDF [BO + 39 * SIZE], a4 + + FMUL a1, c09, c09 + + FNMSUB (aa2, cc09, cc11, cc11) + FNMSUB (aa3, cc09, cc13, cc13) + FNMSUB (aa4, cc09, cc15, cc15) + + LDF [BO + 45 * SIZE], a1 + LDF [BO + 46 * SIZE], a2 + LDF [BO + 47 * SIZE], a3 + + FMUL a1, c11, c11 + + FNMSUB (aa2, cc11, cc13, cc13) + FNMSUB (aa3, cc11, cc15, cc15) + + LDF [BO + 54 * SIZE], a1 + LDF [BO + 55 * SIZE], a2 + + FMUL a1, c13, c13 + + FNMSUB (aa2, cc13, cc15, cc15) + + LDF [BO + 63 * SIZE], a1 + + FMUL a1, c15, c15 +#endif + +#ifdef RT + LDF [BO + 63 * SIZE], a1 + LDF [BO + 62 * SIZE], a2 + LDF [BO + 61 * SIZE], a3 + LDF [BO + 60 * SIZE], a4 + LDF [BO + 59 * SIZE], b1 + LDF [BO + 58 * SIZE], b2 + LDF [BO + 57 * SIZE], b3 + LDF [BO + 56 * SIZE], b4 + + FMUL a1, c15, c15 + + FNMSUB (aa2, cc15, cc13, cc13) + FNMSUB (aa3, cc15, cc11, cc11) + FNMSUB (aa4, cc15, cc09, cc09) + FNMSUB (bb1, cc15, cc07, cc07) + FNMSUB (bb2, cc15, cc05, cc05) + FNMSUB (bb3, cc15, cc03, cc03) + FNMSUB (bb4, cc15, cc01, cc01) + + LDF [BO + 54 * SIZE], a1 + LDF [BO + 53 * SIZE], a2 + LDF [BO + 52 * SIZE], a3 + LDF [BO + 51 * SIZE], a4 + LDF [BO + 50 * SIZE], b1 + LDF [BO + 49 * SIZE], b2 + LDF [BO + 48 * SIZE], b3 + + FMUL a1, c13, c13 + + FNMSUB (aa2, cc13, cc11, cc11) + FNMSUB (aa3, cc13, cc09, cc09) + FNMSUB (aa4, cc13, cc07, cc07) + FNMSUB (bb1, cc13, cc05, cc05) + FNMSUB (bb2, cc13, cc03, cc03) + FNMSUB (bb3, cc13, cc01, cc01) + + LDF [BO + 45 * SIZE], a1 + LDF [BO + 44 * SIZE], a2 + LDF [BO + 43 * SIZE], a3 + LDF [BO + 42 * SIZE], a4 + LDF [BO + 41 * SIZE], b1 + LDF [BO + 40 * SIZE], 
b2 + + FMUL a1, c11, c11 + + FNMSUB (aa2, cc11, cc09, cc09) + FNMSUB (aa3, cc11, cc07, cc07) + FNMSUB (aa4, cc11, cc05, cc05) + FNMSUB (bb1, cc11, cc03, cc03) + FNMSUB (bb2, cc11, cc01, cc01) + + LDF [BO + 36 * SIZE], a1 + LDF [BO + 35 * SIZE], a2 + LDF [BO + 34 * SIZE], a3 + LDF [BO + 33 * SIZE], a4 + LDF [BO + 32 * SIZE], b1 + + FMUL a1, c09, c09 + + FNMSUB (aa2, cc09, cc07, cc07) + FNMSUB (aa3, cc09, cc05, cc05) + FNMSUB (aa4, cc09, cc03, cc03) + FNMSUB (bb1, cc09, cc01, cc01) + + LDF [BO + 27 * SIZE], a1 + LDF [BO + 26 * SIZE], a2 + LDF [BO + 25 * SIZE], a3 + LDF [BO + 24 * SIZE], a4 + + FMUL a1, c07, c07 + + FNMSUB (aa2, cc07, cc05, cc05) + FNMSUB (aa3, cc07, cc03, cc03) + FNMSUB (aa4, cc07, cc01, cc01) + + LDF [BO + 18 * SIZE], a1 + LDF [BO + 17 * SIZE], a2 + LDF [BO + 16 * SIZE], a3 + + FMUL a1, c05, c05 + + FNMSUB (aa2, cc05, cc03, cc03) + FNMSUB (aa3, cc05, cc01, cc01) + + LDF [BO + 9 * SIZE], a1 + LDF [BO + 8 * SIZE], a2 + + FMUL a1, c03, c03 + + FNMSUB (aa2, cc03, cc01, cc01) + + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 +#endif + +#ifdef LN + add C1, -1 * SIZE, C1 + add C2, -1 * SIZE, C2 + add C3, -1 * SIZE, C3 + add C4, -1 * SIZE, C4 + add C5, -1 * SIZE, C5 + add C6, -1 * SIZE, C6 + add C7, -1 * SIZE, C7 + add C8, -1 * SIZE, C8 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c03, [BO + 1 * SIZE] + STF c05, [BO + 2 * SIZE] + STF c07, [BO + 3 * SIZE] + + STF c09, [BO + 4 * SIZE] + STF c11, [BO + 5 * SIZE] + STF c13, [BO + 6 * SIZE] + STF c15, [BO + 7 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c03, [AO + 1 * SIZE] + STF c05, [AO + 2 * SIZE] + STF c07, [AO + 3 * SIZE] + + STF c09, [AO + 4 * SIZE] + STF c11, [AO + 5 * SIZE] + STF c13, [AO + 6 * SIZE] + STF c15, [AO + 7 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c03, [C2 + 0 * SIZE] + STF c05, [C3 + 0 * SIZE] + STF c07, [C4 + 0 * SIZE] + + STF c09, [C5 + 0 * SIZE] + STF c11, [C6 + 0 * SIZE] + STF c13, [C7 + 0 * SIZE] + STF c15, [C8 + 0 * SIZE] + +#ifdef RT + sll K, BASE_SHIFT + 0, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, BASE_SHIFT + 0, TEMP2 + sll TEMP1, BASE_SHIFT + 3, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + .align 4 + +.LL20: + sra M, 1, I + cmp I, 0 + ble,pn %icc, .LL29 + nop + .align 4 + +.LL12: +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TEMP1 + sll KK, BASE_SHIFT + 3, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 8 * SIZE], a5 + + LDF [BO + 0 * SIZE], b1 + + LDF [BO + 1 * SIZE], b2 + FCLR (cc01) + LDF [BO + 2 * SIZE], b3 + FCLR (cc05) + LDF [BO + 3 * SIZE], b4 + FCLR (cc09) + LDF [BO + 4 * SIZE], b5 + FCLR (cc13) + + LDF [BO + 5 * SIZE], b6 + FCLR (cc02) + LDF [BO + 6 * SIZE], b7 + FCLR (cc06) + LDF [BO + 7 * SIZE], b8 + FCLR (cc10) + LDF [BO + 8 * SIZE], b9 + FCLR (cc14) + + prefetch [C1 + 1 * SIZE], 3 + FCLR (cc03) + prefetch [C2 + 2 * SIZE], 3 + FCLR (cc07) + prefetch [C3 + 1 * SIZE], 3 + FCLR (cc11) + prefetch [C4 + 2 * SIZE], 3 + FCLR (cc15) + + prefetch [C5 + 1 * SIZE], 3 + FCLR (cc04) + prefetch [C6 + 2 * SIZE], 3 + FCLR (cc08) + prefetch [C7 + 1 * SIZE], 3 + FCLR (cc12) + prefetch [C8 + 2 * SIZE], 3 + FCLR (cc16) + +#if defined(LT) || defined(RN) + sra KK, 3, L +#else + sub K, KK, L + sra L, 3, L +#endif + cmp L, 0 + ble,pn %icc, .LL15 + 
nop + .align 4 + +.LL13: + FMADD (aa1, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa1, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 16 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 9 * SIZE], b2 + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + LDF [AO + 2 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 3 * SIZE], a4 + + FMADD (aa1, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + FMADD (aa2, bb6, cc12, cc12) + nop + + FMADD (aa1, bb7, cc13, cc13) + LDF [BO + 12 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa1, bb8, cc15, cc15) + LDF [BO + 14 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO + 15 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 24 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 17 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 18 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 19 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 4 * SIZE], a1 + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 5 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + add L, -1, L + FMADD (aa4, bb6, cc12, cc12) + nop + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 20 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 21 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 22 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + LDF [BO + 23 * SIZE], b8 + + FMADD (aa1, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa1, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 32 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 25 * SIZE], b2 + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 26 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 27 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + LDF [AO + 6 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 7 * SIZE], a4 + + FMADD (aa1, bb6, cc11, cc11) + nop + FMADD (aa2, bb6, cc12, cc12) + nop + + FMADD (aa1, bb7, cc13, cc13) + LDF [BO + 28 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO + 29 * SIZE], b6 + + FMADD (aa1, bb8, cc15, cc15) + LDF [BO + 30 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO + 31 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 40 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 33 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 34 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 35 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 16 * SIZE], a1 /****/ + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 9 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + nop + FMADD (aa4, bb6, cc12, cc12) + nop + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 36 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 37 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 38 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + LDF [BO + 39 * SIZE], b8 + + FMADD (aa5, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa5, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa5, bb3, cc05, cc05) + LDF [BO + 48 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 41 * SIZE], b2 + + FMADD (aa5, bb4, cc07, cc07) + LDF [BO + 42 * 
SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 43 * SIZE], b4 + + FMADD (aa5, bb5, cc09, cc09) + LDF [AO + 10 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 11 * SIZE], a4 + + FMADD (aa5, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY + FMADD (aa2, bb6, cc12, cc12) + nop + + FMADD (aa5, bb7, cc13, cc13) + LDF [BO + 44 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO + 45 * SIZE], b6 + + FMADD (aa5, bb8, cc15, cc15) + LDF [BO + 46 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO + 47 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 56 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 49 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 50 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 51 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 12 * SIZE], a5 + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 13 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + cmp L, 0 + FMADD (aa4, bb6, cc12, cc12) + nop + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 52 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 53 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 54 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + LDF [BO + 55 * SIZE], b8 + + FMADD (aa5, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa5, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa5, bb3, cc05, cc05) + LDF [BO + 64 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 57 * SIZE], b2 + + FMADD (aa5, bb4, cc07, cc07) + LDF [BO + 58 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 59 * SIZE], b4 + + FMADD (aa5, bb5, cc09, cc09) + LDF [AO + 14 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 15 * SIZE], a4 + + FMADD (aa5, bb6, cc11, cc11) + add BO, 64 * SIZE, BO + FMADD (aa2, bb6, cc12, cc12) + add AO, 16 * SIZE, AO + + FMADD (aa5, bb7, cc13, cc13) + LDF [BO - 4 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO - 3 * SIZE], b6 + + FMADD (aa5, bb8, cc15, cc15) + LDF [BO - 2 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO - 1 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 8 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 1 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 2 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 3 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 8 * SIZE], a5 /****/ + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 1 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + FMADD (aa4, bb6, cc12, cc12) + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 4 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 5 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 6 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + ble,pn %icc, .LL15 + LDF [BO + 7 * SIZE], b8 + + FMADD (aa1, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa1, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 16 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 9 * SIZE], b2 + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + LDF [AO + 2 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 3 * SIZE], a4 + + FMADD (aa1, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + 
FMADD (aa2, bb6, cc12, cc12) + nop + + FMADD (aa1, bb7, cc13, cc13) + LDF [BO + 12 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa1, bb8, cc15, cc15) + LDF [BO + 14 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO + 15 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 24 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 17 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 18 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 19 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 4 * SIZE], a1 + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 5 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + add L, -1, L + FMADD (aa4, bb6, cc12, cc12) + nop + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 20 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 21 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 22 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + LDF [BO + 23 * SIZE], b8 + + FMADD (aa1, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa1, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 32 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 25 * SIZE], b2 + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 26 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 27 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + LDF [AO + 6 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 7 * SIZE], a4 + + FMADD (aa1, bb6, cc11, cc11) + nop + FMADD (aa2, bb6, cc12, cc12) + nop + + FMADD (aa1, bb7, cc13, cc13) + LDF [BO + 28 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO + 29 * SIZE], b6 + + FMADD (aa1, bb8, cc15, cc15) + LDF [BO + 30 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO + 31 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 40 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 33 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 34 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 35 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 16 * SIZE], a1 /****/ + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 9 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + nop + FMADD (aa4, bb6, cc12, cc12) + nop + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 36 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 37 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 38 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + LDF [BO + 39 * SIZE], b8 + + FMADD (aa5, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa5, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa5, bb3, cc05, cc05) + LDF [BO + 48 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 41 * SIZE], b2 + + FMADD (aa5, bb4, cc07, cc07) + LDF [BO + 42 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 43 * SIZE], b4 + + FMADD (aa5, bb5, cc09, cc09) + LDF [AO + 10 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 11 * SIZE], a4 + + FMADD (aa5, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY + FMADD (aa2, bb6, cc12, cc12) + nop + + FMADD (aa5, bb7, cc13, cc13) + LDF [BO + 44 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO + 45 * SIZE], b6 + + FMADD (aa5, bb8, cc15, cc15) + LDF [BO + 46 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO + 47 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD 
(aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 56 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 49 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 50 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 51 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 12 * SIZE], a5 + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 13 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + cmp L, 0 + FMADD (aa4, bb6, cc12, cc12) + nop + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 52 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 53 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 54 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + LDF [BO + 55 * SIZE], b8 + + FMADD (aa5, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa5, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa5, bb3, cc05, cc05) + LDF [BO + 64 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 57 * SIZE], b2 + + FMADD (aa5, bb4, cc07, cc07) + LDF [BO + 58 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 59 * SIZE], b4 + + FMADD (aa5, bb5, cc09, cc09) + LDF [AO + 14 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 15 * SIZE], a4 + + FMADD (aa5, bb6, cc11, cc11) + add BO, 64 * SIZE, BO + FMADD (aa2, bb6, cc12, cc12) + add AO, 16 * SIZE, AO + + FMADD (aa5, bb7, cc13, cc13) + LDF [BO - 4 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO - 3 * SIZE], b6 + + FMADD (aa5, bb8, cc15, cc15) + LDF [BO - 2 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO - 1 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 8 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 1 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 2 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 3 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 8 * SIZE], a5 /****/ + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 1 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + FMADD (aa4, bb6, cc12, cc12) + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 4 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 5 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 6 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + bg,pt %icc, .LL13 + LDF [BO + 7 * SIZE], b8 + .align 4 + +.LL15: +#if defined(LT) || defined(RN) + and KK, 7, L +#else + sub K, KK, L + and L, 7, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL18 + nop + .align 4 + +.LL17: + FMADD (aa1, bb1, cc01, cc01) + add L, -1, L + FMADD (aa2, bb1, cc02, cc02) + nop + + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 8 * SIZE], b1 + FMADD (aa2, bb2, cc04, cc04) + LDF [BO + 9 * SIZE], b2 + + FMADD (aa1, bb3, cc05, cc05) + cmp L, 0 + FMADD (aa2, bb3, cc06, cc06) + nop + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + nop + FMADD (aa2, bb5, cc10, cc10) + nop + + FMADD (aa1, bb6, cc11, cc11) + LDF [BO + 12 * SIZE], b5 + FMADD (aa2, bb6, cc12, cc12) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa1, bb7, cc13, cc13) + add AO, 2 * SIZE, AO + FMADD (aa2, bb7, cc14, cc14) + add BO, 8 * SIZE, BO + + FMADD (aa1, bb8, cc15, cc15) + LDF [AO + 0 * SIZE], a1 + FMADD (aa2, bb8, cc16, cc16) + LDF [AO + 1 * SIZE], a2 + + LDF [BO + 6 * SIZE], b7 + bg,pt %icc, .LL17 + LDF [BO + 7 * SIZE], b8 + nop + .align 4 + +.LL18: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 2, TEMP1 +#else + sub KK, 8, 
TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 3, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 + FSUB a3, c05, c05 + FSUB a4, c07, c07 + + FSUB b1, c09, c09 + FSUB b2, c11, c11 + FSUB b3, c13, c13 + FSUB b4, c15, c15 + + LDF [BO + 8 * SIZE], a1 + LDF [BO + 9 * SIZE], a2 + LDF [BO + 10 * SIZE], a3 + LDF [BO + 11 * SIZE], a4 + + LDF [BO + 12 * SIZE], b1 + LDF [BO + 13 * SIZE], b2 + LDF [BO + 14 * SIZE], b3 + LDF [BO + 15 * SIZE], b4 + + FSUB a1, c02, c02 + FSUB a2, c04, c04 + FSUB a3, c06, c06 + FSUB a4, c08, c08 + + FSUB b1, c10, c10 + FSUB b2, c12, c12 + FSUB b3, c14, c14 + FSUB b4, c16, c16 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [AO + 4 * SIZE], b1 + LDF [AO + 5 * SIZE], b2 + LDF [AO + 6 * SIZE], b3 + LDF [AO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 + + FSUB b1, c05, c05 + FSUB b2, c06, c06 + FSUB b3, c07, c07 + FSUB b4, c08, c08 + + LDF [AO + 8 * SIZE], a1 + LDF [AO + 9 * SIZE], a2 + LDF [AO + 10 * SIZE], a3 + LDF [AO + 11 * SIZE], a4 + + LDF [AO + 12 * SIZE], b1 + LDF [AO + 13 * SIZE], b2 + LDF [AO + 14 * SIZE], b3 + LDF [AO + 15 * SIZE], b4 + + FSUB a1, c09, c09 + FSUB a2, c10, c10 + FSUB a3, c11, c11 + FSUB a4, c12, c12 + + FSUB b1, c13, c13 + FSUB b2, c14, c14 + FSUB b3, c15, c15 + FSUB b4, c16, c16 +#endif + +#ifdef LN + LDF [AO + 3 * SIZE], a1 + LDF [AO + 2 * SIZE], a2 + LDF [AO + 0 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a1, c04, c04 + FMUL a1, c06, c06 + FMUL a1, c08, c08 + FMUL a1, c10, c10 + FMUL a1, c12, c12 + FMUL a1, c14, c14 + FMUL a1, c16, c16 + + FNMSUB (aa2, cc02, cc01, cc01) + FNMSUB (aa2, cc04, cc03, cc03) + FNMSUB (aa2, cc06, cc05, cc05) + FNMSUB (aa2, cc08, cc07, cc07) + FNMSUB (aa2, cc10, cc09, cc09) + FNMSUB (aa2, cc12, cc11, cc11) + FNMSUB (aa2, cc14, cc13, cc13) + FNMSUB (aa2, cc16, cc15, cc15) + + FMUL a3, c01, c01 + FMUL a3, c03, c03 + FMUL a3, c05, c05 + FMUL a3, c07, c07 + FMUL a3, c09, c09 + FMUL a3, c11, c11 + FMUL a3, c13, c13 + FMUL a3, c15, c15 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + FMUL a1, c03, c03 + FMUL a1, c05, c05 + FMUL a1, c07, c07 + FMUL a1, c09, c09 + FMUL a1, c11, c11 + FMUL a1, c13, c13 + FMUL a1, c15, c15 + + FNMSUB (aa2, cc01, cc02, cc02) + FNMSUB (aa2, cc03, cc04, cc04) + FNMSUB (aa2, cc05, cc06, cc06) + FNMSUB (aa2, cc07, cc08, cc08) + FNMSUB (aa2, cc09, cc10, cc10) + FNMSUB (aa2, cc11, cc12, cc12) + FNMSUB (aa2, cc13, cc14, cc14) + FNMSUB (aa2, cc15, cc16, cc16) + + FMUL a3, c02, c02 + FMUL a3, c04, c04 + FMUL a3, c06, c06 + FMUL a3, c08, c08 + FMUL a3, c10, c10 + FMUL a3, c12, c12 + FMUL a3, c14, c14 + FMUL a3, c16, c16 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + + FNMSUB (aa2, cc01, cc03, cc03) + FNMSUB (aa2, cc02, cc04, cc04) + FNMSUB (aa3, cc01, cc05, cc05) + FNMSUB (aa3, cc02, cc06, cc06) + FNMSUB (aa4, cc01, cc07, cc07) + FNMSUB (aa4, cc02, cc08, cc08) + FNMSUB (bb1, 
cc01, cc09, cc09) + FNMSUB (bb1, cc02, cc10, cc10) + FNMSUB (bb2, cc01, cc11, cc11) + FNMSUB (bb2, cc02, cc12, cc12) + FNMSUB (bb3, cc01, cc13, cc13) + FNMSUB (bb3, cc02, cc14, cc14) + FNMSUB (bb4, cc01, cc15, cc15) + FNMSUB (bb4, cc02, cc16, cc16) + + LDF [BO + 9 * SIZE], a1 + LDF [BO + 10 * SIZE], a2 + LDF [BO + 11 * SIZE], a3 + LDF [BO + 12 * SIZE], a4 + LDF [BO + 13 * SIZE], b1 + LDF [BO + 14 * SIZE], b2 + LDF [BO + 15 * SIZE], b3 + + FMUL a1, c03, c03 + FMUL a1, c04, c04 + + FNMSUB (aa2, cc03, cc05, cc05) + FNMSUB (aa2, cc04, cc06, cc06) + FNMSUB (aa3, cc03, cc07, cc07) + FNMSUB (aa3, cc04, cc08, cc08) + FNMSUB (aa4, cc03, cc09, cc09) + FNMSUB (aa4, cc04, cc10, cc10) + FNMSUB (bb1, cc03, cc11, cc11) + FNMSUB (bb1, cc04, cc12, cc12) + FNMSUB (bb2, cc03, cc13, cc13) + FNMSUB (bb2, cc04, cc14, cc14) + FNMSUB (bb3, cc03, cc15, cc15) + FNMSUB (bb3, cc04, cc16, cc16) + + LDF [BO + 18 * SIZE], a1 + LDF [BO + 19 * SIZE], a2 + LDF [BO + 20 * SIZE], a3 + LDF [BO + 21 * SIZE], a4 + LDF [BO + 22 * SIZE], b1 + LDF [BO + 23 * SIZE], b2 + + FMUL a1, c05, c05 + FMUL a1, c06, c06 + + FNMSUB (aa2, cc05, cc07, cc07) + FNMSUB (aa2, cc06, cc08, cc08) + FNMSUB (aa3, cc05, cc09, cc09) + FNMSUB (aa3, cc06, cc10, cc10) + FNMSUB (aa4, cc05, cc11, cc11) + FNMSUB (aa4, cc06, cc12, cc12) + FNMSUB (bb1, cc05, cc13, cc13) + FNMSUB (bb1, cc06, cc14, cc14) + FNMSUB (bb2, cc05, cc15, cc15) + FNMSUB (bb2, cc06, cc16, cc16) + + LDF [BO + 27 * SIZE], a1 + LDF [BO + 28 * SIZE], a2 + LDF [BO + 29 * SIZE], a3 + LDF [BO + 30 * SIZE], a4 + LDF [BO + 31 * SIZE], b1 + + FMUL a1, c07, c07 + FMUL a1, c08, c08 + + FNMSUB (aa2, cc07, cc09, cc09) + FNMSUB (aa2, cc08, cc10, cc10) + FNMSUB (aa3, cc07, cc11, cc11) + FNMSUB (aa3, cc08, cc12, cc12) + FNMSUB (aa4, cc07, cc13, cc13) + FNMSUB (aa4, cc08, cc14, cc14) + FNMSUB (bb1, cc07, cc15, cc15) + FNMSUB (bb1, cc08, cc16, cc16) + + LDF [BO + 36 * SIZE], a1 + LDF [BO + 37 * SIZE], a2 + LDF [BO + 38 * SIZE], a3 + LDF [BO + 39 * SIZE], a4 + + FMUL a1, c09, c09 + FMUL a1, c10, c10 + + FNMSUB (aa2, cc09, cc11, cc11) + FNMSUB (aa2, cc10, cc12, cc12) + FNMSUB (aa3, cc09, cc13, cc13) + FNMSUB (aa3, cc10, cc14, cc14) + FNMSUB (aa4, cc09, cc15, cc15) + FNMSUB (aa4, cc10, cc16, cc16) + + LDF [BO + 45 * SIZE], a1 + LDF [BO + 46 * SIZE], a2 + LDF [BO + 47 * SIZE], a3 + + FMUL a1, c11, c11 + FMUL a1, c12, c12 + + FNMSUB (aa2, cc11, cc13, cc13) + FNMSUB (aa2, cc12, cc14, cc14) + FNMSUB (aa3, cc11, cc15, cc15) + FNMSUB (aa3, cc12, cc16, cc16) + + LDF [BO + 54 * SIZE], a1 + LDF [BO + 55 * SIZE], a2 + + FMUL a1, c13, c13 + FMUL a1, c14, c14 + + FNMSUB (aa2, cc13, cc15, cc15) + FNMSUB (aa2, cc14, cc16, cc16) + + LDF [BO + 63 * SIZE], a1 + + FMUL a1, c15, c15 + FMUL a1, c16, c16 +#endif + +#ifdef RT + LDF [BO + 63 * SIZE], a1 + LDF [BO + 62 * SIZE], a2 + LDF [BO + 61 * SIZE], a3 + LDF [BO + 60 * SIZE], a4 + LDF [BO + 59 * SIZE], b1 + LDF [BO + 58 * SIZE], b2 + LDF [BO + 57 * SIZE], b3 + LDF [BO + 56 * SIZE], b4 + + FMUL a1, c16, c16 + FMUL a1, c15, c15 + + FNMSUB (aa2, cc16, cc14, cc14) + FNMSUB (aa2, cc15, cc13, cc13) + FNMSUB (aa3, cc16, cc12, cc12) + FNMSUB (aa3, cc15, cc11, cc11) + FNMSUB (aa4, cc16, cc10, cc10) + FNMSUB (aa4, cc15, cc09, cc09) + FNMSUB (bb1, cc16, cc08, cc08) + FNMSUB (bb1, cc15, cc07, cc07) + FNMSUB (bb2, cc16, cc06, cc06) + FNMSUB (bb2, cc15, cc05, cc05) + FNMSUB (bb3, cc16, cc04, cc04) + FNMSUB (bb3, cc15, cc03, cc03) + FNMSUB (bb4, cc16, cc02, cc02) + FNMSUB (bb4, cc15, cc01, cc01) + + LDF [BO + 54 * SIZE], a1 + LDF [BO + 53 * SIZE], a2 + LDF [BO + 52 * SIZE], a3 + LDF [BO + 51 * 
SIZE], a4 + LDF [BO + 50 * SIZE], b1 + LDF [BO + 49 * SIZE], b2 + LDF [BO + 48 * SIZE], b3 + + FMUL a1, c14, c14 + FMUL a1, c13, c13 + + FNMSUB (aa2, cc14, cc12, cc12) + FNMSUB (aa2, cc13, cc11, cc11) + FNMSUB (aa3, cc14, cc10, cc10) + FNMSUB (aa3, cc13, cc09, cc09) + FNMSUB (aa4, cc14, cc08, cc08) + FNMSUB (aa4, cc13, cc07, cc07) + FNMSUB (bb1, cc14, cc06, cc06) + FNMSUB (bb1, cc13, cc05, cc05) + FNMSUB (bb2, cc14, cc04, cc04) + FNMSUB (bb2, cc13, cc03, cc03) + FNMSUB (bb3, cc14, cc02, cc02) + FNMSUB (bb3, cc13, cc01, cc01) + + LDF [BO + 45 * SIZE], a1 + LDF [BO + 44 * SIZE], a2 + LDF [BO + 43 * SIZE], a3 + LDF [BO + 42 * SIZE], a4 + LDF [BO + 41 * SIZE], b1 + LDF [BO + 40 * SIZE], b2 + + FMUL a1, c12, c12 + FMUL a1, c11, c11 + + FNMSUB (aa2, cc12, cc10, cc10) + FNMSUB (aa2, cc11, cc09, cc09) + FNMSUB (aa3, cc12, cc08, cc08) + FNMSUB (aa3, cc11, cc07, cc07) + FNMSUB (aa4, cc12, cc06, cc06) + FNMSUB (aa4, cc11, cc05, cc05) + FNMSUB (bb1, cc12, cc04, cc04) + FNMSUB (bb1, cc11, cc03, cc03) + FNMSUB (bb2, cc12, cc02, cc02) + FNMSUB (bb2, cc11, cc01, cc01) + + LDF [BO + 36 * SIZE], a1 + LDF [BO + 35 * SIZE], a2 + LDF [BO + 34 * SIZE], a3 + LDF [BO + 33 * SIZE], a4 + LDF [BO + 32 * SIZE], b1 + + FMUL a1, c10, c10 + FMUL a1, c09, c09 + + FNMSUB (aa2, cc10, cc08, cc08) + FNMSUB (aa2, cc09, cc07, cc07) + FNMSUB (aa3, cc10, cc06, cc06) + FNMSUB (aa3, cc09, cc05, cc05) + FNMSUB (aa4, cc10, cc04, cc04) + FNMSUB (aa4, cc09, cc03, cc03) + FNMSUB (bb1, cc10, cc02, cc02) + FNMSUB (bb1, cc09, cc01, cc01) + + LDF [BO + 27 * SIZE], a1 + LDF [BO + 26 * SIZE], a2 + LDF [BO + 25 * SIZE], a3 + LDF [BO + 24 * SIZE], a4 + + FMUL a1, c08, c08 + FMUL a1, c07, c07 + + FNMSUB (aa2, cc08, cc06, cc06) + FNMSUB (aa2, cc07, cc05, cc05) + FNMSUB (aa3, cc08, cc04, cc04) + FNMSUB (aa3, cc07, cc03, cc03) + FNMSUB (aa4, cc08, cc02, cc02) + FNMSUB (aa4, cc07, cc01, cc01) + + LDF [BO + 18 * SIZE], a1 + LDF [BO + 17 * SIZE], a2 + LDF [BO + 16 * SIZE], a3 + + FMUL a1, c06, c06 + FMUL a1, c05, c05 + + FNMSUB (aa2, cc06, cc04, cc04) + FNMSUB (aa2, cc05, cc03, cc03) + FNMSUB (aa3, cc06, cc02, cc02) + FNMSUB (aa3, cc05, cc01, cc01) + + LDF [BO + 9 * SIZE], a1 + LDF [BO + 8 * SIZE], a2 + + FMUL a1, c04, c04 + FMUL a1, c03, c03 + + FNMSUB (aa2, cc04, cc02, cc02) + FNMSUB (aa2, cc03, cc01, cc01) + + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c02, c02 + FMUL a1, c01, c01 +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 + add C2, -2 * SIZE, C2 + add C3, -2 * SIZE, C3 + add C4, -2 * SIZE, C4 + add C5, -2 * SIZE, C5 + add C6, -2 * SIZE, C6 + add C7, -2 * SIZE, C7 + add C8, -2 * SIZE, C8 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c03, [BO + 1 * SIZE] + STF c05, [BO + 2 * SIZE] + STF c07, [BO + 3 * SIZE] + + STF c09, [BO + 4 * SIZE] + STF c11, [BO + 5 * SIZE] + STF c13, [BO + 6 * SIZE] + STF c15, [BO + 7 * SIZE] + + STF c02, [BO + 8 * SIZE] + STF c04, [BO + 9 * SIZE] + STF c06, [BO + 10 * SIZE] + STF c08, [BO + 11 * SIZE] + + STF c10, [BO + 12 * SIZE] + STF c12, [BO + 13 * SIZE] + STF c14, [BO + 14 * SIZE] + STF c16, [BO + 15 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] + + STF c05, [AO + 4 * SIZE] + STF c06, [AO + 5 * SIZE] + STF c07, [AO + 6 * SIZE] + STF c08, [AO + 7 * SIZE] + + STF c09, [AO + 8 * SIZE] + STF c10, [AO + 9 * SIZE] + STF c11, [AO + 10 * SIZE] + STF c12, [AO + 11 * SIZE] + + STF c13, [AO + 12 * SIZE] + STF c14, [AO + 13 * SIZE] + STF c15, [AO + 14 * SIZE] + STF c16, [AO + 15 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF 
c02, [C1 + 1 * SIZE] + STF c03, [C2 + 0 * SIZE] + STF c04, [C2 + 1 * SIZE] + + STF c05, [C3 + 0 * SIZE] + STF c06, [C3 + 1 * SIZE] + STF c07, [C4 + 0 * SIZE] + STF c08, [C4 + 1 * SIZE] + + STF c09, [C5 + 0 * SIZE] + STF c10, [C5 + 1 * SIZE] + STF c11, [C6 + 0 * SIZE] + STF c12, [C6 + 1 * SIZE] + + STF c13, [C7 + 0 * SIZE] + STF c14, [C7 + 1 * SIZE] + STF c15, [C8 + 0 * SIZE] + STF c16, [C8 + 1 * SIZE] + +#ifndef LN + add C1, 2 * SIZE, C1 + add C2, 2 * SIZE, C2 + add C3, 2 * SIZE, C3 + add C4, 2 * SIZE, C4 + add C5, 2 * SIZE, C5 + add C6, 2 * SIZE, C6 + add C7, 2 * SIZE, C7 + add C8, 2 * SIZE, C8 +#endif + +#ifdef RT + sll K, BASE_SHIFT + 1, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 3, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 2, KK +#endif + +#ifdef LN + sub KK, 2, KK +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL12 + nop + .align 4 + +.LL29: +#ifdef LN + sll K, BASE_SHIFT + 3, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 8, KK +#endif + +#ifdef RT + sub KK, 8, KK +#endif + + add J, -1, J + cmp J, 0 + bg,pt %icc, .LL11 + nop + .align 4 + +.LL30: + and N, 4, J + cmp J, 0 + ble,pn %icc, .LL50 + nop + +#ifdef RT + sll K, BASE_SHIFT + 2, TEMP1 + sub B, TEMP1, B +#endif + +#ifndef RT + mov C, C1 + add C, LDC, C2 + add C2, LDC, C3 + add C3, LDC, C4 + add C4, LDC, C +#else + sub C, LDC, C4 + sub C4, LDC, C3 + sub C3, LDC, C2 + sub C2, LDC, C1 + sub C2, LDC, C +#endif + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL40 + nop + +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TEMP1 + sll KK, BASE_SHIFT + 2, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + LDF [BO + 3 * SIZE], b4 + LDF [BO + 4 * SIZE], b5 + LDF [BO + 5 * SIZE], b6 + FCLR (cc01) + LDF [BO + 6 * SIZE], b7 + FCLR (cc03) + LDF [BO + 7 * SIZE], b8 + FCLR (cc05) + LDF [BO + 8 * SIZE], b9 + FCLR (cc07) + +#if defined(LT) || defined(RN) + sra KK, 2, L +#else + sub K, KK, L + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL45 + nop + +.LL43: + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + add L, -1, L + + FMADD (aa1, bb1, cc01, cc01) + LDF [BO + 16 * SIZE], b1 + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 9 * SIZE], b2 + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 10 * SIZE], b3 + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 11 * SIZE], b4 + + LDF [AO + 4 * SIZE], a1 + cmp L, 0 + + FMADD (aa2, bb5, cc01, cc01) + LDF [BO + 12 * SIZE], b5 + FMADD (aa2, bb6, cc03, cc03) + LDF [BO + 13 * SIZE], b6 + FMADD (aa2, bb7, cc05, cc05) + LDF [BO + 14 * SIZE], b7 + FMADD (aa2, bb8, cc07, cc07) + LDF [BO + 15 * SIZE], b8 + + LDF [AO + 5 * SIZE], a2 + add AO, 4 * SIZE, AO + + FMADD (aa3, bb9, cc01, cc01) + LDF [BO + 24 * SIZE], b9 + FMADD (aa3, bb2, cc03, cc03) + LDF [BO + 17 * SIZE], b2 + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 18 * SIZE], b3 + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 19 * SIZE], b4 + + LDF [AO + 2 * SIZE], a3 + add BO, 16 * SIZE, BO + + FMADD (aa4, bb5, cc01, cc01) + 
LDF [BO + 4 * SIZE], b5 + FMADD (aa4, bb6, cc03, cc03) + LDF [BO + 5 * SIZE], b6 + FMADD (aa4, bb7, cc05, cc05) + LDF [BO + 6 * SIZE], b7 + FMADD (aa4, bb8, cc07, cc07) + LDF [BO + 7 * SIZE], b8 + + bg,pt %icc, .LL43 + LDF [AO + 3 * SIZE], a4 + .align 4 + +.LL45: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + sub K, KK, L + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL48 + nop + .align 4 + +.LL47: + FMADD (aa1, bb1, cc01, cc01) + LDF [BO + 4 * SIZE], b1 + add L, -1, L + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 5 * SIZE], b2 + add AO, 1 * SIZE, AO + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 6 * SIZE], b3 + cmp L, 0 + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 7 * SIZE], b4 + add BO, 4 * SIZE, BO + + bg,pt %icc, .LL47 + LDF [AO + 0 * SIZE], a1 + .align 4 + +.LL48: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 1, TEMP1 +#else + sub KK, 4, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 0, TEMP2 + sll TEMP1, BASE_SHIFT + 2, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 + FSUB a3, c05, c05 + FSUB a4, c07, c07 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 + FSUB a3, c05, c05 + FSUB a4, c07, c07 +#endif + +#if defined(LN) || defined(LT) + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c03, c03 + FMUL a1, c05, c05 + FMUL a1, c07, c07 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FMUL a1, c01, c01 + + FNMSUB (aa2, cc01, cc03, cc03) + FNMSUB (aa3, cc01, cc05, cc05) + FNMSUB (aa4, cc01, cc07, cc07) + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 6 * SIZE], a2 + LDF [BO + 7 * SIZE], a3 + + FMUL a1, c03, c03 + + FNMSUB (aa2, cc03, cc05, cc05) + FNMSUB (aa3, cc03, cc07, cc07) + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 11 * SIZE], a2 + + FMUL a1, c05, c05 + + FNMSUB (aa2, cc05, cc07, cc07) + + LDF [BO + 15 * SIZE], a1 + + FMUL a1, c07, c07 +#endif + +#ifdef RT + LDF [BO + 15 * SIZE], a1 + LDF [BO + 14 * SIZE], a2 + LDF [BO + 13 * SIZE], a3 + LDF [BO + 12 * SIZE], a4 + + FMUL a1, c07, c07 + + FNMSUB (aa2, cc07, cc05, cc05) + FNMSUB (aa3, cc07, cc03, cc03) + FNMSUB (aa4, cc07, cc01, cc01) + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 9 * SIZE], a2 + LDF [BO + 8 * SIZE], a3 + + FMUL a1, c05, c05 + + FNMSUB (aa2, cc05, cc03, cc03) + FNMSUB (aa3, cc05, cc01, cc01) + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 4 * SIZE], a2 + + FMUL a1, c03, c03 + + FNMSUB (aa2, cc03, cc01, cc01) + + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 +#endif + +#ifdef LN + add C1, -1 * SIZE, C1 + add C2, -1 * SIZE, C2 + add C3, -1 * SIZE, C3 + add C4, -1 * SIZE, C4 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c03, [BO + 1 * SIZE] + STF c05, [BO + 2 * SIZE] + STF c07, [BO + 3 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c03, [AO + 1 * SIZE] + STF c05, [AO + 2 * SIZE] + STF c07, [AO + 3 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c03, [C2 + 0 * SIZE] + STF c05, [C3 + 0 * SIZE] + STF c07, [C4 + 0 * SIZE] + +#ifdef RT + sll K, BASE_SHIFT + 0, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, BASE_SHIFT + 0, TEMP2 + sll TEMP1, BASE_SHIFT + 2, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK 
+#endif + .align 4 + +.LL40: + sra M, 1, I + cmp I, 0 + ble,pn %icc, .LL49 + nop + .align 4 + +.LL32: +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TEMP1 + sll KK, BASE_SHIFT + 2, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + LDF [BO + 3 * SIZE], b4 + LDF [BO + 4 * SIZE], b5 + + LDF [BO + 5 * SIZE], b6 + FCLR (cc01) + LDF [BO + 6 * SIZE], b7 + FCLR (cc02) + LDF [BO + 7 * SIZE], b8 + FCLR (cc03) + LDF [BO + 8 * SIZE], b9 + FCLR (cc04) + + prefetch [C1 + 2 * SIZE], 3 + FCLR (cc05) + prefetch [C2 + 2 * SIZE], 3 + FCLR (cc06) + prefetch [C3 + 2 * SIZE], 3 + FCLR (cc07) + prefetch [C4 + 2 * SIZE], 3 + FCLR (cc08) + +#if defined(LT) || defined(RN) + sra KK, 2, L +#else + sub K, KK, L + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL35 + nop + .align 4 + +.LL33: + FMADD (aa1, bb1, cc01, cc01) + LDF [AO + 2 * SIZE], a3 + FMADD (aa2, bb1, cc02, cc02) + LDF [AO + 3 * SIZE], a4 + + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 16 * SIZE], b1 + FMADD (aa2, bb2, cc04, cc04) + LDF [BO + 9 * SIZE], b2 + + FMADD (aa1, bb3, cc05, cc05) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + FMADD (aa2, bb3, cc06, cc06) + add L, -1, L + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD (aa3, bb5, cc01, cc01) + LDF [AO + 4 * SIZE], a1 + FMADD (aa4, bb5, cc02, cc02) + LDF [AO + 5 * SIZE], a2 + + FMADD (aa3, bb6, cc03, cc03) + LDF [BO + 12 * SIZE], b5 + FMADD (aa4, bb6, cc04, cc04) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa3, bb7, cc05, cc05) + cmp L, 0 + FMADD (aa4, bb7, cc06, cc06) + add AO, 8 * SIZE, AO + + FMADD (aa3, bb8, cc07, cc07) + LDF [BO + 14 * SIZE], b7 + FMADD (aa4, bb8, cc08, cc08) + LDF [BO + 15 * SIZE], b8 + + FMADD (aa1, bb9, cc01, cc01) + LDF [AO - 2 * SIZE], a3 + FMADD (aa2, bb9, cc02, cc02) + LDF [AO - 1 * SIZE], a4 + + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 24 * SIZE], b9 + FMADD (aa2, bb2, cc04, cc04) + LDF [BO + 17 * SIZE], b2 + + FMADD (aa1, bb3, cc05, cc05) + add BO, 16 * SIZE, BO + FMADD (aa2, bb3, cc06, cc06) + nop + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 2 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 3 * SIZE], b4 + + FMADD (aa3, bb5, cc01, cc01) + LDF [AO + 0 * SIZE], a1 + FMADD (aa4, bb5, cc02, cc02) + LDF [AO + 1 * SIZE], a2 + FMADD (aa3, bb6, cc03, cc03) + LDF [BO + 4 * SIZE], b5 + FMADD (aa4, bb6, cc04, cc04) + LDF [BO + 5 * SIZE], b6 + + FMADD (aa3, bb7, cc05, cc05) + nop + FMADD (aa4, bb7, cc06, cc06) + LDF [BO + 6 * SIZE], b7 + + FMADD (aa3, bb8, cc07, cc07) + FMADD (aa4, bb8, cc08, cc08) + bg,pt %icc, .LL33 + LDF [BO + 7 * SIZE], b8 + .align 4 + +.LL35: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + sub K, KK, L + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL38 + nop + .align 4 + +.LL37: + FMADD (aa1, bb1, cc01, cc01) + add L, -1, L + FMADD (aa2, bb1, cc02, cc02) + LDF [BO + 4 * SIZE], b1 + + FMADD (aa1, bb2, cc03, cc03) + add AO, 2 * SIZE, AO + FMADD (aa2, bb2, cc04, cc04) + LDF [BO + 5 * SIZE], b2 + + FMADD (aa1, bb3, cc05, cc05) + cmp L, 0 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 6 * SIZE], b3 + + FMADD (aa1, bb4, cc07, cc07) + LDF [AO + 0 * SIZE], a1 + FMADD (aa2, bb4, cc08, cc08) + LDF [AO + 1 * SIZE], a2 + + LDF [BO + 7 * SIZE], b4 + bg,pt %icc, .LL37 + add BO, 4 * SIZE, BO + .align 4 + +.LL38: +#if 
defined(LN) || defined(RT) +#ifdef LN + sub KK, 2, TEMP1 +#else + sub KK, 4, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 2, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 + FSUB a3, c05, c05 + FSUB a4, c07, c07 + + FSUB b1, c02, c02 + FSUB b2, c04, c04 + FSUB b3, c06, c06 + FSUB b4, c08, c08 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [AO + 4 * SIZE], b1 + LDF [AO + 5 * SIZE], b2 + LDF [AO + 6 * SIZE], b3 + LDF [AO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 + + FSUB b1, c05, c05 + FSUB b2, c06, c06 + FSUB b3, c07, c07 + FSUB b4, c08, c08 + +#endif + +#ifdef LN + LDF [AO + 3 * SIZE], a1 + LDF [AO + 2 * SIZE], a2 + LDF [AO + 0 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a1, c04, c04 + FMUL a1, c06, c06 + FMUL a1, c08, c08 + + FNMSUB (aa2, cc02, cc01, cc01) + FNMSUB (aa2, cc04, cc03, cc03) + FNMSUB (aa2, cc06, cc05, cc05) + FNMSUB (aa2, cc08, cc07, cc07) + + FMUL a3, c01, c01 + FMUL a3, c03, c03 + FMUL a3, c05, c05 + FMUL a3, c07, c07 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + FMUL a1, c03, c03 + FMUL a1, c05, c05 + FMUL a1, c07, c07 + + FNMSUB (aa2, cc01, cc02, cc02) + FNMSUB (aa2, cc03, cc04, cc04) + FNMSUB (aa2, cc05, cc06, cc06) + FNMSUB (aa2, cc07, cc08, cc08) + + FMUL a3, c02, c02 + FMUL a3, c04, c04 + FMUL a3, c06, c06 + FMUL a3, c08, c08 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + + FNMSUB (aa2, cc01, cc03, cc03) + FNMSUB (aa2, cc02, cc04, cc04) + FNMSUB (aa3, cc01, cc05, cc05) + FNMSUB (aa3, cc02, cc06, cc06) + FNMSUB (aa4, cc01, cc07, cc07) + FNMSUB (aa4, cc02, cc08, cc08) + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 6 * SIZE], a2 + LDF [BO + 7 * SIZE], a3 + + FMUL a1, c03, c03 + FMUL a1, c04, c04 + + FNMSUB (aa2, cc03, cc05, cc05) + FNMSUB (aa2, cc04, cc06, cc06) + FNMSUB (aa3, cc03, cc07, cc07) + FNMSUB (aa3, cc04, cc08, cc08) + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 11 * SIZE], a2 + + FMUL a1, c05, c05 + FMUL a1, c06, c06 + + FNMSUB (aa2, cc05, cc07, cc07) + FNMSUB (aa2, cc06, cc08, cc08) + + LDF [BO + 15 * SIZE], a1 + + FMUL a1, c07, c07 + FMUL a1, c08, c08 +#endif + +#ifdef RT + LDF [BO + 15 * SIZE], a1 + LDF [BO + 14 * SIZE], a2 + LDF [BO + 13 * SIZE], a3 + LDF [BO + 12 * SIZE], a4 + + FMUL a1, c08, c08 + FMUL a1, c07, c07 + + FNMSUB (aa2, cc08, cc06, cc06) + FNMSUB (aa2, cc07, cc05, cc05) + FNMSUB (aa3, cc08, cc04, cc04) + FNMSUB (aa3, cc07, cc03, cc03) + FNMSUB (aa4, cc08, cc02, cc02) + FNMSUB (aa4, cc07, cc01, cc01) + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 9 * SIZE], a2 + LDF [BO + 8 * SIZE], a3 + + FMUL a1, c06, c06 + FMUL a1, c05, c05 + + FNMSUB (aa2, cc06, cc04, cc04) + FNMSUB (aa2, cc05, cc03, cc03) + FNMSUB (aa3, cc06, cc02, cc02) + FNMSUB (aa3, cc05, cc01, cc01) + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 4 * SIZE], a2 + + FMUL a1, c04, c04 + FMUL a1, c03, c03 + + FNMSUB (aa2, cc04, cc02, cc02) + FNMSUB (aa2, cc03, cc01, cc01) + + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c02, c02 + FMUL a1, c01, c01 +#endif + +#ifdef LN + add C1, -2 
* SIZE, C1 + add C2, -2 * SIZE, C2 + add C3, -2 * SIZE, C3 + add C4, -2 * SIZE, C4 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c03, [BO + 1 * SIZE] + STF c05, [BO + 2 * SIZE] + STF c07, [BO + 3 * SIZE] + + STF c02, [BO + 4 * SIZE] + STF c04, [BO + 5 * SIZE] + STF c06, [BO + 6 * SIZE] + STF c08, [BO + 7 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] + + STF c05, [AO + 4 * SIZE] + STF c06, [AO + 5 * SIZE] + STF c07, [AO + 6 * SIZE] + STF c08, [AO + 7 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C2 + 0 * SIZE] + STF c04, [C2 + 1 * SIZE] + + STF c05, [C3 + 0 * SIZE] + STF c06, [C3 + 1 * SIZE] + STF c07, [C4 + 0 * SIZE] + STF c08, [C4 + 1 * SIZE] + +#ifndef LN + add C1, 2 * SIZE, C1 + add C2, 2 * SIZE, C2 + add C3, 2 * SIZE, C3 + add C4, 2 * SIZE, C4 +#endif + +#ifdef RT + sll K, BASE_SHIFT + 1, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 2, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 2, KK +#endif + +#ifdef LN + sub KK, 2, KK +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL32 + nop + +.LL49: +#ifdef LN + sll K, BASE_SHIFT + 2, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 4, KK +#endif + +#ifdef RT + sub KK, 4, KK +#endif + .align 4 + +.LL50: + and N, 2, J + cmp J, 0 + ble,pn %icc, .LL70 + nop + +#ifdef RT + sll K, BASE_SHIFT + 1, TEMP1 + sub B, TEMP1, B +#endif + +#ifndef RT + mov C, C1 + add C, LDC, C2 + add C2, LDC, C +#else + sub C, LDC, C2 + sub C2, LDC, C1 + sub C2, LDC, C +#endif + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL60 + nop + +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TEMP1 + sll KK, BASE_SHIFT + 1, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + LDF [BO + 3 * SIZE], b4 + LDF [BO + 4 * SIZE], b5 + LDF [BO + 5 * SIZE], b6 + LDF [BO + 6 * SIZE], b7 + FCLR (cc01) + LDF [BO + 7 * SIZE], b8 + FCLR (cc03) + +#if defined(LT) || defined(RN) + sra KK, 2, L +#else + sub K, KK, L + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL65 + nop + .align 4 + +.LL63: + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + add L, -1, L + + FMADD (aa1, bb1, cc01, cc01) + LDF [BO + 8 * SIZE], b1 + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 9 * SIZE], b2 + + LDF [AO + 4 * SIZE], a1 + cmp L, 0 + + FMADD (aa2, bb3, cc01, cc01) + LDF [BO + 10 * SIZE], b3 + FMADD (aa2, bb4, cc03, cc03) + LDF [BO + 11 * SIZE], b4 + + LDF [AO + 5 * SIZE], a2 + add AO, 4 * SIZE, AO + + FMADD (aa3, bb5, cc01, cc01) + LDF [BO + 12 * SIZE], b5 + FMADD (aa3, bb6, cc03, cc03) + LDF [BO + 13 * SIZE], b6 + + LDF [AO + 2 * SIZE], a3 + add BO, 8 * SIZE, BO + + FMADD (aa4, bb7, cc01, cc01) + LDF [BO + 6 * SIZE], b7 + FMADD (aa4, bb8, cc03, cc03) + LDF [BO + 7 * SIZE], b8 + + bg,pt %icc, .LL63 + LDF [AO + 3 * SIZE], a4 + .align 4 + +.LL65: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + sub K, KK, L + and L, 3, 
L +#endif + cmp L, 0 + ble,a,pn %icc, .LL68 + nop + .align 4 + +.LL67: + FMADD (aa1, bb1, cc01, cc01) + LDF [BO + 2 * SIZE], b1 + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 3 * SIZE], b2 + + LDF [AO + 1 * SIZE], a1 + add L, -1, L + add AO, 1 * SIZE, AO + cmp L, 0 + + bg,pt %icc, .LL67 + add BO, 2 * SIZE, BO + .align 4 + +.LL68: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 1, TEMP1 +#else + sub KK, 2, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 0, TEMP2 + sll TEMP1, BASE_SHIFT + 1, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 +#endif + +#if defined(LN) || defined(LT) + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c03, c03 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FMUL a1, c01, c01 + + FNMSUB (aa2, cc01, cc03, cc03) + + LDF [BO + 3 * SIZE], a1 + + FMUL a1, c03, c03 +#endif + +#ifdef RT + LDF [BO + 3 * SIZE], a1 + LDF [BO + 2 * SIZE], a2 + + FMUL a1, c03, c03 + + FNMSUB (aa2, cc03, cc01, cc01) + + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 +#endif + +#ifdef LN + add C1, -1 * SIZE, C1 + add C2, -1 * SIZE, C2 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c03, [BO + 1 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c03, [AO + 1 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c03, [C2 + 0 * SIZE] + +#ifdef RT + sll K, BASE_SHIFT + 0, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, BASE_SHIFT + 0, TEMP2 + sll TEMP1, BASE_SHIFT + 1, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + .align 4 + +.LL60: + sra M, 1, I + cmp I, 0 + ble,pn %icc, .LL69 + nop + .align 4 + +.LL52: +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TEMP1 + sll KK, BASE_SHIFT + 1, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + FCLR (cc01) + LDF [BO + 3 * SIZE], b4 + FCLR (cc02) + + LDF [BO + 4 * SIZE], b5 + FCLR (cc03) + LDF [BO + 5 * SIZE], b6 + FCLR (cc04) + LDF [BO + 6 * SIZE], b7 + FCLR (cc05) + LDF [BO + 7 * SIZE], b8 + FCLR (cc06) + + prefetch [C1 + 2 * SIZE], 3 + FCLR (cc07) + prefetch [C2 + 2 * SIZE], 3 + FCLR (cc08) + +#if defined(LT) || defined(RN) + sra KK, 2, L +#else + sub K, KK, L + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL55 + nop + .align 4 + +.LL53: + FMADD (aa1, bb1, cc01, cc01) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + FMADD (aa2, bb1, cc02, cc02) + LDF [BO + 8 * SIZE], b1 + + FMADD (aa1, bb2, cc03, cc03) + LDF [AO + 4 * SIZE], a1 + FMADD (aa2, bb2, cc04, cc04) + LDF [AO + 5 * SIZE], a2 + + FMADD (aa3, bb3, cc01, cc01) + LDF [BO + 9 * SIZE], b2 + FMADD (aa4, bb3, cc02, cc02) + LDF [BO + 10 * SIZE], b3 + + FMADD (aa3, bb4, cc03, cc03) + LDF [AO + 6 * SIZE], a3 + FMADD (aa4, bb4, cc04, cc04) + LDF [AO + 7 * SIZE], a4 + + FMADD (aa1, bb5, cc01, cc01) + LDF [BO + 11 * SIZE], b4 + FMADD (aa2, bb5, cc02, cc02) + LDF [BO + 12 * SIZE], b5 + + FMADD (aa1, bb6, cc03, cc03) + LDF [AO + 8 * SIZE], a1 + FMADD (aa2, bb6, cc04, cc04) + 
LDF [AO + 9 * SIZE], a2 + + FMADD (aa3, bb7, cc01, cc01) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa4, bb7, cc02, cc02) + LDF [BO + 14 * SIZE], b7 + + FMADD (aa3, bb8, cc03, cc03) + LDF [AO + 10 * SIZE], a3 + FMADD (aa4, bb8, cc04, cc04) + LDF [AO + 11 * SIZE], a4 + + add AO, 8 * SIZE, AO + add L, -1, L + add BO, 8 * SIZE, BO + cmp L, 0 + + bg,pt %icc, .LL53 + LDF [BO + 7 * SIZE], b8 + .align 4 + +.LL55: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + sub K, KK, L + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL58 + nop + .align 4 + +.LL57: + FMADD (aa1, bb1, cc01, cc01) + add L, -1, L + FMADD (aa2, bb1, cc02, cc02) + LDF [BO + 2 * SIZE], b1 + + FMADD (aa1, bb2, cc03, cc03) + LDF [AO + 2 * SIZE], a1 + FMADD (aa2, bb2, cc04, cc04) + LDF [AO + 3 * SIZE], a2 + + add AO, 2 * SIZE, AO + cmp L, 0 + add BO, 2 * SIZE, BO + bg,pt %icc, .LL57 + LDF [BO + 1 * SIZE], b2 + .align 4 + +.LL58: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 2, TEMP1 +#else + sub KK, 2, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 1, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 + FSUB a3, c02, c02 + FSUB a4, c04, c04 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 +#endif + +#ifdef LN + LDF [AO + 3 * SIZE], a1 + LDF [AO + 2 * SIZE], a2 + LDF [AO + 0 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a1, c04, c04 + + FNMSUB (aa2, cc02, cc01, cc01) + FNMSUB (aa2, cc04, cc03, cc03) + + FMUL a3, c01, c01 + FMUL a3, c03, c03 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + FMUL a1, c03, c03 + + FNMSUB (aa2, cc01, cc02, cc02) + FNMSUB (aa2, cc03, cc04, cc04) + + FMUL a3, c02, c02 + FMUL a3, c04, c04 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + + FNMSUB (aa2, cc01, cc03, cc03) + FNMSUB (aa2, cc02, cc04, cc04) + + LDF [BO + 3 * SIZE], a1 + + FMUL a1, c03, c03 + FMUL a1, c04, c04 +#endif + +#ifdef RT + LDF [BO + 3 * SIZE], a1 + LDF [BO + 2 * SIZE], a2 + + FMUL a1, c04, c04 + FMUL a1, c03, c03 + + FNMSUB (aa2, cc04, cc02, cc02) + FNMSUB (aa2, cc03, cc01, cc01) + + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c02, c02 + FMUL a1, c01, c01 +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 + add C2, -2 * SIZE, C2 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c03, [BO + 1 * SIZE] + STF c02, [BO + 2 * SIZE] + STF c04, [BO + 3 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C2 + 0 * SIZE] + STF c04, [C2 + 1 * SIZE] + +#ifndef LN + add C1, 2 * SIZE, C1 + add C2, 2 * SIZE, C2 +#endif + +#ifdef RT + sll K, BASE_SHIFT + 1, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 1, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 2, KK +#endif + +#ifdef LN + sub KK, 2, KK +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL52 + nop + .align 4 + +.LL69: +#ifdef LN + sll K, BASE_SHIFT + 1, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || 
defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 2, KK +#endif + +#ifdef RT + sub KK, 2, KK +#endif + .align 4 + +.LL70: + and N, 1, J + cmp J, 0 + ble,pn %icc, .LL999 + nop + +#ifdef RT + sll K, BASE_SHIFT, TEMP1 + sub B, TEMP1, B +#endif + +#ifndef RT + mov C, C1 + add C1, LDC, C +#else + sub C, LDC, C1 + sub C, LDC, C +#endif + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL80 + nop + +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TEMP1 + sll KK, BASE_SHIFT + 0, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [BO + 0 * SIZE], b1 + LDF [AO + 1 * SIZE], a2 + LDF [BO + 1 * SIZE], b2 + LDF [AO + 2 * SIZE], a3 + LDF [BO + 2 * SIZE], b3 + LDF [AO + 3 * SIZE], a4 + LDF [BO + 3 * SIZE], b4 + +#if defined(LT) || defined(RN) + sra KK, 2, L +#else + sub K, KK, L + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL85 + FCLR (cc01) + .align 4 + +.LL83: + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + add L, -1, L + + FMADD (aa1, bb1, cc01, cc01) + LDF [AO + 4 * SIZE], a1 + LDF [BO + 4 * SIZE], b1 + + FMADD (aa2, bb2, cc01, cc01) + LDF [AO + 5 * SIZE], a2 + LDF [BO + 5 * SIZE], b2 + + FMADD (aa3, bb3, cc01, cc01) + LDF [AO + 6 * SIZE], a3 + LDF [BO + 6 * SIZE], b3 + + FMADD (aa4, bb4, cc01, cc01) + LDF [AO + 7 * SIZE], a4 + LDF [BO + 7 * SIZE], b4 + + add AO, 4 * SIZE, AO + cmp L, 0 + + bg,pt %icc, .LL83 + add BO, 4 * SIZE, BO + .align 4 + +.LL85: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + sub K, KK, L + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL88 + nop + .align 4 + +.LL87: + FMADD (aa1, bb1, cc01, cc01) + LDF [AO + 1 * SIZE], a1 + LDF [BO + 1 * SIZE], b1 + + add AO, 1 * SIZE, AO + add L, -1, L + cmp L, 0 + bg,pt %icc, .LL87 + add BO, 1 * SIZE, BO + .align 4 + +.LL88: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 1, TEMP1 +#else + sub KK, 1, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 0, TEMP2 + sll TEMP1, BASE_SHIFT + 0, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + + FSUB a1, c01, c01 +#else + LDF [AO + 0 * SIZE], a1 + + FSUB a1, c01, c01 +#endif + +#if defined(LN) || defined(LT) + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 +#endif + +#if defined(RN) || defined(RT) + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 +#endif + +#ifdef LN + add C1, -1 * SIZE, C1 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] +#else + STF c01, [AO + 0 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + +#ifdef RT + sll K, BASE_SHIFT + 0, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, BASE_SHIFT + 0, TEMP2 + sll TEMP1, BASE_SHIFT + 0, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + .align 4 + +.LL80: + sra M, 1, I + cmp I, 0 + ble,pn %icc, .LL89 + nop + .align 4 + +.LL72: +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TEMP1 + sll KK, BASE_SHIFT + 0, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [BO + 0 * SIZE], b1 + 
LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + FCLR (cc01) + LDF [BO + 3 * SIZE], b4 + FCLR (cc02) + + prefetch [C1 + 2 * SIZE], 3 + +#if defined(LT) || defined(RN) + sra KK, 2, L +#else + sub K, KK, L + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL75 + nop + +.LL73: + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + add L, -1, L + + FMADD (aa1, bb1, cc01, cc01) + LDF [AO + 4 * SIZE], a1 + FMADD (aa2, bb1, cc02, cc02) + LDF [AO + 5 * SIZE], a2 + + LDF [BO + 4 * SIZE], b1 + cmp L, 0 + + FMADD (aa3, bb2, cc01, cc01) + LDF [AO + 6 * SIZE], a3 + FMADD (aa4, bb2, cc02, cc02) + LDF [AO + 7 * SIZE], a4 + + LDF [BO + 5 * SIZE], b2 + add BO, 4 * SIZE, BO + + FMADD (aa1, bb3, cc01, cc01) + LDF [AO + 8 * SIZE], a1 + FMADD (aa2, bb3, cc02, cc02) + LDF [AO + 9 * SIZE], a2 + + LDF [BO + 2 * SIZE], b3 + add AO, 8 * SIZE, AO + + FMADD (aa3, bb4, cc01, cc01) + LDF [AO + 2 * SIZE], a3 + FMADD (aa4, bb4, cc02, cc02) + LDF [AO + 3 * SIZE], a4 + + bg,pt %icc, .LL73 + LDF [BO + 3 * SIZE], b4 + .align 4 + +.LL75: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + sub K, KK, L + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL78 + nop + .align 4 + +.LL77: + FMADD (aa1, bb1, cc01, cc01) + LDF [AO + 2 * SIZE], a1 + FMADD (aa2, bb1, cc02, cc02) + LDF [AO + 3 * SIZE], a2 + + LDF [BO + 1 * SIZE], b1 + add L, -1, L + add AO, 2 * SIZE, AO + cmp L, 0 + bg,pt %icc, .LL77 + add BO, 1 * SIZE, BO + .align 4 + +.LL78: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 2, TEMP1 +#else + sub KK, 1, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 0, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 +#endif + +#ifdef LN + LDF [AO + 3 * SIZE], a1 + LDF [AO + 2 * SIZE], a2 + LDF [AO + 0 * SIZE], a3 + + FMUL a1, c02, c02 + + FNMSUB (aa2, cc02, cc01, cc01) + + FMUL a3, c01, c01 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + + FNMSUB (aa2, cc01, cc02, cc02) + + FMUL a3, c02, c02 +#endif + +#if defined(RN) || defined(RT) + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + +#ifndef LN + add C1, 2 * SIZE, C1 +#endif + +#ifdef RT + sll K, BASE_SHIFT + 1, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 0, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 2, KK +#endif + +#ifdef LN + sub KK, 2, KK +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL72 + nop + .align 4 + +.LL89: +#ifdef LN + sll K, BASE_SHIFT, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 1, KK +#endif + +#ifdef RT + sub KK, 1, KK +#endif + .align 4 + +.LL999: +#ifdef TRMMKERNEL +#ifndef __64BIT__ + ld [%sp + STACK_START + 8], %g1 + ld [%sp + STACK_START + 12], %g2 + ld [%sp + STACK_START + 16], %g3 + ld [%sp + STACK_START + 20], %g4 +#else + ldx [%sp + STACK_START + 32], %g1 + ldx [%sp + STACK_START + 40], %g2 + ldx [%sp + STACK_START 
+ 48], %g3 + ldx [%sp + STACK_START + 56], %g4 +#endif +#endif + + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/trsm_kernel_LT.S b/kernel/sparc/trsm_kernel_LT.S new file mode 100644 index 0000000000..11df2050da --- /dev/null +++ b/kernel/sparc/trsm_kernel_LT.S @@ -0,0 +1,4221 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %i0 +#define N %i1 +#define K %i2 + +#if defined(DOUBLE) && !defined(__64BIT__) +#define A %i5 +#define B %i4 +#else +#define A %i4 +#define B %i5 +#endif + +#define C %o4 +#define LDC %o5 + +#define AO %l0 +#define BO %l1 +#define I %l2 +#define J %l3 +#define L %l4 + +#define C1 %o0 +#define C2 %o1 +#define C3 %o2 +#define C4 %o3 + +#define OFFSET %l5 +#define KK %l6 +#define TEMP1 %l7 +#define TEMP2 %i3 +#define AORIG %g1 + +#ifdef DOUBLE +#define c01 %f0 +#define c02 %f2 +#define c03 %f4 +#define c04 %f6 +#define c05 %f8 +#define c06 %f10 +#define c07 %f12 +#define c08 %f14 +#define c09 %f16 +#define c10 %f18 +#define c11 %f20 +#define c12 %f22 +#define c13 %f24 +#define c14 %f26 +#define c15 %f28 +#define c16 %f30 + +#define t1 %f32 +#define t2 %f34 +#define t3 %f36 +#define t4 %f38 + +#define a1 %f40 +#define a2 %f42 +#define a3 %f44 +#define a4 %f46 +#define a5 %f58 + +#define b1 %f48 +#define b2 %f50 +#define b3 %f52 +#define b4 %f54 +#define b5 %f56 + +#define FZERO %f60 +#else +#define c01 %f0 +#define c02 %f1 +#define c03 %f2 +#define c04 %f3 +#define c05 %f4 +#define c06 %f5 +#define c07 %f6 +#define c08 %f7 +#define c09 %f8 +#define c10 %f9 +#define c11 %f10 +#define c12 %f11 +#define c13 %f12 +#define c14 %f13 +#define c15 %f14 +#define c16 %f15 + +#define t1 %f16 +#define t2 %f17 +#define t3 %f18 +#define t4 %f19 + +#define a1 %f20 +#define a2 %f21 +#define a3 %f22 +#define a4 %f23 +#define a5 %f31 + +#define b1 %f24 +#define b2 %f25 +#define b3 %f26 +#define b4 %f27 +#define b5 %f28 + +#define FZERO %f29 +#endif + + PROLOGUE + SAVESP + nop + +#ifndef __64BIT__ +#ifdef DOUBLE + ld [%sp + STACK_START + 28], B + ld [%sp + STACK_START + 32], C + ld [%sp + STACK_START + 36], LDC + ld [%sp + STACK_START + 40], OFFSET +#else + ld [%sp + STACK_START + 28], C + ld [%sp + STACK_START + 32], LDC + ld [%sp + STACK_START + 36], OFFSET +#endif +#else + ldx [%sp+ STACK_START + 56], C + ldx [%sp+ STACK_START + 64], LDC + ldx [%sp+ STACK_START + 72], OFFSET +#endif + + FCLR(29) + + sll LDC, BASE_SHIFT, LDC + +#ifdef LN + smul M, K, TEMP1 + sll TEMP1, BASE_SHIFT, TEMP1 + add A, TEMP1, A + + sll M, BASE_SHIFT, TEMP1 + add C, TEMP1, C +#endif + +#ifdef RN + neg OFFSET, KK +#endif + +#ifdef RT + smul N, K, TEMP1 + sll TEMP1, BASE_SHIFT, TEMP1 + add B, TEMP1, B + + smul N, LDC, TEMP1 + add C, TEMP1, C + + sub N, OFFSET, KK +#endif + + sra N, 2, J + cmp J, 0 + ble,pn %icc, .LL100 + nop + +.LL11: +#ifdef RT + sll K, 2 + BASE_SHIFT, TEMP1 + sub B, TEMP1, B + + sll LDC, 2, TEMP1 + sub C, TEMP1, C +#endif + + add C, LDC, C2 + FMOV FZERO, t1 + nop + mov C, C1 + + add C2, LDC, C3 + FMOV FZERO, t2 + sra M, 2, I + add C3, LDC, C4 + FMOV FZERO, t3 + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + cmp I, 0 +#ifndef RT + add C4, LDC, C +#endif + FMOV FZERO, t4 + + ble,pn %icc, .LL50 + FMOV FZERO, c01 + +.LL21: + FMOV FZERO, c02 + FMOV FZERO, c03 + +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 2 + BASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 2 + BASE_SHIFT, TEMP1 + + add AORIG, TEMP1, AO + add B, TEMP1, BO + + sub K, KK, TEMP1 + + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c04 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, c05 + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c06 + 
LDF [BO + 1 * SIZE], b2 + FMOV FZERO, c07 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c08 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, c09 + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c10 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, c11 + LDF [BO + 4 * SIZE], b5 /* ***** */ + + LDF [AO + 4 * SIZE], a5 /* ***** */ + + prefetch [C1 + 3 * SIZE], 3 + FMOV FZERO, c12 + prefetch [C2 + 3 * SIZE], 3 + FMOV FZERO, c13 + prefetch [C3 + 3 * SIZE], 3 + FMOV FZERO, c14 + prefetch [C4 + 3 * SIZE], 3 + FMOV FZERO, c15 + + ble,pn %icc, .LL25 + FMOV FZERO, c16 + +#define APREFETCHSIZE 40 +#define BPREFETCHSIZE 40 + +#define APREFETCH_CATEGORY 0 +#define BPREFETCH_CATEGORY 0 + +.LL22: + FADD c04, t1, c04 + prefetch [AO + APREFETCHSIZE * SIZE], APREFETCH_CATEGORY + FMUL a1, b1, t1 + nop + + FADD c08, t2, c08 + prefetch [BO + BPREFETCHSIZE * SIZE], BPREFETCH_CATEGORY + FMUL a1, b2, t2 + add AO, 16 * SIZE, AO + + FADD c12, t3, c12 + LDF [AO - 13 * SIZE], a4 + FMUL a1, b3, t3 + add BO, 16 * SIZE, BO + + FADD c16, t4, c16 + nop + FMUL a1, b4, t4 + LDF [AO - 8 * SIZE], a1 + + FADD c01, t1, c01 + nop + FMUL a2, b1, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD c13, t4, c13 + add L, -1, L + FMUL a2, b4, t4 + LDF [AO - 11 * SIZE], a2 + + FADD c02, t1, c02 + nop + FMUL a3, b1, t1 + nop + + FADD c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO - 10 * SIZE], a3 + + FADD c03, t1, c03 + nop + FMUL a4, b1, t1 + LDF [BO - 8 * SIZE], b1 + + FADD c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO - 11 * SIZE], b2 + + FADD c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO - 10 * SIZE], b3 + + FADD c15, t4, c15 + nop + FMUL a4, b4, t4 + LDF [BO - 9 * SIZE], b4 + + FADD c04, t1, c04 + nop + FMUL a5, b5, t1 + LDF [AO - 9 * SIZE], a4 + + FADD c08, t2, c08 + nop + FMUL a5, b2, t2 + nop + + FADD c12, t3, c12 + nop + FMUL a5, b3, t3 + nop + + FADD c16, t4, c16 + nop + FMUL a5, b4, t4 + LDF [AO - 4 * SIZE], a5 + + FADD c01, t1, c01 + nop + FMUL a2, b5, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD c13, t4, c13 + nop + FMUL a2, b4, t4 + LDF [AO - 7 * SIZE], a2 + + FADD c02, t1, c02 + nop + FMUL a3, b5, t1 + nop + + FADD c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO - 6 * SIZE], a3 + + FADD c03, t1, c03 + nop + FMUL a4, b5, t1 + LDF [BO - 4 * SIZE], b5 + + FADD c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO - 7 * SIZE], b2 + + FADD c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO - 6 * SIZE], b3 + + FADD c15, t4, c15 + nop + FMUL a4, b4, t4 + LDF [BO - 5 * SIZE], b4 + + FADD c04, t1, c04 + nop + FMUL a1, b1, t1 + LDF [AO - 5 * SIZE], a4 + + FADD c08, t2, c08 + nop + FMUL a1, b2, t2 + nop + + FADD c12, t3, c12 + nop + FMUL a1, b3, t3 + nop + + FADD c16, t4, c16 + nop + FMUL a1, b4, t4 + LDF [AO - 0 * SIZE], a1 + + FADD c01, t1, c01 + nop + FMUL a2, b1, t1 + nop + +#ifdef DOUBLE + prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY +#else + nop +#endif + FADD c05, t2, c05 + nop + FMUL a2, b2, t2 + + FADD c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD c13, t4, c13 + nop + FMUL a2, b4, t4 + nop + + FADD c02, t1, c02 + nop + FMUL a3, b1, t1 + LDF [AO - 3 * SIZE], a2 + + FADD c06, t2, c06 +#ifdef DOUBLE + prefetch [BO + (BPREFETCHSIZE + 8) * SIZE], BPREFETCH_CATEGORY +#else + nop +#endif + FMUL a3, 
b2, t2 + nop + + FADD c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO - 2 * SIZE], a3 + + FADD c03, t1, c03 + nop + FMUL a4, b1, t1 + LDF [BO - 0 * SIZE], b1 + + FADD c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO - 3 * SIZE], b2 + + FADD c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO - 2 * SIZE], b3 + + FADD c15, t4, c15 + nop + FMUL a4, b4, t4 + LDF [BO - 1 * SIZE], b4 + + FADD c04, t1, c04 + nop + FMUL a5, b5, t1 + LDF [AO - 1 * SIZE], a4 + + FADD c08, t2, c08 + FMUL a5, b2, t2 + FADD c12, t3, c12 + FMUL a5, b3, t3 + + FADD c16, t4, c16 + nop + FMUL a5, b4, t4 + LDF [AO + 4 * SIZE], a5 + + FADD c01, t1, c01 + nop + FMUL a2, b5, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD c13, t4, c13 + nop + FMUL a2, b4, t4 + LDF [AO + 1 * SIZE], a2 + + FADD c02, t1, c02 + nop + FMUL a3, b5, t1 + nop + + FADD c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO + 2 * SIZE], a3 + + FADD c03, t1, c03 + cmp L, 0 + FMUL a4, b5, t1 + LDF [BO + 4 * SIZE], b5 + + FADD c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD c15, t4, c15 + FMUL a4, b4, t4 + bg,pt %icc, .LL22 + LDF [BO + 3 * SIZE], b4 + +.LL25: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL29 + nop + +.LL26: + FADD c04, t1, c04 + LDF [AO + 3 * SIZE], a4 + FMUL a1, b1, t1 + add AO, 4 * SIZE, AO + + FADD c08, t2, c08 + add BO, 4 * SIZE, BO + FMUL a1, b2, t2 + add L, -1, L + + FADD c12, t3, c12 + nop + FMUL a1, b3, t3 + cmp L, 0 + + FADD c16, t4, c16 + nop + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + FADD c01, t1, c01 + nop + FMUL a2, b1, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD c13, t4, c13 + nop + FMUL a2, b4, t4 + LDF [AO + 1 * SIZE], a2 + + FADD c02, t1, c02 + nop + FMUL a3, b1, t1 + nop + + FADD c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO + 2 * SIZE], a3 + + FADD c03, t1, c03 + nop + FMUL a4, b1, t1 + LDF [BO + 0 * SIZE], b1 + + FADD c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD c15, t4, c15 + FMUL a4, b4, t4 + bg,pt %icc, .LL26 + LDF [BO + 3 * SIZE], b4 + +.LL29: +#if defined(LN) || defined(RT) + sub KK, 4, TEMP1 + sll TEMP1, 2 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AO + add B, TEMP1, BO +#endif + + FADD c04, t1, c04 + FADD c08, t2, c08 + FADD c12, t3, c12 + FADD c16, t4, c16 + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c05, c05 + FSUB a3, c09, c09 + FSUB a4, c13, c13 + + FSUB b1, c02, c02 + FSUB b2, c06, c06 + FSUB b3, c10, c10 + FSUB b4, c14, c14 + + LDF [BO + 8 * SIZE], a1 + LDF [BO + 9 * SIZE], a2 + LDF [BO + 10 * SIZE], a3 + LDF [BO + 11 * SIZE], a4 + + LDF [BO + 12 * SIZE], b1 + LDF [BO + 13 * SIZE], b2 + LDF [BO + 14 * SIZE], b3 + LDF [BO + 15 * SIZE], b4 + + FSUB a1, c03, c03 + FSUB a2, c07, c07 + FSUB a3, c11, c11 + FSUB a4, c15, c15 + + FSUB b1, 
c04, c04 + FSUB b2, c08, c08 + FSUB b3, c12, c12 + FSUB b4, c16, c16 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [AO + 4 * SIZE], b1 + LDF [AO + 5 * SIZE], b2 + LDF [AO + 6 * SIZE], b3 + LDF [AO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 + + FSUB b1, c05, c05 + FSUB b2, c06, c06 + FSUB b3, c07, c07 + FSUB b4, c08, c08 + + LDF [AO + 8 * SIZE], a1 + LDF [AO + 9 * SIZE], a2 + LDF [AO + 10 * SIZE], a3 + LDF [AO + 11 * SIZE], a4 + + LDF [AO + 12 * SIZE], b1 + LDF [AO + 13 * SIZE], b2 + LDF [AO + 14 * SIZE], b3 + LDF [AO + 15 * SIZE], b4 + + FSUB a1, c09, c09 + FSUB a2, c10, c10 + FSUB a3, c11, c11 + FSUB a4, c12, c12 + + FSUB b1, c13, c13 + FSUB b2, c14, c14 + FSUB b3, c15, c15 + FSUB b4, c16, c16 +#endif + +#ifdef LN + LDF [AO + 15 * SIZE], a1 + LDF [AO + 14 * SIZE], a2 + LDF [AO + 13 * SIZE], a3 + LDF [AO + 12 * SIZE], a4 + + FMUL a1, c04, c04 + FMUL a1, c08, c08 + FMUL a1, c12, c12 + FMUL a1, c16, c16 + + FMUL a2, c04, t1 + FMUL a2, c08, t2 + FMUL a2, c12, t3 + FMUL a2, c16, t4 + + FSUB c03, t1, c03 + FSUB c07, t2, c07 + FSUB c11, t3, c11 + FSUB c15, t4, c15 + + FMUL a3, c04, t1 + FMUL a3, c08, t2 + FMUL a3, c12, t3 + FMUL a3, c16, t4 + + FSUB c02, t1, c02 + FSUB c06, t2, c06 + FSUB c10, t3, c10 + FSUB c14, t4, c14 + + FMUL a4, c04, t1 + FMUL a4, c08, t2 + FMUL a4, c12, t3 + FMUL a4, c16, t4 + + FSUB c01, t1, c01 + FSUB c05, t2, c05 + FSUB c09, t3, c09 + FSUB c13, t4, c13 + + LDF [AO + 10 * SIZE], a1 + LDF [AO + 9 * SIZE], a2 + LDF [AO + 8 * SIZE], a3 + + FMUL a1, c03, c03 + FMUL a1, c07, c07 + FMUL a1, c11, c11 + FMUL a1, c15, c15 + + FMUL a2, c03, t1 + FMUL a2, c07, t2 + FMUL a2, c11, t3 + FMUL a2, c15, t4 + + FSUB c02, t1, c02 + FSUB c06, t2, c06 + FSUB c10, t3, c10 + FSUB c14, t4, c14 + + FMUL a3, c03, t1 + FMUL a3, c07, t2 + FMUL a3, c11, t3 + FMUL a3, c15, t4 + + FSUB c01, t1, c01 + FSUB c05, t2, c05 + FSUB c09, t3, c09 + FSUB c13, t4, c13 + + LDF [AO + 5 * SIZE], a1 + LDF [AO + 4 * SIZE], a2 + + FMUL a1, c02, c02 + FMUL a1, c06, c06 + FMUL a1, c10, c10 + FMUL a1, c14, c14 + + FMUL a2, c02, t1 + FMUL a2, c06, t2 + FMUL a2, c10, t3 + FMUL a2, c14, t4 + + FSUB c01, t1, c01 + FSUB c05, t2, c05 + FSUB c09, t3, c09 + FSUB c13, t4, c13 + + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c05, c05 + FMUL a1, c09, c09 + FMUL a1, c13, c13 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FMUL a1, c01, c01 + FMUL a1, c05, c05 + FMUL a1, c09, c09 + FMUL a1, c13, c13 + + FMUL a2, c01, t1 + FMUL a2, c05, t2 + FMUL a2, c09, t3 + FMUL a2, c13, t4 + + FSUB c02, t1, c02 + FSUB c06, t2, c06 + FSUB c10, t3, c10 + FSUB c14, t4, c14 + + FMUL a3, c01, t1 + FMUL a3, c05, t2 + FMUL a3, c09, t3 + FMUL a3, c13, t4 + + FSUB c03, t1, c03 + FSUB c07, t2, c07 + FSUB c11, t3, c11 + FSUB c15, t4, c15 + + FMUL a4, c01, t1 + FMUL a4, c05, t2 + FMUL a4, c09, t3 + FMUL a4, c13, t4 + + FSUB c04, t1, c04 + FSUB c08, t2, c08 + FSUB c12, t3, c12 + FSUB c16, t4, c16 + + LDF [AO + 5 * SIZE], a1 + LDF [AO + 6 * SIZE], a2 + LDF [AO + 7 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a1, c06, c06 + FMUL a1, c10, c10 + FMUL a1, c14, c14 + + FMUL a2, c02, t1 + FMUL a2, c06, t2 + FMUL a2, c10, t3 + FMUL a2, c14, t4 + + FSUB c03, t1, c03 + FSUB c07, t2, c07 + FSUB c11, t3, c11 + FSUB c15, t4, c15 + + FMUL a3, c02, t1 + FMUL a3, c06, t2 + FMUL a3, c10, t3 + FMUL a3, c14, t4 + + FSUB c04, t1, c04 + FSUB c08, t2, c08 + FSUB c12, 
t3, c12 + FSUB c16, t4, c16 + + LDF [AO + 10 * SIZE], a1 + LDF [AO + 11 * SIZE], a2 + + FMUL a1, c03, c03 + FMUL a1, c07, c07 + FMUL a1, c11, c11 + FMUL a1, c15, c15 + + FMUL a2, c03, t1 + FMUL a2, c07, t2 + FMUL a2, c11, t3 + FMUL a2, c15, t4 + + FSUB c04, t1, c04 + FSUB c08, t2, c08 + FSUB c12, t3, c12 + FSUB c16, t4, c16 + + LDF [AO + 15 * SIZE], a1 + + FMUL a1, c04, c04 + FMUL a1, c08, c08 + FMUL a1, c12, c12 + FMUL a1, c16, c16 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + FMUL a1, c03, c03 + FMUL a1, c04, c04 + + FMUL a2, c01, t1 + FMUL a2, c02, t2 + FMUL a2, c03, t3 + FMUL a2, c04, t4 + + FSUB c05, t1, c05 + FSUB c06, t2, c06 + FSUB c07, t3, c07 + FSUB c08, t4, c08 + + FMUL a3, c01, t1 + FMUL a3, c02, t2 + FMUL a3, c03, t3 + FMUL a3, c04, t4 + + FSUB c09, t1, c09 + FSUB c10, t2, c10 + FSUB c11, t3, c11 + FSUB c12, t4, c12 + + FMUL a4, c01, t1 + FMUL a4, c02, t2 + FMUL a4, c03, t3 + FMUL a4, c04, t4 + + FSUB c13, t1, c13 + FSUB c14, t2, c14 + FSUB c15, t3, c15 + FSUB c16, t4, c16 + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 6 * SIZE], a2 + LDF [BO + 7 * SIZE], a3 + + FMUL a1, c05, c05 + FMUL a1, c06, c06 + FMUL a1, c07, c07 + FMUL a1, c08, c08 + + FMUL a2, c05, t1 + FMUL a2, c06, t2 + FMUL a2, c07, t3 + FMUL a2, c08, t4 + + FSUB c09, t1, c09 + FSUB c10, t2, c10 + FSUB c11, t3, c11 + FSUB c12, t4, c12 + + FMUL a3, c05, t1 + FMUL a3, c06, t2 + FMUL a3, c07, t3 + FMUL a3, c08, t4 + + FSUB c13, t1, c13 + FSUB c14, t2, c14 + FSUB c15, t3, c15 + FSUB c16, t4, c16 + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 11 * SIZE], a2 + + FMUL a1, c09, c09 + FMUL a1, c10, c10 + FMUL a1, c11, c11 + FMUL a1, c12, c12 + + FMUL a2, c09, t1 + FMUL a2, c10, t2 + FMUL a2, c11, t3 + FMUL a2, c12, t4 + + FSUB c13, t1, c13 + FSUB c14, t2, c14 + FSUB c15, t3, c15 + FSUB c16, t4, c16 + + LDF [BO + 15 * SIZE], a1 + + FMUL a1, c13, c13 + FMUL a1, c14, c14 + FMUL a1, c15, c15 + FMUL a1, c16, c16 +#endif + +#ifdef RT + LDF [BO + 15 * SIZE], a1 + LDF [BO + 14 * SIZE], a2 + LDF [BO + 13 * SIZE], a3 + LDF [BO + 12 * SIZE], a4 + + FMUL a1, c13, c13 + FMUL a1, c14, c14 + FMUL a1, c15, c15 + FMUL a1, c16, c16 + + FMUL a2, c13, t1 + FMUL a2, c14, t2 + FMUL a2, c15, t3 + FMUL a2, c16, t4 + + FSUB c09, t1, c09 + FSUB c10, t2, c10 + FSUB c11, t3, c11 + FSUB c12, t4, c12 + + FMUL a3, c13, t1 + FMUL a3, c14, t2 + FMUL a3, c15, t3 + FMUL a3, c16, t4 + + FSUB c05, t1, c05 + FSUB c06, t2, c06 + FSUB c07, t3, c07 + FSUB c08, t4, c08 + + FMUL a4, c13, t1 + FMUL a4, c14, t2 + FMUL a4, c15, t3 + FMUL a4, c16, t4 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + FSUB c03, t3, c03 + FSUB c04, t4, c04 + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 9 * SIZE], a2 + LDF [BO + 8 * SIZE], a3 + + FMUL a1, c09, c09 + FMUL a1, c10, c10 + FMUL a1, c11, c11 + FMUL a1, c12, c12 + + FMUL a2, c09, t1 + FMUL a2, c10, t2 + FMUL a2, c11, t3 + FMUL a2, c12, t4 + + FSUB c05, t1, c05 + FSUB c06, t2, c06 + FSUB c07, t3, c07 + FSUB c08, t4, c08 + + FMUL a3, c09, t1 + FMUL a3, c10, t2 + FMUL a3, c11, t3 + FMUL a3, c12, t4 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + FSUB c03, t3, c03 + FSUB c04, t4, c04 + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 4 * SIZE], a2 + + FMUL a1, c05, c05 + FMUL a1, c06, c06 + FMUL a1, c07, c07 + FMUL a1, c08, c08 + + FMUL a2, c05, t1 + FMUL a2, c06, t2 + FMUL a2, c07, t3 + FMUL a2, c08, t4 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + FSUB c03, t3, c03 + FSUB c04, t4, c04 + + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 
+ FMUL a1, c03, c03 + FMUL a1, c04, c04 +#endif + +#ifdef LN + add C1, -4 * SIZE, C1 + add C2, -4 * SIZE, C2 + add C3, -4 * SIZE, C3 + add C4, -4 * SIZE, C4 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c05, [BO + 1 * SIZE] + STF c09, [BO + 2 * SIZE] + STF c13, [BO + 3 * SIZE] + + STF c02, [BO + 4 * SIZE] + STF c06, [BO + 5 * SIZE] + STF c10, [BO + 6 * SIZE] + STF c14, [BO + 7 * SIZE] + + STF c03, [BO + 8 * SIZE] + STF c07, [BO + 9 * SIZE] + STF c11, [BO + 10 * SIZE] + STF c15, [BO + 11 * SIZE] + + STF c04, [BO + 12 * SIZE] + STF c08, [BO + 13 * SIZE] + STF c12, [BO + 14 * SIZE] + STF c16, [BO + 15 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] + + STF c05, [AO + 4 * SIZE] + STF c06, [AO + 5 * SIZE] + STF c07, [AO + 6 * SIZE] + STF c08, [AO + 7 * SIZE] + + STF c09, [AO + 8 * SIZE] + STF c10, [AO + 9 * SIZE] + STF c11, [AO + 10 * SIZE] + STF c12, [AO + 11 * SIZE] + + STF c13, [AO + 12 * SIZE] + STF c14, [AO + 13 * SIZE] + STF c15, [AO + 14 * SIZE] + STF c16, [AO + 15 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C1 + 2 * SIZE] + STF c04, [C1 + 3 * SIZE] + + STF c05, [C2 + 0 * SIZE] + STF c06, [C2 + 1 * SIZE] + STF c07, [C2 + 2 * SIZE] + STF c08, [C2 + 3 * SIZE] + + STF c09, [C3 + 0 * SIZE] + STF c10, [C3 + 1 * SIZE] + STF c11, [C3 + 2 * SIZE] + STF c12, [C3 + 3 * SIZE] + + STF c13, [C4 + 0 * SIZE] + STF c14, [C4 + 1 * SIZE] + STF c15, [C4 + 2 * SIZE] + STF c16, [C4 + 3 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 4 * SIZE, C1 + add C2, 4 * SIZE, C2 + add C3, 4 * SIZE, C3 + add C4, 4 * SIZE, C4 +#endif + +#ifdef RT + sll K, 2 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 2 + BASE_SHIFT, TEMP1 + add AO, TEMP1, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 4, KK +#endif + +#ifdef LN + sub KK, 4, KK +#endif + + add I, -1, I + cmp I, 0 + + bg,pt %icc, .LL21 + FMOV FZERO, c01 + +.LL50: + and M, 2, I + cmp I, 0 + ble,pn %icc, .LL70 + nop + +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 1 + BASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 1 + BASE_SHIFT, TEMP1 + sll KK, 2 + BASE_SHIFT, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO + + sub K, KK, TEMP1 + sra TEMP1, 2, L + cmp L, 0 +#endif + + FMOV FZERO, c02 + FMOV FZERO, t1 + FMOV FZERO, c04 + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, t2 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, c06 + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, t3 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, c08 + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, t4 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, c01 + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c03 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, c05 + + ble,pn %icc, .LL55 + FMOV FZERO, c07 + +.LL52: + FADD c02, t1, c02 + add AO, 8 * SIZE, AO + prefetch [AO + APREFETCHSIZE * SIZE], 0 + + FMUL a1, b1, t1 + add BO, 16 * SIZE, BO + + FADD c04, t2, c04 + add L, -1, L + FMUL a1, b2, t2 + + FADD c06, t3, c06 + cmp L, 0 + FMUL a1, b3, t3 + + FADD c08, t4, c08 + FMUL a1, b4, t4 + LDF [AO - 4 * SIZE], a1 + + FADD c01, t1, c01 + FMUL a2, b1, t1 + LDF [BO - 12 * SIZE], b1 + FADD c03, t2, c03 + FMUL a2, b2, t2 + LDF [BO - 11 * SIZE], b2 + + FADD c05, t3, c05 + FMUL a2, b3, t3 + LDF [BO - 10 * SIZE], b3 + FADD c07, t4, c07 + FMUL a2, b4, t4 + LDF [BO - 9 * SIZE], b4 + + FADD c02, t1, c02 + FMUL a3, b1, t1 + LDF [AO - 3 * SIZE], 
a2 + FADD c04, t2, c04 + FMUL a3, b2, t2 + + FADD c06, t3, c06 + FMUL a3, b3, t3 + FADD c08, t4, c08 + FMUL a3, b4, t4 + LDF [AO - 2 * SIZE], a3 + + FADD c01, t1, c01 + FMUL a4, b1, t1 + LDF [BO - 8 * SIZE], b1 + FADD c03, t2, c03 + FMUL a4, b2, t2 + LDF [BO - 7 * SIZE], b2 + + FADD c05, t3, c05 + FMUL a4, b3, t3 + LDF [BO - 6 * SIZE], b3 + FADD c07, t4, c07 + FMUL a4, b4, t4 + LDF [BO - 5 * SIZE], b4 + + FADD c02, t1, c02 + FMUL a1, b1, t1 + LDF [AO - 1 * SIZE], a4 + FADD c04, t2, c04 + FMUL a1, b2, t2 + + FADD c06, t3, c06 + FMUL a1, b3, t3 + FADD c08, t4, c08 + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + FADD c01, t1, c01 + FMUL a2, b1, t1 + LDF [BO - 4 * SIZE], b1 + + FADD c03, t2, c03 + FMUL a2, b2, t2 + LDF [BO - 3 * SIZE], b2 + + FADD c05, t3, c05 + FMUL a2, b3, t3 + LDF [BO - 2 * SIZE], b3 + FADD c07, t4, c07 + FMUL a2, b4, t4 + LDF [BO - 1 * SIZE], b4 + + FADD c02, t1, c02 + FMUL a3, b1, t1 + LDF [AO + 1 * SIZE], a2 + FADD c04, t2, c04 + FMUL a3, b2, t2 + + FADD c06, t3, c06 + FMUL a3, b3, t3 + FADD c08, t4, c08 + FMUL a3, b4, t4 + LDF [AO + 2 * SIZE], a3 + + FADD c01, t1, c01 + FMUL a4, b1, t1 + LDF [BO + 0 * SIZE], b1 + FADD c03, t2, c03 + FMUL a4, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD c05, t3, c05 + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + FADD c07, t4, c07 + FMUL a4, b4, t4 + LDF [BO + 3 * SIZE], b4 + + bg,pt %icc, .LL52 + LDF [AO + 3 * SIZE], a4 + +.LL55: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + FADD c02, t1, c02 + add AO, 2 * SIZE, AO + FMUL a1, b1, t1 + add L, -1, L + + add BO, 4 * SIZE, BO + FADD c04, t2, c04 + cmp L, 0 + FMUL a1, b2, t2 + + FADD c06, t3, c06 + FMUL a1, b3, t3 + FADD c08, t4, c08 + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + FADD c01, t1, c01 + FMUL a2, b1, t1 + LDF [BO + 0 * SIZE], b1 + FADD c03, t2, c03 + FMUL a2, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD c05, t3, c05 + FMUL a2, b3, t3 + LDF [BO + 2 * SIZE], b3 + FADD c07, t4, c07 + FMUL a2, b4, t4 + LDF [BO + 3 * SIZE], b4 + + bg,pt %icc, .LL56 + LDF [AO + 1 * SIZE], a2 + +.LL59: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 2, TEMP1 +#else + sub KK, 4, TEMP1 +#endif + sll TEMP1, 1 + BASE_SHIFT, TEMP2 + sll TEMP1, 2 + BASE_SHIFT, TEMP1 + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + + FADD c02, t1, c02 + FADD c04, t2, c04 + FADD c06, t3, c06 + FADD c08, t4, c08 + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 + FSUB a3, c05, c05 + FSUB a4, c07, c07 + + FSUB b1, c02, c02 + FSUB b2, c04, c04 + FSUB b3, c06, c06 + FSUB b4, c08, c08 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [AO + 4 * SIZE], b1 + LDF [AO + 5 * SIZE], b2 + LDF [AO + 6 * SIZE], b3 + LDF [AO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 + + FSUB b1, c05, c05 + FSUB b2, c06, c06 + FSUB b3, c07, c07 + FSUB b4, c08, c08 +#endif + +#ifdef LN + LDF [AO + 3 * SIZE], a1 + LDF [AO + 2 * SIZE], a2 + LDF [AO + 0 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a1, c04, c04 + FMUL a1, c06, c06 + FMUL a1, c08, c08 + + FMUL a2, c02, t1 + FMUL a2, c04, t2 + FMUL a2, c06, t3 + FMUL a2, c08, t4 + + FSUB c01, t1, c01 + FSUB c03, t2, c03 + FSUB c05, t3, c05 + FSUB c07, t4, c07 + + FMUL a3, c01, c01 + 
FMUL a3, c03, c03 + FMUL a3, c05, c05 + FMUL a3, c07, c07 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + FMUL a1, c03, c03 + FMUL a1, c05, c05 + FMUL a1, c07, c07 + + FMUL a2, c01, t1 + FMUL a2, c03, t2 + FMUL a2, c05, t3 + FMUL a2, c07, t4 + + FSUB c02, t1, c02 + FSUB c04, t2, c04 + FSUB c06, t3, c06 + FSUB c08, t4, c08 + + FMUL a3, c02, c02 + FMUL a3, c04, c04 + FMUL a3, c06, c06 + FMUL a3, c08, c08 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + + FMUL a2, c01, t1 + FMUL a2, c02, t2 + + FSUB c03, t1, c03 + FSUB c04, t2, c04 + + FMUL a3, c01, t1 + FMUL a3, c02, t2 + + FSUB c05, t1, c05 + FSUB c06, t2, c06 + + FMUL a4, c01, t1 + FMUL a4, c02, t2 + + FSUB c07, t1, c07 + FSUB c08, t2, c08 + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 6 * SIZE], a2 + LDF [BO + 7 * SIZE], a3 + + FMUL a1, c03, c03 + FMUL a1, c04, c04 + + FMUL a2, c03, t1 + FMUL a2, c04, t2 + + FSUB c05, t1, c05 + FSUB c06, t2, c06 + + FMUL a3, c03, t1 + FMUL a3, c04, t2 + + FSUB c07, t1, c07 + FSUB c08, t2, c08 + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 11 * SIZE], a2 + + FMUL a1, c05, c05 + FMUL a1, c06, c06 + + FMUL a2, c05, t1 + FMUL a2, c06, t2 + + FSUB c07, t1, c07 + FSUB c08, t2, c08 + + LDF [BO + 15 * SIZE], a1 + + FMUL a1, c07, c07 + FMUL a1, c08, c08 +#endif + +#ifdef RT + LDF [BO + 15 * SIZE], a1 + LDF [BO + 14 * SIZE], a2 + LDF [BO + 13 * SIZE], a3 + LDF [BO + 12 * SIZE], a4 + + FMUL a1, c07, c07 + FMUL a1, c08, c08 + + FMUL a2, c07, t1 + FMUL a2, c08, t2 + + FSUB c05, t1, c05 + FSUB c06, t2, c06 + + FMUL a3, c07, t1 + FMUL a3, c08, t2 + + FSUB c03, t1, c03 + FSUB c04, t2, c04 + + FMUL a4, c07, t1 + FMUL a4, c08, t2 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 9 * SIZE], a2 + LDF [BO + 8 * SIZE], a3 + + FMUL a1, c05, c05 + FMUL a1, c06, c06 + + FMUL a2, c05, t1 + FMUL a2, c06, t2 + + FSUB c03, t1, c03 + FSUB c04, t2, c04 + + FMUL a3, c05, t1 + FMUL a3, c06, t2 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 4 * SIZE], a2 + + FMUL a1, c03, c03 + FMUL a1, c04, c04 + + FMUL a2, c03, t1 + FMUL a2, c04, t2 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 + add C2, -2 * SIZE, C2 + add C3, -2 * SIZE, C3 + add C4, -2 * SIZE, C4 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c03, [BO + 1 * SIZE] + STF c05, [BO + 2 * SIZE] + STF c07, [BO + 3 * SIZE] + + STF c02, [BO + 4 * SIZE] + STF c04, [BO + 5 * SIZE] + STF c06, [BO + 6 * SIZE] + STF c08, [BO + 7 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] + + STF c05, [AO + 4 * SIZE] + STF c06, [AO + 5 * SIZE] + STF c07, [AO + 6 * SIZE] + STF c08, [AO + 7 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C2 + 0 * SIZE] + STF c04, [C2 + 1 * SIZE] + + STF c05, [C3 + 0 * SIZE] + STF c06, [C3 + 1 * SIZE] + STF c07, [C4 + 0 * SIZE] + STF c08, [C4 + 1 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 2 * SIZE, C1 + add C2, 2 * SIZE, C2 + add C3, 2 * SIZE, C3 + add C4, 2 * SIZE, C4 +#endif + +#ifdef RT + sll K, 1 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 1 + 
BASE_SHIFT, TEMP2 + sll TEMP1, 2 + BASE_SHIFT, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 2, KK +#endif + +#ifdef LN + sub KK, 2, KK +#endif + +.LL70: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL99 + nop + +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 0 + BASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 0 + BASE_SHIFT, TEMP1 + sll KK, 2 + BASE_SHIFT, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO + + sub K, KK, TEMP1 + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c01 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t1 + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c02 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t2 + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, t3 + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, t4 + + ble,pn %icc, .LL75 + nop + +.LL72: + FADD c01, t1, c01 + add L, -1, L + FMUL a1, b1, t1 + LDF [BO + 4 * SIZE], b1 + + FADD c02, t2, c02 + cmp L, 0 + FMUL a1, b2, t2 + LDF [BO + 5 * SIZE], b2 + + FADD c03, t3, c03 + FMUL a1, b3, t3 + LDF [BO + 6 * SIZE], b3 + + FADD c04, t4, c04 + FMUL a1, b4, t4 + LDF [BO + 7 * SIZE], b4 + LDF [AO + 4 * SIZE], a1 + + FADD c01, t1, c01 + add AO, 4 * SIZE, AO + FMUL a2, b1, t1 + LDF [BO + 8 * SIZE], b1 + + FADD c02, t2, c02 + FMUL a2, b2, t2 + LDF [BO + 9 * SIZE], b2 + + FADD c03, t3, c03 + FMUL a2, b3, t3 + LDF [BO + 10 * SIZE], b3 + + FADD c04, t4, c04 + FMUL a2, b4, t4 + LDF [BO + 11 * SIZE], b4 + LDF [AO + 1 * SIZE], a2 + + FADD c01, t1, c01 + FMUL a3, b1, t1 + LDF [BO + 12 * SIZE], b1 + + FADD c02, t2, c02 + FMUL a3, b2, t2 + LDF [BO + 13 * SIZE], b2 + + FADD c03, t3, c03 + FMUL a3, b3, t3 + LDF [BO + 14 * SIZE], b3 + + FADD c04, t4, c04 + FMUL a3, b4, t4 + LDF [BO + 15 * SIZE], b4 + LDF [AO + 2 * SIZE], a3 + + FADD c01, t1, c01 + FMUL a4, b1, t1 + LDF [BO + 16 * SIZE], b1 + + FADD c02, t2, c02 + FMUL a4, b2, t2 + LDF [BO + 17 * SIZE], b2 + + FADD c03, t3, c03 + FMUL a4, b3, t3 + LDF [BO + 18 * SIZE], b3 + + FADD c04, t4, c04 + FMUL a4, b4, t4 + LDF [BO + 19 * SIZE], b4 + + add BO, 16 * SIZE, BO + bg,pt %icc, .LL72 + LDF [AO + 3 * SIZE], a4 + +.LL75: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL79 + nop + +.LL76: + FADD c01, t1, c01 + add AO, 1 * SIZE, AO + FMUL a1, b1, t1 + LDF [BO + 4 * SIZE], b1 + + FADD c02, t2, c02 + add L, -1, L + FMUL a1, b2, t2 + LDF [BO + 5 * SIZE], b2 + + FADD c03, t3, c03 + cmp L, 0 + FMUL a1, b3, t3 + LDF [BO + 6 * SIZE], b3 + + FADD c04, t4, c04 + add BO, 4 * SIZE, BO + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + bg,pt %icc, .LL76 + LDF [BO + 3 * SIZE], b4 + + +.LL79: + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c03, t3, c03 + FADD c04, t4, c04 + +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 1, TEMP1 +#else + sub KK, 4, TEMP1 +#endif + sll TEMP1, 0 + BASE_SHIFT, TEMP2 + sll TEMP1, 2 + BASE_SHIFT, TEMP1 + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 +#endif + +#ifdef LN + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + 
FMUL a1, c03, c03 + FMUL a1, c04, c04 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + FMUL a1, c03, c03 + FMUL a1, c04, c04 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FMUL a1, c01, c01 + FMUL a2, c01, t1 + FSUB c02, t1, c02 + FMUL a3, c01, t1 + FSUB c03, t1, c03 + FMUL a4, c01, t1 + FSUB c04, t1, c04 + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 6 * SIZE], a2 + LDF [BO + 7 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a2, c02, t1 + FSUB c03, t1, c03 + FMUL a3, c02, t1 + FSUB c04, t1, c04 + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 11 * SIZE], a2 + + FMUL a1, c03, c03 + FMUL a2, c03, t1 + FSUB c04, t1, c04 + + LDF [BO + 15 * SIZE], a1 + + FMUL a1, c04, c04 +#endif + +#ifdef RT + LDF [BO + 15 * SIZE], a1 + LDF [BO + 14 * SIZE], a2 + LDF [BO + 13 * SIZE], a3 + LDF [BO + 12 * SIZE], a4 + + FMUL a1, c04, c04 + FMUL a2, c04, t1 + FSUB c03, t1, c03 + FMUL a3, c04, t1 + FSUB c02, t1, c02 + FMUL a4, c04, t1 + FSUB c01, t1, c01 + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 9 * SIZE], a2 + LDF [BO + 8 * SIZE], a3 + + FMUL a1, c03, c03 + FMUL a2, c03, t1 + FSUB c02, t1, c02 + FMUL a3, c03, t1 + FSUB c01, t1, c01 + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 4 * SIZE], a2 + + FMUL a1, c02, c02 + FMUL a2, c02, t1 + FSUB c01, t1, c01 + + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 +#endif + +#ifdef LN + add C1, -1 * SIZE, C1 + add C2, -1 * SIZE, C2 + add C3, -1 * SIZE, C3 + add C4, -1 * SIZE, C4 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] + STF c03, [BO + 2 * SIZE] + STF c04, [BO + 3 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C2 + 0 * SIZE] + STF c03, [C3 + 0 * SIZE] + STF c04, [C4 + 0 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 1 * SIZE, C1 + add C2, 1 * SIZE, C2 + add C3, 1 * SIZE, C3 + add C4, 1 * SIZE, C4 +#endif + +#ifdef RT + sll K, 0 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 0 + BASE_SHIFT, TEMP2 + sll TEMP1, 2 + BASE_SHIFT, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + +.LL99: +#ifdef LN + sll K, 2 + BASE_SHIFT, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 4, KK +#endif + +#ifdef RT + sub KK, 4, KK +#endif + + add J, -1, J + cmp J, 0 + bg,pt %icc, .LL11 + nop + +.LL100: /* n & 2 */ + and N, 2, J + cmp J, 0 + ble,pn %icc, .LL200 + nop + +#ifdef RT + sll K, 1 + BASE_SHIFT, TEMP1 + sub B, TEMP1, B + + sll LDC, 1, TEMP1 + sub C, TEMP1, C +#endif + + mov C, C1 + add C, LDC, C2 + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + +#ifndef RT + add C2, LDC, C +#endif + + sra M, 2, I + cmp I, 0 + ble,pn %icc, .LL150 + FMOV FZERO, c03 + +.LL121: +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 2 + BASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 2 + BASE_SHIFT, TEMP1 + sll KK, 1 + BASE_SHIFT, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO + + sub K, KK, TEMP1 + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, t1 + LDF [BO + 0 * SIZE], b1 + FMOV 
FZERO, c07 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, t2 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, c04 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, t3 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, c08 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, t4 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, c01 + + prefetch [C1 + 3 * SIZE], 2 + FMOV FZERO, c05 + prefetch [C2 + 3 * SIZE], 2 + FMOV FZERO, c02 + + ble,pn %icc, .LL125 + FMOV FZERO, c06 + +.LL122: + FADD c03, t1, c03 + add L, -1, L + FMUL a1, b1, t1 + prefetch [AO + APREFETCHSIZE * SIZE], 0 + + FADD c07, t2, c07 + add BO, 8 * SIZE, BO + FMUL a1, b2, t2 + LDF [AO + 4 * SIZE], a1 + + FADD c04, t3, c04 + add AO, 16 * SIZE, AO + FMUL a2, b1, t3 + cmp L, 0 + + FADD c08, t4, c08 + nop + FMUL a2, b2, t4 + LDF [AO - 11 * SIZE], a2 + + FADD c01, t1, c01 + nop + FMUL a3, b1, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a3, b2, t2 + LDF [AO - 10 * SIZE], a3 + + FADD c02, t3, c02 + nop + FMUL a4, b1, t3 + LDF [BO - 4 * SIZE], b1 + + FADD c06, t4, c06 + nop + FMUL a4, b2, t4 + LDF [BO - 3 * SIZE], b2 + + FADD c03, t1, c03 + nop + FMUL a1, b3, t1 + LDF [AO - 9 * SIZE], a4 + + FADD c07, t2, c07 + nop + FMUL a1, b4, t2 + LDF [AO - 8 * SIZE], a1 + + FADD c04, t3, c04 + nop + FMUL a2, b3, t3 + nop + + FADD c08, t4, c08 + nop + FMUL a2, b4, t4 + LDF [AO - 7 * SIZE], a2 + + FADD c01, t1, c01 + nop + FMUL a3, b3, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a3, b4, t2 + LDF [AO - 6 * SIZE], a3 + + FADD c02, t3, c02 + nop + FMUL a4, b3, t3 + LDF [BO - 2 * SIZE], b3 + + FADD c06, t4, c06 + nop + FMUL a4, b4, t4 + LDF [BO - 1 * SIZE], b4 + + FADD c03, t1, c03 + nop + FMUL a1, b1, t1 + LDF [AO - 5 * SIZE], a4 + + FADD c07, t2, c07 + nop + FMUL a1, b2, t2 + LDF [AO - 4 * SIZE], a1 + + FADD c04, t3, c04 + nop + FMUL a2, b1, t3 + nop + + FADD c08, t4, c08 + nop + FMUL a2, b2, t4 + LDF [AO - 3 * SIZE], a2 + + FADD c01, t1, c01 + nop + FMUL a3, b1, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a3, b2, t2 + LDF [AO - 2 * SIZE], a3 + + FADD c02, t3, c02 + nop + FMUL a4, b1, t3 + LDF [BO + 0 * SIZE], b1 + + FADD c06, t4, c06 + nop + FMUL a4, b2, t4 + LDF [BO + 1 * SIZE], b2 + + FADD c03, t1, c03 + nop + FMUL a1, b3, t1 + LDF [AO - 1 * SIZE], a4 + + FADD c07, t2, c07 + nop + FMUL a1, b4, t2 + LDF [AO + 0 * SIZE], a1 + + FADD c04, t3, c04 + nop + FMUL a2, b3, t3 + nop + + FADD c08, t4, c08 + nop + FMUL a2, b4, t4 + LDF [AO + 1 * SIZE], a2 + + FADD c01, t1, c01 + nop + FMUL a3, b3, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a3, b4, t2 + LDF [AO + 2 * SIZE], a3 + + FADD c02, t3, c02 + nop + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD c06, t4, c06 + FMUL a4, b4, t4 + LDF [AO + 3 * SIZE], a4 + + bg,pt %icc, .LL122 + LDF [BO + 3 * SIZE], b4 + +.LL125: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL129 + nop + +.LL126: + FADD c03, t1, c03 + add AO, 4 * SIZE, AO + FMUL a1, b1, t1 + add BO, 2 * SIZE, BO + + FADD c07, t2, c07 + add L, -1, L + FMUL a1, b2, t2 + LDF [AO + 0 * SIZE], a1 + + FADD c04, t3, c04 + cmp L, 0 + FMUL a2, b1, t3 + + FADD c08, t4, c08 + FMUL a2, b2, t4 + LDF [AO + 1 * SIZE], a2 + + FADD c01, t1, c01 + FMUL a3, b1, t1 + FADD c05, t2, c05 + FMUL a3, b2, t2 + LDF [AO + 2 * SIZE], a3 + + FADD c02, t3, c02 + FMUL a4, b1, t3 + LDF [BO + 0 * SIZE], b1 + FADD c06, t4, c06 + FMUL a4, b2, t4 + LDF [BO + 1 * SIZE], b2 + bg,pt %icc, .LL126 + LDF [AO + 3 * SIZE], a4 + +.LL129: + FADD c03, t1, c03 + FADD c07, t2, c07 + FADD c04, t3, c04 + FADD c08, t4, c08 + +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 4, TEMP1 +#else + sub 
KK, 2, TEMP1 +#endif + sll TEMP1, 2 + BASE_SHIFT, TEMP2 + sll TEMP1, 1 + BASE_SHIFT, TEMP1 + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c05, c05 + FSUB a3, c02, c02 + FSUB a4, c06, c06 + + FSUB b1, c03, c03 + FSUB b2, c07, c07 + FSUB b3, c04, c04 + FSUB b4, c08, c08 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [AO + 4 * SIZE], b1 + LDF [AO + 5 * SIZE], b2 + LDF [AO + 6 * SIZE], b3 + LDF [AO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 + + FSUB b1, c05, c05 + FSUB b2, c06, c06 + FSUB b3, c07, c07 + FSUB b4, c08, c08 +#endif + +#ifdef LN + LDF [AO + 15 * SIZE], a1 + LDF [AO + 14 * SIZE], a2 + LDF [AO + 13 * SIZE], a3 + LDF [AO + 12 * SIZE], a4 + + FMUL a1, c04, c04 + FMUL a1, c08, c08 + FMUL a2, c04, t1 + FMUL a2, c08, t2 + + FSUB c03, t1, c03 + FSUB c07, t2, c07 + FMUL a3, c04, t1 + FMUL a3, c08, t2 + + FSUB c02, t1, c02 + FSUB c06, t2, c06 + FMUL a4, c04, t1 + FMUL a4, c08, t2 + + FSUB c01, t1, c01 + FSUB c05, t2, c05 + + LDF [AO + 10 * SIZE], a1 + LDF [AO + 9 * SIZE], a2 + LDF [AO + 8 * SIZE], a3 + + FMUL a1, c03, c03 + FMUL a1, c07, c07 + FMUL a2, c03, t1 + FMUL a2, c07, t2 + + FSUB c02, t1, c02 + FSUB c06, t2, c06 + FMUL a3, c03, t1 + FMUL a3, c07, t2 + + FSUB c01, t1, c01 + FSUB c05, t2, c05 + + LDF [AO + 5 * SIZE], a1 + LDF [AO + 4 * SIZE], a2 + + FMUL a1, c02, c02 + FMUL a1, c06, c06 + FMUL a2, c02, t1 + FMUL a2, c06, t2 + + FSUB c01, t1, c01 + FSUB c05, t2, c05 + + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c05, c05 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FMUL a1, c01, c01 + FMUL a1, c05, c05 + FMUL a2, c01, t1 + FMUL a2, c05, t2 + + FSUB c02, t1, c02 + FSUB c06, t2, c06 + FMUL a3, c01, t1 + FMUL a3, c05, t2 + + FSUB c03, t1, c03 + FSUB c07, t2, c07 + FMUL a4, c01, t1 + FMUL a4, c05, t2 + + FSUB c04, t1, c04 + FSUB c08, t2, c08 + + LDF [AO + 5 * SIZE], a1 + LDF [AO + 6 * SIZE], a2 + LDF [AO + 7 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a1, c06, c06 + FMUL a2, c02, t1 + FMUL a2, c06, t2 + + FSUB c03, t1, c03 + FSUB c07, t2, c07 + FMUL a3, c02, t1 + FMUL a3, c06, t2 + FSUB c04, t1, c04 + FSUB c08, t2, c08 + + LDF [AO + 10 * SIZE], a1 + LDF [AO + 11 * SIZE], a2 + + FMUL a1, c03, c03 + FMUL a1, c07, c07 + FMUL a2, c03, t1 + FMUL a2, c07, t2 + + FSUB c04, t1, c04 + FSUB c08, t2, c08 + + LDF [AO + 15 * SIZE], a1 + + FMUL a1, c04, c04 + FMUL a1, c08, c08 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + FMUL a1, c03, c03 + FMUL a1, c04, c04 + + FMUL a2, c01, t1 + FMUL a2, c02, t2 + FMUL a2, c03, t3 + FMUL a2, c04, t4 + + FSUB c05, t1, c05 + FSUB c06, t2, c06 + FSUB c07, t3, c07 + FSUB c08, t4, c08 + + FMUL a3, c05, c05 + FMUL a3, c06, c06 + FMUL a3, c07, c07 + FMUL a3, c08, c08 +#endif + +#ifdef RT + LDF [BO + 3 * SIZE], a1 + LDF [BO + 2 * SIZE], a2 + LDF [BO + 0 * SIZE], a3 + + FMUL a1, c05, c05 + FMUL a1, c06, c06 + FMUL a1, c07, c07 + FMUL a1, c08, c08 + + FMUL a2, c05, t1 + FMUL a2, c06, t2 + FMUL a2, c07, t3 + FMUL a2, c08, t4 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + FSUB c03, t3, c03 + 
FSUB c04, t4, c04 + + FMUL a3, c01, c01 + FMUL a3, c02, c02 + FMUL a3, c03, c03 + FMUL a3, c04, c04 +#endif + +#ifdef LN + add C1, -4 * SIZE, C1 + add C2, -4 * SIZE, C2 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c05, [BO + 1 * SIZE] + STF c02, [BO + 2 * SIZE] + STF c06, [BO + 3 * SIZE] + + STF c03, [BO + 4 * SIZE] + STF c07, [BO + 5 * SIZE] + STF c04, [BO + 6 * SIZE] + STF c08, [BO + 7 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] + + STF c05, [AO + 4 * SIZE] + STF c06, [AO + 5 * SIZE] + STF c07, [AO + 6 * SIZE] + STF c08, [AO + 7 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C1 + 2 * SIZE] + STF c04, [C1 + 3 * SIZE] + + STF c05, [C2 + 0 * SIZE] + STF c06, [C2 + 1 * SIZE] + STF c07, [C2 + 2 * SIZE] + STF c08, [C2 + 3 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 4 * SIZE, C1 + add C2, 4 * SIZE, C2 +#endif + +#ifdef RT + sll K, 2 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 2 + BASE_SHIFT, TEMP2 + sll TEMP1, 1 + BASE_SHIFT, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 4, KK +#endif + +#ifdef LN + sub KK, 4, KK +#endif + + add I, -1, I + cmp I, 0 + + bg,pt %icc, .LL121 + FMOV FZERO, c03 + +.LL150: + and M, 2, I + cmp I, 0 + ble,pn %icc, .LL170 + nop + +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 1 + BASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 1 + BASE_SHIFT, TEMP1 + sll KK, 1 + BASE_SHIFT, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO + + sub K, KK, TEMP1 + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c01 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t1 + + LDF [AO + 1 * SIZE], a2 + cmp L, 0 + FMOV FZERO, c02 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t2 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, t3 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, t4 + ble,pn %icc, .LL155 + nop + +.LL152: + FADD c01, t1, c01 + add L, -1, L + FMUL a1, b1, t1 + prefetch [AO + APREFETCHSIZE * SIZE], 0 + + FADD c02, t2, c02 + add BO, 8 * SIZE, BO + FMUL a1, b2, t2 + LDF [AO + 4 * SIZE], a1 + + FADD c03, t3, c03 + cmp L, 0 + FMUL a2, b1, t3 + LDF [BO - 4 * SIZE], b1 + + FADD c04, t4, c04 + nop + FMUL a2, b2, t4 + LDF [AO + 5 * SIZE], a2 + + FADD c01, t1, c01 + nop + FMUL a3, b3, t1 + LDF [BO - 3 * SIZE], b2 + + FADD c02, t2, c02 + nop + FMUL a3, b4, t2 + LDF [AO + 6 * SIZE], a3 + + FADD c03, t3, c03 + nop + FMUL a4, b3, t3 + LDF [BO - 2 * SIZE], b3 + + FADD c04, t4, c04 + nop + FMUL a4, b4, t4 + LDF [AO + 7 * SIZE], a4 + + FADD c01, t1, c01 + nop + FMUL a1, b1, t1 + LDF [BO - 1 * SIZE], b4 + + FADD c02, t2, c02 + FMUL a1, b2, t2 + LDF [AO + 8 * SIZE], a1 + + FADD c03, t3, c03 + FMUL a2, b1, t3 + LDF [BO + 0 * SIZE], b1 + + FADD c04, t4, c04 + FMUL a2, b2, t4 + LDF [AO + 9 * SIZE], a2 + + FADD c01, t1, c01 + FMUL a3, b3, t1 + LDF [BO + 1 * SIZE], b2 + + FADD c02, t2, c02 + FMUL a3, b4, t2 + LDF [AO + 10 * SIZE], a3 + + FADD c03, t3, c03 + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD c04, t4, c04 + FMUL a4, b4, t4 + LDF [AO + 11 * SIZE], a4 + + add AO, 8 * SIZE, AO + bg,pt %icc, .LL152 + LDF [BO + 3 * SIZE], b4 + +.LL155: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn 
%icc, .LL159 + nop + +.LL156: + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c03, t3, c03 + FADD c04, t4, c04 + + FMUL a1, b1, t1 + FMUL a1, b2, t2 + FMUL a2, b1, t3 + FMUL a2, b2, t4 + + add AO, 2 * SIZE, AO + add BO, 2 * SIZE, BO + + add L, -1, L + cmp L, 0 + bg,pt %icc, .LL156 + nop + +.LL159: + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c03, t3, c03 + FADD c04, t4, c04 + +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 2, TEMP1 +#else + sub KK, 2, TEMP1 +#endif + sll TEMP1, 1 + BASE_SHIFT, TEMP2 + sll TEMP1, 1 + BASE_SHIFT, TEMP1 + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 + FSUB a3, c02, c02 + FSUB a4, c04, c04 +#endif + +#ifdef LN + LDF [AO + 3 * SIZE], a1 + LDF [AO + 2 * SIZE], a2 + LDF [AO + 0 * SIZE], a3 + + FMUL a1, c03, c03 + FMUL a1, c04, c04 + FMUL a2, c03, t1 + FMUL a2, c04, t2 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + FMUL a3, c01, c01 + FMUL a3, c02, c02 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + + FMUL a2, c01, t1 + FMUL a2, c02, t2 + + FSUB c03, t1, c03 + FSUB c04, t2, c04 + + FMUL a3, c03, c03 + FMUL a3, c04, c04 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + FMUL a1, c03, c03 + FMUL a2, c01, t1 + FMUL a2, c03, t2 + + FSUB c02, t1, c02 + FSUB c04, t2, c04 + FMUL a3, c02, c02 + FMUL a3, c04, c04 +#endif + +#ifdef RT + LDF [BO + 3 * SIZE], a1 + LDF [BO + 2 * SIZE], a2 + LDF [BO + 0 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a1, c04, c04 + + FMUL a2, c02, t1 + FMUL a2, c04, t2 + FSUB c01, t1, c01 + FSUB c03, t2, c03 + + FMUL a3, c01, c01 + FMUL a3, c03, c03 +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 + add C2, -2 * SIZE, C2 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] + STF c03, [BO + 2 * SIZE] + STF c04, [BO + 3 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c03, [AO + 1 * SIZE] + STF c02, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c03, [C1 + 1 * SIZE] + STF c02, [C2 + 0 * SIZE] + STF c04, [C2 + 1 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 2 * SIZE, C1 + add C2, 2 * SIZE, C2 +#endif + +#ifdef RT + sll K, 1 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 1 + BASE_SHIFT, TEMP2 + sll TEMP1, 1 + BASE_SHIFT, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 2, KK +#endif + +#ifdef LN + sub KK, 2, KK +#endif + + +.LL170: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL199 + nop + + +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 0 + BASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 0 + BASE_SHIFT, TEMP1 + sll KK, 1 + BASE_SHIFT, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO + + sub K, KK, TEMP1 + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c01 + LDF [BO + 0 * SIZE], b1 + FMOV 
FZERO, t1 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c02 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t2 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, t3 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, t4 + + ble,pn %icc, .LL175 + nop + +.LL172: + FADD c01, t1, c01 + add AO, 4 * SIZE, AO + FMUL a1, b1, t1 + LDF [BO + 4 * SIZE], b1 + + FADD c02, t2, c02 + FMUL a1, b2, t2 + LDF [BO + 5 * SIZE], b2 + + add L, -1, L + LDF [AO + 0 * SIZE], a1 + + FADD c03, t3, c03 + cmp L, 0 + FMUL a2, b3, t3 + LDF [BO + 6 * SIZE], b3 + + FADD c04, t4, c04 + FMUL a2, b4, t4 + LDF [BO + 7 * SIZE], b4 + LDF [AO + 1 * SIZE], a2 + + FADD c01, t1, c01 + FMUL a3, b1, t1 + LDF [BO + 8 * SIZE], b1 + + FADD c02, t2, c02 + FMUL a3, b2, t2 + LDF [BO + 9 * SIZE], b2 + LDF [AO + 2 * SIZE], a3 + + FADD c03, t3, c03 + FMUL a4, b3, t3 + LDF [BO + 10 * SIZE], b3 + FADD c04, t4, c04 + FMUL a4, b4, t4 + LDF [BO + 11 * SIZE], b4 + add BO, 8 * SIZE, BO + + bg,pt %icc, .LL172 + LDF [AO + 3 * SIZE], a4 + +.LL175: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL179 + nop + +.LL176: + FADD c01, t1, c01 + add L, -1, L + FMUL a1, b1, t1 + add AO, 1 * SIZE, AO + LDF [BO + 2 * SIZE], b1 + FADD c02, t2, c02 + cmp L, 0 + FMUL a1, b2, t2 + LDF [BO + 3 * SIZE], b2 + + add BO, 2 * SIZE, BO + bg,pt %icc, .LL176 + LDF [AO + 0 * SIZE], a1 + +.LL179: + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c03, t3, c03 + FADD c04, t4, c04 + + FADD c01, c03, c01 + FADD c02, c04, c02 + + +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 1, TEMP1 +#else + sub KK, 2, TEMP1 +#endif + sll TEMP1, 0 + BASE_SHIFT, TEMP2 + sll TEMP1, 1 + BASE_SHIFT, TEMP1 + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 +#endif + +#ifdef LN + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + FMUL a2, c01, t1 + FSUB c02, t1, c02 + FMUL a3, c02, c02 +#endif + +#ifdef RT + LDF [BO + 3 * SIZE], a1 + LDF [BO + 2 * SIZE], a2 + LDF [BO + 0 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a2, c02, t1 + FSUB c01, t1, c01 + FMUL a3, c01, c01 +#endif + +#ifdef LN + add C1, -1 * SIZE, C1 + add C2, -1 * SIZE, C2 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C2 + 0 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 1 * SIZE, C1 + add C2, 1 * SIZE, C2 +#endif + +#ifdef RT + sll K, 0 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 0 + BASE_SHIFT, TEMP2 + sll TEMP1, 1 + BASE_SHIFT, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + +.LL199: +#ifdef LN + sll K, 1 + BASE_SHIFT, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 2, KK +#endif + +#ifdef RT + sub KK, 2, KK +#endif + +.LL200: + and N, 1, J + + cmp J, 
0 + ble,pn %icc, .LL999 + nop + +#ifdef RT + sll K, 0 + BASE_SHIFT, TEMP1 + sub B, TEMP1, B + + sub C, LDC, C +#endif + + mov C, C1 + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + +#ifndef RT + add C, LDC, C +#endif + + sra M, 2, I + cmp I, 0 + ble,pn %icc, .LL250 + nop + +.LL221: +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 2 + BASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 2 + BASE_SHIFT, TEMP1 + sll KK, 0 + BASE_SHIFT, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO + + sub K, KK, TEMP1 + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c01 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t1 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c02 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t2 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, t3 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, t4 + + ble,pn %icc, .LL225 + prefetch [C1 + 4 * SIZE], 2 + +.LL222: + FADD c01, t1, c01 + add BO, 4 * SIZE, BO + FMUL a1, b1, t1 + LDF [AO + 4 * SIZE], a1 + + FADD c02, t2, c02 + FMUL a2, b1, t2 + LDF [AO + 5 * SIZE], a2 + + FADD c03, t3, c03 + add L, -1, L + FMUL a3, b1, t3 + LDF [AO + 6 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b1, t4 + LDF [AO + 7 * SIZE], a4 + LDF [BO + 0 * SIZE], b1 + + FADD c01, t1, c01 + cmp L, 0 + FMUL a1, b2, t1 + LDF [AO + 8 * SIZE], a1 + + FADD c02, t2, c02 + FMUL a2, b2, t2 + LDF [AO + 9 * SIZE], a2 + + FADD c03, t3, c03 + FMUL a3, b2, t3 + LDF [AO + 10 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b2, t4 + LDF [AO + 11 * SIZE], a4 + LDF [BO + 1 * SIZE], b2 + + FADD c01, t1, c01 + FMUL a1, b3, t1 + LDF [AO + 12 * SIZE], a1 + + FADD c02, t2, c02 + FMUL a2, b3, t2 + LDF [AO + 13 * SIZE], a2 + + FADD c03, t3, c03 + FMUL a3, b3, t3 + LDF [AO + 14 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b3, t4 + LDF [AO + 15 * SIZE], a4 + LDF [BO + 2 * SIZE], b3 + + FADD c01, t1, c01 + FMUL a1, b4, t1 + LDF [AO + 16 * SIZE], a1 + + FADD c02, t2, c02 + FMUL a2, b4, t2 + LDF [AO + 17 * SIZE], a2 + + FADD c03, t3, c03 + FMUL a3, b4, t3 + LDF [AO + 18 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b4, t4 + LDF [AO + 19 * SIZE], a4 + add AO, 16 * SIZE, AO + + bg,pt %icc, .LL222 + LDF [BO + 3 * SIZE], b4 + +.LL225: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL229 + nop + +.LL226: + FADD c01, t1, c01 + add BO, 1 * SIZE, BO + FMUL a1, b1, t1 + LDF [AO + 4 * SIZE], a1 + + FADD c02, t2, c02 + add L, -1, L + FMUL a2, b1, t2 + LDF [AO + 5 * SIZE], a2 + + FADD c03, t3, c03 + cmp L, 0 + FMUL a3, b1, t3 + LDF [AO + 6 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b1, t4 + LDF [AO + 7 * SIZE], a4 + add AO, 4 * SIZE, AO + + bg,pt %icc, .LL226 + LDF [BO + 0 * SIZE], b1 + +.LL229: + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c03, t3, c03 + FADD c04, t4, c04 + +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 4, TEMP1 +#else + sub KK, 1, TEMP1 +#endif + sll TEMP1, 2 + BASE_SHIFT, TEMP2 + sll TEMP1, 0 + BASE_SHIFT, TEMP1 + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 
* SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 +#endif + +#ifdef LN + LDF [AO + 15 * SIZE], a1 + LDF [AO + 14 * SIZE], a2 + LDF [AO + 13 * SIZE], a3 + LDF [AO + 12 * SIZE], a4 + + FMUL a1, c04, c04 + FMUL a2, c04, t1 + + FSUB c03, t1, c03 + FMUL a3, c04, t1 + + FSUB c02, t1, c02 + FMUL a4, c04, t1 + + FSUB c01, t1, c01 + + LDF [AO + 10 * SIZE], a1 + LDF [AO + 9 * SIZE], a2 + LDF [AO + 8 * SIZE], a3 + + FMUL a1, c03, c03 + FMUL a2, c03, t1 + + FSUB c02, t1, c02 + FMUL a3, c03, t1 + FSUB c01, t1, c01 + + LDF [AO + 5 * SIZE], a1 + LDF [AO + 4 * SIZE], a2 + + FMUL a1, c02, c02 + FMUL a2, c02, t1 + FSUB c01, t1, c01 + + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FMUL a1, c01, c01 + FMUL a2, c01, t1 + FSUB c02, t1, c02 + FMUL a3, c01, t1 + FSUB c03, t1, c03 + FMUL a4, c01, t1 + FSUB c04, t1, c04 + + LDF [AO + 5 * SIZE], a1 + LDF [AO + 6 * SIZE], a2 + LDF [AO + 7 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a2, c02, t1 + FSUB c03, t1, c03 + FMUL a3, c02, t1 + FSUB c04, t1, c04 + + LDF [AO + 10 * SIZE], a1 + LDF [AO + 11 * SIZE], a2 + + FMUL a1, c03, c03 + FMUL a2, c03, t1 + + FSUB c04, t1, c04 + + LDF [AO + 15 * SIZE], a1 + + FMUL a1, c04, c04 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + FMUL a1, c03, c03 + FMUL a1, c04, c04 +#endif + +#ifdef RT + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + FMUL a1, c03, c03 + FMUL a1, c04, c04 +#endif + +#ifdef LN + add C1, -4 * SIZE, C1 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] + STF c03, [BO + 2 * SIZE] + STF c04, [BO + 3 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C1 + 2 * SIZE] + STF c04, [C1 + 3 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 4 * SIZE, C1 +#endif + +#ifdef RT + sll K, 2 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 2 + BASE_SHIFT, TEMP2 + sll TEMP1, 0 + BASE_SHIFT, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 4, KK +#endif + +#ifdef LN + sub KK, 4, KK +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL221 + nop + +.LL250: + and M, 2, I + cmp I, 0 + ble,pn %icc, .LL270 + nop + +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 1 + BASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 1 + BASE_SHIFT, TEMP1 + sll KK, 0 + BASE_SHIFT, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO + + sub K, KK, TEMP1 + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c01 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t1 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c02 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t2 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, t3 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, t4 + + ble,pn %icc, .LL255 + nop + +.LL252: + FADD c01, t1, c01 + add L, -1, L + FMUL a1, b1, t1 + LDF [AO + 4 * SIZE], a1 + + FADD c02, t2, c02 + FMUL a2, b1, t2 + LDF [AO + 5 * SIZE], a2 + LDF [BO + 4 * SIZE], b1 + + FADD c03, t3, c03 + cmp L, 0 + FMUL a3, b2, t3 + LDF [AO + 6 * SIZE], a3 + + FADD 
c04, t4, c04 + FMUL a4, b2, t4 + LDF [AO + 7 * SIZE], a4 + LDF [BO + 5 * SIZE], b2 + + FADD c01, t1, c01 + FMUL a1, b3, t1 + LDF [AO + 8 * SIZE], a1 + + FADD c02, t2, c02 + FMUL a2, b3, t2 + LDF [AO + 9 * SIZE], a2 + LDF [BO + 6 * SIZE], b3 + + FADD c03, t3, c03 + FMUL a3, b4, t3 + LDF [AO + 10 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b4, t4 + LDF [AO + 11 * SIZE], a4 + add AO, 8 * SIZE, AO + + LDF [BO + 7 * SIZE], b4 + bg,pt %icc, .LL252 + add BO, 4 * SIZE, BO + +.LL255: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + + cmp L, 0 + ble,a,pn %icc, .LL259 + nop + +.LL256: + FADD c01, t1, c01 + add L, -1, L + FMUL a1, b1, t1 + LDF [AO + 2 * SIZE], a1 + + FADD c02, t2, c02 + cmp L, 0 + FMUL a2, b1, t2 + LDF [AO + 3 * SIZE], a2 + + LDF [BO + 1 * SIZE], b1 + add AO, 2 * SIZE, AO + + bg,pt %icc, .LL256 + add BO, 1 * SIZE, BO + +.LL259: + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c03, t3, c03 + FADD c04, t4, c04 + + FADD c01, c03, c01 + FADD c02, c04, c02 + +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 2, TEMP1 +#else + sub KK, 1, TEMP1 +#endif + sll TEMP1, 1 + BASE_SHIFT, TEMP2 + sll TEMP1, 0 + BASE_SHIFT, TEMP1 + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 +#endif + +#ifdef LN + LDF [AO + 3 * SIZE], a1 + LDF [AO + 2 * SIZE], a2 + LDF [AO + 0 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a2, c02, t1 + FSUB c01, t1, c01 + FMUL a3, c01, c01 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + FMUL a2, c01, t1 + FSUB c02, t1, c02 + FMUL a3, c02, c02 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 +#endif + +#ifdef RT + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 2 * SIZE, C1 +#endif + +#ifdef RT + sll K, 1 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 1 + BASE_SHIFT, TEMP2 + sll TEMP1, 0 + BASE_SHIFT, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 2, KK +#endif + +#ifdef LN + sub KK, 2, KK +#endif + +.LL270: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL299 + nop + +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 0 + BASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 0 + BASE_SHIFT, TEMP1 + + add AORIG, TEMP1, AO + add B, TEMP1, BO + + sub K, KK, TEMP1 + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, t1 + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c01 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, t2 + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c02 + + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t3 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t4 + LDF [BO + 2 * SIZE], b3 + + ble,pn %icc, .LL275 + LDF [BO + 3 * SIZE], b4 + +.LL272: + FADD c01, t1, c01 + add L, -1, L + add AO, 4 * SIZE, AO + + FMUL a1, b1, t1 + add BO, 4 * SIZE, BO + LDF [AO + 0 * SIZE], a1 + + FADD c02, t2, c02 + 
	cmp	L, 0
+	LDF	[BO + 0 * SIZE], b1
+	FMUL	a2, b2, t2
+
+	LDF	[AO + 1 * SIZE], a2
+	FADD	c01, t3, c01
+	LDF	[BO + 1 * SIZE], b2
+	FMUL	a3, b3, t3
+
+	LDF	[AO + 2 * SIZE], a3
+	FADD	c02, t4, c02
+	LDF	[BO + 2 * SIZE], b3
+	FMUL	a4, b4, t4
+	LDF	[AO + 3 * SIZE], a4
+
+	bg,pt	%icc, .LL272
+	LDF	[BO + 3 * SIZE], b4
+
+.LL275:
+#if defined(LT) || defined(RN)
+	and	KK, 3, L
+#else
+	and	TEMP1, 3, L
+#endif
+	cmp	L, 0
+	ble,a,pn %icc, .LL279
+	nop
+
+.LL276:
+	FADD	c01, t1, c01
+	add	L, -1, L
+	FMUL	a1, b1, t1
+	LDF	[AO + 1 * SIZE], a1
+
+	LDF	[BO + 1 * SIZE], b1
+	add	BO, 1 * SIZE, BO
+	cmp	L, 0
+	bg,pt	%icc, .LL276
+	add	AO, 1 * SIZE, AO
+
+.LL279:
+	FADD	c01, t1, c01
+	FADD	c02, t2, c02
+	FADD	c01, t3, c01
+	FADD	c02, t4, c02
+
+	FADD	c01, c02, c01
+
+#if defined(LN) || defined(RT)
+	sub	KK, 1, TEMP1
+	sll	TEMP1, 0 + BASE_SHIFT, TEMP1
+	add	AORIG, TEMP1, AO
+	add	B, TEMP1, BO
+#endif
+
+#if defined(LN) || defined(LT)
+	LDF	[BO + 0 * SIZE], a1
+	FSUB	a1, c01, c01
+#else
+	LDF	[AO + 0 * SIZE], a1
+	FSUB	a1, c01, c01
+#endif
+
+#ifdef LN
+	LDF	[AO + 0 * SIZE], a1
+	FMUL	a1, c01, c01
+#endif
+
+#ifdef LT
+	LDF	[AO + 0 * SIZE], a1
+	FMUL	a1, c01, c01
+#endif
+
+#ifdef RN
+	LDF	[BO + 0 * SIZE], a1
+	FMUL	a1, c01, c01
+#endif
+
+#ifdef RT
+	LDF	[BO + 0 * SIZE], a1
+	FMUL	a1, c01, c01
+#endif
+
+#ifdef LN
+	add	C1, -1 * SIZE, C1
+#endif
+
+#if defined(LN) || defined(LT)
+	STF	c01, [BO + 0 * SIZE]
+#else
+	STF	c01, [AO + 0 * SIZE]
+#endif
+
+	STF	c01, [C1 + 0 * SIZE]
+
+	FMOV	FZERO, t1
+	FMOV	FZERO, t2
+	FMOV	FZERO, t3
+	FMOV	FZERO, t4
+
+#ifndef LN
+	add	C1, 1 * SIZE, C1
+#endif
+
+#ifdef RT
+	sll	K, 0 + BASE_SHIFT, TEMP1
+	add	AORIG, TEMP1, AORIG
+#endif
+
+#if defined(LT) || defined(RN)
+	sub	K, KK, TEMP1
+	sll	TEMP1, 0 + BASE_SHIFT, TEMP1
+	add	AO, TEMP1, AO
+	add	BO, TEMP1, BO
+#endif
+
+#ifdef LT
+	add	KK, 1, KK
+#endif
+
+#ifdef LN
+	sub	KK, 1, KK
+#endif
+
+
+.LL299:
+#ifdef LN
+	sll	K, 0 + BASE_SHIFT, TEMP1
+	add	B, TEMP1, B
+#endif
+
+#if defined(LT) || defined(RN)
+	mov	BO, B
+#endif
+
+#ifdef RN
+	add	KK, 1, KK
+#endif
+
+#ifdef RT
+	sub	KK, 1, KK
+#endif
+
+
+.LL999:
+	return	%i7 + 8
+	clr	%o0
+
+	EPILOGUE
diff --git a/kernel/sparc/trsm_kernel_LT_2x8.S b/kernel/sparc/trsm_kernel_LT_2x8.S
new file mode 100644
index 0000000000..39015d72e5
--- /dev/null
+++ b/kernel/sparc/trsm_kernel_LT_2x8.S
@@ -0,0 +1,3896 @@
+/*********************************************************************/
+/* Copyright 2005-2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define APREFETCHSIZE 24
+#define APREFETCH_CATEGORY 0
+
+#define M	%i0
+#define N	%i1
+#define K	%i2
+
+#if defined(DOUBLE) && !defined(__64BIT__)
+#define A	%i5
+#define B	%i4
+#else
+#define A	%i4
+#define B	%i5
+#endif
+
+#define C	%o4
+#define LDC	%o5
+
+#define AO	%l0
+#define BO	%l1
+#define I	%l2
+#define J	%l3
+#define L	%l4
+
+#define C1	%o0
+#define C2	%o1
+#define C3	%o2
+#define C4	%o3
+
+#define C5	%l5
+#define C6	%l6
+#define C7	%l7
+#define C8	%i3
+
+#define OFFSET	%g1
+#define KK	%g2
+#define TEMP1	%g3
+#define TEMP2	%g4
+#define AORIG	%o7
+
+#ifdef DOUBLE
+#define c01	%f0
+#define c02	%f2
+#define c03	%f4
+#define c04	%f6
+#define c05	%f8
+#define c06	%f10
+#define c07	%f12
+#define c08	%f14
+#define c09	%f16
+#define c10	%f18
+#define c11	%f20
+#define c12	%f22
+#define c13	%f24
+#define c14	%f26
+#define c15	%f28
+#define c16	%f30
+
+#define a1	%f32
+#define a2	%f34
+#define a3	%f36
+#define a4	%f38
+#define a5	%f40
+
+#define b1	%f42
+#define b2	%f44
+#define b3	%f46
+#define b4	%f48
+#define b5	%f50
+#define b6	%f52
+#define b7	%f54
+#define b8	%f56
+#define b9	%f58
+
+#define cc01	0
+#define cc02	2
+#define cc03	4
+#define cc04	6
+#define cc05	8
+#define cc06	10
+#define cc07	12
+#define cc08	14
+#define cc09	16
+#define cc10	18
+#define cc11	20
+#define cc12	22
+#define cc13	24
+#define cc14	26
+#define cc15	28
+#define cc16	30
+
+#define aa1	1
+#define aa2	3
+#define aa3	5
+#define aa4	7
+#define aa5	9
+
+#define bb1	11
+#define bb2	13
+#define bb3	15
+#define bb4	17
+#define bb5	19
+#define bb6	21
+#define bb7	23
+#define bb8	25
+#define bb9	27
+
+#else
+#define c01	%f0
+#define c02	%f1
+#define c03	%f2
+#define c04	%f3
+#define c05	%f4
+#define c06	%f5
+#define c07	%f6
+#define c08	%f7
+#define c09	%f8
+#define c10	%f9
+#define c11	%f10
+#define c12	%f11
+#define c13	%f12
+#define c14	%f13
+#define c15	%f14
+#define c16	%f15
+
+#define a1	%f16
+#define a2	%f17
+#define a3	%f18
+#define a4	%f19
+#define a5	%f20
+
+#define b1	%f21
+#define b2	%f22
+#define b3	%f23
+#define b4	%f24
+#define b5	%f25
+#define b6	%f26
+#define b7	%f27
+#define b8	%f28
+#define b9	%f29
+
+#define cc01	0
+#define cc02	1
+#define cc03	2
+#define cc04	3
+#define cc05	4
+#define cc06	5
+#define cc07	6
+#define cc08	7
+#define cc09	8
+#define cc10	9
+#define cc11	10
+#define cc12	11
+#define cc13	12
+#define cc14	13
+#define cc15	14
+#define cc16	15
+
+#define aa1	16
+#define aa2	17
+#define aa3	18
+#define aa4	19
+#define aa5	20
+
+#define bb1	21
+#define bb2	22
+#define bb3	23
+#define bb4	24
+#define bb5	25
+#define bb6	26
+#define
bb7 27 +#define bb8 28 +#define bb9 29 + +#endif + + .register %g2, #scratch + .register %g3, #scratch + + PROLOGUE + SAVESP + nop + +#ifndef __64BIT__ + +#ifdef DOUBLE + ld [%sp + STACK_START + 28], B + ld [%sp + STACK_START + 32], C + ld [%sp + STACK_START + 36], LDC + ld [%sp + STACK_START + 40], OFFSET +#else + ld [%sp + STACK_START + 28], C + ld [%sp + STACK_START + 32], LDC + ld [%sp + STACK_START + 36], OFFSET +#endif + st %g1, [%sp + STACK_START + 8] + st %g2, [%sp + STACK_START + 12] + st %g3, [%sp + STACK_START + 16] + st %g4, [%sp + STACK_START + 20] +#else + + ldx [%sp+ STACK_START + 56], C + ldx [%sp+ STACK_START + 64], LDC + ldx [%sp+ STACK_START + 72], OFFSET + + stx %g1, [%sp + STACK_START + 32] + stx %g2, [%sp + STACK_START + 40] + stx %g3, [%sp + STACK_START + 48] + stx %g4, [%sp + STACK_START + 56] +#endif + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg OFFSET, KK +#endif + + sll LDC, BASE_SHIFT, LDC + +#ifdef LN + smul M, K, TEMP1 + sll TEMP1, BASE_SHIFT, TEMP1 + add A, TEMP1, A + + sll M, BASE_SHIFT, TEMP1 + add C, TEMP1, C +#endif + +#ifdef RN + neg OFFSET, KK +#endif + +#ifdef RT + smul N, K, TEMP1 + sll TEMP1, BASE_SHIFT, TEMP1 + add B, TEMP1, B + + smul N, LDC, TEMP1 + add C, TEMP1, C + + sub N, OFFSET, KK +#endif + + sra N, 3, J + cmp J, 0 + ble,pn %icc, .LL30 + nop + .align 4 + +.LL11: +#ifdef RT + sll K, BASE_SHIFT + 3, TEMP1 + sub B, TEMP1, B +#endif + +#ifndef RT + mov C, C1 + add C, LDC, C2 + add C2, LDC, C3 + add C3, LDC, C4 + add C4, LDC, C5 + add C5, LDC, C6 + add C6, LDC, C7 + add C7, LDC, C8 + add C8, LDC, C +#else + sub C, LDC, C8 + sub C8, LDC, C7 + sub C7, LDC, C6 + sub C6, LDC, C5 + sub C5, LDC, C4 + sub C4, LDC, C3 + sub C3, LDC, C2 + sub C2, LDC, C1 + sub C2, LDC, C +#endif + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + sra M, 1, I + cmp I, 0 + ble,pn %icc, .LL20 + nop + .align 4 + +.LL12: +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TEMP1 + sll KK, BASE_SHIFT + 3, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 8 * SIZE], a5 + + LDF [BO + 0 * SIZE], b1 + + LDF [BO + 1 * SIZE], b2 + FCLR (cc01) + LDF [BO + 2 * SIZE], b3 + FCLR (cc05) + LDF [BO + 3 * SIZE], b4 + FCLR (cc09) + LDF [BO + 4 * SIZE], b5 + FCLR (cc13) + + LDF [BO + 5 * SIZE], b6 + FCLR (cc02) + LDF [BO + 6 * SIZE], b7 + FCLR (cc06) + LDF [BO + 7 * SIZE], b8 + FCLR (cc10) + LDF [BO + 8 * SIZE], b9 + FCLR (cc14) + + prefetch [C1 + 1 * SIZE], 3 + FCLR (cc03) + prefetch [C2 + 2 * SIZE], 3 + FCLR (cc07) + prefetch [C3 + 1 * SIZE], 3 + FCLR (cc11) + prefetch [C4 + 2 * SIZE], 3 + FCLR (cc15) + + prefetch [C5 + 1 * SIZE], 3 + FCLR (cc04) + prefetch [C6 + 2 * SIZE], 3 + FCLR (cc08) + prefetch [C7 + 1 * SIZE], 3 + FCLR (cc12) + prefetch [C8 + 2 * SIZE], 3 + FCLR (cc16) + +#if defined(LT) || defined(RN) + sra KK, 3, L +#else + sub K, KK, L + sra L, 3, L +#endif + cmp L, 0 + ble,pn %icc, .LL15 + nop + .align 4 + +.LL13: + FMADD (aa1, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa1, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 16 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 9 * SIZE], b2 + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) 
+ LDF [AO + 2 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 3 * SIZE], a4 + + FMADD (aa1, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + FMADD (aa2, bb6, cc12, cc12) + nop + + FMADD (aa1, bb7, cc13, cc13) + LDF [BO + 12 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa1, bb8, cc15, cc15) + LDF [BO + 14 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO + 15 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 24 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 17 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 18 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 19 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 4 * SIZE], a1 + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 5 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + add L, -1, L + FMADD (aa4, bb6, cc12, cc12) + nop + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 20 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 21 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 22 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + LDF [BO + 23 * SIZE], b8 + + FMADD (aa1, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa1, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 32 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 25 * SIZE], b2 + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 26 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 27 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + LDF [AO + 6 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 7 * SIZE], a4 + + FMADD (aa1, bb6, cc11, cc11) + nop + FMADD (aa2, bb6, cc12, cc12) + nop + + FMADD (aa1, bb7, cc13, cc13) + LDF [BO + 28 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO + 29 * SIZE], b6 + + FMADD (aa1, bb8, cc15, cc15) + LDF [BO + 30 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO + 31 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 40 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 33 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 34 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 35 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 16 * SIZE], a1 /****/ + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 9 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + nop + FMADD (aa4, bb6, cc12, cc12) + nop + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 36 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 37 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 38 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + LDF [BO + 39 * SIZE], b8 + + FMADD (aa5, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa5, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa5, bb3, cc05, cc05) + LDF [BO + 48 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 41 * SIZE], b2 + + FMADD (aa5, bb4, cc07, cc07) + LDF [BO + 42 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 43 * SIZE], b4 + + FMADD (aa5, bb5, cc09, cc09) + LDF [AO + 10 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 11 * SIZE], a4 + + FMADD (aa5, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY + FMADD (aa2, bb6, cc12, cc12) + nop + + FMADD (aa5, bb7, cc13, cc13) + LDF [BO + 44 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF 
[BO + 45 * SIZE], b6 + + FMADD (aa5, bb8, cc15, cc15) + LDF [BO + 46 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO + 47 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 56 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 49 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 50 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 51 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 12 * SIZE], a5 + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 13 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + cmp L, 0 + FMADD (aa4, bb6, cc12, cc12) + nop + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 52 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 53 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 54 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + LDF [BO + 55 * SIZE], b8 + + FMADD (aa5, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa5, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa5, bb3, cc05, cc05) + LDF [BO + 64 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 57 * SIZE], b2 + + FMADD (aa5, bb4, cc07, cc07) + LDF [BO + 58 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 59 * SIZE], b4 + + FMADD (aa5, bb5, cc09, cc09) + LDF [AO + 14 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 15 * SIZE], a4 + + FMADD (aa5, bb6, cc11, cc11) + add BO, 64 * SIZE, BO + FMADD (aa2, bb6, cc12, cc12) + add AO, 16 * SIZE, AO + + FMADD (aa5, bb7, cc13, cc13) + LDF [BO - 4 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO - 3 * SIZE], b6 + + FMADD (aa5, bb8, cc15, cc15) + LDF [BO - 2 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO - 1 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 8 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 1 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 2 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 3 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 8 * SIZE], a5 /****/ + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 1 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + FMADD (aa4, bb6, cc12, cc12) + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 4 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 5 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 6 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + ble,pn %icc, .LL15 + LDF [BO + 7 * SIZE], b8 + + FMADD (aa1, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa1, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 16 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 9 * SIZE], b2 + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + LDF [AO + 2 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 3 * SIZE], a4 + + FMADD (aa1, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + FMADD (aa2, bb6, cc12, cc12) + nop + + FMADD (aa1, bb7, cc13, cc13) + LDF [BO + 12 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa1, bb8, cc15, cc15) + LDF [BO + 14 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO + 15 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, 
cc05, cc05) + LDF [BO + 24 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 17 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 18 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 19 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 4 * SIZE], a1 + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 5 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + add L, -1, L + FMADD (aa4, bb6, cc12, cc12) + nop + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 20 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 21 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 22 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + LDF [BO + 23 * SIZE], b8 + + FMADD (aa1, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa1, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 32 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 25 * SIZE], b2 + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 26 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 27 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + LDF [AO + 6 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 7 * SIZE], a4 + + FMADD (aa1, bb6, cc11, cc11) + nop + FMADD (aa2, bb6, cc12, cc12) + nop + + FMADD (aa1, bb7, cc13, cc13) + LDF [BO + 28 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO + 29 * SIZE], b6 + + FMADD (aa1, bb8, cc15, cc15) + LDF [BO + 30 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO + 31 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 40 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 33 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 34 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 35 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 16 * SIZE], a1 /****/ + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 9 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + nop + FMADD (aa4, bb6, cc12, cc12) + nop + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 36 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 37 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 38 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + LDF [BO + 39 * SIZE], b8 + + FMADD (aa5, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa5, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa5, bb3, cc05, cc05) + LDF [BO + 48 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 41 * SIZE], b2 + + FMADD (aa5, bb4, cc07, cc07) + LDF [BO + 42 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 43 * SIZE], b4 + + FMADD (aa5, bb5, cc09, cc09) + LDF [AO + 10 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 11 * SIZE], a4 + + FMADD (aa5, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY + FMADD (aa2, bb6, cc12, cc12) + nop + + FMADD (aa5, bb7, cc13, cc13) + LDF [BO + 44 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO + 45 * SIZE], b6 + + FMADD (aa5, bb8, cc15, cc15) + LDF [BO + 46 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO + 47 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 56 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 49 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 50 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 51 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 12 * SIZE], a5 + FMADD (aa4, bb5, cc10, cc10) + LDF [AO 
+ 13 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + cmp L, 0 + FMADD (aa4, bb6, cc12, cc12) + nop + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 52 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 53 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 54 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + LDF [BO + 55 * SIZE], b8 + + FMADD (aa5, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa5, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa5, bb3, cc05, cc05) + LDF [BO + 64 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 57 * SIZE], b2 + + FMADD (aa5, bb4, cc07, cc07) + LDF [BO + 58 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 59 * SIZE], b4 + + FMADD (aa5, bb5, cc09, cc09) + LDF [AO + 14 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 15 * SIZE], a4 + + FMADD (aa5, bb6, cc11, cc11) + add BO, 64 * SIZE, BO + FMADD (aa2, bb6, cc12, cc12) + add AO, 16 * SIZE, AO + + FMADD (aa5, bb7, cc13, cc13) + LDF [BO - 4 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO - 3 * SIZE], b6 + + FMADD (aa5, bb8, cc15, cc15) + LDF [BO - 2 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO - 1 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 8 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 1 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 2 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 3 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 8 * SIZE], a5 /****/ + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 1 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + FMADD (aa4, bb6, cc12, cc12) + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 4 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 5 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 6 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + bg,pt %icc, .LL13 + LDF [BO + 7 * SIZE], b8 + .align 4 + +.LL15: +#if defined(LT) || defined(RN) + and KK, 7, L +#else + sub K, KK, L + and L, 7, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL18 + nop + .align 4 + +.LL17: + FMADD (aa1, bb1, cc01, cc01) + add L, -1, L + FMADD (aa2, bb1, cc02, cc02) + nop + + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 8 * SIZE], b1 + FMADD (aa2, bb2, cc04, cc04) + LDF [BO + 9 * SIZE], b2 + + FMADD (aa1, bb3, cc05, cc05) + cmp L, 0 + FMADD (aa2, bb3, cc06, cc06) + nop + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + nop + FMADD (aa2, bb5, cc10, cc10) + nop + + FMADD (aa1, bb6, cc11, cc11) + LDF [BO + 12 * SIZE], b5 + FMADD (aa2, bb6, cc12, cc12) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa1, bb7, cc13, cc13) + add AO, 2 * SIZE, AO + FMADD (aa2, bb7, cc14, cc14) + add BO, 8 * SIZE, BO + + FMADD (aa1, bb8, cc15, cc15) + LDF [AO + 0 * SIZE], a1 + FMADD (aa2, bb8, cc16, cc16) + LDF [AO + 1 * SIZE], a2 + + LDF [BO + 6 * SIZE], b7 + bg,pt %icc, .LL17 + LDF [BO + 7 * SIZE], b8 + nop + .align 4 + +.LL18: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 2, TEMP1 +#else + sub KK, 8, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 3, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 + 
FSUB a3, c05, c05 + FSUB a4, c07, c07 + + FSUB b1, c09, c09 + FSUB b2, c11, c11 + FSUB b3, c13, c13 + FSUB b4, c15, c15 + + LDF [BO + 8 * SIZE], a1 + LDF [BO + 9 * SIZE], a2 + LDF [BO + 10 * SIZE], a3 + LDF [BO + 11 * SIZE], a4 + + LDF [BO + 12 * SIZE], b1 + LDF [BO + 13 * SIZE], b2 + LDF [BO + 14 * SIZE], b3 + LDF [BO + 15 * SIZE], b4 + + FSUB a1, c02, c02 + FSUB a2, c04, c04 + FSUB a3, c06, c06 + FSUB a4, c08, c08 + + FSUB b1, c10, c10 + FSUB b2, c12, c12 + FSUB b3, c14, c14 + FSUB b4, c16, c16 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [AO + 4 * SIZE], b1 + LDF [AO + 5 * SIZE], b2 + LDF [AO + 6 * SIZE], b3 + LDF [AO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 + + FSUB b1, c05, c05 + FSUB b2, c06, c06 + FSUB b3, c07, c07 + FSUB b4, c08, c08 + + LDF [AO + 8 * SIZE], a1 + LDF [AO + 9 * SIZE], a2 + LDF [AO + 10 * SIZE], a3 + LDF [AO + 11 * SIZE], a4 + + LDF [AO + 12 * SIZE], b1 + LDF [AO + 13 * SIZE], b2 + LDF [AO + 14 * SIZE], b3 + LDF [AO + 15 * SIZE], b4 + + FSUB a1, c09, c09 + FSUB a2, c10, c10 + FSUB a3, c11, c11 + FSUB a4, c12, c12 + + FSUB b1, c13, c13 + FSUB b2, c14, c14 + FSUB b3, c15, c15 + FSUB b4, c16, c16 +#endif + +#ifdef LN + LDF [AO + 3 * SIZE], a1 + LDF [AO + 2 * SIZE], a2 + LDF [AO + 0 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a1, c04, c04 + FMUL a1, c06, c06 + FMUL a1, c08, c08 + FMUL a1, c10, c10 + FMUL a1, c12, c12 + FMUL a1, c14, c14 + FMUL a1, c16, c16 + + FNMSUB (aa2, cc02, cc01, cc01) + FNMSUB (aa2, cc04, cc03, cc03) + FNMSUB (aa2, cc06, cc05, cc05) + FNMSUB (aa2, cc08, cc07, cc07) + FNMSUB (aa2, cc10, cc09, cc09) + FNMSUB (aa2, cc12, cc11, cc11) + FNMSUB (aa2, cc14, cc13, cc13) + FNMSUB (aa2, cc16, cc15, cc15) + + FMUL a3, c01, c01 + FMUL a3, c03, c03 + FMUL a3, c05, c05 + FMUL a3, c07, c07 + FMUL a3, c09, c09 + FMUL a3, c11, c11 + FMUL a3, c13, c13 + FMUL a3, c15, c15 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + FMUL a1, c03, c03 + FMUL a1, c05, c05 + FMUL a1, c07, c07 + FMUL a1, c09, c09 + FMUL a1, c11, c11 + FMUL a1, c13, c13 + FMUL a1, c15, c15 + + FNMSUB (aa2, cc01, cc02, cc02) + FNMSUB (aa2, cc03, cc04, cc04) + FNMSUB (aa2, cc05, cc06, cc06) + FNMSUB (aa2, cc07, cc08, cc08) + FNMSUB (aa2, cc09, cc10, cc10) + FNMSUB (aa2, cc11, cc12, cc12) + FNMSUB (aa2, cc13, cc14, cc14) + FNMSUB (aa2, cc15, cc16, cc16) + + FMUL a3, c02, c02 + FMUL a3, c04, c04 + FMUL a3, c06, c06 + FMUL a3, c08, c08 + FMUL a3, c10, c10 + FMUL a3, c12, c12 + FMUL a3, c14, c14 + FMUL a3, c16, c16 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + + FNMSUB (aa2, cc01, cc03, cc03) + FNMSUB (aa2, cc02, cc04, cc04) + FNMSUB (aa3, cc01, cc05, cc05) + FNMSUB (aa3, cc02, cc06, cc06) + FNMSUB (aa4, cc01, cc07, cc07) + FNMSUB (aa4, cc02, cc08, cc08) + FNMSUB (bb1, cc01, cc09, cc09) + FNMSUB (bb1, cc02, cc10, cc10) + FNMSUB (bb2, cc01, cc11, cc11) + FNMSUB (bb2, cc02, cc12, cc12) + FNMSUB (bb3, cc01, cc13, cc13) + FNMSUB (bb3, cc02, cc14, cc14) + FNMSUB (bb4, cc01, cc15, cc15) + FNMSUB (bb4, cc02, cc16, cc16) + + LDF [BO + 9 * SIZE], a1 + LDF [BO + 10 * SIZE], a2 + LDF [BO + 11 * SIZE], a3 + LDF [BO + 12 * SIZE], a4 + LDF [BO + 13 * SIZE], b1 + LDF [BO + 14 * SIZE], b2 + LDF [BO + 
15 * SIZE], b3 + + FMUL a1, c03, c03 + FMUL a1, c04, c04 + + FNMSUB (aa2, cc03, cc05, cc05) + FNMSUB (aa2, cc04, cc06, cc06) + FNMSUB (aa3, cc03, cc07, cc07) + FNMSUB (aa3, cc04, cc08, cc08) + FNMSUB (aa4, cc03, cc09, cc09) + FNMSUB (aa4, cc04, cc10, cc10) + FNMSUB (bb1, cc03, cc11, cc11) + FNMSUB (bb1, cc04, cc12, cc12) + FNMSUB (bb2, cc03, cc13, cc13) + FNMSUB (bb2, cc04, cc14, cc14) + FNMSUB (bb3, cc03, cc15, cc15) + FNMSUB (bb3, cc04, cc16, cc16) + + LDF [BO + 18 * SIZE], a1 + LDF [BO + 19 * SIZE], a2 + LDF [BO + 20 * SIZE], a3 + LDF [BO + 21 * SIZE], a4 + LDF [BO + 22 * SIZE], b1 + LDF [BO + 23 * SIZE], b2 + + FMUL a1, c05, c05 + FMUL a1, c06, c06 + + FNMSUB (aa2, cc05, cc07, cc07) + FNMSUB (aa2, cc06, cc08, cc08) + FNMSUB (aa3, cc05, cc09, cc09) + FNMSUB (aa3, cc06, cc10, cc10) + FNMSUB (aa4, cc05, cc11, cc11) + FNMSUB (aa4, cc06, cc12, cc12) + FNMSUB (bb1, cc05, cc13, cc13) + FNMSUB (bb1, cc06, cc14, cc14) + FNMSUB (bb2, cc05, cc15, cc15) + FNMSUB (bb2, cc06, cc16, cc16) + + LDF [BO + 27 * SIZE], a1 + LDF [BO + 28 * SIZE], a2 + LDF [BO + 29 * SIZE], a3 + LDF [BO + 30 * SIZE], a4 + LDF [BO + 31 * SIZE], b1 + + FMUL a1, c07, c07 + FMUL a1, c08, c08 + + FNMSUB (aa2, cc07, cc09, cc09) + FNMSUB (aa2, cc08, cc10, cc10) + FNMSUB (aa3, cc07, cc11, cc11) + FNMSUB (aa3, cc08, cc12, cc12) + FNMSUB (aa4, cc07, cc13, cc13) + FNMSUB (aa4, cc08, cc14, cc14) + FNMSUB (bb1, cc07, cc15, cc15) + FNMSUB (bb1, cc08, cc16, cc16) + + LDF [BO + 36 * SIZE], a1 + LDF [BO + 37 * SIZE], a2 + LDF [BO + 38 * SIZE], a3 + LDF [BO + 39 * SIZE], a4 + + FMUL a1, c09, c09 + FMUL a1, c10, c10 + + FNMSUB (aa2, cc09, cc11, cc11) + FNMSUB (aa2, cc10, cc12, cc12) + FNMSUB (aa3, cc09, cc13, cc13) + FNMSUB (aa3, cc10, cc14, cc14) + FNMSUB (aa4, cc09, cc15, cc15) + FNMSUB (aa4, cc10, cc16, cc16) + + LDF [BO + 45 * SIZE], a1 + LDF [BO + 46 * SIZE], a2 + LDF [BO + 47 * SIZE], a3 + + FMUL a1, c11, c11 + FMUL a1, c12, c12 + + FNMSUB (aa2, cc11, cc13, cc13) + FNMSUB (aa2, cc12, cc14, cc14) + FNMSUB (aa3, cc11, cc15, cc15) + FNMSUB (aa3, cc12, cc16, cc16) + + LDF [BO + 54 * SIZE], a1 + LDF [BO + 55 * SIZE], a2 + + FMUL a1, c13, c13 + FMUL a1, c14, c14 + + FNMSUB (aa2, cc13, cc15, cc15) + FNMSUB (aa2, cc14, cc16, cc16) + + LDF [BO + 63 * SIZE], a1 + + FMUL a1, c15, c15 + FMUL a1, c16, c16 +#endif + +#ifdef RT + LDF [BO + 63 * SIZE], a1 + LDF [BO + 62 * SIZE], a2 + LDF [BO + 61 * SIZE], a3 + LDF [BO + 60 * SIZE], a4 + LDF [BO + 59 * SIZE], b1 + LDF [BO + 58 * SIZE], b2 + LDF [BO + 57 * SIZE], b3 + LDF [BO + 56 * SIZE], b4 + + FMUL a1, c16, c16 + FMUL a1, c15, c15 + + FNMSUB (aa2, cc16, cc14, cc14) + FNMSUB (aa2, cc15, cc13, cc13) + FNMSUB (aa3, cc16, cc12, cc12) + FNMSUB (aa3, cc15, cc11, cc11) + FNMSUB (aa4, cc16, cc10, cc10) + FNMSUB (aa4, cc15, cc09, cc09) + FNMSUB (bb1, cc16, cc08, cc08) + FNMSUB (bb1, cc15, cc07, cc07) + FNMSUB (bb2, cc16, cc06, cc06) + FNMSUB (bb2, cc15, cc05, cc05) + FNMSUB (bb3, cc16, cc04, cc04) + FNMSUB (bb3, cc15, cc03, cc03) + FNMSUB (bb4, cc16, cc02, cc02) + FNMSUB (bb4, cc15, cc01, cc01) + + LDF [BO + 54 * SIZE], a1 + LDF [BO + 53 * SIZE], a2 + LDF [BO + 52 * SIZE], a3 + LDF [BO + 51 * SIZE], a4 + LDF [BO + 50 * SIZE], b1 + LDF [BO + 49 * SIZE], b2 + LDF [BO + 48 * SIZE], b3 + + FMUL a1, c14, c14 + FMUL a1, c13, c13 + + FNMSUB (aa2, cc14, cc12, cc12) + FNMSUB (aa2, cc13, cc11, cc11) + FNMSUB (aa3, cc14, cc10, cc10) + FNMSUB (aa3, cc13, cc09, cc09) + FNMSUB (aa4, cc14, cc08, cc08) + FNMSUB (aa4, cc13, cc07, cc07) + FNMSUB (bb1, cc14, cc06, cc06) + FNMSUB (bb1, cc13, cc05, cc05) + FNMSUB (bb2, cc14, cc04, 
cc04) + FNMSUB (bb2, cc13, cc03, cc03) + FNMSUB (bb3, cc14, cc02, cc02) + FNMSUB (bb3, cc13, cc01, cc01) + + LDF [BO + 45 * SIZE], a1 + LDF [BO + 44 * SIZE], a2 + LDF [BO + 43 * SIZE], a3 + LDF [BO + 42 * SIZE], a4 + LDF [BO + 41 * SIZE], b1 + LDF [BO + 40 * SIZE], b2 + + FMUL a1, c12, c12 + FMUL a1, c11, c11 + + FNMSUB (aa2, cc12, cc10, cc10) + FNMSUB (aa2, cc11, cc09, cc09) + FNMSUB (aa3, cc12, cc08, cc08) + FNMSUB (aa3, cc11, cc07, cc07) + FNMSUB (aa4, cc12, cc06, cc06) + FNMSUB (aa4, cc11, cc05, cc05) + FNMSUB (bb1, cc12, cc04, cc04) + FNMSUB (bb1, cc11, cc03, cc03) + FNMSUB (bb2, cc12, cc02, cc02) + FNMSUB (bb2, cc11, cc01, cc01) + + LDF [BO + 36 * SIZE], a1 + LDF [BO + 35 * SIZE], a2 + LDF [BO + 34 * SIZE], a3 + LDF [BO + 33 * SIZE], a4 + LDF [BO + 32 * SIZE], b1 + + FMUL a1, c10, c10 + FMUL a1, c09, c09 + + FNMSUB (aa2, cc10, cc08, cc08) + FNMSUB (aa2, cc09, cc07, cc07) + FNMSUB (aa3, cc10, cc06, cc06) + FNMSUB (aa3, cc09, cc05, cc05) + FNMSUB (aa4, cc10, cc04, cc04) + FNMSUB (aa4, cc09, cc03, cc03) + FNMSUB (bb1, cc10, cc02, cc02) + FNMSUB (bb1, cc09, cc01, cc01) + + LDF [BO + 27 * SIZE], a1 + LDF [BO + 26 * SIZE], a2 + LDF [BO + 25 * SIZE], a3 + LDF [BO + 24 * SIZE], a4 + + FMUL a1, c08, c08 + FMUL a1, c07, c07 + + FNMSUB (aa2, cc08, cc06, cc06) + FNMSUB (aa2, cc07, cc05, cc05) + FNMSUB (aa3, cc08, cc04, cc04) + FNMSUB (aa3, cc07, cc03, cc03) + FNMSUB (aa4, cc08, cc02, cc02) + FNMSUB (aa4, cc07, cc01, cc01) + + LDF [BO + 18 * SIZE], a1 + LDF [BO + 17 * SIZE], a2 + LDF [BO + 16 * SIZE], a3 + + FMUL a1, c06, c06 + FMUL a1, c05, c05 + + FNMSUB (aa2, cc06, cc04, cc04) + FNMSUB (aa2, cc05, cc03, cc03) + FNMSUB (aa3, cc06, cc02, cc02) + FNMSUB (aa3, cc05, cc01, cc01) + + LDF [BO + 9 * SIZE], a1 + LDF [BO + 8 * SIZE], a2 + + FMUL a1, c04, c04 + FMUL a1, c03, c03 + + FNMSUB (aa2, cc04, cc02, cc02) + FNMSUB (aa2, cc03, cc01, cc01) + + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c02, c02 + FMUL a1, c01, c01 +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 + add C2, -2 * SIZE, C2 + add C3, -2 * SIZE, C3 + add C4, -2 * SIZE, C4 + add C5, -2 * SIZE, C5 + add C6, -2 * SIZE, C6 + add C7, -2 * SIZE, C7 + add C8, -2 * SIZE, C8 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c03, [BO + 1 * SIZE] + STF c05, [BO + 2 * SIZE] + STF c07, [BO + 3 * SIZE] + + STF c09, [BO + 4 * SIZE] + STF c11, [BO + 5 * SIZE] + STF c13, [BO + 6 * SIZE] + STF c15, [BO + 7 * SIZE] + + STF c02, [BO + 8 * SIZE] + STF c04, [BO + 9 * SIZE] + STF c06, [BO + 10 * SIZE] + STF c08, [BO + 11 * SIZE] + + STF c10, [BO + 12 * SIZE] + STF c12, [BO + 13 * SIZE] + STF c14, [BO + 14 * SIZE] + STF c16, [BO + 15 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] + + STF c05, [AO + 4 * SIZE] + STF c06, [AO + 5 * SIZE] + STF c07, [AO + 6 * SIZE] + STF c08, [AO + 7 * SIZE] + + STF c09, [AO + 8 * SIZE] + STF c10, [AO + 9 * SIZE] + STF c11, [AO + 10 * SIZE] + STF c12, [AO + 11 * SIZE] + + STF c13, [AO + 12 * SIZE] + STF c14, [AO + 13 * SIZE] + STF c15, [AO + 14 * SIZE] + STF c16, [AO + 15 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C2 + 0 * SIZE] + STF c04, [C2 + 1 * SIZE] + + STF c05, [C3 + 0 * SIZE] + STF c06, [C3 + 1 * SIZE] + STF c07, [C4 + 0 * SIZE] + STF c08, [C4 + 1 * SIZE] + + STF c09, [C5 + 0 * SIZE] + STF c10, [C5 + 1 * SIZE] + STF c11, [C6 + 0 * SIZE] + STF c12, [C6 + 1 * SIZE] + + STF c13, [C7 + 0 * SIZE] + STF c14, [C7 + 1 * SIZE] + STF c15, [C8 + 0 * SIZE] + STF c16, [C8 + 1 * SIZE] + +#ifndef LN + add C1, 
2 * SIZE, C1 + add C2, 2 * SIZE, C2 + add C3, 2 * SIZE, C3 + add C4, 2 * SIZE, C4 + add C5, 2 * SIZE, C5 + add C6, 2 * SIZE, C6 + add C7, 2 * SIZE, C7 + add C8, 2 * SIZE, C8 +#endif + +#ifdef RT + sll K, BASE_SHIFT + 1, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 3, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 2, KK +#endif + +#ifdef LN + sub KK, 2, KK +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL12 + nop + .align 4 + +.LL20: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL29 + nop + +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TEMP1 + sll KK, BASE_SHIFT + 3, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [BO + 0 * SIZE], b1 + FCLR (cc01) + LDF [BO + 1 * SIZE], b2 + FCLR (cc03) + LDF [BO + 2 * SIZE], b3 + FCLR (cc05) + LDF [BO + 3 * SIZE], b4 + FCLR (cc07) + LDF [BO + 4 * SIZE], b5 + FCLR (cc09) + LDF [BO + 5 * SIZE], b6 + FCLR (cc11) + LDF [BO + 6 * SIZE], b7 + FCLR (cc13) + LDF [BO + 7 * SIZE], b8 + FCLR (cc15) + +#if defined(LT) || defined(RN) + sra KK, 2, L +#else + sub K, KK, L + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL25 + LDF [BO + 8 * SIZE], b9 + .align 4 + +.LL23: + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + add L, -1, L + + FMADD (aa1, bb1, cc01, cc01) + LDF [BO + 16 * SIZE], b1 + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 9 * SIZE], b2 + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 10 * SIZE], b3 + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 11 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + LDF [BO + 12 * SIZE], b5 + FMADD (aa1, bb6, cc11, cc11) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa1, bb7, cc13, cc13) + LDF [BO + 14 * SIZE], b7 + FMADD (aa1, bb8, cc15, cc15) + LDF [BO + 15 * SIZE], b8 + + FMADD (aa2, bb9, cc01, cc01) + LDF [BO + 24 * SIZE], b9 + FMADD (aa2, bb2, cc03, cc03) + LDF [BO + 17 * SIZE], b2 + + FMADD (aa2, bb3, cc05, cc05) + LDF [BO + 18 * SIZE], b3 + FMADD (aa2, bb4, cc07, cc07) + LDF [BO + 19 * SIZE], b4 + + FMADD (aa2, bb5, cc09, cc09) + LDF [BO + 20 * SIZE], b5 + FMADD (aa2, bb6, cc11, cc11) + LDF [BO + 21 * SIZE], b6 + + FMADD (aa2, bb7, cc13, cc13) + LDF [BO + 22 * SIZE], b7 + FMADD (aa2, bb8, cc15, cc15) + LDF [BO + 23 * SIZE], b8 + + LDF [AO + 4 * SIZE], a1 + LDF [AO + 5 * SIZE], a2 + + FMADD (aa3, bb1, cc01, cc01) + LDF [BO + 32 * SIZE], b1 + FMADD (aa3, bb2, cc03, cc03) + LDF [BO + 25 * SIZE], b2 + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 26 * SIZE], b3 + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 27 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [BO + 28 * SIZE], b5 + FMADD (aa3, bb6, cc11, cc11) + LDF [BO + 29 * SIZE], b6 + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 30 * SIZE], b7 + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 31 * SIZE], b8 + + FMADD (aa4, bb9, cc01, cc01) + LDF [BO + 40 * SIZE], b9 + FMADD (aa4, bb2, cc03, cc03) + LDF [BO + 33 * SIZE], b2 + + FMADD (aa4, bb3, cc05, cc05) + LDF [BO + 34 * SIZE], b3 + FMADD (aa4, bb4, cc07, cc07) + LDF [BO + 35 * SIZE], b4 + + FMADD (aa4, bb5, cc09, cc09) + LDF [BO + 36 * SIZE], b5 + FMADD (aa4, bb6, cc11, cc11) + LDF [BO + 37 * SIZE], b6 + + FMADD (aa4, bb7, cc13, cc13) + LDF [BO + 38 * SIZE], b7 + FMADD (aa4, bb8, cc15, cc15) + LDF [BO + 39 * SIZE], b8 + + LDF [AO + 6 * SIZE], a3 + LDF [AO + 7 * SIZE], a4 
+ + add AO, 4 * SIZE, AO + cmp L, 0 + bg,pt %icc, .LL23 + add BO, 32 * SIZE, BO + .align 4 + +.LL25: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + sub K, KK, L + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL28 + nop + .align 4 + +.LL27: + FMADD (aa1, bb1, cc01, cc01) + LDF [BO + 8 * SIZE], b1 + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 9 * SIZE], b2 + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 10 * SIZE], b3 + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 11 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + LDF [BO + 12 * SIZE], b5 + FMADD (aa1, bb6, cc11, cc11) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa1, bb7, cc13, cc13) + LDF [BO + 14 * SIZE], b7 + FMADD (aa1, bb8, cc15, cc15) + LDF [BO + 15 * SIZE], b8 + + LDF [AO + 1 * SIZE], a1 + add AO, 1 * SIZE, AO + + add L, -1, L + cmp L, 0 + bg,pt %icc, .LL27 + add BO, 8 * SIZE, BO + .align 4 + +.LL28: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 1, TEMP1 +#else + sub KK, 8, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 0, TEMP2 + sll TEMP1, BASE_SHIFT + 3, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 + FSUB a3, c05, c05 + FSUB a4, c07, c07 + + FSUB b1, c09, c09 + FSUB b2, c11, c11 + FSUB b3, c13, c13 + FSUB b4, c15, c15 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [AO + 4 * SIZE], b1 + LDF [AO + 5 * SIZE], b2 + LDF [AO + 6 * SIZE], b3 + LDF [AO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 + FSUB a3, c05, c05 + FSUB a4, c07, c07 + + FSUB b1, c09, c09 + FSUB b2, c11, c11 + FSUB b3, c13, c13 + FSUB b4, c15, c15 +#endif + +#if defined(LN) || defined(LT) + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c03, c03 + FMUL a1, c05, c05 + FMUL a1, c07, c07 + FMUL a1, c09, c09 + FMUL a1, c11, c11 + FMUL a1, c13, c13 + FMUL a1, c15, c15 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 + + FMUL a1, c01, c01 + + FNMSUB (aa2, cc01, cc03, cc03) + FNMSUB (aa3, cc01, cc05, cc05) + FNMSUB (aa4, cc01, cc07, cc07) + FNMSUB (bb1, cc01, cc09, cc09) + FNMSUB (bb2, cc01, cc11, cc11) + FNMSUB (bb3, cc01, cc13, cc13) + FNMSUB (bb4, cc01, cc15, cc15) + + LDF [BO + 9 * SIZE], a1 + LDF [BO + 10 * SIZE], a2 + LDF [BO + 11 * SIZE], a3 + LDF [BO + 12 * SIZE], a4 + LDF [BO + 13 * SIZE], b1 + LDF [BO + 14 * SIZE], b2 + LDF [BO + 15 * SIZE], b3 + + FMUL a1, c03, c03 + + FNMSUB (aa2, cc03, cc05, cc05) + FNMSUB (aa3, cc03, cc07, cc07) + FNMSUB (aa4, cc03, cc09, cc09) + FNMSUB (bb1, cc03, cc11, cc11) + FNMSUB (bb2, cc03, cc13, cc13) + FNMSUB (bb3, cc03, cc15, cc15) + + LDF [BO + 18 * SIZE], a1 + LDF [BO + 19 * SIZE], a2 + LDF [BO + 20 * SIZE], a3 + LDF [BO + 21 * SIZE], a4 + LDF [BO + 22 * SIZE], b1 + LDF [BO + 23 * SIZE], b2 + + FMUL a1, c05, c05 + + FNMSUB (aa2, cc05, cc07, cc07) + FNMSUB (aa3, cc05, cc09, cc09) + FNMSUB (aa4, cc05, cc11, cc11) + FNMSUB (bb1, cc05, cc13, cc13) + FNMSUB (bb2, cc05, cc15, cc15) + + LDF [BO + 27 * SIZE], a1 + LDF [BO + 28 * SIZE], a2 + LDF [BO + 29 * SIZE], a3 + LDF [BO + 30 * SIZE], a4 + LDF [BO + 31 * SIZE], b1 + + FMUL a1, c07, c07 + + FNMSUB (aa2, cc07, cc09, cc09) + FNMSUB 
(aa3, cc07, cc11, cc11) + FNMSUB (aa4, cc07, cc13, cc13) + FNMSUB (bb1, cc07, cc15, cc15) + + LDF [BO + 36 * SIZE], a1 + LDF [BO + 37 * SIZE], a2 + LDF [BO + 38 * SIZE], a3 + LDF [BO + 39 * SIZE], a4 + + FMUL a1, c09, c09 + + FNMSUB (aa2, cc09, cc11, cc11) + FNMSUB (aa3, cc09, cc13, cc13) + FNMSUB (aa4, cc09, cc15, cc15) + + LDF [BO + 45 * SIZE], a1 + LDF [BO + 46 * SIZE], a2 + LDF [BO + 47 * SIZE], a3 + + FMUL a1, c11, c11 + + FNMSUB (aa2, cc11, cc13, cc13) + FNMSUB (aa3, cc11, cc15, cc15) + + LDF [BO + 54 * SIZE], a1 + LDF [BO + 55 * SIZE], a2 + + FMUL a1, c13, c13 + + FNMSUB (aa2, cc13, cc15, cc15) + + LDF [BO + 63 * SIZE], a1 + + FMUL a1, c15, c15 +#endif + +#ifdef RT + LDF [BO + 63 * SIZE], a1 + LDF [BO + 62 * SIZE], a2 + LDF [BO + 61 * SIZE], a3 + LDF [BO + 60 * SIZE], a4 + LDF [BO + 59 * SIZE], b1 + LDF [BO + 58 * SIZE], b2 + LDF [BO + 57 * SIZE], b3 + LDF [BO + 56 * SIZE], b4 + + FMUL a1, c15, c15 + + FNMSUB (aa2, cc15, cc13, cc13) + FNMSUB (aa3, cc15, cc11, cc11) + FNMSUB (aa4, cc15, cc09, cc09) + FNMSUB (bb1, cc15, cc07, cc07) + FNMSUB (bb2, cc15, cc05, cc05) + FNMSUB (bb3, cc15, cc03, cc03) + FNMSUB (bb4, cc15, cc01, cc01) + + LDF [BO + 54 * SIZE], a1 + LDF [BO + 53 * SIZE], a2 + LDF [BO + 52 * SIZE], a3 + LDF [BO + 51 * SIZE], a4 + LDF [BO + 50 * SIZE], b1 + LDF [BO + 49 * SIZE], b2 + LDF [BO + 48 * SIZE], b3 + + FMUL a1, c13, c13 + + FNMSUB (aa2, cc13, cc11, cc11) + FNMSUB (aa3, cc13, cc09, cc09) + FNMSUB (aa4, cc13, cc07, cc07) + FNMSUB (bb1, cc13, cc05, cc05) + FNMSUB (bb2, cc13, cc03, cc03) + FNMSUB (bb3, cc13, cc01, cc01) + + LDF [BO + 45 * SIZE], a1 + LDF [BO + 44 * SIZE], a2 + LDF [BO + 43 * SIZE], a3 + LDF [BO + 42 * SIZE], a4 + LDF [BO + 41 * SIZE], b1 + LDF [BO + 40 * SIZE], b2 + + FMUL a1, c11, c11 + + FNMSUB (aa2, cc11, cc09, cc09) + FNMSUB (aa3, cc11, cc07, cc07) + FNMSUB (aa4, cc11, cc05, cc05) + FNMSUB (bb1, cc11, cc03, cc03) + FNMSUB (bb2, cc11, cc01, cc01) + + LDF [BO + 36 * SIZE], a1 + LDF [BO + 35 * SIZE], a2 + LDF [BO + 34 * SIZE], a3 + LDF [BO + 33 * SIZE], a4 + LDF [BO + 32 * SIZE], b1 + + FMUL a1, c09, c09 + + FNMSUB (aa2, cc09, cc07, cc07) + FNMSUB (aa3, cc09, cc05, cc05) + FNMSUB (aa4, cc09, cc03, cc03) + FNMSUB (bb1, cc09, cc01, cc01) + + LDF [BO + 27 * SIZE], a1 + LDF [BO + 26 * SIZE], a2 + LDF [BO + 25 * SIZE], a3 + LDF [BO + 24 * SIZE], a4 + + FMUL a1, c07, c07 + + FNMSUB (aa2, cc07, cc05, cc05) + FNMSUB (aa3, cc07, cc03, cc03) + FNMSUB (aa4, cc07, cc01, cc01) + + LDF [BO + 18 * SIZE], a1 + LDF [BO + 17 * SIZE], a2 + LDF [BO + 16 * SIZE], a3 + + FMUL a1, c05, c05 + + FNMSUB (aa2, cc05, cc03, cc03) + FNMSUB (aa3, cc05, cc01, cc01) + + LDF [BO + 9 * SIZE], a1 + LDF [BO + 8 * SIZE], a2 + + FMUL a1, c03, c03 + + FNMSUB (aa2, cc03, cc01, cc01) + + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 +#endif + +#ifdef LN + add C1, -1 * SIZE, C1 + add C2, -1 * SIZE, C2 + add C3, -1 * SIZE, C3 + add C4, -1 * SIZE, C4 + add C5, -1 * SIZE, C5 + add C6, -1 * SIZE, C6 + add C7, -1 * SIZE, C7 + add C8, -1 * SIZE, C8 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c03, [BO + 1 * SIZE] + STF c05, [BO + 2 * SIZE] + STF c07, [BO + 3 * SIZE] + + STF c09, [BO + 4 * SIZE] + STF c11, [BO + 5 * SIZE] + STF c13, [BO + 6 * SIZE] + STF c15, [BO + 7 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c03, [AO + 1 * SIZE] + STF c05, [AO + 2 * SIZE] + STF c07, [AO + 3 * SIZE] + + STF c09, [AO + 4 * SIZE] + STF c11, [AO + 5 * SIZE] + STF c13, [AO + 6 * SIZE] + STF c15, [AO + 7 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c03, [C2 + 0 * SIZE] + STF c05, [C3 
+ 0 * SIZE] + STF c07, [C4 + 0 * SIZE] + + STF c09, [C5 + 0 * SIZE] + STF c11, [C6 + 0 * SIZE] + STF c13, [C7 + 0 * SIZE] + STF c15, [C8 + 0 * SIZE] + +#ifdef RT + sll K, BASE_SHIFT + 0, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, BASE_SHIFT + 0, TEMP2 + sll TEMP1, BASE_SHIFT + 3, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + .align 4 + +.LL29: +#ifdef LN + sll K, BASE_SHIFT + 3, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 8, KK +#endif + +#ifdef RT + sub KK, 8, KK +#endif + + add J, -1, J + cmp J, 0 + bg,pt %icc, .LL11 + nop + .align 4 + +.LL30: + and N, 4, J + cmp J, 0 + ble,pn %icc, .LL50 + nop + +#ifdef RT + sll K, BASE_SHIFT + 2, TEMP1 + sub B, TEMP1, B +#endif + +#ifndef RT + mov C, C1 + add C, LDC, C2 + add C2, LDC, C3 + add C3, LDC, C4 + add C4, LDC, C +#else + sub C, LDC, C4 + sub C4, LDC, C3 + sub C3, LDC, C2 + sub C2, LDC, C1 + sub C2, LDC, C +#endif + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + sra M, 1, I + cmp I, 0 + ble,pn %icc, .LL40 + nop + .align 4 + +.LL32: +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TEMP1 + sll KK, BASE_SHIFT + 2, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + LDF [BO + 3 * SIZE], b4 + LDF [BO + 4 * SIZE], b5 + + LDF [BO + 5 * SIZE], b6 + FCLR (cc01) + LDF [BO + 6 * SIZE], b7 + FCLR (cc02) + LDF [BO + 7 * SIZE], b8 + FCLR (cc03) + LDF [BO + 8 * SIZE], b9 + FCLR (cc04) + + prefetch [C1 + 2 * SIZE], 3 + FCLR (cc05) + prefetch [C2 + 2 * SIZE], 3 + FCLR (cc06) + prefetch [C3 + 2 * SIZE], 3 + FCLR (cc07) + prefetch [C4 + 2 * SIZE], 3 + FCLR (cc08) + +#if defined(LT) || defined(RN) + sra KK, 2, L +#else + sub K, KK, L + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL35 + nop + .align 4 + +.LL33: + FMADD (aa1, bb1, cc01, cc01) + LDF [AO + 2 * SIZE], a3 + FMADD (aa2, bb1, cc02, cc02) + LDF [AO + 3 * SIZE], a4 + + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 16 * SIZE], b1 + FMADD (aa2, bb2, cc04, cc04) + LDF [BO + 9 * SIZE], b2 + + FMADD (aa1, bb3, cc05, cc05) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + FMADD (aa2, bb3, cc06, cc06) + add L, -1, L + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD (aa3, bb5, cc01, cc01) + LDF [AO + 4 * SIZE], a1 + FMADD (aa4, bb5, cc02, cc02) + LDF [AO + 5 * SIZE], a2 + + FMADD (aa3, bb6, cc03, cc03) + LDF [BO + 12 * SIZE], b5 + FMADD (aa4, bb6, cc04, cc04) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa3, bb7, cc05, cc05) + cmp L, 0 + FMADD (aa4, bb7, cc06, cc06) + add AO, 8 * SIZE, AO + + FMADD (aa3, bb8, cc07, cc07) + LDF [BO + 14 * SIZE], b7 + FMADD (aa4, bb8, cc08, cc08) + LDF [BO + 15 * SIZE], b8 + + FMADD (aa1, bb9, cc01, cc01) + LDF [AO - 2 * SIZE], a3 + FMADD (aa2, bb9, cc02, cc02) + LDF [AO - 1 * SIZE], a4 + + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 24 * SIZE], b9 + FMADD (aa2, bb2, cc04, cc04) + LDF [BO + 17 * SIZE], b2 + + FMADD (aa1, bb3, cc05, cc05) + add BO, 16 * SIZE, BO + FMADD (aa2, bb3, cc06, cc06) + nop + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 
2 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 3 * SIZE], b4 + + FMADD (aa3, bb5, cc01, cc01) + LDF [AO + 0 * SIZE], a1 + FMADD (aa4, bb5, cc02, cc02) + LDF [AO + 1 * SIZE], a2 + FMADD (aa3, bb6, cc03, cc03) + LDF [BO + 4 * SIZE], b5 + FMADD (aa4, bb6, cc04, cc04) + LDF [BO + 5 * SIZE], b6 + + FMADD (aa3, bb7, cc05, cc05) + nop + FMADD (aa4, bb7, cc06, cc06) + LDF [BO + 6 * SIZE], b7 + + FMADD (aa3, bb8, cc07, cc07) + FMADD (aa4, bb8, cc08, cc08) + bg,pt %icc, .LL33 + LDF [BO + 7 * SIZE], b8 + .align 4 + +.LL35: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + sub K, KK, L + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL38 + nop + .align 4 + +.LL37: + FMADD (aa1, bb1, cc01, cc01) + add L, -1, L + FMADD (aa2, bb1, cc02, cc02) + LDF [BO + 4 * SIZE], b1 + + FMADD (aa1, bb2, cc03, cc03) + add AO, 2 * SIZE, AO + FMADD (aa2, bb2, cc04, cc04) + LDF [BO + 5 * SIZE], b2 + + FMADD (aa1, bb3, cc05, cc05) + cmp L, 0 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 6 * SIZE], b3 + + FMADD (aa1, bb4, cc07, cc07) + LDF [AO + 0 * SIZE], a1 + FMADD (aa2, bb4, cc08, cc08) + LDF [AO + 1 * SIZE], a2 + + LDF [BO + 7 * SIZE], b4 + bg,pt %icc, .LL37 + add BO, 4 * SIZE, BO + .align 4 + +.LL38: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 2, TEMP1 +#else + sub KK, 4, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 2, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 + FSUB a3, c05, c05 + FSUB a4, c07, c07 + + FSUB b1, c02, c02 + FSUB b2, c04, c04 + FSUB b3, c06, c06 + FSUB b4, c08, c08 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [AO + 4 * SIZE], b1 + LDF [AO + 5 * SIZE], b2 + LDF [AO + 6 * SIZE], b3 + LDF [AO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 + + FSUB b1, c05, c05 + FSUB b2, c06, c06 + FSUB b3, c07, c07 + FSUB b4, c08, c08 + +#endif + +#ifdef LN + LDF [AO + 3 * SIZE], a1 + LDF [AO + 2 * SIZE], a2 + LDF [AO + 0 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a1, c04, c04 + FMUL a1, c06, c06 + FMUL a1, c08, c08 + + FNMSUB (aa2, cc02, cc01, cc01) + FNMSUB (aa2, cc04, cc03, cc03) + FNMSUB (aa2, cc06, cc05, cc05) + FNMSUB (aa2, cc08, cc07, cc07) + + FMUL a3, c01, c01 + FMUL a3, c03, c03 + FMUL a3, c05, c05 + FMUL a3, c07, c07 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + FMUL a1, c03, c03 + FMUL a1, c05, c05 + FMUL a1, c07, c07 + + FNMSUB (aa2, cc01, cc02, cc02) + FNMSUB (aa2, cc03, cc04, cc04) + FNMSUB (aa2, cc05, cc06, cc06) + FNMSUB (aa2, cc07, cc08, cc08) + + FMUL a3, c02, c02 + FMUL a3, c04, c04 + FMUL a3, c06, c06 + FMUL a3, c08, c08 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + + FNMSUB (aa2, cc01, cc03, cc03) + FNMSUB (aa2, cc02, cc04, cc04) + FNMSUB (aa3, cc01, cc05, cc05) + FNMSUB (aa3, cc02, cc06, cc06) + FNMSUB (aa4, cc01, cc07, cc07) + FNMSUB (aa4, cc02, cc08, cc08) + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 6 * SIZE], a2 + LDF [BO + 7 * SIZE], a3 + + FMUL a1, c03, c03 + FMUL a1, c04, c04 + + FNMSUB (aa2, cc03, cc05, cc05) + FNMSUB (aa2, cc04, 
cc06, cc06) + FNMSUB (aa3, cc03, cc07, cc07) + FNMSUB (aa3, cc04, cc08, cc08) + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 11 * SIZE], a2 + + FMUL a1, c05, c05 + FMUL a1, c06, c06 + + FNMSUB (aa2, cc05, cc07, cc07) + FNMSUB (aa2, cc06, cc08, cc08) + + LDF [BO + 15 * SIZE], a1 + + FMUL a1, c07, c07 + FMUL a1, c08, c08 +#endif + +#ifdef RT + LDF [BO + 15 * SIZE], a1 + LDF [BO + 14 * SIZE], a2 + LDF [BO + 13 * SIZE], a3 + LDF [BO + 12 * SIZE], a4 + + FMUL a1, c08, c08 + FMUL a1, c07, c07 + + FNMSUB (aa2, cc08, cc06, cc06) + FNMSUB (aa2, cc07, cc05, cc05) + FNMSUB (aa3, cc08, cc04, cc04) + FNMSUB (aa3, cc07, cc03, cc03) + FNMSUB (aa4, cc08, cc02, cc02) + FNMSUB (aa4, cc07, cc01, cc01) + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 9 * SIZE], a2 + LDF [BO + 8 * SIZE], a3 + + FMUL a1, c06, c06 + FMUL a1, c05, c05 + + FNMSUB (aa2, cc06, cc04, cc04) + FNMSUB (aa2, cc05, cc03, cc03) + FNMSUB (aa3, cc06, cc02, cc02) + FNMSUB (aa3, cc05, cc01, cc01) + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 4 * SIZE], a2 + + FMUL a1, c04, c04 + FMUL a1, c03, c03 + + FNMSUB (aa2, cc04, cc02, cc02) + FNMSUB (aa2, cc03, cc01, cc01) + + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c02, c02 + FMUL a1, c01, c01 +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 + add C2, -2 * SIZE, C2 + add C3, -2 * SIZE, C3 + add C4, -2 * SIZE, C4 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c03, [BO + 1 * SIZE] + STF c05, [BO + 2 * SIZE] + STF c07, [BO + 3 * SIZE] + + STF c02, [BO + 4 * SIZE] + STF c04, [BO + 5 * SIZE] + STF c06, [BO + 6 * SIZE] + STF c08, [BO + 7 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] + + STF c05, [AO + 4 * SIZE] + STF c06, [AO + 5 * SIZE] + STF c07, [AO + 6 * SIZE] + STF c08, [AO + 7 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C2 + 0 * SIZE] + STF c04, [C2 + 1 * SIZE] + + STF c05, [C3 + 0 * SIZE] + STF c06, [C3 + 1 * SIZE] + STF c07, [C4 + 0 * SIZE] + STF c08, [C4 + 1 * SIZE] + +#ifndef LN + add C1, 2 * SIZE, C1 + add C2, 2 * SIZE, C2 + add C3, 2 * SIZE, C3 + add C4, 2 * SIZE, C4 +#endif + +#ifdef RT + sll K, BASE_SHIFT + 1, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 2, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 2, KK +#endif + +#ifdef LN + sub KK, 2, KK +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL32 + nop + +.LL40: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL49 + nop + +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TEMP1 + sll KK, BASE_SHIFT + 2, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + LDF [BO + 3 * SIZE], b4 + LDF [BO + 4 * SIZE], b5 + LDF [BO + 5 * SIZE], b6 + FCLR (cc01) + LDF [BO + 6 * SIZE], b7 + FCLR (cc03) + LDF [BO + 7 * SIZE], b8 + FCLR (cc05) + LDF [BO + 8 * SIZE], b9 + FCLR (cc07) + +#if defined(LT) || defined(RN) + sra KK, 2, L +#else + sub K, KK, L + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL45 + nop + +.LL43: + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + add L, -1, L + + FMADD (aa1, bb1, cc01, cc01) + LDF [BO + 16 * SIZE], b1 + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 9 * SIZE], b2 + FMADD 
(aa1, bb3, cc05, cc05) + LDF [BO + 10 * SIZE], b3 + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 11 * SIZE], b4 + + LDF [AO + 4 * SIZE], a1 + cmp L, 0 + + FMADD (aa2, bb5, cc01, cc01) + LDF [BO + 12 * SIZE], b5 + FMADD (aa2, bb6, cc03, cc03) + LDF [BO + 13 * SIZE], b6 + FMADD (aa2, bb7, cc05, cc05) + LDF [BO + 14 * SIZE], b7 + FMADD (aa2, bb8, cc07, cc07) + LDF [BO + 15 * SIZE], b8 + + LDF [AO + 5 * SIZE], a2 + add AO, 4 * SIZE, AO + + FMADD (aa3, bb9, cc01, cc01) + LDF [BO + 24 * SIZE], b9 + FMADD (aa3, bb2, cc03, cc03) + LDF [BO + 17 * SIZE], b2 + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 18 * SIZE], b3 + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 19 * SIZE], b4 + + LDF [AO + 2 * SIZE], a3 + add BO, 16 * SIZE, BO + + FMADD (aa4, bb5, cc01, cc01) + LDF [BO + 4 * SIZE], b5 + FMADD (aa4, bb6, cc03, cc03) + LDF [BO + 5 * SIZE], b6 + FMADD (aa4, bb7, cc05, cc05) + LDF [BO + 6 * SIZE], b7 + FMADD (aa4, bb8, cc07, cc07) + LDF [BO + 7 * SIZE], b8 + + bg,pt %icc, .LL43 + LDF [AO + 3 * SIZE], a4 + .align 4 + +.LL45: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + sub K, KK, L + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL48 + nop + .align 4 + +.LL47: + FMADD (aa1, bb1, cc01, cc01) + LDF [BO + 4 * SIZE], b1 + add L, -1, L + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 5 * SIZE], b2 + add AO, 1 * SIZE, AO + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 6 * SIZE], b3 + cmp L, 0 + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 7 * SIZE], b4 + add BO, 4 * SIZE, BO + + bg,pt %icc, .LL47 + LDF [AO + 0 * SIZE], a1 + .align 4 + +.LL48: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 1, TEMP1 +#else + sub KK, 4, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 0, TEMP2 + sll TEMP1, BASE_SHIFT + 2, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 + FSUB a3, c05, c05 + FSUB a4, c07, c07 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 + FSUB a3, c05, c05 + FSUB a4, c07, c07 +#endif + +#if defined(LN) || defined(LT) + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c03, c03 + FMUL a1, c05, c05 + FMUL a1, c07, c07 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FMUL a1, c01, c01 + + FNMSUB (aa2, cc01, cc03, cc03) + FNMSUB (aa3, cc01, cc05, cc05) + FNMSUB (aa4, cc01, cc07, cc07) + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 6 * SIZE], a2 + LDF [BO + 7 * SIZE], a3 + + FMUL a1, c03, c03 + + FNMSUB (aa2, cc03, cc05, cc05) + FNMSUB (aa3, cc03, cc07, cc07) + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 11 * SIZE], a2 + + FMUL a1, c05, c05 + + FNMSUB (aa2, cc05, cc07, cc07) + + LDF [BO + 15 * SIZE], a1 + + FMUL a1, c07, c07 +#endif + +#ifdef RT + LDF [BO + 15 * SIZE], a1 + LDF [BO + 14 * SIZE], a2 + LDF [BO + 13 * SIZE], a3 + LDF [BO + 12 * SIZE], a4 + + FMUL a1, c07, c07 + + FNMSUB (aa2, cc07, cc05, cc05) + FNMSUB (aa3, cc07, cc03, cc03) + FNMSUB (aa4, cc07, cc01, cc01) + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 9 * SIZE], a2 + LDF [BO + 8 * SIZE], a3 + + FMUL a1, c05, c05 + + FNMSUB (aa2, cc05, cc03, cc03) + FNMSUB (aa3, cc05, cc01, cc01) + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 4 * SIZE], a2 + + FMUL a1, c03, c03 + + FNMSUB (aa2, cc03, cc01, cc01) + + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 +#endif + +#ifdef LN + add C1, -1 * SIZE, C1 + add C2, -1 * SIZE, 
C2 + add C3, -1 * SIZE, C3 + add C4, -1 * SIZE, C4 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c03, [BO + 1 * SIZE] + STF c05, [BO + 2 * SIZE] + STF c07, [BO + 3 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c03, [AO + 1 * SIZE] + STF c05, [AO + 2 * SIZE] + STF c07, [AO + 3 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c03, [C2 + 0 * SIZE] + STF c05, [C3 + 0 * SIZE] + STF c07, [C4 + 0 * SIZE] + +#ifdef RT + sll K, BASE_SHIFT + 0, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, BASE_SHIFT + 0, TEMP2 + sll TEMP1, BASE_SHIFT + 2, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + .align 4 + +.LL49: +#ifdef LN + sll K, BASE_SHIFT + 2, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 4, KK +#endif + +#ifdef RT + sub KK, 4, KK +#endif + .align 4 + +.LL50: + and N, 2, J + cmp J, 0 + ble,pn %icc, .LL70 + nop + +#ifdef RT + sll K, BASE_SHIFT + 1, TEMP1 + sub B, TEMP1, B +#endif + +#ifndef RT + mov C, C1 + add C, LDC, C2 + add C2, LDC, C +#else + sub C, LDC, C2 + sub C2, LDC, C1 + sub C2, LDC, C +#endif + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + sra M, 1, I + cmp I, 0 + ble,pn %icc, .LL60 + nop + .align 4 + +.LL52: +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TEMP1 + sll KK, BASE_SHIFT + 1, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + FCLR (cc01) + LDF [BO + 3 * SIZE], b4 + FCLR (cc02) + + LDF [BO + 4 * SIZE], b5 + FCLR (cc03) + LDF [BO + 5 * SIZE], b6 + FCLR (cc04) + LDF [BO + 6 * SIZE], b7 + FCLR (cc05) + LDF [BO + 7 * SIZE], b8 + FCLR (cc06) + + prefetch [C1 + 2 * SIZE], 3 + FCLR (cc07) + prefetch [C2 + 2 * SIZE], 3 + FCLR (cc08) + +#if defined(LT) || defined(RN) + sra KK, 2, L +#else + sub K, KK, L + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL55 + nop + .align 4 + +.LL53: + FMADD (aa1, bb1, cc01, cc01) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + FMADD (aa2, bb1, cc02, cc02) + LDF [BO + 8 * SIZE], b1 + + FMADD (aa1, bb2, cc03, cc03) + LDF [AO + 4 * SIZE], a1 + FMADD (aa2, bb2, cc04, cc04) + LDF [AO + 5 * SIZE], a2 + + FMADD (aa3, bb3, cc01, cc01) + LDF [BO + 9 * SIZE], b2 + FMADD (aa4, bb3, cc02, cc02) + LDF [BO + 10 * SIZE], b3 + + FMADD (aa3, bb4, cc03, cc03) + LDF [AO + 6 * SIZE], a3 + FMADD (aa4, bb4, cc04, cc04) + LDF [AO + 7 * SIZE], a4 + + FMADD (aa1, bb5, cc01, cc01) + LDF [BO + 11 * SIZE], b4 + FMADD (aa2, bb5, cc02, cc02) + LDF [BO + 12 * SIZE], b5 + + FMADD (aa1, bb6, cc03, cc03) + LDF [AO + 8 * SIZE], a1 + FMADD (aa2, bb6, cc04, cc04) + LDF [AO + 9 * SIZE], a2 + + FMADD (aa3, bb7, cc01, cc01) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa4, bb7, cc02, cc02) + LDF [BO + 14 * SIZE], b7 + + FMADD (aa3, bb8, cc03, cc03) + LDF [AO + 10 * SIZE], a3 + FMADD (aa4, bb8, cc04, cc04) + LDF [AO + 11 * SIZE], a4 + + add AO, 8 * SIZE, AO + add L, -1, L + add BO, 8 * SIZE, BO + cmp L, 0 + + bg,pt %icc, .LL53 + LDF [BO + 7 * SIZE], b8 + .align 4 + +.LL55: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + sub K, KK, L + and 
L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL58 + nop + .align 4 + +.LL57: + FMADD (aa1, bb1, cc01, cc01) + add L, -1, L + FMADD (aa2, bb1, cc02, cc02) + LDF [BO + 2 * SIZE], b1 + + FMADD (aa1, bb2, cc03, cc03) + LDF [AO + 2 * SIZE], a1 + FMADD (aa2, bb2, cc04, cc04) + LDF [AO + 3 * SIZE], a2 + + add AO, 2 * SIZE, AO + cmp L, 0 + add BO, 2 * SIZE, BO + bg,pt %icc, .LL57 + LDF [BO + 1 * SIZE], b2 + .align 4 + +.LL58: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 2, TEMP1 +#else + sub KK, 2, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 1, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 + FSUB a3, c02, c02 + FSUB a4, c04, c04 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 +#endif + +#ifdef LN + LDF [AO + 3 * SIZE], a1 + LDF [AO + 2 * SIZE], a2 + LDF [AO + 0 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a1, c04, c04 + + FNMSUB (aa2, cc02, cc01, cc01) + FNMSUB (aa2, cc04, cc03, cc03) + + FMUL a3, c01, c01 + FMUL a3, c03, c03 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + FMUL a1, c03, c03 + + FNMSUB (aa2, cc01, cc02, cc02) + FNMSUB (aa2, cc03, cc04, cc04) + + FMUL a3, c02, c02 + FMUL a3, c04, c04 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + + FNMSUB (aa2, cc01, cc03, cc03) + FNMSUB (aa2, cc02, cc04, cc04) + + LDF [BO + 3 * SIZE], a1 + + FMUL a1, c03, c03 + FMUL a1, c04, c04 +#endif + +#ifdef RT + LDF [BO + 3 * SIZE], a1 + LDF [BO + 2 * SIZE], a2 + + FMUL a1, c04, c04 + FMUL a1, c03, c03 + + FNMSUB (aa2, cc04, cc02, cc02) + FNMSUB (aa2, cc03, cc01, cc01) + + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c02, c02 + FMUL a1, c01, c01 +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 + add C2, -2 * SIZE, C2 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c03, [BO + 1 * SIZE] + STF c02, [BO + 2 * SIZE] + STF c04, [BO + 3 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C2 + 0 * SIZE] + STF c04, [C2 + 1 * SIZE] + +#ifndef LN + add C1, 2 * SIZE, C1 + add C2, 2 * SIZE, C2 +#endif + +#ifdef RT + sll K, BASE_SHIFT + 1, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 1, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 2, KK +#endif + +#ifdef LN + sub KK, 2, KK +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL52 + nop + .align 4 + +.LL60: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL69 + nop + +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TEMP1 + sll KK, BASE_SHIFT + 1, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + LDF [BO + 3 * SIZE], b4 + LDF [BO + 4 * SIZE], b5 + LDF [BO + 5 * SIZE], b6 + LDF [BO + 
6 * SIZE], b7 + FCLR (cc01) + LDF [BO + 7 * SIZE], b8 + FCLR (cc03) + +#if defined(LT) || defined(RN) + sra KK, 2, L +#else + sub K, KK, L + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL65 + nop + .align 4 + +.LL63: + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + add L, -1, L + + FMADD (aa1, bb1, cc01, cc01) + LDF [BO + 8 * SIZE], b1 + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 9 * SIZE], b2 + + LDF [AO + 4 * SIZE], a1 + cmp L, 0 + + FMADD (aa2, bb3, cc01, cc01) + LDF [BO + 10 * SIZE], b3 + FMADD (aa2, bb4, cc03, cc03) + LDF [BO + 11 * SIZE], b4 + + LDF [AO + 5 * SIZE], a2 + add AO, 4 * SIZE, AO + + FMADD (aa3, bb5, cc01, cc01) + LDF [BO + 12 * SIZE], b5 + FMADD (aa3, bb6, cc03, cc03) + LDF [BO + 13 * SIZE], b6 + + LDF [AO + 2 * SIZE], a3 + add BO, 8 * SIZE, BO + + FMADD (aa4, bb7, cc01, cc01) + LDF [BO + 6 * SIZE], b7 + FMADD (aa4, bb8, cc03, cc03) + LDF [BO + 7 * SIZE], b8 + + bg,pt %icc, .LL63 + LDF [AO + 3 * SIZE], a4 + .align 4 + +.LL65: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + sub K, KK, L + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL68 + nop + .align 4 + +.LL67: + FMADD (aa1, bb1, cc01, cc01) + LDF [BO + 2 * SIZE], b1 + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 3 * SIZE], b2 + + LDF [AO + 1 * SIZE], a1 + add L, -1, L + add AO, 1 * SIZE, AO + cmp L, 0 + + bg,pt %icc, .LL67 + add BO, 2 * SIZE, BO + .align 4 + +.LL68: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 1, TEMP1 +#else + sub KK, 2, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 0, TEMP2 + sll TEMP1, BASE_SHIFT + 1, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 +#endif + +#if defined(LN) || defined(LT) + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c03, c03 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FMUL a1, c01, c01 + + FNMSUB (aa2, cc01, cc03, cc03) + + LDF [BO + 3 * SIZE], a1 + + FMUL a1, c03, c03 +#endif + +#ifdef RT + LDF [BO + 3 * SIZE], a1 + LDF [BO + 2 * SIZE], a2 + + FMUL a1, c03, c03 + + FNMSUB (aa2, cc03, cc01, cc01) + + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 +#endif + +#ifdef LN + add C1, -1 * SIZE, C1 + add C2, -1 * SIZE, C2 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c03, [BO + 1 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c03, [AO + 1 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c03, [C2 + 0 * SIZE] + +#ifdef RT + sll K, BASE_SHIFT + 0, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, BASE_SHIFT + 0, TEMP2 + sll TEMP1, BASE_SHIFT + 1, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + .align 4 + +.LL69: +#ifdef LN + sll K, BASE_SHIFT + 1, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 2, KK +#endif + +#ifdef RT + sub KK, 2, KK +#endif + .align 4 + +.LL70: + and N, 1, J + cmp J, 0 + ble,pn %icc, .LL999 + nop + +#ifdef RT + sll K, BASE_SHIFT, TEMP1 + sub B, TEMP1, B +#endif + +#ifndef RT + mov C, C1 + add C1, LDC, C +#else + sub C, LDC, C1 + sub C, LDC, C +#endif + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + sra M, 1, I + cmp I, 0 + 
ble,pn %icc, .LL80 + nop + .align 4 + +.LL72: +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TEMP1 + sll KK, BASE_SHIFT + 0, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + FCLR (cc01) + LDF [BO + 3 * SIZE], b4 + FCLR (cc02) + + prefetch [C1 + 2 * SIZE], 3 + +#if defined(LT) || defined(RN) + sra KK, 2, L +#else + sub K, KK, L + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL75 + nop + +.LL73: + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + add L, -1, L + + FMADD (aa1, bb1, cc01, cc01) + LDF [AO + 4 * SIZE], a1 + FMADD (aa2, bb1, cc02, cc02) + LDF [AO + 5 * SIZE], a2 + + LDF [BO + 4 * SIZE], b1 + cmp L, 0 + + FMADD (aa3, bb2, cc01, cc01) + LDF [AO + 6 * SIZE], a3 + FMADD (aa4, bb2, cc02, cc02) + LDF [AO + 7 * SIZE], a4 + + LDF [BO + 5 * SIZE], b2 + add BO, 4 * SIZE, BO + + FMADD (aa1, bb3, cc01, cc01) + LDF [AO + 8 * SIZE], a1 + FMADD (aa2, bb3, cc02, cc02) + LDF [AO + 9 * SIZE], a2 + + LDF [BO + 2 * SIZE], b3 + add AO, 8 * SIZE, AO + + FMADD (aa3, bb4, cc01, cc01) + LDF [AO + 2 * SIZE], a3 + FMADD (aa4, bb4, cc02, cc02) + LDF [AO + 3 * SIZE], a4 + + bg,pt %icc, .LL73 + LDF [BO + 3 * SIZE], b4 + .align 4 + +.LL75: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + sub K, KK, L + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL78 + nop + .align 4 + +.LL77: + FMADD (aa1, bb1, cc01, cc01) + LDF [AO + 2 * SIZE], a1 + FMADD (aa2, bb1, cc02, cc02) + LDF [AO + 3 * SIZE], a2 + + LDF [BO + 1 * SIZE], b1 + add L, -1, L + add AO, 2 * SIZE, AO + cmp L, 0 + bg,pt %icc, .LL77 + add BO, 1 * SIZE, BO + .align 4 + +.LL78: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 2, TEMP1 +#else + sub KK, 1, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 0, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 +#endif + +#ifdef LN + LDF [AO + 3 * SIZE], a1 + LDF [AO + 2 * SIZE], a2 + LDF [AO + 0 * SIZE], a3 + + FMUL a1, c02, c02 + + FNMSUB (aa2, cc02, cc01, cc01) + + FMUL a3, c01, c01 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + + FNMSUB (aa2, cc01, cc02, cc02) + + FMUL a3, c02, c02 +#endif + +#if defined(RN) || defined(RT) + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + +#ifndef LN + add C1, 2 * SIZE, C1 +#endif + +#ifdef RT + sll K, BASE_SHIFT + 1, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 0, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 2, KK +#endif + +#ifdef LN + sub KK, 2, KK +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL72 + nop + .align 4 + +.LL80: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL89 + nop + +#if 
defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TEMP1 + sll KK, BASE_SHIFT + 0, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [BO + 0 * SIZE], b1 + LDF [AO + 1 * SIZE], a2 + LDF [BO + 1 * SIZE], b2 + LDF [AO + 2 * SIZE], a3 + LDF [BO + 2 * SIZE], b3 + LDF [AO + 3 * SIZE], a4 + LDF [BO + 3 * SIZE], b4 + +#if defined(LT) || defined(RN) + sra KK, 2, L +#else + sub K, KK, L + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL85 + FCLR (cc01) + .align 4 + +.LL83: + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + add L, -1, L + + FMADD (aa1, bb1, cc01, cc01) + LDF [AO + 4 * SIZE], a1 + LDF [BO + 4 * SIZE], b1 + + FMADD (aa2, bb2, cc01, cc01) + LDF [AO + 5 * SIZE], a2 + LDF [BO + 5 * SIZE], b2 + + FMADD (aa3, bb3, cc01, cc01) + LDF [AO + 6 * SIZE], a3 + LDF [BO + 6 * SIZE], b3 + + FMADD (aa4, bb4, cc01, cc01) + LDF [AO + 7 * SIZE], a4 + LDF [BO + 7 * SIZE], b4 + + add AO, 4 * SIZE, AO + cmp L, 0 + + bg,pt %icc, .LL83 + add BO, 4 * SIZE, BO + .align 4 + +.LL85: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + sub K, KK, L + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL88 + nop + .align 4 + +.LL87: + FMADD (aa1, bb1, cc01, cc01) + LDF [AO + 1 * SIZE], a1 + LDF [BO + 1 * SIZE], b1 + + add AO, 1 * SIZE, AO + add L, -1, L + cmp L, 0 + bg,pt %icc, .LL87 + add BO, 1 * SIZE, BO + .align 4 + +.LL88: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 1, TEMP1 +#else + sub KK, 1, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 0, TEMP2 + sll TEMP1, BASE_SHIFT + 0, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + + FSUB a1, c01, c01 +#else + LDF [AO + 0 * SIZE], a1 + + FSUB a1, c01, c01 +#endif + +#if defined(LN) || defined(LT) + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 +#endif + +#if defined(RN) || defined(RT) + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 +#endif + +#ifdef LN + add C1, -1 * SIZE, C1 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] +#else + STF c01, [AO + 0 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + +#ifdef RT + sll K, BASE_SHIFT + 0, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, BASE_SHIFT + 0, TEMP2 + sll TEMP1, BASE_SHIFT + 0, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + .align 4 + +.LL89: +#ifdef LN + sll K, BASE_SHIFT, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 1, KK +#endif + +#ifdef RT + sub KK, 1, KK +#endif + .align 4 + +.LL999: +#ifdef TRMMKERNEL +#ifndef __64BIT__ + ld [%sp + STACK_START + 8], %g1 + ld [%sp + STACK_START + 12], %g2 + ld [%sp + STACK_START + 16], %g3 + ld [%sp + STACK_START + 20], %g4 +#else + ldx [%sp + STACK_START + 32], %g1 + ldx [%sp + STACK_START + 40], %g2 + ldx [%sp + STACK_START + 48], %g3 + ldx [%sp + STACK_START + 56], %g4 +#endif +#endif + + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/trsm_kernel_RT.S b/kernel/sparc/trsm_kernel_RT.S new file mode 100644 index 0000000000..3e1a2b90ac --- /dev/null +++ b/kernel/sparc/trsm_kernel_RT.S @@ -0,0 +1,4227 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %i0 +#define N %i1 +#define K %i2 + +#if defined(DOUBLE) && !defined(__64BIT__) +#define A %i5 +#define B %i4 +#else +#define A %i4 +#define B %i5 +#endif + +#define C %o4 +#define LDC %o5 + +#define AO %l0 +#define BO %l1 +#define I %l2 +#define J %l3 +#define L %l4 + +#define C1 %o0 +#define C2 %o1 +#define C3 %o2 +#define C4 %o3 + +#define OFFSET %l5 +#define KK %l6 +#define TEMP1 %l7 +#define TEMP2 %i3 +#define AORIG %g1 + +#ifdef DOUBLE +#define c01 %f0 +#define c02 %f2 +#define c03 %f4 +#define c04 %f6 +#define c05 %f8 +#define c06 %f10 +#define c07 %f12 +#define c08 %f14 +#define c09 %f16 +#define c10 %f18 +#define c11 %f20 +#define c12 %f22 +#define c13 %f24 +#define c14 %f26 +#define c15 %f28 +#define c16 %f30 + +#define t1 %f32 +#define t2 %f34 +#define t3 %f36 +#define t4 %f38 + +#define a1 %f40 +#define a2 %f42 +#define a3 %f44 +#define a4 %f46 +#define a5 %f58 + +#define b1 %f48 +#define b2 %f50 +#define b3 %f52 +#define b4 %f54 +#define b5 %f56 + +#define FZERO %f60 +#define ALPHA %f62 +#else +#define c01 %f0 +#define c02 %f1 +#define c03 %f2 +#define c04 %f3 +#define c05 %f4 +#define c06 %f5 +#define c07 %f6 +#define c08 %f7 +#define c09 %f8 +#define c10 %f9 +#define c11 %f10 +#define c12 %f11 +#define c13 %f12 +#define c14 %f13 +#define c15 %f14 +#define c16 %f15 + +#define t1 %f16 +#define t2 %f17 +#define t3 %f18 +#define t4 %f19 + +#define a1 %f20 +#define a2 %f21 +#define a3 %f22 +#define a4 %f23 +#define a5 %f31 + +#define b1 %f24 +#define b2 %f25 +#define b3 %f26 +#define b4 %f27 +#define b5 %f28 + +#define FZERO %f29 +#define ALPHA %f30 +#endif + +#define APREFETCHSIZE 40 +#define BPREFETCHSIZE 40 + +#define APREFETCH_CATEGORY 0 +#define BPREFETCH_CATEGORY 0 + + PROLOGUE 
+ SAVESP + nop + +#ifndef __64BIT__ +#ifdef DOUBLE + ld [%sp + STACK_START + 28], B + ld [%sp + STACK_START + 32], C + ld [%sp + STACK_START + 36], LDC + ld [%sp + STACK_START + 40], OFFSET +#else + ld [%sp + STACK_START + 28], C + ld [%sp + STACK_START + 32], LDC + ld [%sp + STACK_START + 36], OFFSET +#endif +#else + ldx [%sp+ STACK_START + 56], C + ldx [%sp+ STACK_START + 64], LDC + ldx [%sp+ STACK_START + 72], OFFSET +#endif + + FCLR(29) + + sll LDC, BASE_SHIFT, LDC + +#ifdef LN + smul M, K, TEMP1 + sll TEMP1, BASE_SHIFT, TEMP1 + add A, TEMP1, A + + sll M, BASE_SHIFT, TEMP1 + add C, TEMP1, C +#endif + +#ifdef RN + neg OFFSET, KK +#endif + +#ifdef RT + smul N, K, TEMP1 + sll TEMP1, BASE_SHIFT, TEMP1 + add B, TEMP1, B + + smul N, LDC, TEMP1 + add C, TEMP1, C + + sub N, OFFSET, KK +#endif + + and N, 1, J + cmp J, 0 + ble,pn %icc, .LL100 + nop + +#ifdef RT + sll K, 0 + BASE_SHIFT, TEMP1 + sub B, TEMP1, B + + sub C, LDC, C +#endif + + mov C, C1 + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + +#ifndef RT + add C, LDC, C +#endif + + sra M, 2, I + cmp I, 0 + ble,pn %icc, .LL250 + nop + +.LL221: +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 2 + BASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 2 + BASE_SHIFT, TEMP1 + sll KK, 0 + BASE_SHIFT, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO + + sub K, KK, TEMP1 + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c01 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t1 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c02 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t2 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, t3 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, t4 + + ble,pn %icc, .LL225 + prefetch [C1 + 4 * SIZE], 2 + +.LL222: + FADD c01, t1, c01 + add BO, 4 * SIZE, BO + FMUL a1, b1, t1 + LDF [AO + 4 * SIZE], a1 + + FADD c02, t2, c02 + FMUL a2, b1, t2 + LDF [AO + 5 * SIZE], a2 + + FADD c03, t3, c03 + add L, -1, L + FMUL a3, b1, t3 + LDF [AO + 6 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b1, t4 + LDF [AO + 7 * SIZE], a4 + LDF [BO + 0 * SIZE], b1 + + FADD c01, t1, c01 + cmp L, 0 + FMUL a1, b2, t1 + LDF [AO + 8 * SIZE], a1 + + FADD c02, t2, c02 + FMUL a2, b2, t2 + LDF [AO + 9 * SIZE], a2 + + FADD c03, t3, c03 + FMUL a3, b2, t3 + LDF [AO + 10 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b2, t4 + LDF [AO + 11 * SIZE], a4 + LDF [BO + 1 * SIZE], b2 + + FADD c01, t1, c01 + FMUL a1, b3, t1 + LDF [AO + 12 * SIZE], a1 + + FADD c02, t2, c02 + FMUL a2, b3, t2 + LDF [AO + 13 * SIZE], a2 + + FADD c03, t3, c03 + FMUL a3, b3, t3 + LDF [AO + 14 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b3, t4 + LDF [AO + 15 * SIZE], a4 + LDF [BO + 2 * SIZE], b3 + + FADD c01, t1, c01 + FMUL a1, b4, t1 + LDF [AO + 16 * SIZE], a1 + + FADD c02, t2, c02 + FMUL a2, b4, t2 + LDF [AO + 17 * SIZE], a2 + + FADD c03, t3, c03 + FMUL a3, b4, t3 + LDF [AO + 18 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b4, t4 + LDF [AO + 19 * SIZE], a4 + add AO, 16 * SIZE, AO + + bg,pt %icc, .LL222 + LDF [BO + 3 * SIZE], b4 + +.LL225: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL229 + nop + +.LL226: + FADD c01, t1, c01 + add BO, 1 * SIZE, BO + FMUL a1, b1, t1 + LDF [AO + 4 * SIZE], a1 + + FADD c02, t2, c02 + add L, -1, L + FMUL a2, b1, t2 + LDF [AO + 5 * SIZE], a2 + + FADD c03, t3, 
c03 + cmp L, 0 + FMUL a3, b1, t3 + LDF [AO + 6 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b1, t4 + LDF [AO + 7 * SIZE], a4 + add AO, 4 * SIZE, AO + + bg,pt %icc, .LL226 + LDF [BO + 0 * SIZE], b1 + +.LL229: + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c03, t3, c03 + FADD c04, t4, c04 + +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 4, TEMP1 +#else + sub KK, 1, TEMP1 +#endif + sll TEMP1, 2 + BASE_SHIFT, TEMP2 + sll TEMP1, 0 + BASE_SHIFT, TEMP1 + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 +#endif + +#ifdef LN + LDF [AO + 15 * SIZE], a1 + LDF [AO + 14 * SIZE], a2 + LDF [AO + 13 * SIZE], a3 + LDF [AO + 12 * SIZE], a4 + + FMUL a1, c04, c04 + FMUL a2, c04, t1 + + FSUB c03, t1, c03 + FMUL a3, c04, t1 + + FSUB c02, t1, c02 + FMUL a4, c04, t1 + + FSUB c01, t1, c01 + + LDF [AO + 10 * SIZE], a1 + LDF [AO + 9 * SIZE], a2 + LDF [AO + 8 * SIZE], a3 + + FMUL a1, c03, c03 + FMUL a2, c03, t1 + + FSUB c02, t1, c02 + FMUL a3, c03, t1 + FSUB c01, t1, c01 + + LDF [AO + 5 * SIZE], a1 + LDF [AO + 4 * SIZE], a2 + + FMUL a1, c02, c02 + FMUL a2, c02, t1 + FSUB c01, t1, c01 + + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FMUL a1, c01, c01 + FMUL a2, c01, t1 + FSUB c02, t1, c02 + FMUL a3, c01, t1 + FSUB c03, t1, c03 + FMUL a4, c01, t1 + FSUB c04, t1, c04 + + LDF [AO + 5 * SIZE], a1 + LDF [AO + 6 * SIZE], a2 + LDF [AO + 7 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a2, c02, t1 + FSUB c03, t1, c03 + FMUL a3, c02, t1 + FSUB c04, t1, c04 + + LDF [AO + 10 * SIZE], a1 + LDF [AO + 11 * SIZE], a2 + + FMUL a1, c03, c03 + FMUL a2, c03, t1 + + FSUB c04, t1, c04 + + LDF [AO + 15 * SIZE], a1 + + FMUL a1, c04, c04 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + FMUL a1, c03, c03 + FMUL a1, c04, c04 +#endif + +#ifdef RT + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + FMUL a1, c03, c03 + FMUL a1, c04, c04 +#endif + +#ifdef LN + add C1, -4 * SIZE, C1 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] + STF c03, [BO + 2 * SIZE] + STF c04, [BO + 3 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C1 + 2 * SIZE] + STF c04, [C1 + 3 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 4 * SIZE, C1 +#endif + +#ifdef RT + sll K, 2 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 2 + BASE_SHIFT, TEMP2 + sll TEMP1, 0 + BASE_SHIFT, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 4, KK +#endif + +#ifdef LN + sub KK, 4, KK +#endif + + add I, -1, I + cmp I, 0 + + bg,pt %icc, .LL221 + nop + +.LL250: + and M, 2, I + cmp I, 0 + ble,pn %icc, .LL270 + nop + +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 1 + BASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG 
+#endif + + sll KK, 1 + BASE_SHIFT, TEMP1 + sll KK, 0 + BASE_SHIFT, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO + + sub K, KK, TEMP1 + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c01 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t1 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c02 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t2 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, t3 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, t4 + + ble,pn %icc, .LL255 + nop + +.LL252: + FADD c01, t1, c01 + add L, -1, L + FMUL a1, b1, t1 + LDF [AO + 4 * SIZE], a1 + + FADD c02, t2, c02 + FMUL a2, b1, t2 + LDF [AO + 5 * SIZE], a2 + LDF [BO + 4 * SIZE], b1 + + FADD c03, t3, c03 + cmp L, 0 + FMUL a3, b2, t3 + LDF [AO + 6 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b2, t4 + LDF [AO + 7 * SIZE], a4 + LDF [BO + 5 * SIZE], b2 + + FADD c01, t1, c01 + FMUL a1, b3, t1 + LDF [AO + 8 * SIZE], a1 + + FADD c02, t2, c02 + FMUL a2, b3, t2 + LDF [AO + 9 * SIZE], a2 + LDF [BO + 6 * SIZE], b3 + + FADD c03, t3, c03 + FMUL a3, b4, t3 + LDF [AO + 10 * SIZE], a3 + + FADD c04, t4, c04 + FMUL a4, b4, t4 + LDF [AO + 11 * SIZE], a4 + add AO, 8 * SIZE, AO + + LDF [BO + 7 * SIZE], b4 + bg,pt %icc, .LL252 + add BO, 4 * SIZE, BO + +.LL255: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + + cmp L, 0 + ble,a,pn %icc, .LL259 + nop + +.LL256: + FADD c01, t1, c01 + add L, -1, L + FMUL a1, b1, t1 + LDF [AO + 2 * SIZE], a1 + + FADD c02, t2, c02 + cmp L, 0 + FMUL a2, b1, t2 + LDF [AO + 3 * SIZE], a2 + + LDF [BO + 1 * SIZE], b1 + add AO, 2 * SIZE, AO + + bg,pt %icc, .LL256 + add BO, 1 * SIZE, BO + +.LL259: + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c03, t3, c03 + FADD c04, t4, c04 + + FADD c01, c03, c01 + FADD c02, c04, c02 + +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 2, TEMP1 +#else + sub KK, 1, TEMP1 +#endif + sll TEMP1, 1 + BASE_SHIFT, TEMP2 + sll TEMP1, 0 + BASE_SHIFT, TEMP1 + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 +#endif + +#ifdef LN + LDF [AO + 3 * SIZE], a1 + LDF [AO + 2 * SIZE], a2 + LDF [AO + 0 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a2, c02, t1 + FSUB c01, t1, c01 + FMUL a3, c01, c01 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + FMUL a2, c01, t1 + FSUB c02, t1, c02 + FMUL a3, c02, c02 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 +#endif + +#ifdef RT + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 2 * SIZE, C1 +#endif + +#ifdef RT + sll K, 1 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 1 + BASE_SHIFT, TEMP2 + sll TEMP1, 0 + BASE_SHIFT, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 2, KK +#endif + +#ifdef LN + sub KK, 2, KK +#endif + +.LL270: + and M, 1, I + cmp 
I, 0 + ble,pn %icc, .LL299 + nop + +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 0 + BASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 0 + BASE_SHIFT, TEMP1 + + add AORIG, TEMP1, AO + add B, TEMP1, BO + + sub K, KK, TEMP1 + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, t1 + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c01 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, t2 + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c02 + + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t3 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t4 + LDF [BO + 2 * SIZE], b3 + + ble,pn %icc, .LL275 + LDF [BO + 3 * SIZE], b4 + +.LL272: + FADD c01, t1, c01 + add L, -1, L + add AO, 4 * SIZE, AO + + FMUL a1, b1, t1 + add BO, 4 * SIZE, BO + LDF [AO + 0 * SIZE], a1 + + FADD c02, t2, c02 + cmp L, 0 + LDF [BO + 0 * SIZE], b1 + FMUL a2, b2, t2 + + LDF [AO + 1 * SIZE], a2 + FADD c01, t3, c01 + LDF [BO + 1 * SIZE], b2 + FMUL a3, b3, t3 + + LDF [AO + 2 * SIZE], a3 + FADD c02, t4, c02 + LDF [BO + 2 * SIZE], b3 + FMUL a4, b4, t4 + LDF [AO + 3 * SIZE], a4 + + bg,pt %icc, .LL272 + LDF [BO + 3 * SIZE], b4 + +.LL275: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL279 + nop + +.LL276: + FADD c01, t1, c01 + add L, -1, L + FMUL a1, b1, t1 + LDF [AO + 1 * SIZE], a1 + + LDF [BO + 1 * SIZE], b1 + add BO, 1 * SIZE, BO + cmp L, 0 + bg,pt %icc, .LL276 + add AO, 1 * SIZE, AO + +.LL279: + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c01, t3, c01 + FADD c02, t4, c02 + + FADD c01, c02, c01 + +#if defined(LN) || defined(RT) + sub KK, 1, TEMP1 + sll TEMP1, 0 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + FSUB a1, c01, c01 +#else + LDF [AO + 0 * SIZE], a1 + FSUB a1, c01, c01 +#endif + +#ifdef LN + LDF [AO + 0 * SIZE], a1 + FMUL a1, c01, c01 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + FMUL a1, c01, c01 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + FMUL a1, c01, c01 +#endif + +#ifdef RT + LDF [BO + 0 * SIZE], a1 + FMUL a1, c01, c01 +#endif + +#ifdef LN + add C1, -1 * SIZE, C1 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] +#else + STF c01, [AO + 0 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 1 * SIZE, C1 +#endif + +#ifdef RT + sll K, 0 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 0 + BASE_SHIFT, TEMP1 + add AO, TEMP1, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + + +.LL299: +#ifdef LN + sll K, 0 + BASE_SHIFT, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 1, KK +#endif + +#ifdef RT + sub KK, 1, KK +#endif + +.LL100: /* n & 2 */ + and N, 2, J + cmp J, 0 + ble,pn %icc, .LL200 + nop + +#ifdef RT + sll K, 1 + BASE_SHIFT, TEMP1 + sub B, TEMP1, B + + sll LDC, 1, TEMP1 + sub C, TEMP1, C +#endif + + mov C, C1 + add C, LDC, C2 + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + +#ifndef RT + add C2, LDC, C +#endif + + sra M, 2, I + cmp I, 0 + ble,pn %icc, .LL150 + FMOV FZERO, c03 + +.LL121: +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 2 + BASE_SHIFT, TEMP1 + sub AORIG, TEMP1, 
AORIG +#endif + + sll KK, 2 + BASE_SHIFT, TEMP1 + sll KK, 1 + BASE_SHIFT, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO + + sub K, KK, TEMP1 + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, t1 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, c07 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, t2 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, c04 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, t3 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, c08 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, t4 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, c01 + + prefetch [C1 + 3 * SIZE], 2 + FMOV FZERO, c05 + prefetch [C2 + 3 * SIZE], 2 + FMOV FZERO, c02 + + ble,pn %icc, .LL125 + FMOV FZERO, c06 + +.LL122: + FADD c03, t1, c03 + add L, -1, L + FMUL a1, b1, t1 + prefetch [AO + APREFETCHSIZE * SIZE], 0 + + FADD c07, t2, c07 + add BO, 8 * SIZE, BO + FMUL a1, b2, t2 + LDF [AO + 4 * SIZE], a1 + + FADD c04, t3, c04 + add AO, 16 * SIZE, AO + FMUL a2, b1, t3 + cmp L, 0 + + FADD c08, t4, c08 + nop + FMUL a2, b2, t4 + LDF [AO - 11 * SIZE], a2 + + FADD c01, t1, c01 + nop + FMUL a3, b1, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a3, b2, t2 + LDF [AO - 10 * SIZE], a3 + + FADD c02, t3, c02 + nop + FMUL a4, b1, t3 + LDF [BO - 4 * SIZE], b1 + + FADD c06, t4, c06 + nop + FMUL a4, b2, t4 + LDF [BO - 3 * SIZE], b2 + + FADD c03, t1, c03 + nop + FMUL a1, b3, t1 + LDF [AO - 9 * SIZE], a4 + + FADD c07, t2, c07 + nop + FMUL a1, b4, t2 + LDF [AO - 8 * SIZE], a1 + + FADD c04, t3, c04 + nop + FMUL a2, b3, t3 + nop + + FADD c08, t4, c08 + nop + FMUL a2, b4, t4 + LDF [AO - 7 * SIZE], a2 + + FADD c01, t1, c01 + nop + FMUL a3, b3, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a3, b4, t2 + LDF [AO - 6 * SIZE], a3 + + FADD c02, t3, c02 + nop + FMUL a4, b3, t3 + LDF [BO - 2 * SIZE], b3 + + FADD c06, t4, c06 + nop + FMUL a4, b4, t4 + LDF [BO - 1 * SIZE], b4 + + FADD c03, t1, c03 + nop + FMUL a1, b1, t1 + LDF [AO - 5 * SIZE], a4 + + FADD c07, t2, c07 + nop + FMUL a1, b2, t2 + LDF [AO - 4 * SIZE], a1 + + FADD c04, t3, c04 + nop + FMUL a2, b1, t3 + nop + + FADD c08, t4, c08 + nop + FMUL a2, b2, t4 + LDF [AO - 3 * SIZE], a2 + + FADD c01, t1, c01 + nop + FMUL a3, b1, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a3, b2, t2 + LDF [AO - 2 * SIZE], a3 + + FADD c02, t3, c02 + nop + FMUL a4, b1, t3 + LDF [BO + 0 * SIZE], b1 + + FADD c06, t4, c06 + nop + FMUL a4, b2, t4 + LDF [BO + 1 * SIZE], b2 + + FADD c03, t1, c03 + nop + FMUL a1, b3, t1 + LDF [AO - 1 * SIZE], a4 + + FADD c07, t2, c07 + nop + FMUL a1, b4, t2 + LDF [AO + 0 * SIZE], a1 + + FADD c04, t3, c04 + nop + FMUL a2, b3, t3 + nop + + FADD c08, t4, c08 + nop + FMUL a2, b4, t4 + LDF [AO + 1 * SIZE], a2 + + FADD c01, t1, c01 + nop + FMUL a3, b3, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a3, b4, t2 + LDF [AO + 2 * SIZE], a3 + + FADD c02, t3, c02 + nop + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD c06, t4, c06 + FMUL a4, b4, t4 + LDF [AO + 3 * SIZE], a4 + + bg,pt %icc, .LL122 + LDF [BO + 3 * SIZE], b4 + +.LL125: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL129 + nop + +.LL126: + FADD c03, t1, c03 + add AO, 4 * SIZE, AO + FMUL a1, b1, t1 + add BO, 2 * SIZE, BO + + FADD c07, t2, c07 + add L, -1, L + FMUL a1, b2, t2 + LDF [AO + 0 * SIZE], a1 + + FADD c04, t3, c04 + cmp L, 0 + FMUL a2, b1, t3 + + FADD c08, t4, c08 + FMUL a2, b2, t4 + LDF [AO + 1 * SIZE], a2 + + FADD c01, t1, c01 + FMUL a3, b1, t1 + FADD c05, t2, c05 + FMUL a3, b2, t2 + LDF [AO + 2 * SIZE], a3 + + FADD c02, t3, c02 + FMUL a4, b1, t3 + LDF [BO + 0 * SIZE], b1 + FADD c06, t4, 
c06 + FMUL a4, b2, t4 + LDF [BO + 1 * SIZE], b2 + bg,pt %icc, .LL126 + LDF [AO + 3 * SIZE], a4 + +.LL129: + FADD c03, t1, c03 + FADD c07, t2, c07 + FADD c04, t3, c04 + FADD c08, t4, c08 + +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 4, TEMP1 +#else + sub KK, 2, TEMP1 +#endif + sll TEMP1, 2 + BASE_SHIFT, TEMP2 + sll TEMP1, 1 + BASE_SHIFT, TEMP1 + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c05, c05 + FSUB a3, c02, c02 + FSUB a4, c06, c06 + + FSUB b1, c03, c03 + FSUB b2, c07, c07 + FSUB b3, c04, c04 + FSUB b4, c08, c08 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [AO + 4 * SIZE], b1 + LDF [AO + 5 * SIZE], b2 + LDF [AO + 6 * SIZE], b3 + LDF [AO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 + + FSUB b1, c05, c05 + FSUB b2, c06, c06 + FSUB b3, c07, c07 + FSUB b4, c08, c08 +#endif + +#ifdef LN + LDF [AO + 15 * SIZE], a1 + LDF [AO + 14 * SIZE], a2 + LDF [AO + 13 * SIZE], a3 + LDF [AO + 12 * SIZE], a4 + + FMUL a1, c04, c04 + FMUL a1, c08, c08 + FMUL a2, c04, t1 + FMUL a2, c08, t2 + + FSUB c03, t1, c03 + FSUB c07, t2, c07 + FMUL a3, c04, t1 + FMUL a3, c08, t2 + + FSUB c02, t1, c02 + FSUB c06, t2, c06 + FMUL a4, c04, t1 + FMUL a4, c08, t2 + + FSUB c01, t1, c01 + FSUB c05, t2, c05 + + LDF [AO + 10 * SIZE], a1 + LDF [AO + 9 * SIZE], a2 + LDF [AO + 8 * SIZE], a3 + + FMUL a1, c03, c03 + FMUL a1, c07, c07 + FMUL a2, c03, t1 + FMUL a2, c07, t2 + + FSUB c02, t1, c02 + FSUB c06, t2, c06 + FMUL a3, c03, t1 + FMUL a3, c07, t2 + + FSUB c01, t1, c01 + FSUB c05, t2, c05 + + LDF [AO + 5 * SIZE], a1 + LDF [AO + 4 * SIZE], a2 + + FMUL a1, c02, c02 + FMUL a1, c06, c06 + FMUL a2, c02, t1 + FMUL a2, c06, t2 + + FSUB c01, t1, c01 + FSUB c05, t2, c05 + + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c05, c05 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FMUL a1, c01, c01 + FMUL a1, c05, c05 + FMUL a2, c01, t1 + FMUL a2, c05, t2 + + FSUB c02, t1, c02 + FSUB c06, t2, c06 + FMUL a3, c01, t1 + FMUL a3, c05, t2 + + FSUB c03, t1, c03 + FSUB c07, t2, c07 + FMUL a4, c01, t1 + FMUL a4, c05, t2 + + FSUB c04, t1, c04 + FSUB c08, t2, c08 + + LDF [AO + 5 * SIZE], a1 + LDF [AO + 6 * SIZE], a2 + LDF [AO + 7 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a1, c06, c06 + FMUL a2, c02, t1 + FMUL a2, c06, t2 + + FSUB c03, t1, c03 + FSUB c07, t2, c07 + FMUL a3, c02, t1 + FMUL a3, c06, t2 + FSUB c04, t1, c04 + FSUB c08, t2, c08 + + LDF [AO + 10 * SIZE], a1 + LDF [AO + 11 * SIZE], a2 + + FMUL a1, c03, c03 + FMUL a1, c07, c07 + FMUL a2, c03, t1 + FMUL a2, c07, t2 + + FSUB c04, t1, c04 + FSUB c08, t2, c08 + + LDF [AO + 15 * SIZE], a1 + + FMUL a1, c04, c04 + FMUL a1, c08, c08 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + FMUL a1, c03, c03 + FMUL a1, c04, c04 + + FMUL a2, c01, t1 + FMUL a2, c02, t2 + FMUL a2, c03, t3 + FMUL a2, c04, t4 + + FSUB c05, t1, c05 + FSUB c06, t2, c06 + FSUB c07, t3, c07 + FSUB c08, t4, c08 + + FMUL a3, c05, c05 + FMUL a3, c06, c06 + FMUL a3, c07, c07 + FMUL a3, c08, c08 +#endif + +#ifdef RT + LDF [BO + 3 * SIZE], a1 + LDF [BO + 2 * 
SIZE], a2 + LDF [BO + 0 * SIZE], a3 + + FMUL a1, c05, c05 + FMUL a1, c06, c06 + FMUL a1, c07, c07 + FMUL a1, c08, c08 + + FMUL a2, c05, t1 + FMUL a2, c06, t2 + FMUL a2, c07, t3 + FMUL a2, c08, t4 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + FSUB c03, t3, c03 + FSUB c04, t4, c04 + + FMUL a3, c01, c01 + FMUL a3, c02, c02 + FMUL a3, c03, c03 + FMUL a3, c04, c04 +#endif + +#ifdef LN + add C1, -4 * SIZE, C1 + add C2, -4 * SIZE, C2 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c05, [BO + 1 * SIZE] + STF c02, [BO + 2 * SIZE] + STF c06, [BO + 3 * SIZE] + + STF c03, [BO + 4 * SIZE] + STF c07, [BO + 5 * SIZE] + STF c04, [BO + 6 * SIZE] + STF c08, [BO + 7 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] + + STF c05, [AO + 4 * SIZE] + STF c06, [AO + 5 * SIZE] + STF c07, [AO + 6 * SIZE] + STF c08, [AO + 7 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C1 + 2 * SIZE] + STF c04, [C1 + 3 * SIZE] + + STF c05, [C2 + 0 * SIZE] + STF c06, [C2 + 1 * SIZE] + STF c07, [C2 + 2 * SIZE] + STF c08, [C2 + 3 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 4 * SIZE, C1 + add C2, 4 * SIZE, C2 +#endif + +#ifdef RT + sll K, 2 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 2 + BASE_SHIFT, TEMP2 + sll TEMP1, 1 + BASE_SHIFT, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 4, KK +#endif + +#ifdef LN + sub KK, 4, KK +#endif + + add I, -1, I + cmp I, 0 + + bg,pt %icc, .LL121 + FMOV FZERO, c03 + +.LL150: + and M, 2, I + cmp I, 0 + ble,pn %icc, .LL170 + nop + +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 1 + BASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 1 + BASE_SHIFT, TEMP1 + sll KK, 1 + BASE_SHIFT, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO + + sub K, KK, TEMP1 + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c01 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t1 + + LDF [AO + 1 * SIZE], a2 + cmp L, 0 + FMOV FZERO, c02 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t2 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, t3 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, t4 + ble,pn %icc, .LL155 + nop + +.LL152: + FADD c01, t1, c01 + add L, -1, L + FMUL a1, b1, t1 + prefetch [AO + APREFETCHSIZE * SIZE], 0 + + FADD c02, t2, c02 + add BO, 8 * SIZE, BO + FMUL a1, b2, t2 + LDF [AO + 4 * SIZE], a1 + + FADD c03, t3, c03 + cmp L, 0 + FMUL a2, b1, t3 + LDF [BO - 4 * SIZE], b1 + + FADD c04, t4, c04 + nop + FMUL a2, b2, t4 + LDF [AO + 5 * SIZE], a2 + + FADD c01, t1, c01 + nop + FMUL a3, b3, t1 + LDF [BO - 3 * SIZE], b2 + + FADD c02, t2, c02 + nop + FMUL a3, b4, t2 + LDF [AO + 6 * SIZE], a3 + + FADD c03, t3, c03 + nop + FMUL a4, b3, t3 + LDF [BO - 2 * SIZE], b3 + + FADD c04, t4, c04 + nop + FMUL a4, b4, t4 + LDF [AO + 7 * SIZE], a4 + + FADD c01, t1, c01 + nop + FMUL a1, b1, t1 + LDF [BO - 1 * SIZE], b4 + + FADD c02, t2, c02 + FMUL a1, b2, t2 + LDF [AO + 8 * SIZE], a1 + + FADD c03, t3, c03 + FMUL a2, b1, t3 + LDF [BO + 0 * SIZE], b1 + + FADD c04, t4, c04 + FMUL a2, b2, t4 + LDF [AO + 9 * SIZE], a2 + + FADD c01, t1, c01 + FMUL a3, b3, t1 + LDF [BO + 1 * SIZE], b2 + + FADD c02, t2, c02 + FMUL a3, b4, t2 + LDF [AO + 10 * SIZE], a3 + + FADD c03, t3, c03 + FMUL a4, b3, t3 + LDF [BO + 2 * 
SIZE], b3 + + FADD c04, t4, c04 + FMUL a4, b4, t4 + LDF [AO + 11 * SIZE], a4 + + add AO, 8 * SIZE, AO + bg,pt %icc, .LL152 + LDF [BO + 3 * SIZE], b4 + +.LL155: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL159 + nop + +.LL156: + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c03, t3, c03 + FADD c04, t4, c04 + + FMUL a1, b1, t1 + FMUL a1, b2, t2 + FMUL a2, b1, t3 + FMUL a2, b2, t4 + + add AO, 2 * SIZE, AO + add BO, 2 * SIZE, BO + + add L, -1, L + cmp L, 0 + bg,pt %icc, .LL156 + nop + +.LL159: + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c03, t3, c03 + FADD c04, t4, c04 + +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 2, TEMP1 +#else + sub KK, 2, TEMP1 +#endif + sll TEMP1, 1 + BASE_SHIFT, TEMP2 + sll TEMP1, 1 + BASE_SHIFT, TEMP1 + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 + FSUB a3, c02, c02 + FSUB a4, c04, c04 +#endif + +#ifdef LN + LDF [AO + 3 * SIZE], a1 + LDF [AO + 2 * SIZE], a2 + LDF [AO + 0 * SIZE], a3 + + FMUL a1, c03, c03 + FMUL a1, c04, c04 + FMUL a2, c03, t1 + FMUL a2, c04, t2 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + FMUL a3, c01, c01 + FMUL a3, c02, c02 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + + FMUL a2, c01, t1 + FMUL a2, c02, t2 + + FSUB c03, t1, c03 + FSUB c04, t2, c04 + + FMUL a3, c03, c03 + FMUL a3, c04, c04 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + FMUL a1, c03, c03 + FMUL a2, c01, t1 + FMUL a2, c03, t2 + + FSUB c02, t1, c02 + FSUB c04, t2, c04 + FMUL a3, c02, c02 + FMUL a3, c04, c04 +#endif + +#ifdef RT + LDF [BO + 3 * SIZE], a1 + LDF [BO + 2 * SIZE], a2 + LDF [BO + 0 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a1, c04, c04 + + FMUL a2, c02, t1 + FMUL a2, c04, t2 + FSUB c01, t1, c01 + FSUB c03, t2, c03 + + FMUL a3, c01, c01 + FMUL a3, c03, c03 +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 + add C2, -2 * SIZE, C2 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] + STF c03, [BO + 2 * SIZE] + STF c04, [BO + 3 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c03, [AO + 1 * SIZE] + STF c02, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c03, [C1 + 1 * SIZE] + STF c02, [C2 + 0 * SIZE] + STF c04, [C2 + 1 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 2 * SIZE, C1 + add C2, 2 * SIZE, C2 +#endif + +#ifdef RT + sll K, 1 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 1 + BASE_SHIFT, TEMP2 + sll TEMP1, 1 + BASE_SHIFT, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 2, KK +#endif + +#ifdef LN + sub KK, 2, KK +#endif + + +.LL170: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL199 + nop + + +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 0 + BASE_SHIFT, TEMP1 + sub AORIG, TEMP1, 
AORIG +#endif + + sll KK, 0 + BASE_SHIFT, TEMP1 + sll KK, 1 + BASE_SHIFT, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO + + sub K, KK, TEMP1 + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c01 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t1 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c02 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t2 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, t3 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, t4 + + ble,pn %icc, .LL175 + nop + +.LL172: + FADD c01, t1, c01 + add AO, 4 * SIZE, AO + FMUL a1, b1, t1 + LDF [BO + 4 * SIZE], b1 + + FADD c02, t2, c02 + FMUL a1, b2, t2 + LDF [BO + 5 * SIZE], b2 + + add L, -1, L + LDF [AO + 0 * SIZE], a1 + + FADD c03, t3, c03 + cmp L, 0 + FMUL a2, b3, t3 + LDF [BO + 6 * SIZE], b3 + + FADD c04, t4, c04 + FMUL a2, b4, t4 + LDF [BO + 7 * SIZE], b4 + LDF [AO + 1 * SIZE], a2 + + FADD c01, t1, c01 + FMUL a3, b1, t1 + LDF [BO + 8 * SIZE], b1 + + FADD c02, t2, c02 + FMUL a3, b2, t2 + LDF [BO + 9 * SIZE], b2 + LDF [AO + 2 * SIZE], a3 + + FADD c03, t3, c03 + FMUL a4, b3, t3 + LDF [BO + 10 * SIZE], b3 + FADD c04, t4, c04 + FMUL a4, b4, t4 + LDF [BO + 11 * SIZE], b4 + add BO, 8 * SIZE, BO + + bg,pt %icc, .LL172 + LDF [AO + 3 * SIZE], a4 + +.LL175: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL179 + nop + +.LL176: + FADD c01, t1, c01 + add L, -1, L + FMUL a1, b1, t1 + add AO, 1 * SIZE, AO + LDF [BO + 2 * SIZE], b1 + FADD c02, t2, c02 + cmp L, 0 + FMUL a1, b2, t2 + LDF [BO + 3 * SIZE], b2 + + add BO, 2 * SIZE, BO + bg,pt %icc, .LL176 + LDF [AO + 0 * SIZE], a1 + +.LL179: + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c03, t3, c03 + FADD c04, t4, c04 + + FADD c01, c03, c01 + FADD c02, c04, c02 + + +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 1, TEMP1 +#else + sub KK, 2, TEMP1 +#endif + sll TEMP1, 0 + BASE_SHIFT, TEMP2 + sll TEMP1, 1 + BASE_SHIFT, TEMP1 + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 +#endif + +#ifdef LN + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + FMUL a2, c01, t1 + FSUB c02, t1, c02 + FMUL a3, c02, c02 +#endif + +#ifdef RT + LDF [BO + 3 * SIZE], a1 + LDF [BO + 2 * SIZE], a2 + LDF [BO + 0 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a2, c02, t1 + FSUB c01, t1, c01 + FMUL a3, c01, c01 +#endif + +#ifdef LN + add C1, -1 * SIZE, C1 + add C2, -1 * SIZE, C2 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C2 + 0 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 1 * SIZE, C1 + add C2, 1 * SIZE, C2 +#endif + +#ifdef RT + sll K, 0 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 0 + BASE_SHIFT, TEMP2 + sll TEMP1, 1 + BASE_SHIFT, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub 
KK, 1, KK +#endif + +.LL199: +#ifdef LN + sll K, 1 + BASE_SHIFT, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 2, KK +#endif + +#ifdef RT + sub KK, 2, KK +#endif + +.LL200: + sra N, 2, J + cmp J, 0 + ble,pn %icc, .LL999 + nop + +.LL11: +#ifdef RT + sll K, 2 + BASE_SHIFT, TEMP1 + sub B, TEMP1, B + + sll LDC, 2, TEMP1 + sub C, TEMP1, C +#endif + + add C, LDC, C2 + FMOV FZERO, t1 + nop + mov C, C1 + + add C2, LDC, C3 + FMOV FZERO, t2 + nop + mov A, AO + + sra M, 2, I + add C3, LDC, C4 + FMOV FZERO, t3 + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + cmp I, 0 +#ifndef RT + add C4, LDC, C +#endif + FMOV FZERO, t4 + + ble,pn %icc, .LL50 + FMOV FZERO, c01 + +.LL21: + FMOV FZERO, c02 + FMOV FZERO, c03 + +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 2 + BASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 2 + BASE_SHIFT, TEMP1 + + add AORIG, TEMP1, AO + add B, TEMP1, BO + + sub K, KK, TEMP1 + + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c04 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, c05 + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c06 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, c07 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c08 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, c09 + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c10 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, c11 + LDF [BO + 4 * SIZE], b5 /* ***** */ + + LDF [AO + 4 * SIZE], a5 /* ***** */ + + prefetch [C1 + 3 * SIZE], 3 + FMOV FZERO, c12 + prefetch [C2 + 3 * SIZE], 3 + FMOV FZERO, c13 + prefetch [C3 + 3 * SIZE], 3 + FMOV FZERO, c14 + prefetch [C4 + 3 * SIZE], 3 + FMOV FZERO, c15 + + ble,pn %icc, .LL25 + FMOV FZERO, c16 + +.LL22: + FADD c04, t1, c04 + prefetch [AO + APREFETCHSIZE * SIZE], APREFETCH_CATEGORY + FMUL a1, b1, t1 + nop + + FADD c08, t2, c08 + prefetch [BO + BPREFETCHSIZE * SIZE], BPREFETCH_CATEGORY + FMUL a1, b2, t2 + add AO, 16 * SIZE, AO + + FADD c12, t3, c12 + LDF [AO - 13 * SIZE], a4 + FMUL a1, b3, t3 + add BO, 16 * SIZE, BO + + FADD c16, t4, c16 + nop + FMUL a1, b4, t4 + LDF [AO - 8 * SIZE], a1 + + FADD c01, t1, c01 + nop + FMUL a2, b1, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD c13, t4, c13 + add L, -1, L + FMUL a2, b4, t4 + LDF [AO - 11 * SIZE], a2 + + FADD c02, t1, c02 + nop + FMUL a3, b1, t1 + nop + + FADD c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO - 10 * SIZE], a3 + + FADD c03, t1, c03 + nop + FMUL a4, b1, t1 + LDF [BO - 8 * SIZE], b1 + + FADD c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO - 11 * SIZE], b2 + + FADD c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO - 10 * SIZE], b3 + + FADD c15, t4, c15 + nop + FMUL a4, b4, t4 + LDF [BO - 9 * SIZE], b4 + + FADD c04, t1, c04 + nop + FMUL a5, b5, t1 + LDF [AO - 9 * SIZE], a4 + + FADD c08, t2, c08 + nop + FMUL a5, b2, t2 + nop + + FADD c12, t3, c12 + nop + FMUL a5, b3, t3 + nop + + FADD c16, t4, c16 + nop + FMUL a5, b4, t4 + LDF [AO - 4 * SIZE], a5 + + FADD c01, t1, c01 + nop + FMUL a2, b5, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD c13, t4, c13 + nop + FMUL a2, b4, t4 + LDF [AO - 7 * SIZE], a2 + + FADD c02, t1, c02 + nop + FMUL a3, b5, t1 + nop + + FADD c06, t2, c06 + nop 
+ FMUL a3, b2, t2 + nop + + FADD c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO - 6 * SIZE], a3 + + FADD c03, t1, c03 + nop + FMUL a4, b5, t1 + LDF [BO - 4 * SIZE], b5 + + FADD c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO - 7 * SIZE], b2 + + FADD c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO - 6 * SIZE], b3 + + FADD c15, t4, c15 + nop + FMUL a4, b4, t4 + LDF [BO - 5 * SIZE], b4 + + FADD c04, t1, c04 + nop + FMUL a1, b1, t1 + LDF [AO - 5 * SIZE], a4 + + FADD c08, t2, c08 + nop + FMUL a1, b2, t2 + nop + + FADD c12, t3, c12 + nop + FMUL a1, b3, t3 + nop + + FADD c16, t4, c16 + nop + FMUL a1, b4, t4 + LDF [AO - 0 * SIZE], a1 + + FADD c01, t1, c01 + nop + FMUL a2, b1, t1 + nop + +#ifdef DOUBLE + prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY +#else + nop +#endif + FADD c05, t2, c05 + nop + FMUL a2, b2, t2 + + FADD c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD c13, t4, c13 + nop + FMUL a2, b4, t4 + nop + + FADD c02, t1, c02 + nop + FMUL a3, b1, t1 + LDF [AO - 3 * SIZE], a2 + + FADD c06, t2, c06 +#ifdef DOUBLE + prefetch [BO + (BPREFETCHSIZE + 8) * SIZE], BPREFETCH_CATEGORY +#else + nop +#endif + FMUL a3, b2, t2 + nop + + FADD c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO - 2 * SIZE], a3 + + FADD c03, t1, c03 + nop + FMUL a4, b1, t1 + LDF [BO - 0 * SIZE], b1 + + FADD c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO - 3 * SIZE], b2 + + FADD c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO - 2 * SIZE], b3 + + FADD c15, t4, c15 + nop + FMUL a4, b4, t4 + LDF [BO - 1 * SIZE], b4 + + FADD c04, t1, c04 + nop + FMUL a5, b5, t1 + LDF [AO - 1 * SIZE], a4 + + FADD c08, t2, c08 + FMUL a5, b2, t2 + FADD c12, t3, c12 + FMUL a5, b3, t3 + + FADD c16, t4, c16 + nop + FMUL a5, b4, t4 + LDF [AO + 4 * SIZE], a5 + + FADD c01, t1, c01 + nop + FMUL a2, b5, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD c13, t4, c13 + nop + FMUL a2, b4, t4 + LDF [AO + 1 * SIZE], a2 + + FADD c02, t1, c02 + nop + FMUL a3, b5, t1 + nop + + FADD c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO + 2 * SIZE], a3 + + FADD c03, t1, c03 + cmp L, 0 + FMUL a4, b5, t1 + LDF [BO + 4 * SIZE], b5 + + FADD c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD c15, t4, c15 + FMUL a4, b4, t4 + bg,pt %icc, .LL22 + LDF [BO + 3 * SIZE], b4 + +.LL25: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL29 + nop + +.LL26: + FADD c04, t1, c04 + LDF [AO + 3 * SIZE], a4 + FMUL a1, b1, t1 + add AO, 4 * SIZE, AO + + FADD c08, t2, c08 + add BO, 4 * SIZE, BO + FMUL a1, b2, t2 + add L, -1, L + + FADD c12, t3, c12 + nop + FMUL a1, b3, t3 + cmp L, 0 + + FADD c16, t4, c16 + nop + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + FADD c01, t1, c01 + nop + FMUL a2, b1, t1 + nop + + FADD c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD c13, t4, c13 + nop + FMUL a2, b4, t4 + LDF [AO + 1 * SIZE], a2 + + FADD c02, t1, c02 + nop + FMUL a3, b1, t1 + nop + + FADD c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO + 2 * SIZE], a3 + + FADD c03, t1, c03 + nop + FMUL a4, b1, t1 + LDF [BO + 0 * 
SIZE], b1 + + FADD c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD c15, t4, c15 + FMUL a4, b4, t4 + bg,pt %icc, .LL26 + LDF [BO + 3 * SIZE], b4 + +.LL29: +#if defined(LN) || defined(RT) + sub KK, 4, TEMP1 + sll TEMP1, 2 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AO + add B, TEMP1, BO +#endif + + FADD c04, t1, c04 + FADD c08, t2, c08 + FADD c12, t3, c12 + FADD c16, t4, c16 + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c05, c05 + FSUB a3, c09, c09 + FSUB a4, c13, c13 + + FSUB b1, c02, c02 + FSUB b2, c06, c06 + FSUB b3, c10, c10 + FSUB b4, c14, c14 + + LDF [BO + 8 * SIZE], a1 + LDF [BO + 9 * SIZE], a2 + LDF [BO + 10 * SIZE], a3 + LDF [BO + 11 * SIZE], a4 + + LDF [BO + 12 * SIZE], b1 + LDF [BO + 13 * SIZE], b2 + LDF [BO + 14 * SIZE], b3 + LDF [BO + 15 * SIZE], b4 + + FSUB a1, c03, c03 + FSUB a2, c07, c07 + FSUB a3, c11, c11 + FSUB a4, c15, c15 + + FSUB b1, c04, c04 + FSUB b2, c08, c08 + FSUB b3, c12, c12 + FSUB b4, c16, c16 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [AO + 4 * SIZE], b1 + LDF [AO + 5 * SIZE], b2 + LDF [AO + 6 * SIZE], b3 + LDF [AO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 + + FSUB b1, c05, c05 + FSUB b2, c06, c06 + FSUB b3, c07, c07 + FSUB b4, c08, c08 + + LDF [AO + 8 * SIZE], a1 + LDF [AO + 9 * SIZE], a2 + LDF [AO + 10 * SIZE], a3 + LDF [AO + 11 * SIZE], a4 + + LDF [AO + 12 * SIZE], b1 + LDF [AO + 13 * SIZE], b2 + LDF [AO + 14 * SIZE], b3 + LDF [AO + 15 * SIZE], b4 + + FSUB a1, c09, c09 + FSUB a2, c10, c10 + FSUB a3, c11, c11 + FSUB a4, c12, c12 + + FSUB b1, c13, c13 + FSUB b2, c14, c14 + FSUB b3, c15, c15 + FSUB b4, c16, c16 +#endif + +#ifdef LN + LDF [AO + 15 * SIZE], a1 + LDF [AO + 14 * SIZE], a2 + LDF [AO + 13 * SIZE], a3 + LDF [AO + 12 * SIZE], a4 + + FMUL a1, c04, c04 + FMUL a1, c08, c08 + FMUL a1, c12, c12 + FMUL a1, c16, c16 + + FMUL a2, c04, t1 + FMUL a2, c08, t2 + FMUL a2, c12, t3 + FMUL a2, c16, t4 + + FSUB c03, t1, c03 + FSUB c07, t2, c07 + FSUB c11, t3, c11 + FSUB c15, t4, c15 + + FMUL a3, c04, t1 + FMUL a3, c08, t2 + FMUL a3, c12, t3 + FMUL a3, c16, t4 + + FSUB c02, t1, c02 + FSUB c06, t2, c06 + FSUB c10, t3, c10 + FSUB c14, t4, c14 + + FMUL a4, c04, t1 + FMUL a4, c08, t2 + FMUL a4, c12, t3 + FMUL a4, c16, t4 + + FSUB c01, t1, c01 + FSUB c05, t2, c05 + FSUB c09, t3, c09 + FSUB c13, t4, c13 + + LDF [AO + 10 * SIZE], a1 + LDF [AO + 9 * SIZE], a2 + LDF [AO + 8 * SIZE], a3 + + FMUL a1, c03, c03 + FMUL a1, c07, c07 + FMUL a1, c11, c11 + FMUL a1, c15, c15 + + FMUL a2, c03, t1 + FMUL a2, c07, t2 + FMUL a2, c11, t3 + FMUL a2, c15, t4 + + FSUB c02, t1, c02 + FSUB c06, t2, c06 + FSUB c10, t3, c10 + FSUB c14, t4, c14 + + FMUL a3, c03, t1 + FMUL a3, c07, t2 + FMUL a3, c11, t3 + FMUL a3, c15, t4 + + FSUB c01, t1, c01 + FSUB c05, t2, c05 + FSUB c09, t3, c09 + FSUB c13, t4, c13 + + LDF [AO + 5 * SIZE], a1 + LDF [AO + 4 * SIZE], a2 + + FMUL a1, c02, c02 + FMUL a1, c06, c06 + FMUL a1, c10, c10 + FMUL a1, c14, c14 + + FMUL a2, c02, t1 + FMUL a2, c06, t2 + FMUL a2, c10, t3 + FMUL a2, c14, t4 + + FSUB c01, t1, c01 + FSUB c05, t2, c05 + FSUB c09, t3, c09 + FSUB c13, t4, c13 + + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, 
c05, c05 + FMUL a1, c09, c09 + FMUL a1, c13, c13 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FMUL a1, c01, c01 + FMUL a1, c05, c05 + FMUL a1, c09, c09 + FMUL a1, c13, c13 + + FMUL a2, c01, t1 + FMUL a2, c05, t2 + FMUL a2, c09, t3 + FMUL a2, c13, t4 + + FSUB c02, t1, c02 + FSUB c06, t2, c06 + FSUB c10, t3, c10 + FSUB c14, t4, c14 + + FMUL a3, c01, t1 + FMUL a3, c05, t2 + FMUL a3, c09, t3 + FMUL a3, c13, t4 + + FSUB c03, t1, c03 + FSUB c07, t2, c07 + FSUB c11, t3, c11 + FSUB c15, t4, c15 + + FMUL a4, c01, t1 + FMUL a4, c05, t2 + FMUL a4, c09, t3 + FMUL a4, c13, t4 + + FSUB c04, t1, c04 + FSUB c08, t2, c08 + FSUB c12, t3, c12 + FSUB c16, t4, c16 + + LDF [AO + 5 * SIZE], a1 + LDF [AO + 6 * SIZE], a2 + LDF [AO + 7 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a1, c06, c06 + FMUL a1, c10, c10 + FMUL a1, c14, c14 + + FMUL a2, c02, t1 + FMUL a2, c06, t2 + FMUL a2, c10, t3 + FMUL a2, c14, t4 + + FSUB c03, t1, c03 + FSUB c07, t2, c07 + FSUB c11, t3, c11 + FSUB c15, t4, c15 + + FMUL a3, c02, t1 + FMUL a3, c06, t2 + FMUL a3, c10, t3 + FMUL a3, c14, t4 + + FSUB c04, t1, c04 + FSUB c08, t2, c08 + FSUB c12, t3, c12 + FSUB c16, t4, c16 + + LDF [AO + 10 * SIZE], a1 + LDF [AO + 11 * SIZE], a2 + + FMUL a1, c03, c03 + FMUL a1, c07, c07 + FMUL a1, c11, c11 + FMUL a1, c15, c15 + + FMUL a2, c03, t1 + FMUL a2, c07, t2 + FMUL a2, c11, t3 + FMUL a2, c15, t4 + + FSUB c04, t1, c04 + FSUB c08, t2, c08 + FSUB c12, t3, c12 + FSUB c16, t4, c16 + + LDF [AO + 15 * SIZE], a1 + + FMUL a1, c04, c04 + FMUL a1, c08, c08 + FMUL a1, c12, c12 + FMUL a1, c16, c16 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + FMUL a1, c03, c03 + FMUL a1, c04, c04 + + FMUL a2, c01, t1 + FMUL a2, c02, t2 + FMUL a2, c03, t3 + FMUL a2, c04, t4 + + FSUB c05, t1, c05 + FSUB c06, t2, c06 + FSUB c07, t3, c07 + FSUB c08, t4, c08 + + FMUL a3, c01, t1 + FMUL a3, c02, t2 + FMUL a3, c03, t3 + FMUL a3, c04, t4 + + FSUB c09, t1, c09 + FSUB c10, t2, c10 + FSUB c11, t3, c11 + FSUB c12, t4, c12 + + FMUL a4, c01, t1 + FMUL a4, c02, t2 + FMUL a4, c03, t3 + FMUL a4, c04, t4 + + FSUB c13, t1, c13 + FSUB c14, t2, c14 + FSUB c15, t3, c15 + FSUB c16, t4, c16 + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 6 * SIZE], a2 + LDF [BO + 7 * SIZE], a3 + + FMUL a1, c05, c05 + FMUL a1, c06, c06 + FMUL a1, c07, c07 + FMUL a1, c08, c08 + + FMUL a2, c05, t1 + FMUL a2, c06, t2 + FMUL a2, c07, t3 + FMUL a2, c08, t4 + + FSUB c09, t1, c09 + FSUB c10, t2, c10 + FSUB c11, t3, c11 + FSUB c12, t4, c12 + + FMUL a3, c05, t1 + FMUL a3, c06, t2 + FMUL a3, c07, t3 + FMUL a3, c08, t4 + + FSUB c13, t1, c13 + FSUB c14, t2, c14 + FSUB c15, t3, c15 + FSUB c16, t4, c16 + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 11 * SIZE], a2 + + FMUL a1, c09, c09 + FMUL a1, c10, c10 + FMUL a1, c11, c11 + FMUL a1, c12, c12 + + FMUL a2, c09, t1 + FMUL a2, c10, t2 + FMUL a2, c11, t3 + FMUL a2, c12, t4 + + FSUB c13, t1, c13 + FSUB c14, t2, c14 + FSUB c15, t3, c15 + FSUB c16, t4, c16 + + LDF [BO + 15 * SIZE], a1 + + FMUL a1, c13, c13 + FMUL a1, c14, c14 + FMUL a1, c15, c15 + FMUL a1, c16, c16 +#endif + +#ifdef RT + LDF [BO + 15 * SIZE], a1 + LDF [BO + 14 * SIZE], a2 + LDF [BO + 13 * SIZE], a3 + LDF [BO + 12 * SIZE], a4 + + FMUL a1, c13, c13 + FMUL a1, c14, c14 + FMUL a1, c15, c15 + FMUL a1, c16, c16 + + FMUL a2, c13, t1 + FMUL a2, c14, t2 + FMUL a2, c15, t3 + FMUL a2, c16, t4 + + FSUB c09, t1, c09 + FSUB c10, t2, c10 + FSUB c11, t3, 
c11 + FSUB c12, t4, c12 + + FMUL a3, c13, t1 + FMUL a3, c14, t2 + FMUL a3, c15, t3 + FMUL a3, c16, t4 + + FSUB c05, t1, c05 + FSUB c06, t2, c06 + FSUB c07, t3, c07 + FSUB c08, t4, c08 + + FMUL a4, c13, t1 + FMUL a4, c14, t2 + FMUL a4, c15, t3 + FMUL a4, c16, t4 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + FSUB c03, t3, c03 + FSUB c04, t4, c04 + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 9 * SIZE], a2 + LDF [BO + 8 * SIZE], a3 + + FMUL a1, c09, c09 + FMUL a1, c10, c10 + FMUL a1, c11, c11 + FMUL a1, c12, c12 + + FMUL a2, c09, t1 + FMUL a2, c10, t2 + FMUL a2, c11, t3 + FMUL a2, c12, t4 + + FSUB c05, t1, c05 + FSUB c06, t2, c06 + FSUB c07, t3, c07 + FSUB c08, t4, c08 + + FMUL a3, c09, t1 + FMUL a3, c10, t2 + FMUL a3, c11, t3 + FMUL a3, c12, t4 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + FSUB c03, t3, c03 + FSUB c04, t4, c04 + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 4 * SIZE], a2 + + FMUL a1, c05, c05 + FMUL a1, c06, c06 + FMUL a1, c07, c07 + FMUL a1, c08, c08 + + FMUL a2, c05, t1 + FMUL a2, c06, t2 + FMUL a2, c07, t3 + FMUL a2, c08, t4 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + FSUB c03, t3, c03 + FSUB c04, t4, c04 + + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + FMUL a1, c03, c03 + FMUL a1, c04, c04 +#endif + +#ifdef LN + add C1, -4 * SIZE, C1 + add C2, -4 * SIZE, C2 + add C3, -4 * SIZE, C3 + add C4, -4 * SIZE, C4 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c05, [BO + 1 * SIZE] + STF c09, [BO + 2 * SIZE] + STF c13, [BO + 3 * SIZE] + + STF c02, [BO + 4 * SIZE] + STF c06, [BO + 5 * SIZE] + STF c10, [BO + 6 * SIZE] + STF c14, [BO + 7 * SIZE] + + STF c03, [BO + 8 * SIZE] + STF c07, [BO + 9 * SIZE] + STF c11, [BO + 10 * SIZE] + STF c15, [BO + 11 * SIZE] + + STF c04, [BO + 12 * SIZE] + STF c08, [BO + 13 * SIZE] + STF c12, [BO + 14 * SIZE] + STF c16, [BO + 15 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] + + STF c05, [AO + 4 * SIZE] + STF c06, [AO + 5 * SIZE] + STF c07, [AO + 6 * SIZE] + STF c08, [AO + 7 * SIZE] + + STF c09, [AO + 8 * SIZE] + STF c10, [AO + 9 * SIZE] + STF c11, [AO + 10 * SIZE] + STF c12, [AO + 11 * SIZE] + + STF c13, [AO + 12 * SIZE] + STF c14, [AO + 13 * SIZE] + STF c15, [AO + 14 * SIZE] + STF c16, [AO + 15 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C1 + 2 * SIZE] + STF c04, [C1 + 3 * SIZE] + + STF c05, [C2 + 0 * SIZE] + STF c06, [C2 + 1 * SIZE] + STF c07, [C2 + 2 * SIZE] + STF c08, [C2 + 3 * SIZE] + + STF c09, [C3 + 0 * SIZE] + STF c10, [C3 + 1 * SIZE] + STF c11, [C3 + 2 * SIZE] + STF c12, [C3 + 3 * SIZE] + + STF c13, [C4 + 0 * SIZE] + STF c14, [C4 + 1 * SIZE] + STF c15, [C4 + 2 * SIZE] + STF c16, [C4 + 3 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 4 * SIZE, C1 + add C2, 4 * SIZE, C2 + add C3, 4 * SIZE, C3 + add C4, 4 * SIZE, C4 +#endif + +#ifdef RT + sll K, 2 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 2 + BASE_SHIFT, TEMP1 + add AO, TEMP1, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 4, KK +#endif + +#ifdef LN + sub KK, 4, KK +#endif + + add I, -1, I + cmp I, 0 + + sra K, 2, L + bg,pt %icc, .LL21 + FMOV FZERO, c01 + +.LL50: + and M, 2, I + cmp I, 0 + ble,pn %icc, .LL70 + nop + +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 1 + BASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 1 + BASE_SHIFT, TEMP1 + sll KK, 
2 + BASE_SHIFT, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO + + sub K, KK, TEMP1 + sra TEMP1, 2, L + cmp L, 0 +#endif + + FMOV FZERO, c02 + FMOV FZERO, t1 + FMOV FZERO, c04 + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, t2 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, c06 + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, t3 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, c08 + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, t4 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, c01 + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c03 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, c05 + + ble,pn %icc, .LL55 + FMOV FZERO, c07 + +.LL52: + FADD c02, t1, c02 + add AO, 8 * SIZE, AO + prefetch [AO + APREFETCHSIZE * SIZE], 0 + + FMUL a1, b1, t1 + add BO, 16 * SIZE, BO + + FADD c04, t2, c04 + add L, -1, L + FMUL a1, b2, t2 + + FADD c06, t3, c06 + cmp L, 0 + FMUL a1, b3, t3 + + FADD c08, t4, c08 + FMUL a1, b4, t4 + LDF [AO - 4 * SIZE], a1 + + FADD c01, t1, c01 + FMUL a2, b1, t1 + LDF [BO - 12 * SIZE], b1 + FADD c03, t2, c03 + FMUL a2, b2, t2 + LDF [BO - 11 * SIZE], b2 + + FADD c05, t3, c05 + FMUL a2, b3, t3 + LDF [BO - 10 * SIZE], b3 + FADD c07, t4, c07 + FMUL a2, b4, t4 + LDF [BO - 9 * SIZE], b4 + + FADD c02, t1, c02 + FMUL a3, b1, t1 + LDF [AO - 3 * SIZE], a2 + FADD c04, t2, c04 + FMUL a3, b2, t2 + + FADD c06, t3, c06 + FMUL a3, b3, t3 + FADD c08, t4, c08 + FMUL a3, b4, t4 + LDF [AO - 2 * SIZE], a3 + + FADD c01, t1, c01 + FMUL a4, b1, t1 + LDF [BO - 8 * SIZE], b1 + FADD c03, t2, c03 + FMUL a4, b2, t2 + LDF [BO - 7 * SIZE], b2 + + FADD c05, t3, c05 + FMUL a4, b3, t3 + LDF [BO - 6 * SIZE], b3 + FADD c07, t4, c07 + FMUL a4, b4, t4 + LDF [BO - 5 * SIZE], b4 + + FADD c02, t1, c02 + FMUL a1, b1, t1 + LDF [AO - 1 * SIZE], a4 + FADD c04, t2, c04 + FMUL a1, b2, t2 + + FADD c06, t3, c06 + FMUL a1, b3, t3 + FADD c08, t4, c08 + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + FADD c01, t1, c01 + FMUL a2, b1, t1 + LDF [BO - 4 * SIZE], b1 + + FADD c03, t2, c03 + FMUL a2, b2, t2 + LDF [BO - 3 * SIZE], b2 + + FADD c05, t3, c05 + FMUL a2, b3, t3 + LDF [BO - 2 * SIZE], b3 + FADD c07, t4, c07 + FMUL a2, b4, t4 + LDF [BO - 1 * SIZE], b4 + + FADD c02, t1, c02 + FMUL a3, b1, t1 + LDF [AO + 1 * SIZE], a2 + FADD c04, t2, c04 + FMUL a3, b2, t2 + + FADD c06, t3, c06 + FMUL a3, b3, t3 + FADD c08, t4, c08 + FMUL a3, b4, t4 + LDF [AO + 2 * SIZE], a3 + + FADD c01, t1, c01 + FMUL a4, b1, t1 + LDF [BO + 0 * SIZE], b1 + FADD c03, t2, c03 + FMUL a4, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD c05, t3, c05 + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + FADD c07, t4, c07 + FMUL a4, b4, t4 + LDF [BO + 3 * SIZE], b4 + + bg,pt %icc, .LL52 + LDF [AO + 3 * SIZE], a4 + +.LL55: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + FADD c02, t1, c02 + add AO, 2 * SIZE, AO + FMUL a1, b1, t1 + add L, -1, L + + add BO, 4 * SIZE, BO + FADD c04, t2, c04 + cmp L, 0 + FMUL a1, b2, t2 + + FADD c06, t3, c06 + FMUL a1, b3, t3 + FADD c08, t4, c08 + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + FADD c01, t1, c01 + FMUL a2, b1, t1 + LDF [BO + 0 * SIZE], b1 + FADD c03, t2, c03 + FMUL a2, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD c05, t3, c05 + FMUL a2, b3, t3 + LDF [BO + 2 * SIZE], b3 + FADD c07, t4, c07 + FMUL a2, b4, t4 + LDF [BO + 3 * SIZE], b4 + + bg,pt %icc, .LL56 + LDF [AO + 1 * SIZE], a2 + +.LL59: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 2, TEMP1 +#else + sub KK, 4, TEMP1 +#endif + sll TEMP1, 1 + BASE_SHIFT, TEMP2 + sll TEMP1, 2 + BASE_SHIFT, TEMP1 + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + + FADD c02, t1, c02 + 
FADD c04, t2, c04 + FADD c06, t3, c06 + FADD c08, t4, c08 + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 + FSUB a3, c05, c05 + FSUB a4, c07, c07 + + FSUB b1, c02, c02 + FSUB b2, c04, c04 + FSUB b3, c06, c06 + FSUB b4, c08, c08 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [AO + 4 * SIZE], b1 + LDF [AO + 5 * SIZE], b2 + LDF [AO + 6 * SIZE], b3 + LDF [AO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 + + FSUB b1, c05, c05 + FSUB b2, c06, c06 + FSUB b3, c07, c07 + FSUB b4, c08, c08 +#endif + +#ifdef LN + LDF [AO + 3 * SIZE], a1 + LDF [AO + 2 * SIZE], a2 + LDF [AO + 0 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a1, c04, c04 + FMUL a1, c06, c06 + FMUL a1, c08, c08 + + FMUL a2, c02, t1 + FMUL a2, c04, t2 + FMUL a2, c06, t3 + FMUL a2, c08, t4 + + FSUB c01, t1, c01 + FSUB c03, t2, c03 + FSUB c05, t3, c05 + FSUB c07, t4, c07 + + FMUL a3, c01, c01 + FMUL a3, c03, c03 + FMUL a3, c05, c05 + FMUL a3, c07, c07 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + FMUL a1, c03, c03 + FMUL a1, c05, c05 + FMUL a1, c07, c07 + + FMUL a2, c01, t1 + FMUL a2, c03, t2 + FMUL a2, c05, t3 + FMUL a2, c07, t4 + + FSUB c02, t1, c02 + FSUB c04, t2, c04 + FSUB c06, t3, c06 + FSUB c08, t4, c08 + + FMUL a3, c02, c02 + FMUL a3, c04, c04 + FMUL a3, c06, c06 + FMUL a3, c08, c08 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + + FMUL a2, c01, t1 + FMUL a2, c02, t2 + + FSUB c03, t1, c03 + FSUB c04, t2, c04 + + FMUL a3, c01, t1 + FMUL a3, c02, t2 + + FSUB c05, t1, c05 + FSUB c06, t2, c06 + + FMUL a4, c01, t1 + FMUL a4, c02, t2 + + FSUB c07, t1, c07 + FSUB c08, t2, c08 + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 6 * SIZE], a2 + LDF [BO + 7 * SIZE], a3 + + FMUL a1, c03, c03 + FMUL a1, c04, c04 + + FMUL a2, c03, t1 + FMUL a2, c04, t2 + + FSUB c05, t1, c05 + FSUB c06, t2, c06 + + FMUL a3, c03, t1 + FMUL a3, c04, t2 + + FSUB c07, t1, c07 + FSUB c08, t2, c08 + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 11 * SIZE], a2 + + FMUL a1, c05, c05 + FMUL a1, c06, c06 + + FMUL a2, c05, t1 + FMUL a2, c06, t2 + + FSUB c07, t1, c07 + FSUB c08, t2, c08 + + LDF [BO + 15 * SIZE], a1 + + FMUL a1, c07, c07 + FMUL a1, c08, c08 +#endif + +#ifdef RT + LDF [BO + 15 * SIZE], a1 + LDF [BO + 14 * SIZE], a2 + LDF [BO + 13 * SIZE], a3 + LDF [BO + 12 * SIZE], a4 + + FMUL a1, c07, c07 + FMUL a1, c08, c08 + + FMUL a2, c07, t1 + FMUL a2, c08, t2 + + FSUB c05, t1, c05 + FSUB c06, t2, c06 + + FMUL a3, c07, t1 + FMUL a3, c08, t2 + + FSUB c03, t1, c03 + FSUB c04, t2, c04 + + FMUL a4, c07, t1 + FMUL a4, c08, t2 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 9 * SIZE], a2 + LDF [BO + 8 * SIZE], a3 + + FMUL a1, c05, c05 + FMUL a1, c06, c06 + + FMUL a2, c05, t1 + FMUL a2, c06, t2 + + FSUB c03, t1, c03 + FSUB c04, t2, c04 + + FMUL a3, c05, t1 + FMUL a3, c06, t2 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 4 * SIZE], a2 + + FMUL a1, c03, c03 + FMUL a1, c04, c04 + + FMUL a2, c03, t1 + FMUL a2, c04, t2 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + + LDF [BO + 0 * SIZE], a1 + 
+ FMUL a1, c01, c01 + FMUL a1, c02, c02 +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 + add C2, -2 * SIZE, C2 + add C3, -2 * SIZE, C3 + add C4, -2 * SIZE, C4 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c03, [BO + 1 * SIZE] + STF c05, [BO + 2 * SIZE] + STF c07, [BO + 3 * SIZE] + + STF c02, [BO + 4 * SIZE] + STF c04, [BO + 5 * SIZE] + STF c06, [BO + 6 * SIZE] + STF c08, [BO + 7 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] + + STF c05, [AO + 4 * SIZE] + STF c06, [AO + 5 * SIZE] + STF c07, [AO + 6 * SIZE] + STF c08, [AO + 7 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C2 + 0 * SIZE] + STF c04, [C2 + 1 * SIZE] + + STF c05, [C3 + 0 * SIZE] + STF c06, [C3 + 1 * SIZE] + STF c07, [C4 + 0 * SIZE] + STF c08, [C4 + 1 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 2 * SIZE, C1 + add C2, 2 * SIZE, C2 + add C3, 2 * SIZE, C3 + add C4, 2 * SIZE, C4 +#endif + +#ifdef RT + sll K, 1 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 1 + BASE_SHIFT, TEMP2 + sll TEMP1, 2 + BASE_SHIFT, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 2, KK +#endif + +#ifdef LN + sub KK, 2, KK +#endif + +.LL70: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL99 + nop + +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 0 + BASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 0 + BASE_SHIFT, TEMP1 + sll KK, 2 + BASE_SHIFT, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO + + sub K, KK, TEMP1 + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c01 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t1 + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c02 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t2 + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, t3 + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, t4 + + ble,pn %icc, .LL75 + nop + +.LL72: + FADD c01, t1, c01 + add L, -1, L + FMUL a1, b1, t1 + LDF [BO + 4 * SIZE], b1 + + FADD c02, t2, c02 + cmp L, 0 + FMUL a1, b2, t2 + LDF [BO + 5 * SIZE], b2 + + FADD c03, t3, c03 + FMUL a1, b3, t3 + LDF [BO + 6 * SIZE], b3 + + FADD c04, t4, c04 + FMUL a1, b4, t4 + LDF [BO + 7 * SIZE], b4 + LDF [AO + 4 * SIZE], a1 + + FADD c01, t1, c01 + add AO, 4 * SIZE, AO + FMUL a2, b1, t1 + LDF [BO + 8 * SIZE], b1 + + FADD c02, t2, c02 + FMUL a2, b2, t2 + LDF [BO + 9 * SIZE], b2 + + FADD c03, t3, c03 + FMUL a2, b3, t3 + LDF [BO + 10 * SIZE], b3 + + FADD c04, t4, c04 + FMUL a2, b4, t4 + LDF [BO + 11 * SIZE], b4 + LDF [AO + 1 * SIZE], a2 + + FADD c01, t1, c01 + FMUL a3, b1, t1 + LDF [BO + 12 * SIZE], b1 + + FADD c02, t2, c02 + FMUL a3, b2, t2 + LDF [BO + 13 * SIZE], b2 + + FADD c03, t3, c03 + FMUL a3, b3, t3 + LDF [BO + 14 * SIZE], b3 + + FADD c04, t4, c04 + FMUL a3, b4, t4 + LDF [BO + 15 * SIZE], b4 + LDF [AO + 2 * SIZE], a3 + + FADD c01, t1, c01 + FMUL a4, b1, t1 + LDF [BO + 16 * SIZE], b1 + + FADD c02, t2, c02 + FMUL a4, b2, t2 + LDF [BO + 17 * SIZE], b2 + + FADD c03, t3, c03 + FMUL a4, b3, t3 + LDF [BO + 18 * SIZE], b3 + + FADD c04, t4, c04 + FMUL a4, b4, t4 + LDF [BO + 19 * SIZE], b4 + + add BO, 16 * SIZE, BO + bg,pt %icc, .LL72 + LDF [AO + 3 * SIZE], a4 + +.LL75: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL79 + nop + 
+.LL76: + FADD c01, t1, c01 + add AO, 1 * SIZE, AO + FMUL a1, b1, t1 + LDF [BO + 4 * SIZE], b1 + + FADD c02, t2, c02 + add L, -1, L + FMUL a1, b2, t2 + LDF [BO + 5 * SIZE], b2 + + FADD c03, t3, c03 + cmp L, 0 + FMUL a1, b3, t3 + LDF [BO + 6 * SIZE], b3 + + FADD c04, t4, c04 + add BO, 4 * SIZE, BO + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + bg,pt %icc, .LL76 + LDF [BO + 3 * SIZE], b4 + + +.LL79: + FADD c01, t1, c01 + FADD c02, t2, c02 + FADD c03, t3, c03 + FADD c04, t4, c04 + +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 1, TEMP1 +#else + sub KK, 4, TEMP1 +#endif + sll TEMP1, 0 + BASE_SHIFT, TEMP2 + sll TEMP1, 2 + BASE_SHIFT, TEMP1 + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 +#endif + +#ifdef LN + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + FMUL a1, c03, c03 + FMUL a1, c04, c04 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + FMUL a1, c03, c03 + FMUL a1, c04, c04 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FMUL a1, c01, c01 + FMUL a2, c01, t1 + FSUB c02, t1, c02 + FMUL a3, c01, t1 + FSUB c03, t1, c03 + FMUL a4, c01, t1 + FSUB c04, t1, c04 + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 6 * SIZE], a2 + LDF [BO + 7 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a2, c02, t1 + FSUB c03, t1, c03 + FMUL a3, c02, t1 + FSUB c04, t1, c04 + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 11 * SIZE], a2 + + FMUL a1, c03, c03 + FMUL a2, c03, t1 + FSUB c04, t1, c04 + + LDF [BO + 15 * SIZE], a1 + + FMUL a1, c04, c04 +#endif + +#ifdef RT + LDF [BO + 15 * SIZE], a1 + LDF [BO + 14 * SIZE], a2 + LDF [BO + 13 * SIZE], a3 + LDF [BO + 12 * SIZE], a4 + + FMUL a1, c04, c04 + FMUL a2, c04, t1 + FSUB c03, t1, c03 + FMUL a3, c04, t1 + FSUB c02, t1, c02 + FMUL a4, c04, t1 + FSUB c01, t1, c01 + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 9 * SIZE], a2 + LDF [BO + 8 * SIZE], a3 + + FMUL a1, c03, c03 + FMUL a2, c03, t1 + FSUB c02, t1, c02 + FMUL a3, c03, t1 + FSUB c01, t1, c01 + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 4 * SIZE], a2 + + FMUL a1, c02, c02 + FMUL a2, c02, t1 + FSUB c01, t1, c01 + + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 +#endif + +#ifdef LN + add C1, -1 * SIZE, C1 + add C2, -1 * SIZE, C2 + add C3, -1 * SIZE, C3 + add C4, -1 * SIZE, C4 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] + STF c03, [BO + 2 * SIZE] + STF c04, [BO + 3 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C2 + 0 * SIZE] + STF c03, [C3 + 0 * SIZE] + STF c04, [C4 + 0 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 1 * SIZE, C1 + add C2, 1 * SIZE, C2 + add C3, 1 * SIZE, C3 + add C4, 1 * SIZE, C4 +#endif + +#ifdef RT + sll K, 0 + BASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 0 + BASE_SHIFT, TEMP2 + sll TEMP1, 2 + BASE_SHIFT, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 
1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + +.LL99: +#ifdef LN + sll K, 2 + BASE_SHIFT, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 4, KK +#endif + +#ifdef RT + sub KK, 4, KK +#endif + + add J, -1, J + cmp J, 0 + bg,pt %icc, .LL11 + nop + + +.LL999: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/trsm_kernel_RT_2x8.S b/kernel/sparc/trsm_kernel_RT_2x8.S new file mode 100644 index 0000000000..c9f68abc09 --- /dev/null +++ b/kernel/sparc/trsm_kernel_RT_2x8.S @@ -0,0 +1,3896 @@ +/*********************************************************************/ +/* Copyright 2005-2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define APREFETCHSIZE 24 +#define APREFETCH_CATEGORY 0 + +#define M %i0 +#define N %i1 +#define K %i2 + +#if defined(DOUBLE) && !defined(__64BIT__) +#define A %i5 +#define B %i4 +#else +#define A %i4 +#define B %i5 +#endif + +#define C %o4 +#define LDC %o5 + +#define AO %l0 +#define BO %l1 +#define I %l2 +#define J %l3 +#define L %l4 + +#define C1 %o0 +#define C2 %o1 +#define C3 %o2 +#define C4 %o3 + +#define C5 %l5 +#define C6 %l6 +#define C7 %l7 +#define C8 %i3 + +#define OFFSET %g1 +#define KK %g2 +#define TEMP1 %g3 +#define TEMP2 %g4 +#define AORIG %o7 + +#ifdef DOUBLE +#define c01 %f0 +#define c02 %f2 +#define c03 %f4 +#define c04 %f6 +#define c05 %f8 +#define c06 %f10 +#define c07 %f12 +#define c08 %f14 +#define c09 %f16 +#define c10 %f18 +#define c11 %f20 +#define c12 %f22 +#define c13 %f24 +#define c14 %f26 +#define c15 %f28 +#define c16 %f30 + +#define a1 %f32 +#define a2 %f34 +#define a3 %f36 +#define a4 %f38 +#define a5 %f40 + +#define b1 %f42 +#define b2 %f44 +#define b3 %f46 +#define b4 %f48 +#define b5 %f50 +#define b6 %f52 +#define b7 %f54 +#define b8 %f56 +#define b9 %f58 + +#define cc01 0 +#define cc02 2 +#define cc03 4 +#define cc04 6 +#define cc05 8 +#define cc06 10 +#define cc07 12 +#define cc08 14 +#define cc09 16 +#define cc10 18 +#define cc11 20 +#define cc12 22 +#define cc13 24 +#define cc14 26 +#define cc15 28 +#define cc16 30 + +#define aa1 1 +#define aa2 3 +#define aa3 5 +#define aa4 7 +#define aa5 9 + +#define bb1 11 +#define bb2 13 +#define bb3 15 +#define bb4 17 +#define bb5 19 +#define bb6 21 +#define bb7 23 +#define bb8 25 +#define bb9 27 + +#else +#define c01 %f0 +#define c02 %f1 +#define c03 %f2 +#define c04 %f3 +#define c05 %f4 +#define c06 %f5 +#define c07 %f6 +#define c08 %f7 +#define c09 %f8 +#define c10 %f9 +#define c11 %f10 +#define c12 %f11 +#define c13 %f12 +#define c14 %f13 +#define c15 %f14 +#define c16 %f15 + +#define a1 %f16 +#define a2 %f17 +#define a3 %f18 +#define a4 %f19 +#define a5 %f20 + +#define b1 %f21 +#define b2 %f22 +#define b3 %f23 +#define b4 %f24 +#define b5 %f25 +#define b6 %f26 +#define b7 %f27 +#define b8 %f28 +#define b9 %f29 + +#define cc01 0 +#define cc02 1 +#define cc03 2 +#define cc04 3 +#define cc05 4 +#define cc06 5 +#define cc07 6 +#define cc08 7 +#define cc09 8 +#define cc10 9 +#define cc11 10 +#define cc12 11 +#define cc13 12 +#define cc14 13 +#define cc15 14 +#define cc16 15 + +#define aa1 16 +#define aa2 17 +#define aa3 18 +#define aa4 19 +#define aa5 20 + +#define bb1 21 +#define bb2 22 +#define bb3 23 +#define bb4 24 +#define bb5 25 +#define bb6 26 +#define bb7 27 +#define bb8 28 +#define bb9 29 + +#endif + + .register %g2, #scratch + .register %g3, #scratch + + PROLOGUE + SAVESP + nop + +#ifndef __64BIT__ + +#ifdef DOUBLE + ld [%sp + STACK_START + 28], B + ld [%sp + STACK_START + 32], C + ld [%sp + STACK_START + 36], LDC + ld [%sp + STACK_START + 40], OFFSET +#else + ld [%sp + STACK_START + 28], C + ld [%sp + STACK_START + 32], LDC + ld [%sp + STACK_START + 36], OFFSET +#endif + st %g1, [%sp + STACK_START + 8] + st %g2, [%sp + STACK_START + 12] + st %g3, [%sp + STACK_START + 16] + st %g4, [%sp + STACK_START + 20] +#else + + ldx [%sp+ STACK_START + 56], C + ldx [%sp+ STACK_START + 64], LDC + ldx [%sp+ STACK_START + 72], OFFSET + + stx %g1, [%sp + STACK_START + 32] + stx %g2, [%sp + STACK_START + 40] + stx %g3, [%sp + STACK_START + 48] + stx %g4, [%sp + STACK_START + 56] +#endif + +#if 
defined(TRMMKERNEL) && !defined(LEFT) + neg OFFSET, KK +#endif + + sll LDC, BASE_SHIFT, LDC + +#ifdef LN + smul M, K, TEMP1 + sll TEMP1, BASE_SHIFT, TEMP1 + add A, TEMP1, A + + sll M, BASE_SHIFT, TEMP1 + add C, TEMP1, C +#endif + +#ifdef RN + neg OFFSET, KK +#endif + +#ifdef RT + smul N, K, TEMP1 + sll TEMP1, BASE_SHIFT, TEMP1 + add B, TEMP1, B + + smul N, LDC, TEMP1 + add C, TEMP1, C + + sub N, OFFSET, KK +#endif + + and N, 1, J + cmp J, 0 + ble,pn %icc, .LL50 + nop + +#ifdef RT + sll K, BASE_SHIFT, TEMP1 + sub B, TEMP1, B +#endif + +#ifndef RT + mov C, C1 + add C1, LDC, C +#else + sub C, LDC, C1 + sub C, LDC, C +#endif + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + sra M, 1, I + cmp I, 0 + ble,pn %icc, .LL80 + nop + .align 4 + +.LL72: +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TEMP1 + sll KK, BASE_SHIFT + 0, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + FCLR (cc01) + LDF [BO + 3 * SIZE], b4 + FCLR (cc02) + + prefetch [C1 + 2 * SIZE], 3 + +#if defined(LT) || defined(RN) + sra KK, 2, L +#else + sub K, KK, L + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL75 + nop + +.LL73: + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + add L, -1, L + + FMADD (aa1, bb1, cc01, cc01) + LDF [AO + 4 * SIZE], a1 + FMADD (aa2, bb1, cc02, cc02) + LDF [AO + 5 * SIZE], a2 + + LDF [BO + 4 * SIZE], b1 + cmp L, 0 + + FMADD (aa3, bb2, cc01, cc01) + LDF [AO + 6 * SIZE], a3 + FMADD (aa4, bb2, cc02, cc02) + LDF [AO + 7 * SIZE], a4 + + LDF [BO + 5 * SIZE], b2 + add BO, 4 * SIZE, BO + + FMADD (aa1, bb3, cc01, cc01) + LDF [AO + 8 * SIZE], a1 + FMADD (aa2, bb3, cc02, cc02) + LDF [AO + 9 * SIZE], a2 + + LDF [BO + 2 * SIZE], b3 + add AO, 8 * SIZE, AO + + FMADD (aa3, bb4, cc01, cc01) + LDF [AO + 2 * SIZE], a3 + FMADD (aa4, bb4, cc02, cc02) + LDF [AO + 3 * SIZE], a4 + + bg,pt %icc, .LL73 + LDF [BO + 3 * SIZE], b4 + .align 4 + +.LL75: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + sub K, KK, L + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL78 + nop + .align 4 + +.LL77: + FMADD (aa1, bb1, cc01, cc01) + LDF [AO + 2 * SIZE], a1 + FMADD (aa2, bb1, cc02, cc02) + LDF [AO + 3 * SIZE], a2 + + LDF [BO + 1 * SIZE], b1 + add L, -1, L + add AO, 2 * SIZE, AO + cmp L, 0 + bg,pt %icc, .LL77 + add BO, 1 * SIZE, BO + .align 4 + +.LL78: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 2, TEMP1 +#else + sub KK, 1, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 0, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 +#endif + +#ifdef LN + LDF [AO + 3 * SIZE], a1 + LDF [AO + 2 * SIZE], a2 + LDF [AO + 0 * SIZE], a3 + + FMUL a1, c02, c02 + + FNMSUB (aa2, cc02, cc01, cc01) + + FMUL a3, c01, c01 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + + FNMSUB (aa2, cc01, cc02, cc02) + + FMUL a3, c02, c02 +#endif + +#if defined(RN) || defined(RT) + LDF [BO + 0 * SIZE], a1 + + 
FMUL a1, c01, c01 + FMUL a1, c02, c02 +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + +#ifndef LN + add C1, 2 * SIZE, C1 +#endif + +#ifdef RT + sll K, BASE_SHIFT + 1, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 0, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 2, KK +#endif + +#ifdef LN + sub KK, 2, KK +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL72 + nop + .align 4 + +.LL80: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL89 + nop + +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TEMP1 + sll KK, BASE_SHIFT + 0, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [BO + 0 * SIZE], b1 + LDF [AO + 1 * SIZE], a2 + LDF [BO + 1 * SIZE], b2 + LDF [AO + 2 * SIZE], a3 + LDF [BO + 2 * SIZE], b3 + LDF [AO + 3 * SIZE], a4 + LDF [BO + 3 * SIZE], b4 + +#if defined(LT) || defined(RN) + sra KK, 2, L +#else + sub K, KK, L + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL85 + FCLR (cc01) + .align 4 + +.LL83: + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + add L, -1, L + + FMADD (aa1, bb1, cc01, cc01) + LDF [AO + 4 * SIZE], a1 + LDF [BO + 4 * SIZE], b1 + + FMADD (aa2, bb2, cc01, cc01) + LDF [AO + 5 * SIZE], a2 + LDF [BO + 5 * SIZE], b2 + + FMADD (aa3, bb3, cc01, cc01) + LDF [AO + 6 * SIZE], a3 + LDF [BO + 6 * SIZE], b3 + + FMADD (aa4, bb4, cc01, cc01) + LDF [AO + 7 * SIZE], a4 + LDF [BO + 7 * SIZE], b4 + + add AO, 4 * SIZE, AO + cmp L, 0 + + bg,pt %icc, .LL83 + add BO, 4 * SIZE, BO + .align 4 + +.LL85: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + sub K, KK, L + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL88 + nop + .align 4 + +.LL87: + FMADD (aa1, bb1, cc01, cc01) + LDF [AO + 1 * SIZE], a1 + LDF [BO + 1 * SIZE], b1 + + add AO, 1 * SIZE, AO + add L, -1, L + cmp L, 0 + bg,pt %icc, .LL87 + add BO, 1 * SIZE, BO + .align 4 + +.LL88: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 1, TEMP1 +#else + sub KK, 1, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 0, TEMP2 + sll TEMP1, BASE_SHIFT + 0, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + + FSUB a1, c01, c01 +#else + LDF [AO + 0 * SIZE], a1 + + FSUB a1, c01, c01 +#endif + +#if defined(LN) || defined(LT) + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 +#endif + +#if defined(RN) || defined(RT) + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 +#endif + +#ifdef LN + add C1, -1 * SIZE, C1 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] +#else + STF c01, [AO + 0 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + +#ifdef RT + sll K, BASE_SHIFT + 0, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, BASE_SHIFT + 0, TEMP2 + sll TEMP1, BASE_SHIFT + 0, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + .align 4 + +.LL89: +#ifdef LN + sll K, BASE_SHIFT, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 1, KK +#endif + +#ifdef RT + sub KK, 1, KK +#endif + .align 4 
+ +.LL50: + and N, 2, J + cmp J, 0 + ble,pn %icc, .LL30 + nop + +#ifdef RT + sll K, BASE_SHIFT + 1, TEMP1 + sub B, TEMP1, B +#endif + +#ifndef RT + mov C, C1 + add C, LDC, C2 + add C2, LDC, C +#else + sub C, LDC, C2 + sub C2, LDC, C1 + sub C2, LDC, C +#endif + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + sra M, 1, I + cmp I, 0 + ble,pn %icc, .LL60 + nop + .align 4 + +.LL52: +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TEMP1 + sll KK, BASE_SHIFT + 1, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + FCLR (cc01) + LDF [BO + 3 * SIZE], b4 + FCLR (cc02) + + LDF [BO + 4 * SIZE], b5 + FCLR (cc03) + LDF [BO + 5 * SIZE], b6 + FCLR (cc04) + LDF [BO + 6 * SIZE], b7 + FCLR (cc05) + LDF [BO + 7 * SIZE], b8 + FCLR (cc06) + + prefetch [C1 + 2 * SIZE], 3 + FCLR (cc07) + prefetch [C2 + 2 * SIZE], 3 + FCLR (cc08) + +#if defined(LT) || defined(RN) + sra KK, 2, L +#else + sub K, KK, L + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL55 + nop + .align 4 + +.LL53: + FMADD (aa1, bb1, cc01, cc01) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + FMADD (aa2, bb1, cc02, cc02) + LDF [BO + 8 * SIZE], b1 + + FMADD (aa1, bb2, cc03, cc03) + LDF [AO + 4 * SIZE], a1 + FMADD (aa2, bb2, cc04, cc04) + LDF [AO + 5 * SIZE], a2 + + FMADD (aa3, bb3, cc01, cc01) + LDF [BO + 9 * SIZE], b2 + FMADD (aa4, bb3, cc02, cc02) + LDF [BO + 10 * SIZE], b3 + + FMADD (aa3, bb4, cc03, cc03) + LDF [AO + 6 * SIZE], a3 + FMADD (aa4, bb4, cc04, cc04) + LDF [AO + 7 * SIZE], a4 + + FMADD (aa1, bb5, cc01, cc01) + LDF [BO + 11 * SIZE], b4 + FMADD (aa2, bb5, cc02, cc02) + LDF [BO + 12 * SIZE], b5 + + FMADD (aa1, bb6, cc03, cc03) + LDF [AO + 8 * SIZE], a1 + FMADD (aa2, bb6, cc04, cc04) + LDF [AO + 9 * SIZE], a2 + + FMADD (aa3, bb7, cc01, cc01) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa4, bb7, cc02, cc02) + LDF [BO + 14 * SIZE], b7 + + FMADD (aa3, bb8, cc03, cc03) + LDF [AO + 10 * SIZE], a3 + FMADD (aa4, bb8, cc04, cc04) + LDF [AO + 11 * SIZE], a4 + + add AO, 8 * SIZE, AO + add L, -1, L + add BO, 8 * SIZE, BO + cmp L, 0 + + bg,pt %icc, .LL53 + LDF [BO + 7 * SIZE], b8 + .align 4 + +.LL55: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + sub K, KK, L + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL58 + nop + .align 4 + +.LL57: + FMADD (aa1, bb1, cc01, cc01) + add L, -1, L + FMADD (aa2, bb1, cc02, cc02) + LDF [BO + 2 * SIZE], b1 + + FMADD (aa1, bb2, cc03, cc03) + LDF [AO + 2 * SIZE], a1 + FMADD (aa2, bb2, cc04, cc04) + LDF [AO + 3 * SIZE], a2 + + add AO, 2 * SIZE, AO + cmp L, 0 + add BO, 2 * SIZE, BO + bg,pt %icc, .LL57 + LDF [BO + 1 * SIZE], b2 + .align 4 + +.LL58: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 2, TEMP1 +#else + sub KK, 2, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 1, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 + FSUB a3, c02, c02 + FSUB a4, c04, c04 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FSUB a1, 
c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 +#endif + +#ifdef LN + LDF [AO + 3 * SIZE], a1 + LDF [AO + 2 * SIZE], a2 + LDF [AO + 0 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a1, c04, c04 + + FNMSUB (aa2, cc02, cc01, cc01) + FNMSUB (aa2, cc04, cc03, cc03) + + FMUL a3, c01, c01 + FMUL a3, c03, c03 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + FMUL a1, c03, c03 + + FNMSUB (aa2, cc01, cc02, cc02) + FNMSUB (aa2, cc03, cc04, cc04) + + FMUL a3, c02, c02 + FMUL a3, c04, c04 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + + FNMSUB (aa2, cc01, cc03, cc03) + FNMSUB (aa2, cc02, cc04, cc04) + + LDF [BO + 3 * SIZE], a1 + + FMUL a1, c03, c03 + FMUL a1, c04, c04 +#endif + +#ifdef RT + LDF [BO + 3 * SIZE], a1 + LDF [BO + 2 * SIZE], a2 + + FMUL a1, c04, c04 + FMUL a1, c03, c03 + + FNMSUB (aa2, cc04, cc02, cc02) + FNMSUB (aa2, cc03, cc01, cc01) + + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c02, c02 + FMUL a1, c01, c01 +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 + add C2, -2 * SIZE, C2 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c03, [BO + 1 * SIZE] + STF c02, [BO + 2 * SIZE] + STF c04, [BO + 3 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C2 + 0 * SIZE] + STF c04, [C2 + 1 * SIZE] + +#ifndef LN + add C1, 2 * SIZE, C1 + add C2, 2 * SIZE, C2 +#endif + +#ifdef RT + sll K, BASE_SHIFT + 1, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 1, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 2, KK +#endif + +#ifdef LN + sub KK, 2, KK +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL52 + nop + .align 4 + +.LL60: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL69 + nop + +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TEMP1 + sll KK, BASE_SHIFT + 1, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + LDF [BO + 3 * SIZE], b4 + LDF [BO + 4 * SIZE], b5 + LDF [BO + 5 * SIZE], b6 + LDF [BO + 6 * SIZE], b7 + FCLR (cc01) + LDF [BO + 7 * SIZE], b8 + FCLR (cc03) + +#if defined(LT) || defined(RN) + sra KK, 2, L +#else + sub K, KK, L + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL65 + nop + .align 4 + +.LL63: + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + add L, -1, L + + FMADD (aa1, bb1, cc01, cc01) + LDF [BO + 8 * SIZE], b1 + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 9 * SIZE], b2 + + LDF [AO + 4 * SIZE], a1 + cmp L, 0 + + FMADD (aa2, bb3, cc01, cc01) + LDF [BO + 10 * SIZE], b3 + FMADD (aa2, bb4, cc03, cc03) + LDF [BO + 11 * SIZE], b4 + + LDF [AO + 5 * SIZE], a2 + add AO, 4 * SIZE, AO + + FMADD (aa3, bb5, cc01, cc01) + LDF [BO + 12 * SIZE], b5 + FMADD (aa3, bb6, cc03, cc03) + LDF [BO + 13 * SIZE], b6 + + LDF [AO + 2 * SIZE], a3 + add BO, 8 * SIZE, BO + + FMADD (aa4, bb7, cc01, cc01) + LDF [BO + 6 * SIZE], b7 + FMADD (aa4, bb8, cc03, cc03) + LDF [BO + 7 * SIZE], b8 + + bg,pt %icc, .LL63 + LDF [AO + 3 * SIZE], a4 + .align 4 + +.LL65: +#if 
defined(LT) || defined(RN) + and KK, 3, L +#else + sub K, KK, L + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL68 + nop + .align 4 + +.LL67: + FMADD (aa1, bb1, cc01, cc01) + LDF [BO + 2 * SIZE], b1 + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 3 * SIZE], b2 + + LDF [AO + 1 * SIZE], a1 + add L, -1, L + add AO, 1 * SIZE, AO + cmp L, 0 + + bg,pt %icc, .LL67 + add BO, 2 * SIZE, BO + .align 4 + +.LL68: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 1, TEMP1 +#else + sub KK, 2, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 0, TEMP2 + sll TEMP1, BASE_SHIFT + 1, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 +#endif + +#if defined(LN) || defined(LT) + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c03, c03 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FMUL a1, c01, c01 + + FNMSUB (aa2, cc01, cc03, cc03) + + LDF [BO + 3 * SIZE], a1 + + FMUL a1, c03, c03 +#endif + +#ifdef RT + LDF [BO + 3 * SIZE], a1 + LDF [BO + 2 * SIZE], a2 + + FMUL a1, c03, c03 + + FNMSUB (aa2, cc03, cc01, cc01) + + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 +#endif + +#ifdef LN + add C1, -1 * SIZE, C1 + add C2, -1 * SIZE, C2 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c03, [BO + 1 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c03, [AO + 1 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c03, [C2 + 0 * SIZE] + +#ifdef RT + sll K, BASE_SHIFT + 0, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, BASE_SHIFT + 0, TEMP2 + sll TEMP1, BASE_SHIFT + 1, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + .align 4 + +.LL69: +#ifdef LN + sll K, BASE_SHIFT + 1, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 2, KK +#endif + +#ifdef RT + sub KK, 2, KK +#endif + .align 4 + +.LL30: + and N, 4, J + cmp J, 0 + ble,pn %icc, .LL10 + nop + +#ifdef RT + sll K, BASE_SHIFT + 2, TEMP1 + sub B, TEMP1, B +#endif + +#ifndef RT + mov C, C1 + add C, LDC, C2 + add C2, LDC, C3 + add C3, LDC, C4 + add C4, LDC, C +#else + sub C, LDC, C4 + sub C4, LDC, C3 + sub C3, LDC, C2 + sub C2, LDC, C1 + sub C2, LDC, C +#endif + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + sra M, 1, I + cmp I, 0 + ble,pn %icc, .LL40 + nop + .align 4 + +.LL32: +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TEMP1 + sll KK, BASE_SHIFT + 2, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + LDF [BO + 3 * SIZE], b4 + LDF [BO + 4 * SIZE], b5 + + LDF [BO + 5 * SIZE], b6 + FCLR (cc01) + LDF [BO + 6 * SIZE], b7 + FCLR (cc02) + LDF [BO + 7 * SIZE], b8 + FCLR (cc03) + LDF [BO + 8 * SIZE], b9 + FCLR (cc04) + + prefetch [C1 + 2 * SIZE], 3 + FCLR (cc05) + prefetch [C2 + 2 * SIZE], 3 + FCLR (cc06) + prefetch [C3 + 2 * SIZE], 3 + FCLR (cc07) + prefetch [C4 + 2 * SIZE], 3 + FCLR (cc08) + +#if defined(LT) || defined(RN) + sra KK, 2, L +#else + 
sub K, KK, L + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL35 + nop + .align 4 + +.LL33: + FMADD (aa1, bb1, cc01, cc01) + LDF [AO + 2 * SIZE], a3 + FMADD (aa2, bb1, cc02, cc02) + LDF [AO + 3 * SIZE], a4 + + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 16 * SIZE], b1 + FMADD (aa2, bb2, cc04, cc04) + LDF [BO + 9 * SIZE], b2 + + FMADD (aa1, bb3, cc05, cc05) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + FMADD (aa2, bb3, cc06, cc06) + add L, -1, L + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD (aa3, bb5, cc01, cc01) + LDF [AO + 4 * SIZE], a1 + FMADD (aa4, bb5, cc02, cc02) + LDF [AO + 5 * SIZE], a2 + + FMADD (aa3, bb6, cc03, cc03) + LDF [BO + 12 * SIZE], b5 + FMADD (aa4, bb6, cc04, cc04) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa3, bb7, cc05, cc05) + cmp L, 0 + FMADD (aa4, bb7, cc06, cc06) + add AO, 8 * SIZE, AO + + FMADD (aa3, bb8, cc07, cc07) + LDF [BO + 14 * SIZE], b7 + FMADD (aa4, bb8, cc08, cc08) + LDF [BO + 15 * SIZE], b8 + + FMADD (aa1, bb9, cc01, cc01) + LDF [AO - 2 * SIZE], a3 + FMADD (aa2, bb9, cc02, cc02) + LDF [AO - 1 * SIZE], a4 + + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 24 * SIZE], b9 + FMADD (aa2, bb2, cc04, cc04) + LDF [BO + 17 * SIZE], b2 + + FMADD (aa1, bb3, cc05, cc05) + add BO, 16 * SIZE, BO + FMADD (aa2, bb3, cc06, cc06) + nop + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 2 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 3 * SIZE], b4 + + FMADD (aa3, bb5, cc01, cc01) + LDF [AO + 0 * SIZE], a1 + FMADD (aa4, bb5, cc02, cc02) + LDF [AO + 1 * SIZE], a2 + FMADD (aa3, bb6, cc03, cc03) + LDF [BO + 4 * SIZE], b5 + FMADD (aa4, bb6, cc04, cc04) + LDF [BO + 5 * SIZE], b6 + + FMADD (aa3, bb7, cc05, cc05) + nop + FMADD (aa4, bb7, cc06, cc06) + LDF [BO + 6 * SIZE], b7 + + FMADD (aa3, bb8, cc07, cc07) + FMADD (aa4, bb8, cc08, cc08) + bg,pt %icc, .LL33 + LDF [BO + 7 * SIZE], b8 + .align 4 + +.LL35: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + sub K, KK, L + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL38 + nop + .align 4 + +.LL37: + FMADD (aa1, bb1, cc01, cc01) + add L, -1, L + FMADD (aa2, bb1, cc02, cc02) + LDF [BO + 4 * SIZE], b1 + + FMADD (aa1, bb2, cc03, cc03) + add AO, 2 * SIZE, AO + FMADD (aa2, bb2, cc04, cc04) + LDF [BO + 5 * SIZE], b2 + + FMADD (aa1, bb3, cc05, cc05) + cmp L, 0 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 6 * SIZE], b3 + + FMADD (aa1, bb4, cc07, cc07) + LDF [AO + 0 * SIZE], a1 + FMADD (aa2, bb4, cc08, cc08) + LDF [AO + 1 * SIZE], a2 + + LDF [BO + 7 * SIZE], b4 + bg,pt %icc, .LL37 + add BO, 4 * SIZE, BO + .align 4 + +.LL38: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 2, TEMP1 +#else + sub KK, 4, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 2, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 + FSUB a3, c05, c05 + FSUB a4, c07, c07 + + FSUB b1, c02, c02 + FSUB b2, c04, c04 + FSUB b3, c06, c06 + FSUB b4, c08, c08 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [AO + 4 * SIZE], b1 + LDF [AO + 5 * SIZE], b2 + LDF [AO + 6 * SIZE], b3 + LDF [AO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 + + FSUB b1, 
c05, c05 + FSUB b2, c06, c06 + FSUB b3, c07, c07 + FSUB b4, c08, c08 + +#endif + +#ifdef LN + LDF [AO + 3 * SIZE], a1 + LDF [AO + 2 * SIZE], a2 + LDF [AO + 0 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a1, c04, c04 + FMUL a1, c06, c06 + FMUL a1, c08, c08 + + FNMSUB (aa2, cc02, cc01, cc01) + FNMSUB (aa2, cc04, cc03, cc03) + FNMSUB (aa2, cc06, cc05, cc05) + FNMSUB (aa2, cc08, cc07, cc07) + + FMUL a3, c01, c01 + FMUL a3, c03, c03 + FMUL a3, c05, c05 + FMUL a3, c07, c07 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + FMUL a1, c03, c03 + FMUL a1, c05, c05 + FMUL a1, c07, c07 + + FNMSUB (aa2, cc01, cc02, cc02) + FNMSUB (aa2, cc03, cc04, cc04) + FNMSUB (aa2, cc05, cc06, cc06) + FNMSUB (aa2, cc07, cc08, cc08) + + FMUL a3, c02, c02 + FMUL a3, c04, c04 + FMUL a3, c06, c06 + FMUL a3, c08, c08 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + + FNMSUB (aa2, cc01, cc03, cc03) + FNMSUB (aa2, cc02, cc04, cc04) + FNMSUB (aa3, cc01, cc05, cc05) + FNMSUB (aa3, cc02, cc06, cc06) + FNMSUB (aa4, cc01, cc07, cc07) + FNMSUB (aa4, cc02, cc08, cc08) + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 6 * SIZE], a2 + LDF [BO + 7 * SIZE], a3 + + FMUL a1, c03, c03 + FMUL a1, c04, c04 + + FNMSUB (aa2, cc03, cc05, cc05) + FNMSUB (aa2, cc04, cc06, cc06) + FNMSUB (aa3, cc03, cc07, cc07) + FNMSUB (aa3, cc04, cc08, cc08) + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 11 * SIZE], a2 + + FMUL a1, c05, c05 + FMUL a1, c06, c06 + + FNMSUB (aa2, cc05, cc07, cc07) + FNMSUB (aa2, cc06, cc08, cc08) + + LDF [BO + 15 * SIZE], a1 + + FMUL a1, c07, c07 + FMUL a1, c08, c08 +#endif + +#ifdef RT + LDF [BO + 15 * SIZE], a1 + LDF [BO + 14 * SIZE], a2 + LDF [BO + 13 * SIZE], a3 + LDF [BO + 12 * SIZE], a4 + + FMUL a1, c08, c08 + FMUL a1, c07, c07 + + FNMSUB (aa2, cc08, cc06, cc06) + FNMSUB (aa2, cc07, cc05, cc05) + FNMSUB (aa3, cc08, cc04, cc04) + FNMSUB (aa3, cc07, cc03, cc03) + FNMSUB (aa4, cc08, cc02, cc02) + FNMSUB (aa4, cc07, cc01, cc01) + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 9 * SIZE], a2 + LDF [BO + 8 * SIZE], a3 + + FMUL a1, c06, c06 + FMUL a1, c05, c05 + + FNMSUB (aa2, cc06, cc04, cc04) + FNMSUB (aa2, cc05, cc03, cc03) + FNMSUB (aa3, cc06, cc02, cc02) + FNMSUB (aa3, cc05, cc01, cc01) + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 4 * SIZE], a2 + + FMUL a1, c04, c04 + FMUL a1, c03, c03 + + FNMSUB (aa2, cc04, cc02, cc02) + FNMSUB (aa2, cc03, cc01, cc01) + + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c02, c02 + FMUL a1, c01, c01 +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 + add C2, -2 * SIZE, C2 + add C3, -2 * SIZE, C3 + add C4, -2 * SIZE, C4 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c03, [BO + 1 * SIZE] + STF c05, [BO + 2 * SIZE] + STF c07, [BO + 3 * SIZE] + + STF c02, [BO + 4 * SIZE] + STF c04, [BO + 5 * SIZE] + STF c06, [BO + 6 * SIZE] + STF c08, [BO + 7 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] + + STF c05, [AO + 4 * SIZE] + STF c06, [AO + 5 * SIZE] + STF c07, [AO + 6 * SIZE] + STF c08, [AO + 7 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C2 + 0 * SIZE] + STF c04, [C2 + 1 * SIZE] + + STF c05, [C3 + 0 * SIZE] + STF c06, [C3 + 1 * SIZE] + STF c07, [C4 + 0 * SIZE] + STF c08, [C4 + 1 * SIZE] + +#ifndef LN + add C1, 2 * SIZE, C1 + add C2, 2 * SIZE, C2 + add C3, 2 * SIZE, C3 + add C4, 2 * SIZE, C4 +#endif + +#ifdef RT + sll K, 
BASE_SHIFT + 1, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 2, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 2, KK +#endif + +#ifdef LN + sub KK, 2, KK +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL32 + nop + +.LL40: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL49 + nop + +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TEMP1 + sll KK, BASE_SHIFT + 2, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + LDF [BO + 3 * SIZE], b4 + LDF [BO + 4 * SIZE], b5 + LDF [BO + 5 * SIZE], b6 + FCLR (cc01) + LDF [BO + 6 * SIZE], b7 + FCLR (cc03) + LDF [BO + 7 * SIZE], b8 + FCLR (cc05) + LDF [BO + 8 * SIZE], b9 + FCLR (cc07) + +#if defined(LT) || defined(RN) + sra KK, 2, L +#else + sub K, KK, L + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL45 + nop + +.LL43: + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + add L, -1, L + + FMADD (aa1, bb1, cc01, cc01) + LDF [BO + 16 * SIZE], b1 + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 9 * SIZE], b2 + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 10 * SIZE], b3 + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 11 * SIZE], b4 + + LDF [AO + 4 * SIZE], a1 + cmp L, 0 + + FMADD (aa2, bb5, cc01, cc01) + LDF [BO + 12 * SIZE], b5 + FMADD (aa2, bb6, cc03, cc03) + LDF [BO + 13 * SIZE], b6 + FMADD (aa2, bb7, cc05, cc05) + LDF [BO + 14 * SIZE], b7 + FMADD (aa2, bb8, cc07, cc07) + LDF [BO + 15 * SIZE], b8 + + LDF [AO + 5 * SIZE], a2 + add AO, 4 * SIZE, AO + + FMADD (aa3, bb9, cc01, cc01) + LDF [BO + 24 * SIZE], b9 + FMADD (aa3, bb2, cc03, cc03) + LDF [BO + 17 * SIZE], b2 + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 18 * SIZE], b3 + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 19 * SIZE], b4 + + LDF [AO + 2 * SIZE], a3 + add BO, 16 * SIZE, BO + + FMADD (aa4, bb5, cc01, cc01) + LDF [BO + 4 * SIZE], b5 + FMADD (aa4, bb6, cc03, cc03) + LDF [BO + 5 * SIZE], b6 + FMADD (aa4, bb7, cc05, cc05) + LDF [BO + 6 * SIZE], b7 + FMADD (aa4, bb8, cc07, cc07) + LDF [BO + 7 * SIZE], b8 + + bg,pt %icc, .LL43 + LDF [AO + 3 * SIZE], a4 + .align 4 + +.LL45: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + sub K, KK, L + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL48 + nop + .align 4 + +.LL47: + FMADD (aa1, bb1, cc01, cc01) + LDF [BO + 4 * SIZE], b1 + add L, -1, L + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 5 * SIZE], b2 + add AO, 1 * SIZE, AO + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 6 * SIZE], b3 + cmp L, 0 + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 7 * SIZE], b4 + add BO, 4 * SIZE, BO + + bg,pt %icc, .LL47 + LDF [AO + 0 * SIZE], a1 + .align 4 + +.LL48: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 1, TEMP1 +#else + sub KK, 4, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 0, TEMP2 + sll TEMP1, BASE_SHIFT + 2, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 + FSUB a3, c05, c05 + FSUB a4, c07, c07 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 + FSUB a3, c05, 
c05 + FSUB a4, c07, c07 +#endif + +#if defined(LN) || defined(LT) + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c03, c03 + FMUL a1, c05, c05 + FMUL a1, c07, c07 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FMUL a1, c01, c01 + + FNMSUB (aa2, cc01, cc03, cc03) + FNMSUB (aa3, cc01, cc05, cc05) + FNMSUB (aa4, cc01, cc07, cc07) + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 6 * SIZE], a2 + LDF [BO + 7 * SIZE], a3 + + FMUL a1, c03, c03 + + FNMSUB (aa2, cc03, cc05, cc05) + FNMSUB (aa3, cc03, cc07, cc07) + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 11 * SIZE], a2 + + FMUL a1, c05, c05 + + FNMSUB (aa2, cc05, cc07, cc07) + + LDF [BO + 15 * SIZE], a1 + + FMUL a1, c07, c07 +#endif + +#ifdef RT + LDF [BO + 15 * SIZE], a1 + LDF [BO + 14 * SIZE], a2 + LDF [BO + 13 * SIZE], a3 + LDF [BO + 12 * SIZE], a4 + + FMUL a1, c07, c07 + + FNMSUB (aa2, cc07, cc05, cc05) + FNMSUB (aa3, cc07, cc03, cc03) + FNMSUB (aa4, cc07, cc01, cc01) + + LDF [BO + 10 * SIZE], a1 + LDF [BO + 9 * SIZE], a2 + LDF [BO + 8 * SIZE], a3 + + FMUL a1, c05, c05 + + FNMSUB (aa2, cc05, cc03, cc03) + FNMSUB (aa3, cc05, cc01, cc01) + + LDF [BO + 5 * SIZE], a1 + LDF [BO + 4 * SIZE], a2 + + FMUL a1, c03, c03 + + FNMSUB (aa2, cc03, cc01, cc01) + + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 +#endif + +#ifdef LN + add C1, -1 * SIZE, C1 + add C2, -1 * SIZE, C2 + add C3, -1 * SIZE, C3 + add C4, -1 * SIZE, C4 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c03, [BO + 1 * SIZE] + STF c05, [BO + 2 * SIZE] + STF c07, [BO + 3 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c03, [AO + 1 * SIZE] + STF c05, [AO + 2 * SIZE] + STF c07, [AO + 3 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c03, [C2 + 0 * SIZE] + STF c05, [C3 + 0 * SIZE] + STF c07, [C4 + 0 * SIZE] + +#ifdef RT + sll K, BASE_SHIFT + 0, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, BASE_SHIFT + 0, TEMP2 + sll TEMP1, BASE_SHIFT + 2, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + .align 4 + +.LL49: +#ifdef LN + sll K, BASE_SHIFT + 2, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 4, KK +#endif + +#ifdef RT + sub KK, 4, KK +#endif + .align 4 + +.LL10: + sra N, 3, J + cmp J, 0 + ble,pn %icc, .LL999 + nop + .align 4 + +.LL11: +#ifdef RT + sll K, BASE_SHIFT + 3, TEMP1 + sub B, TEMP1, B +#endif + +#ifndef RT + mov C, C1 + add C, LDC, C2 + add C2, LDC, C3 + add C3, LDC, C4 + add C4, LDC, C5 + add C5, LDC, C6 + add C6, LDC, C7 + add C7, LDC, C8 + add C8, LDC, C +#else + sub C, LDC, C8 + sub C8, LDC, C7 + sub C7, LDC, C6 + sub C6, LDC, C5 + sub C5, LDC, C4 + sub C4, LDC, C3 + sub C3, LDC, C2 + sub C2, LDC, C1 + sub C2, LDC, C +#endif + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + sra M, 1, I + cmp I, 0 + ble,pn %icc, .LL20 + nop + .align 4 + +.LL12: +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, BASE_SHIFT + 1, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 1, TEMP1 + sll KK, BASE_SHIFT + 3, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 8 * SIZE], a5 + + LDF [BO + 0 * SIZE], b1 + + LDF [BO + 1 * SIZE], b2 + FCLR (cc01) + LDF [BO + 2 * SIZE], b3 + FCLR (cc05) + 
LDF [BO + 3 * SIZE], b4 + FCLR (cc09) + LDF [BO + 4 * SIZE], b5 + FCLR (cc13) + + LDF [BO + 5 * SIZE], b6 + FCLR (cc02) + LDF [BO + 6 * SIZE], b7 + FCLR (cc06) + LDF [BO + 7 * SIZE], b8 + FCLR (cc10) + LDF [BO + 8 * SIZE], b9 + FCLR (cc14) + + prefetch [C1 + 1 * SIZE], 3 + FCLR (cc03) + prefetch [C2 + 2 * SIZE], 3 + FCLR (cc07) + prefetch [C3 + 1 * SIZE], 3 + FCLR (cc11) + prefetch [C4 + 2 * SIZE], 3 + FCLR (cc15) + + prefetch [C5 + 1 * SIZE], 3 + FCLR (cc04) + prefetch [C6 + 2 * SIZE], 3 + FCLR (cc08) + prefetch [C7 + 1 * SIZE], 3 + FCLR (cc12) + prefetch [C8 + 2 * SIZE], 3 + FCLR (cc16) + +#if defined(LT) || defined(RN) + sra KK, 3, L +#else + sub K, KK, L + sra L, 3, L +#endif + cmp L, 0 + ble,pn %icc, .LL15 + nop + .align 4 + +.LL13: + FMADD (aa1, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa1, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 16 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 9 * SIZE], b2 + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + LDF [AO + 2 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 3 * SIZE], a4 + + FMADD (aa1, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + FMADD (aa2, bb6, cc12, cc12) + nop + + FMADD (aa1, bb7, cc13, cc13) + LDF [BO + 12 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa1, bb8, cc15, cc15) + LDF [BO + 14 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO + 15 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 24 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 17 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 18 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 19 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 4 * SIZE], a1 + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 5 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + add L, -1, L + FMADD (aa4, bb6, cc12, cc12) + nop + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 20 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 21 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 22 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + LDF [BO + 23 * SIZE], b8 + + FMADD (aa1, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa1, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 32 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 25 * SIZE], b2 + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 26 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 27 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + LDF [AO + 6 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 7 * SIZE], a4 + + FMADD (aa1, bb6, cc11, cc11) + nop + FMADD (aa2, bb6, cc12, cc12) + nop + + FMADD (aa1, bb7, cc13, cc13) + LDF [BO + 28 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO + 29 * SIZE], b6 + + FMADD (aa1, bb8, cc15, cc15) + LDF [BO + 30 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO + 31 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 40 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 33 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 34 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 35 * SIZE], b4 + + FMADD 
(aa3, bb5, cc09, cc09) + LDF [AO + 16 * SIZE], a1 /****/ + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 9 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + nop + FMADD (aa4, bb6, cc12, cc12) + nop + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 36 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 37 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 38 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + LDF [BO + 39 * SIZE], b8 + + FMADD (aa5, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa5, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa5, bb3, cc05, cc05) + LDF [BO + 48 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 41 * SIZE], b2 + + FMADD (aa5, bb4, cc07, cc07) + LDF [BO + 42 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 43 * SIZE], b4 + + FMADD (aa5, bb5, cc09, cc09) + LDF [AO + 10 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 11 * SIZE], a4 + + FMADD (aa5, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY + FMADD (aa2, bb6, cc12, cc12) + nop + + FMADD (aa5, bb7, cc13, cc13) + LDF [BO + 44 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO + 45 * SIZE], b6 + + FMADD (aa5, bb8, cc15, cc15) + LDF [BO + 46 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO + 47 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 56 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 49 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 50 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 51 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 12 * SIZE], a5 + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 13 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + cmp L, 0 + FMADD (aa4, bb6, cc12, cc12) + nop + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 52 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 53 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 54 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + LDF [BO + 55 * SIZE], b8 + + FMADD (aa5, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa5, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa5, bb3, cc05, cc05) + LDF [BO + 64 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 57 * SIZE], b2 + + FMADD (aa5, bb4, cc07, cc07) + LDF [BO + 58 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 59 * SIZE], b4 + + FMADD (aa5, bb5, cc09, cc09) + LDF [AO + 14 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 15 * SIZE], a4 + + FMADD (aa5, bb6, cc11, cc11) + add BO, 64 * SIZE, BO + FMADD (aa2, bb6, cc12, cc12) + add AO, 16 * SIZE, AO + + FMADD (aa5, bb7, cc13, cc13) + LDF [BO - 4 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO - 3 * SIZE], b6 + + FMADD (aa5, bb8, cc15, cc15) + LDF [BO - 2 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO - 1 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 8 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 1 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 2 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 3 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 8 * SIZE], a5 /****/ + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 1 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + FMADD (aa4, bb6, cc12, cc12) + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 4 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 5 * 
SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 6 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + ble,pn %icc, .LL15 + LDF [BO + 7 * SIZE], b8 + + FMADD (aa1, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa1, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 16 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 9 * SIZE], b2 + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + LDF [AO + 2 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 3 * SIZE], a4 + + FMADD (aa1, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + FMADD (aa2, bb6, cc12, cc12) + nop + + FMADD (aa1, bb7, cc13, cc13) + LDF [BO + 12 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa1, bb8, cc15, cc15) + LDF [BO + 14 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO + 15 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 24 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 17 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 18 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 19 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 4 * SIZE], a1 + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 5 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + add L, -1, L + FMADD (aa4, bb6, cc12, cc12) + nop + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 20 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 21 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 22 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + LDF [BO + 23 * SIZE], b8 + + FMADD (aa1, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa1, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 32 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 25 * SIZE], b2 + + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 26 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 27 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + LDF [AO + 6 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 7 * SIZE], a4 + + FMADD (aa1, bb6, cc11, cc11) + nop + FMADD (aa2, bb6, cc12, cc12) + nop + + FMADD (aa1, bb7, cc13, cc13) + LDF [BO + 28 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO + 29 * SIZE], b6 + + FMADD (aa1, bb8, cc15, cc15) + LDF [BO + 30 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO + 31 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 40 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 33 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 34 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 35 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 16 * SIZE], a1 /****/ + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 9 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + nop + FMADD (aa4, bb6, cc12, cc12) + nop + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 36 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 37 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 38 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + LDF [BO + 39 * SIZE], b8 + + FMADD (aa5, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa5, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa5, bb3, cc05, cc05) + LDF [BO + 
48 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 41 * SIZE], b2 + + FMADD (aa5, bb4, cc07, cc07) + LDF [BO + 42 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 43 * SIZE], b4 + + FMADD (aa5, bb5, cc09, cc09) + LDF [AO + 10 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 11 * SIZE], a4 + + FMADD (aa5, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY + FMADD (aa2, bb6, cc12, cc12) + nop + + FMADD (aa5, bb7, cc13, cc13) + LDF [BO + 44 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO + 45 * SIZE], b6 + + FMADD (aa5, bb8, cc15, cc15) + LDF [BO + 46 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO + 47 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 56 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 49 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 50 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 51 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 12 * SIZE], a5 + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 13 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + cmp L, 0 + FMADD (aa4, bb6, cc12, cc12) + nop + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 52 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 53 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 54 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + LDF [BO + 55 * SIZE], b8 + + FMADD (aa5, bb1, cc01, cc01) + FMADD (aa2, bb1, cc02, cc02) + FMADD (aa5, bb2, cc03, cc03) + FMADD (aa2, bb2, cc04, cc04) + + FMADD (aa5, bb3, cc05, cc05) + LDF [BO + 64 * SIZE], b1 + FMADD (aa2, bb3, cc06, cc06) + LDF [BO + 57 * SIZE], b2 + + FMADD (aa5, bb4, cc07, cc07) + LDF [BO + 58 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 59 * SIZE], b4 + + FMADD (aa5, bb5, cc09, cc09) + LDF [AO + 14 * SIZE], a3 + FMADD (aa2, bb5, cc10, cc10) + LDF [AO + 15 * SIZE], a4 + + FMADD (aa5, bb6, cc11, cc11) + add BO, 64 * SIZE, BO + FMADD (aa2, bb6, cc12, cc12) + add AO, 16 * SIZE, AO + + FMADD (aa5, bb7, cc13, cc13) + LDF [BO - 4 * SIZE], b5 + FMADD (aa2, bb7, cc14, cc14) + LDF [BO - 3 * SIZE], b6 + + FMADD (aa5, bb8, cc15, cc15) + LDF [BO - 2 * SIZE], b7 + FMADD (aa2, bb8, cc16, cc16) + LDF [BO - 1 * SIZE], b8 + + FMADD (aa3, bb9, cc01, cc01) + FMADD (aa4, bb9, cc02, cc02) + FMADD (aa3, bb2, cc03, cc03) + FMADD (aa4, bb2, cc04, cc04) + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 8 * SIZE], b9 + FMADD (aa4, bb3, cc06, cc06) + LDF [BO + 1 * SIZE], b2 + + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 2 * SIZE], b3 + FMADD (aa4, bb4, cc08, cc08) + LDF [BO + 3 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [AO + 8 * SIZE], a5 /****/ + FMADD (aa4, bb5, cc10, cc10) + LDF [AO + 1 * SIZE], a2 + + FMADD (aa3, bb6, cc11, cc11) + FMADD (aa4, bb6, cc12, cc12) + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 4 * SIZE], b5 + FMADD (aa4, bb7, cc14, cc14) + LDF [BO + 5 * SIZE], b6 + + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 6 * SIZE], b7 + FMADD (aa4, bb8, cc16, cc16) + bg,pt %icc, .LL13 + LDF [BO + 7 * SIZE], b8 + .align 4 + +.LL15: +#if defined(LT) || defined(RN) + and KK, 7, L +#else + sub K, KK, L + and L, 7, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL18 + nop + .align 4 + +.LL17: + FMADD (aa1, bb1, cc01, cc01) + add L, -1, L + FMADD (aa2, bb1, cc02, cc02) + nop + + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 8 * SIZE], b1 + FMADD (aa2, bb2, cc04, cc04) + LDF [BO + 9 * SIZE], b2 + + FMADD (aa1, bb3, cc05, cc05) + cmp L, 0 + FMADD (aa2, bb3, cc06, cc06) + nop + + 
FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + nop + FMADD (aa2, bb5, cc10, cc10) + nop + + FMADD (aa1, bb6, cc11, cc11) + LDF [BO + 12 * SIZE], b5 + FMADD (aa2, bb6, cc12, cc12) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa1, bb7, cc13, cc13) + add AO, 2 * SIZE, AO + FMADD (aa2, bb7, cc14, cc14) + add BO, 8 * SIZE, BO + + FMADD (aa1, bb8, cc15, cc15) + LDF [AO + 0 * SIZE], a1 + FMADD (aa2, bb8, cc16, cc16) + LDF [AO + 1 * SIZE], a2 + + LDF [BO + 6 * SIZE], b7 + bg,pt %icc, .LL17 + LDF [BO + 7 * SIZE], b8 + nop + .align 4 + +.LL18: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 2, TEMP1 +#else + sub KK, 8, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 3, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 + FSUB a3, c05, c05 + FSUB a4, c07, c07 + + FSUB b1, c09, c09 + FSUB b2, c11, c11 + FSUB b3, c13, c13 + FSUB b4, c15, c15 + + LDF [BO + 8 * SIZE], a1 + LDF [BO + 9 * SIZE], a2 + LDF [BO + 10 * SIZE], a3 + LDF [BO + 11 * SIZE], a4 + + LDF [BO + 12 * SIZE], b1 + LDF [BO + 13 * SIZE], b2 + LDF [BO + 14 * SIZE], b3 + LDF [BO + 15 * SIZE], b4 + + FSUB a1, c02, c02 + FSUB a2, c04, c04 + FSUB a3, c06, c06 + FSUB a4, c08, c08 + + FSUB b1, c10, c10 + FSUB b2, c12, c12 + FSUB b3, c14, c14 + FSUB b4, c16, c16 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [AO + 4 * SIZE], b1 + LDF [AO + 5 * SIZE], b2 + LDF [AO + 6 * SIZE], b3 + LDF [AO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 + + FSUB b1, c05, c05 + FSUB b2, c06, c06 + FSUB b3, c07, c07 + FSUB b4, c08, c08 + + LDF [AO + 8 * SIZE], a1 + LDF [AO + 9 * SIZE], a2 + LDF [AO + 10 * SIZE], a3 + LDF [AO + 11 * SIZE], a4 + + LDF [AO + 12 * SIZE], b1 + LDF [AO + 13 * SIZE], b2 + LDF [AO + 14 * SIZE], b3 + LDF [AO + 15 * SIZE], b4 + + FSUB a1, c09, c09 + FSUB a2, c10, c10 + FSUB a3, c11, c11 + FSUB a4, c12, c12 + + FSUB b1, c13, c13 + FSUB b2, c14, c14 + FSUB b3, c15, c15 + FSUB b4, c16, c16 +#endif + +#ifdef LN + LDF [AO + 3 * SIZE], a1 + LDF [AO + 2 * SIZE], a2 + LDF [AO + 0 * SIZE], a3 + + FMUL a1, c02, c02 + FMUL a1, c04, c04 + FMUL a1, c06, c06 + FMUL a1, c08, c08 + FMUL a1, c10, c10 + FMUL a1, c12, c12 + FMUL a1, c14, c14 + FMUL a1, c16, c16 + + FNMSUB (aa2, cc02, cc01, cc01) + FNMSUB (aa2, cc04, cc03, cc03) + FNMSUB (aa2, cc06, cc05, cc05) + FNMSUB (aa2, cc08, cc07, cc07) + FNMSUB (aa2, cc10, cc09, cc09) + FNMSUB (aa2, cc12, cc11, cc11) + FNMSUB (aa2, cc14, cc13, cc13) + FNMSUB (aa2, cc16, cc15, cc15) + + FMUL a3, c01, c01 + FMUL a3, c03, c03 + FMUL a3, c05, c05 + FMUL a3, c07, c07 + FMUL a3, c09, c09 + FMUL a3, c11, c11 + FMUL a3, c13, c13 + FMUL a3, c15, c15 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 3 * SIZE], a3 + + FMUL a1, c01, c01 + FMUL a1, c03, c03 + FMUL a1, c05, c05 + FMUL a1, c07, c07 + FMUL a1, c09, c09 + FMUL a1, c11, c11 + FMUL a1, c13, c13 + FMUL a1, c15, c15 + + FNMSUB (aa2, cc01, cc02, cc02) + FNMSUB (aa2, cc03, cc04, cc04) + FNMSUB (aa2, cc05, cc06, cc06) + FNMSUB (aa2, cc07, cc08, cc08) + FNMSUB (aa2, cc09, cc10, cc10) + FNMSUB (aa2, cc11, cc12, 
cc12) + FNMSUB (aa2, cc13, cc14, cc14) + FNMSUB (aa2, cc15, cc16, cc16) + + FMUL a3, c02, c02 + FMUL a3, c04, c04 + FMUL a3, c06, c06 + FMUL a3, c08, c08 + FMUL a3, c10, c10 + FMUL a3, c12, c12 + FMUL a3, c14, c14 + FMUL a3, c16, c16 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 + + FMUL a1, c01, c01 + FMUL a1, c02, c02 + + FNMSUB (aa2, cc01, cc03, cc03) + FNMSUB (aa2, cc02, cc04, cc04) + FNMSUB (aa3, cc01, cc05, cc05) + FNMSUB (aa3, cc02, cc06, cc06) + FNMSUB (aa4, cc01, cc07, cc07) + FNMSUB (aa4, cc02, cc08, cc08) + FNMSUB (bb1, cc01, cc09, cc09) + FNMSUB (bb1, cc02, cc10, cc10) + FNMSUB (bb2, cc01, cc11, cc11) + FNMSUB (bb2, cc02, cc12, cc12) + FNMSUB (bb3, cc01, cc13, cc13) + FNMSUB (bb3, cc02, cc14, cc14) + FNMSUB (bb4, cc01, cc15, cc15) + FNMSUB (bb4, cc02, cc16, cc16) + + LDF [BO + 9 * SIZE], a1 + LDF [BO + 10 * SIZE], a2 + LDF [BO + 11 * SIZE], a3 + LDF [BO + 12 * SIZE], a4 + LDF [BO + 13 * SIZE], b1 + LDF [BO + 14 * SIZE], b2 + LDF [BO + 15 * SIZE], b3 + + FMUL a1, c03, c03 + FMUL a1, c04, c04 + + FNMSUB (aa2, cc03, cc05, cc05) + FNMSUB (aa2, cc04, cc06, cc06) + FNMSUB (aa3, cc03, cc07, cc07) + FNMSUB (aa3, cc04, cc08, cc08) + FNMSUB (aa4, cc03, cc09, cc09) + FNMSUB (aa4, cc04, cc10, cc10) + FNMSUB (bb1, cc03, cc11, cc11) + FNMSUB (bb1, cc04, cc12, cc12) + FNMSUB (bb2, cc03, cc13, cc13) + FNMSUB (bb2, cc04, cc14, cc14) + FNMSUB (bb3, cc03, cc15, cc15) + FNMSUB (bb3, cc04, cc16, cc16) + + LDF [BO + 18 * SIZE], a1 + LDF [BO + 19 * SIZE], a2 + LDF [BO + 20 * SIZE], a3 + LDF [BO + 21 * SIZE], a4 + LDF [BO + 22 * SIZE], b1 + LDF [BO + 23 * SIZE], b2 + + FMUL a1, c05, c05 + FMUL a1, c06, c06 + + FNMSUB (aa2, cc05, cc07, cc07) + FNMSUB (aa2, cc06, cc08, cc08) + FNMSUB (aa3, cc05, cc09, cc09) + FNMSUB (aa3, cc06, cc10, cc10) + FNMSUB (aa4, cc05, cc11, cc11) + FNMSUB (aa4, cc06, cc12, cc12) + FNMSUB (bb1, cc05, cc13, cc13) + FNMSUB (bb1, cc06, cc14, cc14) + FNMSUB (bb2, cc05, cc15, cc15) + FNMSUB (bb2, cc06, cc16, cc16) + + LDF [BO + 27 * SIZE], a1 + LDF [BO + 28 * SIZE], a2 + LDF [BO + 29 * SIZE], a3 + LDF [BO + 30 * SIZE], a4 + LDF [BO + 31 * SIZE], b1 + + FMUL a1, c07, c07 + FMUL a1, c08, c08 + + FNMSUB (aa2, cc07, cc09, cc09) + FNMSUB (aa2, cc08, cc10, cc10) + FNMSUB (aa3, cc07, cc11, cc11) + FNMSUB (aa3, cc08, cc12, cc12) + FNMSUB (aa4, cc07, cc13, cc13) + FNMSUB (aa4, cc08, cc14, cc14) + FNMSUB (bb1, cc07, cc15, cc15) + FNMSUB (bb1, cc08, cc16, cc16) + + LDF [BO + 36 * SIZE], a1 + LDF [BO + 37 * SIZE], a2 + LDF [BO + 38 * SIZE], a3 + LDF [BO + 39 * SIZE], a4 + + FMUL a1, c09, c09 + FMUL a1, c10, c10 + + FNMSUB (aa2, cc09, cc11, cc11) + FNMSUB (aa2, cc10, cc12, cc12) + FNMSUB (aa3, cc09, cc13, cc13) + FNMSUB (aa3, cc10, cc14, cc14) + FNMSUB (aa4, cc09, cc15, cc15) + FNMSUB (aa4, cc10, cc16, cc16) + + LDF [BO + 45 * SIZE], a1 + LDF [BO + 46 * SIZE], a2 + LDF [BO + 47 * SIZE], a3 + + FMUL a1, c11, c11 + FMUL a1, c12, c12 + + FNMSUB (aa2, cc11, cc13, cc13) + FNMSUB (aa2, cc12, cc14, cc14) + FNMSUB (aa3, cc11, cc15, cc15) + FNMSUB (aa3, cc12, cc16, cc16) + + LDF [BO + 54 * SIZE], a1 + LDF [BO + 55 * SIZE], a2 + + FMUL a1, c13, c13 + FMUL a1, c14, c14 + + FNMSUB (aa2, cc13, cc15, cc15) + FNMSUB (aa2, cc14, cc16, cc16) + + LDF [BO + 63 * SIZE], a1 + + FMUL a1, c15, c15 + FMUL a1, c16, c16 +#endif + +#ifdef RT + LDF [BO + 63 * SIZE], a1 + LDF [BO + 62 * SIZE], a2 + LDF [BO + 61 * SIZE], a3 + LDF [BO + 60 * 
SIZE], a4 + LDF [BO + 59 * SIZE], b1 + LDF [BO + 58 * SIZE], b2 + LDF [BO + 57 * SIZE], b3 + LDF [BO + 56 * SIZE], b4 + + FMUL a1, c16, c16 + FMUL a1, c15, c15 + + FNMSUB (aa2, cc16, cc14, cc14) + FNMSUB (aa2, cc15, cc13, cc13) + FNMSUB (aa3, cc16, cc12, cc12) + FNMSUB (aa3, cc15, cc11, cc11) + FNMSUB (aa4, cc16, cc10, cc10) + FNMSUB (aa4, cc15, cc09, cc09) + FNMSUB (bb1, cc16, cc08, cc08) + FNMSUB (bb1, cc15, cc07, cc07) + FNMSUB (bb2, cc16, cc06, cc06) + FNMSUB (bb2, cc15, cc05, cc05) + FNMSUB (bb3, cc16, cc04, cc04) + FNMSUB (bb3, cc15, cc03, cc03) + FNMSUB (bb4, cc16, cc02, cc02) + FNMSUB (bb4, cc15, cc01, cc01) + + LDF [BO + 54 * SIZE], a1 + LDF [BO + 53 * SIZE], a2 + LDF [BO + 52 * SIZE], a3 + LDF [BO + 51 * SIZE], a4 + LDF [BO + 50 * SIZE], b1 + LDF [BO + 49 * SIZE], b2 + LDF [BO + 48 * SIZE], b3 + + FMUL a1, c14, c14 + FMUL a1, c13, c13 + + FNMSUB (aa2, cc14, cc12, cc12) + FNMSUB (aa2, cc13, cc11, cc11) + FNMSUB (aa3, cc14, cc10, cc10) + FNMSUB (aa3, cc13, cc09, cc09) + FNMSUB (aa4, cc14, cc08, cc08) + FNMSUB (aa4, cc13, cc07, cc07) + FNMSUB (bb1, cc14, cc06, cc06) + FNMSUB (bb1, cc13, cc05, cc05) + FNMSUB (bb2, cc14, cc04, cc04) + FNMSUB (bb2, cc13, cc03, cc03) + FNMSUB (bb3, cc14, cc02, cc02) + FNMSUB (bb3, cc13, cc01, cc01) + + LDF [BO + 45 * SIZE], a1 + LDF [BO + 44 * SIZE], a2 + LDF [BO + 43 * SIZE], a3 + LDF [BO + 42 * SIZE], a4 + LDF [BO + 41 * SIZE], b1 + LDF [BO + 40 * SIZE], b2 + + FMUL a1, c12, c12 + FMUL a1, c11, c11 + + FNMSUB (aa2, cc12, cc10, cc10) + FNMSUB (aa2, cc11, cc09, cc09) + FNMSUB (aa3, cc12, cc08, cc08) + FNMSUB (aa3, cc11, cc07, cc07) + FNMSUB (aa4, cc12, cc06, cc06) + FNMSUB (aa4, cc11, cc05, cc05) + FNMSUB (bb1, cc12, cc04, cc04) + FNMSUB (bb1, cc11, cc03, cc03) + FNMSUB (bb2, cc12, cc02, cc02) + FNMSUB (bb2, cc11, cc01, cc01) + + LDF [BO + 36 * SIZE], a1 + LDF [BO + 35 * SIZE], a2 + LDF [BO + 34 * SIZE], a3 + LDF [BO + 33 * SIZE], a4 + LDF [BO + 32 * SIZE], b1 + + FMUL a1, c10, c10 + FMUL a1, c09, c09 + + FNMSUB (aa2, cc10, cc08, cc08) + FNMSUB (aa2, cc09, cc07, cc07) + FNMSUB (aa3, cc10, cc06, cc06) + FNMSUB (aa3, cc09, cc05, cc05) + FNMSUB (aa4, cc10, cc04, cc04) + FNMSUB (aa4, cc09, cc03, cc03) + FNMSUB (bb1, cc10, cc02, cc02) + FNMSUB (bb1, cc09, cc01, cc01) + + LDF [BO + 27 * SIZE], a1 + LDF [BO + 26 * SIZE], a2 + LDF [BO + 25 * SIZE], a3 + LDF [BO + 24 * SIZE], a4 + + FMUL a1, c08, c08 + FMUL a1, c07, c07 + + FNMSUB (aa2, cc08, cc06, cc06) + FNMSUB (aa2, cc07, cc05, cc05) + FNMSUB (aa3, cc08, cc04, cc04) + FNMSUB (aa3, cc07, cc03, cc03) + FNMSUB (aa4, cc08, cc02, cc02) + FNMSUB (aa4, cc07, cc01, cc01) + + LDF [BO + 18 * SIZE], a1 + LDF [BO + 17 * SIZE], a2 + LDF [BO + 16 * SIZE], a3 + + FMUL a1, c06, c06 + FMUL a1, c05, c05 + + FNMSUB (aa2, cc06, cc04, cc04) + FNMSUB (aa2, cc05, cc03, cc03) + FNMSUB (aa3, cc06, cc02, cc02) + FNMSUB (aa3, cc05, cc01, cc01) + + LDF [BO + 9 * SIZE], a1 + LDF [BO + 8 * SIZE], a2 + + FMUL a1, c04, c04 + FMUL a1, c03, c03 + + FNMSUB (aa2, cc04, cc02, cc02) + FNMSUB (aa2, cc03, cc01, cc01) + + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c02, c02 + FMUL a1, c01, c01 +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 + add C2, -2 * SIZE, C2 + add C3, -2 * SIZE, C3 + add C4, -2 * SIZE, C4 + add C5, -2 * SIZE, C5 + add C6, -2 * SIZE, C6 + add C7, -2 * SIZE, C7 + add C8, -2 * SIZE, C8 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c03, [BO + 1 * SIZE] + STF c05, [BO + 2 * SIZE] + STF c07, [BO + 3 * SIZE] + + STF c09, [BO + 4 * SIZE] + STF c11, [BO + 5 * SIZE] + STF c13, [BO + 6 * SIZE] + STF c15, [BO + 7 * 
SIZE] + + STF c02, [BO + 8 * SIZE] + STF c04, [BO + 9 * SIZE] + STF c06, [BO + 10 * SIZE] + STF c08, [BO + 11 * SIZE] + + STF c10, [BO + 12 * SIZE] + STF c12, [BO + 13 * SIZE] + STF c14, [BO + 14 * SIZE] + STF c16, [BO + 15 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] + + STF c05, [AO + 4 * SIZE] + STF c06, [AO + 5 * SIZE] + STF c07, [AO + 6 * SIZE] + STF c08, [AO + 7 * SIZE] + + STF c09, [AO + 8 * SIZE] + STF c10, [AO + 9 * SIZE] + STF c11, [AO + 10 * SIZE] + STF c12, [AO + 11 * SIZE] + + STF c13, [AO + 12 * SIZE] + STF c14, [AO + 13 * SIZE] + STF c15, [AO + 14 * SIZE] + STF c16, [AO + 15 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C2 + 0 * SIZE] + STF c04, [C2 + 1 * SIZE] + + STF c05, [C3 + 0 * SIZE] + STF c06, [C3 + 1 * SIZE] + STF c07, [C4 + 0 * SIZE] + STF c08, [C4 + 1 * SIZE] + + STF c09, [C5 + 0 * SIZE] + STF c10, [C5 + 1 * SIZE] + STF c11, [C6 + 0 * SIZE] + STF c12, [C6 + 1 * SIZE] + + STF c13, [C7 + 0 * SIZE] + STF c14, [C7 + 1 * SIZE] + STF c15, [C8 + 0 * SIZE] + STF c16, [C8 + 1 * SIZE] + +#ifndef LN + add C1, 2 * SIZE, C1 + add C2, 2 * SIZE, C2 + add C3, 2 * SIZE, C3 + add C4, 2 * SIZE, C4 + add C5, 2 * SIZE, C5 + add C6, 2 * SIZE, C6 + add C7, 2 * SIZE, C7 + add C8, 2 * SIZE, C8 +#endif + +#ifdef RT + sll K, BASE_SHIFT + 1, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, BASE_SHIFT + 1, TEMP2 + sll TEMP1, BASE_SHIFT + 3, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 2, KK +#endif + +#ifdef LN + sub KK, 2, KK +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL12 + nop + .align 4 + +.LL20: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL29 + nop + +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, BASE_SHIFT + 0, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, BASE_SHIFT + 0, TEMP1 + sll KK, BASE_SHIFT + 3, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [BO + 0 * SIZE], b1 + FCLR (cc01) + LDF [BO + 1 * SIZE], b2 + FCLR (cc03) + LDF [BO + 2 * SIZE], b3 + FCLR (cc05) + LDF [BO + 3 * SIZE], b4 + FCLR (cc07) + LDF [BO + 4 * SIZE], b5 + FCLR (cc09) + LDF [BO + 5 * SIZE], b6 + FCLR (cc11) + LDF [BO + 6 * SIZE], b7 + FCLR (cc13) + LDF [BO + 7 * SIZE], b8 + FCLR (cc15) + +#if defined(LT) || defined(RN) + sra KK, 2, L +#else + sub K, KK, L + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL25 + LDF [BO + 8 * SIZE], b9 + .align 4 + +.LL23: + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + add L, -1, L + + FMADD (aa1, bb1, cc01, cc01) + LDF [BO + 16 * SIZE], b1 + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 9 * SIZE], b2 + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 10 * SIZE], b3 + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 11 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + LDF [BO + 12 * SIZE], b5 + FMADD (aa1, bb6, cc11, cc11) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa1, bb7, cc13, cc13) + LDF [BO + 14 * SIZE], b7 + FMADD (aa1, bb8, cc15, cc15) + LDF [BO + 15 * SIZE], b8 + + FMADD (aa2, bb9, cc01, cc01) + LDF [BO + 24 * SIZE], b9 + FMADD (aa2, bb2, cc03, cc03) + LDF [BO + 17 * SIZE], b2 + + FMADD (aa2, bb3, cc05, cc05) + LDF [BO + 18 * SIZE], b3 + FMADD (aa2, bb4, cc07, cc07) + LDF [BO + 19 * SIZE], b4 + + FMADD (aa2, bb5, cc09, cc09) + LDF [BO + 20 * SIZE], b5 + FMADD (aa2, bb6, cc11, cc11) + LDF [BO + 21 * SIZE], b6 + + FMADD (aa2, 
bb7, cc13, cc13) + LDF [BO + 22 * SIZE], b7 + FMADD (aa2, bb8, cc15, cc15) + LDF [BO + 23 * SIZE], b8 + + LDF [AO + 4 * SIZE], a1 + LDF [AO + 5 * SIZE], a2 + + FMADD (aa3, bb1, cc01, cc01) + LDF [BO + 32 * SIZE], b1 + FMADD (aa3, bb2, cc03, cc03) + LDF [BO + 25 * SIZE], b2 + + FMADD (aa3, bb3, cc05, cc05) + LDF [BO + 26 * SIZE], b3 + FMADD (aa3, bb4, cc07, cc07) + LDF [BO + 27 * SIZE], b4 + + FMADD (aa3, bb5, cc09, cc09) + LDF [BO + 28 * SIZE], b5 + FMADD (aa3, bb6, cc11, cc11) + LDF [BO + 29 * SIZE], b6 + + FMADD (aa3, bb7, cc13, cc13) + LDF [BO + 30 * SIZE], b7 + FMADD (aa3, bb8, cc15, cc15) + LDF [BO + 31 * SIZE], b8 + + FMADD (aa4, bb9, cc01, cc01) + LDF [BO + 40 * SIZE], b9 + FMADD (aa4, bb2, cc03, cc03) + LDF [BO + 33 * SIZE], b2 + + FMADD (aa4, bb3, cc05, cc05) + LDF [BO + 34 * SIZE], b3 + FMADD (aa4, bb4, cc07, cc07) + LDF [BO + 35 * SIZE], b4 + + FMADD (aa4, bb5, cc09, cc09) + LDF [BO + 36 * SIZE], b5 + FMADD (aa4, bb6, cc11, cc11) + LDF [BO + 37 * SIZE], b6 + + FMADD (aa4, bb7, cc13, cc13) + LDF [BO + 38 * SIZE], b7 + FMADD (aa4, bb8, cc15, cc15) + LDF [BO + 39 * SIZE], b8 + + LDF [AO + 6 * SIZE], a3 + LDF [AO + 7 * SIZE], a4 + + add AO, 4 * SIZE, AO + cmp L, 0 + bg,pt %icc, .LL23 + add BO, 32 * SIZE, BO + .align 4 + +.LL25: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + sub K, KK, L + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL28 + nop + .align 4 + +.LL27: + FMADD (aa1, bb1, cc01, cc01) + LDF [BO + 8 * SIZE], b1 + FMADD (aa1, bb2, cc03, cc03) + LDF [BO + 9 * SIZE], b2 + + FMADD (aa1, bb3, cc05, cc05) + LDF [BO + 10 * SIZE], b3 + FMADD (aa1, bb4, cc07, cc07) + LDF [BO + 11 * SIZE], b4 + + FMADD (aa1, bb5, cc09, cc09) + LDF [BO + 12 * SIZE], b5 + FMADD (aa1, bb6, cc11, cc11) + LDF [BO + 13 * SIZE], b6 + + FMADD (aa1, bb7, cc13, cc13) + LDF [BO + 14 * SIZE], b7 + FMADD (aa1, bb8, cc15, cc15) + LDF [BO + 15 * SIZE], b8 + + LDF [AO + 1 * SIZE], a1 + add AO, 1 * SIZE, AO + + add L, -1, L + cmp L, 0 + bg,pt %icc, .LL27 + add BO, 8 * SIZE, BO + .align 4 + +.LL28: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 1, TEMP1 +#else + sub KK, 8, TEMP1 +#endif + sll TEMP1, BASE_SHIFT + 0, TEMP2 + sll TEMP1, BASE_SHIFT + 3, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 + FSUB a3, c05, c05 + FSUB a4, c07, c07 + + FSUB b1, c09, c09 + FSUB b2, c11, c11 + FSUB b3, c13, c13 + FSUB b4, c15, c15 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [AO + 4 * SIZE], b1 + LDF [AO + 5 * SIZE], b2 + LDF [AO + 6 * SIZE], b3 + LDF [AO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c03, c03 + FSUB a3, c05, c05 + FSUB a4, c07, c07 + + FSUB b1, c09, c09 + FSUB b2, c11, c11 + FSUB b3, c13, c13 + FSUB b4, c15, c15 +#endif + +#if defined(LN) || defined(LT) + LDF [AO + 0 * SIZE], a1 + + FMUL a1, c01, c01 + FMUL a1, c03, c03 + FMUL a1, c05, c05 + FMUL a1, c07, c07 + FMUL a1, c09, c09 + FMUL a1, c11, c11 + FMUL a1, c13, c13 + FMUL a1, c15, c15 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 + + FMUL a1, c01, c01 + + FNMSUB (aa2, cc01, cc03, cc03) + FNMSUB 
(aa3, cc01, cc05, cc05) + FNMSUB (aa4, cc01, cc07, cc07) + FNMSUB (bb1, cc01, cc09, cc09) + FNMSUB (bb2, cc01, cc11, cc11) + FNMSUB (bb3, cc01, cc13, cc13) + FNMSUB (bb4, cc01, cc15, cc15) + + LDF [BO + 9 * SIZE], a1 + LDF [BO + 10 * SIZE], a2 + LDF [BO + 11 * SIZE], a3 + LDF [BO + 12 * SIZE], a4 + LDF [BO + 13 * SIZE], b1 + LDF [BO + 14 * SIZE], b2 + LDF [BO + 15 * SIZE], b3 + + FMUL a1, c03, c03 + + FNMSUB (aa2, cc03, cc05, cc05) + FNMSUB (aa3, cc03, cc07, cc07) + FNMSUB (aa4, cc03, cc09, cc09) + FNMSUB (bb1, cc03, cc11, cc11) + FNMSUB (bb2, cc03, cc13, cc13) + FNMSUB (bb3, cc03, cc15, cc15) + + LDF [BO + 18 * SIZE], a1 + LDF [BO + 19 * SIZE], a2 + LDF [BO + 20 * SIZE], a3 + LDF [BO + 21 * SIZE], a4 + LDF [BO + 22 * SIZE], b1 + LDF [BO + 23 * SIZE], b2 + + FMUL a1, c05, c05 + + FNMSUB (aa2, cc05, cc07, cc07) + FNMSUB (aa3, cc05, cc09, cc09) + FNMSUB (aa4, cc05, cc11, cc11) + FNMSUB (bb1, cc05, cc13, cc13) + FNMSUB (bb2, cc05, cc15, cc15) + + LDF [BO + 27 * SIZE], a1 + LDF [BO + 28 * SIZE], a2 + LDF [BO + 29 * SIZE], a3 + LDF [BO + 30 * SIZE], a4 + LDF [BO + 31 * SIZE], b1 + + FMUL a1, c07, c07 + + FNMSUB (aa2, cc07, cc09, cc09) + FNMSUB (aa3, cc07, cc11, cc11) + FNMSUB (aa4, cc07, cc13, cc13) + FNMSUB (bb1, cc07, cc15, cc15) + + LDF [BO + 36 * SIZE], a1 + LDF [BO + 37 * SIZE], a2 + LDF [BO + 38 * SIZE], a3 + LDF [BO + 39 * SIZE], a4 + + FMUL a1, c09, c09 + + FNMSUB (aa2, cc09, cc11, cc11) + FNMSUB (aa3, cc09, cc13, cc13) + FNMSUB (aa4, cc09, cc15, cc15) + + LDF [BO + 45 * SIZE], a1 + LDF [BO + 46 * SIZE], a2 + LDF [BO + 47 * SIZE], a3 + + FMUL a1, c11, c11 + + FNMSUB (aa2, cc11, cc13, cc13) + FNMSUB (aa3, cc11, cc15, cc15) + + LDF [BO + 54 * SIZE], a1 + LDF [BO + 55 * SIZE], a2 + + FMUL a1, c13, c13 + + FNMSUB (aa2, cc13, cc15, cc15) + + LDF [BO + 63 * SIZE], a1 + + FMUL a1, c15, c15 +#endif + +#ifdef RT + LDF [BO + 63 * SIZE], a1 + LDF [BO + 62 * SIZE], a2 + LDF [BO + 61 * SIZE], a3 + LDF [BO + 60 * SIZE], a4 + LDF [BO + 59 * SIZE], b1 + LDF [BO + 58 * SIZE], b2 + LDF [BO + 57 * SIZE], b3 + LDF [BO + 56 * SIZE], b4 + + FMUL a1, c15, c15 + + FNMSUB (aa2, cc15, cc13, cc13) + FNMSUB (aa3, cc15, cc11, cc11) + FNMSUB (aa4, cc15, cc09, cc09) + FNMSUB (bb1, cc15, cc07, cc07) + FNMSUB (bb2, cc15, cc05, cc05) + FNMSUB (bb3, cc15, cc03, cc03) + FNMSUB (bb4, cc15, cc01, cc01) + + LDF [BO + 54 * SIZE], a1 + LDF [BO + 53 * SIZE], a2 + LDF [BO + 52 * SIZE], a3 + LDF [BO + 51 * SIZE], a4 + LDF [BO + 50 * SIZE], b1 + LDF [BO + 49 * SIZE], b2 + LDF [BO + 48 * SIZE], b3 + + FMUL a1, c13, c13 + + FNMSUB (aa2, cc13, cc11, cc11) + FNMSUB (aa3, cc13, cc09, cc09) + FNMSUB (aa4, cc13, cc07, cc07) + FNMSUB (bb1, cc13, cc05, cc05) + FNMSUB (bb2, cc13, cc03, cc03) + FNMSUB (bb3, cc13, cc01, cc01) + + LDF [BO + 45 * SIZE], a1 + LDF [BO + 44 * SIZE], a2 + LDF [BO + 43 * SIZE], a3 + LDF [BO + 42 * SIZE], a4 + LDF [BO + 41 * SIZE], b1 + LDF [BO + 40 * SIZE], b2 + + FMUL a1, c11, c11 + + FNMSUB (aa2, cc11, cc09, cc09) + FNMSUB (aa3, cc11, cc07, cc07) + FNMSUB (aa4, cc11, cc05, cc05) + FNMSUB (bb1, cc11, cc03, cc03) + FNMSUB (bb2, cc11, cc01, cc01) + + LDF [BO + 36 * SIZE], a1 + LDF [BO + 35 * SIZE], a2 + LDF [BO + 34 * SIZE], a3 + LDF [BO + 33 * SIZE], a4 + LDF [BO + 32 * SIZE], b1 + + FMUL a1, c09, c09 + + FNMSUB (aa2, cc09, cc07, cc07) + FNMSUB (aa3, cc09, cc05, cc05) + FNMSUB (aa4, cc09, cc03, cc03) + FNMSUB (bb1, cc09, cc01, cc01) + + LDF [BO + 27 * SIZE], a1 + LDF [BO + 26 * SIZE], a2 + LDF [BO + 25 * SIZE], a3 + LDF [BO + 24 * SIZE], a4 + + FMUL a1, c07, c07 + + FNMSUB (aa2, cc07, cc05, cc05) + FNMSUB (aa3, 
cc07, cc03, cc03) + FNMSUB (aa4, cc07, cc01, cc01) + + LDF [BO + 18 * SIZE], a1 + LDF [BO + 17 * SIZE], a2 + LDF [BO + 16 * SIZE], a3 + + FMUL a1, c05, c05 + + FNMSUB (aa2, cc05, cc03, cc03) + FNMSUB (aa3, cc05, cc01, cc01) + + LDF [BO + 9 * SIZE], a1 + LDF [BO + 8 * SIZE], a2 + + FMUL a1, c03, c03 + + FNMSUB (aa2, cc03, cc01, cc01) + + LDF [BO + 0 * SIZE], a1 + + FMUL a1, c01, c01 +#endif + +#ifdef LN + add C1, -1 * SIZE, C1 + add C2, -1 * SIZE, C2 + add C3, -1 * SIZE, C3 + add C4, -1 * SIZE, C4 + add C5, -1 * SIZE, C5 + add C6, -1 * SIZE, C6 + add C7, -1 * SIZE, C7 + add C8, -1 * SIZE, C8 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c03, [BO + 1 * SIZE] + STF c05, [BO + 2 * SIZE] + STF c07, [BO + 3 * SIZE] + + STF c09, [BO + 4 * SIZE] + STF c11, [BO + 5 * SIZE] + STF c13, [BO + 6 * SIZE] + STF c15, [BO + 7 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c03, [AO + 1 * SIZE] + STF c05, [AO + 2 * SIZE] + STF c07, [AO + 3 * SIZE] + + STF c09, [AO + 4 * SIZE] + STF c11, [AO + 5 * SIZE] + STF c13, [AO + 6 * SIZE] + STF c15, [AO + 7 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c03, [C2 + 0 * SIZE] + STF c05, [C3 + 0 * SIZE] + STF c07, [C4 + 0 * SIZE] + + STF c09, [C5 + 0 * SIZE] + STF c11, [C6 + 0 * SIZE] + STF c13, [C7 + 0 * SIZE] + STF c15, [C8 + 0 * SIZE] + +#ifdef RT + sll K, BASE_SHIFT + 0, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, BASE_SHIFT + 0, TEMP2 + sll TEMP1, BASE_SHIFT + 3, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + .align 4 + +.LL29: +#ifdef LN + sll K, BASE_SHIFT + 3, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 8, KK +#endif + +#ifdef RT + sub KK, 8, KK +#endif + + add J, -1, J + cmp J, 0 + bg,pt %icc, .LL11 + nop + .align 4 + +.LL999: +#ifdef TRMMKERNEL +#ifndef __64BIT__ + ld [%sp + STACK_START + 8], %g1 + ld [%sp + STACK_START + 12], %g2 + ld [%sp + STACK_START + 16], %g3 + ld [%sp + STACK_START + 20], %g4 +#else + ldx [%sp + STACK_START + 32], %g1 + ldx [%sp + STACK_START + 40], %g2 + ldx [%sp + STACK_START + 48], %g3 + ldx [%sp + STACK_START + 56], %g4 +#endif +#endif + + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/zamax.S b/kernel/sparc/zamax.S new file mode 100644 index 0000000000..b156c5a245 --- /dev/null +++ b/kernel/sparc/zamax.S @@ -0,0 +1,374 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
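A note on the solve blocks above (the #ifdef LN / LT / RN / RT sections): they follow the usual GotoBLAS TRSM micro-kernel shape — FSUB the packed right-hand-side tile against the accumulated product, then run a forward or backward substitution over the packed triangular panel, where the FMUL-by-diagonal steps suggest the diagonal entries are stored pre-inverted so no divide is needed. The scalar C sketch below is not part of the patch; it only illustrates that substitution for a lower-triangular, left-side tile, and the function name, layout, and pre-inverted-diagonal assumption are all illustrative.

/* Hedged sketch of one TRSM micro-step: solve A * X = C in place, where A is
 * an m x m lower-triangular packed panel whose diagonal is assumed to be
 * stored pre-inverted (1/a_ii), and C is an m x n tile from which the
 * trailing GEMM update has already been subtracted (the FSUB block above). */
static void trsm_tile_lower_left(int m, int n, const double *A,
                                 double *C, int ldc)
{
    for (int j = 0; j < n; j++) {
        for (int i = 0; i < m; i++) {
            double x = C[i + j * ldc] * A[i + i * m];  /* FMUL by 1/a_ii   */
            C[i + j * ldc] = x;
            for (int k = i + 1; k < m; k++)            /* eliminate below  */
                C[k + j * ldc] -= A[k + i * m] * x;    /* matches FNMSUB   */
        }
    }
}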
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N %i0 +#define X %i1 +#define INCX %i2 +#define I %i3 + +#ifdef DOUBLE +#define c1 %f0 +#define c2 %f2 +#define c3 %f4 +#define c4 %f6 +#define t1 %f8 +#define t2 %f10 +#define t3 %f12 +#define t4 %f14 +#define t5 %f16 +#define t6 %f18 +#define t7 %f20 +#define t8 %f22 + +#define a1 %f24 +#define a2 %f26 +#define a3 %f28 +#define a4 %f30 +#define a5 %f32 +#define a6 %f34 +#define a7 %f36 +#define a8 %f38 +#else +#define c1 %f0 +#define c2 %f1 +#define c3 %f2 +#define c4 %f3 +#define t1 %f4 +#define t2 %f5 +#define t3 %f6 +#define t4 %f7 +#define t5 %f8 +#define t6 %f9 +#define t7 %f10 +#define t8 %f11 + +#define a1 %f12 +#define a2 %f13 +#define a3 %f14 +#define a4 %f15 +#define a5 %f16 +#define a6 %f17 +#define a7 %f18 +#define a8 %f19 +#endif + +#ifndef USE_MIN +#define FCMOV FMOVG +#else +#define FCMOV FMOVL +#endif + + PROLOGUE + SAVESP + + FCLR(0) + + cmp N, 0 + ble .LL20 + nop + + cmp INCX, 0 + ble .LL20 + sll INCX, ZBASE_SHIFT, INCX + + LDF [X + 0 * SIZE], c1 + LDF [X + 1 * SIZE], c2 + add N, -1, N + FABS c1, c1 + add X, INCX, X + FABS c2, c2 + cmp N, 0 + ble .LL20 + FADD c1, c2, c1 + + FMOV c1, c2 + FMOV c1, c3 + FMOV c1, c4 + + cmp INCX, 2 * SIZE + bne .LL50 + nop + + sra N, 2, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + LDF [X + 2 * SIZE], a3 + LDF [X + 3 * SIZE], a4 + + LDF [X + 4 * SIZE], a5 + add I, -1, I + LDF [X + 5 * SIZE], a6 + cmp I, 0 + LDF [X + 6 * SIZE], a7 + LDF [X + 7 * SIZE], a8 + + ble,pt %icc, .LL12 + add X, 8 * SIZE, X + +#define PREFETCHSIZE 40 + +.LL11: + prefetch [X + PREFETCHSIZE * SIZE], 0 + + FABS a1, t1 + LDF [X + 0 * SIZE], a1 + FABS a2, t2 + LDF [X + 1 * SIZE], a2 + FABS a3, t3 + LDF [X + 2 * SIZE], a3 + FABS a4, t4 + LDF [X + 3 * SIZE], a4 + + FABS a5, t5 + LDF [X + 4 * SIZE], a5 + FABS a6, t6 + LDF [X + 5 * SIZE], a6 + FABS a7, t7 + LDF [X + 6 * SIZE], a7 + FABS a8, t8 + LDF [X + 7 * SIZE], a8 + + FADD t1, t2, t1 + FADD t3, t4, t3 + FADD t5, t6, t5 + FADD t7, t8, t7 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t3, c2 + FCMP %fcc2, t5, c3 + FCMP %fcc3, t7, c4 + + FCMOV %fcc0, t1, c1 + add I, -1, I + FCMOV %fcc1, t3, c2 + cmp I, 0 + FCMOV %fcc2, t5, c3 + FCMOV %fcc3, t7, c4 + + bg,pt %icc, .LL11 + add X, 8 * SIZE, X + +.LL12: + FABS a1, t1 + FABS a2, t2 + FABS a3, t3 + FABS a4, t4 + + FABS a5, t5 + FABS a6, t6 + FABS a7, t7 + FABS a8, t8 + + FADD t1, t2, t1 + FADD t3, t4, t3 + FADD t5, t6, t5 + FADD t7, t8, t7 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t3, c2 + FCMP %fcc2, t5, c3 + FCMP %fcc3, t7, c4 + + FCMOV %fcc0, t1, c1 + FCMOV %fcc1, t3, c2 + FCMOV 
%fcc2, t5, c3 + FCMOV %fcc3, t7, c4 + +.LL15: + and N, 3, I + cmp I, 0 + ble,a,pn %icc, .LL19 + nop + +.LL16: + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + + FABS a1, t1 + FABS a2, t2 + FADD t1, t2, t1 + FCMP %fcc0, t1, c1 + FCMOV %fcc0, t1, c1 + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL16 + add X, 2 * SIZE, X + +.LL19: + FCMP %fcc0, c2, c1 + FCMP %fcc1, c4, c3 + FCMOV %fcc0, c2, c1 + FCMOV %fcc1, c4, c3 + FCMP %fcc0, c3, c1 + FCMOV %fcc0, c3, c1 + +.LL20: + return %i7 + 8 + clr %g0 + +.LL50: + sra N, 2, I + cmp I, 0 + ble,pn %icc, .LL55 + nop + + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + add X, INCX, X + LDF [X + 0 * SIZE], a3 + LDF [X + 1 * SIZE], a4 + add X, INCX, X + LDF [X + 0 * SIZE], a5 + LDF [X + 1 * SIZE], a6 + add X, INCX, X + add I, -1, I + LDF [X + 0 * SIZE], a7 + cmp I, 0 + LDF [X + 1 * SIZE], a8 + ble,pt %icc, .LL52 + add X, INCX, X + +.LL51: + FABS a1, t1 + LDF [X + 0 * SIZE], a1 + FABS a2, t2 + LDF [X + 1 * SIZE], a2 + add X, INCX, X + FABS a3, t3 + LDF [X + 0 * SIZE], a3 + FABS a4, t4 + LDF [X + 1 * SIZE], a4 + add X, INCX, X + + FABS a5, t5 + LDF [X + 0 * SIZE], a5 + FABS a6, t6 + LDF [X + 1 * SIZE], a6 + add X, INCX, X + FABS a7, t7 + LDF [X + 0 * SIZE], a7 + FABS a8, t8 + LDF [X + 1 * SIZE], a8 + + FADD t1, t2, t1 + FADD t3, t4, t3 + FADD t5, t6, t5 + FADD t7, t8, t7 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t3, c2 + FCMP %fcc2, t5, c3 + FCMP %fcc3, t7, c4 + + FCMOV %fcc0, t1, c1 + add I, -1, I + FCMOV %fcc1, t3, c2 + cmp I, 0 + FCMOV %fcc2, t5, c3 + FCMOV %fcc3, t7, c4 + + + bg,pt %icc, .LL51 + add X, INCX, X + +.LL52: + FABS a1, t1 + FABS a2, t2 + FABS a3, t3 + FABS a4, t4 + + FABS a5, t5 + FABS a6, t6 + FABS a7, t7 + FABS a8, t8 + + FADD t1, t2, t1 + FADD t3, t4, t3 + FADD t5, t6, t5 + FADD t7, t8, t7 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t3, c2 + FCMP %fcc2, t5, c3 + FCMP %fcc3, t7, c4 + + FCMOV %fcc0, t1, c1 + FCMOV %fcc1, t3, c2 + FCMOV %fcc2, t5, c3 + FCMOV %fcc3, t7, c4 + +.LL55: + and N, 3, I + cmp I, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + + FABS a1, t1 + add I, -1, I + FABS a2, t2 + cmp I, 0 + FADD t1, t2, t1 + FCMP %fcc0, t1, c1 + FCMOV %fcc0, t1, c1 + + bg,pt %icc, .LL56 + add X, INCX, X + +.LL59: + FCMP %fcc0, c2, c1 + FCMP %fcc1, c4, c3 + FCMOV %fcc0, c2, c1 + FCMOV %fcc1, c4, c3 + FCMP %fcc0, c3, c1 + FCMOV %fcc0, c3, c1 + + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/zasum.S b/kernel/sparc/zasum.S new file mode 100644 index 0000000000..53bd3c0b06 --- /dev/null +++ b/kernel/sparc/zasum.S @@ -0,0 +1,327 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
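For orientation before the next file: the zamax.S kernel above reduces a complex vector to the largest |re| + |im| value (the BLAS cabs1 metric), keeping four partial maxima that are merged at .LL19/.LL59; building with USE_MIN swaps FMOVG for FMOVL and yields the minimum instead. A minimal C sketch of that reduction, not part of the patch, with illustrative names and the same return-zero behaviour for an empty vector or non-positive stride:

#include <math.h>
#include <stddef.h>

/* Hedged sketch: max over i of |Re x_i| + |Im x_i|. x holds n complex values
 * as interleaved (re, im) pairs; incx is the stride in complex elements.   */
static double zamax_ref(size_t n, const double *x, long incx)
{
    if (n == 0 || incx <= 0)
        return 0.0;
    double best = fabs(x[0]) + fabs(x[1]);
    for (size_t i = 1; i < n; i++) {
        const double *p = x + 2 * (size_t)incx * i;
        double v = fabs(p[0]) + fabs(p[1]);
        if (v > best)          /* FCMP + FMOVG in the assembly */
            best = v;
    }
    return best;
}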
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N %i0 +#define X %i1 +#define INCX %i2 +#define I %i3 + +#ifdef DOUBLE +#define c1 %f0 +#define c2 %f2 +#define t1 %f8 +#define t2 %f10 +#define t3 %f12 +#define t4 %f14 + +#define a1 %f16 +#define a2 %f18 +#define a3 %f20 +#define a4 %f22 +#define a5 %f24 +#define a6 %f26 +#define a7 %f28 +#define a8 %f30 +#else +#define c1 %f0 +#define c2 %f1 +#define t1 %f4 +#define t2 %f5 +#define t3 %f6 +#define t4 %f7 + +#define a1 %f8 +#define a2 %f9 +#define a3 %f10 +#define a4 %f11 +#define a5 %f12 +#define a6 %f13 +#define a7 %f14 +#define a8 %f15 +#endif + + PROLOGUE + SAVESP + + FCLR(0) + + sll INCX, ZBASE_SHIFT, INCX + + FMOV c1, c2 + FMOV c1, t1 + FMOV c1, t2 + FMOV c1, t3 + FMOV c1, t4 + + cmp INCX, 0 + ble .LL19 + nop + + cmp INCX, 2 * SIZE + bne .LL50 + nop + + sra N, 2, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + + LDF [X + 0 * SIZE], a1 + add I, -1, I + LDF [X + 1 * SIZE], a2 + cmp I, 0 + LDF [X + 2 * SIZE], a3 + LDF [X + 3 * SIZE], a4 + LDF [X + 4 * SIZE], a5 + LDF [X + 5 * SIZE], a6 + LDF [X + 6 * SIZE], a7 + LDF [X + 7 * SIZE], a8 + + ble,pt %icc, .LL12 + add X, 8 * SIZE, X + +#define PREFETCHSIZE 32 + +.LL11: + FADD c1, t1, c1 + prefetch [X + PREFETCHSIZE * SIZE], 0 + FABS a1, t1 + LDF [X + 0 * SIZE], a1 + + FADD c2, t2, c2 + add I, -1, I + FABS a2, t2 + LDF [X + 1 * SIZE], a2 + + FADD c1, t3, c1 + cmp I, 0 + FABS a3, t3 + LDF [X + 2 * SIZE], a3 + + FADD c2, t4, c2 + nop + FABS a4, t4 + LDF [X + 3 * SIZE], a4 + + FADD c1, t1, c1 + nop + FABS a5, t1 + LDF [X + 4 * SIZE], a5 + + FADD c2, t2, c2 + nop + FABS a6, t2 + LDF [X + 5 * SIZE], a6 + + FADD c1, t3, c1 + FABS a7, t3 + LDF [X + 6 * SIZE], a7 + add X, 8 * SIZE, X + + FADD c2, t4, c2 + FABS a8, t4 + bg,pt %icc, .LL11 + LDF [X - 1 * SIZE], a8 + +.LL12: + FADD c1, t1, c1 + FABS a1, t1 + FADD c2, t2, c2 + FABS a2, t2 + + FADD c1, t3, c1 + FABS a3, t3 + FADD c2, t4, c2 + FABS a4, t4 + + FADD c1, t1, c1 + FABS a5, t1 + FADD c2, t2, c2 + FABS a6, t2 + + FADD c1, t3, c1 + FABS a7, t3 + FADD c2, t4, c2 + FABS a8, t4 + +.LL15: + and N, 3, I + cmp I, 0 + ble,a,pn %icc, .LL19 + nop + +.LL16: + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + add I, -1, I + cmp I, 0 + FADD c1, t1, c1 + FADD c2, t2, c2 + FABS a1, t1 + FABS a2, t2 + bg,pt %icc, .LL16 + add X, 2 * SIZE, X + +.LL19: + FADD c1, t1, c1 + FADD c2, 
t2, c2 + FADD c1, t3, c1 + FADD c2, t4, c2 + + FADD c1, c2, c1 + return %i7 + 8 + clr %g0 + +.LL50: + sra N, 2, I + cmp I, 0 + ble,pn %icc, .LL55 + nop + + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + add X, INCX, X + LDF [X + 0 * SIZE], a3 + LDF [X + 1 * SIZE], a4 + add X, INCX, X + LDF [X + 0 * SIZE], a5 + LDF [X + 1 * SIZE], a6 + add X, INCX, X + add I, -1, I + LDF [X + 0 * SIZE], a7 + cmp I, 0 + LDF [X + 1 * SIZE], a8 + + ble,pt %icc, .LL52 + add X, INCX, X + +.LL51: + FADD c1, t1, c1 + add I, -1, I + FABS a1, t1 + LDF [X + 0 * SIZE], a1 + + FADD c2, t2, c2 + cmp I, 0 + FABS a2, t2 + LDF [X + 1 * SIZE], a2 + add X, INCX, X + + FADD c1, t3, c1 + FABS a3, t3 + LDF [X + 0 * SIZE], a3 + + FADD c2, t4, c2 + FABS a4, t4 + LDF [X + 1 * SIZE], a4 + add X, INCX, X + + FADD c1, t1, c1 + FABS a5, t1 + LDF [X + 0 * SIZE], a5 + + FADD c2, t2, c2 + FABS a6, t2 + LDF [X + 1 * SIZE], a6 + add X, INCX, X + + FADD c1, t3, c1 + FABS a7, t3 + LDF [X + 0 * SIZE], a7 + + FADD c2, t4, c2 + FABS a8, t4 + LDF [X + 1 * SIZE], a8 + + bg,pt %icc, .LL51 + add X, INCX, X + +.LL52: + FADD c1, t1, c1 + FABS a1, t1 + FADD c2, t2, c2 + FABS a2, t2 + + FADD c1, t3, c1 + FABS a3, t3 + FADD c2, t4, c2 + FABS a4, t4 + + FADD c1, t1, c1 + FABS a5, t1 + FADD c2, t2, c2 + FABS a6, t2 + + FADD c1, t3, c1 + FABS a7, t3 + FADD c2, t4, c2 + FABS a8, t4 + +.LL55: + and N, 3, I + cmp I, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + FADD c1, t1, c1 + FADD c2, t2, c2 + add I, -1, I + FABS a1, t1 + FABS a2, t2 + cmp I, 0 + bg,pt %icc, .LL56 + add X, INCX, X + +.LL59: + FADD c1, t1, c1 + FADD c2, t2, c2 + FADD c1, t3, c1 + FADD c2, t4, c2 + + FADD c1, c2, c1 + + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/zaxpy.S b/kernel/sparc/zaxpy.S new file mode 100644 index 0000000000..5e2be75942 --- /dev/null +++ b/kernel/sparc/zaxpy.S @@ -0,0 +1,594 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
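Likewise, the zasum.S kernel above accumulates the sum of |re| + |im| over the vector, carrying two interleaved partial sums (c1, c2) plus four FABS temporaries that are folded together at .LL19/.LL59. A hedged scalar sketch, not part of the patch, with illustrative names:

#include <math.h>
#include <stddef.h>

/* Hedged sketch of complex ASUM: sum of |Re x_i| + |Im x_i| over n elements.
 * Two accumulators mirror the c1/c2 pair the kernel folds at the end.      */
static double zasum_ref(size_t n, const double *x, long incx)
{
    if (incx <= 0)
        return 0.0;
    double s_re = 0.0, s_im = 0.0;
    for (size_t i = 0; i < n; i++) {
        const double *p = x + 2 * (size_t)incx * i;
        s_re += fabs(p[0]);
        s_im += fabs(p[1]);
    }
    return s_re + s_im;
}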
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if defined(DOUBLE) && !defined(__64BIT__) +#define N %i0 +#define X %i1 +#define INCX %i2 +#define Y %i3 +#define INCY %i4 +#define I %i5 +#else +#define N %i0 +#define X %i5 +#define INCX %i1 +#define Y %i2 +#define INCY %i3 +#define I %i4 +#endif + +#define YY %l1 + +#ifdef DOUBLE +#define a1 %f0 +#define a2 %f2 +#define a3 %f4 +#define a4 %f6 +#define a5 %f8 +#define a6 %f10 +#define a7 %f12 +#define a8 %f14 +#define b1 %f16 +#define b2 %f18 +#define b3 %f20 +#define b4 %f22 +#define b5 %f24 +#define b6 %f26 +#define b7 %f28 +#define b8 %f30 + +#define t1 %f32 +#define t2 %f34 +#define t3 %f36 +#define t4 %f38 +#define c1 %f40 +#define c2 %f42 +#define c3 %f44 +#define c4 %f46 + +#define c5 %f48 +#define c6 %f50 +#define c7 %f52 +#define c8 %f54 + +#define ALPHA_R %f60 +#define ALPHA_I %f62 +#else +#define a1 %f0 +#define a2 %f1 +#define a3 %f2 +#define a4 %f3 +#define a5 %f4 +#define a6 %f5 +#define a7 %f6 +#define a8 %f7 +#define b1 %f8 +#define b2 %f9 +#define b3 %f10 +#define b4 %f11 +#define b5 %f12 +#define b6 %f13 +#define b7 %f14 +#define b8 %f15 + +#define t1 %f16 +#define t2 %f17 +#define t3 %f18 +#define t4 %f19 +#define c1 %f20 +#define c2 %f21 +#define c3 %f22 +#define c4 %f23 + +#define c5 %f24 +#define c6 %f25 +#define c7 %f26 +#define c8 %f27 + +#define ALPHA_R %f30 +#define ALPHA_I %f31 +#endif + +#ifndef CONJ +#define ADD1 FSUB +#define ADD2 FADD +#else +#define ADD1 FADD +#define ADD2 FSUB +#endif + + PROLOGUE + SAVESP + +#ifndef __64BIT__ +#ifdef DOUBLE + st %i3, [%sp + STACK_START + 16] + st %i4, [%sp + STACK_START + 20] + st %i5, [%sp + STACK_START + 24] + + ld [%sp+ STACK_START + 32], X + ld [%sp+ STACK_START + 36], INCX + ld [%sp+ STACK_START + 40], Y + ld [%sp+ STACK_START + 44], INCY + + ldd [%sp + STACK_START + 16], ALPHA_R + ldd [%sp + STACK_START + 24], ALPHA_I +#else + st %i3, [%sp + STACK_START + 16] + st %i4, [%sp + STACK_START + 20] + + ld [%sp+ STACK_START + 28], INCX + ld [%sp+ STACK_START + 32], Y + ld [%sp+ STACK_START + 36], INCY + + ld [%sp + STACK_START + 16], ALPHA_R + ld [%sp + STACK_START + 20], ALPHA_I +#endif +#else + ldx [%sp + STACK_START + 56], INCX + ldx [%sp + STACK_START + 64], Y + ldx [%sp + STACK_START + 72], INCY +#ifdef DOUBLE + FMOV %f6, ALPHA_R + FMOV %f8, ALPHA_I +#else + FMOV %f7, ALPHA_R + FMOV %f9, ALPHA_I +#endif +#endif + sll INCX, ZBASE_SHIFT, INCX + sll INCY, ZBASE_SHIFT, INCY + + cmp INCX, 2 * SIZE + bne .LL50 + nop + cmp INCY, 2 * SIZE + bne .LL50 + nop + + sra N, 2, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + LDF [Y + 0 * SIZE], b1 + LDF [Y + 1 * SIZE], b2 + + LDF [X + 2 * SIZE], a3 + LDF [X + 3 * SIZE], a4 + LDF [Y + 2 * SIZE], b3 + LDF [Y + 3 * SIZE], b4 + + LDF [X + 4 * SIZE], a5 + LDF [X + 5 * SIZE], a6 + LDF [Y + 4 * SIZE], b5 + LDF [Y + 5 * SIZE], b6 + + LDF [X + 6 * SIZE], a7 + LDF [X + 7 * SIZE], a8 + LDF [Y + 6 * SIZE], b7 + LDF [Y + 7 * SIZE], b8 + + FMUL ALPHA_R, a1, t1 + FMUL ALPHA_R, a2, t2 + FMUL ALPHA_R, a3, t3 + FMUL ALPHA_R, a4, t4 + + FADD b1, t1, c1 + FMUL ALPHA_I, a2, t1 + ADD2 b2, t2, c2 + FMUL ALPHA_I, a1, t2 + + deccc I + ble,pt %icc, .LL12 + nop + +#ifdef DOUBLE +#define 
PREFETCHSIZE 54 +#else +#define PREFETCHSIZE 108 +#endif + +.LL11: + FADD b3, t3, c3 + prefetch [Y + PREFETCHSIZE * SIZE], 0 + FMUL ALPHA_I, a4, t3 + prefetch [X + PREFETCHSIZE * SIZE], 0 + + ADD2 b4, t4, c4 + LDF [Y + 8 * SIZE], b1 + FMUL ALPHA_I, a3, t4 + LDF [X + 9 * SIZE], a2 + + ADD1 c1, t1, c1 + LDF [Y + 9 * SIZE], b2 + FMUL ALPHA_R, a5, t1 + LDF [X + 8 * SIZE], a1 + + FADD c2, t2, c2 + LDF [Y + 10 * SIZE], b3 + FMUL ALPHA_R, a6, t2 + LDF [X + 11 * SIZE], a4 + + ADD1 c3, t3, c3 + STF c1, [Y + 0 * SIZE] + FMUL ALPHA_R, a7, t3 + LDF [Y + 11 * SIZE], b4 + + FADD c4, t4, c4 + STF c2, [Y + 1 * SIZE] + FMUL ALPHA_R, a8, t4 + LDF [X + 10 * SIZE], a3 + + FADD b5, t1, c5 + STF c3, [Y + 2 * SIZE] + FMUL ALPHA_I, a6, t1 + + ADD2 b6, t2, c6 + STF c4, [Y + 3 * SIZE] + FMUL ALPHA_I, a5, t2 + + FADD b7, t3, c7 + LDF [Y + 12 * SIZE], b5 + FMUL ALPHA_I, a8, t3 + LDF [X + 13 * SIZE], a6 + + ADD2 b8, t4, c8 + LDF [Y + 13 * SIZE], b6 + FMUL ALPHA_I, a7, t4 + LDF [X + 12 * SIZE], a5 + + ADD1 c5, t1, c5 + LDF [Y + 14 * SIZE], b7 + FMUL ALPHA_R, a1, t1 + LDF [X + 15 * SIZE], a8 + + FADD c6, t2, c6 + LDF [Y + 15 * SIZE], b8 + FMUL ALPHA_R, a2, t2 + LDF [X + 14 * SIZE], a7 + + ADD1 c7, t3, c7 + STF c5, [Y + 4 * SIZE] + FMUL ALPHA_R, a3, t3 + add X, 8 * SIZE, X + + FADD c8, t4, c8 + STF c6, [Y + 5 * SIZE] + FMUL ALPHA_R, a4, t4 + deccc I + + FADD b1, t1, c1 + STF c7, [Y + 6 * SIZE] + FMUL ALPHA_I, a2, t1 + + ADD2 b2, t2, c2 + STF c8, [Y + 7 * SIZE] + FMUL ALPHA_I, a1, t2 + + bg,pt %icc, .LL11 + add Y, 8 * SIZE, Y + + +.LL12: + FADD b3, t3, c3 + FMUL ALPHA_I, a4, t3 + ADD2 b4, t4, c4 + FMUL ALPHA_I, a3, t4 + + ADD1 c1, t1, c1 + FMUL ALPHA_R, a5, t1 + FADD c2, t2, c2 + FMUL ALPHA_R, a6, t2 + + ADD1 c3, t3, c3 + FMUL ALPHA_R, a7, t3 + FADD c4, t4, c4 + FMUL ALPHA_R, a8, t4 + + FADD b5, t1, c5 + FMUL ALPHA_I, a6, t1 + ADD2 b6, t2, c6 + FMUL ALPHA_I, a5, t2 + + FADD b7, t3, c7 + FMUL ALPHA_I, a8, t3 + ADD2 b8, t4, c8 + FMUL ALPHA_I, a7, t4 + + ADD1 c5, t1, c5 + FADD c6, t2, c6 + ADD1 c7, t3, c7 + FADD c8, t4, c8 + + STF c1, [Y + 0 * SIZE] + STF c2, [Y + 1 * SIZE] + STF c3, [Y + 2 * SIZE] + STF c4, [Y + 3 * SIZE] + + STF c5, [Y + 4 * SIZE] + STF c6, [Y + 5 * SIZE] + STF c7, [Y + 6 * SIZE] + STF c8, [Y + 7 * SIZE] + + add X, 8 * SIZE, X + add Y, 8 * SIZE, Y + + +.LL15: + and N, 3, I + cmp I, 0 + ble,a,pn %icc, .LL19 + nop + +.LL16: + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + LDF [Y + 0 * SIZE], b1 + LDF [Y + 1 * SIZE], b2 + + FMUL ALPHA_R, a1, t1 + FMUL ALPHA_R, a2, t2 + FMUL ALPHA_I, a2, t3 + FMUL ALPHA_I, a1, t4 + + FADD b1, t1, b1 + add I, -1, I + ADD2 b2, t2, b2 + cmp I, 0 + ADD1 b1, t3, c1 + FADD b2, t4, c2 + + STF c1, [Y + 0 * SIZE] + STF c2, [Y + 1 * SIZE] + + add Y, 2 * SIZE, Y + bg,pt %icc, .LL16 + add X, 2 * SIZE, X + +.LL19: + return %i7 + 8 + clr %g0 + +.LL50: + sra N, 2, I + cmp I, 0 + ble,pn %icc, .LL55 + mov Y, YY + + LDF [X + 0 * SIZE], a1 + LDF [Y + 0 * SIZE], b1 + LDF [X + 1 * SIZE], a2 + add X, INCX, X + LDF [Y + 1 * SIZE], b2 + add Y, INCY, Y + LDF [X + 0 * SIZE], a3 + LDF [Y + 0 * SIZE], b3 + LDF [X + 1 * SIZE], a4 + add X, INCX, X + LDF [Y + 1 * SIZE], b4 + add Y, INCY, Y + LDF [X + 0 * SIZE], a5 + add I, -1, I + LDF [Y + 0 * SIZE], b5 + LDF [X + 1 * SIZE], a6 + cmp I, 0 + add X, INCX, X + LDF [Y + 1 * SIZE], b6 + add Y, INCY, Y + LDF [X + 0 * SIZE], a7 + FMUL ALPHA_R, a1, t1 + LDF [Y + 0 * SIZE], b7 + FMUL ALPHA_R, a2, t2 + LDF [X + 1 * SIZE], a8 + FMUL ALPHA_R, a3, t3 + add X, INCX, X + LDF [Y + 1 * SIZE], b8 + FMUL ALPHA_R, a4, t4 + + ble,pt %icc, .LL52 + add Y, INCY, Y + + +.LL51: + 
FADD b1, t1, c1 + LDF [Y + 0 * SIZE], b1 + FMUL ALPHA_I, a2, t1 + LDF [X + 1 * SIZE], a2 + ADD2 b2, t2, c2 + LDF [Y + 1 * SIZE], b2 + add Y, INCY, Y + FMUL ALPHA_I, a1, t2 + LDF [X + 0 * SIZE], a1 + add X, INCX, X + + FADD b3, t3, c3 + LDF [Y + 0 * SIZE], b3 + FMUL ALPHA_I, a4, t3 + LDF [X + 1 * SIZE], a4 + ADD2 b4, t4, c4 + LDF [Y + 1 * SIZE], b4 + add Y, INCY, Y + FMUL ALPHA_I, a3, t4 + LDF [X + 0 * SIZE], a3 + add X, INCX, X + + ADD1 c1, t1, c1 + FMUL ALPHA_R, a5, t1 + FADD c2, t2, c2 + FMUL ALPHA_R, a6, t2 + ADD1 c3, t3, c3 + FMUL ALPHA_R, a7, t3 + FADD c4, t4, c4 + FMUL ALPHA_R, a8, t4 + + STF c1, [YY + 0 * SIZE] + FADD b5, t1, c1 + FMUL ALPHA_I, a6, t1 + STF c2, [YY + 1 * SIZE] + ADD2 b6, t2, c2 + FMUL ALPHA_I, a5, t2 + add YY, INCY, YY + STF c3, [YY + 0 * SIZE] + FADD b7, t3, c3 + FMUL ALPHA_I, a8, t3 + STF c4, [YY + 1 * SIZE] + ADD2 b8, t4, c4 + FMUL ALPHA_I, a7, t4 + add YY, INCY, YY + + LDF [X + 0 * SIZE], a5 + ADD1 c1, t1, c1 + LDF [Y + 0 * SIZE], b5 + FMUL ALPHA_R, a1, t1 + LDF [X + 1 * SIZE], a6 + add X, INCX, X + FADD c2, t2, c2 + LDF [Y + 1 * SIZE], b6 + add Y, INCY, Y + FMUL ALPHA_R, a2, t2 + LDF [X + 0 * SIZE], a7 + ADD1 c3, t3, c3 + LDF [Y + 0 * SIZE], b7 + FMUL ALPHA_R, a3, t3 + LDF [X + 1 * SIZE], a8 + add X, INCX, X + FADD c4, t4, c4 + LDF [Y + 1 * SIZE], b8 + add Y, INCY, Y + FMUL ALPHA_R, a4, t4 + + STF c1, [YY + 0 * SIZE] + add I, -1, I + STF c2, [YY + 1 * SIZE] + add YY, INCY, YY + STF c3, [YY + 0 * SIZE] + cmp I, 0 + STF c4, [YY + 1 * SIZE] + + bg,pt %icc, .LL51 + add YY, INCY, YY + +.LL52: + FADD b1, t1, c1 + FMUL ALPHA_I, a2, t1 + ADD2 b2, t2, c2 + FMUL ALPHA_I, a1, t2 + + FADD b3, t3, c3 + FMUL ALPHA_I, a4, t3 + ADD2 b4, t4, c4 + FMUL ALPHA_I, a3, t4 + + ADD1 c1, t1, c1 + FMUL ALPHA_R, a5, t1 + FADD c2, t2, c2 + FMUL ALPHA_R, a6, t2 + ADD1 c3, t3, c3 + FMUL ALPHA_R, a7, t3 + FADD c4, t4, c4 + FMUL ALPHA_R, a8, t4 + + STF c1, [YY + 0 * SIZE] + STF c2, [YY + 1 * SIZE] + add YY, INCY, YY + STF c3, [YY + 0 * SIZE] + STF c4, [YY + 1 * SIZE] + add YY, INCY, YY + + FADD b5, t1, c1 + FMUL ALPHA_I, a6, t1 + ADD2 b6, t2, c2 + FMUL ALPHA_I, a5, t2 + FADD b7, t3, c3 + FMUL ALPHA_I, a8, t3 + ADD2 b8, t4, c4 + FMUL ALPHA_I, a7, t4 + + ADD1 c1, t1, c1 + FADD c2, t2, c2 + ADD1 c3, t3, c3 + FADD c4, t4, c4 + + STF c1, [YY + 0 * SIZE] + STF c2, [YY + 1 * SIZE] + add YY, INCY, YY + STF c3, [YY + 0 * SIZE] + STF c4, [YY + 1 * SIZE] + add YY, INCY, YY + +.LL55: + and N, 3, I + cmp I, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + LDF [Y + 0 * SIZE], b1 + LDF [Y + 1 * SIZE], b2 + + FMUL ALPHA_R, a1, t1 + FMUL ALPHA_R, a2, t2 + FMUL ALPHA_I, a2, t3 + FMUL ALPHA_I, a1, t4 + FADD b1, t1, b1 + ADD2 b2, t2, b2 + ADD1 b1, t3, c1 + FADD b2, t4, c2 + + add I, -1, I + cmp I, 0 + STF c1, [Y + 0 * SIZE] + STF c2, [Y + 1 * SIZE] + + add Y, INCY, Y + bg,pt %icc, .LL56 + add X, INCX, X + +.LL59: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/zcopy.S b/kernel/sparc/zcopy.S new file mode 100644 index 0000000000..039ed544f1 --- /dev/null +++ b/kernel/sparc/zcopy.S @@ -0,0 +1,196 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N %i0 +#define X %i1 +#define INCX %i2 +#define Y %i3 +#define INCY %i4 +#define I %i5 + +#ifdef DOUBLE +#define a1 %f0 +#define a2 %f2 +#define a3 %f4 +#define a4 %f6 +#define a5 %f8 +#define a6 %f10 +#define a7 %f12 +#define a8 %f14 +#else +#define a1 %f0 +#define a2 %f1 +#define a3 %f2 +#define a4 %f3 +#define a5 %f4 +#define a6 %f5 +#define a7 %f6 +#define a8 %f7 +#endif + + PROLOGUE + SAVESP + + sll INCX, ZBASE_SHIFT, INCX + sll INCY, ZBASE_SHIFT, INCY + + cmp INCX, 2 * SIZE + bne .LL50 + nop + cmp INCY, 2 * SIZE + bne .LL50 + nop + + sra N, 2, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + +#define PREFETCHSIZE 32 + +.LL11: + prefetch [X + PREFETCHSIZE * SIZE], 0 + prefetch [Y + PREFETCHSIZE * SIZE], 0 + + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + LDF [X + 2 * SIZE], a3 + LDF [X + 3 * SIZE], a4 + LDF [X + 4 * SIZE], a5 + LDF [X + 5 * SIZE], a6 + LDF [X + 6 * SIZE], a7 + LDF [X + 7 * SIZE], a8 + + STF a1, [Y + 0 * SIZE] + add I, -1, I + STF a2, [Y + 1 * SIZE] + cmp I, 0 + STF a3, [Y + 2 * SIZE] + add X, 8 * SIZE, X + STF a4, [Y + 3 * SIZE] + STF a5, [Y + 4 * SIZE] + STF a6, [Y + 5 * SIZE] + STF a7, [Y + 6 * SIZE] + STF a8, [Y + 7 * SIZE] + + bg,pt %icc, .LL11 + add Y, 8 * SIZE, Y + +.LL15: + and N, 3, I + cmp I, 0 + ble,a,pn %icc, .LL19 + nop + +.LL16: + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + add I, -1, I + cmp I, 0 + STF a1, [Y + 0 * SIZE] + add X, 2 * SIZE, X + STF a2, [Y + 1 * SIZE] + bg,pt %icc, .LL16 + add Y, 2 * SIZE, Y + +.LL19: + return %i7 + 8 + clr %g0 + +.LL50: + sra N, 2, I + cmp I, 0 + ble,pn %icc, .LL55 + nop + +.LL51: + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + add X, INCX, X + LDF [X + 0 * SIZE], a3 + LDF [X + 1 * SIZE], a4 + add X, INCX, X + LDF [X + 0 * SIZE], a5 + LDF [X + 1 * SIZE], a6 + add X, INCX, X + LDF [X + 0 * SIZE], a7 + LDF [X + 1 * SIZE], a8 + add X, INCX, X + + STF a1, [Y + 0 * SIZE] + add I, -1, I + STF a2, [Y + 1 * SIZE] 
+ add Y, INCY, Y + cmp I, 0 + STF a3, [Y + 0 * SIZE] + STF a4, [Y + 1 * SIZE] + add Y, INCY, Y + STF a5, [Y + 0 * SIZE] + STF a6, [Y + 1 * SIZE] + add Y, INCY, Y + STF a7, [Y + 0 * SIZE] + STF a8, [Y + 1 * SIZE] + + bg,pt %icc, .LL51 + add Y, INCY, Y + +.LL55: + and N, 3, I + cmp I, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + add I, -1, I + cmp I, 0 + add X, INCX, X + STF a1, [Y + 0 * SIZE] + STF a2, [Y + 1 * SIZE] + bg,pt %icc, .LL56 + add Y, INCY, Y + +.LL59: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/zdot.S b/kernel/sparc/zdot.S new file mode 100644 index 0000000000..3072f0f16f --- /dev/null +++ b/kernel/sparc/zdot.S @@ -0,0 +1,545 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
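zcopy.S above is a plain interleaved copy and needs no gloss; zaxpy.S, the file before it, performs the complex update y := y + alpha * x, with the ADD1/ADD2 macros flipping the signs of the cross terms when the kernel is built with CONJ. A hedged C sketch of the unconjugated arithmetic only, unit strides, names illustrative and not part of the patch:

#include <stddef.h>

/* Hedged sketch of the non-CONJ zaxpy arithmetic: y_i += alpha * x_i for
 * complex alpha, vectors stored as interleaved (re, im) pairs.             */
static void zaxpy_ref(size_t n, double alpha_r, double alpha_i,
                      const double *x, double *y)
{
    for (size_t i = 0; i < n; i++) {
        double xr = x[2 * i], xi = x[2 * i + 1];
        /* (ar + i*ai)(xr + i*xi) = (ar*xr - ai*xi) + i*(ar*xi + ai*xr) */
        y[2 * i]     += alpha_r * xr - alpha_i * xi;
        y[2 * i + 1] += alpha_r * xi + alpha_i * xr;
    }
}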
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if defined(F_INTERFACE) && defined(F_INTERFACE_F2C) +#define OUT %i0 +#define N %i1 +#define X %i2 +#define INCX %i3 +#define Y %i4 +#define INCY %i5 +#else +#define N %i0 +#define X %i1 +#define INCX %i2 +#define Y %i3 +#define INCY %i4 +#endif + +#define I %l0 + +#ifdef DOUBLE +#define c1 %f0 +#define c2 %f2 +#define c3 %f4 +#define c4 %f6 +#define t1 %f8 +#define t2 %f10 +#define t3 %f12 +#define t4 %f14 +#define a1 %f16 +#define a2 %f18 +#define a3 %f20 +#define a4 %f22 +#define a5 %f24 +#define a6 %f26 +#define a7 %f28 +#define a8 %f30 + +#define b1 %f32 +#define b2 %f34 +#define b3 %f36 +#define b4 %f38 +#define b5 %f40 +#define b6 %f42 +#define b7 %f44 +#define b8 %f46 +#else +#define c1 %f0 +#define c2 %f1 +#define c3 %f2 +#define c4 %f3 +#define t1 %f4 +#define t2 %f5 +#define t3 %f6 +#define t4 %f7 + +#define a1 %f8 +#define a2 %f9 +#define a3 %f10 +#define a4 %f11 +#define a5 %f12 +#define a6 %f13 +#define a7 %f14 +#define a8 %f15 + +#define b1 %f16 +#define b2 %f17 +#define b3 %f18 +#define b4 %f19 +#define b5 %f20 +#define b6 %f21 +#define b7 %f22 +#define b8 %f23 +#endif + + PROLOGUE + SAVESP + +#ifdef DOUBLE + FCLR(0) + FCLR(2) + FCLR(4) + FCLR(6) +#else + FCLR(0) + FCLR(1) + FCLR(2) + FCLR(3) +#endif + + FMOV c1, c4 + FMOV c1, t1 + sll INCX, ZBASE_SHIFT, INCX + FMOV c1, t2 + sll INCY, ZBASE_SHIFT, INCY + FMOV c1, t3 + FMOV c1, t4 + + cmp INCX, 2 * SIZE + bne .LL50 + nop + cmp INCY, 2 * SIZE + bne .LL50 + nop + + sra N, 2, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + + LDF [X + 0 * SIZE], a1 + add I, -1, I + LDF [Y + 0 * SIZE], b1 + cmp I, 0 + LDF [X + 1 * SIZE], a2 + LDF [Y + 1 * SIZE], b2 + LDF [X + 2 * SIZE], a3 + LDF [Y + 2 * SIZE], b3 + LDF [X + 3 * SIZE], a4 + LDF [Y + 3 * SIZE], b4 + LDF [X + 4 * SIZE], a5 + LDF [Y + 4 * SIZE], b5 + LDF [X + 5 * SIZE], a6 + LDF [Y + 5 * SIZE], b6 + LDF [X + 6 * SIZE], a7 + LDF [Y + 6 * SIZE], b7 + LDF [X + 7 * SIZE], a8 + add X, 8 * SIZE, X + LDF [Y + 7 * SIZE], b8 + ble,pt %icc, .LL12 + add Y, 8 * SIZE, Y + +#define PREFETCHSIZE 40 + +.LL11: + prefetch [X + PREFETCHSIZE * SIZE], 0 + FADD c1, t1, c1 + prefetch [Y + PREFETCHSIZE * SIZE], 0 + FMUL a1, b1, t1 + + FADD c2, t2, c2 + FMUL a2, b1, t2 + LDF [Y + 0 * SIZE], b1 + + FADD c3, t3, c3 + FMUL a1, b2, t3 + LDF [X + 0 * SIZE], a1 + + FADD c4, t4, c4 + FMUL a2, b2, t4 + LDF [X + 1 * SIZE], a2 + + FADD c1, t1, c1 + FMUL a3, b3, t1 + LDF [Y + 1 * SIZE], b2 + FADD c2, t2, c2 + FMUL a4, b3, t2 + LDF [Y + 2 * SIZE], b3 + + FADD c3, t3, c3 + FMUL a3, b4, t3 + LDF [X + 2 * SIZE], a3 + FADD c4, t4, c4 + FMUL a4, b4, t4 + LDF [X + 3 * SIZE], a4 + + FADD c1, t1, c1 + FMUL a5, b5, t1 + LDF [Y + 3 * SIZE], b4 + FADD c2, t2, c2 + FMUL a6, b5, t2 + LDF [Y + 4 * SIZE], b5 + + FADD c3, t3, c3 + FMUL a5, b6, t3 + LDF [X + 4 * SIZE], a5 + FADD c4, t4, c4 + FMUL a6, b6, t4 + LDF [X + 5 * SIZE], a6 + + FADD c1, t1, c1 + add I, -1, I + FMUL a7, b7, t1 + LDF [Y + 5 * SIZE], b6 + FADD c2, t2, c2 + cmp I, 0 + FMUL a8, b7, t2 + LDF [Y + 6 * SIZE], b7 + + FADD c3, t3, c3 + add Y, 8 * SIZE, Y + FMUL a7, b8, t3 + LDF [X + 6 * SIZE], a7 + FADD c4, t4, c4 + FMUL a8, b8, t4 + LDF [X + 7 * SIZE], a8 + + add X, 8 * SIZE, X + bg,pt %icc, .LL11 + LDF [Y - 1 * SIZE], b8 + +.LL12: + FADD c1, t1, c1 + FMUL a1, b1, t1 + FADD c2, t2, c2 + FMUL a2, b1, t2 + + FADD c3, t3, c3 + FMUL a1, b2, t3 + FADD c4, t4, c4 + FMUL a2, b2, t4 + + FADD c1, t1, c1 + FMUL a3, b3, t1 + FADD c2, t2, c2 + FMUL a4, b3, t2 + + FADD 
c3, t3, c3 + FMUL a3, b4, t3 + FADD c4, t4, c4 + FMUL a4, b4, t4 + + FADD c1, t1, c1 + FMUL a5, b5, t1 + FADD c2, t2, c2 + FMUL a6, b5, t2 + + FADD c3, t3, c3 + FMUL a5, b6, t3 + FADD c4, t4, c4 + FMUL a6, b6, t4 + + FADD c1, t1, c1 + FMUL a7, b7, t1 + FADD c2, t2, c2 + FMUL a8, b7, t2 + + FADD c3, t3, c3 + FMUL a7, b8, t3 + FADD c4, t4, c4 + FMUL a8, b8, t4 + +.LL15: + and N, 3, I + cmp I, 0 + ble,a,pn %icc, .LL19 + nop + +.LL16: + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + add X, 2 * SIZE, X + LDF [Y + 0 * SIZE], b1 + LDF [Y + 1 * SIZE], b2 + add Y, 2 * SIZE, Y + + FADD c1, t1, c1 + FMUL a1, b1, t1 + FADD c2, t2, c2 + FMUL a2, b1, t2 + FADD c3, t3, c3 + FMUL a1, b2, t3 + FADD c4, t4, c4 + FMUL a2, b2, t4 + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL16 + nop + +.LL19: + FADD c1, t1, c1 + FADD c2, t2, c2 + FADD c3, t3, c3 + FADD c4, t4, c4 + +#ifndef CONJ + FSUB c1, c4, c1 + FADD c2, c3, c2 +#else + FADD c1, c4, c1 + FSUB c3, c2, c2 +#endif + +#if defined(F_INTERFACE) && defined(F_INTERFACE_F2C) + STF c1, [OUT + 0 * SIZE] + STF c2, [OUT + 1 * SIZE] +#endif + return %i7 + 8 + clr %g0 +.LL50: +#ifdef F_INTERFACE + cmp INCX, 0 + bge .LL41 + sub N, 1, I + + smul I, INCX, I + sub X, I, X + +.LL41: + cmp INCY, 0 + bge .LL42 + sub N, 1, I + + smul I, INCY, I + sub Y, I, Y + +.LL42: +#endif + sra N, 2, I + cmp I, 0 + ble,pn %icc, .LL55 + nop + + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + add X, INCX, X + LDF [X + 0 * SIZE], a3 + LDF [X + 1 * SIZE], a4 + add X, INCX, X + LDF [X + 0 * SIZE], a5 + LDF [X + 1 * SIZE], a6 + add X, INCX, X + LDF [X + 0 * SIZE], a7 + LDF [X + 1 * SIZE], a8 + add X, INCX, X + + LDF [Y + 0 * SIZE], b1 + LDF [Y + 1 * SIZE], b2 + add Y, INCY, Y + LDF [Y + 0 * SIZE], b3 + LDF [Y + 1 * SIZE], b4 + add Y, INCY, Y + LDF [Y + 0 * SIZE], b5 + LDF [Y + 1 * SIZE], b6 + add Y, INCY, Y + LDF [Y + 0 * SIZE], b7 + LDF [Y + 1 * SIZE], b8 + add Y, INCY, Y + + add I, -1, I + cmp I, 0 + ble,pt %icc, .LL52 + +.LL51: + FADD c1, t1, c1 + prefetch [X + PREFETCHSIZE * SIZE], 0 + add I, -1, I + FMUL a1, b1, t1 + prefetch [Y + PREFETCHSIZE * SIZE], 0 + + FADD c2, t2, c2 + cmp I, 0 + FMUL a2, b1, t2 + LDF [Y + 0 * SIZE], b1 + + FADD c3, t3, c3 + FMUL a1, b2, t3 + LDF [X + 0 * SIZE], a1 + FADD c4, t4, c4 + FMUL a2, b2, t4 + LDF [X + 1 * SIZE], a2 + add X, INCX, X + + FADD c1, t1, c1 + FMUL a3, b3, t1 + LDF [Y + 1 * SIZE], b2 + add Y, INCY, Y + FADD c2, t2, c2 + FMUL a4, b3, t2 + LDF [Y + 0 * SIZE], b3 + + FADD c3, t3, c3 + FMUL a3, b4, t3 + LDF [X + 0 * SIZE], a3 + FADD c4, t4, c4 + FMUL a4, b4, t4 + LDF [X + 1 * SIZE], a4 + add X, INCX, X + + FADD c1, t1, c1 + FMUL a5, b5, t1 + LDF [Y + 1 * SIZE], b4 + add Y, INCY, Y + FADD c2, t2, c2 + FMUL a6, b5, t2 + LDF [Y + 0 * SIZE], b5 + + FADD c3, t3, c3 + FMUL a5, b6, t3 + LDF [X + 0 * SIZE], a5 + FADD c4, t4, c4 + FMUL a6, b6, t4 + LDF [X + 1 * SIZE], a6 + add X, INCX, X + + FADD c1, t1, c1 + FMUL a7, b7, t1 + LDF [Y + 1 * SIZE], b6 + add Y, INCY, Y + FADD c2, t2, c2 + FMUL a8, b7, t2 + LDF [Y + 0 * SIZE], b7 + + FADD c3, t3, c3 + FMUL a7, b8, t3 + LDF [X + 0 * SIZE], a7 + FADD c4, t4, c4 + FMUL a8, b8, t4 + LDF [X + 1 * SIZE], a8 + add X, INCX, X + + LDF [Y + 1 * SIZE], b8 + bg,pt %icc, .LL51 + add Y, INCY, Y + +.LL52: + FADD c1, t1, c1 + FMUL a1, b1, t1 + FADD c2, t2, c2 + FMUL a2, b1, t2 + + FADD c3, t3, c3 + FMUL a1, b2, t3 + FADD c4, t4, c4 + FMUL a2, b2, t4 + + FADD c1, t1, c1 + FMUL a3, b3, t1 + FADD c2, t2, c2 + FMUL a4, b3, t2 + + FADD c3, t3, c3 + FMUL a3, b4, t3 + FADD c4, t4, c4 + FMUL a4, b4, t4 + + FADD c1, t1, c1 + FMUL a5, 
b5, t1 + FADD c2, t2, c2 + FMUL a6, b5, t2 + + FADD c3, t3, c3 + FMUL a5, b6, t3 + FADD c4, t4, c4 + FMUL a6, b6, t4 + + FADD c1, t1, c1 + FMUL a7, b7, t1 + FADD c2, t2, c2 + FMUL a8, b7, t2 + + FADD c3, t3, c3 + FMUL a7, b8, t3 + FADD c4, t4, c4 + FMUL a8, b8, t4 + +.LL55: + and N, 3, I + cmp I, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + add X, INCX, X + LDF [Y + 0 * SIZE], b1 + LDF [Y + 1 * SIZE], b2 + add Y, INCY, Y + + FADD c1, t1, c1 + FMUL a1, b1, t1 + FADD c2, t2, c2 + FMUL a2, b1, t2 + FADD c3, t3, c3 + FMUL a1, b2, t3 + FADD c4, t4, c4 + FMUL a2, b2, t4 + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL56 + nop + +.LL59: + FADD c1, t1, c1 + FADD c2, t2, c2 + FADD c3, t3, c3 + FADD c4, t4, c4 + +#ifndef CONJ + FSUB c1, c4, c1 + FADD c2, c3, c2 +#else + FADD c1, c4, c1 + FSUB c3, c2, c2 +#endif + +#if defined(F_INTERFACE) && defined(F_INTERFACE_F2C) + STF c1, [OUT + 0 * SIZE] + STF c2, [OUT + 1 * SIZE] +#endif + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/zgemm_kernel.S b/kernel/sparc/zgemm_kernel.S new file mode 100644 index 0000000000..b02c942e3a --- /dev/null +++ b/kernel/sparc/zgemm_kernel.S @@ -0,0 +1,1917 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %i0 +#define N %i1 +#define K %i2 +#define A %i5 +#define B %i3 +#define C %i4 + +#define LDC %o0 +#define AO %o1 +#define BO %o2 +#define I %o3 +#define J %o4 +#define L %o5 + +#define C1 %l0 +#define C2 %l1 + +#define OFFSET %l2 +#define KK %l3 +#define TEMP1 %l4 +#define TEMP2 %l5 + +#ifdef DOUBLE +#define c01 %f0 +#define c02 %f2 +#define c03 %f4 +#define c04 %f6 +#define c05 %f8 +#define c06 %f10 +#define c07 %f12 +#define c08 %f14 +#define c09 %f16 +#define c10 %f18 +#define c11 %f20 +#define c12 %f22 +#define c13 %f24 +#define c14 %f26 +#define c15 %f28 +#define c16 %f30 + +#define t1 %f32 +#define t2 %f34 +#define t3 %f36 +#define t4 %f38 + +#define a1 %f40 +#define a2 %f42 +#define a3 %f44 +#define a4 %f46 +#define a5 %f62 + +#define b1 %f48 +#define b2 %f50 +#define b3 %f52 +#define b4 %f54 +#define b5 %f56 + +#define FZERO %f58 +#define ALPHA_R %f60 +#define ALPHA_I %f62 + +#else +#define c01 %f0 +#define c02 %f1 +#define c03 %f2 +#define c04 %f3 +#define c05 %f4 +#define c06 %f5 +#define c07 %f6 +#define c08 %f7 +#define c09 %f8 +#define c10 %f9 +#define c11 %f10 +#define c12 %f11 +#define c13 %f12 +#define c14 %f13 +#define c15 %f14 +#define c16 %f15 + +#define t1 %f16 +#define t2 %f17 +#define t3 %f18 +#define t4 %f19 + +#define a1 %f20 +#define a2 %f21 +#define a3 %f22 +#define a4 %f23 +#define a5 %f31 + +#define b1 %f24 +#define b2 %f25 +#define b3 %f26 +#define b4 %f27 +#define b5 %f28 + +#define FZERO %f29 +#define ALPHA_R %f30 +#define ALPHA_I %f31 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define FADD1 FADD +#define FADD2 FADD +#define FADD3 FADD +#define FADD4 FSUB +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define FADD1 FADD +#define FADD2 FADD +#define FADD3 FSUB +#define FADD4 FADD +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define FADD1 FADD +#define FADD2 FSUB +#define FADD3 FADD +#define FADD4 FADD +#else +#define FADD1 FADD +#define FADD2 FSUB +#define FADD3 FSUB +#define FADD4 FSUB +#endif + + +#define APREFETCHSIZE 40 +#define BPREFETCHSIZE 40 + +#define APREFETCH_CATEGORY 0 +#define BPREFETCH_CATEGORY 0 + + PROLOGUE + SAVESP + +#ifndef __64BIT__ +#ifdef DOUBLE +#define STACK_ALPHA [%sp + STACK_START + 24] +#else +#define STACK_ALPHA [%sp + STACK_START + 20] +#endif +#else +#define STACK_ALPHA [%sp + STACK_START + 40] +#endif + +#ifndef __64BIT__ +#ifdef DOUBLE + st %i3, [%sp + STACK_START + 16] + st %i4, [%sp + STACK_START + 20] + st %i5, [%sp + STACK_START + 24] + + ld [%sp + STACK_START + 32], A + ld [%sp + STACK_START + 36], B + ld [%sp + STACK_START + 40], C + ld [%sp + STACK_START + 44], LDC +#ifdef TRMMKERNEL + ld [%sp + STACK_START + 48], OFFSET +#endif + ldd [%sp + STACK_START + 16], ALPHA_R + ldd [%sp + STACK_START + 24], ALPHA_I +#else + st %i3, [%sp + STACK_START + 16] + st %i4, [%sp + STACK_START + 20] + + ld [%sp + STACK_START + 28], B + ld [%sp + STACK_START + 32], C + ld [%sp + STACK_START + 36], LDC +#ifdef TRMMKERNEL + ld [%sp + STACK_START + 40], OFFSET +#endif + ld [%sp + STACK_START + 16], ALPHA_R + ld [%sp + STACK_START + 20], ALPHA_I +#endif +#else + +#ifdef DOUBLE + FMOV %f6, ALPHA_R + FMOV %f8, ALPHA_I + STF %f8, STACK_ALPHA +#else + FMOV %f7, ALPHA_R + FMOV %f9, ALPHA_I + STF %f9, STACK_ALPHA +#endif + + ldx [%sp+ STACK_START + 56], B + nop + ldx [%sp+ STACK_START + 64], C + nop + ldx [%sp+ STACK_START + 72], LDC +#ifdef TRMMKERNEL 
+ ldx [%sp+ STACK_START + 80], OFFSET +#endif + + LDF [%sp + STACK_START + 32], FZERO +#endif + +#ifdef DOUBLE + FCLR(27) +#else + FCLR(29) +#endif + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg OFFSET, KK +#endif + + sra N, 1, J + cmp J, 0 + ble,pn %icc, .LL100 + sll LDC, ZBASE_SHIFT, LDC + +.LL11: + sra M, 1, I + FMOV FZERO, t1 + add C, LDC, C2 + FMOV FZERO, t2 + + mov C, C1 + FMOV FZERO, t3 + cmp I, 0 + +#if defined(TRMMKERNEL) && defined(LEFT) + mov OFFSET, KK +#endif + + mov A, AO + add C2, LDC, C + nop + ble,pn %icc, .LL50 + FMOV FZERO, t4 + + +.LL21: +#if !defined(TRMMKERNEL) + sra K, 2, L + FMOV FZERO, c01 + cmp L, 0 + FMOV FZERO, c02 + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c03 + LDF [B + 0 * SIZE], b1 + FMOV FZERO, c04 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c05 + LDF [B + 1 * SIZE], b2 + FMOV FZERO, c06 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c07 + LDF [B + 2 * SIZE], b3 + FMOV FZERO, c08 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c09 + LDF [B + 3 * SIZE], b4 + FMOV FZERO, c10 + + LDF [B + 4 * SIZE], b5 + FMOV FZERO, c11 + LDF [AO + 4 * SIZE], a5 + FMOV FZERO, c12 + + prefetch [C1 + 3 * SIZE], 3 + FMOV FZERO, c13 + prefetch [C2 + 3 * SIZE], 3 + FMOV FZERO, c14 + mov B, BO + +#else +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov B, BO +#else + sll KK, 1 + ZBASE_SHIFT, TEMP1 + + add AO, TEMP1, AO + add B, TEMP1, BO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 2, L +#else + add KK, 2, L +#endif + sra L, 2, L + cmp L, 0 + + FMOV FZERO, c01 + FMOV FZERO, c02 + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c03 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, c04 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c05 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, c06 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c07 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, c08 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c09 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, c10 + + LDF [BO + 4 * SIZE], b5 + FMOV FZERO, c11 + LDF [AO + 4 * SIZE], a5 + FMOV FZERO, c12 + + prefetch [C1 + 3 * SIZE], 3 + FMOV FZERO, c13 + prefetch [C2 + 3 * SIZE], 3 + FMOV FZERO, c14 + +#endif + FMOV FZERO, c15 + ble,pn %icc, .LL25 + FMOV FZERO, c16 + +.LL22: + FADD2 c04, t1, c04 + prefetch [AO + APREFETCHSIZE * SIZE], APREFETCH_CATEGORY + FMUL a1, b1, t1 + nop + + FADD4 c08, t2, c08 + prefetch [BO + BPREFETCHSIZE * SIZE], BPREFETCH_CATEGORY + FMUL a1, b2, t2 + add AO, 16 * SIZE, AO + + FADD2 c12, t3, c12 + LDF [AO - 13 * SIZE], a4 + FMUL a1, b3, t3 + add BO, 16 * SIZE, BO + + FADD4 c16, t4, c16 + nop + FMUL a1, b4, t4 + LDF [AO - 8 * SIZE], a1 + + FADD1 c01, t1, c01 + nop + FMUL a2, b1, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD1 c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD3 c13, t4, c13 + add L, -1, L + FMUL a2, b4, t4 + LDF [AO - 11 * SIZE], a2 + + FADD2 c02, t1, c02 + nop + FMUL a3, b1, t1 + nop + + FADD4 c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD2 c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD4 c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO - 10 * SIZE], a3 + + FADD1 c03, t1, c03 + nop + FMUL a4, b1, t1 + LDF [BO - 8 * SIZE], b1 + + FADD3 c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO - 11 * SIZE], b2 + + FADD1 c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO - 10 * SIZE], b3 + + FADD3 c15, t4, c15 + nop + FMUL a4, b4, t4 + LDF [BO - 9 * SIZE], b4 + + FADD2 c04, t1, c04 + nop + FMUL a5, b5, t1 + LDF [AO - 9 * SIZE], a4 + + FADD4 c08, t2, c08 + nop + FMUL a5, b2, t2 + nop + + FADD2 c12, 
t3, c12 + nop + FMUL a5, b3, t3 + nop + + FADD4 c16, t4, c16 + nop + FMUL a5, b4, t4 + LDF [AO - 4 * SIZE], a5 + + FADD1 c01, t1, c01 + nop + FMUL a2, b5, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD1 c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD3 c13, t4, c13 + nop + FMUL a2, b4, t4 + LDF [AO - 7 * SIZE], a2 + + FADD2 c02, t1, c02 + nop + FMUL a3, b5, t1 + nop + + FADD4 c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD2 c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD4 c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO - 6 * SIZE], a3 + + FADD1 c03, t1, c03 + nop + FMUL a4, b5, t1 + LDF [BO - 4 * SIZE], b5 + + FADD3 c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO - 7 * SIZE], b2 + + FADD1 c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO - 6 * SIZE], b3 + + FADD3 c15, t4, c15 + nop + FMUL a4, b4, t4 + LDF [BO - 5 * SIZE], b4 + + FADD2 c04, t1, c04 + nop + FMUL a1, b1, t1 + LDF [AO - 5 * SIZE], a4 + + FADD4 c08, t2, c08 + nop + FMUL a1, b2, t2 + nop + + FADD2 c12, t3, c12 + nop + FMUL a1, b3, t3 + nop + + FADD4 c16, t4, c16 + nop + FMUL a1, b4, t4 + LDF [AO - 0 * SIZE], a1 + + FADD1 c01, t1, c01 + nop + FMUL a2, b1, t1 + nop + +#ifdef DOUBLE + prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY +#else + nop +#endif + FADD3 c05, t2, c05 + nop + FMUL a2, b2, t2 + + FADD1 c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD3 c13, t4, c13 + nop + FMUL a2, b4, t4 + nop + + FADD2 c02, t1, c02 + nop + FMUL a3, b1, t1 + LDF [AO - 3 * SIZE], a2 + + FADD4 c06, t2, c06 +#ifdef DOUBLE + prefetch [BO + (BPREFETCHSIZE + 8) * SIZE], BPREFETCH_CATEGORY +#else + nop +#endif + FMUL a3, b2, t2 + nop + + FADD2 c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD4 c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO - 2 * SIZE], a3 + + FADD1 c03, t1, c03 + nop + FMUL a4, b1, t1 + LDF [BO - 0 * SIZE], b1 + + FADD3 c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO - 3 * SIZE], b2 + + FADD1 c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO - 2 * SIZE], b3 + + FADD3 c15, t4, c15 + nop + FMUL a4, b4, t4 + LDF [BO - 1 * SIZE], b4 + + FADD2 c04, t1, c04 + nop + FMUL a5, b5, t1 + LDF [AO - 1 * SIZE], a4 + + FADD4 c08, t2, c08 + FMUL a5, b2, t2 + FADD2 c12, t3, c12 + FMUL a5, b3, t3 + + FADD4 c16, t4, c16 + nop + FMUL a5, b4, t4 + LDF [AO + 4 * SIZE], a5 + + FADD1 c01, t1, c01 + nop + FMUL a2, b5, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD1 c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD3 c13, t4, c13 + nop + FMUL a2, b4, t4 + LDF [AO + 1 * SIZE], a2 + + FADD2 c02, t1, c02 + nop + FMUL a3, b5, t1 + nop + + FADD4 c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD2 c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD4 c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO + 2 * SIZE], a3 + + FADD1 c03, t1, c03 + cmp L, 0 + FMUL a4, b5, t1 + LDF [BO + 4 * SIZE], b5 + + FADD3 c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD1 c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD3 c15, t4, c15 + FMUL a4, b4, t4 + bg,pt %icc, .LL22 + LDF [BO + 3 * SIZE], b4 + +.LL25: +#ifndef TRMMKERNEL + and K, 3, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 2, L +#else + add KK, 2, L +#endif + and L, 3, L +#endif + cmp L, 0 + ble,pn %icc, .LL29 + LDF STACK_ALPHA, ALPHA_I + +.LL26: + FADD2 c04, t1, c04 + LDF [AO + 3 * SIZE], a4 + FMUL a1, b1, t1 + add AO, 4 * SIZE, AO + + FADD4 c08, t2, c08 + add BO, 4 * SIZE, BO + FMUL a1, b2, t2 + add L, -1, L + + FADD2 c12, t3, 
c12 + nop + FMUL a1, b3, t3 + cmp L, 0 + + FADD4 c16, t4, c16 + nop + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + FADD1 c01, t1, c01 + nop + FMUL a2, b1, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD1 c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD3 c13, t4, c13 + nop + FMUL a2, b4, t4 + LDF [AO + 1 * SIZE], a2 + + FADD2 c02, t1, c02 + nop + FMUL a3, b1, t1 + nop + + FADD4 c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD2 c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD4 c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO + 2 * SIZE], a3 + + FADD1 c03, t1, c03 + nop + FMUL a4, b1, t1 + LDF [BO + 0 * SIZE], b1 + + FADD3 c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD1 c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD3 c15, t4, c15 + FMUL a4, b4, t4 + bg,pt %icc, .LL26 + LDF [BO + 3 * SIZE], b4 + +.LL29: +#ifndef TRMMKERNEL + FADD2 c04, t1, c04 + LDF [C1 + 0 * SIZE], a1 + FADD4 c08, t2, c08 + LDF [C1 + 1 * SIZE], a2 + FADD2 c12, t3, c12 + LDF [C1 + 2 * SIZE], a3 + FADD4 c16, t4, c16 + LDF [C1 + 3 * SIZE], a4 + + FADD c01, c06, c01 + LDF [C2 + 0 * SIZE], b1 + FADD c02, c05, c02 + LDF [C2 + 1 * SIZE], b2 + FADD c03, c08, c03 + LDF [C2 + 2 * SIZE], b3 + FADD c04, c07, c04 + LDF [C2 + 3 * SIZE], b4 + + FADD c09, c14, c09 + FMUL ALPHA_R, c01, t1 + FADD c10, c13, c10 + FMUL ALPHA_R, c02, t2 + FADD c11, c16, c11 + FMUL ALPHA_R, c03, t3 + FADD c12, c15, c12 + FMUL ALPHA_R, c04, t4 + + FADD a1, t1, a1 + FMUL ALPHA_I, c02, t1 + FADD a2, t2, a2 + FMUL ALPHA_I, c01, t2 + FADD a3, t3, a3 + FMUL ALPHA_I, c04, t3 + FADD a4, t4, a4 + FMUL ALPHA_I, c03, t4 + + FSUB a1, t1, a1 + FMUL ALPHA_R, c09, t1 + FADD a2, t2, a2 + FMUL ALPHA_R, c10, t2 + FSUB a3, t3, a3 + FMUL ALPHA_R, c11, t3 + FADD a4, t4, a4 + FMUL ALPHA_R, c12, t4 + + FADD b1, t1, b1 + FMUL ALPHA_I, c10, t1 + FADD b2, t2, b2 + FMUL ALPHA_I, c09, t2 + FADD b3, t3, b3 + FMUL ALPHA_I, c12, t3 + FADD b4, t4, b4 + FMUL ALPHA_I, c11, t4 + + STF a1, [C1 + 0 * SIZE] + FSUB b1, t1, b1 + STF a2, [C1 + 1 * SIZE] + FADD b2, t2, b2 + STF a3, [C1 + 2 * SIZE] + FSUB b3, t3, b3 + STF a4, [C1 + 3 * SIZE] + FADD b4, t4, b4 + + STF b1, [C2 + 0 * SIZE] + FMOV FZERO, t1 + STF b2, [C2 + 1 * SIZE] + FMOV FZERO, t2 + STF b3, [C2 + 2 * SIZE] + FMOV FZERO, t3 + STF b4, [C2 + 3 * SIZE] + FMOV FZERO, t4 +#else + FADD2 c04, t1, c04 + FADD4 c08, t2, c08 + FADD2 c12, t3, c12 + FADD4 c16, t4, c16 + + FADD c01, c06, c01 + FADD c02, c05, c02 + FADD c03, c08, c03 + FADD c04, c07, c04 + + STF c01, [C1 + 0 * SIZE] + FADD c09, c14, c09 + STF c02, [C1 + 1 * SIZE] + FADD c10, c13, c10 + STF c03, [C1 + 2 * SIZE] + FADD c11, c16, c11 + STF c04, [C1 + 3 * SIZE] + FADD c12, c15, c12 + + STF c09, [C2 + 0 * SIZE] + FMOV FZERO, t1 + STF c10, [C2 + 1 * SIZE] + FMOV FZERO, t2 + STF c11, [C2 + 2 * SIZE] + FMOV FZERO, t3 + STF c12, [C2 + 3 * SIZE] + FMOV FZERO, t4 +#endif + + add C1, 4 * SIZE, C1 + add C2, 4 * SIZE, C2 + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub K, KK, TEMP1 +#ifdef LEFT + add TEMP1, -2, TEMP1 +#else + add TEMP1, -2, TEMP1 +#endif + sll TEMP1, 1 + ZBASE_SHIFT, TEMP1 + + add AO, TEMP1, AO + add BO, TEMP1, BO +#endif + +#ifdef LEFT + add KK, 2, KK +#endif +#endif + + add I, -1, I + cmp I, 0 + + bg,pt %icc, .LL21 + FMOV FZERO, c01 + +.LL50: + and M, 1, I + FMOV FZERO, c02 + cmp I, 0 + FMOV FZERO, t1 + ble,pn %icc, .LL99 + FMOV FZERO, c04 + + +#if !defined(TRMMKERNEL) + LDF [AO + 0 * SIZE], a1 + sra K, 2, L + FMOV FZERO, t2 + LDF [B + 0 * 
SIZE], b1 + mov B, BO + FMOV FZERO, c06 + LDF [AO + 1 * SIZE], a2 + cmp L, 0 + FMOV FZERO, t3 + LDF [B + 1 * SIZE], b2 + FMOV FZERO, c08 + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, t4 + LDF [B + 2 * SIZE], b3 + FMOV FZERO, c01 + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c03 + LDF [B + 3 * SIZE], b4 + FMOV FZERO, c05 +#else +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov B, BO +#else + sll KK, 0 + ZBASE_SHIFT, TEMP1 + sll KK, 1 + ZBASE_SHIFT, TEMP2 + + add AO, TEMP1, AO + add B, TEMP2, BO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 1, L +#else + add KK, 2, L +#endif + sra L, 2, L + cmp L, 0 + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, t2 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, c06 + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, t3 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, c08 + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, t4 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, c01 + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c03 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, c05 + +#endif + ble,pn %icc, .LL55 + FMOV FZERO, c07 + +.LL52: + FADD2 c02, t1, c02 + add AO, 8 * SIZE, AO + prefetch [AO + APREFETCHSIZE * SIZE], 0 + + FMUL a1, b1, t1 + add BO, 16 * SIZE, BO + + FADD4 c04, t2, c04 + add L, -1, L + FMUL a1, b2, t2 + + FADD2 c06, t3, c06 + cmp L, 0 + FMUL a1, b3, t3 + + FADD4 c08, t4, c08 + FMUL a1, b4, t4 + LDF [AO - 4 * SIZE], a1 + + FADD1 c01, t1, c01 + FMUL a2, b1, t1 + LDF [BO - 12 * SIZE], b1 + FADD3 c03, t2, c03 + FMUL a2, b2, t2 + LDF [BO - 11 * SIZE], b2 + + FADD1 c05, t3, c05 + FMUL a2, b3, t3 + LDF [BO - 10 * SIZE], b3 + FADD3 c07, t4, c07 + FMUL a2, b4, t4 + LDF [BO - 9 * SIZE], b4 + + FADD2 c02, t1, c02 + FMUL a3, b1, t1 + LDF [AO - 3 * SIZE], a2 + FADD4 c04, t2, c04 + FMUL a3, b2, t2 + + FADD2 c06, t3, c06 + FMUL a3, b3, t3 + FADD4 c08, t4, c08 + FMUL a3, b4, t4 + LDF [AO - 2 * SIZE], a3 + + FADD1 c01, t1, c01 + FMUL a4, b1, t1 + LDF [BO - 8 * SIZE], b1 + FADD3 c03, t2, c03 + FMUL a4, b2, t2 + LDF [BO - 7 * SIZE], b2 + + FADD1 c05, t3, c05 + FMUL a4, b3, t3 + LDF [BO - 6 * SIZE], b3 + FADD3 c07, t4, c07 + FMUL a4, b4, t4 + LDF [BO - 5 * SIZE], b4 + + FADD2 c02, t1, c02 + FMUL a1, b1, t1 + LDF [AO - 1 * SIZE], a4 + FADD4 c04, t2, c04 + FMUL a1, b2, t2 + + FADD2 c06, t3, c06 + FMUL a1, b3, t3 + FADD4 c08, t4, c08 + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + FADD1 c01, t1, c01 + FMUL a2, b1, t1 + LDF [BO - 4 * SIZE], b1 + + FADD3 c03, t2, c03 + FMUL a2, b2, t2 + LDF [BO - 3 * SIZE], b2 + + FADD1 c05, t3, c05 + FMUL a2, b3, t3 + LDF [BO - 2 * SIZE], b3 + FADD3 c07, t4, c07 + FMUL a2, b4, t4 + LDF [BO - 1 * SIZE], b4 + + FADD2 c02, t1, c02 + FMUL a3, b1, t1 + LDF [AO + 1 * SIZE], a2 + FADD4 c04, t2, c04 + FMUL a3, b2, t2 + + FADD2 c06, t3, c06 + FMUL a3, b3, t3 + FADD4 c08, t4, c08 + FMUL a3, b4, t4 + LDF [AO + 2 * SIZE], a3 + + FADD1 c01, t1, c01 + FMUL a4, b1, t1 + LDF [BO + 0 * SIZE], b1 + FADD3 c03, t2, c03 + FMUL a4, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD1 c05, t3, c05 + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + FADD3 c07, t4, c07 + FMUL a4, b4, t4 + LDF [BO + 3 * SIZE], b4 + + bg,pt %icc, .LL52 + LDF [AO + 3 * SIZE], a4 + +.LL55: +#ifndef TRMMKERNEL + and K, 3, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 1, L +#else + add KK, 2, L +#endif + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + FADD2 c02, t1, c02 + add AO, 2 * SIZE, AO + FMUL a1, b1, t1 + add L, -1, L + + add BO, 4 
* SIZE, BO + FADD4 c04, t2, c04 + cmp L, 0 + FMUL a1, b2, t2 + + FADD2 c06, t3, c06 + FMUL a1, b3, t3 + FADD4 c08, t4, c08 + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + FADD1 c01, t1, c01 + FMUL a2, b1, t1 + LDF [BO + 0 * SIZE], b1 + FADD3 c03, t2, c03 + FMUL a2, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD1 c05, t3, c05 + FMUL a2, b3, t3 + LDF [BO + 2 * SIZE], b3 + FADD3 c07, t4, c07 + FMUL a2, b4, t4 + LDF [BO + 3 * SIZE], b4 + + bg,pt %icc, .LL56 + LDF [AO + 1 * SIZE], a2 + +.LL59: +#ifndef TRMMKERNEL + FADD2 c02, t1, c02 + LDF [C1 + 0 * SIZE], a1 + FADD4 c04, t2, c04 + LDF [C1 + 1 * SIZE], a2 + FADD2 c06, t3, c06 + LDF [C2 + 0 * SIZE], a3 + FADD4 c08, t4, c08 + LDF [C2 + 1 * SIZE], a4 + + FADD c01, c04, c01 + FMUL ALPHA_R, c01, t1 + FADD c02, c03, c02 + FMUL ALPHA_R, c02, t2 + FADD c05, c08, c05 + FMUL ALPHA_R, c05, t3 + FADD c06, c07, c06 + FMUL ALPHA_R, c06, t4 + + FADD a1, t1, a1 + FMUL ALPHA_I, c02, t1 + FADD a2, t2, a2 + FMUL ALPHA_I, c01, t2 + FADD a3, t3, a3 + FMUL ALPHA_I, c06, t3 + FADD a4, t4, a4 + FMUL ALPHA_I, c05, t4 + + FSUB a1, t1, a1 + FADD a2, t2, a2 + FSUB a3, t3, a3 + FADD a4, t4, a4 + + STF a1, [C1 + 0 * SIZE] + FMOV FZERO, t1 + STF a2, [C1 + 1 * SIZE] + FMOV FZERO, t2 + STF a3, [C2 + 0 * SIZE] + FMOV FZERO, t3 + STF a4, [C2 + 1 * SIZE] + FMOV FZERO, t4 +#else + FADD2 c02, t1, c02 + FADD4 c04, t2, c04 + FADD2 c06, t3, c06 + FADD4 c08, t4, c08 + + FADD c01, c04, c01 + FADD c02, c03, c02 + FADD c05, c08, c05 + FADD c06, c07, c06 + + STF c01, [C1 + 0 * SIZE] + FMOV FZERO, t1 + STF c02, [C1 + 1 * SIZE] + FMOV FZERO, t2 + STF c05, [C2 + 0 * SIZE] + FMOV FZERO, t3 + STF c06, [C2 + 1 * SIZE] + FMOV FZERO, t4 +#endif + + add C1, 2 * SIZE, C1 + add C2, 2 * SIZE, C2 + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub K, KK, TEMP1 +#ifdef LEFT + add TEMP1, -1, TEMP1 +#else + add TEMP1, -2, TEMP1 +#endif + sll TEMP1, 0 + ZBASE_SHIFT, TEMP2 + sll TEMP1, 1 + ZBASE_SHIFT, TEMP1 + + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LEFT + add KK, 1, KK +#endif +#endif + +.LL99: + add J, -1, J + mov BO, B + cmp J, 0 + bg,pt %icc, .LL11 +#if defined(TRMMKERNEL) && !defined(LEFT) + add KK, 2, KK +#else + nop +#endif + +.LL100: + sra M, 1, I + and N, 1, J + + cmp J, 0 + ble,pn %icc, .LL999 + mov A, AO + + mov C, C1 + add C, LDC, C + +#if defined(TRMMKERNEL) && defined(LEFT) + mov OFFSET, KK +#endif + + cmp I, 0 + ble,pn %icc, .LL150 + FMOV FZERO, c03 + +.LL121: +#if !defined(TRMMKERNEL) + LDF [AO + 0 * SIZE], a1 + sra K, 2, L + FMOV FZERO, t1 + LDF [B + 0 * SIZE], b1 + mov B, BO + FMOV FZERO, c07 + + LDF [AO + 1 * SIZE], a2 + cmp L, 0 + FMOV FZERO, t2 + LDF [B + 1 * SIZE], b2 + FMOV FZERO, c04 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, t3 + LDF [B + 2 * SIZE], b3 + FMOV FZERO, c08 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, t4 + LDF [B + 3 * SIZE], b4 + FMOV FZERO, c01 + + prefetch [C1 + 3 * SIZE], 3 + FMOV FZERO, c05 + FMOV FZERO, c02 +#else +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov B, BO +#else + sll KK, 1 + ZBASE_SHIFT, TEMP1 + sll KK, 0 + ZBASE_SHIFT, TEMP2 + + add AO, TEMP1, AO + add B, TEMP2, BO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 2, L +#else + add KK, 1, L +#endif + sra L, 2, L + cmp L, 0 + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, t1 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, c07 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, t2 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, c04 + + LDF [AO + 
2 * SIZE], a3 + FMOV FZERO, t3 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, c08 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, t4 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, c01 + + prefetch [C1 + 3 * SIZE], 3 + FMOV FZERO, c05 + FMOV FZERO, c02 +#endif + + ble,pn %icc, .LL125 + FMOV FZERO, c06 + +.LL122: + FADD1 c03, t1, c03 + add L, -1, L + FMUL a1, b1, t1 + prefetch [AO + APREFETCHSIZE * SIZE], 0 + + FADD3 c07, t2, c07 + add BO, 8 * SIZE, BO + FMUL a1, b2, t2 + LDF [AO + 4 * SIZE], a1 + + FADD2 c04, t3, c04 + add AO, 16 * SIZE, AO + FMUL a2, b1, t3 + cmp L, 0 + + FADD4 c08, t4, c08 + nop + FMUL a2, b2, t4 + LDF [AO - 11 * SIZE], a2 + + FADD1 c01, t1, c01 + nop + FMUL a3, b1, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a3, b2, t2 + LDF [AO - 10 * SIZE], a3 + + FADD2 c02, t3, c02 + nop + FMUL a4, b1, t3 + LDF [BO - 4 * SIZE], b1 + + FADD4 c06, t4, c06 + nop + FMUL a4, b2, t4 + LDF [BO - 3 * SIZE], b2 + + FADD1 c03, t1, c03 + nop + FMUL a1, b3, t1 + LDF [AO - 9 * SIZE], a4 + + FADD3 c07, t2, c07 + nop + FMUL a1, b4, t2 + LDF [AO - 8 * SIZE], a1 + + FADD2 c04, t3, c04 + nop + FMUL a2, b3, t3 + nop + + FADD4 c08, t4, c08 + nop + FMUL a2, b4, t4 + LDF [AO - 7 * SIZE], a2 + + FADD1 c01, t1, c01 + nop + FMUL a3, b3, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a3, b4, t2 + LDF [AO - 6 * SIZE], a3 + + FADD2 c02, t3, c02 + nop + FMUL a4, b3, t3 + LDF [BO - 2 * SIZE], b3 + + FADD4 c06, t4, c06 + nop + FMUL a4, b4, t4 + LDF [BO - 1 * SIZE], b4 + + FADD1 c03, t1, c03 + nop + FMUL a1, b1, t1 + LDF [AO - 5 * SIZE], a4 + + FADD3 c07, t2, c07 + nop + FMUL a1, b2, t2 + LDF [AO - 4 * SIZE], a1 + + FADD2 c04, t3, c04 + nop + FMUL a2, b1, t3 + nop + + FADD4 c08, t4, c08 + nop + FMUL a2, b2, t4 + LDF [AO - 3 * SIZE], a2 + + FADD1 c01, t1, c01 + nop + FMUL a3, b1, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a3, b2, t2 + LDF [AO - 2 * SIZE], a3 + + FADD2 c02, t3, c02 + nop + FMUL a4, b1, t3 + LDF [BO + 0 * SIZE], b1 + + FADD4 c06, t4, c06 + nop + FMUL a4, b2, t4 + LDF [BO + 1 * SIZE], b2 + + FADD1 c03, t1, c03 + nop + FMUL a1, b3, t1 + LDF [AO - 1 * SIZE], a4 + + FADD3 c07, t2, c07 + nop + FMUL a1, b4, t2 + LDF [AO + 0 * SIZE], a1 + + FADD2 c04, t3, c04 + nop + FMUL a2, b3, t3 + nop + + FADD4 c08, t4, c08 + nop + FMUL a2, b4, t4 + LDF [AO + 1 * SIZE], a2 + + FADD1 c01, t1, c01 + nop + FMUL a3, b3, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a3, b4, t2 + LDF [AO + 2 * SIZE], a3 + + FADD2 c02, t3, c02 + nop + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD4 c06, t4, c06 + FMUL a4, b4, t4 + LDF [AO + 3 * SIZE], a4 + + bg,pt %icc, .LL122 + LDF [BO + 3 * SIZE], b4 + +.LL125: +#ifndef TRMMKERNEL + and K, 3, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 2, L +#else + add KK, 1, L +#endif + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL129 + nop + +.LL126: + FADD1 c03, t1, c03 + add AO, 4 * SIZE, AO + FMUL a1, b1, t1 + add BO, 2 * SIZE, BO + + FADD3 c07, t2, c07 + add L, -1, L + FMUL a1, b2, t2 + LDF [AO + 0 * SIZE], a1 + + FADD2 c04, t3, c04 + cmp L, 0 + FMUL a2, b1, t3 + + FADD4 c08, t4, c08 + FMUL a2, b2, t4 + LDF [AO + 1 * SIZE], a2 + + FADD1 c01, t1, c01 + FMUL a3, b1, t1 + FADD3 c05, t2, c05 + FMUL a3, b2, t2 + LDF [AO + 2 * SIZE], a3 + + FADD2 c02, t3, c02 + FMUL a4, b1, t3 + LDF [BO + 0 * SIZE], b1 + FADD4 c06, t4, c06 + FMUL a4, b2, t4 + LDF [BO + 1 * SIZE], b2 + bg,pt %icc, .LL126 + LDF [AO + 3 * SIZE], a4 + +.LL129: +#ifndef TRMMKERNEL + FADD1 c03, t1, c03 + LDF [C1 + 0 * SIZE], a1 + FADD3 c07, t2, c07 + LDF [C1 + 1 * 
SIZE], a2 + FADD2 c04, t3, c04 + LDF [C1 + 2 * SIZE], a3 + FADD4 c08, t4, c08 + LDF [C1 + 3 * SIZE], a4 + + FADD c01, c06, c01 + FMUL ALPHA_R, c01, t1 + FADD c02, c05, c02 + FMUL ALPHA_R, c02, t2 + FADD c03, c08, c03 + FMUL ALPHA_R, c03, t3 + FADD c04, c07, c04 + FMUL ALPHA_R, c04, t4 + + FADD a1, t1, a1 + FMUL ALPHA_I, c02, t1 + FADD a2, t2, a2 + FMUL ALPHA_I, c01, t2 + FADD a3, t3, a3 + FMUL ALPHA_I, c04, t3 + FADD a4, t4, a4 + FMUL ALPHA_I, c03, t4 + + FSUB a1, t1, a1 + FADD a2, t2, a2 + FSUB a3, t3, a3 + FADD a4, t4, a4 + + STF a1, [C1 + 0 * SIZE] + FMOV FZERO, t1 + STF a2, [C1 + 1 * SIZE] + FMOV FZERO, t2 + STF a3, [C1 + 2 * SIZE] + FMOV FZERO, t3 + STF a4, [C1 + 3 * SIZE] + FMOV FZERO, t4 +#else + FADD1 c03, t1, c03 + FADD3 c07, t2, c07 + FADD2 c04, t3, c04 + FADD4 c08, t4, c08 + + FADD c01, c06, c01 + FADD c02, c05, c02 + FADD c03, c08, c03 + FADD c04, c07, c04 + + STF c01, [C1 + 0 * SIZE] + FMOV FZERO, t1 + STF c02, [C1 + 1 * SIZE] + FMOV FZERO, t2 + STF c03, [C1 + 2 * SIZE] + FMOV FZERO, t3 + STF c04, [C1 + 3 * SIZE] + FMOV FZERO, t4 +#endif + + add C1, 4 * SIZE, C1 + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub K, KK, TEMP1 +#ifdef LEFT + add TEMP1, -2, TEMP1 +#else + add TEMP1, -1, TEMP1 +#endif + + sll TEMP1, 1 + ZBASE_SHIFT, TEMP2 + sll TEMP1, 0 + ZBASE_SHIFT, TEMP1 + + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LEFT + add KK, 2, KK +#endif +#endif + + add I, -1, I + cmp I, 0 + + bg,pt %icc, .LL121 + FMOV FZERO, c03 + +.LL150: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL999 + nop + +#if !defined(TRMMKERNEL) + LDF [AO + 0 * SIZE], a1 + sra K, 2, L + FMOV FZERO, c01 + + LDF [B + 0 * SIZE], b1 + mov B, BO + FMOV FZERO, t1 + + LDF [AO + 1 * SIZE], a2 + cmp L, 0 + FMOV FZERO, c02 + LDF [B + 1 * SIZE], b2 + FMOV FZERO, t2 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + LDF [B + 2 * SIZE], b3 + FMOV FZERO, t3 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [B + 3 * SIZE], b4 + FMOV FZERO, t4 +#else +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov B, BO +#else + sll KK, 0 + ZBASE_SHIFT, TEMP1 + sll KK, 0 + ZBASE_SHIFT, TEMP2 + + add AO, TEMP1, AO + add B, TEMP2, BO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 1, L +#else + add KK, 1, L +#endif + sra L, 2, L + cmp L, 0 + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c01 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t1 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c02 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t2 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, t3 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, t4 +#endif + + ble,pn %icc, .LL155 + nop + +.LL152: + FADD1 c01, t1, c01 + add L, -1, L + FMUL a1, b1, t1 + prefetch [AO + APREFETCHSIZE * SIZE], 0 + + FADD3 c02, t2, c02 + add BO, 8 * SIZE, BO + FMUL a1, b2, t2 + LDF [AO + 4 * SIZE], a1 + + FADD2 c03, t3, c03 + cmp L, 0 + FMUL a2, b1, t3 + LDF [BO - 4 * SIZE], b1 + + FADD4 c04, t4, c04 + nop + FMUL a2, b2, t4 + LDF [AO + 5 * SIZE], a2 + + FADD1 c01, t1, c01 + nop + FMUL a3, b3, t1 + LDF [BO - 3 * SIZE], b2 + + FADD3 c02, t2, c02 + nop + FMUL a3, b4, t2 + LDF [AO + 6 * SIZE], a3 + + FADD2 c03, t3, c03 + nop + FMUL a4, b3, t3 + LDF [BO - 2 * SIZE], b3 + + FADD4 c04, t4, c04 + nop + FMUL a4, b4, t4 + LDF [AO + 7 * SIZE], a4 + + FADD1 c01, t1, c01 + nop + FMUL a1, b1, t1 + LDF [BO - 1 * SIZE], b4 + + FADD3 c02, t2, c02 + 
FMUL a1, b2, t2 + LDF [AO + 8 * SIZE], a1 + + FADD2 c03, t3, c03 + FMUL a2, b1, t3 + LDF [BO + 0 * SIZE], b1 + + FADD4 c04, t4, c04 + FMUL a2, b2, t4 + LDF [AO + 9 * SIZE], a2 + + FADD1 c01, t1, c01 + FMUL a3, b3, t1 + LDF [BO + 1 * SIZE], b2 + + FADD3 c02, t2, c02 + FMUL a3, b4, t2 + LDF [AO + 10 * SIZE], a3 + + FADD2 c03, t3, c03 + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD4 c04, t4, c04 + FMUL a4, b4, t4 + LDF [AO + 11 * SIZE], a4 + + add AO, 8 * SIZE, AO + bg,pt %icc, .LL152 + LDF [BO + 3 * SIZE], b4 + +.LL155: +#ifndef TRMMKERNEL + and K, 3, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 1, L +#else + add KK, 1, L +#endif + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL159 + nop + +.LL156: + FADD1 c01, t1, c01 + add AO, 2 * SIZE, AO + FMUL a1, b1, t1 + add BO, 2 * SIZE, BO + FADD3 c02, t2, c02 + add L, -1, L + FMUL a1, b2, t2 + LDF [AO + 0 * SIZE], a1 + FADD2 c03, t3, c03 + FMUL a2, b1, t3 + LDF [BO + 0 * SIZE], b1 + cmp L, 0 + FADD4 c04, t4, c04 + FMUL a2, b2, t4 + LDF [BO + 1 * SIZE], b2 + + bg,pt %icc, .LL156 + LDF [AO + 1 * SIZE], a2 + +.LL159: +#ifndef TRMMKERNEL + FADD1 c01, t1, c01 + FADD3 c02, t2, c02 + FADD2 c03, t3, c03 + FADD4 c04, t4, c04 + + LDF [C1 + 0 * SIZE], a1 + LDF [C1 + 1 * SIZE], a2 + + FADD c01, c04, c01 + FADD c02, c03, c02 + + FMUL ALPHA_R, c01, t1 + FMUL ALPHA_R, c02, t2 + FMUL ALPHA_I, c02, t3 + FMUL ALPHA_I, c01, t4 + + FADD a1, t1, a1 + FADD a2, t2, a2 + FSUB a1, t3, a1 + FADD a2, t4, a2 + + STF a1, [C1 + 0 * SIZE] + STF a2, [C1 + 1 * SIZE] +#else + FADD1 c01, t1, c01 + FADD3 c02, t2, c02 + FADD2 c03, t3, c03 + FADD4 c04, t4, c04 + + FADD c01, c04, c01 + FADD c02, c03, c02 + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] +#endif + + add C1, 2 * SIZE, C1 + +#ifndef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub K, KK, TEMP1 +#ifdef LEFT + add TEMP1, -1, TEMP1 +#else + add TEMP1, -1, TEMP1 +#endif + sll TEMP1, 0 + ZBASE_SHIFT, TEMP2 + sll TEMP1, 0 + ZBASE_SHIFT, TEMP1 + + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LEFT + add KK, 1, KK +#endif +#endif + +.LL999: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/zgemm_kernel_1x4.S b/kernel/sparc/zgemm_kernel_1x4.S new file mode 100644 index 0000000000..03397fd5c8 --- /dev/null +++ b/kernel/sparc/zgemm_kernel_1x4.S @@ -0,0 +1,1599 @@ +/*********************************************************************/ +/* Copyright 2005-2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define APREFETCHSIZE 24 +#define APREFETCH_CATEGORY 0 + +#define M %i0 +#define N %i1 +#define K %i2 +#define A %i5 +#define B %i3 +#define C %i4 + +#define LDC %o0 +#define AO %o1 +#define BO %o2 +#define I %o3 +#define J %o4 +#define L %o5 + +#define BB %o7 + +#define C1 %l0 +#define C2 %l1 +#define C3 %l2 +#define C4 %l3 + +#define OFFSET %l4 +#define KK %l5 +#define TEMP1 %l6 +#define TEMP2 %l7 + +#ifdef DOUBLE +#define c01 %f0 +#define c02 %f2 +#define c03 %f4 +#define c04 %f6 +#define c05 %f8 +#define c06 %f10 +#define c07 %f12 +#define c08 %f14 +#define c09 %f16 +#define c10 %f18 +#define c11 %f20 +#define c12 %f22 +#define c13 %f24 +#define c14 %f26 +#define c15 %f28 +#define c16 %f30 + +#define a1 %f32 +#define a2 %f34 +#define a3 %f36 +#define a4 %f38 +#define a5 %f40 + +#define b1 %f42 +#define b2 %f44 +#define b3 %f46 +#define b4 %f48 +#define b5 %f50 +#define b6 %f52 +#define b7 %f54 +#define b8 %f56 +#define b9 %f58 + +#define ALPHA_R %f60 +#define ALPHA_I %f62 + +#define cc01 0 +#define cc02 2 +#define cc03 4 +#define cc04 6 +#define cc05 8 +#define cc06 10 +#define cc07 12 +#define cc08 14 +#define cc09 16 +#define cc10 18 +#define cc11 20 +#define cc12 22 +#define cc13 24 +#define cc14 26 +#define cc15 28 +#define cc16 30 + +#define aa1 1 +#define aa2 3 +#define aa3 5 +#define aa4 7 +#define aa5 9 + +#define bb1 11 +#define bb2 13 +#define bb3 15 +#define bb4 17 +#define bb5 19 +#define bb6 21 +#define bb7 23 +#define bb8 25 +#define bb9 27 + +#define alpha_r 29 +#define alpha_i 31 +#else +#define c01 %f0 +#define c02 %f1 +#define c03 %f2 +#define c04 %f3 +#define c05 %f4 +#define c06 %f5 +#define c07 %f6 +#define c08 %f7 +#define c09 %f8 +#define c10 %f9 +#define c11 %f10 +#define c12 %f11 +#define c13 %f12 +#define c14 %f13 +#define c15 %f14 +#define c16 %f15 + +#define a1 %f16 +#define a2 %f17 +#define a3 %f18 +#define a4 %f19 +#define a5 %f20 + +#define b1 %f21 +#define b2 %f22 +#define b3 %f23 +#define b4 %f24 +#define b5 %f25 +#define b6 %f26 +#define b7 %f27 +#define b8 %f28 +#define b9 %f29 + +#define ALPHA_R %f30 +#define ALPHA_I %f31 + +#define cc01 0 +#define cc02 1 +#define cc03 2 +#define cc04 3 +#define cc05 4 +#define cc06 5 +#define cc07 6 +#define cc08 7 +#define cc09 8 +#define cc10 9 +#define cc11 10 +#define cc12 11 +#define cc13 12 +#define cc14 13 +#define cc15 14 +#define cc16 15 + +#define aa1 16 +#define aa2 17 +#define aa3 18 +#define aa4 19 +#define aa5 20 + +#define bb1 21 +#define bb2 22 +#define bb3 23 +#define bb4 24 +#define bb5 25 +#define bb6 26 +#define bb7 27 +#define bb8 28 
+#define bb9 29 + +#define alpha_r 30 +#define alpha_i 31 + +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define FMADD1 FMADD +#define FMADD2 FMADD +#define FMADD3 FMADD +#define FMADD4 FNMSUB +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define FMADD1 FMADD +#define FMADD2 FMADD +#define FMADD3 FNMSUB +#define FMADD4 FMADD +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define FMADD1 FMADD +#define FMADD2 FNMSUB +#define FMADD3 FMADD +#define FMADD4 FMADD +#else +#define FMADD1 FMADD +#define FMADD2 FNMSUB +#define FMADD3 FNMSUB +#define FMADD4 FNMSUB +#endif + + .register %g2, #scratch + .register %g3, #scratch + + PROLOGUE + SAVESP + +#ifndef __64BIT__ +#ifdef DOUBLE + st %i3, [%sp + STACK_START + 16] + st %i4, [%sp + STACK_START + 20] + st %i5, [%sp + STACK_START + 24] + + ld [%sp + STACK_START + 32], A + ld [%sp + STACK_START + 36], B + ld [%sp + STACK_START + 40], C + ld [%sp + STACK_START + 44], LDC +#ifdef TRMMKERNEL + ld [%sp + STACK_START + 48], OFFSET +#endif + + ldd [%sp + STACK_START + 16], ALPHA_R + ldd [%sp + STACK_START + 24], ALPHA_I +#else + st %i3, [%sp + STACK_START + 16] + st %i4, [%sp + STACK_START + 20] + + ld [%sp + STACK_START + 28], B + ld [%sp + STACK_START + 32], C + ld [%sp + STACK_START + 36], LDC +#ifdef TRMMKERNEL + ld [%sp + STACK_START + 40], OFFSET +#endif + + ld [%sp + STACK_START + 16], ALPHA_R + ld [%sp + STACK_START + 20], ALPHA_I +#endif +#else + ldx [%sp + STACK_START + 56], B + ldx [%sp + STACK_START + 64], C + ldx [%sp + STACK_START + 72], LDC +#ifdef TRMMKERNEL + ldx [%sp + STACK_START + 80], OFFSET +#endif + +#ifdef DOUBLE + FMOV %f6, ALPHA_R + FMOV %f8, ALPHA_I +#else + FMOV %f7, ALPHA_R + FMOV %f9, ALPHA_I +#endif +#endif + +#if defined(TRMMKERNEL) && !defined(LEFT) + neg OFFSET, KK +#endif + + cmp M, 0 + ble,pn %icc, .LL999 + nop + + sra N, 2, J + cmp J, 0 + ble,pn %icc, .LL20 + sll LDC, ZBASE_SHIFT, LDC + +.LL11: + mov C, C1 + add C, LDC, C2 + add C2, LDC, C3 + add C3, LDC, C4 + add C4, LDC, C + + sll K, ZBASE_SHIFT + 2, BB + +#if defined(TRMMKERNEL) && defined(LEFT) + mov OFFSET, KK +#endif + + mov A, AO + + mov M, I + add B, BB, BB + .align 4 + +.LL12: + prefetch [BB + 0 * SIZE], 1 +#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))) + mov B, BO +#else + sll KK, ZBASE_SHIFT + 0, TEMP1 + sll KK, ZBASE_SHIFT + 2, TEMP2 + + add AO, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + FCLR (cc01) + LDF [AO + 1 * SIZE], a2 + FCLR (cc05) + LDF [AO + 8 * SIZE], a5 + FCLR (cc09) + LDF [BO + 0 * SIZE], b1 + FCLR (cc13) + + LDF [BO + 1 * SIZE], b2 + FCLR (cc02) + LDF [BO + 2 * SIZE], b3 + FCLR (cc06) + LDF [BO + 3 * SIZE], b4 + FCLR (cc10) + LDF [BO + 4 * SIZE], b5 + FCLR (cc14) + + LDF [BO + 5 * SIZE], b6 + FCLR (cc03) + LDF [BO + 6 * SIZE], b7 + FCLR (cc07) + LDF [BO + 7 * SIZE], b8 + FCLR (cc11) + LDF [BO + 8 * SIZE], b9 + FCLR (cc15) + + prefetch [C1 + 1 * SIZE], 3 + FCLR (cc04) + prefetch [C2 + 2 * SIZE], 3 + FCLR (cc08) + prefetch [C3 + 1 * SIZE], 3 + FCLR (cc12) + prefetch [C4 + 2 * SIZE], 3 + FCLR (cc16) + +#ifndef TRMMKERNEL + sra K, 3, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 1, L +#else + add KK, 4, L +#endif + sra L, 3, L +#endif + cmp L, 0 + ble,pn %icc, .LL15 + add BB, 32 * SIZE, BB + .align 4 + +.LL13: + FMADD1 (aa1, bb1, cc01, cc01) + FMADD2 (aa2, bb1, cc02, cc02) + FMADD3 (aa1, bb2, 
cc03, cc03) + FMADD4 (aa2, bb2, cc04, cc04) + + FMADD1 (aa1, bb3, cc05, cc05) + LDF [BO + 16 * SIZE], b1 + FMADD2 (aa2, bb3, cc06, cc06) + LDF [BO + 9 * SIZE], b2 + + FMADD3 (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD1 (aa1, bb5, cc09, cc09) + LDF [AO + 2 * SIZE], a3 + FMADD2 (aa2, bb5, cc10, cc10) + LDF [AO + 3 * SIZE], a4 + + FMADD3 (aa1, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + FMADD4 (aa2, bb6, cc12, cc12) + nop + + FMADD1 (aa1, bb7, cc13, cc13) + LDF [BO + 12 * SIZE], b5 + FMADD2 (aa2, bb7, cc14, cc14) + LDF [BO + 13 * SIZE], b6 + + FMADD3 (aa1, bb8, cc15, cc15) + LDF [BO + 14 * SIZE], b7 + FMADD4 (aa2, bb8, cc16, cc16) + LDF [BO + 15 * SIZE], b8 + + FMADD1 (aa3, bb9, cc01, cc01) + FMADD2 (aa4, bb9, cc02, cc02) + FMADD3 (aa3, bb2, cc03, cc03) + FMADD4 (aa4, bb2, cc04, cc04) + + FMADD1 (aa3, bb3, cc05, cc05) + LDF [BO + 24 * SIZE], b9 + FMADD2 (aa4, bb3, cc06, cc06) + LDF [BO + 17 * SIZE], b2 + + FMADD3 (aa3, bb4, cc07, cc07) + LDF [BO + 18 * SIZE], b3 + FMADD4 (aa4, bb4, cc08, cc08) + LDF [BO + 19 * SIZE], b4 + + FMADD1 (aa3, bb5, cc09, cc09) + LDF [AO + 4 * SIZE], a1 + FMADD2 (aa4, bb5, cc10, cc10) + LDF [AO + 5 * SIZE], a2 + + FMADD3 (aa3, bb6, cc11, cc11) + add L, -1, L + FMADD4 (aa4, bb6, cc12, cc12) + nop + + FMADD1 (aa3, bb7, cc13, cc13) + LDF [BO + 20 * SIZE], b5 + FMADD2 (aa4, bb7, cc14, cc14) + LDF [BO + 21 * SIZE], b6 + + FMADD3 (aa3, bb8, cc15, cc15) + LDF [BO + 22 * SIZE], b7 + FMADD4 (aa4, bb8, cc16, cc16) + LDF [BO + 23 * SIZE], b8 + + FMADD1 (aa1, bb1, cc01, cc01) + FMADD2 (aa2, bb1, cc02, cc02) + FMADD3 (aa1, bb2, cc03, cc03) + FMADD4 (aa2, bb2, cc04, cc04) + + FMADD1 (aa1, bb3, cc05, cc05) + LDF [BO + 32 * SIZE], b1 + FMADD2 (aa2, bb3, cc06, cc06) + LDF [BO + 25 * SIZE], b2 + + FMADD3 (aa1, bb4, cc07, cc07) + LDF [BO + 26 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 27 * SIZE], b4 + + FMADD1 (aa1, bb5, cc09, cc09) + LDF [AO + 6 * SIZE], a3 + FMADD2 (aa2, bb5, cc10, cc10) + LDF [AO + 7 * SIZE], a4 + + FMADD3 (aa1, bb6, cc11, cc11) + nop + FMADD4 (aa2, bb6, cc12, cc12) + nop + + FMADD1 (aa1, bb7, cc13, cc13) + LDF [BO + 28 * SIZE], b5 + FMADD2 (aa2, bb7, cc14, cc14) + LDF [BO + 29 * SIZE], b6 + + FMADD3 (aa1, bb8, cc15, cc15) + LDF [BO + 30 * SIZE], b7 + FMADD4 (aa2, bb8, cc16, cc16) + LDF [BO + 31 * SIZE], b8 + + FMADD1 (aa3, bb9, cc01, cc01) + FMADD2 (aa4, bb9, cc02, cc02) + FMADD3 (aa3, bb2, cc03, cc03) + FMADD4 (aa4, bb2, cc04, cc04) + + FMADD1 (aa3, bb3, cc05, cc05) + LDF [BO + 40 * SIZE], b9 + FMADD2 (aa4, bb3, cc06, cc06) + LDF [BO + 33 * SIZE], b2 + + FMADD3 (aa3, bb4, cc07, cc07) + LDF [BO + 34 * SIZE], b3 + FMADD4 (aa4, bb4, cc08, cc08) + LDF [BO + 35 * SIZE], b4 + + FMADD1 (aa3, bb5, cc09, cc09) + LDF [AO + 16 * SIZE], a1 /****/ + FMADD2 (aa4, bb5, cc10, cc10) + LDF [AO + 9 * SIZE], a2 + + FMADD3 (aa3, bb6, cc11, cc11) + nop + FMADD4 (aa4, bb6, cc12, cc12) + nop + + FMADD1 (aa3, bb7, cc13, cc13) + LDF [BO + 36 * SIZE], b5 + FMADD2 (aa4, bb7, cc14, cc14) + LDF [BO + 37 * SIZE], b6 + + FMADD3 (aa3, bb8, cc15, cc15) + LDF [BO + 38 * SIZE], b7 + FMADD4 (aa4, bb8, cc16, cc16) + LDF [BO + 39 * SIZE], b8 + + FMADD1 (aa5, bb1, cc01, cc01) + FMADD2 (aa2, bb1, cc02, cc02) + FMADD3 (aa5, bb2, cc03, cc03) + FMADD4 (aa2, bb2, cc04, cc04) + + FMADD1 (aa5, bb3, cc05, cc05) + LDF [BO + 48 * SIZE], b1 + FMADD2 (aa2, bb3, cc06, cc06) + LDF [BO + 41 * SIZE], b2 + + FMADD3 (aa5, bb4, cc07, cc07) + LDF [BO + 42 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, 
cc08) + LDF [BO + 43 * SIZE], b4 + + FMADD1 (aa5, bb5, cc09, cc09) + LDF [AO + 10 * SIZE], a3 + FMADD2 (aa2, bb5, cc10, cc10) + LDF [AO + 11 * SIZE], a4 + + FMADD3 (aa5, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY + FMADD4 (aa2, bb6, cc12, cc12) + nop + + FMADD1 (aa5, bb7, cc13, cc13) + LDF [BO + 44 * SIZE], b5 + FMADD2 (aa2, bb7, cc14, cc14) + LDF [BO + 45 * SIZE], b6 + + FMADD3 (aa5, bb8, cc15, cc15) + LDF [BO + 46 * SIZE], b7 + FMADD4 (aa2, bb8, cc16, cc16) + LDF [BO + 47 * SIZE], b8 + + FMADD1 (aa3, bb9, cc01, cc01) + FMADD2 (aa4, bb9, cc02, cc02) + FMADD3 (aa3, bb2, cc03, cc03) + FMADD4 (aa4, bb2, cc04, cc04) + + FMADD1 (aa3, bb3, cc05, cc05) + LDF [BO + 56 * SIZE], b9 + FMADD2 (aa4, bb3, cc06, cc06) + LDF [BO + 49 * SIZE], b2 + + FMADD3 (aa3, bb4, cc07, cc07) + LDF [BO + 50 * SIZE], b3 + FMADD4 (aa4, bb4, cc08, cc08) + LDF [BO + 51 * SIZE], b4 + + FMADD1 (aa3, bb5, cc09, cc09) + LDF [AO + 12 * SIZE], a5 + FMADD2 (aa4, bb5, cc10, cc10) + LDF [AO + 13 * SIZE], a2 + + FMADD3 (aa3, bb6, cc11, cc11) + cmp L, 0 + FMADD4 (aa4, bb6, cc12, cc12) + nop + + FMADD1 (aa3, bb7, cc13, cc13) + LDF [BO + 52 * SIZE], b5 + FMADD2 (aa4, bb7, cc14, cc14) + LDF [BO + 53 * SIZE], b6 + + FMADD3 (aa3, bb8, cc15, cc15) + LDF [BO + 54 * SIZE], b7 + FMADD4 (aa4, bb8, cc16, cc16) + LDF [BO + 55 * SIZE], b8 + + FMADD1 (aa5, bb1, cc01, cc01) + FMADD2 (aa2, bb1, cc02, cc02) + FMADD3 (aa5, bb2, cc03, cc03) + FMADD4 (aa2, bb2, cc04, cc04) + + FMADD1 (aa5, bb3, cc05, cc05) + LDF [BO + 64 * SIZE], b1 + FMADD2 (aa2, bb3, cc06, cc06) + LDF [BO + 57 * SIZE], b2 + + FMADD3 (aa5, bb4, cc07, cc07) + LDF [BO + 58 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 59 * SIZE], b4 + + FMADD1 (aa5, bb5, cc09, cc09) + LDF [AO + 14 * SIZE], a3 + FMADD2 (aa2, bb5, cc10, cc10) + LDF [AO + 15 * SIZE], a4 + + FMADD3 (aa5, bb6, cc11, cc11) + add BO, 64 * SIZE, BO + FMADD4 (aa2, bb6, cc12, cc12) + add AO, 16 * SIZE, AO + + FMADD1 (aa5, bb7, cc13, cc13) + LDF [BO - 4 * SIZE], b5 + FMADD2 (aa2, bb7, cc14, cc14) + LDF [BO - 3 * SIZE], b6 + + FMADD3 (aa5, bb8, cc15, cc15) + LDF [BO - 2 * SIZE], b7 + FMADD4 (aa2, bb8, cc16, cc16) + LDF [BO - 1 * SIZE], b8 + + FMADD1 (aa3, bb9, cc01, cc01) + FMADD2 (aa4, bb9, cc02, cc02) + FMADD3 (aa3, bb2, cc03, cc03) + FMADD4 (aa4, bb2, cc04, cc04) + + FMADD1 (aa3, bb3, cc05, cc05) + LDF [BO + 8 * SIZE], b9 + FMADD2 (aa4, bb3, cc06, cc06) + LDF [BO + 1 * SIZE], b2 + + FMADD3 (aa3, bb4, cc07, cc07) + LDF [BO + 2 * SIZE], b3 + FMADD4 (aa4, bb4, cc08, cc08) + LDF [BO + 3 * SIZE], b4 + + FMADD1 (aa3, bb5, cc09, cc09) + LDF [AO + 8 * SIZE], a5 /****/ + FMADD2 (aa4, bb5, cc10, cc10) + LDF [AO + 1 * SIZE], a2 + + FMADD3 (aa3, bb6, cc11, cc11) + FMADD4 (aa4, bb6, cc12, cc12) + + FMADD1 (aa3, bb7, cc13, cc13) + LDF [BO + 4 * SIZE], b5 + FMADD2 (aa4, bb7, cc14, cc14) + LDF [BO + 5 * SIZE], b6 + + FMADD3 (aa3, bb8, cc15, cc15) + LDF [BO + 6 * SIZE], b7 + FMADD4 (aa4, bb8, cc16, cc16) + ble,pn %icc, .LL15 + LDF [BO + 7 * SIZE], b8 + + FMADD1 (aa1, bb1, cc01, cc01) + FMADD2 (aa2, bb1, cc02, cc02) + FMADD3 (aa1, bb2, cc03, cc03) + FMADD4 (aa2, bb2, cc04, cc04) + + FMADD1 (aa1, bb3, cc05, cc05) + LDF [BO + 16 * SIZE], b1 + FMADD2 (aa2, bb3, cc06, cc06) + LDF [BO + 9 * SIZE], b2 + + FMADD3 (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD1 (aa1, bb5, cc09, cc09) + LDF [AO + 2 * SIZE], a3 + FMADD2 (aa2, bb5, cc10, cc10) + LDF [AO + 3 * SIZE], a4 + + FMADD3 (aa1, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 0) 
* SIZE], APREFETCH_CATEGORY + FMADD4 (aa2, bb6, cc12, cc12) + nop + + FMADD1 (aa1, bb7, cc13, cc13) + LDF [BO + 12 * SIZE], b5 + FMADD2 (aa2, bb7, cc14, cc14) + LDF [BO + 13 * SIZE], b6 + + FMADD3 (aa1, bb8, cc15, cc15) + LDF [BO + 14 * SIZE], b7 + FMADD4 (aa2, bb8, cc16, cc16) + LDF [BO + 15 * SIZE], b8 + + FMADD1 (aa3, bb9, cc01, cc01) + FMADD2 (aa4, bb9, cc02, cc02) + FMADD3 (aa3, bb2, cc03, cc03) + FMADD4 (aa4, bb2, cc04, cc04) + + FMADD1 (aa3, bb3, cc05, cc05) + LDF [BO + 24 * SIZE], b9 + FMADD2 (aa4, bb3, cc06, cc06) + LDF [BO + 17 * SIZE], b2 + + FMADD3 (aa3, bb4, cc07, cc07) + LDF [BO + 18 * SIZE], b3 + FMADD4 (aa4, bb4, cc08, cc08) + LDF [BO + 19 * SIZE], b4 + + FMADD1 (aa3, bb5, cc09, cc09) + LDF [AO + 4 * SIZE], a1 + FMADD2 (aa4, bb5, cc10, cc10) + LDF [AO + 5 * SIZE], a2 + + FMADD3 (aa3, bb6, cc11, cc11) + add L, -1, L + FMADD4 (aa4, bb6, cc12, cc12) + nop + + FMADD1 (aa3, bb7, cc13, cc13) + LDF [BO + 20 * SIZE], b5 + FMADD2 (aa4, bb7, cc14, cc14) + LDF [BO + 21 * SIZE], b6 + + FMADD3 (aa3, bb8, cc15, cc15) + LDF [BO + 22 * SIZE], b7 + FMADD4 (aa4, bb8, cc16, cc16) + LDF [BO + 23 * SIZE], b8 + + FMADD1 (aa1, bb1, cc01, cc01) + FMADD2 (aa2, bb1, cc02, cc02) + FMADD3 (aa1, bb2, cc03, cc03) + FMADD4 (aa2, bb2, cc04, cc04) + + FMADD1 (aa1, bb3, cc05, cc05) + LDF [BO + 32 * SIZE], b1 + FMADD2 (aa2, bb3, cc06, cc06) + LDF [BO + 25 * SIZE], b2 + + FMADD3 (aa1, bb4, cc07, cc07) + LDF [BO + 26 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 27 * SIZE], b4 + + FMADD1 (aa1, bb5, cc09, cc09) + LDF [AO + 6 * SIZE], a3 + FMADD2 (aa2, bb5, cc10, cc10) + LDF [AO + 7 * SIZE], a4 + + FMADD3 (aa1, bb6, cc11, cc11) + nop + FMADD4 (aa2, bb6, cc12, cc12) + nop + + FMADD1 (aa1, bb7, cc13, cc13) + LDF [BO + 28 * SIZE], b5 + FMADD2 (aa2, bb7, cc14, cc14) + LDF [BO + 29 * SIZE], b6 + + FMADD3 (aa1, bb8, cc15, cc15) + LDF [BO + 30 * SIZE], b7 + FMADD4 (aa2, bb8, cc16, cc16) + LDF [BO + 31 * SIZE], b8 + + FMADD1 (aa3, bb9, cc01, cc01) + FMADD2 (aa4, bb9, cc02, cc02) + FMADD3 (aa3, bb2, cc03, cc03) + FMADD4 (aa4, bb2, cc04, cc04) + + FMADD1 (aa3, bb3, cc05, cc05) + LDF [BO + 40 * SIZE], b9 + FMADD2 (aa4, bb3, cc06, cc06) + LDF [BO + 33 * SIZE], b2 + + FMADD3 (aa3, bb4, cc07, cc07) + LDF [BO + 34 * SIZE], b3 + FMADD4 (aa4, bb4, cc08, cc08) + LDF [BO + 35 * SIZE], b4 + + FMADD1 (aa3, bb5, cc09, cc09) + LDF [AO + 16 * SIZE], a1 /****/ + FMADD2 (aa4, bb5, cc10, cc10) + LDF [AO + 9 * SIZE], a2 + + FMADD3 (aa3, bb6, cc11, cc11) + nop + FMADD4 (aa4, bb6, cc12, cc12) + nop + + FMADD1 (aa3, bb7, cc13, cc13) + LDF [BO + 36 * SIZE], b5 + FMADD2 (aa4, bb7, cc14, cc14) + LDF [BO + 37 * SIZE], b6 + + FMADD3 (aa3, bb8, cc15, cc15) + LDF [BO + 38 * SIZE], b7 + FMADD4 (aa4, bb8, cc16, cc16) + LDF [BO + 39 * SIZE], b8 + + FMADD1 (aa5, bb1, cc01, cc01) + FMADD2 (aa2, bb1, cc02, cc02) + FMADD3 (aa5, bb2, cc03, cc03) + FMADD4 (aa2, bb2, cc04, cc04) + + FMADD1 (aa5, bb3, cc05, cc05) + LDF [BO + 48 * SIZE], b1 + FMADD2 (aa2, bb3, cc06, cc06) + LDF [BO + 41 * SIZE], b2 + + FMADD3 (aa5, bb4, cc07, cc07) + LDF [BO + 42 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 43 * SIZE], b4 + + FMADD1 (aa5, bb5, cc09, cc09) + LDF [AO + 10 * SIZE], a3 + FMADD2 (aa2, bb5, cc10, cc10) + LDF [AO + 11 * SIZE], a4 + + FMADD3 (aa5, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY + FMADD4 (aa2, bb6, cc12, cc12) + nop + + FMADD1 (aa5, bb7, cc13, cc13) + LDF [BO + 44 * SIZE], b5 + FMADD2 (aa2, bb7, cc14, cc14) + LDF [BO + 45 * SIZE], b6 + + FMADD3 (aa5, bb8, cc15, cc15) + LDF [BO + 46 * SIZE], b7 + 
FMADD4 (aa2, bb8, cc16, cc16) + LDF [BO + 47 * SIZE], b8 + + FMADD1 (aa3, bb9, cc01, cc01) + FMADD2 (aa4, bb9, cc02, cc02) + FMADD3 (aa3, bb2, cc03, cc03) + FMADD4 (aa4, bb2, cc04, cc04) + + FMADD1 (aa3, bb3, cc05, cc05) + LDF [BO + 56 * SIZE], b9 + FMADD2 (aa4, bb3, cc06, cc06) + LDF [BO + 49 * SIZE], b2 + + FMADD3 (aa3, bb4, cc07, cc07) + LDF [BO + 50 * SIZE], b3 + FMADD4 (aa4, bb4, cc08, cc08) + LDF [BO + 51 * SIZE], b4 + + FMADD1 (aa3, bb5, cc09, cc09) + LDF [AO + 12 * SIZE], a5 + FMADD2 (aa4, bb5, cc10, cc10) + LDF [AO + 13 * SIZE], a2 + + FMADD3 (aa3, bb6, cc11, cc11) + cmp L, 0 + FMADD4 (aa4, bb6, cc12, cc12) + nop + + FMADD1 (aa3, bb7, cc13, cc13) + LDF [BO + 52 * SIZE], b5 + FMADD2 (aa4, bb7, cc14, cc14) + LDF [BO + 53 * SIZE], b6 + + FMADD3 (aa3, bb8, cc15, cc15) + LDF [BO + 54 * SIZE], b7 + FMADD4 (aa4, bb8, cc16, cc16) + LDF [BO + 55 * SIZE], b8 + + FMADD1 (aa5, bb1, cc01, cc01) + FMADD2 (aa2, bb1, cc02, cc02) + FMADD3 (aa5, bb2, cc03, cc03) + FMADD4 (aa2, bb2, cc04, cc04) + + FMADD1 (aa5, bb3, cc05, cc05) + LDF [BO + 64 * SIZE], b1 + FMADD2 (aa2, bb3, cc06, cc06) + LDF [BO + 57 * SIZE], b2 + + FMADD3 (aa5, bb4, cc07, cc07) + LDF [BO + 58 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 59 * SIZE], b4 + + FMADD1 (aa5, bb5, cc09, cc09) + LDF [AO + 14 * SIZE], a3 + FMADD2 (aa2, bb5, cc10, cc10) + LDF [AO + 15 * SIZE], a4 + + FMADD3 (aa5, bb6, cc11, cc11) + add BO, 64 * SIZE, BO + FMADD4 (aa2, bb6, cc12, cc12) + add AO, 16 * SIZE, AO + + FMADD1 (aa5, bb7, cc13, cc13) + LDF [BO - 4 * SIZE], b5 + FMADD2 (aa2, bb7, cc14, cc14) + LDF [BO - 3 * SIZE], b6 + + FMADD3 (aa5, bb8, cc15, cc15) + LDF [BO - 2 * SIZE], b7 + FMADD4 (aa2, bb8, cc16, cc16) + LDF [BO - 1 * SIZE], b8 + + FMADD1 (aa3, bb9, cc01, cc01) + FMADD2 (aa4, bb9, cc02, cc02) + FMADD3 (aa3, bb2, cc03, cc03) + FMADD4 (aa4, bb2, cc04, cc04) + + FMADD1 (aa3, bb3, cc05, cc05) + LDF [BO + 8 * SIZE], b9 + FMADD2 (aa4, bb3, cc06, cc06) + LDF [BO + 1 * SIZE], b2 + + FMADD3 (aa3, bb4, cc07, cc07) + LDF [BO + 2 * SIZE], b3 + FMADD4 (aa4, bb4, cc08, cc08) + LDF [BO + 3 * SIZE], b4 + + FMADD1 (aa3, bb5, cc09, cc09) + LDF [AO + 8 * SIZE], a5 /****/ + FMADD2 (aa4, bb5, cc10, cc10) + LDF [AO + 1 * SIZE], a2 + + FMADD3 (aa3, bb6, cc11, cc11) + FMADD4 (aa4, bb6, cc12, cc12) + + FMADD1 (aa3, bb7, cc13, cc13) + LDF [BO + 4 * SIZE], b5 + FMADD2 (aa4, bb7, cc14, cc14) + LDF [BO + 5 * SIZE], b6 + + FMADD3 (aa3, bb8, cc15, cc15) + LDF [BO + 6 * SIZE], b7 + FMADD4 (aa4, bb8, cc16, cc16) + bg,pt %icc, .LL13 + LDF [BO + 7 * SIZE], b8 + .align 4 + +.LL15: +#ifndef TRMMKERNEL + and K, 7, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 1, L +#else + add KK, 4, L +#endif + and L, 7, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL18 + nop + .align 4 + +.LL17: + FMADD1 (aa1, bb1, cc01, cc01) + add L, -1, L + FMADD2 (aa2, bb1, cc02, cc02) + nop + + FMADD3 (aa1, bb2, cc03, cc03) + LDF [BO + 8 * SIZE], b1 + FMADD4 (aa2, bb2, cc04, cc04) + LDF [BO + 9 * SIZE], b2 + + FMADD1 (aa1, bb3, cc05, cc05) + cmp L, 0 + FMADD2 (aa2, bb3, cc06, cc06) + nop + + FMADD3 (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD1 (aa1, bb5, cc09, cc09) + nop + FMADD2 (aa2, bb5, cc10, cc10) + nop + + FMADD3 (aa1, bb6, cc11, cc11) + LDF [BO + 12 * SIZE], b5 + FMADD4 (aa2, bb6, cc12, cc12) + LDF [BO + 13 * SIZE], b6 + + FMADD1 (aa1, bb7, cc13, cc13) + add AO, 2 * SIZE, AO + FMADD2 (aa2, bb7, cc14, cc14) + add BO, 8 * SIZE, BO + + 
FMADD3 (aa1, bb8, cc15, cc15) + LDF [AO + 0 * SIZE], a1 + FMADD4 (aa2, bb8, cc16, cc16) + LDF [AO + 1 * SIZE], a2 + + LDF [BO + 6 * SIZE], b7 + bg,pt %icc, .LL17 + LDF [BO + 7 * SIZE], b8 + nop + .align 4 + +.LL18: +#ifndef TRMMKERNEL + LDF [C1 + 0 * SIZE], a1 + FADD c01, c04, c01 + LDF [C1 + 1 * SIZE], a2 + FADD c02, c03, c02 + LDF [C2 + 0 * SIZE], a3 + FADD c05, c08, c05 + LDF [C2 + 1 * SIZE], a4 + FADD c06, c07, c06 + + LDF [C3 + 0 * SIZE], b1 + FADD c09, c12, c09 + LDF [C3 + 1 * SIZE], b2 + FADD c10, c11, c10 + LDF [C4 + 0 * SIZE], b3 + FADD c13, c16, c13 + LDF [C4 + 1 * SIZE], b4 + FADD c14, c15, c14 + + FMADD (alpha_r, cc01, aa1, aa1) + FMADD (alpha_r, cc02, aa2, aa2) + FMADD (alpha_r, cc05, aa3, aa3) + FMADD (alpha_r, cc06, aa4, aa4) + + FMADD (alpha_r, cc09, bb1, bb1) + FMADD (alpha_r, cc10, bb2, bb2) + FMADD (alpha_r, cc13, bb3, bb3) + FMADD (alpha_r, cc14, bb4, bb4) + +#else + FADD c01, c04, c01 + FADD c02, c03, c02 + FADD c05, c08, c05 + FADD c06, c07, c06 + + FADD c09, c12, c09 + FADD c10, c11, c10 + FADD c13, c16, c13 + FADD c14, c15, c14 + + FMUL ALPHA_R, c01, a1 + FMUL ALPHA_R, c02, a2 + FMUL ALPHA_R, c05, a3 + FMUL ALPHA_R, c06, a4 + + FMUL ALPHA_R, c09, b1 + FMUL ALPHA_R, c10, b2 + FMUL ALPHA_R, c13, b3 + FMUL ALPHA_R, c14, b4 +#endif + + FNMSUB (alpha_i, cc02, aa1, aa1) + FMADD (alpha_i, cc01, aa2, aa2) + FNMSUB (alpha_i, cc06, aa3, aa3) + FMADD (alpha_i, cc05, aa4, aa4) + + FNMSUB (alpha_i, cc10, bb1, bb1) + STF a1, [C1 + 0 * SIZE] + FMADD (alpha_i, cc09, bb2, bb2) + STF a2, [C1 + 1 * SIZE] + FNMSUB (alpha_i, cc14, bb3, bb3) + STF a3, [C2 + 0 * SIZE] + FMADD (alpha_i, cc13, bb4, bb4) + STF a4, [C2 + 1 * SIZE] + + STF b1, [C3 + 0 * SIZE] + add C1, 2 * SIZE, C1 + STF b2, [C3 + 1 * SIZE] + add C2, 2 * SIZE, C2 + STF b3, [C4 + 0 * SIZE] + add C3, 2 * SIZE, C3 + STF b4, [C4 + 1 * SIZE] + add C4, 2 * SIZE, C4 + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub K, KK, TEMP1 +#ifdef LEFT + add TEMP1, -1, TEMP1 +#else + add TEMP1, -4, TEMP1 +#endif + sll TEMP1, ZBASE_SHIFT + 0, TEMP2 + sll TEMP1, ZBASE_SHIFT + 2, TEMP1 + + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LEFT + add KK, 1, KK +#endif +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL12 + nop + +#if defined(TRMMKERNEL) && !defined(LEFT) + add KK, 4, KK +#endif + + add J, -1, J + cmp J, 0 + bg,pt %icc, .LL11 + mov BO, B + .align 4 + +.LL20: + and N, 2, J + cmp J, 0 + ble,pn %icc, .LL30 + mov C, C1 + + add C, LDC, C2 + add C2, LDC, C + +#if defined(TRMMKERNEL) && defined(LEFT) + mov OFFSET, KK +#endif + + mov M, I + mov A, AO + .align 4 + +.LL22: +#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))) + mov B, BO +#else + sll KK, ZBASE_SHIFT + 0, TEMP1 + sll KK, ZBASE_SHIFT + 1, TEMP2 + + add AO, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + LDF [BO + 3 * SIZE], b4 + LDF [BO + 4 * SIZE], b5 + FCLR (cc01) + + LDF [BO + 5 * SIZE], b6 + FCLR (cc02) + LDF [BO + 6 * SIZE], b7 + FCLR (cc03) + LDF [BO + 7 * SIZE], b8 + FCLR (cc04) + LDF [BO + 8 * SIZE], b9 + FCLR (cc05) + + prefetch [C1 + 2 * SIZE], 3 + FCLR (cc06) + prefetch [C2 + 2 * SIZE], 3 + FCLR (cc07) + +#ifndef TRMMKERNEL + sra K, 2, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 1, L +#else + add KK, 2, L 
+#endif + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL25 + FCLR (cc08) + .align 4 + +.LL23: + FMADD1 (aa1, bb1, cc01, cc01) + LDF [AO + 2 * SIZE], a3 + FMADD2 (aa2, bb1, cc02, cc02) + LDF [AO + 3 * SIZE], a4 + + FMADD3 (aa1, bb2, cc03, cc03) + LDF [BO + 16 * SIZE], b1 + FMADD4 (aa2, bb2, cc04, cc04) + LDF [BO + 9 * SIZE], b2 + + FMADD1 (aa1, bb3, cc05, cc05) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + FMADD2 (aa2, bb3, cc06, cc06) + add L, -1, L + + FMADD3 (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD1 (aa3, bb5, cc01, cc01) + LDF [AO + 4 * SIZE], a1 + FMADD2 (aa4, bb5, cc02, cc02) + LDF [AO + 5 * SIZE], a2 + + FMADD3 (aa3, bb6, cc03, cc03) + LDF [BO + 12 * SIZE], b5 + FMADD4 (aa4, bb6, cc04, cc04) + LDF [BO + 13 * SIZE], b6 + + FMADD1 (aa3, bb7, cc05, cc05) + cmp L, 0 + FMADD2 (aa4, bb7, cc06, cc06) + add AO, 8 * SIZE, AO + + FMADD3 (aa3, bb8, cc07, cc07) + LDF [BO + 14 * SIZE], b7 + FMADD4 (aa4, bb8, cc08, cc08) + LDF [BO + 15 * SIZE], b8 + + FMADD1 (aa1, bb9, cc01, cc01) + LDF [AO - 2 * SIZE], a3 + FMADD2 (aa2, bb9, cc02, cc02) + LDF [AO - 1 * SIZE], a4 + + FMADD3 (aa1, bb2, cc03, cc03) + LDF [BO + 24 * SIZE], b9 + FMADD4 (aa2, bb2, cc04, cc04) + LDF [BO + 17 * SIZE], b2 + + FMADD1 (aa1, bb3, cc05, cc05) + add BO, 16 * SIZE, BO + FMADD2 (aa2, bb3, cc06, cc06) + nop + + FMADD3 (aa1, bb4, cc07, cc07) + LDF [BO + 2 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 3 * SIZE], b4 + + FMADD1 (aa3, bb5, cc01, cc01) + LDF [AO + 0 * SIZE], a1 + FMADD2 (aa4, bb5, cc02, cc02) + LDF [AO + 1 * SIZE], a2 + FMADD3 (aa3, bb6, cc03, cc03) + LDF [BO + 4 * SIZE], b5 + FMADD4 (aa4, bb6, cc04, cc04) + LDF [BO + 5 * SIZE], b6 + + FMADD1 (aa3, bb7, cc05, cc05) + nop + FMADD2 (aa4, bb7, cc06, cc06) + LDF [BO + 6 * SIZE], b7 + + FMADD3 (aa3, bb8, cc07, cc07) + FMADD4 (aa4, bb8, cc08, cc08) + bg,pt %icc, .LL23 + LDF [BO + 7 * SIZE], b8 + .align 4 + +.LL25: +#ifndef TRMMKERNEL + and K, 3, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 1, L +#else + add KK, 2, L +#endif + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL28 + nop + .align 4 + +.LL27: + FMADD1 (aa1, bb1, cc01, cc01) + add L, -1, L + FMADD2 (aa2, bb1, cc02, cc02) + LDF [BO + 4 * SIZE], b1 + + FMADD3 (aa1, bb2, cc03, cc03) + add AO, 2 * SIZE, AO + FMADD4 (aa2, bb2, cc04, cc04) + LDF [BO + 5 * SIZE], b2 + + FMADD1 (aa1, bb3, cc05, cc05) + cmp L, 0 + FMADD2 (aa2, bb3, cc06, cc06) + LDF [BO + 6 * SIZE], b3 + + FMADD3 (aa1, bb4, cc07, cc07) + LDF [AO + 0 * SIZE], a1 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [AO + 1 * SIZE], a2 + + LDF [BO + 7 * SIZE], b4 + bg,pt %icc, .LL27 + add BO, 4 * SIZE, BO + .align 4 + +.LL28: +#ifndef TRMMKERNEL + LDF [C1 + 0 * SIZE], a1 + FADD c01, c04, c01 + LDF [C1 + 1 * SIZE], a2 + FADD c02, c03, c02 + LDF [C2 + 0 * SIZE], a3 + FADD c05, c08, c05 + LDF [C2 + 1 * SIZE], a4 + FADD c06, c07, c06 + + FMADD (alpha_r, cc01, aa1, aa1) + FMADD (alpha_r, cc02, aa2, aa2) + FMADD (alpha_r, cc05, aa3, aa3) + FMADD (alpha_r, cc06, aa4, aa4) +#else + FADD c01, c04, c01 + FADD c02, c03, c02 + FADD c05, c08, c05 + FADD c06, c07, c06 + + FMUL ALPHA_R, c01, a1 + FMUL ALPHA_R, c02, a2 + FMUL ALPHA_R, c05, a3 + FMUL ALPHA_R, c06, a4 +#endif + + FNMSUB (alpha_i, cc02, aa1, aa1) + FMADD (alpha_i, cc01, aa2, aa2) + FNMSUB (alpha_i, cc06, aa3, aa3) + FMADD (alpha_i, cc05, aa4, aa4) + + STF a1, [C1 + 0 * SIZE] + add I, -1, I + STF a2, [C1 + 1 * SIZE] + cmp I, 0 
+ STF a3, [C2 + 0 * SIZE] + add C1, 2 * SIZE, C1 + STF a4, [C2 + 1 * SIZE] + add C2, 2 * SIZE, C2 + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub K, KK, TEMP1 +#ifdef LEFT + add TEMP1, -1, TEMP1 +#else + add TEMP1, -2, TEMP1 +#endif + sll TEMP1, ZBASE_SHIFT + 0, TEMP2 + sll TEMP1, ZBASE_SHIFT + 1, TEMP1 + + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LEFT + add KK, 1, KK +#endif +#endif + + bg,pt %icc, .LL22 + nop + +#if defined(TRMMKERNEL) && !defined(LEFT) + add KK, 2, KK +#endif + + mov BO, B + .align 4 + +.LL30: + and N, 1, J + cmp J, 0 + ble,pn %icc, .LL999 + mov C, C1 + +#if defined(TRMMKERNEL) && defined(LEFT) + mov OFFSET, KK +#endif + + mov M, I + mov A, AO + .align 4 + +.LL32: +#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && ((defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)))) + mov B, BO +#else + sll KK, ZBASE_SHIFT + 0, TEMP1 + sll KK, ZBASE_SHIFT + 0, TEMP2 + + add AO, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + FCLR (cc01) + LDF [BO + 3 * SIZE], b4 + FCLR (cc02) + + LDF [BO + 4 * SIZE], b5 + FCLR (cc03) + LDF [BO + 5 * SIZE], b6 + FCLR (cc04) + LDF [BO + 6 * SIZE], b7 + FCLR (cc05) + LDF [BO + 7 * SIZE], b8 + FCLR (cc06) + + prefetch [C1 + 2 * SIZE], 3 + FCLR (cc07) + +#ifndef TRMMKERNEL + sra K, 2, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 1, L +#else + add KK, 1, L +#endif + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL35 + FCLR (cc08) + .align 4 + +.LL33: + FMADD1 (aa1, bb1, cc01, cc01) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + FMADD2 (aa2, bb1, cc02, cc02) + LDF [BO + 8 * SIZE], b1 + + FMADD3 (aa1, bb2, cc03, cc03) + LDF [AO + 4 * SIZE], a1 + FMADD4 (aa2, bb2, cc04, cc04) + LDF [AO + 5 * SIZE], a2 + + FMADD1 (aa3, bb3, cc01, cc01) + LDF [BO + 9 * SIZE], b2 + FMADD2 (aa4, bb3, cc02, cc02) + LDF [BO + 10 * SIZE], b3 + + FMADD3 (aa3, bb4, cc03, cc03) + LDF [AO + 6 * SIZE], a3 + FMADD4 (aa4, bb4, cc04, cc04) + LDF [AO + 7 * SIZE], a4 + + FMADD1 (aa1, bb5, cc01, cc01) + LDF [BO + 11 * SIZE], b4 + FMADD2 (aa2, bb5, cc02, cc02) + LDF [BO + 12 * SIZE], b5 + + FMADD3 (aa1, bb6, cc03, cc03) + LDF [AO + 8 * SIZE], a1 + FMADD4 (aa2, bb6, cc04, cc04) + LDF [AO + 9 * SIZE], a2 + + FMADD1 (aa3, bb7, cc01, cc01) + LDF [BO + 13 * SIZE], b6 + + FMADD2 (aa4, bb7, cc02, cc02) + LDF [BO + 14 * SIZE], b7 + + FMADD3 (aa3, bb8, cc03, cc03) + LDF [AO + 10 * SIZE], a3 + FMADD4 (aa4, bb8, cc04, cc04) + LDF [AO + 11 * SIZE], a4 + + add AO, 8 * SIZE, AO + add L, -1, L + add BO, 8 * SIZE, BO + cmp L, 0 + + bg,pt %icc, .LL33 + LDF [BO + 7 * SIZE], b8 + .align 4 + +.LL35: +#ifndef TRMMKERNEL + and K, 3, L +#else +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub K, KK, L +#elif defined(LEFT) + add KK, 1, L +#else + add KK, 1, L +#endif + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL38 + nop + .align 4 + +.LL37: + FMADD1 (aa1, bb1, cc01, cc01) + add L, -1, L + FMADD2 (aa2, bb1, cc02, cc02) + LDF [BO + 2 * SIZE], b1 + + FMADD3 (aa1, bb2, cc03, cc03) + LDF [AO + 2 * SIZE], a1 + FMADD4 (aa2, bb2, cc04, cc04) + LDF [AO + 3 * SIZE], a2 + + add AO, 2 * SIZE, AO + cmp L, 0 + add BO, 2 * SIZE, BO + bg,pt %icc, .LL37 + LDF [BO + 1 * SIZE], b2 + .align 4 + +.LL38: +#ifndef 
TRMMKERNEL + LDF [C1 + 0 * SIZE], a1 + FADD c01, c04, c01 + LDF [C1 + 1 * SIZE], a2 + FADD c02, c03, c02 + + FMADD (alpha_r, cc01, aa1, aa1) + FMADD (alpha_r, cc02, aa2, aa2) +#else + FADD c01, c04, c01 + FADD c02, c03, c02 + + FMUL ALPHA_R, c01, a1 + FMUL ALPHA_R, c02, a2 +#endif + + FNMSUB (alpha_i, cc02, aa1, aa1) + FMADD (alpha_i, cc01, aa2, aa2) + + STF a1, [C1 + 0 * SIZE] + STF a2, [C1 + 1 * SIZE] + +#ifdef TRMMKERNEL +#if ( defined(LEFT) && defined(TRANSA)) || \ + (!defined(LEFT) && !defined(TRANSA)) + sub K, KK, TEMP1 +#ifdef LEFT + add TEMP1, -1, TEMP1 +#else + add TEMP1, -1, TEMP1 +#endif + sll TEMP1, ZBASE_SHIFT + 0, TEMP2 + sll TEMP1, ZBASE_SHIFT + 0, TEMP1 + + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LEFT + add KK, 1, KK +#endif +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL32 + add C1, 2 * SIZE, C1 + .align 4 + +.LL999: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/zgemm_ncopy.S b/kernel/sparc/zgemm_ncopy.S new file mode 100644 index 0000000000..2b0c398bf5 --- /dev/null +++ b/kernel/sparc/zgemm_ncopy.S @@ -0,0 +1,250 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %i0 +#define N %i1 +#define A %i2 +#define LDA %i3 +#define B %i4 + +#define A1 %l0 +#define A2 %l1 + +#define I %l4 +#define J %l5 + +#ifdef DOUBLE +#define c01 %f0 +#define c02 %f2 +#define c03 %f4 +#define c04 %f6 +#define c05 %f8 +#define c06 %f10 +#define c07 %f12 +#define c08 %f14 +#define c09 %f16 +#define c10 %f18 +#define c11 %f20 +#define c12 %f22 +#define c13 %f24 +#define c14 %f26 +#define c15 %f28 +#define c16 %f30 +#else +#define c01 %f0 +#define c02 %f1 +#define c03 %f2 +#define c04 %f3 +#define c05 %f4 +#define c06 %f5 +#define c07 %f6 +#define c08 %f7 +#define c09 %f8 +#define c10 %f9 +#define c11 %f10 +#define c12 %f11 +#define c13 %f12 +#define c14 %f13 +#define c15 %f14 +#define c16 %f15 +#endif + + PROLOGUE + SAVESP + + sra N, 1, J + cmp J, 0 + ble,pn %icc, .LL100 + sll LDA, ZBASE_SHIFT, LDA + +.LL11: + add A, LDA, A2 + mov A, A1 + sra M, 2, I + cmp I, 0 + + ble,pn %icc, .LL15 + add A2, LDA, A + +#define PREFETCHSIZE 36 +#define WPREFETCHSIZE 20 + +.LL12: + prefetch [A1 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 0 * SIZE], c01 + LDF [A1 + 1 * SIZE], c02 + LDF [A2 + 0 * SIZE], c03 + LDF [A2 + 1 * SIZE], c04 + + LDF [A1 + 2 * SIZE], c05 + LDF [A1 + 3 * SIZE], c06 + LDF [A2 + 2 * SIZE], c07 + LDF [A2 + 3 * SIZE], c08 + + prefetch [A2 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 4 * SIZE], c09 + LDF [A1 + 5 * SIZE], c10 + LDF [A2 + 4 * SIZE], c11 + LDF [A2 + 5 * SIZE], c12 + + LDF [A1 + 6 * SIZE], c13 + LDF [A1 + 7 * SIZE], c14 + LDF [A2 + 6 * SIZE], c15 + LDF [A2 + 7 * SIZE], c16 + + prefetch [B + (WPREFETCHSIZE + 0) * SIZE], 2 + + STF c01, [B + 0 * SIZE] + add A1, 8 * SIZE, A1 + STF c02, [B + 1 * SIZE] + add A2, 8 * SIZE, A2 + STF c03, [B + 2 * SIZE] + add I, -1, I + STF c04, [B + 3 * SIZE] + cmp I, 0 + STF c05, [B + 4 * SIZE] + STF c06, [B + 5 * SIZE] + STF c07, [B + 6 * SIZE] + STF c08, [B + 7 * SIZE] +#ifdef DOUBLE + prefetch [B + (WPREFETCHSIZE + 8) * SIZE], 2 +#endif + STF c09, [B + 8 * SIZE] + STF c10, [B + 9 * SIZE] + STF c11, [B + 10 * SIZE] + STF c12, [B + 11 * SIZE] + STF c13, [B + 12 * SIZE] + STF c14, [B + 13 * SIZE] + STF c15, [B + 14 * SIZE] + STF c16, [B + 15 * SIZE] + bg,pt %icc, .LL12 + add B, 16 * SIZE, B + +.LL15: + and M, 3, I + cmp I, 0 + ble,pn %icc, .LL99 + nop + +.LL16: + LDF [A1 + 0 * SIZE], c01 + LDF [A1 + 1 * SIZE], c02 + add A1, 2 * SIZE, A1 + LDF [A2 + 0 * SIZE], c03 + LDF [A2 + 1 * SIZE], c04 + add A2, 2 * SIZE, A2 + + STF c01, [B + 0 * SIZE] + add I, -1, I + STF c02, [B + 1 * SIZE] + cmp I, 0 + STF c03, [B + 2 * SIZE] + STF c04, [B + 3 * SIZE] + bg,pt %icc, .LL16 + add B, 4 * SIZE, B + +.LL99: + add J, -1, J + cmp J, 0 + bg,pt %icc, .LL11 + nop + +.LL100: + and N, 1, J + cmp J, 0 + ble,pn %icc, .LL999 + nop + +.LL111: + sra M, 2, I + cmp I, 0 + ble,pn %icc, .LL115 + mov A, A1 + + +.LL112: + LDF [A1 + 0 * SIZE], c01 + LDF [A1 + 1 * SIZE], c02 + LDF [A1 + 2 * SIZE], c03 + LDF [A1 + 3 * SIZE], c04 + LDF [A1 + 4 * SIZE], c05 + LDF [A1 + 5 * SIZE], c06 + LDF [A1 + 6 * SIZE], c07 + LDF [A1 + 7 * SIZE], c08 + add A1, 8 * SIZE, A1 + + STF c01, [B + 0 * SIZE] + add I, -1, I + STF c02, [B + 1 * SIZE] + cmp I, 0 + STF c03, [B + 2 * SIZE] + STF c04, [B + 3 * SIZE] + STF c05, [B + 4 * SIZE] + STF c06, [B + 5 * SIZE] + STF c07, [B + 6 * SIZE] + STF c08, [B + 7 * SIZE] + + bg,pt %icc, .LL112 + add B, 8 * SIZE, B + +.LL115: + and M, 3, I + cmp I, 0 + ble,pn %icc, .LL999 + nop + +.LL116: + LDF [A1 + 0 * SIZE], c01 + add I, -1, I 
+ LDF [A1 + 1 * SIZE], c02 + add A1, 2 * SIZE, A1 + cmp I, 0 + + STF c01, [B + 0 * SIZE] + STF c02, [B + 1 * SIZE] + bg,pt %icc, .LL116 + add B, 2 * SIZE, B + +.LL999: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/zgemm_tcopy.S b/kernel/sparc/zgemm_tcopy.S new file mode 100644 index 0000000000..55537618d4 --- /dev/null +++ b/kernel/sparc/zgemm_tcopy.S @@ -0,0 +1,305 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %i0 +#define N %i1 +#define A %i2 +#define LDA %i3 +#define B %i4 + +#define A1 %l0 +#define A2 %l1 + +#define I %l4 +#define J %l5 + +#define B1 %o0 +#define B2 %o1 +#define M4 %o4 + +#ifdef DOUBLE +#define c01 %f0 +#define c02 %f2 +#define c03 %f4 +#define c04 %f6 +#define c05 %f8 +#define c06 %f10 +#define c07 %f12 +#define c08 %f14 +#define c09 %f16 +#define c10 %f18 +#define c11 %f20 +#define c12 %f22 +#define c13 %f24 +#define c14 %f26 +#define c15 %f28 +#define c16 %f30 +#else +#define c01 %f0 +#define c02 %f1 +#define c03 %f2 +#define c04 %f3 +#define c05 %f4 +#define c06 %f5 +#define c07 %f6 +#define c08 %f7 +#define c09 %f8 +#define c10 %f9 +#define c11 %f10 +#define c12 %f11 +#define c13 %f12 +#define c14 %f13 +#define c15 %f14 +#define c16 %f15 +#endif + + PROLOGUE + SAVESP + + sll M, BASE_SHIFT + 2, M4 + + and N, -2, B2 + sll M, ZBASE_SHIFT, B1 + smul B1, B2, B2 + add B, B2, B2 + + sra M, 1, J + cmp J, 0 + ble,pn %icc, .LL100 + sll LDA, ZBASE_SHIFT, LDA + +.LL11: + add A, LDA, A2 + mov A, A1 + sra N, 2, I + cmp I, 0 + + mov B, B1 + add B, 8 * SIZE, B + + ble,pn %icc, .LL15 + add A2, LDA, A + +#define PREFETCHSIZE 16 + +.LL12: + prefetch [A1 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A1 + 0 * SIZE], c01 + LDF [A1 + 1 * SIZE], c02 + LDF [A1 + 2 * SIZE], c03 + LDF [A1 + 3 * SIZE], c04 + + LDF [A1 + 4 * SIZE], c05 + LDF [A1 + 5 * SIZE], c06 + LDF [A1 + 6 * SIZE], c07 + LDF [A1 + 7 * SIZE], c08 + + prefetch [A2 + (PREFETCHSIZE + 0) * SIZE], 0 + LDF [A2 + 0 * SIZE], c09 + LDF [A2 + 1 * SIZE], c10 + LDF [A2 + 2 * SIZE], c11 + LDF [A2 + 3 * SIZE], c12 + + LDF [A2 + 4 * SIZE], c13 + LDF [A2 + 5 * SIZE], c14 + LDF [A2 + 6 * SIZE], c15 + LDF [A2 + 7 * SIZE], c16 + + prefetch [B1 + (PREFETCHSIZE + 0) * SIZE], 2 + + STF c01, [B1 + 0 * SIZE] + add A1, 8 * SIZE, A1 + STF c02, [B1 + 1 * SIZE] + add A2, 8 * SIZE, A2 + STF c03, [B1 + 2 * SIZE] + STF c04, [B1 + 3 * SIZE] + STF c09, [B1 + 4 * SIZE] + add I, -1, I + STF c10, [B1 + 5 * SIZE] + cmp I, 0 + STF c11, [B1 + 6 * SIZE] + STF c12, [B1 + 7 * SIZE] + add B1, M4, B1 + +#ifdef DOUBLE + prefetch [B1 + (PREFETCHSIZE + 8) * SIZE], 2 +#endif + STF c05, [B1 + 0 * SIZE] + STF c06, [B1 + 1 * SIZE] + STF c07, [B1 + 2 * SIZE] + STF c08, [B1 + 3 * SIZE] + STF c13, [B1 + 4 * SIZE] + STF c14, [B1 + 5 * SIZE] + STF c15, [B1 + 6 * SIZE] + STF c16, [B1 + 7 * SIZE] + bg,pt %icc, .LL12 + add B1, M4, B1 + +.LL15: + and N, 2, I + cmp I, 0 + ble,pn %icc, .LL17 + nop + + LDF [A1 + 0 * SIZE], c01 + LDF [A1 + 1 * SIZE], c02 + LDF [A1 + 2 * SIZE], c03 + LDF [A1 + 3 * SIZE], c04 + + LDF [A2 + 0 * SIZE], c05 + LDF [A2 + 1 * SIZE], c06 + LDF [A2 + 2 * SIZE], c07 + LDF [A2 + 3 * SIZE], c08 + + STF c01, [B1 + 0 * SIZE] + add A1, 4 * SIZE, A1 + STF c02, [B1 + 1 * SIZE] + add A2, 4 * SIZE, A2 + STF c03, [B1 + 2 * SIZE] + STF c04, [B1 + 3 * SIZE] + STF c05, [B1 + 4 * SIZE] + STF c06, [B1 + 5 * SIZE] + STF c07, [B1 + 6 * SIZE] + STF c08, [B1 + 7 * SIZE] + add B1, M4, B1 + +.LL17: + and N, 1, I + cmp I, 0 + ble,pn %icc, .LL99 + nop + + LDF [A1 + 0 * SIZE], c01 + LDF [A1 + 1 * SIZE], c02 + LDF [A2 + 0 * SIZE], c03 + LDF [A2 + 1 * SIZE], c04 + + STF c01, [B2 + 0 * SIZE] + STF c02, [B2 + 1 * SIZE] + STF c03, [B2 + 2 * SIZE] + STF c04, [B2 + 3 * SIZE] + add B2, 4 * SIZE, B2 + +.LL99: + add J, -1, J + cmp J, 0 + bg,pt %icc, .LL11 + nop + +.LL100: + and M, 1, J + cmp J, 0 + ble,pn %icc, .LL999 + nop + +.LL111: + sra N, 2, I + cmp I, 0 + mov A, A1 + + ble,pn 
%icc, .LL115 + mov B, B1 + +.LL112: + LDF [A1 + 0 * SIZE], c01 + LDF [A1 + 1 * SIZE], c02 + LDF [A1 + 2 * SIZE], c03 + LDF [A1 + 3 * SIZE], c04 + + LDF [A1 + 4 * SIZE], c05 + LDF [A1 + 5 * SIZE], c06 + LDF [A1 + 6 * SIZE], c07 + LDF [A1 + 7 * SIZE], c08 + + STF c01, [B1 + 0 * SIZE] + add A1, 8 * SIZE, A1 + STF c02, [B1 + 1 * SIZE] + add I, -1, I + STF c03, [B1 + 2 * SIZE] + cmp I, 0 + STF c04, [B1 + 3 * SIZE] + add B1, M4, B1 + + STF c05, [B1 + 0 * SIZE] + STF c06, [B1 + 1 * SIZE] + STF c07, [B1 + 2 * SIZE] + STF c08, [B1 + 3 * SIZE] + + bg,pt %icc, .LL112 + add B1, M4, B1 + +.LL115: + and N, 2, I + cmp I, 0 + ble,pn %icc, .LL117 + nop + + LDF [A1 + 0 * SIZE], c01 + LDF [A1 + 1 * SIZE], c02 + LDF [A1 + 2 * SIZE], c03 + LDF [A1 + 3 * SIZE], c04 + + STF c01, [B1 + 0 * SIZE] + add A1, 4 * SIZE, A1 + STF c02, [B1 + 1 * SIZE] + add I, -1, I + STF c03, [B1 + 2 * SIZE] + cmp I, 0 + STF c04, [B1 + 3 * SIZE] + add B1, M4, B1 + +.LL117: + and N, 1, I + cmp I, 0 + ble,pn %icc, .LL999 + nop + + LDF [A1 + 0 * SIZE], c01 + LDF [A1 + 1 * SIZE], c02 + + STF c01, [B2 + 0 * SIZE] + STF c02, [B2 + 1 * SIZE] + +.LL999: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/zgemv_n.S b/kernel/sparc/zgemv_n.S new file mode 100644 index 0000000000..46ff438882 --- /dev/null +++ b/kernel/sparc/zgemv_n.S @@ -0,0 +1,1176 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef DOUBLE +#define PREFETCHSIZE 44 +#else +#define PREFETCHSIZE 88 +#endif + +#define M %i0 +#define N %i1 +#define A %i5 +#define LDA %i2 +#define X %i3 +#define INCX %i4 + +#define Y %l0 +#define INCY %l1 +#define BUFFER %l2 + +#define I %l3 +#define J %l5 + +#define A1 %o0 +#define A2 %o1 +#define A3 %o2 +#define A4 %o3 + +#define Y1 %l4 +#define YY %l6 + +#ifdef DOUBLE +#define t1 %f0 +#define t2 %f2 +#define t3 %f4 +#define t4 %f6 + +#define y1 %f8 +#define y2 %f10 +#define y3 %f12 +#define y4 %f14 +#define y5 %f16 +#define y6 %f18 +#define y7 %f20 +#define y8 %f22 + +#define a1 %f24 +#define a2 %f26 +#define a3 %f28 +#define a4 %f30 +#define a5 %f32 +#define a6 %f34 +#define a7 %f36 +#define a8 %f38 + +#define a9 %f40 +#define a10 %f42 +#define a11 %f44 +#define a12 %f46 +#define a13 %f48 +#define a14 %f50 +#define a15 %f52 +#define a16 %f54 + +#define x1 %f56 +#define x2 %f58 +#define x3 %f60 +#define x4 %f62 + +#define FZERO %f50 +#define ALPHA_R %f52 +#define ALPHA_I %f54 +#else +#define t1 %f0 +#define t2 %f1 +#define t3 %f2 +#define t4 %f3 + +#define y1 %f4 +#define y2 %f5 +#define y3 %f6 +#define y4 %f7 +#define y5 %f8 +#define y6 %f9 +#define y7 %f10 +#define y8 %f11 + +#define a1 %f12 +#define a2 %f13 +#define a3 %f14 +#define a4 %f15 +#define a5 %f16 +#define a6 %f17 +#define a7 %f18 +#define a8 %f19 + +#define a9 %f20 +#define a10 %f21 +#define a11 %f22 +#define a12 %f23 +#define a13 %f24 +#define a14 %f25 +#define a15 %f26 +#define a16 %f27 + +#define x1 %f28 +#define x2 %f29 +#define x3 %f30 +#define x4 %f31 + +#define FZERO %f25 +#define ALPHA_R %f26 +#define ALPHA_I %f27 +#endif + +#ifndef __64BIT__ +#define STACK_ALPHA_R [%sp + STACK_START + 16] +#ifndef DOUBLE +#define STACK_ALPHA_I [%sp + STACK_START + 20] +#else +#define STACK_ALPHA_I [%sp + STACK_START + 24] +#endif +#else +#define STACK_ALPHA_R [%sp + STACK_START + 32] +#define STACK_ALPHA_I [%sp + STACK_START + 40] +#endif + +#ifndef CONJ +#define FSUBX FSUB +#define FADDX FADD +#else +#define FSUBX FADD +#define FADDX FSUB +#endif + + PROLOGUE + SAVESP + +#ifndef __64BIT__ +#ifdef DOUBLE + st %i3, [%sp + STACK_START + 16] /* ALPHA_R */ + st %i4, [%sp + STACK_START + 20] + st %i5, [%sp + STACK_START + 24] /* ALPHA_I */ + + ld [%sp + STACK_START + 32], A + ld [%sp + STACK_START + 36], LDA + ld [%sp + STACK_START + 40], X + ld [%sp + STACK_START + 44], INCX + ld [%sp + STACK_START + 48], Y + ld [%sp + STACK_START + 52], INCY + ld [%sp + STACK_START + 56], BUFFER +#else + st %i3, [%sp + STACK_START + 16] /* ALPHA_R */ + st %i4, [%sp + STACK_START + 20] /* ALPHA_I */ + + ld [%sp + STACK_START + 28], LDA + ld [%sp + STACK_START + 32], X + ld [%sp + STACK_START + 36], INCX + ld [%sp + STACK_START + 40], Y + ld [%sp + STACK_START + 44], INCY + ld [%sp + STACK_START + 48], BUFFER +#endif +#else + ldx [%sp + STACK_START + 56], LDA + ldx [%sp + STACK_START + 64], X + ldx [%sp + STACK_START + 72], INCX + ldx [%sp + STACK_START + 80], Y + ldx [%sp + STACK_START + 88], INCY + ldx [%sp + STACK_START + 96], BUFFER + +#ifdef DOUBLE + std %f6, STACK_ALPHA_R + std %f8, STACK_ALPHA_I +#else + st %f7, STACK_ALPHA_R + st %f9, STACK_ALPHA_I +#endif +#endif + + sll LDA, ZBASE_SHIFT, LDA + + cmp M, 0 + ble %icc, .LL999 + sll INCX, ZBASE_SHIFT, INCX + + cmp N, 0 + ble %icc, .LL999 + sll INCY, ZBASE_SHIFT, INCY + + cmp INCY, 2 * SIZE + be %icc, .LL20 + mov Y, YY + +#ifdef DOUBLE + FCLR(19) +#else + FCLR(25) +#endif 
+ + add M, 3, J + sra J, 2, J + mov BUFFER, YY + mov BUFFER, Y1 + +.LL01: + STF FZERO, [Y1 + 0 * SIZE] + nop + STF FZERO, [Y1 + 1 * SIZE] + STF FZERO, [Y1 + 2 * SIZE] + STF FZERO, [Y1 + 3 * SIZE] + STF FZERO, [Y1 + 4 * SIZE] + nop + STF FZERO, [Y1 + 5 * SIZE] + deccc J + STF FZERO, [Y1 + 6 * SIZE] + nop + STF FZERO, [Y1 + 7 * SIZE] + bg,pn %icc, .LL01 + add Y1, 8 * SIZE, Y1 + +.LL20: + sra N, 1, J + cmp J, 0 + ble,pn %icc, .LL30 + nop + +.LL21: + mov YY, Y1 + mov A, A1 + LDF STACK_ALPHA_R, ALPHA_R + LDF STACK_ALPHA_I, ALPHA_I + + add A, LDA, A2 + add A2, LDA, A + + LDF [X + 0 * SIZE], x1 + LDF [X + 1 * SIZE], x2 + add X, INCX, X + LDF [X + 0 * SIZE], x3 + LDF [X + 1 * SIZE], x4 + add X, INCX, X + + FMUL ALPHA_R, x1, a1 + FMUL ALPHA_I, x2, a4 + FMUL ALPHA_I, x1, a2 + FMUL ALPHA_R, x2, a3 + + FMUL ALPHA_R, x3, a5 + FMUL ALPHA_I, x4, a8 + FMUL ALPHA_I, x3, a6 + FMUL ALPHA_R, x4, a7 + +#ifndef XCONJ + FSUB a1, a4, x1 + FADD a2, a3, x2 + FSUB a5, a8, x3 + FADD a6, a7, x4 +#else + FADD a1, a4, x1 + FSUB a2, a3, x2 + FADD a5, a8, x3 + FSUB a6, a7, x4 +#endif + + sra M, 2, I + cmp I, 0 + ble,pn %icc, .LL27 + nop + + LDF [A1 + 0 * SIZE], a1 + LDF [A1 + 1 * SIZE], a2 + LDF [A1 + 2 * SIZE], a3 + LDF [A1 + 3 * SIZE], a4 + + LDF [A1 + 4 * SIZE], a9 + LDF [A1 + 5 * SIZE], a10 + LDF [A1 + 6 * SIZE], a11 + LDF [A1 + 7 * SIZE], a12 + + LDF [A2 + 0 * SIZE], a5 + LDF [A2 + 1 * SIZE], a6 + LDF [A2 + 2 * SIZE], a7 + LDF [A2 + 3 * SIZE], a8 + + LDF [A2 + 4 * SIZE], a13 + LDF [A2 + 5 * SIZE], a14 + LDF [A2 + 6 * SIZE], a15 + LDF [A2 + 7 * SIZE], a16 + + LDF [Y1 + 0 * SIZE], y1 + LDF [Y1 + 1 * SIZE], y2 + LDF [Y1 + 2 * SIZE], y3 + + + FMUL a1, x1, t1 + deccc I + FMUL a1, x2, t2 + LDF [A1 + 8 * SIZE], a1 + + FMUL a3, x1, t3 + FMUL a3, x2, t4 + ble,pn %icc, .LL26 + LDF [A1 + 10 * SIZE], a3 + + FADD y1, t1, y1 + LDF [Y1 + 3 * SIZE], y4 + FMUL a2, x2, t1 + + FADD y2, t2, y2 + FMUL a2, x1, t2 + LDF [A1 + 9 * SIZE], a2 + + FADD y3, t3, y3 + LDF [Y1 + 4 * SIZE], y5 + FMUL a4, x2, t3 + + FADD y4, t4, y4 + FMUL a4, x1, t4 + LDF [A1 + 11 * SIZE], a4 + + FSUBX y1, t1, y1 + LDF [Y1 + 5 * SIZE], y6 + FMUL a5, x3, t1 + + FADDX y2, t2, y2 + FMUL a5, x4, t2 + LDF [A2 + 8 * SIZE], a5 + + FSUBX y3, t3, y3 + LDF [Y1 + 6 * SIZE], y7 + FMUL a7, x3, t3 + + FADDX y4, t4, y4 + FMUL a7, x4, t4 + LDF [A2 + 10 * SIZE], a7 + + FADD y1, t1, y1 + LDF [Y1 + 7 * SIZE], y8 + FMUL a6, x4, t1 + + FADD y2, t2, y2 + FMUL a6, x3, t2 + LDF [A2 + 9 * SIZE], a6 + + FADD y3, t3, y3 + FMUL a8, x4, t3 + + FADD y4, t4, y4 + FMUL a8, x3, t4 + LDF [A2 + 11 * SIZE], a8 + + FSUBX y1, t1, y1 + FMUL a9, x1, t1 + + FADDX y2, t2, y2 + FMUL a9, x2, t2 + LDF [A1 + 12 * SIZE], a9 + + FSUBX y3, t3, y3 + deccc I + FMUL a11, x1, t3 + + FADDX y4, t4, y4 + FMUL a11, x2, t4 + ble,pn %icc, .LL23 + LDF [A1 + 14 * SIZE], a11 + +.LL22: + FADD y5, t1, y5 + prefetch [A1 + PREFETCHSIZE * SIZE], 1 + FMUL a10, x2, t1 + LDF [Y1 + 7 * SIZE], y8 + + FADD y6, t2, y6 + FMUL a10, x1, t2 + LDF [A1 + 13 * SIZE], a10 + + FADD y7, t3, y7 + FMUL a12, x2, t3 + STF y1, [Y1 + 0 * SIZE] + + FADD y8, t4, y8 + FMUL a12, x1, t4 + LDF [A1 + 15 * SIZE], a12 + + FSUBX y5, t1, y5 + FMUL a13, x3, t1 + STF y2, [Y1 + 1 * SIZE] + + FADDX y6, t2, y6 + FMUL a13, x4, t2 + LDF [A2 + 12 * SIZE], a13 + + FSUBX y7, t3, y7 + FMUL a15, x3, t3 + STF y3, [Y1 + 2 * SIZE] + + FADDX y8, t4, y8 + FMUL a15, x4, t4 + LDF [A2 + 14 * SIZE], a15 + + FADD y5, t1, y5 + FMUL a14, x4, t1 + STF y4, [Y1 + 3 * SIZE] + + FADD y6, t2, y6 + FMUL a14, x3, t2 + LDF [A2 + 13 * SIZE], a14 + + FADD y7, t3, y7 + FMUL a16, x4, t3 + LDF [Y1 + 8 
* SIZE], y1 + + FADD y8, t4, y8 + FMUL a16, x3, t4 + LDF [A2 + 15 * SIZE], a16 + + FSUBX y5, t1, y5 + FMUL a1, x1, t1 + LDF [Y1 + 9 * SIZE], y2 + + FADDX y6, t2, y6 + FMUL a1, x2, t2 + LDF [A1 + 16 * SIZE], a1 + + FSUBX y7, t3, y7 + FMUL a3, x1, t3 + LDF [Y1 + 10 * SIZE], y3 + + FADDX y8, t4, y8 + FMUL a3, x2, t4 + LDF [A1 + 18 * SIZE], a3 + + FADD y1, t1, y1 + prefetch [A2 + PREFETCHSIZE * SIZE], 1 + FMUL a2, x2, t1 + LDF [Y1 + 11 * SIZE], y4 + + FADD y2, t2, y2 + FMUL a2, x1, t2 + LDF [A1 + 17 * SIZE], a2 + + FADD y3, t3, y3 + FMUL a4, x2, t3 + STF y5, [Y1 + 4 * SIZE] + + FADD y4, t4, y4 + FMUL a4, x1, t4 + LDF [A1 + 19 * SIZE], a4 + + FSUBX y1, t1, y1 + FMUL a5, x3, t1 + STF y6, [Y1 + 5 * SIZE] + + FADDX y2, t2, y2 + FMUL a5, x4, t2 + LDF [A2 + 16 * SIZE], a5 + + FSUBX y3, t3, y3 + FMUL a7, x3, t3 + STF y7, [Y1 + 6 * SIZE] + + FADDX y4, t4, y4 + deccc I + FMUL a7, x4, t4 + LDF [A2 + 18 * SIZE], a7 + + FADD y1, t1, y1 + FMUL a6, x4, t1 + STF y8, [Y1 + 7 * SIZE] + + FADD y2, t2, y2 + FMUL a6, x3, t2 + LDF [A2 + 17 * SIZE], a6 + + FADD y3, t3, y3 + add A1, 8 * SIZE, A1 + FMUL a8, x4, t3 + LDF [Y1 + 12 * SIZE], y5 + + FADD y4, t4, y4 + FMUL a8, x3, t4 + LDF [A2 + 19 * SIZE], a8 + + FSUBX y1, t1, y1 + add A2, 8 * SIZE, A2 + FMUL a9, x1, t1 + LDF [Y1 + 13 * SIZE], y6 + + FADDX y2, t2, y2 + add Y1, 8 * SIZE, Y1 + FMUL a9, x2, t2 + LDF [A1 + 12 * SIZE], a9 + + FSUBX y3, t3, y3 + FMUL a11, x1, t3 + LDF [Y1 + 6 * SIZE], y7 + + FADDX y4, t4, y4 + FMUL a11, x2, t4 + bg,pn %icc, .LL22 + LDF [A1 + 14 * SIZE], a11 + +.LL23: + FADD y5, t1, y5 + FMUL a10, x2, t1 + LDF [Y1 + 7 * SIZE], y8 + + FADD y6, t2, y6 + FMUL a10, x1, t2 + LDF [A1 + 13 * SIZE], a10 + + FADD y7, t3, y7 + FMUL a12, x2, t3 + STF y1, [Y1 + 0 * SIZE] + + FADD y8, t4, y8 + FMUL a12, x1, t4 + LDF [A1 + 15 * SIZE], a12 + + FSUBX y5, t1, y5 + FMUL a13, x3, t1 + STF y2, [Y1 + 1 * SIZE] + + FADDX y6, t2, y6 + FMUL a13, x4, t2 + LDF [A2 + 12 * SIZE], a13 + + FSUBX y7, t3, y7 + FMUL a15, x3, t3 + STF y3, [Y1 + 2 * SIZE] + FADDX y8, t4, y8 + FMUL a15, x4, t4 + LDF [A2 + 14 * SIZE], a15 + + FADD y5, t1, y5 + FMUL a14, x4, t1 + STF y4, [Y1 + 3 * SIZE] + FADD y6, t2, y6 + FMUL a14, x3, t2 + LDF [A2 + 13 * SIZE], a14 + + FADD y7, t3, y7 + FMUL a16, x4, t3 + LDF [Y1 + 8 * SIZE], y1 + FADD y8, t4, y8 + FMUL a16, x3, t4 + LDF [A2 + 15 * SIZE], a16 + + FSUBX y5, t1, y5 + add A1, 8 * SIZE, A1 + FMUL a1, x1, t1 + LDF [Y1 + 9 * SIZE], y2 + + FADDX y6, t2, y6 + add A2, 8 * SIZE, A2 + FMUL a1, x2, t2 + LDF [A1 + 8 * SIZE], a1 + + FSUBX y7, t3, y7 + FMUL a3, x1, t3 + LDF [Y1 + 10 * SIZE], y3 + + FADDX y8, t4, y8 + add Y1, 8 * SIZE, Y1 + FMUL a3, x2, t4 + LDF [A1 + 10 * SIZE], a3 + + STF y5, [Y1 - 4 * SIZE] + STF y6, [Y1 - 3 * SIZE] + STF y7, [Y1 - 2 * SIZE] + STF y8, [Y1 - 1 * SIZE] + +.LL26: + FADD y1, t1, y1 + LDF [Y1 + 3 * SIZE], y4 + FMUL a2, x2, t1 + FADD y2, t2, y2 + FMUL a2, x1, t2 + + FADD y3, t3, y3 + LDF [Y1 + 4 * SIZE], y5 + FMUL a4, x2, t3 + FADD y4, t4, y4 + FMUL a4, x1, t4 + + FSUBX y1, t1, y1 + LDF [Y1 + 5 * SIZE], y6 + FMUL a5, x3, t1 + FADDX y2, t2, y2 + FMUL a5, x4, t2 + + FSUBX y3, t3, y3 + LDF [Y1 + 6 * SIZE], y7 + FADDX y4, t4, y4 + FMUL a7, x4, t4 + + FADD y1, t1, y1 + LDF [Y1 + 7 * SIZE], y8 + FMUL a7, x3, t3 + FMUL a6, x4, t1 + FADD y2, t2, y2 + FMUL a6, x3, t2 + + FADD y3, t3, y3 + FMUL a8, x4, t3 + FADD y4, t4, y4 + FMUL a8, x3, t4 + + FSUBX y1, t1, y1 + FMUL a9, x1, t1 + FADDX y2, t2, y2 + FMUL a9, x2, t2 + + FSUBX y3, t3, y3 + FMUL a11, x1, t3 + FADDX y4, t4, y4 + FMUL a11, x2, t4 + + FADD y5, t1, y5 + FMUL a10, x2, t1 + FADD y6, 
t2, y6 + FMUL a10, x1, t2 + + FADD y7, t3, y7 + FMUL a12, x2, t3 + FADD y8, t4, y8 + FMUL a12, x1, t4 + + FSUBX y5, t1, y5 + FMUL a13, x3, t1 + FADDX y6, t2, y6 + FMUL a13, x4, t2 + + FSUBX y7, t3, y7 + FMUL a15, x3, t3 + FADDX y8, t4, y8 + FMUL a15, x4, t4 + + FADD y5, t1, y5 + FMUL a14, x4, t1 + FADD y6, t2, y6 + FMUL a14, x3, t2 + + FADD y7, t3, y7 + FMUL a16, x4, t3 + FADD y8, t4, y8 + FMUL a16, x3, t4 + + STF y1, [Y1 + 0 * SIZE] + FSUBX y5, t1, y5 + STF y2, [Y1 + 1 * SIZE] + FADDX y6, t2, y6 + STF y3, [Y1 + 2 * SIZE] + FSUBX y7, t3, y7 + STF y4, [Y1 + 3 * SIZE] + FADDX y8, t4, y8 + + STF y5, [Y1 + 4 * SIZE] + add A1, 8 * SIZE, A1 + STF y6, [Y1 + 5 * SIZE] + add A2, 8 * SIZE, A2 + STF y7, [Y1 + 6 * SIZE] + STF y8, [Y1 + 7 * SIZE] + add Y1, 8 * SIZE, Y1 + +.LL27: + andcc M, 2, I + ble,pn %icc, .LL28 + nop + + LDF [A1 + 0 * SIZE], a1 + LDF [A1 + 1 * SIZE], a2 + LDF [A1 + 2 * SIZE], a3 + LDF [A1 + 3 * SIZE], a4 + + LDF [Y1 + 0 * SIZE], y1 + LDF [Y1 + 1 * SIZE], y2 + LDF [Y1 + 2 * SIZE], y3 + LDF [Y1 + 3 * SIZE], y4 + + FMUL a1, x1, t1 + LDF [A2 + 0 * SIZE], a5 + FMUL a1, x2, t2 + LDF [A2 + 1 * SIZE], a6 + FMUL a3, x1, t3 + LDF [A2 + 2 * SIZE], a7 + FMUL a3, x2, t4 + LDF [A2 + 3 * SIZE], a8 + + FADD y1, t1, y1 + FMUL a2, x2, t1 + FADD y2, t2, y2 + FMUL a2, x1, t2 + + FADD y3, t3, y3 + FMUL a4, x2, t3 + FADD y4, t4, y4 + FMUL a4, x1, t4 + + FSUBX y1, t1, y1 + FMUL a5, x3, t1 + FADDX y2, t2, y2 + FMUL a5, x4, t2 + + FSUBX y3, t3, y3 + FMUL a7, x3, t3 + FADDX y4, t4, y4 + FMUL a7, x4, t4 + + FADD y1, t1, y1 + FMUL a6, x4, t1 + FADD y2, t2, y2 + FMUL a6, x3, t2 + + FADD y3, t3, y3 + FMUL a8, x4, t3 + FADD y4, t4, y4 + FMUL a8, x3, t4 + + FSUBX y1, t1, y1 + FADDX y2, t2, y2 + FSUBX y3, t3, y3 + FADDX y4, t4, y4 + + STF y1, [Y1 + 0 * SIZE] + add A1, 4 * SIZE, A1 + STF y2, [Y1 + 1 * SIZE] + add A2, 4 * SIZE, A2 + STF y3, [Y1 + 2 * SIZE] + nop + STF y4, [Y1 + 3 * SIZE] + add Y1, 4 * SIZE, Y1 + +.LL28: + andcc M, 1, I + ble,pn %icc, .LL29 + nop + + LDF [A1 + 0 * SIZE], a1 + LDF [A1 + 1 * SIZE], a2 + LDF [A2 + 0 * SIZE], a3 + LDF [A2 + 1 * SIZE], a4 + + LDF [Y1 + 0 * SIZE], y1 + LDF [Y1 + 1 * SIZE], y2 + + FMUL a1, x1, t1 + FMUL a1, x2, t2 + FMUL a2, x2, t3 + FMUL a2, x1, t4 + + FADD y1, t1, y1 + FMUL a3, x3, t1 + FADD y2, t2, y2 + FMUL a3, x4, t2 + + FSUBX y1, t3, y1 + FMUL a4, x4, t3 + FADDX y2, t4, y2 + FMUL a4, x3, t4 + + FADD y1, t1, y1 + FADD y2, t2, y2 + FSUBX y1, t3, y1 + FADDX y2, t4, y2 + + STF y1, [Y1 + 0 * SIZE] + STF y2, [Y1 + 1 * SIZE] + +.LL29: + deccc J + bg %icc, .LL21 + nop + + +.LL30: + andcc N, 1, J + ble,pn %icc, .LL990 + nop + +.LL31: + mov YY, Y1 + mov A, A1 + + LDF STACK_ALPHA_R, ALPHA_R + LDF STACK_ALPHA_I, ALPHA_I + + LDF [X + 0 * SIZE], x1 + LDF [X + 1 * SIZE], x2 + + FMUL ALPHA_R, x1, a1 /* AC */ + FMUL ALPHA_I, x1, a2 /* AD */ + FMUL ALPHA_R, x2, a3 /* BC */ + FMUL ALPHA_I, x2, a4 /* BD */ + +#ifndef XCONJ + FSUB a1, a4, x1 + FADD a2, a3, x2 +#else + FADD a1, a4, x1 + FSUB a2, a3, x2 +#endif + + sra M, 2, I + cmp I, 0 + ble,pn %icc, .LL37 + nop + + LDF [A1 + 0 * SIZE], a1 + LDF [A1 + 1 * SIZE], a2 + LDF [A1 + 2 * SIZE], a3 + LDF [A1 + 3 * SIZE], a4 + + LDF [A1 + 4 * SIZE], a9 + LDF [A1 + 5 * SIZE], a10 + LDF [A1 + 6 * SIZE], a11 + LDF [A1 + 7 * SIZE], a12 + + LDF [Y1 + 0 * SIZE], y1 + LDF [Y1 + 1 * SIZE], y2 + LDF [Y1 + 2 * SIZE], y3 + LDF [Y1 + 3 * SIZE], y4 + + LDF [Y1 + 4 * SIZE], y5 + LDF [Y1 + 5 * SIZE], y6 + LDF [Y1 + 6 * SIZE], y7 + LDF [Y1 + 7 * SIZE], y8 + + FMUL a1, x1, t1 + deccc I + FMUL a1, x2, t2 + LDF [A1 + 8 * SIZE], a1 + FMUL a3, x1, t3 + FMUL a3, x2, t4 
+ ble,pn %icc, .LL33 + LDF [A1 + 10 * SIZE], a3 + +.LL32: + FADD y1, t1, y1 + prefetch [A1 + PREFETCHSIZE * SIZE], 1 + FMUL a2, x2, t1 + FADD y2, t2, y2 + FMUL a2, x1, t2 + LDF [A1 + 9 * SIZE], a2 + + FADD y3, t3, y3 + FMUL a4, x2, t3 + FADD y4, t4, y4 + FMUL a4, x1, t4 + LDF [A1 + 11 * SIZE], a4 + + FSUBX y1, t1, y1 + FMUL a9, x1, t1 + FADDX y2, t2, y2 + FMUL a9, x2, t2 + LDF [A1 + 12 * SIZE], a9 + + FSUBX y3, t3, y3 + FMUL a11, x1, t3 + FADDX y4, t4, y4 + FMUL a11, x2, t4 + LDF [A1 + 14 * SIZE], a11 + + STF y1, [Y1 + 0 * SIZE] + STF y2, [Y1 + 1 * SIZE] + STF y3, [Y1 + 2 * SIZE] + STF y4, [Y1 + 3 * SIZE] + + FADD y5, t1, y5 + FMUL a10, x2, t1 + LDF [Y1 + 8 * SIZE], y1 + FADD y6, t2, y6 + FMUL a10, x1, t2 + LDF [A1 + 13 * SIZE], a10 + + FADD y7, t3, y7 + deccc I + FMUL a12, x2, t3 + LDF [Y1 + 9 * SIZE], y2 + FADD y8, t4, y8 + FMUL a12, x1, t4 + LDF [A1 + 15 * SIZE], a12 + + FSUBX y5, t1, y5 + add A1, 8 * SIZE, A1 + FMUL a1, x1, t1 + LDF [Y1 + 10 * SIZE], y3 + FADDX y6, t2, y6 + FMUL a1, x2, t2 + LDF [A1 + 8 * SIZE], a1 + + FSUBX y7, t3, y7 + FMUL a3, x1, t3 + LDF [Y1 + 11 * SIZE], y4 + FADDX y8, t4, y8 + FMUL a3, x2, t4 + LDF [A1 + 10 * SIZE], a3 + + STF y5, [Y1 + 4 * SIZE] + STF y6, [Y1 + 5 * SIZE] + STF y7, [Y1 + 6 * SIZE] + STF y8, [Y1 + 7 * SIZE] + + LDF [Y1 + 12 * SIZE], y5 + LDF [Y1 + 13 * SIZE], y6 + LDF [Y1 + 14 * SIZE], y7 + add Y1, 8 * SIZE, Y1 + bg,pn %icc, .LL32 + LDF [Y1 + 7 * SIZE], y8 + +.LL33: + FADD y1, t1, y1 + FMUL a2, x2, t1 + FADD y2, t2, y2 + FMUL a2, x1, t2 + + FADD y3, t3, y3 + FMUL a4, x2, t3 + FADD y4, t4, y4 + FMUL a4, x1, t4 + + FSUBX y1, t1, y1 + FMUL a9, x1, t1 + FADDX y2, t2, y2 + FMUL a9, x2, t2 + + FSUBX y3, t3, y3 + FMUL a11, x1, t3 + FADDX y4, t4, y4 + FMUL a11, x2, t4 + + FADD y5, t1, y5 + FMUL a10, x2, t1 + FADD y6, t2, y6 + FMUL a10, x1, t2 + + FADD y7, t3, y7 + FMUL a12, x2, t3 + FADD y8, t4, y8 + FMUL a12, x1, t4 + + FSUBX y5, t1, y5 + FADDX y6, t2, y6 + FSUBX y7, t3, y7 + FADDX y8, t4, y8 + + STF y1, [Y1 + 0 * SIZE] + STF y2, [Y1 + 1 * SIZE] + STF y3, [Y1 + 2 * SIZE] + STF y4, [Y1 + 3 * SIZE] + + STF y5, [Y1 + 4 * SIZE] + STF y6, [Y1 + 5 * SIZE] + STF y7, [Y1 + 6 * SIZE] + STF y8, [Y1 + 7 * SIZE] + + add A1, 8 * SIZE, A1 + add Y1, 8 * SIZE, Y1 + + +.LL37: + andcc M, 2, I + ble,pn %icc, .LL38 + nop + + LDF [A1 + 0 * SIZE], a1 + LDF [A1 + 1 * SIZE], a2 + LDF [A1 + 2 * SIZE], a3 + LDF [A1 + 3 * SIZE], a4 + + LDF [Y1 + 0 * SIZE], y1 + FMUL a1, x1, t1 + LDF [Y1 + 1 * SIZE], y2 + FMUL a1, x2, t2 + LDF [Y1 + 2 * SIZE], y3 + FMUL a3, x1, t3 + LDF [Y1 + 3 * SIZE], y4 + FMUL a3, x2, t4 + + FADD y1, t1, y1 + FMUL a2, x2, t1 + FADD y2, t2, y2 + FMUL a2, x1, t2 + FADD y3, t3, y3 + FMUL a4, x2, t3 + FADD y4, t4, y4 + FMUL a4, x1, t4 + + FSUBX y1, t1, y1 + FADDX y2, t2, y2 + FSUBX y3, t3, y3 + FADDX y4, t4, y4 + + STF y1, [Y1 + 0 * SIZE] + STF y2, [Y1 + 1 * SIZE] + STF y3, [Y1 + 2 * SIZE] + STF y4, [Y1 + 3 * SIZE] + + add A1, 4 * SIZE, A1 + add Y1, 4 * SIZE, Y1 + +.LL38: + andcc M, 1, I + ble,pn %icc, .LL990 + nop + + LDF [A1 + 0 * SIZE], a1 + LDF [A1 + 1 * SIZE], a2 + LDF [Y1 + 0 * SIZE], y1 + LDF [Y1 + 1 * SIZE], y2 + + FMUL a1, x1, t1 + FMUL a1, x2, t2 + FMUL a2, x2, t3 + FMUL a2, x1, t4 + + FADD y1, t1, y1 + FADD y2, t2, y2 + FSUBX y1, t3, y1 + FADDX y2, t4, y2 + + STF y1, [Y1 + 0 * SIZE] + STF y2, [Y1 + 1 * SIZE] + +.LL990: + cmp INCY, 2 * SIZE + be %icc, .LL999 + mov Y, Y1 + + sra M, 2, I + cmp I, 0 + ble,pn %icc, .LL995 + nop + +.LL991: + LDF [BUFFER + 0 * SIZE], a1 + LDF [BUFFER + 1 * SIZE], a2 + LDF [Y + 0 * SIZE], y1 + LDF [Y + 1 * SIZE], y2 + add Y, 
INCY, Y + + LDF [BUFFER + 2 * SIZE], a3 + LDF [BUFFER + 3 * SIZE], a4 + LDF [Y + 0 * SIZE], y3 + LDF [Y + 1 * SIZE], y4 + add Y, INCY, Y + + LDF [BUFFER + 4 * SIZE], a5 + LDF [BUFFER + 5 * SIZE], a6 + LDF [Y + 0 * SIZE], y5 + LDF [Y + 1 * SIZE], y6 + add Y, INCY, Y + + LDF [BUFFER + 6 * SIZE], a7 + LDF [BUFFER + 7 * SIZE], a8 + LDF [Y + 0 * SIZE], y7 + LDF [Y + 1 * SIZE], y8 + add Y, INCY, Y + + FADD y1, a1, y1 + FADD y2, a2, y2 + FADD y3, a3, y3 + FADD y4, a4, y4 + FADD y5, a5, y5 + FADD y6, a6, y6 + FADD y7, a7, y7 + FADD y8, a8, y8 + + STF y1, [Y1 + 0 * SIZE] + STF y2, [Y1 + 1 * SIZE] + add Y1, INCY, Y1 + STF y3, [Y1 + 0 * SIZE] + STF y4, [Y1 + 1 * SIZE] + add Y1, INCY, Y1 + STF y5, [Y1 + 0 * SIZE] + STF y6, [Y1 + 1 * SIZE] + add Y1, INCY, Y1 + STF y7, [Y1 + 0 * SIZE] + STF y8, [Y1 + 1 * SIZE] + add Y1, INCY, Y1 + + deccc I + bg,pn %icc, .LL991 + add BUFFER, 8 * SIZE, BUFFER + +.LL995: + andcc M, 2, I + ble,pn %icc, .LL996 + nop + + LDF [BUFFER + 0 * SIZE], a1 + LDF [BUFFER + 1 * SIZE], a2 + LDF [Y + 0 * SIZE], y1 + LDF [Y + 1 * SIZE], y2 + add Y, INCY, Y + + LDF [BUFFER + 2 * SIZE], a3 + LDF [BUFFER + 3 * SIZE], a4 + LDF [Y + 0 * SIZE], y3 + LDF [Y + 1 * SIZE], y4 + add Y, INCY, Y + + FADD y1, a1, y1 + FADD y2, a2, y2 + FADD y3, a3, y3 + FADD y4, a4, y4 + + STF y1, [Y1 + 0 * SIZE] + STF y2, [Y1 + 1 * SIZE] + add Y1, INCY, Y1 + STF y3, [Y1 + 0 * SIZE] + STF y4, [Y1 + 1 * SIZE] + add Y1, INCY, Y1 + + add BUFFER, 4 * SIZE, BUFFER + +.LL996: + andcc M, 1, I + ble,pn %icc, .LL999 + nop + + LDF [BUFFER + 0 * SIZE], a1 + LDF [BUFFER + 1 * SIZE], a2 + LDF [Y + 0 * SIZE], y1 + LDF [Y + 1 * SIZE], y2 + + FADD y1, a1, y1 + FADD y2, a2, y2 + + STF y1, [Y1 + 0 * SIZE] + STF y2, [Y1 + 1 * SIZE] + +.LL999: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/zgemv_t.S b/kernel/sparc/zgemv_t.S new file mode 100644 index 0000000000..2b4a64cad7 --- /dev/null +++ b/kernel/sparc/zgemv_t.S @@ -0,0 +1,1737 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define P 4000 + +#define M %i0 +#define N %i1 +#define A %i5 +#define LDA %i2 +#define X %i3 +#define INCX %i4 + +#define Y %l0 +#define INCY %l1 +#define BUFFER %l2 + +#define I %l3 +#define IS %l4 +#define J %l5 +#define MIN_M %l6 +#define XP %l7 + +#define A1 %o0 +#define A2 %o1 +#define A3 %o2 +#define A4 %o3 + +#define X1 %o4 +#define Y1 %o5 +#define PNLDA %g1 +#define Y2 %o7 /* Danger? */ + +#ifdef DOUBLE +#define t1 %f0 +#define t2 %f2 +#define t3 %f4 +#define t4 %f6 + +#define c1 %f8 +#define c2 %f10 +#define c3 %f12 +#define c4 %f14 +#define c5 %f16 +#define c6 %f18 +#define c7 %f20 +#define c8 %f22 +#define c9 %f24 +#define c10 %f26 +#define c11 %f28 +#define c12 %f30 +#define c13 %f32 +#define c14 %f34 +#define c15 %f36 +#define c16 %f38 + +#define a1 %f40 +#define a2 %f42 +#define a3 %f44 +#define a4 %f46 +#define a5 %f48 +#define a6 %f50 +#define a7 %f52 +#define a8 %f54 + +#define b1 %f56 +#define b2 %f58 +#define b3 %f60 +#define b4 %f62 +#else +#define t1 %f0 +#define t2 %f1 +#define t3 %f2 +#define t4 %f3 + +#define c1 %f4 +#define c2 %f5 +#define c3 %f6 +#define c4 %f7 +#define c5 %f8 +#define c6 %f9 +#define c7 %f10 +#define c8 %f11 +#define c9 %f12 +#define c10 %f13 +#define c11 %f14 +#define c12 %f15 +#define c13 %f16 +#define c14 %f17 +#define c15 %f18 +#define c16 %f19 + +#define a1 %f20 +#define a2 %f21 +#define a3 %f22 +#define a4 %f23 +#define a5 %f24 +#define a6 %f25 +#define a7 %f26 +#define a8 %f27 + +#define b1 %f28 +#define b2 %f29 +#define b3 %f30 +#define b4 %f31 +#endif + +#ifndef __64BIT__ +#define ALPHA_R [%sp + STACK_START + 16] +#ifndef DOUBLE +#define ALPHA_I [%sp + STACK_START + 20] +#else +#define ALPHA_I [%sp + STACK_START + 24] +#endif +#else +#define ALPHA_R [%sp + STACK_START + 32] +#define ALPHA_I [%sp + STACK_START + 40] +#endif + +#ifdef DOUBLE +#define PREFETCHSIZE 18 +#else +#define PREFETCHSIZE 36 +#endif + + PROLOGUE + SAVESP + nop + +#ifndef __64BIT__ + +#ifdef DOUBLE + st %i3, [%sp + STACK_START + 16] /* ALPHA_R */ + st %i4, [%sp + STACK_START + 20] + st %i5, [%sp + STACK_START + 24] /* ALPHA_I */ + + ld [%sp + STACK_START + 32], A + ld [%sp + STACK_START + 36], LDA + ld [%sp + STACK_START + 40], X + ld [%sp + STACK_START + 44], INCX + ld [%sp + STACK_START + 48], Y + ld [%sp + STACK_START + 52], INCY + ld [%sp + STACK_START + 56], BUFFER +#else + st %i3, [%sp + STACK_START + 16] /* ALPHA_R */ + st %i4, [%sp + STACK_START + 20] /* ALPHA_I */ + + ld [%sp + STACK_START + 28], LDA + ld [%sp + STACK_START + 32], X + ld [%sp + STACK_START + 36], INCX + ld [%sp + STACK_START + 40], Y + ld [%sp + STACK_START + 44], INCY + ld [%sp + STACK_START + 48], BUFFER +#endif +#else + ldx [%sp + STACK_START + 56], LDA + ldx [%sp + STACK_START + 64], X + ldx [%sp + STACK_START + 72], INCX + ldx [%sp + STACK_START + 80], Y + ldx [%sp + STACK_START + 88], INCY + ldx [%sp + STACK_START + 96], BUFFER +#ifdef DOUBLE + std %f6, ALPHA_R + std %f8, ALPHA_I +#else + st %f7, ALPHA_R + st %f9, ALPHA_I +#endif +#endif + + clr IS + mov P, I + sll LDA, ZBASE_SHIFT, LDA + sll I, ZBASE_SHIFT, I + smul LDA, N, PNLDA + sll INCX, ZBASE_SHIFT, INCX + sll INCY, ZBASE_SHIFT, INCY + sub I, PNLDA, 
PNLDA + +.LL10: + sll IS, ZBASE_SHIFT, I + sub M, IS, MIN_M + mov P, J + + cmp MIN_M, J + nop + movg %icc, J, MIN_M + nop + cmp INCX, 2 * SIZE + beq .LL100 + add X, I, XP + + sra MIN_M, 2, I + mov BUFFER, XP + cmp I, 0 + ble,pn %icc, .LL15 + mov BUFFER, Y1 + +.LL11: + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + add X, INCX, X + LDF [X + 0 * SIZE], a3 + LDF [X + 1 * SIZE], a4 + add X, INCX, X + LDF [X + 0 * SIZE], a5 + LDF [X + 1 * SIZE], a6 + add X, INCX, X + LDF [X + 0 * SIZE], a7 + LDF [X + 1 * SIZE], a8 + add X, INCX, X + + STF a1, [Y1 + 0 * SIZE] + add I, -1, I + STF a2, [Y1 + 1 * SIZE] + cmp I, 0 + STF a3, [Y1 + 2 * SIZE] + STF a4, [Y1 + 3 * SIZE] + STF a5, [Y1 + 4 * SIZE] + STF a6, [Y1 + 5 * SIZE] + STF a7, [Y1 + 6 * SIZE] + STF a8, [Y1 + 7 * SIZE] + bg,pn %icc, .LL11 + add Y1, 8 * SIZE, Y1 + +.LL15: + and MIN_M, 3, I + cmp I, 0 + ble,pn %icc, .LL100 + nop + +.LL16: + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + add X, INCX, X + add I, -1, I + cmp I, 0 + nop + STF a1, [Y1 + 0 * SIZE] + STF a2, [Y1 + 1 * SIZE] + bg,pn %icc, .LL16 + add Y1, 2 * SIZE, Y1 + +.LL100: + sra N, 2, J + cmp J, 0 + ble %icc, .LL200 + mov Y, Y1 + +.LL110: + FCLR(0) + + FMOV t1, c1 + sra MIN_M, 2, I + FMOV t1, c2 + add A, LDA, A2 + FMOV t1, c3 + mov A, A1 + FMOV t1, c4 + add A2, LDA, A3 + + FMOV t1, c5 + FMOV t1, c6 + FMOV t1, c7 + FMOV t1, c8 + FMOV t1, c9 + FMOV t1, c10 + FMOV t1, c11 + FMOV t1, c12 + FMOV t1, c13 + FMOV t1, c14 + FMOV t1, c15 + FMOV t1, c16 + + add A3, LDA, A4 + FMOV t1, t2 + mov XP, X1 + FMOV t1, t3 + add A4, LDA, A + cmp I, 0 + ble %icc, .LL115 + FMOV t1, t4 + + LDF [A1 + 0 * SIZE], a1 + nop + LDF [A1 + 1 * SIZE], a2 + add A1, 2 * SIZE, A1 + LDF [A2 + 0 * SIZE], a3 + LDF [A2 + 1 * SIZE], a4 + add A2, 2 * SIZE, A2 + LDF [A3 + 0 * SIZE], a5 + LDF [A3 + 1 * SIZE], a6 + add A3, 2 * SIZE, A3 + LDF [A4 + 0 * SIZE], a7 + LDF [A4 + 1 * SIZE], a8 + add A4, 2 * SIZE, A4 + + LDF [X1 + 0 * SIZE], b1 + nop + LDF [X1 + 1 * SIZE], b2 + nop + LDF [X1 + 2 * SIZE], b3 + add X1, 4 * SIZE, X1 + + deccc I + ble .LL112 + prefetch [Y1 + 7 * SIZE], 2 + +#ifndef XCONJ +#define FADDX FADD +#else +#define FADDX FSUB +#endif + +.LL111: + FADD c13, t1, c13 + prefetch [A1 + PREFETCHSIZE * SIZE], 1 + FMUL a1, b1, t1 + nop + + FADDX c14, t2, c14 + nop + FMUL a1, b2, t2 + LDF [A1 + 0 * SIZE], a1 + + FADD c15, t3, c15 + nop + FMUL a2, b1, t3 + LDF [X1 - 1 * SIZE], b4 + + FADD c16, t4, c16 + nop + FMUL a2, b2, t4 + LDF [A1 + 1 * SIZE], a2 + + FADD c1, t1, c1 + nop + FMUL a3, b1, t1 + nop + + FADDX c2, t2, c2 + nop + FMUL a3, b2, t2 + LDF [A2 + 0 * SIZE], a3 + + FADD c3, t3, c3 + nop + FMUL a4, b1, t3 + nop + + FADD c4, t4, c4 + nop + FMUL a4, b2, t4 + LDF [A2 + 1 * SIZE], a4 + + FADD c5, t1, c5 + nop + FMUL a5, b1, t1 + nop + + FADDX c6, t2, c6 + nop + FMUL a5, b2, t2 + LDF [A3 + 0 * SIZE], a5 + + FADD c7, t3, c7 + nop + FMUL a6, b1, t3 + nop + + FADD c8, t4, c8 + nop + FMUL a6, b2, t4 + LDF [A3 + 1 * SIZE], a6 + + FADD c9, t1, c9 + nop + FMUL a7, b1, t1 + nop + + FADDX c10, t2, c10 + nop + FMUL a7, b2, t2 + LDF [A4 + 0 * SIZE], a7 + + FADD c11, t3, c11 + nop + FMUL a8, b1, t3 + LDF [X1 + 0 * SIZE], b1 + + FADD c12, t4, c12 + nop + FMUL a8, b2, t4 + LDF [A4 + 1 * SIZE], a8 + + FADD c13, t1, c13 + nop + FMUL a1, b3, t1 + prefetch [A2 + PREFETCHSIZE * SIZE], 1 + + FADDX c14, t2, c14 + nop + FMUL a1, b4, t2 + LDF [A1 + 2 * SIZE], a1 + + FADD c15, t3, c15 + nop + FMUL a2, b3, t3 + LDF [X1 + 1 * SIZE], b2 + + FADD c16, t4, c16 + nop + FMUL a2, b4, t4 + LDF [A1 + 3 * SIZE], a2 + + FADD c1, t1, c1 + nop + FMUL a3, b3, t1 
+ nop + + FADDX c2, t2, c2 + nop + FMUL a3, b4, t2 + LDF [A2 + 2 * SIZE], a3 + + FADD c3, t3, c3 + nop + FMUL a4, b3, t3 + nop + + FADD c4, t4, c4 + nop + FMUL a4, b4, t4 + LDF [A2 + 3 * SIZE], a4 + + FADD c5, t1, c5 + nop + FMUL a5, b3, t1 + nop + + FADDX c6, t2, c6 + nop + FMUL a5, b4, t2 + LDF [A3 + 2 * SIZE], a5 + + FADD c7, t3, c7 + nop + FMUL a6, b3, t3 + nop + + FADD c8, t4, c8 + nop + FMUL a6, b4, t4 + LDF [A3 + 3 * SIZE], a6 + + FADD c9, t1, c9 + nop + FMUL a7, b3, t1 + nop + + FADDX c10, t2, c10 + nop + FMUL a7, b4, t2 + LDF [A4 + 2 * SIZE], a7 + + FADD c11, t3, c11 + nop + FMUL a8, b3, t3 + LDF [X1 + 2 * SIZE], b3 + + FADD c12, t4, c12 + nop + FMUL a8, b4, t4 + LDF [A4 + 3 * SIZE], a8 + + FADD c13, t1, c13 + prefetch [A3 + PREFETCHSIZE * SIZE], 1 + FMUL a1, b1, t1 + nop + + FADDX c14, t2, c14 + nop + FMUL a1, b2, t2 + LDF [A1 + 4 * SIZE], a1 + + FADD c15, t3, c15 + nop + FMUL a2, b1, t3 + LDF [X1 + 3 * SIZE], b4 + + FADD c16, t4, c16 + nop + FMUL a2, b2, t4 + LDF [A1 + 5 * SIZE], a2 + + FADD c1, t1, c1 + nop + FMUL a3, b1, t1 + nop + + FADDX c2, t2, c2 + nop + FMUL a3, b2, t2 + LDF [A2 + 4 * SIZE], a3 + + FADD c3, t3, c3 + nop + FMUL a4, b1, t3 + nop + + FADD c4, t4, c4 + nop + FMUL a4, b2, t4 + LDF [A2 + 5 * SIZE], a4 + + FADD c5, t1, c5 + nop + FMUL a5, b1, t1 + nop + + FADDX c6, t2, c6 + nop + FMUL a5, b2, t2 + LDF [A3 + 4 * SIZE], a5 + + FADD c7, t3, c7 + deccc I + FMUL a6, b1, t3 + nop + + FADD c8, t4, c8 + nop + FMUL a6, b2, t4 + LDF [A3 + 5 * SIZE], a6 + + FADD c9, t1, c9 + nop + FMUL a7, b1, t1 + nop + + FADDX c10, t2, c10 + nop + FMUL a7, b2, t2 + LDF [A4 + 4 * SIZE], a7 + + FADD c11, t3, c11 + nop + FMUL a8, b1, t3 + LDF [X1 + 4 * SIZE], b1 + + FADD c12, t4, c12 + nop + FMUL a8, b2, t4 + LDF [A4 + 5 * SIZE], a8 + + FADD c13, t1, c13 + prefetch [A4 + PREFETCHSIZE * SIZE], 1 + FMUL a1, b3, t1 + nop + + FADDX c14, t2, c14 + nop + FMUL a1, b4, t2 + LDF [A1 + 6 * SIZE], a1 + + FADD c15, t3, c15 + nop + FMUL a2, b3, t3 + LDF [X1 + 5 * SIZE], b2 + + FADD c16, t4, c16 + nop + FMUL a2, b4, t4 + LDF [A1 + 7 * SIZE], a2 + + FADD c1, t1, c1 + add A1, 8 * SIZE, A1 + FMUL a3, b3, t1 + nop + + FADDX c2, t2, c2 + nop + FMUL a3, b4, t2 + LDF [A2 + 6 * SIZE], a3 + + FADD c3, t3, c3 + nop + FMUL a4, b3, t3 + nop + + FADD c4, t4, c4 + nop + FMUL a4, b4, t4 + LDF [A2 + 7 * SIZE], a4 + + FADD c5, t1, c5 + add A2, 8 * SIZE, A2 + FMUL a5, b3, t1 + nop + + FADDX c6, t2, c6 + nop + FMUL a5, b4, t2 + LDF [A3 + 6 * SIZE], a5 + + FADD c7, t3, c7 + add A4, 8 * SIZE, A4 + FMUL a6, b3, t3 + nop + + FADD c8, t4, c8 + nop + FMUL a6, b4, t4 + LDF [A3 + 7 * SIZE], a6 + + FADD c9, t1, c9 + add A3, 8 * SIZE, A3 + FMUL a7, b3, t1 + nop + + FADDX c10, t2, c10 + add X1, 8 * SIZE, X1 + FMUL a7, b4, t2 + LDF [A4 - 2 * SIZE], a7 + + FADD c11, t3, c11 + nop + FMUL a8, b3, t3 + LDF [X1 - 2 * SIZE], b3 + + FADD c12, t4, c12 + FMUL a8, b4, t4 + bg,pn %icc, .LL111 + LDF [A4 - 1 * SIZE], a8 + +.LL112: + FADD c13, t1, c13 + nop + FMUL a1, b1, t1 + LDF [X1 - 1 * SIZE], b4 + + FADDX c14, t2, c14 + nop + FMUL a1, b2, t2 + LDF [A1 + 0 * SIZE], a1 + + FADD c15, t3, c15 + nop + FMUL a2, b1, t3 + LDF [X1 - 1 * SIZE], b4 + + FADD c16, t4, c16 + nop + FMUL a2, b2, t4 + LDF [A1 + 1 * SIZE], a2 + + FADD c1, t1, c1 + nop + FMUL a3, b1, t1 + nop + + FADDX c2, t2, c2 + nop + FMUL a3, b2, t2 + LDF [A2 + 0 * SIZE], a3 + + FADD c3, t3, c3 + nop + FMUL a4, b1, t3 + nop + + FADD c4, t4, c4 + nop + FMUL a4, b2, t4 + LDF [A2 + 1 * SIZE], a4 + + FADD c5, t1, c5 + nop + FMUL a5, b1, t1 + nop + + FADDX c6, t2, c6 + nop + FMUL a5, b2, t2 + LDF 
[A3 + 0 * SIZE], a5 + + FADD c7, t3, c7 + nop + FMUL a6, b1, t3 + nop + + FADD c8, t4, c8 + nop + FMUL a6, b2, t4 + LDF [A3 + 1 * SIZE], a6 + + FADD c9, t1, c9 + nop + FMUL a7, b1, t1 + nop + + FADDX c10, t2, c10 + nop + FMUL a7, b2, t2 + LDF [A4 + 0 * SIZE], a7 + + FADD c11, t3, c11 + nop + FMUL a8, b1, t3 + LDF [X1 + 0 * SIZE], b1 + + FADD c12, t4, c12 + nop + FMUL a8, b2, t4 + LDF [A4 + 1 * SIZE], a8 + + FADD c13, t1, c13 + nop + FMUL a1, b3, t1 + LDF [X1 + 1 * SIZE], b2 + + FADDX c14, t2, c14 + nop + FMUL a1, b4, t2 + LDF [A1 + 2 * SIZE], a1 + + FADD c15, t3, c15 + nop + FMUL a2, b3, t3 + nop + + FADD c16, t4, c16 + nop + FMUL a2, b4, t4 + LDF [A1 + 3 * SIZE], a2 + + FADD c1, t1, c1 + nop + FMUL a3, b3, t1 + nop + + FADDX c2, t2, c2 + nop + FMUL a3, b4, t2 + LDF [A2 + 2 * SIZE], a3 + + FADD c3, t3, c3 + nop + FMUL a4, b3, t3 + nop + + FADD c4, t4, c4 + nop + FMUL a4, b4, t4 + LDF [A2 + 3 * SIZE], a4 + + FADD c5, t1, c5 + nop + FMUL a5, b3, t1 + nop + + FADDX c6, t2, c6 + nop + FMUL a5, b4, t2 + LDF [A3 + 2 * SIZE], a5 + + FADD c7, t3, c7 + nop + FMUL a6, b3, t3 + nop + + FADD c8, t4, c8 + nop + FMUL a6, b4, t4 + LDF [A3 + 3 * SIZE], a6 + + FADD c9, t1, c9 + nop + FMUL a7, b3, t1 + nop + + FADDX c10, t2, c10 + nop + FMUL a7, b4, t2 + LDF [A4 + 2 * SIZE], a7 + + FADD c11, t3, c11 + nop + FMUL a8, b3, t3 + LDF [X1 + 2 * SIZE], b3 + + FADD c12, t4, c12 + nop + FMUL a8, b4, t4 + LDF [A4 + 3 * SIZE], a8 + + FADD c13, t1, c13 + nop + FMUL a1, b1, t1 + LDF [X1 + 3 * SIZE], b4 + + FADDX c14, t2, c14 + add X1, 4 * SIZE, X1 + FMUL a1, b2, t2 + LDF [A1 + 4 * SIZE], a1 + + FADD c15, t3, c15 + nop + FMUL a2, b1, t3 + nop + + FADD c16, t4, c16 + nop + FMUL a2, b2, t4 + LDF [A1 + 5 * SIZE], a2 + + FADD c1, t1, c1 + add A1, 6 * SIZE, A1 + FMUL a3, b1, t1 + nop + + FADDX c2, t2, c2 + nop + FMUL a3, b2, t2 + LDF [A2 + 4 * SIZE], a3 + + FADD c3, t3, c3 + nop + FMUL a4, b1, t3 + nop + + FADD c4, t4, c4 + nop + FMUL a4, b2, t4 + LDF [A2 + 5 * SIZE], a4 + + FADD c5, t1, c5 + add A2, 6 * SIZE, A2 + FMUL a5, b1, t1 + nop + + FADDX c6, t2, c6 + nop + FMUL a5, b2, t2 + LDF [A3 + 4 * SIZE], a5 + + FADD c7, t3, c7 + nop + FMUL a6, b1, t3 + nop + + FADD c8, t4, c8 + nop + FMUL a6, b2, t4 + LDF [A3 + 5 * SIZE], a6 + + FADD c9, t1, c9 + add A3, 6 * SIZE, A3 + FMUL a7, b1, t1 + nop + + FADDX c10, t2, c10 + nop + FMUL a7, b2, t2 + LDF [A4 + 4 * SIZE], a7 + + FADD c11, t3, c11 + nop + FMUL a8, b1, t3 + nop + + FADD c12, t4, c12 + nop + FMUL a8, b2, t4 + LDF [A4 + 5 * SIZE], a8 + + FADD c13, t1, c13 + add A4, 6 * SIZE, A4 + FMUL a1, b3, t1 + nop + + FADDX c14, t2, c14 + nop + FMUL a1, b4, t2 + nop + + FADD c15, t3, c15 + FMUL a2, b3, t3 + FADD c16, t4, c16 + FMUL a2, b4, t4 + + FADD c1, t1, c1 + FMUL a3, b3, t1 + FADDX c2, t2, c2 + FMUL a3, b4, t2 + FADD c3, t3, c3 + FMUL a4, b3, t3 + FADD c4, t4, c4 + FMUL a4, b4, t4 + + FADD c5, t1, c5 + FMUL a5, b3, t1 + FADDX c6, t2, c6 + FMUL a5, b4, t2 + FADD c7, t3, c7 + FMUL a6, b3, t3 + FADD c8, t4, c8 + FMUL a6, b4, t4 + + FADD c9, t1, c9 + FMUL a7, b3, t1 + FADDX c10, t2, c10 + FMUL a7, b4, t2 + FADD c11, t3, c11 + FMUL a8, b3, t3 + FADD c12, t4, c12 + FMUL a8, b4, t4 + +.LL115: + andcc MIN_M, 3, I + LDF ALPHA_R, b3 + mov Y1, Y2 + ble,pn %icc, .LL119 + LDF ALPHA_I, b4 + +.L116: + LDF [A1 + 0 * SIZE], a1 + LDF [A1 + 1 * SIZE], a2 + add A1, 2 * SIZE, A1 + LDF [X1 + 0 * SIZE], b1 + LDF [X1 + 1 * SIZE], b2 + add X1, 2 * SIZE, X1 + LDF [A2 + 0 * SIZE], a3 + LDF [A2 + 1 * SIZE], a4 + add A2, 2 * SIZE, A2 + LDF [A3 + 0 * SIZE], a5 + LDF [A3 + 1 * SIZE], a6 + add A3, 2 * SIZE, A3 + 
LDF [A4 + 0 * SIZE], a7 + LDF [A4 + 1 * SIZE], a8 + add A4, 2 * SIZE, A4 + + FADD c13, t1, c13 + FMUL a1, b1, t1 + FADDX c14, t2, c14 + FMUL a1, b2, t2 + FADD c15, t3, c15 + FMUL a2, b1, t3 + FADD c16, t4, c16 + FMUL a2, b2, t4 + + FADD c1, t1, c1 + FMUL a3, b1, t1 + FADDX c2, t2, c2 + FMUL a3, b2, t2 + FADD c3, t3, c3 + FMUL a4, b1, t3 + FADD c4, t4, c4 + FMUL a4, b2, t4 + + FADD c5, t1, c5 + FMUL a5, b1, t1 + FADDX c6, t2, c6 + FMUL a5, b2, t2 + FADD c7, t3, c7 + FMUL a6, b1, t3 + FADD c8, t4, c8 + FMUL a6, b2, t4 + + FADD c9, t1, c9 + FMUL a7, b1, t1 + FADDX c10, t2, c10 + FMUL a7, b2, t2 + FADD c11, t3, c11 + FMUL a8, b1, t3 + FADD c12, t4, c12 + FMUL a8, b2, t4 + + deccc I + bg %icc, .L116 + nop + +.LL119: + FADD c13, t1, c13 + LDF [Y1 + 0 * SIZE], a1 + FADDX c14, t2, c14 + LDF [Y1 + 1 * SIZE] ,a2 + add Y1, INCY, Y1 + FADD c15, t3, c15 + LDF [Y1 + 0 * SIZE], a3 + FADD c16, t4, c16 + LDF [Y1 + 1 * SIZE] ,a4 + add Y1, INCY, Y1 + +#if (!defined(XCONJ) && !defined(CONJ)) || (defined(XCONJ) && defined(CONJ)) + FSUB c1, c4, c1 + LDF [Y1 + 0 * SIZE], a5 + FSUB c5, c8, c5 + LDF [Y1 + 1 * SIZE] ,a6 + add Y1, INCY, Y1 + FSUB c9, c12, c9 + LDF [Y1 + 0 * SIZE], a7 + FSUB c13, c16, c13 + LDF [Y1 + 1 * SIZE] ,a8 + add Y1, INCY, Y1 +#else + FADD c1, c4, c1 + LDF [Y1 + 0 * SIZE], a5 + FADD c5, c8, c5 + LDF [Y1 + 1 * SIZE] ,a6 + add Y1, INCY, Y1 + FADD c9, c12, c9 + LDF [Y1 + 0 * SIZE], a7 + FADD c13, c16, c13 + LDF [Y1 + 1 * SIZE] ,a8 + add Y1, INCY, Y1 +#endif + +#ifndef CONJ + FADD c2, c3, c2 + FCLR(0) + FADD c6, c7, c6 + FADD c10, c11, c10 + FADD c14, c15, c14 +#else + FSUB c2, c3, c2 + FCLR(0) + FSUB c6, c7, c6 + FSUB c10, c11, c10 + FSUB c14, c15, c14 +#endif + + FMUL b3, c1, c3 + FMOV t1, t2 + FMUL b4, c1, c4 + FMOV t1, t3 + FMUL b4, c2, c1 + FMOV t1, t4 + FMUL b3, c2, c2 + + FMUL b3, c5, c7 + FMUL b4, c5, c8 + FMUL b4, c6, c5 + FMUL b3, c6, c6 + + FMUL b3, c9, c11 + FMUL b4, c9, c12 + FMUL b4, c10, c9 + FMUL b3, c10, c10 + + FMUL b3, c13, c15 + FSUB c3, c1, c1 + FMUL b4, c13, c16 + FADD c2, c4, c2 + FMUL b4, c14, c13 + FSUB c7, c5, c5 + FMUL b3, c14, c14 + FADD c6, c8, c6 + + FSUB c11, c9, c9 + FADD c10, c12, c10 + FSUB c15, c13, c13 + FADD c14, c16, c14 + + FADD a1, c1, a1 + FADD a2, c2, a2 + FADD a3, c5, a3 + FADD a4, c6, a4 + + STF a1, [Y2 + 0 * SIZE] + FADD a5, c9, a5 + STF a2, [Y2 + 1 * SIZE] + FADD a6, c10, a6 + add Y2, INCY, Y2 + STF a3, [Y2 + 0 * SIZE] + FADD a7, c13, a7 + STF a4, [Y2 + 1 * SIZE] + FADD a8, c14, a8 + add Y2, INCY, Y2 + + STF a5, [Y2 + 0 * SIZE] + FMOV t1, c1 + add J, -1, J + STF a6, [Y2 + 1 * SIZE] + FMOV t1, c2 + cmp J, 0 + add Y2, INCY, Y2 + STF a7, [Y2 + 0 * SIZE] + FMOV t1, c3 + STF a8, [Y2 + 1 * SIZE] + FMOV t1, c4 + add Y2, INCY, Y2 + + FMOV t1, c5 + bg %icc, .LL110 + FMOV t1, c6 + +.LL200: + FCLR(0) + + and N, 2, J + cmp J, 0 + FMOV t1, c1 + ble %icc, .LL300 + + FMOV t1, c2 + sra MIN_M, 2, I + FMOV t1, t2 + add A, LDA, A2 + FMOV t1, c3 + mov A, A1 + FMOV t1, t3 + cmp I, 0 + FMOV t1, c4 + + FMOV t1, c5 + FMOV t1, c6 + FMOV t1, c7 + FMOV t1, c8 + + add A2, LDA, A + FMOV t1, t4 + ble %icc, .LL215 + mov XP, X1 + + LDF [A1 + 0 * SIZE], a1 + LDF [A1 + 1 * SIZE], a2 + LDF [A1 + 2 * SIZE], a5 + LDF [A1 + 3 * SIZE], a6 + add A1, 4 * SIZE, A1 + + LDF [A2 + 0 * SIZE], a3 + LDF [A2 + 1 * SIZE], a4 + LDF [A2 + 2 * SIZE], a7 + LDF [A2 + 3 * SIZE], a8 + add A2, 4 * SIZE, A2 + + LDF [X1 + 0 * SIZE], b1 + add I, -1, I + LDF [X1 + 1 * SIZE], b2 + cmp I, 0 + LDF [X1 + 2 * SIZE], b3 + LDF [X1 + 3 * SIZE], b4 + ble %icc, .LL212 + add X1, 4 * SIZE, X1 + +.LL211: + prefetch [A1 + 
PREFETCHSIZE * SIZE], 1 + + FADD c5, t1, c5 + FMUL a1, b1, t1 + FADDX c6, t2, c6 + FMUL a1, b2, t2 + LDF [A1 + 0 * SIZE], a1 + FADD c7, t3, c7 + FMUL a2, b1, t3 + FADD c8, t4, c8 + FMUL a2, b2, t4 + LDF [A1 + 1 * SIZE], a2 + + FADD c1, t1, c1 + FMUL a3, b1, t1 + FADDX c2, t2, c2 + FMUL a3, b2, t2 + LDF [A2 + 0 * SIZE], a3 + FADD c3, t3, c3 + FMUL a4, b1, t3 + LDF [X1 + 0 * SIZE], b1 + FADD c4, t4, c4 + FMUL a4, b2, t4 + LDF [A2 + 1 * SIZE], a4 + + FADD c5, t1, c5 + LDF [X1 + 1 * SIZE], b2 + FMUL a5, b3, t1 + FADDX c6, t2, c6 + FMUL a5, b4, t2 + LDF [A1 + 2 * SIZE], a5 + FADD c7, t3, c7 + add I, -1, I + FMUL a6, b3, t3 + FADD c8, t4, c8 + cmp I, 0 + FMUL a6, b4, t4 + LDF [A1 + 3 * SIZE], a6 + + FADD c1, t1, c1 + FMUL a7, b3, t1 + FADDX c2, t2, c2 + FMUL a7, b4, t2 + LDF [A2 + 2 * SIZE], a7 + FADD c3, t3, c3 + FMUL a8, b3, t3 + LDF [X1 + 2 * SIZE], b3 + FADD c4, t4, c4 + FMUL a8, b4, t4 + LDF [A2 + 3 * SIZE], a8 + + prefetch [A2 + PREFETCHSIZE * SIZE], 1 + FADD c5, t1, c5 + LDF [X1 + 3 * SIZE], b4 + FMUL a1, b1, t1 + FADDX c6, t2, c6 + FMUL a1, b2, t2 + LDF [A1 + 4 * SIZE], a1 + FADD c7, t3, c7 + FMUL a2, b1, t3 + FADD c8, t4, c8 + FMUL a2, b2, t4 + LDF [A1 + 5 * SIZE], a2 + + FADD c1, t1, c1 + FMUL a3, b1, t1 + FADDX c2, t2, c2 + FMUL a3, b2, t2 + LDF [A2 + 4 * SIZE], a3 + FADD c3, t3, c3 + FMUL a4, b1, t3 + LDF [X1 + 4 * SIZE], b1 + FADD c4, t4, c4 + FMUL a4, b2, t4 + LDF [A2 + 5 * SIZE], a4 + + FADD c5, t1, c5 + LDF [X1 + 5 * SIZE], b2 + FMUL a5, b3, t1 + FADDX c6, t2, c6 + FMUL a5, b4, t2 + LDF [A1 + 6 * SIZE], a5 + FADD c7, t3, c7 + FMUL a6, b3, t3 + FADD c8, t4, c8 + FMUL a6, b4, t4 + LDF [A1 + 7 * SIZE], a6 + add A1, 8 * SIZE, A1 + + FADD c1, t1, c1 + FMUL a7, b3, t1 + FADDX c2, t2, c2 + FMUL a7, b4, t2 + LDF [A2 + 6 * SIZE], a7 + FADD c3, t3, c3 + FMUL a8, b3, t3 + LDF [X1 + 6 * SIZE], b3 + FADD c4, t4, c4 + add X1, 8 * SIZE, X1 + FMUL a8, b4, t4 + LDF [A2 + 7 * SIZE], a8 + add A2, 8 * SIZE, A2 + bg,pn %icc, .LL211 + LDF [X1 - 1 * SIZE], b4 + +.LL212: + FADD c5, t1, c5 + FMUL a1, b1, t1 + FADDX c6, t2, c6 + FMUL a1, b2, t2 + LDF [A1 + 0 * SIZE], a1 + FADD c7, t3, c7 + FMUL a2, b1, t3 + FADD c8, t4, c8 + FMUL a2, b2, t4 + LDF [A1 + 1 * SIZE], a2 + + FADD c1, t1, c1 + FMUL a3, b1, t1 + FADDX c2, t2, c2 + FMUL a3, b2, t2 + LDF [A2 + 0 * SIZE], a3 + FADD c3, t3, c3 + FMUL a4, b1, t3 + LDF [X1 + 0 * SIZE], b1 + FADD c4, t4, c4 + FMUL a4, b2, t4 + LDF [A2 + 1 * SIZE], a4 + + FADD c5, t1, c5 + LDF [X1 + 1 * SIZE], b2 + FMUL a5, b3, t1 + FADDX c6, t2, c6 + FMUL a5, b4, t2 + LDF [A1 + 2 * SIZE], a5 + FADD c7, t3, c7 + FMUL a6, b3, t3 + FADD c8, t4, c8 + FMUL a6, b4, t4 + LDF [A1 + 3 * SIZE], a6 + add A1, 4 * SIZE, A1 + + FADD c1, t1, c1 + FMUL a7, b3, t1 + FADDX c2, t2, c2 + FMUL a7, b4, t2 + LDF [A2 + 2 * SIZE], a7 + FADD c3, t3, c3 + FMUL a8, b3, t3 + LDF [X1 + 2 * SIZE], b3 + FADD c4, t4, c4 + FMUL a8, b4, t4 + LDF [A2 + 3 * SIZE], a8 + add A2, 4 * SIZE, A2 + + FADD c5, t1, c5 + LDF [X1 + 3 * SIZE], b4 + add X1, 4 * SIZE, X1 + FMUL a1, b1, t1 + FADDX c6, t2, c6 + FMUL a1, b2, t2 + FADD c7, t3, c7 + FMUL a2, b1, t3 + FADD c8, t4, c8 + FMUL a2, b2, t4 + + FADD c1, t1, c1 + FMUL a3, b1, t1 + FADDX c2, t2, c2 + FMUL a3, b2, t2 + FADD c3, t3, c3 + FMUL a4, b1, t3 + FADD c4, t4, c4 + FMUL a4, b2, t4 + + FADD c5, t1, c5 + FMUL a5, b3, t1 + FADDX c6, t2, c6 + FMUL a5, b4, t2 + FADD c7, t3, c7 + FMUL a6, b3, t3 + FADD c8, t4, c8 + FMUL a6, b4, t4 + + FADD c1, t1, c1 + FMUL a7, b3, t1 + FADDX c2, t2, c2 + FMUL a7, b4, t2 + FADD c3, t3, c3 + FMUL a8, b3, t3 + FADD c4, t4, c4 + FMUL a8, b4, t4 + 
+.LL215: + andcc MIN_M, 3, I + LDF ALPHA_R, b3 + mov Y1, Y2 + ble %icc, .LL219 + LDF ALPHA_I, b4 + + LDF [A1 + 0 * SIZE], a1 + add I, -1, I + LDF [A1 + 1 * SIZE], a2 + cmp I, 0 + add A1, 2 * SIZE, A1 + + LDF [A2 + 0 * SIZE], a3 + LDF [A2 + 1 * SIZE], a4 + add A2, 2 * SIZE, A2 + + LDF [X1 + 0 * SIZE], b1 + LDF [X1 + 1 * SIZE], b2 + ble %icc, .LL217 + add X1, 2 * SIZE, X1 + +.LL216: + FADD c5, t1, c5 + FMUL a1, b1, t1 + FADDX c6, t2, c6 + FMUL a1, b2, t2 + LDF [A1 + 0 * SIZE], a1 + FADD c7, t3, c7 + add I, -1, I + FMUL a2, b1, t3 + FADD c8, t4, c8 + cmp I, 0 + FMUL a2, b2, t4 + LDF [A1 + 1 * SIZE], a2 + add A1, 2 * SIZE, A1 + + FADD c1, t1, c1 + FMUL a3, b1, t1 + FADDX c2, t2, c2 + FMUL a3, b2, t2 + LDF [A2 + 0 * SIZE], a3 + FADD c3, t3, c3 + FMUL a4, b1, t3 + LDF [X1 + 0 * SIZE], b1 + FADD c4, t4, c4 + add X1, 2 * SIZE, X1 + FMUL a4, b2, t4 + LDF [A2 + 1 * SIZE], a4 + add A2, 2 * SIZE, A2 + bg,pn %icc, .LL216 + LDF [X1 - 1 * SIZE], b2 + +.LL217: + FADD c5, t1, c5 + FMUL a1, b1, t1 + FADDX c6, t2, c6 + FMUL a1, b2, t2 + FADD c7, t3, c7 + FMUL a2, b1, t3 + FADD c8, t4, c8 + FMUL a2, b2, t4 + + FADD c1, t1, c1 + FMUL a3, b1, t1 + FADDX c2, t2, c2 + FMUL a3, b2, t2 + FADD c3, t3, c3 + FMUL a4, b1, t3 + FADD c4, t4, c4 + FMUL a4, b2, t4 + +.LL219: + FADD c5, t1, c5 + LDF [Y1 + 0 * SIZE], a1 + FADDX c6, t2, c6 + LDF [Y1 + 1 * SIZE] ,a2 + add Y1, INCY, Y1 + FADD c7, t3, c7 + LDF [Y1 + 0 * SIZE], a3 + FADD c8, t4, c8 + LDF [Y1 + 1 * SIZE] ,a4 + add Y1, INCY, Y1 + +#if (!defined(XCONJ) && !defined(CONJ)) || (defined(XCONJ) && defined(CONJ)) + FSUB c1, c4, c1 + FSUB c5, c8, c5 +#else + FADD c1, c4, c1 + FADD c5, c8, c5 +#endif + +#ifndef CONJ + FADD c2, c3, c2 + FADD c6, c7, c6 +#else + FSUB c2, c3, c2 + FSUB c6, c7, c6 +#endif + + FMUL b3, c1, c3 + FMUL b4, c1, c4 + FMUL b4, c2, c1 + FMUL b3, c2, c2 + + FMUL b3, c5, c7 + FMUL b4, c5, c8 + FMUL b4, c6, c5 + FMUL b3, c6, c6 + + FSUB c3, c1, c1 + FADD c2, c4, c2 + FSUB c7, c5, c5 + FADD c6, c8, c6 + + FADD a1, c1, a1 + FADD a2, c2, a2 + FADD a3, c5, a3 + FADD a4, c6, a4 + + STF a1, [Y2 + 0 * SIZE] + STF a2, [Y2 + 1 * SIZE] + add Y2, INCY, Y2 + STF a3, [Y2 + 0 * SIZE] + STF a4, [Y2 + 1 * SIZE] + +.LL300: + andcc N, 1, J + FCLR(0) + ble %icc, .LL400 + FMOV t1, c1 + +.LL310: + sra MIN_M, 2, I + FMOV t1, c2 + FMOV t1, c3 + FMOV t1, c4 + mov A, A1 + FMOV t1, t2 + add A, LDA, A + FMOV t1, t3 + cmp I, 0 + FMOV t1, t4 + ble %icc, .LL315 + mov XP, X1 + + LDF [A1 + 0 * SIZE], a1 + LDF [A1 + 1 * SIZE], a2 + LDF [A1 + 2 * SIZE], a3 + LDF [A1 + 3 * SIZE], a4 + LDF [A1 + 4 * SIZE], a5 + LDF [A1 + 5 * SIZE], a6 + LDF [A1 + 6 * SIZE], a7 + LDF [A1 + 7 * SIZE], a8 + add A1, 8 * SIZE, A1 + + LDF [X1 + 0 * SIZE], c9 + add I, -1, I + LDF [X1 + 1 * SIZE], c10 + cmp I, 0 + LDF [X1 + 2 * SIZE], c11 + LDF [X1 + 3 * SIZE], c12 + LDF [X1 + 4 * SIZE], c13 + LDF [X1 + 5 * SIZE], c14 + LDF [X1 + 6 * SIZE], c15 + LDF [X1 + 7 * SIZE], c16 + ble %icc, .LL312 + add X1, 8 * SIZE, X1 + +.LL311: + prefetch [A1 + PREFETCHSIZE * SIZE], 1 + + FADD c1, t1, c1 + FMUL a1, c9, t1 + FADDX c2, t2, c2 + FMUL a1, c10, t2 + LDF [A1 + 0 * SIZE], a1 + FADD c3, t3, c3 + FMUL a2, c9, t3 + LDF [X1 + 0 * SIZE], c9 + FADD c4, t4, c4 + FMUL a2, c10, t4 + LDF [A1 + 1 * SIZE], a2 + LDF [X1 + 1 * SIZE], c10 + + FADD c1, t1, c1 + FMUL a3, c11, t1 + FADDX c2, t2, c2 + FMUL a3, c12, t2 + LDF [A1 + 2 * SIZE], a3 + FADD c3, t3, c3 + add I, -1, I + FMUL a4, c11, t3 + LDF [X1 + 2 * SIZE], c11 + FADD c4, t4, c4 + cmp I, 0 + FMUL a4, c12, t4 + LDF [A1 + 3 * SIZE], a4 + LDF [X1 + 3 * SIZE], c12 + + FADD c1, t1, c1 + 
FMUL a5, c13, t1 + FADDX c2, t2, c2 + FMUL a5, c14, t2 + LDF [A1 + 4 * SIZE], a5 + FADD c3, t3, c3 + FMUL a6, c13, t3 + LDF [X1 + 4 * SIZE], c13 + FADD c4, t4, c4 + FMUL a6, c14, t4 + LDF [A1 + 5 * SIZE], a6 + LDF [X1 + 5 * SIZE], c14 + + FADD c1, t1, c1 + FMUL a7, c15, t1 + FADDX c2, t2, c2 + FMUL a7, c16, t2 + LDF [A1 + 6 * SIZE], a7 + + FADD c3, t3, c3 + FMUL a8, c15, t3 + LDF [X1 + 6 * SIZE], c15 + FADD c4, t4, c4 + add X1, 8 * SIZE, X1 + FMUL a8, c16, t4 + LDF [A1 + 7 * SIZE], a8 + add A1, 8 * SIZE, A1 + bg,pn %icc, .LL311 + LDF [X1 - 1 * SIZE], c16 + +.LL312: + FADD c1, t1, c1 + FMUL a1, c9, t1 + FADDX c2, t2, c2 + FMUL a1, c10, t2 + FADD c3, t3, c3 + FMUL a2, c9, t3 + FADD c4, t4, c4 + FMUL a2, c10, t4 + + FADD c1, t1, c1 + FMUL a3, c11, t1 + FADDX c2, t2, c2 + FMUL a3, c12, t2 + FADD c3, t3, c3 + FMUL a4, c11, t3 + FADD c4, t4, c4 + FMUL a4, c12, t4 + + FADD c1, t1, c1 + FMUL a5, c13, t1 + FADDX c2, t2, c2 + FMUL a5, c14, t2 + FADD c3, t3, c3 + FMUL a6, c13, t3 + FADD c4, t4, c4 + FMUL a6, c14, t4 + + FADD c1, t1, c1 + FMUL a7, c15, t1 + FADDX c2, t2, c2 + FMUL a7, c16, t2 + FADD c3, t3, c3 + FMUL a8, c15, t3 + FADD c4, t4, c4 + FMUL a8, c16, t4 + +.LL315: + andcc MIN_M, 3, I + LDF ALPHA_R, b3 + mov Y1, Y2 + ble %icc, .LL319 + LDF ALPHA_I, b4 + + LDF [A1 + 0 * SIZE], a1 + add I, -1, I + LDF [A1 + 1 * SIZE], a2 + add A1, 2 * SIZE, A1 + LDF [X1 + 0 * SIZE], b1 + cmp I, 0 + LDF [X1 + 1 * SIZE], b2 + ble %icc, .LL317 + add X1, 2 * SIZE, X1 + +.LL316: + FADD c1, t1, c1 + add I, -1, I + FMUL a1, b1, t1 + FADDX c2, t2, c2 + FMUL a1, b2, t2 + LDF [A1 + 0 * SIZE], a1 + FADD c3, t3, c3 + cmp I, 0 + FMUL a2, b1, t3 + LDF [X1 + 0 * SIZE], b1 + FADD c4, t4, c4 + add X1, 2 * SIZE, X1 + FMUL a2, b2, t4 + LDF [A1 + 1 * SIZE], a2 + add A1, 2 * SIZE, A1 + + bg,pn %icc, .LL316 + LDF [X1 - 1 * SIZE], b2 + +.LL317: + FADD c1, t1, c1 + FMUL a1, b1, t1 + FADDX c2, t2, c2 + FMUL a1, b2, t2 + FADD c3, t3, c3 + FMUL a2, b1, t3 + FADD c4, t4, c4 + FMUL a2, b2, t4 + +.LL319: + FADD c1, t1, c1 + LDF [Y1 + 0 * SIZE], a1 + FADDX c2, t2, c2 + LDF [Y1 + 1 * SIZE] ,a2 + add Y1, INCY, Y1 + FADD c3, t3, c3 + FADD c4, t4, c4 + +#if (!defined(XCONJ) && !defined(CONJ)) || (defined(XCONJ) && defined(CONJ)) + FSUB c1, c4, c1 +#else + FADD c1, c4, c1 +#endif + +#ifndef CONJ + FADD c2, c3, c2 +#else + FSUB c2, c3, c2 +#endif + + FMUL b3, c1, c3 + FMUL b4, c1, c4 + FMUL b4, c2, c1 + FMUL b3, c2, c2 + + FSUB c3, c1, c1 + FADD c2, c4, c2 + FADD a1, c1, a1 + FADD a2, c2, a2 + + STF a1, [Y2 + 0 * SIZE] + STF a2, [Y2 + 1 * SIZE] + +.LL400: + mov P, I + add IS, I, IS + cmp IS, M + bl %icc, .LL10 + add A, PNLDA, A + +.LL999: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/znrm2.S b/kernel/sparc/znrm2.S new file mode 100644 index 0000000000..28e9e074d2 --- /dev/null +++ b/kernel/sparc/znrm2.S @@ -0,0 +1,665 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N %i0 +#define X %i1 +#define INCX %i2 +#define I %i3 +#define XX %i4 + +#ifdef DOUBLE +#define c1 %f0 +#define c2 %f2 +#define c3 %f4 +#define c4 %f6 +#define t1 %f8 +#define t2 %f10 +#define t3 %f12 +#define t4 %f14 + +#define a1 %f16 +#define a2 %f18 +#define a3 %f20 +#define a4 %f22 +#define a5 %f24 +#define a6 %f26 +#define a7 %f28 +#define a8 %f30 +#define fmax %f32 +#define fzero %f34 +#define fone %f36 +#else +#define c1 %f0 +#define c2 %f1 +#define c3 %f2 +#define c4 %f3 +#define t1 %f4 +#define t2 %f5 +#define t3 %f6 +#define t4 %f7 + +#define a1 %f8 +#define a2 %f9 +#define a3 %f10 +#define a4 %f11 +#define a5 %f12 +#define a6 %f13 +#define a7 %f14 +#define a8 %f15 +#define fmax %f16 +#define fzero %f17 +#define fone %f18 +#endif + + PROLOGUE + SAVESP + +#ifdef DOUBLE + FCLR(3) +#else + FCLR(17) +#endif + + mov X, XX + mov 0x3ff, %g1 + sll %g1, 20, %g1 + + cmp N, 0 + ble .LL99 + FMOV fzero, c1 + + cmp INCX, 0 + ble .LL99 + sll INCX, ZBASE_SHIFT, INCX + + add %sp, -8, %sp + st %g1, [%sp + STACK_START + 0] + st %g0, [%sp + STACK_START + 4] + + LDF [%sp + STACK_START], fone + add %sp, 8, %sp + + FMOV fzero, c2 + FMOV fzero, c3 + FMOV fzero, c4 + + cmp INCX, 2 * SIZE + bne .LL100 + nop + + sra N, 2, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + LDF [X + 2 * SIZE], a3 + LDF [X + 3 * SIZE], a4 + + LDF [X + 4 * SIZE], a5 + add I, -1, I + LDF [X + 5 * SIZE], a6 + cmp I, 0 + LDF [X + 6 * SIZE], a7 + LDF [X + 7 * SIZE], a8 + + ble,pt %icc, .LL12 + add X, 8 * SIZE, X + +#define PREFETCHSIZE 40 + +.LL11: + FABS a1, t1 + prefetch [X + PREFETCHSIZE * SIZE], 0 + FABS a2, t2 + LDF [X + 0 * SIZE], a1 + FABS a3, t3 + LDF [X + 1 * SIZE], a2 + FABS a4, t4 + LDF [X + 2 * SIZE], a3 + + FCMP %fcc0, t1, c1 + LDF [X + 3 * SIZE], a4 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FMOVG %fcc0, t1, c1 + FMOVG %fcc1, t2, c2 + FMOVG %fcc2, t3, c3 + FMOVG %fcc3, t4, c4 + + FABS a5, t1 + LDF [X + 4 * SIZE], a5 + FABS a6, t2 + LDF [X + 5 * SIZE], a6 + FABS a7, t3 + LDF [X + 6 * SIZE], a7 
+ FABS a8, t4 + LDF [X + 7 * SIZE], a8 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FMOVG %fcc0, t1, c1 + add I, -1, I + FMOVG %fcc1, t2, c2 + cmp I, 0 + FMOVG %fcc2, t3, c3 + FMOVG %fcc3, t4, c4 + + bg,pt %icc, .LL11 + add X, 8 * SIZE, X + +.LL12: + FABS a1, t1 + FABS a2, t2 + FABS a3, t3 + FABS a4, t4 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FMOVG %fcc0, t1, c1 + FMOVG %fcc1, t2, c2 + FMOVG %fcc2, t3, c3 + FMOVG %fcc3, t4, c4 + + FABS a5, t1 + FABS a6, t2 + FABS a7, t3 + FABS a8, t4 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FMOVG %fcc0, t1, c1 + FMOVG %fcc1, t2, c2 + FMOVG %fcc2, t3, c3 + FMOVG %fcc3, t4, c4 + +.LL15: + and N, 3, I + cmp I, 0 + ble,a,pn %icc, .LL19 + nop + +.LL16: + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + FABS a1, t1 + FABS a2, t2 + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FMOVG %fcc0, t1, c1 + FMOVG %fcc1, t2, c2 + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL16 + add X, 2 * SIZE, X + +.LL19: + FCMP %fcc0, c2, c1 + FCMP %fcc1, c4, c3 + mov XX, X + FMOVG %fcc0, c2, c1 + FMOVG %fcc1, c4, c3 + FCMP %fcc0, c3, c1 + FMOVG %fcc0, c3, c1 + + FCMP c1, fzero + fbe .LL99 + nop + + FMOV c1, fmax + FDIV fone, c1, fone + + FMOV fzero, c1 + FMOV fzero, c2 + FMOV fzero, c3 + FMOV fzero, c4 + + sra N, 2, I + cmp I, 0 + ble,pn %icc, .LL35 + nop + + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + LDF [X + 2 * SIZE], a3 + LDF [X + 3 * SIZE], a4 + + LDF [X + 4 * SIZE], a5 + add I, -1, I + LDF [X + 5 * SIZE], a6 + cmp I, 0 + LDF [X + 6 * SIZE], a7 + LDF [X + 7 * SIZE], a8 + + ble,pt %icc, .LL32 + add X, 8 * SIZE, X + +.LL31: + FMUL fone, a1, t1 + prefetch [X + PREFETCHSIZE * SIZE], 0 + FMUL fone, a2, t2 + LDF [X + 0 * SIZE], a1 + FMUL fone, a3, t3 + LDF [X + 1 * SIZE], a2 + FMUL fone, a4, t4 + LDF [X + 2 * SIZE], a3 + + FMUL t1, t1, t1 + LDF [X + 3 * SIZE], a4 + FMUL t2, t2, t2 + FMUL t3, t3, t3 + FMUL t4, t4, t4 + + FADD c1, t1, c1 + FMUL fone, a5, t1 + LDF [X + 4 * SIZE], a5 + FADD c2, t2, c2 + FMUL fone, a6, t2 + LDF [X + 5 * SIZE], a6 + FADD c3, t3, c3 + FMUL fone, a7, t3 + LDF [X + 6 * SIZE], a7 + FADD c4, t4, c4 + FMUL fone, a8, t4 + LDF [X + 7 * SIZE], a8 + + FMUL t1, t1, t1 + FMUL t2, t2, t2 + FMUL t3, t3, t3 + FMUL t4, t4, t4 + + FADD c1, t1, c1 + add I, -1, I + FADD c2, t2, c2 + cmp I, 0 + FADD c3, t3, c3 + FADD c4, t4, c4 + + bg,pt %icc, .LL31 + add X, 8 * SIZE, X + +.LL32: + FMUL fone, a1, t1 + FMUL fone, a2, t2 + FMUL fone, a3, t3 + FMUL fone, a4, t4 + + FMUL t1, t1, t1 + FMUL t2, t2, t2 + FMUL t3, t3, t3 + FMUL t4, t4, t4 + + FADD c1, t1, c1 + FMUL fone, a5, t1 + FADD c2, t2, c2 + FMUL fone, a6, t2 + FADD c3, t3, c3 + FMUL fone, a7, t3 + FADD c4, t4, c4 + FMUL fone, a8, t4 + + FMUL t1, t1, t1 + FMUL t2, t2, t2 + FMUL t3, t3, t3 + FMUL t4, t4, t4 + + FADD c1, t1, c1 + FADD c2, t2, c2 + FADD c3, t3, c3 + FADD c4, t4, c4 + +.LL35: + and N, 3, I + cmp I, 0 + ble,a,pn %icc, .LL39 + nop + +.LL36: + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + FMUL fone, a1, t1 + FMUL fone, a2, t2 + FMUL t1, t1, t1 + FMUL t2, t2, t2 + FADD c1, t1, c1 + FADD c2, t2, c2 + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL36 + add X, 2 * SIZE, X + +.LL39: + FADD c1, c2, c1 + FADD c3, c4, c3 + FADD c1, c3, c1 + + FSQRT c1, c1 + FMUL fmax, c1, c1 + +.LL99: + return %i7 + 8 + clr %g0 + +.LL100: + sra N, 2, I + cmp I, 0 + ble,pn %icc, .LL105 + nop + + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + add X, INCX, X + LDF [X + 0 * SIZE], a3 + LDF [X + 1 * SIZE], 
a4 + add X, INCX, X + LDF [X + 0 * SIZE], a5 + LDF [X + 1 * SIZE], a6 + add X, INCX, X + add I, -1, I + LDF [X + 0 * SIZE], a7 + cmp I, 0 + LDF [X + 1 * SIZE], a8 + ble,pt %icc, .LL102 + add X, INCX, X + +.LL101: + FABS a1, t1 + LDF [X + 0 * SIZE], a1 + FABS a2, t2 + LDF [X + 1 * SIZE], a2 + add X, INCX, X + FABS a3, t3 + LDF [X + 0 * SIZE], a3 + FABS a4, t4 + LDF [X + 1 * SIZE], a4 + add X, INCX, X + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FMOVG %fcc0, t1, c1 + FMOVG %fcc1, t2, c2 + FMOVG %fcc2, t3, c3 + FMOVG %fcc3, t4, c4 + + FABS a5, t1 + LDF [X + 0 * SIZE], a5 + FABS a6, t2 + LDF [X + 1 * SIZE], a6 + add X, INCX, X + FABS a7, t3 + LDF [X + 0 * SIZE], a7 + FABS a8, t4 + LDF [X + 1 * SIZE], a8 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FMOVG %fcc0, t1, c1 + add I, -1, I + FMOVG %fcc1, t2, c2 + cmp I, 0 + FMOVG %fcc2, t3, c3 + FMOVG %fcc3, t4, c4 + + bg,pt %icc, .LL101 + add X, INCX, X + +.LL102: + FABS a1, t1 + FABS a2, t2 + FABS a3, t3 + FABS a4, t4 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FMOVG %fcc0, t1, c1 + FMOVG %fcc1, t2, c2 + FMOVG %fcc2, t3, c3 + FMOVG %fcc3, t4, c4 + + FABS a5, t1 + FABS a6, t2 + FABS a7, t3 + FABS a8, t4 + + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FCMP %fcc2, t3, c3 + FCMP %fcc3, t4, c4 + + FMOVG %fcc0, t1, c1 + FMOVG %fcc1, t2, c2 + FMOVG %fcc2, t3, c3 + FMOVG %fcc3, t4, c4 + +.LL105: + and N, 3, I + cmp I, 0 + ble,a,pn %icc, .LL109 + nop + +.LL106: + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + FABS a1, t1 + FABS a2, t2 + FCMP %fcc0, t1, c1 + FCMP %fcc1, t2, c2 + FMOVG %fcc0, t1, c1 + FMOVG %fcc1, t2, c2 + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL106 + add X, INCX, X + +.LL109: + FCMP %fcc0, c2, c1 + FCMP %fcc1, c4, c3 + mov XX, X + FMOVG %fcc0, c2, c1 + FMOVG %fcc1, c4, c3 + FCMP %fcc0, c3, c1 + FMOVG %fcc0, c3, c1 + + FCMP c1, fzero + fbe .LL99 + nop + + FMOV c1, fmax + FDIV fone, c1, fone + + FMOV fzero, c1 + FMOV fzero, c2 + FMOV fzero, c3 + FMOV fzero, c4 + + sra N, 2, I + cmp I, 0 + ble,pn %icc, .LL135 + nop + + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + add X, INCX, X + LDF [X + 0 * SIZE], a3 + LDF [X + 1 * SIZE], a4 + add X, INCX, X + + LDF [X + 0 * SIZE], a5 + add I, -1, I + LDF [X + 1 * SIZE], a6 + add X, INCX, X + cmp I, 0 + LDF [X + 0 * SIZE], a7 + LDF [X + 1 * SIZE], a8 + + ble,pt %icc, .LL132 + add X, INCX, X + +.LL131: + FMUL fone, a1, t1 + prefetch [X + PREFETCHSIZE * SIZE], 0 + FMUL fone, a2, t2 + LDF [X + 0 * SIZE], a1 + FMUL fone, a3, t3 + LDF [X + 1 * SIZE], a2 + add X, INCX, X + FMUL fone, a4, t4 + LDF [X + 0 * SIZE], a3 + + FMUL t1, t1, t1 + LDF [X + 1 * SIZE], a4 + add X, INCX, X + FMUL t2, t2, t2 + FMUL t3, t3, t3 + FMUL t4, t4, t4 + + FADD c1, t1, c1 + FMUL fone, a5, t1 + LDF [X + 0 * SIZE], a5 + FADD c2, t2, c2 + FMUL fone, a6, t2 + LDF [X + 1 * SIZE], a6 + add X, INCX, X + FADD c3, t3, c3 + FMUL fone, a7, t3 + LDF [X + 0 * SIZE], a7 + FADD c4, t4, c4 + FMUL fone, a8, t4 + LDF [X + 1 * SIZE], a8 + + FMUL t1, t1, t1 + FMUL t2, t2, t2 + FMUL t3, t3, t3 + FMUL t4, t4, t4 + + FADD c1, t1, c1 + add I, -1, I + FADD c2, t2, c2 + cmp I, 0 + FADD c3, t3, c3 + FADD c4, t4, c4 + + bg,pt %icc, .LL131 + add X, INCX, X + +.LL132: + FMUL fone, a1, t1 + FMUL fone, a2, t2 + FMUL fone, a3, t3 + FMUL fone, a4, t4 + + FMUL t1, t1, t1 + FMUL t2, t2, t2 + FMUL t3, t3, t3 + FMUL t4, t4, t4 + + FADD c1, t1, c1 + FMUL fone, a5, t1 + FADD c2, t2, c2 + FMUL fone, a6, t2 + FADD c3, t3, c3 + FMUL fone, 
a7, t3 + FADD c4, t4, c4 + FMUL fone, a8, t4 + + FMUL t1, t1, t1 + FMUL t2, t2, t2 + FMUL t3, t3, t3 + FMUL t4, t4, t4 + + FADD c1, t1, c1 + FADD c2, t2, c2 + FADD c3, t3, c3 + FADD c4, t4, c4 + +.LL135: + and N, 3, I + cmp I, 0 + ble,a,pn %icc, .LL139 + nop + +.LL136: + LDF [X + 0 * SIZE], a1 + LDF [X + 1 * SIZE], a2 + FMUL fone, a1, t1 + FMUL fone, a2, t2 + FMUL t1, t1, t1 + FMUL t2, t2, t2 + FADD c1, t1, c1 + FADD c2, t2, c2 + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL136 + add X, INCX, X + +.LL139: + FADD c1, c2, c1 + FADD c3, c4, c3 + FADD c1, c3, c1 + + FSQRT c1, c1 + FMUL fmax, c1, c1 + + return %i7 + 8 + clr %g0 + + EPILOGUE diff --git a/kernel/sparc/zrot.S b/kernel/sparc/zrot.S new file mode 100644 index 0000000000..ec274ca16b --- /dev/null +++ b/kernel/sparc/zrot.S @@ -0,0 +1,673 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N %i0 +#define X %i1 +#define INCX %i2 +#define Y %i3 +#define INCY %i4 +#define I %i5 + +#define XX %l0 +#define YY %l1 + +#ifdef DOUBLE +#define a1 %f4 +#define a2 %f6 +#define a3 %f8 +#define a4 %f10 +#define a5 %f12 +#define a6 %f14 +#define a7 %f16 +#define a8 %f18 +#define b1 %f20 +#define b2 %f22 +#define b3 %f24 +#define b4 %f26 +#define b5 %f28 +#define b6 %f30 +#define b7 %f32 +#define b8 %f34 + +#define c1 %f36 +#define c2 %f38 +#define c3 %f40 +#define c4 %f42 +#define c5 %f44 +#define c6 %f46 +#define c7 %f48 +#define c8 %f50 + +#define t1 %f52 +#define t2 %f54 +#define t3 %f56 +#define t4 %f58 +#else +#define a1 %f2 +#define a2 %f3 +#define a3 %f4 +#define a4 %f5 +#define a5 %f6 +#define a6 %f7 +#define a7 %f8 +#define a8 %f9 +#define b1 %f10 +#define b2 %f11 +#define b3 %f12 +#define b4 %f13 +#define b5 %f14 +#define b6 %f15 +#define b7 %f16 +#define b8 %f17 + +#define c1 %f18 +#define c2 %f19 +#define c3 %f20 +#define c4 %f21 +#define c5 %f22 +#define c6 %f23 +#define c7 %f24 +#define c8 %f25 + +#define t1 %f26 +#define t2 %f27 +#define t3 %f28 +#define t4 %f29 +#endif + +#ifdef DOUBLE +#define C %f0 +#define S %f2 +#else +#define C %f0 +#define S %f1 +#endif + + PROLOGUE + SAVESP + +#ifndef __64BIT__ +#ifdef DOUBLE + st %i5, [%sp + STACK_START + 24] + + LDF [%sp + STACK_START + 24], C + LDF [%sp + STACK_START + 32], S +#else + st %i5, [%sp + STACK_START + 24] + + LDF [%sp + STACK_START + 24], C + LDF [%sp + STACK_START + 28], S +#endif +#else +#ifdef DOUBLE + FMOV %f10, C + FMOV %f12, S +#else + FMOV %f11, C + FMOV %f13, S +#endif +#endif + + cmp N, 0 + ble .LL19 + nop + + sll INCX, ZBASE_SHIFT, INCX + sll INCY, ZBASE_SHIFT, INCY + + cmp INCX, 2 * SIZE + bne .LL50 + nop + + cmp INCY, 2 * SIZE + bne .LL50 + nop + + sra N, 2, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + + LDF [X + 0 * SIZE], a1 + LDF [Y + 0 * SIZE], b1 + LDF [X + 1 * SIZE], a2 + LDF [Y + 1 * SIZE], b2 + LDF [X + 2 * SIZE], a3 + LDF [Y + 2 * SIZE], b3 + LDF [X + 3 * SIZE], a4 + LDF [Y + 3 * SIZE], b4 + + LDF [X + 4 * SIZE], a5 + LDF [Y + 4 * SIZE], b5 + LDF [X + 5 * SIZE], a6 + LDF [Y + 5 * SIZE], b6 + LDF [X + 6 * SIZE], a7 + LDF [Y + 6 * SIZE], b7 + LDF [X + 7 * SIZE], a8 + LDF [Y + 7 * SIZE], b8 + + FMUL C, a1, c1 + FMUL S, b1, c2 + FMUL C, b1, c3 + LDF [Y + 8 * SIZE], b1 + FMUL S, a1, c4 + LDF [X + 8 * SIZE], a1 + + FMUL C, a2, c5 + FMUL S, b2, c6 + FADD c1, c2, t1 + + FMUL C, b2, c7 + LDF [Y + 9 * SIZE], b2 + FMUL S, a2, c8 + LDF [X + 9 * SIZE], a2 + FSUB c3, c4, t2 + + addcc I, -1, I + ble,pt %icc, .LL12 + nop + +#define PREFETCHSIZE 64 + +.LL11: + FMUL C, a3, c1 + nop + prefetch [Y + PREFETCHSIZE * SIZE], 1 + nop + + FMUL S, b3, c2 + STF t1, [X + 0 * SIZE] + FADD c5, c6, t3 + nop + + FMUL C, b3, c3 + LDF [Y + 10 * SIZE], b3 + nop + nop + + FMUL S, a3, c4 + STF t2, [Y + 0 * SIZE] + FSUB c7, c8, t4 + nop + + FMUL C, a4, c5 + LDF [X + 10 * SIZE], a3 + nop + nop + + FMUL S, b4, c6 + STF t3, [X + 1 * SIZE] + FADD c1, c2, t1 + nop + + FMUL C, b4, c7 + LDF [Y + 11 * SIZE], b4 + nop + nop + + FMUL S, a4, c8 + STF t4, [Y + 1 * SIZE] + FSUB c3, c4, t2 + nop + + FMUL C, a5, c1 + LDF [X + 11 * SIZE], a4 + nop + nop + + FMUL S, b5, c2 + STF t1, [X + 2 * SIZE] + FADD c5, c6, t3 + nop + + FMUL C, b5, c3 + LDF [Y + 12 * SIZE], b5 + nop + nop + + FMUL S, a5, c4 + STF t2, [Y + 2 * SIZE] + FSUB c7, c8, t4 + nop + + FMUL C, a6, c5 + LDF [X + 12 * SIZE], a5 + nop + nop + + FMUL S, b6, c6 + STF t3, 
[X + 3 * SIZE] + FADD c1, c2, t1 + nop + + FMUL C, b6, c7 + LDF [Y + 13 * SIZE], b6 + nop + nop + + FMUL S, a6, c8 + STF t4, [Y + 3 * SIZE] + FSUB c3, c4, t2 + nop + + FMUL C, a7, c1 + LDF [X + 13 * SIZE], a6 + nop + nop + + FMUL S, b7, c2 + STF t1, [X + 4 * SIZE] + FADD c5, c6, t3 + nop + + FMUL C, b7, c3 + LDF [Y + 14 * SIZE], b7 + nop + nop + + FMUL S, a7, c4 + STF t2, [Y + 4 * SIZE] + FSUB c7, c8, t4 + nop + + FMUL C, a8, c5 + LDF [X + 14 * SIZE], a7 + nop + nop + + FMUL S, b8, c6 + STF t3, [X + 5 * SIZE] + FADD c1, c2, t1 + nop + + FMUL C, b8, c7 + LDF [Y + 15 * SIZE], b8 + nop + nop + + FMUL S, a8, c8 + STF t4, [Y + 5 * SIZE] + FSUB c3, c4, t2 + nop + + FMUL C, a1, c1 + LDF [X + 15 * SIZE], a8 + addcc I, -1, I + nop + + FMUL S, b1, c2 + STF t1, [X + 6 * SIZE] + FADD c5, c6, t3 + nop + + FMUL C, b1, c3 + LDF [Y + 16 * SIZE], b1 + nop + nop + + FMUL S, a1, c4 + STF t2, [Y + 6 * SIZE] + FSUB c7, c8, t4 + nop + + FMUL C, a2, c5 + LDF [X + 16 * SIZE], a1 + add Y, 8 * SIZE, Y + nop + + FMUL S, b2, c6 + STF t3, [X + 7 * SIZE] + FADD c1, c2, t1 + nop + + FMUL C, b2, c7 + LDF [Y + 9 * SIZE], b2 + add X, 8 * SIZE, X + nop + + FMUL S, a2, c8 + STF t4, [Y - 1 * SIZE] + FSUB c3, c4, t2 + nop + + bg,pt %icc, .LL11 + LDF [X + 9 * SIZE], a2 + + +.LL12: + FMUL C, a3, c1 + FMUL S, b3, c2 + STF t1, [X + 0 * SIZE] + FADD c5, c6, t3 + + FMUL C, b3, c3 + FMUL S, a3, c4 + STF t2, [Y + 0 * SIZE] + FSUB c7, c8, t4 + + + FMUL C, a4, c5 + FMUL S, b4, c6 + STF t3, [X + 1 * SIZE] + FADD c1, c2, t1 + + FMUL C, b4, c7 + FMUL S, a4, c8 + STF t4, [Y + 1 * SIZE] + FSUB c3, c4, t2 + + + FMUL C, a5, c1 + FMUL S, b5, c2 + STF t1, [X + 2 * SIZE] + FADD c5, c6, t3 + + FMUL C, b5, c3 + FMUL S, a5, c4 + STF t2, [Y + 2 * SIZE] + FSUB c7, c8, t4 + + FMUL C, a6, c5 + FMUL S, b6, c6 + STF t3, [X + 3 * SIZE] + FADD c1, c2, t1 + + FMUL C, b6, c7 + FMUL S, a6, c8 + STF t4, [Y + 3 * SIZE] + FSUB c3, c4, t2 + + FMUL C, a7, c1 + FMUL S, b7, c2 + STF t1, [X + 4 * SIZE] + FADD c5, c6, t3 + + FMUL C, b7, c3 + FMUL S, a7, c4 + STF t2, [Y + 4 * SIZE] + FSUB c7, c8, t4 + + FMUL C, a8, c5 + FMUL S, b8, c6 + STF t3, [X + 5 * SIZE] + FADD c1, c2, t1 + + FMUL C, b8, c7 + FMUL S, a8, c8 + STF t4, [Y + 5 * SIZE] + FSUB c3, c4, t2 + + FADD c5, c6, t3 + STF t1, [X + 6 * SIZE] + + FSUB c7, c8, t4 + STF t2, [Y + 6 * SIZE] + + STF t3, [X + 7 * SIZE] + STF t4, [Y + 7 * SIZE] + + add X, 8 * SIZE, X + add Y, 8 * SIZE, Y + + +.LL15: + andcc N, 3, I + nop + ble,a,pn %icc, .LL19 + nop + +.LL16: + LDF [X + 0 * SIZE], a1 + LDF [Y + 0 * SIZE], b1 + LDF [X + 1 * SIZE], a2 + LDF [Y + 1 * SIZE], b2 + + FMUL C, a1, c1 + add X, 2 * SIZE, X + FMUL S, b1, c2 + add Y, 2 * SIZE, Y + + FMUL C, b1, c3 + addcc I, -1, I + FMUL S, a1, c4 + nop + + FMUL C, a2, c5 + FMUL S, b2, c6 + FADD c1, c2, c2 + + FMUL C, b2, c7 + FMUL S, a2, c8 + FSUB c3, c4, c4 + + STF c2, [X - 2 * SIZE] + FADD c5, c6, c6 + STF c4, [Y - 2 * SIZE] + FSUB c7, c8, c8 + + STF c6, [X - 1 * SIZE] + bg,pt %icc, .LL16 + STF c8, [Y - 1 * SIZE] + +.LL19: + return %i7 + 8 + nop + +.LL50: + mov X, XX + mov Y, YY + + sra N, 2, I + cmp I, 0 + ble,pn %icc, .LL55 + nop + +.LL51: + LDF [X + 0 * SIZE], a1 + LDF [Y + 0 * SIZE], b1 + LDF [X + 1 * SIZE], a2 + LDF [Y + 1 * SIZE], b2 + + FMUL C, a1, c1 + FMUL S, b1, c2 + FMUL C, b1, c3 + FMUL S, a1, c4 + + FMUL C, a2, c5 + nop + FMUL S, b2, c6 + FADD c1, c2, c2 + + FMUL C, b2, c7 + nop + FMUL S, a2, c8 + FSUB c3, c4, c4 + + STF c2, [X + 0 * SIZE] + FADD c5, c6, c6 + STF c4, [Y + 0 * SIZE] + FSUB c7, c8, c8 + + STF c6, [X + 1 * SIZE] + add X, INCX, X + STF c8, [Y + 1 * SIZE] 
+ add Y, INCY, Y + + LDF [X + 0 * SIZE], a1 + LDF [Y + 0 * SIZE], b1 + LDF [X + 1 * SIZE], a2 + LDF [Y + 1 * SIZE], b2 + + FMUL C, a1, c1 + FMUL S, b1, c2 + FMUL C, b1, c3 + FMUL S, a1, c4 + + FMUL C, a2, c5 + nop + FMUL S, b2, c6 + FADD c1, c2, c2 + + FMUL C, b2, c7 + nop + FMUL S, a2, c8 + FSUB c3, c4, c4 + + STF c2, [X + 0 * SIZE] + FADD c5, c6, c6 + STF c4, [Y + 0 * SIZE] + FSUB c7, c8, c8 + + STF c6, [X + 1 * SIZE] + add X, INCX, X + STF c8, [Y + 1 * SIZE] + add Y, INCY, Y + + LDF [X + 0 * SIZE], a1 + LDF [Y + 0 * SIZE], b1 + LDF [X + 1 * SIZE], a2 + LDF [Y + 1 * SIZE], b2 + + FMUL C, a1, c1 + FMUL S, b1, c2 + FMUL C, b1, c3 + FMUL S, a1, c4 + + FMUL C, a2, c5 + nop + FMUL S, b2, c6 + FADD c1, c2, c2 + + FMUL C, b2, c7 + nop + FMUL S, a2, c8 + FSUB c3, c4, c4 + + STF c2, [X + 0 * SIZE] + FADD c5, c6, c6 + STF c4, [Y + 0 * SIZE] + FSUB c7, c8, c8 + + STF c6, [X + 1 * SIZE] + add X, INCX, X + STF c8, [Y + 1 * SIZE] + add Y, INCY, Y + + LDF [X + 0 * SIZE], a1 + LDF [Y + 0 * SIZE], b1 + LDF [X + 1 * SIZE], a2 + LDF [Y + 1 * SIZE], b2 + + FMUL C, a1, c1 + FMUL S, b1, c2 + FMUL C, b1, c3 + FMUL S, a1, c4 + + FMUL C, a2, c5 + nop + FMUL S, b2, c6 + FADD c1, c2, c2 + + FMUL C, b2, c7 + nop + FMUL S, a2, c8 + FSUB c3, c4, c4 + + STF c2, [X + 0 * SIZE] + FADD c5, c6, c6 + STF c4, [Y + 0 * SIZE] + FSUB c7, c8, c8 + + STF c6, [X + 1 * SIZE] + add X, INCX, X + STF c8, [Y + 1 * SIZE] + add Y, INCY, Y + + addcc I, -1, I + bg,pt %icc, .LL51 + nop + + +.LL55: + andcc N, 3, I + nop + ble %icc, .LL59 + nop + +.LL56: + LDF [X + 0 * SIZE], a1 + LDF [Y + 0 * SIZE], b1 + LDF [X + 1 * SIZE], a2 + LDF [Y + 1 * SIZE], b2 + + FMUL C, a1, c1 + FMUL S, b1, c2 + FMUL C, b1, c3 + FMUL S, a1, c4 + + FMUL C, a2, c5 + addcc I, -1, I + FMUL S, b2, c6 + FADD c1, c2, c2 + + FMUL C, b2, c7 + nop + FMUL S, a2, c8 + FSUB c3, c4, c4 + + STF c2, [X + 0 * SIZE] + FADD c5, c6, c6 + STF c4, [Y + 0 * SIZE] + FSUB c7, c8, c8 + + STF c6, [X + 1 * SIZE] + add X, INCX, X + STF c8, [Y + 1 * SIZE] + + bg %icc, .LL56 + add Y, INCY, Y + + +.LL59: + return %i7 + 8 + nop + + EPILOGUE diff --git a/kernel/sparc/zscal.S b/kernel/sparc/zscal.S new file mode 100644 index 0000000000..5c6ade382b --- /dev/null +++ b/kernel/sparc/zscal.S @@ -0,0 +1,518 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N %i0 + +#if defined(DOUBLE) && !defined(__64BIT__) +#define X %i3 +#define INCX %i4 +#else +#define X %i5 +#define INCX %i3 +#endif + +#define I %i1 +#define XX %i2 + +#ifdef DOUBLE +#define c1 %f0 +#define c2 %f2 +#define c3 %f4 +#define c4 %f6 +#define c5 %f8 +#define c6 %f10 +#define c7 %f12 +#define c8 %f14 + +#define t1 %f16 +#define t2 %f18 +#define t3 %f20 +#define t4 %f22 +#define t5 %f24 +#define t6 %f26 +#define t7 %f28 +#define t8 %f30 + +#define c9 %f32 +#define c10 %f34 +#define c11 %f36 +#define c12 %f38 +#define c13 %f40 +#define c14 %f42 +#define c15 %f44 +#define c16 %f46 + +#define s1 %f32 +#define s2 %f34 +#define s3 %f36 +#define s4 %f38 +#define s5 %f40 +#define s6 %f42 +#define s7 %f44 +#define s8 %f46 + +#define FZERO %f48 +#define ALPHA_R %f50 +#define ALPHA_I %f52 +#else +#define c1 %f0 +#define c2 %f1 +#define c3 %f2 +#define c4 %f3 +#define c5 %f4 +#define c6 %f5 +#define c7 %f6 +#define c8 %f7 + +#define c9 %f8 +#define c10 %f9 +#define c11 %f10 +#define c12 %f11 +#define c13 %f12 +#define c14 %f13 +#define c15 %f14 +#define c16 %f15 + +#define s1 %f8 +#define s2 %f9 +#define s3 %f10 +#define s4 %f11 +#define s5 %f12 +#define s6 %f13 +#define s7 %f14 +#define s8 %f15 + +#define t1 %f16 +#define t2 %f17 +#define t3 %f18 +#define t4 %f19 +#define t5 %f20 +#define t6 %f21 +#define t7 %f22 +#define t8 %f23 + +#define FZERO %f24 +#define ALPHA_R %f25 +#define ALPHA_I %f26 +#endif + +#define PREFETCHSIZE 128 + + PROLOGUE + SAVESP + +#ifndef __64BIT__ +#ifdef DOUBLE + st %i3, [%sp + STACK_START + 16] + st %i4, [%sp + STACK_START + 20] + st %i5, [%sp + STACK_START + 24] + + ld [%sp+ STACK_START + 32], X + ld [%sp+ STACK_START + 36], INCX +#else + st %i3, [%sp + STACK_START + 16] + st %i4, [%sp + STACK_START + 24] + ld [%sp+ STACK_START + 28], INCX +#endif + LDF [%sp + STACK_START + 16], ALPHA_R + LDF [%sp + STACK_START + 24], ALPHA_I +#else + ldx [%sp + STACK_START + 56], INCX +#ifdef DOUBLE + FMOV %f6, ALPHA_R + FMOV %f8, ALPHA_I +#else + FMOV %f7, ALPHA_R + FMOV %f9, ALPHA_I +#endif +#endif + +#ifdef DOUBLE + FCLR(17) +#else + FCLR(24) +#endif + + FCMP ALPHA_R, FZERO + fbne .LL100 + sll INCX, ZBASE_SHIFT, INCX + + FCMP ALPHA_I, FZERO + fbne .LL100 + nop + cmp INCX, 2 * SIZE + bne .LL50 + nop + sra N, 2, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + +.LL11: + prefetch [X + PREFETCHSIZE * SIZE], 0 + + STF FZERO, [X + 0 * SIZE] + add I, -1, I + STF FZERO, [X + 1 * SIZE] + cmp I, 0 + STF FZERO, [X + 2 * SIZE] + STF FZERO, [X + 3 * SIZE] + STF FZERO, [X + 4 * SIZE] + STF FZERO, [X + 5 * SIZE] + add X, 8 * SIZE, X 
+ STF FZERO, [X - 2 * SIZE] + bg,pt %icc, .LL11 + STF FZERO, [X - 1 * SIZE] + +.LL15: + and N, 3, I + cmp I, 0 + ble,a,pn %icc, .LL19 + nop + +.LL16: + STF FZERO, [X + 0 * SIZE] + STF FZERO, [X + 1 * SIZE] + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL16 + add X, 2 * SIZE, X + +.LL19: + return %i7 + 8 + clr %o0 + +.LL50: + sra N, 2, I + cmp I, 0 + ble,pn %icc, .LL55 + nop + +.LL51: + STF FZERO, [X + 0 * SIZE] + add I, -1, I + STF FZERO, [X + 1 * SIZE] + add X, INCX, X + STF FZERO, [X + 0 * SIZE] + cmp I, 0 + STF FZERO, [X + 1 * SIZE] + add X, INCX, X + STF FZERO, [X + 0 * SIZE] + STF FZERO, [X + 1 * SIZE] + add X, INCX, X + STF FZERO, [X + 0 * SIZE] + STF FZERO, [X + 1 * SIZE] + bg,pt %icc, .LL51 + add X, INCX, X + +.LL55: + and N, 3, I + cmp I, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + STF FZERO, [X + 0 * SIZE] + add I, -1, I + STF FZERO, [X + 1 * SIZE] + cmp I, 0 + bg,pt %icc, .LL56 + add X, INCX, X + +.LL59: + return %i7 + 8 + clr %o0 + +.LL100: + cmp INCX, 2 * SIZE + bne .LL150 + sra N, 2, I + + cmp I, 0 + ble,pn %icc, .LL115 + nop + + LDF [X + 0 * SIZE], c1 + LDF [X + 1 * SIZE], c2 + LDF [X + 2 * SIZE], c3 + LDF [X + 3 * SIZE], c4 + LDF [X + 4 * SIZE], c5 + LDF [X + 5 * SIZE], c6 + LDF [X + 6 * SIZE], c7 + LDF [X + 7 * SIZE], c8 + + FMUL ALPHA_R, c1, t1 + FMUL ALPHA_I, c2, t3 + + FMUL ALPHA_I, c1, t2 + LDF [X + 8 * SIZE], c1 + FMUL ALPHA_R, c2, t4 + LDF [X + 9 * SIZE], c2 + + FMUL ALPHA_R, c3, t5 + deccc I + FMUL ALPHA_I, c4, t7 + FSUB t1, t3, s1 + + FMUL ALPHA_I, c3, t6 + LDF [X + 10 * SIZE], c3 + FMUL ALPHA_R, c4, t8 + LDF [X + 11 * SIZE], c4 + FADD t4, t2, s2 + + ble,pn %icc, .LL112 + nop + +.LL111: + prefetch [X + PREFETCHSIZE * SIZE], 0 + + FMUL ALPHA_R, c5, t1 + FMUL ALPHA_I, c6, t3 + FSUB t5, t7, s3 + STF s1, [X + 0 * SIZE] + + FMUL ALPHA_I, c5, t2 + LDF [X + 12 * SIZE], c5 + FMUL ALPHA_R, c6, t4 + LDF [X + 13 * SIZE], c6 + + FADD t8, t6, s4 + STF s2, [X + 1 * SIZE] + + FMUL ALPHA_R, c7, t5 + FMUL ALPHA_I, c8, t7 + FSUB t1, t3, s5 + STF s3, [X + 2 * SIZE] + + FMUL ALPHA_I, c7, t6 + LDF [X + 14 * SIZE], c7 + FMUL ALPHA_R, c8, t8 + LDF [X + 15 * SIZE], c8 + + FADD t4, t2, s6 + STF s4, [X + 3 * SIZE] + + FMUL ALPHA_R, c1, t1 + FMUL ALPHA_I, c2, t3 + FSUB t5, t7, s7 + STF s5, [X + 4 * SIZE] + + FMUL ALPHA_I, c1, t2 + LDF [X + 16 * SIZE], c1 + FMUL ALPHA_R, c2, t4 + LDF [X + 17 * SIZE], c2 + + FADD t8, t6, s8 + STF s6, [X + 5 * SIZE] + + FMUL ALPHA_R, c3, t5 + deccc I + FMUL ALPHA_I, c4, t7 + FSUB t1, t3, s1 + STF s7, [X + 6 * SIZE] + + FMUL ALPHA_I, c3, t6 + LDF [X + 18 * SIZE], c3 + FMUL ALPHA_R, c4, t8 + LDF [X + 19 * SIZE], c4 + + FADD t4, t2, s2 + STF s8, [X + 7 * SIZE] + + bg,pt %icc, .LL111 + add X, 8 * SIZE, X + + +.LL112: + FMUL ALPHA_R, c5, t1 + FMUL ALPHA_I, c6, t3 + FSUB t5, t7, s3 + STF s1, [X + 0 * SIZE] + + FMUL ALPHA_I, c5, t2 + FMUL ALPHA_R, c6, t4 + FADD t8, t6, s4 + STF s2, [X + 1 * SIZE] + + FMUL ALPHA_R, c7, t5 + FMUL ALPHA_I, c8, t7 + FSUB t1, t3, s5 + STF s3, [X + 2 * SIZE] + + FMUL ALPHA_I, c7, t6 + FMUL ALPHA_R, c8, t8 + FADD t4, t2, s6 + STF s4, [X + 3 * SIZE] + + FSUB t5, t7, s7 + FADD t8, t6, s8 + + STF s5, [X + 4 * SIZE] + STF s6, [X + 5 * SIZE] + STF s7, [X + 6 * SIZE] + STF s8, [X + 7 * SIZE] + add X, 8 * SIZE, X + +.LL115: + and N, 3, I + cmp I, 0 + ble,a,pn %icc, .LL119 + nop + +.LL116: + LDF [X + 0 * SIZE], c1 + LDF [X + 1 * SIZE], c2 + + FMUL ALPHA_R, c1, c3 + FMUL ALPHA_I, c1, c4 + FMUL ALPHA_I, c2, c1 + FMUL ALPHA_R, c2, c2 + + FSUB c3, c1, c1 + FADD c2, c4, c2 + + STF c1, [X + 0 * SIZE] + STF c2, [X + 1 * SIZE] + + add I, -1, I + cmp I, 0 + bg,pt 
%icc, .LL116 + add X, 2 * SIZE, X + +.LL119: + return %i7 + 8 + clr %o0 + +.LL150: + sra N, 2, I + cmp I, 0 + ble,pn %icc, .LL155 + mov X, XX + +.LL151: + LDF [X + 0 * SIZE], c1 + LDF [X + 1 * SIZE], c2 + add X, INCX, X + LDF [X + 0 * SIZE], c3 + FMUL ALPHA_R, c1, c9 + LDF [X + 1 * SIZE], c4 + FMUL ALPHA_I, c1, c10 + add X, INCX, X + LDF [X + 0 * SIZE], c5 + FMUL ALPHA_I, c2, c1 + LDF [X + 1 * SIZE], c6 + FMUL ALPHA_R, c2, c2 + add X, INCX, X + LDF [X + 0 * SIZE], c7 + FMUL ALPHA_R, c3, c11 + LDF [X + 1 * SIZE], c8 + FMUL ALPHA_I, c3, c12 + add X, INCX, X + + FMUL ALPHA_I, c4, c3 + FMUL ALPHA_R, c4, c4 + + FMUL ALPHA_R, c5, c13 + FMUL ALPHA_I, c5, c14 + FMUL ALPHA_I, c6, c5 + FMUL ALPHA_R, c6, c6 + + FMUL ALPHA_R, c7, c15 + FSUB c9, c1, c1 + FMUL ALPHA_I, c7, c16 + FADD c2, c10, c2 + FMUL ALPHA_I, c8, c7 + FSUB c11, c3, c3 + FMUL ALPHA_R, c8, c8 + FADD c4, c12, c4 + + STF c1, [XX + 0 * SIZE] + FSUB c13, c5, c5 + add I, -1, I + STF c2, [XX + 1 * SIZE] + FADD c6, c14, c6 + add XX, INCX, XX + STF c3, [XX + 0 * SIZE] + FSUB c15, c7, c7 + cmp I, 0 + STF c4, [XX + 1 * SIZE] + FADD c8, c16, c8 + add XX, INCX, XX + STF c5, [XX + 0 * SIZE] + STF c6, [XX + 1 * SIZE] + add XX, INCX, XX + STF c7, [XX + 0 * SIZE] + STF c8, [XX + 1 * SIZE] + bg,pt %icc, .LL151 + add XX, INCX, XX + +.LL155: + and N, 3, I + cmp I, 0 + ble,a,pn %icc, .LL159 + nop + +.LL156: + LDF [X + 0 * SIZE], c1 + LDF [X + 1 * SIZE], c2 + + FMUL ALPHA_R, c1, c3 + FMUL ALPHA_I, c1, c4 + FMUL ALPHA_I, c2, c1 + FMUL ALPHA_R, c2, c2 + + FSUB c3, c1, c1 + FADD c2, c4, c2 + + STF c1, [X + 0 * SIZE] + STF c2, [X + 1 * SIZE] + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL156 + add X, INCX, X + +.LL159: + return %i7 + 8 + clr %o0 + + + EPILOGUE diff --git a/kernel/sparc/zswap.S b/kernel/sparc/zswap.S new file mode 100644 index 0000000000..88ed22169e --- /dev/null +++ b/kernel/sparc/zswap.S @@ -0,0 +1,342 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if defined(DOUBLE) && !defined(__64BIT__) +#define N %i0 +#define X %i1 +#define INCX %i2 +#define Y %i3 +#define INCY %i4 +#define I %i5 +#else +#define N %i0 +#define X %i5 +#define INCX %i1 +#define Y %i2 +#define INCY %i3 +#define I %i4 +#endif + +#define XX %l0 +#define YY %l1 + +#ifdef DOUBLE +#define a1 %f0 +#define a2 %f2 +#define a3 %f4 +#define a4 %f6 +#define a5 %f8 +#define a6 %f10 +#define a7 %f12 +#define a8 %f14 +#define b1 %f16 +#define b2 %f18 +#define b3 %f20 +#define b4 %f22 +#define b5 %f24 +#define b6 %f26 +#define b7 %f28 +#define b8 %f30 +#else +#define a1 %f0 +#define a2 %f1 +#define a3 %f2 +#define a4 %f3 +#define a5 %f4 +#define a6 %f5 +#define a7 %f6 +#define a8 %f7 +#define b1 %f8 +#define b2 %f9 +#define b3 %f10 +#define b4 %f11 +#define b5 %f12 +#define b6 %f13 +#define b7 %f14 +#define b8 %f15 +#endif + +#ifdef DOUBLE +#define PREFETCHSIZE 128 +#else +#define PREFETCHSIZE 256 +#endif + + PROLOGUE + SAVESP + +#ifndef __64BIT__ +#ifdef DOUBLE + ld [%sp + STACK_START + 32], X + ld [%sp + STACK_START + 36], INCX + ld [%sp + STACK_START + 40], Y + ld [%sp + STACK_START + 44], INCY +#else + ld [%sp + STACK_START + 28], INCX + ld [%sp + STACK_START + 32], Y + ld [%sp + STACK_START + 36], INCY +#endif +#else + ldx [%sp + STACK_START + 56], INCX + ldx [%sp + STACK_START + 64], Y + ldx [%sp + STACK_START + 72], INCY +#endif + + sll INCX, ZBASE_SHIFT, INCX + sll INCY, ZBASE_SHIFT, INCY + + cmp INCX, 2 * SIZE + bne .LL50 + nop + + cmp INCY, 2 * SIZE + bne .LL50 + nop + + sra N, 2, I + cmp I, 0 + ble,pn %icc, .LL15 + nop + + LDF [X + 0 * SIZE], a1 + LDF [Y + 0 * SIZE], b1 + LDF [X + 1 * SIZE], a2 + LDF [Y + 1 * SIZE], b2 + LDF [X + 2 * SIZE], a3 + LDF [Y + 2 * SIZE], b3 + LDF [X + 3 * SIZE], a4 + LDF [Y + 3 * SIZE], b4 + LDF [X + 4 * SIZE], a5 + LDF [Y + 4 * SIZE], b5 + LDF [X + 5 * SIZE], a6 + LDF [Y + 5 * SIZE], b6 + LDF [X + 6 * SIZE], a7 + LDF [Y + 6 * SIZE], b7 + LDF [X + 7 * SIZE], a8 + LDF [Y + 7 * SIZE], b8 + + deccc I + ble,pn %icc, .LL12 + nop + +.LL11: + prefetch [X + PREFETCHSIZE * SIZE], 0 + deccc I + + STF a1, [Y + 0 * SIZE] + LDF [X + 8 * SIZE], a1 + STF b1, [X + 0 * SIZE] + LDF [Y + 8 * SIZE], b1 + + STF a2, [Y + 1 * SIZE] + LDF [X + 9 * SIZE], a2 + STF b2, [X + 1 * SIZE] + LDF [Y + 9 * SIZE], b2 + + STF a3, [Y + 2 * SIZE] + LDF [X + 10 * SIZE], a3 + STF b3, [X + 2 * SIZE] + LDF [Y + 10 * SIZE], b3 + + STF a4, [Y + 3 * SIZE] + LDF [X + 11 * SIZE], a4 + STF b4, [X + 3 * SIZE] + LDF [Y + 11 * SIZE], b4 + + prefetch [Y + PREFETCHSIZE * SIZE], 0 + add X, 8 * SIZE, X + + STF a5, [Y + 4 * SIZE] + LDF [X + 4 * SIZE], a5 + STF b5, [X - 4 * SIZE] + LDF [Y + 12 * SIZE], b5 + + STF a6, [Y + 5 * SIZE] + LDF [X + 5 * SIZE], a6 + STF b6, [X - 3 * SIZE] + LDF [Y + 13 * SIZE], b6 + + STF a7, [Y + 6 * SIZE] + LDF [X + 6 * SIZE], a7 + STF b7, [X - 2 * SIZE] + LDF [Y + 14 * SIZE], b7 + + STF a8, [Y + 7 * SIZE] + LDF [X + 7 * SIZE], a8 + STF b8, [X - 1 * SIZE] + LDF [Y + 15 * SIZE], b8 + + bg,pt %icc, .LL11 + add Y, 8 * SIZE, Y + +.LL12: + STF a1, [Y + 0 * SIZE] + STF b1, [X + 0 * SIZE] + STF a2, [Y + 1 * SIZE] + STF b2, [X + 1 * SIZE] + STF a3, [Y + 2 * SIZE] + STF b3, [X + 2 * SIZE] + STF a4, [Y + 
3 * SIZE] + STF b4, [X + 3 * SIZE] + STF a5, [Y + 4 * SIZE] + STF b5, [X + 4 * SIZE] + STF a6, [Y + 5 * SIZE] + STF b6, [X + 5 * SIZE] + STF a7, [Y + 6 * SIZE] + STF b7, [X + 6 * SIZE] + STF a8, [Y + 7 * SIZE] + STF b8, [X + 7 * SIZE] + add X, 8 * SIZE, X + add Y, 8 * SIZE, Y + +.LL15: + and N, 3, I + cmp I, 0 + ble,a,pn %icc, .LL19 + nop + +.LL16: + LDF [X + 0 * SIZE], a1 + add I, -1, I + LDF [X + 1 * SIZE], a2 + LDF [Y + 0 * SIZE], b1 + LDF [Y + 1 * SIZE], b2 + cmp I, 0 + STF a1, [Y + 0 * SIZE] + STF a2, [Y + 1 * SIZE] + add Y, 2 * SIZE, Y + STF b1, [X + 0 * SIZE] + STF b2, [X + 1 * SIZE] + bg,pt %icc, .LL16 + add X, 2 * SIZE, X + +.LL19: + return %i7 + 8 + clr %g0 + +.LL50: + sra N, 2, I + mov X, XX + cmp I, 0 + ble,pn %icc, .LL55 + mov Y, YY + +.LL51: + LDF [X + 0 * SIZE], a1 + LDF [Y + 0 * SIZE], b1 + LDF [X + 1 * SIZE], a2 + add X, INCX, X + LDF [Y + 1 * SIZE], b2 + add Y, INCY, Y + LDF [X + 0 * SIZE], a3 + LDF [Y + 0 * SIZE], b3 + LDF [X + 1 * SIZE], a4 + add X, INCX, X + LDF [Y + 1 * SIZE], b4 + add Y, INCY, Y + LDF [X + 0 * SIZE], a5 + LDF [Y + 0 * SIZE], b5 + LDF [X + 1 * SIZE], a6 + add X, INCX, X + LDF [Y + 1 * SIZE], b6 + add Y, INCY, Y + LDF [X + 0 * SIZE], a7 + LDF [Y + 0 * SIZE], b7 + LDF [X + 1 * SIZE], a8 + add X, INCX, X + LDF [Y + 1 * SIZE], b8 + add Y, INCY, Y + + STF a1, [YY + 0 * SIZE] + add I, -1, I + STF b1, [XX + 0 * SIZE] + cmp I, 0 + STF a2, [YY + 1 * SIZE] + add YY, INCY, YY + STF b2, [XX + 1 * SIZE] + add XX, INCX, XX + STF a3, [YY + 0 * SIZE] + STF b3, [XX + 0 * SIZE] + STF a4, [YY + 1 * SIZE] + add YY, INCY, YY + STF b4, [XX + 1 * SIZE] + add XX, INCX, XX + STF a5, [YY + 0 * SIZE] + STF b5, [XX + 0 * SIZE] + STF a6, [YY + 1 * SIZE] + add YY, INCY, YY + STF b6, [XX + 1 * SIZE] + add XX, INCX, XX + STF a7, [YY + 0 * SIZE] + STF b7, [XX + 0 * SIZE] + STF a8, [YY + 1 * SIZE] + add YY, INCY, YY + STF b8, [XX + 1 * SIZE] + + bg,pt %icc, .LL51 + add XX, INCX, XX + +.LL55: + and N, 3, I + cmp I, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + LDF [X + 0 * SIZE], a1 + add I, -1, I + LDF [X + 1 * SIZE], a2 + LDF [Y + 0 * SIZE], b1 + cmp I, 0 + LDF [Y + 1 * SIZE], b2 + STF b1, [X + 0 * SIZE] + STF b2, [X + 1 * SIZE] + add X, INCX, X + STF a1, [Y + 0 * SIZE] + STF a2, [Y + 1 * SIZE] + bg,pt %icc, .LL56 + add Y, INCY, Y + +.LL59: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/ztrsm_kernel_LN.S b/kernel/sparc/ztrsm_kernel_LN.S new file mode 100644 index 0000000000..131284e8ef --- /dev/null +++ b/kernel/sparc/ztrsm_kernel_LN.S @@ -0,0 +1,2395 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %i0 +#define N %i1 +#define K %i2 +#define A %i5 +#define B %i3 +#define C %i4 + +#define LDC %o0 +#define AO %o1 +#define BO %o2 +#define I %o3 +#define J %o4 +#define L %o5 + +#define C1 %l0 +#define C2 %l1 + +#define OFFSET %l2 +#define KK %l3 +#define TEMP1 %l4 +#define TEMP2 %l5 +#define AORIG %l6 + +#ifdef DOUBLE +#define c01 %f0 +#define c02 %f2 +#define c03 %f4 +#define c04 %f6 +#define c05 %f8 +#define c06 %f10 +#define c07 %f12 +#define c08 %f14 +#define c09 %f16 +#define c10 %f18 +#define c11 %f20 +#define c12 %f22 +#define c13 %f24 +#define c14 %f26 +#define c15 %f28 +#define c16 %f30 + +#define t1 %f32 +#define t2 %f34 +#define t3 %f36 +#define t4 %f38 + +#define a1 %f40 +#define a2 %f42 +#define a3 %f44 +#define a4 %f46 +#define a5 %f62 + +#define b1 %f48 +#define b2 %f50 +#define b3 %f52 +#define b4 %f54 +#define b5 %f56 + +#define FZERO %f58 + +#else +#define c01 %f0 +#define c02 %f1 +#define c03 %f2 +#define c04 %f3 +#define c05 %f4 +#define c06 %f5 +#define c07 %f6 +#define c08 %f7 +#define c09 %f8 +#define c10 %f9 +#define c11 %f10 +#define c12 %f11 +#define c13 %f12 +#define c14 %f13 +#define c15 %f14 +#define c16 %f15 + +#define t1 %f16 +#define t2 %f17 +#define t3 %f18 +#define t4 %f19 + +#define a1 %f20 +#define a2 %f21 +#define a3 %f22 +#define a4 %f23 +#define a5 %f31 + +#define b1 %f24 +#define b2 %f25 +#define b3 %f26 +#define b4 %f27 +#define b5 %f28 + +#define FZERO %f29 +#endif + +#define t5 c13 +#define t6 c14 +#define t7 c15 +#define t8 c16 + +#ifndef CONJ +#define FADD1 FADD +#define FADD2 FADD +#define FADD3 FADD +#define FADD4 FSUB +#else + +#if defined(LN) || defined(LT) +#define FADD1 FADD +#define FADD2 FSUB +#define FADD3 FADD +#define FADD4 FADD +#endif + +#if defined(RN) || defined(RT) +#define FADD1 FADD +#define FADD2 FADD +#define FADD3 FSUB +#define FADD4 FADD +#endif +#endif + +#define APREFETCHSIZE 40 +#define BPREFETCHSIZE 40 + +#define APREFETCH_CATEGORY 0 +#define BPREFETCH_CATEGORY 0 + + PROLOGUE + SAVESP + +#ifndef __64BIT__ +#ifdef DOUBLE + ld [%sp + STACK_START + 32], A + ld [%sp + STACK_START + 36], B + ld [%sp + STACK_START + 40], C + ld [%sp + STACK_START + 44], LDC + ld [%sp + STACK_START + 48], OFFSET +#else + ld [%sp + STACK_START + 28], B + ld [%sp + STACK_START + 32], C + ld [%sp + STACK_START + 36], LDC + ld [%sp + STACK_START + 40], OFFSET +#endif +#else + ldx [%sp+ STACK_START + 56], B + ldx [%sp+ STACK_START + 64], C + ldx [%sp+ STACK_START + 72], LDC + ldx [%sp+ STACK_START + 80], OFFSET +#endif + +#ifdef DOUBLE + FCLR(27) +#else + FCLR(29) +#endif + + sll 
LDC, ZBASE_SHIFT, LDC + +#ifdef LN + smul M, K, TEMP1 + sll TEMP1, ZBASE_SHIFT, TEMP1 + add A, TEMP1, A + + sll M, ZBASE_SHIFT, TEMP1 + add C, TEMP1, C +#endif + +#ifdef RN + neg OFFSET, KK +#endif + +#ifdef RT + smul N, K, TEMP1 + sll TEMP1, ZBASE_SHIFT, TEMP1 + add B, TEMP1, B + + smul N, LDC, TEMP1 + add C, TEMP1, C + + sub N, OFFSET, KK +#endif + + sra N, 1, J + cmp J, 0 + ble,pn %icc, .LL100 + nop + +.LL11: +#ifdef RT + sll K, 1 + ZBASE_SHIFT, TEMP1 + sub B, TEMP1, B + + add LDC, LDC, TEMP1 + sub C, TEMP1, C +#endif + + mov C, C1 + add C, LDC, C2 + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + +#ifndef RT + add C2, LDC, C +#endif + + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL50 + nop + +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 0 + ZBASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 0 + ZBASE_SHIFT, TEMP1 + sll KK, 1 + ZBASE_SHIFT, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO + + sub K, KK, TEMP1 + + sra TEMP1, 2, L + cmp L, 0 +#endif + + FMOV FZERO, c02 + FMOV FZERO, t1 + FMOV FZERO, c04 + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, t2 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, c06 + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, t3 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, c08 + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, t4 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, c01 + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c03 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, c05 + + ble,pn %icc, .LL55 + FMOV FZERO, c07 + +.LL52: + FADD2 c02, t1, c02 + add AO, 8 * SIZE, AO + prefetch [AO + APREFETCHSIZE * SIZE], 0 + + FMUL a1, b1, t1 + add BO, 16 * SIZE, BO + + FADD4 c04, t2, c04 + add L, -1, L + FMUL a1, b2, t2 + + FADD2 c06, t3, c06 + cmp L, 0 + FMUL a1, b3, t3 + + FADD4 c08, t4, c08 + FMUL a1, b4, t4 + LDF [AO - 4 * SIZE], a1 + + FADD1 c01, t1, c01 + FMUL a2, b1, t1 + LDF [BO - 12 * SIZE], b1 + FADD3 c03, t2, c03 + FMUL a2, b2, t2 + LDF [BO - 11 * SIZE], b2 + + FADD1 c05, t3, c05 + FMUL a2, b3, t3 + LDF [BO - 10 * SIZE], b3 + FADD3 c07, t4, c07 + FMUL a2, b4, t4 + LDF [BO - 9 * SIZE], b4 + + FADD2 c02, t1, c02 + FMUL a3, b1, t1 + LDF [AO - 3 * SIZE], a2 + FADD4 c04, t2, c04 + FMUL a3, b2, t2 + + FADD2 c06, t3, c06 + FMUL a3, b3, t3 + FADD4 c08, t4, c08 + FMUL a3, b4, t4 + LDF [AO - 2 * SIZE], a3 + + FADD1 c01, t1, c01 + FMUL a4, b1, t1 + LDF [BO - 8 * SIZE], b1 + FADD3 c03, t2, c03 + FMUL a4, b2, t2 + LDF [BO - 7 * SIZE], b2 + + FADD1 c05, t3, c05 + FMUL a4, b3, t3 + LDF [BO - 6 * SIZE], b3 + FADD3 c07, t4, c07 + FMUL a4, b4, t4 + LDF [BO - 5 * SIZE], b4 + + FADD2 c02, t1, c02 + FMUL a1, b1, t1 + LDF [AO - 1 * SIZE], a4 + FADD4 c04, t2, c04 + FMUL a1, b2, t2 + + FADD2 c06, t3, c06 + FMUL a1, b3, t3 + FADD4 c08, t4, c08 + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + FADD1 c01, t1, c01 + FMUL a2, b1, t1 + LDF [BO - 4 * SIZE], b1 + + FADD3 c03, t2, c03 + FMUL a2, b2, t2 + LDF [BO - 3 * SIZE], b2 + + FADD1 c05, t3, c05 + FMUL a2, b3, t3 + LDF [BO - 2 * SIZE], b3 + FADD3 c07, t4, c07 + FMUL a2, b4, t4 + LDF [BO - 1 * SIZE], b4 + + FADD2 c02, t1, c02 + FMUL a3, b1, t1 + LDF [AO + 1 * SIZE], a2 + FADD4 c04, t2, c04 + FMUL a3, b2, t2 + + FADD2 c06, t3, c06 + FMUL a3, b3, t3 + FADD4 c08, t4, c08 + FMUL a3, b4, t4 + LDF [AO + 2 * SIZE], a3 + + FADD1 c01, t1, c01 + FMUL a4, b1, t1 + LDF [BO + 0 * SIZE], b1 + FADD3 c03, t2, c03 + FMUL a4, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD1 c05, t3, c05 + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + FADD3 c07, t4, c07 + 
FMUL a4, b4, t4 + LDF [BO + 3 * SIZE], b4 + + bg,pt %icc, .LL52 + LDF [AO + 3 * SIZE], a4 + +.LL55: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + FADD2 c02, t1, c02 + add AO, 2 * SIZE, AO + FMUL a1, b1, t1 + add L, -1, L + + add BO, 4 * SIZE, BO + FADD4 c04, t2, c04 + cmp L, 0 + FMUL a1, b2, t2 + + FADD2 c06, t3, c06 + FMUL a1, b3, t3 + FADD4 c08, t4, c08 + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + FADD1 c01, t1, c01 + FMUL a2, b1, t1 + LDF [BO + 0 * SIZE], b1 + FADD3 c03, t2, c03 + FMUL a2, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD1 c05, t3, c05 + FMUL a2, b3, t3 + LDF [BO + 2 * SIZE], b3 + FADD3 c07, t4, c07 + FMUL a2, b4, t4 + LDF [BO + 3 * SIZE], b4 + + bg,pt %icc, .LL56 + LDF [AO + 1 * SIZE], a2 + +.LL59: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 1, TEMP1 +#else + sub KK, 2, TEMP1 +#endif + sll TEMP1, 0 + ZBASE_SHIFT, TEMP2 + sll TEMP1, 1 + ZBASE_SHIFT, TEMP1 + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + + FADD2 c02, t1, c02 + FADD4 c04, t2, c04 + FADD2 c06, t3, c06 + FADD4 c08, t4, c08 + + FADD c01, c04, c01 + FADD c02, c03, c02 + FADD c05, c08, c05 + FADD c06, c07, c06 + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c05, c05 + FSUB a4, c06, c06 + +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c05, c05 + FSUB a4, c06, c06 +#endif + +#ifdef LN + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FMUL a1, c05, t5 + FMUL a2, c06, t6 + FMUL a1, c06, t7 + FMUL a2, c05, t8 + + FADD4 t1, t2, c01 + FADD2 t3, t4, c02 + FADD4 t5, t6, c05 + FADD2 t7, t8, c06 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FMUL a1, c05, t5 + FMUL a2, c06, t6 + FMUL a1, c06, t7 + FMUL a2, c05, t8 + + FADD4 t1, t2, c01 + FADD2 t3, t4, c02 + FADD4 t5, t6, c05 + FADD2 t7, t8, c06 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + LDF [BO + 6 * SIZE], b1 + LDF [BO + 7 * SIZE], b2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FADD4 t1, t2, c01 + FADD3 t3, t4, c02 + + FMUL a3, c01, t1 + FMUL a3, c02, t2 + FMUL a4, c02, t3 + FMUL a4, c01, t4 + + FSUB c05, t1, c05 + FSUB c06, t2, c06 + FADD3 c05, t3, c05 + FADD4 c06, t4, c06 + + FMUL b1, c05, t1 + FMUL b2, c06, t2 + FMUL b1, c06, t3 + FMUL b2, c05, t4 + + FADD4 t1, t2, c05 + FADD3 t3, t4, c06 +#endif + +#ifdef RT + LDF [BO + 6 * SIZE], a1 + LDF [BO + 7 * SIZE], a2 + LDF [BO + 4 * SIZE], a3 + LDF [BO + 5 * SIZE], a4 + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + + FMUL a1, c05, t1 + FMUL a2, c06, t2 + FMUL a1, c06, t3 + FMUL a2, c05, t4 + + FADD4 t1, t2, c05 + FADD3 t3, t4, c06 + + FMUL a3, c05, t1 + FMUL a3, c06, t2 + FMUL a4, c06, t3 + FMUL a4, c05, t4 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + FADD3 c01, t3, c01 + FADD4 c02, t4, c02 + + FMUL b1, c01, t1 + FMUL b2, c02, t2 + FMUL b1, c02, t3 + FMUL b2, c01, t4 + + FADD4 t1, t2, c01 + FADD3 t3, t4, c02 +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 + add C2, -2 * SIZE, C2 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * 
SIZE] + STF c05, [BO + 2 * SIZE] + STF c06, [BO + 3 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c05, [AO + 2 * SIZE] + STF c06, [AO + 3 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c05, [C2 + 0 * SIZE] + STF c06, [C2 + 1 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 2 * SIZE, C1 + add C2, 2 * SIZE, C2 +#endif + +#ifdef RT + sll K, 0 + ZBASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 0 + ZBASE_SHIFT, TEMP2 + sll TEMP1, 1 + ZBASE_SHIFT, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + +.LL50: + sra M, 1, I + cmp I, 0 + ble,pn %icc, .LL99 + nop + +.LL21: +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 1 + ZBASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 1 + ZBASE_SHIFT, TEMP1 + + add AORIG, TEMP1, AO + add B, TEMP1, BO + + sub K, KK, TEMP1 + + sra TEMP1, 2, L + cmp L, 0 +#endif + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + + FMOV FZERO, c01 + FMOV FZERO, c02 + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c03 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, c04 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c05 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, c06 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c07 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, c08 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c09 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, c10 + + LDF [BO + 4 * SIZE], b5 + FMOV FZERO, c11 + LDF [AO + 4 * SIZE], a5 + FMOV FZERO, c12 + +#ifdef LN + prefetch [C1 - 3 * SIZE], 3 + FMOV FZERO, c13 + prefetch [C2 - 3 * SIZE], 3 + FMOV FZERO, c14 +#else + prefetch [C1 + 3 * SIZE], 3 + FMOV FZERO, c13 + prefetch [C2 + 3 * SIZE], 3 + FMOV FZERO, c14 +#endif + + FMOV FZERO, c15 + ble,pn %icc, .LL25 + FMOV FZERO, c16 + +.LL22: + FADD2 c04, t1, c04 + prefetch [AO + APREFETCHSIZE * SIZE], APREFETCH_CATEGORY + FMUL a1, b1, t1 + nop + + FADD4 c08, t2, c08 + prefetch [BO + BPREFETCHSIZE * SIZE], BPREFETCH_CATEGORY + FMUL a1, b2, t2 + add AO, 16 * SIZE, AO + + FADD2 c12, t3, c12 + LDF [AO - 13 * SIZE], a4 + FMUL a1, b3, t3 + add BO, 16 * SIZE, BO + + FADD4 c16, t4, c16 + nop + FMUL a1, b4, t4 + LDF [AO - 8 * SIZE], a1 + + FADD1 c01, t1, c01 + nop + FMUL a2, b1, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD1 c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD3 c13, t4, c13 + add L, -1, L + FMUL a2, b4, t4 + LDF [AO - 11 * SIZE], a2 + + FADD2 c02, t1, c02 + nop + FMUL a3, b1, t1 + nop + + FADD4 c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD2 c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD4 c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO - 10 * SIZE], a3 + + FADD1 c03, t1, c03 + nop + FMUL a4, b1, t1 + LDF [BO - 8 * SIZE], b1 + + FADD3 c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO - 11 * SIZE], b2 + + FADD1 c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO - 10 * SIZE], b3 + + FADD3 c15, t4, c15 + nop + FMUL a4, b4, t4 + LDF [BO - 9 * SIZE], b4 + + FADD2 c04, t1, c04 + nop + FMUL a5, b5, t1 + LDF [AO - 9 * SIZE], a4 + + FADD4 c08, t2, c08 + nop + FMUL a5, b2, t2 + nop + + FADD2 c12, t3, c12 + nop + FMUL a5, b3, t3 + nop + + FADD4 c16, t4, c16 + nop + FMUL a5, b4, t4 + LDF [AO - 4 * SIZE], a5 + + FADD1 c01, t1, c01 + nop + FMUL a2, b5, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD1 c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + 
FADD3 c13, t4, c13 + nop + FMUL a2, b4, t4 + LDF [AO - 7 * SIZE], a2 + + FADD2 c02, t1, c02 + nop + FMUL a3, b5, t1 + nop + + FADD4 c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD2 c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD4 c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO - 6 * SIZE], a3 + + FADD1 c03, t1, c03 + nop + FMUL a4, b5, t1 + LDF [BO - 4 * SIZE], b5 + + FADD3 c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO - 7 * SIZE], b2 + + FADD1 c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO - 6 * SIZE], b3 + + FADD3 c15, t4, c15 + nop + FMUL a4, b4, t4 + LDF [BO - 5 * SIZE], b4 + + FADD2 c04, t1, c04 + nop + FMUL a1, b1, t1 + LDF [AO - 5 * SIZE], a4 + + FADD4 c08, t2, c08 + nop + FMUL a1, b2, t2 + nop + + FADD2 c12, t3, c12 + nop + FMUL a1, b3, t3 + nop + + FADD4 c16, t4, c16 + nop + FMUL a1, b4, t4 + LDF [AO - 0 * SIZE], a1 + + FADD1 c01, t1, c01 + nop + FMUL a2, b1, t1 + nop + +#ifdef DOUBLE + prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY +#else + nop +#endif + FADD3 c05, t2, c05 + nop + FMUL a2, b2, t2 + + FADD1 c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD3 c13, t4, c13 + nop + FMUL a2, b4, t4 + nop + + FADD2 c02, t1, c02 + nop + FMUL a3, b1, t1 + LDF [AO - 3 * SIZE], a2 + + FADD4 c06, t2, c06 +#ifdef DOUBLE + prefetch [BO + (BPREFETCHSIZE + 8) * SIZE], BPREFETCH_CATEGORY +#else + nop +#endif + FMUL a3, b2, t2 + nop + + FADD2 c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD4 c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO - 2 * SIZE], a3 + + FADD1 c03, t1, c03 + nop + FMUL a4, b1, t1 + LDF [BO - 0 * SIZE], b1 + + FADD3 c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO - 3 * SIZE], b2 + + FADD1 c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO - 2 * SIZE], b3 + + FADD3 c15, t4, c15 + nop + FMUL a4, b4, t4 + LDF [BO - 1 * SIZE], b4 + + FADD2 c04, t1, c04 + nop + FMUL a5, b5, t1 + LDF [AO - 1 * SIZE], a4 + + FADD4 c08, t2, c08 + FMUL a5, b2, t2 + FADD2 c12, t3, c12 + FMUL a5, b3, t3 + + FADD4 c16, t4, c16 + nop + FMUL a5, b4, t4 + LDF [AO + 4 * SIZE], a5 + + FADD1 c01, t1, c01 + nop + FMUL a2, b5, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD1 c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD3 c13, t4, c13 + nop + FMUL a2, b4, t4 + LDF [AO + 1 * SIZE], a2 + + FADD2 c02, t1, c02 + nop + FMUL a3, b5, t1 + nop + + FADD4 c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD2 c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD4 c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO + 2 * SIZE], a3 + + FADD1 c03, t1, c03 + cmp L, 0 + FMUL a4, b5, t1 + LDF [BO + 4 * SIZE], b5 + + FADD3 c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD1 c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD3 c15, t4, c15 + FMUL a4, b4, t4 + bg,pt %icc, .LL22 + LDF [BO + 3 * SIZE], b4 + +.LL25: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,pn %icc, .LL29 + nop + +.LL26: + FADD2 c04, t1, c04 + LDF [AO + 3 * SIZE], a4 + FMUL a1, b1, t1 + add AO, 4 * SIZE, AO + + FADD4 c08, t2, c08 + add BO, 4 * SIZE, BO + FMUL a1, b2, t2 + add L, -1, L + + FADD2 c12, t3, c12 + nop + FMUL a1, b3, t3 + cmp L, 0 + + FADD4 c16, t4, c16 + nop + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + FADD1 c01, t1, c01 + nop + FMUL a2, b1, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD1 c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD3 c13, t4, c13 + nop + FMUL a2, b4, t4 + LDF [AO + 1 * SIZE], a2 + + FADD2 c02, t1, c02 + nop + FMUL a3, b1, t1 + nop + + FADD4 c06, t2, c06 + nop + FMUL a3, 
b2, t2 + nop + + FADD2 c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD4 c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO + 2 * SIZE], a3 + + FADD1 c03, t1, c03 + nop + FMUL a4, b1, t1 + LDF [BO + 0 * SIZE], b1 + + FADD3 c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD1 c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD3 c15, t4, c15 + FMUL a4, b4, t4 + bg,pt %icc, .LL26 + LDF [BO + 3 * SIZE], b4 + +.LL29: +#if defined(LN) || defined(RT) + sub KK, 2, TEMP1 + sll TEMP1, 1 + ZBASE_SHIFT, TEMP1 + add AORIG, TEMP1, AO + add B, TEMP1, BO +#endif + + FADD2 c04, t1, c04 + FADD4 c08, t2, c08 + FADD2 c12, t3, c12 + FADD4 c16, t4, c16 + + FADD c01, c06, c01 + FADD c02, c05, c02 + FADD c03, c08, c03 + FADD c04, c07, c04 + + FADD c09, c14, c09 + FADD c10, c13, c10 + FADD c11, c16, c11 + FADD c12, c15, c12 + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c09, c09 + FSUB a4, c10, c10 + + FSUB b1, c03, c03 + FSUB b2, c04, c04 + FSUB b3, c11, c11 + FSUB b4, c12, c12 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [AO + 4 * SIZE], b1 + LDF [AO + 5 * SIZE], b2 + LDF [AO + 6 * SIZE], b3 + LDF [AO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 + + FSUB b1, c09, c09 + FSUB b2, c10, c10 + FSUB b3, c11, c11 + FSUB b4, c12, c12 +#endif + +#ifdef LN + LDF [AO + 6 * SIZE], a1 + LDF [AO + 7 * SIZE], a2 + LDF [AO + 4 * SIZE], a3 + LDF [AO + 5 * SIZE], a4 + LDF [AO + 0 * SIZE], b1 + LDF [AO + 1 * SIZE], b2 + + FMUL a1, c03, t1 + FMUL a2, c04, t2 + FMUL a1, c04, t3 + FMUL a2, c03, t4 + + FMUL a1, c11, t5 + FMUL a2, c12, t6 + FMUL a1, c12, t7 + FMUL a2, c11, t8 + + FADD4 t1, t2, c03 + FADD2 t3, t4, c04 + FADD4 t5, t6, c11 + FADD2 t7, t8, c12 + + FMUL a3, c03, t1 + FMUL a3, c04, t2 + FMUL a3, c11, t3 + FMUL a3, c12, t4 + + FMUL a4, c04, t5 + FMUL a4, c03, t6 + FMUL a4, c12, t7 + FMUL a4, c11, t8 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + FSUB c09, t3, c09 + FSUB c10, t4, c10 + + FADD2 c01, t5, c01 + FADD4 c02, t6, c02 + FADD2 c09, t7, c09 + FADD4 c10, t8, c10 + + FMUL b1, c01, t1 + FMUL b2, c02, t2 + FMUL b1, c02, t3 + FMUL b2, c01, t4 + + FMUL b1, c09, t5 + FMUL b2, c10, t6 + FMUL b1, c10, t7 + FMUL b2, c09, t8 + + FADD4 t1, t2, c01 + FADD2 t3, t4, c02 + FADD4 t5, t6, c09 + FADD2 t7, t8, c10 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + LDF [AO + 6 * SIZE], b1 + LDF [AO + 7 * SIZE], b2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FMUL a1, c09, t5 + FMUL a2, c10, t6 + FMUL a1, c10, t7 + FMUL a2, c09, t8 + + FADD4 t1, t2, c01 + FADD2 t3, t4, c02 + FADD4 t5, t6, c09 + FADD2 t7, t8, c10 + + FMUL a3, c01, t1 + FMUL a3, c02, t2 + FMUL a3, c09, t3 + FMUL a3, c10, t4 + + FMUL a4, c02, t5 + FMUL a4, c01, t6 + FMUL a4, c10, t7 + FMUL a4, c09, t8 + + FSUB c03, t1, c03 + FSUB c04, t2, c04 + FSUB c11, t3, c11 + FSUB c12, t4, c12 + + FADD2 c03, t5, c03 + FADD4 c04, t6, c04 + FADD2 c11, t7, c11 + FADD4 c12, t8, c12 + + FMUL b1, c03, t1 + FMUL b2, c04, t2 + FMUL b1, c04, t3 + FMUL b2, c03, t4 + + FMUL b1, c11, t5 + FMUL b2, c12, t6 + FMUL b1, c12, t7 + FMUL b2, c11, t8 + + FADD4 t1, t2, c03 + FADD2 t3, t4, c04 + 
FADD4 t5, t6, c11 + FADD2 t7, t8, c12 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + LDF [BO + 6 * SIZE], b1 + LDF [BO + 7 * SIZE], b2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FMUL a1, c03, t5 + FMUL a2, c04, t6 + FMUL a1, c04, t7 + FMUL a2, c03, t8 + + FADD4 t1, t2, c01 + FADD3 t3, t4, c02 + FADD4 t5, t6, c03 + FADD3 t7, t8, c04 + + FMUL a3, c01, t1 + FMUL a3, c02, t2 + FMUL a3, c03, t3 + FMUL a3, c04, t4 + + FMUL a4, c02, t5 + FMUL a4, c01, t6 + FMUL a4, c04, t7 + FMUL a4, c03, t8 + + FSUB c09, t1, c09 + FSUB c10, t2, c10 + FSUB c11, t3, c11 + FSUB c12, t4, c12 + + FADD3 c09, t5, c09 + FADD4 c10, t6, c10 + FADD3 c11, t7, c11 + FADD4 c12, t8, c12 + + FMUL b1, c09, t1 + FMUL b2, c10, t2 + FMUL b1, c10, t3 + FMUL b2, c09, t4 + + FMUL b1, c11, t5 + FMUL b2, c12, t6 + FMUL b1, c12, t7 + FMUL b2, c11, t8 + + FADD4 t1, t2, c09 + FADD3 t3, t4, c10 + FADD4 t5, t6, c11 + FADD3 t7, t8, c12 +#endif + +#ifdef RT + LDF [BO + 6 * SIZE], a1 + LDF [BO + 7 * SIZE], a2 + LDF [BO + 4 * SIZE], a3 + LDF [BO + 5 * SIZE], a4 + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + + FMUL a1, c09, t1 + FMUL a2, c10, t2 + FMUL a1, c10, t3 + FMUL a2, c09, t4 + + FMUL a1, c11, t5 + FMUL a2, c12, t6 + FMUL a1, c12, t7 + FMUL a2, c11, t8 + + FADD4 t1, t2, c09 + FADD3 t3, t4, c10 + FADD4 t5, t6, c11 + FADD3 t7, t8, c12 + + FMUL a3, c09, t1 + FMUL a3, c10, t2 + FMUL a3, c11, t3 + FMUL a3, c12, t4 + + FMUL a4, c10, t5 + FMUL a4, c09, t6 + FMUL a4, c12, t7 + FMUL a4, c11, t8 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + FSUB c03, t3, c03 + FSUB c04, t4, c04 + + FADD3 c01, t5, c01 + FADD4 c02, t6, c02 + FADD3 c03, t7, c03 + FADD4 c04, t8, c04 + + FMUL b1, c01, t1 + FMUL b2, c02, t2 + FMUL b1, c02, t3 + FMUL b2, c01, t4 + + FMUL b1, c03, t5 + FMUL b2, c04, t6 + FMUL b1, c04, t7 + FMUL b2, c03, t8 + + FADD4 t1, t2, c01 + FADD3 t3, t4, c02 + FADD4 t5, t6, c03 + FADD3 t7, t8, c04 +#endif + +#ifdef LN + add C1, -4 * SIZE, C1 + add C2, -4 * SIZE, C2 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] + STF c09, [BO + 2 * SIZE] + STF c10, [BO + 3 * SIZE] + + STF c03, [BO + 4 * SIZE] + STF c04, [BO + 5 * SIZE] + STF c11, [BO + 6 * SIZE] + STF c12, [BO + 7 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] + + STF c09, [AO + 4 * SIZE] + STF c10, [AO + 5 * SIZE] + STF c11, [AO + 6 * SIZE] + STF c12, [AO + 7 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C1 + 2 * SIZE] + STF c04, [C1 + 3 * SIZE] + + STF c09, [C2 + 0 * SIZE] + STF c10, [C2 + 1 * SIZE] + STF c11, [C2 + 2 * SIZE] + STF c12, [C2 + 3 * SIZE] + +#ifndef LN + add C1, 4 * SIZE, C1 + add C2, 4 * SIZE, C2 +#endif + +#ifdef RT + sll K, 1 + ZBASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 1 + ZBASE_SHIFT, TEMP1 + add AO, TEMP1, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 2, KK +#endif + +#ifdef LN + sub KK, 2, KK +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL21 + nop + +.LL99: +#ifdef LN + sll K, 1 + ZBASE_SHIFT, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 2, KK +#endif + +#ifdef RT + sub KK, 2, KK +#endif + + add J, -1, J + cmp J, 0 + bg,pt %icc, .LL11 + nop + +.LL100: + and N, 1, J + + cmp J, 0 + ble,pn %icc, .LL999 + nop + +#ifdef RT + sll K, 0 + ZBASE_SHIFT, 
TEMP1 + sub B, TEMP1, B + + sub C, LDC, C +#endif + + mov C, C1 + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + +#ifndef RT + add C, LDC, C +#endif + + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL150 + nop + +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 0 + ZBASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 0 + ZBASE_SHIFT, TEMP1 + add AORIG, TEMP1, AO + add B, TEMP1, BO + + sub K, KK, TEMP1 + + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c01 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t1 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c02 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t2 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, t3 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, t4 + + ble,pn %icc, .LL155 + nop + +.LL152: + FADD1 c01, t1, c01 + add L, -1, L + FMUL a1, b1, t1 + prefetch [AO + APREFETCHSIZE * SIZE], 0 + + FADD3 c02, t2, c02 + add BO, 8 * SIZE, BO + FMUL a1, b2, t2 + LDF [AO + 4 * SIZE], a1 + + FADD2 c03, t3, c03 + cmp L, 0 + FMUL a2, b1, t3 + LDF [BO - 4 * SIZE], b1 + + FADD4 c04, t4, c04 + nop + FMUL a2, b2, t4 + LDF [AO + 5 * SIZE], a2 + + FADD1 c01, t1, c01 + nop + FMUL a3, b3, t1 + LDF [BO - 3 * SIZE], b2 + + FADD3 c02, t2, c02 + nop + FMUL a3, b4, t2 + LDF [AO + 6 * SIZE], a3 + + FADD2 c03, t3, c03 + nop + FMUL a4, b3, t3 + LDF [BO - 2 * SIZE], b3 + + FADD4 c04, t4, c04 + nop + FMUL a4, b4, t4 + LDF [AO + 7 * SIZE], a4 + + FADD1 c01, t1, c01 + nop + FMUL a1, b1, t1 + LDF [BO - 1 * SIZE], b4 + + FADD3 c02, t2, c02 + FMUL a1, b2, t2 + LDF [AO + 8 * SIZE], a1 + + FADD2 c03, t3, c03 + FMUL a2, b1, t3 + LDF [BO + 0 * SIZE], b1 + + FADD4 c04, t4, c04 + FMUL a2, b2, t4 + LDF [AO + 9 * SIZE], a2 + + FADD1 c01, t1, c01 + FMUL a3, b3, t1 + LDF [BO + 1 * SIZE], b2 + + FADD3 c02, t2, c02 + FMUL a3, b4, t2 + LDF [AO + 10 * SIZE], a3 + + FADD2 c03, t3, c03 + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD4 c04, t4, c04 + FMUL a4, b4, t4 + LDF [AO + 11 * SIZE], a4 + + add AO, 8 * SIZE, AO + bg,pt %icc, .LL152 + LDF [BO + 3 * SIZE], b4 + +.LL155: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL159 + nop + +.LL156: + FADD1 c01, t1, c01 + add AO, 2 * SIZE, AO + FMUL a1, b1, t1 + add BO, 2 * SIZE, BO + FADD3 c02, t2, c02 + add L, -1, L + FMUL a1, b2, t2 + LDF [AO + 0 * SIZE], a1 + FADD2 c03, t3, c03 + FMUL a2, b1, t3 + LDF [BO + 0 * SIZE], b1 + cmp L, 0 + FADD4 c04, t4, c04 + FMUL a2, b2, t4 + LDF [BO + 1 * SIZE], b2 + + bg,pt %icc, .LL156 + LDF [AO + 1 * SIZE], a2 + +.LL159: + FADD1 c01, t1, c01 + FADD3 c02, t2, c02 + FADD2 c03, t3, c03 + FADD4 c04, t4, c04 + + FADD c01, c04, c01 + FADD c02, c03, c02 + +#if defined(LN) || defined(RT) + sub KK, 1, TEMP1 + + sll TEMP1, 0 + ZBASE_SHIFT, TEMP1 + add AORIG, TEMP1, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 +#endif + +#ifdef LN + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FADD4 t1, t2, c01 + FADD2 t3, t4, c02 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FMUL a1, c01, t1 + FMUL 
a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FADD4 t1, t2, c01 + FADD2 t3, t4, c02 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FADD4 t1, t2, c01 + FADD3 t3, t4, c02 +#endif + +#ifdef RT + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FADD4 t1, t2, c01 + FADD3 t3, t4, c02 +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 2 * SIZE, C1 +#endif + +#ifdef RT + sll K, 0 + ZBASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 0 + ZBASE_SHIFT, TEMP1 + add AO, TEMP1, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + +.LL150: + sra M, 1, I + cmp I, 0 + ble,pn %icc, .LL199 + nop + + +.LL121: +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 1 + ZBASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 1 + ZBASE_SHIFT, TEMP1 + sll KK, 0 + ZBASE_SHIFT, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO + + sub K, KK, TEMP1 + sra TEMP1, 2, L + cmp L, 0 +#endif + + FMOV FZERO, c03 + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, t1 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, c07 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, t2 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, c04 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, t3 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, c08 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, t4 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, c01 + +#ifdef LN + prefetch [C1 - 3 * SIZE], 3 +#else + prefetch [C1 + 3 * SIZE], 3 +#endif + FMOV FZERO, c05 + FMOV FZERO, c02 + + ble,pn %icc, .LL125 + FMOV FZERO, c06 + +.LL122: + FADD1 c03, t1, c03 + add L, -1, L + FMUL a1, b1, t1 + prefetch [AO + APREFETCHSIZE * SIZE], 0 + + FADD3 c07, t2, c07 + add BO, 8 * SIZE, BO + FMUL a1, b2, t2 + LDF [AO + 4 * SIZE], a1 + + FADD2 c04, t3, c04 + add AO, 16 * SIZE, AO + FMUL a2, b1, t3 + cmp L, 0 + + FADD4 c08, t4, c08 + nop + FMUL a2, b2, t4 + LDF [AO - 11 * SIZE], a2 + + FADD1 c01, t1, c01 + nop + FMUL a3, b1, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a3, b2, t2 + LDF [AO - 10 * SIZE], a3 + + FADD2 c02, t3, c02 + nop + FMUL a4, b1, t3 + LDF [BO - 4 * SIZE], b1 + + FADD4 c06, t4, c06 + nop + FMUL a4, b2, t4 + LDF [BO - 3 * SIZE], b2 + + FADD1 c03, t1, c03 + nop + FMUL a1, b3, t1 + LDF [AO - 9 * SIZE], a4 + + FADD3 c07, t2, c07 + nop + FMUL a1, b4, t2 + LDF [AO - 8 * SIZE], a1 + + FADD2 c04, t3, c04 + nop + FMUL a2, b3, t3 + nop + + FADD4 c08, t4, c08 + nop + FMUL a2, b4, t4 + LDF [AO - 7 * SIZE], a2 + + FADD1 c01, t1, c01 + nop + FMUL a3, b3, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a3, b4, t2 + LDF [AO - 6 * SIZE], a3 + + FADD2 c02, t3, c02 + nop + FMUL a4, b3, t3 + LDF [BO - 2 * SIZE], b3 + + FADD4 c06, t4, c06 + nop + FMUL a4, b4, t4 + LDF [BO - 1 * SIZE], b4 + + FADD1 c03, t1, c03 + nop + FMUL a1, b1, t1 + LDF [AO - 5 * SIZE], a4 + + FADD3 c07, t2, c07 + nop + FMUL a1, b2, t2 + LDF [AO - 4 * SIZE], a1 + + FADD2 c04, t3, c04 + nop + FMUL a2, b1, t3 + nop + + FADD4 c08, t4, c08 + nop + FMUL a2, b2, t4 + LDF [AO - 3 * SIZE], a2 + + FADD1 c01, t1, c01 
+ nop + FMUL a3, b1, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a3, b2, t2 + LDF [AO - 2 * SIZE], a3 + + FADD2 c02, t3, c02 + nop + FMUL a4, b1, t3 + LDF [BO + 0 * SIZE], b1 + + FADD4 c06, t4, c06 + nop + FMUL a4, b2, t4 + LDF [BO + 1 * SIZE], b2 + + FADD1 c03, t1, c03 + nop + FMUL a1, b3, t1 + LDF [AO - 1 * SIZE], a4 + + FADD3 c07, t2, c07 + nop + FMUL a1, b4, t2 + LDF [AO + 0 * SIZE], a1 + + FADD2 c04, t3, c04 + nop + FMUL a2, b3, t3 + nop + + FADD4 c08, t4, c08 + nop + FMUL a2, b4, t4 + LDF [AO + 1 * SIZE], a2 + + FADD1 c01, t1, c01 + nop + FMUL a3, b3, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a3, b4, t2 + LDF [AO + 2 * SIZE], a3 + + FADD2 c02, t3, c02 + nop + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD4 c06, t4, c06 + FMUL a4, b4, t4 + LDF [AO + 3 * SIZE], a4 + + bg,pt %icc, .LL122 + LDF [BO + 3 * SIZE], b4 + +.LL125: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL129 + nop + +.LL126: + FADD1 c03, t1, c03 + add AO, 4 * SIZE, AO + FMUL a1, b1, t1 + add BO, 2 * SIZE, BO + + FADD3 c07, t2, c07 + add L, -1, L + FMUL a1, b2, t2 + LDF [AO + 0 * SIZE], a1 + + FADD2 c04, t3, c04 + cmp L, 0 + FMUL a2, b1, t3 + + FADD4 c08, t4, c08 + FMUL a2, b2, t4 + LDF [AO + 1 * SIZE], a2 + + FADD1 c01, t1, c01 + FMUL a3, b1, t1 + FADD3 c05, t2, c05 + FMUL a3, b2, t2 + LDF [AO + 2 * SIZE], a3 + + FADD2 c02, t3, c02 + FMUL a4, b1, t3 + LDF [BO + 0 * SIZE], b1 + FADD4 c06, t4, c06 + FMUL a4, b2, t4 + LDF [BO + 1 * SIZE], b2 + bg,pt %icc, .LL126 + LDF [AO + 3 * SIZE], a4 + +.LL129: + FADD1 c03, t1, c03 + FADD3 c07, t2, c07 + FADD2 c04, t3, c04 + FADD4 c08, t4, c08 + + FADD c01, c06, c01 + FADD c02, c05, c02 + FADD c03, c08, c03 + FADD c04, c07, c04 + +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 2, TEMP1 +#else + sub KK, 1, TEMP1 +#endif + sll TEMP1, 1 + ZBASE_SHIFT, TEMP2 + sll TEMP1, 0 + ZBASE_SHIFT, TEMP1 + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 +#endif + +#ifdef LN + LDF [AO + 6 * SIZE], a1 + LDF [AO + 7 * SIZE], a2 + LDF [AO + 4 * SIZE], a3 + LDF [AO + 5 * SIZE], a4 + LDF [AO + 0 * SIZE], b1 + LDF [AO + 1 * SIZE], b2 + + FMUL a1, c03, t1 + FMUL a2, c04, t2 + FMUL a1, c04, t3 + FMUL a2, c03, t4 + + FADD4 t1, t2, c03 + FADD2 t3, t4, c04 + + FMUL a3, c03, t1 + FMUL a3, c04, t2 + + FMUL a4, c04, t5 + FMUL a4, c03, t6 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + + FADD2 c01, t5, c01 + FADD4 c02, t6, c02 + + FMUL b1, c01, t1 + FMUL b2, c02, t2 + FMUL b1, c02, t3 + FMUL b2, c01, t4 + + FADD4 t1, t2, c01 + FADD2 t3, t4, c02 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + LDF [AO + 6 * SIZE], b1 + LDF [AO + 7 * SIZE], b2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FADD4 t1, t2, c01 + FADD2 t3, t4, c02 + + FMUL a3, c01, t1 + FMUL a3, c02, t2 + FMUL a4, c02, t5 + FMUL a4, c01, t6 + + FSUB c03, t1, c03 + FSUB c04, t2, c04 + FADD2 c03, t5, c03 + FADD4 c04, t6, c04 + + FMUL b1, c03, t1 + FMUL b2, c04, t2 + FMUL b1, c04, t3 + FMUL b2, c03, t4 + + FADD4 t1, t2, c03 + FADD2 t3, t4, c04 +#endif + +#ifdef RN + LDF 
[BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FMUL a1, c03, t5 + FMUL a2, c04, t6 + FMUL a1, c04, t7 + FMUL a2, c03, t8 + + FADD4 t1, t2, c01 + FADD3 t3, t4, c02 + FADD4 t5, t6, c03 + FADD3 t7, t8, c04 +#endif + +#ifdef RT + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FMUL a1, c03, t5 + FMUL a2, c04, t6 + FMUL a1, c04, t7 + FMUL a2, c03, t8 + + FADD4 t1, t2, c01 + FADD3 t3, t4, c02 + FADD4 t5, t6, c03 + FADD3 t7, t8, c04 +#endif + +#ifdef LN + add C1, -4 * SIZE, C1 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] + STF c03, [BO + 2 * SIZE] + STF c04, [BO + 3 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C1 + 2 * SIZE] + STF c04, [C1 + 3 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 4 * SIZE, C1 +#endif + +#ifdef RT + sll K, 1 + ZBASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 1 + ZBASE_SHIFT, TEMP2 + sll TEMP1, 0 + ZBASE_SHIFT, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 2, KK +#endif + +#ifdef LN + sub KK, 2, KK +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL121 + FMOV FZERO, c03 + +.LL199: +#ifdef LN + sll K, 0 + ZBASE_SHIFT, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 1, KK +#endif + +#ifdef RT + sub KK, 1, KK +#endif + +.LL999: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/ztrsm_kernel_LT.S b/kernel/sparc/ztrsm_kernel_LT.S new file mode 100644 index 0000000000..2a85698506 --- /dev/null +++ b/kernel/sparc/ztrsm_kernel_LT.S @@ -0,0 +1,2389 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %i0 +#define N %i1 +#define K %i2 +#define A %i5 +#define B %i3 +#define C %i4 + +#define LDC %o0 +#define AO %o1 +#define BO %o2 +#define I %o3 +#define J %o4 +#define L %o5 + +#define C1 %l0 +#define C2 %l1 + +#define OFFSET %l2 +#define KK %l3 +#define TEMP1 %l4 +#define TEMP2 %l5 +#define AORIG %l6 + +#ifdef DOUBLE +#define c01 %f0 +#define c02 %f2 +#define c03 %f4 +#define c04 %f6 +#define c05 %f8 +#define c06 %f10 +#define c07 %f12 +#define c08 %f14 +#define c09 %f16 +#define c10 %f18 +#define c11 %f20 +#define c12 %f22 +#define c13 %f24 +#define c14 %f26 +#define c15 %f28 +#define c16 %f30 + +#define t1 %f32 +#define t2 %f34 +#define t3 %f36 +#define t4 %f38 + +#define a1 %f40 +#define a2 %f42 +#define a3 %f44 +#define a4 %f46 +#define a5 %f62 + +#define b1 %f48 +#define b2 %f50 +#define b3 %f52 +#define b4 %f54 +#define b5 %f56 + +#define FZERO %f58 + +#else +#define c01 %f0 +#define c02 %f1 +#define c03 %f2 +#define c04 %f3 +#define c05 %f4 +#define c06 %f5 +#define c07 %f6 +#define c08 %f7 +#define c09 %f8 +#define c10 %f9 +#define c11 %f10 +#define c12 %f11 +#define c13 %f12 +#define c14 %f13 +#define c15 %f14 +#define c16 %f15 + +#define t1 %f16 +#define t2 %f17 +#define t3 %f18 +#define t4 %f19 + +#define a1 %f20 +#define a2 %f21 +#define a3 %f22 +#define a4 %f23 +#define a5 %f31 + +#define b1 %f24 +#define b2 %f25 +#define b3 %f26 +#define b4 %f27 +#define b5 %f28 + +#define FZERO %f29 +#endif + +#define t5 c13 +#define t6 c14 +#define t7 c15 +#define t8 c16 + +#ifndef CONJ +#define FADD1 FADD +#define FADD2 FADD +#define FADD3 FADD +#define FADD4 FSUB +#else + +#if defined(LN) || defined(LT) +#define FADD1 FADD +#define FADD2 FSUB +#define FADD3 FADD +#define FADD4 FADD +#endif + +#if defined(RN) || defined(RT) +#define FADD1 FADD +#define FADD2 FADD +#define FADD3 FSUB +#define FADD4 FADD +#endif +#endif + +#define APREFETCHSIZE 40 +#define BPREFETCHSIZE 40 + +#define APREFETCH_CATEGORY 0 +#define BPREFETCH_CATEGORY 0 + + PROLOGUE + SAVESP + +#ifndef __64BIT__ +#ifdef DOUBLE + ld [%sp + STACK_START + 32], A + ld [%sp + STACK_START + 36], B + ld [%sp + STACK_START + 40], C + ld [%sp + STACK_START + 44], LDC + ld [%sp + STACK_START + 48], OFFSET +#else + ld [%sp + STACK_START + 28], B + ld [%sp + STACK_START + 32], C + ld [%sp + STACK_START + 36], LDC + ld [%sp + STACK_START + 40], OFFSET +#endif +#else + ldx [%sp+ STACK_START + 56], B + ldx [%sp+ STACK_START + 64], C + ldx [%sp+ STACK_START + 72], LDC + ldx [%sp+ STACK_START + 80], OFFSET +#endif + +#ifdef DOUBLE + FCLR(27) +#else + FCLR(29) +#endif + + sll 
LDC, ZBASE_SHIFT, LDC + +#ifdef LN + smul M, K, TEMP1 + sll TEMP1, ZBASE_SHIFT, TEMP1 + add A, TEMP1, A + + sll M, ZBASE_SHIFT, TEMP1 + add C, TEMP1, C +#endif + +#ifdef RN + neg OFFSET, KK +#endif + +#ifdef RT + smul N, K, TEMP1 + sll TEMP1, ZBASE_SHIFT, TEMP1 + add B, TEMP1, B + + smul N, LDC, TEMP1 + add C, TEMP1, C + + sub N, OFFSET, KK +#endif + + sra N, 1, J + cmp J, 0 + ble,pn %icc, .LL100 + nop + +.LL11: +#ifdef RT + sll K, 1 + ZBASE_SHIFT, TEMP1 + sub B, TEMP1, B + + add LDC, LDC, TEMP1 + sub C, TEMP1, C +#endif + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + + sra M, 1, I + mov C, C1 + add C, LDC, C2 + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + cmp I, 0 +#ifndef RT + add C2, LDC, C +#endif + ble,pn %icc, .LL50 + FMOV FZERO, t4 + + +.LL21: +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 1 + ZBASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 1 + ZBASE_SHIFT, TEMP1 + + add AORIG, TEMP1, AO + add B, TEMP1, BO + + sub K, KK, TEMP1 + + sra TEMP1, 2, L + cmp L, 0 +#endif + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + + FMOV FZERO, c01 + FMOV FZERO, c02 + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c03 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, c04 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c05 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, c06 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c07 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, c08 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c09 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, c10 + + LDF [BO + 4 * SIZE], b5 + FMOV FZERO, c11 + LDF [AO + 4 * SIZE], a5 + FMOV FZERO, c12 + + prefetch [C1 + 3 * SIZE], 3 + FMOV FZERO, c13 + prefetch [C2 + 3 * SIZE], 3 + FMOV FZERO, c14 + + FMOV FZERO, c15 + ble,pn %icc, .LL25 + FMOV FZERO, c16 + +.LL22: + FADD2 c04, t1, c04 + prefetch [AO + APREFETCHSIZE * SIZE], APREFETCH_CATEGORY + FMUL a1, b1, t1 + nop + + FADD4 c08, t2, c08 + prefetch [BO + BPREFETCHSIZE * SIZE], BPREFETCH_CATEGORY + FMUL a1, b2, t2 + add AO, 16 * SIZE, AO + + FADD2 c12, t3, c12 + LDF [AO - 13 * SIZE], a4 + FMUL a1, b3, t3 + add BO, 16 * SIZE, BO + + FADD4 c16, t4, c16 + nop + FMUL a1, b4, t4 + LDF [AO - 8 * SIZE], a1 + + FADD1 c01, t1, c01 + nop + FMUL a2, b1, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD1 c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD3 c13, t4, c13 + add L, -1, L + FMUL a2, b4, t4 + LDF [AO - 11 * SIZE], a2 + + FADD2 c02, t1, c02 + nop + FMUL a3, b1, t1 + nop + + FADD4 c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD2 c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD4 c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO - 10 * SIZE], a3 + + FADD1 c03, t1, c03 + nop + FMUL a4, b1, t1 + LDF [BO - 8 * SIZE], b1 + + FADD3 c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO - 11 * SIZE], b2 + + FADD1 c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO - 10 * SIZE], b3 + + FADD3 c15, t4, c15 + nop + FMUL a4, b4, t4 + LDF [BO - 9 * SIZE], b4 + + FADD2 c04, t1, c04 + nop + FMUL a5, b5, t1 + LDF [AO - 9 * SIZE], a4 + + FADD4 c08, t2, c08 + nop + FMUL a5, b2, t2 + nop + + FADD2 c12, t3, c12 + nop + FMUL a5, b3, t3 + nop + + FADD4 c16, t4, c16 + nop + FMUL a5, b4, t4 + LDF [AO - 4 * SIZE], a5 + + FADD1 c01, t1, c01 + nop + FMUL a2, b5, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD1 c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD3 c13, t4, c13 + nop + FMUL a2, b4, t4 + LDF [AO - 7 * SIZE], a2 + + 
FADD2 c02, t1, c02 + nop + FMUL a3, b5, t1 + nop + + FADD4 c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD2 c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD4 c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO - 6 * SIZE], a3 + + FADD1 c03, t1, c03 + nop + FMUL a4, b5, t1 + LDF [BO - 4 * SIZE], b5 + + FADD3 c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO - 7 * SIZE], b2 + + FADD1 c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO - 6 * SIZE], b3 + + FADD3 c15, t4, c15 + nop + FMUL a4, b4, t4 + LDF [BO - 5 * SIZE], b4 + + FADD2 c04, t1, c04 + nop + FMUL a1, b1, t1 + LDF [AO - 5 * SIZE], a4 + + FADD4 c08, t2, c08 + nop + FMUL a1, b2, t2 + nop + + FADD2 c12, t3, c12 + nop + FMUL a1, b3, t3 + nop + + FADD4 c16, t4, c16 + nop + FMUL a1, b4, t4 + LDF [AO - 0 * SIZE], a1 + + FADD1 c01, t1, c01 + nop + FMUL a2, b1, t1 + nop + +#ifdef DOUBLE + prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY +#else + nop +#endif + FADD3 c05, t2, c05 + nop + FMUL a2, b2, t2 + + FADD1 c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD3 c13, t4, c13 + nop + FMUL a2, b4, t4 + nop + + FADD2 c02, t1, c02 + nop + FMUL a3, b1, t1 + LDF [AO - 3 * SIZE], a2 + + FADD4 c06, t2, c06 +#ifdef DOUBLE + prefetch [BO + (BPREFETCHSIZE + 8) * SIZE], BPREFETCH_CATEGORY +#else + nop +#endif + FMUL a3, b2, t2 + nop + + FADD2 c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD4 c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO - 2 * SIZE], a3 + + FADD1 c03, t1, c03 + nop + FMUL a4, b1, t1 + LDF [BO - 0 * SIZE], b1 + + FADD3 c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO - 3 * SIZE], b2 + + FADD1 c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO - 2 * SIZE], b3 + + FADD3 c15, t4, c15 + nop + FMUL a4, b4, t4 + LDF [BO - 1 * SIZE], b4 + + FADD2 c04, t1, c04 + nop + FMUL a5, b5, t1 + LDF [AO - 1 * SIZE], a4 + + FADD4 c08, t2, c08 + FMUL a5, b2, t2 + FADD2 c12, t3, c12 + FMUL a5, b3, t3 + + FADD4 c16, t4, c16 + nop + FMUL a5, b4, t4 + LDF [AO + 4 * SIZE], a5 + + FADD1 c01, t1, c01 + nop + FMUL a2, b5, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD1 c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD3 c13, t4, c13 + nop + FMUL a2, b4, t4 + LDF [AO + 1 * SIZE], a2 + + FADD2 c02, t1, c02 + nop + FMUL a3, b5, t1 + nop + + FADD4 c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD2 c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD4 c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO + 2 * SIZE], a3 + + FADD1 c03, t1, c03 + cmp L, 0 + FMUL a4, b5, t1 + LDF [BO + 4 * SIZE], b5 + + FADD3 c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD1 c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD3 c15, t4, c15 + FMUL a4, b4, t4 + bg,pt %icc, .LL22 + LDF [BO + 3 * SIZE], b4 + +.LL25: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,pn %icc, .LL29 + nop + +.LL26: + FADD2 c04, t1, c04 + LDF [AO + 3 * SIZE], a4 + FMUL a1, b1, t1 + add AO, 4 * SIZE, AO + + FADD4 c08, t2, c08 + add BO, 4 * SIZE, BO + FMUL a1, b2, t2 + add L, -1, L + + FADD2 c12, t3, c12 + nop + FMUL a1, b3, t3 + cmp L, 0 + + FADD4 c16, t4, c16 + nop + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + FADD1 c01, t1, c01 + nop + FMUL a2, b1, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD1 c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD3 c13, t4, c13 + nop + FMUL a2, b4, t4 + LDF [AO + 1 * SIZE], a2 + + FADD2 c02, t1, c02 + nop + FMUL a3, b1, t1 + nop + + FADD4 c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD2 c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + 
FADD4 c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO + 2 * SIZE], a3 + + FADD1 c03, t1, c03 + nop + FMUL a4, b1, t1 + LDF [BO + 0 * SIZE], b1 + + FADD3 c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD1 c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD3 c15, t4, c15 + FMUL a4, b4, t4 + bg,pt %icc, .LL26 + LDF [BO + 3 * SIZE], b4 + +.LL29: +#if defined(LN) || defined(RT) + sub KK, 2, TEMP1 + sll TEMP1, 1 + ZBASE_SHIFT, TEMP1 + add AORIG, TEMP1, AO + add B, TEMP1, BO +#endif + + FADD2 c04, t1, c04 + FADD4 c08, t2, c08 + FADD2 c12, t3, c12 + FADD4 c16, t4, c16 + + FADD c01, c06, c01 + FADD c02, c05, c02 + FADD c03, c08, c03 + FADD c04, c07, c04 + + FADD c09, c14, c09 + FADD c10, c13, c10 + FADD c11, c16, c11 + FADD c12, c15, c12 + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c09, c09 + FSUB a4, c10, c10 + + FSUB b1, c03, c03 + FSUB b2, c04, c04 + FSUB b3, c11, c11 + FSUB b4, c12, c12 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [AO + 4 * SIZE], b1 + LDF [AO + 5 * SIZE], b2 + LDF [AO + 6 * SIZE], b3 + LDF [AO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 + + FSUB b1, c09, c09 + FSUB b2, c10, c10 + FSUB b3, c11, c11 + FSUB b4, c12, c12 +#endif + +#ifdef LN + LDF [AO + 6 * SIZE], a1 + LDF [AO + 7 * SIZE], a2 + LDF [AO + 4 * SIZE], a3 + LDF [AO + 5 * SIZE], a4 + LDF [AO + 0 * SIZE], b1 + LDF [AO + 1 * SIZE], b2 + + FMUL a1, c03, t1 + FMUL a2, c04, t2 + FMUL a1, c04, t3 + FMUL a2, c03, t4 + + FMUL a1, c11, t5 + FMUL a2, c12, t6 + FMUL a1, c12, t7 + FMUL a2, c11, t8 + + FADD4 t1, t2, c03 + FADD2 t3, t4, c04 + FADD4 t5, t6, c11 + FADD2 t7, t8, c12 + + FMUL a3, c03, t1 + FMUL a3, c04, t2 + FMUL a3, c11, t3 + FMUL a3, c12, t4 + + FMUL a4, c04, t5 + FMUL a4, c03, t6 + FMUL a4, c12, t7 + FMUL a4, c11, t8 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + FSUB c09, t3, c09 + FSUB c10, t4, c10 + + FADD2 c01, t5, c01 + FADD4 c02, t6, c02 + FADD2 c09, t7, c09 + FADD4 c10, t8, c10 + + FMUL b1, c01, t1 + FMUL b2, c02, t2 + FMUL b1, c02, t3 + FMUL b2, c01, t4 + + FMUL b1, c09, t5 + FMUL b2, c10, t6 + FMUL b1, c10, t7 + FMUL b2, c09, t8 + + FADD4 t1, t2, c01 + FADD2 t3, t4, c02 + FADD4 t5, t6, c09 + FADD2 t7, t8, c10 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + LDF [AO + 6 * SIZE], b1 + LDF [AO + 7 * SIZE], b2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FMUL a1, c09, t5 + FMUL a2, c10, t6 + FMUL a1, c10, t7 + FMUL a2, c09, t8 + + FADD4 t1, t2, c01 + FADD2 t3, t4, c02 + FADD4 t5, t6, c09 + FADD2 t7, t8, c10 + + FMUL a3, c01, t1 + FMUL a3, c02, t2 + FMUL a3, c09, t3 + FMUL a3, c10, t4 + + FMUL a4, c02, t5 + FMUL a4, c01, t6 + FMUL a4, c10, t7 + FMUL a4, c09, t8 + + FSUB c03, t1, c03 + FSUB c04, t2, c04 + FSUB c11, t3, c11 + FSUB c12, t4, c12 + + FADD2 c03, t5, c03 + FADD4 c04, t6, c04 + FADD2 c11, t7, c11 + FADD4 c12, t8, c12 + + FMUL b1, c03, t1 + FMUL b2, c04, t2 + FMUL b1, c04, t3 + FMUL b2, c03, t4 + + FMUL b1, c11, t5 + FMUL b2, c12, t6 + FMUL b1, c12, t7 + FMUL b2, c11, t8 + + FADD4 t1, t2, c03 + FADD2 t3, t4, c04 + FADD4 t5, t6, c11 + FADD2 t7, t8, c12 +#endif + +#ifdef RN + LDF [BO 
+ 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + LDF [BO + 6 * SIZE], b1 + LDF [BO + 7 * SIZE], b2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FMUL a1, c03, t5 + FMUL a2, c04, t6 + FMUL a1, c04, t7 + FMUL a2, c03, t8 + + FADD4 t1, t2, c01 + FADD3 t3, t4, c02 + FADD4 t5, t6, c03 + FADD3 t7, t8, c04 + + FMUL a3, c01, t1 + FMUL a3, c02, t2 + FMUL a3, c03, t3 + FMUL a3, c04, t4 + + FMUL a4, c02, t5 + FMUL a4, c01, t6 + FMUL a4, c04, t7 + FMUL a4, c03, t8 + + FSUB c09, t1, c09 + FSUB c10, t2, c10 + FSUB c11, t3, c11 + FSUB c12, t4, c12 + + FADD3 c09, t5, c09 + FADD4 c10, t6, c10 + FADD3 c11, t7, c11 + FADD4 c12, t8, c12 + + FMUL b1, c09, t1 + FMUL b2, c10, t2 + FMUL b1, c10, t3 + FMUL b2, c09, t4 + + FMUL b1, c11, t5 + FMUL b2, c12, t6 + FMUL b1, c12, t7 + FMUL b2, c11, t8 + + FADD4 t1, t2, c09 + FADD3 t3, t4, c10 + FADD4 t5, t6, c11 + FADD3 t7, t8, c12 +#endif + +#ifdef RT + LDF [BO + 6 * SIZE], a1 + LDF [BO + 7 * SIZE], a2 + LDF [BO + 4 * SIZE], a3 + LDF [BO + 5 * SIZE], a4 + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + + FMUL a1, c09, t1 + FMUL a2, c10, t2 + FMUL a1, c10, t3 + FMUL a2, c09, t4 + + FMUL a1, c11, t5 + FMUL a2, c12, t6 + FMUL a1, c12, t7 + FMUL a2, c11, t8 + + FADD4 t1, t2, c09 + FADD3 t3, t4, c10 + FADD4 t5, t6, c11 + FADD3 t7, t8, c12 + + FMUL a3, c09, t1 + FMUL a3, c10, t2 + FMUL a3, c11, t3 + FMUL a3, c12, t4 + + FMUL a4, c10, t5 + FMUL a4, c09, t6 + FMUL a4, c12, t7 + FMUL a4, c11, t8 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + FSUB c03, t3, c03 + FSUB c04, t4, c04 + + FADD3 c01, t5, c01 + FADD4 c02, t6, c02 + FADD3 c03, t7, c03 + FADD4 c04, t8, c04 + + FMUL b1, c01, t1 + FMUL b2, c02, t2 + FMUL b1, c02, t3 + FMUL b2, c01, t4 + + FMUL b1, c03, t5 + FMUL b2, c04, t6 + FMUL b1, c04, t7 + FMUL b2, c03, t8 + + FADD4 t1, t2, c01 + FADD3 t3, t4, c02 + FADD4 t5, t6, c03 + FADD3 t7, t8, c04 +#endif + +#ifdef LN + add C1, -4 * SIZE, C1 + add C2, -4 * SIZE, C2 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] + STF c09, [BO + 2 * SIZE] + STF c10, [BO + 3 * SIZE] + + STF c03, [BO + 4 * SIZE] + STF c04, [BO + 5 * SIZE] + STF c11, [BO + 6 * SIZE] + STF c12, [BO + 7 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] + + STF c09, [AO + 4 * SIZE] + STF c10, [AO + 5 * SIZE] + STF c11, [AO + 6 * SIZE] + STF c12, [AO + 7 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C1 + 2 * SIZE] + STF c04, [C1 + 3 * SIZE] + + STF c09, [C2 + 0 * SIZE] + STF c10, [C2 + 1 * SIZE] + STF c11, [C2 + 2 * SIZE] + STF c12, [C2 + 3 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 4 * SIZE, C1 + add C2, 4 * SIZE, C2 +#endif + +#ifdef RT + sll K, 1 + ZBASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 1 + ZBASE_SHIFT, TEMP1 + add AO, TEMP1, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 2, KK +#endif + +#ifdef LN + sub KK, 2, KK +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL21 + FMOV FZERO, c01 + +.LL50: + and M, 1, I + FMOV FZERO, c02 + cmp I, 0 + FMOV FZERO, t1 + ble,pn %icc, .LL99 + FMOV FZERO, c04 + +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 0 + ZBASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 0 + ZBASE_SHIFT, TEMP1 + sll KK, 1 + ZBASE_SHIFT, TEMP2 + + add AORIG, 
TEMP1, AO + add B, TEMP2, BO + + sub K, KK, TEMP1 + + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, t2 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, c06 + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, t3 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, c08 + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, t4 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, c01 + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c03 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, c05 + + ble,pn %icc, .LL55 + FMOV FZERO, c07 + +.LL52: + FADD2 c02, t1, c02 + add AO, 8 * SIZE, AO + prefetch [AO + APREFETCHSIZE * SIZE], 0 + + FMUL a1, b1, t1 + add BO, 16 * SIZE, BO + + FADD4 c04, t2, c04 + add L, -1, L + FMUL a1, b2, t2 + + FADD2 c06, t3, c06 + cmp L, 0 + FMUL a1, b3, t3 + + FADD4 c08, t4, c08 + FMUL a1, b4, t4 + LDF [AO - 4 * SIZE], a1 + + FADD1 c01, t1, c01 + FMUL a2, b1, t1 + LDF [BO - 12 * SIZE], b1 + FADD3 c03, t2, c03 + FMUL a2, b2, t2 + LDF [BO - 11 * SIZE], b2 + + FADD1 c05, t3, c05 + FMUL a2, b3, t3 + LDF [BO - 10 * SIZE], b3 + FADD3 c07, t4, c07 + FMUL a2, b4, t4 + LDF [BO - 9 * SIZE], b4 + + FADD2 c02, t1, c02 + FMUL a3, b1, t1 + LDF [AO - 3 * SIZE], a2 + FADD4 c04, t2, c04 + FMUL a3, b2, t2 + + FADD2 c06, t3, c06 + FMUL a3, b3, t3 + FADD4 c08, t4, c08 + FMUL a3, b4, t4 + LDF [AO - 2 * SIZE], a3 + + FADD1 c01, t1, c01 + FMUL a4, b1, t1 + LDF [BO - 8 * SIZE], b1 + FADD3 c03, t2, c03 + FMUL a4, b2, t2 + LDF [BO - 7 * SIZE], b2 + + FADD1 c05, t3, c05 + FMUL a4, b3, t3 + LDF [BO - 6 * SIZE], b3 + FADD3 c07, t4, c07 + FMUL a4, b4, t4 + LDF [BO - 5 * SIZE], b4 + + FADD2 c02, t1, c02 + FMUL a1, b1, t1 + LDF [AO - 1 * SIZE], a4 + FADD4 c04, t2, c04 + FMUL a1, b2, t2 + + FADD2 c06, t3, c06 + FMUL a1, b3, t3 + FADD4 c08, t4, c08 + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + FADD1 c01, t1, c01 + FMUL a2, b1, t1 + LDF [BO - 4 * SIZE], b1 + + FADD3 c03, t2, c03 + FMUL a2, b2, t2 + LDF [BO - 3 * SIZE], b2 + + FADD1 c05, t3, c05 + FMUL a2, b3, t3 + LDF [BO - 2 * SIZE], b3 + FADD3 c07, t4, c07 + FMUL a2, b4, t4 + LDF [BO - 1 * SIZE], b4 + + FADD2 c02, t1, c02 + FMUL a3, b1, t1 + LDF [AO + 1 * SIZE], a2 + FADD4 c04, t2, c04 + FMUL a3, b2, t2 + + FADD2 c06, t3, c06 + FMUL a3, b3, t3 + FADD4 c08, t4, c08 + FMUL a3, b4, t4 + LDF [AO + 2 * SIZE], a3 + + FADD1 c01, t1, c01 + FMUL a4, b1, t1 + LDF [BO + 0 * SIZE], b1 + FADD3 c03, t2, c03 + FMUL a4, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD1 c05, t3, c05 + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + FADD3 c07, t4, c07 + FMUL a4, b4, t4 + LDF [BO + 3 * SIZE], b4 + + bg,pt %icc, .LL52 + LDF [AO + 3 * SIZE], a4 + +.LL55: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + FADD2 c02, t1, c02 + add AO, 2 * SIZE, AO + FMUL a1, b1, t1 + add L, -1, L + + add BO, 4 * SIZE, BO + FADD4 c04, t2, c04 + cmp L, 0 + FMUL a1, b2, t2 + + FADD2 c06, t3, c06 + FMUL a1, b3, t3 + FADD4 c08, t4, c08 + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + FADD1 c01, t1, c01 + FMUL a2, b1, t1 + LDF [BO + 0 * SIZE], b1 + FADD3 c03, t2, c03 + FMUL a2, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD1 c05, t3, c05 + FMUL a2, b3, t3 + LDF [BO + 2 * SIZE], b3 + FADD3 c07, t4, c07 + FMUL a2, b4, t4 + LDF [BO + 3 * SIZE], b4 + + bg,pt %icc, .LL56 + LDF [AO + 1 * SIZE], a2 + +.LL59: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 1, TEMP1 +#else + sub KK, 2, TEMP1 +#endif + sll TEMP1, 0 + ZBASE_SHIFT, TEMP2 + sll TEMP1, 1 + ZBASE_SHIFT, TEMP1 + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + + FADD2 c02, t1, c02 + FADD4 c04, t2, c04 + FADD2 c06, t3, c06 + FADD4 
c08, t4, c08 + + FADD c01, c04, c01 + FADD c02, c03, c02 + FADD c05, c08, c05 + FADD c06, c07, c06 + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c05, c05 + FSUB a4, c06, c06 + +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c05, c05 + FSUB a4, c06, c06 +#endif + +#ifdef LN + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FMUL a1, c05, t5 + FMUL a2, c06, t6 + FMUL a1, c06, t7 + FMUL a2, c05, t8 + + FADD4 t1, t2, c01 + FADD2 t3, t4, c02 + FADD4 t5, t6, c05 + FADD2 t7, t8, c06 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FMUL a1, c05, t5 + FMUL a2, c06, t6 + FMUL a1, c06, t7 + FMUL a2, c05, t8 + + FADD4 t1, t2, c01 + FADD2 t3, t4, c02 + FADD4 t5, t6, c05 + FADD2 t7, t8, c06 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + LDF [BO + 6 * SIZE], b1 + LDF [BO + 7 * SIZE], b2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FADD4 t1, t2, c01 + FADD3 t3, t4, c02 + + FMUL a3, c01, t1 + FMUL a3, c02, t2 + FMUL a4, c02, t3 + FMUL a4, c01, t4 + + FSUB c05, t1, c05 + FSUB c06, t2, c06 + FADD3 c05, t3, c05 + FADD4 c06, t4, c06 + + FMUL b1, c05, t1 + FMUL b2, c06, t2 + FMUL b1, c06, t3 + FMUL b2, c05, t4 + + FADD4 t1, t2, c05 + FADD3 t3, t4, c06 +#endif + +#ifdef RT + LDF [BO + 6 * SIZE], a1 + LDF [BO + 7 * SIZE], a2 + LDF [BO + 4 * SIZE], a3 + LDF [BO + 5 * SIZE], a4 + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + + FMUL a1, c05, t1 + FMUL a2, c06, t2 + FMUL a1, c06, t3 + FMUL a2, c05, t4 + + FADD4 t1, t2, c05 + FADD3 t3, t4, c06 + + FMUL a3, c05, t1 + FMUL a3, c06, t2 + FMUL a4, c06, t3 + FMUL a4, c05, t4 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + FADD3 c01, t3, c01 + FADD4 c02, t4, c02 + + FMUL b1, c01, t1 + FMUL b2, c02, t2 + FMUL b1, c02, t3 + FMUL b2, c01, t4 + + FADD4 t1, t2, c01 + FADD3 t3, t4, c02 +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 + add C2, -2 * SIZE, C2 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] + STF c05, [BO + 2 * SIZE] + STF c06, [BO + 3 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c05, [AO + 2 * SIZE] + STF c06, [AO + 3 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c05, [C2 + 0 * SIZE] + STF c06, [C2 + 1 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 2 * SIZE, C1 + add C2, 2 * SIZE, C2 +#endif + +#ifdef RT + sll K, 0 + ZBASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 0 + ZBASE_SHIFT, TEMP2 + sll TEMP1, 1 + ZBASE_SHIFT, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + +.LL99: +#ifdef LN + sll K, 1 + ZBASE_SHIFT, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 2, KK +#endif + +#ifdef RT + sub KK, 2, KK +#endif + + add J, -1, J + cmp J, 0 + bg,pt %icc, .LL11 + nop + +.LL100: + and N, 1, J + + cmp J, 0 + ble,pn %icc, .LL999 + nop + +#ifdef RT + 
sll K, 0 + ZBASE_SHIFT, TEMP1 + sub B, TEMP1, B + + sub C, LDC, C +#endif + + mov C, C1 + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + +#ifndef RT + add C, LDC, C +#endif + sra M, 1, I + cmp I, 0 + ble,pn %icc, .LL150 + FMOV FZERO, c03 + +.LL121: +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 1 + ZBASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 1 + ZBASE_SHIFT, TEMP1 + sll KK, 0 + ZBASE_SHIFT, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO + + sub K, KK, TEMP1 + sra TEMP1, 2, L + cmp L, 0 +#endif + + FMOV FZERO, c03 + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, t1 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, c07 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, t2 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, c04 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, t3 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, c08 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, t4 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, c01 + + prefetch [C1 + 3 * SIZE], 3 + FMOV FZERO, c05 + FMOV FZERO, c02 + + ble,pn %icc, .LL125 + FMOV FZERO, c06 + +.LL122: + FADD1 c03, t1, c03 + add L, -1, L + FMUL a1, b1, t1 + prefetch [AO + APREFETCHSIZE * SIZE], 0 + + FADD3 c07, t2, c07 + add BO, 8 * SIZE, BO + FMUL a1, b2, t2 + LDF [AO + 4 * SIZE], a1 + + FADD2 c04, t3, c04 + add AO, 16 * SIZE, AO + FMUL a2, b1, t3 + cmp L, 0 + + FADD4 c08, t4, c08 + nop + FMUL a2, b2, t4 + LDF [AO - 11 * SIZE], a2 + + FADD1 c01, t1, c01 + nop + FMUL a3, b1, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a3, b2, t2 + LDF [AO - 10 * SIZE], a3 + + FADD2 c02, t3, c02 + nop + FMUL a4, b1, t3 + LDF [BO - 4 * SIZE], b1 + + FADD4 c06, t4, c06 + nop + FMUL a4, b2, t4 + LDF [BO - 3 * SIZE], b2 + + FADD1 c03, t1, c03 + nop + FMUL a1, b3, t1 + LDF [AO - 9 * SIZE], a4 + + FADD3 c07, t2, c07 + nop + FMUL a1, b4, t2 + LDF [AO - 8 * SIZE], a1 + + FADD2 c04, t3, c04 + nop + FMUL a2, b3, t3 + nop + + FADD4 c08, t4, c08 + nop + FMUL a2, b4, t4 + LDF [AO - 7 * SIZE], a2 + + FADD1 c01, t1, c01 + nop + FMUL a3, b3, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a3, b4, t2 + LDF [AO - 6 * SIZE], a3 + + FADD2 c02, t3, c02 + nop + FMUL a4, b3, t3 + LDF [BO - 2 * SIZE], b3 + + FADD4 c06, t4, c06 + nop + FMUL a4, b4, t4 + LDF [BO - 1 * SIZE], b4 + + FADD1 c03, t1, c03 + nop + FMUL a1, b1, t1 + LDF [AO - 5 * SIZE], a4 + + FADD3 c07, t2, c07 + nop + FMUL a1, b2, t2 + LDF [AO - 4 * SIZE], a1 + + FADD2 c04, t3, c04 + nop + FMUL a2, b1, t3 + nop + + FADD4 c08, t4, c08 + nop + FMUL a2, b2, t4 + LDF [AO - 3 * SIZE], a2 + + FADD1 c01, t1, c01 + nop + FMUL a3, b1, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a3, b2, t2 + LDF [AO - 2 * SIZE], a3 + + FADD2 c02, t3, c02 + nop + FMUL a4, b1, t3 + LDF [BO + 0 * SIZE], b1 + + FADD4 c06, t4, c06 + nop + FMUL a4, b2, t4 + LDF [BO + 1 * SIZE], b2 + + FADD1 c03, t1, c03 + nop + FMUL a1, b3, t1 + LDF [AO - 1 * SIZE], a4 + + FADD3 c07, t2, c07 + nop + FMUL a1, b4, t2 + LDF [AO + 0 * SIZE], a1 + + FADD2 c04, t3, c04 + nop + FMUL a2, b3, t3 + nop + + FADD4 c08, t4, c08 + nop + FMUL a2, b4, t4 + LDF [AO + 1 * SIZE], a2 + + FADD1 c01, t1, c01 + nop + FMUL a3, b3, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a3, b4, t2 + LDF [AO + 2 * SIZE], a3 + + FADD2 c02, t3, c02 + nop + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD4 c06, t4, c06 + FMUL a4, b4, t4 + LDF [AO + 3 * SIZE], a4 + + bg,pt %icc, .LL122 + LDF [BO + 3 * SIZE], b4 + +.LL125: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L 
+#endif + cmp L, 0 + ble,a,pn %icc, .LL129 + nop + +.LL126: + FADD1 c03, t1, c03 + add AO, 4 * SIZE, AO + FMUL a1, b1, t1 + add BO, 2 * SIZE, BO + + FADD3 c07, t2, c07 + add L, -1, L + FMUL a1, b2, t2 + LDF [AO + 0 * SIZE], a1 + + FADD2 c04, t3, c04 + cmp L, 0 + FMUL a2, b1, t3 + + FADD4 c08, t4, c08 + FMUL a2, b2, t4 + LDF [AO + 1 * SIZE], a2 + + FADD1 c01, t1, c01 + FMUL a3, b1, t1 + FADD3 c05, t2, c05 + FMUL a3, b2, t2 + LDF [AO + 2 * SIZE], a3 + + FADD2 c02, t3, c02 + FMUL a4, b1, t3 + LDF [BO + 0 * SIZE], b1 + FADD4 c06, t4, c06 + FMUL a4, b2, t4 + LDF [BO + 1 * SIZE], b2 + bg,pt %icc, .LL126 + LDF [AO + 3 * SIZE], a4 + +.LL129: + FADD1 c03, t1, c03 + FADD3 c07, t2, c07 + FADD2 c04, t3, c04 + FADD4 c08, t4, c08 + + FADD c01, c06, c01 + FADD c02, c05, c02 + FADD c03, c08, c03 + FADD c04, c07, c04 + +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 2, TEMP1 +#else + sub KK, 1, TEMP1 +#endif + sll TEMP1, 1 + ZBASE_SHIFT, TEMP2 + sll TEMP1, 0 + ZBASE_SHIFT, TEMP1 + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 +#endif + +#ifdef LN + LDF [AO + 6 * SIZE], a1 + LDF [AO + 7 * SIZE], a2 + LDF [AO + 4 * SIZE], a3 + LDF [AO + 5 * SIZE], a4 + LDF [AO + 0 * SIZE], b1 + LDF [AO + 1 * SIZE], b2 + + FMUL a1, c03, t1 + FMUL a2, c04, t2 + FMUL a1, c04, t3 + FMUL a2, c03, t4 + + FADD4 t1, t2, c03 + FADD2 t3, t4, c04 + + FMUL a3, c03, t1 + FMUL a3, c04, t2 + + FMUL a4, c04, t5 + FMUL a4, c03, t6 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + + FADD2 c01, t5, c01 + FADD4 c02, t6, c02 + + FMUL b1, c01, t1 + FMUL b2, c02, t2 + FMUL b1, c02, t3 + FMUL b2, c01, t4 + + FADD4 t1, t2, c01 + FADD2 t3, t4, c02 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + LDF [AO + 6 * SIZE], b1 + LDF [AO + 7 * SIZE], b2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FADD4 t1, t2, c01 + FADD2 t3, t4, c02 + + FMUL a3, c01, t1 + FMUL a3, c02, t2 + FMUL a4, c02, t5 + FMUL a4, c01, t6 + + FSUB c03, t1, c03 + FSUB c04, t2, c04 + FADD2 c03, t5, c03 + FADD4 c04, t6, c04 + + FMUL b1, c03, t1 + FMUL b2, c04, t2 + FMUL b1, c04, t3 + FMUL b2, c03, t4 + + FADD4 t1, t2, c03 + FADD2 t3, t4, c04 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FMUL a1, c03, t5 + FMUL a2, c04, t6 + FMUL a1, c04, t7 + FMUL a2, c03, t8 + + FADD4 t1, t2, c01 + FADD3 t3, t4, c02 + FADD4 t5, t6, c03 + FADD3 t7, t8, c04 +#endif + +#ifdef RT + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FMUL a1, c03, t5 + FMUL a2, c04, t6 + FMUL a1, c04, t7 + FMUL a2, c03, t8 + + FADD4 t1, t2, c01 + FADD3 t3, t4, c02 + FADD4 t5, t6, c03 + FADD3 t7, t8, c04 +#endif + +#ifdef LN + add C1, -4 * SIZE, C1 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] + STF c03, [BO + 2 * SIZE] + STF c04, [BO + 3 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] +#endif + + STF 
c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C1 + 2 * SIZE] + STF c04, [C1 + 3 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 4 * SIZE, C1 +#endif + +#ifdef RT + sll K, 1 + ZBASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 1 + ZBASE_SHIFT, TEMP2 + sll TEMP1, 0 + ZBASE_SHIFT, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 2, KK +#endif + +#ifdef LN + sub KK, 2, KK +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL121 + FMOV FZERO, c03 + +.LL150: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL199 + nop + +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 0 + ZBASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 0 + ZBASE_SHIFT, TEMP1 + add AORIG, TEMP1, AO + add B, TEMP1, BO + + sub K, KK, TEMP1 + + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c01 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t1 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c02 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t2 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, t3 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, t4 + + ble,pn %icc, .LL155 + nop + +.LL152: + FADD1 c01, t1, c01 + add L, -1, L + FMUL a1, b1, t1 + prefetch [AO + APREFETCHSIZE * SIZE], 0 + + FADD3 c02, t2, c02 + add BO, 8 * SIZE, BO + FMUL a1, b2, t2 + LDF [AO + 4 * SIZE], a1 + + FADD2 c03, t3, c03 + cmp L, 0 + FMUL a2, b1, t3 + LDF [BO - 4 * SIZE], b1 + + FADD4 c04, t4, c04 + nop + FMUL a2, b2, t4 + LDF [AO + 5 * SIZE], a2 + + FADD1 c01, t1, c01 + nop + FMUL a3, b3, t1 + LDF [BO - 3 * SIZE], b2 + + FADD3 c02, t2, c02 + nop + FMUL a3, b4, t2 + LDF [AO + 6 * SIZE], a3 + + FADD2 c03, t3, c03 + nop + FMUL a4, b3, t3 + LDF [BO - 2 * SIZE], b3 + + FADD4 c04, t4, c04 + nop + FMUL a4, b4, t4 + LDF [AO + 7 * SIZE], a4 + + FADD1 c01, t1, c01 + nop + FMUL a1, b1, t1 + LDF [BO - 1 * SIZE], b4 + + FADD3 c02, t2, c02 + FMUL a1, b2, t2 + LDF [AO + 8 * SIZE], a1 + + FADD2 c03, t3, c03 + FMUL a2, b1, t3 + LDF [BO + 0 * SIZE], b1 + + FADD4 c04, t4, c04 + FMUL a2, b2, t4 + LDF [AO + 9 * SIZE], a2 + + FADD1 c01, t1, c01 + FMUL a3, b3, t1 + LDF [BO + 1 * SIZE], b2 + + FADD3 c02, t2, c02 + FMUL a3, b4, t2 + LDF [AO + 10 * SIZE], a3 + + FADD2 c03, t3, c03 + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD4 c04, t4, c04 + FMUL a4, b4, t4 + LDF [AO + 11 * SIZE], a4 + + add AO, 8 * SIZE, AO + bg,pt %icc, .LL152 + LDF [BO + 3 * SIZE], b4 + +.LL155: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL159 + nop + +.LL156: + FADD1 c01, t1, c01 + add AO, 2 * SIZE, AO + FMUL a1, b1, t1 + add BO, 2 * SIZE, BO + FADD3 c02, t2, c02 + add L, -1, L + FMUL a1, b2, t2 + LDF [AO + 0 * SIZE], a1 + FADD2 c03, t3, c03 + FMUL a2, b1, t3 + LDF [BO + 0 * SIZE], b1 + cmp L, 0 + FADD4 c04, t4, c04 + FMUL a2, b2, t4 + LDF [BO + 1 * SIZE], b2 + + bg,pt %icc, .LL156 + LDF [AO + 1 * SIZE], a2 + +.LL159: + FADD1 c01, t1, c01 + FADD3 c02, t2, c02 + FADD2 c03, t3, c03 + FADD4 c04, t4, c04 + + FADD c01, c04, c01 + FADD c02, c03, c02 + +#if defined(LN) || defined(RT) + sub KK, 1, TEMP1 + + sll TEMP1, 0 + ZBASE_SHIFT, TEMP1 + add AORIG, TEMP1, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 +#else + LDF [AO + 0 * SIZE], a1 + 
LDF [AO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 +#endif + +#ifdef LN + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FADD4 t1, t2, c01 + FADD2 t3, t4, c02 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FADD4 t1, t2, c01 + FADD2 t3, t4, c02 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FADD4 t1, t2, c01 + FADD3 t3, t4, c02 +#endif + +#ifdef RT + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FADD4 t1, t2, c01 + FADD3 t3, t4, c02 +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 2 * SIZE, C1 +#endif + +#ifdef RT + sll K, 0 + ZBASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 0 + ZBASE_SHIFT, TEMP1 + add AO, TEMP1, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + +.LL199: +#ifdef LN + sll K, 0 + ZBASE_SHIFT, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 1, KK +#endif + +#ifdef RT + sub KK, 1, KK +#endif + +.LL999: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/ztrsm_kernel_LT_1x4.S b/kernel/sparc/ztrsm_kernel_LT_1x4.S new file mode 100644 index 0000000000..f7d9e38ed7 --- /dev/null +++ b/kernel/sparc/ztrsm_kernel_LT_1x4.S @@ -0,0 +1,2131 @@ +/*********************************************************************/ +/* Copyright 2005-2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define APREFETCHSIZE 24 +#define APREFETCH_CATEGORY 0 + +#define M %i0 +#define N %i1 +#define K %i2 +#define A %i5 +#define B %i3 +#define C %i4 + +#define LDC %o0 +#define AO %o1 +#define BO %o2 +#define I %o3 +#define J %o4 +#define L %o5 + +#define C1 %l0 +#define C2 %l1 +#define C3 %l2 +#define C4 %l3 + +#define OFFSET %l4 +#define KK %l5 +#define TEMP1 %l6 +#define TEMP2 %l7 +#define AORIG %o7 + +#ifdef DOUBLE +#define c01 %f0 +#define c02 %f2 +#define c03 %f4 +#define c04 %f6 +#define c05 %f8 +#define c06 %f10 +#define c07 %f12 +#define c08 %f14 +#define c09 %f16 +#define c10 %f18 +#define c11 %f20 +#define c12 %f22 +#define c13 %f24 +#define c14 %f26 +#define c15 %f28 +#define c16 %f30 + +#define a1 %f32 +#define a2 %f34 +#define a3 %f36 +#define a4 %f38 +#define a5 %f40 + +#define b1 %f42 +#define b2 %f44 +#define b3 %f46 +#define b4 %f48 +#define b5 %f50 +#define b6 %f52 +#define b7 %f54 +#define b8 %f56 +#define b9 %f58 + +#define cc01 0 +#define cc02 2 +#define cc03 4 +#define cc04 6 +#define cc05 8 +#define cc06 10 +#define cc07 12 +#define cc08 14 +#define cc09 16 +#define cc10 18 +#define cc11 20 +#define cc12 22 +#define cc13 24 +#define cc14 26 +#define cc15 28 +#define cc16 30 + +#define aa1 1 +#define aa2 3 +#define aa3 5 +#define aa4 7 +#define aa5 9 + +#define bb1 11 +#define bb2 13 +#define bb3 15 +#define bb4 17 +#define bb5 19 +#define bb6 21 +#define bb7 23 +#define bb8 25 +#define bb9 27 +#else +#define c01 %f0 +#define c02 %f1 +#define c03 %f2 +#define c04 %f3 +#define c05 %f4 +#define c06 %f5 +#define c07 %f6 +#define c08 %f7 +#define c09 %f8 +#define c10 %f9 +#define c11 %f10 +#define c12 %f11 +#define c13 %f12 +#define c14 %f13 +#define c15 %f14 +#define c16 %f15 + +#define a1 %f16 +#define a2 %f17 +#define a3 %f18 +#define a4 %f19 +#define a5 %f20 + +#define b1 %f21 +#define b2 %f22 +#define b3 %f23 +#define b4 %f24 +#define b5 %f25 +#define b6 %f26 +#define b7 %f27 +#define b8 %f28 +#define b9 %f29 + +#define cc01 0 +#define cc02 1 +#define cc03 2 +#define cc04 3 +#define cc05 4 +#define cc06 5 +#define cc07 6 +#define cc08 7 +#define cc09 8 +#define cc10 9 +#define cc11 10 +#define cc12 11 +#define cc13 12 +#define cc14 13 +#define cc15 14 +#define cc16 15 + +#define aa1 16 +#define aa2 17 +#define aa3 18 +#define aa4 19 +#define aa5 20 + +#define bb1 21 +#define bb2 22 +#define bb3 23 +#define bb4 24 +#define bb5 25 +#define bb6 26 +#define bb7 27 +#define bb8 28 +#define bb9 29 +#endif + +#ifndef CONJ +#define FMADD1 FMADD +#define FMADD2 FMADD +#define FMADD3 FMADD +#define FMADD4 FNMSUB +#else 
+#if defined(LN) || defined(LT) +#define FMADD1 FMADD +#define FMADD2 FNMSUB +#define FMADD3 FMADD +#define FMADD4 FMADD +#endif +#if defined(RN) || defined(RT) +#define FMADD1 FMADD +#define FMADD2 FMADD +#define FMADD3 FNMSUB +#define FMADD4 FMADD +#endif +#endif + + .register %g2, #scratch + .register %g3, #scratch + + PROLOGUE + SAVESP + +#ifndef __64BIT__ +#ifdef DOUBLE + ld [%sp + STACK_START + 32], A + ld [%sp + STACK_START + 36], B + ld [%sp + STACK_START + 40], C + ld [%sp + STACK_START + 44], LDC + ld [%sp + STACK_START + 48], OFFSET +#else + ld [%sp + STACK_START + 28], B + ld [%sp + STACK_START + 32], C + ld [%sp + STACK_START + 36], LDC + ld [%sp + STACK_START + 40], OFFSET +#endif +#else + ldx [%sp + STACK_START + 56], B + ldx [%sp + STACK_START + 64], C + ldx [%sp + STACK_START + 72], LDC + ldx [%sp + STACK_START + 80], OFFSET +#endif + + cmp M, 0 + ble,pn %icc, .LL999 + nop + + sll LDC, ZBASE_SHIFT, LDC + +#ifdef LN + smul M, K, TEMP1 + sll TEMP1, ZBASE_SHIFT, TEMP1 + add A, TEMP1, A + + sll M, ZBASE_SHIFT, TEMP1 + add C, TEMP1, C +#endif + +#ifdef RN + neg OFFSET, KK +#endif + +#ifdef RT + smul N, K, TEMP1 + sll TEMP1, ZBASE_SHIFT, TEMP1 + add B, TEMP1, B + + smul N, LDC, TEMP1 + add C, TEMP1, C + + sub N, OFFSET, KK +#endif + + sra N, 2, J + cmp J, 0 + ble,pn %icc, .LL20 + nop + .align 4 + +.LL11: +#ifdef RT + sll K, ZBASE_SHIFT + 2, TEMP1 + sub B, TEMP1, B +#endif + +#ifndef RT + mov C, C1 + add C, LDC, C2 + add C2, LDC, C3 + add C3, LDC, C4 + add C4, LDC, C +#else + sub C, LDC, C4 + sub C4, LDC, C3 + sub C3, LDC, C2 + sub C2, LDC, C1 + sub C2, LDC, C +#endif + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + mov M, I + .align 4 + +.LL12: +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, ZBASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT + 0, TEMP1 + sll KK, ZBASE_SHIFT + 2, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + FCLR (cc01) + LDF [AO + 1 * SIZE], a2 + FCLR (cc05) + LDF [AO + 8 * SIZE], a5 + FCLR (cc09) + LDF [BO + 0 * SIZE], b1 + FCLR (cc13) + + LDF [BO + 1 * SIZE], b2 + FCLR (cc02) + LDF [BO + 2 * SIZE], b3 + FCLR (cc06) + LDF [BO + 3 * SIZE], b4 + FCLR (cc10) + LDF [BO + 4 * SIZE], b5 + FCLR (cc14) + + LDF [BO + 5 * SIZE], b6 + FCLR (cc03) + LDF [BO + 6 * SIZE], b7 + FCLR (cc07) + LDF [BO + 7 * SIZE], b8 + FCLR (cc11) + LDF [BO + 8 * SIZE], b9 + FCLR (cc15) + + prefetch [C1 + 1 * SIZE], 3 + FCLR (cc04) + prefetch [C2 + 2 * SIZE], 3 + FCLR (cc08) + prefetch [C3 + 1 * SIZE], 3 + FCLR (cc12) + prefetch [C4 + 2 * SIZE], 3 + FCLR (cc16) + +#if defined(LT) || defined(RN) + sra KK, 3, L +#else + sub K, KK, L + sra L, 3, L +#endif + cmp L, 0 + ble,pn %icc, .LL15 + nop + .align 4 + +.LL13: + FMADD1 (aa1, bb1, cc01, cc01) + FMADD2 (aa2, bb1, cc02, cc02) + FMADD3 (aa1, bb2, cc03, cc03) + FMADD4 (aa2, bb2, cc04, cc04) + + FMADD1 (aa1, bb3, cc05, cc05) + LDF [BO + 16 * SIZE], b1 + FMADD2 (aa2, bb3, cc06, cc06) + LDF [BO + 9 * SIZE], b2 + + FMADD3 (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD1 (aa1, bb5, cc09, cc09) + LDF [AO + 2 * SIZE], a3 + FMADD2 (aa2, bb5, cc10, cc10) + LDF [AO + 3 * SIZE], a4 + + FMADD3 (aa1, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + FMADD4 (aa2, bb6, cc12, cc12) + nop + + FMADD1 (aa1, bb7, cc13, cc13) + LDF [BO + 12 * SIZE], b5 + FMADD2 (aa2, 
bb7, cc14, cc14) + LDF [BO + 13 * SIZE], b6 + + FMADD3 (aa1, bb8, cc15, cc15) + LDF [BO + 14 * SIZE], b7 + FMADD4 (aa2, bb8, cc16, cc16) + LDF [BO + 15 * SIZE], b8 + + FMADD1 (aa3, bb9, cc01, cc01) + FMADD2 (aa4, bb9, cc02, cc02) + FMADD3 (aa3, bb2, cc03, cc03) + FMADD4 (aa4, bb2, cc04, cc04) + + FMADD1 (aa3, bb3, cc05, cc05) + LDF [BO + 24 * SIZE], b9 + FMADD2 (aa4, bb3, cc06, cc06) + LDF [BO + 17 * SIZE], b2 + + FMADD3 (aa3, bb4, cc07, cc07) + LDF [BO + 18 * SIZE], b3 + FMADD4 (aa4, bb4, cc08, cc08) + LDF [BO + 19 * SIZE], b4 + + FMADD1 (aa3, bb5, cc09, cc09) + LDF [AO + 4 * SIZE], a1 + FMADD2 (aa4, bb5, cc10, cc10) + LDF [AO + 5 * SIZE], a2 + + FMADD3 (aa3, bb6, cc11, cc11) + add L, -1, L + FMADD4 (aa4, bb6, cc12, cc12) + nop + + FMADD1 (aa3, bb7, cc13, cc13) + LDF [BO + 20 * SIZE], b5 + FMADD2 (aa4, bb7, cc14, cc14) + LDF [BO + 21 * SIZE], b6 + + FMADD3 (aa3, bb8, cc15, cc15) + LDF [BO + 22 * SIZE], b7 + FMADD4 (aa4, bb8, cc16, cc16) + LDF [BO + 23 * SIZE], b8 + + FMADD1 (aa1, bb1, cc01, cc01) + FMADD2 (aa2, bb1, cc02, cc02) + FMADD3 (aa1, bb2, cc03, cc03) + FMADD4 (aa2, bb2, cc04, cc04) + + FMADD1 (aa1, bb3, cc05, cc05) + LDF [BO + 32 * SIZE], b1 + FMADD2 (aa2, bb3, cc06, cc06) + LDF [BO + 25 * SIZE], b2 + + FMADD3 (aa1, bb4, cc07, cc07) + LDF [BO + 26 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 27 * SIZE], b4 + + FMADD1 (aa1, bb5, cc09, cc09) + LDF [AO + 6 * SIZE], a3 + FMADD2 (aa2, bb5, cc10, cc10) + LDF [AO + 7 * SIZE], a4 + + FMADD3 (aa1, bb6, cc11, cc11) + nop + FMADD4 (aa2, bb6, cc12, cc12) + nop + + FMADD1 (aa1, bb7, cc13, cc13) + LDF [BO + 28 * SIZE], b5 + FMADD2 (aa2, bb7, cc14, cc14) + LDF [BO + 29 * SIZE], b6 + + FMADD3 (aa1, bb8, cc15, cc15) + LDF [BO + 30 * SIZE], b7 + FMADD4 (aa2, bb8, cc16, cc16) + LDF [BO + 31 * SIZE], b8 + + FMADD1 (aa3, bb9, cc01, cc01) + FMADD2 (aa4, bb9, cc02, cc02) + FMADD3 (aa3, bb2, cc03, cc03) + FMADD4 (aa4, bb2, cc04, cc04) + + FMADD1 (aa3, bb3, cc05, cc05) + LDF [BO + 40 * SIZE], b9 + FMADD2 (aa4, bb3, cc06, cc06) + LDF [BO + 33 * SIZE], b2 + + FMADD3 (aa3, bb4, cc07, cc07) + LDF [BO + 34 * SIZE], b3 + FMADD4 (aa4, bb4, cc08, cc08) + LDF [BO + 35 * SIZE], b4 + + FMADD1 (aa3, bb5, cc09, cc09) + LDF [AO + 16 * SIZE], a1 /****/ + FMADD2 (aa4, bb5, cc10, cc10) + LDF [AO + 9 * SIZE], a2 + + FMADD3 (aa3, bb6, cc11, cc11) + nop + FMADD4 (aa4, bb6, cc12, cc12) + nop + + FMADD1 (aa3, bb7, cc13, cc13) + LDF [BO + 36 * SIZE], b5 + FMADD2 (aa4, bb7, cc14, cc14) + LDF [BO + 37 * SIZE], b6 + + FMADD3 (aa3, bb8, cc15, cc15) + LDF [BO + 38 * SIZE], b7 + FMADD4 (aa4, bb8, cc16, cc16) + LDF [BO + 39 * SIZE], b8 + + FMADD1 (aa5, bb1, cc01, cc01) + FMADD2 (aa2, bb1, cc02, cc02) + FMADD3 (aa5, bb2, cc03, cc03) + FMADD4 (aa2, bb2, cc04, cc04) + + FMADD1 (aa5, bb3, cc05, cc05) + LDF [BO + 48 * SIZE], b1 + FMADD2 (aa2, bb3, cc06, cc06) + LDF [BO + 41 * SIZE], b2 + + FMADD3 (aa5, bb4, cc07, cc07) + LDF [BO + 42 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 43 * SIZE], b4 + + FMADD1 (aa5, bb5, cc09, cc09) + LDF [AO + 10 * SIZE], a3 + FMADD2 (aa2, bb5, cc10, cc10) + LDF [AO + 11 * SIZE], a4 + + FMADD3 (aa5, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY + FMADD4 (aa2, bb6, cc12, cc12) + nop + + FMADD1 (aa5, bb7, cc13, cc13) + LDF [BO + 44 * SIZE], b5 + FMADD2 (aa2, bb7, cc14, cc14) + LDF [BO + 45 * SIZE], b6 + + FMADD3 (aa5, bb8, cc15, cc15) + LDF [BO + 46 * SIZE], b7 + FMADD4 (aa2, bb8, cc16, cc16) + LDF [BO + 47 * SIZE], b8 + + FMADD1 (aa3, bb9, cc01, cc01) + FMADD2 (aa4, bb9, cc02, cc02) + FMADD3 (aa3, bb2, 
cc03, cc03) + FMADD4 (aa4, bb2, cc04, cc04) + + FMADD1 (aa3, bb3, cc05, cc05) + LDF [BO + 56 * SIZE], b9 + FMADD2 (aa4, bb3, cc06, cc06) + LDF [BO + 49 * SIZE], b2 + + FMADD3 (aa3, bb4, cc07, cc07) + LDF [BO + 50 * SIZE], b3 + FMADD4 (aa4, bb4, cc08, cc08) + LDF [BO + 51 * SIZE], b4 + + FMADD1 (aa3, bb5, cc09, cc09) + LDF [AO + 12 * SIZE], a5 + FMADD2 (aa4, bb5, cc10, cc10) + LDF [AO + 13 * SIZE], a2 + + FMADD3 (aa3, bb6, cc11, cc11) + cmp L, 0 + FMADD4 (aa4, bb6, cc12, cc12) + nop + + FMADD1 (aa3, bb7, cc13, cc13) + LDF [BO + 52 * SIZE], b5 + FMADD2 (aa4, bb7, cc14, cc14) + LDF [BO + 53 * SIZE], b6 + + FMADD3 (aa3, bb8, cc15, cc15) + LDF [BO + 54 * SIZE], b7 + FMADD4 (aa4, bb8, cc16, cc16) + LDF [BO + 55 * SIZE], b8 + + FMADD1 (aa5, bb1, cc01, cc01) + FMADD2 (aa2, bb1, cc02, cc02) + FMADD3 (aa5, bb2, cc03, cc03) + FMADD4 (aa2, bb2, cc04, cc04) + + FMADD1 (aa5, bb3, cc05, cc05) + LDF [BO + 64 * SIZE], b1 + FMADD2 (aa2, bb3, cc06, cc06) + LDF [BO + 57 * SIZE], b2 + + FMADD3 (aa5, bb4, cc07, cc07) + LDF [BO + 58 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 59 * SIZE], b4 + + FMADD1 (aa5, bb5, cc09, cc09) + LDF [AO + 14 * SIZE], a3 + FMADD2 (aa2, bb5, cc10, cc10) + LDF [AO + 15 * SIZE], a4 + + FMADD3 (aa5, bb6, cc11, cc11) + add BO, 64 * SIZE, BO + FMADD4 (aa2, bb6, cc12, cc12) + add AO, 16 * SIZE, AO + + FMADD1 (aa5, bb7, cc13, cc13) + LDF [BO - 4 * SIZE], b5 + FMADD2 (aa2, bb7, cc14, cc14) + LDF [BO - 3 * SIZE], b6 + + FMADD3 (aa5, bb8, cc15, cc15) + LDF [BO - 2 * SIZE], b7 + FMADD4 (aa2, bb8, cc16, cc16) + LDF [BO - 1 * SIZE], b8 + + FMADD1 (aa3, bb9, cc01, cc01) + FMADD2 (aa4, bb9, cc02, cc02) + FMADD3 (aa3, bb2, cc03, cc03) + FMADD4 (aa4, bb2, cc04, cc04) + + FMADD1 (aa3, bb3, cc05, cc05) + LDF [BO + 8 * SIZE], b9 + FMADD2 (aa4, bb3, cc06, cc06) + LDF [BO + 1 * SIZE], b2 + + FMADD3 (aa3, bb4, cc07, cc07) + LDF [BO + 2 * SIZE], b3 + FMADD4 (aa4, bb4, cc08, cc08) + LDF [BO + 3 * SIZE], b4 + + FMADD1 (aa3, bb5, cc09, cc09) + LDF [AO + 8 * SIZE], a5 /****/ + FMADD2 (aa4, bb5, cc10, cc10) + LDF [AO + 1 * SIZE], a2 + + FMADD3 (aa3, bb6, cc11, cc11) + FMADD4 (aa4, bb6, cc12, cc12) + + FMADD1 (aa3, bb7, cc13, cc13) + LDF [BO + 4 * SIZE], b5 + FMADD2 (aa4, bb7, cc14, cc14) + LDF [BO + 5 * SIZE], b6 + + FMADD3 (aa3, bb8, cc15, cc15) + LDF [BO + 6 * SIZE], b7 + FMADD4 (aa4, bb8, cc16, cc16) + ble,pn %icc, .LL15 + LDF [BO + 7 * SIZE], b8 + + FMADD1 (aa1, bb1, cc01, cc01) + FMADD2 (aa2, bb1, cc02, cc02) + FMADD3 (aa1, bb2, cc03, cc03) + FMADD4 (aa2, bb2, cc04, cc04) + + FMADD1 (aa1, bb3, cc05, cc05) + LDF [BO + 16 * SIZE], b1 + FMADD2 (aa2, bb3, cc06, cc06) + LDF [BO + 9 * SIZE], b2 + + FMADD3 (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD1 (aa1, bb5, cc09, cc09) + LDF [AO + 2 * SIZE], a3 + FMADD2 (aa2, bb5, cc10, cc10) + LDF [AO + 3 * SIZE], a4 + + FMADD3 (aa1, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + FMADD4 (aa2, bb6, cc12, cc12) + nop + + FMADD1 (aa1, bb7, cc13, cc13) + LDF [BO + 12 * SIZE], b5 + FMADD2 (aa2, bb7, cc14, cc14) + LDF [BO + 13 * SIZE], b6 + + FMADD3 (aa1, bb8, cc15, cc15) + LDF [BO + 14 * SIZE], b7 + FMADD4 (aa2, bb8, cc16, cc16) + LDF [BO + 15 * SIZE], b8 + + FMADD1 (aa3, bb9, cc01, cc01) + FMADD2 (aa4, bb9, cc02, cc02) + FMADD3 (aa3, bb2, cc03, cc03) + FMADD4 (aa4, bb2, cc04, cc04) + + FMADD1 (aa3, bb3, cc05, cc05) + LDF [BO + 24 * SIZE], b9 + FMADD2 (aa4, bb3, cc06, cc06) + LDF [BO + 17 * SIZE], b2 + + FMADD3 (aa3, bb4, cc07, cc07) + LDF [BO + 18 * SIZE], 
b3 + FMADD4 (aa4, bb4, cc08, cc08) + LDF [BO + 19 * SIZE], b4 + + FMADD1 (aa3, bb5, cc09, cc09) + LDF [AO + 4 * SIZE], a1 + FMADD2 (aa4, bb5, cc10, cc10) + LDF [AO + 5 * SIZE], a2 + + FMADD3 (aa3, bb6, cc11, cc11) + add L, -1, L + FMADD4 (aa4, bb6, cc12, cc12) + nop + + FMADD1 (aa3, bb7, cc13, cc13) + LDF [BO + 20 * SIZE], b5 + FMADD2 (aa4, bb7, cc14, cc14) + LDF [BO + 21 * SIZE], b6 + + FMADD3 (aa3, bb8, cc15, cc15) + LDF [BO + 22 * SIZE], b7 + FMADD4 (aa4, bb8, cc16, cc16) + LDF [BO + 23 * SIZE], b8 + + FMADD1 (aa1, bb1, cc01, cc01) + FMADD2 (aa2, bb1, cc02, cc02) + FMADD3 (aa1, bb2, cc03, cc03) + FMADD4 (aa2, bb2, cc04, cc04) + + FMADD1 (aa1, bb3, cc05, cc05) + LDF [BO + 32 * SIZE], b1 + FMADD2 (aa2, bb3, cc06, cc06) + LDF [BO + 25 * SIZE], b2 + + FMADD3 (aa1, bb4, cc07, cc07) + LDF [BO + 26 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 27 * SIZE], b4 + + FMADD1 (aa1, bb5, cc09, cc09) + LDF [AO + 6 * SIZE], a3 + FMADD2 (aa2, bb5, cc10, cc10) + LDF [AO + 7 * SIZE], a4 + + FMADD3 (aa1, bb6, cc11, cc11) + nop + FMADD4 (aa2, bb6, cc12, cc12) + nop + + FMADD1 (aa1, bb7, cc13, cc13) + LDF [BO + 28 * SIZE], b5 + FMADD2 (aa2, bb7, cc14, cc14) + LDF [BO + 29 * SIZE], b6 + + FMADD3 (aa1, bb8, cc15, cc15) + LDF [BO + 30 * SIZE], b7 + FMADD4 (aa2, bb8, cc16, cc16) + LDF [BO + 31 * SIZE], b8 + + FMADD1 (aa3, bb9, cc01, cc01) + FMADD2 (aa4, bb9, cc02, cc02) + FMADD3 (aa3, bb2, cc03, cc03) + FMADD4 (aa4, bb2, cc04, cc04) + + FMADD1 (aa3, bb3, cc05, cc05) + LDF [BO + 40 * SIZE], b9 + FMADD2 (aa4, bb3, cc06, cc06) + LDF [BO + 33 * SIZE], b2 + + FMADD3 (aa3, bb4, cc07, cc07) + LDF [BO + 34 * SIZE], b3 + FMADD4 (aa4, bb4, cc08, cc08) + LDF [BO + 35 * SIZE], b4 + + FMADD1 (aa3, bb5, cc09, cc09) + LDF [AO + 16 * SIZE], a1 /****/ + FMADD2 (aa4, bb5, cc10, cc10) + LDF [AO + 9 * SIZE], a2 + + FMADD3 (aa3, bb6, cc11, cc11) + nop + FMADD4 (aa4, bb6, cc12, cc12) + nop + + FMADD1 (aa3, bb7, cc13, cc13) + LDF [BO + 36 * SIZE], b5 + FMADD2 (aa4, bb7, cc14, cc14) + LDF [BO + 37 * SIZE], b6 + + FMADD3 (aa3, bb8, cc15, cc15) + LDF [BO + 38 * SIZE], b7 + FMADD4 (aa4, bb8, cc16, cc16) + LDF [BO + 39 * SIZE], b8 + + FMADD1 (aa5, bb1, cc01, cc01) + FMADD2 (aa2, bb1, cc02, cc02) + FMADD3 (aa5, bb2, cc03, cc03) + FMADD4 (aa2, bb2, cc04, cc04) + + FMADD1 (aa5, bb3, cc05, cc05) + LDF [BO + 48 * SIZE], b1 + FMADD2 (aa2, bb3, cc06, cc06) + LDF [BO + 41 * SIZE], b2 + + FMADD3 (aa5, bb4, cc07, cc07) + LDF [BO + 42 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 43 * SIZE], b4 + + FMADD1 (aa5, bb5, cc09, cc09) + LDF [AO + 10 * SIZE], a3 + FMADD2 (aa2, bb5, cc10, cc10) + LDF [AO + 11 * SIZE], a4 + + FMADD3 (aa5, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY + FMADD4 (aa2, bb6, cc12, cc12) + nop + + FMADD1 (aa5, bb7, cc13, cc13) + LDF [BO + 44 * SIZE], b5 + FMADD2 (aa2, bb7, cc14, cc14) + LDF [BO + 45 * SIZE], b6 + + FMADD3 (aa5, bb8, cc15, cc15) + LDF [BO + 46 * SIZE], b7 + FMADD4 (aa2, bb8, cc16, cc16) + LDF [BO + 47 * SIZE], b8 + + FMADD1 (aa3, bb9, cc01, cc01) + FMADD2 (aa4, bb9, cc02, cc02) + FMADD3 (aa3, bb2, cc03, cc03) + FMADD4 (aa4, bb2, cc04, cc04) + + FMADD1 (aa3, bb3, cc05, cc05) + LDF [BO + 56 * SIZE], b9 + FMADD2 (aa4, bb3, cc06, cc06) + LDF [BO + 49 * SIZE], b2 + + FMADD3 (aa3, bb4, cc07, cc07) + LDF [BO + 50 * SIZE], b3 + FMADD4 (aa4, bb4, cc08, cc08) + LDF [BO + 51 * SIZE], b4 + + FMADD1 (aa3, bb5, cc09, cc09) + LDF [AO + 12 * SIZE], a5 + FMADD2 (aa4, bb5, cc10, cc10) + LDF [AO + 13 * SIZE], a2 + + FMADD3 (aa3, bb6, cc11, cc11) + cmp L, 0 + FMADD4 (aa4, bb6, cc12, 
cc12) + nop + + FMADD1 (aa3, bb7, cc13, cc13) + LDF [BO + 52 * SIZE], b5 + FMADD2 (aa4, bb7, cc14, cc14) + LDF [BO + 53 * SIZE], b6 + + FMADD3 (aa3, bb8, cc15, cc15) + LDF [BO + 54 * SIZE], b7 + FMADD4 (aa4, bb8, cc16, cc16) + LDF [BO + 55 * SIZE], b8 + + FMADD1 (aa5, bb1, cc01, cc01) + FMADD2 (aa2, bb1, cc02, cc02) + FMADD3 (aa5, bb2, cc03, cc03) + FMADD4 (aa2, bb2, cc04, cc04) + + FMADD1 (aa5, bb3, cc05, cc05) + LDF [BO + 64 * SIZE], b1 + FMADD2 (aa2, bb3, cc06, cc06) + LDF [BO + 57 * SIZE], b2 + + FMADD3 (aa5, bb4, cc07, cc07) + LDF [BO + 58 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 59 * SIZE], b4 + + FMADD1 (aa5, bb5, cc09, cc09) + LDF [AO + 14 * SIZE], a3 + FMADD2 (aa2, bb5, cc10, cc10) + LDF [AO + 15 * SIZE], a4 + + FMADD3 (aa5, bb6, cc11, cc11) + add BO, 64 * SIZE, BO + FMADD4 (aa2, bb6, cc12, cc12) + add AO, 16 * SIZE, AO + + FMADD1 (aa5, bb7, cc13, cc13) + LDF [BO - 4 * SIZE], b5 + FMADD2 (aa2, bb7, cc14, cc14) + LDF [BO - 3 * SIZE], b6 + + FMADD3 (aa5, bb8, cc15, cc15) + LDF [BO - 2 * SIZE], b7 + FMADD4 (aa2, bb8, cc16, cc16) + LDF [BO - 1 * SIZE], b8 + + FMADD1 (aa3, bb9, cc01, cc01) + FMADD2 (aa4, bb9, cc02, cc02) + FMADD3 (aa3, bb2, cc03, cc03) + FMADD4 (aa4, bb2, cc04, cc04) + + FMADD1 (aa3, bb3, cc05, cc05) + LDF [BO + 8 * SIZE], b9 + FMADD2 (aa4, bb3, cc06, cc06) + LDF [BO + 1 * SIZE], b2 + + FMADD3 (aa3, bb4, cc07, cc07) + LDF [BO + 2 * SIZE], b3 + FMADD4 (aa4, bb4, cc08, cc08) + LDF [BO + 3 * SIZE], b4 + + FMADD1 (aa3, bb5, cc09, cc09) + LDF [AO + 8 * SIZE], a5 /****/ + FMADD2 (aa4, bb5, cc10, cc10) + LDF [AO + 1 * SIZE], a2 + + FMADD3 (aa3, bb6, cc11, cc11) + FMADD4 (aa4, bb6, cc12, cc12) + + FMADD1 (aa3, bb7, cc13, cc13) + LDF [BO + 4 * SIZE], b5 + FMADD2 (aa4, bb7, cc14, cc14) + LDF [BO + 5 * SIZE], b6 + + FMADD3 (aa3, bb8, cc15, cc15) + LDF [BO + 6 * SIZE], b7 + FMADD4 (aa4, bb8, cc16, cc16) + bg,pt %icc, .LL13 + LDF [BO + 7 * SIZE], b8 + .align 4 + +.LL15: +#if defined(LT) || defined(RN) + and KK, 7, L +#else + sub K, KK, L + and L, 7, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL18 + nop + .align 4 + +.LL17: + FMADD1 (aa1, bb1, cc01, cc01) + add L, -1, L + FMADD2 (aa2, bb1, cc02, cc02) + nop + + FMADD3 (aa1, bb2, cc03, cc03) + LDF [BO + 8 * SIZE], b1 + FMADD4 (aa2, bb2, cc04, cc04) + LDF [BO + 9 * SIZE], b2 + + FMADD1 (aa1, bb3, cc05, cc05) + cmp L, 0 + FMADD2 (aa2, bb3, cc06, cc06) + nop + + FMADD3 (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD1 (aa1, bb5, cc09, cc09) + nop + FMADD2 (aa2, bb5, cc10, cc10) + nop + + FMADD3 (aa1, bb6, cc11, cc11) + LDF [BO + 12 * SIZE], b5 + FMADD4 (aa2, bb6, cc12, cc12) + LDF [BO + 13 * SIZE], b6 + + FMADD1 (aa1, bb7, cc13, cc13) + add AO, 2 * SIZE, AO + FMADD2 (aa2, bb7, cc14, cc14) + add BO, 8 * SIZE, BO + + FMADD3 (aa1, bb8, cc15, cc15) + LDF [AO + 0 * SIZE], a1 + FMADD4 (aa2, bb8, cc16, cc16) + LDF [AO + 1 * SIZE], a2 + + LDF [BO + 6 * SIZE], b7 + bg,pt %icc, .LL17 + LDF [BO + 7 * SIZE], b8 + nop + .align 4 + +.LL18: + FADD c01, c04, c01 + FADD c02, c03, c02 + FADD c05, c08, c05 + FADD c06, c07, c06 + + FADD c09, c12, c09 + FADD c10, c11, c10 + FADD c13, c16, c13 + FADD c14, c15, c14 + +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 1, TEMP1 +#else + sub KK, 4, TEMP1 +#endif + sll TEMP1, ZBASE_SHIFT + 0, TEMP2 + sll TEMP1, ZBASE_SHIFT + 2, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + LDF [BO 
+ 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [AO + 4 * SIZE], b1 + LDF [AO + 5 * SIZE], b2 + LDF [AO + 6 * SIZE], b3 + LDF [AO + 7 * SIZE], b4 +#endif + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c05, c05 + FSUB a4, c06, c06 + + FSUB b1, c09, c09 + FSUB b2, c10, c10 + FSUB b3, c13, c13 + FSUB b4, c14, c14 + +#if defined(LN) || defined(LT) + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FMUL a1, c01, b1 + FMUL a2, c01, b2 + FMUL a1, c05, b3 + FMUL a2, c05, b4 + FMUL a1, c09, b5 + FMUL a2, c09, b6 + FMUL a1, c13, b7 + FMUL a2, c13, b8 + +#ifndef CONJ + FNMSUB (aa2, cc02, bb1, cc01) + FMADD (aa1, cc02, bb2, cc02) + FNMSUB (aa2, cc06, bb3, cc05) + FMADD (aa1, cc06, bb4, cc06) + FNMSUB (aa2, cc10, bb5, cc09) + FMADD (aa1, cc10, bb6, cc10) + FNMSUB (aa2, cc14, bb7, cc13) + FMADD (aa1, cc14, bb8, cc14) +#else + FMADD (aa2, cc02, bb1, cc01) + FMSUB (aa1, cc02, bb2, cc02) + FMADD (aa2, cc06, bb3, cc05) + FMSUB (aa1, cc06, bb4, cc06) + FMADD (aa2, cc10, bb5, cc09) + FMSUB (aa1, cc10, bb6, cc10) + FMADD (aa2, cc14, bb7, cc13) + FMSUB (aa1, cc14, bb8, cc14) +#endif +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + LDF [BO + 3 * SIZE], b4 + LDF [BO + 4 * SIZE], b5 + LDF [BO + 5 * SIZE], b6 + LDF [BO + 6 * SIZE], b7 + LDF [BO + 7 * SIZE], b8 + + FMUL b1, c01, a1 + FMUL b2, c01, a2 + +#ifndef CONJ + FNMSUB (bb2, cc02, aa1, cc01) + FMADD (bb1, cc02, aa2, cc02) +#else + FMADD (bb2, cc02, aa1, cc01) + FMSUB (bb1, cc02, aa2, cc02) +#endif + + FNMSUB (bb3, cc01, cc05, cc05) + FNMSUB (bb3, cc02, cc06, cc06) + FNMSUB (bb5, cc01, cc09, cc09) + FNMSUB (bb5, cc02, cc10, cc10) + FNMSUB (bb7, cc01, cc13, cc13) + FNMSUB (bb7, cc02, cc14, cc14) + +#ifndef CONJ + FMADD (bb4, cc02, cc05, cc05) + FNMSUB (bb4, cc01, cc06, cc06) + FMADD (bb6, cc02, cc09, cc09) + FNMSUB (bb6, cc01, cc10, cc10) + FMADD (bb8, cc02, cc13, cc13) + FNMSUB (bb8, cc01, cc14, cc14) +#else + FNMSUB (bb4, cc02, cc05, cc05) + FMADD (bb4, cc01, cc06, cc06) + FNMSUB (bb6, cc02, cc09, cc09) + FMADD (bb6, cc01, cc10, cc10) + FNMSUB (bb8, cc02, cc13, cc13) + FMADD (bb8, cc01, cc14, cc14) +#endif + + LDF [BO + 10 * SIZE], b1 + LDF [BO + 11 * SIZE], b2 + LDF [BO + 12 * SIZE], b3 + LDF [BO + 13 * SIZE], b4 + LDF [BO + 14 * SIZE], b5 + LDF [BO + 15 * SIZE], b6 + + FMUL b1, c05, a1 + FMUL b2, c05, a2 + +#ifndef CONJ + FNMSUB (bb2, cc06, aa1, cc05) + FMADD (bb1, cc06, aa2, cc06) +#else + FMADD (bb2, cc06, aa1, cc05) + FMSUB (bb1, cc06, aa2, cc06) +#endif + + FNMSUB (bb3, cc05, cc09, cc09) + FNMSUB (bb3, cc06, cc10, cc10) + FNMSUB (bb5, cc05, cc13, cc13) + FNMSUB (bb5, cc06, cc14, cc14) + +#ifndef CONJ + FMADD (bb4, cc06, cc09, cc09) + FNMSUB (bb4, cc05, cc10, cc10) + FMADD (bb6, cc06, cc13, cc13) + FNMSUB (bb6, cc05, cc14, cc14) +#else + FNMSUB (bb4, cc06, cc09, cc09) + FMADD (bb4, cc05, cc10, cc10) + FNMSUB (bb6, cc06, cc13, cc13) + FMADD (bb6, cc05, cc14, cc14) +#endif + + LDF [BO + 20 * SIZE], b1 + LDF [BO + 21 * SIZE], b2 + LDF [BO + 22 * SIZE], b3 + LDF [BO + 23 * SIZE], b4 + + FMUL b1, c09, a1 + FMUL b2, c09, a2 + +#ifndef CONJ + FNMSUB (bb2, cc10, aa1, cc09) + FMADD (bb1, cc10, aa2, cc10) +#else + FMADD (bb2, cc10, aa1, cc09) + FMSUB (bb1, cc10, aa2, cc10) +#endif + + FNMSUB (bb3, cc09, cc13, cc13) + FNMSUB (bb3, cc10, cc14, cc14) + +#ifndef CONJ + FMADD (bb4, cc10, cc13, cc13) + FNMSUB (bb4, cc09, cc14, cc14) +#else + 
FNMSUB (bb4, cc10, cc13, cc13) + FMADD (bb4, cc09, cc14, cc14) +#endif + + LDF [BO + 30 * SIZE], b1 + LDF [BO + 31 * SIZE], b2 + + FMUL b1, c13, a1 + FMUL b2, c13, a2 + +#ifndef CONJ + FNMSUB (bb2, cc14, aa1, cc13) + FMADD (bb1, cc14, aa2, cc14) +#else + FMADD (bb2, cc14, aa1, cc13) + FMSUB (bb1, cc14, aa2, cc14) +#endif +#endif + +#ifdef RT + LDF [BO + 30 * SIZE], b1 + LDF [BO + 31 * SIZE], b2 + LDF [BO + 28 * SIZE], b3 + LDF [BO + 29 * SIZE], b4 + LDF [BO + 26 * SIZE], b5 + LDF [BO + 27 * SIZE], b6 + LDF [BO + 24 * SIZE], b7 + LDF [BO + 25 * SIZE], b8 + + FMUL b1, c13, a1 + FMUL b2, c13, a2 + +#ifndef CONJ + FNMSUB (bb2, cc14, aa1, cc13) + FMADD (bb1, cc14, aa2, cc14) +#else + FMADD (bb2, cc14, aa1, cc13) + FMSUB (bb1, cc14, aa2, cc14) +#endif + + FNMSUB (bb3, cc13, cc09, cc09) + FNMSUB (bb3, cc14, cc10, cc10) + FNMSUB (bb5, cc13, cc05, cc05) + FNMSUB (bb5, cc14, cc06, cc06) + FNMSUB (bb7, cc13, cc01, cc01) + FNMSUB (bb7, cc14, cc02, cc02) + +#ifndef CONJ + FMADD (bb4, cc14, cc09, cc09) + FNMSUB (bb4, cc13, cc10, cc10) + FMADD (bb6, cc14, cc05, cc05) + FNMSUB (bb6, cc13, cc06, cc06) + FMADD (bb8, cc14, cc01, cc01) + FNMSUB (bb8, cc13, cc02, cc02) +#else + FNMSUB (bb4, cc14, cc09, cc09) + FMADD (bb4, cc13, cc10, cc10) + FNMSUB (bb6, cc14, cc05, cc05) + FMADD (bb6, cc13, cc06, cc06) + FNMSUB (bb8, cc14, cc01, cc01) + FMADD (bb8, cc13, cc02, cc02) +#endif + + LDF [BO + 20 * SIZE], b1 + LDF [BO + 21 * SIZE], b2 + LDF [BO + 18 * SIZE], b3 + LDF [BO + 19 * SIZE], b4 + LDF [BO + 16 * SIZE], b5 + LDF [BO + 17 * SIZE], b6 + + FMUL b1, c09, a1 + FMUL b2, c09, a2 + +#ifndef CONJ + FNMSUB (bb2, cc10, aa1, cc09) + FMADD (bb1, cc10, aa2, cc10) +#else + FMADD (bb2, cc10, aa1, cc09) + FMSUB (bb1, cc10, aa2, cc10) +#endif + + FNMSUB (bb3, cc09, cc05, cc05) + FNMSUB (bb3, cc10, cc06, cc06) + FNMSUB (bb5, cc09, cc01, cc01) + FNMSUB (bb5, cc10, cc02, cc02) + +#ifndef CONJ + FMADD (bb4, cc10, cc05, cc05) + FNMSUB (bb4, cc09, cc06, cc06) + FMADD (bb6, cc10, cc01, cc01) + FNMSUB (bb6, cc09, cc02, cc02) +#else + FNMSUB (bb4, cc10, cc05, cc05) + FMADD (bb4, cc09, cc06, cc06) + FNMSUB (bb6, cc10, cc01, cc01) + FMADD (bb6, cc09, cc02, cc02) +#endif + + LDF [BO + 10 * SIZE], b1 + LDF [BO + 11 * SIZE], b2 + LDF [BO + 8 * SIZE], b3 + LDF [BO + 9 * SIZE], b4 + + FMUL b1, c05, a1 + FMUL b2, c05, a2 + +#ifndef CONJ + FNMSUB (bb2, cc06, aa1, cc05) + FMADD (bb1, cc06, aa2, cc06) +#else + FMADD (bb2, cc06, aa1, cc05) + FMSUB (bb1, cc06, aa2, cc06) +#endif + + FNMSUB (bb3, cc05, cc01, cc01) + FNMSUB (bb3, cc06, cc02, cc02) + +#ifndef CONJ + FMADD (bb4, cc06, cc01, cc01) + FNMSUB (bb4, cc05, cc02, cc02) +#else + FNMSUB (bb4, cc06, cc01, cc01) + FMADD (bb4, cc05, cc02, cc02) +#endif + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + + FMUL b1, c01, a1 + FMUL b2, c01, a2 + +#ifndef CONJ + FNMSUB (bb2, cc02, aa1, cc01) + FMADD (bb1, cc02, aa2, cc02) +#else + FMADD (bb2, cc02, aa1, cc01) + FMSUB (bb1, cc02, aa2, cc02) +#endif +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 + add C2, -2 * SIZE, C2 + add C3, -2 * SIZE, C3 + add C4, -2 * SIZE, C4 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] + STF c05, [BO + 2 * SIZE] + STF c06, [BO + 3 * SIZE] + + STF c09, [BO + 4 * SIZE] + STF c10, [BO + 5 * SIZE] + STF c13, [BO + 6 * SIZE] + STF c14, [BO + 7 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c05, [AO + 2 * SIZE] + STF c06, [AO + 3 * SIZE] + + STF c09, [AO + 4 * SIZE] + STF c10, [AO + 5 * SIZE] + STF c13, [AO + 6 * SIZE] + STF c14, [AO + 7 * SIZE] +#endif 
+ + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c05, [C2 + 0 * SIZE] + STF c06, [C2 + 1 * SIZE] + + STF c09, [C3 + 0 * SIZE] + STF c10, [C3 + 1 * SIZE] + STF c13, [C4 + 0 * SIZE] + STF c14, [C4 + 1 * SIZE] + +#ifndef LN + add C1, 2 * SIZE, C1 + add C2, 2 * SIZE, C2 + add C3, 2 * SIZE, C3 + add C4, 2 * SIZE, C4 +#endif + +#ifdef RT + sll K, ZBASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, ZBASE_SHIFT + 0, TEMP2 + sll TEMP1, ZBASE_SHIFT + 2, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL12 + nop + +#ifdef LN + sll K, ZBASE_SHIFT + 2, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 4, KK +#endif + +#ifdef RT + sub KK, 4, KK +#endif + + add J, -1, J + cmp J, 0 + bg,pt %icc, .LL11 + nop + .align 4 + +.LL20: + and N, 2, J + cmp J, 0 + ble,pn %icc, .LL30 + nop + +#ifdef RT + sll K, ZBASE_SHIFT + 1, TEMP1 + sub B, TEMP1, B +#endif + +#ifndef RT + mov C, C1 + add C, LDC, C2 + add C2, LDC, C +#else + sub C, LDC, C2 + sub C2, LDC, C1 + sub C2, LDC, C +#endif + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + mov M, I + .align 4 + +.LL22: +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, ZBASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT + 0, TEMP1 + sll KK, ZBASE_SHIFT + 1, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + LDF [BO + 3 * SIZE], b4 + LDF [BO + 4 * SIZE], b5 + FCLR (cc01) + + LDF [BO + 5 * SIZE], b6 + FCLR (cc02) + LDF [BO + 6 * SIZE], b7 + FCLR (cc03) + LDF [BO + 7 * SIZE], b8 + FCLR (cc04) + LDF [BO + 8 * SIZE], b9 + FCLR (cc05) + + prefetch [C1 + 2 * SIZE], 3 + FCLR (cc06) + prefetch [C2 + 2 * SIZE], 3 + FCLR (cc07) + +#if defined(LT) || defined(RN) + sra KK, 2, L +#else + sub K, KK, L + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL25 + FCLR (cc08) + .align 4 + +.LL23: + FMADD1 (aa1, bb1, cc01, cc01) + LDF [AO + 2 * SIZE], a3 + FMADD2 (aa2, bb1, cc02, cc02) + LDF [AO + 3 * SIZE], a4 + + FMADD3 (aa1, bb2, cc03, cc03) + LDF [BO + 16 * SIZE], b1 + FMADD4 (aa2, bb2, cc04, cc04) + LDF [BO + 9 * SIZE], b2 + + FMADD1 (aa1, bb3, cc05, cc05) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + FMADD2 (aa2, bb3, cc06, cc06) + add L, -1, L + + FMADD3 (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD1 (aa3, bb5, cc01, cc01) + LDF [AO + 4 * SIZE], a1 + FMADD2 (aa4, bb5, cc02, cc02) + LDF [AO + 5 * SIZE], a2 + + FMADD3 (aa3, bb6, cc03, cc03) + LDF [BO + 12 * SIZE], b5 + FMADD4 (aa4, bb6, cc04, cc04) + LDF [BO + 13 * SIZE], b6 + + FMADD1 (aa3, bb7, cc05, cc05) + cmp L, 0 + FMADD2 (aa4, bb7, cc06, cc06) + add AO, 8 * SIZE, AO + + FMADD3 (aa3, bb8, cc07, cc07) + LDF [BO + 14 * SIZE], b7 + FMADD4 (aa4, bb8, cc08, cc08) + LDF [BO + 15 * SIZE], b8 + + FMADD1 (aa1, bb9, cc01, cc01) + LDF [AO - 2 * SIZE], a3 + FMADD2 (aa2, bb9, cc02, cc02) + LDF [AO - 1 * SIZE], a4 + + FMADD3 (aa1, bb2, cc03, cc03) + LDF [BO + 24 * SIZE], b9 + FMADD4 (aa2, bb2, cc04, cc04) + LDF [BO + 17 * SIZE], b2 + + FMADD1 (aa1, bb3, cc05, cc05) + add BO, 16 * SIZE, BO + FMADD2 (aa2, bb3, 
cc06, cc06) + nop + + FMADD3 (aa1, bb4, cc07, cc07) + LDF [BO + 2 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 3 * SIZE], b4 + + FMADD1 (aa3, bb5, cc01, cc01) + LDF [AO + 0 * SIZE], a1 + FMADD2 (aa4, bb5, cc02, cc02) + LDF [AO + 1 * SIZE], a2 + FMADD3 (aa3, bb6, cc03, cc03) + LDF [BO + 4 * SIZE], b5 + FMADD4 (aa4, bb6, cc04, cc04) + LDF [BO + 5 * SIZE], b6 + + FMADD1 (aa3, bb7, cc05, cc05) + nop + FMADD2 (aa4, bb7, cc06, cc06) + LDF [BO + 6 * SIZE], b7 + + FMADD3 (aa3, bb8, cc07, cc07) + FMADD4 (aa4, bb8, cc08, cc08) + bg,pt %icc, .LL23 + LDF [BO + 7 * SIZE], b8 + .align 4 + +.LL25: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + sub K, KK, L + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL28 + nop + .align 4 + +.LL27: + FMADD1 (aa1, bb1, cc01, cc01) + add L, -1, L + FMADD2 (aa2, bb1, cc02, cc02) + LDF [BO + 4 * SIZE], b1 + + FMADD3 (aa1, bb2, cc03, cc03) + add AO, 2 * SIZE, AO + FMADD4 (aa2, bb2, cc04, cc04) + LDF [BO + 5 * SIZE], b2 + + FMADD1 (aa1, bb3, cc05, cc05) + cmp L, 0 + FMADD2 (aa2, bb3, cc06, cc06) + LDF [BO + 6 * SIZE], b3 + + FMADD3 (aa1, bb4, cc07, cc07) + LDF [AO + 0 * SIZE], a1 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [AO + 1 * SIZE], a2 + + LDF [BO + 7 * SIZE], b4 + bg,pt %icc, .LL27 + add BO, 4 * SIZE, BO + .align 4 + +.LL28: + FADD c01, c04, c01 + FADD c02, c03, c02 + FADD c05, c08, c05 + FADD c06, c07, c06 + +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 1, TEMP1 +#else + sub KK, 2, TEMP1 +#endif + sll TEMP1, ZBASE_SHIFT + 0, TEMP2 + sll TEMP1, ZBASE_SHIFT + 1, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 +#endif + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c05, c05 + FSUB a4, c06, c06 + +#if defined(LN) || defined(LT) + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FMUL a1, c01, b1 + FMUL a2, c01, b2 + FMUL a1, c05, b3 + FMUL a2, c05, b4 + +#ifndef CONJ + FNMSUB (aa2, cc02, bb1, cc01) + FMADD (aa1, cc02, bb2, cc02) + FNMSUB (aa2, cc06, bb3, cc05) + FMADD (aa1, cc06, bb4, cc06) +#else + FMADD (aa2, cc02, bb1, cc01) + FMSUB (aa1, cc02, bb2, cc02) + FMADD (aa2, cc06, bb3, cc05) + FMSUB (aa1, cc06, bb4, cc06) +#endif +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + LDF [BO + 3 * SIZE], b4 + + FMUL b1, c01, a1 + FMUL b2, c01, a2 + +#ifndef CONJ + FNMSUB (bb2, cc02, aa1, cc01) + FMADD (bb1, cc02, aa2, cc02) +#else + FMADD (bb2, cc02, aa1, cc01) + FMSUB (bb1, cc02, aa2, cc02) +#endif + + FNMSUB (bb3, cc01, cc05, cc05) + FNMSUB (bb3, cc02, cc06, cc06) + +#ifndef CONJ + FMADD (bb4, cc02, cc05, cc05) + FNMSUB (bb4, cc01, cc06, cc06) +#else + FNMSUB (bb4, cc02, cc05, cc05) + FMADD (bb4, cc01, cc06, cc06) +#endif + + LDF [BO + 6 * SIZE], b1 + LDF [BO + 7 * SIZE], b2 + + FMUL b1, c05, a1 + FMUL b2, c05, a2 + +#ifndef CONJ + FNMSUB (bb2, cc06, aa1, cc05) + FMADD (bb1, cc06, aa2, cc06) +#else + FMADD (bb2, cc06, aa1, cc05) + FMSUB (bb1, cc06, aa2, cc06) +#endif +#endif + +#ifdef RT + LDF [BO + 6 * SIZE], b1 + LDF [BO + 7 * SIZE], b2 + LDF [BO + 4 * SIZE], b3 + LDF [BO + 5 * SIZE], b4 + + FMUL b1, c05, a1 + FMUL b2, c05, a2 + +#ifndef CONJ + FNMSUB (bb2, cc06, aa1, cc05) + FMADD (bb1, cc06, aa2, cc06) +#else + FMADD (bb2, cc06, aa1, cc05) + FMSUB (bb1, cc06, aa2, cc06) +#endif + + FNMSUB (bb3, cc05, cc01, cc01) + FNMSUB (bb3, cc06, 
cc02, cc02) + +#ifndef CONJ + FMADD (bb4, cc06, cc01, cc01) + FNMSUB (bb4, cc05, cc02, cc02) +#else + FNMSUB (bb4, cc06, cc01, cc01) + FMADD (bb4, cc05, cc02, cc02) +#endif + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + + FMUL b1, c01, a1 + FMUL b2, c01, a2 + +#ifndef CONJ + FNMSUB (bb2, cc02, aa1, cc01) + FMADD (bb1, cc02, aa2, cc02) +#else + FMADD (bb2, cc02, aa1, cc01) + FMSUB (bb1, cc02, aa2, cc02) +#endif +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 + add C2, -2 * SIZE, C2 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] + STF c05, [BO + 2 * SIZE] + STF c06, [BO + 3 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c05, [AO + 2 * SIZE] + STF c06, [AO + 3 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c05, [C2 + 0 * SIZE] + STF c06, [C2 + 1 * SIZE] + +#ifndef LN + add C1, 2 * SIZE, C1 + add C2, 2 * SIZE, C2 +#endif + +#ifdef RT + sll K, ZBASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, ZBASE_SHIFT + 0, TEMP2 + sll TEMP1, ZBASE_SHIFT + 1, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL22 + nop + +#ifdef LN + sll K, ZBASE_SHIFT + 1, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 2, KK +#endif + +#ifdef RT + sub KK, 2, KK +#endif + .align 4 + +.LL30: + and N, 1, J + cmp J, 0 + ble,pn %icc, .LL999 + nop + +#ifdef RT + sll K, ZBASE_SHIFT, TEMP1 + sub B, TEMP1, B +#endif + +#ifndef RT + mov C, C1 + add C, LDC, C +#else + sub C, LDC, C1 + sub C, LDC, C +#endif + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + mov M, I + .align 4 + +.LL32: +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, ZBASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT + 0, TEMP1 + + add AORIG, TEMP1, AO + add B, TEMP1, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + FCLR (cc01) + LDF [BO + 3 * SIZE], b4 + FCLR (cc02) + + LDF [BO + 4 * SIZE], b5 + FCLR (cc03) + LDF [BO + 5 * SIZE], b6 + FCLR (cc04) + LDF [BO + 6 * SIZE], b7 + FCLR (cc05) + LDF [BO + 7 * SIZE], b8 + FCLR (cc06) + + prefetch [C1 + 2 * SIZE], 3 + FCLR (cc07) + +#if defined(LT) || defined(RN) + sra KK, 2, L +#else + sub K, KK, L + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL35 + FCLR (cc08) + .align 4 + +.LL33: + FMADD1 (aa1, bb1, cc01, cc01) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + FMADD2 (aa2, bb1, cc02, cc02) + LDF [BO + 8 * SIZE], b1 + + FMADD3 (aa1, bb2, cc03, cc03) + LDF [AO + 4 * SIZE], a1 + FMADD4 (aa2, bb2, cc04, cc04) + LDF [AO + 5 * SIZE], a2 + + FMADD1 (aa3, bb3, cc01, cc01) + LDF [BO + 9 * SIZE], b2 + FMADD2 (aa4, bb3, cc02, cc02) + LDF [BO + 10 * SIZE], b3 + + FMADD3 (aa3, bb4, cc03, cc03) + LDF [AO + 6 * SIZE], a3 + FMADD4 (aa4, bb4, cc04, cc04) + LDF [AO + 7 * SIZE], a4 + + FMADD1 (aa1, bb5, cc01, cc01) + LDF [BO + 11 * SIZE], b4 + FMADD2 (aa2, bb5, cc02, cc02) + LDF [BO + 12 * SIZE], b5 + + FMADD3 (aa1, bb6, cc03, cc03) + LDF [AO + 8 * SIZE], a1 + FMADD4 (aa2, bb6, cc04, cc04) + LDF [AO + 9 * SIZE], a2 + + FMADD1 (aa3, bb7, cc01, cc01) + LDF [BO + 13 
* SIZE], b6 + + FMADD2 (aa4, bb7, cc02, cc02) + LDF [BO + 14 * SIZE], b7 + + FMADD3 (aa3, bb8, cc03, cc03) + LDF [AO + 10 * SIZE], a3 + FMADD4 (aa4, bb8, cc04, cc04) + LDF [AO + 11 * SIZE], a4 + + add AO, 8 * SIZE, AO + add L, -1, L + add BO, 8 * SIZE, BO + cmp L, 0 + + bg,pt %icc, .LL33 + LDF [BO + 7 * SIZE], b8 + .align 4 + +.LL35: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + sub K, KK, L + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL38 + nop + .align 4 + +.LL37: + FMADD1 (aa1, bb1, cc01, cc01) + add L, -1, L + FMADD2 (aa2, bb1, cc02, cc02) + LDF [BO + 2 * SIZE], b1 + + FMADD3 (aa1, bb2, cc03, cc03) + LDF [AO + 2 * SIZE], a1 + FMADD4 (aa2, bb2, cc04, cc04) + LDF [AO + 3 * SIZE], a2 + + add AO, 2 * SIZE, AO + cmp L, 0 + add BO, 2 * SIZE, BO + bg,pt %icc, .LL37 + LDF [BO + 1 * SIZE], b2 + .align 4 + +.LL38: + FADD c01, c04, c01 + FADD c02, c03, c02 + +#if defined(LN) || defined(RT) + sub KK, 1, TEMP1 + + sll TEMP1, ZBASE_SHIFT, TEMP1 + + add AORIG, TEMP1, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 +#endif + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + +#if defined(LN) || defined(LT) + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 +#else + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 +#endif + + FMUL a1, c01, b1 + FMUL a2, c01, b2 + +#ifndef CONJ + FNMSUB (aa2, cc02, bb1, cc01) + FMADD (aa1, cc02, bb2, cc02) +#else + FMADD (aa2, cc02, bb1, cc01) + FMSUB (aa1, cc02, bb2, cc02) +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + +#ifndef LN + add C1, 2 * SIZE, C1 +#endif + +#ifdef RT + sll K, ZBASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, ZBASE_SHIFT, TEMP1 + add AO, TEMP1, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL32 + nop + +#ifdef LN + sll K, ZBASE_SHIFT, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 1, KK +#endif + +#ifdef RT + sub KK, 1, KK +#endif + .align 4 + +.LL999: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/ztrsm_kernel_RT.S b/kernel/sparc/ztrsm_kernel_RT.S new file mode 100644 index 0000000000..2949e48433 --- /dev/null +++ b/kernel/sparc/ztrsm_kernel_RT.S @@ -0,0 +1,2389 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %i0 +#define N %i1 +#define K %i2 +#define A %i5 +#define B %i3 +#define C %i4 + +#define LDC %o0 +#define AO %o1 +#define BO %o2 +#define I %o3 +#define J %o4 +#define L %o5 + +#define C1 %l0 +#define C2 %l1 + +#define OFFSET %l2 +#define KK %l3 +#define TEMP1 %l4 +#define TEMP2 %l5 +#define AORIG %l6 + +#ifdef DOUBLE +#define c01 %f0 +#define c02 %f2 +#define c03 %f4 +#define c04 %f6 +#define c05 %f8 +#define c06 %f10 +#define c07 %f12 +#define c08 %f14 +#define c09 %f16 +#define c10 %f18 +#define c11 %f20 +#define c12 %f22 +#define c13 %f24 +#define c14 %f26 +#define c15 %f28 +#define c16 %f30 + +#define t1 %f32 +#define t2 %f34 +#define t3 %f36 +#define t4 %f38 + +#define a1 %f40 +#define a2 %f42 +#define a3 %f44 +#define a4 %f46 +#define a5 %f62 + +#define b1 %f48 +#define b2 %f50 +#define b3 %f52 +#define b4 %f54 +#define b5 %f56 + +#define FZERO %f58 + +#else +#define c01 %f0 +#define c02 %f1 +#define c03 %f2 +#define c04 %f3 +#define c05 %f4 +#define c06 %f5 +#define c07 %f6 +#define c08 %f7 +#define c09 %f8 +#define c10 %f9 +#define c11 %f10 +#define c12 %f11 +#define c13 %f12 +#define c14 %f13 +#define c15 %f14 +#define c16 %f15 + +#define t1 %f16 +#define t2 %f17 +#define t3 %f18 +#define t4 %f19 + +#define a1 %f20 +#define a2 %f21 +#define a3 %f22 +#define a4 %f23 +#define a5 %f31 + +#define b1 %f24 +#define b2 %f25 +#define b3 %f26 +#define b4 %f27 +#define b5 %f28 + +#define FZERO %f29 +#endif + +#define t5 c13 +#define t6 c14 +#define t7 c15 +#define t8 c16 + +#ifndef CONJ +#define FADD1 FADD +#define FADD2 FADD +#define FADD3 FADD +#define FADD4 FSUB +#else + +#if defined(LN) || defined(LT) +#define FADD1 FADD +#define FADD2 FSUB +#define FADD3 FADD +#define FADD4 FADD +#endif + +#if defined(RN) || defined(RT) +#define FADD1 FADD +#define FADD2 FADD +#define FADD3 FSUB +#define FADD4 FADD +#endif +#endif + +#define APREFETCHSIZE 40 +#define BPREFETCHSIZE 40 + +#define APREFETCH_CATEGORY 0 +#define BPREFETCH_CATEGORY 0 + + PROLOGUE + SAVESP + +#ifndef __64BIT__ +#ifdef DOUBLE + ld [%sp + STACK_START + 32], A + ld [%sp + STACK_START + 36], B + ld [%sp + STACK_START + 40], C + ld [%sp + STACK_START + 44], LDC + ld [%sp + STACK_START + 48], OFFSET +#else + ld [%sp + STACK_START + 28], B + ld [%sp + STACK_START + 32], C + ld [%sp + 
STACK_START + 36], LDC + ld [%sp + STACK_START + 40], OFFSET +#endif +#else + ldx [%sp+ STACK_START + 56], B + ldx [%sp+ STACK_START + 64], C + ldx [%sp+ STACK_START + 72], LDC + ldx [%sp+ STACK_START + 80], OFFSET +#endif + +#ifdef DOUBLE + FCLR(27) +#else + FCLR(29) +#endif + + sll LDC, ZBASE_SHIFT, LDC + +#ifdef LN + smul M, K, TEMP1 + sll TEMP1, ZBASE_SHIFT, TEMP1 + add A, TEMP1, A + + sll M, ZBASE_SHIFT, TEMP1 + add C, TEMP1, C +#endif + +#ifdef RN + neg OFFSET, KK +#endif + +#ifdef RT + smul N, K, TEMP1 + sll TEMP1, ZBASE_SHIFT, TEMP1 + add B, TEMP1, B + + smul N, LDC, TEMP1 + add C, TEMP1, C + + sub N, OFFSET, KK +#endif + + and N, 1, J + + cmp J, 0 + ble,pn %icc, .LL100 + nop + +#ifdef RT + sll K, 0 + ZBASE_SHIFT, TEMP1 + sub B, TEMP1, B + + sub C, LDC, C +#endif + + mov C, C1 + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + +#ifndef RT + add C, LDC, C +#endif + sra M, 1, I + cmp I, 0 + ble,pn %icc, .LL150 + FMOV FZERO, c03 + +.LL121: +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 1 + ZBASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 1 + ZBASE_SHIFT, TEMP1 + sll KK, 0 + ZBASE_SHIFT, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO + + sub K, KK, TEMP1 + sra TEMP1, 2, L + cmp L, 0 +#endif + + FMOV FZERO, c03 + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, t1 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, c07 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, t2 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, c04 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, t3 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, c08 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, t4 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, c01 + + prefetch [C1 + 3 * SIZE], 3 + FMOV FZERO, c05 + FMOV FZERO, c02 + + ble,pn %icc, .LL125 + FMOV FZERO, c06 + +.LL122: + FADD1 c03, t1, c03 + add L, -1, L + FMUL a1, b1, t1 + prefetch [AO + APREFETCHSIZE * SIZE], 0 + + FADD3 c07, t2, c07 + add BO, 8 * SIZE, BO + FMUL a1, b2, t2 + LDF [AO + 4 * SIZE], a1 + + FADD2 c04, t3, c04 + add AO, 16 * SIZE, AO + FMUL a2, b1, t3 + cmp L, 0 + + FADD4 c08, t4, c08 + nop + FMUL a2, b2, t4 + LDF [AO - 11 * SIZE], a2 + + FADD1 c01, t1, c01 + nop + FMUL a3, b1, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a3, b2, t2 + LDF [AO - 10 * SIZE], a3 + + FADD2 c02, t3, c02 + nop + FMUL a4, b1, t3 + LDF [BO - 4 * SIZE], b1 + + FADD4 c06, t4, c06 + nop + FMUL a4, b2, t4 + LDF [BO - 3 * SIZE], b2 + + FADD1 c03, t1, c03 + nop + FMUL a1, b3, t1 + LDF [AO - 9 * SIZE], a4 + + FADD3 c07, t2, c07 + nop + FMUL a1, b4, t2 + LDF [AO - 8 * SIZE], a1 + + FADD2 c04, t3, c04 + nop + FMUL a2, b3, t3 + nop + + FADD4 c08, t4, c08 + nop + FMUL a2, b4, t4 + LDF [AO - 7 * SIZE], a2 + + FADD1 c01, t1, c01 + nop + FMUL a3, b3, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a3, b4, t2 + LDF [AO - 6 * SIZE], a3 + + FADD2 c02, t3, c02 + nop + FMUL a4, b3, t3 + LDF [BO - 2 * SIZE], b3 + + FADD4 c06, t4, c06 + nop + FMUL a4, b4, t4 + LDF [BO - 1 * SIZE], b4 + + FADD1 c03, t1, c03 + nop + FMUL a1, b1, t1 + LDF [AO - 5 * SIZE], a4 + + FADD3 c07, t2, c07 + nop + FMUL a1, b2, t2 + LDF [AO - 4 * SIZE], a1 + + FADD2 c04, t3, c04 + nop + FMUL a2, b1, t3 + nop + + FADD4 c08, t4, c08 + nop + FMUL a2, b2, t4 + LDF [AO - 3 * SIZE], a2 + + FADD1 c01, t1, c01 + nop + FMUL a3, b1, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a3, b2, t2 + LDF [AO - 2 * SIZE], a3 + + FADD2 c02, t3, c02 + nop + FMUL a4, b1, t3 + LDF [BO + 0 * SIZE], b1 + + FADD4 c06, t4, c06 + nop + FMUL a4, 
b2, t4 + LDF [BO + 1 * SIZE], b2 + + FADD1 c03, t1, c03 + nop + FMUL a1, b3, t1 + LDF [AO - 1 * SIZE], a4 + + FADD3 c07, t2, c07 + nop + FMUL a1, b4, t2 + LDF [AO + 0 * SIZE], a1 + + FADD2 c04, t3, c04 + nop + FMUL a2, b3, t3 + nop + + FADD4 c08, t4, c08 + nop + FMUL a2, b4, t4 + LDF [AO + 1 * SIZE], a2 + + FADD1 c01, t1, c01 + nop + FMUL a3, b3, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a3, b4, t2 + LDF [AO + 2 * SIZE], a3 + + FADD2 c02, t3, c02 + nop + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD4 c06, t4, c06 + FMUL a4, b4, t4 + LDF [AO + 3 * SIZE], a4 + + bg,pt %icc, .LL122 + LDF [BO + 3 * SIZE], b4 + +.LL125: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL129 + nop + +.LL126: + FADD1 c03, t1, c03 + add AO, 4 * SIZE, AO + FMUL a1, b1, t1 + add BO, 2 * SIZE, BO + + FADD3 c07, t2, c07 + add L, -1, L + FMUL a1, b2, t2 + LDF [AO + 0 * SIZE], a1 + + FADD2 c04, t3, c04 + cmp L, 0 + FMUL a2, b1, t3 + + FADD4 c08, t4, c08 + FMUL a2, b2, t4 + LDF [AO + 1 * SIZE], a2 + + FADD1 c01, t1, c01 + FMUL a3, b1, t1 + FADD3 c05, t2, c05 + FMUL a3, b2, t2 + LDF [AO + 2 * SIZE], a3 + + FADD2 c02, t3, c02 + FMUL a4, b1, t3 + LDF [BO + 0 * SIZE], b1 + FADD4 c06, t4, c06 + FMUL a4, b2, t4 + LDF [BO + 1 * SIZE], b2 + bg,pt %icc, .LL126 + LDF [AO + 3 * SIZE], a4 + +.LL129: + FADD1 c03, t1, c03 + FADD3 c07, t2, c07 + FADD2 c04, t3, c04 + FADD4 c08, t4, c08 + + FADD c01, c06, c01 + FADD c02, c05, c02 + FADD c03, c08, c03 + FADD c04, c07, c04 + +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 2, TEMP1 +#else + sub KK, 1, TEMP1 +#endif + sll TEMP1, 1 + ZBASE_SHIFT, TEMP2 + sll TEMP1, 0 + ZBASE_SHIFT, TEMP1 + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, c04, c04 +#endif + +#ifdef LN + LDF [AO + 6 * SIZE], a1 + LDF [AO + 7 * SIZE], a2 + LDF [AO + 4 * SIZE], a3 + LDF [AO + 5 * SIZE], a4 + LDF [AO + 0 * SIZE], b1 + LDF [AO + 1 * SIZE], b2 + + FMUL a1, c03, t1 + FMUL a2, c04, t2 + FMUL a1, c04, t3 + FMUL a2, c03, t4 + + FADD4 t1, t2, c03 + FADD2 t3, t4, c04 + + FMUL a3, c03, t1 + FMUL a3, c04, t2 + + FMUL a4, c04, t5 + FMUL a4, c03, t6 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + + FADD2 c01, t5, c01 + FADD4 c02, t6, c02 + + FMUL b1, c01, t1 + FMUL b2, c02, t2 + FMUL b1, c02, t3 + FMUL b2, c01, t4 + + FADD4 t1, t2, c01 + FADD2 t3, t4, c02 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + LDF [AO + 6 * SIZE], b1 + LDF [AO + 7 * SIZE], b2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FADD4 t1, t2, c01 + FADD2 t3, t4, c02 + + FMUL a3, c01, t1 + FMUL a3, c02, t2 + FMUL a4, c02, t5 + FMUL a4, c01, t6 + + FSUB c03, t1, c03 + FSUB c04, t2, c04 + FADD2 c03, t5, c03 + FADD4 c04, t6, c04 + + FMUL b1, c03, t1 + FMUL b2, c04, t2 + FMUL b1, c04, t3 + FMUL b2, c03, t4 + + FADD4 t1, t2, c03 + FADD2 t3, t4, c04 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FMUL a1, c03, t5 + FMUL a2, c04, t6 + FMUL a1, c04, t7 + FMUL a2, c03, t8 + + FADD4 t1, 
t2, c01 + FADD3 t3, t4, c02 + FADD4 t5, t6, c03 + FADD3 t7, t8, c04 +#endif + +#ifdef RT + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FMUL a1, c03, t5 + FMUL a2, c04, t6 + FMUL a1, c04, t7 + FMUL a2, c03, t8 + + FADD4 t1, t2, c01 + FADD3 t3, t4, c02 + FADD4 t5, t6, c03 + FADD3 t7, t8, c04 +#endif + +#ifdef LN + add C1, -4 * SIZE, C1 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] + STF c03, [BO + 2 * SIZE] + STF c04, [BO + 3 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C1 + 2 * SIZE] + STF c04, [C1 + 3 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 4 * SIZE, C1 +#endif + +#ifdef RT + sll K, 1 + ZBASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 1 + ZBASE_SHIFT, TEMP2 + sll TEMP1, 0 + ZBASE_SHIFT, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 2, KK +#endif + +#ifdef LN + sub KK, 2, KK +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL121 + FMOV FZERO, c03 + +.LL150: + and M, 1, I + cmp I, 0 + ble,pn %icc, .LL199 + nop + +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 0 + ZBASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 0 + ZBASE_SHIFT, TEMP1 + add AORIG, TEMP1, AO + add B, TEMP1, BO + + sub K, KK, TEMP1 + + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c01 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, t1 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c02 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, t2 + + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, c03 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, t3 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c04 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, t4 + + ble,pn %icc, .LL155 + nop + +.LL152: + FADD1 c01, t1, c01 + add L, -1, L + FMUL a1, b1, t1 + prefetch [AO + APREFETCHSIZE * SIZE], 0 + + FADD3 c02, t2, c02 + add BO, 8 * SIZE, BO + FMUL a1, b2, t2 + LDF [AO + 4 * SIZE], a1 + + FADD2 c03, t3, c03 + cmp L, 0 + FMUL a2, b1, t3 + LDF [BO - 4 * SIZE], b1 + + FADD4 c04, t4, c04 + nop + FMUL a2, b2, t4 + LDF [AO + 5 * SIZE], a2 + + FADD1 c01, t1, c01 + nop + FMUL a3, b3, t1 + LDF [BO - 3 * SIZE], b2 + + FADD3 c02, t2, c02 + nop + FMUL a3, b4, t2 + LDF [AO + 6 * SIZE], a3 + + FADD2 c03, t3, c03 + nop + FMUL a4, b3, t3 + LDF [BO - 2 * SIZE], b3 + + FADD4 c04, t4, c04 + nop + FMUL a4, b4, t4 + LDF [AO + 7 * SIZE], a4 + + FADD1 c01, t1, c01 + nop + FMUL a1, b1, t1 + LDF [BO - 1 * SIZE], b4 + + FADD3 c02, t2, c02 + FMUL a1, b2, t2 + LDF [AO + 8 * SIZE], a1 + + FADD2 c03, t3, c03 + FMUL a2, b1, t3 + LDF [BO + 0 * SIZE], b1 + + FADD4 c04, t4, c04 + FMUL a2, b2, t4 + LDF [AO + 9 * SIZE], a2 + + FADD1 c01, t1, c01 + FMUL a3, b3, t1 + LDF [BO + 1 * SIZE], b2 + + FADD3 c02, t2, c02 + FMUL a3, b4, t2 + LDF [AO + 10 * SIZE], a3 + + FADD2 c03, t3, c03 + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD4 c04, t4, c04 + FMUL a4, b4, t4 + LDF [AO + 11 * SIZE], a4 + + add AO, 8 * SIZE, AO + bg,pt %icc, .LL152 + LDF [BO + 3 * SIZE], b4 + +.LL155: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL159 + nop + +.LL156: + FADD1 c01, t1, c01 + add AO, 2 * SIZE, AO + FMUL a1, b1, t1 + add BO, 2 * SIZE, BO + FADD3 
c02, t2, c02 + add L, -1, L + FMUL a1, b2, t2 + LDF [AO + 0 * SIZE], a1 + FADD2 c03, t3, c03 + FMUL a2, b1, t3 + LDF [BO + 0 * SIZE], b1 + cmp L, 0 + FADD4 c04, t4, c04 + FMUL a2, b2, t4 + LDF [BO + 1 * SIZE], b2 + + bg,pt %icc, .LL156 + LDF [AO + 1 * SIZE], a2 + +.LL159: + FADD1 c01, t1, c01 + FADD3 c02, t2, c02 + FADD2 c03, t3, c03 + FADD4 c04, t4, c04 + + FADD c01, c04, c01 + FADD c02, c03, c02 + +#if defined(LN) || defined(RT) + sub KK, 1, TEMP1 + + sll TEMP1, 0 + ZBASE_SHIFT, TEMP1 + add AORIG, TEMP1, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 +#endif + +#ifdef LN + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FADD4 t1, t2, c01 + FADD2 t3, t4, c02 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FADD4 t1, t2, c01 + FADD2 t3, t4, c02 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FADD4 t1, t2, c01 + FADD3 t3, t4, c02 +#endif + +#ifdef RT + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FADD4 t1, t2, c01 + FADD3 t3, t4, c02 +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 2 * SIZE, C1 +#endif + +#ifdef RT + sll K, 0 + ZBASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 0 + ZBASE_SHIFT, TEMP1 + add AO, TEMP1, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + +.LL199: +#ifdef LN + sll K, 0 + ZBASE_SHIFT, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 1, KK +#endif + +#ifdef RT + sub KK, 1, KK +#endif + +.LL100: + sra N, 1, J + cmp J, 0 + ble,pn %icc, .LL999 + nop + +.LL11: +#ifdef RT + sll K, 1 + ZBASE_SHIFT, TEMP1 + sub B, TEMP1, B + + add LDC, LDC, TEMP1 + sub C, TEMP1, C +#endif + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + + sra M, 1, I + mov C, C1 + add C, LDC, C2 + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + cmp I, 0 +#ifndef RT + add C2, LDC, C +#endif + ble,pn %icc, .LL50 + FMOV FZERO, t4 + + +.LL21: +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 1 + ZBASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 1 + ZBASE_SHIFT, TEMP1 + + add AORIG, TEMP1, AO + add B, TEMP1, BO + + sub K, KK, TEMP1 + + sra TEMP1, 2, L + cmp L, 0 +#endif + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + + FMOV FZERO, c01 + FMOV FZERO, c02 + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, c03 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, c04 + + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, c05 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, c06 + + LDF 
[AO + 2 * SIZE], a3 + FMOV FZERO, c07 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, c08 + + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c09 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, c10 + + LDF [BO + 4 * SIZE], b5 + FMOV FZERO, c11 + LDF [AO + 4 * SIZE], a5 + FMOV FZERO, c12 + + prefetch [C1 + 3 * SIZE], 3 + FMOV FZERO, c13 + prefetch [C2 + 3 * SIZE], 3 + FMOV FZERO, c14 + + FMOV FZERO, c15 + ble,pn %icc, .LL25 + FMOV FZERO, c16 + +.LL22: + FADD2 c04, t1, c04 + prefetch [AO + APREFETCHSIZE * SIZE], APREFETCH_CATEGORY + FMUL a1, b1, t1 + nop + + FADD4 c08, t2, c08 + prefetch [BO + BPREFETCHSIZE * SIZE], BPREFETCH_CATEGORY + FMUL a1, b2, t2 + add AO, 16 * SIZE, AO + + FADD2 c12, t3, c12 + LDF [AO - 13 * SIZE], a4 + FMUL a1, b3, t3 + add BO, 16 * SIZE, BO + + FADD4 c16, t4, c16 + nop + FMUL a1, b4, t4 + LDF [AO - 8 * SIZE], a1 + + FADD1 c01, t1, c01 + nop + FMUL a2, b1, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD1 c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD3 c13, t4, c13 + add L, -1, L + FMUL a2, b4, t4 + LDF [AO - 11 * SIZE], a2 + + FADD2 c02, t1, c02 + nop + FMUL a3, b1, t1 + nop + + FADD4 c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD2 c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD4 c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO - 10 * SIZE], a3 + + FADD1 c03, t1, c03 + nop + FMUL a4, b1, t1 + LDF [BO - 8 * SIZE], b1 + + FADD3 c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO - 11 * SIZE], b2 + + FADD1 c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO - 10 * SIZE], b3 + + FADD3 c15, t4, c15 + nop + FMUL a4, b4, t4 + LDF [BO - 9 * SIZE], b4 + + FADD2 c04, t1, c04 + nop + FMUL a5, b5, t1 + LDF [AO - 9 * SIZE], a4 + + FADD4 c08, t2, c08 + nop + FMUL a5, b2, t2 + nop + + FADD2 c12, t3, c12 + nop + FMUL a5, b3, t3 + nop + + FADD4 c16, t4, c16 + nop + FMUL a5, b4, t4 + LDF [AO - 4 * SIZE], a5 + + FADD1 c01, t1, c01 + nop + FMUL a2, b5, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD1 c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD3 c13, t4, c13 + nop + FMUL a2, b4, t4 + LDF [AO - 7 * SIZE], a2 + + FADD2 c02, t1, c02 + nop + FMUL a3, b5, t1 + nop + + FADD4 c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD2 c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD4 c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO - 6 * SIZE], a3 + + FADD1 c03, t1, c03 + nop + FMUL a4, b5, t1 + LDF [BO - 4 * SIZE], b5 + + FADD3 c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO - 7 * SIZE], b2 + + FADD1 c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO - 6 * SIZE], b3 + + FADD3 c15, t4, c15 + nop + FMUL a4, b4, t4 + LDF [BO - 5 * SIZE], b4 + + FADD2 c04, t1, c04 + nop + FMUL a1, b1, t1 + LDF [AO - 5 * SIZE], a4 + + FADD4 c08, t2, c08 + nop + FMUL a1, b2, t2 + nop + + FADD2 c12, t3, c12 + nop + FMUL a1, b3, t3 + nop + + FADD4 c16, t4, c16 + nop + FMUL a1, b4, t4 + LDF [AO - 0 * SIZE], a1 + + FADD1 c01, t1, c01 + nop + FMUL a2, b1, t1 + nop + +#ifdef DOUBLE + prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY +#else + nop +#endif + FADD3 c05, t2, c05 + nop + FMUL a2, b2, t2 + + FADD1 c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD3 c13, t4, c13 + nop + FMUL a2, b4, t4 + nop + + FADD2 c02, t1, c02 + nop + FMUL a3, b1, t1 + LDF [AO - 3 * SIZE], a2 + + FADD4 c06, t2, c06 +#ifdef DOUBLE + prefetch [BO + (BPREFETCHSIZE + 8) * SIZE], BPREFETCH_CATEGORY +#else + nop +#endif + FMUL a3, b2, t2 + nop + + FADD2 c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD4 c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO - 2 * SIZE], a3 + + FADD1 c03, t1, c03 + nop + FMUL a4, b1, t1 + 
LDF [BO - 0 * SIZE], b1 + + FADD3 c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO - 3 * SIZE], b2 + + FADD1 c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO - 2 * SIZE], b3 + + FADD3 c15, t4, c15 + nop + FMUL a4, b4, t4 + LDF [BO - 1 * SIZE], b4 + + FADD2 c04, t1, c04 + nop + FMUL a5, b5, t1 + LDF [AO - 1 * SIZE], a4 + + FADD4 c08, t2, c08 + FMUL a5, b2, t2 + FADD2 c12, t3, c12 + FMUL a5, b3, t3 + + FADD4 c16, t4, c16 + nop + FMUL a5, b4, t4 + LDF [AO + 4 * SIZE], a5 + + FADD1 c01, t1, c01 + nop + FMUL a2, b5, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD1 c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD3 c13, t4, c13 + nop + FMUL a2, b4, t4 + LDF [AO + 1 * SIZE], a2 + + FADD2 c02, t1, c02 + nop + FMUL a3, b5, t1 + nop + + FADD4 c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD2 c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD4 c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO + 2 * SIZE], a3 + + FADD1 c03, t1, c03 + cmp L, 0 + FMUL a4, b5, t1 + LDF [BO + 4 * SIZE], b5 + + FADD3 c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD1 c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD3 c15, t4, c15 + FMUL a4, b4, t4 + bg,pt %icc, .LL22 + LDF [BO + 3 * SIZE], b4 + +.LL25: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,pn %icc, .LL29 + nop + +.LL26: + FADD2 c04, t1, c04 + LDF [AO + 3 * SIZE], a4 + FMUL a1, b1, t1 + add AO, 4 * SIZE, AO + + FADD4 c08, t2, c08 + add BO, 4 * SIZE, BO + FMUL a1, b2, t2 + add L, -1, L + + FADD2 c12, t3, c12 + nop + FMUL a1, b3, t3 + cmp L, 0 + + FADD4 c16, t4, c16 + nop + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + FADD1 c01, t1, c01 + nop + FMUL a2, b1, t1 + nop + + FADD3 c05, t2, c05 + nop + FMUL a2, b2, t2 + nop + + FADD1 c09, t3, c09 + nop + FMUL a2, b3, t3 + nop + + FADD3 c13, t4, c13 + nop + FMUL a2, b4, t4 + LDF [AO + 1 * SIZE], a2 + + FADD2 c02, t1, c02 + nop + FMUL a3, b1, t1 + nop + + FADD4 c06, t2, c06 + nop + FMUL a3, b2, t2 + nop + + FADD2 c10, t3, c10 + nop + FMUL a3, b3, t3 + nop + + FADD4 c14, t4, c14 + nop + FMUL a3, b4, t4 + LDF [AO + 2 * SIZE], a3 + + FADD1 c03, t1, c03 + nop + FMUL a4, b1, t1 + LDF [BO + 0 * SIZE], b1 + + FADD3 c07, t2, c07 + nop + FMUL a4, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD1 c11, t3, c11 + nop + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + + FADD3 c15, t4, c15 + FMUL a4, b4, t4 + bg,pt %icc, .LL26 + LDF [BO + 3 * SIZE], b4 + +.LL29: +#if defined(LN) || defined(RT) + sub KK, 2, TEMP1 + sll TEMP1, 1 + ZBASE_SHIFT, TEMP1 + add AORIG, TEMP1, AO + add B, TEMP1, BO +#endif + + FADD2 c04, t1, c04 + FADD4 c08, t2, c08 + FADD2 c12, t3, c12 + FADD4 c16, t4, c16 + + FADD c01, c06, c01 + FADD c02, c05, c02 + FADD c03, c08, c03 + FADD c04, c07, c04 + + FADD c09, c14, c09 + FADD c10, c13, c10 + FADD c11, c16, c11 + FADD c12, c15, c12 + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c09, c09 + FSUB a4, c10, c10 + + FSUB b1, c03, c03 + FSUB b2, c04, c04 + FSUB b3, c11, c11 + FSUB b4, c12, c12 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [AO + 4 * SIZE], b1 + LDF [AO + 5 * SIZE], b2 + LDF [AO + 6 * SIZE], b3 + LDF [AO + 7 * SIZE], b4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c03, c03 + FSUB a4, 
c04, c04 + + FSUB b1, c09, c09 + FSUB b2, c10, c10 + FSUB b3, c11, c11 + FSUB b4, c12, c12 +#endif + +#ifdef LN + LDF [AO + 6 * SIZE], a1 + LDF [AO + 7 * SIZE], a2 + LDF [AO + 4 * SIZE], a3 + LDF [AO + 5 * SIZE], a4 + LDF [AO + 0 * SIZE], b1 + LDF [AO + 1 * SIZE], b2 + + FMUL a1, c03, t1 + FMUL a2, c04, t2 + FMUL a1, c04, t3 + FMUL a2, c03, t4 + + FMUL a1, c11, t5 + FMUL a2, c12, t6 + FMUL a1, c12, t7 + FMUL a2, c11, t8 + + FADD4 t1, t2, c03 + FADD2 t3, t4, c04 + FADD4 t5, t6, c11 + FADD2 t7, t8, c12 + + FMUL a3, c03, t1 + FMUL a3, c04, t2 + FMUL a3, c11, t3 + FMUL a3, c12, t4 + + FMUL a4, c04, t5 + FMUL a4, c03, t6 + FMUL a4, c12, t7 + FMUL a4, c11, t8 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + FSUB c09, t3, c09 + FSUB c10, t4, c10 + + FADD2 c01, t5, c01 + FADD4 c02, t6, c02 + FADD2 c09, t7, c09 + FADD4 c10, t8, c10 + + FMUL b1, c01, t1 + FMUL b2, c02, t2 + FMUL b1, c02, t3 + FMUL b2, c01, t4 + + FMUL b1, c09, t5 + FMUL b2, c10, t6 + FMUL b1, c10, t7 + FMUL b2, c09, t8 + + FADD4 t1, t2, c01 + FADD2 t3, t4, c02 + FADD4 t5, t6, c09 + FADD2 t7, t8, c10 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + LDF [AO + 6 * SIZE], b1 + LDF [AO + 7 * SIZE], b2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FMUL a1, c09, t5 + FMUL a2, c10, t6 + FMUL a1, c10, t7 + FMUL a2, c09, t8 + + FADD4 t1, t2, c01 + FADD2 t3, t4, c02 + FADD4 t5, t6, c09 + FADD2 t7, t8, c10 + + FMUL a3, c01, t1 + FMUL a3, c02, t2 + FMUL a3, c09, t3 + FMUL a3, c10, t4 + + FMUL a4, c02, t5 + FMUL a4, c01, t6 + FMUL a4, c10, t7 + FMUL a4, c09, t8 + + FSUB c03, t1, c03 + FSUB c04, t2, c04 + FSUB c11, t3, c11 + FSUB c12, t4, c12 + + FADD2 c03, t5, c03 + FADD4 c04, t6, c04 + FADD2 c11, t7, c11 + FADD4 c12, t8, c12 + + FMUL b1, c03, t1 + FMUL b2, c04, t2 + FMUL b1, c04, t3 + FMUL b2, c03, t4 + + FMUL b1, c11, t5 + FMUL b2, c12, t6 + FMUL b1, c12, t7 + FMUL b2, c11, t8 + + FADD4 t1, t2, c03 + FADD2 t3, t4, c04 + FADD4 t5, t6, c11 + FADD2 t7, t8, c12 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + LDF [BO + 6 * SIZE], b1 + LDF [BO + 7 * SIZE], b2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FMUL a1, c03, t5 + FMUL a2, c04, t6 + FMUL a1, c04, t7 + FMUL a2, c03, t8 + + FADD4 t1, t2, c01 + FADD3 t3, t4, c02 + FADD4 t5, t6, c03 + FADD3 t7, t8, c04 + + FMUL a3, c01, t1 + FMUL a3, c02, t2 + FMUL a3, c03, t3 + FMUL a3, c04, t4 + + FMUL a4, c02, t5 + FMUL a4, c01, t6 + FMUL a4, c04, t7 + FMUL a4, c03, t8 + + FSUB c09, t1, c09 + FSUB c10, t2, c10 + FSUB c11, t3, c11 + FSUB c12, t4, c12 + + FADD3 c09, t5, c09 + FADD4 c10, t6, c10 + FADD3 c11, t7, c11 + FADD4 c12, t8, c12 + + FMUL b1, c09, t1 + FMUL b2, c10, t2 + FMUL b1, c10, t3 + FMUL b2, c09, t4 + + FMUL b1, c11, t5 + FMUL b2, c12, t6 + FMUL b1, c12, t7 + FMUL b2, c11, t8 + + FADD4 t1, t2, c09 + FADD3 t3, t4, c10 + FADD4 t5, t6, c11 + FADD3 t7, t8, c12 +#endif + +#ifdef RT + LDF [BO + 6 * SIZE], a1 + LDF [BO + 7 * SIZE], a2 + LDF [BO + 4 * SIZE], a3 + LDF [BO + 5 * SIZE], a4 + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + + FMUL a1, c09, t1 + FMUL a2, c10, t2 + FMUL a1, c10, t3 + FMUL a2, c09, t4 + + FMUL a1, c11, t5 + FMUL a2, c12, t6 + FMUL a1, c12, t7 + FMUL a2, c11, t8 + + FADD4 t1, t2, c09 + FADD3 t3, t4, c10 + FADD4 t5, t6, c11 + FADD3 t7, t8, c12 + + FMUL a3, c09, t1 + FMUL a3, c10, t2 + FMUL a3, c11, t3 + FMUL a3, c12, t4 + + FMUL a4, c10, t5 + FMUL a4, c09, t6 
+ FMUL a4, c12, t7 + FMUL a4, c11, t8 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + FSUB c03, t3, c03 + FSUB c04, t4, c04 + + FADD3 c01, t5, c01 + FADD4 c02, t6, c02 + FADD3 c03, t7, c03 + FADD4 c04, t8, c04 + + FMUL b1, c01, t1 + FMUL b2, c02, t2 + FMUL b1, c02, t3 + FMUL b2, c01, t4 + + FMUL b1, c03, t5 + FMUL b2, c04, t6 + FMUL b1, c04, t7 + FMUL b2, c03, t8 + + FADD4 t1, t2, c01 + FADD3 t3, t4, c02 + FADD4 t5, t6, c03 + FADD3 t7, t8, c04 +#endif + +#ifdef LN + add C1, -4 * SIZE, C1 + add C2, -4 * SIZE, C2 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] + STF c09, [BO + 2 * SIZE] + STF c10, [BO + 3 * SIZE] + + STF c03, [BO + 4 * SIZE] + STF c04, [BO + 5 * SIZE] + STF c11, [BO + 6 * SIZE] + STF c12, [BO + 7 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c03, [AO + 2 * SIZE] + STF c04, [AO + 3 * SIZE] + + STF c09, [AO + 4 * SIZE] + STF c10, [AO + 5 * SIZE] + STF c11, [AO + 6 * SIZE] + STF c12, [AO + 7 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c03, [C1 + 2 * SIZE] + STF c04, [C1 + 3 * SIZE] + + STF c09, [C2 + 0 * SIZE] + STF c10, [C2 + 1 * SIZE] + STF c11, [C2 + 2 * SIZE] + STF c12, [C2 + 3 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 4 * SIZE, C1 + add C2, 4 * SIZE, C2 +#endif + +#ifdef RT + sll K, 1 + ZBASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 1 + ZBASE_SHIFT, TEMP1 + add AO, TEMP1, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 2, KK +#endif + +#ifdef LN + sub KK, 2, KK +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL21 + FMOV FZERO, c01 + +.LL50: + and M, 1, I + FMOV FZERO, c02 + cmp I, 0 + FMOV FZERO, t1 + ble,pn %icc, .LL99 + FMOV FZERO, c04 + +#if defined(LT) || defined(RN) + sra KK, 2, L + + mov B, BO + cmp L, 0 +#else + +#ifdef LN + sll K, 0 + ZBASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, 0 + ZBASE_SHIFT, TEMP1 + sll KK, 1 + ZBASE_SHIFT, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO + + sub K, KK, TEMP1 + + sra TEMP1, 2, L + cmp L, 0 +#endif + + LDF [AO + 0 * SIZE], a1 + FMOV FZERO, t2 + LDF [BO + 0 * SIZE], b1 + FMOV FZERO, c06 + LDF [AO + 1 * SIZE], a2 + FMOV FZERO, t3 + LDF [BO + 1 * SIZE], b2 + FMOV FZERO, c08 + LDF [AO + 2 * SIZE], a3 + FMOV FZERO, t4 + LDF [BO + 2 * SIZE], b3 + FMOV FZERO, c01 + LDF [AO + 3 * SIZE], a4 + FMOV FZERO, c03 + LDF [BO + 3 * SIZE], b4 + FMOV FZERO, c05 + + ble,pn %icc, .LL55 + FMOV FZERO, c07 + +.LL52: + FADD2 c02, t1, c02 + add AO, 8 * SIZE, AO + prefetch [AO + APREFETCHSIZE * SIZE], 0 + + FMUL a1, b1, t1 + add BO, 16 * SIZE, BO + + FADD4 c04, t2, c04 + add L, -1, L + FMUL a1, b2, t2 + + FADD2 c06, t3, c06 + cmp L, 0 + FMUL a1, b3, t3 + + FADD4 c08, t4, c08 + FMUL a1, b4, t4 + LDF [AO - 4 * SIZE], a1 + + FADD1 c01, t1, c01 + FMUL a2, b1, t1 + LDF [BO - 12 * SIZE], b1 + FADD3 c03, t2, c03 + FMUL a2, b2, t2 + LDF [BO - 11 * SIZE], b2 + + FADD1 c05, t3, c05 + FMUL a2, b3, t3 + LDF [BO - 10 * SIZE], b3 + FADD3 c07, t4, c07 + FMUL a2, b4, t4 + LDF [BO - 9 * SIZE], b4 + + FADD2 c02, t1, c02 + FMUL a3, b1, t1 + LDF [AO - 3 * SIZE], a2 + FADD4 c04, t2, c04 + FMUL a3, b2, t2 + + FADD2 c06, t3, c06 + FMUL a3, b3, t3 + FADD4 c08, t4, c08 + FMUL a3, b4, t4 + LDF [AO - 2 * SIZE], a3 + + FADD1 c01, t1, c01 + FMUL a4, b1, t1 + LDF [BO - 8 * SIZE], b1 + FADD3 c03, t2, c03 + FMUL a4, b2, t2 + LDF [BO - 7 * SIZE], b2 + + FADD1 c05, t3, c05 + FMUL a4, b3, t3 + LDF [BO - 6 * SIZE], b3 + FADD3 
c07, t4, c07 + FMUL a4, b4, t4 + LDF [BO - 5 * SIZE], b4 + + FADD2 c02, t1, c02 + FMUL a1, b1, t1 + LDF [AO - 1 * SIZE], a4 + FADD4 c04, t2, c04 + FMUL a1, b2, t2 + + FADD2 c06, t3, c06 + FMUL a1, b3, t3 + FADD4 c08, t4, c08 + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + FADD1 c01, t1, c01 + FMUL a2, b1, t1 + LDF [BO - 4 * SIZE], b1 + + FADD3 c03, t2, c03 + FMUL a2, b2, t2 + LDF [BO - 3 * SIZE], b2 + + FADD1 c05, t3, c05 + FMUL a2, b3, t3 + LDF [BO - 2 * SIZE], b3 + FADD3 c07, t4, c07 + FMUL a2, b4, t4 + LDF [BO - 1 * SIZE], b4 + + FADD2 c02, t1, c02 + FMUL a3, b1, t1 + LDF [AO + 1 * SIZE], a2 + FADD4 c04, t2, c04 + FMUL a3, b2, t2 + + FADD2 c06, t3, c06 + FMUL a3, b3, t3 + FADD4 c08, t4, c08 + FMUL a3, b4, t4 + LDF [AO + 2 * SIZE], a3 + + FADD1 c01, t1, c01 + FMUL a4, b1, t1 + LDF [BO + 0 * SIZE], b1 + FADD3 c03, t2, c03 + FMUL a4, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD1 c05, t3, c05 + FMUL a4, b3, t3 + LDF [BO + 2 * SIZE], b3 + FADD3 c07, t4, c07 + FMUL a4, b4, t4 + LDF [BO + 3 * SIZE], b4 + + bg,pt %icc, .LL52 + LDF [AO + 3 * SIZE], a4 + +.LL55: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + and TEMP1, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL59 + nop + +.LL56: + FADD2 c02, t1, c02 + add AO, 2 * SIZE, AO + FMUL a1, b1, t1 + add L, -1, L + + add BO, 4 * SIZE, BO + FADD4 c04, t2, c04 + cmp L, 0 + FMUL a1, b2, t2 + + FADD2 c06, t3, c06 + FMUL a1, b3, t3 + FADD4 c08, t4, c08 + FMUL a1, b4, t4 + LDF [AO + 0 * SIZE], a1 + + FADD1 c01, t1, c01 + FMUL a2, b1, t1 + LDF [BO + 0 * SIZE], b1 + FADD3 c03, t2, c03 + FMUL a2, b2, t2 + LDF [BO + 1 * SIZE], b2 + + FADD1 c05, t3, c05 + FMUL a2, b3, t3 + LDF [BO + 2 * SIZE], b3 + FADD3 c07, t4, c07 + FMUL a2, b4, t4 + LDF [BO + 3 * SIZE], b4 + + bg,pt %icc, .LL56 + LDF [AO + 1 * SIZE], a2 + +.LL59: +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 1, TEMP1 +#else + sub KK, 2, TEMP1 +#endif + sll TEMP1, 0 + ZBASE_SHIFT, TEMP2 + sll TEMP1, 1 + ZBASE_SHIFT, TEMP1 + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + + FADD2 c02, t1, c02 + FADD4 c04, t2, c04 + FADD2 c06, t3, c06 + FADD4 c08, t4, c08 + + FADD c01, c04, c01 + FADD c02, c03, c02 + FADD c05, c08, c05 + FADD c06, c07, c06 + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c05, c05 + FSUB a4, c06, c06 + +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c05, c05 + FSUB a4, c06, c06 +#endif + +#ifdef LN + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FMUL a1, c05, t5 + FMUL a2, c06, t6 + FMUL a1, c06, t7 + FMUL a2, c05, t8 + + FADD4 t1, t2, c01 + FADD2 t3, t4, c02 + FADD4 t5, t6, c05 + FADD2 t7, t8, c06 +#endif + +#ifdef LT + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FMUL a1, c05, t5 + FMUL a2, c06, t6 + FMUL a1, c06, t7 + FMUL a2, c05, t8 + + FADD4 t1, t2, c01 + FADD2 t3, t4, c02 + FADD4 t5, t6, c05 + FADD2 t7, t8, c06 +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + LDF [BO + 6 * SIZE], b1 + LDF [BO + 7 * SIZE], b2 + + FMUL a1, c01, t1 + FMUL a2, c02, t2 + FMUL a1, c02, t3 + FMUL a2, c01, t4 + + FADD4 t1, t2, c01 + FADD3 t3, t4, c02 + + FMUL a3, c01, t1 + FMUL a3, c02, t2 + FMUL a4, 
c02, t3 + FMUL a4, c01, t4 + + FSUB c05, t1, c05 + FSUB c06, t2, c06 + FADD3 c05, t3, c05 + FADD4 c06, t4, c06 + + FMUL b1, c05, t1 + FMUL b2, c06, t2 + FMUL b1, c06, t3 + FMUL b2, c05, t4 + + FADD4 t1, t2, c05 + FADD3 t3, t4, c06 +#endif + +#ifdef RT + LDF [BO + 6 * SIZE], a1 + LDF [BO + 7 * SIZE], a2 + LDF [BO + 4 * SIZE], a3 + LDF [BO + 5 * SIZE], a4 + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + + FMUL a1, c05, t1 + FMUL a2, c06, t2 + FMUL a1, c06, t3 + FMUL a2, c05, t4 + + FADD4 t1, t2, c05 + FADD3 t3, t4, c06 + + FMUL a3, c05, t1 + FMUL a3, c06, t2 + FMUL a4, c06, t3 + FMUL a4, c05, t4 + + FSUB c01, t1, c01 + FSUB c02, t2, c02 + FADD3 c01, t3, c01 + FADD4 c02, t4, c02 + + FMUL b1, c01, t1 + FMUL b2, c02, t2 + FMUL b1, c02, t3 + FMUL b2, c01, t4 + + FADD4 t1, t2, c01 + FADD3 t3, t4, c02 +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 + add C2, -2 * SIZE, C2 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] + STF c05, [BO + 2 * SIZE] + STF c06, [BO + 3 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c05, [AO + 2 * SIZE] + STF c06, [AO + 3 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c05, [C2 + 0 * SIZE] + STF c06, [C2 + 1 * SIZE] + + FMOV FZERO, t1 + FMOV FZERO, t2 + FMOV FZERO, t3 + FMOV FZERO, t4 + +#ifndef LN + add C1, 2 * SIZE, C1 + add C2, 2 * SIZE, C2 +#endif + +#ifdef RT + sll K, 0 + ZBASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, 0 + ZBASE_SHIFT, TEMP2 + sll TEMP1, 1 + ZBASE_SHIFT, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + +.LL99: +#ifdef LN + sll K, 1 + ZBASE_SHIFT, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 2, KK +#endif + +#ifdef RT + sub KK, 2, KK +#endif + + add J, -1, J + cmp J, 0 + bg,pt %icc, .LL11 + nop + +.LL999: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/sparc/ztrsm_kernel_RT_1x4.S b/kernel/sparc/ztrsm_kernel_RT_1x4.S new file mode 100644 index 0000000000..49d449ab92 --- /dev/null +++ b/kernel/sparc/ztrsm_kernel_RT_1x4.S @@ -0,0 +1,2132 @@ +/*********************************************************************/ +/* Copyright 2005-2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define APREFETCHSIZE 24 +#define APREFETCH_CATEGORY 0 + +#define M %i0 +#define N %i1 +#define K %i2 +#define A %i5 +#define B %i3 +#define C %i4 + +#define LDC %o0 +#define AO %o1 +#define BO %o2 +#define I %o3 +#define J %o4 +#define L %o5 + +#define C1 %l0 +#define C2 %l1 +#define C3 %l2 +#define C4 %l3 + +#define OFFSET %l4 +#define KK %l5 +#define TEMP1 %l6 +#define TEMP2 %l7 +#define AORIG %o7 + +#ifdef DOUBLE +#define c01 %f0 +#define c02 %f2 +#define c03 %f4 +#define c04 %f6 +#define c05 %f8 +#define c06 %f10 +#define c07 %f12 +#define c08 %f14 +#define c09 %f16 +#define c10 %f18 +#define c11 %f20 +#define c12 %f22 +#define c13 %f24 +#define c14 %f26 +#define c15 %f28 +#define c16 %f30 + +#define a1 %f32 +#define a2 %f34 +#define a3 %f36 +#define a4 %f38 +#define a5 %f40 + +#define b1 %f42 +#define b2 %f44 +#define b3 %f46 +#define b4 %f48 +#define b5 %f50 +#define b6 %f52 +#define b7 %f54 +#define b8 %f56 +#define b9 %f58 + +#define cc01 0 +#define cc02 2 +#define cc03 4 +#define cc04 6 +#define cc05 8 +#define cc06 10 +#define cc07 12 +#define cc08 14 +#define cc09 16 +#define cc10 18 +#define cc11 20 +#define cc12 22 +#define cc13 24 +#define cc14 26 +#define cc15 28 +#define cc16 30 + +#define aa1 1 +#define aa2 3 +#define aa3 5 +#define aa4 7 +#define aa5 9 + +#define bb1 11 +#define bb2 13 +#define bb3 15 +#define bb4 17 +#define bb5 19 +#define bb6 21 +#define bb7 23 +#define bb8 25 +#define bb9 27 +#else +#define c01 %f0 +#define c02 %f1 +#define c03 %f2 +#define c04 %f3 +#define c05 %f4 +#define c06 %f5 +#define c07 %f6 +#define c08 %f7 +#define c09 %f8 +#define c10 %f9 +#define c11 %f10 +#define c12 %f11 +#define c13 %f12 +#define c14 %f13 +#define c15 %f14 +#define c16 %f15 + +#define a1 %f16 +#define a2 %f17 +#define a3 %f18 +#define a4 %f19 +#define a5 %f20 + +#define b1 %f21 +#define b2 %f22 +#define b3 %f23 +#define b4 %f24 +#define b5 %f25 +#define b6 %f26 +#define b7 %f27 +#define b8 %f28 +#define b9 %f29 + +#define cc01 0 +#define cc02 1 +#define cc03 2 +#define cc04 3 +#define cc05 4 +#define cc06 5 +#define cc07 6 +#define cc08 7 +#define cc09 8 +#define cc10 9 +#define cc11 10 +#define cc12 11 +#define cc13 12 +#define cc14 13 +#define cc15 14 +#define cc16 15 + +#define aa1 16 +#define aa2 17 +#define aa3 18 +#define aa4 19 +#define aa5 20 + +#define bb1 21 +#define bb2 22 +#define bb3 23 +#define bb4 24 +#define bb5 25 +#define bb6 26 +#define bb7 27 +#define bb8 28 +#define bb9 29 + +#endif + +#ifndef CONJ +#define FMADD1 FMADD +#define FMADD2 FMADD +#define FMADD3 FMADD +#define FMADD4 FNMSUB +#else 
+#if defined(LN) || defined(LT) +#define FMADD1 FMADD +#define FMADD2 FNMSUB +#define FMADD3 FMADD +#define FMADD4 FMADD +#endif +#if defined(RN) || defined(RT) +#define FMADD1 FMADD +#define FMADD2 FMADD +#define FMADD3 FNMSUB +#define FMADD4 FMADD +#endif +#endif + + .register %g2, #scratch + .register %g3, #scratch + + PROLOGUE + SAVESP + +#ifndef __64BIT__ +#ifdef DOUBLE + ld [%sp + STACK_START + 32], A + ld [%sp + STACK_START + 36], B + ld [%sp + STACK_START + 40], C + ld [%sp + STACK_START + 44], LDC + ld [%sp + STACK_START + 48], OFFSET +#else + ld [%sp + STACK_START + 28], B + ld [%sp + STACK_START + 32], C + ld [%sp + STACK_START + 36], LDC + ld [%sp + STACK_START + 40], OFFSET +#endif +#else + ldx [%sp + STACK_START + 56], B + ldx [%sp + STACK_START + 64], C + ldx [%sp + STACK_START + 72], LDC + ldx [%sp + STACK_START + 80], OFFSET +#endif + + cmp M, 0 + ble,pn %icc, .LL999 + nop + + sll LDC, ZBASE_SHIFT, LDC + +#ifdef LN + smul M, K, TEMP1 + sll TEMP1, ZBASE_SHIFT, TEMP1 + add A, TEMP1, A + + sll M, ZBASE_SHIFT, TEMP1 + add C, TEMP1, C +#endif + +#ifdef RN + neg OFFSET, KK +#endif + +#ifdef RT + smul N, K, TEMP1 + sll TEMP1, ZBASE_SHIFT, TEMP1 + add B, TEMP1, B + + smul N, LDC, TEMP1 + add C, TEMP1, C + + sub N, OFFSET, KK +#endif + + and N, 1, J + cmp J, 0 + ble,pn %icc, .LL20 + nop + +#ifdef RT + sll K, ZBASE_SHIFT, TEMP1 + sub B, TEMP1, B +#endif + +#ifndef RT + mov C, C1 + add C, LDC, C +#else + sub C, LDC, C1 + sub C, LDC, C +#endif + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + mov M, I + .align 4 + +.LL32: +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, ZBASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT + 0, TEMP1 + + add AORIG, TEMP1, AO + add B, TEMP1, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + FCLR (cc01) + LDF [BO + 3 * SIZE], b4 + FCLR (cc02) + + LDF [BO + 4 * SIZE], b5 + FCLR (cc03) + LDF [BO + 5 * SIZE], b6 + FCLR (cc04) + LDF [BO + 6 * SIZE], b7 + FCLR (cc05) + LDF [BO + 7 * SIZE], b8 + FCLR (cc06) + + prefetch [C1 + 2 * SIZE], 3 + FCLR (cc07) + +#if defined(LT) || defined(RN) + sra KK, 2, L +#else + sub K, KK, L + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL35 + FCLR (cc08) + .align 4 + +.LL33: + FMADD1 (aa1, bb1, cc01, cc01) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + FMADD2 (aa2, bb1, cc02, cc02) + LDF [BO + 8 * SIZE], b1 + + FMADD3 (aa1, bb2, cc03, cc03) + LDF [AO + 4 * SIZE], a1 + FMADD4 (aa2, bb2, cc04, cc04) + LDF [AO + 5 * SIZE], a2 + + FMADD1 (aa3, bb3, cc01, cc01) + LDF [BO + 9 * SIZE], b2 + FMADD2 (aa4, bb3, cc02, cc02) + LDF [BO + 10 * SIZE], b3 + + FMADD3 (aa3, bb4, cc03, cc03) + LDF [AO + 6 * SIZE], a3 + FMADD4 (aa4, bb4, cc04, cc04) + LDF [AO + 7 * SIZE], a4 + + FMADD1 (aa1, bb5, cc01, cc01) + LDF [BO + 11 * SIZE], b4 + FMADD2 (aa2, bb5, cc02, cc02) + LDF [BO + 12 * SIZE], b5 + + FMADD3 (aa1, bb6, cc03, cc03) + LDF [AO + 8 * SIZE], a1 + FMADD4 (aa2, bb6, cc04, cc04) + LDF [AO + 9 * SIZE], a2 + + FMADD1 (aa3, bb7, cc01, cc01) + LDF [BO + 13 * SIZE], b6 + + FMADD2 (aa4, bb7, cc02, cc02) + LDF [BO + 14 * SIZE], b7 + + FMADD3 (aa3, bb8, cc03, cc03) + LDF [AO + 10 * SIZE], a3 + FMADD4 (aa4, bb8, cc04, cc04) + LDF [AO + 11 * SIZE], a4 + + add AO, 8 * SIZE, AO + add L, -1, L + add BO, 8 * SIZE, BO + cmp L, 0 + + bg,pt 
%icc, .LL33 + LDF [BO + 7 * SIZE], b8 + .align 4 + +.LL35: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + sub K, KK, L + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL38 + nop + .align 4 + +.LL37: + FMADD1 (aa1, bb1, cc01, cc01) + add L, -1, L + FMADD2 (aa2, bb1, cc02, cc02) + LDF [BO + 2 * SIZE], b1 + + FMADD3 (aa1, bb2, cc03, cc03) + LDF [AO + 2 * SIZE], a1 + FMADD4 (aa2, bb2, cc04, cc04) + LDF [AO + 3 * SIZE], a2 + + add AO, 2 * SIZE, AO + cmp L, 0 + add BO, 2 * SIZE, BO + bg,pt %icc, .LL37 + LDF [BO + 1 * SIZE], b2 + .align 4 + +.LL38: + FADD c01, c04, c01 + FADD c02, c03, c02 + +#if defined(LN) || defined(RT) + sub KK, 1, TEMP1 + + sll TEMP1, ZBASE_SHIFT, TEMP1 + + add AORIG, TEMP1, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 +#endif + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + +#if defined(LN) || defined(LT) + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 +#else + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 +#endif + + FMUL a1, c01, b1 + FMUL a2, c01, b2 + +#ifndef CONJ + FNMSUB (aa2, cc02, bb1, cc01) + FMADD (aa1, cc02, bb2, cc02) +#else + FMADD (aa2, cc02, bb1, cc01) + FMSUB (aa1, cc02, bb2, cc02) +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + +#ifndef LN + add C1, 2 * SIZE, C1 +#endif + +#ifdef RT + sll K, ZBASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, ZBASE_SHIFT, TEMP1 + add AO, TEMP1, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL32 + nop + +#ifdef LN + sll K, ZBASE_SHIFT, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 1, KK +#endif + +#ifdef RT + sub KK, 1, KK +#endif + .align 4 + +.LL20: + and N, 2, J + cmp J, 0 + ble,pn %icc, .LL30 + nop + +#ifdef RT + sll K, ZBASE_SHIFT + 1, TEMP1 + sub B, TEMP1, B +#endif + +#ifndef RT + mov C, C1 + add C, LDC, C2 + add C2, LDC, C +#else + sub C, LDC, C2 + sub C2, LDC, C1 + sub C2, LDC, C +#endif + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + mov M, I + .align 4 + +.LL22: +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, ZBASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT + 0, TEMP1 + sll KK, ZBASE_SHIFT + 1, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + LDF [BO + 3 * SIZE], b4 + LDF [BO + 4 * SIZE], b5 + FCLR (cc01) + + LDF [BO + 5 * SIZE], b6 + FCLR (cc02) + LDF [BO + 6 * SIZE], b7 + FCLR (cc03) + LDF [BO + 7 * SIZE], b8 + FCLR (cc04) + LDF [BO + 8 * SIZE], b9 + FCLR (cc05) + + prefetch [C1 + 2 * SIZE], 3 + FCLR (cc06) + prefetch [C2 + 2 * SIZE], 3 + FCLR (cc07) + +#if defined(LT) || defined(RN) + sra KK, 2, L +#else + sub K, KK, L + sra L, 2, L +#endif + cmp L, 0 + ble,pn %icc, .LL25 + FCLR (cc08) + .align 4 + +.LL23: + FMADD1 (aa1, bb1, cc01, cc01) + LDF [AO + 2 * SIZE], a3 + FMADD2 (aa2, bb1, cc02, cc02) + LDF [AO + 3 * SIZE], a4 + + 
FMADD3 (aa1, bb2, cc03, cc03) + LDF [BO + 16 * SIZE], b1 + FMADD4 (aa2, bb2, cc04, cc04) + LDF [BO + 9 * SIZE], b2 + + FMADD1 (aa1, bb3, cc05, cc05) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + FMADD2 (aa2, bb3, cc06, cc06) + add L, -1, L + + FMADD3 (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD1 (aa3, bb5, cc01, cc01) + LDF [AO + 4 * SIZE], a1 + FMADD2 (aa4, bb5, cc02, cc02) + LDF [AO + 5 * SIZE], a2 + + FMADD3 (aa3, bb6, cc03, cc03) + LDF [BO + 12 * SIZE], b5 + FMADD4 (aa4, bb6, cc04, cc04) + LDF [BO + 13 * SIZE], b6 + + FMADD1 (aa3, bb7, cc05, cc05) + cmp L, 0 + FMADD2 (aa4, bb7, cc06, cc06) + add AO, 8 * SIZE, AO + + FMADD3 (aa3, bb8, cc07, cc07) + LDF [BO + 14 * SIZE], b7 + FMADD4 (aa4, bb8, cc08, cc08) + LDF [BO + 15 * SIZE], b8 + + FMADD1 (aa1, bb9, cc01, cc01) + LDF [AO - 2 * SIZE], a3 + FMADD2 (aa2, bb9, cc02, cc02) + LDF [AO - 1 * SIZE], a4 + + FMADD3 (aa1, bb2, cc03, cc03) + LDF [BO + 24 * SIZE], b9 + FMADD4 (aa2, bb2, cc04, cc04) + LDF [BO + 17 * SIZE], b2 + + FMADD1 (aa1, bb3, cc05, cc05) + add BO, 16 * SIZE, BO + FMADD2 (aa2, bb3, cc06, cc06) + nop + + FMADD3 (aa1, bb4, cc07, cc07) + LDF [BO + 2 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 3 * SIZE], b4 + + FMADD1 (aa3, bb5, cc01, cc01) + LDF [AO + 0 * SIZE], a1 + FMADD2 (aa4, bb5, cc02, cc02) + LDF [AO + 1 * SIZE], a2 + FMADD3 (aa3, bb6, cc03, cc03) + LDF [BO + 4 * SIZE], b5 + FMADD4 (aa4, bb6, cc04, cc04) + LDF [BO + 5 * SIZE], b6 + + FMADD1 (aa3, bb7, cc05, cc05) + nop + FMADD2 (aa4, bb7, cc06, cc06) + LDF [BO + 6 * SIZE], b7 + + FMADD3 (aa3, bb8, cc07, cc07) + FMADD4 (aa4, bb8, cc08, cc08) + bg,pt %icc, .LL23 + LDF [BO + 7 * SIZE], b8 + .align 4 + +.LL25: +#if defined(LT) || defined(RN) + and KK, 3, L +#else + sub K, KK, L + and L, 3, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL28 + nop + .align 4 + +.LL27: + FMADD1 (aa1, bb1, cc01, cc01) + add L, -1, L + FMADD2 (aa2, bb1, cc02, cc02) + LDF [BO + 4 * SIZE], b1 + + FMADD3 (aa1, bb2, cc03, cc03) + add AO, 2 * SIZE, AO + FMADD4 (aa2, bb2, cc04, cc04) + LDF [BO + 5 * SIZE], b2 + + FMADD1 (aa1, bb3, cc05, cc05) + cmp L, 0 + FMADD2 (aa2, bb3, cc06, cc06) + LDF [BO + 6 * SIZE], b3 + + FMADD3 (aa1, bb4, cc07, cc07) + LDF [AO + 0 * SIZE], a1 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [AO + 1 * SIZE], a2 + + LDF [BO + 7 * SIZE], b4 + bg,pt %icc, .LL27 + add BO, 4 * SIZE, BO + .align 4 + +.LL28: + FADD c01, c04, c01 + FADD c02, c03, c02 + FADD c05, c08, c05 + FADD c06, c07, c06 + +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 1, TEMP1 +#else + sub KK, 2, TEMP1 +#endif + sll TEMP1, ZBASE_SHIFT + 0, TEMP2 + sll TEMP1, ZBASE_SHIFT + 1, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 +#endif + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c05, c05 + FSUB a4, c06, c06 + +#if defined(LN) || defined(LT) + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FMUL a1, c01, b1 + FMUL a2, c01, b2 + FMUL a1, c05, b3 + FMUL a2, c05, b4 + +#ifndef CONJ + FNMSUB (aa2, cc02, bb1, cc01) + FMADD (aa1, cc02, bb2, cc02) + FNMSUB (aa2, cc06, bb3, cc05) + FMADD (aa1, cc06, bb4, cc06) +#else + FMADD (aa2, cc02, bb1, cc01) + FMSUB (aa1, cc02, bb2, cc02) + FMADD (aa2, cc06, bb3, cc05) + FMSUB (aa1, cc06, bb4, cc06) +#endif +#endif + +#ifdef RN + 
LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + LDF [BO + 3 * SIZE], b4 + + FMUL b1, c01, a1 + FMUL b2, c01, a2 + +#ifndef CONJ + FNMSUB (bb2, cc02, aa1, cc01) + FMADD (bb1, cc02, aa2, cc02) +#else + FMADD (bb2, cc02, aa1, cc01) + FMSUB (bb1, cc02, aa2, cc02) +#endif + + FNMSUB (bb3, cc01, cc05, cc05) + FNMSUB (bb3, cc02, cc06, cc06) + +#ifndef CONJ + FMADD (bb4, cc02, cc05, cc05) + FNMSUB (bb4, cc01, cc06, cc06) +#else + FNMSUB (bb4, cc02, cc05, cc05) + FMADD (bb4, cc01, cc06, cc06) +#endif + + LDF [BO + 6 * SIZE], b1 + LDF [BO + 7 * SIZE], b2 + + FMUL b1, c05, a1 + FMUL b2, c05, a2 + +#ifndef CONJ + FNMSUB (bb2, cc06, aa1, cc05) + FMADD (bb1, cc06, aa2, cc06) +#else + FMADD (bb2, cc06, aa1, cc05) + FMSUB (bb1, cc06, aa2, cc06) +#endif +#endif + +#ifdef RT + LDF [BO + 6 * SIZE], b1 + LDF [BO + 7 * SIZE], b2 + LDF [BO + 4 * SIZE], b3 + LDF [BO + 5 * SIZE], b4 + + FMUL b1, c05, a1 + FMUL b2, c05, a2 + +#ifndef CONJ + FNMSUB (bb2, cc06, aa1, cc05) + FMADD (bb1, cc06, aa2, cc06) +#else + FMADD (bb2, cc06, aa1, cc05) + FMSUB (bb1, cc06, aa2, cc06) +#endif + + FNMSUB (bb3, cc05, cc01, cc01) + FNMSUB (bb3, cc06, cc02, cc02) + +#ifndef CONJ + FMADD (bb4, cc06, cc01, cc01) + FNMSUB (bb4, cc05, cc02, cc02) +#else + FNMSUB (bb4, cc06, cc01, cc01) + FMADD (bb4, cc05, cc02, cc02) +#endif + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + + FMUL b1, c01, a1 + FMUL b2, c01, a2 + +#ifndef CONJ + FNMSUB (bb2, cc02, aa1, cc01) + FMADD (bb1, cc02, aa2, cc02) +#else + FMADD (bb2, cc02, aa1, cc01) + FMSUB (bb1, cc02, aa2, cc02) +#endif +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 + add C2, -2 * SIZE, C2 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] + STF c05, [BO + 2 * SIZE] + STF c06, [BO + 3 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c05, [AO + 2 * SIZE] + STF c06, [AO + 3 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c05, [C2 + 0 * SIZE] + STF c06, [C2 + 1 * SIZE] + +#ifndef LN + add C1, 2 * SIZE, C1 + add C2, 2 * SIZE, C2 +#endif + +#ifdef RT + sll K, ZBASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, ZBASE_SHIFT + 0, TEMP2 + sll TEMP1, ZBASE_SHIFT + 1, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL22 + nop + +#ifdef LN + sll K, ZBASE_SHIFT + 1, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 2, KK +#endif + +#ifdef RT + sub KK, 2, KK +#endif + .align 4 + +.LL30: + sra N, 2, J + cmp J, 0 + ble,pn %icc, .LL999 + nop + .align 4 + +.LL11: +#ifdef RT + sll K, ZBASE_SHIFT + 2, TEMP1 + sub B, TEMP1, B +#endif + +#ifndef RT + mov C, C1 + add C, LDC, C2 + add C2, LDC, C3 + add C3, LDC, C4 + add C4, LDC, C +#else + sub C, LDC, C4 + sub C4, LDC, C3 + sub C3, LDC, C2 + sub C2, LDC, C1 + sub C2, LDC, C +#endif + +#ifdef LN + add M, OFFSET, KK +#endif + +#ifdef LT + mov OFFSET, KK +#endif + +#if defined(LN) || defined(RT) + mov A, AORIG +#else + mov A, AO +#endif + + mov M, I + .align 4 + +.LL12: +#if defined(LT) || defined(RN) + mov B, BO +#else +#ifdef LN + sll K, ZBASE_SHIFT, TEMP1 + sub AORIG, TEMP1, AORIG +#endif + + sll KK, ZBASE_SHIFT + 0, TEMP1 + sll KK, ZBASE_SHIFT + 2, TEMP2 + + add AORIG, TEMP1, AO + add B, TEMP2, BO +#endif + + LDF [AO + 0 * SIZE], a1 + FCLR (cc01) + LDF [AO + 1 * SIZE], a2 + FCLR (cc05) + 
LDF [AO + 8 * SIZE], a5 + FCLR (cc09) + LDF [BO + 0 * SIZE], b1 + FCLR (cc13) + + LDF [BO + 1 * SIZE], b2 + FCLR (cc02) + LDF [BO + 2 * SIZE], b3 + FCLR (cc06) + LDF [BO + 3 * SIZE], b4 + FCLR (cc10) + LDF [BO + 4 * SIZE], b5 + FCLR (cc14) + + LDF [BO + 5 * SIZE], b6 + FCLR (cc03) + LDF [BO + 6 * SIZE], b7 + FCLR (cc07) + LDF [BO + 7 * SIZE], b8 + FCLR (cc11) + LDF [BO + 8 * SIZE], b9 + FCLR (cc15) + + prefetch [C1 + 1 * SIZE], 3 + FCLR (cc04) + prefetch [C2 + 2 * SIZE], 3 + FCLR (cc08) + prefetch [C3 + 1 * SIZE], 3 + FCLR (cc12) + prefetch [C4 + 2 * SIZE], 3 + FCLR (cc16) + +#if defined(LT) || defined(RN) + sra KK, 3, L +#else + sub K, KK, L + sra L, 3, L +#endif + cmp L, 0 + ble,pn %icc, .LL15 + nop + .align 4 + +.LL13: + FMADD1 (aa1, bb1, cc01, cc01) + FMADD2 (aa2, bb1, cc02, cc02) + FMADD3 (aa1, bb2, cc03, cc03) + FMADD4 (aa2, bb2, cc04, cc04) + + FMADD1 (aa1, bb3, cc05, cc05) + LDF [BO + 16 * SIZE], b1 + FMADD2 (aa2, bb3, cc06, cc06) + LDF [BO + 9 * SIZE], b2 + + FMADD3 (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD1 (aa1, bb5, cc09, cc09) + LDF [AO + 2 * SIZE], a3 + FMADD2 (aa2, bb5, cc10, cc10) + LDF [AO + 3 * SIZE], a4 + + FMADD3 (aa1, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + FMADD4 (aa2, bb6, cc12, cc12) + nop + + FMADD1 (aa1, bb7, cc13, cc13) + LDF [BO + 12 * SIZE], b5 + FMADD2 (aa2, bb7, cc14, cc14) + LDF [BO + 13 * SIZE], b6 + + FMADD3 (aa1, bb8, cc15, cc15) + LDF [BO + 14 * SIZE], b7 + FMADD4 (aa2, bb8, cc16, cc16) + LDF [BO + 15 * SIZE], b8 + + FMADD1 (aa3, bb9, cc01, cc01) + FMADD2 (aa4, bb9, cc02, cc02) + FMADD3 (aa3, bb2, cc03, cc03) + FMADD4 (aa4, bb2, cc04, cc04) + + FMADD1 (aa3, bb3, cc05, cc05) + LDF [BO + 24 * SIZE], b9 + FMADD2 (aa4, bb3, cc06, cc06) + LDF [BO + 17 * SIZE], b2 + + FMADD3 (aa3, bb4, cc07, cc07) + LDF [BO + 18 * SIZE], b3 + FMADD4 (aa4, bb4, cc08, cc08) + LDF [BO + 19 * SIZE], b4 + + FMADD1 (aa3, bb5, cc09, cc09) + LDF [AO + 4 * SIZE], a1 + FMADD2 (aa4, bb5, cc10, cc10) + LDF [AO + 5 * SIZE], a2 + + FMADD3 (aa3, bb6, cc11, cc11) + add L, -1, L + FMADD4 (aa4, bb6, cc12, cc12) + nop + + FMADD1 (aa3, bb7, cc13, cc13) + LDF [BO + 20 * SIZE], b5 + FMADD2 (aa4, bb7, cc14, cc14) + LDF [BO + 21 * SIZE], b6 + + FMADD3 (aa3, bb8, cc15, cc15) + LDF [BO + 22 * SIZE], b7 + FMADD4 (aa4, bb8, cc16, cc16) + LDF [BO + 23 * SIZE], b8 + + FMADD1 (aa1, bb1, cc01, cc01) + FMADD2 (aa2, bb1, cc02, cc02) + FMADD3 (aa1, bb2, cc03, cc03) + FMADD4 (aa2, bb2, cc04, cc04) + + FMADD1 (aa1, bb3, cc05, cc05) + LDF [BO + 32 * SIZE], b1 + FMADD2 (aa2, bb3, cc06, cc06) + LDF [BO + 25 * SIZE], b2 + + FMADD3 (aa1, bb4, cc07, cc07) + LDF [BO + 26 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 27 * SIZE], b4 + + FMADD1 (aa1, bb5, cc09, cc09) + LDF [AO + 6 * SIZE], a3 + FMADD2 (aa2, bb5, cc10, cc10) + LDF [AO + 7 * SIZE], a4 + + FMADD3 (aa1, bb6, cc11, cc11) + nop + FMADD4 (aa2, bb6, cc12, cc12) + nop + + FMADD1 (aa1, bb7, cc13, cc13) + LDF [BO + 28 * SIZE], b5 + FMADD2 (aa2, bb7, cc14, cc14) + LDF [BO + 29 * SIZE], b6 + + FMADD3 (aa1, bb8, cc15, cc15) + LDF [BO + 30 * SIZE], b7 + FMADD4 (aa2, bb8, cc16, cc16) + LDF [BO + 31 * SIZE], b8 + + FMADD1 (aa3, bb9, cc01, cc01) + FMADD2 (aa4, bb9, cc02, cc02) + FMADD3 (aa3, bb2, cc03, cc03) + FMADD4 (aa4, bb2, cc04, cc04) + + FMADD1 (aa3, bb3, cc05, cc05) + LDF [BO + 40 * SIZE], b9 + FMADD2 (aa4, bb3, cc06, cc06) + LDF [BO + 33 * SIZE], b2 + + FMADD3 (aa3, bb4, cc07, cc07) + LDF [BO + 34 * SIZE], b3 + FMADD4 (aa4, bb4, cc08, 
cc08) + LDF [BO + 35 * SIZE], b4 + + FMADD1 (aa3, bb5, cc09, cc09) + LDF [AO + 16 * SIZE], a1 /****/ + FMADD2 (aa4, bb5, cc10, cc10) + LDF [AO + 9 * SIZE], a2 + + FMADD3 (aa3, bb6, cc11, cc11) + nop + FMADD4 (aa4, bb6, cc12, cc12) + nop + + FMADD1 (aa3, bb7, cc13, cc13) + LDF [BO + 36 * SIZE], b5 + FMADD2 (aa4, bb7, cc14, cc14) + LDF [BO + 37 * SIZE], b6 + + FMADD3 (aa3, bb8, cc15, cc15) + LDF [BO + 38 * SIZE], b7 + FMADD4 (aa4, bb8, cc16, cc16) + LDF [BO + 39 * SIZE], b8 + + FMADD1 (aa5, bb1, cc01, cc01) + FMADD2 (aa2, bb1, cc02, cc02) + FMADD3 (aa5, bb2, cc03, cc03) + FMADD4 (aa2, bb2, cc04, cc04) + + FMADD1 (aa5, bb3, cc05, cc05) + LDF [BO + 48 * SIZE], b1 + FMADD2 (aa2, bb3, cc06, cc06) + LDF [BO + 41 * SIZE], b2 + + FMADD3 (aa5, bb4, cc07, cc07) + LDF [BO + 42 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 43 * SIZE], b4 + + FMADD1 (aa5, bb5, cc09, cc09) + LDF [AO + 10 * SIZE], a3 + FMADD2 (aa2, bb5, cc10, cc10) + LDF [AO + 11 * SIZE], a4 + + FMADD3 (aa5, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY + FMADD4 (aa2, bb6, cc12, cc12) + nop + + FMADD1 (aa5, bb7, cc13, cc13) + LDF [BO + 44 * SIZE], b5 + FMADD2 (aa2, bb7, cc14, cc14) + LDF [BO + 45 * SIZE], b6 + + FMADD3 (aa5, bb8, cc15, cc15) + LDF [BO + 46 * SIZE], b7 + FMADD4 (aa2, bb8, cc16, cc16) + LDF [BO + 47 * SIZE], b8 + + FMADD1 (aa3, bb9, cc01, cc01) + FMADD2 (aa4, bb9, cc02, cc02) + FMADD3 (aa3, bb2, cc03, cc03) + FMADD4 (aa4, bb2, cc04, cc04) + + FMADD1 (aa3, bb3, cc05, cc05) + LDF [BO + 56 * SIZE], b9 + FMADD2 (aa4, bb3, cc06, cc06) + LDF [BO + 49 * SIZE], b2 + + FMADD3 (aa3, bb4, cc07, cc07) + LDF [BO + 50 * SIZE], b3 + FMADD4 (aa4, bb4, cc08, cc08) + LDF [BO + 51 * SIZE], b4 + + FMADD1 (aa3, bb5, cc09, cc09) + LDF [AO + 12 * SIZE], a5 + FMADD2 (aa4, bb5, cc10, cc10) + LDF [AO + 13 * SIZE], a2 + + FMADD3 (aa3, bb6, cc11, cc11) + cmp L, 0 + FMADD4 (aa4, bb6, cc12, cc12) + nop + + FMADD1 (aa3, bb7, cc13, cc13) + LDF [BO + 52 * SIZE], b5 + FMADD2 (aa4, bb7, cc14, cc14) + LDF [BO + 53 * SIZE], b6 + + FMADD3 (aa3, bb8, cc15, cc15) + LDF [BO + 54 * SIZE], b7 + FMADD4 (aa4, bb8, cc16, cc16) + LDF [BO + 55 * SIZE], b8 + + FMADD1 (aa5, bb1, cc01, cc01) + FMADD2 (aa2, bb1, cc02, cc02) + FMADD3 (aa5, bb2, cc03, cc03) + FMADD4 (aa2, bb2, cc04, cc04) + + FMADD1 (aa5, bb3, cc05, cc05) + LDF [BO + 64 * SIZE], b1 + FMADD2 (aa2, bb3, cc06, cc06) + LDF [BO + 57 * SIZE], b2 + + FMADD3 (aa5, bb4, cc07, cc07) + LDF [BO + 58 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 59 * SIZE], b4 + + FMADD1 (aa5, bb5, cc09, cc09) + LDF [AO + 14 * SIZE], a3 + FMADD2 (aa2, bb5, cc10, cc10) + LDF [AO + 15 * SIZE], a4 + + FMADD3 (aa5, bb6, cc11, cc11) + add BO, 64 * SIZE, BO + FMADD4 (aa2, bb6, cc12, cc12) + add AO, 16 * SIZE, AO + + FMADD1 (aa5, bb7, cc13, cc13) + LDF [BO - 4 * SIZE], b5 + FMADD2 (aa2, bb7, cc14, cc14) + LDF [BO - 3 * SIZE], b6 + + FMADD3 (aa5, bb8, cc15, cc15) + LDF [BO - 2 * SIZE], b7 + FMADD4 (aa2, bb8, cc16, cc16) + LDF [BO - 1 * SIZE], b8 + + FMADD1 (aa3, bb9, cc01, cc01) + FMADD2 (aa4, bb9, cc02, cc02) + FMADD3 (aa3, bb2, cc03, cc03) + FMADD4 (aa4, bb2, cc04, cc04) + + FMADD1 (aa3, bb3, cc05, cc05) + LDF [BO + 8 * SIZE], b9 + FMADD2 (aa4, bb3, cc06, cc06) + LDF [BO + 1 * SIZE], b2 + + FMADD3 (aa3, bb4, cc07, cc07) + LDF [BO + 2 * SIZE], b3 + FMADD4 (aa4, bb4, cc08, cc08) + LDF [BO + 3 * SIZE], b4 + + FMADD1 (aa3, bb5, cc09, cc09) + LDF [AO + 8 * SIZE], a5 /****/ + FMADD2 (aa4, bb5, cc10, cc10) + LDF [AO + 1 * SIZE], a2 + + FMADD3 (aa3, bb6, cc11, cc11) + FMADD4 (aa4, bb6, cc12, cc12) 
+ + FMADD1 (aa3, bb7, cc13, cc13) + LDF [BO + 4 * SIZE], b5 + FMADD2 (aa4, bb7, cc14, cc14) + LDF [BO + 5 * SIZE], b6 + + FMADD3 (aa3, bb8, cc15, cc15) + LDF [BO + 6 * SIZE], b7 + FMADD4 (aa4, bb8, cc16, cc16) + ble,pn %icc, .LL15 + LDF [BO + 7 * SIZE], b8 + + FMADD1 (aa1, bb1, cc01, cc01) + FMADD2 (aa2, bb1, cc02, cc02) + FMADD3 (aa1, bb2, cc03, cc03) + FMADD4 (aa2, bb2, cc04, cc04) + + FMADD1 (aa1, bb3, cc05, cc05) + LDF [BO + 16 * SIZE], b1 + FMADD2 (aa2, bb3, cc06, cc06) + LDF [BO + 9 * SIZE], b2 + + FMADD3 (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD1 (aa1, bb5, cc09, cc09) + LDF [AO + 2 * SIZE], a3 + FMADD2 (aa2, bb5, cc10, cc10) + LDF [AO + 3 * SIZE], a4 + + FMADD3 (aa1, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 0) * SIZE], APREFETCH_CATEGORY + FMADD4 (aa2, bb6, cc12, cc12) + nop + + FMADD1 (aa1, bb7, cc13, cc13) + LDF [BO + 12 * SIZE], b5 + FMADD2 (aa2, bb7, cc14, cc14) + LDF [BO + 13 * SIZE], b6 + + FMADD3 (aa1, bb8, cc15, cc15) + LDF [BO + 14 * SIZE], b7 + FMADD4 (aa2, bb8, cc16, cc16) + LDF [BO + 15 * SIZE], b8 + + FMADD1 (aa3, bb9, cc01, cc01) + FMADD2 (aa4, bb9, cc02, cc02) + FMADD3 (aa3, bb2, cc03, cc03) + FMADD4 (aa4, bb2, cc04, cc04) + + FMADD1 (aa3, bb3, cc05, cc05) + LDF [BO + 24 * SIZE], b9 + FMADD2 (aa4, bb3, cc06, cc06) + LDF [BO + 17 * SIZE], b2 + + FMADD3 (aa3, bb4, cc07, cc07) + LDF [BO + 18 * SIZE], b3 + FMADD4 (aa4, bb4, cc08, cc08) + LDF [BO + 19 * SIZE], b4 + + FMADD1 (aa3, bb5, cc09, cc09) + LDF [AO + 4 * SIZE], a1 + FMADD2 (aa4, bb5, cc10, cc10) + LDF [AO + 5 * SIZE], a2 + + FMADD3 (aa3, bb6, cc11, cc11) + add L, -1, L + FMADD4 (aa4, bb6, cc12, cc12) + nop + + FMADD1 (aa3, bb7, cc13, cc13) + LDF [BO + 20 * SIZE], b5 + FMADD2 (aa4, bb7, cc14, cc14) + LDF [BO + 21 * SIZE], b6 + + FMADD3 (aa3, bb8, cc15, cc15) + LDF [BO + 22 * SIZE], b7 + FMADD4 (aa4, bb8, cc16, cc16) + LDF [BO + 23 * SIZE], b8 + + FMADD1 (aa1, bb1, cc01, cc01) + FMADD2 (aa2, bb1, cc02, cc02) + FMADD3 (aa1, bb2, cc03, cc03) + FMADD4 (aa2, bb2, cc04, cc04) + + FMADD1 (aa1, bb3, cc05, cc05) + LDF [BO + 32 * SIZE], b1 + FMADD2 (aa2, bb3, cc06, cc06) + LDF [BO + 25 * SIZE], b2 + + FMADD3 (aa1, bb4, cc07, cc07) + LDF [BO + 26 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 27 * SIZE], b4 + + FMADD1 (aa1, bb5, cc09, cc09) + LDF [AO + 6 * SIZE], a3 + FMADD2 (aa2, bb5, cc10, cc10) + LDF [AO + 7 * SIZE], a4 + + FMADD3 (aa1, bb6, cc11, cc11) + nop + FMADD4 (aa2, bb6, cc12, cc12) + nop + + FMADD1 (aa1, bb7, cc13, cc13) + LDF [BO + 28 * SIZE], b5 + FMADD2 (aa2, bb7, cc14, cc14) + LDF [BO + 29 * SIZE], b6 + + FMADD3 (aa1, bb8, cc15, cc15) + LDF [BO + 30 * SIZE], b7 + FMADD4 (aa2, bb8, cc16, cc16) + LDF [BO + 31 * SIZE], b8 + + FMADD1 (aa3, bb9, cc01, cc01) + FMADD2 (aa4, bb9, cc02, cc02) + FMADD3 (aa3, bb2, cc03, cc03) + FMADD4 (aa4, bb2, cc04, cc04) + + FMADD1 (aa3, bb3, cc05, cc05) + LDF [BO + 40 * SIZE], b9 + FMADD2 (aa4, bb3, cc06, cc06) + LDF [BO + 33 * SIZE], b2 + + FMADD3 (aa3, bb4, cc07, cc07) + LDF [BO + 34 * SIZE], b3 + FMADD4 (aa4, bb4, cc08, cc08) + LDF [BO + 35 * SIZE], b4 + + FMADD1 (aa3, bb5, cc09, cc09) + LDF [AO + 16 * SIZE], a1 /****/ + FMADD2 (aa4, bb5, cc10, cc10) + LDF [AO + 9 * SIZE], a2 + + FMADD3 (aa3, bb6, cc11, cc11) + nop + FMADD4 (aa4, bb6, cc12, cc12) + nop + + FMADD1 (aa3, bb7, cc13, cc13) + LDF [BO + 36 * SIZE], b5 + FMADD2 (aa4, bb7, cc14, cc14) + LDF [BO + 37 * SIZE], b6 + + FMADD3 (aa3, bb8, cc15, cc15) + LDF [BO + 38 * SIZE], b7 + FMADD4 (aa4, bb8, cc16, cc16) + LDF [BO + 39 * SIZE], 
b8 + + FMADD1 (aa5, bb1, cc01, cc01) + FMADD2 (aa2, bb1, cc02, cc02) + FMADD3 (aa5, bb2, cc03, cc03) + FMADD4 (aa2, bb2, cc04, cc04) + + FMADD1 (aa5, bb3, cc05, cc05) + LDF [BO + 48 * SIZE], b1 + FMADD2 (aa2, bb3, cc06, cc06) + LDF [BO + 41 * SIZE], b2 + + FMADD3 (aa5, bb4, cc07, cc07) + LDF [BO + 42 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 43 * SIZE], b4 + + FMADD1 (aa5, bb5, cc09, cc09) + LDF [AO + 10 * SIZE], a3 + FMADD2 (aa2, bb5, cc10, cc10) + LDF [AO + 11 * SIZE], a4 + + FMADD3 (aa5, bb6, cc11, cc11) + prefetch [AO + (APREFETCHSIZE + 8) * SIZE], APREFETCH_CATEGORY + FMADD4 (aa2, bb6, cc12, cc12) + nop + + FMADD1 (aa5, bb7, cc13, cc13) + LDF [BO + 44 * SIZE], b5 + FMADD2 (aa2, bb7, cc14, cc14) + LDF [BO + 45 * SIZE], b6 + + FMADD3 (aa5, bb8, cc15, cc15) + LDF [BO + 46 * SIZE], b7 + FMADD4 (aa2, bb8, cc16, cc16) + LDF [BO + 47 * SIZE], b8 + + FMADD1 (aa3, bb9, cc01, cc01) + FMADD2 (aa4, bb9, cc02, cc02) + FMADD3 (aa3, bb2, cc03, cc03) + FMADD4 (aa4, bb2, cc04, cc04) + + FMADD1 (aa3, bb3, cc05, cc05) + LDF [BO + 56 * SIZE], b9 + FMADD2 (aa4, bb3, cc06, cc06) + LDF [BO + 49 * SIZE], b2 + + FMADD3 (aa3, bb4, cc07, cc07) + LDF [BO + 50 * SIZE], b3 + FMADD4 (aa4, bb4, cc08, cc08) + LDF [BO + 51 * SIZE], b4 + + FMADD1 (aa3, bb5, cc09, cc09) + LDF [AO + 12 * SIZE], a5 + FMADD2 (aa4, bb5, cc10, cc10) + LDF [AO + 13 * SIZE], a2 + + FMADD3 (aa3, bb6, cc11, cc11) + cmp L, 0 + FMADD4 (aa4, bb6, cc12, cc12) + nop + + FMADD1 (aa3, bb7, cc13, cc13) + LDF [BO + 52 * SIZE], b5 + FMADD2 (aa4, bb7, cc14, cc14) + LDF [BO + 53 * SIZE], b6 + + FMADD3 (aa3, bb8, cc15, cc15) + LDF [BO + 54 * SIZE], b7 + FMADD4 (aa4, bb8, cc16, cc16) + LDF [BO + 55 * SIZE], b8 + + FMADD1 (aa5, bb1, cc01, cc01) + FMADD2 (aa2, bb1, cc02, cc02) + FMADD3 (aa5, bb2, cc03, cc03) + FMADD4 (aa2, bb2, cc04, cc04) + + FMADD1 (aa5, bb3, cc05, cc05) + LDF [BO + 64 * SIZE], b1 + FMADD2 (aa2, bb3, cc06, cc06) + LDF [BO + 57 * SIZE], b2 + + FMADD3 (aa5, bb4, cc07, cc07) + LDF [BO + 58 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 59 * SIZE], b4 + + FMADD1 (aa5, bb5, cc09, cc09) + LDF [AO + 14 * SIZE], a3 + FMADD2 (aa2, bb5, cc10, cc10) + LDF [AO + 15 * SIZE], a4 + + FMADD3 (aa5, bb6, cc11, cc11) + add BO, 64 * SIZE, BO + FMADD4 (aa2, bb6, cc12, cc12) + add AO, 16 * SIZE, AO + + FMADD1 (aa5, bb7, cc13, cc13) + LDF [BO - 4 * SIZE], b5 + FMADD2 (aa2, bb7, cc14, cc14) + LDF [BO - 3 * SIZE], b6 + + FMADD3 (aa5, bb8, cc15, cc15) + LDF [BO - 2 * SIZE], b7 + FMADD4 (aa2, bb8, cc16, cc16) + LDF [BO - 1 * SIZE], b8 + + FMADD1 (aa3, bb9, cc01, cc01) + FMADD2 (aa4, bb9, cc02, cc02) + FMADD3 (aa3, bb2, cc03, cc03) + FMADD4 (aa4, bb2, cc04, cc04) + + FMADD1 (aa3, bb3, cc05, cc05) + LDF [BO + 8 * SIZE], b9 + FMADD2 (aa4, bb3, cc06, cc06) + LDF [BO + 1 * SIZE], b2 + + FMADD3 (aa3, bb4, cc07, cc07) + LDF [BO + 2 * SIZE], b3 + FMADD4 (aa4, bb4, cc08, cc08) + LDF [BO + 3 * SIZE], b4 + + FMADD1 (aa3, bb5, cc09, cc09) + LDF [AO + 8 * SIZE], a5 /****/ + FMADD2 (aa4, bb5, cc10, cc10) + LDF [AO + 1 * SIZE], a2 + + FMADD3 (aa3, bb6, cc11, cc11) + FMADD4 (aa4, bb6, cc12, cc12) + + FMADD1 (aa3, bb7, cc13, cc13) + LDF [BO + 4 * SIZE], b5 + FMADD2 (aa4, bb7, cc14, cc14) + LDF [BO + 5 * SIZE], b6 + + FMADD3 (aa3, bb8, cc15, cc15) + LDF [BO + 6 * SIZE], b7 + FMADD4 (aa4, bb8, cc16, cc16) + bg,pt %icc, .LL13 + LDF [BO + 7 * SIZE], b8 + .align 4 + +.LL15: +#if defined(LT) || defined(RN) + and KK, 7, L +#else + sub K, KK, L + and L, 7, L +#endif + cmp L, 0 + ble,a,pn %icc, .LL18 + nop + .align 4 + +.LL17: + FMADD1 (aa1, bb1, cc01, cc01) + add L, -1, 
L + FMADD2 (aa2, bb1, cc02, cc02) + nop + + FMADD3 (aa1, bb2, cc03, cc03) + LDF [BO + 8 * SIZE], b1 + FMADD4 (aa2, bb2, cc04, cc04) + LDF [BO + 9 * SIZE], b2 + + FMADD1 (aa1, bb3, cc05, cc05) + cmp L, 0 + FMADD2 (aa2, bb3, cc06, cc06) + nop + + FMADD3 (aa1, bb4, cc07, cc07) + LDF [BO + 10 * SIZE], b3 + FMADD4 (aa2, bb4, cc08, cc08) + LDF [BO + 11 * SIZE], b4 + + FMADD1 (aa1, bb5, cc09, cc09) + nop + FMADD2 (aa2, bb5, cc10, cc10) + nop + + FMADD3 (aa1, bb6, cc11, cc11) + LDF [BO + 12 * SIZE], b5 + FMADD4 (aa2, bb6, cc12, cc12) + LDF [BO + 13 * SIZE], b6 + + FMADD1 (aa1, bb7, cc13, cc13) + add AO, 2 * SIZE, AO + FMADD2 (aa2, bb7, cc14, cc14) + add BO, 8 * SIZE, BO + + FMADD3 (aa1, bb8, cc15, cc15) + LDF [AO + 0 * SIZE], a1 + FMADD4 (aa2, bb8, cc16, cc16) + LDF [AO + 1 * SIZE], a2 + + LDF [BO + 6 * SIZE], b7 + bg,pt %icc, .LL17 + LDF [BO + 7 * SIZE], b8 + nop + .align 4 + +.LL18: + FADD c01, c04, c01 + FADD c02, c03, c02 + FADD c05, c08, c05 + FADD c06, c07, c06 + + FADD c09, c12, c09 + FADD c10, c11, c10 + FADD c13, c16, c13 + FADD c14, c15, c14 + +#if defined(LN) || defined(RT) +#ifdef LN + sub KK, 1, TEMP1 +#else + sub KK, 4, TEMP1 +#endif + sll TEMP1, ZBASE_SHIFT + 0, TEMP2 + sll TEMP1, ZBASE_SHIFT + 2, TEMP1 + + add AORIG, TEMP2, AO + add B, TEMP1, BO +#endif + +#if defined(LN) || defined(LT) + LDF [BO + 0 * SIZE], a1 + LDF [BO + 1 * SIZE], a2 + LDF [BO + 2 * SIZE], a3 + LDF [BO + 3 * SIZE], a4 + + LDF [BO + 4 * SIZE], b1 + LDF [BO + 5 * SIZE], b2 + LDF [BO + 6 * SIZE], b3 + LDF [BO + 7 * SIZE], b4 +#else + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + LDF [AO + 2 * SIZE], a3 + LDF [AO + 3 * SIZE], a4 + + LDF [AO + 4 * SIZE], b1 + LDF [AO + 5 * SIZE], b2 + LDF [AO + 6 * SIZE], b3 + LDF [AO + 7 * SIZE], b4 +#endif + + FSUB a1, c01, c01 + FSUB a2, c02, c02 + FSUB a3, c05, c05 + FSUB a4, c06, c06 + + FSUB b1, c09, c09 + FSUB b2, c10, c10 + FSUB b3, c13, c13 + FSUB b4, c14, c14 + +#if defined(LN) || defined(LT) + LDF [AO + 0 * SIZE], a1 + LDF [AO + 1 * SIZE], a2 + + FMUL a1, c01, b1 + FMUL a2, c01, b2 + FMUL a1, c05, b3 + FMUL a2, c05, b4 + FMUL a1, c09, b5 + FMUL a2, c09, b6 + FMUL a1, c13, b7 + FMUL a2, c13, b8 + +#ifndef CONJ + FNMSUB (aa2, cc02, bb1, cc01) + FMADD (aa1, cc02, bb2, cc02) + FNMSUB (aa2, cc06, bb3, cc05) + FMADD (aa1, cc06, bb4, cc06) + FNMSUB (aa2, cc10, bb5, cc09) + FMADD (aa1, cc10, bb6, cc10) + FNMSUB (aa2, cc14, bb7, cc13) + FMADD (aa1, cc14, bb8, cc14) +#else + FMADD (aa2, cc02, bb1, cc01) + FMSUB (aa1, cc02, bb2, cc02) + FMADD (aa2, cc06, bb3, cc05) + FMSUB (aa1, cc06, bb4, cc06) + FMADD (aa2, cc10, bb5, cc09) + FMSUB (aa1, cc10, bb6, cc10) + FMADD (aa2, cc14, bb7, cc13) + FMSUB (aa1, cc14, bb8, cc14) +#endif +#endif + +#ifdef RN + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + LDF [BO + 2 * SIZE], b3 + LDF [BO + 3 * SIZE], b4 + LDF [BO + 4 * SIZE], b5 + LDF [BO + 5 * SIZE], b6 + LDF [BO + 6 * SIZE], b7 + LDF [BO + 7 * SIZE], b8 + + FMUL b1, c01, a1 + FMUL b2, c01, a2 + +#ifndef CONJ + FNMSUB (bb2, cc02, aa1, cc01) + FMADD (bb1, cc02, aa2, cc02) +#else + FMADD (bb2, cc02, aa1, cc01) + FMSUB (bb1, cc02, aa2, cc02) +#endif + + FNMSUB (bb3, cc01, cc05, cc05) + FNMSUB (bb3, cc02, cc06, cc06) + FNMSUB (bb5, cc01, cc09, cc09) + FNMSUB (bb5, cc02, cc10, cc10) + FNMSUB (bb7, cc01, cc13, cc13) + FNMSUB (bb7, cc02, cc14, cc14) + +#ifndef CONJ + FMADD (bb4, cc02, cc05, cc05) + FNMSUB (bb4, cc01, cc06, cc06) + FMADD (bb6, cc02, cc09, cc09) + FNMSUB (bb6, cc01, cc10, cc10) + FMADD (bb8, cc02, cc13, cc13) + FNMSUB (bb8, cc01, cc14, cc14) +#else + FNMSUB (bb4, cc02, 
cc05, cc05) + FMADD (bb4, cc01, cc06, cc06) + FNMSUB (bb6, cc02, cc09, cc09) + FMADD (bb6, cc01, cc10, cc10) + FNMSUB (bb8, cc02, cc13, cc13) + FMADD (bb8, cc01, cc14, cc14) +#endif + + LDF [BO + 10 * SIZE], b1 + LDF [BO + 11 * SIZE], b2 + LDF [BO + 12 * SIZE], b3 + LDF [BO + 13 * SIZE], b4 + LDF [BO + 14 * SIZE], b5 + LDF [BO + 15 * SIZE], b6 + + FMUL b1, c05, a1 + FMUL b2, c05, a2 + +#ifndef CONJ + FNMSUB (bb2, cc06, aa1, cc05) + FMADD (bb1, cc06, aa2, cc06) +#else + FMADD (bb2, cc06, aa1, cc05) + FMSUB (bb1, cc06, aa2, cc06) +#endif + + FNMSUB (bb3, cc05, cc09, cc09) + FNMSUB (bb3, cc06, cc10, cc10) + FNMSUB (bb5, cc05, cc13, cc13) + FNMSUB (bb5, cc06, cc14, cc14) + +#ifndef CONJ + FMADD (bb4, cc06, cc09, cc09) + FNMSUB (bb4, cc05, cc10, cc10) + FMADD (bb6, cc06, cc13, cc13) + FNMSUB (bb6, cc05, cc14, cc14) +#else + FNMSUB (bb4, cc06, cc09, cc09) + FMADD (bb4, cc05, cc10, cc10) + FNMSUB (bb6, cc06, cc13, cc13) + FMADD (bb6, cc05, cc14, cc14) +#endif + + LDF [BO + 20 * SIZE], b1 + LDF [BO + 21 * SIZE], b2 + LDF [BO + 22 * SIZE], b3 + LDF [BO + 23 * SIZE], b4 + + FMUL b1, c09, a1 + FMUL b2, c09, a2 + +#ifndef CONJ + FNMSUB (bb2, cc10, aa1, cc09) + FMADD (bb1, cc10, aa2, cc10) +#else + FMADD (bb2, cc10, aa1, cc09) + FMSUB (bb1, cc10, aa2, cc10) +#endif + + FNMSUB (bb3, cc09, cc13, cc13) + FNMSUB (bb3, cc10, cc14, cc14) + +#ifndef CONJ + FMADD (bb4, cc10, cc13, cc13) + FNMSUB (bb4, cc09, cc14, cc14) +#else + FNMSUB (bb4, cc10, cc13, cc13) + FMADD (bb4, cc09, cc14, cc14) +#endif + + LDF [BO + 30 * SIZE], b1 + LDF [BO + 31 * SIZE], b2 + + FMUL b1, c13, a1 + FMUL b2, c13, a2 + +#ifndef CONJ + FNMSUB (bb2, cc14, aa1, cc13) + FMADD (bb1, cc14, aa2, cc14) +#else + FMADD (bb2, cc14, aa1, cc13) + FMSUB (bb1, cc14, aa2, cc14) +#endif +#endif + +#ifdef RT + LDF [BO + 30 * SIZE], b1 + LDF [BO + 31 * SIZE], b2 + LDF [BO + 28 * SIZE], b3 + LDF [BO + 29 * SIZE], b4 + LDF [BO + 26 * SIZE], b5 + LDF [BO + 27 * SIZE], b6 + LDF [BO + 24 * SIZE], b7 + LDF [BO + 25 * SIZE], b8 + + FMUL b1, c13, a1 + FMUL b2, c13, a2 + +#ifndef CONJ + FNMSUB (bb2, cc14, aa1, cc13) + FMADD (bb1, cc14, aa2, cc14) +#else + FMADD (bb2, cc14, aa1, cc13) + FMSUB (bb1, cc14, aa2, cc14) +#endif + + FNMSUB (bb3, cc13, cc09, cc09) + FNMSUB (bb3, cc14, cc10, cc10) + FNMSUB (bb5, cc13, cc05, cc05) + FNMSUB (bb5, cc14, cc06, cc06) + FNMSUB (bb7, cc13, cc01, cc01) + FNMSUB (bb7, cc14, cc02, cc02) + +#ifndef CONJ + FMADD (bb4, cc14, cc09, cc09) + FNMSUB (bb4, cc13, cc10, cc10) + FMADD (bb6, cc14, cc05, cc05) + FNMSUB (bb6, cc13, cc06, cc06) + FMADD (bb8, cc14, cc01, cc01) + FNMSUB (bb8, cc13, cc02, cc02) +#else + FNMSUB (bb4, cc14, cc09, cc09) + FMADD (bb4, cc13, cc10, cc10) + FNMSUB (bb6, cc14, cc05, cc05) + FMADD (bb6, cc13, cc06, cc06) + FNMSUB (bb8, cc14, cc01, cc01) + FMADD (bb8, cc13, cc02, cc02) +#endif + + LDF [BO + 20 * SIZE], b1 + LDF [BO + 21 * SIZE], b2 + LDF [BO + 18 * SIZE], b3 + LDF [BO + 19 * SIZE], b4 + LDF [BO + 16 * SIZE], b5 + LDF [BO + 17 * SIZE], b6 + + FMUL b1, c09, a1 + FMUL b2, c09, a2 + +#ifndef CONJ + FNMSUB (bb2, cc10, aa1, cc09) + FMADD (bb1, cc10, aa2, cc10) +#else + FMADD (bb2, cc10, aa1, cc09) + FMSUB (bb1, cc10, aa2, cc10) +#endif + + FNMSUB (bb3, cc09, cc05, cc05) + FNMSUB (bb3, cc10, cc06, cc06) + FNMSUB (bb5, cc09, cc01, cc01) + FNMSUB (bb5, cc10, cc02, cc02) + +#ifndef CONJ + FMADD (bb4, cc10, cc05, cc05) + FNMSUB (bb4, cc09, cc06, cc06) + FMADD (bb6, cc10, cc01, cc01) + FNMSUB (bb6, cc09, cc02, cc02) +#else + FNMSUB (bb4, cc10, cc05, cc05) + FMADD (bb4, cc09, cc06, cc06) + FNMSUB (bb6, cc10, cc01, cc01) + 
FMADD (bb6, cc09, cc02, cc02) +#endif + + LDF [BO + 10 * SIZE], b1 + LDF [BO + 11 * SIZE], b2 + LDF [BO + 8 * SIZE], b3 + LDF [BO + 9 * SIZE], b4 + + FMUL b1, c05, a1 + FMUL b2, c05, a2 + +#ifndef CONJ + FNMSUB (bb2, cc06, aa1, cc05) + FMADD (bb1, cc06, aa2, cc06) +#else + FMADD (bb2, cc06, aa1, cc05) + FMSUB (bb1, cc06, aa2, cc06) +#endif + + FNMSUB (bb3, cc05, cc01, cc01) + FNMSUB (bb3, cc06, cc02, cc02) + +#ifndef CONJ + FMADD (bb4, cc06, cc01, cc01) + FNMSUB (bb4, cc05, cc02, cc02) +#else + FNMSUB (bb4, cc06, cc01, cc01) + FMADD (bb4, cc05, cc02, cc02) +#endif + + LDF [BO + 0 * SIZE], b1 + LDF [BO + 1 * SIZE], b2 + + FMUL b1, c01, a1 + FMUL b2, c01, a2 + +#ifndef CONJ + FNMSUB (bb2, cc02, aa1, cc01) + FMADD (bb1, cc02, aa2, cc02) +#else + FMADD (bb2, cc02, aa1, cc01) + FMSUB (bb1, cc02, aa2, cc02) +#endif +#endif + +#ifdef LN + add C1, -2 * SIZE, C1 + add C2, -2 * SIZE, C2 + add C3, -2 * SIZE, C3 + add C4, -2 * SIZE, C4 +#endif + +#if defined(LN) || defined(LT) + STF c01, [BO + 0 * SIZE] + STF c02, [BO + 1 * SIZE] + STF c05, [BO + 2 * SIZE] + STF c06, [BO + 3 * SIZE] + + STF c09, [BO + 4 * SIZE] + STF c10, [BO + 5 * SIZE] + STF c13, [BO + 6 * SIZE] + STF c14, [BO + 7 * SIZE] +#else + STF c01, [AO + 0 * SIZE] + STF c02, [AO + 1 * SIZE] + STF c05, [AO + 2 * SIZE] + STF c06, [AO + 3 * SIZE] + + STF c09, [AO + 4 * SIZE] + STF c10, [AO + 5 * SIZE] + STF c13, [AO + 6 * SIZE] + STF c14, [AO + 7 * SIZE] +#endif + + STF c01, [C1 + 0 * SIZE] + STF c02, [C1 + 1 * SIZE] + STF c05, [C2 + 0 * SIZE] + STF c06, [C2 + 1 * SIZE] + + STF c09, [C3 + 0 * SIZE] + STF c10, [C3 + 1 * SIZE] + STF c13, [C4 + 0 * SIZE] + STF c14, [C4 + 1 * SIZE] + +#ifndef LN + add C1, 2 * SIZE, C1 + add C2, 2 * SIZE, C2 + add C3, 2 * SIZE, C3 + add C4, 2 * SIZE, C4 +#endif + +#ifdef RT + sll K, ZBASE_SHIFT, TEMP1 + add AORIG, TEMP1, AORIG +#endif + +#if defined(LT) || defined(RN) + sub K, KK, TEMP1 + sll TEMP1, ZBASE_SHIFT + 0, TEMP2 + sll TEMP1, ZBASE_SHIFT + 2, TEMP1 + add AO, TEMP2, AO + add BO, TEMP1, BO +#endif + +#ifdef LT + add KK, 1, KK +#endif + +#ifdef LN + sub KK, 1, KK +#endif + + add I, -1, I + cmp I, 0 + bg,pt %icc, .LL12 + nop + +#ifdef LN + sll K, ZBASE_SHIFT + 2, TEMP1 + add B, TEMP1, B +#endif + +#if defined(LT) || defined(RN) + mov BO, B +#endif + +#ifdef RN + add KK, 4, KK +#endif + +#ifdef RT + sub KK, 4, KK +#endif + + add J, -1, J + cmp J, 0 + bg,pt %icc, .LL11 + nop + .align 4 + +.LL999: + return %i7 + 8 + clr %o0 + + EPILOGUE diff --git a/kernel/x86/KERNEL b/kernel/x86/KERNEL new file mode 100644 index 0000000000..69becf69ff --- /dev/null +++ b/kernel/x86/KERNEL @@ -0,0 +1,398 @@ +GEMVDEP = ../l2param.h + +ifdef HAVE_SSE + +ifndef SAMAXKERNEL +SAMAXKERNEL = amax_sse.S +endif + +ifndef CAMAXKERNEL +CAMAXKERNEL = zamax_sse.S +endif + +ifndef SAMINKERNEL +SAMINKERNEL = amax_sse.S +endif + +ifndef CAMINKERNEL +CAMINKERNEL = zamax_sse.S +endif + +ifndef ISAMAXKERNEL +ISAMAXKERNEL = iamax_sse.S +endif + +ifndef ICAMAXKERNEL +ICAMAXKERNEL = izamax_sse.S +endif + +ifndef ISAMINKERNEL +ISAMINKERNEL = iamax_sse.S +endif + +ifndef ICAMINKERNEL +ICAMINKERNEL = izamax_sse.S +endif + +ifndef ISMAXKERNEL +ISMAXKERNEL = iamax_sse.S +endif + +ifndef ISMINKERNEL +ISMINKERNEL = iamax_sse.S +endif + +ifndef SMAXKERNEL +SMAXKERNEL = amax_sse.S +endif + +ifndef SMINKERNEL +SMINKERNEL = amax_sse.S +endif + +ifndef SASUMKERNEL +SASUMKERNEL = asum_sse.S +endif + +ifndef CASUMKERNEL +CASUMKERNEL = zasum_sse.S +endif + +ifndef SDOTKERNEL +SDOTKERNEL = dot_sse.S +endif + +ifndef CDOTKERNEL +CDOTKERNEL = zdot_sse.S +endif + +ifndef 
SCOPYKERNEL +SCOPYKERNEL = copy_sse.S +endif + +ifndef CCOPYKERNEL +CCOPYKERNEL = zcopy_sse.S +endif + +ifndef SSACALKERNEL +SSCALKERNEL = scal_sse.S +endif + +ifndef CSACALKERNEL +CSCALKERNEL = zscal_sse.S +endif + +ifndef SAXPYKERNEL +SAXPYKERNEL = axpy_sse.S +endif + +ifndef CAXPYKERNEL +CAXPYKERNEL = zaxpy_sse.S +endif + +ifndef SROTKERNEL +SROTKERNEL = rot_sse.S +endif + +ifndef CROTKERNEL +CROTKERNEL = zrot_sse.S +endif + +ifndef SSWAPKERNEL +SSWAPKERNEL = swap_sse.S +endif + +ifndef CSWAPKERNEL +CSWAPKERNEL = zswap_sse.S +endif + +ifndef SGEMVNKERNEL +SGEMVNKERNEL = gemv_n_sse.S +endif + +ifndef SGEMVTKERNEL +SGEMVTKERNEL = gemv_t_sse.S +endif + +ifndef CGEMVNKERNEL +CGEMVNKERNEL = zgemv_n_sse.S +endif + +ifndef CGEMVTKERNEL +CGEMVTKERNEL = zgemv_t_sse.S +endif + +endif + + +ifdef HAVE_SSE2 + +ifndef DAMAXKERNEL +DAMAXKERNEL = amax_sse2.S +endif + +ifndef ZAMAXKERNEL +ZAMAXKERNEL = zamax_sse2.S +endif + +ifndef DAMINKERNEL +DAMINKERNEL = amax_sse2.S +endif + +ifndef ZAMINKERNEL +ZAMINKERNEL = zamax_sse2.S +endif + +ifndef IDAMAXKERNEL +IDAMAXKERNEL = iamax_sse2.S +endif + +ifndef IZAMAXKERNEL +IZAMAXKERNEL = izamax_sse2.S +endif + +ifndef IDAMINKERNEL +IDAMINKERNEL = iamax_sse2.S +endif + +ifndef IZAMINKERNEL +IZAMINKERNEL = izamax_sse2.S +endif + +ifndef IDMAXKERNEL +IDMAXKERNEL = iamax_sse2.S +endif + +ifndef IDMINKERNEL +IDMINKERNEL = iamax_sse2.S +endif + +ifndef DMAXKERNEL +DMAXKERNEL = amax_sse2.S +endif + +ifndef DMINKERNEL +DMINKERNEL = amax_sse2.S +endif + +ifndef DDOTKERNEL +DDOTKERNEL = dot_sse2.S +endif + +ifndef ZDOTKERNEL +ZDOTKERNEL = zdot_sse2.S +endif + +ifndef DCOPYKERNEL +# DCOPYKERNEL = copy_sse2.S +endif + +ifndef ZCOPYKERNEL +ZCOPYKERNEL = zcopy_sse2.S +endif + +ifndef DSACALKERNEL +DSCALKERNEL = scal_sse2.S +endif + +ifndef ZSACALKERNEL +ZSCALKERNEL = zscal_sse2.S +endif + +ifndef DASUMKERNEL +DASUMKERNEL = asum_sse2.S +endif + +ifndef ZASUMKERNEL +ZASUMKERNEL = zasum_sse2.S +endif + +ifndef DAXPYKERNEL +DAXPYKERNEL = axpy_sse2.S +endif + +ifndef ZAXPYKERNEL +ZAXPYKERNEL = zaxpy_sse2.S +endif + +ifndef SNRM2KERNEL +SNRM2KERNEL = nrm2_sse.S +endif + +ifndef CNRM2KERNEL +CNRM2KERNEL = znrm2_sse.S +endif + +ifndef DROTKERNEL +DROTKERNEL = rot_sse2.S +endif + +ifndef ZROTKERNEL +ZROTKERNEL = zrot_sse2.S +endif + +ifndef DSWAPKERNEL +DSWAPKERNEL = swap_sse2.S +endif + +ifndef ZSWAPKERNEL +ZSWAPKERNEL = zswap_sse2.S +endif + +endif + + +ifndef SAMINKERNEL +SAMINKERNEL = amax.S +endif + +ifndef DAMINKERNEL +DAMINKERNEL = amax.S +endif + +ifndef QAMINKERNEL +QAMINKERNEL = amax.S +endif + +ifndef CAMINKERNEL +CAMINKERNEL = zamax.S +endif + +ifndef ZAMINKERNEL +ZAMINKERNEL = zamax.S +endif + +ifndef XAMINKERNEL +XAMINKERNEL = zamax.S +endif + +ifndef SMAXKERNEL +SMAXKERNEL = amax.S +endif + +ifndef DMAXKERNEL +DMAXKERNEL = amax.S +endif + +ifndef QMAXKERNEL +QMAXKERNEL = amax.S +endif + +ifndef SMINKERNEL +SMINKERNEL = amax.S +endif + +ifndef DMINKERNEL +DMINKERNEL = amax.S +endif + +ifndef QMINKERNEL +QMINKERNEL = amax.S +endif + +ifndef ISAMINKERNEL +ISAMINKERNEL = iamax.S +endif + +ifndef IDAMINKERNEL +IDAMINKERNEL = iamax.S +endif + +ifndef IQAMINKERNEL +IQAMINKERNEL = iamax.S +endif + +ifndef ICAMINKERNEL +ICAMINKERNEL = izamax.S +endif + +ifndef IZAMINKERNEL +IZAMINKERNEL = izamax.S +endif + +ifndef IXAMINKERNEL +IXAMINKERNEL = izamax.S +endif + +ifndef ISMINKERNEL +ISMINKERNEL = iamax.S +endif + +ifndef IDMINKERNEL +IDMINKERNEL = iamax.S +endif + +ifndef IQMINKERNEL +IQMINKERNEL = iamax.S +endif + +ifndef QDOTKERNEL +QDOTKERNEL = qdot.S +endif + +ifndef 
XDOTKERNEL +XDOTKERNEL = xdot.S +endif + +ifndef QAXPYKERNEL +QAXPYKERNEL = qaxpy.S +endif + +ifndef XAXPYKERNEL +XAXPYKERNEL = xaxpy.S +endif + +ifndef QGEMVNKERNEL +QGEMVNKERNEL = qgemv_n.S +endif + +ifndef QGEMVTKERNEL +QGEMVTKERNEL = qgemv_t.S +endif + +ifndef XGEMVNKERNEL +XGEMVNKERNEL = xgemv_n.S +endif + +ifndef XGEMVTKERNEL +XGEMVTKERNEL = xgemv_t.S +endif + +QGEMMKERNEL = qgemm_kernel_2x2.S +QGEMMINCOPY = +QGEMMITCOPY = +QGEMMONCOPY = ../generic/gemm_ncopy_2.c +QGEMMOTCOPY = ../generic/gemm_tcopy_2.c +QGEMMINCOPYOBJ = +QGEMMITCOPYOBJ = +QGEMMONCOPYOBJ = qgemm_oncopy$(TSUFFIX).$(SUFFIX) +QGEMMOTCOPYOBJ = qgemm_otcopy$(TSUFFIX).$(SUFFIX) + +XGEMMKERNEL = xgemm_kernel_1x1.S +XGEMMINCOPY = +XGEMMITCOPY = +XGEMMONCOPY = ../generic/zgemm_ncopy_1.c +XGEMMOTCOPY = ../generic/zgemm_tcopy_1.c +XGEMMINCOPYOBJ = +XGEMMITCOPYOBJ = +XGEMMONCOPYOBJ = xgemm_oncopy$(TSUFFIX).$(SUFFIX) +XGEMMOTCOPYOBJ = xgemm_otcopy$(TSUFFIX).$(SUFFIX) + +SGEMM_BETA = gemm_beta.S +DGEMM_BETA = gemm_beta.S +QGEMM_BETA = ../generic/gemm_beta.c +CGEMM_BETA = zgemm_beta.S +ZGEMM_BETA = zgemm_beta.S +XGEMM_BETA = ../generic/zgemm_beta.c + +QTRSMKERNEL_LN = qtrsm_kernel_LN_2x2.S +QTRSMKERNEL_LT = qtrsm_kernel_LT_2x2.S +QTRSMKERNEL_RN = qtrsm_kernel_LT_2x2.S +QTRSMKERNEL_RT = qtrsm_kernel_RT_2x2.S + +XTRSMKERNEL_LN = xtrsm_kernel_LT_1x1.S +XTRSMKERNEL_LT = xtrsm_kernel_LT_1x1.S +XTRSMKERNEL_RN = xtrsm_kernel_LT_1x1.S +XTRSMKERNEL_RT = xtrsm_kernel_LT_1x1.S + +XGEMM3MKERNEL = xgemm3m_kernel_2x2.S diff --git a/kernel/x86/KERNEL.ATHLON b/kernel/x86/KERNEL.ATHLON new file mode 100644 index 0000000000..30f1e32f48 --- /dev/null +++ b/kernel/x86/KERNEL.ATHLON @@ -0,0 +1,63 @@ +SGEMMKERNEL = gemm_kernel_2x4_3dnow.S +SGEMMINCOPY = ../generic/gemm_ncopy_2.c +SGEMMITCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_1x4.S +DGEMMINCOPY = ../generic/gemm_ncopy_1.c +DGEMMITCOPY = ../generic/gemm_tcopy_1.c +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_1x2_3dnow.S +CGEMMINCOPY = ../generic/zgemm_ncopy_1.c +CGEMMITCOPY = ../generic/zgemm_tcopy_1.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_1x2.S +ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c +ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = trsm_kernel_LT_1x4.S +DTRSMKERNEL_LT = 
trsm_kernel_LT_1x4.S +DTRSMKERNEL_RN = trsm_kernel_LT_1x4.S +DTRSMKERNEL_RT = trsm_kernel_RT_1x4.S + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ifdef HAVE_SSE +CGEMM3MKERNEL = zgemm3m_kernel_8x2_sse.S +CGEMM3MKERNEL = zgemm3m_kernel_1x4_athlon.S +endif + +ZGEMM3MKERNEL = zgemm3m_kernel_1x4_athlon.S diff --git a/kernel/x86/KERNEL.ATOM b/kernel/x86/KERNEL.ATOM new file mode 100644 index 0000000000..b0f6733508 --- /dev/null +++ b/kernel/x86/KERNEL.ATOM @@ -0,0 +1,59 @@ +SGEMMKERNEL = gemm_kernel_4x4_penryn.S +SGEMMINCOPY = +SGEMMITCOPY = +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_2x2_atom.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = gemm_ncopy_2.S +DGEMMOTCOPY = gemm_tcopy_2.S +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_2x2_penryn.S +CGEMMINCOPY = +CGEMMITCOPY = +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMINCOPYOBJ = +CGEMMITCOPYOBJ = +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_1x1_atom.S +ZGEMMINCOPY = +ZGEMMITCOPY = +ZGEMMONCOPY = ../generic/zgemm_ncopy_1.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_1.c +ZGEMMINCOPYOBJ = +ZGEMMITCOPYOBJ = +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_4x4_penryn.S +STRSMKERNEL_LT = trsm_kernel_LT_4x4_penryn.S +STRSMKERNEL_RN = trsm_kernel_LT_4x4_penryn.S +STRSMKERNEL_RT = trsm_kernel_RT_4x4_penryn.S + +DTRSMKERNEL_LN = trsm_kernel_LN_2x2_atom.S +DTRSMKERNEL_LT = trsm_kernel_LT_2x2_atom.S +DTRSMKERNEL_RN = trsm_kernel_LT_2x2_atom.S +DTRSMKERNEL_RT = trsm_kernel_RT_2x2_atom.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_penryn.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_penryn.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_penryn.S +CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_penryn.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x1_atom.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x1_atom.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x1_atom.S +ZTRSMKERNEL_RT = ztrsm_kernel_LT_1x1_atom.S + +CGEMM3MKERNEL = zgemm3m_kernel_4x4_penryn.S +ZGEMM3MKERNEL = zgemm3m_kernel_2x2_atom.S diff --git a/kernel/x86/KERNEL.BANIAS b/kernel/x86/KERNEL.BANIAS new file mode 100644 index 0000000000..22c02f09d0 --- /dev/null +++ b/kernel/x86/KERNEL.BANIAS @@ -0,0 +1,59 @@ +SGEMMKERNEL = gemm_kernel_8x2_sse.S +SGEMMINCOPY = ../generic/gemm_ncopy_8.c +SGEMMITCOPY = ../generic/gemm_tcopy_8.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_2x2.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = +DGEMMONCOPYOBJ = 
dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_4x1_sse.S +CGEMMINCOPY = ../generic/zgemm_ncopy_4.c +CGEMMITCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPY = ../generic/zgemm_ncopy_1.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_1.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_1x1.S +ZGEMMINCOPY = +ZGEMMITCOPY = +ZGEMMONCOPY = ../generic/zgemm_ncopy_1.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_1.c +ZGEMMINCOPYOBJ = +ZGEMMITCOPYOBJ = +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_8x2_sse.S +STRSMKERNEL_LT = trsm_kernel_LT_8x2_sse.S +STRSMKERNEL_RN = trsm_kernel_LT_8x2_sse.S +STRSMKERNEL_RT = trsm_kernel_RT_8x2_sse.S + +DTRSMKERNEL_LN = trsm_kernel_LN_2x2.S +DTRSMKERNEL_LT = trsm_kernel_LT_2x2.S +DTRSMKERNEL_RN = trsm_kernel_LT_2x2.S +DTRSMKERNEL_RT = trsm_kernel_RT_2x2.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_4x1_sse.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_4x1_sse.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_4x1_sse.S +CTRSMKERNEL_RT = ztrsm_kernel_LT_4x1_sse.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x1.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x1.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x1.S +ZTRSMKERNEL_RT = ztrsm_kernel_LT_1x1.S + +CGEMM3MKERNEL = zgemm3m_kernel_8x2_sse.S +ZGEMM3MKERNEL = zgemm3m_kernel_2x2_coppermine.S diff --git a/kernel/x86/KERNEL.BARCELONA b/kernel/x86/KERNEL.BARCELONA new file mode 100644 index 0000000000..231350a620 --- /dev/null +++ b/kernel/x86/KERNEL.BARCELONA @@ -0,0 +1,59 @@ +SGEMMKERNEL = gemm_kernel_4x4_barcelona.S +SGEMMINCOPY = +SGEMMITCOPY = +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_2x4_barcelona.S +DGEMMINCOPY = ../generic/gemm_ncopy_2.c +DGEMMITCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S +CGEMMINCOPY = +CGEMMITCOPY = +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMINCOPYOBJ = +CGEMMITCOPYOBJ = +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S +ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c +ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S +STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S +STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S +STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S + +DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse2.S +DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse2.S +DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse2.S +DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse2.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S 
+CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S +CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse2.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse2.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse2.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse2.S + +CGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S +ZGEMM3MKERNEL = zgemm3m_kernel_2x4_barcelona.S diff --git a/kernel/x86/KERNEL.COPPERMINE b/kernel/x86/KERNEL.COPPERMINE new file mode 100644 index 0000000000..22c02f09d0 --- /dev/null +++ b/kernel/x86/KERNEL.COPPERMINE @@ -0,0 +1,59 @@ +SGEMMKERNEL = gemm_kernel_8x2_sse.S +SGEMMINCOPY = ../generic/gemm_ncopy_8.c +SGEMMITCOPY = ../generic/gemm_tcopy_8.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_2x2.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_4x1_sse.S +CGEMMINCOPY = ../generic/zgemm_ncopy_4.c +CGEMMITCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPY = ../generic/zgemm_ncopy_1.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_1.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_1x1.S +ZGEMMINCOPY = +ZGEMMITCOPY = +ZGEMMONCOPY = ../generic/zgemm_ncopy_1.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_1.c +ZGEMMINCOPYOBJ = +ZGEMMITCOPYOBJ = +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_8x2_sse.S +STRSMKERNEL_LT = trsm_kernel_LT_8x2_sse.S +STRSMKERNEL_RN = trsm_kernel_LT_8x2_sse.S +STRSMKERNEL_RT = trsm_kernel_RT_8x2_sse.S + +DTRSMKERNEL_LN = trsm_kernel_LN_2x2.S +DTRSMKERNEL_LT = trsm_kernel_LT_2x2.S +DTRSMKERNEL_RN = trsm_kernel_LT_2x2.S +DTRSMKERNEL_RT = trsm_kernel_RT_2x2.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_4x1_sse.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_4x1_sse.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_4x1_sse.S +CTRSMKERNEL_RT = ztrsm_kernel_LT_4x1_sse.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x1.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x1.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x1.S +ZTRSMKERNEL_RT = ztrsm_kernel_LT_1x1.S + +CGEMM3MKERNEL = zgemm3m_kernel_8x2_sse.S +ZGEMM3MKERNEL = zgemm3m_kernel_2x2_coppermine.S diff --git a/kernel/x86/KERNEL.CORE2 b/kernel/x86/KERNEL.CORE2 new file mode 100644 index 0000000000..0c0659e500 --- /dev/null +++ b/kernel/x86/KERNEL.CORE2 @@ -0,0 +1,59 @@ +SGEMMKERNEL = gemm_kernel_8x2_core2.S +SGEMMINCOPY = ../generic/gemm_ncopy_8.c +SGEMMITCOPY = ../generic/gemm_tcopy_8.c +SGEMMONCOPY = gemm_ncopy_2.S +SGEMMOTCOPY = gemm_tcopy_2.S +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_4x2_core2.S +DGEMMINCOPY = gemm_ncopy_4_sse.S +DGEMMITCOPY = gemm_tcopy_4_sse.S +DGEMMONCOPY = gemm_ncopy_2_sse.S +DGEMMOTCOPY = gemm_tcopy_2_sse.S +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = 
dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_4x1_core2.S +CGEMMINCOPY = ../generic/zgemm_ncopy_4.c +CGEMMITCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPY = ../generic/zgemm_ncopy_1.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_1.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_2x1_core2.S +ZGEMMINCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMITCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_1.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_1.c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_8x2_sse.S +STRSMKERNEL_LT = trsm_kernel_LT_8x2_sse.S +STRSMKERNEL_RN = trsm_kernel_LT_8x2_sse.S +STRSMKERNEL_RT = trsm_kernel_RT_8x2_sse.S + +DTRSMKERNEL_LN = trsm_kernel_LN_4x2_core2.S +DTRSMKERNEL_LT = trsm_kernel_LT_4x2_core2.S +DTRSMKERNEL_RN = trsm_kernel_LT_4x2_core2.S +DTRSMKERNEL_RT = trsm_kernel_RT_4x2_core2.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_4x1_sse.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_4x1_sse.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_4x1_sse.S +CTRSMKERNEL_RT = ztrsm_kernel_LT_4x1_sse.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x1_core2.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x1_core2.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x1_core2.S +ZTRSMKERNEL_RT = ztrsm_kernel_LT_2x1_core2.S + +CGEMM3MKERNEL = zgemm3m_kernel_8x2_core2.S +ZGEMM3MKERNEL = zgemm3m_kernel_4x2_core2.S diff --git a/kernel/x86/KERNEL.DUNNINGTON b/kernel/x86/KERNEL.DUNNINGTON new file mode 100644 index 0000000000..08e35438f7 --- /dev/null +++ b/kernel/x86/KERNEL.DUNNINGTON @@ -0,0 +1,59 @@ +SGEMMKERNEL = gemm_kernel_4x4_penryn.S +SGEMMINCOPY = +SGEMMITCOPY = +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_2x4_penryn.S +DGEMMINCOPY = gemm_ncopy_2.S +DGEMMITCOPY = gemm_tcopy_2.S +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_2x2_penryn.S +CGEMMINCOPY = +CGEMMITCOPY = +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMINCOPYOBJ = +CGEMMITCOPYOBJ = +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_1x2_penryn.S +ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c +ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_4x4_penryn.S +STRSMKERNEL_LT = trsm_kernel_LT_4x4_penryn.S +STRSMKERNEL_RN = trsm_kernel_LT_4x4_penryn.S +STRSMKERNEL_RT = trsm_kernel_RT_4x4_penryn.S + +DTRSMKERNEL_LN = 
trsm_kernel_LN_2x4_penryn.S +DTRSMKERNEL_LT = trsm_kernel_LT_2x4_penryn.S +DTRSMKERNEL_RN = trsm_kernel_LT_2x4_penryn.S +DTRSMKERNEL_RT = trsm_kernel_RT_2x4_penryn.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_penryn.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_penryn.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_penryn.S +CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_penryn.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_penryn.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_penryn.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_penryn.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_penryn.S + +CGEMM3MKERNEL = zgemm3m_kernel_4x4_penryn.S +ZGEMM3MKERNEL = zgemm3m_kernel_2x4_penryn.S diff --git a/kernel/x86/KERNEL.KATMAI b/kernel/x86/KERNEL.KATMAI new file mode 100644 index 0000000000..93623e5964 --- /dev/null +++ b/kernel/x86/KERNEL.KATMAI @@ -0,0 +1 @@ +include $(KERNELDIR)/KERNEL.COPPERMINE diff --git a/kernel/x86/KERNEL.NANO b/kernel/x86/KERNEL.NANO new file mode 100644 index 0000000000..65b03ae50e --- /dev/null +++ b/kernel/x86/KERNEL.NANO @@ -0,0 +1 @@ +include $(KERNELDIR)/KERNEL.PENRYN diff --git a/kernel/x86/KERNEL.NEHALEM b/kernel/x86/KERNEL.NEHALEM new file mode 100644 index 0000000000..65b03ae50e --- /dev/null +++ b/kernel/x86/KERNEL.NEHALEM @@ -0,0 +1 @@ +include $(KERNELDIR)/KERNEL.PENRYN diff --git a/kernel/x86/KERNEL.NORTHWOOD b/kernel/x86/KERNEL.NORTHWOOD new file mode 100644 index 0000000000..ddf80e952a --- /dev/null +++ b/kernel/x86/KERNEL.NORTHWOOD @@ -0,0 +1,60 @@ +SGEMMKERNEL = gemm_kernel_8x2_sse.S +SGEMMINCOPY = ../generic/gemm_ncopy_8.c +SGEMMITCOPY = ../generic/gemm_tcopy_8.c +SGEMMONCOPY = gemm_ncopy_2.S +SGEMMOTCOPY = gemm_tcopy_2.S +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_4x2_sse2.S +DGEMMINCOPY = gemm_ncopy_4_sse.S +DGEMMITCOPY = gemm_tcopy_4_sse.S +DGEMMONCOPY = gemm_ncopy_2.S +DGEMMOTCOPY = gemm_tcopy_2.S +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_4x1_sse.S +CGEMMINCOPY = ../generic/zgemm_ncopy_4.c +CGEMMITCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPY = ../generic/zgemm_ncopy_1.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_1.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_2x1_sse2.S +ZGEMMINCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMITCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_1.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_1.c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_8x2_sse.S +STRSMKERNEL_LT = trsm_kernel_LT_8x2_sse.S +STRSMKERNEL_RN = trsm_kernel_LT_8x2_sse.S +STRSMKERNEL_RT = trsm_kernel_RT_8x2_sse.S + +DTRSMKERNEL_LN = trsm_kernel_LN_4x2_sse2.S +DTRSMKERNEL_LT = trsm_kernel_LT_4x2_sse2.S +DTRSMKERNEL_RN = trsm_kernel_LT_4x2_sse2.S +DTRSMKERNEL_RT = trsm_kernel_RT_4x2_sse2.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_4x1_sse.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_4x1_sse.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_4x1_sse.S +CTRSMKERNEL_RT = 
ztrsm_kernel_LT_4x1_sse.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x1_sse2.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x1_sse2.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x1_sse2.S +ZTRSMKERNEL_RT = ztrsm_kernel_LT_2x1_sse2.S + +CGEMM3MKERNEL = zgemm3m_kernel_8x2_sse.S +ZGEMM3MKERNEL = zgemm3m_kernel_4x2_northwood.S + diff --git a/kernel/x86/KERNEL.OPTERON b/kernel/x86/KERNEL.OPTERON new file mode 100644 index 0000000000..7b8b1373f7 --- /dev/null +++ b/kernel/x86/KERNEL.OPTERON @@ -0,0 +1,59 @@ +SGEMMKERNEL = gemm_kernel_4x4_sse.S +SGEMMINCOPY = +SGEMMITCOPY = +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_2x4_sse2.S +DGEMMINCOPY = ../generic/gemm_ncopy_2.c +DGEMMITCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_2x2_sse.S +CGEMMINCOPY = +CGEMMITCOPY = +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMINCOPYOBJ = +CGEMMITCOPYOBJ = +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_1x2_sse2.S +ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c +ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S +STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S +STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S +STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S + +DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse2.S +DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse2.S +DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse2.S +DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse2.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S +CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse2.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse2.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse2.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse2.S + +CGEMM3MKERNEL = zgemm3m_kernel_4x4_opteron.S +ZGEMM3MKERNEL = zgemm3m_kernel_2x4_opteron.S diff --git a/kernel/x86/KERNEL.OPTERON_SSE3 b/kernel/x86/KERNEL.OPTERON_SSE3 new file mode 100644 index 0000000000..05e7b252a2 --- /dev/null +++ b/kernel/x86/KERNEL.OPTERON_SSE3 @@ -0,0 +1 @@ +include $(KERNELDIR)/KERNEL.OPTERON diff --git a/kernel/x86/KERNEL.P5 b/kernel/x86/KERNEL.P5 new file mode 100644 index 0000000000..12de178e14 --- /dev/null +++ b/kernel/x86/KERNEL.P5 @@ -0,0 +1,2 @@ +include $(KERNELDIR)/KERNEL.P6 + diff --git a/kernel/x86/KERNEL.P6 b/kernel/x86/KERNEL.P6 new file mode 100644 index 0000000000..8a7500c17f --- /dev/null +++ b/kernel/x86/KERNEL.P6 @@ -0,0 +1,60 @@ +SGEMMKERNEL = gemm_kernel_2x2.S +SGEMMINCOPY = +SGEMMITCOPY = +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = 
sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_2x2.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_1x1.S +CGEMMINCOPY = +CGEMMITCOPY = +CGEMMONCOPY = ../generic/zgemm_ncopy_1.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_1.c +CGEMMINCOPYOBJ = +CGEMMITCOPYOBJ = +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_1x1.S +ZGEMMINCOPY = +ZGEMMITCOPY = +ZGEMMONCOPY = ../generic/zgemm_ncopy_1.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_1.c +ZGEMMINCOPYOBJ = +ZGEMMITCOPYOBJ = +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_2x2.S +STRSMKERNEL_LT = trsm_kernel_LT_2x2.S +STRSMKERNEL_RN = trsm_kernel_LT_2x2.S +STRSMKERNEL_RT = trsm_kernel_RT_2x2.S + +DTRSMKERNEL_LN = trsm_kernel_LN_2x2.S +DTRSMKERNEL_LT = trsm_kernel_LT_2x2.S +DTRSMKERNEL_RN = trsm_kernel_LT_2x2.S +DTRSMKERNEL_RT = trsm_kernel_RT_2x2.S + +CTRSMKERNEL_LN = ztrsm_kernel_LT_1x1.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_1x1.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_1x1.S +CTRSMKERNEL_RT = ztrsm_kernel_LT_1x1.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x1.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x1.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x1.S +ZTRSMKERNEL_RT = ztrsm_kernel_LT_1x1.S + +CGEMM3MKERNEL = zgemm3m_kernel_2x2_coppermine.S +ZGEMM3MKERNEL = zgemm3m_kernel_2x2_coppermine.S + diff --git a/kernel/x86/KERNEL.PENRYN b/kernel/x86/KERNEL.PENRYN new file mode 100644 index 0000000000..08e35438f7 --- /dev/null +++ b/kernel/x86/KERNEL.PENRYN @@ -0,0 +1,59 @@ +SGEMMKERNEL = gemm_kernel_4x4_penryn.S +SGEMMINCOPY = +SGEMMITCOPY = +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_2x4_penryn.S +DGEMMINCOPY = gemm_ncopy_2.S +DGEMMITCOPY = gemm_tcopy_2.S +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_2x2_penryn.S +CGEMMINCOPY = +CGEMMITCOPY = +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMINCOPYOBJ = +CGEMMITCOPYOBJ = +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_1x2_penryn.S +ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c +ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_4x4_penryn.S +STRSMKERNEL_LT = trsm_kernel_LT_4x4_penryn.S +STRSMKERNEL_RN = trsm_kernel_LT_4x4_penryn.S +STRSMKERNEL_RT = trsm_kernel_RT_4x4_penryn.S + +DTRSMKERNEL_LN = trsm_kernel_LN_2x4_penryn.S +DTRSMKERNEL_LT = trsm_kernel_LT_2x4_penryn.S +DTRSMKERNEL_RN = trsm_kernel_LT_2x4_penryn.S +DTRSMKERNEL_RT = 
trsm_kernel_RT_2x4_penryn.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_penryn.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_penryn.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_penryn.S +CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_penryn.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_penryn.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_penryn.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_penryn.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_penryn.S + +CGEMM3MKERNEL = zgemm3m_kernel_4x4_penryn.S +ZGEMM3MKERNEL = zgemm3m_kernel_2x4_penryn.S diff --git a/kernel/x86/KERNEL.PRESCOTT b/kernel/x86/KERNEL.PRESCOTT new file mode 100644 index 0000000000..355e00fcfc --- /dev/null +++ b/kernel/x86/KERNEL.PRESCOTT @@ -0,0 +1,59 @@ +SGEMMKERNEL = gemm_kernel_4x4_sse3.S +SGEMMINCOPY = +SGEMMITCOPY = +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_2x4_sse3.S +DGEMMINCOPY = gemm_ncopy_2.S +DGEMMITCOPY = gemm_tcopy_2.S +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_2x2_sse3.S +CGEMMINCOPY = +CGEMMITCOPY = +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMINCOPYOBJ = +CGEMMITCOPYOBJ = +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_1x2_sse3.S +ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c +ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S +STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S +STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S +STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S + +DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse3.S +DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse3.S +DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse3.S +DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse3.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S +CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse3.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse3.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse3.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse3.S + +CGEMM3MKERNEL = zgemm3m_kernel_4x4_prescott.S +ZGEMM3MKERNEL = zgemm3m_kernel_2x4_prescott.S diff --git a/kernel/x86/KERNEL.VIAC3 b/kernel/x86/KERNEL.VIAC3 new file mode 100644 index 0000000000..94ade284bb --- /dev/null +++ b/kernel/x86/KERNEL.VIAC3 @@ -0,0 +1 @@ +include $(KERNELDIR)/KERNEL.ATHLON diff --git a/kernel/x86/KERNEL.YONAH b/kernel/x86/KERNEL.YONAH new file mode 100644 index 0000000000..5b3ecaea67 --- /dev/null +++ b/kernel/x86/KERNEL.YONAH @@ -0,0 +1,59 @@ +SGEMMKERNEL = gemm_kernel_4x4_sse3.S +SGEMMINCOPY = +SGEMMITCOPY = +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = 
gemm_kernel_2x4_sse3.S +DGEMMINCOPY = ../generic/gemm_ncopy_2.c +DGEMMITCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_2x2_sse3.S +CGEMMINCOPY = +CGEMMITCOPY = +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMINCOPYOBJ = +CGEMMITCOPYOBJ = +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_1x2_sse3.S +ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c +ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S +STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S +STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S +STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S + +DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse3.S +DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse3.S +DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse3.S +DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse3.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S +CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse3.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse3.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse3.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse3.S + +CGEMM3MKERNEL = zgemm3m_kernel_4x4_prescott.S +ZGEMM3MKERNEL = zgemm3m_kernel_2x4_prescott.S diff --git a/kernel/x86/Makefile b/kernel/x86/Makefile new file mode 100644 index 0000000000..efae70d7b7 --- /dev/null +++ b/kernel/x86/Makefile @@ -0,0 +1,2 @@ +clean :: + diff --git a/kernel/x86/amax.S b/kernel/x86/amax.S new file mode 100644 index 0000000000..01c2bd60ef --- /dev/null +++ b/kernel/x86/amax.S @@ -0,0 +1,315 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 8 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) + + PROLOGUE + +#define M %ebx +#define INCX %esi +#define X %ecx +#define I %edx + +#ifndef USE_MIN +#define FMOV fcmovbe +#else +#define FMOV fcmovnbe +#endif + +#include "l1param.h" + + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_M, M + movl STACK_INCX, INCX + movl STACK_X, X + +#ifdef F_INTERFACE + movl (M), M + movl (INCX), INCX +#endif + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + sall $BASE_SHIFT, INCX + + fldz + + testl M, M + jle .L999 + testl INCX, INCX + jle .L999 + + fstp %st(0) + + FLD (X) +#ifdef USE_ABS + fabs +#endif + addl INCX, X + decl M + jle .L999 + + cmpl $SIZE, INCX + jne .L40 + + movl M, I + sarl $3, I + jle .L20 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + FLD 1 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + FLD 2 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + FLD 3 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + FLD 4 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + FLD 5 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + FLD 6 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + FLD 7 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + addl $8 * SIZE, X + + decl I + jg .L10 + ALIGN_4 + +.L20: + movl M, I + andl $7, I + jle .L999 + ALIGN_4 + + +.L21: + FLD 0 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + addl $1 * SIZE, X + decl I + jg .L21 + jmp .L999 + ALIGN_4 + +.L40: + movl M, I + sarl $3, I + jle .L60 + ALIGN_4 + +.L50: + FLD 0 * SIZE(X) + addl INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + FLD 0 * SIZE(X) + addl INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + FLD 0 * SIZE(X) + addl INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + FLD 0 * SIZE(X) + addl INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi 
%st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + FLD 0 * SIZE(X) + addl INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + FLD 0 * SIZE(X) + addl INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + FLD 0 * SIZE(X) + addl INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + FLD 0 * SIZE(X) + addl INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + decl I + jg .L50 + ALIGN_4 + +.L60: + movl M, I + andl $7, I + jle .L999 + ALIGN_4 + + +.L61: + FLD 0 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + addl INCX, X + decl I + jg .L61 + ALIGN_4 + +.L999: + popl %ebx + popl %esi + ret + + EPILOGUE diff --git a/kernel/x86/amax_sse.S b/kernel/x86/amax_sse.S new file mode 100644 index 0000000000..65792cf456 --- /dev/null +++ b/kernel/x86/amax_sse.S @@ -0,0 +1,510 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) + +#define M %ebx +#define X %ecx +#define INCX %edx +#define I %eax + +#ifdef USE_MIN +#define maxps minps +#define maxss minss +#endif + +#include "l1param.h" + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + +#ifdef F_INTERFACE + movl (M), M + movl (INCX), INCX +#endif + + xorps %xmm0, %xmm0 + leal (, INCX, SIZE), INCX + + testl M, M + jle .L999 + +#ifdef USE_ABS +#ifndef HAVE_SSE2 + subl $8, %esp + movl $0x7fffffff, (%esp) + movss (%esp), %xmm3 + shufps $0, %xmm3, %xmm3 + addl $8, %esp +#else + pcmpeqb %xmm3, %xmm3 + psrld $1, %xmm3 +#endif +#endif + + movss (X), %xmm0 + shufps $0, %xmm0, %xmm0 +#ifdef USE_ABS + andps %xmm3, %xmm0 +#endif + movaps %xmm0, %xmm1 + addl INCX, X + decl M + jle .L999 + + cmpl $SIZE, INCX + jne .L40 + + subl $-32 * SIZE, X + + cmpl $3, M + jle .L17 + + testl $SIZE, X + je .L05 + + movss -32 * SIZE(X), %xmm4 + addl $SIZE, X + shufps $0, %xmm4, %xmm4 +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxps %xmm4, %xmm0 + + decl M + ALIGN_3 + +.L05: + testl $2 * SIZE, X + je .L06 + + movsd -32 * SIZE(X), %xmm4 + addl $2 * SIZE, X + unpcklps %xmm4, %xmm4 +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxps %xmm4, %xmm1 + + subl $2, M + ALIGN_3 + +.L06: + movl M, I + sarl $5, I + jle .L15 + + movaps -32 * SIZE(X), %xmm4 + movaps -28 * SIZE(X), %xmm5 + movaps -24 * SIZE(X), %xmm6 + movaps -20 * SIZE(X), %xmm7 + + decl I + jle .L12 + ALIGN_4 + +.L11: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxps %xmm4, %xmm0 + movaps -16 * SIZE(X), %xmm4 + +#ifdef USE_ABS + andps %xmm3, %xmm5 +#endif + maxps %xmm5, %xmm1 + movaps -12 * SIZE(X), %xmm5 + +#ifdef USE_ABS + andps %xmm3, %xmm6 +#endif + maxps %xmm6, %xmm0 + movaps -8 * SIZE(X), %xmm6 + +#ifdef USE_ABS + andps %xmm3, %xmm7 +#endif + maxps %xmm7, %xmm1 + movaps -4 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxps %xmm4, %xmm0 + movaps 0 * SIZE(X), %xmm4 + +#ifdef USE_ABS + andps %xmm3, %xmm5 +#endif + maxps %xmm5, %xmm1 + movaps 4 * SIZE(X), %xmm5 + +#ifdef USE_ABS + andps %xmm3, %xmm6 +#endif + maxps %xmm6, %xmm0 + movaps 8 * SIZE(X), %xmm6 + +#ifdef USE_ABS + andps %xmm3, %xmm7 +#endif + maxps %xmm7, %xmm1 + movaps 12 * SIZE(X), %xmm7 + + subl $-32 * SIZE, X + decl I + jg .L11 + ALIGN_4 + +.L12: +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxps %xmm4, %xmm0 + movaps -16 * SIZE(X), %xmm4 + +#ifdef USE_ABS + andps %xmm3, %xmm5 +#endif + maxps %xmm5, %xmm1 + movaps -12 * SIZE(X), %xmm5 + +#ifdef USE_ABS + andps %xmm3, %xmm6 +#endif + maxps %xmm6, %xmm0 + movaps -8 * SIZE(X), %xmm6 + +#ifdef USE_ABS + andps %xmm3, %xmm7 +#endif + maxps %xmm7, %xmm1 + movaps -4 * SIZE(X), %xmm7 + +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxps %xmm4, %xmm0 + +#ifdef USE_ABS + andps %xmm3, %xmm5 +#endif + maxps %xmm5, %xmm1 + +#ifdef USE_ABS + andps %xmm3, %xmm6 +#endif + maxps %xmm6, %xmm0 + +#ifdef USE_ABS + andps %xmm3, %xmm7 +#endif + maxps %xmm7, %xmm1 + + subl $-32 * SIZE, X + ALIGN_3 + + +.L15: + testl $16, M + je .L16 + + movaps -32 * SIZE(X), %xmm4 +#ifdef USE_ABS 
+ andps %xmm3, %xmm4 +#endif + maxps %xmm4, %xmm0 + + movaps -28 * SIZE(X), %xmm5 +#ifdef USE_ABS + andps %xmm3, %xmm5 +#endif + maxps %xmm5, %xmm1 + + movaps -24 * SIZE(X), %xmm6 +#ifdef USE_ABS + andps %xmm3, %xmm6 +#endif + maxps %xmm6, %xmm0 + + movaps -20 * SIZE(X), %xmm7 +#ifdef USE_ABS + andps %xmm3, %xmm7 +#endif + maxps %xmm7, %xmm1 + + addl $16 * SIZE, X + ALIGN_3 + +.L16: + testl $8, M + je .L17 + + movaps -32 * SIZE(X), %xmm4 +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxps %xmm4, %xmm0 + + movaps -28 * SIZE(X), %xmm5 +#ifdef USE_ABS + andps %xmm3, %xmm5 +#endif + maxps %xmm5, %xmm1 + addl $8 * SIZE, X + ALIGN_3 + +.L17: + testl $4, M + je .L18 + + movaps -32 * SIZE(X), %xmm4 +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxps %xmm4, %xmm0 + addl $4 * SIZE, X + ALIGN_3 + +.L18: + testl $2, M + je .L19 + + movsd -32 * SIZE(X), %xmm4 + unpcklps %xmm4, %xmm4 +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxps %xmm4, %xmm1 + addl $2 * SIZE, X + ALIGN_3 + +.L19: + testl $1, M + je .L998 + + movss -32 * SIZE(X), %xmm4 +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxss %xmm4, %xmm0 + jmp .L998 + ALIGN_3 + +.L40: + movl M, I + sarl $3, I + jle .L45 + ALIGN_4 + +.L41: + movss (X), %xmm4 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxss %xmm4, %xmm0 + + movss (X), %xmm5 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm5 +#endif + maxss %xmm5, %xmm1 + + movss (X), %xmm6 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm6 +#endif + maxss %xmm6, %xmm0 + + movss (X), %xmm7 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm7 +#endif + maxss %xmm7, %xmm1 + + movss (X), %xmm4 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxss %xmm4, %xmm0 + + movss (X), %xmm5 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm5 +#endif + maxss %xmm5, %xmm1 + + movss (X), %xmm6 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm6 +#endif + maxss %xmm6, %xmm0 + + movss (X), %xmm7 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm7 +#endif + maxss %xmm7, %xmm1 + + decl I + jg .L41 + ALIGN_4 + +.L45: + testl $4, M + je .L46 + + movss (X), %xmm4 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxss %xmm4, %xmm0 + + movss (X), %xmm5 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm5 +#endif + maxss %xmm5, %xmm1 + + movss (X), %xmm6 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm6 +#endif + maxss %xmm6, %xmm0 + + movss (X), %xmm7 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm7 +#endif + maxss %xmm7, %xmm1 + ALIGN_3 + +.L46: + testl $2, M + je .L47 + + movss (X), %xmm4 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxss %xmm4, %xmm0 + + movss (X), %xmm5 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm5 +#endif + maxss %xmm5, %xmm1 + ALIGN_3 + +.L47: + testl $1, M + je .L998 + + movss (X), %xmm4 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxss %xmm4, %xmm0 + ALIGN_4 + +.L998: + maxps %xmm1, %xmm0 + movaps %xmm0, %xmm1 + movhlps %xmm0, %xmm0 + maxps %xmm1, %xmm0 + movaps %xmm0, %xmm1 + shufps $1, %xmm0, %xmm0 + maxss %xmm1, %xmm0 + ALIGN_4 + +.L999: + RESTOREREGISTERS + + subl $8, %esp + movss %xmm0, (%esp) + flds (%esp) + addl $8, %esp + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/amax_sse2.S b/kernel/x86/amax_sse2.S new file mode 100644 index 0000000000..ad56244b20 --- /dev/null +++ b/kernel/x86/amax_sse2.S @@ -0,0 +1,518 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. 
*/ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) + +#define M %ebx +#define X %ecx +#define INCX %edx +#define I %eax + +#ifdef USE_MIN +#define maxpd minpd +#define maxsd minsd +#endif + +#include "l1param.h" + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + + xorps %xmm0, %xmm0 + leal (, INCX, SIZE), INCX + + testl M, M + jle .L999 + +#ifdef USE_ABS + pcmpeqb %xmm3, %xmm3 + psrlq $1, %xmm3 +#endif + + movsd (X), %xmm0 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm0 +#endif + unpcklpd %xmm0, %xmm0 + movaps %xmm0, %xmm1 + decl M + jle .L999 + + cmpl $SIZE, INCX + jne .L40 + + subl $-16 * SIZE, X + + testl $SIZE, X + je .L05 + + movsd -16 * SIZE(X), %xmm4 +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + unpcklpd %xmm4, %xmm4 + maxpd %xmm4, %xmm0 + addl $SIZE, X + decl M + jle .L998 + ALIGN_3 + +.L05: + movl M, I + sarl $4, I + jle .L15 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + movaps -12 * SIZE(X), %xmm6 + movaps -10 * SIZE(X), %xmm7 + + decl I + jle .L12 + ALIGN_4 + +.L11: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxpd %xmm4, %xmm0 + movaps -8 * SIZE(X), %xmm4 + +#ifdef USE_ABS + andps %xmm3, %xmm5 +#endif + maxpd %xmm5, %xmm1 + movaps -6 * SIZE(X), %xmm5 + +#ifdef USE_ABS + andps %xmm3, %xmm6 +#endif + maxpd %xmm6, %xmm0 + movaps -4 * SIZE(X), %xmm6 + +#ifdef USE_ABS + andps %xmm3, %xmm7 +#endif + 
maxpd %xmm7, %xmm1 + movaps -2 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxpd %xmm4, %xmm0 + movaps 0 * SIZE(X), %xmm4 + +#ifdef USE_ABS + andps %xmm3, %xmm5 +#endif + maxpd %xmm5, %xmm1 + movaps 2 * SIZE(X), %xmm5 + +#ifdef USE_ABS + andps %xmm3, %xmm6 +#endif + maxpd %xmm6, %xmm0 + movaps 4 * SIZE(X), %xmm6 + +#ifdef USE_ABS + andps %xmm3, %xmm7 +#endif + maxpd %xmm7, %xmm1 + movaps 6 * SIZE(X), %xmm7 + + subl $-16 * SIZE, X + decl I + jg .L11 + ALIGN_4 + +.L12: +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxpd %xmm4, %xmm0 + movaps -8 * SIZE(X), %xmm4 + +#ifdef USE_ABS + andps %xmm3, %xmm5 +#endif + maxpd %xmm5, %xmm1 + movaps -6 * SIZE(X), %xmm5 + +#ifdef USE_ABS + andps %xmm3, %xmm6 +#endif + maxpd %xmm6, %xmm0 + movaps -4 * SIZE(X), %xmm6 + +#ifdef USE_ABS + andps %xmm3, %xmm7 +#endif + maxpd %xmm7, %xmm1 + movaps -2 * SIZE(X), %xmm7 + +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxpd %xmm4, %xmm0 + +#ifdef USE_ABS + andps %xmm3, %xmm5 +#endif + maxpd %xmm5, %xmm1 + +#ifdef USE_ABS + andps %xmm3, %xmm6 +#endif + maxpd %xmm6, %xmm0 + +#ifdef USE_ABS + andps %xmm3, %xmm7 +#endif + maxpd %xmm7, %xmm1 + + subl $-16 * SIZE, X + ALIGN_4 + +.L15: + testl $8, M + jle .L16 + + movaps -16 * SIZE(X), %xmm4 +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movaps -14 * SIZE(X), %xmm5 +#ifdef USE_ABS + andps %xmm3, %xmm5 +#endif + maxpd %xmm5, %xmm1 + + movaps -12 * SIZE(X), %xmm6 +#ifdef USE_ABS + andps %xmm3, %xmm6 +#endif + maxpd %xmm6, %xmm0 + + movaps -10 * SIZE(X), %xmm7 +#ifdef USE_ABS + andps %xmm3, %xmm7 +#endif + maxpd %xmm7, %xmm1 + addl $8 * SIZE, X + ALIGN_3 + +.L16: + testl $4, M + jle .L17 + + movaps -16 * SIZE(X), %xmm4 +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movaps -14 * SIZE(X), %xmm5 +#ifdef USE_ABS + andps %xmm3, %xmm5 +#endif + maxpd %xmm5, %xmm1 + + addl $4 * SIZE, X + ALIGN_3 + +.L17: + testl $2, M + jle .L18 + + movaps -16 * SIZE(X), %xmm4 +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxpd %xmm4, %xmm0 + addl $2 * SIZE, X + ALIGN_3 + +.L18: + testl $1, M + jle .L998 + + movsd -16 * SIZE(X), %xmm4 + unpcklpd %xmm4, %xmm4 +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxpd %xmm4, %xmm1 + jmp .L998 + ALIGN_3 + +.L40: + movl M, I + sarl $4, I + jle .L45 + ALIGN_4 + +.L41: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd (X), %xmm4 + addl INCX, X + movhps (X), %xmm4 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movsd (X), %xmm5 + addl INCX, X + movhps (X), %xmm5 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm5 +#endif + maxpd %xmm5, %xmm1 + + movsd (X), %xmm6 + addl INCX, X + movhps (X), %xmm6 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm6 +#endif + maxpd %xmm6, %xmm0 + + movsd (X), %xmm7 + addl INCX, X + movhps (X), %xmm7 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm7 +#endif + maxpd %xmm7, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movsd (X), %xmm4 + addl INCX, X + movhps (X), %xmm4 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movsd (X), %xmm5 + addl INCX, X + movhps (X), %xmm5 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm5 +#endif + maxpd %xmm5, %xmm1 + + movsd (X), %xmm6 + addl INCX, X + movhps (X), %xmm6 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm6 +#endif + maxpd %xmm6, %xmm0 + + movsd (X), 
%xmm7 + addl INCX, X + movhps (X), %xmm7 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm7 +#endif + maxpd %xmm7, %xmm1 + + decl I + jg .L41 + ALIGN_4 + +.L45: + andl $15, M + jle .L998 + + testl $8, M + je .L46 + + movsd (X), %xmm4 + addl INCX, X + movhps (X), %xmm4 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movsd (X), %xmm5 + addl INCX, X + movhps (X), %xmm5 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm5 +#endif + maxpd %xmm5, %xmm1 + + movsd (X), %xmm6 + addl INCX, X + movhps (X), %xmm6 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm6 +#endif + maxpd %xmm6, %xmm0 + + movsd (X), %xmm7 + addl INCX, X + movhps (X), %xmm7 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm7 +#endif + maxpd %xmm7, %xmm1 + ALIGN_3 + +.L46: + testl $4, M + je .L47 + + movsd (X), %xmm4 + addl INCX, X + movhps (X), %xmm4 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movsd (X), %xmm5 + addl INCX, X + movhps (X), %xmm5 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm5 +#endif + maxpd %xmm5, %xmm1 + ALIGN_3 + +.L47: + testl $2, M + je .L48 + + movsd (X), %xmm6 + addl INCX, X + movhps (X), %xmm6 + addl INCX, X +#ifdef USE_ABS + andps %xmm3, %xmm6 +#endif + maxpd %xmm6, %xmm0 + ALIGN_3 + +.L48: + testl $1, M + je .L998 + + movsd (X), %xmm7 + unpcklpd %xmm7, %xmm7 +#ifdef USE_ABS + andps %xmm3, %xmm7 +#endif + maxpd %xmm7, %xmm1 + ALIGN_4 + +.L998: + maxpd %xmm1, %xmm0 + movaps %xmm0, %xmm1 + unpckhpd %xmm0, %xmm0 + maxsd %xmm1, %xmm0 + ALIGN_4 + +.L999: + subl $8, %esp + movsd %xmm0, (%esp) + fldl (%esp) + addl $8, %esp + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/asum.S b/kernel/x86/asum.S new file mode 100644 index 0000000000..e1b0a6eb79 --- /dev/null +++ b/kernel/x86/asum.S @@ -0,0 +1,225 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 8 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) + +#define M %edx +#define X %ecx +#define INCX %esi + +#define I %eax + +#include "l1param.h" + + PROLOGUE + + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + +#ifdef F_INTERFACE + movl (M), M + movl (INCX), INCX +#endif + + fldz + testl M, M + jle .L999 + testl INCX, INCX + jle .L999 + + sall $BASE_SHIFT, INCX + fldz + fldz + fldz + cmpl $SIZE, INCX + jne .L40 + + movl M, I + sarl $3, I + jle .L20 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + FLD 2 * SIZE(X) + fabs + FLD 3 * SIZE(X) + fabs + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD 4 * SIZE(X) + fabs + FLD 5 * SIZE(X) + fabs + FLD 6 * SIZE(X) + fabs + FLD 7 * SIZE(X) + fabs + + addl $8 * SIZE, X + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decl I + jg .L10 + ALIGN_4 + +.L20: + movl M, I + andl $7, I + jle .L998 + ALIGN_4 + + +.L21: + FLD (X) + fabs + faddp %st,%st(1) + addl $1 * SIZE, X + decl I + jg .L21 + jmp .L998 + ALIGN_4 + +.L40: + movl M, I + sarl $3, I + jle .L60 + ALIGN_4 + +.L50: + FLD (X) + addl INCX, X + fabs + FLD (X) + addl INCX, X + fabs + FLD (X) + addl INCX, X + fabs + FLD (X) + addl INCX, X + fabs + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD (X) + addl INCX, X + fabs + FLD (X) + addl INCX, X + fabs + FLD (X) + addl INCX, X + fabs + FLD (X) + addl INCX, X + fabs + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decl I + jg .L50 + ALIGN_4 + +.L60: + movl M, I + andl $7, I + jle .L998 + ALIGN_4 + + +.L61: + FLD (X) + addl INCX, X + fabs + faddp %st,%st(1) + decl I + jg .L61 + ALIGN_4 + +.L998: + faddp %st,%st(2) + faddp %st,%st(1) + faddp %st,%st(1) + ALIGN_4 + +.L999: + popl %ebx + popl %esi + ret + + EPILOGUE diff --git a/kernel/x86/asum_sse.S b/kernel/x86/asum_sse.S new file mode 100644 index 0000000000..4506f299c4 --- /dev/null +++ b/kernel/x86/asum_sse.S @@ -0,0 +1,366 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 8 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) + +#define I %eax +#define M %ecx +#define X %esi +#define INCX %ebx + +#include "l1param.h" + + PROLOGUE + PROFCODE + + pushl %esi + pushl %ebx + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + + xorps %xmm0, %xmm0 + testl M, M + jle .L999 + testl INCX, INCX + jle .L999 + + xorps %xmm1, %xmm1 + +#ifdef HAVE_SSE2 + pcmpeqb %xmm3, %xmm3 + psrld $1, %xmm3 +#else + movl $0x7fffffff, STACK_M + movss STACK_M, %xmm3 + shufps $0, %xmm3, %xmm3 +#endif + + leal (, INCX, SIZE), INCX + + cmpl $SIZE, INCX + jne .L100 + + subl $-32 * SIZE, X + + cmpl $3, M + jle .L18 + + testl $4, X + je .L05 + movss -32 * SIZE(X), %xmm0 + andps %xmm3, %xmm0 + addl $SIZE, X + decl M + jle .L998 + ALIGN_3 + +.L05: + testl $8, X + je .L10 + + movsd -32 * SIZE(X), %xmm1 + andps %xmm3, %xmm1 + addl $2 * SIZE, X + subl $2, M + jle .L998 + ALIGN_3 + +.L10: + movl M, I + sarl $5, I + jle .L14 + + movaps -32 * SIZE(X), %xmm4 + movaps -28 * SIZE(X), %xmm5 + movaps -24 * SIZE(X), %xmm6 + movaps -20 * SIZE(X), %xmm7 + + decl I + jle .L12 + ALIGN_3 + +.L11: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + andps %xmm3, %xmm4 + addps %xmm4, %xmm0 + movaps -16 * SIZE(X), %xmm4 + + andps %xmm3, %xmm5 + addps %xmm5, %xmm1 + movaps -12 * SIZE(X), %xmm5 + + andps %xmm3, %xmm6 + addps %xmm6, %xmm0 + movaps -8 * SIZE(X), %xmm6 + + andps %xmm3, %xmm7 + addps %xmm7, %xmm1 + movaps -4 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + andps %xmm3, %xmm4 + addps %xmm4, %xmm0 + movaps 0 * SIZE(X), %xmm4 + + andps %xmm3, %xmm5 + addps %xmm5, %xmm1 + movaps 4 * SIZE(X), %xmm5 + + andps %xmm3, %xmm6 + addps %xmm6, %xmm0 + movaps 8 * SIZE(X), %xmm6 + + andps %xmm3, %xmm7 + addps %xmm7, %xmm1 + movaps 12 * SIZE(X), %xmm7 + + subl $-32 * SIZE, X + decl I + jg .L11 + ALIGN_3 + +.L12: + andps %xmm3, %xmm4 + addps %xmm4, %xmm0 + movaps -16 * SIZE(X), %xmm4 + + andps %xmm3, %xmm5 + addps %xmm5, %xmm1 + movaps -12 * SIZE(X), %xmm5 + + andps %xmm3, %xmm6 + addps %xmm6, %xmm0 + movaps -8 * SIZE(X), %xmm6 + + andps %xmm3, %xmm7 + addps %xmm7, %xmm1 + movaps -4 * 
SIZE(X), %xmm7 + + andps %xmm3, %xmm4 + addps %xmm4, %xmm0 + + andps %xmm3, %xmm5 + addps %xmm5, %xmm1 + + andps %xmm3, %xmm6 + addps %xmm6, %xmm0 + + andps %xmm3, %xmm7 + addps %xmm7, %xmm1 + + subl $-32 * SIZE, X + ALIGN_3 + +.L14: + testl $16, M + je .L16 + + movaps -32 * SIZE(X), %xmm4 + andps %xmm3, %xmm4 + addps %xmm4, %xmm0 + + movaps -28 * SIZE(X), %xmm5 + andps %xmm3, %xmm5 + addps %xmm5, %xmm1 + + movaps -24 * SIZE(X), %xmm6 + andps %xmm3, %xmm6 + addps %xmm6, %xmm0 + + movaps -20 * SIZE(X), %xmm7 + andps %xmm3, %xmm7 + addps %xmm7, %xmm1 + + addl $16 * SIZE, X + ALIGN_3 + +.L16: + testl $8, M + je .L17 + + movaps -32 * SIZE(X), %xmm4 + andps %xmm3, %xmm4 + addps %xmm4, %xmm0 + + movaps -28 * SIZE(X), %xmm5 + andps %xmm3, %xmm5 + addps %xmm5, %xmm1 + + addl $8 * SIZE, X + ALIGN_3 + +.L17: + testl $4, M + je .L18 + + movaps -32 * SIZE(X), %xmm4 + andps %xmm3, %xmm4 + addps %xmm4, %xmm0 + addl $4 * SIZE, X + ALIGN_3 + +.L18: + testl $2, M + je .L19 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X), %xmm4 + andps %xmm3, %xmm4 + addps %xmm4, %xmm1 + addl $2 * SIZE, X + ALIGN_3 + +.L19: + testl $1, M + je .L998 + + movss -32 * SIZE(X), %xmm4 + andps %xmm3, %xmm4 + addps %xmm4, %xmm0 + jmp .L998 + ALIGN_4 + +.L100: + movl M, I + sarl $3, I + jle .L105 + ALIGN_4 + +.L101: + movss (X), %xmm4 + addl INCX, X + andps %xmm3, %xmm4 + addss %xmm4, %xmm0 + + movss (X), %xmm5 + addl INCX, X + andps %xmm3, %xmm5 + addss %xmm5, %xmm1 + + movss (X), %xmm6 + addl INCX, X + andps %xmm3, %xmm6 + addss %xmm6, %xmm0 + + movss (X), %xmm7 + addl INCX, X + andps %xmm3, %xmm7 + addss %xmm7, %xmm1 + + movss (X), %xmm4 + addl INCX, X + andps %xmm3, %xmm4 + addss %xmm4, %xmm0 + + movss (X), %xmm5 + addl INCX, X + andps %xmm3, %xmm5 + addss %xmm5, %xmm1 + + movss (X), %xmm6 + addl INCX, X + andps %xmm3, %xmm6 + addss %xmm6, %xmm0 + + movss (X), %xmm7 + addl INCX, X + andps %xmm3, %xmm7 + addss %xmm7, %xmm1 + + decl I + jg .L101 + ALIGN_4 + +.L105: + andl $7, M + jle .L998 + ALIGN_4 + +.L106: + movss (X), %xmm4 + andps %xmm3, %xmm4 + addss %xmm4, %xmm0 + addl INCX, X + decl M + jg .L106 + ALIGN_4 + +.L998: + addps %xmm1, %xmm0 + +#ifndef HAVE_SSE3 + movhlps %xmm0, %xmm1 + addps %xmm1, %xmm0 + + movaps %xmm0, %xmm1 + shufps $1, %xmm0, %xmm0 + addss %xmm1, %xmm0 +#else + haddps %xmm0, %xmm0 + haddps %xmm0, %xmm0 +#endif + ALIGN_4 + +.L999: + movss %xmm0, STACK_M + flds STACK_M + + popl %ebx + popl %esi + + ret + + EPILOGUE diff --git a/kernel/x86/asum_sse2.S b/kernel/x86/asum_sse2.S new file mode 100644 index 0000000000..cea3503696 --- /dev/null +++ b/kernel/x86/asum_sse2.S @@ -0,0 +1,318 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 8 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) + +#define I %eax +#define M %ecx +#define X %esi +#define INCX %ebx + +#include "l1param.h" + + PROLOGUE + PROFCODE + + pushl %esi + pushl %ebx + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + + testl M, M + jle .L999 + testl INCX, INCX + jle .L999 + + pcmpeqb %xmm3, %xmm3 + psrlq $1, %xmm3 + + sall $BASE_SHIFT, INCX + + subl $-16 * SIZE, X + + cmpl $SIZE, INCX + jne .L40 + + testl $SIZE, X + je .L05 + + movsd -16 * SIZE(X), %xmm0 + addl $SIZE, X + + andps %xmm3, %xmm0 + subl $1, M + jle .L999 + ALIGN_3 + +.L05: + movl M, I + sarl $4, I + jle .L20 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + movaps -12 * SIZE(X), %xmm6 + movaps -10 * SIZE(X), %xmm7 + + decl I + jle .L11 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + andps %xmm3, %xmm4 + addpd %xmm4, %xmm0 + movaps -8 * SIZE(X), %xmm4 + + andps %xmm3, %xmm5 + addpd %xmm5, %xmm1 + movaps -6 * SIZE(X), %xmm5 + + andps %xmm3, %xmm6 + addpd %xmm6, %xmm0 + movaps -4 * SIZE(X), %xmm6 + + andps %xmm3, %xmm7 + addpd %xmm7, %xmm1 + movaps -2 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + andps %xmm3, %xmm4 + addpd %xmm4, %xmm0 + movaps 0 * SIZE(X), %xmm4 + + andps %xmm3, %xmm5 + addpd %xmm5, %xmm1 + movaps 2 * SIZE(X), %xmm5 + + andps %xmm3, %xmm6 + addpd %xmm6, %xmm0 + movaps 4 * SIZE(X), %xmm6 + + andps %xmm3, %xmm7 + addpd %xmm7, %xmm1 + movaps 6 * SIZE(X), %xmm7 + + subl $-16 * SIZE, X + decl I + jg .L10 + ALIGN_4 + +.L11: + andps %xmm3, %xmm4 + addpd %xmm4, %xmm0 + movaps -8 * SIZE(X), %xmm4 + + andps %xmm3, %xmm5 + addpd %xmm5, %xmm1 + movaps -6 * SIZE(X), %xmm5 + + andps %xmm3, %xmm6 + addpd %xmm6, %xmm0 + movaps -4 * SIZE(X), %xmm6 + + andps %xmm3, %xmm7 + addpd %xmm7, %xmm1 + movaps -2 * SIZE(X), %xmm7 + + andps %xmm3, %xmm4 + addpd %xmm4, %xmm0 + andps %xmm3, %xmm5 + addpd %xmm5, %xmm1 + andps %xmm3, %xmm6 + addpd %xmm6, %xmm0 + andps %xmm3, %xmm7 + addpd %xmm7, %xmm1 + + subl $-16 * SIZE, X + ALIGN_3 + +.L20: + andl $15, M + jle .L999 + + testl $8, M + je 
.L21 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + movaps -12 * SIZE(X), %xmm6 + movaps -10 * SIZE(X), %xmm7 + + andps %xmm3, %xmm4 + addpd %xmm4, %xmm0 + andps %xmm3, %xmm5 + addpd %xmm5, %xmm1 + andps %xmm3, %xmm6 + addpd %xmm6, %xmm0 + andps %xmm3, %xmm7 + addpd %xmm7, %xmm1 + addl $8 * SIZE, X + ALIGN_3 + +.L21: + testl $4, M + je .L22 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + + andps %xmm3, %xmm4 + addpd %xmm4, %xmm0 + andps %xmm3, %xmm5 + addpd %xmm5, %xmm1 + + addl $4 * SIZE, X + ALIGN_3 + +.L22: + testl $2, M + je .L23 + + movaps -16 * SIZE(X), %xmm4 + andps %xmm3, %xmm4 + addpd %xmm4, %xmm0 + addl $2 * SIZE, X + +.L23: + testl $1, M + je .L999 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -16 * SIZE(X), %xmm4 + andps %xmm3, %xmm4 + addsd %xmm4, %xmm1 + jmp .L999 + ALIGN_3 + +.L40: + movl M, I + sarl $3, I + jle .L60 + ALIGN_4 + +.L50: + movsd -16 * SIZE(X), %xmm4 + addl INCX, X + movhps -16 * SIZE(X), %xmm4 + addl INCX, X + andps %xmm3, %xmm4 + addpd %xmm4, %xmm0 + + movsd -16 * SIZE(X), %xmm5 + addl INCX, X + movhps -16 * SIZE(X), %xmm5 + addl INCX, X + andps %xmm3, %xmm5 + addpd %xmm5, %xmm1 + + movsd -16 * SIZE(X), %xmm6 + addl INCX, X + movhps -16 * SIZE(X), %xmm6 + addl INCX, X + andps %xmm3, %xmm6 + addpd %xmm6, %xmm0 + + movsd -16 * SIZE(X), %xmm7 + addl INCX, X + movhps -16 * SIZE(X), %xmm7 + addl INCX, X + andps %xmm3, %xmm7 + addpd %xmm7, %xmm1 + + decl I + jg .L50 + ALIGN_4 + +.L60: +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + andl $7, M + jle .L999 + ALIGN_4 + +.L61: + movsd -16 * SIZE(X), %xmm4 + andps %xmm3, %xmm4 + addsd %xmm4, %xmm0 + addl INCX, X + decl M + jg .L61 + ALIGN_4 + +.L999: + addpd %xmm1, %xmm0 + +#ifndef HAVE_SSE3 + movaps %xmm0, %xmm1 + unpckhpd %xmm0, %xmm0 + addsd %xmm1, %xmm0 +#else + haddpd %xmm0, %xmm0 +#endif + + movsd %xmm0, STACK_M + fldl STACK_M + popl %ebx + popl %esi + ret + + EPILOGUE + diff --git a/kernel/x86/axpy.S b/kernel/x86/axpy.S new file mode 100644 index 0000000000..7f3d99e444 --- /dev/null +++ b/kernel/x86/axpy.S @@ -0,0 +1,247 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_ALPHA 16 + STACK + ARGS(%esp) +#ifdef DOUBLE +#define STACK_X 24 + STACK + ARGS(%esp) +#define STACK_INCX 28 + STACK + ARGS(%esp) +#define STACK_Y 32 + STACK + ARGS(%esp) +#define STACK_INCY 36 + STACK + ARGS(%esp) +#else +#define STACK_X 20 + STACK + ARGS(%esp) +#define STACK_INCX 24 + STACK + ARGS(%esp) +#define STACK_Y 28 + STACK + ARGS(%esp) +#define STACK_INCY 32 + STACK + ARGS(%esp) +#endif + +#define M %ebx +#define X %esi +#define INCX %ecx +#define Y %edi +#define INCY %edx + + + PROLOGUE + + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + FLD STACK_ALPHA + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + + leal (, INCX, SIZE), INCX + leal (, INCY, SIZE), INCY + + testl M, M + jle .L40 + + cmpl $SIZE, INCX + jne .L14 + cmpl $SIZE, INCY + jne .L14 + + movl M, %eax + sarl $3, %eax + jle .L15 + ALIGN_3 + +#define PRESIZE 33 + +.L16: +#ifdef HAS_PREFETCH + prefetcht0 PRESIZE * SIZE(X) +#endif + + FLD 0 * SIZE(X) + fmul %st(1),%st + FADD 0 * SIZE(Y) + FST 0 * SIZE(Y) + + FLD 1 * SIZE(X) + fmul %st(1),%st + FADD 1 * SIZE(Y) + FST 1 * SIZE(Y) + + FLD 2 * SIZE(X) + fmul %st(1),%st + FADD 2 * SIZE(Y) + FST 2 * SIZE(Y) + + FLD 3 * SIZE(X) + fmul %st(1),%st + FADD 3 * SIZE(Y) + FST 3 * SIZE(Y) + +#ifdef HAS_PREFETCH + prefetcht0 (4 + PRESIZE) * SIZE(X) +#endif + + FLD 4 * SIZE(X) + fmul %st(1),%st + FADD 4 * SIZE(Y) + FST 4 * SIZE(Y) + + FLD 5 * SIZE(X) + fmul %st(1),%st + FADD 5 * SIZE(Y) + FST 5 * SIZE(Y) + + FLD 6 * SIZE(X) + fmul %st(1),%st + FADD 6 * SIZE(Y) + FST 6 * SIZE(Y) + + FLD 7 * SIZE(X) + fmul %st(1),%st + FADD 7 * SIZE(Y) + FST 7 * SIZE(Y) + +#ifdef HAVE_3DNOW + prefetchw 24 * SIZE(Y) +#endif + + addl $8 * SIZE, X + addl $8 * SIZE, Y + decl %eax + jg .L16 + ALIGN_3 + +.L15: + movl M, %eax + andl $7, %eax + jle .L40 + ALIGN_3 + +.L22: + FLD 0 * SIZE(X) + fmul %st(1),%st + FADD 0 * SIZE(Y) + FST 0 * SIZE(Y) + addl $SIZE, X + addl $SIZE, Y + decl %eax + jg .L22 + jmp .L40 + ALIGN_3 + +.L14: + movl M, %eax + sarl $2, %eax + jle .L28 + ALIGN_3 + +.L29: + FLD (X) + fmul %st(1),%st + FADD (Y) + FST (Y) + addl INCX, X + addl INCY, Y + + FLD (X) + fmul %st(1),%st + FADD (Y) + FST (Y) + addl INCX, X + addl INCY, Y + + FLD (X) + fmul %st(1),%st + FADD (Y) + FST (Y) + addl INCX, X + addl INCY, Y + + FLD (X) + fmul %st(1),%st + FADD (Y) + FST (Y) + addl INCX, X + addl INCY, Y + + decl %eax + jg .L29 + ALIGN_3 + 
+.L28: + movl M, %eax + andl $3, %eax + jle .L40 + ALIGN_3 + +.L35: + FLD (X) + fmul %st(1),%st + FADD (Y) + FST (Y) + addl INCX, X + addl INCY, Y + + decl %eax + jg .L35 + ALIGN_3 + +.L40: + ffreep %st(0) + xorl %eax,%eax + + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86/axpy_sse.S b/kernel/x86/axpy_sse.S new file mode 100644 index 0000000000..291a219ceb --- /dev/null +++ b/kernel/x86/axpy_sse.S @@ -0,0 +1,1551 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_ALPHA 16 + STACK + ARGS(%esp) +#define STACK_X 20 + STACK + ARGS(%esp) +#define STACK_INCX 24 + STACK + ARGS(%esp) +#define STACK_Y 28 + STACK + ARGS(%esp) +#define STACK_INCY 32 + STACK + ARGS(%esp) + +#define M %ebx +#define X %esi +#define Y %edi +#define INCX %ecx +#define INCY %edx +#define YY %ebp + +#define ALPHA %xmm7 + +#include "l1param.h" + + PROLOGUE + PROFCODE + + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp + + movl STACK_M, M + movss STACK_ALPHA, ALPHA + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + + shufps $0, ALPHA, ALPHA + + leal (, INCX, SIZE), INCX + leal (, INCY, SIZE), INCY + + testl M, M + jle .L19 + + cmpl $SIZE, INCX + jne .L50 + cmpl $SIZE, INCY + jne .L50 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + + cmpl $3, M + jle .L16 + + testl $SIZE, Y + je .L00 + + movss -32 * SIZE(X), %xmm0 + mulss ALPHA, %xmm0 + addss -32 * SIZE(Y), %xmm0 + movss %xmm0, -32 * SIZE(Y) + addl $1 * SIZE, X + addl $1 * SIZE, Y + decl M + jle .L19 + ALIGN_3 + +.L00: + testl $SIZE * 2, Y + je .L10 + + movsd -32 * SIZE(X), %xmm0 + movsd -32 * SIZE(Y), %xmm4 + mulps ALPHA, %xmm0 + addps %xmm4, %xmm0 + movsd %xmm0, -32 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + subl $2, M + jle .L19 + ALIGN_3 + +.L10: + testl $SIZE * 3, X + jne .L20 + + movl M, %eax + sarl $5, %eax + jle .L13 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + movaps -20 * SIZE(X), %xmm3 + + decl %eax + jle .L12 + ALIGN_4 + +.L11: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -16 * SIZE(X), %xmm0 + + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps -12 * SIZE(X), %xmm1 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps -8 * SIZE(X), %xmm2 + + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps -4 * SIZE(X), %xmm3 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + mulps ALPHA, %xmm0 + addps -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps 0 * SIZE(X), %xmm0 + + mulps ALPHA, %xmm1 + addps -12 * SIZE(Y), %xmm1 + movaps %xmm1, -12 * SIZE(Y) + movaps 4 * SIZE(X), %xmm1 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + mulps ALPHA, %xmm2 + addps -8 * SIZE(Y), %xmm2 + movaps %xmm2, -8 * SIZE(Y) + movaps 8 * SIZE(X), %xmm2 + + mulps ALPHA, %xmm3 + addps -4 * SIZE(Y), %xmm3 + movaps %xmm3, -4 * SIZE(Y) + movaps 12 * SIZE(X), %xmm3 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + decl %eax + jg .L11 + ALIGN_3 + +.L12: + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -16 * SIZE(X), %xmm0 + + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps -12 * SIZE(X), %xmm1 + + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps -8 * SIZE(X), %xmm2 + + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps -4 * SIZE(X), %xmm3 + + mulps ALPHA, %xmm0 + addps -16 * SIZE(Y), %xmm0 + movaps %xmm0, 
-16 * SIZE(Y) + + mulps ALPHA, %xmm1 + addps -12 * SIZE(Y), %xmm1 + movaps %xmm1, -12 * SIZE(Y) + + mulps ALPHA, %xmm2 + addps -8 * SIZE(Y), %xmm2 + movaps %xmm2, -8 * SIZE(Y) + + mulps ALPHA, %xmm3 + addps -4 * SIZE(Y), %xmm3 + movaps %xmm3, -4 * SIZE(Y) + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L13: + movl M, %eax + andl $16, %eax + jle .L14 + ALIGN_3 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + movaps -20 * SIZE(X), %xmm3 + + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, -24 * SIZE(Y) + movaps %xmm3, -20 * SIZE(Y) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L14: + movl M, %eax + andl $8, %eax + jle .L15 + ALIGN_3 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L15: + movl M, %eax + andl $4, %eax + jle .L16 + ALIGN_3 + + movaps -32 * SIZE(X), %xmm0 + + mulps ALPHA, %xmm0 + + addps -32 * SIZE(Y), %xmm0 + + movaps %xmm0, -32 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L16: + movl M, %eax + andl $2, %eax + jle .L17 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + movsd -32 * SIZE(Y), %xmm4 + + mulps ALPHA, %xmm0 + addps %xmm4, %xmm0 + + movsd %xmm0, -32 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L17: + movl M, %eax + andl $1, %eax + jle .L19 + ALIGN_3 + + movss -32 * SIZE(X), %xmm0 + mulss ALPHA, %xmm0 + addss -32 * SIZE(Y), %xmm0 + + movss %xmm0, -32 * SIZE(Y) + ALIGN_3 + +.L19: + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + ALIGN_3 + +.L20: + +#ifdef ALIGNED_ACCESS + + testl $SIZE, X + jne .L30 + + movhps -32 * SIZE(X), %xmm0 + + movl M, %eax + sarl $5, %eax + jle .L23 + + movaps -30 * SIZE(X), %xmm1 + movaps -26 * SIZE(X), %xmm2 + movaps -22 * SIZE(X), %xmm3 + + decl %eax + jle .L22 + ALIGN_4 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + SHUFPD_1 %xmm1, %xmm0 + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -18 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm2, %xmm1 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps -14 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + SHUFPD_1 %xmm3, %xmm2 + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps -10 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm0, %xmm3 + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps -6 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + SHUFPD_1 %xmm1, %xmm0 + mulps ALPHA, %xmm0 + addps -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps -2 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm2, %xmm1 + mulps ALPHA, %xmm1 + addps -12 * SIZE(Y), %xmm1 + movaps %xmm1, -12 * SIZE(Y) + movaps 2 * SIZE(X), %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + SHUFPD_1 %xmm3, %xmm2 + mulps ALPHA, %xmm2 + addps -8 * SIZE(Y), %xmm2 + movaps %xmm2, -8 * SIZE(Y) + movaps 6 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm0, %xmm3 + mulps 
ALPHA, %xmm3 + addps -4 * SIZE(Y), %xmm3 + movaps %xmm3, -4 * SIZE(Y) + movaps 10 * SIZE(X), %xmm3 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + decl %eax + jg .L21 + ALIGN_3 + +.L22: + SHUFPD_1 %xmm1, %xmm0 + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -18 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm2, %xmm1 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps -14 * SIZE(X), %xmm1 + + SHUFPD_1 %xmm3, %xmm2 + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps -10 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm0, %xmm3 + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps -6 * SIZE(X), %xmm3 + + SHUFPD_1 %xmm1, %xmm0 + mulps ALPHA, %xmm0 + addps -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps -2 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm2, %xmm1 + mulps ALPHA, %xmm1 + addps -12 * SIZE(Y), %xmm1 + movaps %xmm1, -12 * SIZE(Y) + + SHUFPD_1 %xmm3, %xmm2 + mulps ALPHA, %xmm2 + addps -8 * SIZE(Y), %xmm2 + movaps %xmm2, -8 * SIZE(Y) + + SHUFPD_1 %xmm0, %xmm3 + mulps ALPHA, %xmm3 + addps -4 * SIZE(Y), %xmm3 + movaps %xmm3, -4 * SIZE(Y) + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L23: + movl M, %eax + andl $16, %eax + jle .L24 + ALIGN_3 + + movaps -30 * SIZE(X), %xmm1 + movaps -26 * SIZE(X), %xmm2 + movaps -22 * SIZE(X), %xmm3 + movaps -18 * SIZE(X), %xmm4 + + SHUFPD_1 %xmm1, %xmm0 + SHUFPD_1 %xmm2, %xmm1 + SHUFPD_1 %xmm3, %xmm2 + SHUFPD_1 %xmm4, %xmm3 + + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, -24 * SIZE(Y) + movaps %xmm3, -20 * SIZE(Y) + + movaps %xmm4, %xmm0 + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L24: + movl M, %eax + andl $8, %eax + jle .L25 + ALIGN_3 + + movaps -30 * SIZE(X), %xmm1 + movaps -26 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm1, %xmm0 + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + SHUFPD_1 %xmm2, %xmm1 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, %xmm0 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L25: + movl M, %eax + andl $4, %eax + jle .L26 + ALIGN_3 + + movaps -30 * SIZE(X), %xmm1 + + SHUFPD_1 %xmm1, %xmm0 + mulps ALPHA, %xmm0 + + addps -32 * SIZE(Y), %xmm0 + + movaps %xmm0, -32 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L26: + movl M, %eax + andl $2, %eax + jle .L27 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + movsd -32 * SIZE(Y), %xmm4 + + mulps ALPHA, %xmm0 + addps %xmm4, %xmm0 + + movsd %xmm0, -32 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L27: + movl M, %eax + andl $1, %eax + jle .L29 + ALIGN_3 + + movss -32 * SIZE(X), %xmm0 + mulss ALPHA, %xmm0 + addss -32 * SIZE(Y), %xmm0 + + movss %xmm0, -32 * SIZE(Y) + addl $SIZE, Y + ALIGN_3 + +.L29: + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + ALIGN_3 + +.L30: + testl $2 * SIZE, X + jne .L40 + + movaps -33 * SIZE(X), %xmm0 + + movl M, %eax + sarl $5, %eax + jle .L33 + + movaps -29 * SIZE(X), %xmm1 + movaps -25 * SIZE(X), %xmm2 + movaps -21 * SIZE(X), %xmm3 + + decl %eax + jle .L32 + ALIGN_4 + +.L31: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + 
movaps -17 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps -13 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm3, %xmm2 + SHUFPS_39 %xmm2, %xmm2 + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps -9 * SIZE(X), %xmm2 + + movss %xmm0, %xmm3 + SHUFPS_39 %xmm3, %xmm3 + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps -5 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + mulps ALPHA, %xmm0 + addps -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps -1 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + mulps ALPHA, %xmm1 + addps -12 * SIZE(Y), %xmm1 + movaps %xmm1, -12 * SIZE(Y) + movaps 3 * SIZE(X), %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm3, %xmm2 + SHUFPS_39 %xmm2, %xmm2 + mulps ALPHA, %xmm2 + addps -8 * SIZE(Y), %xmm2 + movaps %xmm2, -8 * SIZE(Y) + movaps 7 * SIZE(X), %xmm2 + + movss %xmm0, %xmm3 + SHUFPS_39 %xmm3, %xmm3 + mulps ALPHA, %xmm3 + addps -4 * SIZE(Y), %xmm3 + movaps %xmm3, -4 * SIZE(Y) + movaps 11 * SIZE(X), %xmm3 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + decl %eax + jg .L31 + ALIGN_3 + +.L32: + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -17 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps -13 * SIZE(X), %xmm1 + + movss %xmm3, %xmm2 + SHUFPS_39 %xmm2, %xmm2 + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps -9 * SIZE(X), %xmm2 + + movss %xmm0, %xmm3 + SHUFPS_39 %xmm3, %xmm3 + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps -5 * SIZE(X), %xmm3 + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + mulps ALPHA, %xmm0 + addps -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps -1 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + mulps ALPHA, %xmm1 + addps -12 * SIZE(Y), %xmm1 + movaps %xmm1, -12 * SIZE(Y) + + movss %xmm3, %xmm2 + SHUFPS_39 %xmm2, %xmm2 + mulps ALPHA, %xmm2 + addps -8 * SIZE(Y), %xmm2 + movaps %xmm2, -8 * SIZE(Y) + + movss %xmm0, %xmm3 + SHUFPS_39 %xmm3, %xmm3 + mulps ALPHA, %xmm3 + addps -4 * SIZE(Y), %xmm3 + movaps %xmm3, -4 * SIZE(Y) + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L33: + movl M, %eax + andl $16, %eax + jle .L34 + ALIGN_3 + + movaps -29 * SIZE(X), %xmm1 + movaps -25 * SIZE(X), %xmm2 + movaps -21 * SIZE(X), %xmm3 + movaps -17 * SIZE(X), %xmm4 + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + + movss %xmm3, %xmm2 + SHUFPS_39 %xmm2, %xmm2 + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + + movss %xmm4, %xmm3 + SHUFPS_39 %xmm3, %xmm3 + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, -24 * SIZE(Y) + movaps %xmm3, -20 * SIZE(Y) + + movaps %xmm4, %xmm0 + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L34: + movl M, %eax + andl $8, %eax + jle .L35 + ALIGN_3 + + movaps 
-29 * SIZE(X), %xmm1 + movaps -25 * SIZE(X), %xmm2 + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, %xmm0 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L35: + movl M, %eax + andl $4, %eax + jle .L36 + ALIGN_3 + + movaps -29 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + mulps ALPHA, %xmm0 + + addps -32 * SIZE(Y), %xmm0 + + movaps %xmm0, -32 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L36: + movl M, %eax + andl $2, %eax + jle .L37 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + movsd -32 * SIZE(Y), %xmm4 + + mulps ALPHA, %xmm0 + addps %xmm4, %xmm0 + + movsd %xmm0, -32 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L37: + movl M, %eax + andl $1, %eax + jle .L39 + ALIGN_3 + + movss -32 * SIZE(X), %xmm0 + mulss ALPHA, %xmm0 + addss -32 * SIZE(Y), %xmm0 + + movss %xmm0, -32 * SIZE(Y) + addl $SIZE, Y + ALIGN_3 + +.L39: + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + ALIGN_3 + +.L40: + movaps -35 * SIZE(X), %xmm0 + + movl M, %eax + sarl $5, %eax + jle .L43 + + movaps -31 * SIZE(X), %xmm1 + movaps -27 * SIZE(X), %xmm2 + movaps -23 * SIZE(X), %xmm3 + + decl %eax + jle .L42 + ALIGN_4 + +.L41: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -19 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps -15 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps -11 * SIZE(X), %xmm2 + + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps -7 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + mulps ALPHA, %xmm0 + addps -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps -3 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + mulps ALPHA, %xmm1 + addps -12 * SIZE(Y), %xmm1 + movaps %xmm1, -12 * SIZE(Y) + movaps 1 * SIZE(X), %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + mulps ALPHA, %xmm2 + addps -8 * SIZE(Y), %xmm2 + movaps %xmm2, -8 * SIZE(Y) + movaps 5 * SIZE(X), %xmm2 + + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + mulps ALPHA, %xmm3 + addps -4 * SIZE(Y), %xmm3 + movaps %xmm3, -4 * SIZE(Y) + movaps 9 * SIZE(X), %xmm3 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + decl %eax + jg .L41 + ALIGN_3 + +.L42: + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -19 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps -15 * SIZE(X), %xmm1 + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps 
%xmm2, -24 * SIZE(Y) + movaps -11 * SIZE(X), %xmm2 + + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps -7 * SIZE(X), %xmm3 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + mulps ALPHA, %xmm0 + addps -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps -3 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + mulps ALPHA, %xmm1 + addps -12 * SIZE(Y), %xmm1 + movaps %xmm1, -12 * SIZE(Y) + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + mulps ALPHA, %xmm2 + addps -8 * SIZE(Y), %xmm2 + movaps %xmm2, -8 * SIZE(Y) + + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + mulps ALPHA, %xmm3 + addps -4 * SIZE(Y), %xmm3 + movaps %xmm3, -4 * SIZE(Y) + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L43: + movl M, %eax + andl $16, %eax + jle .L44 + ALIGN_3 + + movaps -31 * SIZE(X), %xmm1 + movaps -27 * SIZE(X), %xmm2 + movaps -23 * SIZE(X), %xmm3 + movaps -19 * SIZE(X), %xmm4 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, -24 * SIZE(Y) + movaps %xmm3, -20 * SIZE(Y) + + movaps %xmm4, %xmm0 + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L44: + movl M, %eax + andl $8, %eax + jle .L45 + ALIGN_3 + + movaps -31 * SIZE(X), %xmm1 + movaps -27 * SIZE(X), %xmm2 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, %xmm0 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L45: + movl M, %eax + andl $4, %eax + jle .L46 + ALIGN_3 + + movaps -31 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + mulps ALPHA, %xmm0 + + addps -32 * SIZE(Y), %xmm0 + + movaps %xmm0, -32 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L46: + movl M, %eax + andl $2, %eax + jle .L47 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + movsd -32 * SIZE(Y), %xmm4 + + mulps ALPHA, %xmm0 + addps %xmm4, %xmm0 + + movsd %xmm0, -32 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L47: + movl M, %eax + andl $1, %eax + jle .L49 + ALIGN_3 + + movss -32 * SIZE(X), %xmm0 + mulss ALPHA, %xmm0 + addss -32 * SIZE(Y), %xmm0 + + movss %xmm0, -32 * SIZE(Y) + addl $SIZE, Y + ALIGN_3 + +.L49: + popl %ebp + popl %ebx + popl %esi + popl %edi + ret +#else + + movl M, %eax + sarl $5, %eax + jle .L23 + + movsd -32 * SIZE(X), %xmm0 + movhps -30 * SIZE(X), %xmm0 + movsd -28 * SIZE(X), %xmm1 + movhps -26 * SIZE(X), %xmm1 + movsd -24 * SIZE(X), %xmm2 + movhps -22 * SIZE(X), %xmm2 + movsd -20 * SIZE(X), %xmm3 + movhps -18 * SIZE(X), %xmm3 + + decl %eax + jle .L22 + ALIGN_4 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movsd -16 * SIZE(X), %xmm0 + movhps -14 * SIZE(X), %xmm0 + + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movsd -12 * SIZE(X), %xmm1 + movhps -10 * SIZE(X), %xmm1 + 
+#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movsd -8 * SIZE(X), %xmm2 + movhps -6 * SIZE(X), %xmm2 + + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movsd -4 * SIZE(X), %xmm3 + movhps -2 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + mulps ALPHA, %xmm0 + addps -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + movsd 0 * SIZE(X), %xmm0 + movhps 2 * SIZE(X), %xmm0 + + mulps ALPHA, %xmm1 + addps -12 * SIZE(Y), %xmm1 + movaps %xmm1, -12 * SIZE(Y) + + movsd 4 * SIZE(X), %xmm1 + movhps 6 * SIZE(X), %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + mulps ALPHA, %xmm2 + addps -8 * SIZE(Y), %xmm2 + movaps %xmm2, -8 * SIZE(Y) + + movsd 8 * SIZE(X), %xmm2 + movhps 10 * SIZE(X), %xmm2 + + mulps ALPHA, %xmm3 + addps -4 * SIZE(Y), %xmm3 + movaps %xmm3, -4 * SIZE(Y) + + movsd 12 * SIZE(X), %xmm3 + movhps 14 * SIZE(X), %xmm3 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + decl %eax + jg .L21 + ALIGN_3 + +.L22: + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movsd -16 * SIZE(X), %xmm0 + movhps -14 * SIZE(X), %xmm0 + + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movsd -12 * SIZE(X), %xmm1 + movhps -10 * SIZE(X), %xmm1 + + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movsd -8 * SIZE(X), %xmm2 + movhps -6 * SIZE(X), %xmm2 + + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movsd -4 * SIZE(X), %xmm3 + movhps -2 * SIZE(X), %xmm3 + + mulps ALPHA, %xmm0 + addps -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + mulps ALPHA, %xmm1 + addps -12 * SIZE(Y), %xmm1 + movaps %xmm1, -12 * SIZE(Y) + + mulps ALPHA, %xmm2 + addps -8 * SIZE(Y), %xmm2 + movaps %xmm2, -8 * SIZE(Y) + + mulps ALPHA, %xmm3 + addps -4 * SIZE(Y), %xmm3 + movaps %xmm3, -4 * SIZE(Y) + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L23: + movl M, %eax + andl $16, %eax + jle .L24 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + movhps -30 * SIZE(X), %xmm0 + movsd -28 * SIZE(X), %xmm1 + movhps -26 * SIZE(X), %xmm1 + + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movsd -24 * SIZE(X), %xmm2 + movhps -22 * SIZE(X), %xmm2 + movsd -20 * SIZE(X), %xmm3 + movhps -18 * SIZE(X), %xmm3 + + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L24: + movl M, %eax + andl $8, %eax + jle .L25 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + movhps -30 * SIZE(X), %xmm0 + movsd -28 * SIZE(X), %xmm1 + movhps -26 * SIZE(X), %xmm1 + + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L25: + movl M, %eax + andl $4, %eax + jle .L26 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + movhps -30 * SIZE(X), %xmm0 + + mulps ALPHA, %xmm0 + + addps -32 * SIZE(Y), %xmm0 + + movaps %xmm0, -32 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L26: + movl M, %eax + andl $2, %eax + jle .L27 + ALIGN_3 + + movsd 
-32 * SIZE(X), %xmm0 + movsd -32 * SIZE(Y), %xmm4 + + mulps ALPHA, %xmm0 + addps %xmm4, %xmm0 + + movsd %xmm0, -32 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L27: + movl M, %eax + andl $1, %eax + jle .L29 + ALIGN_3 + + movss -32 * SIZE(X), %xmm0 + mulss ALPHA, %xmm0 + addss -32 * SIZE(Y), %xmm0 + + movss %xmm0, -32 * SIZE(Y) + addl $SIZE, Y + ALIGN_3 + +.L29: + popl %ebp + popl %ebx + popl %esi + popl %edi + ret +#endif + ALIGN_3 + + +.L50: + movl M, %eax + movl Y, YY + sarl $3, %eax + jle .L55 + ALIGN_3 + +.L51: + movss (X), %xmm0 + addl INCX, X + mulss ALPHA, %xmm0 + movss (YY), %xmm6 + addl INCY, YY + addss %xmm6, %xmm0 + + movss (X), %xmm1 + addl INCX, X + mulss ALPHA, %xmm1 + movss (YY), %xmm6 + addl INCY, YY + addss %xmm6, %xmm1 + + movss (X), %xmm2 + addl INCX, X + mulss ALPHA, %xmm2 + movss (YY), %xmm6 + addl INCY, YY + addss %xmm6, %xmm2 + + movss (X), %xmm3 + addl INCX, X + mulss ALPHA, %xmm3 + movss (YY), %xmm6 + addl INCY, YY + addss %xmm6, %xmm3 + + movss %xmm0, (Y) + addl INCY, Y + movss %xmm1, (Y) + addl INCY, Y + movss %xmm2, (Y) + addl INCY, Y + movss %xmm3, (Y) + addl INCY, Y + + movss (X), %xmm0 + addl INCX, X + mulss ALPHA, %xmm0 + movss (YY), %xmm6 + addl INCY, YY + addss %xmm6, %xmm0 + + movss (X), %xmm1 + addl INCX, X + mulss ALPHA, %xmm1 + movss (YY), %xmm6 + addl INCY, YY + addss %xmm6, %xmm1 + + movss (X), %xmm2 + addl INCX, X + mulss ALPHA, %xmm2 + movss (YY), %xmm6 + addl INCY, YY + addss %xmm6, %xmm2 + + movss (X), %xmm3 + addl INCX, X + mulss ALPHA, %xmm3 + movss (YY), %xmm6 + addl INCY, YY + addss %xmm6, %xmm3 + + movss %xmm0, (Y) + addl INCY, Y + movss %xmm1, (Y) + addl INCY, Y + movss %xmm2, (Y) + addl INCY, Y + movss %xmm3, (Y) + addl INCY, Y + + decl %eax + jg .L51 + ALIGN_3 + +.L55: + movl M, %eax + andl $7, %eax + jle .L59 + ALIGN_3 + +.L56: + movss (X), %xmm0 + addl INCX, X + mulss ALPHA, %xmm0 + movss (Y), %xmm6 + addss %xmm6, %xmm0 + movss %xmm0, (Y) + addl INCY, Y + decl %eax + jg .L56 + ALIGN_3 + +.L59: + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86/axpy_sse2.S b/kernel/x86/axpy_sse2.S new file mode 100644 index 0000000000..5e31d3dba9 --- /dev/null +++ b/kernel/x86/axpy_sse2.S @@ -0,0 +1,799 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_ALPHA 16 + STACK + ARGS(%esp) +#define STACK_X 24 + STACK + ARGS(%esp) +#define STACK_INCX 28 + STACK + ARGS(%esp) +#define STACK_Y 32 + STACK + ARGS(%esp) +#define STACK_INCY 36 + STACK + ARGS(%esp) + +#define M %ebx +#define X %esi +#define Y %edi +#define INCX %ecx +#define INCY %edx +#define YY %ebp + +#define ALPHA %xmm7 + +#include "l1param.h" + + PROLOGUE + PROFCODE + + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp + + movl STACK_M, M + movsd STACK_ALPHA, ALPHA + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + + unpcklpd ALPHA, ALPHA + + leal (, INCX, SIZE), INCX + leal (, INCY, SIZE), INCY + + testl M, M + jle .L47 + + cmpl $SIZE, INCX + jne .L40 + cmpl $SIZE, INCY + jne .L40 + + testl $SIZE, Y + je .L10 + + movsd (X), %xmm0 + mulsd ALPHA, %xmm0 + addsd (Y), %xmm0 + movsd %xmm0, (Y) + addl $1 * SIZE, X + addl $1 * SIZE, Y + decl M + jle .L19 + ALIGN_4 + +.L10: + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + + testl $SIZE, X + jne .L20 + + movl M, %eax + sarl $4, %eax + jle .L13 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + movaps -12 * SIZE(X), %xmm2 + movaps -10 * SIZE(X), %xmm3 + + decl %eax + jle .L12 + ALIGN_3 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + mulpd ALPHA, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps -8 * SIZE(X), %xmm0 + + mulpd ALPHA, %xmm1 + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + movaps -6 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + mulpd ALPHA, %xmm2 + addpd -12 * SIZE(Y), %xmm2 + movaps %xmm2, -12 * SIZE(Y) + movaps -4 * SIZE(X), %xmm2 + + mulpd ALPHA, %xmm3 + addpd -10 * SIZE(Y), %xmm3 + movaps %xmm3, -10 * SIZE(Y) + movaps -2 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + mulpd ALPHA, %xmm0 + addpd -8 * SIZE(Y), %xmm0 + movaps %xmm0, -8 * SIZE(Y) + movaps 0 * SIZE(X), %xmm0 + + mulpd ALPHA, %xmm1 + addpd -6 * SIZE(Y), %xmm1 + movaps %xmm1, -6 * SIZE(Y) + movaps 2 * SIZE(X), %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + mulpd ALPHA, %xmm2 + addpd -4 * SIZE(Y), %xmm2 + movaps %xmm2, -4 * SIZE(Y) + movaps 4 * SIZE(X), %xmm2 + + mulpd ALPHA, %xmm3 + addpd -2 * SIZE(Y), %xmm3 + movaps %xmm3, -2 * SIZE(Y) + movaps 6 * SIZE(X), %xmm3 + + subl $-16 * SIZE, Y + subl $-16 * SIZE, X + decl %eax + 
jg .L11 + ALIGN_3 + +.L12: + mulpd ALPHA, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps -8 * SIZE(X), %xmm0 + + mulpd ALPHA, %xmm1 + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + movaps -6 * SIZE(X), %xmm1 + + mulpd ALPHA, %xmm2 + addpd -12 * SIZE(Y), %xmm2 + movaps %xmm2, -12 * SIZE(Y) + movaps -4 * SIZE(X), %xmm2 + + mulpd ALPHA, %xmm3 + addpd -10 * SIZE(Y), %xmm3 + movaps %xmm3, -10 * SIZE(Y) + movaps -2 * SIZE(X), %xmm3 + + mulpd ALPHA, %xmm0 + addpd -8 * SIZE(Y), %xmm0 + movaps %xmm0, -8 * SIZE(Y) + + mulpd ALPHA, %xmm1 + addpd -6 * SIZE(Y), %xmm1 + movaps %xmm1, -6 * SIZE(Y) + + mulpd ALPHA, %xmm2 + addpd -4 * SIZE(Y), %xmm2 + movaps %xmm2, -4 * SIZE(Y) + + mulpd ALPHA, %xmm3 + addpd -2 * SIZE(Y), %xmm3 + movaps %xmm3, -2 * SIZE(Y) + + subl $-16 * SIZE, Y + subl $-16 * SIZE, X + ALIGN_3 + +.L13: + movl M, %eax + andl $8, %eax + jle .L14 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + movaps -12 * SIZE(X), %xmm2 + movaps -10 * SIZE(X), %xmm3 + + mulpd ALPHA, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + mulpd ALPHA, %xmm1 + addpd -14 * SIZE(Y), %xmm1 + mulpd ALPHA, %xmm2 + addpd -12 * SIZE(Y), %xmm2 + mulpd ALPHA, %xmm3 + addpd -10 * SIZE(Y), %xmm3 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -14 * SIZE(Y) + movaps %xmm2, -12 * SIZE(Y) + movaps %xmm3, -10 * SIZE(Y) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L14: + movl M, %eax + andl $4, %eax + jle .L15 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + + mulpd ALPHA, %xmm0 + mulpd ALPHA, %xmm1 + + addpd -16 * SIZE(Y), %xmm0 + addpd -14 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -14 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L15: + movl M, %eax + andl $2, %eax + jle .L16 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + mulpd ALPHA, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L16: + movl M, %eax + andl $1, %eax + jle .L19 + ALIGN_3 + + movsd -16 * SIZE(X), %xmm0 + mulsd ALPHA, %xmm0 + addsd -16 * SIZE(Y), %xmm0 + + movsd %xmm0, -16 * SIZE(Y) + ALIGN_3 + +.L19: + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + ALIGN_3 + +.L20: +#ifdef ALIGNED_ACCESS + + movhps -16 * SIZE(X), %xmm0 + + movl M, %eax + sarl $4, %eax + jle .L23 + + movaps -15 * SIZE(X), %xmm1 + movaps -13 * SIZE(X), %xmm2 + movaps -11 * SIZE(X), %xmm3 + + decl %eax + jle .L22 + ALIGN_4 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + SHUFPD_1 %xmm1, %xmm0 + mulpd ALPHA, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps -9 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm2, %xmm1 + mulpd ALPHA, %xmm1 + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + movaps -7 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + SHUFPD_1 %xmm3, %xmm2 + mulpd ALPHA, %xmm2 + addpd -12 * SIZE(Y), %xmm2 + movaps %xmm2, -12 * SIZE(Y) + movaps -5 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm0, %xmm3 + mulpd ALPHA, %xmm3 + addpd -10 * SIZE(Y), %xmm3 + movaps %xmm3, -10 * SIZE(Y) + movaps -3 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + SHUFPD_1 %xmm1, %xmm0 + mulpd ALPHA, %xmm0 + addpd -8 * SIZE(Y), %xmm0 + movaps %xmm0, -8 * SIZE(Y) + movaps -1 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm2, %xmm1 + mulpd ALPHA, %xmm1 + addpd -6 * SIZE(Y), %xmm1 + movaps %xmm1, -6 * SIZE(Y) + movaps 1 * SIZE(X), %xmm1 + +#if defined(PREFETCH) && 
!defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + SHUFPD_1 %xmm3, %xmm2 + mulpd ALPHA, %xmm2 + addpd -4 * SIZE(Y), %xmm2 + movaps %xmm2, -4 * SIZE(Y) + movaps 3 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm0, %xmm3 + mulpd ALPHA, %xmm3 + addpd -2 * SIZE(Y), %xmm3 + movaps %xmm3, -2 * SIZE(Y) + movaps 5 * SIZE(X), %xmm3 + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + decl %eax + jg .L21 + ALIGN_3 + +.L22: + SHUFPD_1 %xmm1, %xmm0 + mulpd ALPHA, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps -9 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm2, %xmm1 + mulpd ALPHA, %xmm1 + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + movaps -7 * SIZE(X), %xmm1 + + SHUFPD_1 %xmm3, %xmm2 + mulpd ALPHA, %xmm2 + addpd -12 * SIZE(Y), %xmm2 + movaps %xmm2, -12 * SIZE(Y) + movaps -5 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm0, %xmm3 + mulpd ALPHA, %xmm3 + addpd -10 * SIZE(Y), %xmm3 + movaps %xmm3, -10 * SIZE(Y) + movaps -3 * SIZE(X), %xmm3 + + SHUFPD_1 %xmm1, %xmm0 + mulpd ALPHA, %xmm0 + addpd -8 * SIZE(Y), %xmm0 + movaps %xmm0, -8 * SIZE(Y) + movaps -1 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm2, %xmm1 + mulpd ALPHA, %xmm1 + addpd -6 * SIZE(Y), %xmm1 + movaps %xmm1, -6 * SIZE(Y) + + SHUFPD_1 %xmm3, %xmm2 + mulpd ALPHA, %xmm2 + addpd -4 * SIZE(Y), %xmm2 + movaps %xmm2, -4 * SIZE(Y) + + SHUFPD_1 %xmm0, %xmm3 + mulpd ALPHA, %xmm3 + addpd -2 * SIZE(Y), %xmm3 + movaps %xmm3, -2 * SIZE(Y) + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + ALIGN_3 + +.L23: + movl M, %eax + andl $8, %eax + jle .L24 + ALIGN_3 + + movaps -15 * SIZE(X), %xmm1 + movaps -13 * SIZE(X), %xmm2 + movaps -11 * SIZE(X), %xmm3 + movaps -9 * SIZE(X), %xmm4 + + SHUFPD_1 %xmm1, %xmm0 + mulpd ALPHA, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + SHUFPD_1 %xmm2, %xmm1 + mulpd ALPHA, %xmm1 + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + SHUFPD_1 %xmm3, %xmm2 + mulpd ALPHA, %xmm2 + addpd -12 * SIZE(Y), %xmm2 + movaps %xmm2, -12 * SIZE(Y) + + SHUFPD_1 %xmm4, %xmm3 + mulpd ALPHA, %xmm3 + addpd -10 * SIZE(Y), %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + movaps %xmm4, %xmm0 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L24: + movl M, %eax + andl $4, %eax + jle .L25 + ALIGN_3 + + movaps -15 * SIZE(X), %xmm1 + movaps -13 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm1, %xmm0 + SHUFPD_1 %xmm2, %xmm1 + + mulpd ALPHA, %xmm0 + mulpd ALPHA, %xmm1 + + addpd -16 * SIZE(Y), %xmm0 + addpd -14 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -14 * SIZE(Y) + movaps %xmm2, %xmm0 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L25: + movl M, %eax + andl $2, %eax + jle .L26 + ALIGN_3 + + movaps -15 * SIZE(X), %xmm1 + SHUFPD_1 %xmm1, %xmm0 + mulpd ALPHA, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + + movaps %xmm0, -16 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L26: + movl M, %eax + andl $1, %eax + jle .L29 + ALIGN_3 + + movsd -16 * SIZE(X), %xmm0 + mulsd ALPHA, %xmm0 + addsd -16 * SIZE(Y), %xmm0 + + movsd %xmm0, -16 * SIZE(Y) + ALIGN_3 + +.L29: + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + ALIGN_3 + +#else + movl M, %eax + sarl $3, %eax + jle .L23 + + movsd -16 * SIZE(X), %xmm0 + movhps -15 * SIZE(X), %xmm0 + movsd -14 * SIZE(X), %xmm1 + movhps -13 * SIZE(X), %xmm1 + movsd -12 * SIZE(X), %xmm2 + movhps -11 * SIZE(X), %xmm2 + movsd -10 * SIZE(X), %xmm3 + movhps -9 * SIZE(X), %xmm3 + + decl %eax + jle .L22 + ALIGN_3 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + mulpd ALPHA, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + 
movsd -8 * SIZE(X), %xmm0 + movhps -7 * SIZE(X), %xmm0 + + mulpd ALPHA, %xmm1 + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + movsd -6 * SIZE(X), %xmm1 + movhps -5 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + mulpd ALPHA, %xmm2 + addpd -12 * SIZE(Y), %xmm2 + movaps %xmm2, -12 * SIZE(Y) + + movsd -4 * SIZE(X), %xmm2 + movhps -3 * SIZE(X), %xmm2 + + mulpd ALPHA, %xmm3 + addpd -10 * SIZE(Y), %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + movsd -2 * SIZE(X), %xmm3 + movhps -1 * SIZE(X), %xmm3 + + subl $-8 * SIZE, Y + subl $-8 * SIZE, X + decl %eax + jg .L21 + ALIGN_3 + +.L22: + mulpd ALPHA, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + mulpd ALPHA, %xmm1 + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + mulpd ALPHA, %xmm2 + addpd -12 * SIZE(Y), %xmm2 + movaps %xmm2, -12 * SIZE(Y) + + mulpd ALPHA, %xmm3 + addpd -10 * SIZE(Y), %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + subl $-8 * SIZE, Y + subl $-8 * SIZE, X + ALIGN_3 + +.L23: + movl M, %eax + andl $4, %eax + jle .L25 + ALIGN_3 + + movsd -16 * SIZE(X), %xmm0 + movhps -15 * SIZE(X), %xmm0 + movsd -14 * SIZE(X), %xmm1 + movhps -13 * SIZE(X), %xmm1 + + mulpd ALPHA, %xmm0 + mulpd ALPHA, %xmm1 + + addpd -16 * SIZE(Y), %xmm0 + addpd -14 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -14 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L25: + movl M, %eax + andl $2, %eax + jle .L26 + ALIGN_3 + + movsd -16 * SIZE(X), %xmm0 + movhps -15 * SIZE(X), %xmm0 + mulpd ALPHA, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L26: + movl M, %eax + andl $1, %eax + jle .L29 + ALIGN_3 + + movsd -16 * SIZE(X), %xmm0 + mulsd ALPHA, %xmm0 + addsd -16 * SIZE(Y), %xmm0 + + movsd %xmm0, -16 * SIZE(Y) + ALIGN_3 + +.L29: + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + ALIGN_3 +#endif + +.L40: + movl Y, YY + movl M, %eax + sarl $3, %eax + jle .L45 + ALIGN_3 + +.L41: + movsd 0 * SIZE(X), %xmm0 + addl INCX, X + movhpd 0 * SIZE(X), %xmm0 + addl INCX, X + mulpd ALPHA, %xmm0 + + movsd 0 * SIZE(YY), %xmm6 + addl INCY, YY + movhpd 0 * SIZE(YY), %xmm6 + addl INCY, YY + addpd %xmm6, %xmm0 + + movsd 0 * SIZE(X), %xmm1 + addl INCX, X + movhpd 0 * SIZE(X), %xmm1 + addl INCX, X + mulpd ALPHA, %xmm1 + + movsd 0 * SIZE(YY), %xmm6 + addl INCY, YY + movhpd 0 * SIZE(YY), %xmm6 + addl INCY, YY + addpd %xmm6, %xmm1 + + movsd 0 * SIZE(X), %xmm2 + addl INCX, X + movhpd 0 * SIZE(X), %xmm2 + addl INCX, X + mulpd ALPHA, %xmm2 + + movsd 0 * SIZE(YY), %xmm6 + addl INCY, YY + movhpd 0 * SIZE(YY), %xmm6 + addl INCY, YY + addpd %xmm6, %xmm2 + + movsd 0 * SIZE(X), %xmm3 + addl INCX, X + movhpd 0 * SIZE(X), %xmm3 + addl INCX, X + mulpd ALPHA, %xmm3 + + movsd 0 * SIZE(YY), %xmm6 + addl INCY, YY + movhpd 0 * SIZE(YY), %xmm6 + addl INCY, YY + addpd %xmm6, %xmm3 + + movsd %xmm0, 0 * SIZE(Y) + addl INCY, Y + movhpd %xmm0, 0 * SIZE(Y) + addl INCY, Y + movsd %xmm1, 0 * SIZE(Y) + addl INCY, Y + movhpd %xmm1, 0 * SIZE(Y) + addl INCY, Y + movsd %xmm2, 0 * SIZE(Y) + addl INCY, Y + movhpd %xmm2, 0 * SIZE(Y) + addl INCY, Y + movsd %xmm3, 0 * SIZE(Y) + addl INCY, Y + movhpd %xmm3, 0 * SIZE(Y) + addl INCY, Y + + decl %eax + jg .L41 + ALIGN_3 + +.L45: + movl M, %eax + andl $7, %eax + jle .L47 + ALIGN_3 + +.L46: + movsd (X), %xmm0 + addl INCX, X + mulsd ALPHA, %xmm0 + addsd (Y), %xmm0 + movsd %xmm0, (Y) + addl INCY, Y + decl %eax + jg .L46 + ALIGN_3 + +.L47: + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE 
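The axpy_sse.S and axpy_sse2.S kernels above (and the Opteron variant that follows) all implement the BLAS AXPY contract: y := alpha * x + y over M elements with strides INCX and INCY. Their unrolled unit-stride paths and the strided fallback paths (.L50 in the single-precision kernel, .L40/.L100 in the double-precision ones) reduce to the scalar sketch below. This is an illustrative reference only, not part of the imported sources; the function name axpy_ref and its ptrdiff_t parameters are hypothetical, and double is used for concreteness (the SSE kernel operates on float, the SSE2 kernels on double).

    #include <stddef.h>

    /* Scalar reference for the AXPY kernels in this patch:
       y := alpha * x + y over m elements, stepping the pointers by
       incx / incy exactly as the strided assembly paths do.
       Illustrative sketch only, not part of the imported sources. */
    static void axpy_ref(ptrdiff_t m, double alpha,
                         const double *x, ptrdiff_t incx,
                         double *y, ptrdiff_t incy)
    {
        /* The kernels return immediately when M <= 0 (testl M, M; jle ...),
           which this loop bound reproduces. */
        for (ptrdiff_t i = 0; i < m; i++)
            y[i * incy] += alpha * x[i * incx];
    }

The assembly variants differ only in how they vectorize the unit-stride case (aligned movaps loads, SHUFPD/SHUFPS realignment of misaligned inputs, and software prefetch of X and Y), not in the arithmetic performed per element.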
diff --git a/kernel/x86/axpy_sse2_opteron.S b/kernel/x86/axpy_sse2_opteron.S new file mode 100644 index 0000000000..fb22415ba7 --- /dev/null +++ b/kernel/x86/axpy_sse2_opteron.S @@ -0,0 +1,496 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#define STACK_X 24 + STACK + ARGS(%esp) +#define STACK_INCX 28 + STACK + ARGS(%esp) +#define STACK_Y 32 + STACK + ARGS(%esp) +#define STACK_INCY 36 + STACK + ARGS(%esp) + +#define M %ebx +#define X %esi +#define Y %edi +#define INCX %ecx +#define INCY %edx + +#define PREFETCHSIZE 64 + + PROLOGUE + + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp + + PROFCODE + + movlpd ALPHA, %xmm7 + unpcklpd %xmm7, %xmm7 + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + + leal (, INCX, SIZE), INCX + leal (, INCY, SIZE), INCY + + testl M, M + jle .L999 + + cmpl $SIZE, INCX + jne .L100 + cmpl $SIZE, INCY + jne .L100 + + testl $SIZE, Y + je .L00 + + movlpd 0 * SIZE(X), %xmm0 + mulsd %xmm7, %xmm0 + addsd 0 * SIZE(Y), %xmm0 + movlpd %xmm0, 0 * SIZE(Y) + addl $1 * SIZE, X + addl $1 * SIZE, Y + decl M + jle .L999 + ALIGN_3 + +.L00: + testl $SIZE, X + jne .L20 + + movl M, %eax + sarl $4, %eax + jle .L15 + ALIGN_3 + +.L11: + prefetch (PREFETCHSIZE + 0) * SIZE(X) + + movapd 0 * SIZE(X), %xmm0 + mulpd %xmm7, %xmm0 + addpd 0 * SIZE(Y), %xmm0 + movapd %xmm0, 0 * SIZE(Y) + + movapd 2 * SIZE(X), %xmm1 + mulpd %xmm7, %xmm1 + addpd 2 * SIZE(Y), %xmm1 + movapd %xmm1, 2 * SIZE(Y) + + prefetchw (PREFETCHSIZE + 0) * SIZE(Y) + + movapd 4 * SIZE(X), %xmm2 + mulpd %xmm7, %xmm2 + addpd 4 * SIZE(Y), %xmm2 + movapd %xmm2, 4 * SIZE(Y) + + movapd 6 * SIZE(X), %xmm3 + mulpd %xmm7, %xmm3 + addpd 6 * SIZE(Y), %xmm3 + movapd %xmm3, 6 * SIZE(Y) + + prefetch (PREFETCHSIZE + 8) * SIZE(X) + + movapd 8 * SIZE(X), %xmm0 + mulpd %xmm7, %xmm0 + addpd 8 * SIZE(Y), %xmm0 + movapd %xmm0, 8 * SIZE(Y) + + movapd 10 * SIZE(X), %xmm1 + mulpd %xmm7, %xmm1 + addpd 10 * SIZE(Y), %xmm1 + movapd %xmm1, 10 * SIZE(Y) + + prefetchw (PREFETCHSIZE + 8) * SIZE(Y) + + movapd 12 * SIZE(X), %xmm2 + mulpd %xmm7, %xmm2 + addpd 12 * SIZE(Y), %xmm2 + movapd %xmm2, 12 * SIZE(Y) + + movapd 14 * SIZE(X), %xmm3 + mulpd %xmm7, %xmm3 + addpd 14 * SIZE(Y), %xmm3 + movapd %xmm3, 14 * SIZE(Y) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + decl %eax + jg .L11 + ALIGN_3 + +.L15: + movl M, %eax + testl $8, %eax + jle .L16 + + movapd 0 * SIZE(X), %xmm0 + mulpd %xmm7, %xmm0 + addpd 0 * SIZE(Y), %xmm0 + movapd %xmm0, 0 * SIZE(Y) + + movapd 2 * SIZE(X), %xmm1 + mulpd %xmm7, %xmm1 + addpd 2 * SIZE(Y), %xmm1 + movapd %xmm1, 2 * SIZE(Y) + + movapd 4 * SIZE(X), %xmm2 + mulpd %xmm7, %xmm2 + addpd 4 * SIZE(Y), %xmm2 + movapd %xmm2, 4 * SIZE(Y) + + movapd 6 * SIZE(X), %xmm3 + mulpd %xmm7, %xmm3 + addpd 6 * SIZE(Y), %xmm3 + movapd %xmm3, 6 * SIZE(Y) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L16: + testl $4, %eax + jle .L17 + + movapd 0 * SIZE(X), %xmm0 + mulpd %xmm7, %xmm0 + addpd 0 * SIZE(Y), %xmm0 + movapd %xmm0, 0 * SIZE(Y) + + movapd 2 * SIZE(X), %xmm1 + mulpd %xmm7, %xmm1 + addpd 2 * SIZE(Y), %xmm1 + movapd %xmm1, 2 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L17: + testl $2, %eax + jle .L18 + + movapd 0 * SIZE(X), %xmm0 + mulpd %xmm7, %xmm0 + addpd 0 * SIZE(Y), %xmm0 + movapd %xmm0, 0 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L18: + testl $1, %eax + jle .L99 + + movlpd 0 * SIZE(X), %xmm0 + mulsd %xmm7, %xmm0 + addsd 0 * SIZE(Y), %xmm0 + movlpd %xmm0, 0 * SIZE(Y) + jmp .L99 + ALIGN_3 + +.L20: + movl M, %eax + sarl $4, %eax + jle .L25 + 
ALIGN_4 + +.L21: +#ifdef OPTERON + prefetcht0 (PREFETCHSIZE + 0) * SIZE(X) + prefetchw (PREFETCHSIZE + 0) * SIZE(Y) +#endif + + movlpd 0 * SIZE(X), %xmm0 + movhpd 1 * SIZE(X), %xmm0 + mulpd %xmm7, %xmm0 + addpd 0 * SIZE(Y), %xmm0 + movapd %xmm0, 0 * SIZE(Y) + + movlpd 2 * SIZE(X), %xmm1 + movhpd 3 * SIZE(X), %xmm1 + mulpd %xmm7, %xmm1 + addpd 2 * SIZE(Y), %xmm1 + movapd %xmm1, 2 * SIZE(Y) + + movlpd 4 * SIZE(X), %xmm2 + movhpd 5 * SIZE(X), %xmm2 + mulpd %xmm7, %xmm2 + addpd 4 * SIZE(Y), %xmm2 + movapd %xmm2, 4 * SIZE(Y) + + movlpd 6 * SIZE(X), %xmm3 + movhpd 7 * SIZE(X), %xmm3 + mulpd %xmm7, %xmm3 + addpd 6 * SIZE(Y), %xmm3 + movapd %xmm3, 6 * SIZE(Y) + +#ifdef OPTERON + prefetcht0 (PREFETCHSIZE + 8) * SIZE(X) + prefetchw (PREFETCHSIZE + 8) * SIZE(Y) +#endif + + movlpd 8 * SIZE(X), %xmm0 + movhpd 9 * SIZE(X), %xmm0 + mulpd %xmm7, %xmm0 + addpd 8 * SIZE(Y), %xmm0 + movapd %xmm0, 8 * SIZE(Y) + + movlpd 10 * SIZE(X), %xmm1 + movhpd 11 * SIZE(X), %xmm1 + mulpd %xmm7, %xmm1 + addpd 10 * SIZE(Y), %xmm1 + movapd %xmm1, 10 * SIZE(Y) + + movlpd 12 * SIZE(X), %xmm2 + movhpd 13 * SIZE(X), %xmm2 + mulpd %xmm7, %xmm2 + addpd 12 * SIZE(Y), %xmm2 + movapd %xmm2, 12 * SIZE(Y) + + movlpd 14 * SIZE(X), %xmm3 + movhpd 15 * SIZE(X), %xmm3 + mulpd %xmm7, %xmm3 + addpd 14 * SIZE(Y), %xmm3 + movapd %xmm3, 14 * SIZE(Y) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + decl %eax + jg .L21 + ALIGN_3 + +.L25: + movl M, %eax + testl $8, %eax + jle .L26 + + movlpd 0 * SIZE(X), %xmm0 + movhpd 1 * SIZE(X), %xmm0 + mulpd %xmm7, %xmm0 + addpd 0 * SIZE(Y), %xmm0 + movapd %xmm0, 0 * SIZE(Y) + + movlpd 2 * SIZE(X), %xmm1 + movhpd 3 * SIZE(X), %xmm1 + mulpd %xmm7, %xmm1 + addpd 2 * SIZE(Y), %xmm1 + movapd %xmm1, 2 * SIZE(Y) + + movlpd 4 * SIZE(X), %xmm2 + movhpd 5 * SIZE(X), %xmm2 + mulpd %xmm7, %xmm2 + addpd 4 * SIZE(Y), %xmm2 + movapd %xmm2, 4 * SIZE(Y) + + movlpd 6 * SIZE(X), %xmm3 + movhpd 7 * SIZE(X), %xmm3 + mulpd %xmm7, %xmm3 + addpd 6 * SIZE(Y), %xmm3 + movapd %xmm3, 6 * SIZE(Y) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L26: + testl $4, %eax + jle .L27 + + movlpd 0 * SIZE(X), %xmm0 + movhpd 1 * SIZE(X), %xmm0 + mulpd %xmm7, %xmm0 + addpd 0 * SIZE(Y), %xmm0 + movapd %xmm0, 0 * SIZE(Y) + + movlpd 2 * SIZE(X), %xmm1 + movhpd 3 * SIZE(X), %xmm1 + mulpd %xmm7, %xmm1 + addpd 2 * SIZE(Y), %xmm1 + movapd %xmm1, 2 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L27: + testl $2, %eax + jle .L28 + + movlpd 0 * SIZE(X), %xmm0 + movhpd 1 * SIZE(X), %xmm0 + mulpd %xmm7, %xmm0 + addpd 0 * SIZE(Y), %xmm0 + movapd %xmm0, 0 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L28: + testl $1, %eax + jle .L99 + + movlpd 0 * SIZE(X), %xmm0 + mulsd %xmm7, %xmm0 + addsd 0 * SIZE(Y), %xmm0 + movlpd %xmm0, 0 * SIZE(Y) + ALIGN_3 + +.L99: + xorl %eax,%eax + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + ALIGN_3 + +.L100: + movl M, %eax + movl Y, %ebp + sarl $3, %eax + jle .L114 + ALIGN_3 + +.L110: + movlpd 0 * SIZE(X), %xmm0 + addl INCX, X + movhpd 0 * SIZE(X), %xmm0 + addl INCX, X + mulpd %xmm7, %xmm0 + + movlpd 0 * SIZE(%ebp), %xmm6 + addl INCY, %ebp + movhpd 0 * SIZE(%ebp), %xmm6 + addl INCY, %ebp + addpd %xmm6, %xmm0 + + movlpd 0 * SIZE(X), %xmm1 + addl INCX, X + movhpd 0 * SIZE(X), %xmm1 + addl INCX, X + mulpd %xmm7, %xmm1 + + movlpd 0 * SIZE(%ebp), %xmm6 + addl INCY, %ebp + movhpd 0 * SIZE(%ebp), %xmm6 + addl INCY, %ebp + addpd %xmm6, %xmm1 + + movlpd 0 * SIZE(X), %xmm2 + addl INCX, X + movhpd 0 * SIZE(X), %xmm2 + addl INCX, X + mulpd %xmm7, %xmm2 + + movlpd 0 * SIZE(%ebp), %xmm6 + addl 
INCY, %ebp + movhpd 0 * SIZE(%ebp), %xmm6 + addl INCY, %ebp + addpd %xmm6, %xmm2 + + movlpd 0 * SIZE(X), %xmm3 + addl INCX, X + movhpd 0 * SIZE(X), %xmm3 + addl INCX, X + mulpd %xmm7, %xmm3 + + movlpd 0 * SIZE(%ebp), %xmm6 + addl INCY, %ebp + movhpd 0 * SIZE(%ebp), %xmm6 + addl INCY, %ebp + addpd %xmm6, %xmm3 + + movlpd %xmm0, 0 * SIZE(Y) + addl INCY, Y + movhpd %xmm0, 0 * SIZE(Y) + addl INCY, Y + movlpd %xmm1, 0 * SIZE(Y) + addl INCY, Y + movhpd %xmm1, 0 * SIZE(Y) + addl INCY, Y + movlpd %xmm2, 0 * SIZE(Y) + addl INCY, Y + movhpd %xmm2, 0 * SIZE(Y) + addl INCY, Y + movlpd %xmm3, 0 * SIZE(Y) + addl INCY, Y + movhpd %xmm3, 0 * SIZE(Y) + addl INCY, Y + + decl %eax + jg .L110 + ALIGN_3 + +.L114: + movl M, %eax + andl $7, %eax + jle .L999 + ALIGN_3 + +.L115: + movlpd (X), %xmm0 + addl INCX, X + mulsd %xmm7, %xmm0 + addsd (Y), %xmm0 + movlpd %xmm0, (Y) + addl INCY, Y + decl %eax + jg .L115 + ALIGN_3 + +.L999: + xorl %eax,%eax + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86/cabs.S b/kernel/x86/cabs.S new file mode 100644 index 0000000000..ba804202ea --- /dev/null +++ b/kernel/x86/cabs.S @@ -0,0 +1,57 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + + PROLOGUE + PROFCODE + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + movl 4(%esp), %eax + FLD 0 * SIZE(%eax) + fabs + FLD 1 * SIZE(%eax) + fabs + faddp %st, %st(1) + ret + + EPILOGUE diff --git a/kernel/x86/copy.S b/kernel/x86/copy.S new file mode 100644 index 0000000000..721d5c5d9a --- /dev/null +++ b/kernel/x86/copy.S @@ -0,0 +1,213 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define M 4 + STACK + ARGS(%esp) +#define X 8 + STACK + ARGS(%esp) +#define INCX 12 + STACK + ARGS(%esp) +#define Y 16 + STACK + ARGS(%esp) +#define INCY 20 + STACK + ARGS(%esp) + + PROLOGUE + + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + movl M, %ebx + movl X, %ecx + movl INCX, %esi + movl Y, %edx + movl INCY, %edi + + testl %ebx, %ebx # if m == 0 goto End + jle .L999 + +#if SIZE > 8 + sall $BASE_SHIFT, %esi + sall $BASE_SHIFT, %edi +#else + leal (, %esi, SIZE), %esi + leal (, %edi, SIZE), %edi +#endif + + cmpl $SIZE, %esi # if incx != 1 + jne .L100 + cmpl $SIZE, %edi # if incy != 1 + jne .L100 + + movl %ebx, %eax # i = m + sarl $3, %eax + jle .L20 + ALIGN_2 + +.L11: + FLD 7 * SIZE(%ecx) + FLD 6 * SIZE(%ecx) + FLD 5 * SIZE(%ecx) + FLD 4 * SIZE(%ecx) + FLD 3 * SIZE(%ecx) + FLD 2 * SIZE(%ecx) + FLD 1 * SIZE(%ecx) + FLD 0 * SIZE(%ecx) + + FST 0 * SIZE(%edx) + FST 1 * SIZE(%edx) + FST 2 * SIZE(%edx) + FST 3 * SIZE(%edx) + FST 4 * SIZE(%edx) + FST 5 * SIZE(%edx) + FST 6 * SIZE(%edx) + FST 7 * SIZE(%edx) + + addl $8 * SIZE, %ecx + addl $8 * SIZE, %edx + decl %eax + jg .L11 + ALIGN_2 + +.L20: + movl %ebx, %eax # i = m + andl $7, %eax + jle .L99 + ALIGN_2 + +.L21: + FLD (%ecx) + FST (%edx) + addl $SIZE, %ecx + addl $SIZE, %edx + decl %eax + jg .L21 + +.L99: + xorl %eax,%eax + popl %ebx + popl %esi + popl %edi + ret + ALIGN_3 + +.L100: + movl %ebx, %eax + sarl $3, %eax + jle .L120 + ALIGN_2 + +.L111: + FLD (%ecx) + addl %esi, %ecx + FLD (%ecx) + addl %esi, %ecx + FLD (%ecx) + addl %esi, %ecx + FLD (%ecx) + addl %esi, %ecx + FLD (%ecx) + addl %esi, %ecx + FLD (%ecx) + addl %esi, %ecx + FLD (%ecx) + addl %esi, %ecx + FLD (%ecx) + addl %esi, %ecx + + fxch %st(7) + FST (%edx) + addl %edi, %edx + + fxch %st(5) + FST (%edx) + addl %edi, %edx + + fxch %st(3) + FST (%edx) + addl %edi, %edx + + fxch %st(1) + FST (%edx) + addl %edi, %edx + + FST (%edx) + addl %edi, %edx + + FST (%edx) + addl %edi, %edx + + FST (%edx) + addl %edi, %edx + + FST (%edx) + addl %edi, %edx + + decl %eax + jg .L111 + +.L120: + movl %ebx, %eax + andl $7, %eax + jle .L999 + ALIGN_2 + +.L121: + FLD (%ecx) + FST (%edx) + addl %esi, %ecx + addl %edi, %edx + decl %eax + jg .L121 + +.L999: + xorl %eax,%eax + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86/copy_sse.S b/kernel/x86/copy_sse.S new file mode 100644 index 0000000000..34902dcac3 --- /dev/null +++ b/kernel/x86/copy_sse.S @@ -0,0 +1,962 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) +#define STACK_Y 16 + STACK + ARGS(%esp) +#define STACK_INCY 20 + STACK + ARGS(%esp) + +#define M %ebx +#define X %esi +#define INCX %ecx +#define Y %edi +#define INCY %edx + +#include "l1param.h" + +#ifdef OPTERON +#define LOAD(OFFSET, ADDR, REG) xorps REG, REG; addpd OFFSET(ADDR), REG +#else +#define LOAD(OFFSET, ADDR, REG) movaps OFFSET(ADDR), REG +#endif + + PROLOGUE + PROFCODE + + pushl %edi + pushl %esi + pushl %ebx + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + + leal (, INCX, SIZE), INCX + leal (, INCY, SIZE), INCY + + cmpl $SIZE, INCX + jne .L50 + cmpl $SIZE, INCY + jne .L50 + + cmpl $3, M + jle .L55 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + + testl $SIZE, Y + je .L05 + + movss -32 * SIZE(X), %xmm0 + movss %xmm0, -32 * SIZE(Y) + addl $1 * SIZE, X + addl $1 * SIZE, Y + decl M + ALIGN_4 + +.L05: + testl $2 * SIZE, Y + je .L10 + + movsd -32 * SIZE(X), %xmm0 + movlps %xmm0, -32 * SIZE(Y) + addl $2 * SIZE, X + addl $2 * SIZE, Y + subl $2, M + jle .L19 + ALIGN_4 + +.L10: + testl $3 * SIZE, X + jne .L20 + + movl M, %eax + sarl $5, %eax + jle .L13 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + movaps -20 * SIZE(X), %xmm3 + movaps -16 * SIZE(X), %xmm4 + movaps -12 * SIZE(X), %xmm5 + movaps -8 * SIZE(X), %xmm6 + movaps -4 * SIZE(X), %xmm7 + + decl %eax + jle .L12 + ALIGN_3 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps %xmm0, -32 * SIZE(Y) + LOAD( 0 * SIZE, X, %xmm0) + movaps %xmm1, -28 * SIZE(Y) + LOAD( 4 * SIZE, X, %xmm1) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm2, -24 * SIZE(Y) + LOAD( 8 * SIZE, X, %xmm2) + movaps %xmm3, -20 * SIZE(Y) + LOAD(12 * SIZE, X, %xmm3) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps %xmm4,-16 * SIZE(Y) + LOAD(16 * SIZE, X, %xmm4) + movaps %xmm5,-12 * SIZE(Y) + LOAD(20 * SIZE, X, %xmm5) + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps %xmm6, -8 * 
SIZE(Y) + LOAD(24 * SIZE, X, %xmm6) + movaps %xmm7, -4 * SIZE(Y) + LOAD(28 * SIZE, X, %xmm7) + + subl $-32 * SIZE, Y + subl $-32 * SIZE, X + decl %eax + jg .L11 + ALIGN_3 + +.L12: + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, -24 * SIZE(Y) + movaps %xmm3, -20 * SIZE(Y) + movaps %xmm4, -16 * SIZE(Y) + movaps %xmm5, -12 * SIZE(Y) + movaps %xmm6, -8 * SIZE(Y) + movaps %xmm7, -4 * SIZE(Y) + + subl $-32 * SIZE, Y + subl $-32 * SIZE, X + ALIGN_3 + +.L13: + testl $16, M + jle .L14 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + movaps -20 * SIZE(X), %xmm3 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, -24 * SIZE(Y) + movaps %xmm3, -20 * SIZE(Y) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L14: + testl $8, M + jle .L15 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L15: + testl $4, M + jle .L16 + + movaps -32 * SIZE(X), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L16: + testl $2, M + jle .L17 + + movsd -32 * SIZE(X), %xmm0 + movlps %xmm0, -32 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L17: + testl $1, M + jle .L19 + + movss -32 * SIZE(X), %xmm0 + movss %xmm0, -32 * SIZE(Y) + ALIGN_3 + +.L19: + popl %ebx + popl %esi + popl %edi + ret + ALIGN_3 + +.L20: + testl $SIZE, X + jne .L30 + + movhps -32 * SIZE(X), %xmm0 + + movl M, %eax + sarl $5, %eax + jle .L23 + + movaps -30 * SIZE(X), %xmm1 + movaps -26 * SIZE(X), %xmm2 + movaps -22 * SIZE(X), %xmm3 + movaps -18 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + movaps -10 * SIZE(X), %xmm6 + movaps -6 * SIZE(X), %xmm7 + + decl %eax + jle .L22 + ALIGN_4 + +.L21: + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + shufps $0x4e, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -2 * SIZE(X), %xmm0 + + shufps $0x4e, %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps 2 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + shufps $0x4e, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps 6 * SIZE(X), %xmm2 + + shufps $0x4e, %xmm4, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps 10 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + shufps $0x4e, %xmm5, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + movaps 14 * SIZE(X), %xmm4 + + shufps $0x4e, %xmm6, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + movaps 18 * SIZE(X), %xmm5 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + shufps $0x4e, %xmm7, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + movaps 22 * SIZE(X), %xmm6 + + shufps $0x4e, %xmm0, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + movaps 26 * SIZE(X), %xmm7 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + decl %eax + jg .L21 + ALIGN_3 + +.L22: + shufps $0x4e, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -2 * SIZE(X), %xmm0 + + shufps $0x4e, %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + shufps $0x4e, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + shufps $0x4e, %xmm4, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + shufps $0x4e, %xmm5, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + shufps $0x4e, %xmm6, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + shufps $0x4e, %xmm7, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + shufps $0x4e, %xmm0, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subl $-32 * SIZE, X + subl $-32 * SIZE, 
Y + ALIGN_3 + +.L23: + testl $16, M + jle .L24 + ALIGN_3 + + movaps -30 * SIZE(X), %xmm1 + movaps -26 * SIZE(X), %xmm2 + movaps -22 * SIZE(X), %xmm3 + movaps -18 * SIZE(X), %xmm4 + + shufps $0x4e, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + shufps $0x4e, %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + shufps $0x4e, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + shufps $0x4e, %xmm4, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movaps %xmm4, %xmm0 + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L24: + testl $8, M + jle .L25 + ALIGN_3 + + movaps -30 * SIZE(X), %xmm1 + movaps -26 * SIZE(X), %xmm2 + + shufps $0x4e, %xmm1, %xmm0 + shufps $0x4e, %xmm2, %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, %xmm0 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L25: + testl $4, M + jle .L26 + ALIGN_3 + + movaps -30 * SIZE(X), %xmm1 + shufps $0x4e, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L26: + testl $2, M + jle .L27 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + + movsd %xmm0, -32 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L27: + testl $1, M + jle .L29 + ALIGN_3 + + movss -32 * SIZE(X), %xmm0 + movss %xmm0, -32 * SIZE(Y) + addl $SIZE, Y + ALIGN_3 + +.L29: + popl %ebx + popl %esi + popl %edi + ret + ALIGN_3 + +.L30: + testl $2 * SIZE, X + jne .L40 + + movaps -33 * SIZE(X), %xmm0 + + movl M, %eax + sarl $5, %eax + jle .L33 + + movaps -29 * SIZE(X), %xmm1 + movaps -25 * SIZE(X), %xmm2 + movaps -21 * SIZE(X), %xmm3 + movaps -17 * SIZE(X), %xmm4 + movaps -13 * SIZE(X), %xmm5 + movaps -9 * SIZE(X), %xmm6 + movaps -5 * SIZE(X), %xmm7 + + decl %eax + jle .L32 + ALIGN_4 + +.L31: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm1, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -1 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps 3 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm3, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps 7 * SIZE(X), %xmm2 + + movss %xmm4, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps 11 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm5, %xmm4 + shufps $0x39, %xmm4, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + movaps 15 * SIZE(X), %xmm4 + + movss %xmm6, %xmm5 + shufps $0x39, %xmm5, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + movaps 19 * SIZE(X), %xmm5 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm7, %xmm6 + shufps $0x39, %xmm6, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + movaps 23 * SIZE(X), %xmm6 + + movss %xmm0, %xmm7 + shufps $0x39, %xmm7, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + movaps 27 * SIZE(X), %xmm7 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + decl %eax + jg .L31 + ALIGN_3 + +.L32: + movss %xmm1, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -1 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movss %xmm3, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm4, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movss %xmm5, %xmm4 + shufps $0x39, %xmm4, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + movss %xmm6, %xmm5 + shufps $0x39, %xmm5, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + movss 
%xmm7, %xmm6 + shufps $0x39, %xmm6, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + movss %xmm0, %xmm7 + shufps $0x39, %xmm7, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L33: + testl $16, M + jle .L34 + ALIGN_3 + + movaps -29 * SIZE(X), %xmm1 + movaps -25 * SIZE(X), %xmm2 + movaps -21 * SIZE(X), %xmm3 + movaps -17 * SIZE(X), %xmm4 + + movss %xmm1, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movss %xmm3, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm4, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movaps %xmm4, %xmm0 + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L34: + testl $8, M + jle .L35 + ALIGN_3 + + movaps -29 * SIZE(X), %xmm1 + movaps -25 * SIZE(X), %xmm2 + + movss %xmm1, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, %xmm0 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L35: + testl $4, M + jle .L36 + ALIGN_3 + + movaps -29 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + shufps $0x39, %xmm0, %xmm0 + + movaps %xmm0, -32 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L36: + testl $2, M + jle .L37 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + movsd %xmm0, -32 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L37: + testl $1, M + jle .L39 + ALIGN_3 + + movss -32 * SIZE(X), %xmm0 + movss %xmm0, -32 * SIZE(Y) + addl $SIZE, Y + ALIGN_3 + +.L39: + popl %ebx + popl %esi + popl %edi + ret + ALIGN_3 + +.L40: + movaps -35 * SIZE(X), %xmm0 + + movl M, %eax + sarl $5, %eax + jle .L43 + + movaps -31 * SIZE(X), %xmm1 + movaps -27 * SIZE(X), %xmm2 + movaps -23 * SIZE(X), %xmm3 + movaps -19 * SIZE(X), %xmm4 + movaps -15 * SIZE(X), %xmm5 + movaps -11 * SIZE(X), %xmm6 + movaps -7 * SIZE(X), %xmm7 + + decl %eax + jle .L42 + ALIGN_4 + +.L41: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -3 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps 1 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps 5 * SIZE(X), %xmm2 + + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps 9 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + movaps 13 * SIZE(X), %xmm4 + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + movaps 17 * SIZE(X), %xmm5 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + movaps 21 * SIZE(X), %xmm6 + + movss %xmm0, %xmm7 + shufps $0x93, %xmm0, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + movaps 25 * SIZE(X), %xmm7 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + decl %eax + jg .L41 + ALIGN_3 + +.L42: + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -3 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movss %xmm3, %xmm2 
+ shufps $0x93, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + movss %xmm0, %xmm7 + shufps $0x93, %xmm0, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L43: + testl $16, M + jle .L44 + ALIGN_3 + + movaps -31 * SIZE(X), %xmm1 + movaps -27 * SIZE(X), %xmm2 + movaps -23 * SIZE(X), %xmm3 + movaps -19 * SIZE(X), %xmm4 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movaps %xmm4, %xmm0 + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L44: + testl $8, M + jle .L45 + ALIGN_3 + + movaps -31 * SIZE(X), %xmm1 + movaps -27 * SIZE(X), %xmm2 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps %xmm2, %xmm0 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L45: + testl $4, M + jle .L46 + ALIGN_3 + + movaps -31 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + + movaps %xmm0, -32 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L46: + testl $2, M + jle .L47 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + movsd %xmm0, -32 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L47: + testl $1, M + jle .L49 + ALIGN_3 + + movss -32 * SIZE(X), %xmm0 + movss %xmm0, -32 * SIZE(Y) + addl $SIZE, Y + ALIGN_3 + +.L49: + popl %ebx + popl %esi + popl %edi + ret + ALIGN_4 + +.L50: + movl M, %eax + sarl $3, %eax + jle .L55 + ALIGN_3 + +.L51: + movss (X), %xmm0 + addl INCX, X + movss (X), %xmm1 + addl INCX, X + movss (X), %xmm2 + addl INCX, X + movss (X), %xmm3 + addl INCX, X + movss (X), %xmm4 + addl INCX, X + movss (X), %xmm5 + addl INCX, X + movss (X), %xmm6 + addl INCX, X + movss (X), %xmm7 + addl INCX, X + + movss %xmm0, (Y) + addl INCY, Y + movss %xmm1, (Y) + addl INCY, Y + movss %xmm2, (Y) + addl INCY, Y + movss %xmm3, (Y) + addl INCY, Y + movss %xmm4, (Y) + addl INCY, Y + movss %xmm5, (Y) + addl INCY, Y + movss %xmm6, (Y) + addl INCY, Y + movss %xmm7, (Y) + addl INCY, Y + + decl %eax + jg .L51 + ALIGN_3 + +.L55: + movl M, %eax + andl $7, %eax + jle .L57 + ALIGN_3 + +.L56: + movss (X), %xmm0 + addl INCX, X + movss %xmm0, (Y) + addl INCY, Y + decl %eax + jg .L56 + ALIGN_3 + +.L57: + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86/copy_sse2.S b/kernel/x86/copy_sse2.S new file mode 100644 index 0000000000..11524aa1f4 --- /dev/null +++ b/kernel/x86/copy_sse2.S @@ -0,0 +1,655 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. 
*/ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) +#define STACK_Y 16 + STACK + ARGS(%esp) +#define STACK_INCY 20 + STACK + ARGS(%esp) + +#define M %ebx +#define X %esi +#define INCX %ecx +#define Y %edi +#define INCY %edx + +#include "l1param.h" + +#ifdef OPTERON +#define LOAD(OFFSET, ADDR, REG) xorps REG, REG; addpd OFFSET(ADDR), REG +#else +#define LOAD(OFFSET, ADDR, REG) movaps OFFSET(ADDR), REG +#endif + + PROLOGUE + PROFCODE + + pushl %edi + pushl %esi + pushl %ebx + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + + leal (, INCX, SIZE), INCX + leal (, INCY, SIZE), INCY + + cmpl $SIZE, INCX + jne .L40 + cmpl $SIZE, INCY + jne .L40 + +#ifdef ALIGNED_ACCESS + testl $SIZE, Y +#else + testl $SIZE, X +#endif + je .L10 + + movsd (X), %xmm0 + movsd %xmm0, (Y) + addl $1 * SIZE, X + addl $1 * SIZE, Y + decl M + jle .L19 + ALIGN_4 + +.L10: + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + +#ifdef ALIGNED_ACCESS + testl $SIZE, X +#else + testl $SIZE, Y +#endif + jne .L20 + + movl M, %eax + sarl $4, %eax + jle .L13 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + movaps -12 * SIZE(X), %xmm2 + movaps -10 * SIZE(X), %xmm3 + movaps -8 * SIZE(X), %xmm4 + movaps -6 * SIZE(X), %xmm5 + movaps -4 * SIZE(X), %xmm6 + movaps -2 * SIZE(X), %xmm7 + + decl %eax + jle .L12 + ALIGN_3 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps %xmm0, -16 * SIZE(Y) + LOAD( 0 * SIZE, X, %xmm0) + movaps %xmm1, -14 * SIZE(Y) + LOAD( 2 * SIZE, X, %xmm1) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm2, -12 * SIZE(Y) + LOAD( 4 * SIZE, X, %xmm2) + movaps %xmm3, -10 * SIZE(Y) + LOAD( 6 * SIZE, X, %xmm3) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps %xmm4, -8 * SIZE(Y) + LOAD( 8 * SIZE, X, %xmm4) + movaps %xmm5, -6 * SIZE(Y) + LOAD(10 * SIZE, X, 
%xmm5) + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps %xmm6, -4 * SIZE(Y) + LOAD(12 * SIZE, X, %xmm6) + movaps %xmm7, -2 * SIZE(Y) + LOAD(14 * SIZE, X, %xmm7) + + subl $-16 * SIZE, Y + subl $-16 * SIZE, X + decl %eax + jg .L11 + ALIGN_3 + +.L12: + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -14 * SIZE(Y) + movaps %xmm2, -12 * SIZE(Y) + movaps %xmm3, -10 * SIZE(Y) + movaps %xmm4, -8 * SIZE(Y) + movaps %xmm5, -6 * SIZE(Y) + movaps %xmm6, -4 * SIZE(Y) + movaps %xmm7, -2 * SIZE(Y) + + subl $-16 * SIZE, Y + subl $-16 * SIZE, X + ALIGN_3 + +.L13: + testl $8, M + jle .L14 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + movaps -12 * SIZE(X), %xmm2 + movaps -10 * SIZE(X), %xmm3 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -14 * SIZE(Y) + movaps %xmm2, -12 * SIZE(Y) + movaps %xmm3, -10 * SIZE(Y) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L14: + testl $4, M + jle .L15 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -14 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L15: + testl $2, M + jle .L16 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L16: + testl $1, M + jle .L19 + ALIGN_3 + + movsd -16 * SIZE(X), %xmm0 + movsd %xmm0, -16 * SIZE(Y) + ALIGN_3 + +.L19: + popl %ebx + popl %esi + popl %edi + ret + ALIGN_3 + +.L20: +#ifdef ALIGNED_ACCESS + + movhps -16 * SIZE(X), %xmm0 + + movl M, %eax + sarl $4, %eax + jle .L23 + + movaps -15 * SIZE(X), %xmm1 + movaps -13 * SIZE(X), %xmm2 + movaps -11 * SIZE(X), %xmm3 + movaps -9 * SIZE(X), %xmm4 + movaps -7 * SIZE(X), %xmm5 + movaps -5 * SIZE(X), %xmm6 + movaps -3 * SIZE(X), %xmm7 + + decl %eax + jle .L22 + ALIGN_4 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + SHUFPD_1 %xmm1, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + LOAD(-1 * SIZE, X, %xmm0) + + SHUFPD_1 %xmm2, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + LOAD( 1 * SIZE, X, %xmm1) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + SHUFPD_1 %xmm3, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + LOAD( 3 * SIZE, X, %xmm2) + + SHUFPD_1 %xmm4, %xmm3 + movaps %xmm3, -10 * SIZE(Y) + LOAD( 5 * SIZE, X, %xmm3) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + SHUFPD_1 %xmm5, %xmm4 + movaps %xmm4, -8 * SIZE(Y) + LOAD( 7 * SIZE, X, %xmm4) + + SHUFPD_1 %xmm6, %xmm5 + movaps %xmm5, -6 * SIZE(Y) + LOAD( 9 * SIZE, X, %xmm5) + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + SHUFPD_1 %xmm7, %xmm6 + movaps %xmm6, -4 * SIZE(Y) + LOAD(11 * SIZE, X, %xmm6) + + SHUFPD_1 %xmm0, %xmm7 + movaps %xmm7, -2 * SIZE(Y) + LOAD(13 * SIZE, X, %xmm7) + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + decl %eax + jg .L21 + ALIGN_3 + +.L22: + SHUFPD_1 %xmm1, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + LOAD(-1 * SIZE, X, %xmm0) + + SHUFPD_1 %xmm2, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + SHUFPD_1 %xmm3, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + SHUFPD_1 %xmm4, %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + SHUFPD_1 %xmm5, %xmm4 + movaps %xmm4, -8 * SIZE(Y) + SHUFPD_1 %xmm6, %xmm5 + movaps %xmm5, -6 * SIZE(Y) + + SHUFPD_1 %xmm7, %xmm6 + movaps %xmm6, -4 * SIZE(Y) + SHUFPD_1 %xmm0, %xmm7 + movaps %xmm7, -2 * SIZE(Y) + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + ALIGN_3 + +.L23: + testl $8, M + jle .L24 + ALIGN_3 + + movaps -15 * SIZE(X), %xmm1 
+ movaps -13 * SIZE(X), %xmm2 + movaps -11 * SIZE(X), %xmm3 + movaps -9 * SIZE(X), %xmm4 + + SHUFPD_1 %xmm1, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + SHUFPD_1 %xmm2, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + SHUFPD_1 %xmm3, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + + SHUFPD_1 %xmm4, %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + movaps %xmm4, %xmm0 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L24: + testl $4, M + jle .L25 + ALIGN_3 + + movaps -15 * SIZE(X), %xmm1 + movaps -13 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm1, %xmm0 + SHUFPD_1 %xmm2, %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -14 * SIZE(Y) + movaps %xmm2, %xmm0 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L25: + testl $2, M + jle .L26 + ALIGN_3 + + movaps -15 * SIZE(X), %xmm1 + SHUFPD_1 %xmm1, %xmm0 + + movaps %xmm0, -16 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L26: + testl $1, M + jle .L29 + ALIGN_3 + + movsd -16 * SIZE(X), %xmm0 + movsd %xmm0, -16 * SIZE(Y) + ALIGN_3 + +.L29: + popl %ebx + popl %esi + popl %edi + ret + ALIGN_3 + +#else + + movl M, %eax + sarl $4, %eax + jle .L23 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + movaps -12 * SIZE(X), %xmm2 + movaps -10 * SIZE(X), %xmm3 + movaps -8 * SIZE(X), %xmm4 + movaps -6 * SIZE(X), %xmm5 + movaps -4 * SIZE(X), %xmm6 + movaps -2 * SIZE(X), %xmm7 + + decl %eax + jle .L22 + ALIGN_3 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movlps %xmm0, -16 * SIZE(Y) + movhps %xmm0, -15 * SIZE(Y) + LOAD( 0 * SIZE, X, %xmm0) + movlps %xmm1, -14 * SIZE(Y) + movhps %xmm1, -13 * SIZE(Y) + LOAD( 2 * SIZE, X, %xmm1) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movlps %xmm2, -12 * SIZE(Y) + movhps %xmm2, -11 * SIZE(Y) + LOAD( 4 * SIZE, X, %xmm2) + movlps %xmm3, -10 * SIZE(Y) + movhps %xmm3, -9 * SIZE(Y) + LOAD( 6 * SIZE, X, %xmm3) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movlps %xmm4, -8 * SIZE(Y) + movhps %xmm4, -7 * SIZE(Y) + LOAD( 8 * SIZE, X, %xmm4) + movlps %xmm5, -6 * SIZE(Y) + movhps %xmm5, -5 * SIZE(Y) + LOAD(10 * SIZE, X, %xmm5) + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movlps %xmm6, -4 * SIZE(Y) + movhps %xmm6, -3 * SIZE(Y) + LOAD(12 * SIZE, X, %xmm6) + movlps %xmm7, -2 * SIZE(Y) + movhps %xmm7, -1 * SIZE(Y) + LOAD(14 * SIZE, X, %xmm7) + + subl $-16 * SIZE, Y + subl $-16 * SIZE, X + decl %eax + jg .L21 + ALIGN_3 + +.L22: + movlps %xmm0, -16 * SIZE(Y) + movhps %xmm0, -15 * SIZE(Y) + movlps %xmm1, -14 * SIZE(Y) + movhps %xmm1, -13 * SIZE(Y) + movlps %xmm2, -12 * SIZE(Y) + movhps %xmm2, -11 * SIZE(Y) + movlps %xmm3, -10 * SIZE(Y) + movhps %xmm3, -9 * SIZE(Y) + movlps %xmm4, -8 * SIZE(Y) + movhps %xmm4, -7 * SIZE(Y) + movlps %xmm5, -6 * SIZE(Y) + movhps %xmm5, -5 * SIZE(Y) + movlps %xmm6, -4 * SIZE(Y) + movhps %xmm6, -3 * SIZE(Y) + movlps %xmm7, -2 * SIZE(Y) + movhps %xmm7, -1 * SIZE(Y) + + subl $-16 * SIZE, Y + subl $-16 * SIZE, X + ALIGN_3 + +.L23: + testl $8, M + jle .L24 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movlps %xmm0, -16 * SIZE(Y) + movhps %xmm0, -15 * SIZE(Y) + movaps -14 * SIZE(X), %xmm1 + movlps %xmm1, -14 * SIZE(Y) + movhps %xmm1, -13 * SIZE(Y) + movaps -12 * SIZE(X), %xmm2 + movlps %xmm2, -12 * SIZE(Y) + movhps %xmm2, -11 * SIZE(Y) + movaps -10 * SIZE(X), %xmm3 + movlps %xmm3, -10 * SIZE(Y) + movhps %xmm3, -9 * SIZE(Y) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L24: + testl $4, M + jle .L25 + ALIGN_3 + + movaps 
-16 * SIZE(X), %xmm0 + movlps %xmm0, -16 * SIZE(Y) + movhps %xmm0, -15 * SIZE(Y) + movaps -14 * SIZE(X), %xmm1 + movlps %xmm1, -14 * SIZE(Y) + movhps %xmm1, -13 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L25: + testl $2, M + jle .L26 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movlps %xmm0, -16 * SIZE(Y) + movhps %xmm0, -15 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L26: + testl $1, M + jle .L29 + ALIGN_3 + + movsd -16 * SIZE(X), %xmm0 + movsd %xmm0, -16 * SIZE(Y) + ALIGN_3 + +.L29: + popl %ebx + popl %esi + popl %edi + ret + ALIGN_3 + +#endif + +.L40: + movl M, %eax + sarl $3, %eax + jle .L45 + ALIGN_3 + +.L41: + movsd (X), %xmm0 + addl INCX, X + movhps (X), %xmm0 + addl INCX, X + movsd (X), %xmm1 + addl INCX, X + movhps (X), %xmm1 + addl INCX, X + movsd (X), %xmm2 + addl INCX, X + movhps (X), %xmm2 + addl INCX, X + movsd (X), %xmm3 + addl INCX, X + movhps (X), %xmm3 + addl INCX, X + + movlps %xmm0, (Y) + addl INCY, Y + movhps %xmm0, (Y) + addl INCY, Y + movlps %xmm1, (Y) + addl INCY, Y + movhps %xmm1, (Y) + addl INCY, Y + movlps %xmm2, (Y) + addl INCY, Y + movhps %xmm2, (Y) + addl INCY, Y + movlps %xmm3, (Y) + addl INCY, Y + movhps %xmm3, (Y) + addl INCY, Y + + decl %eax + jg .L41 + ALIGN_3 + +.L45: + movl M, %eax + andl $7, %eax + jle .L47 + ALIGN_3 + +.L46: + movsd (X), %xmm0 + addl INCX, X + movlps %xmm0, (Y) + addl INCY, Y + decl %eax + jg .L46 + ALIGN_3 + +.L47: + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86/cpuid.S b/kernel/x86/cpuid.S new file mode 100644 index 0000000000..773b67dd9d --- /dev/null +++ b/kernel/x86/cpuid.S @@ -0,0 +1,64 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + + PROLOGUE + PROFCODE + + pushl %esi + pushl %ebx + + movl 12(%esp), %eax + cpuid + + movl 16(%esp), %esi + movl %eax, (%esi) + movl 20(%esp), %esi + movl %ebx, (%esi) + movl 24(%esp), %esi + movl %ecx, (%esi) + movl 28(%esp), %esi + movl %edx, (%esi) + + popl %ebx + popl %esi + ret + + EPILOGUE diff --git a/kernel/x86/dot.S b/kernel/x86/dot.S new file mode 100644 index 0000000000..5bd5d282e6 --- /dev/null +++ b/kernel/x86/dot.S @@ -0,0 +1,219 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define STACK_N 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) +#define STACK_Y 16 + STACK + ARGS(%esp) +#define STACK_INCY 20 + STACK + ARGS(%esp) + +#define N %ebx +#define X %esi +#define INCX %ecx +#define Y %edi +#define INCY %edx + + + PROLOGUE + + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + movl STACK_N, N + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + +#ifdef F_INTERFACE + movl (N),N + movl (INCX),INCX + movl (INCY),INCY +#endif + + leal (, INCX, SIZE), INCX + leal (, INCY, SIZE), INCY + + fldz + fldz + fldz + fldz + + cmpl $SIZE, INCX + jne .L14 + cmpl $SIZE, INCY + jne .L14 + + movl N, %eax + sarl $2, %eax + jle .L15 + ALIGN_3 + +.L16: + FLD 0 * SIZE(X) + FMUL 0 * SIZE(Y) + faddp %st,%st(1) + FLD 1 * SIZE(X) + FMUL 1 * SIZE(Y) + faddp %st,%st(2) + FLD 2 * SIZE(X) + FMUL 2 * SIZE(Y) + faddp %st,%st(3) + FLD 3 * SIZE(X) + FMUL 3 * SIZE(Y) + faddp %st,%st(4) + addl $4 * SIZE, X + addl $4 * SIZE, Y + decl %eax + jg .L16 + ALIGN_3 + +.L15: + movl N, %eax + andl $3, %eax + jle .L27 + ALIGN_3 + +.L22: + FLD (X) + addl $SIZE, X + FMUL (Y) + addl $SIZE, Y + faddp %st,%st(1) + decl %eax + jg .L22 + + jmp .L27 + ALIGN_3 + +.L14: +#ifdef F_INTERFACE + testl INCX, INCX + jge .L28 + + movl N, %eax + decl %eax + imull INCX, %eax + subl %eax, X + ALIGN_3 + +.L28: + testl INCY, INCY + jge .L29 + + movl N, %eax + decl %eax + imull INCY, %eax + subl %eax, Y + ALIGN_3 +.L29: +#endif + movl N, %eax + sarl $2, %eax + jle .L30 + ALIGN_3 + +.L31: + FLD (X) + addl INCX, X + FMUL (Y) + addl INCY, Y + faddp %st,%st(1) + + FLD (X) + addl INCX, X + FMUL (Y) + addl INCY, Y + faddp %st,%st(2) + + FLD (X) + addl INCX, X + FMUL (Y) + addl INCY, Y + faddp %st,%st(3) + + FLD (X) + addl INCX, X + FMUL (Y) + addl INCY, Y + faddp %st,%st(4) + + decl %eax + jg .L31 + ALIGN_3 + +.L30: + movl N, %eax + andl $3, %eax + jle .L27 + ALIGN_3 + +.L37: + FLD (X) + addl INCX, X + FMUL (Y) + addl INCY, Y + faddp %st, %st(1) + decl %eax + jg .L37 + ALIGN_3 + +.L27: + faddp %st,%st(2) + faddp %st,%st(2) + faddp %st,%st(1) + + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86/dot_amd.S b/kernel/x86/dot_amd.S new file mode 100644 index 0000000000..75ad36ee6b --- /dev/null +++ b/kernel/x86/dot_amd.S @@ -0,0 +1,236 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define STACK_N 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) +#define STACK_Y 16 + STACK + ARGS(%esp) +#define STACK_INCY 20 + STACK + ARGS(%esp) + + PROLOGUE + + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + +#define N %ebx +#define X %esi +#define INCX %ecx +#define Y %edi +#define INCY %edx + + movl STACK_N, N + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + +#ifdef F_INTERFACE + movl (N),N + movl (INCX),INCX + movl (INCY),INCY +#endif + + leal (, INCX, SIZE), INCX + leal (, INCY, SIZE), INCY + + fldz + fldz + fldz + fldz + + cmpl $SIZE, INCX + jne .L14 + cmpl $SIZE, INCY + jne .L14 + + movl N, %eax + sarl $3, %eax + jle .L15 + FLD 0 * SIZE(X) + ALIGN_4 + +.L16: + FLD 1 * SIZE(X) + FMUL 1 * SIZE(Y) + faddp %st,%st(2) + FMUL 0 * SIZE(Y) + faddp %st,%st(2) + FLD 2 * SIZE(X) + FLD 3 * SIZE(X) + FMUL 3 * SIZE(Y) + faddp %st,%st(4) + FMUL 2 * SIZE(Y) + faddp %st,%st(4) + FLD 4 * SIZE(X) + + FLD 5 * SIZE(X) + FMUL 5 * SIZE(Y) + faddp %st,%st(2) + FMUL 4 * SIZE(Y) + faddp %st,%st(2) + FLD 6 * SIZE(X) + FLD 7 * SIZE(X) + FMUL 7 * SIZE(Y) + faddp %st,%st(4) + FMUL 6 * SIZE(Y) + faddp %st,%st(4) + FLD 8 * SIZE(X) + + prefetch 16 * SIZE(X) + addl $8 * SIZE, X + addl $8 * SIZE, Y + decl %eax + jg .L16 + + ffreep %st(0) + ALIGN_3 + +.L15: + movl N, %eax + andl $7, %eax + jle .L27 + ALIGN_3 + +.L22: + FLD (X) + addl $SIZE, X + FMUL (Y) + addl $SIZE, Y + faddp %st,%st(1) + decl %eax + jg .L22 + + jmp .L27 + ALIGN_3 + +.L14: +#ifdef F_INTERFACE + testl INCX, INCX + jge .L28 + + movl N, %eax + decl %eax + imull INCX, %eax + subl %eax, X + ALIGN_3 + +.L28: + testl INCY, INCY + jge .L29 + + movl N, %eax + decl %eax + imull INCY, %eax + subl %eax, Y + ALIGN_3 +.L29: +#endif + movl N, %eax + sarl $2, %eax + jle .L30 + ALIGN_3 + +.L31: + FLD (X) + addl INCX, X + FMUL (Y) + addl INCY, Y + faddp %st,%st(1) + + FLD (X) + addl INCX, X + FMUL (Y) + addl INCY, Y + faddp %st,%st(2) + + FLD (X) + addl INCX, X + FMUL (Y) + addl INCY, Y + faddp %st,%st(3) + + FLD (X) + addl INCX, X + FMUL (Y) + addl INCY, Y + faddp %st,%st(4) + + decl %eax + jg .L31 + ALIGN_3 + +.L30: + movl 
N, %eax + andl $3, %eax + jle .L27 + ALIGN_3 + +.L37: + FLD (X) + addl INCX, X + FMUL (Y) + addl INCY, Y + faddp %st, %st(1) + decl %eax + jg .L37 + ALIGN_3 + +.L27: + faddp %st,%st(2) + faddp %st,%st(2) + faddp %st,%st(1) + + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86/dot_sse.S b/kernel/x86/dot_sse.S new file mode 100644 index 0000000000..1811921192 --- /dev/null +++ b/kernel/x86/dot_sse.S @@ -0,0 +1,1320 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define STACK_N 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) +#define STACK_Y 16 + STACK + ARGS(%esp) +#define STACK_INCY 20 + STACK + ARGS(%esp) + +#define N %ecx +#define X %esi +#define INCX %ebx +#define Y %edi +#define INCY %edx + +#include "l1param.h" + + PROLOGUE + PROFCODE + + pushl %edi + pushl %esi + pushl %ebx + + movl STACK_N, N + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + +#ifdef F_INTERFACE + movl (N), N # N + movl (INCX),INCX # INCX + movl (INCY),INCY # INCY +#endif + + leal (, INCX, SIZE), INCX + leal (, INCY, SIZE), INCY + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + + cmpl $0, N + jle .L999 + + cmpl $SIZE, INCX + jne .L50 + cmpl $SIZE, INCY + jne .L50 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + + cmpl $3, N + jle .L17 + + testl $SIZE, Y + je .L05 + + movss -32 * SIZE(X), %xmm0 + mulss -32 * SIZE(Y), %xmm0 + addl $1 * SIZE, X + addl $1 * SIZE, Y + decl N + ALIGN_2 + +.L05: + testl $2 * SIZE, Y + je .L10 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X), %xmm4 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(Y), %xmm1 + mulps %xmm4, %xmm1 + addl $2 * SIZE, X + addl $2 * SIZE, Y + subl $2, N + jle .L999 + ALIGN_2 + +.L10: +#ifdef ALIGNED_ACCESS + testl $2 * SIZE, X + jne .L30 + + testl $SIZE, X + jne .L20 +#else + testl $3 * SIZE, X + jne .L20 +#endif + + movl N, %eax + sarl $5, %eax + jle .L14 + + movaps -32 * SIZE(X), %xmm4 + movaps -28 * SIZE(X), %xmm5 + movaps -24 * SIZE(X), %xmm6 + movaps -20 * SIZE(X), %xmm7 + + decl %eax + jle .L12 + + ALIGN_3 + +.L11: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps -16 * SIZE(X), %xmm4 + + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + movaps -12 * SIZE(X), %xmm5 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + movaps -8 * SIZE(X), %xmm6 + + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + movaps -4 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + mulps -16 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps 0 * SIZE(X), %xmm4 + + mulps -12 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + movaps 4 * SIZE(X), %xmm5 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + mulps -8 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + movaps 8 * SIZE(X), %xmm6 + + mulps -4 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + movaps 12 * SIZE(X), %xmm7 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + + decl %eax + jg .L11 + ALIGN_3 + +.L12: + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps -16 * SIZE(X), %xmm4 + + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + movaps -12 * SIZE(X), %xmm5 + + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + movaps -8 * SIZE(X), %xmm6 + + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + movaps -4 * SIZE(X), %xmm7 + + mulps -16 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + mulps -12 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + mulps -8 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + mulps -4 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L14: + testl $31, N + jle .L999 + + testl $16, N + jle 
.L15 + + movaps -32 * SIZE(X), %xmm4 + movaps -28 * SIZE(X), %xmm5 + + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + + movaps -24 * SIZE(X), %xmm6 + movaps -20 * SIZE(X), %xmm7 + + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L15: + testl $8, N + jle .L16 + + movaps -32 * SIZE(X), %xmm4 + movaps -28 * SIZE(X), %xmm5 + + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L16: + testl $4, N + jle .L17 + + movaps -32 * SIZE(X), %xmm4 + mulps -32 * SIZE(Y), %xmm4 + + addps %xmm4, %xmm2 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L17: + testl $2, N + jle .L18 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X), %xmm4 +#ifdef movsd + xorps %xmm6, %xmm6 +#endif + movsd -32 * SIZE(Y), %xmm6 + + mulps %xmm6, %xmm4 + addps %xmm4, %xmm3 + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L18: + testl $1, N + jle .L999 + + movss -32 * SIZE(X), %xmm4 + mulss -32 * SIZE(Y), %xmm4 + addss %xmm4, %xmm0 + jmp .L999 + ALIGN_3 + +#ifdef ALIGNED_ACCESS +.L20: + + movaps -33 * SIZE(X), %xmm4 + addl $3 * SIZE, X + + movl N, %eax + sarl $5, %eax + jle .L24 + + movaps -32 * SIZE(X), %xmm5 + movaps -28 * SIZE(X), %xmm6 + movaps -24 * SIZE(X), %xmm7 + + decl %eax + jle .L22 + ALIGN_3 + +.L21: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm5, %xmm4 + PSHUFD1($0x39, %xmm4) + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(X), %xmm4 + + movss %xmm6, %xmm5 + PSHUFD1($0x39, %xmm5) + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + movaps -16 * SIZE(X), %xmm5 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm7, %xmm6 + PSHUFD1($0x39, %xmm6) + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + movaps -12 * SIZE(X), %xmm6 + + movss %xmm4, %xmm7 + PSHUFD1($0x39, %xmm7) + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + movaps -8 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm5, %xmm4 + PSHUFD1($0x39, %xmm4) + mulps -16 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps -4 * SIZE(X), %xmm4 + + movss %xmm6, %xmm5 + PSHUFD1($0x39, %xmm5) + mulps -12 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + movaps 0 * SIZE(X), %xmm5 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm7, %xmm6 + PSHUFD1($0x39, %xmm6) + mulps -8 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + movaps 4 * SIZE(X), %xmm6 + + movss %xmm4, %xmm7 + PSHUFD1($0x39, %xmm7) + mulps -4 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + movaps 8 * SIZE(X), %xmm7 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + + decl %eax + jg .L21 + ALIGN_3 + +.L22: + movss %xmm5, %xmm4 + PSHUFD1($0x39, %xmm4) + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(X), %xmm4 + + movss %xmm6, %xmm5 + PSHUFD1($0x39, %xmm5) + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + movaps -16 * SIZE(X), %xmm5 + + movss %xmm7, %xmm6 + PSHUFD1($0x39, %xmm6) + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + movaps -12 * SIZE(X), %xmm6 + + movss %xmm4, %xmm7 + PSHUFD1($0x39, %xmm7) + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + movaps -8 * SIZE(X), %xmm7 + + movss %xmm5, %xmm4 + PSHUFD1($0x39, %xmm4) + mulps -16 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps -4 * SIZE(X), %xmm4 
+ + movss %xmm6, %xmm5 + PSHUFD1($0x39, %xmm5) + mulps -12 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD1($0x39, %xmm6) + mulps -8 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + + movss %xmm4, %xmm7 + PSHUFD1($0x39, %xmm7) + mulps -4 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L24: + testl $31, N + jle .L999 + + testl $16, N + jle .L25 + + movaps -32 * SIZE(X), %xmm5 + movaps -28 * SIZE(X), %xmm6 + movaps -24 * SIZE(X), %xmm7 + + movss %xmm5, %xmm4 + PSHUFD1($0x39, %xmm4) + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(X), %xmm4 + + movss %xmm6, %xmm5 + PSHUFD1($0x39, %xmm5) + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD1($0x39, %xmm6) + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + + movss %xmm4, %xmm7 + PSHUFD1($0x39, %xmm7) + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L25: + testl $8, N + jle .L26 + + movaps -32 * SIZE(X), %xmm5 + movaps -28 * SIZE(X), %xmm6 + + movss %xmm5, %xmm4 + PSHUFD1($0x39, %xmm4) + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + + movss %xmm6, %xmm5 + PSHUFD1($0x39, %xmm5) + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + + movaps %xmm6, %xmm4 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L26: + testl $4, N + jle .L27 + + movaps -32 * SIZE(X), %xmm5 + movss %xmm5, %xmm4 + PSHUFD1($0x39, %xmm4) + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm2 + movaps %xmm5, %xmm4 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L27: + testl $2, N + jle .L28 + +#ifdef movsd + xorps %xmm6, %xmm6 +#endif + movsd -32 * SIZE(Y), %xmm6 + + PSHUFD2($0x39, %xmm4, %xmm5) + + mulps %xmm6, %xmm5 + addps %xmm5, %xmm3 + movhlps %xmm4, %xmm4 + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L28: + testl $1, N + jle .L999 + + PSHUFD1($0x39, %xmm4) + mulss -32 * SIZE(Y), %xmm4 + addss %xmm4, %xmm0 + jmp .L999 + ALIGN_3 + +.L30: + testl $SIZE, X + jne .L40 + + movhps -32 * SIZE(X), %xmm4 + addl $2 * SIZE, X + + movl N, %eax + sarl $5, %eax + jle .L34 + + movaps -32 * SIZE(X), %xmm5 + movaps -28 * SIZE(X), %xmm6 + movaps -24 * SIZE(X), %xmm7 + + decl %eax + jle .L32 + + ALIGN_3 + +.L31: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + SHUFPD_1 %xmm5, %xmm4 + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(X), %xmm4 + + SHUFPD_1 %xmm6, %xmm5 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + movaps -16 * SIZE(X), %xmm5 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + SHUFPD_1 %xmm7, %xmm6 + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + movaps -12 * SIZE(X), %xmm6 + + SHUFPD_1 %xmm4, %xmm7 + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + movaps -8 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + SHUFPD_1 %xmm5, %xmm4 + mulps -16 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps -4 * SIZE(X), %xmm4 + + SHUFPD_1 %xmm6, %xmm5 + mulps -12 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + movaps 0 * SIZE(X), %xmm5 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + SHUFPD_1 %xmm7, %xmm6 + mulps -8 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + movaps 4 * SIZE(X), %xmm6 + + SHUFPD_1 %xmm4, %xmm7 + mulps -4 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + movaps 8 * SIZE(X), %xmm7 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + + decl %eax + jg .L31 + ALIGN_3 + +.L32: + SHUFPD_1 %xmm5, %xmm4 + mulps -32 * 
SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(X), %xmm4 + + SHUFPD_1 %xmm6, %xmm5 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + movaps -16 * SIZE(X), %xmm5 + + SHUFPD_1 %xmm7, %xmm6 + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + movaps -12 * SIZE(X), %xmm6 + + SHUFPD_1 %xmm4, %xmm7 + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + movaps -8 * SIZE(X), %xmm7 + + SHUFPD_1 %xmm5, %xmm4 + mulps -16 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps -4 * SIZE(X), %xmm4 + + SHUFPD_1 %xmm6, %xmm5 + mulps -12 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + + SHUFPD_1 %xmm7, %xmm6 + mulps -8 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + + SHUFPD_1 %xmm4, %xmm7 + mulps -4 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L34: + testl $31, N + jle .L999 + + testl $16, N + jle .L35 + + movaps -32 * SIZE(X), %xmm5 + movaps -28 * SIZE(X), %xmm6 + movaps -24 * SIZE(X), %xmm7 + + SHUFPD_1 %xmm5, %xmm4 + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(X), %xmm4 + + SHUFPD_1 %xmm6, %xmm5 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + + SHUFPD_1 %xmm7, %xmm6 + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + + SHUFPD_1 %xmm4, %xmm7 + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L35: + testl $8, N + jle .L36 + + movaps -32 * SIZE(X), %xmm5 + movaps -28 * SIZE(X), %xmm6 + + SHUFPD_1 %xmm5, %xmm4 + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + SHUFPD_1 %xmm6, %xmm5 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + movaps %xmm6, %xmm4 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L36: + testl $4, N + jle .L37 + + movaps -32 * SIZE(X), %xmm5 + + SHUFPD_1 %xmm5, %xmm4 + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps %xmm5, %xmm4 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L37: + testl $2, N + jle .L38 + + xorps %xmm5, %xmm5 + movhlps %xmm4, %xmm5 + + mulps -32 * SIZE(Y), %xmm5 + addps %xmm5, %xmm0 + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L38: + testl $1, N + jle .L999 + + movss -34 * SIZE(X), %xmm4 + mulss -32 * SIZE(Y), %xmm4 + addss %xmm4, %xmm0 + jmp .L999 + ALIGN_3 + +.L40: + movaps -35 * SIZE(X), %xmm4 + addl $SIZE, X + + movl N, %eax + sarl $5, %eax + jle .L44 + + movaps -32 * SIZE(X), %xmm5 + movaps -28 * SIZE(X), %xmm6 + movaps -24 * SIZE(X), %xmm7 + + decl %eax + jle .L42 + + ALIGN_3 + +.L41: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(X), %xmm4 + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + movaps -16 * SIZE(X), %xmm5 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + movaps -12 * SIZE(X), %xmm6 + + movss %xmm4, %xmm7 + shufps $0x93, %xmm4, %xmm7 + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + movaps -8 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + mulps -16 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps -4 * SIZE(X), %xmm4 + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + mulps -12 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + movaps 0 * SIZE(X), %xmm5 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss 
%xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + mulps -8 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + movaps 4 * SIZE(X), %xmm6 + + movss %xmm4, %xmm7 + shufps $0x93, %xmm4, %xmm7 + mulps -4 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + movaps 8 * SIZE(X), %xmm7 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + + decl %eax + jg .L41 + ALIGN_3 + +.L42: + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(X), %xmm4 + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + movaps -16 * SIZE(X), %xmm5 + + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + movaps -12 * SIZE(X), %xmm6 + + movss %xmm4, %xmm7 + shufps $0x93, %xmm4, %xmm7 + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + movaps -8 * SIZE(X), %xmm7 + + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + mulps -16 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps -4 * SIZE(X), %xmm4 + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + mulps -12 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + mulps -8 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + + movss %xmm4, %xmm7 + shufps $0x93, %xmm4, %xmm7 + mulps -4 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L44: + testl $31, N + jle .L999 + + testl $16, N + jle .L45 + + movaps -32 * SIZE(X), %xmm5 + movaps -28 * SIZE(X), %xmm6 + movaps -24 * SIZE(X), %xmm7 + + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(X), %xmm4 + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + + movss %xmm4, %xmm7 + shufps $0x93, %xmm4, %xmm7 + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L45: + testl $8, N + jle .L46 + + movaps -32 * SIZE(X), %xmm5 + movaps -28 * SIZE(X), %xmm6 + + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + + movaps %xmm6, %xmm4 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L46: + testl $4, N + jle .L47 + + movaps -32 * SIZE(X), %xmm5 + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm2 + movaps %xmm5, %xmm4 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L47: + testl $2, N + jle .L48 + + movaps -32 * SIZE(X), %xmm5 +#ifdef movsd + xorps %xmm7, %xmm7 +#endif + movsd -32 * SIZE(Y), %xmm7 + + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + + mulps %xmm7, %xmm4 + addps %xmm4, %xmm3 + movlhps %xmm5, %xmm4 + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L48: + testl $1, N + jle .L999 + + PSHUFD1($0x93, %xmm4) + mulss -32 * SIZE(Y), %xmm4 + addss %xmm4, %xmm0 + jmp .L999 + ALIGN_4 + +#else +.L20: + movl N, %eax + sarl $5, %eax + jle .L24 + + movlps -32 * SIZE(X), %xmm4 + movhps -30 * SIZE(X), %xmm4 + movlps -28 * SIZE(X), %xmm5 + movhps -26 * SIZE(X), %xmm5 + movlps -24 * SIZE(X), %xmm6 + movhps -22 * SIZE(X), %xmm6 + movlps -20 * SIZE(X), %xmm7 + movhps -18 * SIZE(X), %xmm7 + + decl %eax + jle .L22 + + ALIGN_3 + +.L21: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movlps -16 * SIZE(X), 
%xmm4 + movhps -14 * SIZE(X), %xmm4 + + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + movlps -12 * SIZE(X), %xmm5 + movhps -10 * SIZE(X), %xmm5 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + movlps -8 * SIZE(X), %xmm6 + movhps -6 * SIZE(X), %xmm6 + + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + movlps -4 * SIZE(X), %xmm7 + movhps -2 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + mulps -16 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movlps 0 * SIZE(X), %xmm4 + movhps 2 * SIZE(X), %xmm4 + + mulps -12 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + movlps 4 * SIZE(X), %xmm5 + movhps 6 * SIZE(X), %xmm5 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + mulps -8 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + movlps 8 * SIZE(X), %xmm6 + movhps 10 * SIZE(X), %xmm6 + + mulps -4 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + movlps 12 * SIZE(X), %xmm7 + movhps 14 * SIZE(X), %xmm7 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + + decl %eax + jg .L21 + ALIGN_3 + +.L22: + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movlps -16 * SIZE(X), %xmm4 + movhps -14 * SIZE(X), %xmm4 + + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + movlps -12 * SIZE(X), %xmm5 + movhps -10 * SIZE(X), %xmm5 + + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + movlps -8 * SIZE(X), %xmm6 + movhps -6 * SIZE(X), %xmm6 + + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + movlps -4 * SIZE(X), %xmm7 + movhps -2 * SIZE(X), %xmm7 + + mulps -16 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + mulps -12 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + mulps -8 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + mulps -4 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L24: + testl $31, N + jle .L999 + + testl $16, N + jle .L25 + + movlps -32 * SIZE(X), %xmm4 + movhps -30 * SIZE(X), %xmm4 + movlps -28 * SIZE(X), %xmm5 + movhps -26 * SIZE(X), %xmm5 + movlps -24 * SIZE(X), %xmm6 + movhps -22 * SIZE(X), %xmm6 + movlps -20 * SIZE(X), %xmm7 + movhps -18 * SIZE(X), %xmm7 + + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L25: + testl $8, N + jle .L26 + + movlps -32 * SIZE(X), %xmm4 + movhps -30 * SIZE(X), %xmm4 + movlps -28 * SIZE(X), %xmm5 + movhps -26 * SIZE(X), %xmm5 + + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L26: + testl $4, N + jle .L27 + + movlps -32 * SIZE(X), %xmm4 + movhps -30 * SIZE(X), %xmm4 + mulps -32 * SIZE(Y), %xmm4 + + addps %xmm4, %xmm2 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L27: + testl $2, N + jle .L28 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X), %xmm4 +#ifdef movsd + xorps %xmm6, %xmm6 +#endif + movsd -32 * SIZE(Y), %xmm6 + + mulps %xmm6, %xmm4 + addps %xmm4, %xmm3 + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L28: + testl $1, N + jle .L999 + + movss -32 * SIZE(X), %xmm4 + mulss -32 * SIZE(Y), %xmm4 + addss %xmm4, %xmm0 + jmp .L999 + ALIGN_3 +#endif + +.L50: + movl N, %eax + sarl $2, %eax + jle .L55 + ALIGN_3 + +.L53: + movss 0 * SIZE(X), %xmm4 + addl INCX, X + mulss 0 * SIZE(Y), %xmm4 + addl INCY, Y + movss 0 * SIZE(X), %xmm5 + addl INCX, X + 
mulss 0 * SIZE(Y), %xmm5 + addl INCY, Y + movss 0 * SIZE(X), %xmm6 + addl INCX, X + mulss 0 * SIZE(Y), %xmm6 + addl INCY, Y + movss 0 * SIZE(X), %xmm7 + addl INCX, X + mulss 0 * SIZE(Y), %xmm7 + addl INCY, Y + + addss %xmm4, %xmm0 + addss %xmm5, %xmm1 + addss %xmm6, %xmm2 + addss %xmm7, %xmm3 + + decl %eax + jg .L53 + ALIGN_3 + +.L55: + movl N, %eax + andl $3, %eax + jle .L999 + ALIGN_3 + +.L56: + movss 0 * SIZE(X), %xmm4 + addl INCX, X + mulss 0 * SIZE(Y), %xmm4 + addl INCY, Y + addss %xmm4, %xmm0 + decl %eax + jg .L56 + ALIGN_3 + +.L999: + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + addps %xmm2, %xmm0 + +#if defined(HAVE_SSE3) && !defined(__INTERIX) + haddps %xmm0, %xmm0 + haddps %xmm0, %xmm0 +#elif defined(HAVE_SSE2) + movhlps %xmm0, %xmm1 + addps %xmm1, %xmm0 + + PSHUFD2($1, %xmm0, %xmm1) + addss %xmm1, %xmm0 +#else + movhlps %xmm0, %xmm1 + addps %xmm1, %xmm0 + + movaps %xmm0, %xmm1 + shufps $1, %xmm0, %xmm0 + addss %xmm1, %xmm0 +#endif + + movss %xmm0, STACK_N + flds STACK_N + + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86/dot_sse2.S b/kernel/x86/dot_sse2.S new file mode 100644 index 0000000000..f2053d2eae --- /dev/null +++ b/kernel/x86/dot_sse2.S @@ -0,0 +1,728 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define STACK_N 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) +#define STACK_Y 16 + STACK + ARGS(%esp) +#define STACK_INCY 20 + STACK + ARGS(%esp) + +#define N %ecx +#define X %esi +#define INCX %ebx +#define Y %edi +#define INCY %edx + +#include "l1param.h" + + PROLOGUE + PROFCODE + + pushl %edi + pushl %esi + pushl %ebx + + movl STACK_N, N + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + + leal (, INCX, SIZE), INCX + leal (, INCY, SIZE), INCY + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + + cmpl $0, N + jle .L999 + + cmpl $SIZE, INCX + jne .L50 + cmpl $SIZE, INCY + jne .L50 + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + + testl $SIZE, Y + je .L10 + + movsd -16 * SIZE(X), %xmm0 + mulsd -16 * SIZE(Y), %xmm0 + addl $1 * SIZE, X + addl $1 * SIZE, Y + decl N + ALIGN_2 + +.L10: + testl $SIZE, X + jne .L20 + + movl N, %eax + sarl $4, %eax + jle .L14 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + movaps -12 * SIZE(X), %xmm6 + movaps -10 * SIZE(X), %xmm7 + + decl %eax + jle .L12 + ALIGN_3 + +.L11: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + movaps -8 * SIZE(X), %xmm4 + + mulpd -14 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + movaps -6 * SIZE(X), %xmm5 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + mulpd -12 * SIZE(Y), %xmm6 + addpd %xmm6, %xmm2 + movaps -4 * SIZE(X), %xmm6 + + mulpd -10 * SIZE(Y), %xmm7 + addpd %xmm7, %xmm3 + movaps -2 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + mulpd -8 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + movaps 0 * SIZE(X), %xmm4 + + mulpd -6 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + movaps 2 * SIZE(X), %xmm5 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + mulpd -4 * SIZE(Y), %xmm6 + addpd %xmm6, %xmm2 + movaps 4 * SIZE(X), %xmm6 + + mulpd -2 * SIZE(Y), %xmm7 + addpd %xmm7, %xmm3 + movaps 6 * SIZE(X), %xmm7 + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + + decl %eax + jg .L11 + ALIGN_3 + +.L12: + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + movaps -8 * SIZE(X), %xmm4 + + mulpd -14 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + movaps -6 * SIZE(X), %xmm5 + + mulpd -12 * SIZE(Y), %xmm6 + addpd %xmm6, %xmm2 + movaps -4 * SIZE(X), %xmm6 + + mulpd -10 * SIZE(Y), %xmm7 + addpd %xmm7, %xmm3 + movaps -2 * SIZE(X), %xmm7 + + mulpd -8 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + mulpd -6 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + mulpd -4 * SIZE(Y), %xmm6 + addpd %xmm6, %xmm2 + mulpd -2 * SIZE(Y), %xmm7 + addpd %xmm7, %xmm3 + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + ALIGN_3 + +.L14: + testl $15, N + jle .L999 + + testl $8, N + jle .L15 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + movaps -12 * SIZE(X), %xmm6 + movaps -10 * SIZE(X), %xmm7 + + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + mulpd -14 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + mulpd -12 * SIZE(Y), %xmm6 + addpd %xmm6, %xmm2 + mulpd -10 * SIZE(Y), %xmm7 + addpd %xmm7, %xmm3 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L15: + testl $4, N + jle .L16 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, 
%xmm0 + mulpd -14 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L16: + testl $2, N + jle .L17 + + movaps -16 * SIZE(X), %xmm4 + + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L17: + testl $1, N + jle .L999 + + movsd -16 * SIZE(X), %xmm4 + mulsd -16 * SIZE(Y), %xmm4 + addsd %xmm4, %xmm0 + jmp .L999 + ALIGN_3 + +.L20: + +#ifdef ALIGNED_ACCESS + + movhps -16 * SIZE(X), %xmm4 + addl $SIZE, X + + movl N, %eax + sarl $4, %eax + jle .L24 + + movaps -16 * SIZE(X), %xmm5 + movaps -14 * SIZE(X), %xmm6 + movaps -12 * SIZE(X), %xmm7 + + decl %eax + jle .L22 + + ALIGN_3 + +.L21: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + SHUFPD_1 %xmm5, %xmm4 + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + movaps -10 * SIZE(X), %xmm4 + + SHUFPD_1 %xmm6, %xmm5 + mulpd -14 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + movaps -8 * SIZE(X), %xmm5 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + SHUFPD_1 %xmm7, %xmm6 + mulpd -12 * SIZE(Y), %xmm6 + addpd %xmm6, %xmm2 + movaps -6 * SIZE(X), %xmm6 + + SHUFPD_1 %xmm4, %xmm7 + mulpd -10 * SIZE(Y), %xmm7 + addpd %xmm7, %xmm3 + movaps -4 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + SHUFPD_1 %xmm5, %xmm4 + mulpd -8 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + movaps -2 * SIZE(X), %xmm4 + + SHUFPD_1 %xmm6, %xmm5 + mulpd -6 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + movaps 0 * SIZE(X), %xmm5 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + SHUFPD_1 %xmm7, %xmm6 + mulpd -4 * SIZE(Y), %xmm6 + addpd %xmm6, %xmm2 + movaps 2 * SIZE(X), %xmm6 + + SHUFPD_1 %xmm4, %xmm7 + mulpd -2 * SIZE(Y), %xmm7 + addpd %xmm7, %xmm3 + movaps 4 * SIZE(X), %xmm7 + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + + decl %eax + jg .L21 + ALIGN_3 + +.L22: + SHUFPD_1 %xmm5, %xmm4 + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + movaps -10 * SIZE(X), %xmm4 + + SHUFPD_1 %xmm6, %xmm5 + mulpd -14 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + movaps -8 * SIZE(X), %xmm5 + + SHUFPD_1 %xmm7, %xmm6 + mulpd -12 * SIZE(Y), %xmm6 + addpd %xmm6, %xmm2 + movaps -6 * SIZE(X), %xmm6 + + SHUFPD_1 %xmm4, %xmm7 + mulpd -10 * SIZE(Y), %xmm7 + addpd %xmm7, %xmm3 + movaps -4 * SIZE(X), %xmm7 + + SHUFPD_1 %xmm5, %xmm4 + mulpd -8 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + movaps -2 * SIZE(X), %xmm4 + + SHUFPD_1 %xmm6, %xmm5 + mulpd -6 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + + SHUFPD_1 %xmm7, %xmm6 + mulpd -4 * SIZE(Y), %xmm6 + addpd %xmm6, %xmm2 + + SHUFPD_1 %xmm4, %xmm7 + mulpd -2 * SIZE(Y), %xmm7 + addpd %xmm7, %xmm3 + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + ALIGN_3 + +.L24: + testl $15, N + jle .L999 + + testl $8, N + jle .L25 + + movaps -16 * SIZE(X), %xmm5 + movaps -14 * SIZE(X), %xmm6 + movaps -12 * SIZE(X), %xmm7 + + SHUFPD_1 %xmm5, %xmm4 + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + movaps -10 * SIZE(X), %xmm4 + + SHUFPD_1 %xmm6, %xmm5 + mulpd -14 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + + SHUFPD_1 %xmm7, %xmm6 + mulpd -12 * SIZE(Y), %xmm6 + addpd %xmm6, %xmm2 + + SHUFPD_1 %xmm4, %xmm7 + mulpd -10 * SIZE(Y), %xmm7 + addpd %xmm7, %xmm3 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L25: + testl $4, N + jle .L26 + + movaps -16 * SIZE(X), %xmm5 + movaps -14 * SIZE(X), %xmm6 + + SHUFPD_1 %xmm5, %xmm4 + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + SHUFPD_1 %xmm6, %xmm5 + mulpd -14 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + movapd %xmm6, 
%xmm4 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L26: + testl $2, N + jle .L27 + + movaps -16 * SIZE(X), %xmm5 + + SHUFPD_1 %xmm5, %xmm4 + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + movapd %xmm5, %xmm4 + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L27: + testl $1, N + jle .L999 + + SHUFPD_1 %xmm4, %xmm4 + mulsd -16 * SIZE(Y), %xmm4 + addsd %xmm4, %xmm0 + jmp .L999 + ALIGN_3 + +#else + + movl N, %eax + sarl $4, %eax + jle .L24 + + movlps -16 * SIZE(X), %xmm4 + movhps -15 * SIZE(X), %xmm4 + movlps -14 * SIZE(X), %xmm5 + movhps -13 * SIZE(X), %xmm5 + movlps -12 * SIZE(X), %xmm6 + movhps -11 * SIZE(X), %xmm6 + movlps -10 * SIZE(X), %xmm7 + movhps -9 * SIZE(X), %xmm7 + + decl %eax + jle .L22 + + ALIGN_3 + +.L21: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + movlps -8 * SIZE(X), %xmm4 + movhps -7 * SIZE(X), %xmm4 + + mulpd -14 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + movlps -6 * SIZE(X), %xmm5 + movhps -5 * SIZE(X), %xmm5 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + mulpd -12 * SIZE(Y), %xmm6 + addpd %xmm6, %xmm2 + movlps -4 * SIZE(X), %xmm6 + movhps -3 * SIZE(X), %xmm6 + + mulpd -10 * SIZE(Y), %xmm7 + addpd %xmm7, %xmm3 + movlps -2 * SIZE(X), %xmm7 + movhps -1 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + mulpd -8 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + movlps 0 * SIZE(X), %xmm4 + movhps 1 * SIZE(X), %xmm4 + + mulpd -6 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + movlps 2 * SIZE(X), %xmm5 + movhps 3 * SIZE(X), %xmm5 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + mulpd -4 * SIZE(Y), %xmm6 + addpd %xmm6, %xmm2 + movlps 4 * SIZE(X), %xmm6 + movhps 5 * SIZE(X), %xmm6 + + mulpd -2 * SIZE(Y), %xmm7 + addpd %xmm7, %xmm3 + movlps 6 * SIZE(X), %xmm7 + movhps 7 * SIZE(X), %xmm7 + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + + decl %eax + jg .L21 + ALIGN_3 + +.L22: + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + movlps -8 * SIZE(X), %xmm4 + movhps -7 * SIZE(X), %xmm4 + + mulpd -14 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + movlps -6 * SIZE(X), %xmm5 + movhps -5 * SIZE(X), %xmm5 + + mulpd -12 * SIZE(Y), %xmm6 + addpd %xmm6, %xmm2 + movlps -4 * SIZE(X), %xmm6 + movhps -3 * SIZE(X), %xmm6 + + mulpd -10 * SIZE(Y), %xmm7 + addpd %xmm7, %xmm3 + movlps -2 * SIZE(X), %xmm7 + movhps -1 * SIZE(X), %xmm7 + + mulpd -8 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + mulpd -6 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + mulpd -4 * SIZE(Y), %xmm6 + addpd %xmm6, %xmm2 + mulpd -2 * SIZE(Y), %xmm7 + addpd %xmm7, %xmm3 + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + ALIGN_3 + +.L24: + testl $15, N + jle .L999 + + testl $8, N + jle .L25 + + movlps -16 * SIZE(X), %xmm4 + movhps -15 * SIZE(X), %xmm4 + movlps -14 * SIZE(X), %xmm5 + movhps -13 * SIZE(X), %xmm5 + movlps -12 * SIZE(X), %xmm6 + movhps -11 * SIZE(X), %xmm6 + movlps -10 * SIZE(X), %xmm7 + movhps -9 * SIZE(X), %xmm7 + + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + mulpd -14 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + mulpd -12 * SIZE(Y), %xmm6 + addpd %xmm6, %xmm2 + mulpd -10 * SIZE(Y), %xmm7 + addpd %xmm7, %xmm3 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L25: + testl $4, N + jle .L26 + + movlps -16 * SIZE(X), %xmm4 + movhps -15 * SIZE(X), %xmm4 + movlps -14 * SIZE(X), %xmm5 + movhps -13 * SIZE(X), %xmm5 + + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + mulpd -14 * SIZE(Y), %xmm5 + addpd %xmm5, 
%xmm1 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L26: + testl $2, N + jle .L27 + + movlps -16 * SIZE(X), %xmm4 + movhps -15 * SIZE(X), %xmm4 + + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L27: + testl $1, N + jle .L999 + + movsd -16 * SIZE(X), %xmm4 + mulsd -16 * SIZE(Y), %xmm4 + addsd %xmm4, %xmm0 + jmp .L999 + ALIGN_3 +#endif + +.L50: + movl N, %eax + sarl $2, %eax + jle .L55 + ALIGN_3 + +.L53: + movsd (X), %xmm4 + addl INCX, X + mulsd (Y), %xmm4 + addl INCY, Y + movsd (X), %xmm5 + addl INCX, X + mulsd (Y), %xmm5 + addl INCY, Y + movsd (X), %xmm6 + addl INCX, X + mulsd (Y), %xmm6 + addl INCY, Y + movsd (X), %xmm7 + addl INCX, X + mulsd (Y), %xmm7 + addl INCY, Y + + addsd %xmm4, %xmm0 + addsd %xmm5, %xmm1 + addsd %xmm6, %xmm2 + addsd %xmm7, %xmm3 + + decl %eax + jg .L53 + ALIGN_3 + +.L55: + movl N, %eax + andl $3, %eax + jle .L999 + ALIGN_3 + +.L56: + movsd (X), %xmm4 + addl INCX, X + mulsd (Y), %xmm4 + addl INCY, Y + addsd %xmm4, %xmm0 + decl %eax + jg .L56 + ALIGN_3 + +.L999: + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + +#ifndef HAVE_SSE3 + pshufd $0xe, %xmm0, %xmm1 + addsd %xmm1, %xmm0 +#else + haddpd %xmm0, %xmm0 +#endif + movlps %xmm0, STACK_N + fldl STACK_N + + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86/dot_sse2_opteron.S b/kernel/x86/dot_sse2_opteron.S new file mode 100644 index 0000000000..7ac059f636 --- /dev/null +++ b/kernel/x86/dot_sse2_opteron.S @@ -0,0 +1,368 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define STACK_N 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) +#define STACK_Y 16 + STACK + ARGS(%esp) +#define STACK_INCY 20 + STACK + ARGS(%esp) + +#define N %ecx +#define X %esi +#define INCX %ebx +#define Y %edi +#define INCY %edx + +#define PREFETCHSIZE 84 + + PROLOGUE + + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_N, N + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + +#ifdef F_INTERFACE + movl (N), N # N + movl (INCX),INCX # INCX + movl (INCY),INCY # INCY +#endif + + leal (, INCX, SIZE), INCX + leal (, INCY, SIZE), INCY + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + + cmpl $0, N + jle .L999 + + cmpl $SIZE, INCX + jne .L50 + cmpl $SIZE, INCY + jne .L50 + + testl $SIZE, Y + je .L10 + + movsd 0 * SIZE(X), %xmm0 + mulsd 0 * SIZE(Y), %xmm0 + addl $1 * SIZE, X + addl $1 * SIZE, Y + decl N + ALIGN_2 + +.L10: + movl N, %eax + sarl $4, %eax + jle .L24 + + movlpd 0 * SIZE(X), %xmm4 + movhpd 1 * SIZE(X), %xmm4 + movlpd 2 * SIZE(X), %xmm5 + movhpd 3 * SIZE(X), %xmm5 + movlpd 4 * SIZE(X), %xmm6 + movhpd 5 * SIZE(X), %xmm6 + movlpd 6 * SIZE(X), %xmm7 + movhpd 7 * SIZE(X), %xmm7 + + mulpd 0 * SIZE(Y), %xmm4 + mulpd 2 * SIZE(Y), %xmm5 + mulpd 4 * SIZE(Y), %xmm6 + mulpd 6 * SIZE(Y), %xmm7 + decl %eax + jle .L22 + + ALIGN_3 + +.L21: + prefetch (PREFETCHSIZE + 0) * SIZE(Y) + + addpd %xmm4, %xmm0 + movlpd 8 * SIZE(X), %xmm4 + movhpd 9 * SIZE(X), %xmm4 + addpd %xmm5, %xmm1 + movlpd 10 * SIZE(X), %xmm5 + movhpd 11 * SIZE(X), %xmm5 + addpd %xmm6, %xmm2 + movlpd 12 * SIZE(X), %xmm6 + movhpd 13 * SIZE(X), %xmm6 + addpd %xmm7, %xmm3 + movlpd 14 * SIZE(X), %xmm7 + movhpd 15 * SIZE(X), %xmm7 + + mulpd 8 * SIZE(Y), %xmm4 + mulpd 10 * SIZE(Y), %xmm5 + mulpd 12 * SIZE(Y), %xmm6 + mulpd 14 * SIZE(Y), %xmm7 + + prefetch (PREFETCHSIZE + 8) * SIZE(Y) + + addpd %xmm4, %xmm0 + movlpd 16 * SIZE(X), %xmm4 + movhpd 17 * SIZE(X), %xmm4 + addpd %xmm5, %xmm1 + movlpd 18 * SIZE(X), %xmm5 + movhpd 19 * SIZE(X), %xmm5 + addpd %xmm6, %xmm2 + movlpd 20 * SIZE(X), %xmm6 + movhpd 21 * SIZE(X), %xmm6 + addpd %xmm7, %xmm3 + movlpd 22 * SIZE(X), %xmm7 + movhpd 23 * SIZE(X), %xmm7 + + mulpd 16 * SIZE(Y), %xmm4 + mulpd 18 * SIZE(Y), %xmm5 + mulpd 20 * SIZE(Y), %xmm6 + mulpd 22 * SIZE(Y), %xmm7 + + addl $16 * SIZE, X + addl $16 * SIZE, Y + + decl %eax + jg .L21 + ALIGN_3 + +.L22: + addpd %xmm4, %xmm0 + movlpd 8 * SIZE(X), %xmm4 + movhpd 9 * SIZE(X), %xmm4 + addpd %xmm5, %xmm1 + movlpd 10 * SIZE(X), %xmm5 + movhpd 11 * SIZE(X), %xmm5 + addpd %xmm6, %xmm2 + movlpd 12 * SIZE(X), %xmm6 + movhpd 13 * SIZE(X), %xmm6 + addpd %xmm7, %xmm3 + movlpd 14 * SIZE(X), %xmm7 + movhpd 15 * SIZE(X), %xmm7 + + mulpd 8 * SIZE(Y), %xmm4 + mulpd 10 * SIZE(Y), %xmm5 + mulpd 12 * SIZE(Y), %xmm6 + mulpd 14 * SIZE(Y), %xmm7 + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm1 + addpd %xmm6, %xmm2 + addpd %xmm7, %xmm3 + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L24: + testl $15, N + jle .L999 + + testl $8, N + jle .L25 + + movlpd 0 * SIZE(X), %xmm4 + movhpd 1 * SIZE(X), %xmm4 + movlpd 2 * SIZE(X), %xmm5 + movhpd 3 * SIZE(X), %xmm5 + movlpd 4 * SIZE(X), %xmm6 + movhpd 5 * SIZE(X), %xmm6 + movlpd 6 * SIZE(X), %xmm7 + movhpd 7 * SIZE(X), %xmm7 + + mulpd 0 * SIZE(Y), %xmm4 + mulpd 2 * SIZE(Y), %xmm5 + mulpd 4 * SIZE(Y), %xmm6 + mulpd 6 * SIZE(Y), %xmm7 + + 
addpd %xmm4, %xmm0 + addpd %xmm5, %xmm1 + addpd %xmm6, %xmm2 + addpd %xmm7, %xmm3 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L25: + testl $4, N + jle .L26 + + movlpd 0 * SIZE(X), %xmm4 + movhpd 1 * SIZE(X), %xmm4 + movlpd 2 * SIZE(X), %xmm5 + movhpd 3 * SIZE(X), %xmm5 + mulpd 0 * SIZE(Y), %xmm4 + mulpd 2 * SIZE(Y), %xmm5 + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm1 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L26: + testl $2, N + jle .L27 + + movlpd 0 * SIZE(X), %xmm4 + movhpd 1 * SIZE(X), %xmm4 + mulpd 0 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L27: + testl $1, N + jle .L999 + + movsd 0 * SIZE(X), %xmm4 + mulsd 0 * SIZE(Y), %xmm4 + addsd %xmm4, %xmm0 + jmp .L999 + ALIGN_3 + + +.L50: +#ifdef F_INTERFACE + testl INCX, INCX + jge .L51 + + movl N, %eax + decl %eax + imull INCX, %eax + subl %eax, X + ALIGN_3 + +.L51: + testl INCY, INCY + jge .L52 + + movl N, %eax + decl %eax + imull INCY, %eax + subl %eax, Y + ALIGN_3 +.L52: +#endif + + movl N, %eax + sarl $2, %eax + jle .L55 + ALIGN_3 + +.L53: + movsd 0 * SIZE(X), %xmm4 + addl INCX, X + mulsd 0 * SIZE(Y), %xmm4 + addl INCY, Y + movsd 0 * SIZE(X), %xmm5 + addl INCX, X + mulsd 0 * SIZE(Y), %xmm5 + addl INCY, Y + movsd 0 * SIZE(X), %xmm6 + addl INCX, X + mulsd 0 * SIZE(Y), %xmm6 + addl INCY, Y + movsd 0 * SIZE(X), %xmm7 + addl INCX, X + mulsd 0 * SIZE(Y), %xmm7 + addl INCY, Y + + addsd %xmm4, %xmm0 + addsd %xmm5, %xmm1 + addsd %xmm6, %xmm2 + addsd %xmm7, %xmm3 + + decl %eax + jg .L53 + ALIGN_3 + +.L55: + movl N, %eax + andl $3, %eax + jle .L999 + ALIGN_3 + +.L56: + movsd 0 * SIZE(X), %xmm4 + addl INCX, X + mulsd 0 * SIZE(Y), %xmm4 + addl INCY, Y + addsd %xmm4, %xmm0 + decl %eax + jg .L56 + ALIGN_3 + +.L999: + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + +#if !defined(HAVE_SSE3) || defined(__INTERIX) + movapd %xmm0, %xmm1 + unpckhpd %xmm0, %xmm0 + addsd %xmm1, %xmm0 +#else + haddpd %xmm0, %xmm0 +#endif + + movsd %xmm0, STACK_N + fldl STACK_N + + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86/dot_sse_opteron.S b/kernel/x86/dot_sse_opteron.S new file mode 100644 index 0000000000..fc632193f3 --- /dev/null +++ b/kernel/x86/dot_sse_opteron.S @@ -0,0 +1,411 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define STACK_N 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) +#define STACK_Y 16 + STACK + ARGS(%esp) +#define STACK_INCY 20 + STACK + ARGS(%esp) + +#define N %ecx +#define X %esi +#define INCX %ebx +#define Y %edi +#define INCY %edx + +#define PREFETCHSIZE 84 + + PROLOGUE + + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_N, N + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + +#ifdef F_INTERFACE + movl (N), N # N + movl (INCX),INCX # INCX + movl (INCY),INCY # INCY +#endif + + leal (, INCX, SIZE), INCX + leal (, INCY, SIZE), INCY + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + + cmpl $0, N + jle .L999 + + cmpl $SIZE, INCX + jne .L50 + cmpl $SIZE, INCY + jne .L50 + + cmpl $3, N + jle .L27 + + testl $SIZE, Y + je .L05 + + movss 0 * SIZE(X), %xmm0 + mulss 0 * SIZE(Y), %xmm0 + addl $1 * SIZE, X + addl $1 * SIZE, Y + decl N + ALIGN_2 + +.L05: + testl $2 * SIZE, Y + je .L10 + + movss 0 * SIZE(X), %xmm4 + movss 1 * SIZE(X), %xmm5 + + mulss 0 * SIZE(Y), %xmm4 + mulss 1 * SIZE(Y), %xmm5 + + addss %xmm4, %xmm1 + addss %xmm5, %xmm2 + + addl $2 * SIZE, X + addl $2 * SIZE, Y + subl $2, N + jle .L999 + ALIGN_2 + +.L10: + movl N, %eax + sarl $5, %eax + jle .L24 + + movlps 0 * SIZE(X), %xmm4 + movhps 2 * SIZE(X), %xmm4 + movlps 4 * SIZE(X), %xmm5 + movhps 6 * SIZE(X), %xmm5 + movlps 8 * SIZE(X), %xmm6 + movhps 10 * SIZE(X), %xmm6 + movlps 12 * SIZE(X), %xmm7 + movhps 14 * SIZE(X), %xmm7 + + mulps 0 * SIZE(Y), %xmm4 + mulps 4 * SIZE(Y), %xmm5 + mulps 8 * SIZE(Y), %xmm6 + mulps 12 * SIZE(Y), %xmm7 + decl %eax + jle .L22 + + ALIGN_3 + +.L21: + prefetch (PREFETCHSIZE + 0) * SIZE(Y) + + addps %xmm4, %xmm0 + movlps 16 * SIZE(X), %xmm4 + movhps 18 * SIZE(X), %xmm4 + addps %xmm5, %xmm1 + movlps 20 * SIZE(X), %xmm5 + movhps 22 * SIZE(X), %xmm5 + addps %xmm6, %xmm2 + movlps 24 * SIZE(X), %xmm6 + movhps 26 * SIZE(X), %xmm6 + addps %xmm7, %xmm3 + movlps 28 * SIZE(X), %xmm7 + movhps 30 * SIZE(X), %xmm7 + + mulps 16 * SIZE(Y), %xmm4 + mulps 20 * SIZE(Y), %xmm5 + mulps 24 * SIZE(Y), %xmm6 + mulps 28 * SIZE(Y), %xmm7 + + prefetch (PREFETCHSIZE + 16) * SIZE(Y) + + addps %xmm4, %xmm0 + movlps 32 * SIZE(X), %xmm4 + movhps 34 * SIZE(X), %xmm4 + addps %xmm5, %xmm1 + movlps 36 * SIZE(X), %xmm5 + movhps 38 * SIZE(X), %xmm5 + addps %xmm6, %xmm2 + movlps 40 * SIZE(X), %xmm6 + movhps 42 * SIZE(X), %xmm6 + addps %xmm7, %xmm3 + movlps 44 * SIZE(X), %xmm7 + movhps 46 * SIZE(X), %xmm7 + + mulps 
32 * SIZE(Y), %xmm4 + mulps 36 * SIZE(Y), %xmm5 + mulps 40 * SIZE(Y), %xmm6 + mulps 44 * SIZE(Y), %xmm7 + + addl $32 * SIZE, X + addl $32 * SIZE, Y + + decl %eax + jg .L21 + ALIGN_3 + +.L22: + addps %xmm4, %xmm0 + movlps 16 * SIZE(X), %xmm4 + movhps 18 * SIZE(X), %xmm4 + addps %xmm5, %xmm1 + movlps 20 * SIZE(X), %xmm5 + movhps 22 * SIZE(X), %xmm5 + addps %xmm6, %xmm2 + movlps 24 * SIZE(X), %xmm6 + movhps 26 * SIZE(X), %xmm6 + addps %xmm7, %xmm3 + movlps 28 * SIZE(X), %xmm7 + movhps 30 * SIZE(X), %xmm7 + + mulps 16 * SIZE(Y), %xmm4 + mulps 20 * SIZE(Y), %xmm5 + mulps 24 * SIZE(Y), %xmm6 + mulps 28 * SIZE(Y), %xmm7 + + addps %xmm4, %xmm0 + addps %xmm5, %xmm1 + addps %xmm6, %xmm2 + addps %xmm7, %xmm3 + + addl $32 * SIZE, X + addl $32 * SIZE, Y + ALIGN_3 + +.L24: + testl $31, N + jle .L999 + + testl $16, N + jle .L25 + + movlps 0 * SIZE(X), %xmm4 + movhps 2 * SIZE(X), %xmm4 + movlps 4 * SIZE(X), %xmm5 + movhps 6 * SIZE(X), %xmm5 + movlps 8 * SIZE(X), %xmm6 + movhps 10 * SIZE(X), %xmm6 + movlps 12 * SIZE(X), %xmm7 + movhps 14 * SIZE(X), %xmm7 + + mulps 0 * SIZE(Y), %xmm4 + mulps 4 * SIZE(Y), %xmm5 + mulps 8 * SIZE(Y), %xmm6 + mulps 12 * SIZE(Y), %xmm7 + + addps %xmm4, %xmm0 + addps %xmm5, %xmm1 + addps %xmm6, %xmm2 + addps %xmm7, %xmm3 + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L25: + testl $8, N + jle .L26 + + movlps 0 * SIZE(X), %xmm4 + movhps 2 * SIZE(X), %xmm4 + movlps 4 * SIZE(X), %xmm5 + movhps 6 * SIZE(X), %xmm5 + mulps 0 * SIZE(Y), %xmm4 + mulps 4 * SIZE(Y), %xmm5 + + addps %xmm4, %xmm0 + addps %xmm5, %xmm1 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L26: + testl $4, N + jle .L27 + + movlps 0 * SIZE(X), %xmm4 + movhps 2 * SIZE(X), %xmm4 + mulps 0 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L27: + testl $2, N + jle .L28 + + movss 0 * SIZE(X), %xmm4 + movss 1 * SIZE(X), %xmm5 + + mulss 0 * SIZE(Y), %xmm4 + mulss 1 * SIZE(Y), %xmm5 + + addss %xmm4, %xmm0 + addss %xmm5, %xmm1 + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + + +.L28: + testl $1, N + jle .L999 + + movss 0 * SIZE(X), %xmm4 + mulss 0 * SIZE(Y), %xmm4 + addss %xmm4, %xmm0 + jmp .L999 + ALIGN_3 + + +.L50: +#ifdef F_INTERFACE + testl INCX, INCX + jge .L51 + + movl N, %eax + decl %eax + imull INCX, %eax + subl %eax, X + ALIGN_3 + +.L51: + testl INCY, INCY + jge .L52 + + movl N, %eax + decl %eax + imull INCY, %eax + subl %eax, Y + ALIGN_3 +.L52: +#endif + + movl N, %eax + sarl $2, %eax + jle .L55 + ALIGN_3 + +.L53: + movss 0 * SIZE(X), %xmm4 + addl INCX, X + mulss 0 * SIZE(Y), %xmm4 + addl INCY, Y + movss 0 * SIZE(X), %xmm5 + addl INCX, X + mulss 0 * SIZE(Y), %xmm5 + addl INCY, Y + movss 0 * SIZE(X), %xmm6 + addl INCX, X + mulss 0 * SIZE(Y), %xmm6 + addl INCY, Y + movss 0 * SIZE(X), %xmm7 + addl INCX, X + mulss 0 * SIZE(Y), %xmm7 + addl INCY, Y + + addss %xmm4, %xmm0 + addss %xmm5, %xmm1 + addss %xmm6, %xmm2 + addss %xmm7, %xmm3 + + decl %eax + jg .L53 + ALIGN_3 + +.L55: + movl N, %eax + andl $3, %eax + jle .L999 + ALIGN_3 + +.L56: + movss 0 * SIZE(X), %xmm4 + addl INCX, X + mulss 0 * SIZE(Y), %xmm4 + addl INCY, Y + addss %xmm4, %xmm0 + decl %eax + jg .L56 + ALIGN_3 + +.L999: + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + addps %xmm2, %xmm0 + +#if !defined(HAVE_SSE3) || defined(__INTERIX) + movhlps %xmm0, %xmm1 + addps %xmm1, %xmm0 + + pshufd $1, %xmm0, %xmm1 + addss %xmm1, %xmm0 +#else + haddps %xmm0, %xmm0 + haddps %xmm0, %xmm0 +#endif + + movss %xmm0, STACK_N + flds STACK_N + + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git 
a/kernel/x86/gemm_beta.S b/kernel/x86/gemm_beta.S new file mode 100644 index 0000000000..b68dcf3d9a --- /dev/null +++ b/kernel/x86/gemm_beta.S @@ -0,0 +1,224 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#ifdef DOUBLE +#define BETA 16 + STACK + ARGS(%esp) +#define C 40 + STACK + ARGS(%esp) +#define LDC 44 + STACK + ARGS(%esp) +#else +#define BETA 16 + STACK + ARGS(%esp) +#define C 36 + STACK + ARGS(%esp) +#define LDC 40 + STACK + ARGS(%esp) +#endif + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl M, %esi # m + movl N, %ecx # n + FLD BETA # beta + + movl C, %edi # C + movl LDC, %ebp # ldc + + testl %esi, %esi # if n <= 0 goto End + jle .L999 + testl %ecx, %ecx # if m <= 0 goto End + jle .L999 + + ftst + fnstsw %ax + andb $68, %ah + je .L201 + ALIGN_4 + +.L101: + movl %edi, %eax # c_offset = c + leal (%edi, %ebp, SIZE), %edi # c += ldc + movl %esi, %edx + sarl $3, %edx + jle .L103 + ALIGN_4 + +.L102: +#ifdef HAS_PREFETCH +#ifndef ATHLON + prefetchnta 12 * SIZE(%eax) + prefetchnta 16 * SIZE(%eax) +#else + prefetchw 32 * SIZE(%eax) +#endif +#endif + + FSTU 0 * SIZE(%eax) + FSTU 1 * SIZE(%eax) + FSTU 2 * SIZE(%eax) + FSTU 3 * SIZE(%eax) + FSTU 4 * SIZE(%eax) + FSTU 5 * SIZE(%eax) + FSTU 6 * SIZE(%eax) + FSTU 7 * SIZE(%eax) + addl $8 * SIZE, %eax + decl %edx + jg .L102 + ALIGN_4 + +.L103: + movl %esi, %edx + andl $7, %edx + jle .L105 + ALIGN_4 + +.L104: + FSTU 0 * SIZE(%eax) + addl $SIZE, %eax + decl %edx + jg .L104 + ALIGN_4 + +.L105: + decl %ecx + jg .L101 + jmp .L999 + ALIGN_3 + + +.L201: + movl %edi, %eax # c_offset = c + leal (%edi, %ebp, SIZE), %edi # c += ldc + movl %esi, %edx + sarl $3, %edx + jle .L203 + ALIGN_4 + +.L202: +#ifdef HAS_PREFETCH +#ifndef ATHLON + prefetchnta 16 * SIZE(%eax) + prefetchnta 20 * SIZE(%eax) +#else + prefetchw 32 * SIZE(%eax) +#endif +#endif + + FLD 0 * SIZE(%eax) + fmul %st(1),%st + FST 0 * SIZE(%eax) + + FLD 1 * SIZE(%eax) + fmul %st(1),%st + FST 1 * SIZE(%eax) + + FLD 2 * SIZE(%eax) + fmul %st(1),%st + FST 2 * SIZE(%eax) + + FLD 3 * SIZE(%eax) + fmul %st(1),%st + FST 3 * SIZE(%eax) + + FLD 4 * SIZE(%eax) + fmul %st(1),%st + FST 4 * SIZE(%eax) + + FLD 5 * SIZE(%eax) + fmul %st(1),%st + FST 5 * SIZE(%eax) + + FLD 6 * SIZE(%eax) + fmul %st(1),%st + FST 6 * SIZE(%eax) + + FLD 7 * SIZE(%eax) + fmul %st(1),%st + FST 7 * SIZE(%eax) + + addl $8 * SIZE, %eax + decl %edx + jg .L202 + ALIGN_4 + +.L203: + movl %esi, %edx + andl $7, %edx + jle .L205 + ALIGN_4 + +.L204: + FLD 0 * SIZE(%eax) + fmul %st(1), %st + FST 0 * SIZE(%eax) + addl $SIZE, %eax + decl %edx + jg .L204 + ALIGN_4 + +.L205: + decl %ecx + jg .L201 + ALIGN_3 + +.L999: +#ifndef C_SUN + ffreep %st(0) +#else + .byte 0xdf + .byte 0xc0 +#endif + xorl %eax,%eax + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/gemm_kernel_1x4.S b/kernel/x86/gemm_kernel_1x4.S new file mode 100644 index 0000000000..e1ff4e809b --- /dev/null +++ b/kernel/x86/gemm_kernel_1x4.S @@ -0,0 +1,907 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define J 0 + STACK(%esp) +#define I 4 + STACK(%esp) +#define KK 8 + STACK(%esp) +#define KKK 12 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#ifdef DOUBLE +#define STACK_A 24 + STACK + ARGS(%esp) +#define STACK_B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define STACK_LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) +#else +#define STACK_A 20 + STACK + ARGS(%esp) +#define STACK_B 24 + STACK + ARGS(%esp) +#define C 28 + STACK + ARGS(%esp) +#define STACK_LDC 32 + STACK + ARGS(%esp) +#define OFFSET 36 + STACK + ARGS(%esp) +#endif + +#define A %edx +#define B %ecx +#define BB %ebx +#define LDC %ebp +#define BX %esi + +#define PREFETCHSIZE (8 * 5 + 4) + +#define AOFFSET 1 +#define BOFFSET -7 + +#ifdef HAVE_3DNOW +#define PREFETCH prefetch +#else +#define PREFETCH prefetcht0 +#endif + +#define KERNEL \ + PREFETCH PREFETCHSIZE * SIZE + AOFFSET(A, %eax, 1);\ + fmul %st(1), %st;\ + faddp %st, %st(4);\ + FLD -15 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(5);\ + FLD -14 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(6);\ + FMUL -13 * SIZE + BOFFSET(B, %eax, 4);\ + faddp %st, %st(6);\ + FLD -15 * SIZE + AOFFSET(A, %eax, 1);\ + FLD -12 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(4);\ + FLD -11 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(5);\ + FLD -10 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(6);\ + FMUL -9 * SIZE + BOFFSET(B, %eax, 4);\ + faddp %st, %st(6);\ + FLD -14 * SIZE + AOFFSET(A, %eax, 1);\ + FLD -8 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(4);\ + FLD -7 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(5);\ + FLD -6 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(6);\ + FMUL -5 * SIZE + BOFFSET(B, %eax, 4);\ + faddp %st, %st(6);\ + FLD -13 * SIZE + AOFFSET(A, %eax, 1);\ 
+ FLD -4 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(4);\ + FLD -3 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(5);\ + FLD -2 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(6);\ + FMUL -1 * SIZE + BOFFSET(B, %eax, 4);\ + faddp %st, %st(6);\ + FLD -12 * SIZE + AOFFSET(A, %eax, 1);\ + FLD 0 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(4);\ + FLD 1 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(5);\ + FLD 2 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(6);\ + FMUL 3 * SIZE + BOFFSET(B, %eax, 4);\ + faddp %st, %st(6);\ + FLD -11 * SIZE + AOFFSET(A, %eax, 1);\ + FLD 4 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(4);\ + FLD 5 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(5);\ + FLD 6 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(6);\ + FMUL 7 * SIZE + BOFFSET(B, %eax, 4);\ + faddp %st, %st(6);\ + FLD -10 * SIZE + AOFFSET(A, %eax, 1);\ + FLD 8 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(4);\ + FLD 9 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(5);\ + FLD 10 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(6);\ + FMUL 11 * SIZE + BOFFSET(B, %eax, 4);\ + faddp %st, %st(6);\ + FLD -9 * SIZE + AOFFSET(A, %eax, 1);\ + FLD 12 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(4);\ + FLD 13 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(5);\ + FLD 14 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(6);\ + FMUL 15 * SIZE + BOFFSET(B, %eax, 4);\ + faddp %st, %st(6);\ + FLD 8 * SIZE + AOFFSET(A, %eax, 1);\ + fxch %st(1);\ + FLD 16 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(4);\ + FLD -15 * SIZE + BOFFSET(BB, %eax, 4);\ + fmul %st(1), %st;\ + PREFETCH (PREFETCHSIZE + 8) * SIZE + AOFFSET(A, %eax, 1);\ + faddp %st, %st(5);\ + FLD -14 * SIZE + BOFFSET(BB, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(6);\ + FMUL -13 * SIZE + BOFFSET(BB, %eax, 4);\ + faddp %st, %st(6);\ + FLD -7 * SIZE + AOFFSET(A, %eax, 1);\ + FLD -12 * SIZE + BOFFSET(BB, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(4);\ + FLD -11 * SIZE + BOFFSET(BB, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(5);\ + FLD -10 * SIZE + BOFFSET(BB, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(6);\ + FMUL -9 * SIZE + BOFFSET(BB, %eax, 4);\ + faddp %st, %st(6);\ + FLD -6 * SIZE + AOFFSET(A, %eax, 1);\ + FLD -8 * SIZE + BOFFSET(BB, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(4);\ + FLD -7 * SIZE + BOFFSET(BB, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(5);\ + FLD -6 * SIZE + BOFFSET(BB, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(6);\ + FMUL -5 * SIZE + BOFFSET(BB, %eax, 4);\ + faddp %st, %st(6);\ + FLD -5 * SIZE + AOFFSET(A, %eax, 1);\ + FLD -4 * SIZE + BOFFSET(BB, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(4);\ + FLD -3 * SIZE + BOFFSET(BB, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(5);\ + FLD -2 * SIZE + BOFFSET(BB, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(6);\ + FMUL -1 * SIZE + BOFFSET(BB, %eax, 4);\ + faddp %st, %st(6);\ + FLD -4 * SIZE + AOFFSET(A, %eax, 1);\ + FLD 0 * SIZE + BOFFSET(BB, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(4);\ + FLD 1 * SIZE + BOFFSET(BB, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(5);\ + FLD 2 * SIZE + BOFFSET(BB, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(6);\ + FMUL 3 * SIZE + BOFFSET(BB, %eax, 4);\ + 
faddp %st, %st(6);\ + FLD -3 * SIZE + AOFFSET(A, %eax, 1);\ + FLD 4 * SIZE + BOFFSET(BB, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(4);\ + FLD 5 * SIZE + BOFFSET(BB, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(5);\ + FLD 6 * SIZE + BOFFSET(BB, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(6);\ + FMUL 7 * SIZE + BOFFSET(BB, %eax, 4);\ + faddp %st, %st(6);\ + FLD -2 * SIZE + AOFFSET(A, %eax, 1);\ + FLD 8 * SIZE + BOFFSET(BB, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(4);\ + FLD 9 * SIZE + BOFFSET(BB, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(5);\ + FLD 10 * SIZE + BOFFSET(BB, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(6);\ + FMUL 11 * SIZE + BOFFSET(BB, %eax, 4);\ + faddp %st, %st(6);\ + FLD -1 * SIZE + AOFFSET(A, %eax, 1);\ + FLD 12 * SIZE + BOFFSET(BB, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(4);\ + FLD 13 * SIZE + BOFFSET(BB, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(5);\ + FLD 14 * SIZE + BOFFSET(BB, %eax, 4);\ + fmul %st(1), %st;\ + faddp %st, %st(6);\ + FMUL 15 * SIZE + BOFFSET(BB, %eax, 4);\ + faddp %st, %st(6);\ + FLD 16 * SIZE + AOFFSET(A, %eax, 1);\ + fxch %st(2);\ + FLD 16 * SIZE + BOFFSET(BB, %eax, 4);\ + subl $-16 * SIZE, %eax + +/* + + A hint of scheduling is received from following URL + + http://www.netlib.org/atlas/atlas-comm/msg00260.html + +*/ + + PROLOGUE + + subl $ARGS, %esp # Generate Stack Frame + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(TRMMKERNEL) && !defined(LEFT) + movl OFFSET, %eax + negl %eax + movl %eax, KK +#endif + + movl STACK_LDC, LDC + leal (, LDC, SIZE), LDC + + subl $(AOFFSET - 16 * SIZE), STACK_A + subl $(BOFFSET - 16 * SIZE), STACK_B + + movl M, %eax + testl %eax, %eax + jle .L999 + + movl N, %eax + testl %eax, %eax + jle .L999 + + movl K, %eax + testl %eax, %eax + jle .L999 + + movl N, %eax + sarl $2, %eax + movl %eax, J + je .L20 + ALIGN_3 + +.L11: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl STACK_A, A + movl STACK_B, B + movl C, %edi + + movl K, BX + sall $BASE_SHIFT + 2, BX + addl B, BX + + movl M, %eax + movl %eax, I + ALIGN_3 + +.L14: + prefetchnta -16 * SIZE + BOFFSET(BX) + subl $-8 * SIZE, BX + + movl STACK_B, B + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (A, %eax, 1), A + leal (B, %eax, 4), B +#endif + + leal (%edi, LDC, 2), %eax + + fldz + fldz + fldz + fldz + + FLD 0 * SIZE + AOFFSET(A) + FLD -8 * SIZE + AOFFSET(A) + FLD -16 * SIZE + AOFFSET(A) + FLD -16 * SIZE + BOFFSET(B) + +#ifdef HAVE_3DNOW + prefetchw 1 * SIZE(%edi) + prefetchw 2 * SIZE(%edi, LDC) + prefetchw 1 * SIZE(%eax) + prefetchw 2 * SIZE(%eax, LDC) +#elif defined(HAVE_SSE) + prefetcht0 1 * SIZE(%edi) + prefetcht0 2 * SIZE(%edi, LDC) + prefetcht0 1 * SIZE(%eax) + prefetcht0 2 * SIZE(%eax, LDC) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + + andl $-16, %eax + + leal (, %eax, SIZE), %eax + leal (A, %eax, 1), A + leal 32 * SIZE(B, %eax, 4), BB + leal (B, %eax, 4), B + negl %eax + NOBRANCH + je .L16 + ALIGN_4 + +.L15: + KERNEL + jge .L16 + KERNEL + jge .L16 + KERNEL + jge .L16 + KERNEL + jl .L15 + ALIGN_4 + +.L16: +#ifndef TRMMKERNEL + movl K, 
%eax +#else + movl KKK, %eax +#endif + and $15, %eax + je .L19 + ALIGN_4 + +.L17: + fmul %st(1), %st + faddp %st, %st(4) + + FLD -15 * SIZE + BOFFSET(B) + fmul %st(1), %st + faddp %st, %st(5) + + FLD -14 * SIZE + BOFFSET(B) + fmul %st(1), %st + faddp %st, %st(6) + + FMUL -13 * SIZE + BOFFSET(B) + faddp %st, %st(6) + FLD -15 * SIZE + AOFFSET(A) + FLD -12 * SIZE + BOFFSET(B) + + addl $1 * SIZE,A + addl $4 * SIZE,B + + decl %eax + jne .L17 + ALIGN_4 + +.L19: + ffreep %st(0) + ffreep %st(0) + ffreep %st(0) + ffreep %st(0) + + FLD ALPHA + + fmul %st, %st(1) + fmul %st, %st(2) + fmul %st, %st(3) + fmulp %st, %st(4) + + leal (%edi, LDC, 2), %eax + +#ifndef TRMMKERNEL + FADD (%edi) + FST (%edi) + FADD (%edi,LDC) + FST (%edi,LDC) + FADD (%eax) + FST (%eax) + FADD (%eax,LDC) + FST (%eax,LDC) +#else + FST (%edi) + FST (%edi,LDC) + FST (%eax) + FST (%eax,LDC) +#endif + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (A, %eax, 1), A + leal (B, %eax, 4), B +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + + addl $1 * SIZE, %edi + decl I + jne .L14 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + + leal (, LDC, 4), %eax + addl %eax, C + movl B, STACK_B + decl J + jne .L11 + ALIGN_4 + +.L20: + movl N, %eax + andl $2, %eax + je .L30 + ALIGN_3 + +.L21: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl STACK_A, A + movl STACK_B, B + movl C, %edi + + movl M, %eax + movl %eax, I + ALIGN_3 + +.L24: + movl STACK_B, B + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (A, %eax, 1), A + leal (B, %eax, 2), B +#endif + + fldz + fldz + fldz + fldz + + FLD -16 * SIZE + AOFFSET(A) + FLD -16 * SIZE + BOFFSET(B) + + prefetchw 1 * SIZE(%edi) + prefetchw 1 * SIZE(%edi, LDC) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L26 + ALIGN_3 + +.L25: + fmul %st(1), %st + faddp %st, %st(2) + + FMUL -15 * SIZE + BOFFSET(B) + faddp %st, %st(2) + + FLD -15 * SIZE + AOFFSET(A) + FLD -14 * SIZE + BOFFSET(B) + + fmul %st(1), %st + faddp %st, %st(4) + + FMUL -13 * SIZE + BOFFSET(B) + faddp %st, %st(4) + + FLD -14 * SIZE + AOFFSET(A) + FLD -12 * SIZE + BOFFSET(B) + + fmul %st(1), %st + faddp %st, %st(2) + + FMUL -11 * SIZE + BOFFSET(B) + faddp %st, %st(2) + + FLD -13 * SIZE + AOFFSET(A) + FLD -10 * SIZE + BOFFSET(B) + + fmul %st(1), %st + faddp %st, %st(4) + + FMUL -9 * SIZE + BOFFSET(B) + faddp %st, %st(4) + + FLD -12 * SIZE + AOFFSET(A) + FLD -8 * SIZE + BOFFSET(B) + + fmul %st(1), %st + faddp %st, %st(2) + + FMUL -7 * SIZE + BOFFSET(B) + faddp %st, %st(2) + + FLD -11 * SIZE + AOFFSET(A) + FLD -6 * SIZE + BOFFSET(B) + + fmul %st(1), %st + faddp %st, %st(4) + + FMUL -5 * SIZE + BOFFSET(B) + faddp %st, %st(4) + + FLD -10 * SIZE + AOFFSET(A) + FLD -4 * SIZE + BOFFSET(B) + + fmul %st(1), %st + faddp %st, %st(2) + + FMUL -3 * SIZE + BOFFSET(B) + faddp %st, %st(2) + + FLD -9 * SIZE + AOFFSET(A) + FLD -2 * SIZE + BOFFSET(B) + + fmul %st(1), %st + faddp %st, %st(4) + + FMUL -1 * SIZE + BOFFSET(B) + 
faddp %st, %st(4) + + FLD -8 * SIZE + AOFFSET(A) + FLD 0 * SIZE + BOFFSET(B) + + addl $ 8 * SIZE, A + subl $-16 * SIZE, B + + decl %eax + jne .L25 + ALIGN_4 + +.L26: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + and $7, %eax + je .L29 + ALIGN_4 + +.L27: + fmul %st(1), %st + faddp %st, %st(2) + + FMUL -15 * SIZE + BOFFSET(B) + faddp %st, %st(2) + + FLD -15 * SIZE + AOFFSET(A) + FLD -14 * SIZE + BOFFSET(B) + + addl $1 * SIZE,A + addl $2 * SIZE,B + + decl %eax + jne .L27 + ALIGN_4 + +.L29: + ffreep %st(0) + ffreep %st(0) + + faddp %st, %st(2) + faddp %st, %st(2) + + FLD ALPHA + + fmul %st, %st(1) + fmulp %st, %st(2) + +#ifndef TRMMKERNEL + FADD (%edi) + FST (%edi) + FADD (%edi,LDC) + FST (%edi,LDC) +#else + FST (%edi) + FST (%edi,LDC) +#endif + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (A, %eax, 1), A + leal (B, %eax, 2), B +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + + addl $1 * SIZE, %edi + decl I + jne .L24 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leal (, LDC, 2), %eax + addl %eax, C + movl B, STACK_B + ALIGN_4 + +.L30: + movl N, %eax + andl $1, %eax + je .L999 + ALIGN_3 + +.L31: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl STACK_A, A + movl STACK_B, B + movl C, %edi + + movl M, %eax + movl %eax, I + ALIGN_3 + +.L34: + movl STACK_B, B + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (A, %eax, 1), A + leal (B, %eax, 1), B +#endif + + fldz + fldz + fldz + fldz + + prefetchw 1 * SIZE(%edi) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L36 + ALIGN_3 + +.L35: + FLD -16 * SIZE + AOFFSET(A) + FMUL -16 * SIZE + BOFFSET(B) + faddp %st, %st(1) + + FLD -15 * SIZE + AOFFSET(A) + FMUL -15 * SIZE + BOFFSET(B) + faddp %st, %st(2) + + FLD -14 * SIZE + AOFFSET(A) + FMUL -14 * SIZE + BOFFSET(B) + faddp %st, %st(3) + + FLD -13 * SIZE + AOFFSET(A) + FMUL -13 * SIZE + BOFFSET(B) + faddp %st, %st(4) + + FLD -12 * SIZE + AOFFSET(A) + FMUL -12 * SIZE + BOFFSET(B) + faddp %st, %st(1) + + FLD -11 * SIZE + AOFFSET(A) + FMUL -11 * SIZE + BOFFSET(B) + faddp %st, %st(2) + + FLD -10 * SIZE + AOFFSET(A) + FMUL -10 * SIZE + BOFFSET(B) + faddp %st, %st(3) + + FLD -9 * SIZE + AOFFSET(A) + FMUL -9 * SIZE + BOFFSET(B) + faddp %st, %st(4) + + addl $8 * SIZE, A + addl $8 * SIZE, B + + decl %eax + jne .L35 + ALIGN_4 + +.L36: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + and $7, %eax + je .L39 + ALIGN_4 + +.L37: + FLD -16 * SIZE + AOFFSET(A) + FMUL -16 * SIZE + BOFFSET(B) + faddp %st, %st(1) + + addl $1 * SIZE,A + addl $1 * SIZE,B + decl %eax + jne .L37 + ALIGN_4 + +.L39: + faddp %st, %st(2) + faddp %st, %st(2) + faddp %st, %st(1) + + FMUL ALPHA + +#ifndef TRMMKERNEL + FADD (%edi) + FST (%edi) +#else + FST (%edi) +#endif + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + 
leal (A, %eax, 1), A + leal (B, %eax, 1), B +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + + addl $1 * SIZE, %edi + decl I + jne .L34 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $1, KK +#endif + + addl LDC, C + movl B, STACK_B + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/gemm_kernel_2x2.S b/kernel/x86/gemm_kernel_2x2.S new file mode 100644 index 0000000000..1483bc4d98 --- /dev/null +++ b/kernel/x86/gemm_kernel_2x2.S @@ -0,0 +1,697 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define J 0 + STACK(%esp) +#define BX 4 + STACK(%esp) +#define KK 8 + STACK(%esp) +#define KKK 12 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#ifdef DOUBLE +#define A 24 + STACK + ARGS(%esp) +#define B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) +#else +#define A 20 + STACK + ARGS(%esp) +#define B 24 + STACK + ARGS(%esp) +#define C 28 + STACK + ARGS(%esp) +#define LDC 32 + STACK + ARGS(%esp) +#define OFFSET 36 + STACK + ARGS(%esp) +#endif + +#define PREFETCH_OFFSET 48 + +#if defined(PENTIUM3) || defined(PENTIUMM) +#define REP rep +#else +#define REP rep +#endif + + PROLOGUE + + subl $ARGS, %esp # Generate Stack Frame + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(TRMMKERNEL) && !defined(LEFT) + movl OFFSET, %eax + negl %eax + movl %eax, KK +#endif + + movl N, %eax # j = (n >> 1) # MEMORY + movl LDC, %ebp # ldc # MEMORY + movl B, %ebx + + sarl $1, %eax + leal (, %ebp, SIZE), %ebp + leal 0(%ecx) , %ecx # NOP + movl %eax, J # j = (n >> 1) # MEMORY + test %eax, %eax + je .L8 # if !(n >> 1) goto .L8 + ALIGN_4 + +.L34: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl %ebx, BX + + movl M, %esi # m # MEMORY + movl A, %edx # a # MEMORY + movl C, %edi # C # MEMORY + sarl $1, %esi # i = (m >> 1) + je .L12 + ALIGN_4 + +.MainHead: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl %ebx, %ecx +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (%edx, %eax, 2), %edx + leal (%ebx, %eax, 2), %ecx +#endif + +#ifdef HAVE_SSE + movl BX, %eax + + prefetcht2 0 * SIZE(%eax) + prefetcht2 4 * SIZE(%eax) + +#if L2_SIZE > 262144 + + subl $-8 * SIZE, BX + +#elif L2_SIZE > 131072 + + prefetcht2 8 * SIZE(%eax) + prefetcht2 12 * SIZE(%eax) + + + subl $-16 * SIZE, BX +#else + prefetcht2 16 * SIZE(%eax) + prefetcht2 20 * SIZE(%eax) + prefetcht2 24 * SIZE(%eax) + prefetcht2 28 * SIZE(%eax) + + subl $-32 * SIZE, BX +#endif +#endif + + fldz + fldz + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + fldz + fldz + + FLD 4 * SIZE(%ecx) # b5 + FLD 4 * SIZE(%edx) # a5 + FLD 0 * SIZE(%ecx) # b1 + FLD 0 * SIZE(%edx) # a1 + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(%edi) + prefetchw 2 * SIZE(%edi, %ebp, 1) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(%edi) + prefetchnta 2 * SIZE(%edi, %ebp, 1) +#endif + sarl $2, %eax + je .L16 + ALIGN_4 + +.MainLoop: +#if defined(HAVE_3DNOW) + prefetch (PREFETCH_OFFSET) * SIZE(%ecx) + nop +#elif defined(HAVE_SSE) + prefetchnta (PREFETCH_OFFSET) * SIZE(%ecx) +#ifdef CORE_KATMAI + prefetcht0 (PREFETCH_OFFSET) * SIZE(%edx) +#endif +#endif + + fmul %st, %st(1) + FMUL 1 * SIZE(%ecx) + fxch %st(1) + faddp %st, %st(4) + FLD 0 * SIZE(%ecx) + fxch %st(1) + faddp %st, %st(5) + FLD 1 * SIZE(%edx) + fmul %st, %st(1) + FMUL 1 * SIZE(%ecx) + fxch %st(1) + faddp %st, %st(6) + FLD 2 * SIZE(%ecx) + fxch %st(1) + faddp %st, %st(7) + 
FLD 2 * SIZE(%edx) + + fmul %st, %st(1) + FMUL 3 * SIZE(%ecx) + fxch %st(1) + faddp %st, %st(4) + FLD 2 * SIZE(%ecx) + fxch %st(1) + faddp %st, %st(5) + FLD 3 * SIZE(%edx) + fmul %st, %st(1) + FMUL 3 * SIZE(%ecx) + fxch %st(1) + faddp %st, %st(6) + FLD 8 * SIZE(%ecx) + fxch %st(1) + faddp %st, %st(7) + FLD 8 * SIZE(%edx) + fxch %st(2) + +#if !defined(HAVE_3DNOW) && defined(HAVE_SSE) && defined(DOUBLE) + prefetchnta (PREFETCH_OFFSET + 4) * SIZE(%ecx) +#ifdef CORE_KATMAI + prefetcht0 (PREFETCH_OFFSET + 4) * SIZE(%edx) +#endif +#endif + + fmul %st, %st(3) + FMUL 5 * SIZE(%ecx) + fxch %st(3) + faddp %st, %st(4) + FLD 4 * SIZE(%ecx) + fxch %st(3) + faddp %st, %st(5) + FLD 5 * SIZE(%edx) + fmul %st, %st(3) + FMUL 5 * SIZE(%ecx) + fxch %st(3) + faddp %st, %st(6) + FLD 6 * SIZE(%ecx) + fxch %st(3) + faddp %st, %st(7) + FLD 6 * SIZE(%edx) + + fmul %st, %st(3) + FMUL 7 * SIZE(%ecx) + fxch %st(3) + faddp %st, %st(4) + FLD 6 * SIZE(%ecx) + fxch %st(3) + faddp %st, %st(5) + FLD 7 * SIZE(%edx) + fmul %st, %st(3) + FMUL 7 * SIZE(%ecx) + fxch %st(3) + faddp %st, %st(6) + FLD 12 * SIZE(%ecx) + fxch %st(3) + faddp %st, %st(7) + FLD 12 * SIZE(%edx) + fxch %st(2) + + subl $-8 * SIZE, %ecx + subl $-8 * SIZE, %edx + decl %eax # l -- + jne .MainLoop + ALIGN_4 + +.L16: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + and $3, %eax + je .L21 + ALIGN_4 + +.SubLoop: + fmul %st, %st(1) + FMUL 1 * SIZE(%ecx) + fxch %st(1) + faddp %st, %st(4) + FLD 0 * SIZE(%ecx) + fxch %st(1) + faddp %st, %st(5) + FLD 1 * SIZE(%edx) + fmul %st, %st(1) + FMUL 1 * SIZE(%ecx) + fxch %st(1) + faddp %st, %st(6) + FLD 2 * SIZE(%ecx) + fxch %st(1) + faddp %st, %st(7) + FLD 2 * SIZE(%edx) + + addl $2 * SIZE,%ecx + addl $2 * SIZE,%edx + decl %eax + jne .SubLoop + ALIGN_4 + +.L21: + ffreep %st(0) + ffreep %st(0) + ffreep %st(0) + ffreep %st(0) + + FLD ALPHA + fmul %st, %st(4) + fmul %st, %st(1) + fmul %st, %st(2) + fmulp %st, %st(3) + +#ifndef TRMMKERNEL + FADD 0 * SIZE(%edi) + FST 0 * SIZE(%edi) + FADD 0 * SIZE(%edi,%ebp) + FST 0 * SIZE(%edi,%ebp) + FADD 1 * SIZE(%edi) + FST 1 * SIZE(%edi) + FADD 1 * SIZE(%edi,%ebp) + FST 1 * SIZE(%edi,%ebp) +#else + FST 0 * SIZE(%edi) + FST 0 * SIZE(%edi,%ebp) + FST 1 * SIZE(%edi) + FST 1 * SIZE(%edi,%ebp) +#endif + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (%edx, %eax, 2), %edx + leal (%ecx, %eax, 2), %ecx +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, %edi + rep + decl %esi # i -- + rep + jne .MainHead + ALIGN_4 + +.L12: + movl M, %eax # m # MEMORY + andl $1, %eax + je .L27 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl %ebx, %ecx +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (%edx, %eax, 1), %edx + leal (%ebx, %eax, 2), %ecx +#endif + fldz + fldz + + FLD 0 * SIZE(%edx) # temp1 = *(aoffset + 0) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $1,%eax # k >> 1 # MEMORY + je .L54 + ALIGN_4 + +.L55: + FLD 0 * SIZE(%ecx) # temp2 = *(boffset + 0) + rep + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 1 * SIZE(%ecx) # temp2 = *(boffset + 
0) + faddp %st, %st(2) + FLD 1 * SIZE(%edx) # temp1 = *(aoffset + 0) + + FLD 2 * SIZE(%ecx) # temp2 = *(boffset + 0) + rep + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 3 * SIZE(%ecx) # temp2 = *(boffset + 0) + faddp %st, %st(2) + FLD 2 * SIZE(%edx) # temp1 = *(aoffset + 0) + + addl $2 * SIZE, %edx + addl $4 * SIZE, %ecx + decl %eax + jne .L55 + ALIGN_4 + +.L54: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $1,%eax # k & 1 + je .L33 + ALIGN_4 + + FLD 0 * SIZE(%ecx) # temp2 = *(boffset + 0) + rep + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 1 * SIZE(%ecx) # temp2 = *(boffset + 0) + faddp %st, %st(2) + FLD 1 * SIZE(%edx) # temp1 = *(aoffset + 0) + + addl $1 * SIZE, %edx + addl $2 * SIZE, %ecx + ALIGN_4 + +.L33: + ffreep %st(0) + FLD ALPHA + + fmul %st, %st(2) + fmulp %st, %st(1) + +#ifndef TRMMKERNEL + FADD (%edi) + FST (%edi) + FADD (%edi,%ebp) + FST (%edi,%ebp) +#else + FST (%edi) + FST (%edi,%ebp) +#endif + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (%edx, %eax, 1), %edx + leal (%ecx, %eax, 2), %ecx +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_4 + +.L27: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + lea (, %ebp, 2), %eax + addl %eax, C # C + 2 * ldc # MEMORY + movl %ecx, %ebx # b # MEMORY + decl J # j-- # MEMORY + jne .L34 + ALIGN_4 + +.L8: + movl N, %eax # n # MEMORY + andl $1, %eax + je .End + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl C, %edi # c # MEMORY + movl A, %edx # a # MEMORY + + movl M, %esi # m # MEMORY + sarl $1, %esi # m >> 1 + je .L36 + ALIGN_4 + +.L46: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl %ebx, %ecx +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (%edx, %eax, 2), %edx + leal (%ebx, %eax, 1), %ecx +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + fldz + sarl $1, %eax + fldz + FLD 0 * SIZE(%ecx) # temp1 = *(boffset + 0) + + je .L56 + ALIGN_4 + +.L57: + FLD 0 * SIZE(%edx) # temp2 = *(aoffset + 0) + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 1 * SIZE(%edx) # temp2 = *(aoffset + 0) + faddp %st, %st(2) + FLD 1 * SIZE(%ecx) # temp1 = *(boffset + 0) + + FLD 2 * SIZE(%edx) # temp2 = *(aoffset + 0) + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 3 * SIZE(%edx) # temp2 = *(aoffset + 0) + faddp %st, %st(2) + FLD 2 * SIZE(%ecx) # temp1 = *(boffset + 0) + + addl $4 * SIZE,%edx + addl $2 * SIZE,%ecx + dec %eax + jne .L57 + ALIGN_4 + +.L56: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $1, %eax + je .L45 + ALIGN_4 + + FLD 0 * SIZE(%edx) # temp2 = *(aoffset + 0) + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 1 * SIZE(%edx) # temp2 = *(aoffset + 0) + faddp %st, %st(2) + FLD 3 * SIZE(%ecx) # temp1 = *(boffset + 0) + + addl $2 * SIZE,%edx + addl $1 * SIZE,%ecx + ALIGN_4 + +.L45: + ffreep %st(0) + FLD ALPHA + + fmul %st, %st(1) + fmulp %st, %st(2) + +#ifndef TRMMKERNEL + FADD 0 * SIZE(%edi) + FST 0 * SIZE(%edi) + FADD 1 * SIZE(%edi) + FST 1 * SIZE(%edi) +#else + FST 0 * SIZE(%edi) + FST 1 * SIZE(%edi) 
+#endif + + addl $2 * SIZE, %edi + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (%edx, %eax, 2), %edx + leal (%ecx, %eax, 1), %ecx +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + decl %esi # i -- + jne .L46 + ALIGN_4 + +.L36: + movl M, %eax # m # MEMORY + andl $1, %eax # m & 1 + je .End + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl %ebx, %ecx +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (%edx, %eax, 1), %edx + leal (%ebx, %eax, 1), %ecx +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + fldz + ALIGN_3 + +.L51: + FLD (%edx) + FMUL (%ecx) + addl $1 * SIZE,%edx + addl $1 * SIZE,%ecx + faddp %st,%st(1) + decl %eax + jne .L51 + + FMUL ALPHA +#ifndef TRMMKERNEL + FADD (%edi) + FST (%edi) +#else + FST (%edi) +#endif + ALIGN_4 + +.End: + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/gemm_kernel_2x2_atom.S b/kernel/x86/gemm_kernel_2x2_atom.S new file mode 100644 index 0000000000..f8954128a8 --- /dev/null +++ b/kernel/x86/gemm_kernel_2x2_atom.S @@ -0,0 +1,736 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#define A 24 + STACK + ARGS(%esp) +#define ARG_B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define ARG_LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define BX 4 + STACK(%esp) +#define KK 8 + STACK(%esp) +#define KKK 12 + STACK(%esp) + +#define PREFETCH prefetcht0 +#define PREFETCHSIZE 84 + +#define AA %edx +#define BB %ecx +#define CO1 %esi +#define LDC %ebp +#define B %edi + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + +#ifdef TRMMKERNEL + movl OFFSET, %eax +#ifndef LEFT + negl %eax +#endif + movl %eax, KK +#endif + + leal (, LDC, SIZE), LDC + + movl N, %eax + sarl $1, %eax + movl %eax, J + jle .L30 + ALIGN_2 + +.L10: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + sall $BASE_SHIFT + 1, %eax + leal (B, %eax), %eax + movl %eax, BX + + movl C, CO1 # coffset = c + leal (, LDC, 2), %eax + addl %eax, C + + movl A, AA # aoffset = a + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), BB +#endif + + movl BX, %eax + prefetcht0 0 * SIZE(%eax) + subl $-8 * SIZE, BX + + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + + xorps %xmm4, %xmm4 + prefetcht0 3 * SIZE(CO1) + xorps %xmm5, %xmm5 + prefetcht0 3 * SIZE(CO1, LDC) + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $2, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addsd %xmm2, %xmm6 + movsd 1 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 0 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm7 + mulsd 1 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 0 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + mulsd 1 * SIZE(BB), %xmm3 + + addsd %xmm2, %xmm6 + movsd 3 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm7 + mulsd 3 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 4 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 2 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + mulsd 3 * SIZE(BB), %xmm3 + + addsd %xmm2, %xmm6 + movsd 5 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 4 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm7 + mulsd 5 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 6 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 4 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + mulsd 5 * SIZE(BB), %xmm3 + + addsd %xmm2, %xmm6 + movsd 7 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm7 + mulsd 7 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 8 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 6 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + mulsd 7 * 
SIZE(BB), %xmm3 + + addl $8 * SIZE, BB + addl $8 * SIZE, AA + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $3, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + addsd %xmm2, %xmm6 + movsd 1 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 0 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm7 + mulsd 1 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 0 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + mulsd 1 * SIZE(BB), %xmm3 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + movsd ALPHA, %xmm0 + + addsd %xmm2, %xmm6 + addsd %xmm3, %xmm7 + + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 + mulsd %xmm0, %xmm6 + mulsd %xmm0, %xmm7 + +#ifndef TRMMKERNEL + addsd 0 * SIZE(CO1), %xmm4 + addsd 1 * SIZE(CO1), %xmm6 + + addsd 0 * SIZE(CO1, LDC), %xmm5 + addsd 1 * SIZE(CO1, LDC), %xmm7 +#endif + + movsd %xmm4, 0 * SIZE(CO1) + movsd %xmm6, 1 * SIZE(CO1) + movsd %xmm5, 0 * SIZE(CO1, LDC) + movsd %xmm7, 1 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, CO1 + decl %ebx + jg .L11 + ALIGN_4 + +.L20: + movl M, %ebx + testl $1, %ebx + jle .L29 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), BB +#endif + + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $2, %eax + je .L25 + ALIGN_4 + +.L22: + addsd %xmm2, %xmm4 + movsd 0 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm5 + movsd 1 * SIZE(BB), %xmm3 + + mulsd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulsd %xmm0, %xmm3 + movsd 1 * SIZE(AA), %xmm0 + + addsd %xmm2, %xmm4 + movsd 2 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm5 + movsd 3 * SIZE(BB), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + movsd 2 * SIZE(AA), %xmm0 + + addsd %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm5 + movsd 5 * SIZE(BB), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + movsd 3 * SIZE(AA), %xmm0 + + addsd %xmm2, %xmm4 + movsd 6 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm5 + movsd 7 * SIZE(BB), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + movsd 4 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $3, %eax # if (k & 1) + BRANCH + je .L28 + ALIGN_3 + +.L26: + addsd %xmm2, %xmm4 + movsd 0 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm5 + movsd 1 * SIZE(BB), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + movsd 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: + movsd ALPHA, %xmm0 + + addsd %xmm2, %xmm4 + addsd %xmm3, %xmm5 + + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 + +#ifndef TRMMKERNEL + addsd 0 * SIZE(CO1), 
%xmm4 + addsd 0 * SIZE(CO1, LDC), %xmm5 +#endif + + movsd %xmm4, 0 * SIZE(CO1) + movsd %xmm5, 0 * SIZE(CO1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + + addl $1 * SIZE, CO1 + ALIGN_4 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + movl BB, B + decl J + jg .L10 + ALIGN_4 + +.L30: + testl $1, N + je .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl C, CO1 + addl LDC, C + + movl A, AA + + movl M, %ebx + sarl $1, %ebx + jle .L40 + ALIGN_4 + +.L31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), BB +#endif + + movsd 0 * SIZE(BB), %xmm1 + xorps %xmm0, %xmm0 + prefetcht0 3 * SIZE(CO1) + xorps %xmm2, %xmm2 + xorps %xmm4, %xmm4 + xorps %xmm6, %xmm6 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $2, %eax + je .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addsd %xmm0, %xmm4 + movsd 0 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm6 + movsd 1 * SIZE(AA), %xmm2 + mulsd %xmm1, %xmm0 + mulsd %xmm1, %xmm2 + movsd 1 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm6 + movsd 3 * SIZE(AA), %xmm2 + mulsd %xmm1, %xmm0 + mulsd %xmm1, %xmm2 + movsd 2 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 4 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm6 + movsd 5 * SIZE(AA), %xmm2 + mulsd %xmm1, %xmm0 + mulsd %xmm1, %xmm2 + movsd 3 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 6 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm6 + movsd 7 * SIZE(AA), %xmm2 + mulsd %xmm1, %xmm0 + mulsd %xmm1, %xmm2 + movsd 4 * SIZE(BB), %xmm1 + + addl $8 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $3, %eax # if (k & 1) + BRANCH + je .L38 + ALIGN_3 + +.L36: + addsd %xmm0, %xmm4 + movsd 0 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm6 + movsd 1 * SIZE(AA), %xmm2 + mulsd %xmm1, %xmm0 + mulsd %xmm1, %xmm2 + movsd 1 * SIZE(BB), %xmm1 + + addl $2 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L36 + ALIGN_4 + +.L38: + movsd ALPHA, %xmm3 + + addsd %xmm0, %xmm4 + addsd %xmm2, %xmm6 + + mulsd %xmm3, %xmm4 + mulsd %xmm3, %xmm6 + +#ifndef TRMMKERNEL + addsd 0 * SIZE(CO1), %xmm4 + addsd 1 * SIZE(CO1), %xmm6 +#endif + + movsd %xmm4, 0 * SIZE(CO1) + movsd %xmm6, 1 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 1), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, CO1 + decl %ebx + jg .L31 + ALIGN_4 + +.L40: + movl M, %ebx + testl $1, %ebx + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && 
!defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), BB +#endif + + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 + movsd 0 * SIZE(BB), %xmm2 + xorps %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $2, %eax + je .L45 + ALIGN_4 + +.L42: + mulsd %xmm0, %xmm2 + movsd 1 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + movsd 1 * SIZE(BB), %xmm2 + + mulsd %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm5 + movsd 2 * SIZE(BB), %xmm2 + + mulsd %xmm0, %xmm2 + movsd 3 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + movsd 3 * SIZE(BB), %xmm2 + + mulsd %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm5 + movsd 4 * SIZE(BB), %xmm2 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jne .L42 + ALIGN_4 + +.L45: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $3, %eax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + mulsd %xmm0, %xmm2 + movsd 1 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + movsd 1 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L46 + ALIGN_4 + +.L48: + movsd ALPHA, %xmm0 + + addsd %xmm5, %xmm4 + mulsd %xmm0, %xmm4 + +#ifndef TRMMKERNEL + addsd 0 * SIZE(CO1), %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(CO1) + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/gemm_kernel_2x4_3dnow.S b/kernel/x86/gemm_kernel_2x4_3dnow.S new file mode 100644 index 0000000000..a86efda260 --- /dev/null +++ b/kernel/x86/gemm_kernel_2x4_3dnow.S @@ -0,0 +1,1917 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define OLD_M 4 + STACK + ARGS(%esi) +#define OLD_N 8 + STACK + ARGS(%esi) +#define OLD_K 12 + STACK + ARGS(%esi) +#define OLD_ALPHA 16 + STACK + ARGS(%esi) +#define OLD_A 20 + STACK + ARGS(%esi) +#define OLD_B 24 + STACK + ARGS(%esi) +#define OLD_C 28 + STACK + ARGS(%esi) +#define OLD_LDC 32 + STACK + ARGS(%esi) +#define OLD_OFFSET 36 + STACK + ARGS(%esi) + +#define ALPHA 0(%esp) +#define K 8(%esp) +#define N 12(%esp) +#define M 16(%esp) +#define A 20(%esp) +#define C 24(%esp) +#define J 28(%esp) +#define OLD_STACK 32(%esp) +#define OFFSET 36(%esp) +#define KK 40(%esp) +#define KKK 44(%esp) +#define BUFFER 64(%esp) + +#define AA %edx +#define BB %ecx + +#define PREFETCHSIZE (16 * 2 + 6) + +#define AOFFSET -32 +#define BOFFSET 128 + +/* + + A hint of scheduling is received from following URL + +https://sourceforge.net/mailarchive/forum.php?forum_id=426&max_rows=25&style=flat&viewmonth=200309&viewday=11 + +*/ + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl %esp, %esi # save old stack + subl $128 + LOCAL_BUFFER_SIZE, %esp + movl OLD_M, %ebx + andl $-1024, %esp # align stack + + STACK_TOUCHING + + movl OLD_N, %eax + movl OLD_K, %ecx + movl OLD_A, %edx + movd OLD_ALPHA, %mm3 + + movl %ebx, M + movl %eax, N + movl %ecx, K + subl $AOFFSET * SIZE, %edx + movl %edx, A + movl %esi, OLD_STACK + + movl OLD_B, %edi + movl OLD_C, %ebx + punpckldq %mm3, %mm3 + + movq %mm3, ALPHA + + movl %ebx, C + movl OLD_LDC, %ebp + leal (, %ebp, SIZE), %ebp + +#ifdef TRMMKERNEL + movl OLD_OFFSET, %eax + movl %eax, OFFSET +#ifndef LEFT + negl %eax + movl %eax, KK +#endif +#endif + + movl N, %eax + sarl $2, %eax + movl %eax, J + jle .L30 + ALIGN_3 + +.L01: +/* Copying to Sub Buffer */ + leal BUFFER, %ecx + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + sarl $2, %eax + jle .L03 + ALIGN_3 + +.L02: + movd 0 * SIZE(%edi), %mm0 + movd 1 * SIZE(%edi), %mm1 + movd 2 * SIZE(%edi), %mm2 + movd 3 * SIZE(%edi), %mm3 + movd 4 * SIZE(%edi), %mm4 + movd 5 * SIZE(%edi), %mm5 + movd 6 * SIZE(%edi), %mm6 + movd 7 * SIZE(%edi), %mm7 + + prefetchnta 72 * SIZE(%edi) + + punpckldq %mm0, %mm0 + punpckldq %mm1, %mm1 + punpckldq %mm2, %mm2 + punpckldq %mm3, %mm3 + punpckldq %mm4, %mm4 + punpckldq %mm5, %mm5 + punpckldq %mm6, %mm6 + punpckldq %mm7, %mm7 + + movq %mm0, 0 * SIZE(%ecx) + movq %mm1, 2 * SIZE(%ecx) + movq %mm2, 4 * SIZE(%ecx) + movq %mm3, 6 * SIZE(%ecx) + movq %mm4, 8 * SIZE(%ecx) + movq %mm5, 10 * SIZE(%ecx) + movq %mm6, 12 * SIZE(%ecx) + movq %mm7, 14 * SIZE(%ecx) + + movd 8 * SIZE(%edi), %mm0 + movd 9 * SIZE(%edi), %mm1 + movd 10 * SIZE(%edi), %mm2 + movd 11 * SIZE(%edi), %mm3 + movd 12 * SIZE(%edi), %mm4 + movd 13 * SIZE(%edi), %mm5 + movd 14 * SIZE(%edi), %mm6 + movd 15 * SIZE(%edi), %mm7 + + punpckldq %mm0, %mm0 + punpckldq %mm1, %mm1 + punpckldq %mm2, %mm2 + punpckldq %mm3, %mm3 + punpckldq %mm4, %mm4 + punpckldq %mm5, %mm5 + punpckldq %mm6, %mm6 + punpckldq %mm7, %mm7 + + movq %mm0, 16 * SIZE(%ecx) + movq %mm1, 18 * SIZE(%ecx) + movq %mm2, 20 * SIZE(%ecx) + movq %mm3, 22 * SIZE(%ecx) + movq %mm4, 24 * 
SIZE(%ecx) + movq %mm5, 26 * SIZE(%ecx) + movq %mm6, 28 * SIZE(%ecx) + movq %mm7, 30 * SIZE(%ecx) + + addl $16 * SIZE, %edi + addl $32 * SIZE, %ecx + decl %eax + jne .L02 + +.L03: + movl K, %eax + andl $3, %eax + BRANCH + jle .L10 + ALIGN_2 + +.L04: + movd 0 * SIZE(%edi), %mm0 + movd 1 * SIZE(%edi), %mm1 + movd 2 * SIZE(%edi), %mm2 + movd 3 * SIZE(%edi), %mm3 + + punpckldq %mm0, %mm0 + punpckldq %mm1, %mm1 + punpckldq %mm2, %mm2 + punpckldq %mm3, %mm3 + + movq %mm0, 0 * SIZE(%ecx) + movq %mm1, 2 * SIZE(%ecx) + movq %mm2, 4 * SIZE(%ecx) + movq %mm3, 6 * SIZE(%ecx) + addl $4 * SIZE, %edi + addl $8 * SIZE, %ecx + decl %eax + jne .L04 + ALIGN_4 + +.L10: + movl C, %esi # coffset = c + movl A, %edx # aoffset = a + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: + leal - BOFFSET * SIZE + BUFFER, BB + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 8), BB +#endif + + movq ( 0 + AOFFSET) * SIZE(AA), %mm0 + pxor %mm4, %mm4 + movq ( 16 + AOFFSET) * SIZE(AA), %mm1 + pxor %mm5, %mm5 + PADDING movq ( 0 + BOFFSET) * SIZE(BB), %mm2 + pxor %mm6, %mm6 + PADDING movq ( 16 + BOFFSET) * SIZE(BB), %mm3 + pxor %mm7, %mm7 + + leal (%ebp, %ebp, 2), %eax + + prefetchw 2 * SIZE(%esi) + prefetchw 2 * SIZE(%esi, %ebp) + prefetchw 2 * SIZE(%esi, %ebp, 2) + prefetchw 2 * SIZE(%esi, %eax) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $4, %eax + je .L15 + ALIGN_4 + +.L12: + pfmul %mm0, %mm2 + pfadd %mm2, %mm4 + PADDING movq ( 2 + BOFFSET) * SIZE(BB), %mm2 + + pfmul %mm0, %mm2 + pfadd %mm2, %mm5 + PADDING movq ( 4 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm0, %mm2 + pfadd %mm2, %mm6 + PADDING prefetch (PREFETCHSIZE + 0) * SIZE(AA) + + PADDING movq ( 8 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 6 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 2 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm2 + pfadd %mm2, %mm4 + PADDING movq ( 10 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm0, %mm2 + pfadd %mm2, %mm5 + PADDING movq ( 12 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm0, %mm2 + pfadd %mm2, %mm6 + PADDING movq ( 32 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 14 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 4 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm3 + pfadd %mm3, %mm4 + PADDING movq ( 18 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm0, %mm3 + pfadd %mm3, %mm5 + PADDING movq ( 20 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm0, %mm3 + pfadd %mm3, %mm6 + PADDING movq ( 24 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 22 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 6 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm3 + pfadd %mm3, %mm4 + PADDING movq ( 26 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm0, %mm3 + pfadd %mm3, %mm5 + PADDING movq ( 28 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm0, %mm3 + pfadd %mm3, %mm6 + PADDING movq ( 48 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 30 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 8 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm2 + pfadd %mm2, %mm4 + PADDING movq ( 34 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm0, %mm2 + pfadd %mm2, %mm5 + PADDING movq ( 36 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm0, %mm2 + pfadd %mm2, %mm6 + PADDING movq ( 40 + BOFFSET) 
* SIZE(BB), %mm2 + pfmul ( 38 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 10 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm2 + pfadd %mm2, %mm4 + PADDING movq ( 42 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm0, %mm2 + pfadd %mm2, %mm5 + PADDING movq ( 44 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm0, %mm2 + pfadd %mm2, %mm6 + PADDING movq ( 64 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 46 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 12 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm3 + pfadd %mm3, %mm4 + PADDING movq ( 50 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm0, %mm3 + pfadd %mm3, %mm5 + PADDING movq ( 52 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm0, %mm3 + pfadd %mm3, %mm6 + PADDING movq ( 56 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 54 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 14 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm3 + pfadd %mm3, %mm4 + PADDING movq ( 58 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm0, %mm3 + pfadd %mm3, %mm5 + PADDING movq ( 60 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm0, %mm3 + pfadd %mm3, %mm6 + PADDING movq ( 80 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 62 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 32 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm1, %mm2 + pfadd %mm2, %mm4 + PADDING movq ( 66 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm1, %mm2 + pfadd %mm2, %mm5 + PADDING movq ( 68 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm1, %mm2 + pfadd %mm2, %mm6 + PADDING movq ( 72 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 70 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movq ( 18 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm2 + pfadd %mm2, %mm4 + PADDING movq ( 74 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm1, %mm2 + pfadd %mm2, %mm5 + PADDING movq ( 76 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm1, %mm2 + pfadd %mm2, %mm6 + PADDING movq ( 96 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 78 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movq ( 20 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm3 + pfadd %mm3, %mm4 + PADDING movq ( 82 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm1, %mm3 + pfadd %mm3, %mm5 + PADDING movq ( 84 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm1, %mm3 + pfadd %mm3, %mm6 + PADDING movq ( 88 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 86 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movq ( 22 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm3 + pfadd %mm3, %mm4 + PADDING movq ( 90 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm1, %mm3 + pfadd %mm3, %mm5 + PADDING movq ( 92 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm1, %mm3 + pfadd %mm3, %mm6 + PADDING movq (112 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 94 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movq ( 24 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm2 + pfadd %mm2, %mm4 + PADDING movq ( 98 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm1, %mm2 + pfadd %mm2, %mm5 + PADDING movq (100 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm1, %mm2 + pfadd %mm2, %mm6 + PADDING movq (104 + BOFFSET) * SIZE(BB), %mm2 + pfmul (102 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movq ( 26 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm2 + pfadd %mm2, %mm4 + PADDING movq (106 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm1, %mm2 + pfadd %mm2, %mm5 + PADDING movq (108 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm1, %mm2 + pfadd %mm2, %mm6 + PADDING movq (128 + BOFFSET) * SIZE(BB), %mm2 + pfmul (110 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movq ( 28 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm3 + pfadd %mm3, %mm4 + PADDING movq (114 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm1, %mm3 + pfadd %mm3, %mm5 + PADDING movq (116 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm1, %mm3 + 
pfadd %mm3, %mm6 + PADDING movq (120 + BOFFSET) * SIZE(BB), %mm3 + pfmul (118 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movq ( 30 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm3 + pfadd %mm3, %mm4 + PADDING movq (122 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm1, %mm3 + pfadd %mm3, %mm5 + PADDING movq (124 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm1, %mm3 + pfadd %mm3, %mm6 + PADDING movq (144 + BOFFSET) * SIZE(BB), %mm3 + pfmul (126 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movq ( 48 + AOFFSET) * SIZE(AA), %mm1 + + subl $-32 * SIZE, AA + addl $128 * SIZE, BB + decl %eax + jne .L12 + ALIGN_3 + +.L15: + movq ALPHA, %mm3 +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $15, %eax + BRANCH + je .L18 + ALIGN_3 + +.L16: + pfmul %mm0, %mm2 + pfadd %mm2, %mm4 + PADDING movq ( 2 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm0, %mm2 + pfadd %mm2, %mm5 + PADDING movq ( 4 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm0, %mm2 + pfadd %mm2, %mm6 + PADDING movq ( 8 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 6 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 2 + AOFFSET) * SIZE(AA), %mm0 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L16 + ALIGN_3 + +.L18: + leal (%ebp, %ebp, 2), %eax + +#ifndef TRMMKERNEL + pfmul %mm3, %mm4 + pfadd 0 * SIZE(%esi), %mm4 + pfmul %mm3, %mm5 + pfadd 0 * SIZE(%esi, %ebp, 1), %mm5 + pfmul %mm3, %mm6 + pfadd 0 * SIZE(%esi, %ebp, 2), %mm6 + pfmul %mm3, %mm7 + pfadd 0 * SIZE(%esi, %eax, 1), %mm7 +#else + pfmul %mm3, %mm4 + pfmul %mm3, %mm5 + pfmul %mm3, %mm6 + pfmul %mm3, %mm7 +#endif + + movq %mm4, 0 * SIZE(%esi) + movq %mm5, 0 * SIZE(%esi, %ebp, 1) + movq %mm6, 0 * SIZE(%esi, %ebp, 2) + movq %mm7, 0 * SIZE(%esi, %eax, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 8), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L11 + ALIGN_4 + +.L20: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L29 + ALIGN_4 + +.L21: + leal - BOFFSET * SIZE + BUFFER, BB + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB +#endif + + movq ( 0 + AOFFSET) * SIZE(AA), %mm0 + pxor %mm4, %mm4 + movq ( 8 + AOFFSET) * SIZE(AA), %mm1 + pxor %mm5, %mm5 + PADDING movq ( 0 + BOFFSET) * SIZE(BB), %mm2 + pxor %mm6, %mm6 + PADDING movq ( 16 + BOFFSET) * SIZE(BB), %mm3 + pxor %mm7, %mm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $4, %eax + je .L25 + ALIGN_4 + +.L22: + pfmul %mm0, %mm2 + pfadd %mm2, %mm4 + PADDING movd ( 2 + BOFFSET) * SIZE(BB), %mm2 + + pfmul %mm0, %mm2 + pfadd %mm2, %mm5 + PADDING movd ( 4 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm0, %mm2 + pfadd %mm2, %mm6 + PADDING prefetch (PREFETCHSIZE + 0) * SIZE(AA) + + PADDING movd ( 8 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 6 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movd ( 1 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm2 + pfadd %mm2, %mm4 + PADDING movd ( 10 + BOFFSET) * SIZE(BB), %mm2 + pfmul 
%mm0, %mm2 + pfadd %mm2, %mm5 + PADDING movd ( 12 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm0, %mm2 + pfadd %mm2, %mm6 + PADDING movd ( 32 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 14 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movd ( 2 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm3 + pfadd %mm3, %mm4 + PADDING movd ( 18 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm0, %mm3 + pfadd %mm3, %mm5 + PADDING movd ( 20 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm0, %mm3 + pfadd %mm3, %mm6 + PADDING movd ( 24 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 22 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movd ( 3 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm3 + pfadd %mm3, %mm4 + PADDING movd ( 26 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm0, %mm3 + pfadd %mm3, %mm5 + PADDING movd ( 28 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm0, %mm3 + pfadd %mm3, %mm6 + PADDING movd ( 48 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 30 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movd ( 4 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm2 + pfadd %mm2, %mm4 + PADDING movd ( 34 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm0, %mm2 + pfadd %mm2, %mm5 + PADDING movd ( 36 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm0, %mm2 + pfadd %mm2, %mm6 + PADDING movd ( 40 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 38 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movd ( 5 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm2 + pfadd %mm2, %mm4 + PADDING movd ( 42 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm0, %mm2 + pfadd %mm2, %mm5 + PADDING movd ( 44 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm0, %mm2 + pfadd %mm2, %mm6 + PADDING movd ( 64 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 46 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movd ( 6 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm3 + pfadd %mm3, %mm4 + PADDING movd ( 50 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm0, %mm3 + pfadd %mm3, %mm5 + PADDING movd ( 52 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm0, %mm3 + pfadd %mm3, %mm6 + PADDING movd ( 56 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 54 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movd ( 7 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm3 + pfadd %mm3, %mm4 + PADDING movd ( 58 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm0, %mm3 + pfadd %mm3, %mm5 + PADDING movd ( 60 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm0, %mm3 + pfadd %mm3, %mm6 + PADDING movd ( 80 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 62 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movd ( 16 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm1, %mm2 + pfadd %mm2, %mm4 + PADDING movd ( 66 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm1, %mm2 + pfadd %mm2, %mm5 + PADDING movd ( 68 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm1, %mm2 + pfadd %mm2, %mm6 + PADDING movd ( 72 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 70 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movd ( 9 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm2 + pfadd %mm2, %mm4 + PADDING movd ( 74 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm1, %mm2 + pfadd %mm2, %mm5 + PADDING movd ( 76 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm1, %mm2 + pfadd %mm2, %mm6 + PADDING movd ( 96 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 78 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movd ( 10 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm3 + pfadd %mm3, %mm4 + PADDING movd ( 82 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm1, %mm3 + pfadd %mm3, %mm5 + PADDING movd ( 84 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm1, %mm3 + pfadd %mm3, %mm6 + PADDING movd ( 88 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 86 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movd ( 11 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm3 + pfadd %mm3, %mm4 + PADDING movd ( 
90 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm1, %mm3 + pfadd %mm3, %mm5 + PADDING movd ( 92 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm1, %mm3 + pfadd %mm3, %mm6 + PADDING movd (112 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 94 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movd ( 12 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm2 + pfadd %mm2, %mm4 + PADDING movd ( 98 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm1, %mm2 + pfadd %mm2, %mm5 + PADDING movd (100 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm1, %mm2 + pfadd %mm2, %mm6 + PADDING movd (104 + BOFFSET) * SIZE(BB), %mm2 + pfmul (102 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movd ( 13 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm2 + pfadd %mm2, %mm4 + PADDING movd (106 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm1, %mm2 + pfadd %mm2, %mm5 + PADDING movd (108 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm1, %mm2 + pfadd %mm2, %mm6 + PADDING movd (128 + BOFFSET) * SIZE(BB), %mm2 + pfmul (110 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movd ( 14 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm3 + pfadd %mm3, %mm4 + PADDING movd (114 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm1, %mm3 + pfadd %mm3, %mm5 + PADDING movd (116 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm1, %mm3 + pfadd %mm3, %mm6 + PADDING movd (120 + BOFFSET) * SIZE(BB), %mm3 + pfmul (118 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movd ( 15 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm3 + pfadd %mm3, %mm4 + PADDING movd (122 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm1, %mm3 + pfadd %mm3, %mm5 + PADDING movd (124 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm1, %mm3 + pfadd %mm3, %mm6 + PADDING movd (144 + BOFFSET) * SIZE(BB), %mm3 + pfmul (126 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movd ( 24 + AOFFSET) * SIZE(AA), %mm1 + + subl $-16 * SIZE, AA + addl $128 * SIZE, BB + decl %eax + jne .L22 + ALIGN_3 + +.L25: + movd ALPHA, %mm3 +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $15, %eax + BRANCH + je .L28 + ALIGN_3 + +.L26: + pfmul %mm0, %mm2 + pfadd %mm2, %mm4 + PADDING movd ( 2 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm0, %mm2 + pfadd %mm2, %mm5 + PADDING movd ( 4 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm0, %mm2 + pfadd %mm2, %mm6 + PADDING movd ( 8 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 6 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movd ( 1 + AOFFSET) * SIZE(AA), %mm0 + + addl $1 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L26 + ALIGN_3 + +.L28: + leal (%ebp, %ebp, 2), %eax + + pfmul %mm3, %mm4 + pfmul %mm3, %mm5 + pfmul %mm3, %mm6 + pfmul %mm3, %mm7 + +#ifndef TRMMKERNEL + movd 0 * SIZE(%esi) , %mm0 + movd 0 * SIZE(%esi, %ebp, 1), %mm1 + movd 0 * SIZE(%esi, %ebp, 2), %mm2 + movd 0 * SIZE(%esi, %eax, 1), %mm3 + + pfadd %mm0, %mm4 + pfadd %mm1, %mm5 + pfadd %mm2, %mm6 + pfadd %mm3, %mm7 +#endif + + movd %mm4, 0 * SIZE(%esi) + movd %mm5, 0 * SIZE(%esi, %ebp, 1) + movd %mm6, 0 * SIZE(%esi, %ebp, 2) + movd %mm7, 0 * SIZE(%esi, %eax, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_4 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + + leal (, %ebp, 4), %eax + addl %eax, C # c += 4 * ldc + decl J # j -- + jg .L01 + ALIGN_4 + +.L30: + movl N, %eax + testl $2, %eax + jle .L60 + ALIGN_3 + +.L31: +/* Copying to Sub Buffer */ + leal BUFFER, %ecx + +#if defined(TRMMKERNEL) && 
defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + sarl $2, %eax + jle .L33 + ALIGN_3 + +.L32: + movd 0 * SIZE(%edi), %mm0 + movd 1 * SIZE(%edi), %mm1 + movd 2 * SIZE(%edi), %mm2 + movd 3 * SIZE(%edi), %mm3 + movd 4 * SIZE(%edi), %mm4 + movd 5 * SIZE(%edi), %mm5 + movd 6 * SIZE(%edi), %mm6 + movd 7 * SIZE(%edi), %mm7 + + prefetchnta 72 * SIZE(%edi) + + punpckldq %mm0, %mm0 + punpckldq %mm1, %mm1 + punpckldq %mm2, %mm2 + punpckldq %mm3, %mm3 + punpckldq %mm4, %mm4 + punpckldq %mm5, %mm5 + punpckldq %mm6, %mm6 + punpckldq %mm7, %mm7 + + movq %mm0, 0 * SIZE(%ecx) + movq %mm1, 2 * SIZE(%ecx) + movq %mm2, 4 * SIZE(%ecx) + movq %mm3, 6 * SIZE(%ecx) + movq %mm4, 8 * SIZE(%ecx) + movq %mm5, 10 * SIZE(%ecx) + movq %mm6, 12 * SIZE(%ecx) + movq %mm7, 14 * SIZE(%ecx) + + addl $ 8 * SIZE, %edi + addl $16 * SIZE, %ecx + decl %eax + jne .L32 + +.L33: + movl K, %eax + andl $3, %eax + BRANCH + jle .L40 + ALIGN_2 + +.L34: + movd 0 * SIZE(%edi), %mm0 + movd 1 * SIZE(%edi), %mm1 + + punpckldq %mm0, %mm0 + punpckldq %mm1, %mm1 + + movq %mm0, 0 * SIZE(%ecx) + movq %mm1, 2 * SIZE(%ecx) + + addl $2 * SIZE, %edi + addl $4 * SIZE, %ecx + decl %eax + jne .L34 + ALIGN_4 + +.L40: + movl C, %esi # coffset = c + movl A, %edx # aoffset = a + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L50 + ALIGN_4 + +.L41: + leal - BOFFSET * SIZE + BUFFER, BB + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + + movq ( 0 + AOFFSET) * SIZE(AA), %mm0 + pxor %mm4, %mm4 + movq ( 16 + AOFFSET) * SIZE(AA), %mm1 + pxor %mm5, %mm5 + PADDING movq ( 0 + BOFFSET) * SIZE(BB), %mm2 + pxor %mm6, %mm6 + PADDING movq ( 16 + BOFFSET) * SIZE(BB), %mm3 + pxor %mm7, %mm7 + + prefetchw 2 * SIZE(%esi) + prefetchw 2 * SIZE(%esi, %ebp) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $4, %eax + je .L45 + ALIGN_4 + +.L42: + pfmul %mm0, %mm2 + pfadd %mm2, %mm4 + PADDING movq ( 4 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 2 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm5 + movq ( 2 + AOFFSET) * SIZE(AA), %mm0 + + PADDING prefetch (PREFETCHSIZE + 0) * SIZE(AA) + + pfmul %mm0, %mm2 + pfadd %mm2, %mm6 + PADDING movq ( 8 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 6 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 4 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm2 + pfadd %mm2, %mm4 + PADDING movq ( 12 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 10 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm5 + movq ( 6 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm2 + pfadd %mm2, %mm6 + PADDING movq ( 32 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 14 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 8 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm3 + pfadd %mm3, %mm4 + PADDING movq ( 20 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 18 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm5 + movq ( 10 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm3 + pfadd %mm3, %mm6 + PADDING movq ( 24 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 22 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 12 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm3 + pfadd %mm3, %mm4 + PADDING movq ( 28 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 26 + BOFFSET) * 
SIZE(BB), %mm0 + pfadd %mm0, %mm5 + movq ( 14 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm3 + pfadd %mm3, %mm6 + PADDING movq ( 48 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 30 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 32 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm1, %mm2 + pfadd %mm2, %mm4 + PADDING movq ( 36 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 34 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm5 + movq ( 18 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm2 + pfadd %mm2, %mm6 + PADDING movq ( 40 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 38 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movq ( 20 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm2 + pfadd %mm2, %mm4 + PADDING movq ( 44 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 42 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm5 + movq ( 22 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm2 + pfadd %mm2, %mm6 + PADDING movq ( 64 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 46 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movq ( 24 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm3 + pfadd %mm3, %mm4 + PADDING movq ( 52 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 50 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm5 + movq ( 26 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm3 + pfadd %mm3, %mm6 + PADDING movq ( 56 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 54 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movq ( 28 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm3 + pfadd %mm3, %mm4 + PADDING movq ( 60 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 58 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm5 + movq ( 30 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm3 + pfadd %mm3, %mm6 + PADDING movq ( 80 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 62 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movq ( 48 + AOFFSET) * SIZE(AA), %mm1 + + subl $-32 * SIZE, AA + addl $ 64 * SIZE, BB + decl %eax + jne .L42 + ALIGN_3 + +.L45: + movq ALPHA, %mm3 +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $15, %eax + BRANCH + je .L48 + ALIGN_3 + +.L46: + pfmul %mm0, %mm2 + pfadd %mm2, %mm4 + PADDING movq ( 4 + BOFFSET) * SIZE(BB), %mm2 + + pfmul ( 2 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm5 + movq ( 2 + AOFFSET) * SIZE(AA), %mm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L46 + ALIGN_3 + +.L48: + pfadd %mm6, %mm4 + pfadd %mm7, %mm5 + + pfmul %mm3, %mm4 + pfmul %mm3, %mm5 + +#ifndef TRMMKERNEL + pfadd 0 * SIZE(%esi), %mm4 + pfadd 0 * SIZE(%esi, %ebp, 1), %mm5 +#endif + + movq %mm4, 0 * SIZE(%esi) + movq %mm5, 0 * SIZE(%esi, %ebp, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L41 + ALIGN_4 + +.L50: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L59 + ALIGN_4 + +.L51: + leal - BOFFSET * SIZE + BUFFER, BB + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + + movq ( 0 + AOFFSET) * SIZE(AA), %mm0 + pxor %mm4, %mm4 + movq ( 8 + AOFFSET) * SIZE(AA), %mm1 + pxor %mm5, %mm5 + PADDING movq ( 0 + BOFFSET) * SIZE(BB), %mm2 + pxor %mm6, %mm6 + PADDING movq ( 16 + BOFFSET) * SIZE(BB), %mm3 + pxor %mm7, %mm7 + +#ifndef 
TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $4, %eax + je .L55 + ALIGN_4 + +.L52: + pfmul %mm0, %mm2 + pfadd %mm2, %mm4 + PADDING movd ( 4 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 2 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm5 + movd ( 1 + AOFFSET) * SIZE(AA), %mm0 + + PADDING prefetch (PREFETCHSIZE + 0) * SIZE(AA) + + pfmul %mm0, %mm2 + pfadd %mm2, %mm6 + PADDING movd ( 8 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 6 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movd ( 2 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm2 + pfadd %mm2, %mm4 + PADDING movd ( 12 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 10 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm5 + movd ( 3 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm2 + pfadd %mm2, %mm6 + PADDING movd ( 32 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 14 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movd ( 4 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm3 + pfadd %mm3, %mm4 + PADDING movd ( 20 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 18 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm5 + movd ( 5 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm3 + pfadd %mm3, %mm6 + PADDING movd ( 24 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 22 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movd ( 6 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm3 + pfadd %mm3, %mm4 + PADDING movd ( 28 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 26 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm5 + movd ( 7 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm3 + pfadd %mm3, %mm6 + PADDING movd ( 48 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 30 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movd ( 16 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm1, %mm2 + pfadd %mm2, %mm4 + PADDING movd ( 36 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 34 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm5 + movd ( 9 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm2 + pfadd %mm2, %mm6 + PADDING movd ( 40 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 38 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movd ( 10 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm2 + pfadd %mm2, %mm4 + PADDING movd ( 44 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 42 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm5 + movd ( 11 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm2 + pfadd %mm2, %mm6 + PADDING movd ( 64 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 46 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movd ( 12 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm3 + pfadd %mm3, %mm4 + PADDING movd ( 52 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 50 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm5 + movd ( 13 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm3 + pfadd %mm3, %mm6 + PADDING movd ( 56 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 54 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movd ( 14 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm3 + pfadd %mm3, %mm4 + PADDING movd ( 60 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 58 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm5 + movd ( 15 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm3 + pfadd %mm3, %mm6 + PADDING movd ( 80 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 62 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movd ( 24 + AOFFSET) * SIZE(AA), %mm1 + + subl $-16 * SIZE, AA + addl $ 64 * SIZE, BB + decl %eax + jne .L52 + ALIGN_3 + +.L55: + movd ALPHA, %mm3 +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $15, %eax + 
BRANCH + je .L58 + ALIGN_3 + +.L56: + pfmul %mm0, %mm2 + pfadd %mm2, %mm4 + PADDING movd ( 4 + BOFFSET) * SIZE(BB), %mm2 + + pfmul ( 2 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm5 + movd ( 1 + AOFFSET) * SIZE(AA), %mm0 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L56 + ALIGN_3 + +.L58: + pfadd %mm6, %mm4 + pfadd %mm7, %mm5 + + pfmul %mm3, %mm4 + pfmul %mm3, %mm5 + +#ifndef TRMMKERNEL + movd 0 * SIZE(%esi) , %mm0 + movd 0 * SIZE(%esi, %ebp, 1), %mm1 + + pfadd %mm0, %mm4 + pfadd %mm1, %mm5 +#endif + + movd %mm4, 0 * SIZE(%esi) + movd %mm5, 0 * SIZE(%esi, %ebp, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_4 + +.L59: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leal (, %ebp, 2), %eax + addl %eax, C # c += 4 * ldc + ALIGN_4 + +.L60: + movl N, %eax + testl $1, %eax + jle .L999 + ALIGN_3 + +.L61: +/* Copying to Sub Buffer */ + leal BUFFER, %ecx + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + sarl $3, %eax + jle .L63 + ALIGN_3 + +.L62: + movd 0 * SIZE(%edi), %mm0 + movd 1 * SIZE(%edi), %mm1 + movd 2 * SIZE(%edi), %mm2 + movd 3 * SIZE(%edi), %mm3 + movd 4 * SIZE(%edi), %mm4 + movd 5 * SIZE(%edi), %mm5 + movd 6 * SIZE(%edi), %mm6 + movd 7 * SIZE(%edi), %mm7 + + prefetchnta 72 * SIZE(%edi) + + punpckldq %mm0, %mm0 + punpckldq %mm1, %mm1 + punpckldq %mm2, %mm2 + punpckldq %mm3, %mm3 + punpckldq %mm4, %mm4 + punpckldq %mm5, %mm5 + punpckldq %mm6, %mm6 + punpckldq %mm7, %mm7 + + movq %mm0, 0 * SIZE(%ecx) + movq %mm1, 2 * SIZE(%ecx) + movq %mm2, 4 * SIZE(%ecx) + movq %mm3, 6 * SIZE(%ecx) + movq %mm4, 8 * SIZE(%ecx) + movq %mm5, 10 * SIZE(%ecx) + movq %mm6, 12 * SIZE(%ecx) + movq %mm7, 14 * SIZE(%ecx) + + addl $ 8 * SIZE, %edi + addl $16 * SIZE, %ecx + decl %eax + jne .L62 + +.L63: + movl K, %eax + andl $7, %eax + BRANCH + jle .L70 + ALIGN_2 + +.L64: + movd 0 * SIZE(%edi), %mm0 + punpckldq %mm0, %mm0 + movq %mm0, 0 * SIZE(%ecx) + + addl $1 * SIZE, %edi + addl $2 * SIZE, %ecx + decl %eax + jne .L64 + ALIGN_4 + +.L70: + movl C, %esi # coffset = c + movl A, %edx # aoffset = a + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L90 + ALIGN_4 + +.L71: + leal - BOFFSET * SIZE + BUFFER, BB + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + + movq ( 0 + AOFFSET) * SIZE(AA), %mm0 + pxor %mm4, %mm4 + movq ( 16 + AOFFSET) * SIZE(AA), %mm1 + pxor %mm5, %mm5 + PADDING movq ( 0 + BOFFSET) * SIZE(BB), %mm2 + pxor %mm6, %mm6 + pxor %mm7, %mm7 + + prefetchw 2 * SIZE(%esi) + prefetchw 2 * SIZE(%esi, %ebp) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $4, %eax + je .L75 + ALIGN_4 + +.L72: + pfmul ( 0 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm4 + movq ( 2 + AOFFSET) * SIZE(AA), %mm0 + + PADDING prefetch (PREFETCHSIZE + 0) * SIZE(AA) + + pfmul ( 2 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm5 + 
movq ( 4 + AOFFSET) * SIZE(AA), %mm0 + + pfmul ( 4 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm6 + movq ( 6 + AOFFSET) * SIZE(AA), %mm0 + + pfmul ( 6 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 8 + AOFFSET) * SIZE(AA), %mm0 + + pfmul ( 8 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm4 + movq ( 10 + AOFFSET) * SIZE(AA), %mm0 + + pfmul ( 10 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm5 + movq ( 12 + AOFFSET) * SIZE(AA), %mm0 + + pfmul ( 12 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm6 + movq ( 14 + AOFFSET) * SIZE(AA), %mm0 + + pfmul ( 14 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 32 + AOFFSET) * SIZE(AA), %mm0 + + pfmul ( 16 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm4 + movq ( 18 + AOFFSET) * SIZE(AA), %mm1 + + pfmul ( 18 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm5 + movq ( 20 + AOFFSET) * SIZE(AA), %mm1 + + pfmul ( 20 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm6 + movq ( 22 + AOFFSET) * SIZE(AA), %mm1 + + pfmul ( 22 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movq ( 24 + AOFFSET) * SIZE(AA), %mm1 + + pfmul ( 24 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm4 + movq ( 26 + AOFFSET) * SIZE(AA), %mm1 + + pfmul ( 26 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm5 + movq ( 28 + AOFFSET) * SIZE(AA), %mm1 + + pfmul ( 28 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm6 + movq ( 30 + AOFFSET) * SIZE(AA), %mm1 + + pfmul ( 30 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movq ( 48 + AOFFSET) * SIZE(AA), %mm1 + + subl $-32 * SIZE, AA + addl $ 32 * SIZE, BB + decl %eax + jne .L72 + ALIGN_3 + +.L75: + movq ALPHA, %mm3 +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $15, %eax + BRANCH + je .L78 + ALIGN_3 + +.L76: + pfmul ( 0 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm4 + movq ( 2 + AOFFSET) * SIZE(AA), %mm0 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L76 + ALIGN_3 + +.L78: + pfadd %mm5, %mm4 + pfadd %mm7, %mm6 + pfadd %mm6, %mm4 + + pfmul %mm3, %mm4 +#ifndef TRMMKERNEL + pfadd 0 * SIZE(%esi), %mm4 +#endif + movq %mm4, 0 * SIZE(%esi) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L71 + ALIGN_4 + +.L90: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L999 + ALIGN_4 + +.L91: + leal - BOFFSET * SIZE + BUFFER, BB + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + + movq ( 0 + AOFFSET) * SIZE(AA), %mm0 + pxor %mm4, %mm4 + movq ( 8 + AOFFSET) * SIZE(AA), %mm1 + pxor %mm5, %mm5 + PADDING movq ( 0 + BOFFSET) * SIZE(BB), %mm2 + pxor %mm6, %mm6 + pxor %mm7, %mm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $4, %eax + je .L95 + ALIGN_4 + +.L92: + PADDING prefetch (PREFETCHSIZE + 0) * SIZE(AA) + + pfmul ( 0 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm4 + movd ( 1 + AOFFSET) * SIZE(AA), %mm0 + + pfmul ( 2 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm5 + 
movd ( 2 + AOFFSET) * SIZE(AA), %mm0 + + pfmul ( 4 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm6 + movd ( 3 + AOFFSET) * SIZE(AA), %mm0 + + pfmul ( 6 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movd ( 4 + AOFFSET) * SIZE(AA), %mm0 + + pfmul ( 8 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm4 + movd ( 5 + AOFFSET) * SIZE(AA), %mm0 + + pfmul ( 10 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm5 + movd ( 6 + AOFFSET) * SIZE(AA), %mm0 + + pfmul ( 12 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm6 + movd ( 7 + AOFFSET) * SIZE(AA), %mm0 + + pfmul ( 14 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movd ( 16 + AOFFSET) * SIZE(AA), %mm0 + + pfmul ( 16 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm4 + movd ( 9 + AOFFSET) * SIZE(AA), %mm1 + + pfmul ( 18 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm5 + movd ( 10 + AOFFSET) * SIZE(AA), %mm1 + + pfmul ( 20 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm6 + movd ( 11 + AOFFSET) * SIZE(AA), %mm1 + + pfmul ( 22 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movd ( 12 + AOFFSET) * SIZE(AA), %mm1 + + pfmul ( 24 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm4 + movd ( 13 + AOFFSET) * SIZE(AA), %mm1 + + pfmul ( 26 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm5 + movd ( 14 + AOFFSET) * SIZE(AA), %mm1 + + pfmul ( 28 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm6 + movd ( 15 + AOFFSET) * SIZE(AA), %mm1 + + pfmul ( 30 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movd ( 24 + AOFFSET) * SIZE(AA), %mm1 + + subl $-16 * SIZE, AA + addl $ 32 * SIZE, BB + decl %eax + jne .L92 + ALIGN_3 + +.L95: + movd ALPHA, %mm3 +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $15, %eax + BRANCH + je .L98 + ALIGN_3 + +.L96: + pfmul ( 0 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm4 + movd ( 1 + AOFFSET) * SIZE(AA), %mm0 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L96 + ALIGN_3 + +.L98: +#ifndef TRMMKERNEL + movd 0 * SIZE(%esi), %mm0 +#endif + + pfadd %mm5, %mm4 + pfadd %mm7, %mm6 + pfadd %mm6, %mm4 + + pfmul %mm3, %mm4 + pfmul %mm3, %mm5 + +#ifndef TRMMKERNEL + pfadd %mm0, %mm4 +#endif + movd %mm4, 0 * SIZE(%esi) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_4 + +.L999: + EMMS + + movl OLD_STACK, %esp + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/gemm_kernel_2x4_barcelona.S b/kernel/x86/gemm_kernel_2x4_barcelona.S new file mode 100644 index 0000000000..1acdc16c5d --- /dev/null +++ b/kernel/x86/gemm_kernel_2x4_barcelona.S @@ -0,0 +1,1268 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#define A 24 + STACK + ARGS(%esp) +#define OLD_B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define OLD_LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define BX 4 + STACK(%esp) +#define KK 8 + STACK(%esp) +#define KKK 12 + STACK(%esp) + +#define B %edi +#define LDC %ebp +#define AO %edx +#define BO %ecx +#define CO %esi +#define I %ebx + +#define movsd movlps +#define movapd movups +#define movlpd movlps +#define movhpd movhps + +#define PREFETCH prefetch +#define PREFETCHSIZE (8 * 7 + 0) + +#define KERNEL1(address) \ + mulpd %xmm0, %xmm1; \ + mulpd -14 * SIZE(BO, %eax, 4), %xmm0; \ + addpd %xmm1, %xmm4; \ + movapd -12 * SIZE(BO, %eax, 4), %xmm1; \ + addpd %xmm0, %xmm5; \ + movddup -15 * SIZE(AO, %eax, 2), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd -14 * SIZE(BO, %eax, 4), %xmm0; \ + addpd %xmm0, %xmm7; \ + movddup -14 * SIZE(AO, %eax, 2), %xmm0 + +#define KERNEL2(address) \ + addpd %xmm2, %xmm6; \ + movapd %xmm1, %xmm2; \ + mulpd %xmm0, %xmm1; \ + mulpd -10 * SIZE(BO, %eax, 4), %xmm0; \ + addpd %xmm1, %xmm4; \ + movapd -8 * SIZE(BO, %eax, 4), %xmm1; \ + addpd %xmm0, %xmm5; \ + movddup -13 * SIZE(AO, %eax, 2), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd -10 * SIZE(BO, %eax, 4), %xmm0; \ + addpd %xmm0, %xmm7; \ + movddup -12 * SIZE(AO, %eax, 2), %xmm0 + +#define KERNEL3(address) \ + addpd %xmm2, %xmm6; \ + movapd %xmm1, %xmm2; \ + mulpd %xmm0, %xmm1; \ + mulpd -6 * SIZE(BO, %eax, 4), %xmm0; \ + addpd %xmm1, %xmm4; \ + movapd -4 * SIZE(BO, %eax, 4), %xmm1; \ + addpd %xmm0, %xmm5; \ + movddup -11 * SIZE(AO, %eax, 2), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd -6 * SIZE(BO, %eax, 4), %xmm0; \ + addpd %xmm0, %xmm7; \ + movddup -10 * SIZE(AO, %eax, 2), %xmm0 + +#define KERNEL4(address) \ + addpd %xmm2, %xmm6; \ + movapd %xmm1, %xmm2; \ + mulpd %xmm0, %xmm1; \ + mulpd -2 * SIZE(BO, %eax, 4), %xmm0; \ + addpd %xmm1, %xmm4; \ + movapd (BO, %eax, 4), %xmm1; \ + addpd %xmm0, %xmm5; \ + movddup -9 * SIZE(AO, %eax, 2), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd -2 * SIZE(BO, %eax, 4), %xmm0; \ + addpd %xmm0, %xmm7; \ + movddup (AO, %eax, 
2), %xmm0 + +#define KERNEL5(address) \ + addpd %xmm2, %xmm6; \ + movapd %xmm1, %xmm2; \ + mulpd %xmm3, %xmm1; \ + mulpd 2 * SIZE(BO, %eax, 4), %xmm3; \ + addpd %xmm1, %xmm4; \ + movapd 4 * SIZE(BO, %eax, 4), %xmm1; \ + addpd %xmm3, %xmm5; \ + movddup -7 * SIZE(AO, %eax, 2), %xmm3; \ + mulpd %xmm3, %xmm2; \ + mulpd 2 * SIZE(BO, %eax, 4), %xmm3; \ + addpd %xmm3, %xmm7; \ + movddup -6 * SIZE(AO, %eax, 2), %xmm3 + +#define KERNEL6(address) \ + addpd %xmm2, %xmm6; \ + movapd %xmm1, %xmm2; \ + mulpd %xmm3, %xmm1; \ + mulpd 6 * SIZE(BO, %eax, 4), %xmm3; \ + addpd %xmm1, %xmm4; \ + movapd 8 * SIZE(BO, %eax, 4), %xmm1; \ + addpd %xmm3, %xmm5; \ + movddup -5 * SIZE(AO, %eax, 2), %xmm3; \ + mulpd %xmm3, %xmm2; \ + mulpd 6 * SIZE(BO, %eax, 4), %xmm3; \ + addpd %xmm3, %xmm7; \ + movddup -4 * SIZE(AO, %eax, 2), %xmm3 + +#define KERNEL7(address) \ + addpd %xmm2, %xmm6; \ + movapd %xmm1, %xmm2; \ + mulpd %xmm3, %xmm1; \ + mulpd 10 * SIZE(BO, %eax, 4), %xmm3; \ + addpd %xmm1, %xmm4; \ + movapd 12 * SIZE(BO, %eax, 4), %xmm1; \ + addpd %xmm3, %xmm5; \ + movddup -3 * SIZE(AO, %eax, 2), %xmm3; \ + mulpd %xmm3, %xmm2; \ + mulpd 10 * SIZE(BO, %eax, 4), %xmm3; \ + addpd %xmm3, %xmm7; \ + movddup -2 * SIZE(AO, %eax, 2), %xmm3 + +#define KERNEL8(address) \ + addpd %xmm2, %xmm6; \ + movapd %xmm1, %xmm2; \ + mulpd %xmm3, %xmm1; \ + mulpd 14 * SIZE(BO, %eax, 4), %xmm3; \ + addpd %xmm1, %xmm4; \ + movapd 16 * SIZE(BO, %eax, 4), %xmm1; \ + addpd %xmm3, %xmm5; \ + movddup -1 * SIZE(AO, %eax, 2), %xmm3; \ + mulpd %xmm3, %xmm2; \ + mulpd 14 * SIZE(BO, %eax, 4), %xmm3; \ + addpd %xmm3, %xmm7; \ + movddup 8 * SIZE(AO, %eax, 2), %xmm3; \ + addpd %xmm2, %xmm6; \ + movapd %xmm1, %xmm2 + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl OLD_B, B + movl OLD_LDC, LDC + +#ifdef TRMMKERNEL + movl OFFSET, %eax + +#ifndef LEFT + negl %eax +#endif + + movl %eax, KK +#endif + + subl $-16 * SIZE, A + subl $-16 * SIZE, B + + leal (, LDC, SIZE), LDC + + movl N, %eax + sarl $2, %eax + movl %eax, J + jle .L30 + ALIGN_2 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + leal GEMM_DEFAULT_Q * GEMM_DEFAULT_UNROLL_N * SIZE(B), %eax + movl %eax, BX + + movl C, CO # coffset = c + movl A, AO # aoffset = a + + movl M, I + sarl $1, I # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BO +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AO, %eax, 2), AO + leal (B, %eax, 4), BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm1 + pxor %xmm4, %xmm4 + movddup -8 * SIZE(AO), %xmm3 + + leal (LDC, LDC, 2), %eax + + prefetchw 1 * SIZE(CO) + pxor %xmm5, %xmm5 + prefetchw 3 * SIZE(CO, LDC) + pxor %xmm6, %xmm6 + prefetchw 1 * SIZE(CO, LDC, 2) + pxor %xmm7, %xmm7 + prefetchw 3 * SIZE(CO, %eax) + movapd %xmm1, %xmm2 + + movl BX, %eax + prefetch -16 * SIZE(%eax) + addl $8 * SIZE, %eax + movl %eax, BX + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + + andl $-8, %eax + + leal (, %eax, SIZE), %eax + leal (AO, %eax, 2), AO + leal (BO, %eax, 4), BO + negl %eax + NOBRANCH + je .L15 + ALIGN_3 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 
0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + addl $8 * SIZE, %eax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + addl $8 * SIZE, %eax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + addl $8 * SIZE, %eax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + addl $8 * SIZE, %eax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + addl $8 * SIZE, %eax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + addl $8 * SIZE, %eax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + addl $8 * SIZE, %eax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + addl $8 * SIZE, %eax + BRANCH + jl .L12 + ALIGN_3 + +.L15: + movddup ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax # if (k & 1) + je .L18 + + leal (, %eax, SIZE), %eax + leal (AO, %eax, 2), AO + leal (BO, %eax, 4), BO + negl %eax + ALIGN_3 + +.L17: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BO, %eax, 4), %xmm0 + addpd %xmm1, %xmm4 + movapd -12 * SIZE(BO, %eax, 4), %xmm1 + addpd %xmm0, %xmm5 + movddup -15 * SIZE(AO, %eax, 2), %xmm0 + mulpd %xmm0, %xmm2 + mulpd -14 * SIZE(BO, %eax, 4), %xmm0 + addpd %xmm0, %xmm7 + movddup -14 * SIZE(AO, %eax, 2), %xmm0 + addpd %xmm2, %xmm6 + movapd %xmm1, %xmm2 + + addl $SIZE, %eax + jl .L17 + ALIGN_4 + +.L18: + leal (CO, LDC, 2), %eax + + mulpd %xmm3, %xmm4 + mulpd %xmm3, %xmm5 + mulpd %xmm3, %xmm6 + mulpd %xmm3, %xmm7 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO ), %xmm0 + movhpd 0 * SIZE(CO, LDC), %xmm0 + movsd 0 * SIZE(%eax ), %xmm1 + movhpd 0 * SIZE(%eax, LDC), %xmm1 + + movsd 1 * SIZE(CO ), %xmm2 + movhpd 1 * SIZE(CO, LDC), %xmm2 + movsd 1 * SIZE(%eax ), %xmm3 + movhpd 1 * SIZE(%eax, LDC), %xmm3 + + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm5 + addpd %xmm2, %xmm6 + addpd %xmm3, %xmm7 +#endif + + movsd %xmm4, 0 * SIZE(CO) + movsd %xmm6, 1 * SIZE(CO) + + movhpd %xmm4, 0 * SIZE(CO, LDC) + movhpd %xmm6, 1 * SIZE(CO, LDC) + + movsd %xmm5, 0 * SIZE(%eax) + movsd %xmm7, 1 * SIZE(%eax) + + movhpd %xmm5, 0 * SIZE(%eax, LDC) + movhpd %xmm7, 1 * SIZE(%eax, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AO, %eax, 2), AO + leal (BO, %eax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, CO # coffset += 2 + decl I # i -- + jg .L11 + ALIGN_4 + +.L20: + movl M, I + testl $1, I # i = (m >> 2) + jle .L29 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BO +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AO, 
%eax, 1), AO + leal (B, %eax, 4), BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm1 + movddup -8 * SIZE(AO), %xmm3 + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + mulpd %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd -14 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm4 + movapd -12 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm5 + movddup -15 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd -10 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm6 + movapd -8 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm7 + movddup -14 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd -6 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm4 + movapd -4 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm5 + movddup -13 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd -2 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm6 + movapd (BO), %xmm1 + addpd %xmm0, %xmm7 + movddup -12 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd 2 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm4 + movapd 4 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm5 + movddup -11 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd 6 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm6 + movapd 8 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm7 + movddup -10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd 10 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm4 + movapd 12 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm5 + movddup -9 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd 14 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm6 + movapd 16 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm7 + movddup -8 * SIZE(AO), %xmm0 + + subl $ -8 * SIZE, AO + subl $-32 * SIZE, BO + decl %eax + jne .L22 + ALIGN_4 + +.L25: + movddup ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L28 + +.L26: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm4 + movapd -12 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm5 + movddup -15 * SIZE(AO), %xmm0 + + addl $1 * SIZE, AO + addl $4 * SIZE, BO + decl %eax + jg .L26 + ALIGN_4 + +.L28: + leal (CO, LDC, 2), %eax + + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + mulpd %xmm3, %xmm4 + mulpd %xmm3, %xmm5 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO ), %xmm0 + movhpd 0 * SIZE(CO, LDC), %xmm0 + movsd 0 * SIZE(%eax ), %xmm1 + movhpd 0 * SIZE(%eax, LDC), %xmm1 + + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm5 +#endif + + movsd %xmm4, 0 * SIZE(CO ) + movhpd %xmm4, 0 * SIZE(CO, LDC) + movsd %xmm5, 0 * SIZE(%eax ) + movhpd %xmm5, 0 * SIZE(%eax, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AO, %eax, 1), AO + leal (BO, %eax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_4 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + + movl BO, B + + leal (, LDC, 4), %eax + addl %eax, C # c += 4 * ldc + decl J # j -- + jg .L01 + ALIGN_4 + +.L30: + testl $2, N + je .L60 + ALIGN_2 + +.L31: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl C, CO # coffset = c + movl A, AO # aoffset = a + + movl M, I + sarl $1, I # i = (m >> 2) + jle .L50 + ALIGN_4 + +.L41: +#if !defined(TRMMKERNEL) || \ + 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BO +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AO, %eax, 2), AO + leal (B, %eax, 2), BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + pxor %xmm4, %xmm4 + prefetchw 1 * SIZE(CO) + pxor %xmm5, %xmm5 + prefetchw 1 * SIZE(CO, LDC) + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L45 + ALIGN_4 + +.L42: + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AO) + mulpd -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + addpd %xmm0, %xmm4 + mulpd -16 * SIZE(BO), %xmm1 + movddup -14 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm5 + + mulpd -14 * SIZE(BO), %xmm0 + movddup -13 * SIZE(AO), %xmm1 + addpd %xmm0, %xmm6 + mulpd -14 * SIZE(BO), %xmm1 + movddup -12 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm7 + + mulpd -12 * SIZE(BO), %xmm0 + movddup -11 * SIZE(AO), %xmm1 + addpd %xmm0, %xmm4 + mulpd -12 * SIZE(BO), %xmm1 + movddup -10 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm5 + + mulpd -10 * SIZE(BO), %xmm0 + movddup -9 * SIZE(AO), %xmm1 + addpd %xmm0, %xmm6 + mulpd -10 * SIZE(BO), %xmm1 + movddup -8 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm7 + + prefetcht0 (PREFETCHSIZE + 8) * SIZE(AO) + + mulpd -8 * SIZE(BO), %xmm0 + movddup -7 * SIZE(AO), %xmm1 + addpd %xmm0, %xmm4 + mulpd -8 * SIZE(BO), %xmm1 + movddup -6 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm5 + + mulpd -6 * SIZE(BO), %xmm0 + movddup -5 * SIZE(AO), %xmm1 + addpd %xmm0, %xmm6 + mulpd -6 * SIZE(BO), %xmm1 + movddup -4 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm7 + + mulpd -4 * SIZE(BO), %xmm0 + movddup -3 * SIZE(AO), %xmm1 + addpd %xmm0, %xmm4 + mulpd -4 * SIZE(BO), %xmm1 + movddup -2 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm5 + + mulpd -2 * SIZE(BO), %xmm0 + movddup -1 * SIZE(AO), %xmm1 + addpd %xmm0, %xmm6 + mulpd -2 * SIZE(BO), %xmm1 + movddup 0 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm7 + + subl $-16 * SIZE, AO + subl $-16 * SIZE, BO + decl %eax + jne .L42 + ALIGN_4 + +.L45: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movddup ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + mulpd -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + addpd %xmm0, %xmm4 + mulpd -16 * SIZE(BO), %xmm1 + movddup -14 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm5 + + addl $2 * SIZE, AO + addl $2 * SIZE, BO + decl %eax + jg .L46 + ALIGN_4 + +.L48: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO), %xmm0 + movhpd 0 * SIZE(CO, LDC), %xmm0 + + movsd 1 * SIZE(CO), %xmm1 + movhpd 1 * SIZE(CO, LDC), %xmm1 +#endif + + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + mulpd %xmm3, %xmm4 + mulpd %xmm3, %xmm5 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm5 +#endif + + movlpd %xmm4, 0 * SIZE(CO) + movlpd %xmm5, 1 * SIZE(CO) + + movhpd %xmm4, 0 * SIZE(CO, LDC) + movhpd %xmm5, 1 * SIZE(CO, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AO, %eax, 2), AO + leal (BO, %eax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, CO # coffset += 2 + decl I # i -- + jg .L41 + ALIGN_4 + +.L50: + movl M, I + testl $1, I # i = (m >> 2) + jle .L59 + +#if 
!defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BO +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AO, %eax, 1), AO + leal (B, %eax, 2), BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + mulpd -16 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm4 + movddup -15 * SIZE(AO), %xmm0 + + mulpd -14 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm4 + movddup -14 * SIZE(AO), %xmm0 + + mulpd -12 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm4 + movddup -13 * SIZE(AO), %xmm0 + + mulpd -10 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm4 + movddup -12 * SIZE(AO), %xmm0 + + mulpd -8 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm4 + movddup -11 * SIZE(AO), %xmm0 + + mulpd -6 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm4 + movddup -10 * SIZE(AO), %xmm0 + + mulpd -4 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm4 + movddup -9 * SIZE(AO), %xmm0 + + mulpd -2 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm4 + movddup -8 * SIZE(AO), %xmm0 + + subl $ -8 * SIZE, AO + subl $-16 * SIZE, BO + + decl %eax + jne .L52 + ALIGN_4 + +.L55: + movddup ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L58 + +.L56: + mulpd -16 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm4 + movddup -15 * SIZE(AO), %xmm0 + + subl $-1 * SIZE, AO + subl $-2 * SIZE, BO + decl %eax + jg .L56 + ALIGN_4 + +.L58: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + addpd %xmm5, %xmm4 + + mulpd %xmm3, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO), %xmm0 + movhpd 0 * SIZE(CO, LDC), %xmm0 + + addpd %xmm0, %xmm4 +#endif + + movlpd %xmm4, 0 * SIZE(CO) + movhpd %xmm4, 0 * SIZE(CO, LDC, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AO, %eax, 1), AO + leal (BO, %eax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_4 + +.L59: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + movl BO, B + + leal (, LDC, 2), %eax + addl %eax, C # c += 4 * ldc + ALIGN_4 + +.L60: + testl $1, N + je .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl C, CO # coffset = c + movl A, AO # aoffset = a + + movl M, I + sarl $1, I # i = (m >> 2) + jle .L80 + ALIGN_4 + +.L71: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BO +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AO, %eax, 2), AO + leal (B, %eax, 1), BO +#endif + + movddup -16 * SIZE(BO), %xmm0 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + prefetchw 1 * SIZE(CO) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + PREFETCH 
(PREFETCHSIZE + 0) * SIZE(AO) + + mulpd -16 * SIZE(AO), %xmm0 + addpd %xmm0, %xmm4 + movddup -15 * SIZE(BO), %xmm0 + + mulpd -14 * SIZE(AO), %xmm0 + addpd %xmm0, %xmm4 + movddup -14 * SIZE(BO), %xmm0 + + mulpd -12 * SIZE(AO), %xmm0 + addpd %xmm0, %xmm4 + movddup -13 * SIZE(BO), %xmm0 + + mulpd -10 * SIZE(AO), %xmm0 + addpd %xmm0, %xmm4 + movddup -12 * SIZE(BO), %xmm0 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + + mulpd -8 * SIZE(AO), %xmm0 + addpd %xmm0, %xmm4 + movddup -11 * SIZE(BO), %xmm0 + + mulpd -6 * SIZE(AO), %xmm0 + addpd %xmm0, %xmm4 + movddup -10 * SIZE(BO), %xmm0 + + mulpd -4 * SIZE(AO), %xmm0 + addpd %xmm0, %xmm4 + movddup -9 * SIZE(BO), %xmm0 + + mulpd -2 * SIZE(AO), %xmm0 + addpd %xmm0, %xmm4 + movddup -8 * SIZE(BO), %xmm0 + + subl $-16 * SIZE, AO + subl $ -8 * SIZE, BO + decl %eax + jne .L72 + ALIGN_4 + +.L75: + movddup ALPHA, %xmm3 +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L78 + ALIGN_3 + +.L76: + mulpd -16 * SIZE(AO), %xmm0 + addpd %xmm0, %xmm4 + movddup -15 * SIZE(BO), %xmm0 + + addl $2 * SIZE, AO + addl $1 * SIZE, BO + decl %eax + jg .L76 + ALIGN_4 + +.L78: + mulpd %xmm3, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO), %xmm0 + movhpd 1 * SIZE(CO), %xmm0 + + addpd %xmm0, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(CO) + movhpd %xmm4, 1 * SIZE(CO) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AO, %eax, 2), AO + leal (BO, %eax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, CO # coffset += 2 + decl I # i -- + jg .L71 + ALIGN_4 + +.L80: + movl M, I + testl $1, I # i = (m >> 2) + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BO +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AO, %eax, 1), AO + leal (B, %eax, 1), BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L85 + ALIGN_4 + +.L82: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + mulpd -16 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm4 + movapd -14 * SIZE(AO), %xmm0 + + mulpd -14 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm5 + movapd -12 * SIZE(AO), %xmm0 + + mulpd -12 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm6 + movapd -10 * SIZE(AO), %xmm0 + + mulpd -10 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm7 + movapd -8 * SIZE(AO), %xmm0 + + subl $-8 * SIZE, AO + subl $-8 * SIZE, BO + decl %eax + jne .L82 + ALIGN_4 + +.L85: + movddup ALPHA, %xmm3 +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L88 + +.L86: + mulsd -16 * SIZE(BO), %xmm0 + addsd %xmm0, %xmm4 + movsd -15 * SIZE(AO), %xmm0 + + addl $1 * SIZE, AO + addl $1 * SIZE, BO + decl %eax + jg .L86 + ALIGN_4 + +.L88: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + addpd %xmm6, %xmm4 + + haddpd %xmm4, %xmm4 + mulsd %xmm3, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO), %xmm0 + addsd %xmm0, %xmm4 +#endif + movsd %xmm4, 0 * SIZE(CO) + ALIGN_4 + +.L999: + popl %ebx + popl %esi 
+ popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/gemm_kernel_2x4_core2.S b/kernel/x86/gemm_kernel_2x4_core2.S new file mode 100644 index 0000000000..9907131d6d --- /dev/null +++ b/kernel/x86/gemm_kernel_2x4_core2.S @@ -0,0 +1,1318 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#define A 24 + STACK + ARGS(%esp) +#define ARG_B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define ARG_LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define BX 4 + STACK(%esp) +#define KK 8 + STACK(%esp) +#define KKK 12 + STACK(%esp) + +#define PREFETCH_R (8 * 4) + +#define PREFETCHSIZE (8 * 21 + 4) +#define PREFETCH prefetcht0 + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi +#define C1 %esi +#define I %ebx + + PROLOGUE + + subl $ARGS, %esp # Generate Stack Frame + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + +#ifdef TRMMKERNEL + movl OFFSET, %eax +#ifndef LEFT + negl %eax +#endif + movl %eax, KK +#endif + + subl $-16 * SIZE, A + subl $-16 * SIZE, B + + leal (, LDC, SIZE), LDC + + movl N, %eax + sarl $2, %eax + movl %eax, J + jle .L30 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl B, BX + + movl C, C1 + movl A, AA + + movl M, I + sarl $1, I + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + + movl BX, %eax + + prefetcht2 (PREFETCH_R + 0) * SIZE(%eax) + prefetcht2 (PREFETCH_R + 8) * SIZE(%eax) + + subl $-8 * SIZE, BX + + leal (C1, LDC, 2), %eax + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -16 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + + pxor %xmm4, %xmm4 + prefetcht0 1 * SIZE(C1) + pxor %xmm5, %xmm5 + prefetcht0 1 * SIZE(C1, LDC) + pxor %xmm6, %xmm6 + prefetcht0 1 * SIZE(%eax) + pxor %xmm7, %xmm7 + prefetcht0 1 * SIZE(%eax, LDC) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addpd %xmm2, %xmm6 + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movaps -14 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm7 + movapd %xmm1, %xmm3 + mulpd %xmm0, %xmm1 +// SHUFPD_1 %xmm0, %xmm0 + pshufd $0x4e, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movaps -14 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm5 + movaps -12 * SIZE(BB), %xmm1 + + addpd %xmm2, %xmm6 + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movaps -10 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm7 + movapd %xmm1, %xmm3 + mulpd %xmm0, %xmm1 +// SHUFPD_1 %xmm0, %xmm0 + pshufd $0x4e, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movaps -12 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm5 + movaps -8 * SIZE(BB), %xmm1 + + addpd %xmm2, %xmm6 + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movaps -6 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm7 + movapd %xmm1, %xmm3 + mulpd %xmm0, %xmm1 +// SHUFPD_1 %xmm0, %xmm0 + pshufd $0x4e, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movaps -10 * SIZE(AA), %xmm0 + 
addpd %xmm1, %xmm5 + movaps -4 * SIZE(BB), %xmm1 + + addpd %xmm2, %xmm6 + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movaps -2 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm7 + movapd %xmm1, %xmm3 + mulpd %xmm0, %xmm1 +// SHUFPD_1 %xmm0, %xmm0 + pshufd $0x4e, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movaps -8 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm5 + movaps 0 * SIZE(BB), %xmm1 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) + + addpd %xmm2, %xmm6 + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movaps 2 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm7 + movapd %xmm1, %xmm3 + mulpd %xmm0, %xmm1 +// SHUFPD_1 %xmm0, %xmm0 + pshufd $0x4e, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movaps -6 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm5 + movaps 4 * SIZE(BB), %xmm1 + + addpd %xmm2, %xmm6 + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movaps 6 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm7 + movapd %xmm1, %xmm3 + mulpd %xmm0, %xmm1 +// SHUFPD_1 %xmm0, %xmm0 + pshufd $0x4e, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movaps -4 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm5 + movaps 8 * SIZE(BB), %xmm1 + + addpd %xmm2, %xmm6 + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movaps 10 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm7 + movapd %xmm1, %xmm3 + mulpd %xmm0, %xmm1 +// SHUFPD_1 %xmm0, %xmm0 + pshufd $0x4e, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movaps -2 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm5 + movaps 12 * SIZE(BB), %xmm1 + + addpd %xmm2, %xmm6 + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movaps 14 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm7 + movapd %xmm1, %xmm3 + mulpd %xmm0, %xmm1 +// SHUFPD_1 %xmm0, %xmm0 + pshufd $0x4e, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movaps 0 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm5 + movaps 16 * SIZE(BB), %xmm1 + + subl $-32 * SIZE, BB + subl $-16 * SIZE, AA + + subl $1, %eax + BRANCH + jne .L12 + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L18 + ALIGN_4 + +.L16: + addpd %xmm2, %xmm6 + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movaps -14 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm7 + movapd %xmm1, %xmm3 + mulpd %xmm0, %xmm1 + SHUFPD_1 %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movaps -14 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm5 + movaps -12 * SIZE(BB), %xmm1 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + addpd %xmm2, %xmm6 + addpd %xmm3, %xmm7 + + + movddup ALPHA, %xmm3 + + movaps %xmm4, %xmm0 + unpcklpd %xmm6, %xmm4 + unpckhpd %xmm0, %xmm6 + + movaps %xmm5, %xmm1 + unpcklpd %xmm7, %xmm5 + unpckhpd %xmm1, %xmm7 + + mulpd %xmm3, %xmm4 + mulpd %xmm3, %xmm5 + mulpd %xmm3, %xmm6 + mulpd %xmm3, %xmm7 + + leal (C1, LDC, 2), %eax + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + movhpd 1 * SIZE(C1), %xmm0 + movsd 0 * SIZE(C1, LDC), %xmm1 + movhpd 1 * SIZE(C1, LDC), %xmm1 + + movsd 0 * SIZE(%eax), %xmm2 + movhpd 1 * SIZE(%eax), %xmm2 + movsd 0 * SIZE(%eax, LDC), %xmm3 + movhpd 1 * SIZE(%eax, LDC), %xmm3 + + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm6 + addpd %xmm2, %xmm5 + addpd %xmm3, %xmm7 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhpd %xmm4, 1 * SIZE(C1) + movsd %xmm6, 0 * SIZE(C1, LDC) + movhpd %xmm6, 1 * SIZE(C1, LDC) + + movsd %xmm5, 0 * SIZE(%eax) + movhpd %xmm5, 1 * SIZE(%eax) + movsd %xmm7, 0 * SIZE(%eax, LDC) + movhpd %xmm7, 1 * SIZE(%eax, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + 
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, C1 + decl I + jg .L11 + ALIGN_4 + +.L20: + movl M, I + testl $1, I + jle .L29 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (BB, %eax, 4), BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -16 * SIZE(BB), %xmm2 + pxor %xmm5, %xmm5 + movaps -14 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps -12 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps -10 * SIZE(BB), %xmm3 + + pshufd $0xee, %xmm0, %xmm1 + movaps -14 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm6 + movaps -8 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm7 + movaps -6 * SIZE(BB), %xmm3 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps -4 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps -2 * SIZE(BB), %xmm3 + + pshufd $0xee, %xmm0, %xmm1 + movaps -12 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm6 + movaps 0 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm7 + movaps 2 * SIZE(BB), %xmm3 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps 6 * SIZE(BB), %xmm3 + + pshufd $0xee, %xmm0, %xmm1 + movaps -10 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm6 + movaps 8 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm7 + movaps 10 * SIZE(BB), %xmm3 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps 12 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps 14 * SIZE(BB), %xmm3 + + pshufd $0xee, %xmm0, %xmm1 + movaps -8 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm7 + movaps 18 * SIZE(BB), %xmm3 + + subl $ -8 * SIZE, AA + subl $-32 * SIZE, BB + + subl $1, %eax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L28 + ALIGN_4 + +.L26: + pshufd $0x44, %xmm0, %xmm1 + movsd -15 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps -12 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps -10 * SIZE(BB), %xmm3 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: + movddup ALPHA, %xmm3 + + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + leal (C1, LDC, 2), %eax + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + movhpd 0 * SIZE(C1, LDC), %xmm0 + + movsd 0 * SIZE(%eax), %xmm1 + movhpd 0 * SIZE(%eax, LDC), %xmm1 +#endif + + mulpd %xmm3, %xmm4 + mulpd %xmm3, %xmm5 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm4 + 
addpd %xmm1, %xmm5 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhpd %xmm4, 0 * SIZE(C1, LDC) + + movsd %xmm5, 0 * SIZE(%eax) + movhpd %xmm5, 0 * SIZE(%eax, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + addl %eax, AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_4 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + + movl BB, B + + leal (, LDC, 4), %eax + addl %eax, C + decl J + jg .L01 + ALIGN_4 + +.L30: + movl N, %eax + testl $2, %eax + jle .L50 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl C, C1 + movl A, AA + + movl M, I + sarl $1, I + jle .L40 + ALIGN_4 + +.L31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + prefetcht0 1 * SIZE(C1) + pxor %xmm6, %xmm6 + prefetcht0 1 * SIZE(C1, LDC) + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -14 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm7 + movaps -12 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -10 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm7 + movaps -8 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -6 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -6 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm7 + movaps -4 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -2 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -2 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm7 + movaps 0 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + + subl $-16 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L38 + ALIGN_4 + +.L36: + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -14 * SIZE(BB), %xmm1 + 
addpd %xmm2, %xmm4 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L36 + ALIGN_4 + +.L38: + movddup ALPHA, %xmm3 + + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + movaps %xmm4, %xmm0 + movsd %xmm5, %xmm4 + mulpd %xmm3, %xmm4 + movsd %xmm0, %xmm5 + mulpd %xmm3, %xmm5 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + movhpd 1 * SIZE(C1), %xmm0 + movsd 0 * SIZE(C1, LDC), %xmm1 + movhpd 1 * SIZE(C1, LDC), %xmm1 + + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm5 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhpd %xmm4, 1 * SIZE(C1) + movsd %xmm5, 0 * SIZE(C1, LDC) + movhpd %xmm5, 1 * SIZE(C1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, C1 + decl I + jg .L31 + ALIGN_4 + +.L40: + movl M, I + testl $1, I + jle .L49 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (BB, %eax, 2), BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -16 * SIZE(BB), %xmm2 + pxor %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L45 + ALIGN_4 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -14 * SIZE(BB), %xmm2 + + pshufd $0xee, %xmm0, %xmm1 + movaps -14 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm5 + movaps -12 * SIZE(BB), %xmm2 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -10 * SIZE(BB), %xmm2 + + pshufd $0xee, %xmm0, %xmm1 + movaps -12 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm5 + movaps -8 * SIZE(BB), %xmm2 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -6 * SIZE(BB), %xmm2 + + pshufd $0xee, %xmm0, %xmm1 + movaps -10 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm5 + movaps -4 * SIZE(BB), %xmm2 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -2 * SIZE(BB), %xmm2 + + pshufd $0xee, %xmm0, %xmm1 + movaps -8 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + + subl $ -8 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L42 + ALIGN_4 + +.L45: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L48 + ALIGN_4 + +.L46: + pshufd $0x44, %xmm0, %xmm1 + movsd -15 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -14 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L46 + ALIGN_4 + +.L48: + movddup ALPHA, %xmm3 + + addpd %xmm5, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + movhpd 0 * SIZE(C1, LDC), %xmm0 +#endif + + mulpd %xmm3, %xmm4 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhpd %xmm4, 0 * SIZE(C1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && 
!defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + addl %eax, AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_4 + +.L49: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + movl BB, B + + leal (, LDC, 2), %eax + addl %eax, C + ALIGN_4 + +.L50: + movl N, %eax + testl $1, %eax + jle .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl C, C1 + movl A, AA + + movl M, I + sarl $1, I + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + addl %eax, BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + prefetcht0 1 * SIZE(C1) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm1, %xmm2 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + pshufd $0xee, %xmm1, %xmm2 + movaps -14 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + + pshufd $0x44, %xmm1, %xmm2 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + pshufd $0xee, %xmm1, %xmm2 + movaps -12 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) + + pshufd $0x44, %xmm1, %xmm2 + mulpd %xmm0, %xmm2 + movaps -6 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + pshufd $0xee, %xmm1, %xmm2 + movaps -10 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + + pshufd $0x44, %xmm1, %xmm2 + mulpd %xmm0, %xmm2 + movaps -2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + pshufd $0xee, %xmm1, %xmm2 + movaps -8 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + + subl $-16 * SIZE, AA + subl $ -8 * SIZE, BB + + subl $1, %eax + jne .L52 + ALIGN_4 + +.L55: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L58 + ALIGN_4 + +.L56: + pshufd $0x44, %xmm1, %xmm2 + movsd -15 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + addl $2 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: + movddup ALPHA, %xmm3 + + addpd %xmm5, %xmm4 + mulpd %xmm3, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + movhpd 1 * SIZE(C1), %xmm0 + + addpd %xmm0, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhpd %xmm4, 1 * SIZE(C1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + addl %eax, BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, C1 + decl I + jg .L51 + ALIGN_4 + +.L60: + movl M, I + testl $1, I + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && 
!defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + addl %eax, AA + addl %eax, BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -16 * SIZE(BB), %xmm2 + pxor %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L65 + ALIGN_4 + +.L62: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + movaps -14 * SIZE(BB), %xmm2 + + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + movaps -12 * SIZE(BB), %xmm2 + + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + movaps -10 * SIZE(BB), %xmm2 + + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + movaps -8 * SIZE(BB), %xmm2 + + subl $-8 * SIZE, AA + subl $-8 * SIZE, BB + + subl $1, %eax + jne .L62 + ALIGN_4 + +.L65: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulsd %xmm0, %xmm2 + movsd -15 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + movsd -15 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L66 + ALIGN_4 + +.L68: + movddup ALPHA, %xmm3 + + addpd %xmm5, %xmm4 + + haddpd %xmm4, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 +#endif + + mulsd %xmm3, %xmm4 + +#ifndef TRMMKERNEL + addsd %xmm0, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(C1) + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/gemm_kernel_2x4_penryn.S b/kernel/x86/gemm_kernel_2x4_penryn.S new file mode 100644 index 0000000000..263aea0423 --- /dev/null +++ b/kernel/x86/gemm_kernel_2x4_penryn.S @@ -0,0 +1,1367 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#define A 24 + STACK + ARGS(%esp) +#define ARG_B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define ARG_LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define BX 4 + STACK(%esp) +#define KK 8 + STACK(%esp) +#define KKK 12 + STACK(%esp) + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi +#define C1 %esi +#define I %ebx + +#ifdef NANO +#define PREFETCHSIZE (8 * 3 + 4) +#define PREFETCHW prefetcht0 +#define PREFETCHB prefetcht0 +#endif + +#ifdef NEHALEM +#define PREFETCHSIZE (8 * 1 - 4) +#define PREFETCHW prefetcht0 +#define PREFETCHB prefetcht0 +#endif + +#ifndef PREFETCH +#define PREFETCH prefetcht0 +#endif + +#ifndef PREFETCHW +#define PREFETCHW prefetcht0 +#endif + +#ifndef PREFETCHB +#define PREFETCHB prefetcht0 +#endif + +#ifndef PREFETCHSIZE +#define PREFETCHSIZE (8 * 13 + 4) +#endif + + PROLOGUE + + subl $ARGS, %esp # Generate Stack Frame + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + +#ifdef TRMMKERNEL + movl OFFSET, %eax +#ifndef LEFT + negl %eax +#endif + movl %eax, KK +#endif + + subl $-16 * SIZE, A + subl $-16 * SIZE, B + + leal (, LDC, SIZE), LDC + + movl N, %eax + sarl $2, %eax + movl %eax, J + jle .L30 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + sall $BASE_SHIFT + 2, %eax + leal (B, %eax), %eax + movl %eax, BX + + movl C, C1 + movl A, AA + + movl M, I + sarl $1, I + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + + movl BX, %eax + PREFETCHB -16 * SIZE(%eax) + subl $-8 * SIZE, %eax + movl %eax, BX + + leal (C1, LDC, 2), %eax + + movaps -16 * SIZE(AA), %xmm0 + xorps %xmm2, %xmm2 + movaps -16 * SIZE(BB), %xmm1 + xorps %xmm3, %xmm3 + + xorps %xmm4, %xmm4 + PREFETCHW 1 * SIZE(C1) + xorps %xmm5, %xmm5 + PREFETCHW 3 * SIZE(C1, LDC) + xorps %xmm6, %xmm6 + PREFETCHW 1 * SIZE(%eax) + xorps %xmm7, %xmm7 + PREFETCHW 3 * SIZE(%eax, LDC) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, 
%eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addpd %xmm3, %xmm7 + movaps -14 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps -12 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps -10 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps -8 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps -6 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps -4 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps -2 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps 0 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) + movaps 2 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps 4 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -6 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps 6 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps 8 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps 10 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps 12 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -2 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps 14 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps 16 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + subl $-32 * SIZE, BB + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + + subl $-16 * SIZE, AA + + subl $1, %eax + BRANCH + jne .L12 + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L18 + ALIGN_4 + +.L16: + addpd %xmm3, %xmm7 + movaps -14 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps -12 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + + movaps -14 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + addpd %xmm2, %xmm6 + addpd %xmm3, %xmm7 + + movddup ALPHA, %xmm3 + + movaps %xmm4, 
%xmm0 + movsd %xmm5, %xmm4 + mulpd %xmm3, %xmm4 + movsd %xmm0, %xmm5 + mulpd %xmm3, %xmm5 + + movaps %xmm6, %xmm0 + movsd %xmm7, %xmm6 + mulpd %xmm3, %xmm6 + movsd %xmm0, %xmm7 + mulpd %xmm3, %xmm7 + + movl C1, %eax + orl LDC, %eax + testl $15, %eax + NOBRANCH + jne .L18x + + leal (C1, LDC, 2), %eax + +#ifndef TRMMKERNEL + movaps (C1), %xmm0 + movaps (C1, LDC), %xmm1 + movaps (%eax), %xmm2 + movaps (%eax, LDC), %xmm3 + + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm5 + addpd %xmm2, %xmm6 + addpd %xmm3, %xmm7 +#endif + + movaps %xmm4, (C1) + movaps %xmm5, (C1, LDC) + movaps %xmm6, (%eax) + movaps %xmm7, (%eax, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, C1 + decl I + jg .L11 + jmp .L20 + ALIGN_4 + +.L18x: + leal (C1, LDC, 2), %eax + +#ifndef TRMMKERNEL + movups (C1), %xmm0 + movups (C1, LDC), %xmm1 + movups (%eax), %xmm2 + movups (%eax, LDC), %xmm3 + + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm5 + addpd %xmm2, %xmm6 + addpd %xmm3, %xmm7 +#endif + + movups %xmm4, (C1) + movups %xmm5, (C1, LDC) + movups %xmm6, (%eax) + movups %xmm7, (%eax, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, C1 + decl I + jg .L11 + ALIGN_4 + +.L20: + movl M, I + testl $1, I + jle .L29 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (BB, %eax, 4), BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 + movaps -16 * SIZE(BB), %xmm2 + xorps %xmm5, %xmm5 + movaps -14 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps -12 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps -10 * SIZE(BB), %xmm3 + + pshufd $0xee, %xmm0, %xmm1 + movaps -14 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm6 + movaps -8 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm7 + movaps -6 * SIZE(BB), %xmm3 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps -4 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps -2 * SIZE(BB), %xmm3 + + pshufd $0xee, %xmm0, %xmm1 + movaps -12 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm6 + movaps 0 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm7 + movaps 2 * SIZE(BB), %xmm3 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps 6 * SIZE(BB), %xmm3 + + pshufd $0xee, %xmm0, 
%xmm1 + movaps -10 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm6 + movaps 8 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm7 + movaps 10 * SIZE(BB), %xmm3 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps 12 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps 14 * SIZE(BB), %xmm3 + + pshufd $0xee, %xmm0, %xmm1 + movaps -8 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm7 + movaps 18 * SIZE(BB), %xmm3 + + subl $ -8 * SIZE, AA + subl $-32 * SIZE, BB + + subl $1, %eax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L28 + ALIGN_4 + +.L26: + pshufd $0x44, %xmm0, %xmm1 + movsd -15 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps -12 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps -10 * SIZE(BB), %xmm3 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: + movddup ALPHA, %xmm3 + + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + leal (C1, LDC, 2), %eax + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + movhpd 0 * SIZE(C1, LDC), %xmm0 + + movsd 0 * SIZE(%eax), %xmm1 + movhpd 0 * SIZE(%eax, LDC), %xmm1 +#endif + + mulpd %xmm3, %xmm4 + mulpd %xmm3, %xmm5 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm5 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhpd %xmm4, 0 * SIZE(C1, LDC) + + movsd %xmm5, 0 * SIZE(%eax) + movhpd %xmm5, 0 * SIZE(%eax, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + addl %eax, AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_4 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + + movl BB, B + + leal (, LDC, 4), %eax + addl %eax, C + decl J + jg .L01 + ALIGN_4 + +.L30: + movl N, %eax + testl $2, %eax + jle .L50 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl C, C1 + movl A, AA + + movl M, I + sarl $1, I + jle .L40 + ALIGN_4 + +.L31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 + movaps -16 * SIZE(BB), %xmm1 + xorps %xmm5, %xmm5 + PREFETCHW 1 * SIZE(C1) + xorps %xmm6, %xmm6 + PREFETCHW 1 * SIZE(C1, LDC) + xorps %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -14 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm7 + movaps -12 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd 
%xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -10 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm7 + movaps -8 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -6 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -6 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm7 + movaps -4 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -2 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -2 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm7 + movaps 0 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + + subl $-16 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L38 + ALIGN_4 + +.L36: + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -14 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L36 + ALIGN_4 + +.L38: + movddup ALPHA, %xmm3 + + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + movaps %xmm4, %xmm0 + movsd %xmm5, %xmm4 + mulpd %xmm3, %xmm4 + movsd %xmm0, %xmm5 + mulpd %xmm3, %xmm5 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + movhpd 1 * SIZE(C1), %xmm0 + movsd 0 * SIZE(C1, LDC), %xmm1 + movhpd 1 * SIZE(C1, LDC), %xmm1 + + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm5 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhpd %xmm4, 1 * SIZE(C1) + movsd %xmm5, 0 * SIZE(C1, LDC) + movhpd %xmm5, 1 * SIZE(C1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, C1 + decl I + jg .L31 + ALIGN_4 + +.L40: + movl M, I + testl $1, I + jle .L49 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (BB, %eax, 2), BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 + movaps -16 * SIZE(BB), %xmm2 + xorps %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L45 + ALIGN_4 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -14 * SIZE(BB), %xmm2 + + pshufd $0xee, %xmm0, %xmm1 + movaps -14 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm5 + movaps -12 * SIZE(BB), %xmm2 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -10 * SIZE(BB), %xmm2 + + pshufd 
$0xee, %xmm0, %xmm1 + movaps -12 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm5 + movaps -8 * SIZE(BB), %xmm2 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -6 * SIZE(BB), %xmm2 + + pshufd $0xee, %xmm0, %xmm1 + movaps -10 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm5 + movaps -4 * SIZE(BB), %xmm2 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -2 * SIZE(BB), %xmm2 + + pshufd $0xee, %xmm0, %xmm1 + movaps -8 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + + subl $ -8 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L42 + ALIGN_4 + +.L45: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L48 + ALIGN_4 + +.L46: + pshufd $0x44, %xmm0, %xmm1 + movsd -15 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -14 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L46 + ALIGN_4 + +.L48: + movddup ALPHA, %xmm3 + + addpd %xmm5, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + movhpd 0 * SIZE(C1, LDC), %xmm0 +#endif + + mulpd %xmm3, %xmm4 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhpd %xmm4, 0 * SIZE(C1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + addl %eax, AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_4 + +.L49: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + movl BB, B + + leal (, LDC, 2), %eax + addl %eax, C + ALIGN_4 + +.L50: + movl N, %eax + testl $1, %eax + jle .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl C, C1 + movl A, AA + + movl M, I + sarl $1, I + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + addl %eax, BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 + movaps -16 * SIZE(BB), %xmm1 + xorps %xmm5, %xmm5 + PREFETCHW 1 * SIZE(C1) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm1, %xmm2 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + pshufd $0xee, %xmm1, %xmm2 + movaps -14 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + + pshufd $0x44, %xmm1, %xmm2 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + pshufd $0xee, %xmm1, %xmm2 + movaps -12 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) + + pshufd $0x44, %xmm1, %xmm2 + mulpd %xmm0, %xmm2 + movaps -6 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + pshufd $0xee, %xmm1, %xmm2 + movaps -10 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + + pshufd $0x44, 
%xmm1, %xmm2 + mulpd %xmm0, %xmm2 + movaps -2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + pshufd $0xee, %xmm1, %xmm2 + movaps -8 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + + subl $-16 * SIZE, AA + subl $ -8 * SIZE, BB + + subl $1, %eax + jne .L52 + ALIGN_4 + +.L55: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L58 + ALIGN_4 + +.L56: + pshufd $0x44, %xmm1, %xmm2 + movsd -15 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + addl $2 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: + movddup ALPHA, %xmm3 + + addpd %xmm5, %xmm4 + mulpd %xmm3, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + movhpd 1 * SIZE(C1), %xmm0 + + addpd %xmm0, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhpd %xmm4, 1 * SIZE(C1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + addl %eax, BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, C1 + decl I + jg .L51 + ALIGN_4 + +.L60: + movl M, I + testl $1, I + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + addl %eax, AA + addl %eax, BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 + movaps -16 * SIZE(BB), %xmm2 + xorps %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L65 + ALIGN_4 + +.L62: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + movaps -14 * SIZE(BB), %xmm2 + + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + movaps -12 * SIZE(BB), %xmm2 + + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + movaps -10 * SIZE(BB), %xmm2 + + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + movaps -8 * SIZE(BB), %xmm2 + + subl $-8 * SIZE, AA + subl $-8 * SIZE, BB + + subl $1, %eax + jne .L62 + ALIGN_4 + +.L65: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulsd %xmm0, %xmm2 + movsd -15 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + movsd -15 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L66 + ALIGN_4 + +.L68: + movddup ALPHA, %xmm3 + + addpd %xmm5, %xmm4 + + haddpd %xmm4, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 +#endif + + mulsd %xmm3, %xmm4 + +#ifndef TRMMKERNEL + addsd %xmm0, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(C1) + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/gemm_kernel_2x4_sse2.S b/kernel/x86/gemm_kernel_2x4_sse2.S new file mode 100644 index 0000000000..be58235eef --- /dev/null +++ b/kernel/x86/gemm_kernel_2x4_sse2.S @@ -0,0 +1,1790 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. 
*/ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define OLD_M 4 + STACK + ARGS(%esi) +#define OLD_N 8 + STACK + ARGS(%esi) +#define OLD_K 12 + STACK + ARGS(%esi) +#define OLD_ALPHA 16 + STACK + ARGS(%esi) +#define OLD_A 24 + STACK + ARGS(%esi) +#define OLD_B 28 + STACK + ARGS(%esi) +#define OLD_C 32 + STACK + ARGS(%esi) +#define OLD_LDC 36 + STACK + ARGS(%esi) +#define OLD_OFFT 40 + STACK + ARGS(%esi) + +#define ALPHA 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define BX 40(%esp) +#define OLD_STACK 44(%esp) +#define OFFSET 48(%esp) +#define KK 52(%esp) +#define KKK 56(%esp) +#define BUFFER 128(%esp) + +#if defined(OPTERON) || defined(BARCELONA) +#define movsd movlpd +#endif + +#if defined(OPTERON) || defined(BARCELONA) +#define PREFETCH prefetch +#define PREFETCHSIZE (8 * 10 + 4) +#endif + +#define AA %edx +#define BB %ecx +#define LDC %ebp + +#define KERNEL1(address) \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm4; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + movapd 2 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 6 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 16 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 10 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 12 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + 
mulpd %xmm0, %xmm3; \ + mulpd 14 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm3, %xmm6; \ + movapd 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm0, %xmm7; \ + movapd 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm4; \ + movapd 18 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 20 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 22 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL4(address) \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 26 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 28 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + mulpd 30 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm3, %xmm6; \ + movapd 40 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm0, %xmm7; \ + movapd 16 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL5(address) \ + PREFETCH (PREFETCHSIZE + 8) * SIZE + (address) * 1 * SIZE(AA); \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm4; \ + movapd 34 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + mulpd 38 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm2, %xmm6; \ + movapd 48 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm1, %xmm7; \ + movapd 10 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL6(address) \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 42 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 44 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + mulpd 46 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 12 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL7(address) \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm4; \ + movapd 50 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 52 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + mulpd 54 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm2, %xmm6; \ + movapd 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm1, %xmm7; \ + movapd 14 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 58 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 60 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + mulpd 62 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 72 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE, %esp + andl $-1024, %esp # align stack + + STACK_TOUCHING + + movl OLD_M, %ebx + movl OLD_N, %eax + movl OLD_K, %ecx + movl OLD_A, %edx + movsd OLD_ALPHA, %xmm3 + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK 
+#ifdef TRMMKERNEL + movss OLD_OFFT, %xmm4 +#endif + + unpcklpd %xmm3, %xmm3 + movl OLD_B, %edi + movl OLD_C, %ebx + movapd %xmm3, ALPHA + + movl %ebx, C + movl OLD_LDC, LDC +#ifdef TRMMKERNEL + movss %xmm4, OFFSET + movss %xmm4, KK +#ifndef LEFT + negl KK +#endif +#endif + + leal (, LDC, SIZE), LDC + + sarl $2, %eax + movl %eax, J + jle .L30 + ALIGN_2 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + +/* Copying to Sub Buffer */ + movl K, %eax + leal BUFFER, %ecx + sarl $1, %eax + jle .L05 + ALIGN_4 + +.L02: +#define COPYPREFETCH 40 + + prefetchnta (COPYPREFETCH) * SIZE(%edi) + + movq 0 * SIZE(%edi), %mm0 + movq 1 * SIZE(%edi), %mm1 + movq 2 * SIZE(%edi), %mm2 + movq 3 * SIZE(%edi), %mm3 + movq 4 * SIZE(%edi), %mm4 + movq 5 * SIZE(%edi), %mm5 + movq 6 * SIZE(%edi), %mm6 + movq 7 * SIZE(%edi), %mm7 + + movq %mm0, 0 * SIZE(%ecx) + movq %mm0, 1 * SIZE(%ecx) + movq %mm1, 2 * SIZE(%ecx) + movq %mm1, 3 * SIZE(%ecx) + movq %mm2, 4 * SIZE(%ecx) + movq %mm2, 5 * SIZE(%ecx) + movq %mm3, 6 * SIZE(%ecx) + movq %mm3, 7 * SIZE(%ecx) + + movq %mm4, 8 * SIZE(%ecx) + movq %mm4, 9 * SIZE(%ecx) + movq %mm5, 10 * SIZE(%ecx) + movq %mm5, 11 * SIZE(%ecx) + movq %mm6, 12 * SIZE(%ecx) + movq %mm6, 13 * SIZE(%ecx) + movq %mm7, 14 * SIZE(%ecx) + movq %mm7, 15 * SIZE(%ecx) + + addl $ 8 * SIZE, %edi + addl $16 * SIZE, %ecx + decl %eax + jne .L02 + ALIGN_2 + +.L05: + movl K, %eax + andl $1, %eax + BRANCH + jle .L10 + + movq 0 * SIZE(%edi), %mm0 + movq 1 * SIZE(%edi), %mm1 + movq 2 * SIZE(%edi), %mm2 + movq 3 * SIZE(%edi), %mm3 + + movq %mm0, 0 * SIZE(%ecx) + movq %mm0, 1 * SIZE(%ecx) + movq %mm1, 2 * SIZE(%ecx) + movq %mm1, 3 * SIZE(%ecx) + movq %mm2, 4 * SIZE(%ecx) + movq %mm2, 5 * SIZE(%ecx) + movq %mm3, 6 * SIZE(%ecx) + movq %mm3, 7 * SIZE(%ecx) + + addl $4 * SIZE, %edi + ALIGN_4 + +.L10: + movl %edi, BX + + movl C, %esi # coffset = c + movl A, AA # aoffset = a + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 8), BB +#endif + + movl BX, %eax + + prefetchnta 0 * SIZE(%eax) + prefetchnta 8 * SIZE(%eax) + + subl $-8 * SIZE, BX + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movapd 0 * SIZE(AA), %xmm0 + movapd 8 * SIZE(AA), %xmm1 + movapd 0 * SIZE(BB), %xmm2 + movapd 8 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + + prefetchw 1 * SIZE(%esi) + prefetchw 1 * SIZE(%esi, LDC) + prefetchw 1 * SIZE(%esi, LDC, 2) + prefetchw 1 * SIZE(%esi, %eax) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + +#if 1 + andl $-8, %eax + sall $4, %eax + je .L15 +.L1X: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + cmpl $128 * 1, %eax + jle .L12 + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + cmpl $128 * 2, %eax + jle .L12 + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + 
KERNEL7(16 * 2) + KERNEL8(16 * 2) + cmpl $128 * 3, %eax + jle .L12 + KERNEL1(16 * 3) + KERNEL2(16 * 3) + KERNEL3(16 * 3) + KERNEL4(16 * 3) + KERNEL5(16 * 3) + KERNEL6(16 * 3) + KERNEL7(16 * 3) + KERNEL8(16 * 3) + cmpl $128 * 4, %eax + jle .L12 + KERNEL1(16 * 4) + KERNEL2(16 * 4) + KERNEL3(16 * 4) + KERNEL4(16 * 4) + KERNEL5(16 * 4) + KERNEL6(16 * 4) + KERNEL7(16 * 4) + KERNEL8(16 * 4) + cmpl $128 * 5, %eax + jle .L12 + KERNEL1(16 * 5) + KERNEL2(16 * 5) + KERNEL3(16 * 5) + KERNEL4(16 * 5) + KERNEL5(16 * 5) + KERNEL6(16 * 5) + KERNEL7(16 * 5) + KERNEL8(16 * 5) + cmpl $128 * 6, %eax + jle .L12 + KERNEL1(16 * 6) + KERNEL2(16 * 6) + KERNEL3(16 * 6) + KERNEL4(16 * 6) + KERNEL5(16 * 6) + KERNEL6(16 * 6) + KERNEL7(16 * 6) + KERNEL8(16 * 6) + cmpl $128 * 7, %eax + jle .L12 + KERNEL1(16 * 7) + KERNEL2(16 * 7) + KERNEL3(16 * 7) + KERNEL4(16 * 7) + KERNEL5(16 * 7) + KERNEL6(16 * 7) + KERNEL7(16 * 7) + KERNEL8(16 * 7) + + addl $128 * 4 * SIZE, BB + addl $128 * 1 * SIZE, AA + subl $128 * 8, %eax + jg .L1X + jmp .L15 + +.L12: + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB + ALIGN_4 +#else + + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + addl $64 * SIZE, BB + addl $16 * SIZE, AA + decl %eax + jne .L12 + ALIGN_4 +#endif + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movapd ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movapd 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm5 + movapd 4 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 8 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + mulpd %xmm3, %xmm4 + movsd 0 * SIZE(%esi), %xmm0 + movhpd 1 * SIZE(%esi), %xmm0 + mulpd %xmm3, %xmm5 + movsd 0 * SIZE(%esi, LDC, 1), %xmm1 + movhpd 1 * SIZE(%esi, LDC, 1), %xmm1 + mulpd %xmm3, %xmm6 + movsd 0 * SIZE(%esi, LDC, 2), %xmm2 + movhpd 1 * SIZE(%esi, LDC, 2), %xmm2 + mulpd %xmm3, %xmm7 + movsd 0 * SIZE(%esi, %eax, 1), %xmm3 + movhpd 1 * SIZE(%esi, %eax, 1), %xmm3 + + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm5 + addpd %xmm2, %xmm6 + addpd %xmm3, %xmm7 +#else + mulpd %xmm3, %xmm4 + mulpd %xmm3, %xmm5 + mulpd %xmm3, %xmm6 + mulpd %xmm3, %xmm7 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movhpd %xmm4, 1 * SIZE(%esi) + movsd %xmm5, 0 * SIZE(%esi, LDC, 1) + movhpd %xmm5, 1 * SIZE(%esi, LDC, 1) + movsd %xmm6, 0 * SIZE(%esi, LDC, 2) + movhpd %xmm6, 1 * SIZE(%esi, LDC, 2) + movsd %xmm7, 0 * SIZE(%esi, %eax, 1) + movhpd %xmm7, 1 * SIZE(%esi, %eax, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 8), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L11 + ALIGN_4 + +.L20: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L29 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), 
AA + leal (BB, %eax, 8), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + leal (LDC, LDC, 2), %eax + + movsd 0 * SIZE(AA), %xmm0 + movsd 4 * SIZE(AA), %xmm1 + movsd 0 * SIZE(BB), %xmm2 + movsd 8 * SIZE(BB), %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm4 +#if defined(OPTERON) || defined(BARCELONA) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movsd 2 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm5 + movsd 4 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm6 + movsd 16 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm7 + movsd 1 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm4 + movsd 10 * SIZE(BB), %xmm3 + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm5 + movsd 12 * SIZE(BB), %xmm3 + mulsd %xmm0, %xmm3 + mulsd 14 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm6 + movsd 24 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm7 + movsd 2 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm4 + movsd 18 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm5 + movsd 20 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + mulsd 22 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm6 + movsd 32 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm7 + movsd 3 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm4 + movsd 26 * SIZE(BB), %xmm3 + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm5 + movsd 28 * SIZE(BB), %xmm3 + mulsd %xmm0, %xmm3 + mulsd 30 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm6 + movsd 40 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm7 + movsd 8 * SIZE(AA), %xmm0 +#if defined(OPTERON) || defined(BARCELONA) + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) +#endif + mulsd %xmm1, %xmm2 + addsd %xmm2, %xmm4 + movsd 34 * SIZE(BB), %xmm2 + mulsd %xmm1, %xmm2 + addsd %xmm2, %xmm5 + movsd 36 * SIZE(BB), %xmm2 + mulsd %xmm1, %xmm2 + mulsd 38 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm6 + movsd 48 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm7 + movsd 5 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm3 + addsd %xmm3, %xmm4 + movsd 42 * SIZE(BB), %xmm3 + mulsd %xmm1, %xmm3 + addsd %xmm3, %xmm5 + movsd 44 * SIZE(BB), %xmm3 + mulsd %xmm1, %xmm3 + mulsd 46 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm6 + movsd 56 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm7 + movsd 6 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm2 + addsd %xmm2, %xmm4 + movsd 50 * SIZE(BB), %xmm2 + mulsd %xmm1, %xmm2 + addsd %xmm2, %xmm5 + movsd 52 * SIZE(BB), %xmm2 + mulsd %xmm1, %xmm2 + mulsd 54 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm6 + movsd 64 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm7 + movsd 7 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm3 + addsd %xmm3, %xmm4 + movsd 58 * SIZE(BB), %xmm3 + mulsd %xmm1, %xmm3 + addsd %xmm3, %xmm5 + movsd 60 * SIZE(BB), %xmm3 + mulsd %xmm1, %xmm3 + mulsd 62 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm6 + movsd 72 * SIZE(BB), %xmm3 + addl $64 * SIZE, BB + addsd %xmm1, %xmm7 + movsd 12 * SIZE(AA), %xmm1 + addl $8 * SIZE, AA + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movsd ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L28 + +.L26: + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm4 + movsd 2 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm5 + movsd 4 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm6 + movsd 8 * SIZE(BB), 
%xmm2 + addsd %xmm0, %xmm7 + movsd 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + mulsd %xmm3, %xmm4 + movsd 0 * SIZE(%esi), %xmm0 + mulsd %xmm3, %xmm5 + movsd 0 * SIZE(%esi, LDC, 1), %xmm1 + mulsd %xmm3, %xmm6 + movsd 0 * SIZE(%esi, LDC, 2), %xmm2 + mulsd %xmm3, %xmm7 + movsd 0 * SIZE(%esi, %eax, 1), %xmm3 + + addsd %xmm0, %xmm4 + addsd %xmm1, %xmm5 + addsd %xmm2, %xmm6 + addsd %xmm3, %xmm7 +#else + mulsd %xmm3, %xmm4 + mulsd %xmm3, %xmm5 + mulsd %xmm3, %xmm6 + mulsd %xmm3, %xmm7 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movsd %xmm5, 0 * SIZE(%esi, LDC, 1) + movsd %xmm6, 0 * SIZE(%esi, LDC, 2) + movsd %xmm7, 0 * SIZE(%esi, %eax, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_4 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + + leal (, LDC, 4), %eax + addl %eax, C # c += 4 * ldc + decl J # j -- + jg .L01 + ALIGN_4 + +.L30: + testl $2, N + je .L60 + ALIGN_2 + +.L31: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + +/* Copying to Sub Buffer */ + movl K, %eax + leal BUFFER, %ecx + sarl $2, %eax + jle .L35 + ALIGN_4 + +.L32: +#ifdef PENTIUM4 +#ifdef HAVE_SSE3 + movddup 0 * SIZE(%edi), %xmm0 + movddup 1 * SIZE(%edi), %xmm1 + movddup 2 * SIZE(%edi), %xmm2 + movddup 3 * SIZE(%edi), %xmm3 + movddup 4 * SIZE(%edi), %xmm4 + movddup 5 * SIZE(%edi), %xmm5 + movddup 6 * SIZE(%edi), %xmm6 + movddup 7 * SIZE(%edi), %xmm7 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) + movapd %xmm2, 4 * SIZE(%ecx) + movapd %xmm3, 6 * SIZE(%ecx) + movapd %xmm4, 8 * SIZE(%ecx) + movapd %xmm5, 10 * SIZE(%ecx) + movapd %xmm6, 12 * SIZE(%ecx) + movapd %xmm7, 14 * SIZE(%ecx) +#else + movsd 0 * SIZE(%edi), %xmm0 + movsd 1 * SIZE(%edi), %xmm1 + movsd 2 * SIZE(%edi), %xmm2 + movsd 3 * SIZE(%edi), %xmm3 + movsd 4 * SIZE(%edi), %xmm4 + movsd 5 * SIZE(%edi), %xmm5 + movsd 6 * SIZE(%edi), %xmm6 + movsd 7 * SIZE(%edi), %xmm7 + + unpcklpd %xmm0, %xmm0 + unpckhpd %xmm1, %xmm1 + unpcklpd %xmm2, %xmm2 + unpckhpd %xmm3, %xmm3 + unpcklpd %xmm4, %xmm4 + unpckhpd %xmm5, %xmm5 + unpcklpd %xmm6, %xmm6 + unpckhpd %xmm7, %xmm7 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) + movapd %xmm2, 4 * SIZE(%ecx) + movapd %xmm3, 6 * SIZE(%ecx) + movapd %xmm4, 8 * SIZE(%ecx) + movapd %xmm5, 10 * SIZE(%ecx) + movapd %xmm6, 12 * SIZE(%ecx) + movapd %xmm7, 14 * SIZE(%ecx) +#endif + prefetcht0 80 * SIZE(%edi) + prefetcht1 112 * SIZE(%ecx) +#endif + +#if defined(OPTERON) || defined(BARCELONA) +#define COPYPREFETCH 40 + + prefetchnta (COPYPREFETCH) * SIZE(%edi) + + movq 0 * SIZE(%edi), %mm0 + movq 1 * SIZE(%edi), %mm1 + movq 2 * SIZE(%edi), %mm2 + movq 3 * SIZE(%edi), %mm3 + movq 4 * SIZE(%edi), %mm4 + movq 5 * SIZE(%edi), %mm5 + movq 6 * SIZE(%edi), %mm6 + movq 7 * SIZE(%edi), %mm7 + + movq %mm0, 0 * SIZE(%ecx) + movq %mm0, 1 * SIZE(%ecx) + movq %mm1, 2 * SIZE(%ecx) + movq %mm1, 3 * SIZE(%ecx) + movq %mm2, 4 * SIZE(%ecx) + movq %mm2, 5 * SIZE(%ecx) + movq %mm3, 6 * SIZE(%ecx) + movq %mm3, 7 * SIZE(%ecx) + + movq %mm4, 8 * SIZE(%ecx) + movq %mm4, 9 * SIZE(%ecx) + movq %mm5, 10 * SIZE(%ecx) + movq %mm5, 11 * SIZE(%ecx) + movq %mm6, 12 * SIZE(%ecx) + movq %mm6, 13 * SIZE(%ecx) + movq 
%mm7, 14 * SIZE(%ecx) + movq %mm7, 15 * SIZE(%ecx) +#endif + addl $ 8 * SIZE, %edi + addl $16 * SIZE, %ecx + decl %eax + jne .L32 + ALIGN_2 + +.L35: + movl K, %eax + andl $3, %eax + BRANCH + jle .L40 + ALIGN_2 + +.L36: +#ifdef PENTIUM4 +#ifdef HAVE_SSE3 + movddup 0 * SIZE(%edi), %xmm0 + movddup 1 * SIZE(%edi), %xmm1 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) +#else + movsd 0 * SIZE(%edi), %xmm0 + movsd 1 * SIZE(%edi), %xmm1 + + unpcklpd %xmm0, %xmm0 + unpckhpd %xmm1, %xmm1 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) +#endif +#endif + +#if defined(OPTERON) || defined(BARCELONA) + movq 0 * SIZE(%edi), %mm0 + movq 1 * SIZE(%edi), %mm1 + + movq %mm0, 0 * SIZE(%ecx) + movq %mm0, 1 * SIZE(%ecx) + movq %mm1, 2 * SIZE(%ecx) + movq %mm1, 3 * SIZE(%ecx) +#endif + addl $2 * SIZE, %edi + addl $4 * SIZE, %ecx + decl %eax + jne .L36 + ALIGN_4 + +.L40: + movl C, %esi # coffset = c + movl A, AA # aoffset = a + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L50 + ALIGN_4 + +.L41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movapd 0 * SIZE(AA), %xmm0 + movapd 8 * SIZE(AA), %xmm1 + movapd 0 * SIZE(BB), %xmm2 + movapd 8 * SIZE(BB), %xmm3 + +#ifdef HAVE_3DNOW + prefetchw 2 * SIZE(%esi) + prefetchw 2 * SIZE(%esi, LDC) +#endif + +#ifdef PENTIUM4 + prefetchnta 4 * SIZE(%esi) + prefetchnta 4 * SIZE(%esi, LDC) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L45 + ALIGN_4 + +.L42: + mulpd %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + + mulpd %xmm0, %xmm2 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movapd 4 * SIZE(AA), %xmm0 + + mulpd %xmm0, %xmm3 + mulpd 10 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd 12 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movapd 6 * SIZE(AA), %xmm0 + + mulpd %xmm0, %xmm3 + mulpd 14 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm7 + movapd 16 * SIZE(AA), %xmm0 + +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) +#endif + mulpd %xmm1, %xmm2 + mulpd 18 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + movapd 20 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movapd 10 * SIZE(AA), %xmm1 + + mulpd %xmm1, %xmm2 + mulpd 22 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + movapd 32 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm7 + movapd 12 * SIZE(AA), %xmm1 + + mulpd %xmm1, %xmm3 + mulpd 26 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm4 + movapd 28 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movapd 14 * SIZE(AA), %xmm1 + + mulpd %xmm1, %xmm3 + mulpd 30 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm6 + movapd 40 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm7 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L42 + ALIGN_4 + +.L45: +#ifndef TRMMKERNEL + movl 
K, %eax +#else + movl KKK, %eax +#endif + movapd ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L46 + ALIGN_4 + +.L48: +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + movhpd 1 * SIZE(%esi), %xmm0 + movsd 0 * SIZE(%esi, LDC, 1), %xmm1 + movhpd 1 * SIZE(%esi, LDC, 1), %xmm1 +#endif + + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + mulpd %xmm3, %xmm4 + mulpd %xmm3, %xmm5 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm5 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movhpd %xmm4, 1 * SIZE(%esi) + movsd %xmm5, 0 * SIZE(%esi, LDC, 1) + movhpd %xmm5, 1 * SIZE(%esi, LDC, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L41 + ALIGN_4 + +.L50: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L59 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + leal (LDC, LDC, 2), %eax + + movsd 0 * SIZE(AA), %xmm0 + movsd 4 * SIZE(AA), %xmm1 + movsd 0 * SIZE(BB), %xmm2 + movsd 8 * SIZE(BB), %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + mulsd %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) +#endif + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movsd 1 * SIZE(AA), %xmm0 + + mulsd %xmm0, %xmm2 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm6 + movsd 16 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm7 + movsd 2 * SIZE(AA), %xmm0 + + mulsd %xmm0, %xmm3 + mulsd 10 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm4 + movsd 12 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm5 + movsd 3 * SIZE(AA), %xmm0 + + mulsd %xmm0, %xmm3 + mulsd 14 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm6 + movsd 24 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm7 + movsd 8 * SIZE(AA), %xmm0 + + mulsd %xmm1, %xmm2 + mulsd 18 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm4 + movsd 20 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + movsd 5 * SIZE(AA), %xmm1 + + mulsd %xmm1, %xmm2 + mulsd 22 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm6 + movsd 32 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm7 + movsd 6 * SIZE(AA), %xmm1 + + mulsd %xmm1, %xmm3 + mulsd 26 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm4 + movsd 28 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm5 + movsd 7 * SIZE(AA), %xmm1 + + mulsd %xmm1, %xmm3 + mulsd 30 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm6 + movsd 40 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm7 + movsd 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L52 + ALIGN_4 + +.L55: +#ifndef TRMMKERNEL + movl 
K, %eax +#else + movl KKK, %eax +#endif + movsd ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L58 + +.L56: + mulsd %xmm0, %xmm2 + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movsd 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: + addsd %xmm6, %xmm4 + addsd %xmm7, %xmm5 + + mulpd %xmm3, %xmm4 + mulpd %xmm3, %xmm5 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + movsd 0 * SIZE(%esi, LDC, 1), %xmm1 + + addsd %xmm0, %xmm4 + addsd %xmm1, %xmm5 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movsd %xmm5, 0 * SIZE(%esi, LDC, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_4 + +.L59: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leal (, LDC, 2), %eax + addl %eax, C # c += 4 * ldc + ALIGN_4 + +.L60: + testl $1, N + je .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + leal BUFFER, %ecx + sarl $3, %eax + jle .L65 + ALIGN_4 + +.L62: +#ifdef PENTIUM4 +#ifdef HAVE_SSE3 + movddup 0 * SIZE(%edi), %xmm0 + movddup 1 * SIZE(%edi), %xmm1 + movddup 2 * SIZE(%edi), %xmm2 + movddup 3 * SIZE(%edi), %xmm3 + movddup 4 * SIZE(%edi), %xmm4 + movddup 5 * SIZE(%edi), %xmm5 + movddup 6 * SIZE(%edi), %xmm6 + movddup 7 * SIZE(%edi), %xmm7 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) + movapd %xmm2, 4 * SIZE(%ecx) + movapd %xmm3, 6 * SIZE(%ecx) + movapd %xmm4, 8 * SIZE(%ecx) + movapd %xmm5, 10 * SIZE(%ecx) + movapd %xmm6, 12 * SIZE(%ecx) + movapd %xmm7, 14 * SIZE(%ecx) +#else + movsd 0 * SIZE(%edi), %xmm0 + movsd 1 * SIZE(%edi), %xmm1 + movsd 2 * SIZE(%edi), %xmm2 + movsd 3 * SIZE(%edi), %xmm3 + movsd 4 * SIZE(%edi), %xmm4 + movsd 5 * SIZE(%edi), %xmm5 + movsd 6 * SIZE(%edi), %xmm6 + movsd 7 * SIZE(%edi), %xmm7 + + unpcklpd %xmm0, %xmm0 + unpckhpd %xmm1, %xmm1 + unpcklpd %xmm2, %xmm2 + unpckhpd %xmm3, %xmm3 + unpcklpd %xmm4, %xmm4 + unpckhpd %xmm5, %xmm5 + unpcklpd %xmm6, %xmm6 + unpckhpd %xmm7, %xmm7 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) + movapd %xmm2, 4 * SIZE(%ecx) + movapd %xmm3, 6 * SIZE(%ecx) + movapd %xmm4, 8 * SIZE(%ecx) + movapd %xmm5, 10 * SIZE(%ecx) + movapd %xmm6, 12 * SIZE(%ecx) + movapd %xmm7, 14 * SIZE(%ecx) +#endif + prefetcht1 80 * SIZE(%edi) + prefetcht0 112 * SIZE(%ecx) +#endif + +#if defined(OPTERON) || defined(BARCELONA) +#define COPYPREFETCH 40 + + prefetchnta (COPYPREFETCH) * SIZE(%edi) + + movq 0 * SIZE(%edi), %mm0 + movq 1 * SIZE(%edi), %mm1 + movq 2 * SIZE(%edi), %mm2 + movq 3 * SIZE(%edi), %mm3 + movq 4 * SIZE(%edi), %mm4 + movq 5 * SIZE(%edi), %mm5 + movq 6 * SIZE(%edi), %mm6 + movq 7 * SIZE(%edi), %mm7 + + movq %mm0, 0 * SIZE(%ecx) + movq %mm0, 1 * SIZE(%ecx) + movq %mm1, 2 * SIZE(%ecx) + movq %mm1, 3 * SIZE(%ecx) + movq %mm2, 4 * SIZE(%ecx) + movq %mm2, 5 * SIZE(%ecx) + movq %mm3, 6 * SIZE(%ecx) + movq %mm3, 7 * SIZE(%ecx) + + movq %mm4, 8 * SIZE(%ecx) + movq %mm4, 9 * SIZE(%ecx) + movq %mm5, 10 * SIZE(%ecx) + movq %mm5, 11 * SIZE(%ecx) + movq %mm6, 12 * SIZE(%ecx) + movq %mm6, 13 * SIZE(%ecx) + movq %mm7, 14 * SIZE(%ecx) + movq %mm7, 15 * SIZE(%ecx) +#endif + addl $ 8 * SIZE, %edi + addl $16 * SIZE, %ecx + decl %eax + jne .L62 + ALIGN_2 + +.L65: + movl K, %eax + andl $7, 
%eax + BRANCH + jle .L70 + ALIGN_2 + +.L66: +#ifdef PENTIUM4 +#ifdef HAVE_SSE3 + movddup 0 * SIZE(%edi), %xmm0 + movapd %xmm0, 0 * SIZE(%ecx) +#else + movsd 0 * SIZE(%edi), %xmm0 + unpcklpd %xmm0, %xmm0 + movapd %xmm0, 0 * SIZE(%ecx) +#endif +#endif + +#if defined(OPTERON) || defined(BARCELONA) + movq 0 * SIZE(%edi), %mm0 + + movq %mm0, 0 * SIZE(%ecx) + movq %mm0, 1 * SIZE(%ecx) +#endif + addl $1 * SIZE, %edi + addl $2 * SIZE, %ecx + decl %eax + jne .L66 + ALIGN_4 + +.L70: + movl C, %esi # coffset = c + movl A, AA # aoffset = a + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L80 + ALIGN_4 + +.L71: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movapd 0 * SIZE(AA), %xmm0 + movapd 8 * SIZE(AA), %xmm1 + movapd 0 * SIZE(BB), %xmm2 + movapd 8 * SIZE(BB), %xmm3 + +#ifdef HAVE_3DNOW + prefetchw 2 * SIZE(%esi) +#endif + +#ifdef PENTIUM4 + prefetchnta 2 * SIZE(%esi) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movapd 16 * SIZE(BB), %xmm2 + + movapd 2 * SIZE(AA), %xmm0 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm4 + movapd 4 * SIZE(AA), %xmm0 + mulpd 4 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm4 + movapd 6 * SIZE(AA), %xmm0 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm4 + + movapd 16 * SIZE(AA), %xmm0 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) +#endif + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movapd 24 * SIZE(BB), %xmm3 + + movapd 10 * SIZE(AA), %xmm1 + mulpd 10 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm4 + movapd 12 * SIZE(AA), %xmm1 + mulpd 12 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm4 + movapd 14 * SIZE(AA), %xmm1 + mulpd 14 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm4 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L72 + ALIGN_4 + +.L75: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movapd ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L78 + ALIGN_3 + +.L76: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movapd 2 * SIZE(AA), %xmm0 + movapd 2 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: + mulpd %xmm3, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + movhpd 1 * SIZE(%esi), %xmm0 + + addpd %xmm0, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movhpd %xmm4, 1 * SIZE(%esi) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L71 + ALIGN_4 + +.L80: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L999 + +#if !defined(TRMMKERNEL) || \ + 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + leal (LDC, LDC, 2), %eax + + movsd 0 * SIZE(AA), %xmm0 + movsd 4 * SIZE(AA), %xmm1 + movsd 0 * SIZE(BB), %xmm2 + movsd 8 * SIZE(BB), %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L85 + ALIGN_4 + +.L82: + mulsd %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movsd 1 * SIZE(AA), %xmm0 + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movsd 16 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movsd 2 * SIZE(AA), %xmm0 + mulsd 4 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm6 + movsd 3 * SIZE(AA), %xmm0 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm7 + movsd 8 * SIZE(AA), %xmm0 + mulsd %xmm1, %xmm3 + movsd 5 * SIZE(AA), %xmm1 + mulsd 10 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm4 + movsd 24 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm5 + movsd 6 * SIZE(AA), %xmm1 + mulsd 12 * SIZE(BB), %xmm1 + addsd %xmm1, %xmm6 + movsd 7 * SIZE(AA), %xmm1 + mulsd 14 * SIZE(BB), %xmm1 + addsd %xmm1, %xmm7 + movsd 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L82 + ALIGN_4 + +.L85: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movsd ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L88 + +.L86: + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm4 + movsd 2 * SIZE(BB), %xmm2 + movsd 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L86 + ALIGN_4 + +.L88: + addsd %xmm5, %xmm4 + addsd %xmm7, %xmm6 + addsd %xmm6, %xmm4 + + mulsd %xmm3, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + addsd %xmm0, %xmm4 +#endif + movsd %xmm4, 0 * SIZE(%esi) + ALIGN_4 + +.L999: + movl OLD_STACK, %esp + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/gemm_kernel_2x4_sse3.S b/kernel/x86/gemm_kernel_2x4_sse3.S new file mode 100644 index 0000000000..e2732daf88 --- /dev/null +++ b/kernel/x86/gemm_kernel_2x4_sse3.S @@ -0,0 +1,1635 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#define A 24 + STACK + ARGS(%esp) +#define ARG_B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define ARG_LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define BX 4 + STACK(%esp) +#define KK 8 + STACK(%esp) +#define KKK 12 + STACK(%esp) + +#ifdef PENTIUM4 +#define PREFETCH_R (8 * 4) +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#ifdef PENTIUMM +#define PREFETCH_R (8 * 4) +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi + +#define KERNEL1(address) \ + mulpd %xmm0, %xmm2; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + addpd %xmm2, %xmm4; \ + movddup 1 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm5; \ + movddup 2 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm6; \ + movddup 3 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + movapd 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ + addpd %xmm2, %xmm7; \ + movddup 4 * SIZE + (address) * 2 * SIZE(BB), %xmm2 + +#define KERNEL2(address) \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm4; \ + movddup 5 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm5; \ + movddup 6 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm6; \ + movddup 7 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + movapd 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ + addpd %xmm2, %xmm7; \ + movddup 16 * SIZE + (address) * 2 * SIZE(BB), %xmm2 + +#define KERNEL3(address) \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm4; \ + movddup 9 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm5; \ + movddup 10 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm6; \ + movddup 11 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + movapd 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ + addpd %xmm3, %xmm7; \ + movddup 12 * SIZE + (address) * 2 * SIZE(BB), %xmm3 + +#define KERNEL4(address) \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm4; \ + movddup 13 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm5; \ + movddup 14 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + 
mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm6; \ + movddup 15 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + movapd 16 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ + addpd %xmm3, %xmm7; \ + movddup 24 * SIZE + (address) * 2 * SIZE(BB), %xmm3 + +#define KERNEL5(address) \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm4; \ + movddup 17 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm5; \ + movddup 18 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm6; \ + movddup 19 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + movapd 10 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ + addpd %xmm2, %xmm7 + +#define KERNEL6(address) \ + movddup 20 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm4; \ + movddup 21 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm5; \ + movddup 22 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm6; \ + movddup 23 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + movapd 12 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ + addpd %xmm2, %xmm7; \ + movddup 32 * SIZE + (address) * 2 * SIZE(BB), %xmm2 + +#define KERNEL7(address) \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm4; \ + movddup 25 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm5; \ + movddup 26 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm6; \ + movddup 27 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + movapd 14 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ + addpd %xmm3, %xmm7; \ + movddup 28 * SIZE + (address) * 2 * SIZE(BB), %xmm3 + +#define KERNEL8(address) \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm4; \ + movddup 29 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm5; \ + movddup 30 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm6; \ + movddup 31 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + movapd 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ + addpd %xmm3, %xmm7; \ + movddup 40 * SIZE + (address) * 2 * SIZE(BB), %xmm3 + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + +#ifdef TRMMKERNEL + movl OFFSET, %eax +#ifndef LEFT + negl %eax +#endif + movl %eax, KK +#endif + + leal (, LDC, SIZE), LDC + + movl N, %eax + sarl $2, %eax + movl %eax, J + jle .L30 + ALIGN_2 + +.L10: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + sall $BASE_SHIFT + 2, %eax + leal (B, %eax), %eax + movl %eax, BX + + movl C, %esi # coffset = c + movl A, AA # aoffset = a + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 4), BB +#endif + + movl BX, %eax + prefetcht2 0 * SIZE(%eax) + subl $-4 * SIZE, BX + + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movddup 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movddup 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + + leal (LDC, LDC, 2), %eax + +#ifdef PENTIUM4 + prefetchnta 3 * SIZE(%esi) + 
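+/* These prefetchnta instructions touch the four columns of C (c, c + ldc,
+   c + 2 * ldc and c + 3 * ldc) that the current 2x4 tile will update.
+   The TRMMKERNEL block a few lines further down computes KKK, the k-trip
+   count this tile actually needs: the full K for plain GEMM, K - KK for
+   one triangular orientation, and KK plus the tile dimension (2 or 4)
+   for the other. */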
prefetchnta 3 * SIZE(%esi, LDC, 1) + prefetchnta 3 * SIZE(%esi, LDC, 2) + prefetchnta 3 * SIZE(%esi, %eax, 1) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + +#ifdef CORE_PRESCOTT + andl $-8, %eax + sall $4, %eax + je .L15 + +.L1X: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + cmpl $128 * 1, %eax + jle .L12 + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + cmpl $128 * 2, %eax + jle .L12 + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + cmpl $128 * 3, %eax + jle .L12 + KERNEL1(16 * 3) + KERNEL2(16 * 3) + KERNEL3(16 * 3) + KERNEL4(16 * 3) + KERNEL5(16 * 3) + KERNEL6(16 * 3) + KERNEL7(16 * 3) + KERNEL8(16 * 3) + cmpl $128 * 4, %eax + jle .L12 + KERNEL1(16 * 4) + KERNEL2(16 * 4) + KERNEL3(16 * 4) + KERNEL4(16 * 4) + KERNEL5(16 * 4) + KERNEL6(16 * 4) + KERNEL7(16 * 4) + KERNEL8(16 * 4) + cmpl $128 * 5, %eax + jle .L12 + KERNEL1(16 * 5) + KERNEL2(16 * 5) + KERNEL3(16 * 5) + KERNEL4(16 * 5) + KERNEL5(16 * 5) + KERNEL6(16 * 5) + KERNEL7(16 * 5) + KERNEL8(16 * 5) + cmpl $128 * 6, %eax + jle .L12 + KERNEL1(16 * 6) + KERNEL2(16 * 6) + KERNEL3(16 * 6) + KERNEL4(16 * 6) + KERNEL5(16 * 6) + KERNEL6(16 * 6) + KERNEL7(16 * 6) + KERNEL8(16 * 6) + cmpl $128 * 7, %eax + jle .L12 + KERNEL1(16 * 7) + KERNEL2(16 * 7) + KERNEL3(16 * 7) + KERNEL4(16 * 7) + KERNEL5(16 * 7) + KERNEL6(16 * 7) + KERNEL7(16 * 7) + KERNEL8(16 * 7) +#if 1 + cmpl $128 * 8, %eax + jle .L12 + KERNEL1(16 * 8) + KERNEL2(16 * 8) + KERNEL3(16 * 8) + KERNEL4(16 * 8) + KERNEL5(16 * 8) + KERNEL6(16 * 8) + KERNEL7(16 * 8) + KERNEL8(16 * 8) + cmpl $128 * 9, %eax + jle .L12 + KERNEL1(16 * 9) + KERNEL2(16 * 9) + KERNEL3(16 * 9) + KERNEL4(16 * 9) + KERNEL5(16 * 9) + KERNEL6(16 * 9) + KERNEL7(16 * 9) + KERNEL8(16 * 9) + cmpl $128 * 10, %eax + jle .L12 + KERNEL1(16 * 10) + KERNEL2(16 * 10) + KERNEL3(16 * 10) + KERNEL4(16 * 10) + KERNEL5(16 * 10) + KERNEL6(16 * 10) + KERNEL7(16 * 10) + KERNEL8(16 * 10) + cmpl $128 * 11, %eax + jle .L12 + KERNEL1(16 * 11) + KERNEL2(16 * 11) + KERNEL3(16 * 11) + KERNEL4(16 * 11) + KERNEL5(16 * 11) + KERNEL6(16 * 11) + KERNEL7(16 * 11) + KERNEL8(16 * 11) + cmpl $128 * 12, %eax + jle .L12 + KERNEL1(16 * 12) + KERNEL2(16 * 12) + KERNEL3(16 * 12) + KERNEL4(16 * 12) + KERNEL5(16 * 12) + KERNEL6(16 * 12) + KERNEL7(16 * 12) + KERNEL8(16 * 12) + cmpl $128 * 13, %eax + jle .L12 + KERNEL1(16 * 13) + KERNEL2(16 * 13) + KERNEL3(16 * 13) + KERNEL4(16 * 13) + KERNEL5(16 * 13) + KERNEL6(16 * 13) + KERNEL7(16 * 13) + KERNEL8(16 * 13) + cmpl $128 * 14, %eax + jle .L12 + KERNEL1(16 * 14) + KERNEL2(16 * 14) + KERNEL3(16 * 14) + KERNEL4(16 * 14) + KERNEL5(16 * 14) + KERNEL6(16 * 14) + KERNEL7(16 * 14) + KERNEL8(16 * 14) + cmpl $128 * 15, %eax + jle .L12 + KERNEL1(16 * 15) + KERNEL2(16 * 15) + KERNEL3(16 * 15) + KERNEL4(16 * 15) + KERNEL5(16 * 15) + KERNEL6(16 * 15) + KERNEL7(16 * 15) + KERNEL8(16 * 15) +#else + addl $32 * 4 * SIZE, AA + addl $32 * 8 * SIZE, BB + subl $128 * 8, %eax + jg .L1X +#endif + +.L12: + leal (AA, %eax, 1), AA # * 16 + leal (BB, %eax, 2), BB # * 64 + +#else + + sarl $3, %eax + je .L15 + ALIGN_4 + 
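+/* Generic (non-CORE_PRESCOTT) path.  Each pass of the .L12 loop below
+   consumes eight packed k-steps: 16 doubles of the 2-wide A strip and
+   32 doubles of the 4-wide B panel, broadcasting B with movddup and
+   holding the 2x4 block of C in %xmm4-%xmm7, one register per column.
+   In rough scalar terms (a sketch only; a and b stand for the packed
+   panels read through AA and BB):
+
+       for (l = 0; l < 8; l++)
+         for (j = 0; j < 4; j++)
+           for (i = 0; i < 2; i++)
+             c[i][j] += a[2 * l + i] * b[4 * l + j];
+
+   The .L16 tail handles the remaining k & 7 steps one at a time, and
+   .L18/.L18x scale the block by alpha, add the existing C values (unless
+   built as TRMMKERNEL) and store it back, choosing aligned or unaligned
+   accesses from the alignment of C and LDC. */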
+.L12: + mulpd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 3 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movddup 4 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 5 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm5 + movddup 6 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 7 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movddup 16 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm4 + movddup 9 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm5 + movddup 10 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm6 + movddup 11 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + movapd 6 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm7 + movddup 12 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm4 + movddup 13 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm5 + movddup 14 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm6 + movddup 15 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + movapd 16 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm7 + movddup 24 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm4 + movddup 17 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm5 + movddup 18 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm6 + movddup 19 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + movapd 10 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm7 + movddup 20 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm4 + movddup 21 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm5 + movddup 22 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm6 + movddup 23 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + movapd 12 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm7 + movddup 32 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movddup 25 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm5 + movddup 26 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm6 + movddup 27 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 14 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm7 + movddup 28 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movddup 29 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm5 + movddup 30 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm6 + movddup 31 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 24 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm7 + movddup 40 * SIZE(BB), %xmm3 + + addl $32 * SIZE, BB + addl $16 * SIZE, AA + decl %eax + jne .L12 + ALIGN_4 +#endif + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movddup ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 3 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movddup 4 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + SHUFPD_2 %xmm0, %xmm0 + SHUFPD_2 %xmm1, %xmm1 + SHUFPD_2 %xmm2, %xmm2 + SHUFPD_2 %xmm3, %xmm3 + + mulpd %xmm3, %xmm4 + mulpd %xmm3, %xmm5 + mulpd %xmm3, %xmm6 + mulpd %xmm3, %xmm7 + + movl %esi, %eax + orl LDC, %eax + testl $15, %eax + NOBRANCH + jne .L18x + + leal (LDC, LDC, 2), 
%eax + +#ifndef TRMMKERNEL + movapd 0 * SIZE(%esi), %xmm0 + movapd 0 * SIZE(%esi, LDC, 1), %xmm1 + movapd 0 * SIZE(%esi, LDC, 2), %xmm2 + movapd 0 * SIZE(%esi, %eax, 1), %xmm3 + + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm5 + addpd %xmm2, %xmm6 + addpd %xmm3, %xmm7 +#endif + + movapd %xmm4, 0 * SIZE(%esi) + movapd %xmm5, 0 * SIZE(%esi, LDC, 1) + movapd %xmm6, 0 * SIZE(%esi, LDC, 2) + movapd %xmm7, 0 * SIZE(%esi, %eax, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L11 + jmp .L20 + ALIGN_4 + +.L18x: + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + movhpd 1 * SIZE(%esi), %xmm0 + movsd 0 * SIZE(%esi, LDC, 1), %xmm1 + movhpd 1 * SIZE(%esi, LDC, 1), %xmm1 + movsd 0 * SIZE(%esi, LDC, 2), %xmm2 + movhpd 1 * SIZE(%esi, LDC, 2), %xmm2 + movsd 0 * SIZE(%esi, %eax, 1), %xmm3 + movhpd 1 * SIZE(%esi, %eax, 1), %xmm3 + + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm5 + addpd %xmm2, %xmm6 + addpd %xmm3, %xmm7 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movhpd %xmm4, 1 * SIZE(%esi) + movsd %xmm5, 0 * SIZE(%esi, LDC, 1) + movhpd %xmm5, 1 * SIZE(%esi, LDC, 1) + movsd %xmm6, 0 * SIZE(%esi, LDC, 2) + movhpd %xmm6, 1 * SIZE(%esi, LDC, 2) + movsd %xmm7, 0 * SIZE(%esi, %eax, 1) + movhpd %xmm7, 1 * SIZE(%esi, %eax, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L11 + ALIGN_3 + +.L20: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L29 + + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 4), BB +#endif + + movddup 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movddup 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $4, %eax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movddup 1 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movddup 2 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 10 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd 12 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movddup 3 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 14 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm7 + movddup 4 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 18 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + 
movapd 20 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movddup 5 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 22 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 32 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movddup 6 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 26 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd 28 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movddup 7 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 30 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 40 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm7 + movddup 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd 34 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + movapd 36 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movddup 9 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm2 + mulpd 38 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + movapd 48 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm7 + movddup 10 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 42 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm4 + movapd 44 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movddup 11 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 46 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm6 + movapd 56 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm7 + movddup 12 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm2 + mulpd 50 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + movapd 52 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movddup 13 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm2 + mulpd 54 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + movapd 64 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm7 + movddup 14 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 58 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm4 + movapd 60 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movddup 15 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 62 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm6 + movapd 72 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm7 + movddup 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movddup ALPHA, %xmm3 + andl $15, %eax # if (k & 1) + BRANCH + je .L28 + +.L26: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movddup 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + + decl %eax + jg .L26 + ALIGN_4 + +.L28: + leal (%esi, LDC, 1), %eax + + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + mulpd %xmm3, %xmm4 + mulpd %xmm3, %xmm5 + +#ifndef TRMMKERNEL + +#ifdef PENTIUM4 + SHUFPD_2 %xmm0, %xmm0 + SHUFPD_2 %xmm1, %xmm1 +#endif + + movsd 0 * SIZE(%esi), %xmm0 + movhpd 0 * SIZE(%eax), %xmm0 + movsd 0 * SIZE(%esi, LDC, 2), %xmm1 + movhpd 0 * SIZE(%eax, LDC, 2), %xmm1 + + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm5 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movhpd %xmm4, 0 * SIZE(%eax) + movsd %xmm5, 0 * SIZE(%esi, LDC, 2) + movhpd %xmm5, 0 * SIZE(%eax, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_4 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + + leal (, LDC, 4), %eax + movl BB, B + addl %eax, C # c += 4 * ldc + decl J # j -- + jg .L10 + ALIGN_4 + +.L30: + testl $2, N + je .L60 + + movl C, %esi # coffset = c + movl A, AA # aoffset = a + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L50 + ALIGN_4 + +.L41: +#if 
!defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), BB +#endif + + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movddup 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movddup 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifdef HAVE_3DNOW + prefetchw 2 * SIZE(%esi) + prefetchw 2 * SIZE(%esi, LDC) +#endif + +#ifdef PENTIUM4 + prefetchnta 3 * SIZE(%esi) + prefetchnta 3 * SIZE(%esi, LDC) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L45 + ALIGN_4 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 3 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movddup 4 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 5 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 6 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + movddup 6 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 7 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 16 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movddup 16 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movddup 9 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 10 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm5 + movddup 10 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm6 + movddup 11 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 12 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm7 + movddup 12 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movddup 13 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 14 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm5 + movddup 14 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm6 + movddup 15 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 24 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm7 + movddup 24 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L42 + ALIGN_4 + +.L45: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movddup ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L46 + ALIGN_4 + +.L48: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + mulpd %xmm3, %xmm4 + mulpd %xmm3, %xmm5 + +#ifndef TRMMKERNEL +#ifdef PENTIUM4 + SHUFPD_2 %xmm0, %xmm0 + SHUFPD_2 %xmm1, %xmm1 +#endif + + movsd 0 * SIZE(%esi), %xmm0 + movhpd 1 * SIZE(%esi), %xmm0 + movsd 0 * SIZE(%esi, LDC, 1), %xmm1 + movhpd 1 * SIZE(%esi, LDC, 1), %xmm1 + + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm5 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movhpd %xmm4, 1 * SIZE(%esi) + movsd %xmm5, 0 * SIZE(%esi, LDC, 1) + movhpd %xmm5, 1 * SIZE(%esi, LDC, 1) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && 
!defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L41 + ALIGN_4 + +.L50: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L59 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), BB +#endif + + movddup 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movddup 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $4, %eax + je .L55 + ALIGN_4 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm0, %xmm2 + movddup 1 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + mulpd 2 * SIZE(BB), %xmm0 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movddup 2 * SIZE(AA), %xmm0 + mulpd 4 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm6 + movddup 3 * SIZE(AA), %xmm0 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm7 + movddup 4 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + movddup 5 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm4 + mulpd 10 * SIZE(BB), %xmm0 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movddup 6 * SIZE(AA), %xmm0 + mulpd 12 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm6 + movddup 7 * SIZE(AA), %xmm0 + mulpd 14 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm7 + movddup 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + movddup 9 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm4 + mulpd 18 * SIZE(BB), %xmm1 + movapd 32 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movddup 10 * SIZE(AA), %xmm1 + mulpd 20 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm6 + movddup 11 * SIZE(AA), %xmm1 + mulpd 22 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm7 + movddup 12 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + movddup 13 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm4 + mulpd 26 * SIZE(BB), %xmm1 + movapd 40 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movddup 14 * SIZE(AA), %xmm1 + mulpd 28 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm6 + movddup 15 * SIZE(AA), %xmm1 + mulpd 30 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm7 + movddup 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L52 + ALIGN_4 + +.L55: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movddup ALPHA, %xmm3 + andl $15, %eax # if (k & 1) + BRANCH + je .L58 + +.L56: + mulpd %xmm0, %xmm2 + movddup 1 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + movapd 2 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + addpd %xmm6, %xmm4 + + mulpd %xmm3, %xmm4 + +#ifndef TRMMKERNEL +#ifdef PENTIUM4 + SHUFPD_2 %xmm0, %xmm0 +#endif + + movsd 0 * SIZE(%esi), %xmm0 + movhpd 0 * SIZE(%esi, LDC, 1), %xmm0 + + addpd %xmm0, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movhpd %xmm4, 0 * SIZE(%esi, LDC, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), 
BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_4 + +.L59: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leal (, LDC, 2), %eax + movl BB, B + addl %eax, C # c += 4 * ldc + ALIGN_4 + +.L60: + testl $1, N + je .L999 + + movl C, %esi # coffset = c + movl A, AA # aoffset = a + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L80 + ALIGN_4 + +.L71: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), BB +#endif + + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movddup 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movddup 4 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifdef PENTIUM4 + prefetchnta 3 * SIZE(%esi) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm2, %xmm0 + movddup 1 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm4 + movapd 16 * SIZE(AA), %xmm0 + mulpd 2 * SIZE(AA), %xmm2 + addpd %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + mulpd 4 * SIZE(AA), %xmm2 + addpd %xmm2, %xmm6 + movddup 3 * SIZE(BB), %xmm2 + mulpd 6 * SIZE(AA), %xmm2 + addpd %xmm2, %xmm7 + movddup 8 * SIZE(BB), %xmm2 + mulpd %xmm3, %xmm1 + movddup 5 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm4 + movapd 24 * SIZE(AA), %xmm1 + mulpd 10 * SIZE(AA), %xmm3 + addpd %xmm3, %xmm5 + movddup 6 * SIZE(BB), %xmm3 + mulpd 12 * SIZE(AA), %xmm3 + addpd %xmm3, %xmm6 + movddup 7 * SIZE(BB), %xmm3 + mulpd 14 * SIZE(AA), %xmm3 + addpd %xmm3, %xmm7 + movddup 12 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $ 8 * SIZE, BB + decl %eax + jne .L72 + ALIGN_4 + +.L75: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movddup ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L78 + ALIGN_3 + +.L76: + mulpd %xmm2, %xmm0 + movddup 1 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm4 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + addpd %xmm6, %xmm4 + + mulpd %xmm3, %xmm4 + +#ifndef TRMMKERNEL +#ifdef PENTIUM4 + SHUFPD_2 %xmm0, %xmm0 +#endif + + movsd 0 * SIZE(%esi), %xmm0 + movhpd 1 * SIZE(%esi), %xmm0 + + addpd %xmm0, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movhpd %xmm4, 1 * SIZE(%esi) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 1), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L71 + ALIGN_4 + +.L80: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 
1), BB +#endif + + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $4, %eax + je .L85 + ALIGN_4 + +.L82: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + mulpd 2 * SIZE(BB), %xmm0 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 4 * SIZE(AA), %xmm0 + mulpd 4 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm6 + movapd 6 * SIZE(AA), %xmm0 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm7 + movapd 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm3 + movapd 10 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm4 + mulpd 10 * SIZE(BB), %xmm1 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movapd 12 * SIZE(AA), %xmm1 + mulpd 12 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm6 + movapd 14 * SIZE(AA), %xmm1 + mulpd 14 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm7 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L82 + ALIGN_4 + +.L85: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movddup ALPHA, %xmm3 + andl $15, %eax # if (k & 1) + BRANCH + je .L88 + +.L86: + mulsd %xmm0, %xmm2 + movsd 1 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + movsd 1 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L86 + ALIGN_4 + +.L88: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + addpd %xmm6, %xmm4 + + haddpd %xmm4, %xmm4 + + mulsd %xmm3, %xmm4 + +#ifndef TRMMKERNEL +#ifdef PENTIUM4 + SHUFPD_2 %xmm0, %xmm0 +#endif + + movsd 0 * SIZE(%esi), %xmm0 + + addsd %xmm0, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/gemm_kernel_4x2_core2.S b/kernel/x86/gemm_kernel_4x2_core2.S new file mode 100644 index 0000000000..641b5fc467 --- /dev/null +++ b/kernel/x86/gemm_kernel_4x2_core2.S @@ -0,0 +1,1304 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define OLD_M 4 + STACK + ARGS(%esi) +#define OLD_N 8 + STACK + ARGS(%esi) +#define OLD_K 12 + STACK + ARGS(%esi) +#define OLD_ALPHA 16 + STACK + ARGS(%esi) +#define OLD_A 24 + STACK + ARGS(%esi) +#define OLD_B 28 + STACK + ARGS(%esi) +#define OLD_C 32 + STACK + ARGS(%esi) +#define OLD_LDC 36 + STACK + ARGS(%esi) +#define OLD_OFFT 40 + STACK + ARGS(%esi) + +#define ALPHA 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define BX 40(%esp) +#define OLD_STACK 44(%esp) +#define OFFSET 48(%esp) +#define KK 52(%esp) +#define KKK 56(%esp) +#define BUFFER 256(%esp) + +#define PREFETCH_R (8 * 16 + 0) +#define PREFETCH_W (PREFETCH_R * 2) + +#define PREFETCHSIZE (8 * 7 + 4) +#define PREFETCH prefetcht0 + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi +#define C1 %esi +#define I %ebx + + PROLOGUE + PROFCODE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + movl %esp, %esi # save old stack + + subl $512 + LOCAL_BUFFER_SIZE, %esp + andl $-4096, %esp # align stack + + STACK_TOUCHING + + movl OLD_M, %ebx + movl OLD_N, %eax + movl OLD_K, %ecx + movl OLD_A, %edx + movsd OLD_ALPHA, %xmm3 +#ifdef TRMMKERNEL + movd OLD_OFFT, %mm4 +#endif + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK + + unpcklpd %xmm3, %xmm3 + movl OLD_B, B + movl OLD_C, %ebx + + movapd %xmm3, ALPHA + movl %ebx, C + movl OLD_LDC, LDC +#ifdef TRMMKERNEL + movd %mm4, OFFSET + movd %mm4, KK +#ifndef LEFT + negl KK +#endif +#endif + + subl $-16 * SIZE, A + subl $-16 * SIZE, B + + leal (, LDC, SIZE), LDC + + sarl $1, %eax + movl %eax, J + jle .L40 + ALIGN_4 + +.L01: + leal 16 * SIZE + BUFFER, BB + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + sarl $2, %eax + jle .L05 + ALIGN_4 + +.L02: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + movddup -14 * SIZE(B), %xmm2 + movddup -13 * SIZE(B), %xmm3 + movddup -12 * SIZE(B), %xmm4 + movddup -11 * SIZE(B), %xmm5 + movddup -10 * SIZE(B), %xmm6 + movddup -9 * SIZE(B), %xmm7 + + prefetcht0 (PREFETCH_R + 0) * SIZE(B) + + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm1, -14 * SIZE(BB) + movapd %xmm2, -12 * SIZE(BB) + movapd %xmm3, -10 * SIZE(BB) + + movapd %xmm4, -8 * SIZE(BB) + movapd %xmm5, -6 * SIZE(BB) + movapd %xmm6, -4 * SIZE(BB) + movapd %xmm7, -2 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $16 * SIZE, BB + decl %eax + jne .L02 + ALIGN_4 + +.L05: + movl K, %eax + andl $3, %eax + BRANCH + jle .L10 + 
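+/* The .L06 loop below copies the remaining (K & 3) rows of the two-column
+   B panel into the on-stack BUFFER, duplicating every element so the
+   compute loops can read B with aligned movapd loads; the unrolled .L02
+   loop above did the same four k-steps at a time.  In effect (a sketch;
+   b and bb stand for the B and BUFFER pointers):
+
+       for (l = 0; l < (k & 3); l++) {
+         bb[4 * l + 0] = bb[4 * l + 1] = b[2 * l + 0];
+         bb[4 * l + 2] = bb[4 * l + 3] = b[2 * l + 1];
+       }
+*/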
ALIGN_4 + +.L06: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm1, -14 * SIZE(BB) + addl $2 * SIZE, B + addl $4 * SIZE, BB + decl %eax + jne .L06 + ALIGN_4 + +.L10: + movl B, BX + + movl C, C1 + movl A, AA + movl M, I + sarl $2, I + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal 16 * SIZE + BUFFER, BB +#else + leal 16 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB /* because it's doubled */ +#endif + + movapd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -8 * SIZE(AA), %xmm3 + pxor %xmm6, %xmm6 + prefetcht0 3 * SIZE(C1) + pxor %xmm7, %xmm7 + prefetcht0 7 * SIZE(C1, LDC) + movapd %xmm1, %xmm2 + + movl BX, %eax + prefetcht0 (%eax) + subl $-8 * SIZE, %eax + movl %eax, BX + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movapd -14 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + addpd %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movapd -12 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm1 + movapd -12 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm7 + PADDING; + movapd %xmm2, %xmm1 + + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movapd -10 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm0 + addpd %xmm0, %xmm5 + movapd -10 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm6 + movapd -8 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + PADDING; + movapd 0 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + PADDING; + movapd %xmm1, %xmm2 + + mulpd %xmm3, %xmm1 + addpd %xmm1, %xmm4 + movapd -6 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm5 + movapd -6 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm6 + movapd -4 * SIZE(BB), %xmm2 + mulpd %xmm3, %xmm1 + movapd -4 * SIZE(AA), %xmm3 + addpd %xmm1, %xmm7 + PADDING; + movapd %xmm2, %xmm1 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm4 + movapd -2 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm5 + movapd -2 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm1 + addpd %xmm1, %xmm6 + PADDING; + movapd 0 * SIZE(BB), %xmm1 + mulpd %xmm3, %xmm2 + movapd 8 * SIZE(AA), %xmm3 + addpd %xmm2, %xmm7 + PADDING; + movapd %xmm1, %xmm2 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movapd 2 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movapd 4 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm1 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm7 + PADDING; + movapd %xmm2, %xmm1 + + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movapd 6 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm0 + addpd %xmm0, %xmm5 + movapd 6 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm6 + movapd 8 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movapd 16 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + PADDING; + movapd %xmm1, %xmm2 + + mulpd %xmm3, %xmm1 + addpd %xmm1, %xmm4 + movapd 10 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm5 + movapd 10 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm6 + movapd 12 * SIZE(BB), %xmm2 + mulpd %xmm3, %xmm1 + movapd 12 * SIZE(AA), %xmm3 + 
addpd %xmm1, %xmm7 + PADDING; + movapd %xmm2, %xmm1 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm4 + movapd 14 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm3 + subl $-32 * SIZE, BB + addpd %xmm3, %xmm5 + movapd 14 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm1 + addpd %xmm1, %xmm6 + movapd -16 * SIZE(BB), %xmm1 + mulpd %xmm3, %xmm2 + movapd 24 * SIZE(AA), %xmm3 + addpd %xmm2, %xmm7 + PADDING; + movapd %xmm1, %xmm2 + + subl $-32 * SIZE, AA + decl %eax + BRANCH + jne .L12 + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movapd ALPHA, %xmm3 + + andl $7, %eax + BRANCH + je .L18 + ALIGN_4 + +.L16: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movapd -14 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + addpd %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm1 + movapd -12 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm6 + addpd %xmm1, %xmm7 + movapd -12 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + mulpd %xmm3, %xmm4 + mulpd %xmm3, %xmm5 + mulpd %xmm3, %xmm6 + mulpd %xmm3, %xmm7 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + movhpd 1 * SIZE(C1), %xmm0 + movsd 2 * SIZE(C1), %xmm2 + movhpd 3 * SIZE(C1), %xmm2 + + movsd 0 * SIZE(C1, LDC), %xmm1 + movhpd 1 * SIZE(C1, LDC), %xmm1 + movsd 2 * SIZE(C1, LDC), %xmm3 + movhpd 3 * SIZE(C1, LDC), %xmm3 + + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm5 + addpd %xmm2, %xmm6 + addpd %xmm3, %xmm7 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhpd %xmm4, 1 * SIZE(C1) + movsd %xmm6, 2 * SIZE(C1) + movhpd %xmm6, 3 * SIZE(C1) + + movsd %xmm5, 0 * SIZE(C1, LDC) + movhpd %xmm5, 1 * SIZE(C1, LDC) + movsd %xmm7, 2 * SIZE(C1, LDC) + movhpd %xmm7, 3 * SIZE(C1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $4, KK +#endif + + addl $4 * SIZE, C1 + decl I + jg .L11 + ALIGN_4 + +.L20: + movl M, I + testl $2, I + jle .L30 + +.L21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leal 16 * SIZE + BUFFER, BB +#else + leal 16 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB /* because it's doubled */ +#endif + + movapd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -8 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movapd -8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax + addl $2, %eax + movl %eax, KKK +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BB), %xmm0 + addpd %xmm1, %xmm4 + movapd -12 * SIZE(BB), %xmm1 + addpd %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm1 + mulpd -10 * SIZE(BB), %xmm0 + addpd %xmm1, %xmm6 + movapd 0 * SIZE(BB), %xmm1 + addpd %xmm0, %xmm7 + movapd -12 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd -6 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd -4 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movapd -10 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd -2 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 8 * SIZE(BB), %xmm3 + addpd %xmm0, 
%xmm7 + movapd 0 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm1 + mulpd 2 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm4 + movapd 4 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm5 + movapd -6 * SIZE(AA), %xmm2 + mulpd %xmm2, %xmm1 + mulpd 6 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm6 + movapd 16 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm7 + movapd -4 * SIZE(AA), %xmm2 + mulpd %xmm2, %xmm3 + mulpd 10 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm4 + movapd 12 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm5 + movapd -2 * SIZE(AA), %xmm2 + mulpd %xmm2, %xmm3 + mulpd 14 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm6 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm7 + movapd 8 * SIZE(AA), %xmm2 + + subl $-16 * SIZE, AA + addl $ 32 * SIZE, BB + decl %eax + jne .L22 + ALIGN_4 + +.L25: + movapd ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L28 + ALIGN_4 + +.L26: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BB), %xmm0 + addpd %xmm1, %xmm4 + movapd -12 * SIZE(BB), %xmm1 + addpd %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + mulpd %xmm3, %xmm4 + mulpd %xmm3, %xmm5 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + movhpd 1 * SIZE(C1), %xmm0 + + movsd 0 * SIZE(C1, LDC), %xmm1 + movhpd 1 * SIZE(C1, LDC), %xmm1 + + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm5 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhpd %xmm4, 1 * SIZE(C1) + movsd %xmm5, 0 * SIZE(C1, LDC) + movhpd %xmm5, 1 * SIZE(C1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, C1 + ALIGN_4 + +.L30: + movl M, I + testl $1, I + jle .L39 + +.L31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leal 16 * SIZE + BUFFER, BB +#else + leal 16 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB /* because it's doubled */ +#endif + + movsd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movsd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movsd -12 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movsd -8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L35 + ALIGN_4 + +.L32: + mulsd %xmm0, %xmm1 + mulsd -14 * SIZE(BB), %xmm0 + addsd %xmm1, %xmm4 + movsd -12 * SIZE(BB), %xmm1 + addsd %xmm0, %xmm5 + movsd -15 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm1 + mulsd -10 * SIZE(BB), %xmm0 + addsd %xmm1, %xmm6 + movsd 0 * SIZE(BB), %xmm1 + addsd %xmm0, %xmm7 + movsd -14 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + mulsd -6 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm4 + movsd -4 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm5 + movsd -13 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + mulsd -2 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm6 + movsd 8 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm7 + movsd -8 * SIZE(AA), %xmm0 + mulsd %xmm2, %xmm1 + mulsd 2 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm4 + movsd 4 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm5 + movsd -11 * SIZE(AA), 
%xmm2 + mulsd %xmm2, %xmm1 + mulsd 6 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm6 + movsd 16 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm7 + movsd -10 * SIZE(AA), %xmm2 + mulsd %xmm2, %xmm3 + mulsd 10 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm4 + movsd 12 * SIZE(BB), %xmm3 + addsd %xmm2, %xmm5 + movsd -9 * SIZE(AA), %xmm2 + mulsd %xmm2, %xmm3 + mulsd 14 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm6 + movsd 24 * SIZE(BB), %xmm3 + addsd %xmm2, %xmm7 + movsd -4 * SIZE(AA), %xmm2 + + subl $-8 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L32 + ALIGN_4 + +.L35: + movsd ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulsd %xmm0, %xmm1 + mulsd -14 * SIZE(BB), %xmm0 + addsd %xmm1, %xmm4 + movsd -12 * SIZE(BB), %xmm1 + addsd %xmm0, %xmm5 + movsd -15 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L36 + ALIGN_4 + +.L38: + addsd %xmm6, %xmm4 + addsd %xmm7, %xmm5 + + mulsd %xmm3, %xmm4 + mulsd %xmm3, %xmm5 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + movsd 0 * SIZE(C1, LDC), %xmm1 + + addsd %xmm0, %xmm4 + addsd %xmm1, %xmm5 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movsd %xmm5, 0 * SIZE(C1, LDC) + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leal (, LDC, 2), %eax + addl %eax, C + decl J + jg .L01 + ALIGN_4 + +.L40: + movl N, %eax + testl $1, %eax + jle .L999 + ALIGN_4 + +.L41: + leal 16 * SIZE + BUFFER, BB + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + sarl $3, %eax + jle .L45 + ALIGN_4 + +.L42: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + movddup -14 * SIZE(B), %xmm2 + movddup -13 * SIZE(B), %xmm3 + movddup -12 * SIZE(B), %xmm4 + movddup -11 * SIZE(B), %xmm5 + movddup -10 * SIZE(B), %xmm6 + movddup -9 * SIZE(B), %xmm7 + + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm1, -14 * SIZE(BB) + movapd %xmm2, -12 * SIZE(BB) + movapd %xmm3, -10 * SIZE(BB) + movapd %xmm4, -8 * SIZE(BB) + movapd %xmm5, -6 * SIZE(BB) + movapd %xmm6, -4 * SIZE(BB) + movapd %xmm7, -2 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $16 * SIZE, BB + decl %eax + jne .L42 + ALIGN_4 + +.L45: + movl K, %eax + andl $7, %eax + BRANCH + jle .L50 + ALIGN_4 + +.L46: + movddup -16 * SIZE(B), %xmm0 + + movapd %xmm0, -16 * SIZE(BB) + addl $1 * SIZE, B + addl $2 * SIZE, BB + decl %eax + jne .L46 + ALIGN_4 + +.L50: + movl C, C1 + movl A, AA + movl M, I + sarl $2, I + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leal 16 * SIZE + BUFFER, BB +#else + leal 16 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 2), BB +#endif + + movapd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -8 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movapd -8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + + prefetcht0 3 * SIZE(C1) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AA), %xmm1 + addpd %xmm0, %xmm4 + movapd -12 * SIZE(AA), %xmm0 + addpd 
%xmm1, %xmm6 + movapd -14 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + mulpd -10 * SIZE(AA), %xmm1 + addpd %xmm0, %xmm5 + movapd 0 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm7 + movapd -12 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm2 + mulpd -6 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm4 + movapd -4 * SIZE(AA), %xmm2 + addpd %xmm1, %xmm6 + movapd -10 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm2 + mulpd -2 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm5 + movapd 8 * SIZE(AA), %xmm2 + addpd %xmm1, %xmm7 + movapd 0 * SIZE(BB), %xmm1 + mulpd %xmm3, %xmm0 + mulpd 2 * SIZE(AA), %xmm3 + addpd %xmm0, %xmm4 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm6 + movapd -6 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm0 + mulpd 6 * SIZE(AA), %xmm3 + addpd %xmm0, %xmm5 + movapd 16 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm7 + movapd -4 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm2 + mulpd 10 * SIZE(AA), %xmm3 + addpd %xmm2, %xmm4 + movapd 12 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm6 + movapd -2 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm2 + mulpd 14 * SIZE(AA), %xmm3 + addpd %xmm2, %xmm5 + movapd 24 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm7 + movapd 8 * SIZE(BB), %xmm3 + + addl $ 32 * SIZE, AA + subl $-16 * SIZE, BB + decl %eax + jne .L52 + ALIGN_4 + +.L55: + movapd ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L58 + ALIGN_4 + +.L56: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AA), %xmm1 + addpd %xmm0, %xmm4 + movapd -12 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm6 + movapd -14 * SIZE(BB), %xmm1 + + addl $4 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + + mulpd %xmm3, %xmm4 + mulpd %xmm3, %xmm6 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + movhpd 1 * SIZE(C1), %xmm0 + movsd 2 * SIZE(C1), %xmm2 + movhpd 3 * SIZE(C1), %xmm2 + + addpd %xmm0, %xmm4 + addpd %xmm2, %xmm6 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhpd %xmm4, 1 * SIZE(C1) + movsd %xmm6, 2 * SIZE(C1) + movhpd %xmm6, 3 * SIZE(C1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $4, KK +#endif + + addl $4 * SIZE, C1 + decl I + jg .L51 + ALIGN_4 + +.L60: + movl M, I + testl $2, I + jle .L70 + +.L61: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leal 16 * SIZE + BUFFER, BB +#else + leal 16 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + + movapd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -8 * SIZE(AA), %xmm2 + movapd -8 * SIZE(BB), %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L65 + ALIGN_4 + +.L62: + mulpd %xmm0, %xmm1 + movapd -14 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm4 + movapd -14 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm1 + movapd -12 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm5 + movapd -12 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm1 + movapd -10 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm4 + movapd -10 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm1 + movapd 0 * 
SIZE(AA), %xmm0 + addpd %xmm1, %xmm5 + movapd 0 * SIZE(BB), %xmm1 + mulpd %xmm2, %xmm3 + movapd -6 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm4 + movapd -6 * SIZE(BB), %xmm3 + mulpd %xmm2, %xmm3 + movapd -4 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm5 + movapd -4 * SIZE(BB), %xmm3 + mulpd %xmm2, %xmm3 + movapd -2 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm4 + movapd -2 * SIZE(BB), %xmm3 + mulpd %xmm2, %xmm3 + movapd 8 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + + subl $-16 * SIZE, AA + subl $-16 * SIZE, BB + decl %eax + jne .L62 + ALIGN_4 + +.L65: + movapd ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulpd %xmm0, %xmm1 + movapd -14 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm4 + movapd -14 * SIZE(BB), %xmm1 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L66 + ALIGN_4 + +.L68: + addpd %xmm5, %xmm4 + mulpd %xmm3, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + movhpd 1 * SIZE(C1), %xmm0 + + addpd %xmm0, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhpd %xmm4, 1 * SIZE(C1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, C1 + ALIGN_4 + +.L70: + movl M, I + testl $1, I + jle .L79 + +.L71: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leal 16 * SIZE + BUFFER, BB +#else + leal 16 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + + movsd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movsd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movsd -8 * SIZE(BB), %xmm3 + movsd -12 * SIZE(AA), %xmm2 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax + addl $1, %eax + movl %eax, KKK +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + mulsd %xmm0, %xmm1 + movsd -15 * SIZE(AA), %xmm0 + addsd %xmm1, %xmm4 + movsd -14 * SIZE(BB), %xmm1 + mulsd %xmm0, %xmm1 + movsd -14 * SIZE(AA), %xmm0 + addsd %xmm1, %xmm5 + movsd -12 * SIZE(BB), %xmm1 + mulsd %xmm0, %xmm1 + movsd -13 * SIZE(AA), %xmm0 + addsd %xmm1, %xmm4 + movsd -10 * SIZE(BB), %xmm1 + mulsd %xmm0, %xmm1 + movsd -8 * SIZE(AA), %xmm0 + addsd %xmm1, %xmm5 + movsd -0 * SIZE(BB), %xmm1 + mulsd %xmm2, %xmm3 + movsd -11 * SIZE(AA), %xmm2 + addsd %xmm3, %xmm4 + movsd -6 * SIZE(BB), %xmm3 + mulsd %xmm2, %xmm3 + movsd -10 * SIZE(AA), %xmm2 + addsd %xmm3, %xmm5 + movsd -4 * SIZE(BB), %xmm3 + mulsd %xmm2, %xmm3 + movsd -9 * SIZE(AA), %xmm2 + addsd %xmm3, %xmm4 + movsd -2 * SIZE(BB), %xmm3 + mulsd %xmm2, %xmm3 + movsd -4 * SIZE(AA), %xmm2 + addsd %xmm3, %xmm5 + movsd 8 * SIZE(BB), %xmm3 + + subl $ -8 * SIZE, AA + subl $-16 * SIZE, BB + decl %eax + jne .L72 + ALIGN_4 + +.L75: + movsd ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulsd %xmm0, %xmm1 + movsd -15 * SIZE(AA), %xmm0 + addsd %xmm1, %xmm4 + movsd -14 * SIZE(BB), %xmm1 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: + addsd %xmm5, %xmm4 + mulsd %xmm3, %xmm4 + +#ifndef 
TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + addsd %xmm0, %xmm4 +#endif + movsd %xmm4, 0 * SIZE(C1) + ALIGN_4 + +.L79: + addl LDC, C + ALIGN_4 + + +.L999: + movl OLD_STACK, %esp + + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/gemm_kernel_4x2_sse2.S b/kernel/x86/gemm_kernel_4x2_sse2.S new file mode 100644 index 0000000000..2e67afaf9a --- /dev/null +++ b/kernel/x86/gemm_kernel_4x2_sse2.S @@ -0,0 +1,1539 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define PREFETCHSIZE (8 * 4) + +#if !defined(HAVE_SSE2) || !defined(HAVE_MMX) +#error You have to check your configuration. 
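+/*
+   Both SSE2 and MMX support are required here: the floating-point math runs
+   in SSE2 registers, while MMX registers (%mm0-%mm7) are used to stage the
+   scalar arguments and to issue the software prefetches, with an EMMS before
+   returning.  The HAVE_* flags are presumably provided by the build
+   configuration generated at build time.
+*/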
+#endif + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_ALPHA 16 + STACK + ARGS(%esi) +#define STACK_A 24 + STACK + ARGS(%esi) +#define STACK_B 28 + STACK + ARGS(%esi) +#define STACK_C 32 + STACK + ARGS(%esi) +#define STACK_LDC 36 + STACK + ARGS(%esi) +#define STACK_OFFT 40 + STACK + ARGS(%esi) + +#define ALPHA 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define BX 40(%esp) +#define OLD_STACK 44(%esp) +#define OFFSET 48(%esp) +#define KK 52(%esp) +#define KKK 56(%esp) +#define BUFFER 128(%esp) + +#define B %edi +#define LDC %ebp + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#define AA %edx +#define BB %ecx + +#define KERNEL1(address) \ + mulpd %xmm0, %xmm2; \ + mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 0 * SIZE + (address) * SIZE(BB), %xmm2; \ + movq (PREFETCHSIZE + 0) * SIZE + (address) * SIZE(AA), %mm2; \ + addpd %xmm0, %xmm5; \ + movapd 2 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 4 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulpd %xmm0, %xmm2; \ + mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 6 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 16 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + movq (PREFETCHSIZE + 8) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm1, %xmm3; \ + mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 8 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 10 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 12 * SIZE + (address) * SIZE(AA), %xmm1 + +#define KERNEL4(address) \ + mulpd %xmm1, %xmm3; \ + mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 14 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 24 * SIZE + (address) * SIZE(AA), %xmm1 + +#define KERNEL5(address) \ + mulpd %xmm0, %xmm2; \ + mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \ + movq (PREFETCHSIZE + 16) * SIZE + (address) * SIZE(AA), %mm2; \ + addpd %xmm0, %xmm5; \ + movapd 18 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 20 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL6(address) \ + mulpd %xmm0, %xmm2; \ + mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; 
\ + movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 22 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 32 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 32 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL7(address) \ + movq (PREFETCHSIZE + 24) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm1, %xmm3; \ + mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 26 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 28 * SIZE + (address) * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulpd %xmm1, %xmm3; \ + mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 30 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 40 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 40 * SIZE + (address) * SIZE(AA), %xmm1 + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movd STACK_M, %mm0 + movl STACK_N, %eax + movd STACK_K, %mm1 + movd STACK_A, %mm2 + movq STACK_ALPHA, %mm7 + movl STACK_B, B + movd STACK_C, %mm3 + movl STACK_LDC, LDC +#ifdef TRMMKERNEL + movd STACK_OFFT, %mm4 +#endif + + movq %mm7, 0 * SIZE + ALPHA + movq %mm7, 1 * SIZE + ALPHA + + movd %mm1, K + movl %eax, N + movd %mm0, M + movd %mm2, A + movd %mm3, C + movl %esi, OLD_STACK +#ifdef TRMMKERNEL + movd %mm4, OFFSET + movd %mm4, KK +#ifndef LEFT + negl KK +#endif +#endif + + sall $BASE_SHIFT, LDC + sarl $1, %eax # j = (n >> 1) + movl %eax, J + jle .L100 + ALIGN_2 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + +/* Copying to Sub Buffer */ + leal BUFFER, %ecx + movl K, %eax + sarl $2, %eax + jle .L03 + ALIGN_2 + +.L02: + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm2 + movsd 3 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm4 + movsd 5 * SIZE(B), %xmm5 + movsd 6 * SIZE(B), %xmm6 + movsd 7 * SIZE(B), %xmm7 + + unpcklpd %xmm0, %xmm0 + unpcklpd %xmm1, %xmm1 + unpcklpd %xmm2, %xmm2 + unpcklpd %xmm3, %xmm3 + unpcklpd %xmm4, %xmm4 + unpcklpd %xmm5, %xmm5 + unpcklpd %xmm6, %xmm6 + unpcklpd %xmm7, %xmm7 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) + movapd %xmm2, 4 * SIZE(%ecx) + movapd %xmm3, 6 * SIZE(%ecx) + movapd %xmm4, 8 * SIZE(%ecx) + movapd %xmm5, 10 * SIZE(%ecx) + movapd %xmm6, 12 * SIZE(%ecx) + movapd %xmm7, 14 * SIZE(%ecx) + + prefetcht0 104 * SIZE(B) + + addl $ 8 * SIZE, B + subl $-16 * SIZE, %ecx + decl %eax + BRANCH + jne .L02 + ALIGN_2 + +.L03: + movl K, %eax + andl $3, %eax + BRANCH + jle .L05 + ALIGN_4 + +.L04: + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + + unpcklpd %xmm0, %xmm0 + unpcklpd %xmm1, %xmm1 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) + + addl $2 * SIZE, B + addl $4 * SIZE, %ecx + decl %eax + BRANCH + jne .L04 + ALIGN_4 + +.L05: + movl B, BX 
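+
+/*
+   The copy loop above (.L02/.L04) expands the packed panel of B for the
+   current two columns into BUFFER, duplicating every element into an aligned
+   pair (unpcklpd) so the compute loop can use movapd loads.  What follows is
+   the main 4x2 block: for each pair of C columns (counter J) and each group
+   of four C rows, the K loop accumulates a 4x2 tile of A*B in %xmm4-%xmm7,
+   unrolled eight k-steps at a time through the KERNEL1-KERNEL8 macros above.
+   Roughly, per tile (an illustrative C sketch only; these names are not
+   symbols in this file):
+
+       for (i = 0; i < 4; i++)
+         for (j = 0; j < 2; j++) {
+           double t = 0.0;
+           for (k = 0; k < K; k++) t += a[i][k] * b[k][j];
+           c[i][j] += alpha * t;
+         }
+
+   Under TRMMKERNEL the read-modify-write of C is skipped and KK/KKK restrict
+   the k range to the triangular part.
+*/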
+ + movl C, %esi # coffset = c + movl A, %edx # aoffset = a + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + NOBRANCH + jle .L30 + ALIGN_4 + +.L10: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movapd 0 * SIZE + BUFFER, %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE + BUFFER, %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#else + + leal BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#endif + + prefetchnta 3 * SIZE(%esi) + prefetchnta 3 * SIZE(%esi, LDC) + + movl BX, %eax + prefetcht2 0 * SIZE(%eax) + subl $-8 * SIZE, %eax + movl %eax, BX + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + + +#ifdef PENTIUM4 + andl $-8, %eax + NOBRANCH + je .L12 + sall $3, %eax + .align 8 + +.L1X: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + cmpl $64 * 1, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 1) + KERNEL2(32 * 1) + KERNEL3(32 * 1) + KERNEL4(32 * 1) + KERNEL5(32 * 1) + KERNEL6(32 * 1) + KERNEL7(32 * 1) + KERNEL8(32 * 1) + cmpl $64 * 2, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 2) + KERNEL2(32 * 2) + KERNEL3(32 * 2) + KERNEL4(32 * 2) + KERNEL5(32 * 2) + KERNEL6(32 * 2) + KERNEL7(32 * 2) + KERNEL8(32 * 2) + cmpl $64 * 3, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 3) + KERNEL2(32 * 3) + KERNEL3(32 * 3) + KERNEL4(32 * 3) + KERNEL5(32 * 3) + KERNEL6(32 * 3) + KERNEL7(32 * 3) + KERNEL8(32 * 3) + cmpl $64 * 4, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 4) + KERNEL2(32 * 4) + KERNEL3(32 * 4) + KERNEL4(32 * 4) + KERNEL5(32 * 4) + KERNEL6(32 * 4) + KERNEL7(32 * 4) + KERNEL8(32 * 4) + cmpl $64 * 5, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 5) + KERNEL2(32 * 5) + KERNEL3(32 * 5) + KERNEL4(32 * 5) + KERNEL5(32 * 5) + KERNEL6(32 * 5) + KERNEL7(32 * 5) + KERNEL8(32 * 5) + cmpl $64 * 6, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 6) + KERNEL2(32 * 6) + KERNEL3(32 * 6) + KERNEL4(32 * 6) + KERNEL5(32 * 6) + KERNEL6(32 * 6) + KERNEL7(32 * 6) + KERNEL8(32 * 6) + cmpl $64 * 7, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 7) + KERNEL2(32 * 7) + KERNEL3(32 * 7) + KERNEL4(32 * 7) + KERNEL5(32 * 7) + KERNEL6(32 * 7) + KERNEL7(32 * 7) + KERNEL8(32 * 7) + + addl $64 * 4 * SIZE, AA + addl $64 * 4 * SIZE, BB + subl $64 * 8, %eax + BRANCH + jg .L1X + +.L11: + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB + +#else + sarl $3, %eax + je .L12 + +.L11: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + + addl $32 * SIZE, %ecx + addl $32 * SIZE, %edx + decl %eax + jne .L11 +#endif + +.L12: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movapd ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + +.L13: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 0 * SIZE(BB), %xmm2 + addpd %xmm0, 
%xmm5 + movapd 2 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movapd 4 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA # aoffset += 8 + addl $4 * SIZE, BB # boffset1 += 8 + subl $1, %eax + jg .L13 + ALIGN_4 + +.L14: + mulpd %xmm3, %xmm4 + mulpd %xmm3, %xmm5 + mulpd %xmm3, %xmm6 + mulpd %xmm3, %xmm7 + + movl %esi, %eax + orl LDC, %eax + testl $15, %eax + NOBRANCH + jne .L18x + +#ifndef TRMMKERNEL + movapd 0 * SIZE(%esi), %xmm0 + movapd 2 * SIZE(%esi), %xmm1 + movapd 0 * SIZE(%esi, LDC), %xmm2 + movapd 2 * SIZE(%esi, LDC), %xmm3 + + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm6 + addpd %xmm2, %xmm5 + addpd %xmm3, %xmm7 +#endif + + movapd %xmm4, 0 * SIZE(%esi) + movapd %xmm6, 2 * SIZE(%esi) + movapd %xmm5, 0 * SIZE(%esi, LDC) + movapd %xmm7, 2 * SIZE(%esi, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $4, KK +#endif + + addl $4 * SIZE, %esi # coffset += 4 + decl %ebx # i -- + BRANCH + jg .L10 + jmp .L30 + ALIGN_2 + +.L18x: +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + movhpd 1 * SIZE(%esi), %xmm0 + movsd 2 * SIZE(%esi), %xmm1 + movhpd 3 * SIZE(%esi), %xmm1 + + movsd 0 * SIZE(%esi, LDC), %xmm2 + movhpd 1 * SIZE(%esi, LDC), %xmm2 + movsd 2 * SIZE(%esi, LDC), %xmm3 + movhpd 3 * SIZE(%esi, LDC), %xmm3 + + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm6 + addpd %xmm2, %xmm5 + addpd %xmm3, %xmm7 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movhpd %xmm4, 1 * SIZE(%esi) + movsd %xmm6, 2 * SIZE(%esi) + movhpd %xmm6, 3 * SIZE(%esi) + + movsd %xmm5, 0 * SIZE(%esi, LDC) + movhpd %xmm5, 1 * SIZE(%esi, LDC) + movsd %xmm7, 2 * SIZE(%esi, LDC) + movhpd %xmm7, 3 * SIZE(%esi, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $4, KK +#endif + + addl $4 * SIZE, %esi # coffset += 4 + decl %ebx # i -- + BRANCH + jg .L10 + ALIGN_2 + +.L30: + movl M, %ebx + testl $2, %ebx + jle .L50 + + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, %ecx + + movapd 0 * SIZE + BUFFER, %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE + BUFFER, %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + + pxor %xmm7, %xmm7 +#else + + leal BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax + addl $2, %eax + movl %eax, KKK +#endif + sarl $3, %eax + je .L32 + +.L31: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + mulpd 
%xmm0, %xmm2 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movapd 4 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 10 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd 12 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movapd 6 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 14 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm7 + movapd 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd 18 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + movapd 20 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movapd 10 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm2 + mulpd 22 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + movapd 32 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm7 + movapd 12 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 26 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm4 + movapd 28 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movapd 14 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 30 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm6 + movapd 40 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm7 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + BRANCH + decl %eax + jne .L31 + +.L32: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movapd ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L34 + +.L33: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA # aoffset += 8 + addl $4 * SIZE, BB # boffset1 += 8 + decl %eax + BRANCH + jg .L33 + ALIGN_4 + +.L34: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + mulpd %xmm3, %xmm4 + mulpd %xmm3, %xmm5 + +#ifndef TRMMKERNEL + SHUFPD_1 %xmm0, %xmm0 + movsd 0 * SIZE(%esi), %xmm0 + movhpd 1 * SIZE(%esi), %xmm0 + + SHUFPD_1 %xmm2, %xmm2 + movsd 0 * SIZE(%esi, LDC), %xmm2 + movhpd 1 * SIZE(%esi, LDC), %xmm2 + + addpd %xmm0, %xmm4 + addpd %xmm2, %xmm5 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movhpd %xmm4, 1 * SIZE(%esi) + + movsd %xmm5, 0 * SIZE(%esi, LDC) + movhpd %xmm5, 1 * SIZE(%esi, LDC) + + addl $2 * SIZE, %esi # coffset += 4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + ALIGN_2 + +.L50: + movl M, %ebx + testl $1, %ebx + jle .L99 + + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, %ecx + + movapd 0 * SIZE + BUFFER, %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE + BUFFER, %xmm3 + pxor %xmm6, %xmm6 + movsd 4 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#else + + leal BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movsd 4 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L52 + +.L51: + mulsd %xmm0, %xmm2 + mulsd 2 * 
SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movsd 1 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm2 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movsd 16 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movsd 2 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + mulsd 10 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm4 + movsd 12 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm5 + movsd 3 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + mulsd 14 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm4 + movsd 24 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm5 + movsd 8 * SIZE(AA), %xmm0 + mulsd %xmm1, %xmm2 + mulsd 18 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm4 + movsd 20 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + movsd 5 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm2 + mulsd 22 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm4 + movsd 32 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + movsd 6 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm3 + mulsd 26 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm4 + movsd 28 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm5 + movsd 7 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm3 + mulsd 30 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm4 + movsd 40 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm5 + movsd 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $32 * SIZE, BB + BRANCH + decl %eax + jne .L51 + +.L52: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movsd ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L54 + +.L53: + mulsd %xmm0, %xmm2 + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movsd 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA # aoffset += 8 + addl $4 * SIZE, BB # boffset1 += 8 + decl %eax + BRANCH + jg .L53 + ALIGN_4 + +.L54: + addsd %xmm6, %xmm4 + addsd %xmm7, %xmm5 + + mulsd %xmm3, %xmm4 + mulsd %xmm3, %xmm5 + +#ifndef TRMMKERNEL + addsd 0 * SIZE(%esi), %xmm4 + addsd 0 * SIZE(%esi, LDC), %xmm5 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movsd %xmm5, 0 * SIZE(%esi, LDC) + addl $1 * SIZE, %esi + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_2 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leal (, LDC, 2), %eax + addl %eax, C # c += 2 * ldc + BRANCH + decl J # j -- + jg .L01 + ALIGN_2 + +.L100: + movl N, %eax + testl $1, %eax + jle .L999 + ALIGN_2 + +.L101: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + +/* Copying to Sub Buffer */ + leal BUFFER, %ecx + + movl K, %eax + sarl $3, %eax + jle .L103 + ALIGN_4 + +.L102: + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm2 + movsd 3 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm4 + movsd 5 * SIZE(B), %xmm5 + movsd 6 * SIZE(B), %xmm6 + movsd 7 * SIZE(B), %xmm7 + + unpcklpd %xmm0, %xmm0 + unpcklpd %xmm1, %xmm1 + unpcklpd %xmm2, %xmm2 + unpcklpd %xmm3, %xmm3 + unpcklpd %xmm4, %xmm4 + unpcklpd %xmm5, %xmm5 + unpcklpd %xmm6, %xmm6 + unpcklpd %xmm7, %xmm7 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) + movapd %xmm2, 4 * SIZE(%ecx) + movapd %xmm3, 6 * SIZE(%ecx) + movapd %xmm4, 8 * SIZE(%ecx) + movapd %xmm5, 10 * SIZE(%ecx) + movapd %xmm6, 12 * SIZE(%ecx) + movapd %xmm7, 14 * SIZE(%ecx) + + prefetcht0 104 * SIZE(B) + + addl $ 8 * SIZE, B + addl $16 * SIZE, %ecx + decl %eax + BRANCH + jne .L102 + ALIGN_2 + +.L103: + movl K, %eax + andl $7, %eax + BRANCH + jle .L105 + ALIGN_2 + +.L104: + movsd 0 * SIZE(B), %xmm0 + + unpcklpd %xmm0, %xmm0 + + movapd %xmm0, 0 * SIZE(%ecx) + + addl $1 * SIZE, B + addl $2 * SIZE, %ecx + decl %eax + jne .L104 + ALIGN_4 + +.L105: + movl C, %esi # coffset = c + movl A, %edx # aoffset = 
a + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L130 + ALIGN_4 + +.L110: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movapd 0 * SIZE + BUFFER, %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE + BUFFER, %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#else + + leal BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 2), BB + + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 +#endif + + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L112 + +.L111: + mulpd %xmm2, %xmm0 + mulpd 2 * SIZE(AA), %xmm2 + addpd %xmm0, %xmm4 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm6 + movapd 2 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm0 + mulpd 6 * SIZE(AA), %xmm2 + addpd %xmm0, %xmm5 + movapd 16 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movapd 4 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm1 + mulpd 10 * SIZE(AA), %xmm2 + addpd %xmm1, %xmm4 + movapd 12 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm6 + movapd 6 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm1 + mulpd 14 * SIZE(AA), %xmm2 + addpd %xmm1, %xmm5 + movapd 24 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm7 + movapd 16 * SIZE(BB), %xmm2 + mulpd %xmm3, %xmm0 + mulpd 18 * SIZE(AA), %xmm3 + addpd %xmm0, %xmm4 + movapd 20 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm6 + movapd 10 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm0 + mulpd 22 * SIZE(AA), %xmm3 + addpd %xmm0, %xmm5 + movapd 32 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm7 + movapd 12 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm1 + mulpd 26 * SIZE(AA), %xmm3 + addpd %xmm1, %xmm4 + movapd 28 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm6 + movapd 14 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm1 + mulpd 30 * SIZE(AA), %xmm3 + addpd %xmm1, %xmm5 + movapd 40 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm7 + movapd 24 * SIZE(BB), %xmm3 + + addl $32 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L111 + +.L112: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movapd ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L114 + +.L113: + mulpd %xmm2, %xmm0 + mulpd 2 * SIZE(AA), %xmm2 + addpd %xmm0, %xmm4 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm6 + movapd 2 * SIZE(BB), %xmm2 + + addl $4 * SIZE, AA # aoffset += 8 + addl $2 * SIZE, BB # boffset1 += 8 + subl $1, %eax + jg .L113 + ALIGN_4 + +.L114: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + + mulpd %xmm3, %xmm4 + mulpd %xmm3, %xmm6 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + movhpd 1 * SIZE(%esi), %xmm0 + addpd %xmm0, %xmm4 + + movsd 2 * SIZE(%esi), %xmm1 + movhpd 3 * SIZE(%esi), %xmm1 + addpd %xmm1, %xmm6 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + unpckhpd %xmm4, %xmm4 + movsd %xmm4, 1 * SIZE(%esi) + + movsd %xmm6, 2 * SIZE(%esi) + unpckhpd %xmm6, %xmm6 + movsd %xmm6, 3 * SIZE(%esi) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && 
defined(LEFT) + addl $4, KK +#endif + + addl $4 * SIZE, %esi # coffset += 4 + BRANCH + decl %ebx # i -- + jg .L110 + ALIGN_2 + +.L130: + movl M, %ebx + testl $2, %ebx + jle .L150 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movapd 0 * SIZE + BUFFER, %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE + BUFFER, %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#else + + leal BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB + + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 +#endif + + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L132 + +.L131: + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + mulpd 2 * SIZE(BB), %xmm0 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 4 * SIZE(AA), %xmm0 + mulpd 4 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm6 + movapd 6 * SIZE(AA), %xmm0 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm7 + movapd 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm3 + movapd 10 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm4 + mulpd 10 * SIZE(BB), %xmm1 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movapd 12 * SIZE(AA), %xmm1 + mulpd 12 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm6 + movapd 14 * SIZE(AA), %xmm1 + mulpd 14 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm7 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $16 * SIZE, BB + BRANCH + decl %eax + jne .L131 + +.L132: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movapd ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L134 + +.L133: + movapd 0 * SIZE(AA), %xmm0 + mulpd 0 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm4 + + addl $2 * SIZE, AA # aoffset += 8 + addl $2 * SIZE, BB # boffset1 += 8 + decl %eax + BRANCH + jg .L133 + ALIGN_4 + +.L134: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + addpd %xmm6, %xmm4 + + mulpd %xmm3, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + movhpd 1 * SIZE(%esi), %xmm0 + addpd %xmm0, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + unpckhpd %xmm4, %xmm4 + movsd %xmm4, 1 * SIZE(%esi) + + addl $2 * SIZE, %esi # coffset += 4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + ALIGN_2 + +.L150: + movl M, %ebx + testl $1, %ebx + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movapd 0 * SIZE + BUFFER, %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE + BUFFER, %xmm3 + pxor %xmm6, %xmm6 + movapd 4 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#else + + leal BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB + + movapd 0 * 
SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movapd 4 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax + addl $1, %eax + movl %eax, KKK +#endif + sarl $3, %eax + je .L152 + +.L151: + mulsd %xmm0, %xmm2 + movsd 1 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + mulsd 2 * SIZE(BB), %xmm0 + movsd 16 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + mulsd 4 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm4 + movsd 3 * SIZE(AA), %xmm0 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm4 + movsd 8 * SIZE(AA), %xmm0 + mulsd %xmm1, %xmm3 + movsd 5 * SIZE(AA), %xmm1 + addsd %xmm3, %xmm4 + mulsd 10 * SIZE(BB), %xmm1 + movsd 24 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm4 + movsd 6 * SIZE(AA), %xmm1 + mulsd 12 * SIZE(BB), %xmm1 + addsd %xmm1, %xmm4 + movsd 7 * SIZE(AA), %xmm1 + mulsd 14 * SIZE(BB), %xmm1 + addsd %xmm1, %xmm4 + movsd 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $16 * SIZE, BB + BRANCH + decl %eax + jne .L151 + +.L152: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movsd ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L154 + +.L153: + movsd 0 * SIZE(AA), %xmm0 + mulsd 0 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm4 + + addl $1 * SIZE, AA # aoffset += 8 + addl $2 * SIZE, BB # boffset1 += 8 + decl %eax + BRANCH + jg .L153 + ALIGN_4 + +.L154: + addsd %xmm6, %xmm4 + addsd %xmm7, %xmm5 + + mulsd %xmm3, %xmm4 + +#ifndef TRMMKERNEL + addsd 0 * SIZE(%esi), %xmm4 +#endif + movsd %xmm4, 0 * SIZE(%esi) + ALIGN_2 + +.L999: + movl OLD_STACK, %esp + + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_2 + + + EPILOGUE diff --git a/kernel/x86/gemm_kernel_4x4_barcelona.S b/kernel/x86/gemm_kernel_4x4_barcelona.S new file mode 100644 index 0000000000..18b9a43bd4 --- /dev/null +++ b/kernel/x86/gemm_kernel_4x4_barcelona.S @@ -0,0 +1,2151 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 + +#define OLD_M 4 + STACK(%esi) +#define OLD_N 8 + STACK(%esi) +#define OLD_K 12 + STACK(%esi) +#define OLD_ALPHA 16 + STACK(%esi) +#define OLD_A 20 + STACK(%esi) +#define OLD_B 24 + STACK(%esi) +#define OLD_C 28 + STACK(%esi) +#define OLD_LDC 32 + STACK(%esi) +#define STACK_OFFT 36 + STACK(%esi) + +#define ALPHA 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define BUFFER 128(%esp) + +#define PREFETCH prefetch +#define PREFETCHSIZE (16 * 17 + 0) + +#define RPREFETCHSIZE (16 * 9 + 0) +#define WPREFETCHSIZE (16 * 9 + 0) + +#define AA %edx +#define BB %ecx +#define LDC %ebp + +#define KERNEL1(address) \ + mulps %xmm0, %xmm2; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + addps %xmm2, %xmm4; \ + movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL4(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + 
addps %xmm3, %xmm6; \ + movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL5(address) \ + PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 1 * SIZE(AA); \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL6(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL7(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1; + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl %esp, %esi # save old stack + subl $128 + LOCAL_BUFFER_SIZE, %esp + movl OLD_M, %ebx + andl $-1024, %esp # align stack + + STACK_TOUCHING + + movl OLD_N, %eax + movl OLD_K, %ecx + movl OLD_A, %edx + movss OLD_ALPHA, %xmm3 +#ifdef TRMMKERNEL + movss STACK_OFFT, %xmm4 +#endif + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK + + shufps $0, %xmm3, %xmm3 + movl OLD_B, %edi + movl OLD_C, %ebx + movaps %xmm3, ALPHA + + movl %ebx, C + movl OLD_LDC, LDC +#ifdef TRMMKERNEL + movss %xmm4, OFFSET + movss %xmm4, KK +#ifndef LEFT + negl KK +#endif +#endif + leal (, LDC, SIZE), LDC + + sarl $2, %eax + movl %eax, J + jle .L40 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + +/* Copying to Sub Buffer */ + leal BUFFER, %ecx + + movl K, %eax + sarl $1, %eax + jle .L05 + ALIGN_4 + +.L02: + prefetch (RPREFETCHSIZE + 0) * SIZE(%edi) + + movaps 0 * SIZE(%edi), %xmm3 + movaps 4 * SIZE(%edi), %xmm7 + + prefetchw (WPREFETCHSIZE + 0) * SIZE(%ecx) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + prefetchw (WPREFETCHSIZE + 16) * SIZE(%ecx) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(%ecx) + 
movaps %xmm1, 4 * SIZE(%ecx) + movaps %xmm2, 8 * SIZE(%ecx) + movaps %xmm3, 12 * SIZE(%ecx) + movaps %xmm4, 16 * SIZE(%ecx) + movaps %xmm5, 20 * SIZE(%ecx) + movaps %xmm6, 24 * SIZE(%ecx) + movaps %xmm7, 28 * SIZE(%ecx) + + addl $ 8 * SIZE, %edi + subl $-32 * SIZE, %ecx + decl %eax + jne .L02 + ALIGN_2 + +.L05: + movl K, %eax + andl $1, %eax + BRANCH + jle .L10 + + movaps 0 * SIZE(%edi), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + movaps %xmm2, 8 * SIZE(%ecx) + movaps %xmm3, 12 * SIZE(%ecx) + + addl $4 * SIZE, %edi + ALIGN_4 + +.L10: + movl C, %esi # coffset = c + movl A, %edx # aoffset = a + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 8), BB +#endif + + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 + movaps 16 * SIZE(AA), %xmm1 + xorps %xmm5, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm7, %xmm7 + + leal (%esi, LDC, 2), %eax + + prefetchw 3 * SIZE(%esi) + prefetchw 3 * SIZE(%esi, LDC) + prefetchw 3 * SIZE(%eax) + prefetchw 3 * SIZE(%eax, LDC) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + + andl $-8, %eax + sall $4, %eax + je .L15 +.L1X: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + cmpl $128 * 1, %eax + jle .L12 + KERNEL1(32 * 1) + KERNEL2(32 * 1) + KERNEL3(32 * 1) + KERNEL4(32 * 1) + KERNEL5(32 * 1) + KERNEL6(32 * 1) + KERNEL7(32 * 1) + KERNEL8(32 * 1) + cmpl $128 * 2, %eax + jle .L12 + KERNEL1(32 * 2) + KERNEL2(32 * 2) + KERNEL3(32 * 2) + KERNEL4(32 * 2) + KERNEL5(32 * 2) + KERNEL6(32 * 2) + KERNEL7(32 * 2) + KERNEL8(32 * 2) + cmpl $128 * 3, %eax + jle .L12 + KERNEL1(32 * 3) + KERNEL2(32 * 3) + KERNEL3(32 * 3) + KERNEL4(32 * 3) + KERNEL5(32 * 3) + KERNEL6(32 * 3) + KERNEL7(32 * 3) + KERNEL8(32 * 3) + cmpl $128 * 4, %eax + jle .L12 + KERNEL1(32 * 4) + KERNEL2(32 * 4) + KERNEL3(32 * 4) + KERNEL4(32 * 4) + KERNEL5(32 * 4) + KERNEL6(32 * 4) + KERNEL7(32 * 4) + KERNEL8(32 * 4) + cmpl $128 * 5, %eax + jle .L12 + KERNEL1(32 * 5) + KERNEL2(32 * 5) + KERNEL3(32 * 5) + KERNEL4(32 * 5) + KERNEL5(32 * 5) + KERNEL6(32 * 5) + KERNEL7(32 * 5) + KERNEL8(32 * 5) + cmpl $128 * 6, %eax + jle .L12 + KERNEL1(32 * 6) + KERNEL2(32 * 6) + KERNEL3(32 * 6) + KERNEL4(32 * 6) + KERNEL5(32 * 6) + KERNEL6(32 * 6) + KERNEL7(32 * 6) + KERNEL8(32 * 6) + cmpl $128 * 7, %eax + jle .L12 + KERNEL1(32 * 7) + KERNEL2(32 * 7) + KERNEL3(32 * 7) + KERNEL4(32 * 7) + KERNEL5(32 * 7) + KERNEL6(32 * 7) + KERNEL7(32 * 7) + KERNEL8(32 * 7) + + addl $128 * 8 * SIZE, BB + addl $128 * 2 * SIZE, AA + subl $128 * 8, %eax + jg .L1X + jmp .L15 + +.L12: + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_4 + 
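+
+/*
+   .L16 below drains the k % 8 remainder one rank-1 update at a time; .L18
+   then finishes the 4x4 tile: the four column accumulators %xmm4-%xmm7 are
+   scaled by alpha and, unless TRMMKERNEL is defined, the existing C values
+   are added back before the stores (the fourth column is addressed through
+   %eax = 3 * LDC).  In effect, for each column j of the tile:
+
+       c[0..3][j] = alpha * acc[0..3][j] + c[0..3][j];
+
+   where acc is only an illustrative name for the accumulator registers.
+*/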
+.L16: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 4 * SIZE(AA), %xmm0 + + addl $ 4 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + shufps $0xe4, %xmm0, %xmm0 + shufps $0xe4, %xmm1, %xmm1 + shufps $0xe4, %xmm2, %xmm2 + shufps $0xe4, %xmm3, %xmm3 + + mulps %xmm3, %xmm4 + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + mulps %xmm3, %xmm5 + movsd 0 * SIZE(%esi, LDC, 1), %xmm1 + movhps 2 * SIZE(%esi, LDC, 1), %xmm1 + mulps %xmm3, %xmm6 + movsd 0 * SIZE(%esi, LDC, 2), %xmm2 + movhps 2 * SIZE(%esi, LDC, 2), %xmm2 + mulps %xmm3, %xmm7 + movsd 0 * SIZE(%esi, %eax, 1), %xmm3 + movhps 2 * SIZE(%esi, %eax, 1), %xmm3 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm5 + addps %xmm2, %xmm6 + addps %xmm3, %xmm7 +#else + mulps %xmm3, %xmm4 + mulps %xmm3, %xmm5 + mulps %xmm3, %xmm6 + mulps %xmm3, %xmm7 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movhps %xmm4, 2 * SIZE(%esi) + movsd %xmm5, 0 * SIZE(%esi, LDC, 1) + movhps %xmm5, 2 * SIZE(%esi, LDC, 1) + movsd %xmm6, 0 * SIZE(%esi, LDC, 2) + movhps %xmm6, 2 * SIZE(%esi, LDC, 2) + movsd %xmm7, 0 * SIZE(%esi, %eax, 1) + movhps %xmm7, 2 * SIZE(%esi, %eax, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 8), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $4, KK +#endif + + addl $4 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L11 + ALIGN_4 + +.L20: + testl $2, M + je .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB +#endif + + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 + movsd 8 * SIZE(AA), %xmm1 + xorps %xmm5, %xmm5 + movsd 0 * SIZE(BB), %xmm2 + xorps %xmm6, %xmm6 + movsd 16 * SIZE(BB), %xmm3 + xorps %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movsd 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movsd 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movsd 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movsd 32 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movsd 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm5 + movsd 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movsd 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movsd 48 * SIZE(BB), %xmm3 + + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movsd 36 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movsd 40 * 
SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movsd 44 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movsd 64 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movsd 52 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm5 + movsd 56 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movsd 60 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movsd 80 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movsd 68 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm5 + movsd 72 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movsd 76 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movsd 96 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movsd 84 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movsd 88 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movsd 92 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movsd 112 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movsd 100 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm5 + movsd 104 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movsd 108 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movsd 128 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movsd 116 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movsd 120 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movsd 124 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movsd 144 * SIZE(BB), %xmm3 + + addl $ 16 * SIZE, AA + addl $128 * SIZE, BB + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movsd 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movsd 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movsd 16 * SIZE(BB), %xmm2 + + addl $ 2 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + mulps %xmm3, %xmm4 + movsd 0 * SIZE(%esi), %xmm0 + mulps %xmm3, %xmm5 + movsd 0 * SIZE(%esi, LDC, 1), %xmm1 + mulps %xmm3, %xmm6 + movsd 0 * SIZE(%esi, LDC, 2), %xmm2 + mulps %xmm3, %xmm7 + movsd 0 * SIZE(%esi, %eax, 1), %xmm3 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm5 + addps %xmm2, %xmm6 + addps %xmm3, %xmm7 +#else + mulps %xmm3, %xmm4 + mulps %xmm3, %xmm5 + mulps %xmm3, %xmm6 + mulps %xmm3, %xmm7 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movsd %xmm5, 0 * SIZE(%esi, LDC, 1) + movsd %xmm6, 0 * SIZE(%esi, LDC, 2) + movsd %xmm7, 0 * SIZE(%esi, %eax, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, %esi # coffset += 2 + ALIGN_4 + +.L30: + testl $1, M + je .L39 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && 
!defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB + leal (BB, %eax, 8), BB +#endif + + movss 0 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 + movss 4 * SIZE(AA), %xmm1 + xorps %xmm5, %xmm5 + movss 0 * SIZE(BB), %xmm2 + xorps %xmm6, %xmm6 + movss 16 * SIZE(BB), %xmm3 + xorps %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L35 + ALIGN_4 + +.L32: + mulss %xmm0, %xmm2 + addss %xmm2, %xmm4 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movss 4 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + addss %xmm2, %xmm5 + movss 8 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 32 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 1 * SIZE(AA), %xmm0 + + mulss %xmm0, %xmm3 + addss %xmm3, %xmm4 + movss 20 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + addss %xmm3, %xmm5 + movss 24 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + mulss 28 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 48 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 2 * SIZE(AA), %xmm0 + + mulss %xmm0, %xmm2 + addss %xmm2, %xmm4 + movss 36 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + addss %xmm2, %xmm5 + movss 40 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + mulss 44 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 64 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 3 * SIZE(AA), %xmm0 + + mulss %xmm0, %xmm3 + addss %xmm3, %xmm4 + movss 52 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + addss %xmm3, %xmm5 + movss 56 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + mulss 60 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 80 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + + mulss %xmm1, %xmm2 + addss %xmm2, %xmm4 + movss 68 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + addss %xmm2, %xmm5 + movss 72 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + mulss 76 * SIZE(BB), %xmm1 + addss %xmm2, %xmm6 + movss 96 * SIZE(BB), %xmm2 + addss %xmm1, %xmm7 + movss 5 * SIZE(AA), %xmm1 + + mulss %xmm1, %xmm3 + addss %xmm3, %xmm4 + movss 84 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + addss %xmm3, %xmm5 + movss 88 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + mulss 92 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 112 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 6 * SIZE(AA), %xmm1 + + mulss %xmm1, %xmm2 + addss %xmm2, %xmm4 + movss 100 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + addss %xmm2, %xmm5 + movss 104 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + mulss 108 * SIZE(BB), %xmm1 + addss %xmm2, %xmm6 + movss 128 * SIZE(BB), %xmm2 + addss %xmm1, %xmm7 + movss 7 * SIZE(AA), %xmm1 + + mulss %xmm1, %xmm3 + addss %xmm3, %xmm4 + movss 116 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + addss %xmm3, %xmm5 + movss 120 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + mulss 124 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 144 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $128 * SIZE, BB + decl %eax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulss %xmm0, %xmm2 + addss %xmm2, %xmm4 + movss 4 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + addss 
%xmm2, %xmm5 + movss 8 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 16 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 1 * SIZE(AA), %xmm0 + + addl $ 1 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L36 + ALIGN_4 + +.L38: + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + mulss %xmm3, %xmm4 + movss 0 * SIZE(%esi), %xmm0 + mulss %xmm3, %xmm5 + movss 0 * SIZE(%esi, LDC, 1), %xmm1 + mulss %xmm3, %xmm6 + movss 0 * SIZE(%esi, LDC, 2), %xmm2 + mulss %xmm3, %xmm7 + movss 0 * SIZE(%esi, %eax, 1), %xmm3 + + addss %xmm0, %xmm4 + addss %xmm1, %xmm5 + addss %xmm2, %xmm6 + addss %xmm3, %xmm7 +#else + mulss %xmm3, %xmm4 + mulss %xmm3, %xmm5 + mulss %xmm3, %xmm6 + mulss %xmm3, %xmm7 +#endif + + movss %xmm4, 0 * SIZE(%esi) + movss %xmm5, 0 * SIZE(%esi, LDC, 1) + movss %xmm6, 0 * SIZE(%esi, LDC, 2) + movss %xmm7, 0 * SIZE(%esi, %eax, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB + leal (BB, %eax, 8), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + + leal (, LDC, 4), %eax + addl %eax, C # c += 4 * ldc + decl J # j -- + jg .L01 + ALIGN_4 + +.L40: + testl $2, N + je .L80 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + leal BUFFER, %ecx + sarl $2, %eax + jle .L45 + ALIGN_4 + +.L42: + prefetch (RPREFETCHSIZE + 0) * SIZE(%edi) + + movaps 0 * SIZE(%edi), %xmm3 + movaps 4 * SIZE(%edi), %xmm7 + + prefetchw (WPREFETCHSIZE + 0) * SIZE(%ecx) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + prefetchw (WPREFETCHSIZE + 16) * SIZE(%ecx) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + movaps %xmm2, 8 * SIZE(%ecx) + movaps %xmm3, 12 * SIZE(%ecx) + movaps %xmm4, 16 * SIZE(%ecx) + movaps %xmm5, 20 * SIZE(%ecx) + movaps %xmm6, 24 * SIZE(%ecx) + movaps %xmm7, 28 * SIZE(%ecx) + + addl $ 8 * SIZE, %edi + subl $-32 * SIZE, %ecx + decl %eax + jne .L42 + ALIGN_4 + +.L45: + movl K, %eax + andl $3, %eax + BRANCH + jle .L50 + ALIGN_4 + +.L46: + movsd 0 * SIZE(%edi), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + + addl $2 * SIZE, %edi + addl $8 * SIZE, %ecx + decl %eax + jne .L46 + ALIGN_4 + +.L50: + movl C, %esi # coffset = c + movl A, %edx # aoffset = a + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movaps 0 * SIZE(AA), %xmm0 + movaps 16 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + + prefetchw 3 * SIZE(%esi) + prefetchw 3 * SIZE(%esi, LDC) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && 
defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + mulps %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 8 * SIZE(AA), %xmm0 + + mulps %xmm0, %xmm3 + mulps 20 * SIZE(BB), %xmm0 + addps %xmm3, %xmm4 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm0, %xmm5 + movaps 12 * SIZE(AA), %xmm0 + + mulps %xmm0, %xmm3 + mulps 28 * SIZE(BB), %xmm0 + addps %xmm3, %xmm4 + movaps 48 * SIZE(BB), %xmm3 + addps %xmm0, %xmm5 + movaps 32 * SIZE(AA), %xmm0 + +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) +#endif + mulps %xmm1, %xmm2 + mulps 36 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 40 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 20 * SIZE(AA), %xmm1 + + mulps %xmm1, %xmm2 + mulps 44 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 24 * SIZE(AA), %xmm1 + + mulps %xmm1, %xmm3 + mulps 52 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 56 * SIZE(BB), %xmm3 + addps %xmm1, %xmm5 + movaps 28 * SIZE(AA), %xmm1 + + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 80 * SIZE(BB), %xmm3 + addps %xmm1, %xmm5 + movaps 48 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L52 + ALIGN_4 + +.L55: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: + mulps %xmm3, %xmm4 + mulps %xmm3, %xmm5 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + movsd 0 * SIZE(%esi, LDC, 1), %xmm1 + movhps 2 * SIZE(%esi, LDC, 1), %xmm1 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm5 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movhps %xmm4, 2 * SIZE(%esi) + movsd %xmm5, 0 * SIZE(%esi, LDC, 1) + movhps %xmm5, 2 * SIZE(%esi, LDC, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $4, KK +#endif + + addl $4 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L51 + ALIGN_4 + +.L60: + testl $2, M + je .L70 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movsd 0 * SIZE(AA), %xmm0 + movsd 8 * SIZE(AA), %xmm1 + movsd 0 * SIZE(BB), %xmm2 + movsd 16 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + movl K, %eax +#elif 
(defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L65 + ALIGN_4 + +.L62: +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 8 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movsd 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movsd 32 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movsd 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movsd 24 * SIZE(BB), %xmm3 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movsd 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movsd 48 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movsd 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movsd 40 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movsd 44 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movsd 64 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movsd 52 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsd 56 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movsd 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movsd 80 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L62 + ALIGN_4 + +.L65: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 8 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L66 + ALIGN_4 + +.L68: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + mulps %xmm3, %xmm4 + mulps %xmm3, %xmm5 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + movsd 0 * SIZE(%esi, LDC, 1), %xmm1 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm5 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movsd %xmm5, 0 * SIZE(%esi, LDC, 1) + addl $2 * SIZE, %esi # coffset += 2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + ALIGN_4 + +.L70: + testl $1, M + je .L79 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movss 0 * SIZE(AA), %xmm0 + movss 4 * SIZE(AA), %xmm1 + movss 0 * SIZE(BB), %xmm2 + movss 16 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + movl K, %eax 
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + mulss %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + mulss 4 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 8 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 1 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm2 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 32 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 2 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 20 * SIZE(BB), %xmm0 + addss %xmm3, %xmm4 + movss 24 * SIZE(BB), %xmm3 + addss %xmm0, %xmm5 + movss 3 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 28 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 48 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm2 + mulss 36 * SIZE(BB), %xmm1 + addss %xmm2, %xmm4 + movss 40 * SIZE(BB), %xmm2 + addss %xmm1, %xmm5 + movss 5 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm2 + mulss 44 * SIZE(BB), %xmm1 + addss %xmm2, %xmm6 + movss 64 * SIZE(BB), %xmm2 + addss %xmm1, %xmm7 + movss 6 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 52 * SIZE(BB), %xmm1 + addss %xmm3, %xmm4 + movss 56 * SIZE(BB), %xmm3 + addss %xmm1, %xmm5 + movss 7 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 60 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 80 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L72 + ALIGN_4 + +.L75: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulss %xmm0, %xmm2 + mulss 4 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 8 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 1 * SIZE(AA), %xmm0 + + addl $ 1 * SIZE, AA + addl $ 8 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: + addss %xmm6, %xmm4 + addss %xmm7, %xmm5 + + mulss %xmm3, %xmm4 + mulss %xmm3, %xmm5 + +#ifndef TRMMKERNEL + movss 0 * SIZE(%esi), %xmm0 + movss 0 * SIZE(%esi, LDC, 1), %xmm1 + + addss %xmm0, %xmm4 + addss %xmm1, %xmm5 +#endif + + movss %xmm4, 0 * SIZE(%esi) + movss %xmm5, 0 * SIZE(%esi, LDC, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_4 + +.L79: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + leal (, LDC, 2), %eax + addl %eax, C + ALIGN_4 + +.L80: + testl $1, N + je .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + leal BUFFER, %ecx + + movl K, %eax + sarl $3, %eax + jle .L85 + ALIGN_4 + +.L82: + prefetch (RPREFETCHSIZE + 0) * SIZE(%edi) + + movups 0 * SIZE(%edi), %xmm3 + movups 4 * SIZE(%edi), %xmm7 + + prefetchw (WPREFETCHSIZE + 0) * SIZE(%ecx) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + prefetchw (WPREFETCHSIZE + 16) * SIZE(%ecx) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + movaps %xmm2, 8 * SIZE(%ecx) + movaps %xmm3, 12 * 
SIZE(%ecx) + movaps %xmm4, 16 * SIZE(%ecx) + movaps %xmm5, 20 * SIZE(%ecx) + movaps %xmm6, 24 * SIZE(%ecx) + movaps %xmm7, 28 * SIZE(%ecx) + + addl $ 8 * SIZE, %edi + subl $-32 * SIZE, %ecx + decl %eax + jne .L82 + ALIGN_4 + +.L85: + movl K, %eax + andl $7, %eax + BRANCH + jle .L90 + ALIGN_4 + +.L86: + movss 0 * SIZE(%edi), %xmm3 + pshufd $0x00, %xmm3, %xmm0 + movaps %xmm0, 0 * SIZE(%ecx) + + addl $1 * SIZE, %edi + addl $4 * SIZE, %ecx + decl %eax + jne .L86 + ALIGN_4 + +.L90: + movl C, %esi # coffset = c + movl A, %edx # aoffset = a + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L100 + ALIGN_4 + +.L91: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movaps 0 * SIZE(AA), %xmm0 + movaps 16 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + + prefetchw 3 * SIZE(%esi) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L95 + ALIGN_4 + +.L92: + mulps %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movaps 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movaps 32 * SIZE(BB), %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm0, %xmm5 + movaps 8 * SIZE(AA), %xmm0 + mulps 8 * SIZE(BB), %xmm0 + addps %xmm0, %xmm6 + movaps 12 * SIZE(AA), %xmm0 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) +#endif + mulps %xmm1, %xmm3 + movaps 20 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movaps 48 * SIZE(BB), %xmm3 + mulps 20 * SIZE(BB), %xmm1 + addps %xmm1, %xmm5 + movaps 24 * SIZE(AA), %xmm1 + mulps 24 * SIZE(BB), %xmm1 + addps %xmm1, %xmm6 + movaps 28 * SIZE(AA), %xmm1 + mulps 28 * SIZE(BB), %xmm1 + addps %xmm1, %xmm7 + movaps 48 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L92 + ALIGN_4 + +.L95: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L98 + ALIGN_4 + +.L96: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(AA), %xmm0 + movaps 4 * SIZE(BB), %xmm2 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L96 + ALIGN_4 + +.L98: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + addps %xmm6, %xmm4 + mulps %xmm3, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + + addps %xmm0, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movhps %xmm4, 2 * SIZE(%esi) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $4, KK +#endif + + addl $4 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L91 + ALIGN_4 + +.L100: + testl $2, M + je .L110 + +#if !defined(TRMMKERNEL) || \ + 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movsd 0 * SIZE(AA), %xmm0 + movsd 8 * SIZE(AA), %xmm1 + movsd 0 * SIZE(BB), %xmm2 + movsd 16 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L105 + ALIGN_4 + +.L102: + mulps %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + movsd 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movsd 32 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movsd 20 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsd 24 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm6 + movsd 28 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movsd 48 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L102 + ALIGN_4 + +.L105: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L108 + ALIGN_4 + +.L106: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + movsd 4 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L106 + ALIGN_4 + +.L108: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + addps %xmm6, %xmm4 + + mulps %xmm3, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + + addps %xmm0, %xmm4 +#endif + movsd %xmm4, 0 * SIZE(%esi) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, %esi # coffset += 2 + ALIGN_4 + +.L110: + testl $1, M + je .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movss 0 * SIZE(AA), %xmm0 + movss 4 * SIZE(AA), %xmm1 + movss 0 * SIZE(BB), %xmm2 + movss 16 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax 
+#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L115 + ALIGN_4 + +.L112: + mulss %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movss 1 * SIZE(AA), %xmm0 + addss %xmm2, %xmm4 + movss 32 * SIZE(BB), %xmm2 + mulss 4 * SIZE(BB), %xmm0 + addss %xmm0, %xmm5 + movss 2 * SIZE(AA), %xmm0 + mulss 8 * SIZE(BB), %xmm0 + addss %xmm0, %xmm6 + movss 3 * SIZE(AA), %xmm0 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm3 + movss 5 * SIZE(AA), %xmm1 + addss %xmm3, %xmm4 + movss 48 * SIZE(BB), %xmm3 + mulss 20 * SIZE(BB), %xmm1 + addss %xmm1, %xmm5 + movss 6 * SIZE(AA), %xmm1 + mulss 24 * SIZE(BB), %xmm1 + addss %xmm1, %xmm6 + movss 7 * SIZE(AA), %xmm1 + mulss 28 * SIZE(BB), %xmm1 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L112 + ALIGN_4 + +.L115: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulss %xmm0, %xmm2 + movss 1 * SIZE(AA), %xmm0 + addss %xmm2, %xmm4 + movss 4 * SIZE(BB), %xmm2 + + addl $ 1 * SIZE, AA + addl $ 4 * SIZE, BB + decl %eax + jg .L116 + ALIGN_4 + +.L118: + addss %xmm5, %xmm4 + addss %xmm7, %xmm6 + addss %xmm6, %xmm4 + mulss %xmm3, %xmm4 +#ifndef TRMMKERNEL + movss 0 * SIZE(%esi), %xmm0 + addss %xmm0, %xmm4 +#endif + movss %xmm4, 0 * SIZE(%esi) + ALIGN_4 + +.L999: + movl OLD_STACK, %esp + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/gemm_kernel_4x4_penryn.S b/kernel/x86/gemm_kernel_4x4_penryn.S new file mode 100644 index 0000000000..6775d1d181 --- /dev/null +++ b/kernel/x86/gemm_kernel_4x4_penryn.S @@ -0,0 +1,1831 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#define A 20 + STACK + ARGS(%esp) +#define ARG_B 24 + STACK + ARGS(%esp) +#define C 28 + STACK + ARGS(%esp) +#define ARG_LDC 32 + STACK + ARGS(%esp) +#define OFFSET 36 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define BX 4 + STACK(%esp) +#define KK 8 + STACK(%esp) +#define KKK 12 + STACK(%esp) + +#ifdef NANO +#define PREFETCHSIZE (16 * 3 + 8) +#define PREFETCHW prefetcht0 +#define PREFETCHB prefetcht0 +#endif + +#ifdef NEHALEM +#define PREFETCHSIZE (16 * 1 - 8) +#define PREFETCHW prefetcht0 +#define PREFETCHB prefetcht0 +#endif + +#ifndef PREFETCH +#define PREFETCH prefetcht0 +#endif + +#ifndef PREFETCHW +#define PREFETCHW prefetcht0 +#endif + +#ifndef PREFETCHB +#define PREFETCHB prefetcht0 +#endif + +#ifndef PREFETCHSIZE +#define PREFETCHSIZE (16 * 13 + 8) +#endif + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi +#define C1 %esi +#define I %ebx + + PROLOGUE + + subl $ARGS, %esp # Generate Stack Frame + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + +#ifdef TRMMKERNEL + movl OFFSET, %eax +#ifndef LEFT + negl %eax +#endif + movl %eax, KK +#endif + + subl $-32 * SIZE, A + subl $-32 * SIZE, B + + leal (, LDC, SIZE), LDC + + movl N, %eax + sarl $2, %eax + movl %eax, J + jle .L40 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + sall $BASE_SHIFT + 2, %eax + leal (B, %eax), %eax + movl %eax, BX + + movl C, C1 + movl A, AA + + movl M, I + sarl $2, I + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB +#endif + + movl BX, %eax + PREFETCHB -32 * SIZE(%eax) + subl $-16 * SIZE, %eax + movl %eax, BX + + leal (C1, LDC, 2), %eax + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + + xorps %xmm4, %xmm4 + PREFETCHW 3 * SIZE(C1) + xorps %xmm5, %xmm5 + PREFETCHW 7 * SIZE(C1, LDC) + xorps %xmm6, %xmm6 + PREFETCHW 3 * SIZE(%eax) + xorps %xmm7, %xmm7 + PREFETCHW 7 * SIZE(%eax, LDC) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + pshufd 
$0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -24 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -16 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 +#ifndef NEHALEM + PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) +#endif + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -12 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -8 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -4 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + subl $-32 * SIZE, BB + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + subl $-32 * SIZE, AA + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -32 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -32 * SIZE(AA), %xmm0 + + subl $1, %eax + jne .L12 + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L18 + ALIGN_4 + +.L16: + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + addps %xmm3, %xmm6 + addps %xmm2, %xmm7 + + movss ALPHA, %xmm3 + + pshufd $0x39, %xmm5, %xmm2 + pshufd $0x4e, %xmm6, %xmm0 + pshufd $0x93, %xmm7, %xmm7 + + movaps %xmm4, %xmm6 + unpcklps %xmm0, %xmm4 + unpckhps %xmm0, %xmm6 + + movaps %xmm2, %xmm1 + unpcklps %xmm7, %xmm2 + unpckhps %xmm7, %xmm1 + + movaps %xmm4, %xmm5 + unpcklps %xmm2, %xmm4 + unpckhps %xmm2, %xmm5 + + movaps %xmm6, %xmm7 + unpcklps %xmm1, %xmm6 + unpckhps %xmm1, %xmm7 + + pshufd $0x93, %xmm5, %xmm5 + pshufd $0x4e, %xmm6, %xmm6 + pshufd $0x39, %xmm7, %xmm7 + + shufps $0, %xmm3, %xmm3 + + mulps %xmm3, %xmm4 + mulps %xmm3, %xmm5 + mulps %xmm3, %xmm6 + mulps %xmm3, %xmm7 + + leal 
(C1, LDC, 2), %eax + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + movhps 2 * SIZE(C1), %xmm0 + movsd 0 * SIZE(C1, LDC), %xmm1 + movhps 2 * SIZE(C1, LDC), %xmm1 + + movsd 0 * SIZE(%eax), %xmm2 + movhps 2 * SIZE(%eax), %xmm2 + movsd 0 * SIZE(%eax, LDC), %xmm3 + movhps 2 * SIZE(%eax, LDC), %xmm3 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm5 + addps %xmm2, %xmm6 + addps %xmm3, %xmm7 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhps %xmm4, 2 * SIZE(C1) + movsd %xmm5, 0 * SIZE(C1, LDC) + movhps %xmm5, 2 * SIZE(C1, LDC) + + movsd %xmm6, 0 * SIZE(%eax) + movhps %xmm6, 2 * SIZE(%eax) + movsd %xmm7, 0 * SIZE(%eax, LDC) + movhps %xmm7, 2 * SIZE(%eax, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $4, KK +#endif + + addl $4 * SIZE, C1 + decl I + jg .L11 + ALIGN_4 + +.L20: + movl M, I + testl $2, I + jle .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + + pxor %xmm4, %xmm4 + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm0, %xmm2 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm4 + pshufd $0xfa, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm5 + + pshufd $0xee, %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm6 + pshufd $0xfa, %xmm1, %xmm3 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm7 + + pshufd $0x44, %xmm0, %xmm2 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm4 + pshufd $0xfa, %xmm1, %xmm3 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm5 + + pshufd $0xee, %xmm0, %xmm2 + movaps -24 * SIZE(AA), %xmm0 + + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm6 + pshufd $0xfa, %xmm1, %xmm3 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm7 + + pshufd $0x44, %xmm0, %xmm2 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm4 + pshufd $0xfa, %xmm1, %xmm3 + movaps -12 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm5 + + pshufd $0xee, %xmm0, %xmm2 + movaps -20 * SIZE(AA), %xmm0 + + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm6 + pshufd $0xfa, %xmm1, %xmm3 + movaps -8 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm7 + + pshufd $0x44, %xmm0, %xmm2 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm4 + pshufd $0xfa, %xmm1, %xmm3 + movaps -4 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm5 + + pshufd $0xee, %xmm0, %xmm2 + movaps -16 * SIZE(AA), %xmm0 + + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm6 + pshufd $0xfa, %xmm1, 
%xmm3 + movaps 0 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm7 + + subl $-16 * SIZE, AA + subl $-32 * SIZE, BB + + subl $1, %eax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L28 + ALIGN_4 + +.L26: + pshufd $0x44, %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm4 + pshufd $0xfa, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm5 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: + movss ALPHA, %xmm1 + + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + shufps $0, %xmm1, %xmm1 + mulps %xmm1, %xmm4 + mulps %xmm1, %xmm5 + + leal (C1, LDC, 2), %eax + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + movhps 0 * SIZE(C1, LDC), %xmm0 + + movsd 0 * SIZE(%eax), %xmm1 + movhps 0 * SIZE(%eax, LDC), %xmm1 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm5 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhps %xmm4, 0 * SIZE(C1, LDC) + + movsd %xmm5, 0 * SIZE(%eax) + movhps %xmm5, 0 * SIZE(%eax, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, C1 + ALIGN_4 + +.L30: + movl M, I + testl $1, I + jle .L39 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (BB, %eax, 4), BB +#endif + + pxor %xmm4, %xmm4 + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -24 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -20 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -28 * SIZE(AA), %xmm0 + + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -16 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -12 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -26 * SIZE(AA), %xmm0 + + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -8 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -4 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -24 * SIZE(AA), %xmm0 + + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps 0 * SIZE(BB), %xmm1 + + subl $ -8 * SIZE, AA + subl $-32 * SIZE, BB + + subl $1, %eax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L38 + ALIGN_4 + +.L36: + pshufd $0x00, %xmm0, %xmm2 + movss -31 * SIZE(AA), %xmm0 + + mulps %xmm2, %xmm1 + 
addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L36 + ALIGN_4 + +.L38: + movss ALPHA, %xmm1 + + shufps $0, %xmm1, %xmm1 + mulps %xmm1, %xmm4 + + pshufd $0xff, %xmm4, %xmm7 + pshufd $0xaa, %xmm4, %xmm6 + pshufd $0x55, %xmm4, %xmm5 + pshufd $0x00, %xmm4, %xmm4 + + leal (C1, LDC, 2), %eax + +#ifndef TRMMKERNEL + movss 0 * SIZE(C1), %xmm0 + movss 0 * SIZE(C1, LDC), %xmm1 + + movss 0 * SIZE(%eax), %xmm2 + movss 0 * SIZE(%eax, LDC), %xmm3 + + addss %xmm0, %xmm4 + addss %xmm1, %xmm5 + addss %xmm2, %xmm6 + addss %xmm3, %xmm7 +#endif + + movss %xmm4, 0 * SIZE(C1) + movss %xmm5, 0 * SIZE(C1, LDC) + movss %xmm6, 0 * SIZE(%eax) + movss %xmm7, 0 * SIZE(%eax, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + addl %eax, AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + + movl BB, B + + leal (, LDC, 4), %eax + addl %eax, C + decl J + jg .L01 + ALIGN_4 + +.L40: + movl N, %eax + testl $2, %eax + jle .L70 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl C, C1 + movl A, AA + + movl M, I + sarl $2, I + jle .L50 + ALIGN_4 + +.L41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 2), BB +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + + pxor %xmm4, %xmm4 + prefetcht0 3 * SIZE(C1) + pxor %xmm5, %xmm5 + prefetcht0 3 * SIZE(C1, LDC) + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L45 + ALIGN_4 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -28 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0xff, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -24 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -20 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0xff, %xmm1, %xmm3 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -16 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -12 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0xff, %xmm1, %xmm3 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -8 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, 
%xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -4 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0xff, %xmm1, %xmm3 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps 0 * SIZE(AA), %xmm0 + + subl $-32 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L42 + ALIGN_4 + +.L45: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L48 + ALIGN_4 + +.L46: + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -28 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L46 + ALIGN_4 + +.L48: + movss ALPHA, %xmm1 + + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + addps %xmm2, %xmm4 + addps %xmm3, %xmm5 + + shufps $0, %xmm1, %xmm1 + mulps %xmm1, %xmm4 + mulps %xmm1, %xmm5 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + movhps 2 * SIZE(C1), %xmm0 + movsd 0 * SIZE(C1, LDC), %xmm1 + movhps 2 * SIZE(C1, LDC), %xmm1 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm5 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhps %xmm4, 2 * SIZE(C1) + movsd %xmm5, 0 * SIZE(C1, LDC) + movhps %xmm5, 2 * SIZE(C1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $4, KK +#endif + + addl $4 * SIZE, C1 + decl I + jg .L41 + ALIGN_4 + +.L50: + movl M, I + testl $2, I + jle .L60 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm3, %xmm3 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm0, %xmm2 + addps %xmm3, %xmm4 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + + pshufd $0xee, %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + + pshufd $0x44, %xmm0, %xmm2 + addps %xmm3, %xmm4 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + + pshufd $0xee, %xmm0, %xmm2 + movaps -24 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + + pshufd $0x44, %xmm0, %xmm2 + addps %xmm3, %xmm4 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + + pshufd $0xee, %xmm0, %xmm2 + movaps -20 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + + pshufd $0x44, %xmm0, %xmm2 + addps %xmm3, %xmm4 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + + pshufd $0xee, %xmm0, %xmm2 + movaps -16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + 
movaps -16 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + + subl $-16 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L52 + ALIGN_4 + +.L55: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L58 + ALIGN_4 + +.L56: + pshufd $0x44, %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + addps %xmm3, %xmm4 + pshufd $0x50, %xmm1, %xmm3 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: + movss ALPHA, %xmm1 + + addps %xmm3, %xmm4 + addps %xmm5, %xmm4 + + shufps $0, %xmm1, %xmm1 + mulps %xmm1, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + movhps 0 * SIZE(C1, LDC), %xmm0 + + addps %xmm0, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhps %xmm4, 0 * SIZE(C1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, C1 + ALIGN_4 + +.L60: + movl M, I + testl $1, I + jle .L69 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (BB, %eax, 2), BB +#endif + + pxor %xmm4, %xmm4 + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movsd -32 * SIZE(BB), %xmm1 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L65 + ALIGN_4 + +.L62: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movsd -30 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm5 + movsd -28 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movsd -26 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -28 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm5 + movsd -24 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movsd -22 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -26 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm5 + movsd -20 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movsd -18 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -24 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm5 + movsd -16 * SIZE(BB), %xmm1 + + subl $ -8 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L62 + ALIGN_4 + +.L65: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L68 + ALIGN_4 + +.L66: + pshufd $0x00, %xmm0, %xmm2 + movss -31 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movsd -30 * SIZE(BB), %xmm1 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L66 + ALIGN_4 + +.L68: + movss ALPHA, %xmm1 + + addps %xmm5, %xmm4 + shufps $0, %xmm1, %xmm1 + mulps %xmm1, %xmm4 + + pshufd $0x55, %xmm4, %xmm5 + pshufd $0x00, %xmm4, %xmm4 + +#ifndef TRMMKERNEL + movss 0 * SIZE(C1), %xmm0 + movss 0 * SIZE(C1, LDC), 
%xmm1 + + addss %xmm0, %xmm4 + addss %xmm1, %xmm5 +#endif + + movss %xmm4, 0 * SIZE(C1) + movss %xmm5, 0 * SIZE(C1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + addl %eax, AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_4 + +.L69: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + movl BB, B + + leal (, LDC, 2), %eax + addl %eax, C + ALIGN_4 + +.L70: + movl N, %eax + testl $1, %eax + jle .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl C, C1 + movl A, AA + + movl M, I + sarl $2, I + jle .L80 + ALIGN_4 + +.L71: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + addl %eax, BB +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movsd -32 * SIZE(BB), %xmm1 + + pxor %xmm4, %xmm4 + prefetcht0 3 * SIZE(C1) + pxor %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm5 + pshufd $0x55, %xmm1, %xmm2 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -24 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm5 + pshufd $0x55, %xmm1, %xmm2 + movsd -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -16 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm5 + pshufd $0x55, %xmm1, %xmm2 + movsd -26 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm5 + pshufd $0x55, %xmm1, %xmm2 + movsd -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + + subl $-32 * SIZE, AA + subl $ -8 * SIZE, BB + + subl $1, %eax + jne .L72 + ALIGN_4 + +.L75: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L78 + ALIGN_4 + +.L76: + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + movss -31 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: + movss ALPHA, %xmm1 + + addps %xmm2, %xmm4 + addps %xmm5, %xmm4 + shufps $0, %xmm1, %xmm1 + mulps %xmm1, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + movhps 2 * SIZE(C1), %xmm0 + + addps %xmm0, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhps %xmm4, 2 * SIZE(C1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA + addl %eax, BB +#endif + +#if 
defined(TRMMKERNEL) && defined(LEFT) + addl $4, KK +#endif + + addl $4 * SIZE, C1 + decl I + jg .L71 + ALIGN_4 + +.L80: + movl M, I + testl $2, I + jle .L90 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + addl %eax, BB +#endif + + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm3, %xmm3 + movsd -32 * SIZE(BB), %xmm1 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L85 + ALIGN_4 + +.L82: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + + pshufd $0x55, %xmm1, %xmm2 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movsd -28 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movsd -26 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + + pshufd $0x55, %xmm1, %xmm2 + movsd -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movsd -24 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movsd -22 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + + pshufd $0x55, %xmm1, %xmm2 + movsd -26 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movsd -20 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movsd -18 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + + pshufd $0x55, %xmm1, %xmm2 + movsd -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movsd -16 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + + subl $-16 * SIZE, AA + subl $ -8 * SIZE, BB + + subl $1, %eax + jne .L82 + ALIGN_4 + +.L85: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L88 + ALIGN_4 + +.L86: + pshufd $0x00, %xmm1, %xmm2 + movss -31 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + + addl $2 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L86 + ALIGN_4 + +.L88: + movss ALPHA, %xmm1 + + addps %xmm5, %xmm4 + shufps $0, %xmm1, %xmm1 + mulps %xmm1, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + addps %xmm0, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(C1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + addl %eax, BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, C1 + ALIGN_4 + +.L90: + movl M, I + testl $1, I + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + addl %eax, AA + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movsd -32 * SIZE(BB), %xmm1 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl 
$3, %eax + je .L95 + ALIGN_4 + +.L92: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + mulps %xmm0, %xmm1 + movsd -30 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movsd -30 * SIZE(BB), %xmm1 + + mulps %xmm0, %xmm1 + movsd -28 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movsd -28 * SIZE(BB), %xmm1 + + mulps %xmm0, %xmm1 + movsd -26 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movsd -26 * SIZE(BB), %xmm1 + + mulps %xmm0, %xmm1 + movsd -24 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movsd -24 * SIZE(BB), %xmm1 + + subl $-8 * SIZE, AA + subl $-8 * SIZE, BB + + subl $1, %eax + jne .L92 + ALIGN_4 + +.L95: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L98 + ALIGN_4 + +.L96: + mulss %xmm0, %xmm1 + movss -31 * SIZE(AA), %xmm0 + addss %xmm1, %xmm4 + movss -31 * SIZE(BB), %xmm1 + + addl $1 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L96 + ALIGN_4 + +.L98: + movss ALPHA, %xmm1 + + haddps %xmm4, %xmm4 + mulss %xmm1, %xmm4 + +#ifndef TRMMKERNEL + movss 0 * SIZE(C1), %xmm0 + + addss %xmm0, %xmm4 +#endif + + movss %xmm4, 0 * SIZE(C1) + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/gemm_kernel_4x4_sse.S b/kernel/x86/gemm_kernel_4x4_sse.S new file mode 100644 index 0000000000..b360a58da1 --- /dev/null +++ b/kernel/x86/gemm_kernel_4x4_sse.S @@ -0,0 +1,2589 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 + +#define OLD_M 4 + STACK(%esi) +#define OLD_N 8 + STACK(%esi) +#define OLD_K 12 + STACK(%esi) +#define OLD_ALPHA 16 + STACK(%esi) +#define OLD_A 20 + STACK(%esi) +#define OLD_B 24 + STACK(%esi) +#define OLD_C 28 + STACK(%esi) +#define OLD_LDC 32 + STACK(%esi) +#define STACK_OFFT 36 + STACK(%esi) + +#define ALPHA 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define BX 40(%esp) +#define OLD_STACK 44(%esp) +#define OFFSET 48(%esp) +#define KK 52(%esp) +#define KKK 56(%esp) +#define BUFFER 128(%esp) + +#ifdef ATHLON +#define PREFETCH prefetch +#define PREFETCHSIZE 64 +#endif + +#if defined(OPTERON) || defined(BARCELONA) +#define PREFETCH prefetch +#define PREFETCHSIZE (16 * 10 + 8) +#endif + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCHSIZE 96 +#endif + +#define AA %edx +#define BB %ecx +#define LDC %ebp + +#if defined(OPTERON) || !defined(HAVE_SSE2) +#define movsd movlps +#endif + +#ifdef HAVE_SSE2 +#define xorps pxor +#endif + +#if defined(OPTERON) || defined(BARCELONA) +#define KERNEL1(address) \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm4; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL4(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL5(address) \ + PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 1 * SIZE(AA); \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 76 * SIZE + (address) * 4 * 
SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL6(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL7(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1; +#endif + +#ifdef PENTIUM4 +#define KERNEL1(address) \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + addps %xmm2, %xmm5; \ + movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL4(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 32 * 
SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL5(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL6(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL7(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1 +#endif + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl %esp, %esi # save old stack + subl $128 + LOCAL_BUFFER_SIZE, %esp + movl OLD_M, %ebx + andl $-1024, %esp # align stack + + STACK_TOUCHING + + movl OLD_N, %eax + movl OLD_K, %ecx + movl OLD_A, %edx + movss OLD_ALPHA, %xmm3 +#ifdef TRMMKERNEL + movss STACK_OFFT, %xmm4 +#endif + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK + + shufps $0, %xmm3, %xmm3 + movl OLD_B, %edi + movl OLD_C, %ebx + movaps %xmm3, ALPHA + + movl %ebx, C + movl OLD_LDC, LDC +#ifdef TRMMKERNEL + movss %xmm4, OFFSET + movss %xmm4, KK +#ifndef LEFT + negl KK +#endif +#endif + leal (, LDC, SIZE), LDC + + sarl $2, %eax + movl %eax, J + jle .L40 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + +/* Copying to Sub Buffer */ + leal BUFFER, %ecx + + movl K, %eax + sarl $1, %eax + jle .L05 + ALIGN_4 + +.L02: +#ifdef HAVE_SSE2 + movss 0 * SIZE(%edi), %xmm0 + movss 1 * SIZE(%edi), %xmm1 + movss 2 * SIZE(%edi), %xmm2 + movss 3 * SIZE(%edi), %xmm3 + movss 4 * SIZE(%edi), %xmm4 + movss 5 * SIZE(%edi), %xmm5 + movss 6 * SIZE(%edi), %xmm6 + movss 7 * SIZE(%edi), %xmm7 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + shufps $0, %xmm4, %xmm4 + shufps $0, %xmm5, %xmm5 + shufps $0, %xmm6, %xmm6 + shufps $0, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + movaps %xmm2, 8 * SIZE(%ecx) + movaps %xmm3, 12 * SIZE(%ecx) + movaps %xmm4, 16 * SIZE(%ecx) + movaps 
%xmm5, 20 * SIZE(%ecx) + movaps %xmm6, 24 * SIZE(%ecx) + movaps %xmm7, 28 * SIZE(%ecx) +#else + movd 0 * SIZE(%edi), %mm0 + movd 1 * SIZE(%edi), %mm1 + movd 2 * SIZE(%edi), %mm2 + movd 3 * SIZE(%edi), %mm3 + movd 4 * SIZE(%edi), %mm4 + movd 5 * SIZE(%edi), %mm5 + movd 6 * SIZE(%edi), %mm6 + movd 7 * SIZE(%edi), %mm7 + + movd %mm0, 0 * SIZE(%ecx) + movd %mm0, 1 * SIZE(%ecx) + movd %mm0, 2 * SIZE(%ecx) + movd %mm0, 3 * SIZE(%ecx) + movd %mm1, 4 * SIZE(%ecx) + movd %mm1, 5 * SIZE(%ecx) + movd %mm1, 6 * SIZE(%ecx) + movd %mm1, 7 * SIZE(%ecx) + movd %mm2, 8 * SIZE(%ecx) + movd %mm2, 9 * SIZE(%ecx) + movd %mm2, 10 * SIZE(%ecx) + movd %mm2, 11 * SIZE(%ecx) + movd %mm3, 12 * SIZE(%ecx) + movd %mm3, 13 * SIZE(%ecx) + movd %mm3, 14 * SIZE(%ecx) + movd %mm3, 15 * SIZE(%ecx) + movd %mm4, 16 * SIZE(%ecx) + movd %mm4, 17 * SIZE(%ecx) + movd %mm4, 18 * SIZE(%ecx) + movd %mm4, 19 * SIZE(%ecx) + movd %mm5, 20 * SIZE(%ecx) + movd %mm5, 21 * SIZE(%ecx) + movd %mm5, 22 * SIZE(%ecx) + movd %mm5, 23 * SIZE(%ecx) + movd %mm6, 24 * SIZE(%ecx) + movd %mm6, 25 * SIZE(%ecx) + movd %mm6, 26 * SIZE(%ecx) + movd %mm6, 27 * SIZE(%ecx) + movd %mm7, 28 * SIZE(%ecx) + movd %mm7, 29 * SIZE(%ecx) + movd %mm7, 30 * SIZE(%ecx) + movd %mm7, 31 * SIZE(%ecx) +#endif + +#ifdef PENTIUM4 + prefetcht2 112 * SIZE(%ecx) +#endif + +#if defined(OPTERON) || defined(BARCELONA) + prefetchnta 80 * SIZE(%edi) + prefetchw 112 * SIZE(%ecx) + prefetchw 120 * SIZE(%ecx) +#endif + + addl $ 8 * SIZE, %edi + addl $32 * SIZE, %ecx + decl %eax + jne .L02 + ALIGN_2 + +.L05: + movl K, %eax + andl $1, %eax + BRANCH + jle .L10 + + +#ifdef HAVE_SSE2 + movss 0 * SIZE(%edi), %xmm0 + movss 1 * SIZE(%edi), %xmm1 + movss 2 * SIZE(%edi), %xmm2 + movss 3 * SIZE(%edi), %xmm3 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + movaps %xmm2, 8 * SIZE(%ecx) + movaps %xmm3, 12 * SIZE(%ecx) +#else + movd 0 * SIZE(%edi), %mm0 + movd 1 * SIZE(%edi), %mm1 + movd 2 * SIZE(%edi), %mm2 + movd 3 * SIZE(%edi), %mm3 + + movd %mm0, 0 * SIZE(%ecx) + movd %mm0, 1 * SIZE(%ecx) + movd %mm0, 2 * SIZE(%ecx) + movd %mm0, 3 * SIZE(%ecx) + movd %mm1, 4 * SIZE(%ecx) + movd %mm1, 5 * SIZE(%ecx) + movd %mm1, 6 * SIZE(%ecx) + movd %mm1, 7 * SIZE(%ecx) + movd %mm2, 8 * SIZE(%ecx) + movd %mm2, 9 * SIZE(%ecx) + movd %mm2, 10 * SIZE(%ecx) + movd %mm2, 11 * SIZE(%ecx) + movd %mm3, 12 * SIZE(%ecx) + movd %mm3, 13 * SIZE(%ecx) + movd %mm3, 14 * SIZE(%ecx) + movd %mm3, 15 * SIZE(%ecx) +#endif + addl $4 * SIZE, %edi + ALIGN_4 + +.L10: + movl %edi, BX + + movl C, %esi # coffset = c + movl A, %edx # aoffset = a + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 8), BB +#endif + + movl BX, %eax + +#ifdef HAVE_SSE + + prefetcht2 0 * SIZE(%eax) + prefetcht2 4 * SIZE(%eax) + +#if L2_SIZE > 262144 + + subl $-8 * SIZE, BX + +#elif L2_SIZE > 131072 + + prefetcht2 8 * SIZE(%eax) + prefetcht2 12 * SIZE(%eax) + + + subl $-16 * SIZE, BX +#else + prefetcht2 16 * SIZE(%eax) + prefetcht2 20 * SIZE(%eax) + prefetcht2 24 * SIZE(%eax) + prefetcht2 28 * SIZE(%eax) + + subl $-32 * SIZE, BX +#endif +#endif + + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 
+ movaps 16 * SIZE(AA), %xmm1 + xorps %xmm5, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm7, %xmm7 + + leal (LDC, LDC, 2), %eax + +#if defined(OPTERON) || defined(BARCELONA) + prefetchw 4 * SIZE(%esi) + prefetchw 4 * SIZE(%esi, LDC) + prefetchw 4 * SIZE(%esi, LDC, 2) + prefetchw 4 * SIZE(%esi, %eax) +#endif + +#ifdef PENTIUM4 + prefetchnta 4 * SIZE(%esi) + prefetchnta 4 * SIZE(%esi, LDC) + prefetchnta 4 * SIZE(%esi, LDC, 2) + prefetchnta 4 * SIZE(%esi, %eax) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + +#if 1 + andl $-8, %eax + sall $4, %eax + je .L15 +.L1X: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + cmpl $128 * 1, %eax + jle .L12 + KERNEL1(32 * 1) + KERNEL2(32 * 1) + KERNEL3(32 * 1) + KERNEL4(32 * 1) + KERNEL5(32 * 1) + KERNEL6(32 * 1) + KERNEL7(32 * 1) + KERNEL8(32 * 1) + cmpl $128 * 2, %eax + jle .L12 + KERNEL1(32 * 2) + KERNEL2(32 * 2) + KERNEL3(32 * 2) + KERNEL4(32 * 2) + KERNEL5(32 * 2) + KERNEL6(32 * 2) + KERNEL7(32 * 2) + KERNEL8(32 * 2) + cmpl $128 * 3, %eax + jle .L12 + KERNEL1(32 * 3) + KERNEL2(32 * 3) + KERNEL3(32 * 3) + KERNEL4(32 * 3) + KERNEL5(32 * 3) + KERNEL6(32 * 3) + KERNEL7(32 * 3) + KERNEL8(32 * 3) + cmpl $128 * 4, %eax + jle .L12 + KERNEL1(32 * 4) + KERNEL2(32 * 4) + KERNEL3(32 * 4) + KERNEL4(32 * 4) + KERNEL5(32 * 4) + KERNEL6(32 * 4) + KERNEL7(32 * 4) + KERNEL8(32 * 4) + cmpl $128 * 5, %eax + jle .L12 + KERNEL1(32 * 5) + KERNEL2(32 * 5) + KERNEL3(32 * 5) + KERNEL4(32 * 5) + KERNEL5(32 * 5) + KERNEL6(32 * 5) + KERNEL7(32 * 5) + KERNEL8(32 * 5) + cmpl $128 * 6, %eax + jle .L12 + KERNEL1(32 * 6) + KERNEL2(32 * 6) + KERNEL3(32 * 6) + KERNEL4(32 * 6) + KERNEL5(32 * 6) + KERNEL6(32 * 6) + KERNEL7(32 * 6) + KERNEL8(32 * 6) + cmpl $128 * 7, %eax + jle .L12 + KERNEL1(32 * 7) + KERNEL2(32 * 7) + KERNEL3(32 * 7) + KERNEL4(32 * 7) + KERNEL5(32 * 7) + KERNEL6(32 * 7) + KERNEL7(32 * 7) + KERNEL8(32 * 7) + + addl $128 * 8 * SIZE, BB + addl $128 * 2 * SIZE, AA + subl $128 * 8, %eax + jg .L1X + jmp .L15 + +.L12: + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB + ALIGN_4 +#else + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + + addl $128 * SIZE, BB + addl $32 * SIZE, AA + decl %eax + jne .L12 + ALIGN_4 +#endif + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_4 + +.L16: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 4 * SIZE(AA), %xmm0 + + addl $ 4 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + shufps $0xe4, %xmm0, %xmm0 + shufps $0xe4, %xmm1, %xmm1 + shufps $0xe4, %xmm2, %xmm2 + shufps $0xe4, %xmm3, %xmm3 + + mulps %xmm3, %xmm4 + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + mulps %xmm3, %xmm5 + movsd 0 * SIZE(%esi, LDC, 1), %xmm1 + movhps 
2 * SIZE(%esi, LDC, 1), %xmm1 + mulps %xmm3, %xmm6 + movsd 0 * SIZE(%esi, LDC, 2), %xmm2 + movhps 2 * SIZE(%esi, LDC, 2), %xmm2 + mulps %xmm3, %xmm7 + movsd 0 * SIZE(%esi, %eax, 1), %xmm3 + movhps 2 * SIZE(%esi, %eax, 1), %xmm3 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm5 + addps %xmm2, %xmm6 + addps %xmm3, %xmm7 +#else + mulps %xmm3, %xmm4 + mulps %xmm3, %xmm5 + mulps %xmm3, %xmm6 + mulps %xmm3, %xmm7 +#endif + + movlps %xmm4, 0 * SIZE(%esi) + movhps %xmm4, 2 * SIZE(%esi) + movlps %xmm5, 0 * SIZE(%esi, LDC, 1) + movhps %xmm5, 2 * SIZE(%esi, LDC, 1) + movlps %xmm6, 0 * SIZE(%esi, LDC, 2) + movhps %xmm6, 2 * SIZE(%esi, LDC, 2) + movlps %xmm7, 0 * SIZE(%esi, %eax, 1) + movhps %xmm7, 2 * SIZE(%esi, %eax, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 8), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $4, KK +#endif + + addl $4 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L11 + ALIGN_4 + +.L20: + testl $2, M + je .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB +#endif + + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm5, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 44 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 64 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 60 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 80 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 68 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm5 + movaps 72 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, 
%xmm6 + movaps 76 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 96 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 84 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movaps 88 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 92 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 112 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 100 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm5 + movaps 104 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movaps 108 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 128 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 116 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movaps 120 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 124 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 144 * SIZE(BB), %xmm3 + + addl $ 16 * SIZE, AA + addl $128 * SIZE, BB + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 16 * SIZE(BB), %xmm2 + + addl $ 2 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + mulps %xmm3, %xmm4 +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(%esi), %xmm0 + mulps %xmm3, %xmm5 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(%esi, LDC, 1), %xmm1 + mulps %xmm3, %xmm6 +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd 0 * SIZE(%esi, LDC, 2), %xmm2 + mulps %xmm3, %xmm7 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd 0 * SIZE(%esi, %eax, 1), %xmm3 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm5 + addps %xmm2, %xmm6 + addps %xmm3, %xmm7 +#else + mulps %xmm3, %xmm4 + mulps %xmm3, %xmm5 + mulps %xmm3, %xmm6 + mulps %xmm3, %xmm7 +#endif + + movlps %xmm4, 0 * SIZE(%esi) + movlps %xmm5, 0 * SIZE(%esi, LDC, 1) + movlps %xmm6, 0 * SIZE(%esi, LDC, 2) + movlps %xmm7, 0 * SIZE(%esi, %eax, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, %esi # coffset += 2 + ALIGN_4 + +.L30: + testl $1, M + je .L39 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB + leal (BB, %eax, 8), BB +#endif + + movss 0 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 + movss 4 * SIZE(AA), %xmm1 + xorps %xmm5, %xmm5 + movss 0 * SIZE(BB), %xmm2 + xorps %xmm6, %xmm6 + movss 16 * SIZE(BB), %xmm3 + xorps %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif 
(defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L35 + ALIGN_4 + +.L32: + mulss %xmm0, %xmm2 + addss %xmm2, %xmm4 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movss 4 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + addss %xmm2, %xmm5 + movss 8 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 32 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 1 * SIZE(AA), %xmm0 + + mulss %xmm0, %xmm3 + addss %xmm3, %xmm4 + movss 20 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + addss %xmm3, %xmm5 + movss 24 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + mulss 28 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 48 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 2 * SIZE(AA), %xmm0 + + mulss %xmm0, %xmm2 + addss %xmm2, %xmm4 + movss 36 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + addss %xmm2, %xmm5 + movss 40 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + mulss 44 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 64 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 3 * SIZE(AA), %xmm0 + + mulss %xmm0, %xmm3 + addss %xmm3, %xmm4 + movss 52 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + addss %xmm3, %xmm5 + movss 56 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + mulss 60 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 80 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + + mulss %xmm1, %xmm2 + addss %xmm2, %xmm4 + movss 68 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + addss %xmm2, %xmm5 + movss 72 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + mulss 76 * SIZE(BB), %xmm1 + addss %xmm2, %xmm6 + movss 96 * SIZE(BB), %xmm2 + addss %xmm1, %xmm7 + movss 5 * SIZE(AA), %xmm1 + + mulss %xmm1, %xmm3 + addss %xmm3, %xmm4 + movss 84 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + addss %xmm3, %xmm5 + movss 88 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + mulss 92 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 112 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 6 * SIZE(AA), %xmm1 + + mulss %xmm1, %xmm2 + addss %xmm2, %xmm4 + movss 100 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + addss %xmm2, %xmm5 + movss 104 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + mulss 108 * SIZE(BB), %xmm1 + addss %xmm2, %xmm6 + movss 128 * SIZE(BB), %xmm2 + addss %xmm1, %xmm7 + movss 7 * SIZE(AA), %xmm1 + + mulss %xmm1, %xmm3 + addss %xmm3, %xmm4 + movss 116 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + addss %xmm3, %xmm5 + movss 120 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + mulss 124 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 144 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $128 * SIZE, BB + decl %eax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulss %xmm0, %xmm2 + addss %xmm2, %xmm4 + movss 4 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + addss %xmm2, %xmm5 + movss 8 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 16 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 1 * SIZE(AA), %xmm0 + + addl $ 1 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L36 + ALIGN_4 + +.L38: + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + mulss %xmm3, %xmm4 + movss 0 * SIZE(%esi), %xmm0 + mulss %xmm3, %xmm5 + movss 0 * SIZE(%esi, LDC, 1), %xmm1 + mulss %xmm3, %xmm6 + movss 0 * SIZE(%esi, LDC, 2), 
%xmm2 + mulss %xmm3, %xmm7 + movss 0 * SIZE(%esi, %eax, 1), %xmm3 + + addss %xmm0, %xmm4 + addss %xmm1, %xmm5 + addss %xmm2, %xmm6 + addss %xmm3, %xmm7 +#else + mulss %xmm3, %xmm4 + mulss %xmm3, %xmm5 + mulss %xmm3, %xmm6 + mulss %xmm3, %xmm7 +#endif + + movss %xmm4, 0 * SIZE(%esi) + movss %xmm5, 0 * SIZE(%esi, LDC, 1) + movss %xmm6, 0 * SIZE(%esi, LDC, 2) + movss %xmm7, 0 * SIZE(%esi, %eax, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB + leal (BB, %eax, 8), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + + leal (, LDC, 4), %eax + addl %eax, C # c += 4 * ldc + decl J # j -- + jg .L01 + ALIGN_4 + +.L40: + testl $2, N + je .L80 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + leal BUFFER, %ecx + sarl $2, %eax + jle .L45 + ALIGN_4 + +.L42: + prefetchnta 80 * SIZE(%edi) + +#if defined(OPTERON) || defined(BARCELONA) + prefetchw 112 * SIZE(%ecx) + prefetchw 120 * SIZE(%ecx) +#endif + +#ifdef PENTIUM4 + prefetcht1 112 * SIZE(%ecx) +#endif + +#ifdef HAVE_SSE2 + movss 0 * SIZE(%edi), %xmm0 + movss 1 * SIZE(%edi), %xmm1 + movss 2 * SIZE(%edi), %xmm2 + movss 3 * SIZE(%edi), %xmm3 + movss 4 * SIZE(%edi), %xmm4 + movss 5 * SIZE(%edi), %xmm5 + movss 6 * SIZE(%edi), %xmm6 + movss 7 * SIZE(%edi), %xmm7 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + shufps $0, %xmm4, %xmm4 + shufps $0, %xmm5, %xmm5 + shufps $0, %xmm6, %xmm6 + shufps $0, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + movaps %xmm2, 8 * SIZE(%ecx) + movaps %xmm3, 12 * SIZE(%ecx) + movaps %xmm4, 16 * SIZE(%ecx) + movaps %xmm5, 20 * SIZE(%ecx) + movaps %xmm6, 24 * SIZE(%ecx) + movaps %xmm7, 28 * SIZE(%ecx) +#else + movd 0 * SIZE(%edi), %mm0 + movd 1 * SIZE(%edi), %mm1 + movd 2 * SIZE(%edi), %mm2 + movd 3 * SIZE(%edi), %mm3 + movd 4 * SIZE(%edi), %mm4 + movd 5 * SIZE(%edi), %mm5 + movd 6 * SIZE(%edi), %mm6 + movd 7 * SIZE(%edi), %mm7 + + movd %mm0, 0 * SIZE(%ecx) + movd %mm0, 1 * SIZE(%ecx) + movd %mm0, 2 * SIZE(%ecx) + movd %mm0, 3 * SIZE(%ecx) + movd %mm1, 4 * SIZE(%ecx) + movd %mm1, 5 * SIZE(%ecx) + movd %mm1, 6 * SIZE(%ecx) + movd %mm1, 7 * SIZE(%ecx) + movd %mm2, 8 * SIZE(%ecx) + movd %mm2, 9 * SIZE(%ecx) + movd %mm2, 10 * SIZE(%ecx) + movd %mm2, 11 * SIZE(%ecx) + movd %mm3, 12 * SIZE(%ecx) + movd %mm3, 13 * SIZE(%ecx) + movd %mm3, 14 * SIZE(%ecx) + movd %mm3, 15 * SIZE(%ecx) + movd %mm4, 16 * SIZE(%ecx) + movd %mm4, 17 * SIZE(%ecx) + movd %mm4, 18 * SIZE(%ecx) + movd %mm4, 19 * SIZE(%ecx) + movd %mm5, 20 * SIZE(%ecx) + movd %mm5, 21 * SIZE(%ecx) + movd %mm5, 22 * SIZE(%ecx) + movd %mm5, 23 * SIZE(%ecx) + movd %mm6, 24 * SIZE(%ecx) + movd %mm6, 25 * SIZE(%ecx) + movd %mm6, 26 * SIZE(%ecx) + movd %mm6, 27 * SIZE(%ecx) + movd %mm7, 28 * SIZE(%ecx) + movd %mm7, 29 * SIZE(%ecx) + movd %mm7, 30 * SIZE(%ecx) + movd %mm7, 31 * SIZE(%ecx) +#endif + addl $ 8 * SIZE, %edi + addl $32 * SIZE, %ecx + decl %eax + jne .L42 + ALIGN_4 + +.L45: + movl K, %eax + andl $3, %eax + BRANCH + jle .L50 + ALIGN_4 + +.L46: + +#ifdef HAVE_SSE2 + movss 0 * SIZE(%edi), %xmm0 + movss 1 * SIZE(%edi), %xmm1 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * 
SIZE(%ecx) +#else + movd 0 * SIZE(%edi), %mm0 + movd 1 * SIZE(%edi), %mm1 + + movd %mm0, 0 * SIZE(%ecx) + movd %mm0, 1 * SIZE(%ecx) + movd %mm0, 2 * SIZE(%ecx) + movd %mm0, 3 * SIZE(%ecx) + movd %mm1, 4 * SIZE(%ecx) + movd %mm1, 5 * SIZE(%ecx) + movd %mm1, 6 * SIZE(%ecx) + movd %mm1, 7 * SIZE(%ecx) +#endif + addl $2 * SIZE, %edi + addl $8 * SIZE, %ecx + decl %eax + jne .L46 + ALIGN_4 + +.L50: + movl C, %esi # coffset = c + movl A, %edx # aoffset = a + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movaps 0 * SIZE(AA), %xmm0 + movaps 16 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + +#ifdef HAVE_3DNOW + prefetchw 4 * SIZE(%esi) + prefetchw 4 * SIZE(%esi, LDC) +#elif defined(HAVE_SSE) || defined(HAVE_SSE2) + prefetcht2 4 * SIZE(%esi) + prefetcht2 4 * SIZE(%esi, LDC) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + mulps %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 8 * SIZE(AA), %xmm0 + + mulps %xmm0, %xmm3 + mulps 20 * SIZE(BB), %xmm0 + addps %xmm3, %xmm4 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm0, %xmm5 + movaps 12 * SIZE(AA), %xmm0 + + mulps %xmm0, %xmm3 + mulps 28 * SIZE(BB), %xmm0 + addps %xmm3, %xmm4 + movaps 48 * SIZE(BB), %xmm3 + addps %xmm0, %xmm5 + movaps 32 * SIZE(AA), %xmm0 + +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) +#endif + mulps %xmm1, %xmm2 + mulps 36 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 40 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 20 * SIZE(AA), %xmm1 + + mulps %xmm1, %xmm2 + mulps 44 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 24 * SIZE(AA), %xmm1 + + mulps %xmm1, %xmm3 + mulps 52 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 56 * SIZE(BB), %xmm3 + addps %xmm1, %xmm5 + movaps 28 * SIZE(AA), %xmm1 + + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 80 * SIZE(BB), %xmm3 + addps %xmm1, %xmm5 + movaps 48 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L52 + ALIGN_4 + +.L55: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: + mulps %xmm3, %xmm4 + mulps %xmm3, %xmm5 + +#ifndef TRMMKERNEL 
+ movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + movsd 0 * SIZE(%esi, LDC, 1), %xmm1 + movhps 2 * SIZE(%esi, LDC, 1), %xmm1 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm5 +#endif + + movlps %xmm4, 0 * SIZE(%esi) + movhps %xmm4, 2 * SIZE(%esi) + movlps %xmm5, 0 * SIZE(%esi, LDC, 1) + movhps %xmm5, 2 * SIZE(%esi, LDC, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $4, KK +#endif + + addl $4 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L51 + ALIGN_4 + +.L60: + testl $2, M + je .L70 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movaps 0 * SIZE(AA), %xmm0 + movaps 8 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L65 + ALIGN_4 + +.L62: +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movaps 44 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 64 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 80 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L62 + ALIGN_4 + +.L65: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), 
%xmm2 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L66 + ALIGN_4 + +.L68: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + mulps %xmm3, %xmm4 + mulps %xmm3, %xmm5 + +#ifndef TRMMKERNEL +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(%esi), %xmm0 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(%esi, LDC, 1), %xmm1 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm5 +#endif + + movlps %xmm4, 0 * SIZE(%esi) + movlps %xmm5, 0 * SIZE(%esi, LDC, 1) + addl $2 * SIZE, %esi # coffset += 2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + ALIGN_4 + +.L70: + testl $1, M + je .L79 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movss 0 * SIZE(AA), %xmm0 + movss 4 * SIZE(AA), %xmm1 + movss 0 * SIZE(BB), %xmm2 + movss 16 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + mulss %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + mulss 4 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 8 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 1 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm2 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 32 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 2 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 20 * SIZE(BB), %xmm0 + addss %xmm3, %xmm4 + movss 24 * SIZE(BB), %xmm3 + addss %xmm0, %xmm5 + movss 3 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 28 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 48 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm2 + mulss 36 * SIZE(BB), %xmm1 + addss %xmm2, %xmm4 + movss 40 * SIZE(BB), %xmm2 + addss %xmm1, %xmm5 + movss 5 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm2 + mulss 44 * SIZE(BB), %xmm1 + addss %xmm2, %xmm6 + movss 64 * SIZE(BB), %xmm2 + addss %xmm1, %xmm7 + movss 6 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 52 * SIZE(BB), %xmm1 + addss %xmm3, %xmm4 + movss 56 * SIZE(BB), %xmm3 + addss %xmm1, %xmm5 + movss 7 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 60 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 80 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L72 + ALIGN_4 + +.L75: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulss %xmm0, %xmm2 + mulss 4 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 8 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 1 * SIZE(AA), %xmm0 + + addl $ 1 * SIZE, AA + addl $ 8 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: + addss %xmm6, %xmm4 + 
addss %xmm7, %xmm5 + + mulss %xmm3, %xmm4 + mulss %xmm3, %xmm5 + +#ifndef TRMMKERNEL + movss 0 * SIZE(%esi), %xmm0 + movss 0 * SIZE(%esi, LDC, 1), %xmm1 + + addss %xmm0, %xmm4 + addss %xmm1, %xmm5 +#endif + + movss %xmm4, 0 * SIZE(%esi) + movss %xmm5, 0 * SIZE(%esi, LDC, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_4 + +.L79: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + leal (, LDC, 2), %eax + addl %eax, C + ALIGN_4 + +.L80: + testl $1, N + je .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + leal BUFFER, %ecx + sarl $3, %eax + jle .L85 + ALIGN_4 + +.L82: + prefetchnta 80 * SIZE(%edi) + +#if defined(OPTERON) || defined(BARCELONA) + prefetchw 112 * SIZE(%ecx) + prefetchw 120 * SIZE(%ecx) +#endif + +#ifdef PENTIUM4 + prefetcht1 112 * SIZE(%ecx) +#endif + +#ifdef HAVE_SSE2 + movss 0 * SIZE(%edi), %xmm0 + movss 1 * SIZE(%edi), %xmm1 + movss 2 * SIZE(%edi), %xmm2 + movss 3 * SIZE(%edi), %xmm3 + movss 4 * SIZE(%edi), %xmm4 + movss 5 * SIZE(%edi), %xmm5 + movss 6 * SIZE(%edi), %xmm6 + movss 7 * SIZE(%edi), %xmm7 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + shufps $0, %xmm4, %xmm4 + shufps $0, %xmm5, %xmm5 + shufps $0, %xmm6, %xmm6 + shufps $0, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + movaps %xmm2, 8 * SIZE(%ecx) + movaps %xmm3, 12 * SIZE(%ecx) + movaps %xmm4, 16 * SIZE(%ecx) + movaps %xmm5, 20 * SIZE(%ecx) + movaps %xmm6, 24 * SIZE(%ecx) + movaps %xmm7, 28 * SIZE(%ecx) +#else + movd 0 * SIZE(%edi), %mm0 + movd 1 * SIZE(%edi), %mm1 + movd 2 * SIZE(%edi), %mm2 + movd 3 * SIZE(%edi), %mm3 + movd 4 * SIZE(%edi), %mm4 + movd 5 * SIZE(%edi), %mm5 + movd 6 * SIZE(%edi), %mm6 + movd 7 * SIZE(%edi), %mm7 + + movd %mm0, 0 * SIZE(%ecx) + movd %mm0, 1 * SIZE(%ecx) + movd %mm0, 2 * SIZE(%ecx) + movd %mm0, 3 * SIZE(%ecx) + movd %mm1, 4 * SIZE(%ecx) + movd %mm1, 5 * SIZE(%ecx) + movd %mm1, 6 * SIZE(%ecx) + movd %mm1, 7 * SIZE(%ecx) + movd %mm2, 8 * SIZE(%ecx) + movd %mm2, 9 * SIZE(%ecx) + movd %mm2, 10 * SIZE(%ecx) + movd %mm2, 11 * SIZE(%ecx) + movd %mm3, 12 * SIZE(%ecx) + movd %mm3, 13 * SIZE(%ecx) + movd %mm3, 14 * SIZE(%ecx) + movd %mm3, 15 * SIZE(%ecx) + movd %mm4, 16 * SIZE(%ecx) + movd %mm4, 17 * SIZE(%ecx) + movd %mm4, 18 * SIZE(%ecx) + movd %mm4, 19 * SIZE(%ecx) + movd %mm5, 20 * SIZE(%ecx) + movd %mm5, 21 * SIZE(%ecx) + movd %mm5, 22 * SIZE(%ecx) + movd %mm5, 23 * SIZE(%ecx) + movd %mm6, 24 * SIZE(%ecx) + movd %mm6, 25 * SIZE(%ecx) + movd %mm6, 26 * SIZE(%ecx) + movd %mm6, 27 * SIZE(%ecx) + movd %mm7, 28 * SIZE(%ecx) + movd %mm7, 29 * SIZE(%ecx) + movd %mm7, 30 * SIZE(%ecx) + movd %mm7, 31 * SIZE(%ecx) +#endif + addl $ 8 * SIZE, %edi + addl $32 * SIZE, %ecx + decl %eax + jne .L82 + ALIGN_4 + +.L85: + movl K, %eax + andl $7, %eax + BRANCH + jle .L90 + ALIGN_4 + +.L86: + +#ifdef HAVE_SSE2 + movss 0 * SIZE(%edi), %xmm0 + shufps $0, %xmm0, %xmm0 + movaps %xmm0, 0 * SIZE(%ecx) +#else + movd 0 * SIZE(%edi), %mm0 + movd %mm0, 0 * SIZE(%ecx) + movd %mm0, 1 * SIZE(%ecx) + movd %mm0, 2 * SIZE(%ecx) + movd %mm0, 3 * SIZE(%ecx) +#endif + addl $1 * SIZE, %edi + addl $4 * SIZE, %ecx + decl %eax + jne .L86 + ALIGN_4 + +.L90: + movl C, %esi # coffset = c + movl 
A, %edx # aoffset = a + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L100 + ALIGN_4 + +.L91: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movaps 0 * SIZE(AA), %xmm0 + movaps 16 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + +#ifdef HAVE_3DNOW + prefetchw 4 * SIZE(%esi) +#elif defined(HAVE_SSE) || defined(HAVE_SSE2) + prefetcht2 4 * SIZE(%esi) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L95 + ALIGN_4 + +.L92: + mulps %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movaps 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movaps 32 * SIZE(BB), %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm0, %xmm5 + movaps 8 * SIZE(AA), %xmm0 + mulps 8 * SIZE(BB), %xmm0 + addps %xmm0, %xmm6 + movaps 12 * SIZE(AA), %xmm0 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) +#endif + mulps %xmm1, %xmm3 + movaps 20 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movaps 48 * SIZE(BB), %xmm3 + mulps 20 * SIZE(BB), %xmm1 + addps %xmm1, %xmm5 + movaps 24 * SIZE(AA), %xmm1 + mulps 24 * SIZE(BB), %xmm1 + addps %xmm1, %xmm6 + movaps 28 * SIZE(AA), %xmm1 + mulps 28 * SIZE(BB), %xmm1 + addps %xmm1, %xmm7 + movaps 48 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L92 + ALIGN_4 + +.L95: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L98 + ALIGN_4 + +.L96: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(AA), %xmm0 + movaps 4 * SIZE(BB), %xmm2 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L96 + ALIGN_4 + +.L98: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + addps %xmm6, %xmm4 + mulps %xmm3, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + + addps %xmm0, %xmm4 +#endif + + movlps %xmm4, 0 * SIZE(%esi) + movhps %xmm4, 2 * SIZE(%esi) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $4, KK +#endif + + addl $4 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L91 + ALIGN_4 + +.L100: + testl $2, M + je .L110 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movaps 0 * SIZE(AA), %xmm0 + 
movaps 8 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L105 + ALIGN_4 + +.L102: + mulps %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L102 + ALIGN_4 + +.L105: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L108 + ALIGN_4 + +.L106: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + movaps 4 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L106 + ALIGN_4 + +.L108: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + addps %xmm6, %xmm4 + + mulps %xmm3, %xmm4 + +#ifndef TRMMKERNEL +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(%esi), %xmm0 + + addps %xmm0, %xmm4 +#endif + movlps %xmm4, 0 * SIZE(%esi) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, %esi # coffset += 2 + ALIGN_4 + +.L110: + testl $1, M + je .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movss 0 * SIZE(AA), %xmm0 + movss 4 * SIZE(AA), %xmm1 + movss 0 * SIZE(BB), %xmm2 + movss 16 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L115 + ALIGN_4 + +.L112: + mulss %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movss 1 * SIZE(AA), %xmm0 + addss %xmm2, %xmm4 + movss 32 * SIZE(BB), %xmm2 + mulss 4 * SIZE(BB), %xmm0 + addss %xmm0, %xmm5 + 
movss 2 * SIZE(AA), %xmm0 + mulss 8 * SIZE(BB), %xmm0 + addss %xmm0, %xmm6 + movss 3 * SIZE(AA), %xmm0 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm3 + movss 5 * SIZE(AA), %xmm1 + addss %xmm3, %xmm4 + movss 48 * SIZE(BB), %xmm3 + mulss 20 * SIZE(BB), %xmm1 + addss %xmm1, %xmm5 + movss 6 * SIZE(AA), %xmm1 + mulss 24 * SIZE(BB), %xmm1 + addss %xmm1, %xmm6 + movss 7 * SIZE(AA), %xmm1 + mulss 28 * SIZE(BB), %xmm1 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L112 + ALIGN_4 + +.L115: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulss %xmm0, %xmm2 + movss 1 * SIZE(AA), %xmm0 + addss %xmm2, %xmm4 + movss 4 * SIZE(BB), %xmm2 + + addl $ 1 * SIZE, AA + addl $ 4 * SIZE, BB + decl %eax + jg .L116 + ALIGN_4 + +.L118: + addss %xmm5, %xmm4 + addss %xmm7, %xmm6 + addss %xmm6, %xmm4 + mulss %xmm3, %xmm4 +#ifndef TRMMKERNEL + movss 0 * SIZE(%esi), %xmm0 + addss %xmm0, %xmm4 +#endif + movss %xmm4, 0 * SIZE(%esi) + ALIGN_4 + +.L999: + EMMS + + movl OLD_STACK, %esp + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/gemm_kernel_4x4_sse3.S b/kernel/x86/gemm_kernel_4x4_sse3.S new file mode 100644 index 0000000000..78efab6c4b --- /dev/null +++ b/kernel/x86/gemm_kernel_4x4_sse3.S @@ -0,0 +1,2090 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 + +#define OLD_M 4 + STACK(%esi) +#define OLD_N 8 + STACK(%esi) +#define OLD_K 12 + STACK(%esi) +#define OLD_ALPHA 16 + STACK(%esi) +#define OLD_A 20 + STACK(%esi) +#define OLD_B 24 + STACK(%esi) +#define OLD_C 28 + STACK(%esi) +#define OLD_LDC 32 + STACK(%esi) +#define STACK_OFFT 36 + STACK(%esi) + +#define ALPHA 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define BUFFER 128(%esp) + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht0 +#define PREFETCHSIZE 96 +#endif + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCHSIZE 96 +#endif + +#ifdef PENTIUMM +#define PREFETCH prefetcht0 +#define PREFETCHSIZE 96 +#endif + +#define AA %edx +#define BB %ecx +#define LDC %ebp + +#define KERNEL1(address) \ + mulps %xmm0, %xmm2; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * SIZE(AA); \ + addps %xmm2, %xmm4; \ + movshdup 0 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movsldup 4 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm6; \ + movshdup 4 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + movaps 4 * SIZE + 1 * (address) * SIZE(AA), %xmm0; \ + addps %xmm2, %xmm7; \ + movsldup 8 * SIZE + 2 * (address) * SIZE(BB), %xmm2 + +#define KERNEL2(address) \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm4; \ + movshdup 8 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movsldup 12 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm6; \ + movshdup 12 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + movaps 8 * SIZE + 1 * (address) * SIZE(AA), %xmm0; \ + addps %xmm2, %xmm7; \ + movsldup 32 * SIZE + 2 * (address) * SIZE(BB), %xmm2 + +#define KERNEL3(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movshdup 16 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movsldup 20 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm6; \ + movshdup 20 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + movaps 12 * SIZE + 1 * (address) * SIZE(AA), %xmm0; \ + addps %xmm3, %xmm7; \ + movsldup 24 * SIZE + 2 * (address) * SIZE(BB), %xmm3 + +#define KERNEL4(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movshdup 24 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movsldup 28 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm6; \ + movshdup 28 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + movaps 32 * SIZE + 1 * (address) * SIZE(AA), %xmm0; \ + addps %xmm3, %xmm7; \ + movsldup 48 * SIZE + 2 * (address) * SIZE(BB), %xmm3 + +#define KERNEL5(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movshdup 32 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movsldup 36 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm6; \ + movshdup 36 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + movaps 20 * SIZE + 1 * (address) * SIZE(AA), %xmm1; \ + addps %xmm2, 
%xmm7 + +#define KERNEL6(address) \ + movsldup 40 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movshdup 40 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movsldup 44 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm6; \ + movshdup 44 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + movaps 24 * SIZE + 1 * (address) * SIZE(AA), %xmm1; \ + addps %xmm2, %xmm7; \ + movsldup 64 * SIZE + 2 * (address) * SIZE(BB), %xmm2 + +#define KERNEL7(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movshdup 48 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movsldup 52 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm6; \ + movshdup 52 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + movaps 28 * SIZE + 1 * (address) * SIZE(AA), %xmm1; \ + addps %xmm3, %xmm7; \ + movsldup 56 * SIZE + 2 * (address) * SIZE(BB), %xmm3 + +#define KERNEL8(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movshdup 56 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movsldup 60 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm6; \ + movshdup 60 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + movaps 48 * SIZE + 1 * (address) * SIZE(AA), %xmm1; \ + addps %xmm3, %xmm7; \ + movsldup 80 * SIZE + 2 * (address) * SIZE(BB), %xmm3 + + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl %esp, %esi # save old stack + subl $128 + LOCAL_BUFFER_SIZE, %esp + movl OLD_M, %ebx + andl $-1024, %esp # align stack + + STACK_TOUCHING + + movl OLD_N, %eax + movl OLD_K, %ecx + movl OLD_A, %edx + movss OLD_ALPHA, %xmm3 +#ifdef TRMMKERNEL + movss STACK_OFFT, %xmm4 +#endif + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK + + shufps $0, %xmm3, %xmm3 + movl OLD_B, %edi + movl OLD_C, %ebx + movaps %xmm3, ALPHA + + movl %ebx, C + movl OLD_LDC, LDC +#ifdef TRMMKERNEL + movss %xmm4, OFFSET + movss %xmm4, KK +#ifndef LEFT + negl KK +#endif +#endif + leal (, LDC, SIZE), LDC + + sarl $2, %eax + movl %eax, J + jle .L40 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + +/* Copying to Sub Buffer */ + leal BUFFER, %ecx + + movl K, %eax + sarl $2, %eax + jle .L05 + ALIGN_4 + +.L02: + movddup 0 * SIZE(%edi), %xmm0 + movddup 2 * SIZE(%edi), %xmm1 + movddup 4 * SIZE(%edi), %xmm2 + movddup 6 * SIZE(%edi), %xmm3 + movddup 8 * SIZE(%edi), %xmm4 + movddup 10 * SIZE(%edi), %xmm5 + movddup 12 * SIZE(%edi), %xmm6 + movddup 14 * SIZE(%edi), %xmm7 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + movaps %xmm2, 8 * SIZE(%ecx) + movaps %xmm3, 12 * SIZE(%ecx) + movaps %xmm4, 16 * SIZE(%ecx) + movaps %xmm5, 20 * SIZE(%ecx) + movaps %xmm6, 24 * SIZE(%ecx) + movaps %xmm7, 28 * SIZE(%ecx) + +# prefetcht1 128 * SIZE(%ecx) + prefetcht0 112 * SIZE(%edi) + + addl $16 * SIZE, %edi + addl $32 * SIZE, %ecx + decl %eax + jne .L02 + ALIGN_2 + +.L05: + movl K, %eax + andl $3, %eax + BRANCH + jle .L10 + ALIGN_2 + +.L06: + movddup 0 * SIZE(%edi), %xmm0 + movddup 2 * SIZE(%edi), %xmm1 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + + addl $4 * SIZE, %edi + addl $8 * SIZE, %ecx + decl %eax + jne .L06 + ALIGN_4 + +.L10: + movl C, %esi # coffset = c + movl A, %edx # aoffset = a 
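
[Note] The copy loop at .L02/.L06 above packs the current four-column panel of B into the on-stack BUFFER, duplicating each pair of values so the micro-kernel can broadcast b0..b3 with movsldup/movshdup from aligned 16-byte loads (movsldup of {b0,b1,b0,b1} yields {b0,b0,b0,b0}, movshdup yields {b1,b1,b1,b1}). A rough C sketch of the effect of that packing, using illustrative names that are not part of this file, is:

    /* Illustration only, not the shipped packing routine: expand one
     * four-column B panel (4 floats per k step) into the duplicated
     * layout consumed by the micro-kernel (8 floats per k step). */
    void pack_b_n4(int k, const float *b, float *buffer)
    {
        for (int p = 0; p < k; p++) {
            const float *src = b + 4 * p;      /* b[p][0..3]              */
            float *dst = buffer + 8 * p;       /* {b0,b1,b0,b1,b2,b3,b2,b3} */

            dst[0] = src[0]; dst[1] = src[1];
            dst[2] = src[0]; dst[3] = src[1];
            dst[4] = src[2]; dst[5] = src[3];
            dst[6] = src[2]; dst[7] = src[3];
        }
    }

The unrolled .L02 loop covers four k steps per iteration (16 floats read from B, 32 written to BUFFER); .L06 finishes the K mod 4 remainder one step at a time.
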
+ movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + + movaps 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps 16 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movsldup 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movsldup 16 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + + leal (LDC, LDC, 2), %eax + + prefetchnta 4 * SIZE(%esi) + prefetchnta 4 * SIZE(%esi, LDC) + prefetchnta 4 * SIZE(%esi, LDC, 2) + prefetchnta 4 * SIZE(%esi, %eax) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + +#if 1 + andl $-8, %eax + sall $4, %eax + je .L15 +.L1X: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + cmpl $128 * 1, %eax + jle .L12 + KERNEL1(32 * 1) + KERNEL2(32 * 1) + KERNEL3(32 * 1) + KERNEL4(32 * 1) + KERNEL5(32 * 1) + KERNEL6(32 * 1) + KERNEL7(32 * 1) + KERNEL8(32 * 1) + cmpl $128 * 2, %eax + jle .L12 + KERNEL1(32 * 2) + KERNEL2(32 * 2) + KERNEL3(32 * 2) + KERNEL4(32 * 2) + KERNEL5(32 * 2) + KERNEL6(32 * 2) + KERNEL7(32 * 2) + KERNEL8(32 * 2) + cmpl $128 * 3, %eax + jle .L12 + KERNEL1(32 * 3) + KERNEL2(32 * 3) + KERNEL3(32 * 3) + KERNEL4(32 * 3) + KERNEL5(32 * 3) + KERNEL6(32 * 3) + KERNEL7(32 * 3) + KERNEL8(32 * 3) + cmpl $128 * 4, %eax + jle .L12 + KERNEL1(32 * 4) + KERNEL2(32 * 4) + KERNEL3(32 * 4) + KERNEL4(32 * 4) + KERNEL5(32 * 4) + KERNEL6(32 * 4) + KERNEL7(32 * 4) + KERNEL8(32 * 4) + cmpl $128 * 5, %eax + jle .L12 + KERNEL1(32 * 5) + KERNEL2(32 * 5) + KERNEL3(32 * 5) + KERNEL4(32 * 5) + KERNEL5(32 * 5) + KERNEL6(32 * 5) + KERNEL7(32 * 5) + KERNEL8(32 * 5) + cmpl $128 * 6, %eax + jle .L12 + KERNEL1(32 * 6) + KERNEL2(32 * 6) + KERNEL3(32 * 6) + KERNEL4(32 * 6) + KERNEL5(32 * 6) + KERNEL6(32 * 6) + KERNEL7(32 * 6) + KERNEL8(32 * 6) + cmpl $128 * 7, %eax + jle .L12 + KERNEL1(32 * 7) + KERNEL2(32 * 7) + KERNEL3(32 * 7) + KERNEL4(32 * 7) + KERNEL5(32 * 7) + KERNEL6(32 * 7) + KERNEL7(32 * 7) + KERNEL8(32 * 7) +#if 1 + cmpl $128 * 8, %eax + jle .L12 + KERNEL1(32 * 8) + KERNEL2(32 * 8) + KERNEL3(32 * 8) + KERNEL4(32 * 8) + KERNEL5(32 * 8) + KERNEL6(32 * 8) + KERNEL7(32 * 8) + KERNEL8(32 * 8) + cmpl $128 * 9, %eax + jle .L12 + KERNEL1(32 * 9) + KERNEL2(32 * 9) + KERNEL3(32 * 9) + KERNEL4(32 * 9) + KERNEL5(32 * 9) + KERNEL6(32 * 9) + KERNEL7(32 * 9) + KERNEL8(32 * 9) + cmpl $128 * 10, %eax + jle .L12 + KERNEL1(32 * 10) + KERNEL2(32 * 10) + KERNEL3(32 * 10) + KERNEL4(32 * 10) + KERNEL5(32 * 10) + KERNEL6(32 * 10) + KERNEL7(32 * 10) + KERNEL8(32 * 10) + cmpl $128 * 11, %eax + jle .L12 + KERNEL1(32 * 11) + KERNEL2(32 * 11) + KERNEL3(32 * 11) + KERNEL4(32 * 11) + KERNEL5(32 * 11) + KERNEL6(32 * 11) + KERNEL7(32 * 11) + KERNEL8(32 * 11) + cmpl $128 * 12, %eax + jle .L12 + KERNEL1(32 * 12) + KERNEL2(32 * 12) + KERNEL3(32 * 12) + KERNEL4(32 * 12) + KERNEL5(32 * 12) + KERNEL6(32 * 12) + KERNEL7(32 * 12) + KERNEL8(32 * 12) + cmpl $128 * 13, %eax + jle .L12 + KERNEL1(32 * 13) + KERNEL2(32 * 13) + KERNEL3(32 * 13) + KERNEL4(32 * 13) + KERNEL5(32 * 13) 
+ KERNEL6(32 * 13) + KERNEL7(32 * 13) + KERNEL8(32 * 13) + cmpl $128 * 14, %eax + jle .L12 + KERNEL1(32 * 14) + KERNEL2(32 * 14) + KERNEL3(32 * 14) + KERNEL4(32 * 14) + KERNEL5(32 * 14) + KERNEL6(32 * 14) + KERNEL7(32 * 14) + KERNEL8(32 * 14) + cmpl $128 * 15, %eax + jle .L12 + KERNEL1(32 * 15) + KERNEL2(32 * 15) + KERNEL3(32 * 15) + KERNEL4(32 * 15) + KERNEL5(32 * 15) + KERNEL6(32 * 15) + KERNEL7(32 * 15) + KERNEL8(32 * 15) +#else + addl $128 * 4 * SIZE, BB + addl $128 * 2 * SIZE, AA + subl $128 * 8, %eax + jg .L1X + jmp .L15 +#endif + +.L12: + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB + ALIGN_4 +#else + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + KERNEL1(32 * 7) + KERNEL2(32 * 7) + KERNEL3(32 * 7) + KERNEL4(32 * 7) + KERNEL5(32 * 7) + KERNEL6(32 * 7) + KERNEL7(32 * 7) + KERNEL8(32 * 7) + + addl $32 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L12 + ALIGN_4 +#endif + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_4 + +.L16: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movshdup 0 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movsldup 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movshdup 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movsldup 8 * SIZE(BB), %xmm2 + + addl $4 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + shufps $0xe4, %xmm0, %xmm0 + shufps $0xe4, %xmm1, %xmm1 + shufps $0xe4, %xmm2, %xmm2 + shufps $0xe4, %xmm3, %xmm3 + + mulps %xmm3, %xmm4 + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + mulps %xmm3, %xmm5 + movsd 0 * SIZE(%esi, LDC, 1), %xmm1 + movhps 2 * SIZE(%esi, LDC, 1), %xmm1 + mulps %xmm3, %xmm6 + movsd 0 * SIZE(%esi, LDC, 2), %xmm2 + movhps 2 * SIZE(%esi, LDC, 2), %xmm2 + mulps %xmm3, %xmm7 + movsd 0 * SIZE(%esi, %eax, 1), %xmm3 + movhps 2 * SIZE(%esi, %eax, 1), %xmm3 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm5 + addps %xmm2, %xmm6 + addps %xmm3, %xmm7 +#else + mulps %xmm3, %xmm4 + mulps %xmm3, %xmm5 + mulps %xmm3, %xmm6 + mulps %xmm3, %xmm7 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movhps %xmm4, 2 * SIZE(%esi) + movsd %xmm5, 0 * SIZE(%esi, LDC, 1) + movhps %xmm5, 2 * SIZE(%esi, LDC, 1) + movsd %xmm6, 0 * SIZE(%esi, LDC, 2) + movhps %xmm6, 2 * SIZE(%esi, LDC, 2) + movsd %xmm7, 0 * SIZE(%esi, %eax, 1) + movhps %xmm7, 2 * SIZE(%esi, %eax, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $4, KK +#endif + + addl $4 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L11 + ALIGN_4 + +.L20: + testl $2, M + je .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + + movddup 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movddup 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movsd 0 * SIZE(BB), %xmm2 + movsd 16 * SIZE(BB), %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + 
subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + addps %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + movddup 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 8 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movsd 12 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + movddup 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 32 * SIZE(BB), %xmm2 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movsd 20 * SIZE(BB), %xmm3 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm0, %xmm3 + movddup 6 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movsd 24 * SIZE(BB), %xmm3 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movsd 28 * SIZE(BB), %xmm3 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm0, %xmm3 + movddup 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movsd 48 * SIZE(BB), %xmm3 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movsd 36 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm1, %xmm2 + movddup 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movsd 40 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movsd 44 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm1, %xmm2 + movddup 12 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movsd 64 * SIZE(BB), %xmm2 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movsd 52 * SIZE(BB), %xmm3 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm1, %xmm3 + movddup 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsd 56 * SIZE(BB), %xmm3 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movsd 60 * SIZE(BB), %xmm3 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm1, %xmm3 + movddup 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsd 80 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + movddup 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 8 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: + leal (LDC, LDC, 2), %eax + + mulps %xmm3, %xmm4 + mulps %xmm3, %xmm5 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + movhps 0 * SIZE(%esi, LDC, 1), %xmm0 + movsd 0 * SIZE(%esi, LDC, 2), %xmm1 + movhps 0 * SIZE(%esi, %eax, 1), %xmm1 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm5 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movhps %xmm4, 0 * SIZE(%esi, LDC, 1) + movsd %xmm5, 0 * SIZE(%esi, LDC, 2) + movhps %xmm5, 0 * SIZE(%esi, %eax, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, %esi # coffset += 2 + ALIGN_4 + +.L30: + testl $1, M + je .L39 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) 
|| \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB +#endif + + movss 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movss 4 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movsd 0 * SIZE(BB), %xmm2 + movsd 16 * SIZE(BB), %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L35 + ALIGN_4 + +.L32: + shufps $0, %xmm0, %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + movhps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movss 1 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movsd 8 * SIZE(BB), %xmm2 + shufps $0, %xmm0, %xmm0 + movhps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movss 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movhps 20 * SIZE(BB), %xmm3 + shufps $0, %xmm0, %xmm0 + movsd 32 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + movss 3 * SIZE(AA), %xmm0 + addps %xmm3, %xmm4 + movsd 24 * SIZE(BB), %xmm3 + shufps $0, %xmm0, %xmm0 + movhps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movss 8 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movsd 48 * SIZE(BB), %xmm3 + shufps $0, %xmm1, %xmm1 + movhps 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movss 5 * SIZE(AA), %xmm1 + addps %xmm2, %xmm4 + movsd 40 * SIZE(BB), %xmm2 + shufps $0, %xmm1, %xmm1 + movhps 44 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movss 6 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movsd 64 * SIZE(BB), %xmm2 + shufps $0, %xmm1, %xmm1 + movhps 52 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movss 7 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movsd 56 * SIZE(BB), %xmm3 + shufps $0, %xmm1, %xmm1 + movhps 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movss 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsd 80 * SIZE(BB), %xmm3 + + addl $ 8 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm7 + andl $7, %eax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + shufps $0, %xmm0, %xmm0 + movhps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movss 1 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movsd 8 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L36 + ALIGN_4 + +.L38: + leal (LDC, LDC, 2), %eax + + addps %xmm5, %xmm4 + mulps %xmm7, %xmm4 + + movhlps %xmm4, %xmm5 + +#ifndef TRMMKERNEL + movss 0 * SIZE(%esi), %xmm0 + movss 0 * SIZE(%esi, LDC, 1), %xmm1 + movss 0 * SIZE(%esi, LDC, 2), %xmm2 + movss 0 * SIZE(%esi, %eax, 1), %xmm3 + + addss %xmm4, %xmm0 + psrlq $32, %xmm4 + addss %xmm4, %xmm1 + addss %xmm5, %xmm2 + psrlq $32, %xmm5 + addss %xmm5, %xmm3 + + movss %xmm0, 0 * SIZE(%esi) + movss %xmm1, 0 * SIZE(%esi, LDC, 1) + movss %xmm2, 0 * SIZE(%esi, LDC, 2) + movss %xmm3 , 0 * SIZE(%esi, %eax, 1) +#else + movss %xmm4, 0 * SIZE(%esi) + psrlq $32, %xmm4 + movss %xmm4, 0 * SIZE(%esi, LDC, 1) + + movss %xmm5, 0 * SIZE(%esi, LDC, 2) + psrlq $32, %xmm5 + movss %xmm5 , 0 * SIZE(%esi, %eax, 1) +#endif + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_4 + 
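
[Note] Each branch of the four-column block above (.L11 for four rows of C, .L20 for two, .L30 for one) follows the same pattern: accumulate partial sums over K from the packed A panel and the duplicated B buffer, scale by ALPHA, and, unless TRMMKERNEL is defined, add the result into the C tile. A plain-C sketch of the 4x4 tile handled at .L11/.L18, assuming unit-stride packed panels and a hypothetical helper name, is:

    /* Illustrative C equivalent of the 4x4 micro-tile (non-TRMM path).
     * a_panel holds four consecutive rows of A per k step, b_panel is
     * the duplicated buffer written by the copy loop, ldc is the leading
     * dimension of C in elements. */
    void tile_4x4(int k, float alpha, const float *a_panel,
                  const float *b_panel, float *c, int ldc)
    {
        float acc[4][4] = {{0.0f}};            /* acc[j][i]: column j, row i */

        for (int p = 0; p < k; p++) {
            const float *a = a_panel + 4 * p;  /* A(row 0..3, p)             */
            const float *b = b_panel + 8 * p;  /* {b0,b1,b0,b1,b2,b3,b2,b3}  */
            float bj[4] = { b[0], b[1], b[4], b[5] };

            for (int j = 0; j < 4; j++)        /* xmm4..xmm7 in the assembly */
                for (int i = 0; i < 4; i++)
                    acc[j][i] += a[i] * bj[j];
        }

        for (int j = 0; j < 4; j++)            /* .L18: C += alpha * acc     */
            for (int i = 0; i < 4; i++)
                c[i + j * ldc] += alpha * acc[j][i];
    }

Keeping one accumulator register per column (xmm4-xmm7) across the unrolled KERNEL1-KERNEL8 macros is what the loop above collapses into its inner j/i nest.
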
+.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + + leal (, LDC, 4), %eax + addl %eax, C # c += 4 * ldc + decl J # j -- + jg .L01 + ALIGN_4 + +.L40: + testl $2, N + je .L80 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + leal BUFFER, %ecx + sarl $3, %eax + jle .L45 + ALIGN_4 + +.L42: + movddup 0 * SIZE(%edi), %xmm0 + movddup 2 * SIZE(%edi), %xmm1 + movddup 4 * SIZE(%edi), %xmm2 + movddup 6 * SIZE(%edi), %xmm3 + movddup 8 * SIZE(%edi), %xmm4 + movddup 10 * SIZE(%edi), %xmm5 + movddup 12 * SIZE(%edi), %xmm6 + movddup 14 * SIZE(%edi), %xmm7 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + movaps %xmm2, 8 * SIZE(%ecx) + movaps %xmm3, 12 * SIZE(%ecx) + movaps %xmm4, 16 * SIZE(%ecx) + movaps %xmm5, 20 * SIZE(%ecx) + movaps %xmm6, 24 * SIZE(%ecx) + movaps %xmm7, 28 * SIZE(%ecx) + +# prefetcht1 128 * SIZE(%ecx) + prefetcht0 112 * SIZE(%edi) + + addl $16 * SIZE, %edi + addl $32 * SIZE, %ecx + decl %eax + jne .L42 + ALIGN_4 + +.L45: + movl K, %eax + andl $7, %eax + BRANCH + jle .L50 + ALIGN_4 + +.L46: + movddup 0 * SIZE(%edi), %xmm0 + movaps %xmm0, 0 * SIZE(%ecx) + + addl $2 * SIZE, %edi + addl $4 * SIZE, %ecx + decl %eax + jne .L46 + ALIGN_4 + +.L50: + movl C, %esi # coffset = c + movl A, %edx # aoffset = a + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + + movaps 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps 16 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movsldup 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movsldup 16 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + + prefetcht2 4 * SIZE(%esi) + prefetcht2 4 * SIZE(%esi, LDC) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + movshdup 0 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsldup 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movshdup 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 8 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsldup 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movshdup 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 12 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsldup 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movshdup 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 32 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsldup 32 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movshdup 16 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 20 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsldup 20 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movshdup 20 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsldup 24 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movshdup 24 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 28 * SIZE(AA), %xmm1 + 
addps %xmm3, %xmm5 + movsldup 28 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movshdup 28 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 48 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsldup 48 * SIZE(BB), %xmm3 + + addl $32 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L52 + ALIGN_4 + +.L55: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movshdup 0 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsldup 4 * SIZE(BB), %xmm2 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + movsd 0 * SIZE(%esi, LDC, 1), %xmm1 + movhps 2 * SIZE(%esi, LDC, 1), %xmm1 + + mulps %xmm3, %xmm4 + mulps %xmm3, %xmm5 + addps %xmm0, %xmm4 + addps %xmm1, %xmm5 +#else + mulps %xmm3, %xmm4 + mulps %xmm3, %xmm5 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movhps %xmm4, 2 * SIZE(%esi) + movsd %xmm5, 0 * SIZE(%esi, LDC, 1) + movhps %xmm5, 2 * SIZE(%esi, LDC, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $4, KK +#endif + + addl $4 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L51 + ALIGN_4 + +.L60: + testl $2, M + je .L70 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + + movddup 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movddup 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movsd 0 * SIZE(BB), %xmm2 + movsd 16 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L65 + ALIGN_4 + +.L62: + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + movddup 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + movddup 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 8 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + movddup 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movsd 12 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + movddup 16 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 32 * SIZE(BB), %xmm2 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm1, %xmm3 + movddup 10 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movsd 20 * SIZE(BB), %xmm3 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm1, %xmm3 + movddup 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsd 24 * SIZE(BB), %xmm3 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm1, %xmm3 + movddup 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movsd 28 * SIZE(BB), %xmm3 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm1, %xmm3 + movddup 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsd 48 * SIZE(BB), %xmm3 + + addl $16 * 
SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L62 + ALIGN_4 + +.L65: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + movddup 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L66 + ALIGN_4 + +.L68: + addps %xmm5, %xmm4 + mulps %xmm3, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + movhps 0 * SIZE(%esi, LDC, 1), %xmm0 + + addps %xmm0, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movhps %xmm4, 0 * SIZE(%esi, LDC, 1) + addl $2 * SIZE, %esi # coffset += 2 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + ALIGN_4 + +.L70: + testl $1, M + je .L79 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + + movss 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movss 4 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movsd 0 * SIZE(BB), %xmm2 + movsd 16 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + shufps $0, %xmm0, %xmm0 + mulps %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + movss 1 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + shufps $0, %xmm0, %xmm0 + movsd 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movss 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + shufps $0, %xmm0, %xmm0 + movsd 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movss 3 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + shufps $0, %xmm0, %xmm0 + movsd 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movss 8 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 32 * SIZE(BB), %xmm2 + shufps $0, %xmm1, %xmm1 + mulps %xmm1, %xmm3 + movss 5 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + shufps $0, %xmm1, %xmm1 + movsd 20 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movss 6 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + shufps $0, %xmm1, %xmm1 + movsd 24 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movss 7 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + shufps $0, %xmm1, %xmm1 + movsd 28 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movss 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsd 48 * SIZE(BB), %xmm3 + + addl $ 8 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L72 + ALIGN_4 + +.L75: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + shufps $0, %xmm0, %xmm0 + mulps %xmm0, %xmm2 + movss 1 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + + addl $ 1 * SIZE, AA + addl $ 4 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: + addps %xmm5, %xmm4 + mulps %xmm3, %xmm4 + +#ifndef TRMMKERNEL + movss 0 * SIZE(%esi), %xmm0 + movss 0 * SIZE(%esi, LDC, 1), %xmm1 + + 
addss %xmm4, %xmm0 + psrlq $32, %xmm4 + addss %xmm4, %xmm1 + + movss %xmm0, 0 * SIZE(%esi) + movss %xmm1, 0 * SIZE(%esi, LDC, 1) +#else + movss %xmm4, 0 * SIZE(%esi) + psrlq $32, %xmm4 + movss %xmm4, 0 * SIZE(%esi, LDC, 1) +#endif + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_4 + +.L79: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + leal (, LDC, 2), %eax + addl %eax, C + ALIGN_4 + +.L80: + testl $1, N + je .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + leal BUFFER, %ecx + sarl $3, %eax + jle .L85 + ALIGN_4 + +.L82: + movss 0 * SIZE(%edi), %xmm0 + movss 1 * SIZE(%edi), %xmm1 + movss 2 * SIZE(%edi), %xmm2 + movss 3 * SIZE(%edi), %xmm3 + movss 4 * SIZE(%edi), %xmm4 + movss 5 * SIZE(%edi), %xmm5 + movss 6 * SIZE(%edi), %xmm6 + movss 7 * SIZE(%edi), %xmm7 + + movss %xmm0, 0 * SIZE(%ecx) + movss %xmm0, 1 * SIZE(%ecx) + movss %xmm1, 2 * SIZE(%ecx) + movss %xmm1, 3 * SIZE(%ecx) + movss %xmm2, 4 * SIZE(%ecx) + movss %xmm2, 5 * SIZE(%ecx) + movss %xmm3, 6 * SIZE(%ecx) + movss %xmm3, 7 * SIZE(%ecx) + movss %xmm4, 8 * SIZE(%ecx) + movss %xmm4, 9 * SIZE(%ecx) + movss %xmm5, 10 * SIZE(%ecx) + movss %xmm5, 11 * SIZE(%ecx) + movss %xmm6, 12 * SIZE(%ecx) + movss %xmm6, 13 * SIZE(%ecx) + movss %xmm7, 14 * SIZE(%ecx) + movss %xmm7, 15 * SIZE(%ecx) + +# prefetcht1 128 * SIZE(%ecx) + prefetcht0 112 * SIZE(%edi) + + addl $ 8 * SIZE, %edi + addl $16 * SIZE, %ecx + decl %eax + jne .L82 + ALIGN_4 + +.L85: + movl K, %eax + andl $7, %eax + BRANCH + jle .L90 + ALIGN_4 + +.L86: + movss 0 * SIZE(%edi), %xmm0 + movss %xmm0, 0 * SIZE(%ecx) + movss %xmm0, 1 * SIZE(%ecx) + + addl $1 * SIZE, %edi + addl $2 * SIZE, %ecx + decl %eax + jne .L86 + ALIGN_4 + +.L90: + movl C, %esi # coffset = c + movl A, %edx # aoffset = a + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L100 + ALIGN_4 + +.L91: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 1), BB +#endif + + movaps 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movddup 0 * SIZE(BB), %xmm2 + pxor %xmm5, %xmm5 + movaps 16 * SIZE(AA), %xmm1 + movddup 8 * SIZE(BB), %xmm3 + +#ifdef HAVE_3DNOW + prefetchw 4 * SIZE(%esi) +#elif defined(HAVE_SSE) || defined(HAVE_SSE2) + prefetcht2 4 * SIZE(%esi) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L95 + ALIGN_4 + +.L92: + mulps %xmm0, %xmm2 + movaps 4 * SIZE(AA), %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + addps %xmm2, %xmm4 + movddup 2 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 8 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movddup 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 12 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movddup 6 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 32 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movddup 16 * SIZE(BB), %xmm2 
+ mulps %xmm1, %xmm3 + movaps 20 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movddup 10 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movddup 12 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 28 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movddup 14 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 48 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movddup 24 * SIZE(BB), %xmm3 + + addl $32 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L92 + ALIGN_4 + +.L95: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L98 + ALIGN_4 + +.L96: + mulps %xmm0, %xmm2 + movaps 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movddup 2 * SIZE(BB), %xmm2 + + addl $4 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L96 + ALIGN_4 + +.L98: + addps %xmm5, %xmm4 + mulps %xmm3, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + + addps %xmm0, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movhps %xmm4, 2 * SIZE(%esi) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 1), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $4, KK +#endif + + addl $4 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L91 + ALIGN_4 + +.L100: + testl $2, M + je .L110 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 1), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movsd 0 * SIZE(AA), %xmm0 + movsd 0 * SIZE(BB), %xmm2 + movsd 8 * SIZE(AA), %xmm1 + movsd 8 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L105 + ALIGN_4 + +.L102: + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + addps %xmm2, %xmm4 + movsd 2 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movsd 6 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 16 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movsd 10 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsd 12 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movsd 14 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsd 24 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L102 + ALIGN_4 + +.L105: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L108 + ALIGN_4 + +.L106: + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movsd 2 * SIZE(BB), 
%xmm2 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L106 + ALIGN_4 + +.L108: + addps %xmm5, %xmm4 + movhlps %xmm4, %xmm5 + addps %xmm5, %xmm4 + + mulps %xmm3, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + + addps %xmm0, %xmm4 +#endif + movsd %xmm4, 0 * SIZE(%esi) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 1), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, %esi # coffset += 2 + ALIGN_4 + +.L110: + testl $1, M + je .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + + movss 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movss 0 * SIZE(BB), %xmm2 + pxor %xmm5, %xmm5 + movss 4 * SIZE(AA), %xmm1 + movss 8 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L115 + ALIGN_4 + +.L112: + mulss %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + movss 1 * SIZE(AA), %xmm0 + addss %xmm2, %xmm4 + movss 2 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + movss 2 * SIZE(AA), %xmm0 + addss %xmm2, %xmm5 + movss 4 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + movss 3 * SIZE(AA), %xmm0 + addss %xmm2, %xmm4 + movss 6 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + movss 8 * SIZE(AA), %xmm0 + addss %xmm2, %xmm5 + movss 16 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm3 + movss 5 * SIZE(AA), %xmm1 + addss %xmm3, %xmm4 + movss 10 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + movss 6 * SIZE(AA), %xmm1 + addss %xmm3, %xmm5 + movss 12 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + movss 7 * SIZE(AA), %xmm1 + addss %xmm3, %xmm4 + movss 14 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + movss 12 * SIZE(AA), %xmm1 + addss %xmm3, %xmm5 + movss 24 * SIZE(BB), %xmm3 + + addl $ 8 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L112 + ALIGN_4 + +.L115: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulss %xmm0, %xmm2 + movss 1 * SIZE(AA), %xmm0 + addss %xmm2, %xmm4 + movss 2 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L116 + ALIGN_4 + +.L118: + addss %xmm5, %xmm4 + mulss %xmm3, %xmm4 + +#ifndef TRMMKERNEL + movss 0 * SIZE(%esi), %xmm0 + addss %xmm0, %xmm4 +#else + mulss %xmm3, %xmm4 +#endif + movss %xmm4, 0 * SIZE(%esi) + ALIGN_4 + +.L999: + movl OLD_STACK, %esp + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/gemm_kernel_8x1_sse2.S b/kernel/x86/gemm_kernel_8x1_sse2.S new file mode 100644 index 0000000000..52a9ebc9ce --- /dev/null +++ b/kernel/x86/gemm_kernel_8x1_sse2.S @@ -0,0 +1,878 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if !defined(HAVE_SSE2) || !defined(HAVE_MMX) +#error You have to check your configuration. 
+#endif + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_ALPHA 16 + STACK + ARGS(%esi) +#define STACK_A 24 + STACK + ARGS(%esi) +#define STACK_B 28 + STACK + ARGS(%esi) +#define STACK_C 32 + STACK + ARGS(%esi) +#define STACK_LDC 36 + STACK + ARGS(%esi) + +#define ALPHA 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 36(%esp) +#define J 44(%esp) +#define OLD_STACK 48(%esp) +#define BUFFER 128(%esp) + +#define B %edi +#define LDC %ebp + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#define AA %edx +#define BB %ecx + +#define KERNELMACRO(address) \ + mulpd %xmm0, %xmm2; \ + mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 0 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 2 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 4 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 6 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 16 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm1, %xmm3; \ + mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 8 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 10 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 12 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 14 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 24 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm0, %xmm2; \ + mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 18 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 20 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 22 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 32 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 32 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm1, %xmm3; \ + mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + 
movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 26 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 28 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 30 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 40 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 40 * SIZE + (address) * SIZE(AA), %xmm1 + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movd STACK_M, %mm0 + movl STACK_N, %eax + movd STACK_K, %mm1 + movd STACK_A, %mm2 + movq STACK_ALPHA, %mm7 + movl STACK_B, B + movd STACK_C, %mm3 + movl STACK_LDC, LDC + + movq %mm7, 0 * SIZE + ALPHA + movq %mm7, 1 * SIZE + ALPHA + + movd %mm1, K + movl %eax, N + movd %mm0, M + movd %mm2, A + movd %mm3, C + movl %esi, OLD_STACK + + leal (, LDC, SIZE), LDC + + test %eax, %eax + movl %eax, J + jle .L999 + ALIGN_2 + +.L01: +/* Copying to Sub Buffer */ + movl K, %eax + leal BUFFER, %ecx + sarl $3, %eax + jle .L03 + ALIGN_4 + +.L02: + prefetchnta 96 * SIZE(B) + + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm2 + movsd 3 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm4 + movsd 5 * SIZE(B), %xmm5 + movsd 6 * SIZE(B), %xmm6 + movsd 7 * SIZE(B), %xmm7 + + unpcklpd %xmm0, %xmm0 + unpcklpd %xmm1, %xmm1 + unpcklpd %xmm2, %xmm2 + unpcklpd %xmm3, %xmm3 + unpcklpd %xmm4, %xmm4 + unpcklpd %xmm5, %xmm5 + unpcklpd %xmm6, %xmm6 + unpcklpd %xmm7, %xmm7 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) + movapd %xmm2, 4 * SIZE(%ecx) + movapd %xmm3, 6 * SIZE(%ecx) + movapd %xmm4, 8 * SIZE(%ecx) + movapd %xmm5, 10 * SIZE(%ecx) + movapd %xmm6, 12 * SIZE(%ecx) + movapd %xmm7, 14 * SIZE(%ecx) + + addl $ 8 * SIZE, B + addl $16 * SIZE, %ecx + decl %eax + BRANCH + jne .L02 + ALIGN_2 + +.L03: + movl K, %eax + andl $7, %eax + BRANCH + jle .L05 + ALIGN_2 + +.L04: + movsd 0 * SIZE(B), %xmm0 + unpcklpd %xmm0, %xmm0 + movapd %xmm0, 0 * SIZE(%ecx) + + addl $1 * SIZE, B + addl $2 * SIZE, %ecx + decl %eax + jne .L04 + ALIGN_4 + +.L05: + movl C, %esi # coffset = c + movl A, %edx # aoffset = a + movl M, %ebx + sarl $3, %ebx # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L10: + leal BUFFER, %ecx # boffset1 = boffset // different point + movl K, %eax + + movapd 0 * SIZE + BUFFER, %xmm2 + movapd 0 * SIZE(%edx), %xmm0 + movapd 8 * SIZE + BUFFER, %xmm3 + movapd 8 * SIZE(%edx), %xmm1 + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if 0 + andl $-8, %eax + leal (, %eax, 8), %eax + je .L12 + + KERNELMACRO(32 * 0) # 0 + cmpl $64 * 1, %eax + jle .L11 + KERNELMACRO(32 * 1) # 1 + cmpl $64 * 2, %eax + jle .L11 + KERNELMACRO(32 * 2) # 2 + cmpl $64 * 3, %eax + jle .L11 + KERNELMACRO(32 * 3) # 3 + cmpl $64 * 4, %eax + jle .L11 + KERNELMACRO(32 * 4) # 4 + cmpl $64 * 5, %eax + jle .L11 + KERNELMACRO(32 * 5) # 5 + cmpl $64 * 6, %eax + jle .L11 + KERNELMACRO(32 * 6) # 6 + cmpl $64 * 7, %eax + jle .L11 + KERNELMACRO(32 * 7) # 7 + cmpl $64 
* 8, %eax + jle .L11 + KERNELMACRO(32 * 8) # 8 + cmpl $64 * 9, %eax + jle .L11 + KERNELMACRO(32 * 9) # 9 + cmpl $64 * 10, %eax + jle .L11 + KERNELMACRO(32 * 10) # 10 + cmpl $64 * 11, %eax + jle .L11 + KERNELMACRO(32 * 11) # 11 + cmpl $64 * 12, %eax + jle .L11 + KERNELMACRO(32 * 12) # 12 + cmpl $64 * 13, %eax + jle .L11 + KERNELMACRO(32 * 13) # 13 + cmpl $64 * 14, %eax + jle .L11 + KERNELMACRO(32 * 14) # 14 + cmpl $64 * 15, %eax + jle .L11 + movq 1 * SIZE(%esi), %mm0 + movq 1 * SIZE(%esi, LDC), %mm1 + KERNELMACRO(32 * 15) # 15 +.L11: + leal (%edx, %eax, 4), %edx + leal (%ecx, %eax, 4), %ecx + +#else + movapd 0 * SIZE(BB), %xmm0 + movapd 8 * SIZE(BB), %xmm2 + movapd 0 * SIZE(AA), %xmm1 + movapd 8 * SIZE(AA), %xmm3 + + prefetchnta 8 * SIZE(%esi) + + sarl $3, %eax + je .L12 + +#define PRE 40 + +.L11: + mulpd %xmm0, %xmm1 + movd (PRE + 0) * SIZE(AA), %mm0 + addpd %xmm1, %xmm4 + movapd 2 * SIZE(AA), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm5 + movapd 4 * SIZE(AA), %xmm1 + mulpd %xmm0, %xmm1 + mulpd 6 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm6 + movapd 16 * SIZE(AA), %xmm1 + movd (PRE + 8) * SIZE(AA), %mm0 + addpd %xmm0, %xmm7 + movapd 2 * SIZE(BB), %xmm0 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm4 + movapd 10 * SIZE(AA), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm5 + movapd 12 * SIZE(AA), %xmm3 + mulpd %xmm0, %xmm3 + mulpd 14 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm6 + movapd 24 * SIZE(AA), %xmm3 + movd (PRE + 16) * SIZE(AA), %mm0 + addpd %xmm0, %xmm7 + movapd 4 * SIZE(BB), %xmm0 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movapd 18 * SIZE(AA), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm5 + movapd 20 * SIZE(AA), %xmm1 + mulpd %xmm0, %xmm1 + mulpd 22 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm6 + movapd 32 * SIZE(AA), %xmm1 + movd (PRE + 24) * SIZE(AA), %mm0 + addpd %xmm0, %xmm7 + movapd 6 * SIZE(BB), %xmm0 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm4 + movapd 26 * SIZE(AA), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm5 + movapd 28 * SIZE(AA), %xmm3 + mulpd %xmm0, %xmm3 + mulpd 30 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm6 + movapd 40 * SIZE(AA), %xmm3 + movd (PRE + 32) * SIZE(AA), %mm0 + addpd %xmm0, %xmm7 + movapd 16 * SIZE(BB), %xmm0 + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm4 + movapd 34 * SIZE(AA), %xmm1 + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm5 + movapd 36 * SIZE(AA), %xmm1 + mulpd %xmm2, %xmm1 + mulpd 38 * SIZE(AA), %xmm2 + addpd %xmm1, %xmm6 + movapd 48 * SIZE(AA), %xmm1 + movd (PRE + 40) * SIZE(AA), %mm0 + addpd %xmm2, %xmm7 + movapd 10 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm4 + movapd 42 * SIZE(AA), %xmm3 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm5 + movapd 44 * SIZE(AA), %xmm3 + mulpd %xmm2, %xmm3 + mulpd 46 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm6 + movapd 56 * SIZE(AA), %xmm3 + movd (PRE + 48) * SIZE(AA), %mm0 + addpd %xmm2, %xmm7 + movapd 12 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm4 + movapd 50 * SIZE(AA), %xmm1 + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm5 + movapd 52 * SIZE(AA), %xmm1 + mulpd %xmm2, %xmm1 + mulpd 54 * SIZE(AA), %xmm2 + addpd %xmm1, %xmm6 + movapd 64 * SIZE(AA), %xmm1 + movd (PRE + 56) * SIZE(AA), %mm0 + addpd %xmm2, %xmm7 + movapd 14 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm4 + movapd 58 * SIZE(AA), %xmm3 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm5 + movapd 60 * SIZE(AA), %xmm3 + mulpd %xmm2, %xmm3 + mulpd 62 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm6 + movapd 72 * SIZE(AA), %xmm3 + addpd %xmm2, %xmm7 + movapd 24 * SIZE(BB), %xmm2 + + addl $64 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L11 +#endif + +.L12: + movapd ALPHA, 
%xmm3 + movl K, %eax + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + +.L13: + movapd 0 * SIZE(BB), %xmm0 + movapd 0 * SIZE(AA), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movapd 2 * SIZE(AA), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm5 + movapd 4 * SIZE(AA), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm6 + mulpd 6 * SIZE(AA), %xmm0 + addpd %xmm0, %xmm7 + + addl $8 * SIZE, AA # aoffset += 8 + addl $2 * SIZE, BB # boffset1 += 8 + subl $1, %eax + jg .L13 + ALIGN_4 + +.L14: + mulpd %xmm3, %xmm4 + mulpd %xmm3, %xmm5 + mulpd %xmm3, %xmm6 + mulpd %xmm3, %xmm7 + + movsd 0 * SIZE(%esi), %xmm0 + movhpd 1 * SIZE(%esi), %xmm0 + movsd 2 * SIZE(%esi), %xmm1 + movhpd 3 * SIZE(%esi), %xmm1 + movsd 4 * SIZE(%esi), %xmm2 + movhpd 5 * SIZE(%esi), %xmm2 + movsd 6 * SIZE(%esi), %xmm3 + movhpd 7 * SIZE(%esi), %xmm3 + + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm5 + addpd %xmm2, %xmm6 + addpd %xmm3, %xmm7 + + movsd %xmm4, 0 * SIZE(%esi) + movhpd %xmm4, 1 * SIZE(%esi) + movsd %xmm5, 2 * SIZE(%esi) + movhpd %xmm5, 3 * SIZE(%esi) + movsd %xmm6, 4 * SIZE(%esi) + movhpd %xmm6, 5 * SIZE(%esi) + movsd %xmm7, 6 * SIZE(%esi) + movhpd %xmm7, 7 * SIZE(%esi) + + addl $8 * SIZE, %esi # coffset += 4 + BRANCH + decl %ebx # i -- + jg .L10 + ALIGN_2 + +.L20: + movl M, %ebx + testl $4, %ebx + jle .L30 + + leal BUFFER, %ecx + movl K, %eax + + movapd 0 * SIZE + BUFFER, %xmm2 + movapd 0 * SIZE(%edx), %xmm0 + movapd 8 * SIZE + BUFFER, %xmm3 + movapd 8 * SIZE(%edx), %xmm1 + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + sarl $3, %eax + je .L22 + +.L21: + movapd 0 * SIZE(BB), %xmm0 + movapd 0 * SIZE(AA), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + mulpd 2 * SIZE(AA), %xmm0 + addpd %xmm0, %xmm5 + + movapd 2 * SIZE(BB), %xmm0 + movapd 4 * SIZE(AA), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + mulpd 6 * SIZE(AA), %xmm0 + addpd %xmm0, %xmm5 + + movapd 4 * SIZE(BB), %xmm0 + movapd 8 * SIZE(AA), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + mulpd 10 * SIZE(AA), %xmm0 + addpd %xmm0, %xmm5 + + movapd 6 * SIZE(BB), %xmm0 + movapd 12 * SIZE(AA), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + mulpd 14 * SIZE(AA), %xmm0 + addpd %xmm0, %xmm5 + + movapd 8 * SIZE(BB), %xmm0 + movapd 16 * SIZE(AA), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + mulpd 18 * SIZE(AA), %xmm0 + addpd %xmm0, %xmm5 + + movapd 10 * SIZE(BB), %xmm0 + movapd 20 * SIZE(AA), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + mulpd 22 * SIZE(AA), %xmm0 + addpd %xmm0, %xmm5 + + movapd 12 * SIZE(BB), %xmm0 + movapd 24 * SIZE(AA), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + mulpd 26 * SIZE(AA), %xmm0 + addpd %xmm0, %xmm5 + + movapd 14 * SIZE(BB), %xmm0 + movapd 28 * SIZE(AA), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + mulpd 30 * SIZE(AA), %xmm0 + addpd %xmm0, %xmm5 + + addl $32 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L21 + +.L22: + movapd ALPHA, %xmm3 + movl K, %eax + andl $7, %eax + BRANCH + je .L24 + +.L23: + movapd 0 * SIZE(BB), %xmm0 + movapd 0 * SIZE(AA), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + mulpd 2 * SIZE(AA), %xmm0 + addpd %xmm0, %xmm5 + + addl $4 * SIZE, AA # aoffset += 8 + addl $2 * SIZE, BB # boffset1 += 8 + subl $1, %eax + jg .L23 + ALIGN_4 + +.L24: + mulpd %xmm3, %xmm4 + mulpd %xmm3, %xmm5 + + movsd 0 * SIZE(%esi), %xmm0 + movhpd 1 * SIZE(%esi), %xmm0 + movsd 2 * SIZE(%esi), %xmm1 + movhpd 3 * SIZE(%esi), %xmm1 + + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm5 + + movsd %xmm4, 0 * SIZE(%esi) + movhpd %xmm4, 1 * SIZE(%esi) + movsd %xmm5, 2 * SIZE(%esi) + movhpd %xmm5, 3 * 
SIZE(%esi) + addl $4 * SIZE, %esi # coffset += 4 + ALIGN_4 + +.L30: + movl M, %ebx + testl $2, %ebx + jle .L50 + + leal BUFFER, %ecx + movl K, %eax + + movapd 0 * SIZE + BUFFER, %xmm2 + movapd 0 * SIZE(AA), %xmm0 + movapd 8 * SIZE + BUFFER, %xmm3 + movapd 8 * SIZE(AA), %xmm1 + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + sarl $3, %eax + je .L32 + +.L31: + movapd 0 * SIZE(BB), %xmm0 + movapd 0 * SIZE(AA), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + + movapd 2 * SIZE(BB), %xmm0 + movapd 2 * SIZE(AA), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + + movapd 4 * SIZE(BB), %xmm0 + movapd 4 * SIZE(AA), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + + movapd 6 * SIZE(BB), %xmm0 + movapd 6 * SIZE(AA), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + + movapd 8 * SIZE(BB), %xmm0 + movapd 8 * SIZE(AA), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + + movapd 10 * SIZE(BB), %xmm0 + movapd 10 * SIZE(AA), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + + movapd 12 * SIZE(BB), %xmm0 + movapd 12 * SIZE(AA), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + + movapd 14 * SIZE(BB), %xmm0 + movapd 14 * SIZE(AA), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + + addl $16 * SIZE, AA + addl $16 * SIZE, BB + BRANCH + decl %eax + jne .L31 + +.L32: + movapd ALPHA, %xmm3 + movl K, %eax + andl $7, %eax # if (k & 1) + BRANCH + je .L34 + +.L33: + movapd 0 * SIZE(BB), %xmm0 + movapd 0 * SIZE(AA), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + + addl $2 * SIZE, AA # aoffset += 8 + addl $2 * SIZE, BB # boffset1 += 8 + decl %eax + BRANCH + jg .L33 + ALIGN_4 + +.L34: + mulpd %xmm3, %xmm4 + + movsd 0 * SIZE(%esi), %xmm0 + movhpd 1 * SIZE(%esi), %xmm0 + + addpd %xmm0, %xmm4 + + movsd %xmm4, 0 * SIZE(%esi) + movhpd %xmm4, 1 * SIZE(%esi) + addl $2 * SIZE, %esi + ALIGN_2 + +.L50: + movl M, %ebx + testl $1, %ebx + jle .L99 + + leal BUFFER, %ecx + movl K, %eax + + movsd 0 * SIZE + BUFFER, %xmm2 + movsd 0 * SIZE(AA), %xmm0 + movsd 8 * SIZE + BUFFER, %xmm3 + movsd 4 * SIZE(AA), %xmm1 + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + sarl $3, %eax + je .L52 + +.L51: + movsd 0 * SIZE(AA), %xmm0 + mulsd 0 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm4 + + movsd 1 * SIZE(AA), %xmm0 + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm4 + + movsd 2 * SIZE(AA), %xmm0 + mulsd 4 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm4 + + movsd 3 * SIZE(AA), %xmm0 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm4 + + movsd 4 * SIZE(AA), %xmm0 + mulsd 8 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm4 + + movsd 5 * SIZE(AA), %xmm0 + mulsd 10 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm4 + + movsd 6 * SIZE(AA), %xmm0 + mulsd 12 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm4 + + movsd 7 * SIZE(AA), %xmm0 + mulsd 14 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm4 + + addl $ 8 * SIZE, AA + addl $16 * SIZE, BB + BRANCH + decl %eax + jne .L51 + +.L52: + movsd ALPHA, %xmm3 + movl K, %eax + andl $7, %eax # if (k & 1) + BRANCH + je .L54 + +.L53: + movsd 0 * SIZE(AA), %xmm0 + mulsd 0 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm4 + + addl $1 * SIZE, AA # aoffset += 8 + addl $2 * SIZE, BB # boffset1 += 8 + decl %eax + BRANCH + jg .L53 + ALIGN_4 + +.L54: + movsd 0 * SIZE(%esi), %xmm0 + mulsd %xmm3, %xmm4 + addsd %xmm0, %xmm4 + movsd %xmm4, 0 * SIZE(%esi) + ALIGN_2 + +.L99: + addl LDC, C + decl J # j -- + jg .L01 + ALIGN_2 + +.L999: + movl OLD_STACK, %esp + + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_2 + + + EPILOGUE diff --git a/kernel/x86/gemm_kernel_8x2_core2.S b/kernel/x86/gemm_kernel_8x2_core2.S new 
file mode 100644 index 0000000000..3fd8c566d8 --- /dev/null +++ b/kernel/x86/gemm_kernel_8x2_core2.S @@ -0,0 +1,1622 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_ALPHA 16 + STACK + ARGS(%esi) +#define STACK_A 20 + STACK + ARGS(%esi) +#define STACK_B 24 + STACK + ARGS(%esi) +#define STACK_C 28 + STACK + ARGS(%esi) +#define STACK_LDC 32 + STACK + ARGS(%esi) +#define STACK_OFFT 36 + STACK + ARGS(%esi) + +#define ALPHA 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define BUFFER 512(%esp) + +#define PREFETCH_R (8 * 16 + 0) +#define PREFETCH_W (PREFETCH_R * 2) + +#define PREFETCHSIZE (8 * 16 + 4) +#define PREFETCH prefetcht0 + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi +#define C1 %esi +#define I %ebx + + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl %esp, %esi # save old stack + + subl $512 + LOCAL_BUFFER_SIZE, %esp + andl $-4096, %esp # align stack + + STACK_TOUCHING + + movl STACK_M, %ebx + movl STACK_N, %eax + movl STACK_K, %ecx + movl STACK_A, %edx + movss STACK_ALPHA, %xmm3 +#ifdef TRMMKERNEL + movd STACK_OFFT, %mm4 +#endif + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK +#ifdef TRMMKERNEL + movd %mm4, OFFSET + movd %mm4, KK +#ifndef LEFT + negl KK +#endif +#endif + + shufps $0, %xmm3, %xmm3 + + movl STACK_B, B + movl STACK_C, %ebx + + movaps %xmm3, ALPHA + movl %ebx, C + movl STACK_LDC, LDC + + subl $-32 * SIZE, A + subl $-32 * SIZE, B + + leal (, LDC, SIZE), LDC + + sarl $1, %eax + movl %eax, J + jle .L50 + ALIGN_4 + +.L01: + leal 32 * SIZE + BUFFER, BB + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + sarl $2, %eax + jle .L05 + ALIGN_4 + +.L02: + prefetcht0 (PREFETCH_R + 0) * SIZE(B) + movss -32 * SIZE(B), %xmm0 + movss -31 * SIZE(B), %xmm1 + movss -30 * SIZE(B), %xmm2 + movss -29 * SIZE(B), %xmm3 + movss -28 * SIZE(B), %xmm4 + movss -27 * SIZE(B), %xmm5 + movss -26 * SIZE(B), %xmm6 + movss -25 * SIZE(B), %xmm7 + + prefetcht0 (PREFETCH_W + 0) * SIZE(BB) + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + shufps $0, %xmm4, %xmm4 + shufps $0, %xmm5, %xmm5 + shufps $0, %xmm6, %xmm6 + shufps $0, %xmm7, %xmm7 + + prefetcht0 (PREFETCH_W + 16) * SIZE(BB) + movaps %xmm0, -32 * SIZE(BB) + movaps %xmm1, -28 * SIZE(BB) + movaps %xmm2, -24 * SIZE(BB) + movaps %xmm3, -20 * SIZE(BB) + movaps %xmm4, -16 * SIZE(BB) + movaps %xmm5, -12 * SIZE(BB) + movaps %xmm6, -8 * SIZE(BB) + movaps %xmm7, -4 * SIZE(BB) + + addl $ 8 * SIZE, B + subl $-32 * SIZE, BB + decl %eax + jne .L02 + ALIGN_4 + +.L05: + movl K, %eax + andl $3, %eax + BRANCH + jle .L10 + ALIGN_4 + +.L06: + movss -32 * SIZE(B), %xmm0 + movss -31 * SIZE(B), %xmm1 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + + movaps %xmm0, -32 * SIZE(BB) + movaps %xmm1, -28 * SIZE(BB) + addl $2 * SIZE, B + addl $8 * SIZE, BB + decl %eax + jne .L06 + ALIGN_4 + +.L10: + movl C, C1 + movl A, AA + movl M, I + sarl $3, I + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leal 32 * SIZE + BUFFER, BB +#else + 
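/* TRMM path: presumably to start this 8x2 tile at the correct diagonal offset,
   A is advanced by KK*8 elements and the broadcast B buffer by KK*2*4 elements
   below, i.e. the first KK k-iterations are skipped before the multiply loop
   (KK is the offset bookkeeping loaded from STACK_OFFT above). */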
leal 32 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB /* because it's doubled */ +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -16 * SIZE(AA), %xmm3 + pxor %xmm6, %xmm6 + prefetcht0 7 * SIZE(C1) + pxor %xmm7, %xmm7 + prefetcht0 7 * SIZE(C1, LDC) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $8, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + movaps %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm0 + addps %xmm0, %xmm5 + movaps -28 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm1 + movaps -24 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + addps %xmm1, %xmm7 + + movaps -24 * SIZE(BB), %xmm1 + movaps %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm4 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm0 + addps %xmm0, %xmm5 + movaps -20 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm1 + movaps 0 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + addps %xmm1, %xmm7 + + movaps -16 * SIZE(BB), %xmm1 + movaps %xmm1, %xmm2 + mulps %xmm3, %xmm1 + addps %xmm1, %xmm4 + movaps -12 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movaps -12 * SIZE(AA), %xmm3 + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm1 + movaps -8 * SIZE(AA), %xmm3 + addps %xmm2, %xmm6 + addps %xmm1, %xmm7 + + movaps -8 * SIZE(BB), %xmm1 + movaps %xmm1, %xmm2 + mulps %xmm3, %xmm1 + addps %xmm1, %xmm4 + movaps -4 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movaps -4 * SIZE(AA), %xmm3 + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm1 + movaps 16 * SIZE(AA), %xmm3 + addps %xmm2, %xmm6 + addps %xmm1, %xmm7 + movaps 0 * SIZE(BB), %xmm1 + + movaps %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm4 + movaps 4 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm0 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm1 + movaps 8 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + addps %xmm1, %xmm7 + + movaps 8 * SIZE(BB), %xmm1 + movaps %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm4 + movaps 12 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm0 + addps %xmm0, %xmm5 + movaps 12 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm1 + movaps 32 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + addps %xmm1, %xmm7 + + movaps 16 * SIZE(BB), %xmm1 + movaps %xmm1, %xmm2 + mulps %xmm3, %xmm1 + addps %xmm1, %xmm4 + movaps 20 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movaps 20 * SIZE(AA), %xmm3 + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm1 + addps %xmm2, %xmm6 + movaps 24 * SIZE(AA), %xmm3 + addps %xmm1, %xmm7 + + movaps 24 * SIZE(BB), %xmm1 + movaps %xmm1, %xmm2 + mulps %xmm3, %xmm1 + addps %xmm1, %xmm4 + movaps 28 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movaps 28 * SIZE(AA), %xmm3 + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm1 + subl $-64 * SIZE, BB + movaps 48 * SIZE(AA), %xmm3 + subl $-64 * SIZE, AA + addps %xmm2, %xmm6 + addps %xmm1, %xmm7 + movaps -32 * SIZE(BB), %xmm1 + + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L18 + ALIGN_4 + +.L16: + movaps %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm0 + addps %xmm0, %xmm5 + 
movaps -28 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm1 + movaps -24 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + addps %xmm1, %xmm7 + movaps -24 * SIZE(BB), %xmm1 + + addl $8 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + movaps ALPHA, %xmm3 + + mulps %xmm3, %xmm4 + mulps %xmm3, %xmm5 + mulps %xmm3, %xmm6 + mulps %xmm3, %xmm7 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + movhps 2 * SIZE(C1), %xmm0 + movsd 4 * SIZE(C1), %xmm2 + movhps 6 * SIZE(C1), %xmm2 + + movsd 0 * SIZE(C1, LDC), %xmm1 + movhps 2 * SIZE(C1, LDC), %xmm1 + movsd 4 * SIZE(C1, LDC), %xmm3 + movhps 6 * SIZE(C1, LDC), %xmm3 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm5 + addps %xmm2, %xmm6 + addps %xmm3, %xmm7 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhps %xmm4, 2 * SIZE(C1) + movsd %xmm6, 4 * SIZE(C1) + movhps %xmm6, 6 * SIZE(C1) + + movsd %xmm5, 0 * SIZE(C1, LDC) + movhps %xmm5, 2 * SIZE(C1, LDC) + movsd %xmm7, 4 * SIZE(C1, LDC) + movhps %xmm7, 6 * SIZE(C1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $8, KK +#endif + + addl $8 * SIZE, C1 + decl I + jg .L11 + ALIGN_4 + +.L20: + movl M, I + testl $4, I + jle .L30 + +.L21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leal 32 * SIZE + BUFFER, BB +#else + leal 32 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB /* because it's doubled */ +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movaps -16 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movaps -16 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + mulps %xmm0, %xmm1 + mulps -28 * SIZE(BB), %xmm0 + addps %xmm1, %xmm4 + movaps -24 * SIZE(BB), %xmm1 + addps %xmm0, %xmm5 + movaps -28 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm1 + mulps -20 * SIZE(BB), %xmm0 + addps %xmm1, %xmm6 + movaps 0 * SIZE(BB), %xmm1 + addps %xmm0, %xmm7 + movaps -24 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps -12 * SIZE(BB), %xmm0 + addps %xmm3, %xmm4 + movaps -8 * SIZE(BB), %xmm3 + addps %xmm0, %xmm5 + movaps -20 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps -4 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 16 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movaps 0 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + mulps 4 * SIZE(BB), %xmm2 + addps %xmm1, %xmm4 + movaps 8 * SIZE(BB), %xmm1 + addps %xmm2, %xmm5 + movaps -12 * SIZE(AA), %xmm2 + mulps %xmm2, %xmm1 + mulps 12 * SIZE(BB), %xmm2 + addps %xmm1, %xmm6 + movaps 32 * SIZE(BB), %xmm1 + addps %xmm2, %xmm7 + movaps -8 * SIZE(AA), %xmm2 + mulps %xmm2, %xmm3 + mulps 20 * SIZE(BB), %xmm2 + addps %xmm3, %xmm4 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm2, %xmm5 + movaps -4 * SIZE(AA), %xmm2 + mulps %xmm2, %xmm3 + mulps 28 * SIZE(BB), %xmm2 + addps %xmm3, %xmm6 + movaps 48 * SIZE(BB), %xmm3 + addps %xmm2, %xmm7 + movaps 16 * SIZE(AA), %xmm2 + + subl $-32 * SIZE, AA + addl $ 64 * SIZE, BB + 
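/* one pass of .L22 covers 8 k-steps of the 4x2 block: A moves 4*8 = 32 elements
   and the broadcast B buffer 2*4*8 = 64; the subl of a negative constant above
   is an add whose 128-byte stride still fits in a sign-extended 8-bit immediate. */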
decl %eax + jne .L22 + ALIGN_4 + +.L25: + movaps ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L28 + ALIGN_4 + +.L26: + mulps %xmm0, %xmm1 + mulps -28 * SIZE(BB), %xmm0 + addps %xmm1, %xmm4 + movaps -24 * SIZE(BB), %xmm1 + addps %xmm0, %xmm5 + movaps -28 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + mulps %xmm3, %xmm4 + mulps %xmm3, %xmm5 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + movhps 2 * SIZE(C1), %xmm0 + + movsd 0 * SIZE(C1, LDC), %xmm1 + movhps 2 * SIZE(C1, LDC), %xmm1 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm5 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhps %xmm4, 2 * SIZE(C1) + movsd %xmm5, 0 * SIZE(C1, LDC) + movhps %xmm5, 2 * SIZE(C1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $4, KK +#endif + + addl $4 * SIZE, C1 + ALIGN_4 + +.L30: + movl M, I + testl $2, I + jle .L40 + +.L31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leal 32 * SIZE + BUFFER, BB +#else + leal 32 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB /* because it's doubled */ +#endif + + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movsd -32 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movsd -24 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movsd -16 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L35 + ALIGN_4 + +.L32: + mulps %xmm0, %xmm1 + mulps -28 * SIZE(BB), %xmm0 + addps %xmm1, %xmm4 + movsd -24 * SIZE(BB), %xmm1 + addps %xmm0, %xmm5 + movsd -30 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm1 + mulps -20 * SIZE(BB), %xmm0 + addps %xmm1, %xmm6 + movsd 0 * SIZE(BB), %xmm1 + addps %xmm0, %xmm7 + movsd -28 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps -12 * SIZE(BB), %xmm0 + addps %xmm3, %xmm4 + movsd -8 * SIZE(BB), %xmm3 + addps %xmm0, %xmm5 + movsd -26 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps -4 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movsd 16 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movsd -16 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + mulps 4 * SIZE(BB), %xmm2 + addps %xmm1, %xmm4 + movsd 8 * SIZE(BB), %xmm1 + addps %xmm2, %xmm5 + movsd -22 * SIZE(AA), %xmm2 + mulps %xmm2, %xmm1 + mulps 12 * SIZE(BB), %xmm2 + addps %xmm1, %xmm6 + movsd 32 * SIZE(BB), %xmm1 + addps %xmm2, %xmm7 + movsd -20 * SIZE(AA), %xmm2 + mulps %xmm2, %xmm3 + mulps 20 * SIZE(BB), %xmm2 + addps %xmm3, %xmm4 + movsd 24 * SIZE(BB), %xmm3 + addps %xmm2, %xmm5 + movsd -18 * SIZE(AA), %xmm2 + mulps %xmm2, %xmm3 + mulps 28 * SIZE(BB), %xmm2 + addps %xmm3, %xmm6 + movsd 48 * SIZE(BB), %xmm3 + addps %xmm2, %xmm7 + movsd -8 * SIZE(AA), %xmm2 + + subl $-16 * SIZE, AA + addl $ 64 * SIZE, BB + decl %eax + jne .L32 + ALIGN_4 + +.L35: + movsd ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L38 + ALIGN_4 + +.L36: + 
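/* K & 7 tail for the 2-row strip: each pass folds one k-step into the 2x2 block,
   with xmm4 accumulating column 0 and xmm5 column 1; A advances 2 elements and
   the broadcast B buffer 8 (2 values x 4 copies) per pass. */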
mulps %xmm0, %xmm1 + mulps -28 * SIZE(BB), %xmm0 + addps %xmm1, %xmm4 + movsd -24 * SIZE(BB), %xmm1 + addps %xmm0, %xmm5 + movsd -30 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L36 + ALIGN_4 + +.L38: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + mulps %xmm3, %xmm4 + mulps %xmm3, %xmm5 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + movsd 0 * SIZE(C1, LDC), %xmm1 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm5 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movsd %xmm5, 0 * SIZE(C1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, C1 + ALIGN_4 + +.L40: + movl M, I + testl $1, I + jle .L49 + +.L41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leal 32 * SIZE + BUFFER, BB +#else + leal 32 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB /* because it's doubled */ +#endif + + movss -32 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movss -32 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movss -28 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movss -16 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L45 + ALIGN_4 + +.L42: + mulss %xmm0, %xmm1 + mulss -28 * SIZE(BB), %xmm0 + addss %xmm1, %xmm4 + movss -24 * SIZE(BB), %xmm1 + addss %xmm0, %xmm5 + movss -31 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm1 + mulss -20 * SIZE(BB), %xmm0 + addss %xmm1, %xmm6 + movss 0 * SIZE(BB), %xmm1 + addss %xmm0, %xmm7 + movss -30 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss -12 * SIZE(BB), %xmm0 + addss %xmm3, %xmm4 + movss -8 * SIZE(BB), %xmm3 + addss %xmm0, %xmm5 + movss -29 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss -4 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 16 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss -24 * SIZE(AA), %xmm0 + mulss %xmm2, %xmm1 + mulss 4 * SIZE(BB), %xmm2 + addss %xmm1, %xmm4 + movss 8 * SIZE(BB), %xmm1 + addss %xmm2, %xmm5 + movss -27 * SIZE(AA), %xmm2 + mulss %xmm2, %xmm1 + mulss 12 * SIZE(BB), %xmm2 + addss %xmm1, %xmm6 + movss 32 * SIZE(BB), %xmm1 + addss %xmm2, %xmm7 + movss -26 * SIZE(AA), %xmm2 + mulss %xmm2, %xmm3 + mulss 20 * SIZE(BB), %xmm2 + addss %xmm3, %xmm4 + movss 24 * SIZE(BB), %xmm3 + addss %xmm2, %xmm5 + movss -25 * SIZE(AA), %xmm2 + mulss %xmm2, %xmm3 + mulss 28 * SIZE(BB), %xmm2 + addss %xmm3, %xmm6 + movss 48 * SIZE(BB), %xmm3 + addss %xmm2, %xmm7 + movss -20 * SIZE(AA), %xmm2 + + subl $-8 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L42 + ALIGN_4 + +.L45: + movss ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L48 + ALIGN_4 + +.L46: + mulss %xmm0, %xmm1 + mulss -28 * SIZE(BB), %xmm0 + addss %xmm1, %xmm4 + movss -24 * SIZE(BB), %xmm1 + addss %xmm0, %xmm5 + movss -31 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L46 + ALIGN_4 + +.L48: + addss %xmm6, %xmm4 + addss %xmm7, %xmm5 + + mulss %xmm3, %xmm4 + mulss %xmm3, 
%xmm5 + +#ifndef TRMMKERNEL + movss 0 * SIZE(C1), %xmm0 + movss 0 * SIZE(C1, LDC), %xmm1 + + addss %xmm0, %xmm4 + addss %xmm1, %xmm5 +#endif + + movss %xmm4, 0 * SIZE(C1) + movss %xmm5, 0 * SIZE(C1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_4 + +.L49: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leal (, LDC, 2), %eax + addl %eax, C + decl J + jg .L01 + ALIGN_4 + +.L50: + movl N, %eax + testl $1, %eax + jle .L999 + ALIGN_4 + +.L51: + leal 32 * SIZE + BUFFER, BB + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + sarl $3, %eax + jle .L55 + ALIGN_4 + +.L52: + movss -32 * SIZE(B), %xmm0 + movss -31 * SIZE(B), %xmm1 + movss -30 * SIZE(B), %xmm2 + movss -29 * SIZE(B), %xmm3 + movss -28 * SIZE(B), %xmm4 + movss -27 * SIZE(B), %xmm5 + movss -26 * SIZE(B), %xmm6 + movss -25 * SIZE(B), %xmm7 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + shufps $0, %xmm4, %xmm4 + shufps $0, %xmm5, %xmm5 + shufps $0, %xmm6, %xmm6 + shufps $0, %xmm7, %xmm7 + + movaps %xmm0, -32 * SIZE(BB) + movaps %xmm1, -28 * SIZE(BB) + movaps %xmm2, -24 * SIZE(BB) + movaps %xmm3, -20 * SIZE(BB) + movaps %xmm4, -16 * SIZE(BB) + movaps %xmm5, -12 * SIZE(BB) + movaps %xmm6, -8 * SIZE(BB) + movaps %xmm7, -4 * SIZE(BB) + + addl $ 8 * SIZE, B + subl $-32 * SIZE, BB + decl %eax + jne .L52 + ALIGN_4 + +.L55: + movl K, %eax + andl $7, %eax + BRANCH + jle .L60 + ALIGN_4 + +.L56: + movss -32 * SIZE(B), %xmm0 + shufps $0, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(BB) + + addl $1 * SIZE, B + addl $4 * SIZE, BB + decl %eax + jne .L56 + ALIGN_4 + +.L60: + movl C, C1 + movl A, AA + movl M, I + sarl $3, I + jle .L70 + ALIGN_4 + +.L61: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leal 32 * SIZE + BUFFER, BB +#else + leal 32 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 2), BB /* because it's doubled */ +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movaps -16 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movaps -16 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + + prefetcht0 3 * SIZE(C1) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $8, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L65 + ALIGN_4 + +.L62: + mulps %xmm1, %xmm0 + mulps -28 * SIZE(AA), %xmm1 + addps %xmm0, %xmm4 + movaps -24 * SIZE(AA), %xmm0 + addps %xmm1, %xmm6 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm0 + mulps -20 * SIZE(AA), %xmm1 + addps %xmm0, %xmm5 + movaps 0 * SIZE(AA), %xmm0 + addps %xmm1, %xmm7 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm2 + mulps -12 * SIZE(AA), %xmm1 + addps %xmm2, %xmm4 + movaps -8 * SIZE(AA), %xmm2 + addps %xmm1, %xmm6 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm2 + mulps -4 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 16 * SIZE(AA), %xmm2 + addps %xmm1, %xmm7 + movaps 0 * SIZE(BB), %xmm1 + mulps 
%xmm3, %xmm0 + mulps 4 * SIZE(AA), %xmm3 + addps %xmm0, %xmm4 + movaps 8 * SIZE(AA), %xmm0 + addps %xmm3, %xmm6 + movaps -12 * SIZE(BB), %xmm3 + mulps %xmm3, %xmm0 + mulps 12 * SIZE(AA), %xmm3 + addps %xmm0, %xmm5 + movaps 32 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps -8 * SIZE(BB), %xmm3 + mulps %xmm3, %xmm2 + mulps 20 * SIZE(AA), %xmm3 + addps %xmm2, %xmm4 + movaps 24 * SIZE(AA), %xmm2 + addps %xmm3, %xmm6 + movaps -4 * SIZE(BB), %xmm3 + mulps %xmm3, %xmm2 + mulps 28 * SIZE(AA), %xmm3 + addps %xmm2, %xmm5 + movaps 48 * SIZE(AA), %xmm2 + addps %xmm3, %xmm7 + movaps 16 * SIZE(BB), %xmm3 + + addl $ 64 * SIZE, AA + subl $-32 * SIZE, BB + decl %eax + jne .L62 + ALIGN_4 + +.L65: + movaps ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulps %xmm1, %xmm0 + mulps -28 * SIZE(AA), %xmm1 + addps %xmm0, %xmm4 + movaps -24 * SIZE(AA), %xmm0 + addps %xmm1, %xmm6 + movaps -28 * SIZE(BB), %xmm1 + + addl $8 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L66 + ALIGN_4 + +.L68: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + + mulps %xmm3, %xmm4 + mulps %xmm3, %xmm6 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + movhps 2 * SIZE(C1), %xmm0 + movsd 4 * SIZE(C1), %xmm2 + movhps 6 * SIZE(C1), %xmm2 + + addps %xmm0, %xmm4 + addps %xmm2, %xmm6 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhps %xmm4, 2 * SIZE(C1) + movsd %xmm6, 4 * SIZE(C1) + movhps %xmm6, 6 * SIZE(C1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $8, KK +#endif + + addl $8 * SIZE, C1 + decl I + jg .L61 + ALIGN_4 + +.L70: + movl M, I + testl $4, I + jle .L80 + +.L71: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leal 32 * SIZE + BUFFER, BB +#else + leal 32 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB /* because it's doubled */ +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movaps -16 * SIZE(AA), %xmm2 + movaps -16 * SIZE(BB), %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + mulps %xmm0, %xmm1 + movaps -28 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm1 + movaps -24 * SIZE(AA), %xmm0 + addps %xmm1, %xmm5 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm1 + movaps -20 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm1 + movaps 0 * SIZE(AA), %xmm0 + addps %xmm1, %xmm5 + movaps 0 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + movaps -12 * SIZE(AA), %xmm2 + addps %xmm3, %xmm4 + movaps -12 * SIZE(BB), %xmm3 + mulps %xmm2, %xmm3 + movaps -8 * SIZE(AA), %xmm2 + addps %xmm3, %xmm5 + movaps -8 * SIZE(BB), %xmm3 + mulps %xmm2, %xmm3 + movaps -4 * SIZE(AA), %xmm2 + addps %xmm3, %xmm4 + movaps -4 * SIZE(BB), %xmm3 + mulps %xmm2, %xmm3 + movaps 16 * SIZE(AA), %xmm2 + addps %xmm3, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + + subl 
$-32 * SIZE, AA + subl $-32 * SIZE, BB + decl %eax + jne .L72 + ALIGN_4 + +.L75: + movaps ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulps %xmm0, %xmm1 + movaps -28 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: + addps %xmm5, %xmm4 + mulps %xmm3, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + movhps 2 * SIZE(C1), %xmm0 + + addps %xmm0, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhps %xmm4, 2 * SIZE(C1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $4, KK +#endif + + addl $4 * SIZE, C1 + ALIGN_4 + +.L80: + movl M, I + testl $2, I + jle .L90 + +.L81: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leal 32 * SIZE + BUFFER, BB +#else + leal 32 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB /* because it's doubled */ +#endif + + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movsd -32 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movsd -16 * SIZE(BB), %xmm3 + movsd -24 * SIZE(AA), %xmm2 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L85 + ALIGN_4 + +.L82: + mulps %xmm0, %xmm1 + movsd -30 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movsd -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm1 + movsd -28 * SIZE(AA), %xmm0 + addps %xmm1, %xmm5 + movsd -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm1 + movsd -26 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movsd -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm1 + movsd -16 * SIZE(AA), %xmm0 + addps %xmm1, %xmm5 + movsd -0 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + movsd -22 * SIZE(AA), %xmm2 + addps %xmm3, %xmm4 + movsd -12 * SIZE(BB), %xmm3 + mulps %xmm2, %xmm3 + movsd -20 * SIZE(AA), %xmm2 + addps %xmm3, %xmm5 + movsd -8 * SIZE(BB), %xmm3 + mulps %xmm2, %xmm3 + movsd -18 * SIZE(AA), %xmm2 + addps %xmm3, %xmm4 + movsd -4 * SIZE(BB), %xmm3 + mulps %xmm2, %xmm3 + movsd -8 * SIZE(AA), %xmm2 + addps %xmm3, %xmm5 + movsd 16 * SIZE(BB), %xmm3 + + subl $-16 * SIZE, AA + subl $-32 * SIZE, BB + decl %eax + jne .L82 + ALIGN_4 + +.L85: + movsd ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L88 + ALIGN_4 + +.L86: + mulps %xmm0, %xmm1 + movsd -30 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movsd -28 * SIZE(BB), %xmm1 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L86 + ALIGN_4 + +.L88: + addps %xmm5, %xmm4 + mulps %xmm3, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + addps %xmm0, %xmm4 +#endif + movsd %xmm4, 0 * SIZE(C1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + 
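/* done with the 2-row strip of the last (odd) column; C1 is bumped past the two
   results next, and .L90 below picks up a possible final single row. */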
addl $2 * SIZE, C1 + ALIGN_4 + +.L90: + movl M, I + testl $1, I + jle .L99 + +.L91: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leal 32 * SIZE + BUFFER, BB +#else + leal 32 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB /* because it's doubled */ +#endif + + movss -32 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movss -32 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movss -16 * SIZE(BB), %xmm3 + movss -28 * SIZE(AA), %xmm2 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L95 + ALIGN_4 + +.L92: + mulss %xmm0, %xmm1 + movss -31 * SIZE(AA), %xmm0 + addss %xmm1, %xmm4 + movss -28 * SIZE(BB), %xmm1 + mulss %xmm0, %xmm1 + movss -30 * SIZE(AA), %xmm0 + addss %xmm1, %xmm5 + movss -24 * SIZE(BB), %xmm1 + mulss %xmm0, %xmm1 + movss -29 * SIZE(AA), %xmm0 + addss %xmm1, %xmm4 + movss -20 * SIZE(BB), %xmm1 + mulss %xmm0, %xmm1 + movss -24 * SIZE(AA), %xmm0 + addss %xmm1, %xmm5 + movss -0 * SIZE(BB), %xmm1 + mulss %xmm2, %xmm3 + movss -27 * SIZE(AA), %xmm2 + addss %xmm3, %xmm4 + movss -12 * SIZE(BB), %xmm3 + mulss %xmm2, %xmm3 + movss -26 * SIZE(AA), %xmm2 + addss %xmm3, %xmm5 + movss -8 * SIZE(BB), %xmm3 + mulss %xmm2, %xmm3 + movss -25 * SIZE(AA), %xmm2 + addss %xmm3, %xmm4 + movss -4 * SIZE(BB), %xmm3 + mulss %xmm2, %xmm3 + movss -20 * SIZE(AA), %xmm2 + addss %xmm3, %xmm5 + movss 16 * SIZE(BB), %xmm3 + + subl $ -8 * SIZE, AA + subl $-32 * SIZE, BB + decl %eax + jne .L92 + ALIGN_4 + +.L95: + movss ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L98 + ALIGN_4 + +.L96: + mulss %xmm0, %xmm1 + movss -31 * SIZE(AA), %xmm0 + addss %xmm1, %xmm4 + movss -28 * SIZE(BB), %xmm1 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L96 + ALIGN_4 + +.L98: + addss %xmm5, %xmm4 + mulss %xmm3, %xmm4 + +#ifndef TRMMKERNEL + movss 0 * SIZE(C1), %xmm0 + addss %xmm0, %xmm4 +#endif + movss %xmm4, 0 * SIZE(C1) + ALIGN_4 + +.L99: + addl LDC, C + ALIGN_4 + + +.L999: + movl OLD_STACK, %esp + + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/gemm_kernel_8x2_sse.S b/kernel/x86/gemm_kernel_8x2_sse.S new file mode 100644 index 0000000000..c3897646be --- /dev/null +++ b/kernel/x86/gemm_kernel_8x2_sse.S @@ -0,0 +1,2746 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if !defined(HAVE_SSE) || !defined(HAVE_MMX) +#error You have to check your configuration. +#endif + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_ALPHA 16 + STACK + ARGS(%esi) +#define STACK_A 20 + STACK + ARGS(%esi) +#define STACK_B 24 + STACK + ARGS(%esi) +#define STACK_C 28 + STACK + ARGS(%esi) +#define STACK_LDC 32 + STACK + ARGS(%esi) +#define STACK_OFFT 36 + STACK + ARGS(%esi) + +#define ALPHA 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define BUFFER 128(%esp) + +#define B %edi +#define LDC %ebp + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#define PREFETCHSIZE 48 /* for PIII */ + +#define AA %edx +#define BB %ecx + +#if !defined(HAVE_SSE2) || defined(OPTERON) +#define movsd movlps +#endif + +#ifdef HAVE_SSE2 +#define xorps pxor +#endif + +#define KERNEL1(address) \ + mulps %xmm0, %xmm2; \ + mulps 4 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ + addps %xmm2, %xmm4; \ + movaps 0 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ + addps %xmm0, %xmm5; \ + movaps 4 * SIZE + (address) * SIZE * 2(AA), %xmm0; \ + mulps %xmm0, %xmm2; \ + mulps 4 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 8 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 8 * SIZE + (address) * SIZE * 2(AA), %xmm0 + +#define KERNEL2(address) \ + mulps %xmm0, %xmm2; \ + mulps 12 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ + addps %xmm2, %xmm4; \ + movaps 8 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ + addps %xmm0, %xmm5; \ + movaps 12 * SIZE + (address) * SIZE * 2(AA), %xmm0; \ + mulps %xmm0, %xmm2; \ + mulps 12 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 32 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 32 * SIZE + (address) * SIZE * 2(AA), %xmm0 + +#define KERNEL3(address) \ + mulps %xmm1, %xmm3; \ + mulps 20 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ + addps %xmm3, %xmm4; \ + movaps 16 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ + addps %xmm1, %xmm5; \ + movaps 
20 * SIZE + (address) * SIZE * 2(AA), %xmm1; \ + mulps %xmm1, %xmm3; \ + mulps 20 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 24 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 24 * SIZE + (address) * SIZE * 2(AA), %xmm1 + +#define KERNEL4(address) \ + mulps %xmm1, %xmm3; \ + mulps 28 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ + addps %xmm3, %xmm4; \ + movaps 24 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ + addps %xmm1, %xmm5; \ + movaps 28 * SIZE + (address) * SIZE * 2(AA), %xmm1; \ + mulps %xmm1, %xmm3; \ + mulps 28 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 48 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 48 * SIZE + (address) * SIZE * 2(AA), %xmm1 + +#define KERNEL5(address) \ + mulps %xmm0, %xmm2; \ + mulps 36 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ + addps %xmm2, %xmm4; \ + movaps 32 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ + addps %xmm0, %xmm5; \ + movaps 36 * SIZE + (address) * SIZE * 2(AA), %xmm0; \ + mulps %xmm0, %xmm2; \ + mulps 36 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 40 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 40 * SIZE + (address) * SIZE * 2(AA), %xmm0 + +#define KERNEL6(address) \ + mulps %xmm0, %xmm2; \ + mulps 44 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ + addps %xmm2, %xmm4; \ + movaps 40 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ + addps %xmm0, %xmm5; \ + movaps 44 * SIZE + (address) * SIZE * 2(AA), %xmm0; \ + mulps %xmm0, %xmm2; \ + mulps 44 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 64 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 64 * SIZE + (address) * SIZE * 2(AA), %xmm0 + +#define KERNEL7(address) \ + mulps %xmm1, %xmm3; \ + mulps 52 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ + addps %xmm3, %xmm4; \ + movaps 48 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ + addps %xmm1, %xmm5; \ + movaps 52 * SIZE + (address) * SIZE * 2(AA), %xmm1; \ + mulps %xmm1, %xmm3; \ + mulps 52 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 56 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 56 * SIZE + (address) * SIZE * 2(AA), %xmm1 + +#define KERNEL8(address) \ + mulps %xmm1, %xmm3; \ + mulps 60 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ + addps %xmm3, %xmm4; \ + movaps 56 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ + addps %xmm1, %xmm5; \ + movaps 60 * SIZE + (address) * SIZE * 2(AA), %xmm1; \ + mulps %xmm1, %xmm3; \ + mulps 60 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 80 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 80 * SIZE + (address) * SIZE * 2(AA), %xmm1 + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movd STACK_M, %mm0 + movl STACK_N, %eax + movd STACK_K, %mm1 + movd STACK_A, %mm2 + movd STACK_ALPHA, %mm7 + movl STACK_B, B + movd STACK_C, %mm3 + movl STACK_LDC, LDC +#ifdef TRMMKERNEL + movd STACK_OFFT, %mm4 +#endif + + movd %mm7, 0 * SIZE + ALPHA + movd %mm7, 1 * SIZE + ALPHA + movd %mm7, 2 * SIZE + ALPHA + movd %mm7, 3 * SIZE + ALPHA + + movd %mm1, K + movl %eax, N + movd %mm0, M + movd %mm2, A + movd %mm3, C + movl %esi, OLD_STACK +#ifdef TRMMKERNEL + movd %mm4, OFFSET + movd 
%mm4, KK +#ifndef LEFT + negl KK +#endif +#endif + + leal (, LDC, SIZE), LDC + + sarl $1, %eax # j = (n >> 1) + movl %eax, J + jle .L100 + ALIGN_2 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + +/* Copying to Sub Buffer */ + movl K, %eax + leal BUFFER, %ecx + sarl $2, %eax + jle .L03 + ALIGN_4 + +.L02: + movss 0 * SIZE(B), %xmm0 + movss 1 * SIZE(B), %xmm1 + movss 2 * SIZE(B), %xmm2 + movss 3 * SIZE(B), %xmm3 + movss 4 * SIZE(B), %xmm4 + movss 5 * SIZE(B), %xmm5 + movss 6 * SIZE(B), %xmm6 + movss 7 * SIZE(B), %xmm7 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + shufps $0, %xmm4, %xmm4 + shufps $0, %xmm5, %xmm5 + shufps $0, %xmm6, %xmm6 + shufps $0, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + movaps %xmm2, 8 * SIZE(%ecx) + movaps %xmm3, 12 * SIZE(%ecx) + movaps %xmm4, 16 * SIZE(%ecx) + movaps %xmm5, 20 * SIZE(%ecx) + movaps %xmm6, 24 * SIZE(%ecx) + movaps %xmm7, 28 * SIZE(%ecx) + + prefetcht0 104 * SIZE(B) + + addl $ 8 * SIZE, B + addl $32 * SIZE, %ecx + decl %eax + BRANCH + jne .L02 + ALIGN_2 + +.L03: + movl K, %eax + andl $3, %eax + BRANCH + jle .L05 + ALIGN_2 + +.L04: + movss 0 * SIZE(B), %xmm0 + movss 1 * SIZE(B), %xmm1 + addl $2 * SIZE, B + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + addl $8 * SIZE, %ecx + decl %eax + jne .L04 + ALIGN_4 + +.L05: + movl C, %esi # coffset = c + movl A, AA # aoffset = a + movl M, %ebx + sarl $3, %ebx # i = (m >> 2) + jle .L30 + ALIGN_4 + +.L10: +#ifdef PENTIUM4 +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movaps 0 * SIZE + BUFFER, %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE + BUFFER, %xmm3 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#endif + + prefetchnta 7 * SIZE(%esi) + prefetchnta 7 * SIZE(%esi, %ebp) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $8, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + andl $-8, %eax + NOBRANCH + je .L12 + sall $3, %eax + +.L1X: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + cmpl $64 * 1, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 1) + KERNEL2(32 * 1) + KERNEL3(32 * 1) + KERNEL4(32 * 1) + KERNEL5(32 * 1) + KERNEL6(32 * 1) + KERNEL7(32 * 1) + KERNEL8(32 * 1) + cmpl $64 * 2, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 2) + KERNEL2(32 * 2) + KERNEL3(32 * 2) + KERNEL4(32 * 2) + KERNEL5(32 * 2) + KERNEL6(32 * 2) + KERNEL7(32 * 2) + KERNEL8(32 * 2) + cmpl $64 * 3, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 3) + KERNEL2(32 * 3) + KERNEL3(32 * 3) + KERNEL4(32 * 3) + KERNEL5(32 * 3) + KERNEL6(32 * 3) + KERNEL7(32 * 3) + KERNEL8(32 * 3) + cmpl $64 * 4, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 4) + 
KERNEL2(32 * 4) + KERNEL3(32 * 4) + KERNEL4(32 * 4) + KERNEL5(32 * 4) + KERNEL6(32 * 4) + KERNEL7(32 * 4) + KERNEL8(32 * 4) + cmpl $64 * 5, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 5) + KERNEL2(32 * 5) + KERNEL3(32 * 5) + KERNEL4(32 * 5) + KERNEL5(32 * 5) + KERNEL6(32 * 5) + KERNEL7(32 * 5) + KERNEL8(32 * 5) + cmpl $64 * 6, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 6) + KERNEL2(32 * 6) + KERNEL3(32 * 6) + KERNEL4(32 * 6) + KERNEL5(32 * 6) + KERNEL6(32 * 6) + KERNEL7(32 * 6) + KERNEL8(32 * 6) + cmpl $64 * 7, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 7) + KERNEL2(32 * 7) + KERNEL3(32 * 7) + KERNEL4(32 * 7) + KERNEL5(32 * 7) + KERNEL6(32 * 7) + KERNEL7(32 * 7) + KERNEL8(32 * 7) + + addl $64 * 8 * SIZE, AA + addl $64 * 8 * SIZE, BB + subl $64 * 8, %eax + BRANCH + jg .L1X + +.L11: + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB + +#else +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movaps 0 * SIZE + BUFFER, %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 8 * SIZE + BUFFER, %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 8 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#endif + + prefetchnta 8 * SIZE(%esi) + prefetchnta 8 * SIZE(%esi, %ebp) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $8, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L12 + ALIGN_2 + +.L11: +#ifdef CORE_KATMAI + prefetcht0 PREFETCHSIZE * SIZE(AA) +#endif + + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 0 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 16 * SIZE(AA), %xmm0 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) +#endif + + mulps %xmm1, %xmm3 + mulps 12 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 8 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 12 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 12 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 24 * SIZE(AA), %xmm1 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) +#endif + + mulps %xmm0, %xmm2 + mulps 20 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 16 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 20 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 20 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 24) * SIZE(AA) +#endif + + mulps %xmm1, %xmm3 + mulps 28 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 24 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 28 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 28 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 40 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 40 * SIZE(AA), %xmm1 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 32) * 
SIZE(AA) +#endif + + mulps %xmm0, %xmm2 + mulps 36 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 32 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 36 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 36 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 48 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 48 * SIZE(AA), %xmm0 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 40) * SIZE(AA) +#endif + + mulps %xmm1, %xmm3 + mulps 44 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 40 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 44 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 44 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 56 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 56 * SIZE(AA), %xmm1 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 48) * SIZE(AA) +#endif + + mulps %xmm0, %xmm2 + mulps 52 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 48 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 52 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 52 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 64 * SIZE(AA), %xmm0 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 56) * SIZE(AA) +#endif + + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 56 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 60 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 72 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 72 * SIZE(AA), %xmm1 + + addl $64 * SIZE, BB + addl $64 * SIZE, AA + decl %eax + jne .L11 + ALIGN_2 +#endif + +.L12: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + +.L13: + movaps 4 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 0 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm1 + movaps 4 * SIZE(AA), %xmm0 + addps %xmm1, %xmm5 + movaps 4 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm1 + movaps 8 * SIZE(AA), %xmm0 + addps %xmm1, %xmm7 + + addl $8 * SIZE, AA + addl $8 * SIZE, BB + subl $1, %eax + jg .L13 + ALIGN_4 + +.L14: + mulps %xmm3, %xmm4 + mulps %xmm3, %xmm5 + mulps %xmm3, %xmm6 + mulps %xmm3, %xmm7 + +#ifndef TRMMKERNEL + shufps $0xe4, %xmm4, %xmm4 + shufps $0xe4, %xmm5, %xmm5 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + movsd 4 * SIZE(%esi), %xmm1 + movhps 6 * SIZE(%esi), %xmm1 + + shufps $0xe4, %xmm6, %xmm6 + shufps $0xe4, %xmm7, %xmm7 + + movsd 0 * SIZE(%esi, LDC), %xmm2 + movhps 2 * SIZE(%esi, LDC), %xmm2 + movsd 4 * SIZE(%esi, LDC), %xmm3 + movhps 6 * SIZE(%esi, LDC), %xmm3 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm6 + addps %xmm2, %xmm5 + addps %xmm3, %xmm7 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movhps %xmm4, 2 * SIZE(%esi) + movsd %xmm6, 4 * SIZE(%esi) + movhps %xmm6, 6 * SIZE(%esi) + + movsd %xmm5, 0 * SIZE(%esi, LDC) + movhps %xmm5, 2 * SIZE(%esi, LDC) + movsd %xmm7, 4 * SIZE(%esi, LDC) + movhps %xmm7, 6 * SIZE(%esi, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $8, KK +#endif + + addl $8 * SIZE, %esi + BRANCH + decl %ebx # i -- + jg .L10 + ALIGN_2 + +.L30: + movl M, %ebx + andl $7, %ebx + jle .L99 + + testl $4, %ebx + jle .L50 + +#if (L1_DATA_LINESIZE == 64) +#if !defined(TRMMKERNEL) || \ + 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movaps 0 * SIZE + BUFFER, %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE + BUFFER, %xmm3 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L32 + ALIGN_2 + +.L31: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 8 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps 20 * SIZE(BB), %xmm0 + addps %xmm3, %xmm4 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm0, %xmm5 + movaps 12 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps 28 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 48 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm2 + mulps 36 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 40 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 20 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm2 + mulps 44 * SIZE(BB), %xmm1 + addps %xmm2, %xmm6 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm1, %xmm7 + movaps 24 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 52 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 56 * SIZE(BB), %xmm3 + addps %xmm1, %xmm5 + movaps 28 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + addps %xmm3, %xmm6 + movaps 80 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 48 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L31 + ALIGN_2 + +#else +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movaps 0 * SIZE + BUFFER, %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 8 * SIZE + BUFFER, %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 8 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L32 + ALIGN_2 + +.L31: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + mulps 
%xmm0, %xmm3 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movaps 16 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm2 + mulps 20 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 12 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 28 * SIZE(BB), %xmm1 + addps %xmm3, %xmm6 + movaps 40 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 24 * SIZE(AA), %xmm1 + mulps %xmm0, %xmm2 + mulps 36 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 48 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 20 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps 44 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 56 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm2 + mulps 52 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 28 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + addps %xmm3, %xmm6 + movaps 72 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 40 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L31 + ALIGN_2 +#endif + +.L32: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L34 + +.L33: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L33 + ALIGN_4 + +.L34: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + mulps %xmm3, %xmm4 + mulps %xmm3, %xmm5 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + addps %xmm0, %xmm4 + + movsd 0 * SIZE(%esi, LDC), %xmm0 + movhps 2 * SIZE(%esi, LDC), %xmm0 + addps %xmm0, %xmm5 +#endif + +#ifdef HAVE_SSE2 + movsd %xmm4, 0 * SIZE(%esi) + unpckhpd %xmm4, %xmm4 + movsd %xmm4, 2 * SIZE(%esi) + + movsd %xmm5, 0 * SIZE(%esi, LDC) + unpckhpd %xmm5, %xmm5 + movsd %xmm5, 2 * SIZE(%esi, LDC) +#else + movlps %xmm4, 0 * SIZE(%esi) + movhps %xmm4, 2 * SIZE(%esi) + + movlps %xmm5, 0 * SIZE(%esi, LDC) + movhps %xmm5, 2 * SIZE(%esi, LDC) +#endif + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $4, KK +#endif + + addl $4 * SIZE, %esi + ALIGN_2 + +.L50: + testl $2, %ebx + jle .L70 + + +#if (L1_DATA_LINESIZE == 64) +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movaps 0 * SIZE + BUFFER, %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE + BUFFER, %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax 
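/* LEFT counts the 2 rows of this strip, the #else below the 2 columns of the
   J pair; for a 2x2 tile the two adjustments happen to coincide. */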
+#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L52 + ALIGN_2 + +.L51: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movaps 44 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 64 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 80 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L51 + ALIGN_2 + +#else +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movaps 0 * SIZE + BUFFER, %xmm2 + xorps %xmm4, %xmm4 + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 8 * SIZE + BUFFER, %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 8 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L52 + ALIGN_2 + +.L51: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 16 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 12 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 20 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 40 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 48 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 44 * SIZE(BB), 
%xmm3 + mulps %xmm1, %xmm3 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 52 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 64 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 72 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L51 + ALIGN_2 +#endif + +.L52: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L54 + +.L53: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L53 + ALIGN_4 + +.L54: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + mulps %xmm3, %xmm4 + mulps %xmm3, %xmm5 + +#ifndef TRMMKERNEL +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(%esi), %xmm0 + addps %xmm0, %xmm4 +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(%esi, LDC), %xmm0 + addps %xmm0, %xmm5 +#endif + + movlps %xmm4, 0 * SIZE(%esi) + movlps %xmm5, 0 * SIZE(%esi, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, %esi + ALIGN_2 + +.L70: + testl $1, %ebx + jle .L99 + +#if (L1_DATA_LINESIZE == 64) +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movss 0 * SIZE + BUFFER, %xmm2 + xorps %xmm4, %xmm4 + movss 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movss 16 * SIZE + BUFFER, %xmm3 + xorps %xmm6, %xmm6 + movss 4 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB /* because it's doubled */ + + movss 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movss 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movss 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movss 4 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L72 + ALIGN_2 + +.L71: + mulss %xmm0, %xmm2 + mulss 4 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 8 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 1 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm2 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 32 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 2 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 20 * SIZE(BB), %xmm0 + addss %xmm3, %xmm4 + movss 24 * SIZE(BB), %xmm3 + addss %xmm0, %xmm5 + movss 3 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 28 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 48 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm2 + mulss 36 * SIZE(BB), %xmm1 + addss %xmm2, %xmm4 + movss 40 * SIZE(BB), %xmm2 + addss %xmm1, %xmm5 + movss 5 * SIZE(AA), %xmm1 + mulss 
%xmm1, %xmm2 + mulss 44 * SIZE(BB), %xmm1 + addss %xmm2, %xmm6 + movss 64 * SIZE(BB), %xmm2 + addss %xmm1, %xmm7 + movss 6 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 52 * SIZE(BB), %xmm1 + addss %xmm3, %xmm4 + movss 56 * SIZE(BB), %xmm3 + addss %xmm1, %xmm5 + movss 7 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 60 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 80 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L71 + ALIGN_2 + +#else +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movss 0 * SIZE + BUFFER, %xmm2 + xorps %xmm4, %xmm4 + movss 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movss 8 * SIZE + BUFFER, %xmm3 + xorps %xmm6, %xmm6 + movss 4 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB /* because it's doubled */ + + movss 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movss 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movss 8 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movss 4 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L72 + ALIGN_2 + +.L71: + mulss %xmm0, %xmm2 + mulss 4 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 16 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 1 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 24 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 2 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm2 + mulss 20 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 32 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 3 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 28 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 40 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm2 + mulss 36 * SIZE(BB), %xmm1 + addss %xmm2, %xmm4 + movss 48 * SIZE(BB), %xmm2 + addss %xmm1, %xmm5 + movss 5 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 44 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 56 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 6 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm2 + mulss 52 * SIZE(BB), %xmm1 + addss %xmm2, %xmm4 + movss 64 * SIZE(BB), %xmm2 + addss %xmm1, %xmm5 + movss 7 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 60 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 72 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L71 + ALIGN_2 +#endif + +.L72: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movss ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L74 + +.L73: + mulss %xmm0, %xmm2 + mulss 4 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 8 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L73 + ALIGN_4 + +.L74: + addss %xmm6, %xmm4 + addss %xmm7, %xmm5 + + mulss %xmm3, %xmm4 + mulss %xmm3, %xmm5 + +#ifndef TRMMKERNEL + addss 0 * SIZE(%esi), %xmm4 + addss 0 * SIZE(%esi, LDC), %xmm5 +#endif + + movss %xmm4, 0 * SIZE(%esi) + movss %xmm5, 0 * SIZE(%esi, LDC) + + addl $1 * SIZE, %esi + +#if (defined(TRMMKERNEL) && 
defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_2 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leal (, LDC, 2), %eax + addl %eax, C # c += 2 * ldc + BRANCH + decl J # j -- + jg .L01 + ALIGN_2 + +.L100: + movl N, %eax + testl $1, %eax + jle .L999 + ALIGN_2 + +.L101: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + +/* Copying to Sub Buffer */ + movl K, %eax + leal BUFFER, %ecx + sarl $3, %eax + jle .L103 + ALIGN_4 + +.L102: + prefetchnta 96 * SIZE(B) + + movss 0 * SIZE(B), %xmm0 + movss 1 * SIZE(B), %xmm1 + movss 2 * SIZE(B), %xmm2 + movss 3 * SIZE(B), %xmm3 + movss 4 * SIZE(B), %xmm4 + movss 5 * SIZE(B), %xmm5 + movss 6 * SIZE(B), %xmm6 + movss 7 * SIZE(B), %xmm7 + addl $ 8 * SIZE, B + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + shufps $0, %xmm4, %xmm4 + shufps $0, %xmm5, %xmm5 + shufps $0, %xmm6, %xmm6 + shufps $0, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + movaps %xmm2, 8 * SIZE(%ecx) + movaps %xmm3, 12 * SIZE(%ecx) + movaps %xmm4, 16 * SIZE(%ecx) + movaps %xmm5, 20 * SIZE(%ecx) + movaps %xmm6, 24 * SIZE(%ecx) + movaps %xmm7, 28 * SIZE(%ecx) + addl $32 * SIZE, %ecx + + decl %eax + BRANCH + jne .L102 + ALIGN_2 + +.L103: + movl K, %eax + andl $7, %eax + BRANCH + jle .L105 + ALIGN_2 + +.L104: + movss 0 * SIZE(B), %xmm0 + addl $1 * SIZE, B + + shufps $0, %xmm0, %xmm0 + + movaps %xmm0, 0 * SIZE(%ecx) + addl $4 * SIZE, %ecx + decl %eax + jne .L104 + ALIGN_4 + +.L105: + movl C, %esi # coffset = c + movl A, AA # aoffset = a + movl M, %ebx + sarl $3, %ebx # i = (m >> 2) + jle .L130 + ALIGN_4 + +.L110: +#if (L1_DATA_LINESIZE == 64) +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movaps 0 * SIZE + BUFFER, %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE + BUFFER, %xmm3 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 2), BB /* because it's doubled */ + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $8, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L112 + ALIGN_2 + +.L111: + mulps %xmm2, %xmm0 + mulps 4 * SIZE(AA), %xmm2 + addps %xmm0, %xmm4 + movaps 8 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm2, %xmm0 + mulps 12 * SIZE(AA), %xmm2 + addps %xmm0, %xmm6 + movaps 32 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm2, %xmm1 + mulps 20 * SIZE(AA), %xmm2 + addps %xmm1, %xmm4 + movaps 24 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm2, %xmm1 + mulps 28 * SIZE(AA), %xmm2 + addps %xmm1, %xmm6 + 
movaps 48 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm3, %xmm0 + mulps 36 * SIZE(AA), %xmm3 + addps %xmm0, %xmm4 + movaps 40 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm3, %xmm0 + mulps 44 * SIZE(AA), %xmm3 + addps %xmm0, %xmm6 + movaps 64 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm3, %xmm1 + mulps 52 * SIZE(AA), %xmm3 + addps %xmm1, %xmm4 + movaps 56 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm3, %xmm1 + mulps 60 * SIZE(AA), %xmm3 + addps %xmm1, %xmm6 + movaps 80 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + + addl $64 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L111 + ALIGN_2 + +#else + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movaps 0 * SIZE + BUFFER, %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 8 * SIZE + BUFFER, %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 2), BB /* because it's doubled */ + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 8 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $8, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L112 + ALIGN_2 + +.L111: + mulps %xmm2, %xmm0 + mulps 4 * SIZE(AA), %xmm2 + addps %xmm0, %xmm4 + movaps 16 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm2, %xmm1 + mulps 12 * SIZE(AA), %xmm2 + addps %xmm1, %xmm6 + movaps 24 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 16 * SIZE(BB), %xmm2 + mulps %xmm3, %xmm0 + mulps 20 * SIZE(AA), %xmm3 + addps %xmm0, %xmm4 + movaps 32 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 12 * SIZE(BB), %xmm3 + mulps %xmm3, %xmm1 + mulps 28 * SIZE(AA), %xmm3 + addps %xmm1, %xmm6 + movaps 40 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm2, %xmm0 + mulps 36 * SIZE(AA), %xmm2 + addps %xmm0, %xmm4 + movaps 48 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 20 * SIZE(BB), %xmm2 + mulps %xmm2, %xmm1 + mulps 44 * SIZE(AA), %xmm2 + addps %xmm1, %xmm6 + movaps 56 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm3, %xmm0 + mulps 52 * SIZE(AA), %xmm3 + addps %xmm0, %xmm4 + movaps 64 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm3, %xmm1 + mulps 60 * SIZE(AA), %xmm3 + addps %xmm1, %xmm6 + movaps 72 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 40 * SIZE(BB), %xmm3 + + addl $64 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L111 + ALIGN_2 +#endif + +.L112: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L114 + +.L113: + movaps 0 * SIZE(BB), %xmm2 + movaps 0 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm0 + addps %xmm0, %xmm4 + mulps 4 * SIZE(AA), %xmm2 + addps %xmm2, %xmm5 + + addl $8 * SIZE, AA + addl $4 * SIZE, BB + subl $1, %eax + jg .L113 + ALIGN_4 + +.L114: + addps 
%xmm6, %xmm4 + addps %xmm7, %xmm5 + + mulps %xmm3, %xmm4 + mulps %xmm3, %xmm5 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + addps %xmm0, %xmm4 + + movsd 4 * SIZE(%esi), %xmm0 + movhps 6 * SIZE(%esi), %xmm0 + addps %xmm0, %xmm5 +#endif + +#ifdef HAVE_SSE2 + movsd %xmm4, 0 * SIZE(%esi) + unpckhpd %xmm4, %xmm4 + movsd %xmm4, 2 * SIZE(%esi) + + movsd %xmm5, 4 * SIZE(%esi) + unpckhpd %xmm5, %xmm5 + movsd %xmm5, 6 * SIZE(%esi) +#else + movlps %xmm4, 0 * SIZE(%esi) + movhps %xmm4, 2 * SIZE(%esi) + + movlps %xmm5, 4 * SIZE(%esi) + movhps %xmm5, 6 * SIZE(%esi) +#endif + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $8, KK +#endif + + addl $8 * SIZE, %esi + BRANCH + decl %ebx # i -- + jg .L110 + ALIGN_2 + +.L130: + movl M, %ebx + andl $7, %ebx + jle .L999 + + testl $4, %ebx + jle .L150 + +#if (L1_DATA_LINESIZE == 64) +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movaps 0 * SIZE + BUFFER, %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE + BUFFER, %xmm3 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB /* because it's doubled */ + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L132 + ALIGN_2 + +.L131: + mulps %xmm0, %xmm2 + movaps 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + mulps 4 * SIZE(BB), %xmm0 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 8 * SIZE(AA), %xmm0 + mulps 8 * SIZE(BB), %xmm0 + addps %xmm0, %xmm6 + movaps 12 * SIZE(AA), %xmm0 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm3 + movaps 20 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + mulps 20 * SIZE(BB), %xmm1 + movaps 48 * SIZE(BB), %xmm3 + addps %xmm1, %xmm5 + movaps 24 * SIZE(AA), %xmm1 + mulps 24 * SIZE(BB), %xmm1 + addps %xmm1, %xmm6 + movaps 28 * SIZE(AA), %xmm1 + mulps 28 * SIZE(BB), %xmm1 + addps %xmm1, %xmm7 + movaps 48 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L131 + ALIGN_2 + +#else +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movaps 0 * SIZE + BUFFER, %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 8 * SIZE + BUFFER, %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB /* because it's doubled */ + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 
+ xorps %xmm5, %xmm5 + movaps 8 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L132 + ALIGN_2 + +.L131: + mulps %xmm0, %xmm2 + movaps 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + mulps 4 * SIZE(BB), %xmm0 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 16 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm3 + movaps 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm6 + mulps 12 * SIZE(BB), %xmm1 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 24 * SIZE(AA), %xmm1 + mulps %xmm0, %xmm2 + movaps 20 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + mulps 20 * SIZE(BB), %xmm0 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 32 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm3 + movaps 28 * SIZE(AA), %xmm1 + addps %xmm3, %xmm6 + mulps 28 * SIZE(BB), %xmm1 + movaps 40 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 40 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L131 + ALIGN_2 +#endif + +.L132: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L134 + +.L133: + movaps 0 * SIZE(BB), %xmm2 + movaps 0 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L133 + ALIGN_4 + +.L134: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + addps %xmm6, %xmm4 + + mulps %xmm3, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + addps %xmm0, %xmm4 +#endif + + movlps %xmm4, 0 * SIZE(%esi) + movhps %xmm4, 2 * SIZE(%esi) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $4, KK +#endif + addl $4 * SIZE, %esi + ALIGN_2 + +.L150: + testl $2, %ebx + jle .L170 + +#if (L1_DATA_LINESIZE == 64) +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movaps 0 * SIZE + BUFFER, %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE + BUFFER, %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB /* because it's doubled */ + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L152 + ALIGN_2 + +.L151: + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + 
mulps %xmm0, %xmm2 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L151 + ALIGN_2 + +#else +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movaps 0 * SIZE + BUFFER, %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 8 * SIZE + BUFFER, %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB /* because it's doubled */ + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 8 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L152 + ALIGN_2 + +.L151: + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 16 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm3, %xmm6 + movaps 12 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm4 + movaps 20 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 40 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L151 + ALIGN_2 +#endif + +.L152: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L154 + +.L153: + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L153 + ALIGN_4 + +.L154: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + addps %xmm6, %xmm4 + + mulps %xmm3, %xmm4 + +#ifndef TRMMKERNEL +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(%esi), %xmm0 + addps %xmm0, %xmm4 +#endif + + movlps %xmm4, 0 * SIZE(%esi) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + addl 
$2 * SIZE, %esi + ALIGN_2 + +.L170: + testl $1, %ebx + jle .L999 + +#if (L1_DATA_LINESIZE == 64) +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movss 0 * SIZE + BUFFER, %xmm2 + xorps %xmm4, %xmm4 + movss 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movss 16 * SIZE + BUFFER, %xmm3 + xorps %xmm6, %xmm6 + movss 4 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movss 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movss 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movss 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movss 4 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L172 + ALIGN_2 + +.L171: + mulss %xmm0, %xmm2 + movss 1 * SIZE(AA), %xmm0 + addss %xmm2, %xmm4 + mulss 4 * SIZE(BB), %xmm0 + movss 32 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 2 * SIZE(AA), %xmm0 + mulss 8 * SIZE(BB), %xmm0 + addss %xmm0, %xmm6 + movss 3 * SIZE(AA), %xmm0 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm3 + movss 5 * SIZE(AA), %xmm1 + addss %xmm3, %xmm4 + mulss 20 * SIZE(BB), %xmm1 + movss 48 * SIZE(BB), %xmm3 + addss %xmm1, %xmm5 + movss 6 * SIZE(AA), %xmm1 + mulss 24 * SIZE(BB), %xmm1 + addss %xmm1, %xmm6 + movss 7 * SIZE(AA), %xmm1 + mulss 28 * SIZE(BB), %xmm1 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L171 + ALIGN_2 + +#else +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movss 0 * SIZE + BUFFER, %xmm2 + xorps %xmm4, %xmm4 + movss 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movss 8 * SIZE + BUFFER, %xmm3 + xorps %xmm6, %xmm6 + movss 4 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movss 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movss 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movss 8 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movss 4 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L172 + ALIGN_2 + +.L171: + mulss %xmm0, %xmm2 + movss 1 * SIZE(AA), %xmm0 + addss %xmm2, %xmm4 + mulss 4 * SIZE(BB), %xmm0 + movss 16 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 2 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + movss 3 * SIZE(AA), %xmm0 + addss %xmm3, %xmm6 + mulss 12 * SIZE(BB), %xmm0 + movss 24 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm2 + movss 5 * SIZE(AA), %xmm1 + addss %xmm2, %xmm4 + mulss 20 * SIZE(BB), %xmm1 + movss 32 * SIZE(BB), %xmm2 + addss %xmm1, %xmm5 + movss 6 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + movss 7 * SIZE(AA), %xmm1 + addss 
%xmm3, %xmm6 + mulss 28 * SIZE(BB), %xmm1 + movss 40 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L171 + ALIGN_2 +#endif + +.L172: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movss ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L174 + +.L173: + movss 0 * SIZE(AA), %xmm0 + movss 0 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + addss %xmm2, %xmm4 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L173 + ALIGN_4 + +.L174: + addss %xmm5, %xmm4 + addss %xmm7, %xmm6 + addss %xmm6, %xmm4 + mulss %xmm3, %xmm4 + +#ifndef TRMMKERNEL + addss 0 * SIZE(%esi), %xmm4 +#endif + movss %xmm4, 0 * SIZE(%esi) + ALIGN_2 + +.L999: + movl OLD_STACK, %esp + + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_2 + + + EPILOGUE diff --git a/kernel/x86/gemm_ncopy_2.S b/kernel/x86/gemm_ncopy_2.S new file mode 100644 index 0000000000..a2674c7495 --- /dev/null +++ b/kernel/x86/gemm_ncopy_2.S @@ -0,0 +1,274 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 8 + +#define J 0 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define A 12 + STACK + ARGS(%esp) +#define LDA 16 + STACK + ARGS(%esp) +#define B 20 + STACK + ARGS(%esp) + + PROLOGUE + + subl $ARGS, %esp + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl B, %esi # ESI : offsetB + movl M, %edi + + movl A, %ebx # EBX : offsetA + movl LDA, %edx + leal (%ebx, %edx, SIZE), %ebp + + addl %edx, %edx + subl %edi, %edx # edx = 2 * lda - m + + movl N, %eax + sarl $1, %eax + movl %eax, J + je .L20 + ALIGN_3 + +.L21: +#if 0 + movl %edi, %ecx # ECX : I(Counter of M) + andl $-8, %ecx + leal (%ebx, %ecx, SIZE), %ebx + leal (%ebp, %ecx, SIZE), %ebp + negl %ecx + ALIGN_3 + +.Blocking1: + MMXLOAD (%ebx, %ecx, SIZE), %mm0 + MMXLOAD (%ebp, %ecx, SIZE), %mm1 + addl $8, %ecx + jl .Blocking1 + + movl %edi, %ecx # ECX : I(Counter of M) + andl $-8, %ecx + negl %ecx + leal (%ebx, %ecx, SIZE), %ebx + leal (%ebp, %ecx, SIZE), %ebp +#endif + + movl %edi, %ecx # ECX : I(Counter of M) + sarl $2, %ecx + je .L24 + ALIGN_3 + +.L25: +#ifdef HAVE_MMX + MMXLOAD 0 * SIZE(%ebx), %mm0 + MMXLOAD 0 * SIZE(%ebp), %mm1 + MMXLOAD 1 * SIZE(%ebx), %mm2 + MMXLOAD 1 * SIZE(%ebp), %mm3 + + MMXLOAD 2 * SIZE(%ebx), %mm4 + MMXLOAD 2 * SIZE(%ebp), %mm5 + MMXLOAD 3 * SIZE(%ebx), %mm6 + MMXLOAD 3 * SIZE(%ebp), %mm7 + + MMXSTORE %mm0, 0 * SIZE(%esi) + MMXSTORE %mm1, 1 * SIZE(%esi) + MMXSTORE %mm2, 2 * SIZE(%esi) + MMXSTORE %mm3, 3 * SIZE(%esi) + + MMXSTORE %mm4, 4 * SIZE(%esi) + MMXSTORE %mm5, 5 * SIZE(%esi) + MMXSTORE %mm6, 6 * SIZE(%esi) + MMXSTORE %mm7, 7 * SIZE(%esi) +#else + FLD 3 * SIZE(%ebp) + FLD 3 * SIZE(%ebx) + FLD 2 * SIZE(%ebp) + FLD 2 * SIZE(%ebx) + FLD 1 * SIZE(%ebp) + FLD 1 * SIZE(%ebx) + FLD 0 * SIZE(%ebp) + FLD 0 * SIZE(%ebx) + + FST 0 * SIZE(%esi) + FST 1 * SIZE(%esi) + FST 2 * SIZE(%esi) + FST 3 * SIZE(%esi) + FST 4 * SIZE(%esi) + FST 5 * SIZE(%esi) + FST 6 * SIZE(%esi) + FST 7 * SIZE(%esi) +#endif + addl $4 * SIZE, %ebx + addl $4 * SIZE, %ebp + addl $8 * SIZE, %esi + decl %ecx + jne .L25 + ALIGN_3 + +.L24: + movl %edi, %ecx + andl $3, %ecx + jle .L30 + ALIGN_3 + +.L31: +#ifdef HAVE_MMX + MMXLOAD 0 * SIZE(%ebx), %mm0 + MMXLOAD 0 * SIZE(%ebp), %mm1 + MMXSTORE %mm0, 0 * SIZE(%esi) + MMXSTORE %mm1, 1 * SIZE(%esi) +#else + FLD 0 * SIZE(%ebp) + FLD 0 * SIZE(%ebx) + FST 0 * SIZE(%esi) + FST 1 * SIZE(%esi) +#endif + addl $1 * SIZE, %ebx + addl $1 * SIZE, %ebp + addl $2 * SIZE, %esi + decl %ecx + jne .L31 + ALIGN_3 + +.L30: + leal (%ebx, %edx, SIZE), %ebx + leal (%ebp, %edx, SIZE), %ebp + decl J + jne .L21 + ALIGN_3 + +.L20: + movl N, %eax + andl $1,%eax + jle .L38 + ALIGN_3 + +.L39: + movl %edi, %ecx + sarl $3, %ecx + je .L42 + ALIGN_3 + +.L43: +#ifdef HAVE_MMX + MMXLOAD 0 * SIZE(%ebx), %mm0 + MMXLOAD 1 * SIZE(%ebx), %mm1 + MMXLOAD 2 * SIZE(%ebx), %mm2 + MMXLOAD 3 * SIZE(%ebx), %mm3 + MMXLOAD 4 * SIZE(%ebx), %mm4 + MMXLOAD 5 * SIZE(%ebx), %mm5 + MMXLOAD 6 * SIZE(%ebx), %mm6 + MMXLOAD 7 * SIZE(%ebx), %mm7 + + MMXSTORE %mm0, 0 * SIZE(%esi) + MMXSTORE %mm1, 1 * SIZE(%esi) + MMXSTORE %mm2, 2 * SIZE(%esi) + MMXSTORE %mm3, 3 * SIZE(%esi) + MMXSTORE %mm4, 4 * SIZE(%esi) + MMXSTORE %mm5, 5 * SIZE(%esi) + MMXSTORE %mm6, 6 * SIZE(%esi) + MMXSTORE %mm7, 7 * SIZE(%esi) +#else + FLD 7 * SIZE(%ebx) + FLD 6 * SIZE(%ebx) + FLD 5 * SIZE(%ebx) + FLD 4 * SIZE(%ebx) + FLD 3 * SIZE(%ebx) + FLD 2 * SIZE(%ebx) + FLD 1 * SIZE(%ebx) + FLD 0 
* SIZE(%ebx) + + FST 0 * SIZE(%esi) + FST 1 * SIZE(%esi) + FST 2 * SIZE(%esi) + FST 3 * SIZE(%esi) + FST 4 * SIZE(%esi) + FST 5 * SIZE(%esi) + FST 6 * SIZE(%esi) + FST 7 * SIZE(%esi) +#endif + + addl $8 * SIZE, %ebx + addl $8 * SIZE, %esi + decl %ecx + jne .L43 + ALIGN_3 + +.L42: + movl %edi, %ecx + andl $7, %ecx + jle .L38 + ALIGN_3 + +.L49: +#ifdef HAVE_MMX + MMXLOAD 0 * SIZE(%ebx), %mm0 + MMXSTORE %mm0, 0 * SIZE(%esi) +#else + FLD 0 * SIZE(%ebx) + FST 0 * SIZE(%esi) +#endif + addl $1 * SIZE, %ebx + addl $1 * SIZE, %esi + decl %ecx + jne .L49 + ALIGN_3 + +.L38: + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/gemm_ncopy_2_sse.S b/kernel/x86/gemm_ncopy_2_sse.S new file mode 100644 index 0000000000..1a8262c96d --- /dev/null +++ b/kernel/x86/gemm_ncopy_2_sse.S @@ -0,0 +1,215 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define RPREFETCHSIZE 12 +#define WPREFETCHSIZE (RPREFETCHSIZE * 2) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht2 + +#define STACK 16 +#define ARGS 0 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define ARG_A 12 + STACK + ARGS(%esp) +#define ARG_LDA 16 + STACK + ARGS(%esp) +#define ARG_B 20 + STACK + ARGS(%esp) + +#define A %eax +#define B %ebx +#define LDA %ebp +#define A1 %ecx +#define A2 %edx +#define I %esi +#define J %edi + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_A, A + movl ARG_B, B + movl ARG_LDA, LDA + + sall $BASE_SHIFT, LDA + + movl N, J + sarl $1, J + je .L20 + ALIGN_3 + +.L10: + movl A, A1 + leal (A, LDA, 1), A2 + leal (A, LDA, 2), A + + movl M, I + sarl $2, I + je .L15 + ALIGN_3 + +.L12: + PREFETCH RPREFETCHSIZE * SIZE(A1) + + movsd 0 * SIZE(A1) , %xmm0 + movhps 0 * SIZE(A2) , %xmm0 + movsd 1 * SIZE(A1) , %xmm1 + movhps 1 * SIZE(A2) , %xmm1 + + PREFETCH RPREFETCHSIZE * SIZE(A2) + + movsd 2 * SIZE(A1) , %xmm2 + movhps 2 * SIZE(A2) , %xmm2 + movsd 3 * SIZE(A1) , %xmm3 + movhps 3 * SIZE(A2) , %xmm3 + + PREFETCHW (RPREFETCHSIZE + 0) * SIZE(B) + + movaps %xmm0, 0 * SIZE(B) + movaps %xmm1, 2 * SIZE(B) + movaps %xmm2, 4 * SIZE(B) + movaps %xmm3, 6 * SIZE(B) + + addl $ 4 * SIZE, A1 + addl $ 4 * SIZE, A2 + subl $-8 * SIZE, B + decl I + jne .L12 + ALIGN_3 + +.L15: + testl $2, M + jle .L16 + + movsd 0 * SIZE(A1) , %xmm0 + movhps 0 * SIZE(A2) , %xmm0 + movsd 1 * SIZE(A1) , %xmm1 + movhps 1 * SIZE(A2) , %xmm1 + + movaps %xmm0, 0 * SIZE(B) + movaps %xmm1, 2 * SIZE(B) + + addl $ 2 * SIZE, A1 + addl $ 2 * SIZE, A2 + subl $-4 * SIZE, B + ALIGN_4 + +.L16: + testl $1, M + jle .L19 + + movsd 0 * SIZE(A1) , %xmm0 + movhps 0 * SIZE(A2) , %xmm0 + + movaps %xmm0, 0 * SIZE(B) + + subl $-2 * SIZE, B + ALIGN_4 + +.L19: + decl J + jne .L10 + ALIGN_3 + +.L20: + testl $1, N + jle .L999 + + movl A, A1 + + movl M, I + sarl $2, I + je .L25 + ALIGN_3 + +.L22: + PREFETCH RPREFETCHSIZE * SIZE(A1) + + movsd 0 * SIZE(A1), %xmm0 + movhps 1 * SIZE(A1), %xmm0 + movsd 2 * SIZE(A1), %xmm1 + movhps 3 * SIZE(A1), %xmm1 + + PREFETCHW (RPREFETCHSIZE + 0) * SIZE(B) + + movaps %xmm0, 0 * SIZE(B) + movaps %xmm1, 2 * SIZE(B) + + addl $ 4 * SIZE, A1 + subl $-4 * SIZE, B + decl I + jne .L22 + ALIGN_3 + +.L25: + testl $2, M + jle .L26 + + movsd 0 * SIZE(A1), %xmm0 + movhps 1 * SIZE(A1), %xmm0 + + movaps %xmm0, 0 * SIZE(B) + + addl $ 2 * SIZE, A1 + subl $-2 * SIZE, B + ALIGN_4 + +.L26: + testl $1, M + jle .L999 + + movsd 0 * SIZE(A1), %xmm0 + movsd %xmm0, 0 * SIZE(B) + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/gemm_ncopy_4_sse.S b/kernel/x86/gemm_ncopy_4_sse.S new file mode 100644 index 0000000000..3e919b26bc --- /dev/null +++ b/kernel/x86/gemm_ncopy_4_sse.S @@ -0,0 +1,315 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define RPREFETCHSIZE 12 +#define WPREFETCHSIZE (RPREFETCHSIZE * 4) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht2 + +#define STACK 16 +#define ARGS 0 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define ARG_A 12 + STACK + ARGS(%esp) +#define ARG_LDA 16 + STACK + ARGS(%esp) +#define ARG_B 20 + STACK + ARGS(%esp) + +#define A %eax +#define B %ebx +#define LDA %ebp +#define A1 %ecx +#define A2 %edx +#define I %esi +#define J %edi + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_A, A + movl ARG_B, B + movl ARG_LDA, LDA + + sall $BASE_SHIFT, LDA + + movl N, J + sarl $2, J + je .L20 + ALIGN_3 + +.L10: + movl A, A1 + leal (A, LDA, 2), A2 + leal (A, LDA, 4), A + + movl M, I + sarl $2, I + je .L15 + ALIGN_3 + +.L12: + PREFETCH RPREFETCHSIZE * SIZE(A1) + + movsd 0 * SIZE(A1) , %xmm0 + movhps 0 * SIZE(A1, LDA), %xmm0 + movsd 0 * SIZE(A2) , %xmm1 + movhps 0 * SIZE(A2, LDA), %xmm1 + + PREFETCH RPREFETCHSIZE * SIZE(A1, LDA) + + movsd 1 * SIZE(A1) , %xmm2 + movhps 1 * SIZE(A1, LDA), %xmm2 + movsd 1 * SIZE(A2) , %xmm3 + movhps 1 * SIZE(A2, LDA), %xmm3 + + PREFETCH RPREFETCHSIZE * SIZE(A2) + + movsd 2 * SIZE(A1) , %xmm4 + movhps 2 * SIZE(A1, LDA), %xmm4 + movsd 2 * SIZE(A2) , %xmm5 + movhps 2 * SIZE(A2, LDA), %xmm5 + + PREFETCH RPREFETCHSIZE * SIZE(A2, LDA) + + movsd 3 * SIZE(A1) , %xmm6 + movhps 3 * SIZE(A1, LDA), %xmm6 + movsd 3 * SIZE(A2) , %xmm7 + movhps 3 * SIZE(A2, LDA), %xmm7 + + PREFETCHW (RPREFETCHSIZE + 0) * SIZE(B) + + movaps %xmm0, 0 * SIZE(B) + movaps %xmm1, 2 * SIZE(B) + movaps %xmm2, 4 * SIZE(B) + movaps %xmm3, 6 * SIZE(B) + + PREFETCHW (RPREFETCHSIZE + 8) * SIZE(B) + + movaps %xmm4, 8 * SIZE(B) + movaps %xmm5, 10 * SIZE(B) + movaps %xmm6, 12 * SIZE(B) + movaps %xmm7, 14 * SIZE(B) + + addl $ 4 * SIZE, A1 + addl $ 4 * SIZE, A2 + subl $-16 * SIZE, B + decl I + jne .L12 + ALIGN_3 + +.L15: + testl $2, M + jle .L16 + + movsd 0 * SIZE(A1) , %xmm0 + movhps 0 * SIZE(A1, LDA), %xmm0 + movsd 0 * SIZE(A2) , %xmm1 + movhps 0 * SIZE(A2, LDA), %xmm1 + + movsd 1 * SIZE(A1) , 
%xmm2 + movhps 1 * SIZE(A1, LDA), %xmm2 + movsd 1 * SIZE(A2) , %xmm3 + movhps 1 * SIZE(A2, LDA), %xmm3 + + movaps %xmm0, 0 * SIZE(B) + movaps %xmm1, 2 * SIZE(B) + movaps %xmm2, 4 * SIZE(B) + movaps %xmm3, 6 * SIZE(B) + + addl $ 2 * SIZE, A1 + addl $ 2 * SIZE, A2 + subl $-8 * SIZE, B + ALIGN_4 + +.L16: + testl $1, M + jle .L19 + + movsd 0 * SIZE(A1) , %xmm0 + movhps 0 * SIZE(A1, LDA), %xmm0 + movsd 0 * SIZE(A2) , %xmm1 + movhps 0 * SIZE(A2, LDA), %xmm1 + + movaps %xmm0, 0 * SIZE(B) + movaps %xmm1, 2 * SIZE(B) + + subl $-4 * SIZE, B + ALIGN_4 + +.L19: + decl J + jne .L10 + ALIGN_3 + +.L20: + testl $2, N + jle .L30 + + movl A, A1 + leal (A, LDA, 2), A + + movl M, I + sarl $2, I + je .L25 + ALIGN_3 + +.L22: + PREFETCH RPREFETCHSIZE * SIZE(A1) + + movsd 0 * SIZE(A1) , %xmm0 + movhps 0 * SIZE(A1, LDA), %xmm0 + movsd 1 * SIZE(A1) , %xmm1 + movhps 1 * SIZE(A1, LDA), %xmm1 + + PREFETCH RPREFETCHSIZE * SIZE(A1, LDA) + + movsd 2 * SIZE(A1) , %xmm2 + movhps 2 * SIZE(A1, LDA), %xmm2 + movsd 3 * SIZE(A1) , %xmm3 + movhps 3 * SIZE(A1, LDA), %xmm3 + + PREFETCHW (RPREFETCHSIZE + 0) * SIZE(B) + + movaps %xmm0, 0 * SIZE(B) + movaps %xmm1, 2 * SIZE(B) + movaps %xmm2, 4 * SIZE(B) + movaps %xmm3, 6 * SIZE(B) + + addl $ 4 * SIZE, A1 + subl $-8 * SIZE, B + decl I + jne .L22 + ALIGN_3 + +.L25: + testl $2, M + jle .L26 + + movsd 0 * SIZE(A1) , %xmm0 + movhps 0 * SIZE(A1, LDA), %xmm0 + movsd 1 * SIZE(A1) , %xmm1 + movhps 1 * SIZE(A1, LDA), %xmm1 + + movaps %xmm0, 0 * SIZE(B) + movaps %xmm1, 2 * SIZE(B) + + addl $ 2 * SIZE, A1 + addl $ 2 * SIZE, A2 + subl $-4 * SIZE, B + ALIGN_4 + +.L26: + testl $1, M + jle .L30 + + movsd 0 * SIZE(A1) , %xmm0 + movhps 0 * SIZE(A1, LDA), %xmm0 + + movaps %xmm0, 0 * SIZE(B) + + subl $-2 * SIZE, B + ALIGN_4 + +.L30: + testl $1, N + jle .L999 + + movl A, A1 + + movl M, I + sarl $2, I + je .L35 + ALIGN_3 + +.L32: + PREFETCH RPREFETCHSIZE * SIZE(A1) + + movsd 0 * SIZE(A1), %xmm0 + movhps 1 * SIZE(A1), %xmm0 + movsd 2 * SIZE(A1), %xmm1 + movhps 3 * SIZE(A1), %xmm1 + + PREFETCHW (RPREFETCHSIZE + 0) * SIZE(B) + + movaps %xmm0, 0 * SIZE(B) + movaps %xmm1, 2 * SIZE(B) + + addl $ 4 * SIZE, A1 + subl $-4 * SIZE, B + decl I + jne .L32 + ALIGN_3 + +.L35: + testl $2, M + jle .L36 + + movsd 0 * SIZE(A1), %xmm0 + movhps 1 * SIZE(A1), %xmm0 + + movaps %xmm0, 0 * SIZE(B) + + addl $ 2 * SIZE, A1 + subl $-2 * SIZE, B + ALIGN_4 + +.L36: + testl $1, M + jle .L999 + + movsd 0 * SIZE(A1), %xmm0 + movsd %xmm0, 0 * SIZE(B) + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/gemm_tcopy_2.S b/kernel/x86/gemm_tcopy_2.S new file mode 100644 index 0000000000..61b7754757 --- /dev/null +++ b/kernel/x86/gemm_tcopy_2.S @@ -0,0 +1,305 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 8 + +#define J 0 + STACK(%esp) +#define BOFFSET2 4 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define A 12 + STACK + ARGS(%esp) +#define LDA 16 + STACK + ARGS(%esp) +#define B 20 + STACK + ARGS(%esp) + + PROLOGUE + + subl $ARGS, %esp + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl A, %ebp + movl B, %edi + + movl M, %ebx + movl N, %eax + andl $-2, %eax + + imull %ebx, %eax # m * ( n & ~1) + + leal (%edi,%eax,SIZE), %eax # boffset2 = b + m * (n & ~1) + movl %eax, BOFFSET2 + + movl M, %esi +#ifdef DOUBLE + sall $4,%esi +#else + sall $3,%esi +#endif + + sarl $1, %ebx # if !(m & 1) goto L28 + movl %ebx, J + jle .L28 + ALIGN_4 + +.L39: + movl %ebp, %edx # aoffset1 = a + movl LDA, %eax + movl N, %ebx + + leal (%ebp, %eax,SIZE), %ecx # aoffset2 = a + lda + leal (%ecx, %eax,SIZE), %ebp # aoffset += 2 * lda + movl %edi, %eax # boffset1 = b_offset + addl $4 * SIZE, %edi # boffset += 4 + + sarl $2, %ebx + jle .L32 + ALIGN_4 + +.L36: +#ifdef HAVE_MMX + MMXLOAD 0 * SIZE(%edx), %mm0 + MMXLOAD 1 * SIZE(%edx), %mm1 + MMXLOAD 0 * SIZE(%ecx), %mm2 + MMXLOAD 1 * SIZE(%ecx), %mm3 + + MMXLOAD 2 * SIZE(%edx), %mm4 + MMXLOAD 3 * SIZE(%edx), %mm5 + MMXLOAD 2 * SIZE(%ecx), %mm6 + MMXLOAD 3 * SIZE(%ecx), %mm7 + + MMXSTORE %mm0, 0 * SIZE(%eax) + MMXSTORE %mm1, 1 * SIZE(%eax) + MMXSTORE %mm2, 2 * SIZE(%eax) + MMXSTORE %mm3, 3 * SIZE(%eax) + + addl %esi, %eax + + MMXSTORE %mm4, 0 * SIZE(%eax) + MMXSTORE %mm5, 1 * SIZE(%eax) + MMXSTORE %mm6, 2 * SIZE(%eax) + MMXSTORE %mm7, 3 * SIZE(%eax) +#else + FLD 1 * SIZE(%ecx) + FLD 0 * SIZE(%ecx) + FLD 1 * SIZE(%edx) + FLD 0 * SIZE(%edx) + + FST 0 * SIZE(%eax) + FST 1 * SIZE(%eax) + FST 2 * SIZE(%eax) + FST 3 * SIZE(%eax) + + addl %esi, %eax + + FLD 3 * SIZE(%ecx) + FLD 2 * SIZE(%ecx) + FLD 3 * SIZE(%edx) + FLD 2 * SIZE(%edx) + + FST 0 * SIZE(%eax) + FST 1 * SIZE(%eax) + FST 2 * SIZE(%eax) + FST 3 * SIZE(%eax) +#endif + + addl $4 * SIZE, %ecx + addl $4 * SIZE, %edx + addl %esi, %eax + decl %ebx + jne .L36 + ALIGN_4 + +.L32: + movl N, %ebx + test $2, %ebx + je .L37 + +#ifdef HAVE_MMX + MMXLOAD 0 * SIZE(%edx), %mm0 + MMXLOAD 1 * SIZE(%edx), %mm1 + MMXLOAD 0 * SIZE(%ecx), %mm2 + MMXLOAD 1 * SIZE(%ecx), %mm3 + + MMXSTORE %mm0, 0 * SIZE(%eax) + 
MMXSTORE %mm1, 1 * SIZE(%eax) + MMXSTORE %mm2, 2 * SIZE(%eax) + MMXSTORE %mm3, 3 * SIZE(%eax) +#else + FLD 1 * SIZE(%ecx) + FLD 0 * SIZE(%ecx) + FLD 1 * SIZE(%edx) + FLD 0 * SIZE(%edx) + + FST 0 * SIZE(%eax) + FST 1 * SIZE(%eax) + FST 2 * SIZE(%eax) + FST 3 * SIZE(%eax) +#endif + + addl $2 * SIZE, %ecx + addl $2 * SIZE, %edx + ALIGN_4 + +.L37: + movl N, %ebx + test $1, %ebx + je .L38 + + movl BOFFSET2, %eax + +#ifdef HAVE_MMX + MMXLOAD 0 * SIZE(%edx), %mm0 + MMXLOAD 0 * SIZE(%ecx), %mm1 + MMXSTORE %mm0, 0 * SIZE(%eax) + MMXSTORE %mm1, 1 * SIZE(%eax) +#else + FLD 0 * SIZE(%edx) + FST 0 * SIZE(%eax) + FLD 0 * SIZE(%ecx) + FST 1 * SIZE(%eax) +#endif + addl $2 * SIZE, %eax + movl %eax, BOFFSET2 + ALIGN_4 + +.L38: + decl J + jg .L39 + ALIGN_4 + +.L28: + movl M, %eax + movl N, %ebx + + testb $1, %al + je .L40 + + sarl $2, %ebx + jle .L41 + ALIGN_4 + +.L45: +#ifdef HAVE_MMX + MMXLOAD 0 * SIZE(%ebp), %mm0 + MMXLOAD 1 * SIZE(%ebp), %mm1 + MMXLOAD 2 * SIZE(%ebp), %mm2 + MMXLOAD 3 * SIZE(%ebp), %mm3 + + MMXSTORE %mm0, 0 * SIZE(%edi) + MMXSTORE %mm1, 1 * SIZE(%edi) + + addl %esi, %edi + + MMXSTORE %mm2, 0 * SIZE(%edi) + MMXSTORE %mm3, 1 * SIZE(%edi) +#else + FLD 0 * SIZE(%ebp) + FST 0 * SIZE(%edi) + FLD 1 * SIZE(%ebp) + FST 1 * SIZE(%edi) + addl %esi, %edi + + FLD 2 * SIZE(%ebp) + FST 0 * SIZE(%edi) + FLD 3 * SIZE(%ebp) + FST 1 * SIZE(%edi) +#endif + addl %esi,%edi + addl $4 * SIZE, %ebp + decl %ebx + jg .L45 + ALIGN_4 + +.L41: + movl N, %ebx + test $2, %ebx + je .L46 + +#ifdef HAVE_MMX + MMXLOAD 0 * SIZE(%ebp), %mm0 + MMXSTORE %mm0, 0 * SIZE(%edi) + MMXLOAD 1 * SIZE(%ebp), %mm1 + MMXSTORE %mm1, 1 * SIZE(%edi) +#else + FLD 1 * SIZE(%ebp) + FLD 0 * SIZE(%ebp) + FST 0 * SIZE(%edi) + FST 1 * SIZE(%edi) +#endif + + addl $2 * SIZE, %ebp + ALIGN_4 + +.L46: + movl N, %ebx + test $1, %ebx + je .L40 + + movl BOFFSET2, %eax + +#ifdef HAVE_MMX + MMXLOAD 0 * SIZE(%ebp), %mm0 + MMXSTORE %mm0, 0 * SIZE(%eax) +#else + FLD (%ebp) + FST (%eax) +#endif + ALIGN_4 + +.L40: + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS,%esp + ret + + EPILOGUE diff --git a/kernel/x86/gemm_tcopy_2_sse.S b/kernel/x86/gemm_tcopy_2_sse.S new file mode 100644 index 0000000000..de5f4ffe25 --- /dev/null +++ b/kernel/x86/gemm_tcopy_2_sse.S @@ -0,0 +1,236 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define RPREFETCHSIZE 12 +#define WPREFETCHSIZE (RPREFETCHSIZE * 2) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht2 + +#define STACK 16 +#define ARGS 8 + +#define J 0 + STACK(%esp) +#define BOFFSET2 4 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define A 12 + STACK + ARGS(%esp) +#define LDA 16 + STACK + ARGS(%esp) +#define B 20 + STACK + ARGS(%esp) + + PROLOGUE + + subl $ARGS, %esp + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl A, %ebp + movl B, %edi + + movl M, %ebx + movl N, %eax + andl $-2, %eax + + imull %ebx, %eax # m * ( n & ~1) + + leal (%edi,%eax,SIZE), %eax # boffset2 = b + m * (n & ~1) + movl %eax, BOFFSET2 + + movl M, %esi +#ifdef DOUBLE + sall $4,%esi +#else + sall $3,%esi +#endif + + sarl $1, %ebx # if !(m & 1) goto L28 + movl %ebx, J + jle .L28 + ALIGN_4 + +.L39: + movl %ebp, %edx # aoffset1 = a + movl LDA, %eax + movl N, %ebx + + leal (%ebp, %eax,SIZE), %ecx # aoffset2 = a + lda + leal (%ecx, %eax,SIZE), %ebp # aoffset += 2 * lda + movl %edi, %eax # boffset1 = b_offset + addl $4 * SIZE, %edi # boffset += 4 + + sarl $2, %ebx + jle .L32 + ALIGN_4 + +.L36: + PREFETCH RPREFETCHSIZE * SIZE(%edx) + + movsd 0 * SIZE(%edx), %xmm0 + movhps 1 * SIZE(%edx), %xmm0 + movsd 0 * SIZE(%ecx), %xmm2 + movhps 1 * SIZE(%ecx), %xmm2 + + PREFETCH RPREFETCHSIZE * SIZE(%ecx) + + movsd 2 * SIZE(%edx), %xmm4 + movhps 3 * SIZE(%edx), %xmm4 + movsd 2 * SIZE(%ecx), %xmm6 + movhps 3 * SIZE(%ecx), %xmm6 + + movaps %xmm0, 0 * SIZE(%eax) + movaps %xmm2, 2 * SIZE(%eax) + + addl %esi, %eax + + movaps %xmm4, 0 * SIZE(%eax) + movaps %xmm6, 2 * SIZE(%eax) + + addl $4 * SIZE, %ecx + addl $4 * SIZE, %edx + addl %esi, %eax + decl %ebx + jne .L36 + ALIGN_4 + +.L32: + movl N, %ebx + test $2, %ebx + je .L37 + + PREFETCH RPREFETCHSIZE * SIZE(%edx) + movsd 0 * SIZE(%edx), %xmm0 + movhps 1 * SIZE(%edx), %xmm0 + + PREFETCH RPREFETCHSIZE * SIZE(%ecx) + movsd 0 * SIZE(%ecx), %xmm2 + movhps 1 * SIZE(%ecx), %xmm2 + + movaps %xmm0, 0 * SIZE(%eax) + movaps %xmm2, 2 * SIZE(%eax) + + addl $2 * SIZE, %ecx + addl $2 * SIZE, %edx + ALIGN_4 + +.L37: + movl N, %ebx + test $1, %ebx + je .L38 + + movl BOFFSET2, %eax + + movsd 0 * SIZE(%edx), %xmm0 + movhps 0 * SIZE(%ecx), %xmm0 + movaps %xmm0, 0 * SIZE(%eax) + + addl $2 * SIZE, %eax + movl %eax, BOFFSET2 + ALIGN_4 + +.L38: + decl J + jg .L39 + ALIGN_4 + +.L28: + movl M, %eax + movl N, %ebx + + testb $1, %al + je .L40 + + sarl $2, %ebx + jle .L41 + ALIGN_4 + +.L45: + movsd 0 * SIZE(%ebp), %xmm0 + movhps 1 * SIZE(%ebp), %xmm0 + movsd 2 * SIZE(%ebp), %xmm2 
+ movhps 3 * SIZE(%ebp), %xmm2 + + movaps %xmm0, 0 * SIZE(%edi) + + addl %esi, %edi + + movaps %xmm2, 0 * SIZE(%edi) + + addl %esi,%edi + addl $4 * SIZE, %ebp + decl %ebx + jg .L45 + ALIGN_4 + +.L41: + movl N, %ebx + test $2, %ebx + je .L46 + + movsd 0 * SIZE(%ebp), %xmm0 + movhps 1 * SIZE(%ebp), %xmm0 + movaps %xmm0, 0 * SIZE(%edi) + addl $2 * SIZE, %ebp + ALIGN_4 + +.L46: + movl N, %ebx + test $1, %ebx + je .L40 + + movl BOFFSET2, %eax + + movsd 0 * SIZE(%ebp), %xmm0 + movsd %xmm0, 0 * SIZE(%eax) + ALIGN_4 + +.L40: + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS,%esp + ret + + EPILOGUE diff --git a/kernel/x86/gemm_tcopy_4_sse.S b/kernel/x86/gemm_tcopy_4_sse.S new file mode 100644 index 0000000000..4e1e2e6614 --- /dev/null +++ b/kernel/x86/gemm_tcopy_4_sse.S @@ -0,0 +1,305 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define RPREFETCHSIZE 8 +#define WPREFETCHSIZE (RPREFETCHSIZE * 4) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht2 + +#define STACK 16 +#define ARGS 0 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define ARG_A 12 + STACK + ARGS(%esp) +#define ARG_LDA 16 + STACK + ARGS(%esp) +#define ARG_B 20 + STACK + ARGS(%esp) + +#define A %eax +#define B %ebx +#define LDA %ebp +#define A1 %ecx +#define A2 %edx +#define I %esi +#define J %edi + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_A, A + movl ARG_B, B + movl ARG_LDA, LDA + + sall $BASE_SHIFT, LDA + + movl N, J + sarl $2, J + je .L20 + ALIGN_3 + +.L10: + movl A, A1 + leal (A, LDA, 2), A2 + addl $4 * SIZE, A + + movl M, I + sarl $2, I + je .L15 + ALIGN_3 + +.L12: + PREFETCH RPREFETCHSIZE * SIZE(A1) + + movsd 0 * SIZE(A1) , %xmm0 + movhps 1 * SIZE(A1) , %xmm0 + movsd 2 * SIZE(A1) , %xmm1 + movhps 3 * SIZE(A1) , %xmm1 + + PREFETCH RPREFETCHSIZE * SIZE(A1, LDA) + + movsd 0 * SIZE(A1, LDA), %xmm2 + movhps 1 * SIZE(A1, LDA), %xmm2 + movsd 2 * SIZE(A1, LDA), %xmm3 + movhps 3 * SIZE(A1, LDA), %xmm3 + + PREFETCH RPREFETCHSIZE * SIZE(A2) + + movsd 0 * SIZE(A2) , %xmm4 + movhps 1 * SIZE(A2) , %xmm4 + movsd 2 * SIZE(A2) , %xmm5 + movhps 3 * SIZE(A2) , %xmm5 + + PREFETCH RPREFETCHSIZE * SIZE(A2, LDA) + + movsd 0 * SIZE(A2, LDA), %xmm6 + movhps 1 * SIZE(A2, LDA), %xmm6 + movsd 2 * SIZE(A2, LDA), %xmm7 + movhps 3 * SIZE(A2, LDA), %xmm7 + + PREFETCHW (RPREFETCHSIZE + 0) * SIZE(B) + + movaps %xmm0, 0 * SIZE(B) + movaps %xmm1, 2 * SIZE(B) + movaps %xmm2, 4 * SIZE(B) + movaps %xmm3, 6 * SIZE(B) + + PREFETCHW (RPREFETCHSIZE + 8) * SIZE(B) + + movaps %xmm4, 8 * SIZE(B) + movaps %xmm5, 10 * SIZE(B) + movaps %xmm6, 12 * SIZE(B) + movaps %xmm7, 14 * SIZE(B) + + leal (A1, LDA, 4), A1 + leal (A2, LDA, 4), A2 + subl $-16 * SIZE, B + decl I + jne .L12 + ALIGN_3 + +.L15: + testl $2, M + jle .L16 + + movsd 0 * SIZE(A1) , %xmm0 + movhps 1 * SIZE(A1) , %xmm0 + movsd 2 * SIZE(A1) , %xmm1 + movhps 3 * SIZE(A1) , %xmm1 + + movsd 0 * SIZE(A1, LDA), %xmm2 + movhps 1 * SIZE(A1, LDA), %xmm2 + movsd 2 * SIZE(A1, LDA), %xmm3 + movhps 3 * SIZE(A1, LDA), %xmm3 + + movaps %xmm0, 0 * SIZE(B) + movaps %xmm1, 2 * SIZE(B) + movaps %xmm2, 4 * SIZE(B) + movaps %xmm3, 6 * SIZE(B) + + leal (A1, LDA, 2), A1 + subl $-8 * SIZE, B + ALIGN_4 + +.L16: + testl $1, M + jle .L19 + + movsd 0 * SIZE(A1) , %xmm0 + movhps 1 * SIZE(A1) , %xmm0 + movsd 2 * SIZE(A1) , %xmm1 + movhps 3 * SIZE(A1) , %xmm1 + + movaps %xmm0, 0 * SIZE(B) + movaps %xmm1, 2 * SIZE(B) + subl $-4 * SIZE, B + ALIGN_4 + +.L19: + decl J + jne .L10 + ALIGN_3 + +.L20: + testl $2, N + jle .L30 + + movl A, A1 + leal (A, LDA, 2), A2 + addl $2 * SIZE, A + + movl M, I + sarl $2, I + je .L25 + ALIGN_3 + +.L22: + movsd 0 * SIZE(A1) , %xmm0 + movhps 1 * SIZE(A1) , %xmm0 + movsd 0 * SIZE(A1, LDA), %xmm1 + movhps 1 * SIZE(A1, LDA), %xmm1 + + movsd 0 * SIZE(A2) , %xmm2 + movhps 1 * SIZE(A2) , %xmm2 + movsd 0 * SIZE(A2, LDA), %xmm3 + movhps 1 * SIZE(A2, LDA), %xmm3 + + movaps %xmm0, 0 * SIZE(B) + movaps %xmm1, 2 * SIZE(B) + movaps %xmm2, 4 * SIZE(B) + movaps %xmm3, 6 * SIZE(B) + + leal (A1, LDA, 4), A1 + leal (A2, LDA, 4), A2 + subl $-8 * SIZE, B + decl I + jne .L22 + ALIGN_3 + +.L25: + testl $2, M + jle .L26 + + movsd 0 * SIZE(A1) , %xmm0 + movhps 1 * SIZE(A1) , %xmm0 + movsd 0 * SIZE(A1, LDA), %xmm1 + movhps 1 * SIZE(A1, LDA), %xmm1 + + movaps %xmm0, 0 * 
SIZE(B) + movaps %xmm1, 2 * SIZE(B) + + leal (A1, LDA, 2), A1 + subl $-4 * SIZE, B + ALIGN_4 + +.L26: + testl $1, M + jle .L30 + + movsd 0 * SIZE(A1) , %xmm0 + movhps 1 * SIZE(A1) , %xmm0 + + movaps %xmm0, 0 * SIZE(B) + subl $-2 * SIZE, B + ALIGN_4 + +.L30: + testl $1, N + jle .L999 + + movl A, A1 + leal (A, LDA, 2), A2 + + movl M, I + sarl $2, I + je .L35 + ALIGN_3 + +.L32: + movsd 0 * SIZE(A1) , %xmm0 + movhps 0 * SIZE(A1, LDA), %xmm0 + movsd 0 * SIZE(A2) , %xmm1 + movhps 0 * SIZE(A2, LDA), %xmm1 + + movaps %xmm0, 0 * SIZE(B) + movaps %xmm1, 2 * SIZE(B) + + leal (A1, LDA, 4), A1 + leal (A2, LDA, 4), A2 + subl $-4 * SIZE, B + decl I + jne .L32 + ALIGN_3 + +.L35: + testl $2, M + jle .L36 + + movsd 0 * SIZE(A1) , %xmm0 + movhps 0 * SIZE(A1, LDA), %xmm0 + + movaps %xmm0, 0 * SIZE(B) + + leal (A1, LDA, 2), A1 + subl $-2 * SIZE, B + ALIGN_4 + +.L36: + testl $1, M + jle .L999 + + movsd 0 * SIZE(A1) , %xmm0 + movsd %xmm0, 0 * SIZE(B) + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/gemv_n.S b/kernel/x86/gemv_n.S new file mode 100644 index 0000000000..13fd1ed676 --- /dev/null +++ b/kernel/x86/gemv_n.S @@ -0,0 +1,477 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef PENTIUM +#define P 32 +#endif + +#if defined(ATHLON) || defined(OPTERON) || defined(OPTERON) +#define P 32 +#endif + +#ifndef P +#define P DTB_ENTRIES +#endif + +#define STACK 16 +#define ARGS 16 + +#define PLDA_M 0 + STACK(%esp) +#define XP 4 + STACK(%esp) +#define MIN_N 8 + STACK(%esp) +#define IS 12 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#ifdef DOUBLE +#define A 24 + STACK + ARGS(%esp) +#define LDA 28 + STACK + ARGS(%esp) +#define X 32 + STACK + ARGS(%esp) +#define INCX 36 + STACK + ARGS(%esp) +#define Y 40 + STACK + ARGS(%esp) +#define INCY 44 + STACK + ARGS(%esp) +#define BUFFER 48 + STACK + ARGS(%esp) +#else +#define A 20 + STACK + ARGS(%esp) +#define LDA 24 + STACK + ARGS(%esp) +#define X 28 + STACK + ARGS(%esp) +#define INCX 32 + STACK + ARGS(%esp) +#define Y 36 + STACK + ARGS(%esp) +#define INCY 40 + STACK + ARGS(%esp) +#define BUFFER 44 + STACK + ARGS(%esp) +#endif + + PROLOGUE + + subl $ARGS, %esp + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + FLD ALPHA + movl X, %edi + + movl LDA, %ebx + leal 0(,%ebx,SIZE),%ebx # EBX : lda + + movl $0, IS + movl M, %edx + movl N, %esi + + test %esi, %esi + jle .L79 # goto END + test %edx, %edx + jle .L79 # goto END + + movl INCY, %eax + leal (,%eax,SIZE),%eax + movl %eax, INCY + + movl LDA, %eax + imull $P, %eax # P * lda + subl M ,%eax # P * lda - m + leal (, %eax, SIZE), %eax + movl %eax, PLDA_M + ALIGN_2 + +.L32: + movl IS, %esi + movl $P, %edx + movl N, %eax + subl %esi,%eax # n - is + cmpl %edx, %eax +#ifdef PENTIUM + jle .L33 + movl %edx, %eax +.L33: +#else + cmovg %edx, %eax +#endif + + movl %eax, MIN_N + movl INCX, %edx + + leal (%edi, %esi, SIZE), %esi # xp = x + is + movl %esi, XP + cmpl $1, %edx + je .L34 # if incx == 1 goto L34 + + movl BUFFER, %esi + leal (, %edx, SIZE), %edx + movl %esi, XP # xp = buffer + sarl $2,%eax + jle .L35 + ALIGN_2 + +.L36: + FLD (%edi) + addl %edx,%edi # x += incx + FLD (%edi) + addl %edx,%edi # x += incx + FLD (%edi) + addl %edx,%edi # x += incx + FLD (%edi) + addl %edx,%edi # x += incx + + FST 3 * SIZE(%esi) + FST 2 * SIZE(%esi) + FST 1 * SIZE(%esi) + FST 0 * SIZE(%esi) + + addl $4 * SIZE, %esi # xp += 4 + decl %eax + jg .L36 + ALIGN_3 + +.L35: + movl MIN_N, %eax + andl $3, %eax + jle .L34 + ALIGN_2 + +.L42: + FLD (%edi) + addl %edx, %edi + FST (%esi) + addl $SIZE, %esi + decl %eax + jg .L42 + ALIGN_3 + +/* Main Routine */ +.L34: + movl Y, %ecx # c_offset + movl M, %ebp + sarl $2, %ebp # j = (m >> 2) + jle .L47 + ALIGN_2 + +.L48: + movl A, %edx # a_offset = a + fldz + addl $4 * SIZE, A # a += 4 + fldz + movl XP, %esi # b_offset = xp + fldz + movl MIN_N, %eax # i = min_n + fldz + FLD (%esi) # bt1 = b_offset + sarl $1, %eax + jle .L51 + ALIGN_2 + +#ifdef PENTIUM3 +#define PRESIZE 8 +#else +#define PRESIZE 24 +#endif + +.L80: +#ifdef PENTIUM3 + prefetcht1 PRESIZE * SIZE(%edx, %ebx, 1) + FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) + fmul %st(1), %st # at1 *= bt1 + + prefetcht1 PRESIZE * SIZE(%esi) + faddp %st, %st(2) # ct1 += at1 + FLD 1 * SIZE(%edx) # at1 = *(a_offset + 1) + + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(3) # ct2 += at1 + FLD 2 * SIZE(%edx) # at1 = *(a_offset + 2) + + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + FLD 3 * SIZE(%edx) # bt1 *= *(a_offset + 3) + + fmulp %st, %st(1) + faddp %st, %st(4) # ct4 
+= at1 + FLD 1 * SIZE(%esi) # bt1 = b_offset + + prefetcht1 PRESIZE * SIZE(%edx, %ebx, 2) + addl %ebx, %edx # a_offset += lda + FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) + + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + FLD 1 * SIZE(%edx) # at1 = *(a_offset + 1) + + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(3) # ct2 += at1 + FLD 2 * SIZE(%edx) # at1 = *(a_offset + 2) + + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + FLD 3 * SIZE(%edx) # bt1 *= *(a_offset + 3) + + fmulp %st, %st(1) + addl %ebx, %edx + faddp %st, %st(4) # ct4 += at1 + + FLD 2 * SIZE(%esi) # bt1 = b_offset + addl $2 * SIZE, %esi # b_offset += 2 + +#else +#ifdef PENTIUM4 + prefetchnta 8 * SIZE(%esi) +#endif + FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + + FLD 1 * SIZE(%edx) # at1 = *(a_offset + 1) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(3) # ct2 += at1 + + FLD 2 * SIZE(%edx) # at1 = *(a_offset + 2) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + + FMUL 3 * SIZE(%edx) # bt1 *= *(a_offset + 3) + faddp %st, %st(4) # ct4 += at1 + FLD 1 * SIZE(%esi) # bt1 = b_offset + + addl %ebx, %edx # a_offset += lda + + FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + + FLD 1 * SIZE(%edx) # at1 = *(a_offset + 1) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(3) # ct2 += at1 + + FLD 2 * SIZE(%edx) # at1 = *(a_offset + 2) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + + FMUL 3 * SIZE(%edx) # bt1 *= *(a_offset + 3) + faddp %st, %st(4) # ct4 += at1 + FLD 2 * SIZE(%esi) # bt1 = b_offset + + addl %ebx, %edx + addl $2 * SIZE, %esi # b_offset += 2 +#endif + decl %eax + jg .L80 + +.L51: + movl MIN_N,%eax + andl $1, %eax + je .L57 + + FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + + FLD 1 * SIZE(%edx) # at1 = *(a_offset + 1) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(3) # ct2 += at1 + + FLD 2 * SIZE(%edx) # at1 = *(a_offset + 2) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + + FMUL 3 * SIZE(%edx) # bt1 *= *(a_offset + 3) + faddp %st, %st(4) # ct4 += at1 + fldz + ALIGN_2 + +.L57: +#ifndef C_SUN + ffreep %st(0) +#else + .byte 0xdf + .byte 0xc0 +#endif + + fxch %st(4) + fmul %st, %st(4) + fmul %st, %st(1) + fmul %st, %st(2) + fmul %st, %st(3) + fxch %st(4) + + movl INCY, %eax + + FADD (%ecx) + FST (%ecx) + addl %eax, %ecx + + FADD (%ecx) + FST (%ecx) + addl %eax, %ecx + + FADD (%ecx) + FST (%ecx) + addl %eax, %ecx + + FADD (%ecx) + FST (%ecx) + addl %eax, %ecx + + decl %ebp # j -- + jg .L48 + ALIGN_3 + +.L47: + movl M, %ebp + andl $3, %ebp # j = (m & 3) + jle .L60 + ALIGN_2 + +.L61: + + movl A, %edx # a_offset = a + fldz + addl $SIZE, A # a++ + fldz + movl XP,%esi + fldz + movl MIN_N,%eax + fldz + sarl $3,%eax + jle .L64 + ALIGN_2 + +.L65: + FLD 0 * SIZE(%esi) + FMUL (%edx) + faddp %st, %st(1) + addl %ebx, %edx + + FLD 1 * SIZE(%esi) + FMUL (%edx) + faddp %st, %st(2) + addl %ebx ,%edx + + FLD 2 * SIZE(%esi) + FMUL (%edx) + faddp %st, %st(3) + addl %ebx, %edx + + FLD 3 * SIZE(%esi) + FMUL (%edx) + faddp %st, %st(4) + addl %ebx, %edx + + FLD 4 * SIZE(%esi) + FMUL (%edx) + faddp %st,%st(1) + addl %ebx, %edx + + FLD 5 * SIZE(%esi) + FMUL (%edx) + faddp %st, %st(2) + addl %ebx, %edx + + FLD 6 * SIZE(%esi) + FMUL (%edx) + faddp %st,%st(3) + addl %ebx, %edx + + FLD 7 * SIZE(%esi) + FMUL (%edx) + faddp %st,%st(4) + addl %ebx, %edx + + addl $8 * SIZE, %esi + decl 
%eax + jg .L65 + +.L64: + movl MIN_N,%eax + andl $7, %eax + jle .L70 + ALIGN_2 + +.L71: + FLD (%esi) + addl $SIZE, %esi # b_offset ++ + FMUL (%edx) + addl %ebx, %edx # a_offset += lda + faddp %st, %st(1) + decl %eax + jg .L71 + ALIGN_2 + +.L70: + faddp %st, %st(1) + faddp %st, %st(1) + faddp %st, %st(1) + + fmul %st(1), %st + movl INCY, %eax + FADD (%ecx) + FST (%ecx) + addl %eax, %ecx + decl %ebp + jg .L61 + +.L60: + movl PLDA_M, %esi + addl %esi, A # a += P * lda - m + addl $P, IS + movl N, %esi + cmpl %esi,IS + jl .L32 + +.L79: +#ifndef C_SUN + ffreep %st(0) +#else + .byte 0xdf + .byte 0xc0 +#endif + + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/gemv_n_atom.S b/kernel/x86/gemv_n_atom.S new file mode 100644 index 0000000000..e88409ce27 --- /dev/null +++ b/kernel/x86/gemv_n_atom.S @@ -0,0 +1,774 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef ATOM +#define PREFETCH prefetchnta +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (8 * 6) +#endif + +#define STACKSIZE 16 + +#define M 4 + STACKSIZE(%esp) +#define N 8 + STACKSIZE(%esp) +#define ALPHA 16 + STACKSIZE(%esp) +#define A 24 + STACKSIZE(%esp) +#define STACK_LDA 28 + STACKSIZE(%esp) +#define STACK_X 32 + STACKSIZE(%esp) +#define STACK_INCX 36 + STACKSIZE(%esp) +#define Y 40 + STACKSIZE(%esp) +#define STACK_INCY 44 + STACKSIZE(%esp) +#define BUFFER 48 + STACKSIZE(%esp) + +#define I %eax +#define J %ebx + +#define INCX %ecx +#define INCY J + +#define A1 %esi +#define X %edx +#define Y1 %edi +#define LDA %ebp + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_LDA, LDA + movl STACK_X, X + movl STACK_INCX, INCX + + leal (,INCX, SIZE), INCX + leal (,LDA, SIZE), LDA + + subl $-16 * SIZE, A + + cmpl $0, N + jle .L999 + cmpl $0, M + jle .L999 + + movl BUFFER, Y1 + + pxor %xmm7, %xmm7 + + movl M, %eax + addl $16, %eax + sarl $4, %eax + ALIGN_3 + +.L01: + movapd %xmm7, 0 * SIZE(Y1) + movapd %xmm7, 2 * SIZE(Y1) + movapd %xmm7, 4 * SIZE(Y1) + movapd %xmm7, 6 * SIZE(Y1) + movapd %xmm7, 8 * SIZE(Y1) + movapd %xmm7, 10 * SIZE(Y1) + movapd %xmm7, 12 * SIZE(Y1) + movapd %xmm7, 14 * SIZE(Y1) + subl $-16 * SIZE, Y1 + decl %eax + jg .L01 + ALIGN_3 + +.L10: + movl N, J + sarl $1, J + jle .L20 + ALIGN_3 + +.L11: + movl BUFFER, Y1 + addl $16 * SIZE, Y1 + + movl A, A1 + leal (A1, LDA, 2), %eax + movl %eax, A + + movsd (X), %xmm6 + addl INCX, X + movsd (X), %xmm7 + addl INCX, X + + movsd ALPHA, %xmm0 + + mulsd %xmm0, %xmm6 + mulsd %xmm0, %xmm7 + + movsd -16 * SIZE(Y1), %xmm0 + movsd -15 * SIZE(Y1), %xmm1 + + movl M, I + sarl $3, I + jle .L15 + + movsd -16 * SIZE(A1), %xmm2 + movsd -15 * SIZE(A1), %xmm3 + movsd -16 * SIZE(A1, LDA), %xmm4 + movsd -15 * SIZE(A1, LDA), %xmm5 + + mulsd %xmm6, %xmm2 + mulsd %xmm6, %xmm3 + + decl I + jle .L14 + ALIGN_3 + +.L13: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) * SIZE(A1) +#endif + + mulsd %xmm7, %xmm4 + addsd %xmm2, %xmm0 + movsd -14 * SIZE(A1), %xmm2 + mulsd %xmm7, %xmm5 + addsd %xmm3, %xmm1 + movsd -13 * SIZE(A1), %xmm3 + + addsd %xmm4, %xmm0 + movsd -14 * SIZE(A1, LDA), %xmm4 + mulsd %xmm6, %xmm2 + addsd %xmm5, %xmm1 + movsd -13 * SIZE(A1, LDA), %xmm5 + mulsd %xmm6, %xmm3 + + movlpd %xmm0, -16 * SIZE(Y1) + movsd -14 * SIZE(Y1), %xmm0 + movlpd %xmm1, -15 * SIZE(Y1) + movsd -13 * SIZE(Y1), %xmm1 + + mulsd %xmm7, %xmm4 + addsd %xmm2, %xmm0 + movsd -12 * SIZE(A1), %xmm2 + mulsd %xmm7, %xmm5 + addsd %xmm3, %xmm1 + movsd -11 * SIZE(A1), %xmm3 + + addsd %xmm4, %xmm0 + movsd -12 * SIZE(A1, LDA), %xmm4 + mulsd %xmm6, %xmm2 + addsd %xmm5, %xmm1 + movsd -11 * SIZE(A1, LDA), %xmm5 + mulsd %xmm6, %xmm3 + + movlpd %xmm0, -14 * SIZE(Y1) + movsd -12 * SIZE(Y1), %xmm0 + movlpd %xmm1, -13 * SIZE(Y1) + movsd -11 * SIZE(Y1), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) * SIZE(A1, LDA) +#endif + + mulsd %xmm7, %xmm4 + addsd %xmm2, %xmm0 + movsd -10 * SIZE(A1), %xmm2 + mulsd %xmm7, %xmm5 + addsd %xmm3, %xmm1 + movsd -9 * SIZE(A1), %xmm3 + + addsd %xmm4, %xmm0 + movsd -10 * SIZE(A1, LDA), %xmm4 + mulsd %xmm6, %xmm2 + addsd %xmm5, %xmm1 + movsd -9 * SIZE(A1, LDA), %xmm5 + mulsd %xmm6, %xmm3 + + movlpd %xmm0, -12 * SIZE(Y1) + movsd -10 * SIZE(Y1), %xmm0 + movlpd %xmm1, -11 * SIZE(Y1) + movsd -9 * SIZE(Y1), %xmm1 + + mulsd %xmm7, %xmm4 + addsd %xmm2, %xmm0 + movsd -8 * SIZE(A1), %xmm2 + mulsd %xmm7, 
%xmm5 + addsd %xmm3, %xmm1 + movsd -7 * SIZE(A1), %xmm3 + + addsd %xmm4, %xmm0 + movsd -8 * SIZE(A1, LDA), %xmm4 + mulsd %xmm6, %xmm2 + addsd %xmm5, %xmm1 + movsd -7 * SIZE(A1, LDA), %xmm5 + mulsd %xmm6, %xmm3 + + movlpd %xmm0, -10 * SIZE(Y1) + movsd -8 * SIZE(Y1), %xmm0 + movlpd %xmm1, -9 * SIZE(Y1) + movsd -7 * SIZE(Y1), %xmm1 + + subl $-8 * SIZE, A1 + subl $-8 * SIZE, Y1 + + subl $1, I + BRANCH + jg .L13 + ALIGN_3 + +.L14: + mulsd %xmm7, %xmm4 + addsd %xmm2, %xmm0 + movsd -14 * SIZE(A1), %xmm2 + mulsd %xmm7, %xmm5 + addsd %xmm3, %xmm1 + movsd -13 * SIZE(A1), %xmm3 + + addsd %xmm4, %xmm0 + movsd -14 * SIZE(A1, LDA), %xmm4 + mulsd %xmm6, %xmm2 + addsd %xmm5, %xmm1 + movsd -13 * SIZE(A1, LDA), %xmm5 + mulsd %xmm6, %xmm3 + + movlpd %xmm0, -16 * SIZE(Y1) + movsd -14 * SIZE(Y1), %xmm0 + movlpd %xmm1, -15 * SIZE(Y1) + movsd -13 * SIZE(Y1), %xmm1 + + mulsd %xmm7, %xmm4 + addsd %xmm2, %xmm0 + movsd -12 * SIZE(A1), %xmm2 + mulsd %xmm7, %xmm5 + addsd %xmm3, %xmm1 + movsd -11 * SIZE(A1), %xmm3 + + addsd %xmm4, %xmm0 + movsd -12 * SIZE(A1, LDA), %xmm4 + mulsd %xmm6, %xmm2 + addsd %xmm5, %xmm1 + movsd -11 * SIZE(A1, LDA), %xmm5 + mulsd %xmm6, %xmm3 + + movlpd %xmm0, -14 * SIZE(Y1) + movsd -12 * SIZE(Y1), %xmm0 + movlpd %xmm1, -13 * SIZE(Y1) + movsd -11 * SIZE(Y1), %xmm1 + + mulsd %xmm7, %xmm4 + addsd %xmm2, %xmm0 + movsd -10 * SIZE(A1), %xmm2 + mulsd %xmm7, %xmm5 + addsd %xmm3, %xmm1 + movsd -9 * SIZE(A1), %xmm3 + + addsd %xmm4, %xmm0 + movsd -10 * SIZE(A1, LDA), %xmm4 + mulsd %xmm6, %xmm2 + addsd %xmm5, %xmm1 + movsd -9 * SIZE(A1, LDA), %xmm5 + mulsd %xmm6, %xmm3 + + movlpd %xmm0, -12 * SIZE(Y1) + movsd -10 * SIZE(Y1), %xmm0 + movlpd %xmm1, -11 * SIZE(Y1) + movsd -9 * SIZE(Y1), %xmm1 + + mulsd %xmm7, %xmm4 + addsd %xmm2, %xmm0 + mulsd %xmm7, %xmm5 + addsd %xmm3, %xmm1 + + addsd %xmm4, %xmm0 + addsd %xmm5, %xmm1 + + movlpd %xmm0, -10 * SIZE(Y1) + movsd -8 * SIZE(Y1), %xmm0 + movlpd %xmm1, -9 * SIZE(Y1) + movsd -7 * SIZE(Y1), %xmm1 + + subl $-8 * SIZE, A1 + subl $-8 * SIZE, Y1 + ALIGN_3 + +.L15: + testl $4, M + je .L16 + + movsd -16 * SIZE(A1), %xmm2 + movsd -15 * SIZE(A1), %xmm3 + movsd -16 * SIZE(A1, LDA), %xmm4 + movsd -15 * SIZE(A1, LDA), %xmm5 + + mulsd %xmm6, %xmm2 + mulsd %xmm6, %xmm3 + + mulsd %xmm7, %xmm4 + addsd %xmm2, %xmm0 + movsd -14 * SIZE(A1), %xmm2 + mulsd %xmm7, %xmm5 + addsd %xmm3, %xmm1 + movsd -13 * SIZE(A1), %xmm3 + + addsd %xmm4, %xmm0 + movsd -14 * SIZE(A1, LDA), %xmm4 + mulsd %xmm6, %xmm2 + addsd %xmm5, %xmm1 + movsd -13 * SIZE(A1, LDA), %xmm5 + mulsd %xmm6, %xmm3 + + movlpd %xmm0, -16 * SIZE(Y1) + movsd -14 * SIZE(Y1), %xmm0 + movlpd %xmm1, -15 * SIZE(Y1) + movsd -13 * SIZE(Y1), %xmm1 + + mulsd %xmm7, %xmm4 + addsd %xmm2, %xmm0 + mulsd %xmm7, %xmm5 + addsd %xmm3, %xmm1 + + addsd %xmm4, %xmm0 + addsd %xmm5, %xmm1 + + movlpd %xmm0, -14 * SIZE(Y1) + movsd -12 * SIZE(Y1), %xmm0 + movlpd %xmm1, -13 * SIZE(Y1) + movsd -11 * SIZE(Y1), %xmm1 + + addl $4 * SIZE, A1 + addl $4 * SIZE, Y1 + ALIGN_3 + +.L16: + testl $2, M + je .L17 + + movsd -16 * SIZE(A1), %xmm2 + movsd -15 * SIZE(A1), %xmm3 + movsd -16 * SIZE(A1, LDA), %xmm4 + movsd -15 * SIZE(A1, LDA), %xmm5 + + mulsd %xmm6, %xmm2 + mulsd %xmm6, %xmm3 + + mulsd %xmm7, %xmm4 + addsd %xmm2, %xmm0 + mulsd %xmm7, %xmm5 + addsd %xmm3, %xmm1 + + addsd %xmm4, %xmm0 + addsd %xmm5, %xmm1 + + movlpd %xmm0, -16 * SIZE(Y1) + movsd -14 * SIZE(Y1), %xmm0 + movlpd %xmm1, -15 * SIZE(Y1) + + addl $2 * SIZE, A1 + addl $2 * SIZE, Y1 + ALIGN_3 + +.L17: + testl $1, M + je .L19 + + movsd -16 * SIZE(A1), %xmm2 + movsd -16 * SIZE(A1, LDA), %xmm3 + + movsd -16 
* SIZE(Y1), %xmm0 + + mulsd %xmm6, %xmm2 + addsd %xmm2, %xmm0 + mulsd %xmm7, %xmm3 + addsd %xmm3, %xmm0 + + movsd %xmm0, -16 * SIZE(Y1) + ALIGN_3 + +.L19: + decl J + jg .L11 + ALIGN_4 + +.L20: + testl $1, N + jle .L990 + + movl BUFFER, Y1 + addl $16 * SIZE, Y1 + + movl A, A1 + leal (A1, LDA, 2), %eax + movl %eax, A + + movsd (X), %xmm6 + addl INCX, X + movsd (X), %xmm7 + addl INCX, X + + movsd ALPHA, %xmm0 + + mulsd %xmm0, %xmm6 + mulsd %xmm0, %xmm7 + + movsd -16 * SIZE(Y1), %xmm0 + movsd -15 * SIZE(Y1), %xmm1 + movsd -14 * SIZE(Y1), %xmm4 + movsd -13 * SIZE(Y1), %xmm5 + + movl M, I + sarl $3, I + jle .L25 + + movsd -16 * SIZE(A1), %xmm2 + movsd -15 * SIZE(A1), %xmm3 + + mulsd %xmm6, %xmm2 + mulsd %xmm6, %xmm3 + + decl I + jle .L24 + ALIGN_3 + +.L23: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) * SIZE(A1) +#endif + + addsd %xmm2, %xmm0 + movsd -14 * SIZE(A1), %xmm2 + addsd %xmm3, %xmm1 + movsd -13 * SIZE(A1), %xmm3 + + mulsd %xmm6, %xmm2 + movlpd %xmm0, -16 * SIZE(Y1) + movsd -12 * SIZE(Y1), %xmm0 + mulsd %xmm6, %xmm3 + movlpd %xmm1, -15 * SIZE(Y1) + movsd -11 * SIZE(Y1), %xmm1 + + addsd %xmm2, %xmm4 + movsd -12 * SIZE(A1), %xmm2 + addsd %xmm3, %xmm5 + movsd -11 * SIZE(A1), %xmm3 + + mulsd %xmm6, %xmm2 + movlpd %xmm4, -14 * SIZE(Y1) + movsd -10 * SIZE(Y1), %xmm4 + mulsd %xmm6, %xmm3 + movlpd %xmm5, -13 * SIZE(Y1) + movsd -9 * SIZE(Y1), %xmm5 + + addsd %xmm2, %xmm0 + movsd -10 * SIZE(A1), %xmm2 + addsd %xmm3, %xmm1 + movsd -9 * SIZE(A1), %xmm3 + + mulsd %xmm6, %xmm2 + movlpd %xmm0, -12 * SIZE(Y1) + movsd -8 * SIZE(Y1), %xmm0 + mulsd %xmm6, %xmm3 + movlpd %xmm1, -11 * SIZE(Y1) + movsd -7 * SIZE(Y1), %xmm1 + + addsd %xmm2, %xmm4 + movsd -8 * SIZE(A1), %xmm2 + addsd %xmm3, %xmm5 + movsd -7 * SIZE(A1), %xmm3 + + mulsd %xmm6, %xmm2 + movlpd %xmm4, -10 * SIZE(Y1) + movsd -6 * SIZE(Y1), %xmm4 + mulsd %xmm6, %xmm3 + movlpd %xmm5, -9 * SIZE(Y1) + movsd -5 * SIZE(Y1), %xmm5 + + subl $-8 * SIZE, A1 + subl $-8 * SIZE, Y1 + + subl $1, I + BRANCH + jg .L23 + ALIGN_3 + +.L24: + addsd %xmm2, %xmm0 + movsd -14 * SIZE(A1), %xmm2 + addsd %xmm3, %xmm1 + movsd -13 * SIZE(A1), %xmm3 + + mulsd %xmm6, %xmm2 + movlpd %xmm0, -16 * SIZE(Y1) + movsd -12 * SIZE(Y1), %xmm0 + mulsd %xmm6, %xmm3 + movlpd %xmm1, -15 * SIZE(Y1) + movsd -11 * SIZE(Y1), %xmm1 + + addsd %xmm2, %xmm4 + movsd -12 * SIZE(A1), %xmm2 + addsd %xmm3, %xmm5 + movsd -11 * SIZE(A1), %xmm3 + + mulsd %xmm6, %xmm2 + movlpd %xmm4, -14 * SIZE(Y1) + movsd -10 * SIZE(Y1), %xmm4 + mulsd %xmm6, %xmm3 + movlpd %xmm5, -13 * SIZE(Y1) + movsd -9 * SIZE(Y1), %xmm5 + + addsd %xmm2, %xmm0 + movsd -10 * SIZE(A1), %xmm2 + addsd %xmm3, %xmm1 + movsd -9 * SIZE(A1), %xmm3 + + mulsd %xmm6, %xmm2 + movlpd %xmm0, -12 * SIZE(Y1) + mulsd %xmm6, %xmm3 + movlpd %xmm1, -11 * SIZE(Y1) + + addsd %xmm2, %xmm4 + movsd -8 * SIZE(Y1), %xmm0 + addsd %xmm3, %xmm5 + movsd -7 * SIZE(Y1), %xmm1 + + movlpd %xmm4, -10 * SIZE(Y1) + movsd -6 * SIZE(Y1), %xmm4 + movlpd %xmm5, -9 * SIZE(Y1) + movsd -5 * SIZE(Y1), %xmm5 + + subl $-8 * SIZE, A1 + subl $-8 * SIZE, Y1 + ALIGN_3 + +.L25: + testl $4, M + je .L26 + + movsd -16 * SIZE(A1), %xmm2 + movsd -15 * SIZE(A1), %xmm3 + mulsd %xmm6, %xmm2 + mulsd %xmm6, %xmm3 + + addsd %xmm2, %xmm0 + movsd -14 * SIZE(A1), %xmm2 + addsd %xmm3, %xmm1 + movsd -13 * SIZE(A1), %xmm3 + + mulsd %xmm6, %xmm2 + movlpd %xmm0, -16 * SIZE(Y1) + movsd -12 * SIZE(Y1), %xmm0 + mulsd %xmm6, %xmm3 + movlpd %xmm1, -15 * SIZE(Y1) + movsd -11 * SIZE(Y1), %xmm1 + + addsd %xmm2, %xmm4 + addsd %xmm3, %xmm5 + + movlpd %xmm4, -14 * SIZE(Y1) + movlpd %xmm5, -13 * SIZE(Y1) + + addl $4 * 
SIZE, A1 + addl $4 * SIZE, Y1 + ALIGN_3 + +.L26: + testl $2, M + je .L27 + + movsd -16 * SIZE(A1), %xmm2 + movsd -15 * SIZE(A1), %xmm3 + + mulsd %xmm6, %xmm2 + mulsd %xmm6, %xmm3 + addsd %xmm2, %xmm0 + addsd %xmm3, %xmm1 + + movlpd %xmm0, -16 * SIZE(Y1) + movsd -14 * SIZE(Y1), %xmm0 + movlpd %xmm1, -15 * SIZE(Y1) + + addl $2 * SIZE, A1 + addl $2 * SIZE, Y1 + ALIGN_3 + +.L27: + testl $1, M + je .L990 + + movsd -16 * SIZE(A1), %xmm2 + movsd -16 * SIZE(Y1), %xmm0 + + mulsd %xmm6, %xmm2 + addsd %xmm2, %xmm0 + + movsd %xmm0, -16 * SIZE(Y1) + ALIGN_3 + +.L990: + movl Y, Y1 + movl BUFFER, X + movl Y1, A1 + + movl STACK_INCY, INCY + sall $BASE_SHIFT, INCY + + movl M, %eax + sarl $3, %eax + jle .L994 + ALIGN_3 + +.L992: + movsd (Y1), %xmm0 + addl INCY, Y1 + movsd (Y1), %xmm1 + addl INCY, Y1 + movsd (Y1), %xmm2 + addl INCY, Y1 + movsd (Y1), %xmm3 + addl INCY, Y1 + movsd (Y1), %xmm4 + addl INCY, Y1 + movsd (Y1), %xmm5 + addl INCY, Y1 + movsd (Y1), %xmm6 + addl INCY, Y1 + movsd (Y1), %xmm7 + addl INCY, Y1 + + addsd 0 * SIZE(X), %xmm0 + addsd 1 * SIZE(X), %xmm1 + addsd 2 * SIZE(X), %xmm2 + addsd 3 * SIZE(X), %xmm3 + addsd 4 * SIZE(X), %xmm4 + addsd 5 * SIZE(X), %xmm5 + addsd 6 * SIZE(X), %xmm6 + addsd 7 * SIZE(X), %xmm7 + + movlpd %xmm0, (A1) + addl INCY, A1 + movlpd %xmm1, (A1) + addl INCY, A1 + movlpd %xmm2, (A1) + addl INCY, A1 + movlpd %xmm3, (A1) + addl INCY, A1 + movlpd %xmm4, (A1) + addl INCY, A1 + movlpd %xmm5, (A1) + addl INCY, A1 + movlpd %xmm6, (A1) + addl INCY, A1 + movlpd %xmm7, (A1) + addl INCY, A1 + + addl $8 * SIZE, X + decl %eax + jg .L992 + ALIGN_3 + +.L994: + testl $7, M + jle .L999 + + testl $4, M + jle .L995 + + movsd (Y1), %xmm0 + addl INCY, Y1 + movsd (Y1), %xmm1 + addl INCY, Y1 + movsd (Y1), %xmm2 + addl INCY, Y1 + movsd (Y1), %xmm3 + addl INCY, Y1 + + addsd 0 * SIZE(X), %xmm0 + addsd 1 * SIZE(X), %xmm1 + addsd 2 * SIZE(X), %xmm2 + addsd 3 * SIZE(X), %xmm3 + + movlpd %xmm0, (A1) + addl INCY, A1 + movlpd %xmm1, (A1) + addl INCY, A1 + movlpd %xmm2, (A1) + addl INCY, A1 + movlpd %xmm3, (A1) + addl INCY, A1 + + addl $4 * SIZE, X + ALIGN_3 + +.L995: + testl $2, M + jle .L996 + + movsd (Y1), %xmm0 + addl INCY, Y1 + movsd (Y1), %xmm1 + addl INCY, Y1 + + addsd 0 * SIZE(X), %xmm0 + addsd 1 * SIZE(X), %xmm1 + + movlpd %xmm0, (A1) + addl INCY, A1 + movlpd %xmm1, (A1) + addl INCY, A1 + + addl $2 * SIZE, X + ALIGN_3 + +.L996: + testl $1, M + jle .L999 + + movsd (Y1), %xmm0 + + addsd 0 * SIZE(X), %xmm0 + + movlpd %xmm0, (A1) + ALIGN_3 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/gemv_n_sse.S b/kernel/x86/gemv_n_sse.S new file mode 100644 index 0000000000..aae49a22dd --- /dev/null +++ b/kernel/x86/gemv_n_sse.S @@ -0,0 +1,662 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef movsd +#undef movsd +#endif + +#ifdef PENTIUM3 +#ifdef HAVE_SSE +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 2) +#endif +#define movsd movlps +#endif + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 4) +#endif + +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 7) +#endif + +#ifdef OPTERON +#define PREFETCH prefetchnta +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 3) +#define movsd movlps +#endif + +#ifdef BARCELONA +#define PREFETCH prefetchnta +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 5) +#endif + +#ifdef ATOM +#define PREFETCH prefetchnta +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 6) +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (16 * 4) +#endif + +#define STACKSIZE 16 + +#define M 4 + STACKSIZE(%esp) +#define N 8 + STACKSIZE(%esp) +#define ALPHA 16 + STACKSIZE(%esp) +#define A 20 + STACKSIZE(%esp) +#define STACK_LDA 24 + STACKSIZE(%esp) +#define STACK_X 28 + STACKSIZE(%esp) +#define STACK_INCX 32 + STACKSIZE(%esp) +#define Y 36 + STACKSIZE(%esp) +#define STACK_INCY 40 + STACKSIZE(%esp) +#define BUFFER 44 + STACKSIZE(%esp) + +#define I %eax +#define J %ebx + +#define INCX %ecx +#define INCY J + +#define A1 %esi +#define X %edx +#define Y1 %edi +#define LDA %ebp + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_LDA, LDA + movl STACK_X, X + movl STACK_INCX, INCX + + leal (,INCX, SIZE), INCX + leal (,LDA, SIZE), LDA + + subl $-32 * SIZE, A + + cmpl $0, N + jle .L999 + cmpl $0, M + jle .L999 + + movl BUFFER, Y1 + + xorps %xmm7, %xmm7 + + movl M, %eax + addl $16, %eax + sarl $4, %eax + ALIGN_3 + +.L01: + movaps %xmm7, 0 * SIZE(Y1) + movaps %xmm7, 4 * SIZE(Y1) + movaps %xmm7, 8 * SIZE(Y1) + movaps %xmm7, 12 * SIZE(Y1) + subl $-16 * SIZE, Y1 + decl %eax + jg .L01 + ALIGN_3 + +.L10: + movl N, J + sarl $1, J + jle .L20 + ALIGN_3 + +.L11: + movl BUFFER, Y1 + addl $32 * SIZE, Y1 + + movl A, A1 + leal (A1, LDA, 2), %eax + movl %eax, A + + movss (X), %xmm6 + addl INCX, X + movss (X), %xmm7 + addl INCX, X + + movss ALPHA, %xmm0 + + mulss 
%xmm0, %xmm6 + mulss %xmm0, %xmm7 + + shufps $0, %xmm6, %xmm6 + shufps $0, %xmm7, %xmm7 + ALIGN_3 + + movl M, I + sarl $4, I + jle .L15 + + movsd -32 * SIZE(A1), %xmm2 + movhps -30 * SIZE(A1), %xmm2 + movsd -28 * SIZE(A1), %xmm3 + movhps -26 * SIZE(A1), %xmm3 + + movaps -32 * SIZE(Y1), %xmm0 + movaps -28 * SIZE(Y1), %xmm1 + + movsd -32 * SIZE(A1, LDA), %xmm4 + movhps -30 * SIZE(A1, LDA), %xmm4 + movsd -28 * SIZE(A1, LDA), %xmm5 + movhps -26 * SIZE(A1, LDA), %xmm5 + + decl I + jle .L14 + ALIGN_3 + +.L13: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) * SIZE(A1) +#endif + + mulps %xmm6, %xmm2 + addps %xmm2, %xmm0 + movsd -24 * SIZE(A1), %xmm2 + movhps -22 * SIZE(A1), %xmm2 + mulps %xmm6, %xmm3 + addps %xmm3, %xmm1 + movsd -20 * SIZE(A1), %xmm3 + movhps -18 * SIZE(A1), %xmm3 + + mulps %xmm7, %xmm4 + addps %xmm4, %xmm0 + movsd -24 * SIZE(A1, LDA), %xmm4 + movhps -22 * SIZE(A1, LDA), %xmm4 + + movaps %xmm0, -32 * SIZE(Y1) + movaps -24 * SIZE(Y1), %xmm0 + + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + movsd -20 * SIZE(A1, LDA), %xmm5 + movhps -18 * SIZE(A1, LDA), %xmm5 + + movaps %xmm1, -28 * SIZE(Y1) + movaps -20 * SIZE(Y1), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) * SIZE(A1, LDA) +#endif + + mulps %xmm6, %xmm2 + addps %xmm2, %xmm0 + movsd -16 * SIZE(A1), %xmm2 + movhps -14 * SIZE(A1), %xmm2 + mulps %xmm6, %xmm3 + addps %xmm3, %xmm1 + movsd -12 * SIZE(A1), %xmm3 + movhps -10 * SIZE(A1), %xmm3 + + mulps %xmm7, %xmm4 + addps %xmm4, %xmm0 + movsd -16 * SIZE(A1, LDA), %xmm4 + movhps -14 * SIZE(A1, LDA), %xmm4 + + movaps %xmm0, -24 * SIZE(Y1) + movaps -16 * SIZE(Y1), %xmm0 + + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + movsd -12 * SIZE(A1, LDA), %xmm5 + movhps -10 * SIZE(A1, LDA), %xmm5 + + movaps %xmm1, -20 * SIZE(Y1) + movaps -12 * SIZE(Y1), %xmm1 + + subl $-16 * SIZE, A1 + subl $-16 * SIZE, Y1 + + subl $1, I + BRANCH + jg .L13 + ALIGN_3 + +.L14: + mulps %xmm6, %xmm2 + addps %xmm2, %xmm0 + movsd -24 * SIZE(A1), %xmm2 + movhps -22 * SIZE(A1), %xmm2 + mulps %xmm6, %xmm3 + addps %xmm3, %xmm1 + movsd -20 * SIZE(A1), %xmm3 + movhps -18 * SIZE(A1), %xmm3 + + mulps %xmm7, %xmm4 + addps %xmm4, %xmm0 + movsd -24 * SIZE(A1, LDA), %xmm4 + movhps -22 * SIZE(A1, LDA), %xmm4 + + movaps %xmm0, -32 * SIZE(Y1) + movaps -24 * SIZE(Y1), %xmm0 + + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + movsd -20 * SIZE(A1, LDA), %xmm5 + movhps -18 * SIZE(A1, LDA), %xmm5 + + movaps %xmm1, -28 * SIZE(Y1) + movaps -20 * SIZE(Y1), %xmm1 + + mulps %xmm6, %xmm2 + addps %xmm2, %xmm0 + mulps %xmm6, %xmm3 + addps %xmm3, %xmm1 + + mulps %xmm7, %xmm4 + addps %xmm4, %xmm0 + movaps %xmm0, -24 * SIZE(Y1) + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + movaps %xmm1, -20 * SIZE(Y1) + + subl $-16 * SIZE, A1 + subl $-16 * SIZE, Y1 + ALIGN_3 + +.L15: + testl $8, M + je .L16 + + movsd -32 * SIZE(A1), %xmm2 + movhps -30 * SIZE(A1), %xmm2 + movsd -28 * SIZE(A1), %xmm3 + movhps -26 * SIZE(A1), %xmm3 + + movaps -32 * SIZE(Y1), %xmm0 + movaps -28 * SIZE(Y1), %xmm1 + + mulps %xmm6, %xmm2 + addps %xmm2, %xmm0 + mulps %xmm6, %xmm3 + addps %xmm3, %xmm1 + + movsd -32 * SIZE(A1, LDA), %xmm4 + movhps -30 * SIZE(A1, LDA), %xmm4 + movsd -28 * SIZE(A1, LDA), %xmm5 + movhps -26 * SIZE(A1, LDA), %xmm5 + + mulps %xmm7, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + + addl $8 * SIZE, A1 + addl $8 * SIZE, Y1 + ALIGN_3 + +.L16: + testl $4, M + je .L17 + + movsd -32 * SIZE(A1), %xmm2 + movhps -30 * SIZE(A1), %xmm2 + movsd -32 * SIZE(A1, LDA), %xmm3 + movhps -30 * SIZE(A1, LDA), 
%xmm3 + + movaps -32 * SIZE(Y1), %xmm0 + + mulps %xmm6, %xmm2 + addps %xmm2, %xmm0 + mulps %xmm7, %xmm3 + addps %xmm3, %xmm0 + + movaps %xmm0, -32 * SIZE(Y1) + + addl $4 * SIZE, A1 + addl $4 * SIZE, Y1 + ALIGN_3 + +.L17: + testl $2, M + je .L18 + + movsd -32 * SIZE(A1), %xmm2 + movsd -32 * SIZE(A1, LDA), %xmm3 + + movsd -32 * SIZE(Y1), %xmm0 + + mulps %xmm6, %xmm2 + addps %xmm2, %xmm0 + mulps %xmm7, %xmm3 + addps %xmm3, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + + addl $2 * SIZE, A1 + addl $2 * SIZE, Y1 + ALIGN_3 + +.L18: + testl $1, M + je .L19 + + movss -32 * SIZE(A1), %xmm2 + movss -32 * SIZE(A1, LDA), %xmm3 + + movss -32 * SIZE(Y1), %xmm0 + + mulss %xmm6, %xmm2 + addss %xmm2, %xmm0 + mulss %xmm7, %xmm3 + addss %xmm3, %xmm0 + + movss %xmm0, -32 * SIZE(Y1) + ALIGN_3 + +.L19: + decl J + jg .L11 + ALIGN_4 + +.L20: + testl $1, N + jle .L990 + + movl BUFFER, Y1 + addl $32 * SIZE, Y1 + + movl A, A1 + + movss (X), %xmm6 + addl INCX, X + + movss ALPHA, %xmm0 + + mulss %xmm0, %xmm6 + + shufps $0, %xmm6, %xmm6 + ALIGN_3 + + movl M, I + sarl $4, I + jle .L25 + + movsd -32 * SIZE(A1), %xmm2 + movhps -30 * SIZE(A1), %xmm2 + movsd -28 * SIZE(A1), %xmm3 + movhps -26 * SIZE(A1), %xmm3 + + movaps -32 * SIZE(Y1), %xmm0 + movaps -28 * SIZE(Y1), %xmm1 + + decl I + jle .L24 + ALIGN_3 + +.L23: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) * SIZE(A1) +#endif + + mulps %xmm6, %xmm2 + addps %xmm2, %xmm0 + movsd -24 * SIZE(A1), %xmm2 + movhps -22 * SIZE(A1), %xmm2 + + movaps %xmm0, -32 * SIZE(Y1) + movaps -24 * SIZE(Y1), %xmm0 + + mulps %xmm6, %xmm3 + addps %xmm3, %xmm1 + movsd -20 * SIZE(A1), %xmm3 + movhps -18 * SIZE(A1), %xmm3 + + movaps %xmm1, -28 * SIZE(Y1) + movaps -20 * SIZE(Y1), %xmm1 + + mulps %xmm6, %xmm2 + addps %xmm2, %xmm0 + movsd -16 * SIZE(A1), %xmm2 + movhps -14 * SIZE(A1), %xmm2 + + movaps %xmm0, -24 * SIZE(Y1) + movaps -16 * SIZE(Y1), %xmm0 + + mulps %xmm6, %xmm3 + addps %xmm3, %xmm1 + movsd -12 * SIZE(A1), %xmm3 + movhps -10 * SIZE(A1), %xmm3 + + movaps %xmm1, -20 * SIZE(Y1) + movaps -12 * SIZE(Y1), %xmm1 + + subl $-16 * SIZE, A1 + subl $-16 * SIZE, Y1 + + subl $1, I + BRANCH + jg .L23 + ALIGN_3 + +.L24: + mulps %xmm6, %xmm2 + addps %xmm2, %xmm0 + movsd -24 * SIZE(A1), %xmm2 + movhps -22 * SIZE(A1), %xmm2 + mulps %xmm6, %xmm3 + addps %xmm3, %xmm1 + movsd -20 * SIZE(A1), %xmm3 + movhps -18 * SIZE(A1), %xmm3 + + movaps %xmm0, -32 * SIZE(Y1) + movaps -24 * SIZE(Y1), %xmm0 + + movaps %xmm1, -28 * SIZE(Y1) + movaps -20 * SIZE(Y1), %xmm1 + + mulps %xmm6, %xmm2 + addps %xmm2, %xmm0 + movaps %xmm0, -24 * SIZE(Y1) + mulps %xmm6, %xmm3 + addps %xmm3, %xmm1 + movaps %xmm1, -20 * SIZE(Y1) + + subl $-16 * SIZE, A1 + subl $-16 * SIZE, Y1 + ALIGN_3 + +.L25: + testl $8, M + je .L26 + + movsd -32 * SIZE(A1), %xmm2 + movhps -30 * SIZE(A1), %xmm2 + movsd -28 * SIZE(A1), %xmm3 + movhps -26 * SIZE(A1), %xmm3 + + movaps -32 * SIZE(Y1), %xmm0 + movaps -28 * SIZE(Y1), %xmm1 + + mulps %xmm6, %xmm2 + addps %xmm2, %xmm0 + mulps %xmm6, %xmm3 + addps %xmm3, %xmm1 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + + addl $8 * SIZE, A1 + addl $8 * SIZE, Y1 + ALIGN_3 + +.L26: + testl $4, M + je .L27 + + movsd -32 * SIZE(A1), %xmm2 + movhps -30 * SIZE(A1), %xmm2 + + movaps -32 * SIZE(Y1), %xmm0 + + mulps %xmm6, %xmm2 + addps %xmm2, %xmm0 + + movaps %xmm0, -32 * SIZE(Y1) + + addl $4 * SIZE, A1 + addl $4 * SIZE, Y1 + ALIGN_3 + +.L27: + testl $2, M + je .L28 + + movsd -32 * SIZE(A1), %xmm2 + movsd -32 * SIZE(Y1), %xmm0 + + mulps %xmm6, %xmm2 + addps %xmm2, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + + addl $2 * SIZE, 
A1 + addl $2 * SIZE, Y1 + ALIGN_3 + +.L28: + testl $1, M + je .L990 + + movss -32 * SIZE(A1), %xmm2 + movss -32 * SIZE(Y1), %xmm0 + + mulss %xmm6, %xmm2 + addss %xmm2, %xmm0 + + movss %xmm0, -32 * SIZE(Y1) + ALIGN_3 + +.L990: + movl Y, Y1 + movl BUFFER, X + + movl STACK_INCY, INCY + sall $BASE_SHIFT, INCY + + movl M, %eax + sarl $2, %eax + jle .L994 + ALIGN_3 + +.L992: + movss (Y1), %xmm0 + addss 0 * SIZE(X), %xmm0 + movss %xmm0, (Y1) + addl INCY, Y1 + + movss (Y1), %xmm0 + addss 1 * SIZE(X), %xmm0 + movss %xmm0, (Y1) + addl INCY, Y1 + + movss (Y1), %xmm0 + addss 2 * SIZE(X), %xmm0 + movss %xmm0, (Y1) + addl INCY, Y1 + + movss (Y1), %xmm0 + addss 3 * SIZE(X), %xmm0 + movss %xmm0, (Y1) + addl INCY, Y1 + + addl $4 * SIZE, X + decl %eax + jg .L992 + ALIGN_3 + +.L994: + testl $2, M + jle .L996 + + movss (Y1), %xmm0 + addss 0 * SIZE(X), %xmm0 + movss %xmm0, (Y1) + addl INCY, Y1 + + movss (Y1), %xmm0 + addss 1 * SIZE(X), %xmm0 + movss %xmm0, (Y1) + addl INCY, Y1 + + addl $2 * SIZE, X + ALIGN_3 + +.L996: + testl $1, M + jle .L999 + + movss (Y1), %xmm0 + addss 0 * SIZE(X), %xmm0 + movss %xmm0, (Y1) + ALIGN_3 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/gemv_n_sse2.S b/kernel/x86/gemv_n_sse2.S new file mode 100644 index 0000000000..669c5ac6c3 --- /dev/null +++ b/kernel/x86/gemv_n_sse2.S @@ -0,0 +1,686 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (8 * 2) +#endif + +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (8 * 7) +#endif + +#ifdef OPTERON +#define PREFETCH prefetchnta +#define PREFETCHW prefetchw +#define PREFETCHSIZE (8 * 3) +#define movsd movlps +#endif + +#ifdef BARCELONA +#define PREFETCH prefetchnta +#define PREFETCHW prefetchw +#define PREFETCHSIZE (8 * 5) +#endif + +#ifdef ATOM +#define PREFETCH prefetch +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (8 * 6) +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 4) +#endif + +#define STACKSIZE 16 + +#define M 4 + STACKSIZE(%esp) +#define N 8 + STACKSIZE(%esp) +#define ALPHA 16 + STACKSIZE(%esp) +#define A 24 + STACKSIZE(%esp) +#define STACK_LDA 28 + STACKSIZE(%esp) +#define STACK_X 32 + STACKSIZE(%esp) +#define STACK_INCX 36 + STACKSIZE(%esp) +#define Y 40 + STACKSIZE(%esp) +#define STACK_INCY 44 + STACKSIZE(%esp) +#define BUFFER 48 + STACKSIZE(%esp) + +#define I %eax +#define J %ebx + +#define INCX %ecx +#define INCY J + +#define A1 %esi +#define X %edx +#define Y1 %edi +#define LDA %ebp + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_LDA, LDA + movl STACK_X, X + movl STACK_INCX, INCX + + leal (,INCX, SIZE), INCX + leal (,LDA, SIZE), LDA + + subl $-16 * SIZE, A + + cmpl $0, N + jle .L999 + cmpl $0, M + jle .L999 + + movl BUFFER, Y1 + + pxor %xmm7, %xmm7 + + movl M, %eax + addl $16, %eax + sarl $4, %eax + ALIGN_3 + +.L01: + movapd %xmm7, 0 * SIZE(Y1) + movapd %xmm7, 2 * SIZE(Y1) + movapd %xmm7, 4 * SIZE(Y1) + movapd %xmm7, 6 * SIZE(Y1) + movapd %xmm7, 8 * SIZE(Y1) + movapd %xmm7, 10 * SIZE(Y1) + movapd %xmm7, 12 * SIZE(Y1) + movapd %xmm7, 14 * SIZE(Y1) + subl $-16 * SIZE, Y1 + decl %eax + jg .L01 + ALIGN_3 + +.L10: + movl N, J + sarl $1, J + jle .L20 + ALIGN_3 + +.L11: + + movl BUFFER, Y1 + addl $16 * SIZE, Y1 + + movl A, A1 + leal (A1, LDA, 2), %eax + movl %eax, A + +#ifdef HAVE_SSE3 + movddup (X), %xmm6 + addl INCX, X + movddup (X), %xmm7 + addl INCX, X + + movddup ALPHA, %xmm0 + + mulpd %xmm0, %xmm6 + mulpd %xmm0, %xmm7 +#else + movsd (X), %xmm6 + addl INCX, X + movsd (X), %xmm7 + addl INCX, X + + movsd ALPHA, %xmm0 + + mulsd %xmm0, %xmm6 + mulsd %xmm0, %xmm7 + + unpcklpd %xmm6, %xmm6 + unpcklpd %xmm7, %xmm7 +#endif + + ALIGN_3 + + movl M, I + sarl $3, I + jle .L15 + + movsd -16 * SIZE(A1), %xmm2 + movhpd -15 * SIZE(A1), %xmm2 + movsd -14 * SIZE(A1), %xmm3 + movhpd -13 * SIZE(A1), %xmm3 + + movapd -16 * SIZE(Y1), %xmm0 + movapd -14 * SIZE(Y1), %xmm1 + + movsd -16 * SIZE(A1, LDA), %xmm4 + movhpd -15 * SIZE(A1, LDA), %xmm4 + movsd -14 * SIZE(A1, LDA), %xmm5 + movhpd -13 * SIZE(A1, LDA), %xmm5 + + decl I + jle .L14 + ALIGN_3 + +.L13: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) * SIZE(A1) +#endif + + mulpd %xmm6, %xmm2 + addpd %xmm2, %xmm0 + movsd -12 * SIZE(A1), %xmm2 + movhpd -11 * SIZE(A1), %xmm2 + mulpd %xmm6, %xmm3 + addpd %xmm3, %xmm1 + movsd -10 * SIZE(A1), %xmm3 + movhpd -9 * SIZE(A1), %xmm3 + + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + movsd -12 * SIZE(A1, LDA), %xmm4 + movhpd -11 * SIZE(A1, LDA), %xmm4 + + movapd %xmm0, -16 * SIZE(Y1) + movapd -12 * SIZE(Y1), %xmm0 + + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movsd -10 * SIZE(A1, LDA), %xmm5 + movhpd -9 * 
SIZE(A1, LDA), %xmm5 + + movapd %xmm1, -14 * SIZE(Y1) + movapd -10 * SIZE(Y1), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) * SIZE(A1, LDA) +#endif + + mulpd %xmm6, %xmm2 + addpd %xmm2, %xmm0 + movsd -8 * SIZE(A1), %xmm2 + movhpd -7 * SIZE(A1), %xmm2 + mulpd %xmm6, %xmm3 + addpd %xmm3, %xmm1 + movsd -6 * SIZE(A1), %xmm3 + movhpd -5 * SIZE(A1), %xmm3 + + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + movsd -8 * SIZE(A1, LDA), %xmm4 + movhpd -7 * SIZE(A1, LDA), %xmm4 + + movapd %xmm0, -12 * SIZE(Y1) + movapd -8 * SIZE(Y1), %xmm0 + + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movsd -6 * SIZE(A1, LDA), %xmm5 + movhpd -5 * SIZE(A1, LDA), %xmm5 + + movapd %xmm1, -10 * SIZE(Y1) + movapd -6 * SIZE(Y1), %xmm1 + + subl $-8 * SIZE, A1 + subl $-8 * SIZE, Y1 + + subl $1, I + BRANCH + jg .L13 + ALIGN_3 + +.L14: + mulpd %xmm6, %xmm2 + addpd %xmm2, %xmm0 + movsd -12 * SIZE(A1), %xmm2 + movhpd -11 * SIZE(A1), %xmm2 + mulpd %xmm6, %xmm3 + addpd %xmm3, %xmm1 + movsd -10 * SIZE(A1), %xmm3 + movhpd -9 * SIZE(A1), %xmm3 + + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + movsd -12 * SIZE(A1, LDA), %xmm4 + movhpd -11 * SIZE(A1, LDA), %xmm4 + + movapd %xmm0, -16 * SIZE(Y1) + movapd -12 * SIZE(Y1), %xmm0 + + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movsd -10 * SIZE(A1, LDA), %xmm5 + movhpd -9 * SIZE(A1, LDA), %xmm5 + + movapd %xmm1, -14 * SIZE(Y1) + movapd -10 * SIZE(Y1), %xmm1 + + mulpd %xmm6, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm6, %xmm3 + addpd %xmm3, %xmm1 + + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + movapd %xmm0, -12 * SIZE(Y1) + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movapd %xmm1, -10 * SIZE(Y1) + + subl $-8 * SIZE, A1 + subl $-8 * SIZE, Y1 + ALIGN_3 + +.L15: + testl $4, M + je .L16 + + movsd -16 * SIZE(A1), %xmm2 + movhpd -15 * SIZE(A1), %xmm2 + movsd -14 * SIZE(A1), %xmm3 + movhpd -13 * SIZE(A1), %xmm3 + + movapd -16 * SIZE(Y1), %xmm0 + movapd -14 * SIZE(Y1), %xmm1 + + mulpd %xmm6, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm6, %xmm3 + addpd %xmm3, %xmm1 + + movsd -16 * SIZE(A1, LDA), %xmm4 + movhpd -15 * SIZE(A1, LDA), %xmm4 + movsd -14 * SIZE(A1, LDA), %xmm5 + movhpd -13 * SIZE(A1, LDA), %xmm5 + + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + + movapd %xmm0, -16 * SIZE(Y1) + movapd %xmm1, -14 * SIZE(Y1) + + addl $4 * SIZE, A1 + addl $4 * SIZE, Y1 + ALIGN_3 + +.L16: + testl $2, M + je .L17 + + movsd -16 * SIZE(A1), %xmm2 + movhpd -15 * SIZE(A1), %xmm2 + movsd -16 * SIZE(A1, LDA), %xmm3 + movhpd -15 * SIZE(A1, LDA), %xmm3 + + movapd -16 * SIZE(Y1), %xmm0 + + mulpd %xmm6, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm7, %xmm3 + addpd %xmm3, %xmm0 + + movapd %xmm0, -16 * SIZE(Y1) + + addl $2 * SIZE, A1 + addl $2 * SIZE, Y1 + ALIGN_3 + +.L17: + testl $1, M + je .L19 + + movsd -16 * SIZE(A1), %xmm2 + movsd -16 * SIZE(A1, LDA), %xmm3 + + movsd -16 * SIZE(Y1), %xmm0 + + mulsd %xmm6, %xmm2 + addsd %xmm2, %xmm0 + mulsd %xmm7, %xmm3 + addsd %xmm3, %xmm0 + + movsd %xmm0, -16 * SIZE(Y1) + ALIGN_3 + +.L19: + decl J + jg .L11 + ALIGN_4 + +.L20: + testl $1, N + jle .L990 + + movl BUFFER, Y1 + addl $16 * SIZE, Y1 + + movl A, A1 + +#ifdef HAVE_SSE3 + movddup (X), %xmm6 + addl INCX, X + + movddup ALPHA, %xmm0 + + mulpd %xmm0, %xmm6 +#else + movsd (X), %xmm6 + addl INCX, X + + movsd ALPHA, %xmm0 + + mulsd %xmm0, %xmm6 + unpcklpd %xmm6, %xmm6 +#endif + + ALIGN_3 + + movl M, I + sarl $3, I + jle .L25 + + movsd -16 * SIZE(A1), %xmm2 + movhpd -15 * SIZE(A1), %xmm2 + movsd -14 * SIZE(A1), %xmm3 + movhpd -13 * SIZE(A1), %xmm3 + + movapd -16 * SIZE(Y1), %xmm0 + movapd -14 * SIZE(Y1), %xmm1 + decl I + 
jle .L24 + ALIGN_3 + +.L23: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) * SIZE(A1) +#endif + + mulpd %xmm6, %xmm2 + addpd %xmm2, %xmm0 + movsd -12 * SIZE(A1), %xmm2 + movhpd -11 * SIZE(A1), %xmm2 + + movapd %xmm0, -16 * SIZE(Y1) + movapd -12 * SIZE(Y1), %xmm0 + + mulpd %xmm6, %xmm3 + addpd %xmm3, %xmm1 + movsd -10 * SIZE(A1), %xmm3 + movhpd -9 * SIZE(A1), %xmm3 + + movapd %xmm1, -14 * SIZE(Y1) + movapd -10 * SIZE(Y1), %xmm1 + + mulpd %xmm6, %xmm2 + addpd %xmm2, %xmm0 + movsd -8 * SIZE(A1), %xmm2 + movhpd -7 * SIZE(A1), %xmm2 + + movapd %xmm0, -12 * SIZE(Y1) + movapd -8 * SIZE(Y1), %xmm0 + + mulpd %xmm6, %xmm3 + addpd %xmm3, %xmm1 + movsd -6 * SIZE(A1), %xmm3 + movhpd -5 * SIZE(A1), %xmm3 + + movapd %xmm1, -10 * SIZE(Y1) + movapd -6 * SIZE(Y1), %xmm1 + + subl $-8 * SIZE, A1 + subl $-8 * SIZE, Y1 + + subl $1, I + BRANCH + jg .L23 + ALIGN_3 + +.L24: + mulpd %xmm6, %xmm2 + addpd %xmm2, %xmm0 + movsd -12 * SIZE(A1), %xmm2 + movhpd -11 * SIZE(A1), %xmm2 + mulpd %xmm6, %xmm3 + addpd %xmm3, %xmm1 + movsd -10 * SIZE(A1), %xmm3 + movhpd -9 * SIZE(A1), %xmm3 + + movapd %xmm0, -16 * SIZE(Y1) + movapd -12 * SIZE(Y1), %xmm0 + + movapd %xmm1, -14 * SIZE(Y1) + movapd -10 * SIZE(Y1), %xmm1 + + mulpd %xmm6, %xmm2 + addpd %xmm2, %xmm0 + movapd %xmm0, -12 * SIZE(Y1) + mulpd %xmm6, %xmm3 + addpd %xmm3, %xmm1 + movapd %xmm1, -10 * SIZE(Y1) + + subl $-8 * SIZE, A1 + subl $-8 * SIZE, Y1 + ALIGN_3 + +.L25: + testl $4, M + je .L26 + + movsd -16 * SIZE(A1), %xmm2 + movhpd -15 * SIZE(A1), %xmm2 + movsd -14 * SIZE(A1), %xmm3 + movhpd -13 * SIZE(A1), %xmm3 + + movapd -16 * SIZE(Y1), %xmm0 + movapd -14 * SIZE(Y1), %xmm1 + + mulpd %xmm6, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm6, %xmm3 + addpd %xmm3, %xmm1 + + movapd %xmm0, -16 * SIZE(Y1) + movapd %xmm1, -14 * SIZE(Y1) + + addl $4 * SIZE, A1 + addl $4 * SIZE, Y1 + ALIGN_3 + +.L26: + testl $2, M + je .L27 + + movsd -16 * SIZE(A1), %xmm2 + movhpd -15 * SIZE(A1), %xmm2 + + movapd -16 * SIZE(Y1), %xmm0 + + mulpd %xmm6, %xmm2 + addpd %xmm2, %xmm0 + + movapd %xmm0, -16 * SIZE(Y1) + + addl $2 * SIZE, A1 + addl $2 * SIZE, Y1 + ALIGN_3 + +.L27: + testl $1, M + je .L990 + + movsd -16 * SIZE(A1), %xmm2 + movsd -16 * SIZE(Y1), %xmm0 + + mulsd %xmm6, %xmm2 + addsd %xmm2, %xmm0 + + movsd %xmm0, -16 * SIZE(Y1) + ALIGN_3 + +.L990: + movl Y, Y1 + movl BUFFER, X + + movl STACK_INCY, INCY + sall $BASE_SHIFT, INCY + + movl M, %eax + sarl $3, %eax + jle .L994 + ALIGN_3 + +.L992: + movsd (Y1), %xmm0 + movhpd (Y1, INCY), %xmm0 + + addpd 0 * SIZE(X), %xmm0 + + movlpd %xmm0, (Y1) + movhpd %xmm0, (Y1, INCY) + leal (Y1, INCY, 2), Y1 + + movsd (Y1), %xmm0 + movhpd (Y1, INCY), %xmm0 + + addpd 2 * SIZE(X), %xmm0 + + movlpd %xmm0, (Y1) + movhpd %xmm0, (Y1, INCY) + leal (Y1, INCY, 2), Y1 + + movsd (Y1), %xmm0 + movhpd (Y1, INCY), %xmm0 + + addpd 4 * SIZE(X), %xmm0 + + movlpd %xmm0, (Y1) + movhpd %xmm0, (Y1, INCY) + leal (Y1, INCY, 2), Y1 + + movsd (Y1), %xmm0 + movhpd (Y1, INCY), %xmm0 + + addpd 6 * SIZE(X), %xmm0 + + movlpd %xmm0, (Y1) + movhpd %xmm0, (Y1, INCY) + leal (Y1, INCY, 2), Y1 + + addl $8 * SIZE, X + decl %eax + jg .L992 + ALIGN_3 + +.L994: + testl $7, M + jle .L999 + + testl $4, M + jle .L995 + + movsd (Y1), %xmm0 + movhpd (Y1, INCY), %xmm0 + + addpd 0 * SIZE(X), %xmm0 + + movlpd %xmm0, (Y1) + movhpd %xmm0, (Y1, INCY) + leal (Y1, INCY, 2), Y1 + + movsd (Y1), %xmm0 + movhpd (Y1, INCY), %xmm0 + + addpd 2 * SIZE(X), %xmm0 + + movlpd %xmm0, (Y1) + movhpd %xmm0, (Y1, INCY) + leal (Y1, INCY, 2), Y1 + + addl $4 * SIZE, X + ALIGN_3 + +.L995: + testl $2, M + jle .L996 + + movsd (Y1), %xmm0 + 
movhpd (Y1, INCY), %xmm0 + + addpd 0 * SIZE(X), %xmm0 + + movlpd %xmm0, (Y1) + movhpd %xmm0, (Y1, INCY) + leal (Y1, INCY, 2), Y1 + + addl $2 * SIZE, X + ALIGN_3 + +.L996: + testl $1, M + jle .L999 + + movsd (Y1), %xmm0 + + movsd 0 * SIZE(X), %xmm4 + + addsd %xmm4, %xmm0 + + movlpd %xmm0, (Y1) + ALIGN_3 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/gemv_t.S b/kernel/x86/gemv_t.S new file mode 100644 index 0000000000..2eecd3fff1 --- /dev/null +++ b/kernel/x86/gemv_t.S @@ -0,0 +1,583 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef PENTIUM +#define P 88 +#endif + +#ifndef P +#define P 1000 +#endif + +#define STACK 16 +#define ARGS 24 + +#define NLDA 0 + STACK(%esp) +#define XP 4 + STACK(%esp) +#define MIN_M 8 + STACK(%esp) +#define J 12 + STACK(%esp) +#define IS 16 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#ifdef DOUBLE +#define A 24 + STACK + ARGS(%esp) +#define LDA 28 + STACK + ARGS(%esp) +#define X 32 + STACK + ARGS(%esp) +#define INCX 36 + STACK + ARGS(%esp) +#define Y 40 + STACK + ARGS(%esp) +#define INCY 44 + STACK + ARGS(%esp) +#define BUFFER 48 + STACK + ARGS(%esp) +#else +#define A 20 + STACK + ARGS(%esp) +#define LDA 24 + STACK + ARGS(%esp) +#define X 28 + STACK + ARGS(%esp) +#define INCX 32 + STACK + ARGS(%esp) +#define Y 36 + STACK + ARGS(%esp) +#define INCY 40 + STACK + ARGS(%esp) +#define BUFFER 44 + STACK + ARGS(%esp) +#endif + + PROLOGUE + + subl $ARGS, %esp + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + FLD ALPHA + + movl X, %edi # X + + movl $0, IS + + movl M, %ebx + movl N, %eax + + testl %ebx, %ebx + jle .L79 + testl %eax, %eax + jle .L79 + + movl INCX, %esi + leal (,%esi,SIZE), %esi + movl %esi, INCX + + movl INCY, %esi + leal (, %esi, SIZE), %esi + movl %esi, INCY + + movl LDA, %ebx + + imull %ebx, %eax + movl $P, %esi + subl %eax, %esi + leal (, %esi, SIZE), %esi + movl %esi, NLDA + + leal (,%ebx,SIZE), %esi + movl %esi, LDA + ALIGN_2 + +.L32: + movl IS, %esi + + movl $P, %edx + movl M, %eax + subl %esi, %eax + cmpl %edx, %eax +#ifdef PENTIUM + jle .L33 + movl %edx, %eax +.L33: +#else + cmovg %edx, %eax +#endif + movl %eax, MIN_M + + movl IS, %ecx + leal (%edi,%ecx,SIZE), %ecx # xp = x + is + movl INCX, %ebx + movl %ecx, XP + cmpl $SIZE, %ebx + je .L34 + + movl BUFFER, %esi + movl MIN_M, %ecx + movl %esi, XP + sarl $2, %ecx + jle .L35 + + ALIGN_3 + +.L36: + FLD (%edi) + addl %ebx, %edi + FST 0 * SIZE(%esi) + + FLD (%edi) + addl %ebx, %edi + FST 1 * SIZE(%esi) + + FLD (%edi) + addl %ebx, %edi + FST 2 * SIZE(%esi) + + FLD (%edi) + addl %ebx, %edi + FST 3 * SIZE(%esi) + + addl $4 * SIZE, %esi + decl %ecx + jg .L36 + ALIGN_3 + +.L35: + movl MIN_M, %ecx + andl $3,%ecx + jle .L34 + ALIGN_2 + +.L42: + FLD (%edi) + addl %ebx, %edi + FST (%esi) + addl $SIZE, %esi + decl %ecx + jg .L42 + ALIGN_3 + +/* Main Routine */ + +.L34: + movl Y, %ebp # coffset = y + + movl N, %esi + sarl $2, %esi + movl %esi, J + jle .L47 + ALIGN_3 + +.L48: + movl A, %ebx # a_offset = a + fldz + movl LDA, %edx + fldz + + leal (%ebx, %edx), %ecx # a_offset2 = a + lda + fldz + leal (%ebx, %edx, 4), %eax + fldz + + movl %eax, A + movl XP, %esi + FLD (%esi) + + movl MIN_M, %eax + sarl $2,%eax + jle .L51 + ALIGN_3 + +#define PRESIZE 8 + +.L80: +#ifdef PENTIUM3 + prefetcht0 PRESIZE * SIZE(%ebx, %edx, 2) + FLD 0 * SIZE(%ebx) # at = *(a_offset + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + + prefetcht0 PRESIZE * SIZE(%ecx) + faddp %st,%st(2) # ct1 += at1 + FLD 0 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda) + + prefetcht0 PRESIZE * SIZE(%ecx, %edx, 2) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(3) # ct2 += at1 + + prefetcht0 PRESIZE * SIZE(%ebx) + FLD 0 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda) + fmul %st(1),%st + + faddp %st,%st(4) + FLD 0 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda) + fmulp %st, %st(1) + + faddp %st,%st(4) + FLD 1 * SIZE(%esi) + FLD 1 * 
SIZE(%ebx) # at = *(a_offset + 0 * lda) + + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(2) # ct1 += at1 + FLD 1 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda) + + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(3) # ct2 += at1 + FLD 1 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda) + + fmul %st(1),%st + faddp %st,%st(4) + FLD 1 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda) + + fmulp %st, %st(1) + faddp %st,%st(4) + FLD 2 * SIZE(%esi) + + FLD 2 * SIZE(%ebx) # at = *(a_offset + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(2) # ct1 += at1 + + FLD 2 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(3) # ct2 += at1 + + FLD 2 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda) + fmul %st(1),%st + faddp %st,%st(4) + + FLD 2 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda) + fmulp %st, %st(1) + faddp %st,%st(4) + + FLD 3 * SIZE(%esi) + FLD 3 * SIZE(%ebx) # at = *(a_offset + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + + faddp %st,%st(2) # ct1 += at1 + FLD 3 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + + faddp %st,%st(3) # ct2 += at1 + FLD 3 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda) + fmul %st(1),%st + + faddp %st,%st(4) + FLD 3 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda) + fmulp %st, %st(1) + + addl $4 * SIZE, %ebx + faddp %st,%st(4) + addl $4 * SIZE, %ecx + + FLD 4 * SIZE(%esi) + addl $4 * SIZE, %esi + +#else + +#if defined(HAS_PREFETCH) + prefetcht0 PRESIZE * SIZE(%ebx) + prefetcht0 PRESIZE * SIZE(%ebx, %edx, 2) + prefetcht0 PRESIZE * SIZE(%ecx) + prefetcht0 PRESIZE * SIZE(%ecx, %edx, 2) +#endif + + FLD 0 * SIZE(%ebx) # at = *(a_offset + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(2) # ct1 += at1 + + FLD 0 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(3) # ct2 += at1 + + FLD 0 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda) + fmul %st(1),%st + faddp %st,%st(4) + + FMUL 0 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda) + faddp %st,%st(4) + FLD 1 * SIZE(%esi) + + FLD 1 * SIZE(%ebx) # at = *(a_offset + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(2) # ct1 += at1 + + FLD 1 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(3) # ct2 += at1 + + FLD 1 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda) + fmul %st(1),%st + faddp %st,%st(4) + + FMUL 1 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda) + faddp %st,%st(4) + FLD 2 * SIZE(%esi) + + FLD 2 * SIZE(%ebx) # at = *(a_offset + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(2) # ct1 += at1 + + FLD 2 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(3) # ct2 += at1 + + FLD 2 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda) + fmul %st(1),%st + faddp %st,%st(4) + + FMUL 2 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda) + faddp %st,%st(4) + FLD 3 * SIZE(%esi) + + FLD 3 * SIZE(%ebx) # at = *(a_offset + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(2) # ct1 += at1 + + FLD 3 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(3) # ct2 += at1 + + FLD 3 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda) + fmul %st(1),%st + faddp %st,%st(4) + + FMUL 3 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda) + faddp %st,%st(4) + FLD 4 * SIZE(%esi) + + addl $4 * SIZE, %ebx + addl $4 * SIZE, %ecx + addl $4 * SIZE, %esi +#endif + + decl %eax + jg .L80 + ALIGN_3 + +.L51: + movl MIN_M, %eax + andl $3, %eax + je .L81 + ALIGN_3 + +.L52: + + FLD 
(%ebx) # at = *(a_offset + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(2) # ct1 += at1 + + FLD (%ecx) # at1 = *(a_offset2 + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(3) # ct2 += at1 + + FLD (%ebx, %edx, 2) # at = *(a_offset + 2 * lda) + fmul %st(1),%st + faddp %st,%st(4) + + FMUL (%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda) + faddp %st,%st(4) + FLD 1 * SIZE(%esi) + + addl $SIZE, %ebx + addl $SIZE, %ecx + addl $SIZE, %esi + decl %eax + jg .L52 + ALIGN_3 + +.L81: +#ifndef C_SUN + ffreep %st(0) +#else + .byte 0xdf + .byte 0xc0 +#endif + + fxch %st(4) + fmul %st, %st(4) + fmul %st, %st(1) + fmul %st, %st(2) + fmul %st, %st(3) + fxch %st(4) + + movl INCY, %eax + + FADD (%ebp) + FST (%ebp) + addl %eax, %ebp + + FADD (%ebp) + FST (%ebp) + addl %eax, %ebp + + FADD (%ebp) + FST (%ebp) + addl %eax, %ebp + + FADD (%ebp) + FST (%ebp) + addl %eax, %ebp + + decl J + jg .L48 + ALIGN_3 + +.L47: + movl N, %esi + andl $3,%esi + movl %esi, J + jle .L60 + ALIGN_2 + +.L61: + movl A, %ebx # a_offset = a + fldz # ct1 = ZERO + movl LDA, %edx + fldz # ct1 = ZERO + + addl %ebx, %edx + fldz # ct1 = ZERO + movl %edx, A + fldz # ct1 = ZERO + + movl XP, %esi + + movl MIN_M, %eax + sarl $3,%eax + jle .L64 + ALIGN_3 + +.L65: +#ifdef HAS_PREFETCH + prefetcht0 PRESIZE * 2 * SIZE(%ebx) + prefetcht0 PRESIZE * 2 * SIZE(%ebx) +#endif + + FLD 0 * SIZE(%esi) + FMUL 0 * SIZE(%ebx) + faddp %st,%st(1) + + FLD 1 * SIZE(%esi) + FMUL 1 * SIZE(%ebx) + faddp %st,%st(2) + + FLD 2 * SIZE(%esi) + FMUL 2 * SIZE(%ebx) + faddp %st,%st(3) + + FLD 3 * SIZE(%esi) + FMUL 3 * SIZE(%ebx) + faddp %st,%st(4) + + FLD 4 * SIZE(%esi) + FMUL 4 * SIZE(%ebx) + faddp %st,%st(1) + + FLD 5 * SIZE(%esi) + FMUL 5 * SIZE(%ebx) + faddp %st,%st(2) + + FLD 6 * SIZE(%esi) + FMUL 6 * SIZE(%ebx) + faddp %st,%st(3) + + FLD 7 * SIZE(%esi) + FMUL 7 * SIZE(%ebx) + faddp %st,%st(4) + + addl $8 * SIZE, %esi + addl $8 * SIZE, %ebx + + decl %eax + jg .L65 + ALIGN_3 + +.L64: + movl MIN_M, %eax + andl $7, %eax + jle .L70 + ALIGN_3 + +.L71: + FLD (%esi) + FMUL (%ebx) + faddp %st,%st(1) + + addl $SIZE, %esi + addl $SIZE, %ebx + decl %eax + jg .L71 + ALIGN_3 + +.L70: + faddp %st, %st(1) + faddp %st, %st(1) + faddp %st, %st(1) + + fmul %st(1),%st + FADD (%ebp) + FST (%ebp) + addl INCY, %ebp + decl J + jg .L61 + ALIGN_3 + +.L60: + movl A, %ebx + addl NLDA, %ebx + movl %ebx, A + + addl $P, IS + movl M, %esi + cmpl %esi, IS + jl .L32 + ALIGN_3 + +.L79: +#ifndef C_SUN + ffreep %st(0) +#else + .byte 0xdf + .byte 0xc0 +#endif + + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/gemv_t_atom.S b/kernel/x86/gemv_t_atom.S new file mode 100644 index 0000000000..a21416d49f --- /dev/null +++ b/kernel/x86/gemv_t_atom.S @@ -0,0 +1,616 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef ATOM +#define PREFETCH prefetchnta +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (8 * 6) +#endif + +#define STACKSIZE 16 + +#define M 4 + STACKSIZE(%esp) +#define N 8 + STACKSIZE(%esp) +#define ALPHA 16 + STACKSIZE(%esp) +#define A 24 + STACKSIZE(%esp) +#define STACK_LDA 28 + STACKSIZE(%esp) +#define STACK_X 32 + STACKSIZE(%esp) +#define STACK_INCX 36 + STACKSIZE(%esp) +#define Y 40 + STACKSIZE(%esp) +#define STACK_INCY 44 + STACKSIZE(%esp) +#define BUFFER 48 + STACKSIZE(%esp) + +#define I %eax +#define J %ebx + +#define INCX J +#define INCY %ecx + +#define A1 %esi +#define X %edx +#define Y1 %edi +#define LDA %ebp + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_LDA, LDA + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_INCY, INCY + + leal (,INCX, SIZE), INCX + leal (,INCY, SIZE), INCY + leal (,LDA, SIZE), LDA + + subl $-16 * SIZE, A + + cmpl $0, N + jle .L999 + cmpl $0, M + jle .L999 + + movl BUFFER, Y1 + + movl M, I + sarl $3, I + jle .L05 + ALIGN_4 + +.L02: + movsd (X), %xmm0 + addl INCX, X + movhpd (X), %xmm0 + addl INCX, X + + movsd (X), %xmm1 + addl INCX, X + movhpd (X), %xmm1 + addl INCX, X + + movsd (X), %xmm2 + addl INCX, X + movhpd (X), %xmm2 + addl INCX, X + + movsd (X), %xmm3 + addl INCX, X + movhpd (X), %xmm3 + addl INCX, X + + movapd %xmm0, 0 * SIZE(Y1) + movapd %xmm1, 2 * SIZE(Y1) + movapd %xmm2, 4 * SIZE(Y1) + movapd %xmm3, 6 * SIZE(Y1) + + addl $8 * SIZE, Y1 + decl I + jg .L02 + ALIGN_4 + +.L05: + movl M, I + andl $7, I + jle .L10 + ALIGN_2 + +.L06: + movsd (X), %xmm0 + addl INCX, X + movsd %xmm0, 0 * SIZE(Y1) + addl $SIZE, Y1 + decl I + jg .L06 + ALIGN_4 + +.L10: + movl Y, Y1 + + movl N, J + sarl $1, J + jle .L20 + ALIGN_3 + +.L11: + movl BUFFER, X + addl $16 * SIZE, X + + movl A, A1 + leal (A1, LDA, 2), %eax + movl %eax, A + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + + movsd -16 * SIZE(X), %xmm2 + movsd -15 * SIZE(X), %xmm3 + + movl M, I + sarl $3, I + jle .L15 + + movsd -16 * SIZE(A1), %xmm4 + movsd -16 * SIZE(A1, LDA), %xmm5 + movsd -15 * SIZE(A1), %xmm6 + movsd -15 * SIZE(A1, LDA), %xmm7 + + mulsd %xmm2, %xmm4 + mulsd %xmm2, %xmm5 + movsd -14 * SIZE(X), %xmm2 + + decl I + jle .L13 + ALIGN_4 + +.L12: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(A1) +#endif + + mulsd 
%xmm3, %xmm6 + addsd %xmm4, %xmm0 + movsd -14 * SIZE(A1), %xmm4 + mulsd %xmm3, %xmm7 + movsd -13 * SIZE(X), %xmm3 + addsd %xmm5, %xmm1 + movsd -14 * SIZE(A1, LDA), %xmm5 + + mulsd %xmm2, %xmm4 + addsd %xmm6, %xmm0 + movsd -13 * SIZE(A1), %xmm6 + mulsd %xmm2, %xmm5 + movsd -12 * SIZE(X), %xmm2 + addsd %xmm7, %xmm1 + movsd -13 * SIZE(A1, LDA), %xmm7 + + mulsd %xmm3, %xmm6 + addsd %xmm4, %xmm0 + movsd -12 * SIZE(A1), %xmm4 + mulsd %xmm3, %xmm7 + movsd -11 * SIZE(X), %xmm3 + addsd %xmm5, %xmm1 + movsd -12 * SIZE(A1, LDA), %xmm5 + + mulsd %xmm2, %xmm4 + addsd %xmm6, %xmm0 + movsd -11 * SIZE(A1), %xmm6 + mulsd %xmm2, %xmm5 + movsd -10 * SIZE(X), %xmm2 + addsd %xmm7, %xmm1 + movsd -11 * SIZE(A1, LDA), %xmm7 + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(A1, LDA) +#endif + + mulsd %xmm3, %xmm6 + addsd %xmm4, %xmm0 + movsd -10 * SIZE(A1), %xmm4 + mulsd %xmm3, %xmm7 + movsd -9 * SIZE(X), %xmm3 + addsd %xmm5, %xmm1 + movsd -10 * SIZE(A1, LDA), %xmm5 + + mulsd %xmm2, %xmm4 + addsd %xmm6, %xmm0 + movsd -9 * SIZE(A1), %xmm6 + mulsd %xmm2, %xmm5 + movsd -8 * SIZE(X), %xmm2 + addsd %xmm7, %xmm1 + movsd -9 * SIZE(A1, LDA), %xmm7 + + mulsd %xmm3, %xmm6 + addsd %xmm4, %xmm0 + movsd -8 * SIZE(A1), %xmm4 + mulsd %xmm3, %xmm7 + movsd -7 * SIZE(X), %xmm3 + addsd %xmm5, %xmm1 + movsd -8 * SIZE(A1, LDA), %xmm5 + + mulsd %xmm2, %xmm4 + addsd %xmm6, %xmm0 + movsd -7 * SIZE(A1), %xmm6 + mulsd %xmm2, %xmm5 + movsd -6 * SIZE(X), %xmm2 + addsd %xmm7, %xmm1 + movsd -7 * SIZE(A1, LDA), %xmm7 + + addl $8 * SIZE, A1 + addl $8 * SIZE, X + + decl I + jg .L12 + ALIGN_4 + +.L13: + mulsd %xmm3, %xmm6 + addsd %xmm4, %xmm0 + movsd -14 * SIZE(A1), %xmm4 + mulsd %xmm3, %xmm7 + movsd -13 * SIZE(X), %xmm3 + addsd %xmm5, %xmm1 + movsd -14 * SIZE(A1, LDA), %xmm5 + + mulsd %xmm2, %xmm4 + addsd %xmm6, %xmm0 + movsd -13 * SIZE(A1), %xmm6 + mulsd %xmm2, %xmm5 + movsd -12 * SIZE(X), %xmm2 + addsd %xmm7, %xmm1 + movsd -13 * SIZE(A1, LDA), %xmm7 + + mulsd %xmm3, %xmm6 + addsd %xmm4, %xmm0 + movsd -12 * SIZE(A1), %xmm4 + mulsd %xmm3, %xmm7 + movsd -11 * SIZE(X), %xmm3 + addsd %xmm5, %xmm1 + movsd -12 * SIZE(A1, LDA), %xmm5 + + mulsd %xmm2, %xmm4 + addsd %xmm6, %xmm0 + movsd -11 * SIZE(A1), %xmm6 + mulsd %xmm2, %xmm5 + movsd -10 * SIZE(X), %xmm2 + addsd %xmm7, %xmm1 + movsd -11 * SIZE(A1, LDA), %xmm7 + + mulsd %xmm3, %xmm6 + addsd %xmm4, %xmm0 + movsd -10 * SIZE(A1), %xmm4 + mulsd %xmm3, %xmm7 + movsd -9 * SIZE(X), %xmm3 + addsd %xmm5, %xmm1 + movsd -10 * SIZE(A1, LDA), %xmm5 + + mulsd %xmm2, %xmm4 + addsd %xmm6, %xmm0 + movsd -9 * SIZE(A1), %xmm6 + mulsd %xmm2, %xmm5 + movsd -8 * SIZE(X), %xmm2 + addsd %xmm7, %xmm1 + movsd -9 * SIZE(A1, LDA), %xmm7 + + mulsd %xmm3, %xmm6 + addsd %xmm4, %xmm0 + mulsd %xmm3, %xmm7 + movsd -7 * SIZE(X), %xmm3 + addsd %xmm5, %xmm1 + + addsd %xmm6, %xmm0 + addl $8 * SIZE, A1 + addsd %xmm7, %xmm1 + addl $8 * SIZE, X + ALIGN_4 + +.L15: + testl $4, M + jle .L16 + + movsd -16 * SIZE(A1), %xmm4 + movsd -16 * SIZE(A1, LDA), %xmm5 + + movsd -15 * SIZE(A1), %xmm6 + movsd -15 * SIZE(A1, LDA), %xmm7 + + mulsd %xmm2, %xmm4 + mulsd %xmm2, %xmm5 + movsd -14 * SIZE(X), %xmm2 + + mulsd %xmm3, %xmm6 + addsd %xmm4, %xmm0 + movsd -14 * SIZE(A1), %xmm4 + mulsd %xmm3, %xmm7 + movsd -13 * SIZE(X), %xmm3 + addsd %xmm5, %xmm1 + movsd -14 * SIZE(A1, LDA), %xmm5 + + mulsd %xmm2, %xmm4 + addsd %xmm6, %xmm0 + movsd -13 * SIZE(A1), %xmm6 + mulsd %xmm2, %xmm5 + movsd -12 * SIZE(X), %xmm2 + addsd %xmm7, %xmm1 + movsd -13 * SIZE(A1, LDA), %xmm7 + + mulsd %xmm3, %xmm6 + addsd %xmm4, %xmm0 + mulsd %xmm3, %xmm7 + movsd -11 * SIZE(X), %xmm3 + 
addsd %xmm5, %xmm1 + + addsd %xmm6, %xmm0 + addsd %xmm7, %xmm1 + + addl $4 * SIZE, A1 + addl $4 * SIZE, X + ALIGN_4 + +.L16: + testl $2, M + jle .L17 + + movsd -16 * SIZE(A1), %xmm4 + movsd -16 * SIZE(A1, LDA), %xmm5 + + movsd -15 * SIZE(A1), %xmm6 + movsd -15 * SIZE(A1, LDA), %xmm7 + + mulsd %xmm2, %xmm4 + mulsd %xmm2, %xmm5 + movsd -14 * SIZE(X), %xmm2 + + mulsd %xmm3, %xmm6 + addsd %xmm4, %xmm0 + mulsd %xmm3, %xmm7 + addsd %xmm5, %xmm1 + + addsd %xmm6, %xmm0 + addsd %xmm7, %xmm1 + + addl $2 * SIZE, A1 + ALIGN_4 + +.L17: + testl $1, M + jle .L18 + + movsd -16 * SIZE(A1), %xmm4 + movsd -16 * SIZE(A1, LDA), %xmm5 + + mulsd %xmm2, %xmm4 + mulsd %xmm2, %xmm5 + + addsd %xmm4, %xmm0 + addsd %xmm5, %xmm1 + ALIGN_4 + +.L18: + movsd ALPHA, %xmm7 + + mulpd %xmm7, %xmm0 + mulpd %xmm7, %xmm1 + + addsd (Y1), %xmm0 + addsd (Y1, INCY), %xmm1 + + movsd %xmm0, (Y1) + movsd %xmm1, (Y1, INCY) + leal (Y1, INCY, 2), Y1 + + decl J + jg .L11 + ALIGN_4 + +.L20: + testl $1, N + jle .L999 + + movl BUFFER, X + addl $16 * SIZE, X + + movl A, A1 + leal (A1, LDA, 2), %eax + movl %eax, A + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + + movsd -16 * SIZE(X), %xmm2 + movsd -15 * SIZE(X), %xmm3 + + movl M, I + sarl $3, I + jle .L25 + + movsd -16 * SIZE(A1), %xmm4 + movsd -15 * SIZE(A1), %xmm5 + movsd -14 * SIZE(A1), %xmm6 + movsd -13 * SIZE(A1), %xmm7 + + mulsd %xmm2, %xmm4 + movsd -14 * SIZE(X), %xmm2 + mulsd %xmm3, %xmm5 + movsd -13 * SIZE(X), %xmm3 + + decl I + jle .L23 + ALIGN_4 + +.L22: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(A1) +#endif + + mulsd %xmm2, %xmm6 + movsd -12 * SIZE(X), %xmm2 + addsd %xmm4, %xmm0 + movsd -12 * SIZE(A1), %xmm4 + mulsd %xmm3, %xmm7 + movsd -11 * SIZE(X), %xmm3 + addsd %xmm5, %xmm1 + movsd -11 * SIZE(A1), %xmm5 + + addsd %xmm6, %xmm0 + movsd -10 * SIZE(A1), %xmm6 + mulsd %xmm2, %xmm4 + movsd -10 * SIZE(X), %xmm2 + addsd %xmm7, %xmm1 + movsd -9 * SIZE(A1), %xmm7 + mulsd %xmm3, %xmm5 + movsd -9 * SIZE(X), %xmm3 + + mulsd %xmm2, %xmm6 + movsd -8 * SIZE(X), %xmm2 + addsd %xmm4, %xmm0 + movsd -8 * SIZE(A1), %xmm4 + mulsd %xmm3, %xmm7 + movsd -7 * SIZE(X), %xmm3 + addsd %xmm5, %xmm1 + movsd -7 * SIZE(A1), %xmm5 + + addsd %xmm6, %xmm0 + movsd -6 * SIZE(A1), %xmm6 + mulsd %xmm2, %xmm4 + movsd -6 * SIZE(X), %xmm2 + addsd %xmm7, %xmm1 + movsd -5 * SIZE(A1), %xmm7 + mulsd %xmm3, %xmm5 + movsd -5 * SIZE(X), %xmm3 + + addl $8 * SIZE, A1 + addl $8 * SIZE, X + + decl I + jg .L22 + ALIGN_4 + +.L23: + mulsd %xmm2, %xmm6 + movsd -12 * SIZE(X), %xmm2 + addsd %xmm4, %xmm0 + movsd -12 * SIZE(A1), %xmm4 + mulsd %xmm3, %xmm7 + movsd -11 * SIZE(X), %xmm3 + addsd %xmm5, %xmm1 + movsd -11 * SIZE(A1), %xmm5 + + addsd %xmm6, %xmm0 + movsd -10 * SIZE(A1), %xmm6 + mulsd %xmm2, %xmm4 + movsd -10 * SIZE(X), %xmm2 + addsd %xmm7, %xmm1 + movsd -9 * SIZE(A1), %xmm7 + mulsd %xmm3, %xmm5 + movsd -9 * SIZE(X), %xmm3 + + mulsd %xmm2, %xmm6 + movsd -8 * SIZE(X), %xmm2 + addsd %xmm4, %xmm0 + mulsd %xmm3, %xmm7 + movsd -7 * SIZE(X), %xmm3 + addsd %xmm5, %xmm1 + + addsd %xmm6, %xmm0 + addsd %xmm7, %xmm1 + + addl $8 * SIZE, A1 + addl $8 * SIZE, X + ALIGN_4 + +.L25: + testl $4, M + jle .L26 + + movsd -16 * SIZE(A1), %xmm4 + movsd -15 * SIZE(A1), %xmm5 + movsd -14 * SIZE(A1), %xmm6 + movsd -13 * SIZE(A1), %xmm7 + + mulsd %xmm2, %xmm4 + movsd -14 * SIZE(X), %xmm2 + mulsd %xmm3, %xmm5 + movsd -13 * SIZE(X), %xmm3 + + mulsd %xmm2, %xmm6 + movsd -12 * SIZE(X), %xmm2 + addsd %xmm4, %xmm0 + mulsd %xmm3, %xmm7 + movsd -11 * SIZE(X), %xmm3 + addsd %xmm5, %xmm1 + + addsd %xmm6, %xmm0 + addsd %xmm7, %xmm1 + + addl $4 * SIZE, A1 + addl $4 
* SIZE, X + ALIGN_4 + +.L26: + testl $2, M + jle .L27 + + movsd -16 * SIZE(A1), %xmm4 + movsd -15 * SIZE(A1), %xmm5 + + mulsd %xmm2, %xmm4 + movsd -14 * SIZE(X), %xmm2 + mulsd %xmm3, %xmm5 + addsd %xmm4, %xmm0 + addsd %xmm5, %xmm1 + + addl $2 * SIZE, A1 + ALIGN_4 + +.L27: + testl $1, M + jle .L28 + + movsd -16 * SIZE(A1), %xmm4 + mulsd %xmm2, %xmm4 + addsd %xmm4, %xmm0 + ALIGN_4 + +.L28: + movsd ALPHA, %xmm7 + addsd %xmm1, %xmm0 + + mulpd %xmm7, %xmm0 + + addsd (Y1), %xmm0 + + movsd %xmm0, (Y1) + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/gemv_t_sse.S b/kernel/x86/gemv_t_sse.S new file mode 100644 index 0000000000..a4990116dd --- /dev/null +++ b/kernel/x86/gemv_t_sse.S @@ -0,0 +1,637 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef movsd +#undef movsd +#endif + +#ifdef PENTIUM3 +#ifdef HAVE_SSE +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 2) +#endif +#define movsd movlps +#endif + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 4) +#endif + +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 7) +#endif + +#ifdef OPTERON +#define PREFETCH prefetchnta +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 3) +#define movsd movlps +#endif + +#ifdef BARCELONA +#define PREFETCH prefetchnta +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 5) +#endif + +#ifdef ATOM +#define PREFETCH prefetchnta +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 6) +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (16 * 4) +#endif + +#define STACKSIZE 16 + +#define M 4 + STACKSIZE(%esp) +#define N 8 + STACKSIZE(%esp) +#define ALPHA 16 + STACKSIZE(%esp) +#define A 20 + STACKSIZE(%esp) +#define STACK_LDA 24 + STACKSIZE(%esp) +#define STACK_X 28 + STACKSIZE(%esp) +#define STACK_INCX 32 + STACKSIZE(%esp) +#define Y 36 + STACKSIZE(%esp) +#define STACK_INCY 40 + STACKSIZE(%esp) +#define BUFFER 44 + STACKSIZE(%esp) + +#define I %eax +#define J %ebx + +#define INCX J +#define INCY %ecx + +#define A1 %esi +#define X %edx +#define Y1 %edi +#define LDA %ebp + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_LDA, LDA + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_INCY, INCY + + leal (,INCX, SIZE), INCX + leal (,INCY, SIZE), INCY + leal (,LDA, SIZE), LDA + + subl $-32 * SIZE, A + + cmpl $0, N + jle .L999 + cmpl $0, M + jle .L999 + + movl BUFFER, Y1 + + movl M, I + sarl $3, I + jle .L05 + ALIGN_4 + +.L02: + movss (X), %xmm0 + addl INCX, X + movss (X), %xmm1 + addl INCX, X + + unpcklps %xmm1, %xmm0 + + movss (X), %xmm2 + addl INCX, X + movss (X), %xmm3 + addl INCX, X + + unpcklps %xmm3, %xmm2 + + movss (X), %xmm4 + addl INCX, X + movss (X), %xmm5 + addl INCX, X + + unpcklps %xmm5, %xmm4 + + movss (X), %xmm6 + addl INCX, X + movss (X), %xmm7 + addl INCX, X + + unpcklps %xmm7, %xmm6 + + movlps %xmm0, 0 * SIZE(Y1) + movlps %xmm2, 2 * SIZE(Y1) + movlps %xmm4, 4 * SIZE(Y1) + movlps %xmm6, 6 * SIZE(Y1) + + addl $8 * SIZE, Y1 + decl I + jg .L02 + ALIGN_4 + +.L05: + movl M, I + andl $7, I + jle .L10 + ALIGN_2 + +.L06: + movss (X), %xmm0 + addl INCX, X + movss %xmm0, 0 * SIZE(Y1) + addl $SIZE, Y1 + decl I + jg .L06 + ALIGN_4 + +.L10: + movl Y, Y1 + + movl N, J + sarl $1, J + jle .L20 + ALIGN_3 + +.L11: + movl BUFFER, X + addl $32 * SIZE, X + + movl A, A1 + leal (A1, LDA, 2), %eax + movl %eax, A + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + + movaps -32 * SIZE(X), %xmm2 + movaps -28 * SIZE(X), %xmm3 + + movl M, I + sarl $4, I + jle .L15 + + movsd -32 * SIZE(A1), %xmm4 + movhps -30 * SIZE(A1), %xmm4 + movsd -32 * SIZE(A1, LDA), %xmm5 + movhps -30 * SIZE(A1, LDA), %xmm5 + + movsd -28 * SIZE(A1), %xmm6 + movhps -26 * SIZE(A1), %xmm6 + movsd -28 * SIZE(A1, LDA), %xmm7 + movhps -26 * SIZE(A1, LDA), %xmm7 + + decl I + jle .L13 + ALIGN_4 + +.L12: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(A1) +#endif + + mulps %xmm2, %xmm4 + addps %xmm4, %xmm0 + movsd -24 * SIZE(A1), %xmm4 + movhps -22 * SIZE(A1), %xmm4 + mulps %xmm2, %xmm5 + movaps -24 * 
SIZE(X), %xmm2 + addps %xmm5, %xmm1 + movsd -24 * SIZE(A1, LDA), %xmm5 + movhps -22 * SIZE(A1, LDA), %xmm5 + + mulps %xmm3, %xmm6 + addps %xmm6, %xmm0 + movsd -20 * SIZE(A1), %xmm6 + movhps -18 * SIZE(A1), %xmm6 + mulps %xmm3, %xmm7 + movaps -20 * SIZE(X), %xmm3 + addps %xmm7, %xmm1 + movsd -20 * SIZE(A1, LDA), %xmm7 + movhps -18 * SIZE(A1, LDA), %xmm7 + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(A1, LDA) +#endif + + mulps %xmm2, %xmm4 + addps %xmm4, %xmm0 + movsd -16 * SIZE(A1), %xmm4 + movhps -14 * SIZE(A1), %xmm4 + mulps %xmm2, %xmm5 + movaps -16 * SIZE(X), %xmm2 + addps %xmm5, %xmm1 + movsd -16 * SIZE(A1, LDA), %xmm5 + movhps -14 * SIZE(A1, LDA), %xmm5 + + mulps %xmm3, %xmm6 + addps %xmm6, %xmm0 + movsd -12 * SIZE(A1), %xmm6 + movhps -10 * SIZE(A1), %xmm6 + mulps %xmm3, %xmm7 + movaps -12 * SIZE(X), %xmm3 + addps %xmm7, %xmm1 + movsd -12 * SIZE(A1, LDA), %xmm7 + movhps -10 * SIZE(A1, LDA), %xmm7 + + addl $16 * SIZE, A1 + addl $16 * SIZE, X + + decl I + jg .L12 + ALIGN_4 + +.L13: + mulps %xmm2, %xmm4 + addps %xmm4, %xmm0 + movsd -24 * SIZE(A1), %xmm4 + movhps -22 * SIZE(A1), %xmm4 + mulps %xmm2, %xmm5 + movaps -24 * SIZE(X), %xmm2 + addps %xmm5, %xmm1 + movsd -24 * SIZE(A1, LDA), %xmm5 + movhps -22 * SIZE(A1, LDA), %xmm5 + + mulps %xmm3, %xmm6 + addps %xmm6, %xmm0 + movsd -20 * SIZE(A1), %xmm6 + movhps -18 * SIZE(A1), %xmm6 + mulps %xmm3, %xmm7 + movaps -20 * SIZE(X), %xmm3 + addps %xmm7, %xmm1 + movsd -20 * SIZE(A1, LDA), %xmm7 + movhps -18 * SIZE(A1, LDA), %xmm7 + + mulps %xmm2, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm2, %xmm5 + movaps -16 * SIZE(X), %xmm2 + addps %xmm5, %xmm1 + + mulps %xmm3, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm3, %xmm7 + movaps -12 * SIZE(X), %xmm3 + addps %xmm7, %xmm1 + + addl $16 * SIZE, A1 + addl $16 * SIZE, X + ALIGN_4 + +.L15: + testl $8, M + jle .L16 + + movsd -32 * SIZE(A1), %xmm4 + movhps -30 * SIZE(A1), %xmm4 + movsd -32 * SIZE(A1, LDA), %xmm5 + movhps -30 * SIZE(A1, LDA), %xmm5 + + movsd -28 * SIZE(A1), %xmm6 + movhps -26 * SIZE(A1), %xmm6 + movsd -28 * SIZE(A1, LDA), %xmm7 + movhps -26 * SIZE(A1, LDA), %xmm7 + + mulps %xmm2, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm2, %xmm5 + movaps -24 * SIZE(X), %xmm2 + addps %xmm5, %xmm1 + + mulps %xmm3, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm3, %xmm7 + movaps -20 * SIZE(X), %xmm3 + addps %xmm7, %xmm1 + + addl $8 * SIZE, A1 + addl $8 * SIZE, X + ALIGN_4 + +.L16: + testl $4, M + jle .L17 + + movsd -32 * SIZE(A1), %xmm4 + movhps -30 * SIZE(A1), %xmm4 + + movsd -32 * SIZE(A1, LDA), %xmm5 + movhps -30 * SIZE(A1, LDA), %xmm5 + + mulps %xmm2, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm2, %xmm5 + addps %xmm5, %xmm1 + movaps %xmm3, %xmm2 + + addl $4 * SIZE, A1 + ALIGN_4 + +.L17: + testl $2, M + jle .L18 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(A1), %xmm4 + +#ifdef movsd + xorps %xmm5, %xmm5 +#endif + movsd -32 * SIZE(A1, LDA), %xmm5 + + mulps %xmm2, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm2, %xmm5 + addps %xmm5, %xmm1 + movhlps %xmm2, %xmm2 + + addl $2 * SIZE, A1 + ALIGN_4 + +.L18: + testl $1, M + jle .L19 + + movss -32 * SIZE(A1), %xmm4 + mulss %xmm2, %xmm4 + addss %xmm4, %xmm0 + movss -32 * SIZE(A1, LDA), %xmm5 + mulss %xmm2, %xmm5 + addss %xmm5, %xmm1 + ALIGN_4 + +.L19: +#ifdef HAVE_SSE3 + haddps %xmm0, %xmm0 + haddps %xmm1, %xmm1 + + haddps %xmm0, %xmm0 + haddps %xmm1, %xmm1 +#else + movhlps %xmm0, %xmm2 + movhlps %xmm1, %xmm3 + + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + + movaps %xmm0, %xmm2 + shufps $1, %xmm0, %xmm0 + movaps %xmm1, %xmm3 + shufps $1, %xmm1, %xmm1 + + addss %xmm2, %xmm0 + addss 
%xmm3, %xmm1 +#endif + + movss ALPHA, %xmm7 + + mulss %xmm7, %xmm0 + mulss %xmm7, %xmm1 + + addss (Y1), %xmm0 + addss (Y1, INCY), %xmm1 + + movss %xmm0, (Y1) + movss %xmm1, (Y1, INCY) + leal (Y1, INCY, 2), Y1 + + decl J + jg .L11 + ALIGN_4 + +.L20: + testl $1, N + jle .L999 + + movl BUFFER, X + addl $32 * SIZE, X + + movl A, A1 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + + movaps -32 * SIZE(X), %xmm2 + movaps -28 * SIZE(X), %xmm3 + + movl M, I + sarl $4, I + jle .L25 + + movsd -32 * SIZE(A1), %xmm4 + movhps -30 * SIZE(A1), %xmm4 + movsd -28 * SIZE(A1), %xmm6 + movhps -26 * SIZE(A1), %xmm6 + + decl I + jle .L23 + ALIGN_4 + +.L22: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(A1) +#endif + + mulps %xmm2, %xmm4 + movaps -24 * SIZE(X), %xmm2 + addps %xmm4, %xmm0 + movsd -24 * SIZE(A1), %xmm4 + movhps -22 * SIZE(A1), %xmm4 + + mulps %xmm3, %xmm6 + movaps -20 * SIZE(X), %xmm3 + addps %xmm6, %xmm0 + movsd -20 * SIZE(A1), %xmm6 + movhps -18 * SIZE(A1), %xmm6 + + mulps %xmm2, %xmm4 + movaps -16 * SIZE(X), %xmm2 + addps %xmm4, %xmm0 + movsd -16 * SIZE(A1), %xmm4 + movhps -14 * SIZE(A1), %xmm4 + + mulps %xmm3, %xmm6 + movaps -12 * SIZE(X), %xmm3 + addps %xmm6, %xmm0 + movsd -12 * SIZE(A1), %xmm6 + movhps -10 * SIZE(A1), %xmm6 + + addl $16 * SIZE, A1 + addl $16 * SIZE, X + + decl I + jg .L22 + ALIGN_4 + +.L23: + mulps %xmm2, %xmm4 + movaps -24 * SIZE(X), %xmm2 + addps %xmm4, %xmm0 + movsd -24 * SIZE(A1), %xmm4 + movhps -22 * SIZE(A1), %xmm4 + + mulps %xmm3, %xmm6 + movaps -20 * SIZE(X), %xmm3 + addps %xmm6, %xmm0 + movsd -20 * SIZE(A1), %xmm6 + movhps -18 * SIZE(A1), %xmm6 + + mulps %xmm2, %xmm4 + movaps -16 * SIZE(X), %xmm2 + addps %xmm4, %xmm0 + + mulps %xmm3, %xmm6 + movaps -12 * SIZE(X), %xmm3 + addps %xmm6, %xmm0 + + addl $16 * SIZE, A1 + addl $16 * SIZE, X + ALIGN_4 + +.L25: + testl $8, M + jle .L26 + + movsd -32 * SIZE(A1), %xmm4 + movhps -30 * SIZE(A1), %xmm4 + movsd -28 * SIZE(A1), %xmm6 + movhps -26 * SIZE(A1), %xmm6 + + mulps %xmm2, %xmm4 + movaps -24 * SIZE(X), %xmm2 + addps %xmm4, %xmm0 + + mulps %xmm3, %xmm6 + movaps -20 * SIZE(X), %xmm3 + addps %xmm6, %xmm0 + + addl $8 * SIZE, A1 + addl $8 * SIZE, X + ALIGN_4 + +.L26: + testl $4, M + jle .L27 + + movsd -32 * SIZE(A1), %xmm4 + movhps -30 * SIZE(A1), %xmm4 + + mulps %xmm2, %xmm4 + addps %xmm4, %xmm0 + movaps %xmm3, %xmm2 + + addl $4 * SIZE, A1 + ALIGN_4 + +.L27: + testl $2, M + jle .L28 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(A1), %xmm4 + + mulps %xmm2, %xmm4 + addps %xmm4, %xmm0 + movhlps %xmm2, %xmm2 + + addl $2 * SIZE, A1 + ALIGN_4 + +.L28: + testl $1, M + jle .L29 + + movss -32 * SIZE(A1), %xmm4 + mulss %xmm2, %xmm4 + addss %xmm4, %xmm0 + ALIGN_4 + +.L29: +#ifdef HAVE_SSE3 + haddps %xmm0, %xmm0 + haddps %xmm0, %xmm0 +#else + movhlps %xmm0, %xmm2 + + addps %xmm2, %xmm0 + + movaps %xmm0, %xmm2 + shufps $1, %xmm0, %xmm0 + + addss %xmm2, %xmm0 +#endif + + movss ALPHA, %xmm7 + + mulss %xmm7, %xmm0 + + addss (Y1), %xmm0 + + movss %xmm0, (Y1) + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/gemv_t_sse2.S b/kernel/x86/gemv_t_sse2.S new file mode 100644 index 0000000000..9960b5c0c5 --- /dev/null +++ b/kernel/x86/gemv_t_sse2.S @@ -0,0 +1,569 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (8 * 2) +#endif + +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (8 * 7) +#endif + +#ifdef OPTERON +#define PREFETCH prefetchnta +#define PREFETCHW prefetchw +#define PREFETCHSIZE (8 * 3) +#define movsd movlps +#endif + +#ifdef BARCELONA +#define PREFETCH prefetchnta +#define PREFETCHW prefetchw +#define PREFETCHSIZE (8 * 5) +#endif + +#ifdef ATOM +#define PREFETCH prefetch +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (8 * 6) +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 4) +#endif + +#define STACKSIZE 16 + +#define M 4 + STACKSIZE(%esp) +#define N 8 + STACKSIZE(%esp) +#define ALPHA 16 + STACKSIZE(%esp) +#define A 24 + STACKSIZE(%esp) +#define STACK_LDA 28 + STACKSIZE(%esp) +#define STACK_X 32 + STACKSIZE(%esp) +#define STACK_INCX 36 + STACKSIZE(%esp) +#define Y 40 + STACKSIZE(%esp) +#define STACK_INCY 44 + STACKSIZE(%esp) +#define BUFFER 48 + STACKSIZE(%esp) + +#define I %eax +#define J %ebx + +#define INCX J +#define INCY %ecx + +#define A1 %esi +#define X %edx +#define Y1 %edi +#define LDA %ebp + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_LDA, LDA + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_INCY, INCY + + leal (,INCX, SIZE), INCX + leal (,INCY, SIZE), INCY + leal (,LDA, SIZE), LDA + + subl $-16 * SIZE, A + + cmpl $0, N + jle .L999 + cmpl $0, M + jle .L999 + + movl BUFFER, Y1 + + movl M, I + sarl $3, I + jle .L05 + ALIGN_4 + +.L02: + movsd (X), %xmm0 + addl INCX, X + movhpd (X), %xmm0 + addl 
INCX, X + + movsd (X), %xmm1 + addl INCX, X + movhpd (X), %xmm1 + addl INCX, X + + movsd (X), %xmm2 + addl INCX, X + movhpd (X), %xmm2 + addl INCX, X + + movsd (X), %xmm3 + addl INCX, X + movhpd (X), %xmm3 + addl INCX, X + + movapd %xmm0, 0 * SIZE(Y1) + movapd %xmm1, 2 * SIZE(Y1) + movapd %xmm2, 4 * SIZE(Y1) + movapd %xmm3, 6 * SIZE(Y1) + + addl $8 * SIZE, Y1 + decl I + jg .L02 + ALIGN_4 + +.L05: + movl M, I + andl $7, I + jle .L10 + ALIGN_2 + +.L06: + movsd (X), %xmm0 + addl INCX, X + movsd %xmm0, 0 * SIZE(Y1) + addl $SIZE, Y1 + decl I + jg .L06 + ALIGN_4 + +.L10: + movl Y, Y1 + + movl N, J + sarl $1, J + jle .L20 + ALIGN_3 + +.L11: + movl BUFFER, X + addl $16 * SIZE, X + + movl A, A1 + leal (A1, LDA, 2), %eax + movl %eax, A + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + + movapd -16 * SIZE(X), %xmm2 + movapd -14 * SIZE(X), %xmm3 + + movl M, I + sarl $3, I + jle .L15 + + movsd -16 * SIZE(A1), %xmm4 + movhpd -15 * SIZE(A1), %xmm4 + movsd -16 * SIZE(A1, LDA), %xmm5 + movhpd -15 * SIZE(A1, LDA), %xmm5 + + movsd -14 * SIZE(A1), %xmm6 + movhpd -13 * SIZE(A1), %xmm6 + movsd -14 * SIZE(A1, LDA), %xmm7 + movhpd -13 * SIZE(A1, LDA), %xmm7 + + decl I + jle .L13 + ALIGN_4 + +.L12: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(A1) +#endif + + mulpd %xmm2, %xmm4 + addpd %xmm4, %xmm0 + movsd -12 * SIZE(A1), %xmm4 + movhpd -11 * SIZE(A1), %xmm4 + mulpd %xmm2, %xmm5 + movapd -12 * SIZE(X), %xmm2 + addpd %xmm5, %xmm1 + movsd -12 * SIZE(A1, LDA), %xmm5 + movhpd -11 * SIZE(A1, LDA), %xmm5 + + mulpd %xmm3, %xmm6 + addpd %xmm6, %xmm0 + movsd -10 * SIZE(A1), %xmm6 + movhpd -9 * SIZE(A1), %xmm6 + mulpd %xmm3, %xmm7 + movapd -10 * SIZE(X), %xmm3 + addpd %xmm7, %xmm1 + movsd -10 * SIZE(A1, LDA), %xmm7 + movhpd -9 * SIZE(A1, LDA), %xmm7 + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(A1, LDA) +#endif + + mulpd %xmm2, %xmm4 + addpd %xmm4, %xmm0 + movsd -8 * SIZE(A1), %xmm4 + movhpd -7 * SIZE(A1), %xmm4 + mulpd %xmm2, %xmm5 + movapd -8 * SIZE(X), %xmm2 + addpd %xmm5, %xmm1 + movsd -8 * SIZE(A1, LDA), %xmm5 + movhpd -7 * SIZE(A1, LDA), %xmm5 + + mulpd %xmm3, %xmm6 + addpd %xmm6, %xmm0 + movsd -6 * SIZE(A1), %xmm6 + movhpd -5 * SIZE(A1), %xmm6 + mulpd %xmm3, %xmm7 + movapd -6 * SIZE(X), %xmm3 + addpd %xmm7, %xmm1 + movsd -6 * SIZE(A1, LDA), %xmm7 + movhpd -5 * SIZE(A1, LDA), %xmm7 + + addl $8 * SIZE, A1 + addl $8 * SIZE, X + + decl I + jg .L12 + ALIGN_4 + +.L13: + mulpd %xmm2, %xmm4 + addpd %xmm4, %xmm0 + movsd -12 * SIZE(A1), %xmm4 + movhpd -11 * SIZE(A1), %xmm4 + mulpd %xmm2, %xmm5 + movapd -12 * SIZE(X), %xmm2 + addpd %xmm5, %xmm1 + movsd -12 * SIZE(A1, LDA), %xmm5 + movhpd -11 * SIZE(A1, LDA), %xmm5 + + mulpd %xmm3, %xmm6 + addpd %xmm6, %xmm0 + movsd -10 * SIZE(A1), %xmm6 + movhpd -9 * SIZE(A1), %xmm6 + mulpd %xmm3, %xmm7 + movapd -10 * SIZE(X), %xmm3 + addpd %xmm7, %xmm1 + movsd -10 * SIZE(A1, LDA), %xmm7 + movhpd -9 * SIZE(A1, LDA), %xmm7 + + mulpd %xmm2, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm2, %xmm5 + movapd -8 * SIZE(X), %xmm2 + addpd %xmm5, %xmm1 + + mulpd %xmm3, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm3, %xmm7 + movapd -6 * SIZE(X), %xmm3 + addpd %xmm7, %xmm1 + + addl $8 * SIZE, A1 + addl $8 * SIZE, X + ALIGN_4 + +.L15: + testl $4, M + jle .L16 + + movsd -16 * SIZE(A1), %xmm4 + movhpd -15 * SIZE(A1), %xmm4 + movsd -16 * SIZE(A1, LDA), %xmm5 + movhpd -15 * SIZE(A1, LDA), %xmm5 + + movsd -14 * SIZE(A1), %xmm6 + movhpd -13 * SIZE(A1), %xmm6 + movsd -14 * SIZE(A1, LDA), %xmm7 + movhpd -13 * SIZE(A1, LDA), %xmm7 + + mulpd %xmm2, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm2, %xmm5 + movapd -12 * SIZE(X), %xmm2 + 
addpd %xmm5, %xmm1 + + mulpd %xmm3, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm3, %xmm7 + movapd -10 * SIZE(X), %xmm3 + addpd %xmm7, %xmm1 + + addl $4 * SIZE, A1 + addl $4 * SIZE, X + ALIGN_4 + +.L16: + testl $2, M + jle .L17 + + movsd -16 * SIZE(A1), %xmm4 + movhpd -15 * SIZE(A1), %xmm4 + + movsd -16 * SIZE(A1, LDA), %xmm5 + movhpd -15 * SIZE(A1, LDA), %xmm5 + + mulpd %xmm2, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm2, %xmm5 + addpd %xmm5, %xmm1 + movapd %xmm3, %xmm2 + + addl $2 * SIZE, A1 + ALIGN_4 + +.L17: + testl $1, M + jle .L18 + + movsd -16 * SIZE(A1), %xmm4 + mulsd %xmm2, %xmm4 + addsd %xmm4, %xmm0 + movsd -16 * SIZE(A1, LDA), %xmm5 + mulsd %xmm2, %xmm5 + addsd %xmm5, %xmm1 + ALIGN_4 + +.L18: +#ifdef HAVE_SSE3 + haddpd %xmm1, %xmm0 +#else + movapd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm2 + + addpd %xmm2, %xmm0 +#endif + +#ifdef HAVE_SSE3 + movddup ALPHA, %xmm7 +#else + movsd ALPHA, %xmm7 + unpcklpd %xmm7, %xmm7 +#endif + + mulpd %xmm7, %xmm0 + + movsd (Y1), %xmm4 + movhpd (Y1, INCY), %xmm4 + + addpd %xmm4, %xmm0 + + movlpd %xmm0, (Y1) + movhpd %xmm0, (Y1, INCY) + leal (Y1, INCY, 2), Y1 + + decl J + jg .L11 + ALIGN_4 + +.L20: + testl $1, N + jle .L999 + + movl BUFFER, X + addl $16 * SIZE, X + + movl A, A1 + leal (A1, LDA, 2), %eax + movl %eax, A + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + + movapd -16 * SIZE(X), %xmm2 + movapd -14 * SIZE(X), %xmm3 + + movl M, I + sarl $3, I + jle .L25 + + movsd -16 * SIZE(A1), %xmm4 + movhpd -15 * SIZE(A1), %xmm4 + movsd -14 * SIZE(A1), %xmm6 + movhpd -13 * SIZE(A1), %xmm6 + + decl I + jle .L23 + ALIGN_4 + +.L22: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(A1) +#endif + + mulpd %xmm2, %xmm4 + movapd -12 * SIZE(X), %xmm2 + addpd %xmm4, %xmm0 + movsd -12 * SIZE(A1), %xmm4 + movhpd -11 * SIZE(A1), %xmm4 + + mulpd %xmm3, %xmm6 + movapd -10 * SIZE(X), %xmm3 + addpd %xmm6, %xmm0 + movsd -10 * SIZE(A1), %xmm6 + movhpd -9 * SIZE(A1), %xmm6 + + mulpd %xmm2, %xmm4 + movapd -8 * SIZE(X), %xmm2 + addpd %xmm4, %xmm0 + movsd -8 * SIZE(A1), %xmm4 + movhpd -7 * SIZE(A1), %xmm4 + + mulpd %xmm3, %xmm6 + movapd -6 * SIZE(X), %xmm3 + addpd %xmm6, %xmm0 + movsd -6 * SIZE(A1), %xmm6 + movhpd -5 * SIZE(A1), %xmm6 + + addl $8 * SIZE, A1 + addl $8 * SIZE, X + + decl I + jg .L22 + ALIGN_4 + +.L23: + mulpd %xmm2, %xmm4 + movapd -12 * SIZE(X), %xmm2 + addpd %xmm4, %xmm0 + movsd -12 * SIZE(A1), %xmm4 + movhpd -11 * SIZE(A1), %xmm4 + + mulpd %xmm3, %xmm6 + movapd -10 * SIZE(X), %xmm3 + addpd %xmm6, %xmm0 + movsd -10 * SIZE(A1), %xmm6 + movhpd -9 * SIZE(A1), %xmm6 + + mulpd %xmm2, %xmm4 + movapd -8 * SIZE(X), %xmm2 + addpd %xmm4, %xmm0 + + mulpd %xmm3, %xmm6 + movapd -6 * SIZE(X), %xmm3 + addpd %xmm6, %xmm0 + + addl $8 * SIZE, A1 + addl $8 * SIZE, X + ALIGN_4 + +.L25: + testl $4, M + jle .L26 + + movsd -16 * SIZE(A1), %xmm4 + movhpd -15 * SIZE(A1), %xmm4 + movsd -14 * SIZE(A1), %xmm6 + movhpd -13 * SIZE(A1), %xmm6 + + mulpd %xmm2, %xmm4 + movapd -12 * SIZE(X), %xmm2 + addpd %xmm4, %xmm0 + + mulpd %xmm3, %xmm6 + movapd -10 * SIZE(X), %xmm3 + addpd %xmm6, %xmm0 + + addl $4 * SIZE, A1 + addl $4 * SIZE, X + ALIGN_4 + +.L26: + testl $2, M + jle .L27 + + movsd -16 * SIZE(A1), %xmm4 + movhpd -15 * SIZE(A1), %xmm4 + + mulpd %xmm2, %xmm4 + addpd %xmm4, %xmm0 + movapd %xmm3, %xmm2 + + addl $2 * SIZE, A1 + ALIGN_4 + +.L27: + testl $1, M + jle .L28 + + movsd -16 * SIZE(A1), %xmm4 + mulsd %xmm2, %xmm4 + addsd %xmm4, %xmm0 + ALIGN_4 + +.L28: +#ifdef HAVE_SSE3 + haddpd %xmm1, %xmm0 +#else + movapd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm2 + + addsd %xmm2, 
%xmm0 +#endif + + movsd ALPHA, %xmm7 + + mulpd %xmm7, %xmm0 + + addsd (Y1), %xmm0 + + movlpd %xmm0, (Y1) + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/iamax.S b/kernel/x86/iamax.S new file mode 100644 index 0000000000..33204c07e7 --- /dev/null +++ b/kernel/x86/iamax.S @@ -0,0 +1,364 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) + +#define M %ebx +#define INCX %esi +#define X %ecx +#define I %edx +#define NUM %edi +#define RET %eax + +#ifndef USE_MIN +#define FMOV fcmovbe +#define IMOV cmovnbe +#else +#define FMOV fcmovnbe +#define IMOV cmovb +#endif + +#include "l1param.h" + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_M, M + movl STACK_INCX, INCX + movl STACK_X, X + +#ifdef F_INTERFACE + movl (M), M + movl (INCX), INCX +#endif + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + sall $BASE_SHIFT, INCX + + fldz + xorl RET, RET + + testl M, M + jle .L999 + testl INCX, INCX + jle .L999 + + fstp %st(0) + movl $2, NUM + movl $1, RET + + FLD (X) +#ifdef USE_ABS + fabs +#endif + addl INCX, X + decl M + jle .L999 + + cmpl $SIZE, INCX + jne .L40 + + movl M, I + sarl $3, I + jle .L20 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + incl NUM + + FLD 1 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + incl NUM + + FLD 2 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + incl NUM + + FLD 3 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + incl NUM + + FLD 4 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + incl NUM + + FLD 5 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + incl NUM + + FLD 6 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + incl NUM + + FLD 7 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + incl NUM + + addl $8 * SIZE, X + + decl I + jg .L10 + ALIGN_4 + +.L20: + movl M, I + andl $7, I + jle .L999 + ALIGN_4 + + +.L21: + FLD 0 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + + addl $1 * SIZE, X + incl NUM + decl I + jg .L21 + jmp .L999 + ALIGN_4 + +.L40: + movl M, I + sarl $3, I + jle .L60 + ALIGN_4 + +.L50: + FLD 0 * SIZE(X) + addl INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + incl NUM + + FLD 0 * SIZE(X) + addl INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + incl NUM + + FLD 0 * SIZE(X) + addl INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + incl NUM + + FLD 0 * SIZE(X) + addl INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + incl NUM + + FLD 0 * SIZE(X) + addl INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + incl NUM + + FLD 0 * SIZE(X) + addl INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + incl NUM + + 
FLD 0 * SIZE(X) + addl INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + incl NUM + + FLD 0 * SIZE(X) + addl INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + incl NUM + + decl I + jg .L50 + ALIGN_4 + +.L60: + movl M, I + andl $7, I + jle .L999 + ALIGN_4 + + +.L61: + FLD 0 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + incl NUM + + addl INCX, X + decl I + jg .L61 + ALIGN_4 + +.L999: + fstp %st(0) + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/iamax_sse.S b/kernel/x86/iamax_sse.S new file mode 100644 index 0000000000..3b64ebdacf --- /dev/null +++ b/kernel/x86/iamax_sse.S @@ -0,0 +1,968 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) + +#define RET %eax +#define M %ebx +#define X %ecx +#define INCX %edx +#define I %esi +#define MM %ebp +#define XX %edi +#define TEMP %ebx + +#ifdef USE_MIN +#define maxps minps +#define maxss minss +#endif + +#ifndef HAVE_SSE2 +#define pxor xorps +#define movsd movlps +#endif + +#include "l1param.h" + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + +#ifdef F_INTERFACE + movl (M), M + movl (INCX), INCX +#endif + + pxor %xmm0, %xmm0 /* Return Value(Float) */ +#ifdef USE_ABS + pxor %xmm7, %xmm7 /* Generate USE_ABS */ +#endif + xor RET, RET /* Return Value(Int) */ + testl M, M + jle .L999 + leal (, INCX, SIZE), INCX + testl INCX, INCX + jle .L999 + + movl M, MM + movl X, XX + +#ifdef USE_ABS +#ifndef HAVE_SSE2 + subl $8, %esp + movl $0x7fffffff, (%esp) + movss (%esp), %xmm7 + shufps $0, %xmm7, %xmm7 + addl $8, %esp +#else + cmpeqps %xmm7, %xmm7 + psrld $1, %xmm7 /* Generate USE_ABS */ +#endif +#endif + + movss (XX), %xmm0 + addl INCX, XX + decl MM + shufps $0, %xmm0, %xmm0 +#ifdef USE_ABS + andps %xmm7, %xmm0 +#endif + movaps %xmm0, %xmm1 + movaps %xmm0, %xmm2 + movaps %xmm0, %xmm3 /* Generating "seed value" */ + cmpl $SIZE, INCX + jne .L80 /* Incx != 1 goto L80 */ + +/* Analigned Check */ + testl $3, XX /* 00000011 */ + jne .L30 /* Purely Unaligned Mode */ + + cmpl $8, MM + jle .L30 /* if M <= 8 goto Unaligned mode */ + + testl $4, XX /* bit test 000100 */ + je .L05 + + movss 0 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxss %xmm4, %xmm0 + decl MM + addl $SIZE, XX + ALIGN_3 + +.L05: + testl $8, XX + je .L06 + + movsd 0 * SIZE(XX), %xmm4 + unpcklps %xmm4, %xmm4 +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxps %xmm4, %xmm1 + subl $2, MM + addl $2 * SIZE, XX + ALIGN_3 + +.L06: + movl MM, I + sarl $4, I + jle .L15 + ALIGN_4 + +.L11: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) +#endif + + movaps 0 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxps %xmm4, %xmm0 + + movaps 4 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxps %xmm4, %xmm1 + + movaps 8 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxps %xmm4, %xmm2 + + movaps 12 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxps %xmm4, %xmm3 + + addl $16 * SIZE, XX + decl I + jg .L11 + ALIGN_4 + +.L15: + andl $15, MM + jle .L20 + + testl $8, MM + je .L16 + + movaps 0 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxps %xmm4, %xmm0 + + movaps 4 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxps %xmm4, %xmm1 + addl $8 * SIZE, XX + ALIGN_3 + +.L16: + testl $4, MM + je .L17 + + movaps 0 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxps %xmm4, %xmm2 + addl $4 * SIZE, XX + ALIGN_3 + +.L17: + testl $2, MM + je .L18 + + movsd 0 * SIZE(XX), %xmm4 + unpcklps %xmm4, %xmm4 +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxps %xmm4, %xmm3 + addl $2 * SIZE, XX + +.L18: + testl $1, MM + je .L20 + + movss 0 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxss %xmm4, %xmm0 + ALIGN_3 + +.L20: + movl X, XX + movl M, MM + + maxps %xmm1, %xmm0 + maxps %xmm3, %xmm2 + maxps %xmm2, %xmm0 + movaps %xmm0, 
%xmm1 + movhlps %xmm0, %xmm0 + maxps %xmm1, %xmm0 + movaps %xmm0, %xmm1 + shufps $1, %xmm0, %xmm0 + maxss %xmm1, %xmm0 + shufps $0, %xmm0, %xmm0 + + testl $4, XX + je .L21 + + movss 0 * SIZE(XX), %xmm1 + + decl MM + addl $SIZE, XX + +#ifdef USE_ABS + andps %xmm7, %xmm1 +#endif + incl RET + comiss %xmm0, %xmm1 + je .L999 + ALIGN_3 + +.L21: + testl $8, XX + je .L22 + + movss 0 * SIZE(XX), %xmm1 + movss 1 * SIZE(XX), %xmm2 + + subl $2, MM + addl $2 * SIZE, XX + +#ifdef USE_ABS + andps %xmm7, %xmm1 + andps %xmm7, %xmm2 +#endif + incl RET + comiss %xmm0, %xmm1 + je .L999 + incl RET + comiss %xmm0, %xmm2 + je .L999 + ALIGN_3 + +.L22: + movl MM, I + sarl $3, I + jle .L25 + ALIGN_4 + +.L23: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) +#endif + + movaps 0 * SIZE(XX), %xmm1 +#ifdef USE_ABS + andps %xmm7, %xmm1 +#endif + cmpeqps %xmm0, %xmm1 + + movaps 4 * SIZE(XX), %xmm2 +#ifdef USE_ABS + andps %xmm7, %xmm2 +#endif + cmpeqps %xmm0, %xmm2 + + orps %xmm2, %xmm1 + movmskps %xmm1, TEMP + testl $15, TEMP + jne .L24 + + addl $8 * SIZE, XX + addl $8, RET + decl I + jg .L23 + jmp .L25 + ALIGN_3 + +.L24: + movss 0 * SIZE(XX), %xmm1 + movss 1 * SIZE(XX), %xmm2 + movss 2 * SIZE(XX), %xmm3 + movss 3 * SIZE(XX), %xmm4 + +#ifdef USE_ABS + andps %xmm7, %xmm1 + andps %xmm7, %xmm2 + andps %xmm7, %xmm3 + andps %xmm7, %xmm4 +#endif + + incl RET + comiss %xmm0, %xmm1 + je .L999 + incl RET + comiss %xmm0, %xmm2 + je .L999 + incl RET + comiss %xmm0, %xmm3 + je .L999 + incl RET + comiss %xmm0, %xmm4 + je .L999 + + movss 4 * SIZE(XX), %xmm1 + movss 5 * SIZE(XX), %xmm2 + movss 6 * SIZE(XX), %xmm3 + +#ifdef USE_ABS + andps %xmm7, %xmm1 + andps %xmm7, %xmm2 + andps %xmm7, %xmm3 +#endif + + incl RET + comiss %xmm0, %xmm1 + je .L999 + incl RET + comiss %xmm0, %xmm2 + je .L999 + incl RET + comiss %xmm0, %xmm3 + je .L999 + incl RET + jmp .L999 + ALIGN_4 + +.L25: + testl $4, MM + je .L26 + + movss 0 * SIZE(XX), %xmm1 + movss 1 * SIZE(XX), %xmm2 + movss 2 * SIZE(XX), %xmm3 + movss 3 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andps %xmm7, %xmm1 + andps %xmm7, %xmm2 + andps %xmm7, %xmm3 + andps %xmm7, %xmm4 +#endif + addl $4 * SIZE, XX + incl RET + comiss %xmm0, %xmm1 + je .L999 + incl RET + comiss %xmm0, %xmm2 + je .L999 + incl RET + comiss %xmm0, %xmm3 + je .L999 + incl RET + comiss %xmm0, %xmm4 + je .L999 + ALIGN_3 + +.L26: + testl $2, MM + je .L27 + + movss 0 * SIZE(XX), %xmm1 + movss 1 * SIZE(XX), %xmm2 +#ifdef USE_ABS + andps %xmm7, %xmm1 + andps %xmm7, %xmm2 +#endif + addl $2 * SIZE, XX + incl RET + comiss %xmm0, %xmm1 + je .L999 + incl RET + comiss %xmm0, %xmm2 + je .L999 + ALIGN_3 + +.L27: + incl RET + jmp .L999 + ALIGN_3 + +/* Unaligned Mode */ +.L30: + movl MM, I + sarl $4, I + jle .L35 + ALIGN_4 + +.L31: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) +#endif + + movsd 0 * SIZE(XX), %xmm4 + movhps 2 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxps %xmm4, %xmm0 + + movsd 4 * SIZE(XX), %xmm4 + movhps 6 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxps %xmm4, %xmm1 + + movsd 8 * SIZE(XX), %xmm4 + movhps 10 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxps %xmm4, %xmm2 + + movsd 12 * SIZE(XX), %xmm4 + movhps 14 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxps %xmm4, %xmm3 + + addl $16 * SIZE, XX + decl I + jg .L31 + ALIGN_4 + +.L35: + andl $15, MM + jle .L40 + + testl $8, MM + je .L36 + + movsd 0 * SIZE(XX), %xmm4 + movhps 2 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxps %xmm4, %xmm0 + + 
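+	/* floats 4-7 of this eight-element tail block */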
movsd 4 * SIZE(XX), %xmm4 + movhps 6 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxps %xmm4, %xmm1 + + addl $8 * SIZE, XX + ALIGN_3 + +.L36: + testl $4, MM + je .L37 + + movsd 0 * SIZE(XX), %xmm4 + movhps 2 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxps %xmm4, %xmm2 + addl $4 * SIZE, XX + ALIGN_3 + +.L37: + testl $2, MM + je .L38 + + movsd 0 * SIZE(XX), %xmm4 + unpcklps %xmm4, %xmm4 +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxps %xmm4, %xmm3 + addl $2 * SIZE, XX + +.L38: + testl $1, MM + je .L40 + + movss 0 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxss %xmm4, %xmm0 + jmp .L40 + ALIGN_4 + +.L40: + movl X, XX + movl M, MM + + maxps %xmm1, %xmm0 + maxps %xmm3, %xmm2 + maxps %xmm2, %xmm0 + movaps %xmm0, %xmm1 + movhlps %xmm0, %xmm0 + maxps %xmm1, %xmm0 + movaps %xmm0, %xmm1 + shufps $1, %xmm0, %xmm0 + maxss %xmm1, %xmm0 + shufps $0, %xmm0, %xmm0 + + movl MM, I + sarl $3, I + jle .L45 + ALIGN_4 + +.L43: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) +#endif + + movsd 0 * SIZE(XX), %xmm1 + movhps 2 * SIZE(XX), %xmm1 +#ifdef USE_ABS + andps %xmm7, %xmm1 +#endif + cmpeqps %xmm0, %xmm1 + + movsd 4 * SIZE(XX), %xmm2 + movhps 6 * SIZE(XX), %xmm2 +#ifdef USE_ABS + andps %xmm7, %xmm2 +#endif + cmpeqps %xmm0, %xmm2 + + orps %xmm2, %xmm1 + movmskps %xmm1, TEMP + testl $15, TEMP + jne .L44 + + addl $8 * SIZE, XX + addl $8, RET + decl I + jg .L43 + jmp .L45 + ALIGN_3 + +.L44: + movss 0 * SIZE(XX), %xmm1 + movss 1 * SIZE(XX), %xmm2 + movss 2 * SIZE(XX), %xmm3 + movss 3 * SIZE(XX), %xmm4 + +#ifdef USE_ABS + andps %xmm7, %xmm1 + andps %xmm7, %xmm2 + andps %xmm7, %xmm3 + andps %xmm7, %xmm4 +#endif + + incl RET + comiss %xmm0, %xmm1 + je .L999 + incl RET + comiss %xmm0, %xmm2 + je .L999 + incl RET + comiss %xmm0, %xmm3 + je .L999 + incl RET + comiss %xmm0, %xmm4 + je .L999 + + movss 4 * SIZE(XX), %xmm1 + movss 5 * SIZE(XX), %xmm2 + movss 6 * SIZE(XX), %xmm3 + +#ifdef USE_ABS + andps %xmm7, %xmm1 + andps %xmm7, %xmm2 + andps %xmm7, %xmm3 +#endif + + incl RET + comiss %xmm0, %xmm1 + je .L999 + incl RET + comiss %xmm0, %xmm2 + je .L999 + incl RET + comiss %xmm0, %xmm3 + je .L999 + incl RET + jmp .L999 + ALIGN_4 + +.L45: + testl $4, MM + je .L46 + + movss 0 * SIZE(XX), %xmm1 + movss 1 * SIZE(XX), %xmm2 + movss 2 * SIZE(XX), %xmm3 + movss 3 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andps %xmm7, %xmm1 + andps %xmm7, %xmm2 + andps %xmm7, %xmm3 + andps %xmm7, %xmm4 +#endif + addl $4 * SIZE, XX + incl RET + comiss %xmm0, %xmm1 + je .L999 + incl RET + comiss %xmm0, %xmm2 + je .L999 + incl RET + comiss %xmm0, %xmm3 + je .L999 + incl RET + comiss %xmm0, %xmm4 + je .L999 + ALIGN_3 + +.L46: + testl $2, MM + je .L47 + + movss 0 * SIZE(XX), %xmm1 + movss 1 * SIZE(XX), %xmm2 +#ifdef USE_ABS + andps %xmm7, %xmm1 + andps %xmm7, %xmm2 +#endif + addl $2 * SIZE, XX + incl RET + comiss %xmm0, %xmm1 + je .L999 + incl RET + comiss %xmm0, %xmm2 + je .L999 + ALIGN_3 + +.L47: + incl RET + jmp .L999 + ALIGN_3 + +.L80: + movl MM, I + sarl $3, I + jle .L85 + ALIGN_4 + +.L81: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) +#endif + + movss 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxss %xmm4, %xmm0 + + movss 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxss %xmm4, %xmm1 + + movss 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxss %xmm4, %xmm2 + + movss 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andps %xmm7, %xmm4 
+#endif + maxss %xmm4, %xmm3 + + movss 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxss %xmm4, %xmm0 + + movss 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxss %xmm4, %xmm1 + + movss 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxss %xmm4, %xmm2 + + movss 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxss %xmm4, %xmm3 + + decl I + jg .L81 + ALIGN_4 + +.L85: + andl $7, MM + jle .L90 + + testl $4, MM + je .L86 + + movss 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxss %xmm4, %xmm0 + + movss 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxss %xmm4, %xmm1 + + movss 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxss %xmm4, %xmm2 + + movss 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxss %xmm4, %xmm3 + ALIGN_3 + +.L86: + testl $2, MM + je .L87 + + movss 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxss %xmm4, %xmm0 + + movss 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxss %xmm4, %xmm1 + ALIGN_3 + +.L87: + testl $1, MM + je .L90 + + movss 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + maxss %xmm4, %xmm2 + ALIGN_4 + +.L90: + movl X, XX + movl M, MM + + maxss %xmm1, %xmm0 + maxss %xmm3, %xmm2 + maxss %xmm2, %xmm0 + shufps $0, %xmm0, %xmm0 + + movl MM, I + sarl $2, I + jle .L96 + ALIGN_4 + +.L92: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) +#endif + + movss 0 * SIZE(XX), %xmm1 + addl INCX, XX +#ifdef USE_ABS + andps %xmm7, %xmm1 +#endif + cmpeqss %xmm0, %xmm1 + + movss 0 * SIZE(XX), %xmm2 + addl INCX, XX +#ifdef USE_ABS + andps %xmm7, %xmm2 +#endif + cmpeqss %xmm0, %xmm2 + + movss 0 * SIZE(XX), %xmm3 + addl INCX, XX +#ifdef USE_ABS + andps %xmm7, %xmm3 +#endif + cmpeqss %xmm0, %xmm3 + + movss 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andps %xmm7, %xmm4 +#endif + cmpeqss %xmm0, %xmm4 + + orps %xmm2, %xmm1 + orps %xmm4, %xmm3 + orps %xmm3, %xmm1 + movmskps %xmm1, TEMP + testl $15, TEMP + jne .L93 + + addl $4, RET + decl I + jg .L92 + jmp .L96 + ALIGN_3 + +.L93: + leal (, INCX, 4), TEMP + subl TEMP, XX + + movss 0 * SIZE(XX), %xmm1 + addl INCX, XX + movss 0 * SIZE(XX), %xmm2 + addl INCX, XX + movss 0 * SIZE(XX), %xmm3 + addl INCX, XX + movss 0 * SIZE(XX), %xmm4 + addl INCX, XX + +#ifdef USE_ABS + andps %xmm7, %xmm1 + andps %xmm7, %xmm2 + andps %xmm7, %xmm3 + andps %xmm7, %xmm4 +#endif + incl RET + comiss %xmm0, %xmm1 + je .L999 + incl RET + comiss %xmm0, %xmm2 + je .L999 + incl RET + comiss %xmm0, %xmm3 + je .L999 + incl RET + comiss %xmm0, %xmm4 + je .L999 + ALIGN_3 + +.L96: + testl $2, MM + je .L97 + + movss 0 * SIZE(XX), %xmm1 + addl INCX, XX + movss 0 * SIZE(XX), %xmm2 + addl INCX, XX +#ifdef USE_ABS + andps %xmm7, %xmm1 + andps %xmm7, %xmm2 +#endif + incl RET + comiss %xmm0, %xmm1 + je .L999 + incl RET + comiss %xmm0, %xmm2 + je .L999 + ALIGN_3 + +.L97: + incl RET + ALIGN_3 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/iamax_sse2.S b/kernel/x86/iamax_sse2.S new file mode 100644 index 0000000000..a0ddb26ddc --- /dev/null +++ b/kernel/x86/iamax_sse2.S @@ -0,0 +1,1152 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at 
Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) + +#define RET %eax +#define M %ebx +#define X %ecx +#define INCX %edx +#define I %esi +#define MM %ebp +#define XX %edi +#define TEMP %ebx + +#ifdef USE_MIN +#define maxpd minpd +#define maxsd minsd +#endif + +#include "l1param.h" + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + +#ifdef F_INTERFACE + movl (M), M + movl (INCX), INCX +#endif + + pxor %xmm0, %xmm0 +#ifdef USE_ABS + pxor %xmm7, %xmm7 +#endif + xor RET, RET + testl M, M + jle .L999 + leal (, INCX, SIZE), INCX + testl INCX, INCX + jle .L999 + + movl M, MM + movl X, XX + +#ifdef USE_ABS + cmpeqpd %xmm7, %xmm7 + psrlq $1, %xmm7 +#endif + + movsd (XX), %xmm0 + addl INCX, XX + decl MM +#ifdef USE_ABS + andpd %xmm7, %xmm0 +#endif + unpcklpd %xmm0, %xmm0 + movapd %xmm0, %xmm1 + movapd %xmm0, %xmm2 + movapd %xmm0, %xmm3 + cmpl $SIZE, INCX + jne .L80 + +/* Analigned Check */ + cmpl $7, MM + jle .L50 + + testl $7, XX + jne .L50 # Purely Unaligned Mode + + testl $15, XX # Checking for 128bit align + je .L05 + + movsd 0 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + unpcklpd %xmm4, %xmm4 + maxpd %xmm4, %xmm3 + decl MM + addl $SIZE, XX + ALIGN_3 + +.L05: + movl MM, I + sarl $4, I + jle .L15 + ALIGN_4 + +.L11: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) +#endif + + movapd 0 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movapd 2 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + 
maxpd %xmm4, %xmm1 + + movapd 4 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm2 + + movapd 6 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(XX) +#endif + + movapd 8 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movapd 10 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm1 + + movapd 12 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm2 + + movapd 14 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm3 + + addl $16 * SIZE, XX + decl I + jg .L11 + ALIGN_4 + +.L15: + andl $15, MM + jle .L20 + + testl $8, MM + je .L16 + + movapd 0 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movapd 2 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm1 + + movapd 4 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm2 + + movapd 6 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm3 + addl $8 * SIZE, XX + ALIGN_3 + +.L16: + testl $4, MM + je .L17 + + movapd 0 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movapd 2 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm1 + addl $4 * SIZE, XX + ALIGN_3 + +.L17: + testl $2, MM + je .L18 + + movapd 0 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm2 + addl $2 * SIZE, XX + +.L18: + testl $1, MM + je .L20 + + movsd 0 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + unpcklpd %xmm4, %xmm4 + maxpd %xmm4, %xmm3 + ALIGN_3 + +/* Finding Index */ +.L20: + movl X, XX + movl M, MM + + maxpd %xmm1, %xmm0 + maxpd %xmm3, %xmm2 + maxpd %xmm2, %xmm0 + movapd %xmm0, %xmm1 + unpckhpd %xmm0, %xmm0 + maxsd %xmm1, %xmm0 + unpcklpd %xmm0, %xmm0 + + testl $15, XX # Checking for 128bit align + je .L21 + + movsd 0 * SIZE(XX), %xmm1 +#ifdef USE_ABS + andpd %xmm7, %xmm1 +#endif + incl RET + comisd %xmm0, %xmm1 + je .L999 + addl $SIZE, XX + decl MM + ALIGN_3 + +.L21: + movl MM, I + sarl $3, I + jle .L25 + ALIGN_4 + +.L22: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) +#endif + + movapd 0 * SIZE(XX), %xmm1 +#ifdef USE_ABS + andpd %xmm7, %xmm1 +#endif + cmpeqpd %xmm0, %xmm1 + + movapd 2 * SIZE(XX), %xmm2 +#ifdef USE_ABS + andpd %xmm7, %xmm2 +#endif + cmpeqpd %xmm0, %xmm2 + + movapd 4 * SIZE(XX), %xmm3 +#ifdef USE_ABS + andpd %xmm7, %xmm3 +#endif + cmpeqpd %xmm0, %xmm3 + + movapd 6 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + cmpeqpd %xmm0, %xmm4 + + orpd %xmm2, %xmm1 + orpd %xmm4, %xmm3 + orpd %xmm3, %xmm1 + movmskpd %xmm1, TEMP + testl $3, TEMP + jne .L23 + + addl $8 * SIZE, XX + addl $8, RET + decl I + jg .L22 + jmp .L25 + ALIGN_4 + +.L23: + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + movsd 2 * SIZE(XX), %xmm3 + movsd 3 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + andpd %xmm7, %xmm3 + andpd %xmm7, %xmm4 +#endif + + incl RET + comisd %xmm0, %xmm1 + je .L999 + incl RET + comisd %xmm0, %xmm2 + je .L999 + incl RET + comisd %xmm0, %xmm3 + je .L999 + incl RET + comisd %xmm0, %xmm4 + je .L999 + + movsd 4 * SIZE(XX), %xmm1 + movsd 5 * SIZE(XX), %xmm2 + movsd 6 * SIZE(XX), %xmm3 + +#ifdef USE_ABS + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + andpd %xmm7, %xmm3 +#endif + + incl RET + comisd %xmm0, %xmm1 + 
je .L999 + incl RET + comisd %xmm0, %xmm2 + je .L999 + incl RET + comisd %xmm0, %xmm3 + je .L999 + incl RET + jmp .L999 + ALIGN_3 + +.L25: + testl $4, MM + je .L27 + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + movsd 2 * SIZE(XX), %xmm3 + movsd 3 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + andpd %xmm7, %xmm3 + andpd %xmm7, %xmm4 +#endif + addl $4 * SIZE, XX + incl RET + comisd %xmm0, %xmm1 + je .L999 + incl RET + comisd %xmm0, %xmm2 + je .L999 + incl RET + comisd %xmm0, %xmm3 + je .L999 + incl RET + comisd %xmm0, %xmm4 + je .L999 + ALIGN_3 + +.L27: + testl $2, MM + je .L28 + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 +#ifdef USE_ABS + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 +#endif + addl $2 * SIZE, XX + incl RET + comisd %xmm0, %xmm1 + je .L999 + incl RET + comisd %xmm0, %xmm2 + je .L999 + ALIGN_3 + +.L28: + incl RET + jmp .L999 + ALIGN_3 + +.L50: +/* Unaligned Mode */ + movl MM, I + sarl $4, I + jle .L55 + ALIGN_4 + +.L51: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) +#endif + + movsd 0 * SIZE(XX), %xmm4 + movhpd 1 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movsd 2 * SIZE(XX), %xmm4 + movhpd 3 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm1 + + movsd 4 * SIZE(XX), %xmm4 + movhpd 5 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm2 + + movsd 6 * SIZE(XX), %xmm4 + movhpd 7 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(XX) +#endif + + movsd 8 * SIZE(XX), %xmm4 + movhpd 9 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movsd 10 * SIZE(XX), %xmm4 + movhpd 11 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm1 + + movsd 12 * SIZE(XX), %xmm4 + movhpd 13 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm2 + + movsd 14 * SIZE(XX), %xmm4 + movhpd 15 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm3 + + addl $16 * SIZE, XX + decl I + jg .L51 + ALIGN_4 + +.L55: + andl $15, MM + jle .L60 + + testl $8, MM + je .L56 + + movsd 0 * SIZE(XX), %xmm4 + movhpd 1 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movsd 2 * SIZE(XX), %xmm4 + movhpd 3 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm1 + + movsd 4 * SIZE(XX), %xmm4 + movhpd 5 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm2 + + movsd 6 * SIZE(XX), %xmm4 + movhpd 7 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm3 + + addl $8 * SIZE, XX + ALIGN_3 + +.L56: + testl $4, MM + je .L57 + + movsd 0 * SIZE(XX), %xmm4 + movhpd 1 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movsd 2 * SIZE(XX), %xmm4 + movhpd 3 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm1 + addl $4 * SIZE, XX + ALIGN_3 + +.L57: + testl $2, MM + je .L58 + + movsd 0 * SIZE(XX), %xmm4 + movhpd 1 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm2 + addl $2 * SIZE, XX + +.L58: + testl $1, MM + je .L60 + + movsd 0 * SIZE(XX), %xmm4 + unpcklpd %xmm4, %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm3 + ALIGN_3 + +.L60: + movl X, XX + movl M, MM + + maxpd %xmm1, %xmm0 + maxpd %xmm3, %xmm2 + maxpd %xmm2, 
%xmm0 + movapd %xmm0, %xmm1 + unpckhpd %xmm0, %xmm0 + maxsd %xmm1, %xmm0 + unpcklpd %xmm0, %xmm0 + + movl MM, I + sarl $3, I + jle .L65 + ALIGN_4 + +.L62: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) +#endif + + movsd 0 * SIZE(XX), %xmm1 + movhpd 1 * SIZE(XX), %xmm1 +#ifdef USE_ABS + andpd %xmm7, %xmm1 +#endif + cmpeqpd %xmm0, %xmm1 + + movsd 2 * SIZE(XX), %xmm2 + movhpd 3 * SIZE(XX), %xmm2 +#ifdef USE_ABS + andpd %xmm7, %xmm2 +#endif + cmpeqpd %xmm0, %xmm2 + + movsd 4 * SIZE(XX), %xmm3 + movhpd 5 * SIZE(XX), %xmm3 +#ifdef USE_ABS + andpd %xmm7, %xmm3 +#endif + cmpeqpd %xmm0, %xmm3 + + movsd 6 * SIZE(XX), %xmm4 + movhpd 7 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + cmpeqpd %xmm0, %xmm4 + + orpd %xmm2, %xmm1 + orpd %xmm4, %xmm3 + orpd %xmm3, %xmm1 + movmskpd %xmm1, TEMP + testl $3, TEMP + jne .L63 + + addl $8 * SIZE, XX + addl $8, RET + decl I + jg .L62 + jmp .L65 + ALIGN_4 + +.L63: + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + movsd 2 * SIZE(XX), %xmm3 + movsd 3 * SIZE(XX), %xmm4 + +#ifdef USE_ABS + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + andpd %xmm7, %xmm3 + andpd %xmm7, %xmm4 +#endif + + incl RET + comisd %xmm0, %xmm1 + je .L999 + incl RET + comisd %xmm0, %xmm2 + je .L999 + incl RET + comisd %xmm0, %xmm3 + je .L999 + incl RET + comisd %xmm0, %xmm4 + je .L999 + incl RET + + movsd 4 * SIZE(XX), %xmm1 + movsd 5 * SIZE(XX), %xmm2 + movsd 6 * SIZE(XX), %xmm3 + +#ifdef USE_ABS + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + andpd %xmm7, %xmm3 +#endif + + comisd %xmm0, %xmm1 + je .L999 + incl RET + comisd %xmm0, %xmm2 + je .L999 + incl RET + comisd %xmm0, %xmm3 + je .L999 + incl RET + jmp .L999 + ALIGN_3 + +.L65: + testl $4, MM + je .L67 + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + movsd 2 * SIZE(XX), %xmm3 + movsd 3 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + andpd %xmm7, %xmm3 + andpd %xmm7, %xmm4 +#endif + addl $4 * SIZE, XX + incl RET + comisd %xmm0, %xmm1 + je .L999 + incl RET + comisd %xmm0, %xmm2 + je .L999 + incl RET + comisd %xmm0, %xmm3 + je .L999 + incl RET + comisd %xmm0, %xmm4 + je .L999 + ALIGN_3 + +.L67: + testl $2, MM + je .L68 + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 +#ifdef USE_ABS + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 +#endif + addl $2 * SIZE, XX + incl RET + comisd %xmm0, %xmm1 + je .L999 + incl RET + comisd %xmm0, %xmm2 + je .L999 + ALIGN_3 + +.L68: + incl RET + jmp .L999 + ALIGN_4 + +.L80: + movl MM, I + sarl $4, I + jle .L85 + ALIGN_4 + +.L81: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) +#endif + + movsd 0 * SIZE(XX), %xmm4 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movsd 0 * SIZE(XX), %xmm4 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm1 + + movsd 0 * SIZE(XX), %xmm4 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm2 + + movsd 0 * SIZE(XX), %xmm4 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) +#endif + + movsd 0 * SIZE(XX), %xmm4 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movsd 0 * SIZE(XX), %xmm4 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef 
USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm1 + + movsd 0 * SIZE(XX), %xmm4 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm2 + + movsd 0 * SIZE(XX), %xmm4 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm3 + + decl I + jg .L81 + ALIGN_4 + +.L85: + andl $15, MM + jle .L90 + + testl $8, MM + je .L86 + + movsd 0 * SIZE(XX), %xmm4 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movsd 0 * SIZE(XX), %xmm4 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm1 + + movsd 0 * SIZE(XX), %xmm4 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm2 + + movsd 0 * SIZE(XX), %xmm4 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm3 + ALIGN_3 + +.L86: + testl $4, MM + je .L87 + + movsd 0 * SIZE(XX), %xmm4 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movsd 0 * SIZE(XX), %xmm4 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm1 + ALIGN_3 + +.L87: + testl $2, MM + je .L88 + + movsd 0 * SIZE(XX), %xmm4 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + maxpd %xmm4, %xmm2 + ALIGN_3 + +.L88: + testl $1, MM + je .L90 + + movsd 0 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + unpcklpd %xmm4, %xmm4 + maxpd %xmm4, %xmm3 + ALIGN_4 + +.L90: + movl X, XX + movl M, MM + + maxpd %xmm1, %xmm0 + maxpd %xmm3, %xmm2 + maxpd %xmm2, %xmm0 + movapd %xmm0, %xmm1 + unpckhpd %xmm0, %xmm0 + maxsd %xmm1, %xmm0 + unpcklpd %xmm0, %xmm0 + + movl MM, I + sarl $3, I + jle .L95 + ALIGN_4 + +.L92: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) +#endif + + movsd 0 * SIZE(XX), %xmm1 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm1 + addl INCX, XX +#ifdef USE_ABS + andpd %xmm7, %xmm1 +#endif + cmpeqpd %xmm0, %xmm1 + + movsd 0 * SIZE(XX), %xmm2 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm2 + addl INCX, XX +#ifdef USE_ABS + andpd %xmm7, %xmm2 +#endif + cmpeqpd %xmm0, %xmm2 + + movsd 0 * SIZE(XX), %xmm3 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm3 + addl INCX, XX +#ifdef USE_ABS + andpd %xmm7, %xmm3 +#endif + cmpeqpd %xmm0, %xmm3 + + movsd 0 * SIZE(XX), %xmm4 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andpd %xmm7, %xmm4 +#endif + cmpeqpd %xmm0, %xmm4 + + orpd %xmm2, %xmm1 + orpd %xmm4, %xmm3 + orpd %xmm3, %xmm1 + movmskpd %xmm1, TEMP + testl $3, TEMP + jne .L93 + + addl $8, RET + decl I + jg .L92 + jmp .L95 + ALIGN_4 + +.L93: + leal (, INCX, 8), TEMP + subl TEMP, XX + + movsd 0 * SIZE(XX), %xmm1 + addl INCX, XX + movsd 0 * SIZE(XX), %xmm2 + addl INCX, XX + movsd 0 * SIZE(XX), %xmm3 + addl INCX, XX + movsd 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + andpd %xmm7, %xmm3 + andpd %xmm7, %xmm4 +#endif + incl RET + comisd %xmm0, %xmm1 + je .L999 + incl RET + comisd %xmm0, %xmm2 + je .L999 + incl RET + comisd %xmm0, %xmm3 + je .L999 + incl RET + comisd %xmm0, %xmm4 + je .L999 + + movsd 0 * SIZE(XX), %xmm1 + addl INCX, XX + movsd 0 * SIZE(XX), %xmm2 + addl INCX, XX 
+ movsd 0 * SIZE(XX), %xmm3 +#ifdef USE_ABS + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + andpd %xmm7, %xmm3 +#endif + + incl RET + comisd %xmm0, %xmm1 + je .L999 + incl RET + comisd %xmm0, %xmm2 + je .L999 + incl RET + comisd %xmm0, %xmm3 + je .L999 + incl RET + jmp .L999 + ALIGN_3 + +.L95: + testl $4, MM + je .L97 + + movsd 0 * SIZE(XX), %xmm1 + addl INCX, XX + movsd 0 * SIZE(XX), %xmm2 + addl INCX, XX + movsd 0 * SIZE(XX), %xmm3 + addl INCX, XX + movsd 0 * SIZE(XX), %xmm4 + addl INCX, XX +#ifdef USE_ABS + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + andpd %xmm7, %xmm3 + andpd %xmm7, %xmm4 +#endif + incl RET + comisd %xmm0, %xmm1 + je .L999 + incl RET + comisd %xmm0, %xmm2 + je .L999 + incl RET + comisd %xmm0, %xmm3 + je .L999 + incl RET + comisd %xmm0, %xmm4 + je .L999 + ALIGN_3 + +.L97: + testl $2, MM + je .L98 + + movsd 0 * SIZE(XX), %xmm1 + addl INCX, XX + movsd 0 * SIZE(XX), %xmm2 + addl INCX, XX +#ifdef USE_ABS + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 +#endif + incl RET + comisd %xmm0, %xmm1 + je .L999 + incl RET + comisd %xmm0, %xmm2 + je .L999 + ALIGN_3 + +.L98: + incl RET + ALIGN_3 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/izamax.S b/kernel/x86/izamax.S new file mode 100644 index 0000000000..63bcaef145 --- /dev/null +++ b/kernel/x86/izamax.S @@ -0,0 +1,289 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) + + PROLOGUE + +#define M %ebx +#define INCX %esi +#define X %ecx +#define I %edx +#define NUM %edi +#define RET %eax + +#ifndef USE_MIN +#define FMOV fcmovbe +#define IMOV cmovnbe +#else +#define FMOV fcmovnb +#define IMOV cmovb +#endif + +#include "l1param.h" + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_M, M + movl STACK_INCX, INCX + movl STACK_X, X + +#ifdef F_INTERFACE + movl (M), M + movl (INCX), INCX +#endif + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + sall $ZBASE_SHIFT, INCX + + fldz + xorl RET, RET + + testl M, M + jle .L999 + testl INCX, INCX + jle .L999 + + fstp %st(0) + movl $2, NUM + movl $1, RET + + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + faddp %st, %st(1) + addl INCX, X + decl M + jle .L999 + + cmpl $2 * SIZE, INCX + jne .L40 + + movl M, I + sarl $2, I + jle .L20 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + faddp %st, %st(1) + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + incl NUM + + FLD 2 * SIZE(X) + fabs + FLD 3 * SIZE(X) + fabs + faddp %st, %st(1) + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + incl NUM + + FLD 4 * SIZE(X) + fabs + FLD 5 * SIZE(X) + fabs + faddp %st, %st(1) + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + incl NUM + + FLD 6 * SIZE(X) + fabs + FLD 7 * SIZE(X) + fabs + faddp %st, %st(1) + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + incl NUM + + addl $8 * SIZE, X + + decl I + jg .L10 + ALIGN_4 + +.L20: + movl M, I + andl $3, I + jle .L999 + ALIGN_4 + +.L21: + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + faddp %st, %st(1) + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + incl NUM + + addl $2 * SIZE, X + decl I + jg .L21 + jmp .L999 + ALIGN_4 + +.L40: + movl M, I + sarl $2, I + jle .L60 + ALIGN_4 + +.L50: + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + addl INCX, X + faddp %st, %st(1) + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + incl NUM + + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + addl INCX, X + faddp %st, %st(1) + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + incl NUM + + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + addl INCX, X + faddp %st, %st(1) + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + incl NUM + + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + addl INCX, X + faddp %st, %st(1) + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + incl NUM + + decl I + jg .L50 + ALIGN_4 + +.L60: + movl M, I + andl $3, I + jle .L999 + ALIGN_4 + + +.L61: + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + faddp %st, %st(1) + fcomi %st(1), %st + FMOV %st(1), %st(0) + IMOV NUM, RET + fstp %st(1) + incl NUM + + addl INCX, X + decl I + jg .L61 + ALIGN_4 + +.L999: + fstp %st(0) + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/izamax_sse.S b/kernel/x86/izamax_sse.S new file mode 100644 index 0000000000..95223fe56f --- /dev/null +++ b/kernel/x86/izamax_sse.S @@ -0,0 +1,596 @@ 
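The complex i*amax kernels in this import all return the 1-based index of the first element whose |Re| + |Im| is extremal — largest by default, smallest when built with USE_MIN. The x87 version above tracks the winning index as it scans; the SSE/SSE2 versions below work in two passes, first reducing to the extreme value and then rescanning for the first index that attains it. A minimal scalar sketch of that contract follows, for orientation only; ref_icamax and the FLOAT typedef are illustrative placeholders, not part of the imported sources.

#include <math.h>

typedef float FLOAT;

/* Reference for the complex "index of extreme |Re| + |Im|" kernels:
   two passes, mirroring the SSE code below; incx counts complex elements. */
static long ref_icamax(long n, const FLOAT *x, long incx)
{
  if (n <= 0 || incx <= 0) return 0;

  /* pass 1: find the extreme value of |Re| + |Im| */
  FLOAT best = fabsf(x[0]) + fabsf(x[1]);
  const FLOAT *p = x + 2 * incx;
  for (long i = 1; i < n; i++, p += 2 * incx) {
    FLOAT v = fabsf(p[0]) + fabsf(p[1]);
    if (v > best) best = v;      /* "<" here gives the USE_MIN variant */
  }

  /* pass 2: return the first 1-based index that attains it */
  p = x;
  for (long i = 0; i < n; i++, p += 2 * incx) {
    if (fabsf(p[0]) + fabsf(p[1]) == best) return i + 1;
  }
  return n;                      /* not reached: some element always matches */
}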
+/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) + +#define RET %eax +#define M %ebx +#define X %ecx +#define INCX %edx +#define I %esi +#define MM %ebp +#define XX %edi +#define TEMP %ebx + +#ifdef USE_MIN +#define maxps minps +#define maxss minss +#endif + +#ifndef HAVE_SSE2 +#define pxor xorps +#define movsd movlps +#endif + +#include "l1param.h" + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + +#ifdef F_INTERFACE + movl (M), M + movl (INCX), INCX +#endif + + pxor %xmm0, %xmm0 + pxor %xmm7, %xmm7 + xor RET, RET + testl M, M + jle .L999 + testl INCX, INCX + jle .L999 + + sall $ZBASE_SHIFT, INCX + movl M, MM + movl X, XX + +#ifdef USE_ABS +#ifndef HAVE_SSE2 + subl $8, %esp + movl $0x7fffffff, (%esp) + movss (%esp), %xmm7 + shufps $0, %xmm7, %xmm7 + addl $8, %esp +#else + cmpeqps %xmm7, %xmm7 + psrld $1, %xmm7 +#endif +#endif + + movss 0 * SIZE(XX), %xmm0 + movss 1 * SIZE(XX), %xmm1 + addl INCX, XX + decl MM + andps %xmm7, %xmm0 + andps %xmm7, %xmm1 + addps %xmm1, %xmm0 + shufps $0, %xmm0, %xmm0 + cmpl $2 * SIZE, INCX + jne .L70 + +.L30: + movl MM, I + sarl $3, I + jle .L35 + ALIGN_4 + +.L31: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) +#endif + + movsd 0 * SIZE(XX), %xmm1 + movhps 2 * SIZE(XX), %xmm1 + movsd 4 * SIZE(XX), %xmm2 + movhps 6 * SIZE(XX), %xmm2 + + movaps %xmm1, %xmm3 + + shufps $0x88, %xmm2, %xmm1 + shufps $0xdd, %xmm2, %xmm3 + + andps %xmm7, %xmm1 + andps %xmm7, %xmm3 + addps %xmm3, %xmm1 + maxps %xmm1, %xmm0 + + movsd 8 * SIZE(XX), %xmm1 + movhps 10 * SIZE(XX), %xmm1 + movsd 12 * SIZE(XX), %xmm2 + movhps 14 * SIZE(XX), %xmm2 + + movaps %xmm1, %xmm3 + + shufps $0x88, %xmm2, %xmm1 + shufps $0xdd, %xmm2, %xmm3 + + andps %xmm7, %xmm1 + andps %xmm7, %xmm3 + addps %xmm3, %xmm1 + maxps %xmm1, %xmm0 + + addl $16 * SIZE, XX + decl I + jg .L31 + ALIGN_4 + +.L35: + andl $7, MM + jle .L40 + + testl $4, MM + je .L36 + + movsd 0 * SIZE(XX), %xmm1 + movhps 2 * SIZE(XX), %xmm1 + movsd 4 * SIZE(XX), %xmm2 + movhps 6 * SIZE(XX), %xmm2 + + movaps %xmm1, %xmm3 + + shufps $0x88, %xmm2, %xmm1 + shufps $0xdd, %xmm2, %xmm3 + + andps %xmm7, %xmm1 + andps %xmm7, %xmm3 + addps %xmm3, %xmm1 + maxps %xmm1, %xmm0 + + addl $8 * SIZE, XX + ALIGN_3 + +.L36: + testl $2, MM + je .L37 + + movss 0 * SIZE(XX), %xmm1 + movss 1 * SIZE(XX), %xmm2 + movss 2 * SIZE(XX), %xmm3 + movss 3 * SIZE(XX), %xmm4 + andps %xmm7, %xmm1 + andps %xmm7, %xmm2 + andps %xmm7, %xmm3 + andps %xmm7, %xmm4 + addps %xmm2, %xmm1 + addps %xmm4, %xmm3 + maxss %xmm1, %xmm0 + maxss %xmm3, %xmm0 + addl $4 * SIZE, XX + ALIGN_3 + +.L37: + testl $1, MM + je .L40 + + movss 0 * SIZE(XX), %xmm1 + movss 1 * SIZE(XX), %xmm2 + andps %xmm7, %xmm1 + andps %xmm7, %xmm2 + addps %xmm2, %xmm1 + maxss %xmm1, %xmm0 + ALIGN_4 + +.L40: + movl X, XX + movl M, MM + + movaps %xmm0, %xmm1 + movhlps %xmm0, %xmm0 + maxps %xmm1, %xmm0 + movaps %xmm0, %xmm1 + shufps $1, %xmm0, %xmm0 + maxss %xmm1, %xmm0 + shufps $0, %xmm0, %xmm0 + + movl MM, I + sarl $2, I + jle .L45 + ALIGN_4 + +.L41: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) +#endif + + movsd 0 * SIZE(XX), %xmm1 + movhps 2 * SIZE(XX), %xmm1 + movsd 4 * SIZE(XX), %xmm2 + movhps 6 * SIZE(XX), %xmm2 + + movaps %xmm1, %xmm3 + + shufps $0x88, %xmm2, %xmm1 + shufps 
$0xdd, %xmm2, %xmm3 + + andps %xmm7, %xmm1 + andps %xmm7, %xmm3 + addps %xmm3, %xmm1 + + cmpeqps %xmm0, %xmm1 + movmskps %xmm1, TEMP + testl $15, TEMP + jne .L43 + + addl $8 * SIZE, XX + addl $4, RET + decl I + jg .L41 + jmp .L45 + ALIGN_4 + +.L43: + movss 0 * SIZE(XX), %xmm1 + movss 1 * SIZE(XX), %xmm2 + movss 2 * SIZE(XX), %xmm3 + movss 3 * SIZE(XX), %xmm4 + + andps %xmm7, %xmm1 + andps %xmm7, %xmm2 + andps %xmm7, %xmm3 + andps %xmm7, %xmm4 + + addps %xmm2, %xmm1 + addps %xmm4, %xmm3 + + incl RET + comiss %xmm0, %xmm1 + je .L999 + incl RET + comiss %xmm0, %xmm3 + je .L999 + + movss 4 * SIZE(XX), %xmm1 + movss 5 * SIZE(XX), %xmm2 + movss 6 * SIZE(XX), %xmm3 + movss 7 * SIZE(XX), %xmm4 + + andps %xmm7, %xmm1 + andps %xmm7, %xmm2 + andps %xmm7, %xmm3 + andps %xmm7, %xmm4 + + addps %xmm2, %xmm1 + addps %xmm4, %xmm3 + + addl $8 * SIZE, XX + + incl RET + comiss %xmm0, %xmm1 + je .L999 + incl RET + comiss %xmm0, %xmm3 + je .L999 + ALIGN_3 + +.L45: + testl $2, MM + je .L47 + + movss 0 * SIZE(XX), %xmm1 + movss 1 * SIZE(XX), %xmm2 + movss 2 * SIZE(XX), %xmm3 + movss 3 * SIZE(XX), %xmm4 + addl $4 * SIZE, XX + + andps %xmm7, %xmm1 + andps %xmm7, %xmm2 + andps %xmm7, %xmm3 + andps %xmm7, %xmm4 + addps %xmm2, %xmm1 + addps %xmm4, %xmm3 + + incl RET + comiss %xmm0, %xmm1 + je .L999 + incl RET + comiss %xmm0, %xmm3 + je .L999 + ALIGN_3 + +.L47: + incl RET + jmp .L999 + ALIGN_3 + +.L70: + movl MM, I + sarl $3, I + jle .L75 + ALIGN_4 + +.L71: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) +#endif + + movsd 0 * SIZE(XX), %xmm1 + addl INCX, XX + movhps 0 * SIZE(XX), %xmm1 + addl INCX, XX + movsd 0 * SIZE(XX), %xmm2 + addl INCX, XX + movhps 0 * SIZE(XX), %xmm2 + addl INCX, XX + + movaps %xmm1, %xmm3 + + shufps $0x88, %xmm2, %xmm1 + shufps $0xdd, %xmm2, %xmm3 + + andps %xmm7, %xmm1 + andps %xmm7, %xmm3 + addps %xmm3, %xmm1 + maxps %xmm1, %xmm0 + + movsd 0 * SIZE(XX), %xmm1 + addl INCX, XX + movhps 0 * SIZE(XX), %xmm1 + addl INCX, XX + movsd 0 * SIZE(XX), %xmm2 + addl INCX, XX + movhps 0 * SIZE(XX), %xmm2 + addl INCX, XX + + movaps %xmm1, %xmm3 + + shufps $0x88, %xmm2, %xmm1 + shufps $0xdd, %xmm2, %xmm3 + + andps %xmm7, %xmm1 + andps %xmm7, %xmm3 + addps %xmm3, %xmm1 + maxps %xmm1, %xmm0 + decl I + jg .L71 + ALIGN_4 + +.L75: + andl $7, MM + jle .L80 + + testl $4, MM + je .L76 + + movsd 0 * SIZE(XX), %xmm1 + addl INCX, XX + movhps 0 * SIZE(XX), %xmm1 + addl INCX, XX + movsd 0 * SIZE(XX), %xmm2 + addl INCX, XX + movhps 0 * SIZE(XX), %xmm2 + addl INCX, XX + + movaps %xmm1, %xmm3 + + shufps $0x88, %xmm2, %xmm1 + shufps $0xdd, %xmm2, %xmm3 + + andps %xmm7, %xmm1 + andps %xmm7, %xmm3 + addps %xmm3, %xmm1 + maxps %xmm1, %xmm0 + ALIGN_3 + +.L76: + testl $2, MM + je .L77 + + movss 0 * SIZE(XX), %xmm1 + movss 1 * SIZE(XX), %xmm2 + addl INCX, XX + movss 0 * SIZE(XX), %xmm3 + movss 1 * SIZE(XX), %xmm4 + addl INCX, XX + andps %xmm7, %xmm1 + andps %xmm7, %xmm2 + andps %xmm7, %xmm3 + andps %xmm7, %xmm4 + addps %xmm2, %xmm1 + addps %xmm4, %xmm3 + maxss %xmm1, %xmm0 + maxss %xmm3, %xmm0 + ALIGN_3 + +.L77: + testl $1, MM + je .L80 + + movss 0 * SIZE(XX), %xmm1 + movss 1 * SIZE(XX), %xmm2 + andps %xmm7, %xmm1 + andps %xmm7, %xmm2 + addps %xmm2, %xmm1 + maxss %xmm1, %xmm0 + ALIGN_4 + +.L80: + movl X, XX + movl M, MM + + movaps %xmm0, %xmm1 + movhlps %xmm0, %xmm0 + maxps %xmm1, %xmm0 + movaps %xmm0, %xmm1 + shufps $1, %xmm0, %xmm0 + maxss %xmm1, %xmm0 + shufps $0, %xmm0, %xmm0 + + movl MM, I + sarl $2, I + jle .L85 + ALIGN_4 + +.L81: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) +#endif + + movsd 
0 * SIZE(XX), %xmm1 + addl INCX, XX + movhps 0 * SIZE(XX), %xmm1 + addl INCX, XX + movsd 0 * SIZE(XX), %xmm2 + addl INCX, XX + movhps 0 * SIZE(XX), %xmm2 + addl INCX, XX + + movaps %xmm1, %xmm3 + + shufps $0x88, %xmm2, %xmm1 + shufps $0xdd, %xmm2, %xmm3 + + andps %xmm7, %xmm1 + andps %xmm7, %xmm3 + addps %xmm3, %xmm1 + + cmpeqps %xmm0, %xmm1 + movmskps %xmm1, TEMP + testl $15, TEMP + jne .L83 + + addl $4, RET + decl I + jg .L81 + jmp .L85 + ALIGN_4 + +.L83: + leal (, INCX, 4), TEMP + subl TEMP, XX + + movss 0 * SIZE(XX), %xmm1 + movss 1 * SIZE(XX), %xmm2 + addl INCX, XX + movss 0 * SIZE(XX), %xmm3 + movss 1 * SIZE(XX), %xmm4 + addl INCX, XX + + andps %xmm7, %xmm1 + andps %xmm7, %xmm2 + andps %xmm7, %xmm3 + andps %xmm7, %xmm4 + + addps %xmm2, %xmm1 + addps %xmm4, %xmm3 + + incl RET + comiss %xmm0, %xmm1 + je .L999 + incl RET + comiss %xmm0, %xmm3 + je .L999 + + movss 0 * SIZE(XX), %xmm1 + movss 1 * SIZE(XX), %xmm2 + addl INCX, XX + movss 0 * SIZE(XX), %xmm3 + movss 1 * SIZE(XX), %xmm4 + addl INCX, XX + + andps %xmm7, %xmm1 + andps %xmm7, %xmm2 + andps %xmm7, %xmm3 + andps %xmm7, %xmm4 + + addps %xmm2, %xmm1 + addps %xmm4, %xmm3 + + incl RET + comiss %xmm0, %xmm1 + je .L999 + incl RET + comiss %xmm0, %xmm3 + je .L999 + ALIGN_3 + +.L85: + testl $2, MM + je .L87 + + movss 0 * SIZE(XX), %xmm1 + movss 1 * SIZE(XX), %xmm2 + addl INCX, XX + movss 0 * SIZE(XX), %xmm3 + movss 1 * SIZE(XX), %xmm4 + addl INCX, XX + + andps %xmm7, %xmm1 + andps %xmm7, %xmm2 + andps %xmm7, %xmm3 + andps %xmm7, %xmm4 + addps %xmm2, %xmm1 + addps %xmm4, %xmm3 + + incl RET + comiss %xmm0, %xmm1 + je .L999 + incl RET + comiss %xmm0, %xmm3 + je .L999 + ALIGN_3 + +.L87: + incl RET + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/izamax_sse2.S b/kernel/x86/izamax_sse2.S new file mode 100644 index 0000000000..0392e1d2ee --- /dev/null +++ b/kernel/x86/izamax_sse2.S @@ -0,0 +1,619 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) + +#define RET %eax +#define M %ebx +#define X %ecx +#define INCX %edx +#define I %esi +#define MM %ebp +#define XX %edi +#define TEMP %ebx + +#ifdef USE_MIN +#define maxpd minpd +#define maxsd minsd +#endif + +#include "l1param.h" + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + +#ifdef F_INTERFACE + movl (M), M + movl (INCX), INCX +#endif + + pxor %xmm0, %xmm0 + pxor %xmm7, %xmm7 + xor RET, RET + testl M, M + jle .L999 + testl INCX, INCX + jle .L999 + + sall $ZBASE_SHIFT, INCX + movl M, MM + movl X, XX + + cmpeqpd %xmm7, %xmm7 + psrlq $1, %xmm7 + + movsd 0 * SIZE(XX), %xmm0 + movsd 1 * SIZE(XX), %xmm1 + addl INCX, XX + decl MM + andpd %xmm7, %xmm0 + andpd %xmm7, %xmm1 + addpd %xmm1, %xmm0 + unpcklpd %xmm0, %xmm0 + cmpl $2 * SIZE, INCX + jne .L60 + + movl MM, I + sarl $3, I + jle .L25 + ALIGN_4 + +.L21: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) +#endif + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + movhpd 2 * SIZE(XX), %xmm1 + movhpd 3 * SIZE(XX), %xmm2 + + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + addpd %xmm2, %xmm1 + maxpd %xmm1, %xmm0 + + movsd 4 * SIZE(XX), %xmm3 + movsd 5 * SIZE(XX), %xmm4 + movhpd 6 * SIZE(XX), %xmm3 + movhpd 7 * SIZE(XX), %xmm4 + + andpd %xmm7, %xmm3 + andpd %xmm7, %xmm4 + addpd %xmm4, %xmm3 + maxpd %xmm3, %xmm0 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(XX) +#endif + + movsd 8 * SIZE(XX), %xmm1 + movsd 9 * SIZE(XX), %xmm2 + movhpd 10 * SIZE(XX), %xmm1 + movhpd 11 * SIZE(XX), %xmm2 + + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + addpd %xmm2, %xmm1 + maxpd %xmm1, %xmm0 + + movsd 12 * SIZE(XX), %xmm3 + movsd 13 * SIZE(XX), %xmm4 + movhpd 14 * SIZE(XX), %xmm3 + movhpd 15 * SIZE(XX), %xmm4 + + andpd %xmm7, %xmm3 + andpd %xmm7, %xmm4 + addpd %xmm4, %xmm3 + maxpd %xmm3, %xmm0 + + addl $16 * SIZE, XX + decl I + jg .L21 + ALIGN_4 + +.L25: + andl $7, MM + jle .L30 + + testl $4, MM + je .L26 + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + movhpd 2 * SIZE(XX), %xmm1 + movhpd 3 * SIZE(XX), %xmm2 + + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + addpd %xmm2, %xmm1 + maxpd %xmm1, %xmm0 + + movsd 4 * SIZE(XX), %xmm3 + movsd 5 * SIZE(XX), %xmm4 + movhpd 6 * SIZE(XX), %xmm3 + movhpd 7 * SIZE(XX), %xmm4 + + andpd %xmm7, %xmm3 + andpd %xmm7, %xmm4 + addpd %xmm4, %xmm3 + maxpd %xmm3, %xmm0 + addl $8 * SIZE, XX + ALIGN_3 + +.L26: + testl $2, MM + je .L27 + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + movhpd 2 * SIZE(XX), %xmm1 + movhpd 3 * SIZE(XX), %xmm2 + + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + addpd %xmm2, %xmm1 + maxpd %xmm1, %xmm0 + + addl $4 * SIZE, XX + ALIGN_3 + +.L27: + testl $1, MM + je .L30 + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + addpd %xmm2, %xmm1 + maxsd %xmm1, %xmm0 + ALIGN_4 + +.L30: + movl X, XX + movl M, MM + + movapd %xmm0, %xmm1 + unpckhpd %xmm0, %xmm0 + maxsd %xmm1, %xmm0 + 
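+	/* broadcast the reduced extreme of |Re| + |Im| into both lanes for the index-matching pass below */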
unpcklpd %xmm0, %xmm0 + + movl MM, I + sarl $2, I + jle .L35 + ALIGN_4 + +.L31: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + movhpd 2 * SIZE(XX), %xmm1 + movhpd 3 * SIZE(XX), %xmm2 + movsd 4 * SIZE(XX), %xmm3 + movsd 5 * SIZE(XX), %xmm4 + movhpd 6 * SIZE(XX), %xmm3 + movhpd 7 * SIZE(XX), %xmm4 + + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + andpd %xmm7, %xmm3 + andpd %xmm7, %xmm4 + + addpd %xmm2, %xmm1 + addpd %xmm4, %xmm3 + + cmpeqpd %xmm0, %xmm1 + cmpeqpd %xmm0, %xmm3 + + orpd %xmm3, %xmm1 + movmskpd %xmm1, TEMP + testl $3, TEMP + jne .L33 + + addl $8 * SIZE, XX + addl $4, RET + decl I + jg .L31 + jmp .L35 + ALIGN_4 + +.L33: + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + movsd 2 * SIZE(XX), %xmm3 + movsd 3 * SIZE(XX), %xmm4 + + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + andpd %xmm7, %xmm3 + andpd %xmm7, %xmm4 + + addpd %xmm2, %xmm1 + addpd %xmm4, %xmm3 + + incl RET + comisd %xmm0, %xmm1 + je .L999 + incl RET + comisd %xmm0, %xmm3 + je .L999 + + movsd 4 * SIZE(XX), %xmm1 + movsd 5 * SIZE(XX), %xmm2 + movsd 6 * SIZE(XX), %xmm3 + movsd 7 * SIZE(XX), %xmm4 + addl $8 * SIZE, XX + + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + andpd %xmm7, %xmm3 + andpd %xmm7, %xmm4 + + addpd %xmm2, %xmm1 + addpd %xmm4, %xmm3 + + incl RET + comisd %xmm0, %xmm1 + je .L999 + incl RET + comisd %xmm0, %xmm3 + je .L999 + ALIGN_3 + +.L35: + testl $2, MM + je .L36 + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + movsd 2 * SIZE(XX), %xmm3 + movsd 3 * SIZE(XX), %xmm4 + addl $4 * SIZE, XX + + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + andpd %xmm7, %xmm3 + andpd %xmm7, %xmm4 + + addpd %xmm2, %xmm1 + addpd %xmm4, %xmm3 + + incl RET + comisd %xmm0, %xmm1 + je .L999 + incl RET + comisd %xmm0, %xmm3 + je .L999 + ALIGN_3 + +.L36: + incl RET + jmp .L999 + ALIGN_3 + +.L60: + movl MM, I + sarl $3, I + jle .L65 + ALIGN_4 + +.L61: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm1 + movhpd 1 * SIZE(XX), %xmm2 + addl INCX, XX + + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + addpd %xmm2, %xmm1 + maxpd %xmm1, %xmm0 + + movsd 0 * SIZE(XX), %xmm3 + movsd 1 * SIZE(XX), %xmm4 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm3 + movhpd 1 * SIZE(XX), %xmm4 + addl INCX, XX + + andpd %xmm7, %xmm3 + andpd %xmm7, %xmm4 + addpd %xmm4, %xmm3 + maxpd %xmm3, %xmm0 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm1 + movhpd 1 * SIZE(XX), %xmm2 + addl INCX, XX + + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + addpd %xmm2, %xmm1 + maxpd %xmm1, %xmm0 + + movsd 0 * SIZE(XX), %xmm3 + movsd 1 * SIZE(XX), %xmm4 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm3 + movhpd 1 * SIZE(XX), %xmm4 + addl INCX, XX + + andpd %xmm7, %xmm3 + andpd %xmm7, %xmm4 + addpd %xmm4, %xmm3 + maxpd %xmm3, %xmm0 + + decl I + jg .L61 + ALIGN_4 + +.L65: + andl $7, MM + jle .L70 + + testl $4, MM + je .L66 + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm1 + movhpd 1 * SIZE(XX), %xmm2 + addl INCX, XX + + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + addpd %xmm2, %xmm1 + maxpd %xmm1, %xmm0 + + movsd 0 * SIZE(XX), %xmm3 + movsd 1 * SIZE(XX), %xmm4 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm3 + movhpd 1 * SIZE(XX), %xmm4 + addl INCX, XX + + andpd %xmm7, %xmm3 + andpd %xmm7, %xmm4 + addpd 
%xmm4, %xmm3 + maxpd %xmm3, %xmm0 + ALIGN_3 + +.L66: + testl $2, MM + je .L67 + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm1 + movhpd 1 * SIZE(XX), %xmm2 + addl INCX, XX + + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + addpd %xmm2, %xmm1 + maxpd %xmm1, %xmm0 + ALIGN_3 + +.L67: + testl $1, MM + je .L70 + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + addpd %xmm2, %xmm1 + maxsd %xmm1, %xmm0 + ALIGN_3 + +.L70: + movl X, XX + movl M, MM + + movapd %xmm0, %xmm1 + unpckhpd %xmm0, %xmm0 + maxsd %xmm1, %xmm0 + unpcklpd %xmm0, %xmm0 + + movl MM, I + sarl $2, I + jle .L75 + ALIGN_4 + +.L71: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm1 + movhpd 1 * SIZE(XX), %xmm2 + addl INCX, XX + movsd 0 * SIZE(XX), %xmm3 + movsd 1 * SIZE(XX), %xmm4 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm3 + movhpd 1 * SIZE(XX), %xmm4 + addl INCX, XX + + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + andpd %xmm7, %xmm3 + andpd %xmm7, %xmm4 + + addpd %xmm2, %xmm1 + addpd %xmm4, %xmm3 + + cmpeqpd %xmm0, %xmm1 + cmpeqpd %xmm0, %xmm3 + + orpd %xmm3, %xmm1 + movmskpd %xmm1, TEMP + testl $3, TEMP + jne .L73 + + addl $4, RET + decl I + jg .L71 + jmp .L75 + ALIGN_4 + +.L73: + leal (, INCX, 4), TEMP + subl TEMP, XX + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + addl INCX, XX + movsd 0 * SIZE(XX), %xmm3 + movsd 1 * SIZE(XX), %xmm4 + addl INCX, XX + + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + andpd %xmm7, %xmm3 + andpd %xmm7, %xmm4 + + addpd %xmm2, %xmm1 + addpd %xmm4, %xmm3 + + incl RET + comisd %xmm0, %xmm1 + je .L999 + incl RET + comisd %xmm0, %xmm3 + je .L999 + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + addl INCX, XX + movsd 0 * SIZE(XX), %xmm3 + movsd 1 * SIZE(XX), %xmm4 + addl INCX, XX + + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + andpd %xmm7, %xmm3 + andpd %xmm7, %xmm4 + + addpd %xmm2, %xmm1 + addpd %xmm4, %xmm3 + + incl RET + comisd %xmm0, %xmm1 + je .L999 + incl RET + comisd %xmm0, %xmm3 + je .L999 + ALIGN_3 + +.L75: + testl $2, MM + je .L76 + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + addl INCX, XX + movsd 0 * SIZE(XX), %xmm3 + movsd 1 * SIZE(XX), %xmm4 + addl INCX, XX + + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + andpd %xmm7, %xmm3 + andpd %xmm7, %xmm4 + + addpd %xmm2, %xmm1 + addpd %xmm4, %xmm3 + incl RET + comisd %xmm0, %xmm1 + je .L999 + incl RET + comisd %xmm0, %xmm3 + je .L999 + ALIGN_3 + +.L76: + incl RET + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/lsame.S b/kernel/x86/lsame.S new file mode 100644 index 0000000000..d7e48ad237 --- /dev/null +++ b/kernel/x86/lsame.S @@ -0,0 +1,90 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + + PROLOGUE + PROFCODE + + movl 4(%esp), %eax + movl 8(%esp), %edx + movb (%eax), %al # a = *A + movb (%edx), %dl # b = *B + + andl $255, %eax + andl $255, %edx + + subl $65, %eax + subl $65, %edx + +#ifndef HAVE_CMOV + movl %eax, %ecx + subl $32, %ecx + jle .L1 + movl %ecx, %eax +.L1: + + movl %edx, %ecx + subl $32, %ecx + jle .L2 + movl %ecx, %edx +.L2: + subl %eax, %edx + movl $0, %eax + movl $1, %edx + jne .L3 + movl %edx, %eax +.L3: +#else + movl %eax, %ecx + subl $32, %ecx + cmovg %ecx, %eax + + movl %edx, %ecx + subl $32, %ecx + cmovg %ecx, %edx + + subl %eax, %edx + movl $0, %eax + movl $1, %edx + cmove %edx, %eax +#endif + ret + + EPILOGUE diff --git a/kernel/x86/nrm2.S b/kernel/x86/nrm2.S new file mode 100644 index 0000000000..c0982496ac --- /dev/null +++ b/kernel/x86/nrm2.S @@ -0,0 +1,226 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 8 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) + +#define M %edx +#define X %ecx +#define INCX %esi + +#define I %eax + +#include "l1param.h" + + PROLOGUE + + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + +#ifdef F_INTERFACE + movl (M), M + movl (INCX), INCX +#endif + + fldz + testl M, M + jle .L999 + testl INCX, INCX + jle .L999 + + sall $BASE_SHIFT, INCX + fldz + fldz + fldz + cmpl $SIZE, INCX + jne .L40 + + movl M, I + sarl $3, I + jle .L20 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) + fmul %st(0), %st + FLD 1 * SIZE(X) + fmul %st(0), %st + FLD 2 * SIZE(X) + fmul %st(0), %st + FLD 3 * SIZE(X) + fmul %st(0), %st + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD 4 * SIZE(X) + fmul %st(0), %st + FLD 5 * SIZE(X) + fmul %st(0), %st + FLD 6 * SIZE(X) + fmul %st(0), %st + FLD 7 * SIZE(X) + fmul %st(0), %st + + addl $8 * SIZE, X + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decl I + jg .L10 + ALIGN_4 + +.L20: + movl M, I + andl $7, I + jle .L998 + ALIGN_4 + + +.L21: + FLD (X) + fmul %st(0), %st + faddp %st,%st(1) + addl $1 * SIZE, X + decl I + jg .L21 + jmp .L998 + ALIGN_4 + +.L40: + movl M, I + sarl $3, I + jle .L60 + ALIGN_4 + +.L50: + FLD (X) + addl INCX, X + fmul %st(0), %st + FLD (X) + addl INCX, X + fmul %st(0), %st + FLD (X) + addl INCX, X + fmul %st(0), %st + FLD (X) + addl INCX, X + fmul %st(0), %st + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD (X) + addl INCX, X + fmul %st(0), %st + FLD (X) + addl INCX, X + fmul %st(0), %st + FLD (X) + addl INCX, X + fmul %st(0), %st + FLD (X) + addl INCX, X + fmul %st(0), %st + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decl I + jg .L50 + ALIGN_4 + +.L60: + movl M, I + andl $7, I + jle .L998 + ALIGN_4 + + +.L61: + FLD (X) + addl INCX, X + fmul %st(0), %st + faddp %st,%st(1) + decl I + jg .L61 + ALIGN_4 + +.L998: + faddp %st,%st(2) + faddp %st,%st(1) + faddp %st,%st(1) + ALIGN_4 + +.L999: + fsqrt + popl %ebx + popl %esi + ret + + EPILOGUE diff --git a/kernel/x86/nrm2_sse.S b/kernel/x86/nrm2_sse.S new file mode 100644 index 0000000000..e70460912f --- /dev/null +++ b/kernel/x86/nrm2_sse.S @@ -0,0 +1,418 @@ 
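The x87 nrm2 kernel that ends just above and the SSE kernel this hunk introduces compute the same quantity, sqrt(sum of x[i]^2): both keep several independent partial sums (four x87 stack slots in nrm2.S, two xmm registers of packed doubles in nrm2_sse.S) and combine them only once before the square root, and the single-precision SSE version widens every element to double (cvtss2sd / cvtps2pd) before squaring, narrowing the result back to float at the very end. A minimal C sketch of that computation, for orientation only — the names are invented, it is written in plain double, and, like the kernels, it does no scaling against overflow or underflow:

    #include <math.h>

    /* Illustrative reference for the nrm2 kernels: several independent
     * partial sums of x[i]^2 (the assembly unrolls further), one sqrt at
     * the end, no overflow/underflow scaling. */
    double nrm2_sketch(int n, const double *x, int incx)
    {
        double s0 = 0.0, s1 = 0.0, s2 = 0.0, s3 = 0.0;
        int i = 0;

        if (n <= 0 || incx <= 0)
            return 0.0;                      /* the kernels also return 0 here */

        if (incx == 1) {
            for (; i + 4 <= n; i += 4) {     /* unrolled contiguous path */
                s0 += x[i + 0] * x[i + 0];
                s1 += x[i + 1] * x[i + 1];
                s2 += x[i + 2] * x[i + 2];
                s3 += x[i + 3] * x[i + 3];
            }
        }
        for (; i < n; i++)                   /* remainder and strided path */
            s0 += x[i * incx] * x[i * incx];

        return sqrt(s0 + s1 + s2 + s3);
    }

Splitting the accumulation across independent registers is what lets the hardware overlap the long-latency additions; the final reduction happens once, in .L998 of either kernel.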
+/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 8 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) + +#define M %edx +#define X %ecx +#define INCX %esi + +#define I %eax + +#include "l1param.h" + + PROLOGUE + PROFCODE + + pushl %esi + pushl %ebx + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + + pxor %xmm0, %xmm0 + testl M, M + jle .L999 + pxor %xmm1, %xmm1 + testl INCX, INCX + jle .L999 + + leal (, INCX, SIZE), INCX + cmpl $SIZE, INCX + jne .L40 + + subl $-32 * SIZE, X + + testl $SIZE, X + je .L05 + + movss -32 * SIZE(X), %xmm0 + cvtss2sd %xmm0, %xmm0 + mulsd %xmm0, %xmm0 + + addl INCX, X + decl M + jle .L998 + ALIGN_3 + +.L05: + movl M, I + sarl $4, I + jle .L13 + + movsd -32 * SIZE(X), %xmm4 + movsd -30 * SIZE(X), %xmm5 + movsd -28 * SIZE(X), %xmm6 + movsd -26 * SIZE(X), %xmm7 + + decl I + jle .L12 + ALIGN_3 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + cvtps2pd %xmm4, %xmm2 + movsd -24 * SIZE(X), %xmm4 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + cvtps2pd %xmm5, %xmm3 + movsd -22 * SIZE(X), %xmm5 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + + cvtps2pd %xmm6, %xmm2 + movsd -20 * SIZE(X), %xmm6 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + cvtps2pd %xmm7, %xmm3 + movsd -18 * SIZE(X), %xmm7 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + + cvtps2pd %xmm4, %xmm2 + movsd -16 * SIZE(X), %xmm4 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + cvtps2pd %xmm5, %xmm3 + movsd -14 * SIZE(X), %xmm5 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + + cvtps2pd %xmm6, %xmm2 + movsd -12 * SIZE(X), %xmm6 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + cvtps2pd %xmm7, %xmm3 + movsd -10 * SIZE(X), %xmm7 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + + subl $-16 * SIZE, X + decl I + jg .L10 + ALIGN_3 + +.L12: + cvtps2pd %xmm4, %xmm2 + movsd -24 * SIZE(X), %xmm4 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + cvtps2pd %xmm5, %xmm3 + movsd -22 * SIZE(X), %xmm5 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + + cvtps2pd %xmm6, %xmm2 + movsd -20 * SIZE(X), %xmm6 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + cvtps2pd %xmm7, %xmm3 + movsd -18 * SIZE(X), %xmm7 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + + cvtps2pd %xmm4, %xmm2 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + cvtps2pd %xmm5, %xmm3 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + + cvtps2pd %xmm6, %xmm2 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + cvtps2pd %xmm7, %xmm3 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + + subl $-16 * SIZE, X + ALIGN_4 + +.L13: + testl $8, M + je .L14 + + movsd -32 * SIZE(X), %xmm4 + + cvtps2pd %xmm4, %xmm2 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + movsd -30 * SIZE(X), %xmm5 + + cvtps2pd %xmm5, %xmm3 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + + movsd -28 * SIZE(X), %xmm6 + + cvtps2pd %xmm6, %xmm2 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + movsd -26 * SIZE(X), %xmm7 + + cvtps2pd %xmm7, %xmm3 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + + addl $8 * SIZE, X + ALIGN_3 + +.L14: + testl $4, M + je .L15 + + movsd -32 * SIZE(X), %xmm4 + cvtps2pd %xmm4, %xmm2 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + movsd -30 * SIZE(X), %xmm5 + cvtps2pd %xmm5, %xmm3 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + addl $4 * SIZE, X + ALIGN_3 + +.L15: + testl $2, M + je .L16 + + movsd -32 * SIZE(X), %xmm4 + cvtps2pd %xmm4, %xmm2 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + addl $2 * SIZE, X + ALIGN_3 + +.L16: + testl $1, M + je .L998 + + movss -32 * 
SIZE(X), %xmm4 + cvtss2sd %xmm4, %xmm2 + mulsd %xmm2, %xmm2 + addsd %xmm2, %xmm1 + jmp .L998 + ALIGN_4 + +.L40: + movl M, I + sarl $3, I + jle .L44 + ALIGN_4 + +.L41: + movss (X), %xmm4 + addl INCX, X + + cvtss2sd %xmm4, %xmm2 + mulsd %xmm2, %xmm2 + addsd %xmm2, %xmm0 + + movss (X), %xmm5 + addl INCX, X + + cvtss2sd %xmm5, %xmm2 + mulsd %xmm2, %xmm2 + addsd %xmm2, %xmm1 + + movss (X), %xmm6 + addl INCX, X + + cvtss2sd %xmm6, %xmm2 + mulsd %xmm2, %xmm2 + addsd %xmm2, %xmm0 + + movss (X), %xmm7 + addl INCX, X + + cvtss2sd %xmm7, %xmm2 + mulsd %xmm2, %xmm2 + addsd %xmm2, %xmm1 + + movss (X), %xmm4 + addl INCX, X + + cvtss2sd %xmm4, %xmm2 + mulsd %xmm2, %xmm2 + addsd %xmm2, %xmm0 + + movss (X), %xmm5 + addl INCX, X + + cvtss2sd %xmm5, %xmm2 + mulsd %xmm2, %xmm2 + addsd %xmm2, %xmm1 + + movss (X), %xmm6 + addl INCX, X + + cvtss2sd %xmm6, %xmm2 + mulsd %xmm2, %xmm2 + addsd %xmm2, %xmm0 + + movss (X), %xmm7 + addl INCX, X + + cvtss2sd %xmm7, %xmm2 + mulsd %xmm2, %xmm2 + addsd %xmm2, %xmm1 + + decl I + jg .L41 + ALIGN_3 + +.L44: + testl $4, M + je .L45 + + movss (X), %xmm4 + addl INCX, X + + cvtss2sd %xmm4, %xmm2 + mulsd %xmm2, %xmm2 + addsd %xmm2, %xmm0 + + movss (X), %xmm5 + addl INCX, X + + cvtss2sd %xmm5, %xmm2 + mulsd %xmm2, %xmm2 + addsd %xmm2, %xmm1 + + movss (X), %xmm6 + addl INCX, X + + cvtss2sd %xmm6, %xmm2 + mulsd %xmm2, %xmm2 + addsd %xmm2, %xmm0 + + movss (X), %xmm7 + addl INCX, X + + cvtss2sd %xmm7, %xmm2 + mulsd %xmm2, %xmm2 + addsd %xmm2, %xmm1 + ALIGN_3 + +.L45: + testl $2, M + je .L46 + + movss (X), %xmm4 + addl INCX, X + + cvtss2sd %xmm4, %xmm2 + mulsd %xmm2, %xmm2 + addsd %xmm2, %xmm0 + + movss (X), %xmm5 + addl INCX, X + + cvtss2sd %xmm5, %xmm2 + mulsd %xmm2, %xmm2 + addsd %xmm2, %xmm1 + ALIGN_3 + +.L46: + testl $1, M + je .L998 + + movss (X), %xmm4 + cvtss2sd %xmm4, %xmm2 + mulsd %xmm2, %xmm2 + addsd %xmm2, %xmm0 + ALIGN_4 + +.L998: + addpd %xmm1, %xmm0 + +#ifndef HAVE_SSE3 + movapd %xmm0, %xmm1 + unpckhpd %xmm0, %xmm0 + addsd %xmm1, %xmm0 +#else + haddpd %xmm0, %xmm0 +#endif + ALIGN_4 + +.L999: + sqrtsd %xmm0, %xmm0 + + cvtsd2ss %xmm0, %xmm0 + + movss %xmm0, STACK_M + flds STACK_M + + popl %ebx + popl %esi + ret + + EPILOGUE diff --git a/kernel/x86/qaxpy.S b/kernel/x86/qaxpy.S new file mode 100644 index 0000000000..0497ea323b --- /dev/null +++ b/kernel/x86/qaxpy.S @@ -0,0 +1,254 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_ALPHA 16 + STACK + ARGS(%esp) +#define STACK_X 32 + STACK + ARGS(%esp) +#define STACK_INCX 36 + STACK + ARGS(%esp) +#define STACK_Y 40 + STACK + ARGS(%esp) +#define STACK_INCY 44 + STACK + ARGS(%esp) + +#define M %ebx +#define X %esi +#define INCX %ecx +#define Y %edi +#define INCY %edx + + + PROLOGUE + + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + FLD STACK_ALPHA + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + + sall $BASE_SHIFT, INCX + sall $BASE_SHIFT, INCY + + testl M, M + jle .L40 + + cmpl $SIZE, INCX + jne .L14 + cmpl $SIZE, INCY + jne .L14 + + movl M, %eax + sarl $3, %eax + jle .L15 + ALIGN_3 + +#define PRESIZE 33 + +.L16: +#ifdef HAS_PREFETCH + prefetcht0 PRESIZE * SIZE(X) +#endif + + FLD 0 * SIZE(X) + fmul %st(1),%st + FLD 0 * SIZE(Y) + faddp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 1 * SIZE(X) + fmul %st(1),%st + FLD 1 * SIZE(Y) + faddp %st, %st(1) + FST 1 * SIZE(Y) + + FLD 2 * SIZE(X) + fmul %st(1),%st + FLD 2 * SIZE(Y) + faddp %st, %st(1) + FST 2 * SIZE(Y) + + FLD 3 * SIZE(X) + fmul %st(1),%st + FLD 3 * SIZE(Y) + faddp %st, %st(1) + FST 3 * SIZE(Y) + +#ifdef HAS_PREFETCH + prefetcht0 (4 + PRESIZE) * SIZE(X) +#endif + + FLD 4 * SIZE(X) + fmul %st(1),%st + FLD 4 * SIZE(Y) + faddp %st, %st(1) + FST 4 * SIZE(Y) + + FLD 5 * SIZE(X) + fmul %st(1),%st + FLD 5 * SIZE(Y) + faddp %st, %st(1) + FST 5 * SIZE(Y) + + FLD 6 * SIZE(X) + fmul %st(1),%st + FLD 6 * SIZE(Y) + faddp %st, %st(1) + FST 6 * SIZE(Y) + + FLD 7 * SIZE(X) + fmul %st(1),%st + FLD 7 * SIZE(Y) + faddp %st, %st(1) + FST 7 * SIZE(Y) + +#ifdef HAVE_3DNOW + prefetchw 24 * SIZE(Y) +#endif + + addl $8 * SIZE, X + addl $8 * SIZE, Y + decl %eax + jg .L16 + ALIGN_3 + +.L15: + movl M, %eax + andl $7, %eax + jle .L40 + ALIGN_3 + +.L22: + FLD 0 * SIZE(X) + fmul %st(1),%st + FLD 0 * SIZE(Y) + faddp %st, %st(1) + FST 0 * SIZE(Y) + addl $SIZE, X + addl $SIZE, Y + decl %eax + jg .L22 + jmp .L40 + ALIGN_3 + +.L14: + movl M, %eax + sarl $2, %eax + jle .L28 + ALIGN_3 + +.L29: + FLD (X) + fmul %st(1),%st + FLD (Y) + faddp %st, %st(1) + FST (Y) + addl INCX, X + addl INCY, Y + + FLD (X) + fmul %st(1),%st + FLD (Y) + faddp %st, %st(1) + FST (Y) + addl INCX, X + addl INCY, Y + + FLD (X) + fmul %st(1),%st + FLD (Y) + faddp %st, %st(1) + FST (Y) + addl INCX, X + addl INCY, Y + + FLD (X) + fmul %st(1),%st + FLD (Y) + faddp %st, %st(1) + FST (Y) + addl INCX, X + addl 
INCY, Y + + decl %eax + jg .L29 + ALIGN_3 + +.L28: + movl M, %eax + andl $3, %eax + jle .L40 + ALIGN_3 + +.L35: + FLD (X) + fmul %st(1),%st + FLD (Y) + faddp %st, %st(1) + FST (Y) + addl INCX, X + addl INCY, Y + + decl %eax + jg .L35 + ALIGN_3 + +.L40: + ffreep %st(0) + xorl %eax,%eax + + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86/qconjg.S b/kernel/x86/qconjg.S new file mode 100644 index 0000000000..3b40e0cb35 --- /dev/null +++ b/kernel/x86/qconjg.S @@ -0,0 +1,60 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + + PROLOGUE + PROFCODE + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + movl 4(%esp), %eax + movl 8(%esp), %ecx + fldz + FLD 1 * SIZE(%ecx) + fsubrp %st, %st(1) + FLD 0 * SIZE(%ecx) + + FST 0 * SIZE(%eax) + FST 1 * SIZE(%eax) + ret + + EPILOGUE diff --git a/kernel/x86/qdot.S b/kernel/x86/qdot.S new file mode 100644 index 0000000000..ce5ff29f1e --- /dev/null +++ b/kernel/x86/qdot.S @@ -0,0 +1,229 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define STACK_N 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) +#define STACK_Y 16 + STACK + ARGS(%esp) +#define STACK_INCY 20 + STACK + ARGS(%esp) + +#define N %ebx +#define X %esi +#define INCX %ecx +#define Y %edi +#define INCY %edx + + + PROLOGUE + + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + movl STACK_N, N + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + +#ifdef F_INTERFACE + movl (N),N + movl (INCX),INCX + movl (INCY),INCY +#endif + + sall $BASE_SHIFT, INCX + sall $BASE_SHIFT, INCY + + fldz + fldz + fldz + fldz + + cmpl $SIZE, INCX + jne .L14 + cmpl $SIZE, INCY + jne .L14 + + movl N, %eax + sarl $2, %eax + jle .L15 + ALIGN_3 + +.L16: + FLD 0 * SIZE(X) + FLD 0 * SIZE(Y) + fmulp %st, %st(1) + faddp %st,%st(1) + FLD 1 * SIZE(X) + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st,%st(2) + FLD 2 * SIZE(X) + FLD 2 * SIZE(Y) + fmulp %st, %st(1) + faddp %st,%st(3) + FLD 3 * SIZE(X) + FLD 3 * SIZE(Y) + fmulp %st, %st(1) + faddp %st,%st(4) + addl $4 * SIZE, X + addl $4 * SIZE, Y + decl %eax + jg .L16 + ALIGN_3 + +.L15: + movl N, %eax + andl $3, %eax + jle .L27 + ALIGN_3 + +.L22: + FLD (X) + addl $SIZE, X + FLD (Y) + fmulp %st, %st(1) + addl $SIZE, Y + faddp %st,%st(1) + decl %eax + jg .L22 + + jmp .L27 + ALIGN_3 + +.L14: +#ifdef F_INTERFACE + testl INCX, INCX + jge .L28 + + movl N, %eax + decl %eax + imull INCX, %eax + subl %eax, X + ALIGN_3 + +.L28: + testl INCY, INCY + jge .L29 + + movl N, %eax + decl %eax + imull INCY, %eax + subl %eax, Y + ALIGN_3 +.L29: +#endif + movl N, %eax + sarl $2, %eax + jle .L30 + ALIGN_3 + +.L31: + FLD (X) + addl INCX, X + FLD (Y) + fmulp %st, %st(1) + addl INCY, Y + faddp %st,%st(1) + + FLD (X) + addl INCX, X + FLD (Y) + fmulp %st, %st(1) + addl INCY, Y + faddp %st,%st(2) + + FLD (X) + addl INCX, X + FLD (Y) + fmulp %st, %st(1) + addl INCY, Y + faddp %st,%st(3) + + FLD (X) + addl INCX, X + FLD (Y) 
+ fmulp %st, %st(1) + addl INCY, Y + faddp %st,%st(4) + + decl %eax + jg .L31 + ALIGN_3 + +.L30: + movl N, %eax + andl $3, %eax + jle .L27 + ALIGN_3 + +.L37: + FLD (X) + addl INCX, X + FLD (Y) + fmulp %st, %st(1) + addl INCY, Y + faddp %st, %st(1) + decl %eax + jg .L37 + ALIGN_3 + +.L27: + faddp %st,%st(2) + faddp %st,%st(2) + faddp %st,%st(1) + + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86/qgemm_kernel_2x2.S b/kernel/x86/qgemm_kernel_2x2.S new file mode 100644 index 0000000000..a2852f2e15 --- /dev/null +++ b/kernel/x86/qgemm_kernel_2x2.S @@ -0,0 +1,810 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if defined(OPTERON) || defined(BARCELONA) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#else +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#define PREFETCHSIZE (5 + 4 * 10) +#define STACK 16 +#define ARGS 16 + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#define A 32 + STACK + ARGS(%esp) +#define ARG_B 36 + STACK + ARGS(%esp) +#define C 40 + STACK + ARGS(%esp) +#define ARG_LDC 44 + STACK + ARGS(%esp) +#define OFFSET 48 + STACK + ARGS(%esp) + +#define I %esi +#define B %ebx +#define CO %edi +#define AO %edx +#define BO %ecx +#define LDC %ebp + +#define PREFETCH_OFFSET 48 + + PROLOGUE + + subl $ARGS, %esp # Generate Stack Frame + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(TRMMKERNEL) && !defined(LEFT) + movl OFFSET, %eax + negl %eax + movl %eax, KK +#endif + + movl ARG_LDC, LDC + movl ARG_B, B + + addl $8 * SIZE, A + addl $8 * SIZE, B + + sall $BASE_SHIFT, LDC + + movl N, %eax + sarl $1, %eax + movl %eax, J + je .L30 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl A, AO + + movl C, CO + lea (, LDC, 2), %eax + addl %eax, C + + movl M, I + sarl $1, I + je .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl B, BO +#else + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (AO, %eax, 2), AO + leal (B, %eax, 2), BO +#endif + + fldz + fldz + fldz + fldz + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(CO) + prefetchw 2 * SIZE(CO, LDC, 1) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(CO) + prefetchnta 2 * SIZE(CO, LDC, 1) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $2, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(3) + faddp %st, %st(3) + + FLD -6 * SIZE(AO) + + FLD -6 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -5 * SIZE(BO) + fmul %st, %st(2) + + FLD -5 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(3) + faddp %st, %st(3) + + PREFETCH (PREFETCHSIZE + 4) * SIZE(AO) + + FLD -4 * SIZE(AO) + + FLD -4 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -3 * SIZE(BO) + fmul %st, %st(2) + + FLD -3 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(3) + faddp %st, %st(3) + + FLD -2 * SIZE(AO) + + FLD -2 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -1 * SIZE(BO) + fmul %st, %st(2) + + FLD -1 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(3) + faddp %st, %st(3) + + addl $8 * SIZE,AO + addl $8 * SIZE,BO + + decl %eax + jne .L12 + ALIGN_4 + +.L15: 
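Before the K-remainder code below, a note on the loop that just ended: .L12 is the heart of qgemm_kernel_2x2.S. Each pass performs four rank-1 updates of a 2x2 accumulator held entirely on the x87 stack, streaming a packed 2xK slice of A against a packed Kx2 slice of B (the -8 * SIZE addressing works because both base pointers were advanced by 8 * SIZE in the prologue). Roughly, in C — plain-GEMM path only, TRMM offset bookkeeping omitted, invented names, and double standing in for the file's extended precision:

    /* Illustrative 2x2 micro-kernel: C(2x2) += alpha * A_panel(2xK) * B_panel(Kx2).
     * a is a packed 2xK panel, b a packed Kx2 panel, c the destination tile
     * with leading dimension ldc. */
    static void qgemm_2x2_sketch(long k, double alpha,
                                 const double *a, const double *b,
                                 double *c, long ldc)
    {
        double c00 = 0.0, c10 = 0.0, c01 = 0.0, c11 = 0.0;

        for (long p = 0; p < k; p++) {       /* one rank-1 update per k step */
            double a0 = a[2 * p], a1 = a[2 * p + 1];
            double b0 = b[2 * p], b1 = b[2 * p + 1];
            c00 += a0 * b0;  c10 += a1 * b0;
            c01 += a0 * b1;  c11 += a1 * b1;
        }

        c[0 + 0 * ldc] += alpha * c00;       /* mirrors the .L18 epilogue */
        c[1 + 0 * ldc] += alpha * c10;
        c[0 + 1 * ldc] += alpha * c01;
        c[1 + 1 * ldc] += alpha * c11;
    }

The .L16 loop that follows does the same update for the K mod 4 remainder, one k step at a time, and .L18 then applies alpha and writes the 2x2 tile back to C (accumulating in the plain-GEMM case, storing directly in the TRMM case).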
+#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + and $3, %eax + je .L18 + ALIGN_4 + +.L16: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(3) + faddp %st, %st(3) + + addl $2 * SIZE,AO + addl $2 * SIZE,BO + + decl %eax + jne .L16 + ALIGN_4 + +.L18: +#ifndef TRMMKERNEL + FLD ALPHA + + fmul %st, %st(1) + fmul %st, %st(2) + fmul %st, %st(3) + fmulp %st, %st(4) + + FLD 0 * SIZE(CO) + faddp %st, %st(1) + FST 0 * SIZE(CO) + + FLD 1 * SIZE(CO) + faddp %st, %st(1) + FST 1 * SIZE(CO) + + FLD 0 * SIZE(CO, LDC) + faddp %st, %st(1) + FST 0 * SIZE(CO, LDC) + + FLD 1 * SIZE(CO, LDC) + faddp %st, %st(1) + FST 1 * SIZE(CO, LDC) +#else + FST 0 * SIZE(CO) + FST 1 * SIZE(CO) + FST 0 * SIZE(CO, LDC) + FST 1 * SIZE(CO, LDC) +#endif + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + sall $BASE_SHIFT, %eax + leal (AO, %eax, 2), AO + leal (BO, %eax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, CO + decl I + jne .L11 + ALIGN_4 + +.L20: + movl M, %eax + andl $1, %eax + je .L29 + ALIGN_4 + +.L21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl B, BO +#else + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (AO, %eax, 1), AO + leal ( B, %eax, 2), BO +#endif + + fldz + fldz + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $2, %eax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + + FLD -6 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -5 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -6 * SIZE(AO) + + FLD -4 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -3 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -5 * SIZE(AO) + + FLD -2 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -1 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addl $4 * SIZE,AO + addl $8 * SIZE,BO + + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + and $3, %eax + je .L28 + ALIGN_4 + +.L26: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addl $1 * SIZE,AO + addl $2 * SIZE,BO + + decl %eax + jne .L26 + ALIGN_4 + +.L28: +#ifndef TRMMKERNEL + FLD ALPHA + + fmul %st, %st(1) + fmulp %st, %st(2) + + FLD 0 * SIZE(CO) + faddp %st, %st(1) + FST 0 * SIZE(CO) + + FLD 0 * SIZE(CO, LDC) + faddp %st, %st(1) + FST 0 * SIZE(CO, LDC) +#else + FST 0 * SIZE(CO) + FST 0 * SIZE(CO, LDC) +#endif + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + sall $BASE_SHIFT, %eax + leal (AO, %eax, 1), AO + leal (BO, %eax, 2), BO +#endif + +#if 
defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + + addl $1 * SIZE, CO + ALIGN_4 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + movl BO, B + decl J + jne .L01 + ALIGN_4 + +.L30: + movl N, %eax + testl $1, %eax + je .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl A, AO + + movl C, CO + addl LDC, C + + movl M, I + sarl $1, I + je .L40 + ALIGN_4 + +.L31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl B, BO +#else + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (AO, %eax, 2), AO + leal ( B, %eax, 1), BO +#endif + + fldz + fldz + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(CO) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(CO) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $2, %eax + je .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(BO) + FLD -8 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + FLD -6 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -5 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -6 * SIZE(BO) + FLD -4 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -3 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -5 * SIZE(BO) + FLD -2 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -1 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addl $8 * SIZE,AO + addl $4 * SIZE,BO + + decl %eax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + and $3, %eax + je .L38 + ALIGN_4 + +.L36: + FLD -8 * SIZE(BO) + + FLD -8 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addl $2 * SIZE,AO + addl $1 * SIZE,BO + + decl %eax + jne .L36 + ALIGN_4 + +.L38: +#ifndef TRMMKERNEL + FLD ALPHA + + fmul %st, %st(1) + fmulp %st, %st(2) + + FLD 0 * SIZE(CO) + faddp %st, %st(1) + FST 0 * SIZE(CO) + + FLD 1 * SIZE(CO) + faddp %st, %st(1) + FST 1 * SIZE(CO) +#else + FST 0 * SIZE(CO) + FST 1 * SIZE(CO) +#endif + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + sall $BASE_SHIFT, %eax + leal (AO, %eax, 2), AO + leal (BO, %eax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $2 * SIZE, CO + decl I + jne .L31 + ALIGN_4 + +.L40: + movl M, %eax + andl $1, %eax + je .L49 + ALIGN_4 + +.L41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl B, BO +#else + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (AO, %eax, 1), AO + leal ( B, %eax, 1), BO +#endif + + fldz + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $2, %eax + je .L45 + ALIGN_4 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 
* SIZE(AO) + FLD -8 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -7 * SIZE(AO) + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -6 * SIZE(AO) + FLD -6 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -5 * SIZE(AO) + FLD -5 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + addl $4 * SIZE,AO + addl $4 * SIZE,BO + + decl %eax + jne .L42 + ALIGN_4 + +.L45: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + and $3, %eax + je .L48 + ALIGN_4 + +.L46: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + addl $1 * SIZE,AO + addl $1 * SIZE,BO + + decl %eax + jne .L46 + ALIGN_4 + +.L48: +#ifndef TRMMKERNEL + FLD ALPHA + + fmulp %st, %st(1) + + FLD 0 * SIZE(CO) + faddp %st, %st(1) + FST 0 * SIZE(CO) +#else + FST 0 * SIZE(CO) +#endif + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + sall $BASE_SHIFT, %eax + leal (AO, %eax, 1), AO + leal (BO, %eax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + + addl $1 * SIZE, CO + ALIGN_4 + +.L49: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $1, KK +#endif + + movl BO, B + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/qgemv_n.S b/kernel/x86/qgemv_n.S new file mode 100644 index 0000000000..8424232607 --- /dev/null +++ b/kernel/x86/qgemv_n.S @@ -0,0 +1,477 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef PENTIUM +#define P 32 +#endif + +#if defined(ATHLON) || defined(OPTERON) +#define P 32 +#endif + +#ifndef P +#define P DTB_ENTRIES +#endif + +#define STACK 16 +#define ARGS 16 + +#define PLDA_M 0 + STACK(%esp) +#define XP 4 + STACK(%esp) +#define MIN_N 8 + STACK(%esp) +#define IS 12 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) + +#define A 32 + STACK + ARGS(%esp) +#define LDA 36 + STACK + ARGS(%esp) +#define X 40 + STACK + ARGS(%esp) +#define INCX 44 + STACK + ARGS(%esp) +#define Y 48 + STACK + ARGS(%esp) +#define INCY 52 + STACK + ARGS(%esp) +#define BUFFER 56 + STACK + ARGS(%esp) + + + PROLOGUE + + subl $ARGS, %esp + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + FLD ALPHA + movl X, %edi + + movl LDA, %ebx + sall $BASE_SHIFT, %ebx + + movl $0, IS + movl M, %edx + movl N, %esi + + test %esi, %esi + jle .L79 # goto END + test %edx, %edx + jle .L79 # goto END + + movl INCY, %eax + sall $BASE_SHIFT, %eax + movl %eax, INCY + + movl LDA, %eax + imull $P, %eax # P * lda + subl M ,%eax # P * lda - m + sall $BASE_SHIFT, %eax + movl %eax, PLDA_M + ALIGN_2 + +.L32: + movl IS, %esi + movl $P, %edx + movl N, %eax + subl %esi,%eax # n - is + cmpl %edx, %eax +#ifdef PENTIUM + jle .L33 + movl %edx, %eax +.L33: +#else + cmovg %edx, %eax +#endif + + movl %eax, MIN_N + movl INCX, %edx + + sall $BASE_SHIFT, %esi + leal (%edi, %esi, 1), %esi + + movl %esi, XP + cmpl $1, %edx + je .L34 # if incx == 1 goto L34 + + movl BUFFER, %esi + sall $BASE_SHIFT, %edx + movl %esi, XP # xp = buffer + sarl $2,%eax + jle .L35 + ALIGN_2 + +.L36: + FLD (%edi) + addl %edx,%edi # x += incx + FLD (%edi) + addl %edx,%edi # x += incx + FLD (%edi) + addl %edx,%edi # x += incx + FLD (%edi) + addl %edx,%edi # x += incx + + FST 3 * SIZE(%esi) + FST 2 * SIZE(%esi) + FST 1 * SIZE(%esi) + FST 0 * SIZE(%esi) + + addl $4 * SIZE, %esi # xp += 4 + decl %eax + jg .L36 + ALIGN_3 + +.L35: + movl MIN_N, %eax + andl $3, %eax + jle .L34 + ALIGN_2 + +.L42: + FLD (%edi) + addl %edx, %edi + FST (%esi) + addl $SIZE, %esi + decl %eax + jg .L42 + ALIGN_3 + +/* Main Routine */ +.L34: + movl Y, %ecx # c_offset + movl M, %ebp + sarl $2, %ebp # j = (m >> 2) + jle .L47 + ALIGN_2 + +.L48: + movl A, %edx # a_offset = a + fldz + addl $4 * SIZE, A # a += 4 + fldz + movl XP, %esi # b_offset = xp + fldz + movl MIN_N, %eax # i = min_n + fldz + FLD (%esi) # bt1 = b_offset + sarl $1, %eax + jle .L51 + ALIGN_2 + +#ifdef PENTIUM3 +#define PRESIZE 8 +#else +#define PRESIZE 24 +#endif + +.L80: +#ifdef PENTIUM3 + prefetcht1 PRESIZE * SIZE(%edx, %ebx, 1) + FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) + fmul %st(1), %st # at1 *= bt1 + + prefetcht1 PRESIZE * SIZE(%esi) + faddp %st, %st(2) # ct1 += at1 + FLD 1 * SIZE(%edx) # at1 = *(a_offset + 1) + + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(3) # ct2 += at1 + FLD 2 * SIZE(%edx) # at1 = *(a_offset + 2) + + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + FLD 3 * SIZE(%edx) # bt1 *= *(a_offset + 3) + + fmulp %st, %st(1) + faddp %st, %st(4) # ct4 += at1 + FLD 1 * SIZE(%esi) # bt1 = b_offset + + prefetcht1 PRESIZE * SIZE(%edx, %ebx, 2) + addl %ebx, %edx # a_offset += lda + FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) + + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + FLD 1 * SIZE(%edx) # at1 = *(a_offset + 1) + + fmul %st(1), %st # at1 *= 
bt1 + faddp %st, %st(3) # ct2 += at1 + FLD 2 * SIZE(%edx) # at1 = *(a_offset + 2) + + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + FLD 3 * SIZE(%edx) # bt1 *= *(a_offset + 3) + + fmulp %st, %st(1) + addl %ebx, %edx + faddp %st, %st(4) # ct4 += at1 + + FLD 2 * SIZE(%esi) # bt1 = b_offset + addl $2 * SIZE, %esi # b_offset += 2 + +#else +#ifdef PENTIUM4 + prefetchnta 8 * SIZE(%esi) +#endif + FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + + FLD 1 * SIZE(%edx) # at1 = *(a_offset + 1) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(3) # ct2 += at1 + + FLD 2 * SIZE(%edx) # at1 = *(a_offset + 2) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + + FLD 3 * SIZE(%edx) # bt1 *= *(a_offset + 3) + fmulp %st, %st(1) + faddp %st, %st(4) # ct4 += at1 + FLD 1 * SIZE(%esi) # bt1 = b_offset + + addl %ebx, %edx # a_offset += lda + + FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + + FLD 1 * SIZE(%edx) # at1 = *(a_offset + 1) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(3) # ct2 += at1 + + FLD 2 * SIZE(%edx) # at1 = *(a_offset + 2) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + + FLD 3 * SIZE(%edx) # bt1 *= *(a_offset + 3) + fmulp %st, %st(1) + faddp %st, %st(4) # ct4 += at1 + FLD 2 * SIZE(%esi) # bt1 = b_offset + + addl %ebx, %edx + addl $2 * SIZE, %esi # b_offset += 2 +#endif + decl %eax + jg .L80 + +.L51: + movl MIN_N,%eax + andl $1, %eax + je .L57 + + FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + + FLD 1 * SIZE(%edx) # at1 = *(a_offset + 1) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(3) # ct2 += at1 + + FLD 2 * SIZE(%edx) # at1 = *(a_offset + 2) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + + FLD 3 * SIZE(%edx) # bt1 *= *(a_offset + 3) + fmulp %st, %st(1) + faddp %st, %st(4) # ct4 += at1 + fldz + ALIGN_2 + +.L57: + ffreep %st(0) + + fxch %st(4) + fmul %st, %st(4) + fmul %st, %st(1) + fmul %st, %st(2) + fmul %st, %st(3) + fxch %st(4) + + movl INCY, %eax + + FLD (%ecx) + faddp %st, %st(1) + FST (%ecx) + addl %eax, %ecx + + FLD (%ecx) + faddp %st, %st(1) + FST (%ecx) + addl %eax, %ecx + + FLD (%ecx) + faddp %st, %st(1) + FST (%ecx) + addl %eax, %ecx + + FLD (%ecx) + faddp %st, %st(1) + FST (%ecx) + addl %eax, %ecx + + decl %ebp # j -- + jg .L48 + ALIGN_3 + +.L47: + movl M, %ebp + andl $3, %ebp # j = (m & 3) + jle .L60 + ALIGN_2 + +.L61: + + movl A, %edx # a_offset = a + fldz + addl $SIZE, A # a++ + fldz + movl XP,%esi + fldz + movl MIN_N,%eax + fldz + sarl $3,%eax + jle .L64 + ALIGN_2 + +.L65: + FLD 0 * SIZE(%esi) + FLD (%edx) + fmulp %st, %st(1) + faddp %st, %st(1) + addl %ebx, %edx + + FLD 1 * SIZE(%esi) + FLD (%edx) + fmulp %st, %st(1) + faddp %st, %st(2) + addl %ebx ,%edx + + FLD 2 * SIZE(%esi) + FLD (%edx) + fmulp %st, %st(1) + faddp %st, %st(3) + addl %ebx, %edx + + FLD 3 * SIZE(%esi) + FLD (%edx) + fmulp %st, %st(1) + faddp %st, %st(4) + addl %ebx, %edx + + FLD 4 * SIZE(%esi) + FLD (%edx) + fmulp %st, %st(1) + faddp %st,%st(1) + addl %ebx, %edx + + FLD 5 * SIZE(%esi) + FLD (%edx) + fmulp %st, %st(1) + faddp %st, %st(2) + addl %ebx, %edx + + FLD 6 * SIZE(%esi) + FLD (%edx) + fmulp %st, %st(1) + faddp %st,%st(3) + addl %ebx, %edx + + FLD 7 * SIZE(%esi) + FLD (%edx) + fmulp %st, %st(1) + faddp %st,%st(4) + addl %ebx, %edx + + addl $8 * SIZE, %esi + decl %eax + jg .L65 + +.L64: + movl MIN_N,%eax + andl $7, %eax + jle .L70 + ALIGN_2 + 
+.L71: + FLD (%esi) + addl $SIZE, %esi # b_offset ++ + FLD (%edx) + fmulp %st, %st(1) + addl %ebx, %edx # a_offset += lda + faddp %st, %st(1) + decl %eax + jg .L71 + ALIGN_2 + +.L70: + faddp %st, %st(1) + faddp %st, %st(1) + faddp %st, %st(1) + + fmul %st(1), %st + movl INCY, %eax + FLD (%ecx) + faddp %st, %st(1) + FST (%ecx) + addl %eax, %ecx + decl %ebp + jg .L61 + +.L60: + movl PLDA_M, %esi + addl %esi, A # a += P * lda - m + addl $P, IS + movl N, %esi + cmpl %esi,IS + jl .L32 + +.L79: + ffreep %st(0) + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/qgemv_t.S b/kernel/x86/qgemv_t.S new file mode 100644 index 0000000000..ff2ba80c49 --- /dev/null +++ b/kernel/x86/qgemv_t.S @@ -0,0 +1,585 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef PENTIUM +#define P 88 +#endif + +#ifndef P +#define P 1000 +#endif + +#define STACK 16 +#define ARGS 24 + +#define NLDA 0 + STACK(%esp) +#define XP 4 + STACK(%esp) +#define MIN_M 8 + STACK(%esp) +#define J 12 + STACK(%esp) +#define IS 16 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) + +#define A 32 + STACK + ARGS(%esp) +#define LDA 36 + STACK + ARGS(%esp) +#define X 40 + STACK + ARGS(%esp) +#define INCX 44 + STACK + ARGS(%esp) +#define Y 48 + STACK + ARGS(%esp) +#define INCY 52 + STACK + ARGS(%esp) +#define BUFFER 56 + STACK + ARGS(%esp) + + + PROLOGUE + + subl $ARGS, %esp + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + FLD ALPHA + + movl X, %edi # X + + movl $0, IS + + movl M, %ebx + movl N, %eax + + testl %ebx, %ebx + jle .L79 + testl %eax, %eax + jle .L79 + + movl INCX, %esi + sall $BASE_SHIFT, %esi + movl %esi, INCX + + movl INCY, %esi + sall $BASE_SHIFT, %esi + movl %esi, INCY + + movl LDA, %ebx + + imull %ebx, %eax + movl $P, %esi + subl %eax, %esi + sall $BASE_SHIFT, %esi + movl %esi, NLDA + + movl %ebx, %esi + sall $BASE_SHIFT, %esi + movl %esi, LDA + ALIGN_2 + +.L32: + movl IS, %esi + + movl $P, %edx + movl M, %eax + subl %esi, %eax + cmpl %edx, %eax +#ifdef PENTIUM + jle .L33 + movl %edx, %eax +.L33: +#else + cmovg %edx, %eax +#endif + movl %eax, MIN_M + + movl IS, %ecx + sall $BASE_SHIFT, %ecx + leal (%edi,%ecx, 1), %ecx + movl INCX, %ebx + movl %ecx, XP + cmpl $SIZE, %ebx + je .L34 + + movl BUFFER, %esi + movl MIN_M, %ecx + movl %esi, XP + sarl $2, %ecx + jle .L35 + + ALIGN_3 + +.L36: + FLD (%edi) + addl %ebx, %edi + FST 0 * SIZE(%esi) + + FLD (%edi) + addl %ebx, %edi + FST 1 * SIZE(%esi) + + FLD (%edi) + addl %ebx, %edi + FST 2 * SIZE(%esi) + + FLD (%edi) + addl %ebx, %edi + FST 3 * SIZE(%esi) + + addl $4 * SIZE, %esi + decl %ecx + jg .L36 + ALIGN_3 + +.L35: + movl MIN_M, %ecx + andl $3,%ecx + jle .L34 + ALIGN_2 + +.L42: + FLD (%edi) + addl %ebx, %edi + FST (%esi) + addl $SIZE, %esi + decl %ecx + jg .L42 + ALIGN_3 + +/* Main Routine */ + +.L34: + movl Y, %ebp # coffset = y + + movl N, %esi + sarl $2, %esi + movl %esi, J + jle .L47 + ALIGN_3 + +.L48: + movl A, %ebx # a_offset = a + fldz + movl LDA, %edx + fldz + + leal (%ebx, %edx), %ecx # a_offset2 = a + lda + fldz + leal (%ebx, %edx, 4), %eax + fldz + + movl %eax, A + movl XP, %esi + FLD (%esi) + + movl MIN_M, %eax + sarl $2,%eax + jle .L51 + ALIGN_3 + +#define PRESIZE 8 + +.L80: +#ifdef PENTIUM3 + prefetcht0 PRESIZE * SIZE(%ebx, %edx, 2) + FLD 0 * SIZE(%ebx) # at = *(a_offset + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + + prefetcht0 PRESIZE * SIZE(%ecx) + faddp %st,%st(2) # ct1 += at1 + FLD 0 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda) + + prefetcht0 PRESIZE * SIZE(%ecx, %edx, 2) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(3) # ct2 += at1 + + prefetcht0 PRESIZE * SIZE(%ebx) + FLD 0 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda) + fmul %st(1),%st + + faddp %st,%st(4) + FLD 0 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda) + fmulp %st, %st(1) + + faddp %st,%st(4) + FLD 1 * SIZE(%esi) + FLD 1 * SIZE(%ebx) # at = *(a_offset + 0 * lda) + + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(2) # ct1 += at1 + FLD 1 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda) + + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(3) # ct2 += at1 + FLD 1 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 
2 * lda) + + fmul %st(1),%st + faddp %st,%st(4) + FLD 1 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda) + + fmulp %st, %st(1) + faddp %st,%st(4) + FLD 2 * SIZE(%esi) + + FLD 2 * SIZE(%ebx) # at = *(a_offset + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(2) # ct1 += at1 + + FLD 2 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(3) # ct2 += at1 + + FLD 2 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda) + fmul %st(1),%st + faddp %st,%st(4) + + FLD 2 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda) + fmulp %st, %st(1) + faddp %st,%st(4) + + FLD 3 * SIZE(%esi) + FLD 3 * SIZE(%ebx) # at = *(a_offset + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + + faddp %st,%st(2) # ct1 += at1 + FLD 3 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + + faddp %st,%st(3) # ct2 += at1 + FLD 3 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda) + fmul %st(1),%st + + faddp %st,%st(4) + FLD 3 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda) + fmulp %st, %st(1) + + addl $4 * SIZE, %ebx + faddp %st,%st(4) + addl $4 * SIZE, %ecx + + FLD 4 * SIZE(%esi) + addl $4 * SIZE, %esi + +#else + +#if defined(HAS_PREFETCH) + prefetcht0 PRESIZE * SIZE(%ebx) + prefetcht0 PRESIZE * SIZE(%ebx, %edx, 2) + prefetcht0 PRESIZE * SIZE(%ecx) + prefetcht0 PRESIZE * SIZE(%ecx, %edx, 2) +#endif + + FLD 0 * SIZE(%ebx) # at = *(a_offset + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(2) # ct1 += at1 + + FLD 0 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(3) # ct2 += at1 + + FLD 0 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda) + fmul %st(1),%st + faddp %st,%st(4) + + FLD 0 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda) + fmulp %st, %st(1) + faddp %st,%st(4) + FLD 1 * SIZE(%esi) + + FLD 1 * SIZE(%ebx) # at = *(a_offset + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(2) # ct1 += at1 + + FLD 1 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(3) # ct2 += at1 + + FLD 1 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda) + fmul %st(1),%st + faddp %st,%st(4) + + FLD 1 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda) + fmulp %st, %st(1) + faddp %st,%st(4) + FLD 2 * SIZE(%esi) + + FLD 2 * SIZE(%ebx) # at = *(a_offset + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(2) # ct1 += at1 + + FLD 2 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(3) # ct2 += at1 + + FLD 2 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda) + fmul %st(1),%st + faddp %st,%st(4) + + FLD 2 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda) + fmulp %st, %st(1) + faddp %st,%st(4) + FLD 3 * SIZE(%esi) + + FLD 3 * SIZE(%ebx) # at = *(a_offset + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(2) # ct1 += at1 + + FLD 3 * SIZE(%ecx) # at1 = *(a_offset2 + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(3) # ct2 += at1 + + FLD 3 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda) + fmul %st(1),%st + faddp %st,%st(4) + + FLD 3 * SIZE(%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda) + fmulp %st, %st(1) + faddp %st,%st(4) + FLD 4 * SIZE(%esi) + + addl $4 * SIZE, %ebx + addl $4 * SIZE, %ecx + addl $4 * SIZE, %esi +#endif + + decl %eax + jg .L80 + ALIGN_3 + +.L51: + movl MIN_M, %eax + andl $3, %eax + je .L81 + ALIGN_3 + +.L52: + + FLD (%ebx) # at = *(a_offset + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(2) # ct1 += at1 + + FLD (%ecx) # at1 = *(a_offset2 + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(3) # 
ct2 += at1 + + FLD (%ebx, %edx, 2) # at = *(a_offset + 2 * lda) + fmul %st(1),%st + faddp %st,%st(4) + + FLD (%ecx, %edx, 2) # at1 = *(a_offset2 + 2 * lda) + fmulp %st, %st(1) + faddp %st,%st(4) + FLD 1 * SIZE(%esi) + + addl $SIZE, %ebx + addl $SIZE, %ecx + addl $SIZE, %esi + decl %eax + jg .L52 + ALIGN_3 + +.L81: + ffreep %st(0) + + fxch %st(4) + fmul %st, %st(4) + fmul %st, %st(1) + fmul %st, %st(2) + fmul %st, %st(3) + fxch %st(4) + + movl INCY, %eax + + FLD (%ebp) + faddp %st, %st(1) + FST (%ebp) + addl %eax, %ebp + + FLD (%ebp) + faddp %st, %st(1) + FST (%ebp) + addl %eax, %ebp + + FLD (%ebp) + faddp %st, %st(1) + FST (%ebp) + addl %eax, %ebp + + FLD (%ebp) + faddp %st, %st(1) + FST (%ebp) + addl %eax, %ebp + + decl J + jg .L48 + ALIGN_3 + +.L47: + movl N, %esi + andl $3,%esi + movl %esi, J + jle .L60 + ALIGN_2 + +.L61: + movl A, %ebx # a_offset = a + fldz # ct1 = ZERO + movl LDA, %edx + fldz # ct1 = ZERO + + addl %ebx, %edx + fldz # ct1 = ZERO + movl %edx, A + fldz # ct1 = ZERO + + movl XP, %esi + + movl MIN_M, %eax + sarl $3,%eax + jle .L64 + ALIGN_3 + +.L65: +#ifdef HAS_PREFETCH + prefetcht0 PRESIZE * 2 * SIZE(%ebx) + prefetcht0 PRESIZE * 2 * SIZE(%ebx) +#endif + + FLD 0 * SIZE(%esi) + FLD 0 * SIZE(%ebx) + fmulp %st, %st(1) + faddp %st,%st(1) + + FLD 1 * SIZE(%esi) + FLD 1 * SIZE(%ebx) + fmulp %st, %st(1) + faddp %st,%st(2) + + FLD 2 * SIZE(%esi) + FLD 2 * SIZE(%ebx) + fmulp %st, %st(1) + faddp %st,%st(3) + + FLD 3 * SIZE(%esi) + FLD 3 * SIZE(%ebx) + fmulp %st, %st(1) + faddp %st,%st(4) + + FLD 4 * SIZE(%esi) + FLD 4 * SIZE(%ebx) + fmulp %st, %st(1) + faddp %st,%st(1) + + FLD 5 * SIZE(%esi) + FLD 5 * SIZE(%ebx) + fmulp %st, %st(1) + faddp %st,%st(2) + + FLD 6 * SIZE(%esi) + FLD 6 * SIZE(%ebx) + fmulp %st, %st(1) + faddp %st,%st(3) + + FLD 7 * SIZE(%esi) + FLD 7 * SIZE(%ebx) + fmulp %st, %st(1) + faddp %st,%st(4) + + addl $8 * SIZE, %esi + addl $8 * SIZE, %ebx + + decl %eax + jg .L65 + ALIGN_3 + +.L64: + movl MIN_M, %eax + andl $7, %eax + jle .L70 + ALIGN_3 + +.L71: + FLD (%esi) + FLD (%ebx) + fmulp %st, %st(1) + faddp %st,%st(1) + + addl $SIZE, %esi + addl $SIZE, %ebx + decl %eax + jg .L71 + ALIGN_3 + +.L70: + faddp %st, %st(1) + faddp %st, %st(1) + faddp %st, %st(1) + + fmul %st(1),%st + FLD (%ebp) + faddp %st, %st(1) + FST (%ebp) + addl INCY, %ebp + decl J + jg .L61 + ALIGN_3 + +.L60: + movl A, %ebx + addl NLDA, %ebx + movl %ebx, A + + addl $P, IS + movl M, %esi + cmpl %esi, IS + jl .L32 + ALIGN_3 + +.L79: + ffreep %st(0) + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/qtrsm_kernel_LN_2x2.S b/kernel/x86/qtrsm_kernel_LN_2x2.S new file mode 100644 index 0000000000..37c268b414 --- /dev/null +++ b/kernel/x86/qtrsm_kernel_LN_2x2.S @@ -0,0 +1,1231 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef OPTERON +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#else +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#define PREFETCHSIZE (5 + 4 * 10) +#define STACK 16 +#define ARGS 16 + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) +#define AORIG 12 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#define A 32 + STACK + ARGS(%esp) +#define ARG_B 36 + STACK + ARGS(%esp) +#define C 40 + STACK + ARGS(%esp) +#define ARG_LDC 44 + STACK + ARGS(%esp) +#define OFFSET 48 + STACK + ARGS(%esp) + +#define I %esi +#define B %ebx +#define CO %edi +#define AO %edx +#define BO %ecx +#define LDC %ebp + +#define PREFETCH_OFFSET 48 + + PROLOGUE + + subl $ARGS, %esp # Generate Stack Frame + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_LDC, LDC + movl ARG_B, B + sall $BASE_SHIFT, LDC + + addl $8 * SIZE, A + addl $8 * SIZE, B + + +#ifdef LN + movl M, %eax + sall $BASE_SHIFT, %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + sall $BASE_SHIFT, %eax + imull K, %eax + addl %eax, B + + movl N, %eax + imull %ebp, %eax + addl %eax, C +#endif + +#ifdef RN + movl OFFSET, %eax + negl %eax + movl %eax, KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + sarl $1, %eax + movl %eax, J + je .L30 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movl A, AO +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + + lea (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %eax + andl $1, %eax + je .L20 + ALIGN_4 + +.L21: +#ifdef LN + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + movl AORIG, AO + leal (AO, %eax, 1), AO + leal (B, %eax, 2), BO +#else + movl B, BO +#endif + + fldz + fldz + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + 
movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + + FLD -6 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -5 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -6 * SIZE(AO) + + FLD -4 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -3 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -5 * SIZE(AO) + + FLD -2 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -1 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addl $4 * SIZE,AO + addl $8 * SIZE,BO + + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + and $3, %eax + je .L28 + ALIGN_4 + +.L26: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addl $1 * SIZE,AO + addl $2 * SIZE,BO + + decl %eax + jne .L26 + ALIGN_4 + +.L28: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + sall $BASE_SHIFT, %eax + + movl AORIG, AO + leal (AO, %eax, 1), AO + leal (B, %eax, 2), BO +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(BO) + fsubp %st, %st(1) + FLD -7 * SIZE(BO) + fsubp %st, %st(2) +#else + FLD -8 * SIZE(AO) + fsubp %st, %st(1) + FLD -7 * SIZE(AO) + fsubp %st, %st(3) +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(AO) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef RN + FLD -8 * SIZE(BO) + fmulp %st, %st(1) + + FLD -7 * SIZE(BO) + fmul %st(1), %st + + fsubrp %st, %st(2) + + FLD -5 * SIZE(BO) + fmulp %st, %st(2) +#endif + +#ifdef RT + FLD -5 * SIZE(BO) + fmulp %st, %st(2) + + FLD -6 * SIZE(BO) + fmul %st(2), %st + + fsubrp %st, %st(1) + + FLD -8 * SIZE(BO) + fmulp %st, %st(1) +#endif + +#ifdef LN + subl $1 * SIZE, CO +#endif + +#if defined(LN) || defined(LT) + fld %st + FST -8 * SIZE(BO) + fxch %st(1) + fld %st + FST -7 * SIZE(BO) +#else + fld %st + FST -8 * SIZE(AO) + fxch %st(1) + fld %st + FST -7 * SIZE(AO) +#endif + + FST 0 * SIZE(CO, LDC) + FST 0 * SIZE(CO) + +#ifndef LN + addl $1 * SIZE, CO +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $BASE_SHIFT, %eax + leal (AO, %eax, 1), AO + leal (BO, %eax, 2), BO +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $0 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L20: + movl M, I + sarl $1, I + je .L29 + ALIGN_4 + +.L11: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + movl AORIG, AO + leal (AO, %eax, 2), AO + leal (B, %eax, 2), BO +#else + movl B, BO +#endif + + fldz + fldz + fldz + fldz + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(CO) + prefetchw 2 * SIZE(CO, LDC, 1) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(CO) + prefetchnta 2 * SIZE(CO, LDC, 1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + 
+ faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + FLD -6 * SIZE(AO) + + FLD -6 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -5 * SIZE(BO) + fmul %st, %st(2) + + FLD -5 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + PREFETCH (PREFETCHSIZE + 4) * SIZE(AO) + + FLD -4 * SIZE(AO) + + FLD -4 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -3 * SIZE(BO) + fmul %st, %st(2) + + FLD -3 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + FLD -2 * SIZE(AO) + + FLD -2 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -1 * SIZE(BO) + fmul %st, %st(2) + + FLD -1 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + addl $8 * SIZE,AO + addl $8 * SIZE,BO + + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + and $3, %eax + je .L18 + ALIGN_4 + +.L16: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + addl $2 * SIZE,AO + addl $2 * SIZE,BO + + decl %eax + jne .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + sall $BASE_SHIFT, %eax + + movl AORIG, AO + leal (AO, %eax, 2), AO + leal (B, %eax, 2), BO +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(BO) + fsubp %st, %st(1) + FLD -7 * SIZE(BO) + fsubp %st, %st(2) + FLD -6 * SIZE(BO) + fsubp %st, %st(3) + FLD -5 * SIZE(BO) + fsubp %st, %st(4) +#else + FLD -8 * SIZE(AO) + fsubp %st, %st(1) + FLD -7 * SIZE(AO) + fsubp %st, %st(3) + FLD -6 * SIZE(AO) + fsubp %st, %st(2) + FLD -5 * SIZE(AO) + fsubp %st, %st(4) +#endif + +#ifdef LN + FLD -5 * SIZE(AO) + fmul %st, %st(3) + fmulp %st, %st(4) + + FLD -6 * SIZE(AO) + fmul %st(3), %st + FLD -6 * SIZE(AO) + fmul %st(5), %st + + fsubrp %st, %st(3) + fsubrp %st, %st(1) + + FLD -8 * SIZE(AO) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef LT + FLD -8 * SIZE(AO) + fmul %st, %st(1) + fmulp %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st(1), %st + FLD -7 * SIZE(AO) + fmul %st(3), %st + + fsubrp %st, %st(5) + fsubrp %st, %st(3) + + FLD -5 * SIZE(AO) + fmul %st, %st(3) + fmulp %st, %st(4) +#endif + +#ifdef RN + FLD -8 * SIZE(BO) + fmul %st, %st(1) + fmulp %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st(1), %st + FLD -7 * SIZE(BO) + fmul %st(4), %st + + fsubrp %st, %st(5) + fsubrp %st, %st(2) + + FLD -5 * SIZE(BO) + fmul %st, %st(2) + fmulp %st, %st(4) +#endif + +#ifdef RT + FLD -5 * SIZE(BO) + fmul %st, %st(2) + fmulp %st, %st(4) + + FLD -6 * SIZE(BO) + fmul %st(2), %st + FLD -6 * SIZE(BO) + fmul %st(5), %st + + fsubrp %st, %st(4) + fsubrp %st, %st(1) + + FLD -8 * SIZE(BO) + fmul %st, %st(1) + fmulp %st, %st(3) +#endif + +#ifdef LN + subl $2 * SIZE, CO +#endif + +#if defined(LN) || defined(LT) + fld %st + FST -8 * SIZE(BO) + fxch %st(1) + fld %st + FST -7 * SIZE(BO) + fxch %st(2) + fld %st + FST -6 * SIZE(BO) + fxch %st(3) + fld %st + FST -5 * SIZE(BO) + + FST 1 * SIZE(CO, LDC) + FST 0 * SIZE(CO) + FST 0 * SIZE(CO, LDC) + FST 1 * SIZE(CO) +#else + fld %st + FST -8 * SIZE(AO) + fxch %st(2) + fld %st + FST -7 * SIZE(AO) + fxch %st(1) + fld %st + FST -6 * SIZE(AO) + fxch %st(3) 
+ fld %st + FST -5 * SIZE(AO) + + FST 1 * SIZE(CO, LDC) + FST 1 * SIZE(CO) + FST 0 * SIZE(CO) + FST 0 * SIZE(CO, LDC) +#endif + +#ifndef LN + addl $2 * SIZE, CO +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $BASE_SHIFT, %eax + leal (AO, %eax, 2), AO + leal (BO, %eax, 2), BO +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl I + jne .L11 + ALIGN_4 + +.L29: +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl BO, B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J + jne .L01 + ALIGN_4 + +.L30: + movl N, %eax + testl $1, %eax + je .L999 + +#if defined(LT) || defined(RN) + movl A, AO +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %eax + andl $1, %eax + je .L40 + ALIGN_4 + +.L41: +#ifdef LN + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + movl AORIG, AO + leal (AO, %eax, 1), AO + leal (B, %eax, 1), BO +#else + movl B, BO +#endif + + fldz + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L45 + ALIGN_4 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + FLD -8 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -7 * SIZE(AO) + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -6 * SIZE(AO) + FLD -6 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -5 * SIZE(AO) + FLD -5 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + addl $4 * SIZE,AO + addl $4 * SIZE,BO + + decl %eax + jne .L42 + ALIGN_4 + +.L45: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + and $3, %eax + je .L48 + ALIGN_4 + +.L46: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + addl $1 * SIZE,AO + addl $1 * SIZE,BO + + decl %eax + jne .L46 + ALIGN_4 + +.L48: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + sall $BASE_SHIFT, %eax + + movl AORIG, AO + leal (AO, %eax, 1), AO + leal (B, %eax, 1), BO +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(BO) + fsubp %st, %st(1) +#else + FLD -8 * SIZE(AO) + fsubp %st, %st(1) +#endif + +#ifdef LN + FLD -8 * SIZE(AO) + fmulp %st, %st(1) +#endif + +#ifdef LT + FLD -8 * SIZE(AO) + fmulp %st, %st(1) +#endif + +#ifdef RN + FLD -8 * SIZE(BO) + fmulp %st, %st(1) +#endif + +#ifdef RT + FLD -8 * SIZE(BO) + fmulp %st, %st(1) +#endif + +#ifdef LN + subl $1 * SIZE, CO +#endif + +#if defined(LN) || defined(LT) + fld %st + FST -8 * SIZE(BO) +#else + fld %st + FST -8 * SIZE(AO) +#endif + + FST 0 * SIZE(CO) + +#ifndef LN + addl $1 * SIZE, CO +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $BASE_SHIFT, %eax + leal (AO, %eax, 1), AO + leal (BO, %eax, 1), BO +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $0 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L40: + movl M, I + sarl 
$1, I + je .L49 + ALIGN_4 + +.L31: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + movl AORIG, AO + leal (AO, %eax, 2), AO + leal (B, %eax, 1), BO +#else + movl B, BO +#endif + + fldz + fldz + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(CO) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(CO) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(BO) + FLD -8 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + FLD -6 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -5 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -6 * SIZE(BO) + FLD -4 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -3 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -5 * SIZE(BO) + FLD -2 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -1 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addl $8 * SIZE,AO + addl $4 * SIZE,BO + + decl %eax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + and $3, %eax + je .L38 + ALIGN_4 + +.L36: + FLD -8 * SIZE(BO) + + FLD -8 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addl $2 * SIZE,AO + addl $1 * SIZE,BO + + decl %eax + jne .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + sall $BASE_SHIFT, %eax + + movl AORIG, AO + leal (AO, %eax, 2), AO + leal (B, %eax, 1), BO +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(BO) + fsubp %st, %st(1) + FLD -7 * SIZE(BO) + fsubp %st, %st(2) +#else + FLD -8 * SIZE(AO) + fsubp %st, %st(1) + FLD -7 * SIZE(AO) + fsubp %st, %st(3) +#endif + +#ifdef LN + FLD -5 * SIZE(AO) + fmulp %st, %st(2) + + FLD -6 * SIZE(AO) + fmul %st(2), %st + + fsubrp %st, %st(1) + FLD -8 * SIZE(AO) + fmulp %st, %st(1) +#endif + +#ifdef LT + FLD -8 * SIZE(AO) + fmulp %st, %st(1) + + FLD -7 * SIZE(AO) + fmul %st(1), %st + + fsubrp %st, %st(2) + + FLD -5 * SIZE(AO) + fmulp %st, %st(2) +#endif + +#ifdef RN + FLD -8 * SIZE(BO) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef RT + FLD -8 * SIZE(BO) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef LN + subl $2 * SIZE, CO +#endif + +#if defined(LN) || defined(LT) + fld %st + FST -8 * SIZE(BO) + fxch %st(1) + fld %st + FST -7 * SIZE(BO) +#else + fld %st + FST -8 * SIZE(AO) + fxch %st(1) + fld %st + FST -7 * SIZE(AO) +#endif + + FST 1 * SIZE(CO) + FST 0 * SIZE(CO) + +#ifndef LN + addl $2 * SIZE, CO +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $BASE_SHIFT, %eax + leal (AO, %eax, 2), AO + leal (BO, %eax, 1), BO +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl I + jne .L31 + ALIGN_4 + +.L49: +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + leal (B, %eax, 1), B +#endif + +#if defined(LT) || defined(RN) + movl BO, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/qtrsm_kernel_LT_2x2.S 
b/kernel/x86/qtrsm_kernel_LT_2x2.S new file mode 100644 index 0000000000..157e12d7f3 --- /dev/null +++ b/kernel/x86/qtrsm_kernel_LT_2x2.S @@ -0,0 +1,1229 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef OPTERON +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#else +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#define PREFETCHSIZE (5 + 4 * 10) +#define STACK 16 +#define ARGS 16 + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define AORIG 8 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#define A 32 + STACK + ARGS(%esp) +#define ARG_B 36 + STACK + ARGS(%esp) +#define C 40 + STACK + ARGS(%esp) +#define ARG_LDC 44 + STACK + ARGS(%esp) +#define OFFSET 48 + STACK + ARGS(%esp) + +#define I %esi +#define B %ebx +#define CO %edi +#define AO %edx +#define BO %ecx +#define LDC %ebp + +#define PREFETCH_OFFSET 48 + + PROLOGUE + + subl $ARGS, %esp # Generate Stack Frame + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_LDC, LDC + movl ARG_B, B + sall $BASE_SHIFT, LDC + + addl $8 * SIZE, A + addl $8 * SIZE, B + +#ifdef LN + movl M, %eax + sall $BASE_SHIFT, %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + sall $BASE_SHIFT, %eax + imull K, %eax + addl %eax, B + + movl N, %eax + imull %ebp, %eax + addl %eax, C +#endif + +#ifdef RN + movl OFFSET, %eax + negl %eax + movl %eax, KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + sarl $1, %eax + movl %eax, J + je .L30 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movl A, AO +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + + lea (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, I + sarl $1, I + je .L20 + ALIGN_4 + +.L11: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + movl AORIG, AO + leal (AO, %eax, 2), AO + leal (B, %eax, 2), BO +#else + movl B, BO +#endif + + fldz + fldz + fldz + fldz + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(CO) + prefetchw 2 * SIZE(CO, LDC, 1) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(CO) + prefetchnta 2 * SIZE(CO, LDC, 1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + FLD -6 * SIZE(AO) + + FLD -6 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -5 * SIZE(BO) + fmul %st, %st(2) + + FLD -5 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + PREFETCH (PREFETCHSIZE + 4) * SIZE(AO) + + FLD -4 * SIZE(AO) + + FLD -4 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -3 * SIZE(BO) + fmul %st, %st(2) + + FLD -3 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + FLD -2 * SIZE(AO) + + FLD 
-2 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -1 * SIZE(BO) + fmul %st, %st(2) + + FLD -1 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + addl $8 * SIZE,AO + addl $8 * SIZE,BO + + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + and $3, %eax + je .L18 + ALIGN_4 + +.L16: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + addl $2 * SIZE,AO + addl $2 * SIZE,BO + + decl %eax + jne .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + sall $BASE_SHIFT, %eax + + movl AORIG, AO + leal (AO, %eax, 2), AO + leal (B, %eax, 2), BO +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(BO) + fsubp %st, %st(1) + FLD -7 * SIZE(BO) + fsubp %st, %st(2) + FLD -6 * SIZE(BO) + fsubp %st, %st(3) + FLD -5 * SIZE(BO) + fsubp %st, %st(4) +#else + FLD -8 * SIZE(AO) + fsubp %st, %st(1) + FLD -7 * SIZE(AO) + fsubp %st, %st(3) + FLD -6 * SIZE(AO) + fsubp %st, %st(2) + FLD -5 * SIZE(AO) + fsubp %st, %st(4) +#endif + +#ifdef LN + FLD -5 * SIZE(AO) + fmul %st, %st(3) + fmulp %st, %st(4) + + FLD -6 * SIZE(AO) + fmul %st(3), %st + FLD -6 * SIZE(AO) + fmul %st(5), %st + + fsubrp %st, %st(3) + fsubrp %st, %st(1) + + FLD -8 * SIZE(AO) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef LT + FLD -8 * SIZE(AO) + fmul %st, %st(1) + fmulp %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st(1), %st + FLD -7 * SIZE(AO) + fmul %st(3), %st + + fsubrp %st, %st(5) + fsubrp %st, %st(3) + + FLD -5 * SIZE(AO) + fmul %st, %st(3) + fmulp %st, %st(4) +#endif + +#ifdef RN + FLD -8 * SIZE(BO) + fmul %st, %st(1) + fmulp %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st(1), %st + FLD -7 * SIZE(BO) + fmul %st(4), %st + + fsubrp %st, %st(5) + fsubrp %st, %st(2) + + FLD -5 * SIZE(BO) + fmul %st, %st(2) + fmulp %st, %st(4) +#endif + +#ifdef RT + FLD -5 * SIZE(BO) + fmul %st, %st(2) + fmulp %st, %st(4) + + FLD -6 * SIZE(BO) + fmul %st(2), %st + FLD -6 * SIZE(BO) + fmul %st(5), %st + + fsubrp %st, %st(4) + fsubrp %st, %st(1) + + FLD -8 * SIZE(BO) + fmul %st, %st(1) + fmulp %st, %st(3) +#endif + +#ifdef LN + subl $2 * SIZE, CO +#endif + +#if defined(LN) || defined(LT) + fld %st + FST -8 * SIZE(BO) + fxch %st(1) + fld %st + FST -7 * SIZE(BO) + fxch %st(2) + fld %st + FST -6 * SIZE(BO) + fxch %st(3) + fld %st + FST -5 * SIZE(BO) + + FST 1 * SIZE(CO, LDC) + FST 0 * SIZE(CO) + FST 0 * SIZE(CO, LDC) + FST 1 * SIZE(CO) +#else + fld %st + FST -8 * SIZE(AO) + fxch %st(2) + fld %st + FST -7 * SIZE(AO) + fxch %st(1) + fld %st + FST -6 * SIZE(AO) + fxch %st(3) + fld %st + FST -5 * SIZE(AO) + + FST 1 * SIZE(CO, LDC) + FST 1 * SIZE(CO) + FST 0 * SIZE(CO) + FST 0 * SIZE(CO, LDC) +#endif + +#ifndef LN + addl $2 * SIZE, CO +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $BASE_SHIFT, %eax + leal (AO, %eax, 2), AO + leal (BO, %eax, 2), BO +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl I + jne .L11 + ALIGN_4 + +.L20: + movl M, %eax + andl $1, %eax + je .L29 + ALIGN_4 + +.L21: +#ifdef LN + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif 
+ +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + movl AORIG, AO + leal (AO, %eax, 1), AO + leal (B, %eax, 2), BO +#else + movl B, BO +#endif + + fldz + fldz + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + + FLD -6 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -5 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -6 * SIZE(AO) + + FLD -4 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -3 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -5 * SIZE(AO) + + FLD -2 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -1 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addl $4 * SIZE,AO + addl $8 * SIZE,BO + + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + and $3, %eax + je .L28 + ALIGN_4 + +.L26: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addl $1 * SIZE,AO + addl $2 * SIZE,BO + + decl %eax + jne .L26 + ALIGN_4 + +.L28: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + sall $BASE_SHIFT, %eax + + movl AORIG, AO + leal (AO, %eax, 1), AO + leal (B, %eax, 2), BO +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(BO) + fsubp %st, %st(1) + FLD -7 * SIZE(BO) + fsubp %st, %st(2) +#else + FLD -8 * SIZE(AO) + fsubp %st, %st(1) + FLD -7 * SIZE(AO) + fsubp %st, %st(2) +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(AO) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef RN + FLD -8 * SIZE(BO) + fmulp %st, %st(1) + + FLD -7 * SIZE(BO) + fmul %st(1), %st + + fsubrp %st, %st(2) + + FLD -5 * SIZE(BO) + fmulp %st, %st(2) +#endif + +#ifdef RT + FLD -5 * SIZE(BO) + fmulp %st, %st(2) + + FLD -6 * SIZE(BO) + fmul %st(2), %st + + fsubrp %st, %st(1) + + FLD -8 * SIZE(BO) + fmulp %st, %st(1) +#endif + +#ifdef LN + subl $1 * SIZE, CO +#endif + +#if defined(LN) || defined(LT) + fld %st + FST -8 * SIZE(BO) + fxch %st(1) + fld %st + FST -7 * SIZE(BO) +#else + fld %st + FST -8 * SIZE(AO) + fxch %st(1) + fld %st + FST -7 * SIZE(AO) +#endif + + FST 0 * SIZE(CO, LDC) + FST 0 * SIZE(CO) + +#ifndef LN + addl $1 * SIZE, CO +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $BASE_SHIFT, %eax + leal (AO, %eax, 1), AO + leal (BO, %eax, 2), BO +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $0 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L29: +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl BO, B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J + jne .L01 + ALIGN_4 + +.L30: + movl N, %eax + testl $1, %eax + je .L999 + +#if defined(LT) || defined(RN) + movl A, AO +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO +#ifndef RT + addl LDC, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax 
+ movl %eax, KK +#endif + + movl M, I + sarl $1, I + je .L40 + ALIGN_4 + +.L31: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + movl AORIG, AO + leal (AO, %eax, 2), AO + leal (B, %eax, 1), BO +#else + movl B, BO +#endif + + fldz + fldz + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(CO) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(CO) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(BO) + FLD -8 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + FLD -6 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -5 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -6 * SIZE(BO) + FLD -4 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -3 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -5 * SIZE(BO) + FLD -2 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -1 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addl $8 * SIZE,AO + addl $4 * SIZE,BO + + decl %eax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + and $3, %eax + je .L38 + ALIGN_4 + +.L36: + FLD -8 * SIZE(BO) + + FLD -8 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addl $2 * SIZE,AO + addl $1 * SIZE,BO + + decl %eax + jne .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + sall $BASE_SHIFT, %eax + + movl AORIG, AO + leal (AO, %eax, 2), AO + leal (B, %eax, 1), BO +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(BO) + fsubp %st, %st(1) + FLD -7 * SIZE(BO) + fsubp %st, %st(2) +#else + FLD -8 * SIZE(AO) + fsubp %st, %st(1) + FLD -7 * SIZE(AO) + fsubp %st, %st(2) +#endif + +#ifdef LN + FLD -5 * SIZE(AO) + fmulp %st, %st(2) + + FLD -6 * SIZE(AO) + fmul %st(2), %st + + fsubrp %st, %st(1) + FLD -8 * SIZE(AO) + fmulp %st, %st(1) +#endif + +#ifdef LT + FLD -8 * SIZE(AO) + fmulp %st, %st(1) + + FLD -7 * SIZE(AO) + fmul %st(1), %st + + fsubrp %st, %st(2) + + FLD -5 * SIZE(AO) + fmulp %st, %st(2) +#endif + +#ifdef RN + FLD -8 * SIZE(BO) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef RT + FLD -8 * SIZE(BO) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef LN + subl $2 * SIZE, CO +#endif + +#if defined(LN) || defined(LT) + fld %st + FST -8 * SIZE(BO) + fxch %st(1) + fld %st + FST -7 * SIZE(BO) +#else + fld %st + FST -8 * SIZE(AO) + fxch %st(1) + fld %st + FST -7 * SIZE(AO) +#endif + + FST 1 * SIZE(CO) + FST 0 * SIZE(CO) + +#ifndef LN + addl $2 * SIZE, CO +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $BASE_SHIFT, %eax + leal (AO, %eax, 2), AO + leal (BO, %eax, 1), BO +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl I + jne .L31 + ALIGN_4 + +.L40: + movl M, %eax + andl $1, %eax + je .L49 + ALIGN_4 + +.L41: +#ifdef LN + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + movl AORIG, AO + leal (AO, %eax, 1), AO + leal (B, %eax, 1), BO +#else + movl B, BO +#endif + 
+ fldz + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L45 + ALIGN_4 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + FLD -8 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -7 * SIZE(AO) + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -6 * SIZE(AO) + FLD -6 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -5 * SIZE(AO) + FLD -5 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + addl $4 * SIZE,AO + addl $4 * SIZE,BO + + decl %eax + jne .L42 + ALIGN_4 + +.L45: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + and $3, %eax + je .L48 + ALIGN_4 + +.L46: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + addl $1 * SIZE,AO + addl $1 * SIZE,BO + + decl %eax + jne .L46 + ALIGN_4 + +.L48: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + sall $BASE_SHIFT, %eax + + movl AORIG, AO + leal (AO, %eax, 1), AO + leal (B, %eax, 1), BO +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(BO) + fsubp %st, %st(1) +#else + FLD -8 * SIZE(AO) + fsubp %st, %st(1) +#endif + +#ifdef LN + FLD -8 * SIZE(AO) + fmulp %st, %st(1) +#endif + +#ifdef LT + FLD -8 * SIZE(AO) + fmulp %st, %st(1) +#endif + +#ifdef RN + FLD -8 * SIZE(BO) + fmulp %st, %st(1) +#endif + +#ifdef RT + FLD -8 * SIZE(BO) + fmulp %st, %st(1) +#endif + +#ifdef LN + subl $1 * SIZE, CO +#endif + +#if defined(LN) || defined(LT) + fld %st + FST -8 * SIZE(BO) +#else + fld %st + FST -8 * SIZE(AO) +#endif + + FST 0 * SIZE(CO) + +#ifndef LN + addl $1 * SIZE, CO +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $BASE_SHIFT, %eax + leal (AO, %eax, 1), AO + leal (BO, %eax, 1), BO +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $0 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L49: +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + leal (B, %eax, 1), B +#endif + +#if defined(LT) || defined(RN) + movl BO, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/qtrsm_kernel_RT_2x2.S b/kernel/x86/qtrsm_kernel_RT_2x2.S new file mode 100644 index 0000000000..a0a4dafe3d --- /dev/null +++ b/kernel/x86/qtrsm_kernel_RT_2x2.S @@ -0,0 +1,1231 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef OPTERON +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#else +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#define PREFETCHSIZE (5 + 4 * 10) +#define STACK 16 +#define ARGS 16 + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) +#define AORIG 12 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#define A 32 + STACK + ARGS(%esp) +#define ARG_B 36 + STACK + ARGS(%esp) +#define C 40 + STACK + ARGS(%esp) +#define ARG_LDC 44 + STACK + ARGS(%esp) +#define OFFSET 48 + STACK + ARGS(%esp) + +#define I %esi +#define B %ebx +#define CO %edi +#define AO %edx +#define BO %ecx +#define LDC %ebp + +#define PREFETCH_OFFSET 48 + + PROLOGUE + + subl $ARGS, %esp # Generate Stack Frame + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_LDC, LDC + movl ARG_B, B + sall $BASE_SHIFT, LDC + + addl $8 * SIZE, A + addl $8 * SIZE, B + + +#ifdef LN + movl M, %eax + sall $BASE_SHIFT, %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + sall $BASE_SHIFT, %eax + imull K, %eax + addl %eax, B + + movl N, %eax + imull %ebp, %eax + addl %eax, C +#endif + +#ifdef RN + movl OFFSET, %eax + negl %eax + movl %eax, KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + testl $1, %eax + je .L30 + +#if defined(LT) || defined(RN) + movl A, AO +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO +#ifndef RT + addl LDC, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, I + sarl $1, I + je .L40 + ALIGN_4 + +.L31: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + movl AORIG, AO + leal (AO, %eax, 2), AO + leal (B, %eax, 1), BO +#else + movl B, BO +#endif + + fldz + fldz + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(CO) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(CO) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(BO) + FLD -8 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + 
fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + FLD -6 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -5 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -6 * SIZE(BO) + FLD -4 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -3 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -5 * SIZE(BO) + FLD -2 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -1 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addl $8 * SIZE,AO + addl $4 * SIZE,BO + + decl %eax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + and $3, %eax + je .L38 + ALIGN_4 + +.L36: + FLD -8 * SIZE(BO) + + FLD -8 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addl $2 * SIZE,AO + addl $1 * SIZE,BO + + decl %eax + jne .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + sall $BASE_SHIFT, %eax + + movl AORIG, AO + leal (AO, %eax, 2), AO + leal (B, %eax, 1), BO +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(BO) + fsubp %st, %st(1) + FLD -7 * SIZE(BO) + fsubp %st, %st(2) +#else + FLD -8 * SIZE(AO) + fsubp %st, %st(1) + FLD -7 * SIZE(AO) + fsubp %st, %st(2) +#endif + +#ifdef LN + FLD -5 * SIZE(AO) + fmulp %st, %st(2) + + FLD -6 * SIZE(AO) + fmul %st(2), %st + + fsubrp %st, %st(1) + FLD -8 * SIZE(AO) + fmulp %st, %st(1) +#endif + +#ifdef LT + FLD -8 * SIZE(AO) + fmulp %st, %st(1) + + FLD -7 * SIZE(AO) + fmul %st(1), %st + + fsubrp %st, %st(2) + + FLD -5 * SIZE(AO) + fmulp %st, %st(2) +#endif + +#ifdef RN + FLD -8 * SIZE(BO) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef RT + FLD -8 * SIZE(BO) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef LN + subl $2 * SIZE, CO +#endif + +#if defined(LN) || defined(LT) + fld %st + FST -8 * SIZE(BO) + fxch %st(1) + fld %st + FST -7 * SIZE(BO) +#else + fld %st + FST -8 * SIZE(AO) + fxch %st(1) + fld %st + FST -7 * SIZE(AO) +#endif + + FST 1 * SIZE(CO) + FST 0 * SIZE(CO) + +#ifndef LN + addl $2 * SIZE, CO +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $BASE_SHIFT, %eax + leal (AO, %eax, 2), AO + leal (BO, %eax, 1), BO +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl I + jne .L31 + ALIGN_4 + +.L40: + movl M, %eax + andl $1, %eax + je .L49 + ALIGN_4 + +.L41: +#ifdef LN + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + movl AORIG, AO + leal (AO, %eax, 1), AO + leal (B, %eax, 1), BO +#else + movl B, BO +#endif + + fldz + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L45 + ALIGN_4 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + FLD -8 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -7 * SIZE(AO) + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -6 * SIZE(AO) + FLD -6 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -5 * SIZE(AO) + FLD -5 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + addl $4 * SIZE,AO + addl $4 * SIZE,BO + + decl %eax + jne .L42 + ALIGN_4 + +.L45: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + and $3, %eax + je .L48 + ALIGN_4 + +.L46: + FLD 
-8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + addl $1 * SIZE,AO + addl $1 * SIZE,BO + + decl %eax + jne .L46 + ALIGN_4 + +.L48: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + sall $BASE_SHIFT, %eax + + movl AORIG, AO + leal (AO, %eax, 1), AO + leal (B, %eax, 1), BO +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(BO) + fsubp %st, %st(1) +#else + FLD -8 * SIZE(AO) + fsubp %st, %st(1) +#endif + +#ifdef LN + FLD -8 * SIZE(AO) + fmulp %st, %st(1) +#endif + +#ifdef LT + FLD -8 * SIZE(AO) + fmulp %st, %st(1) +#endif + +#ifdef RN + FLD -8 * SIZE(BO) + fmulp %st, %st(1) +#endif + +#ifdef RT + FLD -8 * SIZE(BO) + fmulp %st, %st(1) +#endif + +#ifdef LN + subl $1 * SIZE, CO +#endif + +#if defined(LN) || defined(LT) + fld %st + FST -8 * SIZE(BO) +#else + fld %st + FST -8 * SIZE(AO) +#endif + + FST 0 * SIZE(CO) + +#ifndef LN + addl $1 * SIZE, CO +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $BASE_SHIFT, %eax + leal (AO, %eax, 1), AO + leal (BO, %eax, 1), BO +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $0 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L49: +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + leal (B, %eax, 1), B +#endif + +#if defined(LT) || defined(RN) + movl BO, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L30: + movl N, %eax + sarl $1, %eax + movl %eax, J + je .L999 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movl A, AO +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + + lea (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, I + sarl $1, I + je .L20 + ALIGN_4 + +.L11: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + movl AORIG, AO + leal (AO, %eax, 2), AO + leal (B, %eax, 2), BO +#else + movl B, BO +#endif + + fldz + fldz + fldz + fldz + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(CO) + prefetchw 2 * SIZE(CO, LDC, 1) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(CO) + prefetchnta 2 * SIZE(CO, LDC, 1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + FLD -6 * SIZE(AO) + + FLD -6 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -5 * SIZE(BO) + fmul %st, %st(2) + + FLD -5 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + PREFETCH (PREFETCHSIZE + 4) * SIZE(AO) + + FLD -4 * SIZE(AO) + + FLD -4 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -3 * SIZE(BO) + fmul %st, %st(2) + + FLD -3 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + FLD -2 * SIZE(AO) + + FLD -2 * 
SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -1 * SIZE(BO) + fmul %st, %st(2) + + FLD -1 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + addl $8 * SIZE,AO + addl $8 * SIZE,BO + + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + and $3, %eax + je .L18 + ALIGN_4 + +.L16: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + addl $2 * SIZE,AO + addl $2 * SIZE,BO + + decl %eax + jne .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + sall $BASE_SHIFT, %eax + + movl AORIG, AO + leal (AO, %eax, 2), AO + leal (B, %eax, 2), BO +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(BO) + fsubp %st, %st(1) + FLD -7 * SIZE(BO) + fsubp %st, %st(2) + FLD -6 * SIZE(BO) + fsubp %st, %st(3) + FLD -5 * SIZE(BO) + fsubp %st, %st(4) +#else + FLD -8 * SIZE(AO) + fsubp %st, %st(1) + FLD -7 * SIZE(AO) + fsubp %st, %st(3) + FLD -6 * SIZE(AO) + fsubp %st, %st(2) + FLD -5 * SIZE(AO) + fsubp %st, %st(4) +#endif + +#ifdef LN + FLD -5 * SIZE(AO) + fmul %st, %st(3) + fmulp %st, %st(4) + + FLD -6 * SIZE(AO) + fmul %st(3), %st + FLD -6 * SIZE(AO) + fmul %st(5), %st + + fsubrp %st, %st(3) + fsubrp %st, %st(1) + + FLD -8 * SIZE(AO) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef LT + FLD -8 * SIZE(AO) + fmul %st, %st(1) + fmulp %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st(1), %st + FLD -7 * SIZE(AO) + fmul %st(3), %st + + fsubrp %st, %st(5) + fsubrp %st, %st(3) + + FLD -5 * SIZE(AO) + fmul %st, %st(3) + fmulp %st, %st(4) +#endif + +#ifdef RN + FLD -8 * SIZE(BO) + fmul %st, %st(1) + fmulp %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st(1), %st + FLD -7 * SIZE(BO) + fmul %st(4), %st + + fsubrp %st, %st(5) + fsubrp %st, %st(2) + + FLD -5 * SIZE(BO) + fmul %st, %st(2) + fmulp %st, %st(4) +#endif + +#ifdef RT + FLD -5 * SIZE(BO) + fmul %st, %st(2) + fmulp %st, %st(4) + + FLD -6 * SIZE(BO) + fmul %st(2), %st + FLD -6 * SIZE(BO) + fmul %st(5), %st + + fsubrp %st, %st(4) + fsubrp %st, %st(1) + + FLD -8 * SIZE(BO) + fmul %st, %st(1) + fmulp %st, %st(3) +#endif + +#ifdef LN + subl $2 * SIZE, CO +#endif + +#if defined(LN) || defined(LT) + fld %st + FST -8 * SIZE(BO) + fxch %st(1) + fld %st + FST -7 * SIZE(BO) + fxch %st(2) + fld %st + FST -6 * SIZE(BO) + fxch %st(3) + fld %st + FST -5 * SIZE(BO) + + FST 1 * SIZE(CO, LDC) + FST 0 * SIZE(CO) + FST 0 * SIZE(CO, LDC) + FST 1 * SIZE(CO) +#else + fld %st + FST -8 * SIZE(AO) + fxch %st(2) + fld %st + FST -7 * SIZE(AO) + fxch %st(1) + fld %st + FST -6 * SIZE(AO) + fxch %st(3) + fld %st + FST -5 * SIZE(AO) + + FST 1 * SIZE(CO, LDC) + FST 1 * SIZE(CO) + FST 0 * SIZE(CO) + FST 0 * SIZE(CO, LDC) +#endif + +#ifndef LN + addl $2 * SIZE, CO +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $BASE_SHIFT, %eax + leal (AO, %eax, 2), AO + leal (BO, %eax, 2), BO +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl I + jne .L11 + ALIGN_4 + +.L20: + movl M, %eax + andl $1, %eax + je .L29 + ALIGN_4 + +.L21: +#ifdef LN + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + 
+#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + movl AORIG, AO + leal (AO, %eax, 1), AO + leal (B, %eax, 2), BO +#else + movl B, BO +#endif + + fldz + fldz + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + + FLD -6 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -5 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -6 * SIZE(AO) + + FLD -4 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -3 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -5 * SIZE(AO) + + FLD -2 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -1 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addl $4 * SIZE,AO + addl $8 * SIZE,BO + + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + and $3, %eax + je .L28 + ALIGN_4 + +.L26: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addl $1 * SIZE,AO + addl $2 * SIZE,BO + + decl %eax + jne .L26 + ALIGN_4 + +.L28: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + sall $BASE_SHIFT, %eax + + movl AORIG, AO + leal (AO, %eax, 1), AO + leal (B, %eax, 2), BO +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(BO) + fsubp %st, %st(1) + FLD -7 * SIZE(BO) + fsubp %st, %st(2) +#else + FLD -8 * SIZE(AO) + fsubp %st, %st(1) + FLD -7 * SIZE(AO) + fsubp %st, %st(2) +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(AO) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef RN + FLD -8 * SIZE(BO) + fmulp %st, %st(1) + + FLD -7 * SIZE(BO) + fmul %st(1), %st + + fsubrp %st, %st(2) + + FLD -5 * SIZE(BO) + fmulp %st, %st(2) +#endif + +#ifdef RT + FLD -5 * SIZE(BO) + fmulp %st, %st(2) + + FLD -6 * SIZE(BO) + fmul %st(2), %st + + fsubrp %st, %st(1) + + FLD -8 * SIZE(BO) + fmulp %st, %st(1) +#endif + +#ifdef LN + subl $1 * SIZE, CO +#endif + +#if defined(LN) || defined(LT) + fld %st + FST -8 * SIZE(BO) + fxch %st(1) + fld %st + FST -7 * SIZE(BO) +#else + fld %st + FST -8 * SIZE(AO) + fxch %st(1) + fld %st + FST -7 * SIZE(AO) +#endif + + FST 0 * SIZE(CO, LDC) + FST 0 * SIZE(CO) + +#ifndef LN + addl $1 * SIZE, CO +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $BASE_SHIFT, %eax + leal (AO, %eax, 1), AO + leal (BO, %eax, 2), BO +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $0 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L29: +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl BO, B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J + jne .L01 + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/rot.S b/kernel/x86/rot.S new file mode 100644 index 0000000000..111266a724 --- /dev/null +++ b/kernel/x86/rot.S @@ -0,0 +1,388 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define STACK_N 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) +#define STACK_Y 16 + STACK + ARGS(%esp) +#define STACK_INCY 20 + STACK + ARGS(%esp) +#define STACK_C 24 + STACK + ARGS(%esp) +#ifdef XDOUBLE +#define STACK_S 40 + STACK + ARGS(%esp) +#elif defined DOUBLE +#define STACK_S 32 + STACK + ARGS(%esp) +#else +#define STACK_S 28 + STACK + ARGS(%esp) +#endif + +#define N %ebx +#define X %esi +#define INCX %ecx +#define Y %edi +#define INCY %edx + +#define I %eax + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCH_SIZE 144 +#endif + +#ifdef OPTERON +#define PREFETCH prefetchw +#define PREFETCH_SIZE 144 +#endif + + PROLOGUE + + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + movl STACK_N, N + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + + FLD STACK_S + FLD STACK_C + + sall $BASE_SHIFT, INCX + sall $BASE_SHIFT, INCY + + testl N, N + jle .L999 + + cmpl $SIZE, INCX + jne .L50 + cmpl $SIZE, INCY + jne .L50 + + movl N, I + sarl $2, I + jle .L15 + ALIGN_4 + +.L10: +#ifdef PENTIUM4 + PREFETCH (PREFETCH_SIZE + 0) * SIZE(X) +#endif +#ifdef OPTERON + PREFETCH (PREFETCH_SIZE + 0) * SIZE(X) +#endif + + FLD 0 * SIZE(X) + FLD 0 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 0 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 1 * SIZE(X) + FLD 1 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 1 * SIZE(X) + + 
fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 1 * SIZE(Y) + +#ifdef PENTIUM4 + PREFETCH (PREFETCH_SIZE + 0) * SIZE(Y) +#endif +#ifdef OPTERON + PREFETCH (PREFETCH_SIZE + 0) * SIZE(Y) +#endif + + FLD 2 * SIZE(X) + FLD 2 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 2 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 2 * SIZE(Y) + + FLD 3 * SIZE(X) + FLD 3 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 3 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 3 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + + decl I + jg .L10 + ALIGN_4 + +.L15: + movl N, I + andl $3, I + jle .L999 + ALIGN_4 + +.L16: + FLD 0 * SIZE(X) + FLD 0 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 0 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 0 * SIZE(Y) + + addl $SIZE, X + addl $SIZE, Y + + decl I + jg .L16 + jmp .L999 + ALIGN_4 + +.L50: + movl N, I + sarl $2, I + jle .L55 + ALIGN_4 + +.L51: + FLD 0 * SIZE(X) + FLD 0 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 0 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 0 * SIZE(Y) + + addl INCX, X + addl INCY, Y + + FLD 0 * SIZE(X) + FLD 0 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 0 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 0 * SIZE(Y) + + addl INCX, X + addl INCY, Y + + FLD 0 * SIZE(X) + FLD 0 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 0 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 0 * SIZE(Y) + + addl INCX, X + addl INCY, Y + + FLD 0 * SIZE(X) + FLD 0 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 0 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 0 * SIZE(Y) + + addl INCX, X + addl INCY, Y + + decl I + jg .L51 + ALIGN_4 + +.L55: + movl N, I + andl $3, I + jle .L999 + ALIGN_4 + +.L56: + FLD 0 * SIZE(X) + FLD 0 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 0 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 0 * SIZE(Y) + + addl INCX, X + addl INCY, Y + + decl I + jg .L56 + ALIGN_4 + + +.L999: + ffreep %st(0) + ffreep %st(0) + + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86/rot_sse.S b/kernel/x86/rot_sse.S new file mode 100644 index 0000000000..af9f12f62c --- /dev/null +++ b/kernel/x86/rot_sse.S @@ -0,0 +1,1119 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define STACK_N 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) +#define STACK_Y 16 + STACK + ARGS(%esp) +#define STACK_INCY 20 + STACK + ARGS(%esp) +#define STACK_C 24 + STACK + ARGS(%esp) +#define STACK_S 28 + STACK + ARGS(%esp) + +#define N %ebx +#define X %esi +#define INCX %ecx +#define Y %edi +#define INCY %edx + +#define I %eax + +#define C %xmm6 +#define S %xmm7 + +#include "l1param.h" + + PROLOGUE + PROFCODE + + pushl %edi + pushl %esi + pushl %ebx + + movl STACK_N, N + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + + leal (, INCX, SIZE), INCX + leal (, INCY, SIZE), INCY + + movss STACK_C, C + movss STACK_S, S + + shufps $0x0, C, C + shufps $0x0, S, S + + cmpl $0, N + jle .L999 + + cmpl $SIZE, INCX + jne .L50 + cmpl $SIZE, INCY + jne .L50 + + testl $SIZE, X + je .L05 + + movss 0 * SIZE(Y), %xmm1 + movss 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulss C, %xmm0 + mulss S, %xmm1 + + mulss C, %xmm2 + mulss S, %xmm3 + + addss %xmm1, %xmm0 + subss %xmm3, %xmm2 + + movss %xmm0, 0 * SIZE(X) + movss %xmm2, 0 * SIZE(Y) + + addl $1 * SIZE, X + addl $1 * SIZE, Y + decl N + jle .L999 + +.L05: + testl $2 * SIZE, X + je .L10 + + cmpl $1, N + je .L17 + +#ifdef movsd + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 +#endif + + movsd 0 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + subl $2, N + jle .L999 + ALIGN_2 + +.L10: + testl $3 * SIZE, Y + jne .L20 + + movl N, I + sarl $5, I + jle .L14 + ALIGN_3 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 
+ + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + movhps %xmm2, 2 * SIZE(Y) + + movsd 4 * SIZE(Y), %xmm1 + movhps 6 * SIZE(Y), %xmm1 + movaps 4 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps %xmm0, 4 * SIZE(X) + movlps %xmm2, 4 * SIZE(Y) + movhps %xmm2, 6 * SIZE(Y) + + movsd 8 * SIZE(Y), %xmm1 + movhps 10 * SIZE(Y), %xmm1 + movaps 8 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 8 * SIZE(X) + movlps %xmm2, 8 * SIZE(Y) + movhps %xmm2, 10 * SIZE(Y) + + movsd 12 * SIZE(Y), %xmm1 + movhps 14 * SIZE(Y), %xmm1 + movaps 12 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 12 * SIZE(X) + movlps %xmm2, 12 * SIZE(Y) + movhps %xmm2, 14 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movsd 16 * SIZE(Y), %xmm1 + movhps 18 * SIZE(Y), %xmm1 + movaps 16 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 16 * SIZE(X) + movlps %xmm2, 16 * SIZE(Y) + movhps %xmm2, 18 * SIZE(Y) + + movsd 20 * SIZE(Y), %xmm1 + movhps 22 * SIZE(Y), %xmm1 + movaps 20 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 20 * SIZE(X) + movlps %xmm2, 20 * SIZE(Y) + movhps %xmm2, 22 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movsd 24 * SIZE(Y), %xmm1 + movhps 26 * SIZE(Y), %xmm1 + movaps 24 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 24 * SIZE(X) + movlps %xmm2, 24 * SIZE(Y) + movhps %xmm2, 26 * SIZE(Y) + + movsd 28 * SIZE(Y), %xmm1 + movhps 30 * SIZE(Y), %xmm1 + movaps 28 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 28 * SIZE(X) + movlps %xmm2, 28 * SIZE(Y) + movhps %xmm2, 30 * SIZE(Y) + + addl $32 * SIZE, X + addl $32 * SIZE, Y + + decl I + jg .L11 + ALIGN_3 + +.L14: + testl $31, N + jle .L999 + + testl $16, N + jle .L15 + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + movhps %xmm2, 2 * SIZE(Y) + + movsd 4 * SIZE(Y), %xmm1 + movhps 6 * SIZE(Y), %xmm1 + movaps 4 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 4 * SIZE(X) + movlps %xmm2, 4 * SIZE(Y) + movhps %xmm2, 6 * 
SIZE(Y) + + movsd 8 * SIZE(Y), %xmm1 + movhps 10 * SIZE(Y), %xmm1 + movaps 8 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 8 * SIZE(X) + movlps %xmm2, 8 * SIZE(Y) + movhps %xmm2, 10 * SIZE(Y) + + movsd 12 * SIZE(Y), %xmm1 + movhps 14 * SIZE(Y), %xmm1 + movaps 12 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 12 * SIZE(X) + movlps %xmm2, 12 * SIZE(Y) + movhps %xmm2, 14 * SIZE(Y) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L15: + testl $8, N + jle .L16 + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + movhps %xmm2, 2 * SIZE(Y) + + movsd 4 * SIZE(Y), %xmm1 + movhps 6 * SIZE(Y), %xmm1 + movaps 4 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 4 * SIZE(X) + movlps %xmm2, 4 * SIZE(Y) + movhps %xmm2, 6 * SIZE(Y) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L16: + testl $4, N + jle .L17 + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + movhps %xmm2, 2 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L17: + testl $2, N + jle .L18 + +#ifdef movsd + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 +#endif + + movsd 0 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L18: + testl $1, N + jle .L999 + + movss 0 * SIZE(Y), %xmm1 + movss 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulss C, %xmm0 + mulss S, %xmm1 + + mulss C, %xmm2 + mulss S, %xmm3 + + addss %xmm1, %xmm0 + subss %xmm3, %xmm2 + + movss %xmm0, 0 * SIZE(X) + movss %xmm2, 0 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L20: + movl N, I + sarl $5, I + jle .L24 + ALIGN_3 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + movhps %xmm2, 2 * SIZE(Y) + + movsd 4 * SIZE(Y), %xmm1 + movhps 6 * SIZE(Y), %xmm1 + movaps 4 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 4 * SIZE(X) + movlps %xmm2, 4 * SIZE(Y) + movhps %xmm2, 6 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movsd 8 
* SIZE(Y), %xmm1 + movhps 10 * SIZE(Y), %xmm1 + movaps 8 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 8 * SIZE(X) + movlps %xmm2, 8 * SIZE(Y) + movhps %xmm2, 10 * SIZE(Y) + + movsd 12 * SIZE(Y), %xmm1 + movhps 14 * SIZE(Y), %xmm1 + movaps 12 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 12 * SIZE(X) + movlps %xmm2, 12 * SIZE(Y) + movhps %xmm2, 14 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movsd 16 * SIZE(Y), %xmm1 + movhps 18 * SIZE(Y), %xmm1 + movaps 16 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 16 * SIZE(X) + movlps %xmm2, 16 * SIZE(Y) + movhps %xmm2, 18 * SIZE(Y) + + movsd 20 * SIZE(Y), %xmm1 + movhps 22 * SIZE(Y), %xmm1 + movaps 20 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 20 * SIZE(X) + movlps %xmm2, 20 * SIZE(Y) + movhps %xmm2, 22 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movsd 24 * SIZE(Y), %xmm1 + movhps 26 * SIZE(Y), %xmm1 + movaps 24 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 24 * SIZE(X) + movlps %xmm2, 24 * SIZE(Y) + movhps %xmm2, 26 * SIZE(Y) + + movsd 28 * SIZE(Y), %xmm1 + movhps 30 * SIZE(Y), %xmm1 + movaps 28 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 28 * SIZE(X) + movlps %xmm2, 28 * SIZE(Y) + movhps %xmm2, 30 * SIZE(Y) + + addl $32 * SIZE, X + addl $32 * SIZE, Y + decl I + jg .L21 + ALIGN_3 + +.L24: + testl $31, N + jle .L999 + + testl $16, N + jle .L25 + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + movhps %xmm2, 2 * SIZE(Y) + + movsd 4 * SIZE(Y), %xmm1 + movhps 6 * SIZE(Y), %xmm1 + movaps 4 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 4 * SIZE(X) + movlps %xmm2, 4 * SIZE(Y) + movhps %xmm2, 6 * SIZE(Y) + + movsd 8 * SIZE(Y), %xmm1 + movhps 10 * SIZE(Y), %xmm1 + movaps 8 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 8 * SIZE(X) + movlps %xmm2, 8 * SIZE(Y) + movhps %xmm2, 10 * SIZE(Y) + + movsd 12 * SIZE(Y), %xmm1 + movhps 14 * SIZE(Y), %xmm1 + movaps 12 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, 
%xmm2 + + movaps %xmm0, 12 * SIZE(X) + movlps %xmm2, 12 * SIZE(Y) + movhps %xmm2, 14 * SIZE(Y) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L25: + testl $8, N + jle .L26 + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + movhps %xmm2, 2 * SIZE(Y) + + movsd 4 * SIZE(Y), %xmm1 + movhps 6 * SIZE(Y), %xmm1 + movaps 4 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 4 * SIZE(X) + movlps %xmm2, 4 * SIZE(Y) + movhps %xmm2, 6 * SIZE(Y) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + + +.L26: + testl $4, N + jle .L27 + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + movhps %xmm2, 2 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L27: + testl $2, N + jle .L28 + +#ifdef movsd + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 +#endif + + movsd 0 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L28: + testl $1, N + jle .L999 + + movss 0 * SIZE(Y), %xmm1 + movss 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulss C, %xmm0 + mulss S, %xmm1 + + mulss C, %xmm2 + mulss S, %xmm3 + + addss %xmm1, %xmm0 + subss %xmm3, %xmm2 + + movss %xmm0, 0 * SIZE(X) + movss %xmm2, 0 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L50: + movl N, I + sarl $2, I + jle .L55 + ALIGN_3 + +.L53: + movss (Y), %xmm1 + movss (X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulss C, %xmm0 + mulss S, %xmm1 + + mulss C, %xmm2 + mulss S, %xmm3 + + addss %xmm1, %xmm0 + subss %xmm3, %xmm2 + + movss %xmm0, (X) + movss %xmm2, (Y) + + addl INCX, X + addl INCY, Y + + movss (Y), %xmm1 + movss (X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulss C, %xmm0 + mulss S, %xmm1 + + mulss C, %xmm2 + mulss S, %xmm3 + + addss %xmm1, %xmm0 + subss %xmm3, %xmm2 + + movss %xmm0, (X) + movss %xmm2, (Y) + + addl INCX, X + addl INCY, Y + + movss (Y), %xmm1 + movss (X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulss C, %xmm0 + mulss S, %xmm1 + + mulss C, %xmm2 + mulss S, %xmm3 + + addss %xmm1, %xmm0 + subss %xmm3, %xmm2 + + movss %xmm0, (X) + movss %xmm2, (Y) + + addl INCX, X + addl INCY, Y + + movss (Y), %xmm1 + movss (X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulss C, %xmm0 + mulss S, %xmm1 + + mulss C, %xmm2 + mulss S, %xmm3 + + addss %xmm1, %xmm0 + subss %xmm3, %xmm2 + + movss %xmm0, (X) + movss %xmm2, (Y) + + addl INCX, X + addl INCY, Y + + decl I + jg .L53 + ALIGN_3 + +.L55: + movl N, I + andl $3, I + jle .L999 + ALIGN_3 + +.L56: + movss (Y), %xmm1 + movss (X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulss C, %xmm0 + mulss S, %xmm1 + + mulss C, %xmm2 + mulss S, %xmm3 + + addss %xmm1, %xmm0 + subss %xmm3, %xmm2 + + movss %xmm0, 
(X) + movss %xmm2, (Y) + + addl INCX, X + addl INCY, Y + + decl I + jg .L56 + ALIGN_3 + +.L999: + popl %ebx + popl %esi + popl %edi + + ret + + EPILOGUE diff --git a/kernel/x86/rot_sse2.S b/kernel/x86/rot_sse2.S new file mode 100644 index 0000000000..8ec1d44bb0 --- /dev/null +++ b/kernel/x86/rot_sse2.S @@ -0,0 +1,960 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define STACK_N 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) +#define STACK_Y 16 + STACK + ARGS(%esp) +#define STACK_INCY 20 + STACK + ARGS(%esp) +#define STACK_C 24 + STACK + ARGS(%esp) +#define STACK_S 32 + STACK + ARGS(%esp) + +#define N %ebx +#define X %esi +#define INCX %ecx +#define Y %edi +#define INCY %edx + +#define I %eax + +#include "l1param.h" + +#define C %xmm6 +#define S %xmm7 + + PROLOGUE + PROFCODE + + pushl %edi + pushl %esi + pushl %ebx + + movl STACK_N, N + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + + leal (, INCX, SIZE), INCX + leal (, INCY, SIZE), INCY + + movsd STACK_C, C + movsd STACK_S, S + + pshufd $0x44, C, C + pshufd $0x44, S, S + + cmpl $0, N + jle .L999 + + cmpl $SIZE, INCX + jne .L50 + cmpl $SIZE, INCY + jne .L50 + + testl $SIZE, X + je .L10 + + movsd 0 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulsd C, %xmm0 + mulsd S, %xmm1 + + mulsd C, %xmm2 + mulsd S, %xmm3 + + addsd %xmm1, %xmm0 + subsd %xmm3, %xmm2 + + movsd %xmm0, 0 * SIZE(X) + movsd %xmm2, 0 * SIZE(Y) + + addl $1 * SIZE, X + addl $1 * SIZE, Y + decl N + jle .L999 + ALIGN_2 + +.L10: + testl $SIZE, Y + jne .L20 + + movl N, I + sarl $4, I + jle .L14 + ALIGN_3 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movapd 0 * SIZE(Y), %xmm1 + movapd 0 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 0 * SIZE(X) + movapd %xmm2, 0 * SIZE(Y) + + movapd 2 * SIZE(Y), %xmm1 + movapd 2 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 2 * SIZE(X) + movapd %xmm2, 2 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movapd 4 * SIZE(Y), %xmm1 + movapd 4 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 4 * SIZE(X) + movapd %xmm2, 4 * SIZE(Y) + + movapd 6 * SIZE(Y), %xmm1 + movapd 6 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 6 * SIZE(X) + movapd %xmm2, 6 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movapd 8 * SIZE(Y), %xmm1 + movapd 8 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 8 * SIZE(X) + movapd %xmm2, 8 * SIZE(Y) + + movapd 10 * SIZE(Y), %xmm1 + movapd 10 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 10 * SIZE(X) + movapd %xmm2, 10 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movapd 12 * SIZE(Y), %xmm1 + movapd 12 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, 
%xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 12 * SIZE(X) + movapd %xmm2, 12 * SIZE(Y) + + movapd 14 * SIZE(Y), %xmm1 + movapd 14 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 14 * SIZE(X) + movapd %xmm2, 14 * SIZE(Y) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + + decl I + jg .L11 + ALIGN_3 + +.L14: + testl $15, N + jle .L999 + + testl $8, N + jle .L15 + + movapd 0 * SIZE(Y), %xmm1 + movapd 0 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 0 * SIZE(X) + movapd %xmm2, 0 * SIZE(Y) + + movapd 2 * SIZE(Y), %xmm1 + movapd 2 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 2 * SIZE(X) + movapd %xmm2, 2 * SIZE(Y) + + movapd 4 * SIZE(Y), %xmm1 + movapd 4 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 4 * SIZE(X) + movapd %xmm2, 4 * SIZE(Y) + + movapd 6 * SIZE(Y), %xmm1 + movapd 6 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 6 * SIZE(X) + movapd %xmm2, 6 * SIZE(Y) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L15: + testl $4, N + jle .L16 + + movapd 0 * SIZE(Y), %xmm1 + movapd 0 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 0 * SIZE(X) + movapd %xmm2, 0 * SIZE(Y) + + movapd 2 * SIZE(Y), %xmm1 + movapd 2 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 2 * SIZE(X) + movapd %xmm2, 2 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L16: + testl $2, N + jle .L17 + + movapd 0 * SIZE(Y), %xmm1 + movapd 0 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 0 * SIZE(X) + movapd %xmm2, 0 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L17: + testl $1, N + jle .L999 + + movsd 0 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulsd C, %xmm0 + mulsd S, %xmm1 + + mulsd C, %xmm2 + mulsd S, %xmm3 + + addsd %xmm1, %xmm0 + subsd %xmm3, %xmm2 + + movsd %xmm0, 0 * SIZE(X) + movsd %xmm2, 0 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L20: + movapd -1 * SIZE(Y), %xmm1 + + movl N, I + sarl $4, I + jle .L24 + ALIGN_3 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movapd 1 * SIZE(Y), %xmm4 + movapd 0 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm4, %xmm1 + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 0 * SIZE(X) + movlpd 
%xmm2, 0 * SIZE(Y) + movhpd %xmm2, 1 * SIZE(Y) + + movapd 3 * SIZE(Y), %xmm1 + movapd 2 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm1, %xmm4 + movapd %xmm4, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm4 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm4, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 2 * SIZE(X) + movlpd %xmm2, 2 * SIZE(Y) + movhpd %xmm2, 3 * SIZE(Y) + + movapd 5 * SIZE(Y), %xmm4 + movapd 4 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm4, %xmm1 + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 4 * SIZE(X) + movlpd %xmm2, 4 * SIZE(Y) + movhpd %xmm2, 5 * SIZE(Y) + + movapd 7 * SIZE(Y), %xmm1 + movapd 6 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm1, %xmm4 + movapd %xmm4, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm4 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm4, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 6 * SIZE(X) + movlpd %xmm2, 6 * SIZE(Y) + movhpd %xmm2, 7 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movapd 9 * SIZE(Y), %xmm4 + movapd 8 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm4, %xmm1 + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 8 * SIZE(X) + movlpd %xmm2, 8 * SIZE(Y) + movhpd %xmm2, 9 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movapd 11 * SIZE(Y), %xmm1 + movapd 10 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm1, %xmm4 + movapd %xmm4, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm4 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm4, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 10 * SIZE(X) + movlpd %xmm2, 10 * SIZE(Y) + movhpd %xmm2, 11 * SIZE(Y) + + movapd 13 * SIZE(Y), %xmm4 + movapd 12 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm4, %xmm1 + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 12 * SIZE(X) + movlpd %xmm2, 12 * SIZE(Y) + movhpd %xmm2, 13 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movapd 15 * SIZE(Y), %xmm1 + movapd 14 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm1, %xmm4 + movapd %xmm4, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm4 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm4, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 14 * SIZE(X) + movlpd %xmm2, 14 * SIZE(Y) + movhpd %xmm2, 15 * SIZE(Y) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + decl I + jg .L21 + ALIGN_3 + +.L24: + testl $15, N + jle .L999 + + testl $8, N + jle .L25 + + movapd 1 * SIZE(Y), %xmm4 + movapd 0 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm4, %xmm1 + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 0 * SIZE(X) + movlpd %xmm2, 0 * SIZE(Y) + movhpd %xmm2, 1 * SIZE(Y) + + movapd 3 * SIZE(Y), %xmm1 + movapd 2 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm1, %xmm4 + movapd %xmm4, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm4 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm4, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 2 * SIZE(X) + movlpd %xmm2, 2 * SIZE(Y) + movhpd %xmm2, 3 * SIZE(Y) + + movapd 5 * SIZE(Y), %xmm4 + movapd 4 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm4, %xmm1 + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, 
%xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 4 * SIZE(X) + movlpd %xmm2, 4 * SIZE(Y) + movhpd %xmm2, 5 * SIZE(Y) + + movapd 7 * SIZE(Y), %xmm1 + movapd 6 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm1, %xmm4 + movapd %xmm4, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm4 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm4, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 6 * SIZE(X) + movlpd %xmm2, 6 * SIZE(Y) + movhpd %xmm2, 7 * SIZE(Y) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L25: + testl $4, N + jle .L26 + + movapd 1 * SIZE(Y), %xmm4 + movapd 0 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm4, %xmm1 + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 0 * SIZE(X) + movlpd %xmm2, 0 * SIZE(Y) + movhpd %xmm2, 1 * SIZE(Y) + + movapd 3 * SIZE(Y), %xmm1 + movapd 2 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm1, %xmm4 + movapd %xmm4, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm4 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm4, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 2 * SIZE(X) + movlpd %xmm2, 2 * SIZE(Y) + movhpd %xmm2, 3 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L26: + testl $2, N + jle .L27 + + movapd 1 * SIZE(Y), %xmm4 + movapd 0 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm4, %xmm1 + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 0 * SIZE(X) + movlpd %xmm2, 0 * SIZE(Y) + movhpd %xmm2, 1 * SIZE(Y) + movapd %xmm4, %xmm1 + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L27: + testl $1, N + jle .L999 + + unpckhpd %xmm1, %xmm1 + movsd 0 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulsd C, %xmm0 + mulsd S, %xmm1 + + mulsd C, %xmm2 + mulsd S, %xmm3 + + addsd %xmm1, %xmm0 + subsd %xmm3, %xmm2 + + movsd %xmm0, 0 * SIZE(X) + movsd %xmm2, 0 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L50: + movl N, I + sarl $2, I + jle .L55 + ALIGN_3 + +.L53: + movsd (Y), %xmm1 + movhpd (Y, INCY), %xmm1 + movsd (X), %xmm0 + movhpd (X, INCX), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movlpd %xmm0, (X) + movhpd %xmm0, (X, INCX) + movlpd %xmm2, (Y) + movhpd %xmm2, (Y, INCY) + + leal (X, INCX, 2), X + leal (Y, INCY, 2), Y + + movsd (Y), %xmm1 + movhpd (Y, INCY), %xmm1 + movsd (X), %xmm0 + movhpd (X, INCX), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movlpd %xmm0, (X) + movhpd %xmm0, (X, INCX) + movlpd %xmm2, (Y) + movhpd %xmm2, (Y, INCY) + + leal (X, INCX, 2), X + leal (Y, INCY, 2), Y + + decl I + jg .L53 + ALIGN_3 + +.L55: + movl N, I + andl $3, I + jle .L999 + ALIGN_3 + +.L56: + movsd (Y), %xmm1 + movsd (X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulsd C, %xmm0 + mulsd S, %xmm1 + + mulsd C, %xmm2 + mulsd S, %xmm3 + + addsd %xmm1, %xmm0 + subsd %xmm3, %xmm2 + + movsd %xmm0, (X) + movsd %xmm2, (Y) + + addl INCX, X + addl INCY, Y + + decl I + jg .L56 + ALIGN_3 + +.L999: + popl %ebx + popl %esi + popl %edi + + ret + + EPILOGUE diff --git a/kernel/x86/scal.S b/kernel/x86/scal.S new file mode 100644 index 0000000000..377d4ef616 --- /dev/null +++ b/kernel/x86/scal.S @@ -0,0 +1,352 @@ 
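For reference (illustrative C, not part of the imported GotoBLAS2 sources): rot.S above implements the BLAS plane rotation with x87 instructions, while rot_sse.S and rot_sse2.S are the single- and double-precision SSE/SSE2 variants. Their unit-stride paths align X, unroll the main loop (32 floats or 16 doubles per iteration) and prefetch, and the .L50 paths handle arbitrary INCX/INCY one element at a time. The helper name rot_ref and the FLOAT typedef below are assumptions for illustration only.

#include <stddef.h>

typedef double FLOAT;                /* double as in rot_sse2.S; use float for rot_sse.S */

/* Apply the plane rotation x[i] <- c*x[i] + s*y[i], y[i] <- c*y[i] - s*x[i]
   over strided vectors, which is the operation the kernels above compute. */
static void rot_ref(size_t n, FLOAT *x, ptrdiff_t incx,
                    FLOAT *y, ptrdiff_t incy, FLOAT c, FLOAT s)
{
    for (size_t i = 0; i < n; i++) {
        FLOAT xv = *x, yv = *y;
        *x = c * xv + s * yv;        /* updated x element */
        *y = c * yv - s * xv;        /* updated y element */
        x += incx;
        y += incy;
    }
}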
+/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + + PROLOGUE + + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + movl 16(%esp),%edx + FLD 28(%esp) + +#ifdef XDOUBLE + movl 44(%esp),%edi + movl 48(%esp),%esi +#elif defined(DOUBLE) + movl 36(%esp),%edi + movl 40(%esp),%esi +#else + movl 32(%esp),%edi + movl 36(%esp),%esi +#endif + + ftst + fnstsw %ax + andb $68, %ah + je .L300 # Alpha != ZERO + +/* Alpha == ZERO */ + cmpl $1,%esi + jne .L104 + + movl %edx, %ecx # ecx = n + sarl $3, %ecx # (n >> 3) + jle .L102 + ALIGN_4 + +.L101: +#ifndef XDOUBLE + FSTU 0 * SIZE(%edi) + FSTU 1 * SIZE(%edi) + FSTU 2 * SIZE(%edi) + FSTU 3 * SIZE(%edi) + FSTU 4 * SIZE(%edi) + FSTU 5 * SIZE(%edi) + FSTU 6 * SIZE(%edi) + FSTU 7 * SIZE(%edi) +#else + fld %st + FST 0 * SIZE(%edi) + fld %st + FST 1 * SIZE(%edi) + fld %st + FST 2 * SIZE(%edi) + fld %st + FST 3 * SIZE(%edi) + fld %st + FST 4 * SIZE(%edi) + fld %st + FST 5 * SIZE(%edi) + fld %st + FST 6 * SIZE(%edi) + fld %st + FST 7 * SIZE(%edi) +#endif + + addl $8 * SIZE, %edi + decl %ecx + jg .L101 + ALIGN_4 + +.L102: + movl %edx, %ecx + andl $7, %ecx + jle .L999 + ALIGN_4 + +.L103: +#ifndef XDOUBLE + FSTU 0 * SIZE(%edi) +#else + fld %st + FST 0 * SIZE(%edi) +#endif + addl $SIZE, %edi + decl %ecx + jg .L103 + jmp .L999 + ALIGN_4 + +.L104: + sall $BASE_SHIFT, %esi + + movl %edx, %ecx # ecx = n + sarl $3, %ecx # (n >> 3) + jle .L106 + ALIGN_4 + +.L105: +#ifndef XDOUBLE + FSTU 0 * SIZE(%edi) + addl %esi, %edi + FSTU 0 * SIZE(%edi) + addl %esi, %edi + FSTU 0 * SIZE(%edi) + addl %esi, %edi + FSTU 0 * SIZE(%edi) + addl %esi, %edi + FSTU 0 * SIZE(%edi) + addl %esi, %edi + FSTU 0 * SIZE(%edi) + addl %esi, %edi + FSTU 0 * SIZE(%edi) + addl %esi, %edi + FSTU 0 * SIZE(%edi) + addl %esi, %edi +#else + fld %st + FST 0 * SIZE(%edi) + addl %esi, %edi + fld %st + FST 0 * SIZE(%edi) + addl %esi, %edi + fld %st + FST 0 * SIZE(%edi) + addl %esi, %edi + fld %st + FST 0 * SIZE(%edi) + addl %esi, %edi + fld %st + FST 0 * SIZE(%edi) + addl %esi, %edi + fld %st + FST 0 * SIZE(%edi) + addl %esi, %edi + fld %st + FST 0 * SIZE(%edi) + addl %esi, %edi + fld %st + FST 0 * SIZE(%edi) + addl %esi, %edi +#endif + + decl %ecx + jg .L105 + ALIGN_4 + +.L106: + movl %edx, %ecx + andl $7, %ecx + jle .L999 + ALIGN_4 + +.L107: +#ifndef XDOUBLE + FSTU 0 * SIZE(%edi) +#else + fld %st + FST 0 * SIZE(%edi) +#endif + addl %esi, %edi + decl %ecx + jg .L107 + jmp .L999 + ALIGN_4 + +/* Alpha != ZERO */ + +.L300: + cmpl $1,%esi + jne .L304 + + movl %edx, %ecx # ecx = n + sarl $3, %ecx # (n >> 3) + jle .L302 + ALIGN_4 + +.L301: + FLD 0 * SIZE(%edi) + fmul %st(1), %st + FST 0 * SIZE(%edi) + + FLD 1 * SIZE(%edi) + fmul %st(1), %st + FST 1 * SIZE(%edi) + + FLD 2 * SIZE(%edi) + fmul %st(1), %st + FST 2 * SIZE(%edi) + + FLD 3 * SIZE(%edi) + fmul %st(1), %st + FST 3 * SIZE(%edi) + + FLD 4 * SIZE(%edi) + fmul %st(1), %st + FST 4 * SIZE(%edi) + + FLD 5 * SIZE(%edi) + fmul %st(1), %st + FST 5 * SIZE(%edi) + + FLD 6 * SIZE(%edi) + fmul %st(1), %st + FST 6 * SIZE(%edi) + + FLD 7 * SIZE(%edi) + fmul %st(1), %st + FST 7 * SIZE(%edi) + + addl $8 * SIZE, %edi + decl %ecx + jg .L301 + ALIGN_4 + +.L302: + movl %edx, %ecx + andl $7, %ecx + jle .L999 + ALIGN_4 + +.L303: + FLD 0 * SIZE(%edi) + fmul %st(1), %st + FST 0 * SIZE(%edi) + addl $SIZE, %edi + decl %ecx + jg .L303 + jmp .L999 + ALIGN_4 + +.L304: + sall $BASE_SHIFT, %esi + + movl %edx, %ecx # ecx = n + sarl $3, %ecx # (n >> 3) 
+ jle .L306 + ALIGN_4 + +.L305: + FLD 0 * SIZE(%edi) + fmul %st(1), %st + FST 0 * SIZE(%edi) + addl %esi, %edi + + FLD 0 * SIZE(%edi) + fmul %st(1), %st + FST 0 * SIZE(%edi) + addl %esi, %edi + + FLD 0 * SIZE(%edi) + fmul %st(1), %st + FST 0 * SIZE(%edi) + addl %esi, %edi + + FLD 0 * SIZE(%edi) + fmul %st(1), %st + FST 0 * SIZE(%edi) + addl %esi, %edi + + FLD 0 * SIZE(%edi) + fmul %st(1), %st + FST 0 * SIZE(%edi) + addl %esi, %edi + + FLD 0 * SIZE(%edi) + fmul %st(1), %st + FST 0 * SIZE(%edi) + addl %esi, %edi + + FLD 0 * SIZE(%edi) + fmul %st(1), %st + FST 0 * SIZE(%edi) + addl %esi, %edi + + FLD 0 * SIZE(%edi) + fmul %st(1), %st + FST 0 * SIZE(%edi) + addl %esi, %edi + + decl %ecx + jg .L305 + ALIGN_4 + +.L306: + movl %edx, %ecx + andl $7, %ecx + jle .L999 + ALIGN_4 + +.L307: + FLD 0 * SIZE(%edi) + fmul %st(1), %st + FST 0 * SIZE(%edi) + addl %esi, %edi + decl %ecx + jg .L307 + ALIGN_4 + +.L999: + ffreep %st(0) + xorl %eax,%eax + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86/scal_sse.S b/kernel/x86/scal_sse.S new file mode 100644 index 0000000000..aa5ab760e0 --- /dev/null +++ b/kernel/x86/scal_sse.S @@ -0,0 +1,637 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_ALPHA 16 + STACK + ARGS(%esp) +#define STACK_X 20 + STACK + ARGS(%esp) +#define STACK_INCX 24 + STACK + ARGS(%esp) + +#define M %ebx +#define X %ecx +#define INCX %edx +#define I %esi +#define XX %edi + +#include "l1param.h" + + PROLOGUE + + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + + lea (, INCX, SIZE), INCX + + movss STACK_ALPHA, %xmm0 + + testl M, M + jle .L999 + + xorps %xmm1, %xmm1 + comiss %xmm0, %xmm1 + shufps $0, %xmm0, %xmm0 + + jne .L100 + +/* Alpha == ZERO */ + cmpl $SIZE, INCX + jne .L50 + +/* INCX == 1 */ + cmpl $3, M + jle .L14 + + testl $4, X # aligned for double word? + je .L05 + + movss %xmm1, 0 * SIZE(X) + addl $SIZE, X + decl M + jle .L999 + ALIGN_3 + +.L05: + testl $8, X # aligned for quad word? + je .L06 + + movsd %xmm1, 0 * SIZE(X) + addl $2 * SIZE, X + subl $2, M + jle .L999 + ALIGN_3 + +.L06: + movl M, I + sarl $4, I + jle .L12 + ALIGN_4 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm1, 0 * SIZE(X) + movaps %xmm1, 4 * SIZE(X) + movaps %xmm1, 8 * SIZE(X) + movaps %xmm1, 12 * SIZE(X) + addl $16 * SIZE, X + decl I + jg .L11 + ALIGN_4 + +.L12: + testl $15, M + je .L999 + testl $8, M + je .L13 + + movaps %xmm1, 0 * SIZE(X) + movaps %xmm1, 4 * SIZE(X) + addl $8 * SIZE, X + ALIGN_3 + +.L13: + testl $4, M + je .L14 + + movaps %xmm1, 0 * SIZE(X) + addl $4 * SIZE, X + ALIGN_3 + +.L14: + testl $2, M + je .L15 + + movsd %xmm1, 0 * SIZE(X) + addl $2 * SIZE, X + ALIGN_3 + +.L15: + testl $1, M + je .L999 + + movss %xmm1, 0 * SIZE(X) + jmp .L999 + ALIGN_4 + +/* incx != 1 */ +.L50: + movl M, I # rcx = n + sarl $3, I # (n >> 3) + jle .L52 + ALIGN_4 + +.L51: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm1, (X) + addl INCX, X + movss %xmm1, (X) + addl INCX, X + movss %xmm1, (X) + addl INCX, X + movss %xmm1, (X) + addl INCX, X + movss %xmm1, (X) + addl INCX, X + movss %xmm1, (X) + addl INCX, X + movss %xmm1, (X) + addl INCX, X + movss %xmm1, (X) + addl INCX, X + + decl I + jg .L51 + ALIGN_4 + +.L52: + testl $7, M + je .L999 + + testl $4, M + je .L53 + + movss %xmm1, (X) + addl INCX, X + movss %xmm1, (X) + addl INCX, X + movss %xmm1, (X) + addl INCX, X + movss %xmm1, (X) + addl INCX, X + ALIGN_3 + +.L53: + testl $2, M + je .L54 + + movss %xmm1, (X) + addl INCX, X + movss %xmm1, (X) + addl INCX, X + ALIGN_3 + +.L54: + testl $1, M + je .L999 + + movss %xmm1, (X) + jmp .L999 + ALIGN_4 + +/* Alpha != ZERO */ + +.L100: + cmpl $SIZE, INCX + jne .L150 + + subl $-32 * SIZE, X + + cmpl $3, M + jle .L116 + + testl $SIZE, X + je .L105 + + movss -32 * SIZE(X), %xmm1 + mulss %xmm0, %xmm1 + movss %xmm1, -32 * SIZE(X) + addl $SIZE, X + decl M + jle .L999 + ALIGN_3 + +.L105: + testl $2 * SIZE, X + je .L110 + + movsd -32 * SIZE(X), %xmm1 + mulps %xmm0, %xmm1 + movsd %xmm1, -32 * SIZE(X) + addl $2 * SIZE, X + subl $2, M + jle .L999 + ALIGN_3 + +.L110: + movl M, I + sarl $5, I + jle .L113 + +#if defined(BARCELONA) + + movaps %xmm0, %xmm1 + mulps -32 * SIZE(X), %xmm1 + movaps %xmm0, %xmm2 + mulps -28 * SIZE(X), %xmm2 + movaps %xmm0, %xmm3 + mulps -24 * SIZE(X), %xmm3 + movaps %xmm0, %xmm4 + mulps -20 * SIZE(X), %xmm4 + + decl I + jle .L112 + ALIGN_4 + +.L111: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps 
%xmm1, -32 * SIZE(X) + movaps %xmm0, %xmm1 + mulps -16 * SIZE(X), %xmm1 + + movaps %xmm2, -28 * SIZE(X) + movaps %xmm0, %xmm2 + mulps -12 * SIZE(X), %xmm2 + + movaps %xmm3, -24 * SIZE(X) + movaps %xmm0, %xmm3 + mulps -8 * SIZE(X), %xmm3 + + movaps %xmm4, -20 * SIZE(X) + movaps %xmm0, %xmm4 + mulps -4 * SIZE(X), %xmm4 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps %xmm1, -16 * SIZE(X) + movaps %xmm0, %xmm1 + mulps 0 * SIZE(X), %xmm1 + + movaps %xmm2, -12 * SIZE(X) + movaps %xmm0, %xmm2 + mulps 4 * SIZE(X), %xmm2 + + movaps %xmm3, -8 * SIZE(X) + movaps %xmm0, %xmm3 + mulps 8 * SIZE(X), %xmm3 + + movaps %xmm4, -4 * SIZE(X) + movaps %xmm0, %xmm4 + mulps 12 * SIZE(X), %xmm4 + + subl $-32 * SIZE, X + decl I + jg .L111 + ALIGN_4 + +.L112: + movaps %xmm1, -32 * SIZE(X) + movaps %xmm0, %xmm1 + mulps -16 * SIZE(X), %xmm1 + + movaps %xmm2, -28 * SIZE(X) + movaps %xmm0, %xmm2 + mulps -12 * SIZE(X), %xmm2 + + movaps %xmm3, -24 * SIZE(X) + movaps %xmm0, %xmm3 + mulps -8 * SIZE(X), %xmm3 + + movaps %xmm4, -20 * SIZE(X) + movaps %xmm0, %xmm4 + mulps -4 * SIZE(X), %xmm4 + + movaps %xmm1, -16 * SIZE(X) + movaps %xmm2, -12 * SIZE(X) + movaps %xmm3, -8 * SIZE(X) + movaps %xmm4, -4 * SIZE(X) + +#else + + movaps -32 * SIZE(X), %xmm1 + movaps -28 * SIZE(X), %xmm2 + movaps -24 * SIZE(X), %xmm3 + movaps -20 * SIZE(X), %xmm4 + + decl I + jle .L112 + ALIGN_4 + +.L111: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + mulps %xmm0, %xmm1 + movaps %xmm1, -32 * SIZE(X) + movaps -16 * SIZE(X), %xmm1 + + mulps %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(X) + movaps -12 * SIZE(X), %xmm2 + + mulps %xmm0, %xmm3 + movaps %xmm3, -24 * SIZE(X) + movaps -8 * SIZE(X), %xmm3 + + mulps %xmm0, %xmm4 + movaps %xmm4, -20 * SIZE(X) + movaps -4 * SIZE(X), %xmm4 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + mulps %xmm0, %xmm1 + movaps %xmm1, -16 * SIZE(X) + movaps 0 * SIZE(X), %xmm1 + + mulps %xmm0, %xmm2 + movaps %xmm2, -12 * SIZE(X) + movaps 4 * SIZE(X), %xmm2 + + mulps %xmm0, %xmm3 + movaps %xmm3, -8 * SIZE(X) + movaps 8 * SIZE(X), %xmm3 + + mulps %xmm0, %xmm4 + movaps %xmm4, -4 * SIZE(X) + movaps 12 * SIZE(X), %xmm4 + + subl $-32 * SIZE, X + decl I + jg .L111 + ALIGN_4 + +.L112: + mulps %xmm0, %xmm1 + movaps %xmm1, -32 * SIZE(X) + movaps -16 * SIZE(X), %xmm1 + + mulps %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(X) + movaps -12 * SIZE(X), %xmm2 + + mulps %xmm0, %xmm3 + movaps %xmm3, -24 * SIZE(X) + movaps -8 * SIZE(X), %xmm3 + + mulps %xmm0, %xmm4 + movaps %xmm4, -20 * SIZE(X) + movaps -4 * SIZE(X), %xmm4 + + mulps %xmm0, %xmm1 + movaps %xmm1, -16 * SIZE(X) + mulps %xmm0, %xmm2 + movaps %xmm2, -12 * SIZE(X) + mulps %xmm0, %xmm3 + movaps %xmm3, -8 * SIZE(X) + mulps %xmm0, %xmm4 + movaps %xmm4, -4 * SIZE(X) + +#endif + + subl $-32 * SIZE, X + ALIGN_3 + +.L113: + testl $31, M + je .L999 + + testl $16, M + je .L114 + + movaps -32 * SIZE(X), %xmm1 + movaps -28 * SIZE(X), %xmm3 + movaps -24 * SIZE(X), %xmm5 + movaps -20 * SIZE(X), %xmm7 + + mulps %xmm0, %xmm1 + movaps %xmm1, -32 * SIZE(X) + mulps %xmm0, %xmm3 + movaps %xmm3, -28 * SIZE(X) + mulps %xmm0, %xmm5 + movaps %xmm5, -24 * SIZE(X) + mulps %xmm0, %xmm7 + movaps %xmm7, -20 * SIZE(X) + + addl $16 * SIZE, X + ALIGN_3 + +.L114: + testl $8, M + je .L115 + + movaps -32 * SIZE(X), %xmm1 + movaps -28 * SIZE(X), %xmm3 + + mulps %xmm0, %xmm1 + movaps %xmm1, -32 * SIZE(X) + mulps %xmm0, %xmm3 + movaps %xmm3, -28 * SIZE(X) + addl $8 * SIZE, X + ALIGN_3 + +.L115: + testl $4, M + je .L116 + + movaps -32 * 
SIZE(X), %xmm1 + mulps %xmm0, %xmm1 + movaps %xmm1, -32 * SIZE(X) + addl $4 * SIZE, X + ALIGN_3 + +.L116: + testl $2, M + je .L117 + + movsd -32 * SIZE(X), %xmm1 + mulps %xmm0, %xmm1 + movsd %xmm1, -32 * SIZE(X) + addl $2 * SIZE, X + ALIGN_3 + +.L117: + testl $1, M + je .L999 + + movss -32 * SIZE(X), %xmm1 + mulss %xmm0, %xmm1 + movss %xmm1, -32 * SIZE(X) + jmp .L999 + ALIGN_3 + +/* incx != 1 */ + +.L150: + movl X, XX + movl M, I # rcx = n + sarl $3, I # (n >> 3) + jle .L152 + ALIGN_4 + +.L151: + movss (X), %xmm1 + addl INCX, X + movss (X), %xmm2 + addl INCX, X + movss (X), %xmm3 + addl INCX, X + movss (X), %xmm4 + addl INCX, X + + mulss %xmm0, %xmm1 + mulss %xmm0, %xmm2 + mulss %xmm0, %xmm3 + mulss %xmm0, %xmm4 + + movss %xmm1, (XX) + addl INCX, XX + movss %xmm2, (XX) + addl INCX, XX + movss %xmm3, (XX) + addl INCX, XX + movss %xmm4, (XX) + addl INCX, XX + + movss (X), %xmm1 + addl INCX, X + movss (X), %xmm2 + addl INCX, X + movss (X), %xmm3 + addl INCX, X + movss (X), %xmm4 + addl INCX, X + + mulss %xmm0, %xmm1 + mulss %xmm0, %xmm2 + mulss %xmm0, %xmm3 + mulss %xmm0, %xmm4 + + movss %xmm1, (XX) + addl INCX, XX + movss %xmm2, (XX) + addl INCX, XX + movss %xmm3, (XX) + addl INCX, XX + movss %xmm4, (XX) + addl INCX, XX + + decl I + jg .L151 + ALIGN_4 + +.L152: + testl $7, M + je .L999 + + testl $4, M + je .L153 + + movss (X), %xmm1 + addl INCX, X + movss (X), %xmm2 + addl INCX, X + movss (X), %xmm3 + addl INCX, X + movss (X), %xmm4 + addl INCX, X + + mulss %xmm0, %xmm1 + mulss %xmm0, %xmm2 + mulss %xmm0, %xmm3 + mulss %xmm0, %xmm4 + + movss %xmm1, (XX) + addl INCX, XX + movss %xmm2, (XX) + addl INCX, XX + movss %xmm3, (XX) + addl INCX, XX + movss %xmm4, (XX) + addl INCX, XX + ALIGN_3 + +.L153: + testl $2, M + je .L154 + + movss (X), %xmm1 + addl INCX, X + movss (X), %xmm2 + addl INCX, X + + mulss %xmm0, %xmm1 + mulss %xmm0, %xmm2 + + movss %xmm1, (XX) + addl INCX, XX + movss %xmm2, (XX) + addl INCX, XX + ALIGN_3 + +.L154: + testl $1, M + je .L999 + + movss (X), %xmm1 + mulss %xmm0, %xmm1 + movss %xmm1, (X) + ALIGN_4 + +.L999: + xorl %eax, %eax + + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86/scal_sse2.S b/kernel/x86/scal_sse2.S new file mode 100644 index 0000000000..dab543470e --- /dev/null +++ b/kernel/x86/scal_sse2.S @@ -0,0 +1,556 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_ALPHA 16 + STACK + ARGS(%esp) +#define STACK_X 24 + STACK + ARGS(%esp) +#define STACK_INCX 28 + STACK + ARGS(%esp) + +#define M %ebx +#define X %ecx +#define INCX %edx +#define I %esi +#define XX %edi + +#include "l1param.h" + + PROLOGUE + PROFCODE + + pushl %edi + pushl %esi + pushl %ebx + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + + movsd STACK_ALPHA, %xmm0 + + testl M, M + jle .L999 + + leal (, INCX, SIZE), INCX + + xorps %xmm1, %xmm1 + comisd %xmm0, %xmm1 + jne .L100 # Alpha != ZERO + +/* Alpha == ZERO */ + cmpl $SIZE, INCX + jne .L50 + +/* INCX == 1 */ + testl $15, X # aligned for quad word? + je .L05 + + movsd %xmm1, 0 * SIZE(X) + addl $SIZE, X + decl M + jle .L999 + ALIGN_3 +.L05: + +/* Aligned Mode */ + movl M, I # rcx = n + sarl $4, I + jle .L12 + ALIGN_4 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm1, 0 * SIZE(X) + movaps %xmm1, 2 * SIZE(X) + movaps %xmm1, 4 * SIZE(X) + movaps %xmm1, 6 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps %xmm1, 8 * SIZE(X) + movaps %xmm1, 10 * SIZE(X) + movaps %xmm1, 12 * SIZE(X) + movaps %xmm1, 14 * SIZE(X) + + addl $16 * SIZE, X + decl I + jg .L11 + ALIGN_4 + +.L12: + testl $15, M + je .L999 + testl $8, M + je .L13 + + movaps %xmm1, 0 * SIZE(X) + movaps %xmm1, 2 * SIZE(X) + movaps %xmm1, 4 * SIZE(X) + movaps %xmm1, 6 * SIZE(X) + addl $8 * SIZE, X + ALIGN_3 + +.L13: + testl $4, M + je .L14 + + movaps %xmm1, 0 * SIZE(X) + movaps %xmm1, 2 * SIZE(X) + addl $4 * SIZE, X + ALIGN_3 + +.L14: + testl $2, M + je .L15 + + movaps %xmm1, 0 * SIZE(X) + addl $2 * SIZE, X + ALIGN_3 + +.L15: + testl $1, M + je .L999 + + movsd %xmm1, 0 * SIZE(X) + jmp .L999 + ALIGN_4 + +.L50: + movl M, I + sarl $3, I + jle .L52 + ALIGN_4 + +.L51: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd %xmm1, (X) + addl INCX, X + movsd %xmm1, (X) + addl INCX, X + movsd %xmm1, (X) + addl INCX, X + movsd %xmm1, (X) + addl INCX, X + movsd %xmm1, (X) + addl INCX, X + movsd %xmm1, (X) + addl INCX, X + movsd %xmm1, (X) + addl INCX, X + movsd %xmm1, (X) + addl INCX, X + + decl I + jg .L51 + ALIGN_4 + +.L52: + testl $7, M + je .L999 + + testl $4, M + je .L53 + + movsd %xmm1, (X) + addl INCX, X + movsd %xmm1, (X) + addl INCX, X + movsd %xmm1, (X) + addl INCX, X + movsd %xmm1, (X) + addl INCX, X + ALIGN_3 + +.L53: + testl $2, M + je .L54 + + movsd %xmm1, (X) + addl INCX, X + movsd %xmm1, (X) + addl INCX, X + 
ALIGN_3 + +.L54: + testl $1, M + je .L999 + + movsd %xmm1, (X) + jmp .L999 + ALIGN_4 + +/* Alpha != ZERO */ + +.L100: + unpcklpd %xmm0, %xmm0 + + cmpl $SIZE, INCX + jne .L150 + + testl $SIZE, X + je .L105 + + movsd 0 * SIZE(X), %xmm1 + mulsd %xmm0, %xmm1 + movsd %xmm1, 0 * SIZE(X) + addl $SIZE, X + decl M + jle .L999 + ALIGN_3 +.L105: + subl $-16 * SIZE, X + + movl M, I # rcx = n + sarl $4, I + jle .L113 + +#if defined(BARCELONA) + + movaps %xmm0, %xmm1 + mulpd -16 * SIZE(X), %xmm1 + movaps %xmm0, %xmm2 + mulpd -14 * SIZE(X), %xmm2 + movaps %xmm0, %xmm3 + mulpd -12 * SIZE(X), %xmm3 + movaps %xmm0, %xmm4 + mulpd -10 * SIZE(X), %xmm4 + + decl I + jle .L112 + ALIGN_4 + +.L111: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm1, -16 * SIZE(X) + movaps %xmm0, %xmm1 + mulpd -8 * SIZE(X), %xmm1 + + movaps %xmm2, -14 * SIZE(X) + movaps %xmm0, %xmm2 + mulpd -6 * SIZE(X), %xmm2 + + movaps %xmm3, -12 * SIZE(X) + movaps %xmm0, %xmm3 + mulpd -4 * SIZE(X), %xmm3 + + movaps %xmm4, -10 * SIZE(X) + movaps %xmm0, %xmm4 + mulpd -2 * SIZE(X), %xmm4 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps %xmm1, -8 * SIZE(X) + movaps %xmm0, %xmm1 + mulpd 0 * SIZE(X), %xmm1 + + movaps %xmm2, -6 * SIZE(X) + movaps %xmm0, %xmm2 + mulpd 2 * SIZE(X), %xmm2 + + movaps %xmm3, -4 * SIZE(X) + movaps %xmm0, %xmm3 + mulpd 4 * SIZE(X), %xmm3 + + movaps %xmm4, -2 * SIZE(X) + movaps %xmm0, %xmm4 + mulpd 6 * SIZE(X), %xmm4 + + subl $-16 * SIZE, X + decl I + jg .L111 + ALIGN_4 + +.L112: + movaps %xmm1, -16 * SIZE(X) + movaps %xmm0, %xmm1 + mulpd -8 * SIZE(X), %xmm1 + + movaps %xmm2, -14 * SIZE(X) + movaps %xmm0, %xmm2 + mulpd -6 * SIZE(X), %xmm2 + + movaps %xmm3, -12 * SIZE(X) + movaps %xmm0, %xmm3 + mulpd -4 * SIZE(X), %xmm3 + + movaps %xmm4, -10 * SIZE(X) + movaps %xmm0, %xmm4 + mulpd -2 * SIZE(X), %xmm4 + + movaps %xmm1, -8 * SIZE(X) + movaps %xmm2, -6 * SIZE(X) + movaps %xmm3, -4 * SIZE(X) + movaps %xmm4, -2 * SIZE(X) + +#else + movaps -16 * SIZE(X), %xmm1 + movaps -14 * SIZE(X), %xmm2 + movaps -12 * SIZE(X), %xmm3 + movaps -10 * SIZE(X), %xmm4 + + decl I + jle .L112 + ALIGN_4 + +.L111: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + mulpd %xmm0, %xmm1 + movaps %xmm1, -16 * SIZE(X) + movaps -8 * SIZE(X), %xmm1 + + mulpd %xmm0, %xmm2 + movaps %xmm2, -14 * SIZE(X) + movaps -6 * SIZE(X), %xmm2 + + mulpd %xmm0, %xmm3 + movaps %xmm3, -12 * SIZE(X) + movaps -4 * SIZE(X), %xmm3 + + mulpd %xmm0, %xmm4 + movaps %xmm4, -10 * SIZE(X) + movaps -2 * SIZE(X), %xmm4 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + mulpd %xmm0, %xmm1 + movaps %xmm1, -8 * SIZE(X) + movaps 0 * SIZE(X), %xmm1 + + mulpd %xmm0, %xmm2 + movaps %xmm2, -6 * SIZE(X) + movaps 2 * SIZE(X), %xmm2 + + mulpd %xmm0, %xmm3 + movaps %xmm3, -4 * SIZE(X) + movaps 4 * SIZE(X), %xmm3 + + mulpd %xmm0, %xmm4 + movaps %xmm4, -2 * SIZE(X) + movaps 6 * SIZE(X), %xmm4 + + subl $-16 * SIZE, X + decl I + jg .L111 + ALIGN_4 + +.L112: + mulpd %xmm0, %xmm1 + movaps %xmm1, -16 * SIZE(X) + movaps -8 * SIZE(X), %xmm1 + + mulpd %xmm0, %xmm2 + movaps %xmm2, -14 * SIZE(X) + movaps -6 * SIZE(X), %xmm2 + + mulpd %xmm0, %xmm3 + movaps %xmm3, -12 * SIZE(X) + movaps -4 * SIZE(X), %xmm3 + + mulpd %xmm0, %xmm4 + movaps %xmm4, -10 * SIZE(X) + movaps -2 * SIZE(X), %xmm4 + + mulpd %xmm0, %xmm1 + movaps %xmm1, -8 * SIZE(X) + mulpd %xmm0, %xmm2 + movaps %xmm2, -6 * SIZE(X) + mulpd %xmm0, %xmm3 + movaps %xmm3, -4 * SIZE(X) + mulpd %xmm0, %xmm4 + movaps %xmm4, -2 * SIZE(X) +#endif + 
+ subl $-16 * SIZE, X + ALIGN_3 + +.L113: + testl $15, M + je .L999 + + testl $8, M + je .L114 + + movaps -16 * SIZE(X), %xmm1 + movaps -14 * SIZE(X), %xmm2 + movaps -12 * SIZE(X), %xmm3 + movaps -10 * SIZE(X), %xmm4 + + mulpd %xmm0, %xmm1 + movaps %xmm1, -16 * SIZE(X) + mulpd %xmm0, %xmm2 + movaps %xmm2, -14 * SIZE(X) + mulpd %xmm0, %xmm3 + movaps %xmm3, -12 * SIZE(X) + mulpd %xmm0, %xmm4 + movaps %xmm4, -10 * SIZE(X) + addl $8 * SIZE, X + ALIGN_3 + +.L114: + testl $4, M + je .L115 + + movaps -16 * SIZE(X), %xmm1 + movaps -14 * SIZE(X), %xmm2 + + mulpd %xmm0, %xmm1 + movaps %xmm1, -16 * SIZE(X) + mulpd %xmm0, %xmm2 + movaps %xmm2, -14 * SIZE(X) + addl $4 * SIZE, X + ALIGN_3 + +.L115: + testl $2, M + je .L116 + + movaps -16 * SIZE(X), %xmm1 + mulpd %xmm0, %xmm1 + movaps %xmm1, -16 * SIZE(X) + addl $2 * SIZE, X + ALIGN_3 + +.L116: + testl $1, M + je .L999 + + movsd -16 * SIZE(X), %xmm1 + mulsd %xmm0, %xmm1 + movsd %xmm1, -16 * SIZE(X) + jmp .L999 + ALIGN_3 + +/* incx != 1 */ + +.L150: + movl X, XX + movl M, I + sarl $2, I + jle .L152 + ALIGN_4 + +.L151: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + movsd (X), %xmm1 + addl INCX, X + movsd (X), %xmm2 + addl INCX, X + movsd (X), %xmm3 + addl INCX, X + movsd (X), %xmm4 + addl INCX, X + + mulsd %xmm0, %xmm1 + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + mulsd %xmm0, %xmm4 + + movsd %xmm1, (XX) + addl INCX, XX + movsd %xmm2, (XX) + addl INCX, XX + movsd %xmm3, (XX) + addl INCX, XX + movsd %xmm4, (XX) + addl INCX, XX + + decl I + jg .L151 + ALIGN_4 + +.L152: + testl $2, M + je .L154 + + movsd (X), %xmm1 + addl INCX, X + movsd (X), %xmm2 + addl INCX, X + + mulsd %xmm0, %xmm1 + mulsd %xmm0, %xmm2 + + movsd %xmm1, (XX) + addl INCX, XX + movsd %xmm2, (XX) + addl INCX, XX + ALIGN_3 + +.L154: + testl $1, M + je .L999 + + movsd (X), %xmm1 + mulsd %xmm0, %xmm1 + movsd %xmm1, (X) + ALIGN_4 + +.L999: + xorl %eax, %eax + + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86/staticbuffer.S b/kernel/x86/staticbuffer.S new file mode 100644 index 0000000000..b041c62fac --- /dev/null +++ b/kernel/x86/staticbuffer.S @@ -0,0 +1,49 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef ALLOC_STATIC + ALIGN_6 +#ifdef __CYGWIN__ + .comm _alloc_area, (NUM_BUFFERS * BUFFER_SIZE) +#else + .comm alloc_area, (NUM_BUFFERS * BUFFER_SIZE), 4096 +#endif +#endif diff --git a/kernel/x86/swap.S b/kernel/x86/swap.S new file mode 100644 index 0000000000..d32c1a3c84 --- /dev/null +++ b/kernel/x86/swap.S @@ -0,0 +1,210 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define N 4 + STACK + ARGS(%esp) +#ifdef XDOUBLE +#define X 32 + STACK + ARGS(%esp) +#define INCX 36 + STACK + ARGS(%esp) +#define Y 40 + STACK + ARGS(%esp) +#define INCY 44 + STACK + ARGS(%esp) +#elif defined(DOUBLE) +#define X 24 + STACK + ARGS(%esp) +#define INCX 28 + STACK + ARGS(%esp) +#define Y 32 + STACK + ARGS(%esp) +#define INCY 36 + STACK + ARGS(%esp) +#else +#define X 20 + STACK + ARGS(%esp) +#define INCX 24 + STACK + ARGS(%esp) +#define Y 28 + STACK + ARGS(%esp) +#define INCY 32 + STACK + ARGS(%esp) +#endif + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + movl N, %edx + movl X, %esi + movl Y, %edi + movl INCX, %ebx + movl INCY, %ecx + + sall $BASE_SHIFT, %ebx + sall $BASE_SHIFT, %ecx + + cmpl $SIZE, %ebx + jne .L14 + cmpl $SIZE, %ecx + jne .L14 + + movl %edx, %eax + sarl $2, %eax + jle .L15 + ALIGN_3 + +.L16: + FLD 3 * SIZE(%esi) + FLD 2 * SIZE(%esi) + FLD 1 * SIZE(%esi) + FLD 0 * SIZE(%esi) + FLD 3 * SIZE(%edi) + FLD 2 * SIZE(%edi) + FLD 1 * SIZE(%edi) + FLD 0 * SIZE(%edi) + + FST 0 * SIZE(%esi) + FST 1 * SIZE(%esi) + FST 2 * SIZE(%esi) + FST 3 * SIZE(%esi) + FST 0 * SIZE(%edi) + FST 1 * SIZE(%edi) + FST 2 * SIZE(%edi) + FST 3 * SIZE(%edi) + + addl $4 * SIZE, %esi + addl $4 * SIZE, %edi + decl %eax + jg .L16 + ALIGN_3 + +.L15: + movl %edx, %eax + andl $3, %eax + jle .L27 + ALIGN_3 + +.L22: + FLD (%esi) + FLD (%edi) + FST (%esi) + FST (%edi) + addl $SIZE, %esi + addl $SIZE, %edi + decl %eax + jg .L22 + jmp .L27 + ALIGN_3 + +/* INCX != 1 or INCY != 1 */ + +.L14: + movl %edx, %eax + sarl $2, %eax + jle .L28 + ALIGN_2 + +.L29: + FLD (%esi) + addl %ebx, %esi + FLD (%esi) + addl %ebx, %esi + FLD (%esi) + addl %ebx, %esi + FLD (%esi) + + FLD (%edi) + addl %ecx, %edi + FLD (%edi) + addl %ecx, %edi + FLD (%edi) + addl %ecx, %edi + FLD (%edi) + + FST (%esi) + subl %ebx, %esi + FST (%esi) + subl %ebx, %esi + FST (%esi) + subl %ebx, %esi + FST (%esi) + leal (%esi, %ebx, 4), %esi + + FST (%edi) + subl %ecx, %edi + FST (%edi) + subl %ecx, %edi + FST (%edi) + subl %ecx, %edi + FST (%edi) + leal (%edi, %ecx, 4), %edi + + decl %eax + jg .L29 + ALIGN_3 + +.L28: + movl %edx, %eax + andl $3, %eax + jle .L27 + ALIGN_3 + +.L35: + FLD (%esi) + FLD (%edi) + FST (%esi) + addl %ebx, %esi + FST (%edi) + addl %ecx, %edi + decl %eax + jg .L35 + ALIGN_3 + +.L27: + xorl %eax,%eax + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/swap_sse.S b/kernel/x86/swap_sse.S new file mode 100644 index 0000000000..39c0d2f0b9 --- /dev/null +++ b/kernel/x86/swap_sse.S @@ -0,0 +1,1139 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 20 + STACK + ARGS(%esp) +#define STACK_INCX 24 + STACK + ARGS(%esp) +#define STACK_Y 28 + STACK + ARGS(%esp) +#define STACK_INCY 32 + STACK + ARGS(%esp) + +#define M %edx +#define X %esi +#define Y %edi +#define INCX %ebx +#define INCY %ecx + +#include "l1param.h" + + PROLOGUE + PROFCODE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + movl STACK_M, M + movl STACK_X, X + movl STACK_Y, Y + movl STACK_INCX, INCX + movl STACK_INCY, INCY + + sall $BASE_SHIFT, %ebx + sall $BASE_SHIFT, %ecx + + cmpl $SIZE, INCX + jne .L50 + cmpl $SIZE, INCY + jne .L50 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + + cmpl $3, M + jle .L16 + + testl $SIZE, Y + je .L05 + + movss -32 * SIZE(X), %xmm0 + movss -32 * SIZE(Y), %xmm1 + + movss %xmm1, -32 * SIZE(X) + movss %xmm0, -32 * SIZE(Y) + + addl $1 * SIZE, X + addl $1 * SIZE, Y + decl M + ALIGN_3 + +.L05: + testl $2 * SIZE, Y + je .L10 + + movsd -32 * SIZE(X), %xmm0 + movsd -32 * SIZE(Y), %xmm1 + + movlps %xmm1, -32 * SIZE(X) + movlps %xmm0, -32 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + subl $2, M + jle .L19 + ALIGN_3 + +.L10: + cmpl $3, M + jle .L16 + + testl $2 * SIZE, X + jne .L30 + + testl $1 * SIZE, X + jne .L20 + + movl M, %eax + sarl $5, %eax + jle .L13 + ALIGN_3 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -32 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -32 * SIZE(X) + + movaps -28 * SIZE(X), %xmm0 + movaps -28 * SIZE(Y), %xmm1 + + movaps %xmm0, -28 * SIZE(Y) + movaps %xmm1, -28 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -24 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movaps %xmm0, -24 * SIZE(Y) + movaps %xmm1, -24 * SIZE(X) + + movaps -20 * SIZE(X), %xmm0 + movaps -20 * SIZE(Y), %xmm1 + + movaps %xmm0, -20 * SIZE(Y) + movaps %xmm1, -20 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + movaps -12 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + movaps %xmm0, -12 * SIZE(Y) + movaps %xmm1, -12 * 
SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -8 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + movaps %xmm0, -8 * SIZE(Y) + movaps %xmm1, -8 * SIZE(X) + + movaps -4 * SIZE(X), %xmm0 + movaps -4 * SIZE(Y), %xmm1 + + movaps %xmm0, -4 * SIZE(Y) + movaps %xmm1, -4 * SIZE(X) + + subl $-32 * SIZE, Y + subl $-32 * SIZE, X + + decl %eax + jg .L11 + ALIGN_3 + +.L13: + testl $16, M + jle .L14 + + movaps -32 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -32 * SIZE(X) + + movaps -28 * SIZE(X), %xmm0 + movaps -28 * SIZE(Y), %xmm1 + + movaps %xmm0, -28 * SIZE(Y) + movaps %xmm1, -28 * SIZE(X) + + movaps -24 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movaps %xmm0, -24 * SIZE(Y) + movaps %xmm1, -24 * SIZE(X) + + movaps -20 * SIZE(X), %xmm0 + movaps -20 * SIZE(Y), %xmm1 + + movaps %xmm0, -20 * SIZE(Y) + movaps %xmm1, -20 * SIZE(X) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L14: + testl $8, M + jle .L15 + + movaps -32 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -32 * SIZE(X) + + movaps -28 * SIZE(X), %xmm0 + movaps -28 * SIZE(Y), %xmm1 + + movaps %xmm0, -28 * SIZE(Y) + movaps %xmm1, -28 * SIZE(X) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L15: + testl $4, M + jle .L16 + + movaps -32 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -32 * SIZE(X) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L16: + testl $2, M + jle .L17 + + movsd -32 * SIZE(X), %xmm0 + movsd -32 * SIZE(Y), %xmm1 + + movlps %xmm1, -32 * SIZE(X) + addl $2 * SIZE, X + movlps %xmm0, -32 * SIZE(Y) + addl $2 * SIZE, Y + ALIGN_3 + +.L17: + testl $1, M + jle .L19 + + movss -32 * SIZE(X), %xmm0 + movss -32 * SIZE(Y), %xmm1 + + movss %xmm1, -32 * SIZE(X) + movss %xmm0, -32 * SIZE(Y) + ALIGN_3 + +.L19: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_3 + +.L20: + movaps -33 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + movss %xmm1, -32 * SIZE(X) + PSHUFD2($0x39, %xmm1, %xmm3) + movlps %xmm3, -31 * SIZE(X) + + subl $3, M + + movl M, %eax + sarl $5, %eax + jle .L23 + ALIGN_4 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -29 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -29 * SIZE(X) + + movaps -25 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -25 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -21 * SIZE(X), %xmm2 + movaps -20 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -24 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -21 * SIZE(X) + + movaps -17 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -20 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -17 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -13 * SIZE(X), %xmm2 + movaps -12 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -13 * SIZE(X) 
+ + movaps -9 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -9 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -5 * SIZE(X), %xmm2 + movaps -4 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -8 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -5 * SIZE(X) + + movaps -1 * SIZE(X), %xmm0 + movaps 0 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -4 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -1 * SIZE(X) + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + + decl %eax + jg .L21 + ALIGN_3 + +.L23: + testl $16, M + jle .L24 + + movaps -29 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -29 * SIZE(X) + + movaps -25 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -25 * SIZE(X) + + movaps -21 * SIZE(X), %xmm2 + movaps -20 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -24 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -21 * SIZE(X) + + movaps -17 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -20 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -17 * SIZE(X) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L24: + testl $8, M + jle .L25 + + movaps -29 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -29 * SIZE(X) + + movaps -25 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -25 * SIZE(X) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L25: + testl $4, M + jle .L26 + + movaps -29 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -29 * SIZE(X) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L26: + PSHUFD2($0x39, %xmm0, %xmm2) + PSHUFD1($0xff, %xmm0) + + movlps %xmm2, -32 * SIZE(Y) + movss %xmm0, -30 * SIZE(Y) + + testl $2, M + jle .L27 + + movsd -29 * SIZE(X), %xmm0 + movsd -29 * SIZE(Y), %xmm1 + + movlps %xmm0, -29 * SIZE(Y) + movlps %xmm1, -29 * SIZE(X) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L27: + testl $1, M + jle .L29 + + movss -29 * SIZE(X), %xmm0 + movss -29 * SIZE(Y), %xmm1 + + movss %xmm0, -29 * SIZE(Y) + movss %xmm1, -29 * SIZE(X) + ALIGN_3 + +.L29: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_3 + +.L30: + testl $1 * SIZE, X + jne .L40 + + movhps -32 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + movlps %xmm1, -32 * SIZE(X) + subl $2, M + + movl M, %eax + sarl $5, %eax + jle .L33 + ALIGN_4 + +.L31: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -30 * 
SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -30 * SIZE(X) + + movaps -26 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -26 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -22 * SIZE(X), %xmm2 + movaps -20 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -24 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -22 * SIZE(X) + + movaps -18 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -20 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -18 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -14 * SIZE(X), %xmm2 + movaps -12 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -14 * SIZE(X) + + movaps -10 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -10 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -6 * SIZE(X), %xmm2 + movaps -4 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -8 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -6 * SIZE(X) + + movaps -2 * SIZE(X), %xmm0 + movaps 0 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -4 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -2 * SIZE(X) + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + + decl %eax + jg .L31 + ALIGN_3 + +.L33: + testl $16, M + jle .L34 + + movaps -30 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -30 * SIZE(X) + + movaps -26 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -26 * SIZE(X) + + movaps -22 * SIZE(X), %xmm2 + movaps -20 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -24 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -22 * SIZE(X) + + movaps -18 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -20 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -18 * SIZE(X) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L34: + testl $8, M + jle .L35 + + movaps -30 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -30 * SIZE(X) + + movaps -26 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -26 * SIZE(X) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L35: + testl $4, M + jle .L36 + + movaps -30 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -30 * SIZE(X) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L36: + movhps %xmm0, -32 * SIZE(Y) + + testl $2, M + jle .L37 + + movsd -30 * SIZE(X), %xmm0 + movsd -30 * SIZE(Y), %xmm1 + + movlps %xmm0, -30 * SIZE(Y) + movlps %xmm1, -30 * SIZE(X) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L37: + testl $1, M + jle .L39 + + movss -30 * SIZE(X), %xmm0 + movss -30 * SIZE(Y), %xmm1 + + movss %xmm0, -30 * SIZE(Y) + movss %xmm1, -30 * 
SIZE(X) + ALIGN_3 + +.L39: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_3 + +.L40: + movaps -35 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + movss %xmm1, -32 * SIZE(X) + + subl $3, M + + movl M, %eax + sarl $5, %eax + jle .L43 + ALIGN_4 + +.L41: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -31 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -31 * SIZE(X) + + movaps -27 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -27 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -23 * SIZE(X), %xmm2 + movaps -20 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -24 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -23 * SIZE(X) + + movaps -19 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -20 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -19 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -15 * SIZE(X), %xmm2 + movaps -12 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -15 * SIZE(X) + + movaps -11 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -11 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -7 * SIZE(X), %xmm2 + movaps -4 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -8 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -7 * SIZE(X) + + movaps -3 * SIZE(X), %xmm0 + movaps 0 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -4 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -3 * SIZE(X) + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + + decl %eax + jg .L41 + ALIGN_3 + +.L43: + testl $16, M + jle .L44 + + movaps -31 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -31 * SIZE(X) + + movaps -27 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -27 * SIZE(X) + + movaps -23 * SIZE(X), %xmm2 + movaps -20 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -24 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -23 * SIZE(X) + + movaps -19 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -20 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -19 * SIZE(X) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L44: + testl $8, M + jle .L45 + + movaps -31 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, 
%xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -31 * SIZE(X) + + movaps -27 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -27 * SIZE(X) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L45: + testl $4, M + jle .L46 + + movaps -31 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -31 * SIZE(X) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L46: + movsd -31 * SIZE(X), %xmm2 + + PSHUFD2($0x39, %xmm1, %xmm1) + movlps %xmm1, -31 * SIZE(X) + + PSHUFD1($0xff, %xmm0) + + movss %xmm0, -32 * SIZE(Y) + movlps %xmm2, -31 * SIZE(Y) + + addl $3 * SIZE, X + addl $3 * SIZE, Y + + testl $2, M + jle .L47 + + movsd -32 * SIZE(X), %xmm0 + movsd -32 * SIZE(Y), %xmm1 + + movlps %xmm0, -32 * SIZE(Y) + movlps %xmm1, -32 * SIZE(X) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L47: + testl $1, M + jle .L49 + + movss -32 * SIZE(X), %xmm0 + movss -32 * SIZE(Y), %xmm1 + + movss %xmm0, -32 * SIZE(Y) + movss %xmm1, -32 * SIZE(X) + ALIGN_3 + +.L49: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_3 + +.L50: + movl M, %eax + sarl $3, %eax + jle .L55 + ALIGN_3 + +.L51: + movss (X), %xmm0 + movss (Y), %xmm1 + + movss %xmm1, (X) + addl INCX, X + movss %xmm0, (Y) + addl INCY, Y + + movss (X), %xmm0 + movss (Y), %xmm1 + + movss %xmm1, (X) + addl INCX, X + movss %xmm0, (Y) + addl INCY, Y + + movss (X), %xmm0 + movss (Y), %xmm1 + + movss %xmm1, (X) + addl INCX, X + movss %xmm0, (Y) + addl INCY, Y + + movss (X), %xmm0 + movss (Y), %xmm1 + + movss %xmm1, (X) + addl INCX, X + movss %xmm0, (Y) + addl INCY, Y + + movss (X), %xmm0 + movss (Y), %xmm1 + + movss %xmm1, (X) + addl INCX, X + movss %xmm0, (Y) + addl INCY, Y + + movss (X), %xmm0 + movss (Y), %xmm1 + + movss %xmm1, (X) + addl INCX, X + movss %xmm0, (Y) + addl INCY, Y + + movss (X), %xmm0 + movss (Y), %xmm1 + + movss %xmm1, (X) + addl INCX, X + movss %xmm0, (Y) + addl INCY, Y + + movss (X), %xmm0 + movss (Y), %xmm1 + + movss %xmm1, (X) + addl INCX, X + movss %xmm0, (Y) + addl INCY, Y + + decl %eax + jg .L51 + ALIGN_3 + +.L55: + movl M, %eax + andl $7, %eax + jle .L57 + ALIGN_3 + +.L56: + movss (X), %xmm0 + movss (Y), %xmm1 + + movss %xmm1, (X) + movss %xmm0, (Y) + + addl INCX, X + addl INCY, Y + decl %eax + jg .L56 + ALIGN_3 + +.L57: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/swap_sse2.S b/kernel/x86/swap_sse2.S new file mode 100644 index 0000000000..b8808125f1 --- /dev/null +++ b/kernel/x86/swap_sse2.S @@ -0,0 +1,572 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 24 + STACK + ARGS(%esp) +#define STACK_INCX 28 + STACK + ARGS(%esp) +#define STACK_Y 32 + STACK + ARGS(%esp) +#define STACK_INCY 36 + STACK + ARGS(%esp) + +#define M %edx +#define X %esi +#define Y %edi +#define INCX %ebx +#define INCY %ecx + +#include "l1param.h" + + PROLOGUE + PROFCODE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + movl STACK_M, M + movl STACK_X, X + movl STACK_Y, Y + movl STACK_INCX, INCX + movl STACK_INCY, INCY + + leal (, INCX, SIZE), INCX + leal (, INCY, SIZE), INCY + + cmpl $SIZE, INCX + jne .L40 + cmpl $SIZE, INCY + jne .L40 + + testl $SIZE, Y + je .L10 + + movsd 0 * SIZE(X), %xmm0 + movsd 0 * SIZE(Y), %xmm1 + + movsd %xmm1, 0 * SIZE(X) + movsd %xmm0, 0 * SIZE(Y) + + addl $1 * SIZE, X + addl $1 * SIZE, Y + decl M + jle .L19 + ALIGN_4 + +.L10: + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + + testl $SIZE, X + jne .L20 + + movl M, %eax + sarl $4, %eax + jle .L13 + ALIGN_3 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + movaps -14 * SIZE(X), %xmm0 + movaps -14 * SIZE(Y), %xmm1 + + movaps %xmm0, -14 * SIZE(Y) + movaps %xmm1, -14 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -12 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + movaps %xmm0, -12 * SIZE(Y) + movaps %xmm1, -12 * SIZE(X) + + movaps -10 * SIZE(X), %xmm0 + movaps -10 * SIZE(Y), %xmm1 + + movaps %xmm0, -10 * SIZE(Y) + movaps %xmm1, -10 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -8 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + movaps %xmm0, -8 * SIZE(Y) + movaps %xmm1, -8 * SIZE(X) + + movaps -6 * SIZE(X), %xmm0 + movaps -6 * SIZE(Y), %xmm1 + + movaps %xmm0, -6 * SIZE(Y) + movaps %xmm1, -6 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -4 * SIZE(X), 
%xmm0 + movaps -4 * SIZE(Y), %xmm1 + + movaps %xmm0, -4 * SIZE(Y) + movaps %xmm1, -4 * SIZE(X) + + movaps -2 * SIZE(X), %xmm0 + movaps -2 * SIZE(Y), %xmm1 + + movaps %xmm0, -2 * SIZE(Y) + movaps %xmm1, -2 * SIZE(X) + + subl $-16 * SIZE, Y + subl $-16 * SIZE, X + + decl %eax + jg .L11 + ALIGN_3 + +.L13: + testl $8, M + jle .L14 + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + movaps -14 * SIZE(X), %xmm0 + movaps -14 * SIZE(Y), %xmm1 + + movaps %xmm0, -14 * SIZE(Y) + movaps %xmm1, -14 * SIZE(X) + + movaps -12 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + movaps %xmm0, -12 * SIZE(Y) + movaps %xmm1, -12 * SIZE(X) + + movaps -10 * SIZE(X), %xmm0 + movaps -10 * SIZE(Y), %xmm1 + + movaps %xmm0, -10 * SIZE(Y) + movaps %xmm1, -10 * SIZE(X) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L14: + testl $4, M + jle .L15 + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + movaps -14 * SIZE(X), %xmm0 + movaps -14 * SIZE(Y), %xmm1 + + movaps %xmm0, -14 * SIZE(Y) + movaps %xmm1, -14 * SIZE(X) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L15: + testl $2, M + jle .L16 + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L16: + testl $1, M + jle .L19 + + movsd -16 * SIZE(X), %xmm0 + movsd -16 * SIZE(Y), %xmm1 + + movlps %xmm1, -16 * SIZE(X) + movlps %xmm0, -16 * SIZE(Y) + ALIGN_3 + +.L19: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_3 + +.L20: + movhps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movlps %xmm1, -16 * SIZE(X) + decl M + jle .L29 + + movl M, %eax + sarl $4, %eax + jle .L23 + ALIGN_4 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -15 * SIZE(X), %xmm2 + movaps -14 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -15 * SIZE(X) + + movaps -13 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -14 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -13 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -11 * SIZE(X), %xmm2 + movaps -10 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -12 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -11 * SIZE(X) + + movaps -9 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -10 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -9 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -7 * SIZE(X), %xmm2 + movaps -6 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -8 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -7 * SIZE(X) + + movaps -5 * SIZE(X), %xmm0 + movaps -4 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -6 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -5 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -3 * SIZE(X), %xmm2 + movaps -2 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -4 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -3 * SIZE(X) + + movaps -1 * SIZE(X), %xmm0 + movaps 0 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -2 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -1 * SIZE(X) + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + + decl 
%eax + jg .L21 + ALIGN_3 + +.L23: + testl $8, M + jle .L24 + + movaps -15 * SIZE(X), %xmm2 + movaps -14 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -15 * SIZE(X) + + movaps -13 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -14 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -13 * SIZE(X) + + movaps -11 * SIZE(X), %xmm2 + movaps -10 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -12 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -11 * SIZE(X) + + movaps -9 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -10 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -9 * SIZE(X) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L24: + testl $4, M + jle .L25 + + movaps -15 * SIZE(X), %xmm2 + movaps -14 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -15 * SIZE(X) + + movaps -13 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -14 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -13 * SIZE(X) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L25: + testl $2, M + jle .L26 + + movaps -15 * SIZE(X), %xmm2 + movaps -14 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -15 * SIZE(X) + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L26: + testl $1, M + jle .L29 + + movhps %xmm0, -16 * SIZE(Y) + movhps -15 * SIZE(X), %xmm0 + movhps %xmm1, -15 * SIZE(X) + + addl $SIZE, X + addl $SIZE, Y + ALIGN_3 + +.L29: + movhps %xmm0, -16 * SIZE(Y) + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_3 + +.L40: + movl M, %eax + sarl $3, %eax + jle .L45 + ALIGN_3 + +.L41: + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movsd %xmm1, (X) + addl INCX, X + movsd %xmm0, (Y) + addl INCY, Y + + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movsd %xmm1, (X) + addl INCX, X + movsd %xmm0, (Y) + addl INCY, Y + + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movsd %xmm1, (X) + addl INCX, X + movsd %xmm0, (Y) + addl INCY, Y + + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movsd %xmm1, (X) + addl INCX, X + movsd %xmm0, (Y) + addl INCY, Y + + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movsd %xmm1, (X) + addl INCX, X + movsd %xmm0, (Y) + addl INCY, Y + + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movsd %xmm1, (X) + addl INCX, X + movsd %xmm0, (Y) + addl INCY, Y + + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movsd %xmm1, (X) + addl INCX, X + movsd %xmm0, (Y) + addl INCY, Y + + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movsd %xmm1, (X) + addl INCX, X + movsd %xmm0, (Y) + addl INCY, Y + + decl %eax + jg .L41 + ALIGN_3 + +.L45: + movl M, %eax + andl $7, %eax + jle .L47 + ALIGN_3 + +.L46: + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movsd %xmm1, (X) + movsd %xmm0, (Y) + + addl INCX, X + addl INCY, Y + decl %eax + jg .L46 + ALIGN_3 + +.L47: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_LN_2x2.S b/kernel/x86/trsm_kernel_LN_2x2.S new file mode 100644 index 0000000000..d1c741b09f --- /dev/null +++ b/kernel/x86/trsm_kernel_LN_2x2.S @@ -0,0 +1,1127 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) +#define AORIG 12 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#ifdef DOUBLE +#define A 24 + STACK + ARGS(%esp) +#define B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) +#else +#define A 20 + STACK + ARGS(%esp) +#define B 24 + STACK + ARGS(%esp) +#define C 28 + STACK + ARGS(%esp) +#define LDC 32 + STACK + ARGS(%esp) +#define OFFSET 36 + STACK + ARGS(%esp) +#endif + +#define PREFETCH_OFFSET 48 + +#if defined(PENTIUM3) || defined(PENTIUMM) +#define REP rep +#else +#define REP rep +#endif + +#define AA %edx +#define BB %ecx + + PROLOGUE + + subl $ARGS, %esp # Generate Stack Frame + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl LDC, %ebp # ldc # MEMORY + movl B, %ebx + leal (, %ebp, SIZE), %ebp + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, %ebx + + movl N, %eax + imull %ebp, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax # j = (n >> 1) # MEMORY + sarl $1, %eax + movl %eax, J # j = (n >> 1) # MEMORY + je .L8 + ALIGN_4 + +.L34: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, %ebx +#endif + lea (, %ebp, 
2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, %edi +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %eax # m # MEMORY + andl $1, %eax + je .L12 + +#ifdef LN + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + leal (, %eax, SIZE), %eax + movl AORIG, AA + leal (AA, %eax, 1), AA + leal (%ebx, %eax, 2), BB +#else + movl %ebx, BB +#endif + + fldz + fldz + + FLD 0 * SIZE(AA) # temp1 = *(aoffset + 0) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $1,%eax # k >> 1 # MEMORY + je .L54 + ALIGN_4 + +.L55: + FLD 0 * SIZE(BB) # temp2 = *(boffset + 0) + rep + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 1 * SIZE(BB) # temp2 = *(boffset + 0) + faddp %st, %st(2) + FLD 1 * SIZE(AA) # temp1 = *(aoffset + 0) + + FLD 2 * SIZE(BB) # temp2 = *(boffset + 0) + rep + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 3 * SIZE(BB) # temp2 = *(boffset + 0) + faddp %st, %st(2) + FLD 2 * SIZE(AA) # temp1 = *(aoffset + 0) + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jne .L55 + ALIGN_4 + +.L54: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $1,%eax # k & 1 + je .L33 + ALIGN_4 + + FLD 0 * SIZE(BB) # temp2 = *(boffset + 0) + rep + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 1 * SIZE(BB) # temp2 = *(boffset + 0) + faddp %st, %st(2) + FLD 1 * SIZE(AA) # temp1 = *(aoffset + 0) + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + ALIGN_4 + +.L33: + ffreep %st(0) + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + leal (, %eax, SIZE), %eax + + movl AORIG, AA + leal (AA, %eax, 1), AA + leal (%ebx, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + FLD 0 * SIZE(BB) + fsubp %st, %st(1) + FLD 1 * SIZE(BB) + fsubp %st, %st(2) +#else + FLD 0 * SIZE(AA) + fsubp %st, %st(1) + FLD 1 * SIZE(AA) + fsubp %st, %st(2) +#endif + +#if defined(LN) || defined(LT) + FLD 0 * SIZE(AA) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef RN + FLD 0 * SIZE(BB) + fmulp %st, %st(1) + + FLD 1 * SIZE(BB) + fmul %st(1), %st + + fsubrp %st, %st(2) + + FLD 3 * SIZE(BB) + fmulp %st, %st(2) +#endif + +#ifdef RT + FLD 3 * SIZE(BB) + fmulp %st, %st(2) + + FLD 2 * SIZE(BB) + fmul %st(2), %st + + fsubrp %st, %st(1) + + FLD 0 * SIZE(BB) + fmulp %st, %st(1) +#endif + +#ifdef LN + subl $1 * SIZE, %edi +#endif + +#if defined(LN) || defined(LT) + FSTU 0 * SIZE(BB) + fxch %st(1) + FSTU 1 * SIZE(BB) +#else + FSTU 0 * SIZE(AA) + fxch %st(1) + FSTU 1 * SIZE(AA) +#endif + + FST 0 * SIZE(%edi,%ebp) + FST 0 * SIZE(%edi) + +#ifndef LN + addl $1 * SIZE, %edi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $0 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L12: + movl M, %esi + sarl $1, %esi + je .L27 + ALIGN_4 + +.MainHead: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + leal (, %eax, SIZE), %eax + movl AORIG, AA + leal (AA, %eax, 2), AA + leal (%ebx, %eax, 2), BB +#else + movl %ebx, BB +#endif + + fldz + fldz + fldz + fldz + + FLD 4 * SIZE(BB) # b5 + FLD 4 * SIZE(AA) # a5 + FLD 0 * 
SIZE(BB) # b1 + FLD 0 * SIZE(AA) # a1 + +#ifdef LN +#if defined(HAVE_3DNOW) + prefetchw -2 * SIZE(%edi) + prefetchw -2 * SIZE(%edi, %ebp, 1) +#elif defined(HAVE_SSE) + prefetchnta -2 * SIZE(%edi) + prefetchnta -2 * SIZE(%edi, %ebp, 1) +#endif +#else +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(%edi) + prefetchw 2 * SIZE(%edi, %ebp, 1) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(%edi) + prefetchnta 2 * SIZE(%edi, %ebp, 1) +#endif +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L16 + ALIGN_4 + +.MainLoop: +#if defined(HAVE_3DNOW) + prefetch (PREFETCH_OFFSET) * SIZE(BB) + nop +#elif defined(HAVE_SSE) + prefetchnta (PREFETCH_OFFSET) * SIZE(BB) +#if (L2_SIZE == 524288) + prefetcht0 (PREFETCH_OFFSET) * SIZE(AA) +#endif +#endif + + fmul %st, %st(1) + FMUL 1 * SIZE(BB) + fxch %st(1) + faddp %st, %st(4) + FLD 0 * SIZE(BB) + fxch %st(1) + faddp %st, %st(5) + FLD 1 * SIZE(AA) + fmul %st, %st(1) + FMUL 1 * SIZE(BB) + fxch %st(1) + faddp %st, %st(6) + FLD 2 * SIZE(BB) + fxch %st(1) + faddp %st, %st(7) + FLD 2 * SIZE(AA) + + fmul %st, %st(1) + FMUL 3 * SIZE(BB) + fxch %st(1) + faddp %st, %st(4) + FLD 2 * SIZE(BB) + fxch %st(1) + faddp %st, %st(5) + FLD 3 * SIZE(AA) + fmul %st, %st(1) + FMUL 3 * SIZE(BB) + fxch %st(1) + faddp %st, %st(6) + FLD 8 * SIZE(BB) + fxch %st(1) + faddp %st, %st(7) + FLD 8 * SIZE(AA) + fxch %st(2) + +#if !defined(HAVE_3DNOW) && defined(HAVE_SSE) && defined(DOUBLE) + prefetchnta (PREFETCH_OFFSET + 4) * SIZE(BB) +#if (L2_SIZE == 524288) + prefetcht0 (PREFETCH_OFFSET + 4) * SIZE(AA) +#endif +#endif + + fmul %st, %st(3) + FMUL 5 * SIZE(BB) + fxch %st(3) + faddp %st, %st(4) + FLD 4 * SIZE(BB) + fxch %st(3) + faddp %st, %st(5) + FLD 5 * SIZE(AA) + fmul %st, %st(3) + FMUL 5 * SIZE(BB) + fxch %st(3) + faddp %st, %st(6) + FLD 6 * SIZE(BB) + fxch %st(3) + faddp %st, %st(7) + FLD 6 * SIZE(AA) + + fmul %st, %st(3) + FMUL 7 * SIZE(BB) + fxch %st(3) + faddp %st, %st(4) + FLD 6 * SIZE(BB) + fxch %st(3) + faddp %st, %st(5) + FLD 7 * SIZE(AA) + fmul %st, %st(3) + FMUL 7 * SIZE(BB) + fxch %st(3) + faddp %st, %st(6) + FLD 12 * SIZE(BB) + fxch %st(3) + faddp %st, %st(7) + FLD 12 * SIZE(AA) + fxch %st(2) + + subl $-8 * SIZE, BB + subl $-8 * SIZE, AA + decl %eax # l -- + jne .MainLoop + ALIGN_4 + +.L16: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + and $3, %eax + je .L21 + ALIGN_4 + +.SubLoop: + fmul %st, %st(1) + FMUL 1 * SIZE(BB) + fxch %st(1) + faddp %st, %st(4) + FLD 0 * SIZE(BB) + fxch %st(1) + faddp %st, %st(5) + FLD 1 * SIZE(AA) + fmul %st, %st(1) + FMUL 1 * SIZE(BB) + fxch %st(1) + faddp %st, %st(6) + FLD 2 * SIZE(BB) + fxch %st(1) + faddp %st, %st(7) + FLD 2 * SIZE(AA) + + addl $2 * SIZE,BB + addl $2 * SIZE,AA + decl %eax + jne .SubLoop + ALIGN_4 + +.L21: + ffreep %st(0) + ffreep %st(0) + ffreep %st(0) + ffreep %st(0) + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + leal (, %eax, SIZE), %eax + + movl AORIG, AA + leal (AA, %eax, 2), AA + leal (%ebx, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + FLD 0 * SIZE(BB) + fsubp %st, %st(1) + FLD 1 * SIZE(BB) + fsubp %st, %st(2) + FLD 2 * SIZE(BB) + fsubp %st, %st(3) + FLD 3 * SIZE(BB) + fsubp %st, %st(4) +#else + FLD 0 * SIZE(AA) + fsubp %st, %st(1) + FLD 1 * SIZE(AA) + fsubp %st, %st(3) + FLD 2 * SIZE(AA) + fsubp %st, %st(2) + FLD 3 * SIZE(AA) + fsubp %st, %st(4) +#endif + +#ifdef LN + FLD 3 * SIZE(AA) + fmul %st, %st(3) + fmulp %st, %st(4) + + FLD 
2 * SIZE(AA) + fmul %st(3), %st + FLD 2 * SIZE(AA) + fmul %st(5), %st + + fsubrp %st, %st(3) + fsubrp %st, %st(1) + + FLD 0 * SIZE(AA) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef LT + FLD 0 * SIZE(AA) + fmul %st, %st(1) + fmulp %st, %st(2) + + FLD 1 * SIZE(AA) + fmul %st(1), %st + FLD 1 * SIZE(AA) + fmul %st(3), %st + + fsubrp %st, %st(5) + fsubrp %st, %st(3) + + FLD 3 * SIZE(AA) + fmul %st, %st(3) + fmulp %st, %st(4) +#endif + +#ifdef RN + FLD 0 * SIZE(BB) + fmul %st, %st(1) + fmulp %st, %st(3) + + FLD 1 * SIZE(BB) + fmul %st(1), %st + FLD 1 * SIZE(BB) + fmul %st(4), %st + + fsubrp %st, %st(5) + fsubrp %st, %st(2) + + FLD 3 * SIZE(BB) + fmul %st, %st(2) + fmulp %st, %st(4) +#endif + +#ifdef RT + FLD 3 * SIZE(BB) + fmul %st, %st(2) + fmulp %st, %st(4) + + FLD 2 * SIZE(BB) + fmul %st(2), %st + FLD 2 * SIZE(BB) + fmul %st(5), %st + + fsubrp %st, %st(4) + fsubrp %st, %st(1) + + FLD 0 * SIZE(BB) + fmul %st, %st(1) + fmulp %st, %st(3) +#endif + +#ifdef LN + subl $2 * SIZE, %edi +#endif + +#if defined(LN) || defined(LT) + FSTU 0 * SIZE(BB) + fxch %st(1) + FSTU 1 * SIZE(BB) + fxch %st(2) + FSTU 2 * SIZE(BB) + fxch %st(3) + FSTU 3 * SIZE(BB) + + FST 1 * SIZE(%edi,%ebp) + FST 0 * SIZE(%edi) + FST 0 * SIZE(%edi,%ebp) + FST 1 * SIZE(%edi) +#else + FSTU 0 * SIZE(AA) + fxch %st(2) + FSTU 1 * SIZE(AA) + fxch %st(1) + FSTU 2 * SIZE(AA) + fxch %st(3) + FSTU 3 * SIZE(AA) + + FST 1 * SIZE(%edi,%ebp) + FST 1 * SIZE(%edi) + FST 0 * SIZE(%edi) + FST 0 * SIZE(%edi,%ebp) +#endif + +#ifndef LN + addl $2 * SIZE, %edi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %esi # i -- + jne .MainHead + ALIGN_4 + +.L27: +#ifdef LN + movl K, %eax + leal ( , %eax, SIZE), %eax + leal (%ebx, %eax, 2), %ebx +#endif +#if defined(LT) || defined(RN) + movl BB, %ebx +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J # j-- # MEMORY + jne .L34 + ALIGN_4 + +.L8: + movl N, %eax # n # MEMORY + andl $1, %eax + je .End + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, %ebx +#endif + +#ifdef RT + subl %ebp, C +#endif + movl C, %edi # c # MEMORY +#ifndef RT + addl %ebp, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %eax # m # MEMORY + andl $1, %eax # m & 1 + je .L36 + +#ifdef LN + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + leal (, %eax, SIZE), %eax + movl AORIG, AA + leal (AA, %eax, 1), AA + leal (%ebx, %eax, 1), BB +#else + movl %ebx, BB +#endif + + fldz + +#ifdef LN +#if defined(HAVE_3DNOW) + prefetchw -2 * SIZE(%edi) +#elif defined(HAVE_SSE) + prefetchnta -2 * SIZE(%edi) +#endif +#else +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(%edi) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(%edi) +#endif +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + test %eax, %eax + jle .L52 + ALIGN_3 + +.L51: + FLD (AA) + FMUL (BB) + addl $1 * SIZE,AA + addl $1 * SIZE,BB + faddp %st,%st(1) + decl %eax + jne .L51 + ALIGN_4 + +.L52: + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN 
+ subl $1, %eax +#else + subl $1, %eax +#endif + + leal (, %eax, SIZE), %eax + + movl AORIG, AA + leal (AA, %eax, 1), AA + leal (%ebx, %eax, 1), BB +#endif + +#if defined(LN) || defined(LT) + FLD 0 * SIZE(BB) + fsubp %st, %st(1) +#else + FLD 0 * SIZE(AA) + fsubp %st, %st(1) +#endif + +#if defined(LN) || defined(LT) + FMUL 0 * SIZE(AA) +#else + FMUL 0 * SIZE(BB) +#endif + +#ifdef LN + subl $1 * SIZE, %edi +#endif + +#if defined(LN) || defined(LT) + FSTU 0 * SIZE(BB) +#else + FSTU 0 * SIZE(AA) +#endif + + FST 0 * SIZE(%edi) + +#ifndef LN + addl $1 * SIZE, %edi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 1), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $0 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L36: + movl M, %esi # m # MEMORY + sarl $1, %esi # m >> 1 + je .L99 + ALIGN_4 + +.L46: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + leal (, %eax, SIZE), %eax + movl AORIG, AA + leal (AA, %eax, 2), AA + leal (%ebx, %eax, 1), BB +#else + movl %ebx, BB +#endif + + fldz + fldz + FLD 0 * SIZE(BB) # temp1 = *(boffset + 0) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $1, %eax + je .L56 + ALIGN_4 + +.L57: + FLD 0 * SIZE(AA) # temp2 = *(aoffset + 0) + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 1 * SIZE(AA) # temp2 = *(aoffset + 0) + faddp %st, %st(2) + FLD 1 * SIZE(BB) # temp1 = *(boffset + 0) + + FLD 2 * SIZE(AA) # temp2 = *(aoffset + 0) + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 3 * SIZE(AA) # temp2 = *(aoffset + 0) + faddp %st, %st(2) + FLD 2 * SIZE(BB) # temp1 = *(boffset + 0) + + addl $4 * SIZE,AA + addl $2 * SIZE,BB + dec %eax + jne .L57 + ALIGN_4 + +.L56: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $1, %eax + je .L45 + ALIGN_4 + + FLD 0 * SIZE(AA) # temp2 = *(aoffset + 0) + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 1 * SIZE(AA) # temp2 = *(aoffset + 0) + faddp %st, %st(2) + FLD 3 * SIZE(BB) # temp1 = *(boffset + 0) + + addl $2 * SIZE,AA + addl $1 * SIZE,BB + ALIGN_4 + +.L45: + ffreep %st(0) + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + leal (, %eax, SIZE), %eax + + movl AORIG, AA + leal (AA, %eax, 2), AA + leal (%ebx, %eax, 1), BB +#endif + +#if defined(LN) || defined(LT) + FLD 0 * SIZE(BB) + fsubp %st, %st(1) + FLD 1 * SIZE(BB) + fsubp %st, %st(2) +#else + FLD 0 * SIZE(AA) + fsubp %st, %st(1) + FLD 1 * SIZE(AA) + fsubp %st, %st(2) +#endif + +#ifdef LN + FLD 3 * SIZE(AA) + fmulp %st, %st(2) + + FLD 2 * SIZE(AA) + fmul %st(2), %st + + fsubrp %st, %st(1) + FLD 0 * SIZE(AA) + fmulp %st, %st(1) +#endif + +#ifdef LT + FLD 0 * SIZE(AA) + fmulp %st, %st(1) + + FLD 1 * SIZE(AA) + fmul %st(1), %st + + fsubrp %st, %st(2) + + FLD 3 * SIZE(AA) + fmulp %st, %st(2) +#endif + +#ifdef RN + FLD 0 * SIZE(BB) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef RT + FLD 0 * SIZE(BB) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef LN + subl $2 * SIZE, %edi +#endif + +#if defined(LN) || defined(LT) + FSTU 0 * SIZE(BB) + fxch %st(1) + FSTU 1 * SIZE(BB) +#else + FSTU 0 * SIZE(AA) + fxch %st(1) + FSTU 1 * SIZE(AA) +#endif + + FST 1 * SIZE(%edi) + FST 0 * SIZE(%edi) + +#ifndef LN + addl $2 * SIZE, %edi +#endif + +#if defined(LT) || defined(RN) + movl K, 
%eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 1), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %esi # i -- + jne .L46 + ALIGN_4 + +.L99: +#ifdef LN + movl K, %eax + leal (%ebx, %eax, SIZE), %ebx +#endif +#if defined(LT) || defined(RN) + movl BB, %ebx +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + + +.End: + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_LN_2x2_atom.S b/kernel/x86/trsm_kernel_LN_2x2_atom.S new file mode 100644 index 0000000000..846a848580 --- /dev/null +++ b/kernel/x86/trsm_kernel_LN_2x2_atom.S @@ -0,0 +1,1145 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#define A 24 + STACK + ARGS(%esp) +#define ARG_B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define ARG_LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) +#define AORIG 12 + STACK(%esp) + +#define PREFETCH prefetcht0 +#define PREFETCHSIZE 84 + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi +#define CO1 %esi + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + + movl OFFSET, %eax +#ifdef RN + negl %eax +#endif + movl %eax, KK + + leal (, LDC, SIZE), LDC + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + sarl $1, %eax + movl %eax, J + jle .L30 + ALIGN_2 + +.L10: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L20 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L25 + ALIGN_4 + +.L22: + addsd %xmm2, %xmm4 + movsd 0 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm5 + movsd 1 * SIZE(BB), %xmm3 + + mulsd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulsd %xmm0, %xmm3 + movsd 1 * SIZE(AA), %xmm0 + + addsd %xmm2, %xmm4 + movsd 2 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm5 + movsd 3 * SIZE(BB), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + movsd 2 * SIZE(AA), %xmm0 + + addsd %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm5 + movsd 5 * SIZE(BB), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + movsd 3 * SIZE(AA), %xmm0 + + addsd %xmm2, %xmm4 + movsd 6 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm5 + movsd 7 * SIZE(BB), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + movsd 4 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax # if (k & 1) + BRANCH + je .L28 + ALIGN_3 + +.L26: + addsd %xmm2, %xmm4 + movsd 0 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm5 + movsd 1 * SIZE(BB), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + movsd 1 * 
SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: + addsd %xmm2, %xmm4 + addsd %xmm3, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BB), %xmm0 + movsd 1 * SIZE(BB), %xmm1 + + subsd %xmm4, %xmm0 + subsd %xmm5, %xmm1 +#else + movsd 0 * SIZE(AA), %xmm0 + movsd 1 * SIZE(AA), %xmm1 + + subsd %xmm4, %xmm0 + subsd %xmm5, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(AA), %xmm7 + + mulsd %xmm7, %xmm0 + mulsd %xmm7, %xmm1 +#endif + +#ifdef RN + movsd 0 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 + movsd 1 * SIZE(BB), %xmm5 + movaps %xmm5, %xmm6 + movsd 3 * SIZE(BB), %xmm7 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm1 + mulsd %xmm7, %xmm1 +#endif + +#ifdef RT + movsd 3 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 + movsd 2 * SIZE(BB), %xmm5 + movaps %xmm5, %xmm6 + movsd 0 * SIZE(BB), %xmm7 + mulsd %xmm1, %xmm5 + subsd %xmm5, %xmm0 + mulsd %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BB) + movsd %xmm1, 1 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) + movsd %xmm1, 1 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC) + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L20: + movl M, %ebx + sarl $1, %ebx + jle .L29 + ALIGN_4 + +.L11: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + + xorps %xmm4, %xmm4 + prefetcht0 3 * SIZE(CO1) + xorps %xmm5, %xmm5 + prefetcht0 3 * SIZE(CO1, LDC) + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addsd %xmm2, %xmm6 + movsd 1 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 0 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm7 + mulsd 1 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 0 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + mulsd 1 * SIZE(BB), %xmm3 + + addsd %xmm2, %xmm6 + movsd 3 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm7 + mulsd 3 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 4 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 2 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + mulsd 3 * SIZE(BB), %xmm3 + + addsd %xmm2, %xmm6 + movsd 5 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 4 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm7 + mulsd 5 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 6 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 4 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + mulsd 5 * SIZE(BB), %xmm3 + + addsd %xmm2, %xmm6 + movsd 7 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 6 * 
SIZE(BB), %xmm0 + addsd %xmm3, %xmm7 + mulsd 7 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 8 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 6 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + mulsd 7 * SIZE(BB), %xmm3 + + addl $8 * SIZE, BB + addl $8 * SIZE, AA + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + addsd %xmm2, %xmm6 + movsd 1 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 0 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm7 + mulsd 1 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 0 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + mulsd 1 * SIZE(BB), %xmm3 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + addsd %xmm2, %xmm6 + addsd %xmm3, %xmm7 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BB), %xmm0 + movsd 1 * SIZE(BB), %xmm1 + movsd 2 * SIZE(BB), %xmm2 + movsd 3 * SIZE(BB), %xmm3 + + subsd %xmm4, %xmm0 + subsd %xmm5, %xmm1 + subsd %xmm6, %xmm2 + subsd %xmm7, %xmm3 +#else + movsd 0 * SIZE(AA), %xmm0 + movsd 1 * SIZE(AA), %xmm2 + movsd 2 * SIZE(AA), %xmm1 + movsd 3 * SIZE(AA), %xmm3 + + subsd %xmm4, %xmm0 + subsd %xmm6, %xmm2 + subsd %xmm5, %xmm1 + subsd %xmm7, %xmm3 +#endif + +#ifdef LN + movsd 3 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + movsd 2 * SIZE(AA), %xmm5 + mulsd %xmm4, %xmm3 + movsd 0 * SIZE(AA), %xmm7 + + movaps %xmm5, %xmm6 + mulsd %xmm2, %xmm5 + mulsd %xmm3, %xmm6 + subsd %xmm5, %xmm0 + subsd %xmm6, %xmm1 + mulsd %xmm7, %xmm0 + mulsd %xmm7, %xmm1 +#endif + +#ifdef LT + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + movsd 1 * SIZE(AA), %xmm5 + mulsd %xmm4, %xmm1 + movsd 3 * SIZE(AA), %xmm7 + + movaps %xmm5, %xmm6 + mulsd %xmm0, %xmm5 + mulsd %xmm1, %xmm6 + subsd %xmm5, %xmm2 + subsd %xmm6, %xmm3 + mulsd %xmm7, %xmm2 + mulsd %xmm7, %xmm3 +#endif + +#ifdef RN + movsd 0 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 + movsd 1 * SIZE(BB), %xmm5 + mulsd %xmm4, %xmm2 + movsd 3 * SIZE(BB), %xmm7 + + movaps %xmm5, %xmm6 + mulsd %xmm0, %xmm5 + mulsd %xmm2, %xmm6 + subsd %xmm5, %xmm1 + subsd %xmm6, %xmm3 + mulsd %xmm7, %xmm1 + mulsd %xmm7, %xmm3 +#endif + +#ifdef RT + movsd 3 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 + movsd 2 * SIZE(BB), %xmm5 + mulsd %xmm4, %xmm3 + movsd 0 * SIZE(BB), %xmm7 + + movaps %xmm5, %xmm6 + mulsd %xmm1, %xmm5 + mulsd %xmm3, %xmm6 + subsd %xmm5, %xmm0 + subsd %xmm6, %xmm2 + mulsd %xmm7, %xmm0 + mulsd %xmm7, %xmm2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BB) + movsd %xmm1, 1 * SIZE(BB) + movsd %xmm2, 2 * SIZE(BB) + movsd %xmm3, 3 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) + movsd %xmm2, 1 * SIZE(AA) + movsd %xmm1, 2 * SIZE(AA) + movsd %xmm3, 3 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm2, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC) + movsd %xmm3, 1 * SIZE(CO1, LDC) + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg 
.L11 + ALIGN_4 + +.L29: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J # j -- + jg .L10 + ALIGN_4 + +.L30: + testl $1, N + je .L999 + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, B +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L40 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm2, %xmm2 + movsd 0 * SIZE(BB), %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L45 + ALIGN_4 + +.L42: + mulsd %xmm0, %xmm2 + movsd 1 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + movsd 1 * SIZE(BB), %xmm2 + + mulsd %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm5 + movsd 2 * SIZE(BB), %xmm2 + + mulsd %xmm0, %xmm2 + movsd 3 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + movsd 3 * SIZE(BB), %xmm2 + + mulsd %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm5 + movsd 4 * SIZE(BB), %xmm2 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jne .L42 + ALIGN_4 + +.L45: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + mulsd %xmm0, %xmm2 + movsd 1 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + movsd 1 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L46 + ALIGN_4 + +.L48: + addsd %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (B, %eax, 1), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BB), %xmm0 + subsd %xmm4, %xmm0 +#else + movsd 0 * SIZE(AA), %xmm0 + subsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + mulsd 0 * SIZE(AA), %xmm0 +#endif + +#if defined(RN) || defined(RT) + mulsd 0 * SIZE(BB), %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + addl %eax, AA + addl %eax, BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L40: + movl M, %ebx + sarl $1, %ebx + jle .L49 + ALIGN_4 + +.L31: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, 
%eax + addl %eax, BB +#endif + + movsd 0 * SIZE(BB), %xmm1 + xorps %xmm0, %xmm0 + prefetcht0 3 * SIZE(CO1) + xorps %xmm2, %xmm2 + xorps %xmm4, %xmm4 + xorps %xmm6, %xmm6 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addsd %xmm0, %xmm4 + movsd 0 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm6 + movsd 1 * SIZE(AA), %xmm2 + mulsd %xmm1, %xmm0 + mulsd %xmm1, %xmm2 + movsd 1 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm6 + movsd 3 * SIZE(AA), %xmm2 + mulsd %xmm1, %xmm0 + mulsd %xmm1, %xmm2 + movsd 2 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 4 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm6 + movsd 5 * SIZE(AA), %xmm2 + mulsd %xmm1, %xmm0 + mulsd %xmm1, %xmm2 + movsd 3 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 6 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm6 + movsd 7 * SIZE(AA), %xmm2 + mulsd %xmm1, %xmm0 + mulsd %xmm1, %xmm2 + movsd 4 * SIZE(BB), %xmm1 + + addl $8 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax # if (k & 1) + BRANCH + je .L38 + ALIGN_3 + +.L36: + addsd %xmm0, %xmm4 + movsd 0 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm6 + movsd 1 * SIZE(AA), %xmm2 + mulsd %xmm1, %xmm0 + mulsd %xmm1, %xmm2 + movsd 1 * SIZE(BB), %xmm1 + + addl $2 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L36 + ALIGN_4 + +.L38: + addsd %xmm0, %xmm4 + addsd %xmm2, %xmm6 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BB), %xmm0 + movsd 1 * SIZE(BB), %xmm2 + + subsd %xmm4, %xmm0 + subsd %xmm6, %xmm2 +#else + movsd 0 * SIZE(AA), %xmm0 + movsd 1 * SIZE(AA), %xmm2 + + subsd %xmm4, %xmm0 + subsd %xmm6, %xmm2 +#endif + +#ifdef LN + movsd 3 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + movsd 2 * SIZE(AA), %xmm5 + mulsd %xmm2, %xmm5 + movsd 0 * SIZE(AA), %xmm7 + subsd %xmm5, %xmm0 + mulsd %xmm7, %xmm0 +#endif + +#ifdef LT + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + movsd 1 * SIZE(AA), %xmm5 + mulsd %xmm0, %xmm5 + movsd 3 * SIZE(AA), %xmm7 + subsd %xmm5, %xmm2 + mulsd %xmm7, %xmm2 +#endif + +#if defined(RN) || defined(RT) + movsd 0 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 + mulsd %xmm4, %xmm2 +#endif + + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BB) + movsd %xmm2, 1 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) + movsd %xmm2, 1 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm2, 1 * SIZE(CO1) + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + addl %eax, BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L31 + ALIGN_4 + +.L49: +#ifdef LN + movl K, %eax + leal (B, %eax, SIZE), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_LN_2x4_penryn.S 
b/kernel/x86/trsm_kernel_LN_2x4_penryn.S new file mode 100644 index 0000000000..6645b790ec --- /dev/null +++ b/kernel/x86/trsm_kernel_LN_2x4_penryn.S @@ -0,0 +1,2076 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#define A 24 + STACK + ARGS(%esp) +#define ARG_B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define ARG_LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) +#define AORIG 12 + STACK(%esp) + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 21 + 4) +#endif + +#ifdef NEHALEM +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 21 + 4) +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 2) +#endif + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi +#define CO1 %esi + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + + movl OFFSET, %eax +#ifdef RN + negl %eax +#endif + movl %eax, KK + + leal (, LDC, SIZE), LDC + + subl $-16 * SIZE, A + subl $-16 * SIZE, B + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + sarl $2, %eax + movl %eax, J + jle .L30 + ALIGN_4 + +.L10: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 4), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + testl $1, %ebx + jle .L20 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd -16 * SIZE(AA), %xmm0 + movhps -15 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -16 * SIZE(BB), %xmm2 + pxor %xmm5, %xmm5 + movaps -14 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps -12 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps -10 * SIZE(BB), %xmm3 + + pshufd $0xee, %xmm0, %xmm1 + movaps -14 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm6 + movaps -8 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm7 + movaps -6 * SIZE(BB), %xmm3 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps -4 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps -2 * SIZE(BB), %xmm3 + + pshufd $0xee, %xmm0, %xmm1 + movaps -12 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm6 + movaps 0 * SIZE(BB), %xmm2 + addpd %xmm3, 
%xmm7 + movaps 2 * SIZE(BB), %xmm3 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps 6 * SIZE(BB), %xmm3 + + pshufd $0xee, %xmm0, %xmm1 + movaps -10 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm6 + movaps 8 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm7 + movaps 10 * SIZE(BB), %xmm3 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps 12 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps 14 * SIZE(BB), %xmm3 + + pshufd $0xee, %xmm0, %xmm1 + movaps -8 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm7 + movaps 18 * SIZE(BB), %xmm3 + + subl $ -8 * SIZE, AA + subl $-32 * SIZE, BB + + subl $1, %eax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + BRANCH + je .L28 + +.L26: + pshufd $0x44, %xmm0, %xmm1 + movsd -15 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps -12 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps -10 * SIZE(BB), %xmm3 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + + decl %eax + jg .L26 + ALIGN_4 + +.L28: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BB), %xmm0 + movapd -14 * SIZE(BB), %xmm1 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 +#else + movapd -16 * SIZE(AA), %xmm1 + movapd -14 * SIZE(AA), %xmm3 + + subpd %xmm4, %xmm1 + subpd %xmm5, %xmm3 + + movapd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm1 + movapd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm3 +#endif + +#ifdef LN + movddup -16 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 +#endif + +#ifdef LT + movddup -16 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 +#endif + +#ifdef RN + movsd -16 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 + movsd -15 * SIZE(BB), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + movsd -14 * SIZE(BB), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm2 + movsd -13 * SIZE(BB), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm3 + + movsd -11 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 + movsd -10 * SIZE(BB), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm2 + movsd -9 * SIZE(BB), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm3 + + movsd -6 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm2 + movsd -5 * SIZE(BB), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm3 + + movsd -1 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm3 +#endif + +#ifdef RT + movsd -1 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm3 + movsd -2 * SIZE(BB), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm2 + movsd -3 * SIZE(BB), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm1 + movsd -4 * SIZE(BB), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm0 + + movsd -6 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm2 + movsd -7 * SIZE(BB), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm1 + movsd -8 * SIZE(BB), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm0 + + movsd -11 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 + movsd -12 * SIZE(BB), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + movsd -16 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm1, -14 * SIZE(BB) +#else + 
movsd %xmm0, -16 * SIZE(AA) + movsd %xmm1, -15 * SIZE(AA) + movsd %xmm2, -14 * SIZE(AA) + movsd %xmm3, -13 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 0 * SIZE(CO1, LDC, 1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm1, 0 * SIZE(CO1, %eax, 1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) + movsd %xmm2, 0 * SIZE(CO1, LDC, 2) + movsd %xmm3, 0 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L20: + movl M, %ebx + sarl $1, %ebx + jle .L29 + ALIGN_4 + +.L11: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + leal (CO1, LDC, 2), %eax + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -16 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + +#ifdef LN + pxor %xmm4, %xmm4 + prefetcht0 -2 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 -2 * SIZE(CO1, LDC) + pxor %xmm6, %xmm6 + prefetcht0 -2 * SIZE(%eax) + pxor %xmm7, %xmm7 + prefetcht0 -2 * SIZE(%eax, LDC) +#else + pxor %xmm4, %xmm4 + prefetcht0 1 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 1 * SIZE(CO1, LDC) + pxor %xmm6, %xmm6 + prefetcht0 1 * SIZE(%eax) + pxor %xmm7, %xmm7 + prefetcht0 1 * SIZE(%eax, LDC) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addpd %xmm3, %xmm7 + movaps -14 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps -12 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps -10 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps -8 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps -6 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps -4 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps -2 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps 0 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) + + addpd %xmm3, %xmm7 + movaps 2 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps 4 * SIZE(BB), %xmm1 
+ addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -6 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps 6 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps 8 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps 10 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps 12 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -2 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps 14 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps 16 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + subl $-32 * SIZE, BB + mulpd %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + + subl $-16 * SIZE, AA + + subl $1, %eax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + addpd %xmm3, %xmm7 + movaps -14 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps -12 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + + movaps -14 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + + decl %eax + jg .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 4), BB +#endif + + addpd %xmm2, %xmm6 + addpd %xmm3, %xmm7 + + movaps %xmm4, %xmm0 + movsd %xmm5, %xmm4 + movsd %xmm0, %xmm5 + + movaps %xmm6, %xmm0 + movsd %xmm7, %xmm6 + movsd %xmm0, %xmm7 + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd %xmm6, %xmm1 + unpcklpd %xmm7, %xmm6 + unpckhpd %xmm7, %xmm1 + + movapd -16 * SIZE(BB), %xmm2 + movapd -14 * SIZE(BB), %xmm5 + movapd -12 * SIZE(BB), %xmm3 + movapd -10 * SIZE(BB), %xmm7 + + subpd %xmm4, %xmm2 + subpd %xmm6, %xmm5 + subpd %xmm0, %xmm3 + subpd %xmm1, %xmm7 +#else + movapd -16 * SIZE(AA), %xmm0 + movapd -14 * SIZE(AA), %xmm1 + movapd -12 * SIZE(AA), %xmm2 + movapd -10 * SIZE(AA), %xmm3 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 + subpd %xmm6, %xmm2 + subpd %xmm7, %xmm3 +#endif + +#ifdef LN + movddup -13 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 + mulpd %xmm4, %xmm7 + + movddup -14 * SIZE(AA), %xmm4 + movapd %xmm4, %xmm6 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm2 + mulpd %xmm7, %xmm6 + subpd %xmm6, %xmm5 + + movddup -16 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm5 + +#endif + +#ifdef LT + movddup -16 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm5 + + movddup -15 * SIZE(AA), %xmm4 + movapd %xmm4, %xmm6 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm3 + mulpd %xmm5, %xmm6 + subpd %xmm6, %xmm7 + + movddup -13 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 + mulpd %xmm4, %xmm7 +#endif + +#ifdef RN + movddup -16 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 + movddup -15 * SIZE(BB), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm1 + movddup -14 * SIZE(BB), %xmm4 + mulpd 
%xmm0, %xmm4 + subpd %xmm4, %xmm2 + movddup -13 * SIZE(BB), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm3 + + movddup -11 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm1 + movddup -10 * SIZE(BB), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm2 + movddup -9 * SIZE(BB), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm3 + + movddup -6 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm2 + movddup -5 * SIZE(BB), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm3 + + movddup -1 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm3 +#endif + +#ifdef RT + movddup -1 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm3 + movddup -2 * SIZE(BB), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm2 + movddup -3 * SIZE(BB), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm1 + movddup -4 * SIZE(BB), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm0 + + movddup -6 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm2 + movddup -7 * SIZE(BB), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm1 + movddup -8 * SIZE(BB), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm0 + + movddup -11 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm1 + movddup -12 * SIZE(BB), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm0 + + movddup -16 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, -16 * SIZE(BB) + movapd %xmm5, -14 * SIZE(BB) + movapd %xmm3, -12 * SIZE(BB) + movapd %xmm7, -10 * SIZE(BB) +#else + movapd %xmm0, -16 * SIZE(AA) + movapd %xmm1, -14 * SIZE(AA) + movapd %xmm2, -12 * SIZE(AA) + movapd %xmm3, -10 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movsd %xmm3, 1 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm3, 1 * SIZE(CO1, LDC, 1) + movsd %xmm5, 0 * SIZE(CO1, LDC, 2) + movsd %xmm7, 1 * SIZE(CO1, LDC, 2) + movhpd %xmm5, 0 * SIZE(CO1, %eax, 1) + movhpd %xmm7, 1 * SIZE(CO1, %eax, 1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm1, 1 * SIZE(CO1, LDC, 1) + movsd %xmm2, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm2, 1 * SIZE(CO1, LDC, 2) + movsd %xmm3, 0 * SIZE(CO1, %eax, 1) + movhpd %xmm3, 1 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L11 + ALIGN_4 + +.L29: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 4), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $4, KK +#endif + +#ifdef RT + subl $4, KK +#endif + + decl J # j -- + jg .L10 + ALIGN_4 + +.L30: + testl $2, N + je .L60 + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + testl $1, %ebx + jle .L50 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + 
sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd -16 * SIZE(AA), %xmm0 + movhps -15 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movsd -16 * SIZE(BB), %xmm2 + movhps -15 * SIZE(BB), %xmm2 + pxor %xmm5, %xmm5 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -14 * SIZE(BB), %xmm2 + + pshufd $0xee, %xmm0, %xmm1 + movaps -14 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm5 + movaps -12 * SIZE(BB), %xmm2 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -10 * SIZE(BB), %xmm2 + + pshufd $0xee, %xmm0, %xmm1 + movaps -12 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm5 + movaps -8 * SIZE(BB), %xmm2 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -6 * SIZE(BB), %xmm2 + + pshufd $0xee, %xmm0, %xmm1 + movaps -10 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm5 + movaps -4 * SIZE(BB), %xmm2 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -2 * SIZE(BB), %xmm2 + + pshufd $0xee, %xmm0, %xmm1 + movaps -8 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + + subl $ -8 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + BRANCH + je .L58 + +.L56: + pshufd $0x44, %xmm0, %xmm1 + movsd -15 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -14 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: + addpd %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (B, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BB), %xmm0 + + subpd %xmm4, %xmm0 +#else + movapd -16 * SIZE(AA), %xmm1 + + subpd %xmm4, %xmm1 + + movapd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm1 +#endif + +#ifdef LN + movddup -16 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#ifdef LT + movddup -16 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#ifdef RN + movsd -16 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 + + movsd -15 * SIZE(BB), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + + movsd -13 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 +#endif + +#ifdef RT + movsd -13 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 + + movsd -14 * SIZE(BB), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + movsd -16 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, -16 * SIZE(BB) +#else + movsd %xmm0, -16 * SIZE(AA) + movsd %xmm1, -15 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 0 * SIZE(CO1, LDC, 1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L50: + movl M, %ebx + sarl $1, %ebx 
+ jle .L59 + ALIGN_4 + +.L41: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 +#ifdef LN + prefetcht0 -2 * SIZE(CO1) + pxor %xmm6, %xmm6 + prefetcht0 -2 * SIZE(CO1, LDC) + pxor %xmm7, %xmm7 +#else + prefetcht0 1 * SIZE(CO1) + pxor %xmm6, %xmm6 + prefetcht0 1 * SIZE(CO1, LDC) + pxor %xmm7, %xmm7 +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L45 + ALIGN_4 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -14 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm7 + movaps -12 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -10 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm7 + movaps -8 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -6 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -6 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm7 + movaps -4 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -2 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -2 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm7 + movaps 0 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + + subl $-16 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L42 + ALIGN_4 + +.L45: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -14 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L46 + ALIGN_4 + +.L48: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + movaps %xmm4, %xmm0 + movsd %xmm5, %xmm4 + movsd %xmm0, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd -16 * SIZE(BB), %xmm2 + movapd -14 * SIZE(BB), %xmm3 + + subpd %xmm4, %xmm2 + subpd %xmm0, %xmm3 +#else + movapd -16 * SIZE(AA), %xmm0 + movapd -14 * SIZE(AA), %xmm1 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 +#endif + +#ifdef LN + movddup -13 * SIZE(AA), %xmm4 + 
mulpd %xmm4, %xmm3 + + movddup -14 * SIZE(AA), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm2 + + movddup -16 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + +#endif + +#ifdef LT + movddup -16 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + + movddup -15 * SIZE(AA), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm3 + + movddup -13 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 +#endif + +#ifdef RN + movddup -16 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 + + movddup -15 * SIZE(BB), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm1 + + movddup -13 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm1 +#endif + +#ifdef RT + movddup -13 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm1 + + movddup -14 * SIZE(BB), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm0 + + movddup -16 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, -16 * SIZE(BB) + movapd %xmm3, -14 * SIZE(BB) +#else + movapd %xmm0, -16 * SIZE(AA) + movapd %xmm1, -14 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movsd %xmm3, 1 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm3, 1 * SIZE(CO1, LDC, 1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm1, 1 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L41 + ALIGN_4 + +.L59: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + ALIGN_4 + +.L60: + testl $1, N + je .L999 + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, B +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + testl $1, %ebx + jle .L80 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd -16 * SIZE(AA), %xmm0 + movhps -15 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movsd -16 * SIZE(BB), %xmm2 + movhps -16 * SIZE(BB), %xmm2 + pxor %xmm5, %xmm5 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L85 + ALIGN_4 + +.L82: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + movaps -14 * SIZE(BB), %xmm2 + + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + movaps -12 * SIZE(BB), %xmm2 + + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + movaps -10 * SIZE(BB), %xmm2 + + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + movaps -8 * SIZE(BB), %xmm2 + + subl $-8 * SIZE, AA + subl $-8 * SIZE, BB + decl %eax + jne .L82 + ALIGN_4 + +.L85: +#if 
defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + BRANCH + je .L88 + +.L86: + mulsd %xmm0, %xmm2 + movsd -15 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + movsd -15 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L86 + ALIGN_4 + +.L88: + addpd %xmm5, %xmm4 + haddpd %xmm4, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (B, %eax, 1), BB +#endif + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(BB), %xmm0 + subsd %xmm4, %xmm0 +#else + movsd -16 * SIZE(AA), %xmm0 + subsd %xmm4, %xmm0 +#endif + +#ifdef LN + movsd -16 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#ifdef LT + movsd -16 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#ifdef RN + movsd -16 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#ifdef RT + movsd -16 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, -16 * SIZE(BB) +#else + movsd %xmm0, -16 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) +#else + movsd %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + addl %eax, AA + addl %eax, BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L80: + movl M, %ebx + sarl $1, %ebx + jle .L89 + ALIGN_4 + +.L71: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 +#ifdef LN + prefetcht0 -2 * SIZE(CO1) +#else + prefetcht0 1 * SIZE(CO1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm1, %xmm2 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + pshufd $0xee, %xmm1, %xmm2 + movaps -14 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + + pshufd $0x44, %xmm1, %xmm2 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + pshufd $0xee, %xmm1, %xmm2 + movaps -12 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) + + pshufd $0x44, %xmm1, %xmm2 + mulpd %xmm0, %xmm2 + movaps -6 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + pshufd $0xee, %xmm1, %xmm2 + movaps -10 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + + pshufd $0x44, %xmm1, %xmm2 + mulpd %xmm0, %xmm2 + movaps -2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + pshufd $0xee, %xmm1, %xmm2 + movaps -8 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + + subl $-16 * SIZE, AA + subl $ -8 * SIZE, BB + + subl $1, %eax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl 
$7, %eax # if (k & 1) + BRANCH + je .L78 + ALIGN_3 + +.L76: + pshufd $0x44, %xmm1, %xmm2 + movsd -15 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + addl $2 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: + addpd %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), BB +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BB), %xmm1 + + subpd %xmm4, %xmm1 + + movapd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm1 +#else + movapd -16 * SIZE(AA), %xmm0 + + subpd %xmm4, %xmm0 +#endif + +#ifdef LN + movsd -13 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm1 + + movsd -14 * SIZE(AA), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + movsd -16 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + +#endif + +#ifdef LT + movsd -16 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + movsd -15 * SIZE(AA), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + + movsd -13 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm1 +#endif + +#ifdef RN + movddup -16 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#ifdef RT + movddup -16 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, -16 * SIZE(BB) + movsd %xmm1, -15 * SIZE(BB) +#else + movapd %xmm0, -16 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 1 * SIZE(CO1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + addl %eax, BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L71 + ALIGN_4 + +.L89: +#ifdef LN + movl K, %eax + leal (B, %eax, SIZE), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_LN_2x4_sse2.S b/kernel/x86/trsm_kernel_LN_2x4_sse2.S new file mode 100644 index 0000000000..9a7a466a65 --- /dev/null +++ b/kernel/x86/trsm_kernel_LN_2x4_sse2.S @@ -0,0 +1,2584 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define OLD_M 4 + STACK + ARGS(%esi) +#define OLD_N 8 + STACK + ARGS(%esi) +#define OLD_K 12 + STACK + ARGS(%esi) +#define OLD_ALPHA 16 + STACK + ARGS(%esi) +#define OLD_A 24 + STACK + ARGS(%esi) +#define OLD_B 28 + STACK + ARGS(%esi) +#define OLD_C 32 + STACK + ARGS(%esi) +#define OLD_LDC 36 + STACK + ARGS(%esi) +#define OLD_OFFT 40 + STACK + ARGS(%esi) + +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define AORIG 56(%esp) +#define BORIG 60(%esp) +#define BUFFER 128(%esp) + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#if defined(OPTERON) || defined(BARCELONA) +#define PREFETCH prefetch +#define PREFETCHSIZE (8 * 10 + 4) +#endif + +#define B %edi +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define CO1 %esi + +#define KERNEL1(address) \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm4; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + movapd 2 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 6 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 16 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 10 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 12 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + mulpd 14 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm3, %xmm6; \ + movapd 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm0, %xmm7; \ + movapd 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm4; \ + movapd 18 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 20 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 22 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL4(address) \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 26 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, 
%xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 28 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + mulpd 30 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm3, %xmm6; \ + movapd 40 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm0, %xmm7; \ + movapd 16 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL5(address) \ + PREFETCH (PREFETCHSIZE + 8) * SIZE + (address) * 1 * SIZE(AA); \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm4; \ + movapd 34 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + mulpd 38 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm2, %xmm6; \ + movapd 48 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm1, %xmm7; \ + movapd 10 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL6(address) \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 42 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 44 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + mulpd 46 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 12 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL7(address) \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm4; \ + movapd 50 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 52 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + mulpd 54 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm2, %xmm6; \ + movapd 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm1, %xmm7; \ + movapd 14 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 58 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 60 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + mulpd 62 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 72 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movl OLD_M, %ebx + movl OLD_N, %eax + movl OLD_K, %ecx + movl OLD_A, %edx + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK + movd OLD_OFFT, %mm4 + + movl OLD_B, B + movl OLD_C, %ebx + + movl %ebx, C + movl OLD_LDC, LDC + + movd %mm4, OFFSET + movd %mm4, KK + + leal (, LDC, SIZE), LDC + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + sarl $2, %eax + movl %eax, J + jle .L30 + ALIGN_2 + +.L01: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, BB + +#ifdef RT + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + leal (, %eax, 
SIZE), %eax + leal (B, %eax, 4), B + leal (BB, %eax, 8), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $1, %eax + jle .L05 + ALIGN_4 + +.L02: +#define COPYPREFETCH 40 + + prefetchnta (COPYPREFETCH) * SIZE(B) + + movq 0 * SIZE(B), %mm0 + movq 1 * SIZE(B), %mm1 + movq 2 * SIZE(B), %mm2 + movq 3 * SIZE(B), %mm3 + movq 4 * SIZE(B), %mm4 + movq 5 * SIZE(B), %mm5 + movq 6 * SIZE(B), %mm6 + movq 7 * SIZE(B), %mm7 + + movq %mm0, 0 * SIZE(BB) + movq %mm0, 1 * SIZE(BB) + movq %mm1, 2 * SIZE(BB) + movq %mm1, 3 * SIZE(BB) + movq %mm2, 4 * SIZE(BB) + movq %mm2, 5 * SIZE(BB) + movq %mm3, 6 * SIZE(BB) + movq %mm3, 7 * SIZE(BB) + + movq %mm4, 8 * SIZE(BB) + movq %mm4, 9 * SIZE(BB) + movq %mm5, 10 * SIZE(BB) + movq %mm5, 11 * SIZE(BB) + movq %mm6, 12 * SIZE(BB) + movq %mm6, 13 * SIZE(BB) + movq %mm7, 14 * SIZE(BB) + movq %mm7, 15 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $16 * SIZE, BB + decl %eax + jne .L02 + ALIGN_2 + +.L05: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $1, %eax + BRANCH + jle .L10 + + movq 0 * SIZE(B), %mm0 + movq 1 * SIZE(B), %mm1 + movq 2 * SIZE(B), %mm2 + movq 3 * SIZE(B), %mm3 + + movq %mm0, 0 * SIZE(BB) + movq %mm0, 1 * SIZE(BB) + movq %mm1, 2 * SIZE(BB) + movq %mm1, 3 * SIZE(BB) + movq %mm2, 4 * SIZE(BB) + movq %mm2, 5 * SIZE(BB) + movq %mm3, 6 * SIZE(BB) + movq %mm3, 7 * SIZE(BB) + + addl $4 * SIZE, B + ALIGN_4 + +.L10: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + + leal (, LDC, 4), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L20 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $3 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movlpd 0 * SIZE(AA), %xmm0 + movlpd 4 * SIZE(AA), %xmm1 + movlpd 0 * SIZE(BB), %xmm2 + movlpd 8 * SIZE(BB), %xmm3 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm4 +#if defined(OPTERON) || defined(BARCELONA) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movlpd 2 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm5 + movlpd 4 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm6 + movlpd 16 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm7 + movlpd 1 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm4 + movlpd 10 * SIZE(BB), %xmm3 + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm5 + movlpd 12 * SIZE(BB), %xmm3 + mulsd %xmm0, %xmm3 + mulsd 14 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm6 + movlpd 24 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm7 + movlpd 2 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm4 + movlpd 18 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm5 + movlpd 20 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + mulsd 22 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm6 + movlpd 32 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm7 + movlpd 3 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm4 + movlpd 26 * SIZE(BB), %xmm3 + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm5 + movlpd 28 * 
SIZE(BB), %xmm3 + mulsd %xmm0, %xmm3 + mulsd 30 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm6 + movlpd 40 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm7 + movlpd 8 * SIZE(AA), %xmm0 +#if defined(OPTERON) || defined(BARCELONA) + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) +#endif + mulsd %xmm1, %xmm2 + addsd %xmm2, %xmm4 + movlpd 34 * SIZE(BB), %xmm2 + mulsd %xmm1, %xmm2 + addsd %xmm2, %xmm5 + movlpd 36 * SIZE(BB), %xmm2 + mulsd %xmm1, %xmm2 + mulsd 38 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm6 + movlpd 48 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm7 + movlpd 5 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm3 + addsd %xmm3, %xmm4 + movlpd 42 * SIZE(BB), %xmm3 + mulsd %xmm1, %xmm3 + addsd %xmm3, %xmm5 + movlpd 44 * SIZE(BB), %xmm3 + mulsd %xmm1, %xmm3 + mulsd 46 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm6 + movlpd 56 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm7 + movlpd 6 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm2 + addsd %xmm2, %xmm4 + movlpd 50 * SIZE(BB), %xmm2 + mulsd %xmm1, %xmm2 + addsd %xmm2, %xmm5 + movlpd 52 * SIZE(BB), %xmm2 + mulsd %xmm1, %xmm2 + mulsd 54 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm6 + movlpd 64 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm7 + movlpd 7 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm3 + addsd %xmm3, %xmm4 + movlpd 58 * SIZE(BB), %xmm3 + mulsd %xmm1, %xmm3 + addsd %xmm3, %xmm5 + movlpd 60 * SIZE(BB), %xmm3 + mulsd %xmm1, %xmm3 + mulsd 62 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm6 + movlpd 72 * SIZE(BB), %xmm3 + addl $64 * SIZE, BB + addsd %xmm1, %xmm7 + movlpd 12 * SIZE(AA), %xmm1 + addl $8 * SIZE, AA + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L28 + +.L26: + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm4 + movlpd 2 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm5 + movlpd 4 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm6 + movlpd 8 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm7 + movlpd 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (B, %eax, 4), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + unpcklpd %xmm5, %xmm4 + unpcklpd %xmm7, %xmm6 + + movapd 0 * SIZE(B), %xmm2 + movapd 2 * SIZE(B), %xmm5 + + subpd %xmm4, %xmm2 + subpd %xmm6, %xmm5 +#else + movlpd 0 * SIZE(AA), %xmm0 + movlpd 1 * SIZE(AA), %xmm1 + movlpd 2 * SIZE(AA), %xmm2 + movlpd 3 * SIZE(AA), %xmm3 + + subsd %xmm4, %xmm0 + subsd %xmm5, %xmm1 + subsd %xmm6, %xmm2 + subsd %xmm7, %xmm3 +#endif + +#ifdef LN + movlpd 0 * SIZE(AA), %xmm4 + movhpd 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm5 +#endif + +#ifdef LT + movlpd 0 * SIZE(AA), %xmm4 + movhpd 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm5 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm0 + movlpd 1 * SIZE(B), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + movlpd 2 * SIZE(B), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm2 + movlpd 3 * SIZE(B), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm3 + + movlpd 5 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm1 + movlpd 6 * SIZE(B), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm2 + movlpd 7 * SIZE(B), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm3 + + movlpd 10 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm2 + movlpd 11 * SIZE(B), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm3 + + movlpd 
15 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm3 +#endif + +#ifdef RT + movlpd 15 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm3 + movlpd 14 * SIZE(B), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm2 + movlpd 13 * SIZE(B), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm1 + movlpd 12 * SIZE(B), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm0 + + movlpd 10 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm2 + movlpd 9 * SIZE(B), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm1 + movlpd 8 * SIZE(B), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm0 + + movlpd 5 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm1 + movlpd 4 * SIZE(B), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + movlpd 0 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(B) + movapd %xmm5, 2 * SIZE(B) + + movlpd %xmm2, 0 * SIZE(BB) + movlpd %xmm2, 1 * SIZE(BB) + movhpd %xmm2, 2 * SIZE(BB) + movhpd %xmm2, 3 * SIZE(BB) + movlpd %xmm5, 4 * SIZE(BB) + movlpd %xmm5, 5 * SIZE(BB) + movhpd %xmm5, 6 * SIZE(BB) + movhpd %xmm5, 7 * SIZE(BB) +#else + movlpd %xmm0, 0 * SIZE(AA) + movlpd %xmm1, 1 * SIZE(AA) + movlpd %xmm2, 2 * SIZE(AA) + movlpd %xmm3, 3 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movlpd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) + movlpd %xmm5, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm5, 0 * SIZE(CO1, %eax, 1) +#else + movlpd %xmm0, 0 * SIZE(CO1) + movlpd %xmm1, 0 * SIZE(CO1, LDC, 1) + movlpd %xmm2, 0 * SIZE(CO1, LDC, 2) + movlpd %xmm3, 0 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (AA,%eax, SIZE), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L20: + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L29 + ALIGN_4 + +.L11: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $3 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movapd 0 * SIZE(AA), %xmm0 + movapd 8 * SIZE(AA), %xmm1 + movapd 0 * SIZE(BB), %xmm2 + movapd 8 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + +#ifdef LN + prefetchw -2 * SIZE(CO1) + prefetchw -2 * SIZE(CO1, LDC) + prefetchw -2 * SIZE(CO1, LDC, 2) + prefetchw -2 * SIZE(CO1, %eax) +#else + prefetchw 1 * SIZE(CO1) + prefetchw 1 * SIZE(CO1, LDC) + prefetchw 1 * SIZE(CO1, LDC, 2) + prefetchw 1 * SIZE(CO1, %eax) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + +#if 1 + andl $-8, %eax + sall $4, %eax + je .L15 +.L1X: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + cmpl $128 * 1, %eax + jle .L12 + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + cmpl $128 * 2, %eax + jle .L12 + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + cmpl $128 * 3, %eax + jle .L12 + 
KERNEL1(16 * 3) + KERNEL2(16 * 3) + KERNEL3(16 * 3) + KERNEL4(16 * 3) + KERNEL5(16 * 3) + KERNEL6(16 * 3) + KERNEL7(16 * 3) + KERNEL8(16 * 3) + cmpl $128 * 4, %eax + jle .L12 + KERNEL1(16 * 4) + KERNEL2(16 * 4) + KERNEL3(16 * 4) + KERNEL4(16 * 4) + KERNEL5(16 * 4) + KERNEL6(16 * 4) + KERNEL7(16 * 4) + KERNEL8(16 * 4) + cmpl $128 * 5, %eax + jle .L12 + KERNEL1(16 * 5) + KERNEL2(16 * 5) + KERNEL3(16 * 5) + KERNEL4(16 * 5) + KERNEL5(16 * 5) + KERNEL6(16 * 5) + KERNEL7(16 * 5) + KERNEL8(16 * 5) + cmpl $128 * 6, %eax + jle .L12 + KERNEL1(16 * 6) + KERNEL2(16 * 6) + KERNEL3(16 * 6) + KERNEL4(16 * 6) + KERNEL5(16 * 6) + KERNEL6(16 * 6) + KERNEL7(16 * 6) + KERNEL8(16 * 6) + cmpl $128 * 7, %eax + jle .L12 + KERNEL1(16 * 7) + KERNEL2(16 * 7) + KERNEL3(16 * 7) + KERNEL4(16 * 7) + KERNEL5(16 * 7) + KERNEL6(16 * 7) + KERNEL7(16 * 7) + KERNEL8(16 * 7) + + addl $128 * 4 * SIZE, BB + addl $128 * 1 * SIZE, AA + subl $128 * 8, %eax + jg .L1X + jmp .L15 + +.L12: + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB + ALIGN_4 +#else + + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + addl $64 * SIZE, BB + addl $16 * SIZE, AA + decl %eax + jne .L12 + ALIGN_4 +#endif + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movapd 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm5 + movapd 4 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 8 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 4), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd %xmm6, %xmm1 + unpcklpd %xmm7, %xmm6 + unpckhpd %xmm7, %xmm1 + + movapd 0 * SIZE(B), %xmm2 + movapd 2 * SIZE(B), %xmm5 + movapd 4 * SIZE(B), %xmm3 + movapd 6 * SIZE(B), %xmm7 + + subpd %xmm4, %xmm2 + subpd %xmm6, %xmm5 + subpd %xmm0, %xmm3 + subpd %xmm1, %xmm7 +#else + movapd 0 * SIZE(AA), %xmm0 + movapd 2 * SIZE(AA), %xmm1 + movapd 4 * SIZE(AA), %xmm2 + movapd 6 * SIZE(AA), %xmm3 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 + subpd %xmm6, %xmm2 + subpd %xmm7, %xmm3 +#endif + +#ifdef LN + movlpd 3 * SIZE(AA), %xmm4 + movhpd 3 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 + mulpd %xmm4, %xmm7 + + movlpd 2 * SIZE(AA), %xmm4 + movhpd 2 * SIZE(AA), %xmm4 + movapd %xmm4, %xmm6 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm2 + mulpd %xmm7, %xmm6 + subpd %xmm6, %xmm5 + + movlpd 0 * SIZE(AA), %xmm4 + movhpd 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm5 + +#endif + +#ifdef LT + movlpd 0 * SIZE(AA), %xmm4 + movhpd 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm5 + + movlpd 1 * SIZE(AA), %xmm4 + movhpd 1 * SIZE(AA), %xmm4 + movapd %xmm4, %xmm6 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm3 + mulpd %xmm5, %xmm6 + subpd %xmm6, %xmm7 + + movlpd 3 * SIZE(AA), %xmm4 + movhpd 3 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 + mulpd %xmm4, %xmm7 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), 
%xmm4 + mulpd %xmm4, %xmm0 + movlpd 1 * SIZE(B), %xmm4 + movhpd 1 * SIZE(B), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm1 + movlpd 2 * SIZE(B), %xmm4 + movhpd 2 * SIZE(B), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm2 + movlpd 3 * SIZE(B), %xmm4 + movhpd 3 * SIZE(B), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm3 + + movlpd 5 * SIZE(B), %xmm4 + movhpd 5 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm1 + movlpd 6 * SIZE(B), %xmm4 + movhpd 6 * SIZE(B), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm2 + movlpd 7 * SIZE(B), %xmm4 + movhpd 7 * SIZE(B), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm3 + + movlpd 10 * SIZE(B), %xmm4 + movhpd 10 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm2 + movlpd 11 * SIZE(B), %xmm4 + movhpd 11 * SIZE(B), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm3 + + movlpd 15 * SIZE(B), %xmm4 + movhpd 15 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm3 +#endif + +#ifdef RT + movlpd 15 * SIZE(B), %xmm4 + movhpd 15 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm3 + movlpd 14 * SIZE(B), %xmm4 + movhpd 14 * SIZE(B), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm2 + movlpd 13 * SIZE(B), %xmm4 + movhpd 13 * SIZE(B), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm1 + movlpd 12 * SIZE(B), %xmm4 + movhpd 12 * SIZE(B), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm0 + + movlpd 10 * SIZE(B), %xmm4 + movhpd 10 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm2 + movlpd 9 * SIZE(B), %xmm4 + movhpd 9 * SIZE(B), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm1 + movlpd 8 * SIZE(B), %xmm4 + movhpd 8 * SIZE(B), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm0 + + movlpd 5 * SIZE(B), %xmm4 + movhpd 5 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm1 + movlpd 4 * SIZE(B), %xmm4 + movhpd 4 * SIZE(B), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm0 + + movlpd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(B) + movapd %xmm5, 2 * SIZE(B) + movapd %xmm3, 4 * SIZE(B) + movapd %xmm7, 6 * SIZE(B) + + movlpd %xmm2, 0 * SIZE(BB) + movlpd %xmm2, 1 * SIZE(BB) + movhpd %xmm2, 2 * SIZE(BB) + movhpd %xmm2, 3 * SIZE(BB) + movlpd %xmm5, 4 * SIZE(BB) + movlpd %xmm5, 5 * SIZE(BB) + movhpd %xmm5, 6 * SIZE(BB) + movhpd %xmm5, 7 * SIZE(BB) + movlpd %xmm3, 8 * SIZE(BB) + movlpd %xmm3, 9 * SIZE(BB) + movhpd %xmm3, 10 * SIZE(BB) + movhpd %xmm3, 11 * SIZE(BB) + movlpd %xmm7, 12 * SIZE(BB) + movlpd %xmm7, 13 * SIZE(BB) + movhpd %xmm7, 14 * SIZE(BB) + movhpd %xmm7, 15 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) + movapd %xmm1, 2 * SIZE(AA) + movapd %xmm2, 4 * SIZE(AA) + movapd %xmm3, 6 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movlpd %xmm2, 0 * SIZE(CO1) + movlpd %xmm3, 1 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm3, 1 * SIZE(CO1, LDC, 1) + movlpd %xmm5, 0 * SIZE(CO1, LDC, 2) + movlpd %xmm7, 1 * SIZE(CO1, LDC, 2) + movhpd %xmm5, 0 * SIZE(CO1, %eax, 1) + movhpd %xmm7, 1 * SIZE(CO1, %eax, 1) +#else + movlpd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movlpd %xmm1, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm1, 1 * SIZE(CO1, LDC, 1) + movlpd %xmm2, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm2, 1 * SIZE(CO1, LDC, 2) + movlpd %xmm3, 0 * SIZE(CO1, %eax, 1) + movhpd %xmm3, 1 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $8 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef 
RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L11 + ALIGN_4 + +.L29: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 4), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (B, %eax, 4), B +#endif + +#ifdef RN + addl $4, KK +#endif + +#ifdef RT + subl $4, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_4 + +.L30: + testl $2, N + je .L60 + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, BB + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + jle .L35 + ALIGN_4 + +.L32: +#define COPYPREFETCH 40 + + prefetchnta (COPYPREFETCH) * SIZE(B) + + movq 0 * SIZE(B), %mm0 + movq 1 * SIZE(B), %mm1 + movq 2 * SIZE(B), %mm2 + movq 3 * SIZE(B), %mm3 + movq 4 * SIZE(B), %mm4 + movq 5 * SIZE(B), %mm5 + movq 6 * SIZE(B), %mm6 + movq 7 * SIZE(B), %mm7 + + movq %mm0, 0 * SIZE(BB) + movq %mm0, 1 * SIZE(BB) + movq %mm1, 2 * SIZE(BB) + movq %mm1, 3 * SIZE(BB) + movq %mm2, 4 * SIZE(BB) + movq %mm2, 5 * SIZE(BB) + movq %mm3, 6 * SIZE(BB) + movq %mm3, 7 * SIZE(BB) + + movq %mm4, 8 * SIZE(BB) + movq %mm4, 9 * SIZE(BB) + movq %mm5, 10 * SIZE(BB) + movq %mm5, 11 * SIZE(BB) + movq %mm6, 12 * SIZE(BB) + movq %mm6, 13 * SIZE(BB) + movq %mm7, 14 * SIZE(BB) + movq %mm7, 15 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $16 * SIZE, BB + decl %eax + jne .L32 + ALIGN_2 + +.L35: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax + BRANCH + jle .L40 + ALIGN_2 + +.L36: + movq 0 * SIZE(B), %mm0 + movq 1 * SIZE(B), %mm1 + + movq %mm0, 0 * SIZE(BB) + movq %mm0, 1 * SIZE(BB) + movq %mm1, 2 * SIZE(BB) + movq %mm1, 3 * SIZE(BB) + + addl $2 * SIZE, B + addl $4 * SIZE, BB + decl %eax + jne .L36 + ALIGN_4 + +.L40: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L50 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movlpd 0 * SIZE(AA), %xmm0 + movlpd 4 * SIZE(AA), %xmm1 + movlpd 0 * SIZE(BB), %xmm2 + movlpd 8 * SIZE(BB), %xmm3 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + mulsd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movlpd 4 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movlpd 1 * SIZE(AA), %xmm0 + + mulsd %xmm0, %xmm2 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm6 + movlpd 16 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm7 + movlpd 2 * SIZE(AA), %xmm0 + + mulsd %xmm0, %xmm3 + mulsd 10 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm4 + movlpd 12 * SIZE(BB), %xmm3 + addsd %xmm0, 
%xmm5 + movlpd 3 * SIZE(AA), %xmm0 + + mulsd %xmm0, %xmm3 + mulsd 14 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm6 + movlpd 24 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm7 + movlpd 8 * SIZE(AA), %xmm0 + + mulsd %xmm1, %xmm2 + mulsd 18 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm4 + movlpd 20 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + movlpd 5 * SIZE(AA), %xmm1 + + mulsd %xmm1, %xmm2 + mulsd 22 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm6 + movlpd 32 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm7 + movlpd 6 * SIZE(AA), %xmm1 + + mulsd %xmm1, %xmm3 + mulsd 26 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm4 + movlpd 28 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm5 + movlpd 7 * SIZE(AA), %xmm1 + + mulsd %xmm1, %xmm3 + mulsd 30 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm6 + movlpd 40 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm7 + movlpd 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L58 + +.L56: + mulsd %xmm0, %xmm2 + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movlpd 4 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movlpd 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: + addsd %xmm6, %xmm4 + addsd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + unpcklpd %xmm5, %xmm4 + + movapd 0 * SIZE(B), %xmm2 + + subpd %xmm4, %xmm2 +#else + movlpd 0 * SIZE(AA), %xmm0 + movlpd 1 * SIZE(AA), %xmm1 + + subsd %xmm4, %xmm0 + subsd %xmm5, %xmm1 +#endif + +#ifdef LN + movlpd 0 * SIZE(AA), %xmm4 + movhpd 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 +#endif + +#ifdef LT + movlpd 0 * SIZE(AA), %xmm4 + movhpd 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm0 + movlpd 1 * SIZE(B), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + + movlpd 3 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm1 +#endif + +#ifdef RT + movlpd 3 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm1 + movlpd 2 * SIZE(B), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + movlpd 0 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(B) + + movlpd %xmm2, 0 * SIZE(BB) + movlpd %xmm2, 1 * SIZE(BB) + movhpd %xmm2, 2 * SIZE(BB) + movhpd %xmm2, 3 * SIZE(BB) +#else + movlpd %xmm0, 0 * SIZE(AA) + movlpd %xmm1, 1 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) +#else + movlpd %xmm0, 0 * SIZE(CO1) + movlpd %xmm1, 0 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (AA,%eax, SIZE), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L50: + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L59 + ALIGN_4 + +.L41: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal 
BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movapd 0 * SIZE(AA), %xmm0 + movapd 8 * SIZE(AA), %xmm1 + movapd 0 * SIZE(BB), %xmm2 + movapd 8 * SIZE(BB), %xmm3 + +#ifdef LN + prefetchw -2 * SIZE(CO1) + prefetchw -2 * SIZE(CO1, LDC) +#else + prefetchw 1 * SIZE(CO1) + prefetchw 1 * SIZE(CO1, LDC) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L45 + ALIGN_4 + +.L42: + mulpd %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + + mulpd %xmm0, %xmm2 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movapd 4 * SIZE(AA), %xmm0 + + mulpd %xmm0, %xmm3 + mulpd 10 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd 12 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movapd 6 * SIZE(AA), %xmm0 + + mulpd %xmm0, %xmm3 + mulpd 14 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm7 + movapd 16 * SIZE(AA), %xmm0 + +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) +#endif + mulpd %xmm1, %xmm2 + mulpd 18 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + movapd 20 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movapd 10 * SIZE(AA), %xmm1 + + mulpd %xmm1, %xmm2 + mulpd 22 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + movapd 32 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm7 + movapd 12 * SIZE(AA), %xmm1 + + mulpd %xmm1, %xmm3 + mulpd 26 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm4 + movapd 28 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movapd 14 * SIZE(AA), %xmm1 + + mulpd %xmm1, %xmm3 + mulpd 30 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm6 + movapd 40 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm7 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L42 + ALIGN_4 + +.L45: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L46 + ALIGN_4 + +.L48: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd 0 * SIZE(B), %xmm2 + movapd 2 * SIZE(B), %xmm3 + + subpd %xmm4, %xmm2 + subpd %xmm0, %xmm3 +#else + movapd 0 * SIZE(AA), %xmm0 + movapd 2 * SIZE(AA), %xmm1 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 +#endif + +#ifdef LN + movlpd 3 * SIZE(AA), %xmm4 + movhpd 3 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 + + movlpd 2 * SIZE(AA), %xmm4 + movhpd 2 * SIZE(AA), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm2 + + movlpd 0 * SIZE(AA), %xmm4 + movhpd 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + +#endif + +#ifdef LT + movlpd 0 * SIZE(AA), %xmm4 + movhpd 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + + movlpd 1 * SIZE(AA), %xmm4 + movhpd 1 * 
SIZE(AA), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm3 + + movlpd 3 * SIZE(AA), %xmm4 + movhpd 3 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + movlpd 1 * SIZE(B), %xmm4 + movhpd 1 * SIZE(B), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm1 + + movlpd 3 * SIZE(B), %xmm4 + movhpd 3 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm1 +#endif + +#ifdef RT + movlpd 3 * SIZE(B), %xmm4 + movhpd 3 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm1 + movlpd 2 * SIZE(B), %xmm4 + movhpd 2 * SIZE(B), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm0 + + movlpd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(B) + movapd %xmm3, 2 * SIZE(B) + + movlpd %xmm2, 0 * SIZE(BB) + movlpd %xmm2, 1 * SIZE(BB) + movhpd %xmm2, 2 * SIZE(BB) + movhpd %xmm2, 3 * SIZE(BB) + movlpd %xmm3, 4 * SIZE(BB) + movlpd %xmm3, 5 * SIZE(BB) + movhpd %xmm3, 6 * SIZE(BB) + movhpd %xmm3, 7 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) + movapd %xmm1, 2 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm2, 0 * SIZE(CO1) + movlpd %xmm3, 1 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm3, 1 * SIZE(CO1, LDC, 1) +#else + movlpd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movlpd %xmm1, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm1, 1 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L41 + ALIGN_4 + +.L59: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + ALIGN_4 + +.L60: + testl $1, N + je .L999 + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, BB + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + leal (, %eax, SIZE), %eax + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + jle .L65 + ALIGN_4 + +.L62: +#define COPYPREFETCH 40 + + prefetchnta (COPYPREFETCH) * SIZE(B) + + movq 0 * SIZE(B), %mm0 + movq 1 * SIZE(B), %mm1 + movq 2 * SIZE(B), %mm2 + movq 3 * SIZE(B), %mm3 + movq 4 * SIZE(B), %mm4 + movq 5 * SIZE(B), %mm5 + movq 6 * SIZE(B), %mm6 + movq 7 * SIZE(B), %mm7 + + movq %mm0, 0 * SIZE(BB) + movq %mm0, 1 * SIZE(BB) + movq %mm1, 2 * SIZE(BB) + movq %mm1, 3 * SIZE(BB) + movq %mm2, 4 * SIZE(BB) + movq %mm2, 5 * SIZE(BB) + movq %mm3, 6 * SIZE(BB) + movq %mm3, 7 * SIZE(BB) + + movq %mm4, 8 * SIZE(BB) + movq %mm4, 9 * SIZE(BB) + movq %mm5, 10 * SIZE(BB) + movq %mm5, 11 * SIZE(BB) + movq %mm6, 12 * SIZE(BB) + movq %mm6, 13 * SIZE(BB) + movq %mm7, 14 * SIZE(BB) + movq %mm7, 15 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $16 * SIZE, BB + decl %eax + jne .L62 + ALIGN_2 + +.L65: +#if defined(LT) || defined(RN) 
+ movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + BRANCH + jle .L70 + ALIGN_2 + +.L66: + movq 0 * SIZE(B), %mm0 + + movq %mm0, 0 * SIZE(BB) + movq %mm0, 1 * SIZE(BB) + + addl $1 * SIZE, B + addl $2 * SIZE, BB + decl %eax + jne .L66 + ALIGN_4 + +.L70: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L80 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movlpd 0 * SIZE(AA), %xmm0 + movlpd 4 * SIZE(AA), %xmm1 + movlpd 0 * SIZE(BB), %xmm2 + movlpd 8 * SIZE(BB), %xmm3 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L85 + ALIGN_4 + +.L82: + mulsd %xmm0, %xmm2 + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) + movlpd 1 * SIZE(AA), %xmm0 + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movlpd 16 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movlpd 2 * SIZE(AA), %xmm0 + mulsd 4 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm6 + movlpd 3 * SIZE(AA), %xmm0 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm7 + movlpd 8 * SIZE(AA), %xmm0 + mulsd %xmm1, %xmm3 + movlpd 5 * SIZE(AA), %xmm1 + mulsd 10 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm4 + movlpd 24 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm5 + movlpd 6 * SIZE(AA), %xmm1 + mulsd 12 * SIZE(BB), %xmm1 + addsd %xmm1, %xmm6 + movlpd 7 * SIZE(AA), %xmm1 + mulsd 14 * SIZE(BB), %xmm1 + addsd %xmm1, %xmm7 + movlpd 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L82 + ALIGN_4 + +.L85: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L88 + +.L86: + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm4 + movlpd 2 * SIZE(BB), %xmm2 + movlpd 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L86 + ALIGN_4 + +.L88: + addsd %xmm5, %xmm4 + addsd %xmm7, %xmm6 + addsd %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + addl %eax, AA + addl %eax, B + leal (BB, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movlpd 0 * SIZE(B), %xmm2 + subsd %xmm4, %xmm2 +#else + movlpd 0 * SIZE(AA), %xmm0 + subsd %xmm4, %xmm0 +#endif + +#ifdef LN + movlpd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 +#endif + +#ifdef LT + movlpd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#ifdef RT + movlpd 0 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm2, 0 * SIZE(B) + + movlpd %xmm2, 0 * SIZE(BB) + movlpd %xmm2, 1 * SIZE(BB) +#else + movlpd %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm2, 0 * SIZE(CO1) +#else + movlpd %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (AA,%eax, SIZE), AA +#ifdef LT + addl $1 * SIZE, B +#endif +#endif 
+ +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L80: + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L99 + ALIGN_4 + +.L71: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movapd 0 * SIZE(AA), %xmm0 + movapd 8 * SIZE(AA), %xmm1 + movapd 0 * SIZE(BB), %xmm2 + movapd 8 * SIZE(BB), %xmm3 + +#ifdef LN + prefetchw -2 * SIZE(CO1) +#else + prefetchw 1 * SIZE(CO1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) + movapd 16 * SIZE(BB), %xmm2 + + movapd 2 * SIZE(AA), %xmm0 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm4 + movapd 4 * SIZE(AA), %xmm0 + mulpd 4 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm4 + movapd 6 * SIZE(AA), %xmm0 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm4 + + movapd 16 * SIZE(AA), %xmm0 + prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movapd 24 * SIZE(BB), %xmm3 + + movapd 10 * SIZE(AA), %xmm1 + mulpd 10 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm4 + movapd 12 * SIZE(AA), %xmm1 + mulpd 12 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm4 + movapd 14 * SIZE(AA), %xmm1 + mulpd 14 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm4 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L78 + ALIGN_3 + +.L76: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movapd 2 * SIZE(AA), %xmm0 + movapd 2 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm2 + + subpd %xmm4, %xmm2 +#else + movapd 0 * SIZE(AA), %xmm0 + + subpd %xmm4, %xmm0 +#endif + +#ifdef LN + movapd %xmm2, %xmm3 + unpckhpd %xmm3, %xmm3 + + movlpd 3 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm3 + + movlpd 2 * SIZE(AA), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm2 + + movlpd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm3, %xmm2 +#endif + +#ifdef LT + movapd %xmm2, %xmm3 + unpckhpd %xmm3, %xmm3 + + movlpd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + + movlpd 1 * SIZE(AA), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm3 + + movlpd 3 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm3 + + unpcklpd %xmm3, %xmm2 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#ifdef RT + movlpd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(B) + + movlpd %xmm2, 0 * SIZE(BB) + movlpd %xmm2, 1 * SIZE(BB) + movhpd %xmm2, 2 * SIZE(BB) + movhpd 
%xmm2, 3 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 1 * SIZE(CO1) +#else + movlpd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L71 + ALIGN_4 + +.L99: +#ifdef LN + movl K, %eax + leal (B, %eax, SIZE), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (B,%eax, SIZE), B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L999: + movl OLD_STACK, %esp + + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_LN_2x4_sse3.S b/kernel/x86/trsm_kernel_LN_2x4_sse3.S new file mode 100644 index 0000000000..5ab4ab3dbc --- /dev/null +++ b/kernel/x86/trsm_kernel_LN_2x4_sse3.S @@ -0,0 +1,2031 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#define A 24 + STACK + ARGS(%esp) +#define ARG_B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define ARG_LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) +#define AORIG 12 + STACK(%esp) + +#ifdef PENTIUM4 +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#ifdef PENTIUMM +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi +#define CO1 %esi + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + + movl OFFSET, %eax +#ifdef RN + negl %eax +#endif + movl %eax, KK + + leal (, LDC, SIZE), LDC + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + sarl $2, %eax + movl %eax, J + jle .L30 + ALIGN_2 + +.L10: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 4), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L20 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movddup 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movddup 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $4, %eax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movddup 1 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movddup 2 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 10 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd 12 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movddup 3 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 14 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm7 + movddup 4 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 18 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 20 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movddup 5 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 22 * 
SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 32 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movddup 6 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 26 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd 28 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movddup 7 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 30 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 40 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm7 + movddup 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd 34 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + movapd 36 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movddup 9 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm2 + mulpd 38 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + movapd 48 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm7 + movddup 10 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 42 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm4 + movapd 44 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movddup 11 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 46 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm6 + movapd 56 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm7 + movddup 12 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm2 + mulpd 50 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + movapd 52 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movddup 13 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm2 + mulpd 54 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + movapd 64 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm7 + movddup 14 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 58 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm4 + movapd 60 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movddup 15 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 62 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm6 + movapd 72 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm7 + movddup 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $15, %eax # if (k & 1) + BRANCH + je .L28 + +.L26: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movddup 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + + decl %eax + jg .L26 + ALIGN_4 + +.L28: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BB), %xmm0 + movapd 2 * SIZE(BB), %xmm1 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 +#else + movapd 0 * SIZE(AA), %xmm1 + movapd 2 * SIZE(AA), %xmm3 + + subpd %xmm4, %xmm1 + subpd %xmm5, %xmm3 + + movapd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm1 + movapd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm3 +#endif + +#ifdef LN + movddup 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 +#endif + +#ifdef LT + movddup 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 +#endif + +#ifdef RN + movsd 0 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 + movsd 1 * SIZE(BB), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + movsd 2 * SIZE(BB), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm2 + movsd 3 * SIZE(BB), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm3 + + movsd 5 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 + movsd 6 * SIZE(BB), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm2 + movsd 7 * SIZE(BB), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm3 + + movsd 10 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm2 + movsd 11 * SIZE(BB), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm3 + + movsd 15 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm3 
+#endif + +#ifdef RT + movsd 15 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm3 + movsd 14 * SIZE(BB), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm2 + movsd 13 * SIZE(BB), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm1 + movsd 12 * SIZE(BB), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm0 + + movsd 10 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm2 + movsd 9 * SIZE(BB), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm1 + movsd 8 * SIZE(BB), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm0 + + movsd 5 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 + movsd 4 * SIZE(BB), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + movsd 0 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, 0 * SIZE(BB) + movapd %xmm1, 2 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) + movsd %xmm1, 1 * SIZE(AA) + movsd %xmm2, 2 * SIZE(AA) + movsd %xmm3, 3 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 0 * SIZE(CO1, LDC, 1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm1, 0 * SIZE(CO1, %eax, 1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) + movsd %xmm2, 0 * SIZE(CO1, LDC, 2) + movsd %xmm3, 0 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L20: + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L29 + ALIGN_4 + +.L11: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movddup 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movddup 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + + leal (LDC, LDC, 2), %eax + +#ifdef LN + prefetchnta -2 * SIZE(CO1) + prefetchnta -2 * SIZE(CO1, LDC, 1) + prefetchnta -2 * SIZE(CO1, LDC, 2) + prefetchnta -2 * SIZE(CO1, %eax, 1) +#else + prefetchnta 2 * SIZE(CO1) + prefetchnta 2 * SIZE(CO1, LDC, 1) + prefetchnta 2 * SIZE(CO1, LDC, 2) + prefetchnta 2 * SIZE(CO1, %eax, 1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + mulpd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 3 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movddup 4 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 5 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm5 + movddup 6 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 7 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movddup 16 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm4 + movddup 9 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm5 + movddup 10 * SIZE(BB), %xmm3 + mulpd 
%xmm0, %xmm3 + addpd %xmm3, %xmm6 + movddup 11 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + movapd 6 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm7 + movddup 12 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm4 + movddup 13 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm5 + movddup 14 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm6 + movddup 15 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + movapd 16 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm7 + movddup 24 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm4 + movddup 17 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm5 + movddup 18 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm6 + movddup 19 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + movapd 10 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm7 + movddup 20 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm4 + movddup 21 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm5 + movddup 22 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm6 + movddup 23 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + movapd 12 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm7 + movddup 32 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movddup 25 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm5 + movddup 26 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm6 + movddup 27 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 14 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm7 + movddup 28 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movddup 29 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm5 + movddup 30 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm6 + movddup 31 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 24 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm7 + movddup 40 * SIZE(BB), %xmm3 + + addl $32 * SIZE, BB + addl $16 * SIZE, AA + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 3 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movddup 4 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd %xmm6, %xmm1 + unpcklpd %xmm7, %xmm6 + unpckhpd %xmm7, %xmm1 + + movapd 0 * SIZE(BB), %xmm2 + movapd 2 * SIZE(BB), %xmm5 + movapd 4 * SIZE(BB), %xmm3 + movapd 6 * SIZE(BB), %xmm7 + + subpd %xmm4, %xmm2 + subpd %xmm6, %xmm5 + subpd %xmm0, %xmm3 + subpd %xmm1, %xmm7 +#else + movapd 0 * SIZE(AA), %xmm0 + movapd 2 * SIZE(AA), %xmm1 + movapd 4 * SIZE(AA), %xmm2 + movapd 6 * SIZE(AA), %xmm3 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 + subpd %xmm6, %xmm2 + subpd %xmm7, %xmm3 +#endif + +#ifdef LN + movddup 3 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 + mulpd %xmm4, %xmm7 + + movddup 2 * SIZE(AA), %xmm4 + movapd %xmm4, %xmm6 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm2 + mulpd %xmm7, %xmm6 + subpd %xmm6, %xmm5 + + movddup 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm5 + +#endif + +#ifdef 
LT + movddup 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm5 + + movddup 1 * SIZE(AA), %xmm4 + movapd %xmm4, %xmm6 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm3 + mulpd %xmm5, %xmm6 + subpd %xmm6, %xmm7 + + movddup 3 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 + mulpd %xmm4, %xmm7 +#endif + +#ifdef RN + movddup 0 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 + movddup 1 * SIZE(BB), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm1 + movddup 2 * SIZE(BB), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm2 + movddup 3 * SIZE(BB), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm3 + + movddup 5 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm1 + movddup 6 * SIZE(BB), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm2 + movddup 7 * SIZE(BB), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm3 + + movddup 10 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm2 + movddup 11 * SIZE(BB), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm3 + + movddup 15 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm3 +#endif + +#ifdef RT + movddup 15 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm3 + movddup 14 * SIZE(BB), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm2 + movddup 13 * SIZE(BB), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm1 + movddup 12 * SIZE(BB), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm0 + + movddup 10 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm2 + movddup 9 * SIZE(BB), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm1 + movddup 8 * SIZE(BB), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm0 + + movddup 5 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm1 + movddup 4 * SIZE(BB), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm0 + + movddup 0 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(BB) + movapd %xmm5, 2 * SIZE(BB) + movapd %xmm3, 4 * SIZE(BB) + movapd %xmm7, 6 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) + movapd %xmm1, 2 * SIZE(AA) + movapd %xmm2, 4 * SIZE(AA) + movapd %xmm3, 6 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movsd %xmm3, 1 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm3, 1 * SIZE(CO1, LDC, 1) + movsd %xmm5, 0 * SIZE(CO1, LDC, 2) + movsd %xmm7, 1 * SIZE(CO1, LDC, 2) + movhpd %xmm5, 0 * SIZE(CO1, %eax, 1) + movhpd %xmm7, 1 * SIZE(CO1, %eax, 1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm1, 1 * SIZE(CO1, LDC, 1) + movsd %xmm2, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm2, 1 * SIZE(CO1, LDC, 2) + movsd %xmm3, 0 * SIZE(CO1, %eax, 1) + movhpd %xmm3, 1 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L11 + ALIGN_4 + +.L29: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 4), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $4, KK +#endif + +#ifdef RT + subl $4, KK +#endif + + decl J # j -- + jg .L10 + ALIGN_4 + +.L30: + testl $2, N + je .L60 + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl 
%eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L50 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movddup 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movddup 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $4, %eax + je .L55 + ALIGN_4 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm0, %xmm2 + movddup 1 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + mulpd 2 * SIZE(BB), %xmm0 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movddup 2 * SIZE(AA), %xmm0 + mulpd 4 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm6 + movddup 3 * SIZE(AA), %xmm0 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm7 + movddup 4 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + movddup 5 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm4 + mulpd 10 * SIZE(BB), %xmm0 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movddup 6 * SIZE(AA), %xmm0 + mulpd 12 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm6 + movddup 7 * SIZE(AA), %xmm0 + mulpd 14 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm7 + movddup 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + movddup 9 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm4 + mulpd 18 * SIZE(BB), %xmm1 + movapd 32 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movddup 10 * SIZE(AA), %xmm1 + mulpd 20 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm6 + movddup 11 * SIZE(AA), %xmm1 + mulpd 22 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm7 + movddup 12 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + movddup 13 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm4 + mulpd 26 * SIZE(BB), %xmm1 + movapd 40 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movddup 14 * SIZE(AA), %xmm1 + mulpd 28 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm6 + movddup 15 * SIZE(AA), %xmm1 + mulpd 30 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm7 + movddup 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $15, %eax # if (k & 1) + BRANCH + je .L58 + +.L56: + mulpd %xmm0, %xmm2 + movddup 1 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + movapd 2 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + addpd %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (B, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BB), %xmm0 + + subpd %xmm4, %xmm0 +#else + movapd 0 * SIZE(AA), %xmm1 + + subpd %xmm4, %xmm1 + + movapd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm1 +#endif + +#ifdef LN + movddup 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#ifdef LT + movddup 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#ifdef RN + movsd 0 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 1 * SIZE(BB), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + + movsd 3 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 +#endif + +#ifdef RT + movsd 3 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 
+ + movsd 2 * SIZE(BB), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + movsd 0 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, 0 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) + movsd %xmm1, 1 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 0 * SIZE(CO1, LDC, 1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L50: + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L59 + ALIGN_4 + +.L41: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movddup 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movddup 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifdef LN + prefetchnta -2 * SIZE(CO1) + prefetchnta -2 * SIZE(CO1, LDC, 1) +#else + prefetchnta 2 * SIZE(CO1) + prefetchnta 2 * SIZE(CO1, LDC, 1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L45 + ALIGN_4 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 3 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movddup 4 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 5 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 6 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + movddup 6 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 7 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 16 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movddup 16 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movddup 9 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 10 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm5 + movddup 10 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm6 + movddup 11 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 12 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm7 + movddup 12 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movddup 13 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 14 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm5 + movddup 14 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm6 + movddup 15 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 24 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm7 + movddup 24 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L42 + ALIGN_4 + +.L45: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * 
SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L46 + ALIGN_4 + +.L48: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd 0 * SIZE(BB), %xmm2 + movapd 2 * SIZE(BB), %xmm3 + + subpd %xmm4, %xmm2 + subpd %xmm0, %xmm3 +#else + movapd 0 * SIZE(AA), %xmm0 + movapd 2 * SIZE(AA), %xmm1 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 +#endif + +#ifdef LN + movddup 3 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 + + movddup 2 * SIZE(AA), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm2 + + movddup 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + +#endif + +#ifdef LT + movddup 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + + movddup 1 * SIZE(AA), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm3 + + movddup 3 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 +#endif + +#ifdef RN + movddup 0 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 + + movddup 1 * SIZE(BB), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm1 + + movddup 3 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm1 +#endif + +#ifdef RT + movddup 3 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm1 + + movddup 2 * SIZE(BB), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm0 + + movddup 0 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(BB) + movapd %xmm3, 2 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) + movapd %xmm1, 2 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movsd %xmm3, 1 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm3, 1 * SIZE(CO1, LDC, 1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm1, 1 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L41 + ALIGN_4 + +.L59: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + ALIGN_4 + +.L60: + testl $1, N + je .L999 + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, B +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L80 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd 0 * SIZE(AA), %xmm0 + movhpd 1 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movsd 8 * SIZE(AA), 
%xmm1 + movhpd 9 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movsd 0 * SIZE(BB), %xmm2 + movhpd 1 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movsd 8 * SIZE(BB), %xmm3 + movhpd 9 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $4, %eax + je .L85 + ALIGN_4 + +.L82: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + mulpd 2 * SIZE(BB), %xmm0 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 4 * SIZE(AA), %xmm0 + mulpd 4 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm6 + movapd 6 * SIZE(AA), %xmm0 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm7 + movapd 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm3 + movapd 10 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm4 + mulpd 10 * SIZE(BB), %xmm1 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movapd 12 * SIZE(AA), %xmm1 + mulpd 12 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm6 + movapd 14 * SIZE(AA), %xmm1 + mulpd 14 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm7 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L82 + ALIGN_4 + +.L85: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $15, %eax # if (k & 1) + BRANCH + je .L88 + +.L86: + mulsd %xmm0, %xmm2 + movsd 1 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + movsd 1 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L86 + ALIGN_4 + +.L88: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + addpd %xmm6, %xmm4 + + haddpd %xmm4, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (B, %eax, 1), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BB), %xmm0 + subsd %xmm4, %xmm0 +#else + movsd 0 * SIZE(AA), %xmm0 + subsd %xmm4, %xmm0 +#endif + +#ifdef LN + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#ifdef LT + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#ifdef RN + movsd 0 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#ifdef RT + movsd 0 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) +#else + movsd %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + addl %eax, AA + addl %eax, BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + + +.L80: + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L89 + ALIGN_4 + +.L71: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + addl %eax, BB +#endif + + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movddup 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movddup 4 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifdef LN + prefetchnta -2 * SIZE(CO1) +#else + prefetchnta 2 * SIZE(CO1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else 
+ movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm2, %xmm0 + movddup 1 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm4 + movapd 16 * SIZE(AA), %xmm0 + mulpd 2 * SIZE(AA), %xmm2 + addpd %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + mulpd 4 * SIZE(AA), %xmm2 + addpd %xmm2, %xmm6 + movddup 3 * SIZE(BB), %xmm2 + mulpd 6 * SIZE(AA), %xmm2 + addpd %xmm2, %xmm7 + movddup 8 * SIZE(BB), %xmm2 + mulpd %xmm3, %xmm1 + movddup 5 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm4 + movapd 24 * SIZE(AA), %xmm1 + mulpd 10 * SIZE(AA), %xmm3 + addpd %xmm3, %xmm5 + movddup 6 * SIZE(BB), %xmm3 + mulpd 12 * SIZE(AA), %xmm3 + addpd %xmm3, %xmm6 + movddup 7 * SIZE(BB), %xmm3 + mulpd 14 * SIZE(AA), %xmm3 + addpd %xmm3, %xmm7 + movddup 12 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $ 8 * SIZE, BB + decl %eax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L78 + ALIGN_3 + +.L76: + mulpd %xmm2, %xmm0 + movddup 1 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm4 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + addpd %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), BB +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BB), %xmm1 + + subpd %xmm4, %xmm1 + + movapd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm1 +#else + movapd 0 * SIZE(AA), %xmm0 + + subpd %xmm4, %xmm0 +#endif + +#ifdef LN + movsd 3 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm1 + + movsd 2 * SIZE(AA), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + +#endif + +#ifdef LT + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 1 * SIZE(AA), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + + movsd 3 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm1 +#endif + +#ifdef RN + movddup 0 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#ifdef RT + movddup 0 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BB) + movsd %xmm1, 1 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 1 * SIZE(CO1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + addl %eax, BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L71 + ALIGN_4 + +.L89: +#ifdef LN + movl K, %eax + leal (B, %eax, SIZE), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_LN_4x2_core2.S b/kernel/x86/trsm_kernel_LN_4x2_core2.S new file mode 100644 index 0000000000..d974fa6591 --- /dev/null +++ b/kernel/x86/trsm_kernel_LN_4x2_core2.S @@ -0,0 +1,2100 @@ 
+/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if !defined(HAVE_SSE2) || !defined(HAVE_MMX) +#error You have to check your configuration. 
+#endif + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_ALPHA 16 + STACK + ARGS(%esi) +#define STACK_A 24 + STACK + ARGS(%esi) +#define STACK_B 28 + STACK + ARGS(%esi) +#define STACK_C 32 + STACK + ARGS(%esi) +#define STACK_LDC 36 + STACK + ARGS(%esi) +#define STACK_OFFT 40 + STACK + ARGS(%esi) + +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define AORIG 56(%esp) +#define BORIG 60(%esp) +#define BUFFER 128(%esp) + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#define B %edi +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define CO1 %esi + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movd STACK_M, %mm0 + movl STACK_N, %eax + movd STACK_K, %mm1 + movd STACK_A, %mm2 + movl STACK_B, B + movd STACK_C, %mm3 + movl STACK_LDC, LDC + movd STACK_OFFT, %mm4 + + movd %mm1, K + movl %eax, N + movd %mm0, M + movd %mm2, A + movd %mm3, C + movl %esi, OLD_STACK + movd %mm4, OFFSET + movd %mm4, KK + + subl $-16 * SIZE, A + subl $-16 * SIZE, B + + sall $BASE_SHIFT, LDC + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + sarl $1, %eax + movl %eax, J + jle .L100 + ALIGN_2 + +.L01: +/* Copying to Sub Buffer */ +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal 16 * SIZE + BUFFER, BB + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + jle .L03 + ALIGN_2 + +.L02: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + movddup -14 * SIZE(B), %xmm2 + movddup -13 * SIZE(B), %xmm3 + movddup -12 * SIZE(B), %xmm4 + movddup -11 * SIZE(B), %xmm5 + movddup -10 * SIZE(B), %xmm6 + movddup -9 * SIZE(B), %xmm7 + + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm1, -14 * SIZE(BB) + movapd %xmm2, -12 * SIZE(BB) + movapd %xmm3, -10 * SIZE(BB) + movapd %xmm4, -8 * SIZE(BB) + movapd %xmm5, -6 * SIZE(BB) + movapd %xmm6, -4 * SIZE(BB) + movapd %xmm7, -2 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $16 * SIZE, %ecx + decl %eax + jne .L02 + ALIGN_2 + +.L03: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax + BRANCH + jle .L05 + ALIGN_4 + +.L04: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm1, -14 * SIZE(BB) + + addl $2 * SIZE, B + addl $4 * SIZE, %ecx + decl %eax + jne .L04 + ALIGN_4 + +.L05: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + + leal (, LDC, 2), %eax + 
+#ifdef RT + subl %eax, C +#endif + movl C, CO1 # coffset = c +#ifndef RT + addl %eax, C +#endif + + movl M, %ebx + testl $1, %ebx + jle .L30 + +#ifdef LN + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA +#endif + + leal 16 * SIZE + BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movsd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movsd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movsd -12 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movsd -8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L52 + +.L51: + mulsd %xmm0, %xmm1 + mulsd -14 * SIZE(BB), %xmm0 + addsd %xmm1, %xmm4 + movsd -12 * SIZE(BB), %xmm1 + addsd %xmm0, %xmm5 + movsd -15 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm1 + mulsd -10 * SIZE(BB), %xmm0 + addsd %xmm1, %xmm6 + movsd 0 * SIZE(BB), %xmm1 + addsd %xmm0, %xmm7 + movsd -14 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + mulsd -6 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm4 + movsd -4 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm5 + movsd -13 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + mulsd -2 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm6 + movsd 8 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm7 + movsd -8 * SIZE(AA), %xmm0 + mulsd %xmm2, %xmm1 + mulsd 2 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm4 + movsd 4 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm5 + movsd -11 * SIZE(AA), %xmm2 + mulsd %xmm2, %xmm1 + mulsd 6 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm6 + movsd 16 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm7 + movsd -10 * SIZE(AA), %xmm2 + mulsd %xmm2, %xmm3 + mulsd 10 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm4 + movsd 12 * SIZE(BB), %xmm3 + addsd %xmm2, %xmm5 + movsd -9 * SIZE(AA), %xmm2 + mulsd %xmm2, %xmm3 + mulsd 14 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm6 + movsd 24 * SIZE(BB), %xmm3 + addsd %xmm2, %xmm7 + movsd -4 * SIZE(AA), %xmm2 + + subl $-8 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L51 + +.L52: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L54 + +.L53: + mulsd %xmm0, %xmm1 + mulsd -14 * SIZE(BB), %xmm0 + addsd %xmm1, %xmm4 + movsd -12 * SIZE(BB), %xmm1 + addsd %xmm0, %xmm5 + movsd -15 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + BRANCH + jg .L53 + ALIGN_4 + +.L54: + addsd %xmm6, %xmm4 + addsd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal 16 * SIZE + BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(B), %xmm0 + movsd -15 * SIZE(B), %xmm1 +#else + movsd -16 * SIZE(AA), %xmm0 + movsd -15 * SIZE(AA), %xmm1 +#endif + + subsd %xmm4, %xmm0 + subsd %xmm5, %xmm1 + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(AA), %xmm2 + mulsd %xmm2, %xmm0 + mulsd %xmm2, %xmm1 +#endif + +#ifdef RN + mulsd -16 * SIZE(B), %xmm0 + movsd -15 * SIZE(B), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + mulsd -13 * SIZE(B), %xmm1 +#endif + +#ifdef RT + mulsd -13 * SIZE(B), %xmm1 + movsd -14 * SIZE(B), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + mulsd -16 * SIZE(B), %xmm0 +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 0 * 
SIZE(CO1, LDC) + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, -16 * SIZE(B) + movsd %xmm1, -15 * SIZE(B) + + movsd %xmm0, -16 * SIZE(BB) + movsd %xmm0, -15 * SIZE(BB) + movsd %xmm1, -14 * SIZE(BB) + movsd %xmm1, -13 * SIZE(BB) +#else + movsd %xmm0, -16 * SIZE(AA) + movsd %xmm1, -15 * SIZE(AA) +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $0 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L30: + movl M, %ebx + testl $2, %ebx + jle .L50 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal 16 * SIZE + BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movapd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -8 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movapd -8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L32 + +.L31: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BB), %xmm0 + addpd %xmm1, %xmm4 + movapd -12 * SIZE(BB), %xmm1 + addpd %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm1 + mulpd -10 * SIZE(BB), %xmm0 + addpd %xmm1, %xmm6 + movapd 0 * SIZE(BB), %xmm1 + addpd %xmm0, %xmm7 + movapd -12 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd -6 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd -4 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movapd -10 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd -2 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 8 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm7 + movapd 0 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm1 + mulpd 2 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm4 + movapd 4 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm5 + movapd -6 * SIZE(AA), %xmm2 + mulpd %xmm2, %xmm1 + mulpd 6 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm6 + movapd 16 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm7 + movapd -4 * SIZE(AA), %xmm2 + mulpd %xmm2, %xmm3 + mulpd 10 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm4 + movapd 12 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm5 + movapd -2 * SIZE(AA), %xmm2 + mulpd %xmm2, %xmm3 + mulpd 14 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm6 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm7 + movapd 8 * SIZE(AA), %xmm2 + + subl $-16 * SIZE, AA + addl $ 32 * SIZE, BB + decl %eax + jne .L31 + +.L32: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L34 + +.L33: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BB), %xmm0 + addpd %xmm1, %xmm4 + movapd -12 * SIZE(BB), %xmm1 + addpd %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L33 + ALIGN_4 + +.L34: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal 16 * SIZE + BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd 
%xmm5, %xmm0 + + movapd -16 * SIZE(B), %xmm2 + movapd -14 * SIZE(B), %xmm3 + + subpd %xmm4, %xmm2 + subpd %xmm0, %xmm3 +#else + movapd -16 * SIZE(AA), %xmm0 + movapd -14 * SIZE(AA), %xmm1 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 +#endif + +#ifdef LN + movddup -13 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + movddup -14 * SIZE(AA), %xmm0 + mulpd %xmm3, %xmm0 + subpd %xmm0, %xmm2 + movddup -16 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef LT + movddup -16 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + movddup -15 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm0 + subpd %xmm0, %xmm3 + movddup -13 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + movddup -15 * SIZE(B), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm1 + movddup -13 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm1 +#endif + +#ifdef RT + movddup -13 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm1 + movddup -14 * SIZE(B), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm0 + movddup -16 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movsd %xmm3, 1 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO1, LDC) + movhpd %xmm3, 1 * SIZE(CO1, LDC) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC) + movhpd %xmm1, 1 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, -16 * SIZE(B) + movapd %xmm3, -14 * SIZE(B) + + movddup %xmm2, %xmm0 + movddup %xmm3, %xmm1 + + unpckhpd %xmm2, %xmm2 + unpckhpd %xmm3, %xmm3 + + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm2, -14 * SIZE(BB) + movapd %xmm1, -12 * SIZE(BB) + movapd %xmm3, -10 * SIZE(BB) +#else + movapd %xmm0, -16 * SIZE(AA) + movapd %xmm1, -14 * SIZE(AA) +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L50: + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L99 + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + leal 16 * SIZE + BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movapd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -8 * SIZE(AA), %xmm3 + pxor %xmm6, %xmm6 +#ifdef LN + prefetcht2 -3 * SIZE(CO1) + pxor %xmm7, %xmm7 + prefetcht2 -3 * SIZE(CO1, LDC) +#else + prefetcht2 3 * SIZE(CO1) + pxor %xmm7, %xmm7 + prefetcht2 3 * SIZE(CO1, LDC) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movapd -14 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + addpd %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm1 + movapd -12 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm6 + addpd %xmm1, %xmm7 + + movapd -12 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movapd -10 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + addpd %xmm0, 
%xmm5 + movapd -10 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm1 + movapd 0 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm6 + addpd %xmm1, %xmm7 + + movapd -8 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm3, %xmm1 + addpd %xmm1, %xmm4 + movapd -6 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm5 + movapd -6 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm2 + mulpd %xmm3, %xmm1 + movapd -4 * SIZE(AA), %xmm3 + addpd %xmm2, %xmm6 + addpd %xmm1, %xmm7 + + movapd -4 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm3, %xmm1 + addpd %xmm1, %xmm4 + movapd -2 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm5 + movapd -2 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm2 + mulpd %xmm3, %xmm1 + movapd 8 * SIZE(AA), %xmm3 + addpd %xmm2, %xmm6 + addpd %xmm1, %xmm7 + movapd 0 * SIZE(BB), %xmm1 + + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movapd 2 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm1 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm6 + addpd %xmm1, %xmm7 + + movapd 4 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movapd 6 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + addpd %xmm0, %xmm5 + movapd 6 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm1 + movapd 16 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm6 + addpd %xmm1, %xmm7 + + movapd 8 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm3, %xmm1 + addpd %xmm1, %xmm4 + movapd 10 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm5 + movapd 10 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm2 + mulpd %xmm3, %xmm1 + addpd %xmm2, %xmm6 + movapd 12 * SIZE(AA), %xmm3 + addpd %xmm1, %xmm7 + + movapd 12 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm3, %xmm1 + addpd %xmm1, %xmm4 + movapd 14 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm5 + movapd 14 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm2 + mulpd %xmm3, %xmm1 + subl $-32 * SIZE, BB + movapd 24 * SIZE(AA), %xmm3 + subl $-32 * SIZE, AA + addpd %xmm2, %xmm6 + addpd %xmm1, %xmm7 + movapd -16 * SIZE(BB), %xmm1 + + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + BRANCH + je .L18 + ALIGN_4 + +.L16: + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movapd -14 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + addpd %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm1 + movapd -12 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm6 + addpd %xmm1, %xmm7 + movapd -12 * SIZE(BB), %xmm1 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal 16 * SIZE + BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd %xmm6, %xmm1 + unpcklpd %xmm7, %xmm6 + unpckhpd %xmm7, %xmm1 + + movapd -16 * SIZE(B), %xmm2 + movapd -14 * SIZE(B), %xmm3 + movapd -12 * SIZE(B), %xmm5 + movapd -10 * SIZE(B), %xmm7 + + subpd %xmm4, %xmm2 + subpd %xmm0, %xmm3 + subpd %xmm6, %xmm5 + subpd %xmm1, %xmm7 +#else + movapd -16 * SIZE(AA), %xmm0 + movapd -14 * SIZE(AA), %xmm1 + movapd -12 * SIZE(AA), %xmm2 + movapd -10 * SIZE(AA), %xmm3 + + subpd %xmm4, %xmm0 + subpd %xmm6, %xmm1 + subpd %xmm5, %xmm2 + subpd 
%xmm7, %xmm3 +#endif + +#ifdef LN + movddup -1 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm7 + movddup -2 * SIZE(AA), %xmm0 + mulpd %xmm7, %xmm0 + subpd %xmm0, %xmm5 + movddup -3 * SIZE(AA), %xmm0 + mulpd %xmm7, %xmm0 + subpd %xmm0, %xmm3 + movddup -4 * SIZE(AA), %xmm0 + mulpd %xmm7, %xmm0 + subpd %xmm0, %xmm2 + + movddup -6 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm5 + movddup -7 * SIZE(AA), %xmm0 + mulpd %xmm5, %xmm0 + subpd %xmm0, %xmm3 + movddup -8 * SIZE(AA), %xmm0 + mulpd %xmm5, %xmm0 + subpd %xmm0, %xmm2 + + movddup -11 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + movddup -12 * SIZE(AA), %xmm0 + mulpd %xmm3, %xmm0 + subpd %xmm0, %xmm2 + + movddup -16 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef LT + movddup -16 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + movddup -15 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm0 + subpd %xmm0, %xmm3 + movddup -14 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm0 + subpd %xmm0, %xmm5 + movddup -13 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm0 + subpd %xmm0, %xmm7 + + movddup -11 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + movddup -10 * SIZE(AA), %xmm0 + mulpd %xmm3, %xmm0 + subpd %xmm0, %xmm5 + movddup -9 * SIZE(AA), %xmm0 + mulpd %xmm3, %xmm0 + subpd %xmm0, %xmm7 + + movddup -6 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm5 + movddup -5 * SIZE(AA), %xmm0 + mulpd %xmm5, %xmm0 + subpd %xmm0, %xmm7 + + movddup -1 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm7 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 + + movddup -15 * SIZE(B), %xmm4 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm2 + mulpd %xmm1, %xmm5 + subpd %xmm5, %xmm3 + + movddup -13 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm3 +#endif + +#ifdef RT + movddup -13 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm3 + + movddup -14 * SIZE(B), %xmm4 + movapd %xmm4, %xmm5 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm0 + mulpd %xmm3, %xmm5 + subpd %xmm5, %xmm1 + + movddup -16 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movsd %xmm3, 1 * SIZE(CO1) + movsd %xmm5, 2 * SIZE(CO1) + movsd %xmm7, 3 * SIZE(CO1) + + movhpd %xmm2, 0 * SIZE(CO1, LDC) + movhpd %xmm3, 1 * SIZE(CO1, LDC) + movhpd %xmm5, 2 * SIZE(CO1, LDC) + movhpd %xmm7, 3 * SIZE(CO1, LDC) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO1, LDC) + movhpd %xmm2, 1 * SIZE(CO1, LDC) + movsd %xmm3, 2 * SIZE(CO1, LDC) + movhpd %xmm3, 3 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, -16 * SIZE(B) + movapd %xmm3, -14 * SIZE(B) + movapd %xmm5, -12 * SIZE(B) + movapd %xmm7, -10 * SIZE(B) + + movddup %xmm2, %xmm0 + movddup %xmm3, %xmm1 + movddup %xmm5, %xmm4 + movddup %xmm7, %xmm6 + + unpckhpd %xmm2, %xmm2 + unpckhpd %xmm3, %xmm3 + unpckhpd %xmm5, %xmm5 + unpckhpd %xmm7, %xmm7 + + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm2, -14 * SIZE(BB) + movapd %xmm1, -12 * SIZE(BB) + movapd %xmm3, -10 * SIZE(BB) + movapd %xmm4, -8 * SIZE(BB) + movapd %xmm5, -6 * SIZE(BB) + movapd %xmm6, -4 * SIZE(BB) + movapd %xmm7, -2 * SIZE(BB) +#else + movapd %xmm0, -16 * SIZE(AA) + movapd %xmm1, -14 * SIZE(AA) + movapd %xmm2, -12 * SIZE(AA) + movapd %xmm3, -10 * SIZE(AA) +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA +#ifdef LT + addl $8 * SIZE, B +#endif +#endif + +#ifdef LN + 
subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L10 + ALIGN_2 + +.L99: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_2 + +.L100: + movl N, %eax + testl $1, %eax + jle .L999 + ALIGN_2 + +.L101: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal 16 * SIZE + BUFFER, BB + +#ifdef RT + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + leal (, %eax, SIZE), %eax + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + jle .L103 + ALIGN_4 + +.L102: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + movddup -14 * SIZE(B), %xmm2 + movddup -13 * SIZE(B), %xmm3 + movddup -12 * SIZE(B), %xmm4 + movddup -11 * SIZE(B), %xmm5 + movddup -10 * SIZE(B), %xmm6 + movddup -9 * SIZE(B), %xmm7 + + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm1, -14 * SIZE(BB) + movapd %xmm2, -12 * SIZE(BB) + movapd %xmm3, -10 * SIZE(BB) + movapd %xmm4, -8 * SIZE(BB) + movapd %xmm5, -6 * SIZE(BB) + movapd %xmm6, -4 * SIZE(BB) + movapd %xmm7, -2 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $16 * SIZE, %ecx + decl %eax + BRANCH + jne .L102 + ALIGN_2 + +.L103: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + BRANCH + jle .L105 + ALIGN_2 + +.L104: + movddup -16 * SIZE(B), %xmm0 + + movapd %xmm0, -16 * SIZE(BB) + addl $1 * SIZE, B + addl $2 * SIZE, BB + decl %eax + jne .L104 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 # coffset = c +#ifndef RT + addl LDC, C +#endif + + movl M, %ebx + testl $1, %ebx + jle .L130 + +#ifdef LN + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA +#endif + + leal 16 * SIZE + BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $0 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movsd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movsd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movsd -8 * SIZE(BB), %xmm3 + movsd -12 * SIZE(AA), %xmm2 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L152 + +.L151: + mulsd %xmm0, %xmm1 + movsd -15 * SIZE(AA), %xmm0 + addsd %xmm1, %xmm4 + movsd -14 * SIZE(BB), %xmm1 + mulsd %xmm0, %xmm1 + movsd -14 * SIZE(AA), %xmm0 + addsd %xmm1, %xmm5 + movsd -12 * SIZE(BB), %xmm1 + mulsd %xmm0, %xmm1 + movsd -13 * SIZE(AA), %xmm0 + addsd %xmm1, %xmm4 + movsd -10 * SIZE(BB), %xmm1 + mulsd %xmm0, %xmm1 + movsd -8 * SIZE(AA), %xmm0 + addsd %xmm1, %xmm5 + movsd -0 * SIZE(BB), %xmm1 + mulsd %xmm2, %xmm3 + movsd -11 * SIZE(AA), %xmm2 + addsd %xmm3, %xmm4 + movsd -6 * SIZE(BB), %xmm3 + mulsd %xmm2, %xmm3 + movsd -10 * SIZE(AA), %xmm2 + addsd %xmm3, %xmm5 + movsd -4 * SIZE(BB), %xmm3 + mulsd %xmm2, %xmm3 + 
movsd -9 * SIZE(AA), %xmm2 + addsd %xmm3, %xmm4 + movsd -2 * SIZE(BB), %xmm3 + mulsd %xmm2, %xmm3 + movsd -4 * SIZE(AA), %xmm2 + addsd %xmm3, %xmm5 + movsd 8 * SIZE(BB), %xmm3 + + subl $ -8 * SIZE, AA + subl $-16 * SIZE, BB + BRANCH + decl %eax + jne .L151 + +.L152: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L154 + +.L153: + mulsd %xmm0, %xmm1 + movsd -15 * SIZE(AA), %xmm0 + addsd %xmm1, %xmm4 + movsd -14 * SIZE(BB), %xmm1 + + addl $1 * SIZE, AA # aoffset += 8 + addl $2 * SIZE, BB # boffset1 += 8 + decl %eax + BRANCH + jg .L153 + ALIGN_4 + +.L154: + addsd %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax + subl $1, %eax + + movl AORIG, AA + movl BORIG, B + leal 16 * SIZE + BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(B), %xmm0 +#else + movsd -16 * SIZE(AA), %xmm0 +#endif + + subsd %xmm4, %xmm0 + +#if defined(LN) || defined(LT) + mulsd -16 * SIZE(AA), %xmm0 +#endif + +#if defined(RN) || defined(RT) + mulsd -16 * SIZE(B), %xmm0 +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, -16 * SIZE(B) + + movsd %xmm0, -16 * SIZE(BB) + movsd %xmm0, -15 * SIZE(BB) +#else + movsd %xmm0, -16 * SIZE(AA) +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA +#ifdef LT + addl $1 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $0 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L130: + movl M, %ebx + testl $2, %ebx + jle .L150 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal 16 * SIZE + BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $0 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movapd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -8 * SIZE(AA), %xmm2 + movapd -8 * SIZE(BB), %xmm3 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L132 + +.L131: + mulpd %xmm0, %xmm1 + movapd -14 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm4 + movapd -14 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm1 + movapd -12 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm5 + movapd -12 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm1 + movapd -10 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm4 + movapd -10 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm1 + movapd 0 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm5 + movapd 0 * SIZE(BB), %xmm1 + mulpd %xmm2, %xmm3 + movapd -6 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm4 + movapd -6 * SIZE(BB), %xmm3 + mulpd %xmm2, %xmm3 + movapd -4 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm5 + movapd -4 * SIZE(BB), %xmm3 + mulpd %xmm2, %xmm3 + movapd -2 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm4 + movapd -2 * SIZE(BB), %xmm3 + mulpd %xmm2, %xmm3 + movapd 8 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + + subl $-16 * SIZE, AA + subl $-16 * SIZE, BB + BRANCH + decl %eax + jne .L131 + +.L132: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, 
%eax # if (k & 1) + BRANCH + je .L134 + +.L133: + mulpd %xmm0, %xmm1 + movapd -14 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm4 + movapd -14 * SIZE(BB), %xmm1 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L133 + ALIGN_4 + +.L134: + addpd %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal 16 * SIZE + BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm0 +#else + movapd -16 * SIZE(AA), %xmm0 +#endif + + subpd %xmm4, %xmm0 + +#ifdef LN + movapd %xmm0, %xmm2 + unpckhpd %xmm2, %xmm2 + + movsd -13 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + + movsd -14 * SIZE(AA), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + + movsd -16 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + unpcklpd %xmm2, %xmm0 +#endif + +#ifdef LT + movapd %xmm0, %xmm2 + unpckhpd %xmm2, %xmm2 + + movsd -16 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + movsd -15 * SIZE(AA), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + movsd -13 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm2, %xmm0 +#endif + +#if defined(RN) || defined(RT) + movddup -16 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, -16 * SIZE(B) + + movddup %xmm0, %xmm1 + unpckhpd %xmm0, %xmm0 + + movapd %xmm1, -16 * SIZE(BB) + movapd %xmm0, -14 * SIZE(BB) +#else + movapd %xmm0, -16 * SIZE(AA) +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L150: + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L159 + ALIGN_4 + +.L110: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + leal 16 * SIZE + BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $0 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movapd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -8 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movapd -8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L112 + +.L111: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AA), %xmm1 + addpd %xmm0, %xmm4 + movapd -12 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm6 + movapd -14 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + mulpd -10 * SIZE(AA), %xmm1 + addpd %xmm0, %xmm5 + movapd 0 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm7 + movapd -12 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm2 + mulpd -6 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm4 + movapd -4 * SIZE(AA), %xmm2 + addpd %xmm1, %xmm6 + movapd -10 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm2 + mulpd -2 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm5 + movapd 8 * SIZE(AA), %xmm2 + addpd %xmm1, %xmm7 + movapd 0 * SIZE(BB), %xmm1 + mulpd %xmm3, %xmm0 + mulpd 2 * SIZE(AA), %xmm3 + addpd %xmm0, %xmm4 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm3, 
%xmm6 + movapd -6 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm0 + mulpd 6 * SIZE(AA), %xmm3 + addpd %xmm0, %xmm5 + movapd 16 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm7 + movapd -4 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm2 + mulpd 10 * SIZE(AA), %xmm3 + addpd %xmm2, %xmm4 + movapd 12 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm6 + movapd -2 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm2 + mulpd 14 * SIZE(AA), %xmm3 + addpd %xmm2, %xmm5 + movapd 24 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm7 + movapd 8 * SIZE(BB), %xmm3 + + addl $ 32 * SIZE, AA + subl $-16 * SIZE, BB + decl %eax + jne .L111 + +.L112: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L114 + +.L113: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AA), %xmm1 + addpd %xmm0, %xmm4 + movapd -12 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm6 + movapd -14 * SIZE(BB), %xmm1 + + addl $4 * SIZE, AA + addl $2 * SIZE, BB + subl $1, %eax + jg .L113 + ALIGN_4 + +.L114: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal 16 * SIZE + BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm0 + movapd -14 * SIZE(B), %xmm1 +#else + movapd -16 * SIZE(AA), %xmm0 + movapd -14 * SIZE(AA), %xmm1 +#endif + + subpd %xmm4, %xmm0 + subpd %xmm6, %xmm1 + +#ifdef LN + movapd %xmm0, %xmm2 + unpckhpd %xmm2, %xmm2 + + movapd %xmm1, %xmm3 + unpckhpd %xmm3, %xmm3 + + movsd -1 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm3 + + movsd -2 * SIZE(AA), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm1 + movsd -3 * SIZE(AA), %xmm6 + mulsd %xmm3, %xmm6 + subsd %xmm6, %xmm2 + movsd -4 * SIZE(AA), %xmm7 + mulsd %xmm3, %xmm7 + subsd %xmm7, %xmm0 + + movsd -6 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm1 + + movsd -7 * SIZE(AA), %xmm5 + mulsd %xmm1, %xmm5 + subsd %xmm5, %xmm2 + movsd -8 * SIZE(AA), %xmm6 + mulsd %xmm1, %xmm6 + subsd %xmm6, %xmm0 + + movsd -11 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + + movsd -12 * SIZE(AA), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + + movsd -16 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + unpcklpd %xmm2, %xmm0 + unpcklpd %xmm3, %xmm1 +#endif + +#ifdef LT + movapd %xmm0, %xmm2 + unpckhpd %xmm2, %xmm2 + + movapd %xmm1, %xmm3 + unpckhpd %xmm3, %xmm3 + + movsd -16 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + movsd -15 * SIZE(AA), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + movsd -14 * SIZE(AA), %xmm6 + mulsd %xmm0, %xmm6 + subsd %xmm6, %xmm1 + movsd -13 * SIZE(AA), %xmm7 + mulsd %xmm0, %xmm7 + subsd %xmm7, %xmm3 + + movsd -11 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + movsd -10 * SIZE(AA), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm1 + movsd -9 * SIZE(AA), %xmm6 + mulsd %xmm2, %xmm6 + subsd %xmm6, %xmm3 + + movsd -6 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm1 + movsd -5 * SIZE(AA), %xmm5 + mulsd %xmm1, %xmm5 + subsd %xmm5, %xmm3 + + movsd -1 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm3 + + unpcklpd %xmm2, %xmm0 + unpcklpd %xmm3, %xmm1 +#endif + +#if defined(RN) || defined(RT) + movddup -16 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, -16 * SIZE(B) + movapd %xmm1, -14 * SIZE(B) + + movddup 
%xmm0, %xmm2 + movddup %xmm1, %xmm3 + + unpckhpd %xmm0, %xmm0 + unpckhpd %xmm1, %xmm1 + + movapd %xmm2, -16 * SIZE(BB) + movapd %xmm0, -14 * SIZE(BB) + movapd %xmm3, -12 * SIZE(BB) + movapd %xmm1, -10 * SIZE(BB) +#else + movapd %xmm0, -16 * SIZE(AA) + movapd %xmm1, -14 * SIZE(AA) +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + BRANCH + decl %ebx # i -- + jg .L110 + ALIGN_2 + +.L159: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 1), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (B, %eax, 1), B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_2 + +.L999: + movl OLD_STACK, %esp + + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_2 + + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_LN_4x2_sse2.S b/kernel/x86/trsm_kernel_LN_4x2_sse2.S new file mode 100644 index 0000000000..a1fb8a199a --- /dev/null +++ b/kernel/x86/trsm_kernel_LN_4x2_sse2.S @@ -0,0 +1,2293 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if !defined(HAVE_SSE2) || !defined(HAVE_MMX) +#error You have to check your configuration. 
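+/* This kernel uses SSE2 packed doubles for the arithmetic and MMX loads (movq into %mm registers) as software prefetch, so both HAVE_SSE2 and HAVE_MMX must be available. */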
+#endif + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_ALPHA 16 + STACK + ARGS(%esi) +#define STACK_A 24 + STACK + ARGS(%esi) +#define STACK_B 28 + STACK + ARGS(%esi) +#define STACK_C 32 + STACK + ARGS(%esi) +#define STACK_LDC 36 + STACK + ARGS(%esi) +#define STACK_OFFT 40 + STACK + ARGS(%esi) + +#define ALPHA 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define AORIG 56(%esp) +#define BORIG 60(%esp) +#define BUFFER 128(%esp) + +#define B %edi +#define LDC %ebp + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#define AA %edx +#define BB %ecx + +#define PREFETCHSIZE (8 * 4) + +#define KERNEL1(address) \ + movq (PREFETCHSIZE + 0) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 0 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 2 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 4 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulpd %xmm0, %xmm2; \ + mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 6 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 16 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + movq (PREFETCHSIZE + 8) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm1, %xmm3; \ + mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 8 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 10 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 12 * SIZE + (address) * SIZE(AA), %xmm1 + +#define KERNEL4(address) \ + mulpd %xmm1, %xmm3; \ + mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 14 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 24 * SIZE + (address) * SIZE(AA), %xmm1 + +#define KERNEL5(address) \ + movq (PREFETCHSIZE + 16) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 18 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 20 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL6(address) \ + mulpd %xmm0, %xmm2; \ + mulpd 22 * 
SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 22 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 32 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 32 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL7(address) \ + movq (PREFETCHSIZE + 24) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm1, %xmm3; \ + mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 26 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 28 * SIZE + (address) * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulpd %xmm1, %xmm3; \ + mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 30 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 40 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 40 * SIZE + (address) * SIZE(AA), %xmm1 + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movd STACK_M, %mm0 + movl STACK_N, %eax + movd STACK_K, %mm1 + movd STACK_A, %mm2 + movl STACK_B, B + movd STACK_C, %mm3 + movl STACK_LDC, LDC + movd STACK_OFFT, %mm4 + + movd %mm1, K + movl %eax, N + movd %mm0, M + movd %mm2, A + movd %mm3, C + movl %esi, OLD_STACK + movd %mm4, OFFSET + movd %mm4, KK + + sall $BASE_SHIFT, LDC + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + sarl $1, %eax # j = (n >> 1) + movl %eax, J + jle .L100 + ALIGN_2 + +.L01: +/* Copying to Sub Buffer */ +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, %ecx + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + jle .L03 + ALIGN_2 + +.L02: + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm2 + movsd 3 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm4 + movsd 5 * SIZE(B), %xmm5 + movsd 6 * SIZE(B), %xmm6 + movsd 7 * SIZE(B), %xmm7 + + unpcklpd %xmm0, %xmm0 + unpcklpd %xmm1, %xmm1 + unpcklpd %xmm2, %xmm2 + unpcklpd %xmm3, %xmm3 + unpcklpd %xmm4, %xmm4 + unpcklpd %xmm5, %xmm5 + unpcklpd %xmm6, %xmm6 + unpcklpd %xmm7, %xmm7 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) + movapd %xmm2, 4 * SIZE(%ecx) + movapd %xmm3, 6 * 
SIZE(%ecx) + movapd %xmm4, 8 * SIZE(%ecx) + movapd %xmm5, 10 * SIZE(%ecx) + movapd %xmm6, 12 * SIZE(%ecx) + movapd %xmm7, 14 * SIZE(%ecx) + + prefetcht0 104 * SIZE(B) + + addl $ 8 * SIZE, B + addl $16 * SIZE, %ecx + decl %eax + jne .L02 + ALIGN_2 + +.L03: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax + BRANCH + jle .L05 + ALIGN_4 + +.L04: + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + + unpcklpd %xmm0, %xmm0 + unpcklpd %xmm1, %xmm1 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) + + addl $2 * SIZE, B + addl $4 * SIZE, %ecx + decl %eax + jne .L04 + ALIGN_4 + +.L05: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, %esi # coffset = c +#ifndef RT + addl %eax, C +#endif + + movl M, %ebx + testl $1, %ebx + jle .L30 + +#ifdef LN + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA +#endif + + leal BUFFER, %ecx + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movsd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movsd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movsd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movsd 4 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L52 + +.L51: + mulsd %xmm0, %xmm2 + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movsd 1 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm2 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movsd 16 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movsd 2 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + mulsd 10 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm4 + movsd 12 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm5 + movsd 3 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + mulsd 14 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm4 + movsd 24 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm5 + movsd 8 * SIZE(AA), %xmm0 + mulsd %xmm1, %xmm2 + mulsd 18 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm4 + movsd 20 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + movsd 5 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm2 + mulsd 22 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm4 + movsd 32 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + movsd 6 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm3 + mulsd 26 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm4 + movsd 28 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm5 + movsd 7 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm3 + mulsd 30 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm4 + movsd 40 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm5 + movsd 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $32 * SIZE, BB + BRANCH + decl %eax + jne .L51 + +.L52: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L54 + +.L53: + mulsd %xmm0, %xmm2 + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movsd 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA # aoffset += 8 + addl $4 * SIZE, BB # boffset1 += 8 + decl %eax + BRANCH + jg .L53 + ALIGN_4 + +.L54: + addsd %xmm6, %xmm4 + addsd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA 
+ leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 +#else + movsd 0 * SIZE(AA), %xmm0 + movsd 1 * SIZE(AA), %xmm1 +#endif + + subsd %xmm4, %xmm0 + subsd %xmm5, %xmm1 + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(AA), %xmm2 + mulsd %xmm2, %xmm0 + mulsd %xmm2, %xmm1 +#endif + +#ifdef RN + mulsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + mulsd 3 * SIZE(B), %xmm1 +#endif + +#ifdef RT + mulsd 3 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + mulsd 0 * SIZE(B), %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(B) + movsd %xmm1, 1 * SIZE(B) + + movsd %xmm0, 0 * SIZE(BB) + movsd %xmm0, 1 * SIZE(BB) + movsd %xmm1, 2 * SIZE(BB) + movsd %xmm1, 3 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) + movsd %xmm1, 1 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, %esi +#endif + + movsd %xmm0, 0 * SIZE(%esi) + movsd %xmm1, 0 * SIZE(%esi, LDC) + +#ifndef LN + addl $1 * SIZE, %esi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $0 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L30: + movl M, %ebx + testl $2, %ebx + jle .L50 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L32 + +.L31: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movapd 4 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 10 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd 12 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movapd 6 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 14 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm7 + movapd 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd 18 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + movapd 20 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movapd 10 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm2 + mulpd 22 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + movapd 32 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm7 + movapd 12 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 26 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm4 + movapd 28 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movapd 14 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 30 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm6 + movapd 40 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm7 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + BRANCH + decl %eax + jne .L31 + +.L32: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, 
%eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L34 + +.L33: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA # aoffset += 8 + addl $4 * SIZE, BB # boffset1 += 8 + decl %eax + BRANCH + jg .L33 + ALIGN_4 + +.L34: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd 0 * SIZE(B), %xmm2 + movapd 2 * SIZE(B), %xmm3 + + subpd %xmm4, %xmm2 + subpd %xmm0, %xmm3 +#else + movapd 0 * SIZE(AA), %xmm0 + movapd 2 * SIZE(AA), %xmm1 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 +#endif + +#ifdef LN + movsd 3 * SIZE(AA), %xmm0 + movhpd 3 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + + movsd 2 * SIZE(AA), %xmm0 + movhpd 2 * SIZE(AA), %xmm0 + mulpd %xmm3, %xmm0 + subpd %xmm0, %xmm2 + + movsd 0 * SIZE(AA), %xmm0 + movhpd 0 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef LT + movsd 0 * SIZE(AA), %xmm0 + movhpd 0 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + movsd 1 * SIZE(AA), %xmm0 + movhpd 1 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm0 + subpd %xmm0, %xmm3 + movsd 3 * SIZE(AA), %xmm0 + movhpd 3 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 +#endif + +#ifdef RN + movsd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + + movsd 1 * SIZE(B), %xmm4 + movhpd 1 * SIZE(B), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm1 + + movsd 3 * SIZE(B), %xmm4 + movhpd 3 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm1 +#endif + +#ifdef RT + movsd 3 * SIZE(B), %xmm4 + movhpd 3 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm1 + + movsd 2 * SIZE(B), %xmm4 + movhpd 2 * SIZE(B), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm0 + + movsd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(B) + movapd %xmm3, 2 * SIZE(B) + + movsd %xmm2, 0 * SIZE(BB) + movsd %xmm2, 1 * SIZE(BB) + movhpd %xmm2, 2 * SIZE(BB) + movhpd %xmm2, 3 * SIZE(BB) + movsd %xmm3, 4 * SIZE(BB) + movsd %xmm3, 5 * SIZE(BB) + movhpd %xmm3, 6 * SIZE(BB) + movhpd %xmm3, 7 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) + movapd %xmm1, 2 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, %esi +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(%esi) + movsd %xmm3, 1 * SIZE(%esi) + movhpd %xmm2, 0 * SIZE(%esi, LDC) + movhpd %xmm3, 1 * SIZE(%esi, LDC) +#else + movsd %xmm0, 0 * SIZE(%esi) + movhpd %xmm0, 1 * SIZE(%esi) + movsd %xmm1, 0 * SIZE(%esi, LDC) + movhpd %xmm1, 1 * SIZE(%esi, LDC) +#endif + +#ifndef LN + addl $2 * SIZE, %esi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L50: + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L99 + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + leal BUFFER, BB 
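+/* BB points into BUFFER, where every element of B was stored twice (see the .L02/.L04 copy loops), so a single 2-wide movapd pairs one B value against two rows of A. */
+/* Rough C-style sketch of this 4x2 tile: acc[i][j] += a[i][k] * b[k][j] over k; the saved right-hand side is then reduced by acc, and the triangular solve for the diagonal block is applied (against A for LN/LT, against B for RN/RT). */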
+ +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#ifdef LN + prefetcht2 -4 * SIZE(%esi) + prefetcht2 -4 * SIZE(%esi, LDC) +#else + prefetcht2 4 * SIZE(%esi) + prefetcht2 4 * SIZE(%esi, LDC) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + +#ifdef PENTIUM4 + andl $-8, %eax + NOBRANCH + je .L12 + sall $3, %eax + +.L1X: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + cmpl $64 * 1, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 1) + KERNEL2(32 * 1) + KERNEL3(32 * 1) + KERNEL4(32 * 1) + KERNEL5(32 * 1) + KERNEL6(32 * 1) + KERNEL7(32 * 1) + KERNEL8(32 * 1) + cmpl $64 * 2, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 2) + KERNEL2(32 * 2) + KERNEL3(32 * 2) + KERNEL4(32 * 2) + KERNEL5(32 * 2) + KERNEL6(32 * 2) + KERNEL7(32 * 2) + KERNEL8(32 * 2) + cmpl $64 * 3, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 3) + KERNEL2(32 * 3) + KERNEL3(32 * 3) + KERNEL4(32 * 3) + KERNEL5(32 * 3) + KERNEL6(32 * 3) + KERNEL7(32 * 3) + KERNEL8(32 * 3) + cmpl $64 * 4, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 4) + KERNEL2(32 * 4) + KERNEL3(32 * 4) + KERNEL4(32 * 4) + KERNEL5(32 * 4) + KERNEL6(32 * 4) + KERNEL7(32 * 4) + KERNEL8(32 * 4) + cmpl $64 * 5, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 5) + KERNEL2(32 * 5) + KERNEL3(32 * 5) + KERNEL4(32 * 5) + KERNEL5(32 * 5) + KERNEL6(32 * 5) + KERNEL7(32 * 5) + KERNEL8(32 * 5) + cmpl $64 * 6, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 6) + KERNEL2(32 * 6) + KERNEL3(32 * 6) + KERNEL4(32 * 6) + KERNEL5(32 * 6) + KERNEL6(32 * 6) + KERNEL7(32 * 6) + KERNEL8(32 * 6) + cmpl $64 * 7, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 7) + KERNEL2(32 * 7) + KERNEL3(32 * 7) + KERNEL4(32 * 7) + KERNEL5(32 * 7) + KERNEL6(32 * 7) + KERNEL7(32 * 7) + KERNEL8(32 * 7) + + addl $64 * 4 * SIZE, AA + addl $64 * 4 * SIZE, BB + subl $64 * 8, %eax + BRANCH + jg .L1X + +.L11: + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB + +#else + sarl $3, %eax + je .L12 + +.L11: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + + addl $32 * SIZE, %ecx + addl $32 * SIZE, %edx + decl %eax + jne .L11 +#endif + +.L12: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + + je .L14 + +.L13: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 0 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movapd 4 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA # aoffset += 8 + addl $4 * SIZE, BB # boffset1 += 8 + subl $1, %eax + jg .L13 + ALIGN_4 + +.L14: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd %xmm6, %xmm1 + unpcklpd %xmm7, %xmm6 + unpckhpd %xmm7, %xmm1 + + movapd 0 
* SIZE(B), %xmm2 + movapd 2 * SIZE(B), %xmm3 + movapd 4 * SIZE(B), %xmm5 + movapd 6 * SIZE(B), %xmm7 + + subpd %xmm4, %xmm2 + subpd %xmm0, %xmm3 + subpd %xmm6, %xmm5 + subpd %xmm1, %xmm7 +#else + movapd 0 * SIZE(AA), %xmm0 + movapd 2 * SIZE(AA), %xmm1 + movapd 4 * SIZE(AA), %xmm2 + movapd 6 * SIZE(AA), %xmm3 + + subpd %xmm4, %xmm0 + subpd %xmm6, %xmm1 + subpd %xmm5, %xmm2 + subpd %xmm7, %xmm3 +#endif + +#ifdef LN + movsd 15 * SIZE(AA), %xmm0 + movhpd 15 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm7 + movsd 14 * SIZE(AA), %xmm0 + movhpd 14 * SIZE(AA), %xmm0 + mulpd %xmm7, %xmm0 + subpd %xmm0, %xmm5 + movsd 13 * SIZE(AA), %xmm0 + movhpd 13 * SIZE(AA), %xmm0 + mulpd %xmm7, %xmm0 + subpd %xmm0, %xmm3 + movsd 12 * SIZE(AA), %xmm0 + movhpd 12 * SIZE(AA), %xmm0 + mulpd %xmm7, %xmm0 + subpd %xmm0, %xmm2 + + movsd 10 * SIZE(AA), %xmm0 + movhpd 10 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm5 + movsd 9 * SIZE(AA), %xmm0 + movhpd 9 * SIZE(AA), %xmm0 + mulpd %xmm5, %xmm0 + subpd %xmm0, %xmm3 + movsd 8 * SIZE(AA), %xmm0 + movhpd 8 * SIZE(AA), %xmm0 + mulpd %xmm5, %xmm0 + subpd %xmm0, %xmm2 + + movsd 5 * SIZE(AA), %xmm0 + movhpd 5 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + movsd 4 * SIZE(AA), %xmm0 + movhpd 4 * SIZE(AA), %xmm0 + mulpd %xmm3, %xmm0 + subpd %xmm0, %xmm2 + + movsd 0 * SIZE(AA), %xmm0 + movhpd 0 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef LT + movsd 0 * SIZE(AA), %xmm0 + movhpd 0 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + movsd 1 * SIZE(AA), %xmm0 + movhpd 1 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm0 + subpd %xmm0, %xmm3 + movsd 2 * SIZE(AA), %xmm0 + movhpd 2 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm0 + subpd %xmm0, %xmm5 + movsd 3 * SIZE(AA), %xmm0 + movhpd 3 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm0 + subpd %xmm0, %xmm7 + + movsd 5 * SIZE(AA), %xmm0 + movhpd 5 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + + movsd 6 * SIZE(AA), %xmm0 + movhpd 6 * SIZE(AA), %xmm0 + mulpd %xmm3, %xmm0 + subpd %xmm0, %xmm5 + movsd 7 * SIZE(AA), %xmm0 + movhpd 7 * SIZE(AA), %xmm0 + mulpd %xmm3, %xmm0 + subpd %xmm0, %xmm7 + + movsd 10 * SIZE(AA), %xmm0 + movhpd 10 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm5 + movsd 11 * SIZE(AA), %xmm0 + movhpd 11 * SIZE(AA), %xmm0 + mulpd %xmm5, %xmm0 + subpd %xmm0, %xmm7 + + movsd 15 * SIZE(AA), %xmm0 + movhpd 15 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm7 +#endif + +#ifdef RN + movsd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 + + movsd 1 * SIZE(B), %xmm4 + movhpd 1 * SIZE(B), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm2 + movsd 1 * SIZE(B), %xmm4 + movhpd 1 * SIZE(B), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm3 + + movsd 3 * SIZE(B), %xmm4 + movhpd 3 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm3 +#endif + +#ifdef RT + movsd 3 * SIZE(B), %xmm4 + movhpd 3 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm3 + + movsd 2 * SIZE(B), %xmm4 + movhpd 2 * SIZE(B), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm0 + movsd 2 * SIZE(B), %xmm4 + movhpd 2 * SIZE(B), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm1 + + movsd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(B) + movapd %xmm3, 2 * SIZE(B) + movapd %xmm5, 4 * SIZE(B) + movapd %xmm7, 6 * SIZE(B) + + movsd %xmm2, 0 * SIZE(BB) + movsd %xmm2, 1 * SIZE(BB) + movhpd %xmm2, 2 * SIZE(BB) + movhpd %xmm2, 3 * SIZE(BB) + movsd %xmm3, 4 * SIZE(BB) + movsd %xmm3, 5 * SIZE(BB) + movhpd %xmm3, 6 * SIZE(BB) + movhpd %xmm3, 7 * SIZE(BB) + movsd %xmm5, 8 * SIZE(BB) + movsd %xmm5, 9 * SIZE(BB) + movhpd 
%xmm5, 10 * SIZE(BB) + movhpd %xmm5, 11 * SIZE(BB) + movsd %xmm7, 12 * SIZE(BB) + movsd %xmm7, 13 * SIZE(BB) + movhpd %xmm7, 14 * SIZE(BB) + movhpd %xmm7, 15 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) + movapd %xmm1, 2 * SIZE(AA) + movapd %xmm2, 4 * SIZE(AA) + movapd %xmm3, 6 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, %esi +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(%esi) + movsd %xmm3, 1 * SIZE(%esi) + movsd %xmm5, 2 * SIZE(%esi) + movsd %xmm7, 3 * SIZE(%esi) + + movhpd %xmm2, 0 * SIZE(%esi, LDC) + movhpd %xmm3, 1 * SIZE(%esi, LDC) + movhpd %xmm5, 2 * SIZE(%esi, LDC) + movhpd %xmm7, 3 * SIZE(%esi, LDC) +#else + movsd %xmm0, 0 * SIZE(%esi) + movhpd %xmm0, 1 * SIZE(%esi) + movsd %xmm1, 2 * SIZE(%esi) + movhpd %xmm1, 3 * SIZE(%esi) + + movsd %xmm2, 0 * SIZE(%esi, LDC) + movhpd %xmm2, 1 * SIZE(%esi, LDC) + movsd %xmm3, 2 * SIZE(%esi, LDC) + movhpd %xmm3, 3 * SIZE(%esi, LDC) +#endif + +#ifndef LN + addl $4 * SIZE, %esi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA +#ifdef LT + addl $8 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L10 + ALIGN_2 + +.L99: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_2 + +.L100: + movl N, %eax + testl $1, %eax + jle .L999 + ALIGN_2 + +.L101: +/* Copying to Sub Buffer */ +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, %ecx + +#ifdef RT + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + leal (, %eax, SIZE), %eax + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + jle .L103 + ALIGN_4 + +.L102: + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm2 + movsd 3 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm4 + movsd 5 * SIZE(B), %xmm5 + movsd 6 * SIZE(B), %xmm6 + movsd 7 * SIZE(B), %xmm7 + + unpcklpd %xmm0, %xmm0 + unpcklpd %xmm1, %xmm1 + unpcklpd %xmm2, %xmm2 + unpcklpd %xmm3, %xmm3 + unpcklpd %xmm4, %xmm4 + unpcklpd %xmm5, %xmm5 + unpcklpd %xmm6, %xmm6 + unpcklpd %xmm7, %xmm7 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) + movapd %xmm2, 4 * SIZE(%ecx) + movapd %xmm3, 6 * SIZE(%ecx) + movapd %xmm4, 8 * SIZE(%ecx) + movapd %xmm5, 10 * SIZE(%ecx) + movapd %xmm6, 12 * SIZE(%ecx) + movapd %xmm7, 14 * SIZE(%ecx) + + prefetcht0 104 * SIZE(B) + + addl $ 8 * SIZE, B + addl $16 * SIZE, %ecx + decl %eax + BRANCH + jne .L102 + ALIGN_2 + +.L103: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + BRANCH + jle .L105 + ALIGN_2 + +.L104: + movsd 0 * SIZE(B), %xmm0 + + unpcklpd %xmm0, %xmm0 + + movapd %xmm0, 0 * SIZE(%ecx) + + addl $1 * SIZE, B + addl $2 * SIZE, %ecx + decl %eax + jne .L104 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + subl LDC, C 
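+/* RT: columns are processed from last to first, so back the C pointer up by one column before this single-column (n & 1) pass uses it as the output base. */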
+#endif + movl C, %esi # coffset = c +#ifndef RT + addl LDC, C +#endif + + movl M, %ebx + testl $1, %ebx + jle .L130 + +#ifdef LN + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA +#endif + + leal BUFFER, BB + + movsd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movsd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movsd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movsd 4 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#ifdef LN + prefetcht2 -4 * SIZE(%esi) +#else + prefetcht2 4 * SIZE(%esi) +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $0 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L152 + +.L151: + mulsd %xmm0, %xmm2 + movsd 1 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + mulsd 2 * SIZE(BB), %xmm0 + movsd 16 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + mulsd 4 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm4 + movsd 3 * SIZE(AA), %xmm0 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm4 + movsd 8 * SIZE(AA), %xmm0 + mulsd %xmm1, %xmm3 + movsd 5 * SIZE(AA), %xmm1 + addsd %xmm3, %xmm4 + mulsd 10 * SIZE(BB), %xmm1 + movsd 24 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm4 + movsd 6 * SIZE(AA), %xmm1 + mulsd 12 * SIZE(BB), %xmm1 + addsd %xmm1, %xmm4 + movsd 7 * SIZE(AA), %xmm1 + mulsd 14 * SIZE(BB), %xmm1 + addsd %xmm1, %xmm4 + movsd 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $16 * SIZE, BB + BRANCH + decl %eax + jne .L151 + +.L152: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L154 + +.L153: + movsd 0 * SIZE(AA), %xmm0 + mulsd 0 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm4 + + addl $1 * SIZE, AA # aoffset += 8 + addl $2 * SIZE, BB # boffset1 += 8 + decl %eax + BRANCH + jg .L153 + ALIGN_4 + +.L154: + addsd %xmm6, %xmm4 + addsd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax + subl $1, %eax + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(B), %xmm0 +#else + movsd 0 * SIZE(AA), %xmm0 +#endif + + subsd %xmm4, %xmm0 + +#if defined(LN) || defined(LT) + mulsd 0 * SIZE(AA), %xmm0 +#endif + +#if defined(RN) || defined(RT) + mulsd 0 * SIZE(B), %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(B) + + movsd %xmm0, 0 * SIZE(BB) + movsd %xmm0, 1 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, %esi +#endif + + movsd %xmm0, 0 * SIZE(%esi) + +#ifndef LN + addl $1 * SIZE, %esi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA +#ifdef LT + addl $1 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $0 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L130: + movl M, %ebx + testl $2, %ebx + jle .L150 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal BUFFER, BB + + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + 
movapd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $0 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L132 + +.L131: + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + mulpd 2 * SIZE(BB), %xmm0 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 4 * SIZE(AA), %xmm0 + mulpd 4 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm6 + movapd 6 * SIZE(AA), %xmm0 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm7 + movapd 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm3 + movapd 10 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm4 + mulpd 10 * SIZE(BB), %xmm1 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movapd 12 * SIZE(AA), %xmm1 + mulpd 12 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm6 + movapd 14 * SIZE(AA), %xmm1 + mulpd 14 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm7 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $16 * SIZE, BB + BRANCH + decl %eax + jne .L131 + +.L132: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L134 + +.L133: + movapd 0 * SIZE(AA), %xmm0 + mulpd 0 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm4 + + addl $2 * SIZE, AA # aoffset += 8 + addl $2 * SIZE, BB # boffset1 += 8 + decl %eax + BRANCH + jg .L133 + ALIGN_4 + +.L134: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + addpd %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm0 +#else + movapd 0 * SIZE(AA), %xmm0 +#endif + + subpd %xmm4, %xmm0 + +#ifdef LN + movapd %xmm0, %xmm2 + unpckhpd %xmm2, %xmm2 + + movsd 3 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + + movsd 2 * SIZE(AA), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + unpcklpd %xmm2, %xmm0 +#endif + +#ifdef LT + movapd %xmm0, %xmm2 + unpckhpd %xmm2, %xmm2 + + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 1 * SIZE(AA), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + movsd 3 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm2, %xmm0 +#endif + +#if defined(RN) || defined(RT) + movsd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, 0 * SIZE(B) + + movsd %xmm0, 0 * SIZE(BB) + movsd %xmm0, 1 * SIZE(BB) + movhpd %xmm0, 2 * SIZE(BB) + movhpd %xmm0, 3 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, %esi +#endif + + movsd %xmm0, 0 * SIZE(%esi) + movhpd %xmm0, 1 * SIZE(%esi) + +#ifndef LN + addl $2 * SIZE, %esi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L150: + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L159 + ALIGN_4 + +.L110: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + 
leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $0 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L112 + +.L111: + mulpd %xmm2, %xmm0 + mulpd 2 * SIZE(AA), %xmm2 + addpd %xmm0, %xmm4 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm6 + movapd 2 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm0 + mulpd 6 * SIZE(AA), %xmm2 + addpd %xmm0, %xmm5 + movapd 16 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movapd 4 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm1 + mulpd 10 * SIZE(AA), %xmm2 + addpd %xmm1, %xmm4 + movapd 12 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm6 + movapd 6 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm1 + mulpd 14 * SIZE(AA), %xmm2 + addpd %xmm1, %xmm5 + movapd 24 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm7 + movapd 16 * SIZE(BB), %xmm2 + mulpd %xmm3, %xmm0 + mulpd 18 * SIZE(AA), %xmm3 + addpd %xmm0, %xmm4 + movapd 20 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm6 + movapd 10 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm0 + mulpd 22 * SIZE(AA), %xmm3 + addpd %xmm0, %xmm5 + movapd 32 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm7 + movapd 12 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm1 + mulpd 26 * SIZE(AA), %xmm3 + addpd %xmm1, %xmm4 + movapd 28 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm6 + movapd 14 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm1 + mulpd 30 * SIZE(AA), %xmm3 + addpd %xmm1, %xmm5 + movapd 40 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm7 + movapd 24 * SIZE(BB), %xmm3 + + addl $32 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L111 + +.L112: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L114 + +.L113: + mulpd %xmm2, %xmm0 + mulpd 2 * SIZE(AA), %xmm2 + addpd %xmm0, %xmm4 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm6 + movapd 2 * SIZE(BB), %xmm2 + + addl $4 * SIZE, AA # aoffset += 8 + addl $2 * SIZE, BB # boffset1 += 8 + subl $1, %eax + jg .L113 + ALIGN_4 + +.L114: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm0 + movapd 2 * SIZE(B), %xmm1 +#else + movapd 0 * SIZE(AA), %xmm0 + movapd 2 * SIZE(AA), %xmm1 +#endif + + subpd %xmm4, %xmm0 + subpd %xmm6, %xmm1 + +#ifdef LN + movapd %xmm0, %xmm2 + unpckhpd %xmm2, %xmm2 + + movapd %xmm1, %xmm3 + unpckhpd %xmm3, %xmm3 + + movsd 15 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm3 + + movsd 14 * SIZE(AA), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm1 + movsd 13 * SIZE(AA), %xmm6 + mulsd %xmm3, %xmm6 + subsd %xmm6, %xmm2 + movsd 12 * SIZE(AA), %xmm7 + mulsd %xmm3, %xmm7 + subsd %xmm7, %xmm0 + + movsd 10 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm1 + + movsd 9 * SIZE(AA), %xmm5 + mulsd %xmm1, %xmm5 + subsd %xmm5, %xmm2 + movsd 8 * SIZE(AA), %xmm6 + mulsd %xmm1, %xmm6 + subsd %xmm6, %xmm0 + + movsd 5 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + + movsd 4 * SIZE(AA), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + unpcklpd %xmm2, %xmm0 + unpcklpd %xmm3, %xmm1 
+#endif + +#ifdef LT + movapd %xmm0, %xmm2 + unpckhpd %xmm2, %xmm2 + + movapd %xmm1, %xmm3 + unpckhpd %xmm3, %xmm3 + + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 1 * SIZE(AA), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + movsd 2 * SIZE(AA), %xmm6 + mulsd %xmm0, %xmm6 + subsd %xmm6, %xmm1 + movsd 3 * SIZE(AA), %xmm7 + mulsd %xmm0, %xmm7 + subsd %xmm7, %xmm3 + + movsd 5 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + + movsd 6 * SIZE(AA), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm1 + movsd 7 * SIZE(AA), %xmm6 + mulsd %xmm2, %xmm6 + subsd %xmm6, %xmm3 + + movsd 10 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm1 + + movsd 11 * SIZE(AA), %xmm5 + mulsd %xmm1, %xmm5 + subsd %xmm5, %xmm3 + + movsd 15 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm3 + + unpcklpd %xmm2, %xmm0 + unpcklpd %xmm3, %xmm1 +#endif + +#if defined(RN) || defined(RT) + movsd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, 0 * SIZE(B) + movapd %xmm1, 2 * SIZE(B) + + movsd %xmm0, 0 * SIZE(BB) + movsd %xmm0, 1 * SIZE(BB) + movhpd %xmm0, 2 * SIZE(BB) + movhpd %xmm0, 3 * SIZE(BB) + movsd %xmm1, 4 * SIZE(BB) + movsd %xmm1, 5 * SIZE(BB) + movhpd %xmm1, 6 * SIZE(BB) + movhpd %xmm1, 7 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) + movapd %xmm1, 2 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, %esi +#endif + + movsd %xmm0, 0 * SIZE(%esi) + movhpd %xmm0, 1 * SIZE(%esi) + movsd %xmm1, 2 * SIZE(%esi) + movhpd %xmm1, 3 * SIZE(%esi) + +#ifndef LN + addl $4 * SIZE, %esi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + BRANCH + decl %ebx # i -- + jg .L110 + ALIGN_2 + +.L159: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 1), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (B, %eax, 1), B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_2 + +.L999: + movl OLD_STACK, %esp + + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_2 + + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_LN_4x4_penryn.S b/kernel/x86/trsm_kernel_LN_4x4_penryn.S new file mode 100644 index 0000000000..bb33918ef0 --- /dev/null +++ b/kernel/x86/trsm_kernel_LN_4x4_penryn.S @@ -0,0 +1,3129 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#define A 20 + STACK + ARGS(%esp) +#define ARG_B 24 + STACK + ARGS(%esp) +#define C 28 + STACK + ARGS(%esp) +#define ARG_LDC 32 + STACK + ARGS(%esp) +#define OFFSET 36 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) +#define AORIG 12 + STACK(%esp) + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 21 + 4) +#endif + +#ifdef NEHALEM +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 21 + 4) +#endif + +#ifdef ATOM +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 8 + 4) +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (16 * 2) +#endif + +#define B %edi +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define CO1 %esi + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + + movl OFFSET, %eax +#ifdef RN + negl %eax +#endif + movl %eax, KK + + leal (, LDC, SIZE), LDC + + subl $-32 * SIZE, A + subl $-32 * SIZE, B + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + sarl $2, %eax + movl %eax, J + jle .L40 + +.L10: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 4), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + testl $1, M + je .L20 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + 
movl KK, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -24 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -20 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -28 * SIZE(AA), %xmm0 + + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -16 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -12 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -26 * SIZE(AA), %xmm0 + + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -8 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -4 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -24 * SIZE(AA), %xmm0 + + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps 0 * SIZE(BB), %xmm1 + + subl $ -8 * SIZE, AA + subl $-32 * SIZE, BB + + subl $1, %eax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + pshufd $0x00, %xmm0, %xmm2 + movss -31 * SIZE(AA), %xmm0 + + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(BB), %xmm1 + + subps %xmm4, %xmm1 +#else + movsd -32 * SIZE(AA), %xmm0 + movhps -30 * SIZE(AA), %xmm0 + + subps %xmm4, %xmm0 + + pshufd $0xff, %xmm0, %xmm3 + pshufd $0xaa, %xmm0, %xmm2 + pshufd $0x55, %xmm0, %xmm1 + pshufd $0x00, %xmm0, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef RN + movaps -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm0, %xmm7 + subss %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm0, %xmm7 + subss %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm0, %xmm7 + subss %xmm7, %xmm3 + + movaps -28 * SIZE(BB), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm1, %xmm7 + subss %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm1, %xmm7 + subss %xmm7, %xmm3 + + movaps -24 * SIZE(BB), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm2, %xmm7 + subss %xmm7, %xmm3 + + movaps -20 * SIZE(BB), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm7, %xmm3 +#endif + +#ifdef RT + movaps -20 * SIZE(BB), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm7, %xmm3 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm3, %xmm7 + subss %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm3, %xmm7 + subss %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm3, %xmm7 + subss %xmm7, %xmm0 + + 
movaps -24 * SIZE(BB), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm2, %xmm7 + subss %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm2, %xmm7 + subss %xmm7, %xmm0 + + movaps -28 * SIZE(BB), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm1, %xmm7 + subss %xmm7, %xmm0 + + movaps -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, -32 * SIZE(BB) +#else + movss %xmm0, -32 * SIZE(AA) + movss %xmm1, -31 * SIZE(AA) + movss %xmm2, -30 * SIZE(AA) + movss %xmm3, -29 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm0 + + movaps %xmm3, %xmm4 + unpcklps %xmm7, %xmm3 + unpckhps %xmm7, %xmm4 + + movaps %xmm1, %xmm2 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm2 + + movaps %xmm0, %xmm6 + unpcklps %xmm4, %xmm0 + unpckhps %xmm4, %xmm6 + + movss %xmm1, 0 * SIZE(CO1) + movss %xmm2, 0 * SIZE(CO1, LDC, 1) + movss %xmm0, 0 * SIZE(CO1, LDC, 2) + movss %xmm6, 0 * SIZE(CO1, %eax, 1) +#else + movss %xmm0, 0 * SIZE(CO1) + movss %xmm1, 0 * SIZE(CO1, LDC, 1) + movss %xmm2, 0 * SIZE(CO1, LDC, 2) + movss %xmm3, 0 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L20: + testl $2, M + je .L30 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm0, %xmm2 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm4 + pshufd $0xfa, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm6 + + pshufd $0xee, %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm7 + + pshufd $0x44, %xmm0, %xmm2 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm4 + pshufd $0xfa, %xmm1, %xmm3 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm6 + + pshufd $0xee, %xmm0, %xmm2 + movaps -24 * SIZE(AA), %xmm0 + + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm7 + + pshufd $0x44, %xmm0, %xmm2 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm4 + pshufd $0xfa, %xmm1, %xmm3 + movaps -12 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm6 + + pshufd $0xee, %xmm0, %xmm2 + movaps -20 * SIZE(AA), %xmm0 + + 
pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -8 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm7 + + pshufd $0x44, %xmm0, %xmm2 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm4 + pshufd $0xfa, %xmm1, %xmm3 + movaps -4 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm6 + + pshufd $0xee, %xmm0, %xmm2 + movaps -16 * SIZE(AA), %xmm0 + + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps 0 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm7 + + subl $-16 * SIZE, AA + subl $-32 * SIZE, BB + + subl $1, %eax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + pshufd $0x44, %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm4 + pshufd $0xfa, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm6 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 4), BB +#endif + + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + + movhlps %xmm4, %xmm5 + movhlps %xmm6, %xmm7 + + +#if defined(LN) || defined(LT) + unpcklps %xmm6, %xmm4 + unpcklps %xmm7, %xmm5 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps -32 * SIZE(BB), %xmm1 + movaps -28 * SIZE(BB), %xmm3 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm3 +#else + movsd -32 * SIZE(AA), %xmm0 + movsd -30 * SIZE(AA), %xmm1 + movsd -28 * SIZE(AA), %xmm2 + movsd -26 * SIZE(AA), %xmm3 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm1 + subps %xmm6, %xmm2 + subps %xmm7, %xmm3 +#endif + +#ifdef LN + movaps -32 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm1 + + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef LT + movaps -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 + + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm3 + + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm3 +#endif + +#ifdef RN + movaps -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm3 + + movaps -28 * SIZE(BB), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm3 + + movaps -24 * SIZE(BB), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm3 + + movaps -20 * SIZE(BB), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm3 +#endif + +#ifdef RT + movaps -20 * SIZE(BB), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm3 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm0 + + movaps -24 * SIZE(BB), %xmm6 + pshufd 
$0xaa, %xmm6, %xmm7 + mulps %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm0 + + movaps -28 * SIZE(BB), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm0 + + movaps -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, -32 * SIZE(BB) + movaps %xmm3, -28 * SIZE(BB) +#else + movlps %xmm0, -32 * SIZE(AA) + movlps %xmm1, -30 * SIZE(AA) + movlps %xmm2, -28 * SIZE(AA) + movlps %xmm3, -26 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm0 + + movaps %xmm3, %xmm4 + unpcklps %xmm7, %xmm3 + unpckhps %xmm7, %xmm4 + + movaps %xmm1, %xmm2 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm2 + + movaps %xmm0, %xmm6 + unpcklps %xmm4, %xmm0 + unpckhps %xmm4, %xmm6 + + movlps %xmm1, 0 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 1) + movlps %xmm0, 0 * SIZE(CO1, LDC, 2) + movlps %xmm6, 0 * SIZE(CO1, %eax, 1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC, 1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 2) + movlps %xmm3, 0 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L30: + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L39 + ALIGN_4 + +.L11: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + leal (CO1, LDC, 2), %eax + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + +#ifdef LN + pxor %xmm4, %xmm4 + prefetcht0 -4 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 -4 * SIZE(CO1, LDC) + pxor %xmm6, %xmm6 + prefetcht0 -4 * SIZE(%eax) + pxor %xmm7, %xmm7 + prefetcht0 -4 * SIZE(%eax, LDC) +#else + pxor %xmm4, %xmm4 + prefetcht0 3 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 3 * SIZE(CO1, LDC) + pxor %xmm6, %xmm6 + prefetcht0 3 * SIZE(%eax) + pxor %xmm7, %xmm7 + prefetcht0 3 * SIZE(%eax, LDC) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -24 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + pshufd 
$0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -16 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -12 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -8 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -4 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + subl $-32 * SIZE, BB + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + subl $-32 * SIZE, AA + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -32 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -32 * SIZE(AA), %xmm0 + + subl $1, %eax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_4 + +.L16: + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 4), BB +#endif + + addps %xmm3, %xmm6 + addps %xmm2, %xmm7 + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm0 + unpcklps %xmm7, %xmm0 + unpckhps %xmm7, %xmm4 + + movaps %xmm6, %xmm2 + unpcklps %xmm5, %xmm2 + unpckhps %xmm5, %xmm6 + + movaps %xmm0, %xmm1 + movlhps %xmm2, %xmm0 + movhlps %xmm2, %xmm1 + + movaps %xmm6, %xmm7 + movlhps %xmm4, %xmm6 + movhlps %xmm4, %xmm7 + + pshufd $0x39, %xmm1, %xmm2 + pshufd $0x39, %xmm7, %xmm4 + + movaps -32 * SIZE(BB), %xmm1 + movaps -28 * SIZE(BB), %xmm3 + movaps -24 * SIZE(BB), %xmm5 + movaps -20 * SIZE(BB), %xmm7 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm3 + subps %xmm6, %xmm5 + subps %xmm4, %xmm7 +#else + pshufd $0x39, %xmm5, %xmm2 + pshufd $0x4e, %xmm6, %xmm0 + pshufd $0x93, %xmm7, %xmm7 + + movaps %xmm4, %xmm6 
+ unpcklps %xmm0, %xmm4 + unpckhps %xmm0, %xmm6 + + movaps %xmm2, %xmm1 + unpcklps %xmm7, %xmm2 + unpckhps %xmm7, %xmm1 + + movaps %xmm4, %xmm5 + unpcklps %xmm2, %xmm4 + unpckhps %xmm2, %xmm5 + + movaps %xmm6, %xmm7 + unpcklps %xmm1, %xmm6 + unpckhps %xmm1, %xmm7 + + pshufd $0x93, %xmm5, %xmm5 + pshufd $0x4e, %xmm6, %xmm6 + pshufd $0x39, %xmm7, %xmm7 + + movaps -32 * SIZE(AA), %xmm0 + movaps -28 * SIZE(AA), %xmm1 + movaps -24 * SIZE(AA), %xmm2 + movaps -20 * SIZE(AA), %xmm3 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm1 + subps %xmm6, %xmm2 + subps %xmm7, %xmm3 +#endif + +#ifdef LN + movaps -20 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm7 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm1 + + movaps -24 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm1 + + movaps -28 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm1 + + movaps -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef LT + movaps -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 + + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm7 + + movaps -28 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm7 + + movaps -24 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm7 + + movaps -20 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm7 +#endif + +#ifdef RN + movaps -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm3 + + movaps -28 * SIZE(BB), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm3 + + movaps -24 * SIZE(BB), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm3 + + movaps -20 * SIZE(BB), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm3 +#endif + +#ifdef RT + movaps -20 * SIZE(BB), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm3 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm0 + + movaps -24 * SIZE(BB), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm0 + + movaps -28 * SIZE(BB), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + 
mulps %xmm1, %xmm7 + subps %xmm7, %xmm0 + + movaps -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, -32 * SIZE(BB) + movaps %xmm3, -28 * SIZE(BB) + movaps %xmm5, -24 * SIZE(BB) + movaps %xmm7, -20 * SIZE(BB) +#else + movaps %xmm0, -32 * SIZE(AA) + movaps %xmm1, -28 * SIZE(AA) + movaps %xmm2, -24 * SIZE(AA) + movaps %xmm3, -20 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm0 + + movaps %xmm3, %xmm4 + unpcklps %xmm7, %xmm3 + unpckhps %xmm7, %xmm4 + + movaps %xmm1, %xmm2 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm2 + + movaps %xmm0, %xmm6 + unpcklps %xmm4, %xmm0 + unpckhps %xmm4, %xmm6 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 1) + movhps %xmm2, 2 * SIZE(CO1, LDC, 1) + movlps %xmm0, 0 * SIZE(CO1, LDC, 2) + movhps %xmm0, 2 * SIZE(CO1, LDC, 2) + movlps %xmm6, 0 * SIZE(CO1, %eax, 1) + movhps %xmm6, 2 * SIZE(CO1, %eax, 1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 2 * SIZE(CO1, LDC, 1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 2) + movhps %xmm2, 2 * SIZE(CO1, LDC, 2) + movlps %xmm3, 0 * SIZE(CO1, %eax, 1) + movhps %xmm3, 2 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB +#endif + +#ifdef LN + subl $4, KK +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L11 + ALIGN_4 + +.L39: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 4), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $4, KK +#endif + +#ifdef RT + subl $4, KK +#endif + + decl J # j -- + jg .L10 + ALIGN_4 + +.L40: + testl $2, N + je .L80 + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + testl $1, M + je .L60 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movsd -32 * SIZE(BB), %xmm1 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movsd -30 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm5 + movsd -28 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movsd -26 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -28 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + addps 
%xmm1, %xmm5 + movsd -24 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movsd -22 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -26 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm5 + movsd -20 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movsd -18 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -24 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm5 + movsd -16 * SIZE(BB), %xmm1 + + subl $ -8 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + pshufd $0x00, %xmm0, %xmm2 + movss -31 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movsd -30 * SIZE(BB), %xmm1 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), BB +#endif + + addps %xmm5, %xmm4 + + pshufd $0x55, %xmm4, %xmm5 + pshufd $0x00, %xmm4, %xmm4 + +#if defined(LN) || defined(LT) + unpcklps %xmm5, %xmm4 + + movsd -32 * SIZE(BB), %xmm1 + + subps %xmm4, %xmm1 +#else + movss -32 * SIZE(AA), %xmm0 + movss -31 * SIZE(AA), %xmm1 + + subss %xmm4, %xmm0 + subss %xmm5, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movss -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef RN + movaps -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm0, %xmm7 + subss %xmm7, %xmm1 + + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm7, %xmm1 +#endif + +#ifdef RT + movaps -32 * SIZE(BB), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm1, %xmm7 + subss %xmm7, %xmm0 + + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, -32 * SIZE(BB) +#else + movss %xmm0, -32 * SIZE(AA) + movss %xmm1, -31 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + pshufd $1, %xmm1, %xmm3 + + movss %xmm1, 0 * SIZE(CO1) + movss %xmm3, 0 * SIZE(CO1, LDC) +#else + movss %xmm0, 0 * SIZE(CO1) + movss %xmm1, 0 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L60: + testl $2, M + je .L70 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm3, %xmm3 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L65 + ALIGN_4 + +.L62: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm0, %xmm2 + addps %xmm3, %xmm4 + 
pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + + pshufd $0xee, %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + + pshufd $0x44, %xmm0, %xmm2 + addps %xmm3, %xmm4 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + + pshufd $0xee, %xmm0, %xmm2 + movaps -24 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + + pshufd $0x44, %xmm0, %xmm2 + addps %xmm3, %xmm4 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + + pshufd $0xee, %xmm0, %xmm2 + movaps -20 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + + pshufd $0x44, %xmm0, %xmm2 + addps %xmm3, %xmm4 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + + pshufd $0xee, %xmm0, %xmm2 + movaps -16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + + subl $-16 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L62 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + pshufd $0x44, %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + addps %xmm3, %xmm4 + pshufd $0x50, %xmm1, %xmm3 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L66 + ALIGN_4 + +.L68: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), BB +#endif + + addps %xmm3, %xmm4 + addps %xmm5, %xmm4 + + movhlps %xmm4, %xmm5 + +#if defined(LN) || defined(LT) + unpcklps %xmm6, %xmm4 + unpcklps %xmm7, %xmm5 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movsd -32 * SIZE(BB), %xmm1 + movsd -30 * SIZE(BB), %xmm3 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm3 +#else + movsd -32 * SIZE(AA), %xmm0 + movsd -30 * SIZE(AA), %xmm1 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm1 +#endif + +#ifdef LN + movaps -32 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm1 + + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef LT + movaps -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm3 + + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm3 +#endif + +#ifdef RN + movaps -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm1 + + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm1 +#endif + +#ifdef RT + movaps -32 * SIZE(BB), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm0 + + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, -32 * SIZE(BB) + movlps %xmm3, -30 * SIZE(BB) +#else + movlps %xmm0, -32 * SIZE(AA) + movlps %xmm1, -30 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm3, %xmm1 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 0 * SIZE(CO1, LDC) +#else + movlps %xmm0, 0 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if 
defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L70: + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L79 + ALIGN_4 + +.L51: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + +#ifdef LN + pxor %xmm4, %xmm4 + prefetcht0 -4 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 -4 * SIZE(CO1, LDC) +#else + pxor %xmm4, %xmm4 + prefetcht0 3 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 3 * SIZE(CO1, LDC) +#endif + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -28 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0xff, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -24 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -20 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0xff, %xmm1, %xmm3 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -16 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -12 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0xff, %xmm1, %xmm3 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -8 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -4 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0xff, %xmm1, %xmm3 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps 0 * SIZE(AA), %xmm0 + + subl $-32 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -28 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, 
%eax, 4), AA + leal (B, %eax, 2), BB +#endif + + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + addps %xmm2, %xmm4 + addps %xmm3, %xmm5 + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm0 + unpcklps %xmm6, %xmm4 + unpckhps %xmm6, %xmm0 + + movaps %xmm5, %xmm1 + unpcklps %xmm7, %xmm5 + unpckhps %xmm7, %xmm1 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movsd -32 * SIZE(BB), %xmm1 + movsd -30 * SIZE(BB), %xmm3 + movsd -28 * SIZE(BB), %xmm5 + movsd -26 * SIZE(BB), %xmm7 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm3 + subps %xmm0, %xmm5 + subps %xmm2, %xmm7 +#else + movaps -32 * SIZE(AA), %xmm0 + movaps -28 * SIZE(AA), %xmm1 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm1 +#endif + +#ifdef LN + movaps -20 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm7 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm1 + + movaps -24 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm1 + + movaps -28 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm1 + + movaps -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef LT + movaps -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 + + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm7 + + movaps -28 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm7 + + movaps -24 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm7 + + movaps -20 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm7 +#endif + +#ifdef RN + movaps -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm1 + + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm1 +#endif + +#ifdef RT + movaps -32 * SIZE(BB), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm0 + + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, -32 * SIZE(BB) + movlps %xmm3, -30 * SIZE(BB) + movlps %xmm5, -28 * SIZE(BB) + movlps %xmm7, -26 * SIZE(BB) +#else + movaps %xmm0, -32 * SIZE(AA) + movaps %xmm1, -28 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm5, %xmm1 + unpcklps %xmm7, %xmm3 + + movaps %xmm1, %xmm2 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm2 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 1) + movhps %xmm2, 2 * SIZE(CO1, LDC, 1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 2 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + 
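Each cell of these kernels follows the same shape: a software-pipelined k-loop accumulates the partial products into xmm4..xmm7, the accumulated values are subtracted from the packed right-hand sides, the small diagonal block (4x4, 2x2 or 1x1) is applied as a triangular solve, and the results are written both back into the packed panel and out to C. As a rough orientation aid only, the scalar sketch below mirrors that per-cell solve for one single-precision column; the array names and the row-major layout are illustrative assumptions, not the kernels' actual packed formats, and the diagonal is treated as pre-inverted because the code above multiplies by it rather than dividing.

/*
 * Illustrative scalar sketch only, not part of the imported sources.
 * "a" stands for the packed 4x4 triangular block with its diagonal assumed
 * to be stored as reciprocals, "b" for one column of right-hand sides from
 * which the accumulated products have already been subtracted.
 */
void trsm_cell_backsubst_sketch(const float a[4][4], float b[4], float *c)
{
    for (int i = 3; i >= 0; i--) {            /* last unknown first (LN-style sweep) */
        float x = b[i];
        for (int k = i + 1; k < 4; k++)
            x -= a[i][k] * b[k];              /* eliminate already-solved unknowns */
        b[i] = x * a[i][i];                   /* multiply: diagonal held as a reciprocal */
    }
    for (int i = 0; i < 4; i++)
        c[i] = b[i];                          /* store the solved column to C */
}

The LT variant sweeps the same block from the first unknown forward, while the RN/RT branches apply the analogous solve using the block taken from the B panel instead of the A panel, which is what the LN/LT versus RN/RT preprocessor branches in the surrounding code select.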
+#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $4, KK +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L51 + ALIGN_4 + +.L79: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + ALIGN_4 + +.L80: + testl $1, N + je .L999 + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, B +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + testl $1, M + je .L100 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movsd -32 * SIZE(BB), %xmm1 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L115 + ALIGN_4 + +.L112: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + mulps %xmm0, %xmm1 + movsd -30 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movsd -30 * SIZE(BB), %xmm1 + + mulps %xmm0, %xmm1 + movsd -28 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movsd -28 * SIZE(BB), %xmm1 + + mulps %xmm0, %xmm1 + movsd -26 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movsd -26 * SIZE(BB), %xmm1 + + mulps %xmm0, %xmm1 + movsd -24 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movsd -24 * SIZE(BB), %xmm1 + + subl $-8 * SIZE, AA + subl $-8 * SIZE, BB + + subl $1, %eax + jne .L112 + ALIGN_4 + +.L115: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulss %xmm0, %xmm1 + movss -31 * SIZE(AA), %xmm0 + addss %xmm1, %xmm4 + movss -31 * SIZE(BB), %xmm1 + + addl $1 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L116 + ALIGN_4 + +.L118: +#if defined(LN) || defined(RT) + movl KK, %eax + subl $1, %eax + + movl AORIG, AA + + leal (AA, %eax, SIZE), AA + leal (B, %eax, SIZE), BB +#endif + + haddps %xmm4, %xmm4 + +#if defined(LN) || defined(LT) + movss -32 * SIZE(BB), %xmm1 + subss %xmm4, %xmm1 +#else + movss -32 * SIZE(AA), %xmm0 + subss %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + mulss -32 * SIZE(AA), %xmm1 +#endif + +#if defined(RN) || defined(RT) + mulss -32 * SIZE(BB), %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, -32 * SIZE(BB) +#else + movss %xmm0, -32 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(CO1) +#else + movss %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (AA, %eax, SIZE), AA + leal (BB, %eax, SIZE), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L100: + testl $2, M + je 
.L110 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm3, %xmm3 + movsd -32 * SIZE(BB), %xmm1 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L105 + ALIGN_4 + +.L102: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + + pshufd $0x55, %xmm1, %xmm2 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movsd -28 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movsd -26 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + + pshufd $0x55, %xmm1, %xmm2 + movsd -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movsd -24 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movsd -22 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + + pshufd $0x55, %xmm1, %xmm2 + movsd -26 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movsd -20 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movsd -18 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + + pshufd $0x55, %xmm1, %xmm2 + movsd -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movsd -16 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + + subl $-16 * SIZE, AA + subl $ -8 * SIZE, BB + + subl $1, %eax + jne .L102 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L108 + ALIGN_4 + +.L106: + pshufd $0x00, %xmm1, %xmm2 + movss -31 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + + addl $2 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L106 + ALIGN_4 + +.L108: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), BB +#endif + + addps %xmm5, %xmm4 + +#if defined(LN) || defined(LT) + pshufd $1, %xmm4, %xmm6 + + movss -32 * SIZE(BB), %xmm1 + movss -31 * SIZE(BB), %xmm3 + + subss %xmm4, %xmm1 + subss %xmm6, %xmm3 +#else + movsd -32 * SIZE(AA), %xmm0 + + subps %xmm4, %xmm0 +#endif + +#ifdef LN + movsd -32 * SIZE(AA), %xmm4 + movhps -30 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm3, %xmm6 + subss %xmm6, %xmm1 + + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm6, %xmm1 +#endif + +#ifdef LT + movaps -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm6, %xmm1 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm1, %xmm6 + subss %xmm6, %xmm3 + + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm6, %xmm3 +#endif + +#if defined(RN) || defined(RT) + movss -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, -32 * SIZE(BB) + movss %xmm3, -31 * SIZE(BB) +#else + movlps %xmm0, -32 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(CO1) + movss %xmm3, 1 * SIZE(CO1) +#else + movlps %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl 
KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 1), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L110: + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L119 + ALIGN_4 + +.L91: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movsd -32 * SIZE(BB), %xmm1 + + pxor %xmm4, %xmm4 +#ifdef LN + prefetcht0 -4 * SIZE(CO1) +#else + prefetcht0 3 * SIZE(CO1) +#endif + pxor %xmm5, %xmm5 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L95 + ALIGN_4 + +.L92: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm5 + pshufd $0x55, %xmm1, %xmm2 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -24 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm5 + pshufd $0x55, %xmm1, %xmm2 + movsd -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -16 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm5 + pshufd $0x55, %xmm1, %xmm2 + movsd -26 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm5 + pshufd $0x55, %xmm1, %xmm2 + movsd -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + + subl $-32 * SIZE, AA + subl $ -8 * SIZE, BB + + subl $1, %eax + jne .L92 + ALIGN_4 + +.L95: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L98 + ALIGN_4 + +.L96: + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + movss -31 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L96 + ALIGN_4 + +.L98: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 1), BB +#endif + + addps %xmm2, %xmm4 + addps %xmm5, %xmm4 + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm0 + unpcklps %xmm6, %xmm4 + unpckhps %xmm6, %xmm0 + + movaps %xmm5, %xmm1 + unpcklps %xmm7, %xmm5 + unpckhps %xmm7, %xmm1 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movss -32 * SIZE(BB), %xmm1 + movss -31 * SIZE(BB), %xmm3 + movss -30 * SIZE(BB), %xmm5 + movss -29 * SIZE(BB), %xmm7 + + subss %xmm4, %xmm1 + subss %xmm6, %xmm3 + subss %xmm0, %xmm5 + subss %xmm2, %xmm7 +#else + movaps -32 * SIZE(AA), %xmm0 + + subps %xmm4, %xmm0 +#endif + +#ifdef LN + movaps -20 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm6, %xmm7 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm7, %xmm6 + subss %xmm6, %xmm5 + pshufd 
$0x55, %xmm4, %xmm6 + mulss %xmm7, %xmm6 + subss %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm7, %xmm6 + subss %xmm6, %xmm1 + + movaps -24 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm5, %xmm6 + subss %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm5, %xmm6 + subss %xmm6, %xmm1 + + movaps -28 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm3, %xmm6 + subss %xmm6, %xmm1 + + movaps -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm6, %xmm1 +#endif + +#ifdef LT + movaps -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm6, %xmm1 + + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm1, %xmm6 + subss %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm1, %xmm6 + subss %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm1, %xmm6 + subss %xmm6, %xmm7 + + movaps -28 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm3, %xmm6 + subss %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm3, %xmm6 + subss %xmm6, %xmm7 + + movaps -24 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm5, %xmm6 + subss %xmm6, %xmm7 + + movaps -20 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm6, %xmm7 +#endif + +#if defined(RN) || defined(RT) + movss -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, -32 * SIZE(BB) + movss %xmm3, -31 * SIZE(BB) + movss %xmm5, -30 * SIZE(BB) + movss %xmm7, -29 * SIZE(BB) +#else + movaps %xmm0, -32 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm5, %xmm1 + unpcklps %xmm7, %xmm3 + + unpcklps %xmm3, %xmm1 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 1), BB +#endif + +#ifdef LN + subl $4, KK +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L91 + ALIGN_4 + +.L119: +#ifdef LN + movl K, %eax + leal (B, %eax, SIZE), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_LN_4x4_sse.S b/kernel/x86/trsm_kernel_LN_4x4_sse.S new file mode 100644 index 0000000000..147ed19bdc --- /dev/null +++ b/kernel/x86/trsm_kernel_LN_4x4_sse.S @@ -0,0 +1,3691 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 + +#define OLD_M 4 + STACK(%esi) +#define OLD_N 8 + STACK(%esi) +#define OLD_K 12 + STACK(%esi) +#define OLD_A 20 + STACK(%esi) +#define OLD_B 24 + STACK(%esi) +#define OLD_C 28 + STACK(%esi) +#define OLD_LDC 32 + STACK(%esi) +#define STACK_OFFT 36 + STACK(%esi) + +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define AORIG 56(%esp) +#define BORIG 60(%esp) +#define BUFFER 128(%esp) + +#if defined(OPTERON) || defined(BARCELONA) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 10 + 8) +#endif + +#if defined(PENTIUM4) || defined(PENTIUMM) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE 96 +#endif + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE 96 +#endif + +#define B %edi +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define CO1 %esi + +#if defined(OPTERON) || !defined(HAVE_SSE2) +#define movsd movlps +#endif + +#ifdef HAVE_SSE2 +#define xorps pxor +#endif + +#define KERNEL1(address) \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + addps %xmm2, %xmm5; \ + movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, 
%xmm6; \ + movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL4(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL5(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL6(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL7(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1; + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl %esp, %esi + + subl $128 + LOCAL_BUFFER_SIZE, %esp + andl $-1024, %esp + + STACK_TOUCHING + + movl OLD_M, %ebx + movl OLD_N, %eax + movl OLD_K, %ecx + movl OLD_A, %edx + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK + movss STACK_OFFT, %xmm4 + + movl OLD_B, B + movl OLD_C, %ebx + + movl %ebx, C + movl OLD_LDC, LDC + + movss %xmm4, OFFSET + movss %xmm4, KK + + leal (, LDC, SIZE), LDC + +#ifdef LN + movl M, %eax + leal (, %eax, 
SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + sarl $2, %eax + movl %eax, J + jle .L40 + +.L01: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, %ecx + +#ifdef RT + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $2 + BASE_SHIFT, %eax + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $1, %eax + jle .L05 + ALIGN_4 + +.L02: + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + movaps %xmm4, 16 * SIZE(BB) + movaps %xmm5, 20 * SIZE(BB) + movaps %xmm6, 24 * SIZE(BB) + movaps %xmm7, 28 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $32 * SIZE, %ecx + decl %eax + jne .L02 + ALIGN_2 + +.L05: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $1, %eax + BRANCH + jle .L10 + + movaps 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + + addl $4 * SIZE, B + ALIGN_4 + +.L10: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + + leal (, LDC, 4), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + + testl $1, M + je .L20 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movss 0 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 + movss 4 * SIZE(AA), %xmm1 + xorps %xmm5, %xmm5 + movss 0 * SIZE(BB), %xmm2 + xorps %xmm6, %xmm6 + movss 16 * SIZE(BB), %xmm3 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L35 + ALIGN_4 + +.L32: + mulss %xmm0, %xmm2 + addss %xmm2, %xmm4 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movss 4 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + addss %xmm2, %xmm5 + movss 8 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 32 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 1 * SIZE(AA), %xmm0 + + mulss %xmm0, %xmm3 + addss %xmm3, %xmm4 + movss 20 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + addss %xmm3, %xmm5 + movss 24 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + mulss 28 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 48 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 2 * SIZE(AA), %xmm0 + + mulss %xmm0, %xmm2 + addss 
%xmm2, %xmm4 + movss 36 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + addss %xmm2, %xmm5 + movss 40 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + mulss 44 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 64 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 3 * SIZE(AA), %xmm0 + + mulss %xmm0, %xmm3 + addss %xmm3, %xmm4 + movss 52 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + addss %xmm3, %xmm5 + movss 56 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + mulss 60 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 80 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + + mulss %xmm1, %xmm2 + addss %xmm2, %xmm4 + movss 68 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + addss %xmm2, %xmm5 + movss 72 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + mulss 76 * SIZE(BB), %xmm1 + addss %xmm2, %xmm6 + movss 96 * SIZE(BB), %xmm2 + addss %xmm1, %xmm7 + movss 5 * SIZE(AA), %xmm1 + + mulss %xmm1, %xmm3 + addss %xmm3, %xmm4 + movss 84 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + addss %xmm3, %xmm5 + movss 88 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + mulss 92 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 112 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 6 * SIZE(AA), %xmm1 + + mulss %xmm1, %xmm2 + addss %xmm2, %xmm4 + movss 100 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + addss %xmm2, %xmm5 + movss 104 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + mulss 108 * SIZE(BB), %xmm1 + addss %xmm2, %xmm6 + movss 128 * SIZE(BB), %xmm2 + addss %xmm1, %xmm7 + movss 7 * SIZE(AA), %xmm1 + + mulss %xmm1, %xmm3 + addss %xmm3, %xmm4 + movss 116 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + addss %xmm3, %xmm5 + movss 120 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + mulss 124 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 144 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $128 * SIZE, BB + decl %eax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulss %xmm0, %xmm2 + addss %xmm2, %xmm4 + movss 4 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + addss %xmm2, %xmm5 + movss 8 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 16 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 1 * SIZE(AA), %xmm0 + + addl $ 1 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (AA, %eax, SIZE), AA + + sall $2 + BASE_SHIFT, %eax + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm6, %xmm4 + unpcklps %xmm7, %xmm5 + unpcklps %xmm5, %xmm4 + + movaps 0 * SIZE(B), %xmm1 + + subps %xmm4, %xmm1 +#else + movss 0 * SIZE(AA), %xmm0 + movss 1 * SIZE(AA), %xmm1 + movss 2 * SIZE(AA), %xmm2 + movss 3 * SIZE(AA), %xmm3 + + subss %xmm4, %xmm0 + subss %xmm5, %xmm1 + subss %xmm6, %xmm2 + subss %xmm7, %xmm3 +#endif + +#if defined(LN) || defined(LT) + movss 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm0, %xmm7 + subss %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm0, %xmm7 + subss %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm0, %xmm7 + subss %xmm7, %xmm3 + + movaps 4 * SIZE(B), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulss 
%xmm1, %xmm7 + subss %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm1, %xmm7 + subss %xmm7, %xmm3 + + movaps 8 * SIZE(B), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm2, %xmm7 + subss %xmm7, %xmm3 + + movaps 12 * SIZE(B), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm7, %xmm3 +#endif + +#ifdef RT + movaps 12 * SIZE(B), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm7, %xmm3 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm3, %xmm7 + subss %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm3, %xmm7 + subss %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm3, %xmm7 + subss %xmm7, %xmm0 + + movaps 8 * SIZE(B), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm2, %xmm7 + subss %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm2, %xmm7 + subss %xmm7, %xmm0 + + movaps 4 * SIZE(B), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm1, %xmm7 + subss %xmm7, %xmm0 + + movaps 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, 0 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm0 + pshufd $0x55, %xmm1, %xmm2 + pshufd $0xaa, %xmm1, %xmm4 + pshufd $0xff, %xmm1, %xmm6 + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm2, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm6, 12 * SIZE(BB) +#else + movss %xmm0, 0 * SIZE(AA) + movss %xmm1, 1 * SIZE(AA) + movss %xmm2, 2 * SIZE(AA) + movss %xmm3, 3 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm0 + + movaps %xmm3, %xmm4 + unpcklps %xmm7, %xmm3 + unpckhps %xmm7, %xmm4 + + movaps %xmm1, %xmm2 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm2 + + movaps %xmm0, %xmm6 + unpcklps %xmm4, %xmm0 + unpckhps %xmm4, %xmm6 + + movss %xmm1, 0 * SIZE(CO1) + movss %xmm2, 0 * SIZE(CO1, LDC, 1) + movss %xmm0, 0 * SIZE(CO1, LDC, 2) + movss %xmm6, 0 * SIZE(CO1, %eax, 1) +#else + movss %xmm0, 0 * SIZE(CO1) + movss %xmm1, 0 * SIZE(CO1, LDC, 1) + movss %xmm2, 0 * SIZE(CO1, LDC, 2) + movss %xmm3, 0 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (AA, %eax, SIZE), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L20: + testl $2, M + je .L30 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 8 * SIZE(AA), %xmm1 + xorps %xmm5, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movaps 4 
* SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 44 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 64 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 60 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 80 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 68 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm5 + movaps 72 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movaps 76 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 96 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 84 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movaps 88 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 92 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 112 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 100 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm5 + movaps 104 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movaps 108 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 128 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 116 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movaps 120 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 124 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 144 * SIZE(BB), %xmm3 + + addl $ 16 * SIZE, AA + addl $128 * SIZE, BB + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 16 * SIZE(BB), %xmm2 + + addl $ 2 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $1 + BASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm6, %xmm4 + unpcklps %xmm7, %xmm5 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 
+ unpckhps %xmm5, %xmm6 + + movaps 0 * SIZE(B), %xmm1 + movaps 4 * SIZE(B), %xmm3 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm3 +#else +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 2 * SIZE(AA), %xmm1 +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd 4 * SIZE(AA), %xmm2 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd 6 * SIZE(AA), %xmm3 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm1 + subps %xmm6, %xmm2 + subps %xmm7, %xmm3 +#endif + +#ifdef LN + movaps 0 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm1 + + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 + + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm3 + + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm3 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm3 + + movaps 4 * SIZE(B), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm3 + + movaps 8 * SIZE(B), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm3 + + movaps 12 * SIZE(B), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm3 +#endif + +#ifdef RT + movaps 12 * SIZE(B), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm3 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm0 + + movaps 8 * SIZE(B), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm0 + + movaps 4 * SIZE(B), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm0 + + movaps 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, 0 * SIZE(B) + movaps %xmm3, 4 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm0 + pshufd $0x55, %xmm1, %xmm2 + pshufd $0xaa, %xmm1, %xmm4 + pshufd $0xff, %xmm1, %xmm6 + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm2, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm6, 12 * SIZE(BB) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm2 + pshufd $0xaa, %xmm3, %xmm4 + pshufd $0xff, %xmm3, %xmm6 + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm2, 20 * SIZE(BB) + movaps %xmm4, 24 * SIZE(BB) + movaps %xmm6, 28 * SIZE(BB) +#else + movlps %xmm0, 0 * SIZE(AA) + movlps %xmm1, 2 * SIZE(AA) + movlps %xmm2, 4 * SIZE(AA) + movlps %xmm3, 6 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm0 + + movaps %xmm3, %xmm4 + unpcklps %xmm7, %xmm3 + unpckhps %xmm7, %xmm4 + + movaps %xmm1, %xmm2 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm2 + + movaps %xmm0, %xmm6 + unpcklps %xmm4, %xmm0 + 
unpckhps %xmm4, %xmm6 + + movlps %xmm1, 0 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 1) + movlps %xmm0, 0 * SIZE(CO1, LDC, 2) + movlps %xmm6, 0 * SIZE(CO1, %eax, 1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC, 1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 2) + movlps %xmm3, 0 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $8 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L30: + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L39 + ALIGN_4 + +.L11: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 + movaps 16 * SIZE(AA), %xmm1 + xorps %xmm5, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm7, %xmm7 + + leal (LDC, LDC, 2), %eax + + PREFETCHW -4 * SIZE(CO1) + PREFETCHW -4 * SIZE(CO1, LDC) + PREFETCHW -4 * SIZE(CO1, LDC, 2) + PREFETCHW -4 * SIZE(CO1, %eax) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + KERNEL1(0 * 16) + KERNEL2(0 * 16) + KERNEL3(0 * 16) + KERNEL4(0 * 16) + KERNEL5(0 * 16) + KERNEL6(0 * 16) + KERNEL7(0 * 16) + KERNEL8(0 * 16) + + addl $128 * SIZE, BB + addl $32 * SIZE, AA + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_4 + +.L16: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 4 * SIZE(AA), %xmm0 + + addl $ 4 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $2 + BASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm0 + unpcklps %xmm6, %xmm4 + unpckhps %xmm6, %xmm0 + + movaps %xmm5, %xmm1 + unpcklps %xmm7, %xmm5 + unpckhps %xmm7, %xmm1 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movaps 0 * SIZE(B), %xmm1 + movaps 4 * SIZE(B), %xmm3 + movaps 8 * SIZE(B), %xmm5 + movaps 12 * SIZE(B), %xmm7 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm3 + subps %xmm0, %xmm5 + subps %xmm2, %xmm7 +#else + movaps 0 * SIZE(AA), %xmm0 + movaps 4 * SIZE(AA), %xmm1 + movaps 8 * SIZE(AA), %xmm2 + movaps 12 * SIZE(AA), %xmm3 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm1 + subps %xmm6, %xmm2 + subps %xmm7, %xmm3 +#endif + +#ifdef LN + movaps 12 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm7 + pshufd 
$0xaa, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm1 + + movaps 8 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm1 + + movaps 4 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm1 + + movaps 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 + + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm7 + + movaps 4 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm7 + + movaps 8 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm7 + + movaps 12 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm7 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm3 + + movaps 4 * SIZE(B), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm3 + + movaps 8 * SIZE(B), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm3 + + movaps 12 * SIZE(B), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm3 +#endif + +#ifdef RT + movaps 12 * SIZE(B), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm3 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm0 + + movaps 8 * SIZE(B), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm0 + + movaps 4 * SIZE(B), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm0 + + movaps 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, 0 * SIZE(B) + movaps %xmm3, 4 * SIZE(B) + movaps %xmm5, 8 * SIZE(B) + movaps %xmm7, 12 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm0 + pshufd $0x55, %xmm1, %xmm2 + pshufd $0xaa, %xmm1, %xmm4 + pshufd $0xff, %xmm1, %xmm6 + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm2, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm6, 12 * SIZE(BB) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm2 + pshufd $0xaa, %xmm3, %xmm4 + pshufd $0xff, %xmm3, %xmm6 + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm2, 20 * SIZE(BB) + movaps %xmm4, 24 * 
SIZE(BB) + movaps %xmm6, 28 * SIZE(BB) + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm2 + pshufd $0xaa, %xmm5, %xmm4 + pshufd $0xff, %xmm5, %xmm6 + movaps %xmm0, 32 * SIZE(BB) + movaps %xmm2, 36 * SIZE(BB) + movaps %xmm4, 40 * SIZE(BB) + movaps %xmm6, 44 * SIZE(BB) + + pshufd $0x00, %xmm7, %xmm0 + pshufd $0x55, %xmm7, %xmm2 + pshufd $0xaa, %xmm7, %xmm4 + pshufd $0xff, %xmm7, %xmm6 + movaps %xmm0, 48 * SIZE(BB) + movaps %xmm2, 52 * SIZE(BB) + movaps %xmm4, 56 * SIZE(BB) + movaps %xmm6, 60 * SIZE(BB) +#else + movaps %xmm0, 0 * SIZE(AA) + movaps %xmm1, 4 * SIZE(AA) + movaps %xmm2, 8 * SIZE(AA) + movaps %xmm3, 12 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm0 + + movaps %xmm3, %xmm4 + unpcklps %xmm7, %xmm3 + unpckhps %xmm7, %xmm4 + + movaps %xmm1, %xmm2 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm2 + + movaps %xmm0, %xmm6 + unpcklps %xmm4, %xmm0 + unpckhps %xmm4, %xmm6 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 1) + movhps %xmm2, 2 * SIZE(CO1, LDC, 1) + movlps %xmm0, 0 * SIZE(CO1, LDC, 2) + movhps %xmm0, 2 * SIZE(CO1, LDC, 2) + movlps %xmm6, 0 * SIZE(CO1, %eax, 1) + movhps %xmm6, 2 * SIZE(CO1, %eax, 1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 2 * SIZE(CO1, LDC, 1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 2) + movhps %xmm2, 2 * SIZE(CO1, LDC, 2) + movlps %xmm3, 0 * SIZE(CO1, %eax, 1) + movhps %xmm3, 2 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA +#ifdef LT + addl $16 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L11 + ALIGN_4 + +.L39: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 4), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (B, %eax, 4), B +#endif + +#ifdef RN + addl $4, KK +#endif + +#ifdef RT + subl $4, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_4 + +.L40: + testl $2, N + je .L80 + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, %ecx + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $1 + BASE_SHIFT, %eax + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + jle .L45 + ALIGN_4 + +.L42: + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + movaps %xmm4, 16 * SIZE(BB) + movaps %xmm5, 20 * SIZE(BB) + movaps %xmm6, 24 * SIZE(BB) + movaps %xmm7, 28 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $32 * SIZE, %ecx + decl %eax + jne .L42 
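+/* Note: the .L42 loop above packs the current two-column slice of B    */
+/* into the on-stack BUFFER: each movaps pulls in four consecutive      */
+/* scalars of B and pshufd broadcasts every scalar across a full        */
+/* 4-lane vector, so the micro-kernels below can read B with aligned    */
+/* movaps loads. Each pass consumes eight B values (four k-iterations); */
+/* the k & 3 remainder is handled by the short loop at .L46.            */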
+ ALIGN_4 + +.L45: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax + BRANCH + jle .L50 + ALIGN_4 + +.L46: +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + + addl $2 * SIZE, B + addl $8 * SIZE, %ecx + decl %eax + jne .L46 + ALIGN_4 + +.L50: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + + testl $1, M + je .L60 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movss 0 * SIZE(AA), %xmm0 + movss 4 * SIZE(AA), %xmm1 + movss 0 * SIZE(BB), %xmm2 + movss 16 * SIZE(BB), %xmm3 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + mulss %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + mulss 4 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 8 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 1 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm2 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 32 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 2 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 20 * SIZE(BB), %xmm0 + addss %xmm3, %xmm4 + movss 24 * SIZE(BB), %xmm3 + addss %xmm0, %xmm5 + movss 3 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 28 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 48 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm2 + mulss 36 * SIZE(BB), %xmm1 + addss %xmm2, %xmm4 + movss 40 * SIZE(BB), %xmm2 + addss %xmm1, %xmm5 + movss 5 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm2 + mulss 44 * SIZE(BB), %xmm1 + addss %xmm2, %xmm6 + movss 64 * SIZE(BB), %xmm2 + addss %xmm1, %xmm7 + movss 6 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 52 * SIZE(BB), %xmm1 + addss %xmm3, %xmm4 + movss 56 * SIZE(BB), %xmm3 + addss %xmm1, %xmm5 + movss 7 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 60 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 80 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulss %xmm0, %xmm2 + mulss 4 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 8 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 1 * SIZE(AA), %xmm0 + + addl $ 1 * SIZE, AA + addl $ 8 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: + addss %xmm6, %xmm4 + addss %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $BASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm5, %xmm4 + +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(B), %xmm1 + + subps 
%xmm4, %xmm1 +#else + movss 0 * SIZE(AA), %xmm0 + movss 1 * SIZE(AA), %xmm1 + + subss %xmm4, %xmm0 + subss %xmm5, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movss 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm0, %xmm7 + subss %xmm7, %xmm1 + + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm7, %xmm1 +#endif + +#ifdef RT + movaps 0 * SIZE(B), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm1, %xmm7 + subss %xmm7, %xmm0 + + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, 0 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm0 + pshufd $0x55, %xmm1, %xmm2 + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm2, 4 * SIZE(BB) +#else + movss %xmm0, 0 * SIZE(AA) + movss %xmm1, 1 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + pshufd $1, %xmm1, %xmm3 + + movss %xmm1, 0 * SIZE(CO1) + movss %xmm3, 0 * SIZE(CO1, LDC) +#else + movss %xmm0, 0 * SIZE(CO1) + movss %xmm1, 0 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (AA, %eax, SIZE), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + + +.L60: + testl $2, M + je .L70 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 8 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L65 + ALIGN_4 + +.L62: +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movaps 44 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 64 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm3 + addps %xmm3, 
%xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 80 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L62 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L66 + ALIGN_4 + +.L68: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $BASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm6, %xmm4 + unpcklps %xmm7, %xmm5 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(B), %xmm1 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd 2 * SIZE(B), %xmm3 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm3 +#else +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 2 * SIZE(AA), %xmm1 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm1 +#endif + +#ifdef LN + movaps 0 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm1 + + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm3 + + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm3 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm1 + + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm1 +#endif + +#ifdef RT + movaps 0 * SIZE(B), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm0 + + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, 0 * SIZE(B) + movlps %xmm3, 2 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm0 + pshufd $0x55, %xmm1, %xmm2 + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm2, 4 * SIZE(BB) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm2 + movaps %xmm0, 8 * SIZE(BB) + movaps %xmm2, 12 * SIZE(BB) +#else + movlps %xmm0, 0 * SIZE(AA) + movlps %xmm1, 2 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm3, %xmm1 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 0 * SIZE(CO1, LDC) +#else + movlps %xmm0, 0 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 
+ BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L70: + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L79 + ALIGN_4 + +.L51: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movaps 0 * SIZE(AA), %xmm0 + movaps 16 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + + PREFETCHW -4 * SIZE(CO1) + PREFETCHW -4 * SIZE(CO1, LDC) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + mulps %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 8 * SIZE(AA), %xmm0 + + mulps %xmm0, %xmm3 + mulps 20 * SIZE(BB), %xmm0 + addps %xmm3, %xmm4 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm0, %xmm5 + movaps 12 * SIZE(AA), %xmm0 + + mulps %xmm0, %xmm3 + mulps 28 * SIZE(BB), %xmm0 + addps %xmm3, %xmm4 + movaps 48 * SIZE(BB), %xmm3 + addps %xmm0, %xmm5 + movaps 32 * SIZE(AA), %xmm0 + +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) +#endif + mulps %xmm1, %xmm2 + mulps 36 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 40 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 20 * SIZE(AA), %xmm1 + + mulps %xmm1, %xmm2 + mulps 44 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 24 * SIZE(AA), %xmm1 + + mulps %xmm1, %xmm3 + mulps 52 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 56 * SIZE(BB), %xmm3 + addps %xmm1, %xmm5 + movaps 28 * SIZE(AA), %xmm1 + + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 80 * SIZE(BB), %xmm3 + addps %xmm1, %xmm5 + movaps 48 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $1 + BASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm0 + unpcklps %xmm6, %xmm4 + unpckhps %xmm6, %xmm0 + + movaps %xmm5, %xmm1 + unpcklps %xmm7, %xmm5 + unpckhps %xmm7, %xmm1 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(B), %xmm1 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd 2 * SIZE(B), %xmm3 +#ifdef movsd + xorps %xmm5, 
%xmm5 +#endif + movsd 4 * SIZE(B), %xmm5 +#ifdef movsd + xorps %xmm7, %xmm7 +#endif + movsd 6 * SIZE(B), %xmm7 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm3 + subps %xmm0, %xmm5 + subps %xmm2, %xmm7 +#else + movaps 0 * SIZE(AA), %xmm0 + movaps 4 * SIZE(AA), %xmm1 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm1 +#endif + +#ifdef LN + movaps 12 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm7 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm1 + + movaps 8 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm1 + + movaps 4 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm1 + + movaps 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 + + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm7 + + movaps 4 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm7 + + movaps 8 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm7 + + movaps 12 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm7 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm1 + + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm1 +#endif + +#ifdef RT + movaps 0 * SIZE(B), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm0 + + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, 0 * SIZE(B) + movlps %xmm3, 2 * SIZE(B) + movlps %xmm5, 4 * SIZE(B) + movlps %xmm7, 6 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm0 + pshufd $0x55, %xmm1, %xmm2 + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm2, 4 * SIZE(BB) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm2 + movaps %xmm0, 8 * SIZE(BB) + movaps %xmm2, 12 * SIZE(BB) + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm2 + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm2, 20 * SIZE(BB) + + pshufd $0x00, %xmm7, %xmm0 + pshufd $0x55, %xmm7, %xmm2 + movaps %xmm0, 24 * SIZE(BB) + movaps %xmm2, 28 * SIZE(BB) +#else + movaps %xmm0, 0 * SIZE(AA) + movaps %xmm1, 4 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm5, %xmm1 + unpcklps %xmm7, %xmm3 + + movaps %xmm1, %xmm2 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm2 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 1) + movhps %xmm2, 2 * SIZE(CO1, LDC, 1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 2 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || 
defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA +#ifdef LT + addl $8 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L51 + ALIGN_4 + +.L79: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + ALIGN_4 + +.L80: + testl $1, N + je .L999 + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, %ecx + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $BASE_SHIFT, %eax + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + jle .L85 + ALIGN_4 + +.L82: + movsd 0 * SIZE(B), %xmm3 + movhps 2 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm7 + movhps 6 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + movaps %xmm4, 16 * SIZE(BB) + movaps %xmm5, 20 * SIZE(BB) + movaps %xmm6, 24 * SIZE(BB) + movaps %xmm7, 28 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $32 * SIZE, BB + decl %eax + jne .L82 + ALIGN_4 + +.L85: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + BRANCH + jle .L90 + ALIGN_4 + +.L86: + movss 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + + movaps %xmm0, 0 * SIZE(BB) + + addl $1 * SIZE, B + addl $4 * SIZE, BB + decl %eax + jne .L86 + ALIGN_4 + +.L90: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + + testl $1, M + je .L100 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movss 0 * SIZE(AA), %xmm0 + movss 4 * SIZE(AA), %xmm1 + movss 0 * SIZE(BB), %xmm2 + movss 16 * SIZE(BB), %xmm3 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L115 + ALIGN_4 + +.L112: + mulss %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movss 1 * SIZE(AA), %xmm0 + addss %xmm2, %xmm4 + movss 32 * SIZE(BB), %xmm2 + mulss 4 * SIZE(BB), %xmm0 + addss %xmm0, %xmm5 + movss 2 * SIZE(AA), %xmm0 + mulss 8 * SIZE(BB), %xmm0 + addss %xmm0, %xmm6 + movss 3 * SIZE(AA), %xmm0 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm3 + movss 5 * SIZE(AA), %xmm1 + addss 
%xmm3, %xmm4 + movss 48 * SIZE(BB), %xmm3 + mulss 20 * SIZE(BB), %xmm1 + addss %xmm1, %xmm5 + movss 6 * SIZE(AA), %xmm1 + mulss 24 * SIZE(BB), %xmm1 + addss %xmm1, %xmm6 + movss 7 * SIZE(AA), %xmm1 + mulss 28 * SIZE(BB), %xmm1 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L112 + ALIGN_4 + +.L115: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulss %xmm0, %xmm2 + movss 1 * SIZE(AA), %xmm0 + addss %xmm2, %xmm4 + movss 4 * SIZE(BB), %xmm2 + + addl $ 1 * SIZE, AA + addl $ 4 * SIZE, BB + decl %eax + jg .L116 + ALIGN_4 + +.L118: + addss %xmm5, %xmm4 + addss %xmm7, %xmm6 + addss %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax + subl $1, %eax + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ BASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movss 0 * SIZE(B), %xmm1 + subss %xmm4, %xmm1 +#else + movss 0 * SIZE(AA), %xmm0 + subss %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + mulss 0 * SIZE(AA), %xmm1 +#endif + +#if defined(RN) || defined(RT) + mulss 0 * SIZE(B), %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm0 + movaps %xmm0, 0 * SIZE(BB) +#else + movss %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(CO1) +#else + movss %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (AA, %eax, SIZE), AA +#ifdef LT + addl $1 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L100: + testl $2, M + je .L110 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 8 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L105 + ALIGN_4 + +.L102: + mulps %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, 
%xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L102 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L108 + ALIGN_4 + +.L106: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + movaps 4 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L106 + ALIGN_4 + +.L108: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + addps %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ BASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + pshufd $1, %xmm4, %xmm6 + + movss 0 * SIZE(B), %xmm1 + movss 1 * SIZE(B), %xmm3 + + subss %xmm4, %xmm1 + subss %xmm6, %xmm3 +#else +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 + + subps %xmm4, %xmm0 +#endif + +#ifdef LN + movaps 0 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm3, %xmm6 + subss %xmm6, %xmm1 + + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm6, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm6, %xmm1 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm1, %xmm6 + subss %xmm6, %xmm3 + + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm6, %xmm3 +#endif + +#if defined(RN) || defined(RT) + movss 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(B) + movss %xmm3, 1 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm0 + movaps %xmm0, 0 * SIZE(BB) + pshufd $0x00, %xmm3, %xmm0 + movaps %xmm0, 4 * SIZE(BB) +#else + movlps %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(CO1) + movss %xmm3, 1 * SIZE(CO1) +#else + movlps %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L110: + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L119 + ALIGN_4 + +.L91: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movaps 0 * SIZE(AA), %xmm0 + movaps 16 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + + PREFETCHW -4 * SIZE(CO1) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L95 + ALIGN_4 + +.L92: + mulps %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movaps 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movaps 
32 * SIZE(BB), %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm0, %xmm5 + movaps 8 * SIZE(AA), %xmm0 + mulps 8 * SIZE(BB), %xmm0 + addps %xmm0, %xmm6 + movaps 12 * SIZE(AA), %xmm0 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) +#endif + mulps %xmm1, %xmm3 + movaps 20 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movaps 48 * SIZE(BB), %xmm3 + mulps 20 * SIZE(BB), %xmm1 + addps %xmm1, %xmm5 + movaps 24 * SIZE(AA), %xmm1 + mulps 24 * SIZE(BB), %xmm1 + addps %xmm1, %xmm6 + movaps 28 * SIZE(AA), %xmm1 + mulps 28 * SIZE(BB), %xmm1 + addps %xmm1, %xmm7 + movaps 48 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L92 + ALIGN_4 + +.L95: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L98 + ALIGN_4 + +.L96: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(AA), %xmm0 + movaps 4 * SIZE(BB), %xmm2 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L96 + ALIGN_4 + +.L98: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + addps %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ BASE_SHIFT, %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm0 + unpcklps %xmm6, %xmm4 + unpckhps %xmm6, %xmm0 + + movaps %xmm5, %xmm1 + unpcklps %xmm7, %xmm5 + unpckhps %xmm7, %xmm1 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movss 0 * SIZE(B), %xmm1 + movss 1 * SIZE(B), %xmm3 + movss 2 * SIZE(B), %xmm5 + movss 3 * SIZE(B), %xmm7 + + subss %xmm4, %xmm1 + subss %xmm6, %xmm3 + subss %xmm0, %xmm5 + subss %xmm2, %xmm7 +#else + movaps 0 * SIZE(AA), %xmm0 + + subps %xmm4, %xmm0 +#endif + +#ifdef LN + movaps 12 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm6, %xmm7 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm7, %xmm6 + subss %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm7, %xmm6 + subss %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm7, %xmm6 + subss %xmm6, %xmm1 + + movaps 8 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm5, %xmm6 + subss %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm5, %xmm6 + subss %xmm6, %xmm1 + + movaps 4 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm3, %xmm6 + subss %xmm6, %xmm1 + + movaps 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm6, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm6, %xmm1 + + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm1, %xmm6 + subss %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm1, %xmm6 + subss %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm1, %xmm6 + subss %xmm6, %xmm7 + + movaps 4 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm3, %xmm6 + subss %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm3, %xmm6 + subss %xmm6, %xmm7 + + movaps 8 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm5, %xmm6 + subss %xmm6, %xmm7 + + movaps 12 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, 
%xmm6 + mulss %xmm6, %xmm7 +#endif + +#if defined(RN) || defined(RT) + movss 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(B) + movss %xmm3, 1 * SIZE(B) + movss %xmm5, 2 * SIZE(B) + movss %xmm7, 3 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm0 + movaps %xmm0, 0 * SIZE(BB) + pshufd $0x00, %xmm3, %xmm0 + movaps %xmm0, 4 * SIZE(BB) + + pshufd $0x00, %xmm5, %xmm0 + movaps %xmm0, 8 * SIZE(BB) + pshufd $0x00, %xmm7, %xmm0 + movaps %xmm0, 12 * SIZE(BB) +#else + movss %xmm0, 0 * SIZE(AA) + movss %xmm1, 1 * SIZE(AA) + movss %xmm2, 2 * SIZE(AA) + movss %xmm3, 3 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm5, %xmm1 + unpcklps %xmm7, %xmm3 + + unpcklps %xmm3, %xmm1 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L91 + ALIGN_4 + +.L119: +#ifdef LN + movl K, %eax + leal (B, %eax, SIZE), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (B, %eax, SIZE), B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + + +.L999: + movl OLD_STACK, %esp + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_LN_8x2_sse.S b/kernel/x86/trsm_kernel_LN_8x2_sse.S new file mode 100644 index 0000000000..16a2c2f5bb --- /dev/null +++ b/kernel/x86/trsm_kernel_LN_8x2_sse.S @@ -0,0 +1,3605 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if !defined(HAVE_SSE) || !defined(HAVE_MMX) +#error You have to check your configuration. +#endif + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_A 20 + STACK + ARGS(%esi) +#define STACK_B 24 + STACK + ARGS(%esi) +#define STACK_C 28 + STACK + ARGS(%esi) +#define STACK_LDC 32 + STACK + ARGS(%esi) +#define STACK_OFFT 36 + STACK + ARGS(%esi) + +#define TRMASK 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define AORIG 56(%esp) +#define BORIG 60(%esp) +#define BUFFER 128(%esp) + +#ifdef HAVE_3DNOW +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 10 + 8) +#else +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE 96 +#endif + +#define B %edi +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define CO1 %esi + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#if !defined(HAVE_SSE2) || defined(OPTERON) +#define movsd movlps +#endif + +#ifdef HAVE_SSE2 +#define xorps pxor +#endif + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE, %esp + andl $-STACK_ALIGN, %esp + + STACK_TOUCHING + + movss STACK_M, %xmm0 + movl STACK_N, %eax + movss STACK_K, %xmm1 + movss STACK_A, %xmm2 + movl STACK_B, B + movss STACK_C, %xmm3 + movl STACK_LDC, LDC + movss STACK_OFFT, %xmm4 + + movss %xmm1, K + movl %eax, N + movss %xmm0, M + movss %xmm2, A + movss %xmm3, C + movl %esi, OLD_STACK + movss %xmm4, OFFSET + movss %xmm4, KK + + leal (, LDC, SIZE), LDC + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LN) || defined(LT) + movl $0x3f800000, 0 + TRMASK # 1.0 + movl $0x00000000, 4 + TRMASK # 0.0 + movl $0x3f800000, 8 + TRMASK # 1.0 + movl $0x00000000, 12 + TRMASK # 0.0 +#endif + + movl N, %eax + sarl $1, %eax # j = (n >> 1) + movl %eax, J + jle .L100 + ALIGN_2 + +.L01: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, BB + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $1 + BASE_SHIFT, %eax + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + jle .L03 + ALIGN_4 + +.L02: + movsd 0 * SIZE(B), %xmm3 + movhps 2 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm7 + movhps 6 * SIZE(B), %xmm7 + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, 
%xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 +#else + movaps %xmm3, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm3, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm3, %xmm2 + shufps $0xaa, %xmm2, %xmm2 + shufps $0xff, %xmm3, %xmm3 + + movaps %xmm7, %xmm4 + shufps $0x00, %xmm4, %xmm4 + movaps %xmm7, %xmm5 + shufps $0x55, %xmm5, %xmm5 + movaps %xmm7, %xmm6 + shufps $0xaa, %xmm6, %xmm6 + shufps $0xff, %xmm7, %xmm7 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + movaps %xmm4, 16 * SIZE(BB) + movaps %xmm5, 20 * SIZE(BB) + movaps %xmm6, 24 * SIZE(BB) + movaps %xmm7, 28 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $32 * SIZE, BB + decl %eax + BRANCH + jne .L02 + ALIGN_2 + +.L03: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax + BRANCH + jle .L05 + ALIGN_2 + +.L04: + movsd 0 * SIZE(B), %xmm3 + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 +#else + movaps %xmm3, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm3, %xmm1 + shufps $0x55, %xmm1, %xmm1 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + + addl $2 * SIZE, B + addl $8 * SIZE, BB + + decl %eax + jne .L04 + ALIGN_4 + +.L05: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + + testl $1, M + jle .L30 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $BASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movss 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movss 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movss 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movss 4 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L72 + ALIGN_2 + +.L71: + mulss %xmm0, %xmm2 + mulss 4 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 8 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 1 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm2 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 32 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 2 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 20 * SIZE(BB), %xmm0 + addss %xmm3, %xmm4 + movss 24 * SIZE(BB), %xmm3 + addss %xmm0, %xmm5 + movss 3 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 28 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 48 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm2 + mulss 36 * SIZE(BB), %xmm1 + addss %xmm2, %xmm4 + movss 40 * SIZE(BB), %xmm2 + addss %xmm1, %xmm5 + movss 5 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm2 + mulss 44 * SIZE(BB), %xmm1 + addss %xmm2, %xmm6 + movss 64 * SIZE(BB), %xmm2 + addss %xmm1, %xmm7 + movss 6 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 52 * SIZE(BB), %xmm1 + addss %xmm3, %xmm4 + movss 56 * SIZE(BB), %xmm3 + addss %xmm1, %xmm5 + movss 7 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 60 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 80 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $64 * SIZE, BB + 
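+	/* one unrolled-by-8 pass of the 1-row, 2-column micro-kernel:
+	   8 elements of A and 64 floats of the broadcast buffer BB
+	   (8 k-steps x 2 columns x 4 copies each) are consumed per trip;
+	   the counter below closes the .L71 loop */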
decl %eax + jne .L71 + ALIGN_2 + +.L72: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L74 + +.L73: + mulss %xmm0, %xmm2 + mulss 4 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 8 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L73 + ALIGN_4 + +.L74: + addss %xmm6, %xmm4 + addss %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $BASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm5, %xmm4 + +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd 0 * SIZE(B), %xmm2 + + subps %xmm4, %xmm2 +#else + movss 0 * SIZE(AA), %xmm0 + movss 1 * SIZE(AA), %xmm2 + + subss %xmm4, %xmm0 + subss %xmm5, %xmm2 +#endif + +#if defined(LN) || defined(LT) + movaps TRMASK, %xmm6 +#endif + +#if defined(LN) || defined(LT) + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 +#endif + +#ifdef RN + movss 0 * SIZE(B), %xmm6 + mulss %xmm6, %xmm0 + + movss 1 * SIZE(B), %xmm6 + movaps %xmm6, %xmm5 + + mulss %xmm0, %xmm5 + subss %xmm5, %xmm2 + + movss 3 * SIZE(B), %xmm6 + mulss %xmm6, %xmm2 +#endif + +#ifdef RT + movss 3 * SIZE(B), %xmm6 + mulss %xmm6, %xmm2 + + movss 2 * SIZE(B), %xmm6 + movaps %xmm6, %xmm5 + + mulss %xmm2, %xmm5 + subss %xmm5, %xmm0 + + movss 0 * SIZE(B), %xmm6 + mulss %xmm6, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(B) + + movaps %xmm2, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm2, %xmm1 + shufps $0x55, %xmm1, %xmm1 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) +#else + movss %xmm0, 0 * SIZE(AA) + movss %xmm2, 1 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, %xmm0 + shufps $0x88, %xmm3, %xmm2 + shufps $0xdd, %xmm3, %xmm0 + + movss %xmm2, 0 * SIZE(CO1) + movss %xmm0, 0 * SIZE(CO1, LDC) +#else + movss %xmm0, 0 * SIZE(CO1) + movss %xmm2, 0 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L30: + testl $2, M + jle .L50 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $1 + BASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L52 + ALIGN_2 + +.L51: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 12 * 
SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movaps 44 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 64 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 80 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L51 + ALIGN_2 + +.L52: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L54 + +.L53: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L53 + ALIGN_4 + +.L54: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $BASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm5, %xmm4 + + movsd 0 * SIZE(B), %xmm2 + movhps 2 * SIZE(B), %xmm2 + + subps %xmm4, %xmm2 +#else +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd 2 * SIZE(AA), %xmm2 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm2 +#endif + +#if defined(LN) || defined(LT) + movaps TRMASK, %xmm6 +#endif + +#ifdef LN + movss 3 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 2 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 + +#endif + +#ifdef LT + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 1 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 3 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 +#endif + +#ifdef RN + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm0 + + movss 1 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + movaps %xmm6, %xmm5 + + mulps %xmm0, %xmm5 + subps %xmm5, %xmm2 + + movss 3 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm2 +#endif + +#ifdef RT + movss 3 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm2 + + movss 2 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + movaps %xmm6, %xmm5 + + 
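+	/* RT solve of the 2-row block: xmm2 holds the already solved
+	   second column; its product with the off-diagonal entry of B
+	   is subtracted from the first column below, which is then
+	   scaled by the leading diagonal entry of B */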
mulps %xmm2, %xmm5 + + subps %xmm5, %xmm0 + + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(B) + movhps %xmm2, 2 * SIZE(B) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm6 +#else + movaps %xmm2, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm2, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm2, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm2, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm6, 12 * SIZE(BB) +#else + movlps %xmm0, 0 * SIZE(AA) + movlps %xmm2, 2 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, %xmm0 + shufps $0x88, %xmm3, %xmm2 + shufps $0xdd, %xmm3, %xmm0 + + movlps %xmm2, 0 * SIZE(CO1) + movlps %xmm0, 0 * SIZE(CO1, LDC) +#else + movlps %xmm0, 0 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L50: + testl $4, M + jle .L70 + +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $2 + BASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L32 + ALIGN_2 + +.L31: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 8 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps 20 * SIZE(BB), %xmm0 + addps %xmm3, %xmm4 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm0, %xmm5 + movaps 12 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps 28 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 48 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm2 + mulps 36 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 40 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 20 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm2 + mulps 44 * SIZE(BB), %xmm1 + addps %xmm2, %xmm6 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm1, %xmm7 + movaps 24 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 52 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 56 * SIZE(BB), %xmm3 + addps %xmm1, %xmm5 + movaps 28 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + addps %xmm3, %xmm6 + movaps 80 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 48 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L31 + ALIGN_2 + +.L32: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, 
%eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L34 + +.L33: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L33 + ALIGN_4 + +.L34: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $BASE_SHIFT, %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 2), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm0 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm0 + + movsd 0 * SIZE(B), %xmm2 + movhps 2 * SIZE(B), %xmm2 + movsd 4 * SIZE(B), %xmm3 + movhps 6 * SIZE(B), %xmm3 + + subps %xmm4, %xmm2 + subps %xmm0, %xmm3 +#else + movaps 0 * SIZE(AA), %xmm0 + movaps 4 * SIZE(AA), %xmm2 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm2 +#endif + +#if defined(LN) || defined(LT) + movaps TRMASK, %xmm6 +#endif + +#ifdef LN + movss 15 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 14 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 12 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 10 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movsd 8 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 5 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 4 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 + +#endif + +#ifdef LT + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 1 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movsd 2 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movss 5 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movsd 6 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movss 10 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 11 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movss 15 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm3 +#endif + +#ifdef RN + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm0 + + movss 1 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + movaps %xmm6, %xmm5 + + mulps %xmm0, %xmm5 + subps %xmm5, %xmm2 + + movss 3 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm2 +#endif + +#ifdef RT + movss 3 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm2 + + movss 2 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + movaps %xmm6, %xmm5 + + mulps %xmm2, %xmm5 + + subps %xmm5, %xmm0 + + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 
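+	/* last RT step for this 4-row block: scale the first column by
+	   the leading entry of B; the write-back that follows stores the
+	   solved block into B plus the broadcast buffer BB (LN/LT) or
+	   back into the packed A (RN/RT) */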
+ + mulps %xmm6, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(B) + movhps %xmm2, 2 * SIZE(B) + movlps %xmm3, 4 * SIZE(B) + movhps %xmm3, 6 * SIZE(B) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm6 +#else + movaps %xmm2, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm2, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm2, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm2, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm6, 12 * SIZE(BB) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm4 + pshufd $0xff, %xmm3, %xmm6 +#else + movaps %xmm3, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm3, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm3, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm3, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm1, 20 * SIZE(BB) + movaps %xmm4, 24 * SIZE(BB) + movaps %xmm6, 28 * SIZE(BB) +#else + movaps %xmm0, 0 * SIZE(AA) + movaps %xmm2, 4 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, %xmm0 + shufps $0x88, %xmm3, %xmm2 + shufps $0xdd, %xmm3, %xmm0 + + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 2 * SIZE(CO1) + movlps %xmm0, 0 * SIZE(CO1, LDC) + movhps %xmm0, 2 * SIZE(CO1, LDC) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO1, LDC) + movhps %xmm2, 2 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA +#ifdef LT + addl $8 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L70: + movl M, %ebx + sarl $3, %ebx + jle .L99 + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $3 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $3 + BASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 8 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + + PREFETCHW 7 * SIZE(CO1) + PREFETCHW 7 * SIZE(CO1, LDC) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L12 + ALIGN_2 + +.L11: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 0 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 16 * SIZE(AA), %xmm0 + + mulps %xmm1, %xmm3 + mulps 12 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 8 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 12 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 12 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 24 * SIZE(AA), %xmm1 + + mulps %xmm0, %xmm2 + mulps 20 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 16 * SIZE(BB), 
%xmm2 + + addps %xmm0, %xmm5 + movaps 20 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 20 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 + + mulps %xmm1, %xmm3 + mulps 28 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 24 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 28 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 28 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 40 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 40 * SIZE(AA), %xmm1 + + mulps %xmm0, %xmm2 + mulps 36 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 32 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 36 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 36 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 48 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 48 * SIZE(AA), %xmm0 + + mulps %xmm1, %xmm3 + mulps 44 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 40 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 44 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 44 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 56 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 56 * SIZE(AA), %xmm1 + + mulps %xmm0, %xmm2 + mulps 52 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 48 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 52 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 52 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 64 * SIZE(AA), %xmm0 + + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 56 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 60 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 72 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 72 * SIZE(AA), %xmm1 + + addl $64 * SIZE, BB + addl $64 * SIZE, AA + decl %eax + jne .L11 + ALIGN_2 + +.L12: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + +.L13: + movaps 4 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 0 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm1 + movaps 4 * SIZE(AA), %xmm0 + addps %xmm1, %xmm5 + movaps 4 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm1 + movaps 8 * SIZE(AA), %xmm0 + addps %xmm1, %xmm7 + + addl $8 * SIZE, AA + addl $8 * SIZE, BB + subl $1, %eax + jg .L13 + ALIGN_4 + +.L14: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $8, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $BASE_SHIFT, %eax + leal (AA, %eax, 8), AA + leal (B, %eax, 2), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm0 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm0 + + movaps %xmm6, %xmm1 + unpcklps %xmm7, %xmm6 + unpckhps %xmm7, %xmm1 + + movsd 0 * SIZE(B), %xmm2 + movhps 2 * SIZE(B), %xmm2 + movsd 4 * SIZE(B), %xmm3 + movhps 6 * SIZE(B), %xmm3 + movsd 8 * SIZE(B), %xmm5 + movhps 10 * SIZE(B), %xmm5 + movsd 12 * SIZE(B), %xmm7 + movhps 14 * SIZE(B), %xmm7 + + subps %xmm4, %xmm2 + subps %xmm0, %xmm3 + subps %xmm6, %xmm5 + subps %xmm1, %xmm7 +#else + movaps 0 * SIZE(AA), %xmm0 + movaps 4 * SIZE(AA), %xmm1 + movaps 8 * SIZE(AA), %xmm2 + movaps 12 * SIZE(AA), %xmm3 + + subps %xmm4, %xmm0 + subps %xmm6, %xmm1 + subps %xmm5, %xmm2 + subps %xmm7, %xmm3 +#endif + +#if defined(LN) || defined(LT) + movaps TRMASK, %xmm6 +#endif + +#ifdef LN + movss 63 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, 
%xmm0, %xmm1 + mulps %xmm1, %xmm7 + + movaps %xmm7, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 62 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movsd 60 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 58 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 56 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 54 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm7 + + movaps %xmm7, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movsd 52 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 50 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 48 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + + movss 45 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm5 + + movaps %xmm5, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 44 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 42 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 40 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 36 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm5 + + movaps %xmm5, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movsd 34 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 32 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 27 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 26 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 24 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 18 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movsd 16 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 9 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 8 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 +#endif + +#ifdef LT + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 1 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movsd 2 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 4 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 6 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 9 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movsd 10 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 12 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps 
%xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 14 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 18 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 19 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 20 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 22 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 27 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movsd 28 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 30 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 36 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm5 + + movaps %xmm5, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 37 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 38 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 45 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm5 + + movaps %xmm5, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movsd 46 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 54 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm7 + + movaps %xmm7, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 55 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 63 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm7 +#endif + +#ifdef RN + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm0 + mulps %xmm6, %xmm1 + + movss 1 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + movaps %xmm6, %xmm5 + + mulps %xmm0, %xmm5 + mulps %xmm1, %xmm6 + + subps %xmm5, %xmm2 + subps %xmm6, %xmm3 + + movss 3 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm2 + mulps %xmm6, %xmm3 +#endif + +#ifdef RT + movss 3 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm2 + mulps %xmm6, %xmm3 + + movss 2 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + movaps %xmm6, %xmm5 + + mulps %xmm2, %xmm5 + mulps %xmm3, %xmm6 + + subps %xmm5, %xmm0 + subps %xmm6, %xmm1 + + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm0 + mulps %xmm6, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(B) + movhps %xmm2, 2 * SIZE(B) + movlps %xmm3, 4 * SIZE(B) + movhps %xmm3, 6 * SIZE(B) + movlps %xmm5, 8 * SIZE(B) + movhps %xmm5, 10 * SIZE(B) + movlps %xmm7, 12 * SIZE(B) + movhps %xmm7, 14 * SIZE(B) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm6 +#else + movaps %xmm2, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm2, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm2, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm2, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm6, 12 * SIZE(BB) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm4 + pshufd $0xff, %xmm3, %xmm6 +#else + movaps %xmm3, 
%xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm3, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm3, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm3, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm1, 20 * SIZE(BB) + movaps %xmm4, 24 * SIZE(BB) + movaps %xmm6, 28 * SIZE(BB) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + pshufd $0xaa, %xmm5, %xmm4 + pshufd $0xff, %xmm5, %xmm6 +#else + movaps %xmm5, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm5, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm5, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm5, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + movaps %xmm0, 32 * SIZE(BB) + movaps %xmm1, 36 * SIZE(BB) + movaps %xmm4, 40 * SIZE(BB) + movaps %xmm6, 44 * SIZE(BB) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm7, %xmm0 + pshufd $0x55, %xmm7, %xmm1 + pshufd $0xaa, %xmm7, %xmm4 + pshufd $0xff, %xmm7, %xmm6 +#else + movaps %xmm7, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm7, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm7, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm7, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + movaps %xmm0, 48 * SIZE(BB) + movaps %xmm1, 52 * SIZE(BB) + movaps %xmm4, 56 * SIZE(BB) + movaps %xmm6, 60 * SIZE(BB) +#else + movaps %xmm0, 0 * SIZE(AA) + movaps %xmm1, 4 * SIZE(AA) + movaps %xmm2, 8 * SIZE(AA) + movaps %xmm3, 12 * SIZE(AA) +#endif + +#ifdef LN + subl $8 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, %xmm0 + shufps $0x88, %xmm3, %xmm2 + shufps $0xdd, %xmm3, %xmm0 + + movaps %xmm5, %xmm4 + shufps $0x88, %xmm7, %xmm5 + shufps $0xdd, %xmm7, %xmm4 + + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 2 * SIZE(CO1) + movlps %xmm5, 4 * SIZE(CO1) + movhps %xmm5, 6 * SIZE(CO1) + movlps %xmm0, 0 * SIZE(CO1, LDC) + movhps %xmm0, 2 * SIZE(CO1, LDC) + movlps %xmm4, 4 * SIZE(CO1, LDC) + movhps %xmm4, 6 * SIZE(CO1, LDC) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm1, 4 * SIZE(CO1) + movhps %xmm1, 6 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO1, LDC) + movhps %xmm2, 2 * SIZE(CO1, LDC) + movlps %xmm3, 4 * SIZE(CO1, LDC) + movhps %xmm3, 6 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $8 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 8), AA +#ifdef LT + addl $16 * SIZE, B +#endif +#endif + +#ifdef LN + subl $8, KK + movl BORIG, B +#endif + +#ifdef LT + addl $8, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $3 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L10 + ALIGN_2 + +.L99: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_2 + +.L100: + testl $1, N + jle .L999 + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, BB + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $BASE_SHIFT, %eax + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + jle .L103 + ALIGN_4 + +.L102: + movsd 0 * SIZE(B), %xmm3 + movhps 2 * SIZE(B), %xmm3 + movsd 4 * 
SIZE(B), %xmm7 + movhps 6 * SIZE(B), %xmm7 + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 +#else + movaps %xmm3, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm3, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm3, %xmm2 + shufps $0xaa, %xmm2, %xmm2 + shufps $0xff, %xmm3, %xmm3 + + movaps %xmm7, %xmm4 + shufps $0x00, %xmm4, %xmm4 + movaps %xmm7, %xmm5 + shufps $0x55, %xmm5, %xmm5 + movaps %xmm7, %xmm6 + shufps $0xaa, %xmm6, %xmm6 + shufps $0xff, %xmm7, %xmm7 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + movaps %xmm4, 16 * SIZE(BB) + movaps %xmm5, 20 * SIZE(BB) + movaps %xmm6, 24 * SIZE(BB) + movaps %xmm7, 28 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $32 * SIZE, BB + + decl %eax + BRANCH + jne .L102 + ALIGN_2 + +.L103: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + BRANCH + jle .L105 + ALIGN_2 + +.L104: + movss 0 * SIZE(B), %xmm0 + + shufps $0x00, %xmm0, %xmm0 + + movaps %xmm0, 0 * SIZE(BB) + + addl $1 * SIZE, B + addl $4 * SIZE, BB + + decl %eax + jne .L104 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + + testl $1, M + jle .L130 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movss 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movss 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movss 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movss 4 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L172 + ALIGN_2 + +.L171: + mulss %xmm0, %xmm2 + movss 1 * SIZE(AA), %xmm0 + addss %xmm2, %xmm4 + mulss 4 * SIZE(BB), %xmm0 + movss 32 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 2 * SIZE(AA), %xmm0 + mulss 8 * SIZE(BB), %xmm0 + addss %xmm0, %xmm6 + movss 3 * SIZE(AA), %xmm0 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm3 + movss 5 * SIZE(AA), %xmm1 + addss %xmm3, %xmm4 + mulss 20 * SIZE(BB), %xmm1 + movss 48 * SIZE(BB), %xmm3 + addss %xmm1, %xmm5 + movss 6 * SIZE(AA), %xmm1 + mulss 24 * SIZE(BB), %xmm1 + addss %xmm1, %xmm6 + movss 7 * SIZE(AA), %xmm1 + mulss 28 * SIZE(BB), %xmm1 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L171 + ALIGN_2 + +.L172: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L174 + +.L173: + movss 0 * SIZE(AA), %xmm0 + movss 0 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + addss %xmm2, %xmm4 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L173 + ALIGN_4 + +.L174: + addss %xmm5, %xmm4 + addss %xmm7, %xmm6 + addss %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax + subl $1, %eax + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ BASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), B 
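+	/* LN/RT: AA, B and the expanded buffer BB are recomputed from
+	   KK-1 so they address this 1x1 block's data; BB uses a 4x
+	   stride because each B value was broadcast into four floats */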
+ leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movss 0 * SIZE(B), %xmm1 + subss %xmm4, %xmm1 +#else + movss 0 * SIZE(AA), %xmm0 + subss %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + mulss 0 * SIZE(AA), %xmm1 +#endif + +#if defined(RN) || defined(RT) + mulss 0 * SIZE(B), %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(B) + + shufps $0x00, %xmm1, %xmm1 + movaps %xmm1, 0 * SIZE(BB) +#else + movss %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(CO1) +#else + movss %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (AA, %eax, SIZE), AA +#ifdef LT + addl $1 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L130: + testl $2, M + jle .L150 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $1 + BASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L152 + ALIGN_2 + +.L151: + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L151 + ALIGN_2 + +.L152: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L154 + +.L153: + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L153 + ALIGN_4 + +.L154: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + addps %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $BASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm5 + shufps $1, %xmm5, %xmm5 + + movss 0 * SIZE(B), %xmm0 + movss 1 * SIZE(B), %xmm1 + + subss %xmm4, %xmm0 + subss %xmm5, %xmm1 
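+	/* for LN/LT the two partial sums are subtracted from the values
+	   kept in the packed B; the #else branch below subtracts them
+	   from the packed A copy instead */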
+#else +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 + subps %xmm4, %xmm0 +#endif + +#ifdef LN + movaps 0 * SIZE(AA), %xmm4 + + movaps %xmm4, %xmm6 + shufps $0xff, %xmm6, %xmm6 + mulss %xmm6, %xmm1 + + movaps %xmm4, %xmm6 + shufps $0xaa, %xmm6, %xmm6 + mulss %xmm1, %xmm6 + subss %xmm6, %xmm0 + mulss %xmm4, %xmm0 +#endif + +#ifdef LT + movaps 0 * SIZE(AA), %xmm4 + mulss %xmm4, %xmm0 + movaps %xmm4, %xmm6 + shufps $0x55, %xmm6, %xmm6 + mulss %xmm0, %xmm6 + subss %xmm6, %xmm1 + movaps %xmm4, %xmm6 + shufps $0xff, %xmm6, %xmm6 + mulss %xmm6, %xmm1 +#endif + +#ifdef RN + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + mulps %xmm6, %xmm0 +#endif + +#ifdef RT + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + mulps %xmm6, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss %xmm0, 0 * SIZE(B) + movss %xmm1, 1 * SIZE(B) + + shufps $0x00, %xmm0, %xmm0 + shufps $0x00, %xmm1, %xmm1 + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) +#else + movlps %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm0, 0 * SIZE(CO1) + movss %xmm1, 1 * SIZE(CO1) +#else + movlps %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L150: + testl $4, M + jle .L170 + +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $2 + BASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movsd 0 * SIZE(AA), %xmm0 + movhps 2 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movsd 16 * SIZE(AA), %xmm1 + movhps 18 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L132 + ALIGN_2 + +.L131: + mulps %xmm0, %xmm2 + movaps 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + mulps 4 * SIZE(BB), %xmm0 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 8 * SIZE(AA), %xmm0 + mulps 8 * SIZE(BB), %xmm0 + addps %xmm0, %xmm6 + movaps 12 * SIZE(AA), %xmm0 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm3 + movaps 20 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + mulps 20 * SIZE(BB), %xmm1 + movaps 48 * SIZE(BB), %xmm3 + addps %xmm1, %xmm5 + movaps 24 * SIZE(AA), %xmm1 + mulps 24 * SIZE(BB), %xmm1 + addps %xmm1, %xmm6 + movaps 28 * SIZE(AA), %xmm1 + mulps 28 * SIZE(BB), %xmm1 + addps %xmm1, %xmm7 + movaps 48 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L131 + ALIGN_2 + +.L132: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L134 + +.L133: + movaps 0 * SIZE(BB), %xmm2 + movaps 0 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L133 + ALIGN_4 + +.L134: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + addps %xmm6, 
%xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $BASE_SHIFT, %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(B), %xmm2 + movhps 2 * SIZE(B), %xmm2 + + subps %xmm4, %xmm2 + + xorps %xmm5, %xmm5 + + movaps %xmm2, %xmm3 + unpcklps %xmm5, %xmm2 + unpckhps %xmm5, %xmm3 +#else + movaps 0 * SIZE(AA), %xmm0 + subps %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movaps TRMASK, %xmm6 +#endif + +#ifdef LN + movss 15 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 14 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 12 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 10 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movsd 8 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 5 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 4 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 +#endif + +#ifdef LT + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 1 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movsd 2 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movss 5 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movsd 6 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movss 10 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 11 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movss 15 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm3 +#endif + +#ifdef RN + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + mulps %xmm6, %xmm0 +#endif + +#ifdef RT + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + mulps %xmm6, %xmm0 +#endif + +#if defined(LN) || defined(LT) + shufps $0x88, %xmm3, %xmm2 + + movlps %xmm2, 0 * SIZE(B) + movhps %xmm2, 2 * SIZE(B) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm6 +#else + movaps %xmm2, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm2, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm2, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm2, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm6, 12 * SIZE(BB) +#else + movaps %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 2 * SIZE(CO1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) 
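+	/* the four solved values are now stored to C at CO1; the pointer
+	   is advanced just below only when not LN, since the LN path
+	   already stepped CO1 back before the store */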
+#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L170: + movl M, %ebx + sarl $3, %ebx # i = (m >> 2) + jle .L179 + ALIGN_4 + +.L110: +#ifdef LN + movl K, %eax + sall $3 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $3 + BASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + + PREFETCHW 7 * SIZE(CO1) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L112 + ALIGN_2 + +.L111: + mulps %xmm2, %xmm0 + mulps 4 * SIZE(AA), %xmm2 + addps %xmm0, %xmm4 + movaps 8 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm2, %xmm0 + mulps 12 * SIZE(AA), %xmm2 + addps %xmm0, %xmm5 + movaps 32 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm2, %xmm1 + mulps 20 * SIZE(AA), %xmm2 + addps %xmm1, %xmm4 + movaps 24 * SIZE(AA), %xmm1 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm2, %xmm1 + mulps 28 * SIZE(AA), %xmm2 + addps %xmm1, %xmm5 + movaps 48 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm3, %xmm0 + mulps 36 * SIZE(AA), %xmm3 + addps %xmm0, %xmm4 + movaps 40 * SIZE(AA), %xmm0 + addps %xmm3, %xmm6 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm3, %xmm0 + mulps 44 * SIZE(AA), %xmm3 + addps %xmm0, %xmm5 + movaps 64 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm3, %xmm1 + mulps 52 * SIZE(AA), %xmm3 + addps %xmm1, %xmm4 + movaps 56 * SIZE(AA), %xmm1 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm3, %xmm1 + mulps 60 * SIZE(AA), %xmm3 + addps %xmm1, %xmm5 + movaps 80 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + + addl $64 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L111 + ALIGN_2 + +.L112: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L114 + +.L113: + movaps 0 * SIZE(BB), %xmm2 + movaps 0 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm0 + addps %xmm0, %xmm4 + mulps 4 * SIZE(AA), %xmm2 + addps %xmm2, %xmm6 + + addl $8 * SIZE, AA + addl $4 * SIZE, BB + subl $1, %eax + jg .L113 + ALIGN_4 + +.L114: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $8, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $BASE_SHIFT, %eax + leal (AA, %eax, 8), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(B), %xmm2 + movhps 2 * SIZE(B), %xmm2 + movsd 4 * SIZE(B), %xmm5 + movhps 6 * SIZE(B), %xmm5 + + subps %xmm4, %xmm2 + subps %xmm6, %xmm5 + + xorps %xmm0, %xmm0 + + movaps %xmm2, %xmm3 + unpcklps %xmm0, %xmm2 + unpckhps %xmm0, %xmm3 + + movaps %xmm5, %xmm7 + unpcklps %xmm0, %xmm5 + unpckhps 
%xmm0, %xmm7 +#else + movaps 0 * SIZE(AA), %xmm0 + movaps 4 * SIZE(AA), %xmm1 + + subps %xmm4, %xmm0 + subps %xmm6, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movaps TRMASK, %xmm6 +#endif + +#ifdef LN + movss 63 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm7 + + movaps %xmm7, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 62 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movsd 60 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 58 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 56 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 54 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm7 + + movaps %xmm7, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movsd 52 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 50 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 48 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + + movss 45 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm5 + + movaps %xmm5, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 44 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 42 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 40 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 36 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm5 + + movaps %xmm5, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movsd 34 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 32 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 27 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 26 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 24 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 18 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movsd 16 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 9 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 8 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 +#endif + +#ifdef LT + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 1 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movsd 2 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 4 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 6 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 9 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + 
shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movsd 10 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 12 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 14 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 18 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 19 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 20 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 22 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 27 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movsd 28 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 30 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 36 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm5 + + movaps %xmm5, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 37 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 38 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 45 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm5 + + movaps %xmm5, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movsd 46 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 54 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm7 + + movaps %xmm7, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 55 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 63 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm7 +#endif + +#if defined(RN) || defined(RT) + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm0 + mulps %xmm6, %xmm1 +#endif + +#if defined(LN) || defined(LT) + shufps $0x88, %xmm3, %xmm2 + shufps $0x88, %xmm7, %xmm5 + + movlps %xmm2, 0 * SIZE(B) + movhps %xmm2, 2 * SIZE(B) + movlps %xmm5, 4 * SIZE(B) + movhps %xmm5, 6 * SIZE(B) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm6 +#else + movaps %xmm2, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm2, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm2, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm2, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm6, 12 * SIZE(BB) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + pshufd $0xaa, %xmm5, %xmm4 + pshufd $0xff, %xmm5, %xmm6 +#else + movaps %xmm5, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm5, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm5, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm5, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm1, 20 * SIZE(BB) + movaps %xmm4, 24 * SIZE(BB) + movaps %xmm6, 28 * SIZE(BB) +#else + movaps %xmm0, 0 * SIZE(AA) + movaps %xmm1, 4 * SIZE(AA) +#endif + +#ifdef LN + subl $8 * SIZE, 
CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 2 * SIZE(CO1) + movlps %xmm5, 4 * SIZE(CO1) + movhps %xmm5, 6 * SIZE(CO1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm1, 4 * SIZE(CO1) + movhps %xmm1, 6 * SIZE(CO1) +#endif + +#ifndef LN + addl $8 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 8), AA +#ifdef LT + addl $8 * SIZE, B +#endif +#endif + +#ifdef LN + subl $8, KK + movl BORIG, B +#endif + +#ifdef LT + addl $8, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $3 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L110 + ALIGN_2 + +.L179: +#ifdef LN + movl K, %eax + leal (B, %eax, SIZE), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (B, %eax, SIZE), B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L999: + movl OLD_STACK, %esp + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_LT_1x4.S b/kernel/x86/trsm_kernel_LT_1x4.S new file mode 100644 index 0000000000..5670746ec2 --- /dev/null +++ b/kernel/x86/trsm_kernel_LT_1x4.S @@ -0,0 +1,1251 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 32 + +#define J 0 + STACK(%esp) +#define I 4 + STACK(%esp) +#define KK 8 + STACK(%esp) +#define KKK 12 + STACK(%esp) +#define AORIG 16 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#ifdef DOUBLE +#define STACK_A 24 + STACK + ARGS(%esp) +#define STACK_B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define STACK_LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) +#else +#define STACK_A 20 + STACK + ARGS(%esp) +#define STACK_B 24 + STACK + ARGS(%esp) +#define C 28 + STACK + ARGS(%esp) +#define STACK_LDC 32 + STACK + ARGS(%esp) +#define OFFSET 36 + STACK + ARGS(%esp) +#endif + +#define A %edx +#define B %ecx +#define B_ORIG %ebx +#define LDC %ebp + +#define PREFETCHSIZE (5 + 8 * 10) + + PROLOGUE + + subl $ARGS, %esp # Generate Stack Frame + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_B, B_ORIG + movl STACK_LDC, LDC + leal (, LDC, SIZE), LDC + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, STACK_A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B_ORIG + + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + movl OFFSET, %eax + negl %eax + movl %eax, KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + subl $-16 * SIZE, B_ORIG + subl $-16 * SIZE, STACK_A + + movl M, %eax + testl %eax, %eax + jle .L999 + + movl N, %eax + testl %eax, %eax + jle .L999 + + movl K, %eax + testl %eax, %eax + jle .L999 + + movl N, %eax + sarl $2, %eax + movl %eax, J + je .L20 + ALIGN_3 + +.L11: +#if defined(LT) || defined(RN) + movl STACK_A, A +#else + movl STACK_A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, B_ORIG +#endif + + leal (, LDC, 4), %eax +#ifdef RT + subl %eax, C +#endif + movl C, %edi +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl B_ORIG, B + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $4, %eax + jle .L13 + ALIGN_4 + +.L12: + movl -16 * SIZE(B), %esi + movl -8 * SIZE(B), %esi + movl 0 * SIZE(B), %esi + movl 8 * SIZE(B), %esi + movl 16 * SIZE(B), %esi + movl 24 * SIZE(B), %esi + movl 32 * SIZE(B), %esi + movl 40 * SIZE(B), %esi + subl $-64 * SIZE, B + decl %eax + jne .L12 + ALIGN_3 + +.L13: + movl M, %esi + movl %esi, I + ALIGN_3 + +.L14: +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + leal (, %eax, SIZE), %eax + movl AORIG, A + leal (A , %eax, 1), A + leal (B_ORIG, %eax, 4), B +#else + movl B_ORIG, B +#endif + + leal (%edi, LDC, 2), %eax + + fldz + fldz + fldz + fldz + + FLD -8 * SIZE(A) + FLD -16 * SIZE(A) + FLD -16 * SIZE(B) + + movl $32 * SIZE, %esi + + prefetchw 1 * SIZE(%edi) + prefetchw 1 * SIZE(%edi, LDC) + prefetchw 1 * SIZE(%eax) + prefetchw 1 * SIZE(%eax, LDC) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L16 + ALIGN_3 + +.L15: + fmul %st(1), %st + faddp %st, %st(3) + PADDING + FLD -15 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + 
PADDING + FLD -14 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(5) + PADDING + FMUL -13 * SIZE(B) + + faddp %st, %st(5) + FLD -15 * SIZE(A) + FLD -12 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(3) + PADDING + FLD -11 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + PADDING + FLD -10 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(5) + PADDING + FMUL -9 * SIZE(B) + + faddp %st, %st(5) + FLD -14 * SIZE(A) + FLD -8 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(3) + PADDING + FLD -7 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + PADDING + FLD -6 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(5) + PADDING + FMUL -5 * SIZE(B) + + faddp %st, %st(5) + FLD -13 * SIZE(A) + FLD -4 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(3) + PADDING + FLD -3 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + PADDING + FLD -2 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(5) + PADDING + FMUL -1 * SIZE(B) + + faddp %st, %st(5) + FLD -12 * SIZE(A) + FLD 0 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(3) + PADDING + FLD 1 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + PADDING + FLD 2 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(5) + PADDING + FMUL 3 * SIZE(B) + + faddp %st, %st(5) + FLD -11 * SIZE(A) + FLD 4 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(3) + PADDING + FLD 5 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + PADDING + FLD 6 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(5) + PADDING + FMUL 7 * SIZE(B) + + faddp %st, %st(5) + FLD -10 * SIZE(A) + FLD 8 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(3) + PADDING + FLD 9 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + PADDING + FLD 10 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(5) + PADDING + FMUL 11 * SIZE(B) + + faddp %st, %st(5) + FLD -9 * SIZE(A) + FLD 12 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(3) + PADDING + FLD 13 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + PADDING + FLD 14 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(5) + PADDING + FMUL 15 * SIZE(B) + + faddp %st, %st(5) + FLD 0 * SIZE(A) + + PADDING prefetch PREFETCHSIZE * SIZE(A) + + addl $8 * SIZE, A + fxch %st(1) + addl $32 * SIZE, B + + FLD -16 * SIZE(B) + decl %eax + jne .L15 + ALIGN_4 + +.L16: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + and $7, %eax + je .L19 + ALIGN_4 + +.L17: + fmul %st(1), %st + faddp %st, %st(3) + + FLD -15 * SIZE(B) + fmul %st(1), %st + faddp %st, %st(4) + + FLD -14 * SIZE(B) + fmul %st(1), %st + faddp %st, %st(5) + + FMUL -13 * SIZE(B) + faddp %st, %st(5) + FLD -15 * SIZE(A) + FLD -12 * SIZE(B) + + addl $1 * SIZE,A + addl $4 * SIZE,B + + decl %eax + jne .L17 + ALIGN_4 + +.L19: + ffreep %st(0) + ffreep %st(0) + ffreep %st(0) + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $4, %eax +#endif + + leal (, %eax, SIZE), %eax + + movl AORIG, A + leal (A, %eax, 1), A + leal (B_ORIG, %eax, 4), B +#endif + +#if defined(LN) || defined(LT) + FLD 0 * SIZE - 16 * SIZE(B) + fsubp %st, %st(1) + FLD 1 * SIZE - 16 * SIZE(B) + fsubp %st, %st(2) + FLD 2 * SIZE - 16 * SIZE(B) + fsubp %st, %st(3) + FLD 3 * SIZE - 16 * SIZE(B) + fsubp %st, %st(4) +#else + FLD 0 * SIZE - 16 * SIZE(A) + fsubp %st, %st(1) + FLD 1 * SIZE - 16 * SIZE(A) + fsubp %st, %st(2) + FLD 2 * SIZE - 16 * SIZE(A) + fsubp %st, %st(3) + FLD 3 * SIZE - 16 * SIZE(A) + fsubp %st, %st(4) +#endif + +#ifdef LN + FLD 0 * SIZE - 16 * SIZE(A) + fmul %st, %st(1) + fmul %st, %st(2) + fmul %st, %st(3) + fmulp %st, %st(4) +#endif + +#ifdef LT + FLD 0 * SIZE - 16 * SIZE(A) + 
fmul %st, %st(1) + fmul %st, %st(2) + fmul %st, %st(3) + fmulp %st, %st(4) +#endif + +#ifdef RN + FMUL 0 * SIZE - 16 * SIZE(B) + + FLD 1 * SIZE - 16 * SIZE(B) + fmul %st(1), %st + fsubrp %st, %st(2) + FLD 2 * SIZE - 16 * SIZE(B) + fmul %st(1), %st + fsubrp %st, %st(3) + FLD 3 * SIZE - 16 * SIZE(B) + fmul %st(1), %st + fsubrp %st, %st(4) + + FLD 5 * SIZE - 16 * SIZE(B) + fmulp %st, %st(2) + FLD 6 * SIZE - 16 * SIZE(B) + fmul %st(2), %st + fsubrp %st, %st(3) + FLD 7 * SIZE - 16 * SIZE(B) + fmul %st(2), %st + fsubrp %st, %st(4) + + FLD 10 * SIZE - 16 * SIZE(B) + fmulp %st, %st(3) + FLD 11 * SIZE - 16 * SIZE(B) + fmul %st(3), %st + fsubrp %st, %st(4) + + FLD 15 * SIZE - 16 * SIZE(B) + fmulp %st, %st(4) +#endif + +#ifdef RT + FLD 15 * SIZE - 16 * SIZE(B) + fmulp %st, %st(4) + + FLD 14 * SIZE - 16 * SIZE(B) + fmul %st(4), %st + fsubrp %st, %st(3) + FLD 13 * SIZE - 16 * SIZE(B) + fmul %st(4), %st + fsubrp %st, %st(2) + FLD 12 * SIZE - 16 * SIZE(B) + fmul %st(4), %st + fsubrp %st, %st(1) + + FLD 10 * SIZE - 16 * SIZE(B) + fmulp %st, %st(3) + FLD 9 * SIZE - 16 * SIZE(B) + fmul %st(3), %st + fsubrp %st, %st(2) + FLD 8 * SIZE - 16 * SIZE(B) + fmul %st(3), %st + fsubrp %st, %st(1) + + FLD 5 * SIZE - 16 * SIZE(B) + fmulp %st, %st(2) + FLD 4 * SIZE - 16 * SIZE(B) + fmul %st(2), %st + fsubrp %st, %st(1) + + FLD 0 * SIZE - 16 * SIZE(B) + fmulp %st, %st(1) +#endif + +#ifdef LN + subl $1 * SIZE, %edi +#endif + +#if defined(LN) || defined(LT) + FSTU 0 * SIZE - 16 * SIZE(B) + fxch %st(1) + FSTU 1 * SIZE - 16 * SIZE(B) + fxch %st(2) + FSTU 2 * SIZE - 16 * SIZE(B) + fxch %st(3) + FSTU 3 * SIZE - 16 * SIZE(B) +#else + FSTU 0 * SIZE - 16 * SIZE(A) + fxch %st(1) + FSTU 1 * SIZE - 16 * SIZE(A) + fxch %st(2) + FSTU 2 * SIZE - 16 * SIZE(A) + fxch %st(3) + FSTU 3 * SIZE - 16 * SIZE(A) +#endif + + leal (%edi, LDC, 2), %eax + + FST 0 * SIZE(%eax, LDC) + FST 0 * SIZE(%edi) + FST 0 * SIZE(%edi, LDC) + FST 0 * SIZE(%eax) + +#ifndef LN + addl $1 * SIZE, %edi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (A, %eax, 1), A + leal (B, %eax, 4), B +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl I + jne .L14 + +#ifdef LN + movl K, %eax + leal ( , %eax, SIZE), %eax + leal (B_ORIG, %eax, 4), B_ORIG +#endif +#if defined(LT) || defined(RN) + movl B, B_ORIG +#endif + +#ifdef RN + addl $4, KK +#endif + +#ifdef RT + subl $4, KK +#endif + + decl J + jne .L11 + ALIGN_4 + +.L20: + movl N, %eax + andl $2, %eax + je .L30 + +#if defined(LT) || defined(RN) + movl STACK_A, A +#else + movl STACK_A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B_ORIG +#endif + + leal (, LDC, 2), %eax +#ifdef RT + subl %eax, C +#endif + movl C, %edi +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl B_ORIG, B + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $4, %eax + jle .L23 + ALIGN_4 + +.L22: + movl -16 * SIZE(B), %esi + movl -8 * SIZE(B), %esi + movl 0 * SIZE(B), %esi + movl 8 * SIZE(B), %esi + subl $-32 * SIZE, B + decl %eax + jne .L22 + ALIGN_3 + +.L23: + movl M, %esi + movl %esi, I + ALIGN_3 + +.L24: +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + leal (, %eax, SIZE), %eax 
+ movl AORIG, A + leal (A , %eax, 1), A + leal (B_ORIG, %eax, 2), B +#else + movl B_ORIG, B +#endif + + fldz + fldz + fldz + fldz + + FLD -16 * SIZE(A) + FLD -16 * SIZE(B) + + prefetchw 1 * SIZE(%edi) + prefetchw 1 * SIZE(%edi, LDC) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L26 + ALIGN_3 + +.L25: + fmul %st(1), %st + faddp %st, %st(2) + + FMUL -15 * SIZE(B) + faddp %st, %st(2) + + FLD -15 * SIZE(A) + FLD -14 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + + FMUL -13 * SIZE(B) + faddp %st, %st(4) + + FLD -14 * SIZE(A) + FLD -12 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(2) + + FMUL -11 * SIZE(B) + faddp %st, %st(2) + + FLD -13 * SIZE(A) + FLD -10 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + + FMUL -9 * SIZE(B) + faddp %st, %st(4) + + FLD -12 * SIZE(A) + FLD -8 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(2) + + FMUL -7 * SIZE(B) + faddp %st, %st(2) + + FLD -11 * SIZE(A) + FLD -6 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + + FMUL -5 * SIZE(B) + faddp %st, %st(4) + + FLD -10 * SIZE(A) + FLD -4 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(2) + + FMUL -3 * SIZE(B) + faddp %st, %st(2) + + FLD -9 * SIZE(A) + FLD -2 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + + FMUL -1 * SIZE(B) + faddp %st, %st(4) + + FLD -8 * SIZE(A) + FLD 0 * SIZE(B) + + addl $ 8 * SIZE, A + subl $-16 * SIZE, B + + decl %eax + jne .L25 + ALIGN_4 + +.L26: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + and $7, %eax + je .L29 + ALIGN_4 + +.L27: + fmul %st(1), %st + faddp %st, %st(2) + + FMUL -15 * SIZE(B) + faddp %st, %st(2) + + FLD -15 * SIZE(A) + FLD -14 * SIZE(B) + + addl $1 * SIZE,A + addl $2 * SIZE,B + + decl %eax + jne .L27 + ALIGN_4 + +.L29: + ffreep %st(0) + ffreep %st(0) + + faddp %st, %st(2) + faddp %st, %st(2) + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + leal (, %eax, SIZE), %eax + + movl AORIG, A + leal (A, %eax, 1), A + leal (B_ORIG, %eax, 2), B +#endif + +#if defined(LN) || defined(LT) + FLD 0 * SIZE - 16 * SIZE(B) + fsubp %st, %st(1) + FLD 1 * SIZE - 16 * SIZE(B) + fsubp %st, %st(2) +#else + FLD 0 * SIZE - 16 * SIZE(A) + fsubp %st, %st(1) + FLD 1 * SIZE - 16 * SIZE(A) + fsubp %st, %st(2) +#endif + +#ifdef LN + FLD 0 * SIZE - 16 * SIZE(A) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef LT + FLD 0 * SIZE - 16 * SIZE(A) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef RN + FMUL 0 * SIZE - 16 * SIZE(B) + + FLD 1 * SIZE - 16 * SIZE(B) + fmul %st(1), %st + fsubrp %st, %st(2) + + FLD 3 * SIZE - 16 * SIZE(B) + fmulp %st, %st(2) +#endif + +#ifdef RT + FLD 3 * SIZE - 16 * SIZE(B) + fmulp %st, %st(2) + FLD 2 * SIZE - 16 * SIZE(B) + fmul %st(2), %st + fsubrp %st, %st(1) + + FLD 0 * SIZE - 16 * SIZE(B) + fmulp %st, %st(1) +#endif + +#ifdef LN + subl $1 * SIZE, %edi +#endif + +#if defined(LN) || defined(LT) + FSTU 0 * SIZE - 16 * SIZE(B) + fxch %st(1) + FSTU 1 * SIZE - 16 * SIZE(B) +#else + FSTU 0 * SIZE - 16 * SIZE(A) + fxch %st(1) + FSTU 1 * SIZE - 16 * SIZE(A) +#endif + + FST 0 * SIZE(%edi, LDC) + FST 0 * SIZE(%edi) + +#ifndef LN + addl $1 * SIZE, %edi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (A, %eax, 1), A + leal (B, %eax, 2), B +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + decl I + jne .L24 + +#ifdef 
LN + movl K, %eax + leal ( , %eax, SIZE), %eax + leal (B_ORIG, %eax, 2), B_ORIG +#endif +#if defined(LT) || defined(RN) + movl B, B_ORIG +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + ALIGN_4 + +.L30: + movl N, %eax + andl $1, %eax + je .L999 + ALIGN_3 + +.L31: +#if defined(LT) || defined(RN) + movl STACK_A, A +#else + movl STACK_A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, B_ORIG +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, %edi +#ifndef RT + addl LDC, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl B_ORIG, B + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $5, %eax + jle .L33 + ALIGN_4 + +.L32: + movl -16 * SIZE(B), %esi + movl -8 * SIZE(B), %esi + movl 0 * SIZE(B), %esi + movl 8 * SIZE(B), %esi + subl $-32 * SIZE, B + decl %eax + jne .L32 + ALIGN_3 + +.L33: + movl M, %esi + movl %esi, I + ALIGN_3 + +.L34: +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + leal (, %eax, SIZE), %eax + movl AORIG, A + leal (A , %eax, 1), A + leal (B_ORIG, %eax, 1), B +#else + movl B_ORIG, B +#endif + + fldz + fldz + fldz + fldz + + prefetchw 1 * SIZE(%edi) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L36 + ALIGN_3 + +.L35: + FLD -16 * SIZE(A) + FMUL -16 * SIZE(B) + faddp %st, %st(1) + + FLD -15 * SIZE(A) + FMUL -15 * SIZE(B) + faddp %st, %st(2) + + FLD -14 * SIZE(A) + FMUL -14 * SIZE(B) + faddp %st, %st(3) + + FLD -13 * SIZE(A) + FMUL -13 * SIZE(B) + faddp %st, %st(4) + + FLD -12 * SIZE(A) + FMUL -12 * SIZE(B) + faddp %st, %st(1) + + FLD -11 * SIZE(A) + FMUL -11 * SIZE(B) + faddp %st, %st(2) + + FLD -10 * SIZE(A) + FMUL -10 * SIZE(B) + faddp %st, %st(3) + + FLD -9 * SIZE(A) + FMUL -9 * SIZE(B) + faddp %st, %st(4) + + addl $8 * SIZE, A + addl $8 * SIZE, B + + decl %eax + jne .L35 + ALIGN_4 + +.L36: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + and $7, %eax + je .L39 + ALIGN_4 + +.L37: + FLD -16 * SIZE(A) + FMUL -16 * SIZE(B) + faddp %st, %st(1) + + addl $1 * SIZE,A + addl $1 * SIZE,B + decl %eax + jne .L37 + ALIGN_4 + +.L39: + faddp %st, %st(2) + faddp %st, %st(2) + faddp %st, %st(1) + +#if defined(LN) || defined(RT) + movl KK, %eax + subl $1, %eax + + movl AORIG, A + leal (A, %eax, SIZE), A + leal (B_ORIG, %eax, SIZE), B +#endif + +#if defined(LN) || defined(LT) + FLD 0 * SIZE - 16 * SIZE(B) + fsubp %st, %st(1) +#else + FLD 0 * SIZE - 16 * SIZE(A) + fsubp %st, %st(1) +#endif + +#if defined(LN) || defined(LT) + FLD 0 * SIZE - 16 * SIZE(A) + fmulp %st, %st(1) +#endif + +#if defined(RN) || defined(RT) + FMUL 0 * SIZE - 16 * SIZE(B) +#endif + +#ifdef LN + subl $1 * SIZE, %edi +#endif + +#if defined(LN) || defined(LT) + FSTU 0 * SIZE - 16 * SIZE(B) +#else + FSTU 0 * SIZE - 16 * SIZE(A) +#endif + + FST 0 * SIZE(%edi) + +#ifndef LN + addl $1 * SIZE, %edi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (A, %eax, SIZE), A + leal (B, %eax, SIZE), B +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + decl I + jne .L34 + +#ifdef LN + movl K, %eax + leal ( , %eax, SIZE), %eax + leal (B_ORIG, %eax, 1), B_ORIG +#endif +#if defined(LT) 
|| defined(RN) + movl B, B_ORIG +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_LT_2x2.S b/kernel/x86/trsm_kernel_LT_2x2.S new file mode 100644 index 0000000000..d21909d669 --- /dev/null +++ b/kernel/x86/trsm_kernel_LT_2x2.S @@ -0,0 +1,1104 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) +#define AORIG 12 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#ifdef DOUBLE +#define A 24 + STACK + ARGS(%esp) +#define B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) +#else +#define A 20 + STACK + ARGS(%esp) +#define B 24 + STACK + ARGS(%esp) +#define C 28 + STACK + ARGS(%esp) +#define LDC 32 + STACK + ARGS(%esp) +#define OFFSET 36 + STACK + ARGS(%esp) +#endif + +#define PREFETCH_OFFSET 48 + +#if defined(PENTIUM3) || defined(PENTIUMM) +#define REP rep +#else +#define REP rep +#endif + +#define AA %edx +#define BB %ecx + + PROLOGUE + + subl $ARGS, %esp # Generate Stack Frame + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl LDC, %ebp # ldc # MEMORY + movl B, %ebx + leal (, %ebp, SIZE), %ebp + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, %ebx + + movl N, %eax + imull %ebp, %eax + addl %eax, C +#endif + +#ifdef RN + movl OFFSET, %eax + negl %eax + movl %eax, KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax # j = (n >> 1) # MEMORY + sarl $1, %eax + movl %eax, J # j = (n >> 1) # MEMORY + je .L8 + ALIGN_4 + +.L34: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, %ebx +#endif + lea (, %ebp, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, %edi +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %esi + sarl $1, %esi + je .L12 + ALIGN_4 + +.MainHead: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + leal (, %eax, SIZE), %eax + movl AORIG, AA + leal (AA, %eax, 2), AA + leal (%ebx, %eax, 2), BB +#else + movl %ebx, BB +#endif + + fldz + fldz + fldz + fldz + + FLD 4 * SIZE(BB) # b5 + FLD 4 * SIZE(AA) # a5 + FLD 0 * SIZE(BB) # b1 + FLD 0 * SIZE(AA) # a1 + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(%edi) + prefetchw 2 * SIZE(%edi, %ebp, 1) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(%edi) + prefetchnta 2 * SIZE(%edi, %ebp, 1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L16 + ALIGN_4 + +.MainLoop: +#if defined(HAVE_3DNOW) + prefetch (PREFETCH_OFFSET) * SIZE(BB) + nop +#elif defined(HAVE_SSE) + prefetchnta (PREFETCH_OFFSET) * SIZE(BB) +#if (L2_SIZE == 524288) + prefetcht0 (PREFETCH_OFFSET) * SIZE(AA) +#endif +#endif + + fmul %st, %st(1) + FMUL 1 * SIZE(BB) + fxch %st(1) + faddp %st, %st(4) + FLD 0 * SIZE(BB) + fxch %st(1) + faddp %st, %st(5) + FLD 1 * SIZE(AA) + fmul %st, %st(1) + FMUL 1 * SIZE(BB) + fxch %st(1) + faddp %st, %st(6) + FLD 2 * SIZE(BB) + fxch %st(1) + faddp %st, %st(7) + FLD 2 * SIZE(AA) + + fmul %st, %st(1) + FMUL 3 * SIZE(BB) + fxch %st(1) + faddp %st, %st(4) + FLD 2 * SIZE(BB) + fxch %st(1) + faddp %st, %st(5) + FLD 3 
* SIZE(AA) + fmul %st, %st(1) + FMUL 3 * SIZE(BB) + fxch %st(1) + faddp %st, %st(6) + FLD 8 * SIZE(BB) + fxch %st(1) + faddp %st, %st(7) + FLD 8 * SIZE(AA) + fxch %st(2) + +#if !defined(HAVE_3DNOW) && defined(HAVE_SSE) && defined(DOUBLE) + prefetchnta (PREFETCH_OFFSET + 4) * SIZE(BB) +#if (L2_SIZE == 524288) + prefetcht0 (PREFETCH_OFFSET + 4) * SIZE(AA) +#endif +#endif + + fmul %st, %st(3) + FMUL 5 * SIZE(BB) + fxch %st(3) + faddp %st, %st(4) + FLD 4 * SIZE(BB) + fxch %st(3) + faddp %st, %st(5) + FLD 5 * SIZE(AA) + fmul %st, %st(3) + FMUL 5 * SIZE(BB) + fxch %st(3) + faddp %st, %st(6) + FLD 6 * SIZE(BB) + fxch %st(3) + faddp %st, %st(7) + FLD 6 * SIZE(AA) + + fmul %st, %st(3) + FMUL 7 * SIZE(BB) + fxch %st(3) + faddp %st, %st(4) + FLD 6 * SIZE(BB) + fxch %st(3) + faddp %st, %st(5) + FLD 7 * SIZE(AA) + fmul %st, %st(3) + FMUL 7 * SIZE(BB) + fxch %st(3) + faddp %st, %st(6) + FLD 12 * SIZE(BB) + fxch %st(3) + faddp %st, %st(7) + FLD 12 * SIZE(AA) + fxch %st(2) + + subl $-8 * SIZE, BB + subl $-8 * SIZE, AA + decl %eax # l -- + jne .MainLoop + ALIGN_4 + +.L16: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + and $3, %eax + je .L21 + ALIGN_4 + +.SubLoop: + fmul %st, %st(1) + FMUL 1 * SIZE(BB) + fxch %st(1) + faddp %st, %st(4) + FLD 0 * SIZE(BB) + fxch %st(1) + faddp %st, %st(5) + FLD 1 * SIZE(AA) + fmul %st, %st(1) + FMUL 1 * SIZE(BB) + fxch %st(1) + faddp %st, %st(6) + FLD 2 * SIZE(BB) + fxch %st(1) + faddp %st, %st(7) + FLD 2 * SIZE(AA) + + addl $2 * SIZE,BB + addl $2 * SIZE,AA + decl %eax + jne .SubLoop + ALIGN_4 + +.L21: + ffreep %st(0) + ffreep %st(0) + ffreep %st(0) + ffreep %st(0) + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + leal (, %eax, SIZE), %eax + + movl AORIG, AA + leal (AA, %eax, 2), AA + leal (%ebx, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + FLD 0 * SIZE(BB) + fsubp %st, %st(1) + FLD 1 * SIZE(BB) + fsubp %st, %st(2) + FLD 2 * SIZE(BB) + fsubp %st, %st(3) + FLD 3 * SIZE(BB) + fsubp %st, %st(4) +#else + FLD 0 * SIZE(AA) + fsubp %st, %st(1) + FLD 1 * SIZE(AA) + fsubp %st, %st(3) + FLD 2 * SIZE(AA) + fsubp %st, %st(2) + FLD 3 * SIZE(AA) + fsubp %st, %st(4) +#endif + +#ifdef LN + FLD 3 * SIZE(AA) + fmul %st, %st(3) + fmulp %st, %st(4) + + FLD 2 * SIZE(AA) + fmul %st(3), %st + FLD 2 * SIZE(AA) + fmul %st(5), %st + + fsubrp %st, %st(3) + fsubrp %st, %st(1) + + FLD 0 * SIZE(AA) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef LT + FLD 0 * SIZE(AA) + fmul %st, %st(1) + fmulp %st, %st(2) + + FLD 1 * SIZE(AA) + fmul %st(1), %st + FLD 1 * SIZE(AA) + fmul %st(3), %st + + fsubrp %st, %st(5) + fsubrp %st, %st(3) + + FLD 3 * SIZE(AA) + fmul %st, %st(3) + fmulp %st, %st(4) +#endif + +#ifdef RN + FLD 0 * SIZE(BB) + fmul %st, %st(1) + fmulp %st, %st(3) + + FLD 1 * SIZE(BB) + fmul %st(1), %st + FLD 1 * SIZE(BB) + fmul %st(4), %st + + fsubrp %st, %st(5) + fsubrp %st, %st(2) + + FLD 3 * SIZE(BB) + fmul %st, %st(2) + fmulp %st, %st(4) +#endif + +#ifdef RT + FLD 3 * SIZE(BB) + fmul %st, %st(2) + fmulp %st, %st(4) + + FLD 2 * SIZE(BB) + fmul %st(2), %st + FLD 2 * SIZE(BB) + fmul %st(5), %st + + fsubrp %st, %st(4) + fsubrp %st, %st(1) + + FLD 0 * SIZE(BB) + fmul %st, %st(1) + fmulp %st, %st(3) +#endif + +#ifdef LN + subl $2 * SIZE, %edi +#endif + +#if defined(LN) || defined(LT) + FSTU 0 * SIZE(BB) + fxch %st(1) + FSTU 1 * SIZE(BB) + fxch %st(2) + FSTU 2 * SIZE(BB) + fxch %st(3) + FSTU 3 * SIZE(BB) + + FST 1 * SIZE(%edi,%ebp) + FST 0 * SIZE(%edi) + FST 0 * SIZE(%edi,%ebp) + 
FST 1 * SIZE(%edi) +#else + FSTU 0 * SIZE(AA) + fxch %st(2) + FSTU 1 * SIZE(AA) + fxch %st(1) + FSTU 2 * SIZE(AA) + fxch %st(3) + FSTU 3 * SIZE(AA) + + FST 1 * SIZE(%edi,%ebp) + FST 1 * SIZE(%edi) + FST 0 * SIZE(%edi) + FST 0 * SIZE(%edi,%ebp) +#endif + +#ifndef LN + addl $2 * SIZE, %edi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %esi # i -- + jne .MainHead + ALIGN_4 + +.L12: + movl M, %eax # m # MEMORY + andl $1, %eax + je .L27 + +#ifdef LN + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + leal (, %eax, SIZE), %eax + movl AORIG, AA + leal (AA, %eax, 1), AA + leal (%ebx, %eax, 2), BB +#else + movl %ebx, BB +#endif + + fldz + fldz + + FLD 0 * SIZE(AA) # temp1 = *(aoffset + 0) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $1,%eax # k >> 1 # MEMORY + je .L54 + ALIGN_4 + +.L55: + FLD 0 * SIZE(BB) # temp2 = *(boffset + 0) + rep + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 1 * SIZE(BB) # temp2 = *(boffset + 0) + faddp %st, %st(2) + FLD 1 * SIZE(AA) # temp1 = *(aoffset + 0) + + FLD 2 * SIZE(BB) # temp2 = *(boffset + 0) + rep + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 3 * SIZE(BB) # temp2 = *(boffset + 0) + faddp %st, %st(2) + FLD 2 * SIZE(AA) # temp1 = *(aoffset + 0) + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jne .L55 + ALIGN_4 + +.L54: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $1,%eax # k & 1 + je .L33 + ALIGN_4 + + FLD 0 * SIZE(BB) # temp2 = *(boffset + 0) + rep + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 1 * SIZE(BB) # temp2 = *(boffset + 0) + faddp %st, %st(2) + FLD 1 * SIZE(AA) # temp1 = *(aoffset + 0) + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + ALIGN_4 + +.L33: + ffreep %st(0) + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + leal (, %eax, SIZE), %eax + + movl AORIG, AA + leal (AA, %eax, 1), AA + leal (%ebx, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + FLD 0 * SIZE(BB) + fsubp %st, %st(1) + FLD 1 * SIZE(BB) + fsubp %st, %st(2) +#else + FLD 0 * SIZE(AA) + fsubp %st, %st(1) + FLD 1 * SIZE(AA) + fsubp %st, %st(2) +#endif + +#if defined(LN) || defined(LT) + FLD 0 * SIZE(AA) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef RN + FLD 0 * SIZE(BB) + fmulp %st, %st(1) + + FLD 1 * SIZE(BB) + fmul %st(1), %st + + fsubrp %st, %st(2) + + FLD 3 * SIZE(BB) + fmulp %st, %st(2) +#endif + +#ifdef RT + FLD 3 * SIZE(BB) + fmulp %st, %st(2) + + FLD 2 * SIZE(BB) + fmul %st(2), %st + + fsubrp %st, %st(1) + + FLD 0 * SIZE(BB) + fmulp %st, %st(1) +#endif + +#ifdef LN + subl $1 * SIZE, %edi +#endif + +#if defined(LN) || defined(LT) + FSTU 0 * SIZE(BB) + fxch %st(1) + FSTU 1 * SIZE(BB) +#else + FSTU 0 * SIZE(AA) + fxch %st(1) + FSTU 1 * SIZE(AA) +#endif + + FST 0 * SIZE(%edi,%ebp) + FST 0 * SIZE(%edi) + +#ifndef LN + addl $1 * SIZE, %edi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $0 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + 
ALIGN_4 + +.L27: +#ifdef LN + movl K, %eax + leal ( , %eax, SIZE), %eax + leal (%ebx, %eax, 2), %ebx +#endif +#if defined(LT) || defined(RN) + movl BB, %ebx +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J # j-- # MEMORY + jne .L34 + ALIGN_4 + +.L8: + movl N, %eax # n # MEMORY + andl $1, %eax + je .End + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, %ebx +#endif + +#ifdef RT + subl %ebp, C +#endif + movl C, %edi # c # MEMORY +#ifndef RT + addl %ebp, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %esi # m # MEMORY + sarl $1, %esi # m >> 1 + je .L36 + ALIGN_4 + +.L46: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + leal (, %eax, SIZE), %eax + movl AORIG, AA + leal (AA, %eax, 2), AA + leal (%ebx, %eax, 1), BB +#else + movl %ebx, BB +#endif + + fldz + fldz + FLD 0 * SIZE(BB) # temp1 = *(boffset + 0) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $1, %eax + je .L56 + ALIGN_4 + +.L57: + FLD 0 * SIZE(AA) # temp2 = *(aoffset + 0) + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 1 * SIZE(AA) # temp2 = *(aoffset + 0) + faddp %st, %st(2) + FLD 1 * SIZE(BB) # temp1 = *(boffset + 0) + + FLD 2 * SIZE(AA) # temp2 = *(aoffset + 0) + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 3 * SIZE(AA) # temp2 = *(aoffset + 0) + faddp %st, %st(2) + FLD 2 * SIZE(BB) # temp1 = *(boffset + 0) + + addl $4 * SIZE,AA + addl $2 * SIZE,BB + dec %eax + jne .L57 + ALIGN_4 + +.L56: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $1, %eax + je .L45 + ALIGN_4 + + FLD 0 * SIZE(AA) # temp2 = *(aoffset + 0) + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 1 * SIZE(AA) # temp2 = *(aoffset + 0) + faddp %st, %st(2) + FLD 3 * SIZE(BB) # temp1 = *(boffset + 0) + + addl $2 * SIZE,AA + addl $1 * SIZE,BB + ALIGN_4 + +.L45: + ffreep %st(0) + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + leal (, %eax, SIZE), %eax + + movl AORIG, AA + leal (AA, %eax, 2), AA + leal (%ebx, %eax, 1), BB +#endif + +#if defined(LN) || defined(LT) + FLD 0 * SIZE(BB) + fsubp %st, %st(1) + FLD 1 * SIZE(BB) + fsubp %st, %st(2) +#else + FLD 0 * SIZE(AA) + fsubp %st, %st(1) + FLD 1 * SIZE(AA) + fsubp %st, %st(2) +#endif + +#ifdef LN + FLD 3 * SIZE(AA) + fmulp %st, %st(2) + + FLD 2 * SIZE(AA) + fmul %st(2), %st + + fsubrp %st, %st(1) + FLD 0 * SIZE(AA) + fmulp %st, %st(1) +#endif + +#ifdef LT + FLD 0 * SIZE(AA) + fmulp %st, %st(1) + + FLD 1 * SIZE(AA) + fmul %st(1), %st + + fsubrp %st, %st(2) + + FLD 3 * SIZE(AA) + fmulp %st, %st(2) +#endif + +#ifdef RN + FLD 0 * SIZE(BB) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef RT + FLD 0 * SIZE(BB) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef LN + subl $2 * SIZE, %edi +#endif + +#if defined(LN) || defined(LT) + FSTU 0 * SIZE(BB) + fxch %st(1) + FSTU 1 * SIZE(BB) +#else + FSTU 0 * SIZE(AA) + fxch %st(1) + FSTU 1 * SIZE(AA) +#endif + + FST 1 * SIZE(%edi) + FST 0 * SIZE(%edi) + +#ifndef LN + addl $2 * SIZE, %edi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 1), BB +#endif + +#ifdef LN + subl $2, KK +#endif + 
+#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %esi # i -- + jne .L46 + ALIGN_4 + +.L36: + movl M, %eax # m # MEMORY + andl $1, %eax # m & 1 + je .L99 + +#ifdef LN + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + leal (, %eax, SIZE), %eax + movl AORIG, AA + leal (AA, %eax, 1), AA + leal (%ebx, %eax, 1), BB +#else + movl %ebx, BB +#endif + + fldz + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + test %eax, %eax + jle .L52 + ALIGN_3 + +.L51: + FLD (AA) + FMUL (BB) + addl $1 * SIZE,AA + addl $1 * SIZE,BB + faddp %st,%st(1) + decl %eax + jne .L51 + ALIGN_4 + +.L52: + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + leal (, %eax, SIZE), %eax + + movl AORIG, AA + leal (AA, %eax, 1), AA + leal (%ebx, %eax, 1), BB +#endif + +#if defined(LN) || defined(LT) + FLD 0 * SIZE(BB) + fsubp %st, %st(1) +#else + FLD 0 * SIZE(AA) + fsubp %st, %st(1) +#endif + +#if defined(LN) || defined(LT) + FMUL 0 * SIZE(AA) +#else + FMUL 0 * SIZE(BB) +#endif + +#ifdef LN + subl $1 * SIZE, %edi +#endif + +#if defined(LN) || defined(LT) + FSTU 0 * SIZE(BB) +#else + FSTU 0 * SIZE(AA) +#endif + + FST 0 * SIZE(%edi) + +#ifndef LN + addl $1 * SIZE, %edi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 1), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $0 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L99: +#ifdef LN + movl K, %eax + leal (%ebx, %eax, SIZE), %ebx +#endif +#if defined(LT) || defined(RN) + movl BB, %ebx +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.End: + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_LT_2x2_atom.S b/kernel/x86/trsm_kernel_LT_2x2_atom.S new file mode 100644 index 0000000000..3835005319 --- /dev/null +++ b/kernel/x86/trsm_kernel_LT_2x2_atom.S @@ -0,0 +1,1145 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#define A 24 + STACK + ARGS(%esp) +#define ARG_B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define ARG_LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) +#define AORIG 12 + STACK(%esp) + +#define PREFETCH prefetcht0 +#define PREFETCHSIZE 84 + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi +#define CO1 %esi + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + + movl OFFSET, %eax +#ifdef RN + negl %eax +#endif + movl %eax, KK + + leal (, LDC, SIZE), LDC + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + sarl $1, %eax + movl %eax, J + jle .L30 + ALIGN_2 + +.L10: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $1, %ebx + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + + xorps %xmm4, %xmm4 + prefetcht0 3 * SIZE(CO1) + xorps %xmm5, %xmm5 + prefetcht0 3 * SIZE(CO1, LDC) + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addsd %xmm2, %xmm6 + movsd 1 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 0 * SIZE(BB), %xmm0 + 
addsd %xmm3, %xmm7 + mulsd 1 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 0 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + mulsd 1 * SIZE(BB), %xmm3 + + addsd %xmm2, %xmm6 + movsd 3 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm7 + mulsd 3 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 4 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 2 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + mulsd 3 * SIZE(BB), %xmm3 + + addsd %xmm2, %xmm6 + movsd 5 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 4 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm7 + mulsd 5 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 6 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 4 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + mulsd 5 * SIZE(BB), %xmm3 + + addsd %xmm2, %xmm6 + movsd 7 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm7 + mulsd 7 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 8 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 6 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + mulsd 7 * SIZE(BB), %xmm3 + + addl $8 * SIZE, BB + addl $8 * SIZE, AA + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + addsd %xmm2, %xmm6 + movsd 1 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 0 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm7 + mulsd 1 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 0 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + mulsd 1 * SIZE(BB), %xmm3 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + addsd %xmm2, %xmm6 + addsd %xmm3, %xmm7 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BB), %xmm0 + movsd 1 * SIZE(BB), %xmm1 + movsd 2 * SIZE(BB), %xmm2 + movsd 3 * SIZE(BB), %xmm3 + + subsd %xmm4, %xmm0 + subsd %xmm5, %xmm1 + subsd %xmm6, %xmm2 + subsd %xmm7, %xmm3 +#else + movsd 0 * SIZE(AA), %xmm0 + movsd 1 * SIZE(AA), %xmm2 + movsd 2 * SIZE(AA), %xmm1 + movsd 3 * SIZE(AA), %xmm3 + + subsd %xmm4, %xmm0 + subsd %xmm6, %xmm2 + subsd %xmm5, %xmm1 + subsd %xmm7, %xmm3 +#endif + +#ifdef LN + movsd 3 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + movsd 2 * SIZE(AA), %xmm5 + mulsd %xmm4, %xmm3 + movsd 0 * SIZE(AA), %xmm7 + + movaps %xmm5, %xmm6 + mulsd %xmm2, %xmm5 + mulsd %xmm3, %xmm6 + subsd %xmm5, %xmm0 + subsd %xmm6, %xmm1 + mulsd %xmm7, %xmm0 + mulsd %xmm7, %xmm1 +#endif + +#ifdef LT + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + movsd 1 * SIZE(AA), %xmm5 + mulsd %xmm4, %xmm1 + movsd 3 * SIZE(AA), %xmm7 + + movaps %xmm5, %xmm6 + mulsd %xmm0, %xmm5 + mulsd %xmm1, %xmm6 + subsd %xmm5, %xmm2 + subsd %xmm6, %xmm3 + mulsd %xmm7, %xmm2 + mulsd %xmm7, %xmm3 +#endif + +#ifdef RN + movsd 0 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 + movsd 1 * SIZE(BB), %xmm5 + mulsd %xmm4, %xmm2 + movsd 3 * SIZE(BB), %xmm7 + + movaps %xmm5, %xmm6 + mulsd %xmm0, %xmm5 + mulsd %xmm2, %xmm6 + subsd %xmm5, %xmm1 + subsd %xmm6, %xmm3 + mulsd %xmm7, %xmm1 + mulsd %xmm7, %xmm3 +#endif + +#ifdef RT + movsd 3 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 + movsd 2 * SIZE(BB), %xmm5 + mulsd %xmm4, %xmm3 + movsd 0 * SIZE(BB), %xmm7 + + movaps %xmm5, %xmm6 + mulsd %xmm1, %xmm5 + mulsd %xmm3, %xmm6 + subsd %xmm5, %xmm0 + subsd 
%xmm6, %xmm2 + mulsd %xmm7, %xmm0 + mulsd %xmm7, %xmm2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BB) + movsd %xmm1, 1 * SIZE(BB) + movsd %xmm2, 2 * SIZE(BB) + movsd %xmm3, 3 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) + movsd %xmm2, 1 * SIZE(AA) + movsd %xmm1, 2 * SIZE(AA) + movsd %xmm3, 3 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm2, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC) + movsd %xmm3, 1 * SIZE(CO1, LDC) + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L11 + ALIGN_4 + +.L20: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L29 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L25 + ALIGN_4 + +.L22: + addsd %xmm2, %xmm4 + movsd 0 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm5 + movsd 1 * SIZE(BB), %xmm3 + + mulsd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulsd %xmm0, %xmm3 + movsd 1 * SIZE(AA), %xmm0 + + addsd %xmm2, %xmm4 + movsd 2 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm5 + movsd 3 * SIZE(BB), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + movsd 2 * SIZE(AA), %xmm0 + + addsd %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm5 + movsd 5 * SIZE(BB), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + movsd 3 * SIZE(AA), %xmm0 + + addsd %xmm2, %xmm4 + movsd 6 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm5 + movsd 7 * SIZE(BB), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + movsd 4 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax # if (k & 1) + BRANCH + je .L28 + ALIGN_3 + +.L26: + addsd %xmm2, %xmm4 + movsd 0 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm5 + movsd 1 * SIZE(BB), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + movsd 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: + addsd %xmm2, %xmm4 + addsd %xmm3, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BB), %xmm0 + movsd 1 * SIZE(BB), %xmm1 + + subsd %xmm4, %xmm0 + subsd %xmm5, %xmm1 +#else + movsd 0 * SIZE(AA), %xmm0 + movsd 1 * SIZE(AA), %xmm1 + + subsd %xmm4, %xmm0 + subsd %xmm5, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(AA), %xmm7 + + mulsd %xmm7, %xmm0 + mulsd %xmm7, %xmm1 +#endif + +#ifdef RN + movsd 0 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 + movsd 1 * SIZE(BB), %xmm5 + movaps %xmm5, %xmm6 + movsd 3 * SIZE(BB), %xmm7 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm1 + 
mulsd %xmm7, %xmm1 +#endif + +#ifdef RT + movsd 3 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 + movsd 2 * SIZE(BB), %xmm5 + movaps %xmm5, %xmm6 + movsd 0 * SIZE(BB), %xmm7 + mulsd %xmm1, %xmm5 + subsd %xmm5, %xmm0 + mulsd %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BB) + movsd %xmm1, 1 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) + movsd %xmm1, 1 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC) + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L29: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J # j -- + jg .L10 + ALIGN_4 + +.L30: + testl $1, N + je .L999 + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, B +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $1, %ebx + jle .L40 + ALIGN_4 + +.L31: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd 0 * SIZE(BB), %xmm1 + xorps %xmm0, %xmm0 + prefetcht0 3 * SIZE(CO1) + xorps %xmm2, %xmm2 + xorps %xmm4, %xmm4 + xorps %xmm6, %xmm6 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addsd %xmm0, %xmm4 + movsd 0 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm6 + movsd 1 * SIZE(AA), %xmm2 + mulsd %xmm1, %xmm0 + mulsd %xmm1, %xmm2 + movsd 1 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm6 + movsd 3 * SIZE(AA), %xmm2 + mulsd %xmm1, %xmm0 + mulsd %xmm1, %xmm2 + movsd 2 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 4 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm6 + movsd 5 * SIZE(AA), %xmm2 + mulsd %xmm1, %xmm0 + mulsd %xmm1, %xmm2 + movsd 3 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 6 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm6 + movsd 7 * SIZE(AA), %xmm2 + mulsd %xmm1, %xmm0 + mulsd %xmm1, %xmm2 + movsd 4 * SIZE(BB), %xmm1 + + addl $8 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax # if (k & 1) + BRANCH + je .L38 + ALIGN_3 + +.L36: + addsd %xmm0, %xmm4 + movsd 0 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm6 + movsd 1 * SIZE(AA), %xmm2 + mulsd %xmm1, %xmm0 + mulsd %xmm1, %xmm2 + movsd 1 * SIZE(BB), %xmm1 + + addl $2 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L36 + ALIGN_4 + +.L38: + addsd %xmm0, %xmm4 + addsd %xmm2, %xmm6 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + 
subl $1, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BB), %xmm0 + movsd 1 * SIZE(BB), %xmm2 + + subsd %xmm4, %xmm0 + subsd %xmm6, %xmm2 +#else + movsd 0 * SIZE(AA), %xmm0 + movsd 1 * SIZE(AA), %xmm2 + + subsd %xmm4, %xmm0 + subsd %xmm6, %xmm2 +#endif + +#ifdef LN + movsd 3 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + movsd 2 * SIZE(AA), %xmm5 + mulsd %xmm2, %xmm5 + movsd 0 * SIZE(AA), %xmm7 + subsd %xmm5, %xmm0 + mulsd %xmm7, %xmm0 +#endif + +#ifdef LT + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + movsd 1 * SIZE(AA), %xmm5 + mulsd %xmm0, %xmm5 + movsd 3 * SIZE(AA), %xmm7 + subsd %xmm5, %xmm2 + mulsd %xmm7, %xmm2 +#endif + +#if defined(RN) || defined(RT) + movsd 0 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 + mulsd %xmm4, %xmm2 +#endif + + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BB) + movsd %xmm2, 1 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) + movsd %xmm2, 1 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm2, 1 * SIZE(CO1) + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + addl %eax, BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L31 + ALIGN_4 + +.L40: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L49 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm2, %xmm2 + movsd 0 * SIZE(BB), %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L45 + ALIGN_4 + +.L42: + mulsd %xmm0, %xmm2 + movsd 1 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + movsd 1 * SIZE(BB), %xmm2 + + mulsd %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm5 + movsd 2 * SIZE(BB), %xmm2 + + mulsd %xmm0, %xmm2 + movsd 3 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + movsd 3 * SIZE(BB), %xmm2 + + mulsd %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm5 + movsd 4 * SIZE(BB), %xmm2 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jne .L42 + ALIGN_4 + +.L45: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + mulsd %xmm0, %xmm2 + movsd 1 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + movsd 1 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L46 + ALIGN_4 + +.L48: + addsd %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (B, %eax, 1), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BB), %xmm0 + subsd %xmm4, %xmm0 +#else + movsd 0 * SIZE(AA), %xmm0 + subsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + mulsd 0 * SIZE(AA), %xmm0 +#endif + +#if defined(RN) || defined(RT) + mulsd 0 * SIZE(BB), %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * 
SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + addl %eax, AA + addl %eax, BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L49: +#ifdef LN + movl K, %eax + leal (B, %eax, SIZE), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_LT_2x4_penryn.S b/kernel/x86/trsm_kernel_LT_2x4_penryn.S new file mode 100644 index 0000000000..55c69e49f2 --- /dev/null +++ b/kernel/x86/trsm_kernel_LT_2x4_penryn.S @@ -0,0 +1,2071 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#define A 24 + STACK + ARGS(%esp) +#define ARG_B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define ARG_LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) +#define AORIG 12 + STACK(%esp) + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 21 + 4) +#endif + +#ifdef NEHALEM +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 21 + 4) +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 2) +#endif + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi +#define CO1 %esi + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + + movl OFFSET, %eax +#ifdef RN + negl %eax +#endif + movl %eax, KK + + leal (, LDC, SIZE), LDC + + subl $-16 * SIZE, A + subl $-16 * SIZE, B + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + sarl $2, %eax + movl %eax, J + jle .L30 + ALIGN_4 + +.L10: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 4), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + leal (CO1, LDC, 2), %eax + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -16 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + +#ifdef LN + pxor %xmm4, %xmm4 + prefetcht0 -2 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 -2 * SIZE(CO1, LDC) + pxor %xmm6, %xmm6 + prefetcht0 -2 * SIZE(%eax) + pxor %xmm7, %xmm7 + prefetcht0 -2 * SIZE(%eax, LDC) +#else + pxor %xmm4, %xmm4 + prefetcht0 1 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 1 * SIZE(CO1, LDC) + pxor %xmm6, %xmm6 + prefetcht0 1 * SIZE(%eax) + pxor %xmm7, %xmm7 + prefetcht0 1 * SIZE(%eax, LDC) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addpd %xmm3, %xmm7 + movaps -14 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps -12 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), 
%xmm0 + + addpd %xmm3, %xmm7 + movaps -10 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps -8 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps -6 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps -4 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps -2 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps 0 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) + + addpd %xmm3, %xmm7 + movaps 2 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps 4 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -6 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps 6 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps 8 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps 10 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps 12 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -2 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps 14 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps 16 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + subl $-32 * SIZE, BB + mulpd %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + + subl $-16 * SIZE, AA + + subl $1, %eax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + addpd %xmm3, %xmm7 + movaps -14 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps -12 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + + movaps -14 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + + decl %eax + jg .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 4), BB +#endif + + addpd %xmm2, %xmm6 + addpd %xmm3, %xmm7 + + movaps %xmm4, %xmm0 + movsd %xmm5, %xmm4 + movsd %xmm0, %xmm5 + + movaps %xmm6, %xmm0 + movsd %xmm7, %xmm6 + movsd %xmm0, %xmm7 + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd %xmm6, %xmm1 + unpcklpd %xmm7, %xmm6 + unpckhpd %xmm7, %xmm1 + + movapd -16 * SIZE(BB), %xmm2 + 
movapd -14 * SIZE(BB), %xmm5 + movapd -12 * SIZE(BB), %xmm3 + movapd -10 * SIZE(BB), %xmm7 + + subpd %xmm4, %xmm2 + subpd %xmm6, %xmm5 + subpd %xmm0, %xmm3 + subpd %xmm1, %xmm7 +#else + movapd -16 * SIZE(AA), %xmm0 + movapd -14 * SIZE(AA), %xmm1 + movapd -12 * SIZE(AA), %xmm2 + movapd -10 * SIZE(AA), %xmm3 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 + subpd %xmm6, %xmm2 + subpd %xmm7, %xmm3 +#endif + +#ifdef LN + movddup -13 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 + mulpd %xmm4, %xmm7 + + movddup -14 * SIZE(AA), %xmm4 + movapd %xmm4, %xmm6 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm2 + mulpd %xmm7, %xmm6 + subpd %xmm6, %xmm5 + + movddup -16 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm5 + +#endif + +#ifdef LT + movddup -16 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm5 + + movddup -15 * SIZE(AA), %xmm4 + movapd %xmm4, %xmm6 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm3 + mulpd %xmm5, %xmm6 + subpd %xmm6, %xmm7 + + movddup -13 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 + mulpd %xmm4, %xmm7 +#endif + +#ifdef RN + movddup -16 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 + movddup -15 * SIZE(BB), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm1 + movddup -14 * SIZE(BB), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm2 + movddup -13 * SIZE(BB), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm3 + + movddup -11 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm1 + movddup -10 * SIZE(BB), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm2 + movddup -9 * SIZE(BB), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm3 + + movddup -6 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm2 + movddup -5 * SIZE(BB), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm3 + + movddup -1 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm3 +#endif + +#ifdef RT + movddup -1 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm3 + movddup -2 * SIZE(BB), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm2 + movddup -3 * SIZE(BB), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm1 + movddup -4 * SIZE(BB), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm0 + + movddup -6 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm2 + movddup -7 * SIZE(BB), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm1 + movddup -8 * SIZE(BB), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm0 + + movddup -11 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm1 + movddup -12 * SIZE(BB), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm0 + + movddup -16 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, -16 * SIZE(BB) + movapd %xmm5, -14 * SIZE(BB) + movapd %xmm3, -12 * SIZE(BB) + movapd %xmm7, -10 * SIZE(BB) +#else + movapd %xmm0, -16 * SIZE(AA) + movapd %xmm1, -14 * SIZE(AA) + movapd %xmm2, -12 * SIZE(AA) + movapd %xmm3, -10 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movsd %xmm3, 1 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm3, 1 * SIZE(CO1, LDC, 1) + movsd %xmm5, 0 * SIZE(CO1, LDC, 2) + movsd %xmm7, 1 * SIZE(CO1, LDC, 2) + movhpd %xmm5, 0 * SIZE(CO1, %eax, 1) + movhpd %xmm7, 1 * SIZE(CO1, %eax, 1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm1, 1 * SIZE(CO1, LDC, 1) + movsd %xmm2, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm2, 1 * SIZE(CO1, LDC, 2) + movsd %xmm3, 0 * SIZE(CO1, %eax, 1) + movhpd %xmm3, 1 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), 
BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L11 + ALIGN_4 + +.L20: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L29 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -16 * SIZE(BB), %xmm2 + pxor %xmm5, %xmm5 + movaps -14 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps -12 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps -10 * SIZE(BB), %xmm3 + + pshufd $0xee, %xmm0, %xmm1 + movaps -14 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm6 + movaps -8 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm7 + movaps -6 * SIZE(BB), %xmm3 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps -4 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps -2 * SIZE(BB), %xmm3 + + pshufd $0xee, %xmm0, %xmm1 + movaps -12 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm6 + movaps 0 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm7 + movaps 2 * SIZE(BB), %xmm3 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps 6 * SIZE(BB), %xmm3 + + pshufd $0xee, %xmm0, %xmm1 + movaps -10 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm6 + movaps 8 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm7 + movaps 10 * SIZE(BB), %xmm3 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps 12 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps 14 * SIZE(BB), %xmm3 + + pshufd $0xee, %xmm0, %xmm1 + movaps -8 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm7 + movaps 18 * SIZE(BB), %xmm3 + + subl $ -8 * SIZE, AA + subl $-32 * SIZE, BB + + subl $1, %eax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + BRANCH + je .L28 + +.L26: + pshufd $0x44, %xmm0, %xmm1 + movsd -15 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps -12 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps -10 * SIZE(BB), %xmm3 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + + decl %eax + jg .L26 + ALIGN_4 + +.L28: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BB), %xmm0 + movapd -14 * SIZE(BB), %xmm1 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 +#else + movapd -16 * SIZE(AA), %xmm1 + movapd -14 * SIZE(AA), %xmm3 + + subpd %xmm4, %xmm1 + subpd %xmm5, %xmm3 + + movapd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm1 + 
movapd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm3 +#endif + +#ifdef LN + movddup -16 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 +#endif + +#ifdef LT + movddup -16 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 +#endif + +#ifdef RN + movsd -16 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 + movsd -15 * SIZE(BB), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + movsd -14 * SIZE(BB), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm2 + movsd -13 * SIZE(BB), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm3 + + movsd -11 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 + movsd -10 * SIZE(BB), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm2 + movsd -9 * SIZE(BB), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm3 + + movsd -6 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm2 + movsd -5 * SIZE(BB), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm3 + + movsd -1 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm3 +#endif + +#ifdef RT + movsd -1 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm3 + movsd -2 * SIZE(BB), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm2 + movsd -3 * SIZE(BB), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm1 + movsd -4 * SIZE(BB), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm0 + + movsd -6 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm2 + movsd -7 * SIZE(BB), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm1 + movsd -8 * SIZE(BB), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm0 + + movsd -11 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 + movsd -12 * SIZE(BB), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + movsd -16 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm1, -14 * SIZE(BB) +#else + movsd %xmm0, -16 * SIZE(AA) + movsd %xmm1, -15 * SIZE(AA) + movsd %xmm2, -14 * SIZE(AA) + movsd %xmm3, -13 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 0 * SIZE(CO1, LDC, 1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm1, 0 * SIZE(CO1, %eax, 1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) + movsd %xmm2, 0 * SIZE(CO1, LDC, 2) + movsd %xmm3, 0 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L29: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 4), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $4, KK +#endif + +#ifdef RT + subl $4, KK +#endif + + decl J # j -- + jg .L10 + ALIGN_4 + +.L30: + testl $2, N + je .L60 + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L50 + ALIGN_4 + +.L41: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, 
BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 +#ifdef LN + prefetcht0 -2 * SIZE(CO1) + pxor %xmm6, %xmm6 + prefetcht0 -2 * SIZE(CO1, LDC) + pxor %xmm7, %xmm7 +#else + prefetcht0 1 * SIZE(CO1) + pxor %xmm6, %xmm6 + prefetcht0 1 * SIZE(CO1, LDC) + pxor %xmm7, %xmm7 +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L45 + ALIGN_4 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -14 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm7 + movaps -12 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -10 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm7 + movaps -8 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -6 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -6 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm7 + movaps -4 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -2 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -2 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm7 + movaps 0 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + + subl $-16 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L42 + ALIGN_4 + +.L45: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -14 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L46 + ALIGN_4 + +.L48: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + movaps %xmm4, %xmm0 + movsd %xmm5, %xmm4 + movsd %xmm0, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd -16 * SIZE(BB), %xmm2 + movapd -14 * SIZE(BB), %xmm3 + + subpd %xmm4, %xmm2 + subpd %xmm0, %xmm3 +#else + movapd -16 * SIZE(AA), %xmm0 + movapd -14 * SIZE(AA), %xmm1 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 +#endif + +#ifdef LN + movddup -13 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 + + movddup -14 * SIZE(AA), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm2 + + movddup -16 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + +#endif + +#ifdef LT + movddup -16 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + + movddup -15 * 
SIZE(AA), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm3 + + movddup -13 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 +#endif + +#ifdef RN + movddup -16 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 + + movddup -15 * SIZE(BB), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm1 + + movddup -13 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm1 +#endif + +#ifdef RT + movddup -13 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm1 + + movddup -14 * SIZE(BB), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm0 + + movddup -16 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, -16 * SIZE(BB) + movapd %xmm3, -14 * SIZE(BB) +#else + movapd %xmm0, -16 * SIZE(AA) + movapd %xmm1, -14 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movsd %xmm3, 1 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm3, 1 * SIZE(CO1, LDC, 1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm1, 1 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L41 + ALIGN_4 + +.L50: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L59 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -16 * SIZE(BB), %xmm2 + pxor %xmm5, %xmm5 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -14 * SIZE(BB), %xmm2 + + pshufd $0xee, %xmm0, %xmm1 + movaps -14 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm5 + movaps -12 * SIZE(BB), %xmm2 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -10 * SIZE(BB), %xmm2 + + pshufd $0xee, %xmm0, %xmm1 + movaps -12 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm5 + movaps -8 * SIZE(BB), %xmm2 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -6 * SIZE(BB), %xmm2 + + pshufd $0xee, %xmm0, %xmm1 + movaps -10 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm5 + movaps -4 * SIZE(BB), %xmm2 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -2 * SIZE(BB), %xmm2 + + pshufd $0xee, %xmm0, %xmm1 + movaps -8 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + + subl $ -8 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + BRANCH + je .L58 + +.L56: + pshufd $0x44, %xmm0, %xmm1 + movsd -15 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -14 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: + addpd %xmm5, %xmm4 + +#if 
defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (B, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BB), %xmm0 + + subpd %xmm4, %xmm0 +#else + movapd -16 * SIZE(AA), %xmm1 + + subpd %xmm4, %xmm1 + + movapd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm1 +#endif + +#ifdef LN + movddup -16 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#ifdef LT + movddup -16 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#ifdef RN + movsd -16 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 + + movsd -15 * SIZE(BB), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + + movsd -13 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 +#endif + +#ifdef RT + movsd -13 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 + + movsd -14 * SIZE(BB), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + movsd -16 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, -16 * SIZE(BB) +#else + movsd %xmm0, -16 * SIZE(AA) + movsd %xmm1, -15 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 0 * SIZE(CO1, LDC, 1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L59: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + ALIGN_4 + +.L60: + testl $1, N + je .L999 + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, B +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L80 + ALIGN_4 + +.L71: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 +#ifdef LN + prefetcht0 -2 * SIZE(CO1) +#else + prefetcht0 1 * SIZE(CO1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm1, %xmm2 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + pshufd $0xee, %xmm1, %xmm2 + movaps -14 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + + pshufd $0x44, %xmm1, %xmm2 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + pshufd $0xee, %xmm1, %xmm2 + movaps -12 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + addpd 
%xmm2, %xmm5 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) + + pshufd $0x44, %xmm1, %xmm2 + mulpd %xmm0, %xmm2 + movaps -6 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + pshufd $0xee, %xmm1, %xmm2 + movaps -10 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + + pshufd $0x44, %xmm1, %xmm2 + mulpd %xmm0, %xmm2 + movaps -2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + pshufd $0xee, %xmm1, %xmm2 + movaps -8 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + + subl $-16 * SIZE, AA + subl $ -8 * SIZE, BB + + subl $1, %eax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L78 + ALIGN_3 + +.L76: + pshufd $0x44, %xmm1, %xmm2 + movsd -15 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + addl $2 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: + addpd %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), BB +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BB), %xmm1 + + subpd %xmm4, %xmm1 + + movapd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm1 +#else + movapd -16 * SIZE(AA), %xmm0 + + subpd %xmm4, %xmm0 +#endif + +#ifdef LN + movsd -13 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm1 + + movsd -14 * SIZE(AA), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + movsd -16 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + +#endif + +#ifdef LT + movsd -16 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + movsd -15 * SIZE(AA), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + + movsd -13 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm1 +#endif + +#ifdef RN + movddup -16 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#ifdef RT + movddup -16 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, -16 * SIZE(BB) + movsd %xmm1, -15 * SIZE(BB) +#else + movapd %xmm0, -16 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 1 * SIZE(CO1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + addl %eax, BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L71 + ALIGN_4 + +.L80: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L89 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -16 * SIZE(BB), %xmm2 + pxor %xmm5, %xmm5 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L85 + ALIGN_4 + +.L82: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + movaps -14 * SIZE(BB), %xmm2 + + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + 
movaps -12 * SIZE(BB), %xmm2 + + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + movaps -10 * SIZE(BB), %xmm2 + + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + movaps -8 * SIZE(BB), %xmm2 + + subl $-8 * SIZE, AA + subl $-8 * SIZE, BB + decl %eax + jne .L82 + ALIGN_4 + +.L85: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + BRANCH + je .L88 + +.L86: + mulsd %xmm0, %xmm2 + movsd -15 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + movsd -15 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L86 + ALIGN_4 + +.L88: + addpd %xmm5, %xmm4 + haddpd %xmm4, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (B, %eax, 1), BB +#endif + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(BB), %xmm0 + subsd %xmm4, %xmm0 +#else + movsd -16 * SIZE(AA), %xmm0 + subsd %xmm4, %xmm0 +#endif + +#ifdef LN + movsd -16 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#ifdef LT + movsd -16 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#ifdef RN + movsd -16 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#ifdef RT + movsd -16 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, -16 * SIZE(BB) +#else + movsd %xmm0, -16 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) +#else + movsd %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + addl %eax, AA + addl %eax, BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L89: +#ifdef LN + movl K, %eax + leal (B, %eax, SIZE), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_LT_2x4_sse2.S b/kernel/x86/trsm_kernel_LT_2x4_sse2.S new file mode 100644 index 0000000000..e4f59819b3 --- /dev/null +++ b/kernel/x86/trsm_kernel_LT_2x4_sse2.S @@ -0,0 +1,2583 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define OLD_M 4 + STACK + ARGS(%esi) +#define OLD_N 8 + STACK + ARGS(%esi) +#define OLD_K 12 + STACK + ARGS(%esi) +#define OLD_ALPHA 16 + STACK + ARGS(%esi) +#define OLD_A 24 + STACK + ARGS(%esi) +#define OLD_B 28 + STACK + ARGS(%esi) +#define OLD_C 32 + STACK + ARGS(%esi) +#define OLD_LDC 36 + STACK + ARGS(%esi) +#define OLD_OFFT 40 + STACK + ARGS(%esi) + +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define AORIG 56(%esp) +#define BORIG 60(%esp) +#define BUFFER 128(%esp) + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#if defined(OPTERON) || defined(BARCELONA) +#define PREFETCH prefetch +#define PREFETCHSIZE (8 * 10 + 4) +#endif + +#define B %edi +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define CO1 %esi + +#define KERNEL1(address) \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm4; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + movapd 2 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 6 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 16 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 10 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 12 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + mulpd 14 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm3, %xmm6; \ + movapd 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm0, %xmm7; \ + movapd 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm4; \ + movapd 18 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 20 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 22 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL4(address) \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 26 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, 
%xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 28 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + mulpd 30 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm3, %xmm6; \ + movapd 40 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm0, %xmm7; \ + movapd 16 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL5(address) \ + PREFETCH (PREFETCHSIZE + 8) * SIZE + (address) * 1 * SIZE(AA); \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm4; \ + movapd 34 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + mulpd 38 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm2, %xmm6; \ + movapd 48 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm1, %xmm7; \ + movapd 10 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL6(address) \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 42 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 44 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + mulpd 46 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 12 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL7(address) \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm4; \ + movapd 50 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 52 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + mulpd 54 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm2, %xmm6; \ + movapd 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm1, %xmm7; \ + movapd 14 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 58 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 60 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + mulpd 62 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 72 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movl OLD_M, %ebx + movl OLD_N, %eax + movl OLD_K, %ecx + movl OLD_A, %edx + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK + movd OLD_OFFT, %mm4 + + movl OLD_B, B + movl OLD_C, %ebx + + movl %ebx, C + movl OLD_LDC, LDC + + movd %mm4, OFFSET + movd %mm4, KK + + leal (, LDC, SIZE), LDC + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + sarl $2, %eax + movl %eax, J + jle .L30 + ALIGN_2 + +.L01: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, BB + +#ifdef RT + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + leal (, %eax, 
SIZE), %eax + leal (B, %eax, 4), B + leal (BB, %eax, 8), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $1, %eax + jle .L05 + ALIGN_4 + +.L02: +#define COPYPREFETCH 40 + + prefetchnta (COPYPREFETCH) * SIZE(B) + + movq 0 * SIZE(B), %mm0 + movq 1 * SIZE(B), %mm1 + movq 2 * SIZE(B), %mm2 + movq 3 * SIZE(B), %mm3 + movq 4 * SIZE(B), %mm4 + movq 5 * SIZE(B), %mm5 + movq 6 * SIZE(B), %mm6 + movq 7 * SIZE(B), %mm7 + + movq %mm0, 0 * SIZE(BB) + movq %mm0, 1 * SIZE(BB) + movq %mm1, 2 * SIZE(BB) + movq %mm1, 3 * SIZE(BB) + movq %mm2, 4 * SIZE(BB) + movq %mm2, 5 * SIZE(BB) + movq %mm3, 6 * SIZE(BB) + movq %mm3, 7 * SIZE(BB) + + movq %mm4, 8 * SIZE(BB) + movq %mm4, 9 * SIZE(BB) + movq %mm5, 10 * SIZE(BB) + movq %mm5, 11 * SIZE(BB) + movq %mm6, 12 * SIZE(BB) + movq %mm6, 13 * SIZE(BB) + movq %mm7, 14 * SIZE(BB) + movq %mm7, 15 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $16 * SIZE, BB + decl %eax + jne .L02 + ALIGN_2 + +.L05: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $1, %eax + BRANCH + jle .L10 + + movq 0 * SIZE(B), %mm0 + movq 1 * SIZE(B), %mm1 + movq 2 * SIZE(B), %mm2 + movq 3 * SIZE(B), %mm3 + + movq %mm0, 0 * SIZE(BB) + movq %mm0, 1 * SIZE(BB) + movq %mm1, 2 * SIZE(BB) + movq %mm1, 3 * SIZE(BB) + movq %mm2, 4 * SIZE(BB) + movq %mm2, 5 * SIZE(BB) + movq %mm3, 6 * SIZE(BB) + movq %mm3, 7 * SIZE(BB) + + addl $4 * SIZE, B + ALIGN_4 + +.L10: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + + leal (, LDC, 4), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $3 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movapd 0 * SIZE(AA), %xmm0 + movapd 8 * SIZE(AA), %xmm1 + movapd 0 * SIZE(BB), %xmm2 + movapd 8 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + +#ifdef LN + prefetchw -2 * SIZE(CO1) + prefetchw -2 * SIZE(CO1, LDC) + prefetchw -2 * SIZE(CO1, LDC, 2) + prefetchw -2 * SIZE(CO1, %eax) +#else + prefetchw 1 * SIZE(CO1) + prefetchw 1 * SIZE(CO1, LDC) + prefetchw 1 * SIZE(CO1, LDC, 2) + prefetchw 1 * SIZE(CO1, %eax) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + +#if 1 + andl $-8, %eax + sall $4, %eax + je .L15 +.L1X: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + cmpl $128 * 1, %eax + jle .L12 + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + cmpl $128 * 2, %eax + jle .L12 + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + cmpl $128 * 3, %eax + jle .L12 + KERNEL1(16 * 3) + KERNEL2(16 * 3) + KERNEL3(16 * 3) + KERNEL4(16 * 3) + KERNEL5(16 * 3) + KERNEL6(16 * 3) + KERNEL7(16 * 3) + KERNEL8(16 * 3) + cmpl $128 * 4, %eax + jle .L12 + KERNEL1(16 * 4) + KERNEL2(16 * 4) + 
KERNEL3(16 * 4) + KERNEL4(16 * 4) + KERNEL5(16 * 4) + KERNEL6(16 * 4) + KERNEL7(16 * 4) + KERNEL8(16 * 4) + cmpl $128 * 5, %eax + jle .L12 + KERNEL1(16 * 5) + KERNEL2(16 * 5) + KERNEL3(16 * 5) + KERNEL4(16 * 5) + KERNEL5(16 * 5) + KERNEL6(16 * 5) + KERNEL7(16 * 5) + KERNEL8(16 * 5) + cmpl $128 * 6, %eax + jle .L12 + KERNEL1(16 * 6) + KERNEL2(16 * 6) + KERNEL3(16 * 6) + KERNEL4(16 * 6) + KERNEL5(16 * 6) + KERNEL6(16 * 6) + KERNEL7(16 * 6) + KERNEL8(16 * 6) + cmpl $128 * 7, %eax + jle .L12 + KERNEL1(16 * 7) + KERNEL2(16 * 7) + KERNEL3(16 * 7) + KERNEL4(16 * 7) + KERNEL5(16 * 7) + KERNEL6(16 * 7) + KERNEL7(16 * 7) + KERNEL8(16 * 7) + + addl $128 * 4 * SIZE, BB + addl $128 * 1 * SIZE, AA + subl $128 * 8, %eax + jg .L1X + jmp .L15 + +.L12: + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB + ALIGN_4 +#else + + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + addl $64 * SIZE, BB + addl $16 * SIZE, AA + decl %eax + jne .L12 + ALIGN_4 +#endif + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movapd 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm5 + movapd 4 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 8 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 4), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd %xmm6, %xmm1 + unpcklpd %xmm7, %xmm6 + unpckhpd %xmm7, %xmm1 + + movapd 0 * SIZE(B), %xmm2 + movapd 2 * SIZE(B), %xmm5 + movapd 4 * SIZE(B), %xmm3 + movapd 6 * SIZE(B), %xmm7 + + subpd %xmm4, %xmm2 + subpd %xmm6, %xmm5 + subpd %xmm0, %xmm3 + subpd %xmm1, %xmm7 +#else + movapd 0 * SIZE(AA), %xmm0 + movapd 2 * SIZE(AA), %xmm1 + movapd 4 * SIZE(AA), %xmm2 + movapd 6 * SIZE(AA), %xmm3 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 + subpd %xmm6, %xmm2 + subpd %xmm7, %xmm3 +#endif + +#ifdef LN + movlpd 3 * SIZE(AA), %xmm4 + movhpd 3 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 + mulpd %xmm4, %xmm7 + + movlpd 2 * SIZE(AA), %xmm4 + movhpd 2 * SIZE(AA), %xmm4 + movapd %xmm4, %xmm6 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm2 + mulpd %xmm7, %xmm6 + subpd %xmm6, %xmm5 + + movlpd 0 * SIZE(AA), %xmm4 + movhpd 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm5 + +#endif + +#ifdef LT + movlpd 0 * SIZE(AA), %xmm4 + movhpd 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm5 + + movlpd 1 * SIZE(AA), %xmm4 + movhpd 1 * SIZE(AA), %xmm4 + movapd %xmm4, %xmm6 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm3 + mulpd %xmm5, %xmm6 + subpd %xmm6, %xmm7 + + movlpd 3 * SIZE(AA), %xmm4 + movhpd 3 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 + mulpd %xmm4, %xmm7 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + movlpd 1 * SIZE(B), %xmm4 + movhpd 1 * SIZE(B), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm1 + movlpd 2 * SIZE(B), %xmm4 + movhpd 2 * SIZE(B), %xmm4 + mulpd %xmm0, %xmm4 + subpd 
%xmm4, %xmm2 + movlpd 3 * SIZE(B), %xmm4 + movhpd 3 * SIZE(B), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm3 + + movlpd 5 * SIZE(B), %xmm4 + movhpd 5 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm1 + movlpd 6 * SIZE(B), %xmm4 + movhpd 6 * SIZE(B), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm2 + movlpd 7 * SIZE(B), %xmm4 + movhpd 7 * SIZE(B), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm3 + + movlpd 10 * SIZE(B), %xmm4 + movhpd 10 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm2 + movlpd 11 * SIZE(B), %xmm4 + movhpd 11 * SIZE(B), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm3 + + movlpd 15 * SIZE(B), %xmm4 + movhpd 15 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm3 +#endif + +#ifdef RT + movlpd 15 * SIZE(B), %xmm4 + movhpd 15 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm3 + movlpd 14 * SIZE(B), %xmm4 + movhpd 14 * SIZE(B), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm2 + movlpd 13 * SIZE(B), %xmm4 + movhpd 13 * SIZE(B), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm1 + movlpd 12 * SIZE(B), %xmm4 + movhpd 12 * SIZE(B), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm0 + + movlpd 10 * SIZE(B), %xmm4 + movhpd 10 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm2 + movlpd 9 * SIZE(B), %xmm4 + movhpd 9 * SIZE(B), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm1 + movlpd 8 * SIZE(B), %xmm4 + movhpd 8 * SIZE(B), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm0 + + movlpd 5 * SIZE(B), %xmm4 + movhpd 5 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm1 + movlpd 4 * SIZE(B), %xmm4 + movhpd 4 * SIZE(B), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm0 + + movlpd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(B) + movapd %xmm5, 2 * SIZE(B) + movapd %xmm3, 4 * SIZE(B) + movapd %xmm7, 6 * SIZE(B) + + movlpd %xmm2, 0 * SIZE(BB) + movlpd %xmm2, 1 * SIZE(BB) + movhpd %xmm2, 2 * SIZE(BB) + movhpd %xmm2, 3 * SIZE(BB) + movlpd %xmm5, 4 * SIZE(BB) + movlpd %xmm5, 5 * SIZE(BB) + movhpd %xmm5, 6 * SIZE(BB) + movhpd %xmm5, 7 * SIZE(BB) + movlpd %xmm3, 8 * SIZE(BB) + movlpd %xmm3, 9 * SIZE(BB) + movhpd %xmm3, 10 * SIZE(BB) + movhpd %xmm3, 11 * SIZE(BB) + movlpd %xmm7, 12 * SIZE(BB) + movlpd %xmm7, 13 * SIZE(BB) + movhpd %xmm7, 14 * SIZE(BB) + movhpd %xmm7, 15 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) + movapd %xmm1, 2 * SIZE(AA) + movapd %xmm2, 4 * SIZE(AA) + movapd %xmm3, 6 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movlpd %xmm2, 0 * SIZE(CO1) + movlpd %xmm3, 1 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm3, 1 * SIZE(CO1, LDC, 1) + movlpd %xmm5, 0 * SIZE(CO1, LDC, 2) + movlpd %xmm7, 1 * SIZE(CO1, LDC, 2) + movhpd %xmm5, 0 * SIZE(CO1, %eax, 1) + movhpd %xmm7, 1 * SIZE(CO1, %eax, 1) +#else + movlpd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movlpd %xmm1, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm1, 1 * SIZE(CO1, LDC, 1) + movlpd %xmm2, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm2, 1 * SIZE(CO1, LDC, 2) + movlpd %xmm3, 0 * SIZE(CO1, %eax, 1) + movhpd %xmm3, 1 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $8 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L11 + ALIGN_4 + +.L20: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L29 + +#ifdef LN 
+ movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $3 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movlpd 0 * SIZE(AA), %xmm0 + movlpd 4 * SIZE(AA), %xmm1 + movlpd 0 * SIZE(BB), %xmm2 + movlpd 8 * SIZE(BB), %xmm3 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm4 +#if defined(OPTERON) || defined(BARCELONA) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movlpd 2 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm5 + movlpd 4 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm6 + movlpd 16 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm7 + movlpd 1 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm4 + movlpd 10 * SIZE(BB), %xmm3 + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm5 + movlpd 12 * SIZE(BB), %xmm3 + mulsd %xmm0, %xmm3 + mulsd 14 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm6 + movlpd 24 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm7 + movlpd 2 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm4 + movlpd 18 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm5 + movlpd 20 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + mulsd 22 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm6 + movlpd 32 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm7 + movlpd 3 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm4 + movlpd 26 * SIZE(BB), %xmm3 + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm5 + movlpd 28 * SIZE(BB), %xmm3 + mulsd %xmm0, %xmm3 + mulsd 30 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm6 + movlpd 40 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm7 + movlpd 8 * SIZE(AA), %xmm0 +#if defined(OPTERON) || defined(BARCELONA) + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) +#endif + mulsd %xmm1, %xmm2 + addsd %xmm2, %xmm4 + movlpd 34 * SIZE(BB), %xmm2 + mulsd %xmm1, %xmm2 + addsd %xmm2, %xmm5 + movlpd 36 * SIZE(BB), %xmm2 + mulsd %xmm1, %xmm2 + mulsd 38 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm6 + movlpd 48 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm7 + movlpd 5 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm3 + addsd %xmm3, %xmm4 + movlpd 42 * SIZE(BB), %xmm3 + mulsd %xmm1, %xmm3 + addsd %xmm3, %xmm5 + movlpd 44 * SIZE(BB), %xmm3 + mulsd %xmm1, %xmm3 + mulsd 46 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm6 + movlpd 56 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm7 + movlpd 6 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm2 + addsd %xmm2, %xmm4 + movlpd 50 * SIZE(BB), %xmm2 + mulsd %xmm1, %xmm2 + addsd %xmm2, %xmm5 + movlpd 52 * SIZE(BB), %xmm2 + mulsd %xmm1, %xmm2 + mulsd 54 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm6 + movlpd 64 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm7 + movlpd 7 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm3 + addsd %xmm3, %xmm4 + movlpd 58 * SIZE(BB), %xmm3 + mulsd %xmm1, %xmm3 + addsd %xmm3, %xmm5 + movlpd 60 * SIZE(BB), %xmm3 + mulsd %xmm1, %xmm3 + mulsd 62 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm6 + movlpd 72 * SIZE(BB), %xmm3 + addl $64 * SIZE, BB + addsd %xmm1, %xmm7 + movlpd 12 * SIZE(AA), %xmm1 + addl $8 * SIZE, AA + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L28 + +.L26: + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm4 + movlpd 2 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm5 + movlpd 4 * SIZE(BB), 
%xmm2 + mulsd %xmm0, %xmm2 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm6 + movlpd 8 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm7 + movlpd 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (B, %eax, 4), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + unpcklpd %xmm5, %xmm4 + unpcklpd %xmm7, %xmm6 + + movapd 0 * SIZE(B), %xmm2 + movapd 2 * SIZE(B), %xmm5 + + subpd %xmm4, %xmm2 + subpd %xmm6, %xmm5 +#else + movlpd 0 * SIZE(AA), %xmm0 + movlpd 1 * SIZE(AA), %xmm1 + movlpd 2 * SIZE(AA), %xmm2 + movlpd 3 * SIZE(AA), %xmm3 + + subsd %xmm4, %xmm0 + subsd %xmm5, %xmm1 + subsd %xmm6, %xmm2 + subsd %xmm7, %xmm3 +#endif + +#ifdef LN + movlpd 0 * SIZE(AA), %xmm4 + movhpd 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm5 +#endif + +#ifdef LT + movlpd 0 * SIZE(AA), %xmm4 + movhpd 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm5 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm0 + movlpd 1 * SIZE(B), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + movlpd 2 * SIZE(B), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm2 + movlpd 3 * SIZE(B), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm3 + + movlpd 5 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm1 + movlpd 6 * SIZE(B), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm2 + movlpd 7 * SIZE(B), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm3 + + movlpd 10 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm2 + movlpd 11 * SIZE(B), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm3 + + movlpd 15 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm3 +#endif + +#ifdef RT + movlpd 15 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm3 + movlpd 14 * SIZE(B), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm2 + movlpd 13 * SIZE(B), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm1 + movlpd 12 * SIZE(B), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm0 + + movlpd 10 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm2 + movlpd 9 * SIZE(B), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm1 + movlpd 8 * SIZE(B), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm0 + + movlpd 5 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm1 + movlpd 4 * SIZE(B), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + movlpd 0 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(B) + movapd %xmm5, 2 * SIZE(B) + + movlpd %xmm2, 0 * SIZE(BB) + movlpd %xmm2, 1 * SIZE(BB) + movhpd %xmm2, 2 * SIZE(BB) + movhpd %xmm2, 3 * SIZE(BB) + movlpd %xmm5, 4 * SIZE(BB) + movlpd %xmm5, 5 * SIZE(BB) + movhpd %xmm5, 6 * SIZE(BB) + movhpd %xmm5, 7 * SIZE(BB) +#else + movlpd %xmm0, 0 * SIZE(AA) + movlpd %xmm1, 1 * SIZE(AA) + movlpd %xmm2, 2 * SIZE(AA) + movlpd %xmm3, 3 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movlpd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) + movlpd %xmm5, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm5, 0 * SIZE(CO1, %eax, 1) +#else + movlpd %xmm0, 0 * SIZE(CO1) + movlpd %xmm1, 0 * SIZE(CO1, LDC, 1) + movlpd %xmm2, 0 * SIZE(CO1, LDC, 2) + movlpd %xmm3, 0 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (AA,%eax, SIZE), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + 
addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L29: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 4), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (B, %eax, 4), B +#endif + +#ifdef RN + addl $4, KK +#endif + +#ifdef RT + subl $4, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_4 + +.L30: + testl $2, N + je .L60 + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, BB + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + jle .L35 + ALIGN_4 + +.L32: +#define COPYPREFETCH 40 + + prefetchnta (COPYPREFETCH) * SIZE(B) + + movq 0 * SIZE(B), %mm0 + movq 1 * SIZE(B), %mm1 + movq 2 * SIZE(B), %mm2 + movq 3 * SIZE(B), %mm3 + movq 4 * SIZE(B), %mm4 + movq 5 * SIZE(B), %mm5 + movq 6 * SIZE(B), %mm6 + movq 7 * SIZE(B), %mm7 + + movq %mm0, 0 * SIZE(BB) + movq %mm0, 1 * SIZE(BB) + movq %mm1, 2 * SIZE(BB) + movq %mm1, 3 * SIZE(BB) + movq %mm2, 4 * SIZE(BB) + movq %mm2, 5 * SIZE(BB) + movq %mm3, 6 * SIZE(BB) + movq %mm3, 7 * SIZE(BB) + + movq %mm4, 8 * SIZE(BB) + movq %mm4, 9 * SIZE(BB) + movq %mm5, 10 * SIZE(BB) + movq %mm5, 11 * SIZE(BB) + movq %mm6, 12 * SIZE(BB) + movq %mm6, 13 * SIZE(BB) + movq %mm7, 14 * SIZE(BB) + movq %mm7, 15 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $16 * SIZE, BB + decl %eax + jne .L32 + ALIGN_2 + +.L35: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax + BRANCH + jle .L40 + ALIGN_2 + +.L36: + movq 0 * SIZE(B), %mm0 + movq 1 * SIZE(B), %mm1 + + movq %mm0, 0 * SIZE(BB) + movq %mm0, 1 * SIZE(BB) + movq %mm1, 2 * SIZE(BB) + movq %mm1, 3 * SIZE(BB) + + addl $2 * SIZE, B + addl $4 * SIZE, BB + decl %eax + jne .L36 + ALIGN_4 + +.L40: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L50 + ALIGN_4 + +.L41: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movapd 0 * SIZE(AA), %xmm0 + movapd 8 * SIZE(AA), %xmm1 + movapd 0 * SIZE(BB), %xmm2 + movapd 8 * SIZE(BB), %xmm3 + +#ifdef LN + prefetchw -2 * SIZE(CO1) + prefetchw -2 * SIZE(CO1, LDC) +#else + prefetchw 1 * SIZE(CO1) + prefetchw 1 * SIZE(CO1, LDC) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L45 + ALIGN_4 + +.L42: + mulpd %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + + mulpd %xmm0, %xmm2 + mulpd 6 
* SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movapd 4 * SIZE(AA), %xmm0 + + mulpd %xmm0, %xmm3 + mulpd 10 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd 12 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movapd 6 * SIZE(AA), %xmm0 + + mulpd %xmm0, %xmm3 + mulpd 14 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm7 + movapd 16 * SIZE(AA), %xmm0 + +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) +#endif + mulpd %xmm1, %xmm2 + mulpd 18 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + movapd 20 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movapd 10 * SIZE(AA), %xmm1 + + mulpd %xmm1, %xmm2 + mulpd 22 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + movapd 32 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm7 + movapd 12 * SIZE(AA), %xmm1 + + mulpd %xmm1, %xmm3 + mulpd 26 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm4 + movapd 28 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movapd 14 * SIZE(AA), %xmm1 + + mulpd %xmm1, %xmm3 + mulpd 30 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm6 + movapd 40 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm7 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L42 + ALIGN_4 + +.L45: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L46 + ALIGN_4 + +.L48: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd 0 * SIZE(B), %xmm2 + movapd 2 * SIZE(B), %xmm3 + + subpd %xmm4, %xmm2 + subpd %xmm0, %xmm3 +#else + movapd 0 * SIZE(AA), %xmm0 + movapd 2 * SIZE(AA), %xmm1 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 +#endif + +#ifdef LN + movlpd 3 * SIZE(AA), %xmm4 + movhpd 3 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 + + movlpd 2 * SIZE(AA), %xmm4 + movhpd 2 * SIZE(AA), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm2 + + movlpd 0 * SIZE(AA), %xmm4 + movhpd 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + +#endif + +#ifdef LT + movlpd 0 * SIZE(AA), %xmm4 + movhpd 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + + movlpd 1 * SIZE(AA), %xmm4 + movhpd 1 * SIZE(AA), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm3 + + movlpd 3 * SIZE(AA), %xmm4 + movhpd 3 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + movlpd 1 * SIZE(B), %xmm4 + movhpd 1 * SIZE(B), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm1 + + movlpd 3 * SIZE(B), %xmm4 + movhpd 3 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm1 +#endif + +#ifdef RT + movlpd 3 * SIZE(B), %xmm4 + movhpd 3 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm1 + movlpd 2 * SIZE(B), %xmm4 + movhpd 2 * SIZE(B), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm0 + + movlpd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(B) + movapd %xmm3, 2 * SIZE(B) + + movlpd %xmm2, 0 * SIZE(BB) + movlpd %xmm2, 1 * SIZE(BB) + movhpd %xmm2, 2 * SIZE(BB) 
+ movhpd %xmm2, 3 * SIZE(BB) + movlpd %xmm3, 4 * SIZE(BB) + movlpd %xmm3, 5 * SIZE(BB) + movhpd %xmm3, 6 * SIZE(BB) + movhpd %xmm3, 7 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) + movapd %xmm1, 2 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm2, 0 * SIZE(CO1) + movlpd %xmm3, 1 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm3, 1 * SIZE(CO1, LDC, 1) +#else + movlpd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movlpd %xmm1, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm1, 1 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L41 + ALIGN_4 + +.L50: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L59 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movlpd 0 * SIZE(AA), %xmm0 + movlpd 4 * SIZE(AA), %xmm1 + movlpd 0 * SIZE(BB), %xmm2 + movlpd 8 * SIZE(BB), %xmm3 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + mulsd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movlpd 4 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movlpd 1 * SIZE(AA), %xmm0 + + mulsd %xmm0, %xmm2 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm6 + movlpd 16 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm7 + movlpd 2 * SIZE(AA), %xmm0 + + mulsd %xmm0, %xmm3 + mulsd 10 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm4 + movlpd 12 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm5 + movlpd 3 * SIZE(AA), %xmm0 + + mulsd %xmm0, %xmm3 + mulsd 14 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm6 + movlpd 24 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm7 + movlpd 8 * SIZE(AA), %xmm0 + + mulsd %xmm1, %xmm2 + mulsd 18 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm4 + movlpd 20 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + movlpd 5 * SIZE(AA), %xmm1 + + mulsd %xmm1, %xmm2 + mulsd 22 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm6 + movlpd 32 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm7 + movlpd 6 * SIZE(AA), %xmm1 + + mulsd %xmm1, %xmm3 + mulsd 26 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm4 + movlpd 28 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm5 + movlpd 7 * SIZE(AA), %xmm1 + + mulsd %xmm1, %xmm3 + mulsd 30 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm6 + movlpd 40 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm7 + movlpd 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L58 + +.L56: + mulsd %xmm0, %xmm2 + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movlpd 4 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movlpd 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: + addsd %xmm6, %xmm4 + addsd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, 
%eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + unpcklpd %xmm5, %xmm4 + + movapd 0 * SIZE(B), %xmm2 + + subpd %xmm4, %xmm2 +#else + movlpd 0 * SIZE(AA), %xmm0 + movlpd 1 * SIZE(AA), %xmm1 + + subsd %xmm4, %xmm0 + subsd %xmm5, %xmm1 +#endif + +#ifdef LN + movlpd 0 * SIZE(AA), %xmm4 + movhpd 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 +#endif + +#ifdef LT + movlpd 0 * SIZE(AA), %xmm4 + movhpd 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm0 + movlpd 1 * SIZE(B), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + + movlpd 3 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm1 +#endif + +#ifdef RT + movlpd 3 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm1 + movlpd 2 * SIZE(B), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + movlpd 0 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(B) + + movlpd %xmm2, 0 * SIZE(BB) + movlpd %xmm2, 1 * SIZE(BB) + movhpd %xmm2, 2 * SIZE(BB) + movhpd %xmm2, 3 * SIZE(BB) +#else + movlpd %xmm0, 0 * SIZE(AA) + movlpd %xmm1, 1 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) +#else + movlpd %xmm0, 0 * SIZE(CO1) + movlpd %xmm1, 0 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (AA,%eax, SIZE), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L59: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + ALIGN_4 + +.L60: + testl $1, N + je .L999 + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, BB + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + leal (, %eax, SIZE), %eax + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + jle .L65 + ALIGN_4 + +.L62: +#define COPYPREFETCH 40 + + prefetchnta (COPYPREFETCH) * SIZE(B) + + movq 0 * SIZE(B), %mm0 + movq 1 * SIZE(B), %mm1 + movq 2 * SIZE(B), %mm2 + movq 3 * SIZE(B), %mm3 + movq 4 * SIZE(B), %mm4 + movq 5 * SIZE(B), %mm5 + movq 6 * SIZE(B), %mm6 + movq 7 * SIZE(B), %mm7 + + movq %mm0, 0 * SIZE(BB) + movq %mm0, 1 * SIZE(BB) + movq %mm1, 2 * SIZE(BB) + movq %mm1, 3 * SIZE(BB) + movq %mm2, 4 * SIZE(BB) + movq %mm2, 5 * SIZE(BB) + movq %mm3, 6 * SIZE(BB) + movq %mm3, 7 * SIZE(BB) + + movq %mm4, 8 * SIZE(BB) + movq %mm4, 9 * SIZE(BB) + movq %mm5, 10 * SIZE(BB) + movq %mm5, 11 * SIZE(BB) + movq %mm6, 12 * SIZE(BB) + movq %mm6, 13 * SIZE(BB) + movq %mm7, 14 * SIZE(BB) + movq %mm7, 15 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $16 * SIZE, BB + decl %eax + jne .L62 + ALIGN_2 + +.L65: +#if defined(LT) || defined(RN) + movl 
KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + BRANCH + jle .L70 + ALIGN_2 + +.L66: + movq 0 * SIZE(B), %mm0 + + movq %mm0, 0 * SIZE(BB) + movq %mm0, 1 * SIZE(BB) + + addl $1 * SIZE, B + addl $2 * SIZE, BB + decl %eax + jne .L66 + ALIGN_4 + +.L70: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L80 + ALIGN_4 + +.L71: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movapd 0 * SIZE(AA), %xmm0 + movapd 8 * SIZE(AA), %xmm1 + movapd 0 * SIZE(BB), %xmm2 + movapd 8 * SIZE(BB), %xmm3 + +#ifdef LN + prefetchw -2 * SIZE(CO1) +#else + prefetchw 1 * SIZE(CO1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) + movapd 16 * SIZE(BB), %xmm2 + + movapd 2 * SIZE(AA), %xmm0 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm4 + movapd 4 * SIZE(AA), %xmm0 + mulpd 4 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm4 + movapd 6 * SIZE(AA), %xmm0 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm4 + + movapd 16 * SIZE(AA), %xmm0 + prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movapd 24 * SIZE(BB), %xmm3 + + movapd 10 * SIZE(AA), %xmm1 + mulpd 10 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm4 + movapd 12 * SIZE(AA), %xmm1 + mulpd 12 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm4 + movapd 14 * SIZE(AA), %xmm1 + mulpd 14 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm4 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L78 + ALIGN_3 + +.L76: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movapd 2 * SIZE(AA), %xmm0 + movapd 2 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm2 + + subpd %xmm4, %xmm2 +#else + movapd 0 * SIZE(AA), %xmm0 + + subpd %xmm4, %xmm0 +#endif + +#ifdef LN + movapd %xmm2, %xmm3 + unpckhpd %xmm3, %xmm3 + + movlpd 3 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm3 + + movlpd 2 * SIZE(AA), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm2 + + movlpd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm3, %xmm2 +#endif + +#ifdef LT + movapd %xmm2, %xmm3 + unpckhpd %xmm3, %xmm3 + + movlpd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + + movlpd 1 * SIZE(AA), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm3 + + movlpd 3 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm3 + + unpcklpd %xmm3, %xmm2 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 +#endif + 
+#ifdef RT + movlpd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(B) + + movlpd %xmm2, 0 * SIZE(BB) + movlpd %xmm2, 1 * SIZE(BB) + movhpd %xmm2, 2 * SIZE(BB) + movhpd %xmm2, 3 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 1 * SIZE(CO1) +#else + movlpd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L71 + ALIGN_4 + +.L80: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L99 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movlpd 0 * SIZE(AA), %xmm0 + movlpd 4 * SIZE(AA), %xmm1 + movlpd 0 * SIZE(BB), %xmm2 + movlpd 8 * SIZE(BB), %xmm3 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L85 + ALIGN_4 + +.L82: + mulsd %xmm0, %xmm2 + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) + movlpd 1 * SIZE(AA), %xmm0 + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movlpd 16 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movlpd 2 * SIZE(AA), %xmm0 + mulsd 4 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm6 + movlpd 3 * SIZE(AA), %xmm0 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm7 + movlpd 8 * SIZE(AA), %xmm0 + mulsd %xmm1, %xmm3 + movlpd 5 * SIZE(AA), %xmm1 + mulsd 10 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm4 + movlpd 24 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm5 + movlpd 6 * SIZE(AA), %xmm1 + mulsd 12 * SIZE(BB), %xmm1 + addsd %xmm1, %xmm6 + movlpd 7 * SIZE(AA), %xmm1 + mulsd 14 * SIZE(BB), %xmm1 + addsd %xmm1, %xmm7 + movlpd 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L82 + ALIGN_4 + +.L85: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L88 + +.L86: + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm4 + movlpd 2 * SIZE(BB), %xmm2 + movlpd 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L86 + ALIGN_4 + +.L88: + addsd %xmm5, %xmm4 + addsd %xmm7, %xmm6 + addsd %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + addl %eax, AA + addl %eax, B + leal (BB, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movlpd 0 * SIZE(B), %xmm2 + subsd %xmm4, %xmm2 +#else + movlpd 0 * SIZE(AA), %xmm0 + subsd %xmm4, %xmm0 +#endif + +#ifdef LN + movlpd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 +#endif + +#ifdef LT + movlpd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#ifdef RT + movlpd 0 * SIZE(B), %xmm4 + mulsd 
%xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm2, 0 * SIZE(B) + + movlpd %xmm2, 0 * SIZE(BB) + movlpd %xmm2, 1 * SIZE(BB) +#else + movlpd %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm2, 0 * SIZE(CO1) +#else + movlpd %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (AA,%eax, SIZE), AA +#ifdef LT + addl $1 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L99: +#ifdef LN + movl K, %eax + leal (B, %eax, SIZE), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (B,%eax, SIZE), B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L999: + movl OLD_STACK, %esp + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_LT_2x4_sse3.S b/kernel/x86/trsm_kernel_LT_2x4_sse3.S new file mode 100644 index 0000000000..487f059221 --- /dev/null +++ b/kernel/x86/trsm_kernel_LT_2x4_sse3.S @@ -0,0 +1,2030 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#define A 24 + STACK + ARGS(%esp) +#define ARG_B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define ARG_LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) +#define AORIG 12 + STACK(%esp) + +#ifdef PENTIUM4 +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#ifdef PENTIUMM +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi +#define CO1 %esi + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + + movl OFFSET, %eax +#ifdef RN + negl %eax +#endif + movl %eax, KK + + leal (, LDC, SIZE), LDC + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + sarl $2, %eax + movl %eax, J + jle .L30 + ALIGN_2 + +.L10: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 4), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movddup 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movddup 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + + leal (LDC, LDC, 2), %eax + +#ifdef LN + prefetchnta -2 * SIZE(CO1) + prefetchnta -2 * SIZE(CO1, LDC, 1) + prefetchnta -2 * SIZE(CO1, LDC, 2) + prefetchnta -2 * SIZE(CO1, %eax, 1) +#else + prefetchnta 2 * SIZE(CO1) + prefetchnta 2 * SIZE(CO1, LDC, 1) + prefetchnta 2 * SIZE(CO1, LDC, 2) + prefetchnta 2 * SIZE(CO1, %eax, 1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + mulpd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 3 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movddup 4 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 5 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + 
addpd %xmm2, %xmm5 + movddup 6 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 7 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movddup 16 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm4 + movddup 9 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm5 + movddup 10 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm6 + movddup 11 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + movapd 6 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm7 + movddup 12 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm4 + movddup 13 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm5 + movddup 14 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm6 + movddup 15 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + movapd 16 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm7 + movddup 24 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm4 + movddup 17 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm5 + movddup 18 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm6 + movddup 19 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + movapd 10 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm7 + movddup 20 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm4 + movddup 21 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm5 + movddup 22 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm6 + movddup 23 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + movapd 12 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm7 + movddup 32 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movddup 25 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm5 + movddup 26 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm6 + movddup 27 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 14 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm7 + movddup 28 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movddup 29 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm5 + movddup 30 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm6 + movddup 31 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 24 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm7 + movddup 40 * SIZE(BB), %xmm3 + + addl $32 * SIZE, BB + addl $16 * SIZE, AA + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 3 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movddup 4 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd %xmm6, %xmm1 + unpcklpd %xmm7, %xmm6 + unpckhpd %xmm7, %xmm1 + + movapd 0 * SIZE(BB), %xmm2 + movapd 2 * SIZE(BB), %xmm5 + movapd 4 * SIZE(BB), %xmm3 + movapd 6 * SIZE(BB), %xmm7 + + subpd %xmm4, %xmm2 + subpd %xmm6, %xmm5 + subpd %xmm0, %xmm3 + subpd %xmm1, %xmm7 +#else + movapd 0 * SIZE(AA), %xmm0 + movapd 2 * SIZE(AA), %xmm1 + movapd 4 * SIZE(AA), %xmm2 + movapd 6 * SIZE(AA), %xmm3 + + subpd %xmm4, %xmm0 + subpd 
%xmm5, %xmm1 + subpd %xmm6, %xmm2 + subpd %xmm7, %xmm3 +#endif + +#ifdef LN + movddup 3 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 + mulpd %xmm4, %xmm7 + + movddup 2 * SIZE(AA), %xmm4 + movapd %xmm4, %xmm6 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm2 + mulpd %xmm7, %xmm6 + subpd %xmm6, %xmm5 + + movddup 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm5 + +#endif + +#ifdef LT + movddup 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm5 + + movddup 1 * SIZE(AA), %xmm4 + movapd %xmm4, %xmm6 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm3 + mulpd %xmm5, %xmm6 + subpd %xmm6, %xmm7 + + movddup 3 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 + mulpd %xmm4, %xmm7 +#endif + +#ifdef RN + movddup 0 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 + movddup 1 * SIZE(BB), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm1 + movddup 2 * SIZE(BB), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm2 + movddup 3 * SIZE(BB), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm3 + + movddup 5 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm1 + movddup 6 * SIZE(BB), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm2 + movddup 7 * SIZE(BB), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm3 + + movddup 10 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm2 + movddup 11 * SIZE(BB), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm3 + + movddup 15 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm3 +#endif + +#ifdef RT + movddup 15 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm3 + movddup 14 * SIZE(BB), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm2 + movddup 13 * SIZE(BB), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm1 + movddup 12 * SIZE(BB), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm0 + + movddup 10 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm2 + movddup 9 * SIZE(BB), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm1 + movddup 8 * SIZE(BB), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm0 + + movddup 5 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm1 + movddup 4 * SIZE(BB), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm0 + + movddup 0 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(BB) + movapd %xmm5, 2 * SIZE(BB) + movapd %xmm3, 4 * SIZE(BB) + movapd %xmm7, 6 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) + movapd %xmm1, 2 * SIZE(AA) + movapd %xmm2, 4 * SIZE(AA) + movapd %xmm3, 6 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movsd %xmm3, 1 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm3, 1 * SIZE(CO1, LDC, 1) + movsd %xmm5, 0 * SIZE(CO1, LDC, 2) + movsd %xmm7, 1 * SIZE(CO1, LDC, 2) + movhpd %xmm5, 0 * SIZE(CO1, %eax, 1) + movhpd %xmm7, 1 * SIZE(CO1, %eax, 1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm1, 1 * SIZE(CO1, LDC, 1) + movsd %xmm2, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm2, 1 * SIZE(CO1, LDC, 2) + movsd %xmm3, 0 * SIZE(CO1, %eax, 1) + movhpd %xmm3, 1 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L11 + ALIGN_4 + +.L20: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L29 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + 
movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movddup 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movddup 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $4, %eax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movddup 1 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movddup 2 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 10 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd 12 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movddup 3 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 14 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm7 + movddup 4 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 18 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 20 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movddup 5 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 22 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 32 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movddup 6 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 26 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd 28 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movddup 7 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 30 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 40 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm7 + movddup 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd 34 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + movapd 36 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movddup 9 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm2 + mulpd 38 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + movapd 48 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm7 + movddup 10 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 42 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm4 + movapd 44 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movddup 11 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 46 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm6 + movapd 56 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm7 + movddup 12 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm2 + mulpd 50 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + movapd 52 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movddup 13 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm2 + mulpd 54 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + movapd 64 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm7 + movddup 14 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 58 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm4 + movapd 60 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movddup 15 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 62 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm6 + movapd 72 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm7 + movddup 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $15, %eax # if (k & 1) + BRANCH + je .L28 + +.L26: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movddup 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + + decl %eax + jg .L26 + ALIGN_4 + +.L28: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + +#if defined(LN) || 
defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BB), %xmm0 + movapd 2 * SIZE(BB), %xmm1 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 +#else + movapd 0 * SIZE(AA), %xmm1 + movapd 2 * SIZE(AA), %xmm3 + + subpd %xmm4, %xmm1 + subpd %xmm5, %xmm3 + + movapd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm1 + movapd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm3 +#endif + +#ifdef LN + movddup 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 +#endif + +#ifdef LT + movddup 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 +#endif + +#ifdef RN + movsd 0 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 + movsd 1 * SIZE(BB), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + movsd 2 * SIZE(BB), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm2 + movsd 3 * SIZE(BB), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm3 + + movsd 5 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 + movsd 6 * SIZE(BB), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm2 + movsd 7 * SIZE(BB), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm3 + + movsd 10 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm2 + movsd 11 * SIZE(BB), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm3 + + movsd 15 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm3 +#endif + +#ifdef RT + movsd 15 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm3 + movsd 14 * SIZE(BB), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm2 + movsd 13 * SIZE(BB), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm1 + movsd 12 * SIZE(BB), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm0 + + movsd 10 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm2 + movsd 9 * SIZE(BB), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm1 + movsd 8 * SIZE(BB), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm0 + + movsd 5 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 + movsd 4 * SIZE(BB), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + movsd 0 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, 0 * SIZE(BB) + movapd %xmm1, 2 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) + movsd %xmm1, 1 * SIZE(AA) + movsd %xmm2, 2 * SIZE(AA) + movsd %xmm3, 3 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 0 * SIZE(CO1, LDC, 1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm1, 0 * SIZE(CO1, %eax, 1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) + movsd %xmm2, 0 * SIZE(CO1, LDC, 2) + movsd %xmm3, 0 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L29: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 4), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $4, KK +#endif + +#ifdef RT + subl $4, KK +#endif + + decl J # j -- + jg .L10 + ALIGN_4 + +.L30: + testl $2, N + je .L60 + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + 
addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L50 + ALIGN_4 + +.L41: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movddup 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movddup 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifdef LN + prefetchnta -2 * SIZE(CO1) + prefetchnta -2 * SIZE(CO1, LDC, 1) +#else + prefetchnta 2 * SIZE(CO1) + prefetchnta 2 * SIZE(CO1, LDC, 1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L45 + ALIGN_4 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 3 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movddup 4 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 5 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 6 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + movddup 6 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 7 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 16 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movddup 16 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movddup 9 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 10 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm5 + movddup 10 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm6 + movddup 11 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 12 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm7 + movddup 12 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movddup 13 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 14 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm5 + movddup 14 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm6 + movddup 15 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 24 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm7 + movddup 24 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L42 + ALIGN_4 + +.L45: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L46 + ALIGN_4 + +.L48: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd 0 * SIZE(BB), %xmm2 + movapd 2 * SIZE(BB), %xmm3 + + subpd %xmm4, %xmm2 + subpd %xmm0, %xmm3 +#else + movapd 0 * SIZE(AA), %xmm0 + movapd 2 * SIZE(AA), %xmm1 + + subpd %xmm4, %xmm0 + 
subpd %xmm5, %xmm1 +#endif + +#ifdef LN + movddup 3 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 + + movddup 2 * SIZE(AA), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm2 + + movddup 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + +#endif + +#ifdef LT + movddup 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + + movddup 1 * SIZE(AA), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm3 + + movddup 3 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 +#endif + +#ifdef RN + movddup 0 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 + + movddup 1 * SIZE(BB), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm1 + + movddup 3 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm1 +#endif + +#ifdef RT + movddup 3 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm1 + + movddup 2 * SIZE(BB), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm0 + + movddup 0 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(BB) + movapd %xmm3, 2 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) + movapd %xmm1, 2 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movsd %xmm3, 1 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm3, 1 * SIZE(CO1, LDC, 1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm1, 1 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L41 + ALIGN_4 + +.L50: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L59 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movddup 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movddup 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $4, %eax + je .L55 + ALIGN_4 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm0, %xmm2 + movddup 1 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + mulpd 2 * SIZE(BB), %xmm0 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movddup 2 * SIZE(AA), %xmm0 + mulpd 4 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm6 + movddup 3 * SIZE(AA), %xmm0 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm7 + movddup 4 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + movddup 5 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm4 + mulpd 10 * SIZE(BB), %xmm0 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movddup 6 * SIZE(AA), %xmm0 + mulpd 12 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm6 + movddup 7 * SIZE(AA), %xmm0 + mulpd 14 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm7 + movddup 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + movddup 9 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm4 + mulpd 18 * SIZE(BB), %xmm1 + movapd 32 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movddup 10 * SIZE(AA), %xmm1 + mulpd 20 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm6 + movddup 11 * SIZE(AA), %xmm1 + mulpd 22 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm7 + movddup 12 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + movddup 13 * 
SIZE(AA), %xmm1 + addpd %xmm3, %xmm4 + mulpd 26 * SIZE(BB), %xmm1 + movapd 40 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movddup 14 * SIZE(AA), %xmm1 + mulpd 28 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm6 + movddup 15 * SIZE(AA), %xmm1 + mulpd 30 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm7 + movddup 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $15, %eax # if (k & 1) + BRANCH + je .L58 + +.L56: + mulpd %xmm0, %xmm2 + movddup 1 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + movapd 2 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + addpd %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (B, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BB), %xmm0 + + subpd %xmm4, %xmm0 +#else + movapd 0 * SIZE(AA), %xmm1 + + subpd %xmm4, %xmm1 + + movapd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm1 +#endif + +#ifdef LN + movddup 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#ifdef LT + movddup 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#ifdef RN + movsd 0 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 1 * SIZE(BB), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + + movsd 3 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 +#endif + +#ifdef RT + movsd 3 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 + + movsd 2 * SIZE(BB), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + movsd 0 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, 0 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) + movsd %xmm1, 1 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 0 * SIZE(CO1, LDC, 1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L59: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + ALIGN_4 + +.L60: + testl $1, N + je .L999 + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, B +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L80 + ALIGN_4 + +.L71: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + addl %eax, BB +#endif + + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + 
movapd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movddup 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movddup 4 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifdef LN + prefetchnta -2 * SIZE(CO1) +#else + prefetchnta 2 * SIZE(CO1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm2, %xmm0 + movddup 1 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm4 + movapd 16 * SIZE(AA), %xmm0 + mulpd 2 * SIZE(AA), %xmm2 + addpd %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + mulpd 4 * SIZE(AA), %xmm2 + addpd %xmm2, %xmm6 + movddup 3 * SIZE(BB), %xmm2 + mulpd 6 * SIZE(AA), %xmm2 + addpd %xmm2, %xmm7 + movddup 8 * SIZE(BB), %xmm2 + mulpd %xmm3, %xmm1 + movddup 5 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm4 + movapd 24 * SIZE(AA), %xmm1 + mulpd 10 * SIZE(AA), %xmm3 + addpd %xmm3, %xmm5 + movddup 6 * SIZE(BB), %xmm3 + mulpd 12 * SIZE(AA), %xmm3 + addpd %xmm3, %xmm6 + movddup 7 * SIZE(BB), %xmm3 + mulpd 14 * SIZE(AA), %xmm3 + addpd %xmm3, %xmm7 + movddup 12 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $ 8 * SIZE, BB + decl %eax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L78 + ALIGN_3 + +.L76: + mulpd %xmm2, %xmm0 + movddup 1 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm4 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + addpd %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), BB +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BB), %xmm1 + + subpd %xmm4, %xmm1 + + movapd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm1 +#else + movapd 0 * SIZE(AA), %xmm0 + + subpd %xmm4, %xmm0 +#endif + +#ifdef LN + movsd 3 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm1 + + movsd 2 * SIZE(AA), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + +#endif + +#ifdef LT + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 1 * SIZE(AA), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + + movsd 3 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm1 +#endif + +#ifdef RN + movddup 0 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#ifdef RT + movddup 0 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BB) + movsd %xmm1, 1 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 1 * SIZE(CO1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + addl %eax, BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L71 + ALIGN_4 + +.L80: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L89 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || 
defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd 0 * SIZE(AA), %xmm0 + movhpd 1 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movsd 8 * SIZE(AA), %xmm1 + movhpd 9 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movsd 0 * SIZE(BB), %xmm2 + movhpd 1 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movsd 8 * SIZE(BB), %xmm3 + movhpd 9 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $4, %eax + je .L85 + ALIGN_4 + +.L82: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + mulpd 2 * SIZE(BB), %xmm0 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 4 * SIZE(AA), %xmm0 + mulpd 4 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm6 + movapd 6 * SIZE(AA), %xmm0 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm7 + movapd 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm3 + movapd 10 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm4 + mulpd 10 * SIZE(BB), %xmm1 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movapd 12 * SIZE(AA), %xmm1 + mulpd 12 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm6 + movapd 14 * SIZE(AA), %xmm1 + mulpd 14 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm7 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L82 + ALIGN_4 + +.L85: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $15, %eax # if (k & 1) + BRANCH + je .L88 + +.L86: + mulsd %xmm0, %xmm2 + movsd 1 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + movsd 1 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L86 + ALIGN_4 + +.L88: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + addpd %xmm6, %xmm4 + + haddpd %xmm4, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (B, %eax, 1), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BB), %xmm0 + subsd %xmm4, %xmm0 +#else + movsd 0 * SIZE(AA), %xmm0 + subsd %xmm4, %xmm0 +#endif + +#ifdef LN + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#ifdef LT + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#ifdef RN + movsd 0 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#ifdef RT + movsd 0 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) +#else + movsd %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + addl %eax, AA + addl %eax, BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L89: +#ifdef LN + movl K, %eax + leal (B, %eax, SIZE), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_LT_4x2_core2.S b/kernel/x86/trsm_kernel_LT_4x2_core2.S new file mode 100644 index 0000000000..dba627f00b --- /dev/null +++ b/kernel/x86/trsm_kernel_LT_4x2_core2.S @@ -0,0 +1,2100 @@ 
+/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if !defined(HAVE_SSE2) || !defined(HAVE_MMX) +#error You have to check your configuration. 
+#endif + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_ALPHA 16 + STACK + ARGS(%esi) +#define STACK_A 24 + STACK + ARGS(%esi) +#define STACK_B 28 + STACK + ARGS(%esi) +#define STACK_C 32 + STACK + ARGS(%esi) +#define STACK_LDC 36 + STACK + ARGS(%esi) +#define STACK_OFFT 40 + STACK + ARGS(%esi) + +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define AORIG 56(%esp) +#define BORIG 60(%esp) +#define BUFFER 128(%esp) + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#define B %edi +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define CO1 %esi + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movd STACK_M, %mm0 + movl STACK_N, %eax + movd STACK_K, %mm1 + movd STACK_A, %mm2 + movl STACK_B, B + movd STACK_C, %mm3 + movl STACK_LDC, LDC + movd STACK_OFFT, %mm4 + + movd %mm1, K + movl %eax, N + movd %mm0, M + movd %mm2, A + movd %mm3, C + movl %esi, OLD_STACK + movd %mm4, OFFSET + movd %mm4, KK + + subl $-16 * SIZE, A + subl $-16 * SIZE, B + + sall $BASE_SHIFT, LDC + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + sarl $1, %eax + movl %eax, J + jle .L100 + ALIGN_2 + +.L01: +/* Copying to Sub Buffer */ +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal 16 * SIZE + BUFFER, BB + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + jle .L03 + ALIGN_2 + +.L02: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + movddup -14 * SIZE(B), %xmm2 + movddup -13 * SIZE(B), %xmm3 + movddup -12 * SIZE(B), %xmm4 + movddup -11 * SIZE(B), %xmm5 + movddup -10 * SIZE(B), %xmm6 + movddup -9 * SIZE(B), %xmm7 + + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm1, -14 * SIZE(BB) + movapd %xmm2, -12 * SIZE(BB) + movapd %xmm3, -10 * SIZE(BB) + movapd %xmm4, -8 * SIZE(BB) + movapd %xmm5, -6 * SIZE(BB) + movapd %xmm6, -4 * SIZE(BB) + movapd %xmm7, -2 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $16 * SIZE, %ecx + decl %eax + jne .L02 + ALIGN_2 + +.L03: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax + BRANCH + jle .L05 + ALIGN_4 + +.L04: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm1, -14 * SIZE(BB) + + addl $2 * SIZE, B + addl $4 * SIZE, %ecx + decl %eax + jne .L04 + ALIGN_4 + +.L05: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + + leal (, LDC, 2), %eax + 
+#ifdef RT + subl %eax, C +#endif + movl C, CO1 # coffset = c +#ifndef RT + addl %eax, C +#endif + + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L30 + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + leal 16 * SIZE + BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movapd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -8 * SIZE(AA), %xmm3 + pxor %xmm6, %xmm6 +#ifdef LN + prefetcht2 -3 * SIZE(CO1) + pxor %xmm7, %xmm7 + prefetcht2 -3 * SIZE(CO1, LDC) +#else + prefetcht2 3 * SIZE(CO1) + pxor %xmm7, %xmm7 + prefetcht2 3 * SIZE(CO1, LDC) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movapd -14 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + addpd %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm1 + movapd -12 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm6 + addpd %xmm1, %xmm7 + + movapd -12 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movapd -10 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + addpd %xmm0, %xmm5 + movapd -10 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm1 + movapd 0 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm6 + addpd %xmm1, %xmm7 + + movapd -8 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm3, %xmm1 + addpd %xmm1, %xmm4 + movapd -6 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm5 + movapd -6 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm2 + mulpd %xmm3, %xmm1 + movapd -4 * SIZE(AA), %xmm3 + addpd %xmm2, %xmm6 + addpd %xmm1, %xmm7 + + movapd -4 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm3, %xmm1 + addpd %xmm1, %xmm4 + movapd -2 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm5 + movapd -2 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm2 + mulpd %xmm3, %xmm1 + movapd 8 * SIZE(AA), %xmm3 + addpd %xmm2, %xmm6 + addpd %xmm1, %xmm7 + movapd 0 * SIZE(BB), %xmm1 + + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movapd 2 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm1 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm6 + addpd %xmm1, %xmm7 + + movapd 4 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movapd 6 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + addpd %xmm0, %xmm5 + movapd 6 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm1 + movapd 16 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm6 + addpd %xmm1, %xmm7 + + movapd 8 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm3, %xmm1 + addpd %xmm1, %xmm4 + movapd 10 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm5 + movapd 10 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm2 + mulpd %xmm3, %xmm1 + addpd %xmm2, %xmm6 + movapd 12 * SIZE(AA), %xmm3 + addpd %xmm1, %xmm7 + + movapd 12 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm3, %xmm1 + addpd %xmm1, %xmm4 + movapd 14 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm5 + movapd 14 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm2 + mulpd %xmm3, %xmm1 + subl $-32 * SIZE, BB + movapd 24 * SIZE(AA), %xmm3 + subl $-32 * SIZE, AA + addpd %xmm2, %xmm6 + addpd %xmm1, %xmm7 + movapd -16 * SIZE(BB), %xmm1 + + 
decl %eax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + BRANCH + je .L18 + ALIGN_4 + +.L16: + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movapd -14 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + addpd %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm1 + movapd -12 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm6 + addpd %xmm1, %xmm7 + movapd -12 * SIZE(BB), %xmm1 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal 16 * SIZE + BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd %xmm6, %xmm1 + unpcklpd %xmm7, %xmm6 + unpckhpd %xmm7, %xmm1 + + movapd -16 * SIZE(B), %xmm2 + movapd -14 * SIZE(B), %xmm3 + movapd -12 * SIZE(B), %xmm5 + movapd -10 * SIZE(B), %xmm7 + + subpd %xmm4, %xmm2 + subpd %xmm0, %xmm3 + subpd %xmm6, %xmm5 + subpd %xmm1, %xmm7 +#else + movapd -16 * SIZE(AA), %xmm0 + movapd -14 * SIZE(AA), %xmm1 + movapd -12 * SIZE(AA), %xmm2 + movapd -10 * SIZE(AA), %xmm3 + + subpd %xmm4, %xmm0 + subpd %xmm6, %xmm1 + subpd %xmm5, %xmm2 + subpd %xmm7, %xmm3 +#endif + +#ifdef LN + movddup -1 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm7 + movddup -2 * SIZE(AA), %xmm0 + mulpd %xmm7, %xmm0 + subpd %xmm0, %xmm5 + movddup -3 * SIZE(AA), %xmm0 + mulpd %xmm7, %xmm0 + subpd %xmm0, %xmm3 + movddup -4 * SIZE(AA), %xmm0 + mulpd %xmm7, %xmm0 + subpd %xmm0, %xmm2 + + movddup -6 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm5 + movddup -7 * SIZE(AA), %xmm0 + mulpd %xmm5, %xmm0 + subpd %xmm0, %xmm3 + movddup -8 * SIZE(AA), %xmm0 + mulpd %xmm5, %xmm0 + subpd %xmm0, %xmm2 + + movddup -11 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + movddup -12 * SIZE(AA), %xmm0 + mulpd %xmm3, %xmm0 + subpd %xmm0, %xmm2 + + movddup -16 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef LT + movddup -16 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + movddup -15 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm0 + subpd %xmm0, %xmm3 + movddup -14 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm0 + subpd %xmm0, %xmm5 + movddup -13 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm0 + subpd %xmm0, %xmm7 + + movddup -11 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + movddup -10 * SIZE(AA), %xmm0 + mulpd %xmm3, %xmm0 + subpd %xmm0, %xmm5 + movddup -9 * SIZE(AA), %xmm0 + mulpd %xmm3, %xmm0 + subpd %xmm0, %xmm7 + + movddup -6 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm5 + movddup -5 * SIZE(AA), %xmm0 + mulpd %xmm5, %xmm0 + subpd %xmm0, %xmm7 + + movddup -1 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm7 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 + + movddup -15 * SIZE(B), %xmm4 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm2 + mulpd %xmm1, %xmm5 + subpd %xmm5, %xmm3 + + movddup -13 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm3 +#endif + +#ifdef RT + movddup -13 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm3 + + movddup -14 * SIZE(B), %xmm4 + movapd %xmm4, %xmm5 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm0 + mulpd %xmm3, %xmm5 + subpd %xmm5, %xmm1 + + movddup -16 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) 
+ movsd %xmm3, 1 * SIZE(CO1) + movsd %xmm5, 2 * SIZE(CO1) + movsd %xmm7, 3 * SIZE(CO1) + + movhpd %xmm2, 0 * SIZE(CO1, LDC) + movhpd %xmm3, 1 * SIZE(CO1, LDC) + movhpd %xmm5, 2 * SIZE(CO1, LDC) + movhpd %xmm7, 3 * SIZE(CO1, LDC) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO1, LDC) + movhpd %xmm2, 1 * SIZE(CO1, LDC) + movsd %xmm3, 2 * SIZE(CO1, LDC) + movhpd %xmm3, 3 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, -16 * SIZE(B) + movapd %xmm3, -14 * SIZE(B) + movapd %xmm5, -12 * SIZE(B) + movapd %xmm7, -10 * SIZE(B) + + movddup %xmm2, %xmm0 + movddup %xmm3, %xmm1 + movddup %xmm5, %xmm4 + movddup %xmm7, %xmm6 + + unpckhpd %xmm2, %xmm2 + unpckhpd %xmm3, %xmm3 + unpckhpd %xmm5, %xmm5 + unpckhpd %xmm7, %xmm7 + + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm2, -14 * SIZE(BB) + movapd %xmm1, -12 * SIZE(BB) + movapd %xmm3, -10 * SIZE(BB) + movapd %xmm4, -8 * SIZE(BB) + movapd %xmm5, -6 * SIZE(BB) + movapd %xmm6, -4 * SIZE(BB) + movapd %xmm7, -2 * SIZE(BB) +#else + movapd %xmm0, -16 * SIZE(AA) + movapd %xmm1, -14 * SIZE(AA) + movapd %xmm2, -12 * SIZE(AA) + movapd %xmm3, -10 * SIZE(AA) +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA +#ifdef LT + addl $8 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L10 + ALIGN_2 + +.L30: + movl M, %ebx + testl $2, %ebx + jle .L50 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal 16 * SIZE + BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movapd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -8 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movapd -8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L32 + +.L31: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BB), %xmm0 + addpd %xmm1, %xmm4 + movapd -12 * SIZE(BB), %xmm1 + addpd %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm1 + mulpd -10 * SIZE(BB), %xmm0 + addpd %xmm1, %xmm6 + movapd 0 * SIZE(BB), %xmm1 + addpd %xmm0, %xmm7 + movapd -12 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd -6 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd -4 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movapd -10 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd -2 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 8 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm7 + movapd 0 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm1 + mulpd 2 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm4 + movapd 4 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm5 + movapd -6 * SIZE(AA), %xmm2 + mulpd %xmm2, %xmm1 + mulpd 6 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm6 + movapd 16 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm7 + movapd -4 * SIZE(AA), %xmm2 + mulpd %xmm2, %xmm3 + mulpd 10 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm4 + movapd 12 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm5 + movapd -2 * SIZE(AA), %xmm2 + mulpd %xmm2, %xmm3 + mulpd 14 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm6 + movapd 
24 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm7 + movapd 8 * SIZE(AA), %xmm2 + + subl $-16 * SIZE, AA + addl $ 32 * SIZE, BB + decl %eax + jne .L31 + +.L32: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L34 + +.L33: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BB), %xmm0 + addpd %xmm1, %xmm4 + movapd -12 * SIZE(BB), %xmm1 + addpd %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L33 + ALIGN_4 + +.L34: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal 16 * SIZE + BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd -16 * SIZE(B), %xmm2 + movapd -14 * SIZE(B), %xmm3 + + subpd %xmm4, %xmm2 + subpd %xmm0, %xmm3 +#else + movapd -16 * SIZE(AA), %xmm0 + movapd -14 * SIZE(AA), %xmm1 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 +#endif + +#ifdef LN + movddup -13 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + movddup -14 * SIZE(AA), %xmm0 + mulpd %xmm3, %xmm0 + subpd %xmm0, %xmm2 + movddup -16 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef LT + movddup -16 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + movddup -15 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm0 + subpd %xmm0, %xmm3 + movddup -13 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + movddup -15 * SIZE(B), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm1 + movddup -13 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm1 +#endif + +#ifdef RT + movddup -13 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm1 + movddup -14 * SIZE(B), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm0 + movddup -16 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movsd %xmm3, 1 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO1, LDC) + movhpd %xmm3, 1 * SIZE(CO1, LDC) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC) + movhpd %xmm1, 1 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, -16 * SIZE(B) + movapd %xmm3, -14 * SIZE(B) + + movddup %xmm2, %xmm0 + movddup %xmm3, %xmm1 + + unpckhpd %xmm2, %xmm2 + unpckhpd %xmm3, %xmm3 + + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm2, -14 * SIZE(BB) + movapd %xmm1, -12 * SIZE(BB) + movapd %xmm3, -10 * SIZE(BB) +#else + movapd %xmm0, -16 * SIZE(AA) + movapd %xmm1, -14 * SIZE(AA) +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L50: + movl M, %ebx + testl $1, %ebx + jle .L99 + +#ifdef LN + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA +#endif + + leal 16 * SIZE + BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + 
+ movsd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movsd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movsd -12 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movsd -8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L52 + +.L51: + mulsd %xmm0, %xmm1 + mulsd -14 * SIZE(BB), %xmm0 + addsd %xmm1, %xmm4 + movsd -12 * SIZE(BB), %xmm1 + addsd %xmm0, %xmm5 + movsd -15 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm1 + mulsd -10 * SIZE(BB), %xmm0 + addsd %xmm1, %xmm6 + movsd 0 * SIZE(BB), %xmm1 + addsd %xmm0, %xmm7 + movsd -14 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + mulsd -6 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm4 + movsd -4 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm5 + movsd -13 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + mulsd -2 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm6 + movsd 8 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm7 + movsd -8 * SIZE(AA), %xmm0 + mulsd %xmm2, %xmm1 + mulsd 2 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm4 + movsd 4 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm5 + movsd -11 * SIZE(AA), %xmm2 + mulsd %xmm2, %xmm1 + mulsd 6 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm6 + movsd 16 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm7 + movsd -10 * SIZE(AA), %xmm2 + mulsd %xmm2, %xmm3 + mulsd 10 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm4 + movsd 12 * SIZE(BB), %xmm3 + addsd %xmm2, %xmm5 + movsd -9 * SIZE(AA), %xmm2 + mulsd %xmm2, %xmm3 + mulsd 14 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm6 + movsd 24 * SIZE(BB), %xmm3 + addsd %xmm2, %xmm7 + movsd -4 * SIZE(AA), %xmm2 + + subl $-8 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L51 + +.L52: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L54 + +.L53: + mulsd %xmm0, %xmm1 + mulsd -14 * SIZE(BB), %xmm0 + addsd %xmm1, %xmm4 + movsd -12 * SIZE(BB), %xmm1 + addsd %xmm0, %xmm5 + movsd -15 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + BRANCH + jg .L53 + ALIGN_4 + +.L54: + addsd %xmm6, %xmm4 + addsd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal 16 * SIZE + BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(B), %xmm0 + movsd -15 * SIZE(B), %xmm1 +#else + movsd -16 * SIZE(AA), %xmm0 + movsd -15 * SIZE(AA), %xmm1 +#endif + + subsd %xmm4, %xmm0 + subsd %xmm5, %xmm1 + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(AA), %xmm2 + mulsd %xmm2, %xmm0 + mulsd %xmm2, %xmm1 +#endif + +#ifdef RN + mulsd -16 * SIZE(B), %xmm0 + movsd -15 * SIZE(B), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + mulsd -13 * SIZE(B), %xmm1 +#endif + +#ifdef RT + mulsd -13 * SIZE(B), %xmm1 + movsd -14 * SIZE(B), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + mulsd -16 * SIZE(B), %xmm0 +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC) + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, -16 * SIZE(B) + movsd %xmm1, -15 * SIZE(B) + + movsd %xmm0, -16 * SIZE(BB) + movsd %xmm0, -15 * SIZE(BB) + movsd %xmm1, -14 * SIZE(BB) + movsd %xmm1, -13 * SIZE(BB) +#else + movsd %xmm0, -16 * SIZE(AA) + movsd %xmm1, -15 * SIZE(AA) +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA +#ifdef LT + addl $2 * SIZE, B 
+#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $0 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L99: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_2 + +.L100: + movl N, %eax + testl $1, %eax + jle .L999 + ALIGN_2 + +.L101: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal 16 * SIZE + BUFFER, BB + +#ifdef RT + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + leal (, %eax, SIZE), %eax + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + jle .L103 + ALIGN_4 + +.L102: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + movddup -14 * SIZE(B), %xmm2 + movddup -13 * SIZE(B), %xmm3 + movddup -12 * SIZE(B), %xmm4 + movddup -11 * SIZE(B), %xmm5 + movddup -10 * SIZE(B), %xmm6 + movddup -9 * SIZE(B), %xmm7 + + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm1, -14 * SIZE(BB) + movapd %xmm2, -12 * SIZE(BB) + movapd %xmm3, -10 * SIZE(BB) + movapd %xmm4, -8 * SIZE(BB) + movapd %xmm5, -6 * SIZE(BB) + movapd %xmm6, -4 * SIZE(BB) + movapd %xmm7, -2 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $16 * SIZE, %ecx + decl %eax + BRANCH + jne .L102 + ALIGN_2 + +.L103: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + BRANCH + jle .L105 + ALIGN_2 + +.L104: + movddup -16 * SIZE(B), %xmm0 + + movapd %xmm0, -16 * SIZE(BB) + addl $1 * SIZE, B + addl $2 * SIZE, BB + decl %eax + jne .L104 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 # coffset = c +#ifndef RT + addl LDC, C +#endif + + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L130 + ALIGN_4 + +.L110: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + leal 16 * SIZE + BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $0 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movapd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -8 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movapd -8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L112 + +.L111: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AA), %xmm1 + addpd %xmm0, %xmm4 + movapd -12 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm6 + movapd -14 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + mulpd -10 * SIZE(AA), %xmm1 + addpd %xmm0, %xmm5 + movapd 0 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm7 + movapd -12 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm2 + mulpd -6 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm4 + movapd -4 * SIZE(AA), %xmm2 + addpd %xmm1, %xmm6 + movapd -10 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm2 + mulpd -2 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm5 + movapd 8 * SIZE(AA), 
%xmm2 + addpd %xmm1, %xmm7 + movapd 0 * SIZE(BB), %xmm1 + mulpd %xmm3, %xmm0 + mulpd 2 * SIZE(AA), %xmm3 + addpd %xmm0, %xmm4 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm6 + movapd -6 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm0 + mulpd 6 * SIZE(AA), %xmm3 + addpd %xmm0, %xmm5 + movapd 16 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm7 + movapd -4 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm2 + mulpd 10 * SIZE(AA), %xmm3 + addpd %xmm2, %xmm4 + movapd 12 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm6 + movapd -2 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm2 + mulpd 14 * SIZE(AA), %xmm3 + addpd %xmm2, %xmm5 + movapd 24 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm7 + movapd 8 * SIZE(BB), %xmm3 + + addl $ 32 * SIZE, AA + subl $-16 * SIZE, BB + decl %eax + jne .L111 + +.L112: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L114 + +.L113: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AA), %xmm1 + addpd %xmm0, %xmm4 + movapd -12 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm6 + movapd -14 * SIZE(BB), %xmm1 + + addl $4 * SIZE, AA + addl $2 * SIZE, BB + subl $1, %eax + jg .L113 + ALIGN_4 + +.L114: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal 16 * SIZE + BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm0 + movapd -14 * SIZE(B), %xmm1 +#else + movapd -16 * SIZE(AA), %xmm0 + movapd -14 * SIZE(AA), %xmm1 +#endif + + subpd %xmm4, %xmm0 + subpd %xmm6, %xmm1 + +#ifdef LN + movapd %xmm0, %xmm2 + unpckhpd %xmm2, %xmm2 + + movapd %xmm1, %xmm3 + unpckhpd %xmm3, %xmm3 + + movsd -1 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm3 + + movsd -2 * SIZE(AA), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm1 + movsd -3 * SIZE(AA), %xmm6 + mulsd %xmm3, %xmm6 + subsd %xmm6, %xmm2 + movsd -4 * SIZE(AA), %xmm7 + mulsd %xmm3, %xmm7 + subsd %xmm7, %xmm0 + + movsd -6 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm1 + + movsd -7 * SIZE(AA), %xmm5 + mulsd %xmm1, %xmm5 + subsd %xmm5, %xmm2 + movsd -8 * SIZE(AA), %xmm6 + mulsd %xmm1, %xmm6 + subsd %xmm6, %xmm0 + + movsd -11 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + + movsd -12 * SIZE(AA), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + + movsd -16 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + unpcklpd %xmm2, %xmm0 + unpcklpd %xmm3, %xmm1 +#endif + +#ifdef LT + movapd %xmm0, %xmm2 + unpckhpd %xmm2, %xmm2 + + movapd %xmm1, %xmm3 + unpckhpd %xmm3, %xmm3 + + movsd -16 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + movsd -15 * SIZE(AA), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + movsd -14 * SIZE(AA), %xmm6 + mulsd %xmm0, %xmm6 + subsd %xmm6, %xmm1 + movsd -13 * SIZE(AA), %xmm7 + mulsd %xmm0, %xmm7 + subsd %xmm7, %xmm3 + + movsd -11 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + movsd -10 * SIZE(AA), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm1 + movsd -9 * SIZE(AA), %xmm6 + mulsd %xmm2, %xmm6 + subsd %xmm6, %xmm3 + + movsd -6 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm1 + movsd -5 * SIZE(AA), %xmm5 + mulsd %xmm1, %xmm5 + subsd %xmm5, %xmm3 + + movsd -1 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm3 + + unpcklpd %xmm2, %xmm0 + unpcklpd %xmm3, %xmm1 +#endif + +#if defined(RN) || defined(RT) + movddup -16 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd 
%xmm1, 3 * SIZE(CO1) + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, -16 * SIZE(B) + movapd %xmm1, -14 * SIZE(B) + + movddup %xmm0, %xmm2 + movddup %xmm1, %xmm3 + + unpckhpd %xmm0, %xmm0 + unpckhpd %xmm1, %xmm1 + + movapd %xmm2, -16 * SIZE(BB) + movapd %xmm0, -14 * SIZE(BB) + movapd %xmm3, -12 * SIZE(BB) + movapd %xmm1, -10 * SIZE(BB) +#else + movapd %xmm0, -16 * SIZE(AA) + movapd %xmm1, -14 * SIZE(AA) +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + BRANCH + decl %ebx # i -- + jg .L110 + ALIGN_2 + +.L130: + movl M, %ebx + testl $2, %ebx + jle .L150 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal 16 * SIZE + BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $0 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movapd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -8 * SIZE(AA), %xmm2 + movapd -8 * SIZE(BB), %xmm3 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L132 + +.L131: + mulpd %xmm0, %xmm1 + movapd -14 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm4 + movapd -14 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm1 + movapd -12 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm5 + movapd -12 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm1 + movapd -10 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm4 + movapd -10 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm1 + movapd 0 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm5 + movapd 0 * SIZE(BB), %xmm1 + mulpd %xmm2, %xmm3 + movapd -6 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm4 + movapd -6 * SIZE(BB), %xmm3 + mulpd %xmm2, %xmm3 + movapd -4 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm5 + movapd -4 * SIZE(BB), %xmm3 + mulpd %xmm2, %xmm3 + movapd -2 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm4 + movapd -2 * SIZE(BB), %xmm3 + mulpd %xmm2, %xmm3 + movapd 8 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + + subl $-16 * SIZE, AA + subl $-16 * SIZE, BB + BRANCH + decl %eax + jne .L131 + +.L132: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L134 + +.L133: + mulpd %xmm0, %xmm1 + movapd -14 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm4 + movapd -14 * SIZE(BB), %xmm1 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L133 + ALIGN_4 + +.L134: + addpd %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal 16 * SIZE + BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm0 +#else + movapd -16 * SIZE(AA), %xmm0 +#endif + + subpd %xmm4, %xmm0 + +#ifdef LN + movapd %xmm0, %xmm2 + unpckhpd %xmm2, %xmm2 + + movsd -13 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + + movsd -14 * SIZE(AA), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + + movsd -16 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + unpcklpd %xmm2, %xmm0 +#endif + +#ifdef LT + movapd 
%xmm0, %xmm2 + unpckhpd %xmm2, %xmm2 + + movsd -16 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + movsd -15 * SIZE(AA), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + movsd -13 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm2, %xmm0 +#endif + +#if defined(RN) || defined(RT) + movddup -16 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, -16 * SIZE(B) + + movddup %xmm0, %xmm1 + unpckhpd %xmm0, %xmm0 + + movapd %xmm1, -16 * SIZE(BB) + movapd %xmm0, -14 * SIZE(BB) +#else + movapd %xmm0, -16 * SIZE(AA) +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L150: + movl M, %ebx + testl $1, %ebx + jle .L159 + +#ifdef LN + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA +#endif + + leal 16 * SIZE + BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $0 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movsd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movsd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movsd -8 * SIZE(BB), %xmm3 + movsd -12 * SIZE(AA), %xmm2 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L152 + +.L151: + mulsd %xmm0, %xmm1 + movsd -15 * SIZE(AA), %xmm0 + addsd %xmm1, %xmm4 + movsd -14 * SIZE(BB), %xmm1 + mulsd %xmm0, %xmm1 + movsd -14 * SIZE(AA), %xmm0 + addsd %xmm1, %xmm5 + movsd -12 * SIZE(BB), %xmm1 + mulsd %xmm0, %xmm1 + movsd -13 * SIZE(AA), %xmm0 + addsd %xmm1, %xmm4 + movsd -10 * SIZE(BB), %xmm1 + mulsd %xmm0, %xmm1 + movsd -8 * SIZE(AA), %xmm0 + addsd %xmm1, %xmm5 + movsd -0 * SIZE(BB), %xmm1 + mulsd %xmm2, %xmm3 + movsd -11 * SIZE(AA), %xmm2 + addsd %xmm3, %xmm4 + movsd -6 * SIZE(BB), %xmm3 + mulsd %xmm2, %xmm3 + movsd -10 * SIZE(AA), %xmm2 + addsd %xmm3, %xmm5 + movsd -4 * SIZE(BB), %xmm3 + mulsd %xmm2, %xmm3 + movsd -9 * SIZE(AA), %xmm2 + addsd %xmm3, %xmm4 + movsd -2 * SIZE(BB), %xmm3 + mulsd %xmm2, %xmm3 + movsd -4 * SIZE(AA), %xmm2 + addsd %xmm3, %xmm5 + movsd 8 * SIZE(BB), %xmm3 + + subl $ -8 * SIZE, AA + subl $-16 * SIZE, BB + BRANCH + decl %eax + jne .L151 + +.L152: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L154 + +.L153: + mulsd %xmm0, %xmm1 + movsd -15 * SIZE(AA), %xmm0 + addsd %xmm1, %xmm4 + movsd -14 * SIZE(BB), %xmm1 + + addl $1 * SIZE, AA # aoffset += 8 + addl $2 * SIZE, BB # boffset1 += 8 + decl %eax + BRANCH + jg .L153 + ALIGN_4 + +.L154: + addsd %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax + subl $1, %eax + + movl AORIG, AA + movl BORIG, B + leal 16 * SIZE + BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(B), %xmm0 +#else + movsd -16 * SIZE(AA), %xmm0 +#endif + + subsd %xmm4, %xmm0 + +#if defined(LN) || defined(LT) + mulsd -16 * SIZE(AA), %xmm0 +#endif + +#if defined(RN) || 
defined(RT) + mulsd -16 * SIZE(B), %xmm0 +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, -16 * SIZE(B) + + movsd %xmm0, -16 * SIZE(BB) + movsd %xmm0, -15 * SIZE(BB) +#else + movsd %xmm0, -16 * SIZE(AA) +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA +#ifdef LT + addl $1 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $0 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L159: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 1), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (B, %eax, 1), B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_2 + +.L999: + movl OLD_STACK, %esp + + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_2 + + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_LT_4x2_sse2.S b/kernel/x86/trsm_kernel_LT_4x2_sse2.S new file mode 100644 index 0000000000..626d75a9b8 --- /dev/null +++ b/kernel/x86/trsm_kernel_LT_4x2_sse2.S @@ -0,0 +1,2280 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if !defined(HAVE_SSE2) || !defined(HAVE_MMX) +#error You have to check your configuration. 
+#endif + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_ALPHA 16 + STACK + ARGS(%esi) +#define STACK_A 24 + STACK + ARGS(%esi) +#define STACK_B 28 + STACK + ARGS(%esi) +#define STACK_C 32 + STACK + ARGS(%esi) +#define STACK_LDC 36 + STACK + ARGS(%esi) +#define STACK_OFFT 40 + STACK + ARGS(%esi) + +#define ALPHA 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define AORIG 56(%esp) +#define BORIG 60(%esp) +#define BUFFER 128(%esp) + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#define B %edi +#define AA %edx +#define BB %ecx +#define LDC %ebp + +#define PREFETCHSIZE (8 * 4) + +#define KERNEL1(address) \ + movq (PREFETCHSIZE + 0) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 0 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 2 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 4 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulpd %xmm0, %xmm2; \ + mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 6 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 16 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + movq (PREFETCHSIZE + 8) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm1, %xmm3; \ + mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 8 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 10 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 12 * SIZE + (address) * SIZE(AA), %xmm1 + +#define KERNEL4(address) \ + mulpd %xmm1, %xmm3; \ + mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 14 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 24 * SIZE + (address) * SIZE(AA), %xmm1 + +#define KERNEL5(address) \ + movq (PREFETCHSIZE + 16) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 18 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 20 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL6(address) \ + mulpd %xmm0, %xmm2; \ + mulpd 22 * 
SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 22 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 32 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 32 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL7(address) \ + movq (PREFETCHSIZE + 24) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm1, %xmm3; \ + mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 26 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 28 * SIZE + (address) * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulpd %xmm1, %xmm3; \ + mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 30 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 40 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 40 * SIZE + (address) * SIZE(AA), %xmm1 + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movd STACK_M, %mm0 + movl STACK_N, %eax + movd STACK_K, %mm1 + movd STACK_A, %mm2 + movl STACK_B, B + movd STACK_C, %mm3 + movl STACK_LDC, LDC + movd STACK_OFFT, %mm4 + + movd %mm1, K + movl %eax, N + movd %mm0, M + movd %mm2, A + movd %mm3, C + movl %esi, OLD_STACK + movd %mm4, OFFSET + movd %mm4, KK + + sall $BASE_SHIFT, LDC + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + sarl $1, %eax + movl %eax, J + jle .L100 + ALIGN_2 + +.L01: +/* Copying to Sub Buffer */ +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, %ecx + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + jle .L03 + ALIGN_2 + +.L02: + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm2 + movsd 3 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm4 + movsd 5 * SIZE(B), %xmm5 + movsd 6 * SIZE(B), %xmm6 + movsd 7 * SIZE(B), %xmm7 + + unpcklpd %xmm0, %xmm0 + unpcklpd %xmm1, %xmm1 + unpcklpd %xmm2, %xmm2 + unpcklpd %xmm3, %xmm3 + unpcklpd %xmm4, %xmm4 + unpcklpd %xmm5, %xmm5 + unpcklpd %xmm6, %xmm6 + unpcklpd %xmm7, %xmm7 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) + movapd %xmm2, 4 * SIZE(%ecx) + movapd %xmm3, 6 * SIZE(%ecx) + 
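+/* The stores above and below complete one unrolled iteration of the .L02
+   copy loop: eight doubles of B, each expanded to {b, b} with unpcklpd, are
+   written into the aligned BUFFER so the main kernel can multiply a single
+   b against two packed elements of A with one mulpd. */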
movapd %xmm4, 8 * SIZE(%ecx) + movapd %xmm5, 10 * SIZE(%ecx) + movapd %xmm6, 12 * SIZE(%ecx) + movapd %xmm7, 14 * SIZE(%ecx) + + prefetcht0 104 * SIZE(B) + + addl $ 8 * SIZE, B + addl $16 * SIZE, %ecx + decl %eax + jne .L02 + ALIGN_2 + +.L03: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax + BRANCH + jle .L05 + ALIGN_4 + +.L04: + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + + unpcklpd %xmm0, %xmm0 + unpcklpd %xmm1, %xmm1 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) + + addl $2 * SIZE, B + addl $4 * SIZE, %ecx + decl %eax + jne .L04 + ALIGN_4 + +.L05: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, %esi # coffset = c +#ifndef RT + addl %eax, C +#endif + + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L30 + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + + prefetcht2 4 * SIZE(%esi) + prefetcht2 4 * SIZE(%esi, LDC) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + +#ifdef PENTIUM4 + andl $-8, %eax + NOBRANCH + je .L12 + sall $3, %eax + +.L1X: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + cmpl $64 * 1, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 1) + KERNEL2(32 * 1) + KERNEL3(32 * 1) + KERNEL4(32 * 1) + KERNEL5(32 * 1) + KERNEL6(32 * 1) + KERNEL7(32 * 1) + KERNEL8(32 * 1) + cmpl $64 * 2, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 2) + KERNEL2(32 * 2) + KERNEL3(32 * 2) + KERNEL4(32 * 2) + KERNEL5(32 * 2) + KERNEL6(32 * 2) + KERNEL7(32 * 2) + KERNEL8(32 * 2) + cmpl $64 * 3, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 3) + KERNEL2(32 * 3) + KERNEL3(32 * 3) + KERNEL4(32 * 3) + KERNEL5(32 * 3) + KERNEL6(32 * 3) + KERNEL7(32 * 3) + KERNEL8(32 * 3) + cmpl $64 * 4, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 4) + KERNEL2(32 * 4) + KERNEL3(32 * 4) + KERNEL4(32 * 4) + KERNEL5(32 * 4) + KERNEL6(32 * 4) + KERNEL7(32 * 4) + KERNEL8(32 * 4) + cmpl $64 * 5, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 5) + KERNEL2(32 * 5) + KERNEL3(32 * 5) + KERNEL4(32 * 5) + KERNEL5(32 * 5) + KERNEL6(32 * 5) + KERNEL7(32 * 5) + KERNEL8(32 * 5) + cmpl $64 * 6, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 6) + KERNEL2(32 * 6) + KERNEL3(32 * 6) + KERNEL4(32 * 6) + KERNEL5(32 * 6) + KERNEL6(32 * 6) + KERNEL7(32 * 6) + KERNEL8(32 * 6) + cmpl $64 * 7, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 7) + KERNEL2(32 * 7) + KERNEL3(32 * 7) + KERNEL4(32 * 7) + KERNEL5(32 * 7) + KERNEL6(32 * 7) + KERNEL7(32 * 7) + KERNEL8(32 * 7) + + addl $64 * 4 * SIZE, AA + addl $64 * 4 * SIZE, BB + subl $64 * 8, %eax + BRANCH + jg .L1X + +.L11: + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB + +#else + sarl $3, %eax + je .L12 + +.L11: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + + addl $32 * SIZE, %ecx + addl 
$32 * SIZE, %edx + decl %eax + jne .L11 +#endif + +.L12: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + +.L13: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 0 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movapd 4 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA # aoffset += 8 + addl $4 * SIZE, BB # boffset1 += 8 + subl $1, %eax + jg .L13 + ALIGN_4 + +.L14: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd %xmm6, %xmm1 + unpcklpd %xmm7, %xmm6 + unpckhpd %xmm7, %xmm1 + + movapd 0 * SIZE(B), %xmm2 + movapd 2 * SIZE(B), %xmm3 + movapd 4 * SIZE(B), %xmm5 + movapd 6 * SIZE(B), %xmm7 + + subpd %xmm4, %xmm2 + subpd %xmm0, %xmm3 + subpd %xmm6, %xmm5 + subpd %xmm1, %xmm7 +#else + movapd 0 * SIZE(AA), %xmm0 + movapd 2 * SIZE(AA), %xmm1 + movapd 4 * SIZE(AA), %xmm2 + movapd 6 * SIZE(AA), %xmm3 + + subpd %xmm4, %xmm0 + subpd %xmm6, %xmm1 + subpd %xmm5, %xmm2 + subpd %xmm7, %xmm3 +#endif + +#ifdef LN + movsd 15 * SIZE(AA), %xmm0 + movhpd 15 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm7 + movsd 14 * SIZE(AA), %xmm0 + movhpd 14 * SIZE(AA), %xmm0 + mulpd %xmm7, %xmm0 + subpd %xmm0, %xmm5 + movsd 13 * SIZE(AA), %xmm0 + movhpd 13 * SIZE(AA), %xmm0 + mulpd %xmm7, %xmm0 + subpd %xmm0, %xmm3 + movsd 12 * SIZE(AA), %xmm0 + movhpd 12 * SIZE(AA), %xmm0 + mulpd %xmm7, %xmm0 + subpd %xmm0, %xmm2 + + movsd 10 * SIZE(AA), %xmm0 + movhpd 10 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm5 + movsd 9 * SIZE(AA), %xmm0 + movhpd 9 * SIZE(AA), %xmm0 + mulpd %xmm5, %xmm0 + subpd %xmm0, %xmm3 + movsd 8 * SIZE(AA), %xmm0 + movhpd 8 * SIZE(AA), %xmm0 + mulpd %xmm5, %xmm0 + subpd %xmm0, %xmm2 + + movsd 5 * SIZE(AA), %xmm0 + movhpd 5 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + movsd 4 * SIZE(AA), %xmm0 + movhpd 4 * SIZE(AA), %xmm0 + mulpd %xmm3, %xmm0 + subpd %xmm0, %xmm2 + + movsd 0 * SIZE(AA), %xmm0 + movhpd 0 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef LT + movsd 0 * SIZE(AA), %xmm0 + movhpd 0 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + movsd 1 * SIZE(AA), %xmm0 + movhpd 1 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm0 + subpd %xmm0, %xmm3 + movsd 2 * SIZE(AA), %xmm0 + movhpd 2 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm0 + subpd %xmm0, %xmm5 + movsd 3 * SIZE(AA), %xmm0 + movhpd 3 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm0 + subpd %xmm0, %xmm7 + + movsd 5 * SIZE(AA), %xmm0 + movhpd 5 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + + movsd 6 * SIZE(AA), %xmm0 + movhpd 6 * SIZE(AA), %xmm0 + mulpd %xmm3, %xmm0 + subpd %xmm0, %xmm5 + movsd 7 * SIZE(AA), %xmm0 + movhpd 7 * SIZE(AA), %xmm0 + mulpd %xmm3, %xmm0 + subpd %xmm0, %xmm7 + + movsd 10 * SIZE(AA), %xmm0 + movhpd 10 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm5 + movsd 11 * SIZE(AA), %xmm0 + movhpd 11 * SIZE(AA), %xmm0 + mulpd %xmm5, %xmm0 + subpd %xmm0, %xmm7 + + movsd 15 * SIZE(AA), %xmm0 + movhpd 15 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm7 +#endif + +#ifdef RN + movsd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 + + movsd 1 * SIZE(B), %xmm4 + movhpd 1 * SIZE(B), %xmm4 + mulpd %xmm0, 
%xmm4 + subpd %xmm4, %xmm2 + movsd 1 * SIZE(B), %xmm4 + movhpd 1 * SIZE(B), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm3 + + movsd 3 * SIZE(B), %xmm4 + movhpd 3 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm3 +#endif + +#ifdef RT + movsd 3 * SIZE(B), %xmm4 + movhpd 3 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm3 + + movsd 2 * SIZE(B), %xmm4 + movhpd 2 * SIZE(B), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm0 + movsd 2 * SIZE(B), %xmm4 + movhpd 2 * SIZE(B), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm1 + + movsd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(B) + movapd %xmm3, 2 * SIZE(B) + movapd %xmm5, 4 * SIZE(B) + movapd %xmm7, 6 * SIZE(B) + + movsd %xmm2, 0 * SIZE(BB) + movsd %xmm2, 1 * SIZE(BB) + movhpd %xmm2, 2 * SIZE(BB) + movhpd %xmm2, 3 * SIZE(BB) + movsd %xmm3, 4 * SIZE(BB) + movsd %xmm3, 5 * SIZE(BB) + movhpd %xmm3, 6 * SIZE(BB) + movhpd %xmm3, 7 * SIZE(BB) + movsd %xmm5, 8 * SIZE(BB) + movsd %xmm5, 9 * SIZE(BB) + movhpd %xmm5, 10 * SIZE(BB) + movhpd %xmm5, 11 * SIZE(BB) + movsd %xmm7, 12 * SIZE(BB) + movsd %xmm7, 13 * SIZE(BB) + movhpd %xmm7, 14 * SIZE(BB) + movhpd %xmm7, 15 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) + movapd %xmm1, 2 * SIZE(AA) + movapd %xmm2, 4 * SIZE(AA) + movapd %xmm3, 6 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, %esi +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(%esi) + movsd %xmm3, 1 * SIZE(%esi) + movsd %xmm5, 2 * SIZE(%esi) + movsd %xmm7, 3 * SIZE(%esi) + + movhpd %xmm2, 0 * SIZE(%esi, LDC) + movhpd %xmm3, 1 * SIZE(%esi, LDC) + movhpd %xmm5, 2 * SIZE(%esi, LDC) + movhpd %xmm7, 3 * SIZE(%esi, LDC) +#else + movsd %xmm0, 0 * SIZE(%esi) + movhpd %xmm0, 1 * SIZE(%esi) + movsd %xmm1, 2 * SIZE(%esi) + movhpd %xmm1, 3 * SIZE(%esi) + + movsd %xmm2, 0 * SIZE(%esi, LDC) + movhpd %xmm2, 1 * SIZE(%esi, LDC) + movsd %xmm3, 2 * SIZE(%esi, LDC) + movhpd %xmm3, 3 * SIZE(%esi, LDC) +#endif + +#ifndef LN + addl $4 * SIZE, %esi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA +#ifdef LT + addl $8 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L10 + ALIGN_2 + +.L30: + movl M, %ebx + testl $2, %ebx + jle .L50 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L32 + +.L31: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movapd 4 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 10 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd 12 * 
SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movapd 6 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 14 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm7 + movapd 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd 18 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + movapd 20 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movapd 10 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm2 + mulpd 22 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + movapd 32 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm7 + movapd 12 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 26 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm4 + movapd 28 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movapd 14 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 30 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm6 + movapd 40 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm7 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + BRANCH + decl %eax + jne .L31 + +.L32: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L34 + +.L33: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA # aoffset += 8 + addl $4 * SIZE, BB # boffset1 += 8 + decl %eax + BRANCH + jg .L33 + ALIGN_4 + +.L34: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd 0 * SIZE(B), %xmm2 + movapd 2 * SIZE(B), %xmm3 + + subpd %xmm4, %xmm2 + subpd %xmm0, %xmm3 +#else + movapd 0 * SIZE(AA), %xmm0 + movapd 2 * SIZE(AA), %xmm1 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 +#endif + +#ifdef LN + movsd 3 * SIZE(AA), %xmm0 + movhpd 3 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + + movsd 2 * SIZE(AA), %xmm0 + movhpd 2 * SIZE(AA), %xmm0 + mulpd %xmm3, %xmm0 + subpd %xmm0, %xmm2 + + movsd 0 * SIZE(AA), %xmm0 + movhpd 0 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef LT + movsd 0 * SIZE(AA), %xmm0 + movhpd 0 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + movsd 1 * SIZE(AA), %xmm0 + movhpd 1 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm0 + subpd %xmm0, %xmm3 + movsd 3 * SIZE(AA), %xmm0 + movhpd 3 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 +#endif + +#ifdef RN + movsd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + + movsd 1 * SIZE(B), %xmm4 + movhpd 1 * SIZE(B), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm1 + + movsd 3 * SIZE(B), %xmm4 + movhpd 3 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm1 +#endif + +#ifdef RT + movsd 3 * SIZE(B), %xmm4 + movhpd 3 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm1 + + movsd 2 * SIZE(B), %xmm4 + movhpd 2 * SIZE(B), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm0 + + movsd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(B) + movapd %xmm3, 2 * SIZE(B) + + movsd %xmm2, 0 * SIZE(BB) + movsd %xmm2, 1 * SIZE(BB) + movhpd %xmm2, 2 * SIZE(BB) + movhpd %xmm2, 3 * SIZE(BB) + movsd %xmm3, 4 * SIZE(BB) + movsd %xmm3, 5 * SIZE(BB) + movhpd %xmm3, 6 * SIZE(BB) + movhpd %xmm3, 7 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) + movapd %xmm1, 2 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, %esi +#endif + +#if defined(LN) || defined(LT) + 
movsd %xmm2, 0 * SIZE(%esi) + movsd %xmm3, 1 * SIZE(%esi) + movhpd %xmm2, 0 * SIZE(%esi, LDC) + movhpd %xmm3, 1 * SIZE(%esi, LDC) +#else + movsd %xmm0, 0 * SIZE(%esi) + movhpd %xmm0, 1 * SIZE(%esi) + movsd %xmm1, 0 * SIZE(%esi, LDC) + movhpd %xmm1, 1 * SIZE(%esi, LDC) +#endif + +#ifndef LN + addl $2 * SIZE, %esi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L50: + movl M, %ebx + testl $1, %ebx + jle .L99 + +#ifdef LN + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA +#endif + + leal BUFFER, %ecx + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movsd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movsd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movsd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movsd 4 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L52 + +.L51: + mulsd %xmm0, %xmm2 + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movsd 1 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm2 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movsd 16 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movsd 2 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + mulsd 10 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm4 + movsd 12 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm5 + movsd 3 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + mulsd 14 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm4 + movsd 24 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm5 + movsd 8 * SIZE(AA), %xmm0 + mulsd %xmm1, %xmm2 + mulsd 18 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm4 + movsd 20 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + movsd 5 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm2 + mulsd 22 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm4 + movsd 32 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + movsd 6 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm3 + mulsd 26 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm4 + movsd 28 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm5 + movsd 7 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm3 + mulsd 30 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm4 + movsd 40 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm5 + movsd 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $32 * SIZE, BB + BRANCH + decl %eax + jne .L51 + +.L52: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L54 + +.L53: + mulsd %xmm0, %xmm2 + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movsd 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA # aoffset += 8 + addl $4 * SIZE, BB # boffset1 += 8 + decl %eax + BRANCH + jg .L53 + ALIGN_4 + +.L54: + addsd %xmm6, %xmm4 + addsd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 +#else + movsd 0 * SIZE(AA), %xmm0 + movsd 1 * 
SIZE(AA), %xmm1 +#endif + + subsd %xmm4, %xmm0 + subsd %xmm5, %xmm1 + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(AA), %xmm2 + mulsd %xmm2, %xmm0 + mulsd %xmm2, %xmm1 +#endif + +#ifdef RN + mulsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + mulsd 3 * SIZE(B), %xmm1 +#endif + +#ifdef RT + mulsd 3 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + mulsd 0 * SIZE(B), %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(B) + movsd %xmm1, 1 * SIZE(B) + + movsd %xmm0, 0 * SIZE(BB) + movsd %xmm0, 1 * SIZE(BB) + movsd %xmm1, 2 * SIZE(BB) + movsd %xmm1, 3 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) + movsd %xmm1, 1 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, %esi +#endif + + movsd %xmm0, 0 * SIZE(%esi) + movsd %xmm1, 0 * SIZE(%esi, LDC) + +#ifndef LN + addl $1 * SIZE, %esi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $0 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L99: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_2 + +.L100: + movl N, %eax + testl $1, %eax + jle .L999 + ALIGN_2 + +.L101: +/* Copying to Sub Buffer */ +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, %ecx + +#ifdef RT + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + leal (, %eax, SIZE), %eax + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + jle .L103 + ALIGN_4 + +.L102: + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm2 + movsd 3 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm4 + movsd 5 * SIZE(B), %xmm5 + movsd 6 * SIZE(B), %xmm6 + movsd 7 * SIZE(B), %xmm7 + + unpcklpd %xmm0, %xmm0 + unpcklpd %xmm1, %xmm1 + unpcklpd %xmm2, %xmm2 + unpcklpd %xmm3, %xmm3 + unpcklpd %xmm4, %xmm4 + unpcklpd %xmm5, %xmm5 + unpcklpd %xmm6, %xmm6 + unpcklpd %xmm7, %xmm7 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) + movapd %xmm2, 4 * SIZE(%ecx) + movapd %xmm3, 6 * SIZE(%ecx) + movapd %xmm4, 8 * SIZE(%ecx) + movapd %xmm5, 10 * SIZE(%ecx) + movapd %xmm6, 12 * SIZE(%ecx) + movapd %xmm7, 14 * SIZE(%ecx) + + prefetcht0 104 * SIZE(B) + + addl $ 8 * SIZE, B + addl $16 * SIZE, %ecx + decl %eax + BRANCH + jne .L102 + ALIGN_2 + +.L103: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + BRANCH + jle .L105 + ALIGN_2 + +.L104: + movsd 0 * SIZE(B), %xmm0 + + unpcklpd %xmm0, %xmm0 + + movapd %xmm0, 0 * SIZE(%ecx) + + addl $1 * SIZE, B + addl $2 * SIZE, %ecx + decl %eax + jne .L104 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, %esi # coffset = c +#ifndef RT + addl LDC, C +#endif + + movl M, %ebx + sarl $2, %ebx # 
i = (m >> 2) + jle .L130 + ALIGN_4 + +.L110: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $0 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L112 + +.L111: + mulpd %xmm2, %xmm0 + mulpd 2 * SIZE(AA), %xmm2 + addpd %xmm0, %xmm4 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm6 + movapd 2 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm0 + mulpd 6 * SIZE(AA), %xmm2 + addpd %xmm0, %xmm5 + movapd 16 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movapd 4 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm1 + mulpd 10 * SIZE(AA), %xmm2 + addpd %xmm1, %xmm4 + movapd 12 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm6 + movapd 6 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm1 + mulpd 14 * SIZE(AA), %xmm2 + addpd %xmm1, %xmm5 + movapd 24 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm7 + movapd 16 * SIZE(BB), %xmm2 + mulpd %xmm3, %xmm0 + mulpd 18 * SIZE(AA), %xmm3 + addpd %xmm0, %xmm4 + movapd 20 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm6 + movapd 10 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm0 + mulpd 22 * SIZE(AA), %xmm3 + addpd %xmm0, %xmm5 + movapd 32 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm7 + movapd 12 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm1 + mulpd 26 * SIZE(AA), %xmm3 + addpd %xmm1, %xmm4 + movapd 28 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm6 + movapd 14 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm1 + mulpd 30 * SIZE(AA), %xmm3 + addpd %xmm1, %xmm5 + movapd 40 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm7 + movapd 24 * SIZE(BB), %xmm3 + + addl $32 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L111 + +.L112: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L114 + +.L113: + mulpd %xmm2, %xmm0 + mulpd 2 * SIZE(AA), %xmm2 + addpd %xmm0, %xmm4 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm6 + movapd 2 * SIZE(BB), %xmm2 + + addl $4 * SIZE, AA # aoffset += 8 + addl $2 * SIZE, BB # boffset1 += 8 + subl $1, %eax + jg .L113 + ALIGN_4 + +.L114: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm0 + movapd 2 * SIZE(B), %xmm1 +#else + movapd 0 * SIZE(AA), %xmm0 + movapd 2 * SIZE(AA), %xmm1 +#endif + + subpd %xmm4, %xmm0 + subpd %xmm6, %xmm1 + +#ifdef LN + movapd %xmm0, %xmm2 + unpckhpd %xmm2, %xmm2 + + movapd %xmm1, %xmm3 + unpckhpd %xmm3, %xmm3 + + movsd 15 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm3 + + movsd 14 * SIZE(AA), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm1 + movsd 13 * SIZE(AA), %xmm6 + mulsd %xmm3, %xmm6 + subsd %xmm6, %xmm2 + movsd 12 * SIZE(AA), %xmm7 + mulsd %xmm3, %xmm7 + subsd %xmm7, %xmm0 + + movsd 10 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm1 + + movsd 9 * SIZE(AA), %xmm5 + mulsd %xmm1, %xmm5 + subsd %xmm5, %xmm2 + movsd 8 * SIZE(AA), %xmm6 + mulsd %xmm1, %xmm6 + subsd %xmm6, %xmm0 + + movsd 5 * SIZE(AA), %xmm4 + 
mulsd %xmm4, %xmm2 + + movsd 4 * SIZE(AA), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + unpcklpd %xmm2, %xmm0 + unpcklpd %xmm3, %xmm1 +#endif + +#ifdef LT + movapd %xmm0, %xmm2 + unpckhpd %xmm2, %xmm2 + + movapd %xmm1, %xmm3 + unpckhpd %xmm3, %xmm3 + + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 1 * SIZE(AA), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + movsd 2 * SIZE(AA), %xmm6 + mulsd %xmm0, %xmm6 + subsd %xmm6, %xmm1 + movsd 3 * SIZE(AA), %xmm7 + mulsd %xmm0, %xmm7 + subsd %xmm7, %xmm3 + + movsd 5 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + + movsd 6 * SIZE(AA), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm1 + movsd 7 * SIZE(AA), %xmm6 + mulsd %xmm2, %xmm6 + subsd %xmm6, %xmm3 + + movsd 10 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm1 + + movsd 11 * SIZE(AA), %xmm5 + mulsd %xmm1, %xmm5 + subsd %xmm5, %xmm3 + + movsd 15 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm3 + + unpcklpd %xmm2, %xmm0 + unpcklpd %xmm3, %xmm1 +#endif + +#if defined(RN) || defined(RT) + movsd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, 0 * SIZE(B) + movapd %xmm1, 2 * SIZE(B) + + movsd %xmm0, 0 * SIZE(BB) + movsd %xmm0, 1 * SIZE(BB) + movhpd %xmm0, 2 * SIZE(BB) + movhpd %xmm0, 3 * SIZE(BB) + movsd %xmm1, 4 * SIZE(BB) + movsd %xmm1, 5 * SIZE(BB) + movhpd %xmm1, 6 * SIZE(BB) + movhpd %xmm1, 7 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) + movapd %xmm1, 2 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, %esi +#endif + + movsd %xmm0, 0 * SIZE(%esi) + movhpd %xmm0, 1 * SIZE(%esi) + movsd %xmm1, 2 * SIZE(%esi) + movhpd %xmm1, 3 * SIZE(%esi) + +#ifndef LN + addl $4 * SIZE, %esi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + BRANCH + decl %ebx # i -- + jg .L110 + ALIGN_2 + +.L130: + movl M, %ebx + testl $2, %ebx + jle .L150 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal BUFFER, BB + + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $0 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L132 + +.L131: + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + mulpd 2 * SIZE(BB), %xmm0 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 4 * SIZE(AA), %xmm0 + mulpd 4 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm6 + movapd 6 * SIZE(AA), %xmm0 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm7 + movapd 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm3 + movapd 10 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm4 + mulpd 10 * SIZE(BB), %xmm1 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movapd 12 * SIZE(AA), %xmm1 + mulpd 12 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm6 + movapd 14 * SIZE(AA), %xmm1 + mulpd 14 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm7 + movapd 24 * SIZE(AA), %xmm1 + + 
addl $16 * SIZE, AA + addl $16 * SIZE, BB + BRANCH + decl %eax + jne .L131 + +.L132: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L134 + +.L133: + movapd 0 * SIZE(AA), %xmm0 + mulpd 0 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm4 + + addl $2 * SIZE, AA # aoffset += 8 + addl $2 * SIZE, BB # boffset1 += 8 + decl %eax + BRANCH + jg .L133 + ALIGN_4 + +.L134: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + addpd %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm0 +#else + movapd 0 * SIZE(AA), %xmm0 +#endif + + subpd %xmm4, %xmm0 + +#ifdef LN + movapd %xmm0, %xmm2 + unpckhpd %xmm2, %xmm2 + + movsd 3 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + + movsd 2 * SIZE(AA), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + unpcklpd %xmm2, %xmm0 +#endif + +#ifdef LT + movapd %xmm0, %xmm2 + unpckhpd %xmm2, %xmm2 + + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 1 * SIZE(AA), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + movsd 3 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm2, %xmm0 +#endif + +#if defined(RN) || defined(RT) + movsd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, 0 * SIZE(B) + + movsd %xmm0, 0 * SIZE(BB) + movsd %xmm0, 1 * SIZE(BB) + movhpd %xmm0, 2 * SIZE(BB) + movhpd %xmm0, 3 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, %esi +#endif + + movsd %xmm0, 0 * SIZE(%esi) + movhpd %xmm0, 1 * SIZE(%esi) + +#ifndef LN + addl $2 * SIZE, %esi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L150: + movl M, %ebx + testl $1, %ebx + jle .L159 + +#ifdef LN + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA +#endif + + leal BUFFER, BB + + movsd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movsd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movsd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movsd 4 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $0 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L152 + +.L151: + mulsd %xmm0, %xmm2 + movsd 1 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + mulsd 2 * SIZE(BB), %xmm0 + movsd 16 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + mulsd 4 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm4 + movsd 3 * SIZE(AA), %xmm0 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm4 + movsd 8 * SIZE(AA), %xmm0 + mulsd %xmm1, %xmm3 + movsd 5 * SIZE(AA), %xmm1 + addsd %xmm3, %xmm4 + mulsd 10 * SIZE(BB), %xmm1 + movsd 24 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm4 + movsd 6 * SIZE(AA), %xmm1 + mulsd 12 * SIZE(BB), %xmm1 + 
addsd %xmm1, %xmm4 + movsd 7 * SIZE(AA), %xmm1 + mulsd 14 * SIZE(BB), %xmm1 + addsd %xmm1, %xmm4 + movsd 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $16 * SIZE, BB + BRANCH + decl %eax + jne .L151 + +.L152: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L154 + +.L153: + movsd 0 * SIZE(AA), %xmm0 + mulsd 0 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm4 + + addl $1 * SIZE, AA # aoffset += 8 + addl $2 * SIZE, BB # boffset1 += 8 + decl %eax + BRANCH + jg .L153 + ALIGN_4 + +.L154: + addsd %xmm6, %xmm4 + addsd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax + subl $1, %eax + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(B), %xmm0 +#else + movsd 0 * SIZE(AA), %xmm0 +#endif + + subsd %xmm4, %xmm0 + +#if defined(LN) || defined(LT) + mulsd 0 * SIZE(AA), %xmm0 +#endif + +#if defined(RN) || defined(RT) + mulsd 0 * SIZE(B), %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(B) + + movsd %xmm0, 0 * SIZE(BB) + movsd %xmm0, 1 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, %esi +#endif + + movsd %xmm0, 0 * SIZE(%esi) + +#ifndef LN + addl $1 * SIZE, %esi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA +#ifdef LT + addl $1 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $0 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L159: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 1), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (B, %eax, 1), B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_2 + +.L999: + movl OLD_STACK, %esp + + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_2 + + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_LT_4x4_penryn.S b/kernel/x86/trsm_kernel_LT_4x4_penryn.S new file mode 100644 index 0000000000..11cc104e2f --- /dev/null +++ b/kernel/x86/trsm_kernel_LT_4x4_penryn.S @@ -0,0 +1,3129 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#define A 20 + STACK + ARGS(%esp) +#define ARG_B 24 + STACK + ARGS(%esp) +#define C 28 + STACK + ARGS(%esp) +#define ARG_LDC 32 + STACK + ARGS(%esp) +#define OFFSET 36 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) +#define AORIG 12 + STACK(%esp) + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 21 + 4) +#endif + +#ifdef NEHALEM +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 21 + 4) +#endif + +#ifdef ATOM +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 8 + 4) +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (16 * 2) +#endif + +#define B %edi +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define CO1 %esi + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + + movl OFFSET, %eax +#ifdef RN + negl %eax +#endif + movl %eax, KK + + leal (, LDC, SIZE), LDC + + subl $-32 * SIZE, A + subl $-32 * SIZE, B + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + sarl $2, %eax + movl %eax, J + jle .L40 + +.L10: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 4), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + leal (CO1, LDC, 2), %eax + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + 
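+/* The #ifdef below clears the 4x4 accumulator (xmm4-xmm7) and prefetches
+   what appears to be the C tile this iteration will write; when LN is
+   defined the store pointer walks backwards through C, so the prefetch
+   offsets are negative rather than a few elements ahead. */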
+#ifdef LN + pxor %xmm4, %xmm4 + prefetcht0 -4 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 -4 * SIZE(CO1, LDC) + pxor %xmm6, %xmm6 + prefetcht0 -4 * SIZE(%eax) + pxor %xmm7, %xmm7 + prefetcht0 -4 * SIZE(%eax, LDC) +#else + pxor %xmm4, %xmm4 + prefetcht0 3 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 3 * SIZE(CO1, LDC) + pxor %xmm6, %xmm6 + prefetcht0 3 * SIZE(%eax) + pxor %xmm7, %xmm7 + prefetcht0 3 * SIZE(%eax, LDC) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -24 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -16 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -12 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -8 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -4 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + subl $-32 * SIZE, BB + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + subl $-32 * SIZE, AA + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -32 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -32 * SIZE(AA), %xmm0 + + subl $1, %eax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_4 + +.L16: + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps 
%xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 4), BB +#endif + + addps %xmm3, %xmm6 + addps %xmm2, %xmm7 + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm0 + unpcklps %xmm7, %xmm0 + unpckhps %xmm7, %xmm4 + + movaps %xmm6, %xmm2 + unpcklps %xmm5, %xmm2 + unpckhps %xmm5, %xmm6 + + movaps %xmm0, %xmm1 + movlhps %xmm2, %xmm0 + movhlps %xmm2, %xmm1 + + movaps %xmm6, %xmm7 + movlhps %xmm4, %xmm6 + movhlps %xmm4, %xmm7 + + pshufd $0x39, %xmm1, %xmm2 + pshufd $0x39, %xmm7, %xmm4 + + movaps -32 * SIZE(BB), %xmm1 + movaps -28 * SIZE(BB), %xmm3 + movaps -24 * SIZE(BB), %xmm5 + movaps -20 * SIZE(BB), %xmm7 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm3 + subps %xmm6, %xmm5 + subps %xmm4, %xmm7 +#else + pshufd $0x39, %xmm5, %xmm2 + pshufd $0x4e, %xmm6, %xmm0 + pshufd $0x93, %xmm7, %xmm7 + + movaps %xmm4, %xmm6 + unpcklps %xmm0, %xmm4 + unpckhps %xmm0, %xmm6 + + movaps %xmm2, %xmm1 + unpcklps %xmm7, %xmm2 + unpckhps %xmm7, %xmm1 + + movaps %xmm4, %xmm5 + unpcklps %xmm2, %xmm4 + unpckhps %xmm2, %xmm5 + + movaps %xmm6, %xmm7 + unpcklps %xmm1, %xmm6 + unpckhps %xmm1, %xmm7 + + pshufd $0x93, %xmm5, %xmm5 + pshufd $0x4e, %xmm6, %xmm6 + pshufd $0x39, %xmm7, %xmm7 + + movaps -32 * SIZE(AA), %xmm0 + movaps -28 * SIZE(AA), %xmm1 + movaps -24 * SIZE(AA), %xmm2 + movaps -20 * SIZE(AA), %xmm3 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm1 + subps %xmm6, %xmm2 + subps %xmm7, %xmm3 +#endif + +#ifdef LN + movaps -20 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm7 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm1 + + movaps -24 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm1 + + movaps -28 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm1 + + movaps -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef LT + movaps -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 + + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm7 + + movaps -28 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm7 + + movaps -24 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm7 + + movaps -20 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm7 +#endif + +#ifdef RN + movaps -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm0, 
%xmm7 + subps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm3 + + movaps -28 * SIZE(BB), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm3 + + movaps -24 * SIZE(BB), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm3 + + movaps -20 * SIZE(BB), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm3 +#endif + +#ifdef RT + movaps -20 * SIZE(BB), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm3 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm0 + + movaps -24 * SIZE(BB), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm0 + + movaps -28 * SIZE(BB), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm0 + + movaps -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, -32 * SIZE(BB) + movaps %xmm3, -28 * SIZE(BB) + movaps %xmm5, -24 * SIZE(BB) + movaps %xmm7, -20 * SIZE(BB) +#else + movaps %xmm0, -32 * SIZE(AA) + movaps %xmm1, -28 * SIZE(AA) + movaps %xmm2, -24 * SIZE(AA) + movaps %xmm3, -20 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm0 + + movaps %xmm3, %xmm4 + unpcklps %xmm7, %xmm3 + unpckhps %xmm7, %xmm4 + + movaps %xmm1, %xmm2 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm2 + + movaps %xmm0, %xmm6 + unpcklps %xmm4, %xmm0 + unpckhps %xmm4, %xmm6 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 1) + movhps %xmm2, 2 * SIZE(CO1, LDC, 1) + movlps %xmm0, 0 * SIZE(CO1, LDC, 2) + movhps %xmm0, 2 * SIZE(CO1, LDC, 2) + movlps %xmm6, 0 * SIZE(CO1, %eax, 1) + movhps %xmm6, 2 * SIZE(CO1, %eax, 1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 2 * SIZE(CO1, LDC, 1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 2) + movhps %xmm2, 2 * SIZE(CO1, LDC, 2) + movlps %xmm3, 0 * SIZE(CO1, %eax, 1) + movhps %xmm3, 2 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB +#endif + +#ifdef LN + subl $4, KK +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L11 + ALIGN_4 + +.L20: + testl $2, M + je .L30 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movaps -32 * SIZE(BB), 
%xmm1 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm0, %xmm2 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm4 + pshufd $0xfa, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm6 + + pshufd $0xee, %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm7 + + pshufd $0x44, %xmm0, %xmm2 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm4 + pshufd $0xfa, %xmm1, %xmm3 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm6 + + pshufd $0xee, %xmm0, %xmm2 + movaps -24 * SIZE(AA), %xmm0 + + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm7 + + pshufd $0x44, %xmm0, %xmm2 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm4 + pshufd $0xfa, %xmm1, %xmm3 + movaps -12 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm6 + + pshufd $0xee, %xmm0, %xmm2 + movaps -20 * SIZE(AA), %xmm0 + + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -8 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm7 + + pshufd $0x44, %xmm0, %xmm2 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm4 + pshufd $0xfa, %xmm1, %xmm3 + movaps -4 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm6 + + pshufd $0xee, %xmm0, %xmm2 + movaps -16 * SIZE(AA), %xmm0 + + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps 0 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm7 + + subl $-16 * SIZE, AA + subl $-32 * SIZE, BB + + subl $1, %eax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + pshufd $0x44, %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm4 + pshufd $0xfa, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm6 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 4), BB +#endif + + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + + movhlps %xmm4, %xmm5 + movhlps %xmm6, %xmm7 + + +#if defined(LN) || defined(LT) + unpcklps %xmm6, %xmm4 + unpcklps %xmm7, %xmm5 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps -32 * SIZE(BB), %xmm1 + movaps -28 * SIZE(BB), %xmm3 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm3 +#else + movsd -32 * SIZE(AA), %xmm0 + movsd -30 * SIZE(AA), %xmm1 + movsd -28 * SIZE(AA), %xmm2 + movsd -26 * SIZE(AA), %xmm3 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm1 + subps %xmm6, %xmm2 + subps %xmm7, %xmm3 +#endif + +#ifdef LN + movaps -32 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm1 + + pshufd $0x00, %xmm4, %xmm6 + 
mulps %xmm6, %xmm1 +#endif + +#ifdef LT + movaps -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 + + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm3 + + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm3 +#endif + +#ifdef RN + movaps -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm3 + + movaps -28 * SIZE(BB), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm3 + + movaps -24 * SIZE(BB), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm3 + + movaps -20 * SIZE(BB), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm3 +#endif + +#ifdef RT + movaps -20 * SIZE(BB), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm3 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm0 + + movaps -24 * SIZE(BB), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm0 + + movaps -28 * SIZE(BB), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm0 + + movaps -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, -32 * SIZE(BB) + movaps %xmm3, -28 * SIZE(BB) +#else + movlps %xmm0, -32 * SIZE(AA) + movlps %xmm1, -30 * SIZE(AA) + movlps %xmm2, -28 * SIZE(AA) + movlps %xmm3, -26 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm0 + + movaps %xmm3, %xmm4 + unpcklps %xmm7, %xmm3 + unpckhps %xmm7, %xmm4 + + movaps %xmm1, %xmm2 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm2 + + movaps %xmm0, %xmm6 + unpcklps %xmm4, %xmm0 + unpckhps %xmm4, %xmm6 + + movlps %xmm1, 0 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 1) + movlps %xmm0, 0 * SIZE(CO1, LDC, 2) + movlps %xmm6, 0 * SIZE(CO1, %eax, 1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC, 1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 2) + movlps %xmm3, 0 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L30: + testl $1, M + je .L39 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm6, 
%xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -24 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -20 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -28 * SIZE(AA), %xmm0 + + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -16 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -12 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -26 * SIZE(AA), %xmm0 + + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -8 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -4 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -24 * SIZE(AA), %xmm0 + + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps 0 * SIZE(BB), %xmm1 + + subl $ -8 * SIZE, AA + subl $-32 * SIZE, BB + + subl $1, %eax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + pshufd $0x00, %xmm0, %xmm2 + movss -31 * SIZE(AA), %xmm0 + + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(BB), %xmm1 + + subps %xmm4, %xmm1 +#else + movsd -32 * SIZE(AA), %xmm0 + movhps -30 * SIZE(AA), %xmm0 + + subps %xmm4, %xmm0 + + pshufd $0xff, %xmm0, %xmm3 + pshufd $0xaa, %xmm0, %xmm2 + pshufd $0x55, %xmm0, %xmm1 + pshufd $0x00, %xmm0, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef RN + movaps -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm0, %xmm7 + subss %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm0, %xmm7 + subss %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm0, %xmm7 + subss %xmm7, %xmm3 + + movaps -28 * SIZE(BB), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm1, %xmm7 + subss %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm1, %xmm7 + subss %xmm7, %xmm3 + + movaps -24 * SIZE(BB), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm2, %xmm7 + subss %xmm7, %xmm3 + + movaps -20 * SIZE(BB), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm7, %xmm3 +#endif + +#ifdef RT + movaps -20 * SIZE(BB), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm7, %xmm3 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm3, %xmm7 + subss %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm3, %xmm7 + subss %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm3, %xmm7 + subss %xmm7, %xmm0 + + movaps -24 * SIZE(BB), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm2, %xmm7 + subss %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulss 
%xmm2, %xmm7 + subss %xmm7, %xmm0 + + movaps -28 * SIZE(BB), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm1, %xmm7 + subss %xmm7, %xmm0 + + movaps -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, -32 * SIZE(BB) +#else + movss %xmm0, -32 * SIZE(AA) + movss %xmm1, -31 * SIZE(AA) + movss %xmm2, -30 * SIZE(AA) + movss %xmm3, -29 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm0 + + movaps %xmm3, %xmm4 + unpcklps %xmm7, %xmm3 + unpckhps %xmm7, %xmm4 + + movaps %xmm1, %xmm2 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm2 + + movaps %xmm0, %xmm6 + unpcklps %xmm4, %xmm0 + unpckhps %xmm4, %xmm6 + + movss %xmm1, 0 * SIZE(CO1) + movss %xmm2, 0 * SIZE(CO1, LDC, 1) + movss %xmm0, 0 * SIZE(CO1, LDC, 2) + movss %xmm6, 0 * SIZE(CO1, %eax, 1) +#else + movss %xmm0, 0 * SIZE(CO1) + movss %xmm1, 0 * SIZE(CO1, LDC, 1) + movss %xmm2, 0 * SIZE(CO1, LDC, 2) + movss %xmm3, 0 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L39: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 4), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $4, KK +#endif + +#ifdef RT + subl $4, KK +#endif + + decl J # j -- + jg .L10 + ALIGN_4 + +.L40: + testl $2, N + je .L80 + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + +#ifdef LN + pxor %xmm4, %xmm4 + prefetcht0 -4 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 -4 * SIZE(CO1, LDC) +#else + pxor %xmm4, %xmm4 + prefetcht0 3 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 3 * SIZE(CO1, LDC) +#endif + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -28 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0xff, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -24 * 
SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -20 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0xff, %xmm1, %xmm3 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -16 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -12 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0xff, %xmm1, %xmm3 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -8 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -4 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0xff, %xmm1, %xmm3 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps 0 * SIZE(AA), %xmm0 + + subl $-32 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -28 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 2), BB +#endif + + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + addps %xmm2, %xmm4 + addps %xmm3, %xmm5 + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm0 + unpcklps %xmm6, %xmm4 + unpckhps %xmm6, %xmm0 + + movaps %xmm5, %xmm1 + unpcklps %xmm7, %xmm5 + unpckhps %xmm7, %xmm1 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movsd -32 * SIZE(BB), %xmm1 + movsd -30 * SIZE(BB), %xmm3 + movsd -28 * SIZE(BB), %xmm5 + movsd -26 * SIZE(BB), %xmm7 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm3 + subps %xmm0, %xmm5 + subps %xmm2, %xmm7 +#else + movaps -32 * SIZE(AA), %xmm0 + movaps -28 * SIZE(AA), %xmm1 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm1 +#endif + +#ifdef LN + movaps -20 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm7 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm1 + + movaps -24 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm1 + + movaps -28 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm1 + + movaps -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef LT + movaps -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps 
%xmm6, %xmm1 + + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm7 + + movaps -28 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm7 + + movaps -24 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm7 + + movaps -20 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm7 +#endif + +#ifdef RN + movaps -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm1 + + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm1 +#endif + +#ifdef RT + movaps -32 * SIZE(BB), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm0 + + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, -32 * SIZE(BB) + movlps %xmm3, -30 * SIZE(BB) + movlps %xmm5, -28 * SIZE(BB) + movlps %xmm7, -26 * SIZE(BB) +#else + movaps %xmm0, -32 * SIZE(AA) + movaps %xmm1, -28 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm5, %xmm1 + unpcklps %xmm7, %xmm3 + + movaps %xmm1, %xmm2 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm2 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 1) + movhps %xmm2, 2 * SIZE(CO1, LDC, 1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 2 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $4, KK +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L51 + ALIGN_4 + +.L60: + testl $2, M + je .L70 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm3, %xmm3 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L65 + ALIGN_4 + +.L62: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm0, %xmm2 + addps %xmm3, %xmm4 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + + pshufd $0xee, %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + + pshufd $0x44, %xmm0, %xmm2 + addps %xmm3, %xmm4 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + + pshufd $0xee, %xmm0, %xmm2 + movaps -24 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + + pshufd $0x44, %xmm0, %xmm2 + addps %xmm3, %xmm4 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, 
%xmm3 + + pshufd $0xee, %xmm0, %xmm2 + movaps -20 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + + pshufd $0x44, %xmm0, %xmm2 + addps %xmm3, %xmm4 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + + pshufd $0xee, %xmm0, %xmm2 + movaps -16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + + subl $-16 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L62 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + pshufd $0x44, %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + addps %xmm3, %xmm4 + pshufd $0x50, %xmm1, %xmm3 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L66 + ALIGN_4 + +.L68: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), BB +#endif + + addps %xmm3, %xmm4 + addps %xmm5, %xmm4 + + movhlps %xmm4, %xmm5 + +#if defined(LN) || defined(LT) + unpcklps %xmm6, %xmm4 + unpcklps %xmm7, %xmm5 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movsd -32 * SIZE(BB), %xmm1 + movsd -30 * SIZE(BB), %xmm3 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm3 +#else + movsd -32 * SIZE(AA), %xmm0 + movsd -30 * SIZE(AA), %xmm1 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm1 +#endif + +#ifdef LN + movaps -32 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm1 + + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef LT + movaps -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm3 + + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm3 +#endif + +#ifdef RN + movaps -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm1 + + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm1 +#endif + +#ifdef RT + movaps -32 * SIZE(BB), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm0 + + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, -32 * SIZE(BB) + movlps %xmm3, -30 * SIZE(BB) +#else + movlps %xmm0, -32 * SIZE(AA) + movlps %xmm1, -30 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm3, %xmm1 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 0 * SIZE(CO1, LDC) +#else + movlps %xmm0, 0 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L70: + testl $1, M + je .L79 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, 
%eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movsd -32 * SIZE(BB), %xmm1 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movsd -30 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm5 + movsd -28 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movsd -26 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -28 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm5 + movsd -24 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movsd -22 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -26 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm5 + movsd -20 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movsd -18 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -24 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm5 + movsd -16 * SIZE(BB), %xmm1 + + subl $ -8 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + pshufd $0x00, %xmm0, %xmm2 + movss -31 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movsd -30 * SIZE(BB), %xmm1 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), BB +#endif + + addps %xmm5, %xmm4 + + pshufd $0x55, %xmm4, %xmm5 + pshufd $0x00, %xmm4, %xmm4 + +#if defined(LN) || defined(LT) + unpcklps %xmm5, %xmm4 + + movsd -32 * SIZE(BB), %xmm1 + + subps %xmm4, %xmm1 +#else + movss -32 * SIZE(AA), %xmm0 + movss -31 * SIZE(AA), %xmm1 + + subss %xmm4, %xmm0 + subss %xmm5, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movss -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef RN + movaps -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm0, %xmm7 + subss %xmm7, %xmm1 + + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm7, %xmm1 +#endif + +#ifdef RT + movaps -32 * SIZE(BB), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm1, %xmm7 + subss %xmm7, %xmm0 + + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, -32 * SIZE(BB) +#else + movss %xmm0, -32 * SIZE(AA) + movss %xmm1, -31 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + pshufd $1, %xmm1, %xmm3 + + movss %xmm1, 0 * SIZE(CO1) + movss %xmm3, 0 * SIZE(CO1, LDC) +#else + movss %xmm0, 0 * SIZE(CO1) + movss %xmm1, 0 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall 
$BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L79: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + ALIGN_4 + +.L80: + testl $1, N + je .L999 + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, B +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L100 + ALIGN_4 + +.L91: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movsd -32 * SIZE(BB), %xmm1 + + pxor %xmm4, %xmm4 +#ifdef LN + prefetcht0 -4 * SIZE(CO1) +#else + prefetcht0 3 * SIZE(CO1) +#endif + pxor %xmm5, %xmm5 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L95 + ALIGN_4 + +.L92: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm5 + pshufd $0x55, %xmm1, %xmm2 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -24 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm5 + pshufd $0x55, %xmm1, %xmm2 + movsd -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -16 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm5 + pshufd $0x55, %xmm1, %xmm2 + movsd -26 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm5 + pshufd $0x55, %xmm1, %xmm2 + movsd -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + + subl $-32 * SIZE, AA + subl $ -8 * SIZE, BB + + subl $1, %eax + jne .L92 + ALIGN_4 + +.L95: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L98 + ALIGN_4 + +.L96: + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + movss -31 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L96 + ALIGN_4 + +.L98: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 1), BB +#endif + + addps %xmm2, %xmm4 + addps %xmm5, %xmm4 + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm0 + unpcklps %xmm6, %xmm4 + unpckhps %xmm6, %xmm0 + + movaps %xmm5, %xmm1 + unpcklps %xmm7, %xmm5 + unpckhps %xmm7, %xmm1 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movss -32 * 
SIZE(BB), %xmm1 + movss -31 * SIZE(BB), %xmm3 + movss -30 * SIZE(BB), %xmm5 + movss -29 * SIZE(BB), %xmm7 + + subss %xmm4, %xmm1 + subss %xmm6, %xmm3 + subss %xmm0, %xmm5 + subss %xmm2, %xmm7 +#else + movaps -32 * SIZE(AA), %xmm0 + + subps %xmm4, %xmm0 +#endif + +#ifdef LN + movaps -20 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm6, %xmm7 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm7, %xmm6 + subss %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm7, %xmm6 + subss %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm7, %xmm6 + subss %xmm6, %xmm1 + + movaps -24 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm5, %xmm6 + subss %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm5, %xmm6 + subss %xmm6, %xmm1 + + movaps -28 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm3, %xmm6 + subss %xmm6, %xmm1 + + movaps -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm6, %xmm1 +#endif + +#ifdef LT + movaps -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm6, %xmm1 + + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm1, %xmm6 + subss %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm1, %xmm6 + subss %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm1, %xmm6 + subss %xmm6, %xmm7 + + movaps -28 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm3, %xmm6 + subss %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm3, %xmm6 + subss %xmm6, %xmm7 + + movaps -24 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm5, %xmm6 + subss %xmm6, %xmm7 + + movaps -20 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm6, %xmm7 +#endif + +#if defined(RN) || defined(RT) + movss -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, -32 * SIZE(BB) + movss %xmm3, -31 * SIZE(BB) + movss %xmm5, -30 * SIZE(BB) + movss %xmm7, -29 * SIZE(BB) +#else + movaps %xmm0, -32 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm5, %xmm1 + unpcklps %xmm7, %xmm3 + + unpcklps %xmm3, %xmm1 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 1), BB +#endif + +#ifdef LN + subl $4, KK +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L91 + ALIGN_4 + +.L100: + testl $2, M + je .L110 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm3, %xmm3 + movsd -32 * SIZE(BB), %xmm1 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L105 + ALIGN_4 + +.L102: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movsd -30 * 
SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + + pshufd $0x55, %xmm1, %xmm2 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movsd -28 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movsd -26 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + + pshufd $0x55, %xmm1, %xmm2 + movsd -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movsd -24 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movsd -22 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + + pshufd $0x55, %xmm1, %xmm2 + movsd -26 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movsd -20 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movsd -18 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + + pshufd $0x55, %xmm1, %xmm2 + movsd -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movsd -16 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + + subl $-16 * SIZE, AA + subl $ -8 * SIZE, BB + + subl $1, %eax + jne .L102 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L108 + ALIGN_4 + +.L106: + pshufd $0x00, %xmm1, %xmm2 + movss -31 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + + addl $2 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L106 + ALIGN_4 + +.L108: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), BB +#endif + + addps %xmm5, %xmm4 + +#if defined(LN) || defined(LT) + pshufd $1, %xmm4, %xmm6 + + movss -32 * SIZE(BB), %xmm1 + movss -31 * SIZE(BB), %xmm3 + + subss %xmm4, %xmm1 + subss %xmm6, %xmm3 +#else + movsd -32 * SIZE(AA), %xmm0 + + subps %xmm4, %xmm0 +#endif + +#ifdef LN + movsd -32 * SIZE(AA), %xmm4 + movhps -30 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm3, %xmm6 + subss %xmm6, %xmm1 + + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm6, %xmm1 +#endif + +#ifdef LT + movaps -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm6, %xmm1 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm1, %xmm6 + subss %xmm6, %xmm3 + + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm6, %xmm3 +#endif + +#if defined(RN) || defined(RT) + movss -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, -32 * SIZE(BB) + movss %xmm3, -31 * SIZE(BB) +#else + movlps %xmm0, -32 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(CO1) + movss %xmm3, 1 * SIZE(CO1) +#else + movlps %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 1), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L110: + testl $1, M + je .L119 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movsd -32 * SIZE(BB), %xmm1 + +#if 
defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L115 + ALIGN_4 + +.L112: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + mulps %xmm0, %xmm1 + movsd -30 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movsd -30 * SIZE(BB), %xmm1 + + mulps %xmm0, %xmm1 + movsd -28 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movsd -28 * SIZE(BB), %xmm1 + + mulps %xmm0, %xmm1 + movsd -26 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movsd -26 * SIZE(BB), %xmm1 + + mulps %xmm0, %xmm1 + movsd -24 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movsd -24 * SIZE(BB), %xmm1 + + subl $-8 * SIZE, AA + subl $-8 * SIZE, BB + + subl $1, %eax + jne .L112 + ALIGN_4 + +.L115: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulss %xmm0, %xmm1 + movss -31 * SIZE(AA), %xmm0 + addss %xmm1, %xmm4 + movss -31 * SIZE(BB), %xmm1 + + addl $1 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L116 + ALIGN_4 + +.L118: +#if defined(LN) || defined(RT) + movl KK, %eax + subl $1, %eax + + movl AORIG, AA + + leal (AA, %eax, SIZE), AA + leal (B, %eax, SIZE), BB +#endif + + haddps %xmm4, %xmm4 + +#if defined(LN) || defined(LT) + movss -32 * SIZE(BB), %xmm1 + subss %xmm4, %xmm1 +#else + movss -32 * SIZE(AA), %xmm0 + subss %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + mulss -32 * SIZE(AA), %xmm1 +#endif + +#if defined(RN) || defined(RT) + mulss -32 * SIZE(BB), %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, -32 * SIZE(BB) +#else + movss %xmm0, -32 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(CO1) +#else + movss %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (AA, %eax, SIZE), AA + leal (BB, %eax, SIZE), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L119: +#ifdef LN + movl K, %eax + leal (B, %eax, SIZE), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_LT_4x4_sse.S b/kernel/x86/trsm_kernel_LT_4x4_sse.S new file mode 100644 index 0000000000..8d61898657 --- /dev/null +++ b/kernel/x86/trsm_kernel_LT_4x4_sse.S @@ -0,0 +1,3690 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 + +#define OLD_M 4 + STACK(%esi) +#define OLD_N 8 + STACK(%esi) +#define OLD_K 12 + STACK(%esi) +#define OLD_A 20 + STACK(%esi) +#define OLD_B 24 + STACK(%esi) +#define OLD_C 28 + STACK(%esi) +#define OLD_LDC 32 + STACK(%esi) +#define STACK_OFFT 36 + STACK(%esi) + +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define AORIG 56(%esp) +#define BORIG 60(%esp) +#define BUFFER 128(%esp) + +#if defined(OPTERON) || defined(BARCELONA) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 10 + 8) +#endif + +#if defined(PENTIUM4) || defined(PENTIUMM) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE 96 +#endif + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE 96 +#endif + +#define B %edi +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define CO1 %esi + +#if defined(OPTERON) || !defined(HAVE_SSE2) +#define movsd movlps +#endif + +#ifdef HAVE_SSE2 +#define xorps pxor +#endif + +#define KERNEL1(address) \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + addps %xmm2, %xmm5; \ + movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm4; \ + 
movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL4(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL5(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL6(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL7(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1; + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl %esp, %esi + + subl $128 + LOCAL_BUFFER_SIZE, %esp + andl $-1024, %esp + + STACK_TOUCHING + + movl OLD_M, %ebx + movl OLD_N, %eax + movl OLD_K, %ecx + movl OLD_A, %edx + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK + movss STACK_OFFT, %xmm4 + + movl OLD_B, B + movl OLD_C, %ebx + + movl %ebx, C + movl OLD_LDC, LDC + + movss %xmm4, OFFSET + movss %xmm4, KK + + leal (, LDC, SIZE), LDC + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + 
negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + sarl $2, %eax + movl %eax, J + jle .L40 + +.L01: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, %ecx + +#ifdef RT + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $2 + BASE_SHIFT, %eax + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $1, %eax + jle .L05 + ALIGN_4 + +.L02: + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + movaps %xmm4, 16 * SIZE(BB) + movaps %xmm5, 20 * SIZE(BB) + movaps %xmm6, 24 * SIZE(BB) + movaps %xmm7, 28 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $32 * SIZE, %ecx + decl %eax + jne .L02 + ALIGN_2 + +.L05: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $1, %eax + BRANCH + jle .L10 + + movaps 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + + addl $4 * SIZE, B + ALIGN_4 + +.L10: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + + leal (, LDC, 4), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 + movaps 16 * SIZE(AA), %xmm1 + xorps %xmm5, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm7, %xmm7 + + leal (LDC, LDC, 2), %eax + + PREFETCHW 3 * SIZE(CO1) + PREFETCHW 3 * SIZE(CO1, LDC) + PREFETCHW 3 * SIZE(CO1, LDC, 2) + PREFETCHW 3 * SIZE(CO1, %eax) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + KERNEL1(0 * 16) + KERNEL2(0 * 16) + KERNEL3(0 * 16) + KERNEL4(0 * 16) + KERNEL5(0 * 16) + KERNEL6(0 * 16) + KERNEL7(0 * 16) + KERNEL8(0 * 16) + + addl $128 * SIZE, BB + addl $32 * SIZE, AA + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_4 + +.L16: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 4 * SIZE(AA), %xmm0 + + 
addl $ 4 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $2 + BASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm0 + unpcklps %xmm6, %xmm4 + unpckhps %xmm6, %xmm0 + + movaps %xmm5, %xmm1 + unpcklps %xmm7, %xmm5 + unpckhps %xmm7, %xmm1 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movaps 0 * SIZE(B), %xmm1 + movaps 4 * SIZE(B), %xmm3 + movaps 8 * SIZE(B), %xmm5 + movaps 12 * SIZE(B), %xmm7 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm3 + subps %xmm0, %xmm5 + subps %xmm2, %xmm7 +#else + movaps 0 * SIZE(AA), %xmm0 + movaps 4 * SIZE(AA), %xmm1 + movaps 8 * SIZE(AA), %xmm2 + movaps 12 * SIZE(AA), %xmm3 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm1 + subps %xmm6, %xmm2 + subps %xmm7, %xmm3 +#endif + +#ifdef LN + movaps 12 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm7 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm1 + + movaps 8 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm1 + + movaps 4 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm1 + + movaps 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 + + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm7 + + movaps 4 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm7 + + movaps 8 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm7 + + movaps 12 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm7 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm3 + + movaps 4 * SIZE(B), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm3 + + movaps 8 * SIZE(B), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm3 + + movaps 12 * SIZE(B), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm3 +#endif + +#ifdef RT + movaps 12 * SIZE(B), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm3 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps 
%xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm0 + + movaps 8 * SIZE(B), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm0 + + movaps 4 * SIZE(B), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm0 + + movaps 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, 0 * SIZE(B) + movaps %xmm3, 4 * SIZE(B) + movaps %xmm5, 8 * SIZE(B) + movaps %xmm7, 12 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm0 + pshufd $0x55, %xmm1, %xmm2 + pshufd $0xaa, %xmm1, %xmm4 + pshufd $0xff, %xmm1, %xmm6 + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm2, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm6, 12 * SIZE(BB) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm2 + pshufd $0xaa, %xmm3, %xmm4 + pshufd $0xff, %xmm3, %xmm6 + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm2, 20 * SIZE(BB) + movaps %xmm4, 24 * SIZE(BB) + movaps %xmm6, 28 * SIZE(BB) + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm2 + pshufd $0xaa, %xmm5, %xmm4 + pshufd $0xff, %xmm5, %xmm6 + movaps %xmm0, 32 * SIZE(BB) + movaps %xmm2, 36 * SIZE(BB) + movaps %xmm4, 40 * SIZE(BB) + movaps %xmm6, 44 * SIZE(BB) + + pshufd $0x00, %xmm7, %xmm0 + pshufd $0x55, %xmm7, %xmm2 + pshufd $0xaa, %xmm7, %xmm4 + pshufd $0xff, %xmm7, %xmm6 + movaps %xmm0, 48 * SIZE(BB) + movaps %xmm2, 52 * SIZE(BB) + movaps %xmm4, 56 * SIZE(BB) + movaps %xmm6, 60 * SIZE(BB) +#else + movaps %xmm0, 0 * SIZE(AA) + movaps %xmm1, 4 * SIZE(AA) + movaps %xmm2, 8 * SIZE(AA) + movaps %xmm3, 12 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm0 + + movaps %xmm3, %xmm4 + unpcklps %xmm7, %xmm3 + unpckhps %xmm7, %xmm4 + + movaps %xmm1, %xmm2 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm2 + + movaps %xmm0, %xmm6 + unpcklps %xmm4, %xmm0 + unpckhps %xmm4, %xmm6 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 1) + movhps %xmm2, 2 * SIZE(CO1, LDC, 1) + movlps %xmm0, 0 * SIZE(CO1, LDC, 2) + movhps %xmm0, 2 * SIZE(CO1, LDC, 2) + movlps %xmm6, 0 * SIZE(CO1, %eax, 1) + movhps %xmm6, 2 * SIZE(CO1, %eax, 1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 2 * SIZE(CO1, LDC, 1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 2) + movhps %xmm2, 2 * SIZE(CO1, LDC, 2) + movlps %xmm3, 0 * SIZE(CO1, %eax, 1) + movhps %xmm3, 2 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA +#ifdef LT + addl $16 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L11 + ALIGN_4 + +.L20: + testl $2, M + je .L30 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal BUFFER, BB + +#if defined(LN) 
|| defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 8 * SIZE(AA), %xmm1 + xorps %xmm5, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 44 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 64 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 60 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 80 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 68 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm5 + movaps 72 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movaps 76 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 96 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 84 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movaps 88 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 92 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 112 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 100 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm5 + movaps 104 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movaps 108 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 128 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 116 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movaps 120 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 124 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 144 * SIZE(BB), %xmm3 + + addl $ 16 * SIZE, AA + addl $128 * SIZE, BB + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * 
SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 16 * SIZE(BB), %xmm2 + + addl $ 2 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $1 + BASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm6, %xmm4 + unpcklps %xmm7, %xmm5 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps 0 * SIZE(B), %xmm1 + movaps 4 * SIZE(B), %xmm3 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm3 +#else +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 2 * SIZE(AA), %xmm1 +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd 4 * SIZE(AA), %xmm2 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd 6 * SIZE(AA), %xmm3 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm1 + subps %xmm6, %xmm2 + subps %xmm7, %xmm3 +#endif + +#ifdef LN + movaps 0 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm1 + + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 + + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm3 + + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm3 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm3 + + movaps 4 * SIZE(B), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm3 + + movaps 8 * SIZE(B), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm3 + + movaps 12 * SIZE(B), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm3 +#endif + +#ifdef RT + movaps 12 * SIZE(B), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm3 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm0 + + movaps 8 * SIZE(B), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm0 + + movaps 4 * SIZE(B), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm0 + + movaps 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, 0 * SIZE(B) + movaps %xmm3, 4 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm0 + pshufd $0x55, %xmm1, %xmm2 + pshufd $0xaa, %xmm1, %xmm4 + pshufd $0xff, %xmm1, %xmm6 + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm2, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm6, 12 * SIZE(BB) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm2 + 
pshufd $0xaa, %xmm3, %xmm4 + pshufd $0xff, %xmm3, %xmm6 + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm2, 20 * SIZE(BB) + movaps %xmm4, 24 * SIZE(BB) + movaps %xmm6, 28 * SIZE(BB) +#else + movlps %xmm0, 0 * SIZE(AA) + movlps %xmm1, 2 * SIZE(AA) + movlps %xmm2, 4 * SIZE(AA) + movlps %xmm3, 6 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm0 + + movaps %xmm3, %xmm4 + unpcklps %xmm7, %xmm3 + unpckhps %xmm7, %xmm4 + + movaps %xmm1, %xmm2 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm2 + + movaps %xmm0, %xmm6 + unpcklps %xmm4, %xmm0 + unpckhps %xmm4, %xmm6 + + movlps %xmm1, 0 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 1) + movlps %xmm0, 0 * SIZE(CO1, LDC, 2) + movlps %xmm6, 0 * SIZE(CO1, %eax, 1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC, 1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 2) + movlps %xmm3, 0 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $8 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L30: + testl $1, M + je .L39 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movss 0 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 + movss 4 * SIZE(AA), %xmm1 + xorps %xmm5, %xmm5 + movss 0 * SIZE(BB), %xmm2 + xorps %xmm6, %xmm6 + movss 16 * SIZE(BB), %xmm3 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L35 + ALIGN_4 + +.L32: + mulss %xmm0, %xmm2 + addss %xmm2, %xmm4 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movss 4 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + addss %xmm2, %xmm5 + movss 8 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 32 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 1 * SIZE(AA), %xmm0 + + mulss %xmm0, %xmm3 + addss %xmm3, %xmm4 + movss 20 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + addss %xmm3, %xmm5 + movss 24 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + mulss 28 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 48 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 2 * SIZE(AA), %xmm0 + + mulss %xmm0, %xmm2 + addss %xmm2, %xmm4 + movss 36 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + addss %xmm2, %xmm5 + movss 40 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + mulss 44 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 64 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 3 * SIZE(AA), %xmm0 + + mulss %xmm0, %xmm3 + addss %xmm3, %xmm4 + movss 52 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + addss %xmm3, %xmm5 + movss 56 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + mulss 60 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 80 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + + mulss %xmm1, %xmm2 + addss %xmm2, %xmm4 + movss 68 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + addss %xmm2, %xmm5 + movss 72 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + mulss 76 * SIZE(BB), %xmm1 
+ addss %xmm2, %xmm6 + movss 96 * SIZE(BB), %xmm2 + addss %xmm1, %xmm7 + movss 5 * SIZE(AA), %xmm1 + + mulss %xmm1, %xmm3 + addss %xmm3, %xmm4 + movss 84 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + addss %xmm3, %xmm5 + movss 88 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + mulss 92 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 112 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 6 * SIZE(AA), %xmm1 + + mulss %xmm1, %xmm2 + addss %xmm2, %xmm4 + movss 100 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + addss %xmm2, %xmm5 + movss 104 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + mulss 108 * SIZE(BB), %xmm1 + addss %xmm2, %xmm6 + movss 128 * SIZE(BB), %xmm2 + addss %xmm1, %xmm7 + movss 7 * SIZE(AA), %xmm1 + + mulss %xmm1, %xmm3 + addss %xmm3, %xmm4 + movss 116 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + addss %xmm3, %xmm5 + movss 120 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + mulss 124 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 144 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $128 * SIZE, BB + decl %eax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulss %xmm0, %xmm2 + addss %xmm2, %xmm4 + movss 4 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + addss %xmm2, %xmm5 + movss 8 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 16 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 1 * SIZE(AA), %xmm0 + + addl $ 1 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (AA, %eax, SIZE), AA + + sall $2 + BASE_SHIFT, %eax + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm6, %xmm4 + unpcklps %xmm7, %xmm5 + unpcklps %xmm5, %xmm4 + + movaps 0 * SIZE(B), %xmm1 + + subps %xmm4, %xmm1 +#else + movss 0 * SIZE(AA), %xmm0 + movss 1 * SIZE(AA), %xmm1 + movss 2 * SIZE(AA), %xmm2 + movss 3 * SIZE(AA), %xmm3 + + subss %xmm4, %xmm0 + subss %xmm5, %xmm1 + subss %xmm6, %xmm2 + subss %xmm7, %xmm3 +#endif + +#if defined(LN) || defined(LT) + movss 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm0, %xmm7 + subss %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm0, %xmm7 + subss %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm0, %xmm7 + subss %xmm7, %xmm3 + + movaps 4 * SIZE(B), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm1, %xmm7 + subss %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm1, %xmm7 + subss %xmm7, %xmm3 + + movaps 8 * SIZE(B), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm2, %xmm7 + subss %xmm7, %xmm3 + + movaps 12 * SIZE(B), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm7, %xmm3 +#endif + +#ifdef RT + movaps 12 * SIZE(B), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm7, %xmm3 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm3, %xmm7 + subss %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm3, %xmm7 + subss %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm3, %xmm7 + subss %xmm7, %xmm0 + + movaps 8 * SIZE(B), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm7, %xmm2 + pshufd $0x55, 
%xmm6, %xmm7 + mulss %xmm2, %xmm7 + subss %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm2, %xmm7 + subss %xmm7, %xmm0 + + movaps 4 * SIZE(B), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm1, %xmm7 + subss %xmm7, %xmm0 + + movaps 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, 0 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm0 + pshufd $0x55, %xmm1, %xmm2 + pshufd $0xaa, %xmm1, %xmm4 + pshufd $0xff, %xmm1, %xmm6 + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm2, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm6, 12 * SIZE(BB) +#else + movss %xmm0, 0 * SIZE(AA) + movss %xmm1, 1 * SIZE(AA) + movss %xmm2, 2 * SIZE(AA) + movss %xmm3, 3 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm0 + + movaps %xmm3, %xmm4 + unpcklps %xmm7, %xmm3 + unpckhps %xmm7, %xmm4 + + movaps %xmm1, %xmm2 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm2 + + movaps %xmm0, %xmm6 + unpcklps %xmm4, %xmm0 + unpckhps %xmm4, %xmm6 + + movss %xmm1, 0 * SIZE(CO1) + movss %xmm2, 0 * SIZE(CO1, LDC, 1) + movss %xmm0, 0 * SIZE(CO1, LDC, 2) + movss %xmm6, 0 * SIZE(CO1, %eax, 1) +#else + movss %xmm0, 0 * SIZE(CO1) + movss %xmm1, 0 * SIZE(CO1, LDC, 1) + movss %xmm2, 0 * SIZE(CO1, LDC, 2) + movss %xmm3, 0 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (AA, %eax, SIZE), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L39: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 4), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (B, %eax, 4), B +#endif + +#ifdef RN + addl $4, KK +#endif + +#ifdef RT + subl $4, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_4 + +.L40: + testl $2, N + je .L80 + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, %ecx + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $1 + BASE_SHIFT, %eax + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + jle .L45 + ALIGN_4 + +.L42: + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + movaps %xmm4, 16 * SIZE(BB) + movaps %xmm5, 20 * SIZE(BB) + movaps %xmm6, 24 * SIZE(BB) + movaps %xmm7, 28 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $32 * SIZE, %ecx + decl %eax + jne .L42 + ALIGN_4 + +.L45: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax + BRANCH + jle .L50 + ALIGN_4 + +.L46: +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + 
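+/* Descriptive note at this step (comment added for readability): when movsd has been #defined to movlps, the 64-bit load below leaves the upper half of the register unchanged, so the register is cleared with xorps beforehand to keep its upper half zero. */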
movsd 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + + addl $2 * SIZE, B + addl $8 * SIZE, %ecx + decl %eax + jne .L46 + ALIGN_4 + +.L50: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movaps 0 * SIZE(AA), %xmm0 + movaps 16 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + + PREFETCHW 3 * SIZE(CO1) + PREFETCHW 3 * SIZE(CO1, LDC) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + mulps %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 8 * SIZE(AA), %xmm0 + + mulps %xmm0, %xmm3 + mulps 20 * SIZE(BB), %xmm0 + addps %xmm3, %xmm4 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm0, %xmm5 + movaps 12 * SIZE(AA), %xmm0 + + mulps %xmm0, %xmm3 + mulps 28 * SIZE(BB), %xmm0 + addps %xmm3, %xmm4 + movaps 48 * SIZE(BB), %xmm3 + addps %xmm0, %xmm5 + movaps 32 * SIZE(AA), %xmm0 + +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) +#endif + mulps %xmm1, %xmm2 + mulps 36 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 40 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 20 * SIZE(AA), %xmm1 + + mulps %xmm1, %xmm2 + mulps 44 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 24 * SIZE(AA), %xmm1 + + mulps %xmm1, %xmm3 + mulps 52 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 56 * SIZE(BB), %xmm3 + addps %xmm1, %xmm5 + movaps 28 * SIZE(AA), %xmm1 + + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 80 * SIZE(BB), %xmm3 + addps %xmm1, %xmm5 + movaps 48 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $1 + BASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm0 + unpcklps %xmm6, %xmm4 + unpckhps %xmm6, %xmm0 + + movaps 
%xmm5, %xmm1 + unpcklps %xmm7, %xmm5 + unpckhps %xmm7, %xmm1 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(B), %xmm1 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd 2 * SIZE(B), %xmm3 +#ifdef movsd + xorps %xmm5, %xmm5 +#endif + movsd 4 * SIZE(B), %xmm5 +#ifdef movsd + xorps %xmm7, %xmm7 +#endif + movsd 6 * SIZE(B), %xmm7 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm3 + subps %xmm0, %xmm5 + subps %xmm2, %xmm7 +#else + movaps 0 * SIZE(AA), %xmm0 + movaps 4 * SIZE(AA), %xmm1 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm1 +#endif + +#ifdef LN + movaps 12 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm7 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm1 + + movaps 8 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm1 + + movaps 4 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm1 + + movaps 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 + + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm7 + + movaps 4 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm7 + + movaps 8 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm7 + + movaps 12 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm7 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm1 + + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm1 +#endif + +#ifdef RT + movaps 0 * SIZE(B), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm0 + + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, 0 * SIZE(B) + movlps %xmm3, 2 * SIZE(B) + movlps %xmm5, 4 * SIZE(B) + movlps %xmm7, 6 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm0 + pshufd $0x55, %xmm1, %xmm2 + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm2, 4 * SIZE(BB) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm2 + movaps %xmm0, 8 * SIZE(BB) + movaps %xmm2, 12 * SIZE(BB) + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm2 + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm2, 20 * SIZE(BB) + + pshufd $0x00, %xmm7, %xmm0 + pshufd $0x55, %xmm7, %xmm2 + movaps %xmm0, 24 * SIZE(BB) + movaps %xmm2, 28 * SIZE(BB) +#else + movaps %xmm0, 0 * SIZE(AA) + movaps %xmm1, 4 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm5, %xmm1 + unpcklps %xmm7, %xmm3 + + movaps %xmm1, %xmm2 + unpcklps %xmm3, %xmm1 + 
unpckhps %xmm3, %xmm2 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 1) + movhps %xmm2, 2 * SIZE(CO1, LDC, 1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 2 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA +#ifdef LT + addl $8 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L51 + ALIGN_4 + +.L60: + testl $2, M + je .L70 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 8 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L65 + ALIGN_4 + +.L62: +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movaps 44 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 64 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 80 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L62 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L66 + ALIGN_4 + +.L68: + addps 
%xmm6, %xmm4 + addps %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $BASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm6, %xmm4 + unpcklps %xmm7, %xmm5 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(B), %xmm1 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd 2 * SIZE(B), %xmm3 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm3 +#else +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 2 * SIZE(AA), %xmm1 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm1 +#endif + +#ifdef LN + movaps 0 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm1 + + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm3 + + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm3 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm1 + + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm1 +#endif + +#ifdef RT + movaps 0 * SIZE(B), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm0 + + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, 0 * SIZE(B) + movlps %xmm3, 2 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm0 + pshufd $0x55, %xmm1, %xmm2 + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm2, 4 * SIZE(BB) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm2 + movaps %xmm0, 8 * SIZE(BB) + movaps %xmm2, 12 * SIZE(BB) +#else + movlps %xmm0, 0 * SIZE(AA) + movlps %xmm1, 2 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm3, %xmm1 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 0 * SIZE(CO1, LDC) +#else + movlps %xmm0, 0 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L70: + testl $1, M + je .L79 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movss 0 * SIZE(AA), %xmm0 + movss 4 * SIZE(AA), %xmm1 + movss 0 * SIZE(BB), %xmm2 + movss 16 * SIZE(BB), %xmm3 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + mulss %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 
(PREFETCHSIZE + 0) * SIZE(AA) +#endif + mulss 4 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 8 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 1 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm2 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 32 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 2 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 20 * SIZE(BB), %xmm0 + addss %xmm3, %xmm4 + movss 24 * SIZE(BB), %xmm3 + addss %xmm0, %xmm5 + movss 3 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 28 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 48 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm2 + mulss 36 * SIZE(BB), %xmm1 + addss %xmm2, %xmm4 + movss 40 * SIZE(BB), %xmm2 + addss %xmm1, %xmm5 + movss 5 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm2 + mulss 44 * SIZE(BB), %xmm1 + addss %xmm2, %xmm6 + movss 64 * SIZE(BB), %xmm2 + addss %xmm1, %xmm7 + movss 6 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 52 * SIZE(BB), %xmm1 + addss %xmm3, %xmm4 + movss 56 * SIZE(BB), %xmm3 + addss %xmm1, %xmm5 + movss 7 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 60 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 80 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulss %xmm0, %xmm2 + mulss 4 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 8 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 1 * SIZE(AA), %xmm0 + + addl $ 1 * SIZE, AA + addl $ 8 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: + addss %xmm6, %xmm4 + addss %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $BASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm5, %xmm4 + +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(B), %xmm1 + + subps %xmm4, %xmm1 +#else + movss 0 * SIZE(AA), %xmm0 + movss 1 * SIZE(AA), %xmm1 + + subss %xmm4, %xmm0 + subss %xmm5, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movss 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm0, %xmm7 + subss %xmm7, %xmm1 + + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm7, %xmm1 +#endif + +#ifdef RT + movaps 0 * SIZE(B), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm1, %xmm7 + subss %xmm7, %xmm0 + + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, 0 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm0 + pshufd $0x55, %xmm1, %xmm2 + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm2, 4 * SIZE(BB) +#else + movss %xmm0, 0 * SIZE(AA) + movss %xmm1, 1 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + pshufd $1, %xmm1, %xmm3 + + movss %xmm1, 0 * SIZE(CO1) + movss %xmm3, 0 * SIZE(CO1, LDC) +#else + movss %xmm0, 0 * SIZE(CO1) + movss %xmm1, 0 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (AA, %eax, SIZE), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl 
BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L79: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + ALIGN_4 + +.L80: + testl $1, N + je .L999 + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, %ecx + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $BASE_SHIFT, %eax + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + jle .L85 + ALIGN_4 + +.L82: + movsd 0 * SIZE(B), %xmm3 + movhps 2 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm7 + movhps 6 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + movaps %xmm4, 16 * SIZE(BB) + movaps %xmm5, 20 * SIZE(BB) + movaps %xmm6, 24 * SIZE(BB) + movaps %xmm7, 28 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $32 * SIZE, BB + decl %eax + jne .L82 + ALIGN_4 + +.L85: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + BRANCH + jle .L90 + ALIGN_4 + +.L86: + movss 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + + movaps %xmm0, 0 * SIZE(BB) + + addl $1 * SIZE, B + addl $4 * SIZE, BB + decl %eax + jne .L86 + ALIGN_4 + +.L90: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L100 + ALIGN_4 + +.L91: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movaps 0 * SIZE(AA), %xmm0 + movaps 16 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + + PREFETCHW 3 * SIZE(CO1) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L95 + ALIGN_4 + +.L92: + mulps %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movaps 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movaps 32 * SIZE(BB), %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm0, %xmm5 + movaps 8 * SIZE(AA), %xmm0 + mulps 8 * SIZE(BB), %xmm0 + addps %xmm0, %xmm6 + movaps 12 * SIZE(AA), %xmm0 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) +#endif + mulps %xmm1, %xmm3 + movaps 20 * SIZE(AA), %xmm1 + 
addps %xmm3, %xmm4 + movaps 48 * SIZE(BB), %xmm3 + mulps 20 * SIZE(BB), %xmm1 + addps %xmm1, %xmm5 + movaps 24 * SIZE(AA), %xmm1 + mulps 24 * SIZE(BB), %xmm1 + addps %xmm1, %xmm6 + movaps 28 * SIZE(AA), %xmm1 + mulps 28 * SIZE(BB), %xmm1 + addps %xmm1, %xmm7 + movaps 48 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L92 + ALIGN_4 + +.L95: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L98 + ALIGN_4 + +.L96: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(AA), %xmm0 + movaps 4 * SIZE(BB), %xmm2 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L96 + ALIGN_4 + +.L98: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + addps %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ BASE_SHIFT, %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm0 + unpcklps %xmm6, %xmm4 + unpckhps %xmm6, %xmm0 + + movaps %xmm5, %xmm1 + unpcklps %xmm7, %xmm5 + unpckhps %xmm7, %xmm1 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movss 0 * SIZE(B), %xmm1 + movss 1 * SIZE(B), %xmm3 + movss 2 * SIZE(B), %xmm5 + movss 3 * SIZE(B), %xmm7 + + subss %xmm4, %xmm1 + subss %xmm6, %xmm3 + subss %xmm0, %xmm5 + subss %xmm2, %xmm7 +#else + movaps 0 * SIZE(AA), %xmm0 + + subps %xmm4, %xmm0 +#endif + +#ifdef LN + movaps 12 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm6, %xmm7 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm7, %xmm6 + subss %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm7, %xmm6 + subss %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm7, %xmm6 + subss %xmm6, %xmm1 + + movaps 8 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm5, %xmm6 + subss %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm5, %xmm6 + subss %xmm6, %xmm1 + + movaps 4 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm3, %xmm6 + subss %xmm6, %xmm1 + + movaps 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm6, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm6, %xmm1 + + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm1, %xmm6 + subss %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm1, %xmm6 + subss %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm1, %xmm6 + subss %xmm6, %xmm7 + + movaps 4 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm3, %xmm6 + subss %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm3, %xmm6 + subss %xmm6, %xmm7 + + movaps 8 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm5, %xmm6 + subss %xmm6, %xmm7 + + movaps 12 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm6, %xmm7 +#endif + +#if defined(RN) || defined(RT) + movss 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(B) + movss %xmm3, 1 * SIZE(B) + movss %xmm5, 2 * SIZE(B) + movss %xmm7, 3 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm0 + movaps %xmm0, 0 * SIZE(BB) + pshufd $0x00, %xmm3, %xmm0 + movaps %xmm0, 4 * 
SIZE(BB) + + pshufd $0x00, %xmm5, %xmm0 + movaps %xmm0, 8 * SIZE(BB) + pshufd $0x00, %xmm7, %xmm0 + movaps %xmm0, 12 * SIZE(BB) +#else + movss %xmm0, 0 * SIZE(AA) + movss %xmm1, 1 * SIZE(AA) + movss %xmm2, 2 * SIZE(AA) + movss %xmm3, 3 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm5, %xmm1 + unpcklps %xmm7, %xmm3 + + unpcklps %xmm3, %xmm1 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L91 + ALIGN_4 + +.L100: + testl $2, M + je .L110 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $1 + BASE_SHIFT, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 8 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L105 + ALIGN_4 + +.L102: + mulps %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L102 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L108 + ALIGN_4 + +.L106: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + movaps 4 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L106 + ALIGN_4 + +.L108: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + addps %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ BASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + pshufd $1, %xmm4, %xmm6 + + movss 
0 * SIZE(B), %xmm1 + movss 1 * SIZE(B), %xmm3 + + subss %xmm4, %xmm1 + subss %xmm6, %xmm3 +#else +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 + + subps %xmm4, %xmm0 +#endif + +#ifdef LN + movaps 0 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm3, %xmm6 + subss %xmm6, %xmm1 + + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm6, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm6, %xmm1 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm1, %xmm6 + subss %xmm6, %xmm3 + + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm6, %xmm3 +#endif + +#if defined(RN) || defined(RT) + movss 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(B) + movss %xmm3, 1 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm0 + movaps %xmm0, 0 * SIZE(BB) + pshufd $0x00, %xmm3, %xmm0 + movaps %xmm0, 4 * SIZE(BB) +#else + movlps %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(CO1) + movss %xmm3, 1 * SIZE(CO1) +#else + movlps %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L110: + testl $1, M + je .L119 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movss 0 * SIZE(AA), %xmm0 + movss 4 * SIZE(AA), %xmm1 + movss 0 * SIZE(BB), %xmm2 + movss 16 * SIZE(BB), %xmm3 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L115 + ALIGN_4 + +.L112: + mulss %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movss 1 * SIZE(AA), %xmm0 + addss %xmm2, %xmm4 + movss 32 * SIZE(BB), %xmm2 + mulss 4 * SIZE(BB), %xmm0 + addss %xmm0, %xmm5 + movss 2 * SIZE(AA), %xmm0 + mulss 8 * SIZE(BB), %xmm0 + addss %xmm0, %xmm6 + movss 3 * SIZE(AA), %xmm0 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm3 + movss 5 * SIZE(AA), %xmm1 + addss %xmm3, %xmm4 + movss 48 * SIZE(BB), %xmm3 + mulss 20 * SIZE(BB), %xmm1 + addss %xmm1, %xmm5 + movss 6 * SIZE(AA), %xmm1 + mulss 24 * SIZE(BB), %xmm1 + addss %xmm1, %xmm6 + movss 7 * SIZE(AA), %xmm1 + mulss 28 * SIZE(BB), %xmm1 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L112 + ALIGN_4 + +.L115: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulss %xmm0, %xmm2 + movss 1 * SIZE(AA), %xmm0 + addss %xmm2, %xmm4 + movss 4 * SIZE(BB), %xmm2 + + addl $ 1 * SIZE, AA + addl $ 4 * SIZE, BB + decl %eax + jg .L116 + ALIGN_4 + +.L118: + addss %xmm5, %xmm4 + addss %xmm7, %xmm6 + addss %xmm6, %xmm4 + 
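+/* Descriptive note at this step (comment added for readability): the partial sums have been folded into %xmm4; in the LN/RT cases AA, B and BB are then rebased from AORIG, BORIG and BUFFER before the solve and store of this last 1x1 tail block. */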
+#if defined(LN) || defined(RT) + movl KK, %eax + subl $1, %eax + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ BASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movss 0 * SIZE(B), %xmm1 + subss %xmm4, %xmm1 +#else + movss 0 * SIZE(AA), %xmm0 + subss %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + mulss 0 * SIZE(AA), %xmm1 +#endif + +#if defined(RN) || defined(RT) + mulss 0 * SIZE(B), %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm0 + movaps %xmm0, 0 * SIZE(BB) +#else + movss %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(CO1) +#else + movss %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (AA, %eax, SIZE), AA +#ifdef LT + addl $1 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L119: +#ifdef LN + movl K, %eax + leal (B, %eax, SIZE), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (B, %eax, SIZE), B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L999: + movl OLD_STACK, %esp + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_LT_8x2_sse.S b/kernel/x86/trsm_kernel_LT_8x2_sse.S new file mode 100644 index 0000000000..5d596980fe --- /dev/null +++ b/kernel/x86/trsm_kernel_LT_8x2_sse.S @@ -0,0 +1,3604 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if !defined(HAVE_SSE) || !defined(HAVE_MMX) +#error You have to check your configuration. +#endif + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_A 20 + STACK + ARGS(%esi) +#define STACK_B 24 + STACK + ARGS(%esi) +#define STACK_C 28 + STACK + ARGS(%esi) +#define STACK_LDC 32 + STACK + ARGS(%esi) +#define STACK_OFFT 36 + STACK + ARGS(%esi) + +#define TRMASK 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define AORIG 56(%esp) +#define BORIG 60(%esp) +#define BUFFER 128(%esp) + +#ifdef HAVE_3DNOW +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 10 + 8) +#else +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE 96 +#endif + +#define B %edi +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define CO1 %esi + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#if !defined(HAVE_SSE2) || defined(OPTERON) +#define movsd movlps +#endif + +#ifdef HAVE_SSE2 +#define xorps pxor +#endif + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE, %esp + andl $-STACK_ALIGN, %esp + + STACK_TOUCHING + + movss STACK_M, %xmm0 + movl STACK_N, %eax + movss STACK_K, %xmm1 + movss STACK_A, %xmm2 + movl STACK_B, B + movss STACK_C, %xmm3 + movl STACK_LDC, LDC + movss STACK_OFFT, %xmm4 + + movss %xmm1, K + movl %eax, N + movss %xmm0, M + movss %xmm2, A + movss %xmm3, C + movl %esi, OLD_STACK + movss %xmm4, OFFSET + movss %xmm4, KK + + leal (, LDC, SIZE), LDC + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LN) || defined(LT) + movl $0x3f800000, 0 + TRMASK # 1.0 + movl $0x00000000, 4 + TRMASK # 0.0 + movl $0x3f800000, 8 + TRMASK # 1.0 + movl $0x00000000, 12 + TRMASK # 0.0 +#endif + + movl N, %eax + sarl $1, %eax # j = (n >> 1) + movl %eax, J + jle .L100 + ALIGN_2 + +.L01: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, BB + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $1 + BASE_SHIFT, %eax + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + jle .L03 + ALIGN_4 + +.L02: + movsd 0 * SIZE(B), %xmm3 + movhps 2 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm7 + movhps 6 * SIZE(B), %xmm7 + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, 
%xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 +#else + movaps %xmm3, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm3, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm3, %xmm2 + shufps $0xaa, %xmm2, %xmm2 + shufps $0xff, %xmm3, %xmm3 + + movaps %xmm7, %xmm4 + shufps $0x00, %xmm4, %xmm4 + movaps %xmm7, %xmm5 + shufps $0x55, %xmm5, %xmm5 + movaps %xmm7, %xmm6 + shufps $0xaa, %xmm6, %xmm6 + shufps $0xff, %xmm7, %xmm7 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + movaps %xmm4, 16 * SIZE(BB) + movaps %xmm5, 20 * SIZE(BB) + movaps %xmm6, 24 * SIZE(BB) + movaps %xmm7, 28 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $32 * SIZE, BB + decl %eax + BRANCH + jne .L02 + ALIGN_2 + +.L03: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax + BRANCH + jle .L05 + ALIGN_2 + +.L04: + movsd 0 * SIZE(B), %xmm3 + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 +#else + movaps %xmm3, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm3, %xmm1 + shufps $0x55, %xmm1, %xmm1 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + + addl $2 * SIZE, B + addl $8 * SIZE, BB + + decl %eax + jne .L04 + ALIGN_4 + +.L05: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + + movl M, %ebx + sarl $3, %ebx + jle .L30 + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $3 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $3 + BASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 8 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + + PREFETCHW 7 * SIZE(CO1) + PREFETCHW 7 * SIZE(CO1, LDC) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L12 + ALIGN_2 + +.L11: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 0 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 16 * SIZE(AA), %xmm0 + + mulps %xmm1, %xmm3 + mulps 12 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 8 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 12 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 12 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 24 * SIZE(AA), %xmm1 + + mulps %xmm0, %xmm2 + mulps 20 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 16 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 20 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 20 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 + + mulps %xmm1, %xmm3 + mulps 28 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 24 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 28 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 28 * SIZE(BB), %xmm1 + + 
addps %xmm3, %xmm6 + movaps 40 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 40 * SIZE(AA), %xmm1 + + mulps %xmm0, %xmm2 + mulps 36 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 32 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 36 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 36 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 48 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 48 * SIZE(AA), %xmm0 + + mulps %xmm1, %xmm3 + mulps 44 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 40 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 44 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 44 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 56 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 56 * SIZE(AA), %xmm1 + + mulps %xmm0, %xmm2 + mulps 52 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 48 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 52 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 52 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 64 * SIZE(AA), %xmm0 + + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 56 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 60 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 72 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 72 * SIZE(AA), %xmm1 + + addl $64 * SIZE, BB + addl $64 * SIZE, AA + decl %eax + jne .L11 + ALIGN_2 + +.L12: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + +.L13: + movaps 4 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 0 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm1 + movaps 4 * SIZE(AA), %xmm0 + addps %xmm1, %xmm5 + movaps 4 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm1 + movaps 8 * SIZE(AA), %xmm0 + addps %xmm1, %xmm7 + + addl $8 * SIZE, AA + addl $8 * SIZE, BB + subl $1, %eax + jg .L13 + ALIGN_4 + +.L14: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $8, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $BASE_SHIFT, %eax + leal (AA, %eax, 8), AA + leal (B, %eax, 2), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm0 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm0 + + movaps %xmm6, %xmm1 + unpcklps %xmm7, %xmm6 + unpckhps %xmm7, %xmm1 + + movsd 0 * SIZE(B), %xmm2 + movhps 2 * SIZE(B), %xmm2 + movsd 4 * SIZE(B), %xmm3 + movhps 6 * SIZE(B), %xmm3 + movsd 8 * SIZE(B), %xmm5 + movhps 10 * SIZE(B), %xmm5 + movsd 12 * SIZE(B), %xmm7 + movhps 14 * SIZE(B), %xmm7 + + subps %xmm4, %xmm2 + subps %xmm0, %xmm3 + subps %xmm6, %xmm5 + subps %xmm1, %xmm7 +#else + movaps 0 * SIZE(AA), %xmm0 + movaps 4 * SIZE(AA), %xmm1 + movaps 8 * SIZE(AA), %xmm2 + movaps 12 * SIZE(AA), %xmm3 + + subps %xmm4, %xmm0 + subps %xmm6, %xmm1 + subps %xmm5, %xmm2 + subps %xmm7, %xmm3 +#endif + +#if defined(LN) || defined(LT) + movaps TRMASK, %xmm6 +#endif + +#ifdef LN + movss 63 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm7 + + movaps %xmm7, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 62 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movsd 60 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 58 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 56 * SIZE(AA), %xmm0 + 
shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 54 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm7 + + movaps %xmm7, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movsd 52 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 50 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 48 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + + movss 45 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm5 + + movaps %xmm5, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 44 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 42 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 40 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 36 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm5 + + movaps %xmm5, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movsd 34 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 32 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 27 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 26 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 24 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 18 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movsd 16 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 9 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 8 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 +#endif + +#ifdef LT + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 1 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movsd 2 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 4 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 6 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 9 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movsd 10 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 12 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 14 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 18 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 19 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 20 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, 
%xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 22 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 27 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movsd 28 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 30 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 36 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm5 + + movaps %xmm5, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 37 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 38 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 45 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm5 + + movaps %xmm5, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movsd 46 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 54 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm7 + + movaps %xmm7, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 55 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 63 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm7 +#endif + +#ifdef RN + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm0 + mulps %xmm6, %xmm1 + + movss 1 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + movaps %xmm6, %xmm5 + + mulps %xmm0, %xmm5 + mulps %xmm1, %xmm6 + + subps %xmm5, %xmm2 + subps %xmm6, %xmm3 + + movss 3 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm2 + mulps %xmm6, %xmm3 +#endif + +#ifdef RT + movss 3 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm2 + mulps %xmm6, %xmm3 + + movss 2 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + movaps %xmm6, %xmm5 + + mulps %xmm2, %xmm5 + mulps %xmm3, %xmm6 + + subps %xmm5, %xmm0 + subps %xmm6, %xmm1 + + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm0 + mulps %xmm6, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(B) + movhps %xmm2, 2 * SIZE(B) + movlps %xmm3, 4 * SIZE(B) + movhps %xmm3, 6 * SIZE(B) + movlps %xmm5, 8 * SIZE(B) + movhps %xmm5, 10 * SIZE(B) + movlps %xmm7, 12 * SIZE(B) + movhps %xmm7, 14 * SIZE(B) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm6 +#else + movaps %xmm2, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm2, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm2, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm2, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm6, 12 * SIZE(BB) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm4 + pshufd $0xff, %xmm3, %xmm6 +#else + movaps %xmm3, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm3, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm3, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm3, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm1, 20 * SIZE(BB) + movaps %xmm4, 24 * SIZE(BB) + movaps %xmm6, 28 * SIZE(BB) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + pshufd $0xaa, %xmm5, %xmm4 + 
pshufd $0xff, %xmm5, %xmm6 +#else + movaps %xmm5, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm5, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm5, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm5, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + movaps %xmm0, 32 * SIZE(BB) + movaps %xmm1, 36 * SIZE(BB) + movaps %xmm4, 40 * SIZE(BB) + movaps %xmm6, 44 * SIZE(BB) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm7, %xmm0 + pshufd $0x55, %xmm7, %xmm1 + pshufd $0xaa, %xmm7, %xmm4 + pshufd $0xff, %xmm7, %xmm6 +#else + movaps %xmm7, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm7, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm7, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm7, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + movaps %xmm0, 48 * SIZE(BB) + movaps %xmm1, 52 * SIZE(BB) + movaps %xmm4, 56 * SIZE(BB) + movaps %xmm6, 60 * SIZE(BB) +#else + movaps %xmm0, 0 * SIZE(AA) + movaps %xmm1, 4 * SIZE(AA) + movaps %xmm2, 8 * SIZE(AA) + movaps %xmm3, 12 * SIZE(AA) +#endif + +#ifdef LN + subl $8 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, %xmm0 + shufps $0x88, %xmm3, %xmm2 + shufps $0xdd, %xmm3, %xmm0 + + movaps %xmm5, %xmm4 + shufps $0x88, %xmm7, %xmm5 + shufps $0xdd, %xmm7, %xmm4 + + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 2 * SIZE(CO1) + movlps %xmm5, 4 * SIZE(CO1) + movhps %xmm5, 6 * SIZE(CO1) + movlps %xmm0, 0 * SIZE(CO1, LDC) + movhps %xmm0, 2 * SIZE(CO1, LDC) + movlps %xmm4, 4 * SIZE(CO1, LDC) + movhps %xmm4, 6 * SIZE(CO1, LDC) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm1, 4 * SIZE(CO1) + movhps %xmm1, 6 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO1, LDC) + movhps %xmm2, 2 * SIZE(CO1, LDC) + movlps %xmm3, 4 * SIZE(CO1, LDC) + movhps %xmm3, 6 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $8 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 8), AA +#ifdef LT + addl $16 * SIZE, B +#endif +#endif + +#ifdef LN + subl $8, KK + movl BORIG, B +#endif + +#ifdef LT + addl $8, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $3 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L10 + ALIGN_2 + +.L30: + testl $4, M + jle .L50 + +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $2 + BASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L32 + ALIGN_2 + +.L31: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 8 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps 20 * SIZE(BB), %xmm0 + addps %xmm3, %xmm4 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm0, %xmm5 + movaps 12 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps 28 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 48 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm2 + mulps 36 * SIZE(BB), %xmm1 
+ addps %xmm2, %xmm4 + movaps 40 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 20 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm2 + mulps 44 * SIZE(BB), %xmm1 + addps %xmm2, %xmm6 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm1, %xmm7 + movaps 24 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 52 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 56 * SIZE(BB), %xmm3 + addps %xmm1, %xmm5 + movaps 28 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + addps %xmm3, %xmm6 + movaps 80 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 48 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L31 + ALIGN_2 + +.L32: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L34 + +.L33: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L33 + ALIGN_4 + +.L34: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $BASE_SHIFT, %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 2), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm0 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm0 + + movsd 0 * SIZE(B), %xmm2 + movhps 2 * SIZE(B), %xmm2 + movsd 4 * SIZE(B), %xmm3 + movhps 6 * SIZE(B), %xmm3 + + subps %xmm4, %xmm2 + subps %xmm0, %xmm3 +#else + movaps 0 * SIZE(AA), %xmm0 + movaps 4 * SIZE(AA), %xmm2 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm2 +#endif + +#if defined(LN) || defined(LT) + movaps TRMASK, %xmm6 +#endif + +#ifdef LN + movss 15 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 14 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 12 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 10 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movsd 8 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 5 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 4 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 + +#endif + +#ifdef LT + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 1 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movsd 2 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movss 5 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movsd 6 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movss 10 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 11 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + 
subps %xmm0, %xmm3 + + movss 15 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm3 +#endif + +#ifdef RN + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm0 + + movss 1 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + movaps %xmm6, %xmm5 + + mulps %xmm0, %xmm5 + subps %xmm5, %xmm2 + + movss 3 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm2 +#endif + +#ifdef RT + movss 3 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm2 + + movss 2 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + movaps %xmm6, %xmm5 + + mulps %xmm2, %xmm5 + + subps %xmm5, %xmm0 + + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(B) + movhps %xmm2, 2 * SIZE(B) + movlps %xmm3, 4 * SIZE(B) + movhps %xmm3, 6 * SIZE(B) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm6 +#else + movaps %xmm2, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm2, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm2, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm2, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm6, 12 * SIZE(BB) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm4 + pshufd $0xff, %xmm3, %xmm6 +#else + movaps %xmm3, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm3, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm3, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm3, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm1, 20 * SIZE(BB) + movaps %xmm4, 24 * SIZE(BB) + movaps %xmm6, 28 * SIZE(BB) +#else + movaps %xmm0, 0 * SIZE(AA) + movaps %xmm2, 4 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, %xmm0 + shufps $0x88, %xmm3, %xmm2 + shufps $0xdd, %xmm3, %xmm0 + + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 2 * SIZE(CO1) + movlps %xmm0, 0 * SIZE(CO1, LDC) + movhps %xmm0, 2 * SIZE(CO1, LDC) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO1, LDC) + movhps %xmm2, 2 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA +#ifdef LT + addl $8 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L50: + testl $2, M + jle .L70 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $1 + BASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L52 + ALIGN_2 + +.L51: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, 
%xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movaps 44 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 64 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 80 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L51 + ALIGN_2 + +.L52: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L54 + +.L53: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L53 + ALIGN_4 + +.L54: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $BASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm5, %xmm4 + + movsd 0 * SIZE(B), %xmm2 + movhps 2 * SIZE(B), %xmm2 + + subps %xmm4, %xmm2 +#else +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd 2 * SIZE(AA), %xmm2 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm2 +#endif + +#if defined(LN) || defined(LT) + movaps TRMASK, %xmm6 +#endif + +#ifdef LN + movss 3 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 2 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 + +#endif + +#ifdef LT + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 1 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 3 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 +#endif + +#ifdef RN + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm0 + + movss 1 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + movaps %xmm6, %xmm5 + + mulps %xmm0, %xmm5 + subps %xmm5, %xmm2 + + movss 3 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm2 +#endif + +#ifdef RT + movss 3 * SIZE(B), 
%xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm2 + + movss 2 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + movaps %xmm6, %xmm5 + + mulps %xmm2, %xmm5 + + subps %xmm5, %xmm0 + + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(B) + movhps %xmm2, 2 * SIZE(B) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm6 +#else + movaps %xmm2, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm2, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm2, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm2, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm6, 12 * SIZE(BB) +#else + movlps %xmm0, 0 * SIZE(AA) + movlps %xmm2, 2 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, %xmm0 + shufps $0x88, %xmm3, %xmm2 + shufps $0xdd, %xmm3, %xmm0 + + movlps %xmm2, 0 * SIZE(CO1) + movlps %xmm0, 0 * SIZE(CO1, LDC) +#else + movlps %xmm0, 0 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L70: + testl $1, M + jle .L99 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $BASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movss 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movss 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movss 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movss 4 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L72 + ALIGN_2 + +.L71: + mulss %xmm0, %xmm2 + mulss 4 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 8 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 1 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm2 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 32 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 2 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 20 * SIZE(BB), %xmm0 + addss %xmm3, %xmm4 + movss 24 * SIZE(BB), %xmm3 + addss %xmm0, %xmm5 + movss 3 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 28 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 48 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm2 + mulss 36 * SIZE(BB), %xmm1 + addss %xmm2, %xmm4 + movss 40 * SIZE(BB), %xmm2 + addss %xmm1, %xmm5 + movss 5 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm2 + mulss 44 * SIZE(BB), %xmm1 + addss %xmm2, %xmm6 + movss 64 * SIZE(BB), %xmm2 + addss %xmm1, %xmm7 + movss 6 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 52 * SIZE(BB), %xmm1 + addss %xmm3, %xmm4 + movss 56 * SIZE(BB), %xmm3 + addss %xmm1, %xmm5 + movss 7 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 60 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 80 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $64 * SIZE, 
BB + decl %eax + jne .L71 + ALIGN_2 + +.L72: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L74 + +.L73: + mulss %xmm0, %xmm2 + mulss 4 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 8 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L73 + ALIGN_4 + +.L74: + addss %xmm6, %xmm4 + addss %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $BASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm5, %xmm4 + +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd 0 * SIZE(B), %xmm2 + + subps %xmm4, %xmm2 +#else + movss 0 * SIZE(AA), %xmm0 + movss 1 * SIZE(AA), %xmm2 + + subss %xmm4, %xmm0 + subss %xmm5, %xmm2 +#endif + +#if defined(LN) || defined(LT) + movaps TRMASK, %xmm6 +#endif + +#if defined(LN) || defined(LT) + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 +#endif + +#ifdef RN + movss 0 * SIZE(B), %xmm6 + mulss %xmm6, %xmm0 + + movss 1 * SIZE(B), %xmm6 + movaps %xmm6, %xmm5 + + mulss %xmm0, %xmm5 + subss %xmm5, %xmm2 + + movss 3 * SIZE(B), %xmm6 + mulss %xmm6, %xmm2 +#endif + +#ifdef RT + movss 3 * SIZE(B), %xmm6 + mulss %xmm6, %xmm2 + + movss 2 * SIZE(B), %xmm6 + movaps %xmm6, %xmm5 + + mulss %xmm2, %xmm5 + subss %xmm5, %xmm0 + + movss 0 * SIZE(B), %xmm6 + mulss %xmm6, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(B) + + movaps %xmm2, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm2, %xmm1 + shufps $0x55, %xmm1, %xmm1 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) +#else + movss %xmm0, 0 * SIZE(AA) + movss %xmm2, 1 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, %xmm0 + shufps $0x88, %xmm3, %xmm2 + shufps $0xdd, %xmm3, %xmm0 + + movss %xmm2, 0 * SIZE(CO1) + movss %xmm0, 0 * SIZE(CO1, LDC) +#else + movss %xmm0, 0 * SIZE(CO1) + movss %xmm2, 0 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L99: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_2 + +.L100: + testl $1, N + jle .L999 + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, BB + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $BASE_SHIFT, %eax + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + jle .L103 + ALIGN_4 + +.L102: + movsd 0 * SIZE(B), %xmm3 + movhps 2 * SIZE(B), %xmm3 + 
movsd 4 * SIZE(B), %xmm7 + movhps 6 * SIZE(B), %xmm7 + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 +#else + movaps %xmm3, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm3, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm3, %xmm2 + shufps $0xaa, %xmm2, %xmm2 + shufps $0xff, %xmm3, %xmm3 + + movaps %xmm7, %xmm4 + shufps $0x00, %xmm4, %xmm4 + movaps %xmm7, %xmm5 + shufps $0x55, %xmm5, %xmm5 + movaps %xmm7, %xmm6 + shufps $0xaa, %xmm6, %xmm6 + shufps $0xff, %xmm7, %xmm7 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + movaps %xmm4, 16 * SIZE(BB) + movaps %xmm5, 20 * SIZE(BB) + movaps %xmm6, 24 * SIZE(BB) + movaps %xmm7, 28 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $32 * SIZE, BB + + decl %eax + BRANCH + jne .L102 + ALIGN_2 + +.L103: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + BRANCH + jle .L105 + ALIGN_2 + +.L104: + movss 0 * SIZE(B), %xmm0 + + shufps $0x00, %xmm0, %xmm0 + + movaps %xmm0, 0 * SIZE(BB) + + addl $1 * SIZE, B + addl $4 * SIZE, BB + + decl %eax + jne .L104 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + + movl M, %ebx + sarl $3, %ebx # i = (m >> 2) + jle .L130 + ALIGN_4 + +.L110: +#ifdef LN + movl K, %eax + sall $3 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $3 + BASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + + PREFETCHW 7 * SIZE(CO1) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L112 + ALIGN_2 + +.L111: + mulps %xmm2, %xmm0 + mulps 4 * SIZE(AA), %xmm2 + addps %xmm0, %xmm4 + movaps 8 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm2, %xmm0 + mulps 12 * SIZE(AA), %xmm2 + addps %xmm0, %xmm5 + movaps 32 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm2, %xmm1 + mulps 20 * SIZE(AA), %xmm2 + addps %xmm1, %xmm4 + movaps 24 * SIZE(AA), %xmm1 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm2, %xmm1 + mulps 28 * SIZE(AA), %xmm2 + addps %xmm1, %xmm5 + movaps 48 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm3, %xmm0 + mulps 36 * SIZE(AA), %xmm3 + addps %xmm0, %xmm4 + movaps 40 * SIZE(AA), %xmm0 + addps %xmm3, %xmm6 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm3, %xmm0 + mulps 44 * SIZE(AA), %xmm3 + addps %xmm0, %xmm5 + movaps 64 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm3, %xmm1 + mulps 52 * SIZE(AA), %xmm3 + addps %xmm1, %xmm4 + movaps 56 * SIZE(AA), %xmm1 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm3, %xmm1 + mulps 60 * SIZE(AA), %xmm3 + addps %xmm1, %xmm5 + movaps 80 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + + addl $64 * SIZE, AA 
+ addl $32 * SIZE, BB + decl %eax + jne .L111 + ALIGN_2 + +.L112: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L114 + +.L113: + movaps 0 * SIZE(BB), %xmm2 + movaps 0 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm0 + addps %xmm0, %xmm4 + mulps 4 * SIZE(AA), %xmm2 + addps %xmm2, %xmm6 + + addl $8 * SIZE, AA + addl $4 * SIZE, BB + subl $1, %eax + jg .L113 + ALIGN_4 + +.L114: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $8, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $BASE_SHIFT, %eax + leal (AA, %eax, 8), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(B), %xmm2 + movhps 2 * SIZE(B), %xmm2 + movsd 4 * SIZE(B), %xmm5 + movhps 6 * SIZE(B), %xmm5 + + subps %xmm4, %xmm2 + subps %xmm6, %xmm5 + + xorps %xmm0, %xmm0 + + movaps %xmm2, %xmm3 + unpcklps %xmm0, %xmm2 + unpckhps %xmm0, %xmm3 + + movaps %xmm5, %xmm7 + unpcklps %xmm0, %xmm5 + unpckhps %xmm0, %xmm7 +#else + movaps 0 * SIZE(AA), %xmm0 + movaps 4 * SIZE(AA), %xmm1 + + subps %xmm4, %xmm0 + subps %xmm6, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movaps TRMASK, %xmm6 +#endif + +#ifdef LN + movss 63 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm7 + + movaps %xmm7, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 62 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movsd 60 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 58 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 56 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 54 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm7 + + movaps %xmm7, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movsd 52 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 50 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 48 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + + movss 45 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm5 + + movaps %xmm5, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 44 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 42 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 40 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 36 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm5 + + movaps %xmm5, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movsd 34 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 32 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 27 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 26 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 24 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 18 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps 
%xmm0, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movsd 16 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 9 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 8 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 +#endif + +#ifdef LT + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 1 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movsd 2 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 4 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 6 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 9 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movsd 10 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 12 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 14 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 18 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 19 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 20 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 22 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 27 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movsd 28 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 30 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 36 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm5 + + movaps %xmm5, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 37 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 38 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 45 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm5 + + movaps %xmm5, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movsd 46 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 54 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm7 + + movaps %xmm7, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 55 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 63 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm7 +#endif + +#if defined(RN) || defined(RT) + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm0 + mulps %xmm6, %xmm1 +#endif + +#if defined(LN) || defined(LT) + shufps $0x88, %xmm3, %xmm2 + shufps $0x88, %xmm7, %xmm5 + + movlps %xmm2, 0 * SIZE(B) + movhps %xmm2, 2 * 
SIZE(B) + movlps %xmm5, 4 * SIZE(B) + movhps %xmm5, 6 * SIZE(B) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm6 +#else + movaps %xmm2, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm2, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm2, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm2, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm6, 12 * SIZE(BB) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + pshufd $0xaa, %xmm5, %xmm4 + pshufd $0xff, %xmm5, %xmm6 +#else + movaps %xmm5, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm5, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm5, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm5, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm1, 20 * SIZE(BB) + movaps %xmm4, 24 * SIZE(BB) + movaps %xmm6, 28 * SIZE(BB) +#else + movaps %xmm0, 0 * SIZE(AA) + movaps %xmm1, 4 * SIZE(AA) +#endif + +#ifdef LN + subl $8 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 2 * SIZE(CO1) + movlps %xmm5, 4 * SIZE(CO1) + movhps %xmm5, 6 * SIZE(CO1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm1, 4 * SIZE(CO1) + movhps %xmm1, 6 * SIZE(CO1) +#endif + +#ifndef LN + addl $8 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 8), AA +#ifdef LT + addl $8 * SIZE, B +#endif +#endif + +#ifdef LN + subl $8, KK + movl BORIG, B +#endif + +#ifdef LT + addl $8, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $3 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L110 + ALIGN_2 + +.L130: + testl $4, M + jle .L150 + +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $2 + BASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movsd 0 * SIZE(AA), %xmm0 + movhps 2 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movsd 16 * SIZE(AA), %xmm1 + movhps 18 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L132 + ALIGN_2 + +.L131: + mulps %xmm0, %xmm2 + movaps 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + mulps 4 * SIZE(BB), %xmm0 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 8 * SIZE(AA), %xmm0 + mulps 8 * SIZE(BB), %xmm0 + addps %xmm0, %xmm6 + movaps 12 * SIZE(AA), %xmm0 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm3 + movaps 20 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + mulps 20 * SIZE(BB), %xmm1 + movaps 48 * SIZE(BB), %xmm3 + addps %xmm1, %xmm5 + movaps 24 * SIZE(AA), %xmm1 + mulps 24 * SIZE(BB), %xmm1 + addps %xmm1, %xmm6 + movaps 28 * SIZE(AA), %xmm1 + mulps 28 * SIZE(BB), %xmm1 + addps %xmm1, %xmm7 + movaps 48 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L131 + ALIGN_2 + +.L132: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L134 + +.L133: + 
movaps 0 * SIZE(BB), %xmm2 + movaps 0 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L133 + ALIGN_4 + +.L134: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + addps %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $BASE_SHIFT, %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(B), %xmm2 + movhps 2 * SIZE(B), %xmm2 + + subps %xmm4, %xmm2 + + xorps %xmm5, %xmm5 + + movaps %xmm2, %xmm3 + unpcklps %xmm5, %xmm2 + unpckhps %xmm5, %xmm3 +#else + movaps 0 * SIZE(AA), %xmm0 + subps %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movaps TRMASK, %xmm6 +#endif + +#ifdef LN + movss 15 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 14 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 12 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 10 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movsd 8 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 5 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 4 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 +#endif + +#ifdef LT + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 1 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movsd 2 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movss 5 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movsd 6 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movss 10 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 11 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movss 15 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm3 +#endif + +#ifdef RN + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + mulps %xmm6, %xmm0 +#endif + +#ifdef RT + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + mulps %xmm6, %xmm0 +#endif + +#if defined(LN) || defined(LT) + shufps $0x88, %xmm3, %xmm2 + + movlps %xmm2, 0 * SIZE(B) + movhps %xmm2, 2 * SIZE(B) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm6 +#else + movaps %xmm2, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm2, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm2, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm2, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm6, 12 * SIZE(BB) +#else + 
movaps %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 2 * SIZE(CO1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L150: + testl $2, M + jle .L170 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $1 + BASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L152 + ALIGN_2 + +.L151: + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L151 + ALIGN_2 + +.L152: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L154 + +.L153: + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L153 + ALIGN_4 + +.L154: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + addps %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $BASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm5 + shufps $1, %xmm5, %xmm5 + + movss 0 * SIZE(B), %xmm0 + movss 1 * SIZE(B), %xmm1 + + subss %xmm4, %xmm0 + subss %xmm5, %xmm1 +#else +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 + subps %xmm4, %xmm0 +#endif + +#ifdef LN + movaps 0 * SIZE(AA), %xmm4 + + movaps %xmm4, %xmm6 + shufps $0xff, %xmm6, %xmm6 + mulss %xmm6, %xmm1 + + movaps %xmm4, %xmm6 + shufps $0xaa, %xmm6, %xmm6 + mulss %xmm1, %xmm6 + subss %xmm6, %xmm0 + mulss %xmm4, %xmm0 +#endif + +#ifdef LT 
+ movaps 0 * SIZE(AA), %xmm4 + mulss %xmm4, %xmm0 + movaps %xmm4, %xmm6 + shufps $0x55, %xmm6, %xmm6 + mulss %xmm0, %xmm6 + subss %xmm6, %xmm1 + movaps %xmm4, %xmm6 + shufps $0xff, %xmm6, %xmm6 + mulss %xmm6, %xmm1 +#endif + +#ifdef RN + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + mulps %xmm6, %xmm0 +#endif + +#ifdef RT + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + mulps %xmm6, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss %xmm0, 0 * SIZE(B) + movss %xmm1, 1 * SIZE(B) + + shufps $0x00, %xmm0, %xmm0 + shufps $0x00, %xmm1, %xmm1 + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) +#else + movlps %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm0, 0 * SIZE(CO1) + movss %xmm1, 1 * SIZE(CO1) +#else + movlps %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L170: + testl $1, M + jle .L179 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movss 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movss 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movss 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movss 4 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L172 + ALIGN_2 + +.L171: + mulss %xmm0, %xmm2 + movss 1 * SIZE(AA), %xmm0 + addss %xmm2, %xmm4 + mulss 4 * SIZE(BB), %xmm0 + movss 32 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 2 * SIZE(AA), %xmm0 + mulss 8 * SIZE(BB), %xmm0 + addss %xmm0, %xmm6 + movss 3 * SIZE(AA), %xmm0 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm3 + movss 5 * SIZE(AA), %xmm1 + addss %xmm3, %xmm4 + mulss 20 * SIZE(BB), %xmm1 + movss 48 * SIZE(BB), %xmm3 + addss %xmm1, %xmm5 + movss 6 * SIZE(AA), %xmm1 + mulss 24 * SIZE(BB), %xmm1 + addss %xmm1, %xmm6 + movss 7 * SIZE(AA), %xmm1 + mulss 28 * SIZE(BB), %xmm1 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L171 + ALIGN_2 + +.L172: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L174 + +.L173: + movss 0 * SIZE(AA), %xmm0 + movss 0 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + addss %xmm2, %xmm4 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L173 + ALIGN_4 + +.L174: + addss %xmm5, %xmm4 + addss %xmm7, %xmm6 + addss %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax + subl $1, %eax + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ BASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movss 0 * SIZE(B), %xmm1 + subss %xmm4, %xmm1 +#else + movss 0 * SIZE(AA), %xmm0 + subss %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + mulss 0 * SIZE(AA), %xmm1 +#endif + +#if 
defined(RN) || defined(RT) + mulss 0 * SIZE(B), %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(B) + + shufps $0x00, %xmm1, %xmm1 + movaps %xmm1, 0 * SIZE(BB) +#else + movss %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(CO1) +#else + movss %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (AA, %eax, SIZE), AA +#ifdef LT + addl $1 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 +.L179: +#ifdef LN + movl K, %eax + leal (B, %eax, SIZE), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (B, %eax, SIZE), B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L999: + movl OLD_STACK, %esp + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_RT_1x4.S b/kernel/x86/trsm_kernel_RT_1x4.S new file mode 100644 index 0000000000..b7f17e2599 --- /dev/null +++ b/kernel/x86/trsm_kernel_RT_1x4.S @@ -0,0 +1,1251 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 32 + +#define J 0 + STACK(%esp) +#define I 4 + STACK(%esp) +#define KK 8 + STACK(%esp) +#define KKK 12 + STACK(%esp) +#define AORIG 16 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#ifdef DOUBLE +#define STACK_A 24 + STACK + ARGS(%esp) +#define STACK_B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define STACK_LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) +#else +#define STACK_A 20 + STACK + ARGS(%esp) +#define STACK_B 24 + STACK + ARGS(%esp) +#define C 28 + STACK + ARGS(%esp) +#define STACK_LDC 32 + STACK + ARGS(%esp) +#define OFFSET 36 + STACK + ARGS(%esp) +#endif + +#define A %edx +#define B %ecx +#define B_ORIG %ebx +#define LDC %ebp + +#define PREFETCHSIZE (5 + 8 * 10) + + PROLOGUE + + subl $ARGS, %esp # Generate Stack Frame + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_B, B_ORIG + movl STACK_LDC, LDC + leal (, LDC, SIZE), LDC + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, STACK_A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B_ORIG + + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + movl OFFSET, %eax + negl %eax + movl %eax, KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + subl $-16 * SIZE, B_ORIG + subl $-16 * SIZE, STACK_A + + movl M, %eax + testl %eax, %eax + jle .L999 + + movl N, %eax + testl %eax, %eax + jle .L999 + + movl K, %eax + testl %eax, %eax + jle .L999 + + movl N, %eax + andl $1, %eax + je .L20 + ALIGN_3 + +.L31: +#if defined(LT) || defined(RN) + movl STACK_A, A +#else + movl STACK_A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, B_ORIG +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, %edi +#ifndef RT + addl LDC, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl B_ORIG, B + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $5, %eax + jle .L33 + ALIGN_4 + +.L32: + movl -16 * SIZE(B), %esi + movl -8 * SIZE(B), %esi + movl 0 * SIZE(B), %esi + movl 8 * SIZE(B), %esi + subl $-32 * SIZE, B + decl %eax + jne .L32 + ALIGN_3 + +.L33: + movl M, %esi + movl %esi, I + ALIGN_3 + +.L34: +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + leal (, %eax, SIZE), %eax + movl AORIG, A + leal (A , %eax, 1), A + leal (B_ORIG, %eax, 1), B +#else + movl B_ORIG, B +#endif + + fldz + fldz + fldz + fldz + + prefetchw 1 * SIZE(%edi) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L36 + ALIGN_3 + +.L35: + FLD -16 * SIZE(A) + FMUL -16 * SIZE(B) + faddp %st, %st(1) + + FLD -15 * SIZE(A) + FMUL -15 * SIZE(B) + faddp %st, %st(2) + + FLD -14 * SIZE(A) + FMUL -14 * SIZE(B) + faddp %st, %st(3) + + FLD -13 * SIZE(A) + FMUL -13 * SIZE(B) + faddp %st, %st(4) + + FLD -12 * SIZE(A) + FMUL -12 * SIZE(B) + faddp %st, %st(1) + + FLD -11 * SIZE(A) + FMUL -11 * SIZE(B) + faddp %st, %st(2) + + FLD -10 * SIZE(A) + FMUL -10 * SIZE(B) + faddp %st, %st(3) + + FLD -9 * SIZE(A) + FMUL 
-9 * SIZE(B) + faddp %st, %st(4) + + addl $8 * SIZE, A + addl $8 * SIZE, B + + decl %eax + jne .L35 + ALIGN_4 + +.L36: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + and $7, %eax + je .L39 + ALIGN_4 + +.L37: + FLD -16 * SIZE(A) + FMUL -16 * SIZE(B) + faddp %st, %st(1) + + addl $1 * SIZE,A + addl $1 * SIZE,B + decl %eax + jne .L37 + ALIGN_4 + +.L39: + faddp %st, %st(2) + faddp %st, %st(2) + faddp %st, %st(1) + +#if defined(LN) || defined(RT) + movl KK, %eax + subl $1, %eax + + movl AORIG, A + leal (A, %eax, SIZE), A + leal (B_ORIG, %eax, SIZE), B +#endif + +#if defined(LN) || defined(LT) + FLD 0 * SIZE - 16 * SIZE(B) + fsubp %st, %st(1) +#else + FLD 0 * SIZE - 16 * SIZE(A) + fsubp %st, %st(1) +#endif + +#if defined(LN) || defined(LT) + FLD 0 * SIZE - 16 * SIZE(A) + fmulp %st, %st(1) +#endif + +#if defined(RN) || defined(RT) + FMUL 0 * SIZE - 16 * SIZE(B) +#endif + +#ifdef LN + subl $1 * SIZE, %edi +#endif + +#if defined(LN) || defined(LT) + FSTU 0 * SIZE - 16 * SIZE(B) +#else + FSTU 0 * SIZE - 16 * SIZE(A) +#endif + + FST 0 * SIZE(%edi) + +#ifndef LN + addl $1 * SIZE, %edi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (A, %eax, SIZE), A + leal (B, %eax, SIZE), B +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + decl I + jne .L34 + +#ifdef LN + movl K, %eax + leal ( , %eax, SIZE), %eax + leal (B_ORIG, %eax, 1), B_ORIG +#endif +#if defined(LT) || defined(RN) + movl B, B_ORIG +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L20: + movl N, %eax + andl $2, %eax + je .L30 + +#if defined(LT) || defined(RN) + movl STACK_A, A +#else + movl STACK_A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B_ORIG +#endif + + leal (, LDC, 2), %eax +#ifdef RT + subl %eax, C +#endif + movl C, %edi +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl B_ORIG, B + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $4, %eax + jle .L23 + ALIGN_4 + +.L22: + movl -16 * SIZE(B), %esi + movl -8 * SIZE(B), %esi + movl 0 * SIZE(B), %esi + movl 8 * SIZE(B), %esi + subl $-32 * SIZE, B + decl %eax + jne .L22 + ALIGN_3 + +.L23: + movl M, %esi + movl %esi, I + ALIGN_3 + +.L24: +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + leal (, %eax, SIZE), %eax + movl AORIG, A + leal (A , %eax, 1), A + leal (B_ORIG, %eax, 2), B +#else + movl B_ORIG, B +#endif + + fldz + fldz + fldz + fldz + + FLD -16 * SIZE(A) + FLD -16 * SIZE(B) + + prefetchw 1 * SIZE(%edi) + prefetchw 1 * SIZE(%edi, LDC) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L26 + ALIGN_3 + +.L25: + fmul %st(1), %st + faddp %st, %st(2) + + FMUL -15 * SIZE(B) + faddp %st, %st(2) + + FLD -15 * SIZE(A) + FLD -14 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + + FMUL -13 * SIZE(B) + faddp %st, %st(4) + + FLD -14 * SIZE(A) + FLD -12 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(2) + + FMUL -11 * SIZE(B) + faddp %st, %st(2) + + FLD -13 * SIZE(A) + FLD -10 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + + FMUL -9 * SIZE(B) + faddp %st, %st(4) + + FLD -12 * SIZE(A) + FLD -8 
* SIZE(B) + + fmul %st(1), %st + faddp %st, %st(2) + + FMUL -7 * SIZE(B) + faddp %st, %st(2) + + FLD -11 * SIZE(A) + FLD -6 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + + FMUL -5 * SIZE(B) + faddp %st, %st(4) + + FLD -10 * SIZE(A) + FLD -4 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(2) + + FMUL -3 * SIZE(B) + faddp %st, %st(2) + + FLD -9 * SIZE(A) + FLD -2 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + + FMUL -1 * SIZE(B) + faddp %st, %st(4) + + FLD -8 * SIZE(A) + FLD 0 * SIZE(B) + + addl $ 8 * SIZE, A + subl $-16 * SIZE, B + + decl %eax + jne .L25 + ALIGN_4 + +.L26: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + and $7, %eax + je .L29 + ALIGN_4 + +.L27: + fmul %st(1), %st + faddp %st, %st(2) + + FMUL -15 * SIZE(B) + faddp %st, %st(2) + + FLD -15 * SIZE(A) + FLD -14 * SIZE(B) + + addl $1 * SIZE,A + addl $2 * SIZE,B + + decl %eax + jne .L27 + ALIGN_4 + +.L29: + ffreep %st(0) + ffreep %st(0) + + faddp %st, %st(2) + faddp %st, %st(2) + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + leal (, %eax, SIZE), %eax + + movl AORIG, A + leal (A, %eax, 1), A + leal (B_ORIG, %eax, 2), B +#endif + +#if defined(LN) || defined(LT) + FLD 0 * SIZE - 16 * SIZE(B) + fsubp %st, %st(1) + FLD 1 * SIZE - 16 * SIZE(B) + fsubp %st, %st(2) +#else + FLD 0 * SIZE - 16 * SIZE(A) + fsubp %st, %st(1) + FLD 1 * SIZE - 16 * SIZE(A) + fsubp %st, %st(2) +#endif + +#ifdef LN + FLD 0 * SIZE - 16 * SIZE(A) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef LT + FLD 0 * SIZE - 16 * SIZE(A) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef RN + FMUL 0 * SIZE - 16 * SIZE(B) + + FLD 1 * SIZE - 16 * SIZE(B) + fmul %st(1), %st + fsubrp %st, %st(2) + + FLD 3 * SIZE - 16 * SIZE(B) + fmulp %st, %st(2) +#endif + +#ifdef RT + FLD 3 * SIZE - 16 * SIZE(B) + fmulp %st, %st(2) + FLD 2 * SIZE - 16 * SIZE(B) + fmul %st(2), %st + fsubrp %st, %st(1) + + FLD 0 * SIZE - 16 * SIZE(B) + fmulp %st, %st(1) +#endif + +#ifdef LN + subl $1 * SIZE, %edi +#endif + +#if defined(LN) || defined(LT) + FSTU 0 * SIZE - 16 * SIZE(B) + fxch %st(1) + FSTU 1 * SIZE - 16 * SIZE(B) +#else + FSTU 0 * SIZE - 16 * SIZE(A) + fxch %st(1) + FSTU 1 * SIZE - 16 * SIZE(A) +#endif + + FST 0 * SIZE(%edi, LDC) + FST 0 * SIZE(%edi) + +#ifndef LN + addl $1 * SIZE, %edi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (A, %eax, 1), A + leal (B, %eax, 2), B +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + decl I + jne .L24 + +#ifdef LN + movl K, %eax + leal ( , %eax, SIZE), %eax + leal (B_ORIG, %eax, 2), B_ORIG +#endif +#if defined(LT) || defined(RN) + movl B, B_ORIG +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + ALIGN_4 + +.L30: + movl N, %eax + sarl $2, %eax + movl %eax, J + je .L999 + ALIGN_3 + +.L11: +#if defined(LT) || defined(RN) + movl STACK_A, A +#else + movl STACK_A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, B_ORIG +#endif + + leal (, LDC, 4), %eax +#ifdef RT + subl %eax, C +#endif + movl C, %edi +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl B_ORIG, B + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $4, 
%eax + jle .L13 + ALIGN_4 + +.L12: + movl -16 * SIZE(B), %esi + movl -8 * SIZE(B), %esi + movl 0 * SIZE(B), %esi + movl 8 * SIZE(B), %esi + movl 16 * SIZE(B), %esi + movl 24 * SIZE(B), %esi + movl 32 * SIZE(B), %esi + movl 40 * SIZE(B), %esi + subl $-64 * SIZE, B + decl %eax + jne .L12 + ALIGN_3 + +.L13: + movl M, %esi + movl %esi, I + ALIGN_3 + +.L14: +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + leal (, %eax, SIZE), %eax + movl AORIG, A + leal (A , %eax, 1), A + leal (B_ORIG, %eax, 4), B +#else + movl B_ORIG, B +#endif + + leal (%edi, LDC, 2), %eax + + fldz + fldz + fldz + fldz + + FLD -8 * SIZE(A) + FLD -16 * SIZE(A) + FLD -16 * SIZE(B) + + movl $32 * SIZE, %esi + + prefetchw 1 * SIZE(%edi) + prefetchw 1 * SIZE(%edi, LDC) + prefetchw 1 * SIZE(%eax) + prefetchw 1 * SIZE(%eax, LDC) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L16 + ALIGN_3 + +.L15: + fmul %st(1), %st + faddp %st, %st(3) + PADDING + FLD -15 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + PADDING + FLD -14 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(5) + PADDING + FMUL -13 * SIZE(B) + + faddp %st, %st(5) + FLD -15 * SIZE(A) + FLD -12 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(3) + PADDING + FLD -11 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + PADDING + FLD -10 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(5) + PADDING + FMUL -9 * SIZE(B) + + faddp %st, %st(5) + FLD -14 * SIZE(A) + FLD -8 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(3) + PADDING + FLD -7 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + PADDING + FLD -6 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(5) + PADDING + FMUL -5 * SIZE(B) + + faddp %st, %st(5) + FLD -13 * SIZE(A) + FLD -4 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(3) + PADDING + FLD -3 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + PADDING + FLD -2 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(5) + PADDING + FMUL -1 * SIZE(B) + + faddp %st, %st(5) + FLD -12 * SIZE(A) + FLD 0 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(3) + PADDING + FLD 1 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + PADDING + FLD 2 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(5) + PADDING + FMUL 3 * SIZE(B) + + faddp %st, %st(5) + FLD -11 * SIZE(A) + FLD 4 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(3) + PADDING + FLD 5 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + PADDING + FLD 6 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(5) + PADDING + FMUL 7 * SIZE(B) + + faddp %st, %st(5) + FLD -10 * SIZE(A) + FLD 8 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(3) + PADDING + FLD 9 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + PADDING + FLD 10 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(5) + PADDING + FMUL 11 * SIZE(B) + + faddp %st, %st(5) + FLD -9 * SIZE(A) + FLD 12 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(3) + PADDING + FLD 13 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + PADDING + FLD 14 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(5) + PADDING + FMUL 15 * SIZE(B) + + faddp %st, %st(5) + FLD 0 * SIZE(A) + + PADDING prefetch PREFETCHSIZE * SIZE(A) + + addl $8 * SIZE, A + fxch %st(1) + addl $32 * SIZE, B + + FLD -16 * SIZE(B) + decl %eax + jne .L15 + ALIGN_4 + +.L16: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + and $7, %eax + je .L19 + ALIGN_4 + +.L17: + fmul %st(1), %st + faddp %st, %st(3) + + FLD -15 * SIZE(B) + fmul %st(1), %st + 
faddp %st, %st(4) + + FLD -14 * SIZE(B) + fmul %st(1), %st + faddp %st, %st(5) + + FMUL -13 * SIZE(B) + faddp %st, %st(5) + FLD -15 * SIZE(A) + FLD -12 * SIZE(B) + + addl $1 * SIZE,A + addl $4 * SIZE,B + + decl %eax + jne .L17 + ALIGN_4 + +.L19: + ffreep %st(0) + ffreep %st(0) + ffreep %st(0) + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $4, %eax +#endif + + leal (, %eax, SIZE), %eax + + movl AORIG, A + leal (A, %eax, 1), A + leal (B_ORIG, %eax, 4), B +#endif + +#if defined(LN) || defined(LT) + FLD 0 * SIZE - 16 * SIZE(B) + fsubp %st, %st(1) + FLD 1 * SIZE - 16 * SIZE(B) + fsubp %st, %st(2) + FLD 2 * SIZE - 16 * SIZE(B) + fsubp %st, %st(3) + FLD 3 * SIZE - 16 * SIZE(B) + fsubp %st, %st(4) +#else + FLD 0 * SIZE - 16 * SIZE(A) + fsubp %st, %st(1) + FLD 1 * SIZE - 16 * SIZE(A) + fsubp %st, %st(2) + FLD 2 * SIZE - 16 * SIZE(A) + fsubp %st, %st(3) + FLD 3 * SIZE - 16 * SIZE(A) + fsubp %st, %st(4) +#endif + +#ifdef LN + FLD 0 * SIZE - 16 * SIZE(A) + fmul %st, %st(1) + fmul %st, %st(2) + fmul %st, %st(3) + fmulp %st, %st(4) +#endif + +#ifdef LT + FLD 0 * SIZE - 16 * SIZE(A) + fmul %st, %st(1) + fmul %st, %st(2) + fmul %st, %st(3) + fmulp %st, %st(4) +#endif + +#ifdef RN + FMUL 0 * SIZE - 16 * SIZE(B) + + FLD 1 * SIZE - 16 * SIZE(B) + fmul %st(1), %st + fsubrp %st, %st(2) + FLD 2 * SIZE - 16 * SIZE(B) + fmul %st(1), %st + fsubrp %st, %st(3) + FLD 3 * SIZE - 16 * SIZE(B) + fmul %st(1), %st + fsubrp %st, %st(4) + + FLD 5 * SIZE - 16 * SIZE(B) + fmulp %st, %st(2) + FLD 6 * SIZE - 16 * SIZE(B) + fmul %st(2), %st + fsubrp %st, %st(3) + FLD 7 * SIZE - 16 * SIZE(B) + fmul %st(2), %st + fsubrp %st, %st(4) + + FLD 10 * SIZE - 16 * SIZE(B) + fmulp %st, %st(3) + FLD 11 * SIZE - 16 * SIZE(B) + fmul %st(3), %st + fsubrp %st, %st(4) + + FLD 15 * SIZE - 16 * SIZE(B) + fmulp %st, %st(4) +#endif + +#ifdef RT + FLD 15 * SIZE - 16 * SIZE(B) + fmulp %st, %st(4) + + FLD 14 * SIZE - 16 * SIZE(B) + fmul %st(4), %st + fsubrp %st, %st(3) + FLD 13 * SIZE - 16 * SIZE(B) + fmul %st(4), %st + fsubrp %st, %st(2) + FLD 12 * SIZE - 16 * SIZE(B) + fmul %st(4), %st + fsubrp %st, %st(1) + + FLD 10 * SIZE - 16 * SIZE(B) + fmulp %st, %st(3) + FLD 9 * SIZE - 16 * SIZE(B) + fmul %st(3), %st + fsubrp %st, %st(2) + FLD 8 * SIZE - 16 * SIZE(B) + fmul %st(3), %st + fsubrp %st, %st(1) + + FLD 5 * SIZE - 16 * SIZE(B) + fmulp %st, %st(2) + FLD 4 * SIZE - 16 * SIZE(B) + fmul %st(2), %st + fsubrp %st, %st(1) + + FLD 0 * SIZE - 16 * SIZE(B) + fmulp %st, %st(1) +#endif + +#ifdef LN + subl $1 * SIZE, %edi +#endif + +#if defined(LN) || defined(LT) + FSTU 0 * SIZE - 16 * SIZE(B) + fxch %st(1) + FSTU 1 * SIZE - 16 * SIZE(B) + fxch %st(2) + FSTU 2 * SIZE - 16 * SIZE(B) + fxch %st(3) + FSTU 3 * SIZE - 16 * SIZE(B) +#else + FSTU 0 * SIZE - 16 * SIZE(A) + fxch %st(1) + FSTU 1 * SIZE - 16 * SIZE(A) + fxch %st(2) + FSTU 2 * SIZE - 16 * SIZE(A) + fxch %st(3) + FSTU 3 * SIZE - 16 * SIZE(A) +#endif + + leal (%edi, LDC, 2), %eax + + FST 0 * SIZE(%eax, LDC) + FST 0 * SIZE(%edi) + FST 0 * SIZE(%edi, LDC) + FST 0 * SIZE(%eax) + +#ifndef LN + addl $1 * SIZE, %edi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (A, %eax, 1), A + leal (B, %eax, 4), B +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl I + jne .L14 + +#ifdef LN + movl K, %eax + leal ( , %eax, SIZE), %eax + leal (B_ORIG, %eax, 4), B_ORIG +#endif +#if defined(LT) || defined(RN) + movl B, 
B_ORIG
+#endif
+
+#ifdef RN
+ addl $4, KK
+#endif
+
+#ifdef RT
+ subl $4, KK
+#endif
+
+ decl J
+ jne .L11
+ ALIGN_4
+
+.L999:
+ popl %ebx
+ popl %esi
+ popl %edi
+ popl %ebp
+ addl $ARGS, %esp
+ ret
+
+ EPILOGUE
diff --git a/kernel/x86/trsm_kernel_RT_2x2.S b/kernel/x86/trsm_kernel_RT_2x2.S
new file mode 100644
index 0000000000..860344616f
--- /dev/null
+++ b/kernel/x86/trsm_kernel_RT_2x2.S
@@ -0,0 +1,1102 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin.
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) +#define AORIG 12 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#ifdef DOUBLE +#define A 24 + STACK + ARGS(%esp) +#define B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) +#else +#define A 20 + STACK + ARGS(%esp) +#define B 24 + STACK + ARGS(%esp) +#define C 28 + STACK + ARGS(%esp) +#define LDC 32 + STACK + ARGS(%esp) +#define OFFSET 36 + STACK + ARGS(%esp) +#endif + +#define PREFETCH_OFFSET 48 + +#if defined(PENTIUM3) || defined(PENTIUMM) +#define REP rep +#else +#define REP rep +#endif + +#define AA %edx +#define BB %ecx + + PROLOGUE + + subl $ARGS, %esp # Generate Stack Frame + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl LDC, %ebp # ldc # MEMORY + movl B, %ebx + leal (, %ebp, SIZE), %ebp + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, %ebx + + movl N, %eax + imull %ebp, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax # n # MEMORY + andl $1, %eax + je .L8 + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, %ebx +#endif + +#ifdef RT + subl %ebp, C +#endif + movl C, %edi # c # MEMORY +#ifndef RT + addl %ebp, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %esi # m # MEMORY + sarl $1, %esi # m >> 1 + je .L36 + ALIGN_4 + +.L46: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + leal (, %eax, SIZE), %eax + movl AORIG, AA + leal (AA, %eax, 2), AA + leal (%ebx, %eax, 1), BB +#else + movl %ebx, BB +#endif + + fldz + fldz + FLD 0 * SIZE(BB) # temp1 = *(boffset + 0) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $1, %eax + je .L56 + ALIGN_4 + +.L57: + FLD 0 * SIZE(AA) # temp2 = *(aoffset + 0) + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 1 * SIZE(AA) # temp2 = *(aoffset + 0) + faddp %st, %st(2) + FLD 1 * SIZE(BB) # temp1 = *(boffset + 0) + + FLD 2 * SIZE(AA) # temp2 = *(aoffset + 0) + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 3 * SIZE(AA) # temp2 = *(aoffset + 0) + faddp %st, %st(2) + FLD 2 * SIZE(BB) # temp1 = *(boffset + 0) + + addl $4 * SIZE,AA + addl $2 * SIZE,BB + dec %eax + jne .L57 + ALIGN_4 + +.L56: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $1, %eax + je .L45 + ALIGN_4 + + FLD 0 * SIZE(AA) # temp2 = *(aoffset + 0) + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 1 * SIZE(AA) # temp2 = *(aoffset + 0) + faddp %st, %st(2) + FLD 3 * SIZE(BB) # temp1 = *(boffset + 0) + + addl $2 * SIZE,AA + addl $1 * SIZE,BB + ALIGN_4 + +.L45: + ffreep %st(0) + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + leal (, %eax, SIZE), 
%eax + + movl AORIG, AA + leal (AA, %eax, 2), AA + leal (%ebx, %eax, 1), BB +#endif + +#if defined(LN) || defined(LT) + FLD 0 * SIZE(BB) + fsubp %st, %st(1) + FLD 1 * SIZE(BB) + fsubp %st, %st(2) +#else + FLD 0 * SIZE(AA) + fsubp %st, %st(1) + FLD 1 * SIZE(AA) + fsubp %st, %st(2) +#endif + +#ifdef LN + FLD 3 * SIZE(AA) + fmulp %st, %st(2) + + FLD 2 * SIZE(AA) + fmul %st(2), %st + + fsubrp %st, %st(1) + FLD 0 * SIZE(AA) + fmulp %st, %st(1) +#endif + +#ifdef LT + FLD 0 * SIZE(AA) + fmulp %st, %st(1) + + FLD 1 * SIZE(AA) + fmul %st(1), %st + + fsubrp %st, %st(2) + + FLD 3 * SIZE(AA) + fmulp %st, %st(2) +#endif + +#ifdef RN + FLD 0 * SIZE(BB) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef RT + FLD 0 * SIZE(BB) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef LN + subl $2 * SIZE, %edi +#endif + +#if defined(LN) || defined(LT) + FSTU 0 * SIZE(BB) + fxch %st(1) + FSTU 1 * SIZE(BB) +#else + FSTU 0 * SIZE(AA) + fxch %st(1) + FSTU 1 * SIZE(AA) +#endif + + FST 1 * SIZE(%edi) + FST 0 * SIZE(%edi) + +#ifndef LN + addl $2 * SIZE, %edi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 1), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %esi # i -- + jne .L46 + ALIGN_4 + +.L36: + movl M, %eax # m # MEMORY + andl $1, %eax # m & 1 + je .L99 + +#ifdef LN + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + leal (, %eax, SIZE), %eax + movl AORIG, AA + leal (AA, %eax, 1), AA + leal (%ebx, %eax, 1), BB +#else + movl %ebx, BB +#endif + + fldz + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + test %eax, %eax + jle .L52 + ALIGN_3 + +.L51: + FLD (AA) + FMUL (BB) + addl $1 * SIZE,AA + addl $1 * SIZE,BB + faddp %st,%st(1) + decl %eax + jne .L51 + ALIGN_4 + +.L52: + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + leal (, %eax, SIZE), %eax + + movl AORIG, AA + leal (AA, %eax, 1), AA + leal (%ebx, %eax, 1), BB +#endif + +#if defined(LN) || defined(LT) + FLD 0 * SIZE(BB) + fsubp %st, %st(1) +#else + FLD 0 * SIZE(AA) + fsubp %st, %st(1) +#endif + +#if defined(LN) || defined(LT) + FMUL 0 * SIZE(AA) +#else + FMUL 0 * SIZE(BB) +#endif + +#ifdef LN + subl $1 * SIZE, %edi +#endif + +#if defined(LN) || defined(LT) + FSTU 0 * SIZE(BB) +#else + FSTU 0 * SIZE(AA) +#endif + + FST 0 * SIZE(%edi) + +#ifndef LN + addl $1 * SIZE, %edi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 1), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $0 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L99: +#ifdef LN + movl K, %eax + leal (%ebx, %eax, SIZE), %ebx +#endif +#if defined(LT) || defined(RN) + movl BB, %ebx +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L8: + movl N, %eax # j = (n >> 1) # MEMORY + sarl $1, %eax + movl %eax, J # j = (n >> 1) # MEMORY + je .End + ALIGN_4 + +.L34: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, %ebx +#endif + lea (, %ebp, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl 
C, %edi +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %esi + sarl $1, %esi + je .L12 + ALIGN_4 + +.MainHead: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + leal (, %eax, SIZE), %eax + movl AORIG, AA + leal (AA, %eax, 2), AA + leal (%ebx, %eax, 2), BB +#else + movl %ebx, BB +#endif + + fldz + fldz + fldz + fldz + + FLD 4 * SIZE(BB) # b5 + FLD 4 * SIZE(AA) # a5 + FLD 0 * SIZE(BB) # b1 + FLD 0 * SIZE(AA) # a1 + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(%edi) + prefetchw 2 * SIZE(%edi, %ebp, 1) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(%edi) + prefetchnta 2 * SIZE(%edi, %ebp, 1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L16 + ALIGN_4 + +.MainLoop: +#if defined(HAVE_3DNOW) + prefetch (PREFETCH_OFFSET) * SIZE(BB) + nop +#elif defined(HAVE_SSE) + prefetchnta (PREFETCH_OFFSET) * SIZE(BB) +#if (L2_SIZE == 524288) + prefetcht0 (PREFETCH_OFFSET) * SIZE(AA) +#endif +#endif + + fmul %st, %st(1) + FMUL 1 * SIZE(BB) + fxch %st(1) + faddp %st, %st(4) + FLD 0 * SIZE(BB) + fxch %st(1) + faddp %st, %st(5) + FLD 1 * SIZE(AA) + fmul %st, %st(1) + FMUL 1 * SIZE(BB) + fxch %st(1) + faddp %st, %st(6) + FLD 2 * SIZE(BB) + fxch %st(1) + faddp %st, %st(7) + FLD 2 * SIZE(AA) + + fmul %st, %st(1) + FMUL 3 * SIZE(BB) + fxch %st(1) + faddp %st, %st(4) + FLD 2 * SIZE(BB) + fxch %st(1) + faddp %st, %st(5) + FLD 3 * SIZE(AA) + fmul %st, %st(1) + FMUL 3 * SIZE(BB) + fxch %st(1) + faddp %st, %st(6) + FLD 8 * SIZE(BB) + fxch %st(1) + faddp %st, %st(7) + FLD 8 * SIZE(AA) + fxch %st(2) + +#if !defined(HAVE_3DNOW) && defined(HAVE_SSE) && defined(DOUBLE) + prefetchnta (PREFETCH_OFFSET + 4) * SIZE(BB) +#if (L2_SIZE == 524288) + prefetcht0 (PREFETCH_OFFSET + 4) * SIZE(AA) +#endif +#endif + + fmul %st, %st(3) + FMUL 5 * SIZE(BB) + fxch %st(3) + faddp %st, %st(4) + FLD 4 * SIZE(BB) + fxch %st(3) + faddp %st, %st(5) + FLD 5 * SIZE(AA) + fmul %st, %st(3) + FMUL 5 * SIZE(BB) + fxch %st(3) + faddp %st, %st(6) + FLD 6 * SIZE(BB) + fxch %st(3) + faddp %st, %st(7) + FLD 6 * SIZE(AA) + + fmul %st, %st(3) + FMUL 7 * SIZE(BB) + fxch %st(3) + faddp %st, %st(4) + FLD 6 * SIZE(BB) + fxch %st(3) + faddp %st, %st(5) + FLD 7 * SIZE(AA) + fmul %st, %st(3) + FMUL 7 * SIZE(BB) + fxch %st(3) + faddp %st, %st(6) + FLD 12 * SIZE(BB) + fxch %st(3) + faddp %st, %st(7) + FLD 12 * SIZE(AA) + fxch %st(2) + + subl $-8 * SIZE, BB + subl $-8 * SIZE, AA + decl %eax # l -- + jne .MainLoop + ALIGN_4 + +.L16: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + and $3, %eax + je .L21 + ALIGN_4 + +.SubLoop: + fmul %st, %st(1) + FMUL 1 * SIZE(BB) + fxch %st(1) + faddp %st, %st(4) + FLD 0 * SIZE(BB) + fxch %st(1) + faddp %st, %st(5) + FLD 1 * SIZE(AA) + fmul %st, %st(1) + FMUL 1 * SIZE(BB) + fxch %st(1) + faddp %st, %st(6) + FLD 2 * SIZE(BB) + fxch %st(1) + faddp %st, %st(7) + FLD 2 * SIZE(AA) + + addl $2 * SIZE,BB + addl $2 * SIZE,AA + decl %eax + jne .SubLoop + ALIGN_4 + +.L21: + ffreep %st(0) + ffreep %st(0) + ffreep %st(0) + ffreep %st(0) + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + leal (, %eax, SIZE), %eax + + movl AORIG, AA + leal (AA, %eax, 2), AA + leal (%ebx, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + FLD 0 * SIZE(BB) + fsubp %st, 
%st(1) + FLD 1 * SIZE(BB) + fsubp %st, %st(2) + FLD 2 * SIZE(BB) + fsubp %st, %st(3) + FLD 3 * SIZE(BB) + fsubp %st, %st(4) +#else + FLD 0 * SIZE(AA) + fsubp %st, %st(1) + FLD 1 * SIZE(AA) + fsubp %st, %st(3) + FLD 2 * SIZE(AA) + fsubp %st, %st(2) + FLD 3 * SIZE(AA) + fsubp %st, %st(4) +#endif + +#ifdef LN + FLD 3 * SIZE(AA) + fmul %st, %st(3) + fmulp %st, %st(4) + + FLD 2 * SIZE(AA) + fmul %st(3), %st + FLD 2 * SIZE(AA) + fmul %st(5), %st + + fsubrp %st, %st(3) + fsubrp %st, %st(1) + + FLD 0 * SIZE(AA) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef LT + FLD 0 * SIZE(AA) + fmul %st, %st(1) + fmulp %st, %st(2) + + FLD 1 * SIZE(AA) + fmul %st(1), %st + FLD 1 * SIZE(AA) + fmul %st(3), %st + + fsubrp %st, %st(5) + fsubrp %st, %st(3) + + FLD 3 * SIZE(AA) + fmul %st, %st(3) + fmulp %st, %st(4) +#endif + +#ifdef RN + FLD 0 * SIZE(BB) + fmul %st, %st(1) + fmulp %st, %st(3) + + FLD 1 * SIZE(BB) + fmul %st(1), %st + FLD 1 * SIZE(BB) + fmul %st(4), %st + + fsubrp %st, %st(5) + fsubrp %st, %st(2) + + FLD 3 * SIZE(BB) + fmul %st, %st(2) + fmulp %st, %st(4) +#endif + +#ifdef RT + FLD 3 * SIZE(BB) + fmul %st, %st(2) + fmulp %st, %st(4) + + FLD 2 * SIZE(BB) + fmul %st(2), %st + FLD 2 * SIZE(BB) + fmul %st(5), %st + + fsubrp %st, %st(4) + fsubrp %st, %st(1) + + FLD 0 * SIZE(BB) + fmul %st, %st(1) + fmulp %st, %st(3) +#endif + +#ifdef LN + subl $2 * SIZE, %edi +#endif + +#if defined(LN) || defined(LT) + FSTU 0 * SIZE(BB) + fxch %st(1) + FSTU 1 * SIZE(BB) + fxch %st(2) + FSTU 2 * SIZE(BB) + fxch %st(3) + FSTU 3 * SIZE(BB) + + FST 1 * SIZE(%edi,%ebp) + FST 0 * SIZE(%edi) + FST 0 * SIZE(%edi,%ebp) + FST 1 * SIZE(%edi) +#else + FSTU 0 * SIZE(AA) + fxch %st(2) + FSTU 1 * SIZE(AA) + fxch %st(1) + FSTU 2 * SIZE(AA) + fxch %st(3) + FSTU 3 * SIZE(AA) + + FST 1 * SIZE(%edi,%ebp) + FST 1 * SIZE(%edi) + FST 0 * SIZE(%edi) + FST 0 * SIZE(%edi,%ebp) +#endif + +#ifndef LN + addl $2 * SIZE, %edi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %esi # i -- + jne .MainHead + ALIGN_4 + +.L12: + movl M, %eax # m # MEMORY + andl $1, %eax + je .L27 + +#ifdef LN + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + leal (, %eax, SIZE), %eax + movl AORIG, AA + leal (AA, %eax, 1), AA + leal (%ebx, %eax, 2), BB +#else + movl %ebx, BB +#endif + + fldz + fldz + + FLD 0 * SIZE(AA) # temp1 = *(aoffset + 0) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $1,%eax # k >> 1 # MEMORY + je .L54 + ALIGN_4 + +.L55: + FLD 0 * SIZE(BB) # temp2 = *(boffset + 0) + rep + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 1 * SIZE(BB) # temp2 = *(boffset + 0) + faddp %st, %st(2) + FLD 1 * SIZE(AA) # temp1 = *(aoffset + 0) + + FLD 2 * SIZE(BB) # temp2 = *(boffset + 0) + rep + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 3 * SIZE(BB) # temp2 = *(boffset + 0) + faddp %st, %st(2) + FLD 2 * SIZE(AA) # temp1 = *(aoffset + 0) + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jne .L55 + ALIGN_4 + +.L54: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $1,%eax # k & 1 + je .L33 + ALIGN_4 + + FLD 0 * SIZE(BB) # temp2 = *(boffset + 0) + rep + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 1 * SIZE(BB) # temp2 
= *(boffset + 0)
+ faddp %st, %st(2)
+ FLD 1 * SIZE(AA) # temp1 = *(aoffset + 0)
+
+ addl $1 * SIZE, AA
+ addl $2 * SIZE, BB
+ ALIGN_4
+
+.L33:
+ ffreep %st(0)
+
+#if defined(LN) || defined(RT)
+ movl KK, %eax
+#ifdef LN
+ subl $1, %eax
+#else
+ subl $2, %eax
+#endif
+
+ leal (, %eax, SIZE), %eax
+
+ movl AORIG, AA
+ leal (AA, %eax, 1), AA
+ leal (%ebx, %eax, 2), BB
+#endif
+
+#if defined(LN) || defined(LT)
+ FLD 0 * SIZE(BB)
+ fsubp %st, %st(1)
+ FLD 1 * SIZE(BB)
+ fsubp %st, %st(2)
+#else
+ FLD 0 * SIZE(AA)
+ fsubp %st, %st(1)
+ FLD 1 * SIZE(AA)
+ fsubp %st, %st(2)
+#endif
+
+#if defined(LN) || defined(LT)
+ FLD 0 * SIZE(AA)
+ fmul %st, %st(1)
+ fmulp %st, %st(2)
+#endif
+
+#ifdef RN
+ FLD 0 * SIZE(BB)
+ fmulp %st, %st(1)
+
+ FLD 1 * SIZE(BB)
+ fmul %st(1), %st
+
+ fsubrp %st, %st(2)
+
+ FLD 3 * SIZE(BB)
+ fmulp %st, %st(2)
+#endif
+
+#ifdef RT
+ FLD 3 * SIZE(BB)
+ fmulp %st, %st(2)
+
+ FLD 2 * SIZE(BB)
+ fmul %st(2), %st
+
+ fsubrp %st, %st(1)
+
+ FLD 0 * SIZE(BB)
+ fmulp %st, %st(1)
+#endif
+
+#ifdef LN
+ subl $1 * SIZE, %edi
+#endif
+
+#if defined(LN) || defined(LT)
+ FSTU 0 * SIZE(BB)
+ fxch %st(1)
+ FSTU 1 * SIZE(BB)
+#else
+ FSTU 0 * SIZE(AA)
+ fxch %st(1)
+ FSTU 1 * SIZE(AA)
+#endif
+
+ FST 0 * SIZE(%edi,%ebp)
+ FST 0 * SIZE(%edi)
+
+#ifndef LN
+ addl $1 * SIZE, %edi
+#endif
+
+#if defined(LT) || defined(RN)
+ movl K, %eax
+ subl KK, %eax
+ leal (,%eax, SIZE), %eax
+ leal (AA, %eax, 1), AA
+ leal (BB, %eax, 2), BB
+#endif
+
+#ifdef LN
+ subl $1, KK
+#endif
+
+#ifdef LT
+ addl $1, KK
+#endif
+
+#ifdef RT
+ movl K, %eax
+ sall $0 + BASE_SHIFT, %eax
+ addl %eax, AORIG
+#endif
+ ALIGN_4
+
+.L27:
+#ifdef LN
+ movl K, %eax
+ leal ( , %eax, SIZE), %eax
+ leal (%ebx, %eax, 2), %ebx
+#endif
+#if defined(LT) || defined(RN)
+ movl BB, %ebx
+#endif
+
+#ifdef RN
+ addl $2, KK
+#endif
+
+#ifdef RT
+ subl $2, KK
+#endif
+
+ decl J # j-- # MEMORY
+ jne .L34
+ ALIGN_4
+
+.End:
+ popl %ebx
+ popl %esi
+ popl %edi
+ popl %ebp
+ addl $ARGS, %esp
+ ret
+
+ EPILOGUE
diff --git a/kernel/x86/trsm_kernel_RT_2x2_atom.S b/kernel/x86/trsm_kernel_RT_2x2_atom.S
new file mode 100644
index 0000000000..97af198f9b
--- /dev/null
+++ b/kernel/x86/trsm_kernel_RT_2x2_atom.S
@@ -0,0 +1,1145 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED.
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#define A 24 + STACK + ARGS(%esp) +#define ARG_B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define ARG_LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) +#define AORIG 12 + STACK(%esp) + +#define PREFETCH prefetcht0 +#define PREFETCHSIZE 84 + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi +#define CO1 %esi + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + + movl OFFSET, %eax +#ifdef RN + negl %eax +#endif + movl %eax, KK + + leal (, LDC, SIZE), LDC + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + testl $1, N + je .L30 + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, B +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $1, %ebx + jle .L40 + ALIGN_4 + +.L31: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd 0 * SIZE(BB), %xmm1 + xorps %xmm0, %xmm0 + prefetcht0 3 * SIZE(CO1) + xorps %xmm2, %xmm2 + xorps %xmm4, %xmm4 + xorps %xmm6, %xmm6 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addsd %xmm0, %xmm4 + movsd 0 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm6 + movsd 1 * SIZE(AA), %xmm2 + mulsd %xmm1, %xmm0 + mulsd %xmm1, %xmm2 + movsd 1 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm6 + movsd 3 * SIZE(AA), %xmm2 
+ mulsd %xmm1, %xmm0 + mulsd %xmm1, %xmm2 + movsd 2 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 4 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm6 + movsd 5 * SIZE(AA), %xmm2 + mulsd %xmm1, %xmm0 + mulsd %xmm1, %xmm2 + movsd 3 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 6 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm6 + movsd 7 * SIZE(AA), %xmm2 + mulsd %xmm1, %xmm0 + mulsd %xmm1, %xmm2 + movsd 4 * SIZE(BB), %xmm1 + + addl $8 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax # if (k & 1) + BRANCH + je .L38 + ALIGN_3 + +.L36: + addsd %xmm0, %xmm4 + movsd 0 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm6 + movsd 1 * SIZE(AA), %xmm2 + mulsd %xmm1, %xmm0 + mulsd %xmm1, %xmm2 + movsd 1 * SIZE(BB), %xmm1 + + addl $2 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L36 + ALIGN_4 + +.L38: + addsd %xmm0, %xmm4 + addsd %xmm2, %xmm6 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BB), %xmm0 + movsd 1 * SIZE(BB), %xmm2 + + subsd %xmm4, %xmm0 + subsd %xmm6, %xmm2 +#else + movsd 0 * SIZE(AA), %xmm0 + movsd 1 * SIZE(AA), %xmm2 + + subsd %xmm4, %xmm0 + subsd %xmm6, %xmm2 +#endif + +#ifdef LN + movsd 3 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + movsd 2 * SIZE(AA), %xmm5 + mulsd %xmm2, %xmm5 + movsd 0 * SIZE(AA), %xmm7 + subsd %xmm5, %xmm0 + mulsd %xmm7, %xmm0 +#endif + +#ifdef LT + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + movsd 1 * SIZE(AA), %xmm5 + mulsd %xmm0, %xmm5 + movsd 3 * SIZE(AA), %xmm7 + subsd %xmm5, %xmm2 + mulsd %xmm7, %xmm2 +#endif + +#if defined(RN) || defined(RT) + movsd 0 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 + mulsd %xmm4, %xmm2 +#endif + + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BB) + movsd %xmm2, 1 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) + movsd %xmm2, 1 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm2, 1 * SIZE(CO1) + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + addl %eax, BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L31 + ALIGN_4 + +.L40: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L49 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm2, %xmm2 + movsd 0 * SIZE(BB), %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L45 + ALIGN_4 + +.L42: + mulsd %xmm0, %xmm2 + movsd 1 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + movsd 1 * SIZE(BB), %xmm2 + + mulsd %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm5 + movsd 2 * SIZE(BB), %xmm2 + + mulsd %xmm0, %xmm2 + movsd 3 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + movsd 3 * SIZE(BB), %xmm2 + + mulsd %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addsd 
%xmm2, %xmm5 + movsd 4 * SIZE(BB), %xmm2 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jne .L42 + ALIGN_4 + +.L45: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + mulsd %xmm0, %xmm2 + movsd 1 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + movsd 1 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L46 + ALIGN_4 + +.L48: + addsd %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (B, %eax, 1), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BB), %xmm0 + subsd %xmm4, %xmm0 +#else + movsd 0 * SIZE(AA), %xmm0 + subsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + mulsd 0 * SIZE(AA), %xmm0 +#endif + +#if defined(RN) || defined(RT) + mulsd 0 * SIZE(BB), %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + addl %eax, AA + addl %eax, BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L49: +#ifdef LN + movl K, %eax + leal (B, %eax, SIZE), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L30: + movl N, %eax + sarl $1, %eax + movl %eax, J + jle .L999 + ALIGN_2 + +.L10: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $1, %ebx + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + + xorps %xmm4, %xmm4 + prefetcht0 3 * SIZE(CO1) + xorps %xmm5, %xmm5 + prefetcht0 3 * SIZE(CO1, LDC) + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addsd %xmm2, %xmm6 + movsd 1 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 0 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm7 + mulsd 1 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 0 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + mulsd 1 * SIZE(BB), %xmm3 + + addsd %xmm2, %xmm6 + movsd 3 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm7 + mulsd 3 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 4 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 2 * SIZE(BB), %xmm2 + addsd 
%xmm1, %xmm5 + mulsd 3 * SIZE(BB), %xmm3 + + addsd %xmm2, %xmm6 + movsd 5 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 4 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm7 + mulsd 5 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 6 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 4 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + mulsd 5 * SIZE(BB), %xmm3 + + addsd %xmm2, %xmm6 + movsd 7 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm7 + mulsd 7 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 8 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 6 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + mulsd 7 * SIZE(BB), %xmm3 + + addl $8 * SIZE, BB + addl $8 * SIZE, AA + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + addsd %xmm2, %xmm6 + movsd 1 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 0 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm7 + mulsd 1 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 0 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + mulsd 1 * SIZE(BB), %xmm3 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + addsd %xmm2, %xmm6 + addsd %xmm3, %xmm7 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BB), %xmm0 + movsd 1 * SIZE(BB), %xmm1 + movsd 2 * SIZE(BB), %xmm2 + movsd 3 * SIZE(BB), %xmm3 + + subsd %xmm4, %xmm0 + subsd %xmm5, %xmm1 + subsd %xmm6, %xmm2 + subsd %xmm7, %xmm3 +#else + movsd 0 * SIZE(AA), %xmm0 + movsd 1 * SIZE(AA), %xmm2 + movsd 2 * SIZE(AA), %xmm1 + movsd 3 * SIZE(AA), %xmm3 + + subsd %xmm4, %xmm0 + subsd %xmm6, %xmm2 + subsd %xmm5, %xmm1 + subsd %xmm7, %xmm3 +#endif + +#ifdef LN + movsd 3 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + movsd 2 * SIZE(AA), %xmm5 + mulsd %xmm4, %xmm3 + movsd 0 * SIZE(AA), %xmm7 + + movaps %xmm5, %xmm6 + mulsd %xmm2, %xmm5 + mulsd %xmm3, %xmm6 + subsd %xmm5, %xmm0 + subsd %xmm6, %xmm1 + mulsd %xmm7, %xmm0 + mulsd %xmm7, %xmm1 +#endif + +#ifdef LT + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + movsd 1 * SIZE(AA), %xmm5 + mulsd %xmm4, %xmm1 + movsd 3 * SIZE(AA), %xmm7 + + movaps %xmm5, %xmm6 + mulsd %xmm0, %xmm5 + mulsd %xmm1, %xmm6 + subsd %xmm5, %xmm2 + subsd %xmm6, %xmm3 + mulsd %xmm7, %xmm2 + mulsd %xmm7, %xmm3 +#endif + +#ifdef RN + movsd 0 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 + movsd 1 * SIZE(BB), %xmm5 + mulsd %xmm4, %xmm2 + movsd 3 * SIZE(BB), %xmm7 + + movaps %xmm5, %xmm6 + mulsd %xmm0, %xmm5 + mulsd %xmm2, %xmm6 + subsd %xmm5, %xmm1 + subsd %xmm6, %xmm3 + mulsd %xmm7, %xmm1 + mulsd %xmm7, %xmm3 +#endif + +#ifdef RT + movsd 3 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 + movsd 2 * SIZE(BB), %xmm5 + mulsd %xmm4, %xmm3 + movsd 0 * SIZE(BB), %xmm7 + + movaps %xmm5, %xmm6 + mulsd %xmm1, %xmm5 + mulsd %xmm3, %xmm6 + subsd %xmm5, %xmm0 + subsd %xmm6, %xmm2 + mulsd %xmm7, %xmm0 + mulsd %xmm7, %xmm2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BB) + movsd %xmm1, 1 * SIZE(BB) + movsd %xmm2, 2 * SIZE(BB) + movsd %xmm3, 3 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) + movsd %xmm2, 1 * SIZE(AA) + movsd %xmm1, 2 * SIZE(AA) + movsd %xmm3, 3 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm2, 1 * SIZE(CO1) + movsd %xmm1, 0 * 
SIZE(CO1, LDC) + movsd %xmm3, 1 * SIZE(CO1, LDC) + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L11 + ALIGN_4 + +.L20: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L29 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L25 + ALIGN_4 + +.L22: + addsd %xmm2, %xmm4 + movsd 0 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm5 + movsd 1 * SIZE(BB), %xmm3 + + mulsd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulsd %xmm0, %xmm3 + movsd 1 * SIZE(AA), %xmm0 + + addsd %xmm2, %xmm4 + movsd 2 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm5 + movsd 3 * SIZE(BB), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + movsd 2 * SIZE(AA), %xmm0 + + addsd %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm5 + movsd 5 * SIZE(BB), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + movsd 3 * SIZE(AA), %xmm0 + + addsd %xmm2, %xmm4 + movsd 6 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm5 + movsd 7 * SIZE(BB), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + movsd 4 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax # if (k & 1) + BRANCH + je .L28 + ALIGN_3 + +.L26: + addsd %xmm2, %xmm4 + movsd 0 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm5 + movsd 1 * SIZE(BB), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + movsd 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: + addsd %xmm2, %xmm4 + addsd %xmm3, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BB), %xmm0 + movsd 1 * SIZE(BB), %xmm1 + + subsd %xmm4, %xmm0 + subsd %xmm5, %xmm1 +#else + movsd 0 * SIZE(AA), %xmm0 + movsd 1 * SIZE(AA), %xmm1 + + subsd %xmm4, %xmm0 + subsd %xmm5, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(AA), %xmm7 + + mulsd %xmm7, %xmm0 + mulsd %xmm7, %xmm1 +#endif + +#ifdef RN + movsd 0 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 + movsd 1 * SIZE(BB), %xmm5 + movaps %xmm5, %xmm6 + movsd 3 * SIZE(BB), %xmm7 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm1 + mulsd %xmm7, %xmm1 +#endif + +#ifdef RT + movsd 3 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 + movsd 2 * SIZE(BB), %xmm5 + movaps %xmm5, %xmm6 + movsd 0 * SIZE(BB), %xmm7 + mulsd %xmm1, %xmm5 + subsd %xmm5, %xmm0 + mulsd %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BB) + movsd %xmm1, 1 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) + movsd %xmm1, 1 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + + movsd %xmm0, 0 
* SIZE(CO1)
+ movsd %xmm1, 0 * SIZE(CO1, LDC)
+
+#ifndef LN
+ addl $1 * SIZE, CO1
+#endif
+
+#if defined(LT) || defined(RN)
+ movl K, %eax
+ subl KK, %eax
+ leal (,%eax, SIZE), %eax
+ leal (AA, %eax, 1), AA
+ leal (BB, %eax, 2), BB
+#endif
+
+#ifdef LN
+ subl $1, KK
+#endif
+
+#ifdef LT
+ addl $1, KK
+#endif
+
+#ifdef RT
+ movl K, %eax
+ sall $BASE_SHIFT, %eax
+ addl %eax, AORIG
+#endif
+ ALIGN_4
+
+.L29:
+#ifdef LN
+ movl K, %eax
+ leal (, %eax, SIZE), %eax
+ leal (B, %eax, 2), B
+#endif
+
+#if defined(LT) || defined(RN)
+ movl BB, B
+#endif
+
+#ifdef RN
+ addl $2, KK
+#endif
+
+#ifdef RT
+ subl $2, KK
+#endif
+
+ decl J # j --
+ jg .L10
+ ALIGN_4
+
+.L999:
+ popl %ebx
+ popl %esi
+ popl %edi
+ popl %ebp
+
+ addl $ARGS, %esp
+ ret
+
+ EPILOGUE
diff --git a/kernel/x86/trsm_kernel_RT_2x4_penryn.S b/kernel/x86/trsm_kernel_RT_2x4_penryn.S
new file mode 100644
index 0000000000..01876a5159
--- /dev/null
+++ b/kernel/x86/trsm_kernel_RT_2x4_penryn.S
@@ -0,0 +1,2075 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin.
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#define A 24 + STACK + ARGS(%esp) +#define ARG_B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define ARG_LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) +#define AORIG 12 + STACK(%esp) + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 21 + 4) +#endif + +#ifdef NEHALEM +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 21 + 4) +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 2) +#endif + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi +#define CO1 %esi + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + + movl OFFSET, %eax +#ifdef RN + negl %eax +#endif + movl %eax, KK + + leal (, LDC, SIZE), LDC + + subl $-16 * SIZE, A + subl $-16 * SIZE, B + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + testl $1, N + je .L30 + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, B +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L80 + ALIGN_4 + +.L71: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd -16 * SIZE(AA), %xmm0 + movhps -15 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movsd -16 * SIZE(BB), %xmm1 + movhps -15 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 +#ifdef LN + prefetcht0 -2 * SIZE(CO1) +#else + prefetcht0 1 * SIZE(CO1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm1, %xmm2 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + pshufd $0xee, %xmm1, %xmm2 + movaps -14 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + + pshufd $0x44, %xmm1, %xmm2 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + pshufd $0xee, %xmm1, %xmm2 + movaps -12 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) + + pshufd $0x44, %xmm1, %xmm2 + mulpd %xmm0, %xmm2 + movaps -6 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + pshufd $0xee, %xmm1, %xmm2 + movaps -10 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps -4 * 
SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + + pshufd $0x44, %xmm1, %xmm2 + mulpd %xmm0, %xmm2 + movaps -2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + pshufd $0xee, %xmm1, %xmm2 + movaps -8 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + + subl $-16 * SIZE, AA + subl $ -8 * SIZE, BB + + subl $1, %eax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L78 + ALIGN_3 + +.L76: + pshufd $0x44, %xmm1, %xmm2 + movsd -15 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + addl $2 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: + addpd %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), BB +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BB), %xmm1 + + subpd %xmm4, %xmm1 + + movapd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm1 +#else + movapd -16 * SIZE(AA), %xmm0 + + subpd %xmm4, %xmm0 +#endif + +#ifdef LN + movsd -13 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm1 + + movsd -14 * SIZE(AA), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + movsd -16 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + +#endif + +#ifdef LT + movsd -16 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + movsd -15 * SIZE(AA), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + + movsd -13 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm1 +#endif + +#ifdef RN + movddup -16 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#ifdef RT + movddup -16 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, -16 * SIZE(BB) + movsd %xmm1, -15 * SIZE(BB) +#else + movapd %xmm0, -16 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 1 * SIZE(CO1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + addl %eax, BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L71 + ALIGN_4 + +.L80: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L89 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd -16 * SIZE(AA), %xmm0 + movhps -15 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movsd -16 * SIZE(BB), %xmm2 + movhps -15 * SIZE(BB), %xmm2 + pxor %xmm5, %xmm5 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L85 + ALIGN_4 + +.L82: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + movaps -14 * SIZE(BB), %xmm2 + + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + movaps -12 * SIZE(BB), %xmm2 + + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + movaps -10 * SIZE(BB), %xmm2 + + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + addpd 
%xmm2, %xmm5 + movaps -8 * SIZE(BB), %xmm2 + + subl $-8 * SIZE, AA + subl $-8 * SIZE, BB + decl %eax + jne .L82 + ALIGN_4 + +.L85: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + BRANCH + je .L88 + +.L86: + mulsd %xmm0, %xmm2 + movsd -15 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + movsd -15 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L86 + ALIGN_4 + +.L88: + addpd %xmm5, %xmm4 + haddpd %xmm4, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (B, %eax, 1), BB +#endif + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(BB), %xmm0 + subsd %xmm4, %xmm0 +#else + movsd -16 * SIZE(AA), %xmm0 + subsd %xmm4, %xmm0 +#endif + +#ifdef LN + movsd -16 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#ifdef LT + movsd -16 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#ifdef RN + movsd -16 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#ifdef RT + movsd -16 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, -16 * SIZE(BB) +#else + movsd %xmm0, -16 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) +#else + movsd %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + addl %eax, AA + addl %eax, BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L89: +#ifdef LN + movl K, %eax + leal (B, %eax, SIZE), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L30: + testl $2, N + je .L60 + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L50 + ALIGN_4 + +.L41: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 +#ifdef LN + prefetcht0 -2 * SIZE(CO1) + pxor %xmm6, %xmm6 + prefetcht0 -2 * SIZE(CO1, LDC) + pxor %xmm7, %xmm7 +#else + prefetcht0 1 * SIZE(CO1) + pxor %xmm6, %xmm6 + prefetcht0 1 * SIZE(CO1, LDC) + pxor %xmm7, %xmm7 +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L45 + ALIGN_4 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -14 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, 
%xmm2 + movaps -12 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm7 + movaps -12 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -10 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm7 + movaps -8 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -6 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -6 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm7 + movaps -4 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -2 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -2 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm7 + movaps 0 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + + subl $-16 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L42 + ALIGN_4 + +.L45: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -14 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L46 + ALIGN_4 + +.L48: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + movaps %xmm4, %xmm0 + movsd %xmm5, %xmm4 + movsd %xmm0, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd -16 * SIZE(BB), %xmm2 + movapd -14 * SIZE(BB), %xmm3 + + subpd %xmm4, %xmm2 + subpd %xmm0, %xmm3 +#else + movapd -16 * SIZE(AA), %xmm0 + movapd -14 * SIZE(AA), %xmm1 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 +#endif + +#ifdef LN + movddup -13 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 + + movddup -14 * SIZE(AA), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm2 + + movddup -16 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + +#endif + +#ifdef LT + movddup -16 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + + movddup -15 * SIZE(AA), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm3 + + movddup -13 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 +#endif + +#ifdef RN + movddup -16 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 + + movddup -15 * SIZE(BB), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm1 + + movddup -13 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm1 +#endif + +#ifdef RT + movddup -13 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm1 + + movddup -14 * SIZE(BB), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm0 + + movddup -16 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, -16 * SIZE(BB) + movapd %xmm3, -14 * SIZE(BB) +#else + movapd %xmm0, -16 * SIZE(AA) + movapd %xmm1, -14 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movsd %xmm3, 1 * SIZE(CO1) + movhps %xmm2, 0 * SIZE(CO1, 
LDC, 1) + movhps %xmm3, 1 * SIZE(CO1, LDC, 1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 1 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L41 + ALIGN_4 + +.L50: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L59 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -16 * SIZE(BB), %xmm2 + pxor %xmm5, %xmm5 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -14 * SIZE(BB), %xmm2 + + pshufd $0xee, %xmm0, %xmm1 + movaps -14 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm5 + movaps -12 * SIZE(BB), %xmm2 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -10 * SIZE(BB), %xmm2 + + pshufd $0xee, %xmm0, %xmm1 + movaps -12 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm5 + movaps -8 * SIZE(BB), %xmm2 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -6 * SIZE(BB), %xmm2 + + pshufd $0xee, %xmm0, %xmm1 + movaps -10 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm5 + movaps -4 * SIZE(BB), %xmm2 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -2 * SIZE(BB), %xmm2 + + pshufd $0xee, %xmm0, %xmm1 + movaps -8 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + + subl $ -8 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + BRANCH + je .L58 + +.L56: + pshufd $0x44, %xmm0, %xmm1 + movsd -15 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -14 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: + addpd %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (B, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BB), %xmm0 + + subpd %xmm4, %xmm0 +#else + movapd -16 * SIZE(AA), %xmm1 + + subpd %xmm4, %xmm1 + + movapd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm1 +#endif + +#ifdef LN + movddup -16 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#ifdef LT + movddup -16 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#ifdef RN + movsd -16 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 + + movsd -15 * SIZE(BB), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + + movsd -13 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 +#endif + +#ifdef RT + movsd -13 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 + + movsd -14 * SIZE(BB), %xmm4 + mulsd %xmm1, %xmm4 + 
subsd %xmm4, %xmm0 + + movsd -16 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, -16 * SIZE(BB) +#else + movsd %xmm0, -16 * SIZE(AA) + movsd %xmm1, -15 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 0 * SIZE(CO1, LDC, 1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L59: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + ALIGN_4 + +.L60: + movl N, %eax + sarl $2, %eax + movl %eax, J + jle .L999 + ALIGN_4 + +.L10: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 4), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + leal (CO1, LDC, 2), %eax + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -16 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + +#ifdef LN + pxor %xmm4, %xmm4 + prefetcht0 -2 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 -2 * SIZE(CO1, LDC) + pxor %xmm6, %xmm6 + prefetcht0 -2 * SIZE(%eax) + pxor %xmm7, %xmm7 + prefetcht0 -2 * SIZE(%eax, LDC) +#else + pxor %xmm4, %xmm4 + prefetcht0 1 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 1 * SIZE(CO1, LDC) + pxor %xmm6, %xmm6 + prefetcht0 1 * SIZE(%eax) + pxor %xmm7, %xmm7 + prefetcht0 1 * SIZE(%eax, LDC) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addpd %xmm3, %xmm7 + movaps -14 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps -12 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps -10 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps -8 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps -6 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps -4 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + 
pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps -2 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps 0 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) + + addpd %xmm3, %xmm7 + movaps 2 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps 4 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -6 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps 6 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps 8 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps 10 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps 12 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -2 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps 14 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps 16 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + subl $-32 * SIZE, BB + mulpd %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + + subl $-16 * SIZE, AA + + subl $1, %eax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + addpd %xmm3, %xmm7 + movaps -14 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps -12 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + + movaps -14 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + + decl %eax + jg .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 4), BB +#endif + + addpd %xmm2, %xmm6 + addpd %xmm3, %xmm7 + + movaps %xmm4, %xmm0 + movsd %xmm5, %xmm4 + movsd %xmm0, %xmm5 + + movaps %xmm6, %xmm0 + movsd %xmm7, %xmm6 + movsd %xmm0, %xmm7 + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd %xmm6, %xmm1 + unpcklpd %xmm7, %xmm6 + unpckhpd %xmm7, %xmm1 + + movapd -16 * SIZE(BB), %xmm2 + movapd -14 * SIZE(BB), %xmm5 + movapd -12 * SIZE(BB), %xmm3 + movapd -10 * SIZE(BB), %xmm7 + + subpd %xmm4, %xmm2 + subpd %xmm6, %xmm5 + subpd %xmm0, %xmm3 + subpd %xmm1, %xmm7 +#else + movapd -16 * SIZE(AA), %xmm0 + movapd -14 * SIZE(AA), %xmm1 + movapd -12 * SIZE(AA), %xmm2 + movapd -10 * SIZE(AA), %xmm3 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 + subpd %xmm6, %xmm2 + subpd %xmm7, %xmm3 +#endif + +#ifdef LN + movddup -13 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 + mulpd %xmm4, %xmm7 + + movddup -14 * SIZE(AA), %xmm4 + movapd %xmm4, %xmm6 + 
mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm2 + mulpd %xmm7, %xmm6 + subpd %xmm6, %xmm5 + + movddup -16 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm5 + +#endif + +#ifdef LT + movddup -16 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm5 + + movddup -15 * SIZE(AA), %xmm4 + movapd %xmm4, %xmm6 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm3 + mulpd %xmm5, %xmm6 + subpd %xmm6, %xmm7 + + movddup -13 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 + mulpd %xmm4, %xmm7 +#endif + +#ifdef RN + movddup -16 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 + movddup -15 * SIZE(BB), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm1 + movddup -14 * SIZE(BB), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm2 + movddup -13 * SIZE(BB), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm3 + + movddup -11 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm1 + movddup -10 * SIZE(BB), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm2 + movddup -9 * SIZE(BB), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm3 + + movddup -6 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm2 + movddup -5 * SIZE(BB), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm3 + + movddup -1 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm3 +#endif + +#ifdef RT + movddup -1 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm3 + movddup -2 * SIZE(BB), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm2 + movddup -3 * SIZE(BB), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm1 + movddup -4 * SIZE(BB), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm0 + + movddup -6 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm2 + movddup -7 * SIZE(BB), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm1 + movddup -8 * SIZE(BB), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm0 + + movddup -11 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm1 + movddup -12 * SIZE(BB), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm0 + + movddup -16 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, -16 * SIZE(BB) + movapd %xmm5, -14 * SIZE(BB) + movapd %xmm3, -12 * SIZE(BB) + movapd %xmm7, -10 * SIZE(BB) +#else + movapd %xmm0, -16 * SIZE(AA) + movapd %xmm1, -14 * SIZE(AA) + movapd %xmm2, -12 * SIZE(AA) + movapd %xmm3, -10 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movsd %xmm3, 1 * SIZE(CO1) + movhps %xmm2, 0 * SIZE(CO1, LDC, 1) + movhps %xmm3, 1 * SIZE(CO1, LDC, 1) + movsd %xmm5, 0 * SIZE(CO1, LDC, 2) + movsd %xmm7, 1 * SIZE(CO1, LDC, 2) + movhps %xmm5, 0 * SIZE(CO1, %eax, 1) + movhps %xmm7, 1 * SIZE(CO1, %eax, 1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 1 * SIZE(CO1, LDC, 1) + movsd %xmm2, 0 * SIZE(CO1, LDC, 2) + movhps %xmm2, 1 * SIZE(CO1, LDC, 2) + movsd %xmm3, 0 * SIZE(CO1, %eax, 1) + movhps %xmm3, 1 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L11 + ALIGN_4 + +.L20: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L29 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, 
%eax + addl %eax, BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -16 * SIZE(BB), %xmm2 + pxor %xmm5, %xmm5 + movaps -14 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps -12 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps -10 * SIZE(BB), %xmm3 + + pshufd $0xee, %xmm0, %xmm1 + movaps -14 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm6 + movaps -8 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm7 + movaps -6 * SIZE(BB), %xmm3 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps -4 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps -2 * SIZE(BB), %xmm3 + + pshufd $0xee, %xmm0, %xmm1 + movaps -12 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm6 + movaps 0 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm7 + movaps 2 * SIZE(BB), %xmm3 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps 6 * SIZE(BB), %xmm3 + + pshufd $0xee, %xmm0, %xmm1 + movaps -10 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm6 + movaps 8 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm7 + movaps 10 * SIZE(BB), %xmm3 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps 12 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps 14 * SIZE(BB), %xmm3 + + pshufd $0xee, %xmm0, %xmm1 + movaps -8 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm7 + movaps 18 * SIZE(BB), %xmm3 + + subl $ -8 * SIZE, AA + subl $-32 * SIZE, BB + + subl $1, %eax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + BRANCH + je .L28 + +.L26: + pshufd $0x44, %xmm0, %xmm1 + movsd -15 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps -12 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps -10 * SIZE(BB), %xmm3 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + + decl %eax + jg .L26 + ALIGN_4 + +.L28: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BB), %xmm0 + movapd -14 * SIZE(BB), %xmm1 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 +#else + movapd -16 * SIZE(AA), %xmm1 + movapd -14 * SIZE(AA), %xmm3 + + subpd %xmm4, %xmm1 + subpd %xmm5, %xmm3 + + movapd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm1 + movapd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm3 +#endif + +#ifdef LN + movddup -16 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 +#endif + +#ifdef LT + movddup -16 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 +#endif + +#ifdef RN + movsd -16 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 + movsd -15 * SIZE(BB), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + movsd -14 * SIZE(BB), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm2 + movsd -13 * SIZE(BB), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm3 + + movsd -11 * 
SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 + movsd -10 * SIZE(BB), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm2 + movsd -9 * SIZE(BB), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm3 + + movsd -6 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm2 + movsd -5 * SIZE(BB), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm3 + + movsd -1 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm3 +#endif + +#ifdef RT + movsd -1 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm3 + movsd -2 * SIZE(BB), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm2 + movsd -3 * SIZE(BB), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm1 + movsd -4 * SIZE(BB), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm0 + + movsd -6 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm2 + movsd -7 * SIZE(BB), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm1 + movsd -8 * SIZE(BB), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm0 + + movsd -11 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 + movsd -12 * SIZE(BB), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + movsd -16 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm1, -14 * SIZE(BB) +#else + movsd %xmm0, -16 * SIZE(AA) + movsd %xmm1, -15 * SIZE(AA) + movsd %xmm2, -14 * SIZE(AA) + movsd %xmm3, -13 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 0 * SIZE(CO1, LDC, 1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 2) + movhps %xmm1, 0 * SIZE(CO1, %eax, 1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) + movsd %xmm2, 0 * SIZE(CO1, LDC, 2) + movsd %xmm3, 0 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L29: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 4), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $4, KK +#endif + +#ifdef RT + subl $4, KK +#endif + + decl J # j -- + jg .L10 + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_RT_2x4_sse2.S b/kernel/x86/trsm_kernel_RT_2x4_sse2.S new file mode 100644 index 0000000000..6c2682a103 --- /dev/null +++ b/kernel/x86/trsm_kernel_RT_2x4_sse2.S @@ -0,0 +1,2586 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define OLD_M 4 + STACK + ARGS(%esi) +#define OLD_N 8 + STACK + ARGS(%esi) +#define OLD_K 12 + STACK + ARGS(%esi) +#define OLD_ALPHA 16 + STACK + ARGS(%esi) +#define OLD_A 24 + STACK + ARGS(%esi) +#define OLD_B 28 + STACK + ARGS(%esi) +#define OLD_C 32 + STACK + ARGS(%esi) +#define OLD_LDC 36 + STACK + ARGS(%esi) +#define OLD_OFFT 40 + STACK + ARGS(%esi) + +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define AORIG 56(%esp) +#define BORIG 60(%esp) +#define BUFFER 128(%esp) + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#if defined(OPTERON) || defined(BARCELONA) +#define PREFETCH prefetch +#define PREFETCHSIZE (8 * 10 + 4) +#endif + +#define B %edi +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define CO1 %esi + +#define KERNEL1(address) \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm4; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + movapd 2 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 6 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 16 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 10 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 12 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + mulpd 14 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm3, %xmm6; \ + movapd 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm0, %xmm7; \ + movapd 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm4; \ + movapd 18 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 20 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 22 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + 
movapd 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL4(address) \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 26 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 28 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + mulpd 30 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm3, %xmm6; \ + movapd 40 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm0, %xmm7; \ + movapd 16 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL5(address) \ + PREFETCH (PREFETCHSIZE + 8) * SIZE + (address) * 1 * SIZE(AA); \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm4; \ + movapd 34 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + mulpd 38 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm2, %xmm6; \ + movapd 48 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm1, %xmm7; \ + movapd 10 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL6(address) \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 42 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 44 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + mulpd 46 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 12 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL7(address) \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm4; \ + movapd 50 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 52 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + mulpd 54 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm2, %xmm6; \ + movapd 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm1, %xmm7; \ + movapd 14 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 58 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 60 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + mulpd 62 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 72 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movl OLD_M, %ebx + movl OLD_N, %eax + movl OLD_K, %ecx + movl OLD_A, %edx + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK + movd OLD_OFFT, %mm4 + + movl OLD_B, B + movl OLD_C, %ebx + + movl %ebx, C + movl OLD_LDC, LDC + + movd %mm4, OFFSET + movd %mm4, KK + + leal (, LDC, SIZE), LDC + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + testl $1, N + je .L30 + +#ifdef LN + movl 
OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, BB + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + leal (, %eax, SIZE), %eax + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + jle .L65 + ALIGN_4 + +.L62: +#define COPYPREFETCH 40 + + prefetchnta (COPYPREFETCH) * SIZE(B) + + movq 0 * SIZE(B), %mm0 + movq 1 * SIZE(B), %mm1 + movq 2 * SIZE(B), %mm2 + movq 3 * SIZE(B), %mm3 + movq 4 * SIZE(B), %mm4 + movq 5 * SIZE(B), %mm5 + movq 6 * SIZE(B), %mm6 + movq 7 * SIZE(B), %mm7 + + movq %mm0, 0 * SIZE(BB) + movq %mm0, 1 * SIZE(BB) + movq %mm1, 2 * SIZE(BB) + movq %mm1, 3 * SIZE(BB) + movq %mm2, 4 * SIZE(BB) + movq %mm2, 5 * SIZE(BB) + movq %mm3, 6 * SIZE(BB) + movq %mm3, 7 * SIZE(BB) + + movq %mm4, 8 * SIZE(BB) + movq %mm4, 9 * SIZE(BB) + movq %mm5, 10 * SIZE(BB) + movq %mm5, 11 * SIZE(BB) + movq %mm6, 12 * SIZE(BB) + movq %mm6, 13 * SIZE(BB) + movq %mm7, 14 * SIZE(BB) + movq %mm7, 15 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $16 * SIZE, BB + decl %eax + jne .L62 + ALIGN_2 + +.L65: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + BRANCH + jle .L70 + ALIGN_2 + +.L66: + movq 0 * SIZE(B), %mm0 + + movq %mm0, 0 * SIZE(BB) + movq %mm0, 1 * SIZE(BB) + + addl $1 * SIZE, B + addl $2 * SIZE, BB + decl %eax + jne .L66 + ALIGN_4 + +.L70: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L80 + ALIGN_4 + +.L71: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movapd 0 * SIZE(AA), %xmm0 + movapd 8 * SIZE(AA), %xmm1 + movapd 0 * SIZE(BB), %xmm2 + movapd 8 * SIZE(BB), %xmm3 + +#ifdef LN + prefetchw -2 * SIZE(CO1) +#else + prefetchw 1 * SIZE(CO1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) + movapd 16 * SIZE(BB), %xmm2 + + movapd 2 * SIZE(AA), %xmm0 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm4 + movapd 4 * SIZE(AA), %xmm0 + mulpd 4 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm4 + movapd 6 * SIZE(AA), %xmm0 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm4 + + movapd 16 * SIZE(AA), %xmm0 + prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movapd 24 * SIZE(BB), %xmm3 + + movapd 10 * SIZE(AA), %xmm1 + mulpd 10 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm4 + movapd 12 * SIZE(AA), %xmm1 + mulpd 12 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm4 + movapd 14 * SIZE(AA), %xmm1 + mulpd 14 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm4 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + 
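+	# tail of the 2x1 block: the main loop above is unrolled 8x, so run the remaining (k & 7) iterations one at a time in .L76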
andl $7, %eax # if (k & 1) + BRANCH + je .L78 + ALIGN_3 + +.L76: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movapd 2 * SIZE(AA), %xmm0 + movapd 2 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm2 + + subpd %xmm4, %xmm2 +#else + movapd 0 * SIZE(AA), %xmm0 + + subpd %xmm4, %xmm0 +#endif + +#ifdef LN + movapd %xmm2, %xmm3 + unpckhpd %xmm3, %xmm3 + + movlpd 3 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm3 + + movlpd 2 * SIZE(AA), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm2 + + movlpd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm3, %xmm2 +#endif + +#ifdef LT + movapd %xmm2, %xmm3 + unpckhpd %xmm3, %xmm3 + + movlpd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + + movlpd 1 * SIZE(AA), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm3 + + movlpd 3 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm3 + + unpcklpd %xmm3, %xmm2 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#ifdef RT + movlpd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(B) + + movlpd %xmm2, 0 * SIZE(BB) + movlpd %xmm2, 1 * SIZE(BB) + movhpd %xmm2, 2 * SIZE(BB) + movhpd %xmm2, 3 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 1 * SIZE(CO1) +#else + movlpd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L71 + ALIGN_4 + +.L80: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L99 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movlpd 0 * SIZE(AA), %xmm0 + movlpd 4 * SIZE(AA), %xmm1 + movlpd 0 * SIZE(BB), %xmm2 + movlpd 8 * SIZE(BB), %xmm3 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L85 + ALIGN_4 + +.L82: + mulsd %xmm0, %xmm2 + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) + movlpd 1 * SIZE(AA), %xmm0 + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movlpd 16 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movlpd 2 * SIZE(AA), %xmm0 + mulsd 4 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm6 + movlpd 3 * SIZE(AA), %xmm0 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm7 + movlpd 8 * SIZE(AA), %xmm0 + mulsd %xmm1, %xmm3 + movlpd 5 * SIZE(AA), %xmm1 + mulsd 10 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm4 + movlpd 24 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm5 + movlpd 6 * SIZE(AA), %xmm1 + 
mulsd 12 * SIZE(BB), %xmm1 + addsd %xmm1, %xmm6 + movlpd 7 * SIZE(AA), %xmm1 + mulsd 14 * SIZE(BB), %xmm1 + addsd %xmm1, %xmm7 + movlpd 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L82 + ALIGN_4 + +.L85: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L88 + +.L86: + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm4 + movlpd 2 * SIZE(BB), %xmm2 + movlpd 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L86 + ALIGN_4 + +.L88: + addsd %xmm5, %xmm4 + addsd %xmm7, %xmm6 + addsd %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + addl %eax, AA + addl %eax, B + leal (BB, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movlpd 0 * SIZE(B), %xmm2 + subsd %xmm4, %xmm2 +#else + movlpd 0 * SIZE(AA), %xmm0 + subsd %xmm4, %xmm0 +#endif + +#ifdef LN + movlpd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 +#endif + +#ifdef LT + movlpd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#ifdef RT + movlpd 0 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm2, 0 * SIZE(B) + + movlpd %xmm2, 0 * SIZE(BB) + movlpd %xmm2, 1 * SIZE(BB) +#else + movlpd %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm2, 0 * SIZE(CO1) +#else + movlpd %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (AA,%eax, SIZE), AA +#ifdef LT + addl $1 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L99: +#ifdef LN + movl K, %eax + leal (B, %eax, SIZE), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (B,%eax, SIZE), B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + + +.L30: + testl $2, N + je .L60 + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, BB + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + jle .L35 + ALIGN_4 + +.L32: +#define COPYPREFETCH 40 + + prefetchnta (COPYPREFETCH) * SIZE(B) + + movq 0 * SIZE(B), %mm0 + movq 1 * SIZE(B), %mm1 + movq 2 * SIZE(B), %mm2 + movq 3 * SIZE(B), %mm3 + movq 4 * SIZE(B), %mm4 + movq 5 * SIZE(B), %mm5 + movq 6 * SIZE(B), %mm6 + movq 7 * SIZE(B), %mm7 + + movq %mm0, 0 * SIZE(BB) + movq %mm0, 1 * SIZE(BB) + movq %mm1, 2 * SIZE(BB) + movq %mm1, 3 * SIZE(BB) + movq %mm2, 4 * SIZE(BB) + movq %mm2, 5 * SIZE(BB) + movq %mm3, 6 * SIZE(BB) + movq %mm3, 7 * SIZE(BB) + + movq %mm4, 8 * SIZE(BB) + movq %mm4, 9 * SIZE(BB) + movq %mm5, 10 * SIZE(BB) + movq %mm5, 11 * SIZE(BB) + movq %mm6, 12 * SIZE(BB) + movq %mm6, 13 * SIZE(BB) + movq %mm7, 14 * SIZE(BB) + movq %mm7, 15 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $16 * 
SIZE, BB + decl %eax + jne .L32 + ALIGN_2 + +.L35: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax + BRANCH + jle .L40 + ALIGN_2 + +.L36: + movq 0 * SIZE(B), %mm0 + movq 1 * SIZE(B), %mm1 + + movq %mm0, 0 * SIZE(BB) + movq %mm0, 1 * SIZE(BB) + movq %mm1, 2 * SIZE(BB) + movq %mm1, 3 * SIZE(BB) + + addl $2 * SIZE, B + addl $4 * SIZE, BB + decl %eax + jne .L36 + ALIGN_4 + +.L40: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L50 + ALIGN_4 + +.L41: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movapd 0 * SIZE(AA), %xmm0 + movapd 8 * SIZE(AA), %xmm1 + movapd 0 * SIZE(BB), %xmm2 + movapd 8 * SIZE(BB), %xmm3 + +#ifdef LN + prefetchw -2 * SIZE(CO1) + prefetchw -2 * SIZE(CO1, LDC) +#else + prefetchw 1 * SIZE(CO1) + prefetchw 1 * SIZE(CO1, LDC) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L45 + ALIGN_4 + +.L42: + mulpd %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + + mulpd %xmm0, %xmm2 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movapd 4 * SIZE(AA), %xmm0 + + mulpd %xmm0, %xmm3 + mulpd 10 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd 12 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movapd 6 * SIZE(AA), %xmm0 + + mulpd %xmm0, %xmm3 + mulpd 14 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm7 + movapd 16 * SIZE(AA), %xmm0 + +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) +#endif + mulpd %xmm1, %xmm2 + mulpd 18 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + movapd 20 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movapd 10 * SIZE(AA), %xmm1 + + mulpd %xmm1, %xmm2 + mulpd 22 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + movapd 32 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm7 + movapd 12 * SIZE(AA), %xmm1 + + mulpd %xmm1, %xmm3 + mulpd 26 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm4 + movapd 28 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movapd 14 * SIZE(AA), %xmm1 + + mulpd %xmm1, %xmm3 + mulpd 30 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm6 + movapd 40 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm7 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L42 + ALIGN_4 + +.L45: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L46 + ALIGN_4 + +.L48: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, 
%eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd 0 * SIZE(B), %xmm2 + movapd 2 * SIZE(B), %xmm3 + + subpd %xmm4, %xmm2 + subpd %xmm0, %xmm3 +#else + movapd 0 * SIZE(AA), %xmm0 + movapd 2 * SIZE(AA), %xmm1 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 +#endif + +#ifdef LN + movlpd 3 * SIZE(AA), %xmm4 + movhpd 3 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 + + movlpd 2 * SIZE(AA), %xmm4 + movhpd 2 * SIZE(AA), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm2 + + movlpd 0 * SIZE(AA), %xmm4 + movhpd 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + +#endif + +#ifdef LT + movlpd 0 * SIZE(AA), %xmm4 + movhpd 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + + movlpd 1 * SIZE(AA), %xmm4 + movhpd 1 * SIZE(AA), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm3 + + movlpd 3 * SIZE(AA), %xmm4 + movhpd 3 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + movlpd 1 * SIZE(B), %xmm4 + movhpd 1 * SIZE(B), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm1 + + movlpd 3 * SIZE(B), %xmm4 + movhpd 3 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm1 +#endif + +#ifdef RT + movlpd 3 * SIZE(B), %xmm4 + movhpd 3 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm1 + movlpd 2 * SIZE(B), %xmm4 + movhpd 2 * SIZE(B), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm0 + + movlpd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(B) + movapd %xmm3, 2 * SIZE(B) + + movlpd %xmm2, 0 * SIZE(BB) + movlpd %xmm2, 1 * SIZE(BB) + movhpd %xmm2, 2 * SIZE(BB) + movhpd %xmm2, 3 * SIZE(BB) + movlpd %xmm3, 4 * SIZE(BB) + movlpd %xmm3, 5 * SIZE(BB) + movhpd %xmm3, 6 * SIZE(BB) + movhpd %xmm3, 7 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) + movapd %xmm1, 2 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm2, 0 * SIZE(CO1) + movlpd %xmm3, 1 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm3, 1 * SIZE(CO1, LDC, 1) +#else + movlpd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movlpd %xmm1, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm1, 1 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L41 + ALIGN_4 + +.L50: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L59 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movlpd 0 * SIZE(AA), %xmm0 + movlpd 4 * SIZE(AA), %xmm1 + movlpd 0 * SIZE(BB), %xmm2 + movlpd 8 * SIZE(BB), %xmm3 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + 
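+	# .L52: inner loop for the 1x2 tile, unrolled 8x; each step multiplies one scalar A element by a pair of duplicated B values and accumulates the two dot products in xmm4-xmm7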
mulsd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movlpd 4 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movlpd 1 * SIZE(AA), %xmm0 + + mulsd %xmm0, %xmm2 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm6 + movlpd 16 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm7 + movlpd 2 * SIZE(AA), %xmm0 + + mulsd %xmm0, %xmm3 + mulsd 10 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm4 + movlpd 12 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm5 + movlpd 3 * SIZE(AA), %xmm0 + + mulsd %xmm0, %xmm3 + mulsd 14 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm6 + movlpd 24 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm7 + movlpd 8 * SIZE(AA), %xmm0 + + mulsd %xmm1, %xmm2 + mulsd 18 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm4 + movlpd 20 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + movlpd 5 * SIZE(AA), %xmm1 + + mulsd %xmm1, %xmm2 + mulsd 22 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm6 + movlpd 32 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm7 + movlpd 6 * SIZE(AA), %xmm1 + + mulsd %xmm1, %xmm3 + mulsd 26 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm4 + movlpd 28 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm5 + movlpd 7 * SIZE(AA), %xmm1 + + mulsd %xmm1, %xmm3 + mulsd 30 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm6 + movlpd 40 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm7 + movlpd 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L58 + +.L56: + mulsd %xmm0, %xmm2 + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movlpd 4 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movlpd 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: + addsd %xmm6, %xmm4 + addsd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + unpcklpd %xmm5, %xmm4 + + movapd 0 * SIZE(B), %xmm2 + + subpd %xmm4, %xmm2 +#else + movlpd 0 * SIZE(AA), %xmm0 + movlpd 1 * SIZE(AA), %xmm1 + + subsd %xmm4, %xmm0 + subsd %xmm5, %xmm1 +#endif + +#ifdef LN + movlpd 0 * SIZE(AA), %xmm4 + movhpd 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 +#endif + +#ifdef LT + movlpd 0 * SIZE(AA), %xmm4 + movhpd 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm0 + movlpd 1 * SIZE(B), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + + movlpd 3 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm1 +#endif + +#ifdef RT + movlpd 3 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm1 + movlpd 2 * SIZE(B), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + movlpd 0 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(B) + + movlpd %xmm2, 0 * SIZE(BB) + movlpd %xmm2, 1 * SIZE(BB) + movhpd %xmm2, 2 * SIZE(BB) + movhpd %xmm2, 3 * SIZE(BB) +#else + movlpd %xmm0, 0 * SIZE(AA) + movlpd %xmm1, 1 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) +#else + movlpd %xmm0, 0 * SIZE(CO1) + movlpd %xmm1, 0 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (AA,%eax, SIZE), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B 
+#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L59: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + ALIGN_4 + +.L60: + movl N, %eax + sarl $2, %eax + movl %eax, J + jle .L999 + ALIGN_2 + +.L01: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, BB + +#ifdef RT + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + leal (, %eax, SIZE), %eax + leal (B, %eax, 4), B + leal (BB, %eax, 8), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $1, %eax + jle .L05 + ALIGN_4 + +.L02: +#define COPYPREFETCH 40 + + prefetchnta (COPYPREFETCH) * SIZE(B) + + movq 0 * SIZE(B), %mm0 + movq 1 * SIZE(B), %mm1 + movq 2 * SIZE(B), %mm2 + movq 3 * SIZE(B), %mm3 + movq 4 * SIZE(B), %mm4 + movq 5 * SIZE(B), %mm5 + movq 6 * SIZE(B), %mm6 + movq 7 * SIZE(B), %mm7 + + movq %mm0, 0 * SIZE(BB) + movq %mm0, 1 * SIZE(BB) + movq %mm1, 2 * SIZE(BB) + movq %mm1, 3 * SIZE(BB) + movq %mm2, 4 * SIZE(BB) + movq %mm2, 5 * SIZE(BB) + movq %mm3, 6 * SIZE(BB) + movq %mm3, 7 * SIZE(BB) + + movq %mm4, 8 * SIZE(BB) + movq %mm4, 9 * SIZE(BB) + movq %mm5, 10 * SIZE(BB) + movq %mm5, 11 * SIZE(BB) + movq %mm6, 12 * SIZE(BB) + movq %mm6, 13 * SIZE(BB) + movq %mm7, 14 * SIZE(BB) + movq %mm7, 15 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $16 * SIZE, BB + decl %eax + jne .L02 + ALIGN_2 + +.L05: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $1, %eax + BRANCH + jle .L10 + + movq 0 * SIZE(B), %mm0 + movq 1 * SIZE(B), %mm1 + movq 2 * SIZE(B), %mm2 + movq 3 * SIZE(B), %mm3 + + movq %mm0, 0 * SIZE(BB) + movq %mm0, 1 * SIZE(BB) + movq %mm1, 2 * SIZE(BB) + movq %mm1, 3 * SIZE(BB) + movq %mm2, 4 * SIZE(BB) + movq %mm2, 5 * SIZE(BB) + movq %mm3, 6 * SIZE(BB) + movq %mm3, 7 * SIZE(BB) + + addl $4 * SIZE, B + ALIGN_4 + +.L10: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + + leal (, LDC, 4), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $3 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movapd 0 * SIZE(AA), %xmm0 + movapd 8 * SIZE(AA), %xmm1 + movapd 0 * SIZE(BB), %xmm2 + movapd 8 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + +#ifdef LN + prefetchw -2 * SIZE(CO1) + prefetchw -2 * SIZE(CO1, LDC) + prefetchw -2 * SIZE(CO1, LDC, 2) + prefetchw -2 * SIZE(CO1, %eax) +#else + prefetchw 1 * SIZE(CO1) + prefetchw 1 * SIZE(CO1, LDC) + prefetchw 1 * SIZE(CO1, LDC, 2) + prefetchw 1 * SIZE(CO1, %eax) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + +#if 1 + andl $-8, 
%eax + sall $4, %eax + je .L15 +.L1X: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + cmpl $128 * 1, %eax + jle .L12 + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + cmpl $128 * 2, %eax + jle .L12 + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + cmpl $128 * 3, %eax + jle .L12 + KERNEL1(16 * 3) + KERNEL2(16 * 3) + KERNEL3(16 * 3) + KERNEL4(16 * 3) + KERNEL5(16 * 3) + KERNEL6(16 * 3) + KERNEL7(16 * 3) + KERNEL8(16 * 3) + cmpl $128 * 4, %eax + jle .L12 + KERNEL1(16 * 4) + KERNEL2(16 * 4) + KERNEL3(16 * 4) + KERNEL4(16 * 4) + KERNEL5(16 * 4) + KERNEL6(16 * 4) + KERNEL7(16 * 4) + KERNEL8(16 * 4) + cmpl $128 * 5, %eax + jle .L12 + KERNEL1(16 * 5) + KERNEL2(16 * 5) + KERNEL3(16 * 5) + KERNEL4(16 * 5) + KERNEL5(16 * 5) + KERNEL6(16 * 5) + KERNEL7(16 * 5) + KERNEL8(16 * 5) + cmpl $128 * 6, %eax + jle .L12 + KERNEL1(16 * 6) + KERNEL2(16 * 6) + KERNEL3(16 * 6) + KERNEL4(16 * 6) + KERNEL5(16 * 6) + KERNEL6(16 * 6) + KERNEL7(16 * 6) + KERNEL8(16 * 6) + cmpl $128 * 7, %eax + jle .L12 + KERNEL1(16 * 7) + KERNEL2(16 * 7) + KERNEL3(16 * 7) + KERNEL4(16 * 7) + KERNEL5(16 * 7) + KERNEL6(16 * 7) + KERNEL7(16 * 7) + KERNEL8(16 * 7) + + addl $128 * 4 * SIZE, BB + addl $128 * 1 * SIZE, AA + subl $128 * 8, %eax + jg .L1X + jmp .L15 + +.L12: + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB + ALIGN_4 +#else + + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + + addl $64 * SIZE, BB + addl $16 * SIZE, AA + decl %eax + jne .L12 + ALIGN_4 +#endif + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movapd 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm5 + movapd 4 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 8 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 4), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd %xmm6, %xmm1 + unpcklpd %xmm7, %xmm6 + unpckhpd %xmm7, %xmm1 + + movapd 0 * SIZE(B), %xmm2 + movapd 2 * SIZE(B), %xmm5 + movapd 4 * SIZE(B), %xmm3 + movapd 6 * SIZE(B), %xmm7 + + subpd %xmm4, %xmm2 + subpd %xmm6, %xmm5 + subpd %xmm0, %xmm3 + subpd %xmm1, %xmm7 +#else + movapd 0 * SIZE(AA), %xmm0 + movapd 2 * SIZE(AA), %xmm1 + movapd 4 * SIZE(AA), %xmm2 + movapd 6 * SIZE(AA), %xmm3 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 + subpd %xmm6, %xmm2 + subpd %xmm7, %xmm3 +#endif + +#ifdef LN + movlpd 3 * SIZE(AA), %xmm4 + movhpd 3 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 + mulpd %xmm4, %xmm7 + + movlpd 2 * SIZE(AA), %xmm4 + movhpd 2 * SIZE(AA), %xmm4 + movapd %xmm4, %xmm6 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm2 + mulpd %xmm7, %xmm6 + subpd 
%xmm6, %xmm5 + + movlpd 0 * SIZE(AA), %xmm4 + movhpd 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm5 + +#endif + +#ifdef LT + movlpd 0 * SIZE(AA), %xmm4 + movhpd 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm5 + + movlpd 1 * SIZE(AA), %xmm4 + movhpd 1 * SIZE(AA), %xmm4 + movapd %xmm4, %xmm6 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm3 + mulpd %xmm5, %xmm6 + subpd %xmm6, %xmm7 + + movlpd 3 * SIZE(AA), %xmm4 + movhpd 3 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 + mulpd %xmm4, %xmm7 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + movlpd 1 * SIZE(B), %xmm4 + movhpd 1 * SIZE(B), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm1 + movlpd 2 * SIZE(B), %xmm4 + movhpd 2 * SIZE(B), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm2 + movlpd 3 * SIZE(B), %xmm4 + movhpd 3 * SIZE(B), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm3 + + movlpd 5 * SIZE(B), %xmm4 + movhpd 5 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm1 + movlpd 6 * SIZE(B), %xmm4 + movhpd 6 * SIZE(B), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm2 + movlpd 7 * SIZE(B), %xmm4 + movhpd 7 * SIZE(B), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm3 + + movlpd 10 * SIZE(B), %xmm4 + movhpd 10 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm2 + movlpd 11 * SIZE(B), %xmm4 + movhpd 11 * SIZE(B), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm3 + + movlpd 15 * SIZE(B), %xmm4 + movhpd 15 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm3 +#endif + +#ifdef RT + movlpd 15 * SIZE(B), %xmm4 + movhpd 15 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm3 + movlpd 14 * SIZE(B), %xmm4 + movhpd 14 * SIZE(B), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm2 + movlpd 13 * SIZE(B), %xmm4 + movhpd 13 * SIZE(B), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm1 + movlpd 12 * SIZE(B), %xmm4 + movhpd 12 * SIZE(B), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm0 + + movlpd 10 * SIZE(B), %xmm4 + movhpd 10 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm2 + movlpd 9 * SIZE(B), %xmm4 + movhpd 9 * SIZE(B), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm1 + movlpd 8 * SIZE(B), %xmm4 + movhpd 8 * SIZE(B), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm0 + + movlpd 5 * SIZE(B), %xmm4 + movhpd 5 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm1 + movlpd 4 * SIZE(B), %xmm4 + movhpd 4 * SIZE(B), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm0 + + movlpd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(B) + movapd %xmm5, 2 * SIZE(B) + movapd %xmm3, 4 * SIZE(B) + movapd %xmm7, 6 * SIZE(B) + + movlpd %xmm2, 0 * SIZE(BB) + movlpd %xmm2, 1 * SIZE(BB) + movhpd %xmm2, 2 * SIZE(BB) + movhpd %xmm2, 3 * SIZE(BB) + movlpd %xmm5, 4 * SIZE(BB) + movlpd %xmm5, 5 * SIZE(BB) + movhpd %xmm5, 6 * SIZE(BB) + movhpd %xmm5, 7 * SIZE(BB) + movlpd %xmm3, 8 * SIZE(BB) + movlpd %xmm3, 9 * SIZE(BB) + movhpd %xmm3, 10 * SIZE(BB) + movhpd %xmm3, 11 * SIZE(BB) + movlpd %xmm7, 12 * SIZE(BB) + movlpd %xmm7, 13 * SIZE(BB) + movhpd %xmm7, 14 * SIZE(BB) + movhpd %xmm7, 15 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) + movapd %xmm1, 2 * SIZE(AA) + movapd %xmm2, 4 * SIZE(AA) + movapd %xmm3, 6 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movlpd %xmm2, 0 * SIZE(CO1) + movlpd %xmm3, 1 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm3, 1 * SIZE(CO1, LDC, 1) + movlpd %xmm5, 0 * SIZE(CO1, LDC, 2) + movlpd %xmm7, 1 * SIZE(CO1, LDC, 2) + movhpd %xmm5, 0 * SIZE(CO1, %eax, 1) + movhpd %xmm7, 1 * SIZE(CO1, %eax, 1) +#else + movlpd %xmm0, 0 * 
SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movlpd %xmm1, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm1, 1 * SIZE(CO1, LDC, 1) + movlpd %xmm2, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm2, 1 * SIZE(CO1, LDC, 2) + movlpd %xmm3, 0 * SIZE(CO1, %eax, 1) + movhpd %xmm3, 1 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $8 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L11 + ALIGN_4 + +.L20: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L29 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $3 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movlpd 0 * SIZE(AA), %xmm0 + movlpd 4 * SIZE(AA), %xmm1 + movlpd 0 * SIZE(BB), %xmm2 + movlpd 8 * SIZE(BB), %xmm3 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm4 +#if defined(OPTERON) || defined(BARCELONA) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movlpd 2 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm5 + movlpd 4 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm6 + movlpd 16 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm7 + movlpd 1 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm4 + movlpd 10 * SIZE(BB), %xmm3 + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm5 + movlpd 12 * SIZE(BB), %xmm3 + mulsd %xmm0, %xmm3 + mulsd 14 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm6 + movlpd 24 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm7 + movlpd 2 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm4 + movlpd 18 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm5 + movlpd 20 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + mulsd 22 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm6 + movlpd 32 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm7 + movlpd 3 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm4 + movlpd 26 * SIZE(BB), %xmm3 + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm5 + movlpd 28 * SIZE(BB), %xmm3 + mulsd %xmm0, %xmm3 + mulsd 30 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm6 + movlpd 40 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm7 + movlpd 8 * SIZE(AA), %xmm0 +#if defined(OPTERON) || defined(BARCELONA) + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) +#endif + mulsd %xmm1, %xmm2 + addsd %xmm2, %xmm4 + movlpd 34 * SIZE(BB), %xmm2 + mulsd %xmm1, %xmm2 + addsd %xmm2, %xmm5 + movlpd 36 * SIZE(BB), %xmm2 + mulsd %xmm1, %xmm2 + mulsd 38 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm6 + movlpd 48 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm7 + movlpd 5 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm3 + addsd %xmm3, %xmm4 + movlpd 42 * SIZE(BB), %xmm3 + mulsd %xmm1, %xmm3 + addsd %xmm3, %xmm5 + movlpd 44 * SIZE(BB), %xmm3 + mulsd %xmm1, %xmm3 + mulsd 46 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm6 + movlpd 56 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm7 + movlpd 6 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm2 + addsd %xmm2, %xmm4 + movlpd 50 * SIZE(BB), %xmm2 + mulsd %xmm1, %xmm2 + addsd %xmm2, %xmm5 + movlpd 52 * SIZE(BB), %xmm2 + mulsd %xmm1, %xmm2 + mulsd 
54 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm6 + movlpd 64 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm7 + movlpd 7 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm3 + addsd %xmm3, %xmm4 + movlpd 58 * SIZE(BB), %xmm3 + mulsd %xmm1, %xmm3 + addsd %xmm3, %xmm5 + movlpd 60 * SIZE(BB), %xmm3 + mulsd %xmm1, %xmm3 + mulsd 62 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm6 + movlpd 72 * SIZE(BB), %xmm3 + addl $64 * SIZE, BB + addsd %xmm1, %xmm7 + movlpd 12 * SIZE(AA), %xmm1 + addl $8 * SIZE, AA + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L28 + +.L26: + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm4 + movlpd 2 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm5 + movlpd 4 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm6 + movlpd 8 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm7 + movlpd 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (B, %eax, 4), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + unpcklpd %xmm5, %xmm4 + unpcklpd %xmm7, %xmm6 + + movapd 0 * SIZE(B), %xmm2 + movapd 2 * SIZE(B), %xmm5 + + subpd %xmm4, %xmm2 + subpd %xmm6, %xmm5 +#else + movlpd 0 * SIZE(AA), %xmm0 + movlpd 1 * SIZE(AA), %xmm1 + movlpd 2 * SIZE(AA), %xmm2 + movlpd 3 * SIZE(AA), %xmm3 + + subsd %xmm4, %xmm0 + subsd %xmm5, %xmm1 + subsd %xmm6, %xmm2 + subsd %xmm7, %xmm3 +#endif + +#ifdef LN + movlpd 0 * SIZE(AA), %xmm4 + movhpd 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm5 +#endif + +#ifdef LT + movlpd 0 * SIZE(AA), %xmm4 + movhpd 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm5 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm0 + movlpd 1 * SIZE(B), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + movlpd 2 * SIZE(B), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm2 + movlpd 3 * SIZE(B), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm3 + + movlpd 5 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm1 + movlpd 6 * SIZE(B), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm2 + movlpd 7 * SIZE(B), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm3 + + movlpd 10 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm2 + movlpd 11 * SIZE(B), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm3 + + movlpd 15 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm3 +#endif + +#ifdef RT + movlpd 15 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm3 + movlpd 14 * SIZE(B), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm2 + movlpd 13 * SIZE(B), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm1 + movlpd 12 * SIZE(B), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm0 + + movlpd 10 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm2 + movlpd 9 * SIZE(B), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm1 + movlpd 8 * SIZE(B), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm0 + + movlpd 5 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm1 + movlpd 4 * SIZE(B), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + movlpd 0 * SIZE(B), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(B) + movapd %xmm5, 2 * SIZE(B) + + movlpd %xmm2, 0 * SIZE(BB) + movlpd %xmm2, 1 * SIZE(BB) + movhpd %xmm2, 2 * SIZE(BB) + movhpd %xmm2, 3 * SIZE(BB) + movlpd %xmm5, 4 * SIZE(BB) + movlpd %xmm5, 5 * SIZE(BB) + movhpd %xmm5, 6 * SIZE(BB) + movhpd %xmm5, 7 * SIZE(BB) 
+#else + movlpd %xmm0, 0 * SIZE(AA) + movlpd %xmm1, 1 * SIZE(AA) + movlpd %xmm2, 2 * SIZE(AA) + movlpd %xmm3, 3 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movlpd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) + movlpd %xmm5, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm5, 0 * SIZE(CO1, %eax, 1) +#else + movlpd %xmm0, 0 * SIZE(CO1) + movlpd %xmm1, 0 * SIZE(CO1, LDC, 1) + movlpd %xmm2, 0 * SIZE(CO1, LDC, 2) + movlpd %xmm3, 0 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (AA,%eax, SIZE), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L29: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 4), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (B, %eax, 4), B +#endif + +#ifdef RN + addl $4, KK +#endif + +#ifdef RT + subl $4, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_4 + + +.L999: + movl OLD_STACK, %esp + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_RT_2x4_sse3.S b/kernel/x86/trsm_kernel_RT_2x4_sse3.S new file mode 100644 index 0000000000..6be1d8643e --- /dev/null +++ b/kernel/x86/trsm_kernel_RT_2x4_sse3.S @@ -0,0 +1,2030 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#define A 24 + STACK + ARGS(%esp) +#define ARG_B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define ARG_LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) +#define AORIG 12 + STACK(%esp) + +#ifdef PENTIUM4 +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#ifdef PENTIUMM +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi +#define CO1 %esi + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + + movl OFFSET, %eax +#ifdef RN + negl %eax +#endif + movl %eax, KK + + leal (, LDC, SIZE), LDC + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + testl $1, N + je .L30 + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, B +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L80 + ALIGN_4 + +.L71: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + addl %eax, BB +#endif + + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movddup 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movddup 4 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifdef LN + prefetchnta -2 * SIZE(CO1) +#else + prefetchnta 2 * SIZE(CO1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm2, %xmm0 + movddup 1 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm4 + movapd 16 * SIZE(AA), %xmm0 + mulpd 2 * SIZE(AA), %xmm2 + addpd %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + mulpd 4 * SIZE(AA), %xmm2 + addpd %xmm2, %xmm6 + movddup 3 * SIZE(BB), %xmm2 + mulpd 6 * SIZE(AA), %xmm2 + addpd %xmm2, %xmm7 + movddup 8 * SIZE(BB), %xmm2 + mulpd %xmm3, %xmm1 + movddup 5 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm4 + movapd 24 * SIZE(AA), %xmm1 + mulpd 10 * SIZE(AA), %xmm3 + addpd %xmm3, %xmm5 + movddup 6 * SIZE(BB), %xmm3 + mulpd 12 * SIZE(AA), %xmm3 + addpd %xmm3, %xmm6 + movddup 7 * SIZE(BB), %xmm3 + mulpd 14 * SIZE(AA), %xmm3 + addpd %xmm3, %xmm7 + movddup 12 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $ 8 * SIZE, BB + decl %eax + jne 
.L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L78 + ALIGN_3 + +.L76: + mulpd %xmm2, %xmm0 + movddup 1 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm4 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + addpd %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), BB +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BB), %xmm1 + + subpd %xmm4, %xmm1 + + movapd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm1 +#else + movapd 0 * SIZE(AA), %xmm0 + + subpd %xmm4, %xmm0 +#endif + +#ifdef LN + movsd 3 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm1 + + movsd 2 * SIZE(AA), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + +#endif + +#ifdef LT + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 1 * SIZE(AA), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + + movsd 3 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm1 +#endif + +#ifdef RN + movddup 0 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#ifdef RT + movddup 0 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BB) + movsd %xmm1, 1 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 1 * SIZE(CO1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + addl %eax, BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L71 + ALIGN_4 + +.L80: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L89 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd 0 * SIZE(AA), %xmm0 + movhpd 1 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movsd 8 * SIZE(AA), %xmm1 + movhpd 9 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movsd 0 * SIZE(BB), %xmm2 + movhpd 1 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movsd 8 * SIZE(BB), %xmm3 + movhpd 9 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $4, %eax + je .L85 + ALIGN_4 + +.L82: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + mulpd 2 * SIZE(BB), %xmm0 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 4 * SIZE(AA), %xmm0 + mulpd 4 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm6 + movapd 6 * SIZE(AA), %xmm0 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm7 + movapd 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm3 + movapd 10 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm4 + mulpd 10 * SIZE(BB), %xmm1 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movapd 12 * SIZE(AA), %xmm1 + mulpd 12 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm6 + movapd 14 * SIZE(AA), 
%xmm1 + mulpd 14 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm7 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L82 + ALIGN_4 + +.L85: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $15, %eax # if (k & 1) + BRANCH + je .L88 + +.L86: + mulsd %xmm0, %xmm2 + movsd 1 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + movsd 1 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L86 + ALIGN_4 + +.L88: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + addpd %xmm6, %xmm4 + + haddpd %xmm4, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (B, %eax, 1), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BB), %xmm0 + subsd %xmm4, %xmm0 +#else + movsd 0 * SIZE(AA), %xmm0 + subsd %xmm4, %xmm0 +#endif + +#ifdef LN + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#ifdef LT + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#ifdef RN + movsd 0 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#ifdef RT + movsd 0 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) +#else + movsd %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + addl %eax, AA + addl %eax, BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L89: +#ifdef LN + movl K, %eax + leal (B, %eax, SIZE), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L30: + testl $2, N + je .L60 + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L50 + ALIGN_4 + +.L41: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movddup 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movddup 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifdef LN + prefetchnta -2 * SIZE(CO1) + prefetchnta -2 * SIZE(CO1, LDC, 1) +#else + prefetchnta 2 * SIZE(CO1) + prefetchnta 2 * SIZE(CO1, LDC, 1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L45 + ALIGN_4 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, 
%xmm5 + movddup 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 3 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movddup 4 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 5 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 6 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + movddup 6 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 7 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 16 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movddup 16 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movddup 9 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 10 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm5 + movddup 10 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm6 + movddup 11 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 12 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm7 + movddup 12 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movddup 13 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 14 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm5 + movddup 14 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm6 + movddup 15 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 24 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm7 + movddup 24 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L42 + ALIGN_4 + +.L45: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L46 + ALIGN_4 + +.L48: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd 0 * SIZE(BB), %xmm2 + movapd 2 * SIZE(BB), %xmm3 + + subpd %xmm4, %xmm2 + subpd %xmm0, %xmm3 +#else + movapd 0 * SIZE(AA), %xmm0 + movapd 2 * SIZE(AA), %xmm1 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 +#endif + +#ifdef LN + movddup 3 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 + + movddup 2 * SIZE(AA), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm2 + + movddup 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + +#endif + +#ifdef LT + movddup 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + + movddup 1 * SIZE(AA), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm3 + + movddup 3 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 +#endif + +#ifdef RN + movddup 0 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 + + movddup 1 * SIZE(BB), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm1 + + movddup 3 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm1 +#endif + +#ifdef RT + movddup 3 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm1 + + movddup 2 * SIZE(BB), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm0 + + movddup 0 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(BB) + movapd %xmm3, 2 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) + movapd %xmm1, 2 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movsd %xmm3, 1 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm3, 1 * SIZE(CO1, LDC, 1) +#else + movsd %xmm0, 0 * 
SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm1, 1 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L41 + ALIGN_4 + +.L50: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L59 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movddup 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movddup 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $4, %eax + je .L55 + ALIGN_4 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm0, %xmm2 + movddup 1 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + mulpd 2 * SIZE(BB), %xmm0 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movddup 2 * SIZE(AA), %xmm0 + mulpd 4 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm6 + movddup 3 * SIZE(AA), %xmm0 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm7 + movddup 4 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + movddup 5 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm4 + mulpd 10 * SIZE(BB), %xmm0 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movddup 6 * SIZE(AA), %xmm0 + mulpd 12 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm6 + movddup 7 * SIZE(AA), %xmm0 + mulpd 14 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm7 + movddup 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + movddup 9 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm4 + mulpd 18 * SIZE(BB), %xmm1 + movapd 32 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movddup 10 * SIZE(AA), %xmm1 + mulpd 20 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm6 + movddup 11 * SIZE(AA), %xmm1 + mulpd 22 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm7 + movddup 12 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + movddup 13 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm4 + mulpd 26 * SIZE(BB), %xmm1 + movapd 40 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movddup 14 * SIZE(AA), %xmm1 + mulpd 28 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm6 + movddup 15 * SIZE(AA), %xmm1 + mulpd 30 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm7 + movddup 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $15, %eax # if (k & 1) + BRANCH + je .L58 + +.L56: + mulpd %xmm0, %xmm2 + movddup 1 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + movapd 2 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + addpd %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (B, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BB), %xmm0 + + subpd %xmm4, %xmm0 +#else + movapd 0 * SIZE(AA), %xmm1 + + subpd %xmm4, %xmm1 + + movapd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm1 +#endif + +#ifdef LN + movddup 0 
* SIZE(AA), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#ifdef LT + movddup 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#ifdef RN + movsd 0 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 1 * SIZE(BB), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + + movsd 3 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 +#endif + +#ifdef RT + movsd 3 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 + + movsd 2 * SIZE(BB), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + movsd 0 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, 0 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) + movsd %xmm1, 1 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 0 * SIZE(CO1, LDC, 1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L59: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + ALIGN_4 + +.L60: + movl N, %eax + sarl $2, %eax + movl %eax, J + jle .L999 + ALIGN_2 + +.L10: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 4), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movddup 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movddup 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + + leal (LDC, LDC, 2), %eax + +#ifdef LN + prefetchnta -2 * SIZE(CO1) + prefetchnta -2 * SIZE(CO1, LDC, 1) + prefetchnta -2 * SIZE(CO1, LDC, 2) + prefetchnta -2 * SIZE(CO1, %eax, 1) +#else + prefetchnta 2 * SIZE(CO1) + prefetchnta 2 * SIZE(CO1, LDC, 1) + prefetchnta 2 * SIZE(CO1, LDC, 2) + prefetchnta 2 * SIZE(CO1, %eax, 1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + mulpd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 3 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movddup 4 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 5 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm5 + movddup 6 * SIZE(BB), %xmm2 + mulpd %xmm0, 
%xmm2 + addpd %xmm2, %xmm6 + movddup 7 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movddup 16 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm4 + movddup 9 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm5 + movddup 10 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm6 + movddup 11 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + movapd 6 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm7 + movddup 12 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm4 + movddup 13 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm5 + movddup 14 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm6 + movddup 15 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + movapd 16 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm7 + movddup 24 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm4 + movddup 17 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm5 + movddup 18 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm6 + movddup 19 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + movapd 10 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm7 + movddup 20 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm4 + movddup 21 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm5 + movddup 22 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm6 + movddup 23 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + movapd 12 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm7 + movddup 32 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movddup 25 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm5 + movddup 26 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm6 + movddup 27 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 14 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm7 + movddup 28 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movddup 29 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm5 + movddup 30 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm6 + movddup 31 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 24 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm7 + movddup 40 * SIZE(BB), %xmm3 + + addl $32 * SIZE, BB + addl $16 * SIZE, AA + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 3 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movddup 4 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd %xmm6, %xmm1 + unpcklpd %xmm7, %xmm6 + unpckhpd %xmm7, %xmm1 + + movapd 0 * SIZE(BB), %xmm2 + movapd 2 * SIZE(BB), %xmm5 + movapd 4 * SIZE(BB), %xmm3 + movapd 6 * SIZE(BB), %xmm7 + + subpd %xmm4, %xmm2 + subpd %xmm6, %xmm5 + subpd %xmm0, %xmm3 + subpd %xmm1, %xmm7 +#else + movapd 0 * SIZE(AA), %xmm0 + movapd 2 * SIZE(AA), %xmm1 + movapd 4 * SIZE(AA), %xmm2 + movapd 6 * SIZE(AA), %xmm3 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 + subpd %xmm6, %xmm2 + subpd %xmm7, %xmm3 +#endif + 
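+/*
+  Descriptive note on the four conditional blocks that follow: they perform the
+  per-tile triangular solve on the freshly accumulated 2x4 result.  The LN/LT
+  paths read the diagonal block from the A panel (AA); the RN/RT paths read it
+  from the B panel (BB).  The diagonal entries are only ever multiplied in
+  (mulpd/mulsd), never divided by, which suggests the packing step stores their
+  reciprocals.  The solved tile is then written back to the corresponding
+  packed panel and to C (CO1).
+*/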
+#ifdef LN + movddup 3 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 + mulpd %xmm4, %xmm7 + + movddup 2 * SIZE(AA), %xmm4 + movapd %xmm4, %xmm6 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm2 + mulpd %xmm7, %xmm6 + subpd %xmm6, %xmm5 + + movddup 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm5 + +#endif + +#ifdef LT + movddup 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm5 + + movddup 1 * SIZE(AA), %xmm4 + movapd %xmm4, %xmm6 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm3 + mulpd %xmm5, %xmm6 + subpd %xmm6, %xmm7 + + movddup 3 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm3 + mulpd %xmm4, %xmm7 +#endif + +#ifdef RN + movddup 0 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 + movddup 1 * SIZE(BB), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm1 + movddup 2 * SIZE(BB), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm2 + movddup 3 * SIZE(BB), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm3 + + movddup 5 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm1 + movddup 6 * SIZE(BB), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm2 + movddup 7 * SIZE(BB), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm3 + + movddup 10 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm2 + movddup 11 * SIZE(BB), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm3 + + movddup 15 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm3 +#endif + +#ifdef RT + movddup 15 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm3 + movddup 14 * SIZE(BB), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm2 + movddup 13 * SIZE(BB), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm1 + movddup 12 * SIZE(BB), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm0 + + movddup 10 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm2 + movddup 9 * SIZE(BB), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm1 + movddup 8 * SIZE(BB), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm0 + + movddup 5 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm1 + movddup 4 * SIZE(BB), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm0 + + movddup 0 * SIZE(BB), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(BB) + movapd %xmm5, 2 * SIZE(BB) + movapd %xmm3, 4 * SIZE(BB) + movapd %xmm7, 6 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) + movapd %xmm1, 2 * SIZE(AA) + movapd %xmm2, 4 * SIZE(AA) + movapd %xmm3, 6 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movsd %xmm3, 1 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm3, 1 * SIZE(CO1, LDC, 1) + movsd %xmm5, 0 * SIZE(CO1, LDC, 2) + movsd %xmm7, 1 * SIZE(CO1, LDC, 2) + movhpd %xmm5, 0 * SIZE(CO1, %eax, 1) + movhpd %xmm7, 1 * SIZE(CO1, %eax, 1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) + movhpd %xmm1, 1 * SIZE(CO1, LDC, 1) + movsd %xmm2, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm2, 1 * SIZE(CO1, LDC, 2) + movsd %xmm3, 0 * SIZE(CO1, %eax, 1) + movhpd %xmm3, 1 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L11 + ALIGN_4 + +.L20: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L29 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA 
+#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movddup 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movddup 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $4, %eax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movddup 1 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movddup 2 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 10 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd 12 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movddup 3 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 14 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm7 + movddup 4 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 18 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 20 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movddup 5 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 22 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 32 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movddup 6 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 26 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd 28 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movddup 7 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 30 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 40 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm7 + movddup 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd 34 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + movapd 36 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movddup 9 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm2 + mulpd 38 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + movapd 48 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm7 + movddup 10 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 42 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm4 + movapd 44 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movddup 11 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 46 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm6 + movapd 56 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm7 + movddup 12 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm2 + mulpd 50 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + movapd 52 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movddup 13 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm2 + mulpd 54 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + movapd 64 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm7 + movddup 14 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 58 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm4 + movapd 60 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movddup 15 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 62 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm6 + movapd 72 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm7 + movddup 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $15, %eax # if (k & 1) + BRANCH + je .L28 + +.L26: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movddup 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + + decl %eax + jg .L26 + ALIGN_4 + +.L28: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl 
$4, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BB), %xmm0 + movapd 2 * SIZE(BB), %xmm1 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 +#else + movapd 0 * SIZE(AA), %xmm1 + movapd 2 * SIZE(AA), %xmm3 + + subpd %xmm4, %xmm1 + subpd %xmm5, %xmm3 + + movapd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm1 + movapd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm3 +#endif + +#ifdef LN + movddup 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 +#endif + +#ifdef LT + movddup 0 * SIZE(AA), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 +#endif + +#ifdef RN + movsd 0 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 + movsd 1 * SIZE(BB), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + movsd 2 * SIZE(BB), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm2 + movsd 3 * SIZE(BB), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm3 + + movsd 5 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 + movsd 6 * SIZE(BB), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm2 + movsd 7 * SIZE(BB), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm3 + + movsd 10 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm2 + movsd 11 * SIZE(BB), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm3 + + movsd 15 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm3 +#endif + +#ifdef RT + movsd 15 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm3 + movsd 14 * SIZE(BB), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm2 + movsd 13 * SIZE(BB), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm1 + movsd 12 * SIZE(BB), %xmm4 + mulsd %xmm3, %xmm4 + subsd %xmm4, %xmm0 + + movsd 10 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm2 + movsd 9 * SIZE(BB), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm1 + movsd 8 * SIZE(BB), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm0 + + movsd 5 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm1 + movsd 4 * SIZE(BB), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + movsd 0 * SIZE(BB), %xmm4 + mulsd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, 0 * SIZE(BB) + movapd %xmm1, 2 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) + movsd %xmm1, 1 * SIZE(AA) + movsd %xmm2, 2 * SIZE(AA) + movsd %xmm3, 3 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 0 * SIZE(CO1, LDC, 1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm1, 0 * SIZE(CO1, %eax, 1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) + movsd %xmm2, 0 * SIZE(CO1, LDC, 2) + movsd %xmm3, 0 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L29: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 4), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $4, KK +#endif + +#ifdef RT + subl $4, KK +#endif + + decl J # j -- + jg .L10 + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_RT_4x2_core2.S b/kernel/x86/trsm_kernel_RT_4x2_core2.S new file mode 100644 index 0000000000..866eddf367 --- /dev/null +++ b/kernel/x86/trsm_kernel_RT_4x2_core2.S @@ -0,0 +1,2100 @@ 
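+/*
+  For reference, a scalar sketch in C of what the "#ifdef RT" solve in these
+  right-side kernels does for one row of the C tile.  Assumptions stated here,
+  not spelled out in the original source: b[] is the packed n x n diagonal
+  block of the triangular factor in row-major order, its diagonal entries have
+  already been replaced by their reciprocals during packing (the kernel only
+  multiplies, it never divides), x[] is one row of the current C tile, and n
+  is the kernel's column blocking (4 in the 2x4 kernel above).
+
+      static void trsm_rt_row(double *x, const double *b, int n)
+      {
+          // backward substitution, last column first (b[n*n-1] ... b[0])
+          for (int j = n - 1; j >= 0; j--) {
+              x[j] *= b[n * j + j];           // multiply by 1/diagonal
+              for (int i = 0; i < j; i++)     // update the earlier columns
+                  x[i] -= b[n * j + i] * x[j];
+          }
+      }
+*/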
+/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if !defined(HAVE_SSE2) || !defined(HAVE_MMX) +#error You have to check your configuration. 
+#endif + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_ALPHA 16 + STACK + ARGS(%esi) +#define STACK_A 24 + STACK + ARGS(%esi) +#define STACK_B 28 + STACK + ARGS(%esi) +#define STACK_C 32 + STACK + ARGS(%esi) +#define STACK_LDC 36 + STACK + ARGS(%esi) +#define STACK_OFFT 40 + STACK + ARGS(%esi) + +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define AORIG 56(%esp) +#define BORIG 60(%esp) +#define BUFFER 128(%esp) + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#define B %edi +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define CO1 %esi + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movd STACK_M, %mm0 + movl STACK_N, %eax + movd STACK_K, %mm1 + movd STACK_A, %mm2 + movl STACK_B, B + movd STACK_C, %mm3 + movl STACK_LDC, LDC + movd STACK_OFFT, %mm4 + + movd %mm1, K + movl %eax, N + movd %mm0, M + movd %mm2, A + movd %mm3, C + movl %esi, OLD_STACK + movd %mm4, OFFSET + movd %mm4, KK + + subl $-16 * SIZE, A + subl $-16 * SIZE, B + + sall $BASE_SHIFT, LDC + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + testl $1, %eax + jle .L100 + ALIGN_2 + +.L101: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal 16 * SIZE + BUFFER, BB + +#ifdef RT + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + leal (, %eax, SIZE), %eax + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + jle .L103 + ALIGN_4 + +.L102: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + movddup -14 * SIZE(B), %xmm2 + movddup -13 * SIZE(B), %xmm3 + movddup -12 * SIZE(B), %xmm4 + movddup -11 * SIZE(B), %xmm5 + movddup -10 * SIZE(B), %xmm6 + movddup -9 * SIZE(B), %xmm7 + + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm1, -14 * SIZE(BB) + movapd %xmm2, -12 * SIZE(BB) + movapd %xmm3, -10 * SIZE(BB) + movapd %xmm4, -8 * SIZE(BB) + movapd %xmm5, -6 * SIZE(BB) + movapd %xmm6, -4 * SIZE(BB) + movapd %xmm7, -2 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $16 * SIZE, %ecx + decl %eax + BRANCH + jne .L102 + ALIGN_2 + +.L103: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + BRANCH + jle .L105 + ALIGN_2 + +.L104: + movddup -16 * SIZE(B), %xmm0 + + movapd %xmm0, -16 * SIZE(BB) + addl $1 * SIZE, B + addl $2 * SIZE, BB + decl %eax + jne .L104 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 # coffset = c +#ifndef RT + addl LDC, C +#endif + + movl M, %ebx + sarl 
$2, %ebx # i = (m >> 2) + jle .L130 + ALIGN_4 + +.L110: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + leal 16 * SIZE + BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $0 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movapd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -8 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movapd -8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L112 + +.L111: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AA), %xmm1 + addpd %xmm0, %xmm4 + movapd -12 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm6 + movapd -14 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + mulpd -10 * SIZE(AA), %xmm1 + addpd %xmm0, %xmm5 + movapd 0 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm7 + movapd -12 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm2 + mulpd -6 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm4 + movapd -4 * SIZE(AA), %xmm2 + addpd %xmm1, %xmm6 + movapd -10 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm2 + mulpd -2 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm5 + movapd 8 * SIZE(AA), %xmm2 + addpd %xmm1, %xmm7 + movapd 0 * SIZE(BB), %xmm1 + mulpd %xmm3, %xmm0 + mulpd 2 * SIZE(AA), %xmm3 + addpd %xmm0, %xmm4 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm6 + movapd -6 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm0 + mulpd 6 * SIZE(AA), %xmm3 + addpd %xmm0, %xmm5 + movapd 16 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm7 + movapd -4 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm2 + mulpd 10 * SIZE(AA), %xmm3 + addpd %xmm2, %xmm4 + movapd 12 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm6 + movapd -2 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm2 + mulpd 14 * SIZE(AA), %xmm3 + addpd %xmm2, %xmm5 + movapd 24 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm7 + movapd 8 * SIZE(BB), %xmm3 + + addl $ 32 * SIZE, AA + subl $-16 * SIZE, BB + decl %eax + jne .L111 + +.L112: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L114 + +.L113: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AA), %xmm1 + addpd %xmm0, %xmm4 + movapd -12 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm6 + movapd -14 * SIZE(BB), %xmm1 + + addl $4 * SIZE, AA + addl $2 * SIZE, BB + subl $1, %eax + jg .L113 + ALIGN_4 + +.L114: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal 16 * SIZE + BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm0 + movapd -14 * SIZE(B), %xmm1 +#else + movapd -16 * SIZE(AA), %xmm0 + movapd -14 * SIZE(AA), %xmm1 +#endif + + subpd %xmm4, %xmm0 + subpd %xmm6, %xmm1 + +#ifdef LN + movapd %xmm0, %xmm2 + unpckhpd %xmm2, %xmm2 + + movapd %xmm1, %xmm3 + unpckhpd %xmm3, %xmm3 + + movsd -1 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm3 + + movsd -2 * SIZE(AA), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm1 + movsd -3 * SIZE(AA), %xmm6 + mulsd %xmm3, %xmm6 + subsd %xmm6, %xmm2 + movsd -4 * SIZE(AA), %xmm7 + mulsd %xmm3, %xmm7 + subsd %xmm7, %xmm0 + + movsd -6 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm1 + + movsd -7 * SIZE(AA), %xmm5 + mulsd %xmm1, %xmm5 + subsd %xmm5, %xmm2 + movsd -8 * SIZE(AA), %xmm6 + mulsd %xmm1, %xmm6 + subsd %xmm6, %xmm0 
+ + movsd -11 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + + movsd -12 * SIZE(AA), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + + movsd -16 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + unpcklpd %xmm2, %xmm0 + unpcklpd %xmm3, %xmm1 +#endif + +#ifdef LT + movapd %xmm0, %xmm2 + unpckhpd %xmm2, %xmm2 + + movapd %xmm1, %xmm3 + unpckhpd %xmm3, %xmm3 + + movsd -16 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + movsd -15 * SIZE(AA), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + movsd -14 * SIZE(AA), %xmm6 + mulsd %xmm0, %xmm6 + subsd %xmm6, %xmm1 + movsd -13 * SIZE(AA), %xmm7 + mulsd %xmm0, %xmm7 + subsd %xmm7, %xmm3 + + movsd -11 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + movsd -10 * SIZE(AA), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm1 + movsd -9 * SIZE(AA), %xmm6 + mulsd %xmm2, %xmm6 + subsd %xmm6, %xmm3 + + movsd -6 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm1 + movsd -5 * SIZE(AA), %xmm5 + mulsd %xmm1, %xmm5 + subsd %xmm5, %xmm3 + + movsd -1 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm3 + + unpcklpd %xmm2, %xmm0 + unpcklpd %xmm3, %xmm1 +#endif + +#if defined(RN) || defined(RT) + movddup -16 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, -16 * SIZE(B) + movapd %xmm1, -14 * SIZE(B) + + movddup %xmm0, %xmm2 + movddup %xmm1, %xmm3 + + unpckhpd %xmm0, %xmm0 + unpckhpd %xmm1, %xmm1 + + movapd %xmm2, -16 * SIZE(BB) + movapd %xmm0, -14 * SIZE(BB) + movapd %xmm3, -12 * SIZE(BB) + movapd %xmm1, -10 * SIZE(BB) +#else + movapd %xmm0, -16 * SIZE(AA) + movapd %xmm1, -14 * SIZE(AA) +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + BRANCH + decl %ebx # i -- + jg .L110 + ALIGN_2 + +.L130: + movl M, %ebx + testl $2, %ebx + jle .L150 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal 16 * SIZE + BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $0 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movapd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -8 * SIZE(AA), %xmm2 + movapd -8 * SIZE(BB), %xmm3 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L132 + +.L131: + mulpd %xmm0, %xmm1 + movapd -14 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm4 + movapd -14 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm1 + movapd -12 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm5 + movapd -12 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm1 + movapd -10 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm4 + movapd -10 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm1 + movapd 0 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm5 + movapd 0 * SIZE(BB), %xmm1 + mulpd %xmm2, %xmm3 + movapd -6 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm4 + movapd -6 * SIZE(BB), %xmm3 + mulpd %xmm2, %xmm3 + movapd -4 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm5 + movapd -4 * SIZE(BB), %xmm3 + mulpd %xmm2, %xmm3 + movapd -2 * SIZE(AA), %xmm2 + addpd %xmm3, 
%xmm4 + movapd -2 * SIZE(BB), %xmm3 + mulpd %xmm2, %xmm3 + movapd 8 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + + subl $-16 * SIZE, AA + subl $-16 * SIZE, BB + BRANCH + decl %eax + jne .L131 + +.L132: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L134 + +.L133: + mulpd %xmm0, %xmm1 + movapd -14 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm4 + movapd -14 * SIZE(BB), %xmm1 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L133 + ALIGN_4 + +.L134: + addpd %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal 16 * SIZE + BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm0 +#else + movapd -16 * SIZE(AA), %xmm0 +#endif + + subpd %xmm4, %xmm0 + +#ifdef LN + movapd %xmm0, %xmm2 + unpckhpd %xmm2, %xmm2 + + movsd -13 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + + movsd -14 * SIZE(AA), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + + movsd -16 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + unpcklpd %xmm2, %xmm0 +#endif + +#ifdef LT + movapd %xmm0, %xmm2 + unpckhpd %xmm2, %xmm2 + + movsd -16 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + movsd -15 * SIZE(AA), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + movsd -13 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm2, %xmm0 +#endif + +#if defined(RN) || defined(RT) + movddup -16 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, -16 * SIZE(B) + + movddup %xmm0, %xmm1 + unpckhpd %xmm0, %xmm0 + + movapd %xmm1, -16 * SIZE(BB) + movapd %xmm0, -14 * SIZE(BB) +#else + movapd %xmm0, -16 * SIZE(AA) +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L150: + movl M, %ebx + testl $1, %ebx + jle .L159 + +#ifdef LN + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA +#endif + + leal 16 * SIZE + BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $0 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movsd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movsd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movsd -8 * SIZE(BB), %xmm3 + movsd -12 * SIZE(AA), %xmm2 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L152 + +.L151: + mulsd %xmm0, %xmm1 + movsd -15 * SIZE(AA), %xmm0 + addsd %xmm1, %xmm4 + movsd -14 * SIZE(BB), %xmm1 + mulsd %xmm0, %xmm1 + movsd -14 * SIZE(AA), %xmm0 + addsd %xmm1, %xmm5 + movsd -12 * SIZE(BB), %xmm1 + mulsd %xmm0, %xmm1 + movsd -13 * SIZE(AA), %xmm0 + addsd %xmm1, %xmm4 + movsd -10 * SIZE(BB), %xmm1 + mulsd %xmm0, %xmm1 + movsd -8 * SIZE(AA), %xmm0 + addsd %xmm1, %xmm5 + movsd -0 * SIZE(BB), %xmm1 + mulsd %xmm2, %xmm3 + movsd -11 * SIZE(AA), %xmm2 + addsd %xmm3, 
%xmm4 + movsd -6 * SIZE(BB), %xmm3 + mulsd %xmm2, %xmm3 + movsd -10 * SIZE(AA), %xmm2 + addsd %xmm3, %xmm5 + movsd -4 * SIZE(BB), %xmm3 + mulsd %xmm2, %xmm3 + movsd -9 * SIZE(AA), %xmm2 + addsd %xmm3, %xmm4 + movsd -2 * SIZE(BB), %xmm3 + mulsd %xmm2, %xmm3 + movsd -4 * SIZE(AA), %xmm2 + addsd %xmm3, %xmm5 + movsd 8 * SIZE(BB), %xmm3 + + subl $ -8 * SIZE, AA + subl $-16 * SIZE, BB + BRANCH + decl %eax + jne .L151 + +.L152: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L154 + +.L153: + mulsd %xmm0, %xmm1 + movsd -15 * SIZE(AA), %xmm0 + addsd %xmm1, %xmm4 + movsd -14 * SIZE(BB), %xmm1 + + addl $1 * SIZE, AA # aoffset += 8 + addl $2 * SIZE, BB # boffset1 += 8 + decl %eax + BRANCH + jg .L153 + ALIGN_4 + +.L154: + addsd %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax + subl $1, %eax + + movl AORIG, AA + movl BORIG, B + leal 16 * SIZE + BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(B), %xmm0 +#else + movsd -16 * SIZE(AA), %xmm0 +#endif + + subsd %xmm4, %xmm0 + +#if defined(LN) || defined(LT) + mulsd -16 * SIZE(AA), %xmm0 +#endif + +#if defined(RN) || defined(RT) + mulsd -16 * SIZE(B), %xmm0 +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, -16 * SIZE(B) + + movsd %xmm0, -16 * SIZE(BB) + movsd %xmm0, -15 * SIZE(BB) +#else + movsd %xmm0, -16 * SIZE(AA) +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA +#ifdef LT + addl $1 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $0 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L159: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 1), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (B, %eax, 1), B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_2 + +.L100: + movl N, %eax + sarl $1, %eax + movl %eax, J + jle .L999 + ALIGN_2 + +.L01: +/* Copying to Sub Buffer */ +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal 16 * SIZE + BUFFER, BB + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + jle .L03 + ALIGN_2 + +.L02: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + movddup -14 * SIZE(B), %xmm2 + movddup -13 * SIZE(B), %xmm3 + movddup -12 * SIZE(B), %xmm4 + movddup -11 * SIZE(B), %xmm5 + movddup -10 * SIZE(B), %xmm6 + movddup -9 * SIZE(B), %xmm7 + + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm1, -14 * SIZE(BB) + movapd %xmm2, -12 * SIZE(BB) + movapd %xmm3, -10 * SIZE(BB) + movapd %xmm4, -8 * SIZE(BB) + movapd %xmm5, -6 * SIZE(BB) + movapd %xmm6, -4 * SIZE(BB) + movapd %xmm7, -2 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $16 * SIZE, %ecx + decl %eax + jne .L02 + ALIGN_2 + +.L03: +#if defined(LT) || defined(RN) + 
movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax + BRANCH + jle .L05 + ALIGN_4 + +.L04: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm1, -14 * SIZE(BB) + + addl $2 * SIZE, B + addl $4 * SIZE, %ecx + decl %eax + jne .L04 + ALIGN_4 + +.L05: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 # coffset = c +#ifndef RT + addl %eax, C +#endif + + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L30 + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + leal 16 * SIZE + BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movapd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -8 * SIZE(AA), %xmm3 + pxor %xmm6, %xmm6 +#ifdef LN + prefetcht2 -3 * SIZE(CO1) + pxor %xmm7, %xmm7 + prefetcht2 -3 * SIZE(CO1, LDC) +#else + prefetcht2 3 * SIZE(CO1) + pxor %xmm7, %xmm7 + prefetcht2 3 * SIZE(CO1, LDC) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movapd -14 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + addpd %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm1 + movapd -12 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm6 + addpd %xmm1, %xmm7 + + movapd -12 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movapd -10 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + addpd %xmm0, %xmm5 + movapd -10 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm1 + movapd 0 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm6 + addpd %xmm1, %xmm7 + + movapd -8 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm3, %xmm1 + addpd %xmm1, %xmm4 + movapd -6 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm5 + movapd -6 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm2 + mulpd %xmm3, %xmm1 + movapd -4 * SIZE(AA), %xmm3 + addpd %xmm2, %xmm6 + addpd %xmm1, %xmm7 + + movapd -4 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm3, %xmm1 + addpd %xmm1, %xmm4 + movapd -2 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm5 + movapd -2 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm2 + mulpd %xmm3, %xmm1 + movapd 8 * SIZE(AA), %xmm3 + addpd %xmm2, %xmm6 + addpd %xmm1, %xmm7 + movapd 0 * SIZE(BB), %xmm1 + + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movapd 2 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm1 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm6 + addpd %xmm1, %xmm7 + + movapd 4 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movapd 6 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + addpd %xmm0, %xmm5 + movapd 6 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm1 + movapd 16 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm6 + addpd %xmm1, %xmm7 + + movapd 8 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm3, %xmm1 + addpd %xmm1, %xmm4 + movapd 10 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm5 + movapd 10 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm2 + mulpd %xmm3, %xmm1 + addpd %xmm2, 
%xmm6 + movapd 12 * SIZE(AA), %xmm3 + addpd %xmm1, %xmm7 + + movapd 12 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm3, %xmm1 + addpd %xmm1, %xmm4 + movapd 14 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm5 + movapd 14 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm2 + mulpd %xmm3, %xmm1 + subl $-32 * SIZE, BB + movapd 24 * SIZE(AA), %xmm3 + subl $-32 * SIZE, AA + addpd %xmm2, %xmm6 + addpd %xmm1, %xmm7 + movapd -16 * SIZE(BB), %xmm1 + + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + BRANCH + je .L18 + ALIGN_4 + +.L16: + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movapd -14 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + addpd %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm1 + movapd -12 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm6 + addpd %xmm1, %xmm7 + movapd -12 * SIZE(BB), %xmm1 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal 16 * SIZE + BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd %xmm6, %xmm1 + unpcklpd %xmm7, %xmm6 + unpckhpd %xmm7, %xmm1 + + movapd -16 * SIZE(B), %xmm2 + movapd -14 * SIZE(B), %xmm3 + movapd -12 * SIZE(B), %xmm5 + movapd -10 * SIZE(B), %xmm7 + + subpd %xmm4, %xmm2 + subpd %xmm0, %xmm3 + subpd %xmm6, %xmm5 + subpd %xmm1, %xmm7 +#else + movapd -16 * SIZE(AA), %xmm0 + movapd -14 * SIZE(AA), %xmm1 + movapd -12 * SIZE(AA), %xmm2 + movapd -10 * SIZE(AA), %xmm3 + + subpd %xmm4, %xmm0 + subpd %xmm6, %xmm1 + subpd %xmm5, %xmm2 + subpd %xmm7, %xmm3 +#endif + +#ifdef LN + movddup -1 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm7 + movddup -2 * SIZE(AA), %xmm0 + mulpd %xmm7, %xmm0 + subpd %xmm0, %xmm5 + movddup -3 * SIZE(AA), %xmm0 + mulpd %xmm7, %xmm0 + subpd %xmm0, %xmm3 + movddup -4 * SIZE(AA), %xmm0 + mulpd %xmm7, %xmm0 + subpd %xmm0, %xmm2 + + movddup -6 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm5 + movddup -7 * SIZE(AA), %xmm0 + mulpd %xmm5, %xmm0 + subpd %xmm0, %xmm3 + movddup -8 * SIZE(AA), %xmm0 + mulpd %xmm5, %xmm0 + subpd %xmm0, %xmm2 + + movddup -11 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + movddup -12 * SIZE(AA), %xmm0 + mulpd %xmm3, %xmm0 + subpd %xmm0, %xmm2 + + movddup -16 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef LT + movddup -16 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + movddup -15 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm0 + subpd %xmm0, %xmm3 + movddup -14 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm0 + subpd %xmm0, %xmm5 + movddup -13 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm0 + subpd %xmm0, %xmm7 + + movddup -11 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + movddup -10 * SIZE(AA), %xmm0 + mulpd %xmm3, %xmm0 + subpd %xmm0, %xmm5 + movddup -9 * SIZE(AA), %xmm0 + mulpd %xmm3, %xmm0 + subpd %xmm0, %xmm7 + + movddup -6 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm5 + movddup -5 * SIZE(AA), %xmm0 + mulpd %xmm5, %xmm0 + subpd %xmm0, %xmm7 + + movddup -1 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm7 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 + + movddup -15 * SIZE(B), %xmm4 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm2 + mulpd %xmm1, %xmm5 + subpd %xmm5, %xmm3 + + movddup -13 * SIZE(B), %xmm4 + mulpd 
%xmm4, %xmm2 + mulpd %xmm4, %xmm3 +#endif + +#ifdef RT + movddup -13 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm3 + + movddup -14 * SIZE(B), %xmm4 + movapd %xmm4, %xmm5 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm0 + mulpd %xmm3, %xmm5 + subpd %xmm5, %xmm1 + + movddup -16 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movsd %xmm3, 1 * SIZE(CO1) + movsd %xmm5, 2 * SIZE(CO1) + movsd %xmm7, 3 * SIZE(CO1) + + movhpd %xmm2, 0 * SIZE(CO1, LDC) + movhpd %xmm3, 1 * SIZE(CO1, LDC) + movhpd %xmm5, 2 * SIZE(CO1, LDC) + movhpd %xmm7, 3 * SIZE(CO1, LDC) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO1, LDC) + movhpd %xmm2, 1 * SIZE(CO1, LDC) + movsd %xmm3, 2 * SIZE(CO1, LDC) + movhpd %xmm3, 3 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, -16 * SIZE(B) + movapd %xmm3, -14 * SIZE(B) + movapd %xmm5, -12 * SIZE(B) + movapd %xmm7, -10 * SIZE(B) + + movddup %xmm2, %xmm0 + movddup %xmm3, %xmm1 + movddup %xmm5, %xmm4 + movddup %xmm7, %xmm6 + + unpckhpd %xmm2, %xmm2 + unpckhpd %xmm3, %xmm3 + unpckhpd %xmm5, %xmm5 + unpckhpd %xmm7, %xmm7 + + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm2, -14 * SIZE(BB) + movapd %xmm1, -12 * SIZE(BB) + movapd %xmm3, -10 * SIZE(BB) + movapd %xmm4, -8 * SIZE(BB) + movapd %xmm5, -6 * SIZE(BB) + movapd %xmm6, -4 * SIZE(BB) + movapd %xmm7, -2 * SIZE(BB) +#else + movapd %xmm0, -16 * SIZE(AA) + movapd %xmm1, -14 * SIZE(AA) + movapd %xmm2, -12 * SIZE(AA) + movapd %xmm3, -10 * SIZE(AA) +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA +#ifdef LT + addl $8 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L10 + ALIGN_2 + +.L30: + movl M, %ebx + testl $2, %ebx + jle .L50 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal 16 * SIZE + BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movapd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -8 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movapd -8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L32 + +.L31: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BB), %xmm0 + addpd %xmm1, %xmm4 + movapd -12 * SIZE(BB), %xmm1 + addpd %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm1 + mulpd -10 * SIZE(BB), %xmm0 + addpd %xmm1, %xmm6 + movapd 0 * SIZE(BB), %xmm1 + addpd %xmm0, %xmm7 + movapd -12 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd -6 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd -4 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movapd -10 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd -2 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 8 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm7 + movapd 0 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm1 + mulpd 2 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm4 + 
movapd 4 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm5 + movapd -6 * SIZE(AA), %xmm2 + mulpd %xmm2, %xmm1 + mulpd 6 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm6 + movapd 16 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm7 + movapd -4 * SIZE(AA), %xmm2 + mulpd %xmm2, %xmm3 + mulpd 10 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm4 + movapd 12 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm5 + movapd -2 * SIZE(AA), %xmm2 + mulpd %xmm2, %xmm3 + mulpd 14 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm6 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm7 + movapd 8 * SIZE(AA), %xmm2 + + subl $-16 * SIZE, AA + addl $ 32 * SIZE, BB + decl %eax + jne .L31 + +.L32: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L34 + +.L33: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BB), %xmm0 + addpd %xmm1, %xmm4 + movapd -12 * SIZE(BB), %xmm1 + addpd %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L33 + ALIGN_4 + +.L34: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal 16 * SIZE + BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd -16 * SIZE(B), %xmm2 + movapd -14 * SIZE(B), %xmm3 + + subpd %xmm4, %xmm2 + subpd %xmm0, %xmm3 +#else + movapd -16 * SIZE(AA), %xmm0 + movapd -14 * SIZE(AA), %xmm1 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 +#endif + +#ifdef LN + movddup -13 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + movddup -14 * SIZE(AA), %xmm0 + mulpd %xmm3, %xmm0 + subpd %xmm0, %xmm2 + movddup -16 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef LT + movddup -16 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + movddup -15 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm0 + subpd %xmm0, %xmm3 + movddup -13 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + movddup -15 * SIZE(B), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm1 + movddup -13 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm1 +#endif + +#ifdef RT + movddup -13 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm1 + movddup -14 * SIZE(B), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm0 + movddup -16 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movsd %xmm3, 1 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO1, LDC) + movhpd %xmm3, 1 * SIZE(CO1, LDC) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC) + movhpd %xmm1, 1 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, -16 * SIZE(B) + movapd %xmm3, -14 * SIZE(B) + + movddup %xmm2, %xmm0 + movddup %xmm3, %xmm1 + + unpckhpd %xmm2, %xmm2 + unpckhpd %xmm3, %xmm3 + + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm2, -14 * SIZE(BB) + movapd %xmm1, -12 * SIZE(BB) + movapd %xmm3, -10 * SIZE(BB) +#else + movapd %xmm0, -16 * SIZE(AA) + movapd %xmm1, -14 * SIZE(AA) +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + 
BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L50: + movl M, %ebx + testl $1, %ebx + jle .L99 + +#ifdef LN + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA +#endif + + leal 16 * SIZE + BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movsd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movsd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movsd -12 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movsd -8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L52 + +.L51: + mulsd %xmm0, %xmm1 + mulsd -14 * SIZE(BB), %xmm0 + addsd %xmm1, %xmm4 + movsd -12 * SIZE(BB), %xmm1 + addsd %xmm0, %xmm5 + movsd -15 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm1 + mulsd -10 * SIZE(BB), %xmm0 + addsd %xmm1, %xmm6 + movsd 0 * SIZE(BB), %xmm1 + addsd %xmm0, %xmm7 + movsd -14 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + mulsd -6 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm4 + movsd -4 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm5 + movsd -13 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + mulsd -2 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm6 + movsd 8 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm7 + movsd -8 * SIZE(AA), %xmm0 + mulsd %xmm2, %xmm1 + mulsd 2 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm4 + movsd 4 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm5 + movsd -11 * SIZE(AA), %xmm2 + mulsd %xmm2, %xmm1 + mulsd 6 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm6 + movsd 16 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm7 + movsd -10 * SIZE(AA), %xmm2 + mulsd %xmm2, %xmm3 + mulsd 10 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm4 + movsd 12 * SIZE(BB), %xmm3 + addsd %xmm2, %xmm5 + movsd -9 * SIZE(AA), %xmm2 + mulsd %xmm2, %xmm3 + mulsd 14 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm6 + movsd 24 * SIZE(BB), %xmm3 + addsd %xmm2, %xmm7 + movsd -4 * SIZE(AA), %xmm2 + + subl $-8 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L51 + +.L52: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L54 + +.L53: + mulsd %xmm0, %xmm1 + mulsd -14 * SIZE(BB), %xmm0 + addsd %xmm1, %xmm4 + movsd -12 * SIZE(BB), %xmm1 + addsd %xmm0, %xmm5 + movsd -15 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + BRANCH + jg .L53 + ALIGN_4 + +.L54: + addsd %xmm6, %xmm4 + addsd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal 16 * SIZE + BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(B), %xmm0 + movsd -15 * SIZE(B), %xmm1 +#else + movsd -16 * SIZE(AA), %xmm0 + movsd -15 * SIZE(AA), %xmm1 +#endif + + subsd %xmm4, %xmm0 + subsd %xmm5, %xmm1 + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(AA), %xmm2 + mulsd %xmm2, %xmm0 + mulsd %xmm2, %xmm1 +#endif + +#ifdef RN + mulsd -16 * SIZE(B), %xmm0 + movsd -15 * SIZE(B), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + mulsd -13 * SIZE(B), %xmm1 +#endif + +#ifdef RT + mulsd -13 * SIZE(B), %xmm1 + movsd -14 * SIZE(B), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + mulsd -16 * SIZE(B), %xmm0 +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC) + +#ifndef LN + addl 
$1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, -16 * SIZE(B) + movsd %xmm1, -15 * SIZE(B) + + movsd %xmm0, -16 * SIZE(BB) + movsd %xmm0, -15 * SIZE(BB) + movsd %xmm1, -14 * SIZE(BB) + movsd %xmm1, -13 * SIZE(BB) +#else + movsd %xmm0, -16 * SIZE(AA) + movsd %xmm1, -15 * SIZE(AA) +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $0 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L99: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_2 + +.L999: + movl OLD_STACK, %esp + + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_2 + + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_RT_4x2_sse2.S b/kernel/x86/trsm_kernel_RT_4x2_sse2.S new file mode 100644 index 0000000000..68b52ba524 --- /dev/null +++ b/kernel/x86/trsm_kernel_RT_4x2_sse2.S @@ -0,0 +1,2282 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if !defined(HAVE_SSE2) || !defined(HAVE_MMX) +#error You have to check your configuration. 
+#endif + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_ALPHA 16 + STACK + ARGS(%esi) +#define STACK_A 24 + STACK + ARGS(%esi) +#define STACK_B 28 + STACK + ARGS(%esi) +#define STACK_C 32 + STACK + ARGS(%esi) +#define STACK_LDC 36 + STACK + ARGS(%esi) +#define STACK_OFFT 40 + STACK + ARGS(%esi) + +#define ALPHA 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define AORIG 56(%esp) +#define BORIG 60(%esp) +#define BUFFER 128(%esp) + +#define B %edi +#define LDC %ebp + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#define AA %edx +#define BB %ecx + +#define PREFETCHSIZE (8 * 4) + +#define KERNEL1(address) \ + movq (PREFETCHSIZE + 0) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 0 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 2 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 4 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulpd %xmm0, %xmm2; \ + mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 6 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 16 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + movq (PREFETCHSIZE + 8) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm1, %xmm3; \ + mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 8 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 10 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 12 * SIZE + (address) * SIZE(AA), %xmm1 + +#define KERNEL4(address) \ + mulpd %xmm1, %xmm3; \ + mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 14 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 24 * SIZE + (address) * SIZE(AA), %xmm1 + +#define KERNEL5(address) \ + movq (PREFETCHSIZE + 16) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 18 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 20 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL6(address) \ + mulpd %xmm0, %xmm2; \ + mulpd 22 * 
SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 22 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 32 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 32 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL7(address) \ + movq (PREFETCHSIZE + 24) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm1, %xmm3; \ + mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 26 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 28 * SIZE + (address) * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulpd %xmm1, %xmm3; \ + mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 30 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 40 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 40 * SIZE + (address) * SIZE(AA), %xmm1 + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movd STACK_M, %mm0 + movl STACK_N, %eax + movd STACK_K, %mm1 + movd STACK_A, %mm2 + movl STACK_B, B + movd STACK_C, %mm3 + movl STACK_LDC, LDC + movd STACK_OFFT, %mm4 + + movd %mm1, K + movl %eax, N + movd %mm0, M + movd %mm2, A + movd %mm3, C + movl %esi, OLD_STACK + movd %mm4, OFFSET + movd %mm4, KK + + sall $BASE_SHIFT, LDC + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + testl $1, %eax + jle .L100 + ALIGN_2 + +.L101: +/* Copying to Sub Buffer */ +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, %ecx + +#ifdef RT + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + leal (, %eax, SIZE), %eax + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + jle .L103 + ALIGN_4 + +.L102: + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm2 + movsd 3 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm4 + movsd 5 * SIZE(B), %xmm5 + movsd 6 * SIZE(B), %xmm6 + movsd 7 * SIZE(B), %xmm7 + + unpcklpd %xmm0, %xmm0 + unpcklpd %xmm1, %xmm1 + unpcklpd %xmm2, %xmm2 + unpcklpd %xmm3, %xmm3 + unpcklpd %xmm4, %xmm4 + unpcklpd %xmm5, %xmm5 + unpcklpd %xmm6, %xmm6 + unpcklpd %xmm7, %xmm7 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) + movapd %xmm2, 4 * SIZE(%ecx) + movapd %xmm3, 6 * SIZE(%ecx) + movapd %xmm4, 
8 * SIZE(%ecx) + movapd %xmm5, 10 * SIZE(%ecx) + movapd %xmm6, 12 * SIZE(%ecx) + movapd %xmm7, 14 * SIZE(%ecx) + + prefetcht0 104 * SIZE(B) + + addl $ 8 * SIZE, B + addl $16 * SIZE, %ecx + decl %eax + BRANCH + jne .L102 + ALIGN_2 + +.L103: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + BRANCH + jle .L105 + ALIGN_2 + +.L104: + movsd 0 * SIZE(B), %xmm0 + + unpcklpd %xmm0, %xmm0 + + movapd %xmm0, 0 * SIZE(%ecx) + + addl $1 * SIZE, B + addl $2 * SIZE, %ecx + decl %eax + jne .L104 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, %esi # coffset = c +#ifndef RT + addl LDC, C +#endif + + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L130 + ALIGN_4 + +.L110: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $0 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L112 + +.L111: + mulpd %xmm2, %xmm0 + mulpd 2 * SIZE(AA), %xmm2 + addpd %xmm0, %xmm4 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm6 + movapd 2 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm0 + mulpd 6 * SIZE(AA), %xmm2 + addpd %xmm0, %xmm5 + movapd 16 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movapd 4 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm1 + mulpd 10 * SIZE(AA), %xmm2 + addpd %xmm1, %xmm4 + movapd 12 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm6 + movapd 6 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm1 + mulpd 14 * SIZE(AA), %xmm2 + addpd %xmm1, %xmm5 + movapd 24 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm7 + movapd 16 * SIZE(BB), %xmm2 + mulpd %xmm3, %xmm0 + mulpd 18 * SIZE(AA), %xmm3 + addpd %xmm0, %xmm4 + movapd 20 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm6 + movapd 10 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm0 + mulpd 22 * SIZE(AA), %xmm3 + addpd %xmm0, %xmm5 + movapd 32 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm7 + movapd 12 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm1 + mulpd 26 * SIZE(AA), %xmm3 + addpd %xmm1, %xmm4 + movapd 28 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm6 + movapd 14 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm1 + mulpd 30 * SIZE(AA), %xmm3 + addpd %xmm1, %xmm5 + movapd 40 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm7 + movapd 24 * SIZE(BB), %xmm3 + + addl $32 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L111 + +.L112: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L114 + +.L113: + mulpd %xmm2, %xmm0 + mulpd 2 * SIZE(AA), %xmm2 + addpd %xmm0, %xmm4 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm6 + movapd 2 * SIZE(BB), %xmm2 + + addl $4 * SIZE, AA # aoffset += 8 + addl $2 * SIZE, BB # boffset1 += 8 + subl $1, %eax + jg .L113 + ALIGN_4 + +.L114: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#if 
defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm0 + movapd 2 * SIZE(B), %xmm1 +#else + movapd 0 * SIZE(AA), %xmm0 + movapd 2 * SIZE(AA), %xmm1 +#endif + + subpd %xmm4, %xmm0 + subpd %xmm6, %xmm1 + +#ifdef LN + movapd %xmm0, %xmm2 + unpckhpd %xmm2, %xmm2 + + movapd %xmm1, %xmm3 + unpckhpd %xmm3, %xmm3 + + movsd 15 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm3 + + movsd 14 * SIZE(AA), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm1 + movsd 13 * SIZE(AA), %xmm6 + mulsd %xmm3, %xmm6 + subsd %xmm6, %xmm2 + movsd 12 * SIZE(AA), %xmm7 + mulsd %xmm3, %xmm7 + subsd %xmm7, %xmm0 + + movsd 10 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm1 + + movsd 9 * SIZE(AA), %xmm5 + mulsd %xmm1, %xmm5 + subsd %xmm5, %xmm2 + movsd 8 * SIZE(AA), %xmm6 + mulsd %xmm1, %xmm6 + subsd %xmm6, %xmm0 + + movsd 5 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + + movsd 4 * SIZE(AA), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + unpcklpd %xmm2, %xmm0 + unpcklpd %xmm3, %xmm1 +#endif + +#ifdef LT + movapd %xmm0, %xmm2 + unpckhpd %xmm2, %xmm2 + + movapd %xmm1, %xmm3 + unpckhpd %xmm3, %xmm3 + + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 1 * SIZE(AA), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + movsd 2 * SIZE(AA), %xmm6 + mulsd %xmm0, %xmm6 + subsd %xmm6, %xmm1 + movsd 3 * SIZE(AA), %xmm7 + mulsd %xmm0, %xmm7 + subsd %xmm7, %xmm3 + + movsd 5 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + + movsd 6 * SIZE(AA), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm1 + movsd 7 * SIZE(AA), %xmm6 + mulsd %xmm2, %xmm6 + subsd %xmm6, %xmm3 + + movsd 10 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm1 + + movsd 11 * SIZE(AA), %xmm5 + mulsd %xmm1, %xmm5 + subsd %xmm5, %xmm3 + + movsd 15 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm3 + + unpcklpd %xmm2, %xmm0 + unpcklpd %xmm3, %xmm1 +#endif + +#if defined(RN) || defined(RT) + movsd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, 0 * SIZE(B) + movapd %xmm1, 2 * SIZE(B) + + movsd %xmm0, 0 * SIZE(BB) + movsd %xmm0, 1 * SIZE(BB) + movhpd %xmm0, 2 * SIZE(BB) + movhpd %xmm0, 3 * SIZE(BB) + movsd %xmm1, 4 * SIZE(BB) + movsd %xmm1, 5 * SIZE(BB) + movhpd %xmm1, 6 * SIZE(BB) + movhpd %xmm1, 7 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) + movapd %xmm1, 2 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, %esi +#endif + + movsd %xmm0, 0 * SIZE(%esi) + movhpd %xmm0, 1 * SIZE(%esi) + movsd %xmm1, 2 * SIZE(%esi) + movhpd %xmm1, 3 * SIZE(%esi) + +#ifndef LN + addl $4 * SIZE, %esi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + BRANCH + decl %ebx # i -- + jg .L110 + ALIGN_2 + +.L130: + movl M, %ebx + testl $2, %ebx + jle .L150 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal BUFFER, BB + + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $0 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + +#if 
defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L132 + +.L131: + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + mulpd 2 * SIZE(BB), %xmm0 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 4 * SIZE(AA), %xmm0 + mulpd 4 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm6 + movapd 6 * SIZE(AA), %xmm0 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm7 + movapd 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm3 + movapd 10 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm4 + mulpd 10 * SIZE(BB), %xmm1 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movapd 12 * SIZE(AA), %xmm1 + mulpd 12 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm6 + movapd 14 * SIZE(AA), %xmm1 + mulpd 14 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm7 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $16 * SIZE, BB + BRANCH + decl %eax + jne .L131 + +.L132: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L134 + +.L133: + movapd 0 * SIZE(AA), %xmm0 + mulpd 0 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm4 + + addl $2 * SIZE, AA # aoffset += 8 + addl $2 * SIZE, BB # boffset1 += 8 + decl %eax + BRANCH + jg .L133 + ALIGN_4 + +.L134: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + addpd %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm0 +#else + movapd 0 * SIZE(AA), %xmm0 +#endif + + subpd %xmm4, %xmm0 + +#ifdef LN + movapd %xmm0, %xmm2 + unpckhpd %xmm2, %xmm2 + + movsd 3 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + + movsd 2 * SIZE(AA), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + unpcklpd %xmm2, %xmm0 +#endif + +#ifdef LT + movapd %xmm0, %xmm2 + unpckhpd %xmm2, %xmm2 + + movsd 0 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 1 * SIZE(AA), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + movsd 3 * SIZE(AA), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm2, %xmm0 +#endif + +#if defined(RN) || defined(RT) + movsd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, 0 * SIZE(B) + + movsd %xmm0, 0 * SIZE(BB) + movsd %xmm0, 1 * SIZE(BB) + movhpd %xmm0, 2 * SIZE(BB) + movhpd %xmm0, 3 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, %esi +#endif + + movsd %xmm0, 0 * SIZE(%esi) + movhpd %xmm0, 1 * SIZE(%esi) + +#ifndef LN + addl $2 * SIZE, %esi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L150: + movl M, %ebx + testl $1, %ebx + jle .L159 + +#ifdef LN + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA +#endif + + leal BUFFER, BB + + movsd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movsd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movsd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movsd 4 * SIZE(AA), 
%xmm1 + pxor %xmm7, %xmm7 + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $0 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L152 + +.L151: + mulsd %xmm0, %xmm2 + movsd 1 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + mulsd 2 * SIZE(BB), %xmm0 + movsd 16 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + mulsd 4 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm4 + movsd 3 * SIZE(AA), %xmm0 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm4 + movsd 8 * SIZE(AA), %xmm0 + mulsd %xmm1, %xmm3 + movsd 5 * SIZE(AA), %xmm1 + addsd %xmm3, %xmm4 + mulsd 10 * SIZE(BB), %xmm1 + movsd 24 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm4 + movsd 6 * SIZE(AA), %xmm1 + mulsd 12 * SIZE(BB), %xmm1 + addsd %xmm1, %xmm4 + movsd 7 * SIZE(AA), %xmm1 + mulsd 14 * SIZE(BB), %xmm1 + addsd %xmm1, %xmm4 + movsd 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $16 * SIZE, BB + BRANCH + decl %eax + jne .L151 + +.L152: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L154 + +.L153: + movsd 0 * SIZE(AA), %xmm0 + mulsd 0 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm4 + + addl $1 * SIZE, AA # aoffset += 8 + addl $2 * SIZE, BB # boffset1 += 8 + decl %eax + BRANCH + jg .L153 + ALIGN_4 + +.L154: + addsd %xmm6, %xmm4 + addsd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax + subl $1, %eax + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), B + leal (BB, %eax, 2), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(B), %xmm0 +#else + movsd 0 * SIZE(AA), %xmm0 +#endif + + subsd %xmm4, %xmm0 + +#if defined(LN) || defined(LT) + mulsd 0 * SIZE(AA), %xmm0 +#endif + +#if defined(RN) || defined(RT) + mulsd 0 * SIZE(B), %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(B) + + movsd %xmm0, 0 * SIZE(BB) + movsd %xmm0, 1 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, %esi +#endif + + movsd %xmm0, 0 * SIZE(%esi) + +#ifndef LN + addl $1 * SIZE, %esi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA +#ifdef LT + addl $1 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $0 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L159: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 1), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (B, %eax, 1), B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_2 + +.L100: + movl N, %eax + sarl $1, %eax # j = (n >> 1) + movl %eax, J + jle .L999 + ALIGN_2 + +.L01: +/* Copying to Sub Buffer */ +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, %ecx + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + jle .L03 + ALIGN_2 + +.L02: + movsd 0 * SIZE(B), %xmm0 + movsd 1 * 
SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm2 + movsd 3 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm4 + movsd 5 * SIZE(B), %xmm5 + movsd 6 * SIZE(B), %xmm6 + movsd 7 * SIZE(B), %xmm7 + + unpcklpd %xmm0, %xmm0 + unpcklpd %xmm1, %xmm1 + unpcklpd %xmm2, %xmm2 + unpcklpd %xmm3, %xmm3 + unpcklpd %xmm4, %xmm4 + unpcklpd %xmm5, %xmm5 + unpcklpd %xmm6, %xmm6 + unpcklpd %xmm7, %xmm7 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) + movapd %xmm2, 4 * SIZE(%ecx) + movapd %xmm3, 6 * SIZE(%ecx) + movapd %xmm4, 8 * SIZE(%ecx) + movapd %xmm5, 10 * SIZE(%ecx) + movapd %xmm6, 12 * SIZE(%ecx) + movapd %xmm7, 14 * SIZE(%ecx) + + prefetcht0 104 * SIZE(B) + + addl $ 8 * SIZE, B + addl $16 * SIZE, %ecx + decl %eax + jne .L02 + ALIGN_2 + +.L03: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax + BRANCH + jle .L05 + ALIGN_4 + +.L04: + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + + unpcklpd %xmm0, %xmm0 + unpcklpd %xmm1, %xmm1 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) + + addl $2 * SIZE, B + addl $4 * SIZE, %ecx + decl %eax + jne .L04 + ALIGN_4 + +.L05: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, %esi # coffset = c +#ifndef RT + addl %eax, C +#endif + + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L30 + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + + prefetcht2 4 * SIZE(%esi) + prefetcht2 4 * SIZE(%esi, LDC) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + +#ifdef PENTIUM4 + andl $-8, %eax + NOBRANCH + je .L12 + sall $3, %eax + +.L1X: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + cmpl $64 * 1, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 1) + KERNEL2(32 * 1) + KERNEL3(32 * 1) + KERNEL4(32 * 1) + KERNEL5(32 * 1) + KERNEL6(32 * 1) + KERNEL7(32 * 1) + KERNEL8(32 * 1) + cmpl $64 * 2, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 2) + KERNEL2(32 * 2) + KERNEL3(32 * 2) + KERNEL4(32 * 2) + KERNEL5(32 * 2) + KERNEL6(32 * 2) + KERNEL7(32 * 2) + KERNEL8(32 * 2) + cmpl $64 * 3, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 3) + KERNEL2(32 * 3) + KERNEL3(32 * 3) + KERNEL4(32 * 3) + KERNEL5(32 * 3) + KERNEL6(32 * 3) + KERNEL7(32 * 3) + KERNEL8(32 * 3) + cmpl $64 * 4, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 4) + KERNEL2(32 * 4) + KERNEL3(32 * 4) + KERNEL4(32 * 4) + KERNEL5(32 * 4) + KERNEL6(32 * 4) + KERNEL7(32 * 4) + KERNEL8(32 * 4) + cmpl $64 * 5, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 5) + KERNEL2(32 * 5) + KERNEL3(32 * 5) + KERNEL4(32 * 5) + KERNEL5(32 * 5) + KERNEL6(32 * 5) + KERNEL7(32 * 5) + KERNEL8(32 * 5) + cmpl $64 * 6, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 6) + KERNEL2(32 * 6) + KERNEL3(32 * 6) + KERNEL4(32 * 6) + KERNEL5(32 * 6) + KERNEL6(32 * 6) + KERNEL7(32 * 6) + KERNEL8(32 * 6) + cmpl $64 * 7, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 7) + 
KERNEL2(32 * 7) + KERNEL3(32 * 7) + KERNEL4(32 * 7) + KERNEL5(32 * 7) + KERNEL6(32 * 7) + KERNEL7(32 * 7) + KERNEL8(32 * 7) + + addl $64 * 4 * SIZE, AA + addl $64 * 4 * SIZE, BB + subl $64 * 8, %eax + BRANCH + jg .L1X + +.L11: + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB + +#else + sarl $3, %eax + je .L12 + +.L11: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + + addl $32 * SIZE, %ecx + addl $32 * SIZE, %edx + decl %eax + jne .L11 +#endif + +.L12: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + + je .L14 + +.L13: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 0 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movapd 4 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA # aoffset += 8 + addl $4 * SIZE, BB # boffset1 += 8 + subl $1, %eax + jg .L13 + ALIGN_4 + +.L14: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd %xmm6, %xmm1 + unpcklpd %xmm7, %xmm6 + unpckhpd %xmm7, %xmm1 + + movapd 0 * SIZE(B), %xmm2 + movapd 2 * SIZE(B), %xmm3 + movapd 4 * SIZE(B), %xmm5 + movapd 6 * SIZE(B), %xmm7 + + subpd %xmm4, %xmm2 + subpd %xmm0, %xmm3 + subpd %xmm6, %xmm5 + subpd %xmm1, %xmm7 +#else + movapd 0 * SIZE(AA), %xmm0 + movapd 2 * SIZE(AA), %xmm1 + movapd 4 * SIZE(AA), %xmm2 + movapd 6 * SIZE(AA), %xmm3 + + subpd %xmm4, %xmm0 + subpd %xmm6, %xmm1 + subpd %xmm5, %xmm2 + subpd %xmm7, %xmm3 +#endif + +#ifdef LN + movsd 15 * SIZE(AA), %xmm0 + movhpd 15 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm7 + movsd 14 * SIZE(AA), %xmm0 + movhpd 14 * SIZE(AA), %xmm0 + mulpd %xmm7, %xmm0 + subpd %xmm0, %xmm5 + movsd 13 * SIZE(AA), %xmm0 + movhpd 13 * SIZE(AA), %xmm0 + mulpd %xmm7, %xmm0 + subpd %xmm0, %xmm3 + movsd 12 * SIZE(AA), %xmm0 + movhpd 12 * SIZE(AA), %xmm0 + mulpd %xmm7, %xmm0 + subpd %xmm0, %xmm2 + + movsd 10 * SIZE(AA), %xmm0 + movhpd 10 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm5 + movsd 9 * SIZE(AA), %xmm0 + movhpd 9 * SIZE(AA), %xmm0 + mulpd %xmm5, %xmm0 + subpd %xmm0, %xmm3 + movsd 8 * SIZE(AA), %xmm0 + movhpd 8 * SIZE(AA), %xmm0 + mulpd %xmm5, %xmm0 + subpd %xmm0, %xmm2 + + movsd 5 * SIZE(AA), %xmm0 + movhpd 5 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + movsd 4 * SIZE(AA), %xmm0 + movhpd 4 * SIZE(AA), %xmm0 + mulpd %xmm3, %xmm0 + subpd %xmm0, %xmm2 + + movsd 0 * SIZE(AA), %xmm0 + movhpd 0 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef LT + movsd 0 * SIZE(AA), %xmm0 + movhpd 0 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + movsd 1 * SIZE(AA), %xmm0 + movhpd 1 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm0 + subpd %xmm0, %xmm3 + movsd 2 * SIZE(AA), %xmm0 + movhpd 2 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm0 + subpd %xmm0, %xmm5 + movsd 3 * SIZE(AA), %xmm0 + movhpd 3 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm0 + subpd %xmm0, %xmm7 + + movsd 5 * SIZE(AA), %xmm0 + movhpd 5 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + + movsd 6 * SIZE(AA), %xmm0 + movhpd 6 * SIZE(AA), %xmm0 + mulpd %xmm3, %xmm0 + subpd %xmm0, %xmm5 + movsd 7 * SIZE(AA), %xmm0 + movhpd 7 * SIZE(AA), 
%xmm0 + mulpd %xmm3, %xmm0 + subpd %xmm0, %xmm7 + + movsd 10 * SIZE(AA), %xmm0 + movhpd 10 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm5 + movsd 11 * SIZE(AA), %xmm0 + movhpd 11 * SIZE(AA), %xmm0 + mulpd %xmm5, %xmm0 + subpd %xmm0, %xmm7 + + movsd 15 * SIZE(AA), %xmm0 + movhpd 15 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm7 +#endif + +#ifdef RN + movsd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 + + movsd 1 * SIZE(B), %xmm4 + movhpd 1 * SIZE(B), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm2 + movsd 1 * SIZE(B), %xmm4 + movhpd 1 * SIZE(B), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm3 + + movsd 3 * SIZE(B), %xmm4 + movhpd 3 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm3 +#endif + +#ifdef RT + movsd 3 * SIZE(B), %xmm4 + movhpd 3 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm2 + mulpd %xmm4, %xmm3 + + movsd 2 * SIZE(B), %xmm4 + movhpd 2 * SIZE(B), %xmm4 + mulpd %xmm2, %xmm4 + subpd %xmm4, %xmm0 + movsd 2 * SIZE(B), %xmm4 + movhpd 2 * SIZE(B), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm1 + + movsd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + mulpd %xmm4, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(B) + movapd %xmm3, 2 * SIZE(B) + movapd %xmm5, 4 * SIZE(B) + movapd %xmm7, 6 * SIZE(B) + + movsd %xmm2, 0 * SIZE(BB) + movsd %xmm2, 1 * SIZE(BB) + movhpd %xmm2, 2 * SIZE(BB) + movhpd %xmm2, 3 * SIZE(BB) + movsd %xmm3, 4 * SIZE(BB) + movsd %xmm3, 5 * SIZE(BB) + movhpd %xmm3, 6 * SIZE(BB) + movhpd %xmm3, 7 * SIZE(BB) + movsd %xmm5, 8 * SIZE(BB) + movsd %xmm5, 9 * SIZE(BB) + movhpd %xmm5, 10 * SIZE(BB) + movhpd %xmm5, 11 * SIZE(BB) + movsd %xmm7, 12 * SIZE(BB) + movsd %xmm7, 13 * SIZE(BB) + movhpd %xmm7, 14 * SIZE(BB) + movhpd %xmm7, 15 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) + movapd %xmm1, 2 * SIZE(AA) + movapd %xmm2, 4 * SIZE(AA) + movapd %xmm3, 6 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, %esi +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(%esi) + movsd %xmm3, 1 * SIZE(%esi) + movsd %xmm5, 2 * SIZE(%esi) + movsd %xmm7, 3 * SIZE(%esi) + + movhpd %xmm2, 0 * SIZE(%esi, LDC) + movhpd %xmm3, 1 * SIZE(%esi, LDC) + movhpd %xmm5, 2 * SIZE(%esi, LDC) + movhpd %xmm7, 3 * SIZE(%esi, LDC) +#else + movsd %xmm0, 0 * SIZE(%esi) + movhpd %xmm0, 1 * SIZE(%esi) + movsd %xmm1, 2 * SIZE(%esi) + movhpd %xmm1, 3 * SIZE(%esi) + + movsd %xmm2, 0 * SIZE(%esi, LDC) + movhpd %xmm2, 1 * SIZE(%esi, LDC) + movsd %xmm3, 2 * SIZE(%esi, LDC) + movhpd %xmm3, 3 * SIZE(%esi, LDC) +#endif + +#ifndef LN + addl $4 * SIZE, %esi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA +#ifdef LT + addl $8 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L10 + ALIGN_2 + +.L30: + movl M, %ebx + testl $2, %ebx + jle .L50 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#if 
defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L32 + +.L31: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movapd 4 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 10 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd 12 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movapd 6 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 14 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm7 + movapd 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd 18 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + movapd 20 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movapd 10 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm2 + mulpd 22 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + movapd 32 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm7 + movapd 12 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 26 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm4 + movapd 28 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movapd 14 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 30 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm6 + movapd 40 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm7 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + BRANCH + decl %eax + jne .L31 + +.L32: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L34 + +.L33: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA # aoffset += 8 + addl $4 * SIZE, BB # boffset1 += 8 + decl %eax + BRANCH + jg .L33 + ALIGN_4 + +.L34: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movapd %xmm4, %xmm0 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm0 + + movapd 0 * SIZE(B), %xmm2 + movapd 2 * SIZE(B), %xmm3 + + subpd %xmm4, %xmm2 + subpd %xmm0, %xmm3 +#else + movapd 0 * SIZE(AA), %xmm0 + movapd 2 * SIZE(AA), %xmm1 + + subpd %xmm4, %xmm0 + subpd %xmm5, %xmm1 +#endif + +#ifdef LN + movsd 3 * SIZE(AA), %xmm0 + movhpd 3 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + + movsd 2 * SIZE(AA), %xmm0 + movhpd 2 * SIZE(AA), %xmm0 + mulpd %xmm3, %xmm0 + subpd %xmm0, %xmm2 + + movsd 0 * SIZE(AA), %xmm0 + movhpd 0 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef LT + movsd 0 * SIZE(AA), %xmm0 + movhpd 0 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + movsd 1 * SIZE(AA), %xmm0 + movhpd 1 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm0 + subpd %xmm0, %xmm3 + movsd 3 * SIZE(AA), %xmm0 + movhpd 3 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 +#endif + +#ifdef RN + movsd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm0 + + movsd 1 * SIZE(B), %xmm4 + movhpd 1 * SIZE(B), %xmm4 + mulpd %xmm0, %xmm4 + subpd %xmm4, %xmm1 + + movsd 3 * SIZE(B), %xmm4 + movhpd 3 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm1 +#endif + +#ifdef RT + movsd 3 * SIZE(B), %xmm4 + movhpd 3 * SIZE(B), %xmm4 + mulpd %xmm4, %xmm1 + + movsd 2 * SIZE(B), %xmm4 + movhpd 2 * SIZE(B), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm0 + + movsd 0 * SIZE(B), %xmm4 + movhpd 0 * SIZE(B), 
%xmm4 + mulpd %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(B) + movapd %xmm3, 2 * SIZE(B) + + movsd %xmm2, 0 * SIZE(BB) + movsd %xmm2, 1 * SIZE(BB) + movhpd %xmm2, 2 * SIZE(BB) + movhpd %xmm2, 3 * SIZE(BB) + movsd %xmm3, 4 * SIZE(BB) + movsd %xmm3, 5 * SIZE(BB) + movhpd %xmm3, 6 * SIZE(BB) + movhpd %xmm3, 7 * SIZE(BB) +#else + movapd %xmm0, 0 * SIZE(AA) + movapd %xmm1, 2 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, %esi +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(%esi) + movsd %xmm3, 1 * SIZE(%esi) + movhpd %xmm2, 0 * SIZE(%esi, LDC) + movhpd %xmm3, 1 * SIZE(%esi, LDC) +#else + movsd %xmm0, 0 * SIZE(%esi) + movhpd %xmm0, 1 * SIZE(%esi) + movsd %xmm1, 0 * SIZE(%esi, LDC) + movhpd %xmm1, 1 * SIZE(%esi, LDC) +#endif + +#ifndef LN + addl $2 * SIZE, %esi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L50: + movl M, %ebx + testl $1, %ebx + jle .L99 + +#ifdef LN + movl K, %eax + sall $0 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA +#endif + + leal BUFFER, %ecx + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movsd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movsd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movsd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movsd 4 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L52 + +.L51: + mulsd %xmm0, %xmm2 + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movsd 1 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm2 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movsd 16 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movsd 2 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + mulsd 10 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm4 + movsd 12 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm5 + movsd 3 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + mulsd 14 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm4 + movsd 24 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm5 + movsd 8 * SIZE(AA), %xmm0 + mulsd %xmm1, %xmm2 + mulsd 18 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm4 + movsd 20 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + movsd 5 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm2 + mulsd 22 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm4 + movsd 32 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + movsd 6 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm3 + mulsd 26 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm4 + movsd 28 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm5 + movsd 7 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm3 + mulsd 30 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm4 + movsd 40 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm5 + movsd 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $32 * SIZE, BB + BRANCH + decl %eax + jne .L51 + +.L52: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L54 + +.L53: + mulsd %xmm0, %xmm2 + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movsd 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA # aoffset += 8 + addl $4 * SIZE, BB # 
boffset1 += 8 + decl %eax + BRANCH + jg .L53 + ALIGN_4 + +.L54: + addsd %xmm6, %xmm4 + addsd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 +#else + movsd 0 * SIZE(AA), %xmm0 + movsd 1 * SIZE(AA), %xmm1 +#endif + + subsd %xmm4, %xmm0 + subsd %xmm5, %xmm1 + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(AA), %xmm2 + mulsd %xmm2, %xmm0 + mulsd %xmm2, %xmm1 +#endif + +#ifdef RN + mulsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm1 + mulsd 3 * SIZE(B), %xmm1 +#endif + +#ifdef RT + mulsd 3 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm4 + mulsd %xmm1, %xmm4 + subsd %xmm4, %xmm0 + + mulsd 0 * SIZE(B), %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(B) + movsd %xmm1, 1 * SIZE(B) + + movsd %xmm0, 0 * SIZE(BB) + movsd %xmm0, 1 * SIZE(BB) + movsd %xmm1, 2 * SIZE(BB) + movsd %xmm1, 3 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) + movsd %xmm1, 1 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, %esi +#endif + + movsd %xmm0, 0 * SIZE(%esi) + movsd %xmm1, 0 * SIZE(%esi, LDC) + +#ifndef LN + addl $1 * SIZE, %esi +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $0 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L99: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_2 + +.L999: + movl OLD_STACK, %esp + + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_2 + + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_RT_4x4_penryn.S b/kernel/x86/trsm_kernel_RT_4x4_penryn.S new file mode 100644 index 0000000000..40a9604d3a --- /dev/null +++ b/kernel/x86/trsm_kernel_RT_4x4_penryn.S @@ -0,0 +1,3128 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#define A 20 + STACK + ARGS(%esp) +#define ARG_B 24 + STACK + ARGS(%esp) +#define C 28 + STACK + ARGS(%esp) +#define ARG_LDC 32 + STACK + ARGS(%esp) +#define OFFSET 36 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) +#define AORIG 12 + STACK(%esp) + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 21 + 4) +#endif + +#ifdef NEHALEM +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 21 + 4) +#endif + +#ifdef ATOM +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 8 + 4) +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (16 * 2) +#endif + +#define B %edi +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define CO1 %esi + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + + movl OFFSET, %eax +#ifdef RN + negl %eax +#endif + movl %eax, KK + + leal (, LDC, SIZE), LDC + + subl $-32 * SIZE, A + subl $-32 * SIZE, B + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + testl $1, N + je .L40 + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, B +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L100 + ALIGN_4 + +.L91: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movsd -32 * SIZE(BB), %xmm1 + + pxor %xmm4, %xmm4 +#ifdef LN + prefetcht0 -4 * SIZE(CO1) +#else + prefetcht0 3 * SIZE(CO1) +#endif + pxor %xmm5, %xmm5 + 
+#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L95 + ALIGN_4 + +.L92: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm5 + pshufd $0x55, %xmm1, %xmm2 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -24 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm5 + pshufd $0x55, %xmm1, %xmm2 + movsd -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -16 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm5 + pshufd $0x55, %xmm1, %xmm2 + movsd -26 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm5 + pshufd $0x55, %xmm1, %xmm2 + movsd -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + + subl $-32 * SIZE, AA + subl $ -8 * SIZE, BB + + subl $1, %eax + jne .L92 + ALIGN_4 + +.L95: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L98 + ALIGN_4 + +.L96: + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + movss -31 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L96 + ALIGN_4 + +.L98: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 1), BB +#endif + + addps %xmm2, %xmm4 + addps %xmm5, %xmm4 + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm0 + unpcklps %xmm6, %xmm4 + unpckhps %xmm6, %xmm0 + + movaps %xmm5, %xmm1 + unpcklps %xmm7, %xmm5 + unpckhps %xmm7, %xmm1 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movss -32 * SIZE(BB), %xmm1 + movss -31 * SIZE(BB), %xmm3 + movss -30 * SIZE(BB), %xmm5 + movss -29 * SIZE(BB), %xmm7 + + subss %xmm4, %xmm1 + subss %xmm6, %xmm3 + subss %xmm0, %xmm5 + subss %xmm2, %xmm7 +#else + movaps -32 * SIZE(AA), %xmm0 + + subps %xmm4, %xmm0 +#endif + +#ifdef LN + movaps -20 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm6, %xmm7 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm7, %xmm6 + subss %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm7, %xmm6 + subss %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm7, %xmm6 + subss %xmm6, %xmm1 + + movaps -24 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm5, %xmm6 + subss %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm5, %xmm6 + subss %xmm6, %xmm1 + + movaps -28 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm3, %xmm6 + subss %xmm6, %xmm1 + + movaps -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm6, %xmm1 +#endif + +#ifdef LT + movaps -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm6, %xmm1 + + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm1, %xmm6 + subss %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm1, %xmm6 + subss %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulss 
%xmm1, %xmm6 + subss %xmm6, %xmm7 + + movaps -28 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm3, %xmm6 + subss %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm3, %xmm6 + subss %xmm6, %xmm7 + + movaps -24 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm5, %xmm6 + subss %xmm6, %xmm7 + + movaps -20 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm6, %xmm7 +#endif + +#if defined(RN) || defined(RT) + movss -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, -32 * SIZE(BB) + movss %xmm3, -31 * SIZE(BB) + movss %xmm5, -30 * SIZE(BB) + movss %xmm7, -29 * SIZE(BB) +#else + movaps %xmm0, -32 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm5, %xmm1 + unpcklps %xmm7, %xmm3 + + unpcklps %xmm3, %xmm1 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 1), BB +#endif + +#ifdef LN + subl $4, KK +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L91 + ALIGN_4 + +.L100: + testl $2, M + je .L110 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm3, %xmm3 + movsd -32 * SIZE(BB), %xmm1 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L105 + ALIGN_4 + +.L102: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + + pshufd $0x55, %xmm1, %xmm2 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movsd -28 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movsd -26 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + + pshufd $0x55, %xmm1, %xmm2 + movsd -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movsd -24 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movsd -22 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + + pshufd $0x55, %xmm1, %xmm2 + movsd -26 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movsd -20 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movsd -18 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + + pshufd $0x55, %xmm1, %xmm2 + movsd -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movsd -16 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + + subl $-16 * SIZE, AA + subl $ -8 * SIZE, BB + + subl $1, %eax + jne .L102 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L108 + ALIGN_4 + +.L106: + pshufd $0x00, %xmm1, %xmm2 + movss -31 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + + addl $2 * SIZE, AA + addl $1 * SIZE, BB + 
decl %eax + jg .L106 + ALIGN_4 + +.L108: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), BB +#endif + + addps %xmm5, %xmm4 + +#if defined(LN) || defined(LT) + pshufd $1, %xmm4, %xmm6 + + movss -32 * SIZE(BB), %xmm1 + movss -31 * SIZE(BB), %xmm3 + + subss %xmm4, %xmm1 + subss %xmm6, %xmm3 +#else + movsd -32 * SIZE(AA), %xmm0 + + subps %xmm4, %xmm0 +#endif + +#ifdef LN + movsd -32 * SIZE(AA), %xmm4 + movhps -30 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm3, %xmm6 + subss %xmm6, %xmm1 + + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm6, %xmm1 +#endif + +#ifdef LT + movaps -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm6, %xmm1 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm1, %xmm6 + subss %xmm6, %xmm3 + + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm6, %xmm3 +#endif + +#if defined(RN) || defined(RT) + movss -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, -32 * SIZE(BB) + movss %xmm3, -31 * SIZE(BB) +#else + movlps %xmm0, -32 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(CO1) + movss %xmm3, 1 * SIZE(CO1) +#else + movlps %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 1), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L110: + testl $1, M + je .L119 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movsd -32 * SIZE(BB), %xmm1 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L115 + ALIGN_4 + +.L112: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + mulps %xmm0, %xmm1 + movsd -30 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movsd -30 * SIZE(BB), %xmm1 + + mulps %xmm0, %xmm1 + movsd -28 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movsd -28 * SIZE(BB), %xmm1 + + mulps %xmm0, %xmm1 + movsd -26 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movsd -26 * SIZE(BB), %xmm1 + + mulps %xmm0, %xmm1 + movsd -24 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movsd -24 * SIZE(BB), %xmm1 + + subl $-8 * SIZE, AA + subl $-8 * SIZE, BB + + subl $1, %eax + jne .L112 + ALIGN_4 + +.L115: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulss %xmm0, %xmm1 + movss -31 * SIZE(AA), %xmm0 + addss %xmm1, %xmm4 + movss -31 * SIZE(BB), %xmm1 + + addl $1 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L116 + ALIGN_4 + +.L118: +#if defined(LN) || defined(RT) + movl KK, %eax + subl $1, %eax + + movl AORIG, AA + + leal (AA, %eax, SIZE), AA + leal (B, %eax, SIZE), BB +#endif + + haddps %xmm4, %xmm4 + +#if defined(LN) || defined(LT) + movss -32 * SIZE(BB), %xmm1 + subss %xmm4, %xmm1 +#else + movss -32 * 
SIZE(AA), %xmm0 + subss %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + mulss -32 * SIZE(AA), %xmm1 +#endif + +#if defined(RN) || defined(RT) + mulss -32 * SIZE(BB), %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, -32 * SIZE(BB) +#else + movss %xmm0, -32 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(CO1) +#else + movss %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (AA, %eax, SIZE), AA + leal (BB, %eax, SIZE), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L119: +#ifdef LN + movl K, %eax + leal (B, %eax, SIZE), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L40: + testl $2, N + je .L80 + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + +#ifdef LN + pxor %xmm4, %xmm4 + prefetcht0 -4 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 -4 * SIZE(CO1, LDC) +#else + pxor %xmm4, %xmm4 + prefetcht0 3 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 3 * SIZE(CO1, LDC) +#endif + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -28 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0xff, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -24 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -20 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0xff, %xmm1, %xmm3 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -16 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -12 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0xff, %xmm1, %xmm3 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + 
movaps -8 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -4 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0xff, %xmm1, %xmm3 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps 0 * SIZE(AA), %xmm0 + + subl $-32 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -28 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 2), BB +#endif + + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + addps %xmm2, %xmm4 + addps %xmm3, %xmm5 + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm0 + unpcklps %xmm6, %xmm4 + unpckhps %xmm6, %xmm0 + + movaps %xmm5, %xmm1 + unpcklps %xmm7, %xmm5 + unpckhps %xmm7, %xmm1 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movsd -32 * SIZE(BB), %xmm1 + movsd -30 * SIZE(BB), %xmm3 + movsd -28 * SIZE(BB), %xmm5 + movsd -26 * SIZE(BB), %xmm7 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm3 + subps %xmm0, %xmm5 + subps %xmm2, %xmm7 +#else + movaps -32 * SIZE(AA), %xmm0 + movaps -28 * SIZE(AA), %xmm1 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm1 +#endif + +#ifdef LN + movaps -20 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm7 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm1 + + movaps -24 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm1 + + movaps -28 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm1 + + movaps -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef LT + movaps -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 + + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm7 + + movaps -28 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm7 + + movaps -24 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm7 + + movaps -20 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm7 +#endif + +#ifdef RN + movaps -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 + 
pshufd $0x55, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm1 + + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm1 +#endif + +#ifdef RT + movaps -32 * SIZE(BB), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm0 + + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, -32 * SIZE(BB) + movlps %xmm3, -30 * SIZE(BB) + movlps %xmm5, -28 * SIZE(BB) + movlps %xmm7, -26 * SIZE(BB) +#else + movaps %xmm0, -32 * SIZE(AA) + movaps %xmm1, -28 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm5, %xmm1 + unpcklps %xmm7, %xmm3 + + movaps %xmm1, %xmm2 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm2 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 1) + movhps %xmm2, 2 * SIZE(CO1, LDC, 1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 2 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $4, KK +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L51 + ALIGN_4 + +.L60: + testl $2, M + je .L70 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm3, %xmm3 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L65 + ALIGN_4 + +.L62: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm0, %xmm2 + addps %xmm3, %xmm4 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + + pshufd $0xee, %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + + pshufd $0x44, %xmm0, %xmm2 + addps %xmm3, %xmm4 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + + pshufd $0xee, %xmm0, %xmm2 + movaps -24 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + + pshufd $0x44, %xmm0, %xmm2 + addps %xmm3, %xmm4 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + + pshufd $0xee, %xmm0, %xmm2 + movaps -20 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + + pshufd $0x44, %xmm0, %xmm2 + addps %xmm3, %xmm4 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + + pshufd $0xee, %xmm0, %xmm2 + movaps -16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + + subl $-16 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L62 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + pshufd $0x44, %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + addps %xmm3, %xmm4 + pshufd $0x50, 
%xmm1, %xmm3 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L66 + ALIGN_4 + +.L68: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), BB +#endif + + addps %xmm3, %xmm4 + addps %xmm5, %xmm4 + + movhlps %xmm4, %xmm5 + +#if defined(LN) || defined(LT) + unpcklps %xmm6, %xmm4 + unpcklps %xmm7, %xmm5 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movsd -32 * SIZE(BB), %xmm1 + movsd -30 * SIZE(BB), %xmm3 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm3 +#else + movsd -32 * SIZE(AA), %xmm0 + movsd -30 * SIZE(AA), %xmm1 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm1 +#endif + +#ifdef LN + movaps -32 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm1 + + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef LT + movaps -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm3 + + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm3 +#endif + +#ifdef RN + movaps -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm1 + + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm1 +#endif + +#ifdef RT + movaps -32 * SIZE(BB), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm0 + + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, -32 * SIZE(BB) + movlps %xmm3, -30 * SIZE(BB) +#else + movlps %xmm0, -32 * SIZE(AA) + movlps %xmm1, -30 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm3, %xmm1 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 0 * SIZE(CO1, LDC) +#else + movlps %xmm0, 0 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L70: + testl $1, M + je .L79 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movsd -32 * SIZE(BB), %xmm1 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movsd -30 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm5 + movsd -28 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movsd -26 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -28 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm5 + 
movsd -24 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movsd -22 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -26 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm5 + movsd -20 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movsd -18 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -24 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm5 + movsd -16 * SIZE(BB), %xmm1 + + subl $ -8 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + pshufd $0x00, %xmm0, %xmm2 + movss -31 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movsd -30 * SIZE(BB), %xmm1 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), BB +#endif + + addps %xmm5, %xmm4 + + pshufd $0x55, %xmm4, %xmm5 + pshufd $0x00, %xmm4, %xmm4 + +#if defined(LN) || defined(LT) + unpcklps %xmm5, %xmm4 + + movsd -32 * SIZE(BB), %xmm1 + + subps %xmm4, %xmm1 +#else + movss -32 * SIZE(AA), %xmm0 + movss -31 * SIZE(AA), %xmm1 + + subss %xmm4, %xmm0 + subss %xmm5, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movss -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef RN + movaps -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm0, %xmm7 + subss %xmm7, %xmm1 + + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm7, %xmm1 +#endif + +#ifdef RT + movaps -32 * SIZE(BB), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm1, %xmm7 + subss %xmm7, %xmm0 + + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, -32 * SIZE(BB) +#else + movss %xmm0, -32 * SIZE(AA) + movss %xmm1, -31 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + pshufd $1, %xmm1, %xmm3 + + movss %xmm1, 0 * SIZE(CO1) + movss %xmm3, 0 * SIZE(CO1, LDC) +#else + movss %xmm0, 0 * SIZE(CO1) + movss %xmm1, 0 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L79: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + ALIGN_4 + +.L80: + movl N, %eax + sarl $2, %eax + movl %eax, J + jle .L999 + +.L10: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 4), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl 
$2, %ebx # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + leal (CO1, LDC, 2), %eax + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + +#ifdef LN + pxor %xmm4, %xmm4 + prefetcht0 -4 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 -4 * SIZE(CO1, LDC) + pxor %xmm6, %xmm6 + prefetcht0 -4 * SIZE(%eax) + pxor %xmm7, %xmm7 + prefetcht0 -4 * SIZE(%eax, LDC) +#else + pxor %xmm4, %xmm4 + prefetcht0 3 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 3 * SIZE(CO1, LDC) + pxor %xmm6, %xmm6 + prefetcht0 3 * SIZE(%eax) + pxor %xmm7, %xmm7 + prefetcht0 3 * SIZE(%eax, LDC) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -24 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -16 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -12 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -8 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -4 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + subl $-32 * SIZE, BB + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, 
%xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + subl $-32 * SIZE, AA + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -32 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -32 * SIZE(AA), %xmm0 + + subl $1, %eax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_4 + +.L16: + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 4), BB +#endif + + addps %xmm3, %xmm6 + addps %xmm2, %xmm7 + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm0 + unpcklps %xmm7, %xmm0 + unpckhps %xmm7, %xmm4 + + movaps %xmm6, %xmm2 + unpcklps %xmm5, %xmm2 + unpckhps %xmm5, %xmm6 + + movaps %xmm0, %xmm1 + movlhps %xmm2, %xmm0 + movhlps %xmm2, %xmm1 + + movaps %xmm6, %xmm7 + movlhps %xmm4, %xmm6 + movhlps %xmm4, %xmm7 + + pshufd $0x39, %xmm1, %xmm2 + pshufd $0x39, %xmm7, %xmm4 + + movaps -32 * SIZE(BB), %xmm1 + movaps -28 * SIZE(BB), %xmm3 + movaps -24 * SIZE(BB), %xmm5 + movaps -20 * SIZE(BB), %xmm7 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm3 + subps %xmm6, %xmm5 + subps %xmm4, %xmm7 +#else + pshufd $0x39, %xmm5, %xmm2 + pshufd $0x4e, %xmm6, %xmm0 + pshufd $0x93, %xmm7, %xmm7 + + movaps %xmm4, %xmm6 + unpcklps %xmm0, %xmm4 + unpckhps %xmm0, %xmm6 + + movaps %xmm2, %xmm1 + unpcklps %xmm7, %xmm2 + unpckhps %xmm7, %xmm1 + + movaps %xmm4, %xmm5 + unpcklps %xmm2, %xmm4 + unpckhps %xmm2, %xmm5 + + movaps %xmm6, %xmm7 + unpcklps %xmm1, %xmm6 + unpckhps %xmm1, %xmm7 + + pshufd $0x93, %xmm5, %xmm5 + pshufd $0x4e, %xmm6, %xmm6 + pshufd $0x39, %xmm7, %xmm7 + + movaps -32 * SIZE(AA), %xmm0 + movaps -28 * SIZE(AA), %xmm1 + movaps -24 * SIZE(AA), %xmm2 + movaps -20 * SIZE(AA), %xmm3 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm1 + subps %xmm6, %xmm2 + subps %xmm7, %xmm3 +#endif + +#ifdef LN + movaps -20 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm7 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm1 + + movaps -24 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm1 + + movaps -28 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm1 + + movaps -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef LT + movaps -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 + + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm7 + + movaps -28 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + 
pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm7 + + movaps -24 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm7 + + movaps -20 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm7 +#endif + +#ifdef RN + movaps -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm3 + + movaps -28 * SIZE(BB), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm3 + + movaps -24 * SIZE(BB), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm3 + + movaps -20 * SIZE(BB), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm3 +#endif + +#ifdef RT + movaps -20 * SIZE(BB), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm3 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm0 + + movaps -24 * SIZE(BB), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm0 + + movaps -28 * SIZE(BB), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm0 + + movaps -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, -32 * SIZE(BB) + movaps %xmm3, -28 * SIZE(BB) + movaps %xmm5, -24 * SIZE(BB) + movaps %xmm7, -20 * SIZE(BB) +#else + movaps %xmm0, -32 * SIZE(AA) + movaps %xmm1, -28 * SIZE(AA) + movaps %xmm2, -24 * SIZE(AA) + movaps %xmm3, -20 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm0 + + movaps %xmm3, %xmm4 + unpcklps %xmm7, %xmm3 + unpckhps %xmm7, %xmm4 + + movaps %xmm1, %xmm2 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm2 + + movaps %xmm0, %xmm6 + unpcklps %xmm4, %xmm0 + unpckhps %xmm4, %xmm6 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 1) + movhps %xmm2, 2 * SIZE(CO1, LDC, 1) + movlps %xmm0, 0 * SIZE(CO1, LDC, 2) + movhps %xmm0, 2 * SIZE(CO1, LDC, 2) + movlps %xmm6, 0 * SIZE(CO1, %eax, 1) + movhps %xmm6, 2 * SIZE(CO1, %eax, 1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 2 * SIZE(CO1, LDC, 1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 2) + movhps %xmm2, 2 * SIZE(CO1, LDC, 2) + movlps %xmm3, 0 * SIZE(CO1, %eax, 1) + movhps %xmm3, 2 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB +#endif + +#ifdef LN + subl $4, KK +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + sall $2 + BASE_SHIFT, %eax + addl 
%eax, AORIG +#endif + + decl %ebx # i -- + jg .L11 + ALIGN_4 + +.L20: + testl $2, M + je .L30 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm0, %xmm2 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm4 + pshufd $0xfa, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm6 + + pshufd $0xee, %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm7 + + pshufd $0x44, %xmm0, %xmm2 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm4 + pshufd $0xfa, %xmm1, %xmm3 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm6 + + pshufd $0xee, %xmm0, %xmm2 + movaps -24 * SIZE(AA), %xmm0 + + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm7 + + pshufd $0x44, %xmm0, %xmm2 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm4 + pshufd $0xfa, %xmm1, %xmm3 + movaps -12 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm6 + + pshufd $0xee, %xmm0, %xmm2 + movaps -20 * SIZE(AA), %xmm0 + + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -8 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm7 + + pshufd $0x44, %xmm0, %xmm2 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm4 + pshufd $0xfa, %xmm1, %xmm3 + movaps -4 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm6 + + pshufd $0xee, %xmm0, %xmm2 + movaps -16 * SIZE(AA), %xmm0 + + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps 0 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm7 + + subl $-16 * SIZE, AA + subl $-32 * SIZE, BB + + subl $1, %eax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + pshufd $0x44, %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm4 + pshufd $0xfa, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm6 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 4), BB +#endif + + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + + movhlps %xmm4, %xmm5 + movhlps %xmm6, %xmm7 + + +#if defined(LN) || defined(LT) + unpcklps %xmm6, %xmm4 + unpcklps %xmm7, %xmm5 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + 
movaps -32 * SIZE(BB), %xmm1 + movaps -28 * SIZE(BB), %xmm3 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm3 +#else + movsd -32 * SIZE(AA), %xmm0 + movsd -30 * SIZE(AA), %xmm1 + movsd -28 * SIZE(AA), %xmm2 + movsd -26 * SIZE(AA), %xmm3 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm1 + subps %xmm6, %xmm2 + subps %xmm7, %xmm3 +#endif + +#ifdef LN + movaps -32 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm1 + + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef LT + movaps -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 + + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm3 + + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm3 +#endif + +#ifdef RN + movaps -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm3 + + movaps -28 * SIZE(BB), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm3 + + movaps -24 * SIZE(BB), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm3 + + movaps -20 * SIZE(BB), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm3 +#endif + +#ifdef RT + movaps -20 * SIZE(BB), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm3 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm0 + + movaps -24 * SIZE(BB), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm0 + + movaps -28 * SIZE(BB), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm0 + + movaps -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, -32 * SIZE(BB) + movaps %xmm3, -28 * SIZE(BB) +#else + movlps %xmm0, -32 * SIZE(AA) + movlps %xmm1, -30 * SIZE(AA) + movlps %xmm2, -28 * SIZE(AA) + movlps %xmm3, -26 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm0 + + movaps %xmm3, %xmm4 + unpcklps %xmm7, %xmm3 + unpckhps %xmm7, %xmm4 + + movaps %xmm1, %xmm2 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm2 + + movaps %xmm0, %xmm6 + unpcklps %xmm4, %xmm0 + unpckhps %xmm4, %xmm6 + + movlps %xmm1, 0 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 1) + movlps %xmm0, 0 * SIZE(CO1, LDC, 2) + movlps %xmm6, 0 * SIZE(CO1, %eax, 1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC, 1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 2) + movlps %xmm3, 0 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, 
%eax + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L30: + testl $1, M + je .L39 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -24 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -20 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -28 * SIZE(AA), %xmm0 + + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -16 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -12 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -26 * SIZE(AA), %xmm0 + + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -8 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -4 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -24 * SIZE(AA), %xmm0 + + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps 0 * SIZE(BB), %xmm1 + + subl $ -8 * SIZE, AA + subl $-32 * SIZE, BB + + subl $1, %eax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + pshufd $0x00, %xmm0, %xmm2 + movss -31 * SIZE(AA), %xmm0 + + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(BB), %xmm1 + + subps %xmm4, %xmm1 +#else + movsd -32 * SIZE(AA), %xmm0 + movhps -30 * SIZE(AA), %xmm0 + + subps %xmm4, %xmm0 + + pshufd $0xff, %xmm0, %xmm3 + pshufd $0xaa, %xmm0, %xmm2 + pshufd $0x55, %xmm0, %xmm1 + pshufd $0x00, %xmm0, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss -32 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef RN + movaps -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm0, %xmm7 + subss %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm0, %xmm7 + subss %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm0, %xmm7 + subss %xmm7, %xmm3 + + movaps -28 * SIZE(BB), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm1, %xmm7 + subss %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm1, %xmm7 + subss %xmm7, %xmm3 + + movaps -24 * SIZE(BB), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm2, %xmm7 + subss %xmm7, %xmm3 + + movaps -20 * SIZE(BB), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + 
mulss %xmm7, %xmm3 +#endif + +#ifdef RT + movaps -20 * SIZE(BB), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm7, %xmm3 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm3, %xmm7 + subss %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm3, %xmm7 + subss %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm3, %xmm7 + subss %xmm7, %xmm0 + + movaps -24 * SIZE(BB), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm2, %xmm7 + subss %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm2, %xmm7 + subss %xmm7, %xmm0 + + movaps -28 * SIZE(BB), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm1, %xmm7 + subss %xmm7, %xmm0 + + movaps -32 * SIZE(BB), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, -32 * SIZE(BB) +#else + movss %xmm0, -32 * SIZE(AA) + movss %xmm1, -31 * SIZE(AA) + movss %xmm2, -30 * SIZE(AA) + movss %xmm3, -29 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm0 + + movaps %xmm3, %xmm4 + unpcklps %xmm7, %xmm3 + unpckhps %xmm7, %xmm4 + + movaps %xmm1, %xmm2 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm2 + + movaps %xmm0, %xmm6 + unpcklps %xmm4, %xmm0 + unpckhps %xmm4, %xmm6 + + movss %xmm1, 0 * SIZE(CO1) + movss %xmm2, 0 * SIZE(CO1, LDC, 1) + movss %xmm0, 0 * SIZE(CO1, LDC, 2) + movss %xmm6, 0 * SIZE(CO1, %eax, 1) +#else + movss %xmm0, 0 * SIZE(CO1) + movss %xmm1, 0 * SIZE(CO1, LDC, 1) + movss %xmm2, 0 * SIZE(CO1, LDC, 2) + movss %xmm3, 0 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L39: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 4), B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $4, KK +#endif + +#ifdef RT + subl $4, KK +#endif + + decl J # j -- + jg .L10 + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_RT_4x4_sse.S b/kernel/x86/trsm_kernel_RT_4x4_sse.S new file mode 100644 index 0000000000..0d2fcb6d2d --- /dev/null +++ b/kernel/x86/trsm_kernel_RT_4x4_sse.S @@ -0,0 +1,3683 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 + +#define OLD_M 4 + STACK(%esi) +#define OLD_N 8 + STACK(%esi) +#define OLD_K 12 + STACK(%esi) +#define OLD_A 20 + STACK(%esi) +#define OLD_B 24 + STACK(%esi) +#define OLD_C 28 + STACK(%esi) +#define OLD_LDC 32 + STACK(%esi) +#define STACK_OFFT 36 + STACK(%esi) + +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define AORIG 56(%esp) +#define BORIG 60(%esp) +#define BUFFER 128(%esp) + +#if defined(OPTERON) || defined(BARCELONA) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 10 + 8) +#endif + +#if defined(PENTIUM4) || defined(PENTIUMM) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE 96 +#endif + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE 96 +#endif + +#define B %edi +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define CO1 %esi + +#if defined(OPTERON) || !defined(HAVE_SSE2) +#define movsd movlps +#endif + +#ifdef HAVE_SSE2 +#define xorps pxor +#endif + +#define KERNEL1(address) \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + addps %xmm2, %xmm5; \ + movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm4; \ + 
movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL4(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL5(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL6(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL7(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1; + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl %esp, %esi + + subl $128 + LOCAL_BUFFER_SIZE, %esp + andl $-1024, %esp + + STACK_TOUCHING + + movl OLD_M, %ebx + movl OLD_N, %eax + movl OLD_K, %ecx + movl OLD_A, %edx + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK + movss STACK_OFFT, %xmm4 + + movl OLD_B, B + movl OLD_C, %ebx + + movl %ebx, C + movl OLD_LDC, LDC + + movss %xmm4, OFFSET + movss %xmm4, KK + + leal (, LDC, SIZE), LDC + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + 
negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + testl $1, N + je .L40 + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, %ecx + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $BASE_SHIFT, %eax + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + jle .L85 + ALIGN_4 + +.L82: + movsd 0 * SIZE(B), %xmm3 + movhps 2 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm7 + movhps 6 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + movaps %xmm4, 16 * SIZE(BB) + movaps %xmm5, 20 * SIZE(BB) + movaps %xmm6, 24 * SIZE(BB) + movaps %xmm7, 28 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $32 * SIZE, BB + decl %eax + jne .L82 + ALIGN_4 + +.L85: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + BRANCH + jle .L90 + ALIGN_4 + +.L86: + movss 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + + movaps %xmm0, 0 * SIZE(BB) + + addl $1 * SIZE, B + addl $4 * SIZE, BB + decl %eax + jne .L86 + ALIGN_4 + +.L90: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L100 + ALIGN_4 + +.L91: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movaps 0 * SIZE(AA), %xmm0 + movaps 16 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + + PREFETCHW 3 * SIZE(CO1) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L95 + ALIGN_4 + +.L92: + mulps %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movaps 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movaps 32 * SIZE(BB), %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm0, %xmm5 + movaps 8 * SIZE(AA), %xmm0 + mulps 8 * SIZE(BB), %xmm0 + addps %xmm0, %xmm6 + movaps 12 * SIZE(AA), %xmm0 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) +#endif + mulps %xmm1, %xmm3 + movaps 20 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movaps 48 * SIZE(BB), %xmm3 + mulps 20 * SIZE(BB), %xmm1 + addps %xmm1, %xmm5 + movaps 24 * SIZE(AA), %xmm1 + mulps 24 * SIZE(BB), %xmm1 + addps %xmm1, %xmm6 + movaps 28 * SIZE(AA), %xmm1 + mulps 28 * SIZE(BB), %xmm1 + addps %xmm1, %xmm7 + movaps 48 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L92 + ALIGN_4 + +.L95: 
+#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L98 + ALIGN_4 + +.L96: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(AA), %xmm0 + movaps 4 * SIZE(BB), %xmm2 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L96 + ALIGN_4 + +.L98: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + addps %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ BASE_SHIFT, %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm0 + unpcklps %xmm6, %xmm4 + unpckhps %xmm6, %xmm0 + + movaps %xmm5, %xmm1 + unpcklps %xmm7, %xmm5 + unpckhps %xmm7, %xmm1 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movss 0 * SIZE(B), %xmm1 + movss 1 * SIZE(B), %xmm3 + movss 2 * SIZE(B), %xmm5 + movss 3 * SIZE(B), %xmm7 + + subss %xmm4, %xmm1 + subss %xmm6, %xmm3 + subss %xmm0, %xmm5 + subss %xmm2, %xmm7 +#else + movaps 0 * SIZE(AA), %xmm0 + + subps %xmm4, %xmm0 +#endif + +#ifdef LN + movaps 12 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm6, %xmm7 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm7, %xmm6 + subss %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm7, %xmm6 + subss %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm7, %xmm6 + subss %xmm6, %xmm1 + + movaps 8 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm5, %xmm6 + subss %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm5, %xmm6 + subss %xmm6, %xmm1 + + movaps 4 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm3, %xmm6 + subss %xmm6, %xmm1 + + movaps 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm6, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm6, %xmm1 + + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm1, %xmm6 + subss %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm1, %xmm6 + subss %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm1, %xmm6 + subss %xmm6, %xmm7 + + movaps 4 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm3, %xmm6 + subss %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm3, %xmm6 + subss %xmm6, %xmm7 + + movaps 8 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm5, %xmm6 + subss %xmm6, %xmm7 + + movaps 12 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm6, %xmm7 +#endif + +#if defined(RN) || defined(RT) + movss 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(B) + movss %xmm3, 1 * SIZE(B) + movss %xmm5, 2 * SIZE(B) + movss %xmm7, 3 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm0 + movaps %xmm0, 0 * SIZE(BB) + pshufd $0x00, %xmm3, %xmm0 + movaps %xmm0, 4 * SIZE(BB) + + pshufd $0x00, %xmm5, %xmm0 + movaps %xmm0, 8 * SIZE(BB) + pshufd $0x00, %xmm7, %xmm0 + movaps %xmm0, 12 * SIZE(BB) +#else + movaps %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm5, %xmm1 + unpcklps %xmm7, %xmm3 + + unpcklps %xmm3, %xmm1 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * 
SIZE(CO1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L91 + ALIGN_4 + +.L100: + testl $2, M + je .L110 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 8 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L105 + ALIGN_4 + +.L102: + mulps %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L102 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L108 + ALIGN_4 + +.L106: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + movaps 4 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L106 + ALIGN_4 + +.L108: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + addps %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ BASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + pshufd $1, %xmm4, %xmm6 + + movss 0 * SIZE(B), %xmm1 + movss 1 * SIZE(B), %xmm3 + + subss %xmm4, %xmm1 + subss %xmm6, %xmm3 +#else +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 + + subps %xmm4, %xmm0 +#endif + +#ifdef LN + movaps 0 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulss %xmm3, %xmm6 + subss %xmm6, %xmm1 + + pshufd $0x00, %xmm4, %xmm6 + mulss %xmm6, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, 
%xmm6 + mulss %xmm6, %xmm1 + pshufd $0x55, %xmm4, %xmm6 + mulss %xmm1, %xmm6 + subss %xmm6, %xmm3 + + pshufd $0xff, %xmm4, %xmm6 + mulss %xmm6, %xmm3 +#endif + +#if defined(RN) || defined(RT) + movss 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(B) + movss %xmm3, 1 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm0 + movaps %xmm0, 0 * SIZE(BB) + pshufd $0x00, %xmm3, %xmm0 + movaps %xmm0, 4 * SIZE(BB) +#else + movlps %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(CO1) + movss %xmm3, 1 * SIZE(CO1) +#else + movlps %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L110: + testl $1, M + je .L119 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movss 0 * SIZE(AA), %xmm0 + movss 4 * SIZE(AA), %xmm1 + movss 0 * SIZE(BB), %xmm2 + movss 16 * SIZE(BB), %xmm3 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L115 + ALIGN_4 + +.L112: + mulss %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movss 1 * SIZE(AA), %xmm0 + addss %xmm2, %xmm4 + movss 32 * SIZE(BB), %xmm2 + mulss 4 * SIZE(BB), %xmm0 + addss %xmm0, %xmm5 + movss 2 * SIZE(AA), %xmm0 + mulss 8 * SIZE(BB), %xmm0 + addss %xmm0, %xmm6 + movss 3 * SIZE(AA), %xmm0 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm3 + movss 5 * SIZE(AA), %xmm1 + addss %xmm3, %xmm4 + movss 48 * SIZE(BB), %xmm3 + mulss 20 * SIZE(BB), %xmm1 + addss %xmm1, %xmm5 + movss 6 * SIZE(AA), %xmm1 + mulss 24 * SIZE(BB), %xmm1 + addss %xmm1, %xmm6 + movss 7 * SIZE(AA), %xmm1 + mulss 28 * SIZE(BB), %xmm1 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L112 + ALIGN_4 + +.L115: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulss %xmm0, %xmm2 + movss 1 * SIZE(AA), %xmm0 + addss %xmm2, %xmm4 + movss 4 * SIZE(BB), %xmm2 + + addl $ 1 * SIZE, AA + addl $ 4 * SIZE, BB + decl %eax + jg .L116 + ALIGN_4 + +.L118: + addss %xmm5, %xmm4 + addss %xmm7, %xmm6 + addss %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax + subl $1, %eax + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ BASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movss 0 * SIZE(B), %xmm1 + subss %xmm4, %xmm1 +#else + movss 0 * SIZE(AA), %xmm0 + subss %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + mulss 0 * SIZE(AA), %xmm1 +#endif + +#if defined(RN) || defined(RT) + mulss 0 * 
SIZE(B), %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm0 + movaps %xmm0, 0 * SIZE(BB) +#else + movss %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(CO1) +#else + movss %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (AA, %eax, SIZE), AA +#ifdef LT + addl $1 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L119: +#ifdef LN + movl K, %eax + leal (B, %eax, SIZE), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (B, %eax, SIZE), B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L40: + testl $2, N + je .L80 + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, %ecx + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $1 + BASE_SHIFT, %eax + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + jle .L45 + ALIGN_4 + +.L42: + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + movaps %xmm4, 16 * SIZE(BB) + movaps %xmm5, 20 * SIZE(BB) + movaps %xmm6, 24 * SIZE(BB) + movaps %xmm7, 28 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $32 * SIZE, %ecx + decl %eax + jne .L42 + ALIGN_4 + +.L45: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax + BRANCH + jle .L50 + ALIGN_4 + +.L46: + movsd 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + + addl $2 * SIZE, B + addl $8 * SIZE, %ecx + decl %eax + jne .L46 + ALIGN_4 + +.L50: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movaps 0 * SIZE(AA), %xmm0 + movaps 16 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + + PREFETCHW 3 * SIZE(CO1) + PREFETCHW 3 * SIZE(CO1, LDC) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + 
mulps %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 8 * SIZE(AA), %xmm0 + + mulps %xmm0, %xmm3 + mulps 20 * SIZE(BB), %xmm0 + addps %xmm3, %xmm4 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm0, %xmm5 + movaps 12 * SIZE(AA), %xmm0 + + mulps %xmm0, %xmm3 + mulps 28 * SIZE(BB), %xmm0 + addps %xmm3, %xmm4 + movaps 48 * SIZE(BB), %xmm3 + addps %xmm0, %xmm5 + movaps 32 * SIZE(AA), %xmm0 + +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) +#endif + mulps %xmm1, %xmm2 + mulps 36 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 40 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 20 * SIZE(AA), %xmm1 + + mulps %xmm1, %xmm2 + mulps 44 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 24 * SIZE(AA), %xmm1 + + mulps %xmm1, %xmm3 + mulps 52 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 56 * SIZE(BB), %xmm3 + addps %xmm1, %xmm5 + movaps 28 * SIZE(AA), %xmm1 + + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 80 * SIZE(BB), %xmm3 + addps %xmm1, %xmm5 + movaps 48 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $1 + BASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm0 + unpcklps %xmm6, %xmm4 + unpckhps %xmm6, %xmm0 + + movaps %xmm5, %xmm1 + unpcklps %xmm7, %xmm5 + unpckhps %xmm7, %xmm1 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(B), %xmm1 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd 2 * SIZE(B), %xmm3 +#ifdef movsd + xorps %xmm5, %xmm5 +#endif + movsd 4 * SIZE(B), %xmm5 +#ifdef movsd + xorps %xmm7, %xmm7 +#endif + movsd 6 * SIZE(B), %xmm7 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm3 + subps %xmm0, %xmm5 + subps %xmm2, %xmm7 +#else + movaps 0 * SIZE(AA), %xmm0 + movaps 4 * SIZE(AA), %xmm1 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm1 +#endif + +#ifdef LN + movaps 12 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm7 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm1 + + movaps 8 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm1 + + movaps 4 * SIZE(AA), %xmm4 + pshufd $0x55, 
%xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm1 + + movaps 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 + + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm7 + + movaps 4 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm7 + + movaps 8 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm7 + + movaps 12 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm7 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm1 + + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm1 +#endif + +#ifdef RT + movaps 0 * SIZE(B), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm0 + + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, 0 * SIZE(B) + movlps %xmm3, 2 * SIZE(B) + movlps %xmm5, 4 * SIZE(B) + movlps %xmm7, 6 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm0 + pshufd $0x55, %xmm1, %xmm2 + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm2, 4 * SIZE(BB) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm2 + movaps %xmm0, 8 * SIZE(BB) + movaps %xmm2, 12 * SIZE(BB) + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm2 + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm2, 20 * SIZE(BB) + + pshufd $0x00, %xmm7, %xmm0 + pshufd $0x55, %xmm7, %xmm2 + movaps %xmm0, 24 * SIZE(BB) + movaps %xmm2, 28 * SIZE(BB) +#else + movaps %xmm0, 0 * SIZE(AA) + movaps %xmm1, 4 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm5, %xmm1 + unpcklps %xmm7, %xmm3 + + movaps %xmm1, %xmm2 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm2 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 1) + movhps %xmm2, 2 * SIZE(CO1, LDC, 1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 2 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA +#ifdef LT + addl $8 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L51 + ALIGN_4 + +.L60: + testl $2, M + je .L70 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * 
SIZE(AA), %xmm0 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 8 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L65 + ALIGN_4 + +.L62: +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movaps 44 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 64 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 80 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L62 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L66 + ALIGN_4 + +.L68: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $BASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm6, %xmm4 + unpcklps %xmm7, %xmm5 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(B), %xmm1 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd 2 * SIZE(B), %xmm3 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm3 +#else +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 2 * SIZE(AA), %xmm1 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm1 +#endif + +#ifdef LN + movaps 0 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm1 + + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm3 + + pshufd $0xff, %xmm4, %xmm6 + 
mulps %xmm6, %xmm3 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm1 + + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm1 +#endif + +#ifdef RT + movaps 0 * SIZE(B), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm0 + + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, 0 * SIZE(B) + movlps %xmm3, 2 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm0 + pshufd $0x55, %xmm1, %xmm2 + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm2, 4 * SIZE(BB) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm2 + movaps %xmm0, 8 * SIZE(BB) + movaps %xmm2, 12 * SIZE(BB) +#else + movlps %xmm0, 0 * SIZE(AA) + movlps %xmm1, 2 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm3, %xmm1 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 0 * SIZE(CO1, LDC) +#else + movlps %xmm0, 0 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L70: + testl $1, M + je .L79 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movss 0 * SIZE(AA), %xmm0 + movss 4 * SIZE(AA), %xmm1 + movss 0 * SIZE(BB), %xmm2 + movss 16 * SIZE(BB), %xmm3 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + mulss %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + mulss 4 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 8 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 1 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm2 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 32 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 2 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 20 * SIZE(BB), %xmm0 + addss %xmm3, %xmm4 + movss 24 * SIZE(BB), %xmm3 + addss %xmm0, %xmm5 + movss 3 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 28 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 48 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm2 + mulss 36 * SIZE(BB), %xmm1 + addss %xmm2, %xmm4 + movss 40 * SIZE(BB), %xmm2 + addss %xmm1, %xmm5 + movss 5 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm2 + mulss 44 * SIZE(BB), %xmm1 + addss %xmm2, %xmm6 + movss 64 * SIZE(BB), %xmm2 + addss %xmm1, %xmm7 + movss 6 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 52 * SIZE(BB), %xmm1 + addss %xmm3, %xmm4 + movss 56 * SIZE(BB), %xmm3 + addss %xmm1, %xmm5 + movss 7 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 60 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 80 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $64 * SIZE, BB + decl 
%eax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulss %xmm0, %xmm2 + mulss 4 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 8 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 1 * SIZE(AA), %xmm0 + + addl $ 1 * SIZE, AA + addl $ 8 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: + addss %xmm6, %xmm4 + addss %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $BASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm5, %xmm4 + +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(B), %xmm1 + + subps %xmm4, %xmm1 +#else + movss 0 * SIZE(AA), %xmm0 + movss 1 * SIZE(AA), %xmm1 + + subss %xmm4, %xmm0 + subss %xmm5, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movss 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm0, %xmm7 + subss %xmm7, %xmm1 + + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm7, %xmm1 +#endif + +#ifdef RT + movaps 0 * SIZE(B), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm1, %xmm7 + subss %xmm7, %xmm0 + + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, 0 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm0 + pshufd $0x55, %xmm1, %xmm2 + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm2, 4 * SIZE(BB) +#else + movss %xmm0, 0 * SIZE(AA) + movss %xmm1, 1 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + pshufd $1, %xmm1, %xmm3 + + movss %xmm1, 0 * SIZE(CO1) + movss %xmm3, 0 * SIZE(CO1, LDC) +#else + movss %xmm0, 0 * SIZE(CO1) + movss %xmm1, 0 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (AA, %eax, SIZE), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L79: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + ALIGN_4 + +.L80: + movl N, %eax + sarl $2, %eax + movl %eax, J + jle .L999 + +.L01: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, %ecx + +#ifdef RT + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $2 + BASE_SHIFT, %eax + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $1, %eax + jle .L05 + ALIGN_4 + +.L02: + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + pshufd $0x00, %xmm7, %xmm4 + pshufd 
$0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + movaps %xmm4, 16 * SIZE(BB) + movaps %xmm5, 20 * SIZE(BB) + movaps %xmm6, 24 * SIZE(BB) + movaps %xmm7, 28 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $32 * SIZE, %ecx + decl %eax + jne .L02 + ALIGN_2 + +.L05: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $1, %eax + BRANCH + jle .L10 + + movaps 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + + addl $4 * SIZE, B + ALIGN_4 + +.L10: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + + leal (, LDC, 4), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 + movaps 16 * SIZE(AA), %xmm1 + xorps %xmm5, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm7, %xmm7 + + leal (LDC, LDC, 2), %eax + + PREFETCHW 3 * SIZE(CO1) + PREFETCHW 3 * SIZE(CO1, LDC) + PREFETCHW 3 * SIZE(CO1, LDC, 2) + PREFETCHW 3 * SIZE(CO1, %eax) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + KERNEL1(0 * 16) + KERNEL2(0 * 16) + KERNEL3(0 * 16) + KERNEL4(0 * 16) + KERNEL5(0 * 16) + KERNEL6(0 * 16) + KERNEL7(0 * 16) + KERNEL8(0 * 16) + + addl $128 * SIZE, BB + addl $32 * SIZE, AA + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_4 + +.L16: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 4 * SIZE(AA), %xmm0 + + addl $ 4 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $2 + BASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm0 + unpcklps %xmm6, %xmm4 + unpckhps %xmm6, %xmm0 + + movaps %xmm5, %xmm1 + unpcklps %xmm7, %xmm5 + unpckhps %xmm7, %xmm1 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movaps 0 * SIZE(B), %xmm1 + movaps 4 * SIZE(B), %xmm3 + movaps 8 * SIZE(B), %xmm5 + movaps 12 * SIZE(B), %xmm7 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm3 + subps %xmm0, %xmm5 + subps %xmm2, %xmm7 +#else + movaps 0 * 
SIZE(AA), %xmm0 + movaps 4 * SIZE(AA), %xmm1 + movaps 8 * SIZE(AA), %xmm2 + movaps 12 * SIZE(AA), %xmm3 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm1 + subps %xmm6, %xmm2 + subps %xmm7, %xmm3 +#endif + +#ifdef LN + movaps 12 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm7 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm7, %xmm6 + subps %xmm6, %xmm1 + + movaps 8 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm6, %xmm5 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm1 + + movaps 4 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm1 + + movaps 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 + + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm7 + + movaps 4 * SIZE(AA), %xmm4 + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm7 + + movaps 8 * SIZE(AA), %xmm4 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm6, %xmm5 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm5, %xmm6 + subps %xmm6, %xmm7 + + movaps 12 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm7 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm3 + + movaps 4 * SIZE(B), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm3 + + movaps 8 * SIZE(B), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm3 + + movaps 12 * SIZE(B), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm3 +#endif + +#ifdef RT + movaps 12 * SIZE(B), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm3 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm0 + + movaps 8 * SIZE(B), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm0 + + movaps 4 * SIZE(B), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm0 + + movaps 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, 0 * SIZE(B) + movaps %xmm3, 4 * SIZE(B) + movaps %xmm5, 8 * SIZE(B) + movaps %xmm7, 12 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm0 + pshufd $0x55, %xmm1, %xmm2 + pshufd $0xaa, %xmm1, %xmm4 + pshufd $0xff, %xmm1, %xmm6 + movaps %xmm0, 0 
* SIZE(BB) + movaps %xmm2, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm6, 12 * SIZE(BB) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm2 + pshufd $0xaa, %xmm3, %xmm4 + pshufd $0xff, %xmm3, %xmm6 + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm2, 20 * SIZE(BB) + movaps %xmm4, 24 * SIZE(BB) + movaps %xmm6, 28 * SIZE(BB) + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm2 + pshufd $0xaa, %xmm5, %xmm4 + pshufd $0xff, %xmm5, %xmm6 + movaps %xmm0, 32 * SIZE(BB) + movaps %xmm2, 36 * SIZE(BB) + movaps %xmm4, 40 * SIZE(BB) + movaps %xmm6, 44 * SIZE(BB) + + pshufd $0x00, %xmm7, %xmm0 + pshufd $0x55, %xmm7, %xmm2 + pshufd $0xaa, %xmm7, %xmm4 + pshufd $0xff, %xmm7, %xmm6 + movaps %xmm0, 48 * SIZE(BB) + movaps %xmm2, 52 * SIZE(BB) + movaps %xmm4, 56 * SIZE(BB) + movaps %xmm6, 60 * SIZE(BB) +#else + movaps %xmm0, 0 * SIZE(AA) + movaps %xmm1, 4 * SIZE(AA) + movaps %xmm2, 8 * SIZE(AA) + movaps %xmm3, 12 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm0 + + movaps %xmm3, %xmm4 + unpcklps %xmm7, %xmm3 + unpckhps %xmm7, %xmm4 + + movaps %xmm1, %xmm2 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm2 + + movaps %xmm0, %xmm6 + unpcklps %xmm4, %xmm0 + unpckhps %xmm4, %xmm6 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 1) + movhps %xmm2, 2 * SIZE(CO1, LDC, 1) + movlps %xmm0, 0 * SIZE(CO1, LDC, 2) + movhps %xmm0, 2 * SIZE(CO1, LDC, 2) + movlps %xmm6, 0 * SIZE(CO1, %eax, 1) + movhps %xmm6, 2 * SIZE(CO1, %eax, 1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 2 * SIZE(CO1, LDC, 1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 2) + movhps %xmm2, 2 * SIZE(CO1, LDC, 2) + movlps %xmm3, 0 * SIZE(CO1, %eax, 1) + movhps %xmm3, 2 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA +#ifdef LT + addl $16 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L11 + ALIGN_4 + +.L20: + testl $2, M + je .L30 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 8 * SIZE(AA), %xmm1 + xorps %xmm5, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 
+ movaps 32 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 44 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 64 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 60 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 80 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 68 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm5 + movaps 72 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movaps 76 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 96 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 84 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movaps 88 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 92 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 112 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 100 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm5 + movaps 104 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movaps 108 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 128 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 116 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movaps 120 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 124 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 144 * SIZE(BB), %xmm3 + + addl $ 16 * SIZE, AA + addl $128 * SIZE, BB + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 16 * SIZE(BB), %xmm2 + + addl $ 2 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $1 + BASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm6, %xmm4 + unpcklps %xmm7, %xmm5 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps 0 * SIZE(B), %xmm1 + movaps 4 * SIZE(B), %xmm3 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm3 +#else +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 +#ifdef movsd + xorps 
%xmm1, %xmm1 +#endif + movsd 2 * SIZE(AA), %xmm1 +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd 4 * SIZE(AA), %xmm2 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd 6 * SIZE(AA), %xmm3 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm1 + subps %xmm6, %xmm2 + subps %xmm7, %xmm3 +#endif + +#ifdef LN + movaps 0 * SIZE(AA), %xmm4 + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm3 + pshufd $0xaa, %xmm4, %xmm6 + mulps %xmm3, %xmm6 + subps %xmm6, %xmm1 + + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 + + pshufd $0x55, %xmm4, %xmm6 + mulps %xmm1, %xmm6 + subps %xmm6, %xmm3 + + pshufd $0xff, %xmm4, %xmm6 + mulps %xmm6, %xmm3 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm0, %xmm7 + subps %xmm7, %xmm3 + + movaps 4 * SIZE(B), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm3 + + movaps 8 * SIZE(B), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm3 + + movaps 12 * SIZE(B), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm3 +#endif + +#ifdef RT + movaps 12 * SIZE(B), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulps %xmm7, %xmm3 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm3, %xmm7 + subps %xmm7, %xmm0 + + movaps 8 * SIZE(B), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulps %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm2, %xmm7 + subps %xmm7, %xmm0 + + movaps 4 * SIZE(B), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulps %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm1, %xmm7 + subps %xmm7, %xmm0 + + movaps 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulps %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, 0 * SIZE(B) + movaps %xmm3, 4 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm0 + pshufd $0x55, %xmm1, %xmm2 + pshufd $0xaa, %xmm1, %xmm4 + pshufd $0xff, %xmm1, %xmm6 + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm2, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm6, 12 * SIZE(BB) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm2 + pshufd $0xaa, %xmm3, %xmm4 + pshufd $0xff, %xmm3, %xmm6 + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm2, 20 * SIZE(BB) + movaps %xmm4, 24 * SIZE(BB) + movaps %xmm6, 28 * SIZE(BB) +#else + movlps %xmm0, 0 * SIZE(AA) + movlps %xmm1, 2 * SIZE(AA) + movlps %xmm2, 4 * SIZE(AA) + movlps %xmm3, 6 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm0 + + movaps %xmm3, %xmm4 + unpcklps %xmm7, %xmm3 + unpckhps %xmm7, %xmm4 + + movaps %xmm1, %xmm2 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm2 + + movaps %xmm0, %xmm6 + unpcklps %xmm4, %xmm0 + unpckhps %xmm4, %xmm6 + + movlps %xmm1, 0 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 1) + movlps %xmm0, 0 * SIZE(CO1, LDC, 2) + movlps %xmm6, 0 * SIZE(CO1, %eax, 1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movlps %xmm1, 0 * 
SIZE(CO1, LDC, 1) + movlps %xmm2, 0 * SIZE(CO1, LDC, 2) + movlps %xmm3, 0 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $8 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L30: + testl $1, M + je .L39 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movss 0 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 + movss 4 * SIZE(AA), %xmm1 + xorps %xmm5, %xmm5 + movss 0 * SIZE(BB), %xmm2 + xorps %xmm6, %xmm6 + movss 16 * SIZE(BB), %xmm3 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L35 + ALIGN_4 + +.L32: + mulss %xmm0, %xmm2 + addss %xmm2, %xmm4 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movss 4 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + addss %xmm2, %xmm5 + movss 8 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 32 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 1 * SIZE(AA), %xmm0 + + mulss %xmm0, %xmm3 + addss %xmm3, %xmm4 + movss 20 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + addss %xmm3, %xmm5 + movss 24 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + mulss 28 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 48 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 2 * SIZE(AA), %xmm0 + + mulss %xmm0, %xmm2 + addss %xmm2, %xmm4 + movss 36 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + addss %xmm2, %xmm5 + movss 40 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + mulss 44 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 64 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 3 * SIZE(AA), %xmm0 + + mulss %xmm0, %xmm3 + addss %xmm3, %xmm4 + movss 52 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + addss %xmm3, %xmm5 + movss 56 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + mulss 60 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 80 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + + mulss %xmm1, %xmm2 + addss %xmm2, %xmm4 + movss 68 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + addss %xmm2, %xmm5 + movss 72 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + mulss 76 * SIZE(BB), %xmm1 + addss %xmm2, %xmm6 + movss 96 * SIZE(BB), %xmm2 + addss %xmm1, %xmm7 + movss 5 * SIZE(AA), %xmm1 + + mulss %xmm1, %xmm3 + addss %xmm3, %xmm4 + movss 84 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + addss %xmm3, %xmm5 + movss 88 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + mulss 92 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 112 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 6 * SIZE(AA), %xmm1 + + mulss %xmm1, %xmm2 + addss %xmm2, %xmm4 + movss 100 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + addss %xmm2, %xmm5 + movss 104 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + mulss 108 * SIZE(BB), %xmm1 + addss %xmm2, %xmm6 + movss 128 * SIZE(BB), %xmm2 + addss %xmm1, %xmm7 + movss 7 * SIZE(AA), %xmm1 + + mulss %xmm1, %xmm3 + addss %xmm3, %xmm4 + movss 116 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + addss %xmm3, %xmm5 + movss 120 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + mulss 124 * SIZE(BB), %xmm1 + addss %xmm3, 
%xmm6 + movss 144 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $128 * SIZE, BB + decl %eax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulss %xmm0, %xmm2 + addss %xmm2, %xmm4 + movss 4 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + addss %xmm2, %xmm5 + movss 8 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 16 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 1 * SIZE(AA), %xmm0 + + addl $ 1 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $4, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + leal (AA, %eax, SIZE), AA + + sall $2 + BASE_SHIFT, %eax + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm6, %xmm4 + unpcklps %xmm7, %xmm5 + unpcklps %xmm5, %xmm4 + + movaps 0 * SIZE(B), %xmm1 + + subps %xmm4, %xmm1 +#else + movss 0 * SIZE(AA), %xmm0 + movss 1 * SIZE(AA), %xmm1 + movss 2 * SIZE(AA), %xmm2 + movss 3 * SIZE(AA), %xmm3 + + subss %xmm4, %xmm0 + subss %xmm5, %xmm1 + subss %xmm6, %xmm2 + subss %xmm7, %xmm3 +#endif + +#if defined(LN) || defined(LT) + movss 0 * SIZE(AA), %xmm4 + pshufd $0x00, %xmm4, %xmm6 + mulps %xmm6, %xmm1 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm7, %xmm0 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm0, %xmm7 + subss %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm0, %xmm7 + subss %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm0, %xmm7 + subss %xmm7, %xmm3 + + movaps 4 * SIZE(B), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm7, %xmm1 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm1, %xmm7 + subss %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm1, %xmm7 + subss %xmm7, %xmm3 + + movaps 8 * SIZE(B), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm7, %xmm2 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm2, %xmm7 + subss %xmm7, %xmm3 + + movaps 12 * SIZE(B), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm7, %xmm3 +#endif + +#ifdef RT + movaps 12 * SIZE(B), %xmm6 + pshufd $0xff, %xmm6, %xmm7 + mulss %xmm7, %xmm3 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm3, %xmm7 + subss %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm3, %xmm7 + subss %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm3, %xmm7 + subss %xmm7, %xmm0 + + movaps 8 * SIZE(B), %xmm6 + pshufd $0xaa, %xmm6, %xmm7 + mulss %xmm7, %xmm2 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm2, %xmm7 + subss %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm2, %xmm7 + subss %xmm7, %xmm0 + + movaps 4 * SIZE(B), %xmm6 + pshufd $0x55, %xmm6, %xmm7 + mulss %xmm7, %xmm1 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm1, %xmm7 + subss %xmm7, %xmm0 + + movaps 0 * SIZE(B), %xmm6 + pshufd $0x00, %xmm6, %xmm7 + mulss %xmm7, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, 0 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm0 + pshufd $0x55, %xmm1, %xmm2 + pshufd $0xaa, %xmm1, %xmm4 + pshufd $0xff, %xmm1, %xmm6 + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm2, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm6, 12 * SIZE(BB) +#else + movss %xmm0, 0 * SIZE(AA) + movss %xmm1, 1 * SIZE(AA) + movss %xmm2, 2 * SIZE(AA) + movss %xmm3, 3 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + + leal (LDC, LDC, 2), %eax + +#if defined(LN) || defined(LT) + movaps 
%xmm1, %xmm0 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm0 + + movaps %xmm3, %xmm4 + unpcklps %xmm7, %xmm3 + unpckhps %xmm7, %xmm4 + + movaps %xmm1, %xmm2 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm2 + + movaps %xmm0, %xmm6 + unpcklps %xmm4, %xmm0 + unpckhps %xmm4, %xmm6 + + movss %xmm1, 0 * SIZE(CO1) + movss %xmm2, 0 * SIZE(CO1, LDC, 1) + movss %xmm0, 0 * SIZE(CO1, LDC, 2) + movss %xmm6, 0 * SIZE(CO1, %eax, 1) +#else + movss %xmm0, 0 * SIZE(CO1) + movss %xmm1, 0 * SIZE(CO1, LDC, 1) + movss %xmm2, 0 * SIZE(CO1, LDC, 2) + movss %xmm3, 0 * SIZE(CO1, %eax, 1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (AA, %eax, SIZE), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L39: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 4), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (B, %eax, 4), B +#endif + +#ifdef RN + addl $4, KK +#endif + +#ifdef RT + subl $4, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_4 + +.L999: + movl OLD_STACK, %esp + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/trsm_kernel_RT_8x2_sse.S b/kernel/x86/trsm_kernel_RT_8x2_sse.S new file mode 100644 index 0000000000..6bc1d21dce --- /dev/null +++ b/kernel/x86/trsm_kernel_RT_8x2_sse.S @@ -0,0 +1,3607 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if !defined(HAVE_SSE) || !defined(HAVE_MMX) +#error You have to check your configuration. +#endif + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_A 20 + STACK + ARGS(%esi) +#define STACK_B 24 + STACK + ARGS(%esi) +#define STACK_C 28 + STACK + ARGS(%esi) +#define STACK_LDC 32 + STACK + ARGS(%esi) +#define STACK_OFFT 36 + STACK + ARGS(%esi) + +#define TRMASK 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define AORIG 56(%esp) +#define BORIG 60(%esp) +#define BUFFER 128(%esp) + +#ifdef HAVE_3DNOW +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 10 + 8) +#else +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE 96 +#endif + +#define B %edi +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define CO1 %esi + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#if !defined(HAVE_SSE2) || defined(OPTERON) +#define movsd movlps +#endif + +#ifdef HAVE_SSE2 +#define xorps pxor +#endif + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE, %esp + andl $-STACK_ALIGN, %esp + + STACK_TOUCHING + + movss STACK_M, %xmm0 + movl STACK_N, %eax + movss STACK_K, %xmm1 + movss STACK_A, %xmm2 + movl STACK_B, B + movss STACK_C, %xmm3 + movl STACK_LDC, LDC + movss STACK_OFFT, %xmm4 + + movss %xmm1, K + movl %eax, N + movss %xmm0, M + movss %xmm2, A + movss %xmm3, C + movl %esi, OLD_STACK + movss %xmm4, OFFSET + movss %xmm4, KK + + leal (, LDC, SIZE), LDC + +#ifdef LN + movl M, %eax + leal (, %eax, SIZE), %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + leal (, %eax, SIZE), %eax + imull K, %eax + addl %eax, B + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LN) || defined(LT) + movl $0x3f800000, 0 + TRMASK # 1.0 + movl $0x00000000, 4 + TRMASK # 0.0 + movl $0x3f800000, 8 + TRMASK # 1.0 + movl $0x00000000, 12 + TRMASK # 0.0 +#endif + + testl $1, N + jle .L100 + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, BB + +#ifdef RT + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $BASE_SHIFT, %eax + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + jle .L103 + ALIGN_4 + +.L102: + movsd 0 * SIZE(B), %xmm3 + movhps 2 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm7 + movhps 6 * SIZE(B), %xmm7 + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 +#else + movaps %xmm3, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm3, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm3, %xmm2 + shufps $0xaa, %xmm2, 
%xmm2 + shufps $0xff, %xmm3, %xmm3 + + movaps %xmm7, %xmm4 + shufps $0x00, %xmm4, %xmm4 + movaps %xmm7, %xmm5 + shufps $0x55, %xmm5, %xmm5 + movaps %xmm7, %xmm6 + shufps $0xaa, %xmm6, %xmm6 + shufps $0xff, %xmm7, %xmm7 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + movaps %xmm4, 16 * SIZE(BB) + movaps %xmm5, 20 * SIZE(BB) + movaps %xmm6, 24 * SIZE(BB) + movaps %xmm7, 28 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $32 * SIZE, BB + + decl %eax + BRANCH + jne .L102 + ALIGN_2 + +.L103: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + BRANCH + jle .L105 + ALIGN_2 + +.L104: + movss 0 * SIZE(B), %xmm0 + + shufps $0x00, %xmm0, %xmm0 + + movaps %xmm0, 0 * SIZE(BB) + + addl $1 * SIZE, B + addl $4 * SIZE, BB + + decl %eax + jne .L104 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + + movl M, %ebx + sarl $3, %ebx # i = (m >> 2) + jle .L130 + ALIGN_4 + +.L110: +#ifdef LN + movl K, %eax + sall $3 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $3 + BASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + + PREFETCHW 7 * SIZE(CO1) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L112 + ALIGN_2 + +.L111: + mulps %xmm2, %xmm0 + mulps 4 * SIZE(AA), %xmm2 + addps %xmm0, %xmm4 + movaps 8 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm2, %xmm0 + mulps 12 * SIZE(AA), %xmm2 + addps %xmm0, %xmm5 + movaps 32 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm2, %xmm1 + mulps 20 * SIZE(AA), %xmm2 + addps %xmm1, %xmm4 + movaps 24 * SIZE(AA), %xmm1 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm2, %xmm1 + mulps 28 * SIZE(AA), %xmm2 + addps %xmm1, %xmm5 + movaps 48 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm3, %xmm0 + mulps 36 * SIZE(AA), %xmm3 + addps %xmm0, %xmm4 + movaps 40 * SIZE(AA), %xmm0 + addps %xmm3, %xmm6 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm3, %xmm0 + mulps 44 * SIZE(AA), %xmm3 + addps %xmm0, %xmm5 + movaps 64 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm3, %xmm1 + mulps 52 * SIZE(AA), %xmm3 + addps %xmm1, %xmm4 + movaps 56 * SIZE(AA), %xmm1 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm3, %xmm1 + mulps 60 * SIZE(AA), %xmm3 + addps %xmm1, %xmm5 + movaps 80 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + + addl $64 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L111 + ALIGN_2 + +.L112: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L114 + +.L113: + movaps 0 * SIZE(BB), %xmm2 + movaps 0 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm0 + addps %xmm0, %xmm4 + mulps 4 * SIZE(AA), %xmm2 + addps %xmm2, %xmm6 + + addl $8 * SIZE, AA + addl $4 * SIZE, BB + subl $1, %eax + jg .L113 + ALIGN_4 + +.L114: + 
addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $8, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $BASE_SHIFT, %eax + leal (AA, %eax, 8), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(B), %xmm2 + movhps 2 * SIZE(B), %xmm2 + movsd 4 * SIZE(B), %xmm5 + movhps 6 * SIZE(B), %xmm5 + + subps %xmm4, %xmm2 + subps %xmm6, %xmm5 + + xorps %xmm0, %xmm0 + + movaps %xmm2, %xmm3 + unpcklps %xmm0, %xmm2 + unpckhps %xmm0, %xmm3 + + movaps %xmm5, %xmm7 + unpcklps %xmm0, %xmm5 + unpckhps %xmm0, %xmm7 +#else + movaps 0 * SIZE(AA), %xmm0 + movaps 4 * SIZE(AA), %xmm1 + + subps %xmm4, %xmm0 + subps %xmm6, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movaps TRMASK, %xmm6 +#endif + +#ifdef LN + movss 63 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm7 + + movaps %xmm7, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 62 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movsd 60 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 58 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 56 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 54 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm7 + + movaps %xmm7, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movsd 52 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 50 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 48 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + + movss 45 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm5 + + movaps %xmm5, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 44 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 42 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 40 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 36 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm5 + + movaps %xmm5, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movsd 34 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 32 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 27 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 26 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 24 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 18 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movsd 16 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 9 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 8 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 0 * SIZE(AA), %xmm0 + shufps 
$0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 +#endif + +#ifdef LT + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 1 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movsd 2 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 4 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 6 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 9 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movsd 10 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 12 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 14 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 18 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 19 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 20 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 22 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 27 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movsd 28 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 30 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 36 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm5 + + movaps %xmm5, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 37 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 38 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 45 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm5 + + movaps %xmm5, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movsd 46 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 54 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm7 + + movaps %xmm7, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 55 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 63 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm7 +#endif + +#if defined(RN) || defined(RT) + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm0 + mulps %xmm6, %xmm1 +#endif + +#if defined(LN) || defined(LT) + shufps $0x88, %xmm3, %xmm2 + shufps $0x88, %xmm7, %xmm5 + + movlps %xmm2, 0 * SIZE(B) + movhps %xmm2, 2 * SIZE(B) + movlps %xmm5, 4 * SIZE(B) + movhps %xmm5, 6 * SIZE(B) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm6 +#else + movaps %xmm2, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm2, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm2, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm2, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * 
SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm6, 12 * SIZE(BB) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + pshufd $0xaa, %xmm5, %xmm4 + pshufd $0xff, %xmm5, %xmm6 +#else + movaps %xmm5, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm5, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm5, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm5, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm1, 20 * SIZE(BB) + movaps %xmm4, 24 * SIZE(BB) + movaps %xmm6, 28 * SIZE(BB) +#else + movaps %xmm0, 0 * SIZE(AA) + movaps %xmm1, 4 * SIZE(AA) +#endif + +#ifdef LN + subl $8 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 2 * SIZE(CO1) + movlps %xmm5, 4 * SIZE(CO1) + movhps %xmm5, 6 * SIZE(CO1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm1, 4 * SIZE(CO1) + movhps %xmm1, 6 * SIZE(CO1) +#endif + +#ifndef LN + addl $8 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 8), AA +#ifdef LT + addl $8 * SIZE, B +#endif +#endif + +#ifdef LN + subl $8, KK + movl BORIG, B +#endif + +#ifdef LT + addl $8, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $3 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L110 + ALIGN_2 + +.L130: + testl $4, M + jle .L150 + +#ifdef LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $2 + BASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movsd 0 * SIZE(AA), %xmm0 + movhps 2 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movsd 16 * SIZE(AA), %xmm1 + movhps 18 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L132 + ALIGN_2 + +.L131: + mulps %xmm0, %xmm2 + movaps 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + mulps 4 * SIZE(BB), %xmm0 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 8 * SIZE(AA), %xmm0 + mulps 8 * SIZE(BB), %xmm0 + addps %xmm0, %xmm6 + movaps 12 * SIZE(AA), %xmm0 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm3 + movaps 20 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + mulps 20 * SIZE(BB), %xmm1 + movaps 48 * SIZE(BB), %xmm3 + addps %xmm1, %xmm5 + movaps 24 * SIZE(AA), %xmm1 + mulps 24 * SIZE(BB), %xmm1 + addps %xmm1, %xmm6 + movaps 28 * SIZE(AA), %xmm1 + mulps 28 * SIZE(BB), %xmm1 + addps %xmm1, %xmm7 + movaps 48 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L131 + ALIGN_2 + +.L132: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L134 + +.L133: + movaps 0 * SIZE(BB), %xmm2 + movaps 0 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L133 + ALIGN_4 + +.L134: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + addps %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $BASE_SHIFT, %eax + leal (AA, %eax, 4), AA + leal (B, 
%eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(B), %xmm2 + movhps 2 * SIZE(B), %xmm2 + + subps %xmm4, %xmm2 + + xorps %xmm5, %xmm5 + + movaps %xmm2, %xmm3 + unpcklps %xmm5, %xmm2 + unpckhps %xmm5, %xmm3 +#else + movaps 0 * SIZE(AA), %xmm0 + subps %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movaps TRMASK, %xmm6 +#endif + +#ifdef LN + movss 15 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 14 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 12 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 10 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movsd 8 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 5 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 4 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 +#endif + +#ifdef LT + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 1 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movsd 2 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movss 5 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movsd 6 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movss 10 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 11 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movss 15 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm3 +#endif + +#ifdef RN + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + mulps %xmm6, %xmm0 +#endif + +#ifdef RT + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + mulps %xmm6, %xmm0 +#endif + +#if defined(LN) || defined(LT) + shufps $0x88, %xmm3, %xmm2 + + movlps %xmm2, 0 * SIZE(B) + movhps %xmm2, 2 * SIZE(B) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm6 +#else + movaps %xmm2, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm2, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm2, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm2, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm6, 12 * SIZE(BB) +#else + movaps %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 2 * SIZE(CO1) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + 
subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L150: + testl $2, M + jle .L170 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $1 + BASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L152 + ALIGN_2 + +.L151: + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L151 + ALIGN_2 + +.L152: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L154 + +.L153: + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L153 + ALIGN_4 + +.L154: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + addps %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $BASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm5 + shufps $1, %xmm5, %xmm5 + + movss 0 * SIZE(B), %xmm0 + movss 1 * SIZE(B), %xmm1 + + subss %xmm4, %xmm0 + subss %xmm5, %xmm1 +#else +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 + subps %xmm4, %xmm0 +#endif + +#ifdef LN + movaps 0 * SIZE(AA), %xmm4 + + movaps %xmm4, %xmm6 + shufps $0xff, %xmm6, %xmm6 + mulss %xmm6, %xmm1 + + movaps %xmm4, %xmm6 + shufps $0xaa, %xmm6, %xmm6 + mulss %xmm1, %xmm6 + subss %xmm6, %xmm0 + mulss %xmm4, %xmm0 +#endif + +#ifdef LT + movaps 0 * SIZE(AA), %xmm4 + mulss %xmm4, %xmm0 + movaps %xmm4, %xmm6 + shufps $0x55, %xmm6, %xmm6 + mulss %xmm0, %xmm6 + subss %xmm6, %xmm1 + movaps %xmm4, %xmm6 + shufps $0xff, %xmm6, %xmm6 + mulss %xmm6, %xmm1 +#endif + +#ifdef RN + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + mulps %xmm6, %xmm0 +#endif + +#ifdef RT + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + mulps %xmm6, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss %xmm0, 0 
* SIZE(B) + movss %xmm1, 1 * SIZE(B) + + shufps $0x00, %xmm0, %xmm0 + shufps $0x00, %xmm1, %xmm1 + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) +#else + movlps %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm0, 0 * SIZE(CO1) + movss %xmm1, 1 * SIZE(CO1) +#else + movlps %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L170: + testl $1, M + jle .L179 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + leal (AA, %eax, SIZE), AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movss 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movss 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movss 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movss 4 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L172 + ALIGN_2 + +.L171: + mulss %xmm0, %xmm2 + movss 1 * SIZE(AA), %xmm0 + addss %xmm2, %xmm4 + mulss 4 * SIZE(BB), %xmm0 + movss 32 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 2 * SIZE(AA), %xmm0 + mulss 8 * SIZE(BB), %xmm0 + addss %xmm0, %xmm6 + movss 3 * SIZE(AA), %xmm0 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm3 + movss 5 * SIZE(AA), %xmm1 + addss %xmm3, %xmm4 + mulss 20 * SIZE(BB), %xmm1 + movss 48 * SIZE(BB), %xmm3 + addss %xmm1, %xmm5 + movss 6 * SIZE(AA), %xmm1 + mulss 24 * SIZE(BB), %xmm1 + addss %xmm1, %xmm6 + movss 7 * SIZE(AA), %xmm1 + mulss 28 * SIZE(BB), %xmm1 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L171 + ALIGN_2 + +.L172: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L174 + +.L173: + movss 0 * SIZE(AA), %xmm0 + movss 0 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + addss %xmm2, %xmm4 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L173 + ALIGN_4 + +.L174: + addss %xmm5, %xmm4 + addss %xmm7, %xmm6 + addss %xmm6, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax + subl $1, %eax + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ BASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movss 0 * SIZE(B), %xmm1 + subss %xmm4, %xmm1 +#else + movss 0 * SIZE(AA), %xmm0 + subss %xmm4, %xmm0 +#endif + +#if defined(LN) || defined(LT) + mulss 0 * SIZE(AA), %xmm1 +#endif + +#if defined(RN) || defined(RT) + mulss 0 * SIZE(B), %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(B) + + shufps $0x00, %xmm1, %xmm1 + movaps %xmm1, 0 * SIZE(BB) +#else + movss %xmm0, 0 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(CO1) +#else + movss %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl 
KK, %eax + leal (AA, %eax, SIZE), AA +#ifdef LT + addl $1 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 +.L179: +#ifdef LN + movl K, %eax + leal (B, %eax, SIZE), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (B, %eax, SIZE), B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L100: + movl N, %eax + sarl $1, %eax # j = (n >> 1) + movl %eax, J + jle .L999 + ALIGN_2 + +.L01: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, BB + +#ifdef RT + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $1 + BASE_SHIFT, %eax + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + jle .L03 + ALIGN_4 + +.L02: + movsd 0 * SIZE(B), %xmm3 + movhps 2 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm7 + movhps 6 * SIZE(B), %xmm7 + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 +#else + movaps %xmm3, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm3, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm3, %xmm2 + shufps $0xaa, %xmm2, %xmm2 + shufps $0xff, %xmm3, %xmm3 + + movaps %xmm7, %xmm4 + shufps $0x00, %xmm4, %xmm4 + movaps %xmm7, %xmm5 + shufps $0x55, %xmm5, %xmm5 + movaps %xmm7, %xmm6 + shufps $0xaa, %xmm6, %xmm6 + shufps $0xff, %xmm7, %xmm7 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + movaps %xmm4, 16 * SIZE(BB) + movaps %xmm5, 20 * SIZE(BB) + movaps %xmm6, 24 * SIZE(BB) + movaps %xmm7, 28 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $32 * SIZE, BB + decl %eax + BRANCH + jne .L02 + ALIGN_2 + +.L03: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax + BRANCH + jle .L05 + ALIGN_2 + +.L04: + movsd 0 * SIZE(B), %xmm3 + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 +#else + movaps %xmm3, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm3, %xmm1 + shufps $0x55, %xmm1, %xmm1 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + + addl $2 * SIZE, B + addl $8 * SIZE, BB + + decl %eax + jne .L04 + ALIGN_4 + +.L05: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + + movl M, %ebx + sarl $3, %ebx + jle .L30 + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $3 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $3 + BASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 8 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + + 
PREFETCHW 7 * SIZE(CO1) + PREFETCHW 7 * SIZE(CO1, LDC) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L12 + ALIGN_2 + +.L11: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 0 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 16 * SIZE(AA), %xmm0 + + mulps %xmm1, %xmm3 + mulps 12 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 8 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 12 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 12 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 24 * SIZE(AA), %xmm1 + + mulps %xmm0, %xmm2 + mulps 20 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 16 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 20 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 20 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 + + mulps %xmm1, %xmm3 + mulps 28 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 24 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 28 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 28 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 40 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 40 * SIZE(AA), %xmm1 + + mulps %xmm0, %xmm2 + mulps 36 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 32 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 36 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 36 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 48 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 48 * SIZE(AA), %xmm0 + + mulps %xmm1, %xmm3 + mulps 44 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 40 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 44 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 44 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 56 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 56 * SIZE(AA), %xmm1 + + mulps %xmm0, %xmm2 + mulps 52 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 48 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 52 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 52 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 64 * SIZE(AA), %xmm0 + + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 56 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 60 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 72 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 72 * SIZE(AA), %xmm1 + + addl $64 * SIZE, BB + addl $64 * SIZE, AA + decl %eax + jne .L11 + ALIGN_2 + +.L12: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + +.L13: + movaps 4 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 0 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm1 + movaps 4 * SIZE(AA), %xmm0 + addps %xmm1, %xmm5 + movaps 4 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm1 + movaps 8 * SIZE(AA), %xmm0 + addps %xmm1, %xmm7 + + addl $8 * SIZE, AA + addl $8 * SIZE, BB + subl $1, %eax + jg .L13 + ALIGN_4 + +.L14: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $8, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $BASE_SHIFT, %eax + leal (AA, %eax, 8), AA + 
leal (B, %eax, 2), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm0 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm0 + + movaps %xmm6, %xmm1 + unpcklps %xmm7, %xmm6 + unpckhps %xmm7, %xmm1 + + movsd 0 * SIZE(B), %xmm2 + movhps 2 * SIZE(B), %xmm2 + movsd 4 * SIZE(B), %xmm3 + movhps 6 * SIZE(B), %xmm3 + movsd 8 * SIZE(B), %xmm5 + movhps 10 * SIZE(B), %xmm5 + movsd 12 * SIZE(B), %xmm7 + movhps 14 * SIZE(B), %xmm7 + + subps %xmm4, %xmm2 + subps %xmm0, %xmm3 + subps %xmm6, %xmm5 + subps %xmm1, %xmm7 +#else + movaps 0 * SIZE(AA), %xmm0 + movaps 4 * SIZE(AA), %xmm1 + movaps 8 * SIZE(AA), %xmm2 + movaps 12 * SIZE(AA), %xmm3 + + subps %xmm4, %xmm0 + subps %xmm6, %xmm1 + subps %xmm5, %xmm2 + subps %xmm7, %xmm3 +#endif + +#if defined(LN) || defined(LT) + movaps TRMASK, %xmm6 +#endif + +#ifdef LN + movss 63 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm7 + + movaps %xmm7, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 62 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movsd 60 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 58 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 56 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 54 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm7 + + movaps %xmm7, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movsd 52 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 50 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 48 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + + movss 45 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm5 + + movaps %xmm5, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 44 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 42 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 40 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 36 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm5 + + movaps %xmm5, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movsd 34 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 32 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 27 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 26 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 24 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 18 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movsd 16 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 9 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 8 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + 
mulps %xmm0, %xmm2 +#endif + +#ifdef LT + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 1 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movsd 2 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 4 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 6 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 9 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movsd 10 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 12 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 14 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 18 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 19 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 20 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 22 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 27 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movsd 28 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 30 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 36 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm5 + + movaps %xmm5, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 37 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm5 + + movsd 38 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 45 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm5 + + movaps %xmm5, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movsd 46 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 54 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm7 + + movaps %xmm7, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 55 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm7 + + movss 63 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm7 +#endif + +#ifdef RN + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm0 + mulps %xmm6, %xmm1 + + movss 1 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + movaps %xmm6, %xmm5 + + mulps %xmm0, %xmm5 + mulps %xmm1, %xmm6 + + subps %xmm5, %xmm2 + subps %xmm6, %xmm3 + + movss 3 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm2 + mulps %xmm6, %xmm3 +#endif + +#ifdef RT + movss 3 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm2 + mulps %xmm6, %xmm3 + + movss 2 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + movaps %xmm6, %xmm5 + + mulps %xmm2, %xmm5 + mulps %xmm3, %xmm6 + + subps %xmm5, %xmm0 + subps %xmm6, %xmm1 + + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm0 + mulps %xmm6, %xmm1 
+#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(B) + movhps %xmm2, 2 * SIZE(B) + movlps %xmm3, 4 * SIZE(B) + movhps %xmm3, 6 * SIZE(B) + movlps %xmm5, 8 * SIZE(B) + movhps %xmm5, 10 * SIZE(B) + movlps %xmm7, 12 * SIZE(B) + movhps %xmm7, 14 * SIZE(B) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm6 +#else + movaps %xmm2, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm2, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm2, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm2, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm6, 12 * SIZE(BB) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm4 + pshufd $0xff, %xmm3, %xmm6 +#else + movaps %xmm3, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm3, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm3, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm3, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm1, 20 * SIZE(BB) + movaps %xmm4, 24 * SIZE(BB) + movaps %xmm6, 28 * SIZE(BB) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + pshufd $0xaa, %xmm5, %xmm4 + pshufd $0xff, %xmm5, %xmm6 +#else + movaps %xmm5, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm5, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm5, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm5, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + movaps %xmm0, 32 * SIZE(BB) + movaps %xmm1, 36 * SIZE(BB) + movaps %xmm4, 40 * SIZE(BB) + movaps %xmm6, 44 * SIZE(BB) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm7, %xmm0 + pshufd $0x55, %xmm7, %xmm1 + pshufd $0xaa, %xmm7, %xmm4 + pshufd $0xff, %xmm7, %xmm6 +#else + movaps %xmm7, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm7, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm7, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm7, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + movaps %xmm0, 48 * SIZE(BB) + movaps %xmm1, 52 * SIZE(BB) + movaps %xmm4, 56 * SIZE(BB) + movaps %xmm6, 60 * SIZE(BB) +#else + movaps %xmm0, 0 * SIZE(AA) + movaps %xmm1, 4 * SIZE(AA) + movaps %xmm2, 8 * SIZE(AA) + movaps %xmm3, 12 * SIZE(AA) +#endif + +#ifdef LN + subl $8 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, %xmm0 + shufps $0x88, %xmm3, %xmm2 + shufps $0xdd, %xmm3, %xmm0 + + movaps %xmm5, %xmm4 + shufps $0x88, %xmm7, %xmm5 + shufps $0xdd, %xmm7, %xmm4 + + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 2 * SIZE(CO1) + movlps %xmm5, 4 * SIZE(CO1) + movhps %xmm5, 6 * SIZE(CO1) + movlps %xmm0, 0 * SIZE(CO1, LDC) + movhps %xmm0, 2 * SIZE(CO1, LDC) + movlps %xmm4, 4 * SIZE(CO1, LDC) + movhps %xmm4, 6 * SIZE(CO1, LDC) +#else + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm1, 4 * SIZE(CO1) + movhps %xmm1, 6 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO1, LDC) + movhps %xmm2, 2 * SIZE(CO1, LDC) + movlps %xmm3, 4 * SIZE(CO1, LDC) + movhps %xmm3, 6 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $8 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 8), AA +#ifdef LT + addl $16 * SIZE, B +#endif +#endif + +#ifdef LN + subl $8, KK + movl BORIG, B +#endif + +#ifdef LT + addl $8, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $3 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L10 + ALIGN_2 + +.L30: + testl $4, M + jle .L50 + +#ifdef 
LN + movl K, %eax + sall $2 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $2 + BASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L32 + ALIGN_2 + +.L31: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 8 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps 20 * SIZE(BB), %xmm0 + addps %xmm3, %xmm4 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm0, %xmm5 + movaps 12 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps 28 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 48 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm2 + mulps 36 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 40 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 20 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm2 + mulps 44 * SIZE(BB), %xmm1 + addps %xmm2, %xmm6 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm1, %xmm7 + movaps 24 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 52 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 56 * SIZE(BB), %xmm3 + addps %xmm1, %xmm5 + movaps 28 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + addps %xmm3, %xmm6 + movaps 80 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 48 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L31 + ALIGN_2 + +.L32: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L34 + +.L33: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L33 + ALIGN_4 + +.L34: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $BASE_SHIFT, %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 2), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm0 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm0 + + movsd 0 * SIZE(B), %xmm2 + movhps 2 * SIZE(B), %xmm2 + movsd 4 * SIZE(B), %xmm3 + movhps 6 * SIZE(B), %xmm3 + + subps %xmm4, %xmm2 + subps %xmm0, %xmm3 +#else + movaps 0 * SIZE(AA), %xmm0 + movaps 4 * SIZE(AA), %xmm2 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm2 +#endif + +#if defined(LN) || defined(LT) + movaps TRMASK, %xmm6 +#endif + +#ifdef LN + movss 15 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 14 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movsd 12 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 10 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, 
%xmm3 + + movaps %xmm3, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movsd 8 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 5 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 4 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 + +#endif + +#ifdef LT + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 1 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movsd 2 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movss 5 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movsd 6 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movss 10 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm3 + + movaps %xmm3, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 11 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm3 + + movss 15 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm3 +#endif + +#ifdef RN + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm0 + + movss 1 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + movaps %xmm6, %xmm5 + + mulps %xmm0, %xmm5 + subps %xmm5, %xmm2 + + movss 3 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm2 +#endif + +#ifdef RT + movss 3 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm2 + + movss 2 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + movaps %xmm6, %xmm5 + + mulps %xmm2, %xmm5 + + subps %xmm5, %xmm0 + + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(B) + movhps %xmm2, 2 * SIZE(B) + movlps %xmm3, 4 * SIZE(B) + movhps %xmm3, 6 * SIZE(B) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm6 +#else + movaps %xmm2, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm2, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm2, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm2, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm6, 12 * SIZE(BB) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm4 + pshufd $0xff, %xmm3, %xmm6 +#else + movaps %xmm3, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm3, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm3, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm3, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm1, 20 * SIZE(BB) + movaps %xmm4, 24 * SIZE(BB) + movaps %xmm6, 28 * SIZE(BB) +#else + movaps %xmm0, 0 * SIZE(AA) + movaps %xmm2, 4 * SIZE(AA) +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, %xmm0 + shufps $0x88, %xmm3, %xmm2 + shufps $0xdd, %xmm3, %xmm0 + + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 2 * SIZE(CO1) + movlps %xmm0, 0 * SIZE(CO1, LDC) + movhps %xmm0, 2 * SIZE(CO1, LDC) +#else + movlps %xmm0, 0 * 
SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO1, LDC) + movhps %xmm2, 2 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA +#ifdef LT + addl $8 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L50: + testl $2, M + jle .L70 + +#ifdef LN + movl K, %eax + sall $1 + BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $1 + BASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L52 + ALIGN_2 + +.L51: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movaps 44 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 64 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 80 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L51 + ALIGN_2 + +.L52: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L54 + +.L53: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L53 + ALIGN_4 + +.L54: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $BASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm5, %xmm4 + + movsd 0 * SIZE(B), %xmm2 + movhps 2 * SIZE(B), %xmm2 + + subps %xmm4, 
%xmm2 +#else +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd 2 * SIZE(AA), %xmm2 + + subps %xmm4, %xmm0 + subps %xmm5, %xmm2 +#endif + +#if defined(LN) || defined(LT) + movaps TRMASK, %xmm6 +#endif + +#ifdef LN + movss 3 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0xee, %xmm1, %xmm1 + + movss 2 * SIZE(AA), %xmm0 + shufps $0x50, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 + +#endif + +#ifdef LT + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 + + movaps %xmm2, %xmm1 + shufps $0x44, %xmm1, %xmm1 + + movss 1 * SIZE(AA), %xmm0 + shufps $0x05, %xmm0, %xmm0 + mulps %xmm1, %xmm0 + subps %xmm0, %xmm2 + + movss 3 * SIZE(AA), %xmm0 + movaps %xmm6, %xmm1 + shufps $0x00, %xmm0, %xmm1 + mulps %xmm1, %xmm2 +#endif + +#ifdef RN + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm0 + + movss 1 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + movaps %xmm6, %xmm5 + + mulps %xmm0, %xmm5 + subps %xmm5, %xmm2 + + movss 3 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm2 +#endif + +#ifdef RT + movss 3 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm2 + + movss 2 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + movaps %xmm6, %xmm5 + + mulps %xmm2, %xmm5 + + subps %xmm5, %xmm0 + + movss 0 * SIZE(B), %xmm6 + shufps $0x00, %xmm6, %xmm6 + + mulps %xmm6, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(B) + movhps %xmm2, 2 * SIZE(B) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm6 +#else + movaps %xmm2, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm2, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm2, %xmm4 + shufps $0xaa, %xmm4, %xmm4 + movaps %xmm2, %xmm6 + shufps $0xff, %xmm6, %xmm6 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm6, 12 * SIZE(BB) +#else + movlps %xmm0, 0 * SIZE(AA) + movlps %xmm2, 2 * SIZE(AA) +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, %xmm0 + shufps $0x88, %xmm3, %xmm2 + shufps $0xdd, %xmm3, %xmm0 + + movlps %xmm2, 0 * SIZE(CO1) + movlps %xmm0, 0 * SIZE(CO1, LDC) +#else + movlps %xmm0, 0 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L70: + testl $1, M + jle .L99 + +#ifdef LN + movl K, %eax + sall $BASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $BASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + BASE_SHIFT, %eax + leal (BB, %eax, 4), BB +#endif + + movss 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movss 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movss 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movss 4 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax 
+#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L72 + ALIGN_2 + +.L71: + mulss %xmm0, %xmm2 + mulss 4 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 8 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 1 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm2 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 32 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 2 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 20 * SIZE(BB), %xmm0 + addss %xmm3, %xmm4 + movss 24 * SIZE(BB), %xmm3 + addss %xmm0, %xmm5 + movss 3 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 28 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 48 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm2 + mulss 36 * SIZE(BB), %xmm1 + addss %xmm2, %xmm4 + movss 40 * SIZE(BB), %xmm2 + addss %xmm1, %xmm5 + movss 5 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm2 + mulss 44 * SIZE(BB), %xmm1 + addss %xmm2, %xmm6 + movss 64 * SIZE(BB), %xmm2 + addss %xmm1, %xmm7 + movss 6 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 52 * SIZE(BB), %xmm1 + addss %xmm3, %xmm4 + movss 56 * SIZE(BB), %xmm3 + addss %xmm1, %xmm5 + movss 7 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 60 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 80 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L71 + ALIGN_2 + +.L72: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L74 + +.L73: + mulss %xmm0, %xmm2 + mulss 4 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 8 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L73 + ALIGN_4 + +.L74: + addss %xmm6, %xmm4 + addss %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $BASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm5, %xmm4 + +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd 0 * SIZE(B), %xmm2 + + subps %xmm4, %xmm2 +#else + movss 0 * SIZE(AA), %xmm0 + movss 1 * SIZE(AA), %xmm2 + + subss %xmm4, %xmm0 + subss %xmm5, %xmm2 +#endif + +#if defined(LN) || defined(LT) + movaps TRMASK, %xmm6 +#endif + +#if defined(LN) || defined(LT) + movss 0 * SIZE(AA), %xmm0 + shufps $0x00, %xmm6, %xmm0 + mulps %xmm0, %xmm2 +#endif + +#ifdef RN + movss 0 * SIZE(B), %xmm6 + mulss %xmm6, %xmm0 + + movss 1 * SIZE(B), %xmm6 + movaps %xmm6, %xmm5 + + mulss %xmm0, %xmm5 + subss %xmm5, %xmm2 + + movss 3 * SIZE(B), %xmm6 + mulss %xmm6, %xmm2 +#endif + +#ifdef RT + movss 3 * SIZE(B), %xmm6 + mulss %xmm6, %xmm2 + + movss 2 * SIZE(B), %xmm6 + movaps %xmm6, %xmm5 + + mulss %xmm2, %xmm5 + subss %xmm5, %xmm0 + + movss 0 * SIZE(B), %xmm6 + mulss %xmm6, %xmm0 +#endif + +#if defined(LN) || defined(LT) +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd %xmm2, 0 * SIZE(B) + + movaps %xmm2, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm2, %xmm1 + shufps $0x55, %xmm1, %xmm1 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) +#else + movss %xmm0, 0 * SIZE(AA) + movss %xmm2, 1 * SIZE(AA) +#endif + +#ifdef LN + subl $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, %xmm0 + shufps $0x88, %xmm3, %xmm2 + shufps $0xdd, %xmm3, %xmm0 + + movss %xmm2, 0 * SIZE(CO1) + movss %xmm0, 0 * SIZE(CO1, LDC) +#else + movss %xmm0, 0 * SIZE(CO1) + movss 
%xmm2, 0 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 1), AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $BASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L99: +#ifdef LN + movl K, %eax + leal (, %eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + leal (,%eax, SIZE), %eax + leal (B, %eax, 2), B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_2 + +.L999: + movl OLD_STACK, %esp + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/xaxpy.S b/kernel/x86/xaxpy.S new file mode 100644 index 0000000000..554aa0c34f --- /dev/null +++ b/kernel/x86/xaxpy.S @@ -0,0 +1,356 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_ALPHA_R 16 + STACK + ARGS(%esp) +#define STACK_ALPHA_I 32 + STACK + ARGS(%esp) +#define STACK_X 48 + STACK + ARGS(%esp) +#define STACK_INCX 52 + STACK + ARGS(%esp) +#define STACK_Y 56 + STACK + ARGS(%esp) +#define STACK_INCY 60 + STACK + ARGS(%esp) + +#define M %ebx +#define X %esi +#define INCX %ecx +#define Y %edi +#define INCY %edx + +#ifndef CONJ +#define ADD1 fsubrp +#define ADD2 faddp +#else +#define ADD1 faddp +#define ADD2 fsubrp +#endif + + PROLOGUE + + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + FLD STACK_ALPHA_I + FLD STACK_ALPHA_R + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + + sall $ZBASE_SHIFT, INCX + sall $ZBASE_SHIFT, INCY + + testl M, M + jle .L40 + + cmpl $2 * SIZE, INCX + jne .L14 + cmpl $2 * SIZE, INCY + jne .L14 + + movl M, %eax + sarl $2, %eax + jle .L15 + ALIGN_3 + +.L16: + FLD 0 * SIZE(X) + fmul %st(1), %st + FLD 1 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FLD 0 * SIZE(Y) + faddp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 0 * SIZE(X) + fmul %st(2), %st + FLD 1 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FLD 1 * SIZE(Y) + faddp %st, %st(1) + FST 1 * SIZE(Y) + + FLD 2 * SIZE(X) + fmul %st(1), %st + FLD 3 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FLD 2 * SIZE(Y) + faddp %st, %st(1) + FST 2 * SIZE(Y) + + FLD 2 * SIZE(X) + fmul %st(2), %st + FLD 3 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FLD 3 * SIZE(Y) + faddp %st, %st(1) + FST 3 * SIZE(Y) + + FLD 4 * SIZE(X) + fmul %st(1), %st + FLD 5 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FLD 4 * SIZE(Y) + faddp %st, %st(1) + FST 4 * SIZE(Y) + + FLD 4 * SIZE(X) + fmul %st(2), %st + FLD 5 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FLD 5 * SIZE(Y) + faddp %st, %st(1) + FST 5 * SIZE(Y) + + FLD 6 * SIZE(X) + fmul %st(1), %st + FLD 7 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FLD 6 * SIZE(Y) + faddp %st, %st(1) + FST 6 * SIZE(Y) + + FLD 6 * SIZE(X) + fmul %st(2), %st + FLD 7 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FLD 7 * SIZE(Y) + faddp %st, %st(1) + FST 7 * SIZE(Y) + +#ifdef HAVE_3DNOW + prefetch 20 * SIZE(X) + prefetchw 20 * SIZE(Y) +#endif + + addl $8 * SIZE, X + addl $8 * SIZE, Y + decl %eax + jg .L16 + ALIGN_3 + +.L15: + movl M, %eax + andl $3, %eax + jle .L40 + ALIGN_3 + +.L22: + FLD 0 * SIZE(X) + fmul %st(1), %st + FLD 1 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FLD 0 * SIZE(Y) + faddp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 0 * SIZE(X) + fmul %st(2), %st + FLD 1 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FLD 1 * SIZE(Y) + faddp %st, %st(1) + FST 1 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + decl %eax + jg .L22 + jmp .L40 + ALIGN_3 + +.L14: + movl M, %eax + sarl $2, %eax + jle .L28 + ALIGN_3 + +.L29: + FLD 0 * SIZE(X) + fmul %st(1), %st + FLD 1 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FLD 0 * SIZE(Y) + faddp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 0 * SIZE(X) + fmul %st(2), %st + FLD 1 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FLD 1 * SIZE(Y) + faddp %st, %st(1) + FST 1 * SIZE(Y) + + addl INCX, X + addl INCY, Y + + FLD 0 * SIZE(X) + fmul %st(1), %st + FLD 1 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FLD 0 * SIZE(Y) + faddp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 0 * SIZE(X) + 
fmul %st(2), %st + FLD 1 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FLD 1 * SIZE(Y) + faddp %st, %st(1) + FST 1 * SIZE(Y) + + addl INCX, X + addl INCY, Y + + FLD 0 * SIZE(X) + fmul %st(1), %st + FLD 1 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FLD 0 * SIZE(Y) + faddp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 0 * SIZE(X) + fmul %st(2), %st + FLD 1 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FLD 1 * SIZE(Y) + faddp %st, %st(1) + FST 1 * SIZE(Y) + + addl INCX, X + addl INCY, Y + + FLD 0 * SIZE(X) + fmul %st(1), %st + FLD 1 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FLD 0 * SIZE(Y) + faddp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 0 * SIZE(X) + fmul %st(2), %st + FLD 1 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FLD 1 * SIZE(Y) + faddp %st, %st(1) + FST 1 * SIZE(Y) + + addl INCX, X + addl INCY, Y + + decl %eax + jg .L29 + ALIGN_3 + +.L28: + movl M, %eax + andl $3, %eax + jle .L40 + ALIGN_3 + +.L35: + FLD 0 * SIZE(X) + fmul %st(1), %st + FLD 1 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FLD 0 * SIZE(Y) + faddp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 0 * SIZE(X) + fmul %st(2), %st + FLD 1 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FLD 1 * SIZE(Y) + faddp %st, %st(1) + FST 1 * SIZE(Y) + + addl INCX, X + addl INCY, Y + + decl %eax + jg .L35 + ALIGN_3 + +.L40: + ffreep %st(0) + ffreep %st(0) + xorl %eax,%eax + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86/xdot.S b/kernel/x86/xdot.S new file mode 100644 index 0000000000..4a5af46429 --- /dev/null +++ b/kernel/x86/xdot.S @@ -0,0 +1,331 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#if defined(F_INTERFACE) && defined(RETURN_BY_STACK) +#define RESULT 4 + STACK + ARGS(%esp) +#define STACK_N 8 + STACK + ARGS(%esp) +#define STACK_X 12 + STACK + ARGS(%esp) +#define STACK_INCX 16 + STACK + ARGS(%esp) +#define STACK_Y 20 + STACK + ARGS(%esp) +#define STACK_INCY 24 + STACK + ARGS(%esp) +#else +#define STACK_N 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) +#define STACK_Y 16 + STACK + ARGS(%esp) +#define STACK_INCY 20 + STACK + ARGS(%esp) +#endif + + PROLOGUE + + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + +#define N %ebx +#define X %esi +#define INCX %ecx +#define Y %edi +#define INCY %edx + + movl STACK_N, N + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + +#ifdef F_INTERFACE + movl (N),N + movl (INCX),INCX + movl (INCY),INCY +#endif + + testl N, N + jle .L88 + + sall $ZBASE_SHIFT, INCX + sall $ZBASE_SHIFT, INCY + + fldz + fldz + fldz + fldz + + cmpl $2 * SIZE, INCX + jne .L14 + cmpl $2 * SIZE, INCY + jne .L14 + + movl N, %eax + sarl $1, %eax + jle .L15 + ALIGN_3 + +.L16: + FLD 0 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(2) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(2) + FLD 1 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(4) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(4) + FLD 2 * SIZE(X) + + FLD 2 * SIZE(Y) + fmul %st(1) + faddp %st, %st(2) + + FLD 3 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(2) + FLD 3 * SIZE(X) + + FLD 2 * SIZE(Y) + fmul %st(1) + faddp %st, %st(4) + + FLD 3 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(4) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + decl %eax + jg .L16 + ALIGN_3 + +.L15: + movl N, %eax + andl $1, %eax + jle .L27 + ALIGN_3 + +.L22: + FLD 0 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(2) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(2) + FLD 1 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(4) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(4) + jmp .L27 + ALIGN_3 + +.L14: +#ifdef F_INTERFACE + testl INCX, INCX # if (incx < 0) + jge .L28 + + movl N, %eax + decl %eax + imull INCX, %eax + subl %eax, X + ALIGN_3 + +.L28: + testl INCY, INCY # if (incy < 0) + jge .L29 + + movl N, %eax + decl %eax + imull INCY, %eax + subl %eax, Y + ALIGN_3 + +.L29: +#endif + + movl N, %eax + sarl $1, %eax + jle .L30 + ALIGN_3 + + +.L31: + FLD 0 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(2) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(2) + FLD 1 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(4) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(4) + addl INCX, X + + FLD 0 * SIZE(X) + addl INCY, Y + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(2) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(2) + FLD 1 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(4) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(4) + addl INCX, X + addl INCY, Y + + decl %eax + jg .L31 + ALIGN_3 + +.L30: + movl N, %eax + andl $1, %eax + jle .L27 + ALIGN_3 + +.L37: + FLD 0 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(2) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(2) + FLD 1 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, 
%st(4) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(4) + ALIGN_3 + +.L27: +#if defined(F_INTERFACE) && defined(RETURN_BY_STACK) + movl RESULT, %eax +#endif + +#ifndef CONJ + fsubp %st, %st(3) + faddp %st, %st(1) +#else + faddp %st, %st(3) + fsubp %st, %st(1) +#endif + +#if defined(F_INTERFACE) && defined(RETURN_BY_STACK) + FST 1 * SIZE(%eax) + FST 0 * SIZE(%eax) +#else + fxch %st(1) +#endif + + popl %ebx + popl %esi + popl %edi + ret + ALIGN_3 + +.L88: +#if defined(F_INTERFACE) && defined(RETURN_BY_STACK) + movl RESULT, %eax +#endif + + fldz + fldz + +#if defined(F_INTERFACE) && defined(RETURN_BY_STACK) + FST 1 * SIZE(%eax) + FST 0 * SIZE(%eax) +#endif + + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86/xgemm3m_kernel_2x2.S b/kernel/x86/xgemm3m_kernel_2x2.S new file mode 100644 index 0000000000..b844875f12 --- /dev/null +++ b/kernel/x86/xgemm3m_kernel_2x2.S @@ -0,0 +1,796 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if defined(OPTERON) || defined(BARCELONA) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#else +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#define PREFETCHSIZE (5 + 4 * 10) +#define STACK 16 +#define ARGS 16 + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 32 + STACK + ARGS(%esp) +#define A 48 + STACK + ARGS(%esp) +#define ARG_B 52 + STACK + ARGS(%esp) +#define C 56 + STACK + ARGS(%esp) +#define ARG_LDC 60 + STACK + ARGS(%esp) + +#define I %esi +#define B %ebx +#define CO %edi +#define AO %edx +#define BO %ecx +#define LDC %ebp + +#define PREFETCH_OFFSET 48 + + PROLOGUE + + subl $ARGS, %esp # Generate Stack Frame + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(TRMMKERNEL) && !defined(LEFT) + movl OFFSET, %eax + negl %eax + movl %eax, KK +#endif + + movl ARG_LDC, LDC + movl ARG_B, B + + addl $8 * SIZE, A + addl $8 * SIZE, B + + sall $ZBASE_SHIFT, LDC + + movl N, %eax + sarl $1, %eax + movl %eax, J + je .L30 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl A, AO + + movl C, CO + lea (, LDC, 2), %eax + addl %eax, C + + movl M, I + sarl $1, I + je .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl B, BO +#else + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (AO, %eax, 2), AO + leal (B, %eax, 2), BO +#endif + + fldz + fldz + fldz + fldz + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(CO) + prefetchw 2 * SIZE(CO, LDC, 1) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(CO) + prefetchnta 2 * SIZE(CO, LDC, 1) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $2, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(3) + faddp %st, %st(3) + + FLD -6 * SIZE(AO) + + FLD -6 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -5 * SIZE(BO) + fmul %st, %st(2) + + FLD -5 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(3) + faddp %st, %st(3) + + PREFETCH (PREFETCHSIZE + 4) * SIZE(AO) + + FLD -4 * SIZE(AO) + + FLD -4 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -3 * SIZE(BO) + fmul %st, %st(2) + + FLD -3 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(3) + faddp %st, %st(3) + + FLD -2 * SIZE(AO) + + FLD -2 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -1 * SIZE(BO) + fmul %st, %st(2) + + FLD -1 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(3) + faddp %st, %st(3) + + addl $8 * SIZE,AO + addl $8 * SIZE,BO + + decl %eax + jne .L12 + ALIGN_4 + 
+.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + and $3, %eax + je .L18 + ALIGN_4 + +.L16: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(3) + faddp %st, %st(3) + + addl $2 * SIZE,AO + addl $2 * SIZE,BO + + decl %eax + jne .L16 + ALIGN_4 + +.L18: + FLD ALPHA_I + FLD ALPHA_R + + fld %st(2) + fmul %st(1), %st + + FLD 0 * SIZE(CO) + faddp %st, %st(1) + FST 0 * SIZE(CO) + + fld %st(3) + fmul %st(1), %st + + FLD 2 * SIZE(CO) + faddp %st, %st(1) + FST 2 * SIZE(CO) + + fld %st(4) + fmul %st(1), %st + + FLD 0 * SIZE(CO, LDC) + faddp %st, %st(1) + FST 0 * SIZE(CO, LDC) + + fmul %st(5), %st + + FLD 2 * SIZE(CO, LDC) + faddp %st, %st(1) + FST 2 * SIZE(CO, LDC) + + fmul %st, %st(1) + fmul %st, %st(2) + fmul %st, %st(3) + fmulp %st, %st(4) + + FLD 1 * SIZE(CO) + faddp %st, %st(1) + FST 1 * SIZE(CO) + + FLD 3 * SIZE(CO) + faddp %st, %st(1) + FST 3 * SIZE(CO) + + FLD 1 * SIZE(CO, LDC) + faddp %st, %st(1) + FST 1 * SIZE(CO, LDC) + + FLD 3 * SIZE(CO, LDC) + faddp %st, %st(1) + FST 3 * SIZE(CO, LDC) + + addl $4 * SIZE, CO + decl I + jne .L11 + ALIGN_4 + +.L20: + movl M, %eax + andl $1, %eax + je .L29 + ALIGN_4 + +.L21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl B, BO +#else + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (AO, %eax, 1), AO + leal ( B, %eax, 2), BO +#endif + + fldz + fldz + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $2, %eax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + + FLD -6 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -5 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -6 * SIZE(AO) + + FLD -4 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -3 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -5 * SIZE(AO) + + FLD -2 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -1 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addl $4 * SIZE,AO + addl $8 * SIZE,BO + + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + and $3, %eax + je .L28 + ALIGN_4 + +.L26: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addl $1 * SIZE,AO + addl $2 * SIZE,BO + + decl %eax + jne .L26 + ALIGN_4 + +.L28: + FLD ALPHA_I + FLD ALPHA_R + + fld %st(2) + fmul %st(1), %st + + FLD 0 * SIZE(CO) + faddp %st, %st(1) + FST 0 * SIZE(CO) + + fmul %st(3), %st + + FLD 0 * SIZE(CO, LDC) + faddp %st, %st(1) + FST 0 * SIZE(CO, LDC) + + fmul %st, %st(1) + fmulp %st, %st(2) + + FLD 1 * SIZE(CO) + faddp %st, %st(1) + FST 1 * SIZE(CO) + + FLD 1 * SIZE(CO, LDC) + faddp %st, %st(1) + FST 1 * SIZE(CO, LDC) + ALIGN_4 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + movl BO, B + decl J + jne .L01 + ALIGN_4 + +.L30: + movl N, %eax + testl $1, %eax + je 
.L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl A, AO + + movl C, CO + addl LDC, C + + movl M, I + sarl $1, I + je .L40 + ALIGN_4 + +.L31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl B, BO +#else + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (AO, %eax, 2), AO + leal ( B, %eax, 1), BO +#endif + + fldz + fldz + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(CO) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(CO) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $2, %eax + je .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(BO) + FLD -8 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + FLD -6 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -5 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -6 * SIZE(BO) + FLD -4 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -3 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -5 * SIZE(BO) + FLD -2 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -1 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addl $8 * SIZE,AO + addl $4 * SIZE,BO + + decl %eax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + and $3, %eax + je .L38 + ALIGN_4 + +.L36: + FLD -8 * SIZE(BO) + + FLD -8 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addl $2 * SIZE,AO + addl $1 * SIZE,BO + + decl %eax + jne .L36 + ALIGN_4 + +.L38: + FLD ALPHA_I + FLD ALPHA_R + + fld %st(2) + fmul %st(1), %st + + FLD 0 * SIZE(CO) + faddp %st, %st(1) + FST 0 * SIZE(CO) + + fmul %st(3), %st + + FLD 2 * SIZE(CO) + faddp %st, %st(1) + FST 2 * SIZE(CO) + + fmul %st, %st(1) + fmulp %st, %st(2) + + FLD 1 * SIZE(CO) + faddp %st, %st(1) + FST 1 * SIZE(CO) + + FLD 3 * SIZE(CO) + faddp %st, %st(1) + FST 3 * SIZE(CO) + + addl $4 * SIZE, CO + decl I + jne .L31 + ALIGN_4 + +.L40: + movl M, %eax + andl $1, %eax + je .L49 + ALIGN_4 + +.L41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl B, BO +#else + movl KK, %eax + sall $BASE_SHIFT, %eax + leal (AO, %eax, 1), AO + leal ( B, %eax, 1), BO +#endif + + fldz + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $2, %eax + je .L45 + ALIGN_4 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + FLD -8 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -7 * SIZE(AO) + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -6 * SIZE(AO) + FLD -6 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -5 * SIZE(AO) + FLD -5 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + addl $4 * SIZE,AO + addl $4 * SIZE,BO + + decl %eax + jne .L42 + ALIGN_4 + +.L45: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl 
KKK, %eax +#endif + and $3, %eax + je .L48 + ALIGN_4 + +.L46: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + addl $1 * SIZE,AO + addl $1 * SIZE,BO + + decl %eax + jne .L46 + ALIGN_4 + +.L48: + FLD ALPHA_I + FLD ALPHA_R + + fmul %st(2), %st + + FLD 0 * SIZE(CO) + faddp %st, %st(1) + FST 0 * SIZE(CO) + + fmulp %st(1), %st + + FLD 1 * SIZE(CO) + faddp %st, %st(1) + FST 1 * SIZE(CO) + ALIGN_4 + +.L49: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $1, KK +#endif + + movl BO, B + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/xgemm_kernel_1x1.S b/kernel/x86/xgemm_kernel_1x1.S new file mode 100644 index 0000000000..b401bd2068 --- /dev/null +++ b/kernel/x86/xgemm_kernel_1x1.S @@ -0,0 +1,374 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if defined(OPTERON) || defined(BARCELONA) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#else +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#define PREFETCHSIZE (5 + 4 * 10) +#define STACK 16 +#define ARGS 16 + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 32 + STACK + ARGS(%esp) +#define A 48 + STACK + ARGS(%esp) +#define ARG_B 52 + STACK + ARGS(%esp) +#define C 56 + STACK + ARGS(%esp) +#define ARG_LDC 60 + STACK + ARGS(%esp) +#define OFFSET 64 + STACK + ARGS(%esp) + +#define I %esi +#define B %ebx +#define CO %edi +#define AO %edx +#define BO %ecx +#define LDC %ebp + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define ADD1 faddp +#define ADD2 fsubrp +#define ADD3 faddp +#define ADD4 faddp +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define ADD1 faddp +#define ADD2 faddp +#define ADD3 fsubrp +#define ADD4 faddp +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define ADD1 faddp +#define ADD2 faddp +#define ADD3 faddp +#define ADD4 fsubrp +#else +#define ADD1 faddp +#define ADD2 fsubrp +#define ADD3 fsubrp +#define ADD4 fsubrp +#endif + +#define PREFETCH_OFFSET 48 + + PROLOGUE + + subl $ARGS, %esp # Generate Stack Frame + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(TRMMKERNEL) && !defined(LEFT) + movl OFFSET, %eax + negl %eax + movl %eax, KK +#endif + + movl ARG_LDC, LDC + movl ARG_B, B + + addl $8 * SIZE, A + addl $8 * SIZE, B + + sall $ZBASE_SHIFT, LDC + + cmpl $0, M + jle .L999 + + movl N, %eax + movl %eax, J + testl %eax, %eax + jle .L999 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl A, AO + + movl C, CO + addl LDC, C + + movl M, I + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl B, BO +#else + movl KK, %eax + sall $ZBASE_SHIFT, %eax + leal (AO, %eax, 1), AO + leal (B, %eax, 1), BO +#endif + + fldz + fldz + fldz + fldz + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(CO) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(CO) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $2, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + ADD1 %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + ADD2 %st, %st(6) + ADD3 %st, %st(3) + ADD4 %st, %st(3) + + FLD -6 * SIZE(AO) + + FLD -6 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + ADD1 %st, %st(3) + + FLD -5 * SIZE(BO) + fmul %st, %st(2) + + FLD -5 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + ADD2 %st, %st(6) + ADD3 %st, %st(3) + ADD4 %st, %st(3) + + PREFETCH (PREFETCHSIZE + 4) * SIZE(AO) + + FLD -4 * SIZE(AO) + + FLD -4 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + ADD1 %st, %st(3) + + FLD -3 * SIZE(BO) + 
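+ # ADD1..ADD4 (faddp/fsubrp chosen above per conjugation case) accumulate the four partial products of each complex a*b pair on the x87 stack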
fmul %st, %st(2) + + FLD -3 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + ADD2 %st, %st(6) + ADD3 %st, %st(3) + ADD4 %st, %st(3) + + FLD -2 * SIZE(AO) + + FLD -2 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + ADD1 %st, %st(3) + + FLD -1 * SIZE(BO) + fmul %st, %st(2) + + FLD -1 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + ADD2 %st, %st(6) + ADD3 %st, %st(3) + ADD4 %st, %st(3) + + addl $8 * SIZE,AO + addl $8 * SIZE,BO + + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + and $3, %eax + je .L18 + ALIGN_4 + +.L16: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + ADD1 %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + ADD2 %st, %st(6) + ADD3 %st, %st(3) + ADD4 %st, %st(3) + + addl $2 * SIZE,AO + addl $2 * SIZE,BO + + decl %eax + jne .L16 + ALIGN_4 + +.L18: + faddp %st, %st(3) + faddp %st, %st(1) + +#ifndef TRMMKERNEL + FLD ALPHA_R + fld %st + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + FLD ALPHA_I + fmul %st, %st(3) + fmulp %st, %st(4) + + fsubp %st, %st(2) + faddp %st, %st(2) + + FLD 0 * SIZE(CO) + faddp %st, %st(1) + FST 0 * SIZE(CO) + + FLD 1 * SIZE(CO) + faddp %st, %st(1) + FST 1 * SIZE(CO) +#else + FST 1 * SIZE(CO) + FST 0 * SIZE(CO) +#endif + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + sall $ZBASE_SHIFT, %eax + leal (AO, %eax, 1), AO + leal (BO, %eax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + + addl $2 * SIZE, CO + decl I + jne .L11 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $1, KK +#endif + + movl BO, B + decl J + jne .L01 + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/xgemv_n.S b/kernel/x86/xgemv_n.S new file mode 100644 index 0000000000..0bf44455bb --- /dev/null +++ b/kernel/x86/xgemv_n.S @@ -0,0 +1,350 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef PENTIUM +#define P 32 +#endif + +#if defined(PENTIUM4) || defined(ATHLON) +#define P (DTB_ENTRIES / 2) +#endif + +#ifndef P +#define P DTB_ENTRIES +#endif + +#define STACK 16 +#define ARGS 16 + +#define PLDA_M 0 + STACK(%esp) +#define XP 4 + STACK(%esp) +#define MIN_N 8 + STACK(%esp) +#define IS 12 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) + +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 32 + STACK + ARGS(%esp) +#define A 48 + STACK + ARGS(%esp) +#define LDA 52 + STACK + ARGS(%esp) +#define X 56 + STACK + ARGS(%esp) +#define INCX 60 + STACK + ARGS(%esp) +#define Y 64 + STACK + ARGS(%esp) +#define INCY 68 + STACK + ARGS(%esp) +#define BUFFER 72 + STACK + ARGS(%esp) + + PROLOGUE + + subl $ARGS, %esp + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + FLD ALPHA_I + FLD ALPHA_R + + movl X, %edi + + movl LDA, %ebx + sall $ZBASE_SHIFT, %ebx + + movl $0, IS + + movl M, %ecx + movl N, %esi + + test %ecx, %ecx + jle .L79 # goto END + test %esi, %esi + jle .L79 # goto END + + movl INCY, %eax + sall $ZBASE_SHIFT, %eax + movl %eax, INCY + + movl LDA, %eax + imull $P, %eax # P * lda + subl M ,%eax # P * lda - m + sall $ZBASE_SHIFT, %eax + movl %eax, PLDA_M + ALIGN_2 + +.L32: + movl IS, %esi + movl $P, %edx + movl N, %eax + subl %esi,%eax # n - is + cmpl %edx, %eax +#ifdef PENTIUM + jle .L33 + movl %edx, %eax +.L33: +#else + cmovg %edx, %eax +#endif + + movl %eax, MIN_N + + sall $ZBASE_SHIFT, %esi + leal (%edi, %esi, 1), %esi + movl %esi, XP + + movl INCX, %edx + cmpl $1, %edx + je .L34 # if incx == 1 goto L34 + + movl BUFFER, %esi + movl %esi, XP # xp = buffer + + sall $ZBASE_SHIFT, %edx + sarl $1,%eax + jle .L35 + ALIGN_2 + +.L36: + FLD 0 * SIZE(%edi) + FLD 1 * SIZE(%edi) + addl %edx,%edi # x += incx + FLD 0 * SIZE(%edi) + FLD 1 * SIZE(%edi) + addl %edx,%edi # x += incx + + FST 3 * SIZE(%esi) + FST 2 * SIZE(%esi) + FST 1 * SIZE(%esi) + FST 0 * SIZE(%esi) + + addl $4 * SIZE, %esi # xp += 4 + decl %eax + jg .L36 + ALIGN_3 + +.L35: + movl MIN_N, %eax + andl $1, %eax + jle .L34 + + FLD 0 * SIZE(%edi) + FLD 1 * SIZE(%edi) + addl %edx,%edi # x += incx + FST 1 * SIZE(%esi) + FST 0 * SIZE(%esi) + ALIGN_3 + +/* Main Routine */ +.L34: + movl Y, %ecx # c_offset + movl M, %ebp # j = m + ALIGN_3 + +.L61: + movl A, %edx # a_offset = a + fldz + addl $2 * SIZE, A # a++ + fldz + movl XP,%esi + fldz + movl MIN_N,%eax + fldz + FLD (%esi) # bt1 = *(b_offset + 0) + sarl $1, %eax + jle .L64 + ALIGN_3 + +.L65: +#ifdef PENTIUM4 + prefetchnta 16 * SIZE(%esi) 
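+ # non-temporal prefetch of the x vector (or its packed buffer copy) a few elements ahead, Pentium 4 only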
+#endif + + FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) + fmul %st(1) # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + + FLD 1 * SIZE(%edx) # bt1 *= *(a_offset + 1) + fmulp %st, %st(1) +#ifndef CONJ + faddp %st, %st(2) # ct2 += bt1 +#else + fsubrp %st, %st(2) # ct2 -= bt1 +#endif + FLD 1 * SIZE(%esi) # bt1 = *(b_offset + 1) + + FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) + fmul %st(1) # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + + FLD 1 * SIZE(%edx) # bt1 *= *(a_offset + 1) + fmulp %st, %st(1) + faddp %st, %st(4) # ct4 += bt1 + FLD 2 * SIZE(%esi) # bt1 = *(b_offset + 2) + + addl $2 * SIZE, %esi # b_offset += 2 + addl %ebx, %edx # a_offset += lda + + FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) + fmul %st(1) # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + + FLD 1 * SIZE(%edx) # bt1 *= *(a_offset + 1) + fmulp %st, %st(1) +#ifndef CONJ + faddp %st, %st(2) # ct2 += bt1 +#else + fsubrp %st, %st(2) # ct2 -= bt1 +#endif + FLD 1 * SIZE(%esi) # bt1 = *(b_offset + 1) + + FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) + fmul %st(1) # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + + FLD 1 * SIZE(%edx) # bt1 *= *(a_offset + 1) + fmulp %st, %st(1) + faddp %st, %st(4) # ct4 += bt1 + FLD 2 * SIZE(%esi) # bt1 = *(b_offset + 2) + + addl $2 * SIZE, %esi # b_offset += 2 + addl %ebx, %edx # a_offset += lda + + decl %eax + jg .L65 + +.L64: + movl MIN_N, %eax + andl $1, %eax + jle .L70 + ALIGN_2 + +.L71: + FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) + fmul %st(1) # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + + FLD 1 * SIZE(%edx) # bt1 *= *(a_offset + 1) + fmulp %st, %st(1) +#ifndef CONJ + faddp %st, %st(2) # ct2 += bt1 +#else + fsubrp %st, %st(2) # ct2 -= bt1 +#endif + FLD 1 * SIZE(%esi) # bt1 = *(b_offset + 1) + + FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) + fmul %st(1) # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + + FLD 1 * SIZE(%edx) # bt1 *= *(a_offset + 1) + fmulp %st, %st(1) + faddp %st, %st(4) # ct4 += bt1 + fldz + ALIGN_2 + +.L70: + ffreep %st(0) + +#ifndef XCONJ +#ifndef CONJ + fsubp %st, %st(3) + faddp %st, %st(1) +#else + faddp %st, %st(3) + faddp %st, %st(1) +#endif +#else +#ifndef CONJ + faddp %st, %st(3) + fsubp %st, %st(1) +#else + fsubp %st, %st(3) + fsubp %st, %st(1) +#endif +#endif + + fld %st(0) # ct4 = ct2 + fmul %st(4) + fld %st(2) + fmul %st(4) + fsubp %st, %st(1) + + movl INCY, %eax + + FLD 0 * SIZE(%ecx) + faddp %st, %st(1) + FST 0 * SIZE(%ecx) + + fmul %st(2) + fxch %st(1) + fmul %st(3) + faddp %st, %st(1) + + FLD 1 * SIZE(%ecx) + faddp %st, %st(1) + FST 1 * SIZE(%ecx) + + addl %eax, %ecx + decl %ebp + jg .L61 + +.L60: + movl PLDA_M, %esi + addl %esi, A # a += P * lda - m + addl $P, IS + movl N, %esi + cmpl %esi,IS + jl .L32 + +.L79: + ffreep %st(0) + ffreep %st(0) + + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/xgemv_t.S b/kernel/x86/xgemv_t.S new file mode 100644 index 0000000000..1397a10f2b --- /dev/null +++ b/kernel/x86/xgemv_t.S @@ -0,0 +1,369 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef PENTIUM +#define P 88 +#endif + +#ifndef P +#define P 400 +#endif + +#define STACK 16 +#define ARGS 24 + +#define NLDA 0 + STACK(%esp) +#define XP 4 + STACK(%esp) +#define MIN_M 8 + STACK(%esp) +#define J 12 + STACK(%esp) +#define IS 16 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) + +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 32 + STACK + ARGS(%esp) +#define A 48 + STACK + ARGS(%esp) +#define LDA 52 + STACK + ARGS(%esp) +#define X 56 + STACK + ARGS(%esp) +#define INCX 60 + STACK + ARGS(%esp) +#define Y 64 + STACK + ARGS(%esp) +#define INCY 68 + STACK + ARGS(%esp) +#define BUFFER 72 + STACK + ARGS(%esp) + + + PROLOGUE + + subl $ARGS, %esp + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + FLD ALPHA_I + FLD ALPHA_R + + movl X, %edi # X + + movl $0, IS + + movl M, %ebx + movl N, %ecx + testl %ebx, %ebx + jle .L79 + + testl %ecx, %ecx + jle .L79 + + movl INCX, %esi + sall $ZBASE_SHIFT, %esi + movl %esi, INCX + + movl INCY, %esi + sall $ZBASE_SHIFT, %esi + movl %esi, INCY + + movl LDA, %ebx + + movl N, %eax + imull %ebx, %eax + movl $P, %esi + subl %eax, %esi + sall $ZBASE_SHIFT, %esi + movl %esi, NLDA + + movl %ebx, %esi + sall $ZBASE_SHIFT, %esi + movl %esi, LDA + ALIGN_2 + +.L32: + movl IS, %esi + + movl $P, %edx + movl M, %eax + subl %esi, %eax + cmpl %edx, %eax +#ifdef PENTIUM + jle .L33 + movl %edx, %eax +.L33: +#else + cmovg %edx, %eax +#endif + movl %eax, MIN_M + + movl IS, %ecx + sall $ZBASE_SHIFT, %ecx + leal (%edi, %ecx, 1), %ecx # xp = x + is + movl INCX, %ebx + movl %ecx, XP + cmpl $2 * SIZE, %ebx + je .L34 + + movl BUFFER, %esi + movl MIN_M, %eax + movl %esi, XP + sarl $1, %eax + jle .L35 + + ALIGN_3 + +.L36: + FLD 0 * SIZE(%edi) + FLD 1 * SIZE(%edi) + addl %ebx,%edi # x += incx + FLD 0 * SIZE(%edi) + FLD 1 * SIZE(%edi) + addl %ebx,%edi # x += incx + + FST 3 * SIZE(%esi) + FST 2 * SIZE(%esi) + FST 1 * SIZE(%esi) + FST 0 * SIZE(%esi) + + addl $4 * SIZE, %esi # xp += 4 + decl %eax + jg .L36 + ALIGN_3 + +.L35: + movl MIN_M, 
%eax + andl $1,%eax + jle .L34 + + FLD 0 * SIZE(%edi) + FLD 1 * SIZE(%edi) + addl %ebx,%edi # x += incx + FST 1 * SIZE(%esi) + FST 0 * SIZE(%esi) + ALIGN_3 + +/* Main Routine */ + +.L34: + movl Y, %ebp # coffset = y + + movl N, %ecx + testl %ecx, %ecx + jle .L60 + ALIGN_2 + +.L61: + movl A, %ebx # a_offset = a + fldz # ct1 = ZERO + movl LDA, %edx + fldz # ct1 = ZERO + + addl %ebx, %edx + fldz # ct1 = ZERO + movl %edx, A + fldz # ct1 = ZERO + + movl XP, %esi + + FLD (%esi) # bt1 = *(b_offset + 0) + + movl MIN_M, %eax + sarl $1, %eax + jle .L64 + ALIGN_3 + +#define PRESIZE 8 + +.L65: +#ifdef HAS_PREFETCH + prefetcht0 PRESIZE * SIZE(%ebx) + prefetcht0 PRESIZE * SIZE(%esi) +#endif + + FLD 0 * SIZE(%ebx) # at1 = *(a_offset + 0) + fmul %st(1) # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + + FLD 1 * SIZE(%ebx) # bt1 *= *(a_offset + 1) + fmulp %st, %st(1) +#ifndef CONJ + faddp %st, %st(2) # ct2 += bt1 +#else + fsubrp %st, %st(2) # ct2 -= bt1 +#endif + FLD 1 * SIZE(%esi) # bt1 = *(b_offset + 1) + + FLD 0 * SIZE(%ebx) # at1 = *(a_offset + 0) + fmul %st(1) # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + + FLD 1 * SIZE(%ebx) # bt1 *= *(a_offset + 1) + fmulp %st, %st(1) + faddp %st, %st(4) # ct4 += bt1 + FLD 2 * SIZE(%esi) # bt1 = *(b_offset + 1) + + FLD 2 * SIZE(%ebx) # at1 = *(a_offset + 0) + fmul %st(1) # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + + FLD 3 * SIZE(%ebx) # bt1 *= *(a_offset + 1) + fmulp %st, %st(1) +#ifndef CONJ + faddp %st, %st(2) # ct2 += bt1 +#else + fsubrp %st, %st(2) # ct2 -= bt1 +#endif + FLD 3 * SIZE(%esi) # bt1 = *(b_offset + 1) + + FLD 2 * SIZE(%ebx) # at1 = *(a_offset + 0) + fmul %st(1) # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + + FLD 3 * SIZE(%ebx) # bt1 *= *(a_offset + 1) + fmulp %st, %st(1) + faddp %st, %st(4) # ct4 += bt1 + FLD 4 * SIZE(%esi) # bt1 = *(b_offset + 1) + + addl $4 * SIZE, %esi + addl $4 * SIZE, %ebx + decl %eax + jg .L65 + ALIGN_3 + +.L64: + movl MIN_M, %eax + andl $1, %eax + jle .L70 + ALIGN_3 + +.L71: + FLD 0 * SIZE(%ebx) # at1 = *(a_offset + 0) + fmul %st(1) # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + + FLD 1 * SIZE(%ebx) # bt1 *= *(a_offset + 1) + fmulp %st, %st(1) +#ifndef CONJ + faddp %st, %st(2) # ct2 += bt1 +#else + fsubrp %st, %st(2) # ct2 -= bt1 +#endif + FLD 1 * SIZE(%esi) # bt1 = *(b_offset + 1) + + FLD 0 * SIZE(%ebx) # at1 = *(a_offset + 0) + fmul %st(1) # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + + FLD 1 * SIZE(%ebx) # bt1 *= *(a_offset + 1) + fmulp %st, %st(1) + faddp %st, %st(4) # ct4 += bt1 + fldz + ALIGN_3 + +.L70: + ffreep %st(0) + +#ifndef XCONJ +#ifndef CONJ + fsubp %st, %st(3) + faddp %st, %st(1) +#else + faddp %st, %st(3) + faddp %st, %st(1) +#endif +#else +#ifndef CONJ + faddp %st, %st(3) + fsubp %st, %st(1) +#else + fsubp %st, %st(3) + fsubp %st, %st(1) +#endif +#endif + + fld %st(0) # ct4 = ct2 + fmul %st(4) + fld %st(2) + fmul %st(4) + fsubp %st, %st(1) + + FLD 0 * SIZE(%ebp) + faddp %st, %st(1) + FST 0 * SIZE(%ebp) + + fmul %st(2) + fxch %st(1) + fmul %st(3) + faddp %st, %st(1) + + FLD 1 * SIZE(%ebp) + faddp %st, %st(1) + FST 1 * SIZE(%ebp) + addl INCY, %ebp + + decl %ecx + jg .L61 + ALIGN_3 + +.L60: + movl A, %ebx + addl NLDA, %ebx + movl %ebx, A + + addl $P, IS + movl M, %esi + cmpl %esi, IS + jl .L32 + ALIGN_3 + +.L79: + ffreep %st(0) + ffreep %st(0) + + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE + diff --git a/kernel/x86/xtrsm_kernel_LT_1x1.S b/kernel/x86/xtrsm_kernel_LT_1x1.S new file mode 100644 index 0000000000..e05266f7c6 --- /dev/null +++ 
b/kernel/x86/xtrsm_kernel_LT_1x1.S @@ -0,0 +1,493 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef OPTERON +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#else +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#define PREFETCHSIZE (5 + 4 * 10) +#define STACK 16 +#define ARGS 16 + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) +#define AORIG 12 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 32 + STACK + ARGS(%esp) +#define A 48 + STACK + ARGS(%esp) +#define ARG_B 52 + STACK + ARGS(%esp) +#define C 56 + STACK + ARGS(%esp) +#define ARG_LDC 60 + STACK + ARGS(%esp) +#define OFFSET 64 + STACK + ARGS(%esp) + +#define I %esi +#define B %ebx +#define CO %edi +#define AO %edx +#define BO %ecx +#define LDC %ebp + +#ifndef CONJ +#define ADD1 faddp +#define ADD2 fsubrp +#define ADD3 faddp +#define ADD4 faddp +#elif defined(LN) || defined(LT) +#define ADD1 faddp +#define ADD2 faddp +#define ADD3 fsubrp +#define ADD4 faddp +#else +#define ADD1 faddp +#define ADD2 faddp +#define ADD3 faddp +#define ADD4 fsubrp +#endif + +#define PREFETCH_OFFSET 48 + + PROLOGUE + + subl $ARGS, %esp # Generate Stack Frame + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_LDC, LDC + movl ARG_B, B + + sall $ZBASE_SHIFT, LDC + + addl $8 * SIZE, A + addl $8 * SIZE, B + +#ifdef LN + movl M, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + sall $ZBASE_SHIFT, %eax + imull K, %eax + addl %eax, B + + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + movl OFFSET, %eax + negl %eax + movl %eax, KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + cmpl $0, M + jle .L999 + + movl N, %eax + movl %eax, J + testl %eax, %eax + jle .L999 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movl A, AO +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, B +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO +#ifndef RT + addl LDC, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, I + ALIGN_4 + +.L11: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $ZBASE_SHIFT, %eax + movl AORIG, AO + leal (AO, %eax, 1), AO + leal (B, %eax, 1), BO +#else + movl B, BO +#endif + + fldz + fldz + fldz + fldz + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(CO) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(CO) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + ADD1 %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + ADD2 %st, %st(6) + ADD3 %st, %st(3) + ADD4 %st, %st(3) + + FLD -6 * SIZE(AO) + + FLD -6 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + ADD1 %st, %st(3) + + FLD -5 * SIZE(BO) + fmul %st, %st(2) + + FLD -5 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + ADD2 %st, %st(6) + ADD3 %st, %st(3) + ADD4 %st, %st(3) + + PREFETCH (PREFETCHSIZE + 4) * SIZE(AO) + + 
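+ # second half of the 4x unrolled k loop: the remaining two complex multiply-accumulates of this iteration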
FLD -4 * SIZE(AO) + + FLD -4 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + ADD1 %st, %st(3) + + FLD -3 * SIZE(BO) + fmul %st, %st(2) + + FLD -3 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + ADD2 %st, %st(6) + ADD3 %st, %st(3) + ADD4 %st, %st(3) + + FLD -2 * SIZE(AO) + + FLD -2 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + ADD1 %st, %st(3) + + FLD -1 * SIZE(BO) + fmul %st, %st(2) + + FLD -1 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + ADD2 %st, %st(6) + ADD3 %st, %st(3) + ADD4 %st, %st(3) + + addl $8 * SIZE,AO + addl $8 * SIZE,BO + + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + and $3, %eax + je .L18 + ALIGN_4 + +.L16: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + ADD1 %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + ADD2 %st, %st(6) + ADD3 %st, %st(3) + ADD4 %st, %st(3) + + addl $2 * SIZE,AO + addl $2 * SIZE,BO + + decl %eax + jne .L16 + ALIGN_4 + +.L18: + faddp %st, %st(3) + faddp %st, %st(1) + + fxch %st(1) + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + sall $ZBASE_SHIFT, %eax + + movl AORIG, AO + leal (AO, %eax, 1), AO + leal (B, %eax, 1), BO +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(BO) + fsubp %st, %st(1) + FLD -7 * SIZE(BO) + fsubp %st, %st(2) +#else + FLD -8 * SIZE(AO) + fsubp %st, %st(1) + FLD -7 * SIZE(AO) + fsubp %st, %st(2) +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(AO) + fmul %st(1), %st + FLD -8 * SIZE(AO) + fmul %st(3), %st + FLD -7 * SIZE(AO) + fmulp %st, %st(3) + FLD -7 * SIZE(AO) + fmulp %st, %st(4) +#endif + +#if defined(RN) || defined(RT) + FLD -8 * SIZE(BO) + fmul %st(1), %st + FLD -8 * SIZE(BO) + fmul %st(3), %st + FLD -7 * SIZE(BO) + fmulp %st, %st(3) + FLD -7 * SIZE(BO) + fmulp %st, %st(4) +#endif + +#ifndef CONJ + faddp %st, %st(2) + fsubp %st, %st(2) +#else + fsubp %st, %st(2) + faddp %st, %st(2) +#endif + +#if defined(LN) || defined(LT) + fld %st + FST -7 * SIZE(BO) + fxch %st(1) + fld %st + FST -8 * SIZE(BO) +#else + fld %st + FST -7 * SIZE(AO) + fxch %st(1) + fld %st + FST -8 * SIZE(AO) +#endif + +#ifdef LN + subl $2 * SIZE, CO +#endif + + FST 0 * SIZE(CO) + FST 1 * SIZE(CO) + +#ifndef LN + addl $2 * SIZE, CO +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + leal (AO, %eax, 1), AO + leal (BO, %eax, 1), BO +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl I + jne .L11 + +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + leal (B, %eax, 1), B +#endif + +#if defined(LT) || defined(RN) + movl BO, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + + decl J + jne .L01 + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/zamax.S b/kernel/x86/zamax.S new file mode 100644 index 0000000000..3056c1e62a --- /dev/null +++ b/kernel/x86/zamax.S @@ -0,0 +1,261 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 8 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) + + PROLOGUE + +#define M %ebx +#define INCX %esi +#define X %ecx +#define I %edx + +#ifndef USE_MIN +#define FMOV fcmovbe +#else +#define FMOV fcmovnbe +#endif + +#include "l1param.h" + + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_M, M + movl STACK_INCX, INCX + movl STACK_X, X + +#ifdef F_INTERFACE + movl (M), M + movl (INCX), INCX +#endif + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + sall $ZBASE_SHIFT, INCX + + fldz + + testl M, M + jle .L999 + testl INCX, INCX + jle .L999 + + fstp %st(0) + + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + faddp %st, %st(1) + addl INCX, X + decl M + jle .L999 + + cmpl $2 * SIZE, INCX + jne .L40 + + movl M, I + sarl $2, I + jle .L20 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + faddp %st, %st(1) + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + FLD 2 * SIZE(X) + fabs + FLD 3 * SIZE(X) + fabs + faddp %st, %st(1) + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + FLD 4 * SIZE(X) + fabs + FLD 5 * SIZE(X) + fabs + faddp %st, %st(1) + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + FLD 6 * SIZE(X) + fabs + FLD 7 * SIZE(X) + fabs + faddp %st, %st(1) + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + addl $8 * SIZE, X + + decl I + jg .L10 + ALIGN_4 + +.L20: + movl M, I + andl $3, I + jle .L999 + ALIGN_4 + + +.L21: + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + faddp %st, %st(1) + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + addl $2 * SIZE, X + decl I + jg .L21 + jmp .L999 + ALIGN_4 + +.L40: + movl M, I + sarl $2, I + jle .L60 + ALIGN_4 + +.L50: + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + addl INCX, X + 
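+ # running extremum of |re| + |im|: fcomi compares against %st(1), FMOV (fcmovbe, or fcmovnbe when USE_MIN is set) keeps the winner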
faddp %st, %st(1) + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + addl INCX, X + faddp %st, %st(1) + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + addl INCX, X + faddp %st, %st(1) + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + addl INCX, X + faddp %st, %st(1) + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + decl I + jg .L50 + ALIGN_4 + +.L60: + movl M, I + andl $3, I + jle .L999 + ALIGN_4 + + +.L61: + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + faddp %st, %st(1) + fcomi %st(1), %st + FMOV %st(1), %st(0) + fstp %st(1) + + addl INCX, X + decl I + jg .L61 + ALIGN_4 + +.L999: + popl %ebx + popl %esi + ret + + EPILOGUE diff --git a/kernel/x86/zamax_sse.S b/kernel/x86/zamax_sse.S new file mode 100644 index 0000000000..60dd25b87b --- /dev/null +++ b/kernel/x86/zamax_sse.S @@ -0,0 +1,387 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) + +#define RET %eax +#define M %ebx +#define X %ecx +#define INCX %edx +#define I %esi +#define MM %ebp +#define XX %edi +#define TEMP %ebx + +#ifdef USE_MIN +#define maxps minps +#define maxss minss +#endif + +#ifndef HAVE_SSE2 +#define pxor xorps +#define movsd movlps +#endif + +#include "l1param.h" + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + + pxor %xmm0, %xmm0 + pxor %xmm7, %xmm7 + xor RET, RET + testl M, M + jle .L999 + testl INCX, INCX + jle .L999 + + sall $ZBASE_SHIFT, INCX + movl M, MM + movl X, XX + +#ifdef USE_ABS +#ifndef HAVE_SSE2 + subl $8, %esp + movl $0x7fffffff, (%esp) + movss (%esp), %xmm7 + shufps $0, %xmm7, %xmm7 + addl $8, %esp +#else + cmpeqps %xmm7, %xmm7 + psrld $1, %xmm7 /* Generate USE_ABS */ +#endif +#endif + + movss 0 * SIZE(XX), %xmm0 + movss 1 * SIZE(XX), %xmm1 + addl INCX, XX + decl MM + +#ifdef USE_ABS + andps %xmm7, %xmm0 + andps %xmm7, %xmm1 +#endif + addps %xmm1, %xmm0 + shufps $0, %xmm0, %xmm0 + cmpl $2 * SIZE, INCX + jne .L70 + +.L30: + movl MM, I + sarl $3, I + jle .L35 + ALIGN_4 + +.L31: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) +#endif + + movsd 0 * SIZE(XX), %xmm1 + movhps 2 * SIZE(XX), %xmm1 + movsd 4 * SIZE(XX), %xmm2 + movhps 6 * SIZE(XX), %xmm2 + + movaps %xmm1, %xmm3 + + shufps $0x88, %xmm2, %xmm1 + shufps $0xdd, %xmm2, %xmm3 + +#ifdef USE_ABS + andps %xmm7, %xmm1 + andps %xmm7, %xmm3 + addps %xmm3, %xmm1 + maxps %xmm1, %xmm0 +#endif + + movsd 8 * SIZE(XX), %xmm1 + movhps 10 * SIZE(XX), %xmm1 + movsd 12 * SIZE(XX), %xmm2 + movhps 14 * SIZE(XX), %xmm2 + + movaps %xmm1, %xmm3 + + shufps $0x88, %xmm2, %xmm1 + shufps $0xdd, %xmm2, %xmm3 + +#ifdef USE_ABS + andps %xmm7, %xmm1 + andps %xmm7, %xmm3 +#endif + + addps %xmm3, %xmm1 + maxps %xmm1, %xmm0 + + addl $16 * SIZE, XX + decl I + jg .L31 + ALIGN_4 + +.L35: + andl $7, MM + jle .L40 + + testl $4, MM + je .L36 + + movsd 0 * SIZE(XX), %xmm1 + movhps 2 * SIZE(XX), %xmm1 + movsd 4 * SIZE(XX), %xmm2 + movhps 6 * SIZE(XX), %xmm2 + + movaps %xmm1, %xmm3 + + shufps $0x88, %xmm2, %xmm1 + shufps $0xdd, %xmm2, %xmm3 + +#ifdef USE_ABS + andps %xmm7, %xmm1 + andps %xmm7, %xmm3 +#endif + addps %xmm3, %xmm1 + maxps %xmm1, %xmm0 + + addl $8 * SIZE, XX + ALIGN_3 + +.L36: + testl $2, MM + je .L37 + + movss 0 * SIZE(XX), %xmm1 + movss 1 * SIZE(XX), %xmm2 + movss 2 * SIZE(XX), %xmm3 + movss 3 * SIZE(XX), %xmm4 +#ifdef USE_ABS + andps %xmm7, %xmm1 + andps %xmm7, %xmm2 + andps %xmm7, %xmm3 + andps %xmm7, %xmm4 +#endif + addps %xmm2, %xmm1 + addps %xmm4, %xmm3 + maxss %xmm1, %xmm0 + maxss %xmm3, %xmm0 + addl $4 * SIZE, XX + ALIGN_3 + +.L37: + testl $1, MM + je .L40 + + movss 0 * SIZE(XX), %xmm1 + movss 1 * SIZE(XX), %xmm2 +#ifdef USE_ABS + andps %xmm7, %xmm1 + andps %xmm7, %xmm2 +#endif + addps %xmm2, %xmm1 + maxss %xmm1, %xmm0 + ALIGN_4 + +.L40: + movaps %xmm0, %xmm1 + movhlps %xmm0, %xmm0 + maxps %xmm1, %xmm0 + movaps %xmm0, %xmm1 + shufps $1, %xmm0, %xmm0 + maxss %xmm1, %xmm0 + jmp .L999 + ALIGN_4 + +.L70: + movl MM, I + sarl $3, I + jle .L75 + ALIGN_4 + +.L71: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) +#endif + + movsd 0 * SIZE(XX), %xmm1 + addl INCX, XX + movhps 0 * SIZE(XX), %xmm1 + addl INCX, XX + 
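+ # next two strided complex elements fill %xmm2; shufps deinterleaves re/im, addps forms re + im (abs mask applied when USE_ABS is set), maxps (minps when USE_MIN) keeps the running extrema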
movsd 0 * SIZE(XX), %xmm2 + addl INCX, XX + movhps 0 * SIZE(XX), %xmm2 + addl INCX, XX + + movaps %xmm1, %xmm3 + + shufps $0x88, %xmm2, %xmm1 + shufps $0xdd, %xmm2, %xmm3 + +#ifdef USE_ABS + andps %xmm7, %xmm1 + andps %xmm7, %xmm3 +#endif + addps %xmm3, %xmm1 + maxps %xmm1, %xmm0 + + movsd 0 * SIZE(XX), %xmm1 + addl INCX, XX + movhps 0 * SIZE(XX), %xmm1 + addl INCX, XX + movsd 0 * SIZE(XX), %xmm2 + addl INCX, XX + movhps 0 * SIZE(XX), %xmm2 + addl INCX, XX + + movaps %xmm1, %xmm3 + + shufps $0x88, %xmm2, %xmm1 + shufps $0xdd, %xmm2, %xmm3 + +#ifdef USE_ABS + andps %xmm7, %xmm1 + andps %xmm7, %xmm3 +#endif + addps %xmm3, %xmm1 + maxps %xmm1, %xmm0 + decl I + jg .L71 + ALIGN_4 + +.L75: + andl $7, MM + jle .L80 + + testl $4, MM + je .L76 + + movsd 0 * SIZE(XX), %xmm1 + addl INCX, XX + movhps 0 * SIZE(XX), %xmm1 + addl INCX, XX + movsd 0 * SIZE(XX), %xmm2 + addl INCX, XX + movhps 0 * SIZE(XX), %xmm2 + addl INCX, XX + + movaps %xmm1, %xmm3 + + shufps $0x88, %xmm2, %xmm1 + shufps $0xdd, %xmm2, %xmm3 + +#ifdef USE_ABS + andps %xmm7, %xmm1 + andps %xmm7, %xmm3 +#endif + addps %xmm3, %xmm1 + maxps %xmm1, %xmm0 + ALIGN_3 + +.L76: + testl $2, MM + je .L77 + + movss 0 * SIZE(XX), %xmm1 + movss 1 * SIZE(XX), %xmm2 + addl INCX, XX + movss 0 * SIZE(XX), %xmm3 + movss 1 * SIZE(XX), %xmm4 + addl INCX, XX + +#ifdef USE_ABS + andps %xmm7, %xmm1 + andps %xmm7, %xmm2 + andps %xmm7, %xmm3 + andps %xmm7, %xmm4 +#endif + addps %xmm2, %xmm1 + addps %xmm4, %xmm3 + maxss %xmm1, %xmm0 + maxss %xmm3, %xmm0 + ALIGN_3 + +.L77: + testl $1, MM + je .L80 + + movss 0 * SIZE(XX), %xmm1 + movss 1 * SIZE(XX), %xmm2 +#ifdef USE_ABS + andps %xmm7, %xmm1 + andps %xmm7, %xmm2 +#endif + addps %xmm2, %xmm1 + maxss %xmm1, %xmm0 + ALIGN_4 + +.L80: + movaps %xmm0, %xmm1 + movhlps %xmm0, %xmm0 + maxps %xmm1, %xmm0 + movaps %xmm0, %xmm1 + shufps $1, %xmm0, %xmm0 + maxss %xmm1, %xmm0 + ALIGN_4 + +.L999: + subl $8, %esp + movss %xmm0, (%esp) + flds (%esp) + addl $8, %esp + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/zamax_sse2.S b/kernel/x86/zamax_sse2.S new file mode 100644 index 0000000000..50adffbecf --- /dev/null +++ b/kernel/x86/zamax_sse2.S @@ -0,0 +1,373 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) + +#define RET %eax +#define M %ebx +#define X %ecx +#define INCX %edx +#define I %esi +#define MM %ebp +#define XX %edi +#define TEMP %ebx + +#ifdef USE_MIN +#define maxpd minpd +#define maxsd minsd +#endif + +#include "l1param.h" + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + + pxor %xmm0, %xmm0 + pxor %xmm7, %xmm7 + xor RET, RET + testl M, M + jle .L999 + testl INCX, INCX + jle .L999 + + sall $ZBASE_SHIFT, INCX + movl M, MM + movl X, XX + + cmpeqpd %xmm7, %xmm7 + psrlq $1, %xmm7 + + movsd 0 * SIZE(XX), %xmm0 + movsd 1 * SIZE(XX), %xmm1 + addl INCX, XX + decl MM + andpd %xmm7, %xmm0 + andpd %xmm7, %xmm1 + addpd %xmm1, %xmm0 + unpcklpd %xmm0, %xmm0 + cmpl $2 * SIZE, INCX + jne .L60 + + movl MM, I + sarl $3, I + jle .L25 + ALIGN_4 + +.L21: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) +#endif + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + movhpd 2 * SIZE(XX), %xmm1 + movhpd 3 * SIZE(XX), %xmm2 + + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + addpd %xmm2, %xmm1 + maxpd %xmm1, %xmm0 + + movsd 4 * SIZE(XX), %xmm3 + movsd 5 * SIZE(XX), %xmm4 + movhpd 6 * SIZE(XX), %xmm3 + movhpd 7 * SIZE(XX), %xmm4 + + andpd %xmm7, %xmm3 + andpd %xmm7, %xmm4 + addpd %xmm4, %xmm3 + maxpd %xmm3, %xmm0 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(XX) +#endif + + movsd 8 * SIZE(XX), %xmm1 + movsd 9 * SIZE(XX), %xmm2 + movhpd 10 * SIZE(XX), %xmm1 + movhpd 11 * SIZE(XX), %xmm2 + + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + addpd %xmm2, %xmm1 + maxpd %xmm1, %xmm0 + + movsd 12 * SIZE(XX), %xmm3 + movsd 13 * SIZE(XX), %xmm4 + movhpd 14 * SIZE(XX), %xmm3 + movhpd 15 * SIZE(XX), %xmm4 + + andpd %xmm7, %xmm3 + andpd %xmm7, %xmm4 + addpd %xmm4, %xmm3 + maxpd %xmm3, %xmm0 + + addl $16 * SIZE, XX + decl I + jg .L21 + ALIGN_4 + +.L25: + andl $7, MM + jle .L30 + + testl $4, MM + je .L26 + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + movhpd 2 * SIZE(XX), %xmm1 + movhpd 3 * SIZE(XX), %xmm2 + + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + addpd %xmm2, %xmm1 + maxpd %xmm1, %xmm0 + + movsd 4 * SIZE(XX), %xmm3 + movsd 5 * SIZE(XX), %xmm4 + movhpd 6 * SIZE(XX), %xmm3 + movhpd 7 * SIZE(XX), %xmm4 + + andpd %xmm7, %xmm3 + andpd %xmm7, %xmm4 + addpd %xmm4, %xmm3 + maxpd %xmm3, %xmm0 + addl $8 * SIZE, XX + ALIGN_3 + +.L26: + testl $2, MM + je .L27 + + 
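+ # unit-stride main loop: 8 complex elements per pass; andpd with the abs mask in %xmm7 gives |re| and |im|, addpd forms |re|+|im|, maxpd (minpd when USE_MIN) keeps two running extrema in %xmm0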
movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + movhpd 2 * SIZE(XX), %xmm1 + movhpd 3 * SIZE(XX), %xmm2 + + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + addpd %xmm2, %xmm1 + maxpd %xmm1, %xmm0 + + addl $4 * SIZE, XX + ALIGN_3 + +.L27: + testl $1, MM + je .L30 + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + addpd %xmm2, %xmm1 + maxsd %xmm1, %xmm0 + ALIGN_4 + +.L30: + movapd %xmm0, %xmm1 + unpckhpd %xmm0, %xmm0 + maxsd %xmm1, %xmm0 + jmp .L999 + ALIGN_3 + +.L60: + movl MM, I + sarl $3, I + jle .L65 + ALIGN_4 + +.L61: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) +#endif + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm1 + movhpd 1 * SIZE(XX), %xmm2 + addl INCX, XX + + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + addpd %xmm2, %xmm1 + maxpd %xmm1, %xmm0 + + movsd 0 * SIZE(XX), %xmm3 + movsd 1 * SIZE(XX), %xmm4 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm3 + movhpd 1 * SIZE(XX), %xmm4 + addl INCX, XX + + andpd %xmm7, %xmm3 + andpd %xmm7, %xmm4 + addpd %xmm4, %xmm3 + maxpd %xmm3, %xmm0 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX) +#endif + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm1 + movhpd 1 * SIZE(XX), %xmm2 + addl INCX, XX + + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + addpd %xmm2, %xmm1 + maxpd %xmm1, %xmm0 + + movsd 0 * SIZE(XX), %xmm3 + movsd 1 * SIZE(XX), %xmm4 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm3 + movhpd 1 * SIZE(XX), %xmm4 + addl INCX, XX + + andpd %xmm7, %xmm3 + andpd %xmm7, %xmm4 + addpd %xmm4, %xmm3 + maxpd %xmm3, %xmm0 + + decl I + jg .L61 + ALIGN_4 + +.L65: + andl $7, MM + jle .L70 + + testl $4, MM + je .L66 + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm1 + movhpd 1 * SIZE(XX), %xmm2 + addl INCX, XX + + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + addpd %xmm2, %xmm1 + maxpd %xmm1, %xmm0 + + movsd 0 * SIZE(XX), %xmm3 + movsd 1 * SIZE(XX), %xmm4 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm3 + movhpd 1 * SIZE(XX), %xmm4 + addl INCX, XX + + andpd %xmm7, %xmm3 + andpd %xmm7, %xmm4 + addpd %xmm4, %xmm3 + maxpd %xmm3, %xmm0 + ALIGN_3 + +.L66: + testl $2, MM + je .L67 + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + addl INCX, XX + movhpd 0 * SIZE(XX), %xmm1 + movhpd 1 * SIZE(XX), %xmm2 + addl INCX, XX + + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + addpd %xmm2, %xmm1 + maxpd %xmm1, %xmm0 + ALIGN_3 + +.L67: + testl $1, MM + je .L70 + + movsd 0 * SIZE(XX), %xmm1 + movsd 1 * SIZE(XX), %xmm2 + andpd %xmm7, %xmm1 + andpd %xmm7, %xmm2 + addpd %xmm2, %xmm1 + maxsd %xmm1, %xmm0 + ALIGN_3 + +.L70: + movapd %xmm0, %xmm1 + unpckhpd %xmm0, %xmm0 + maxsd %xmm1, %xmm0 + ALIGN_4 + +.L999: + subl $8, %esp + movsd %xmm0, (%esp) + fldl (%esp) + addl $8, %esp + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/zasum.S b/kernel/x86/zasum.S new file mode 100644 index 0000000000..84b8f60cf4 --- /dev/null +++ b/kernel/x86/zasum.S @@ -0,0 +1,228 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 8 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) + +#define M %edx +#define X %ecx +#define INCX %esi + +#define I %eax + +#include "l1param.h" + + PROLOGUE + + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + +#ifdef F_INTERFACE + movl (M), M + movl (INCX), INCX +#endif + + fldz + testl M, M + jle .L999 + testl INCX, INCX + jle .L999 + + sall $ZBASE_SHIFT, INCX + + fldz + fldz + fldz + cmpl $SIZE * 2, INCX + jne .L40 + + movl M, I + sarl $2, I + jle .L20 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + FLD 2 * SIZE(X) + fabs + FLD 3 * SIZE(X) + fabs + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD 4 * SIZE(X) + fabs + FLD 5 * SIZE(X) + fabs + FLD 6 * SIZE(X) + fabs + FLD 7 * SIZE(X) + fabs + + addl $8 * SIZE, X + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decl I + jg .L10 + ALIGN_4 + +.L20: + movl M, I + andl $3, I + jle .L998 + ALIGN_4 + + +.L21: + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + faddp %st,%st(3) + faddp %st,%st(1) + addl $2 * SIZE, X + decl I + jg .L21 + jmp .L998 + ALIGN_4 + +.L40: + movl M, I + sarl $2, I + jle .L60 + ALIGN_4 + +.L50: + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + addl INCX, X + fabs + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + addl INCX, X + fabs + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + addl INCX, X + fabs + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + addl INCX, X + fabs + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + 
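+ # four partial sums are kept on the x87 stack and folded into one total at .L998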
decl I + jg .L50 + ALIGN_4 + +.L60: + movl M, I + andl $3, I + jle .L998 + ALIGN_4 + + +.L61: + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + addl INCX, X + fabs + faddp %st,%st(3) + faddp %st,%st(1) + decl I + jg .L61 + ALIGN_4 + +.L998: + faddp %st,%st(2) + faddp %st,%st(1) + faddp %st,%st(1) + ALIGN_4 + +.L999: + popl %ebx + popl %esi + ret + + EPILOGUE diff --git a/kernel/x86/zasum_sse.S b/kernel/x86/zasum_sse.S new file mode 100644 index 0000000000..ff8230c518 --- /dev/null +++ b/kernel/x86/zasum_sse.S @@ -0,0 +1,341 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 8 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) + +#define I %eax +#define M %ecx +#define X %esi +#define INCX %ebx + +#include "l1param.h" + + PROLOGUE + PROFCODE + + pushl %esi + pushl %ebx + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + + testl M, M + jle .L999 + testl INCX, INCX + jle .L999 + +#ifdef HAVE_SSE2 + pcmpeqb %xmm3, %xmm3 + psrld $1, %xmm3 +#else + movl $0x7fffffff, STACK_M + movss STACK_M, %xmm3 + shufps $0, %xmm3, %xmm3 +#endif + + sall $ZBASE_SHIFT, INCX + + cmpl $2 * SIZE, INCX + jne .L100 + + subl $-32 * SIZE, X + addl M, M + + cmpl $3, M + jle .L18 + + testl $4, X + je .L05 + movss -32 * SIZE(X), %xmm0 + andps %xmm3, %xmm0 + addl $SIZE, X + decl M + jle .L999 + ALIGN_3 + +.L05: + testl $8, X + je .L10 + + movsd -32 * SIZE(X), %xmm1 + andps %xmm3, %xmm1 + addl $2 * SIZE, X + subl $2, M + jle .L999 + ALIGN_3 + +.L10: + movl M, I + sarl $5, I + jle .L14 + + movaps -32 * SIZE(X), %xmm4 + movaps -28 * SIZE(X), %xmm5 + movaps -24 * SIZE(X), %xmm6 + movaps -20 * SIZE(X), %xmm7 + + decl I + jle .L12 + ALIGN_3 + +.L11: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + andps %xmm3, %xmm4 + addps %xmm4, %xmm0 + movaps -16 * SIZE(X), %xmm4 + + andps %xmm3, %xmm5 + addps %xmm5, %xmm1 + movaps -12 * SIZE(X), %xmm5 + + andps %xmm3, %xmm6 + addps %xmm6, %xmm0 + movaps -8 * SIZE(X), %xmm6 + + andps %xmm3, %xmm7 + addps %xmm7, %xmm1 + movaps -4 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + andps %xmm3, %xmm4 + addps %xmm4, %xmm0 + movaps 0 * SIZE(X), %xmm4 + + andps %xmm3, %xmm5 + addps %xmm5, %xmm1 + movaps 4 * SIZE(X), %xmm5 + + andps %xmm3, %xmm6 + addps %xmm6, %xmm0 + movaps 8 * SIZE(X), %xmm6 + + andps %xmm3, %xmm7 + addps %xmm7, %xmm1 + movaps 12 * SIZE(X), %xmm7 + + subl $-32 * SIZE, X + decl I + jg .L11 + ALIGN_3 + +.L12: + andps %xmm3, %xmm4 + addps %xmm4, %xmm0 + movaps -16 * SIZE(X), %xmm4 + + andps %xmm3, %xmm5 + addps %xmm5, %xmm1 + movaps -12 * SIZE(X), %xmm5 + + andps %xmm3, %xmm6 + addps %xmm6, %xmm0 + movaps -8 * SIZE(X), %xmm6 + + andps %xmm3, %xmm7 + addps %xmm7, %xmm1 + movaps -4 * SIZE(X), %xmm7 + + andps %xmm3, %xmm4 + addps %xmm4, %xmm0 + andps %xmm3, %xmm5 + addps %xmm5, %xmm1 + + andps %xmm3, %xmm6 + addps %xmm6, %xmm0 + andps %xmm3, %xmm7 + addps %xmm7, %xmm1 + + addl $32 * SIZE, X + ALIGN_3 + +.L14: + testl $16, M + je .L16 + + movaps -32 * SIZE(X), %xmm4 + andps %xmm3, %xmm4 + addps %xmm4, %xmm0 + + movaps -28 * SIZE(X), %xmm5 + andps %xmm3, %xmm5 + addps %xmm5, %xmm1 + + movaps -24 * SIZE(X), %xmm6 + andps %xmm3, %xmm6 + addps %xmm6, %xmm0 + + movaps -20 * SIZE(X), %xmm7 + andps %xmm3, %xmm7 + addps %xmm7, %xmm1 + + addl $16 * SIZE, X + ALIGN_3 + +.L16: + testl $8, M + je .L17 + + movaps -32 * SIZE(X), %xmm4 + andps %xmm3, %xmm4 + addps %xmm4, %xmm0 + + movaps -28 * SIZE(X), %xmm5 + andps %xmm3, %xmm5 + addps %xmm5, %xmm1 + + addl $8 * SIZE, X + ALIGN_3 + +.L17: + testl $4, M + je .L18 + + movaps -32 * SIZE(X), %xmm4 + andps %xmm3, %xmm4 + addps %xmm4, %xmm0 + addl $4 * SIZE, X + ALIGN_3 + +.L18: + testl $2, M + je .L19 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X), %xmm4 + andps %xmm3, %xmm4 + addps %xmm4, %xmm1 + addl $2 * SIZE, X + ALIGN_3 + +.L19: + 
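+ # tail: at most one single float left (M was doubled above so it counts real and imaginary parts separately)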
testl $1, M + je .L999 + + movss -32 * SIZE(X), %xmm4 + andps %xmm3, %xmm4 + addps %xmm4, %xmm0 + jmp .L999 + ALIGN_4 + +.L100: + movl M, I + sarl $2, I + jle .L105 + ALIGN_4 + +.L101: + movsd (X), %xmm4 + addl INCX, X + movhps (X), %xmm4 + addl INCX, X + + andps %xmm3, %xmm4 + addps %xmm4, %xmm0 + + movsd (X), %xmm5 + addl INCX, X + movhps (X), %xmm5 + addl INCX, X + + andps %xmm3, %xmm5 + addps %xmm5, %xmm1 + + decl I + jg .L101 + ALIGN_4 + +.L105: +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + andl $3, M + jle .L999 + ALIGN_4 + +.L106: + movsd (X), %xmm4 + andps %xmm3, %xmm4 + addps %xmm4, %xmm0 + addl INCX, X + decl M + jg .L106 + ALIGN_4 + +.L999: + addps %xmm1, %xmm0 + +#ifndef HAVE_SSE3 + movhlps %xmm0, %xmm1 + addps %xmm1, %xmm0 + + movaps %xmm0, %xmm1 + shufps $1, %xmm0, %xmm0 + addss %xmm1, %xmm0 +#else + haddps %xmm0, %xmm0 + haddps %xmm0, %xmm0 +#endif + + movss %xmm0, STACK_M + flds STACK_M + + popl %ebx + popl %esi + ret + + EPILOGUE diff --git a/kernel/x86/zasum_sse2.S b/kernel/x86/zasum_sse2.S new file mode 100644 index 0000000000..b7dbc1512a --- /dev/null +++ b/kernel/x86/zasum_sse2.S @@ -0,0 +1,320 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 8 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) + +#define I %eax +#define M %ecx +#define X %esi +#define INCX %ebx + +#define xmm8 xmm4 +#define xmm9 xmm5 +#define xmm10 xmm6 +#define xmm11 xmm7 + +#include "l1param.h" + + PROLOGUE + PROFCODE + + pushl %esi + pushl %ebx + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + + testl M, M + jle .L999 + testl INCX, INCX + jle .L999 + + pcmpeqb %xmm3, %xmm3 + psrlq $1, %xmm3 + + sall $ZBASE_SHIFT, INCX + + cmpl $2 * SIZE, INCX + jne .L40 + + subl $-16 * SIZE, X + addl M, M + + testl $SIZE, X + je .L05 + + movsd -16 * SIZE(X), %xmm0 + addl $SIZE, X + + andps %xmm3, %xmm0 + subl $1, M + jle .L999 + ALIGN_3 + +.L05: + movl M, I + sarl $4, I + jle .L20 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + movaps -12 * SIZE(X), %xmm6 + movaps -10 * SIZE(X), %xmm7 + + decl I + jle .L11 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + andps %xmm3, %xmm4 + addpd %xmm4, %xmm0 + movaps -8 * SIZE(X), %xmm4 + + andps %xmm3, %xmm5 + addpd %xmm5, %xmm1 + movaps -6 * SIZE(X), %xmm5 + + andps %xmm3, %xmm6 + addpd %xmm6, %xmm0 + movaps -4 * SIZE(X), %xmm6 + + andps %xmm3, %xmm7 + addpd %xmm7, %xmm1 + movaps -2 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + andps %xmm3, %xmm4 + addpd %xmm4, %xmm0 + movaps 0 * SIZE(X), %xmm4 + + andps %xmm3, %xmm5 + addpd %xmm5, %xmm1 + movaps 2 * SIZE(X), %xmm5 + + andps %xmm3, %xmm6 + addpd %xmm6, %xmm0 + movaps 4 * SIZE(X), %xmm6 + + andps %xmm3, %xmm7 + addpd %xmm7, %xmm1 + movaps 6 * SIZE(X), %xmm7 + + subl $-16 * SIZE, X + decl I + jg .L10 + ALIGN_4 + +.L11: + andps %xmm3, %xmm4 + addpd %xmm4, %xmm0 + movaps -8 * SIZE(X), %xmm4 + + andps %xmm3, %xmm5 + addpd %xmm5, %xmm1 + movaps -6 * SIZE(X), %xmm5 + + andps %xmm3, %xmm6 + addpd %xmm6, %xmm0 + movaps -4 * SIZE(X), %xmm6 + + andps %xmm3, %xmm7 + addpd %xmm7, %xmm1 + movaps -2 * SIZE(X), %xmm7 + + andps %xmm3, %xmm4 + addpd %xmm4, %xmm0 + andps %xmm3, %xmm5 + addpd %xmm5, %xmm1 + andps %xmm3, %xmm6 + addpd %xmm6, %xmm0 + andps %xmm3, %xmm7 + addpd %xmm7, %xmm1 + + subl $-16 * SIZE, X + ALIGN_3 + +.L20: + andl $15, M + jle .L999 + + testl $8, M + je .L21 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + movaps -12 * SIZE(X), %xmm6 + movaps -10 * SIZE(X), %xmm7 + + andps %xmm3, %xmm4 + addpd %xmm4, %xmm0 + andps %xmm3, %xmm5 + addpd %xmm5, %xmm1 + andps %xmm3, %xmm6 + addpd %xmm6, %xmm0 + andps %xmm3, %xmm7 + addpd %xmm7, %xmm1 + + addl $8 * SIZE, X + ALIGN_3 + +.L21: + testl $4, M + je .L22 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + + andps %xmm3, %xmm4 + addpd %xmm4, %xmm0 + andps %xmm3, %xmm5 + addpd %xmm5, %xmm1 + + addl $4 * SIZE, X + ALIGN_3 + +.L22: + testl $2, M + je .L23 + + movaps -16 * SIZE(X), %xmm4 + andps %xmm3, %xmm4 + addpd %xmm4, %xmm0 + addl $2 * SIZE, X + +.L23: + testl $1, M + je .L999 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -16 * SIZE(X), %xmm4 + andps %xmm3, %xmm4 + addsd %xmm4, %xmm0 + jmp .L999 + ALIGN_3 + +.L40: + movl M, I + sarl $2, I + jle .L60 + ALIGN_4 + +.L50: + movsd 0 * SIZE(X), %xmm4 + movhps 1 * SIZE(X), %xmm4 + addl INCX, X + andps %xmm3, %xmm4 + addpd %xmm4, %xmm0 + + movsd 0 * 
SIZE(X), %xmm5 + movhps 1 * SIZE(X), %xmm5 + addl INCX, X + andps %xmm3, %xmm5 + addpd %xmm5, %xmm1 + + movsd 0 * SIZE(X), %xmm6 + movhps 1 * SIZE(X), %xmm6 + addl INCX, X + andps %xmm3, %xmm6 + addpd %xmm6, %xmm0 + + movsd 0 * SIZE(X), %xmm7 + movhps 1 * SIZE(X), %xmm7 + addl INCX, X + andps %xmm3, %xmm7 + addpd %xmm7, %xmm1 + + decl I + jg .L50 + ALIGN_4 + +.L60: + andl $3, M + jle .L999 + ALIGN_4 + + +.L61: + movsd 0 * SIZE(X), %xmm4 + movhps 1 * SIZE(X), %xmm4 + andps %xmm3, %xmm4 + addpd %xmm4, %xmm0 + addl INCX, X + decl M + jg .L61 + ALIGN_4 + +.L999: + addpd %xmm1, %xmm0 + +#ifndef HAVE_SSE3 + movaps %xmm0, %xmm1 + unpckhpd %xmm0, %xmm0 + addsd %xmm1, %xmm0 +#else + haddpd %xmm0, %xmm0 +#endif + + movsd %xmm0, STACK_M + fldl STACK_M + + popl %ebx + popl %esi + ret + + EPILOGUE diff --git a/kernel/x86/zaxpy.S b/kernel/x86/zaxpy.S new file mode 100644 index 0000000000..0894f5dc64 --- /dev/null +++ b/kernel/x86/zaxpy.S @@ -0,0 +1,348 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#ifdef DOUBLE +#define STACK_ALPHA_R 16 + STACK + ARGS(%esp) +#define STACK_ALPHA_I 24 + STACK + ARGS(%esp) +#define STACK_X 32 + STACK + ARGS(%esp) +#define STACK_INCX 36 + STACK + ARGS(%esp) +#define STACK_Y 40 + STACK + ARGS(%esp) +#define STACK_INCY 44 + STACK + ARGS(%esp) +#else +#define STACK_ALPHA_R 16 + STACK + ARGS(%esp) +#define STACK_ALPHA_I 20 + STACK + ARGS(%esp) +#define STACK_X 24 + STACK + ARGS(%esp) +#define STACK_INCX 28 + STACK + ARGS(%esp) +#define STACK_Y 32 + STACK + ARGS(%esp) +#define STACK_INCY 36 + STACK + ARGS(%esp) +#endif + +#define M %ebx +#define X %esi +#define INCX %ecx +#define Y %edi +#define INCY %edx + +#ifndef CONJ +#define ADD1 fsubrp +#define ADD2 faddp +#else +#define ADD1 faddp +#define ADD2 fsubrp +#endif + + PROLOGUE + + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + FLD STACK_ALPHA_I + FLD STACK_ALPHA_R + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + + addl INCX, INCX + addl INCY, INCY + + leal (, INCX, SIZE), INCX + leal (, INCY, SIZE), INCY + + testl M, M + jle .L40 + + cmpl $2 * SIZE, INCX + jne .L14 + cmpl $2 * SIZE, INCY + jne .L14 + + movl M, %eax + sarl $2, %eax + jle .L15 + ALIGN_3 + +.L16: + FLD 0 * SIZE(X) + fmul %st(1), %st + FLD 1 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FADD 0 * SIZE(Y) + FST 0 * SIZE(Y) + + FLD 0 * SIZE(X) + fmul %st(2), %st + FLD 1 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FADD 1 * SIZE(Y) + FST 1 * SIZE(Y) + + FLD 2 * SIZE(X) + fmul %st(1), %st + FLD 3 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FADD 2 * SIZE(Y) + FST 2 * SIZE(Y) + + FLD 2 * SIZE(X) + fmul %st(2), %st + FLD 3 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FADD 3 * SIZE(Y) + FST 3 * SIZE(Y) + + FLD 4 * SIZE(X) + fmul %st(1), %st + FLD 5 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FADD 4 * SIZE(Y) + FST 4 * SIZE(Y) + + FLD 4 * SIZE(X) + fmul %st(2), %st + FLD 5 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FADD 5 * SIZE(Y) + FST 5 * SIZE(Y) + + FLD 6 * SIZE(X) + fmul %st(1), %st + FLD 7 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FADD 6 * SIZE(Y) + FST 6 * SIZE(Y) + + FLD 6 * SIZE(X) + fmul %st(2), %st + FLD 7 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FADD 7 * SIZE(Y) + FST 7 * SIZE(Y) + +#ifdef HAVE_3DNOW + prefetch 20 * SIZE(X) + prefetchw 20 * SIZE(Y) +#endif + + addl $8 * SIZE, X + addl $8 * SIZE, Y + decl %eax + jg .L16 + ALIGN_3 + +.L15: + movl M, %eax + andl $3, %eax + jle .L40 + ALIGN_3 + +.L22: + FLD 0 * SIZE(X) + fmul %st(1), %st + FLD 1 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FADD 0 * SIZE(Y) + FST 0 * SIZE(Y) + + FLD 0 * SIZE(X) + fmul %st(2), %st + FLD 1 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FADD 1 * SIZE(Y) + FST 1 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + decl %eax + jg .L22 + jmp .L40 + ALIGN_3 + +.L14: + movl M, %eax + sarl $2, %eax + jle .L28 + ALIGN_3 + +.L29: + FLD 0 * SIZE(X) + fmul %st(1), %st + FLD 1 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FADD 0 * SIZE(Y) + FST 0 * SIZE(Y) + + FLD 0 * SIZE(X) + fmul %st(2), %st + FLD 1 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FADD 1 * SIZE(Y) + FST 1 * SIZE(Y) + + addl INCX, X + addl INCY, Y + + FLD 0 * SIZE(X) + fmul %st(1), %st + FLD 1 * SIZE(X) + fmul 
%st(3), %st + ADD1 %st, %st(1) + FADD 0 * SIZE(Y) + FST 0 * SIZE(Y) + + FLD 0 * SIZE(X) + fmul %st(2), %st + FLD 1 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FADD 1 * SIZE(Y) + FST 1 * SIZE(Y) + + addl INCX, X + addl INCY, Y + + FLD 0 * SIZE(X) + fmul %st(1), %st + FLD 1 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FADD 0 * SIZE(Y) + FST 0 * SIZE(Y) + + FLD 0 * SIZE(X) + fmul %st(2), %st + FLD 1 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FADD 1 * SIZE(Y) + FST 1 * SIZE(Y) + + addl INCX, X + addl INCY, Y + + FLD 0 * SIZE(X) + fmul %st(1), %st + FLD 1 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FADD 0 * SIZE(Y) + FST 0 * SIZE(Y) + + FLD 0 * SIZE(X) + fmul %st(2), %st + FLD 1 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FADD 1 * SIZE(Y) + FST 1 * SIZE(Y) + + addl INCX, X + addl INCY, Y + + decl %eax + jg .L29 + ALIGN_3 + +.L28: + movl M, %eax + andl $3, %eax + jle .L40 + ALIGN_3 + +.L35: + FLD 0 * SIZE(X) + fmul %st(1), %st + FLD 1 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FADD 0 * SIZE(Y) + FST 0 * SIZE(Y) + + FLD 0 * SIZE(X) + fmul %st(2), %st + FLD 1 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FADD 1 * SIZE(Y) + FST 1 * SIZE(Y) + + addl INCX, X + addl INCY, Y + + decl %eax + jg .L35 + ALIGN_3 + +.L40: + ffreep %st(0) + ffreep %st(0) + xorl %eax,%eax + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86/zaxpy_sse.S b/kernel/x86/zaxpy_sse.S new file mode 100644 index 0000000000..edd9929cd4 --- /dev/null +++ b/kernel/x86/zaxpy_sse.S @@ -0,0 +1,3103 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_ALPHA_R 16 + STACK + ARGS(%esp) +#define STACK_ALPHA_I 20 + STACK + ARGS(%esp) +#define STACK_X 24 + STACK + ARGS(%esp) +#define STACK_INCX 28 + STACK + ARGS(%esp) +#define STACK_Y 32 + STACK + ARGS(%esp) +#define STACK_INCY 36 + STACK + ARGS(%esp) + +#define M %ebx +#define X %esi +#define INCX %ecx +#define Y %edi +#define INCY %edx +#define YY %ebp + +#define ALPHA_R %xmm6 +#define ALPHA_I %xmm7 + +#include "l1param.h" + + PROLOGUE + PROFCODE + + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp + + movl STACK_M, M + movss STACK_ALPHA_R, ALPHA_R + movss STACK_ALPHA_I, ALPHA_I + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + + sall $ZBASE_SHIFT, INCX + sall $ZBASE_SHIFT, INCY + + testl M, M + jle .L999 + + cmpl $2 * SIZE, INCX + jne .L100 + cmpl $2 * SIZE, INCY + jne .L100 + +#ifdef HAVE_SSE2 + pcmpeqb %xmm5, %xmm5 + psllq $63, %xmm5 +#else + movl $0x80000000, STACK_M + movss STACK_M, %xmm5 + shufps $0x11, %xmm5, %xmm5 +#endif + + shufps $0, ALPHA_R, ALPHA_R + shufps $0, ALPHA_I, ALPHA_I + +#ifndef CONJ + shufps $0xb1, %xmm5, %xmm5 + xorps %xmm5, ALPHA_I +#else + xorps %xmm5, ALPHA_R +#endif + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + + testl $2 * SIZE, Y + je .L10 + +#ifndef HAVE_SSE2 + xorps %xmm0, %xmm0 +#endif + movsd -32 * SIZE(X), %xmm0 +#ifndef HAVE_SSE2 + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(Y), %xmm1 + + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + + addps %xmm5, %xmm0 + addps %xmm1, %xmm0 + + movlps %xmm0, -32 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + decl M + jle .L999 + ALIGN_2 + +.L10: + testl $SIZE, Y + jne .L50 + + testl $3 * SIZE, X + jne .L20 + + movl M, %eax + sarl $4, %eax + jle .L15 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + movaps -20 * SIZE(X), %xmm3 + + decl %eax + jle .L12 + ALIGN_3 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps -32 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -16 * SIZE(X), %xmm0 + + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps -28 * SIZE(Y), %xmm1 + addps %xmm5, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps -12 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps -24 * SIZE(Y), %xmm2 + addps %xmm5, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps -8 * SIZE(X), %xmm2 + + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps -20 * SIZE(Y), %xmm3 + addps %xmm5, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps -4 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps -16 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps 0 * SIZE(X), %xmm0 + + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps -12 * SIZE(Y), %xmm1 + addps %xmm5, %xmm1 + movaps %xmm1, -12 * SIZE(Y) + movaps 4 * SIZE(X), %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 
64) - PREOFFSET(X) +#endif + + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps -8 * SIZE(Y), %xmm2 + addps %xmm5, %xmm2 + movaps %xmm2, -8 * SIZE(Y) + movaps 8 * SIZE(X), %xmm2 + + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps -4 * SIZE(Y), %xmm3 + addps %xmm5, %xmm3 + movaps %xmm3, -4 * SIZE(Y) + movaps 12 * SIZE(X), %xmm3 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + decl %eax + jg .L11 + ALIGN_3 + +.L12: + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps -32 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -16 * SIZE(X), %xmm0 + + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps -28 * SIZE(Y), %xmm1 + addps %xmm5, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps -12 * SIZE(X), %xmm1 + + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps -24 * SIZE(Y), %xmm2 + addps %xmm5, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps -8 * SIZE(X), %xmm2 + + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps -20 * SIZE(Y), %xmm3 + addps %xmm5, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps -4 * SIZE(X), %xmm3 + + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps -16 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps -12 * SIZE(Y), %xmm1 + addps %xmm5, %xmm1 + movaps %xmm1, -12 * SIZE(Y) + + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps -8 * SIZE(Y), %xmm2 + addps %xmm5, %xmm2 + movaps %xmm2, -8 * SIZE(Y) + + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps -4 * SIZE(Y), %xmm3 + addps %xmm5, %xmm3 + movaps %xmm3, -4 * SIZE(Y) + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L15: + testl $8, M + jle .L16 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps -32 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps -28 * SIZE(Y), %xmm1 + addps %xmm5, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -24 * SIZE(X), %xmm2 + movaps -20 * SIZE(X), %xmm3 + + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps -24 * SIZE(Y), %xmm2 + addps %xmm5, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps -20 * SIZE(Y), %xmm3 + addps %xmm5, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_2 + +.L16: + testl $4, M + jle .L17 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps -32 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps -28 * SIZE(Y), %xmm1 + addps %xmm5, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_2 + +.L17: + testl $2, M + jle .L18 + + movaps -32 * SIZE(X), %xmm0 + + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + + addps -32 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_2 + +.L18: + testl $1, M + jle .L999 + 
+#ifndef HAVE_SSE2 + xorps %xmm0, %xmm0 +#endif + movsd -32 * SIZE(X), %xmm0 + + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + +#ifndef HAVE_SSE2 + xorps %xmm1, %xmm1 + movlps -32 * SIZE(Y), %xmm1 +#else + movsd -32 * SIZE(Y), %xmm1 +#endif + addps %xmm1, %xmm0 + addps %xmm5, %xmm0 + movlps %xmm0, -32 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L20: +#ifdef ALIGNED_ACCESS + + testl $2 * SIZE, X + jne .L30 + + subl $1 * SIZE, X + + movaps -32 * SIZE(X), %xmm0 + + movl M, %eax + sarl $4, %eax + jle .L25 + + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + movaps -20 * SIZE(X), %xmm3 + + decl %eax + jle .L22 + ALIGN_3 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps -32 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -16 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps -28 * SIZE(Y), %xmm1 + addps %xmm5, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps -12 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm3, %xmm2 + SHUFPS_39 %xmm2, %xmm2 + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps -24 * SIZE(Y), %xmm2 + addps %xmm5, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps -8 * SIZE(X), %xmm2 + + movss %xmm0, %xmm3 + SHUFPS_39 %xmm3, %xmm3 + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps -20 * SIZE(Y), %xmm3 + addps %xmm5, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps -4 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps -16 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps 0 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps -12 * SIZE(Y), %xmm1 + addps %xmm5, %xmm1 + movaps %xmm1, -12 * SIZE(Y) + movaps 4 * SIZE(X), %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm3, %xmm2 + SHUFPS_39 %xmm2, %xmm2 + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps -8 * SIZE(Y), %xmm2 + addps %xmm5, %xmm2 + movaps %xmm2, -8 * SIZE(Y) + movaps 8 * SIZE(X), %xmm2 + + movss %xmm0, %xmm3 + SHUFPS_39 %xmm3, %xmm3 + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps -4 * SIZE(Y), %xmm3 + addps %xmm5, %xmm3 + movaps %xmm3, -4 * SIZE(Y) + movaps 12 * SIZE(X), %xmm3 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + decl %eax + jg .L21 + ALIGN_3 + +.L22: + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps -32 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -16 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps -28 * SIZE(Y), %xmm1 + addps %xmm5, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps -12 * SIZE(X), %xmm1 + + movss %xmm3, %xmm2 + SHUFPS_39 %xmm2, %xmm2 + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, 
%xmm5 + addps -24 * SIZE(Y), %xmm2 + addps %xmm5, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps -8 * SIZE(X), %xmm2 + + movss %xmm0, %xmm3 + SHUFPS_39 %xmm3, %xmm3 + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps -20 * SIZE(Y), %xmm3 + addps %xmm5, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps -4 * SIZE(X), %xmm3 + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps -16 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps 0 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps -12 * SIZE(Y), %xmm1 + addps %xmm5, %xmm1 + movaps %xmm1, -12 * SIZE(Y) + + movss %xmm3, %xmm2 + SHUFPS_39 %xmm2, %xmm2 + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps -8 * SIZE(Y), %xmm2 + addps %xmm5, %xmm2 + movaps %xmm2, -8 * SIZE(Y) + + movss %xmm0, %xmm3 + SHUFPS_39 %xmm3, %xmm3 + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps -4 * SIZE(Y), %xmm3 + addps %xmm5, %xmm3 + movaps %xmm3, -4 * SIZE(Y) + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L25: + testl $8, M + jle .L26 + + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps -32 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps -28 * SIZE(Y), %xmm1 + addps %xmm5, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -20 * SIZE(X), %xmm3 + movaps -16 * SIZE(X), %xmm0 + + movss %xmm3, %xmm2 + SHUFPS_39 %xmm2, %xmm2 + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps -24 * SIZE(Y), %xmm2 + addps %xmm5, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm0, %xmm3 + SHUFPS_39 %xmm3, %xmm3 + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps -20 * SIZE(Y), %xmm3 + addps %xmm5, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_2 + +.L26: + testl $4, M + jle .L27 + + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps -32 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps -28 * SIZE(Y), %xmm1 + addps %xmm5, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps %xmm2, %xmm0 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_2 + +.L27: + testl $2, M + jle .L28 + + movaps -28 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + + addps -32 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movaps %xmm1, %xmm0 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_2 + +.L28: + testl $1, M + jle .L999 + + PSHUFD2($0x06, %xmm0, %xmm5) + PSHUFD2($0x09, %xmm0, %xmm0) + + mulps ALPHA_I, %xmm5 + mulps ALPHA_R, %xmm0 + +#ifndef HAVE_SSE2 + xorps %xmm1, %xmm1 + movlps -32 * SIZE(Y), %xmm1 +#else + movsd -32 * SIZE(Y), %xmm1 +#endif + addps %xmm1, %xmm0 + addps %xmm5, %xmm0 + + movlps %xmm0, -32 * SIZE(Y) + + 
jmp .L999 + ALIGN_3 + +.L30: + testl $1 * SIZE, X + jne .L40 +#endif + + movl M, %eax + sarl $4, %eax + jle .L35 + + movsd -32 * SIZE(X), %xmm0 + movhps -30 * SIZE(X), %xmm0 + movsd -28 * SIZE(X), %xmm1 + movhps -26 * SIZE(X), %xmm1 + movsd -24 * SIZE(X), %xmm2 + movhps -22 * SIZE(X), %xmm2 + movsd -20 * SIZE(X), %xmm3 + movhps -18 * SIZE(X), %xmm3 + + decl %eax + jle .L32 + ALIGN_3 + +.L31: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps -32 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movsd -16 * SIZE(X), %xmm0 + movhps -14 * SIZE(X), %xmm0 + + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps -28 * SIZE(Y), %xmm1 + addps %xmm5, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movsd -12 * SIZE(X), %xmm1 + movhps -10 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps -24 * SIZE(Y), %xmm2 + addps %xmm5, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movsd -8 * SIZE(X), %xmm2 + movhps -6 * SIZE(X), %xmm2 + + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps -20 * SIZE(Y), %xmm3 + addps %xmm5, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movsd -4 * SIZE(X), %xmm3 + movhps -2 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps -16 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movsd 0 * SIZE(X), %xmm0 + movhps 2 * SIZE(X), %xmm0 + + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps -12 * SIZE(Y), %xmm1 + addps %xmm5, %xmm1 + movaps %xmm1, -12 * SIZE(Y) + movsd 4 * SIZE(X), %xmm1 + movhps 6 * SIZE(X), %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps -8 * SIZE(Y), %xmm2 + addps %xmm5, %xmm2 + movaps %xmm2, -8 * SIZE(Y) + movsd 8 * SIZE(X), %xmm2 + movhps 10 * SIZE(X), %xmm2 + + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps -4 * SIZE(Y), %xmm3 + addps %xmm5, %xmm3 + movaps %xmm3, -4 * SIZE(Y) + movsd 12 * SIZE(X), %xmm3 + movhps 14 * SIZE(X), %xmm3 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + decl %eax + jg .L31 + ALIGN_3 + +.L32: + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps -32 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movsd -16 * SIZE(X), %xmm0 + movhps -14 * SIZE(X), %xmm0 + + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps -28 * SIZE(Y), %xmm1 + addps %xmm5, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movsd -12 * SIZE(X), %xmm1 + movhps -10 * SIZE(X), %xmm1 + + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps -24 * SIZE(Y), %xmm2 + addps %xmm5, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movsd -8 * SIZE(X), %xmm2 + movhps -6 * SIZE(X), %xmm2 + + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps -20 * SIZE(Y), %xmm3 + addps %xmm5, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movsd -4 * SIZE(X), %xmm3 + movhps -2 * SIZE(X), %xmm3 + + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps -16 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + 
movaps %xmm0, -16 * SIZE(Y) + + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps -12 * SIZE(Y), %xmm1 + addps %xmm5, %xmm1 + movaps %xmm1, -12 * SIZE(Y) + + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps -8 * SIZE(Y), %xmm2 + addps %xmm5, %xmm2 + movaps %xmm2, -8 * SIZE(Y) + + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps -4 * SIZE(Y), %xmm3 + addps %xmm5, %xmm3 + movaps %xmm3, -4 * SIZE(Y) + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L35: + testl $8, M + jle .L36 + + movsd -32 * SIZE(X), %xmm0 + movhps -30 * SIZE(X), %xmm0 + movsd -28 * SIZE(X), %xmm1 + movhps -26 * SIZE(X), %xmm1 + + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps -32 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps -28 * SIZE(Y), %xmm1 + addps %xmm5, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movsd -24 * SIZE(X), %xmm2 + movhps -22 * SIZE(X), %xmm2 + movsd -20 * SIZE(X), %xmm3 + movhps -18 * SIZE(X), %xmm3 + + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps -24 * SIZE(Y), %xmm2 + addps %xmm5, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps -20 * SIZE(Y), %xmm3 + addps %xmm5, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_2 + +.L36: + testl $4, M + jle .L37 + + movsd -32 * SIZE(X), %xmm0 + movhps -30 * SIZE(X), %xmm0 + movsd -28 * SIZE(X), %xmm1 + movhps -26 * SIZE(X), %xmm1 + + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps -32 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps -28 * SIZE(Y), %xmm1 + addps %xmm5, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_2 + +.L37: + testl $2, M + jle .L38 + + movsd -32 * SIZE(X), %xmm0 + movhps -30 * SIZE(X), %xmm0 + + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + + addps -32 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_2 + +.L38: + testl $1, M + jle .L999 + + movsd -32 * SIZE(X), %xmm0 + + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + + addps -32 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movlps %xmm0, -32 * SIZE(Y) + jmp .L999 + ALIGN_3 + +#ifdef ALIGNED_ACCESS + +.L40: + subl $3 * SIZE, X + + movaps -32 * SIZE(X), %xmm0 + + movl M, %eax + sarl $4, %eax + jle .L45 + + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + movaps -20 * SIZE(X), %xmm3 + + decl %eax + jle .L42 + ALIGN_3 + +.L41: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps -32 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -16 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps -28 * SIZE(Y), %xmm1 + addps %xmm5, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps -12 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + 
PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps -24 * SIZE(Y), %xmm2 + addps %xmm5, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps -8 * SIZE(X), %xmm2 + + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps -20 * SIZE(Y), %xmm3 + addps %xmm5, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps -4 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps -16 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps 0 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps -12 * SIZE(Y), %xmm1 + addps %xmm5, %xmm1 + movaps %xmm1, -12 * SIZE(Y) + movaps 4 * SIZE(X), %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps -8 * SIZE(Y), %xmm2 + addps %xmm5, %xmm2 + movaps %xmm2, -8 * SIZE(Y) + movaps 8 * SIZE(X), %xmm2 + + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps -4 * SIZE(Y), %xmm3 + addps %xmm5, %xmm3 + movaps %xmm3, -4 * SIZE(Y) + movaps 12 * SIZE(X), %xmm3 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + decl %eax + jg .L41 + ALIGN_3 + +.L42: + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps -32 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -16 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps -28 * SIZE(Y), %xmm1 + addps %xmm5, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps -12 * SIZE(X), %xmm1 + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps -24 * SIZE(Y), %xmm2 + addps %xmm5, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps -8 * SIZE(X), %xmm2 + + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps -20 * SIZE(Y), %xmm3 + addps %xmm5, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps -4 * SIZE(X), %xmm3 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps -16 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps 0 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps -12 * SIZE(Y), %xmm1 + addps %xmm5, %xmm1 + movaps %xmm1, -12 * SIZE(Y) + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps -8 * SIZE(Y), %xmm2 + addps %xmm5, %xmm2 + movaps %xmm2, -8 * SIZE(Y) + + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps -4 * SIZE(Y), %xmm3 + addps %xmm5, %xmm3 + movaps %xmm3, -4 * SIZE(Y) + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L45: + testl $8, M + jle 
.L46 + + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps -32 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps -28 * SIZE(Y), %xmm1 + addps %xmm5, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -20 * SIZE(X), %xmm3 + movaps -16 * SIZE(X), %xmm0 + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps -24 * SIZE(Y), %xmm2 + addps %xmm5, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps -20 * SIZE(Y), %xmm3 + addps %xmm5, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_2 + +.L46: + testl $4, M + jle .L47 + + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps -32 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps -28 * SIZE(Y), %xmm1 + addps %xmm5, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps %xmm2, %xmm0 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_2 + +.L47: + testl $2, M + jle .L48 + + movaps -28 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + + addps -32 * SIZE(Y), %xmm0 + addps %xmm5, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movaps %xmm1, %xmm0 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_2 + +.L48: + testl $1, M + jle .L999 + + movaps -28 * SIZE(X), %xmm1 + movsd -32 * SIZE(Y), %xmm2 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + + addps %xmm5, %xmm0 + addps %xmm2, %xmm0 + movlps %xmm0, -32 * SIZE(Y) + + jmp .L999 + ALIGN_3 +#endif + +.L50: + xorps %xmm0, %xmm0 + + subl $1 * SIZE, Y + + testl $3 * SIZE, X + jne .L60 + + movl M, %eax + sarl $4, %eax + jle .L55 + + movaps -32 * SIZE(X), %xmm1 + movaps -28 * SIZE(X), %xmm2 + movaps -24 * SIZE(X), %xmm3 + + decl %eax + jle .L52 + ALIGN_3 + +.L51: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -20 * SIZE(X), %xmm0 + + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps -16 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps -12 * SIZE(X), %xmm2 + + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm0 + movss %xmm0, %xmm3 + shufps 
$0x93, %xmm0, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps -8 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps -4 * SIZE(X), %xmm0 + + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -12 * SIZE(Y), %xmm1 + movaps %xmm1, -12 * SIZE(Y) + movaps 0 * SIZE(X), %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -8 * SIZE(Y), %xmm2 + movaps %xmm2, -8 * SIZE(Y) + movaps 4 * SIZE(X), %xmm2 + + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm0 + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + addps -4 * SIZE(Y), %xmm3 + movaps %xmm3, -4 * SIZE(Y) + movaps 8 * SIZE(X), %xmm3 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + decl %eax + jg .L51 + ALIGN_3 + +.L52: + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -20 * SIZE(X), %xmm0 + + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps -16 * SIZE(X), %xmm1 + + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps -12 * SIZE(X), %xmm2 + + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm0 + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps -8 * SIZE(X), %xmm3 + + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps -4 * SIZE(X), %xmm0 + + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -12 * SIZE(Y), %xmm1 + movaps %xmm1, -12 * SIZE(Y) + + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -8 * SIZE(Y), %xmm2 + movaps %xmm2, -8 * SIZE(Y) + + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm0 + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + addps -4 * SIZE(Y), %xmm3 + movaps %xmm3, -4 * SIZE(Y) + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L55: + testl $8, M + jle .L56 + + movaps -32 * SIZE(X), %xmm1 + movaps -28 * SIZE(X), %xmm2 + + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + PSHUFD2($0xb1, %xmm2, %xmm5) + 
mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -24 * SIZE(X), %xmm3 + movaps -20 * SIZE(X), %xmm0 + + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm0 + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_2 + +.L56: + testl $4, M + jle .L57 + + movaps -32 * SIZE(X), %xmm1 + movaps -28 * SIZE(X), %xmm2 + + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps %xmm2, %xmm0 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_2 + +.L57: + testl $2, M + jle .L58 + + movaps -32 * SIZE(X), %xmm1 + + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, %xmm0 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_2 + +.L58: + testl $1, M + jle .L59 + +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(X), %xmm1 + + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L59: + shufps $0x93, %xmm0, %xmm0 + + addss -32 * SIZE(Y), %xmm0 + movss %xmm0, -32 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L60: +#ifdef ALIGNED_ACCESS + + testl $2 * SIZE, X + jne .L70 + + subl $1 * SIZE, X + + movaps -32 * SIZE(X), %xmm1 + + movl M, %eax + sarl $4, %eax + jle .L65 + + movaps -28 * SIZE(X), %xmm2 + movaps -24 * SIZE(X), %xmm3 + + decl %eax + jle .L62 + ALIGN_3 + +.L61: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -20 * SIZE(X), %xmm0 + + movss %xmm3, %xmm2 + SHUFPS_39 %xmm2, %xmm2 + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps -16 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm0, %xmm3 + SHUFPS_39 %xmm3, %xmm3 + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps -12 * SIZE(X), %xmm2 + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm0 + movss %xmm0, %xmm3 + shufps 
$0x93, %xmm0, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps -8 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps -4 * SIZE(X), %xmm0 + + movss %xmm3, %xmm2 + SHUFPS_39 %xmm2, %xmm2 + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -12 * SIZE(Y), %xmm1 + movaps %xmm1, -12 * SIZE(Y) + movaps 0 * SIZE(X), %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm0, %xmm3 + SHUFPS_39 %xmm3, %xmm3 + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -8 * SIZE(Y), %xmm2 + movaps %xmm2, -8 * SIZE(Y) + movaps 4 * SIZE(X), %xmm2 + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm0 + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + addps -4 * SIZE(Y), %xmm3 + movaps %xmm3, -4 * SIZE(Y) + movaps 8 * SIZE(X), %xmm3 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + decl %eax + jg .L61 + ALIGN_3 + +.L62: + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -20 * SIZE(X), %xmm0 + + movss %xmm3, %xmm2 + SHUFPS_39 %xmm2, %xmm2 + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps -16 * SIZE(X), %xmm1 + + movss %xmm0, %xmm3 + SHUFPS_39 %xmm3, %xmm3 + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps -12 * SIZE(X), %xmm2 + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm0 + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps -8 * SIZE(X), %xmm3 + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps -4 * SIZE(X), %xmm0 + + movss %xmm3, %xmm2 + SHUFPS_39 %xmm2, %xmm2 + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -12 * SIZE(Y), %xmm1 + movaps %xmm1, -12 * SIZE(Y) + movaps 0 * SIZE(X), %xmm1 + + movss %xmm0, %xmm3 + SHUFPS_39 %xmm3, %xmm3 + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -8 * SIZE(Y), %xmm2 + movaps %xmm2, -8 * SIZE(Y) + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + PSHUFD2($0xb1, 
%xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm0 + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + addps -4 * SIZE(Y), %xmm3 + movaps %xmm3, -4 * SIZE(Y) + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L65: + testl $8, M + jle .L66 + + movaps -28 * SIZE(X), %xmm2 + movaps -24 * SIZE(X), %xmm3 + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm3, %xmm2 + SHUFPS_39 %xmm2, %xmm2 + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -20 * SIZE(X), %xmm0 + movaps -16 * SIZE(X), %xmm1 + + movss %xmm0, %xmm3 + SHUFPS_39 %xmm3, %xmm3 + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm0 + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_2 + +.L66: + testl $4, M + jle .L67 + + movaps -28 * SIZE(X), %xmm2 + movaps -24 * SIZE(X), %xmm3 + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm3, %xmm2 + SHUFPS_39 %xmm2, %xmm2 + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_2 + +.L67: + testl $2, M + jle .L68 + + movaps -28 * SIZE(X), %xmm2 + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movaps %xmm1, %xmm0 + movaps %xmm2, %xmm1 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_2 + +.L68: + testl $1, M + jle .L69 + + movaps -28 * SIZE(X), %xmm2 + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + + addps -32 * SIZE(Y), %xmm0 + movlps %xmm0, -32 * SIZE(Y) + movhlps %xmm0, %xmm0 + movss %xmm0, -30 * SIZE(Y) + jmp .L999 + +.L69: + shufps $0x93, %xmm0, %xmm0 + + addss -32 * SIZE(Y), %xmm0 + movss %xmm0, -32 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L70: + testl $1 * SIZE, X + jne .L80 +#endif + + movl M, %eax + sarl $4, %eax + jle .L75 + + movsd -32 * SIZE(X), %xmm1 + movhps -30 * SIZE(X), %xmm1 + movsd -28 * SIZE(X), %xmm2 + movhps -26 * SIZE(X), %xmm2 + movsd -24 * SIZE(X), %xmm3 + movhps -22 * SIZE(X), %xmm3 + + decl %eax + jle .L72 + ALIGN_3 + +.L71: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + 
mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movsd -20 * SIZE(X), %xmm0 + movhps -18 * SIZE(X), %xmm0 + + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movsd -16 * SIZE(X), %xmm1 + movhps -14 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movsd -12 * SIZE(X), %xmm2 + movhps -10 * SIZE(X), %xmm2 + + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm0 + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movsd -8 * SIZE(X), %xmm3 + movhps -6 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movsd -4 * SIZE(X), %xmm0 + movhps -2 * SIZE(X), %xmm0 + + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -12 * SIZE(Y), %xmm1 + movaps %xmm1, -12 * SIZE(Y) + movsd 0 * SIZE(X), %xmm1 + movhps 2 * SIZE(X), %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -8 * SIZE(Y), %xmm2 + movaps %xmm2, -8 * SIZE(Y) + movsd 4 * SIZE(X), %xmm2 + movhps 6 * SIZE(X), %xmm2 + + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm0 + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + addps -4 * SIZE(Y), %xmm3 + movaps %xmm3, -4 * SIZE(Y) + movsd 8 * SIZE(X), %xmm3 + movhps 10 * SIZE(X), %xmm3 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + decl %eax + jg .L71 + ALIGN_3 + +.L72: + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movsd -20 * SIZE(X), %xmm0 + movhps -18 * SIZE(X), %xmm0 + + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movsd -16 * SIZE(X), %xmm1 + movhps -14 * SIZE(X), %xmm1 + + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movsd -12 * SIZE(X), %xmm2 + movhps -10 * SIZE(X), %xmm2 + + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm0 + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movsd -8 * SIZE(X), %xmm3 + movhps -6 * SIZE(X), %xmm3 + + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 
+ mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movsd -4 * SIZE(X), %xmm0 + movhps -2 * SIZE(X), %xmm0 + + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -12 * SIZE(Y), %xmm1 + movaps %xmm1, -12 * SIZE(Y) + + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -8 * SIZE(Y), %xmm2 + movaps %xmm2, -8 * SIZE(Y) + + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm0 + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + addps -4 * SIZE(Y), %xmm3 + movaps %xmm3, -4 * SIZE(Y) + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L75: + testl $8, M + jle .L76 + + movsd -32 * SIZE(X), %xmm1 + movhps -30 * SIZE(X), %xmm1 + movsd -28 * SIZE(X), %xmm2 + movhps -26 * SIZE(X), %xmm2 + + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movsd -24 * SIZE(X), %xmm3 + movhps -22 * SIZE(X), %xmm3 + movsd -20 * SIZE(X), %xmm0 + movhps -18 * SIZE(X), %xmm0 + + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm0 + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_2 + +.L76: + testl $4, M + jle .L77 + + movsd -32 * SIZE(X), %xmm1 + movhps -30 * SIZE(X), %xmm1 + + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movsd -28 * SIZE(X), %xmm2 + movhps -26 * SIZE(X), %xmm2 + + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps %xmm2, %xmm0 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_2 + +.L77: + testl $2, M + jle .L78 + + movsd -32 * SIZE(X), %xmm1 + movhps -30 * SIZE(X), %xmm1 + + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, %xmm0 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_2 + +.L78: + testl $1, M + jle .L79 + +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(X), %xmm1 + + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L79: + shufps $0x93, %xmm0, %xmm0 + + addss -32 * SIZE(Y), %xmm0 + movss %xmm0, -32 * SIZE(Y) + jmp .L999 + ALIGN_3 + +#ifdef 
ALIGNED_ACCESS + +.L80: + subl $3 * SIZE, X + + movaps -32 * SIZE(X), %xmm1 + + movl M, %eax + sarl $4, %eax + jle .L85 + + movaps -28 * SIZE(X), %xmm2 + movaps -24 * SIZE(X), %xmm3 + + decl %eax + jle .L82 + ALIGN_3 + +.L81: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -20 * SIZE(X), %xmm0 + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps -16 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps -12 * SIZE(X), %xmm2 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm0 + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps -8 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps -4 * SIZE(X), %xmm0 + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -12 * SIZE(Y), %xmm1 + movaps %xmm1, -12 * SIZE(Y) + movaps 0 * SIZE(X), %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -8 * SIZE(Y), %xmm2 + movaps %xmm2, -8 * SIZE(Y) + movaps 4 * SIZE(X), %xmm2 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm0 + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + addps -4 * SIZE(Y), %xmm3 + movaps %xmm3, -4 * SIZE(Y) + movaps 8 * SIZE(X), %xmm3 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + decl %eax + jg .L81 + ALIGN_3 + +.L82: + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -20 * SIZE(X), %xmm0 + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps -16 * SIZE(X), %xmm1 + + movss %xmm0, 
%xmm3 + shufps $0x93, %xmm0, %xmm3 + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps -12 * SIZE(X), %xmm2 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm0 + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps -8 * SIZE(X), %xmm3 + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps -4 * SIZE(X), %xmm0 + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -12 * SIZE(Y), %xmm1 + movaps %xmm1, -12 * SIZE(Y) + movaps 0 * SIZE(X), %xmm1 + + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -8 * SIZE(Y), %xmm2 + movaps %xmm2, -8 * SIZE(Y) + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm0 + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + addps -4 * SIZE(Y), %xmm3 + movaps %xmm3, -4 * SIZE(Y) + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L85: + testl $8, M + jle .L86 + + movaps -28 * SIZE(X), %xmm2 + movaps -24 * SIZE(X), %xmm3 + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -20 * SIZE(X), %xmm0 + movaps -16 * SIZE(X), %xmm1 + + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + PSHUFD2($0xb1, %xmm3, %xmm5) + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + PSHUFD2($0xb1, %xmm0, %xmm5) + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm0 + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_2 + +.L86: + testl $4, M + jle .L87 + + movaps -28 * SIZE(X), %xmm2 + movaps -24 * SIZE(X), %xmm3 + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + PSHUFD2($0xb1, %xmm2, %xmm5) + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, 
-28 * SIZE(Y) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_2 + +.L87: + testl $2, M + jle .L88 + + movaps -28 * SIZE(X), %xmm2 + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movaps %xmm1, %xmm0 + movaps %xmm2, %xmm1 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_2 + +.L88: + testl $1, M + jle .L89 + + movaps -28 * SIZE(X), %xmm2 + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + PSHUFD2($0xb1, %xmm1, %xmm5) + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm5 + addps %xmm5, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + + addps -32 * SIZE(Y), %xmm0 + movlps %xmm0, -32 * SIZE(Y) + movhlps %xmm0, %xmm0 + movss %xmm0, -30 * SIZE(Y) + jmp .L999 + +.L89: + shufps $0x93, %xmm0, %xmm0 + + addss -32 * SIZE(Y), %xmm0 + movss %xmm0, -32 * SIZE(Y) + jmp .L999 + ALIGN_3 +#endif + +.L100: + shufps $0, ALPHA_R, ALPHA_R + shufps $0, ALPHA_I, ALPHA_I + +#ifndef CONJ + xorps %xmm5, %xmm5 + subps ALPHA_I, %xmm5 + + unpcklps ALPHA_R, %xmm5 + unpcklps ALPHA_I, ALPHA_R + movaps %xmm5, ALPHA_I +#else + xorps %xmm5, %xmm5 + subps ALPHA_R, %xmm5 + + unpcklps ALPHA_I, ALPHA_R + unpcklps %xmm5, ALPHA_I +#endif + + movl Y, YY + + movl M, %eax + sarl $3, %eax + jle .L105 + ALIGN_3 + +.L102: + movsd (X), %xmm0 + addl INCX, X + movhps (X), %xmm0 + addl INCX, X + movsd (X), %xmm2 + addl INCX, X + movhps (X), %xmm2 + addl INCX, X + +#ifdef HAVE_SSE3 + movshdup %xmm0, %xmm1 + movsldup %xmm0, %xmm0 + movshdup %xmm2, %xmm3 + movsldup %xmm2, %xmm2 +#else + movaps %xmm0, %xmm1 + shufps $0xa0, %xmm0, %xmm0 + shufps $0xf5, %xmm1, %xmm1 + + movaps %xmm2, %xmm3 + shufps $0xa0, %xmm2, %xmm2 + shufps $0xf5, %xmm3, %xmm3 +#endif + + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm1 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm3 + + movsd (Y), %xmm4 + addl INCY, Y + movhps (Y), %xmm4 + addl INCY, Y + movsd (Y), %xmm5 + addl INCY, Y + movhps (Y), %xmm5 + addl INCY, Y + + addps %xmm0, %xmm4 + addps %xmm1, %xmm4 + addps %xmm2, %xmm5 + addps %xmm3, %xmm5 + + movsd %xmm4, (YY) + addl INCY, YY + movhps %xmm4, (YY) + addl INCY, YY + movsd %xmm5, (YY) + addl INCY, YY + movhps %xmm5, (YY) + addl INCY, YY + + movsd (X), %xmm0 + addl INCX, X + movhps (X), %xmm0 + addl INCX, X + movsd (X), %xmm2 + addl INCX, X + movhps (X), %xmm2 + addl INCX, X + +#ifdef HAVE_SSE3 + movshdup %xmm0, %xmm1 + movsldup %xmm0, %xmm0 + movshdup %xmm2, %xmm3 + movsldup %xmm2, %xmm2 +#else + movaps %xmm0, %xmm1 + shufps $0xa0, %xmm0, %xmm0 + shufps $0xf5, %xmm1, %xmm1 + + movaps %xmm2, %xmm3 + shufps $0xa0, %xmm2, %xmm2 + shufps $0xf5, %xmm3, %xmm3 +#endif + + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm1 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm3 + + movsd (Y), %xmm4 + addl INCY, Y + movhps (Y), %xmm4 + addl INCY, Y + movsd (Y), %xmm5 + addl INCY, Y + movhps (Y), %xmm5 + addl INCY, Y + + addps %xmm0, %xmm4 + addps %xmm1, %xmm4 + addps %xmm2, %xmm5 + addps %xmm3, %xmm5 + + movsd %xmm4, (YY) + addl INCY, YY + movhps %xmm4, (YY) + addl INCY, YY + movsd %xmm5, (YY) + addl INCY, YY + movhps %xmm5, (YY) + addl INCY, YY + + decl %eax + jg .L102 + ALIGN_3 + +.L105: + testl $4, M + jle .L106 + + movsd (X), %xmm0 + addl INCX, X + movhps (X), %xmm0 + addl INCX, X + movsd (X), %xmm2 + addl INCX, X + movhps (X), %xmm2 + addl INCX, X + +#ifdef HAVE_SSE3 + movshdup %xmm0, %xmm1 + movsldup %xmm0, %xmm0 + movshdup 
%xmm2, %xmm3 + movsldup %xmm2, %xmm2 +#else + movaps %xmm0, %xmm1 + shufps $0xa0, %xmm0, %xmm0 + shufps $0xf5, %xmm1, %xmm1 + + movaps %xmm2, %xmm3 + shufps $0xa0, %xmm2, %xmm2 + shufps $0xf5, %xmm3, %xmm3 +#endif + + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm1 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm3 + + movsd (Y), %xmm4 + addl INCY, Y + movhps (Y), %xmm4 + addl INCY, Y + movsd (Y), %xmm5 + addl INCY, Y + movhps (Y), %xmm5 + addl INCY, Y + + addps %xmm0, %xmm4 + addps %xmm1, %xmm4 + addps %xmm2, %xmm5 + addps %xmm3, %xmm5 + + movsd %xmm4, (YY) + addl INCY, YY + movhps %xmm4, (YY) + addl INCY, YY + movsd %xmm5, (YY) + addl INCY, YY + movhps %xmm5, (YY) + addl INCY, YY + ALIGN_3 + +.L106: + testl $2, M + jle .L107 + + movsd (X), %xmm0 + addl INCX, X + movhps (X), %xmm0 + addl INCX, X + +#ifdef HAVE_SSE3 + movshdup %xmm0, %xmm1 + movsldup %xmm0, %xmm0 +#else + movaps %xmm0, %xmm1 + shufps $0xa0, %xmm0, %xmm0 + shufps $0xf5, %xmm1, %xmm1 +#endif + + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm1 + + movsd (Y), %xmm4 + addl INCY, Y + movhps (Y), %xmm4 + addl INCY, Y + + addps %xmm0, %xmm4 + addps %xmm1, %xmm4 + + movsd %xmm4, (YY) + addl INCY, YY + movhps %xmm4, (YY) + addl INCY, YY + ALIGN_3 + +.L107: + testl $1, M + jle .L999 + + movsd (X), %xmm0 + +#ifdef HAVE_SSE3 + movshdup %xmm0, %xmm1 + movsldup %xmm0, %xmm0 +#else + movaps %xmm0, %xmm1 + shufps $0xa0, %xmm0, %xmm0 + shufps $0xf5, %xmm1, %xmm1 +#endif + + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm1 + + movsd (Y), %xmm4 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm4 + + movsd %xmm4, (Y) + ALIGN_3 + +.L999: + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE + diff --git a/kernel/x86/zaxpy_sse2.S b/kernel/x86/zaxpy_sse2.S new file mode 100644 index 0000000000..40afdc3fca --- /dev/null +++ b/kernel/x86/zaxpy_sse2.S @@ -0,0 +1,1522 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_ALPHA_R 16 + STACK + ARGS(%esp) +#define STACK_ALPHA_I 24 + STACK + ARGS(%esp) +#define STACK_X 32 + STACK + ARGS(%esp) +#define STACK_INCX 36 + STACK + ARGS(%esp) +#define STACK_Y 40 + STACK + ARGS(%esp) +#define STACK_INCY 44 + STACK + ARGS(%esp) + +#define M %ebx +#define X %esi +#define INCX %ecx +#define Y %edi +#define INCY %edx +#define YY %ebp + +#define ALPHA_R %xmm6 +#define ALPHA_I %xmm7 + +#if defined(HAVE_SSE3) && !defined(CORE_OPTERON) +#define MOVDDUP(a, b, c) movddup a(b), c +#define MOVDDUP2(a, b, c) movddup a##b, c +#else +#define MOVDDUP(a, b, c) movlpd a(b), c;movhpd a(b), c +#define MOVDDUP2(a, b, c) movlpd a##b, c;movhpd a##b, c +#endif + +#include "l1param.h" + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_M, M + movsd STACK_ALPHA_R, %xmm0 + movsd STACK_ALPHA_I, %xmm1 + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + + sall $ZBASE_SHIFT, INCX + sall $ZBASE_SHIFT, INCY + + testl M, M + jle .L999 + + cmpl $2 * SIZE, INCX + jne .L50 + cmpl $2 * SIZE, INCY + jne .L50 + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + + pcmpeqb %xmm5, %xmm5 + psllq $63, %xmm5 + +#ifdef HAVE_SSE3 + movddup %xmm0, ALPHA_R + movddup %xmm1, ALPHA_I +#else + pshufd $0x44, %xmm0, ALPHA_R + pshufd $0x44, %xmm1, ALPHA_I +#endif + +#ifndef CONJ + shufps $0x0c, %xmm5, %xmm5 + xorpd %xmm5, ALPHA_I +#else + shufps $0xc0, %xmm5, %xmm5 + xorpd %xmm5, ALPHA_R +#endif + + testl $SIZE, Y + jne .L30 + + testl $SIZE, X + jne .L20 + + movl M, %eax + sarl $3, %eax + jle .L15 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + movaps -12 * SIZE(X), %xmm2 + movaps -10 * SIZE(X), %xmm3 + + decl %eax + jle .L12 + ALIGN_3 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + pshufd $0x4e, %xmm0, %xmm5 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm5 + addpd -16 * SIZE(Y), %xmm0 + addpd %xmm5, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps -8 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd -14 * SIZE(Y), %xmm1 + addpd %xmm5, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + movaps -6 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm2, %xmm5 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm5 + addpd -12 * SIZE(Y), %xmm2 + addpd %xmm5, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + movaps -4 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm3, %xmm5 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm5 + addpd -10 * SIZE(Y), %xmm3 + addpd %xmm5, %xmm3 + movaps %xmm3, -10 * SIZE(Y) + movaps -2 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + pshufd $0x4e, %xmm0, %xmm5 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm5 + addpd -8 * SIZE(Y), %xmm0 + addpd %xmm5, %xmm0 + movaps %xmm0, -8 * SIZE(Y) + movaps 0 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd -6 * SIZE(Y), %xmm1 + addpd %xmm5, %xmm1 + movaps %xmm1, -6 * SIZE(Y) + movaps 2 * SIZE(X), 
%xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm2, %xmm5 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm5 + addpd -4 * SIZE(Y), %xmm2 + addpd %xmm5, %xmm2 + movaps %xmm2, -4 * SIZE(Y) + movaps 4 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm3, %xmm5 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm5 + addpd -2 * SIZE(Y), %xmm3 + addpd %xmm5, %xmm3 + movaps %xmm3, -2 * SIZE(Y) + movaps 6 * SIZE(X), %xmm3 + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + decl %eax + jg .L11 + ALIGN_3 + +.L12: + pshufd $0x4e, %xmm0, %xmm5 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm5 + addpd -16 * SIZE(Y), %xmm0 + addpd %xmm5, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps -8 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd -14 * SIZE(Y), %xmm1 + addpd %xmm5, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + movaps -6 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm2, %xmm5 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm5 + addpd -12 * SIZE(Y), %xmm2 + addpd %xmm5, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + movaps -4 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm3, %xmm5 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm5 + addpd -10 * SIZE(Y), %xmm3 + addpd %xmm5, %xmm3 + movaps %xmm3, -10 * SIZE(Y) + movaps -2 * SIZE(X), %xmm3 + + pshufd $0x4e, %xmm0, %xmm5 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm5 + addpd -8 * SIZE(Y), %xmm0 + addpd %xmm5, %xmm0 + movaps %xmm0, -8 * SIZE(Y) + + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd -6 * SIZE(Y), %xmm1 + addpd %xmm5, %xmm1 + movaps %xmm1, -6 * SIZE(Y) + + pshufd $0x4e, %xmm2, %xmm5 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm5 + addpd -4 * SIZE(Y), %xmm2 + addpd %xmm5, %xmm2 + movaps %xmm2, -4 * SIZE(Y) + + pshufd $0x4e, %xmm3, %xmm5 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm5 + addpd -2 * SIZE(Y), %xmm3 + addpd %xmm5, %xmm3 + movaps %xmm3, -2 * SIZE(Y) + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + ALIGN_3 + +.L15: + movl M, %eax + andl $4, %eax + jle .L16 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + movaps -12 * SIZE(X), %xmm2 + movaps -10 * SIZE(X), %xmm3 + + pshufd $0x4e, %xmm0, %xmm5 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm5 + addpd -16 * SIZE(Y), %xmm0 + addpd %xmm5, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd -14 * SIZE(Y), %xmm1 + addpd %xmm5, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + pshufd $0x4e, %xmm2, %xmm5 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm5 + addpd -12 * SIZE(Y), %xmm2 + addpd %xmm5, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + + pshufd $0x4e, %xmm3, %xmm5 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm5 + addpd -10 * SIZE(Y), %xmm3 + addpd %xmm5, %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L16: + movl M, %eax + andl $2, %eax + jle .L17 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm0, %xmm5 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm5 + addpd -16 * SIZE(Y), %xmm0 + addpd %xmm5, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd -14 * SIZE(Y), %xmm1 + addpd %xmm5, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L17: + movl M, %eax + andl $1, %eax + jle .L999 + + movaps -16 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm0, %xmm5 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm5 + addpd -16 * SIZE(Y), %xmm0 + addpd %xmm5, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + 
jmp .L999 + ALIGN_3 + +.L20: + movl M, %eax + sarl $3, %eax + jle .L25 + + movsd -16 * SIZE(X), %xmm0 + movhps -15 * SIZE(X), %xmm0 + movsd -14 * SIZE(X), %xmm1 + movhps -13 * SIZE(X), %xmm1 + movsd -12 * SIZE(X), %xmm2 + movhps -11 * SIZE(X), %xmm2 + movsd -10 * SIZE(X), %xmm3 + movhps -9 * SIZE(X), %xmm3 + + decl %eax + jle .L22 + ALIGN_3 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + pshufd $0x4e, %xmm0, %xmm5 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm5 + addpd -16 * SIZE(Y), %xmm0 + addpd %xmm5, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movsd -8 * SIZE(X), %xmm0 + movhps -7 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd -14 * SIZE(Y), %xmm1 + addpd %xmm5, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + movsd -6 * SIZE(X), %xmm1 + movhps -5 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm2, %xmm5 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm5 + addpd -12 * SIZE(Y), %xmm2 + addpd %xmm5, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + movsd -4 * SIZE(X), %xmm2 + movhps -3 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm3, %xmm5 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm5 + addpd -10 * SIZE(Y), %xmm3 + addpd %xmm5, %xmm3 + movaps %xmm3, -10 * SIZE(Y) + movsd -2 * SIZE(X), %xmm3 + movhps -1 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + pshufd $0x4e, %xmm0, %xmm5 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm5 + addpd -8 * SIZE(Y), %xmm0 + addpd %xmm5, %xmm0 + movaps %xmm0, -8 * SIZE(Y) + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd -6 * SIZE(Y), %xmm1 + addpd %xmm5, %xmm1 + movaps %xmm1, -6 * SIZE(Y) + movsd 2 * SIZE(X), %xmm1 + movhps 3 * SIZE(X), %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm2, %xmm5 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm5 + addpd -4 * SIZE(Y), %xmm2 + addpd %xmm5, %xmm2 + movaps %xmm2, -4 * SIZE(Y) + movsd 4 * SIZE(X), %xmm2 + movhps 5 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm3, %xmm5 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm5 + addpd -2 * SIZE(Y), %xmm3 + addpd %xmm5, %xmm3 + movaps %xmm3, -2 * SIZE(Y) + movsd 6 * SIZE(X), %xmm3 + movhps 7 * SIZE(X), %xmm3 + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + decl %eax + jg .L21 + ALIGN_3 + +.L22: + pshufd $0x4e, %xmm0, %xmm5 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm5 + addpd -16 * SIZE(Y), %xmm0 + addpd %xmm5, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movsd -8 * SIZE(X), %xmm0 + movhps -7 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd -14 * SIZE(Y), %xmm1 + addpd %xmm5, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + movsd -6 * SIZE(X), %xmm1 + movhps -5 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm2, %xmm5 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm5 + addpd -12 * SIZE(Y), %xmm2 + addpd %xmm5, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + movsd -4 * SIZE(X), %xmm2 + movhps -3 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm3, %xmm5 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm5 + addpd -10 * SIZE(Y), %xmm3 + addpd %xmm5, %xmm3 + movaps %xmm3, -10 * SIZE(Y) + movsd -2 * SIZE(X), %xmm3 + movhps -1 * SIZE(X), %xmm3 + + pshufd $0x4e, %xmm0, %xmm5 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm5 + addpd -8 * SIZE(Y), %xmm0 + addpd %xmm5, %xmm0 + movaps %xmm0, -8 * SIZE(Y) + + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd 
ALPHA_I, %xmm5 + addpd -6 * SIZE(Y), %xmm1 + addpd %xmm5, %xmm1 + movaps %xmm1, -6 * SIZE(Y) + + pshufd $0x4e, %xmm2, %xmm5 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm5 + addpd -4 * SIZE(Y), %xmm2 + addpd %xmm5, %xmm2 + movaps %xmm2, -4 * SIZE(Y) + + pshufd $0x4e, %xmm3, %xmm5 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm5 + addpd -2 * SIZE(Y), %xmm3 + addpd %xmm5, %xmm3 + movaps %xmm3, -2 * SIZE(Y) + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + ALIGN_3 + +.L25: + movl M, %eax + andl $4, %eax + jle .L26 + + movsd -16 * SIZE(X), %xmm0 + movhps -15 * SIZE(X), %xmm0 + movsd -14 * SIZE(X), %xmm1 + movhps -13 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm0, %xmm5 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm5 + addpd -16 * SIZE(Y), %xmm0 + addpd %xmm5, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd -14 * SIZE(Y), %xmm1 + addpd %xmm5, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + movsd -12 * SIZE(X), %xmm2 + movhps -11 * SIZE(X), %xmm2 + movsd -10 * SIZE(X), %xmm3 + movhps -9 * SIZE(X), %xmm3 + + pshufd $0x4e, %xmm2, %xmm5 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm5 + addpd -12 * SIZE(Y), %xmm2 + addpd %xmm5, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + + pshufd $0x4e, %xmm3, %xmm5 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm5 + addpd -10 * SIZE(Y), %xmm3 + addpd %xmm5, %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L26: + movl M, %eax + andl $2, %eax + jle .L27 + + movsd -16 * SIZE(X), %xmm0 + movhps -15 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm0, %xmm5 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm5 + addpd -16 * SIZE(Y), %xmm0 + addpd %xmm5, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + movsd -14 * SIZE(X), %xmm1 + movhps -13 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd -14 * SIZE(Y), %xmm1 + addpd %xmm5, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L27: + movl M, %eax + andl $1, %eax + jle .L999 + + movsd -16 * SIZE(X), %xmm0 + movhps -15 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm0, %xmm5 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm5 + addpd -16 * SIZE(Y), %xmm0 + addpd %xmm5, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L30: + testl $SIZE, X + jne .L40 + + movaps -16 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm1 + xorps %xmm0, %xmm0 + SHUFPD_1 %xmm1, %xmm0 + + xorps %xmm4, %xmm4 + movhps -16 * SIZE(Y), %xmm4 + + addpd %xmm0, %xmm4 + movhps %xmm4, -16 * SIZE(Y) + movaps %xmm1, %xmm0 + + addl $2 * SIZE, X + addl $1 * SIZE, Y + decl M + jle .L39 + + movl M, %eax + sarl $3, %eax + jle .L35 + + movaps -16 * SIZE(X), %xmm1 + movaps -14 * SIZE(X), %xmm2 + movaps -12 * SIZE(X), %xmm3 + + decl %eax + jle .L32 + ALIGN_3 + +.L31: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps -10 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm2, %xmm5 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm2 + SHUFPD_1 %xmm2, %xmm1 + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + movaps -8 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm3, %xmm5 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm3 + SHUFPD_1 %xmm3, %xmm2 + addpd -12 * SIZE(Y), 
%xmm2 + movaps %xmm2, -12 * SIZE(Y) + movaps -6 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm0, %xmm5 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm0 + SHUFPD_1 %xmm0, %xmm3 + addpd -10 * SIZE(Y), %xmm3 + movaps %xmm3, -10 * SIZE(Y) + movaps -4 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + addpd -8 * SIZE(Y), %xmm0 + movaps %xmm0, -8 * SIZE(Y) + movaps -2 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm2, %xmm5 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm2 + SHUFPD_1 %xmm2, %xmm1 + addpd -6 * SIZE(Y), %xmm1 + movaps %xmm1, -6 * SIZE(Y) + movaps 0 * SIZE(X), %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm3, %xmm5 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm3 + SHUFPD_1 %xmm3, %xmm2 + addpd -4 * SIZE(Y), %xmm2 + movaps %xmm2, -4 * SIZE(Y) + movaps 2 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm0, %xmm5 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm0 + SHUFPD_1 %xmm0, %xmm3 + addpd -2 * SIZE(Y), %xmm3 + movaps %xmm3, -2 * SIZE(Y) + movaps 4 * SIZE(X), %xmm3 + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + decl %eax + jg .L31 + ALIGN_3 + +.L32: + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps -10 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm2, %xmm5 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm2 + SHUFPD_1 %xmm2, %xmm1 + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + movaps -8 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm3, %xmm5 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm3 + SHUFPD_1 %xmm3, %xmm2 + addpd -12 * SIZE(Y), %xmm2 + movaps %xmm2, -12 * SIZE(Y) + movaps -6 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm0, %xmm5 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm0 + SHUFPD_1 %xmm0, %xmm3 + addpd -10 * SIZE(Y), %xmm3 + movaps %xmm3, -10 * SIZE(Y) + movaps -4 * SIZE(X), %xmm3 + + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + addpd -8 * SIZE(Y), %xmm0 + movaps %xmm0, -8 * SIZE(Y) + movaps -2 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm2, %xmm5 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm2 + SHUFPD_1 %xmm2, %xmm1 + addpd -6 * SIZE(Y), %xmm1 + movaps %xmm1, -6 * SIZE(Y) + + pshufd $0x4e, %xmm3, %xmm5 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm3 + SHUFPD_1 %xmm3, %xmm2 + addpd -4 * SIZE(Y), %xmm2 + movaps %xmm2, -4 * SIZE(Y) + + pshufd $0x4e, %xmm0, %xmm5 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm0 + SHUFPD_1 %xmm0, %xmm3 + addpd -2 * SIZE(Y), %xmm3 + movaps %xmm3, -2 * SIZE(Y) + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + ALIGN_3 + +.L35: + movl M, %eax + andl $4, %eax + jle .L36 + + movaps -16 * SIZE(X), %xmm1 + movaps -14 * SIZE(X), %xmm2 + movaps -12 * SIZE(X), %xmm3 + movaps -10 * SIZE(X), %xmm4 + + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + pshufd $0x4e, %xmm2, %xmm5 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm2 + SHUFPD_1 %xmm2, %xmm1 + addpd -14 * SIZE(Y), %xmm1 + movaps 
%xmm1, -14 * SIZE(Y) + + pshufd $0x4e, %xmm3, %xmm5 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm3 + SHUFPD_1 %xmm3, %xmm2 + addpd -12 * SIZE(Y), %xmm2 + movaps %xmm2, -12 * SIZE(Y) + + pshufd $0x4e, %xmm4, %xmm5 + mulpd ALPHA_R, %xmm4 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm4 + SHUFPD_1 %xmm4, %xmm3 + addpd -10 * SIZE(Y), %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + movaps %xmm4, %xmm0 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L36: + movl M, %eax + andl $2, %eax + jle .L37 + + movaps -16 * SIZE(X), %xmm1 + movaps -14 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + pshufd $0x4e, %xmm2, %xmm5 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm2 + SHUFPD_1 %xmm2, %xmm1 + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + movaps %xmm2, %xmm0 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L37: + movl M, %eax + andl $1, %eax + jle .L39 + + movaps -16 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, %xmm0 + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L39: + SHUFPD_1 %xmm0, %xmm0 + + addsd -16 * SIZE(Y), %xmm0 + movlps %xmm0, -16 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L40: + movsd -16 * SIZE(X), %xmm1 + movhps -15 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm1 + xorps %xmm0, %xmm0 + SHUFPD_1 %xmm1, %xmm0 + + xorps %xmm4, %xmm4 + movhps -16 * SIZE(Y), %xmm4 + + addpd %xmm0, %xmm4 + movhps %xmm4, -16 * SIZE(Y) + movaps %xmm1, %xmm0 + + addl $2 * SIZE, X + addl $1 * SIZE, Y + decl M + jle .L49 + + movl M, %eax + sarl $3, %eax + jle .L45 + + movsd -16 * SIZE(X), %xmm1 + movhps -15 * SIZE(X), %xmm1 + movsd -14 * SIZE(X), %xmm2 + movhps -13 * SIZE(X), %xmm2 + movsd -12 * SIZE(X), %xmm3 + movhps -11 * SIZE(X), %xmm3 + + decl %eax + jle .L42 + ALIGN_3 + +.L41: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movsd -10 * SIZE(X), %xmm0 + movhps -9 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm2, %xmm5 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm2 + SHUFPD_1 %xmm2, %xmm1 + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + movsd -8 * SIZE(X), %xmm1 + movhps -7 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm3, %xmm5 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm3 + SHUFPD_1 %xmm3, %xmm2 + addpd -12 * SIZE(Y), %xmm2 + movaps %xmm2, -12 * SIZE(Y) + movsd -6 * SIZE(X), %xmm2 + movhps -5 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm0, %xmm5 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm0 + SHUFPD_1 %xmm0, %xmm3 + addpd -10 * SIZE(Y), %xmm3 + movaps %xmm3, -10 * SIZE(Y) + movsd -4 * SIZE(X), %xmm3 + movhps -3 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + addpd -8 * SIZE(Y), %xmm0 + movaps %xmm0, -8 * SIZE(Y) + movsd -2 * SIZE(X), %xmm0 + movhps -1 * 
SIZE(X), %xmm0 + + pshufd $0x4e, %xmm2, %xmm5 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm2 + SHUFPD_1 %xmm2, %xmm1 + addpd -6 * SIZE(Y), %xmm1 + movaps %xmm1, -6 * SIZE(Y) + movsd 0 * SIZE(X), %xmm1 + movhps 1 * SIZE(X), %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm3, %xmm5 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm3 + SHUFPD_1 %xmm3, %xmm2 + addpd -4 * SIZE(Y), %xmm2 + movaps %xmm2, -4 * SIZE(Y) + movsd 2 * SIZE(X), %xmm2 + movhps 3 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm0, %xmm5 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm0 + SHUFPD_1 %xmm0, %xmm3 + addpd -2 * SIZE(Y), %xmm3 + movaps %xmm3, -2 * SIZE(Y) + movsd 4 * SIZE(X), %xmm3 + movhps 5 * SIZE(X), %xmm3 + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + decl %eax + jg .L41 + ALIGN_3 + +.L42: + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movsd -10 * SIZE(X), %xmm0 + movhps -9 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm2, %xmm5 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm2 + SHUFPD_1 %xmm2, %xmm1 + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + movsd -8 * SIZE(X), %xmm1 + movhps -7 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm3, %xmm5 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm3 + SHUFPD_1 %xmm3, %xmm2 + addpd -12 * SIZE(Y), %xmm2 + movaps %xmm2, -12 * SIZE(Y) + movsd -6 * SIZE(X), %xmm2 + movhps -5 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm0, %xmm5 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm0 + SHUFPD_1 %xmm0, %xmm3 + addpd -10 * SIZE(Y), %xmm3 + movaps %xmm3, -10 * SIZE(Y) + movsd -4 * SIZE(X), %xmm3 + movhps -3 * SIZE(X), %xmm3 + + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + addpd -8 * SIZE(Y), %xmm0 + movaps %xmm0, -8 * SIZE(Y) + movsd -2 * SIZE(X), %xmm0 + movhps -1 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm2, %xmm5 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm2 + SHUFPD_1 %xmm2, %xmm1 + addpd -6 * SIZE(Y), %xmm1 + movaps %xmm1, -6 * SIZE(Y) + + pshufd $0x4e, %xmm3, %xmm5 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm3 + SHUFPD_1 %xmm3, %xmm2 + addpd -4 * SIZE(Y), %xmm2 + movaps %xmm2, -4 * SIZE(Y) + + pshufd $0x4e, %xmm0, %xmm5 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm0 + SHUFPD_1 %xmm0, %xmm3 + addpd -2 * SIZE(Y), %xmm3 + movaps %xmm3, -2 * SIZE(Y) + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + ALIGN_3 + +.L45: + movl M, %eax + andl $4, %eax + jle .L46 + + movsd -16 * SIZE(X), %xmm1 + movhps -15 * SIZE(X), %xmm1 + movsd -14 * SIZE(X), %xmm2 + movhps -13 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + pshufd $0x4e, %xmm2, %xmm5 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm2 + SHUFPD_1 %xmm2, %xmm1 + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + movsd -12 * SIZE(X), %xmm3 + movhps -11 * SIZE(X), %xmm3 + movsd -10 * SIZE(X), %xmm4 + movhps -9 * SIZE(X), %xmm4 + + pshufd $0x4e, %xmm3, %xmm5 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm3 + SHUFPD_1 %xmm3, %xmm2 + addpd -12 * SIZE(Y), %xmm2 + movaps %xmm2, -12 * SIZE(Y) + + pshufd $0x4e, %xmm4, %xmm5 + 
mulpd ALPHA_R, %xmm4 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm4 + SHUFPD_1 %xmm4, %xmm3 + addpd -10 * SIZE(Y), %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + movaps %xmm4, %xmm0 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L46: + movl M, %eax + andl $2, %eax + jle .L47 + + movsd -16 * SIZE(X), %xmm1 + movhps -15 * SIZE(X), %xmm1 + movsd -14 * SIZE(X), %xmm2 + movhps -13 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + pshufd $0x4e, %xmm2, %xmm5 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm2 + SHUFPD_1 %xmm2, %xmm1 + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + movaps %xmm2, %xmm0 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L47: + movl M, %eax + andl $1, %eax + jle .L49 + + movsd -16 * SIZE(X), %xmm1 + movhps -15 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm1, %xmm5 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm5 + addpd %xmm5, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, %xmm0 + addl $2 * SIZE, Y + ALIGN_3 + +.L49: + SHUFPD_1 %xmm0, %xmm0 + + addsd -16 * SIZE(Y), %xmm0 + movlps %xmm0, -16 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L50: +#ifndef CONJ + movaps %xmm0, ALPHA_R + + pxor ALPHA_I, ALPHA_I + subsd %xmm1, ALPHA_I + + unpcklpd ALPHA_R, ALPHA_I + unpcklpd %xmm1, ALPHA_R +#else + movaps %xmm0, ALPHA_R + movaps %xmm1, ALPHA_I + + pxor %xmm5, %xmm5 + subsd %xmm0, %xmm5 + + unpcklpd %xmm5, ALPHA_I + unpcklpd %xmm1, ALPHA_R +#endif + + movl Y, YY + movl M, %eax + sarl $2, %eax + jle .L55 + + MOVDDUP( 0 * SIZE, X, %xmm0) + MOVDDUP( 1 * SIZE, X, %xmm1) + addl INCX, X + MOVDDUP( 0 * SIZE, X, %xmm2) + MOVDDUP( 1 * SIZE, X, %xmm3) + addl INCX, X + + movsd 0 * SIZE(Y), %xmm4 + movhpd 1 * SIZE(Y), %xmm4 + addl INCY, Y + movsd 0 * SIZE(Y), %xmm5 + movhpd 1 * SIZE(Y), %xmm5 + addl INCY, Y + + decl %eax + jle .L52 + ALIGN_3 + +.L51: + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm1 + mulpd ALPHA_I, %xmm3 + + addpd %xmm0, %xmm4 + addpd %xmm2, %xmm5 + addpd %xmm1, %xmm4 + addpd %xmm3, %xmm5 + + movlpd %xmm4, 0 * SIZE(YY) + movhpd %xmm4, 1 * SIZE(YY) + addl INCY, YY + movlpd %xmm5, 0 * SIZE(YY) + movhpd %xmm5, 1 * SIZE(YY) + addl INCY, YY + + MOVDDUP( 0 * SIZE, X, %xmm0) + MOVDDUP( 1 * SIZE, X, %xmm1) + addl INCX, X + MOVDDUP( 0 * SIZE, X, %xmm2) + MOVDDUP( 1 * SIZE, X, %xmm3) + addl INCX, X + + movsd 0 * SIZE(Y), %xmm4 + movhpd 1 * SIZE(Y), %xmm4 + addl INCY, Y + movsd 0 * SIZE(Y), %xmm5 + movhpd 1 * SIZE(Y), %xmm5 + addl INCY, Y + + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm1 + mulpd ALPHA_I, %xmm3 + + addpd %xmm0, %xmm4 + addpd %xmm2, %xmm5 + addpd %xmm1, %xmm4 + addpd %xmm3, %xmm5 + + movlpd %xmm4, 0 * SIZE(YY) + movhpd %xmm4, 1 * SIZE(YY) + addl INCY, YY + movlpd %xmm5, 0 * SIZE(YY) + movhpd %xmm5, 1 * SIZE(YY) + addl INCY, YY + + MOVDDUP( 0 * SIZE, X, %xmm0) + MOVDDUP( 1 * SIZE, X, %xmm1) + addl INCX, X + MOVDDUP( 0 * SIZE, X, %xmm2) + MOVDDUP( 1 * SIZE, X, %xmm3) + addl INCX, X + + movsd 0 * SIZE(Y), %xmm4 + movhpd 1 * SIZE(Y), %xmm4 + addl INCY, Y + movsd 0 * SIZE(Y), %xmm5 + movhpd 1 * SIZE(Y), %xmm5 + addl INCY, Y + + decl %eax + jg .L51 + ALIGN_3 + +.L52: + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm1 + mulpd ALPHA_I, %xmm3 + + addpd %xmm0, %xmm4 + addpd %xmm2, %xmm5 + addpd %xmm1, %xmm4 + addpd %xmm3, %xmm5 + + movlpd %xmm4, 0 * SIZE(YY) + movhpd %xmm4, 1 * SIZE(YY) + addl INCY, 
YY + movlpd %xmm5, 0 * SIZE(YY) + movhpd %xmm5, 1 * SIZE(YY) + addl INCY, YY + + MOVDDUP( 0 * SIZE, X, %xmm0) + MOVDDUP( 1 * SIZE, X, %xmm1) + addl INCX, X + MOVDDUP( 0 * SIZE, X, %xmm2) + MOVDDUP( 1 * SIZE, X, %xmm3) + addl INCX, X + + movsd 0 * SIZE(Y), %xmm4 + movhpd 1 * SIZE(Y), %xmm4 + addl INCY, Y + movsd 0 * SIZE(Y), %xmm5 + movhpd 1 * SIZE(Y), %xmm5 + addl INCY, Y + + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm1 + mulpd ALPHA_I, %xmm3 + + addpd %xmm0, %xmm4 + addpd %xmm2, %xmm5 + addpd %xmm1, %xmm4 + addpd %xmm3, %xmm5 + + movlpd %xmm4, 0 * SIZE(YY) + movhpd %xmm4, 1 * SIZE(YY) + addl INCY, YY + movlpd %xmm5, 0 * SIZE(YY) + movhpd %xmm5, 1 * SIZE(YY) + addl INCY, YY + ALIGN_3 + +.L55: + movl M, %eax + andl $2, %eax + jle .L57 + + MOVDDUP( 0 * SIZE, X, %xmm0) + MOVDDUP( 1 * SIZE, X, %xmm1) + addl INCX, X + MOVDDUP( 0 * SIZE, X, %xmm2) + MOVDDUP( 1 * SIZE, X, %xmm3) + addl INCX, X + + movsd 0 * SIZE(Y), %xmm4 + movhpd 1 * SIZE(Y), %xmm4 + addl INCY, Y + movsd 0 * SIZE(Y), %xmm5 + movhpd 1 * SIZE(Y), %xmm5 + addl INCY, Y + + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm1 + mulpd ALPHA_I, %xmm3 + + addpd %xmm0, %xmm4 + addpd %xmm2, %xmm5 + addpd %xmm1, %xmm4 + addpd %xmm3, %xmm5 + + movlpd %xmm4, 0 * SIZE(YY) + movhpd %xmm4, 1 * SIZE(YY) + addl INCY, YY + movlpd %xmm5, 0 * SIZE(YY) + movhpd %xmm5, 1 * SIZE(YY) + addl INCY, YY + ALIGN_3 + +.L57: + movl M, %eax + andl $1, %eax + jle .L999 + + MOVDDUP( 0 * SIZE, X, %xmm0) + MOVDDUP( 1 * SIZE, X, %xmm1) + + movsd 0 * SIZE(Y), %xmm4 + movhpd 1 * SIZE(Y), %xmm4 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm1 + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm4 + + movlpd %xmm4, 0 * SIZE(YY) + movhpd %xmm4, 1 * SIZE(YY) + ALIGN_3 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/zcopy.S b/kernel/x86/zcopy.S new file mode 100644 index 0000000000..153853ea0d --- /dev/null +++ b/kernel/x86/zcopy.S @@ -0,0 +1,250 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define M 4 + STACK + ARGS(%esp) +#define X 8 + STACK + ARGS(%esp) +#define INCX 12 + STACK + ARGS(%esp) +#define Y 16 + STACK + ARGS(%esp) +#define INCY 20 + STACK + ARGS(%esp) + + PROLOGUE + + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + movl M, %ebx + movl X, %ecx + movl INCX, %esi + movl Y, %edx + movl INCY, %edi + + testl %ebx, %ebx # if m == 0 goto End + jle .L999 + + sall $ZBASE_SHIFT, %esi + sall $ZBASE_SHIFT, %edi + + cmpl $2 * SIZE, %esi # if incx != 1 + jne .L100 + cmpl $2 * SIZE, %edi # if incy != 1 + jne .L100 + + movl %ebx, %eax # i = m + sarl $2, %eax + jle .L20 + ALIGN_2 + +.L11: +#if defined(DOUBLE) || defined(XDOUBLE) + FLD 7 * SIZE(%ecx) + FLD 6 * SIZE(%ecx) + FLD 5 * SIZE(%ecx) + FLD 4 * SIZE(%ecx) + FLD 3 * SIZE(%ecx) + FLD 2 * SIZE(%ecx) + FLD 1 * SIZE(%ecx) + FLD 0 * SIZE(%ecx) + + FST 0 * SIZE(%edx) + FST 1 * SIZE(%edx) + FST 2 * SIZE(%edx) + FST 3 * SIZE(%edx) + FST 4 * SIZE(%edx) + FST 5 * SIZE(%edx) + FST 6 * SIZE(%edx) + FST 7 * SIZE(%edx) +#else + fldl 6 * SIZE(%ecx) + fldl 4 * SIZE(%ecx) + fldl 2 * SIZE(%ecx) + fldl 0 * SIZE(%ecx) + + fstpl 0 * SIZE(%edx) + fstpl 2 * SIZE(%edx) + fstpl 4 * SIZE(%edx) + fstpl 6 * SIZE(%edx) +#endif + + addl $8 * SIZE, %ecx + addl $8 * SIZE, %edx + decl %eax + jg .L11 + ALIGN_2 + +.L20: + movl %ebx, %eax # i = m + andl $3, %eax + jle .L99 + ALIGN_2 + +.L21: +#if defined(DOUBLE) || defined(XDOUBLE) + FLD 1 * SIZE(%ecx) + FLD 0 * SIZE(%ecx) + FST 0 * SIZE(%edx) + FST 1 * SIZE(%edx) +#else + fldl 0 * SIZE(%ecx) + fstpl 0 * SIZE(%edx) +#endif + + addl $2 * SIZE, %ecx + addl $2 * SIZE, %edx + decl %eax + jg .L21 + +.L99: + xorl %eax,%eax + popl %ebx + popl %esi + popl %edi + ret + ALIGN_3 + +.L100: + movl %ebx, %eax + sarl $2, %eax + jle .L120 + ALIGN_2 + +.L111: +#if defined(DOUBLE) || defined(XDOUBLE) + FLD 0 * SIZE(%ecx) + FLD 1 * SIZE(%ecx) + addl %esi, %ecx + + FLD 0 * SIZE(%ecx) + FLD 1 * SIZE(%ecx) + addl %esi, %ecx + + FLD 0 * SIZE(%ecx) + FLD 1 * SIZE(%ecx) + addl %esi, %ecx + + FLD 0 * SIZE(%ecx) + FLD 1 * SIZE(%ecx) + addl %esi, %ecx + + fxch %st(7) + FST 0 * SIZE(%edx) + fxch %st(5) + FST 1 * SIZE(%edx) + addl %edi, %edx + + fxch %st(3) + FST 0 * SIZE(%edx) + fxch %st(1) + FST 1 * SIZE(%edx) + addl %edi, %edx + + FST 0 * SIZE(%edx) + FST 1 * SIZE(%edx) + addl %edi, %edx + + FST 0 * SIZE(%edx) + FST 1 * SIZE(%edx) + addl %edi, %edx +#else + fldl 0 * SIZE(%ecx) + addl %esi, %ecx + fldl 0 * SIZE(%ecx) + addl %esi, %ecx + fldl 0 * SIZE(%ecx) + addl 
%esi, %ecx + fldl 0 * SIZE(%ecx) + addl %esi, %ecx + + fxch %st(3) + fstpl 0 * SIZE(%edx) + addl %edi, %edx + + fxch %st(1) + fstpl 0 * SIZE(%edx) + addl %edi, %edx + + fstpl 0 * SIZE(%edx) + addl %edi, %edx + + fstpl 0 * SIZE(%edx) + addl %edi, %edx +#endif + + decl %eax + jg .L111 + +.L120: + movl %ebx, %eax + andl $3, %eax + jle .L999 + ALIGN_2 + +.L121: + FLD 0 * SIZE(%ecx) + FLD 1 * SIZE(%ecx) + addl %esi, %ecx + + fxch %st(1) + + FST 0 * SIZE(%edx) + FST 1 * SIZE(%edx) + addl %edi, %edx + + decl %eax + jg .L121 + +.L999: + xorl %eax,%eax + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86/zcopy_sse.S b/kernel/x86/zcopy_sse.S new file mode 100644 index 0000000000..83930057aa --- /dev/null +++ b/kernel/x86/zcopy_sse.S @@ -0,0 +1,994 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) +#define STACK_Y 16 + STACK + ARGS(%esp) +#define STACK_INCY 20 + STACK + ARGS(%esp) + +#define M %ebx +#define X %esi +#define INCX %ecx +#define Y %edi +#define INCY %edx + +#include "l1param.h" + +#ifdef OPTERON +#define LOAD(OFFSET, ADDR, REG) xorps REG, REG; addpd OFFSET(ADDR), REG +#else +#define LOAD(OFFSET, ADDR, REG) movaps OFFSET(ADDR), REG +#endif + + PROLOGUE + PROFCODE + + pushl %edi + pushl %esi + pushl %ebx + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + + sall $ZBASE_SHIFT, INCX + sall $ZBASE_SHIFT, INCY + + cmpl $2 * SIZE, INCX + jne .L100 + cmpl $2 * SIZE, INCY + jne .L100 + + cmpl $3, M + jle .L106 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + addl M, M + + testl $SIZE, Y + je .L05 + + movss -32 * SIZE(X), %xmm0 + movss %xmm0, -32 * SIZE(Y) + addl $1 * SIZE, X + addl $1 * SIZE, Y + decl M + ALIGN_4 + +.L05: + testl $2 * SIZE, Y + je .L10 + + movsd -32 * SIZE(X), %xmm0 + movlps %xmm0, -32 * SIZE(Y) + addl $2 * SIZE, X + addl $2 * SIZE, Y + subl $2, M + jle .L19 + ALIGN_4 + +.L10: + testl $3 * SIZE, X + jne .L20 + + movl M, %eax + sarl $5, %eax + jle .L13 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + movaps -20 * SIZE(X), %xmm3 + movaps -16 * SIZE(X), %xmm4 + movaps -12 * SIZE(X), %xmm5 + movaps -8 * SIZE(X), %xmm6 + movaps -4 * SIZE(X), %xmm7 + + decl %eax + jle .L12 + ALIGN_3 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps %xmm0, -32 * SIZE(Y) + LOAD( 0 * SIZE, X, %xmm0) + movaps %xmm1, -28 * SIZE(Y) + LOAD( 4 * SIZE, X, %xmm1) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm2, -24 * SIZE(Y) + LOAD( 8 * SIZE, X, %xmm2) + movaps %xmm3, -20 * SIZE(Y) + LOAD(12 * SIZE, X, %xmm3) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps %xmm4,-16 * SIZE(Y) + LOAD(16 * SIZE, X, %xmm4) + movaps %xmm5,-12 * SIZE(Y) + LOAD(20 * SIZE, X, %xmm5) + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps %xmm6, -8 * SIZE(Y) + LOAD(24 * SIZE, X, %xmm6) + movaps %xmm7, -4 * SIZE(Y) + LOAD(28 * SIZE, X, %xmm7) + + subl $-32 * SIZE, Y + subl $-32 * SIZE, X + decl %eax + jg .L11 + ALIGN_3 + +.L12: + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, -24 * SIZE(Y) + movaps %xmm3, -20 * SIZE(Y) + movaps %xmm4, -16 * SIZE(Y) + movaps %xmm5, -12 * SIZE(Y) + movaps %xmm6, -8 * SIZE(Y) + movaps %xmm7, -4 * SIZE(Y) + + subl $-32 * SIZE, Y + subl $-32 * SIZE, X + ALIGN_3 + +.L13: + testl $16, M + jle .L14 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + movaps -20 * SIZE(X), %xmm3 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, -24 * SIZE(Y) + movaps %xmm3, -20 * SIZE(Y) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L14: + testl $8, M + jle .L15 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L15: + testl $4, M + jle .L16 + + movaps -32 * SIZE(X), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + 
addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L16: + testl $2, M + jle .L17 + + movsd -32 * SIZE(X), %xmm0 + movlps %xmm0, -32 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L17: + testl $1, M + jle .L19 + + movss -32 * SIZE(X), %xmm0 + movss %xmm0, -32 * SIZE(Y) + ALIGN_3 + +.L19: + popl %ebx + popl %esi + popl %edi + ret + ALIGN_3 + +.L20: + testl $SIZE, X + jne .L30 + + movhps -32 * SIZE(X), %xmm0 + + movl M, %eax + sarl $5, %eax + jle .L23 + + movaps -30 * SIZE(X), %xmm1 + movaps -26 * SIZE(X), %xmm2 + movaps -22 * SIZE(X), %xmm3 + movaps -18 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + movaps -10 * SIZE(X), %xmm6 + movaps -6 * SIZE(X), %xmm7 + + decl %eax + jle .L22 + ALIGN_4 + +.L21: + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + shufps $0x4e, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -2 * SIZE(X), %xmm0 + + shufps $0x4e, %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps 2 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + shufps $0x4e, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps 6 * SIZE(X), %xmm2 + + shufps $0x4e, %xmm4, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps 10 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + shufps $0x4e, %xmm5, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + movaps 14 * SIZE(X), %xmm4 + + shufps $0x4e, %xmm6, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + movaps 18 * SIZE(X), %xmm5 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + shufps $0x4e, %xmm7, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + movaps 22 * SIZE(X), %xmm6 + + shufps $0x4e, %xmm0, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + movaps 26 * SIZE(X), %xmm7 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + decl %eax + jg .L21 + ALIGN_3 + +.L22: + shufps $0x4e, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -2 * SIZE(X), %xmm0 + + shufps $0x4e, %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + shufps $0x4e, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + shufps $0x4e, %xmm4, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + shufps $0x4e, %xmm5, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + shufps $0x4e, %xmm6, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + shufps $0x4e, %xmm7, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + shufps $0x4e, %xmm0, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L23: + testl $16, M + jle .L24 + ALIGN_3 + + movaps -30 * SIZE(X), %xmm1 + movaps -26 * SIZE(X), %xmm2 + movaps -22 * SIZE(X), %xmm3 + movaps -18 * SIZE(X), %xmm4 + + shufps $0x4e, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + shufps $0x4e, %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + shufps $0x4e, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + shufps $0x4e, %xmm4, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movaps %xmm4, %xmm0 + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L24: + testl $8, M + jle .L25 + ALIGN_3 + + movaps -30 * SIZE(X), %xmm1 + movaps -26 * SIZE(X), %xmm2 + + shufps $0x4e, %xmm1, %xmm0 + shufps $0x4e, %xmm2, %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, %xmm0 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L25: + testl $4, M + jle .L26 + ALIGN_3 + + movaps -30 * SIZE(X), %xmm1 + shufps $0x4e, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L26: + testl $2, M + jle .L27 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + + movsd %xmm0, -32 * SIZE(Y) + + addl $2 * SIZE, X + 
addl $2 * SIZE, Y + ALIGN_3 + +.L27: + testl $1, M + jle .L29 + ALIGN_3 + + movss -32 * SIZE(X), %xmm0 + movss %xmm0, -32 * SIZE(Y) + addl $SIZE, Y + ALIGN_3 + +.L29: + popl %ebx + popl %esi + popl %edi + ret + ALIGN_3 + +.L30: + testl $2 * SIZE, X + jne .L40 + + movaps -33 * SIZE(X), %xmm0 + + movl M, %eax + sarl $5, %eax + jle .L33 + + movaps -29 * SIZE(X), %xmm1 + movaps -25 * SIZE(X), %xmm2 + movaps -21 * SIZE(X), %xmm3 + movaps -17 * SIZE(X), %xmm4 + movaps -13 * SIZE(X), %xmm5 + movaps -9 * SIZE(X), %xmm6 + movaps -5 * SIZE(X), %xmm7 + + decl %eax + jle .L32 + ALIGN_4 + +.L31: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm1, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -1 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps 3 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm3, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps 7 * SIZE(X), %xmm2 + + movss %xmm4, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps 11 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm5, %xmm4 + shufps $0x39, %xmm4, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + movaps 15 * SIZE(X), %xmm4 + + movss %xmm6, %xmm5 + shufps $0x39, %xmm5, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + movaps 19 * SIZE(X), %xmm5 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm7, %xmm6 + shufps $0x39, %xmm6, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + movaps 23 * SIZE(X), %xmm6 + + movss %xmm0, %xmm7 + shufps $0x39, %xmm7, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + movaps 27 * SIZE(X), %xmm7 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + decl %eax + jg .L31 + ALIGN_3 + +.L32: + movss %xmm1, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -1 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movss %xmm3, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm4, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movss %xmm5, %xmm4 + shufps $0x39, %xmm4, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + movss %xmm6, %xmm5 + shufps $0x39, %xmm5, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + movss %xmm7, %xmm6 + shufps $0x39, %xmm6, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + movss %xmm0, %xmm7 + shufps $0x39, %xmm7, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L33: + testl $16, M + jle .L34 + ALIGN_3 + + movaps -29 * SIZE(X), %xmm1 + movaps -25 * SIZE(X), %xmm2 + movaps -21 * SIZE(X), %xmm3 + movaps -17 * SIZE(X), %xmm4 + + movss %xmm1, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movss %xmm3, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm4, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movaps %xmm4, %xmm0 + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L34: + testl $8, M + jle .L35 + ALIGN_3 + + movaps -29 * SIZE(X), %xmm1 + movaps -25 * SIZE(X), %xmm2 + + movss %xmm1, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, %xmm0 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + 
ALIGN_3 + +.L35: + testl $4, M + jle .L36 + ALIGN_3 + + movaps -29 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + shufps $0x39, %xmm0, %xmm0 + + movaps %xmm0, -32 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L36: + testl $2, M + jle .L37 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + movsd %xmm0, -32 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L37: + testl $1, M + jle .L39 + ALIGN_3 + + movss -32 * SIZE(X), %xmm0 + movss %xmm0, -32 * SIZE(Y) + addl $SIZE, Y + ALIGN_3 + +.L39: + popl %ebx + popl %esi + popl %edi + ret + ALIGN_3 + +.L40: + movaps -35 * SIZE(X), %xmm0 + + movl M, %eax + sarl $5, %eax + jle .L43 + + movaps -31 * SIZE(X), %xmm1 + movaps -27 * SIZE(X), %xmm2 + movaps -23 * SIZE(X), %xmm3 + movaps -19 * SIZE(X), %xmm4 + movaps -15 * SIZE(X), %xmm5 + movaps -11 * SIZE(X), %xmm6 + movaps -7 * SIZE(X), %xmm7 + + decl %eax + jle .L42 + ALIGN_4 + +.L41: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -3 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps 1 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps 5 * SIZE(X), %xmm2 + + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps 9 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + movaps 13 * SIZE(X), %xmm4 + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + movaps 17 * SIZE(X), %xmm5 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + movaps 21 * SIZE(X), %xmm6 + + movss %xmm0, %xmm7 + shufps $0x93, %xmm0, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + movaps 25 * SIZE(X), %xmm7 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + decl %eax + jg .L41 + ALIGN_3 + +.L42: + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -3 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + movss %xmm0, %xmm7 + shufps $0x93, %xmm0, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L43: + testl $16, M + jle .L44 + ALIGN_3 + + movaps -31 * SIZE(X), %xmm1 + movaps -27 * SIZE(X), %xmm2 + movaps -23 * SIZE(X), %xmm3 + movaps -19 * SIZE(X), %xmm4 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movaps %xmm4, %xmm0 + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L44: + testl $8, 
M + jle .L45 + ALIGN_3 + + movaps -31 * SIZE(X), %xmm1 + movaps -27 * SIZE(X), %xmm2 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps %xmm2, %xmm0 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L45: + testl $4, M + jle .L46 + ALIGN_3 + + movaps -31 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + + movaps %xmm0, -32 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L46: + testl $2, M + jle .L47 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + movsd %xmm0, -32 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L47: + testl $1, M + jle .L49 + ALIGN_3 + + movss -32 * SIZE(X), %xmm0 + movss %xmm0, -32 * SIZE(Y) + addl $SIZE, Y + ALIGN_3 + +.L49: + popl %ebx + popl %esi + popl %edi + ret + ALIGN_4 + +.L100: + movl M, %eax + sarl $3, %eax + jle .L105 + ALIGN_3 + +.L102: + movsd (X), %xmm0 + addl INCX, X + movhps (X), %xmm0 + addl INCX, X + movsd (X), %xmm1 + addl INCX, X + movhps (X), %xmm1 + addl INCX, X + movsd (X), %xmm2 + addl INCX, X + movhps (X), %xmm2 + addl INCX, X + movsd (X), %xmm3 + addl INCX, X + movhps (X), %xmm3 + addl INCX, X + + movsd %xmm0, (Y) + addl INCY, Y + movhps %xmm0, (Y) + addl INCY, Y + movsd %xmm1, (Y) + addl INCY, Y + movhps %xmm1, (Y) + addl INCY, Y + movsd %xmm2, (Y) + addl INCY, Y + movhps %xmm2, (Y) + addl INCY, Y + movsd %xmm3, (Y) + addl INCY, Y + movhps %xmm3, (Y) + addl INCY, Y + + decl %eax + jg .L102 + ALIGN_3 + +.L105: + testl $4, M + jle .L106 + + movsd (X), %xmm0 + addl INCX, X + movhps (X), %xmm0 + addl INCX, X + movsd (X), %xmm1 + addl INCX, X + movhps (X), %xmm1 + addl INCX, X + + movsd %xmm0, (Y) + addl INCY, Y + movhps %xmm0, (Y) + addl INCY, Y + movsd %xmm1, (Y) + addl INCY, Y + movhps %xmm1, (Y) + addl INCY, Y + ALIGN_3 + +.L106: + testl $2, M + jle .L107 + + movsd (X), %xmm0 + addl INCX, X + movhps (X), %xmm0 + addl INCX, X + + movsd %xmm0, (Y) + addl INCY, Y + movhps %xmm0, (Y) + addl INCY, Y + ALIGN_3 + +.L107: + testl $1, M + jle .L999 + + movsd (X), %xmm0 + movsd %xmm0, (Y) + ALIGN_3 + +.L999: + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86/zcopy_sse2.S b/kernel/x86/zcopy_sse2.S new file mode 100644 index 0000000000..f936a34a91 --- /dev/null +++ b/kernel/x86/zcopy_sse2.S @@ -0,0 +1,668 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) +#define STACK_Y 16 + STACK + ARGS(%esp) +#define STACK_INCY 20 + STACK + ARGS(%esp) + +#define M %ebx +#define X %esi +#define INCX %ecx +#define Y %edi +#define INCY %edx + +#define xmm8 xmm0 +#define xmm9 xmm1 +#define xmm10 xmm2 +#define xmm11 xmm3 +#define xmm12 xmm4 +#define xmm13 xmm5 +#define xmm14 xmm6 +#define xmm15 xmm7 + +#include "l1param.h" + +#ifdef OPTERON +#define LOAD(OFFSET, ADDR, REG) xorps REG, REG; addpd OFFSET(ADDR), REG +#else +#define LOAD(OFFSET, ADDR, REG) movaps OFFSET(ADDR), REG +#endif + + PROLOGUE + PROFCODE + + pushl %edi + pushl %esi + pushl %ebx + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + + sall $ZBASE_SHIFT, INCX + sall $ZBASE_SHIFT, INCY + + cmpl $2 * SIZE, INCX + jne .L50 + cmpl $2 * SIZE, INCY + jne .L50 + + addl M, M + +#ifdef ALIGNED_ACCESS + testl $SIZE, Y +#else + testl $SIZE, X +#endif + je .L10 + + movsd (X), %xmm0 + movsd %xmm0, (Y) + addl $1 * SIZE, X + addl $1 * SIZE, Y + decl M + jle .L19 + ALIGN_4 + +.L10: + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + +#ifdef ALIGNED_ACCESS + testl $SIZE, X +#else + testl $SIZE, Y +#endif + jne .L20 + + movl M, %eax + sarl $4, %eax + jle .L13 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + movaps -12 * SIZE(X), %xmm2 + movaps -10 * SIZE(X), %xmm3 + movaps -8 * SIZE(X), %xmm4 + movaps -6 * SIZE(X), %xmm5 + movaps -4 * SIZE(X), %xmm6 + movaps -2 * SIZE(X), %xmm7 + + decl %eax + jle .L12 + ALIGN_3 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps %xmm0, -16 * SIZE(Y) + LOAD( 0 * SIZE, X, %xmm0) + movaps %xmm1, -14 * SIZE(Y) + LOAD( 2 * SIZE, X, %xmm1) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm2, -12 * SIZE(Y) + LOAD( 4 * SIZE, X, %xmm2) + movaps %xmm3, -10 * SIZE(Y) + LOAD( 6 * SIZE, X, %xmm3) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps %xmm4, -8 * SIZE(Y) + LOAD( 8 * SIZE, X, %xmm4) + movaps %xmm5, -6 * SIZE(Y) + LOAD(10 * SIZE, X, %xmm5) + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps %xmm6, -4 * SIZE(Y) + LOAD(12 * SIZE, X, %xmm6) + movaps %xmm7, -2 * SIZE(Y) + LOAD(14 * SIZE, X, %xmm7) + + subl $-16 * SIZE, Y + subl $-16 * SIZE, X + decl %eax + jg .L11 + ALIGN_3 + +.L12: + movaps %xmm0, -16 * SIZE(Y) + 
movaps %xmm1, -14 * SIZE(Y) + movaps %xmm2, -12 * SIZE(Y) + movaps %xmm3, -10 * SIZE(Y) + movaps %xmm4, -8 * SIZE(Y) + movaps %xmm5, -6 * SIZE(Y) + movaps %xmm6, -4 * SIZE(Y) + movaps %xmm7, -2 * SIZE(Y) + + subl $-16 * SIZE, Y + subl $-16 * SIZE, X + ALIGN_3 + +.L13: + testl $8, M + jle .L14 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + movaps -12 * SIZE(X), %xmm2 + movaps -10 * SIZE(X), %xmm3 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -14 * SIZE(Y) + movaps %xmm2, -12 * SIZE(Y) + movaps %xmm3, -10 * SIZE(Y) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L14: + testl $4, M + jle .L15 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -14 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L15: + testl $2, M + jle .L16 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L16: + testl $1, M + jle .L19 + ALIGN_3 + + movsd -16 * SIZE(X), %xmm0 + movsd %xmm0, -16 * SIZE(Y) + ALIGN_3 + +.L19: + popl %ebx + popl %esi + popl %edi + ret + ALIGN_3 + +.L20: +#ifdef ALIGNED_ACCESS + + movhps -16 * SIZE(X), %xmm0 + + movl M, %eax + sarl $4, %eax + jle .L23 + + movaps -15 * SIZE(X), %xmm1 + movaps -13 * SIZE(X), %xmm2 + movaps -11 * SIZE(X), %xmm3 + movaps -9 * SIZE(X), %xmm4 + movaps -7 * SIZE(X), %xmm5 + movaps -5 * SIZE(X), %xmm6 + movaps -3 * SIZE(X), %xmm7 + + decl %eax + jle .L22 + ALIGN_4 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + SHUFPD_1 %xmm1, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + LOAD(-1 * SIZE, X, %xmm0) + + SHUFPD_1 %xmm2, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + LOAD( 1 * SIZE, X, %xmm1) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + SHUFPD_1 %xmm3, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + LOAD( 3 * SIZE, X, %xmm2) + + SHUFPD_1 %xmm4, %xmm3 + movaps %xmm3, -10 * SIZE(Y) + LOAD( 5 * SIZE, X, %xmm3) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + SHUFPD_1 %xmm5, %xmm4 + movaps %xmm4, -8 * SIZE(Y) + LOAD( 7 * SIZE, X, %xmm4) + + SHUFPD_1 %xmm6, %xmm5 + movaps %xmm5, -6 * SIZE(Y) + LOAD( 9 * SIZE, X, %xmm5) + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + SHUFPD_1 %xmm7, %xmm6 + movaps %xmm6, -4 * SIZE(Y) + LOAD(11 * SIZE, X, %xmm6) + + SHUFPD_1 %xmm0, %xmm7 + movaps %xmm7, -2 * SIZE(Y) + LOAD(13 * SIZE, X, %xmm7) + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + decl %eax + jg .L21 + ALIGN_3 + +.L22: + SHUFPD_1 %xmm1, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + LOAD(-1 * SIZE, X, %xmm0) + + SHUFPD_1 %xmm2, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + SHUFPD_1 %xmm3, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + SHUFPD_1 %xmm4, %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + SHUFPD_1 %xmm5, %xmm4 + movaps %xmm4, -8 * SIZE(Y) + SHUFPD_1 %xmm6, %xmm5 + movaps %xmm5, -6 * SIZE(Y) + + SHUFPD_1 %xmm7, %xmm6 + movaps %xmm6, -4 * SIZE(Y) + SHUFPD_1 %xmm0, %xmm7 + movaps %xmm7, -2 * SIZE(Y) + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + ALIGN_3 + +.L23: + testl $8, M + jle .L24 + ALIGN_3 + + movaps -15 * SIZE(X), %xmm1 + movaps -13 * SIZE(X), %xmm2 + movaps -11 * SIZE(X), %xmm3 + movaps -9 * SIZE(X), %xmm4 + + SHUFPD_1 %xmm1, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + SHUFPD_1 %xmm2, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + SHUFPD_1 %xmm3, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + + SHUFPD_1 %xmm4, %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + movaps %xmm4, %xmm0 + + 
addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L24: + testl $4, M + jle .L25 + ALIGN_3 + + movaps -15 * SIZE(X), %xmm1 + movaps -13 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm1, %xmm0 + SHUFPD_1 %xmm2, %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -14 * SIZE(Y) + movaps %xmm2, %xmm0 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L25: + testl $2, M + jle .L26 + ALIGN_3 + + movaps -15 * SIZE(X), %xmm1 + SHUFPD_1 %xmm1, %xmm0 + + movaps %xmm0, -16 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L26: + testl $1, M + jle .L29 + ALIGN_3 + + movsd -16 * SIZE(X), %xmm0 + movsd %xmm0, -16 * SIZE(Y) + ALIGN_3 + +.L29: + popl %ebx + popl %esi + popl %edi + ret + ALIGN_3 + +#else + + movl M, %eax + sarl $4, %eax + jle .L23 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + movaps -12 * SIZE(X), %xmm2 + movaps -10 * SIZE(X), %xmm3 + movaps -8 * SIZE(X), %xmm4 + movaps -6 * SIZE(X), %xmm5 + movaps -4 * SIZE(X), %xmm6 + movaps -2 * SIZE(X), %xmm7 + + decl %eax + jle .L22 + ALIGN_3 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movlps %xmm0, -16 * SIZE(Y) + movhps %xmm0, -15 * SIZE(Y) + LOAD( 0 * SIZE, X, %xmm0) + movlps %xmm1, -14 * SIZE(Y) + movhps %xmm1, -13 * SIZE(Y) + LOAD( 2 * SIZE, X, %xmm1) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movlps %xmm2, -12 * SIZE(Y) + movhps %xmm2, -11 * SIZE(Y) + LOAD( 4 * SIZE, X, %xmm2) + movlps %xmm3, -10 * SIZE(Y) + movhps %xmm3, -9 * SIZE(Y) + LOAD( 6 * SIZE, X, %xmm3) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movlps %xmm4, -8 * SIZE(Y) + movhps %xmm4, -7 * SIZE(Y) + LOAD( 8 * SIZE, X, %xmm4) + movlps %xmm5, -6 * SIZE(Y) + movhps %xmm5, -5 * SIZE(Y) + LOAD(10 * SIZE, X, %xmm5) + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movlps %xmm6, -4 * SIZE(Y) + movhps %xmm6, -3 * SIZE(Y) + LOAD(12 * SIZE, X, %xmm6) + movlps %xmm7, -2 * SIZE(Y) + movhps %xmm7, -1 * SIZE(Y) + LOAD(14 * SIZE, X, %xmm7) + + subl $-16 * SIZE, Y + subl $-16 * SIZE, X + decl %eax + jg .L21 + ALIGN_3 + +.L22: + movlps %xmm0, -16 * SIZE(Y) + movhps %xmm0, -15 * SIZE(Y) + movlps %xmm1, -14 * SIZE(Y) + movhps %xmm1, -13 * SIZE(Y) + movlps %xmm2, -12 * SIZE(Y) + movhps %xmm2, -11 * SIZE(Y) + movlps %xmm3, -10 * SIZE(Y) + movhps %xmm3, -9 * SIZE(Y) + movlps %xmm4, -8 * SIZE(Y) + movhps %xmm4, -7 * SIZE(Y) + movlps %xmm5, -6 * SIZE(Y) + movhps %xmm5, -5 * SIZE(Y) + movlps %xmm6, -4 * SIZE(Y) + movhps %xmm6, -3 * SIZE(Y) + movlps %xmm7, -2 * SIZE(Y) + movhps %xmm7, -1 * SIZE(Y) + + subl $-16 * SIZE, Y + subl $-16 * SIZE, X + ALIGN_3 + +.L23: + testl $8, M + jle .L24 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movlps %xmm0, -16 * SIZE(Y) + movhps %xmm0, -15 * SIZE(Y) + movaps -14 * SIZE(X), %xmm1 + movlps %xmm1, -14 * SIZE(Y) + movhps %xmm1, -13 * SIZE(Y) + movaps -12 * SIZE(X), %xmm2 + movlps %xmm2, -12 * SIZE(Y) + movhps %xmm2, -11 * SIZE(Y) + movaps -10 * SIZE(X), %xmm3 + movlps %xmm3, -10 * SIZE(Y) + movhps %xmm3, -9 * SIZE(Y) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L24: + testl $4, M + jle .L25 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movlps %xmm0, -16 * SIZE(Y) + movhps %xmm0, -15 * SIZE(Y) + movaps -14 * SIZE(X), %xmm1 + movlps %xmm1, -14 * SIZE(Y) + movhps %xmm1, -13 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L25: + testl $2, M + jle .L26 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movlps %xmm0, -16 * SIZE(Y) + movhps 
%xmm0, -15 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L26: + testl $1, M + jle .L29 + ALIGN_3 + + movsd -16 * SIZE(X), %xmm0 + movsd %xmm0, -16 * SIZE(Y) + ALIGN_3 + +.L29: + popl %ebx + popl %esi + popl %edi + ret + ALIGN_3 +#endif + +.L50: + movl M, %eax + sarl $2, %eax + jle .L55 + ALIGN_3 + +.L51: + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + addl INCX, X + + movsd 0 * SIZE(X), %xmm1 + movhps 1 * SIZE(X), %xmm1 + addl INCX, X + + movsd 0 * SIZE(X), %xmm2 + movhps 1 * SIZE(X), %xmm2 + addl INCX, X + + movsd 0 * SIZE(X), %xmm3 + movhps 1 * SIZE(X), %xmm3 + addl INCX, X + + + movlps %xmm0, 0 * SIZE(Y) + movhps %xmm0, 1 * SIZE(Y) + addl INCY, Y + + movlps %xmm1, 0 * SIZE(Y) + movhps %xmm1, 1 * SIZE(Y) + addl INCY, Y + + movlps %xmm2, 0 * SIZE(Y) + movhps %xmm2, 1 * SIZE(Y) + addl INCY, Y + + movlps %xmm3, 0 * SIZE(Y) + movhps %xmm3, 1 * SIZE(Y) + addl INCY, Y + + decl %eax + jg .L51 + ALIGN_3 + +.L55: + movl M, %eax + andl $3, %eax + jle .L57 + ALIGN_3 + +.L56: + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + addl INCX, X + + movlps %xmm0, 0 * SIZE(Y) + movhps %xmm0, 1 * SIZE(Y) + addl INCY, Y + + decl %eax + jg .L56 + ALIGN_3 + +.L57: + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86/zdot.S b/kernel/x86/zdot.S new file mode 100644 index 0000000000..aa4481f976 --- /dev/null +++ b/kernel/x86/zdot.S @@ -0,0 +1,310 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#if defined(DOUBLE) || defined(XDOUBLE) +#define RESULT 4 + STACK + ARGS(%esp) +#define STACK_N 8 + STACK + ARGS(%esp) +#define STACK_X 12 + STACK + ARGS(%esp) +#define STACK_INCX 16 + STACK + ARGS(%esp) +#define STACK_Y 20 + STACK + ARGS(%esp) +#define STACK_INCY 24 + STACK + ARGS(%esp) +#else +#define STACK_N 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) +#define STACK_Y 16 + STACK + ARGS(%esp) +#define STACK_INCY 20 + STACK + ARGS(%esp) +#endif + +#define N %ebx +#define X %esi +#define INCX %ecx +#define Y %edi +#define INCY %edx + +#include "l1param.h" + + PROLOGUE + PROFCODE + + pushl %edi + pushl %esi + pushl %ebx + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + movl STACK_N, N + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + +#ifdef F_INTERFACE + movl (N),N + movl (INCX),INCX + movl (INCY),INCY +#endif + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + testl N, N + jle .L88 + + addl INCX, INCX + fldz + addl INCY, INCY + fldz + + leal (, INCX, SIZE), INCX + fldz + leal (, INCY, SIZE), INCY + fldz + + cmpl $2 * SIZE, INCX + jne .L14 + cmpl $2 * SIZE, INCY + jne .L14 + + movl N, %eax + sarl $1, %eax + jle .L15 + ALIGN_3 + +.L16: + FLD 0 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 1 * SIZE(Y) + faddp %st, %st(2) + FLD 1 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1), %st + faddp %st, %st(4) + + FMUL 1 * SIZE(Y) + faddp %st, %st(4) + FLD 2 * SIZE(X) + + FLD 2 * SIZE(Y) + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 3 * SIZE(Y) + faddp %st, %st(2) + FLD 3 * SIZE(X) + + FLD 2 * SIZE(Y) + fmul %st(1), %st + faddp %st, %st(4) + + FMUL 3 * SIZE(Y) + faddp %st, %st(4) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + decl %eax + jg .L16 + ALIGN_3 + +.L15: + movl N, %eax + andl $1, %eax + jle .L27 + ALIGN_3 + +.L22: + FLD 0 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 1 * SIZE(Y) + faddp %st, %st(2) + FLD 1 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1), %st + faddp %st, %st(4) + + FMUL 1 * SIZE(Y) + faddp %st, %st(4) + jmp .L27 + ALIGN_3 + +.L14: + movl N, %eax + sarl $1, %eax + jle .L30 + ALIGN_3 + +.L31: + FLD 0 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 1 * SIZE(Y) + faddp %st, %st(2) + FLD 1 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1), %st + faddp %st, %st(4) + + FMUL 1 * SIZE(Y) + faddp %st, %st(4) + addl INCX, X + + FLD 0 * SIZE(X) + addl INCY, Y + + FLD 0 * SIZE(Y) + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 1 * SIZE(Y) + faddp %st, %st(2) + FLD 1 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1), %st + faddp %st, %st(4) + + FMUL 1 * SIZE(Y) + faddp %st, %st(4) + addl INCX, X + addl INCY, Y + + decl %eax + jg .L31 + ALIGN_3 + +.L30: + movl N, %eax + andl $1, %eax + jle .L27 + ALIGN_3 + +.L37: + FLD 0 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 1 * SIZE(Y) + faddp %st, %st(2) + FLD 1 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1), %st + faddp %st, %st(4) + + FMUL 1 * SIZE(Y) + faddp %st, %st(4) + ALIGN_3 + +.L27: +#if defined(DOUBLE) || defined(XDOUBLE) + movl RESULT, %eax +#endif + +#ifndef CONJ + fsubp %st, %st(3) + faddp %st, %st(1) +#else + faddp %st, %st(3) + fsubp %st, %st(1) +#endif + +#if !defined(DOUBLE) && !defined(XDOUBLE) + subl $2 * SIZE, %esp + 
FST 1 * SIZE(%esp) + FST 0 * SIZE(%esp) + movl 0 * SIZE(%esp), %eax + movl 1 * SIZE(%esp), %edx + addl $2 * SIZE, %esp +#else + FST 1 * SIZE(%eax) + FST 0 * SIZE(%eax) +#endif + + popl %ebx + popl %esi + popl %edi + ret + ALIGN_3 + +.L88: +#if defined(DOUBLE) || defined(XDOUBLE) + movl RESULT, %eax +#endif + + fldz + fldz + +#if !defined(DOUBLE) && !defined(XDOUBLE) + xor %eax, %eax + xor %edx, %edx +#else + FST 1 * SIZE(%eax) + FST 0 * SIZE(%eax) +#endif + + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86/zdot_amd.S b/kernel/x86/zdot_amd.S new file mode 100644 index 0000000000..97a1e721dc --- /dev/null +++ b/kernel/x86/zdot_amd.S @@ -0,0 +1,377 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#if !defined(DOUBLE) && !defined(XDOUBLE) +#define RESULT 4 + STACK + ARGS(%esp) +#define STACK_N 8 + STACK + ARGS(%esp) +#define STACK_X 12 + STACK + ARGS(%esp) +#define STACK_INCX 16 + STACK + ARGS(%esp) +#define STACK_Y 20 + STACK + ARGS(%esp) +#define STACK_INCY 24 + STACK + ARGS(%esp) +#else +#define STACK_N 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) +#define STACK_Y 16 + STACK + ARGS(%esp) +#define STACK_INCY 20 + STACK + ARGS(%esp) +#endif + + PROLOGUE + + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + +#define N %ebx +#define X %esi +#define INCX %ecx +#define Y %edi +#define INCY %edx + + movl STACK_N, N + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + +#if defined(F_INTERFACE) + movl (N),N + movl (INCX),INCX + movl (INCY),INCY +#endif + + testl N, N + jle .L88 + + fldz + fldz + fldz + fldz + + addl INCX, INCX + addl INCY, INCY + + leal (, INCX, SIZE), INCX + leal (, INCY, SIZE), INCY + + cmpl $2 * SIZE, INCX + jne .L14 + cmpl $2 * SIZE, INCY + jne .L14 + + movl N, %eax + sarl $2, %eax + jle .L15 + + FLD 0 * SIZE(X) + ALIGN_3 + +.L16: + FLD 0 * SIZE(Y) + PADDING fmul %st(1) + faddp %st, %st(2) + + FMUL 1 * SIZE(Y) + faddp %st, %st(2) + FLD 1 * SIZE(X) + + FLD 0 * SIZE(Y) + PADDING fmul %st(1) + faddp %st, %st(4) + + FMUL 1 * SIZE(Y) + faddp %st, %st(4) + FLD 2 * SIZE(X) + + FLD 2 * SIZE(Y) + PADDING fmul %st(1) + faddp %st, %st(2) + + FMUL 3 * SIZE(Y) + faddp %st, %st(2) + FLD 3 * SIZE(X) + + FLD 2 * SIZE(Y) + PADDING fmul %st(1) + faddp %st, %st(4) + + FMUL 3 * SIZE(Y) + faddp %st, %st(4) + FLD 4 * SIZE(X) + + FLD 4 * SIZE(Y) + PADDING fmul %st(1) + faddp %st, %st(2) + + FMUL 5 * SIZE(Y) + faddp %st, %st(2) + FLD 5 * SIZE(X) + + FLD 4 * SIZE(Y) + PADDING fmul %st(1) + faddp %st, %st(4) + + FMUL 5 * SIZE(Y) + faddp %st, %st(4) + FLD 6 * SIZE(X) + + FLD 6 * SIZE(Y) + PADDING fmul %st(1) + faddp %st, %st(2) + + FMUL 7 * SIZE(Y) + faddp %st, %st(2) + FLD 7 * SIZE(X) + + FLD 6 * SIZE(Y) + PADDING fmul %st(1) + faddp %st, %st(4) + + FMUL 7 * SIZE(Y) + faddp %st, %st(4) + FLD 8 * SIZE(X) + + prefetch 32 * SIZE(X) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + decl %eax + jg .L16 + ffreep %st(0) + ALIGN_3 + +.L15: + movl N, %eax + andl $3, %eax + jle .L27 + ALIGN_3 + +.L22: + FLD 0 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(2) + + FMUL 1 * SIZE(Y) + faddp %st, %st(2) + FLD 1 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(4) + + FMUL 1 * SIZE(Y) + faddp %st, %st(4) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + + decl %eax + jg .L22 + jmp .L27 + ALIGN_3 + +.L14: +#ifdef F_INTERFACE + testl INCX, INCX # if (incx < 0) + jge .L28 + + movl N, %eax + decl %eax + imull INCX, %eax + subl %eax, X + ALIGN_3 + +.L28: + testl INCY, INCY # if (incy < 0) + jge .L29 + + movl N, %eax + decl %eax + imull INCY, %eax + subl %eax, Y + ALIGN_3 + +.L29: +#endif + + movl N, %eax + sarl $1, %eax + jle .L30 + ALIGN_3 + + +.L31: + FLD 0 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(2) + + FMUL 1 * SIZE(Y) + faddp %st, %st(2) + FLD 1 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(4) + + FMUL 1 * SIZE(Y) + faddp %st, %st(4) + addl INCX, X + + FLD 0 * SIZE(X) + addl INCY, Y + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(2) + 
+ FMUL 1 * SIZE(Y) + faddp %st, %st(2) + FLD 1 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(4) + + FMUL 1 * SIZE(Y) + faddp %st, %st(4) + addl INCX, X + addl INCY, Y + + decl %eax + jg .L31 + ALIGN_3 + +.L30: + movl N, %eax + andl $1, %eax + jle .L27 + ALIGN_3 + +.L37: + FLD 0 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(2) + + FMUL 1 * SIZE(Y) + faddp %st, %st(2) + FLD 1 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(4) + + FMUL 1 * SIZE(Y) + faddp %st, %st(4) + ALIGN_3 + +.L27: +#ifndef CONJ + fsubp %st, %st(3) + faddp %st, %st(1) +#else + faddp %st, %st(3) + fsubp %st, %st(1) +#endif + +#if !defined(DOUBLE) && !defined(XDOUBLE) + subl $2 * SIZE, %esp + FST 1 * SIZE(%esp) + FST 0 * SIZE(%esp) + movl 0 * SIZE(%esp), %eax + movl 1 * SIZE(%esp), %edx + addl $2 * SIZE, %esp +#else + movl RESULT, %eax + + FST 1 * SIZE(%eax) + FST 0 * SIZE(%eax) +#endif + + popl %ebx + popl %esi + popl %edi +#if defined(F_INTERFACE) && defined(F_PATHSCALE) + ret $0x4 +#else + ret +#endif + + ALIGN_3 + +.L88: +#if !defined(DOUBLE) && !defined(XDOUBLE) + xor %eax, %eax + xor %edx, %edx +#else + movl RESULT, %eax + + fldz + fldz + + FST 1 * SIZE(%eax) + FST 0 * SIZE(%eax) +#endif + + popl %ebx + popl %esi + popl %edi +#if defined(F_INTERFACE) && defined(F_PATHSCALE) + ret $0x4 +#else + ret +#endif + + EPILOGUE diff --git a/kernel/x86/zdot_sse.S b/kernel/x86/zdot_sse.S new file mode 100644 index 0000000000..cc229643b2 --- /dev/null +++ b/kernel/x86/zdot_sse.S @@ -0,0 +1,3457 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define STACK_N 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) +#define STACK_Y 16 + STACK + ARGS(%esp) +#define STACK_INCY 20 + STACK + ARGS(%esp) + +#define N %ebx +#define X %esi +#define INCX %ecx +#define Y %edi +#define INCY %edx + +#include "l1param.h" + + PROLOGUE + PROFCODE + + pushl %edi + pushl %esi + pushl %ebx + + movl STACK_N, N + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + + sall $ZBASE_SHIFT, INCX + sall $ZBASE_SHIFT, INCY + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + + testl N, N + jle .L999 + + cmpl $2 * SIZE, INCX + jne .L200 + cmpl $2 * SIZE, INCY + jne .L200 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + + testl $SIZE, X + jne .L50 + +.L0x: + testl $2 * SIZE, X + je .L10 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X), %xmm4 +#ifdef movsd + xorps %xmm6, %xmm6 +#endif + movsd -32 * SIZE(Y), %xmm0 + + PSHUFD2($0xb1, %xmm0, %xmm1) + mulps %xmm4, %xmm0 + mulps %xmm4, %xmm1 + addl $2 * SIZE, X + addl $2 * SIZE, Y + decl N + ALIGN_3 + +.L10: + testl $3 * SIZE, Y + jne .L20 + + movl N, %eax + sarl $4, %eax + jle .L15 + + movaps -32 * SIZE(X), %xmm4 + movaps -32 * SIZE(Y), %xmm6 + movaps -28 * SIZE(X), %xmm5 + movaps -28 * SIZE(Y), %xmm7 + + decl %eax + jle .L12 + ALIGN_3 + +.L11: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -24 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movaps -24 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movaps -20 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movaps -20 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -16 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movaps -16 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movaps -12 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movaps -12 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -8 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movaps -8 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movaps -4 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movaps -4 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps 0 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movaps 0 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movaps 4 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movaps 4 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + + decl %eax + jg .L11 + ALIGN_3 + +.L12: + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -24 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movaps -24 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, 
%xmm7 + addps %xmm7, %xmm0 + movaps -20 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movaps -20 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -16 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movaps -16 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movaps -12 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movaps -12 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -8 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movaps -8 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movaps -4 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movaps -4 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + mulps %xmm5, %xmm3 + addps %xmm3, %xmm1 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L15: + testl $8, N + jle .L16 + + movaps -32 * SIZE(X), %xmm4 + movaps -32 * SIZE(Y), %xmm6 + movaps -28 * SIZE(X), %xmm5 + movaps -28 * SIZE(Y), %xmm7 + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -24 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movaps -24 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movaps -20 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movaps -20 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + mulps %xmm5, %xmm3 + addps %xmm3, %xmm1 + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L16: + testl $4, N + jle .L17 + + movaps -32 * SIZE(X), %xmm4 + movaps -32 * SIZE(Y), %xmm6 + movaps -28 * SIZE(X), %xmm5 + movaps -28 * SIZE(Y), %xmm7 + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + mulps %xmm5, %xmm3 + addps %xmm3, %xmm1 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L17: + testl $2, N + jle .L18 + + movaps -32 * SIZE(X), %xmm4 + movaps -32 * SIZE(Y), %xmm6 + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm1 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L18: + testl $1, N + jle .L98 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X), %xmm4 +#ifdef movsd + xorps %xmm6, %xmm6 +#endif + movsd -32 * SIZE(Y), %xmm6 + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm1 + jmp .L98 + ALIGN_3 + +.L20: +#ifdef ALIGNED_ACCESS + + testl $2 * SIZE, Y + jne .L30 + + movaps -33 * SIZE(Y), %xmm6 + addl $3 * SIZE, Y + + shufps $0xb1, %xmm1, %xmm1 + + movl N, %eax + sarl $4, %eax + jle .L25 + + movaps -32 * SIZE(X), %xmm4 + movaps -28 * SIZE(X), %xmm5 + movaps -32 * SIZE(Y), %xmm7 + + decl %eax + jle .L22 + ALIGN_3 + +.L21: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x39, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -24 * SIZE(X), %xmm4 + mulps %xmm6, %xmm3 + movaps -28 * SIZE(Y), %xmm6 + 
addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x39, %xmm7, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -20 * SIZE(X), %xmm5 + mulps %xmm7, %xmm3 + movaps -24 * SIZE(Y), %xmm7 + addps %xmm3, %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x39, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -16 * SIZE(X), %xmm4 + mulps %xmm6, %xmm3 + movaps -20 * SIZE(Y), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x39, %xmm7, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -12 * SIZE(X), %xmm5 + mulps %xmm7, %xmm3 + movaps -16 * SIZE(Y), %xmm7 + addps %xmm3, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x39, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -8 * SIZE(X), %xmm4 + mulps %xmm6, %xmm3 + movaps -12 * SIZE(Y), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x39, %xmm7, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -4 * SIZE(X), %xmm5 + mulps %xmm7, %xmm3 + movaps -8 * SIZE(Y), %xmm7 + addps %xmm3, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x39, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps 0 * SIZE(X), %xmm4 + mulps %xmm6, %xmm3 + movaps -4 * SIZE(Y), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x39, %xmm7, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps 4 * SIZE(X), %xmm5 + mulps %xmm7, %xmm3 + movaps 0 * SIZE(Y), %xmm7 + addps %xmm3, %xmm1 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + + decl %eax + jg .L21 + ALIGN_3 + +.L22: + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x39, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -24 * SIZE(X), %xmm4 + mulps %xmm6, %xmm3 + movaps -28 * SIZE(Y), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x39, %xmm7, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -20 * SIZE(X), %xmm5 + mulps %xmm7, %xmm3 + movaps -24 * SIZE(Y), %xmm7 + addps %xmm3, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x39, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -16 * SIZE(X), %xmm4 + mulps %xmm6, %xmm3 + movaps -20 * SIZE(Y), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x39, %xmm7, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -12 * SIZE(X), %xmm5 + mulps %xmm7, %xmm3 + movaps -16 * SIZE(Y), %xmm7 + addps %xmm3, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x39, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -8 * SIZE(X), %xmm4 + mulps %xmm6, %xmm3 + movaps -12 * SIZE(Y), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x39, %xmm7, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -4 * SIZE(X), %xmm5 + mulps %xmm7, %xmm3 + movaps -8 * SIZE(Y), %xmm7 + addps %xmm3, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x39, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm6, %xmm3 + movaps -4 * SIZE(Y), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, 
%xmm5, %xmm3) + shufps $0x39, %xmm7, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm7, %xmm3 + addps %xmm3, %xmm1 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L25: + testl $8, N + jle .L26 + + movaps -32 * SIZE(X), %xmm4 + movaps -28 * SIZE(X), %xmm5 + movaps -32 * SIZE(Y), %xmm7 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x39, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -24 * SIZE(X), %xmm4 + mulps %xmm6, %xmm3 + movaps -28 * SIZE(Y), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x39, %xmm7, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -20 * SIZE(X), %xmm5 + mulps %xmm7, %xmm3 + movaps -24 * SIZE(Y), %xmm7 + addps %xmm3, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x39, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm6, %xmm3 + movaps -20 * SIZE(Y), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x39, %xmm7, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm7, %xmm3 + addps %xmm3, %xmm1 + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L26: + testl $4, N + jle .L27 + + movaps -32 * SIZE(X), %xmm4 + movaps -28 * SIZE(X), %xmm5 + movaps -32 * SIZE(Y), %xmm7 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x39, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm6, %xmm3 + movaps -28 * SIZE(Y), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x39, %xmm7, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm7, %xmm3 + addps %xmm3, %xmm1 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L27: + testl $2, N + jle .L28 + + movaps -32 * SIZE(X), %xmm4 + movaps -32 * SIZE(Y), %xmm7 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x39, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm6, %xmm3 + addps %xmm3, %xmm1 + + movaps %xmm7, %xmm6 + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L28: + testl $1, N + jle .L29 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X), %xmm4 + + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x39, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm6, %xmm3 + addps %xmm3, %xmm1 + ALIGN_3 + +.L29: + shufps $0xb1, %xmm1, %xmm1 + jmp .L98 + ALIGN_3 + +.L30: + testl $SIZE, Y + jne .L40 +#endif + + movl N, %eax + sarl $4, %eax + jle .L35 + + movaps -32 * SIZE(X), %xmm4 + movsd -32 * SIZE(Y), %xmm6 + movhps -30 * SIZE(Y), %xmm6 + + movaps -28 * SIZE(X), %xmm5 + movsd -28 * SIZE(Y), %xmm7 + movhps -26 * SIZE(Y), %xmm7 + + decl %eax + jle .L32 + ALIGN_3 + +.L31: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movsd -24 * SIZE(Y), %xmm6 + movhps -22 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movaps -24 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movsd -20 * SIZE(Y), %xmm7 + movhps -18 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movaps -20 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movsd -16 * SIZE(Y), %xmm6 + movhps -14 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movaps -16 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movsd -12 * 
SIZE(Y), %xmm7 + movhps -10 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movaps -12 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movsd -8 * SIZE(Y), %xmm6 + movhps -6 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movaps -8 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movsd -4 * SIZE(Y), %xmm7 + movhps -2 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movaps -4 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movsd 0 * SIZE(Y), %xmm6 + movhps 2 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movaps 0 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movsd 4 * SIZE(Y), %xmm7 + movhps 6 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movaps 4 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + + decl %eax + jg .L31 + ALIGN_3 + +.L32: + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movsd -24 * SIZE(Y), %xmm6 + movhps -22 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movaps -24 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movsd -20 * SIZE(Y), %xmm7 + movhps -18 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movaps -20 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movsd -16 * SIZE(Y), %xmm6 + movhps -14 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movaps -16 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movsd -12 * SIZE(Y), %xmm7 + movhps -10 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movaps -12 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movsd -8 * SIZE(Y), %xmm6 + movhps -6 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movaps -8 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movsd -4 * SIZE(Y), %xmm7 + movhps -2 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movaps -4 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + mulps %xmm5, %xmm3 + addps %xmm3, %xmm1 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L35: + testl $8, N + jle .L36 + + movaps -32 * SIZE(X), %xmm4 + movsd -32 * SIZE(Y), %xmm6 + movhps -30 * SIZE(Y), %xmm6 + + movaps -28 * SIZE(X), %xmm5 + movsd -28 * SIZE(Y), %xmm7 + movhps -26 * SIZE(Y), %xmm7 + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movsd -24 * SIZE(Y), %xmm6 + movhps -22 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movaps -24 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movsd -20 * SIZE(Y), %xmm7 + movhps -18 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movaps -20 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + 
mulps %xmm5, %xmm3 + addps %xmm3, %xmm1 + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L36: + testl $4, N + jle .L37 + + movaps -32 * SIZE(X), %xmm4 + movsd -32 * SIZE(Y), %xmm6 + movhps -30 * SIZE(Y), %xmm6 + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm1 + + movaps -28 * SIZE(X), %xmm5 + movsd -28 * SIZE(Y), %xmm7 + movhps -26 * SIZE(Y), %xmm7 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + mulps %xmm5, %xmm3 + addps %xmm3, %xmm1 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L37: + testl $2, N + jle .L38 + + movaps -32 * SIZE(X), %xmm4 + movsd -32 * SIZE(Y), %xmm6 + movhps -30 * SIZE(Y), %xmm6 + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm1 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L38: + testl $1, N + jle .L98 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X), %xmm4 +#ifdef movsd + xorps %xmm6, %xmm6 +#endif + movsd -32 * SIZE(Y), %xmm6 + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm1 + jmp .L98 + ALIGN_3 + +#ifdef ALIGNED_ACCESS +.L40: + movaps -35 * SIZE(Y), %xmm6 + addl $1 * SIZE, Y + + shufps $0xb1, %xmm1, %xmm1 + + movl N, %eax + sarl $4, %eax + jle .L45 + + movaps -32 * SIZE(X), %xmm4 + movaps -28 * SIZE(X), %xmm5 + movaps -32 * SIZE(Y), %xmm7 + + decl %eax + jle .L42 + ALIGN_3 + +.L41: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x93, %xmm7, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -24 * SIZE(X), %xmm4 + mulps %xmm6, %xmm3 + movaps -28 * SIZE(Y), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x93, %xmm6, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -20 * SIZE(X), %xmm5 + mulps %xmm7, %xmm3 + movaps -24 * SIZE(Y), %xmm7 + addps %xmm3, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x93, %xmm7, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -16 * SIZE(X), %xmm4 + mulps %xmm6, %xmm3 + movaps -20 * SIZE(Y), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x93, %xmm6, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -12 * SIZE(X), %xmm5 + mulps %xmm7, %xmm3 + movaps -16 * SIZE(Y), %xmm7 + addps %xmm3, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x93, %xmm7, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -8 * SIZE(X), %xmm4 + mulps %xmm6, %xmm3 + movaps -12 * SIZE(Y), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x93, %xmm6, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -4 * SIZE(X), %xmm5 + mulps %xmm7, %xmm3 + movaps -8 * SIZE(Y), %xmm7 + addps %xmm3, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x93, %xmm7, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps 0 * SIZE(X), %xmm4 + mulps %xmm6, %xmm3 + movaps -4 * SIZE(Y), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x93, %xmm6, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 
+ movaps 4 * SIZE(X), %xmm5 + mulps %xmm7, %xmm3 + movaps 0 * SIZE(Y), %xmm7 + addps %xmm3, %xmm1 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + + decl %eax + jg .L41 + ALIGN_3 + +.L42: + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x93, %xmm7, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -24 * SIZE(X), %xmm4 + mulps %xmm6, %xmm3 + movaps -28 * SIZE(Y), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x93, %xmm6, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -20 * SIZE(X), %xmm5 + mulps %xmm7, %xmm3 + movaps -24 * SIZE(Y), %xmm7 + addps %xmm3, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x93, %xmm7, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -16 * SIZE(X), %xmm4 + mulps %xmm6, %xmm3 + movaps -20 * SIZE(Y), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x93, %xmm6, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -12 * SIZE(X), %xmm5 + mulps %xmm7, %xmm3 + movaps -16 * SIZE(Y), %xmm7 + addps %xmm3, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x93, %xmm7, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -8 * SIZE(X), %xmm4 + mulps %xmm6, %xmm3 + movaps -12 * SIZE(Y), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x93, %xmm6, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -4 * SIZE(X), %xmm5 + mulps %xmm7, %xmm3 + movaps -8 * SIZE(Y), %xmm7 + addps %xmm3, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x93, %xmm7, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm6, %xmm3 + movaps -4 * SIZE(Y), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x93, %xmm6, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm7, %xmm3 + addps %xmm3, %xmm1 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L45: + testl $8, N + jle .L46 + + movaps -32 * SIZE(X), %xmm4 + movaps -28 * SIZE(X), %xmm5 + movaps -32 * SIZE(Y), %xmm7 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x93, %xmm7, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -24 * SIZE(X), %xmm4 + mulps %xmm6, %xmm3 + movaps -28 * SIZE(Y), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x93, %xmm6, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -20 * SIZE(X), %xmm5 + mulps %xmm7, %xmm3 + movaps -24 * SIZE(Y), %xmm7 + addps %xmm3, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x93, %xmm7, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm6, %xmm3 + movaps -20 * SIZE(Y), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x93, %xmm6, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm7, %xmm3 + addps %xmm3, %xmm1 + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L46: + testl $4, N + jle .L47 + + movaps -32 * SIZE(X), %xmm4 + movaps -28 * SIZE(X), %xmm5 + movaps -32 * SIZE(Y), %xmm7 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x93, %xmm7, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm6, %xmm3 + movaps -28 * SIZE(Y), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x93, %xmm6, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm7, %xmm3 + addps %xmm3, %xmm1 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L47: + testl $2, N + jle .L48 + + movaps -32 * 
SIZE(X), %xmm4 + movaps -32 * SIZE(Y), %xmm7 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x93, %xmm7, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm6, %xmm3 + addps %xmm3, %xmm1 + + movaps %xmm7, %xmm6 + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L48: + testl $1, N + jle .L49 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X), %xmm4 + movss -32 * SIZE(Y), %xmm7 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x93, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm6, %xmm3 + addps %xmm3, %xmm1 + ALIGN_3 + +.L49: + shufps $0xb1, %xmm1, %xmm1 + jmp .L98 + ALIGN_3 +#endif + +.L50: + testl $SIZE, Y + jne .L70 + +#ifdef ALIGNED_ACCESS + + testl $2 * SIZE, Y + je .L50x + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd -32 * SIZE(X), %xmm0 +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(Y), %xmm4 + + PSHUFD2($0xb1, %xmm0, %xmm1) + mulps %xmm4, %xmm0 + mulps %xmm4, %xmm1 + addl $2 * SIZE, X + addl $2 * SIZE, Y + + decl N + ALIGN_3 + +.L50x: + testl $2 * SIZE, X + jne .L60 + + movaps -33 * SIZE(X), %xmm6 + addl $3 * SIZE, X + + shufps $0xb1, %xmm1, %xmm1 + + movl N, %eax + sarl $4, %eax + jle .L55 + + movaps -32 * SIZE(Y), %xmm4 + movaps -28 * SIZE(Y), %xmm5 + movaps -32 * SIZE(X), %xmm7 + + decl %eax + jle .L52 + ALIGN_3 + +.L51: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x39, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -24 * SIZE(Y), %xmm4 + mulps %xmm6, %xmm3 + movaps -28 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x39, %xmm7, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -20 * SIZE(Y), %xmm5 + mulps %xmm7, %xmm3 + movaps -24 * SIZE(X), %xmm7 + addps %xmm3, %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x39, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -16 * SIZE(Y), %xmm4 + mulps %xmm6, %xmm3 + movaps -20 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x39, %xmm7, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -12 * SIZE(Y), %xmm5 + mulps %xmm7, %xmm3 + movaps -16 * SIZE(X), %xmm7 + addps %xmm3, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x39, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -8 * SIZE(Y), %xmm4 + mulps %xmm6, %xmm3 + movaps -12 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x39, %xmm7, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -4 * SIZE(Y), %xmm5 + mulps %xmm7, %xmm3 + movaps -8 * SIZE(X), %xmm7 + addps %xmm3, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x39, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps 0 * SIZE(Y), %xmm4 + mulps %xmm6, %xmm3 + movaps -4 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x39, %xmm7, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps 4 * SIZE(Y), %xmm5 + mulps %xmm7, %xmm3 + movaps 0 * SIZE(X), %xmm7 + addps %xmm3, %xmm1 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + + decl 
%eax + jg .L51 + ALIGN_3 + +.L52: + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x39, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -24 * SIZE(Y), %xmm4 + mulps %xmm6, %xmm3 + movaps -28 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x39, %xmm7, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -20 * SIZE(Y), %xmm5 + mulps %xmm7, %xmm3 + movaps -24 * SIZE(X), %xmm7 + addps %xmm3, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x39, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -16 * SIZE(Y), %xmm4 + mulps %xmm6, %xmm3 + movaps -20 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x39, %xmm7, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -12 * SIZE(Y), %xmm5 + mulps %xmm7, %xmm3 + movaps -16 * SIZE(X), %xmm7 + addps %xmm3, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x39, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -8 * SIZE(Y), %xmm4 + mulps %xmm6, %xmm3 + movaps -12 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x39, %xmm7, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -4 * SIZE(Y), %xmm5 + mulps %xmm7, %xmm3 + movaps -8 * SIZE(X), %xmm7 + addps %xmm3, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x39, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm6, %xmm3 + movaps -4 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x39, %xmm7, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm7, %xmm3 + addps %xmm3, %xmm1 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L55: + testl $8, N + jle .L56 + + movaps -32 * SIZE(Y), %xmm4 + movaps -28 * SIZE(Y), %xmm5 + movaps -32 * SIZE(X), %xmm7 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x39, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -24 * SIZE(Y), %xmm4 + mulps %xmm6, %xmm3 + movaps -28 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x39, %xmm7, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -20 * SIZE(Y), %xmm5 + mulps %xmm7, %xmm3 + movaps -24 * SIZE(X), %xmm7 + addps %xmm3, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x39, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm6, %xmm3 + movaps -20 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x39, %xmm7, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm7, %xmm3 + addps %xmm3, %xmm1 + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L56: + testl $4, N + jle .L57 + + movaps -32 * SIZE(Y), %xmm4 + movaps -28 * SIZE(Y), %xmm5 + movaps -32 * SIZE(X), %xmm7 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x39, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm6, %xmm3 + movaps -28 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x39, %xmm7, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm7, %xmm3 + addps %xmm3, %xmm1 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L57: + testl $2, N + jle .L58 + + movaps -32 * SIZE(Y), %xmm4 + movaps -32 * SIZE(X), %xmm7 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x39, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + 
addps %xmm4, %xmm0 + mulps %xmm6, %xmm3 + addps %xmm3, %xmm1 + + movaps %xmm7, %xmm6 + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L58: + testl $1, N + jle .L98 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(Y), %xmm4 + + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x39, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm6, %xmm3 + addps %xmm3, %xmm1 + jmp .L98 + ALIGN_3 + +.L60: + movaps -35 * SIZE(X), %xmm6 + addl $1 * SIZE, X + + shufps $0xb1, %xmm1, %xmm1 + + movl N, %eax + sarl $4, %eax + jle .L65 + + movaps -32 * SIZE(Y), %xmm4 + movaps -28 * SIZE(Y), %xmm5 + movaps -32 * SIZE(X), %xmm7 + + decl %eax + jle .L62 + ALIGN_3 + +.L61: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x93, %xmm7, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -24 * SIZE(Y), %xmm4 + mulps %xmm6, %xmm3 + movaps -28 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x93, %xmm6, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -20 * SIZE(Y), %xmm5 + mulps %xmm7, %xmm3 + movaps -24 * SIZE(X), %xmm7 + addps %xmm3, %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x93, %xmm7, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -16 * SIZE(Y), %xmm4 + mulps %xmm6, %xmm3 + movaps -20 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x93, %xmm6, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -12 * SIZE(Y), %xmm5 + mulps %xmm7, %xmm3 + movaps -16 * SIZE(X), %xmm7 + addps %xmm3, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x93, %xmm7, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -8 * SIZE(Y), %xmm4 + mulps %xmm6, %xmm3 + movaps -12 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x93, %xmm6, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -4 * SIZE(Y), %xmm5 + mulps %xmm7, %xmm3 + movaps -8 * SIZE(X), %xmm7 + addps %xmm3, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x93, %xmm7, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps 0 * SIZE(Y), %xmm4 + mulps %xmm6, %xmm3 + movaps -4 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x93, %xmm6, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps 4 * SIZE(Y), %xmm5 + mulps %xmm7, %xmm3 + movaps 0 * SIZE(X), %xmm7 + addps %xmm3, %xmm1 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + + decl %eax + jg .L61 + ALIGN_3 + +.L62: + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x93, %xmm7, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -24 * SIZE(Y), %xmm4 + mulps %xmm6, %xmm3 + movaps -28 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x93, %xmm6, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -20 * SIZE(Y), %xmm5 + mulps %xmm7, %xmm3 + movaps -24 * SIZE(X), %xmm7 + addps %xmm3, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x93, %xmm7, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -16 * SIZE(Y), %xmm4 + mulps %xmm6, %xmm3 + 
movaps -20 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x93, %xmm6, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -12 * SIZE(Y), %xmm5 + mulps %xmm7, %xmm3 + movaps -16 * SIZE(X), %xmm7 + addps %xmm3, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x93, %xmm7, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -8 * SIZE(Y), %xmm4 + mulps %xmm6, %xmm3 + movaps -12 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x93, %xmm6, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -4 * SIZE(Y), %xmm5 + mulps %xmm7, %xmm3 + movaps -8 * SIZE(X), %xmm7 + addps %xmm3, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x93, %xmm7, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm6, %xmm3 + movaps -4 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x93, %xmm6, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm7, %xmm3 + addps %xmm3, %xmm1 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L65: + testl $8, N + jle .L66 + + movaps -32 * SIZE(Y), %xmm4 + movaps -28 * SIZE(Y), %xmm5 + movaps -32 * SIZE(X), %xmm7 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x93, %xmm7, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -24 * SIZE(Y), %xmm4 + mulps %xmm6, %xmm3 + movaps -28 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x93, %xmm6, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -20 * SIZE(Y), %xmm5 + mulps %xmm7, %xmm3 + movaps -24 * SIZE(X), %xmm7 + addps %xmm3, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x93, %xmm7, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm6, %xmm3 + movaps -20 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x93, %xmm6, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm7, %xmm3 + addps %xmm3, %xmm1 + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L66: + testl $4, N + jle .L67 + + movaps -32 * SIZE(Y), %xmm4 + movaps -28 * SIZE(Y), %xmm5 + movaps -32 * SIZE(X), %xmm7 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x93, %xmm7, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm6, %xmm3 + movaps -28 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0xb1, %xmm5, %xmm3) + shufps $0x93, %xmm6, %xmm7 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm7, %xmm3 + addps %xmm3, %xmm1 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L67: + testl $2, N + jle .L68 + + movaps -32 * SIZE(Y), %xmm4 + movaps -32 * SIZE(X), %xmm7 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x93, %xmm7, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm6, %xmm3 + addps %xmm3, %xmm1 + + movaps %xmm7, %xmm6 + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L68: + testl $1, N + jle .L98 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(Y), %xmm4 + movss -32 * SIZE(X), %xmm7 + + movss %xmm7, %xmm6 + PSHUFD2($0xb1, %xmm4, %xmm3) + shufps $0x93, %xmm6, %xmm6 + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm6, %xmm3 + addps %xmm3, %xmm1 + jmp .L98 + ALIGN_3 + +#else + + testl $2 * SIZE, Y + je .L50x + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd -32 * SIZE(Y), %xmm0 +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X), %xmm4 + + 
PSHUFD2($0xb1, %xmm0, %xmm1) + mulps %xmm4, %xmm0 + mulps %xmm4, %xmm1 + addl $2 * SIZE, X + addl $2 * SIZE, Y + + decl N + ALIGN_3 + +.L50x: + movl N, %eax + sarl $4, %eax + jle .L55 + + movaps -32 * SIZE(Y), %xmm4 + movlps -32 * SIZE(X), %xmm6 + movhps -30 * SIZE(X), %xmm6 + movaps -28 * SIZE(Y), %xmm5 + movlps -28 * SIZE(X), %xmm7 + movhps -26 * SIZE(X), %xmm7 + + decl %eax + jle .L52 + ALIGN_3 + +.L51: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + PSHUFD2($0xb1, %xmm4, %xmm3) + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -24 * SIZE(Y), %xmm4 + mulps %xmm6, %xmm3 + movlps -24 * SIZE(X), %xmm6 + movhps -22 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm5, %xmm3) + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -20 * SIZE(Y), %xmm5 + mulps %xmm7, %xmm3 + movlps -20 * SIZE(X), %xmm7 + movhps -18 * SIZE(X), %xmm7 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm4, %xmm3) + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -16 * SIZE(Y), %xmm4 + mulps %xmm6, %xmm3 + movlps -16 * SIZE(X), %xmm6 + movhps -14 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm5, %xmm3) + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -12 * SIZE(Y), %xmm5 + mulps %xmm7, %xmm3 + movlps -12 * SIZE(X), %xmm7 + movhps -10 * SIZE(X), %xmm7 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm4, %xmm3) + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -8 * SIZE(Y), %xmm4 + mulps %xmm6, %xmm3 + movlps -8 * SIZE(X), %xmm6 + movhps -6 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm5, %xmm3) + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -20 * SIZE(Y), %xmm5 + mulps %xmm7, %xmm3 + movlps -20 * SIZE(X), %xmm7 + movhps -18 * SIZE(X), %xmm7 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm4, %xmm3) + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps 0 * SIZE(Y), %xmm4 + mulps %xmm6, %xmm3 + movlps 0 * SIZE(X), %xmm6 + movhps 2 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm5, %xmm3) + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps 4 * SIZE(Y), %xmm5 + mulps %xmm7, %xmm3 + movlps 4 * SIZE(X), %xmm7 + movhps 6 * SIZE(X), %xmm7 + addps %xmm3, %xmm1 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + + decl %eax + jg .L51 + ALIGN_3 + +.L52: + PSHUFD2($0xb1, %xmm4, %xmm3) + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -24 * SIZE(Y), %xmm4 + mulps %xmm6, %xmm3 + movlps -24 * SIZE(X), %xmm6 + movhps -22 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm5, %xmm3) + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -20 * SIZE(Y), %xmm5 + mulps %xmm7, %xmm3 + movlps -20 * SIZE(X), %xmm7 + movhps -18 * SIZE(X), %xmm7 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm4, %xmm3) + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -16 * SIZE(Y), %xmm4 + mulps %xmm6, %xmm3 + movlps -16 * SIZE(X), %xmm6 + movhps -14 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm5, %xmm3) + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -12 * SIZE(Y), %xmm5 + mulps %xmm7, %xmm3 + movlps -12 * SIZE(X), %xmm7 + movhps -10 * SIZE(X), %xmm7 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm4, %xmm3) + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -8 * SIZE(Y), %xmm4 + mulps %xmm6, %xmm3 + movlps -8 * SIZE(X), %xmm6 + movhps -6 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm5, %xmm3) + mulps 
%xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -20 * SIZE(Y), %xmm5 + mulps %xmm7, %xmm3 + movlps -20 * SIZE(X), %xmm7 + movhps -18 * SIZE(X), %xmm7 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm4, %xmm3) + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm6, %xmm3 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm5, %xmm3) + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm7, %xmm3 + addps %xmm3, %xmm1 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L55: + testl $8, N + jle .L56 + + movaps -32 * SIZE(Y), %xmm4 + movlps -32 * SIZE(X), %xmm6 + movhps -30 * SIZE(X), %xmm6 + + movaps -28 * SIZE(Y), %xmm5 + movlps -28 * SIZE(X), %xmm7 + movhps -26 * SIZE(X), %xmm7 + + PSHUFD2($0xb1, %xmm4, %xmm3) + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + movaps -24 * SIZE(Y), %xmm4 + mulps %xmm6, %xmm3 + movlps -24 * SIZE(X), %xmm6 + movhps -22 * SIZE(X), %xmm6 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm5, %xmm3) + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps -20 * SIZE(Y), %xmm5 + mulps %xmm7, %xmm3 + movlps -20 * SIZE(X), %xmm7 + movhps -18 * SIZE(X), %xmm7 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm4, %xmm3) + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm6, %xmm3 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm5, %xmm3) + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm7, %xmm3 + addps %xmm3, %xmm1 + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L56: + testl $4, N + jle .L57 + + movaps -32 * SIZE(Y), %xmm4 + movlps -32 * SIZE(X), %xmm6 + movhps -30 * SIZE(X), %xmm6 + + PSHUFD2($0xb1, %xmm4, %xmm3) + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm6, %xmm3 + addps %xmm3, %xmm1 + + movaps -28 * SIZE(Y), %xmm5 + movlps -28 * SIZE(X), %xmm7 + movhps -26 * SIZE(X), %xmm7 + + PSHUFD2($0xb1, %xmm5, %xmm3) + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm7, %xmm3 + addps %xmm3, %xmm1 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L57: + testl $2, N + jle .L58 + + movaps -32 * SIZE(Y), %xmm4 + movlps -32 * SIZE(X), %xmm6 + movhps -30 * SIZE(X), %xmm6 + + PSHUFD2($0xb1, %xmm4, %xmm3) + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm6, %xmm3 + addps %xmm3, %xmm1 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L58: + testl $1, N + jle .L98 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(Y), %xmm4 +#ifdef movsd + xorps %xmm6, %xmm6 +#endif + movsd -32 * SIZE(X), %xmm6 + + PSHUFD2($0xb1, %xmm4, %xmm3) + mulps %xmm6, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm6, %xmm3 + addps %xmm3, %xmm1 + jmp .L98 + ALIGN_3 +#endif + +.L70: + testl $2 * SIZE, Y + je .L70x + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X), %xmm4 + addl $2 * SIZE, X +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(Y), %xmm1 + addl $2 * SIZE, Y + + PSHUFD2($0xb1, %xmm1, %xmm0) + shufps $0xb1, %xmm4, %xmm4 + + mulps %xmm4, %xmm0 + mulps %xmm4, %xmm1 + decl N + ALIGN_3 + +.L70x: + testl $2 * SIZE, X + jne .L80 + + movaps -33 * SIZE(X), %xmm4 + addl $3 * SIZE, X + movaps -33 * SIZE(Y), %xmm6 + addl $3 * SIZE, Y + + movl N, %eax + sarl $4, %eax + jle .L75 + + movaps -32 * SIZE(X), %xmm5 + movaps -32 * SIZE(Y), %xmm7 + + decl %eax + jle .L72 + ALIGN_3 + +.L71: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm7, %xmm6 + PSHUFD2($0x1b, %xmm6, %xmm3) + movss %xmm5, %xmm4 + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -28 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movaps -28 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0x1b, %xmm7, %xmm3) + movss %xmm4, %xmm5 + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + 
movaps -24 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movaps -24 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm7, %xmm6 + PSHUFD2($0x1b, %xmm6, %xmm3) + movss %xmm5, %xmm4 + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -20 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movaps -20 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0x1b, %xmm7, %xmm3) + movss %xmm4, %xmm5 + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movaps -16 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movaps -16 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm7, %xmm6 + PSHUFD2($0x1b, %xmm6, %xmm3) + movss %xmm5, %xmm4 + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -12 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movaps -12 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0x1b, %xmm7, %xmm3) + movss %xmm4, %xmm5 + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movaps -8 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movaps -8 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm7, %xmm6 + PSHUFD2($0x1b, %xmm6, %xmm3) + movss %xmm5, %xmm4 + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -4 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movaps -4 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0x1b, %xmm7, %xmm3) + movss %xmm4, %xmm5 + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movaps 0 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movaps 0 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + + decl %eax + jg .L71 + ALIGN_3 + +.L72: + movss %xmm7, %xmm6 + PSHUFD2($0x1b, %xmm6, %xmm3) + movss %xmm5, %xmm4 + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -28 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movaps -28 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0x1b, %xmm7, %xmm3) + movss %xmm4, %xmm5 + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movaps -24 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movaps -24 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD2($0x1b, %xmm6, %xmm3) + movss %xmm5, %xmm4 + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -20 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movaps -20 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0x1b, %xmm7, %xmm3) + movss %xmm4, %xmm5 + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movaps -16 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movaps -16 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD2($0x1b, %xmm6, %xmm3) + movss %xmm5, %xmm4 + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -12 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movaps -12 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0x1b, %xmm7, %xmm3) + movss %xmm4, %xmm5 + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movaps -8 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movaps -8 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD2($0x1b, %xmm6, %xmm3) + movss %xmm5, %xmm4 + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -4 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movaps -4 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0x1b, %xmm7, %xmm3) + movss %xmm4, %xmm5 + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + mulps %xmm5, %xmm3 + addps %xmm3, %xmm1 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L75: + testl $8, N + jle .L76 + + 
movaps -32 * SIZE(X), %xmm5 + movaps -32 * SIZE(Y), %xmm7 + + movss %xmm7, %xmm6 + PSHUFD2($0x1b, %xmm6, %xmm3) + movss %xmm5, %xmm4 + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -28 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movaps -28 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0x1b, %xmm7, %xmm3) + movss %xmm4, %xmm5 + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movaps -24 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movaps -24 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD2($0x1b, %xmm6, %xmm3) + movss %xmm5, %xmm4 + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -20 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movaps -20 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0x1b, %xmm7, %xmm3) + movss %xmm4, %xmm5 + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + mulps %xmm5, %xmm3 + addps %xmm3, %xmm1 + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L76: + testl $4, N + jle .L77 + + movaps -32 * SIZE(X), %xmm5 + movaps -32 * SIZE(Y), %xmm7 + + movss %xmm7, %xmm6 + PSHUFD2($0x1b, %xmm6, %xmm3) + movss %xmm5, %xmm4 + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -28 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movaps -28 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0x1b, %xmm7, %xmm3) + movss %xmm4, %xmm5 + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + mulps %xmm5, %xmm3 + addps %xmm3, %xmm1 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L77: + testl $2, N + jle .L78 + + movaps -32 * SIZE(X), %xmm5 + movaps -32 * SIZE(Y), %xmm7 + + movss %xmm7, %xmm6 + PSHUFD2($0x1b, %xmm6, %xmm3) + movss %xmm5, %xmm4 + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm1 + + movaps %xmm5, %xmm4 + movaps %xmm7, %xmm6 + ALIGN_3 + +.L78: + testl $1, N + jle .L79 + + xorps %xmm5, %xmm5 + movss %xmm5, %xmm4 + movss %xmm5, %xmm6 + + shufps $0x24, %xmm4, %xmm4 + PSHUFD2($0x18, %xmm6, %xmm3) + shufps $0x24, %xmm6, %xmm6 + + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm1 + ALIGN_3 + +.L79: + shufps $0x39, %xmm0, %xmm0 + shufps $0x39, %xmm1, %xmm1 + jmp .L98 + ALIGN_3 + +.L80: + movsd -33 * SIZE(X), %xmm4 + movhps -31 * SIZE(X), %xmm4 + addl $3 * SIZE, X + movaps -33 * SIZE(Y), %xmm6 + addl $3 * SIZE, Y + + movl N, %eax + sarl $4, %eax + jle .L85 + + movsd -32 * SIZE(X), %xmm5 + movhps -30 * SIZE(X), %xmm5 + movaps -32 * SIZE(Y), %xmm7 + + decl %eax + jle .L82 + ALIGN_3 + +.L81: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm7, %xmm6 + PSHUFD2($0x1b, %xmm6, %xmm3) + movss %xmm5, %xmm4 + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -28 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movsd -28 * SIZE(X), %xmm4 + movhps -26 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0x1b, %xmm7, %xmm3) + movss %xmm4, %xmm5 + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movaps -24 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movsd -24 * SIZE(X), %xmm5 + movhps -22 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm7, %xmm6 + PSHUFD2($0x1b, %xmm6, %xmm3) + movss %xmm5, %xmm4 + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -20 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movsd -20 * SIZE(X), %xmm4 + movhps -18 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0x1b, %xmm7, %xmm3) + movss %xmm4, %xmm5 + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movaps -16 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movsd -16 * SIZE(X), %xmm5 + movhps -14 * 
SIZE(X), %xmm5 + addps %xmm3, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm7, %xmm6 + PSHUFD2($0x1b, %xmm6, %xmm3) + movss %xmm5, %xmm4 + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -12 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movsd -12 * SIZE(X), %xmm4 + movhps -10 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0x1b, %xmm7, %xmm3) + movss %xmm4, %xmm5 + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movaps -8 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movsd -8 * SIZE(X), %xmm5 + movhps -6 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm7, %xmm6 + PSHUFD2($0x1b, %xmm6, %xmm3) + movss %xmm5, %xmm4 + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -4 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movsd -4 * SIZE(X), %xmm4 + movhps -2 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0x1b, %xmm7, %xmm3) + movss %xmm4, %xmm5 + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movaps 0 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movsd 0 * SIZE(X), %xmm5 + movhps 2 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + + decl %eax + jg .L81 + ALIGN_3 + +.L82: + movss %xmm7, %xmm6 + PSHUFD2($0x1b, %xmm6, %xmm3) + movss %xmm5, %xmm4 + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -28 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movsd -28 * SIZE(X), %xmm4 + movhps -26 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0x1b, %xmm7, %xmm3) + movss %xmm4, %xmm5 + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movaps -24 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movsd -24 * SIZE(X), %xmm5 + movhps -22 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD2($0x1b, %xmm6, %xmm3) + movss %xmm5, %xmm4 + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -20 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movsd -20 * SIZE(X), %xmm4 + movhps -18 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0x1b, %xmm7, %xmm3) + movss %xmm4, %xmm5 + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movaps -16 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movsd -16 * SIZE(X), %xmm5 + movhps -14 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD2($0x1b, %xmm6, %xmm3) + movss %xmm5, %xmm4 + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -12 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movsd -12 * SIZE(X), %xmm4 + movhps -10 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0x1b, %xmm7, %xmm3) + movss %xmm4, %xmm5 + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movaps -8 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movsd -8 * SIZE(X), %xmm5 + movhps -6 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD2($0x1b, %xmm6, %xmm3) + movss %xmm5, %xmm4 + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -4 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movsd -4 * SIZE(X), %xmm4 + movhps -2 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0x1b, %xmm7, %xmm3) + movss %xmm4, %xmm5 + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + mulps %xmm5, %xmm3 + addps %xmm3, %xmm1 + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + ALIGN_3 + +.L85: + testl $8, N + jle .L86 + + movsd -32 * SIZE(X), %xmm5 + movhps -30 * SIZE(X), %xmm5 + movaps -32 * SIZE(Y), %xmm7 + + movss %xmm7, %xmm6 + PSHUFD2($0x1b, %xmm6, %xmm3) + movss %xmm5, %xmm4 + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -28 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movsd -28 
* SIZE(X), %xmm4 + movhps -26 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0x1b, %xmm7, %xmm3) + movss %xmm4, %xmm5 + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movaps -24 * SIZE(Y), %xmm7 + mulps %xmm5, %xmm3 + movsd -24 * SIZE(X), %xmm5 + movhps -22 * SIZE(X), %xmm5 + addps %xmm3, %xmm1 + + movss %xmm7, %xmm6 + PSHUFD2($0x1b, %xmm6, %xmm3) + movss %xmm5, %xmm4 + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -20 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movsd -20 * SIZE(X), %xmm4 + movhps -18 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0x1b, %xmm7, %xmm3) + movss %xmm4, %xmm5 + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + mulps %xmm5, %xmm3 + addps %xmm3, %xmm1 + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L86: + testl $4, N + jle .L87 + + movsd -32 * SIZE(X), %xmm5 + movhps -30 * SIZE(X), %xmm5 + movaps -32 * SIZE(Y), %xmm7 + + movss %xmm7, %xmm6 + PSHUFD2($0x1b, %xmm6, %xmm3) + movss %xmm5, %xmm4 + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movaps -28 * SIZE(Y), %xmm6 + mulps %xmm4, %xmm3 + movsd -28 * SIZE(X), %xmm4 + movhps -26 * SIZE(X), %xmm4 + addps %xmm3, %xmm1 + + movss %xmm6, %xmm7 + PSHUFD2($0x1b, %xmm7, %xmm3) + movss %xmm4, %xmm5 + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + mulps %xmm5, %xmm3 + addps %xmm3, %xmm1 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L87: + testl $2, N + jle .L88 + + movsd -32 * SIZE(X), %xmm5 + movhps -30 * SIZE(X), %xmm5 + movaps -32 * SIZE(Y), %xmm7 + + movss %xmm7, %xmm6 + PSHUFD2($0x1b, %xmm6, %xmm3) + movss %xmm5, %xmm4 + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm1 + + movaps %xmm5, %xmm4 + movaps %xmm7, %xmm6 + ALIGN_3 + +.L88: + testl $1, N + jle .L89 + + xorps %xmm5, %xmm5 + movss %xmm5, %xmm4 + movss %xmm5, %xmm6 + + shufps $0x24, %xmm4, %xmm4 + PSHUFD2($0x18, %xmm6, %xmm3) + shufps $0x24, %xmm6, %xmm6 + + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm1 + ALIGN_3 + +.L89: + shufps $0x39, %xmm0, %xmm0 + shufps $0x39, %xmm1, %xmm1 + jmp .L98 + ALIGN_3 + +.L200: + movl N, %eax + sarl $4, %eax + jle .L205 + + movsd (X), %xmm4 + addl INCX, X + movhps (X), %xmm4 + addl INCX, X + movsd (Y), %xmm6 + addl INCY, Y + movhps (Y), %xmm6 + addl INCY, Y + + movsd (X), %xmm5 + addl INCX, X + movhps (X), %xmm5 + addl INCX, X + movsd (Y), %xmm7 + addl INCY, Y + movhps (Y), %xmm7 + addl INCY, Y + + decl %eax + jle .L204 + ALIGN_3 + +.L203: + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movsd (Y), %xmm6 + addl INCY, Y + movhps (Y), %xmm6 + addl INCY, Y + mulps %xmm4, %xmm3 + movsd (X), %xmm4 + addl INCX, X + movhps (X), %xmm4 + addl INCX, X + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movsd (Y), %xmm7 + addl INCY, Y + movhps (Y), %xmm7 + addl INCY, Y + + mulps %xmm5, %xmm3 + movsd (X), %xmm5 + addl INCX, X + movhps (X), %xmm5 + addl INCX, X + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movsd (Y), %xmm6 + addl INCY, Y + movhps (Y), %xmm6 + addl INCY, Y + mulps %xmm4, %xmm3 + movsd (X), %xmm4 + addl INCX, X + movhps (X), %xmm4 + addl INCX, X + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movsd (Y), %xmm7 + addl INCY, Y + movhps (Y), %xmm7 + addl INCY, Y + + mulps %xmm5, %xmm3 + movsd (X), %xmm5 + addl INCX, X + movhps (X), %xmm5 + addl INCX, X + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + 
movsd (Y), %xmm6 + addl INCY, Y + movhps (Y), %xmm6 + addl INCY, Y + mulps %xmm4, %xmm3 + movsd (X), %xmm4 + addl INCX, X + movhps (X), %xmm4 + addl INCX, X + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movsd (Y), %xmm7 + addl INCY, Y + movhps (Y), %xmm7 + addl INCY, Y + + mulps %xmm5, %xmm3 + movsd (X), %xmm5 + addl INCX, X + movhps (X), %xmm5 + addl INCX, X + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movsd (Y), %xmm6 + addl INCY, Y + movhps (Y), %xmm6 + addl INCY, Y + mulps %xmm4, %xmm3 + movsd (X), %xmm4 + addl INCX, X + movhps (X), %xmm4 + addl INCX, X + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movsd (Y), %xmm7 + addl INCY, Y + movhps (Y), %xmm7 + addl INCY, Y + mulps %xmm5, %xmm3 + movsd (X), %xmm5 + addl INCX, X + movhps (X), %xmm5 + addl INCX, X + addps %xmm3, %xmm1 + + decl %eax + jg .L203 + ALIGN_3 + +.L204: + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movsd (Y), %xmm6 + addl INCY, Y + movhps (Y), %xmm6 + addl INCY, Y + mulps %xmm4, %xmm3 + movsd (X), %xmm4 + addl INCX, X + movhps (X), %xmm4 + addl INCX, X + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movsd (Y), %xmm7 + addl INCY, Y + movhps (Y), %xmm7 + addl INCY, Y + + mulps %xmm5, %xmm3 + movsd (X), %xmm5 + addl INCX, X + movhps (X), %xmm5 + addl INCX, X + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movsd (Y), %xmm6 + addl INCY, Y + movhps (Y), %xmm6 + addl INCY, Y + mulps %xmm4, %xmm3 + movsd (X), %xmm4 + addl INCX, X + movhps (X), %xmm4 + addl INCX, X + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movsd (Y), %xmm7 + addl INCY, Y + movhps (Y), %xmm7 + addl INCY, Y + + mulps %xmm5, %xmm3 + movsd (X), %xmm5 + addl INCX, X + movhps (X), %xmm5 + addl INCX, X + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movsd (Y), %xmm6 + addl INCY, Y + movhps (Y), %xmm6 + addl INCY, Y + mulps %xmm4, %xmm3 + movsd (X), %xmm4 + addl INCX, X + movhps (X), %xmm4 + addl INCX, X + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movsd (Y), %xmm7 + addl INCY, Y + movhps (Y), %xmm7 + addl INCY, Y + + mulps %xmm5, %xmm3 + movsd (X), %xmm5 + addl INCX, X + movhps (X), %xmm5 + addl INCX, X + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + mulps %xmm5, %xmm3 + addps %xmm3, %xmm1 + ALIGN_3 + +.L205: + testl $8, N + jle .L206 + + movsd (X), %xmm4 + addl INCX, X + movhps (X), %xmm4 + addl INCX, X + movsd (Y), %xmm6 + addl INCY, Y + movhps (Y), %xmm6 + addl INCY, Y + + movsd (X), %xmm5 + addl INCX, X + movhps (X), %xmm5 + addl INCX, X + movsd (Y), %xmm7 + addl INCY, Y + movhps (Y), %xmm7 + addl INCY, Y + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + movsd (Y), %xmm6 + addl INCY, Y + movhps (Y), %xmm6 + addl INCY, Y + mulps %xmm4, %xmm3 + movsd (X), %xmm4 + addl INCX, X + movhps (X), %xmm4 + addl INCX, X + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + movsd (Y), %xmm7 + addl INCY, Y + movhps (Y), %xmm7 + addl INCY, Y + + mulps %xmm5, %xmm3 + movsd (X), %xmm5 + addl INCX, X + movhps (X), 
%xmm5 + addl INCX, X + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm1 + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + mulps %xmm5, %xmm3 + addps %xmm3, %xmm1 + ALIGN_3 + +.L206: + testl $4, N + jle .L207 + + movsd (X), %xmm4 + addl INCX, X + movhps (X), %xmm4 + addl INCX, X + movsd (Y), %xmm6 + addl INCY, Y + movhps (Y), %xmm6 + addl INCY, Y + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm1 + + movsd (X), %xmm5 + addl INCX, X + movhps (X), %xmm5 + addl INCX, X + movsd (Y), %xmm7 + addl INCY, Y + movhps (Y), %xmm7 + addl INCY, Y + + PSHUFD2($0xb1, %xmm7, %xmm3) + mulps %xmm5, %xmm7 + addps %xmm7, %xmm0 + mulps %xmm5, %xmm3 + addps %xmm3, %xmm1 + ALIGN_3 + +.L207: + testl $2, N + jle .L208 + + movsd (X), %xmm4 + addl INCX, X + movhps (X), %xmm4 + addl INCX, X + movsd (Y), %xmm6 + addl INCY, Y + movhps (Y), %xmm6 + addl INCY, Y + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm1 + ALIGN_3 + +.L208: + testl $1, N + jle .L98 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd (X), %xmm4 +#ifdef movsd + xorps %xmm6, %xmm6 +#endif + movsd (Y), %xmm6 + + PSHUFD2($0xb1, %xmm6, %xmm3) + mulps %xmm4, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm1 + ALIGN_3 + +.L98: + movhlps %xmm0, %xmm2 + movhlps %xmm1, %xmm3 + + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + + PSHUFD2($1, %xmm0, %xmm2) + PSHUFD2($1, %xmm1, %xmm3) + +#ifndef CONJ + subss %xmm2, %xmm0 + addss %xmm3, %xmm1 +#else + addss %xmm2, %xmm0 + subss %xmm3, %xmm1 +#endif + ALIGN_4 + +.L999: + subl $2 * SIZE, %esp + movss %xmm0, 0 * SIZE(%esp) + movss %xmm1, 1 * SIZE(%esp) + movl 0 * SIZE(%esp), %eax + movl 1 * SIZE(%esp), %edx + addl $2 * SIZE, %esp + + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86/zdot_sse2.S b/kernel/x86/zdot_sse2.S new file mode 100644 index 0000000000..6304f01a7f --- /dev/null +++ b/kernel/x86/zdot_sse2.S @@ -0,0 +1,1543 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define RESULT 4 + STACK + ARGS(%esp) +#define STACK_N 8 + STACK + ARGS(%esp) +#define STACK_X 12 + STACK + ARGS(%esp) +#define STACK_INCX 16 + STACK + ARGS(%esp) +#define STACK_Y 20 + STACK + ARGS(%esp) +#define STACK_INCY 24 + STACK + ARGS(%esp) + +#define N %ebx +#define X %esi +#define INCX %ecx +#define Y %edi +#define INCY %edx + +#include "l1param.h" + +#undef movsd + +#ifndef OPTERON +#define movlps movsd +#endif + + PROLOGUE + PROFCODE + + pushl %edi + pushl %esi + pushl %ebx + + movl STACK_N, N + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + + sall $ZBASE_SHIFT, INCX + sall $ZBASE_SHIFT, INCY + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + + cmpl $0, N + jle .L999 + + cmpl $2 * SIZE, INCX + jne .L50 + cmpl $2 * SIZE, INCY + jne .L50 + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + + testl $SIZE, Y + jne .L30 + + testl $SIZE, X + jne .L20 + + movl N, %eax + sarl $3, %eax + jle .L15 + + movaps -16 * SIZE(X), %xmm4 + movaps -16 * SIZE(Y), %xmm6 + movaps -14 * SIZE(X), %xmm5 + movaps -14 * SIZE(Y), %xmm7 + + decl %eax + jle .L12 + ALIGN_3 + +.L11: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -12 * SIZE(Y), %xmm6 + mulpd %xmm4, %xmm3 + movaps -12 * SIZE(X), %xmm4 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -10 * SIZE(Y), %xmm7 + mulpd %xmm5, %xmm3 + movaps -10 * SIZE(X), %xmm5 + addpd %xmm3, %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -8 * SIZE(Y), %xmm6 + mulpd %xmm4, %xmm3 + movaps -8 * SIZE(X), %xmm4 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -6 * SIZE(Y), %xmm7 + mulpd %xmm5, %xmm3 + movaps -6 * SIZE(X), %xmm5 + addpd %xmm3, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -4 * SIZE(Y), %xmm6 + mulpd %xmm4, %xmm3 + movaps -4 * SIZE(X), %xmm4 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -2 * SIZE(Y), %xmm7 + mulpd %xmm5, %xmm3 + movaps -2 * SIZE(X), %xmm5 + addpd %xmm3, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + 
addpd %xmm6, %xmm0 + movaps 0 * SIZE(Y), %xmm6 + mulpd %xmm4, %xmm3 + movaps 0 * SIZE(X), %xmm4 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps 2 * SIZE(Y), %xmm7 + mulpd %xmm5, %xmm3 + movaps 2 * SIZE(X), %xmm5 + addpd %xmm3, %xmm1 + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + + decl %eax + jg .L11 + ALIGN_3 + +.L12: + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -12 * SIZE(Y), %xmm6 + mulpd %xmm4, %xmm3 + movaps -12 * SIZE(X), %xmm4 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -10 * SIZE(Y), %xmm7 + mulpd %xmm5, %xmm3 + movaps -10 * SIZE(X), %xmm5 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -8 * SIZE(Y), %xmm6 + mulpd %xmm4, %xmm3 + movaps -8 * SIZE(X), %xmm4 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -6 * SIZE(Y), %xmm7 + mulpd %xmm5, %xmm3 + movaps -6 * SIZE(X), %xmm5 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -4 * SIZE(Y), %xmm6 + mulpd %xmm4, %xmm3 + movaps -4 * SIZE(X), %xmm4 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -2 * SIZE(Y), %xmm7 + mulpd %xmm5, %xmm3 + movaps -2 * SIZE(X), %xmm5 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm4, %xmm3 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + mulpd %xmm5, %xmm3 + addpd %xmm3, %xmm1 + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + ALIGN_3 + +.L15: + testl $4, N + jle .L16 + + movaps -16 * SIZE(X), %xmm4 + movaps -16 * SIZE(Y), %xmm6 + movaps -14 * SIZE(X), %xmm5 + movaps -14 * SIZE(Y), %xmm7 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -12 * SIZE(Y), %xmm6 + mulpd %xmm4, %xmm3 + movaps -12 * SIZE(X), %xmm4 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -10 * SIZE(Y), %xmm7 + mulpd %xmm5, %xmm3 + movaps -10 * SIZE(X), %xmm5 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm4, %xmm3 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + mulpd %xmm5, %xmm3 + addpd %xmm3, %xmm1 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L16: + testl $2, N + jle .L17 + + movaps -16 * SIZE(X), %xmm4 + movaps -16 * SIZE(Y), %xmm6 + movaps -14 * SIZE(X), %xmm5 + movaps -14 * SIZE(Y), %xmm7 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm4, %xmm3 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + mulpd %xmm5, %xmm3 + addpd %xmm3, %xmm1 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L17: + testl $1, N + jle .L98 + + movaps -16 * SIZE(X), %xmm4 + movaps -16 * SIZE(Y), %xmm6 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm4, %xmm3 + addpd %xmm3, %xmm1 + jmp .L98 + ALIGN_3 + +.L20: + movl N, %eax + sarl $3, %eax + jle .L25 + + movlps -16 * SIZE(X), %xmm4 + movhps -15 * SIZE(X), %xmm4 + movaps -16 * SIZE(Y), %xmm6 + + movlps -14 * SIZE(X), %xmm5 + movhps -13 * SIZE(X), %xmm5 + movaps -14 * SIZE(Y), %xmm7 + + decl %eax + jle .L22 + ALIGN_3 + +.L21: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm6, %xmm3 + mulpd 
%xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -12 * SIZE(Y), %xmm6 + mulpd %xmm4, %xmm3 + movlps -12 * SIZE(X), %xmm4 + movhps -11 * SIZE(X), %xmm4 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -10 * SIZE(Y), %xmm7 + mulpd %xmm5, %xmm3 + movlps -10 * SIZE(X), %xmm5 + movhps -9 * SIZE(X), %xmm5 + addpd %xmm3, %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -8 * SIZE(Y), %xmm6 + mulpd %xmm4, %xmm3 + movlps -8 * SIZE(X), %xmm4 + movhps -7 * SIZE(X), %xmm4 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -6 * SIZE(Y), %xmm7 + mulpd %xmm5, %xmm3 + movlps -6 * SIZE(X), %xmm5 + movhps -5 * SIZE(X), %xmm5 + addpd %xmm3, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -4 * SIZE(Y), %xmm6 + mulpd %xmm4, %xmm3 + movlps -4 * SIZE(X), %xmm4 + movhps -3 * SIZE(X), %xmm4 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -2 * SIZE(Y), %xmm7 + mulpd %xmm5, %xmm3 + movlps -2 * SIZE(X), %xmm5 + movhps -1 * SIZE(X), %xmm5 + addpd %xmm3, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps 0 * SIZE(Y), %xmm6 + mulpd %xmm4, %xmm3 + movlps 0 * SIZE(X), %xmm4 + movhps 1 * SIZE(X), %xmm4 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps 2 * SIZE(Y), %xmm7 + mulpd %xmm5, %xmm3 + movlps 2 * SIZE(X), %xmm5 + movhps 3 * SIZE(X), %xmm5 + addpd %xmm3, %xmm1 + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + + decl %eax + jg .L21 + ALIGN_3 + +.L22: + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -12 * SIZE(Y), %xmm6 + mulpd %xmm4, %xmm3 + movlps -12 * SIZE(X), %xmm4 + movhps -11 * SIZE(X), %xmm4 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -10 * SIZE(Y), %xmm7 + mulpd %xmm5, %xmm3 + movlps -10 * SIZE(X), %xmm5 + movhps -9 * SIZE(X), %xmm5 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -8 * SIZE(Y), %xmm6 + mulpd %xmm4, %xmm3 + movlps -8 * SIZE(X), %xmm4 + movhps -7 * SIZE(X), %xmm4 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -6 * SIZE(Y), %xmm7 + mulpd %xmm5, %xmm3 + movlps -6 * SIZE(X), %xmm5 + movhps -5 * SIZE(X), %xmm5 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -4 * SIZE(Y), %xmm6 + mulpd %xmm4, %xmm3 + movlps -4 * SIZE(X), %xmm4 + movhps -3 * SIZE(X), %xmm4 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -2 * SIZE(Y), %xmm7 + mulpd %xmm5, %xmm3 + movlps -2 * SIZE(X), %xmm5 + movhps -1 * SIZE(X), %xmm5 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm4, %xmm3 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + mulpd %xmm5, %xmm3 + addpd %xmm3, %xmm1 + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + ALIGN_3 + +.L25: + testl $4, N + jle .L26 + + movlps -16 * SIZE(X), %xmm4 + movhps -15 * SIZE(X), %xmm4 + movaps -16 * 
SIZE(Y), %xmm6 + + movlps -14 * SIZE(X), %xmm5 + movhps -13 * SIZE(X), %xmm5 + movaps -14 * SIZE(Y), %xmm7 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -12 * SIZE(Y), %xmm6 + mulpd %xmm4, %xmm3 + movlps -12 * SIZE(X), %xmm4 + movhps -11 * SIZE(X), %xmm4 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -10 * SIZE(Y), %xmm7 + mulpd %xmm5, %xmm3 + movlps -10 * SIZE(X), %xmm5 + movhps -9 * SIZE(X), %xmm5 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm4, %xmm3 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + mulpd %xmm5, %xmm3 + addpd %xmm3, %xmm1 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L26: + testl $2, N + jle .L27 + + movlps -16 * SIZE(X), %xmm4 + movhps -15 * SIZE(X), %xmm4 + movaps -16 * SIZE(Y), %xmm6 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm4, %xmm3 + addpd %xmm3, %xmm1 + + movlps -14 * SIZE(X), %xmm5 + movhps -13 * SIZE(X), %xmm5 + movaps -14 * SIZE(Y), %xmm7 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + mulpd %xmm5, %xmm3 + addpd %xmm3, %xmm1 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L27: + testl $1, N + jle .L98 + + movlps -16 * SIZE(X), %xmm4 + movhps -15 * SIZE(X), %xmm4 + movaps -16 * SIZE(Y), %xmm6 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm4, %xmm3 + addpd %xmm3, %xmm1 + jmp .L98 + ALIGN_3 + +.L30: + testl $SIZE, X + jne .L40 + + movl N, %eax + sarl $3, %eax + jle .L35 + + movlps -16 * SIZE(Y), %xmm4 + movhps -15 * SIZE(Y), %xmm4 + movaps -16 * SIZE(X), %xmm6 + + movlps -14 * SIZE(Y), %xmm5 + movhps -13 * SIZE(Y), %xmm5 + movaps -14 * SIZE(X), %xmm7 + + decl %eax + jle .L32 + ALIGN_3 + +.L31: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -12 * SIZE(X), %xmm6 + mulpd %xmm4, %xmm3 + movlps -12 * SIZE(Y), %xmm4 + movhps -11 * SIZE(Y), %xmm4 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -10 * SIZE(X), %xmm7 + mulpd %xmm5, %xmm3 + movlps -10 * SIZE(Y), %xmm5 + movhps -9 * SIZE(Y), %xmm5 + addpd %xmm3, %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -8 * SIZE(X), %xmm6 + mulpd %xmm4, %xmm3 + movlps -8 * SIZE(Y), %xmm4 + movhps -7 * SIZE(Y), %xmm4 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -6 * SIZE(X), %xmm7 + mulpd %xmm5, %xmm3 + movlps -6 * SIZE(Y), %xmm5 + movhps -5 * SIZE(Y), %xmm5 + addpd %xmm3, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -4 * SIZE(X), %xmm6 + mulpd %xmm4, %xmm3 + movlps -4 * SIZE(Y), %xmm4 + movhps -3 * SIZE(Y), %xmm4 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -2 * SIZE(X), %xmm7 + mulpd %xmm5, %xmm3 + movlps -2 * SIZE(Y), %xmm5 + movhps -1 * SIZE(Y), %xmm5 + addpd %xmm3, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps 0 * SIZE(X), %xmm6 + mulpd %xmm4, %xmm3 + 
movlps 0 * SIZE(Y), %xmm4 + movhps 1 * SIZE(Y), %xmm4 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps 2 * SIZE(X), %xmm7 + mulpd %xmm5, %xmm3 + movlps 2 * SIZE(Y), %xmm5 + movhps 3 * SIZE(Y), %xmm5 + addpd %xmm3, %xmm1 + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + + decl %eax + jg .L31 + ALIGN_3 + +.L32: + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -12 * SIZE(X), %xmm6 + mulpd %xmm4, %xmm3 + movlps -12 * SIZE(Y), %xmm4 + movhps -11 * SIZE(Y), %xmm4 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -10 * SIZE(X), %xmm7 + mulpd %xmm5, %xmm3 + movlps -10 * SIZE(Y), %xmm5 + movhps -9 * SIZE(Y), %xmm5 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -8 * SIZE(X), %xmm6 + mulpd %xmm4, %xmm3 + movlps -8 * SIZE(Y), %xmm4 + movhps -7 * SIZE(Y), %xmm4 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -6 * SIZE(X), %xmm7 + mulpd %xmm5, %xmm3 + movlps -6 * SIZE(Y), %xmm5 + movhps -5 * SIZE(Y), %xmm5 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -4 * SIZE(X), %xmm6 + mulpd %xmm4, %xmm3 + movlps -4 * SIZE(Y), %xmm4 + movhps -3 * SIZE(Y), %xmm4 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -2 * SIZE(X), %xmm7 + mulpd %xmm5, %xmm3 + movlps -2 * SIZE(Y), %xmm5 + movhps -1 * SIZE(Y), %xmm5 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm4, %xmm3 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + mulpd %xmm5, %xmm3 + addpd %xmm3, %xmm1 + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + ALIGN_3 + +.L35: + testl $4, N + jle .L36 + + movlps -16 * SIZE(Y), %xmm4 + movhps -15 * SIZE(Y), %xmm4 + movaps -16 * SIZE(X), %xmm6 + + movlps -14 * SIZE(Y), %xmm5 + movhps -13 * SIZE(Y), %xmm5 + movaps -14 * SIZE(X), %xmm7 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -12 * SIZE(X), %xmm6 + mulpd %xmm4, %xmm3 + movlps -12 * SIZE(Y), %xmm4 + movhps -11 * SIZE(Y), %xmm4 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -10 * SIZE(X), %xmm7 + mulpd %xmm5, %xmm3 + movlps -10 * SIZE(Y), %xmm5 + movhps -9 * SIZE(Y), %xmm5 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm4, %xmm3 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + mulpd %xmm5, %xmm3 + addpd %xmm3, %xmm1 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L36: + testl $2, N + jle .L37 + + movlps -16 * SIZE(Y), %xmm4 + movhps -15 * SIZE(Y), %xmm4 + movaps -16 * SIZE(X), %xmm6 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm4, %xmm3 + addpd %xmm3, %xmm1 + + movlps -14 * SIZE(Y), %xmm5 + movhps -13 * SIZE(Y), %xmm5 + movaps -14 * SIZE(X), %xmm7 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + mulpd %xmm5, %xmm3 + addpd %xmm3, %xmm1 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L37: + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm3, %xmm3 + + testl $1, N + jle .L98 + + movlps -16 * SIZE(Y), %xmm4 + movhps -15 * SIZE(Y), %xmm4 + movaps -16 * SIZE(X), %xmm6 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm4, %xmm3 + SHUFPD_1 
%xmm3, %xmm3 + addpd %xmm3, %xmm1 + jmp .L98 + ALIGN_3 + +.L40: + movhps -16 * SIZE(X), %xmm4 + addl $SIZE, X + movhps -16 * SIZE(Y), %xmm6 + addl $SIZE, Y + + movl N, %eax + sarl $3, %eax + jle .L45 + + movaps -16 * SIZE(X), %xmm5 + movaps -16 * SIZE(Y), %xmm7 + + decl %eax + jle .L42 + ALIGN_3 + +.L41: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movsd %xmm7, %xmm6 + pshufd $0x4e, %xmm6, %xmm3 + movsd %xmm5, %xmm4 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -14 * SIZE(Y), %xmm6 + mulpd %xmm4, %xmm3 + movaps -14 * SIZE(X), %xmm4 + addpd %xmm3, %xmm1 + + movsd %xmm6, %xmm7 + pshufd $0x4e, %xmm7, %xmm3 + movsd %xmm4, %xmm5 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -12 * SIZE(Y), %xmm7 + mulpd %xmm5, %xmm3 + movaps -12 * SIZE(X), %xmm5 + addpd %xmm3, %xmm1 + + movsd %xmm7, %xmm6 + pshufd $0x4e, %xmm6, %xmm3 + movsd %xmm5, %xmm4 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -10 * SIZE(Y), %xmm6 + mulpd %xmm4, %xmm3 + movaps -10 * SIZE(X), %xmm4 + addpd %xmm3, %xmm1 + + movsd %xmm6, %xmm7 + pshufd $0x4e, %xmm7, %xmm3 + movsd %xmm4, %xmm5 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -8 * SIZE(Y), %xmm7 + mulpd %xmm5, %xmm3 + movaps -8 * SIZE(X), %xmm5 + addpd %xmm3, %xmm1 + + movsd %xmm7, %xmm6 + pshufd $0x4e, %xmm6, %xmm3 + movsd %xmm5, %xmm4 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -6 * SIZE(Y), %xmm6 + mulpd %xmm4, %xmm3 + movaps -6 * SIZE(X), %xmm4 + addpd %xmm3, %xmm1 + + movsd %xmm6, %xmm7 + pshufd $0x4e, %xmm7, %xmm3 + movsd %xmm4, %xmm5 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -4 * SIZE(Y), %xmm7 + mulpd %xmm5, %xmm3 + movaps -4 * SIZE(X), %xmm5 + addpd %xmm3, %xmm1 + + movsd %xmm7, %xmm6 + pshufd $0x4e, %xmm6, %xmm3 + movsd %xmm5, %xmm4 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -2 * SIZE(Y), %xmm6 + mulpd %xmm4, %xmm3 + movaps -2 * SIZE(X), %xmm4 + addpd %xmm3, %xmm1 + + movsd %xmm6, %xmm7 + pshufd $0x4e, %xmm7, %xmm3 + movsd %xmm4, %xmm5 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps 0 * SIZE(Y), %xmm7 + mulpd %xmm5, %xmm3 + movaps 0 * SIZE(X), %xmm5 + addpd %xmm3, %xmm1 + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + + decl %eax + jg .L41 + ALIGN_3 + +.L42: + movsd %xmm7, %xmm6 + pshufd $0x4e, %xmm6, %xmm3 + movsd %xmm5, %xmm4 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -14 * SIZE(Y), %xmm6 + mulpd %xmm4, %xmm3 + movaps -14 * SIZE(X), %xmm4 + addpd %xmm3, %xmm1 + + movsd %xmm6, %xmm7 + pshufd $0x4e, %xmm7, %xmm3 + movsd %xmm4, %xmm5 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -12 * SIZE(Y), %xmm7 + mulpd %xmm5, %xmm3 + movaps -12 * SIZE(X), %xmm5 + addpd %xmm3, %xmm1 + + movsd %xmm7, %xmm6 + pshufd $0x4e, %xmm6, %xmm3 + movsd %xmm5, %xmm4 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -10 * SIZE(Y), %xmm6 + mulpd %xmm4, %xmm3 + movaps -10 * SIZE(X), %xmm4 + addpd %xmm3, %xmm1 + + movsd %xmm6, %xmm7 + pshufd $0x4e, %xmm7, %xmm3 + movsd %xmm4, %xmm5 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -8 * SIZE(Y), %xmm7 + mulpd %xmm5, %xmm3 + movaps -8 * SIZE(X), %xmm5 + addpd %xmm3, %xmm1 + + movsd %xmm7, %xmm6 + pshufd $0x4e, %xmm6, %xmm3 + movsd %xmm5, %xmm4 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -6 * SIZE(Y), %xmm6 + mulpd %xmm4, %xmm3 + movaps -6 * SIZE(X), %xmm4 + addpd %xmm3, %xmm1 + + 
movsd %xmm6, %xmm7 + pshufd $0x4e, %xmm7, %xmm3 + movsd %xmm4, %xmm5 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -4 * SIZE(Y), %xmm7 + mulpd %xmm5, %xmm3 + movaps -4 * SIZE(X), %xmm5 + addpd %xmm3, %xmm1 + + movsd %xmm7, %xmm6 + pshufd $0x4e, %xmm6, %xmm3 + movsd %xmm5, %xmm4 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -2 * SIZE(Y), %xmm6 + mulpd %xmm4, %xmm3 + movaps -2 * SIZE(X), %xmm4 + addpd %xmm3, %xmm1 + + movsd %xmm6, %xmm7 + pshufd $0x4e, %xmm7, %xmm3 + movsd %xmm4, %xmm5 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + mulpd %xmm5, %xmm3 + addpd %xmm3, %xmm1 + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + ALIGN_3 + +.L45: + testl $4, N + jle .L46 + + movaps -16 * SIZE(X), %xmm5 + movaps -16 * SIZE(Y), %xmm7 + + movsd %xmm7, %xmm6 + pshufd $0x4e, %xmm6, %xmm3 + movsd %xmm5, %xmm4 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -14 * SIZE(Y), %xmm6 + mulpd %xmm4, %xmm3 + movaps -14 * SIZE(X), %xmm4 + addpd %xmm3, %xmm1 + + movsd %xmm6, %xmm7 + pshufd $0x4e, %xmm7, %xmm3 + movsd %xmm4, %xmm5 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movaps -12 * SIZE(Y), %xmm7 + mulpd %xmm5, %xmm3 + movaps -12 * SIZE(X), %xmm5 + addpd %xmm3, %xmm1 + + movsd %xmm7, %xmm6 + pshufd $0x4e, %xmm6, %xmm3 + movsd %xmm5, %xmm4 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -10 * SIZE(Y), %xmm6 + mulpd %xmm4, %xmm3 + movaps -10 * SIZE(X), %xmm4 + addpd %xmm3, %xmm1 + + movsd %xmm6, %xmm7 + pshufd $0x4e, %xmm7, %xmm3 + movsd %xmm4, %xmm5 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + mulpd %xmm5, %xmm3 + addpd %xmm3, %xmm1 + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L46: + testl $2, N + jle .L47 + + movaps -16 * SIZE(X), %xmm5 + movaps -16 * SIZE(Y), %xmm7 + + movsd %xmm7, %xmm6 + pshufd $0x4e, %xmm6, %xmm3 + movsd %xmm5, %xmm4 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movaps -14 * SIZE(Y), %xmm6 + mulpd %xmm4, %xmm3 + movaps -14 * SIZE(X), %xmm4 + addpd %xmm3, %xmm1 + + movsd %xmm6, %xmm7 + pshufd $0x4e, %xmm7, %xmm3 + movsd %xmm4, %xmm5 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + mulpd %xmm5, %xmm3 + addpd %xmm3, %xmm1 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L47: + testl $1, N + jle .L48 + + movlps -16 * SIZE(X), %xmm4 + movlps -16 * SIZE(Y), %xmm6 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm4, %xmm3 + addpd %xmm3, %xmm1 + ALIGN_3 + +.L48: + SHUFPD_1 %xmm0, %xmm0 + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm2, %xmm2 + SHUFPD_1 %xmm3, %xmm3 + jmp .L98 + ALIGN_3 + +.L50: + movl N, %eax + sarl $3, %eax + jle .L55 + + movlps 0 * SIZE(X), %xmm4 + movhps 1 * SIZE(X), %xmm4 + addl INCX, X + movlps 0 * SIZE(Y), %xmm6 + movhps 1 * SIZE(Y), %xmm6 + addl INCY, Y + + movlps 0 * SIZE(X), %xmm5 + movhps 1 * SIZE(X), %xmm5 + addl INCX, X + movlps 0 * SIZE(Y), %xmm7 + movhps 1 * SIZE(Y), %xmm7 + addl INCY, Y + + decl %eax + jle .L54 + ALIGN_3 + +.L53: + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movlps 0 * SIZE(Y), %xmm6 + movhps 1 * SIZE(Y), %xmm6 + addl INCY, Y + mulpd %xmm4, %xmm3 + movlps 0 * SIZE(X), %xmm4 + movhps 1 * SIZE(X), %xmm4 + addl INCX, X + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movlps 0 * SIZE(Y), %xmm7 + movhps 1 * SIZE(Y), %xmm7 + addl INCY, Y + mulpd %xmm5, %xmm3 + movlps 0 * SIZE(X), %xmm5 + movhps 1 * SIZE(X), %xmm5 + addl INCX, X + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movlps 0 * SIZE(Y), %xmm6 + movhps 1 * SIZE(Y), %xmm6 + addl INCY, Y + mulpd %xmm4, %xmm3 + movlps 0 * SIZE(X), %xmm4 + 
movhps 1 * SIZE(X), %xmm4 + addl INCX, X + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movlps 0 * SIZE(Y), %xmm7 + movhps 1 * SIZE(Y), %xmm7 + addl INCY, Y + mulpd %xmm5, %xmm3 + movlps 0 * SIZE(X), %xmm5 + movhps 1 * SIZE(X), %xmm5 + addl INCX, X + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movlps 0 * SIZE(Y), %xmm6 + movhps 1 * SIZE(Y), %xmm6 + addl INCY, Y + mulpd %xmm4, %xmm3 + movlps 0 * SIZE(X), %xmm4 + movhps 1 * SIZE(X), %xmm4 + addl INCX, X + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movlps 0 * SIZE(Y), %xmm7 + movhps 1 * SIZE(Y), %xmm7 + addl INCY, Y + mulpd %xmm5, %xmm3 + movlps 0 * SIZE(X), %xmm5 + movhps 1 * SIZE(X), %xmm5 + addl INCX, X + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movlps 0 * SIZE(Y), %xmm6 + movhps 1 * SIZE(Y), %xmm6 + addl INCY, Y + mulpd %xmm4, %xmm3 + movlps 0 * SIZE(X), %xmm4 + movhps 1 * SIZE(X), %xmm4 + addl INCX, X + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movlps 0 * SIZE(Y), %xmm7 + movhps 1 * SIZE(Y), %xmm7 + addl INCY, Y + mulpd %xmm5, %xmm3 + movlps 0 * SIZE(X), %xmm5 + movhps 1 * SIZE(X), %xmm5 + addl INCX, X + addpd %xmm3, %xmm1 + + decl %eax + jg .L53 + ALIGN_3 + +.L54: + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movlps 0 * SIZE(Y), %xmm6 + movhps 1 * SIZE(Y), %xmm6 + addl INCY, Y + mulpd %xmm4, %xmm3 + movlps 0 * SIZE(X), %xmm4 + movhps 1 * SIZE(X), %xmm4 + addl INCX, X + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movlps 0 * SIZE(Y), %xmm7 + movhps 1 * SIZE(Y), %xmm7 + addl INCY, Y + mulpd %xmm5, %xmm3 + movlps 0 * SIZE(X), %xmm5 + movhps 1 * SIZE(X), %xmm5 + addl INCX, X + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movlps 0 * SIZE(Y), %xmm6 + movhps 1 * SIZE(Y), %xmm6 + addl INCY, Y + mulpd %xmm4, %xmm3 + movlps 0 * SIZE(X), %xmm4 + movhps 1 * SIZE(X), %xmm4 + addl INCX, X + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movlps 0 * SIZE(Y), %xmm7 + movhps 1 * SIZE(Y), %xmm7 + addl INCY, Y + mulpd %xmm5, %xmm3 + movlps 0 * SIZE(X), %xmm5 + movhps 1 * SIZE(X), %xmm5 + addl INCX, X + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + movlps 0 * SIZE(Y), %xmm6 + movhps 1 * SIZE(Y), %xmm6 + addl INCY, Y + mulpd %xmm4, %xmm3 + movlps 0 * SIZE(X), %xmm4 + movhps 1 * SIZE(X), %xmm4 + addl INCX, X + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movlps 0 * SIZE(Y), %xmm7 + movhps 1 * SIZE(Y), %xmm7 + addl INCY, Y + mulpd %xmm5, %xmm3 + movlps 0 * SIZE(X), %xmm5 + movhps 1 * SIZE(X), %xmm5 + addl INCX, X + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm4, %xmm3 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + mulpd %xmm5, %xmm3 + addpd %xmm3, %xmm1 + ALIGN_3 + +.L55: + testl $4, N + jle .L56 + + movlps 0 * SIZE(X), %xmm4 + movhps 1 * SIZE(X), %xmm4 + addl INCX, X + movlps 0 * SIZE(Y), %xmm6 + movhps 1 * SIZE(Y), %xmm6 + addl INCY, Y + + movlps 0 * SIZE(X), %xmm5 + movhps 1 * SIZE(X), %xmm5 + addl INCX, X + movlps 0 * SIZE(Y), %xmm7 + movhps 1 * SIZE(Y), %xmm7 + addl INCY, Y + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd 
%xmm6, %xmm0 + movlps 0 * SIZE(Y), %xmm6 + movhps 1 * SIZE(Y), %xmm6 + addl INCY, Y + mulpd %xmm4, %xmm3 + movlps 0 * SIZE(X), %xmm4 + movhps 1 * SIZE(X), %xmm4 + addl INCX, X + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + movlps 0 * SIZE(Y), %xmm7 + movhps 1 * SIZE(Y), %xmm7 + addl INCY, Y + mulpd %xmm5, %xmm3 + movlps 0 * SIZE(X), %xmm5 + movhps 1 * SIZE(X), %xmm5 + addl INCX, X + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm4, %xmm3 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + mulpd %xmm5, %xmm3 + addpd %xmm3, %xmm1 + ALIGN_3 + +.L56: + testl $2, N + jle .L57 + + movlps 0 * SIZE(X), %xmm4 + movhps 1 * SIZE(X), %xmm4 + addl INCX, X + movlps 0 * SIZE(Y), %xmm6 + movhps 1 * SIZE(Y), %xmm6 + addl INCY, Y + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm4, %xmm3 + addpd %xmm3, %xmm1 + + movlps 0 * SIZE(X), %xmm5 + movhps 1 * SIZE(X), %xmm5 + addl INCX, X + movlps 0 * SIZE(Y), %xmm7 + movhps 1 * SIZE(Y), %xmm7 + addl INCY, Y + + pshufd $0x4e, %xmm7, %xmm3 + mulpd %xmm5, %xmm7 + addpd %xmm7, %xmm0 + mulpd %xmm5, %xmm3 + addpd %xmm3, %xmm1 + ALIGN_3 + +.L57: + testl $1, N + jle .L98 + + movlps 0 * SIZE(X), %xmm4 + movhps 1 * SIZE(X), %xmm4 + movlps 0 * SIZE(Y), %xmm6 + movhps 1 * SIZE(Y), %xmm6 + + pshufd $0x4e, %xmm6, %xmm3 + mulpd %xmm4, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm4, %xmm3 + addpd %xmm3, %xmm1 + ALIGN_3 + +.L98: + pshufd $0x4e, %xmm0, %xmm2 + pshufd $0x4e, %xmm1, %xmm3 + +#ifndef CONJ + subsd %xmm2, %xmm0 + addsd %xmm3, %xmm1 +#else + addsd %xmm2, %xmm0 + subsd %xmm3, %xmm1 +#endif + +.L999: + movl RESULT, %eax + + movlps %xmm0, 0 * SIZE(%eax) + movlps %xmm1, 1 * SIZE(%eax) + + popl %ebx + popl %esi + popl %edi + ret + EPILOGUE diff --git a/kernel/x86/zgemm3m_kernel_1x4_athlon.S b/kernel/x86/zgemm3m_kernel_1x4_athlon.S new file mode 100644 index 0000000000..c57a8cb7a8 --- /dev/null +++ b/kernel/x86/zgemm3m_kernel_1x4_athlon.S @@ -0,0 +1,979 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define J 0 + STACK(%esp) +#define I 4 + STACK(%esp) +#define KK 8 + STACK(%esp) +#define KKK 12 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#ifdef DOUBLE +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 24 + STACK + ARGS(%esp) +#define STACK_A 32 + STACK + ARGS(%esp) +#define STACK_B 36 + STACK + ARGS(%esp) +#define C 40 + STACK + ARGS(%esp) +#define STACK_LDC 44 + STACK + ARGS(%esp) +#else +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 20 + STACK + ARGS(%esp) +#define STACK_A 24 + STACK + ARGS(%esp) +#define STACK_B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define STACK_LDC 36 + STACK + ARGS(%esp) +#endif + +#define A %edx +#define B %ecx +#define B_ORIG %ebx +#define LDC %ebp + +#define PREFETCHSIZE (5 + 8 * 10) + +/* + + A hint of scheduling is received from following URL + + http://www.netlib.org/atlas/atlas-comm/msg00260.html + + Julian's code is still faster than mine, since Athlon has big + defect ... So this is a sample coding and please don't expect too + much. 
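+
+   Background on the 3M scheme this kernel serves (a scalar sketch only,
+   not the exact driver code; ar/ai, br/bi, cr/ci denote the real and
+   imaginary parts of illustrative scalar operands): a complex
+   multiply-accumulate can be formed from three real products instead
+   of four,
+
+       t1 = ar * br;
+       t2 = ai * bi;
+       t3 = (ar + ai) * (br + bi);
+       cr += t1 - t2;
+       ci += t3 - t1 - t2;
+
+   The kernel below only performs one real inner product per output
+   element and applies ALPHA_R / ALPHA_I when the result is added into
+   the interleaved complex C; the splitting of A and B into real parts
+   and the recombination above are presumably handled by the gemm3m
+   level-3 driver that calls this kernel.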
+ +*/ + + PROLOGUE + + subl $ARGS, %esp # Generate Stack Frame + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(TRMMKERNEL) && !defined(LEFT) + movl OFFSET, %eax + negl %eax + movl %eax, KK +#endif + + movl STACK_B, B_ORIG + movl STACK_LDC, LDC + + sall $ZBASE_SHIFT, LDC + + subl $-16 * SIZE, B_ORIG + subl $-16 * SIZE, STACK_A + + movl M, %eax + testl %eax, %eax + jle .L999 + + movl N, %eax + testl %eax, %eax + jle .L999 + + movl K, %eax + testl %eax, %eax + jle .L999 + + movl N, %eax + sarl $2, %eax + movl %eax, J + je .L20 + ALIGN_3 + +.L11: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl STACK_A, A + movl C, %edi + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl B_ORIG, B +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (B_ORIG, %eax, 4), B +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $4, %eax + jle .L13 + ALIGN_4 + +.L12: + movl -16 * SIZE(B), %esi + movl -8 * SIZE(B), %esi + movl 0 * SIZE(B), %esi + movl 8 * SIZE(B), %esi + movl 16 * SIZE(B), %esi + movl 24 * SIZE(B), %esi + movl 32 * SIZE(B), %esi + movl 40 * SIZE(B), %esi + subl $-64 * SIZE, B + decl %eax + jne .L12 + ALIGN_3 + +.L13: + movl M, %esi + movl %esi, I + ALIGN_3 + +.L14: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl B_ORIG, B +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (A, %eax, 1), A + leal (B_ORIG, %eax, 4), B +#endif + + leal (%edi, LDC, 2), %eax + + fldz + fldz + fldz + fldz + + FLD -8 * SIZE(A) + FLD -16 * SIZE(A) + FLD -16 * SIZE(B) + + movl $32 * SIZE, %esi + +#ifdef HAVE_3DNOW + prefetchw 1 * SIZE(%edi) + prefetchw 2 * SIZE(%edi, LDC) + prefetchw 1 * SIZE(%eax) + prefetchw 2 * SIZE(%eax, LDC) +#elif defined(HAVE_SSE) + prefetcht0 1 * SIZE(%edi) + prefetcht0 1 * SIZE(%edi, LDC) + prefetcht0 1 * SIZE(%eax) + prefetcht0 1 * SIZE(%eax, LDC) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L16 + ALIGN_3 + +.L15: + fmul %st(1), %st + faddp %st, %st(3) + PADDING + FLD -15 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + PADDING + FLD -14 * SIZE(B) + +#if L1_DATA_LINESIZE == 32 +#ifdef HAVE_3DNOW + PADDING prefetch (PREFETCHSIZE - 4) * SIZE(A) +#elif defined(HAVE_SSE) + PADDING prefetcht0 (PREFETCHSIZE - 4) * SIZE(A) +#endif +#endif + + fmul %st(1), %st + faddp %st, %st(5) + PADDING + FMUL -13 * SIZE(B) + + faddp %st, %st(5) + FLD -15 * SIZE(A) + FLD -12 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(3) + PADDING + FLD -11 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + PADDING + FLD -10 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(5) + PADDING + FMUL -9 * SIZE(B) + + faddp %st, %st(5) + FLD -14 * SIZE(A) + FLD -8 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(3) + PADDING + FLD -7 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + PADDING + FLD -6 * SIZE(B) + + fmul %st(1), %st + 
faddp %st, %st(5) + PADDING + FMUL -5 * SIZE(B) + + faddp %st, %st(5) + FLD -13 * SIZE(A) + FLD -4 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(3) + PADDING + FLD -3 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + PADDING + FLD -2 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(5) + PADDING + FMUL -1 * SIZE(B) + + faddp %st, %st(5) + FLD -12 * SIZE(A) + FLD 0 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(3) + PADDING + FLD 1 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + PADDING + FLD 2 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(5) + PADDING + FMUL 3 * SIZE(B) + + faddp %st, %st(5) + FLD -11 * SIZE(A) + FLD 4 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(3) + PADDING + FLD 5 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + PADDING + FLD 6 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(5) + PADDING + FMUL 7 * SIZE(B) + + faddp %st, %st(5) + FLD -10 * SIZE(A) + FLD 8 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(3) + PADDING + FLD 9 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + PADDING + FLD 10 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(5) + PADDING + FMUL 11 * SIZE(B) + + faddp %st, %st(5) + FLD -9 * SIZE(A) + FLD 12 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(3) + PADDING + FLD 13 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + PADDING + FLD 14 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(5) + PADDING + FMUL 15 * SIZE(B) + + faddp %st, %st(5) + FLD 0 * SIZE(A) + +#ifdef HAVE_3DNOW + PADDING prefetch PREFETCHSIZE * SIZE(A) +#elif defined(HAVE_SSE) + PADDING prefetcht0 PREFETCHSIZE * SIZE(A) +#endif + + addl $8 * SIZE, A + fxch %st(1) + addl $32 * SIZE, B + + FLD -16 * SIZE(B) + decl %eax + jne .L15 + ALIGN_4 + +.L16: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + and $7, %eax + je .L19 + ALIGN_4 + +.L17: + fmul %st(1), %st + faddp %st, %st(3) + + FLD -15 * SIZE(B) + fmul %st(1), %st + faddp %st, %st(4) + + FLD -14 * SIZE(B) + fmul %st(1), %st + faddp %st, %st(5) + + FMUL -13 * SIZE(B) + faddp %st, %st(5) + FLD -15 * SIZE(A) + FLD -12 * SIZE(B) + + addl $1 * SIZE,A + addl $4 * SIZE,B + + decl %eax + jne .L17 + ALIGN_4 + +.L19: + ffreep %st(0) + ffreep %st(0) + ffreep %st(0) + + leal (%edi, LDC, 2), %eax + + FLD ALPHA_I + FLD ALPHA_R + + fld %st(2) + fmul %st(1), %st + + FLD 0 * SIZE(%edi) + faddp %st, %st(1) + FST 0 * SIZE(%edi) + + fld %st(3) + fmul %st(1), %st + + FLD 0 * SIZE(%edi, LDC) + faddp %st, %st(1) + FST 0 * SIZE(%edi, LDC) + + fld %st(4) + fmul %st(1), %st + + FLD 0 * SIZE(%eax) + faddp %st, %st(1) + FST 0 * SIZE(%eax) + + fmul %st(5), %st + + FLD 0 * SIZE(%eax, LDC) + faddp %st, %st(1) + FST 0 * SIZE(%eax, LDC) + + fmul %st, %st(1) + fmul %st, %st(2) + fmul %st, %st(3) + fmulp %st, %st(4) + + FLD 1 * SIZE(%edi) + faddp %st, %st(1) + FST 1 * SIZE(%edi) + + FLD 1 * SIZE(%edi, LDC) + faddp %st, %st(1) + FST 1 * SIZE(%edi, LDC) + + FLD 1 * SIZE(%eax) + faddp %st, %st(1) + FST 1 * SIZE(%eax) + + FLD 1 * SIZE(%eax, LDC) + faddp %st, %st(1) + FST 1 * SIZE(%eax, LDC) + + addl $2 * SIZE, %edi + decl I + jne .L14 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + + leal (, LDC, 4), %eax + addl %eax, C + movl B, B_ORIG + decl J + jne .L11 + ALIGN_4 + +.L20: + movl N, %eax + andl $2, %eax + je .L30 + ALIGN_3 + +.L21: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl STACK_A, A + movl C, %edi + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + 
movl B_ORIG, B +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (B_ORIG, %eax, 2), B +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $4, %eax + jle .L23 + ALIGN_4 + +.L22: + movl -16 * SIZE(B), %esi + movl -8 * SIZE(B), %esi + movl 0 * SIZE(B), %esi + movl 8 * SIZE(B), %esi + subl $-32 * SIZE, B + decl %eax + jne .L22 + ALIGN_3 + +.L23: + movl M, %esi + movl %esi, I + ALIGN_3 + +.L24: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl B_ORIG, B +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (A, %eax, 1), A + leal (B_ORIG, %eax, 2), B +#endif + + fldz + fldz + fldz + fldz + + FLD -16 * SIZE(A) + FLD -16 * SIZE(B) + + prefetchw 1 * SIZE(%edi) + prefetchw 1 * SIZE(%edi, LDC) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L26 + ALIGN_3 + +.L25: + fmul %st(1), %st + faddp %st, %st(2) + + FMUL -15 * SIZE(B) + faddp %st, %st(2) + + FLD -15 * SIZE(A) + FLD -14 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + + FMUL -13 * SIZE(B) + faddp %st, %st(4) + + FLD -14 * SIZE(A) + FLD -12 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(2) + + FMUL -11 * SIZE(B) + faddp %st, %st(2) + + FLD -13 * SIZE(A) + FLD -10 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + + FMUL -9 * SIZE(B) + faddp %st, %st(4) + + FLD -12 * SIZE(A) + FLD -8 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(2) + + FMUL -7 * SIZE(B) + faddp %st, %st(2) + + FLD -11 * SIZE(A) + FLD -6 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + + FMUL -5 * SIZE(B) + faddp %st, %st(4) + + FLD -10 * SIZE(A) + FLD -4 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(2) + + FMUL -3 * SIZE(B) + faddp %st, %st(2) + + FLD -9 * SIZE(A) + FLD -2 * SIZE(B) + + fmul %st(1), %st + faddp %st, %st(4) + + FMUL -1 * SIZE(B) + faddp %st, %st(4) + + FLD -8 * SIZE(A) + FLD 0 * SIZE(B) + + addl $ 8 * SIZE, A + subl $-16 * SIZE, B + + decl %eax + jne .L25 + ALIGN_4 + +.L26: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + and $7, %eax + je .L29 + ALIGN_4 + +.L27: + fmul %st(1), %st + faddp %st, %st(2) + + FMUL -15 * SIZE(B) + faddp %st, %st(2) + + FLD -15 * SIZE(A) + FLD -14 * SIZE(B) + + addl $1 * SIZE,A + addl $2 * SIZE,B + + decl %eax + jne .L27 + ALIGN_4 + +.L29: + ffreep %st(0) + ffreep %st(0) + + faddp %st, %st(2) + faddp %st, %st(2) + + FLD ALPHA_I + FLD ALPHA_R + + fld %st(2) + fmul %st(1), %st + + FLD 0 * SIZE(%edi) + faddp %st, %st(1) + FST 0 * SIZE(%edi) + + fmul %st(3), %st + + FLD 0 * SIZE(%edi, LDC) + faddp %st, %st(1) + FST 0 * SIZE(%edi, LDC) + + fmul %st, %st(1) + fmulp %st, %st(2) + + FLD 1 * SIZE(%edi) + faddp %st, %st(1) + FST 1 * SIZE(%edi) + + FLD 1 * SIZE(%edi, LDC) + faddp %st, %st(1) + FST 1 * SIZE(%edi, LDC) + + addl $2 * SIZE, %edi + decl I + jne .L24 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leal (, LDC, 2), %eax + addl %eax, C + movl B, B_ORIG + ALIGN_4 + +.L30: + movl N, %eax + andl $1, %eax + je .L999 + ALIGN_3 + +.L31: +#if defined(TRMMKERNEL) && defined(LEFT) + movl 
OFFSET, %eax + movl %eax, KK +#endif + + movl STACK_A, A + movl C, %edi + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl B_ORIG, B +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (B_ORIG, %eax, 1), B +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $5, %eax + jle .L33 + ALIGN_4 + +.L32: + movl -16 * SIZE(B), %esi + movl -8 * SIZE(B), %esi + movl 0 * SIZE(B), %esi + movl 8 * SIZE(B), %esi + subl $-32 * SIZE, B + decl %eax + jne .L32 + ALIGN_3 + +.L33: + movl M, %esi + movl %esi, I + ALIGN_3 + +.L34: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl B_ORIG, B +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (A, %eax, 1), A + leal (B_ORIG, %eax, 1), B +#endif + + fldz + fldz + fldz + fldz + + prefetchw 1 * SIZE(%edi) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L36 + ALIGN_3 + +.L35: + FLD -16 * SIZE(A) + FMUL -16 * SIZE(B) + faddp %st, %st(1) + + FLD -15 * SIZE(A) + FMUL -15 * SIZE(B) + faddp %st, %st(2) + + FLD -14 * SIZE(A) + FMUL -14 * SIZE(B) + faddp %st, %st(3) + + FLD -13 * SIZE(A) + FMUL -13 * SIZE(B) + faddp %st, %st(4) + + FLD -12 * SIZE(A) + FMUL -12 * SIZE(B) + faddp %st, %st(1) + + FLD -11 * SIZE(A) + FMUL -11 * SIZE(B) + faddp %st, %st(2) + + FLD -10 * SIZE(A) + FMUL -10 * SIZE(B) + faddp %st, %st(3) + + FLD -9 * SIZE(A) + FMUL -9 * SIZE(B) + faddp %st, %st(4) + + addl $8 * SIZE, A + addl $8 * SIZE, B + + decl %eax + jne .L35 + ALIGN_4 + +.L36: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + and $7, %eax + je .L39 + ALIGN_4 + +.L37: + FLD -16 * SIZE(A) + FMUL -16 * SIZE(B) + faddp %st, %st(1) + + addl $1 * SIZE,A + addl $1 * SIZE,B + decl %eax + jne .L37 + ALIGN_4 + +.L39: + faddp %st, %st(2) + faddp %st, %st(2) + faddp %st, %st(1) + + FLD ALPHA_I + FLD ALPHA_R + + fmul %st(2), %st + + FLD 0 * SIZE(%edi) + faddp %st, %st(1) + FST 0 * SIZE(%edi) + + fmulp %st, %st(1) + + FLD 1 * SIZE(%edi) + faddp %st, %st(1) + FST 1 * SIZE(%edi) + + addl $2 * SIZE, %edi + decl I + jne .L34 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $1, KK +#endif + + addl LDC, C + movl B, B_ORIG + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm3m_kernel_2x2_atom.S b/kernel/x86/zgemm3m_kernel_2x2_atom.S new file mode 100644 index 0000000000..ee918bfc03 --- /dev/null +++ b/kernel/x86/zgemm3m_kernel_2x2_atom.S @@ -0,0 +1,734 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 24 + STACK + ARGS(%esp) +#define A 32 + STACK + ARGS(%esp) +#define ARG_B 36 + STACK + ARGS(%esp) +#define C 40 + STACK + ARGS(%esp) +#define ARG_LDC 44 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define BX 4 + STACK(%esp) +#define KK 8 + STACK(%esp) +#define KKK 12 + STACK(%esp) + +#define PREFETCH prefetcht0 +#define PREFETCHSIZE 84 + +#define AA %edx +#define BB %ecx +#define CO1 %esi +#define LDC %ebp +#define B %edi + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + +#ifdef TRMMKERNEL + movl OFFSET, %eax +#ifndef LEFT + negl %eax +#endif + movl %eax, KK +#endif + + sall $ZBASE_SHIFT, LDC + + movl N, %eax + sarl $1, %eax + movl %eax, J + jle .L30 + ALIGN_2 + +.L10: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + sall $BASE_SHIFT + 1, %eax + leal (B, %eax), %eax + movl %eax, BX + + movl C, CO1 # coffset = c + leal (, LDC, 2), %eax + addl %eax, C + + movl A, AA # aoffset = a + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), BB +#endif + + movl BX, %eax + prefetcht0 0 * SIZE(%eax) + subl $-8 * SIZE, BX + + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + + xorps %xmm4, %xmm4 + prefetcht0 3 * SIZE(CO1) + xorps %xmm5, %xmm5 + prefetcht0 3 * SIZE(CO1, LDC) + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, 
%eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $2, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addsd %xmm2, %xmm6 + movsd 1 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 0 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm7 + mulsd 1 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 0 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + mulsd 1 * SIZE(BB), %xmm3 + + addsd %xmm2, %xmm6 + movsd 3 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm7 + mulsd 3 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 4 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 2 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + mulsd 3 * SIZE(BB), %xmm3 + + addsd %xmm2, %xmm6 + movsd 5 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 4 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm7 + mulsd 5 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 6 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 4 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + mulsd 5 * SIZE(BB), %xmm3 + + addsd %xmm2, %xmm6 + movsd 7 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm7 + mulsd 7 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 8 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 6 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + mulsd 7 * SIZE(BB), %xmm3 + + addl $8 * SIZE, BB + addl $8 * SIZE, AA + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $3, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + addsd %xmm2, %xmm6 + movsd 1 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 0 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm7 + mulsd 1 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 0 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + mulsd 1 * SIZE(BB), %xmm3 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + movsd ALPHA_R, %xmm0 + movsd ALPHA_I, %xmm1 + + addsd %xmm2, %xmm6 + addsd %xmm3, %xmm7 + + movaps %xmm4, %xmm2 + mulsd %xmm0, %xmm4 + mulsd %xmm1, %xmm2 + + movaps %xmm6, %xmm3 + mulsd %xmm0, %xmm6 + mulsd %xmm1, %xmm3 + + addsd 0 * SIZE(CO1), %xmm4 + addsd 1 * SIZE(CO1), %xmm2 + addsd 2 * SIZE(CO1), %xmm6 + addsd 3 * SIZE(CO1), %xmm3 + + movlps %xmm4, 0 * SIZE(CO1) + movlps %xmm2, 1 * SIZE(CO1) + movlps %xmm6, 2 * SIZE(CO1) + movlps %xmm3, 3 * SIZE(CO1) + + movaps %xmm5, %xmm2 + mulsd %xmm0, %xmm5 + mulsd %xmm1, %xmm2 + + movaps %xmm7, %xmm3 + mulsd %xmm0, %xmm7 + mulsd %xmm1, %xmm3 + + addsd 0 * SIZE(CO1, LDC), %xmm5 + addsd 1 * SIZE(CO1, LDC), %xmm2 + addsd 2 * SIZE(CO1, LDC), %xmm7 + addsd 3 * SIZE(CO1, LDC), %xmm3 + + movlps %xmm5, 0 * SIZE(CO1, LDC) + movlps %xmm2, 1 * SIZE(CO1, LDC) + movlps %xmm7, 2 * SIZE(CO1, LDC) + movlps %xmm3, 3 * SIZE(CO1, LDC) + + addl $4 * SIZE, CO1 + decl %ebx + jg .L11 + ALIGN_4 + +.L20: + movl M, %ebx + testl $1, %ebx + jle .L29 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), BB +#endif + + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movl K, %eax +#elif 
(defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $2, %eax + je .L25 + ALIGN_4 + +.L22: + addsd %xmm2, %xmm4 + movsd 0 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm5 + movsd 1 * SIZE(BB), %xmm3 + + mulsd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulsd %xmm0, %xmm3 + movsd 1 * SIZE(AA), %xmm0 + + addsd %xmm2, %xmm4 + movsd 2 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm5 + movsd 3 * SIZE(BB), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + movsd 2 * SIZE(AA), %xmm0 + + addsd %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm5 + movsd 5 * SIZE(BB), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + movsd 3 * SIZE(AA), %xmm0 + + addsd %xmm2, %xmm4 + movsd 6 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm5 + movsd 7 * SIZE(BB), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + movsd 4 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $3, %eax # if (k & 1) + BRANCH + je .L28 + ALIGN_3 + +.L26: + addsd %xmm2, %xmm4 + movsd 0 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm5 + movsd 1 * SIZE(BB), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + movsd 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: + movsd ALPHA_R, %xmm0 + movsd ALPHA_I, %xmm1 + + addsd %xmm2, %xmm4 + addsd %xmm3, %xmm5 + + movaps %xmm4, %xmm2 + mulsd %xmm0, %xmm4 + mulsd %xmm1, %xmm2 + + movaps %xmm5, %xmm3 + mulsd %xmm0, %xmm5 + mulsd %xmm1, %xmm3 + + addsd 0 * SIZE(CO1), %xmm4 + addsd 1 * SIZE(CO1), %xmm2 + addsd 0 * SIZE(CO1, LDC), %xmm5 + addsd 1 * SIZE(CO1, LDC), %xmm3 + + movlps %xmm4, 0 * SIZE(CO1) + movlps %xmm2, 1 * SIZE(CO1) + movlps %xmm5, 0 * SIZE(CO1, LDC) + movlps %xmm3, 1 * SIZE(CO1, LDC) + ALIGN_4 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + movl BB, B + decl J + jg .L10 + ALIGN_4 + +.L30: + testl $1, N + je .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl C, CO1 + addl LDC, C + + movl A, AA + + movl M, %ebx + sarl $1, %ebx + jle .L40 + ALIGN_4 + +.L31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), BB +#endif + + movsd 0 * SIZE(BB), %xmm1 + xorps %xmm0, %xmm0 + prefetcht0 3 * SIZE(CO1) + xorps %xmm2, %xmm2 + xorps %xmm4, %xmm4 + xorps %xmm6, %xmm6 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $2, %eax + je .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addsd %xmm0, %xmm4 + movsd 0 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm6 + movsd 1 * SIZE(AA), %xmm2 + mulsd %xmm1, %xmm0 + mulsd %xmm1, %xmm2 + movsd 1 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm6 + movsd 3 * SIZE(AA), %xmm2 + mulsd %xmm1, %xmm0 + mulsd %xmm1, %xmm2 + movsd 2 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 4 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm6 + movsd 5 * SIZE(AA), %xmm2 + mulsd 
%xmm1, %xmm0 + mulsd %xmm1, %xmm2 + movsd 3 * SIZE(BB), %xmm1 + + addsd %xmm0, %xmm4 + movsd 6 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm6 + movsd 7 * SIZE(AA), %xmm2 + mulsd %xmm1, %xmm0 + mulsd %xmm1, %xmm2 + movsd 4 * SIZE(BB), %xmm1 + + addl $8 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $3, %eax # if (k & 1) + BRANCH + je .L38 + ALIGN_3 + +.L36: + addsd %xmm0, %xmm4 + movsd 0 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm6 + movsd 1 * SIZE(AA), %xmm2 + mulsd %xmm1, %xmm0 + mulsd %xmm1, %xmm2 + movsd 1 * SIZE(BB), %xmm1 + + addl $2 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L36 + ALIGN_4 + +.L38: + addsd %xmm0, %xmm4 + addsd %xmm2, %xmm6 + + movsd ALPHA_R, %xmm0 + movsd ALPHA_I, %xmm1 + + movaps %xmm4, %xmm2 + mulsd %xmm0, %xmm4 + mulsd %xmm1, %xmm2 + + movaps %xmm6, %xmm3 + mulsd %xmm0, %xmm6 + mulsd %xmm1, %xmm3 + + addsd 0 * SIZE(CO1), %xmm4 + addsd 1 * SIZE(CO1), %xmm2 + addsd 2 * SIZE(CO1), %xmm6 + addsd 3 * SIZE(CO1), %xmm3 + + movlps %xmm4, 0 * SIZE(CO1) + movlps %xmm2, 1 * SIZE(CO1) + movlps %xmm6, 2 * SIZE(CO1) + movlps %xmm3, 3 * SIZE(CO1) + + addl $4 * SIZE, CO1 + decl %ebx + jg .L31 + ALIGN_4 + +.L40: + movl M, %ebx + testl $1, %ebx + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), BB +#endif + + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 + movsd 0 * SIZE(BB), %xmm2 + xorps %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $2, %eax + je .L45 + ALIGN_4 + +.L42: + mulsd %xmm0, %xmm2 + movsd 1 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + movsd 1 * SIZE(BB), %xmm2 + + mulsd %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm5 + movsd 2 * SIZE(BB), %xmm2 + + mulsd %xmm0, %xmm2 + movsd 3 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + movsd 3 * SIZE(BB), %xmm2 + + mulsd %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm5 + movsd 4 * SIZE(BB), %xmm2 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jne .L42 + ALIGN_4 + +.L45: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $3, %eax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + mulsd %xmm0, %xmm2 + movsd 1 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + movsd 1 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L46 + ALIGN_4 + +.L48: + addsd %xmm5, %xmm4 + + movsd ALPHA_R, %xmm0 + movsd ALPHA_I, %xmm1 + + movaps %xmm4, %xmm2 + mulsd %xmm0, %xmm4 + mulsd %xmm1, %xmm2 + + addsd 0 * SIZE(CO1), %xmm4 + addsd 1 * SIZE(CO1), %xmm2 + + movlps %xmm4, 0 * SIZE(CO1) + movlps %xmm2, 1 * SIZE(CO1) + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm3m_kernel_2x2_coppermine.S b/kernel/x86/zgemm3m_kernel_2x2_coppermine.S new file mode 100644 index 0000000000..674829f80a --- /dev/null +++ b/kernel/x86/zgemm3m_kernel_2x2_coppermine.S @@ -0,0 +1,722 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define J 0 + STACK(%esp) +#define BX 4 + STACK(%esp) +#define KK 8 + STACK(%esp) +#define KKK 12 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#ifdef DOUBLE +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 24 + STACK + ARGS(%esp) +#define A 32 + STACK + ARGS(%esp) +#define B 36 + STACK + ARGS(%esp) +#define C 40 + STACK + ARGS(%esp) +#define LDC 44 + STACK + ARGS(%esp) +#else +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 20 + STACK + ARGS(%esp) +#define A 24 + STACK + ARGS(%esp) +#define B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define LDC 36 + STACK + ARGS(%esp) +#endif + +#define PREFETCH_OFFSET 48 + +#if defined(PENTIUM3) || defined(PENTIUMM) +#define REP rep +#else +#define REP rep +#endif + + PROLOGUE + + subl $ARGS, %esp # Generate Stack Frame + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(TRMMKERNEL) && !defined(LEFT) + movl OFFSET, %eax + negl %eax + movl %eax, KK +#endif + + movl N, %eax # j = (n >> 1) # MEMORY + movl LDC, %ebp # ldc # MEMORY + movl B, %ebx + + sall $ZBASE_SHIFT, %ebp + + sarl $1, %eax + + leal 0(%ecx) , %ecx # NOP + movl %eax, J # j = (n >> 1) # MEMORY + test %eax, %eax + je .L8 # if !(n >> 1) goto .L8 + ALIGN_4 + +.L34: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl %ebx, BX + + movl M, %esi # m # MEMORY + movl A, %edx # a # MEMORY + movl C, %edi # C # MEMORY + sarl $1, %esi # i = (m >> 1) + je .L12 + ALIGN_4 + +.MainHead: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && 
!defined(LEFT) && !defined(TRANSA)) + movl %ebx, %ecx +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (%edx, %eax, 2), %edx + leal (%ebx, %eax, 2), %ecx +#endif + +#ifdef HAVE_SSE + movl BX, %eax + + prefetcht2 0 * SIZE(%eax) + prefetcht2 4 * SIZE(%eax) + +#if L2_SIZE > 262144 + + subl $-8 * SIZE, BX + +#elif L2_SIZE > 131072 + + prefetcht2 8 * SIZE(%eax) + prefetcht2 12 * SIZE(%eax) + + + subl $-16 * SIZE, BX +#else + prefetcht2 16 * SIZE(%eax) + prefetcht2 20 * SIZE(%eax) + prefetcht2 24 * SIZE(%eax) + prefetcht2 28 * SIZE(%eax) + + subl $-32 * SIZE, BX +#endif +#endif + + fldz + fldz + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + fldz + fldz + + FLD 4 * SIZE(%ecx) # b5 + FLD 4 * SIZE(%edx) # a5 + FLD 0 * SIZE(%ecx) # b1 + FLD 0 * SIZE(%edx) # a1 + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(%edi) + prefetchw 2 * SIZE(%edi, %ebp, 1) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(%edi) + prefetchnta 2 * SIZE(%edi, %ebp, 1) +#endif + sarl $2, %eax + je .L16 + ALIGN_4 + +.MainLoop: +#if defined(HAVE_3DNOW) + prefetch (PREFETCH_OFFSET) * SIZE(%ecx) + nop +#elif defined(HAVE_SSE) + prefetchnta (PREFETCH_OFFSET) * SIZE(%ecx) +#ifdef CORE_KATMAI + prefetcht0 (PREFETCH_OFFSET) * SIZE(%edx) +#endif +#endif + + fmul %st, %st(1) + FMUL 1 * SIZE(%ecx) + fxch %st(1) + faddp %st, %st(4) + FLD 0 * SIZE(%ecx) + fxch %st(1) + faddp %st, %st(5) + FLD 1 * SIZE(%edx) + fmul %st, %st(1) + FMUL 1 * SIZE(%ecx) + fxch %st(1) + faddp %st, %st(6) + FLD 2 * SIZE(%ecx) + fxch %st(1) + faddp %st, %st(7) + FLD 2 * SIZE(%edx) + + fmul %st, %st(1) + FMUL 3 * SIZE(%ecx) + fxch %st(1) + faddp %st, %st(4) + FLD 2 * SIZE(%ecx) + fxch %st(1) + faddp %st, %st(5) + FLD 3 * SIZE(%edx) + fmul %st, %st(1) + FMUL 3 * SIZE(%ecx) + fxch %st(1) + faddp %st, %st(6) + FLD 8 * SIZE(%ecx) + fxch %st(1) + faddp %st, %st(7) + FLD 8 * SIZE(%edx) + fxch %st(2) + +#if !defined(HAVE_3DNOW) && defined(HAVE_SSE) && defined(DOUBLE) + prefetchnta (PREFETCH_OFFSET + 4) * SIZE(%ecx) +#ifdef CORE_KATMAI + prefetcht0 (PREFETCH_OFFSET + 4) * SIZE(%edx) +#endif +#endif + + fmul %st, %st(3) + FMUL 5 * SIZE(%ecx) + fxch %st(3) + faddp %st, %st(4) + FLD 4 * SIZE(%ecx) + fxch %st(3) + faddp %st, %st(5) + FLD 5 * SIZE(%edx) + fmul %st, %st(3) + FMUL 5 * SIZE(%ecx) + fxch %st(3) + faddp %st, %st(6) + FLD 6 * SIZE(%ecx) + fxch %st(3) + faddp %st, %st(7) + FLD 6 * SIZE(%edx) + + fmul %st, %st(3) + FMUL 7 * SIZE(%ecx) + fxch %st(3) + faddp %st, %st(4) + FLD 6 * SIZE(%ecx) + fxch %st(3) + faddp %st, %st(5) + FLD 7 * SIZE(%edx) + fmul %st, %st(3) + FMUL 7 * SIZE(%ecx) + fxch %st(3) + faddp %st, %st(6) + FLD 12 * SIZE(%ecx) + fxch %st(3) + faddp %st, %st(7) + FLD 12 * SIZE(%edx) + fxch %st(2) + + subl $-8 * SIZE, %ecx + subl $-8 * SIZE, %edx + decl %eax # l -- + jne .MainLoop + ALIGN_4 + +.L16: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + and $3, %eax + je .L21 + ALIGN_4 + +.SubLoop: + fmul %st, %st(1) + FMUL 1 * SIZE(%ecx) + fxch %st(1) + faddp %st, %st(4) + FLD 0 * SIZE(%ecx) + fxch %st(1) + faddp %st, %st(5) + FLD 1 * SIZE(%edx) + fmul %st, %st(1) + FMUL 1 * SIZE(%ecx) + fxch %st(1) + faddp %st, %st(6) + FLD 2 * SIZE(%ecx) + fxch %st(1) + faddp %st, %st(7) + FLD 2 * SIZE(%edx) + + addl $2 * SIZE,%ecx + addl $2 * SIZE,%edx + decl %eax + jne .SubLoop + ALIGN_4 + +.L21: + ffreep %st(0) + ffreep %st(0) 
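+/* Editor's annotation, not part of the imported GotoBLAS2 source:       */
+/* the four ffreep instructions here pop the a/b operands that the       */
+/* software-pipelined loop keeps preloaded on the x87 stack, leaving     */
+/* only the four accumulators, which are then scaled by ALPHA_R/ALPHA_I  */
+/* and added into the 2x2 tile of C below.                               */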
+ ffreep %st(0) + ffreep %st(0) + + FLD ALPHA_I + FLD ALPHA_R + + fld %st(2) + fmul %st(1), %st + + FLD 0 * SIZE(%edi) + faddp %st, %st(1) + FST 0 * SIZE(%edi) + + fld %st(3) + fmul %st(1), %st + + FLD 0 * SIZE(%edi, %ebp) + faddp %st, %st(1) + FST 0 * SIZE(%edi, %ebp) + + fld %st(4) + fmul %st(1), %st + + FLD 2 * SIZE(%edi) + faddp %st, %st(1) + FST 2 * SIZE(%edi) + + fmul %st(5), %st + + FLD 2 * SIZE(%edi, %ebp) + faddp %st, %st(1) + FST 2 * SIZE(%edi, %ebp) + + fmul %st, %st(1) + fmul %st, %st(2) + fmul %st, %st(3) + fmulp %st, %st(4) + + FLD 1 * SIZE(%edi) + faddp %st, %st(1) + FST 1 * SIZE(%edi) + + FLD 1 * SIZE(%edi, %ebp) + faddp %st, %st(1) + FST 1 * SIZE(%edi, %ebp) + + FLD 3 * SIZE(%edi) + faddp %st, %st(1) + FST 3 * SIZE(%edi) + + FLD 3 * SIZE(%edi, %ebp) + faddp %st, %st(1) + FST 3 * SIZE(%edi, %ebp) + + addl $4 * SIZE, %edi + rep + decl %esi # i -- + rep + jne .MainHead + ALIGN_4 + +.L12: + movl M, %eax # m # MEMORY + andl $1, %eax + je .L27 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl %ebx, %ecx +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (%edx, %eax, 1), %edx + leal (%ebx, %eax, 2), %ecx +#endif + fldz + fldz + + FLD 0 * SIZE(%edx) # temp1 = *(aoffset + 0) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $1,%eax # k >> 1 # MEMORY + je .L54 + ALIGN_4 + +.L55: + FLD 0 * SIZE(%ecx) # temp2 = *(boffset + 0) + rep + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 1 * SIZE(%ecx) # temp2 = *(boffset + 0) + faddp %st, %st(2) + FLD 1 * SIZE(%edx) # temp1 = *(aoffset + 0) + + FLD 2 * SIZE(%ecx) # temp2 = *(boffset + 0) + rep + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 3 * SIZE(%ecx) # temp2 = *(boffset + 0) + faddp %st, %st(2) + FLD 2 * SIZE(%edx) # temp1 = *(aoffset + 0) + + addl $2 * SIZE, %edx + addl $4 * SIZE, %ecx + decl %eax + jne .L55 + ALIGN_4 + +.L54: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $1,%eax # k & 1 + je .L33 + ALIGN_4 + + FLD 0 * SIZE(%ecx) # temp2 = *(boffset + 0) + rep + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 1 * SIZE(%ecx) # temp2 = *(boffset + 0) + faddp %st, %st(2) + FLD 1 * SIZE(%edx) # temp1 = *(aoffset + 0) + + addl $1 * SIZE, %edx + addl $2 * SIZE, %ecx + ALIGN_4 + +.L33: + ffreep %st(0) + + FLD ALPHA_I + FLD ALPHA_R + + fld %st(2) + fmul %st(1), %st + + FLD 0 * SIZE(%edi) + faddp %st, %st(1) + FST 0 * SIZE(%edi) + + fmul %st(3), %st + + FLD 0 * SIZE(%edi, %ebp) + faddp %st, %st(1) + FST 0 * SIZE(%edi, %ebp) + + fmul %st, %st(1) + fmulp %st, %st(2) + + FLD 1 * SIZE(%edi) + faddp %st, %st(1) + FST 1 * SIZE(%edi) + + FLD 1 * SIZE(%edi, %ebp) + faddp %st, %st(1) + FST 1 * SIZE(%edi, %ebp) + ALIGN_4 + +.L27: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + lea (, %ebp, 2), %eax + addl %eax, C # C + 2 * ldc # MEMORY + movl %ecx, %ebx # b # MEMORY + decl J # j-- # MEMORY + jne .L34 + ALIGN_4 + +.L8: + movl N, %eax # n # MEMORY + andl $1, %eax + je .End + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl C, %edi # c # MEMORY + movl A, %edx # a # MEMORY + + movl M, %esi # m # MEMORY + sarl $1, %esi # m >> 1 + je .L36 + ALIGN_4 + +.L46: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && 
defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl %ebx, %ecx +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (%edx, %eax, 2), %edx + leal (%ebx, %eax, 1), %ecx +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + fldz + sarl $1, %eax + fldz + FLD 0 * SIZE(%ecx) # temp1 = *(boffset + 0) + + je .L56 + ALIGN_4 + +.L57: + FLD 0 * SIZE(%edx) # temp2 = *(aoffset + 0) + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 1 * SIZE(%edx) # temp2 = *(aoffset + 0) + faddp %st, %st(2) + FLD 1 * SIZE(%ecx) # temp1 = *(boffset + 0) + + FLD 2 * SIZE(%edx) # temp2 = *(aoffset + 0) + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 3 * SIZE(%edx) # temp2 = *(aoffset + 0) + faddp %st, %st(2) + FLD 2 * SIZE(%ecx) # temp1 = *(boffset + 0) + + addl $4 * SIZE,%edx + addl $2 * SIZE,%ecx + dec %eax + jne .L57 + ALIGN_4 + +.L56: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $1, %eax + je .L45 + ALIGN_4 + + FLD 0 * SIZE(%edx) # temp2 = *(aoffset + 0) + fmul %st(1), %st + faddp %st, %st(2) + + FMUL 1 * SIZE(%edx) # temp2 = *(aoffset + 0) + faddp %st, %st(2) + FLD 3 * SIZE(%ecx) # temp1 = *(boffset + 0) + + addl $2 * SIZE,%edx + addl $1 * SIZE,%ecx + ALIGN_4 + +.L45: + ffreep %st(0) + + FLD ALPHA_I + FLD ALPHA_R + + fld %st(2) + fmul %st(1), %st + + FLD 0 * SIZE(%edi) + faddp %st, %st(1) + FST 0 * SIZE(%edi) + + fmul %st(3), %st + + FLD 2 * SIZE(%edi) + faddp %st, %st(1) + FST 2 * SIZE(%edi) + + fmul %st, %st(1) + fmulp %st, %st(2) + + FLD 1 * SIZE(%edi) + faddp %st, %st(1) + FST 1 * SIZE(%edi) + + FLD 3 * SIZE(%edi) + faddp %st, %st(1) + FST 3 * SIZE(%edi) + + addl $4 * SIZE, %edi + + decl %esi # i -- + jne .L46 + ALIGN_4 + +.L36: + movl M, %eax # m # MEMORY + andl $1, %eax # m & 1 + je .End + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl %ebx, %ecx +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (%edx, %eax, 1), %edx + leal (%ebx, %eax, 1), %ecx +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + fldz + ALIGN_3 + +.L51: + FLD (%edx) + FMUL (%ecx) + addl $1 * SIZE,%edx + addl $1 * SIZE,%ecx + faddp %st,%st(1) + decl %eax + jne .L51 + + FLD ALPHA_I + FLD ALPHA_R + + fmul %st(2), %st + + FLD 0 * SIZE(%edi) + faddp %st, %st(1) + FST 0 * SIZE(%edi) + + fmulp %st, %st(1) + + FLD 1 * SIZE(%edi) + faddp %st, %st(1) + FST 1 * SIZE(%edi) + ALIGN_4 + +.End: + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm3m_kernel_2x4_barcelona.S b/kernel/x86/zgemm3m_kernel_2x4_barcelona.S new file mode 100644 index 0000000000..7822094e43 --- /dev/null +++ b/kernel/x86/zgemm3m_kernel_2x4_barcelona.S @@ -0,0 +1,1291 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#define A 32 + STACK + ARGS(%esp) +#define OLD_B 36 + STACK + ARGS(%esp) +#define C 40 + STACK + ARGS(%esp) +#define OLD_LDC 44 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define BX 4 + STACK(%esp) +#define KK 8 + STACK(%esp) +#define KKK 12 + STACK(%esp) + +#define B %edi +#define LDC %ebp +#define AO %edx +#define BO %ecx +#define CO %esi +#define I %ebx + +#define movsd movlps +#define movapd movups +#define movlpd movlps +#define movhpd movhps + +#define PREFETCH prefetch +#define PREFETCHSIZE (8 * 7 + 0) + +#define KERNEL1(address) \ + mulpd %xmm0, %xmm1; \ + mulpd -14 * SIZE(BO, %eax, 4), %xmm0; \ + addpd %xmm1, %xmm4; \ + movapd -12 * SIZE(BO, %eax, 4), %xmm1; \ + addpd %xmm0, %xmm5; \ + movddup -15 * SIZE(AO, %eax, 2), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd -14 * SIZE(BO, %eax, 4), %xmm0; \ + addpd %xmm0, %xmm7; \ + movddup -14 * SIZE(AO, %eax, 2), %xmm0 + +#define KERNEL2(address) \ + addpd %xmm2, %xmm6; \ + movapd %xmm1, %xmm2; \ + mulpd %xmm0, %xmm1; \ + mulpd -10 * SIZE(BO, %eax, 4), %xmm0; \ + addpd %xmm1, %xmm4; \ + movapd -8 * SIZE(BO, %eax, 4), %xmm1; \ + addpd %xmm0, %xmm5; \ + movddup -13 * SIZE(AO, %eax, 2), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd -10 * SIZE(BO, %eax, 4), %xmm0; \ + addpd %xmm0, %xmm7; \ + movddup -12 * SIZE(AO, %eax, 2), %xmm0 + +#define KERNEL3(address) \ + addpd %xmm2, %xmm6; \ + movapd %xmm1, %xmm2; \ + mulpd %xmm0, %xmm1; \ + mulpd -6 * SIZE(BO, %eax, 4), %xmm0; \ + addpd %xmm1, %xmm4; \ + movapd -4 * SIZE(BO, %eax, 4), %xmm1; \ + addpd %xmm0, %xmm5; \ + movddup -11 * SIZE(AO, 
%eax, 2), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd -6 * SIZE(BO, %eax, 4), %xmm0; \ + addpd %xmm0, %xmm7; \ + movddup -10 * SIZE(AO, %eax, 2), %xmm0 + +#define KERNEL4(address) \ + addpd %xmm2, %xmm6; \ + movapd %xmm1, %xmm2; \ + mulpd %xmm0, %xmm1; \ + mulpd -2 * SIZE(BO, %eax, 4), %xmm0; \ + addpd %xmm1, %xmm4; \ + movapd (BO, %eax, 4), %xmm1; \ + addpd %xmm0, %xmm5; \ + movddup -9 * SIZE(AO, %eax, 2), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd -2 * SIZE(BO, %eax, 4), %xmm0; \ + addpd %xmm0, %xmm7; \ + movddup (AO, %eax, 2), %xmm0 + +#define KERNEL5(address) \ + addpd %xmm2, %xmm6; \ + movapd %xmm1, %xmm2; \ + mulpd %xmm3, %xmm1; \ + mulpd 2 * SIZE(BO, %eax, 4), %xmm3; \ + addpd %xmm1, %xmm4; \ + movapd 4 * SIZE(BO, %eax, 4), %xmm1; \ + addpd %xmm3, %xmm5; \ + movddup -7 * SIZE(AO, %eax, 2), %xmm3; \ + mulpd %xmm3, %xmm2; \ + mulpd 2 * SIZE(BO, %eax, 4), %xmm3; \ + addpd %xmm3, %xmm7; \ + movddup -6 * SIZE(AO, %eax, 2), %xmm3 + +#define KERNEL6(address) \ + addpd %xmm2, %xmm6; \ + movapd %xmm1, %xmm2; \ + mulpd %xmm3, %xmm1; \ + mulpd 6 * SIZE(BO, %eax, 4), %xmm3; \ + addpd %xmm1, %xmm4; \ + movapd 8 * SIZE(BO, %eax, 4), %xmm1; \ + addpd %xmm3, %xmm5; \ + movddup -5 * SIZE(AO, %eax, 2), %xmm3; \ + mulpd %xmm3, %xmm2; \ + mulpd 6 * SIZE(BO, %eax, 4), %xmm3; \ + addpd %xmm3, %xmm7; \ + movddup -4 * SIZE(AO, %eax, 2), %xmm3 + +#define KERNEL7(address) \ + addpd %xmm2, %xmm6; \ + movapd %xmm1, %xmm2; \ + mulpd %xmm3, %xmm1; \ + mulpd 10 * SIZE(BO, %eax, 4), %xmm3; \ + addpd %xmm1, %xmm4; \ + movapd 12 * SIZE(BO, %eax, 4), %xmm1; \ + addpd %xmm3, %xmm5; \ + movddup -3 * SIZE(AO, %eax, 2), %xmm3; \ + mulpd %xmm3, %xmm2; \ + mulpd 10 * SIZE(BO, %eax, 4), %xmm3; \ + addpd %xmm3, %xmm7; \ + movddup -2 * SIZE(AO, %eax, 2), %xmm3 + +#define KERNEL8(address) \ + addpd %xmm2, %xmm6; \ + movapd %xmm1, %xmm2; \ + mulpd %xmm3, %xmm1; \ + mulpd 14 * SIZE(BO, %eax, 4), %xmm3; \ + addpd %xmm1, %xmm4; \ + movapd 16 * SIZE(BO, %eax, 4), %xmm1; \ + addpd %xmm3, %xmm5; \ + movddup -1 * SIZE(AO, %eax, 2), %xmm3; \ + mulpd %xmm3, %xmm2; \ + mulpd 14 * SIZE(BO, %eax, 4), %xmm3; \ + addpd %xmm3, %xmm7; \ + movddup 8 * SIZE(AO, %eax, 2), %xmm3; \ + addpd %xmm2, %xmm6; \ + movapd %xmm1, %xmm2 + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl OLD_B, B + movl OLD_LDC, LDC + +#ifdef TRMMKERNEL + movl OFFSET, %eax + +#ifndef LEFT + negl %eax +#endif + + movl %eax, KK +#endif + + subl $-16 * SIZE, A + subl $-16 * SIZE, B + + sall $ZBASE_SHIFT, LDC + + movl N, %eax + sarl $2, %eax + movl %eax, J + jle .L30 + ALIGN_2 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + leal GEMM_DEFAULT_Q * GEMM_DEFAULT_UNROLL_N * SIZE(B), %eax + movl %eax, BX + + movl C, CO # coffset = c + movl A, AO # aoffset = a + + movl M, I + sarl $1, I # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BO +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AO, %eax, 2), AO + leal (B, %eax, 4), BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm1 + pxor %xmm4, %xmm4 + movddup -8 * SIZE(AO), %xmm3 + + leal (LDC, LDC, 2), %eax + + prefetchw 1 * SIZE(CO) + pxor %xmm5, %xmm5 + prefetchw 3 * SIZE(CO, LDC) + pxor %xmm6, %xmm6 + prefetchw 1 * SIZE(CO, LDC, 2) + pxor %xmm7, %xmm7 + prefetchw 3 * SIZE(CO, %eax) + movapd %xmm1, %xmm2 + + movl BX, %eax + prefetch -16 * 
SIZE(%eax) + addl $8 * SIZE, %eax + movl %eax, BX + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + + andl $-8, %eax + + leal (, %eax, SIZE), %eax + leal (AO, %eax, 2), AO + leal (BO, %eax, 4), BO + negl %eax + NOBRANCH + je .L15 + ALIGN_3 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + addl $8 * SIZE, %eax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + addl $8 * SIZE, %eax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + addl $8 * SIZE, %eax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + addl $8 * SIZE, %eax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + addl $8 * SIZE, %eax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + addl $8 * SIZE, %eax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + addl $8 * SIZE, %eax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + addl $8 * SIZE, %eax + BRANCH + jl .L12 + ALIGN_3 + +.L15: + movups ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax # if (k & 1) + je .L18 + + leal (, %eax, SIZE), %eax + leal (AO, %eax, 2), AO + leal (BO, %eax, 4), BO + negl %eax + ALIGN_3 + +.L17: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BO, %eax, 4), %xmm0 + addpd %xmm1, %xmm4 + movapd -12 * SIZE(BO, %eax, 4), %xmm1 + addpd %xmm0, %xmm5 + movddup -15 * SIZE(AO, %eax, 2), %xmm0 + mulpd %xmm0, %xmm2 + mulpd -14 * SIZE(BO, %eax, 4), %xmm0 + addpd %xmm0, %xmm7 + movddup -14 * SIZE(AO, %eax, 2), %xmm0 + addpd %xmm2, %xmm6 + movapd %xmm1, %xmm2 + + addl $SIZE, %eax + jl .L17 + ALIGN_4 + +.L18: + leal (CO, LDC, 2), %eax + + movsd 0 * SIZE(CO), %xmm0 + movhps 1 * SIZE(CO), %xmm0 + movsd 0 * SIZE(CO, LDC), %xmm1 + movhps 1 * SIZE(CO, LDC), %xmm1 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(CO) + movhps %xmm0, 1 * SIZE(CO) + movlps %xmm1, 0 * SIZE(CO, LDC) + movhps %xmm1, 1 * SIZE(CO, LDC) + + movsd 2 * SIZE(CO), %xmm0 + movhps 3 * SIZE(CO), %xmm0 + movsd 2 * SIZE(CO, LDC), %xmm1 + movhps 3 * SIZE(CO, LDC), %xmm1 + + pshufd $0x44, %xmm6, %xmm2 + unpckhpd %xmm6, %xmm6 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm6 + addpd %xmm6, %xmm1 + + movlps %xmm0, 2 * SIZE(CO) + movhps %xmm0, 3 * SIZE(CO) + movlps %xmm1, 2 * SIZE(CO, LDC) + movhps %xmm1, 3 * SIZE(CO, LDC) + + movsd 0 * SIZE(%eax), %xmm0 + movhps 1 * SIZE(%eax), %xmm0 + movsd 0 * SIZE(%eax, LDC), %xmm1 + 
movhps 1 * SIZE(%eax, LDC), %xmm1 + + pshufd $0x44, %xmm5, %xmm2 + unpckhpd %xmm5, %xmm5 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm5 + addpd %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%eax) + movhps %xmm0, 1 * SIZE(%eax) + movlps %xmm1, 0 * SIZE(%eax, LDC) + movhps %xmm1, 1 * SIZE(%eax, LDC) + + movsd 2 * SIZE(%eax), %xmm0 + movhps 3 * SIZE(%eax), %xmm0 + movsd 2 * SIZE(%eax, LDC), %xmm1 + movhps 3 * SIZE(%eax, LDC), %xmm1 + + pshufd $0x44, %xmm7, %xmm2 + unpckhpd %xmm7, %xmm7 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm7 + addpd %xmm7, %xmm1 + + movlps %xmm0, 2 * SIZE(%eax) + movhps %xmm0, 3 * SIZE(%eax) + movlps %xmm1, 2 * SIZE(%eax, LDC) + movhps %xmm1, 3 * SIZE(%eax, LDC) + + addl $4 * SIZE, %esi # coffset += 2 + + decl I # i -- + jg .L11 + ALIGN_4 + +.L20: + movl M, I + testl $1, I # i = (m >> 2) + jle .L29 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BO +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AO, %eax, 1), AO + leal (B, %eax, 4), BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm1 + movddup -8 * SIZE(AO), %xmm3 + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + mulpd %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd -14 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm4 + movapd -12 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm5 + movddup -15 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd -10 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm6 + movapd -8 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm7 + movddup -14 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd -6 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm4 + movapd -4 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm5 + movddup -13 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd -2 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm6 + movapd (BO), %xmm1 + addpd %xmm0, %xmm7 + movddup -12 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd 2 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm4 + movapd 4 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm5 + movddup -11 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd 6 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm6 + movapd 8 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm7 + movddup -10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd 10 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm4 + movapd 12 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm5 + movddup -9 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd 14 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm6 + movapd 16 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm7 + movddup -8 * SIZE(AO), %xmm0 + + subl $ -8 * SIZE, AO + subl $-32 * SIZE, BO + decl %eax + jne .L22 + ALIGN_4 + +.L25: + movups ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L28 + +.L26: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm4 + movapd -12 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm5 + movddup -15 * SIZE(AO), %xmm0 + + addl $1 * SIZE, AO + addl $4 * SIZE, BO + decl %eax + jg .L26 + ALIGN_4 + +.L28: + leal (CO, LDC, 2), %eax + + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + movsd 0 * SIZE(CO), %xmm0 + movhps 1 * SIZE(CO), %xmm0 + movsd 0 * SIZE(CO, LDC), %xmm1 + movhps 1 
* SIZE(CO, LDC), %xmm1 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(CO) + movhps %xmm0, 1 * SIZE(CO) + movlps %xmm1, 0 * SIZE(CO, LDC) + movhps %xmm1, 1 * SIZE(CO, LDC) + + movsd 0 * SIZE(%eax), %xmm0 + movhps 1 * SIZE(%eax), %xmm0 + movsd 0 * SIZE(%eax, LDC), %xmm1 + movhps 1 * SIZE(%eax, LDC), %xmm1 + + pshufd $0x44, %xmm5, %xmm2 + unpckhpd %xmm5, %xmm5 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm5 + addpd %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%eax) + movhps %xmm0, 1 * SIZE(%eax) + movlps %xmm1, 0 * SIZE(%eax, LDC) + movhps %xmm1, 1 * SIZE(%eax, LDC) + ALIGN_4 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + + movl BO, B + + leal (, LDC, 4), %eax + addl %eax, C # c += 4 * ldc + decl J # j -- + jg .L01 + ALIGN_4 + +.L30: + testl $2, N + je .L60 + ALIGN_2 + +.L31: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl C, CO # coffset = c + movl A, AO # aoffset = a + + movl M, I + sarl $1, I # i = (m >> 2) + jle .L50 + ALIGN_4 + +.L41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BO +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AO, %eax, 2), AO + leal (B, %eax, 2), BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + pxor %xmm4, %xmm4 + prefetchw 1 * SIZE(CO) + pxor %xmm5, %xmm5 + prefetchw 1 * SIZE(CO, LDC) + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L45 + ALIGN_4 + +.L42: + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AO) + mulpd -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + addpd %xmm0, %xmm4 + mulpd -16 * SIZE(BO), %xmm1 + movddup -14 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm5 + + mulpd -14 * SIZE(BO), %xmm0 + movddup -13 * SIZE(AO), %xmm1 + addpd %xmm0, %xmm6 + mulpd -14 * SIZE(BO), %xmm1 + movddup -12 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm7 + + mulpd -12 * SIZE(BO), %xmm0 + movddup -11 * SIZE(AO), %xmm1 + addpd %xmm0, %xmm4 + mulpd -12 * SIZE(BO), %xmm1 + movddup -10 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm5 + + mulpd -10 * SIZE(BO), %xmm0 + movddup -9 * SIZE(AO), %xmm1 + addpd %xmm0, %xmm6 + mulpd -10 * SIZE(BO), %xmm1 + movddup -8 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm7 + + prefetcht0 (PREFETCHSIZE + 8) * SIZE(AO) + + mulpd -8 * SIZE(BO), %xmm0 + movddup -7 * SIZE(AO), %xmm1 + addpd %xmm0, %xmm4 + mulpd -8 * SIZE(BO), %xmm1 + movddup -6 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm5 + + mulpd -6 * SIZE(BO), %xmm0 + movddup -5 * SIZE(AO), %xmm1 + addpd %xmm0, %xmm6 + mulpd -6 * SIZE(BO), %xmm1 + movddup -4 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm7 + + mulpd -4 * SIZE(BO), %xmm0 + movddup -3 * SIZE(AO), %xmm1 + addpd %xmm0, %xmm4 + mulpd -4 * SIZE(BO), %xmm1 + movddup -2 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm5 + + mulpd -2 * SIZE(BO), %xmm0 + movddup -1 * SIZE(AO), %xmm1 + addpd %xmm0, %xmm6 + mulpd -2 * SIZE(BO), %xmm1 + movddup 0 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm7 + + subl $-16 * SIZE, AO + subl $-16 * SIZE, BO + decl %eax + jne .L42 + ALIGN_4 + +.L45: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movups ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + 
BRANCH + je .L48 + ALIGN_3 + +.L46: + mulpd -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + addpd %xmm0, %xmm4 + mulpd -16 * SIZE(BO), %xmm1 + movddup -14 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm5 + + addl $2 * SIZE, AO + addl $2 * SIZE, BO + decl %eax + jg .L46 + ALIGN_4 + +.L48: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + movsd 0 * SIZE(CO), %xmm0 + movhps 1 * SIZE(CO), %xmm0 + movsd 0 * SIZE(CO, LDC), %xmm1 + movhps 1 * SIZE(CO, LDC), %xmm1 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(CO) + movhps %xmm0, 1 * SIZE(CO) + movlps %xmm1, 0 * SIZE(CO, LDC) + movhps %xmm1, 1 * SIZE(CO, LDC) + + movsd 2 * SIZE(CO), %xmm0 + movhps 3 * SIZE(CO), %xmm0 + movsd 2 * SIZE(CO, LDC), %xmm1 + movhps 3 * SIZE(CO, LDC), %xmm1 + + pshufd $0x44, %xmm5, %xmm2 + unpckhpd %xmm5, %xmm5 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm5 + addpd %xmm5, %xmm1 + + movlps %xmm0, 2 * SIZE(CO) + movhps %xmm0, 3 * SIZE(CO) + movlps %xmm1, 2 * SIZE(CO, LDC) + movhps %xmm1, 3 * SIZE(CO, LDC) + + addl $4 * SIZE, %esi # coffset += 2 + + decl I # i -- + jg .L41 + ALIGN_4 + +.L50: + movl M, I + testl $1, I # i = (m >> 2) + jle .L59 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BO +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AO, %eax, 1), AO + leal (B, %eax, 2), BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + mulpd -16 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm4 + movddup -15 * SIZE(AO), %xmm0 + + mulpd -14 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm4 + movddup -14 * SIZE(AO), %xmm0 + + mulpd -12 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm4 + movddup -13 * SIZE(AO), %xmm0 + + mulpd -10 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm4 + movddup -12 * SIZE(AO), %xmm0 + + mulpd -8 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm4 + movddup -11 * SIZE(AO), %xmm0 + + mulpd -6 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm4 + movddup -10 * SIZE(AO), %xmm0 + + mulpd -4 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm4 + movddup -9 * SIZE(AO), %xmm0 + + mulpd -2 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm4 + movddup -8 * SIZE(AO), %xmm0 + + subl $ -8 * SIZE, AO + subl $-16 * SIZE, BO + + decl %eax + jne .L52 + ALIGN_4 + +.L55: + movups ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L58 + +.L56: + mulpd -16 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm4 + movddup -15 * SIZE(AO), %xmm0 + + subl $-1 * SIZE, AO + subl $-2 * SIZE, BO + decl %eax + jg .L56 + ALIGN_4 + +.L58: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + addpd %xmm5, %xmm4 + + movsd 0 * SIZE(CO), %xmm0 + movhps 1 * SIZE(CO), %xmm0 + movsd 0 * SIZE(CO, LDC), %xmm1 + movhps 1 * SIZE(CO, LDC), %xmm1 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(CO) + movhps %xmm0, 1 * SIZE(CO) + movlps %xmm1, 0 * SIZE(CO, LDC) + movhps %xmm1, 1 * SIZE(CO, LDC) + ALIGN_4 + +.L59: +#if 
defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + movl BO, B + + leal (, LDC, 2), %eax + addl %eax, C # c += 4 * ldc + ALIGN_4 + +.L60: + testl $1, N + je .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl C, CO # coffset = c + movl A, AO # aoffset = a + + movl M, I + sarl $1, I # i = (m >> 2) + jle .L80 + ALIGN_4 + +.L71: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BO +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AO, %eax, 2), AO + leal (B, %eax, 1), BO +#endif + + movddup -16 * SIZE(BO), %xmm0 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + prefetchw 1 * SIZE(CO) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + mulpd -16 * SIZE(AO), %xmm0 + addpd %xmm0, %xmm4 + movddup -15 * SIZE(BO), %xmm0 + + mulpd -14 * SIZE(AO), %xmm0 + addpd %xmm0, %xmm4 + movddup -14 * SIZE(BO), %xmm0 + + mulpd -12 * SIZE(AO), %xmm0 + addpd %xmm0, %xmm4 + movddup -13 * SIZE(BO), %xmm0 + + mulpd -10 * SIZE(AO), %xmm0 + addpd %xmm0, %xmm4 + movddup -12 * SIZE(BO), %xmm0 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + + mulpd -8 * SIZE(AO), %xmm0 + addpd %xmm0, %xmm4 + movddup -11 * SIZE(BO), %xmm0 + + mulpd -6 * SIZE(AO), %xmm0 + addpd %xmm0, %xmm4 + movddup -10 * SIZE(BO), %xmm0 + + mulpd -4 * SIZE(AO), %xmm0 + addpd %xmm0, %xmm4 + movddup -9 * SIZE(BO), %xmm0 + + mulpd -2 * SIZE(AO), %xmm0 + addpd %xmm0, %xmm4 + movddup -8 * SIZE(BO), %xmm0 + + subl $-16 * SIZE, AO + subl $ -8 * SIZE, BO + decl %eax + jne .L72 + ALIGN_4 + +.L75: + movups ALPHA, %xmm3 +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L78 + ALIGN_3 + +.L76: + mulpd -16 * SIZE(AO), %xmm0 + addpd %xmm0, %xmm4 + movddup -15 * SIZE(BO), %xmm0 + + addl $2 * SIZE, AO + addl $1 * SIZE, BO + decl %eax + jg .L76 + ALIGN_4 + +.L78: + movsd 0 * SIZE(CO), %xmm0 + movhps 1 * SIZE(CO), %xmm0 + movsd 2 * SIZE(CO), %xmm1 + movhps 3 * SIZE(CO), %xmm1 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(CO) + movhps %xmm0, 1 * SIZE(CO) + movlps %xmm1, 2 * SIZE(CO) + movhps %xmm1, 3 * SIZE(CO) + + addl $4 * SIZE, %esi # coffset += 2 + + decl I # i -- + jg .L71 + ALIGN_4 + +.L80: + movl M, I + testl $1, I # i = (m >> 2) + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BO +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AO, %eax, 1), AO + leal (B, %eax, 1), BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L85 + ALIGN_4 + +.L82: + PREFETCH 
(PREFETCHSIZE + 0) * SIZE(AO) + + mulpd -16 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm4 + movapd -14 * SIZE(AO), %xmm0 + + mulpd -14 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm5 + movapd -12 * SIZE(AO), %xmm0 + + mulpd -12 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm6 + movapd -10 * SIZE(AO), %xmm0 + + mulpd -10 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm7 + movapd -8 * SIZE(AO), %xmm0 + + subl $-8 * SIZE, AO + subl $-8 * SIZE, BO + decl %eax + jne .L82 + ALIGN_4 + +.L85: + movups ALPHA, %xmm3 +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L88 + +.L86: + mulsd -16 * SIZE(BO), %xmm0 + addsd %xmm0, %xmm4 + movsd -15 * SIZE(AO), %xmm0 + + addl $1 * SIZE, AO + addl $1 * SIZE, BO + decl %eax + jg .L86 + ALIGN_4 + +.L88: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + addpd %xmm6, %xmm4 + + haddpd %xmm4, %xmm4 + + movsd 0 * SIZE(CO), %xmm0 + movhps 1 * SIZE(CO), %xmm0 + + unpcklpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm0 + + movlps %xmm0, 0 * SIZE(CO) + movhps %xmm0, 1 * SIZE(CO) + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm3m_kernel_2x4_opteron.S b/kernel/x86/zgemm3m_kernel_2x4_opteron.S new file mode 100644 index 0000000000..8e93a28e8c --- /dev/null +++ b/kernel/x86/zgemm3m_kernel_2x4_opteron.S @@ -0,0 +1,1803 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define OLD_M 4 + STACK + ARGS(%esi) +#define OLD_N 8 + STACK + ARGS(%esi) +#define OLD_K 12 + STACK + ARGS(%esi) +#define OLD_ALPHA_R 16 + STACK + ARGS(%esi) +#define OLD_ALPHA_I 24 + STACK + ARGS(%esi) +#define OLD_A 32 + STACK + ARGS(%esi) +#define OLD_B 36 + STACK + ARGS(%esi) +#define OLD_C 40 + STACK + ARGS(%esi) +#define OLD_LDC 44 + STACK + ARGS(%esi) + +#define ALPHA 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define BX 40(%esp) +#define OLD_STACK 44(%esp) +#define OFFSET 48(%esp) +#define KK 52(%esp) +#define KKK 56(%esp) +#define BUFFER 128(%esp) + +#if defined(OPTERON) || defined(BARCELONA) +#define movsd movlpd +#endif + +#if defined(OPTERON) || defined(BARCELONA) +#define PREFETCH prefetch +#define PREFETCHSIZE (8 * 10 + 4) +#endif + +#define AA %edx +#define BB %ecx +#define LDC %ebp + +#define KERNEL1(address) \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm4; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + movapd 2 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 6 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 16 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 10 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 12 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + mulpd 14 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm3, %xmm6; \ + movapd 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm0, %xmm7; \ + movapd 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm4; \ + movapd 18 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 20 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 22 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL4(address) \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 26 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 28 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + mulpd 30 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm3, %xmm6; \ + movapd 40 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm0, %xmm7; \ + movapd 16 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL5(address) \ + PREFETCH (PREFETCHSIZE + 8) * SIZE + (address) * 1 * SIZE(AA); \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm4; \ + movapd 34 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + mulpd 38 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm2, %xmm6; \ + movapd 48 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm1, %xmm7; \ + movapd 10 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + 
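+/* Editor's annotation, not part of the imported GotoBLAS2 source:       */
+/* each KERNEL1..KERNEL8 macro (continued below) performs one k-step of  */
+/* the 2x4 register block: xmm0/xmm1 hold two packed A values (the two   */
+/* rows for one k), BB points into the on-stack BUFFER where every B     */
+/* element has been duplicated into an aligned pair, and xmm4-xmm7       */
+/* accumulate the four C columns. The eight macros together unroll       */
+/* eight consecutive k iterations of the main loop.                      */
+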
+#define KERNEL6(address) \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 42 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 44 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + mulpd 46 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 12 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL7(address) \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm4; \ + movapd 50 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 52 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + mulpd 54 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm2, %xmm6; \ + movapd 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm1, %xmm7; \ + movapd 14 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 58 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 60 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + mulpd 62 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 72 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE, %esp + andl $-1024, %esp # align stack + + STACK_TOUCHING + + movl OLD_M, %ebx + movl OLD_N, %eax + movl OLD_K, %ecx + movl OLD_A, %edx + movsd OLD_ALPHA_R, %xmm0 + movhps OLD_ALPHA_I, %xmm0 + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK +#ifdef TRMMKERNEL + movss OLD_OFFT, %xmm4 +#endif + + movl OLD_B, %edi + movl OLD_C, %ebx + movapd %xmm0, ALPHA + + movl %ebx, C + movl OLD_LDC, LDC +#ifdef TRMMKERNEL + movss %xmm4, OFFSET + movss %xmm4, KK +#ifndef LEFT + negl KK +#endif +#endif + + sall $ZBASE_SHIFT, LDC + + sarl $2, %eax + movl %eax, J + jle .L30 + ALIGN_2 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + +/* Copying to Sub Buffer */ + movl K, %eax + leal BUFFER, %ecx + sarl $1, %eax + jle .L05 + ALIGN_4 + +.L02: +#define COPYPREFETCH 40 + + prefetchnta (COPYPREFETCH) * SIZE(%edi) + + movq 0 * SIZE(%edi), %mm0 + movq 1 * SIZE(%edi), %mm1 + movq 2 * SIZE(%edi), %mm2 + movq 3 * SIZE(%edi), %mm3 + movq 4 * SIZE(%edi), %mm4 + movq 5 * SIZE(%edi), %mm5 + movq 6 * SIZE(%edi), %mm6 + movq 7 * SIZE(%edi), %mm7 + + movq %mm0, 0 * SIZE(%ecx) + movq %mm0, 1 * SIZE(%ecx) + movq %mm1, 2 * SIZE(%ecx) + movq %mm1, 3 * SIZE(%ecx) + movq %mm2, 4 * SIZE(%ecx) + movq %mm2, 5 * SIZE(%ecx) + movq %mm3, 6 * SIZE(%ecx) + movq %mm3, 7 * SIZE(%ecx) + + movq %mm4, 8 * SIZE(%ecx) + movq %mm4, 9 * SIZE(%ecx) + movq %mm5, 10 * SIZE(%ecx) + movq %mm5, 11 * SIZE(%ecx) + movq %mm6, 12 * SIZE(%ecx) + movq %mm6, 13 * SIZE(%ecx) + movq %mm7, 14 * SIZE(%ecx) + movq %mm7, 15 * SIZE(%ecx) + + addl $ 8 * SIZE, %edi + addl $16 * SIZE, %ecx + decl %eax + jne .L02 + ALIGN_2 + +.L05: + movl K, %eax + andl $1, %eax + BRANCH + jle .L10 + + movq 0 * SIZE(%edi), %mm0 + movq 1 * SIZE(%edi), %mm1 + movq 2 * SIZE(%edi), %mm2 + movq 3 * SIZE(%edi), %mm3 + + movq %mm0, 0 * SIZE(%ecx) + movq %mm0, 1 * SIZE(%ecx) + movq %mm1, 2 * SIZE(%ecx) + movq %mm1, 3 * SIZE(%ecx) + movq %mm2, 4 * SIZE(%ecx) + movq 
%mm2, 5 * SIZE(%ecx) + movq %mm3, 6 * SIZE(%ecx) + movq %mm3, 7 * SIZE(%ecx) + + addl $4 * SIZE, %edi + ALIGN_4 + +.L10: + movl %edi, BX + + movl C, %esi # coffset = c + movl A, AA # aoffset = a + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 8), BB +#endif + + movl BX, %eax + + prefetchnta 0 * SIZE(%eax) + prefetchnta 8 * SIZE(%eax) + + subl $-8 * SIZE, BX + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movapd 0 * SIZE(AA), %xmm0 + movapd 8 * SIZE(AA), %xmm1 + movapd 0 * SIZE(BB), %xmm2 + movapd 8 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + + prefetchw 1 * SIZE(%esi) + prefetchw 1 * SIZE(%esi, LDC) + prefetchw 1 * SIZE(%esi, LDC, 2) + prefetchw 1 * SIZE(%esi, %eax) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + +#if 1 + andl $-8, %eax + sall $4, %eax + je .L15 +.L1X: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + cmpl $128 * 1, %eax + jle .L12 + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + cmpl $128 * 2, %eax + jle .L12 + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + cmpl $128 * 3, %eax + jle .L12 + KERNEL1(16 * 3) + KERNEL2(16 * 3) + KERNEL3(16 * 3) + KERNEL4(16 * 3) + KERNEL5(16 * 3) + KERNEL6(16 * 3) + KERNEL7(16 * 3) + KERNEL8(16 * 3) + cmpl $128 * 4, %eax + jle .L12 + KERNEL1(16 * 4) + KERNEL2(16 * 4) + KERNEL3(16 * 4) + KERNEL4(16 * 4) + KERNEL5(16 * 4) + KERNEL6(16 * 4) + KERNEL7(16 * 4) + KERNEL8(16 * 4) + cmpl $128 * 5, %eax + jle .L12 + KERNEL1(16 * 5) + KERNEL2(16 * 5) + KERNEL3(16 * 5) + KERNEL4(16 * 5) + KERNEL5(16 * 5) + KERNEL6(16 * 5) + KERNEL7(16 * 5) + KERNEL8(16 * 5) + cmpl $128 * 6, %eax + jle .L12 + KERNEL1(16 * 6) + KERNEL2(16 * 6) + KERNEL3(16 * 6) + KERNEL4(16 * 6) + KERNEL5(16 * 6) + KERNEL6(16 * 6) + KERNEL7(16 * 6) + KERNEL8(16 * 6) + cmpl $128 * 7, %eax + jle .L12 + KERNEL1(16 * 7) + KERNEL2(16 * 7) + KERNEL3(16 * 7) + KERNEL4(16 * 7) + KERNEL5(16 * 7) + KERNEL6(16 * 7) + KERNEL7(16 * 7) + KERNEL8(16 * 7) + + addl $128 * 4 * SIZE, BB + addl $128 * 1 * SIZE, AA + subl $128 * 8, %eax + jg .L1X + jmp .L15 + +.L12: + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB + ALIGN_4 +#else + + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + addl $64 * SIZE, BB + addl $16 * SIZE, AA + decl %eax + jne .L12 + ALIGN_4 +#endif + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movapd 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm5 + movapd 4 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + mulpd 6 * 
SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 8 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + leal (LDC, LDC, 2), %eax + + movsd 0 * SIZE(%esi), %xmm0 + movhps 1 * SIZE(%esi), %xmm0 + movsd 2 * SIZE(%esi), %xmm1 + movhps 3 * SIZE(%esi), %xmm1 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 1 * SIZE(%esi) + movlps %xmm1, 2 * SIZE(%esi) + movhps %xmm1, 3 * SIZE(%esi) + + movsd 0 * SIZE(%esi, LDC), %xmm0 + movhps 1 * SIZE(%esi, LDC), %xmm0 + movsd 2 * SIZE(%esi, LDC), %xmm1 + movhps 3 * SIZE(%esi, LDC), %xmm1 + + pshufd $0x44, %xmm5, %xmm2 + unpckhpd %xmm5, %xmm5 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm5 + addpd %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC) + movhps %xmm0, 1 * SIZE(%esi, LDC) + movlps %xmm1, 2 * SIZE(%esi, LDC) + movhps %xmm1, 3 * SIZE(%esi, LDC) + + movsd 0 * SIZE(%esi, LDC, 2), %xmm0 + movhps 1 * SIZE(%esi, LDC, 2), %xmm0 + movsd 2 * SIZE(%esi, LDC, 2), %xmm1 + movhps 3 * SIZE(%esi, LDC, 2), %xmm1 + + pshufd $0x44, %xmm6, %xmm2 + unpckhpd %xmm6, %xmm6 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm6 + addpd %xmm6, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC, 2) + movhps %xmm0, 1 * SIZE(%esi, LDC, 2) + movlps %xmm1, 2 * SIZE(%esi, LDC, 2) + movhps %xmm1, 3 * SIZE(%esi, LDC, 2) + + movsd 0 * SIZE(%esi, %eax), %xmm0 + movhps 1 * SIZE(%esi, %eax), %xmm0 + movsd 2 * SIZE(%esi, %eax), %xmm1 + movhps 3 * SIZE(%esi, %eax), %xmm1 + + pshufd $0x44, %xmm7, %xmm2 + unpckhpd %xmm7, %xmm7 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm7 + addpd %xmm7, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, %eax) + movhps %xmm0, 1 * SIZE(%esi, %eax) + movlps %xmm1, 2 * SIZE(%esi, %eax) + movhps %xmm1, 3 * SIZE(%esi, %eax) + + addl $4 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L11 + ALIGN_4 + +.L20: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L29 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + leal (LDC, LDC, 2), %eax + + movsd 0 * SIZE(AA), %xmm0 + movsd 4 * SIZE(AA), %xmm1 + movsd 0 * SIZE(BB), %xmm2 + movsd 8 * SIZE(BB), %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm4 +#if defined(OPTERON) || defined(BARCELONA) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movsd 2 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm5 + movsd 4 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm6 + movsd 16 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm7 + movsd 1 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm4 + movsd 10 * SIZE(BB), %xmm3 + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm5 + movsd 12 * SIZE(BB), %xmm3 + mulsd %xmm0, %xmm3 + mulsd 14 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm6 + movsd 24 * 
SIZE(BB), %xmm3 + addsd %xmm0, %xmm7 + movsd 2 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm4 + movsd 18 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm5 + movsd 20 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + mulsd 22 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm6 + movsd 32 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm7 + movsd 3 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm4 + movsd 26 * SIZE(BB), %xmm3 + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm5 + movsd 28 * SIZE(BB), %xmm3 + mulsd %xmm0, %xmm3 + mulsd 30 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm6 + movsd 40 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm7 + movsd 8 * SIZE(AA), %xmm0 +#if defined(OPTERON) || defined(BARCELONA) + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) +#endif + mulsd %xmm1, %xmm2 + addsd %xmm2, %xmm4 + movsd 34 * SIZE(BB), %xmm2 + mulsd %xmm1, %xmm2 + addsd %xmm2, %xmm5 + movsd 36 * SIZE(BB), %xmm2 + mulsd %xmm1, %xmm2 + mulsd 38 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm6 + movsd 48 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm7 + movsd 5 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm3 + addsd %xmm3, %xmm4 + movsd 42 * SIZE(BB), %xmm3 + mulsd %xmm1, %xmm3 + addsd %xmm3, %xmm5 + movsd 44 * SIZE(BB), %xmm3 + mulsd %xmm1, %xmm3 + mulsd 46 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm6 + movsd 56 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm7 + movsd 6 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm2 + addsd %xmm2, %xmm4 + movsd 50 * SIZE(BB), %xmm2 + mulsd %xmm1, %xmm2 + addsd %xmm2, %xmm5 + movsd 52 * SIZE(BB), %xmm2 + mulsd %xmm1, %xmm2 + mulsd 54 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm6 + movsd 64 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm7 + movsd 7 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm3 + addsd %xmm3, %xmm4 + movsd 58 * SIZE(BB), %xmm3 + mulsd %xmm1, %xmm3 + addsd %xmm3, %xmm5 + movsd 60 * SIZE(BB), %xmm3 + mulsd %xmm1, %xmm3 + mulsd 62 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm6 + movsd 72 * SIZE(BB), %xmm3 + addl $64 * SIZE, BB + addsd %xmm1, %xmm7 + movsd 12 * SIZE(AA), %xmm1 + addl $8 * SIZE, AA + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L28 + +.L26: + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm4 + movsd 2 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm5 + movsd 4 * SIZE(BB), %xmm2 + mulsd %xmm0, %xmm2 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm6 + movsd 8 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm7 + movsd 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: + leal (LDC, LDC, 2), %eax + + movsd 0 * SIZE(%esi), %xmm0 + movhps 1 * SIZE(%esi), %xmm0 + movsd 0 * SIZE(%esi, LDC), %xmm1 + movhps 1 * SIZE(%esi, LDC), %xmm1 + + unpcklpd %xmm4, %xmm4 + unpcklpd %xmm5, %xmm5 + + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm3, %xmm5 + addpd %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 1 * SIZE(%esi) + movlps %xmm1, 0 * SIZE(%esi, LDC) + movhps %xmm1, 1 * SIZE(%esi, LDC) + + movsd 0 * SIZE(%esi, LDC, 2), %xmm0 + movhps 1 * SIZE(%esi, LDC, 2), %xmm0 + movsd 0 * SIZE(%esi, %eax), %xmm1 + movhps 1 * SIZE(%esi, %eax), %xmm1 + + unpcklpd %xmm6, %xmm6 + unpcklpd %xmm7, %xmm7 + + mulpd %xmm3, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm3, %xmm7 + addpd %xmm7, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC, 2) + movhps %xmm0, 1 * SIZE(%esi, LDC, 2) + movlps %xmm1, 0 * SIZE(%esi, %eax) + movhps %xmm1, 1 * SIZE(%esi, %eax) + ALIGN_4 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + + leal (, LDC, 4), %eax + addl %eax, C # c += 4 * ldc + decl J # j -- + jg .L01 + 
ALIGN_4 + +.L30: + testl $2, N + je .L60 + ALIGN_2 + +.L31: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + +/* Copying to Sub Buffer */ + movl K, %eax + leal BUFFER, %ecx + sarl $2, %eax + jle .L35 + ALIGN_4 + +.L32: +#ifdef PENTIUM4 +#ifdef HAVE_SSE3 + movddup 0 * SIZE(%edi), %xmm0 + movddup 1 * SIZE(%edi), %xmm1 + movddup 2 * SIZE(%edi), %xmm2 + movddup 3 * SIZE(%edi), %xmm3 + movddup 4 * SIZE(%edi), %xmm4 + movddup 5 * SIZE(%edi), %xmm5 + movddup 6 * SIZE(%edi), %xmm6 + movddup 7 * SIZE(%edi), %xmm7 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) + movapd %xmm2, 4 * SIZE(%ecx) + movapd %xmm3, 6 * SIZE(%ecx) + movapd %xmm4, 8 * SIZE(%ecx) + movapd %xmm5, 10 * SIZE(%ecx) + movapd %xmm6, 12 * SIZE(%ecx) + movapd %xmm7, 14 * SIZE(%ecx) +#else + movsd 0 * SIZE(%edi), %xmm0 + movsd 1 * SIZE(%edi), %xmm1 + movsd 2 * SIZE(%edi), %xmm2 + movsd 3 * SIZE(%edi), %xmm3 + movsd 4 * SIZE(%edi), %xmm4 + movsd 5 * SIZE(%edi), %xmm5 + movsd 6 * SIZE(%edi), %xmm6 + movsd 7 * SIZE(%edi), %xmm7 + + unpcklpd %xmm0, %xmm0 + unpckhpd %xmm1, %xmm1 + unpcklpd %xmm2, %xmm2 + unpckhpd %xmm3, %xmm3 + unpcklpd %xmm4, %xmm4 + unpckhpd %xmm5, %xmm5 + unpcklpd %xmm6, %xmm6 + unpckhpd %xmm7, %xmm7 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) + movapd %xmm2, 4 * SIZE(%ecx) + movapd %xmm3, 6 * SIZE(%ecx) + movapd %xmm4, 8 * SIZE(%ecx) + movapd %xmm5, 10 * SIZE(%ecx) + movapd %xmm6, 12 * SIZE(%ecx) + movapd %xmm7, 14 * SIZE(%ecx) +#endif + prefetcht0 80 * SIZE(%edi) + prefetcht1 112 * SIZE(%ecx) +#endif + +#if defined(OPTERON) || defined(BARCELONA) +#define COPYPREFETCH 40 + + prefetchnta (COPYPREFETCH) * SIZE(%edi) + + movq 0 * SIZE(%edi), %mm0 + movq 1 * SIZE(%edi), %mm1 + movq 2 * SIZE(%edi), %mm2 + movq 3 * SIZE(%edi), %mm3 + movq 4 * SIZE(%edi), %mm4 + movq 5 * SIZE(%edi), %mm5 + movq 6 * SIZE(%edi), %mm6 + movq 7 * SIZE(%edi), %mm7 + + movq %mm0, 0 * SIZE(%ecx) + movq %mm0, 1 * SIZE(%ecx) + movq %mm1, 2 * SIZE(%ecx) + movq %mm1, 3 * SIZE(%ecx) + movq %mm2, 4 * SIZE(%ecx) + movq %mm2, 5 * SIZE(%ecx) + movq %mm3, 6 * SIZE(%ecx) + movq %mm3, 7 * SIZE(%ecx) + + movq %mm4, 8 * SIZE(%ecx) + movq %mm4, 9 * SIZE(%ecx) + movq %mm5, 10 * SIZE(%ecx) + movq %mm5, 11 * SIZE(%ecx) + movq %mm6, 12 * SIZE(%ecx) + movq %mm6, 13 * SIZE(%ecx) + movq %mm7, 14 * SIZE(%ecx) + movq %mm7, 15 * SIZE(%ecx) +#endif + addl $ 8 * SIZE, %edi + addl $16 * SIZE, %ecx + decl %eax + jne .L32 + ALIGN_2 + +.L35: + movl K, %eax + andl $3, %eax + BRANCH + jle .L40 + ALIGN_2 + +.L36: +#ifdef PENTIUM4 +#ifdef HAVE_SSE3 + movddup 0 * SIZE(%edi), %xmm0 + movddup 1 * SIZE(%edi), %xmm1 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) +#else + movsd 0 * SIZE(%edi), %xmm0 + movsd 1 * SIZE(%edi), %xmm1 + + unpcklpd %xmm0, %xmm0 + unpckhpd %xmm1, %xmm1 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) +#endif +#endif + +#if defined(OPTERON) || defined(BARCELONA) + movq 0 * SIZE(%edi), %mm0 + movq 1 * SIZE(%edi), %mm1 + + movq %mm0, 0 * SIZE(%ecx) + movq %mm0, 1 * SIZE(%ecx) + movq %mm1, 2 * SIZE(%ecx) + movq %mm1, 3 * SIZE(%ecx) +#endif + addl $2 * SIZE, %edi + addl $4 * SIZE, %ecx + decl %eax + jne .L36 + ALIGN_4 + +.L40: + movl C, %esi # coffset = c + movl A, AA # aoffset = a + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L50 + ALIGN_4 + +.L41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB +#else + leal BUFFER, BB + movl 
KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movapd 0 * SIZE(AA), %xmm0 + movapd 8 * SIZE(AA), %xmm1 + movapd 0 * SIZE(BB), %xmm2 + movapd 8 * SIZE(BB), %xmm3 + +#ifdef HAVE_3DNOW + prefetchw 2 * SIZE(%esi) + prefetchw 2 * SIZE(%esi, LDC) +#endif + +#ifdef PENTIUM4 + prefetchnta 4 * SIZE(%esi) + prefetchnta 4 * SIZE(%esi, LDC) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L45 + ALIGN_4 + +.L42: + mulpd %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + + mulpd %xmm0, %xmm2 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movapd 4 * SIZE(AA), %xmm0 + + mulpd %xmm0, %xmm3 + mulpd 10 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd 12 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movapd 6 * SIZE(AA), %xmm0 + + mulpd %xmm0, %xmm3 + mulpd 14 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm7 + movapd 16 * SIZE(AA), %xmm0 + +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) +#endif + mulpd %xmm1, %xmm2 + mulpd 18 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + movapd 20 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movapd 10 * SIZE(AA), %xmm1 + + mulpd %xmm1, %xmm2 + mulpd 22 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + movapd 32 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm7 + movapd 12 * SIZE(AA), %xmm1 + + mulpd %xmm1, %xmm3 + mulpd 26 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm4 + movapd 28 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movapd 14 * SIZE(AA), %xmm1 + + mulpd %xmm1, %xmm3 + mulpd 30 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm6 + movapd 40 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm7 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L42 + ALIGN_4 + +.L45: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L46 + ALIGN_4 + +.L48: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 1 * SIZE(%esi), %xmm0 + movsd 2 * SIZE(%esi), %xmm1 + movhps 3 * SIZE(%esi), %xmm1 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 1 * SIZE(%esi) + movlps %xmm1, 2 * SIZE(%esi) + movhps %xmm1, 3 * SIZE(%esi) + + movsd 0 * SIZE(%esi, LDC), %xmm0 + movhps 1 * SIZE(%esi, LDC), %xmm0 + movsd 2 * SIZE(%esi, LDC), %xmm1 + movhps 3 * SIZE(%esi, LDC), %xmm1 + + pshufd $0x44, %xmm5, %xmm2 + unpckhpd %xmm5, %xmm5 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm5 + addpd %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC) + movhps %xmm0, 1 * SIZE(%esi, LDC) + movlps %xmm1, 2 * SIZE(%esi, LDC) + movhps %xmm1, 3 * SIZE(%esi, LDC) + + 
addl $4 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L41 + ALIGN_4 + +.L50: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L59 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + leal (LDC, LDC, 2), %eax + + movsd 0 * SIZE(AA), %xmm0 + movsd 4 * SIZE(AA), %xmm1 + movsd 0 * SIZE(BB), %xmm2 + movsd 8 * SIZE(BB), %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + mulsd %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) +#endif + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movsd 1 * SIZE(AA), %xmm0 + + mulsd %xmm0, %xmm2 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm6 + movsd 16 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm7 + movsd 2 * SIZE(AA), %xmm0 + + mulsd %xmm0, %xmm3 + mulsd 10 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm4 + movsd 12 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm5 + movsd 3 * SIZE(AA), %xmm0 + + mulsd %xmm0, %xmm3 + mulsd 14 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm6 + movsd 24 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm7 + movsd 8 * SIZE(AA), %xmm0 + + mulsd %xmm1, %xmm2 + mulsd 18 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm4 + movsd 20 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + movsd 5 * SIZE(AA), %xmm1 + + mulsd %xmm1, %xmm2 + mulsd 22 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm6 + movsd 32 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm7 + movsd 6 * SIZE(AA), %xmm1 + + mulsd %xmm1, %xmm3 + mulsd 26 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm4 + movsd 28 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm5 + movsd 7 * SIZE(AA), %xmm1 + + mulsd %xmm1, %xmm3 + mulsd 30 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm6 + movsd 40 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm7 + movsd 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L52 + ALIGN_4 + +.L55: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L58 + +.L56: + mulsd %xmm0, %xmm2 + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movsd 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: + addsd %xmm6, %xmm4 + addsd %xmm7, %xmm5 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 1 * SIZE(%esi), %xmm0 + movsd 0 * SIZE(%esi, LDC), %xmm1 + movhps 1 * SIZE(%esi, LDC), %xmm1 + + unpcklpd %xmm4, %xmm4 + unpcklpd %xmm5, %xmm5 + + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm3, %xmm5 + addpd %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 1 * SIZE(%esi) + movlps %xmm1, 0 * SIZE(%esi, LDC) + movhps %xmm1, 1 * SIZE(%esi, LDC) + + ALIGN_4 + +.L59: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leal (, LDC, 2), %eax + addl %eax, C # c += 4 * ldc + ALIGN_4 + +.L60: + testl $1, N + je .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + leal BUFFER, %ecx + sarl $3, %eax + jle .L65 + ALIGN_4 + 
+.L62: +#ifdef PENTIUM4 +#ifdef HAVE_SSE3 + movddup 0 * SIZE(%edi), %xmm0 + movddup 1 * SIZE(%edi), %xmm1 + movddup 2 * SIZE(%edi), %xmm2 + movddup 3 * SIZE(%edi), %xmm3 + movddup 4 * SIZE(%edi), %xmm4 + movddup 5 * SIZE(%edi), %xmm5 + movddup 6 * SIZE(%edi), %xmm6 + movddup 7 * SIZE(%edi), %xmm7 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) + movapd %xmm2, 4 * SIZE(%ecx) + movapd %xmm3, 6 * SIZE(%ecx) + movapd %xmm4, 8 * SIZE(%ecx) + movapd %xmm5, 10 * SIZE(%ecx) + movapd %xmm6, 12 * SIZE(%ecx) + movapd %xmm7, 14 * SIZE(%ecx) +#else + movsd 0 * SIZE(%edi), %xmm0 + movsd 1 * SIZE(%edi), %xmm1 + movsd 2 * SIZE(%edi), %xmm2 + movsd 3 * SIZE(%edi), %xmm3 + movsd 4 * SIZE(%edi), %xmm4 + movsd 5 * SIZE(%edi), %xmm5 + movsd 6 * SIZE(%edi), %xmm6 + movsd 7 * SIZE(%edi), %xmm7 + + unpcklpd %xmm0, %xmm0 + unpckhpd %xmm1, %xmm1 + unpcklpd %xmm2, %xmm2 + unpckhpd %xmm3, %xmm3 + unpcklpd %xmm4, %xmm4 + unpckhpd %xmm5, %xmm5 + unpcklpd %xmm6, %xmm6 + unpckhpd %xmm7, %xmm7 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) + movapd %xmm2, 4 * SIZE(%ecx) + movapd %xmm3, 6 * SIZE(%ecx) + movapd %xmm4, 8 * SIZE(%ecx) + movapd %xmm5, 10 * SIZE(%ecx) + movapd %xmm6, 12 * SIZE(%ecx) + movapd %xmm7, 14 * SIZE(%ecx) +#endif + prefetcht1 80 * SIZE(%edi) + prefetcht0 112 * SIZE(%ecx) +#endif + +#if defined(OPTERON) || defined(BARCELONA) +#define COPYPREFETCH 40 + + prefetchnta (COPYPREFETCH) * SIZE(%edi) + + movq 0 * SIZE(%edi), %mm0 + movq 1 * SIZE(%edi), %mm1 + movq 2 * SIZE(%edi), %mm2 + movq 3 * SIZE(%edi), %mm3 + movq 4 * SIZE(%edi), %mm4 + movq 5 * SIZE(%edi), %mm5 + movq 6 * SIZE(%edi), %mm6 + movq 7 * SIZE(%edi), %mm7 + + movq %mm0, 0 * SIZE(%ecx) + movq %mm0, 1 * SIZE(%ecx) + movq %mm1, 2 * SIZE(%ecx) + movq %mm1, 3 * SIZE(%ecx) + movq %mm2, 4 * SIZE(%ecx) + movq %mm2, 5 * SIZE(%ecx) + movq %mm3, 6 * SIZE(%ecx) + movq %mm3, 7 * SIZE(%ecx) + + movq %mm4, 8 * SIZE(%ecx) + movq %mm4, 9 * SIZE(%ecx) + movq %mm5, 10 * SIZE(%ecx) + movq %mm5, 11 * SIZE(%ecx) + movq %mm6, 12 * SIZE(%ecx) + movq %mm6, 13 * SIZE(%ecx) + movq %mm7, 14 * SIZE(%ecx) + movq %mm7, 15 * SIZE(%ecx) +#endif + addl $ 8 * SIZE, %edi + addl $16 * SIZE, %ecx + decl %eax + jne .L62 + ALIGN_2 + +.L65: + movl K, %eax + andl $7, %eax + BRANCH + jle .L70 + ALIGN_2 + +.L66: +#ifdef PENTIUM4 +#ifdef HAVE_SSE3 + movddup 0 * SIZE(%edi), %xmm0 + movapd %xmm0, 0 * SIZE(%ecx) +#else + movsd 0 * SIZE(%edi), %xmm0 + unpcklpd %xmm0, %xmm0 + movapd %xmm0, 0 * SIZE(%ecx) +#endif +#endif + +#if defined(OPTERON) || defined(BARCELONA) + movq 0 * SIZE(%edi), %mm0 + + movq %mm0, 0 * SIZE(%ecx) + movq %mm0, 1 * SIZE(%ecx) +#endif + addl $1 * SIZE, %edi + addl $2 * SIZE, %ecx + decl %eax + jne .L66 + ALIGN_4 + +.L70: + movl C, %esi # coffset = c + movl A, AA # aoffset = a + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L80 + ALIGN_4 + +.L71: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movapd 0 * SIZE(AA), %xmm0 + movapd 8 * SIZE(AA), %xmm1 + movapd 0 * SIZE(BB), %xmm2 + movapd 8 * SIZE(BB), %xmm3 + +#ifdef HAVE_3DNOW + prefetchw 2 * SIZE(%esi) +#endif + +#ifdef PENTIUM4 + prefetchnta 2 * SIZE(%esi) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || 
(!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movapd 16 * SIZE(BB), %xmm2 + + movapd 2 * SIZE(AA), %xmm0 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm4 + movapd 4 * SIZE(AA), %xmm0 + mulpd 4 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm4 + movapd 6 * SIZE(AA), %xmm0 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm4 + + movapd 16 * SIZE(AA), %xmm0 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) +#endif + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movapd 24 * SIZE(BB), %xmm3 + + movapd 10 * SIZE(AA), %xmm1 + mulpd 10 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm4 + movapd 12 * SIZE(AA), %xmm1 + mulpd 12 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm4 + movapd 14 * SIZE(AA), %xmm1 + mulpd 14 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm4 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L72 + ALIGN_4 + +.L75: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L78 + ALIGN_3 + +.L76: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movapd 2 * SIZE(AA), %xmm0 + movapd 2 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: + movsd 0 * SIZE(%esi), %xmm0 + movhps 1 * SIZE(%esi), %xmm0 + movsd 2 * SIZE(%esi), %xmm1 + movhps 3 * SIZE(%esi), %xmm1 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 1 * SIZE(%esi) + movlps %xmm1, 2 * SIZE(%esi) + movhps %xmm1, 3 * SIZE(%esi) + + addl $4 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L71 + ALIGN_4 + +.L80: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + leal (LDC, LDC, 2), %eax + + movsd 0 * SIZE(AA), %xmm0 + movsd 4 * SIZE(AA), %xmm1 + movsd 0 * SIZE(BB), %xmm2 + movsd 8 * SIZE(BB), %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L85 + ALIGN_4 + +.L82: + mulsd %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movsd 1 * SIZE(AA), %xmm0 + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movsd 16 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movsd 2 * SIZE(AA), %xmm0 + mulsd 4 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm6 + movsd 3 * SIZE(AA), %xmm0 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm7 + movsd 8 * SIZE(AA), %xmm0 + mulsd %xmm1, %xmm3 + movsd 5 * SIZE(AA), %xmm1 + mulsd 10 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm4 + movsd 24 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm5 + movsd 6 * SIZE(AA), %xmm1 + mulsd 12 * SIZE(BB), 
%xmm1 + addsd %xmm1, %xmm6 + movsd 7 * SIZE(AA), %xmm1 + mulsd 14 * SIZE(BB), %xmm1 + addsd %xmm1, %xmm7 + movsd 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L82 + ALIGN_4 + +.L85: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L88 + +.L86: + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm4 + movsd 2 * SIZE(BB), %xmm2 + movsd 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L86 + ALIGN_4 + +.L88: + addsd %xmm5, %xmm4 + addsd %xmm7, %xmm6 + addsd %xmm6, %xmm4 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 1 * SIZE(%esi), %xmm0 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 1 * SIZE(%esi) + ALIGN_4 + +.L999: + movl OLD_STACK, %esp + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm3m_kernel_2x4_penryn.S b/kernel/x86/zgemm3m_kernel_2x4_penryn.S new file mode 100644 index 0000000000..3920649879 --- /dev/null +++ b/kernel/x86/zgemm3m_kernel_2x4_penryn.S @@ -0,0 +1,1344 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#define A 32 + STACK + ARGS(%esp) +#define ARG_B 36 + STACK + ARGS(%esp) +#define C 40 + STACK + ARGS(%esp) +#define ARG_LDC 44 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define BX 4 + STACK(%esp) +#define KK 8 + STACK(%esp) +#define KKK 12 + STACK(%esp) + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi +#define C1 %esi +#define I %ebx + +#ifdef NANO +#define PREFETCHSIZE (8 * 3 + 4) +#define PREFETCHW prefetcht0 +#define PREFETCHB prefetcht0 +#endif + +#ifndef PREFETCH +#define PREFETCH prefetcht0 +#endif + +#ifndef PREFETCHW +#define PREFETCHW prefetcht2 +#endif + +#ifndef PREFETCHB +#define PREFETCHB prefetcht2 +#endif + +#ifndef PREFETCHSIZE +#define PREFETCHSIZE (8 * 21 + 4) +#endif + + PROLOGUE + + subl $ARGS, %esp # Generate Stack Frame + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + +#ifdef TRMMKERNEL + movl OFFSET, %eax +#ifndef LEFT + negl %eax +#endif + movl %eax, KK +#endif + + subl $-16 * SIZE, A + subl $-16 * SIZE, B + + sall $ZBASE_SHIFT, LDC + + movl N, %eax + sarl $2, %eax + movl %eax, J + jle .L30 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + sall $BASE_SHIFT + 2, %eax + leal (B, %eax), %eax + movl %eax, BX + + movl C, C1 + movl A, AA + + movl M, I + sarl $1, I + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + + movl BX, %eax + PREFETCHB -16 * SIZE(%eax) + subl $-8 * SIZE, BX + + leal (C1, LDC, 2), %eax + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -16 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + + pxor %xmm4, %xmm4 + PREFETCHW 1 * SIZE(C1) + pxor %xmm5, %xmm5 + PREFETCHW 1 * SIZE(C1, LDC) + pxor %xmm6, %xmm6 + PREFETCHW 1 * SIZE(%eax) + pxor %xmm7, %xmm7 + PREFETCHW 1 * SIZE(%eax, LDC) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addpd %xmm3, %xmm7 + movaps -14 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps -12 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps -10 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps -8 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps -6 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, 
%xmm5 + movaps -4 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps -2 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps 0 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) + + addpd %xmm3, %xmm7 + movaps 2 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps 4 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -6 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps 6 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps 8 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps 10 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps 12 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -2 * SIZE(AA), %xmm0 + + addpd %xmm3, %xmm7 + movaps 14 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps 16 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + subl $-32 * SIZE, BB + mulpd %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + + subl $-16 * SIZE, AA + + subl $1, %eax + BRANCH + jne .L12 + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L18 + ALIGN_4 + +.L16: + addpd %xmm3, %xmm7 + movaps -14 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm6 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm1, %xmm5 + movaps -12 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + + movaps -14 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + addpd %xmm2, %xmm6 + addpd %xmm3, %xmm7 + + movups ALPHA, %xmm3 + + movaps %xmm4, %xmm0 + movsd %xmm5, %xmm4 + movsd %xmm0, %xmm5 + + movaps %xmm6, %xmm0 + movsd %xmm7, %xmm6 + movsd %xmm0, %xmm7 + + leal (C1, LDC, 2), %eax + + movsd 0 * SIZE(C1), %xmm0 + movhps 1 * SIZE(C1), %xmm0 + movsd 2 * SIZE(C1), %xmm1 + movhps 3 * SIZE(C1), %xmm1 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(C1) + movhps %xmm0, 1 * SIZE(C1) + movlps %xmm1, 2 * SIZE(C1) + movhps %xmm1, 3 * SIZE(C1) + + movsd 0 * SIZE(C1, LDC), %xmm0 + movhps 1 * SIZE(C1, LDC), %xmm0 + movsd 2 * SIZE(C1, LDC), %xmm1 + movhps 3 * SIZE(C1, LDC), %xmm1 + + pshufd $0x44, %xmm5, %xmm2 + unpckhpd %xmm5, %xmm5 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm5 + addpd %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(C1, LDC) + movhps %xmm0, 1 * SIZE(C1, LDC) + movlps %xmm1, 2 * SIZE(C1, LDC) + movhps %xmm1, 3 * SIZE(C1, LDC) + + movsd 0 * SIZE(%eax), %xmm0 + movhps 1 * SIZE(%eax), %xmm0 + movsd 2 * SIZE(%eax), %xmm1 + movhps 3 * 
SIZE(%eax), %xmm1 + + pshufd $0x44, %xmm6, %xmm2 + unpckhpd %xmm6, %xmm6 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm6 + addpd %xmm6, %xmm1 + + movlps %xmm0, 0 * SIZE(%eax) + movhps %xmm0, 1 * SIZE(%eax) + movlps %xmm1, 2 * SIZE(%eax) + movhps %xmm1, 3 * SIZE(%eax) + + movsd 0 * SIZE(%eax, LDC), %xmm0 + movhps 1 * SIZE(%eax, LDC), %xmm0 + movsd 2 * SIZE(%eax, LDC), %xmm1 + movhps 3 * SIZE(%eax, LDC), %xmm1 + + pshufd $0x44, %xmm7, %xmm2 + unpckhpd %xmm7, %xmm7 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm7 + addpd %xmm7, %xmm1 + + movlps %xmm0, 0 * SIZE(%eax, LDC) + movhps %xmm0, 1 * SIZE(%eax, LDC) + movlps %xmm1, 2 * SIZE(%eax, LDC) + movhps %xmm1, 3 * SIZE(%eax, LDC) + + addl $4 * SIZE, C1 + decl I + jg .L11 + ALIGN_4 + +.L20: + movl M, I + testl $1, I + jle .L29 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (BB, %eax, 4), BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -16 * SIZE(BB), %xmm2 + pxor %xmm5, %xmm5 + movaps -14 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps -12 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps -10 * SIZE(BB), %xmm3 + + pshufd $0xee, %xmm0, %xmm1 + movaps -14 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm6 + movaps -8 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm7 + movaps -6 * SIZE(BB), %xmm3 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps -4 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps -2 * SIZE(BB), %xmm3 + + pshufd $0xee, %xmm0, %xmm1 + movaps -12 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm6 + movaps 0 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm7 + movaps 2 * SIZE(BB), %xmm3 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps 6 * SIZE(BB), %xmm3 + + pshufd $0xee, %xmm0, %xmm1 + movaps -10 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm6 + movaps 8 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm7 + movaps 10 * SIZE(BB), %xmm3 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps 12 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm5 + movaps 14 * SIZE(BB), %xmm3 + + pshufd $0xee, %xmm0, %xmm1 + movaps -8 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm7 + movaps 18 * SIZE(BB), %xmm3 + + subl $ -8 * SIZE, AA + subl $-32 * SIZE, BB + + subl $1, %eax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L28 + ALIGN_4 + +.L26: + pshufd $0x44, %xmm0, %xmm1 + movsd -15 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm4 + movaps -12 * SIZE(BB), %xmm2 + addpd 
%xmm3, %xmm5 + movaps -10 * SIZE(BB), %xmm3 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: + movups ALPHA, %xmm3 + + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + leal (C1, LDC, 2), %eax + + movsd 0 * SIZE(C1), %xmm0 + movhps 1 * SIZE(C1), %xmm0 + movsd 0 * SIZE(C1, LDC), %xmm1 + movhps 1 * SIZE(C1, LDC), %xmm1 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(C1) + movhps %xmm0, 1 * SIZE(C1) + movlps %xmm1, 0 * SIZE(C1, LDC) + movhps %xmm1, 1 * SIZE(C1, LDC) + + movsd 0 * SIZE(%eax), %xmm0 + movhps 1 * SIZE(%eax), %xmm0 + movsd 0 * SIZE(%eax, LDC), %xmm1 + movhps 1 * SIZE(%eax, LDC), %xmm1 + + pshufd $0x44, %xmm5, %xmm2 + unpckhpd %xmm5, %xmm5 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm5 + addpd %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%eax) + movhps %xmm0, 1 * SIZE(%eax) + movlps %xmm1, 0 * SIZE(%eax, LDC) + movhps %xmm1, 1 * SIZE(%eax, LDC) + ALIGN_4 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + + movl BB, B + + leal (, LDC, 4), %eax + addl %eax, C + decl J + jg .L01 + ALIGN_4 + +.L30: + movl N, %eax + testl $2, %eax + jle .L50 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl C, C1 + movl A, AA + + movl M, I + sarl $1, I + jle .L40 + ALIGN_4 + +.L31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + PREFETCHW 1 * SIZE(C1) + pxor %xmm6, %xmm6 + PREFETCHW 1 * SIZE(C1, LDC) + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -14 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm7 + movaps -12 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -10 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm7 + movaps -8 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -6 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -6 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm7 + movaps -4 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -2 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -2 * SIZE(BB), %xmm1 + addpd 
%xmm2, %xmm4 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm7 + movaps 0 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + + subl $-16 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L38 + ALIGN_4 + +.L36: + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + + addpd %xmm1, %xmm5 + movaps -14 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L36 + ALIGN_4 + +.L38: + movups ALPHA, %xmm3 + + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + movaps %xmm4, %xmm0 + movsd %xmm5, %xmm4 + movsd %xmm0, %xmm5 + + movsd 0 * SIZE(C1), %xmm0 + movhps 1 * SIZE(C1), %xmm0 + movsd 2 * SIZE(C1), %xmm1 + movhps 3 * SIZE(C1), %xmm1 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(C1) + movhps %xmm0, 1 * SIZE(C1) + movlps %xmm1, 2 * SIZE(C1) + movhps %xmm1, 3 * SIZE(C1) + + movsd 0 * SIZE(C1, LDC), %xmm0 + movhps 1 * SIZE(C1, LDC), %xmm0 + movsd 2 * SIZE(C1, LDC), %xmm1 + movhps 3 * SIZE(C1, LDC), %xmm1 + + pshufd $0x44, %xmm5, %xmm2 + unpckhpd %xmm5, %xmm5 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm5 + addpd %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(C1, LDC) + movhps %xmm0, 1 * SIZE(C1, LDC) + movlps %xmm1, 2 * SIZE(C1, LDC) + movhps %xmm1, 3 * SIZE(C1, LDC) + + addl $4 * SIZE, C1 + decl I + jg .L31 + ALIGN_4 + +.L40: + movl M, I + testl $1, I + jle .L49 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (BB, %eax, 2), BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -16 * SIZE(BB), %xmm2 + pxor %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L45 + ALIGN_4 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -14 * SIZE(BB), %xmm2 + + pshufd $0xee, %xmm0, %xmm1 + movaps -14 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm5 + movaps -12 * SIZE(BB), %xmm2 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -10 * SIZE(BB), %xmm2 + + pshufd $0xee, %xmm0, %xmm1 + movaps -12 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm5 + movaps -8 * SIZE(BB), %xmm2 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -6 * SIZE(BB), %xmm2 + + pshufd $0xee, %xmm0, %xmm1 + movaps -10 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm5 + movaps -4 * SIZE(BB), %xmm2 + + pshufd $0x44, %xmm0, %xmm1 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -2 * SIZE(BB), %xmm2 + + pshufd $0xee, %xmm0, %xmm1 + movaps -8 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + + subl $ -8 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L42 + ALIGN_4 + +.L45: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, 
%eax +#endif + andl $7, %eax + BRANCH + je .L48 + ALIGN_4 + +.L46: + pshufd $0x44, %xmm0, %xmm1 + movsd -15 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + + addpd %xmm2, %xmm4 + movaps -14 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L46 + ALIGN_4 + +.L48: + movups ALPHA, %xmm3 + + addpd %xmm5, %xmm4 + + movsd 0 * SIZE(C1), %xmm0 + movhps 1 * SIZE(C1), %xmm0 + movsd 0 * SIZE(C1, LDC), %xmm1 + movhps 1 * SIZE(C1, LDC), %xmm1 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(C1) + movhps %xmm0, 1 * SIZE(C1) + movlps %xmm1, 0 * SIZE(C1, LDC) + movhps %xmm1, 1 * SIZE(C1, LDC) + ALIGN_4 + +.L49: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + movl BB, B + + leal (, LDC, 2), %eax + addl %eax, C + ALIGN_4 + +.L50: + movl N, %eax + testl $1, %eax + jle .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl C, C1 + movl A, AA + + movl M, I + sarl $1, I + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + addl %eax, BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + PREFETCHW 1 * SIZE(C1) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm1, %xmm2 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + pshufd $0xee, %xmm1, %xmm2 + movaps -14 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + + pshufd $0x44, %xmm1, %xmm2 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + pshufd $0xee, %xmm1, %xmm2 + movaps -12 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) + + pshufd $0x44, %xmm1, %xmm2 + mulpd %xmm0, %xmm2 + movaps -6 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + pshufd $0xee, %xmm1, %xmm2 + movaps -10 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + + pshufd $0x44, %xmm1, %xmm2 + mulpd %xmm0, %xmm2 + movaps -2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + pshufd $0xee, %xmm1, %xmm2 + movaps -8 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + + subl $-16 * SIZE, AA + subl $ -8 * SIZE, BB + + subl $1, %eax + jne .L52 + ALIGN_4 + +.L55: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L58 + ALIGN_4 + +.L56: + pshufd $0x44, %xmm1, %xmm2 + movsd -15 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + + addl $2 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: + movups ALPHA, %xmm3 + + addpd %xmm5, %xmm4 + + movsd 0 * SIZE(C1), %xmm0 + movhps 1 * SIZE(C1), %xmm0 + movsd 2 * SIZE(C1), %xmm1 + movhps 3 * SIZE(C1), %xmm1 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd 
%xmm2, %xmm0 + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(C1) + movhps %xmm0, 1 * SIZE(C1) + movlps %xmm1, 2 * SIZE(C1) + movhps %xmm1, 3 * SIZE(C1) + + addl $4 * SIZE, C1 + decl I + jg .L51 + ALIGN_4 + +.L60: + movl M, I + testl $1, I + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + addl %eax, AA + addl %eax, BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -16 * SIZE(BB), %xmm2 + pxor %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L65 + ALIGN_4 + +.L62: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + movaps -14 * SIZE(BB), %xmm2 + + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + movaps -12 * SIZE(BB), %xmm2 + + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + movaps -10 * SIZE(BB), %xmm2 + + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + movaps -8 * SIZE(BB), %xmm2 + + subl $-8 * SIZE, AA + subl $-8 * SIZE, BB + + subl $1, %eax + jne .L62 + ALIGN_4 + +.L65: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulsd %xmm0, %xmm2 + movsd -15 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + movsd -15 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L66 + ALIGN_4 + +.L68: + movups ALPHA, %xmm3 + + addpd %xmm5, %xmm4 + + haddpd %xmm4, %xmm4 + + movsd 0 * SIZE(C1), %xmm0 + movhps 1 * SIZE(C1), %xmm0 + + pshufd $0x44, %xmm4, %xmm2 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + + movlps %xmm0, 0 * SIZE(C1) + movhps %xmm0, 1 * SIZE(C1) + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm3m_kernel_2x4_prescott.S b/kernel/x86/zgemm3m_kernel_2x4_prescott.S new file mode 100644 index 0000000000..a32e0ae940 --- /dev/null +++ b/kernel/x86/zgemm3m_kernel_2x4_prescott.S @@ -0,0 +1,1590 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#define A 32 + STACK + ARGS(%esp) +#define ARG_B 36 + STACK + ARGS(%esp) +#define C 40 + STACK + ARGS(%esp) +#define ARG_LDC 44 + STACK + ARGS(%esp) +#define OFFSET 48 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define BX 4 + STACK(%esp) +#define KK 8 + STACK(%esp) +#define KKK 12 + STACK(%esp) + +#if defined(PENTIUM4) || defined(PENTIUMM) +#define PREFETCH_R (8 * 4) +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi + +#define KERNEL1(address) \ + mulpd %xmm0, %xmm2; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + addpd %xmm2, %xmm4; \ + movddup 1 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm5; \ + movddup 2 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm6; \ + movddup 3 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + movapd 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ + addpd %xmm2, %xmm7; \ + movddup 4 * SIZE + (address) * 2 * SIZE(BB), %xmm2 + +#define KERNEL2(address) \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm4; \ + movddup 5 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm5; \ + movddup 6 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm6; \ + movddup 7 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + movapd 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ + addpd %xmm2, %xmm7; \ + movddup 16 * SIZE + (address) * 2 * SIZE(BB), %xmm2 + +#define KERNEL3(address) \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm4; \ + movddup 9 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm5; \ + movddup 10 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm6; \ + movddup 11 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + movapd 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ + addpd %xmm3, %xmm7; \ + movddup 12 * SIZE + (address) * 2 * SIZE(BB), %xmm3 + +#define KERNEL4(address) \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm4; \ + movddup 13 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm5; \ + movddup 14 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm6; \ + movddup 15 * SIZE + (address) * 2 * 
SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + movapd 16 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ + addpd %xmm3, %xmm7; \ + movddup 24 * SIZE + (address) * 2 * SIZE(BB), %xmm3 + +#define KERNEL5(address) \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm4; \ + movddup 17 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm5; \ + movddup 18 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm6; \ + movddup 19 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + movapd 10 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ + addpd %xmm2, %xmm7 + +#define KERNEL6(address) \ + movddup 20 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm4; \ + movddup 21 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm5; \ + movddup 22 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm6; \ + movddup 23 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + movapd 12 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ + addpd %xmm2, %xmm7; \ + movddup 32 * SIZE + (address) * 2 * SIZE(BB), %xmm2 + +#define KERNEL7(address) \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm4; \ + movddup 25 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm5; \ + movddup 26 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm6; \ + movddup 27 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + movapd 14 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ + addpd %xmm3, %xmm7; \ + movddup 28 * SIZE + (address) * 2 * SIZE(BB), %xmm3 + +#define KERNEL8(address) \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm4; \ + movddup 29 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm5; \ + movddup 30 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm6; \ + movddup 31 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + movapd 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ + addpd %xmm3, %xmm7; \ + movddup 40 * SIZE + (address) * 2 * SIZE(BB), %xmm3 + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + +#ifdef TRMMKERNEL + movl OFFSET, %eax +#ifndef LEFT + negl %eax +#endif + movl %eax, KK +#endif + + sall $ZBASE_SHIFT, LDC + + movl N, %eax + sarl $2, %eax + movl %eax, J + jle .L30 + ALIGN_2 + +.L10: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + sall $BASE_SHIFT + 2, %eax + leal (B, %eax), %eax + movl %eax, BX + + movl C, %esi # coffset = c + movl A, AA # aoffset = a + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 4), BB +#endif + + movl BX, %eax + prefetcht2 0 * SIZE(%eax) + subl $-4 * SIZE, BX + + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movddup 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movddup 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + + leal (LDC, LDC, 2), %eax + +#ifdef PENTIUM4 + prefetchnta 3 * SIZE(%esi) + prefetchnta 3 * SIZE(%esi, LDC, 1) + prefetchnta 3 * SIZE(%esi, LDC, 2) + prefetchnta 3 
* SIZE(%esi, %eax, 1) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + +#ifdef CORE_PRESCOTT + andl $-8, %eax + sall $4, %eax + je .L15 + +.L1X: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + cmpl $128 * 1, %eax + jle .L12 + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + cmpl $128 * 2, %eax + jle .L12 + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + cmpl $128 * 3, %eax + jle .L12 + KERNEL1(16 * 3) + KERNEL2(16 * 3) + KERNEL3(16 * 3) + KERNEL4(16 * 3) + KERNEL5(16 * 3) + KERNEL6(16 * 3) + KERNEL7(16 * 3) + KERNEL8(16 * 3) + cmpl $128 * 4, %eax + jle .L12 + KERNEL1(16 * 4) + KERNEL2(16 * 4) + KERNEL3(16 * 4) + KERNEL4(16 * 4) + KERNEL5(16 * 4) + KERNEL6(16 * 4) + KERNEL7(16 * 4) + KERNEL8(16 * 4) + cmpl $128 * 5, %eax + jle .L12 + KERNEL1(16 * 5) + KERNEL2(16 * 5) + KERNEL3(16 * 5) + KERNEL4(16 * 5) + KERNEL5(16 * 5) + KERNEL6(16 * 5) + KERNEL7(16 * 5) + KERNEL8(16 * 5) + cmpl $128 * 6, %eax + jle .L12 + KERNEL1(16 * 6) + KERNEL2(16 * 6) + KERNEL3(16 * 6) + KERNEL4(16 * 6) + KERNEL5(16 * 6) + KERNEL6(16 * 6) + KERNEL7(16 * 6) + KERNEL8(16 * 6) + cmpl $128 * 7, %eax + jle .L12 + KERNEL1(16 * 7) + KERNEL2(16 * 7) + KERNEL3(16 * 7) + KERNEL4(16 * 7) + KERNEL5(16 * 7) + KERNEL6(16 * 7) + KERNEL7(16 * 7) + KERNEL8(16 * 7) +#if 1 + cmpl $128 * 8, %eax + jle .L12 + KERNEL1(16 * 8) + KERNEL2(16 * 8) + KERNEL3(16 * 8) + KERNEL4(16 * 8) + KERNEL5(16 * 8) + KERNEL6(16 * 8) + KERNEL7(16 * 8) + KERNEL8(16 * 8) + cmpl $128 * 9, %eax + jle .L12 + KERNEL1(16 * 9) + KERNEL2(16 * 9) + KERNEL3(16 * 9) + KERNEL4(16 * 9) + KERNEL5(16 * 9) + KERNEL6(16 * 9) + KERNEL7(16 * 9) + KERNEL8(16 * 9) + cmpl $128 * 10, %eax + jle .L12 + KERNEL1(16 * 10) + KERNEL2(16 * 10) + KERNEL3(16 * 10) + KERNEL4(16 * 10) + KERNEL5(16 * 10) + KERNEL6(16 * 10) + KERNEL7(16 * 10) + KERNEL8(16 * 10) + cmpl $128 * 11, %eax + jle .L12 + KERNEL1(16 * 11) + KERNEL2(16 * 11) + KERNEL3(16 * 11) + KERNEL4(16 * 11) + KERNEL5(16 * 11) + KERNEL6(16 * 11) + KERNEL7(16 * 11) + KERNEL8(16 * 11) + cmpl $128 * 12, %eax + jle .L12 + KERNEL1(16 * 12) + KERNEL2(16 * 12) + KERNEL3(16 * 12) + KERNEL4(16 * 12) + KERNEL5(16 * 12) + KERNEL6(16 * 12) + KERNEL7(16 * 12) + KERNEL8(16 * 12) + cmpl $128 * 13, %eax + jle .L12 + KERNEL1(16 * 13) + KERNEL2(16 * 13) + KERNEL3(16 * 13) + KERNEL4(16 * 13) + KERNEL5(16 * 13) + KERNEL6(16 * 13) + KERNEL7(16 * 13) + KERNEL8(16 * 13) + cmpl $128 * 14, %eax + jle .L12 + KERNEL1(16 * 14) + KERNEL2(16 * 14) + KERNEL3(16 * 14) + KERNEL4(16 * 14) + KERNEL5(16 * 14) + KERNEL6(16 * 14) + KERNEL7(16 * 14) + KERNEL8(16 * 14) + cmpl $128 * 15, %eax + jle .L12 + KERNEL1(16 * 15) + KERNEL2(16 * 15) + KERNEL3(16 * 15) + KERNEL4(16 * 15) + KERNEL5(16 * 15) + KERNEL6(16 * 15) + KERNEL7(16 * 15) + KERNEL8(16 * 15) +#else + addl $32 * 4 * SIZE, AA + addl $32 * 8 * SIZE, BB + subl $128 * 8, %eax + jg .L1X +#endif + +.L12: + leal (AA, %eax, 1), AA # * 16 + leal (BB, %eax, 2), BB # * 64 + +#else + + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + mulpd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + addpd %xmm2, %xmm4 + 
movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 3 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movddup 4 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 5 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm5 + movddup 6 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 7 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movddup 16 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm4 + movddup 9 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm5 + movddup 10 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm6 + movddup 11 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + movapd 6 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm7 + movddup 12 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm4 + movddup 13 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm5 + movddup 14 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm6 + movddup 15 * SIZE(BB), %xmm3 + mulpd %xmm0, %xmm3 + movapd 16 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm7 + movddup 24 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm4 + movddup 17 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm5 + movddup 18 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm6 + movddup 19 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + movapd 10 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm7 + movddup 20 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm4 + movddup 21 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm5 + movddup 22 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + addpd %xmm2, %xmm6 + movddup 23 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm2 + movapd 12 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm7 + movddup 32 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movddup 25 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm5 + movddup 26 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm6 + movddup 27 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 14 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm7 + movddup 28 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movddup 29 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm5 + movddup 30 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm6 + movddup 31 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 24 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm7 + movddup 40 * SIZE(BB), %xmm3 + + addl $32 * SIZE, BB + addl $16 * SIZE, AA + decl %eax + jne .L12 + ALIGN_4 +#endif + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movsd 0 + ALPHA, %xmm3 + movhps 8 + ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 3 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movddup 4 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + leal (LDC, LDC, 2), %eax + + movsd 0 * SIZE(%esi), %xmm0 + movhps 1 * SIZE(%esi), %xmm0 + movsd 2 * SIZE(%esi), %xmm1 + movhps 3 * SIZE(%esi), %xmm1 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 1 * 
SIZE(%esi) + movlps %xmm1, 2 * SIZE(%esi) + movhps %xmm1, 3 * SIZE(%esi) + + movsd 0 * SIZE(%esi, LDC), %xmm0 + movhps 1 * SIZE(%esi, LDC), %xmm0 + movsd 2 * SIZE(%esi, LDC), %xmm1 + movhps 3 * SIZE(%esi, LDC), %xmm1 + + pshufd $0x44, %xmm5, %xmm2 + unpckhpd %xmm5, %xmm5 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm5 + addpd %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC) + movhps %xmm0, 1 * SIZE(%esi, LDC) + movlps %xmm1, 2 * SIZE(%esi, LDC) + movhps %xmm1, 3 * SIZE(%esi, LDC) + + movsd 0 * SIZE(%esi, LDC, 2), %xmm0 + movhps 1 * SIZE(%esi, LDC, 2), %xmm0 + movsd 2 * SIZE(%esi, LDC, 2), %xmm1 + movhps 3 * SIZE(%esi, LDC, 2), %xmm1 + + pshufd $0x44, %xmm6, %xmm2 + unpckhpd %xmm6, %xmm6 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm6 + addpd %xmm6, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC, 2) + movhps %xmm0, 1 * SIZE(%esi, LDC, 2) + movlps %xmm1, 2 * SIZE(%esi, LDC, 2) + movhps %xmm1, 3 * SIZE(%esi, LDC, 2) + + movsd 0 * SIZE(%esi, %eax), %xmm0 + movhps 1 * SIZE(%esi, %eax), %xmm0 + movsd 2 * SIZE(%esi, %eax), %xmm1 + movhps 3 * SIZE(%esi, %eax), %xmm1 + + pshufd $0x44, %xmm7, %xmm2 + unpckhpd %xmm7, %xmm7 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm7 + addpd %xmm7, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, %eax) + movhps %xmm0, 1 * SIZE(%esi, %eax) + movlps %xmm1, 2 * SIZE(%esi, %eax) + movhps %xmm1, 3 * SIZE(%esi, %eax) + + addl $4 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L11 + ALIGN_3 + +.L20: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L29 + + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 4), BB +#endif + + movddup 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movddup 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $4, %eax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movddup 1 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movddup 2 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 10 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd 12 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movddup 3 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 14 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm7 + movddup 4 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 18 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 20 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movddup 5 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 22 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 32 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movddup 6 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 26 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd 28 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movddup 7 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 30 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 40 * SIZE(BB), %xmm3 + 
addpd %xmm0, %xmm7 + movddup 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd 34 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + movapd 36 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movddup 9 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm2 + mulpd 38 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + movapd 48 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm7 + movddup 10 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 42 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm4 + movapd 44 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movddup 11 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 46 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm6 + movapd 56 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm7 + movddup 12 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm2 + mulpd 50 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + movapd 52 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movddup 13 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm2 + mulpd 54 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + movapd 64 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm7 + movddup 14 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 58 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm4 + movapd 60 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movddup 15 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 62 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm6 + movapd 72 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm7 + movddup 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movsd 0 + ALPHA, %xmm3 + movhps 8 + ALPHA, %xmm3 + andl $15, %eax # if (k & 1) + BRANCH + je .L28 + +.L26: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movddup 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + + decl %eax + jg .L26 + ALIGN_4 + +.L28: + leal (%esi, LDC, 1), %eax + + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + leal (LDC, LDC, 2), %eax + + movsd 0 * SIZE(%esi), %xmm0 + movhps 1 * SIZE(%esi), %xmm0 + movsd 0 * SIZE(%esi, LDC), %xmm1 + movhps 1 * SIZE(%esi, LDC), %xmm1 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 1 * SIZE(%esi) + movlps %xmm1, 0 * SIZE(%esi, LDC) + movhps %xmm1, 1 * SIZE(%esi, LDC) + + movsd 0 * SIZE(%esi, LDC, 2), %xmm0 + movhps 1 * SIZE(%esi, LDC, 2), %xmm0 + movsd 0 * SIZE(%esi, %eax), %xmm1 + movhps 1 * SIZE(%esi, %eax), %xmm1 + + pshufd $0x44, %xmm5, %xmm2 + unpckhpd %xmm5, %xmm5 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm5 + addpd %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC, 2) + movhps %xmm0, 1 * SIZE(%esi, LDC, 2) + movlps %xmm1, 0 * SIZE(%esi, %eax) + movhps %xmm1, 1 * SIZE(%esi, %eax) + ALIGN_4 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + + leal (, LDC, 4), %eax + movl BB, B + addl %eax, C # c += 4 * ldc + decl J # j -- + jg .L10 + ALIGN_4 + +.L30: + testl $2, N + je .L60 + + movl C, %esi # coffset = c + movl A, AA # aoffset = a + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L50 + ALIGN_4 + +.L41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), BB +#endif + + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm5, 
%xmm5 + movddup 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movddup 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifdef HAVE_3DNOW + prefetchw 2 * SIZE(%esi) + prefetchw 2 * SIZE(%esi, LDC) +#endif + +#ifdef PENTIUM4 + prefetchnta 3 * SIZE(%esi) + prefetchnta 3 * SIZE(%esi, LDC) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L45 + ALIGN_4 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 3 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movddup 4 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 5 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 6 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + movddup 6 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 7 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 16 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movddup 16 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movddup 9 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 10 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm5 + movddup 10 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm6 + movddup 11 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 12 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm7 + movddup 12 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movddup 13 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 14 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm5 + movddup 14 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm6 + movddup 15 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 24 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm7 + movddup 24 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L42 + ALIGN_4 + +.L45: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movsd 0 + ALPHA, %xmm3 + movhps 8 + ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L46 + ALIGN_4 + +.L48: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 1 * SIZE(%esi), %xmm0 + movsd 2 * SIZE(%esi), %xmm1 + movhps 3 * SIZE(%esi), %xmm1 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 1 * SIZE(%esi) + movlps %xmm1, 2 * SIZE(%esi) + movhps %xmm1, 3 * SIZE(%esi) + + movsd 0 * SIZE(%esi, LDC), %xmm0 + movhps 1 * SIZE(%esi, LDC), %xmm0 + movsd 2 * SIZE(%esi, LDC), %xmm1 + movhps 3 * SIZE(%esi, LDC), %xmm1 + + pshufd $0x44, %xmm5, %xmm2 + unpckhpd %xmm5, %xmm5 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm5 + addpd %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC) + movhps %xmm0, 1 * SIZE(%esi, LDC) + movlps %xmm1, 2 * SIZE(%esi, LDC) + movhps %xmm1, 3 * SIZE(%esi, LDC) + + addl $4 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L41 + ALIGN_4 + +.L50: + movl M, %ebx + testl $1, 
%ebx # i = (m >> 2) + jle .L59 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), BB +#endif + + movddup 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movddup 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $4, %eax + je .L55 + ALIGN_4 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm0, %xmm2 + movddup 1 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + mulpd 2 * SIZE(BB), %xmm0 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movddup 2 * SIZE(AA), %xmm0 + mulpd 4 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm6 + movddup 3 * SIZE(AA), %xmm0 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm7 + movddup 4 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + movddup 5 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm4 + mulpd 10 * SIZE(BB), %xmm0 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movddup 6 * SIZE(AA), %xmm0 + mulpd 12 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm6 + movddup 7 * SIZE(AA), %xmm0 + mulpd 14 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm7 + movddup 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + movddup 9 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm4 + mulpd 18 * SIZE(BB), %xmm1 + movapd 32 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movddup 10 * SIZE(AA), %xmm1 + mulpd 20 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm6 + movddup 11 * SIZE(AA), %xmm1 + mulpd 22 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm7 + movddup 12 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + movddup 13 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm4 + mulpd 26 * SIZE(BB), %xmm1 + movapd 40 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movddup 14 * SIZE(AA), %xmm1 + mulpd 28 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm6 + movddup 15 * SIZE(AA), %xmm1 + mulpd 30 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm7 + movddup 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L52 + ALIGN_4 + +.L55: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movsd 0 + ALPHA, %xmm3 + movhps 8 + ALPHA, %xmm3 + andl $15, %eax # if (k & 1) + BRANCH + je .L58 + +.L56: + mulpd %xmm0, %xmm2 + movddup 1 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + movapd 2 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + addpd %xmm6, %xmm4 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 1 * SIZE(%esi), %xmm0 + movsd 0 * SIZE(%esi, LDC), %xmm1 + movhps 1 * SIZE(%esi, LDC), %xmm1 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 1 * SIZE(%esi) + movlps %xmm1, 0 * SIZE(%esi, LDC) + movhps %xmm1, 1 * SIZE(%esi, LDC) + ALIGN_4 + +.L59: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leal (, LDC, 2), %eax + movl BB, B + addl %eax, C # c += 4 * ldc + ALIGN_4 + +.L60: + testl $1, N + je .L999 + + movl C, %esi # coffset = c + movl A, AA # aoffset = a + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $1, %ebx # i = 
(m >> 2) + jle .L80 + ALIGN_4 + +.L71: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), BB +#endif + + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movddup 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movddup 4 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifdef PENTIUM4 + prefetchnta 3 * SIZE(%esi) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm2, %xmm0 + movddup 1 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm4 + movapd 16 * SIZE(AA), %xmm0 + mulpd 2 * SIZE(AA), %xmm2 + addpd %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + mulpd 4 * SIZE(AA), %xmm2 + addpd %xmm2, %xmm6 + movddup 3 * SIZE(BB), %xmm2 + mulpd 6 * SIZE(AA), %xmm2 + addpd %xmm2, %xmm7 + movddup 8 * SIZE(BB), %xmm2 + mulpd %xmm3, %xmm1 + movddup 5 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm4 + movapd 24 * SIZE(AA), %xmm1 + mulpd 10 * SIZE(AA), %xmm3 + addpd %xmm3, %xmm5 + movddup 6 * SIZE(BB), %xmm3 + mulpd 12 * SIZE(AA), %xmm3 + addpd %xmm3, %xmm6 + movddup 7 * SIZE(BB), %xmm3 + mulpd 14 * SIZE(AA), %xmm3 + addpd %xmm3, %xmm7 + movddup 12 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $ 8 * SIZE, BB + decl %eax + jne .L72 + ALIGN_4 + +.L75: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movsd 0 + ALPHA, %xmm3 + movhps 8 + ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L78 + ALIGN_3 + +.L76: + mulpd %xmm2, %xmm0 + movddup 1 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm4 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + addpd %xmm6, %xmm4 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 1 * SIZE(%esi), %xmm0 + movsd 2 * SIZE(%esi), %xmm1 + movhps 3 * SIZE(%esi), %xmm1 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 1 * SIZE(%esi) + movlps %xmm1, 2 * SIZE(%esi) + movhps %xmm1, 3 * SIZE(%esi) + + addl $4 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L71 + ALIGN_4 + +.L80: + movl M, %ebx + testl $1, %ebx # i = (m >> 2) + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), BB +#endif + + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $4, %eax + je .L85 + ALIGN_4 + +.L82: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm0, %xmm2 + movapd 2 * 
SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + mulpd 2 * SIZE(BB), %xmm0 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 4 * SIZE(AA), %xmm0 + mulpd 4 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm6 + movapd 6 * SIZE(AA), %xmm0 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm7 + movapd 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm3 + movapd 10 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm4 + mulpd 10 * SIZE(BB), %xmm1 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movapd 12 * SIZE(AA), %xmm1 + mulpd 12 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm6 + movapd 14 * SIZE(AA), %xmm1 + mulpd 14 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm7 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L82 + ALIGN_4 + +.L85: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movsd 0 + ALPHA, %xmm3 + movhps 8 + ALPHA, %xmm3 + andl $15, %eax # if (k & 1) + BRANCH + je .L88 + +.L86: + mulsd %xmm0, %xmm2 + movsd 1 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + movsd 1 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L86 + ALIGN_4 + +.L88: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + addpd %xmm6, %xmm4 + + haddpd %xmm4, %xmm4 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 1 * SIZE(%esi), %xmm0 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 1 * SIZE(%esi) + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm3m_kernel_4x2_core2.S b/kernel/x86/zgemm3m_kernel_4x2_core2.S new file mode 100644 index 0000000000..0c01de87e8 --- /dev/null +++ b/kernel/x86/zgemm3m_kernel_4x2_core2.S @@ -0,0 +1,1328 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define OLD_M 4 + STACK + ARGS(%esi) +#define OLD_N 8 + STACK + ARGS(%esi) +#define OLD_K 12 + STACK + ARGS(%esi) +#define OLD_ALPHA_R 16 + STACK + ARGS(%esi) +#define OLD_ALPHA_I 24 + STACK + ARGS(%esi) +#define OLD_A 32 + STACK + ARGS(%esi) +#define OLD_B 36 + STACK + ARGS(%esi) +#define OLD_C 40 + STACK + ARGS(%esi) +#define OLD_LDC 44 + STACK + ARGS(%esi) + +#define ALPHA 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define BX 40(%esp) +#define OLD_STACK 44(%esp) +#define OFFSET 48(%esp) +#define KK 52(%esp) +#define KKK 56(%esp) +#define BUFFER 256(%esp) + +#define PREFETCH_R (8 * 16 + 0) +#define PREFETCH_W (PREFETCH_R * 2) + +#define PREFETCHSIZE (8 * 7 + 4) +#define PREFETCH prefetcht0 + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi +#define C1 %esi +#define I %ebx + + PROLOGUE + PROFCODE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + movl %esp, %esi # save old stack + + subl $512 + LOCAL_BUFFER_SIZE, %esp + andl $-4096, %esp # align stack + + STACK_TOUCHING + + movl OLD_M, %ebx + movl OLD_N, %eax + movl OLD_K, %ecx + movl OLD_A, %edx + movsd OLD_ALPHA_R, %xmm0 + movhps OLD_ALPHA_I, %xmm0 + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK + + movl OLD_B, B + movl OLD_C, %ebx + + movaps %xmm0, ALPHA + movl %ebx, C + movl OLD_LDC, LDC + + subl $-16 * SIZE, A + subl $-16 * SIZE, B + + sall $ZBASE_SHIFT, LDC + + sarl $1, %eax + movl %eax, J + jle .L40 + ALIGN_4 + +.L01: + leal 16 * SIZE + BUFFER, BB + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + sarl $2, %eax + jle .L05 + ALIGN_4 + +.L02: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + movddup -14 * SIZE(B), %xmm2 + movddup -13 * SIZE(B), %xmm3 + movddup -12 * SIZE(B), %xmm4 + movddup -11 * SIZE(B), %xmm5 + movddup -10 * SIZE(B), %xmm6 + movddup -9 * SIZE(B), %xmm7 + + prefetcht0 (PREFETCH_R + 0) * SIZE(B) + + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm1, -14 * SIZE(BB) + movapd %xmm2, -12 * SIZE(BB) + movapd %xmm3, -10 * SIZE(BB) + + movapd %xmm4, -8 * SIZE(BB) + movapd %xmm5, -6 * SIZE(BB) + movapd %xmm6, -4 * SIZE(BB) + movapd %xmm7, -2 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $16 * SIZE, BB + decl %eax + jne .L02 + ALIGN_4 + +.L05: + movl K, %eax + andl $3, %eax + BRANCH + jle .L10 + ALIGN_4 + +.L06: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm1, -14 * SIZE(BB) + addl $2 * SIZE, B + addl $4 * SIZE, BB + decl %eax + jne .L06 + ALIGN_4 + +.L10: + movl B, BX + + movl C, C1 + movl A, AA + movl M, I + sarl $2, I + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal 16 * SIZE + BUFFER, BB +#else + leal 16 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB /* because it's doubled */ +#endif + + movapd -16 * SIZE(AA), %xmm0 + pxor %xmm4, 
%xmm4 + movapd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -8 * SIZE(AA), %xmm3 + pxor %xmm6, %xmm6 + prefetcht0 3 * SIZE(C1) + pxor %xmm7, %xmm7 + prefetcht0 3 * SIZE(C1, LDC) + movapd %xmm1, %xmm2 + + movl BX, %eax + prefetcht0 (%eax) + subl $-8 * SIZE, %eax + movl %eax, BX + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movapd -14 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + addpd %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movapd -12 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm1 + movapd -12 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm7 + PADDING; + movapd %xmm2, %xmm1 + + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movapd -10 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm0 + addpd %xmm0, %xmm5 + movapd -10 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm6 + movapd -8 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + PADDING; + movapd 0 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + PADDING; + movapd %xmm1, %xmm2 + + mulpd %xmm3, %xmm1 + addpd %xmm1, %xmm4 + movapd -6 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm5 + movapd -6 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm6 + movapd -4 * SIZE(BB), %xmm2 + mulpd %xmm3, %xmm1 + movapd -4 * SIZE(AA), %xmm3 + addpd %xmm1, %xmm7 + PADDING; + movapd %xmm2, %xmm1 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm4 + movapd -2 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm5 + movapd -2 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm1 + addpd %xmm1, %xmm6 + PADDING; + movapd 0 * SIZE(BB), %xmm1 + mulpd %xmm3, %xmm2 + movapd 8 * SIZE(AA), %xmm3 + addpd %xmm2, %xmm7 + PADDING; + movapd %xmm1, %xmm2 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movapd 2 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movapd 4 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm1 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm7 + PADDING; + movapd %xmm2, %xmm1 + + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movapd 6 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm0 + addpd %xmm0, %xmm5 + movapd 6 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm6 + movapd 8 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movapd 16 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + PADDING; + movapd %xmm1, %xmm2 + + mulpd %xmm3, %xmm1 + addpd %xmm1, %xmm4 + movapd 10 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm5 + movapd 10 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm6 + movapd 12 * SIZE(BB), %xmm2 + mulpd %xmm3, %xmm1 + movapd 12 * SIZE(AA), %xmm3 + addpd %xmm1, %xmm7 + PADDING; + movapd %xmm2, %xmm1 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm4 + movapd 14 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm3 + subl $-32 * SIZE, BB + addpd %xmm3, %xmm5 + movapd 14 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm1 + addpd %xmm1, %xmm6 + movapd -16 * SIZE(BB), %xmm1 + mulpd %xmm3, %xmm2 + movapd 24 * SIZE(AA), %xmm3 + addpd %xmm2, %xmm7 + PADDING; + movapd %xmm1, %xmm2 + + subl $-32 * SIZE, AA + decl %eax + BRANCH + jne .L12 + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + + andl $7, %eax + BRANCH + je .L18 + ALIGN_4 + +.L16: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movapd -14 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + addpd 
%xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm1 + movapd -12 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm6 + addpd %xmm1, %xmm7 + movapd -12 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + movsd 0 * SIZE(%esi), %xmm0 + movhps 1 * SIZE(%esi), %xmm0 + movsd 2 * SIZE(%esi), %xmm1 + movhps 3 * SIZE(%esi), %xmm1 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 1 * SIZE(%esi) + movlps %xmm1, 2 * SIZE(%esi) + movhps %xmm1, 3 * SIZE(%esi) + + movsd 4 * SIZE(%esi), %xmm0 + movhps 5 * SIZE(%esi), %xmm0 + movsd 6 * SIZE(%esi), %xmm1 + movhps 7 * SIZE(%esi), %xmm1 + + pshufd $0x44, %xmm6, %xmm2 + unpckhpd %xmm6, %xmm6 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm6 + addpd %xmm6, %xmm1 + + movlps %xmm0, 4 * SIZE(%esi) + movhps %xmm0, 5 * SIZE(%esi) + movlps %xmm1, 6 * SIZE(%esi) + movhps %xmm1, 7 * SIZE(%esi) + + movsd 0 * SIZE(%esi, LDC), %xmm0 + movhps 1 * SIZE(%esi, LDC), %xmm0 + movsd 2 * SIZE(%esi, LDC), %xmm1 + movhps 3 * SIZE(%esi, LDC), %xmm1 + + pshufd $0x44, %xmm5, %xmm2 + unpckhpd %xmm5, %xmm5 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm5 + addpd %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC) + movhps %xmm0, 1 * SIZE(%esi, LDC) + movlps %xmm1, 2 * SIZE(%esi, LDC) + movhps %xmm1, 3 * SIZE(%esi, LDC) + + movsd 4 * SIZE(%esi, LDC), %xmm0 + movhps 5 * SIZE(%esi, LDC), %xmm0 + movsd 6 * SIZE(%esi, LDC), %xmm1 + movhps 7 * SIZE(%esi, LDC), %xmm1 + + pshufd $0x44, %xmm7, %xmm2 + unpckhpd %xmm7, %xmm7 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm7 + addpd %xmm7, %xmm1 + + movlps %xmm0, 4 * SIZE(%esi, LDC) + movhps %xmm0, 5 * SIZE(%esi, LDC) + movlps %xmm1, 6 * SIZE(%esi, LDC) + movhps %xmm1, 7 * SIZE(%esi, LDC) + + addl $8 * SIZE, C1 + decl I + jg .L11 + ALIGN_4 + +.L20: + movl M, I + testl $2, I + jle .L30 + +.L21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leal 16 * SIZE + BUFFER, BB +#else + leal 16 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB /* because it's doubled */ +#endif + + movapd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -8 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movapd -8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax + addl $2, %eax + movl %eax, KKK +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BB), %xmm0 + addpd %xmm1, %xmm4 + movapd -12 * SIZE(BB), %xmm1 + addpd %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm1 + mulpd -10 * SIZE(BB), %xmm0 + addpd %xmm1, %xmm6 + movapd 0 * SIZE(BB), %xmm1 + addpd %xmm0, %xmm7 + movapd -12 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd -6 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd -4 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movapd -10 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd -2 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 8 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm7 + movapd 0 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm1 + mulpd 2 * SIZE(BB), %xmm2 + addpd %xmm1, 
%xmm4 + movapd 4 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm5 + movapd -6 * SIZE(AA), %xmm2 + mulpd %xmm2, %xmm1 + mulpd 6 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm6 + movapd 16 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm7 + movapd -4 * SIZE(AA), %xmm2 + mulpd %xmm2, %xmm3 + mulpd 10 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm4 + movapd 12 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm5 + movapd -2 * SIZE(AA), %xmm2 + mulpd %xmm2, %xmm3 + mulpd 14 * SIZE(BB), %xmm2 + addpd %xmm3, %xmm6 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm2, %xmm7 + movapd 8 * SIZE(AA), %xmm2 + + subl $-16 * SIZE, AA + addl $ 32 * SIZE, BB + decl %eax + jne .L22 + ALIGN_4 + +.L25: + movaps ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L28 + ALIGN_4 + +.L26: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BB), %xmm0 + addpd %xmm1, %xmm4 + movapd -12 * SIZE(BB), %xmm1 + addpd %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 1 * SIZE(%esi), %xmm0 + movsd 2 * SIZE(%esi), %xmm1 + movhps 3 * SIZE(%esi), %xmm1 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 1 * SIZE(%esi) + movlps %xmm1, 2 * SIZE(%esi) + movhps %xmm1, 3 * SIZE(%esi) + + movsd 0 * SIZE(%esi, LDC), %xmm0 + movhps 1 * SIZE(%esi, LDC), %xmm0 + movsd 2 * SIZE(%esi, LDC), %xmm1 + movhps 3 * SIZE(%esi, LDC), %xmm1 + + pshufd $0x44, %xmm5, %xmm2 + unpckhpd %xmm5, %xmm5 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm5 + addpd %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC) + movhps %xmm0, 1 * SIZE(%esi, LDC) + movlps %xmm1, 2 * SIZE(%esi, LDC) + movhps %xmm1, 3 * SIZE(%esi, LDC) + + addl $4 * SIZE, C1 + ALIGN_4 + +.L30: + movl M, I + testl $1, I + jle .L39 + +.L31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leal 16 * SIZE + BUFFER, BB +#else + leal 16 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB /* because it's doubled */ +#endif + + movsd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movsd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movsd -12 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movsd -8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L35 + ALIGN_4 + +.L32: + mulsd %xmm0, %xmm1 + mulsd -14 * SIZE(BB), %xmm0 + addsd %xmm1, %xmm4 + movsd -12 * SIZE(BB), %xmm1 + addsd %xmm0, %xmm5 + movsd -15 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm1 + mulsd -10 * SIZE(BB), %xmm0 + addsd %xmm1, %xmm6 + movsd 0 * SIZE(BB), %xmm1 + addsd %xmm0, %xmm7 + movsd -14 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + mulsd -6 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm4 + movsd -4 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm5 + movsd -13 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + mulsd -2 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm6 + movsd 8 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm7 + movsd -8 * SIZE(AA), %xmm0 + mulsd %xmm2, %xmm1 + mulsd 2 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm4 + movsd 4 * SIZE(BB), %xmm1 + addsd 
%xmm2, %xmm5 + movsd -11 * SIZE(AA), %xmm2 + mulsd %xmm2, %xmm1 + mulsd 6 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm6 + movsd 16 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm7 + movsd -10 * SIZE(AA), %xmm2 + mulsd %xmm2, %xmm3 + mulsd 10 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm4 + movsd 12 * SIZE(BB), %xmm3 + addsd %xmm2, %xmm5 + movsd -9 * SIZE(AA), %xmm2 + mulsd %xmm2, %xmm3 + mulsd 14 * SIZE(BB), %xmm2 + addsd %xmm3, %xmm6 + movsd 24 * SIZE(BB), %xmm3 + addsd %xmm2, %xmm7 + movsd -4 * SIZE(AA), %xmm2 + + subl $-8 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L32 + ALIGN_4 + +.L35: + movaps ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulsd %xmm0, %xmm1 + mulsd -14 * SIZE(BB), %xmm0 + addsd %xmm1, %xmm4 + movsd -12 * SIZE(BB), %xmm1 + addsd %xmm0, %xmm5 + movsd -15 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L36 + ALIGN_4 + +.L38: + addsd %xmm6, %xmm4 + addsd %xmm7, %xmm5 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 1 * SIZE(%esi), %xmm0 + movsd 0 * SIZE(%esi, LDC), %xmm1 + movhps 1 * SIZE(%esi, LDC), %xmm1 + + unpcklpd %xmm4, %xmm4 + unpcklpd %xmm5, %xmm5 + + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm3, %xmm5 + addpd %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 1 * SIZE(%esi) + movlps %xmm1, 0 * SIZE(%esi, LDC) + movhps %xmm1, 1 * SIZE(%esi, LDC) + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leal (, LDC, 2), %eax + addl %eax, C + decl J + jg .L01 + ALIGN_4 + +.L40: + movl N, %eax + testl $1, %eax + jle .L999 + ALIGN_4 + +.L41: + leal 16 * SIZE + BUFFER, BB + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + sarl $3, %eax + jle .L45 + ALIGN_4 + +.L42: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + movddup -14 * SIZE(B), %xmm2 + movddup -13 * SIZE(B), %xmm3 + movddup -12 * SIZE(B), %xmm4 + movddup -11 * SIZE(B), %xmm5 + movddup -10 * SIZE(B), %xmm6 + movddup -9 * SIZE(B), %xmm7 + + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm1, -14 * SIZE(BB) + movapd %xmm2, -12 * SIZE(BB) + movapd %xmm3, -10 * SIZE(BB) + movapd %xmm4, -8 * SIZE(BB) + movapd %xmm5, -6 * SIZE(BB) + movapd %xmm6, -4 * SIZE(BB) + movapd %xmm7, -2 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $16 * SIZE, BB + decl %eax + jne .L42 + ALIGN_4 + +.L45: + movl K, %eax + andl $7, %eax + BRANCH + jle .L50 + ALIGN_4 + +.L46: + movddup -16 * SIZE(B), %xmm0 + + movapd %xmm0, -16 * SIZE(BB) + addl $1 * SIZE, B + addl $2 * SIZE, BB + decl %eax + jne .L46 + ALIGN_4 + +.L50: + movl C, C1 + movl A, AA + movl M, I + sarl $2, I + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leal 16 * SIZE + BUFFER, BB +#else + leal 16 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 2), BB +#endif + + movapd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -8 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movapd -8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + + prefetcht0 3 * SIZE(C1) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je 
.L55 + ALIGN_4 + +.L52: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AA), %xmm1 + addpd %xmm0, %xmm4 + movapd -12 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm6 + movapd -14 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + mulpd -10 * SIZE(AA), %xmm1 + addpd %xmm0, %xmm5 + movapd 0 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm7 + movapd -12 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm2 + mulpd -6 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm4 + movapd -4 * SIZE(AA), %xmm2 + addpd %xmm1, %xmm6 + movapd -10 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm2 + mulpd -2 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm5 + movapd 8 * SIZE(AA), %xmm2 + addpd %xmm1, %xmm7 + movapd 0 * SIZE(BB), %xmm1 + mulpd %xmm3, %xmm0 + mulpd 2 * SIZE(AA), %xmm3 + addpd %xmm0, %xmm4 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm6 + movapd -6 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm0 + mulpd 6 * SIZE(AA), %xmm3 + addpd %xmm0, %xmm5 + movapd 16 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm7 + movapd -4 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm2 + mulpd 10 * SIZE(AA), %xmm3 + addpd %xmm2, %xmm4 + movapd 12 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm6 + movapd -2 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm2 + mulpd 14 * SIZE(AA), %xmm3 + addpd %xmm2, %xmm5 + movapd 24 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm7 + movapd 8 * SIZE(BB), %xmm3 + + addl $ 32 * SIZE, AA + subl $-16 * SIZE, BB + decl %eax + jne .L52 + ALIGN_4 + +.L55: + movaps ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L58 + ALIGN_4 + +.L56: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AA), %xmm1 + addpd %xmm0, %xmm4 + movapd -12 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm6 + movapd -14 * SIZE(BB), %xmm1 + + addl $4 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 1 * SIZE(%esi), %xmm0 + movsd 2 * SIZE(%esi), %xmm1 + movhps 3 * SIZE(%esi), %xmm1 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 1 * SIZE(%esi) + movlps %xmm1, 2 * SIZE(%esi) + movhps %xmm1, 3 * SIZE(%esi) + + movsd 4 * SIZE(%esi), %xmm0 + movhps 5 * SIZE(%esi), %xmm0 + movsd 6 * SIZE(%esi), %xmm1 + movhps 7 * SIZE(%esi), %xmm1 + + pshufd $0x44, %xmm6, %xmm2 + unpckhpd %xmm6, %xmm6 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm6 + addpd %xmm6, %xmm1 + + movlps %xmm0, 4 * SIZE(%esi) + movhps %xmm0, 5 * SIZE(%esi) + movlps %xmm1, 6 * SIZE(%esi) + movhps %xmm1, 7 * SIZE(%esi) + + addl $8 * SIZE, C1 + decl I + jg .L51 + ALIGN_4 + +.L60: + movl M, I + testl $2, I + jle .L70 + +.L61: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leal 16 * SIZE + BUFFER, BB +#else + leal 16 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + + movapd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -8 * SIZE(AA), %xmm2 + movapd -8 * SIZE(BB), %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L65 + ALIGN_4 + +.L62: + mulpd %xmm0, %xmm1 + movapd -14 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm4 + movapd -14 * SIZE(BB), 
%xmm1 + mulpd %xmm0, %xmm1 + movapd -12 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm5 + movapd -12 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm1 + movapd -10 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm4 + movapd -10 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm1 + movapd 0 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm5 + movapd 0 * SIZE(BB), %xmm1 + mulpd %xmm2, %xmm3 + movapd -6 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm4 + movapd -6 * SIZE(BB), %xmm3 + mulpd %xmm2, %xmm3 + movapd -4 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm5 + movapd -4 * SIZE(BB), %xmm3 + mulpd %xmm2, %xmm3 + movapd -2 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm4 + movapd -2 * SIZE(BB), %xmm3 + mulpd %xmm2, %xmm3 + movapd 8 * SIZE(AA), %xmm2 + addpd %xmm3, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + + subl $-16 * SIZE, AA + subl $-16 * SIZE, BB + decl %eax + jne .L62 + ALIGN_4 + +.L65: + movaps ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulpd %xmm0, %xmm1 + movapd -14 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm4 + movapd -14 * SIZE(BB), %xmm1 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L66 + ALIGN_4 + +.L68: + addpd %xmm5, %xmm4 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 1 * SIZE(%esi), %xmm0 + movsd 2 * SIZE(%esi), %xmm1 + movhps 3 * SIZE(%esi), %xmm1 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 1 * SIZE(%esi) + movlps %xmm1, 2 * SIZE(%esi) + movhps %xmm1, 3 * SIZE(%esi) + + addl $4 * SIZE, C1 + ALIGN_4 + +.L70: + movl M, I + testl $1, I + jle .L79 + +.L71: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leal 16 * SIZE + BUFFER, BB +#else + leal 16 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + + movsd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movsd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movsd -8 * SIZE(BB), %xmm3 + movsd -12 * SIZE(AA), %xmm2 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax + addl $1, %eax + movl %eax, KKK +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + mulsd %xmm0, %xmm1 + movsd -15 * SIZE(AA), %xmm0 + addsd %xmm1, %xmm4 + movsd -14 * SIZE(BB), %xmm1 + mulsd %xmm0, %xmm1 + movsd -14 * SIZE(AA), %xmm0 + addsd %xmm1, %xmm5 + movsd -12 * SIZE(BB), %xmm1 + mulsd %xmm0, %xmm1 + movsd -13 * SIZE(AA), %xmm0 + addsd %xmm1, %xmm4 + movsd -10 * SIZE(BB), %xmm1 + mulsd %xmm0, %xmm1 + movsd -8 * SIZE(AA), %xmm0 + addsd %xmm1, %xmm5 + movsd -0 * SIZE(BB), %xmm1 + mulsd %xmm2, %xmm3 + movsd -11 * SIZE(AA), %xmm2 + addsd %xmm3, %xmm4 + movsd -6 * SIZE(BB), %xmm3 + mulsd %xmm2, %xmm3 + movsd -10 * SIZE(AA), %xmm2 + addsd %xmm3, %xmm5 + movsd -4 * SIZE(BB), %xmm3 + mulsd %xmm2, %xmm3 + movsd -9 * SIZE(AA), %xmm2 + addsd %xmm3, %xmm4 + movsd -2 * SIZE(BB), %xmm3 + mulsd %xmm2, %xmm3 + movsd -4 * SIZE(AA), %xmm2 + addsd %xmm3, %xmm5 + movsd 8 * SIZE(BB), %xmm3 + + subl $ -8 * SIZE, AA + subl $-16 * SIZE, BB + decl %eax + jne .L72 + ALIGN_4 + +.L75: + movaps ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulsd %xmm0, %xmm1 + movsd -15 * SIZE(AA), %xmm0 + addsd %xmm1, %xmm4 + movsd -14 * SIZE(BB), %xmm1 + 
+ addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: + addsd %xmm5, %xmm4 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 1 * SIZE(%esi), %xmm0 + + unpcklpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm0 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 1 * SIZE(%esi) + ALIGN_4 + +.L79: + addl LDC, C + ALIGN_4 + + +.L999: + movl OLD_STACK, %esp + + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm3m_kernel_4x2_northwood.S b/kernel/x86/zgemm3m_kernel_4x2_northwood.S new file mode 100644 index 0000000000..fb7d63954b --- /dev/null +++ b/kernel/x86/zgemm3m_kernel_4x2_northwood.S @@ -0,0 +1,1522 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define PREFETCHSIZE (8 * 4) + +#if !defined(HAVE_SSE2) || !defined(HAVE_MMX) +#error You have to check your configuration. 
+#endif + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_ALPHA_R 16 + STACK + ARGS(%esi) +#define STACK_ALPHA_I 24 + STACK + ARGS(%esi) +#define STACK_A 32 + STACK + ARGS(%esi) +#define STACK_B 36 + STACK + ARGS(%esi) +#define STACK_C 40 + STACK + ARGS(%esi) +#define STACK_LDC 44 + STACK + ARGS(%esi) +#define STACK_OFFT 48 + STACK + ARGS(%esi) + +#define ALPHA 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define BX 40(%esp) +#define OLD_STACK 44(%esp) +#define OFFSET 48(%esp) +#define KK 52(%esp) +#define KKK 56(%esp) +#define BUFFER 128(%esp) + +#define B %edi +#define LDC %ebp + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#define AA %edx +#define BB %ecx + +#define KERNEL1(address) \ + mulpd %xmm0, %xmm2; \ + mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 0 * SIZE + (address) * SIZE(BB), %xmm2; \ + movq (PREFETCHSIZE + 0) * SIZE + (address) * SIZE(AA), %mm2; \ + addpd %xmm0, %xmm5; \ + movapd 2 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 4 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulpd %xmm0, %xmm2; \ + mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 6 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 16 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + movq (PREFETCHSIZE + 8) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm1, %xmm3; \ + mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 8 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 10 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 12 * SIZE + (address) * SIZE(AA), %xmm1 + +#define KERNEL4(address) \ + mulpd %xmm1, %xmm3; \ + mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 14 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 24 * SIZE + (address) * SIZE(AA), %xmm1 + +#define KERNEL5(address) \ + mulpd %xmm0, %xmm2; \ + mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \ + movq (PREFETCHSIZE + 16) * SIZE + (address) * SIZE(AA), %mm2; \ + addpd %xmm0, %xmm5; \ + movapd 18 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 20 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL6(address) \ + mulpd %xmm0, %xmm2; \ + mulpd 22 * SIZE + 
(address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 22 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 32 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 32 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL7(address) \ + movq (PREFETCHSIZE + 24) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm1, %xmm3; \ + mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 26 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 28 * SIZE + (address) * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulpd %xmm1, %xmm3; \ + mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 30 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 40 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 40 * SIZE + (address) * SIZE(AA), %xmm1 + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movd STACK_M, %mm0 + movl STACK_N, %eax + movd STACK_K, %mm1 + movd STACK_A, %mm2 + movsd STACK_ALPHA_R, %xmm0 + movhps STACK_ALPHA_I, %xmm0 + movl STACK_B, B + movd STACK_C, %mm3 + movl STACK_LDC, LDC +#ifdef TRMMKERNEL + movd STACK_OFFT, %mm4 +#endif + + movaps %xmm0, ALPHA + + movd %mm1, K + movl %eax, N + movd %mm0, M + movd %mm2, A + movd %mm3, C + movl %esi, OLD_STACK +#ifdef TRMMKERNEL + movd %mm4, OFFSET + movd %mm4, KK +#ifndef LEFT + negl KK +#endif +#endif + + sall $ZBASE_SHIFT, LDC + + sarl $1, %eax # j = (n >> 1) + movl %eax, J + jle .L100 + ALIGN_2 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + +/* Copying to Sub Buffer */ + leal BUFFER, %ecx + movl K, %eax + sarl $2, %eax + jle .L03 + ALIGN_2 + +.L02: + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm2 + movsd 3 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm4 + movsd 5 * SIZE(B), %xmm5 + movsd 6 * SIZE(B), %xmm6 + movsd 7 * SIZE(B), %xmm7 + + unpcklpd %xmm0, %xmm0 + unpcklpd %xmm1, %xmm1 + unpcklpd %xmm2, %xmm2 + unpcklpd %xmm3, %xmm3 + unpcklpd %xmm4, %xmm4 + unpcklpd %xmm5, %xmm5 + unpcklpd %xmm6, %xmm6 + unpcklpd %xmm7, %xmm7 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) + movapd %xmm2, 4 * SIZE(%ecx) + movapd %xmm3, 6 * SIZE(%ecx) + movapd %xmm4, 8 * SIZE(%ecx) + movapd %xmm5, 10 * SIZE(%ecx) + movapd %xmm6, 12 * SIZE(%ecx) + movapd %xmm7, 14 * SIZE(%ecx) + + prefetcht0 104 * SIZE(B) + + addl $ 8 * SIZE, B + subl $-16 * SIZE, %ecx + decl %eax + BRANCH + jne .L02 + ALIGN_2 + +.L03: + movl K, %eax + andl $3, %eax + BRANCH + jle .L05 + ALIGN_4 + +.L04: + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + + unpcklpd %xmm0, %xmm0 + unpcklpd %xmm1, %xmm1 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) + + addl $2 * SIZE, B + addl $4 * SIZE, %ecx + decl %eax 
+ BRANCH + jne .L04 + ALIGN_4 + +.L05: + movl B, BX + + movl C, %esi # coffset = c + movl A, %edx # aoffset = a + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + NOBRANCH + jle .L30 + ALIGN_4 + +.L10: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movapd 0 * SIZE + BUFFER, %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE + BUFFER, %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#else + + leal BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#endif + + prefetchnta 3 * SIZE(%esi) + prefetchnta 3 * SIZE(%esi, LDC) + + movl BX, %eax + prefetcht2 0 * SIZE(%eax) + subl $-8 * SIZE, %eax + movl %eax, BX + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + + +#ifdef PENTIUM4 + andl $-8, %eax + NOBRANCH + je .L12 + sall $3, %eax + .align 8 + +.L1X: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + cmpl $64 * 1, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 1) + KERNEL2(32 * 1) + KERNEL3(32 * 1) + KERNEL4(32 * 1) + KERNEL5(32 * 1) + KERNEL6(32 * 1) + KERNEL7(32 * 1) + KERNEL8(32 * 1) + cmpl $64 * 2, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 2) + KERNEL2(32 * 2) + KERNEL3(32 * 2) + KERNEL4(32 * 2) + KERNEL5(32 * 2) + KERNEL6(32 * 2) + KERNEL7(32 * 2) + KERNEL8(32 * 2) + cmpl $64 * 3, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 3) + KERNEL2(32 * 3) + KERNEL3(32 * 3) + KERNEL4(32 * 3) + KERNEL5(32 * 3) + KERNEL6(32 * 3) + KERNEL7(32 * 3) + KERNEL8(32 * 3) + cmpl $64 * 4, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 4) + KERNEL2(32 * 4) + KERNEL3(32 * 4) + KERNEL4(32 * 4) + KERNEL5(32 * 4) + KERNEL6(32 * 4) + KERNEL7(32 * 4) + KERNEL8(32 * 4) + cmpl $64 * 5, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 5) + KERNEL2(32 * 5) + KERNEL3(32 * 5) + KERNEL4(32 * 5) + KERNEL5(32 * 5) + KERNEL6(32 * 5) + KERNEL7(32 * 5) + KERNEL8(32 * 5) + cmpl $64 * 6, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 6) + KERNEL2(32 * 6) + KERNEL3(32 * 6) + KERNEL4(32 * 6) + KERNEL5(32 * 6) + KERNEL6(32 * 6) + KERNEL7(32 * 6) + KERNEL8(32 * 6) + cmpl $64 * 7, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 7) + KERNEL2(32 * 7) + KERNEL3(32 * 7) + KERNEL4(32 * 7) + KERNEL5(32 * 7) + KERNEL6(32 * 7) + KERNEL7(32 * 7) + KERNEL8(32 * 7) + + addl $64 * 4 * SIZE, AA + addl $64 * 4 * SIZE, BB + subl $64 * 8, %eax + BRANCH + jg .L1X + +.L11: + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB + +#else + sarl $3, %eax + je .L12 + +.L11: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + + addl $32 * SIZE, %ecx + addl $32 * SIZE, %edx + decl %eax + jne .L11 +#endif + +.L12: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + +.L13: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd 
%xmm2, %xmm4 + movapd 0 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movapd 4 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA # aoffset += 8 + addl $4 * SIZE, BB # boffset1 += 8 + subl $1, %eax + jg .L13 + ALIGN_4 + +.L14: + movsd 0 * SIZE(%esi), %xmm0 + movhps 1 * SIZE(%esi), %xmm0 + movsd 2 * SIZE(%esi), %xmm1 + movhps 3 * SIZE(%esi), %xmm1 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 1 * SIZE(%esi) + movlps %xmm1, 2 * SIZE(%esi) + movhps %xmm1, 3 * SIZE(%esi) + + movsd 4 * SIZE(%esi), %xmm0 + movhps 5 * SIZE(%esi), %xmm0 + movsd 6 * SIZE(%esi), %xmm1 + movhps 7 * SIZE(%esi), %xmm1 + + pshufd $0x44, %xmm6, %xmm2 + unpckhpd %xmm6, %xmm6 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm6 + addpd %xmm6, %xmm1 + + movlps %xmm0, 4 * SIZE(%esi) + movhps %xmm0, 5 * SIZE(%esi) + movlps %xmm1, 6 * SIZE(%esi) + movhps %xmm1, 7 * SIZE(%esi) + + movsd 0 * SIZE(%esi, LDC), %xmm0 + movhps 1 * SIZE(%esi, LDC), %xmm0 + movsd 2 * SIZE(%esi, LDC), %xmm1 + movhps 3 * SIZE(%esi, LDC), %xmm1 + + pshufd $0x44, %xmm5, %xmm2 + unpckhpd %xmm5, %xmm5 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm5 + addpd %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC) + movhps %xmm0, 1 * SIZE(%esi, LDC) + movlps %xmm1, 2 * SIZE(%esi, LDC) + movhps %xmm1, 3 * SIZE(%esi, LDC) + + movsd 4 * SIZE(%esi, LDC), %xmm0 + movhps 5 * SIZE(%esi, LDC), %xmm0 + movsd 6 * SIZE(%esi, LDC), %xmm1 + movhps 7 * SIZE(%esi, LDC), %xmm1 + + pshufd $0x44, %xmm7, %xmm2 + unpckhpd %xmm7, %xmm7 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm7 + addpd %xmm7, %xmm1 + + movlps %xmm0, 4 * SIZE(%esi, LDC) + movhps %xmm0, 5 * SIZE(%esi, LDC) + movlps %xmm1, 6 * SIZE(%esi, LDC) + movhps %xmm1, 7 * SIZE(%esi, LDC) + + addl $8 * SIZE, %esi + decl %ebx # i -- + BRANCH + jg .L10 + ALIGN_2 + +.L30: + movl M, %ebx + testl $2, %ebx + jle .L50 + + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, %ecx + + movapd 0 * SIZE + BUFFER, %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE + BUFFER, %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + + pxor %xmm7, %xmm7 +#else + + leal BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax + addl $2, %eax + movl %eax, KKK +#endif + sarl $3, %eax + je .L32 + +.L31: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movapd 4 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 10 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd 12 * SIZE(BB), %xmm3 + addpd 
%xmm0, %xmm5 + movapd 6 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 14 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm7 + movapd 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd 18 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + movapd 20 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movapd 10 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm2 + mulpd 22 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + movapd 32 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm7 + movapd 12 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 26 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm4 + movapd 28 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movapd 14 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 30 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm6 + movapd 40 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm7 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + BRANCH + decl %eax + jne .L31 + +.L32: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L34 + +.L33: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA # aoffset += 8 + addl $4 * SIZE, BB # boffset1 += 8 + decl %eax + BRANCH + jg .L33 + ALIGN_4 + +.L34: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 1 * SIZE(%esi), %xmm0 + movsd 2 * SIZE(%esi), %xmm1 + movhps 3 * SIZE(%esi), %xmm1 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 1 * SIZE(%esi) + movlps %xmm1, 2 * SIZE(%esi) + movhps %xmm1, 3 * SIZE(%esi) + + movsd 0 * SIZE(%esi, LDC), %xmm0 + movhps 1 * SIZE(%esi, LDC), %xmm0 + movsd 2 * SIZE(%esi, LDC), %xmm1 + movhps 3 * SIZE(%esi, LDC), %xmm1 + + pshufd $0x44, %xmm5, %xmm2 + unpckhpd %xmm5, %xmm5 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm5 + addpd %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC) + movhps %xmm0, 1 * SIZE(%esi, LDC) + movlps %xmm1, 2 * SIZE(%esi, LDC) + movhps %xmm1, 3 * SIZE(%esi, LDC) + + addl $4 * SIZE, %esi # coffset += 4 + ALIGN_2 + +.L50: + movl M, %ebx + testl $1, %ebx + jle .L99 + + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, %ecx + + movapd 0 * SIZE + BUFFER, %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE + BUFFER, %xmm3 + pxor %xmm6, %xmm6 + movsd 4 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#else + + leal BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movsd 4 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L52 + +.L51: + mulsd %xmm0, %xmm2 + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movsd 1 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm2 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + 
movsd 16 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movsd 2 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + mulsd 10 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm4 + movsd 12 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm5 + movsd 3 * SIZE(AA), %xmm0 + mulsd %xmm0, %xmm3 + mulsd 14 * SIZE(BB), %xmm0 + addsd %xmm3, %xmm4 + movsd 24 * SIZE(BB), %xmm3 + addsd %xmm0, %xmm5 + movsd 8 * SIZE(AA), %xmm0 + mulsd %xmm1, %xmm2 + mulsd 18 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm4 + movsd 20 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + movsd 5 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm2 + mulsd 22 * SIZE(BB), %xmm1 + addsd %xmm2, %xmm4 + movsd 32 * SIZE(BB), %xmm2 + addsd %xmm1, %xmm5 + movsd 6 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm3 + mulsd 26 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm4 + movsd 28 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm5 + movsd 7 * SIZE(AA), %xmm1 + mulsd %xmm1, %xmm3 + mulsd 30 * SIZE(BB), %xmm1 + addsd %xmm3, %xmm4 + movsd 40 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm5 + movsd 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $32 * SIZE, BB + BRANCH + decl %eax + jne .L51 + +.L52: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L54 + +.L53: + mulsd %xmm0, %xmm2 + mulsd 2 * SIZE(BB), %xmm0 + addsd %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm5 + movsd 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA # aoffset += 8 + addl $4 * SIZE, BB # boffset1 += 8 + decl %eax + BRANCH + jg .L53 + ALIGN_4 + +.L54: + addsd %xmm6, %xmm4 + addsd %xmm7, %xmm5 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 1 * SIZE(%esi), %xmm0 + movsd 0 * SIZE(%esi, LDC), %xmm1 + movhps 1 * SIZE(%esi, LDC), %xmm1 + + unpcklpd %xmm4, %xmm4 + unpcklpd %xmm5, %xmm5 + + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm3, %xmm5 + addpd %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 1 * SIZE(%esi) + movlps %xmm1, 0 * SIZE(%esi, LDC) + movhps %xmm1, 1 * SIZE(%esi, LDC) + ALIGN_2 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leal (, LDC, 2), %eax + addl %eax, C # c += 2 * ldc + BRANCH + decl J # j -- + jg .L01 + ALIGN_2 + +.L100: + movl N, %eax + testl $1, %eax + jle .L999 + ALIGN_2 + +.L101: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + +/* Copying to Sub Buffer */ + leal BUFFER, %ecx + + movl K, %eax + sarl $3, %eax + jle .L103 + ALIGN_4 + +.L102: + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm2 + movsd 3 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm4 + movsd 5 * SIZE(B), %xmm5 + movsd 6 * SIZE(B), %xmm6 + movsd 7 * SIZE(B), %xmm7 + + unpcklpd %xmm0, %xmm0 + unpcklpd %xmm1, %xmm1 + unpcklpd %xmm2, %xmm2 + unpcklpd %xmm3, %xmm3 + unpcklpd %xmm4, %xmm4 + unpcklpd %xmm5, %xmm5 + unpcklpd %xmm6, %xmm6 + unpcklpd %xmm7, %xmm7 + + movapd %xmm0, 0 * SIZE(%ecx) + movapd %xmm1, 2 * SIZE(%ecx) + movapd %xmm2, 4 * SIZE(%ecx) + movapd %xmm3, 6 * SIZE(%ecx) + movapd %xmm4, 8 * SIZE(%ecx) + movapd %xmm5, 10 * SIZE(%ecx) + movapd %xmm6, 12 * SIZE(%ecx) + movapd %xmm7, 14 * SIZE(%ecx) + + prefetcht0 104 * SIZE(B) + + addl $ 8 * SIZE, B + addl $16 * SIZE, %ecx + decl %eax + BRANCH + jne .L102 + ALIGN_2 + +.L103: + movl K, %eax + andl $7, %eax + BRANCH + jle .L105 + ALIGN_2 + +.L104: + movsd 0 * SIZE(B), %xmm0 + + unpcklpd %xmm0, %xmm0 + + movapd %xmm0, 0 * SIZE(%ecx) + + addl $1 * SIZE, B + addl $2 * SIZE, %ecx + decl %eax + jne .L104 + ALIGN_4 + +.L105: + movl C, %esi # coffset = c + movl A, %edx # aoffset = a + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L130 + ALIGN_4 + 
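
The single-column cleanup path that follows (.L110, .L130, .L150) performs the same C += alpha * A*B update as the unrolled 4x2 code above, one column at a time against the duplicated-B buffer built in the .L102/.L104 copy loops. A minimal C sketch of that computation is given below; it assumes the packed, k-major A layout the kernel expects, and every identifier in it is made up for illustration rather than taken from this patch.

/* Illustrative sketch only -- not part of the patch.  B is packed with
 * every element duplicated (as the unpcklpd copy loops do), the micro-
 * kernel accumulates A*B into a scalar per row (the xmm4..xmm7 role),
 * and alpha is applied only when the result is added back into C,
 * matching the .L14/.L34/.L54 store sequences.                        */
#include <stdio.h>

static void pack_b_dup(int k, const double *b, double *buf)
{
    for (int p = 0; p < k; p++) {
        buf[2 * p + 0] = b[p];   /* duplicated pair, like unpcklpd %xmm0, %xmm0 */
        buf[2 * p + 1] = b[p];
    }
}

/* c[0..mc) += alpha * A * b, with A packed k-major: a[p*mc + i]. */
static void micro_ref(int mc, int k, double alpha,
                      const double *a, const double *bbuf, double *c)
{
    for (int i = 0; i < mc; i++) {
        double acc = 0.0;                      /* accumulator register */
        for (int p = 0; p < k; p++)
            acc += a[p * mc + i] * bbuf[2 * p];
        c[i] += alpha * acc;                   /* alpha only at write-back */
    }
}

int main(void)
{
    enum { MC = 4, K = 5 };
    double a[MC * K], b[K], bbuf[2 * K], c[MC] = { 1, 1, 1, 1 };

    for (int i = 0; i < MC * K; i++) a[i] = (double)(i + 1);
    for (int p = 0; p < K; p++)      b[p] = 0.5 * (p + 1);

    pack_b_dup(K, b, bbuf);
    micro_ref(MC, K, 2.0, a, bbuf, c);

    for (int i = 0; i < MC; i++) printf("%g\n", c[i]);
    return 0;
}

Deferring the alpha multiply to the write-back step is what lets the inner loops above run as pure multiply/accumulate chains on xmm4-xmm7, with C touched only once per tile.
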
+.L110: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movapd 0 * SIZE + BUFFER, %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE + BUFFER, %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#else + + leal BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 2), BB + + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 +#endif + + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L112 + +.L111: + mulpd %xmm2, %xmm0 + mulpd 2 * SIZE(AA), %xmm2 + addpd %xmm0, %xmm4 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm6 + movapd 2 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm0 + mulpd 6 * SIZE(AA), %xmm2 + addpd %xmm0, %xmm5 + movapd 16 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm7 + movapd 4 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm1 + mulpd 10 * SIZE(AA), %xmm2 + addpd %xmm1, %xmm4 + movapd 12 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm6 + movapd 6 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm1 + mulpd 14 * SIZE(AA), %xmm2 + addpd %xmm1, %xmm5 + movapd 24 * SIZE(AA), %xmm1 + addpd %xmm2, %xmm7 + movapd 16 * SIZE(BB), %xmm2 + mulpd %xmm3, %xmm0 + mulpd 18 * SIZE(AA), %xmm3 + addpd %xmm0, %xmm4 + movapd 20 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm6 + movapd 10 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm0 + mulpd 22 * SIZE(AA), %xmm3 + addpd %xmm0, %xmm5 + movapd 32 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm7 + movapd 12 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm1 + mulpd 26 * SIZE(AA), %xmm3 + addpd %xmm1, %xmm4 + movapd 28 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm6 + movapd 14 * SIZE(BB), %xmm3 + mulpd %xmm3, %xmm1 + mulpd 30 * SIZE(AA), %xmm3 + addpd %xmm1, %xmm5 + movapd 40 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm7 + movapd 24 * SIZE(BB), %xmm3 + + addl $32 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L111 + +.L112: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L114 + +.L113: + mulpd %xmm2, %xmm0 + mulpd 2 * SIZE(AA), %xmm2 + addpd %xmm0, %xmm4 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm6 + movapd 2 * SIZE(BB), %xmm2 + + addl $4 * SIZE, AA # aoffset += 8 + addl $2 * SIZE, BB # boffset1 += 8 + subl $1, %eax + jg .L113 + ALIGN_4 + +.L114: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 1 * SIZE(%esi), %xmm0 + movsd 2 * SIZE(%esi), %xmm1 + movhps 3 * SIZE(%esi), %xmm1 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 1 * SIZE(%esi) + movlps %xmm1, 2 * SIZE(%esi) + movhps %xmm1, 3 * SIZE(%esi) + + movsd 4 * SIZE(%esi), %xmm0 + movhps 5 * SIZE(%esi), %xmm0 + movsd 6 * SIZE(%esi), %xmm1 + movhps 7 * SIZE(%esi), %xmm1 + + pshufd $0x44, %xmm6, %xmm2 + unpckhpd %xmm6, %xmm6 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm6 + addpd %xmm6, %xmm1 + + movlps %xmm0, 4 * SIZE(%esi) + movhps %xmm0, 5 * SIZE(%esi) + movlps %xmm1, 6 * SIZE(%esi) + movhps %xmm1, 7 
* SIZE(%esi) + + addl $8 * SIZE, %esi # coffset += 4 + BRANCH + decl %ebx # i -- + jg .L110 + ALIGN_2 + +.L130: + movl M, %ebx + testl $2, %ebx + jle .L150 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movapd 0 * SIZE + BUFFER, %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE + BUFFER, %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#else + + leal BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB + + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 +#endif + + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L132 + +.L131: + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm2, %xmm4 + mulpd 2 * SIZE(BB), %xmm0 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 4 * SIZE(AA), %xmm0 + mulpd 4 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm6 + movapd 6 * SIZE(AA), %xmm0 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm7 + movapd 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm3 + movapd 10 * SIZE(AA), %xmm1 + addpd %xmm3, %xmm4 + mulpd 10 * SIZE(BB), %xmm1 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movapd 12 * SIZE(AA), %xmm1 + mulpd 12 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm6 + movapd 14 * SIZE(AA), %xmm1 + mulpd 14 * SIZE(BB), %xmm1 + addpd %xmm1, %xmm7 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $16 * SIZE, BB + BRANCH + decl %eax + jne .L131 + +.L132: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L134 + +.L133: + movapd 0 * SIZE(AA), %xmm0 + mulpd 0 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm4 + + addl $2 * SIZE, AA # aoffset += 8 + addl $2 * SIZE, BB # boffset1 += 8 + decl %eax + BRANCH + jg .L133 + ALIGN_4 + +.L134: + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + addpd %xmm6, %xmm4 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 1 * SIZE(%esi), %xmm0 + movsd 2 * SIZE(%esi), %xmm1 + movhps 3 * SIZE(%esi), %xmm1 + + pshufd $0x44, %xmm4, %xmm2 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 1 * SIZE(%esi) + movlps %xmm1, 2 * SIZE(%esi) + movhps %xmm1, 3 * SIZE(%esi) + + addl $4 * SIZE, %esi + ALIGN_2 + +.L150: + movl M, %ebx + testl $1, %ebx + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movapd 0 * SIZE + BUFFER, %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE + BUFFER, %xmm3 + pxor %xmm6, %xmm6 + movapd 4 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#else + + leal BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB + + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movapd 4 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 
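+	/* TRMM case: AA was advanced by KK elements and BB by 2*KK (the buffer stores every B value twice), so the k-loop below starts at the KK-th position of the panel. */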
+#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax + addl $1, %eax + movl %eax, KKK +#endif + sarl $3, %eax + je .L152 + +.L151: + mulsd %xmm0, %xmm2 + movsd 1 * SIZE(AA), %xmm0 + addsd %xmm2, %xmm4 + mulsd 2 * SIZE(BB), %xmm0 + movsd 16 * SIZE(BB), %xmm2 + addsd %xmm0, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + mulsd 4 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm4 + movsd 3 * SIZE(AA), %xmm0 + mulsd 6 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm4 + movsd 8 * SIZE(AA), %xmm0 + mulsd %xmm1, %xmm3 + movsd 5 * SIZE(AA), %xmm1 + addsd %xmm3, %xmm4 + mulsd 10 * SIZE(BB), %xmm1 + movsd 24 * SIZE(BB), %xmm3 + addsd %xmm1, %xmm4 + movsd 6 * SIZE(AA), %xmm1 + mulsd 12 * SIZE(BB), %xmm1 + addsd %xmm1, %xmm4 + movsd 7 * SIZE(AA), %xmm1 + mulsd 14 * SIZE(BB), %xmm1 + addsd %xmm1, %xmm4 + movsd 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $16 * SIZE, BB + BRANCH + decl %eax + jne .L151 + +.L152: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L154 + +.L153: + movsd 0 * SIZE(AA), %xmm0 + mulsd 0 * SIZE(BB), %xmm0 + addsd %xmm0, %xmm4 + + addl $1 * SIZE, AA # aoffset += 8 + addl $2 * SIZE, BB # boffset1 += 8 + decl %eax + BRANCH + jg .L153 + ALIGN_4 + +.L154: + movsd 0 * SIZE(%esi), %xmm0 + movhps 1 * SIZE(%esi), %xmm0 + + unpcklpd %xmm4, %xmm4 + + mulpd %xmm3, %xmm4 + addpd %xmm4, %xmm0 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 1 * SIZE(%esi) + ALIGN_2 + +.L999: + movl OLD_STACK, %esp + + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_2 + + + EPILOGUE diff --git a/kernel/x86/zgemm3m_kernel_4x4_barcelona.S b/kernel/x86/zgemm3m_kernel_4x4_barcelona.S new file mode 100644 index 0000000000..29158df25b --- /dev/null +++ b/kernel/x86/zgemm3m_kernel_4x4_barcelona.S @@ -0,0 +1,2153 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 + +#define OLD_M 4 + STACK(%esi) +#define OLD_N 8 + STACK(%esi) +#define OLD_K 12 + STACK(%esi) +#define OLD_ALPHA_R 16 + STACK(%esi) +#define OLD_ALPHA_I 20 + STACK(%esi) +#define OLD_A 24 + STACK(%esi) +#define OLD_B 28 + STACK(%esi) +#define OLD_C 32 + STACK(%esi) +#define OLD_LDC 36 + STACK(%esi) + +#define ALPHA 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define BUFFER 128(%esp) + +#define PREFETCH prefetch +#define PREFETCHSIZE (16 * 17 + 0) + +#define RPREFETCHSIZE (16 * 9 + 0) +#define WPREFETCHSIZE (16 * 9 + 0) + +#define AA %edx +#define BB %ecx +#define LDC %ebp + +#if defined(OPTERON) || defined(BARCELONA) +#define movsd movlps +#endif + +#define KERNEL1(address) \ + mulps %xmm0, %xmm2; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + addps %xmm2, %xmm4; \ + movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL4(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL5(address) \ + PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 1 * SIZE(AA); \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 76 * SIZE + 
(address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL6(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL7(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1; + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl %esp, %esi # save old stack + subl $128 + LOCAL_BUFFER_SIZE, %esp + movl OLD_M, %ebx + andl $-1024, %esp # align stack + + STACK_TOUCHING + + movl OLD_N, %eax + movl OLD_K, %ecx + movl OLD_A, %edx + movss OLD_ALPHA_R, %xmm0 + movss OLD_ALPHA_I, %xmm1 + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK + + movl OLD_B, %edi + movl OLD_C, %ebx + + unpcklps %xmm1, %xmm0 + movlhps %xmm0, %xmm0 + + movaps %xmm0, ALPHA + + movl %ebx, C + movl OLD_LDC, LDC +#ifdef TRMMKERNEL + movss %xmm4, OFFSET + movss %xmm4, KK +#ifndef LEFT + negl KK +#endif +#endif + sall $ZBASE_SHIFT, LDC + + sarl $2, %eax + movl %eax, J + jle .L40 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + +/* Copying to Sub Buffer */ + leal BUFFER, %ecx + + movl K, %eax + sarl $1, %eax + jle .L05 + ALIGN_4 + +.L02: + prefetch (RPREFETCHSIZE + 0) * SIZE(%edi) + + movaps 0 * SIZE(%edi), %xmm3 + movaps 4 * SIZE(%edi), %xmm7 + + prefetchw (WPREFETCHSIZE + 0) * SIZE(%ecx) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + prefetchw (WPREFETCHSIZE + 16) * SIZE(%ecx) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + movaps %xmm2, 8 * SIZE(%ecx) + movaps %xmm3, 12 * SIZE(%ecx) + movaps %xmm4, 16 * SIZE(%ecx) + movaps %xmm5, 20 * SIZE(%ecx) + movaps %xmm6, 24 * SIZE(%ecx) + movaps %xmm7, 28 * SIZE(%ecx) + + addl $ 8 * SIZE, %edi + subl $-32 * SIZE, %ecx + decl %eax + jne .L02 + ALIGN_2 + +.L05: + movl K, %eax + andl $1, %eax + BRANCH + jle .L10 + + movaps 0 * SIZE(%edi), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, 
%xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + movaps %xmm2, 8 * SIZE(%ecx) + movaps %xmm3, 12 * SIZE(%ecx) + + addl $4 * SIZE, %edi + ALIGN_4 + +.L10: + movl C, %esi # coffset = c + movl A, %edx # aoffset = a + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 8), BB +#endif + + movaps 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps 16 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movaps 16 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + + leal (%esi, LDC, 2), %eax + + prefetchw 3 * SIZE(%esi) + prefetchw 3 * SIZE(%esi, LDC) + prefetchw 3 * SIZE(%eax) + prefetchw 3 * SIZE(%eax, LDC) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + + andl $-8, %eax + sall $4, %eax + je .L15 +.L1X: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + cmpl $128 * 1, %eax + jle .L12 + KERNEL1(32 * 1) + KERNEL2(32 * 1) + KERNEL3(32 * 1) + KERNEL4(32 * 1) + KERNEL5(32 * 1) + KERNEL6(32 * 1) + KERNEL7(32 * 1) + KERNEL8(32 * 1) + cmpl $128 * 2, %eax + jle .L12 + KERNEL1(32 * 2) + KERNEL2(32 * 2) + KERNEL3(32 * 2) + KERNEL4(32 * 2) + KERNEL5(32 * 2) + KERNEL6(32 * 2) + KERNEL7(32 * 2) + KERNEL8(32 * 2) + cmpl $128 * 3, %eax + jle .L12 + KERNEL1(32 * 3) + KERNEL2(32 * 3) + KERNEL3(32 * 3) + KERNEL4(32 * 3) + KERNEL5(32 * 3) + KERNEL6(32 * 3) + KERNEL7(32 * 3) + KERNEL8(32 * 3) + cmpl $128 * 4, %eax + jle .L12 + KERNEL1(32 * 4) + KERNEL2(32 * 4) + KERNEL3(32 * 4) + KERNEL4(32 * 4) + KERNEL5(32 * 4) + KERNEL6(32 * 4) + KERNEL7(32 * 4) + KERNEL8(32 * 4) + cmpl $128 * 5, %eax + jle .L12 + KERNEL1(32 * 5) + KERNEL2(32 * 5) + KERNEL3(32 * 5) + KERNEL4(32 * 5) + KERNEL5(32 * 5) + KERNEL6(32 * 5) + KERNEL7(32 * 5) + KERNEL8(32 * 5) + cmpl $128 * 6, %eax + jle .L12 + KERNEL1(32 * 6) + KERNEL2(32 * 6) + KERNEL3(32 * 6) + KERNEL4(32 * 6) + KERNEL5(32 * 6) + KERNEL6(32 * 6) + KERNEL7(32 * 6) + KERNEL8(32 * 6) + cmpl $128 * 7, %eax + jle .L12 + KERNEL1(32 * 7) + KERNEL2(32 * 7) + KERNEL3(32 * 7) + KERNEL4(32 * 7) + KERNEL5(32 * 7) + KERNEL6(32 * 7) + KERNEL7(32 * 7) + KERNEL8(32 * 7) + + addl $128 * 8 * SIZE, BB + addl $128 * 2 * SIZE, AA + subl $128 * 8, %eax + jg .L1X + jmp .L15 + +.L12: + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_4 + +.L16: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 4 * SIZE(AA), %xmm0 + + addl $ 4 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + leal (LDC, LDC, 2), %eax + + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + movsd 4 * SIZE(%esi), %xmm1 
+ movhps 6 * SIZE(%esi), %xmm1 + + pshufd $0x50, %xmm4, %xmm2 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + movlps %xmm1, 4 * SIZE(%esi) + movhps %xmm1, 6 * SIZE(%esi) + + movsd 0 * SIZE(%esi, LDC), %xmm0 + movhps 2 * SIZE(%esi, LDC), %xmm0 + movsd 4 * SIZE(%esi, LDC), %xmm1 + movhps 6 * SIZE(%esi, LDC), %xmm1 + + pshufd $0x50, %xmm5, %xmm2 + pshufd $0xfa, %xmm5, %xmm5 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm5 + + addps %xmm2, %xmm0 + addps %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC) + movhps %xmm0, 2 * SIZE(%esi, LDC) + movlps %xmm1, 4 * SIZE(%esi, LDC) + movhps %xmm1, 6 * SIZE(%esi, LDC) + + movsd 0 * SIZE(%esi, LDC, 2), %xmm0 + movhps 2 * SIZE(%esi, LDC, 2), %xmm0 + movsd 4 * SIZE(%esi, LDC, 2), %xmm1 + movhps 6 * SIZE(%esi, LDC, 2), %xmm1 + + pshufd $0x50, %xmm6, %xmm2 + pshufd $0xfa, %xmm6, %xmm6 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm6 + + addps %xmm2, %xmm0 + addps %xmm6, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC, 2) + movhps %xmm0, 2 * SIZE(%esi, LDC, 2) + movlps %xmm1, 4 * SIZE(%esi, LDC, 2) + movhps %xmm1, 6 * SIZE(%esi, LDC, 2) + + movsd 0 * SIZE(%esi, %eax), %xmm0 + movhps 2 * SIZE(%esi, %eax), %xmm0 + movsd 4 * SIZE(%esi, %eax), %xmm1 + movhps 6 * SIZE(%esi, %eax), %xmm1 + + pshufd $0x50, %xmm7, %xmm2 + pshufd $0xfa, %xmm7, %xmm7 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm7 + + addps %xmm2, %xmm0 + addps %xmm7, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, %eax) + movhps %xmm0, 2 * SIZE(%esi, %eax) + movlps %xmm1, 4 * SIZE(%esi, %eax) + movhps %xmm1, 6 * SIZE(%esi, %eax) + + addl $8 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L11 + ALIGN_4 + +.L20: + testl $2, M + je .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB +#endif + + movsd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movsd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movsd 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movsd 16 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movsd 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movsd 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movsd 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movsd 32 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movsd 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm5 + movsd 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movsd 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movsd 48 * SIZE(BB), %xmm3 + + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movsd 36 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movsd 40 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movsd 44 * SIZE(BB), 
%xmm2 + mulps %xmm0, %xmm2 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movsd 64 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movsd 52 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm5 + movsd 56 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movsd 60 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movsd 80 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movsd 68 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm5 + movsd 72 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movsd 76 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movsd 96 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movsd 84 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movsd 88 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movsd 92 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movsd 112 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movsd 100 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm5 + movsd 104 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movsd 108 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movsd 128 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movsd 116 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movsd 120 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movsd 124 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movsd 144 * SIZE(BB), %xmm3 + + addl $ 16 * SIZE, AA + addl $128 * SIZE, BB + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movsd 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movsd 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movsd 16 * SIZE(BB), %xmm2 + + addl $ 2 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: + leal (LDC, LDC, 2), %eax + + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + movsd 0 * SIZE(%esi, LDC), %xmm1 + movhps 2 * SIZE(%esi, LDC), %xmm1 + + pshufd $0x50, %xmm4, %xmm4 + pshufd $0x50, %xmm5, %xmm5 + + mulps %xmm3, %xmm4 + mulps %xmm3, %xmm5 + + addps %xmm4, %xmm0 + addps %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + movlps %xmm1, 0 * SIZE(%esi, LDC) + movhps %xmm1, 2 * SIZE(%esi, LDC) + + movsd 0 * SIZE(%esi, LDC, 2), %xmm0 + movhps 2 * SIZE(%esi, LDC, 2), %xmm0 + movsd 0 * SIZE(%esi, %eax), %xmm1 + movhps 2 * SIZE(%esi, %eax), %xmm1 + + pshufd $0x50, %xmm6, %xmm6 + pshufd $0x50, %xmm7, %xmm7 + + mulps %xmm3, %xmm6 + mulps %xmm3, %xmm7 + + addps %xmm6, %xmm0 + addps %xmm7, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC, 2) + movhps %xmm0, 2 * SIZE(%esi, LDC, 2) + movlps %xmm1, 0 * SIZE(%esi, %eax) + movhps %xmm1, 2 * SIZE(%esi, %eax) + + addl $4 * SIZE, %esi # coffset += 2 + ALIGN_4 + +.L30: + testl $1, M + je .L39 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = 
boffset + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB + leal (BB, %eax, 8), BB +#endif + + movss 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movss 4 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movss 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movss 16 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L35 + ALIGN_4 + +.L32: + mulss %xmm0, %xmm2 + addss %xmm2, %xmm4 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movss 4 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + addss %xmm2, %xmm5 + movss 8 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 32 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 1 * SIZE(AA), %xmm0 + + mulss %xmm0, %xmm3 + addss %xmm3, %xmm4 + movss 20 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + addss %xmm3, %xmm5 + movss 24 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + mulss 28 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 48 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 2 * SIZE(AA), %xmm0 + + mulss %xmm0, %xmm2 + addss %xmm2, %xmm4 + movss 36 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + addss %xmm2, %xmm5 + movss 40 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + mulss 44 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 64 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 3 * SIZE(AA), %xmm0 + + mulss %xmm0, %xmm3 + addss %xmm3, %xmm4 + movss 52 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + addss %xmm3, %xmm5 + movss 56 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + mulss 60 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 80 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + + mulss %xmm1, %xmm2 + addss %xmm2, %xmm4 + movss 68 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + addss %xmm2, %xmm5 + movss 72 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + mulss 76 * SIZE(BB), %xmm1 + addss %xmm2, %xmm6 + movss 96 * SIZE(BB), %xmm2 + addss %xmm1, %xmm7 + movss 5 * SIZE(AA), %xmm1 + + mulss %xmm1, %xmm3 + addss %xmm3, %xmm4 + movss 84 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + addss %xmm3, %xmm5 + movss 88 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + mulss 92 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 112 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 6 * SIZE(AA), %xmm1 + + mulss %xmm1, %xmm2 + addss %xmm2, %xmm4 + movss 100 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + addss %xmm2, %xmm5 + movss 104 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + mulss 108 * SIZE(BB), %xmm1 + addss %xmm2, %xmm6 + movss 128 * SIZE(BB), %xmm2 + addss %xmm1, %xmm7 + movss 7 * SIZE(AA), %xmm1 + + mulss %xmm1, %xmm3 + addss %xmm3, %xmm4 + movss 116 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + addss %xmm3, %xmm5 + movss 120 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + mulss 124 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 144 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $128 * SIZE, BB + decl %eax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulss %xmm0, %xmm2 + addss %xmm2, %xmm4 + movss 4 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + addss %xmm2, %xmm5 + movss 8 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + 
movss 16 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 1 * SIZE(AA), %xmm0 + + addl $ 1 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L36 + ALIGN_4 + +.L38: + leal (%esi, LDC, 2), %eax + + movsd (%esi), %xmm0 + movhps (%esi, LDC), %xmm0 + movsd (%eax), %xmm1 + movhps (%eax, LDC), %xmm1 + + shufps $0, %xmm5, %xmm4 + mulps %xmm3, %xmm4 + addps %xmm4, %xmm0 + shufps $0, %xmm7, %xmm6 + mulps %xmm3, %xmm6 + addps %xmm6, %xmm1 + + movlps %xmm0, (%esi) + movhps %xmm0, (%esi, LDC) + movlps %xmm1, (%eax) + movhps %xmm1, (%eax, LDC) + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + + leal (, LDC, 4), %eax + addl %eax, C # c += 4 * ldc + decl J # j -- + jg .L01 + ALIGN_4 + +.L40: + testl $2, N + je .L80 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + leal BUFFER, %ecx + sarl $2, %eax + jle .L45 + ALIGN_4 + +.L42: + prefetch (RPREFETCHSIZE + 0) * SIZE(%edi) + + movaps 0 * SIZE(%edi), %xmm3 + movaps 4 * SIZE(%edi), %xmm7 + + prefetchw (WPREFETCHSIZE + 0) * SIZE(%ecx) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + prefetchw (WPREFETCHSIZE + 16) * SIZE(%ecx) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + movaps %xmm2, 8 * SIZE(%ecx) + movaps %xmm3, 12 * SIZE(%ecx) + movaps %xmm4, 16 * SIZE(%ecx) + movaps %xmm5, 20 * SIZE(%ecx) + movaps %xmm6, 24 * SIZE(%ecx) + movaps %xmm7, 28 * SIZE(%ecx) + + addl $ 8 * SIZE, %edi + subl $-32 * SIZE, %ecx + decl %eax + jne .L42 + ALIGN_4 + +.L45: + movl K, %eax + andl $3, %eax + BRANCH + jle .L50 + ALIGN_4 + +.L46: + movsd 0 * SIZE(%edi), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + + addl $2 * SIZE, %edi + addl $8 * SIZE, %ecx + decl %eax + jne .L46 + ALIGN_4 + +.L50: + movl C, %esi # coffset = c + movl A, %edx # aoffset = a + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movaps 0 * SIZE(AA), %xmm0 + movaps 16 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + + prefetchw 3 * SIZE(%esi) + prefetchw 3 * SIZE(%esi, LDC) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + mulps %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 8 * SIZE(AA), %xmm0 + + mulps %xmm0, %xmm3 + mulps 20 * SIZE(BB), %xmm0 + addps %xmm3, %xmm4 + 
movaps 24 * SIZE(BB), %xmm3 + addps %xmm0, %xmm5 + movaps 12 * SIZE(AA), %xmm0 + + mulps %xmm0, %xmm3 + mulps 28 * SIZE(BB), %xmm0 + addps %xmm3, %xmm4 + movaps 48 * SIZE(BB), %xmm3 + addps %xmm0, %xmm5 + movaps 32 * SIZE(AA), %xmm0 + +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) +#endif + mulps %xmm1, %xmm2 + mulps 36 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 40 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 20 * SIZE(AA), %xmm1 + + mulps %xmm1, %xmm2 + mulps 44 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 24 * SIZE(AA), %xmm1 + + mulps %xmm1, %xmm3 + mulps 52 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 56 * SIZE(BB), %xmm3 + addps %xmm1, %xmm5 + movaps 28 * SIZE(AA), %xmm1 + + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 80 * SIZE(BB), %xmm3 + addps %xmm1, %xmm5 + movaps 48 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L52 + ALIGN_4 + +.L55: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + movsd 4 * SIZE(%esi), %xmm1 + movhps 6 * SIZE(%esi), %xmm1 + + pshufd $0x50, %xmm4, %xmm2 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + movlps %xmm1, 4 * SIZE(%esi) + movhps %xmm1, 6 * SIZE(%esi) + + movsd 0 * SIZE(%esi, LDC), %xmm0 + movhps 2 * SIZE(%esi, LDC), %xmm0 + movsd 4 * SIZE(%esi, LDC), %xmm1 + movhps 6 * SIZE(%esi, LDC), %xmm1 + + pshufd $0x50, %xmm5, %xmm2 + pshufd $0xfa, %xmm5, %xmm5 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm5 + + addps %xmm2, %xmm0 + addps %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC) + movhps %xmm0, 2 * SIZE(%esi, LDC) + movlps %xmm1, 4 * SIZE(%esi, LDC) + movhps %xmm1, 6 * SIZE(%esi, LDC) + + addl $8 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L51 + ALIGN_4 + +.L60: + testl $2, M + je .L70 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movsd 0 * SIZE(AA), %xmm0 + movsd 8 * SIZE(AA), %xmm1 + movsd 0 * SIZE(BB), %xmm2 + movsd 16 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L65 + ALIGN_4 + +.L62: +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 8 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm2 + 
addps %xmm2, %xmm6 + movsd 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movsd 32 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movsd 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movsd 24 * SIZE(BB), %xmm3 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movsd 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movsd 48 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movsd 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movsd 40 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movsd 44 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movsd 64 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movsd 52 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsd 56 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movsd 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movsd 80 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L62 + ALIGN_4 + +.L65: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 8 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L66 + ALIGN_4 + +.L68: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + movsd 0 * SIZE(%esi, LDC), %xmm1 + movhps 2 * SIZE(%esi, LDC), %xmm1 + + pshufd $0x50, %xmm4, %xmm4 + pshufd $0x50, %xmm5, %xmm5 + + mulps %xmm3, %xmm4 + mulps %xmm3, %xmm5 + + addps %xmm4, %xmm0 + addps %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + movlps %xmm1, 0 * SIZE(%esi, LDC) + movhps %xmm1, 2 * SIZE(%esi, LDC) + + addl $4 * SIZE, %esi # coffset += 2 + ALIGN_4 + +.L70: + testl $1, M + je .L79 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movss 0 * SIZE(AA), %xmm0 + movss 4 * SIZE(AA), %xmm1 + movss 0 * SIZE(BB), %xmm2 + movss 16 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + mulss %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + mulss 4 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 8 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 1 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm2 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 32 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 2 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 20 * SIZE(BB), 
%xmm0 + addss %xmm3, %xmm4 + movss 24 * SIZE(BB), %xmm3 + addss %xmm0, %xmm5 + movss 3 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 28 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 48 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm2 + mulss 36 * SIZE(BB), %xmm1 + addss %xmm2, %xmm4 + movss 40 * SIZE(BB), %xmm2 + addss %xmm1, %xmm5 + movss 5 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm2 + mulss 44 * SIZE(BB), %xmm1 + addss %xmm2, %xmm6 + movss 64 * SIZE(BB), %xmm2 + addss %xmm1, %xmm7 + movss 6 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 52 * SIZE(BB), %xmm1 + addss %xmm3, %xmm4 + movss 56 * SIZE(BB), %xmm3 + addss %xmm1, %xmm5 + movss 7 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 60 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 80 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L72 + ALIGN_4 + +.L75: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulss %xmm0, %xmm2 + mulss 4 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 8 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 1 * SIZE(AA), %xmm0 + + addl $ 1 * SIZE, AA + addl $ 8 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: + addss %xmm6, %xmm4 + addss %xmm7, %xmm5 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + movsd 4 * SIZE(%esi), %xmm1 + movhps 6 * SIZE(%esi), %xmm1 + + pshufd $0x50, %xmm4, %xmm2 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + movlps %xmm1, 4 * SIZE(%esi) + movhps %xmm1, 6 * SIZE(%esi) + + movsd 0 * SIZE(%esi, LDC), %xmm0 + movhps 2 * SIZE(%esi, LDC), %xmm0 + movsd 4 * SIZE(%esi, LDC), %xmm1 + movhps 6 * SIZE(%esi, LDC), %xmm1 + + pshufd $0x50, %xmm5, %xmm2 + pshufd $0xfa, %xmm5, %xmm5 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm5 + + addps %xmm2, %xmm0 + addps %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC) + movhps %xmm0, 2 * SIZE(%esi, LDC) + movlps %xmm1, 4 * SIZE(%esi, LDC) + movhps %xmm1, 6 * SIZE(%esi, LDC) + ALIGN_4 + +.L79: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + leal (, LDC, 2), %eax + addl %eax, C + ALIGN_4 + +.L80: + testl $1, N + je .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + leal BUFFER, %ecx + + movl K, %eax + sarl $3, %eax + jle .L85 + ALIGN_4 + +.L82: + prefetch (RPREFETCHSIZE + 0) * SIZE(%edi) + + movups 0 * SIZE(%edi), %xmm3 + movups 4 * SIZE(%edi), %xmm7 + + prefetchw (WPREFETCHSIZE + 0) * SIZE(%ecx) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + prefetchw (WPREFETCHSIZE + 16) * SIZE(%ecx) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + movaps %xmm2, 8 * SIZE(%ecx) + movaps %xmm3, 12 * SIZE(%ecx) + movaps %xmm4, 16 * SIZE(%ecx) + movaps %xmm5, 20 * SIZE(%ecx) + movaps %xmm6, 24 * SIZE(%ecx) + movaps %xmm7, 28 * SIZE(%ecx) + + addl $ 8 * SIZE, %edi + subl $-32 * SIZE, %ecx + decl %eax + jne .L82 + ALIGN_4 + +.L85: + movl K, %eax + andl $7, %eax + BRANCH + jle .L90 + ALIGN_4 + +.L86: + movss 0 * SIZE(%edi), %xmm3 + pshufd $0x00, %xmm3, %xmm0 + movaps %xmm0, 0 * SIZE(%ecx) + + addl $1 * SIZE, %edi + addl $4 * SIZE, %ecx + decl %eax 
+ jne .L86 + ALIGN_4 + +.L90: + movl C, %esi # coffset = c + movl A, %edx # aoffset = a + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L100 + ALIGN_4 + +.L91: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movaps 0 * SIZE(AA), %xmm0 + movaps 16 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + + prefetchw 3 * SIZE(%esi) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L95 + ALIGN_4 + +.L92: + mulps %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movaps 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movaps 32 * SIZE(BB), %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm0, %xmm5 + movaps 8 * SIZE(AA), %xmm0 + mulps 8 * SIZE(BB), %xmm0 + addps %xmm0, %xmm6 + movaps 12 * SIZE(AA), %xmm0 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) +#endif + mulps %xmm1, %xmm3 + movaps 20 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movaps 48 * SIZE(BB), %xmm3 + mulps 20 * SIZE(BB), %xmm1 + addps %xmm1, %xmm5 + movaps 24 * SIZE(AA), %xmm1 + mulps 24 * SIZE(BB), %xmm1 + addps %xmm1, %xmm6 + movaps 28 * SIZE(AA), %xmm1 + mulps 28 * SIZE(BB), %xmm1 + addps %xmm1, %xmm7 + movaps 48 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L92 + ALIGN_4 + +.L95: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L98 + ALIGN_4 + +.L96: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(AA), %xmm0 + movaps 4 * SIZE(BB), %xmm2 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L96 + ALIGN_4 + +.L98: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + addps %xmm6, %xmm4 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + movsd 4 * SIZE(%esi), %xmm1 + movhps 6 * SIZE(%esi), %xmm1 + + pshufd $0x50, %xmm4, %xmm2 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + movlps %xmm1, 4 * SIZE(%esi) + movhps %xmm1, 6 * SIZE(%esi) + + addl $8 * SIZE, %esi # coffset += 2 + + decl %ebx # i -- + jg .L91 + ALIGN_4 + +.L100: + testl $2, M + je .L110 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movsd 0 * SIZE(AA), %xmm0 + movsd 8 * SIZE(AA), %xmm1 + movsd 0 * SIZE(BB), %xmm2 + movsd 16 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + movl K, %eax +#elif 
(defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L105 + ALIGN_4 + +.L102: + mulps %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + movsd 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movsd 32 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movsd 20 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsd 24 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm6 + movsd 28 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movsd 48 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L102 + ALIGN_4 + +.L105: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L108 + ALIGN_4 + +.L106: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + movsd 4 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L106 + ALIGN_4 + +.L108: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + addps %xmm6, %xmm4 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + + pshufd $0x50, %xmm4, %xmm2 + mulps %xmm3, %xmm2 + addps %xmm2, %xmm0 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + + addl $4 * SIZE, %esi # coffset += 2 + ALIGN_4 + +.L110: + testl $1, M + je .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movss 0 * SIZE(AA), %xmm0 + movss 4 * SIZE(AA), %xmm1 + movss 0 * SIZE(BB), %xmm2 + movss 16 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L115 + ALIGN_4 + +.L112: + mulss %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movss 1 * SIZE(AA), %xmm0 + addss %xmm2, %xmm4 + movss 32 * SIZE(BB), %xmm2 + mulss 4 * SIZE(BB), %xmm0 + addss %xmm0, %xmm5 + movss 2 * SIZE(AA), %xmm0 + mulss 8 * SIZE(BB), %xmm0 + addss %xmm0, %xmm6 + movss 3 * SIZE(AA), %xmm0 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm3 + movss 5 * SIZE(AA), %xmm1 + addss %xmm3, %xmm4 + movss 48 * SIZE(BB), %xmm3 + mulss 20 * SIZE(BB), %xmm1 + addss %xmm1, %xmm5 + movss 6 * SIZE(AA), %xmm1 + mulss 24 * SIZE(BB), %xmm1 + addss %xmm1, %xmm6 + movss 7 * SIZE(AA), %xmm1 + mulss 28 * SIZE(BB), %xmm1 + 
addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L112 + ALIGN_4 + +.L115: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulss %xmm0, %xmm2 + movss 1 * SIZE(AA), %xmm0 + addss %xmm2, %xmm4 + movss 4 * SIZE(BB), %xmm2 + + addl $ 1 * SIZE, AA + addl $ 4 * SIZE, BB + decl %eax + jg .L116 + ALIGN_4 + +.L118: + addss %xmm5, %xmm4 + addss %xmm7, %xmm6 + addss %xmm6, %xmm4 + + movsd 0 * SIZE(%esi), %xmm0 + + pshufd $0x50, %xmm4, %xmm2 + mulps %xmm3, %xmm2 + addps %xmm2, %xmm0 + + movlps %xmm0, 0 * SIZE(%esi) + ALIGN_4 + +.L999: + movl OLD_STACK, %esp + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm3m_kernel_4x4_opteron.S b/kernel/x86/zgemm3m_kernel_4x4_opteron.S new file mode 100644 index 0000000000..511fc8b057 --- /dev/null +++ b/kernel/x86/zgemm3m_kernel_4x4_opteron.S @@ -0,0 +1,2532 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 + +#define OLD_M 4 + STACK(%esi) +#define OLD_N 8 + STACK(%esi) +#define OLD_K 12 + STACK(%esi) +#define OLD_ALPHA_R 16 + STACK(%esi) +#define OLD_ALPHA_I 20 + STACK(%esi) +#define OLD_A 24 + STACK(%esi) +#define OLD_B 28 + STACK(%esi) +#define OLD_C 32 + STACK(%esi) +#define OLD_LDC 36 + STACK(%esi) + +#define ALPHA 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define BX 40(%esp) +#define OLD_STACK 44(%esp) +#define OFFSET 48(%esp) +#define KK 52(%esp) +#define KKK 56(%esp) +#define BUFFER 128(%esp) + + +#ifdef ATHLON +#define PREFETCH prefetch +#define PREFETCHSIZE 64 +#endif + +#if defined(OPTERON) || defined(BARCELONA) +#define PREFETCH prefetch +#define PREFETCHSIZE (16 * 10 + 8) +#endif + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCHSIZE 96 +#endif + +#define AA %edx +#define BB %ecx +#define LDC %ebp + +#if defined(OPTERON) || defined(BARCELONA) +#define movsd movlps +#endif + +#if defined(OPTERON) || defined(BARCELONA) +#define KERNEL1(address) \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm4; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL4(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL5(address) \ + PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 1 * SIZE(AA); \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + 
movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL6(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL7(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1; +#endif + +#ifdef PENTIUM4 +#define KERNEL1(address) \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + addps %xmm2, %xmm5; \ + movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL4(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + 
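+/* Each of the eight KERNELn(address) macros performs one k step of the
+   software-pipelined inner loop: %xmm0/%xmm1 carry four packed A values
+   (one per row of the 4x4 tile) and BB supplies every B value already
+   broadcast to all four lanes, so %xmm4-%xmm7 accumulate the four
+   columns of a real 4x4 block.  The OPTERON/BARCELONA and PENTIUM4
+   variants differ only in how the prefetches are scheduled. */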
+#define KERNEL5(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL6(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL7(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1 +#endif + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl %esp, %esi # save old stack + subl $128 + LOCAL_BUFFER_SIZE, %esp + movl OLD_M, %ebx + andl $-1024, %esp # align stack + + STACK_TOUCHING + + movl OLD_N, %eax + movl OLD_K, %ecx + movl OLD_A, %edx + movss OLD_ALPHA_R, %xmm0 + movss OLD_ALPHA_I, %xmm1 + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK + + movl OLD_B, %edi + movl OLD_C, %ebx + + unpcklps %xmm1, %xmm0 + movlhps %xmm0, %xmm0 + + movaps %xmm0, ALPHA + + movl %ebx, C + movl OLD_LDC, LDC + + sall $ZBASE_SHIFT, LDC + + sarl $2, %eax + movl %eax, J + jle .L40 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + +/* Copying to Sub Buffer */ + leal BUFFER, %ecx + + movl K, %eax + sarl $1, %eax + jle .L05 + ALIGN_4 + +.L02: +#ifdef HAVE_SSE2 + movss 0 * SIZE(%edi), %xmm0 + movss 1 * SIZE(%edi), %xmm1 + movss 2 * SIZE(%edi), %xmm2 + movss 3 * SIZE(%edi), %xmm3 + movss 4 * SIZE(%edi), %xmm4 + movss 5 * SIZE(%edi), %xmm5 + movss 6 * SIZE(%edi), %xmm6 + movss 7 * SIZE(%edi), %xmm7 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + shufps $0, %xmm4, %xmm4 + shufps $0, %xmm5, %xmm5 + shufps $0, %xmm6, %xmm6 + shufps $0, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + movaps %xmm2, 8 * SIZE(%ecx) + movaps %xmm3, 12 * SIZE(%ecx) + movaps %xmm4, 16 * SIZE(%ecx) + movaps %xmm5, 20 * SIZE(%ecx) + movaps %xmm6, 24 * SIZE(%ecx) + movaps %xmm7, 28 * SIZE(%ecx) +#else + movd 0 * SIZE(%edi), %mm0 + movd 1 * 
SIZE(%edi), %mm1 + movd 2 * SIZE(%edi), %mm2 + movd 3 * SIZE(%edi), %mm3 + movd 4 * SIZE(%edi), %mm4 + movd 5 * SIZE(%edi), %mm5 + movd 6 * SIZE(%edi), %mm6 + movd 7 * SIZE(%edi), %mm7 + + movd %mm0, 0 * SIZE(%ecx) + movd %mm0, 1 * SIZE(%ecx) + movd %mm0, 2 * SIZE(%ecx) + movd %mm0, 3 * SIZE(%ecx) + movd %mm1, 4 * SIZE(%ecx) + movd %mm1, 5 * SIZE(%ecx) + movd %mm1, 6 * SIZE(%ecx) + movd %mm1, 7 * SIZE(%ecx) + movd %mm2, 8 * SIZE(%ecx) + movd %mm2, 9 * SIZE(%ecx) + movd %mm2, 10 * SIZE(%ecx) + movd %mm2, 11 * SIZE(%ecx) + movd %mm3, 12 * SIZE(%ecx) + movd %mm3, 13 * SIZE(%ecx) + movd %mm3, 14 * SIZE(%ecx) + movd %mm3, 15 * SIZE(%ecx) + movd %mm4, 16 * SIZE(%ecx) + movd %mm4, 17 * SIZE(%ecx) + movd %mm4, 18 * SIZE(%ecx) + movd %mm4, 19 * SIZE(%ecx) + movd %mm5, 20 * SIZE(%ecx) + movd %mm5, 21 * SIZE(%ecx) + movd %mm5, 22 * SIZE(%ecx) + movd %mm5, 23 * SIZE(%ecx) + movd %mm6, 24 * SIZE(%ecx) + movd %mm6, 25 * SIZE(%ecx) + movd %mm6, 26 * SIZE(%ecx) + movd %mm6, 27 * SIZE(%ecx) + movd %mm7, 28 * SIZE(%ecx) + movd %mm7, 29 * SIZE(%ecx) + movd %mm7, 30 * SIZE(%ecx) + movd %mm7, 31 * SIZE(%ecx) +#endif + +#ifdef PENTIUM4 + prefetcht2 112 * SIZE(%ecx) +#endif + +#if defined(OPTERON) || defined(BARCELONA) + prefetchnta 80 * SIZE(%edi) + prefetchw 112 * SIZE(%ecx) + prefetchw 120 * SIZE(%ecx) +#endif + + addl $ 8 * SIZE, %edi + addl $32 * SIZE, %ecx + decl %eax + jne .L02 + ALIGN_2 + +.L05: + movl K, %eax + andl $1, %eax + BRANCH + jle .L10 + + +#ifdef HAVE_SSE2 + movss 0 * SIZE(%edi), %xmm0 + movss 1 * SIZE(%edi), %xmm1 + movss 2 * SIZE(%edi), %xmm2 + movss 3 * SIZE(%edi), %xmm3 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + movaps %xmm2, 8 * SIZE(%ecx) + movaps %xmm3, 12 * SIZE(%ecx) +#else + movd 0 * SIZE(%edi), %mm0 + movd 1 * SIZE(%edi), %mm1 + movd 2 * SIZE(%edi), %mm2 + movd 3 * SIZE(%edi), %mm3 + + movd %mm0, 0 * SIZE(%ecx) + movd %mm0, 1 * SIZE(%ecx) + movd %mm0, 2 * SIZE(%ecx) + movd %mm0, 3 * SIZE(%ecx) + movd %mm1, 4 * SIZE(%ecx) + movd %mm1, 5 * SIZE(%ecx) + movd %mm1, 6 * SIZE(%ecx) + movd %mm1, 7 * SIZE(%ecx) + movd %mm2, 8 * SIZE(%ecx) + movd %mm2, 9 * SIZE(%ecx) + movd %mm2, 10 * SIZE(%ecx) + movd %mm2, 11 * SIZE(%ecx) + movd %mm3, 12 * SIZE(%ecx) + movd %mm3, 13 * SIZE(%ecx) + movd %mm3, 14 * SIZE(%ecx) + movd %mm3, 15 * SIZE(%ecx) +#endif + addl $4 * SIZE, %edi + ALIGN_4 + +.L10: + movl %edi, BX + + movl C, %esi # coffset = c + movl A, %edx # aoffset = a + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 8), BB +#endif + + movl BX, %eax + +#ifdef HAVE_SSE + + prefetcht2 0 * SIZE(%eax) + prefetcht2 4 * SIZE(%eax) + +#if L2_SIZE > 262144 + + subl $-8 * SIZE, BX + +#elif L2_SIZE > 131072 + + prefetcht2 8 * SIZE(%eax) + prefetcht2 12 * SIZE(%eax) + + + subl $-16 * SIZE, BX +#else + prefetcht2 16 * SIZE(%eax) + prefetcht2 20 * SIZE(%eax) + prefetcht2 24 * SIZE(%eax) + prefetcht2 28 * SIZE(%eax) + + subl $-32 * SIZE, BX +#endif +#endif + + movaps 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps 16 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movaps 16 * SIZE(BB), %xmm3 + 
pxor %xmm7, %xmm7 + + leal (LDC, LDC, 2), %eax + +#if defined(OPTERON) || defined(BARCELONA) + prefetchw 4 * SIZE(%esi) + prefetchw 4 * SIZE(%esi, LDC) + prefetchw 4 * SIZE(%esi, LDC, 2) + prefetchw 4 * SIZE(%esi, %eax) +#endif + +#ifdef PENTIUM4 + prefetchnta 4 * SIZE(%esi) + prefetchnta 4 * SIZE(%esi, LDC) + prefetchnta 4 * SIZE(%esi, LDC, 2) + prefetchnta 4 * SIZE(%esi, %eax) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + +#if 1 + andl $-8, %eax + sall $4, %eax + je .L15 +.L1X: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + cmpl $128 * 1, %eax + jle .L12 + KERNEL1(32 * 1) + KERNEL2(32 * 1) + KERNEL3(32 * 1) + KERNEL4(32 * 1) + KERNEL5(32 * 1) + KERNEL6(32 * 1) + KERNEL7(32 * 1) + KERNEL8(32 * 1) + cmpl $128 * 2, %eax + jle .L12 + KERNEL1(32 * 2) + KERNEL2(32 * 2) + KERNEL3(32 * 2) + KERNEL4(32 * 2) + KERNEL5(32 * 2) + KERNEL6(32 * 2) + KERNEL7(32 * 2) + KERNEL8(32 * 2) + cmpl $128 * 3, %eax + jle .L12 + KERNEL1(32 * 3) + KERNEL2(32 * 3) + KERNEL3(32 * 3) + KERNEL4(32 * 3) + KERNEL5(32 * 3) + KERNEL6(32 * 3) + KERNEL7(32 * 3) + KERNEL8(32 * 3) + cmpl $128 * 4, %eax + jle .L12 + KERNEL1(32 * 4) + KERNEL2(32 * 4) + KERNEL3(32 * 4) + KERNEL4(32 * 4) + KERNEL5(32 * 4) + KERNEL6(32 * 4) + KERNEL7(32 * 4) + KERNEL8(32 * 4) + cmpl $128 * 5, %eax + jle .L12 + KERNEL1(32 * 5) + KERNEL2(32 * 5) + KERNEL3(32 * 5) + KERNEL4(32 * 5) + KERNEL5(32 * 5) + KERNEL6(32 * 5) + KERNEL7(32 * 5) + KERNEL8(32 * 5) + cmpl $128 * 6, %eax + jle .L12 + KERNEL1(32 * 6) + KERNEL2(32 * 6) + KERNEL3(32 * 6) + KERNEL4(32 * 6) + KERNEL5(32 * 6) + KERNEL6(32 * 6) + KERNEL7(32 * 6) + KERNEL8(32 * 6) + cmpl $128 * 7, %eax + jle .L12 + KERNEL1(32 * 7) + KERNEL2(32 * 7) + KERNEL3(32 * 7) + KERNEL4(32 * 7) + KERNEL5(32 * 7) + KERNEL6(32 * 7) + KERNEL7(32 * 7) + KERNEL8(32 * 7) + + addl $128 * 8 * SIZE, BB + addl $128 * 2 * SIZE, AA + subl $128 * 8, %eax + jg .L1X + jmp .L15 + +.L12: + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB + ALIGN_4 +#else + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + + addl $128 * SIZE, BB + addl $32 * SIZE, AA + decl %eax + jne .L12 + ALIGN_4 +#endif + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_4 + +.L16: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 4 * SIZE(AA), %xmm0 + + addl $ 4 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + leal (LDC, LDC, 2), %eax + + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + movsd 4 * SIZE(%esi), %xmm1 + movhps 6 * SIZE(%esi), %xmm1 + + pshufd $0x50, %xmm4, %xmm2 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + movlps %xmm1, 4 * SIZE(%esi) + movhps %xmm1, 6 * SIZE(%esi) + + movsd 0 * SIZE(%esi, 
LDC), %xmm0 + movhps 2 * SIZE(%esi, LDC), %xmm0 + movsd 4 * SIZE(%esi, LDC), %xmm1 + movhps 6 * SIZE(%esi, LDC), %xmm1 + + pshufd $0x50, %xmm5, %xmm2 + pshufd $0xfa, %xmm5, %xmm5 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm5 + + addps %xmm2, %xmm0 + addps %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC) + movhps %xmm0, 2 * SIZE(%esi, LDC) + movlps %xmm1, 4 * SIZE(%esi, LDC) + movhps %xmm1, 6 * SIZE(%esi, LDC) + + movsd 0 * SIZE(%esi, LDC, 2), %xmm0 + movhps 2 * SIZE(%esi, LDC, 2), %xmm0 + movsd 4 * SIZE(%esi, LDC, 2), %xmm1 + movhps 6 * SIZE(%esi, LDC, 2), %xmm1 + + pshufd $0x50, %xmm6, %xmm2 + pshufd $0xfa, %xmm6, %xmm6 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm6 + + addps %xmm2, %xmm0 + addps %xmm6, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC, 2) + movhps %xmm0, 2 * SIZE(%esi, LDC, 2) + movlps %xmm1, 4 * SIZE(%esi, LDC, 2) + movhps %xmm1, 6 * SIZE(%esi, LDC, 2) + + movsd 0 * SIZE(%esi, %eax), %xmm0 + movhps 2 * SIZE(%esi, %eax), %xmm0 + movsd 4 * SIZE(%esi, %eax), %xmm1 + movhps 6 * SIZE(%esi, %eax), %xmm1 + + pshufd $0x50, %xmm7, %xmm2 + pshufd $0xfa, %xmm7, %xmm7 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm7 + + addps %xmm2, %xmm0 + addps %xmm7, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, %eax) + movhps %xmm0, 2 * SIZE(%esi, %eax) + movlps %xmm1, 4 * SIZE(%esi, %eax) + movhps %xmm1, 6 * SIZE(%esi, %eax) + + addl $8 * SIZE, %esi # coffset += 2 + + decl %ebx # i -- + jg .L11 + ALIGN_4 + +.L20: + testl $2, M + je .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB +#endif + + movsd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movsd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movsd 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movsd 16 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movsd 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movsd 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movsd 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movsd 32 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movsd 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm5 + movsd 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movsd 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movsd 48 * SIZE(BB), %xmm3 + + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movsd 36 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movsd 40 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movsd 44 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movsd 64 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movsd 52 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm5 + movsd 56 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movsd 60 * SIZE(BB), %xmm3 + mulps 
%xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movsd 80 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movsd 68 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm5 + movsd 72 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movsd 76 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movsd 96 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movsd 84 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movsd 88 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movsd 92 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movsd 112 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movsd 100 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm5 + movsd 104 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movsd 108 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movsd 128 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movsd 116 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movsd 120 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movsd 124 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movsd 144 * SIZE(BB), %xmm3 + + addl $ 16 * SIZE, AA + addl $128 * SIZE, BB + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movsd 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movsd 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movsd 16 * SIZE(BB), %xmm2 + + addl $ 2 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: + leal (LDC, LDC, 2), %eax + + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + movsd 0 * SIZE(%esi, LDC), %xmm1 + movhps 2 * SIZE(%esi, LDC), %xmm1 + + shufps $0x50, %xmm4, %xmm4 + shufps $0x50, %xmm5, %xmm5 + + mulps %xmm3, %xmm4 + mulps %xmm3, %xmm5 + + addps %xmm4, %xmm0 + addps %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + movlps %xmm1, 0 * SIZE(%esi, LDC) + movhps %xmm1, 2 * SIZE(%esi, LDC) + + movsd 0 * SIZE(%esi, LDC, 2), %xmm0 + movhps 2 * SIZE(%esi, LDC, 2), %xmm0 + movsd 0 * SIZE(%esi, %eax), %xmm1 + movhps 2 * SIZE(%esi, %eax), %xmm1 + + shufps $0x50, %xmm6, %xmm6 + shufps $0x50, %xmm7, %xmm7 + + mulps %xmm3, %xmm6 + mulps %xmm3, %xmm7 + + addps %xmm6, %xmm0 + addps %xmm7, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC, 2) + movhps %xmm0, 2 * SIZE(%esi, LDC, 2) + movlps %xmm1, 0 * SIZE(%esi, %eax) + movhps %xmm1, 2 * SIZE(%esi, %eax) + + addl $4 * SIZE, %esi # coffset += 2 + ALIGN_4 + +.L30: + testl $1, M + je .L39 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB + leal (BB, %eax, 8), BB +#endif + + movss 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movss 4 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movss 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movss 16 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + 
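+/* For a plain build the k trip count below is simply K.  For TRMMKERNEL
+   builds it is either K - KK, or KK plus the dimension of the current
+   tile (its row count when LEFT is defined, its column count otherwise),
+   depending on the side/transpose combination; the chosen value is kept
+   in KKK for the tail loop. */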
+#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L35 + ALIGN_4 + +.L32: + mulss %xmm0, %xmm2 + addss %xmm2, %xmm4 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movss 4 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + addss %xmm2, %xmm5 + movss 8 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 32 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 1 * SIZE(AA), %xmm0 + + mulss %xmm0, %xmm3 + addss %xmm3, %xmm4 + movss 20 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + addss %xmm3, %xmm5 + movss 24 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + mulss 28 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 48 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 2 * SIZE(AA), %xmm0 + + mulss %xmm0, %xmm2 + addss %xmm2, %xmm4 + movss 36 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + addss %xmm2, %xmm5 + movss 40 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + mulss 44 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 64 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 3 * SIZE(AA), %xmm0 + + mulss %xmm0, %xmm3 + addss %xmm3, %xmm4 + movss 52 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + addss %xmm3, %xmm5 + movss 56 * SIZE(BB), %xmm3 + mulss %xmm0, %xmm3 + mulss 60 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 80 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + + mulss %xmm1, %xmm2 + addss %xmm2, %xmm4 + movss 68 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + addss %xmm2, %xmm5 + movss 72 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + mulss 76 * SIZE(BB), %xmm1 + addss %xmm2, %xmm6 + movss 96 * SIZE(BB), %xmm2 + addss %xmm1, %xmm7 + movss 5 * SIZE(AA), %xmm1 + + mulss %xmm1, %xmm3 + addss %xmm3, %xmm4 + movss 84 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + addss %xmm3, %xmm5 + movss 88 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + mulss 92 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 112 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 6 * SIZE(AA), %xmm1 + + mulss %xmm1, %xmm2 + addss %xmm2, %xmm4 + movss 100 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + addss %xmm2, %xmm5 + movss 104 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm2 + mulss 108 * SIZE(BB), %xmm1 + addss %xmm2, %xmm6 + movss 128 * SIZE(BB), %xmm2 + addss %xmm1, %xmm7 + movss 7 * SIZE(AA), %xmm1 + + mulss %xmm1, %xmm3 + addss %xmm3, %xmm4 + movss 116 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + addss %xmm3, %xmm5 + movss 120 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + mulss 124 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 144 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $128 * SIZE, BB + decl %eax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulss %xmm0, %xmm2 + addss %xmm2, %xmm4 + movss 4 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + addss %xmm2, %xmm5 + movss 8 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 16 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 1 * SIZE(AA), %xmm0 + + addl $ 1 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L36 + ALIGN_4 + +.L38: + leal (LDC, LDC, 2), %eax + + movsd (%esi), %xmm0 + movhps (%esi, LDC), %xmm0 + + shufps $0, %xmm5, %xmm4 + mulps %xmm3, %xmm4 + addps %xmm4, %xmm0 + + movlps %xmm0, 
(%esi) + movhps %xmm0, (%esi, LDC) + + movsd (%esi, LDC, 2), %xmm0 + movhps (%esi, %eax), %xmm0 + + shufps $0, %xmm7, %xmm6 + mulps %xmm3, %xmm6 + addps %xmm6, %xmm0 + + movlps %xmm0, (%esi, LDC, 2) + movhps %xmm0, (%esi, %eax) + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + + leal (, LDC, 4), %eax + addl %eax, C # c += 4 * ldc + decl J # j -- + jg .L01 + ALIGN_4 + +.L40: + testl $2, N + je .L80 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + leal BUFFER, %ecx + sarl $2, %eax + jle .L45 + ALIGN_4 + +.L42: + prefetchnta 80 * SIZE(%edi) + +#if defined(OPTERON) || defined(BARCELONA) + prefetchw 112 * SIZE(%ecx) + prefetchw 120 * SIZE(%ecx) +#endif + +#ifdef PENTIUM4 + prefetcht1 112 * SIZE(%ecx) +#endif + +#ifdef HAVE_SSE2 + movss 0 * SIZE(%edi), %xmm0 + movss 1 * SIZE(%edi), %xmm1 + movss 2 * SIZE(%edi), %xmm2 + movss 3 * SIZE(%edi), %xmm3 + movss 4 * SIZE(%edi), %xmm4 + movss 5 * SIZE(%edi), %xmm5 + movss 6 * SIZE(%edi), %xmm6 + movss 7 * SIZE(%edi), %xmm7 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + shufps $0, %xmm4, %xmm4 + shufps $0, %xmm5, %xmm5 + shufps $0, %xmm6, %xmm6 + shufps $0, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + movaps %xmm2, 8 * SIZE(%ecx) + movaps %xmm3, 12 * SIZE(%ecx) + movaps %xmm4, 16 * SIZE(%ecx) + movaps %xmm5, 20 * SIZE(%ecx) + movaps %xmm6, 24 * SIZE(%ecx) + movaps %xmm7, 28 * SIZE(%ecx) +#else + movd 0 * SIZE(%edi), %mm0 + movd 1 * SIZE(%edi), %mm1 + movd 2 * SIZE(%edi), %mm2 + movd 3 * SIZE(%edi), %mm3 + movd 4 * SIZE(%edi), %mm4 + movd 5 * SIZE(%edi), %mm5 + movd 6 * SIZE(%edi), %mm6 + movd 7 * SIZE(%edi), %mm7 + + movd %mm0, 0 * SIZE(%ecx) + movd %mm0, 1 * SIZE(%ecx) + movd %mm0, 2 * SIZE(%ecx) + movd %mm0, 3 * SIZE(%ecx) + movd %mm1, 4 * SIZE(%ecx) + movd %mm1, 5 * SIZE(%ecx) + movd %mm1, 6 * SIZE(%ecx) + movd %mm1, 7 * SIZE(%ecx) + movd %mm2, 8 * SIZE(%ecx) + movd %mm2, 9 * SIZE(%ecx) + movd %mm2, 10 * SIZE(%ecx) + movd %mm2, 11 * SIZE(%ecx) + movd %mm3, 12 * SIZE(%ecx) + movd %mm3, 13 * SIZE(%ecx) + movd %mm3, 14 * SIZE(%ecx) + movd %mm3, 15 * SIZE(%ecx) + movd %mm4, 16 * SIZE(%ecx) + movd %mm4, 17 * SIZE(%ecx) + movd %mm4, 18 * SIZE(%ecx) + movd %mm4, 19 * SIZE(%ecx) + movd %mm5, 20 * SIZE(%ecx) + movd %mm5, 21 * SIZE(%ecx) + movd %mm5, 22 * SIZE(%ecx) + movd %mm5, 23 * SIZE(%ecx) + movd %mm6, 24 * SIZE(%ecx) + movd %mm6, 25 * SIZE(%ecx) + movd %mm6, 26 * SIZE(%ecx) + movd %mm6, 27 * SIZE(%ecx) + movd %mm7, 28 * SIZE(%ecx) + movd %mm7, 29 * SIZE(%ecx) + movd %mm7, 30 * SIZE(%ecx) + movd %mm7, 31 * SIZE(%ecx) +#endif + addl $ 8 * SIZE, %edi + addl $32 * SIZE, %ecx + decl %eax + jne .L42 + ALIGN_4 + +.L45: + movl K, %eax + andl $3, %eax + BRANCH + jle .L50 + ALIGN_4 + +.L46: + +#ifdef HAVE_SSE2 + movss 0 * SIZE(%edi), %xmm0 + movss 1 * SIZE(%edi), %xmm1 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) +#else + movd 0 * SIZE(%edi), %mm0 + movd 1 * SIZE(%edi), %mm1 + + movd %mm0, 0 * SIZE(%ecx) + movd %mm0, 1 * SIZE(%ecx) + movd %mm0, 2 * SIZE(%ecx) + movd %mm0, 3 * SIZE(%ecx) + movd %mm1, 4 * SIZE(%ecx) + movd %mm1, 5 * SIZE(%ecx) + movd %mm1, 6 * SIZE(%ecx) + movd %mm1, 7 * SIZE(%ecx) +#endif + addl $2 * SIZE, %edi + addl $8 * SIZE, %ecx + decl %eax + jne .L46 + ALIGN_4 + +.L50: + movl C, %esi # coffset = c + movl A, %edx # aoffset = a + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L60 + 
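+/* N & 2 remainder: the loop at .L51 computes real 4x2 blocks, with
+   %xmm4 and %xmm5 accumulating the two C columns.  The complex alpha,
+   packed in ALPHA as (alpha_r, alpha_i, alpha_r, alpha_i), is applied
+   at .L58 by widening each real result with pshufd $0x50 / $0xfa before
+   the multiply-add into the interleaved complex C. */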
ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movaps 0 * SIZE(AA), %xmm0 + movaps 16 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + +#ifdef HAVE_3DNOW + prefetchw 4 * SIZE(%esi) + prefetchw 4 * SIZE(%esi, LDC) +#elif defined(HAVE_SSE) || defined(HAVE_SSE2) + prefetcht2 4 * SIZE(%esi) + prefetcht2 4 * SIZE(%esi, LDC) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + mulps %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 8 * SIZE(AA), %xmm0 + + mulps %xmm0, %xmm3 + mulps 20 * SIZE(BB), %xmm0 + addps %xmm3, %xmm4 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm0, %xmm5 + movaps 12 * SIZE(AA), %xmm0 + + mulps %xmm0, %xmm3 + mulps 28 * SIZE(BB), %xmm0 + addps %xmm3, %xmm4 + movaps 48 * SIZE(BB), %xmm3 + addps %xmm0, %xmm5 + movaps 32 * SIZE(AA), %xmm0 + +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) +#endif + mulps %xmm1, %xmm2 + mulps 36 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 40 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 20 * SIZE(AA), %xmm1 + + mulps %xmm1, %xmm2 + mulps 44 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 24 * SIZE(AA), %xmm1 + + mulps %xmm1, %xmm3 + mulps 52 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 56 * SIZE(BB), %xmm3 + addps %xmm1, %xmm5 + movaps 28 * SIZE(AA), %xmm1 + + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 80 * SIZE(BB), %xmm3 + addps %xmm1, %xmm5 + movaps 48 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L52 + ALIGN_4 + +.L55: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + movsd 4 * SIZE(%esi), %xmm1 + movhps 6 * SIZE(%esi), %xmm1 + + pshufd $0x50, %xmm4, %xmm2 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + movlps %xmm1, 4 * SIZE(%esi) + movhps %xmm1, 6 * SIZE(%esi) + + movsd 0 * SIZE(%esi, LDC), %xmm0 + movhps 2 * SIZE(%esi, LDC), %xmm0 + movsd 4 * SIZE(%esi, LDC), %xmm1 + movhps 6 * SIZE(%esi, LDC), %xmm1 + + pshufd $0x50, %xmm5, %xmm2 + pshufd 
$0xfa, %xmm5, %xmm5 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm5 + + addps %xmm2, %xmm0 + addps %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC) + movhps %xmm0, 2 * SIZE(%esi, LDC) + movlps %xmm1, 4 * SIZE(%esi, LDC) + movhps %xmm1, 6 * SIZE(%esi, LDC) + + addl $8 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L51 + ALIGN_4 + +.L60: + testl $2, M + je .L70 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movsd 0 * SIZE(AA), %xmm0 + movsd 8 * SIZE(AA), %xmm1 + movsd 0 * SIZE(BB), %xmm2 + movsd 16 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L65 + ALIGN_4 + +.L62: +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 8 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movsd 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movsd 32 * SIZE(BB), %xmm2 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movsd 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movsd 24 * SIZE(BB), %xmm3 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movsd 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movsd 48 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movsd 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movsd 40 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movsd 44 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movsd 64 * SIZE(BB), %xmm2 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movsd 52 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsd 56 * SIZE(BB), %xmm3 + + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movsd 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movsd 80 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L62 + ALIGN_4 + +.L65: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 8 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L66 + ALIGN_4 + +.L68: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + movsd 0 * SIZE(%esi, LDC), %xmm1 + movhps 2 * SIZE(%esi, LDC), %xmm1 + + shufps $0x50, %xmm4, %xmm4 + shufps $0x50, %xmm5, %xmm5 + + mulps %xmm3, %xmm4 + mulps %xmm3, %xmm5 + + addps %xmm4, %xmm0 + addps %xmm5, %xmm1 + 
+ movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + movlps %xmm1, 0 * SIZE(%esi, LDC) + movhps %xmm1, 2 * SIZE(%esi, LDC) + + addl $4 * SIZE, %esi + ALIGN_4 + +.L70: + testl $1, M + je .L79 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movss 0 * SIZE(AA), %xmm0 + movss 4 * SIZE(AA), %xmm1 + movss 0 * SIZE(BB), %xmm2 + movss 16 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + mulss %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + mulss 4 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 8 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 1 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm2 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 32 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 2 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 20 * SIZE(BB), %xmm0 + addss %xmm3, %xmm4 + movss 24 * SIZE(BB), %xmm3 + addss %xmm0, %xmm5 + movss 3 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 28 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 48 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm2 + mulss 36 * SIZE(BB), %xmm1 + addss %xmm2, %xmm4 + movss 40 * SIZE(BB), %xmm2 + addss %xmm1, %xmm5 + movss 5 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm2 + mulss 44 * SIZE(BB), %xmm1 + addss %xmm2, %xmm6 + movss 64 * SIZE(BB), %xmm2 + addss %xmm1, %xmm7 + movss 6 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 52 * SIZE(BB), %xmm1 + addss %xmm3, %xmm4 + movss 56 * SIZE(BB), %xmm3 + addss %xmm1, %xmm5 + movss 7 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 60 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 80 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L72 + ALIGN_4 + +.L75: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulss %xmm0, %xmm2 + mulss 4 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 8 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 1 * SIZE(AA), %xmm0 + + addl $ 1 * SIZE, AA + addl $ 8 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: + addss %xmm6, %xmm4 + addss %xmm7, %xmm5 + + movsd (%esi), %xmm0 + movhps (%esi, LDC), %xmm0 + + shufps $0, %xmm5, %xmm4 + mulps %xmm3, %xmm4 + addps %xmm4, %xmm0 + + movlps %xmm0, (%esi) + movhps %xmm0, (%esi, LDC) + ALIGN_4 + +.L79: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + leal (, LDC, 2), %eax + addl %eax, C + ALIGN_4 + +.L80: + testl $1, N + je .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + leal BUFFER, %ecx + sarl $3, %eax + jle .L85 + ALIGN_4 + +.L82: + prefetchnta 80 * SIZE(%edi) + +#if defined(OPTERON) || defined(BARCELONA) + prefetchw 112 * SIZE(%ecx) + prefetchw 120 * SIZE(%ecx) +#endif 
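+/* This copy loop (and the .L86 tail) broadcasts each value of the single
+   remaining B column to four lanes of BUFFER, mirroring the four-column
+   copy at .L02, so the inner loops below can reuse the same packed-BUFFER
+   addressing as the wider column counts. */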
+ +#ifdef PENTIUM4 + prefetcht1 112 * SIZE(%ecx) +#endif + +#ifdef HAVE_SSE2 + movss 0 * SIZE(%edi), %xmm0 + movss 1 * SIZE(%edi), %xmm1 + movss 2 * SIZE(%edi), %xmm2 + movss 3 * SIZE(%edi), %xmm3 + movss 4 * SIZE(%edi), %xmm4 + movss 5 * SIZE(%edi), %xmm5 + movss 6 * SIZE(%edi), %xmm6 + movss 7 * SIZE(%edi), %xmm7 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + shufps $0, %xmm4, %xmm4 + shufps $0, %xmm5, %xmm5 + shufps $0, %xmm6, %xmm6 + shufps $0, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + movaps %xmm2, 8 * SIZE(%ecx) + movaps %xmm3, 12 * SIZE(%ecx) + movaps %xmm4, 16 * SIZE(%ecx) + movaps %xmm5, 20 * SIZE(%ecx) + movaps %xmm6, 24 * SIZE(%ecx) + movaps %xmm7, 28 * SIZE(%ecx) +#else + movd 0 * SIZE(%edi), %mm0 + movd 1 * SIZE(%edi), %mm1 + movd 2 * SIZE(%edi), %mm2 + movd 3 * SIZE(%edi), %mm3 + movd 4 * SIZE(%edi), %mm4 + movd 5 * SIZE(%edi), %mm5 + movd 6 * SIZE(%edi), %mm6 + movd 7 * SIZE(%edi), %mm7 + + movd %mm0, 0 * SIZE(%ecx) + movd %mm0, 1 * SIZE(%ecx) + movd %mm0, 2 * SIZE(%ecx) + movd %mm0, 3 * SIZE(%ecx) + movd %mm1, 4 * SIZE(%ecx) + movd %mm1, 5 * SIZE(%ecx) + movd %mm1, 6 * SIZE(%ecx) + movd %mm1, 7 * SIZE(%ecx) + movd %mm2, 8 * SIZE(%ecx) + movd %mm2, 9 * SIZE(%ecx) + movd %mm2, 10 * SIZE(%ecx) + movd %mm2, 11 * SIZE(%ecx) + movd %mm3, 12 * SIZE(%ecx) + movd %mm3, 13 * SIZE(%ecx) + movd %mm3, 14 * SIZE(%ecx) + movd %mm3, 15 * SIZE(%ecx) + movd %mm4, 16 * SIZE(%ecx) + movd %mm4, 17 * SIZE(%ecx) + movd %mm4, 18 * SIZE(%ecx) + movd %mm4, 19 * SIZE(%ecx) + movd %mm5, 20 * SIZE(%ecx) + movd %mm5, 21 * SIZE(%ecx) + movd %mm5, 22 * SIZE(%ecx) + movd %mm5, 23 * SIZE(%ecx) + movd %mm6, 24 * SIZE(%ecx) + movd %mm6, 25 * SIZE(%ecx) + movd %mm6, 26 * SIZE(%ecx) + movd %mm6, 27 * SIZE(%ecx) + movd %mm7, 28 * SIZE(%ecx) + movd %mm7, 29 * SIZE(%ecx) + movd %mm7, 30 * SIZE(%ecx) + movd %mm7, 31 * SIZE(%ecx) +#endif + addl $ 8 * SIZE, %edi + addl $32 * SIZE, %ecx + decl %eax + jne .L82 + ALIGN_4 + +.L85: + movl K, %eax + andl $7, %eax + BRANCH + jle .L90 + ALIGN_4 + +.L86: + +#ifdef HAVE_SSE2 + movss 0 * SIZE(%edi), %xmm0 + shufps $0, %xmm0, %xmm0 + movaps %xmm0, 0 * SIZE(%ecx) +#else + movd 0 * SIZE(%edi), %mm0 + movd %mm0, 0 * SIZE(%ecx) + movd %mm0, 1 * SIZE(%ecx) + movd %mm0, 2 * SIZE(%ecx) + movd %mm0, 3 * SIZE(%ecx) +#endif + addl $1 * SIZE, %edi + addl $4 * SIZE, %ecx + decl %eax + jne .L86 + ALIGN_4 + +.L90: + movl C, %esi # coffset = c + movl A, %edx # aoffset = a + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L100 + ALIGN_4 + +.L91: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movaps 0 * SIZE(AA), %xmm0 + movaps 16 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + +#ifdef HAVE_3DNOW + prefetchw 4 * SIZE(%esi) +#elif defined(HAVE_SSE) || defined(HAVE_SSE2) + prefetcht2 4 * SIZE(%esi) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax 
+ je .L95 + ALIGN_4 + +.L92: + mulps %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movaps 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movaps 32 * SIZE(BB), %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm0, %xmm5 + movaps 8 * SIZE(AA), %xmm0 + mulps 8 * SIZE(BB), %xmm0 + addps %xmm0, %xmm6 + movaps 12 * SIZE(AA), %xmm0 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) +#endif + mulps %xmm1, %xmm3 + movaps 20 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movaps 48 * SIZE(BB), %xmm3 + mulps 20 * SIZE(BB), %xmm1 + addps %xmm1, %xmm5 + movaps 24 * SIZE(AA), %xmm1 + mulps 24 * SIZE(BB), %xmm1 + addps %xmm1, %xmm6 + movaps 28 * SIZE(AA), %xmm1 + mulps 28 * SIZE(BB), %xmm1 + addps %xmm1, %xmm7 + movaps 48 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L92 + ALIGN_4 + +.L95: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L98 + ALIGN_4 + +.L96: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(AA), %xmm0 + movaps 4 * SIZE(BB), %xmm2 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L96 + ALIGN_4 + +.L98: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + addps %xmm6, %xmm4 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + movsd 4 * SIZE(%esi), %xmm1 + movhps 6 * SIZE(%esi), %xmm1 + + pshufd $0x50, %xmm4, %xmm2 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + movlps %xmm1, 4 * SIZE(%esi) + movhps %xmm1, 6 * SIZE(%esi) + + addl $8 * SIZE, %esi # coffset += 2 + + decl %ebx # i -- + jg .L91 + ALIGN_4 + +.L100: + testl $2, M + je .L110 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movsd 0 * SIZE(AA), %xmm0 + movsd 8 * SIZE(AA), %xmm1 + movsd 0 * SIZE(BB), %xmm2 + movsd 16 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L105 + ALIGN_4 + +.L102: + mulps %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + movsd 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movsd 32 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movsd 20 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsd 24 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, 
%xmm6 + movsd 28 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movsd 48 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L102 + ALIGN_4 + +.L105: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L108 + ALIGN_4 + +.L106: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + movsd 4 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L106 + ALIGN_4 + +.L108: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + addps %xmm6, %xmm4 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + + shufps $0x50, %xmm4, %xmm4 + mulps %xmm3, %xmm4 + addps %xmm4, %xmm0 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + addl $4 * SIZE, %esi + ALIGN_4 + +.L110: + testl $1, M + je .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movss 0 * SIZE(AA), %xmm0 + movss 4 * SIZE(AA), %xmm1 + movss 0 * SIZE(BB), %xmm2 + movss 16 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L115 + ALIGN_4 + +.L112: + mulss %xmm0, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) +#endif + movss 1 * SIZE(AA), %xmm0 + addss %xmm2, %xmm4 + movss 32 * SIZE(BB), %xmm2 + mulss 4 * SIZE(BB), %xmm0 + addss %xmm0, %xmm5 + movss 2 * SIZE(AA), %xmm0 + mulss 8 * SIZE(BB), %xmm0 + addss %xmm0, %xmm6 + movss 3 * SIZE(AA), %xmm0 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm3 + movss 5 * SIZE(AA), %xmm1 + addss %xmm3, %xmm4 + movss 48 * SIZE(BB), %xmm3 + mulss 20 * SIZE(BB), %xmm1 + addss %xmm1, %xmm5 + movss 6 * SIZE(AA), %xmm1 + mulss 24 * SIZE(BB), %xmm1 + addss %xmm1, %xmm6 + movss 7 * SIZE(AA), %xmm1 + mulss 28 * SIZE(BB), %xmm1 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L112 + ALIGN_4 + +.L115: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulss %xmm0, %xmm2 + movss 1 * SIZE(AA), %xmm0 + addss %xmm2, %xmm4 + movss 4 * SIZE(BB), %xmm2 + + addl $ 1 * SIZE, AA + addl $ 4 * SIZE, BB + decl %eax + jg .L116 + ALIGN_4 + +.L118: + addss %xmm5, %xmm4 + addss %xmm7, %xmm6 + addss %xmm6, %xmm4 + + movsd (%esi), %xmm0 + + shufps $0, %xmm5, %xmm4 + mulps %xmm3, %xmm4 + addps %xmm4, %xmm0 + + movlps %xmm0, (%esi) + ALIGN_4 + +.L999: + EMMS + + movl OLD_STACK, %esp + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm3m_kernel_4x4_penryn.S b/kernel/x86/zgemm3m_kernel_4x4_penryn.S new file mode 100644 index 0000000000..802298cf2b --- /dev/null +++ b/kernel/x86/zgemm3m_kernel_4x4_penryn.S @@ -0,0 +1,1780 @@ 
+/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA 16 + STACK + ARGS(%esp) +#define A 24 + STACK + ARGS(%esp) +#define ARG_B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define ARG_LDC 36 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define BX 4 + STACK(%esp) +#define KK 8 + STACK(%esp) +#define KKK 12 + STACK(%esp) + +#define PREFETCH_R (8 * 4) + +#define PREFETCHSIZE (8 * 17 + 4) +#define PREFETCH prefetcht0 + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi +#define C1 %esi +#define I %ebx + + PROLOGUE + + subl $ARGS, %esp # Generate Stack Frame + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + +#ifdef TRMMKERNEL + movl OFFSET, %eax +#ifndef LEFT + negl %eax +#endif + movl %eax, KK +#endif + + subl $-32 * SIZE, A + subl $-32 * SIZE, B + + sall $ZBASE_SHIFT, LDC + + movl N, %eax + sarl $2, %eax + movl %eax, J + jle .L40 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + sall $BASE_SHIFT + 2, %eax + leal (B, %eax), %eax + movl %eax, BX + + movl C, C1 + movl A, AA + + movl M, I + sarl $2, I + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB +#endif + + movl BX, %eax + + prefetcht2 -32 * SIZE(%eax) + subl $-16 * SIZE, BX + + leal (C1, LDC, 2), %eax + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + + pxor %xmm4, %xmm4 + prefetcht0 3 * SIZE(C1) + pxor %xmm5, %xmm5 + prefetcht0 3 * SIZE(C1, LDC) + pxor %xmm6, %xmm6 + prefetcht0 3 * SIZE(%eax) + pxor %xmm7, %xmm7 + prefetcht0 3 * SIZE(%eax, LDC) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -24 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + 
pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -16 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -12 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -8 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -4 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm7 + subl $-32 * SIZE, BB + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + subl $-32 * SIZE, AA + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -32 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -32 * SIZE(AA), %xmm0 + + subl $1, %eax + jne .L12 + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L18 + ALIGN_4 + +.L16: + addps %xmm2, %xmm7 + pshufd $0x93, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm6 + pshufd $0x93, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm2, %xmm5 + pshufd $0x93, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + addps %xmm3, %xmm6 + addps %xmm2, %xmm7 + + movddup ALPHA, %xmm3 + + pshufd $0x39, %xmm5, %xmm2 + pshufd $0x4e, %xmm6, %xmm0 + pshufd $0x93, %xmm7, %xmm7 + + movaps %xmm4, %xmm6 + unpcklps %xmm0, %xmm4 + unpckhps %xmm0, %xmm6 + + movaps %xmm2, %xmm1 + unpcklps %xmm7, %xmm2 + unpckhps %xmm7, %xmm1 + + movaps %xmm4, %xmm5 + unpcklps %xmm2, %xmm4 + unpckhps %xmm2, %xmm5 + + movaps %xmm6, %xmm7 + unpcklps %xmm1, %xmm6 + unpckhps %xmm1, %xmm7 + + pshufd $0x93, %xmm5, %xmm5 + pshufd $0x4e, %xmm6, %xmm6 + pshufd $0x39, %xmm7, %xmm7 + + leal (C1, LDC, 2), %eax + + movsd 0 * SIZE(C1), %xmm0 + movhps 2 * SIZE(C1), %xmm0 + movsd 4 * SIZE(C1), %xmm1 + movhps 6 * SIZE(C1), %xmm1 + + pshufd $0x50, %xmm4, %xmm2 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(C1) + movhps %xmm0, 2 * SIZE(C1) + movlps %xmm1, 4 * SIZE(C1) + movhps %xmm1, 6 * SIZE(C1) + + movsd 0 * SIZE(C1, LDC), %xmm0 + movhps 2 * SIZE(C1, LDC), %xmm0 + movsd 4 * SIZE(C1, LDC), %xmm1 + movhps 6 * SIZE(C1, LDC), %xmm1 + + pshufd $0x50, %xmm5, %xmm2 + pshufd $0xfa, %xmm5, %xmm5 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm5 + + addps %xmm2, %xmm0 + addps %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(C1, LDC) + movhps %xmm0, 2 * SIZE(C1, LDC) + movlps %xmm1, 4 * SIZE(C1, LDC) + movhps %xmm1, 6 * SIZE(C1, LDC) + + movsd 0 * 
SIZE(%eax), %xmm0 + movhps 2 * SIZE(%eax), %xmm0 + movsd 4 * SIZE(%eax), %xmm1 + movhps 6 * SIZE(%eax), %xmm1 + + pshufd $0x50, %xmm6, %xmm2 + pshufd $0xfa, %xmm6, %xmm6 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm6 + + addps %xmm2, %xmm0 + addps %xmm6, %xmm1 + + movlps %xmm0, 0 * SIZE(%eax) + movhps %xmm0, 2 * SIZE(%eax) + movlps %xmm1, 4 * SIZE(%eax) + movhps %xmm1, 6 * SIZE(%eax) + + movsd 0 * SIZE(%eax, LDC), %xmm0 + movhps 2 * SIZE(%eax, LDC), %xmm0 + movsd 4 * SIZE(%eax, LDC), %xmm1 + movhps 6 * SIZE(%eax, LDC), %xmm1 + + pshufd $0x50, %xmm7, %xmm2 + pshufd $0xfa, %xmm7, %xmm7 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm7 + + addps %xmm2, %xmm0 + addps %xmm7, %xmm1 + + movlps %xmm0, 0 * SIZE(%eax, LDC) + movhps %xmm0, 2 * SIZE(%eax, LDC) + movlps %xmm1, 4 * SIZE(%eax, LDC) + movhps %xmm1, 6 * SIZE(%eax, LDC) + + addl $8 * SIZE, C1 + decl I + jg .L11 + ALIGN_4 + +.L20: + movl M, I + testl $2, I + jle .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + + pxor %xmm4, %xmm4 + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm0, %xmm2 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm4 + pshufd $0xfa, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm5 + + pshufd $0xee, %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm6 + pshufd $0xfa, %xmm1, %xmm3 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm7 + + pshufd $0x44, %xmm0, %xmm2 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm4 + pshufd $0xfa, %xmm1, %xmm3 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm5 + + pshufd $0xee, %xmm0, %xmm2 + movaps -24 * SIZE(AA), %xmm0 + + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm6 + pshufd $0xfa, %xmm1, %xmm3 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm7 + + pshufd $0x44, %xmm0, %xmm2 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm4 + pshufd $0xfa, %xmm1, %xmm3 + movaps -12 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm5 + + pshufd $0xee, %xmm0, %xmm2 + movaps -20 * SIZE(AA), %xmm0 + + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm6 + pshufd $0xfa, %xmm1, %xmm3 + movaps -8 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm7 + + pshufd $0x44, %xmm0, %xmm2 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm4 + pshufd $0xfa, %xmm1, %xmm3 + movaps -4 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm5 + + pshufd $0xee, %xmm0, %xmm2 + movaps -16 * SIZE(AA), %xmm0 + + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm6 + pshufd $0xfa, %xmm1, %xmm3 + movaps 0 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm7 + + subl $-16 * SIZE, AA + subl $-32 * SIZE, BB + + subl $1, %eax + jne 
.L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L28 + ALIGN_4 + +.L26: + pshufd $0x44, %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm4 + pshufd $0xfa, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm5 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: + movddup ALPHA, %xmm3 + + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + leal (C1, LDC, 2), %eax + + movsd 0 * SIZE(C1), %xmm0 + movhps 2 * SIZE(C1), %xmm0 + movsd 0 * SIZE(C1, LDC), %xmm1 + movhps 2 * SIZE(C1, LDC), %xmm1 + + pshufd $0x50, %xmm4, %xmm2 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(C1) + movhps %xmm0, 2 * SIZE(C1) + movlps %xmm1, 0 * SIZE(C1, LDC) + movhps %xmm1, 2 * SIZE(C1, LDC) + + movsd 0 * SIZE(%eax), %xmm0 + movhps 2 * SIZE(%eax), %xmm0 + movsd 0 * SIZE(%eax, LDC), %xmm1 + movhps 2 * SIZE(%eax, LDC), %xmm1 + + pshufd $0x50, %xmm5, %xmm2 + pshufd $0xfa, %xmm5, %xmm5 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm5 + + addps %xmm2, %xmm0 + addps %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%eax) + movhps %xmm0, 2 * SIZE(%eax) + movlps %xmm1, 0 * SIZE(%eax, LDC) + movhps %xmm1, 2 * SIZE(%eax, LDC) + + addl $4 * SIZE, C1 + ALIGN_4 + +.L30: + movl M, I + testl $1, I + jle .L39 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (BB, %eax, 4), BB +#endif + + pxor %xmm4, %xmm4 + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -24 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -20 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -28 * SIZE(AA), %xmm0 + + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -16 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -12 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -26 * SIZE(AA), %xmm0 + + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -8 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -4 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -24 * SIZE(AA), %xmm0 + + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps 0 * SIZE(BB), %xmm1 + + subl $ -8 * SIZE, AA + subl $-32 * SIZE, BB + + subl $1, %eax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L38 + ALIGN_4 + +.L36: + pshufd $0x00, %xmm0, %xmm2 + movss -31 * SIZE(AA), %xmm0 + + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), 
%xmm1 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L36 + ALIGN_4 + +.L38: + movddup ALPHA, %xmm3 + + leal (C1, LDC, 2), %eax + + movsd (C1), %xmm0 + movhps (C1, LDC), %xmm0 + movsd (%eax), %xmm1 + movhps (%eax, LDC), %xmm1 + + pshufd $0x50, %xmm4, %xmm2 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm4, %xmm1 + + movlps %xmm0, (C1) + movhps %xmm0, (C1, LDC) + movlps %xmm1, (%eax) + movhps %xmm1, (%eax, LDC) + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + + movl BB, B + + leal (, LDC, 4), %eax + addl %eax, C + decl J + jg .L01 + ALIGN_4 + +.L40: + movl N, %eax + testl $2, %eax + jle .L70 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl C, C1 + movl A, AA + + movl M, I + sarl $2, I + jle .L50 + ALIGN_4 + +.L41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 2), BB +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + + pxor %xmm4, %xmm4 + prefetcht0 3 * SIZE(C1) + pxor %xmm5, %xmm5 + prefetcht0 3 * SIZE(C1, LDC) + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L45 + ALIGN_4 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -28 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0xff, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -24 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -20 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0xff, %xmm1, %xmm3 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -16 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -12 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0xff, %xmm1, %xmm3 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -8 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -4 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0xff, %xmm1, %xmm3 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps 0 * SIZE(AA), %xmm0 + + subl $-32 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L42 + ALIGN_4 + +.L45: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L48 + ALIGN_4 + 
+.L46: + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -28 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L46 + ALIGN_4 + +.L48: + addps %xmm2, %xmm4 + addps %xmm3, %xmm5 + + movddup ALPHA, %xmm3 + + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + movsd 0 * SIZE(C1), %xmm0 + movhps 2 * SIZE(C1), %xmm0 + movsd 4 * SIZE(C1), %xmm1 + movhps 6 * SIZE(C1), %xmm1 + + pshufd $0x50, %xmm4, %xmm2 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(C1) + movhps %xmm0, 2 * SIZE(C1) + movlps %xmm1, 4 * SIZE(C1) + movhps %xmm1, 6 * SIZE(C1) + + movsd 0 * SIZE(C1, LDC), %xmm0 + movhps 2 * SIZE(C1, LDC), %xmm0 + movsd 4 * SIZE(C1, LDC), %xmm1 + movhps 6 * SIZE(C1, LDC), %xmm1 + + pshufd $0x50, %xmm5, %xmm2 + pshufd $0xfa, %xmm5, %xmm5 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm5 + + addps %xmm2, %xmm0 + addps %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(C1, LDC) + movhps %xmm0, 2 * SIZE(C1, LDC) + movlps %xmm1, 4 * SIZE(C1, LDC) + movhps %xmm1, 6 * SIZE(C1, LDC) + + addl $8 * SIZE, C1 + decl I + jg .L41 + ALIGN_4 + +.L50: + movl M, I + testl $2, I + jle .L60 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm3, %xmm3 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x44, %xmm0, %xmm2 + addps %xmm3, %xmm4 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + + pshufd $0xee, %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + + pshufd $0x44, %xmm0, %xmm2 + addps %xmm3, %xmm4 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + + pshufd $0xee, %xmm0, %xmm2 + movaps -24 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + + pshufd $0x44, %xmm0, %xmm2 + addps %xmm3, %xmm4 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + + pshufd $0xee, %xmm0, %xmm2 + movaps -20 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + + pshufd $0x44, %xmm0, %xmm2 + addps %xmm3, %xmm4 + pshufd $0x50, %xmm1, %xmm3 + mulps %xmm2, %xmm3 + + pshufd $0xee, %xmm0, %xmm2 + movaps -16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + pshufd $0xfa, %xmm1, %xmm3 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + + subl $-16 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L52 + ALIGN_4 + +.L55: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L58 + ALIGN_4 + +.L56: + pshufd $0x44, %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + addps %xmm3, %xmm4 + pshufd $0x50, %xmm1, %xmm3 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + + addl $2 * SIZE, AA 
+ addl $2 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: + addps %xmm3, %xmm4 + addps %xmm5, %xmm4 + + movddup ALPHA, %xmm3 + + movsd 0 * SIZE(C1), %xmm0 + movhps 2 * SIZE(C1), %xmm0 + movsd 0 * SIZE(C1, LDC), %xmm1 + movhps 2 * SIZE(C1, LDC), %xmm1 + + pshufd $0x50, %xmm4, %xmm2 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(C1) + movhps %xmm0, 2 * SIZE(C1) + movlps %xmm1, 0 * SIZE(C1, LDC) + movhps %xmm1, 2 * SIZE(C1, LDC) + + addl $4 * SIZE, C1 + ALIGN_4 + +.L60: + movl M, I + testl $1, I + jle .L69 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + addl %eax, AA + leal (BB, %eax, 2), BB +#endif + + pxor %xmm4, %xmm4 + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movsd -32 * SIZE(BB), %xmm1 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L65 + ALIGN_4 + +.L62: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movsd -30 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm5 + movsd -28 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movsd -26 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -28 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm5 + movsd -24 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movsd -22 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -26 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm5 + movsd -20 * SIZE(BB), %xmm1 + + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movsd -18 * SIZE(BB), %xmm1 + + pshufd $0x55, %xmm0, %xmm2 + movsd -24 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm5 + movsd -16 * SIZE(BB), %xmm1 + + subl $ -8 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L62 + ALIGN_4 + +.L65: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L68 + ALIGN_4 + +.L66: + pshufd $0x00, %xmm0, %xmm2 + movss -31 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm4 + movsd -30 * SIZE(BB), %xmm1 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L66 + ALIGN_4 + +.L68: + movddup ALPHA, %xmm3 + + addps %xmm5, %xmm4 + + movsd (C1), %xmm0 + movhps (C1, LDC), %xmm0 + + pshufd $0x50, %xmm4, %xmm2 + mulps %xmm3, %xmm2 + addps %xmm2, %xmm0 + + movlps %xmm0, (C1) + movhps %xmm0, (C1, LDC) + ALIGN_4 + +.L69: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + movl BB, B + + leal (, LDC, 2), %eax + addl %eax, C + ALIGN_4 + +.L70: + movl N, %eax + testl $1, %eax + jle .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl C, C1 + movl A, AA + + movl M, I + sarl $2, I + jle .L80 + ALIGN_4 + +.L71: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, 
SIZE), %eax + leal (AA, %eax, 4), AA + addl %eax, BB +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movsd -32 * SIZE(BB), %xmm1 + + pxor %xmm4, %xmm4 + prefetcht0 3 * SIZE(C1) + pxor %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm5 + pshufd $0x55, %xmm1, %xmm2 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -24 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm5 + pshufd $0x55, %xmm1, %xmm2 + movsd -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -16 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm5 + pshufd $0x55, %xmm1, %xmm2 + movsd -26 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm5 + pshufd $0x55, %xmm1, %xmm2 + movsd -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + + subl $-32 * SIZE, AA + subl $ -8 * SIZE, BB + + subl $1, %eax + jne .L72 + ALIGN_4 + +.L75: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L78 + ALIGN_4 + +.L76: + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + movss -31 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: + movddup ALPHA, %xmm3 + + addps %xmm2, %xmm4 + addps %xmm5, %xmm4 + + movsd 0 * SIZE(C1), %xmm0 + movhps 2 * SIZE(C1), %xmm0 + movsd 4 * SIZE(C1), %xmm1 + movhps 6 * SIZE(C1), %xmm1 + + pshufd $0x50, %xmm4, %xmm2 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(C1) + movhps %xmm0, 2 * SIZE(C1) + movlps %xmm1, 4 * SIZE(C1) + movhps %xmm1, 6 * SIZE(C1) + + addl $8 * SIZE, C1 + decl I + jg .L71 + ALIGN_4 + +.L80: + movl M, I + testl $2, I + jle .L90 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + addl %eax, BB +#endif + + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm3, %xmm3 + movsd -32 * SIZE(BB), %xmm1 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L85 + ALIGN_4 + +.L82: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + + pshufd $0x55, %xmm1, %xmm2 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movsd -28 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + 
+ pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movsd -26 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + + pshufd $0x55, %xmm1, %xmm2 + movsd -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movsd -24 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movsd -22 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + + pshufd $0x55, %xmm1, %xmm2 + movsd -26 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movsd -20 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + movsd -18 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + + pshufd $0x55, %xmm1, %xmm2 + movsd -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movsd -16 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + + subl $-16 * SIZE, AA + subl $ -8 * SIZE, BB + + subl $1, %eax + jne .L82 + ALIGN_4 + +.L85: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L88 + ALIGN_4 + +.L86: + pshufd $0x00, %xmm1, %xmm2 + movss -31 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movsd -30 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + + addl $2 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L86 + ALIGN_4 + +.L88: + movddup ALPHA, %xmm3 + + addps %xmm5, %xmm4 + + movsd 0 * SIZE(C1), %xmm0 + movhps 2 * SIZE(C1), %xmm0 + + pshufd $0x50, %xmm4, %xmm2 + mulps %xmm3, %xmm2 + addps %xmm2, %xmm0 + + movlps %xmm0, 0 * SIZE(C1) + movhps %xmm0, 2 * SIZE(C1) + + addl $4 * SIZE, C1 + ALIGN_4 + +.L90: + movl M, I + testl $1, I + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + addl %eax, AA + addl %eax, BB +#endif + + pxor %xmm4, %xmm4 + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movsd -32 * SIZE(BB), %xmm1 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L95 + ALIGN_4 + +.L92: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + mulps %xmm0, %xmm1 + movsd -30 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movsd -30 * SIZE(BB), %xmm1 + + mulps %xmm0, %xmm1 + movsd -28 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movsd -28 * SIZE(BB), %xmm1 + + mulps %xmm0, %xmm1 + movsd -26 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movsd -26 * SIZE(BB), %xmm1 + + mulps %xmm0, %xmm1 + movsd -24 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movsd -24 * SIZE(BB), %xmm1 + + subl $-8 * SIZE, AA + subl $-8 * SIZE, BB + + subl $1, %eax + jne .L92 + ALIGN_4 + +.L95: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L98 + ALIGN_4 + +.L96: + mulss %xmm0, %xmm1 + movss -31 * SIZE(AA), %xmm0 + addss %xmm1, %xmm4 + movss -31 * SIZE(BB), %xmm1 + + addl $1 * SIZE, AA + addl $1 * SIZE, BB + decl %eax + jg .L96 + ALIGN_4 + +.L98: + movddup ALPHA, %xmm3 + + haddps %xmm4, %xmm4 + + movsd 0 * SIZE(C1), %xmm0 + + pshufd $0x50, %xmm4, %xmm2 + mulps %xmm3, %xmm2 + addps %xmm2, %xmm0 + + movlps %xmm0, 0 * SIZE(C1) + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm3m_kernel_4x4_prescott.S b/kernel/x86/zgemm3m_kernel_4x4_prescott.S new file mode 100644 index 0000000000..3d602e3e42 --- /dev/null +++ b/kernel/x86/zgemm3m_kernel_4x4_prescott.S @@ -0,0 +1,2060 @@ 
+/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 + +#define OLD_M 4 + STACK(%esi) +#define OLD_N 8 + STACK(%esi) +#define OLD_K 12 + STACK(%esi) +#define OLD_ALPHA_R 16 + STACK(%esi) +#define OLD_ALPHA_I 20 + STACK(%esi) +#define OLD_A 24 + STACK(%esi) +#define OLD_B 28 + STACK(%esi) +#define OLD_C 32 + STACK(%esi) +#define OLD_LDC 36 + STACK(%esi) + +#define ALPHA 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define BUFFER 128(%esp) + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht0 +#define PREFETCHSIZE 96 +#endif + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCHSIZE 96 +#endif + +#ifdef PENTIUMM +#define PREFETCH prefetcht0 +#define PREFETCHSIZE 96 +#endif + +#define AA %edx +#define BB %ecx +#define LDC %ebp + +#define KERNEL1(address) \ + mulps %xmm0, %xmm2; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * SIZE(AA); \ + addps %xmm2, %xmm4; \ + movshdup 0 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movsldup 4 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm6; \ + movshdup 4 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + movaps 4 * SIZE + 1 * (address) * SIZE(AA), %xmm0; \ + addps %xmm2, %xmm7; \ + movsldup 8 * SIZE + 2 * (address) * SIZE(BB), %xmm2 + +#define KERNEL2(address) \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm4; \ + movshdup 8 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movsldup 12 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm6; \ + movshdup 12 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + movaps 8 * SIZE + 1 * (address) * SIZE(AA), %xmm0; \ + addps %xmm2, %xmm7; \ + movsldup 32 * SIZE + 2 * (address) * SIZE(BB), %xmm2 + +#define KERNEL3(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movshdup 16 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movsldup 20 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm6; \ + movshdup 20 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + movaps 12 * SIZE + 1 * (address) * SIZE(AA), %xmm0; \ + addps %xmm3, %xmm7; \ + movsldup 24 * SIZE + 2 * (address) * SIZE(BB), %xmm3 + +#define KERNEL4(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movshdup 24 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movsldup 28 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm6; \ + movshdup 28 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + movaps 32 * SIZE + 1 * (address) * SIZE(AA), %xmm0; \ + addps %xmm3, %xmm7; \ + movsldup 48 * SIZE + 2 * (address) * SIZE(BB), %xmm3 + +#define KERNEL5(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movshdup 32 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movsldup 36 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm6; \ + movshdup 36 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + movaps 20 * SIZE + 1 * (address) * SIZE(AA), %xmm1; \ + addps 
%xmm2, %xmm7 + +#define KERNEL6(address) \ + movsldup 40 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movshdup 40 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movsldup 44 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm6; \ + movshdup 44 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + movaps 24 * SIZE + 1 * (address) * SIZE(AA), %xmm1; \ + addps %xmm2, %xmm7; \ + movsldup 64 * SIZE + 2 * (address) * SIZE(BB), %xmm2 + +#define KERNEL7(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movshdup 48 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movsldup 52 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm6; \ + movshdup 52 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + movaps 28 * SIZE + 1 * (address) * SIZE(AA), %xmm1; \ + addps %xmm3, %xmm7; \ + movsldup 56 * SIZE + 2 * (address) * SIZE(BB), %xmm3 + +#define KERNEL8(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movshdup 56 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movsldup 60 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm6; \ + movshdup 60 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + movaps 48 * SIZE + 1 * (address) * SIZE(AA), %xmm1; \ + addps %xmm3, %xmm7; \ + movsldup 80 * SIZE + 2 * (address) * SIZE(BB), %xmm3 + + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl %esp, %esi # save old stack + subl $128 + LOCAL_BUFFER_SIZE, %esp + movl OLD_M, %ebx + andl $-1024, %esp # align stack + + STACK_TOUCHING + + movl OLD_N, %eax + movl OLD_K, %ecx + movl OLD_A, %edx + movss OLD_ALPHA_R, %xmm0 + movss OLD_ALPHA_I, %xmm1 + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK + + movl OLD_B, %edi + movl OLD_C, %ebx + + unpcklps %xmm1, %xmm0 + movlhps %xmm0, %xmm0 + + movaps %xmm0, ALPHA + + movl %ebx, C + movl OLD_LDC, LDC +#ifdef TRMMKERNEL + movss %xmm4, OFFSET + movss %xmm4, KK +#ifndef LEFT + negl KK +#endif +#endif + + sall $ZBASE_SHIFT, LDC + + sarl $2, %eax + movl %eax, J + jle .L40 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + +/* Copying to Sub Buffer */ + leal BUFFER, %ecx + + movl K, %eax + sarl $2, %eax + jle .L05 + ALIGN_4 + +.L02: + movddup 0 * SIZE(%edi), %xmm0 + movddup 2 * SIZE(%edi), %xmm1 + movddup 4 * SIZE(%edi), %xmm2 + movddup 6 * SIZE(%edi), %xmm3 + movddup 8 * SIZE(%edi), %xmm4 + movddup 10 * SIZE(%edi), %xmm5 + movddup 12 * SIZE(%edi), %xmm6 + movddup 14 * SIZE(%edi), %xmm7 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + movaps %xmm2, 8 * SIZE(%ecx) + movaps %xmm3, 12 * SIZE(%ecx) + movaps %xmm4, 16 * SIZE(%ecx) + movaps %xmm5, 20 * SIZE(%ecx) + movaps %xmm6, 24 * SIZE(%ecx) + movaps %xmm7, 28 * SIZE(%ecx) + +# prefetcht1 128 * SIZE(%ecx) + prefetcht0 112 * SIZE(%edi) + + addl $16 * SIZE, %edi + addl $32 * SIZE, %ecx + decl %eax + jne .L02 + ALIGN_2 + +.L05: + movl K, %eax + andl $3, %eax + BRANCH + jle .L10 + ALIGN_2 + +.L06: + movddup 0 * SIZE(%edi), %xmm0 + movddup 2 * SIZE(%edi), %xmm1 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + + addl $4 * SIZE, %edi + addl $8 * SIZE, %ecx + decl %eax + jne .L06 + ALIGN_4 + +.L10: + movl C, %esi # coffset = c + movl A, %edx # 
aoffset = a + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + + movaps 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps 16 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movsldup 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movsldup 16 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + + leal (LDC, LDC, 2), %eax + + prefetchnta 4 * SIZE(%esi) + prefetchnta 4 * SIZE(%esi, LDC) + prefetchnta 4 * SIZE(%esi, LDC, 2) + prefetchnta 4 * SIZE(%esi, %eax) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + +#if 1 + andl $-8, %eax + sall $4, %eax + je .L15 +.L1X: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + cmpl $128 * 1, %eax + jle .L12 + KERNEL1(32 * 1) + KERNEL2(32 * 1) + KERNEL3(32 * 1) + KERNEL4(32 * 1) + KERNEL5(32 * 1) + KERNEL6(32 * 1) + KERNEL7(32 * 1) + KERNEL8(32 * 1) + cmpl $128 * 2, %eax + jle .L12 + KERNEL1(32 * 2) + KERNEL2(32 * 2) + KERNEL3(32 * 2) + KERNEL4(32 * 2) + KERNEL5(32 * 2) + KERNEL6(32 * 2) + KERNEL7(32 * 2) + KERNEL8(32 * 2) + cmpl $128 * 3, %eax + jle .L12 + KERNEL1(32 * 3) + KERNEL2(32 * 3) + KERNEL3(32 * 3) + KERNEL4(32 * 3) + KERNEL5(32 * 3) + KERNEL6(32 * 3) + KERNEL7(32 * 3) + KERNEL8(32 * 3) + cmpl $128 * 4, %eax + jle .L12 + KERNEL1(32 * 4) + KERNEL2(32 * 4) + KERNEL3(32 * 4) + KERNEL4(32 * 4) + KERNEL5(32 * 4) + KERNEL6(32 * 4) + KERNEL7(32 * 4) + KERNEL8(32 * 4) + cmpl $128 * 5, %eax + jle .L12 + KERNEL1(32 * 5) + KERNEL2(32 * 5) + KERNEL3(32 * 5) + KERNEL4(32 * 5) + KERNEL5(32 * 5) + KERNEL6(32 * 5) + KERNEL7(32 * 5) + KERNEL8(32 * 5) + cmpl $128 * 6, %eax + jle .L12 + KERNEL1(32 * 6) + KERNEL2(32 * 6) + KERNEL3(32 * 6) + KERNEL4(32 * 6) + KERNEL5(32 * 6) + KERNEL6(32 * 6) + KERNEL7(32 * 6) + KERNEL8(32 * 6) + cmpl $128 * 7, %eax + jle .L12 + KERNEL1(32 * 7) + KERNEL2(32 * 7) + KERNEL3(32 * 7) + KERNEL4(32 * 7) + KERNEL5(32 * 7) + KERNEL6(32 * 7) + KERNEL7(32 * 7) + KERNEL8(32 * 7) +#if 1 + cmpl $128 * 8, %eax + jle .L12 + KERNEL1(32 * 8) + KERNEL2(32 * 8) + KERNEL3(32 * 8) + KERNEL4(32 * 8) + KERNEL5(32 * 8) + KERNEL6(32 * 8) + KERNEL7(32 * 8) + KERNEL8(32 * 8) + cmpl $128 * 9, %eax + jle .L12 + KERNEL1(32 * 9) + KERNEL2(32 * 9) + KERNEL3(32 * 9) + KERNEL4(32 * 9) + KERNEL5(32 * 9) + KERNEL6(32 * 9) + KERNEL7(32 * 9) + KERNEL8(32 * 9) + cmpl $128 * 10, %eax + jle .L12 + KERNEL1(32 * 10) + KERNEL2(32 * 10) + KERNEL3(32 * 10) + KERNEL4(32 * 10) + KERNEL5(32 * 10) + KERNEL6(32 * 10) + KERNEL7(32 * 10) + KERNEL8(32 * 10) + cmpl $128 * 11, %eax + jle .L12 + KERNEL1(32 * 11) + KERNEL2(32 * 11) + KERNEL3(32 * 11) + KERNEL4(32 * 11) + KERNEL5(32 * 11) + KERNEL6(32 * 11) + KERNEL7(32 * 11) + KERNEL8(32 * 11) + cmpl $128 * 12, %eax + jle .L12 + KERNEL1(32 * 12) + KERNEL2(32 * 12) + KERNEL3(32 * 12) + KERNEL4(32 * 12) + KERNEL5(32 * 12) + KERNEL6(32 * 12) + KERNEL7(32 * 12) + KERNEL8(32 * 12) + cmpl $128 * 13, %eax + jle .L12 + KERNEL1(32 * 13) + KERNEL2(32 * 13) + KERNEL3(32 * 13) + KERNEL4(32 * 13) + 
KERNEL5(32 * 13) + KERNEL6(32 * 13) + KERNEL7(32 * 13) + KERNEL8(32 * 13) + cmpl $128 * 14, %eax + jle .L12 + KERNEL1(32 * 14) + KERNEL2(32 * 14) + KERNEL3(32 * 14) + KERNEL4(32 * 14) + KERNEL5(32 * 14) + KERNEL6(32 * 14) + KERNEL7(32 * 14) + KERNEL8(32 * 14) + cmpl $128 * 15, %eax + jle .L12 + KERNEL1(32 * 15) + KERNEL2(32 * 15) + KERNEL3(32 * 15) + KERNEL4(32 * 15) + KERNEL5(32 * 15) + KERNEL6(32 * 15) + KERNEL7(32 * 15) + KERNEL8(32 * 15) +#else + addl $128 * 4 * SIZE, BB + addl $128 * 2 * SIZE, AA + subl $128 * 8, %eax + jg .L1X + jmp .L15 +#endif + +.L12: + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB + ALIGN_4 +#else + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + KERNEL1(32 * 7) + KERNEL2(32 * 7) + KERNEL3(32 * 7) + KERNEL4(32 * 7) + KERNEL5(32 * 7) + KERNEL6(32 * 7) + KERNEL7(32 * 7) + KERNEL8(32 * 7) + + addl $32 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L12 + ALIGN_4 +#endif + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_4 + +.L16: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movshdup 0 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movsldup 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movshdup 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movsldup 8 * SIZE(BB), %xmm2 + + addl $4 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + leal (LDC, LDC, 2), %eax + + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + movsd 4 * SIZE(%esi), %xmm1 + movhps 6 * SIZE(%esi), %xmm1 + + pshufd $0x50, %xmm4, %xmm2 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + movlps %xmm1, 4 * SIZE(%esi) + movhps %xmm1, 6 * SIZE(%esi) + + movsd 0 * SIZE(%esi, LDC), %xmm0 + movhps 2 * SIZE(%esi, LDC), %xmm0 + movsd 4 * SIZE(%esi, LDC), %xmm1 + movhps 6 * SIZE(%esi, LDC), %xmm1 + + pshufd $0x50, %xmm5, %xmm2 + pshufd $0xfa, %xmm5, %xmm5 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm5 + + addps %xmm2, %xmm0 + addps %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC) + movhps %xmm0, 2 * SIZE(%esi, LDC) + movlps %xmm1, 4 * SIZE(%esi, LDC) + movhps %xmm1, 6 * SIZE(%esi, LDC) + + movsd 0 * SIZE(%esi, LDC, 2), %xmm0 + movhps 2 * SIZE(%esi, LDC, 2), %xmm0 + movsd 4 * SIZE(%esi, LDC, 2), %xmm1 + movhps 6 * SIZE(%esi, LDC, 2), %xmm1 + + pshufd $0x50, %xmm6, %xmm2 + pshufd $0xfa, %xmm6, %xmm6 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm6 + + addps %xmm2, %xmm0 + addps %xmm6, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC, 2) + movhps %xmm0, 2 * SIZE(%esi, LDC, 2) + movlps %xmm1, 4 * SIZE(%esi, LDC, 2) + movhps %xmm1, 6 * SIZE(%esi, LDC, 2) + + movsd 0 * SIZE(%esi, %eax), %xmm0 + movhps 2 * SIZE(%esi, %eax), %xmm0 + movsd 4 * SIZE(%esi, %eax), %xmm1 + movhps 6 * SIZE(%esi, %eax), %xmm1 + + pshufd $0x50, %xmm7, %xmm2 + pshufd $0xfa, %xmm7, %xmm7 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm7 + + addps %xmm2, %xmm0 + addps %xmm7, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, %eax) + movhps %xmm0, 2 * SIZE(%esi, %eax) + movlps %xmm1, 4 * SIZE(%esi, %eax) + movhps %xmm1, 6 * SIZE(%esi, %eax) + + addl $8 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L11 + ALIGN_4 + +.L20: + testl $2, M + je .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 
= boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + + movddup 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movddup 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movsd 0 * SIZE(BB), %xmm2 + movsd 16 * SIZE(BB), %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + addps %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + movddup 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 8 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movsd 12 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + movddup 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 32 * SIZE(BB), %xmm2 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movsd 20 * SIZE(BB), %xmm3 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm0, %xmm3 + movddup 6 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movsd 24 * SIZE(BB), %xmm3 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movsd 28 * SIZE(BB), %xmm3 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm0, %xmm3 + movddup 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movsd 48 * SIZE(BB), %xmm3 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movsd 36 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm1, %xmm2 + movddup 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movsd 40 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movsd 44 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm1, %xmm2 + movddup 12 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movsd 64 * SIZE(BB), %xmm2 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movsd 52 * SIZE(BB), %xmm3 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm1, %xmm3 + movddup 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsd 56 * SIZE(BB), %xmm3 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movsd 60 * SIZE(BB), %xmm3 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm1, %xmm3 + movddup 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsd 80 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + movddup 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 8 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: + leal (LDC, LDC, 2), %eax + + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + movsd 0 * SIZE(%esi, LDC), %xmm1 + movhps 2 * SIZE(%esi, LDC), %xmm1 + + pshufd $0x50, %xmm4, %xmm2 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + movlps %xmm1, 0 * SIZE(%esi, LDC) + movhps %xmm1, 2 * SIZE(%esi, LDC) + + movsd 0 * SIZE(%esi, LDC, 2), 
%xmm0 + movhps 2 * SIZE(%esi, LDC, 2), %xmm0 + movsd 0 * SIZE(%esi, %eax), %xmm1 + movhps 2 * SIZE(%esi, %eax), %xmm1 + + pshufd $0x50, %xmm5, %xmm2 + pshufd $0xfa, %xmm5, %xmm5 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm5 + + addps %xmm2, %xmm0 + addps %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC, 2) + movhps %xmm0, 2 * SIZE(%esi, LDC, 2) + movlps %xmm1, 0 * SIZE(%esi, %eax) + movhps %xmm1, 2 * SIZE(%esi, %eax) + + addl $4 * SIZE, %esi # coffset += 2 + ALIGN_4 + +.L30: + testl $1, M + je .L39 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB +#endif + + movss 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movss 4 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movsd 0 * SIZE(BB), %xmm2 + movsd 16 * SIZE(BB), %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $4, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L35 + ALIGN_4 + +.L32: + shufps $0, %xmm0, %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + movhps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movss 1 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movsd 8 * SIZE(BB), %xmm2 + shufps $0, %xmm0, %xmm0 + movhps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movss 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movhps 20 * SIZE(BB), %xmm3 + shufps $0, %xmm0, %xmm0 + movsd 32 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + movss 3 * SIZE(AA), %xmm0 + addps %xmm3, %xmm4 + movsd 24 * SIZE(BB), %xmm3 + shufps $0, %xmm0, %xmm0 + movhps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movss 8 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movsd 48 * SIZE(BB), %xmm3 + shufps $0, %xmm1, %xmm1 + movhps 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movss 5 * SIZE(AA), %xmm1 + addps %xmm2, %xmm4 + movsd 40 * SIZE(BB), %xmm2 + shufps $0, %xmm1, %xmm1 + movhps 44 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movss 6 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movsd 64 * SIZE(BB), %xmm2 + shufps $0, %xmm1, %xmm1 + movhps 52 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movss 7 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movsd 56 * SIZE(BB), %xmm3 + shufps $0, %xmm1, %xmm1 + movhps 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movss 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsd 80 * SIZE(BB), %xmm3 + + addl $ 8 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + shufps $0, %xmm0, %xmm0 + movhps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movss 1 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movsd 8 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L36 + ALIGN_4 + +.L38: + leal (LDC, LDC, 2), %eax + + addps %xmm5, %xmm4 + + movsd (%esi), %xmm0 + movhps (%esi, LDC), %xmm0 + movsd (%esi, LDC, 2), %xmm1 + movhps (%esi, %eax), %xmm1 + + pshufd $0x50, %xmm4, %xmm2 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm4, %xmm1 + + movlps %xmm0, (%esi) + movhps %xmm0, (%esi, LDC) + movlps %xmm1, (%esi, LDC, 2) + movhps %xmm1, (%esi, %eax) + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && 
!defined(LEFT) + addl $4, KK +#endif + + leal (, LDC, 4), %eax + addl %eax, C # c += 4 * ldc + decl J # j -- + jg .L01 + ALIGN_4 + +.L40: + testl $2, N + je .L80 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + leal BUFFER, %ecx + sarl $3, %eax + jle .L45 + ALIGN_4 + +.L42: + movddup 0 * SIZE(%edi), %xmm0 + movddup 2 * SIZE(%edi), %xmm1 + movddup 4 * SIZE(%edi), %xmm2 + movddup 6 * SIZE(%edi), %xmm3 + movddup 8 * SIZE(%edi), %xmm4 + movddup 10 * SIZE(%edi), %xmm5 + movddup 12 * SIZE(%edi), %xmm6 + movddup 14 * SIZE(%edi), %xmm7 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + movaps %xmm2, 8 * SIZE(%ecx) + movaps %xmm3, 12 * SIZE(%ecx) + movaps %xmm4, 16 * SIZE(%ecx) + movaps %xmm5, 20 * SIZE(%ecx) + movaps %xmm6, 24 * SIZE(%ecx) + movaps %xmm7, 28 * SIZE(%ecx) + +# prefetcht1 128 * SIZE(%ecx) + prefetcht0 112 * SIZE(%edi) + + addl $16 * SIZE, %edi + addl $32 * SIZE, %ecx + decl %eax + jne .L42 + ALIGN_4 + +.L45: + movl K, %eax + andl $7, %eax + BRANCH + jle .L50 + ALIGN_4 + +.L46: + movddup 0 * SIZE(%edi), %xmm0 + movaps %xmm0, 0 * SIZE(%ecx) + + addl $2 * SIZE, %edi + addl $4 * SIZE, %ecx + decl %eax + jne .L46 + ALIGN_4 + +.L50: + movl C, %esi # coffset = c + movl A, %edx # aoffset = a + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + + movaps 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps 16 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movsldup 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movsldup 16 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + + prefetcht2 4 * SIZE(%esi) + prefetcht2 4 * SIZE(%esi, LDC) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L55 + ALIGN_4 + +.L52: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + movshdup 0 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsldup 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movshdup 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 8 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsldup 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movshdup 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 12 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsldup 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movshdup 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 32 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsldup 32 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movshdup 16 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 20 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsldup 20 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movshdup 20 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsldup 24 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movshdup 24 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 28 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsldup 28 * 
SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movshdup 28 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 48 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsldup 48 * SIZE(BB), %xmm3 + + addl $32 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L52 + ALIGN_4 + +.L55: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movshdup 0 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsldup 4 * SIZE(BB), %xmm2 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L56 + ALIGN_4 + +.L58: + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + movsd 4 * SIZE(%esi), %xmm1 + movhps 6 * SIZE(%esi), %xmm1 + + pshufd $0x50, %xmm4, %xmm2 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + movlps %xmm1, 4 * SIZE(%esi) + movhps %xmm1, 6 * SIZE(%esi) + + movsd 0 * SIZE(%esi, LDC), %xmm0 + movhps 2 * SIZE(%esi, LDC), %xmm0 + movsd 4 * SIZE(%esi, LDC), %xmm1 + movhps 6 * SIZE(%esi, LDC), %xmm1 + + pshufd $0x50, %xmm5, %xmm2 + pshufd $0xfa, %xmm5, %xmm5 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm5 + + addps %xmm2, %xmm0 + addps %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC) + movhps %xmm0, 2 * SIZE(%esi, LDC) + movlps %xmm1, 4 * SIZE(%esi, LDC) + movhps %xmm1, 6 * SIZE(%esi, LDC) + + addl $8 * SIZE, %esi # coffset += 2 + decl %ebx # i -- + jg .L51 + ALIGN_4 + +.L60: + testl $2, M + je .L70 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + + movddup 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movddup 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movsd 0 * SIZE(BB), %xmm2 + movsd 16 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L65 + ALIGN_4 + +.L62: + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + movddup 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + movddup 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 8 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + movddup 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movsd 12 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + movddup 16 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 32 * SIZE(BB), %xmm2 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm1, %xmm3 + movddup 10 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movsd 20 * SIZE(BB), %xmm3 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm1, %xmm3 + movddup 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsd 24 * SIZE(BB), %xmm3 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm1, %xmm3 + movddup 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movsd 28 * SIZE(BB), %xmm3 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm1, %xmm3 + movddup 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + 
movsd 48 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L62 + ALIGN_4 + +.L65: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + movddup 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L66 + ALIGN_4 + +.L68: + addps %xmm5, %xmm4 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + movsd 0 * SIZE(%esi, LDC), %xmm1 + movhps 2 * SIZE(%esi, LDC), %xmm1 + + pshufd $0x50, %xmm4, %xmm2 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + movlps %xmm1, 0 * SIZE(%esi, LDC) + movhps %xmm1, 2 * SIZE(%esi, LDC) + + addl $4 * SIZE, %esi + ALIGN_4 + +.L70: + testl $1, M + je .L79 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + + movss 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movss 4 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movsd 0 * SIZE(BB), %xmm2 + movsd 16 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + shufps $0, %xmm0, %xmm0 + mulps %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + movss 1 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + shufps $0, %xmm0, %xmm0 + movsd 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movss 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + shufps $0, %xmm0, %xmm0 + movsd 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movss 3 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + shufps $0, %xmm0, %xmm0 + movsd 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movss 8 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 32 * SIZE(BB), %xmm2 + shufps $0, %xmm1, %xmm1 + mulps %xmm1, %xmm3 + movss 5 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + shufps $0, %xmm1, %xmm1 + movsd 20 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movss 6 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + shufps $0, %xmm1, %xmm1 + movsd 24 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movss 7 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + shufps $0, %xmm1, %xmm1 + movsd 28 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movss 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsd 48 * SIZE(BB), %xmm3 + + addl $ 8 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L72 + ALIGN_4 + +.L75: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + shufps $0, %xmm0, %xmm0 + mulps %xmm0, %xmm2 + movss 1 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + + addl $ 1 * SIZE, AA + addl $ 4 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: + addps %xmm5, %xmm4 + + movsd (%esi), %xmm0 + movhps (%esi, LDC), %xmm0 + + pshufd $0x50, %xmm4, %xmm2 + mulps %xmm3, %xmm2 + addps %xmm2, %xmm0 + + movlps %xmm0, (%esi) + movhps %xmm0, (%esi, LDC) + ALIGN_4 + +.L79: 
+#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + leal (, LDC, 2), %eax + addl %eax, C + ALIGN_4 + +.L80: + testl $1, N + je .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + leal BUFFER, %ecx + sarl $3, %eax + jle .L85 + ALIGN_4 + +.L82: + movss 0 * SIZE(%edi), %xmm0 + movss 1 * SIZE(%edi), %xmm1 + movss 2 * SIZE(%edi), %xmm2 + movss 3 * SIZE(%edi), %xmm3 + movss 4 * SIZE(%edi), %xmm4 + movss 5 * SIZE(%edi), %xmm5 + movss 6 * SIZE(%edi), %xmm6 + movss 7 * SIZE(%edi), %xmm7 + + movss %xmm0, 0 * SIZE(%ecx) + movss %xmm0, 1 * SIZE(%ecx) + movss %xmm1, 2 * SIZE(%ecx) + movss %xmm1, 3 * SIZE(%ecx) + movss %xmm2, 4 * SIZE(%ecx) + movss %xmm2, 5 * SIZE(%ecx) + movss %xmm3, 6 * SIZE(%ecx) + movss %xmm3, 7 * SIZE(%ecx) + movss %xmm4, 8 * SIZE(%ecx) + movss %xmm4, 9 * SIZE(%ecx) + movss %xmm5, 10 * SIZE(%ecx) + movss %xmm5, 11 * SIZE(%ecx) + movss %xmm6, 12 * SIZE(%ecx) + movss %xmm6, 13 * SIZE(%ecx) + movss %xmm7, 14 * SIZE(%ecx) + movss %xmm7, 15 * SIZE(%ecx) + +# prefetcht1 128 * SIZE(%ecx) + prefetcht0 112 * SIZE(%edi) + + addl $ 8 * SIZE, %edi + addl $16 * SIZE, %ecx + decl %eax + jne .L82 + ALIGN_4 + +.L85: + movl K, %eax + andl $7, %eax + BRANCH + jle .L90 + ALIGN_4 + +.L86: + movss 0 * SIZE(%edi), %xmm0 + movss %xmm0, 0 * SIZE(%ecx) + movss %xmm0, 1 * SIZE(%ecx) + + addl $1 * SIZE, %edi + addl $2 * SIZE, %ecx + decl %eax + jne .L86 + ALIGN_4 + +.L90: + movl C, %esi # coffset = c + movl A, %edx # aoffset = a + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L100 + ALIGN_4 + +.L91: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 1), BB +#endif + + movaps 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movddup 0 * SIZE(BB), %xmm2 + pxor %xmm5, %xmm5 + movaps 16 * SIZE(AA), %xmm1 + movddup 8 * SIZE(BB), %xmm3 + +#ifdef HAVE_3DNOW + prefetchw 4 * SIZE(%esi) +#elif defined(HAVE_SSE) || defined(HAVE_SSE2) + prefetcht2 4 * SIZE(%esi) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L95 + ALIGN_4 + +.L92: + mulps %xmm0, %xmm2 + movaps 4 * SIZE(AA), %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + addps %xmm2, %xmm4 + movddup 2 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 8 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movddup 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 12 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movddup 6 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 32 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movddup 16 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + movaps 20 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movddup 10 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movddup 12 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 28 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movddup 14 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 48 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movddup 24 * SIZE(BB), %xmm3 + + addl $32 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L92 + ALIGN_4 + +.L95: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax 
+#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L98 + ALIGN_4 + +.L96: + mulps %xmm0, %xmm2 + movaps 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movddup 2 * SIZE(BB), %xmm2 + + addl $4 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L96 + ALIGN_4 + +.L98: + addps %xmm5, %xmm4 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + movsd 4 * SIZE(%esi), %xmm1 + movhps 6 * SIZE(%esi), %xmm1 + + pshufd $0x50, %xmm4, %xmm2 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + movlps %xmm1, 4 * SIZE(%esi) + movhps %xmm1, 6 * SIZE(%esi) + + addl $8 * SIZE, %esi + decl %ebx # i -- + jg .L91 + ALIGN_4 + +.L100: + testl $2, M + je .L110 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 1), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movsd 0 * SIZE(AA), %xmm0 + movsd 0 * SIZE(BB), %xmm2 + movsd 8 * SIZE(AA), %xmm1 + movsd 8 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L105 + ALIGN_4 + +.L102: + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + addps %xmm2, %xmm4 + movsd 2 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movsd 6 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 16 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movsd 10 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsd 12 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movsd 14 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsd 24 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L102 + ALIGN_4 + +.L105: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L108 + ALIGN_4 + +.L106: + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movsd 2 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L106 + ALIGN_4 + +.L108: + addps %xmm5, %xmm4 + movhlps %xmm4, %xmm5 + addps %xmm5, %xmm4 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + + pshufd $0x50, %xmm4, %xmm2 + mulps %xmm3, %xmm2 + addps %xmm2, %xmm0 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + + addl $4 * SIZE, %esi # coffset += 2 + ALIGN_4 + +.L110: + testl $1, M + je .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = 
boffset + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + + movss 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movss 0 * SIZE(BB), %xmm2 + pxor %xmm5, %xmm5 + movss 4 * SIZE(AA), %xmm1 + movss 8 * SIZE(BB), %xmm3 + + leal (LDC, LDC, 2), %eax + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L115 + ALIGN_4 + +.L112: + mulss %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + movss 1 * SIZE(AA), %xmm0 + addss %xmm2, %xmm4 + movss 2 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + movss 2 * SIZE(AA), %xmm0 + addss %xmm2, %xmm5 + movss 4 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + movss 3 * SIZE(AA), %xmm0 + addss %xmm2, %xmm4 + movss 6 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + movss 8 * SIZE(AA), %xmm0 + addss %xmm2, %xmm5 + movss 16 * SIZE(BB), %xmm2 + mulss %xmm1, %xmm3 + movss 5 * SIZE(AA), %xmm1 + addss %xmm3, %xmm4 + movss 10 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + movss 6 * SIZE(AA), %xmm1 + addss %xmm3, %xmm5 + movss 12 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + movss 7 * SIZE(AA), %xmm1 + addss %xmm3, %xmm4 + movss 14 * SIZE(BB), %xmm3 + mulss %xmm1, %xmm3 + movss 12 * SIZE(AA), %xmm1 + addss %xmm3, %xmm5 + movss 24 * SIZE(BB), %xmm3 + + addl $ 8 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne .L112 + ALIGN_4 + +.L115: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulss %xmm0, %xmm2 + movss 1 * SIZE(AA), %xmm0 + addss %xmm2, %xmm4 + movss 2 * SIZE(BB), %xmm2 + + addl $1 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L116 + ALIGN_4 + +.L118: + addss %xmm5, %xmm4 + + movsd (%esi), %xmm0 + + pshufd $0x50, %xmm4, %xmm2 + mulps %xmm3, %xmm2 + addps %xmm2, %xmm0 + + movlps %xmm0, (%esi) + ALIGN_4 + +.L999: + movl OLD_STACK, %esp + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm3m_kernel_8x2_core2.S b/kernel/x86/zgemm3m_kernel_8x2_core2.S new file mode 100644 index 0000000000..9a28c8ec38 --- /dev/null +++ b/kernel/x86/zgemm3m_kernel_8x2_core2.S @@ -0,0 +1,1628 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_ALPHA_R 16 + STACK + ARGS(%esi) +#define STACK_ALPHA_I 20 + STACK + ARGS(%esi) +#define STACK_A 24 + STACK + ARGS(%esi) +#define STACK_B 28 + STACK + ARGS(%esi) +#define STACK_C 32 + STACK + ARGS(%esi) +#define STACK_LDC 36 + STACK + ARGS(%esi) + +#define ALPHA 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define BUFFER 512(%esp) + +#define PREFETCH_R (8 * 16 + 0) +#define PREFETCH_W (PREFETCH_R * 2) + +#define PREFETCHSIZE (8 * 16 + 4) +#define PREFETCH prefetcht0 + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi +#define C1 %esi +#define I %ebx + +#ifdef OPTERON +#define MOVSD movlps +#else +#define MOVSD movsd +#endif + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl %esp, %esi # save old stack + + subl $512 + LOCAL_BUFFER_SIZE, %esp + andl $-4096, %esp # align stack + + STACK_TOUCHING + + movl STACK_M, %ebx + movl STACK_N, %eax + movl STACK_K, %ecx + movl STACK_A, %edx + movss STACK_ALPHA_R, %xmm0 + movss STACK_ALPHA_I, %xmm1 + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK + + movl STACK_B, B + movl STACK_C, %ebx + + unpcklps %xmm1, %xmm0 + + movlps %xmm0, 0 + ALPHA + movlps %xmm0, 8 + ALPHA + + movl %ebx, C + movl STACK_LDC, LDC + + subl $-32 * SIZE, A + subl $-32 * SIZE, B + + sall $ZBASE_SHIFT, LDC + + sarl $1, %eax + movl %eax, J + jle .L50 + ALIGN_4 + +.L01: + leal 32 * SIZE + BUFFER, BB + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + sarl $2, %eax + jle .L05 + ALIGN_4 + +.L02: + prefetcht0 (PREFETCH_R + 0) * SIZE(B) + movss -32 * SIZE(B), %xmm0 + movss -31 * SIZE(B), %xmm1 + movss -30 * SIZE(B), %xmm2 + movss -29 * SIZE(B), %xmm3 + movss -28 * SIZE(B), %xmm4 + movss -27 * SIZE(B), %xmm5 + movss -26 * SIZE(B), %xmm6 + movss -25 * SIZE(B), %xmm7 + + prefetcht0 (PREFETCH_W + 0) * SIZE(BB) + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + shufps $0, %xmm4, %xmm4 + shufps $0, %xmm5, %xmm5 + shufps $0, %xmm6, %xmm6 + shufps $0, %xmm7, %xmm7 + + prefetcht0 (PREFETCH_W + 16) * SIZE(BB) + movaps %xmm0, -32 * SIZE(BB) + movaps %xmm1, -28 * SIZE(BB) + movaps %xmm2, -24 * SIZE(BB) + 
movaps %xmm3, -20 * SIZE(BB) + movaps %xmm4, -16 * SIZE(BB) + movaps %xmm5, -12 * SIZE(BB) + movaps %xmm6, -8 * SIZE(BB) + movaps %xmm7, -4 * SIZE(BB) + + addl $ 8 * SIZE, B + subl $-32 * SIZE, BB + decl %eax + jne .L02 + ALIGN_4 + +.L05: + movl K, %eax + andl $3, %eax + BRANCH + jle .L10 + ALIGN_4 + +.L06: + movss -32 * SIZE(B), %xmm0 + movss -31 * SIZE(B), %xmm1 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + + movaps %xmm0, -32 * SIZE(BB) + movaps %xmm1, -28 * SIZE(BB) + addl $2 * SIZE, B + addl $8 * SIZE, BB + decl %eax + jne .L06 + ALIGN_4 + +.L10: + movl C, C1 + movl A, AA + movl M, I + sarl $3, I + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leal 32 * SIZE + BUFFER, BB +#else + leal 32 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB /* because it's doubled */ +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -16 * SIZE(AA), %xmm3 + pxor %xmm6, %xmm6 + prefetcht0 7 * SIZE(C1) + pxor %xmm7, %xmm7 + prefetcht0 7 * SIZE(C1, LDC) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $8, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + movaps %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm0 + addps %xmm0, %xmm5 + movaps -28 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm1 + movaps -24 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + addps %xmm1, %xmm7 + + movaps -24 * SIZE(BB), %xmm1 + movaps %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm4 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm0 + addps %xmm0, %xmm5 + movaps -20 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm1 + movaps 0 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + addps %xmm1, %xmm7 + + movaps -16 * SIZE(BB), %xmm1 + movaps %xmm1, %xmm2 + mulps %xmm3, %xmm1 + addps %xmm1, %xmm4 + movaps -12 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movaps -12 * SIZE(AA), %xmm3 + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm1 + movaps -8 * SIZE(AA), %xmm3 + addps %xmm2, %xmm6 + addps %xmm1, %xmm7 + + movaps -8 * SIZE(BB), %xmm1 + movaps %xmm1, %xmm2 + mulps %xmm3, %xmm1 + addps %xmm1, %xmm4 + movaps -4 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movaps -4 * SIZE(AA), %xmm3 + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm1 + movaps 16 * SIZE(AA), %xmm3 + addps %xmm2, %xmm6 + addps %xmm1, %xmm7 + movaps 0 * SIZE(BB), %xmm1 + + movaps %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm4 + movaps 4 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm0 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm1 + movaps 8 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + addps %xmm1, %xmm7 + + movaps 8 * SIZE(BB), %xmm1 + movaps %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm4 + movaps 12 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm0 + addps %xmm0, %xmm5 + movaps 12 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm1 + movaps 32 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + addps %xmm1, %xmm7 + + movaps 16 * SIZE(BB), %xmm1 + movaps %xmm1, %xmm2 + mulps %xmm3, %xmm1 + addps %xmm1, %xmm4 + movaps 20 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm3 + addps 
%xmm3, %xmm5 + movaps 20 * SIZE(AA), %xmm3 + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm1 + addps %xmm2, %xmm6 + movaps 24 * SIZE(AA), %xmm3 + addps %xmm1, %xmm7 + + movaps 24 * SIZE(BB), %xmm1 + movaps %xmm1, %xmm2 + mulps %xmm3, %xmm1 + addps %xmm1, %xmm4 + movaps 28 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movaps 28 * SIZE(AA), %xmm3 + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm1 + subl $-64 * SIZE, BB + movaps 48 * SIZE(AA), %xmm3 + subl $-64 * SIZE, AA + addps %xmm2, %xmm6 + addps %xmm1, %xmm7 + movaps -32 * SIZE(BB), %xmm1 + + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L18 + ALIGN_4 + +.L16: + movaps %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm0 + addps %xmm0, %xmm5 + movaps -28 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm1 + movaps -24 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + addps %xmm1, %xmm7 + movaps -24 * SIZE(BB), %xmm1 + + addl $8 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + movaps ALPHA, %xmm3 + + MOVSD 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + MOVSD 4 * SIZE(%esi), %xmm1 + movhps 6 * SIZE(%esi), %xmm1 + + pshufd $0x50, %xmm4, %xmm2 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + movlps %xmm1, 4 * SIZE(%esi) + movhps %xmm1, 6 * SIZE(%esi) + + MOVSD 8 * SIZE(%esi), %xmm0 + movhps 10 * SIZE(%esi), %xmm0 + MOVSD 12 * SIZE(%esi), %xmm1 + movhps 14 * SIZE(%esi), %xmm1 + + pshufd $0x50, %xmm6, %xmm2 + pshufd $0xfa, %xmm6, %xmm6 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm6 + + addps %xmm2, %xmm0 + addps %xmm6, %xmm1 + + movlps %xmm0, 8 * SIZE(%esi) + movhps %xmm0, 10 * SIZE(%esi) + movlps %xmm1, 12 * SIZE(%esi) + movhps %xmm1, 14 * SIZE(%esi) + + MOVSD 0 * SIZE(%esi, LDC), %xmm0 + movhps 2 * SIZE(%esi, LDC), %xmm0 + MOVSD 4 * SIZE(%esi, LDC), %xmm1 + movhps 6 * SIZE(%esi, LDC), %xmm1 + + pshufd $0x50, %xmm5, %xmm2 + pshufd $0xfa, %xmm5, %xmm5 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm5 + + addps %xmm2, %xmm0 + addps %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC) + movhps %xmm0, 2 * SIZE(%esi, LDC) + movlps %xmm1, 4 * SIZE(%esi, LDC) + movhps %xmm1, 6 * SIZE(%esi, LDC) + + MOVSD 8 * SIZE(%esi, LDC), %xmm0 + movhps 10 * SIZE(%esi, LDC), %xmm0 + MOVSD 12 * SIZE(%esi, LDC), %xmm1 + movhps 14 * SIZE(%esi, LDC), %xmm1 + + pshufd $0x50, %xmm7, %xmm2 + pshufd $0xfa, %xmm7, %xmm7 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm7 + + addps %xmm2, %xmm0 + addps %xmm7, %xmm1 + + movlps %xmm0, 8 * SIZE(%esi, LDC) + movhps %xmm0, 10 * SIZE(%esi, LDC) + movlps %xmm1, 12 * SIZE(%esi, LDC) + movhps %xmm1, 14 * SIZE(%esi, LDC) + + addl $16 * SIZE, C1 + decl I + jg .L11 + ALIGN_4 + +.L20: + movl M, I + testl $4, I + jle .L30 + +.L21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leal 32 * SIZE + BUFFER, BB +#else + leal 32 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB /* because it's doubled */ +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movaps -16 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movaps -16 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && 
!defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + mulps %xmm0, %xmm1 + mulps -28 * SIZE(BB), %xmm0 + addps %xmm1, %xmm4 + movaps -24 * SIZE(BB), %xmm1 + addps %xmm0, %xmm5 + movaps -28 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm1 + mulps -20 * SIZE(BB), %xmm0 + addps %xmm1, %xmm6 + movaps 0 * SIZE(BB), %xmm1 + addps %xmm0, %xmm7 + movaps -24 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps -12 * SIZE(BB), %xmm0 + addps %xmm3, %xmm4 + movaps -8 * SIZE(BB), %xmm3 + addps %xmm0, %xmm5 + movaps -20 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps -4 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 16 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movaps 0 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + mulps 4 * SIZE(BB), %xmm2 + addps %xmm1, %xmm4 + movaps 8 * SIZE(BB), %xmm1 + addps %xmm2, %xmm5 + movaps -12 * SIZE(AA), %xmm2 + mulps %xmm2, %xmm1 + mulps 12 * SIZE(BB), %xmm2 + addps %xmm1, %xmm6 + movaps 32 * SIZE(BB), %xmm1 + addps %xmm2, %xmm7 + movaps -8 * SIZE(AA), %xmm2 + mulps %xmm2, %xmm3 + mulps 20 * SIZE(BB), %xmm2 + addps %xmm3, %xmm4 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm2, %xmm5 + movaps -4 * SIZE(AA), %xmm2 + mulps %xmm2, %xmm3 + mulps 28 * SIZE(BB), %xmm2 + addps %xmm3, %xmm6 + movaps 48 * SIZE(BB), %xmm3 + addps %xmm2, %xmm7 + movaps 16 * SIZE(AA), %xmm2 + + subl $-32 * SIZE, AA + addl $ 64 * SIZE, BB + decl %eax + jne .L22 + ALIGN_4 + +.L25: + movaps ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L28 + ALIGN_4 + +.L26: + mulps %xmm0, %xmm1 + mulps -28 * SIZE(BB), %xmm0 + addps %xmm1, %xmm4 + movaps -24 * SIZE(BB), %xmm1 + addps %xmm0, %xmm5 + movaps -28 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + movsd 4 * SIZE(%esi), %xmm1 + movhps 6 * SIZE(%esi), %xmm1 + + pshufd $0x50, %xmm4, %xmm2 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + movlps %xmm1, 4 * SIZE(%esi) + movhps %xmm1, 6 * SIZE(%esi) + + movsd 0 * SIZE(%esi, LDC), %xmm0 + movhps 2 * SIZE(%esi, LDC), %xmm0 + movsd 4 * SIZE(%esi, LDC), %xmm1 + movhps 6 * SIZE(%esi, LDC), %xmm1 + + pshufd $0x50, %xmm5, %xmm2 + pshufd $0xfa, %xmm5, %xmm5 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm5 + + addps %xmm2, %xmm0 + addps %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC) + movhps %xmm0, 2 * SIZE(%esi, LDC) + movlps %xmm1, 4 * SIZE(%esi, LDC) + movhps %xmm1, 6 * SIZE(%esi, LDC) + + addl $8 * SIZE, C1 + ALIGN_4 + +.L30: + movl M, I + testl $2, I + jle .L40 + +.L31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leal 32 * SIZE + BUFFER, BB +#else + leal 32 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB /* because it's doubled */ +#endif + + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movsd -32 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movsd -24 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movsd -16 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || 
(!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L35 + ALIGN_4 + +.L32: + mulps %xmm0, %xmm1 + mulps -28 * SIZE(BB), %xmm0 + addps %xmm1, %xmm4 + movsd -24 * SIZE(BB), %xmm1 + addps %xmm0, %xmm5 + movsd -30 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm1 + mulps -20 * SIZE(BB), %xmm0 + addps %xmm1, %xmm6 + movsd 0 * SIZE(BB), %xmm1 + addps %xmm0, %xmm7 + movsd -28 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps -12 * SIZE(BB), %xmm0 + addps %xmm3, %xmm4 + movsd -8 * SIZE(BB), %xmm3 + addps %xmm0, %xmm5 + movsd -26 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps -4 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movsd 16 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movsd -16 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + mulps 4 * SIZE(BB), %xmm2 + addps %xmm1, %xmm4 + movsd 8 * SIZE(BB), %xmm1 + addps %xmm2, %xmm5 + movsd -22 * SIZE(AA), %xmm2 + mulps %xmm2, %xmm1 + mulps 12 * SIZE(BB), %xmm2 + addps %xmm1, %xmm6 + movsd 32 * SIZE(BB), %xmm1 + addps %xmm2, %xmm7 + movsd -20 * SIZE(AA), %xmm2 + mulps %xmm2, %xmm3 + mulps 20 * SIZE(BB), %xmm2 + addps %xmm3, %xmm4 + movsd 24 * SIZE(BB), %xmm3 + addps %xmm2, %xmm5 + movsd -18 * SIZE(AA), %xmm2 + mulps %xmm2, %xmm3 + mulps 28 * SIZE(BB), %xmm2 + addps %xmm3, %xmm6 + movsd 48 * SIZE(BB), %xmm3 + addps %xmm2, %xmm7 + movsd -8 * SIZE(AA), %xmm2 + + subl $-16 * SIZE, AA + addl $ 64 * SIZE, BB + decl %eax + jne .L32 + ALIGN_4 + +.L35: + movaps ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulps %xmm0, %xmm1 + mulps -28 * SIZE(BB), %xmm0 + addps %xmm1, %xmm4 + movsd -24 * SIZE(BB), %xmm1 + addps %xmm0, %xmm5 + movsd -30 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L36 + ALIGN_4 + +.L38: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + + pshufd $0x50, %xmm4, %xmm2 + mulps %xmm3, %xmm2 + addps %xmm2, %xmm0 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + + movsd 0 * SIZE(%esi, LDC), %xmm0 + movhps 2 * SIZE(%esi, LDC), %xmm0 + + pshufd $0x50, %xmm5, %xmm2 + mulps %xmm3, %xmm2 + addps %xmm2, %xmm0 + + movlps %xmm0, 0 * SIZE(%esi, LDC) + movhps %xmm0, 2 * SIZE(%esi, LDC) + + addl $4 * SIZE, %esi + ALIGN_4 + +.L40: + movl M, I + testl $1, I + jle .L49 + +.L41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leal 32 * SIZE + BUFFER, BB +#else + leal 32 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB /* because it's doubled */ +#endif + + movss -32 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movss -32 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movss -28 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movss -16 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L45 + ALIGN_4 + +.L42: + mulss %xmm0, %xmm1 + mulss -28 * SIZE(BB), %xmm0 + addss %xmm1, %xmm4 + movss -24 * SIZE(BB), %xmm1 + addss %xmm0, %xmm5 + movss -31 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm1 + mulss -20 * SIZE(BB), %xmm0 + addss 
%xmm1, %xmm6 + movss 0 * SIZE(BB), %xmm1 + addss %xmm0, %xmm7 + movss -30 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss -12 * SIZE(BB), %xmm0 + addss %xmm3, %xmm4 + movss -8 * SIZE(BB), %xmm3 + addss %xmm0, %xmm5 + movss -29 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss -4 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 16 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss -24 * SIZE(AA), %xmm0 + mulss %xmm2, %xmm1 + mulss 4 * SIZE(BB), %xmm2 + addss %xmm1, %xmm4 + movss 8 * SIZE(BB), %xmm1 + addss %xmm2, %xmm5 + movss -27 * SIZE(AA), %xmm2 + mulss %xmm2, %xmm1 + mulss 12 * SIZE(BB), %xmm2 + addss %xmm1, %xmm6 + movss 32 * SIZE(BB), %xmm1 + addss %xmm2, %xmm7 + movss -26 * SIZE(AA), %xmm2 + mulss %xmm2, %xmm3 + mulss 20 * SIZE(BB), %xmm2 + addss %xmm3, %xmm4 + movss 24 * SIZE(BB), %xmm3 + addss %xmm2, %xmm5 + movss -25 * SIZE(AA), %xmm2 + mulss %xmm2, %xmm3 + mulss 28 * SIZE(BB), %xmm2 + addss %xmm3, %xmm6 + movss 48 * SIZE(BB), %xmm3 + addss %xmm2, %xmm7 + movss -20 * SIZE(AA), %xmm2 + + subl $-8 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L42 + ALIGN_4 + +.L45: + movaps ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L48 + ALIGN_4 + +.L46: + mulss %xmm0, %xmm1 + mulss -28 * SIZE(BB), %xmm0 + addss %xmm1, %xmm4 + movss -24 * SIZE(BB), %xmm1 + addss %xmm0, %xmm5 + movss -31 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L46 + ALIGN_4 + +.L48: + addss %xmm6, %xmm4 + addss %xmm7, %xmm5 + + movsd (%esi), %xmm0 + movhps (%esi, LDC), %xmm0 + + shufps $0, %xmm5, %xmm4 + + mulps %xmm3, %xmm4 + addps %xmm4, %xmm0 + + movlps %xmm0, (%esi) + movhps %xmm0, (%esi, LDC) + ALIGN_4 + +.L49: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leal (, LDC, 2), %eax + addl %eax, C + decl J + jg .L01 + ALIGN_4 + +.L50: + movl N, %eax + testl $1, %eax + jle .L999 + ALIGN_4 + +.L51: + leal 32 * SIZE + BUFFER, BB + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + sarl $3, %eax + jle .L55 + ALIGN_4 + +.L52: + movss -32 * SIZE(B), %xmm0 + movss -31 * SIZE(B), %xmm1 + movss -30 * SIZE(B), %xmm2 + movss -29 * SIZE(B), %xmm3 + movss -28 * SIZE(B), %xmm4 + movss -27 * SIZE(B), %xmm5 + movss -26 * SIZE(B), %xmm6 + movss -25 * SIZE(B), %xmm7 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + shufps $0, %xmm4, %xmm4 + shufps $0, %xmm5, %xmm5 + shufps $0, %xmm6, %xmm6 + shufps $0, %xmm7, %xmm7 + + movaps %xmm0, -32 * SIZE(BB) + movaps %xmm1, -28 * SIZE(BB) + movaps %xmm2, -24 * SIZE(BB) + movaps %xmm3, -20 * SIZE(BB) + movaps %xmm4, -16 * SIZE(BB) + movaps %xmm5, -12 * SIZE(BB) + movaps %xmm6, -8 * SIZE(BB) + movaps %xmm7, -4 * SIZE(BB) + + addl $ 8 * SIZE, B + subl $-32 * SIZE, BB + decl %eax + jne .L52 + ALIGN_4 + +.L55: + movl K, %eax + andl $7, %eax + BRANCH + jle .L60 + ALIGN_4 + +.L56: + movss -32 * SIZE(B), %xmm0 + shufps $0, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(BB) + + addl $1 * SIZE, B + addl $4 * SIZE, BB + decl %eax + jne .L56 + ALIGN_4 + +.L60: + movl C, C1 + movl A, AA + movl M, I + sarl $3, I + jle .L70 + ALIGN_4 + +.L61: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leal 32 * SIZE + BUFFER, BB +#else + leal 32 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 2), BB /* because it's doubled */ +#endif + + 
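+/* .L61 (single remaining column, 8-wide m block): preload the packed A and B panels and clear accumulators %xmm4-%xmm7 before the k loop unrolled by 8 */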
movaps -32 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movaps -16 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movaps -16 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + + prefetcht0 3 * SIZE(C1) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $8, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L65 + ALIGN_4 + +.L62: + mulps %xmm1, %xmm0 + mulps -28 * SIZE(AA), %xmm1 + addps %xmm0, %xmm4 + movaps -24 * SIZE(AA), %xmm0 + addps %xmm1, %xmm6 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm0 + mulps -20 * SIZE(AA), %xmm1 + addps %xmm0, %xmm5 + movaps 0 * SIZE(AA), %xmm0 + addps %xmm1, %xmm7 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm2 + mulps -12 * SIZE(AA), %xmm1 + addps %xmm2, %xmm4 + movaps -8 * SIZE(AA), %xmm2 + addps %xmm1, %xmm6 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm2 + mulps -4 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 16 * SIZE(AA), %xmm2 + addps %xmm1, %xmm7 + movaps 0 * SIZE(BB), %xmm1 + mulps %xmm3, %xmm0 + mulps 4 * SIZE(AA), %xmm3 + addps %xmm0, %xmm4 + movaps 8 * SIZE(AA), %xmm0 + addps %xmm3, %xmm6 + movaps -12 * SIZE(BB), %xmm3 + mulps %xmm3, %xmm0 + mulps 12 * SIZE(AA), %xmm3 + addps %xmm0, %xmm5 + movaps 32 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps -8 * SIZE(BB), %xmm3 + mulps %xmm3, %xmm2 + mulps 20 * SIZE(AA), %xmm3 + addps %xmm2, %xmm4 + movaps 24 * SIZE(AA), %xmm2 + addps %xmm3, %xmm6 + movaps -4 * SIZE(BB), %xmm3 + mulps %xmm3, %xmm2 + mulps 28 * SIZE(AA), %xmm3 + addps %xmm2, %xmm5 + movaps 48 * SIZE(AA), %xmm2 + addps %xmm3, %xmm7 + movaps 16 * SIZE(BB), %xmm3 + + addl $ 64 * SIZE, AA + subl $-32 * SIZE, BB + decl %eax + jne .L62 + ALIGN_4 + +.L65: + movaps ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulps %xmm1, %xmm0 + mulps -28 * SIZE(AA), %xmm1 + addps %xmm0, %xmm4 + movaps -24 * SIZE(AA), %xmm0 + addps %xmm1, %xmm6 + movaps -28 * SIZE(BB), %xmm1 + + addl $8 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L66 + ALIGN_4 + +.L68: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + movsd 4 * SIZE(%esi), %xmm1 + movhps 6 * SIZE(%esi), %xmm1 + + pshufd $0x50, %xmm4, %xmm2 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + movlps %xmm1, 4 * SIZE(%esi) + movhps %xmm1, 6 * SIZE(%esi) + + movsd 8 * SIZE(%esi), %xmm0 + movhps 10 * SIZE(%esi), %xmm0 + movsd 12 * SIZE(%esi), %xmm1 + movhps 14 * SIZE(%esi), %xmm1 + + pshufd $0x50, %xmm6, %xmm2 + pshufd $0xfa, %xmm6, %xmm6 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm6 + + addps %xmm2, %xmm0 + addps %xmm6, %xmm1 + + movlps %xmm0, 8 * SIZE(%esi) + movhps %xmm0, 10 * SIZE(%esi) + movlps %xmm1, 12 * SIZE(%esi) + movhps %xmm1, 14 * SIZE(%esi) + + addl $16 * SIZE, C1 + decl I + jg .L61 + ALIGN_4 + +.L70: + movl M, I + testl $4, I + jle .L80 + +.L71: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leal 32 * SIZE + BUFFER, BB +#else + leal 32 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB /* because it's doubled */ +#endif + + 
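+/* .L71 (single remaining column, 4-wide m block): same preload pattern with a half-width A panel; only %xmm4 and %xmm5 accumulate and are summed at .L78 */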
movaps -32 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movaps -16 * SIZE(AA), %xmm2 + movaps -16 * SIZE(BB), %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L75 + ALIGN_4 + +.L72: + mulps %xmm0, %xmm1 + movaps -28 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm1 + movaps -24 * SIZE(AA), %xmm0 + addps %xmm1, %xmm5 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm1 + movaps -20 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm1 + movaps 0 * SIZE(AA), %xmm0 + addps %xmm1, %xmm5 + movaps 0 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + movaps -12 * SIZE(AA), %xmm2 + addps %xmm3, %xmm4 + movaps -12 * SIZE(BB), %xmm3 + mulps %xmm2, %xmm3 + movaps -8 * SIZE(AA), %xmm2 + addps %xmm3, %xmm5 + movaps -8 * SIZE(BB), %xmm3 + mulps %xmm2, %xmm3 + movaps -4 * SIZE(AA), %xmm2 + addps %xmm3, %xmm4 + movaps -4 * SIZE(BB), %xmm3 + mulps %xmm2, %xmm3 + movaps 16 * SIZE(AA), %xmm2 + addps %xmm3, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + + subl $-32 * SIZE, AA + subl $-32 * SIZE, BB + decl %eax + jne .L72 + ALIGN_4 + +.L75: + movaps ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulps %xmm0, %xmm1 + movaps -28 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L76 + ALIGN_4 + +.L78: + addps %xmm5, %xmm4 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + movsd 4 * SIZE(%esi), %xmm1 + movhps 6 * SIZE(%esi), %xmm1 + + pshufd $0x50, %xmm4, %xmm2 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + movlps %xmm1, 4 * SIZE(%esi) + movhps %xmm1, 6 * SIZE(%esi) + + addl $8 * SIZE, %esi + ALIGN_4 + +.L80: + movl M, I + testl $2, I + jle .L90 + +.L81: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leal 32 * SIZE + BUFFER, BB +#else + leal 32 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB /* because it's doubled */ +#endif + + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movsd -32 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movsd -16 * SIZE(BB), %xmm3 + movsd -24 * SIZE(AA), %xmm2 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L85 + ALIGN_4 + +.L82: + mulps %xmm0, %xmm1 + movsd -30 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movsd -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm1 + movsd -28 * SIZE(AA), %xmm0 + addps %xmm1, %xmm5 + movsd -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm1 + movsd -26 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movsd -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm1 + movsd -16 * SIZE(AA), %xmm0 + addps %xmm1, %xmm5 + movsd -0 * SIZE(BB), %xmm1 + mulps %xmm2, %xmm3 + movsd -22 * SIZE(AA), %xmm2 + addps %xmm3, %xmm4 + 
movsd -12 * SIZE(BB), %xmm3 + mulps %xmm2, %xmm3 + movsd -20 * SIZE(AA), %xmm2 + addps %xmm3, %xmm5 + movsd -8 * SIZE(BB), %xmm3 + mulps %xmm2, %xmm3 + movsd -18 * SIZE(AA), %xmm2 + addps %xmm3, %xmm4 + movsd -4 * SIZE(BB), %xmm3 + mulps %xmm2, %xmm3 + movsd -8 * SIZE(AA), %xmm2 + addps %xmm3, %xmm5 + movsd 16 * SIZE(BB), %xmm3 + + subl $-16 * SIZE, AA + subl $-32 * SIZE, BB + decl %eax + jne .L82 + ALIGN_4 + +.L85: + movaps ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L88 + ALIGN_4 + +.L86: + mulps %xmm0, %xmm1 + movsd -30 * SIZE(AA), %xmm0 + addps %xmm1, %xmm4 + movsd -28 * SIZE(BB), %xmm1 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L86 + ALIGN_4 + +.L88: + addps %xmm5, %xmm4 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + + pshufd $0x50, %xmm4, %xmm2 + mulps %xmm3, %xmm2 + addps %xmm2, %xmm0 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + + addl $4 * SIZE, %esi + ALIGN_4 + +.L90: + movl M, I + testl $1, I + jle .L99 + +.L91: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + leal 32 * SIZE + BUFFER, BB +#else + leal 32 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB /* because it's doubled */ +#endif + + movss -32 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movss -32 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movss -16 * SIZE(BB), %xmm3 + movss -28 * SIZE(AA), %xmm2 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L95 + ALIGN_4 + +.L92: + mulss %xmm0, %xmm1 + movss -31 * SIZE(AA), %xmm0 + addss %xmm1, %xmm4 + movss -28 * SIZE(BB), %xmm1 + mulss %xmm0, %xmm1 + movss -30 * SIZE(AA), %xmm0 + addss %xmm1, %xmm5 + movss -24 * SIZE(BB), %xmm1 + mulss %xmm0, %xmm1 + movss -29 * SIZE(AA), %xmm0 + addss %xmm1, %xmm4 + movss -20 * SIZE(BB), %xmm1 + mulss %xmm0, %xmm1 + movss -24 * SIZE(AA), %xmm0 + addss %xmm1, %xmm5 + movss -0 * SIZE(BB), %xmm1 + mulss %xmm2, %xmm3 + movss -27 * SIZE(AA), %xmm2 + addss %xmm3, %xmm4 + movss -12 * SIZE(BB), %xmm3 + mulss %xmm2, %xmm3 + movss -26 * SIZE(AA), %xmm2 + addss %xmm3, %xmm5 + movss -8 * SIZE(BB), %xmm3 + mulss %xmm2, %xmm3 + movss -25 * SIZE(AA), %xmm2 + addss %xmm3, %xmm4 + movss -4 * SIZE(BB), %xmm3 + mulss %xmm2, %xmm3 + movss -20 * SIZE(AA), %xmm2 + addss %xmm3, %xmm5 + movss 16 * SIZE(BB), %xmm3 + + subl $ -8 * SIZE, AA + subl $-32 * SIZE, BB + decl %eax + jne .L92 + ALIGN_4 + +.L95: + movaps ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L98 + ALIGN_4 + +.L96: + mulss %xmm0, %xmm1 + movss -31 * SIZE(AA), %xmm0 + addss %xmm1, %xmm4 + movss -28 * SIZE(BB), %xmm1 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L96 + ALIGN_4 + +.L98: + addss %xmm5, %xmm4 + + movsd 0 * SIZE(%esi), %xmm0 + + pshufd $0x50, %xmm4, %xmm2 + mulps %xmm3, %xmm2 + addps %xmm2, %xmm0 + + movlps %xmm0, 0 * SIZE(%esi) + ALIGN_4 + +.L99: + addl LDC, C + ALIGN_4 + + +.L999: + movl OLD_STACK, %esp + + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm3m_kernel_8x2_sse.S b/kernel/x86/zgemm3m_kernel_8x2_sse.S new file mode 100644 
index 0000000000..ea66dc1ae6 --- /dev/null +++ b/kernel/x86/zgemm3m_kernel_8x2_sse.S @@ -0,0 +1,2803 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if !defined(HAVE_SSE) || !defined(HAVE_MMX) +#error You have to check your configuration. 
+#endif + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_ALPHA_R 16 + STACK + ARGS(%esi) +#define STACK_ALPHA_I 20 + STACK + ARGS(%esi) +#define STACK_A 24 + STACK + ARGS(%esi) +#define STACK_B 28 + STACK + ARGS(%esi) +#define STACK_C 32 + STACK + ARGS(%esi) +#define STACK_LDC 36 + STACK + ARGS(%esi) +#define STACK_OFFT 40 + STACK + ARGS(%esi) + +#define ALPHA 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define BUFFER 128(%esp) + +#define B %edi +#define LDC %ebp + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#define PREFETCHSIZE 48 /* for PIII */ + +#define AA %edx +#define BB %ecx + +#ifdef HAVE_SSE2 +#define MOVSD movsd +#define XORPS pxor +#else +#define MOVSD movlps +#define XORPS xorps +#endif + +#define KERNEL1(address) \ + mulps %xmm0, %xmm2; \ + mulps 4 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ + addps %xmm2, %xmm4; \ + movaps 0 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ + addps %xmm0, %xmm5; \ + movaps 4 * SIZE + (address) * SIZE * 2(AA), %xmm0; \ + mulps %xmm0, %xmm2; \ + mulps 4 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 8 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 8 * SIZE + (address) * SIZE * 2(AA), %xmm0 + +#define KERNEL2(address) \ + mulps %xmm0, %xmm2; \ + mulps 12 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ + addps %xmm2, %xmm4; \ + movaps 8 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ + addps %xmm0, %xmm5; \ + movaps 12 * SIZE + (address) * SIZE * 2(AA), %xmm0; \ + mulps %xmm0, %xmm2; \ + mulps 12 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 32 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 32 * SIZE + (address) * SIZE * 2(AA), %xmm0 + +#define KERNEL3(address) \ + mulps %xmm1, %xmm3; \ + mulps 20 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ + addps %xmm3, %xmm4; \ + movaps 16 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ + addps %xmm1, %xmm5; \ + movaps 20 * SIZE + (address) * SIZE * 2(AA), %xmm1; \ + mulps %xmm1, %xmm3; \ + mulps 20 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 24 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 24 * SIZE + (address) * SIZE * 2(AA), %xmm1 + +#define KERNEL4(address) \ + mulps %xmm1, %xmm3; \ + mulps 28 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ + addps %xmm3, %xmm4; \ + movaps 24 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ + addps %xmm1, %xmm5; \ + movaps 28 * SIZE + (address) * SIZE * 2(AA), %xmm1; \ + mulps %xmm1, %xmm3; \ + mulps 28 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 48 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 48 * SIZE + (address) * SIZE * 2(AA), %xmm1 + +#define KERNEL5(address) \ + mulps %xmm0, %xmm2; \ + mulps 36 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ + addps %xmm2, %xmm4; \ + movaps 32 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ + addps %xmm0, %xmm5; \ + movaps 36 * SIZE + (address) * SIZE * 2(AA), %xmm0; \ + mulps %xmm0, %xmm2; \ + mulps 36 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 40 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 40 * SIZE + (address) * SIZE * 2(AA), %xmm0 + +#define 
KERNEL6(address) \ + mulps %xmm0, %xmm2; \ + mulps 44 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ + addps %xmm2, %xmm4; \ + movaps 40 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ + addps %xmm0, %xmm5; \ + movaps 44 * SIZE + (address) * SIZE * 2(AA), %xmm0; \ + mulps %xmm0, %xmm2; \ + mulps 44 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 64 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 64 * SIZE + (address) * SIZE * 2(AA), %xmm0 + +#define KERNEL7(address) \ + mulps %xmm1, %xmm3; \ + mulps 52 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ + addps %xmm3, %xmm4; \ + movaps 48 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ + addps %xmm1, %xmm5; \ + movaps 52 * SIZE + (address) * SIZE * 2(AA), %xmm1; \ + mulps %xmm1, %xmm3; \ + mulps 52 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 56 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 56 * SIZE + (address) * SIZE * 2(AA), %xmm1 + +#define KERNEL8(address) \ + mulps %xmm1, %xmm3; \ + mulps 60 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ + addps %xmm3, %xmm4; \ + movaps 56 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ + addps %xmm1, %xmm5; \ + movaps 60 * SIZE + (address) * SIZE * 2(AA), %xmm1; \ + mulps %xmm1, %xmm3; \ + mulps 60 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 80 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 80 * SIZE + (address) * SIZE * 2(AA), %xmm1 + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movd STACK_M, %mm0 + movl STACK_N, %eax + movd STACK_K, %mm1 + movd STACK_A, %mm2 + movss STACK_ALPHA_R, %xmm0 + movss STACK_ALPHA_I, %xmm1 + movl STACK_B, B + movd STACK_C, %mm3 + movl STACK_LDC, LDC +#ifdef TRMMKERNEL + movd STACK_OFFT, %mm4 +#endif + + unpcklps %xmm1, %xmm0 + + movlps %xmm0, 0 + ALPHA + movlps %xmm0, 8 + ALPHA + + movd %mm1, K + movl %eax, N + movd %mm0, M + movd %mm2, A + movd %mm3, C + movl %esi, OLD_STACK +#ifdef TRMMKERNEL + movd %mm4, OFFSET + movd %mm4, KK +#ifndef LEFT + negl KK +#endif +#endif + + sall $ZBASE_SHIFT, LDC + + sarl $1, %eax # j = (n >> 1) + movl %eax, J + jle .L100 + ALIGN_2 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + +/* Copying to Sub Buffer */ + movl K, %eax + leal BUFFER, %ecx + sarl $2, %eax + jle .L03 + ALIGN_4 + +.L02: + movss 0 * SIZE(B), %xmm0 + movss 1 * SIZE(B), %xmm1 + movss 2 * SIZE(B), %xmm2 + movss 3 * SIZE(B), %xmm3 + movss 4 * SIZE(B), %xmm4 + movss 5 * SIZE(B), %xmm5 + movss 6 * SIZE(B), %xmm6 + movss 7 * SIZE(B), %xmm7 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + shufps $0, %xmm4, %xmm4 + shufps $0, %xmm5, %xmm5 + shufps $0, %xmm6, %xmm6 + shufps $0, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + movaps %xmm2, 8 * SIZE(%ecx) + movaps %xmm3, 12 * SIZE(%ecx) + movaps %xmm4, 16 * SIZE(%ecx) + movaps %xmm5, 20 * SIZE(%ecx) + movaps %xmm6, 24 * SIZE(%ecx) + movaps %xmm7, 28 * SIZE(%ecx) + + prefetcht0 104 * SIZE(B) + + addl $ 8 * SIZE, B + addl $32 * SIZE, %ecx + decl %eax + BRANCH + jne .L02 + ALIGN_2 + +.L03: + movl K, %eax + andl $3, %eax + BRANCH + jle .L05 + ALIGN_2 + +.L04: + movss 0 * SIZE(B), %xmm0 + movss 1 * SIZE(B), %xmm1 + addl $2 * SIZE, B + + 
shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + addl $8 * SIZE, %ecx + decl %eax + jne .L04 + ALIGN_4 + +.L05: + movl C, %esi # coffset = c + movl A, AA # aoffset = a + movl M, %ebx + sarl $3, %ebx # i = (m >> 2) + jle .L30 + ALIGN_4 + +.L10: +#ifdef PENTIUM4 +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movaps 0 * SIZE + BUFFER, %xmm2 + XORPS %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + movaps 16 * SIZE + BUFFER, %xmm3 + XORPS %xmm6, %xmm6 + movaps 16 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movaps 0 * SIZE(BB), %xmm2 + XORPS %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + XORPS %xmm6, %xmm6 + movaps 16 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#endif + + prefetchnta 7 * SIZE(%esi) + prefetchnta 7 * SIZE(%esi, %ebp) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $8, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + andl $-8, %eax + NOBRANCH + je .L12 + sall $3, %eax + +.L1X: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + cmpl $64 * 1, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 1) + KERNEL2(32 * 1) + KERNEL3(32 * 1) + KERNEL4(32 * 1) + KERNEL5(32 * 1) + KERNEL6(32 * 1) + KERNEL7(32 * 1) + KERNEL8(32 * 1) + cmpl $64 * 2, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 2) + KERNEL2(32 * 2) + KERNEL3(32 * 2) + KERNEL4(32 * 2) + KERNEL5(32 * 2) + KERNEL6(32 * 2) + KERNEL7(32 * 2) + KERNEL8(32 * 2) + cmpl $64 * 3, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 3) + KERNEL2(32 * 3) + KERNEL3(32 * 3) + KERNEL4(32 * 3) + KERNEL5(32 * 3) + KERNEL6(32 * 3) + KERNEL7(32 * 3) + KERNEL8(32 * 3) + cmpl $64 * 4, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 4) + KERNEL2(32 * 4) + KERNEL3(32 * 4) + KERNEL4(32 * 4) + KERNEL5(32 * 4) + KERNEL6(32 * 4) + KERNEL7(32 * 4) + KERNEL8(32 * 4) + cmpl $64 * 5, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 5) + KERNEL2(32 * 5) + KERNEL3(32 * 5) + KERNEL4(32 * 5) + KERNEL5(32 * 5) + KERNEL6(32 * 5) + KERNEL7(32 * 5) + KERNEL8(32 * 5) + cmpl $64 * 6, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 6) + KERNEL2(32 * 6) + KERNEL3(32 * 6) + KERNEL4(32 * 6) + KERNEL5(32 * 6) + KERNEL6(32 * 6) + KERNEL7(32 * 6) + KERNEL8(32 * 6) + cmpl $64 * 7, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 7) + KERNEL2(32 * 7) + KERNEL3(32 * 7) + KERNEL4(32 * 7) + KERNEL5(32 * 7) + KERNEL6(32 * 7) + KERNEL7(32 * 7) + KERNEL8(32 * 7) + + addl $64 * 8 * SIZE, AA + addl $64 * 8 * SIZE, BB + subl $64 * 8, %eax + BRANCH + jg .L1X + +.L11: + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB + +#else +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movaps 0 * SIZE + BUFFER, %xmm2 + XORPS %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + movaps 8 * SIZE + BUFFER, %xmm3 + XORPS %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal 
(AA, %eax, 4), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movaps 0 * SIZE(BB), %xmm2 + XORPS %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + movaps 8 * SIZE(BB), %xmm3 + XORPS %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#endif + + prefetchnta 8 * SIZE(%esi) + prefetchnta 8 * SIZE(%esi, %ebp) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $8, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L12 + ALIGN_2 + +.L11: +#ifdef CORE_KATMAI + prefetcht0 PREFETCHSIZE * SIZE(AA) +#endif + + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 0 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 16 * SIZE(AA), %xmm0 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) +#endif + + mulps %xmm1, %xmm3 + mulps 12 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 8 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 12 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 12 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 24 * SIZE(AA), %xmm1 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) +#endif + + mulps %xmm0, %xmm2 + mulps 20 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 16 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 20 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 20 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 24) * SIZE(AA) +#endif + + mulps %xmm1, %xmm3 + mulps 28 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 24 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 28 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 28 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 40 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 40 * SIZE(AA), %xmm1 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 32) * SIZE(AA) +#endif + + mulps %xmm0, %xmm2 + mulps 36 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 32 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 36 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 36 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 48 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 48 * SIZE(AA), %xmm0 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 40) * SIZE(AA) +#endif + + mulps %xmm1, %xmm3 + mulps 44 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 40 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 44 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 44 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 56 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 56 * SIZE(AA), %xmm1 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 48) * SIZE(AA) +#endif + + mulps %xmm0, %xmm2 + mulps 52 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 48 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 52 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 52 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 64 * SIZE(AA), %xmm0 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 56) * SIZE(AA) +#endif + + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 56 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 60 * 
SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 72 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 72 * SIZE(AA), %xmm1 + + addl $64 * SIZE, BB + addl $64 * SIZE, AA + decl %eax + jne .L11 + ALIGN_2 +#endif + +.L12: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + +.L13: + movaps 4 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 0 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm1 + movaps 4 * SIZE(AA), %xmm0 + addps %xmm1, %xmm5 + movaps 4 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm1 + movaps 8 * SIZE(AA), %xmm0 + addps %xmm1, %xmm7 + + addl $8 * SIZE, AA + addl $8 * SIZE, BB + subl $1, %eax + jg .L13 + ALIGN_4 + +.L14: + MOVSD 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + MOVSD 4 * SIZE(%esi), %xmm1 + movhps 6 * SIZE(%esi), %xmm1 + +#ifdef HAVE_SSE2 + pshufd $0x50, %xmm4, %xmm2 +#else + movaps %xmm4, %xmm2 + shufps $0x50, %xmm2, %xmm2 +#endif + shufps $0xfa, %xmm4, %xmm4 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + movlps %xmm1, 4 * SIZE(%esi) + movhps %xmm1, 6 * SIZE(%esi) + + MOVSD 8 * SIZE(%esi), %xmm0 + movhps 10 * SIZE(%esi), %xmm0 + MOVSD 12 * SIZE(%esi), %xmm1 + movhps 14 * SIZE(%esi), %xmm1 + +#ifdef HAVE_SSE2 + pshufd $0x50, %xmm6, %xmm2 +#else + movaps %xmm6, %xmm2 + shufps $0x50, %xmm2, %xmm2 +#endif + shufps $0xfa, %xmm6, %xmm6 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm6 + + addps %xmm2, %xmm0 + addps %xmm6, %xmm1 + + movlps %xmm0, 8 * SIZE(%esi) + movhps %xmm0, 10 * SIZE(%esi) + movlps %xmm1, 12 * SIZE(%esi) + movhps %xmm1, 14 * SIZE(%esi) + + MOVSD 0 * SIZE(%esi, LDC), %xmm0 + movhps 2 * SIZE(%esi, LDC), %xmm0 + MOVSD 4 * SIZE(%esi, LDC), %xmm1 + movhps 6 * SIZE(%esi, LDC), %xmm1 + +#ifdef HAVE_SSE2 + pshufd $0x50, %xmm5, %xmm2 +#else + movaps %xmm5, %xmm2 + shufps $0x50, %xmm2, %xmm2 +#endif + shufps $0xfa, %xmm5, %xmm5 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm5 + + addps %xmm2, %xmm0 + addps %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC) + movhps %xmm0, 2 * SIZE(%esi, LDC) + movlps %xmm1, 4 * SIZE(%esi, LDC) + movhps %xmm1, 6 * SIZE(%esi, LDC) + + MOVSD 8 * SIZE(%esi, LDC), %xmm0 + movhps 10 * SIZE(%esi, LDC), %xmm0 + MOVSD 12 * SIZE(%esi, LDC), %xmm1 + movhps 14 * SIZE(%esi, LDC), %xmm1 + +#ifdef HAVE_SSE2 + pshufd $0x50, %xmm7, %xmm2 +#else + movaps %xmm7, %xmm2 + shufps $0x50, %xmm2, %xmm2 +#endif + shufps $0xfa, %xmm7, %xmm7 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm7 + + addps %xmm2, %xmm0 + addps %xmm7, %xmm1 + + movlps %xmm0, 8 * SIZE(%esi, LDC) + movhps %xmm0, 10 * SIZE(%esi, LDC) + movlps %xmm1, 12 * SIZE(%esi, LDC) + movhps %xmm1, 14 * SIZE(%esi, LDC) + + addl $16 * SIZE, %esi + BRANCH + decl %ebx # i -- + jg .L10 + ALIGN_2 + +.L30: + movl M, %ebx + andl $7, %ebx + jle .L99 + + testl $4, %ebx + jle .L50 + +#if (L1_DATA_LINESIZE == 64) +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movaps 0 * SIZE + BUFFER, %xmm2 + XORPS %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + movaps 16 * SIZE + BUFFER, %xmm3 + XORPS %xmm6, %xmm6 + movaps 16 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB 
/* because it's doubled */ + + movaps 0 * SIZE(BB), %xmm2 + XORPS %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + XORPS %xmm6, %xmm6 + movaps 16 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L32 + ALIGN_2 + +.L31: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 8 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps 20 * SIZE(BB), %xmm0 + addps %xmm3, %xmm4 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm0, %xmm5 + movaps 12 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps 28 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 48 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm2 + mulps 36 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 40 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 20 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm2 + mulps 44 * SIZE(BB), %xmm1 + addps %xmm2, %xmm6 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm1, %xmm7 + movaps 24 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 52 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 56 * SIZE(BB), %xmm3 + addps %xmm1, %xmm5 + movaps 28 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + addps %xmm3, %xmm6 + movaps 80 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 48 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L31 + ALIGN_2 + +#else +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movaps 0 * SIZE + BUFFER, %xmm2 + XORPS %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + movaps 8 * SIZE + BUFFER, %xmm3 + XORPS %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movaps 0 * SIZE(BB), %xmm2 + XORPS %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + movaps 8 * SIZE(BB), %xmm3 + XORPS %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L32 + ALIGN_2 + +.L31: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movaps 16 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm2 + mulps 20 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 12 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 28 * SIZE(BB), %xmm1 + addps %xmm3, %xmm6 + movaps 40 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 24 * SIZE(AA), %xmm1 + mulps %xmm0, %xmm2 + mulps 
36 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 48 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 20 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps 44 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 56 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm2 + mulps 52 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 28 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + addps %xmm3, %xmm6 + movaps 72 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 40 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L31 + ALIGN_2 +#endif + +.L32: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L34 + +.L33: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L33 + ALIGN_4 + +.L34: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + MOVSD 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + MOVSD 4 * SIZE(%esi), %xmm1 + movhps 6 * SIZE(%esi), %xmm1 + +#ifdef HAVE_SSE2 + pshufd $0x50, %xmm4, %xmm2 +#else + movaps %xmm4, %xmm2 + shufps $0x50, %xmm2, %xmm2 +#endif + shufps $0xfa, %xmm4, %xmm4 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + movlps %xmm1, 4 * SIZE(%esi) + movhps %xmm1, 6 * SIZE(%esi) + + MOVSD 0 * SIZE(%esi, LDC), %xmm0 + movhps 2 * SIZE(%esi, LDC), %xmm0 + MOVSD 4 * SIZE(%esi, LDC), %xmm1 + movhps 6 * SIZE(%esi, LDC), %xmm1 + +#ifdef HAVE_SSE2 + pshufd $0x50, %xmm5, %xmm2 +#else + movaps %xmm5, %xmm2 + shufps $0x50, %xmm2, %xmm2 +#endif + shufps $0xfa, %xmm5, %xmm5 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm5 + + addps %xmm2, %xmm0 + addps %xmm5, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi, LDC) + movhps %xmm0, 2 * SIZE(%esi, LDC) + movlps %xmm1, 4 * SIZE(%esi, LDC) + movhps %xmm1, 6 * SIZE(%esi, LDC) + + addl $8 * SIZE, %esi + ALIGN_2 + +.L50: + testl $2, %ebx + jle .L70 + + +#if (L1_DATA_LINESIZE == 64) +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + MOVSD 0 * SIZE + BUFFER, %xmm2 + XORPS %xmm4, %xmm4 + MOVSD 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + MOVSD 16 * SIZE + BUFFER, %xmm3 + XORPS %xmm6, %xmm6 + MOVSD 8 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + MOVSD 0 * SIZE(BB), %xmm2 + XORPS %xmm4, %xmm4 + MOVSD 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + MOVSD 16 * SIZE(BB), %xmm3 + XORPS %xmm6, %xmm6 + MOVSD 8 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L52 + ALIGN_2 + +.L51: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + MOVSD 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + MOVSD 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + MOVSD 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + MOVSD 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + MOVSD 4 * SIZE(AA), %xmm0 + 
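/* note: MOVSD and XORPS are macros defined near the top of this kernel; with
   HAVE_SSE2 they expand to movsd/pxor, otherwise to movlps/xorps, so this
   two-row path also works on SSE1-only CPUs.  The surrounding .L51 loop is
   unrolled eight k-iterations deep and advances A by 16*SIZE and the packed
   B buffer by 64*SIZE per pass. */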
addps %xmm2, %xmm7 + MOVSD 32 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + MOVSD 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + MOVSD 6 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + MOVSD 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + MOVSD 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + MOVSD 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + MOVSD 48 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + MOVSD 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + MOVSD 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + MOVSD 40 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + MOVSD 44 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + MOVSD 12 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + MOVSD 64 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + MOVSD 52 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + MOVSD 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + MOVSD 56 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + MOVSD 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + MOVSD 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + MOVSD 80 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L51 + ALIGN_2 + +#else +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + MOVSD 0 * SIZE + BUFFER, %xmm2 + XORPS %xmm4, %xmm4 + MOVSD 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + MOVSD 8 * SIZE + BUFFER, %xmm3 + XORPS %xmm6, %xmm6 + MOVSD 8 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + MOVSD 0 * SIZE(BB), %xmm2 + XORPS %xmm4, %xmm4 + MOVSD 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + MOVSD 8 * SIZE(BB), %xmm3 + XORPS %xmm6, %xmm6 + MOVSD 8 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L52 + ALIGN_2 + +.L51: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + MOVSD 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + MOVSD 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + MOVSD 16 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + MOVSD 12 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + MOVSD 4 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + MOVSD 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + MOVSD 20 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + MOVSD 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + MOVSD 32 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + MOVSD 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + MOVSD 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + MOVSD 40 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + MOVSD 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + MOVSD 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + MOVSD 48 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + MOVSD 44 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + MOVSD 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + MOVSD 56 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + MOVSD 52 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + MOVSD 14 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + MOVSD 64 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + MOVSD 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + MOVSD 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + 
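/* note: the .L51 loop exists in two nearly identical copies selected by
   L1_DATA_LINESIZE; they appear to differ only in how far ahead the next
   packed-B vectors are preloaded (16*SIZE + BUFFER versus 8*SIZE + BUFFER),
   presumably a software-pipelining distance matched to 64- versus 32-byte
   L1 cache lines. */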
MOVSD 72 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L51 + ALIGN_2 +#endif + +.L52: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L54 + +.L53: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + MOVSD 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + MOVSD 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + MOVSD 8 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L53 + ALIGN_4 + +.L54: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + MOVSD 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + +#ifdef HAVE_SSE2 + pshufd $0x50, %xmm4, %xmm2 +#else + movaps %xmm4, %xmm2 + shufps $0x50, %xmm2, %xmm2 +#endif + mulps %xmm3, %xmm2 + addps %xmm2, %xmm0 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + + MOVSD 0 * SIZE(%esi, LDC), %xmm0 + movhps 2 * SIZE(%esi, LDC), %xmm0 + +#ifdef HAVE_SSE2 + pshufd $0x50, %xmm5, %xmm2 +#else + movaps %xmm5, %xmm2 + shufps $0x50, %xmm2, %xmm2 +#endif + mulps %xmm3, %xmm2 + addps %xmm2, %xmm0 + + movlps %xmm0, 0 * SIZE(%esi, LDC) + movhps %xmm0, 2 * SIZE(%esi, LDC) + + addl $4 * SIZE, %esi + ALIGN_2 + +.L70: + testl $1, %ebx + jle .L99 + +#if (L1_DATA_LINESIZE == 64) +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movss 0 * SIZE + BUFFER, %xmm2 + XORPS %xmm4, %xmm4 + movss 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + movss 16 * SIZE + BUFFER, %xmm3 + XORPS %xmm6, %xmm6 + movss 4 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB /* because it's doubled */ + + movss 0 * SIZE(BB), %xmm2 + XORPS %xmm4, %xmm4 + movss 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + movss 16 * SIZE(BB), %xmm3 + XORPS %xmm6, %xmm6 + movss 4 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L72 + ALIGN_2 + +.L71: + mulss %xmm0, %xmm2 + mulss 4 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 8 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 1 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm2 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm2, %xmm6 + movss 32 * SIZE(BB), %xmm2 + addss %xmm0, %xmm7 + movss 2 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 20 * SIZE(BB), %xmm0 + addss %xmm3, %xmm4 + movss 24 * SIZE(BB), %xmm3 + addss %xmm0, %xmm5 + movss 3 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 28 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 48 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm2 + mulss 36 * SIZE(BB), %xmm1 + addss %xmm2, %xmm4 + movss 40 * SIZE(BB), %xmm2 + addss %xmm1, %xmm5 + movss 5 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm2 + mulss 44 * SIZE(BB), %xmm1 + addss %xmm2, %xmm6 + movss 64 * SIZE(BB), %xmm2 + addss %xmm1, %xmm7 + movss 6 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 52 * SIZE(BB), %xmm1 + addss %xmm3, %xmm4 + movss 56 * SIZE(BB), %xmm3 + addss %xmm1, %xmm5 + movss 7 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 60 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 80 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $64 * SIZE, BB + decl 
%eax + jne .L71 + ALIGN_2 + +#else +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movss 0 * SIZE + BUFFER, %xmm2 + XORPS %xmm4, %xmm4 + movss 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + movss 8 * SIZE + BUFFER, %xmm3 + XORPS %xmm6, %xmm6 + movss 4 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB /* because it's doubled */ + + movss 0 * SIZE(BB), %xmm2 + XORPS %xmm4, %xmm4 + movss 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + movss 8 * SIZE(BB), %xmm3 + XORPS %xmm6, %xmm6 + movss 4 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L72 + ALIGN_2 + +.L71: + mulss %xmm0, %xmm2 + mulss 4 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 16 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 1 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 24 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 2 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm2 + mulss 20 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 32 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 3 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + mulss 28 * SIZE(BB), %xmm0 + addss %xmm3, %xmm6 + movss 40 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm2 + mulss 36 * SIZE(BB), %xmm1 + addss %xmm2, %xmm4 + movss 48 * SIZE(BB), %xmm2 + addss %xmm1, %xmm5 + movss 5 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 44 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 56 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 6 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm2 + mulss 52 * SIZE(BB), %xmm1 + addss %xmm2, %xmm4 + movss 64 * SIZE(BB), %xmm2 + addss %xmm1, %xmm5 + movss 7 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + mulss 60 * SIZE(BB), %xmm1 + addss %xmm3, %xmm6 + movss 72 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L71 + ALIGN_2 +#endif + +.L72: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L74 + +.L73: + mulss %xmm0, %xmm2 + mulss 4 * SIZE(BB), %xmm0 + addss %xmm2, %xmm4 + movss 8 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 1 * SIZE(AA), %xmm0 + + addl $1 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L73 + ALIGN_4 + +.L74: + addss %xmm6, %xmm4 + addss %xmm7, %xmm5 + + MOVSD 0 * SIZE(%esi), %xmm0 + +#ifdef HAVE_SSE2 + pshufd $0x50, %xmm4, %xmm2 +#else + movaps %xmm4, %xmm2 + shufps $0x50, %xmm2, %xmm2 +#endif + mulps %xmm3, %xmm2 + addps %xmm2, %xmm0 + + movlps %xmm0, 0 * SIZE(%esi) + + MOVSD 0 * SIZE(%esi, LDC), %xmm0 + +#ifdef HAVE_SSE2 + pshufd $0x50, %xmm5, %xmm2 +#else + movaps %xmm5, %xmm2 + shufps $0x50, %xmm2, %xmm2 +#endif + mulps %xmm3, %xmm2 + addps %xmm2, %xmm0 + + movlps %xmm0, 0 * SIZE(%esi, LDC) + ALIGN_2 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leal (, LDC, 2), %eax + addl %eax, C # c += 2 * ldc + BRANCH + decl J # j -- + jg .L01 + ALIGN_2 + +.L100: + movl N, %eax + testl $1, %eax + jle .L999 + ALIGN_2 + +.L101: +#if defined(TRMMKERNEL) && defined(LEFT) + 
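/* note: .L101 starts the clean-up pass for the last column when N is odd.
   For the TRMM variant, KK appears to track the k-range that is valid
   relative to the diagonal: it is re-seeded from OFFSET here (and at .L01)
   when the triangular operand is on the left, and advanced after each
   column block otherwise (the addl to KK at .L99 above). */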
movl OFFSET, %eax + movl %eax, KK +#endif + +/* Copying to Sub Buffer */ + movl K, %eax + leal BUFFER, %ecx + sarl $3, %eax + jle .L103 + ALIGN_4 + +.L102: + prefetchnta 96 * SIZE(B) + + movss 0 * SIZE(B), %xmm0 + movss 1 * SIZE(B), %xmm1 + movss 2 * SIZE(B), %xmm2 + movss 3 * SIZE(B), %xmm3 + movss 4 * SIZE(B), %xmm4 + movss 5 * SIZE(B), %xmm5 + movss 6 * SIZE(B), %xmm6 + movss 7 * SIZE(B), %xmm7 + addl $ 8 * SIZE, B + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + shufps $0, %xmm4, %xmm4 + shufps $0, %xmm5, %xmm5 + shufps $0, %xmm6, %xmm6 + shufps $0, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + movaps %xmm2, 8 * SIZE(%ecx) + movaps %xmm3, 12 * SIZE(%ecx) + movaps %xmm4, 16 * SIZE(%ecx) + movaps %xmm5, 20 * SIZE(%ecx) + movaps %xmm6, 24 * SIZE(%ecx) + movaps %xmm7, 28 * SIZE(%ecx) + addl $32 * SIZE, %ecx + + decl %eax + BRANCH + jne .L102 + ALIGN_2 + +.L103: + movl K, %eax + andl $7, %eax + BRANCH + jle .L105 + ALIGN_2 + +.L104: + movss 0 * SIZE(B), %xmm0 + addl $1 * SIZE, B + + shufps $0, %xmm0, %xmm0 + + movaps %xmm0, 0 * SIZE(%ecx) + addl $4 * SIZE, %ecx + decl %eax + jne .L104 + ALIGN_4 + +.L105: + movl C, %esi # coffset = c + movl A, AA # aoffset = a + movl M, %ebx + sarl $3, %ebx # i = (m >> 2) + jle .L130 + ALIGN_4 + +.L110: +#if (L1_DATA_LINESIZE == 64) +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movaps 0 * SIZE + BUFFER, %xmm2 + XORPS %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + movaps 16 * SIZE + BUFFER, %xmm3 + XORPS %xmm6, %xmm6 + movaps 16 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 2), BB /* because it's doubled */ + + movaps 0 * SIZE(BB), %xmm2 + XORPS %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + XORPS %xmm6, %xmm6 + movaps 16 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $8, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L112 + ALIGN_2 + +.L111: + mulps %xmm2, %xmm0 + mulps 4 * SIZE(AA), %xmm2 + addps %xmm0, %xmm4 + movaps 8 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm2, %xmm0 + mulps 12 * SIZE(AA), %xmm2 + addps %xmm0, %xmm6 + movaps 32 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm2, %xmm1 + mulps 20 * SIZE(AA), %xmm2 + addps %xmm1, %xmm4 + movaps 24 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm2, %xmm1 + mulps 28 * SIZE(AA), %xmm2 + addps %xmm1, %xmm6 + movaps 48 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm3, %xmm0 + mulps 36 * SIZE(AA), %xmm3 + addps %xmm0, %xmm4 + movaps 40 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm3, %xmm0 + mulps 44 * SIZE(AA), %xmm3 + addps %xmm0, %xmm6 + movaps 64 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm3, %xmm1 + mulps 52 * SIZE(AA), %xmm3 + addps %xmm1, %xmm4 + movaps 56 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm3, %xmm1 + mulps 60 * SIZE(AA), %xmm3 
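/* note: this single-column pass keeps four partial sums (%xmm4-%xmm7) for
   the eight-row block; they are folded to two at .L114 below and then
   scaled by ALPHA = {alpha_r, alpha_i, alpha_r, alpha_i}.  Each real
   accumulator t effectively contributes
       c[2*i]   += alpha_r * t;
       c[2*i+1] += alpha_i * t;
   which is what the pshufd $0x50 / shufps $0xfa pair sets up before the
   mulps/addps against C.  One pass of this unrolled loop consumes 64*SIZE
   of A and 32*SIZE of the packed B buffer. */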
+ addps %xmm1, %xmm6 + movaps 80 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + + addl $64 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L111 + ALIGN_2 + +#else + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movaps 0 * SIZE + BUFFER, %xmm2 + XORPS %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + movaps 8 * SIZE + BUFFER, %xmm3 + XORPS %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 2), BB /* because it's doubled */ + + movaps 0 * SIZE(BB), %xmm2 + XORPS %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + movaps 8 * SIZE(BB), %xmm3 + XORPS %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $8, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L112 + ALIGN_2 + +.L111: + mulps %xmm2, %xmm0 + mulps 4 * SIZE(AA), %xmm2 + addps %xmm0, %xmm4 + movaps 16 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm2, %xmm1 + mulps 12 * SIZE(AA), %xmm2 + addps %xmm1, %xmm6 + movaps 24 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 16 * SIZE(BB), %xmm2 + mulps %xmm3, %xmm0 + mulps 20 * SIZE(AA), %xmm3 + addps %xmm0, %xmm4 + movaps 32 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 12 * SIZE(BB), %xmm3 + mulps %xmm3, %xmm1 + mulps 28 * SIZE(AA), %xmm3 + addps %xmm1, %xmm6 + movaps 40 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm2, %xmm0 + mulps 36 * SIZE(AA), %xmm2 + addps %xmm0, %xmm4 + movaps 48 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 20 * SIZE(BB), %xmm2 + mulps %xmm2, %xmm1 + mulps 44 * SIZE(AA), %xmm2 + addps %xmm1, %xmm6 + movaps 56 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm3, %xmm0 + mulps 52 * SIZE(AA), %xmm3 + addps %xmm0, %xmm4 + movaps 64 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm3, %xmm1 + mulps 60 * SIZE(AA), %xmm3 + addps %xmm1, %xmm6 + movaps 72 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 40 * SIZE(BB), %xmm3 + + addl $64 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L111 + ALIGN_2 +#endif + +.L112: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L114 + +.L113: + movaps 0 * SIZE(BB), %xmm2 + movaps 0 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm0 + addps %xmm0, %xmm4 + mulps 4 * SIZE(AA), %xmm2 + addps %xmm2, %xmm5 + + addl $8 * SIZE, AA + addl $4 * SIZE, BB + subl $1, %eax + jg .L113 + ALIGN_4 + +.L114: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + MOVSD 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + MOVSD 4 * SIZE(%esi), %xmm1 + movhps 6 * SIZE(%esi), %xmm1 + +#ifdef HAVE_SSE2 + pshufd $0x50, %xmm4, %xmm2 +#else + movaps %xmm4, %xmm2 + shufps $0x50, %xmm2, %xmm2 +#endif + shufps $0xfa, %xmm4, %xmm4 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + movlps %xmm1, 4 * SIZE(%esi) + movhps %xmm1, 6 * SIZE(%esi) + + MOVSD 8 * SIZE(%esi), %xmm0 + movhps 10 * SIZE(%esi), %xmm0 + MOVSD 12 
* SIZE(%esi), %xmm1 + movhps 14 * SIZE(%esi), %xmm1 + +#ifdef HAVE_SSE2 + pshufd $0x50, %xmm5, %xmm2 +#else + movaps %xmm5, %xmm2 + shufps $0x50, %xmm2, %xmm2 +#endif + shufps $0xfa, %xmm5, %xmm5 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm5 + + addps %xmm2, %xmm0 + addps %xmm5, %xmm1 + + movlps %xmm0, 8 * SIZE(%esi) + movhps %xmm0, 10 * SIZE(%esi) + movlps %xmm1, 12 * SIZE(%esi) + movhps %xmm1, 14 * SIZE(%esi) + + addl $16 * SIZE, %esi + BRANCH + decl %ebx # i -- + jg .L110 + ALIGN_2 + +.L130: + movl M, %ebx + andl $7, %ebx + jle .L999 + + testl $4, %ebx + jle .L150 + +#if (L1_DATA_LINESIZE == 64) +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movaps 0 * SIZE + BUFFER, %xmm2 + XORPS %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + movaps 16 * SIZE + BUFFER, %xmm3 + XORPS %xmm6, %xmm6 + movaps 16 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB /* because it's doubled */ + + movaps 0 * SIZE(BB), %xmm2 + XORPS %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + XORPS %xmm6, %xmm6 + movaps 16 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L132 + ALIGN_2 + +.L131: + mulps %xmm0, %xmm2 + movaps 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + mulps 4 * SIZE(BB), %xmm0 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 8 * SIZE(AA), %xmm0 + mulps 8 * SIZE(BB), %xmm0 + addps %xmm0, %xmm6 + movaps 12 * SIZE(AA), %xmm0 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm3 + movaps 20 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + mulps 20 * SIZE(BB), %xmm1 + movaps 48 * SIZE(BB), %xmm3 + addps %xmm1, %xmm5 + movaps 24 * SIZE(AA), %xmm1 + mulps 24 * SIZE(BB), %xmm1 + addps %xmm1, %xmm6 + movaps 28 * SIZE(AA), %xmm1 + mulps 28 * SIZE(BB), %xmm1 + addps %xmm1, %xmm7 + movaps 48 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L131 + ALIGN_2 + +#else +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movaps 0 * SIZE + BUFFER, %xmm2 + XORPS %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + movaps 8 * SIZE + BUFFER, %xmm3 + XORPS %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB /* because it's doubled */ + + movaps 0 * SIZE(BB), %xmm2 + XORPS %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + movaps 8 * SIZE(BB), %xmm3 + XORPS %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L132 + ALIGN_2 + +.L131: + mulps %xmm0, %xmm2 + movaps 4 * SIZE(AA), %xmm0 + 
addps %xmm2, %xmm4 + mulps 4 * SIZE(BB), %xmm0 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 16 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm3 + movaps 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm6 + mulps 12 * SIZE(BB), %xmm1 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 24 * SIZE(AA), %xmm1 + mulps %xmm0, %xmm2 + movaps 20 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + mulps 20 * SIZE(BB), %xmm0 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 32 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm3 + movaps 28 * SIZE(AA), %xmm1 + addps %xmm3, %xmm6 + mulps 28 * SIZE(BB), %xmm1 + movaps 40 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 40 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L131 + ALIGN_2 +#endif + +.L132: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L134 + +.L133: + movaps 0 * SIZE(BB), %xmm2 + movaps 0 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L133 + ALIGN_4 + +.L134: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + addps %xmm6, %xmm4 + + MOVSD 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + MOVSD 4 * SIZE(%esi), %xmm1 + movhps 6 * SIZE(%esi), %xmm1 + +#ifdef HAVE_SSE2 + pshufd $0x50, %xmm4, %xmm2 +#else + movaps %xmm4, %xmm2 + shufps $0x50, %xmm2, %xmm2 +#endif + shufps $0xfa, %xmm4, %xmm4 + + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm4, %xmm1 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + movlps %xmm1, 4 * SIZE(%esi) + movhps %xmm1, 6 * SIZE(%esi) + + addl $8 * SIZE, %esi + ALIGN_2 + +.L150: + testl $2, %ebx + jle .L170 + +#if (L1_DATA_LINESIZE == 64) +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + MOVSD 0 * SIZE + BUFFER, %xmm2 + XORPS %xmm4, %xmm4 + MOVSD 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + MOVSD 16 * SIZE + BUFFER, %xmm3 + XORPS %xmm6, %xmm6 + MOVSD 8 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB /* because it's doubled */ + + MOVSD 0 * SIZE(BB), %xmm2 + XORPS %xmm4, %xmm4 + MOVSD 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + MOVSD 16 * SIZE(BB), %xmm3 + XORPS %xmm6, %xmm6 + MOVSD 8 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L152 + ALIGN_2 + +.L151: + mulps %xmm0, %xmm2 + MOVSD 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + MOVSD 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + MOVSD 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + MOVSD 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + MOVSD 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + MOVSD 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + MOVSD 16 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + MOVSD 32 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + MOVSD 10 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + MOVSD 20 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + MOVSD 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + MOVSD 24 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + MOVSD 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm6 + MOVSD 28 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + MOVSD 24 * SIZE(AA), 
%xmm1 + addps %xmm3, %xmm7 + MOVSD 48 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L151 + ALIGN_2 + +#else +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + MOVSD 0 * SIZE + BUFFER, %xmm2 + XORPS %xmm4, %xmm4 + MOVSD 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + MOVSD 8 * SIZE + BUFFER, %xmm3 + XORPS %xmm6, %xmm6 + MOVSD 8 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB /* because it's doubled */ + + MOVSD 0 * SIZE(BB), %xmm2 + XORPS %xmm4, %xmm4 + MOVSD 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + MOVSD 8 * SIZE(BB), %xmm3 + XORPS %xmm6, %xmm6 + MOVSD 8 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L152 + ALIGN_2 + +.L151: + mulps %xmm0, %xmm2 + MOVSD 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + MOVSD 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + MOVSD 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + MOVSD 16 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + MOVSD 6 * SIZE(AA), %xmm0 + addps %xmm3, %xmm6 + MOVSD 12 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + MOVSD 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + MOVSD 24 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + MOVSD 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm4 + MOVSD 20 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + MOVSD 12 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + MOVSD 32 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + MOVSD 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm6 + MOVSD 28 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + MOVSD 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + MOVSD 40 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L151 + ALIGN_2 +#endif + +.L152: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L154 + +.L153: + mulps %xmm0, %xmm2 + MOVSD 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + MOVSD 4 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L153 + ALIGN_4 + +.L154: + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + addps %xmm6, %xmm4 + + MOVSD 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + +#ifdef HAVE_SSE2 + pshufd $0x50, %xmm4, %xmm2 +#else + movaps %xmm4, %xmm2 + shufps $0x50, %xmm2, %xmm2 +#endif + mulps %xmm3, %xmm2 + addps %xmm2, %xmm0 + + movlps %xmm0, 0 * SIZE(%esi) + movhps %xmm0, 2 * SIZE(%esi) + + addl $4 * SIZE, %esi + ALIGN_2 + +.L170: + testl $1, %ebx + jle .L999 + +#if (L1_DATA_LINESIZE == 64) +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movss 0 * SIZE + BUFFER, %xmm2 + XORPS %xmm4, %xmm4 + movss 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + movss 16 * SIZE + BUFFER, %xmm3 + XORPS %xmm6, %xmm6 + movss 4 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movss 0 * SIZE(BB), %xmm2 + XORPS %xmm4, %xmm4 + movss 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + movss 16 * SIZE(BB), %xmm3 + XORPS %xmm6, 
%xmm6 + movss 4 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L172 + ALIGN_2 + +.L171: + mulss %xmm0, %xmm2 + movss 1 * SIZE(AA), %xmm0 + addss %xmm2, %xmm4 + mulss 4 * SIZE(BB), %xmm0 + movss 32 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 2 * SIZE(AA), %xmm0 + mulss 8 * SIZE(BB), %xmm0 + addss %xmm0, %xmm6 + movss 3 * SIZE(AA), %xmm0 + mulss 12 * SIZE(BB), %xmm0 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm3 + movss 5 * SIZE(AA), %xmm1 + addss %xmm3, %xmm4 + mulss 20 * SIZE(BB), %xmm1 + movss 48 * SIZE(BB), %xmm3 + addss %xmm1, %xmm5 + movss 6 * SIZE(AA), %xmm1 + mulss 24 * SIZE(BB), %xmm1 + addss %xmm1, %xmm6 + movss 7 * SIZE(AA), %xmm1 + mulss 28 * SIZE(BB), %xmm1 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L171 + ALIGN_2 + +#else +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movss 0 * SIZE + BUFFER, %xmm2 + XORPS %xmm4, %xmm4 + movss 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + movss 8 * SIZE + BUFFER, %xmm3 + XORPS %xmm6, %xmm6 + movss 4 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 4), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movss 0 * SIZE(BB), %xmm2 + XORPS %xmm4, %xmm4 + movss 0 * SIZE(AA), %xmm0 + XORPS %xmm5, %xmm5 + movss 8 * SIZE(BB), %xmm3 + XORPS %xmm6, %xmm6 + movss 4 * SIZE(AA), %xmm1 + XORPS %xmm7, %xmm7 +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L172 + ALIGN_2 + +.L171: + mulss %xmm0, %xmm2 + movss 1 * SIZE(AA), %xmm0 + addss %xmm2, %xmm4 + mulss 4 * SIZE(BB), %xmm0 + movss 16 * SIZE(BB), %xmm2 + addss %xmm0, %xmm5 + movss 2 * SIZE(AA), %xmm0 + mulss %xmm0, %xmm3 + movss 3 * SIZE(AA), %xmm0 + addss %xmm3, %xmm6 + mulss 12 * SIZE(BB), %xmm0 + movss 24 * SIZE(BB), %xmm3 + addss %xmm0, %xmm7 + movss 8 * SIZE(AA), %xmm0 + mulss %xmm1, %xmm2 + movss 5 * SIZE(AA), %xmm1 + addss %xmm2, %xmm4 + mulss 20 * SIZE(BB), %xmm1 + movss 32 * SIZE(BB), %xmm2 + addss %xmm1, %xmm5 + movss 6 * SIZE(AA), %xmm1 + mulss %xmm1, %xmm3 + movss 7 * SIZE(AA), %xmm1 + addss %xmm3, %xmm6 + mulss 28 * SIZE(BB), %xmm1 + movss 40 * SIZE(BB), %xmm3 + addss %xmm1, %xmm7 + movss 12 * SIZE(AA), %xmm1 + + addl $ 8 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L171 + ALIGN_2 +#endif + +.L172: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L174 + +.L173: + movss 0 * SIZE(AA), %xmm0 + movss 0 * SIZE(BB), %xmm2 + mulss %xmm0, %xmm2 + addss %xmm2, %xmm4 + + addl $1 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L173 + ALIGN_4 + +.L174: + addss %xmm5, %xmm4 + addss %xmm7, %xmm6 + addss %xmm6, %xmm4 + + MOVSD 0 * SIZE(%esi), %xmm0 + +#ifdef HAVE_SSE2 + pshufd $0x50, %xmm4, %xmm2 +#else + movaps %xmm4, %xmm2 + shufps $0x50, %xmm2, %xmm2 +#endif + + mulps 
%xmm3, %xmm2 + addps %xmm2, %xmm0 + + movlps %xmm0, 0 * SIZE(%esi) + ALIGN_2 + +.L999: + movl OLD_STACK, %esp + + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_2 + + + EPILOGUE diff --git a/kernel/x86/zgemm_beta.S b/kernel/x86/zgemm_beta.S new file mode 100644 index 0000000000..c36e7c5082 --- /dev/null +++ b/kernel/x86/zgemm_beta.S @@ -0,0 +1,242 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#ifdef DOUBLE +#define BETA_R 16 + STACK + ARGS(%esp) +#define BETA_I 24 + STACK + ARGS(%esp) +#define C 48 + STACK + ARGS(%esp) +#define LDC 52 + STACK + ARGS(%esp) +#else +#define BETA_R 16 + STACK + ARGS(%esp) +#define BETA_I 20 + STACK + ARGS(%esp) +#define C 40 + STACK + ARGS(%esp) +#define LDC 44 + STACK + ARGS(%esp) +#endif + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + + PROFCODE + + movl M, %ebp + movl N, %ecx + movl LDC, %edx + movl C, %edi + + FLD BETA_R + FLD BETA_I + + testl %ebp, %ebp # if n <= 0 goto End + jle .L83 + testl %ecx, %ecx # if m <= 0 goto End + jle .L83 + + fld %st(1) + fabs + fld %st(1) + fabs + faddp %st, %st(1) + + sall $ZBASE_SHIFT, %edx + + ftst + fnstsw %ax + andb $68, %ah +#ifndef C_SUN + ffreep %st(0) +#else + .byte 0xdf + .byte 0xc0 +#endif + je .L71 + ALIGN_2 + +.L53: + movl %edi, %esi # c_offset1 = c_offset + addl %edx, %edi # c_offset += ldc + + movl %ebp, %eax + sarl $2, %eax + jle .L56 + ALIGN_2 + +.L57: +#if defined(HAS_PREFETCH) && defined(PENTIUM3) + prefetchnta 16 * SIZE(%esi) + prefetchnta 24 * SIZE(%esi) +#endif + + FSTU 0 * SIZE(%esi) # c_offset1 + FSTU 1 * SIZE(%esi) + FSTU 2 * SIZE(%esi) + FSTU 3 * SIZE(%esi) + FSTU 4 * SIZE(%esi) + FSTU 5 * SIZE(%esi) + FSTU 6 * SIZE(%esi) + FSTU 7 * SIZE(%esi) + addl $8 * SIZE, %esi # c_offset1 += 8 + decl %eax # i-- + jg .L57 + ALIGN_2 + +.L56: + movl %ebp, %eax + andl $3, %eax + jle .L62 + ALIGN_2 + +.L63: + FSTU 0 * SIZE(%esi) + FSTU 1 * SIZE(%esi) + addl $2 * SIZE,%esi + decl %eax + jg .L63 + ALIGN_2 + +.L62: + decl %ecx # j -- + jg .L53 + jmp .L83 + ALIGN_3 + +.L71: + movl %edi, %esi + addl %edx, %edi # c_offset += ldc + + + movl %ebp, %eax + sarl $1, %eax + jle .L84 + ALIGN_3 + +.L85: +#if defined(HAS_PREFETCH) && defined(PENTIUM3) + prefetchnta 16 * SIZE(%esi) +#endif + fld %st(0) + FMUL 0 * SIZE(%esi) + fld %st(2) + FMUL 1 * SIZE(%esi) + faddp %st,%st(1) + fld %st(2) + FMUL 0 * SIZE(%esi) + fld %st(2) + FMUL 1 * SIZE(%esi) + fsubrp %st,%st(1) + + FST 0 * SIZE(%esi) + FST 1 * SIZE(%esi) + + fld %st(0) + FMUL 2 * SIZE(%esi) + fld %st(2) + FMUL 3 * SIZE(%esi) + faddp %st,%st(1) + fld %st(2) + FMUL 2 * SIZE(%esi) + fld %st(2) + FMUL 3 * SIZE(%esi) + fsubrp %st,%st(1) + + FST 2 * SIZE(%esi) + FST 3 * SIZE(%esi) + addl $4 * SIZE, %esi + + decl %eax + jg .L85 + ALIGN_3 + +.L84: + movl %ebp, %eax + andl $1, %eax + jle .L74 + ALIGN_3 + +.L75: +#if defined(HAS_PREFETCH) && defined(PENTIUM3) + prefetchnta 16 * SIZE(%esi) +#endif + + fld %st(0) + FMUL 0 * SIZE(%esi) + fld %st(2) + FMUL 1 * SIZE(%esi) + faddp %st,%st(1) + fld %st(2) + FMUL 0 * SIZE(%esi) + fld %st(2) + FMUL 1 * SIZE(%esi) + fsubrp %st,%st(1) + + FST 0 * SIZE(%esi) + FST 1 * SIZE(%esi) + ALIGN_2 + +.L74: + decl %ecx + jg .L71 + ALIGN_2 + +.L83: +#ifndef C_SUN + ffreep %st(0) + ffreep %st(0) +#else + .byte 0xdf + .byte 0xc0 + .byte 0xdf + .byte 0xc0 +#endif + + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm_kernel_1x1.S b/kernel/x86/zgemm_kernel_1x1.S new file mode 100644 index 0000000000..117b245e27 --- /dev/null +++ b/kernel/x86/zgemm_kernel_1x1.S @@ -0,0 +1,450 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define BX 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_N 8 + STACK + ARGS(%esp) +#define STACK_K 12 + STACK + ARGS(%esp) +#ifdef DOUBLE +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 24 + STACK + ARGS(%esp) +#define STACK_A 32 + STACK + ARGS(%esp) +#define STACK_B 36 + STACK + ARGS(%esp) +#define STACK_C 40 + STACK + ARGS(%esp) +#define STACK_LDC 44 + STACK + ARGS(%esp) +#define OFFSET 48 + STACK + ARGS(%esp) +#else +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 20 + STACK + ARGS(%esp) +#define STACK_A 24 + STACK + ARGS(%esp) +#define STACK_B 28 + STACK + ARGS(%esp) +#define STACK_C 32 + STACK + ARGS(%esp) +#define STACK_LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) +#endif + + PROLOGUE + + subl $ARGS, %esp + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#define M %esi +#define K %edi + +#define A %ebx +#define B %ecx +#define C %edx +#define LDC %ebp + +#if defined(TRMMKERNEL) && !defined(LEFT) + movl OFFSET, %eax + negl %eax + movl %eax, KK +#endif + + movl STACK_K, K + movl STACK_LDC, LDC + sall $ZBASE_SHIFT, LDC + + cmpl $0, STACK_N + jle .L29 + cmpl $0, STACK_M + jle .L29 + ALIGN_4 + +.L30: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl %ebx, BX + + movl STACK_A, A + movl STACK_C, C + movl STACK_M, M + ALIGN_4 + +.L34: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl STACK_B, B +#else + movl STACK_B, B + movl KK, %eax + leal (, %eax, SIZE), %eax + leal 
(A, %eax, 2), A + leal (B, %eax, 2), B +#endif + +#ifdef HAVE_SSE + movl BX, %eax + + prefetcht2 0 * SIZE(%eax) + prefetcht2 4 * SIZE(%eax) + +#if L2_SIZE > 262144 + + subl $-8 * SIZE, BX + +#elif L2_SIZE > 131072 + + prefetcht2 8 * SIZE(%eax) + prefetcht2 12 * SIZE(%eax) + + + subl $-16 * SIZE, BX +#else + prefetcht2 16 * SIZE(%eax) + prefetcht2 20 * SIZE(%eax) + prefetcht2 24 * SIZE(%eax) + prefetcht2 28 * SIZE(%eax) + + subl $-32 * SIZE, BX +#endif +#endif + + fldz + fldz + fldz + fldz + + FLD 4 * SIZE(B) # B5 + FLD 4 * SIZE(A) # A5 + FLD 0 * SIZE(B) # B0 + FLD 0 * SIZE(A) # A0 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + +#ifdef HAVE_SSE + prefetcht2 2 * SIZE(C) +#endif + sarl $2, %eax + je .L37 + ALIGN_4 + +#define PREFETCH_OFFSET 40 + +.L38: +#ifdef HAVE_SSE + prefetchnta (PREFETCH_OFFSET) * SIZE(B) +#ifdef CORE_KATMAI + prefetcht0 (PREFETCH_OFFSET) * SIZE(A) +#endif +#endif + fmul %st, %st(1) + FMUL 1 * SIZE(B) + fxch %st(1) + faddp %st, %st(5) + FLD 0 * SIZE(B) + fxch %st(1) +#if defined(NN) || defined(CN) + faddp %st, %st(4) +#else + fsubrp %st, %st(4) +#endif + FLD 1 * SIZE(A) + fmul %st, %st(1) + FMUL 1 * SIZE(B) + fxch %st(1) +#if defined(NN) || defined(NC) + faddp %st, %st(7) +#else + fsubrp %st, %st(7) +#endif + FLD 2 * SIZE(B) + fxch %st(1) +#if defined(NN) || defined(CC) + fsubrp %st, %st(6) +#else + faddp %st, %st(6) +#endif + FLD 2 * SIZE(A) + + fmul %st, %st(1) + FMUL 3 * SIZE(B) + fxch %st(1) + faddp %st, %st(5) + FLD 2 * SIZE(B) + fxch %st(1) +#if defined(NN) || defined(CN) + faddp %st, %st(4) +#else + fsubrp %st, %st(4) +#endif + FLD 3 * SIZE(A) + fmul %st, %st(1) + FMUL 3 * SIZE(B) + fxch %st(1) +#if defined(NN) || defined(NC) + faddp %st, %st(7) +#else + fsubrp %st, %st(7) +#endif + FLD 8 * SIZE(B) + fxch %st(1) +#if defined(NN) || defined(CC) + fsubrp %st, %st(6) +#else + faddp %st, %st(6) +#endif + FLD 8 * SIZE(A) + fxch %st(2) + +#ifdef HAVE_SSE +#ifdef DOUBLE + prefetchnta (PREFETCH_OFFSET + 4) * SIZE(B) +#ifdef CORE_KATMAI + prefetcht0 (PREFETCH_OFFSET + 4) * SIZE(A) +#endif +#endif +#endif + + fmul %st, %st(3) + FMUL 5 * SIZE(B) + fxch %st(3) + faddp %st, %st(5) + FLD 4 * SIZE(B) + fxch %st(3) +#if defined(NN) || defined(CN) + faddp %st, %st(4) +#else + fsubrp %st, %st(4) +#endif + FLD 5 * SIZE(A) + fmul %st, %st(3) + FMUL 5 * SIZE(B) + fxch %st(3) +#if defined(NN) || defined(NC) + faddp %st, %st(7) +#else + fsubrp %st, %st(7) +#endif + FLD 6 * SIZE(B) + fxch %st(3) +#if defined(NN) || defined(CC) + fsubrp %st, %st(6) +#else + faddp %st, %st(6) +#endif + FLD 6 * SIZE(A) + + fmul %st, %st(3) + FMUL 7 * SIZE(B) + fxch %st(3) + faddp %st, %st(5) + FLD 6 * SIZE(B) + fxch %st(3) +#if defined(NN) || defined(CN) + faddp %st, %st(4) +#else + fsubrp %st, %st(4) +#endif + FLD 7 * SIZE(A) + fmul %st, %st(3) + FMUL 7 * SIZE(B) + fxch %st(3) +#if defined(NN) || defined(NC) + faddp %st, %st(7) +#else + fsubrp %st, %st(7) +#endif + FLD 12 * SIZE(B) + fxch %st(3) +#if defined(NN) || defined(CC) + fsubrp %st, %st(6) +#else + faddp %st, %st(6) +#endif + FLD 12 * SIZE(A) + fxch %st(2) + + subl $-8 * SIZE, B + subl $-8 * SIZE, A + decl %eax + jg .L38 + ALIGN_4 + +.L37: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $3, %eax + jle .L43 + ALIGN_2 + +.L54: + fmul %st, %st(1) + FMUL 1 * SIZE(B) + fxch %st(1) + faddp %st, %st(5) + 
+ FLD 0 * SIZE(B) + fxch %st(1) +#if defined(NN) || defined(CN) + faddp %st, %st(4) +#else + fsubrp %st, %st(4) +#endif + + FLD 1 * SIZE(A) + fmul %st, %st(1) + FMUL 1 * SIZE(B) + fxch %st(1) +#if defined(NN) || defined(NC) + faddp %st, %st(7) +#else + fsubrp %st, %st(7) +#endif + FLD 2 * SIZE(B) + fxch %st(1) +#if defined(NN) || defined(CC) + fsubrp %st, %st(6) +#else + faddp %st, %st(6) +#endif + FLD 2 * SIZE(A) + + addl $2 * SIZE, A + addl $2 * SIZE, B + decl %eax + jg .L54 + ALIGN_3 + +.L43: + ffreep %st(0) + ffreep %st(0) + ffreep %st(0) + ffreep %st(0) + + FLD ALPHA_R + fxch %st(3) + FLD ALPHA_I + fxch %st(5) + + faddp %st, %st(2) # ctemp3 += ctemp4 + faddp %st, %st(2) # ctemp1 += ctemp2 + + fld %st(0) # copy ctemp2 + fmul %st(4), %st # ctemp3 *= alpha_i + fld %st(2) # copy ctemp1 + fmul %st(4), %st # ctemp1 *= alpha_r + fsubp %st, %st(1) # ctemp2 -= ctemp4 + +#ifndef TRMMKERNEL + FADD 0 * SIZE(C) +#endif + FST 0 * SIZE(C) + + fmulp %st, %st(2) # ctemp3 *= alpha_i + fmulp %st, %st(2) # ctemp1 *= alpha_r + faddp %st, %st(1) # ctemp1 += ctemp3 + +#ifndef TRMMKERNEL + FADD 1 * SIZE(C) +#endif + FST 1 * SIZE(C) + addl $2 * SIZE, C + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (A, %eax, 2), A + leal (B, %eax, 2), B +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + decl M + jg .L34 + ALIGN_2 + +.L33: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $1, KK +#endif + + movl B, STACK_B + addl LDC, STACK_C + decl STACK_N + jg .L30 + ALIGN_2 + +.L29: + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm_kernel_1x1_atom.S b/kernel/x86/zgemm_kernel_1x1_atom.S new file mode 100644 index 0000000000..5d276b943b --- /dev/null +++ b/kernel/x86/zgemm_kernel_1x1_atom.S @@ -0,0 +1,351 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 24 + STACK + ARGS(%esp) +#define A 32 + STACK + ARGS(%esp) +#define ARG_B 36 + STACK + ARGS(%esp) +#define C 40 + STACK + ARGS(%esp) +#define ARG_LDC 44 + STACK + ARGS(%esp) +#define OFFSET 48 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define BX 4 + STACK(%esp) +#define KK 8 + STACK(%esp) +#define KKK 12 + STACK(%esp) + +#define PREFETCH prefetcht0 +#define PREFETCHSIZE 84 + +#define AA %edx +#define BB %ecx +#define CO1 %esi +#define LDC %ebp +#define B %edi + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define ADDSD1 addsd +#define ADDSD2 addsd +#define ADDSD3 addsd +#define ADDSD4 subsd +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define ADDSD1 addsd +#define ADDSD2 subsd +#define ADDSD3 addsd +#define ADDSD4 addsd +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define ADDSD1 addsd +#define ADDSD2 addsd +#define ADDSD3 subsd +#define ADDSD4 addsd +#else +#define ADDSD1 addsd +#define ADDSD2 subsd +#define ADDSD3 subsd +#define ADDSD4 subsd +#endif + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + +#ifdef TRMMKERNEL + movl OFFSET, %eax +#ifndef LEFT + negl %eax +#endif + movl %eax, KK +#endif + + sall $ZBASE_SHIFT, LDC + + movl M, %ebx + testl %ebx, %ebx + jle .L999 + + movl N, %eax + testl %eax, %eax + movl %eax, J + jle .L999 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl B, BX + + movl C, CO1 + addl LDC, C + + movl A, AA + + movl M, %ebx + ALIGN_4 + +.L10: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), BB +#endif + + movl BX, %eax + prefetcht0 0 * SIZE(%eax) + subl $-8 * SIZE, BX + + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + + xorps %xmm4, %xmm4 + prefetcht0 1 * SIZE(CO1) + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $2, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + ADDSD3 %xmm2, %xmm6 + movsd 1 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 0 * SIZE(BB), %xmm0 + ADDSD4 %xmm3, %xmm7 + mulsd 1 * SIZE(BB), %xmm1 + + ADDSD1 %xmm0, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 0 * SIZE(BB), %xmm2 + ADDSD2 %xmm1, %xmm5 + mulsd 1 * SIZE(BB), %xmm3 + + ADDSD3 %xmm2, %xmm6 + movsd 3 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 2 * SIZE(BB), %xmm0 + ADDSD4 %xmm3, %xmm7 + mulsd 3 * SIZE(BB), %xmm1 + + ADDSD1 
%xmm0, %xmm4 + movsd 4 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 2 * SIZE(BB), %xmm2 + ADDSD2 %xmm1, %xmm5 + mulsd 3 * SIZE(BB), %xmm3 + + ADDSD3 %xmm2, %xmm6 + movsd 5 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 4 * SIZE(BB), %xmm0 + ADDSD4 %xmm3, %xmm7 + mulsd 5 * SIZE(BB), %xmm1 + + ADDSD1 %xmm0, %xmm4 + movsd 6 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 4 * SIZE(BB), %xmm2 + ADDSD2 %xmm1, %xmm5 + mulsd 5 * SIZE(BB), %xmm3 + + ADDSD3 %xmm2, %xmm6 + movsd 7 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 6 * SIZE(BB), %xmm0 + ADDSD4 %xmm3, %xmm7 + mulsd 7 * SIZE(BB), %xmm1 + + ADDSD1 %xmm0, %xmm4 + movsd 8 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 6 * SIZE(BB), %xmm2 + ADDSD2 %xmm1, %xmm5 + mulsd 7 * SIZE(BB), %xmm3 + + addl $8 * SIZE, BB + addl $8 * SIZE, AA + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $3, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + ADDSD3 %xmm2, %xmm6 + movsd 1 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 0 * SIZE(BB), %xmm0 + ADDSD4 %xmm3, %xmm7 + mulsd 1 * SIZE(BB), %xmm1 + + ADDSD1 %xmm0, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 0 * SIZE(BB), %xmm2 + ADDSD2 %xmm1, %xmm5 + mulsd 1 * SIZE(BB), %xmm3 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + movsd ALPHA_R, %xmm0 + movsd ALPHA_I, %xmm1 + + ADDSD3 %xmm2, %xmm6 + ADDSD4 %xmm3, %xmm7 + + addsd %xmm7, %xmm4 + addsd %xmm5, %xmm6 + + movaps %xmm4, %xmm5 + movaps %xmm6, %xmm7 + + mulsd %xmm0, %xmm4 + mulsd %xmm1, %xmm5 + mulsd %xmm1, %xmm6 + mulsd %xmm0, %xmm7 + + subsd %xmm6, %xmm4 + addsd %xmm7, %xmm5 + +#ifndef TRMMKERNEL + addsd 0 * SIZE(CO1), %xmm4 + addsd 1 * SIZE(CO1), %xmm5 +#endif + + movsd %xmm4, 0 * SIZE(CO1) + movsd %xmm5, 1 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + + addl $2 * SIZE, CO1 + decl %ebx + jg .L10 + ALIGN_4 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $1, KK +#endif + + movl BB, B + decl J + jg .L01 + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm_kernel_1x2.S b/kernel/x86/zgemm_kernel_1x2.S new file mode 100644 index 0000000000..0f98069745 --- /dev/null +++ b/kernel/x86/zgemm_kernel_1x2.S @@ -0,0 +1,813 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define J 0 + STACK(%esp) +#define I 4 + STACK(%esp) +#define KK 8 + STACK(%esp) +#define KKK 12 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#ifdef DOUBLE +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 24 + STACK + ARGS(%esp) +#define STACK_A 32 + STACK + ARGS(%esp) +#define STACK_B 36 + STACK + ARGS(%esp) +#define C 40 + STACK + ARGS(%esp) +#define STACK_LDC 44 + STACK + ARGS(%esp) +#define OFFSET 48 + STACK + ARGS(%esp) +#else +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 20 + STACK + ARGS(%esp) +#define STACK_A 24 + STACK + ARGS(%esp) +#define STACK_B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define STACK_LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) +#endif + +#define A %edx +#define B %ecx +#define BB %ebx +#define LDC %ebp +#define BX %esi + +#define ADD1 faddp + +#if defined(NN) || defined(CN) +#define ADD2 faddp +#else +#define ADD2 fsubrp +#endif + +#if defined(NN) || defined(CC) +#define ADD3 fsubrp +#else +#define ADD3 faddp +#endif + +#if defined(NN) || defined(NC) +#define ADD4 faddp +#else +#define ADD4 fsubrp +#endif + +#define PREFETCHSIZE (8 * 5 + 4) + +#define AOFFSET 1 +#define BOFFSET 1 + +#ifdef HAVE_3DNOW +#define PREFETCH prefetch +#else +#define PREFETCH prefetcht0 +#endif + +#define KERNEL \ + PREFETCH PREFETCHSIZE * SIZE + AOFFSET(A, %eax, 2);\ + fmul %st(1), %st;\ + ADD1 %st, %st(4);\ + FLD -15 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD2 %st, %st(5);\ + FLD -14 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD1 %st, %st(6);\ + FMUL -13 * SIZE + BOFFSET(B, %eax, 4);\ + ADD2 %st, %st(6);\ + FLD -15 * SIZE + AOFFSET(A, %eax, 2);\ + FLD -15 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD3 %st, %st(4);\ + FLD -16 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD4 %st, %st(5);\ + FLD -13 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD3 %st, %st(6);\ + FMUL -14 * SIZE + BOFFSET(B, %eax, 4);\ + ADD4 %st, %st(6);\ + FLD -14 * SIZE + AOFFSET(A, %eax, 2);\ + FLD -12 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD1 %st, %st(4);\ + FLD -11 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD2 %st, %st(5);\ + 
FLD -10 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD1 %st, %st(6);\ + FMUL -9 * SIZE + BOFFSET(B, %eax, 4);\ + ADD2 %st, %st(6);\ + FLD -13 * SIZE + AOFFSET(A, %eax, 2);\ + FLD -11 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD3 %st, %st(4);\ + FLD -12 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD4 %st, %st(5);\ + FLD -9 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD3 %st, %st(6);\ + FMUL -10 * SIZE + BOFFSET(B, %eax, 4);\ + ADD4 %st, %st(6);\ + FLD -12 * SIZE + AOFFSET(A, %eax, 2);\ + FLD -8 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD1 %st, %st(4);\ + FLD -7 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD2 %st, %st(5);\ + FLD -6 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD1 %st, %st(6);\ + FMUL -5 * SIZE + BOFFSET(B, %eax, 4);\ + ADD2 %st, %st(6);\ + FLD -11 * SIZE + AOFFSET(A, %eax, 2);\ + FLD -7 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD3 %st, %st(4);\ + FLD -8 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD4 %st, %st(5);\ + FLD -5 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD3 %st, %st(6);\ + FMUL -6 * SIZE + BOFFSET(B, %eax, 4);\ + ADD4 %st, %st(6);\ + FLD -10 * SIZE + AOFFSET(A, %eax, 2);\ + FLD -4 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD1 %st, %st(4);\ + FLD -3 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD2 %st, %st(5);\ + FLD -2 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD1 %st, %st(6);\ + FMUL -1 * SIZE + BOFFSET(B, %eax, 4);\ + ADD2 %st, %st(6);\ + FLD -9 * SIZE + AOFFSET(A, %eax, 2);\ + FLD -3 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD3 %st, %st(4);\ + FLD -4 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD4 %st, %st(5);\ + FLD -1 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD3 %st, %st(6);\ + FMUL -2 * SIZE + BOFFSET(B, %eax, 4);\ + ADD4 %st, %st(6);\ + FLD 8 * SIZE + AOFFSET(A, %eax, 2);\ + fxch %st(1);\ + FLD 0 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD1 %st, %st(4);\ + FLD 1 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + PREFETCH (PREFETCHSIZE + 8) * SIZE + AOFFSET(A, %eax, 2);\ + ADD2 %st, %st(5);\ + FLD 2 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD1 %st, %st(6);\ + FMUL 3 * SIZE + BOFFSET(B, %eax, 4);\ + ADD2 %st, %st(6);\ + FLD -7 * SIZE + AOFFSET(A, %eax, 2);\ + FLD 1 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD3 %st, %st(4);\ + FLD 0 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD4 %st, %st(5);\ + FLD 3 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD3 %st, %st(6);\ + FMUL 2 * SIZE + BOFFSET(B, %eax, 4);\ + ADD4 %st, %st(6);\ + FLD -6 * SIZE + AOFFSET(A, %eax, 2);\ + FLD 4 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD1 %st, %st(4);\ + FLD 5 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD2 %st, %st(5);\ + FLD 6 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD1 %st, %st(6);\ + FMUL 7 * SIZE + BOFFSET(B, %eax, 4);\ + ADD2 %st, %st(6);\ + FLD -5 * SIZE + AOFFSET(A, %eax, 2);\ + FLD 5 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD3 %st, %st(4);\ + FLD 4 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD4 %st, %st(5);\ + FLD 7 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD3 %st, %st(6);\ + FMUL 6 * SIZE + BOFFSET(B, %eax, 4);\ + ADD4 %st, %st(6);\ + FLD -4 * SIZE + AOFFSET(A, %eax, 2);\ + FLD 8 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD1 %st, %st(4);\ + FLD 9 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD2 %st, %st(5);\ + FLD 10 * 
SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD1 %st, %st(6);\ + FMUL 11 * SIZE + BOFFSET(B, %eax, 4);\ + ADD2 %st, %st(6);\ + FLD -3 * SIZE + AOFFSET(A, %eax, 2);\ + FLD 9 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD3 %st, %st(4);\ + FLD 8 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD4 %st, %st(5);\ + FLD 11 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD3 %st, %st(6);\ + FMUL 10 * SIZE + BOFFSET(B, %eax, 4);\ + ADD4 %st, %st(6);\ + FLD -2 * SIZE + AOFFSET(A, %eax, 2);\ + FLD 12 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD1 %st, %st(4);\ + FLD 13 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD2 %st, %st(5);\ + FLD 14 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD1 %st, %st(6);\ + FMUL 15 * SIZE + BOFFSET(B, %eax, 4);\ + ADD2 %st, %st(6);\ + FLD -1 * SIZE + AOFFSET(A, %eax, 2);\ + FLD 13 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD3 %st, %st(4);\ + FLD 12 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD4 %st, %st(5);\ + FLD 15 * SIZE + BOFFSET(B, %eax, 4);\ + fmul %st(1), %st;\ + ADD3 %st, %st(6);\ + FMUL 14 * SIZE + BOFFSET(B, %eax, 4);\ + ADD4 %st, %st(6);\ + FLD 16 * SIZE + AOFFSET(A, %eax, 2);\ + fxch %st(2);\ + FLD 0 * SIZE + BOFFSET(BB, %eax, 4);\ + subl $-8 * SIZE, %eax + +/* + + A hint of scheduling is received from following URL + + http://www.netlib.org/atlas/atlas-comm/msg00260.html + +*/ + + PROLOGUE + + subl $ARGS, %esp # Generate Stack Frame + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(TRMMKERNEL) && !defined(LEFT) + movl OFFSET, %eax + negl %eax + movl %eax, KK +#endif + + movl STACK_LDC, LDC + sall $ZBASE_SHIFT, LDC + + subl $(AOFFSET - 16 * SIZE), STACK_A + subl $(BOFFSET - 16 * SIZE), STACK_B + + movl M, %eax + testl %eax, %eax + jle .L999 + + movl N, %eax + testl %eax, %eax + jle .L999 + + movl K, %eax + testl %eax, %eax + jle .L999 + + movl N, %eax + sarl $1, %eax + movl %eax, J + je .L20 + ALIGN_3 + +.L11: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl STACK_A, A + movl STACK_B, B + movl C, %edi + + movl K, BX + sall $ZBASE_SHIFT + 1, BX + addl B, BX + + movl M, %eax + movl %eax, I + ALIGN_3 + +.L14: + prefetchnta -16 * SIZE + BOFFSET(BX) + prefetchnta -8 * SIZE + BOFFSET(BX) + subl $-16 * SIZE, BX + + movl STACK_B, B + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (A, %eax, 2), A + leal (B, %eax, 4), B +#endif + + fldz + fldz + fldz + fldz + + FLD 0 * SIZE + AOFFSET(A) + FLD -8 * SIZE + AOFFSET(A) + FLD -16 * SIZE + AOFFSET(A) + FLD -16 * SIZE + BOFFSET(B) + +#ifdef HAVE_3DNOW + prefetchw 1 * SIZE(%edi) + prefetchw 2 * SIZE(%edi, LDC) +#elif defined(HAVE_SSE) + prefetcht0 1 * SIZE(%edi) + prefetcht0 2 * SIZE(%edi, LDC) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + + andl $-8, %eax + + leal (, %eax, SIZE), %eax + leal (A, %eax, 2), A + leal 16 * SIZE(B, %eax, 4), BB + leal (B, %eax, 4), B + negl %eax + NOBRANCH + je .L16 + ALIGN_4 + +.L15: + KERNEL + jge .L16 + KERNEL + jge .L16 + KERNEL + jge .L16 + KERNEL + jl .L15 + ALIGN_4 + +.L16: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl 
KKK, %eax +#endif + and $7, %eax + je .L19 + ALIGN_4 + + +.L17: + fmul %st(1), %st + ADD1 %st, %st(4) + FLD -15 * SIZE + BOFFSET(B) + + fmul %st(1), %st + ADD2 %st, %st(5) + FLD -14 * SIZE + BOFFSET(B) + + fmul %st(1), %st + ADD1 %st, %st(6) + FMUL -13 * SIZE + BOFFSET(B) + + ADD2 %st, %st(6) + FLD -15 * SIZE + AOFFSET(A) + FLD -15 * SIZE + BOFFSET(B) + + fmul %st(1), %st + ADD3 %st, %st(4) + FLD -16 * SIZE + BOFFSET(B) + + fmul %st(1), %st + ADD4 %st, %st(5) + FLD -13 * SIZE + BOFFSET(B) + + fmul %st(1), %st + ADD3 %st, %st(6) + FMUL -14 * SIZE + BOFFSET(B) + + ADD4 %st, %st(6) + FLD -14 * SIZE + AOFFSET(A) + FLD -12 * SIZE + BOFFSET(B) + + addl $2 * SIZE,A + addl $4 * SIZE,B + + decl %eax + jne .L17 + ALIGN_4 + +.L19: + ffreep %st(0) + ffreep %st(0) + ffreep %st(0) + ffreep %st(0) + + FLD ALPHA_R + fmul %st(1), %st + FLD ALPHA_I + fmul %st(3), %st + fsubrp %st, %st(1) + fxch %st(2) + FMUL ALPHA_R + fxch %st(1) + FMUL ALPHA_I + faddp %st, %st(1) + +#ifndef TRMMKERNEL + FADD 1 * SIZE(%edi) + FST 1 * SIZE(%edi) + FADD 0 * SIZE(%edi) + FST 0 * SIZE(%edi) +#else + FST 1 * SIZE(%edi) + FST 0 * SIZE(%edi) +#endif + + FLD ALPHA_R + fmul %st(1), %st + FLD ALPHA_I + fmul %st(3), %st + fsubrp %st, %st(1) + fxch %st(2) + FMUL ALPHA_R + fxch %st(1) + FMUL ALPHA_I + faddp %st, %st(1) + +#ifndef TRMMKERNEL + FADD 1 * SIZE(%edi,LDC) + FST 1 * SIZE(%edi,LDC) + FADD 0 * SIZE(%edi,LDC) + FST 0 * SIZE(%edi,LDC) +#else + FST 1 * SIZE(%edi,LDC) + FST 0 * SIZE(%edi,LDC) +#endif + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (A, %eax, 2), A + leal (B, %eax, 4), B +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + + addl $2 * SIZE, %edi + decl I + jne .L14 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leal (, LDC, 2), %eax + addl %eax, C + movl B, STACK_B + decl J + jne .L11 + ALIGN_4 + +.L20: + movl N, %eax + andl $1, %eax + je .L999 + ALIGN_3 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl STACK_A, A + movl STACK_B, B + movl C, %edi + + movl M, %eax + movl %eax, I + ALIGN_3 + +.L24: + movl STACK_B, B +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (A, %eax, 2), A + leal (B, %eax, 2), B +#endif + + fldz + fldz + fldz + fldz + + FLD -16 * SIZE + AOFFSET(A) + FLD -16 * SIZE + BOFFSET(B) + + prefetchw 1 * SIZE(%edi) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $2, %eax + je .L26 + ALIGN_3 + +.L25: + fmul %st(1), %st + PADDING + ADD1 %st, %st(2) + FMUL -15 * SIZE + BOFFSET(B) + + ADD2 %st, %st(2) + FLD -15 * SIZE + AOFFSET(A) + FLD -16 * SIZE + BOFFSET(B) + + fmul %st(1), %st + PADDING + ADD4 %st, %st(4) + FMUL -15 * SIZE + BOFFSET(B) + + ADD3 %st, %st(4) + FLD -14 * SIZE + AOFFSET(A) + FLD -14 * SIZE + BOFFSET(B) + + fmul %st(1), %st + PADDING + ADD1 %st, %st(2) + FMUL -13 * SIZE + BOFFSET(B) + + ADD2 %st, %st(2) + FLD -13 * SIZE + AOFFSET(A) + FLD -14 * SIZE + BOFFSET(B) + + fmul %st(1), %st + PADDING + ADD4 %st, %st(4) + FMUL -13 * SIZE + BOFFSET(B) + + ADD3 %st, %st(4) 
+ FLD -12 * SIZE + AOFFSET(A) + FLD -12 * SIZE + BOFFSET(B) + + fmul %st(1), %st + PADDING + ADD1 %st, %st(2) + FMUL -11 * SIZE + BOFFSET(B) + + ADD2 %st, %st(2) + FLD -11 * SIZE + AOFFSET(A) + FLD -12 * SIZE + BOFFSET(B) + + fmul %st(1), %st + PADDING + ADD4 %st, %st(4) + FMUL -11 * SIZE + BOFFSET(B) + + ADD3 %st, %st(4) + FLD -10 * SIZE + AOFFSET(A) + FLD -10 * SIZE + BOFFSET(B) + + fmul %st(1), %st + PADDING + ADD1 %st, %st(2) + FMUL -9 * SIZE + BOFFSET(B) + + ADD2 %st, %st(2) + FLD -9 * SIZE + AOFFSET(A) + FLD -10 * SIZE + BOFFSET(B) + + fmul %st(1), %st + PADDING + ADD4 %st, %st(4) + FMUL -9 * SIZE + BOFFSET(B) + + ADD3 %st, %st(4) + FLD -8 * SIZE + AOFFSET(A) + FLD -8 * SIZE + BOFFSET(B) + + addl $8 * SIZE,A + addl $8 * SIZE,B + + decl %eax + jne .L25 + ALIGN_4 + +.L26: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + and $3, %eax + je .L29 + ALIGN_4 + +.L27: + fmul %st(1), %st + PADDING + ADD1 %st, %st(2) + FMUL -15 * SIZE + BOFFSET(B) + + ADD2 %st, %st(2) + FLD -15 * SIZE + AOFFSET(A) + FLD -16 * SIZE + BOFFSET(B) + + fmul %st(1), %st + PADDING + ADD4 %st, %st(4) + FMUL -15 * SIZE + BOFFSET(B) + + ADD3 %st, %st(4) + FLD -14 * SIZE + AOFFSET(A) + FLD -14 * SIZE + BOFFSET(B) + + addl $2 * SIZE,A + addl $2 * SIZE,B + + decl %eax + jne .L27 + ALIGN_4 + +.L29: + ffreep %st(0) + ffreep %st(0) + + faddp %st, %st(3) + faddp %st, %st(1) + + fxch %st(1) + + FLD ALPHA_R + fmul %st(1), %st + FLD ALPHA_I + fmul %st(3), %st + fsubrp %st, %st(1) + fxch %st(2) + FMUL ALPHA_R + fxch %st(1) + FMUL ALPHA_I + faddp %st, %st(1) + +#ifndef TRMMKERNEL + FADD 1 * SIZE(%edi) + FST 1 * SIZE(%edi) + FADD 0 * SIZE(%edi) + FST 0 * SIZE(%edi) +#else + FST 1 * SIZE(%edi) + FST 0 * SIZE(%edi) +#endif + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (A, %eax, 2), A + leal (B, %eax, 2), B +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + + addl $2 * SIZE, %edi + decl I + jne .L24 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $1, KK +#endif + + addl LDC, C + movl B, STACK_B + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm_kernel_1x2_3dnow.S b/kernel/x86/zgemm_kernel_1x2_3dnow.S new file mode 100644 index 0000000000..3699bb25dd --- /dev/null +++ b/kernel/x86/zgemm_kernel_1x2_3dnow.S @@ -0,0 +1,958 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define OLD_M 4 + STACK + ARGS(%esi) +#define OLD_N 8 + STACK + ARGS(%esi) +#define OLD_K 12 + STACK + ARGS(%esi) +#define OLD_ALPHA_R 16 + STACK + ARGS(%esi) +#define OLD_ALPHA_I 20 + STACK + ARGS(%esi) +#define OLD_A 24 + STACK + ARGS(%esi) +#define OLD_B 28 + STACK + ARGS(%esi) +#define OLD_C 32 + STACK + ARGS(%esi) +#define OLD_LDC 36 + STACK + ARGS(%esi) +#define OLD_OFFSET 40 + STACK + ARGS(%esi) + +#define GAMMA_R 0(%esp) +#define GAMMA_I 8(%esp) +#define ALPHA 16(%esp) +#define K 24(%esp) +#define N 28(%esp) +#define M 32(%esp) +#define A 36(%esp) +#define C 40(%esp) +#define J 44(%esp) +#define OLD_STACK 48(%esp) +#define OFFSET 52(%esp) +#define KK 56(%esp) +#define KKK 60(%esp) +#define BUFFER 128(%esp) + +#define AA %edx +#define BB %ecx + +#define PREFETCHSIZE (16 * 2 + 6) + +#define AOFFSET -32 +#define BOFFSET 128 + +/* + + A hint of scheduling is received from following URL + +https://sourceforge.net/mailarchive/forum.php?forum_id=426&max_rows=25&style=flat&viewmonth=200309&viewday=11 + +*/ + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl %esp, %esi # save old stack + subl $128 + LOCAL_BUFFER_SIZE, %esp + movl OLD_M, %ebx + andl $-1024, %esp # align stack + + STACK_TOUCHING + + movl OLD_N, %eax + movl OLD_K, %ecx + movl OLD_A, %edx + + movl %ebx, M + movl %eax, N + movl %ecx, K + subl $AOFFSET * SIZE, %edx + movl %edx, A + movl %esi, OLD_STACK + + testl %ebx, %ebx + jle .L999 + + movl OLD_B, %edi + movl OLD_C, %ebx + + EMMS + + movd OLD_ALPHA_R, %mm0 + movd OLD_ALPHA_I, %mm1 + + movd %mm0, 0 + ALPHA + movd %mm1, 4 + ALPHA + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + movl $0x3f800000, 0 + GAMMA_R + movl $0x3f800000, 4 + GAMMA_R + movl $0xbf800000, 0 + GAMMA_I + movl $0x3f800000, 4 + GAMMA_I +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + movl $0x3f800000, 0 + GAMMA_R + movl $0x3f800000, 4 + GAMMA_R + movl $0x3f800000, 0 + GAMMA_I + movl $0xbf800000, 4 + GAMMA_I +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + movl $0x3f800000, 0 + GAMMA_R + movl $0xbF800000, 4 + GAMMA_R + movl $0x3f800000, 0 + GAMMA_I + movl $0x3F800000, 4 + GAMMA_I +#else + movl $0x3f800000, 0 + GAMMA_R + movl $0xbf800000, 4 + GAMMA_R + movl $0xbf800000, 0 + GAMMA_I + movl $0xbf800000, 4 + GAMMA_I +#endif + movl %ebx, C + movl OLD_LDC, %ebp + leal (, %ebp, SIZE * 2), %ebp + +#ifdef TRMMKERNEL + movl OLD_OFFSET, %eax + movl %eax, OFFSET +#ifndef LEFT + negl %eax + movl %eax, KK +#endif +#endif + + movl N, %eax + sarl $1, 
%eax + movl %eax, J # j = n + jle .L20 + ALIGN_4 + +.L01: +/* Copying to Sub Buffer */ + leal BUFFER, BB + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + sarl $2, %eax + jle .L03 + ALIGN_4 + +.L02: + movd 0 * SIZE(%edi), %mm0 + movd 1 * SIZE(%edi), %mm1 + movd 2 * SIZE(%edi), %mm2 + movd 3 * SIZE(%edi), %mm3 + movd 4 * SIZE(%edi), %mm4 + movd 5 * SIZE(%edi), %mm5 + movd 6 * SIZE(%edi), %mm6 + movd 7 * SIZE(%edi), %mm7 + + prefetchnta 72 * SIZE(%edi) + + punpckldq %mm0, %mm0 + punpckldq %mm1, %mm1 + punpckldq %mm2, %mm2 + punpckldq %mm3, %mm3 + punpckldq %mm4, %mm4 + punpckldq %mm5, %mm5 + punpckldq %mm6, %mm6 + punpckldq %mm7, %mm7 + + movq %mm0, 0 * SIZE(BB) + movq %mm1, 2 * SIZE(BB) + movq %mm2, 4 * SIZE(BB) + movq %mm3, 6 * SIZE(BB) + movq %mm4, 8 * SIZE(BB) + movq %mm5, 10 * SIZE(BB) + movq %mm6, 12 * SIZE(BB) + movq %mm7, 14 * SIZE(BB) + + movd 8 * SIZE(%edi), %mm0 + movd 9 * SIZE(%edi), %mm1 + movd 10 * SIZE(%edi), %mm2 + movd 11 * SIZE(%edi), %mm3 + movd 12 * SIZE(%edi), %mm4 + movd 13 * SIZE(%edi), %mm5 + movd 14 * SIZE(%edi), %mm6 + movd 15 * SIZE(%edi), %mm7 + + punpckldq %mm0, %mm0 + punpckldq %mm1, %mm1 + punpckldq %mm2, %mm2 + punpckldq %mm3, %mm3 + punpckldq %mm4, %mm4 + punpckldq %mm5, %mm5 + punpckldq %mm6, %mm6 + punpckldq %mm7, %mm7 + + movq %mm0, 16 * SIZE(BB) + movq %mm1, 18 * SIZE(BB) + movq %mm2, 20 * SIZE(BB) + movq %mm3, 22 * SIZE(BB) + movq %mm4, 24 * SIZE(BB) + movq %mm5, 26 * SIZE(BB) + movq %mm6, 28 * SIZE(BB) + movq %mm7, 30 * SIZE(BB) + + addl $16 * SIZE, %edi + addl $32 * SIZE, BB + decl %eax + jne .L02 + ALIGN_4 + +.L03: + movl K, %eax + andl $3, %eax + BRANCH + jle .L10 + ALIGN_4 + +.L04: + movd 0 * SIZE(%edi), %mm0 + movd 1 * SIZE(%edi), %mm1 + movd 2 * SIZE(%edi), %mm2 + movd 3 * SIZE(%edi), %mm3 + + punpckldq %mm0, %mm0 + punpckldq %mm1, %mm1 + punpckldq %mm2, %mm2 + punpckldq %mm3, %mm3 + + movq %mm0, 0 * SIZE(BB) + movq %mm1, 2 * SIZE(BB) + movq %mm2, 4 * SIZE(BB) + movq %mm3, 6 * SIZE(BB) + + addl $4 * SIZE, %edi + addl $8 * SIZE, BB + decl %eax + jne .L04 + ALIGN_4 + +.L10: + movl C, %esi # coffset = c + movl A, AA # aoffset = a + movl M, %ebx + ALIGN_4 + +.L11: + leal - BOFFSET * SIZE + BUFFER, BB + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 8), BB +#endif + + movq ( 0 + AOFFSET) * SIZE(AA), %mm0 + pxor %mm4, %mm4 + movq ( 16 + AOFFSET) * SIZE(AA), %mm1 + pxor %mm5, %mm5 + PADDING movq ( 0 + BOFFSET) * SIZE(BB), %mm2 + pxor %mm6, %mm6 + PADDING movq ( 16 + BOFFSET) * SIZE(BB), %mm3 + pxor %mm7, %mm7 + + prefetchw 2 * SIZE(%esi) + prefetchw 2 * SIZE(%esi, %ebp) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $4, %eax + je .L15 + ALIGN_4 + +.L12: + pfmul %mm0, %mm2 + pfadd %mm2, %mm4 + PADDING movq ( 2 + BOFFSET) * SIZE(BB), %mm2 + + pfmul %mm0, %mm2 + pfadd %mm2, %mm5 + PADDING movq ( 4 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm0, %mm2 + pfadd %mm2, %mm6 + PADDING prefetch (PREFETCHSIZE + 0) * SIZE(AA) + + PADDING movq ( 8 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 6 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 2 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, 
%mm2 + pfadd %mm2, %mm4 + PADDING movq ( 10 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm0, %mm2 + pfadd %mm2, %mm5 + PADDING movq ( 12 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm0, %mm2 + pfadd %mm2, %mm6 + PADDING movq ( 32 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 14 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 4 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm3 + pfadd %mm3, %mm4 + PADDING movq ( 18 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm0, %mm3 + pfadd %mm3, %mm5 + PADDING movq ( 20 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm0, %mm3 + pfadd %mm3, %mm6 + PADDING movq ( 24 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 22 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 6 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm3 + pfadd %mm3, %mm4 + PADDING movq ( 26 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm0, %mm3 + pfadd %mm3, %mm5 + PADDING movq ( 28 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm0, %mm3 + pfadd %mm3, %mm6 + PADDING movq ( 48 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 30 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 8 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm2 + pfadd %mm2, %mm4 + PADDING movq ( 34 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm0, %mm2 + pfadd %mm2, %mm5 + PADDING movq ( 36 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm0, %mm2 + pfadd %mm2, %mm6 + PADDING movq ( 40 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 38 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 10 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm2 + pfadd %mm2, %mm4 + PADDING movq ( 42 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm0, %mm2 + pfadd %mm2, %mm5 + PADDING movq ( 44 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm0, %mm2 + pfadd %mm2, %mm6 + PADDING movq ( 64 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 46 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 12 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm3 + pfadd %mm3, %mm4 + PADDING movq ( 50 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm0, %mm3 + pfadd %mm3, %mm5 + PADDING movq ( 52 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm0, %mm3 + pfadd %mm3, %mm6 + PADDING movq ( 56 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 54 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 14 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm3 + pfadd %mm3, %mm4 + PADDING movq ( 58 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm0, %mm3 + pfadd %mm3, %mm5 + PADDING movq ( 60 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm0, %mm3 + pfadd %mm3, %mm6 + PADDING movq ( 80 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 62 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 32 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm1, %mm2 + pfadd %mm2, %mm4 + PADDING movq ( 66 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm1, %mm2 + pfadd %mm2, %mm5 + PADDING movq ( 68 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm1, %mm2 + pfadd %mm2, %mm6 + PADDING movq ( 72 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 70 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movq ( 18 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm2 + pfadd %mm2, %mm4 + PADDING movq ( 74 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm1, %mm2 + pfadd %mm2, %mm5 + PADDING movq ( 76 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm1, %mm2 + pfadd %mm2, %mm6 + PADDING movq ( 96 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 78 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movq ( 20 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm3 + pfadd %mm3, %mm4 + PADDING movq ( 82 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm1, %mm3 + pfadd %mm3, %mm5 + PADDING movq ( 84 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm1, %mm3 + pfadd %mm3, %mm6 + PADDING movq ( 88 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 86 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movq ( 22 + 
AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm3 + pfadd %mm3, %mm4 + PADDING movq ( 90 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm1, %mm3 + pfadd %mm3, %mm5 + PADDING movq ( 92 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm1, %mm3 + pfadd %mm3, %mm6 + PADDING movq (112 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 94 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movq ( 24 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm2 + pfadd %mm2, %mm4 + PADDING movq ( 98 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm1, %mm2 + pfadd %mm2, %mm5 + PADDING movq (100 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm1, %mm2 + pfadd %mm2, %mm6 + PADDING movq (104 + BOFFSET) * SIZE(BB), %mm2 + pfmul (102 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movq ( 26 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm2 + pfadd %mm2, %mm4 + PADDING movq (106 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm1, %mm2 + pfadd %mm2, %mm5 + PADDING movq (108 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm1, %mm2 + pfadd %mm2, %mm6 + PADDING movq (128 + BOFFSET) * SIZE(BB), %mm2 + pfmul (110 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movq ( 28 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm3 + pfadd %mm3, %mm4 + PADDING movq (114 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm1, %mm3 + pfadd %mm3, %mm5 + PADDING movq (116 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm1, %mm3 + pfadd %mm3, %mm6 + PADDING movq (120 + BOFFSET) * SIZE(BB), %mm3 + pfmul (118 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movq ( 30 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm3 + pfadd %mm3, %mm4 + PADDING movq (122 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm1, %mm3 + pfadd %mm3, %mm5 + PADDING movq (124 + BOFFSET) * SIZE(BB), %mm3 + pfmul %mm1, %mm3 + pfadd %mm3, %mm6 + PADDING movq (144 + BOFFSET) * SIZE(BB), %mm3 + pfmul (126 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movq ( 48 + AOFFSET) * SIZE(AA), %mm1 + + subl $-32 * SIZE, AA + addl $128 * SIZE, BB + decl %eax + jne .L12 + ALIGN_3 + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $15, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + pfmul %mm0, %mm2 + pfadd %mm2, %mm4 + PADDING movq ( 2 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm0, %mm2 + pfadd %mm2, %mm5 + PADDING movq ( 4 + BOFFSET) * SIZE(BB), %mm2 + pfmul %mm0, %mm2 + pfadd %mm2, %mm6 + PADDING movq ( 8 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 6 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 2 + AOFFSET) * SIZE(AA), %mm0 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + movq GAMMA_R, %mm0 + movq GAMMA_I, %mm1 + movq ALPHA, %mm2 + + pswapd %mm5, %mm5 + pswapd %mm7, %mm7 + + pfmul %mm0, %mm4 + pfmul %mm1, %mm5 + pfmul %mm0, %mm6 + pfmul %mm1, %mm7 + + pfadd %mm5, %mm4 + pfadd %mm7, %mm6 + + pswapd %mm4, %mm5 + pswapd %mm6, %mm7 + pfmul %mm2, %mm4 + pfmul %mm2, %mm6 + pfmul %mm2, %mm5 + pfmul %mm2, %mm7 + + pfpnacc %mm5, %mm4 + pfpnacc %mm7, %mm6 + +#ifndef TRMMKERNEL + pfadd (%esi), %mm4 + pfadd (%esi, %ebp), %mm6 +#endif + movq %mm4, (%esi) + movq %mm6, (%esi, %ebp) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 8), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + + addl $2 * SIZE, %esi + decl %ebx + jg .L11 + ALIGN_4 + +.L19: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leal (, %ebp, 2), %eax + addl %eax, C # c += ldc + decl J # j -- + jg .L01 + ALIGN_4 + +.L20: + movl N, %eax 
+ andl $1, %eax + jle .L999 + ALIGN_4 + +.L21: +/* Copying to Sub Buffer */ + movl K, %eax + leal BUFFER, BB + sarl $2, %eax + jle .L25 + ALIGN_4 + +.L22: + movd 0 * SIZE(%edi), %mm0 + movd 1 * SIZE(%edi), %mm1 + movd 2 * SIZE(%edi), %mm2 + movd 3 * SIZE(%edi), %mm3 + movd 4 * SIZE(%edi), %mm4 + movd 5 * SIZE(%edi), %mm5 + movd 6 * SIZE(%edi), %mm6 + movd 7 * SIZE(%edi), %mm7 + + prefetchnta 72 * SIZE(%edi) + + punpckldq %mm0, %mm0 + punpckldq %mm1, %mm1 + punpckldq %mm2, %mm2 + punpckldq %mm3, %mm3 + punpckldq %mm4, %mm4 + punpckldq %mm5, %mm5 + punpckldq %mm6, %mm6 + punpckldq %mm7, %mm7 + + movq %mm0, 0 * SIZE(BB) + movq %mm1, 2 * SIZE(BB) + movq %mm2, 4 * SIZE(BB) + movq %mm3, 6 * SIZE(BB) + movq %mm4, 8 * SIZE(BB) + movq %mm5, 10 * SIZE(BB) + movq %mm6, 12 * SIZE(BB) + movq %mm7, 14 * SIZE(BB) + + addl $ 8 * SIZE, %edi + addl $16 * SIZE, BB + decl %eax + jne .L22 + ALIGN_4 + +.L25: + movl K, %eax + andl $3, %eax + BRANCH + jle .L30 + ALIGN_4 + +.L26: + movd 0 * SIZE(%edi), %mm0 + movd 1 * SIZE(%edi), %mm1 + + movd %mm0, 0 * SIZE(BB) + movd %mm0, 1 * SIZE(BB) + movd %mm1, 2 * SIZE(BB) + movd %mm1, 3 * SIZE(BB) + + addl $2 * SIZE, %edi + addl $4 * SIZE, BB + decl %eax + jne .L26 + ALIGN_4 + +.L30: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl C, %esi # coffset = c + movl A, AA # aoffset = a + movl M, %ebx + ALIGN_3 + +.L31: + leal - BOFFSET * SIZE + BUFFER, BB + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + + movq ( 0 + AOFFSET) * SIZE(AA), %mm0 + pxor %mm4, %mm4 + movq ( 16 + AOFFSET) * SIZE(AA), %mm1 + pxor %mm5, %mm5 + PADDING movq ( 0 + BOFFSET) * SIZE(BB), %mm2 + pxor %mm6, %mm6 + PADDING movq ( 16 + BOFFSET) * SIZE(BB), %mm3 + pxor %mm7, %mm7 + + prefetchw 2 * SIZE(%esi) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $4, %eax + je .L35 + ALIGN_4 + +.L32: + pfmul %mm0, %mm2 + pfadd %mm2, %mm4 + PADDING movq ( 4 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 2 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm5 + movq ( 2 + AOFFSET) * SIZE(AA), %mm0 + + PADDING prefetch (PREFETCHSIZE + 0) * SIZE(AA) + + pfmul %mm0, %mm2 + pfadd %mm2, %mm6 + PADDING movq ( 8 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 6 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 4 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm2 + pfadd %mm2, %mm4 + PADDING movq ( 12 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 10 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm5 + movq ( 6 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm2 + pfadd %mm2, %mm6 + PADDING movq ( 32 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 14 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 8 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm3 + pfadd %mm3, %mm4 + PADDING movq ( 20 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 18 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm5 + movq ( 10 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm3 + pfadd %mm3, %mm6 + PADDING movq ( 24 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 22 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 12 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm3 + pfadd %mm3, %mm4 + PADDING movq ( 28 + BOFFSET) * SIZE(BB), 
%mm3 + pfmul ( 26 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm5 + movq ( 14 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm0, %mm3 + pfadd %mm3, %mm6 + PADDING movq ( 48 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 30 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm7 + movq ( 32 + AOFFSET) * SIZE(AA), %mm0 + + pfmul %mm1, %mm2 + pfadd %mm2, %mm4 + PADDING movq ( 36 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 34 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm5 + movq ( 18 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm2 + pfadd %mm2, %mm6 + PADDING movq ( 40 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 38 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movq ( 20 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm2 + pfadd %mm2, %mm4 + PADDING movq ( 44 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 42 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm5 + movq ( 22 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm2 + pfadd %mm2, %mm6 + PADDING movq ( 64 + BOFFSET) * SIZE(BB), %mm2 + pfmul ( 46 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movq ( 24 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm3 + pfadd %mm3, %mm4 + PADDING movq ( 52 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 50 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm5 + movq ( 26 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm3 + pfadd %mm3, %mm6 + PADDING movq ( 56 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 54 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movq ( 28 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm3 + pfadd %mm3, %mm4 + PADDING movq ( 60 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 58 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm5 + movq ( 30 + AOFFSET) * SIZE(AA), %mm1 + + pfmul %mm1, %mm3 + pfadd %mm3, %mm6 + PADDING movq ( 80 + BOFFSET) * SIZE(BB), %mm3 + pfmul ( 62 + BOFFSET) * SIZE(BB), %mm1 + pfadd %mm1, %mm7 + movq ( 48 + AOFFSET) * SIZE(AA), %mm1 + + subl $-32 * SIZE, AA + addl $ 64 * SIZE, BB + decl %eax + jne .L32 + ALIGN_3 + +.L35: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $15, %eax # if (k & 1) + BRANCH + je .L38 + ALIGN_3 + +.L36: + pfmul %mm0, %mm2 + pfadd %mm2, %mm4 + PADDING movq ( 4 + BOFFSET) * SIZE(BB), %mm2 + + pfmul ( 2 + BOFFSET) * SIZE(BB), %mm0 + pfadd %mm0, %mm5 + movq ( 2 + AOFFSET) * SIZE(AA), %mm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L36 + ALIGN_4 + +.L38: + pfadd %mm6, %mm4 + pfadd %mm7, %mm5 + + movq ALPHA, %mm2 + pswapd %mm5, %mm5 + + pfmul GAMMA_R, %mm4 + pfmul GAMMA_I, %mm5 + + pfadd %mm5, %mm4 + + pswapd %mm4, %mm5 + pfmul %mm2, %mm4 + pfmul %mm2, %mm5 + pfpnacc %mm5, %mm4 + +#ifndef TRMMKERNEL + pfadd 0 * SIZE(%esi), %mm4 +#endif + movq %mm4, 0 * SIZE(%esi) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + + addl $2 * SIZE, %esi # coffset += 4 + decl %ebx # i -- + jg .L31 + ALIGN_4 + +.L999: + EMMS + + movl OLD_STACK, %esp + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm_kernel_1x2_barcelona.S b/kernel/x86/zgemm_kernel_1x2_barcelona.S new file mode 100644 index 0000000000..f71b095ade --- /dev/null +++ b/kernel/x86/zgemm_kernel_1x2_barcelona.S @@ -0,0 +1,728 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 24 + STACK + ARGS(%esp) +#define A 32 + STACK + ARGS(%esp) +#define OLD_B 36 + STACK + ARGS(%esp) +#define C 40 + STACK + ARGS(%esp) +#define OLD_LDC 44 + STACK + ARGS(%esp) +#define OFFSET 48 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define BX 4 + STACK(%esp) +#define KK 8 + STACK(%esp) +#define KKK 12 + STACK(%esp) + +#define B %edi +#define LDC %ebp +#define AO %edx +#define BO %ecx +#define CO %esi +#define I %ebx + +#define movsd movlps +#define movapd movups +#define movlpd movlps +#define movhpd movhps + +#define PREFETCH prefetch +#define PREFETCHSIZE (8 * 7 + 0) + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define ADD1 addpd +#define ADD2 addpd +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define ADD1 addpd +#define ADD2 subpd +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define ADD1 subpd +#define ADD2 addpd +#else +#define ADD1 subpd +#define ADD2 subpd +#endif + +#define KERNEL1(address) \ + mulpd %xmm0, %xmm1; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO, %eax, 2); \ + mulpd -14 * SIZE(BO, %eax, 4), %xmm0; \ + ADD1 %xmm1, %xmm4; \ + movapd -12 * SIZE(BO, %eax, 4), %xmm1; \ + ADD1 %xmm0, %xmm6; \ + movddup -15 * SIZE(AO, %eax, 2), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd -14 * SIZE(BO, %eax, 4), %xmm0; \ + ADD2 %xmm0, %xmm7; \ + movddup -14 * SIZE(AO, %eax, 2), %xmm0 + +#define KERNEL2(address) \ + ADD2 %xmm2, %xmm5; \ + movapd %xmm1, %xmm2; \ + mulpd %xmm0, %xmm1; \ + mulpd -10 * SIZE(BO, %eax, 4), %xmm0; \ + ADD1 %xmm1, %xmm4; \ + 
movapd -8 * SIZE(BO, %eax, 4), %xmm1; \ + ADD1 %xmm0, %xmm6; \ + movddup -13 * SIZE(AO, %eax, 2), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd -10 * SIZE(BO, %eax, 4), %xmm0; \ + ADD2 %xmm0, %xmm7; \ + movddup -12 * SIZE(AO, %eax, 2), %xmm0 + +#define KERNEL3(address) \ + ADD2 %xmm2, %xmm5; \ + movapd %xmm1, %xmm2; \ + mulpd %xmm0, %xmm1; \ + mulpd -6 * SIZE(BO, %eax, 4), %xmm0; \ + ADD1 %xmm1, %xmm4; \ + movapd -4 * SIZE(BO, %eax, 4), %xmm1; \ + ADD1 %xmm0, %xmm6; \ + movddup -11 * SIZE(AO, %eax, 2), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd -6 * SIZE(BO, %eax, 4), %xmm0; \ + ADD2 %xmm0, %xmm7; \ + movddup -10 * SIZE(AO, %eax, 2), %xmm0 + +#define KERNEL4(address) \ + ADD2 %xmm2, %xmm5; \ + movapd %xmm1, %xmm2; \ + mulpd %xmm0, %xmm1; \ + mulpd -2 * SIZE(BO, %eax, 4), %xmm0; \ + ADD1 %xmm1, %xmm4; \ + movapd (BO, %eax, 4), %xmm1; \ + ADD1 %xmm0, %xmm6; \ + movddup -9 * SIZE(AO, %eax, 2), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd -2 * SIZE(BO, %eax, 4), %xmm0; \ + ADD2 %xmm0, %xmm7; \ + movddup (AO, %eax, 2), %xmm0 + +#define KERNEL5(address) \ + ADD2 %xmm2, %xmm5; \ + movapd %xmm1, %xmm2; \ + mulpd %xmm3, %xmm1; \ + mulpd 2 * SIZE(BO, %eax, 4), %xmm3; \ + ADD1 %xmm1, %xmm4; \ + movapd 4 * SIZE(BO, %eax, 4), %xmm1; \ + ADD1 %xmm3, %xmm6; \ + movddup -7 * SIZE(AO, %eax, 2), %xmm3; \ + mulpd %xmm3, %xmm2; \ + mulpd 2 * SIZE(BO, %eax, 4), %xmm3; \ + ADD2 %xmm3, %xmm7; \ + movddup -6 * SIZE(AO, %eax, 2), %xmm3 + +#define KERNEL6(address) \ + ADD2 %xmm2, %xmm5; \ + movapd %xmm1, %xmm2; \ + mulpd %xmm3, %xmm1; \ + mulpd 6 * SIZE(BO, %eax, 4), %xmm3; \ + ADD1 %xmm1, %xmm4; \ + movapd 8 * SIZE(BO, %eax, 4), %xmm1; \ + ADD1 %xmm3, %xmm6; \ + movddup -5 * SIZE(AO, %eax, 2), %xmm3; \ + mulpd %xmm3, %xmm2; \ + mulpd 6 * SIZE(BO, %eax, 4), %xmm3; \ + ADD2 %xmm3, %xmm7; \ + movddup -4 * SIZE(AO, %eax, 2), %xmm3 + +#define KERNEL7(address) \ + ADD2 %xmm2, %xmm5; \ + movapd %xmm1, %xmm2; \ + mulpd %xmm3, %xmm1; \ + mulpd 10 * SIZE(BO, %eax, 4), %xmm3; \ + ADD1 %xmm1, %xmm4; \ + movapd 12 * SIZE(BO, %eax, 4), %xmm1; \ + ADD1 %xmm3, %xmm6; \ + movddup -3 * SIZE(AO, %eax, 2), %xmm3; \ + mulpd %xmm3, %xmm2; \ + mulpd 10 * SIZE(BO, %eax, 4), %xmm3; \ + ADD2 %xmm3, %xmm7; \ + movddup -2 * SIZE(AO, %eax, 2), %xmm3 + +#define KERNEL8(address) \ + ADD2 %xmm2, %xmm5; \ + movapd %xmm1, %xmm2; \ + mulpd %xmm3, %xmm1; \ + mulpd 14 * SIZE(BO, %eax, 4), %xmm3; \ + ADD1 %xmm1, %xmm4; \ + movapd 16 * SIZE(BO, %eax, 4), %xmm1; \ + ADD1 %xmm3, %xmm6; \ + movddup -1 * SIZE(AO, %eax, 2), %xmm3; \ + mulpd %xmm3, %xmm2; \ + mulpd 14 * SIZE(BO, %eax, 4), %xmm3; \ + ADD2 %xmm3, %xmm7; \ + movddup 8 * SIZE(AO, %eax, 2), %xmm3; \ + ADD2 %xmm2, %xmm5; \ + movapd %xmm1, %xmm2 + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl OLD_B, B + movl OLD_LDC, LDC + +#ifdef TRMMKERNEL + movl OFFSET, %eax + +#ifndef LEFT + negl %eax +#endif + + movl %eax, KK +#endif + + subl $-16 * SIZE, A + subl $-16 * SIZE, B + + sall $ZBASE_SHIFT, LDC + + movl N, %eax + sarl $1, %eax + movl %eax, J # j = n + jle .L100 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + leal GEMM_DEFAULT_Q * GEMM_DEFAULT_UNROLL_N * SIZE(B), %eax + movl %eax, BX + + movl C, CO + movl A, AO + movl M, I + testl I, I + jle .L100 + ALIGN_4 + +.L10: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BO +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + 
leal (AO, %eax, 2), AO + leal (B, %eax, 4), BO +#endif + + movl BX, %eax + + prefetcht2 0 * SIZE(%eax) + + subl $-8 * SIZE, BX + + movddup -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm1 + pxor %xmm4, %xmm4 + movddup -8 * SIZE(AO), %xmm3 + pxor %xmm5, %xmm5 + + prefetchw 1 * SIZE(CO) + pxor %xmm6, %xmm6 + prefetchw 1 * SIZE(CO, LDC) + pxor %xmm7, %xmm7 + movapd %xmm1, %xmm2 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + + andl $-8, %eax + + leal (, %eax, SIZE), %eax + leal (AO, %eax, 2), AO + leal (BO, %eax, 4), BO + negl %eax + NOBRANCH + je .L15 + ALIGN_3 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + addl $8 * SIZE, %eax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + addl $8 * SIZE, %eax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + addl $8 * SIZE, %eax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + addl $8 * SIZE, %eax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + addl $8 * SIZE, %eax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + addl $8 * SIZE, %eax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + addl $8 * SIZE, %eax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + addl $8 * SIZE, %eax + BRANCH + jl .L12 + ALIGN_3 + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + + leal (, %eax, SIZE), %eax + leal (AO, %eax, 2), AO + leal (BO, %eax, 4), BO + negl %eax + ALIGN_4 + +.L16: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BO, %eax, 4), %xmm0 + ADD1 %xmm1, %xmm4 + movapd -12 * SIZE(BO, %eax, 4), %xmm1 + ADD1 %xmm0, %xmm6 + movddup -15 * SIZE(AO, %eax, 2), %xmm0 + mulpd %xmm0, %xmm2 + mulpd -14 * SIZE(BO, %eax, 4), %xmm0 + ADD2 %xmm0, %xmm7 + movddup -14 * SIZE(AO, %eax, 2), %xmm0 + ADD2 %xmm2, %xmm5 + movapd %xmm1, %xmm2 + + addl $SIZE, %eax + jl .L16 + ALIGN_4 + +.L14: +#ifndef TRMMKERNEL + movupd 0 * SIZE(CO), %xmm0 + movupd 0 * SIZE(CO, LDC), %xmm1 +#endif + + movddup ALPHA_R, %xmm2 + movddup ALPHA_I, %xmm3 + + SHUFPD_1 %xmm5, %xmm5 + SHUFPD_1 %xmm7, %xmm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RN) || defined(RT) || defined(CN) || defined(CT) + addsubpd %xmm5, %xmm4 + addsubpd %xmm7, %xmm6 + + pshufd $0x4e, %xmm4, %xmm5 + pshufd $0x4e, %xmm6, %xmm7 +#else + addsubpd %xmm4, %xmm5 + addsubpd %xmm6, %xmm7 + + movapd %xmm5, %xmm4 + pshufd $0x4e, %xmm5, %xmm5 + movapd %xmm7, %xmm6 + pshufd $0x4e, %xmm7, %xmm7 +#endif + + mulpd 
%xmm2, %xmm4 + mulpd %xmm3, %xmm5 + mulpd %xmm2, %xmm6 + mulpd %xmm3, %xmm7 + + addsubpd %xmm5, %xmm4 + addsubpd %xmm7, %xmm6 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm6 +#endif + + movlpd %xmm4, 0 * SIZE(CO) + movhpd %xmm4, 1 * SIZE(CO) + movlpd %xmm6, 0 * SIZE(CO, LDC) + movhpd %xmm6, 1 * SIZE(CO, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AO, %eax, 2), AO + leal (BO, %eax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + + addl $2 * SIZE, CO # coffset += 4 + decl I # i -- + jg .L10 + ALIGN_4 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + movl BO, B + + leal (, LDC, 2), %eax + addl %eax, C # c += ldc + decl J # j -- + jg .L01 + ALIGN_4 + +.L100: + movl N, %eax + andl $1, %eax + jle .L500 + ALIGN_4 + +.L101: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl C, CO + movl A, AO + + movl M, I + testl %ebx, I + jle .L500 + ALIGN_4 + +.L110: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BO +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AO, %eax, 2), AO + leal (B, %eax, 2), BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + pxor %xmm4, %xmm4 + movddup -15 * SIZE(AO), %xmm1 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + prefetchw 1 * SIZE(CO) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L112 + ALIGN_4 + +.L111: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + mulpd -16 * SIZE(BO), %xmm0 + ADD1 %xmm0, %xmm4 + movddup -14 * SIZE(AO), %xmm0 + mulpd -16 * SIZE(BO), %xmm1 + ADD2 %xmm1, %xmm5 + movddup -13 * SIZE(AO), %xmm1 + + mulpd -14 * SIZE(BO), %xmm0 + ADD1 %xmm0, %xmm6 + movddup -12 * SIZE(AO), %xmm0 + mulpd -14 * SIZE(BO), %xmm1 + ADD2 %xmm1, %xmm7 + movddup -11 * SIZE(AO), %xmm1 + + mulpd -12 * SIZE(BO), %xmm0 + ADD1 %xmm0, %xmm4 + movddup -10 * SIZE(AO), %xmm0 + mulpd -12 * SIZE(BO), %xmm1 + ADD2 %xmm1, %xmm5 + movddup -9 * SIZE(AO), %xmm1 + + mulpd -10 * SIZE(BO), %xmm0 + ADD1 %xmm0, %xmm6 + movddup -8 * SIZE(AO), %xmm0 + mulpd -10 * SIZE(BO), %xmm1 + ADD2 %xmm1, %xmm7 + movddup -7 * SIZE(AO), %xmm1 + + mulpd -8 * SIZE(BO), %xmm0 + ADD1 %xmm0, %xmm4 + movddup -6 * SIZE(AO), %xmm0 + mulpd -8 * SIZE(BO), %xmm1 + ADD2 %xmm1, %xmm5 + movddup -5 * SIZE(AO), %xmm1 + + mulpd -6 * SIZE(BO), %xmm0 + ADD1 %xmm0, %xmm6 + movddup -4 * SIZE(AO), %xmm0 + mulpd -6 * SIZE(BO), %xmm1 + ADD2 %xmm1, %xmm7 + movddup -3 * SIZE(AO), %xmm1 + + mulpd -4 * SIZE(BO), %xmm0 + ADD1 %xmm0, %xmm4 + movddup -2 * SIZE(AO), %xmm0 + mulpd -4 * SIZE(BO), %xmm1 + ADD2 %xmm1, %xmm5 + movddup -1 * SIZE(AO), %xmm1 + + mulpd -2 * SIZE(BO), %xmm0 + ADD1 %xmm0, %xmm6 + movddup 0 * SIZE(AO), %xmm0 + mulpd -2 * SIZE(BO), %xmm1 + ADD2 %xmm1, %xmm7 + movddup 1 * SIZE(AO), %xmm1 + + subl $-16 * SIZE, AO + subl $-16 * SIZE, BO + decl %eax + jne .L111 + ALIGN_4 + +.L112: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L114 + ALIGN_4 + +.L113: + mulpd -16 * SIZE(BO), %xmm0 + ADD1 %xmm0, %xmm4 + movddup 
-14 * SIZE(AO), %xmm0 + mulpd -16 * SIZE(BO), %xmm1 + ADD2 %xmm1, %xmm5 + movddup -13 * SIZE(AO), %xmm1 + + addl $2 * SIZE, AO + addl $2 * SIZE, BO + decl %eax + jg .L113 + ALIGN_4 + +.L114: +#ifndef TRMMKERNEL + movupd 0 * SIZE(CO), %xmm0 +#endif + + movddup ALPHA_R, %xmm2 + movddup ALPHA_I, %xmm3 + + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + SHUFPD_1 %xmm5, %xmm5 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RN) || defined(RT) || defined(CN) || defined(CT) + addsubpd %xmm5, %xmm4 + pshufd $0x4e, %xmm4, %xmm5 +#else + addsubpd %xmm4, %xmm5 + movapd %xmm5, %xmm4 + pshufd $0x4e, %xmm5, %xmm5 +#endif + + mulpd %xmm2, %xmm4 + mulpd %xmm3, %xmm5 + + addsubpd %xmm5, %xmm4 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm4 +#endif + + movlpd %xmm4, 0 * SIZE(CO) + movhpd %xmm4, 1 * SIZE(CO) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AO, %eax, 2), AO + leal (BO, %eax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + + addl $2 * SIZE, CO # coffset += 4 + decl I # i -- + jg .L110 + ALIGN_4 + +.L500: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + + ret + + EPILOGUE diff --git a/kernel/x86/zgemm_kernel_1x2_penryn.S b/kernel/x86/zgemm_kernel_1x2_penryn.S new file mode 100644 index 0000000000..849361956a --- /dev/null +++ b/kernel/x86/zgemm_kernel_1x2_penryn.S @@ -0,0 +1,701 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 24 + STACK + ARGS(%esp) +#define A 32 + STACK + ARGS(%esp) +#define ARG_B 36 + STACK + ARGS(%esp) +#define C 40 + STACK + ARGS(%esp) +#define ARG_LDC 44 + STACK + ARGS(%esp) +#define OFFSET 48 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define BX 4 + STACK(%esp) +#define KK 8 + STACK(%esp) +#define KKK 12 + STACK(%esp) + +#ifdef NANO +#define PREFETCHSIZE (8 * 3 + 4) +#define PREFETCHW prefetcht0 +#define PREFETCHB prefetcht0 +#endif + +#ifdef NEHALEM +#define PREFETCHSIZE (8 * 1 - 4) +#define PREFETCHW prefetcht0 +#define PREFETCHB prefetcht0 +#endif + +#ifndef PREFETCH +#define PREFETCH prefetcht0 +#endif + +#ifndef PREFETCHW +#define PREFETCHW prefetcht0 +#endif + +#ifndef PREFETCHB +#define PREFETCHB prefetcht0 +#endif + +#ifndef PREFETCHSIZE +#define PREFETCHSIZE (8 * 13 + 4) +#endif + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi +#define C1 %esi +#define I %ebx + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define ADD1 addpd +#define ADD2 addpd +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define ADD1 addpd +#define ADD2 addpd +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define ADD1 addpd +#define ADD2 addpd +#else +#define ADD1 addpd +#define ADD2 subpd +#endif + + PROLOGUE + + subl $ARGS, %esp # Generate Stack Frame + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + +#ifdef TRMMKERNEL + movl OFFSET, %eax +#ifndef LEFT + negl %eax +#endif + movl %eax, KK +#endif + + movl M, %ebx + testl %ebx, %ebx + jle .L999 + + subl $-16 * SIZE, A + subl $-16 * SIZE, B + + sall $ZBASE_SHIFT, LDC + + movl N, %eax + sarl $1, %eax + movl %eax, J + jle .L20 + ALIGN_2 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl B, BX + + movl C, C1 # coffset = c + movl A, AA # aoffset = a + movl M, %ebx + ALIGN_4 + +.L10: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + + movl BX, %eax + PREFETCHB -16 * SIZE(%eax) + subl $-8 * SIZE, %eax + movl %eax, BX + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -16 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + + xorps %xmm4, %xmm4 + PREFETCHW 1 * SIZE(C1) + xorps %xmm5, %xmm5 + PREFETCHW 3 * SIZE(C1, LDC) + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + ADD1 %xmm3, %xmm6 + movaps -14 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm1, %xmm4 + movaps -12 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 
+ + ADD1 %xmm3, %xmm6 + movaps -10 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm1, %xmm4 + movaps -8 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + ADD1 %xmm3, %xmm6 + movaps -6 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm1, %xmm4 + movaps -4 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + + ADD1 %xmm3, %xmm6 + movaps -2 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm1, %xmm4 + movaps 0 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + ADD1 %xmm3, %xmm6 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) + movaps 2 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm1, %xmm4 + movaps 4 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -6 * SIZE(AA), %xmm0 + + ADD1 %xmm3, %xmm6 + movaps 6 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm1, %xmm4 + movaps 8 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + ADD1 %xmm3, %xmm6 + movaps 10 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm1, %xmm4 + movaps 12 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -2 * SIZE(AA), %xmm0 + + ADD1 %xmm3, %xmm6 + movaps 14 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm1, %xmm4 + movaps 16 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + subl $-32 * SIZE, BB + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + + subl $-16 * SIZE, AA + + subl $1, %eax + jne .L12 + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L18 + ALIGN_4 + +.L16: + ADD1 %xmm3, %xmm6 + movaps -14 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm1, %xmm4 + movaps -12 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + + movaps -14 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + ADD1 %xmm3, %xmm6 + pcmpeqb %xmm0, %xmm0 + ADD2 %xmm2, %xmm7 + psllq $63, %xmm0 + + movddup ALPHA_R, %xmm2 + movddup ALPHA_I, %xmm3 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + shufps $0x40, %xmm0, %xmm0 + + pxor %xmm0, %xmm4 + pxor %xmm0, %xmm6 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + shufps $0x04, %xmm0, %xmm0 + + pxor %xmm0, %xmm5 + pxor %xmm0, %xmm7 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + shufps $0x40, %xmm0, %xmm0 + + pxor %xmm0, %xmm5 + pxor %xmm0, %xmm7 +#endif + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + movhpd 1 * SIZE(C1), 
%xmm0 + movsd 0 * SIZE(C1, LDC), %xmm1 + movhpd 1 * SIZE(C1, LDC), %xmm1 +#endif + + haddpd %xmm5, %xmm4 + haddpd %xmm7, %xmm6 + + pshufd $0x4e, %xmm4, %xmm5 + pshufd $0x4e, %xmm6, %xmm7 + + mulpd %xmm2, %xmm4 + mulpd %xmm2, %xmm6 + + mulpd %xmm3, %xmm5 + mulpd %xmm3, %xmm7 + + addsubpd %xmm5, %xmm4 + addsubpd %xmm7, %xmm6 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm6 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhpd %xmm4, 1 * SIZE(C1) + movsd %xmm6, 0 * SIZE(C1, LDC) + movhpd %xmm6, 1 * SIZE(C1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + + addl $2 * SIZE, C1 # coffset += 4 + decl %ebx # i -- + jg .L10 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + movl BB, B + + leal (, LDC, 2), %eax + addl %eax, C # c += ldc + decl J # j -- + jg .L01 + ALIGN_4 + +.L20: + movl N, %eax + testl $1, %eax + jle .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl C, C1 # coffset = c + movl A, AA # aoffset = a + movl M, %ebx + ALIGN_4 + +.L21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -16 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + + pxor %xmm4, %xmm4 + prefetcht0 1 * SIZE(C1) + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + + ADD1 %xmm1, %xmm4 + movaps -14 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + ADD1 %xmm1, %xmm6 + movaps -12 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm7 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + + ADD1 %xmm1, %xmm4 + movaps -10 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + ADD1 %xmm1, %xmm6 + movaps -8 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm7 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -6 * SIZE(AA), %xmm0 + + ADD1 %xmm1, %xmm4 + movaps -6 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + ADD1 %xmm1, %xmm6 + movaps -4 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm7 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -2 * SIZE(AA), %xmm0 + + ADD1 %xmm1, %xmm4 + movaps -2 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps 
0 * SIZE(AA), %xmm0 + + ADD1 %xmm1, %xmm6 + movaps 0 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm7 + + subl $-16 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L28 + ALIGN_4 + +.L26: + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + + ADD1 %xmm1, %xmm4 + movaps -14 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L26 + ALIGN_4 + +.L28: + addpd %xmm6, %xmm4 + pcmpeqb %xmm0, %xmm0 + addpd %xmm7, %xmm5 + psllq $63, %xmm0 + + movddup ALPHA_R, %xmm2 + movddup ALPHA_I, %xmm3 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + shufps $0x40, %xmm0, %xmm0 + + pxor %xmm0, %xmm4 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + shufps $0x04, %xmm0, %xmm0 + + pxor %xmm0, %xmm5 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + shufps $0x40, %xmm0, %xmm0 + + pxor %xmm0, %xmm5 +#endif + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + movhpd 1 * SIZE(C1), %xmm0 +#endif + + haddpd %xmm5, %xmm4 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm2, %xmm4 + mulpd %xmm3, %xmm5 + addsubpd %xmm5, %xmm4 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addpd %xmm0, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhpd %xmm4, 1 * SIZE(C1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + + addl $2 * SIZE, C1 + decl %ebx # i -- + jg .L21 + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm_kernel_1x2_sse2.S b/kernel/x86/zgemm_kernel_1x2_sse2.S new file mode 100644 index 0000000000..63fc30a5b2 --- /dev/null +++ b/kernel/x86/zgemm_kernel_1x2_sse2.S @@ -0,0 +1,909 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_ALPHA_R 16 + STACK + ARGS(%esi) +#define STACK_ALPHA_I 24 + STACK + ARGS(%esi) +#define STACK_A 32 + STACK + ARGS(%esi) +#define STACK_B 36 + STACK + ARGS(%esi) +#define STACK_C 40 + STACK + ARGS(%esi) +#define STACK_LDC 44 + STACK + ARGS(%esi) +#define STACK_OFFT 48 + STACK + ARGS(%esi) + +#define POSINV 0(%esp) +#define ALPHA_R 16(%esp) +#define ALPHA_I 32(%esp) +#define K 48(%esp) +#define N 52(%esp) +#define M 56(%esp) +#define A 60(%esp) +#define C 64(%esp) +#define J 68(%esp) +#define OLD_STACK 72(%esp) +#define OFFSET 76(%esp) +#define KK 80(%esp) +#define KKK 84(%esp) +#define BUFFER 128(%esp) + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#if defined(OPTERON) || defined(BARCELONA) +#define PREFETCH prefetch +#endif + +#define PREFETCHSIZE (8 * 10 + 4) + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi + + +#define KERNEL1(address) \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm4; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + movapd 2 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 6 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 16 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 10 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 12 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + mulpd 14 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm3, %xmm6; \ + movapd 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm0, %xmm7; \ + movapd 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm4; \ + movapd 18 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 20 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 22 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL4(address) \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, 
%xmm4; \ + movapd 26 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 28 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + mulpd 30 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm3, %xmm6; \ + movapd 40 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm0, %xmm7; \ + movapd 16 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL5(address) \ + PREFETCH (PREFETCHSIZE + 8) * SIZE + (address) * 1 * SIZE(AA); \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm4; \ + movapd 34 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + mulpd 38 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm2, %xmm6; \ + movapd 48 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm1, %xmm7; \ + movapd 10 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL6(address) \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 42 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 44 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + mulpd 46 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 12 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL7(address) \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm4; \ + movapd 50 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 52 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + mulpd 54 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm2, %xmm6; \ + movapd 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm1, %xmm7; \ + movapd 14 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 58 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 60 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + mulpd 62 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 72 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp # align stack + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movl STACK_M, %ebx + movl STACK_N, %eax + movl STACK_K, %ecx + movl STACK_A, %edx + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK + + movl STACK_B, B + movl STACK_C, %ebx +#ifdef TRMMKERNEL + movss STACK_OFFT, %xmm4 +#endif + + movlpd STACK_ALPHA_R, %xmm0 + movlpd STACK_ALPHA_I, %xmm1 + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 # Generate mask + pxor %xmm2, %xmm2 + + movlpd %xmm0, 0 + ALPHA_R + movlpd %xmm0, 8 + ALPHA_R + + movlpd %xmm1, 8 + ALPHA_I + xorpd %xmm7, %xmm1 + movlpd %xmm1, 0 + ALPHA_I + + movlpd %xmm2, 0 + POSINV + movlpd %xmm7, 8 + POSINV + + movl %ebx, C + movl STACK_LDC, LDC + +#ifdef TRMMKERNEL + movss %xmm4, OFFSET + movss %xmm4, KK +#ifndef LEFT + negl KK +#endif +#endif + + sall $ZBASE_SHIFT, LDC + + sarl $1, %eax + movl %eax, J # j = n + jle .L100 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK 
+#endif + + leal BUFFER, %ecx + + movapd POSINV, %xmm7 + + movl K, %eax + sarl $1, %eax + jle .L03 + ALIGN_4 + +.L02: + prefetchnta 56 * SIZE(B) + + movlpd 0 * SIZE(B), %xmm0 + movlpd 1 * SIZE(B), %xmm1 + movlpd 2 * SIZE(B), %xmm2 + movlpd 3 * SIZE(B), %xmm3 + movlpd 4 * SIZE(B), %xmm4 + movlpd 5 * SIZE(B), %xmm5 + movlpd 6 * SIZE(B), %xmm6 + movlpd 7 * SIZE(B), %xmm7 + + movlpd %xmm0, 0 * SIZE(BB) + movlpd %xmm0, 1 * SIZE(BB) + movlpd %xmm1, 2 * SIZE(BB) + movlpd %xmm1, 3 * SIZE(BB) + movlpd %xmm2, 4 * SIZE(BB) + movlpd %xmm2, 5 * SIZE(BB) + movlpd %xmm3, 6 * SIZE(BB) + movlpd %xmm3, 7 * SIZE(BB) + movlpd %xmm4, 8 * SIZE(BB) + movlpd %xmm4, 9 * SIZE(BB) + movlpd %xmm5, 10 * SIZE(BB) + movlpd %xmm5, 11 * SIZE(BB) + movlpd %xmm6, 12 * SIZE(BB) + movlpd %xmm6, 13 * SIZE(BB) + movlpd %xmm7, 14 * SIZE(BB) + movlpd %xmm7, 15 * SIZE(BB) + + addl $ 8 * SIZE, B + subl $-16 * SIZE, BB + + decl %eax + jne .L02 + ALIGN_4 + +.L03: + movl K, %eax + andl $1, %eax + BRANCH + jle .L05 + + movlpd 0 * SIZE(B), %xmm0 + movlpd 1 * SIZE(B), %xmm1 + movlpd 2 * SIZE(B), %xmm2 + movlpd 3 * SIZE(B), %xmm3 + + movlpd %xmm0, 0 * SIZE(BB) + movlpd %xmm0, 1 * SIZE(BB) + movlpd %xmm1, 2 * SIZE(BB) + movlpd %xmm1, 3 * SIZE(BB) + movlpd %xmm2, 4 * SIZE(BB) + movlpd %xmm2, 5 * SIZE(BB) + movlpd %xmm3, 6 * SIZE(BB) + movlpd %xmm3, 7 * SIZE(BB) + + addl $4 * SIZE, B + ALIGN_4 + +.L05: + movl C, %esi # coffset = c + movl A, AA # aoffset = a + movl M, %ebx + testl %ebx, %ebx + jle .L100 + ALIGN_4 + +.L10: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 8), BB +#endif + + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + + prefetchw 2 * SIZE(%esi) + prefetchw 2 * SIZE(%esi, LDC) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + +#if 1 + andl $-8, %eax + sall $4, %eax + je .L15 +.L1X: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + cmpl $128 * 1, %eax + jle .L12 + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + cmpl $128 * 2, %eax + jle .L12 + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + cmpl $128 * 3, %eax + jle .L12 + KERNEL1(16 * 3) + KERNEL2(16 * 3) + KERNEL3(16 * 3) + KERNEL4(16 * 3) + KERNEL5(16 * 3) + KERNEL6(16 * 3) + KERNEL7(16 * 3) + KERNEL8(16 * 3) + cmpl $128 * 4, %eax + jle .L12 + KERNEL1(16 * 4) + KERNEL2(16 * 4) + KERNEL3(16 * 4) + KERNEL4(16 * 4) + KERNEL5(16 * 4) + KERNEL6(16 * 4) + KERNEL7(16 * 4) + KERNEL8(16 * 4) + cmpl $128 * 5, %eax + jle .L12 + KERNEL1(16 * 5) + KERNEL2(16 * 5) + KERNEL3(16 * 5) + KERNEL4(16 * 5) + KERNEL5(16 * 5) + KERNEL6(16 * 5) + KERNEL7(16 * 5) + KERNEL8(16 * 5) + cmpl $128 * 6, %eax + jle .L12 + KERNEL1(16 * 6) + KERNEL2(16 * 6) + KERNEL3(16 * 6) + 
KERNEL4(16 * 6) + KERNEL5(16 * 6) + KERNEL6(16 * 6) + KERNEL7(16 * 6) + KERNEL8(16 * 6) + cmpl $128 * 7, %eax + jle .L12 + KERNEL1(16 * 7) + KERNEL2(16 * 7) + KERNEL3(16 * 7) + KERNEL4(16 * 7) + KERNEL5(16 * 7) + KERNEL6(16 * 7) + KERNEL7(16 * 7) + KERNEL8(16 * 7) + + addl $128 * 4 * SIZE, BB + addl $128 * 1 * SIZE, AA + subl $128 * 8, %eax + jg .L1X + jmp .L15 + +.L12: + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB + ALIGN_4 +#else + + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + addl $64 * SIZE, BB + addl $16 * SIZE, AA + decl %eax + jne .L11 + ALIGN_4 +#endif + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + ALIGN_4 + +.L13: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movapd 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm5 + movapd 4 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 8 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L13 + ALIGN_4 + +.L14: + movapd POSINV, %xmm1 + movapd ALPHA_R, %xmm2 + movapd ALPHA_I, %xmm3 + + SHUFPD_1 %xmm5, %xmm5 + SHUFPD_1 %xmm7, %xmm7 + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm1, %xmm5 + xorpd %xmm1, %xmm7 +#else + xorpd %xmm1, %xmm4 + xorpd %xmm1, %xmm6 +#endif + +#ifndef TRMMKERNEL + movlpd 0 * SIZE(%esi), %xmm0 + movhpd 1 * SIZE(%esi), %xmm0 + movlpd 0 * SIZE(%esi, LDC), %xmm1 + movhpd 1 * SIZE(%esi, LDC), %xmm1 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 +#else + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 +#endif + + pshufd $0x4e, %xmm4, %xmm5 + pshufd $0x4e, %xmm6, %xmm7 + + mulpd %xmm2, %xmm4 + mulpd %xmm3, %xmm5 + mulpd %xmm2, %xmm6 + mulpd %xmm3, %xmm7 + + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm6 +#endif + + movlpd %xmm4, 0 * SIZE(%esi) + movhpd %xmm4, 1 * SIZE(%esi) + movlpd %xmm6, 0 * SIZE(%esi, LDC) + movhpd %xmm6, 1 * SIZE(%esi, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 8), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + + addl $2 * SIZE, %esi # coffset += 4 + decl %ebx # i -- + jg .L10 + ALIGN_4 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leal (, LDC, 2), %eax + addl %eax, C # c += ldc + decl J # j -- + jg .L01 + ALIGN_4 + +.L100: + movl N, %eax + andl $1, %eax + jle .L500 + ALIGN_4 + +.L101: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + leal BUFFER, %ecx + movapd POSINV, %xmm7 + + movl K, %eax + sarl $2, %eax + jle .L103 + ALIGN_4 + +.L102: + prefetchnta 56 * SIZE(B) + + movlpd 0 * SIZE(B), %xmm0 + movlpd 1 * SIZE(B), %xmm1 + movlpd 2 * SIZE(B), %xmm2 + movlpd 3 * SIZE(B), %xmm3 + movlpd 4 * SIZE(B), %xmm4 + movlpd 5 * SIZE(B), %xmm5 + movlpd 6 * SIZE(B), %xmm6 + movlpd 7 * SIZE(B), %xmm7 + + movlpd %xmm0, 0 * SIZE(BB) + movlpd %xmm0, 1 * SIZE(BB) + movlpd %xmm1, 2 * SIZE(BB) + movlpd 
%xmm1, 3 * SIZE(BB) + movlpd %xmm2, 4 * SIZE(BB) + movlpd %xmm2, 5 * SIZE(BB) + movlpd %xmm3, 6 * SIZE(BB) + movlpd %xmm3, 7 * SIZE(BB) + movlpd %xmm4, 8 * SIZE(BB) + movlpd %xmm4, 9 * SIZE(BB) + movlpd %xmm5, 10 * SIZE(BB) + movlpd %xmm5, 11 * SIZE(BB) + movlpd %xmm6, 12 * SIZE(BB) + movlpd %xmm6, 13 * SIZE(BB) + movlpd %xmm7, 14 * SIZE(BB) + movlpd %xmm7, 15 * SIZE(BB) + + addl $ 8 * SIZE, B + subl $-16 * SIZE, %ecx + decl %eax + jne .L102 + ALIGN_4 + +.L103: + movl K, %eax + andl $3, %eax + BRANCH + jle .L105 + ALIGN_4 + +.L104: + movlpd 0 * SIZE(B), %xmm0 + movlpd 1 * SIZE(B), %xmm1 + + movlpd %xmm0, 0 * SIZE(BB) + movlpd %xmm0, 1 * SIZE(BB) + movlpd %xmm1, 2 * SIZE(BB) + movlpd %xmm1, 3 * SIZE(BB) + + addl $2 * SIZE, B + addl $4 * SIZE, %ecx + decl %eax + jne .L104 + ALIGN_4 + +.L105: + movl C, %esi # coffset = c + movl A, AA # aoffset = a + movl M, %ebx + testl %ebx, %ebx + jle .L500 + ALIGN_4 + +.L110: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB +#else + leal BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movapd 0 * SIZE(AA), %xmm0 + movapd 8 * SIZE(AA), %xmm1 + movapd 0 * SIZE(BB), %xmm2 + movapd 8 * SIZE(BB), %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L112 + ALIGN_4 + +.L111: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movapd 4 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 10 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd 12 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movapd 6 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 14 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm7 + movapd 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd 18 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + movapd 20 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movapd 10 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm2 + mulpd 22 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + movapd 32 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm7 + movapd 12 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 26 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm4 + movapd 28 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movapd 14 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 30 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm6 + movapd 40 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm7 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L111 + ALIGN_4 + +.L112: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L114 + ALIGN_4 + +.L113: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L113 + ALIGN_4 + +.L114: + movapd POSINV, %xmm1 + movapd ALPHA_R, %xmm2 + movapd ALPHA_I, %xmm3 + + addpd %xmm6, 
%xmm4 + addpd %xmm7, %xmm5 + + SHUFPD_1 %xmm5, %xmm5 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm1, %xmm5 +#else + xorpd %xmm1, %xmm4 +#endif + +#ifndef TRMMKERNEL + movlpd 0 * SIZE(%esi), %xmm0 + movhpd 1 * SIZE(%esi), %xmm0 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm5, %xmm4 +#else + addpd %xmm5, %xmm4 +#endif + + pshufd $0x4e, %xmm4, %xmm5 + + mulpd %xmm2, %xmm4 + mulpd %xmm3, %xmm5 + + addpd %xmm5, %xmm4 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm4 +#endif + + movlpd %xmm4, 0 * SIZE(%esi) + movhpd %xmm4, 1 * SIZE(%esi) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + + addl $2 * SIZE, %esi # coffset += 4 + decl %ebx # i -- + jg .L110 + ALIGN_4 + +.L500: + movl OLD_STACK, %esp + + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm_kernel_1x2_sse3.S b/kernel/x86/zgemm_kernel_1x2_sse3.S new file mode 100644 index 0000000000..70e6400978 --- /dev/null +++ b/kernel/x86/zgemm_kernel_1x2_sse3.S @@ -0,0 +1,857 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 24 + STACK + ARGS(%esp) +#define A 32 + STACK + ARGS(%esp) +#define ARG_B 36 + STACK + ARGS(%esp) +#define C 40 + STACK + ARGS(%esp) +#define ARG_LDC 44 + STACK + ARGS(%esp) +#define OFFSET 48 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define BX 4 + STACK(%esp) +#define KK 8 + STACK(%esp) +#define KKK 12 + STACK(%esp) + +#ifdef PENTIUM4 +#define PREFETCH_R (8 * 4) +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#ifdef PENTIUMM +#define PREFETCH_R (8 * 4) +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define ADDSUB addpd +#else +#define ADDSUB subpd +#endif + +#define KERNEL1(address) \ + mulpd %xmm0, %xmm2; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + addpd %xmm2, %xmm4; \ + movddup 1 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + ADDSUB %xmm2, %xmm5; \ + movddup 2 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm6; \ + movddup 3 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + movapd 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ + ADDSUB %xmm2, %xmm7; \ + movddup 4 * SIZE + (address) * 2 * SIZE(BB), %xmm2 + +#define KERNEL2(address) \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm4; \ + movddup 5 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + ADDSUB %xmm2, %xmm5; \ + movddup 6 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm6; \ + movddup 7 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + movapd 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ + ADDSUB %xmm2, %xmm7; \ + movddup 16 * SIZE + (address) * 2 * SIZE(BB), %xmm2 + +#define KERNEL3(address) \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm4; \ + movddup 9 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + ADDSUB %xmm3, %xmm5; \ + movddup 10 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm6; \ + movddup 11 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + movapd 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ + ADDSUB %xmm3, %xmm7; \ + movddup 12 * SIZE + (address) * 2 * SIZE(BB), %xmm3 + +#define KERNEL4(address) \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm4; \ + movddup 13 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + ADDSUB %xmm3, %xmm5; \ + movddup 14 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm6; \ + movddup 15 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + movapd 16 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ + ADDSUB %xmm3, %xmm7; \ + movddup 24 * SIZE + (address) * 2 * SIZE(BB), %xmm3 + +#define KERNEL5(address) \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm4; \ + movddup 17 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + ADDSUB %xmm2, %xmm5; \ + movddup 18 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm6; \ + 
movddup 19 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + movapd 10 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ + ADDSUB %xmm2, %xmm7; \ + movddup 20 * SIZE + (address) * 2 * SIZE(BB), %xmm2 + +#define KERNEL6(address) \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm4; \ + movddup 21 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + ADDSUB %xmm2, %xmm5; \ + movddup 22 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm6; \ + movddup 23 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + movapd 12 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ + ADDSUB %xmm2, %xmm7 + +#define KERNEL7(address) \ + movddup 32 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm4; \ + movddup 25 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + ADDSUB %xmm3, %xmm5; \ + movddup 26 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm6; \ + movddup 27 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + movapd 14 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ + ADDSUB %xmm3, %xmm7; \ + movddup 28 * SIZE + (address) * 2 * SIZE(BB), %xmm3 + +#define KERNEL8(address) \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm4; \ + movddup 29 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + ADDSUB %xmm3, %xmm5; \ + movddup 30 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm6; \ + movddup 31 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + movapd 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ + ADDSUB %xmm3, %xmm7; \ + movddup 40 * SIZE + (address) * 2 * SIZE(BB), %xmm3 + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + +#ifdef TRMMKERNEL + movl OFFSET, %eax +#ifndef LEFT + negl %eax +#endif + movl %eax, KK +#endif + + sall $ZBASE_SHIFT, LDC + + movl N, %eax + sarl $1, %eax + movl %eax, J # j = n + jle .L100 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl B, BX + + movl C, %esi # coffset = c + movl A, AA # aoffset = a + + movl M, %ebx + testl %ebx, %ebx + jle .L100 + ALIGN_4 + +.L10: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 4), BB +#endif + + movl BX, %eax + + prefetcht2 (PREFETCH_R + 0) * SIZE(%eax) + prefetcht2 (PREFETCH_R + 16) * SIZE(%eax) + + subl $-8 * SIZE, BX + + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movddup 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movddup 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifdef PENTIUM4 + prefetchnta 3 * SIZE(%esi) + prefetchnta 3 * SIZE(%esi, LDC) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + +#ifdef CORE_PRESCOTT + andl $-8, %eax + sall $4, %eax + je .L12 + +.L1X: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + cmpl $128 * 1, %eax + jle .L11 + KERNEL1(16 * 1) 
+ KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + cmpl $128 * 2, %eax + jle .L11 + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + cmpl $128 * 3, %eax + jle .L11 + KERNEL1(16 * 3) + KERNEL2(16 * 3) + KERNEL3(16 * 3) + KERNEL4(16 * 3) + KERNEL5(16 * 3) + KERNEL6(16 * 3) + KERNEL7(16 * 3) + KERNEL8(16 * 3) + cmpl $128 * 4, %eax + jle .L11 + KERNEL1(16 * 4) + KERNEL2(16 * 4) + KERNEL3(16 * 4) + KERNEL4(16 * 4) + KERNEL5(16 * 4) + KERNEL6(16 * 4) + KERNEL7(16 * 4) + KERNEL8(16 * 4) + cmpl $128 * 5, %eax + jle .L11 + KERNEL1(16 * 5) + KERNEL2(16 * 5) + KERNEL3(16 * 5) + KERNEL4(16 * 5) + KERNEL5(16 * 5) + KERNEL6(16 * 5) + KERNEL7(16 * 5) + KERNEL8(16 * 5) + cmpl $128 * 6, %eax + jle .L11 + KERNEL1(16 * 6) + KERNEL2(16 * 6) + KERNEL3(16 * 6) + KERNEL4(16 * 6) + KERNEL5(16 * 6) + KERNEL6(16 * 6) + KERNEL7(16 * 6) + KERNEL8(16 * 6) + cmpl $128 * 7, %eax + jle .L11 + KERNEL1(16 * 7) + KERNEL2(16 * 7) + KERNEL3(16 * 7) + KERNEL4(16 * 7) + KERNEL5(16 * 7) + KERNEL6(16 * 7) + KERNEL7(16 * 7) + KERNEL8(16 * 7) +#if 1 + cmpl $128 * 8, %eax + jle .L11 + KERNEL1(16 * 8) + KERNEL2(16 * 8) + KERNEL3(16 * 8) + KERNEL4(16 * 8) + KERNEL5(16 * 8) + KERNEL6(16 * 8) + KERNEL7(16 * 8) + KERNEL8(16 * 8) + cmpl $128 * 9, %eax + jle .L11 + KERNEL1(16 * 9) + KERNEL2(16 * 9) + KERNEL3(16 * 9) + KERNEL4(16 * 9) + KERNEL5(16 * 9) + KERNEL6(16 * 9) + KERNEL7(16 * 9) + KERNEL8(16 * 9) + cmpl $128 * 10, %eax + jle .L11 + KERNEL1(16 * 10) + KERNEL2(16 * 10) + KERNEL3(16 * 10) + KERNEL4(16 * 10) + KERNEL5(16 * 10) + KERNEL6(16 * 10) + KERNEL7(16 * 10) + KERNEL8(16 * 10) + cmpl $128 * 11, %eax + jle .L11 + KERNEL1(16 * 11) + KERNEL2(16 * 11) + KERNEL3(16 * 11) + KERNEL4(16 * 11) + KERNEL5(16 * 11) + KERNEL6(16 * 11) + KERNEL7(16 * 11) + KERNEL8(16 * 11) + cmpl $128 * 12, %eax + jle .L11 + KERNEL1(16 * 12) + KERNEL2(16 * 12) + KERNEL3(16 * 12) + KERNEL4(16 * 12) + KERNEL5(16 * 12) + KERNEL6(16 * 12) + KERNEL7(16 * 12) + KERNEL8(16 * 12) + cmpl $128 * 13, %eax + jle .L11 + KERNEL1(16 * 13) + KERNEL2(16 * 13) + KERNEL3(16 * 13) + KERNEL4(16 * 13) + KERNEL5(16 * 13) + KERNEL6(16 * 13) + KERNEL7(16 * 13) + KERNEL8(16 * 13) + cmpl $128 * 14, %eax + jle .L11 + KERNEL1(16 * 14) + KERNEL2(16 * 14) + KERNEL3(16 * 14) + KERNEL4(16 * 14) + KERNEL5(16 * 14) + KERNEL6(16 * 14) + KERNEL7(16 * 14) + KERNEL8(16 * 14) + cmpl $128 * 15, %eax + jle .L11 + KERNEL1(16 * 15) + KERNEL2(16 * 15) + KERNEL3(16 * 15) + KERNEL4(16 * 15) + KERNEL5(16 * 15) + KERNEL6(16 * 15) + KERNEL7(16 * 15) + KERNEL8(16 * 15) +#else + addl $32 * 4 * SIZE, AA + addl $32 * 8 * SIZE, BB + subl $128 * 8, %eax + jg .L1X +#endif + +.L11: + leal (AA, %eax, 1), AA # * 16 + leal (BB, %eax, 2), BB # * 64 + +#else + + sarl $3, %eax + je .L12 + ALIGN_4 + +.L11: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + addl $32 * SIZE, BB + addl $16 * SIZE, AA + decl %eax + jne .L11 + ALIGN_4 +#endif + +.L12: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movddup ALPHA_R, %xmm1 + movddup ALPHA_I, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + ALIGN_4 + +.L13: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + ADDSUB %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 3 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 
+ movapd 2 * SIZE(AA), %xmm0 + ADDSUB %xmm2, %xmm7 + movddup 4 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L13 + ALIGN_4 + +.L14: + pcmpeqb %xmm0, %xmm0 + SHUFPD_1 %xmm5, %xmm5 + psllq $63, %xmm0 + SHUFPD_1 %xmm7, %xmm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + shufps $0x04, %xmm0, %xmm0 + + pxor %xmm0, %xmm5 + pxor %xmm0, %xmm7 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + shufps $0x40, %xmm0, %xmm0 + + pxor %xmm0, %xmm5 + pxor %xmm0, %xmm7 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + shufps $0x40, %xmm0, %xmm0 + + pxor %xmm0, %xmm4 + pxor %xmm0, %xmm6 +#else + shufps $0x40, %xmm0, %xmm0 + + pxor %xmm0, %xmm4 + pxor %xmm0, %xmm6 +#endif + + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + + movaps %xmm4, %xmm5 + movaps %xmm6, %xmm7 + + SHUFPD_1 %xmm5, %xmm5 + SHUFPD_1 %xmm7, %xmm7 + + mulpd %xmm1, %xmm4 + mulpd %xmm1, %xmm6 + + mulpd %xmm3, %xmm5 + mulpd %xmm3, %xmm7 + + addsubpd %xmm5, %xmm4 + addsubpd %xmm7, %xmm6 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + movhpd 1 * SIZE(%esi), %xmm0 + movsd 0 * SIZE(%esi, LDC), %xmm2 + movhpd 1 * SIZE(%esi, LDC), %xmm2 + + addpd %xmm0, %xmm4 + addpd %xmm2, %xmm6 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movhpd %xmm4, 1 * SIZE(%esi) + movsd %xmm6, 0 * SIZE(%esi, LDC) + movhpd %xmm6, 1 * SIZE(%esi, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + + addl $2 * SIZE, %esi # coffset += 4 + decl %ebx # i -- + jg .L10 + ALIGN_4 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leal (, LDC, 2), %eax + movl BB, B + addl %eax, C # c += ldc + decl J # j -- + jg .L01 + ALIGN_4 + +.L100: + movl N, %eax + testl $1, %eax + jle .L500 + + movl C, %esi # coffset = c + movl A, AA # aoffset = a + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + testl %ebx, %ebx + jle .L500 + ALIGN_4 + +L110: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), BB +#endif + + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movddup 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movddup 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifdef PENTIUM4 + prefetchnta 4 * SIZE(%esi) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je L112 + ALIGN_4 + +L111: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + ADDSUB %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 3 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 4 * SIZE(AA), %xmm0 + ADDSUB %xmm2, %xmm7 + movddup 4 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 5 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 6 * 
SIZE(AA), %xmm0 + ADDSUB %xmm2, %xmm5 + movddup 6 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 7 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 16 * SIZE(AA), %xmm0 + ADDSUB %xmm2, %xmm7 + movddup 16 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movddup 9 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 10 * SIZE(AA), %xmm1 + ADDSUB %xmm3, %xmm5 + movddup 10 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm6 + movddup 11 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 12 * SIZE(AA), %xmm1 + ADDSUB %xmm3, %xmm7 + movddup 12 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movddup 13 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 14 * SIZE(AA), %xmm1 + ADDSUB %xmm3, %xmm5 + movddup 14 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm6 + movddup 15 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 24 * SIZE(AA), %xmm1 + ADDSUB %xmm3, %xmm7 + movddup 24 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne L111 + ALIGN_4 + +L112: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movddup ALPHA_R, %xmm1 + movddup ALPHA_I, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je L114 + ALIGN_4 + +L113: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + ADDSUB %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg L113 + ALIGN_4 + +L114: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + pcmpeqb %xmm0, %xmm0 + SHUFPD_1 %xmm5, %xmm5 + psllq $63, %xmm0 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + shufps $0x04, %xmm0, %xmm0 + + pxor %xmm0, %xmm5 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + shufps $0x40, %xmm0, %xmm0 + + pxor %xmm0, %xmm5 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + shufps $0x40, %xmm0, %xmm0 + + pxor %xmm0, %xmm4 +#else + shufps $0x40, %xmm0, %xmm0 + + pxor %xmm0, %xmm4 +#endif + + addpd %xmm5, %xmm4 + + movaps %xmm4, %xmm5 + + SHUFPD_1 %xmm5, %xmm5 + + mulpd %xmm1, %xmm4 + mulpd %xmm3, %xmm5 + + addsubpd %xmm5, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + movhpd 1 * SIZE(%esi), %xmm0 + + addpd %xmm0, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movhpd %xmm4, 1 * SIZE(%esi) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + + addl $2 * SIZE, %esi # coffset += 4 + decl %ebx # i -- + jg L110 + ALIGN_4 + +.L500: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm_kernel_2x1_core2.S b/kernel/x86/zgemm_kernel_2x1_core2.S new file mode 100644 index 0000000000..3ed53425f7 --- /dev/null +++ b/kernel/x86/zgemm_kernel_2x1_core2.S @@ -0,0 +1,695 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_ALPHA_R 16 + STACK + ARGS(%esi) +#define STACK_ALPHA_I 24 + STACK + ARGS(%esi) +#define STACK_A 32 + STACK + ARGS(%esi) +#define STACK_B 36 + STACK + ARGS(%esi) +#define STACK_C 40 + STACK + ARGS(%esi) +#define STACK_LDC 44 + STACK + ARGS(%esi) +#define STACK_OFFT 48 + STACK + ARGS(%esi) + +#define ALPHA_R 16(%esp) +#define ALPHA_I 32(%esp) +#define K 48(%esp) +#define N 52(%esp) +#define M 56(%esp) +#define A 60(%esp) +#define C 64(%esp) +#define J 68(%esp) +#define BX 72(%esp) +#define OLD_STACK 76(%esp) +#define OFFSET 80(%esp) +#define KK 84(%esp) +#define KKK 88(%esp) +#define BUFFER 128(%esp) + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#define PREFETCH_R (8 * 16 + 0) +#define PREFETCH_W (PREFETCH_R * 2) + +#define PREFETCHSIZE (8 * 16 + 4) +#define PREFETCH prefetcht0 + +#define B %edi +#define LDC %ebp +#define AA %edx +#define BB %ecx +#define C1 %esi + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define ADD1 addpd +#define ADD2 addpd +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define ADD1 addpd +#define ADD2 subpd +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define ADD1 subpd +#define ADD2 addpd +#else +#define ADD1 subpd +#define ADD2 subpd +#endif + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp # align stack + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movd STACK_M, %mm0 + movl STACK_N, %eax + movd STACK_K, %mm1 + movd STACK_A, %mm2 + movl STACK_B, B + movd STACK_C, %mm3 + movl STACK_LDC, LDC +#ifdef TRMMKERNEL + movd STACK_OFFT, %mm4 +#endif + + movsd STACK_ALPHA_R, %xmm0 + movsd STACK_ALPHA_I, %xmm1 + + movddup %xmm0, %xmm0 + movddup %xmm1, %xmm1 + + movapd %xmm0, ALPHA_R + movapd %xmm1, ALPHA_I + + movd %mm1, K + movl %eax, N + 
movd %mm0, M + movd %mm2, A + movd %mm3, C + movl %esi, OLD_STACK +#ifdef TRMMKERNEL + movd %mm4, OFFSET + movd %mm4, KK +#ifndef LEFT + negl KK +#endif +#endif + + subl $-16 * SIZE, A + subl $-16 * SIZE, B + sall $ZBASE_SHIFT, LDC + + movl %eax, J # j = n + testl %eax, %eax + jle .L999 + ALIGN_2 + +.L01: + leal 16 * SIZE + BUFFER, BB + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + sarl $2, %eax + jle .L03 + ALIGN_2 + +.L02: + prefetcht0 (PREFETCH_R + 0) * SIZE(B) + + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + movddup -14 * SIZE(B), %xmm2 + movddup -13 * SIZE(B), %xmm3 + movddup -12 * SIZE(B), %xmm4 + movddup -11 * SIZE(B), %xmm5 + movddup -10 * SIZE(B), %xmm6 + movddup -9 * SIZE(B), %xmm7 + + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm1, -14 * SIZE(BB) + movapd %xmm2, -12 * SIZE(BB) + movapd %xmm3, -10 * SIZE(BB) + + movapd %xmm4, -8 * SIZE(BB) + movapd %xmm5, -6 * SIZE(BB) + movapd %xmm6, -4 * SIZE(BB) + movapd %xmm7, -2 * SIZE(BB) + + addl $ 8 * SIZE, B + subl $-16 * SIZE, BB + decl %eax + jne .L02 + ALIGN_2 + +.L03: + movl K, %eax + andl $3, %eax + BRANCH + jle .L05 + ALIGN_2 + +.L04: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm1, -14 * SIZE(BB) + + addl $ 2 * SIZE, B + addl $ 4 * SIZE, BB + decl %eax + jne .L04 + ALIGN_4 + +.L05: + movl B, BX + + movl C, C1 # coffset = c + movl A, AA # aoffset = a + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L10: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal 16 * SIZE + BUFFER, BB +#else + + leal 16 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + +#endif + + movapd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -8 * SIZE(AA), %xmm3 + pxor %xmm6, %xmm6 + prefetcht0 3 * SIZE(C1) + pxor %xmm7, %xmm7 + movapd %xmm1, %xmm2 + + movl BX, %eax + prefetcht0 (%eax) + subl $-8 * SIZE, %eax + movl %eax, BX + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + mulpd %xmm0, %xmm1 + ADD1 %xmm1, %xmm4 + movapd -14 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + ADD2 %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + ADD1 %xmm2, %xmm6 + movapd -12 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm1 + movapd -12 * SIZE(AA), %xmm0 + ADD2 %xmm1, %xmm7 + PADDING; + movapd %xmm2, %xmm1 + + mulpd %xmm0, %xmm2 + ADD1 %xmm2, %xmm4 + movapd -10 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm0 + ADD2 %xmm0, %xmm5 + movapd -10 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm1 + ADD1 %xmm1, %xmm6 + movapd -8 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + PADDING; + movapd 0 * SIZE(AA), %xmm0 + ADD2 %xmm2, %xmm7 + PADDING; + movapd %xmm1, %xmm2 + + mulpd %xmm3, %xmm1 + ADD1 %xmm1, %xmm4 + movapd -6 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm3 + ADD2 %xmm3, %xmm5 + movapd -6 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm2 + ADD1 %xmm2, %xmm6 + movapd -4 * SIZE(BB), %xmm2 + mulpd %xmm3, %xmm1 + movapd -4 * SIZE(AA), %xmm3 + ADD2 %xmm1, %xmm7 + PADDING; + movapd %xmm2, %xmm1 + + mulpd %xmm3, %xmm2 + ADD1 %xmm2, %xmm4 + movapd -2 * 
SIZE(BB), %xmm2 + mulpd %xmm2, %xmm3 + ADD2 %xmm3, %xmm5 + movapd -2 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm1 + ADD1 %xmm1, %xmm6 + PADDING; + movapd 0 * SIZE(BB), %xmm1 + mulpd %xmm3, %xmm2 + movapd 8 * SIZE(AA), %xmm3 + ADD2 %xmm2, %xmm7 + PADDING; + movapd %xmm1, %xmm2 + + mulpd %xmm0, %xmm1 + ADD1 %xmm1, %xmm4 + movapd 2 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + ADD2 %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + ADD1 %xmm2, %xmm6 + movapd 4 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm1 + movapd 4 * SIZE(AA), %xmm0 + ADD2 %xmm1, %xmm7 + PADDING; + movapd %xmm2, %xmm1 + + mulpd %xmm0, %xmm2 + ADD1 %xmm2, %xmm4 + movapd 6 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm0 + ADD2 %xmm0, %xmm5 + movapd 6 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm1 + ADD1 %xmm1, %xmm6 + movapd 8 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + movapd 16 * SIZE(AA), %xmm0 + ADD2 %xmm2, %xmm7 + PADDING; + movapd %xmm1, %xmm2 + + mulpd %xmm3, %xmm1 + ADD1 %xmm1, %xmm4 + movapd 10 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm3 + ADD2 %xmm3, %xmm5 + movapd 10 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm2 + ADD1 %xmm2, %xmm6 + movapd 12 * SIZE(BB), %xmm2 + mulpd %xmm3, %xmm1 + movapd 12 * SIZE(AA), %xmm3 + ADD2 %xmm1, %xmm7 + PADDING; + movapd %xmm2, %xmm1 + + mulpd %xmm3, %xmm2 + ADD1 %xmm2, %xmm4 + movapd 14 * SIZE(BB), %xmm2 + mulpd %xmm2, %xmm3 + subl $-32 * SIZE, BB + ADD2 %xmm3, %xmm5 + movapd 14 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm1 + ADD1 %xmm1, %xmm6 + movapd -16 * SIZE(BB), %xmm1 + mulpd %xmm3, %xmm2 + movapd 24 * SIZE(AA), %xmm3 + ADD2 %xmm2, %xmm7 + PADDING; + movapd %xmm1, %xmm2 + + subl $-32 * SIZE, AA + decl %eax + BRANCH + jne .L12 + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L18 + ALIGN_4 + +.L16: + mulpd %xmm0, %xmm1 + ADD1 %xmm1, %xmm4 + movapd -14 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm3 + mulpd %xmm0, %xmm1 + movapd -14 * SIZE(AA), %xmm0 + ADD2 %xmm1, %xmm5 + movapd -12 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + ADD1 %xmm2, %xmm6 + mulpd %xmm0, %xmm3 + movapd -12 * SIZE(AA), %xmm0 + ADD2 %xmm3, %xmm7 + movapd %xmm1, %xmm2 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + movapd ALPHA_R, %xmm2 + movapd ALPHA_I, %xmm3 + + SHUFPD_1 %xmm5, %xmm5 + SHUFPD_1 %xmm7, %xmm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + addsubpd %xmm5, %xmm4 + addsubpd %xmm7, %xmm6 + + movapd %xmm4, %xmm5 + movapd %xmm6, %xmm7 +#else + addsubpd %xmm4, %xmm5 + addsubpd %xmm6, %xmm7 + + movapd %xmm5, %xmm4 + movapd %xmm7, %xmm6 +#endif + +#ifndef TRMMKERNEL + movsd 0 * SIZE(C1), %xmm0 + movhpd 1 * SIZE(C1), %xmm0 + movsd 2 * SIZE(C1), %xmm1 + movhpd 3 * SIZE(C1), %xmm1 +#endif + + SHUFPD_1 %xmm5, %xmm5 + SHUFPD_1 %xmm7, %xmm7 + + mulpd %xmm2, %xmm4 + mulpd %xmm2, %xmm6 + + mulpd %xmm3, %xmm5 + mulpd %xmm3, %xmm7 + + addsubpd %xmm5, %xmm4 + addsubpd %xmm7, %xmm6 + +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm6 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhpd %xmm4, 1 * SIZE(C1) + movsd %xmm6, 2 * SIZE(C1) + movhpd %xmm6, 3 * SIZE(C1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $4 * SIZE, C1 # coffset += 4 + decl %ebx # i -- + jg .L10 + +.L20: + movl M, %ebx + testl $1, %ebx + je .L29 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal 16 * SIZE + BUFFER, %ecx +#else + + leal 16 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + +#endif + + movapd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -8 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movapd -8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax + addl $1, %eax + movl %eax, KKK +#endif + + sarl $3, %eax + jle .L22 + +.L21: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BB), %xmm0 + ADD1 %xmm1, %xmm4 + movapd -12 * SIZE(BB), %xmm1 + ADD2 %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm1 + mulpd -10 * SIZE(BB), %xmm0 + ADD1 %xmm1, %xmm6 + movapd 0 * SIZE(BB), %xmm1 + ADD2 %xmm0, %xmm7 + movapd -12 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd -6 * SIZE(BB), %xmm0 + ADD1 %xmm3, %xmm4 + movapd -4 * SIZE(BB), %xmm3 + ADD2 %xmm0, %xmm5 + movapd -10 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd -2 * SIZE(BB), %xmm0 + ADD1 %xmm3, %xmm6 + movapd 8 * SIZE(BB), %xmm3 + ADD2 %xmm0, %xmm7 + movapd 0 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm1 + mulpd 2 * SIZE(BB), %xmm2 + ADD1 %xmm1, %xmm4 + movapd 4 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + movapd -6 * SIZE(AA), %xmm2 + mulpd %xmm2, %xmm1 + mulpd 6 * SIZE(BB), %xmm2 + ADD1 %xmm1, %xmm6 + movapd 16 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm7 + movapd -4 * SIZE(AA), %xmm2 + mulpd %xmm2, %xmm3 + mulpd 10 * SIZE(BB), %xmm2 + ADD1 %xmm3, %xmm4 + movapd 12 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm5 + movapd -2 * SIZE(AA), %xmm2 + mulpd %xmm2, %xmm3 + mulpd 14 * SIZE(BB), %xmm2 + ADD1 %xmm3, %xmm6 + movapd 24 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + movapd 8 * SIZE(AA), %xmm2 + + subl $-16 * SIZE, AA + addl $ 32 * SIZE, BB + decl %eax # l-- + jg .L21 + ALIGN_2 + +.L22: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax # l = (k & 3) + jle .L24 + ALIGN_2 + +.L23: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BB), %xmm0 + ADD1 %xmm1, %xmm4 + movapd -12 * SIZE(BB), %xmm1 + ADD2 %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax # l-- + jg .L23 + +.L24: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + movapd ALPHA_R, %xmm2 + movapd ALPHA_I, %xmm3 + + SHUFPD_1 %xmm5, %xmm5 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + addsubpd %xmm5, %xmm4 + movapd %xmm4, %xmm5 +#else + addsubpd %xmm4, %xmm5 + movapd %xmm5, %xmm4 +#endif + +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(C1), %xmm0 + movhpd 1 * SIZE(C1), %xmm0 +#endif + + SHUFPD_1 %xmm5, %xmm5 + + mulpd %xmm2, %xmm4 + + mulpd %xmm3, %xmm5 + + addsubpd %xmm5, %xmm4 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addpd %xmm0, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhpd %xmm4, 1 * SIZE(C1) + ALIGN_2 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $1, KK +#endif + + addl LDC, C # c += ldc + decl J # j -- + jg .L01 + +.L999: + movl OLD_STACK, %esp + + emms + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_2 + + EPILOGUE diff --git a/kernel/x86/zgemm_kernel_2x1_sse2.S b/kernel/x86/zgemm_kernel_2x1_sse2.S new file mode 100644 index 0000000000..3ef96d1434 --- /dev/null +++ b/kernel/x86/zgemm_kernel_2x1_sse2.S @@ -0,0 +1,824 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define PREFETCHSIZE (8 * 4) + +#if !defined(HAVE_SSE2) || !defined(HAVE_MMX) +#error You have to check your configuration. 
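+/* This kernel prefetches the A panel with MMX loads (movq into %mm2  */
+/* inside the KERNEL macros below) and does its arithmetic with SSE2  */
+/* packed doubles, so both HAVE_MMX and HAVE_SSE2 are required here.  */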
+#endif + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_ALPHA_R 16 + STACK + ARGS(%esi) +#define STACK_ALPHA_I 24 + STACK + ARGS(%esi) +#define STACK_A 32 + STACK + ARGS(%esi) +#define STACK_B 36 + STACK + ARGS(%esi) +#define STACK_C 40 + STACK + ARGS(%esi) +#define STACK_LDC 44 + STACK + ARGS(%esi) +#define STACK_OFFT 48 + STACK + ARGS(%esi) + +#define POSINV 0(%esp) +#define ALPHA_R 16(%esp) +#define ALPHA_I 32(%esp) +#define K 48(%esp) +#define N 52(%esp) +#define M 56(%esp) +#define A 60(%esp) +#define C 64(%esp) +#define J 68(%esp) +#define BX 72(%esp) +#define OLD_STACK 76(%esp) +#define OFFSET 80(%esp) +#define KK 84(%esp) +#define KKK 88(%esp) +#define BUFFER 128(%esp) + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#define B %edi +#define LDC %ebp +#define AA %edx +#define BB %ecx + +#define KERNEL1(address) \ + movq (PREFETCHSIZE + 0) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 0 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 2 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 4 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulpd %xmm0, %xmm2; \ + mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 6 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 16 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + movq (PREFETCHSIZE + 8) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm1, %xmm3; \ + mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 8 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 10 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 12 * SIZE + (address) * SIZE(AA), %xmm1 + +#define KERNEL4(address) \ + mulpd %xmm1, %xmm3; \ + mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 14 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 24 * SIZE + (address) * SIZE(AA), %xmm1 + +#define KERNEL5(address) \ + movq (PREFETCHSIZE + 16) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 18 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 20 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL6(address) 
\ + mulpd %xmm0, %xmm2; \ + mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 22 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 32 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 32 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL7(address) \ + movq (PREFETCHSIZE + 24) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm1, %xmm3; \ + mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 26 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 28 * SIZE + (address) * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulpd %xmm1, %xmm3; \ + mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 30 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 40 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 40 * SIZE + (address) * SIZE(AA), %xmm1 + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp # align stack + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movd STACK_M, %mm0 + movl STACK_N, %eax + movd STACK_K, %mm1 + movd STACK_A, %mm2 + movl STACK_B, B + movd STACK_C, %mm3 + movl STACK_LDC, LDC +#ifdef TRMMKERNEL + movd STACK_OFFT, %mm4 +#endif + + movsd STACK_ALPHA_R, %xmm0 + movsd STACK_ALPHA_I, %xmm1 + + pxor %xmm7, %xmm7 + cmpeqpd %xmm7, %xmm7 + psllq $63, %xmm7 # Generate mask + pxor %xmm2, %xmm2 + + movsd %xmm0, 0 + ALPHA_R + movsd %xmm0, 8 + ALPHA_R + + movsd %xmm1, 8 + ALPHA_I + xorpd %xmm7, %xmm1 + movsd %xmm1, 0 + ALPHA_I + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + movsd %xmm7, 0 + POSINV + movsd %xmm2, 8 + POSINV +#else + movsd %xmm2, 0 + POSINV + movsd %xmm7, 8 + POSINV +#endif + + movd %mm1, K + movl %eax, N + movd %mm0, M + movd %mm2, A + movd %mm3, C + movl %esi, OLD_STACK +#ifdef TRMMKERNEL + movd %mm4, OFFSET + movd %mm4, KK +#ifndef LEFT + negl KK +#endif +#endif + + sall $ZBASE_SHIFT, LDC + movl %eax, J # j = n + testl %eax, %eax + jle .L999 + ALIGN_2 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + leal BUFFER, BB + movapd POSINV, %xmm7 + + movl K, %eax + sarl $2, %eax + jle .L03 + ALIGN_2 + +.L02: + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm2 + movsd 3 * SIZE(B), %xmm3 + + unpcklpd %xmm0, %xmm0 + unpcklpd %xmm1, %xmm1 + unpcklpd %xmm2, %xmm2 + unpcklpd %xmm3, %xmm3 + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + xorpd %xmm7, %xmm1 + xorpd %xmm7, %xmm3 +#else + xorpd %xmm7, %xmm0 + xorpd %xmm7, %xmm2 +#endif + + movapd %xmm0, 0 * SIZE(BB) + movapd %xmm1, 2 * SIZE(BB) + movapd %xmm2, 4 * SIZE(BB) + movapd %xmm3, 6 * SIZE(BB) + + movsd 4 * SIZE(B), %xmm0 
+ movsd 5 * SIZE(B), %xmm1 + movsd 6 * SIZE(B), %xmm2 + movsd 7 * SIZE(B), %xmm3 + + unpcklpd %xmm0, %xmm0 + unpcklpd %xmm1, %xmm1 + unpcklpd %xmm2, %xmm2 + unpcklpd %xmm3, %xmm3 + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + xorpd %xmm7, %xmm1 + xorpd %xmm7, %xmm3 +#else + xorpd %xmm7, %xmm0 + xorpd %xmm7, %xmm2 +#endif + + movapd %xmm0, 8 * SIZE(BB) + movapd %xmm1, 10 * SIZE(BB) + movapd %xmm2, 12 * SIZE(BB) + movapd %xmm3, 14 * SIZE(BB) + + prefetcht0 104 * SIZE(B) + + addl $ 8 * SIZE, B + addl $16 * SIZE, BB + decl %eax + jne .L02 + ALIGN_2 + +.L03: + movl K, %eax + andl $3, %eax + BRANCH + jle .L05 + ALIGN_2 + +.L04: + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + + unpcklpd %xmm0, %xmm0 + unpcklpd %xmm1, %xmm1 + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + xorpd %xmm7, %xmm1 +#else + xorpd %xmm7, %xmm0 +#endif + + movapd %xmm0, 0 * SIZE(BB) + movapd %xmm1, 2 * SIZE(BB) + + addl $ 2 * SIZE, B + addl $ 4 * SIZE, BB + decl %eax + jne .L04 + ALIGN_4 + +.L05: + movl B, BX + + movl C, %esi # coffset = c + movl A, AA # aoffset = a + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L50 + ALIGN_4 + +.L10: + movl BX, %eax + + prefetcht2 0 * SIZE(%eax) + + subl $-8 * SIZE, BX + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movapd 0 * SIZE + BUFFER, %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE + BUFFER, %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#else + + leal BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#endif + + prefetchnta 3 * SIZE(%esi) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + andl $-8, %eax + NOBRANCH + je .L12 + sall $3, %eax + +.L1X: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + cmpl $64 * 1, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 1) + KERNEL2(32 * 1) + KERNEL3(32 * 1) + KERNEL4(32 * 1) + KERNEL5(32 * 1) + KERNEL6(32 * 1) + KERNEL7(32 * 1) + KERNEL8(32 * 1) + cmpl $64 * 2, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 2) + KERNEL2(32 * 2) + KERNEL3(32 * 2) + KERNEL4(32 * 2) + KERNEL5(32 * 2) + KERNEL6(32 * 2) + KERNEL7(32 * 2) + KERNEL8(32 * 2) + cmpl $64 * 3, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 3) + KERNEL2(32 * 3) + KERNEL3(32 * 3) + KERNEL4(32 * 3) + KERNEL5(32 * 3) + KERNEL6(32 * 3) + KERNEL7(32 * 3) + KERNEL8(32 * 3) + cmpl $64 * 4, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 4) + KERNEL2(32 * 4) + KERNEL3(32 * 4) + KERNEL4(32 * 4) + KERNEL5(32 * 4) + KERNEL6(32 * 4) + KERNEL7(32 * 4) + KERNEL8(32 * 4) + cmpl $64 * 5, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 5) + KERNEL2(32 * 5) + KERNEL3(32 * 5) + KERNEL4(32 * 5) + KERNEL5(32 * 5) + KERNEL6(32 * 5) + KERNEL7(32 * 5) + KERNEL8(32 * 
5) + cmpl $64 * 6, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 6) + KERNEL2(32 * 6) + KERNEL3(32 * 6) + KERNEL4(32 * 6) + KERNEL5(32 * 6) + KERNEL6(32 * 6) + KERNEL7(32 * 6) + KERNEL8(32 * 6) + cmpl $64 * 7, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 7) + KERNEL2(32 * 7) + KERNEL3(32 * 7) + KERNEL4(32 * 7) + KERNEL5(32 * 7) + KERNEL6(32 * 7) + KERNEL7(32 * 7) + KERNEL8(32 * 7) + + addl $64 * 4 * SIZE, AA + addl $64 * 4 * SIZE, BB + subl $64 * 8, %eax + BRANCH + jg .L1X + +.L11: + leal (BB, %eax, 4), BB + leal (AA, %eax, 4), AA + +.L12: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + +.L13: + movapd 2 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movapd 0 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm1 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm5 + movapd 2 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movapd 4 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm1 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm7 + + addl $4 * SIZE, AA # aoffset += 8 + addl $4 * SIZE, BB # boffset1 += 8 + subl $1, %eax + jg .L13 + +.L14: + movapd ALPHA_R, %xmm2 + movapd ALPHA_I, %xmm3 + + SHUFPD_1 %xmm5, %xmm5 + SHUFPD_1 %xmm7, %xmm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 +#else + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 +#endif + + movapd %xmm4, %xmm5 + movapd %xmm6, %xmm7 + + SHUFPD_1 %xmm4, %xmm4 + SHUFPD_1 %xmm6, %xmm6 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + movhpd 1 * SIZE(%esi), %xmm0 + movsd 2 * SIZE(%esi), %xmm1 + movhpd 3 * SIZE(%esi), %xmm1 + + addpd %xmm0, %xmm4 + addpd %xmm1, %xmm6 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movhpd %xmm4, 1 * SIZE(%esi) + movsd %xmm6, 2 * SIZE(%esi) + movhpd %xmm6, 3 * SIZE(%esi) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $4 * SIZE, %esi # coffset += 4 + decl %ebx # i -- + jg .L10 + +.L50: + movl M, %ebx + testl $1, %ebx + je .L99 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, %ecx + + movapd 0 * SIZE + BUFFER, %xmm1 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE + BUFFER, %xmm2 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 +#else + + leal BUFFER, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movapd 0 * SIZE(BB), %xmm1 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 +#endif + + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax + addl $1, %eax + movl %eax, KKK +#endif + + sarl $2, %eax # l = (k >> 2) + jle .L52 + +.L51: + mulpd %xmm0, %xmm1 + movapd 2 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm4 + movapd 16 * SIZE(BB), %xmm1 + + mulpd %xmm0, %xmm3 + movapd 2 * SIZE(AA), %xmm0 + 
addpd %xmm3, %xmm5 + movapd 4 * SIZE(BB), %xmm3 + + mulpd %xmm0, %xmm3 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + + addpd %xmm0, %xmm5 + movapd 4 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 10 * SIZE(BB), %xmm0 + + addpd %xmm2, %xmm4 + addpd %xmm0, %xmm5 + movapd 6 * SIZE(AA), %xmm0 + + movapd 12 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movapd 24 * SIZE(BB), %xmm2 + + mulpd 14 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm5 + movapd 8 * SIZE(AA), %xmm0 + + addl $ 8 * SIZE, AA # aoffset += 2 + addl $16 * SIZE, BB # boffset1 += 4 + + decl %eax # l-- + jg .L51 + ALIGN_2 + +.L52: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $3, %eax # l = (k & 3) + jle .L54 + ALIGN_2 + +.L53: + movapd 0 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movapd 2 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA # aoffset += 2 + addl $4 * SIZE, BB # boffset1 += 4 + decl %eax # l-- + jg .L53 + +.L54: + movapd ALPHA_R, %xmm2 + movapd ALPHA_I, %xmm3 + + SHUFPD_1 %xmm5, %xmm5 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm5, %xmm4 +#else + addpd %xmm5, %xmm4 +#endif + + movapd %xmm4, %xmm5 + + SHUFPD_1 %xmm4, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm5, %xmm4 + +#ifndef TRMMKERNEL + SHUFPD_2 %xmm4, %xmm4 + + movsd 0 * SIZE(%esi), %xmm0 + movhpd 1 * SIZE(%esi), %xmm0 + + addpd %xmm0, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movhpd %xmm4, 1 * SIZE(%esi) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_2 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $1, KK +#endif + + addl LDC, C # c += ldc + decl J # j -- + jg .L01 + +.L999: + movl OLD_STACK, %esp + + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_2 + + EPILOGUE diff --git a/kernel/x86/zgemm_kernel_2x2_barcelona.S b/kernel/x86/zgemm_kernel_2x2_barcelona.S new file mode 100644 index 0000000000..2ad68935ce --- /dev/null +++ b/kernel/x86/zgemm_kernel_2x2_barcelona.S @@ -0,0 +1,1363 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_ALPHA_R 16 + STACK + ARGS(%esi) +#define STACK_ALPHA_I 20 + STACK + ARGS(%esi) +#define STACK_A 24 + STACK + ARGS(%esi) +#define STACK_B 28 + STACK + ARGS(%esi) +#define STACK_C 32 + STACK + ARGS(%esi) +#define STACK_LDC 36 + STACK + ARGS(%esi) +#define STACK_OFFT 40 + STACK + ARGS(%esi) + +#define POSINV 0(%esp) +#define ALPHA_R 16(%esp) +#define ALPHA_I 32(%esp) +#define K 48(%esp) +#define N 52(%esp) +#define M 56(%esp) +#define A 60(%esp) +#define C 64(%esp) +#define J 68(%esp) +#define OLD_STACK 72(%esp) +#define OFFSET 76(%esp) +#define KK 80(%esp) +#define KKK 84(%esp) +#define BUFFER 128(%esp) + +#define B %edi +#define LDC %ebp +#define AA %edx +#define BB %ecx + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#define PREFETCH prefetch +#define PREFETCHSIZE (16 * 17 + 0) + +#define RPREFETCHSIZE (16 * 9 + 0) +#define WPREFETCHSIZE (16 * 9 + 0) + +#define KERNEL1(address) \ + mulps %xmm0, %xmm2; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + addps %xmm2, %xmm4; \ + movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL4(address) \ + mulps %xmm0, %xmm3; 
\ + addps %xmm3, %xmm4; \ + movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL5(address) \ + PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 1 * SIZE(AA); \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL6(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL7(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1; + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp # align stack + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movl STACK_M, %ebx + movl STACK_N, %eax + movl STACK_K, %ecx + movl STACK_A, %edx + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK + + movl STACK_B, %edi + movl STACK_C, %ebx +#ifdef TRMMKERNEL + movss STACK_OFFT, %xmm4 +#endif + + movss STACK_ALPHA_R, %xmm0 + movss STACK_ALPHA_I, %xmm1 + + xorps %xmm7, %xmm7 + cmpeqps %xmm7, %xmm7 + pslld $31, %xmm7 # Generate mask + xorps %xmm2, %xmm2 + + shufps $0, %xmm0, %xmm0 + + movaps %xmm0, 0 + ALPHA_R + movss %xmm1, 4 + ALPHA_I + movss %xmm1, 12 + ALPHA_I + xorps %xmm7, %xmm1 + movss %xmm1, 0 + ALPHA_I + movss %xmm1, 8 + ALPHA_I + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + movss %xmm7, 0 + POSINV + movss %xmm2, 4 + POSINV + movss %xmm7, 8 + POSINV + movss %xmm2, 12 + POSINV +#else + movss %xmm2, 0 + POSINV + movss %xmm7, 
4 + POSINV + movss %xmm2, 8 + POSINV + movss %xmm7, 12 + POSINV +#endif + + EMMS + + movl %ebx, C + movl STACK_LDC, LDC + +#ifdef TRMMKERNEL + movss %xmm4, OFFSET + movss %xmm4, KK +#ifndef LEFT + negl KK +#endif +#endif + + sall $ZBASE_SHIFT, LDC + movl %eax, J # j = n + sarl $1, J + jle .L100 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + +/* Copying to Sub Buffer */ + leal BUFFER, %ecx + + movaps POSINV, %xmm7 + + movl K, %eax + sarl $1, %eax + jle .L03 + ALIGN_4 + +.L02: + prefetch (RPREFETCHSIZE + 0) * SIZE(%edi) + + movaps 0 * SIZE(%edi), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + xorps %xmm7, %xmm1 + xorps %xmm7, %xmm3 +#else + xorps %xmm7, %xmm0 + xorps %xmm7, %xmm2 +#endif + + prefetchw (WPREFETCHSIZE + 0) * SIZE(%ecx) + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + movaps %xmm2, 8 * SIZE(%ecx) + movaps %xmm3, 12 * SIZE(%ecx) + + movaps 4 * SIZE(%edi), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + xorps %xmm7, %xmm1 + xorps %xmm7, %xmm3 +#else + xorps %xmm7, %xmm0 + xorps %xmm7, %xmm2 +#endif + + prefetchw (WPREFETCHSIZE + 0) * SIZE(%ecx) + + movaps %xmm0, 16 * SIZE(%ecx) + movaps %xmm1, 20 * SIZE(%ecx) + movaps %xmm2, 24 * SIZE(%ecx) + movaps %xmm3, 28 * SIZE(%ecx) + + addl $ 8 * SIZE, %edi + subl $-32 * SIZE, %ecx + + decl %eax + jne .L02 + ALIGN_4 + +.L03: + movl K, %eax + andl $1, %eax + BRANCH + jle .L05 + ALIGN_4 + +.L04: + movaps 0 * SIZE(%edi), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + xorps %xmm7, %xmm1 + xorps %xmm7, %xmm3 +#else + xorps %xmm7, %xmm0 + xorps %xmm7, %xmm2 +#endif + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + movaps %xmm2, 8 * SIZE(%ecx) + movaps %xmm3, 12 * SIZE(%ecx) + + addl $ 4 * SIZE, %edi + ALIGN_4 + +.L05: + movl C, %esi + movl A, %edx + movl M, %ebx + sarl $1, %ebx + jle .L30 + ALIGN_4 + +.L10: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 8), BB +#endif + + movaps 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps 16 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movaps 16 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + + prefetchw 3 * SIZE(%esi) + prefetchw 3 * SIZE(%esi, LDC) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + + andl $-8, %eax + sall $4, %eax + je .L15 +.L1X: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + 
KERNEL8(32 * 0) + cmpl $128 * 1, %eax + jle .L12 + KERNEL1(32 * 1) + KERNEL2(32 * 1) + KERNEL3(32 * 1) + KERNEL4(32 * 1) + KERNEL5(32 * 1) + KERNEL6(32 * 1) + KERNEL7(32 * 1) + KERNEL8(32 * 1) + cmpl $128 * 2, %eax + jle .L12 + KERNEL1(32 * 2) + KERNEL2(32 * 2) + KERNEL3(32 * 2) + KERNEL4(32 * 2) + KERNEL5(32 * 2) + KERNEL6(32 * 2) + KERNEL7(32 * 2) + KERNEL8(32 * 2) + cmpl $128 * 3, %eax + jle .L12 + KERNEL1(32 * 3) + KERNEL2(32 * 3) + KERNEL3(32 * 3) + KERNEL4(32 * 3) + KERNEL5(32 * 3) + KERNEL6(32 * 3) + KERNEL7(32 * 3) + KERNEL8(32 * 3) + cmpl $128 * 4, %eax + jle .L12 + KERNEL1(32 * 4) + KERNEL2(32 * 4) + KERNEL3(32 * 4) + KERNEL4(32 * 4) + KERNEL5(32 * 4) + KERNEL6(32 * 4) + KERNEL7(32 * 4) + KERNEL8(32 * 4) + cmpl $128 * 5, %eax + jle .L12 + KERNEL1(32 * 5) + KERNEL2(32 * 5) + KERNEL3(32 * 5) + KERNEL4(32 * 5) + KERNEL5(32 * 5) + KERNEL6(32 * 5) + KERNEL7(32 * 5) + KERNEL8(32 * 5) + cmpl $128 * 6, %eax + jle .L12 + KERNEL1(32 * 6) + KERNEL2(32 * 6) + KERNEL3(32 * 6) + KERNEL4(32 * 6) + KERNEL5(32 * 6) + KERNEL6(32 * 6) + KERNEL7(32 * 6) + KERNEL8(32 * 6) + cmpl $128 * 7, %eax + jle .L12 + KERNEL1(32 * 7) + KERNEL2(32 * 7) + KERNEL3(32 * 7) + KERNEL4(32 * 7) + KERNEL5(32 * 7) + KERNEL6(32 * 7) + KERNEL7(32 * 7) + KERNEL8(32 * 7) + + addl $128 * 8 * SIZE, BB + addl $128 * 2 * SIZE, AA + subl $128 * 8, %eax + jg .L1X + jmp .L15 + +.L12: + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA_R, %xmm1 + movaps ALPHA_I, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + ALIGN_4 + +.L13: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 4 * SIZE(AA), %xmm0 + + addl $ 4 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L13 + ALIGN_4 + +.L14: + shufps $0xb1, %xmm5, %xmm5 + shufps $0xb1, %xmm7, %xmm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 +#else + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 +#endif + + movaps %xmm4, %xmm5 + movaps %xmm6, %xmm7 + + shufps $0xb1, %xmm4, %xmm4 + shufps $0xb1, %xmm6, %xmm6 + + mulps %xmm1, %xmm5 + mulps %xmm3, %xmm4 + mulps %xmm1, %xmm7 + mulps %xmm3, %xmm6 + + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + +#ifndef TRMMKERNEL + shufps $0xe4, %xmm0, %xmm0 + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + + shufps $0xe4, %xmm2, %xmm2 + movsd 0 * SIZE(%esi, LDC), %xmm2 + movhps 2 * SIZE(%esi, LDC), %xmm2 + + addps %xmm0, %xmm4 + addps %xmm2, %xmm6 +#endif + + movlps %xmm4, 0 * SIZE(%esi) + movhps %xmm4, 2 * SIZE(%esi) + movlps %xmm6, 0 * SIZE(%esi, LDC) + movhps %xmm6, 2 * SIZE(%esi, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 8), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $4 * SIZE, %esi # coffset += 4 + decl %ebx # i -- + jg .L10 + ALIGN_4 + +.L30: + movl M, %ebx + andl $1, %ebx + jle .L99 + ALIGN_4 + +.L40: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, 
BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movsd 0 * SIZE(AA), %xmm0 + movsd 8 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L42 + ALIGN_4 + +.L41: + mulps %xmm0, %xmm2 + prefetcht1 (PREFETCHSIZE + 0) * SIZE(AA) + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movsd 2 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + mulps 28 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 48 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movsd 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + mulps 44 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movsd 6 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + mulps 60 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 80 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movsd 16 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm2 + prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) + addps %xmm2, %xmm4 + movaps 68 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm5 + movaps 72 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + mulps 76 * SIZE(BB), %xmm1 + addps %xmm2, %xmm6 + movaps 96 * SIZE(BB), %xmm2 + addps %xmm1, %xmm7 + movsd 10 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 84 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movaps 88 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + mulps 92 * SIZE(BB), %xmm1 + addps %xmm3, %xmm6 + movaps 112 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movsd 12 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 100 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm5 + movaps 104 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + mulps 108 * SIZE(BB), %xmm1 + addps %xmm2, %xmm6 + movaps 128 * SIZE(BB), %xmm2 + addps %xmm1, %xmm7 + movsd 14 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 116 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movaps 120 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + mulps 124 * SIZE(BB), %xmm1 + addps %xmm3, %xmm6 + movaps 144 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movsd 24 * SIZE(AA), %xmm1 + addl $ 16 * SIZE, AA + addl $128 * SIZE, BB + decl %eax + jne .L41 + ALIGN_4 + +.L42: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA_R, %xmm1 + movaps ALPHA_I, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L44 + ALIGN_4 + +.L43: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + 
mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movsd 2 * SIZE(AA), %xmm0 + + addl $ 2 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L43 + ALIGN_4 + +.L44: + shufps $0xb1, %xmm5, %xmm5 + shufps $0xb1, %xmm7, %xmm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 +#else + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 +#endif + + movaps %xmm4, %xmm5 + movaps %xmm6, %xmm7 + + shufps $0xb1, %xmm4, %xmm4 + shufps $0xb1, %xmm6, %xmm6 + + mulps %xmm1, %xmm5 + mulps %xmm3, %xmm4 + mulps %xmm1, %xmm7 + mulps %xmm3, %xmm6 + + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + +#ifndef TRMMKERNEL + shufps $0xe4, %xmm4, %xmm4 + shufps $0xe4, %xmm6, %xmm6 + + movsd 0 * SIZE(%esi), %xmm0 + movsd 0 * SIZE(%esi, LDC), %xmm2 + + addps %xmm0, %xmm4 + addps %xmm2, %xmm6 +#endif + + movlps %xmm4, 0 * SIZE(%esi) + movlps %xmm6, 0 * SIZE(%esi, LDC) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_4 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leal (LDC, LDC), %eax + addl %eax, C # c += 2 * ldc + decl J # j -- + jg .L01 + ALIGN_4 + +.L100: + movl N, %eax + andl $1, %eax + jle .L999 + ALIGN_4 + +.L101: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + +/* Copying to Sub Buffer */ + leal BUFFER, %ecx + movaps POSINV, %xmm7 + + movl K, %eax + sarl $2, %eax + jle .L103 + ALIGN_4 + +.L102: + prefetch (RPREFETCHSIZE + 0) * SIZE(%edi) + + movaps 0 * SIZE(%edi), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + xorps %xmm7, %xmm1 + xorps %xmm7, %xmm3 +#else + xorps %xmm7, %xmm0 + xorps %xmm7, %xmm2 +#endif + + prefetchw (WPREFETCHSIZE + 0) * SIZE(%ecx) + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + movaps %xmm2, 8 * SIZE(%ecx) + movaps %xmm3, 12 * SIZE(%ecx) + + movaps 4 * SIZE(%edi), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + xorps %xmm7, %xmm1 + xorps %xmm7, %xmm3 +#else + xorps %xmm7, %xmm0 + xorps %xmm7, %xmm2 +#endif + + prefetchw (WPREFETCHSIZE + 0) * SIZE(%ecx) + + movaps %xmm0, 16 * SIZE(%ecx) + movaps %xmm1, 20 * SIZE(%ecx) + movaps %xmm2, 24 * SIZE(%ecx) + movaps %xmm3, 28 * SIZE(%ecx) + + addl $ 8 * SIZE, B + subl $-32 * SIZE, BB + decl %eax + jne .L102 + ALIGN_4 + +.L103: + movl K, %eax + andl $3, %eax + BRANCH + jle .L105 + ALIGN_4 + +.L104: + movsd 0 * SIZE(%edi), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + xorps %xmm7, %xmm1 +#else + xorps %xmm7, %xmm0 +#endif + + movaps %xmm0, 0 * SIZE(%ecx) + movaps %xmm1, 4 * SIZE(%ecx) + + addl $ 2 * SIZE, %edi + addl $ 8 * SIZE, %ecx + decl %eax + jne .L104 + ALIGN_4 + 
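+/* N was odd: compute the one remaining column of C from the expanded */
+/* copy of B built in BUFFER above.  The loop below walks M two       */
+/* elements of C at a time (sarl $1, %ebx).                           */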
+.L105: + movl C, %esi + movl A, AA + movl M, %ebx + sarl $1, %ebx + jle .L130 + ALIGN_4 + +.L110: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movaps 0 * SIZE(AA), %xmm0 + movaps 16 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + + prefetchw 3 * SIZE(%esi) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L112 + ALIGN_4 + +.L111: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 8 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movaps 12 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movaps 32 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movaps 20 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movaps 44 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movaps 24 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 64 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 28 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 48 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 80 * SIZE(BB), %xmm3 + + addl $ 32 * SIZE, AA + addl $ 64 * SIZE, BB + decl %eax + jne .L111 + ALIGN_4 + +.L112: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA_R, %xmm1 + movaps ALPHA_I, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L114 + ALIGN_4 + +.L113: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + + addl $ 4 * SIZE, AA + addl $ 8 * SIZE, BB + decl %eax + jg .L113 + ALIGN_4 + +.L114: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + shufps $0xb1, %xmm5, %xmm5 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subps %xmm5, %xmm4 +#else + addps %xmm5, %xmm4 +#endif + + movaps %xmm4, %xmm5 + + shufps $0xb1, %xmm4, %xmm4 + + mulps %xmm1, %xmm5 + mulps %xmm3, %xmm4 + + addps %xmm5, %xmm4 + +#ifndef TRMMKERNEL + shufps $0xe4, %xmm4, %xmm4 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + + addps %xmm0, %xmm4 +#endif + + movlps %xmm4, 0 * SIZE(%esi) + movhps %xmm4, 2 * SIZE(%esi) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || 
\ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $4 * SIZE, %esi # coffset += 4 + decl %ebx # i -- + jg .L110 + ALIGN_4 + +.L130: + movl M, %ebx + andl $1, %ebx + jle .L999 + ALIGN_4 + +.L140: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + + movsd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movsd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movaps 16 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L142 + ALIGN_4 + +.L141: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movaps 44 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 64 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 80 * SIZE(BB), %xmm3 + + addl $ 16 * SIZE, AA + addl $ 64 * SIZE, BB + decl %eax + jne .L141 + ALIGN_4 + +.L142: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA_R, %xmm1 + movaps ALPHA_I, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L144 + ALIGN_4 + +.L143: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movsd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L143 + ALIGN_4 + +.L144: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + shufps $0xb1, %xmm5, %xmm5 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subps %xmm5, %xmm4 +#else + addps %xmm5, %xmm4 +#endif + + movaps %xmm4, %xmm5 + + shufps $0xb1, %xmm4, %xmm4 + + mulps %xmm1, %xmm5 + mulps %xmm3, %xmm4 + + addps %xmm5, %xmm4 + +#ifndef TRMMKERNEL + 
	shufps	$0xe4, %xmm4, %xmm4
+
+	movsd	0 * SIZE(%esi), %xmm0
+
+	addps	%xmm0, %xmm4
+#endif
+
+	movlps	%xmm4, 0 * SIZE(%esi)
+	ALIGN_4
+
+.L999:
+	EMMS
+
+	movl	OLD_STACK, %esp
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	popl	%ebp
+	ret
+
+	EPILOGUE
diff --git a/kernel/x86/zgemm_kernel_2x2_penryn.S b/kernel/x86/zgemm_kernel_2x2_penryn.S
new file mode 100644
index 0000000000..edd89b112a
--- /dev/null
+++ b/kernel/x86/zgemm_kernel_2x2_penryn.S
@@ -0,0 +1,1210 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin.
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 20 + STACK + ARGS(%esp) +#define A 24 + STACK + ARGS(%esp) +#define ARG_B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define ARG_LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define BX 4 + STACK(%esp) +#define KK 8 + STACK(%esp) +#define KKK 12 + STACK(%esp) + +#ifdef NANO +#define PREFETCHSIZE (16 * 3 + 8) +#define PREFETCHW prefetcht0 +#define PREFETCHB prefetcht0 +#endif + +#ifdef NEHALEM +#define PREFETCHSIZE (16 * 1 + 8) +#define PREFETCHW prefetcht0 +#define PREFETCHB prefetcht0 +#endif + +#ifndef PREFETCH +#define PREFETCH prefetcht0 +#endif + +#ifndef PREFETCHW +#define PREFETCHW prefetcht0 +#endif + +#ifndef PREFETCHB +#define PREFETCHB prefetcht0 +#endif + +#ifndef PREFETCHSIZE +#define PREFETCHSIZE (16 * 13 + 8) +#endif + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi +#define C1 %esi +#define I %ebx + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define ADD1 addps +#define ADD2 addps +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define ADD1 addps +#define ADD2 addps +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define ADD1 addps +#define ADD2 addps +#else +#define ADD1 addps +#define ADD2 subps +#endif + + PROLOGUE + + subl $ARGS, %esp # Generate Stack Frame + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + +#ifdef TRMMKERNEL + movl OFFSET, %eax +#ifndef LEFT + negl %eax +#endif + movl %eax, KK +#endif + + subl $-32 * SIZE, A + subl $-32 * SIZE, B + + sall $ZBASE_SHIFT, LDC + + movl N, %eax + sarl $1, %eax + movl %eax, J + jle .L30 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl B, BX + + movl C, C1 + movl A, AA + + movl M, %ebx + sarl $1, %ebx + jle .L20 + ALIGN_4 + +.L10: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB +#endif + + movl BX, %eax + PREFETCHB -32 * SIZE(%eax) + subl $-16 * SIZE, %eax + movl %eax, BX + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + + xorps %xmm4, %xmm4 + PREFETCHW 3 * SIZE(C1) + xorps %xmm5, %xmm5 + PREFETCHW 7 * SIZE(C1, LDC) + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, 
%xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -24 * SIZE(AA), %xmm0 + + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(AA), %xmm0 + + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -16 * SIZE(AA), %xmm0 + + ADD2 %xmm2, %xmm7 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -12 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -8 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -4 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + ADD2 %xmm2, %xmm7 + subl $-32 * SIZE, BB + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + subl $-32 * SIZE, AA + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -32 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -32 * SIZE(AA), %xmm0 + + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L18 + ALIGN_4 + +.L16: + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + + decl %eax + jg .L16 + ALIGN_4 + +.L18: + ADD2 %xmm2, %xmm7 + pcmpeqb %xmm0, %xmm0 + ADD1 %xmm3, %xmm6 + psllq $63, %xmm0 + + movsd ALPHA_R, %xmm3 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + pxor %xmm0, %xmm4 + pxor %xmm0, %xmm6 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + pshufd $0xb1, %xmm0, %xmm0 + + pxor %xmm0, %xmm5 + pxor %xmm0, %xmm7 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + pxor %xmm0, %xmm5 + pxor %xmm0, %xmm7 +#endif + + haddps %xmm5, %xmm4 + haddps %xmm7, %xmm6 + + shufps $0xd8, %xmm4, %xmm4 + shufps $0xd8, %xmm6, %xmm6 + + movaps %xmm4, %xmm5 + shufps $0xe4, %xmm6, %xmm4 + shufps $0xe4, %xmm5, %xmm6 + + pshufd $0x00, 
%xmm3, %xmm2 + pshufd $0x55, %xmm3, %xmm3 + + pshufd $0xb1, %xmm4, %xmm5 + pshufd $0xb1, %xmm6, %xmm7 + + mulps %xmm2, %xmm4 + mulps %xmm3, %xmm5 + + mulps %xmm2, %xmm6 + mulps %xmm3, %xmm7 + + addsubps %xmm5, %xmm4 + addsubps %xmm7, %xmm6 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(C1), %xmm2 + movhps 2 * SIZE(C1), %xmm2 + movsd 0 * SIZE(C1, LDC), %xmm3 + movhps 2 * SIZE(C1, LDC), %xmm3 + + addps %xmm2, %xmm4 + addps %xmm3, %xmm6 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhps %xmm4, 2 * SIZE(C1) + movsd %xmm6, 0 * SIZE(C1, LDC) + movhps %xmm6, 2 * SIZE(C1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $4 * SIZE, C1 + decl %ebx + jg .L10 + ALIGN_4 + +.L20: + movl M, %ebx + testl $1, %ebx + jle .L29 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -30 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -28 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -26 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -24 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -12 * SIZE(BB), %xmm1 + 
mulps %xmm0, %xmm3 + movsd -22 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -8 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -20 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -4 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -18 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps 0 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -16 * SIZE(AA), %xmm0 + + subl $-16 * SIZE, AA + subl $-32 * SIZE, BB + + decl %eax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L28 + ALIGN_4 + +.L26: + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -30 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + + decl %eax + jg .L26 + ALIGN_4 + +.L28: + addps %xmm2, %xmm6 + addps %xmm3, %xmm7 + + movsd ALPHA_R, %xmm3 + + pshufd $0xb1, %xmm5, %xmm5 + pcmpeqb %xmm0, %xmm0 + pshufd $0xb1, %xmm7, %xmm7 + psllq $63, %xmm0 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + pxor %xmm0, %xmm5 + pxor %xmm0, %xmm7 + + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + pxor %xmm0, %xmm5 + pxor %xmm0, %xmm7 + + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + pxor %xmm0, %xmm4 + pxor %xmm0, %xmm6 + + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 +#else + pxor %xmm0, %xmm4 + pxor %xmm0, %xmm6 + + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 +#endif + + pshufd $0x00, %xmm3, %xmm2 + pshufd $0x55, %xmm3, %xmm3 + + pshufd $0xb1, %xmm4, %xmm5 + pshufd $0xb1, %xmm6, %xmm7 + + mulps %xmm2, %xmm4 + mulps %xmm3, %xmm5 + + mulps %xmm2, %xmm6 + mulps %xmm3, %xmm7 + + pxor %xmm0, %xmm5 + pxor %xmm0, %xmm7 + + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(C1), %xmm2 + movsd 0 * SIZE(C1, LDC), %xmm3 + + addps %xmm2, %xmm4 + addps %xmm3, %xmm6 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movsd %xmm6, 0 * SIZE(C1, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + + addl $2 * SIZE, C1 + ALIGN_2 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + movl BB, B + + leal (, LDC, 2), %eax + addl %eax, C + + decl J + jg .L01 + ALIGN_4 + +.L30: + movl N, %eax + testl $1, %eax + jle .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl C, C1 + movl A, AA + + movl M, %ebx + sarl $1, %ebx + jle .L40 + ALIGN_4 + +.L31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 2), BB +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + + pxor %xmm4, %xmm4 + prefetcht0 3 * SIZE(C1) + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L35 + ALIGN_4 + +.L32: + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -28 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -24 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -20 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -16 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -12 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -8 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -4 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps 0 * SIZE(AA), %xmm0 + + subl $-32 * SIZE, AA + subl $-16 * SIZE, BB + + decl %eax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movsd 
-32 * SIZE(BB), %xmm1 + andl $7, %eax + BRANCH + je .L38 + ALIGN_4 + +.L36: + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -28 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $2 * SIZE, BB + + decl %eax + jg .L36 + ALIGN_4 + +.L38: + addps %xmm2, %xmm4 + addps %xmm3, %xmm5 + + movsd ALPHA_R, %xmm3 + + pshufd $0xb1, %xmm5, %xmm5 + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + pxor %xmm0, %xmm5 + subps %xmm5, %xmm4 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + pxor %xmm0, %xmm5 + addps %xmm5, %xmm4 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + pxor %xmm0, %xmm4 + addps %xmm5, %xmm4 +#else + pxor %xmm0, %xmm4 + subps %xmm5, %xmm4 +#endif + + pshufd $0x00, %xmm3, %xmm2 + pshufd $0x55, %xmm3, %xmm3 + + pshufd $0xb1, %xmm4, %xmm5 + + mulps %xmm2, %xmm4 + mulps %xmm3, %xmm5 + + pxor %xmm0, %xmm5 + subps %xmm5, %xmm4 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(C1), %xmm2 + movhps 2 * SIZE(C1), %xmm2 + + addps %xmm2, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhps %xmm4, 2 * SIZE(C1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, SIZE), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $4 * SIZE, C1 + decl %ebx + jg .L31 + ALIGN_4 + +.L40: + movl M, %ebx + testl $1, %ebx + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movl B, BB +#else + movl B, BB + movl KK, %eax + leal (, %eax, SIZE), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movsd -32 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L45 + ALIGN_4 + +.L42: + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -30 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -28 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -26 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -26 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -24 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -22 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -22 * SIZE(AA), %xmm0 + + addps %xmm2, 
%xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -20 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -18 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -18 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -16 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -16 * SIZE(AA), %xmm0 + + subl $-16 * SIZE, AA + subl $-16 * SIZE, BB + + decl %eax + jne .L42 + ALIGN_4 + +.L45: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax + BRANCH + je .L48 + ALIGN_4 + +.L46: + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -30 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + + decl %eax + jg .L46 + ALIGN_4 + +.L48: + addps %xmm2, %xmm4 + addps %xmm3, %xmm5 + + movsd ALPHA_R, %xmm3 + + pshufd $0xb1, %xmm5, %xmm5 + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + pxor %xmm0, %xmm5 + subps %xmm5, %xmm4 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + pxor %xmm0, %xmm5 + addps %xmm5, %xmm4 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + pxor %xmm0, %xmm4 + addps %xmm5, %xmm4 +#else + pxor %xmm0, %xmm4 + subps %xmm5, %xmm4 +#endif + + pshufd $0x00, %xmm3, %xmm2 + pshufd $0x55, %xmm3, %xmm3 + + pshufd $0xb1, %xmm4, %xmm5 + + mulps %xmm2, %xmm4 + mulps %xmm3, %xmm5 + + pxor %xmm0, %xmm5 + subps %xmm5, %xmm4 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(C1), %xmm2 + addps %xmm2, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(C1) + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm_kernel_2x2_sse.S b/kernel/x86/zgemm_kernel_2x2_sse.S new file mode 100644 index 0000000000..fad42ccb97 --- /dev/null +++ b/kernel/x86/zgemm_kernel_2x2_sse.S @@ -0,0 +1,1562 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define STACK 16
+#define ARGS 0
+
+#define STACK_M 4 + STACK + ARGS(%esi)
+#define STACK_N 8 + STACK + ARGS(%esi)
+#define STACK_K 12 + STACK + ARGS(%esi)
+#define STACK_ALPHA_R 16 + STACK + ARGS(%esi)
+#define STACK_ALPHA_I 20 + STACK + ARGS(%esi)
+#define STACK_A 24 + STACK + ARGS(%esi)
+#define STACK_B 28 + STACK + ARGS(%esi)
+#define STACK_C 32 + STACK + ARGS(%esi)
+#define STACK_LDC 36 + STACK + ARGS(%esi)
+#define STACK_OFFT 40 + STACK + ARGS(%esi)
+
+#define POSINV 0(%esp)
+#define ALPHA_R 16(%esp)
+#define ALPHA_I 32(%esp)
+#define K 48(%esp)
+#define N 52(%esp)
+#define M 56(%esp)
+#define A 60(%esp)
+#define C 64(%esp)
+#define J 68(%esp)
+#define OLD_STACK 72(%esp)
+#define OFFSET 76(%esp)
+#define KK 80(%esp)
+#define KKK 84(%esp)
+#define BUFFER 128(%esp)
+
+#define B %edi
+#define LDC %ebp
+#define AA %edx
+#define BB %ecx
+
+#define STACK_ALIGN 4096
+#define STACK_OFFSET 1024
+
+#ifdef ATHLON
+#define PREFETCHSIZE 64
+#define WPREFETCHSIZE 80
+#define PREFETCH prefetch
+#define PREFETCHW prefetchw
+#endif
+
+#if defined(OPTERON) || defined(BARCELONA)
+#define PREFETCHSIZE (16 * 10 + 8)
+#define WPREFETCHSIZE 112
+#define PREFETCH prefetch
+#define PREFETCHW prefetchw
+#endif
+
+#ifdef PENTIUM4
+#define PREFETCH prefetcht0
+#define PREFETCHW prefetcht0
+#define PREFETCHSIZE 168
+#endif
+
+#if defined(OPTERON) || !defined(HAVE_SSE2)
+#define movsd movlps
+#endif
+
+#ifdef HAVE_SSE2
+#define xorps pxor
+#endif
+
+#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA)
+#define KERNEL1(address) \
+	mulps	%xmm0, %xmm2; \
+	addps	%xmm2, %xmm4; \
+	PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \
+	movaps	 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
+	mulps	%xmm0, %xmm2; \
+	addps	%xmm2, %xmm5; \
+	movaps	 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
+	mulps	%xmm0, %xmm2; \
+	mulps	12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
+	addps	%xmm2, %xmm6; \
+	movaps	32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
+	addps	%xmm0, %xmm7; \
+	movaps	 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0
+
+#define KERNEL2(address) \
+	mulps	%xmm0, %xmm3; \
+	addps	%xmm3, %xmm4; \
+	movaps	20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
+	mulps	%xmm0, %xmm3; \
+	addps	%xmm3, %xmm5; \
+	movaps	24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
+	mulps	%xmm0, %xmm3; \
+	mulps	28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
+	addps	%xmm3, %xmm6; \
+	movaps	48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
+	addps	%xmm0, %xmm7; \
+	movaps	 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0
+
+#define KERNEL3(address) \
+	mulps	%xmm0, %xmm2; \
+
addps %xmm2, %xmm4; \ + movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL4(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL5(address) \ + PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 1 * SIZE(AA); \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL6(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL7(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1; +#endif + +#ifdef PENTIUM4 +#define KERNEL1(address) \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + addps %xmm2, %xmm5; \ + movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 20 * SIZE + (address) * 4 * SIZE(BB), 
%xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL4(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL5(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL6(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL7(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1 +#endif + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp # align stack + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movl STACK_M, %ebx + movl STACK_N, %eax + movl STACK_K, %ecx + movl STACK_A, %edx 
+ + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK + + movl STACK_B, %edi + movl STACK_C, %ebx +#ifdef TRMMKERNEL + movss STACK_OFFT, %xmm4 +#endif + + movss STACK_ALPHA_R, %xmm0 + movss STACK_ALPHA_I, %xmm1 + + xorps %xmm7, %xmm7 + cmpeqps %xmm7, %xmm7 + pslld $31, %xmm7 # Generate mask + xorps %xmm2, %xmm2 + + shufps $0, %xmm0, %xmm0 + + movaps %xmm0, 0 + ALPHA_R + movss %xmm1, 4 + ALPHA_I + movss %xmm1, 12 + ALPHA_I + xorps %xmm7, %xmm1 + movss %xmm1, 0 + ALPHA_I + movss %xmm1, 8 + ALPHA_I + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + movss %xmm7, 0 + POSINV + movss %xmm2, 4 + POSINV + movss %xmm7, 8 + POSINV + movss %xmm2, 12 + POSINV +#else + movss %xmm2, 0 + POSINV + movss %xmm7, 4 + POSINV + movss %xmm2, 8 + POSINV + movss %xmm7, 12 + POSINV +#endif + + EMMS + + movl %ebx, C + movl STACK_LDC, LDC + +#ifdef TRMMKERNEL + movss %xmm4, OFFSET + movss %xmm4, KK +#ifndef LEFT + negl KK +#endif +#endif + + sall $ZBASE_SHIFT, LDC + movl %eax, J # j = n + sarl $1, J + jle .L100 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + +/* Copying to Sub Buffer */ + leal BUFFER, %ecx + movaps POSINV, %xmm7 + + movl K, %eax + sarl $1, %eax + jle .L03 + ALIGN_4 + +.L02: + movss 0 * SIZE(B), %xmm0 + movss 1 * SIZE(B), %xmm1 + movss 2 * SIZE(B), %xmm2 + movss 3 * SIZE(B), %xmm3 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + xorps %xmm7, %xmm1 + xorps %xmm7, %xmm3 +#else + xorps %xmm7, %xmm0 + xorps %xmm7, %xmm2 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + + movss 4 * SIZE(B), %xmm0 + movss 5 * SIZE(B), %xmm1 + movss 6 * SIZE(B), %xmm2 + movss 7 * SIZE(B), %xmm3 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + xorps %xmm7, %xmm1 + xorps %xmm7, %xmm3 +#else + xorps %xmm7, %xmm0 + xorps %xmm7, %xmm2 +#endif + + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm1, 20 * SIZE(BB) + movaps %xmm2, 24 * SIZE(BB) + movaps %xmm3, 28 * SIZE(BB) + +#ifdef PENTIUM4 + prefetcht1 104 * SIZE(BB) +#endif + + addl $ 8 * SIZE, %edi + addl $32 * SIZE, %ecx + + decl %eax + jne .L02 + ALIGN_4 + +.L03: + movl K, %eax + andl $1, %eax + BRANCH + jle .L05 + ALIGN_4 + +.L04: + movss 0 * SIZE(B), %xmm0 + movss 1 * SIZE(B), %xmm1 + movss 2 * SIZE(B), %xmm2 + movss 3 * SIZE(B), %xmm3 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + xorps %xmm7, %xmm1 + xorps %xmm7, %xmm3 +#else + xorps %xmm7, %xmm0 + xorps %xmm7, %xmm2 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + addl $ 4 * SIZE, %edi + ALIGN_4 + +.L05: + movl C, %esi + movl A, %edx + movl M, %ebx + sarl $1, %ebx + jle .L30 + ALIGN_4 + +.L10: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + 
leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 8), BB +#endif + + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 + movaps 16 * SIZE(AA), %xmm1 + xorps %xmm5, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm7, %xmm7 + +#if defined(OPTERON) || defined(BARCELONA) + prefetchw 4 * SIZE(%esi) + prefetchw 4 * SIZE(%esi, LDC) +#endif + +#ifdef PENTIUM4 + prefetchnta 4 * SIZE(%esi) + prefetchnta 4 * SIZE(%esi, LDC) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + + +#if 1 + andl $-8, %eax + sall $4, %eax + je .L15 +.L1X: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + cmpl $128 * 1, %eax + jle .L12 + KERNEL1(32 * 1) + KERNEL2(32 * 1) + KERNEL3(32 * 1) + KERNEL4(32 * 1) + KERNEL5(32 * 1) + KERNEL6(32 * 1) + KERNEL7(32 * 1) + KERNEL8(32 * 1) + cmpl $128 * 2, %eax + jle .L12 + KERNEL1(32 * 2) + KERNEL2(32 * 2) + KERNEL3(32 * 2) + KERNEL4(32 * 2) + KERNEL5(32 * 2) + KERNEL6(32 * 2) + KERNEL7(32 * 2) + KERNEL8(32 * 2) + cmpl $128 * 3, %eax + jle .L12 + KERNEL1(32 * 3) + KERNEL2(32 * 3) + KERNEL3(32 * 3) + KERNEL4(32 * 3) + KERNEL5(32 * 3) + KERNEL6(32 * 3) + KERNEL7(32 * 3) + KERNEL8(32 * 3) + cmpl $128 * 4, %eax + jle .L12 + KERNEL1(32 * 4) + KERNEL2(32 * 4) + KERNEL3(32 * 4) + KERNEL4(32 * 4) + KERNEL5(32 * 4) + KERNEL6(32 * 4) + KERNEL7(32 * 4) + KERNEL8(32 * 4) + cmpl $128 * 5, %eax + jle .L12 + KERNEL1(32 * 5) + KERNEL2(32 * 5) + KERNEL3(32 * 5) + KERNEL4(32 * 5) + KERNEL5(32 * 5) + KERNEL6(32 * 5) + KERNEL7(32 * 5) + KERNEL8(32 * 5) + cmpl $128 * 6, %eax + jle .L12 + KERNEL1(32 * 6) + KERNEL2(32 * 6) + KERNEL3(32 * 6) + KERNEL4(32 * 6) + KERNEL5(32 * 6) + KERNEL6(32 * 6) + KERNEL7(32 * 6) + KERNEL8(32 * 6) + cmpl $128 * 7, %eax + jle .L12 + KERNEL1(32 * 7) + KERNEL2(32 * 7) + KERNEL3(32 * 7) + KERNEL4(32 * 7) + KERNEL5(32 * 7) + KERNEL6(32 * 7) + KERNEL7(32 * 7) + KERNEL8(32 * 7) + + addl $128 * 8 * SIZE, BB + addl $128 * 2 * SIZE, AA + subl $128 * 8, %eax + jg .L1X + jmp .L15 + +.L12: + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB + ALIGN_4 +#else + sarl $3, %eax + je .L15 + ALIGN_4 + +.L11: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + + addl $ 32 * SIZE, AA + addl $128 * SIZE, BB + decl %eax + jne .L11 + ALIGN_4 +#endif + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA_R, %xmm1 + movaps ALPHA_I, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + ALIGN_4 + +.L13: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 4 * SIZE(AA), %xmm0 + + addl $ 4 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L13 + ALIGN_4 + +.L14: + shufps $0xb1, %xmm5, %xmm5 + shufps $0xb1, %xmm7, %xmm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subps %xmm5, %xmm4 + subps %xmm7, 
%xmm6 +#else + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 +#endif + + movaps %xmm4, %xmm5 + movaps %xmm6, %xmm7 + + shufps $0xb1, %xmm4, %xmm4 + shufps $0xb1, %xmm6, %xmm6 + + mulps %xmm1, %xmm5 + mulps %xmm3, %xmm4 + mulps %xmm1, %xmm7 + mulps %xmm3, %xmm6 + + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + +#ifndef TRMMKERNEL + shufps $0xe4, %xmm0, %xmm0 + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + + shufps $0xe4, %xmm2, %xmm2 + movsd 0 * SIZE(%esi, LDC), %xmm2 + movhps 2 * SIZE(%esi, LDC), %xmm2 + + addps %xmm0, %xmm4 + addps %xmm2, %xmm6 +#endif + + movlps %xmm4, 0 * SIZE(%esi) + movhps %xmm4, 2 * SIZE(%esi) + movlps %xmm6, 0 * SIZE(%esi, LDC) + movhps %xmm6, 2 * SIZE(%esi, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 8), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $4 * SIZE, %esi # coffset += 4 + decl %ebx # i -- + jg .L10 + ALIGN_4 + +.L30: + movl M, %ebx + andl $1, %ebx + jle .L99 + ALIGN_4 + +.L40: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB +#endif + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 8 * SIZE(AA), %xmm1 + xorps %xmm5, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L42 + ALIGN_4 + +.L41: + mulps %xmm0, %xmm2 + prefetcht1 (PREFETCHSIZE + 0) * SIZE(AA) + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movsd 2 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + mulps 28 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 48 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movsd 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + mulps 44 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movsd 6 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + mulps 60 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 80 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movsd 16 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) +#endif + addps %xmm2, %xmm4 + movaps 68 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm5 + movaps 72 * 
SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + mulps 76 * SIZE(BB), %xmm1 + addps %xmm2, %xmm6 + movaps 96 * SIZE(BB), %xmm2 + addps %xmm1, %xmm7 + movsd 10 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 84 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movaps 88 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + mulps 92 * SIZE(BB), %xmm1 + addps %xmm3, %xmm6 + movaps 112 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movsd 12 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 100 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm5 + movaps 104 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + mulps 108 * SIZE(BB), %xmm1 + addps %xmm2, %xmm6 + movaps 128 * SIZE(BB), %xmm2 + addps %xmm1, %xmm7 + movsd 14 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 116 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movaps 120 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + mulps 124 * SIZE(BB), %xmm1 + addps %xmm3, %xmm6 + movaps 144 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movsd 24 * SIZE(AA), %xmm1 + addl $ 16 * SIZE, AA + addl $128 * SIZE, BB + decl %eax + jne .L41 + ALIGN_4 + +.L42: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA_R, %xmm1 + movaps ALPHA_I, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L44 + ALIGN_4 + +.L43: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movsd 2 * SIZE(AA), %xmm0 + + addl $ 2 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L43 + ALIGN_4 + +.L44: + shufps $0xb1, %xmm5, %xmm5 + shufps $0xb1, %xmm7, %xmm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 +#else + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 +#endif + + movaps %xmm4, %xmm5 + movaps %xmm6, %xmm7 + + shufps $0xb1, %xmm4, %xmm4 + shufps $0xb1, %xmm6, %xmm6 + + mulps %xmm1, %xmm5 + mulps %xmm3, %xmm4 + mulps %xmm1, %xmm7 + mulps %xmm3, %xmm6 + + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + +#ifndef TRMMKERNEL + shufps $0xe4, %xmm4, %xmm4 + shufps $0xe4, %xmm6, %xmm6 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(%esi), %xmm0 +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd 0 * SIZE(%esi, LDC), %xmm2 + + addps %xmm0, %xmm4 + addps %xmm2, %xmm6 +#endif + + movlps %xmm4, 0 * SIZE(%esi) + movlps %xmm6, 0 * SIZE(%esi, LDC) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 8), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_4 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leal (LDC, LDC), %eax + addl %eax, C # c += 2 * ldc + decl J # j -- + jg .L01 + ALIGN_4 + +.L100: + movl N, %eax + andl $1, %eax + jle .L999 + ALIGN_4 + +.L101: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + +/* Copying to Sub Buffer */ + leal BUFFER, %ecx + movaps POSINV, %xmm7 + + movl K, %eax + sarl $2, %eax + jle .L103 + ALIGN_4 + +.L102: + movss 0 * SIZE(B), %xmm0 + movss 1 * SIZE(B), %xmm1 + movss 2 * SIZE(B), %xmm2 + movss 3 * SIZE(B), %xmm3 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, 
%xmm2 + shufps $0, %xmm3, %xmm3 + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + xorps %xmm7, %xmm1 + xorps %xmm7, %xmm3 +#else + xorps %xmm7, %xmm0 + xorps %xmm7, %xmm2 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + + movss 4 * SIZE(B), %xmm0 + movss 5 * SIZE(B), %xmm1 + movss 6 * SIZE(B), %xmm2 + movss 7 * SIZE(B), %xmm3 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + xorps %xmm7, %xmm1 + xorps %xmm7, %xmm3 +#else + xorps %xmm7, %xmm0 + xorps %xmm7, %xmm2 +#endif + + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm1, 20 * SIZE(BB) + movaps %xmm2, 24 * SIZE(BB) + movaps %xmm3, 28 * SIZE(BB) + + prefetcht0 104 * SIZE(B) + + addl $ 8 * SIZE, B + addl $32 * SIZE, BB + decl %eax + jne .L102 + ALIGN_4 + +.L103: + movl K, %eax + andl $3, %eax + BRANCH + jle .L105 + ALIGN_4 + +.L104: + movss 0 * SIZE(B), %xmm0 + movss 1 * SIZE(B), %xmm1 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + xorps %xmm7, %xmm1 +#else + xorps %xmm7, %xmm0 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + + addl $ 2 * SIZE, %edi + addl $ 8 * SIZE, %ecx + decl %eax + jne .L104 + ALIGN_4 + +.L105: + movl C, %esi + movl A, AA + movl M, %ebx + sarl $1, %ebx + jle .L130 + ALIGN_4 + +.L110: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movaps 0 * SIZE(AA), %xmm0 + movaps 16 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + +#if defined(OPTERON) || defined(BARCELONA) + prefetchw 4 * SIZE(%esi) +#endif + +#ifdef PENTIUM4 + prefetchnta 4 * SIZE(%esi) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L112 + ALIGN_4 + +.L111: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 8 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movaps 12 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movaps 32 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movaps 20 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + mulps 
%xmm1, %xmm2 + addps %xmm2, %xmm6 + movaps 44 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movaps 24 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 64 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 28 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 48 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 80 * SIZE(BB), %xmm3 + + addl $ 32 * SIZE, AA + addl $ 64 * SIZE, BB + decl %eax + jne .L111 + ALIGN_4 + +.L112: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA_R, %xmm1 + movaps ALPHA_I, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L114 + ALIGN_4 + +.L113: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + + addl $ 4 * SIZE, AA + addl $ 8 * SIZE, BB + decl %eax + jg .L113 + ALIGN_4 + +.L114: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + shufps $0xb1, %xmm5, %xmm5 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subps %xmm5, %xmm4 +#else + addps %xmm5, %xmm4 +#endif + + movaps %xmm4, %xmm5 + + shufps $0xb1, %xmm4, %xmm4 + + mulps %xmm1, %xmm5 + mulps %xmm3, %xmm4 + + addps %xmm5, %xmm4 + +#ifndef TRMMKERNEL + shufps $0xe4, %xmm4, %xmm4 + + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + + addps %xmm0, %xmm4 +#endif + + movlps %xmm4, 0 * SIZE(%esi) + movhps %xmm4, 2 * SIZE(%esi) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $4 * SIZE, %esi # coffset += 4 + decl %ebx # i -- + jg .L110 + ALIGN_4 + +.L130: + movl M, %ebx + andl $1, %ebx + jle .L999 + ALIGN_4 + +.L140: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm5, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L142 + ALIGN_4 + +.L141: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm0, 
%xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movaps 44 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 64 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 80 * SIZE(BB), %xmm3 + + addl $ 16 * SIZE, AA + addl $ 64 * SIZE, BB + decl %eax + jne .L141 + ALIGN_4 + +.L142: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA_R, %xmm1 + movaps ALPHA_I, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L144 + ALIGN_4 + +.L143: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movsd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L143 + ALIGN_4 + +.L144: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + shufps $0xb1, %xmm5, %xmm5 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subps %xmm5, %xmm4 +#else + addps %xmm5, %xmm4 +#endif + + movaps %xmm4, %xmm5 + + shufps $0xb1, %xmm4, %xmm4 + + mulps %xmm1, %xmm5 + mulps %xmm3, %xmm4 + + addps %xmm5, %xmm4 + +#ifndef TRMMKERNEL + shufps $0xe4, %xmm4, %xmm4 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(%esi), %xmm0 + + addps %xmm0, %xmm4 +#endif + + movlps %xmm4, 0 * SIZE(%esi) + ALIGN_4 + +.L999: + EMMS + + movl OLD_STACK, %esp + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm_kernel_2x2_sse3.S b/kernel/x86/zgemm_kernel_2x2_sse3.S new file mode 100644 index 0000000000..23afa8f21e --- /dev/null +++ b/kernel/x86/zgemm_kernel_2x2_sse3.S @@ -0,0 +1,1365 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_ALPHA_R 16 + STACK + ARGS(%esi) +#define STACK_ALPHA_I 20 + STACK + ARGS(%esi) +#define STACK_A 24 + STACK + ARGS(%esi) +#define STACK_B 28 + STACK + ARGS(%esi) +#define STACK_C 32 + STACK + ARGS(%esi) +#define STACK_LDC 36 + STACK + ARGS(%esi) +#define STACK_OFFT 40 + STACK + ARGS(%esi) + +#define POSINV 0(%esp) +#define ALPHA_R 16(%esp) +#define ALPHA_I 32(%esp) +#define K 48(%esp) +#define N 52(%esp) +#define M 56(%esp) +#define A 60(%esp) +#define C 64(%esp) +#define J 68(%esp) +#define OLD_STACK 72(%esp) +#define OFFSET 76(%esp) +#define KK 80(%esp) +#define KKK 84(%esp) +#define BUFFER 128(%esp) + +#define B %edi +#define LDC %ebp +#define AA %edx +#define BB %ecx + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCHSIZE 168 +#endif + +#ifdef PENTIUMM +#define PREFETCH prefetcht0 +#define PREFETCHSIZE 168 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define ADDSUB addps +#else +#define ADDSUB subps +#endif + +#define KERNEL1(address) \ + mulps %xmm0, %xmm2; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (address) * SIZE(AA); \ + addps %xmm2, %xmm4; \ + movshdup 0 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + ADDSUB %xmm2, %xmm5; \ + movsldup 4 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm6; \ + movshdup 4 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + movaps 4 * SIZE + 1 * (address) * SIZE(AA), %xmm0; \ + ADDSUB %xmm2, %xmm7; \ + movsldup 8 * SIZE + 2 * (address) * SIZE(BB), %xmm2 + +#define KERNEL2(address) \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm4; \ + movshdup 8 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + ADDSUB %xmm2, %xmm5; \ + movsldup 12 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm6; \ + movshdup 12 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + movaps 8 * SIZE + 1 * (address) * SIZE(AA), %xmm0; \ + ADDSUB %xmm2, %xmm7; \ + movsldup 32 * SIZE + 2 * (address) * SIZE(BB), %xmm2 + +#define KERNEL3(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movshdup 16 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + ADDSUB %xmm3, %xmm5; \ + movsldup 20 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm6; \ + movshdup 
20 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + movaps 12 * SIZE + 1 * (address) * SIZE(AA), %xmm0; \ + ADDSUB %xmm3, %xmm7; \ + movsldup 24 * SIZE + 2 * (address) * SIZE(BB), %xmm3 + +#define KERNEL4(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movshdup 24 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + ADDSUB %xmm3, %xmm5; \ + movsldup 28 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm6; \ + movshdup 28 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + movaps 32 * SIZE + 1 * (address) * SIZE(AA), %xmm0; \ + ADDSUB %xmm3, %xmm7; \ + movsldup 48 * SIZE + 2 * (address) * SIZE(BB), %xmm3 + +#define KERNEL5(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movshdup 32 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + ADDSUB %xmm2, %xmm5; \ + movsldup 36 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm6; \ + movshdup 36 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + movaps 20 * SIZE + 1 * (address) * SIZE(AA), %xmm1; \ + ADDSUB %xmm2, %xmm7; \ + movsldup 40 * SIZE + 2 * (address) * SIZE(BB), %xmm2 + +#define KERNEL6(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movshdup 40 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + ADDSUB %xmm2, %xmm5; \ + movsldup 44 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm6; \ + movshdup 44 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + movaps 24 * SIZE + 1 * (address) * SIZE(AA), %xmm1; \ + ADDSUB %xmm2, %xmm7; \ + movsldup 64 * SIZE + 2 * (address) * SIZE(BB), %xmm2 + +#define KERNEL7(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movshdup 48 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + ADDSUB %xmm3, %xmm5; \ + movsldup 52 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm6; \ + movshdup 52 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + movaps 28 * SIZE + 1 * (address) * SIZE(AA), %xmm1; \ + ADDSUB %xmm3, %xmm7; \ + movsldup 56 * SIZE + 2 * (address) * SIZE(BB), %xmm3 + +#define KERNEL8(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movshdup 56 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + ADDSUB %xmm3, %xmm5; \ + movsldup 60 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm6; \ + movshdup 60 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + movaps 48 * SIZE + 1 * (address) * SIZE(AA), %xmm1; \ + ADDSUB %xmm3, %xmm7; \ + movsldup 80 * SIZE + 2 * (address) * SIZE(BB), %xmm3 + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE, %esp + andl $-1024, %esp # align stack + + STACK_TOUCHING + + movl STACK_M, %ebx + movl STACK_N, %eax + movl STACK_K, %ecx + movl STACK_A, %edx + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK + + movl STACK_B, %edi + movl STACK_C, %ebx +#ifdef TRMMKERNEL + movss STACK_OFFT, %xmm4 +#endif + + movss STACK_ALPHA_R, %xmm0 + movss STACK_ALPHA_I, %xmm1 + + pxor %xmm7, %xmm7 + cmpeqps %xmm7, %xmm7 + pslld $31, %xmm7 # Generate mask + + shufps $0, %xmm0, %xmm0 + movaps %xmm0, 0 + ALPHA_R + + movss %xmm1, 4 + ALPHA_I + movss %xmm1, 12 + ALPHA_I + xorps %xmm7, %xmm1 + movss %xmm1, 0 + ALPHA_I + movss %xmm1, 8 + 
ALPHA_I + + movl %ebx, C + movl STACK_LDC, LDC + +#ifdef TRMMKERNEL + movss %xmm4, OFFSET + movss %xmm4, KK +#ifndef LEFT + negl KK +#endif +#endif + + sall $ZBASE_SHIFT, LDC + movl %eax, J # j = n + sarl $1, J + jle .L100 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + +/* Copying to Sub Buffer */ + leal BUFFER, %ecx + + movl K, %eax + sarl $2, %eax + jle .L03 + ALIGN_4 + +.L02: + movddup 0 * SIZE(B), %xmm0 + movddup 2 * SIZE(B), %xmm1 + movddup 4 * SIZE(B), %xmm2 + movddup 6 * SIZE(B), %xmm3 + movddup 8 * SIZE(B), %xmm4 + movddup 10 * SIZE(B), %xmm5 + movddup 12 * SIZE(B), %xmm6 + movddup 14 * SIZE(B), %xmm7 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + movaps %xmm4, 16 * SIZE(BB) + movaps %xmm5, 20 * SIZE(BB) + movaps %xmm6, 24 * SIZE(BB) + movaps %xmm7, 28 * SIZE(BB) + +# prefetcht1 128 * SIZE(%ecx) + prefetcht0 112 * SIZE(%edi) + + addl $16 * SIZE, B + addl $32 * SIZE, BB + + decl %eax + jne .L02 + ALIGN_4 + +.L03: + movl K, %eax + andl $3, %eax + BRANCH + jle .L05 + ALIGN_4 + +.L04: + movddup 0 * SIZE(B), %xmm0 + movddup 2 * SIZE(B), %xmm1 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + + addl $4 * SIZE, B + addl $8 * SIZE, BB + decl %eax + jne .L04 + ALIGN_4 + +.L05: + movl C, %esi + movl A, %edx + movl M, %ebx + sarl $1, %ebx + jle .L30 + ALIGN_4 + +.L10: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + + movaps 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps 16 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movsldup 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movsldup 16 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + + prefetchnta 4 * SIZE(%esi) + prefetchnta 4 * SIZE(%esi, LDC) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + + +#if 1 + andl $-8, %eax + sall $4, %eax + je .L15 +.L1X: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + cmpl $128 * 1, %eax + jle .L12 + KERNEL1(32 * 1) + KERNEL2(32 * 1) + KERNEL3(32 * 1) + KERNEL4(32 * 1) + KERNEL5(32 * 1) + KERNEL6(32 * 1) + KERNEL7(32 * 1) + KERNEL8(32 * 1) + cmpl $128 * 2, %eax + jle .L12 + KERNEL1(32 * 2) + KERNEL2(32 * 2) + KERNEL3(32 * 2) + KERNEL4(32 * 2) + KERNEL5(32 * 2) + KERNEL6(32 * 2) + KERNEL7(32 * 2) + KERNEL8(32 * 2) + cmpl $128 * 3, %eax + jle .L12 + KERNEL1(32 * 3) + KERNEL2(32 * 3) + KERNEL3(32 * 3) + KERNEL4(32 * 3) + KERNEL5(32 * 3) + KERNEL6(32 * 3) + KERNEL7(32 * 3) + KERNEL8(32 * 3) + cmpl $128 * 4, %eax + jle .L12 + KERNEL1(32 * 4) + KERNEL2(32 * 4) + KERNEL3(32 * 4) + KERNEL4(32 * 4) + KERNEL5(32 * 4) + KERNEL6(32 * 4) + KERNEL7(32 * 4) + KERNEL8(32 * 4) + cmpl $128 * 5, %eax + jle .L12 + KERNEL1(32 * 5) + KERNEL2(32 * 5) + KERNEL3(32 * 5) + KERNEL4(32 * 5) + KERNEL5(32 * 5) + KERNEL6(32 * 5) + KERNEL7(32 * 5) + KERNEL8(32 * 5) + cmpl $128 * 6, %eax + jle .L12 + KERNEL1(32 * 6) + KERNEL2(32 * 6) + KERNEL3(32 * 6) + KERNEL4(32 * 6) + KERNEL5(32 * 6) + KERNEL6(32 * 6) + 
KERNEL7(32 * 6) + KERNEL8(32 * 6) + cmpl $128 * 7, %eax + jle .L12 + KERNEL1(32 * 7) + KERNEL2(32 * 7) + KERNEL3(32 * 7) + KERNEL4(32 * 7) + KERNEL5(32 * 7) + KERNEL6(32 * 7) + KERNEL7(32 * 7) + KERNEL8(32 * 7) +#if 1 + cmpl $128 * 8, %eax + jle .L12 + KERNEL1(32 * 8) + KERNEL2(32 * 8) + KERNEL3(32 * 8) + KERNEL4(32 * 8) + KERNEL5(32 * 8) + KERNEL6(32 * 8) + KERNEL7(32 * 8) + KERNEL8(32 * 8) + cmpl $128 * 9, %eax + jle .L12 + KERNEL1(32 * 9) + KERNEL2(32 * 9) + KERNEL3(32 * 9) + KERNEL4(32 * 9) + KERNEL5(32 * 9) + KERNEL6(32 * 9) + KERNEL7(32 * 9) + KERNEL8(32 * 9) + cmpl $128 * 10, %eax + jle .L12 + KERNEL1(32 * 10) + KERNEL2(32 * 10) + KERNEL3(32 * 10) + KERNEL4(32 * 10) + KERNEL5(32 * 10) + KERNEL6(32 * 10) + KERNEL7(32 * 10) + KERNEL8(32 * 10) + cmpl $128 * 11, %eax + jle .L12 + KERNEL1(32 * 11) + KERNEL2(32 * 11) + KERNEL3(32 * 11) + KERNEL4(32 * 11) + KERNEL5(32 * 11) + KERNEL6(32 * 11) + KERNEL7(32 * 11) + KERNEL8(32 * 11) + cmpl $128 * 12, %eax + jle .L12 + KERNEL1(32 * 12) + KERNEL2(32 * 12) + KERNEL3(32 * 12) + KERNEL4(32 * 12) + KERNEL5(32 * 12) + KERNEL6(32 * 12) + KERNEL7(32 * 12) + KERNEL8(32 * 12) + cmpl $128 * 13, %eax + jle .L12 + KERNEL1(32 * 13) + KERNEL2(32 * 13) + KERNEL3(32 * 13) + KERNEL4(32 * 13) + KERNEL5(32 * 13) + KERNEL6(32 * 13) + KERNEL7(32 * 13) + KERNEL8(32 * 13) + cmpl $128 * 14, %eax + jle .L12 + KERNEL1(32 * 14) + KERNEL2(32 * 14) + KERNEL3(32 * 14) + KERNEL4(32 * 14) + KERNEL5(32 * 14) + KERNEL6(32 * 14) + KERNEL7(32 * 14) + KERNEL8(32 * 14) + cmpl $128 * 15, %eax + jle .L12 + KERNEL1(32 * 15) + KERNEL2(32 * 15) + KERNEL3(32 * 15) + KERNEL4(32 * 15) + KERNEL5(32 * 15) + KERNEL6(32 * 15) + KERNEL7(32 * 15) + KERNEL8(32 * 15) +#else + addl $128 * 4 * SIZE, BB + addl $128 * 2 * SIZE, AA + subl $128 * 8, %eax + jg .L1X + jmp .L15 +#endif + +.L12: + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB + ALIGN_4 +#else + sarl $3, %eax + je .L15 + ALIGN_4 + +.L11: + KERNEL1(32 * 7) + KERNEL2(32 * 7) + KERNEL3(32 * 7) + KERNEL4(32 * 7) + KERNEL5(32 * 7) + KERNEL6(32 * 7) + KERNEL7(32 * 7) + KERNEL8(32 * 7) + + addl $32 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L11 + ALIGN_4 +#endif + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA_R, %xmm1 + movaps ALPHA_I, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + ALIGN_4 + +.L13: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movshdup 0 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + ADDSUB %xmm2, %xmm5 + movsldup 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movshdup 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 4 * SIZE(AA), %xmm0 + ADDSUB %xmm2, %xmm7 + movsldup 8 * SIZE(BB), %xmm2 + + addl $4 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L13 + ALIGN_4 + +.L14: +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + shufps $0xb1, %xmm5, %xmm5 + shufps $0xb1, %xmm7, %xmm7 + + addsubps %xmm5, %xmm4 + addsubps %xmm7, %xmm6 + + movaps %xmm4, %xmm5 + movaps %xmm6, %xmm7 + + shufps $0xb1, %xmm4, %xmm4 + shufps $0xb1, %xmm6, %xmm6 +#else + shufps $0xb1, %xmm4, %xmm4 + shufps $0xb1, %xmm6, %xmm6 + + addsubps %xmm4, %xmm5 + addsubps %xmm6, %xmm7 + + movaps %xmm5, %xmm4 + movaps %xmm7, %xmm6 + + shufps $0xb1, %xmm5, %xmm5 + shufps $0xb1, %xmm7, %xmm7 +#endif + + mulps %xmm1, %xmm5 + mulps %xmm3, %xmm4 + mulps %xmm1, %xmm7 + mulps %xmm3, %xmm6 + + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + +#ifndef TRMMKERNEL + shufps $0xe4, %xmm0, %xmm0 + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * 
SIZE(%esi), %xmm0 + + shufps $0xe4, %xmm2, %xmm2 + movsd 0 * SIZE(%esi, LDC), %xmm2 + movhps 2 * SIZE(%esi, LDC), %xmm2 + + addps %xmm0, %xmm4 + addps %xmm2, %xmm6 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movhps %xmm4, 2 * SIZE(%esi) + movsd %xmm6, 0 * SIZE(%esi, LDC) + movhps %xmm6, 2 * SIZE(%esi, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $4 * SIZE, %esi # coffset += 4 + decl %ebx # i -- + jg .L10 + ALIGN_4 + +.L30: + movl M, %ebx + andl $1, %ebx + jle .L99 + ALIGN_4 + +.L40: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + + movddup 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movddup 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movsd 0 * SIZE(BB), %xmm2 + movsd 16 * SIZE(BB), %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $2, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L42 + ALIGN_4 + +.L41: + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + addps %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + movddup 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 8 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movsd 12 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + movddup 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 32 * SIZE(BB), %xmm2 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movsd 20 * SIZE(BB), %xmm3 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm0, %xmm3 + movddup 6 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movsd 24 * SIZE(BB), %xmm3 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movsd 28 * SIZE(BB), %xmm3 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm0, %xmm3 + movddup 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movsd 48 * SIZE(BB), %xmm3 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movsd 36 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm1, %xmm2 + movddup 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movsd 40 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movsd 44 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm1, %xmm2 + movddup 12 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movsd 64 * SIZE(BB), %xmm2 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movsd 52 * SIZE(BB), %xmm3 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm1, %xmm3 + movddup 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsd 56 * SIZE(BB), %xmm3 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movsd 60 * SIZE(BB), %xmm3 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm1, %xmm3 + movddup 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsd 80 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L41 + ALIGN_4 + 
+.L42: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA_R, %xmm1 + movaps ALPHA_I, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L44 + ALIGN_4 + +.L43: + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + movddup 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 8 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L43 + ALIGN_4 + +.L44: + movaps %xmm4, %xmm6 + movlhps %xmm5, %xmm4 + movhlps %xmm6, %xmm5 + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + cmpeqps %xmm7, %xmm7 + pslld $31, %xmm7 + xorps %xmm7, %xmm5 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + shufps $0xb1, %xmm5, %xmm5 + + addsubps %xmm5, %xmm4 + + movaps %xmm4, %xmm5 + + shufps $0xb1, %xmm4, %xmm4 +#else + shufps $0xb1, %xmm4, %xmm4 + + addsubps %xmm4, %xmm5 + + movaps %xmm5, %xmm4 + + shufps $0xb1, %xmm5, %xmm5 +#endif + + mulps %xmm1, %xmm5 + mulps %xmm3, %xmm4 + + addps %xmm5, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + movhps 0 * SIZE(%esi, LDC), %xmm0 + + addps %xmm0, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movhps %xmm4, 0 * SIZE(%esi, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_4 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leal (LDC, LDC), %eax + addl %eax, C # c += 2 * ldc + decl J # j -- + jg .L01 + ALIGN_4 + +.L100: + movl N, %eax + andl $1, %eax + jle .L999 + ALIGN_4 + +.L101: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + +/* Copying to Sub Buffer */ + leal BUFFER, %ecx + + movl K, %eax + sarl $3, %eax + jle .L103 + ALIGN_4 + +.L102: + movddup 0 * SIZE(B), %xmm0 + movddup 2 * SIZE(B), %xmm1 + movddup 4 * SIZE(B), %xmm2 + movddup 6 * SIZE(B), %xmm3 + movddup 8 * SIZE(B), %xmm4 + movddup 10 * SIZE(B), %xmm5 + movddup 12 * SIZE(B), %xmm6 + movddup 14 * SIZE(B), %xmm7 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + movaps %xmm4, 16 * SIZE(BB) + movaps %xmm5, 20 * SIZE(BB) + movaps %xmm6, 24 * SIZE(BB) + movaps %xmm7, 28 * SIZE(BB) + + prefetcht0 104 * SIZE(B) + + addl $16 * SIZE, B + addl $32 * SIZE, BB + decl %eax + jne .L102 + ALIGN_4 + +.L103: + movl K, %eax + andl $7, %eax + BRANCH + jle .L105 + ALIGN_4 + +.L104: + movddup 0 * SIZE(B), %xmm0 + + movaps %xmm0, 0 * SIZE(BB) + + addl $ 2 * SIZE, %edi + addl $ 4 * SIZE, %ecx + decl %eax + jne .L104 + ALIGN_4 + +.L105: + movl C, %esi + movl A, AA + movl M, %ebx + sarl $1, %ebx + jle .L130 + ALIGN_4 + +.L110: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + + movaps 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps 16 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movsldup 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 
+ movsldup 16 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifdef PENTIUM4 + prefetchnta 4 * SIZE(%esi) +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L112 + ALIGN_4 + +.L111: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + movshdup 0 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 4 * SIZE(AA), %xmm0 + ADDSUB %xmm2, %xmm5 + movsldup 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movshdup 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 8 * SIZE(AA), %xmm0 + ADDSUB %xmm2, %xmm5 + movsldup 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movshdup 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 12 * SIZE(AA), %xmm0 + ADDSUB %xmm2, %xmm5 + movsldup 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movshdup 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 32 * SIZE(AA), %xmm0 + ADDSUB %xmm2, %xmm5 + movsldup 32 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movshdup 16 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 20 * SIZE(AA), %xmm1 + ADDSUB %xmm3, %xmm5 + movsldup 20 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movshdup 20 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 24 * SIZE(AA), %xmm1 + ADDSUB %xmm3, %xmm5 + movsldup 24 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movshdup 24 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 28 * SIZE(AA), %xmm1 + ADDSUB %xmm3, %xmm5 + movsldup 28 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movshdup 28 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 48 * SIZE(AA), %xmm1 + ADDSUB %xmm3, %xmm5 + movsldup 48 * SIZE(BB), %xmm3 + + addl $32 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L111 + ALIGN_4 + +.L112: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA_R, %xmm1 + movaps ALPHA_I, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L114 + ALIGN_4 + +.L113: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movshdup 0 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 4 * SIZE(AA), %xmm0 + ADDSUB %xmm2, %xmm5 + movsldup 4 * SIZE(BB), %xmm2 + + addl $ 4 * SIZE, AA + addl $ 4 * SIZE, BB + decl %eax + jg .L113 + ALIGN_4 + +.L114: +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + shufps $0xb1, %xmm5, %xmm5 + + addsubps %xmm5, %xmm4 + + movaps %xmm4, %xmm5 + + shufps $0xb1, %xmm4, %xmm4 +#else + shufps $0xb1, %xmm4, %xmm4 + + addsubps %xmm4, %xmm5 + + movaps %xmm5, %xmm4 + + shufps $0xb1, %xmm5, %xmm5 +#endif + + mulps %xmm1, %xmm5 + mulps %xmm3, %xmm4 + + addps %xmm5, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + + addps %xmm0, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movhps %xmm4, 2 * SIZE(%esi) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + + addl $4 * SIZE, %esi # coffset += 4 + decl %ebx # i -- + jg .L110 + ALIGN_4 + +.L130: + movl M, %ebx + andl $1, %ebx + jle .L999 + ALIGN_4 + +.L140: +#if !defined(TRMMKERNEL) || \ + 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB # boffset1 = boffset +#else + leal BUFFER, BB # boffset1 = boffset + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + + movddup 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movddup 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movsd 0 * SIZE(BB), %xmm2 + movsd 16 * SIZE(BB), %xmm3 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $1, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L142 + ALIGN_4 + +.L141: + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + movddup 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + movddup 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 8 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + movddup 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movsd 12 * SIZE(BB), %xmm2 + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + movddup 16 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movsd 32 * SIZE(BB), %xmm2 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm1, %xmm3 + movddup 10 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movsd 20 * SIZE(BB), %xmm3 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm1, %xmm3 + movddup 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsd 24 * SIZE(BB), %xmm3 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm1, %xmm3 + movddup 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm4 + movsd 28 * SIZE(BB), %xmm3 + shufps $0x50, %xmm3, %xmm3 + mulps %xmm1, %xmm3 + movddup 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movsd 48 * SIZE(BB), %xmm3 + + addl $ 16 * SIZE, AA + addl $ 32 * SIZE, BB + decl %eax + jne .L141 + ALIGN_4 + +.L142: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA_R, %xmm1 + movaps ALPHA_I, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L144 + ALIGN_4 + +.L143: + shufps $0x50, %xmm2, %xmm2 + mulps %xmm0, %xmm2 + movddup 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm4 + movsd 4 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L143 + ALIGN_4 + +.L144: + addps %xmm5, %xmm4 + + movhlps %xmm4, %xmm5 + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + cmpeqps %xmm7, %xmm7 + pslld $31, %xmm7 + xorps %xmm7, %xmm5 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + shufps $0xb1, %xmm5, %xmm5 + + addsubps %xmm5, %xmm4 + + movaps %xmm4, %xmm5 + + shufps $0xb1, %xmm4, %xmm4 +#else + shufps $0xb1, %xmm4, %xmm4 + + addsubps %xmm4, %xmm5 + + movaps %xmm5, %xmm4 + + shufps $0xb1, %xmm5, %xmm5 +#endif + + mulps %xmm1, %xmm5 + mulps %xmm3, %xmm4 + + addps %xmm5, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + addps %xmm0, %xmm4 +#endif + movsd %xmm4, 0 * SIZE(%esi) + ALIGN_4 + +.L999: + movl OLD_STACK, %esp + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm_kernel_4x1_core2.S b/kernel/x86/zgemm_kernel_4x1_core2.S new file mode 100644 index 0000000000..ca232e4471 --- /dev/null +++ b/kernel/x86/zgemm_kernel_4x1_core2.S @@ -0,0 +1,872 @@ +/*********************************************************************/ +/* 
Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if !defined(HAVE_SSE) || !defined(HAVE_MMX) +#error You have to check your configuration. 
+#endif + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_ALPHA_R 16 + STACK + ARGS(%esi) +#define STACK_ALPHA_I 20 + STACK + ARGS(%esi) +#define STACK_A 24 + STACK + ARGS(%esi) +#define STACK_B 28 + STACK + ARGS(%esi) +#define STACK_C 32 + STACK + ARGS(%esi) +#define STACK_LDC 36 + STACK + ARGS(%esi) +#define STACK_OFFT 40 + STACK + ARGS(%esi) + +#define ALPHA_R 16(%esp) +#define ALPHA_I 32(%esp) +#define K 48(%esp) +#define N 52(%esp) +#define M 56(%esp) +#define A 60(%esp) +#define C 64(%esp) +#define J 68(%esp) +#define OLD_STACK 72(%esp) +#define TEMP 76(%esp) +#define OFFSET 80(%esp) +#define KK 84(%esp) +#define KKK 88(%esp) +#define BUFFER 128(%esp) + +#define B %edi +#define LDC %ebp +#define C1 %esi + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#define PREFETCH_R (8 * 16 + 0) +#define PREFETCH_W (PREFETCH_R * 2) + +#define PREFETCHSIZE (8 * 16 + 4) +#define PREFETCH prefetcht0 + +#define AA %edx +#define BB %ecx + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define ADDSUB addps +#else +#define ADDSUB subps +#endif + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp # align stack + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movd STACK_M, %mm0 + movl STACK_N, %eax + movd STACK_K, %mm1 + movd STACK_A, %mm2 + movl STACK_B, B + movd STACK_C, %mm3 + movl STACK_LDC, LDC +#ifdef TRMMKERNEL + movd STACK_OFFT, %mm4 +#endif + + movd %mm1, K + movd %mm0, M + movl %eax, N + movd %mm2, A + movd %mm3, C + movl %esi, OLD_STACK +#ifdef TRMMKERNEL + movd %mm4, OFFSET + movd %mm4, KK +#ifndef LEFT + negl KK +#endif +#endif + + subl $-32 * SIZE, A + subl $-32 * SIZE, B + + leal (, LDC, SIZE * 2), LDC + + movss STACK_ALPHA_R, %xmm0 + movss STACK_ALPHA_I, %xmm1 + + pcmpeqb %xmm7, %xmm7 + pslld $31, %xmm7 # Generate mask + shufps $0, %xmm0, %xmm0 + + movaps %xmm0, 0 + ALPHA_R + movss %xmm1, 4 + ALPHA_I + movss %xmm1, 12 + ALPHA_I + xorps %xmm7, %xmm1 + movss %xmm1, 0 + ALPHA_I + movss %xmm1, 8 + ALPHA_I + + movl %eax, J # j = n + testl %eax, %eax + jle .L999 + +.L01: + leal 32 * SIZE + BUFFER, BB + +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + movl K, %eax + sarl $2, %eax + jle .L03 + +.L02: + prefetcht0 (PREFETCH_R + 0) * SIZE(B) + + movss -32 * SIZE(B), %xmm0 + movss -31 * SIZE(B), %xmm1 + movss -30 * SIZE(B), %xmm2 + movss -29 * SIZE(B), %xmm3 + movss -28 * SIZE(B), %xmm4 + movss -27 * SIZE(B), %xmm5 + movss -26 * SIZE(B), %xmm6 + movss -25 * SIZE(B), %xmm7 + + prefetcht0 (PREFETCH_W + 0) * SIZE(BB) + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + shufps $0, %xmm4, %xmm4 + shufps $0, %xmm5, %xmm5 + shufps $0, %xmm6, %xmm6 + shufps $0, %xmm7, %xmm7 + + prefetcht0 (PREFETCH_W + 16) * SIZE(BB) + movaps %xmm0, -32 * SIZE(BB) + movaps %xmm1, -28 * SIZE(BB) + movaps %xmm2, -24 * SIZE(BB) + movaps %xmm3, -20 * SIZE(BB) + movaps %xmm4, -16 * SIZE(BB) + movaps %xmm5, -12 * SIZE(BB) + movaps %xmm6, -8 * SIZE(BB) + movaps %xmm7, -4 * SIZE(BB) + + addl $ 8 * SIZE, B + subl $-32 * SIZE, BB + decl %eax + jne .L02 + +.L03: + movl K, %eax + andl $3, %eax + BRANCH + jle .L05 + +.L04: + movss -32 * SIZE(B), %xmm0 + movss -31 * SIZE(B), %xmm1 + + shufps 
$0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + + movaps %xmm0, -32 * SIZE(BB) + movaps %xmm1, -28 * SIZE(BB) + addl $2 * SIZE, B + addl $8 * SIZE, BB + decl %eax + jne .L04 + ALIGN_4 + +.L05: + movl C, C1 # coffset = c + movl A, AA # aoffset = a + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L10: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal 32 * SIZE + BUFFER, BB +#else + leal 32 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB /* because it's doubled */ +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -16 * SIZE(AA), %xmm3 + pxor %xmm6, %xmm6 + prefetcht0 7 * SIZE(C1) + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + movaps %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm0 + ADDSUB %xmm0, %xmm5 + movaps -28 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm1 + movaps -24 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + ADDSUB %xmm1, %xmm7 + + movaps -24 * SIZE(BB), %xmm1 + movaps %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm4 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm0 + ADDSUB %xmm0, %xmm5 + movaps -20 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm1 + movaps 0 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + ADDSUB %xmm1, %xmm7 + + movaps -16 * SIZE(BB), %xmm1 + movaps %xmm1, %xmm2 + mulps %xmm3, %xmm1 + addps %xmm1, %xmm4 + movaps -12 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm3 + ADDSUB %xmm3, %xmm5 + movaps -12 * SIZE(AA), %xmm3 + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm1 + movaps -8 * SIZE(AA), %xmm3 + addps %xmm2, %xmm6 + ADDSUB %xmm1, %xmm7 + + movaps -8 * SIZE(BB), %xmm1 + movaps %xmm1, %xmm2 + mulps %xmm3, %xmm1 + addps %xmm1, %xmm4 + movaps -4 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm3 + ADDSUB %xmm3, %xmm5 + movaps -4 * SIZE(AA), %xmm3 + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm1 + movaps 16 * SIZE(AA), %xmm3 + addps %xmm2, %xmm6 + ADDSUB %xmm1, %xmm7 + movaps 0 * SIZE(BB), %xmm1 + + movaps %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm4 + movaps 4 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm0 + ADDSUB %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm1 + movaps 8 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + ADDSUB %xmm1, %xmm7 + + movaps 8 * SIZE(BB), %xmm1 + movaps %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm4 + movaps 12 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm0 + ADDSUB %xmm0, %xmm5 + movaps 12 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm1 + movaps 32 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + ADDSUB %xmm1, %xmm7 + + movaps 16 * SIZE(BB), %xmm1 + movaps %xmm1, %xmm2 + mulps %xmm3, %xmm1 + addps %xmm1, %xmm4 + movaps 20 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm3 + ADDSUB %xmm3, %xmm5 + movaps 20 * SIZE(AA), %xmm3 + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm1 + addps %xmm2, %xmm6 + movaps 24 * SIZE(AA), %xmm3 + ADDSUB %xmm1, %xmm7 + + movaps 24 * SIZE(BB), %xmm1 + movaps %xmm1, %xmm2 + mulps %xmm3, %xmm1 + addps %xmm1, %xmm4 + movaps 28 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm3 + ADDSUB %xmm3, %xmm5 + movaps 28 * 
SIZE(AA), %xmm3 + mulps %xmm3, %xmm2 + mulps %xmm3, %xmm1 + subl $-64 * SIZE, BB + movaps 48 * SIZE(AA), %xmm3 + subl $-64 * SIZE, AA + addps %xmm2, %xmm6 + ADDSUB %xmm1, %xmm7 + movaps -32 * SIZE(BB), %xmm1 + + + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + +.L16: + movaps %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm1, %xmm0 + ADDSUB %xmm0, %xmm5 + movaps -28 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm1 + movaps -24 * SIZE(AA), %xmm0 + addps %xmm2, %xmm6 + ADDSUB %xmm1, %xmm7 + movaps -24 * SIZE(BB), %xmm1 + + addl $8 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L16 + +.L18: + movaps ALPHA_R, %xmm0 + movaps ALPHA_I, %xmm1 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + shufps $0xb1, %xmm5, %xmm5 + shufps $0xb1, %xmm7, %xmm7 + + addsubps %xmm5, %xmm4 + addsubps %xmm7, %xmm6 + + movaps %xmm4, %xmm5 + movaps %xmm6, %xmm7 + + shufps $0xb1, %xmm4, %xmm4 + shufps $0xb1, %xmm6, %xmm6 +#else + shufps $0xb1, %xmm4, %xmm4 + shufps $0xb1, %xmm6, %xmm6 + + addsubps %xmm4, %xmm5 + addsubps %xmm6, %xmm7 + + movaps %xmm5, %xmm4 + movaps %xmm7, %xmm6 + + shufps $0xb1, %xmm5, %xmm5 + shufps $0xb1, %xmm7, %xmm7 +#endif + + mulps %xmm0, %xmm5 + mulps %xmm1, %xmm4 + + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm6 + + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(C1), %xmm2 + movhps 2 * SIZE(C1), %xmm2 + movsd 4 * SIZE(C1), %xmm3 + movhps 6 * SIZE(C1), %xmm3 + + addps %xmm2, %xmm4 + addps %xmm3, %xmm6 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhps %xmm4, 2 * SIZE(C1) + movsd %xmm6, 4 * SIZE(C1) + movhps %xmm6, 6 * SIZE(C1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $4, KK +#endif + + addl $8 * SIZE, C1 + decl %ebx + jg .L10 + ALIGN_2 + +.L20: + movl M, %ebx + testl $2, %ebx + jle .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal 32 * SIZE + BUFFER, BB +#else + + leal 32 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB /* because it's doubled */ +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movaps -16 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movaps -16 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L25 + ALIGN_4 + +.L22: + mulps %xmm0, %xmm1 + mulps -28 * SIZE(BB), %xmm0 + addps %xmm1, %xmm4 + movaps -24 * SIZE(BB), %xmm1 + ADDSUB %xmm0, %xmm5 + movaps -28 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm1 + mulps -20 * SIZE(BB), %xmm0 + addps %xmm1, %xmm6 + movaps 0 * SIZE(BB), %xmm1 + ADDSUB %xmm0, %xmm7 + movaps -24 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps -12 * SIZE(BB), %xmm0 + addps %xmm3, %xmm4 + movaps -8 * 
SIZE(BB), %xmm3 + ADDSUB %xmm0, %xmm5 + movaps -20 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps -4 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 16 * SIZE(BB), %xmm3 + ADDSUB %xmm0, %xmm7 + movaps 0 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + mulps 4 * SIZE(BB), %xmm2 + addps %xmm1, %xmm4 + movaps 8 * SIZE(BB), %xmm1 + ADDSUB %xmm2, %xmm5 + movaps -12 * SIZE(AA), %xmm2 + mulps %xmm2, %xmm1 + mulps 12 * SIZE(BB), %xmm2 + addps %xmm1, %xmm6 + movaps 32 * SIZE(BB), %xmm1 + ADDSUB %xmm2, %xmm7 + movaps -8 * SIZE(AA), %xmm2 + mulps %xmm2, %xmm3 + mulps 20 * SIZE(BB), %xmm2 + addps %xmm3, %xmm4 + movaps 24 * SIZE(BB), %xmm3 + ADDSUB %xmm2, %xmm5 + movaps -4 * SIZE(AA), %xmm2 + mulps %xmm2, %xmm3 + mulps 28 * SIZE(BB), %xmm2 + addps %xmm3, %xmm6 + movaps 48 * SIZE(BB), %xmm3 + ADDSUB %xmm2, %xmm7 + movaps 16 * SIZE(AA), %xmm2 + + subl $-32 * SIZE, AA + addl $ 64 * SIZE, BB + decl %eax + jne .L22 + ALIGN_2 + +.L25: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L28 + +.L26: + mulps %xmm0, %xmm1 + mulps -28 * SIZE(BB), %xmm0 + addps %xmm1, %xmm4 + movaps -24 * SIZE(BB), %xmm1 + ADDSUB %xmm0, %xmm5 + movaps -28 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L26 + +.L28: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + movaps ALPHA_R, %xmm0 + movaps ALPHA_I, %xmm1 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + shufps $0xb1, %xmm5, %xmm5 + addsubps %xmm5, %xmm4 + movaps %xmm4, %xmm5 + shufps $0xb1, %xmm4, %xmm4 +#else + shufps $0xb1, %xmm4, %xmm4 + addsubps %xmm4, %xmm5 + movaps %xmm5, %xmm4 + shufps $0xb1, %xmm5, %xmm5 +#endif + + mulps %xmm0, %xmm5 + mulps %xmm1, %xmm4 + + addps %xmm5, %xmm4 + +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(C1), %xmm2 + movhps 2 * SIZE(C1), %xmm2 + + addps %xmm2, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(C1) + movhps %xmm4, 2 * SIZE(C1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + addl $4 * SIZE, C1 + ALIGN_2 + +.L30: + testl $1, %ebx + jle .L39 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal 32 * SIZE + BUFFER, BB +#else + + leal 32 * SIZE + BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB /* because it's doubled */ +#endif + + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movsd -32 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movsd -24 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movsd -16 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax + addl $1, %eax + movl %eax, KKK +#endif + sarl $3, %eax + je .L35 + ALIGN_4 + +.L32: + mulps %xmm0, %xmm1 + mulps -28 * SIZE(BB), %xmm0 + addps %xmm1, %xmm4 + movsd -24 * SIZE(BB), %xmm1 + ADDSUB %xmm0, %xmm5 + movsd -30 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm1 + mulps -20 * SIZE(BB), %xmm0 + addps %xmm1, %xmm6 + movsd 0 * SIZE(BB), %xmm1 + ADDSUB %xmm0, %xmm7 + movsd -28 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps -12 * SIZE(BB), %xmm0 + addps %xmm3, %xmm4 + movsd -8 * SIZE(BB), %xmm3 + ADDSUB %xmm0, %xmm5 + movsd -26 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps -4 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movsd 16 * SIZE(BB), %xmm3 + ADDSUB %xmm0, %xmm7 + movsd -16 * SIZE(AA), %xmm0 + mulps %xmm2, %xmm1 + mulps 4 * SIZE(BB), %xmm2 + addps %xmm1, %xmm4 + movsd 8 * SIZE(BB), %xmm1 + ADDSUB %xmm2, %xmm5 + movsd -22 * SIZE(AA), %xmm2 + mulps %xmm2, %xmm1 + mulps 12 * SIZE(BB), %xmm2 + addps %xmm1, %xmm6 + movsd 32 * SIZE(BB), %xmm1 + ADDSUB %xmm2, %xmm7 + movsd -20 * SIZE(AA), %xmm2 + mulps %xmm2, %xmm3 + mulps 20 * SIZE(BB), %xmm2 + addps %xmm3, %xmm4 + movsd 24 * SIZE(BB), %xmm3 + ADDSUB %xmm2, %xmm5 + movsd -18 * SIZE(AA), %xmm2 + mulps %xmm2, %xmm3 + mulps 28 * SIZE(BB), %xmm2 + addps %xmm3, %xmm6 + movsd 48 * SIZE(BB), %xmm3 + ADDSUB %xmm2, %xmm7 + movsd -8 * SIZE(AA), %xmm2 + + subl $-16 * SIZE, AA + addl $ 64 * SIZE, BB + + decl %eax + jne .L32 + ALIGN_2 + +.L35: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L38 + +.L36: + mulps %xmm0, %xmm1 + mulps -28 * SIZE(BB), %xmm0 + addps %xmm1, %xmm4 + movsd -24 * SIZE(BB), %xmm1 + ADDSUB %xmm0, %xmm5 + movsd -30 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L36 + +.L38: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + movaps ALPHA_R, %xmm0 + movaps ALPHA_I, %xmm1 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + shufps $0xb1, %xmm5, %xmm5 + addsubps %xmm5, %xmm4 + movaps %xmm4, %xmm5 + shufps $0xb1, %xmm4, %xmm4 +#else + shufps $0xb1, %xmm4, %xmm4 + addsubps %xmm4, %xmm5 + movaps %xmm5, %xmm4 + shufps $0xb1, %xmm5, %xmm5 +#endif + + mulps %xmm0, %xmm5 + mulps %xmm1, %xmm4 
+ + addps %xmm5, %xmm4 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(C1), %xmm2 + addps %xmm2, %xmm4 +#endif + + movsd %xmm4, 0 * SIZE(C1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + ALIGN_2 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $1, KK +#endif + + addl LDC, C # c += ldc + decl J # j -- + jg .L01 + ALIGN_2 + +.L999: + movl OLD_STACK, %esp + + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm_kernel_4x1_sse.S b/kernel/x86/zgemm_kernel_4x1_sse.S new file mode 100644 index 0000000000..6c514639c7 --- /dev/null +++ b/kernel/x86/zgemm_kernel_4x1_sse.S @@ -0,0 +1,1508 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if !defined(HAVE_SSE) || !defined(HAVE_MMX) +#error You have to check your configuration. 
+#endif + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_ALPHA_R 16 + STACK + ARGS(%esi) +#define STACK_ALPHA_I 20 + STACK + ARGS(%esi) +#define STACK_A 24 + STACK + ARGS(%esi) +#define STACK_B 28 + STACK + ARGS(%esi) +#define STACK_C 32 + STACK + ARGS(%esi) +#define STACK_LDC 36 + STACK + ARGS(%esi) +#define STACK_OFFT 40 + STACK + ARGS(%esi) + +#define POSINV 0(%esp) +#define ALPHA_R 16(%esp) +#define ALPHA_I 32(%esp) +#define K 48(%esp) +#define N 52(%esp) +#define M 56(%esp) +#define A 60(%esp) +#define C 64(%esp) +#define J 68(%esp) +#define OLD_STACK 72(%esp) +#define TEMP 76(%esp) +#define OFFSET 80(%esp) +#define KK 84(%esp) +#define KKK 88(%esp) +#define BUFFER 128(%esp) + +#define B %edi +#define LDC %ebp + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#define AA %edx +#define BB %ecx + +#if !defined(HAVE_SSE2) || defined(OPTERON) +#define movsd movlps +#endif + +#ifdef HAVE_SSE2 +#define xorps pxor +#endif + +#define KERNEL1(address) \ + mulps %xmm0, %xmm2; \ + mulps 4 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ + addps %xmm2, %xmm4; \ + movaps 0 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ + addps %xmm0, %xmm5; \ + movaps 4 * SIZE + (address) * SIZE * 2(AA), %xmm0; \ + mulps %xmm0, %xmm2; \ + mulps 4 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 8 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 8 * SIZE + (address) * SIZE * 2(AA), %xmm0 + +#define KERNEL2(address) \ + mulps %xmm0, %xmm2; \ + mulps 12 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ + addps %xmm2, %xmm4; \ + movaps 8 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ + addps %xmm0, %xmm5; \ + movaps 12 * SIZE + (address) * SIZE * 2(AA), %xmm0; \ + mulps %xmm0, %xmm2; \ + mulps 12 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 32 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 32 * SIZE + (address) * SIZE * 2(AA), %xmm0 + +#define KERNEL3(address) \ + mulps %xmm1, %xmm3; \ + mulps 20 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ + addps %xmm3, %xmm4; \ + movaps 16 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ + addps %xmm1, %xmm5; \ + movaps 20 * SIZE + (address) * SIZE * 2(AA), %xmm1; \ + mulps %xmm1, %xmm3; \ + mulps 20 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 24 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 24 * SIZE + (address) * SIZE * 2(AA), %xmm1 + +#define KERNEL4(address) \ + mulps %xmm1, %xmm3; \ + mulps 28 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ + addps %xmm3, %xmm4; \ + movaps 24 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ + addps %xmm1, %xmm5; \ + movaps 28 * SIZE + (address) * SIZE * 2(AA), %xmm1; \ + mulps %xmm1, %xmm3; \ + mulps 28 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 48 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 48 * SIZE + (address) * SIZE * 2(AA), %xmm1 + +#define KERNEL5(address) \ + mulps %xmm0, %xmm2; \ + mulps 36 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ + addps %xmm2, %xmm4; \ + movaps 32 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ + addps %xmm0, %xmm5; \ + movaps 36 * SIZE + (address) * SIZE * 2(AA), %xmm0; \ + mulps %xmm0, %xmm2; \ + mulps 36 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 40 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 40 * SIZE + 
(address) * SIZE * 2(AA), %xmm0 + +#define KERNEL6(address) \ + mulps %xmm0, %xmm2; \ + mulps 44 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ + addps %xmm2, %xmm4; \ + movaps 40 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ + addps %xmm0, %xmm5; \ + movaps 44 * SIZE + (address) * SIZE * 2(AA), %xmm0; \ + mulps %xmm0, %xmm2; \ + mulps 44 * SIZE + (address) * SIZE * 2(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 64 * SIZE + (address) * SIZE * 2(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 64 * SIZE + (address) * SIZE * 2(AA), %xmm0 + +#define KERNEL7(address) \ + mulps %xmm1, %xmm3; \ + mulps 52 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ + addps %xmm3, %xmm4; \ + movaps 48 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ + addps %xmm1, %xmm5; \ + movaps 52 * SIZE + (address) * SIZE * 2(AA), %xmm1; \ + mulps %xmm1, %xmm3; \ + mulps 52 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 56 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 56 * SIZE + (address) * SIZE * 2(AA), %xmm1 + +#define KERNEL8(address) \ + mulps %xmm1, %xmm3; \ + mulps 60 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ + addps %xmm3, %xmm4; \ + movaps 56 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ + addps %xmm1, %xmm5; \ + movaps 60 * SIZE + (address) * SIZE * 2(AA), %xmm1; \ + mulps %xmm1, %xmm3; \ + mulps 60 * SIZE + (address) * SIZE * 2(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 80 * SIZE + (address) * SIZE * 2(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 80 * SIZE + (address) * SIZE * 2(AA), %xmm1 + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp # align stack + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movd STACK_M, %mm0 + movl STACK_N, %eax + movd STACK_K, %mm1 + movd STACK_A, %mm2 + movl STACK_B, B + movd STACK_C, %mm3 + movl STACK_LDC, LDC +#ifdef TRMMKERNEL + movd STACK_OFFT, %mm4 +#endif + + movd %mm1, K + movd %mm0, M + movl %eax, N + movd %mm2, A + movd %mm3, C + movl %esi, OLD_STACK +#ifdef TRMMKERNEL + movd %mm4, OFFSET + movd %mm4, KK +#ifndef LEFT + negl KK +#endif +#endif + + leal (, LDC, SIZE * 2), LDC + + movss STACK_ALPHA_R, %xmm0 + movss STACK_ALPHA_I, %xmm1 + +#ifdef HAVE_SSE2 + pxor %xmm7, %xmm7 + cmpeqps %xmm7, %xmm7 + pslld $31, %xmm7 # Generate mask +#else + movl $0x80000000, TEMP + movss TEMP, %xmm7 + shufps $0, %xmm7, %xmm7 +#endif + xorps %xmm2, %xmm2 + + shufps $0, %xmm0, %xmm0 + + movaps %xmm0, 0 + ALPHA_R + movss %xmm1, 4 + ALPHA_I + movss %xmm1, 12 + ALPHA_I + xorps %xmm7, %xmm1 + movss %xmm1, 0 + ALPHA_I + movss %xmm1, 8 + ALPHA_I + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + movss %xmm7, 0 + POSINV + movss %xmm2, 4 + POSINV + movss %xmm7, 8 + POSINV + movss %xmm2, 12 + POSINV +#else + movss %xmm2, 0 + POSINV + movss %xmm7, 4 + POSINV + movss %xmm2, 8 + POSINV + movss %xmm7, 12 + POSINV +#endif + + movl %eax, J # j = n + testl %eax, %eax + jle .L999 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movl OFFSET, %eax + movl %eax, KK +#endif + + leal BUFFER, BB + movaps POSINV, %xmm7 + + movl K, %eax + sarl $2, %eax + jle .L03 + +.L02: + movss 0 * SIZE(B), %xmm0 + movss 1 * SIZE(B), %xmm1 + movss 2 * SIZE(B), %xmm2 + movss 3 * SIZE(B), %xmm3 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + +#if defined(NN) || defined(NT) || 
defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + xorps %xmm7, %xmm1 + xorps %xmm7, %xmm3 +#else + xorps %xmm7, %xmm0 + xorps %xmm7, %xmm2 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + + movss 4 * SIZE(B), %xmm0 + movss 5 * SIZE(B), %xmm1 + movss 6 * SIZE(B), %xmm2 + movss 7 * SIZE(B), %xmm3 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + xorps %xmm7, %xmm1 + xorps %xmm7, %xmm3 +#else + xorps %xmm7, %xmm0 + xorps %xmm7, %xmm2 +#endif + + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm1, 20 * SIZE(BB) + movaps %xmm2, 24 * SIZE(BB) + movaps %xmm3, 28 * SIZE(BB) + + prefetcht0 104 * SIZE(B) + + addl $ 8 * SIZE, B + addl $32 * SIZE, BB + decl %eax + jne .L02 + +.L03: + movl K, %eax + andl $3, %eax + BRANCH + jle .L05 + +.L04: + movss 0 * SIZE(B), %xmm0 + movss 1 * SIZE(B), %xmm1 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + xorps %xmm7, %xmm1 +#else + xorps %xmm7, %xmm0 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + + addl $2 * SIZE, B + addl $8 * SIZE, BB + decl %eax + jne .L04 + ALIGN_4 + +.L05: + movl C, %esi # coffset = c + movl A, AA # aoffset = a + movl M, %ebx + sarl $2, %ebx # i = (m >> 2) + jle .L50 + ALIGN_4 + +.L10: + +#ifdef PENTIUM4 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movaps 0 * SIZE + BUFFER, %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE + BUFFER, %xmm3 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#else + + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#endif + + prefetchnta 8 * SIZE(%esi) + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + andl $-8, %eax + je .L12 + sall $3, %eax + +.L1X: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + cmpl $64 * 1, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 1) + KERNEL2(32 * 1) + KERNEL3(32 * 1) + KERNEL4(32 * 1) + KERNEL5(32 * 1) + KERNEL6(32 * 1) + KERNEL7(32 * 1) + KERNEL8(32 * 1) + cmpl $64 * 2, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 2) + KERNEL2(32 * 2) + KERNEL3(32 * 2) + KERNEL4(32 * 2) + KERNEL5(32 * 2) + KERNEL6(32 * 2) + KERNEL7(32 * 2) + KERNEL8(32 * 2) + cmpl $64 * 3, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 3) + KERNEL2(32 * 3) + KERNEL3(32 * 3) + KERNEL4(32 * 3) + KERNEL5(32 * 3) + KERNEL6(32 * 3) + KERNEL7(32 * 3) + KERNEL8(32 * 3) + cmpl $64 * 4, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 4) + KERNEL2(32 * 4) + KERNEL3(32 * 4) + KERNEL4(32 
* 4) + KERNEL5(32 * 4) + KERNEL6(32 * 4) + KERNEL7(32 * 4) + KERNEL8(32 * 4) + cmpl $64 * 5, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 5) + KERNEL2(32 * 5) + KERNEL3(32 * 5) + KERNEL4(32 * 5) + KERNEL5(32 * 5) + KERNEL6(32 * 5) + KERNEL7(32 * 5) + KERNEL8(32 * 5) + cmpl $64 * 6, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 6) + KERNEL2(32 * 6) + KERNEL3(32 * 6) + KERNEL4(32 * 6) + KERNEL5(32 * 6) + KERNEL6(32 * 6) + KERNEL7(32 * 6) + KERNEL8(32 * 6) + cmpl $64 * 7, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 7) + KERNEL2(32 * 7) + KERNEL3(32 * 7) + KERNEL4(32 * 7) + KERNEL5(32 * 7) + KERNEL6(32 * 7) + KERNEL7(32 * 7) + KERNEL8(32 * 7) + + addl $128 * 4 * SIZE, AA + addl $128 * 4 * SIZE, BB + subl $ 64 * 8, %eax + BRANCH + jg .L1X + +.L11: + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB + +#else + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movaps 0 * SIZE + BUFFER, %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 8 * SIZE + BUFFER, %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#else + + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 8 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $4, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + prefetcht0 8 * SIZE(%esi) + je .L12 + ALIGN_4 + +#define PREFETCHSIZE 48 + +.L11: +#ifdef CORE_KATMAI + prefetcht0 PREFETCHSIZE * SIZE(AA) +#endif + + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 0 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 16 * SIZE(AA), %xmm0 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) +#endif + + mulps %xmm1, %xmm3 + mulps 12 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 8 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 12 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 12 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 24 * SIZE(AA), %xmm1 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) +#endif + + mulps %xmm0, %xmm2 + mulps 20 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 16 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 20 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 20 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 24) * SIZE(AA) +#endif + + mulps %xmm1, %xmm3 + mulps 28 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 24 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 28 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 28 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 40 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 40 * SIZE(AA), %xmm1 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 32) * SIZE(AA) +#endif + + mulps %xmm0, %xmm2 + mulps 
36 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 32 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 36 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 36 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 48 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 48 * SIZE(AA), %xmm0 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 40) * SIZE(AA) +#endif + + mulps %xmm1, %xmm3 + mulps 44 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 40 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 44 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 44 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 56 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 56 * SIZE(AA), %xmm1 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 48) * SIZE(AA) +#endif + + mulps %xmm0, %xmm2 + mulps 52 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 48 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 52 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 52 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 64 * SIZE(AA), %xmm0 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 56) * SIZE(AA) +#endif + + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 56 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 60 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 72 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 72 * SIZE(AA), %xmm1 + + addl $64 * SIZE, BB + addl $64 * SIZE, AA + decl %eax + jne .L11 +#endif + +.L12: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA_R, %xmm1 + movaps ALPHA_I, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + +.L13: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 0 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 8 * SIZE(AA), %xmm0 + + addl $8 * SIZE, AA # aoffset += 8 + addl $8 * SIZE, BB # boffset1 += 8 + + decl %eax + jg .L13 + +.L14: + shufps $0xb1, %xmm5, %xmm5 + shufps $0xb1, %xmm7, %xmm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 +#else + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 +#endif + + movaps %xmm4, %xmm5 + movaps %xmm6, %xmm7 + + shufps $0xb1, %xmm4, %xmm4 + shufps $0xb1, %xmm6, %xmm6 + + mulps %xmm1, %xmm5 + mulps %xmm3, %xmm4 + mulps %xmm1, %xmm7 + mulps %xmm3, %xmm6 + + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + + shufps $0xe4, %xmm4, %xmm4 + shufps $0xe4, %xmm6, %xmm6 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + movsd 4 * SIZE(%esi), %xmm2 + movhps 6 * SIZE(%esi), %xmm2 + + addps %xmm0, %xmm4 + addps %xmm2, %xmm6 +#endif + + movsd %xmm4, 0 * SIZE(%esi) + movhps %xmm4, 2 * SIZE(%esi) + movsd %xmm6, 4 * SIZE(%esi) + movhps %xmm6, 6 * SIZE(%esi) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 4), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $4, KK +#endif + + addl $8 * SIZE, %esi # coffset += 4 + decl %ebx # i -- + jg .L10 + ALIGN_2 + +.L50: + movl M, %ebx + testl $2, %ebx + jle .L70 + + +#if (L1_DATA_LINESIZE == 64) + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && 
defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movaps 0 * SIZE + BUFFER, %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE + BUFFER, %xmm3 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#else + + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L52 + ALIGN_4 + +.L51: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 8 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps 20 * SIZE(BB), %xmm0 + addps %xmm3, %xmm4 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm0, %xmm5 + movaps 12 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps 28 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 48 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm2 + mulps 36 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 40 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 20 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm2 + mulps 44 * SIZE(BB), %xmm1 + addps %xmm2, %xmm6 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm1, %xmm7 + movaps 24 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 52 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 56 * SIZE(BB), %xmm3 + addps %xmm1, %xmm5 + movaps 28 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + addps %xmm3, %xmm6 + movaps 80 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 48 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L51 + ALIGN_2 + +#else + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movaps 0 * SIZE + BUFFER, %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 8 * SIZE + BUFFER, %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#else + + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 8 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax +#ifdef LEFT + addl $2, %eax +#else + addl $1, %eax +#endif + movl %eax, KKK +#endif + sarl $3, %eax + je .L52 + ALIGN_4 + +.L51: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps 12 * 
SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movaps 16 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm2 + mulps 20 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 12 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 28 * SIZE(BB), %xmm1 + addps %xmm3, %xmm6 + movaps 40 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 24 * SIZE(AA), %xmm1 + mulps %xmm0, %xmm2 + mulps 36 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 48 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 20 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps 44 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 56 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm2 + mulps 52 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 28 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + addps %xmm3, %xmm6 + movaps 72 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 40 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L51 +#endif + +.L52: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA_R, %xmm1 + movaps ALPHA_I, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L54 + +.L53: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA # aoffset += 8 + addl $8 * SIZE, BB # boffset1 += 8 + decl %eax + jg .L53 + +.L54: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + shufps $0xb1, %xmm5, %xmm5 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subps %xmm5, %xmm4 +#else + addps %xmm5, %xmm4 +#endif + + movaps %xmm4, %xmm5 + + shufps $0xb1, %xmm4, %xmm4 + + mulps %xmm1, %xmm5 + mulps %xmm3, %xmm4 + + addps %xmm5, %xmm4 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(%esi), %xmm0 + movhps 2 * SIZE(%esi), %xmm0 + + addps %xmm0, %xmm4 +#endif + + movlps %xmm4, 0 * SIZE(%esi) + movhps %xmm4, 2 * SIZE(%esi) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $2, KK +#endif + addl $4 * SIZE, %esi # coffset += 4 + ALIGN_2 + +.L70: + testl $1, %ebx + jle .L99 + + +#if (L1_DATA_LINESIZE == 64) + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movaps 0 * SIZE + BUFFER, %xmm2 + xorps %xmm4, %xmm4 +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE + BUFFER, %xmm3 + xorps %xmm6, %xmm6 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#else + + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#endif + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && 
defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax + addl $1, %eax + movl %eax, KKK +#endif + sarl $3, %eax + je .L72 + ALIGN_4 + +.L71: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movaps 44 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 64 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 80 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L71 + ALIGN_2 + +#else +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leal BUFFER, BB + movaps 0 * SIZE + BUFFER, %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 8 * SIZE + BUFFER, %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#else + + leal BUFFER, BB + movl KK, %eax + leal (, %eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB /* because it's doubled */ + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 8 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#endif + + +#ifndef TRMMKERNEL + movl K, %eax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movl K, %eax + subl KK, %eax + movl %eax, KKK +#else + movl KK, %eax + addl $1, %eax + movl %eax, KKK +#endif + sarl $3, %eax + je .L72 + ALIGN_4 + +.L71: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 16 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 12 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 20 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 40 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), 
%xmm1 + addps %xmm2, %xmm5 + movaps 48 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 44 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 52 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 64 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 72 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L71 + ALIGN_2 +#endif + +.L72: +#ifndef TRMMKERNEL + movl K, %eax +#else + movl KKK, %eax +#endif + movaps ALPHA_R, %xmm1 + movaps ALPHA_I, %xmm3 + andl $7, %eax # if (k & 1) + BRANCH + je .L74 + +.L73: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA # aoffset += 8 + addl $8 * SIZE, BB # boffset1 += 8 + decl %eax + jg .L73 + +.L74: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + shufps $0xb1, %xmm5, %xmm5 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subps %xmm5, %xmm4 +#else + addps %xmm5, %xmm4 +#endif + + movaps %xmm4, %xmm5 + + shufps $0xb1, %xmm4, %xmm4 + + mulps %xmm1, %xmm5 + mulps %xmm3, %xmm4 + + addps %xmm5, %xmm4 + +#ifndef TRMMKERNEL +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(%esi), %xmm0 + + addps %xmm0, %xmm4 +#endif + + movlps %xmm4, 0 * SIZE(%esi) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movl K, %eax + subl KKK, %eax + leal (,%eax, 8), %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addl $1, KK +#endif + + ALIGN_2 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $1, KK +#endif + + addl LDC, C # c += ldc + decl J # j -- + jg .L01 + ALIGN_2 + +.L999: + movl OLD_STACK, %esp + + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm_ncopy_2.S b/kernel/x86/zgemm_ncopy_2.S new file mode 100644 index 0000000000..bc80b47344 --- /dev/null +++ b/kernel/x86/zgemm_ncopy_2.S @@ -0,0 +1,268 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 8 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_N 8 + STACK + ARGS(%esp) +#define STACK_A 12 + STACK + ARGS(%esp) +#define STACK_LDA 16 + STACK + ARGS(%esp) +#define STACK_B 20 + STACK + ARGS(%esp) + +#define I %eax +#define J %ecx +#define LDA %edx +#define A %edi +#define A1 %ebx +#define A2 %ebp +#define B %esi + + PROLOGUE + + subl $ARGS, %esp + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl STACK_A, A + movl STACK_LDA, LDA + movl STACK_B, B + sall $ZBASE_SHIFT, LDA + + movl STACK_N, J + sarl $1, J + je .L20 + ALIGN_3 + +.L21: + movl A, A1 + leal (A1, LDA), A2 + leal (A, LDA, 2), A + + movl STACK_M, I + sarl $1, I + je .L24 + ALIGN_3 + +.L25: +#ifdef HAVE_MMX + MMXLOAD 0 * SIZE(A1), %mm0 + MMXLOAD 1 * SIZE(A1), %mm1 + MMXLOAD 0 * SIZE(A2), %mm2 + MMXLOAD 1 * SIZE(A2), %mm3 + + MMXLOAD 2 * SIZE(A1), %mm4 + MMXLOAD 3 * SIZE(A1), %mm5 + MMXLOAD 2 * SIZE(A2), %mm6 + MMXLOAD 3 * SIZE(A2), %mm7 + + MMXSTORE %mm0, 0 * SIZE(B) + MMXSTORE %mm1, 1 * SIZE(B) + MMXSTORE %mm2, 2 * SIZE(B) + MMXSTORE %mm3, 3 * SIZE(B) + + MMXSTORE %mm4, 4 * SIZE(B) + MMXSTORE %mm5, 5 * SIZE(B) + MMXSTORE %mm6, 6 * SIZE(B) + MMXSTORE %mm7, 7 * SIZE(B) +#else + FLD 3 * SIZE(A2) + FLD 2 * SIZE(A2) + FLD 3 * SIZE(A1) + FLD 2 * SIZE(A1) + FLD 1 * SIZE(A2) + FLD 0 * SIZE(A2) + FLD 1 * SIZE(A1) + FLD 0 * SIZE(A1) + + FST 0 * SIZE(B) + FST 1 * SIZE(B) + FST 2 * SIZE(B) + FST 3 * SIZE(B) + FST 4 * SIZE(B) + FST 5 * SIZE(B) + FST 6 * SIZE(B) + FST 7 * SIZE(B) +#endif + addl $4 * SIZE, A1 + addl $4 * SIZE, A2 + addl $8 * SIZE, B + decl I + jne .L25 + ALIGN_3 + +.L24: + movl STACK_M, I + andl $1, I + jle .L30 + ALIGN_3 + +.L31: +#ifdef HAVE_MMX + MMXLOAD 0 * SIZE(A1), %mm0 + MMXLOAD 1 * SIZE(A1), %mm1 + MMXLOAD 0 * SIZE(A2), %mm2 + MMXLOAD 1 * SIZE(A2), %mm3 + MMXSTORE %mm0, 0 * SIZE(B) + MMXSTORE %mm1, 1 * SIZE(B) + MMXSTORE %mm2, 2 * SIZE(B) + MMXSTORE %mm3, 3 * SIZE(B) +#else + FLD 1 * SIZE(A2) + FLD 0 * SIZE(A2) + FLD 1 * SIZE(A1) + FLD 0 * SIZE(A1) + FST 0 * SIZE(B) + FST 1 * SIZE(B) + FST 2 * SIZE(B) + FST 3 * SIZE(B) +#endif + addl $2 * SIZE, A1 + addl $2 * SIZE, A2 + addl $4 * SIZE, B + decl I + jne .L31 + ALIGN_3 + +.L30: + decl J + jne .L21 + ALIGN_3 + +.L20: + movl A, A1 + movl STACK_N, J + andl $1, J + jle .L38 + ALIGN_3 + +.L39: + movl STACK_M, I + sarl $2, I + je .L42 + ALIGN_3 + +.L43: +#ifdef HAVE_MMX + MMXLOAD 0 * SIZE(A1), %mm0 + MMXLOAD 1 * SIZE(A1), %mm1 + MMXLOAD 2 * SIZE(A1), %mm2 + MMXLOAD 3 * SIZE(A1), %mm3 + MMXLOAD 4 * SIZE(A1), %mm4 + MMXLOAD 5 * SIZE(A1), %mm5 + MMXLOAD 
6 * SIZE(A1), %mm6 + MMXLOAD 7 * SIZE(A1), %mm7 + + MMXSTORE %mm0, 0 * SIZE(B) + MMXSTORE %mm1, 1 * SIZE(B) + MMXSTORE %mm2, 2 * SIZE(B) + MMXSTORE %mm3, 3 * SIZE(B) + MMXSTORE %mm4, 4 * SIZE(B) + MMXSTORE %mm5, 5 * SIZE(B) + MMXSTORE %mm6, 6 * SIZE(B) + MMXSTORE %mm7, 7 * SIZE(B) +#else + FLD 7 * SIZE(A1) + FLD 6 * SIZE(A1) + FLD 5 * SIZE(A1) + FLD 4 * SIZE(A1) + FLD 3 * SIZE(A1) + FLD 2 * SIZE(A1) + FLD 1 * SIZE(A1) + FLD 0 * SIZE(A1) + + FST 0 * SIZE(B) + FST 1 * SIZE(B) + FST 2 * SIZE(B) + FST 3 * SIZE(B) + FST 4 * SIZE(B) + FST 5 * SIZE(B) + FST 6 * SIZE(B) + FST 7 * SIZE(B) +#endif + + addl $8 * SIZE, A1 + addl $8 * SIZE, B + decl I + jne .L43 + ALIGN_3 + +.L42: + movl STACK_M, I + andl $3, I + jle .L38 + ALIGN_3 + +.L49: +#ifdef HAVE_MMX + MMXLOAD 0 * SIZE(A1), %mm0 + MMXLOAD 1 * SIZE(A1), %mm1 + MMXSTORE %mm0, 0 * SIZE(B) + MMXSTORE %mm1, 1 * SIZE(B) +#else + FLD 1 * SIZE(A1) + FLD 0 * SIZE(A1) + FST 0 * SIZE(B) + FST 1 * SIZE(B) +#endif + addl $2 * SIZE, A1 + addl $2 * SIZE, B + decl I + jne .L49 + ALIGN_3 + +.L38: + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/zgemm_tcopy_2.S b/kernel/x86/zgemm_tcopy_2.S new file mode 100644 index 0000000000..f9a601d9b7 --- /dev/null +++ b/kernel/x86/zgemm_tcopy_2.S @@ -0,0 +1,174 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 8 + +#define J 0 + STACK(%esp) +#define BOFFSET2 4 + STACK(%esp) + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_N 8 + STACK + ARGS(%esp) +#define STACK_A 12 + STACK + ARGS(%esp) +#define STACK_LDA 16 + STACK + ARGS(%esp) +#define STACK_B 20 + STACK + ARGS(%esp) + + PROLOGUE + + subl $ARGS, %esp + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#define A %ebp +#define A1 %edx +#define LDA %ecx +#define B %edi +#define I %ebx +#define B1 %eax +#define M4 %esi + + EMMS + + movl STACK_A, A + movl STACK_B, B + + movl STACK_M, %ebx + movl STACK_N, %eax + movl STACK_LDA, LDA + sall $ZBASE_SHIFT, LDA + + andl $-2, %eax + addl %eax, %eax + imull %ebx, %eax # m * ( n & ~1) + leal (B, %eax, SIZE), %eax # boffset2 = b + m * (n & ~1) + movl %eax, BOFFSET2 + + movl STACK_M, M4 + sall $ZBASE_SHIFT + 1, M4 + + testl %ebx, %ebx # if !(m & 1) goto L28 + movl %ebx, J + jle .L999 + ALIGN_4 + +.L39: + movl A, A1 + addl LDA, A + movl B, B1 + addl $4 * SIZE, B + + movl STACK_N, I + sarl $1, I + jle .L32 + ALIGN_4 + +.L36: +#ifdef HAVE_MMX + MMXLOAD 0 * SIZE(A1), %mm0 + MMXLOAD 1 * SIZE(A1), %mm1 + MMXLOAD 2 * SIZE(A1), %mm2 + MMXLOAD 3 * SIZE(A1), %mm3 + + MMXSTORE %mm0, 0 * SIZE(B1) + MMXSTORE %mm1, 1 * SIZE(B1) + MMXSTORE %mm2, 2 * SIZE(B1) + MMXSTORE %mm3, 3 * SIZE(B1) +#else + FLD 3 * SIZE(A1) + FLD 2 * SIZE(A1) + FLD 1 * SIZE(A1) + FLD 0 * SIZE(A1) + + FST 0 * SIZE(B1) + FST 1 * SIZE(B1) + FST 2 * SIZE(B1) + FST 3 * SIZE(B1) +#endif + addl $4 * SIZE, A1 + addl M4, B1 + decl I + jne .L36 + ALIGN_4 + +.L32: + movl STACK_N, I + andl $1, I + jle .L99 + ALIGN_4 + + movl BOFFSET2, B1 + +#ifdef HAVE_MMX + MMXLOAD 0 * SIZE(A1), %mm0 + MMXLOAD 1 * SIZE(A1), %mm1 + + MMXSTORE %mm0, 0 * SIZE(B1) + MMXSTORE %mm1, 1 * SIZE(B1) +#else + FLD 1 * SIZE(A1) + FLD 0 * SIZE(A1) + + FST 0 * SIZE(B1) + FST 1 * SIZE(B1) +#endif + addl $2 * SIZE, BOFFSET2 + ALIGN_4 + +.L99: + decl J + jne .L39 + ALIGN_4 + +.L999: + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS,%esp + ret + + EPILOGUE diff --git a/kernel/x86/zgemv_n.S b/kernel/x86/zgemv_n.S new file mode 100644 index 0000000000..8e2b2b8424 --- /dev/null +++ b/kernel/x86/zgemv_n.S @@ -0,0 +1,367 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef PENTIUM +#define P 32 +#endif + +#if defined(PENTIUM4) || defined(ATHLON) +#define P ((DTB_ENTRIES) >> 1) +#endif + +#ifndef P +#define P DTB_ENTRIES +#endif + +#define STACK 16 +#define ARGS 16 + +#define PLDA_M 0 + STACK(%esp) +#define XP 4 + STACK(%esp) +#define MIN_N 8 + STACK(%esp) +#define IS 12 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#ifdef DOUBLE +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 24 + STACK + ARGS(%esp) +#define A 32 + STACK + ARGS(%esp) +#define LDA 36 + STACK + ARGS(%esp) +#define X 40 + STACK + ARGS(%esp) +#define INCX 44 + STACK + ARGS(%esp) +#define Y 48 + STACK + ARGS(%esp) +#define INCY 52 + STACK + ARGS(%esp) +#define BUFFER 56 + STACK + ARGS(%esp) +#else +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 20 + STACK + ARGS(%esp) +#define A 24 + STACK + ARGS(%esp) +#define LDA 28 + STACK + ARGS(%esp) +#define X 32 + STACK + ARGS(%esp) +#define INCX 36 + STACK + ARGS(%esp) +#define Y 40 + STACK + ARGS(%esp) +#define INCY 44 + STACK + ARGS(%esp) +#define BUFFER 48 + STACK + ARGS(%esp) +#endif + + PROLOGUE + + subl $ARGS, %esp + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + FLD ALPHA_I + FLD ALPHA_R + + movl X, %edi + + movl LDA, %ebx + addl %ebx, %ebx # lda *= 2 + leal 0(,%ebx,SIZE),%ebx # EBX : lda + + movl $0, IS + + movl M, %ecx + movl N, %esi + + test %ecx, %ecx + jle .L79 # goto END + test %esi, %esi + jle .L79 # goto END + + movl INCY, %eax + addl %eax, %eax # incy *= 2 + leal (,%eax,SIZE),%eax + movl %eax, INCY + + movl LDA, %eax + imull $P, %eax # P * lda + subl M ,%eax # P * lda - m + leal (, %eax, SIZE), %eax + addl %eax, %eax + movl %eax, PLDA_M + ALIGN_2 + +.L32: + movl IS, %esi + movl $P, %edx + movl N, %eax + subl %esi,%eax # n - is + cmpl %edx, %eax +#ifdef PENTIUM + jle .L33 + movl %edx, %eax +.L33: +#else + cmovg %edx, %eax +#endif + + movl %eax, MIN_N + movl INCX, %edx + addl %edx, %edx + + addl %esi, %esi + leal (%edi, %esi, SIZE), %esi # xp = x + is + movl %esi, XP + cmpl $2, %edx + je .L34 # if incx == 1 goto L34 + + movl BUFFER, %esi + leal (, %edx, SIZE), %edx + movl %esi, XP # xp = buffer + sarl $1,%eax + jle .L35 + ALIGN_2 + +.L36: + FLD 0 * SIZE(%edi) + FLD 1 * SIZE(%edi) + addl %edx,%edi # x += incx + FLD 0 * SIZE(%edi) + FLD 1 * SIZE(%edi) + addl %edx,%edi # x += incx + + FST 3 * SIZE(%esi) + FST 2 * SIZE(%esi) + FST 1 * SIZE(%esi) + FST 0 * SIZE(%esi) + + addl $4 * SIZE, %esi # xp += 4 + decl %eax + jg .L36 + ALIGN_3 + +.L35: + movl MIN_N, %eax + andl $1, 
%eax + jle .L34 + + FLD 0 * SIZE(%edi) + FLD 1 * SIZE(%edi) + addl %edx,%edi # x += incx + FST 1 * SIZE(%esi) + FST 0 * SIZE(%esi) + ALIGN_3 + +/* Main Routine */ +.L34: + movl Y, %ecx # c_offset + movl M, %ebp # j = m + ALIGN_3 + +.L61: + movl A, %edx # a_offset = a + fldz + addl $2 * SIZE, A # a++ + fldz + movl XP,%esi + fldz + movl MIN_N,%eax + fldz + FLD (%esi) # bt1 = *(b_offset + 0) + sarl $1, %eax + jle .L64 + ALIGN_3 + +.L65: +#ifdef PENTIUM4 + prefetchnta 16 * SIZE(%esi) +#endif + + FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + + FMUL 1 * SIZE(%edx) # bt1 *= *(a_offset + 1) +#ifndef CONJ + faddp %st, %st(2) # ct2 += bt1 +#else + fsubrp %st, %st(2) # ct2 -= bt1 +#endif + FLD 1 * SIZE(%esi) # bt1 = *(b_offset + 1) + + FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + + FMUL 1 * SIZE(%edx) # bt1 *= *(a_offset + 1) + faddp %st, %st(4) # ct4 += bt1 + FLD 2 * SIZE(%esi) # bt1 = *(b_offset + 2) + + addl $2 * SIZE, %esi # b_offset += 2 + addl %ebx, %edx # a_offset += lda + + FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + + FMUL 1 * SIZE(%edx) # bt1 *= *(a_offset + 1) +#ifndef CONJ + faddp %st, %st(2) # ct2 += bt1 +#else + fsubrp %st, %st(2) # ct2 -= bt1 +#endif + FLD 1 * SIZE(%esi) # bt1 = *(b_offset + 1) + + FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + + FMUL 1 * SIZE(%edx) # bt1 *= *(a_offset + 1) + faddp %st, %st(4) # ct4 += bt1 + FLD 2 * SIZE(%esi) # bt1 = *(b_offset + 2) + + addl $2 * SIZE, %esi # b_offset += 2 + addl %ebx, %edx # a_offset += lda + + decl %eax + jg .L65 + +.L64: + movl MIN_N, %eax + andl $1, %eax + jle .L70 + ALIGN_2 + +.L71: + FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + + FMUL 1 * SIZE(%edx) # bt1 *= *(a_offset + 1) +#ifndef CONJ + faddp %st, %st(2) # ct2 += bt1 +#else + fsubrp %st, %st(2) # ct2 -= bt1 +#endif + FLD 1 * SIZE(%esi) # bt1 = *(b_offset + 1) + + FLD 0 * SIZE(%edx) # at1 = *(a_offset + 0) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + + FMUL 1 * SIZE(%edx) # bt1 *= *(a_offset + 1) + faddp %st, %st(4) # ct4 += bt1 + fldz + ALIGN_2 + +.L70: +#ifndef C_SUN + ffreep %st(0) +#else + .byte 0xdf + .byte 0xc0 +#endif + +#ifndef XCONJ +#ifndef CONJ + fsubp %st, %st(3) + faddp %st, %st(1) +#else + faddp %st, %st(3) + faddp %st, %st(1) +#endif +#else +#ifndef CONJ + faddp %st, %st(3) + fsubp %st, %st(1) +#else + fsubp %st, %st(3) + fsubp %st, %st(1) +#endif +#endif + + fld %st(0) # ct4 = ct2 + fmul %st(4), %st + fld %st(2) + fmul %st(4), %st + fsubp %st, %st(1) + + movl INCY, %eax + + FADD 0 * SIZE(%ecx) + FST 0 * SIZE(%ecx) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + faddp %st, %st(1) + + FADD 1 * SIZE(%ecx) + FST 1 * SIZE(%ecx) + + addl %eax, %ecx + decl %ebp + jg .L61 + +.L60: + movl PLDA_M, %esi + addl %esi, A # a += P * lda - m + addl $P, IS + movl N, %esi + cmpl %esi,IS + jl .L32 + +.L79: +#ifndef C_SUN + ffreep %st(0) + ffreep %st(0) +#else + .byte 0xdf + .byte 0xc0 + .byte 0xdf + .byte 0xc0 +#endif + + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/zgemv_n_atom.S b/kernel/x86/zgemv_n_atom.S new file mode 100644 index 0000000000..3dba030f8f --- /dev/null +++ b/kernel/x86/zgemv_n_atom.S @@ -0,0 +1,545 @@ 
+/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef ATOM +#define PREFETCH prefetchnta +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (8 * 6) +#endif + +#define STACKSIZE 16 + +#define M 4 + STACKSIZE(%esp) +#define N 8 + STACKSIZE(%esp) +#define ALPHA_R 16 + STACKSIZE(%esp) +#define ALPHA_I 24 + STACKSIZE(%esp) +#define A 32 + STACKSIZE(%esp) +#define STACK_LDA 36 + STACKSIZE(%esp) +#define STACK_X 40 + STACKSIZE(%esp) +#define STACK_INCX 44 + STACKSIZE(%esp) +#define Y 48 + STACKSIZE(%esp) +#define STACK_INCY 52 + STACKSIZE(%esp) +#define BUFFER 56 + STACKSIZE(%esp) + +#define I %eax +#define J %ebx + +#define INCX %ecx +#define INCY J + +#define A1 %esi +#define X %edx +#define Y1 %edi +#define LDA %ebp + +#if !defined(CONJ) && !defined(XCONJ) +#define ADD1 addsd +#define ADD2 addsd +#define ADD3 subsd +#define ADD4 addsd +#endif + +#if defined(CONJ) && !defined(XCONJ) +#define ADD1 addsd +#define ADD2 addsd +#define ADD3 addsd +#define ADD4 subsd +#endif + +#if !defined(CONJ) && defined(XCONJ) +#define ADD1 addsd +#define ADD2 subsd +#define ADD3 addsd +#define ADD4 addsd +#endif + +#if defined(CONJ) && defined(XCONJ) +#define ADD1 addsd +#define ADD2 subsd +#define ADD3 subsd +#define ADD4 subsd +#endif + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_LDA, LDA + movl STACK_X, X + movl STACK_INCX, INCX + + sall $ZBASE_SHIFT, INCX + sall $ZBASE_SHIFT, LDA + + subl $-16 * SIZE, A + + cmpl $0, N + jle .L999 + cmpl $0, M + jle .L999 + + movl BUFFER, Y1 + + movl N, J + + pxor %xmm7, %xmm7 + + movl M, %eax + addl $8, %eax + sarl $3, %eax + ALIGN_3 + +.L01: + movapd %xmm7, 0 * SIZE(Y1) + movapd %xmm7, 2 * SIZE(Y1) + movapd %xmm7, 4 * SIZE(Y1) + movapd %xmm7, 6 * SIZE(Y1) + movapd %xmm7, 8 * SIZE(Y1) + movapd %xmm7, 10 * SIZE(Y1) + movapd %xmm7, 12 * SIZE(Y1) + movapd %xmm7, 14 * SIZE(Y1) + subl $-16 * SIZE, Y1 + decl %eax + jg .L01 + ALIGN_3 + +.L10: + movl BUFFER, Y1 + addl $16 * SIZE, Y1 + + movl A, A1 + addl LDA, A + + movsd 0 * SIZE(X), %xmm6 + movsd 1 * SIZE(X), %xmm7 + addl INCX, X + + movapd %xmm6, %xmm2 + mulsd ALPHA_R, %xmm6 + mulsd ALPHA_I, %xmm2 + movapd %xmm7, %xmm3 + mulsd ALPHA_I, %xmm3 + mulsd ALPHA_R, %xmm7 + +#ifndef XCONJ + subsd %xmm3, %xmm6 + addsd %xmm2, %xmm7 +#else + addsd %xmm3, %xmm6 + subsd %xmm2, %xmm7 +#endif + + movsd -16 * SIZE(Y1), %xmm0 + movsd -15 * SIZE(Y1), %xmm1 + ALIGN_3 + + movl M, I + sarl $2, I + jle .L15 + + movsd -16 * SIZE(A1), %xmm2 + movsd -15 * SIZE(A1), %xmm3 + + movapd %xmm2, %xmm4 + mulsd %xmm6, %xmm2 + mulsd %xmm7, %xmm4 + + decl I + jle .L14 + ALIGN_3 + +.L13: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) * SIZE(A1) +#endif + + movapd %xmm3, %xmm5 + mulsd %xmm7, %xmm3 + ADD1 %xmm2, %xmm0 + movsd -14 * SIZE(A1), %xmm2 + mulsd %xmm6, %xmm5 + ADD2 %xmm4, %xmm1 + + movapd %xmm2, %xmm4 + mulsd %xmm6, %xmm2 + ADD3 %xmm3, %xmm0 + movsd -13 * SIZE(A1), %xmm3 + ADD4 %xmm5, %xmm1 + mulsd %xmm7, %xmm4 + + movlpd %xmm0, -16 * SIZE(Y1) + movsd -14 * SIZE(Y1), %xmm0 + movlpd %xmm1, -15 * SIZE(Y1) + movsd -13 * SIZE(Y1), %xmm1 + + movapd %xmm3, %xmm5 + mulsd %xmm7, %xmm3 + ADD1 %xmm2, %xmm0 + movsd -12 * SIZE(A1), %xmm2 + mulsd %xmm6, %xmm5 + ADD2 %xmm4, %xmm1 + + movapd %xmm2, %xmm4 + mulsd %xmm6, %xmm2 + ADD3 %xmm3, %xmm0 + movsd -11 * SIZE(A1), %xmm3 + mulsd %xmm7, %xmm4 + ADD4 %xmm5, %xmm1 + + movlpd %xmm0, -14 * SIZE(Y1) + movsd -12 * SIZE(Y1), %xmm0 + movlpd %xmm1, -13 * SIZE(Y1) + movsd -11 * SIZE(Y1), %xmm1 + + movapd 
%xmm3, %xmm5 + mulsd %xmm7, %xmm3 + ADD1 %xmm2, %xmm0 + movsd -10 * SIZE(A1), %xmm2 + mulsd %xmm6, %xmm5 + ADD2 %xmm4, %xmm1 + + movapd %xmm2, %xmm4 + mulsd %xmm6, %xmm2 + ADD3 %xmm3, %xmm0 + movsd -9 * SIZE(A1), %xmm3 + ADD4 %xmm5, %xmm1 + mulsd %xmm7, %xmm4 + + movlpd %xmm0, -12 * SIZE(Y1) + movsd -10 * SIZE(Y1), %xmm0 + movlpd %xmm1, -11 * SIZE(Y1) + movsd -9 * SIZE(Y1), %xmm1 + + movapd %xmm3, %xmm5 + mulsd %xmm7, %xmm3 + ADD1 %xmm2, %xmm0 + movsd -8 * SIZE(A1), %xmm2 + mulsd %xmm6, %xmm5 + ADD2 %xmm4, %xmm1 + + movapd %xmm2, %xmm4 + mulsd %xmm6, %xmm2 + ADD3 %xmm3, %xmm0 + movsd -7 * SIZE(A1), %xmm3 + mulsd %xmm7, %xmm4 + ADD4 %xmm5, %xmm1 + + movlpd %xmm0, -10 * SIZE(Y1) + movsd -8 * SIZE(Y1), %xmm0 + movlpd %xmm1, -9 * SIZE(Y1) + movsd -7 * SIZE(Y1), %xmm1 + + subl $-8 * SIZE, A1 + subl $-8 * SIZE, Y1 + + subl $1, I + BRANCH + jg .L13 + ALIGN_3 + +.L14: + movapd %xmm3, %xmm5 + mulsd %xmm7, %xmm3 + ADD1 %xmm2, %xmm0 + movsd -14 * SIZE(A1), %xmm2 + mulsd %xmm6, %xmm5 + ADD2 %xmm4, %xmm1 + + movapd %xmm2, %xmm4 + mulsd %xmm6, %xmm2 + ADD3 %xmm3, %xmm0 + movsd -13 * SIZE(A1), %xmm3 + ADD4 %xmm5, %xmm1 + mulsd %xmm7, %xmm4 + + movlpd %xmm0, -16 * SIZE(Y1) + movsd -14 * SIZE(Y1), %xmm0 + movlpd %xmm1, -15 * SIZE(Y1) + movsd -13 * SIZE(Y1), %xmm1 + + movapd %xmm3, %xmm5 + mulsd %xmm7, %xmm3 + ADD1 %xmm2, %xmm0 + movsd -12 * SIZE(A1), %xmm2 + mulsd %xmm6, %xmm5 + ADD2 %xmm4, %xmm1 + + movapd %xmm2, %xmm4 + mulsd %xmm6, %xmm2 + ADD3 %xmm3, %xmm0 + movsd -11 * SIZE(A1), %xmm3 + mulsd %xmm7, %xmm4 + ADD4 %xmm5, %xmm1 + + movlpd %xmm0, -14 * SIZE(Y1) + movsd -12 * SIZE(Y1), %xmm0 + movlpd %xmm1, -13 * SIZE(Y1) + movsd -11 * SIZE(Y1), %xmm1 + + movapd %xmm3, %xmm5 + mulsd %xmm7, %xmm3 + ADD1 %xmm2, %xmm0 + movsd -10 * SIZE(A1), %xmm2 + mulsd %xmm6, %xmm5 + ADD2 %xmm4, %xmm1 + + movapd %xmm2, %xmm4 + mulsd %xmm6, %xmm2 + ADD3 %xmm3, %xmm0 + movsd -9 * SIZE(A1), %xmm3 + ADD4 %xmm5, %xmm1 + mulsd %xmm7, %xmm4 + + movlpd %xmm0, -12 * SIZE(Y1) + movsd -10 * SIZE(Y1), %xmm0 + movlpd %xmm1, -11 * SIZE(Y1) + movsd -9 * SIZE(Y1), %xmm1 + + movapd %xmm3, %xmm5 + mulsd %xmm7, %xmm3 + ADD1 %xmm2, %xmm0 + mulsd %xmm6, %xmm5 + ADD2 %xmm4, %xmm1 + + ADD3 %xmm3, %xmm0 + ADD4 %xmm5, %xmm1 + + movlpd %xmm0, -10 * SIZE(Y1) + movsd -8 * SIZE(Y1), %xmm0 + movlpd %xmm1, -9 * SIZE(Y1) + movsd -7 * SIZE(Y1), %xmm1 + + subl $-8 * SIZE, A1 + subl $-8 * SIZE, Y1 + ALIGN_3 + +.L15: + testl $2, M + je .L17 + + movsd -16 * SIZE(A1), %xmm2 + movsd -15 * SIZE(A1), %xmm3 + + movapd %xmm2, %xmm4 + mulsd %xmm6, %xmm2 + mulsd %xmm7, %xmm4 + + movapd %xmm3, %xmm5 + mulsd %xmm7, %xmm3 + ADD1 %xmm2, %xmm0 + movsd -14 * SIZE(A1), %xmm2 + mulsd %xmm6, %xmm5 + ADD2 %xmm4, %xmm1 + + movapd %xmm2, %xmm4 + mulsd %xmm6, %xmm2 + ADD3 %xmm3, %xmm0 + movsd -13 * SIZE(A1), %xmm3 + ADD4 %xmm5, %xmm1 + mulsd %xmm7, %xmm4 + + movlpd %xmm0, -16 * SIZE(Y1) + movsd -14 * SIZE(Y1), %xmm0 + movlpd %xmm1, -15 * SIZE(Y1) + movsd -13 * SIZE(Y1), %xmm1 + + movapd %xmm3, %xmm5 + mulsd %xmm7, %xmm3 + ADD1 %xmm2, %xmm0 + mulsd %xmm6, %xmm5 + ADD2 %xmm4, %xmm1 + + ADD3 %xmm3, %xmm0 + ADD4 %xmm5, %xmm1 + + movlpd %xmm0, -14 * SIZE(Y1) + movsd -12 * SIZE(Y1), %xmm0 + movlpd %xmm1, -13 * SIZE(Y1) + movsd -11 * SIZE(Y1), %xmm1 + + addl $4 * SIZE, A1 + addl $4 * SIZE, Y1 + ALIGN_3 + +.L17: + testl $1, M + je .L19 + + movsd -16 * SIZE(A1), %xmm2 + movsd -15 * SIZE(A1), %xmm3 + + movapd %xmm2, %xmm4 + mulsd %xmm6, %xmm2 + mulsd %xmm7, %xmm4 + + movapd %xmm3, %xmm5 + mulsd %xmm7, %xmm3 + ADD1 %xmm2, %xmm0 + mulsd %xmm6, %xmm5 + ADD2 %xmm4, %xmm1 + + ADD3 %xmm3, %xmm0 
+ ADD4 %xmm5, %xmm1 + + movlpd %xmm0, -16 * SIZE(Y1) + movlpd %xmm1, -15 * SIZE(Y1) + ALIGN_3 + +.L19: + decl J + jg .L10 + ALIGN_4 + +.L990: + movl Y, Y1 + movl BUFFER, X + movl STACK_INCY, INCY + + movl Y1, A1 + sall $ZBASE_SHIFT, INCY + + movl M, %eax + sarl $2, %eax + jle .L994 + ALIGN_3 + +.L992: + movsd 0 * SIZE(Y1), %xmm0 + movsd 1 * SIZE(Y1), %xmm1 + addl INCY, Y1 + + movsd 0 * SIZE(Y1), %xmm2 + movsd 1 * SIZE(Y1), %xmm3 + addl INCY, Y1 + + movsd 0 * SIZE(Y1), %xmm4 + movsd 1 * SIZE(Y1), %xmm5 + addl INCY, Y1 + + movsd 0 * SIZE(Y1), %xmm6 + movsd 1 * SIZE(Y1), %xmm7 + addl INCY, Y1 + + addsd 0 * SIZE(X), %xmm0 + addsd 1 * SIZE(X), %xmm1 + addsd 2 * SIZE(X), %xmm2 + addsd 3 * SIZE(X), %xmm3 + addsd 4 * SIZE(X), %xmm4 + addsd 5 * SIZE(X), %xmm5 + addsd 6 * SIZE(X), %xmm6 + addsd 7 * SIZE(X), %xmm7 + + movlpd %xmm0, 0 * SIZE(A1) + movlpd %xmm1, 1 * SIZE(A1) + addl INCY, A1 + + movlpd %xmm2, 0 * SIZE(A1) + movlpd %xmm3, 1 * SIZE(A1) + addl INCY, A1 + + movlpd %xmm4, 0 * SIZE(A1) + movlpd %xmm5, 1 * SIZE(A1) + addl INCY, A1 + + movlpd %xmm6, 0 * SIZE(A1) + movlpd %xmm7, 1 * SIZE(A1) + addl INCY, A1 + + addl $8 * SIZE, X + decl %eax + jg .L992 + ALIGN_3 + +.L994: + testl $2, M + jle .L996 + + movsd 0 * SIZE(Y1), %xmm0 + movsd 1 * SIZE(Y1), %xmm1 + addl INCY, Y1 + + movsd 0 * SIZE(Y1), %xmm2 + movsd 1 * SIZE(Y1), %xmm3 + addl INCY, Y1 + + addsd 0 * SIZE(X), %xmm0 + addsd 1 * SIZE(X), %xmm1 + addsd 2 * SIZE(X), %xmm2 + addsd 3 * SIZE(X), %xmm3 + + movlpd %xmm0, 0 * SIZE(A1) + movlpd %xmm1, 1 * SIZE(A1) + addl INCY, A1 + + movlpd %xmm2, 0 * SIZE(A1) + movlpd %xmm3, 1 * SIZE(A1) + addl INCY, A1 + + addl $4 * SIZE, X + ALIGN_3 + +.L996: + testl $1, M + jle .L999 + + movsd 0 * SIZE(Y1), %xmm0 + movsd 1 * SIZE(Y1), %xmm1 + + addsd 0 * SIZE(X), %xmm0 + addsd 1 * SIZE(X), %xmm1 + + movlpd %xmm0, 0 * SIZE(A1) + movlpd %xmm1, 1 * SIZE(A1) + ALIGN_3 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/zgemv_n_sse.S b/kernel/x86/zgemv_n_sse.S new file mode 100644 index 0000000000..340b9d3759 --- /dev/null +++ b/kernel/x86/zgemv_n_sse.S @@ -0,0 +1,604 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef movsd +#undef movsd +#endif + +#ifdef PENTIUM3 +#ifdef HAVE_SSE +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 2) +#endif +#define movsd movlps +#endif + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 2) +#endif + +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 7) +#endif + +#ifdef OPTERON +#define PREFETCH prefetchnta +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 3) +#define movsd movlps +#endif + +#ifdef BARCELONA +#define PREFETCH prefetchnta +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 5) +#endif + +#ifdef ATOM +#define PREFETCH prefetchnta +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 6) +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (16 * 4) +#endif + +#define STACKSIZE 16 + +#define M 4 + STACKSIZE(%esp) +#define N 8 + STACKSIZE(%esp) +#define ALPHA_R 16 + STACKSIZE(%esp) +#define ALPHA_I 20 + STACKSIZE(%esp) +#define A 24 + STACKSIZE(%esp) +#define STACK_LDA 28 + STACKSIZE(%esp) +#define STACK_X 32 + STACKSIZE(%esp) +#define STACK_INCX 36 + STACKSIZE(%esp) +#define Y 40 + STACKSIZE(%esp) +#define STACK_INCY 44 + STACKSIZE(%esp) +#define BUFFER 48 + STACKSIZE(%esp) + +#define I %eax +#define J %ebx + +#define INCX %ecx +#define INCY J + +#define A1 %esi +#define X %edx +#define Y1 %edi +#define LDA %ebp + +#undef SUBPS + +#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) +#define SUBPS subps +#else +#define SUBPS addps +#endif + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_LDA, LDA + movl STACK_X, X + movl STACK_INCX, INCX + + sall $ZBASE_SHIFT, INCX + sall $ZBASE_SHIFT, LDA + + subl $-32 * SIZE, A + + cmpl $0, N + jle .L999 + cmpl $0, M + jle .L999 + + movl BUFFER, Y1 + + movl N, J + + xorps %xmm7, %xmm7 + + movl M, %eax + addl $8, %eax + sarl $3, %eax + ALIGN_3 + +.L01: + movaps %xmm7, 0 * SIZE(Y1) + movaps %xmm7, 4 * SIZE(Y1) + movaps %xmm7, 8 * SIZE(Y1) + movaps %xmm7, 12 * SIZE(Y1) + subl $-16 * SIZE, Y1 + decl %eax + jg .L01 + ALIGN_3 + +.L10: + movl BUFFER, Y1 + addl $32 * SIZE, Y1 + + movl A, A1 + addl LDA, A + + movsd (X), %xmm7 + addl INCX, X + +#ifdef HAVE_SSE2 + pcmpeqb %xmm5, %xmm5 + psllq $63, %xmm5 +#else + subl $8, %esp + movl $0x00000000, 0(%esp) + movl $0x80000000, 4(%esp) + movlps (%esp), %xmm5 + addl $8, %esp + movlhps %xmm5, %xmm5 +#endif + +#ifdef 
HAVE_SSE2 + pshufd $0x00, %xmm7, %xmm6 + pshufd $0x55, %xmm7, %xmm7 +#else + movaps %xmm7, %xmm6 + shufps $0x00, %xmm6, %xmm6 + shufps $0x55, %xmm7, %xmm7 +#endif + +#ifdef HAVE_SSE3 + movddup ALPHA_R, %xmm3 +#else + movsd ALPHA_R, %xmm3 + + movlhps %xmm3, %xmm3 +#endif + +#ifdef HAVE_SSE2 + pshufd $0xb1, %xmm3, %xmm4 +#else + movaps %xmm3, %xmm4 + shufps $0xb1, %xmm4, %xmm4 +#endif + + +#ifndef XCONJ + xorps %xmm5, %xmm7 +#else + xorps %xmm5, %xmm6 +#endif + + mulps %xmm3, %xmm6 + mulps %xmm4, %xmm7 + +#ifndef XCONJ + subps %xmm7, %xmm6 +#else + addps %xmm7, %xmm6 +#endif + +#ifdef HAVE_SSE2 + pshufd $0x55, %xmm6, %xmm7 + pshufd $0x00, %xmm6, %xmm6 +#else + movaps %xmm6, %xmm7 + shufps $0x55, %xmm7, %xmm7 + shufps $0x00, %xmm6, %xmm6 +#endif + +#ifndef CONJ + xorps %xmm5, %xmm7 +#else + xorps %xmm5, %xmm6 +#endif + + movaps -32 * SIZE(Y1), %xmm0 + movaps -28 * SIZE(Y1), %xmm1 + ALIGN_3 + + movl M, I + sarl $3, I + jle .L15 + + movsd -32 * SIZE(A1), %xmm2 + movhps -30 * SIZE(A1), %xmm2 + movsd -28 * SIZE(A1), %xmm4 + movhps -26 * SIZE(A1), %xmm4 + + decl I + jle .L14 + ALIGN_3 + +.L13: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) * SIZE(A1) +#endif + +#ifdef HAVE_SSE2 + pshufd $0xb1, %xmm2, %xmm3 +#else + movaps %xmm2, %xmm3 + shufps $0xb1, %xmm3, %xmm3 +#endif + mulps %xmm6, %xmm2 + addps %xmm2, %xmm0 + movsd -24 * SIZE(A1), %xmm2 + movhps -22 * SIZE(A1), %xmm2 +#ifdef HAVE_SSE2 + pshufd $0xb1, %xmm4, %xmm5 +#else + movaps %xmm4, %xmm5 + shufps $0xb1, %xmm5, %xmm5 +#endif + mulps %xmm6, %xmm4 + addps %xmm4, %xmm1 + movsd -20 * SIZE(A1), %xmm4 + movhps -18 * SIZE(A1), %xmm4 + + mulps %xmm7, %xmm3 + SUBPS %xmm3, %xmm0 + movaps %xmm0, -32 * SIZE(Y1) + movaps -24 * SIZE(Y1), %xmm0 + mulps %xmm7, %xmm5 + SUBPS %xmm5, %xmm1 + movaps %xmm1, -28 * SIZE(Y1) + movaps -20 * SIZE(Y1), %xmm1 + +#ifdef HAVE_SSE2 + pshufd $0xb1, %xmm2, %xmm3 +#else + movaps %xmm2, %xmm3 + shufps $0xb1, %xmm3, %xmm3 +#endif + mulps %xmm6, %xmm2 + addps %xmm2, %xmm0 + movsd -16 * SIZE(A1), %xmm2 + movhps -14 * SIZE(A1), %xmm2 +#ifdef HAVE_SSE2 + pshufd $0xb1, %xmm4, %xmm5 +#else + movaps %xmm4, %xmm5 + shufps $0xb1, %xmm5, %xmm5 +#endif + mulps %xmm6, %xmm4 + addps %xmm4, %xmm1 + movsd -12 * SIZE(A1), %xmm4 + movhps -10 * SIZE(A1), %xmm4 + + mulps %xmm7, %xmm3 + SUBPS %xmm3, %xmm0 + movaps %xmm0, -24 * SIZE(Y1) + movaps -16 * SIZE(Y1), %xmm0 + mulps %xmm7, %xmm5 + SUBPS %xmm5, %xmm1 + movaps %xmm1, -20 * SIZE(Y1) + movaps -12 * SIZE(Y1), %xmm1 + + subl $-16 * SIZE, A1 + subl $-16 * SIZE, Y1 + + subl $1, I + BRANCH + jg .L13 + ALIGN_3 + +.L14: +#ifdef HAVE_SSE2 + pshufd $0xb1, %xmm2, %xmm3 +#else + movaps %xmm2, %xmm3 + shufps $0xb1, %xmm3, %xmm3 +#endif + mulps %xmm6, %xmm2 + addps %xmm2, %xmm0 + movsd -24 * SIZE(A1), %xmm2 + movhps -22 * SIZE(A1), %xmm2 +#ifdef HAVE_SSE2 + pshufd $0xb1, %xmm4, %xmm5 +#else + movaps %xmm4, %xmm5 + shufps $0xb1, %xmm5, %xmm5 +#endif + mulps %xmm6, %xmm4 + addps %xmm4, %xmm1 + movsd -20 * SIZE(A1), %xmm4 + movhps -18 * SIZE(A1), %xmm4 + + mulps %xmm7, %xmm3 + SUBPS %xmm3, %xmm0 + movaps %xmm0, -32 * SIZE(Y1) + movaps -24 * SIZE(Y1), %xmm0 + mulps %xmm7, %xmm5 + SUBPS %xmm5, %xmm1 + movaps %xmm1, -28 * SIZE(Y1) + movaps -20 * SIZE(Y1), %xmm1 + +#ifdef HAVE_SSE2 + pshufd $0xb1, %xmm2, %xmm3 +#else + movaps %xmm2, %xmm3 + shufps $0xb1, %xmm3, %xmm3 +#endif + mulps %xmm6, %xmm2 + addps %xmm2, %xmm0 +#ifdef HAVE_SSE2 + pshufd $0xb1, %xmm4, %xmm5 +#else + movaps %xmm4, %xmm5 + shufps $0xb1, %xmm5, %xmm5 +#endif + mulps %xmm6, %xmm4 + addps %xmm4, %xmm1 + + mulps %xmm7, %xmm3 + SUBPS %xmm3, 
%xmm0 + movaps %xmm0, -24 * SIZE(Y1) + movaps -16 * SIZE(Y1), %xmm0 + mulps %xmm7, %xmm5 + SUBPS %xmm5, %xmm1 + movaps %xmm1, -20 * SIZE(Y1) + movaps -12 * SIZE(Y1), %xmm1 + + subl $-16 * SIZE, A1 + subl $-16 * SIZE, Y1 + ALIGN_3 + +.L15: + testl $4, M + je .L17 + + movsd -32 * SIZE(A1), %xmm2 + movhps -30 * SIZE(A1), %xmm2 + movsd -28 * SIZE(A1), %xmm4 + movhps -26 * SIZE(A1), %xmm4 + +#ifdef HAVE_SSE2 + pshufd $0xb1, %xmm2, %xmm3 +#else + movaps %xmm2, %xmm3 + shufps $0xb1, %xmm3, %xmm3 +#endif + mulps %xmm6, %xmm2 + addps %xmm2, %xmm0 +#ifdef HAVE_SSE2 + pshufd $0xb1, %xmm4, %xmm5 +#else + movaps %xmm4, %xmm5 + shufps $0xb1, %xmm5, %xmm5 +#endif + mulps %xmm6, %xmm4 + addps %xmm4, %xmm1 + + mulps %xmm7, %xmm3 + SUBPS %xmm3, %xmm0 + movaps %xmm0, -32 * SIZE(Y1) + movaps -24 * SIZE(Y1), %xmm0 + mulps %xmm7, %xmm5 + SUBPS %xmm5, %xmm1 + movaps %xmm1, -28 * SIZE(Y1) + movaps -20 * SIZE(Y1), %xmm1 + + addl $8 * SIZE, A1 + addl $8 * SIZE, Y1 + ALIGN_3 + +.L17: + testl $2, M + je .L18 + + movsd -32 * SIZE(A1), %xmm2 + movhps -30 * SIZE(A1), %xmm2 + +#ifdef HAVE_SSE2 + pshufd $0xb1, %xmm2, %xmm3 +#else + movaps %xmm2, %xmm3 + shufps $0xb1, %xmm3, %xmm3 +#endif + mulps %xmm6, %xmm2 + addps %xmm2, %xmm0 + mulps %xmm7, %xmm3 + SUBPS %xmm3, %xmm0 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, %xmm0 + + addl $4 * SIZE, A1 + addl $4 * SIZE, Y1 + ALIGN_3 + +.L18: + testl $1, M + je .L19 + +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd -32 * SIZE(A1), %xmm2 + +#ifdef HAVE_SSE2 + pshufd $0xb1, %xmm2, %xmm3 +#else + movaps %xmm2, %xmm3 + shufps $0xb1, %xmm3, %xmm3 +#endif + mulps %xmm6, %xmm2 + addps %xmm2, %xmm0 + mulps %xmm7, %xmm3 + SUBPS %xmm3, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + ALIGN_3 + +.L19: + decl J + jg .L10 + ALIGN_4 + +.L990: + movl Y, Y1 + movl BUFFER, X + + movl STACK_INCY, INCY + sall $ZBASE_SHIFT, INCY + + movl M, %eax + sarl $3, %eax + jle .L994 + ALIGN_3 + +.L992: + movsd (Y1), %xmm0 + movhps (Y1, INCY), %xmm0 + + addps 0 * SIZE(X), %xmm0 + + movlps %xmm0, (Y1) + movhps %xmm0, (Y1, INCY) + leal (Y1, INCY, 2), Y1 + + movsd (Y1), %xmm0 + movhps (Y1, INCY), %xmm0 + + addps 4 * SIZE(X), %xmm0 + + movlps %xmm0, (Y1) + movhps %xmm0, (Y1, INCY) + leal (Y1, INCY, 2), Y1 + + movsd (Y1), %xmm0 + movhps (Y1, INCY), %xmm0 + + addps 8 * SIZE(X), %xmm0 + + movlps %xmm0, (Y1) + movhps %xmm0, (Y1, INCY) + leal (Y1, INCY, 2), Y1 + + movsd (Y1), %xmm0 + movhps (Y1, INCY), %xmm0 + + addps 12 * SIZE(X), %xmm0 + + movlps %xmm0, (Y1) + movhps %xmm0, (Y1, INCY) + leal (Y1, INCY, 2), Y1 + + addl $16 * SIZE, X + decl %eax + jg .L992 + ALIGN_3 + +.L994: + testl $4, M + jle .L995 + + movsd (Y1), %xmm0 + movhps (Y1, INCY), %xmm0 + + addps 0 * SIZE(X), %xmm0 + + movlps %xmm0, (Y1) + movhps %xmm0, (Y1, INCY) + leal (Y1, INCY, 2), Y1 + + movsd (Y1), %xmm0 + movhps (Y1, INCY), %xmm0 + + addps 4 * SIZE(X), %xmm0 + + movlps %xmm0, (Y1) + movhps %xmm0, (Y1, INCY) + leal (Y1, INCY, 2), Y1 + + addl $8 * SIZE, X + ALIGN_3 + +.L995: + testl $2, M + jle .L996 + + movsd (Y1), %xmm0 + movhps (Y1, INCY), %xmm0 + + addps 0 * SIZE(X), %xmm0 + + movlps %xmm0, (Y1) + movhps %xmm0, (Y1, INCY) + leal (Y1, INCY, 2), Y1 + + addl $4 * SIZE, X + ALIGN_3 + +.L996: + testl $1, M + jle .L999 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd (Y1), %xmm0 + + addps 0 * SIZE(X), %xmm0 + + movlps %xmm0, (Y1) + ALIGN_3 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/zgemv_n_sse2.S b/kernel/x86/zgemv_n_sse2.S new file mode 100644 index 0000000000..441fbb0c0c --- /dev/null +++ 
b/kernel/x86/zgemv_n_sse2.S @@ -0,0 +1,467 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (8 * 2) +#endif + +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (8 * 7) +#endif + +#ifdef OPTERON +#define PREFETCH prefetchnta +#define PREFETCHW prefetchw +#define PREFETCHSIZE (8 * 3) +#define movsd movlps +#endif + +#ifdef BARCELONA +#define PREFETCH prefetchnta +#define PREFETCHW prefetchw +#define PREFETCHSIZE (8 * 5) +#endif + +#ifdef ATOM +#define PREFETCH prefetchnta +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (8 * 6) +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 4) +#endif + +#define STACKSIZE 16 + +#define M 4 + STACKSIZE(%esp) +#define N 8 + STACKSIZE(%esp) +#define ALPHA_R 16 + STACKSIZE(%esp) +#define ALPHA_I 24 + STACKSIZE(%esp) +#define A 32 + STACKSIZE(%esp) +#define STACK_LDA 36 + STACKSIZE(%esp) +#define STACK_X 40 + STACKSIZE(%esp) +#define STACK_INCX 44 + STACKSIZE(%esp) +#define Y 48 + STACKSIZE(%esp) +#define STACK_INCY 52 + STACKSIZE(%esp) +#define BUFFER 56 + STACKSIZE(%esp) + +#define I %eax +#define J %ebx + +#define INCX %ecx +#define INCY J + +#define A1 %esi +#define X %edx +#define Y1 %edi +#define LDA %ebp + +#undef SUBPD + +#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) +#define SUBPD subpd +#else +#define SUBPD addpd +#endif + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_LDA, LDA + movl STACK_X, X + movl STACK_INCX, INCX + + sall $ZBASE_SHIFT, INCX + sall $ZBASE_SHIFT, LDA + + subl $-16 * SIZE, A + + cmpl $0, N + jle .L999 + cmpl $0, M + jle .L999 + + movl BUFFER, Y1 + + movl N, J + + pxor %xmm7, %xmm7 + + movl M, %eax + addl $8, %eax + sarl $3, %eax + ALIGN_3 + +.L01: + movapd %xmm7, 0 * SIZE(Y1) + movapd %xmm7, 2 * SIZE(Y1) + movapd %xmm7, 4 * SIZE(Y1) + movapd %xmm7, 6 * SIZE(Y1) + movapd %xmm7, 8 * SIZE(Y1) + movapd %xmm7, 10 * SIZE(Y1) + movapd %xmm7, 12 * SIZE(Y1) + movapd %xmm7, 14 * SIZE(Y1) + subl $-16 * SIZE, Y1 + decl %eax + jg .L01 + ALIGN_3 + +.L10: + movl BUFFER, Y1 + addl $16 * SIZE, Y1 + + movl A, A1 + addl LDA, A + + movsd 0 * SIZE(X), %xmm6 + movhpd 1 * SIZE(X), %xmm6 + addl INCX, X + + pcmpeqb %xmm5, %xmm5 + psllq $63, %xmm5 + shufps $0xc0, %xmm5, %xmm5 + + pshufd $0x4e, %xmm6, %xmm7 + +#ifdef HAVE_SSE3 + movddup ALPHA_R, %xmm3 + movddup ALPHA_I, %xmm4 +#else + movsd ALPHA_R, %xmm3 + movsd ALPHA_I, %xmm4 + + unpcklpd %xmm3, %xmm3 + unpcklpd %xmm4, %xmm4 +#endif + + xorpd %xmm5, %xmm7 + + mulpd %xmm3, %xmm6 + mulpd %xmm4, %xmm7 + +#ifndef XCONJ + subpd %xmm7, %xmm6 +#else + addpd %xmm7, %xmm6 +#endif + + pshufd $0xee, %xmm6, %xmm7 + pshufd $0x44, %xmm6, %xmm6 + +#ifndef CONJ + xorpd %xmm5, %xmm7 +#else + xorpd %xmm5, %xmm6 +#endif + + movapd -16 * SIZE(Y1), %xmm0 + movapd -14 * SIZE(Y1), %xmm1 + ALIGN_3 + + movl M, I + sarl $2, I + jle .L15 + + movsd -16 * SIZE(A1), %xmm2 + movhpd -15 * SIZE(A1), %xmm2 + movsd -14 * SIZE(A1), %xmm4 + movhpd -13 * SIZE(A1), %xmm4 + + decl I + jle .L14 + ALIGN_3 + +.L13: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) * SIZE(A1) +#endif + + pshufd $0x4e, %xmm2, %xmm3 + mulpd %xmm6, %xmm2 + addpd %xmm2, %xmm0 + movsd -12 * SIZE(A1), %xmm2 + movhpd -11 * SIZE(A1), %xmm2 + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm6, %xmm4 + addpd %xmm4, %xmm1 + movsd -10 * SIZE(A1), 
%xmm4 + movhpd -9 * SIZE(A1), %xmm4 + + mulpd %xmm7, %xmm3 + SUBPD %xmm3, %xmm0 + movapd %xmm0, -16 * SIZE(Y1) + movapd -12 * SIZE(Y1), %xmm0 + mulpd %xmm7, %xmm5 + SUBPD %xmm5, %xmm1 + movapd %xmm1, -14 * SIZE(Y1) + movapd -10 * SIZE(Y1), %xmm1 + + pshufd $0x4e, %xmm2, %xmm3 + mulpd %xmm6, %xmm2 + addpd %xmm2, %xmm0 + movsd -8 * SIZE(A1), %xmm2 + movhpd -7 * SIZE(A1), %xmm2 + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm6, %xmm4 + addpd %xmm4, %xmm1 + movsd -6 * SIZE(A1), %xmm4 + movhpd -5 * SIZE(A1), %xmm4 + + mulpd %xmm7, %xmm3 + SUBPD %xmm3, %xmm0 + movapd %xmm0, -12 * SIZE(Y1) + movapd -8 * SIZE(Y1), %xmm0 + mulpd %xmm7, %xmm5 + SUBPD %xmm5, %xmm1 + movapd %xmm1, -10 * SIZE(Y1) + movapd -6 * SIZE(Y1), %xmm1 + + subl $-8 * SIZE, A1 + subl $-8 * SIZE, Y1 + + subl $1, I + BRANCH + jg .L13 + ALIGN_3 + +.L14: + pshufd $0x4e, %xmm2, %xmm3 + mulpd %xmm6, %xmm2 + addpd %xmm2, %xmm0 + movsd -12 * SIZE(A1), %xmm2 + movhpd -11 * SIZE(A1), %xmm2 + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm6, %xmm4 + addpd %xmm4, %xmm1 + movsd -10 * SIZE(A1), %xmm4 + movhpd -9 * SIZE(A1), %xmm4 + + mulpd %xmm7, %xmm3 + SUBPD %xmm3, %xmm0 + movapd %xmm0, -16 * SIZE(Y1) + movapd -12 * SIZE(Y1), %xmm0 + mulpd %xmm7, %xmm5 + SUBPD %xmm5, %xmm1 + movapd %xmm1, -14 * SIZE(Y1) + movapd -10 * SIZE(Y1), %xmm1 + + pshufd $0x4e, %xmm2, %xmm3 + mulpd %xmm6, %xmm2 + addpd %xmm2, %xmm0 + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm6, %xmm4 + addpd %xmm4, %xmm1 + + mulpd %xmm7, %xmm3 + SUBPD %xmm3, %xmm0 + movapd %xmm0, -12 * SIZE(Y1) + movapd -8 * SIZE(Y1), %xmm0 + mulpd %xmm7, %xmm5 + SUBPD %xmm5, %xmm1 + movapd %xmm1, -10 * SIZE(Y1) + movapd -6 * SIZE(Y1), %xmm1 + + subl $-8 * SIZE, A1 + subl $-8 * SIZE, Y1 + ALIGN_3 + +.L15: + testl $2, M + je .L17 + + movsd -16 * SIZE(A1), %xmm2 + movhpd -15 * SIZE(A1), %xmm2 + movsd -14 * SIZE(A1), %xmm4 + movhpd -13 * SIZE(A1), %xmm4 + + pshufd $0x4e, %xmm2, %xmm3 + mulpd %xmm6, %xmm2 + addpd %xmm2, %xmm0 + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm6, %xmm4 + addpd %xmm4, %xmm1 + + mulpd %xmm7, %xmm3 + SUBPD %xmm3, %xmm0 + movapd %xmm0, -16 * SIZE(Y1) + mulpd %xmm7, %xmm5 + SUBPD %xmm5, %xmm1 + movapd %xmm1, -14 * SIZE(Y1) + + movapd -12 * SIZE(Y1), %xmm0 + + addl $4 * SIZE, A1 + addl $4 * SIZE, Y1 + ALIGN_3 + +.L17: + testl $1, M + je .L19 + + movsd -16 * SIZE(A1), %xmm2 + movhpd -15 * SIZE(A1), %xmm2 + + pshufd $0x4e, %xmm2, %xmm3 + mulpd %xmm6, %xmm2 + addpd %xmm2, %xmm0 + mulpd %xmm7, %xmm3 + SUBPD %xmm3, %xmm0 + + movapd %xmm0, -16 * SIZE(Y1) + ALIGN_3 + +.L19: + decl J + jg .L10 + ALIGN_4 + +.L990: + movl Y, Y1 + movl BUFFER, X + + movl STACK_INCY, INCY + sall $ZBASE_SHIFT, INCY + + movl M, %eax + sarl $2, %eax + jle .L994 + ALIGN_3 + +.L992: + movsd 0 * SIZE(Y1), %xmm0 + movhpd 1 * SIZE(Y1), %xmm0 + + addpd 0 * SIZE(X), %xmm0 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + addl INCY, Y1 + + movsd 0 * SIZE(Y1), %xmm0 + movhpd 1 * SIZE(Y1), %xmm0 + + addpd 2 * SIZE(X), %xmm0 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + addl INCY, Y1 + + movsd 0 * SIZE(Y1), %xmm0 + movhpd 1 * SIZE(Y1), %xmm0 + + addpd 4 * SIZE(X), %xmm0 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + addl INCY, Y1 + + movsd 0 * SIZE(Y1), %xmm0 + movhpd 1 * SIZE(Y1), %xmm0 + + addpd 6 * SIZE(X), %xmm0 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + addl INCY, Y1 + + addl $8 * SIZE, X + decl %eax + jg .L992 + ALIGN_3 + +.L994: + testl $2, M + jle .L996 + + movsd 0 * SIZE(Y1), %xmm0 + movhpd 1 * SIZE(Y1), %xmm0 + + addpd 0 * SIZE(X), %xmm0 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd 
%xmm0, 1 * SIZE(Y1) + addl INCY, Y1 + + movsd 0 * SIZE(Y1), %xmm0 + movhpd 1 * SIZE(Y1), %xmm0 + + addpd 2 * SIZE(X), %xmm0 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + addl INCY, Y1 + + addl $4 * SIZE, X + ALIGN_3 + +.L996: + testl $1, M + jle .L999 + + movsd 0 * SIZE(Y1), %xmm0 + movhpd 1 * SIZE(Y1), %xmm0 + + addpd 0 * SIZE(X), %xmm0 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + ALIGN_3 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/zgemv_t.S b/kernel/x86/zgemv_t.S new file mode 100644 index 0000000000..452794cc13 --- /dev/null +++ b/kernel/x86/zgemv_t.S @@ -0,0 +1,386 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef PENTIUM +#define P 88 +#endif + +#ifndef P +#define P 400 +#endif + +#define STACK 16 +#define ARGS 24 + +#define NLDA 0 + STACK(%esp) +#define XP 4 + STACK(%esp) +#define MIN_M 8 + STACK(%esp) +#define J 12 + STACK(%esp) +#define IS 16 + STACK(%esp) + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#ifdef DOUBLE +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 24 + STACK + ARGS(%esp) +#define A 32 + STACK + ARGS(%esp) +#define LDA 36 + STACK + ARGS(%esp) +#define X 40 + STACK + ARGS(%esp) +#define INCX 44 + STACK + ARGS(%esp) +#define Y 48 + STACK + ARGS(%esp) +#define INCY 52 + STACK + ARGS(%esp) +#define BUFFER 56 + STACK + ARGS(%esp) +#else +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 20 + STACK + ARGS(%esp) +#define A 24 + STACK + ARGS(%esp) +#define LDA 28 + STACK + ARGS(%esp) +#define X 32 + STACK + ARGS(%esp) +#define INCX 36 + STACK + ARGS(%esp) +#define Y 40 + STACK + ARGS(%esp) +#define INCY 44 + STACK + ARGS(%esp) +#define BUFFER 48 + STACK + ARGS(%esp) +#endif + + PROLOGUE + + subl $ARGS, %esp + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + FLD ALPHA_I + FLD ALPHA_R + + movl X, %edi # X + + movl $0, IS + + movl M, %ebx + movl N, %ecx + testl %ebx, %ebx + jle .L79 + + testl %ecx, %ecx + jle .L79 + + movl INCX, %esi + addl %esi, %esi + leal (,%esi,SIZE), %esi + movl %esi, INCX + + movl INCY, %esi + addl %esi, %esi + leal (, %esi, SIZE), %esi + movl %esi, INCY + + movl LDA, %ebx + + movl N, %eax + imull %ebx, %eax + movl $P, %esi + subl %eax, %esi + leal (, %esi, SIZE), %esi + addl %esi, %esi + movl %esi, NLDA + + leal (,%ebx,SIZE), %esi + addl %esi, %esi + movl %esi, LDA + ALIGN_2 + +.L32: + movl IS, %esi + + movl $P, %edx + movl M, %eax + subl %esi, %eax + cmpl %edx, %eax +#ifdef PENTIUM + jle .L33 + movl %edx, %eax +.L33: +#else + cmovg %edx, %eax +#endif + movl %eax, MIN_M + + movl IS, %ecx + addl %ecx, %ecx + leal (%edi,%ecx,SIZE), %ecx # xp = x + is + movl INCX, %ebx + movl %ecx, XP + cmpl $2 * SIZE, %ebx + je .L34 + + movl BUFFER, %esi + movl MIN_M, %eax + movl %esi, XP + sarl $1, %eax + jle .L35 + + ALIGN_3 + +.L36: + FLD 0 * SIZE(%edi) + FLD 1 * SIZE(%edi) + addl %ebx,%edi # x += incx + FLD 0 * SIZE(%edi) + FLD 1 * SIZE(%edi) + addl %ebx,%edi # x += incx + + FST 3 * SIZE(%esi) + FST 2 * SIZE(%esi) + FST 1 * SIZE(%esi) + FST 0 * SIZE(%esi) + + addl $4 * SIZE, %esi # xp += 4 + decl %eax + jg .L36 + ALIGN_3 + +.L35: + movl MIN_M, %eax + andl $1,%eax + jle .L34 + + FLD 0 * SIZE(%edi) + FLD 1 * SIZE(%edi) + addl %ebx,%edi # x += incx + FST 1 * SIZE(%esi) + FST 0 * SIZE(%esi) + ALIGN_3 + +/* Main Routine */ + +.L34: + movl Y, %ebp # coffset = y + + movl N, %ecx + testl %ecx, %ecx + jle .L60 + ALIGN_2 + +.L61: + movl A, %ebx # a_offset = a + fldz # ct1 = ZERO + movl LDA, %edx + fldz # ct1 = ZERO + + addl %ebx, %edx + fldz # ct1 = ZERO + movl %edx, A + fldz # ct1 = ZERO + + movl XP, %esi + + FLD (%esi) # bt1 = *(b_offset + 0) + + movl MIN_M, %eax + sarl $1, %eax + jle .L64 + ALIGN_3 + +#define PRESIZE 8 + +.L65: +#ifdef HAS_PREFETCH + prefetcht0 PRESIZE * SIZE(%ebx) + prefetcht0 PRESIZE * SIZE(%esi) +#endif + + FLD 0 * SIZE(%ebx) # at1 = *(a_offset + 0) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + + FMUL 1 * SIZE(%ebx) # bt1 *= *(a_offset + 1) +#ifndef CONJ + faddp %st, %st(2) # ct2 += bt1 +#else + fsubrp %st, %st(2) # ct2 -= bt1 
+#endif + FLD 1 * SIZE(%esi) # bt1 = *(b_offset + 1) + + FLD 0 * SIZE(%ebx) # at1 = *(a_offset + 0) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + + FMUL 1 * SIZE(%ebx) # bt1 *= *(a_offset + 1) + faddp %st, %st(4) # ct4 += bt1 + FLD 2 * SIZE(%esi) # bt1 = *(b_offset + 1) + + FLD 2 * SIZE(%ebx) # at1 = *(a_offset + 0) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + + FMUL 3 * SIZE(%ebx) # bt1 *= *(a_offset + 1) +#ifndef CONJ + faddp %st, %st(2) # ct2 += bt1 +#else + fsubrp %st, %st(2) # ct2 -= bt1 +#endif + FLD 3 * SIZE(%esi) # bt1 = *(b_offset + 1) + + FLD 2 * SIZE(%ebx) # at1 = *(a_offset + 0) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + + FMUL 3 * SIZE(%ebx) # bt1 *= *(a_offset + 1) + faddp %st, %st(4) # ct4 += bt1 + FLD 4 * SIZE(%esi) # bt1 = *(b_offset + 1) + + addl $4 * SIZE, %esi + addl $4 * SIZE, %ebx + decl %eax + jg .L65 + ALIGN_3 + +.L64: + movl MIN_M, %eax + andl $1, %eax + jle .L70 + ALIGN_3 + +.L71: + FLD 0 * SIZE(%ebx) # at1 = *(a_offset + 0) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + + FMUL 1 * SIZE(%ebx) # bt1 *= *(a_offset + 1) +#ifndef CONJ + faddp %st, %st(2) # ct2 += bt1 +#else + fsubrp %st, %st(2) # ct2 -= bt1 +#endif + FLD 1 * SIZE(%esi) # bt1 = *(b_offset + 1) + + FLD 0 * SIZE(%ebx) # at1 = *(a_offset + 0) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + + FMUL 1 * SIZE(%ebx) # bt1 *= *(a_offset + 1) + faddp %st, %st(4) # ct4 += bt1 + fldz + ALIGN_3 + +.L70: +#ifndef C_SUN + ffreep %st(0) +#else + .byte 0xdf + .byte 0xc0 +#endif + +#ifndef XCONJ +#ifndef CONJ + fsubp %st, %st(3) + faddp %st, %st(1) +#else + faddp %st, %st(3) + faddp %st, %st(1) +#endif +#else +#ifndef CONJ + faddp %st, %st(3) + fsubp %st, %st(1) +#else + fsubp %st, %st(3) + fsubp %st, %st(1) +#endif +#endif + + fld %st(0) # ct4 = ct2 + fmul %st(4), %st + fld %st(2) + fmul %st(4), %st + fsubp %st, %st(1) + + FADD 0 * SIZE(%ebp) + FST 0 * SIZE(%ebp) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + faddp %st, %st(1) + + FADD 1 * SIZE(%ebp) + FST 1 * SIZE(%ebp) + addl INCY, %ebp + + decl %ecx + jg .L61 + ALIGN_3 + +.L60: + movl A, %ebx + addl NLDA, %ebx + movl %ebx, A + + addl $P, IS + movl M, %esi + cmpl %esi, IS + jl .L32 + ALIGN_3 + +.L79: +#ifndef C_SUN + ffreep %st(0) + ffreep %st(0) +#else + .byte 0xdf + .byte 0xc0 + .byte 0xdf + .byte 0xc0 +#endif + + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE + diff --git a/kernel/x86/zgemv_t_atom.S b/kernel/x86/zgemv_t_atom.S new file mode 100644 index 0000000000..6f0dee0b6f --- /dev/null +++ b/kernel/x86/zgemv_t_atom.S @@ -0,0 +1,445 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef ATOM +#define PREFETCH prefetchnta +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (8 * 6) +#endif + +#define STACKSIZE 16 + +#define M 4 + STACKSIZE(%esp) +#define N 8 + STACKSIZE(%esp) +#define ALPHA_R 16 + STACKSIZE(%esp) +#define ALPHA_I 24 + STACKSIZE(%esp) +#define A 32 + STACKSIZE(%esp) +#define STACK_LDA 36 + STACKSIZE(%esp) +#define STACK_X 40 + STACKSIZE(%esp) +#define STACK_INCX 44 + STACKSIZE(%esp) +#define Y 48 + STACKSIZE(%esp) +#define STACK_INCY 52 + STACKSIZE(%esp) +#define BUFFER 56 + STACKSIZE(%esp) + +#define I %eax +#define J %ebx + +#define INCX J +#define INCY %ecx + +#define A1 %esi +#define X %edx +#define Y1 %edi +#define LDA %ebp + +#if !defined(CONJ) && !defined(XCONJ) +#define ADD1 addsd +#define ADD2 addsd +#define ADD3 subsd +#define ADD4 addsd +#endif + +#if defined(CONJ) && !defined(XCONJ) +#define ADD1 addsd +#define ADD2 addsd +#define ADD3 addsd +#define ADD4 subsd +#endif + +#if !defined(CONJ) && defined(XCONJ) +#define ADD1 addsd +#define ADD2 subsd +#define ADD3 addsd +#define ADD4 addsd +#endif + +#if defined(CONJ) && defined(XCONJ) +#define ADD1 addsd +#define ADD2 subsd +#define ADD3 subsd +#define ADD4 subsd +#endif + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_LDA, LDA + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_INCY, INCY + + sall $ZBASE_SHIFT, INCX + sall $ZBASE_SHIFT, INCY + sall $ZBASE_SHIFT, LDA + + subl $-16 * SIZE, A + + cmpl $0, N + jle .L999 + cmpl $0, M + jle .L999 + + movl BUFFER, Y1 + + movl M, I + sarl $2, I + jle .L05 + ALIGN_4 + +.L02: + movsd 0 * SIZE(X), %xmm0 + movhpd 1 * SIZE(X), %xmm0 + addl INCX, X + + movsd 0 * SIZE(X), %xmm1 + movhpd 1 * SIZE(X), %xmm1 + addl INCX, X + + movsd 0 * SIZE(X), %xmm2 + movhpd 1 * SIZE(X), %xmm2 + addl INCX, X + + movsd 0 * SIZE(X), %xmm3 + movhpd 1 * SIZE(X), %xmm3 + addl INCX, X + + movapd %xmm0, 0 * SIZE(Y1) + movapd %xmm1, 2 * SIZE(Y1) + movapd %xmm2, 4 * SIZE(Y1) + movapd %xmm3, 6 * SIZE(Y1) + + addl $8 * SIZE, Y1 + decl I + jg .L02 + ALIGN_4 + +.L05: + movl M, I + andl $3, I + jle .L10 + ALIGN_2 + +.L06: + movsd 0 * SIZE(X), %xmm0 + movhpd 1 * SIZE(X), %xmm0 + addl INCX, X + + movapd %xmm0, 0 * SIZE(Y1) + addl $2 * SIZE, Y1 + decl I + jg .L06 + ALIGN_4 + +.L10: + movl Y, Y1 + + movl N, J + 
ALIGN_3 + +.L11: + movl BUFFER, X + addl $16 * SIZE, X + + movl A, A1 + addl LDA, A + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + + movsd -16 * SIZE(X), %xmm2 + movsd -15 * SIZE(X), %xmm3 + + movl M, I + sarl $2, I + jle .L15 + + movsd -16 * SIZE(A1), %xmm4 + movsd -15 * SIZE(A1), %xmm5 + + movapd %xmm4, %xmm6 + mulsd %xmm2, %xmm4 + mulsd %xmm3, %xmm6 + + decl I + jle .L13 + ALIGN_4 + +.L12: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(A1) +#endif + + movapd %xmm5, %xmm7 + mulsd %xmm3, %xmm5 + movsd -13 * SIZE(X), %xmm3 + ADD1 %xmm4, %xmm0 + movsd -14 * SIZE(A1), %xmm4 + mulsd %xmm2, %xmm7 + movsd -14 * SIZE(X), %xmm2 + ADD2 %xmm6, %xmm1 + + movapd %xmm4, %xmm6 + mulsd %xmm2, %xmm4 + ADD3 %xmm5, %xmm0 + movsd -13 * SIZE(A1), %xmm5 + mulsd %xmm3, %xmm6 + ADD4 %xmm7, %xmm1 + + movapd %xmm5, %xmm7 + mulsd %xmm3, %xmm5 + movsd -11 * SIZE(X), %xmm3 + ADD1 %xmm4, %xmm0 + movsd -12 * SIZE(A1), %xmm4 + mulsd %xmm2, %xmm7 + movsd -12 * SIZE(X), %xmm2 + ADD2 %xmm6, %xmm1 + + movapd %xmm4, %xmm6 + mulsd %xmm2, %xmm4 + ADD3 %xmm5, %xmm0 + movsd -11 * SIZE(A1), %xmm5 + mulsd %xmm3, %xmm6 + ADD4 %xmm7, %xmm1 + + movapd %xmm5, %xmm7 + mulsd %xmm3, %xmm5 + movsd -9 * SIZE(X), %xmm3 + ADD1 %xmm4, %xmm0 + movsd -10 * SIZE(A1), %xmm4 + mulsd %xmm2, %xmm7 + movsd -10 * SIZE(X), %xmm2 + ADD2 %xmm6, %xmm1 + + movapd %xmm4, %xmm6 + mulsd %xmm2, %xmm4 + ADD3 %xmm5, %xmm0 + movsd -9 * SIZE(A1), %xmm5 + mulsd %xmm3, %xmm6 + ADD4 %xmm7, %xmm1 + + movapd %xmm5, %xmm7 + mulsd %xmm3, %xmm5 + movsd -7 * SIZE(X), %xmm3 + ADD1 %xmm4, %xmm0 + movsd -8 * SIZE(A1), %xmm4 + mulsd %xmm2, %xmm7 + movsd -8 * SIZE(X), %xmm2 + ADD2 %xmm6, %xmm1 + + movapd %xmm4, %xmm6 + mulsd %xmm2, %xmm4 + ADD3 %xmm5, %xmm0 + movsd -7 * SIZE(A1), %xmm5 + mulsd %xmm3, %xmm6 + ADD4 %xmm7, %xmm1 + + addl $8 * SIZE, A1 + addl $8 * SIZE, X + + decl I + jg .L12 + ALIGN_4 + +.L13: + movapd %xmm5, %xmm7 + mulsd %xmm3, %xmm5 + movsd -13 * SIZE(X), %xmm3 + ADD1 %xmm4, %xmm0 + movsd -14 * SIZE(A1), %xmm4 + mulsd %xmm2, %xmm7 + movsd -14 * SIZE(X), %xmm2 + ADD2 %xmm6, %xmm1 + + movapd %xmm4, %xmm6 + mulsd %xmm2, %xmm4 + ADD3 %xmm5, %xmm0 + movsd -13 * SIZE(A1), %xmm5 + mulsd %xmm3, %xmm6 + ADD4 %xmm7, %xmm1 + + movapd %xmm5, %xmm7 + mulsd %xmm3, %xmm5 + movsd -11 * SIZE(X), %xmm3 + ADD1 %xmm4, %xmm0 + movsd -12 * SIZE(A1), %xmm4 + mulsd %xmm2, %xmm7 + movsd -12 * SIZE(X), %xmm2 + ADD2 %xmm6, %xmm1 + + movapd %xmm4, %xmm6 + mulsd %xmm2, %xmm4 + ADD3 %xmm5, %xmm0 + movsd -11 * SIZE(A1), %xmm5 + mulsd %xmm3, %xmm6 + ADD4 %xmm7, %xmm1 + + movapd %xmm5, %xmm7 + mulsd %xmm3, %xmm5 + movsd -9 * SIZE(X), %xmm3 + ADD1 %xmm4, %xmm0 + movsd -10 * SIZE(A1), %xmm4 + mulsd %xmm2, %xmm7 + movsd -10 * SIZE(X), %xmm2 + ADD2 %xmm6, %xmm1 + + movapd %xmm4, %xmm6 + mulsd %xmm2, %xmm4 + ADD3 %xmm5, %xmm0 + movsd -9 * SIZE(A1), %xmm5 + mulsd %xmm3, %xmm6 + ADD4 %xmm7, %xmm1 + + movapd %xmm5, %xmm7 + mulsd %xmm3, %xmm5 + movsd -7 * SIZE(X), %xmm3 + ADD1 %xmm4, %xmm0 + mulsd %xmm2, %xmm7 + movsd -8 * SIZE(X), %xmm2 + ADD2 %xmm6, %xmm1 + + ADD3 %xmm5, %xmm0 + ADD4 %xmm7, %xmm1 + + addl $8 * SIZE, A1 + addl $8 * SIZE, X + ALIGN_4 + +.L15: + testl $2, M + jle .L17 + + movsd -16 * SIZE(A1), %xmm4 + movsd -15 * SIZE(A1), %xmm5 + + movapd %xmm4, %xmm6 + mulsd %xmm2, %xmm4 + mulsd %xmm3, %xmm6 + + movapd %xmm5, %xmm7 + mulsd %xmm3, %xmm5 + movsd -13 * SIZE(X), %xmm3 + ADD1 %xmm4, %xmm0 + movsd -14 * SIZE(A1), %xmm4 + mulsd %xmm2, %xmm7 + movsd -14 * SIZE(X), %xmm2 + ADD2 %xmm6, %xmm1 + + movapd %xmm4, %xmm6 + mulsd %xmm2, %xmm4 + ADD3 %xmm5, %xmm0 + movsd -13 * SIZE(A1), %xmm5 + mulsd 
%xmm3, %xmm6 + ADD4 %xmm7, %xmm1 + + movapd %xmm5, %xmm7 + mulsd %xmm3, %xmm5 + movsd -11 * SIZE(X), %xmm3 + ADD1 %xmm4, %xmm0 + mulsd %xmm2, %xmm7 + movsd -12 * SIZE(X), %xmm2 + ADD2 %xmm6, %xmm1 + + ADD3 %xmm5, %xmm0 + ADD4 %xmm7, %xmm1 + + addl $4 * SIZE, A1 + ALIGN_4 + +.L17: + testl $1, M + jle .L18 + + movsd -16 * SIZE(A1), %xmm4 + movsd -15 * SIZE(A1), %xmm5 + + movapd %xmm4, %xmm6 + mulsd %xmm2, %xmm4 + mulsd %xmm3, %xmm6 + + movapd %xmm5, %xmm7 + mulsd %xmm3, %xmm5 + ADD1 %xmm4, %xmm0 + mulsd %xmm2, %xmm7 + ADD2 %xmm6, %xmm1 + + ADD3 %xmm5, %xmm0 + ADD4 %xmm7, %xmm1 + ALIGN_4 + +.L18: + movsd 0 * SIZE(Y1), %xmm4 + movapd %xmm0, %xmm2 + mulsd ALPHA_R, %xmm0 + movsd 1 * SIZE(Y1), %xmm5 + movapd %xmm1, %xmm3 + mulsd ALPHA_R, %xmm1 + + mulsd ALPHA_I, %xmm2 + mulsd ALPHA_I, %xmm3 + + addsd %xmm2, %xmm1 + subsd %xmm3, %xmm0 + addsd %xmm4, %xmm0 + addsd %xmm5, %xmm1 + + movlpd %xmm0, 0 * SIZE(Y1) + movlpd %xmm1, 1 * SIZE(Y1) + + addl INCY, Y1 + + decl J + jg .L11 + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/zgemv_t_sse.S b/kernel/x86/zgemv_t_sse.S new file mode 100644 index 0000000000..4312ed1730 --- /dev/null +++ b/kernel/x86/zgemv_t_sse.S @@ -0,0 +1,522 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef movsd +#undef movsd +#endif + +#ifdef PENTIUM3 +#ifdef HAVE_SSE +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 2) +#endif +#define movsd movlps +#endif + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 2) +#endif + +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 7) +#endif + +#ifdef OPTERON +#define PREFETCH prefetchnta +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 3) +#define movsd movlps +#endif + +#ifdef BARCELONA +#define PREFETCH prefetchnta +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 5) +#endif + +#ifdef ATOM +#define PREFETCH prefetchnta +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 6) +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (16 * 4) +#endif + +#define STACKSIZE 16 + +#define M 4 + STACKSIZE(%esp) +#define N 8 + STACKSIZE(%esp) +#define ALPHA_R 16 + STACKSIZE(%esp) +#define ALPHA_I 20 + STACKSIZE(%esp) +#define A 24 + STACKSIZE(%esp) +#define STACK_LDA 28 + STACKSIZE(%esp) +#define STACK_X 32 + STACKSIZE(%esp) +#define STACK_INCX 36 + STACKSIZE(%esp) +#define Y 40 + STACKSIZE(%esp) +#define STACK_INCY 44 + STACKSIZE(%esp) +#define BUFFER 48 + STACKSIZE(%esp) + +#define I %eax +#define J %ebx + +#define INCX J +#define INCY %ecx + +#define A1 %esi +#define X %edx +#define Y1 %edi +#define LDA %ebp + +#undef SUBPS + +#ifndef CONJ +#define SUBPS addps +#else +#define SUBPS subps +#endif + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_LDA, LDA + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_INCY, INCY + + sall $ZBASE_SHIFT, INCX + sall $ZBASE_SHIFT, LDA + sall $ZBASE_SHIFT, INCY + + subl $-32 * SIZE, A + + cmpl $0, N + jle .L999 + cmpl $0, M + jle .L999 + + movl BUFFER, Y1 + + movl M, I + sarl $2, I + jle .L05 + ALIGN_4 + +.L02: + movsd (X), %xmm0 + addl INCX, X + movhps (X), %xmm0 + addl INCX, X + + movsd (X), %xmm1 + addl INCX, X + movhps (X), %xmm1 + addl INCX, X + + movsd (X), %xmm2 + addl INCX, X + movhps (X), %xmm2 + addl INCX, X + + movsd (X), %xmm3 + addl INCX, X + movhps (X), %xmm3 + addl INCX, X + + movaps %xmm0, 0 * SIZE(Y1) + movaps %xmm1, 4 * SIZE(Y1) + movaps %xmm2, 8 * SIZE(Y1) + movaps %xmm3, 12 * SIZE(Y1) + + addl $16 * SIZE, Y1 + decl I + jg .L02 + ALIGN_4 + +.L05: + movl M, I + andl $3, I + jle .L10 + ALIGN_2 + +.L06: + movsd (X), %xmm0 + addl INCX, X + + movlps %xmm0, (Y1) + addl $2 * SIZE, Y1 + decl I + jg .L06 + ALIGN_4 + +.L10: + movl Y, Y1 + + movl N, J + ALIGN_3 + +.L11: + movl BUFFER, X + addl $32 * SIZE, X + + movl A, A1 + addl LDA, A + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + + movaps -32 * SIZE(X), %xmm2 + movaps -28 * SIZE(X), %xmm3 + + movl M, I + sarl $3, I + jle .L15 + + movsd -32 * SIZE(A1), %xmm4 + movhps -30 * SIZE(A1), %xmm4 + movsd -28 * SIZE(A1), %xmm6 + movhps -26 * SIZE(A1), %xmm6 + + decl I + jle .L13 + ALIGN_4 + +.L12: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(A1) +#endif + +#ifdef HAVE_SSE2 + pshufd $0xb1, %xmm4, %xmm5 +#else + movaps %xmm4, %xmm5 + shufps $0xb1, %xmm5, %xmm5 +#endif + mulps %xmm2, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm2, %xmm5 + movaps -24 * SIZE(X), %xmm2 + SUBPS %xmm5, %xmm1 + +#ifdef HAVE_SSE2 + pshufd $0xb1, %xmm6, %xmm7 +#else + movaps %xmm6, %xmm7 + 
shufps $0xb1, %xmm7, %xmm7 +#endif + mulps %xmm3, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm3, %xmm7 + movaps -20 * SIZE(X), %xmm3 + SUBPS %xmm7, %xmm1 + + movsd -24 * SIZE(A1), %xmm4 + movhps -22 * SIZE(A1), %xmm4 + movsd -20 * SIZE(A1), %xmm6 + movhps -18 * SIZE(A1), %xmm6 + +#ifdef HAVE_SSE2 + pshufd $0xb1, %xmm4, %xmm5 +#else + movaps %xmm4, %xmm5 + shufps $0xb1, %xmm5, %xmm5 +#endif + mulps %xmm2, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm2, %xmm5 + movaps -16 * SIZE(X), %xmm2 + SUBPS %xmm5, %xmm1 + +#ifdef HAVE_SSE2 + pshufd $0xb1, %xmm6, %xmm7 +#else + movaps %xmm6, %xmm7 + shufps $0xb1, %xmm7, %xmm7 +#endif + mulps %xmm3, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm3, %xmm7 + movaps -12 * SIZE(X), %xmm3 + SUBPS %xmm7, %xmm1 + + movsd -16 * SIZE(A1), %xmm4 + movhps -14 * SIZE(A1), %xmm4 + movsd -12 * SIZE(A1), %xmm6 + movhps -10 * SIZE(A1), %xmm6 + + addl $16 * SIZE, A1 + addl $16 * SIZE, X + + decl I + jg .L12 + ALIGN_4 + +.L13: +#ifdef HAVE_SSE2 + pshufd $0xb1, %xmm4, %xmm5 +#else + movaps %xmm4, %xmm5 + shufps $0xb1, %xmm5, %xmm5 +#endif + mulps %xmm2, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm2, %xmm5 + movaps -24 * SIZE(X), %xmm2 + SUBPS %xmm5, %xmm1 + +#ifdef HAVE_SSE2 + pshufd $0xb1, %xmm6, %xmm7 +#else + movaps %xmm6, %xmm7 + shufps $0xb1, %xmm7, %xmm7 +#endif + mulps %xmm3, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm3, %xmm7 + movaps -20 * SIZE(X), %xmm3 + SUBPS %xmm7, %xmm1 + + movsd -24 * SIZE(A1), %xmm4 + movhps -22 * SIZE(A1), %xmm4 + movsd -20 * SIZE(A1), %xmm6 + movhps -18 * SIZE(A1), %xmm6 + +#ifdef HAVE_SSE2 + pshufd $0xb1, %xmm4, %xmm5 +#else + movaps %xmm4, %xmm5 + shufps $0xb1, %xmm5, %xmm5 +#endif + mulps %xmm2, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm2, %xmm5 + movaps -16 * SIZE(X), %xmm2 + SUBPS %xmm5, %xmm1 + +#ifdef HAVE_SSE2 + pshufd $0xb1, %xmm6, %xmm7 +#else + movaps %xmm6, %xmm7 + shufps $0xb1, %xmm7, %xmm7 +#endif + mulps %xmm3, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm3, %xmm7 + movaps -12 * SIZE(X), %xmm3 + SUBPS %xmm7, %xmm1 + + addl $16 * SIZE, A1 + addl $16 * SIZE, X + ALIGN_4 + +.L15: + testl $4, M + jle .L17 + + movsd -32 * SIZE(A1), %xmm4 + movhps -30 * SIZE(A1), %xmm4 + movsd -28 * SIZE(A1), %xmm6 + movhps -26 * SIZE(A1), %xmm6 + +#ifdef HAVE_SSE2 + pshufd $0xb1, %xmm4, %xmm5 +#else + movaps %xmm4, %xmm5 + shufps $0xb1, %xmm5, %xmm5 +#endif + mulps %xmm2, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm2, %xmm5 + movaps -24 * SIZE(X), %xmm2 + SUBPS %xmm5, %xmm1 + +#ifdef HAVE_SSE2 + pshufd $0xb1, %xmm6, %xmm7 +#else + movaps %xmm6, %xmm7 + shufps $0xb1, %xmm7, %xmm7 +#endif + mulps %xmm3, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm3, %xmm7 + movaps -20 * SIZE(X), %xmm3 + SUBPS %xmm7, %xmm1 + + addl $8 * SIZE, A1 + addl $8 * SIZE, X + ALIGN_4 + +.L17: + testl $2, M + jle .L18 + + movsd -32 * SIZE(A1), %xmm4 + movhps -30 * SIZE(A1), %xmm4 + +#ifdef HAVE_SSE2 + pshufd $0xb1, %xmm4, %xmm5 +#else + movaps %xmm4, %xmm5 + shufps $0xb1, %xmm5, %xmm5 +#endif + mulps %xmm2, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm2, %xmm5 + SUBPS %xmm5, %xmm1 + movaps %xmm3, %xmm2 + + addl $4 * SIZE, A1 + ALIGN_4 + +.L18: + testl $1, M + jle .L19 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(A1), %xmm4 + shufps $0x44, %xmm2, %xmm2 + +#ifdef HAVE_SSE2 + pshufd $0xb1, %xmm4, %xmm5 +#else + movaps %xmm4, %xmm5 + shufps $0xb1, %xmm5, %xmm5 +#endif + mulps %xmm2, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm2, %xmm5 + SUBPS %xmm5, %xmm1 + ALIGN_4 + +.L19: +#ifdef HAVE_SSE2 + pcmpeqb %xmm5, %xmm5 + psllq $63, %xmm5 +#else + subl $8, %esp + movl $0x00000000, 0(%esp) + movl $0x80000000, 4(%esp) + 
movlps (%esp), %xmm5 + addl $8, %esp + movlhps %xmm5, %xmm5 +#endif + +#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) + xorps %xmm5, %xmm0 +#else + xorps %xmm5, %xmm1 +#endif + +#ifdef HAVE_SSE3 + haddps %xmm1, %xmm0 + haddps %xmm0, %xmm0 +#else + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + addps %xmm2, %xmm0 + + movhlps %xmm0, %xmm1 + + addps %xmm1, %xmm0 +#endif + +#ifdef HAVE_SSE2 + pshufd $0xb1, %xmm0, %xmm1 +#else + movaps %xmm0, %xmm1 + shufps $0xb1, %xmm1, %xmm1 +#endif + + movsd ALPHA_R, %xmm7 + movlhps %xmm7, %xmm7 + + mulps %xmm7, %xmm0 + mulps %xmm7, %xmm1 + + xorps %xmm5, %xmm0 + +#ifdef HAVE_SSE3 + haddps %xmm1, %xmm0 +#else + movaps %xmm0, %xmm2 + shufps $0x88, %xmm1, %xmm0 + shufps $0xdd, %xmm1, %xmm2 + + addps %xmm2, %xmm0 +#endif + + movsd 0 * SIZE(Y1), %xmm4 + + shufps $0xd8, %xmm0, %xmm0 + addps %xmm4, %xmm0 + + movlps %xmm0, 0 * SIZE(Y1) + addl INCY, Y1 + + decl J + jg .L11 + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/zgemv_t_sse2.S b/kernel/x86/zgemv_t_sse2.S new file mode 100644 index 0000000000..78ca14cab9 --- /dev/null +++ b/kernel/x86/zgemv_t_sse2.S @@ -0,0 +1,404 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (8 * 2) +#endif + +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (8 * 7) +#endif + +#ifdef OPTERON +#define PREFETCH prefetchnta +#define PREFETCHW prefetchw +#define PREFETCHSIZE (8 * 3) +#define movsd movlps +#endif + +#ifdef BARCELONA +#define PREFETCH prefetchnta +#define PREFETCHW prefetchw +#define PREFETCHSIZE (8 * 5) +#endif + +#ifdef ATOM +#define PREFETCH prefetchnta +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (8 * 6) +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 4) +#endif + +#define STACKSIZE 16 + +#define M 4 + STACKSIZE(%esp) +#define N 8 + STACKSIZE(%esp) +#define ALPHA_R 16 + STACKSIZE(%esp) +#define ALPHA_I 24 + STACKSIZE(%esp) +#define A 32 + STACKSIZE(%esp) +#define STACK_LDA 36 + STACKSIZE(%esp) +#define STACK_X 40 + STACKSIZE(%esp) +#define STACK_INCX 44 + STACKSIZE(%esp) +#define Y 48 + STACKSIZE(%esp) +#define STACK_INCY 52 + STACKSIZE(%esp) +#define BUFFER 56 + STACKSIZE(%esp) + +#define I %eax +#define J %ebx + +#define INCX J +#define INCY %ecx + +#define A1 %esi +#define X %edx +#define Y1 %edi +#define LDA %ebp + +#undef SUBPD + +#ifndef CONJ +#define SUBPD addpd +#else +#define SUBPD subpd +#endif + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_LDA, LDA + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_INCY, INCY + + sall $ZBASE_SHIFT, INCX + sall $ZBASE_SHIFT, INCY + sall $ZBASE_SHIFT, LDA + + subl $-16 * SIZE, A + + cmpl $0, N + jle .L999 + cmpl $0, M + jle .L999 + + movl BUFFER, Y1 + + movl M, I + sarl $2, I + jle .L05 + ALIGN_4 + +.L02: + movsd 0 * SIZE(X), %xmm0 + movhpd 1 * SIZE(X), %xmm0 + addl INCX, X + + movsd 0 * SIZE(X), %xmm1 + movhpd 1 * SIZE(X), %xmm1 + addl INCX, X + + movsd 0 * SIZE(X), %xmm2 + movhpd 1 * SIZE(X), %xmm2 + addl INCX, X + + movsd 0 * SIZE(X), %xmm3 + movhpd 1 * SIZE(X), %xmm3 + addl INCX, X + + movapd %xmm0, 0 * SIZE(Y1) + movapd %xmm1, 2 * SIZE(Y1) + movapd %xmm2, 4 * SIZE(Y1) + movapd %xmm3, 6 * SIZE(Y1) + + addl $8 * SIZE, Y1 + decl I + jg .L02 + ALIGN_4 + +.L05: + movl M, I + andl $3, I + jle .L10 + ALIGN_2 + +.L06: + movsd 0 * SIZE(X), %xmm0 + movhpd 1 * SIZE(X), %xmm0 + addl INCX, X + + movapd %xmm0, 0 * SIZE(Y1) + addl $2 * SIZE, Y1 + decl I + jg .L06 + ALIGN_4 + +.L10: + movl Y, Y1 + + movl N, J + ALIGN_3 + +.L11: + movl BUFFER, X + addl $16 * SIZE, X + + movl A, A1 + addl LDA, A + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + + movapd -16 * SIZE(X), %xmm2 + movapd -14 * SIZE(X), %xmm3 + + movl M, I + sarl $2, I + jle .L15 + + movsd -16 * SIZE(A1), %xmm4 + movhpd -15 * SIZE(A1), %xmm4 + movsd -14 * SIZE(A1), %xmm6 + movhpd -13 * SIZE(A1), %xmm6 + + decl I + jle .L13 + ALIGN_4 + +.L12: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(A1) +#endif + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm2, %xmm4 + addpd %xmm4, %xmm0 + movsd -12 * SIZE(A1), %xmm4 + movhpd -11 * SIZE(A1), %xmm4 + mulpd %xmm2, %xmm5 + SUBPD %xmm5, %xmm1 + movapd -12 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm3, %xmm6 + addpd %xmm6, %xmm0 + movsd -10 * SIZE(A1), %xmm6 + movhpd -9 * SIZE(A1), %xmm6 + mulpd %xmm3, %xmm7 + SUBPD %xmm7, %xmm1 + movapd -10 * SIZE(X), %xmm3 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm2, %xmm4 + addpd 
%xmm4, %xmm0 + movsd -8 * SIZE(A1), %xmm4 + movhpd -7 * SIZE(A1), %xmm4 + mulpd %xmm2, %xmm5 + movapd -8 * SIZE(X), %xmm2 + SUBPD %xmm5, %xmm1 + + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm3, %xmm6 + addpd %xmm6, %xmm0 + movsd -6 * SIZE(A1), %xmm6 + movhpd -5 * SIZE(A1), %xmm6 + mulpd %xmm3, %xmm7 + movapd -6 * SIZE(X), %xmm3 + SUBPD %xmm7, %xmm1 + + addl $8 * SIZE, A1 + addl $8 * SIZE, X + + decl I + jg .L12 + ALIGN_4 + +.L13: + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm2, %xmm4 + addpd %xmm4, %xmm0 + movsd -12 * SIZE(A1), %xmm4 + movhpd -11 * SIZE(A1), %xmm4 + mulpd %xmm2, %xmm5 + SUBPD %xmm5, %xmm1 + movapd -12 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm3, %xmm6 + addpd %xmm6, %xmm0 + movsd -10 * SIZE(A1), %xmm6 + movhpd -9 * SIZE(A1), %xmm6 + mulpd %xmm3, %xmm7 + SUBPD %xmm7, %xmm1 + movapd -10 * SIZE(X), %xmm3 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm2, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm2, %xmm5 + movapd -8 * SIZE(X), %xmm2 + SUBPD %xmm5, %xmm1 + + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm3, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm3, %xmm7 + movapd -6 * SIZE(X), %xmm3 + SUBPD %xmm7, %xmm1 + + addl $8 * SIZE, A1 + addl $8 * SIZE, X + ALIGN_4 + +.L15: + testl $2, M + jle .L17 + + movsd -16 * SIZE(A1), %xmm4 + movhpd -15 * SIZE(A1), %xmm4 + movsd -14 * SIZE(A1), %xmm6 + movhpd -13 * SIZE(A1), %xmm6 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm2, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm2, %xmm5 + movapd -12 * SIZE(X), %xmm2 + SUBPD %xmm5, %xmm1 + + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm3, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm3, %xmm7 + SUBPD %xmm7, %xmm1 + + addl $4 * SIZE, A1 + ALIGN_4 + +.L17: + testl $1, M + jle .L18 + + movsd -16 * SIZE(A1), %xmm4 + movhpd -15 * SIZE(A1), %xmm4 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm2, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm2, %xmm5 + SUBPD %xmm5, %xmm1 + ALIGN_4 + +.L18: + pcmpeqb %xmm5, %xmm5 + psllq $63, %xmm5 + shufps $0xc0, %xmm5, %xmm5 + +#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) + xorpd %xmm5, %xmm0 +#else + xorpd %xmm5, %xmm1 +#endif + +#ifdef HAVE_SSE3 + haddpd %xmm1, %xmm0 +#else + movapd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm2 + + addpd %xmm2, %xmm0 +#endif + + pshufd $0x4e, %xmm0, %xmm1 + +#ifdef HAVE_SSE3 + movddup ALPHA_R, %xmm6 + movddup ALPHA_I, %xmm7 +#else + movsd ALPHA_R, %xmm6 + movsd ALPHA_I, %xmm7 + + unpcklpd %xmm6, %xmm6 + unpcklpd %xmm7, %xmm7 +#endif + + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm1 + + xorpd %xmm5, %xmm1 + + subpd %xmm1, %xmm0 + + movsd 0 * SIZE(Y1), %xmm4 + movhpd 1 * SIZE(Y1), %xmm4 + + addpd %xmm4, %xmm0 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + addl INCY, Y1 + + decl J + jg .L11 + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/znrm2.S b/kernel/x86/znrm2.S new file mode 100644 index 0000000000..c645b57eff --- /dev/null +++ b/kernel/x86/znrm2.S @@ -0,0 +1,228 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 8 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) + +#define M %edx +#define X %ecx +#define INCX %esi + +#define I %eax + +#include "l1param.h" + + PROLOGUE + + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + +#ifdef F_INTERFACE + movl (M), %ebx + movl (INCX), INCX +#endif + + fldz + testl M, M + jle .L999 + testl INCX, INCX + jle .L999 + + sall $ZBASE_SHIFT, INCX + fldz + fldz + fldz + cmpl $SIZE * 2, INCX + jne .L40 + + movl M, I + sarl $2, I + jle .L20 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) + fmul %st(0), %st + FLD 1 * SIZE(X) + fmul %st(0), %st + FLD 2 * SIZE(X) + fmul %st(0), %st + FLD 3 * SIZE(X) + fmul %st(0), %st + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD 4 * SIZE(X) + fmul %st(0), %st + FLD 5 * SIZE(X) + fmul %st(0), %st + FLD 6 * SIZE(X) + fmul %st(0), %st + FLD 7 * SIZE(X) + fmul %st(0), %st + + addl $8 * SIZE, X + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decl I + jg .L10 + ALIGN_4 + +.L20: + movl M, I + andl $3, I + jle .L998 + ALIGN_4 + + +.L21: + FLD 0 * SIZE(X) + fmul %st(0), %st + FLD 1 * SIZE(X) + fmul %st(0), %st + faddp %st,%st(3) + faddp %st,%st(1) + addl $2 * SIZE, X + decl I + jg .L21 + jmp .L998 + ALIGN_4 + +.L40: + movl M, I + sarl $2, I + jle .L60 + ALIGN_4 + +.L50: + FLD 0 * SIZE(X) + fmul %st(0), %st + FLD 1 * SIZE(X) + addl INCX, X + fmul %st(0), %st + FLD 0 * SIZE(X) + fmul %st(0), %st + FLD 1 * SIZE(X) + addl INCX, X + fmul %st(0), %st + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD 0 * SIZE(X) + fmul %st(0), %st + FLD 1 * SIZE(X) + addl INCX, X + fmul %st(0), %st + FLD 0 * SIZE(X) + fmul %st(0), %st + FLD 1 * SIZE(X) + addl INCX, X + fmul %st(0), %st + + faddp %st, %st(7) + 
faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decl I + jg .L50 + ALIGN_4 + +.L60: + movl M, I + andl $3, I + jle .L998 + ALIGN_4 + + +.L61: + FLD 0 * SIZE(X) + fmul %st(0), %st + FLD 1 * SIZE(X) + addl INCX, X + fmul %st(0), %st + faddp %st,%st(3) + faddp %st,%st(1) + decl I + jg .L61 + ALIGN_4 + +.L998: + faddp %st,%st(2) + faddp %st,%st(1) + faddp %st,%st(1) + ALIGN_4 + +.L999: + fsqrt + popl %ebx + popl %esi + ret + + EPILOGUE diff --git a/kernel/x86/znrm2_sse.S b/kernel/x86/znrm2_sse.S new file mode 100644 index 0000000000..95ca9fda49 --- /dev/null +++ b/kernel/x86/znrm2_sse.S @@ -0,0 +1,465 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 8 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) + +#define M %edx +#define X %ecx +#define INCX %esi + +#define I %eax + +#include "l1param.h" + + PROLOGUE + PROFCODE + + pushl %esi + pushl %ebx + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + + pxor %xmm0, %xmm0 + testl M, M + jle .L999 + pxor %xmm1, %xmm1 + testl INCX, INCX + jle .L999 + + sall $ZBASE_SHIFT, INCX + + cmpl $2 * SIZE, INCX + jne .L40 + + addl M, M + + subl $-32 * SIZE, X + + testl $SIZE, X + je .L05 + + movss -32 * SIZE(X), %xmm0 + cvtss2sd %xmm0, %xmm0 + mulsd %xmm0, %xmm0 + + addl $SIZE, X + decl M + jle .L998 + ALIGN_3 + +.L05: + movl M, I + sarl $4, I + jle .L13 + + movsd -32 * SIZE(X), %xmm4 + movsd -30 * SIZE(X), %xmm5 + movsd -28 * SIZE(X), %xmm6 + movsd -26 * SIZE(X), %xmm7 + + decl I + jle .L12 + ALIGN_3 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + cvtps2pd %xmm4, %xmm2 + movsd -24 * SIZE(X), %xmm4 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + cvtps2pd %xmm5, %xmm3 + movsd -22 * SIZE(X), %xmm5 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + + cvtps2pd %xmm6, %xmm2 + movsd -20 * SIZE(X), %xmm6 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + cvtps2pd %xmm7, %xmm3 + movsd -18 * SIZE(X), %xmm7 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + + cvtps2pd %xmm4, %xmm2 + movsd -16 * SIZE(X), %xmm4 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + cvtps2pd %xmm5, %xmm3 + movsd -14 * SIZE(X), %xmm5 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + + cvtps2pd %xmm6, %xmm2 + movsd -12 * SIZE(X), %xmm6 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + cvtps2pd %xmm7, %xmm3 + movsd -10 * SIZE(X), %xmm7 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + + subl $-16 * SIZE, X + decl I + jg .L10 + ALIGN_3 + +.L12: + cvtps2pd %xmm4, %xmm2 + movsd -24 * SIZE(X), %xmm4 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + cvtps2pd %xmm5, %xmm3 + movsd -22 * SIZE(X), %xmm5 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + + cvtps2pd %xmm6, %xmm2 + movsd -20 * SIZE(X), %xmm6 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + cvtps2pd %xmm7, %xmm3 + movsd -18 * SIZE(X), %xmm7 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + + cvtps2pd %xmm4, %xmm2 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + cvtps2pd %xmm5, %xmm3 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + + cvtps2pd %xmm6, %xmm2 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + cvtps2pd %xmm7, %xmm3 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + + subl $-16 * SIZE, X + ALIGN_4 + +.L13: + testl $8, M + je .L14 + + movsd -32 * SIZE(X), %xmm4 + + cvtps2pd %xmm4, %xmm2 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + movsd -30 * SIZE(X), %xmm5 + + cvtps2pd %xmm5, %xmm3 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + + movsd -28 * SIZE(X), %xmm6 + + cvtps2pd %xmm6, %xmm2 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + movsd -26 * SIZE(X), %xmm7 + + cvtps2pd %xmm7, %xmm3 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + + addl $8 * SIZE, X + ALIGN_3 + +.L14: + testl $4, M + je .L15 + + movsd -32 * SIZE(X), %xmm4 + cvtps2pd %xmm4, %xmm2 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + movsd -30 * SIZE(X), %xmm5 + cvtps2pd %xmm5, %xmm3 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + addl $4 * SIZE, X + ALIGN_3 + +.L15: + testl $2, M + je .L16 + + movsd -32 * SIZE(X), %xmm4 + cvtps2pd %xmm4, %xmm2 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + addl $2 * SIZE, X + ALIGN_3 + +.L16: + testl $1, M + je 
.L998 + + movss -32 * SIZE(X), %xmm4 + cvtss2sd %xmm4, %xmm2 + mulsd %xmm2, %xmm2 + addsd %xmm2, %xmm1 + jmp .L998 + ALIGN_4 + +.L40: + movl M, I + sarl $3, I + jle .L43 + + movsd (X), %xmm4 + addl INCX, X + movsd (X), %xmm5 + addl INCX, X + movsd (X), %xmm6 + addl INCX, X + movsd (X), %xmm7 + addl INCX, X + + decl I + jle .L42 + ALIGN_3 + +.L41: + cvtps2pd %xmm4, %xmm2 + movsd (X), %xmm4 + addl INCX, X + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + cvtps2pd %xmm5, %xmm3 + movsd (X), %xmm5 + addl INCX, X + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + + cvtps2pd %xmm6, %xmm2 + movsd (X), %xmm6 + addl INCX, X + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + cvtps2pd %xmm7, %xmm3 + movsd (X), %xmm7 + addl INCX, X + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + + cvtps2pd %xmm4, %xmm2 + movsd (X), %xmm4 + addl INCX, X + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + cvtps2pd %xmm5, %xmm3 + movsd (X), %xmm5 + addl INCX, X + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + + cvtps2pd %xmm6, %xmm2 + movsd (X), %xmm6 + addl INCX, X + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + cvtps2pd %xmm7, %xmm3 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + movsd (X), %xmm7 + addl INCX, X + + decl I + jg .L41 + ALIGN_3 + +.L42: + cvtps2pd %xmm4, %xmm2 + movsd (X), %xmm4 + addl INCX, X + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + cvtps2pd %xmm5, %xmm3 + movsd (X), %xmm5 + addl INCX, X + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + + cvtps2pd %xmm6, %xmm2 + movsd (X), %xmm6 + addl INCX, X + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + cvtps2pd %xmm7, %xmm3 + movsd (X), %xmm7 + addl INCX, X + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + + cvtps2pd %xmm4, %xmm2 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + cvtps2pd %xmm5, %xmm3 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + + cvtps2pd %xmm6, %xmm2 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + cvtps2pd %xmm7, %xmm3 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + ALIGN_4 + +.L43: + testl $4, M + je .L44 + + movsd (X), %xmm4 + addl INCX, X + + cvtps2pd %xmm4, %xmm2 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + movsd (X), %xmm5 + addl INCX, X + + cvtps2pd %xmm5, %xmm3 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + + movsd (X), %xmm6 + addl INCX, X + + cvtps2pd %xmm6, %xmm2 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + movsd (X), %xmm7 + addl INCX, X + + cvtps2pd %xmm7, %xmm3 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + ALIGN_3 + +.L44: + testl $2, M + je .L45 + + movsd (X), %xmm4 + addl INCX, X + cvtps2pd %xmm4, %xmm2 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + + movsd (X), %xmm5 + addl INCX, X + cvtps2pd %xmm5, %xmm3 + mulpd %xmm3, %xmm3 + addpd %xmm3, %xmm1 + ALIGN_3 + +.L45: + testl $1, M + je .L998 + + movsd (X), %xmm4 + cvtps2pd %xmm4, %xmm2 + mulpd %xmm2, %xmm2 + addpd %xmm2, %xmm0 + ALIGN_4 + +.L998: + addpd %xmm1, %xmm0 + +#ifndef HAVE_SSE3 + movapd %xmm0, %xmm1 + unpckhpd %xmm0, %xmm0 + addsd %xmm1, %xmm0 +#else + haddpd %xmm0, %xmm0 +#endif + ALIGN_4 + +.L999: + sqrtsd %xmm0, %xmm0 + + cvtsd2ss %xmm0, %xmm0 + + movss %xmm0, STACK_M + flds STACK_M + + popl %ebx + popl %esi + ret + + EPILOGUE diff --git a/kernel/x86/zrot.S b/kernel/x86/zrot.S new file mode 100644 index 0000000000..7ac984e879 --- /dev/null +++ b/kernel/x86/zrot.S @@ -0,0 +1,407 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define STACK_N 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) +#define STACK_Y 16 + STACK + ARGS(%esp) +#define STACK_INCY 20 + STACK + ARGS(%esp) +#define STACK_C 24 + STACK + ARGS(%esp) +#ifdef XDOUBLE +#define STACK_S 40 + STACK + ARGS(%esp) +#elif defined DOUBLE +#define STACK_S 32 + STACK + ARGS(%esp) +#else +#define STACK_S 28 + STACK + ARGS(%esp) +#endif + +#define N %ebx +#define X %esi +#define INCX %ecx +#define Y %edi +#define INCY %edx + +#define I %eax + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCH_SIZE 144 +#endif + +#ifdef OPTERON +#define PREFETCH prefetchw +#define PREFETCH_SIZE 144 +#endif + + PROLOGUE + + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + movl STACK_N, N + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + + FLD STACK_S + FLD STACK_C + + sall $ZBASE_SHIFT, INCX + sall $ZBASE_SHIFT, INCY + + testl N, N + jle .L999 + + cmpl $2 * SIZE, INCX + jne .L50 + cmpl $2 * SIZE, INCY + jne .L50 + + movl N, I + sarl $1, I + jle .L15 + ALIGN_4 + +.L10: +#ifdef PENTIUM4 + PREFETCH (PREFETCH_SIZE + 0) * SIZE(X) +#endif +#ifdef OPTERON + PREFETCH (PREFETCH_SIZE + 0) * SIZE(X) +#endif + + FLD 0 * SIZE(X) + FLD 0 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 0 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 1 * SIZE(X) + FLD 1 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 1 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 1 * SIZE(Y) + +#ifdef PENTIUM4 + PREFETCH (PREFETCH_SIZE + 0) * SIZE(Y) +#endif +#ifdef OPTERON + 
PREFETCH (PREFETCH_SIZE + 0) * SIZE(Y) +#endif + + FLD 2 * SIZE(X) + FLD 2 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 2 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 2 * SIZE(Y) + + FLD 3 * SIZE(X) + FLD 3 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 3 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 3 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + + decl I + jg .L10 + ALIGN_4 + +.L15: + movl N, I + andl $1, I + jle .L999 + ALIGN_4 + +.L16: + FLD 0 * SIZE(X) + FLD 0 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 0 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 1 * SIZE(X) + FLD 1 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 1 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 1 * SIZE(Y) + jmp .L999 + ALIGN_4 + +.L50: + movl N, I + sarl $1, I + jle .L55 + ALIGN_4 + +.L51: + FLD 0 * SIZE(X) + FLD 0 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 0 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 1 * SIZE(X) + FLD 1 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 1 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 1 * SIZE(Y) + + addl INCX, X + addl INCY, Y + + FLD 0 * SIZE(X) + FLD 0 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 0 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 1 * SIZE(X) + FLD 1 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 1 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 1 * SIZE(Y) + + addl INCX, X + addl INCY, Y + + decl I + jg .L51 + ALIGN_4 + +.L55: + movl N, I + andl $1, I + jle .L999 + ALIGN_4 + +.L56: + FLD 0 * SIZE(X) + FLD 0 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 0 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 1 * SIZE(X) + FLD 1 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 1 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 1 * SIZE(Y) + ALIGN_4 + +.L999: + ffreep %st(0) + ffreep %st(0) + + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86/zrot_sse.S b/kernel/x86/zrot_sse.S new file mode 100644 index 0000000000..d8d01009e1 --- /dev/null +++ b/kernel/x86/zrot_sse.S @@ -0,0 +1,1391 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define STACK_N 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) +#define STACK_Y 16 + STACK + ARGS(%esp) +#define STACK_INCY 20 + STACK + ARGS(%esp) +#define STACK_C 24 + STACK + ARGS(%esp) +#define STACK_S 28 + STACK + ARGS(%esp) + +#define N %ebx +#define X %esi +#define INCX %ecx +#define Y %edi +#define INCY %edx + +#define I %eax + +#include "l1param.h" + +#define C %xmm6 +#define S %xmm7 + + PROLOGUE + + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_N, N + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + + sall $ZBASE_SHIFT, INCX + sall $ZBASE_SHIFT, INCY + + movss STACK_C, C + movss STACK_S, S + + shufps $0x0, C, C + shufps $0x0, S, S + + cmpl $0, N + jle .L999 + + cmpl $2 * SIZE, INCX + jne .L50 + cmpl $2 * SIZE, INCY + jne .L50 + + testl $2 * SIZE, X + je .L10 + +#ifndef HAVE_SSE2 + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 +#endif + + movsd 0 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + addl $2 * SIZE, X + addl $2 * SIZE, Y + decl N + jle .L999 + +.L10: + testl $1 * SIZE, X + jne .L30 + + testl $3 * SIZE, Y + jne .L20 + + movl N, I + sarl $4, I + jle .L14 + ALIGN_3 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps 0 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 0 * SIZE(X) + movaps %xmm2, 0 * SIZE(Y) + + movaps 4 * SIZE(Y), %xmm1 + movaps 4 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, 
%xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 4 * SIZE(X) + movaps %xmm2, 4 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps 8 * SIZE(Y), %xmm1 + movaps 8 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 8 * SIZE(X) + movaps %xmm2, 8 * SIZE(Y) + + movaps 12 * SIZE(Y), %xmm1 + movaps 12 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 12 * SIZE(X) + movaps %xmm2, 12 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps 16 * SIZE(Y), %xmm1 + movaps 16 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 16 * SIZE(X) + movaps %xmm2, 16 * SIZE(Y) + + movaps 20 * SIZE(Y), %xmm1 + movaps 20 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 20 * SIZE(X) + movaps %xmm2, 20 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps 24 * SIZE(Y), %xmm1 + movaps 24 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 24 * SIZE(X) + movaps %xmm2, 24 * SIZE(Y) + + movaps 28 * SIZE(Y), %xmm1 + movaps 28 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 28 * SIZE(X) + movaps %xmm2, 28 * SIZE(Y) + + addl $32 * SIZE, X + addl $32 * SIZE, Y + + decl I + jg .L11 + ALIGN_3 + +.L14: + testl $15, N + jle .L999 + + testl $8, N + jle .L15 + + movaps 0 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 0 * SIZE(X) + movaps %xmm2, 0 * SIZE(Y) + + movaps 4 * SIZE(Y), %xmm1 + movaps 4 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 4 * SIZE(X) + movaps %xmm2, 4 * SIZE(Y) + + movaps 8 * SIZE(Y), %xmm1 + movaps 8 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 8 * SIZE(X) + movaps %xmm2, 8 * SIZE(Y) + + movaps 12 * SIZE(Y), %xmm1 + movaps 12 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 12 * SIZE(X) + movaps %xmm2, 12 * SIZE(Y) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L15: + testl $4, N + jle .L16 + + movaps 0 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, 
%xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 0 * SIZE(X) + movaps %xmm2, 0 * SIZE(Y) + + movaps 4 * SIZE(Y), %xmm1 + movaps 4 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 4 * SIZE(X) + movaps %xmm2, 4 * SIZE(Y) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L16: + testl $2, N + jle .L17 + + movaps 0 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 0 * SIZE(X) + movaps %xmm2, 0 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L17: + testl $1, N + jle .L999 + +#ifndef HAVE_SSE2 + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L20: + movl N, I + sarl $4, I + jle .L24 + ALIGN_3 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + movhps %xmm2, 2 * SIZE(Y) + + movsd 4 * SIZE(Y), %xmm1 + movhps 6 * SIZE(Y), %xmm1 + movaps 4 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 4 * SIZE(X) + movlps %xmm2, 4 * SIZE(Y) + movhps %xmm2, 6 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movsd 8 * SIZE(Y), %xmm1 + movhps 10 * SIZE(Y), %xmm1 + movaps 8 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 8 * SIZE(X) + movlps %xmm2, 8 * SIZE(Y) + movhps %xmm2, 10 * SIZE(Y) + + movsd 12 * SIZE(Y), %xmm1 + movhps 14 * SIZE(Y), %xmm1 + movaps 12 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 12 * SIZE(X) + movlps %xmm2, 12 * SIZE(Y) + movhps %xmm2, 14 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movsd 16 * SIZE(Y), %xmm1 + movhps 18 * SIZE(Y), %xmm1 + movaps 16 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 16 * SIZE(X) + movlps %xmm2, 16 * SIZE(Y) + movhps %xmm2, 18 * SIZE(Y) + + movsd 20 * SIZE(Y), %xmm1 + movhps 22 * SIZE(Y), %xmm1 + movaps 20 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 20 * SIZE(X) + movlps %xmm2, 20 * SIZE(Y) + movhps %xmm2, 22 * 
SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movsd 24 * SIZE(Y), %xmm1 + movhps 26 * SIZE(Y), %xmm1 + movaps 24 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 24 * SIZE(X) + movlps %xmm2, 24 * SIZE(Y) + movhps %xmm2, 26 * SIZE(Y) + + movsd 28 * SIZE(Y), %xmm1 + movhps 30 * SIZE(Y), %xmm1 + movaps 28 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 28 * SIZE(X) + movlps %xmm2, 28 * SIZE(Y) + movhps %xmm2, 30 * SIZE(Y) + + addl $32 * SIZE, X + addl $32 * SIZE, Y + + decl I + jg .L21 + ALIGN_3 + +.L24: + testl $15, N + jle .L999 + + testl $8, N + jle .L25 + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + movhps %xmm2, 2 * SIZE(Y) + + movsd 4 * SIZE(Y), %xmm1 + movhps 6 * SIZE(Y), %xmm1 + movaps 4 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 4 * SIZE(X) + movlps %xmm2, 4 * SIZE(Y) + movhps %xmm2, 6 * SIZE(Y) + + movsd 8 * SIZE(Y), %xmm1 + movhps 10 * SIZE(Y), %xmm1 + movaps 8 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 8 * SIZE(X) + movlps %xmm2, 8 * SIZE(Y) + movhps %xmm2, 10 * SIZE(Y) + + movsd 12 * SIZE(Y), %xmm1 + movhps 14 * SIZE(Y), %xmm1 + movaps 12 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 12 * SIZE(X) + movlps %xmm2, 12 * SIZE(Y) + movhps %xmm2, 14 * SIZE(Y) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L25: + testl $4, N + jle .L26 + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + movhps %xmm2, 2 * SIZE(Y) + + movsd 4 * SIZE(Y), %xmm1 + movhps 6 * SIZE(Y), %xmm1 + movaps 4 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 4 * SIZE(X) + movlps %xmm2, 4 * SIZE(Y) + movhps %xmm2, 6 * SIZE(Y) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L26: + testl $2, N + jle .L27 + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + movhps %xmm2, 2 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L27: + testl $1, N + jle .L999 + + movsd 0 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 
+ + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L30: + movl N, I + sarl $4, I + jle .L34 + ALIGN_3 + +.L31: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + movhps 2 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(X) + movhps %xmm0, 2 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + movhps %xmm2, 2 * SIZE(Y) + + movsd 4 * SIZE(Y), %xmm1 + movhps 6 * SIZE(Y), %xmm1 + movsd 4 * SIZE(X), %xmm0 + movhps 6 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 4 * SIZE(X) + movhps %xmm0, 6 * SIZE(X) + movlps %xmm2, 4 * SIZE(Y) + movhps %xmm2, 6 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movsd 8 * SIZE(Y), %xmm1 + movhps 10 * SIZE(Y), %xmm1 + movsd 8 * SIZE(X), %xmm0 + movhps 10 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 8 * SIZE(X) + movhps %xmm0, 10 * SIZE(X) + movlps %xmm2, 8 * SIZE(Y) + movhps %xmm2, 10 * SIZE(Y) + + movsd 12 * SIZE(Y), %xmm1 + movhps 14 * SIZE(Y), %xmm1 + movsd 12 * SIZE(X), %xmm0 + movhps 14 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 12 * SIZE(X) + movhps %xmm0, 14 * SIZE(X) + movlps %xmm2, 12 * SIZE(Y) + movhps %xmm2, 14 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movsd 16 * SIZE(Y), %xmm1 + movhps 18 * SIZE(Y), %xmm1 + movsd 16 * SIZE(X), %xmm0 + movhps 18 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 16 * SIZE(X) + movhps %xmm0, 18 * SIZE(X) + movlps %xmm2, 16 * SIZE(Y) + movhps %xmm2, 18 * SIZE(Y) + + movsd 20 * SIZE(Y), %xmm1 + movhps 22 * SIZE(Y), %xmm1 + movsd 20 * SIZE(X), %xmm0 + movhps 22 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 20 * SIZE(X) + movhps %xmm0, 22 * SIZE(X) + movlps %xmm2, 20 * SIZE(Y) + movhps %xmm2, 22 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movsd 24 * SIZE(Y), %xmm1 + movhps 26 * SIZE(Y), %xmm1 + movsd 24 * SIZE(X), %xmm0 + movhps 26 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 24 * SIZE(X) + movhps %xmm0, 26 * SIZE(X) + movlps %xmm2, 24 * SIZE(Y) + movhps %xmm2, 26 * SIZE(Y) + + movsd 28 * SIZE(Y), %xmm1 + movhps 30 * SIZE(Y), %xmm1 + movsd 28 * SIZE(X), %xmm0 + movhps 30 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + 
+ mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 28 * SIZE(X) + movhps %xmm0, 30 * SIZE(X) + movlps %xmm2, 28 * SIZE(Y) + movhps %xmm2, 30 * SIZE(Y) + + addl $32 * SIZE, X + addl $32 * SIZE, Y + + decl I + jg .L31 + ALIGN_3 + +.L34: + testl $15, N + jle .L999 + + testl $8, N + jle .L35 + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + movhps 2 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(X) + movhps %xmm0, 2 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + movhps %xmm2, 2 * SIZE(Y) + + movsd 4 * SIZE(Y), %xmm1 + movhps 6 * SIZE(Y), %xmm1 + movsd 4 * SIZE(X), %xmm0 + movhps 6 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 4 * SIZE(X) + movhps %xmm0, 6 * SIZE(X) + movlps %xmm2, 4 * SIZE(Y) + movhps %xmm2, 6 * SIZE(Y) + + movsd 8 * SIZE(Y), %xmm1 + movhps 10 * SIZE(Y), %xmm1 + movsd 8 * SIZE(X), %xmm0 + movhps 10 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 8 * SIZE(X) + movhps %xmm0, 10 * SIZE(X) + movlps %xmm2, 8 * SIZE(Y) + movhps %xmm2, 10 * SIZE(Y) + + movsd 12 * SIZE(Y), %xmm1 + movhps 14 * SIZE(Y), %xmm1 + movsd 12 * SIZE(X), %xmm0 + movhps 14 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 12 * SIZE(X) + movhps %xmm0, 14 * SIZE(X) + movlps %xmm2, 12 * SIZE(Y) + movhps %xmm2, 14 * SIZE(Y) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L35: + testl $4, N + jle .L36 + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + movhps 2 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(X) + movhps %xmm0, 2 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + movhps %xmm2, 2 * SIZE(Y) + + movsd 4 * SIZE(Y), %xmm1 + movhps 6 * SIZE(Y), %xmm1 + movsd 4 * SIZE(X), %xmm0 + movhps 6 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 4 * SIZE(X) + movhps %xmm0, 6 * SIZE(X) + movlps %xmm2, 4 * SIZE(Y) + movhps %xmm2, 6 * SIZE(Y) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L36: + testl $2, N + jle .L37 + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + movhps 2 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(X) + movhps %xmm0, 2 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + movhps %xmm2, 2 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L37: + testl $1, N + jle .L999 + +#ifndef HAVE_SSE2 + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 +#endif + + movsd 0 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + 
mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + jmp .L999 + ALIGN_3 + ALIGN_3 + +.L50: + movl N, I + sarl $2, I + jle .L55 + ALIGN_3 + +.L53: + movsd (Y), %xmm1 + movhps (Y, INCY), %xmm1 + movsd (X), %xmm0 + movhps (X, INCX), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, (X) + movhps %xmm0, (X, INCX) + movlps %xmm2, (Y) + movhps %xmm2, (Y, INCY) + + leal (X, INCX, 2), X + leal (Y, INCY, 2), Y + + movsd (Y), %xmm1 + movhps (Y, INCY), %xmm1 + movsd (X), %xmm0 + movhps (X, INCX), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, (X) + movhps %xmm0, (X, INCX) + movlps %xmm2, (Y) + movhps %xmm2, (Y, INCY) + + leal (X, INCX, 2), X + leal (Y, INCY, 2), Y + + decl I + jg .L53 + ALIGN_3 + +.L55: +#ifndef HAVE_SSE2 + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 +#endif + + movl N, I + andl $3, I + jle .L999 + ALIGN_3 + +.L56: + movsd (Y), %xmm1 + movsd (X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, (X) + movlps %xmm2, (Y) + + addl INCX, X + addl INCY, Y + + decl I + jg .L56 + ALIGN_3 + +.L999: + popl %ebx + popl %esi + popl %edi + + ret + + EPILOGUE diff --git a/kernel/x86/zrot_sse2.S b/kernel/x86/zrot_sse2.S new file mode 100644 index 0000000000..7787f45498 --- /dev/null +++ b/kernel/x86/zrot_sse2.S @@ -0,0 +1,1665 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define STACK_N 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) +#define STACK_Y 16 + STACK + ARGS(%esp) +#define STACK_INCY 20 + STACK + ARGS(%esp) +#define STACK_C 24 + STACK + ARGS(%esp) +#define STACK_S 32 + STACK + ARGS(%esp) + +#define N %ebx +#define X %esi +#define INCX %ecx +#define Y %edi +#define INCY %edx + +#define I %eax + +#include "l1param.h" + +#define C %xmm6 +#define S %xmm7 + + PROLOGUE + + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_N, N + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + + sall $ZBASE_SHIFT, INCX + sall $ZBASE_SHIFT, INCY + + movsd STACK_C, C + movsd STACK_S, S + + pshufd $0x44, C, C + pshufd $0x44, S, S + + cmpl $0, N + jle .L999 + + cmpl $2 * SIZE, INCX + jne .L50 + cmpl $2 * SIZE, INCY + jne .L50 + +.L10: + testl $SIZE, X + jne .L30 + + testl $SIZE, Y + jne .L20 + + movl N, I + sarl $3, I + jle .L14 + ALIGN_3 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movapd 0 * SIZE(Y), %xmm1 + movapd 0 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 0 * SIZE(X) + movapd %xmm2, 0 * SIZE(Y) + + movapd 2 * SIZE(Y), %xmm1 + movapd 2 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 2 * SIZE(X) + movapd %xmm2, 2 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movapd 4 * SIZE(Y), %xmm1 + movapd 4 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 4 * SIZE(X) + movapd %xmm2, 4 * SIZE(Y) + + movapd 6 * SIZE(Y), %xmm1 + movapd 6 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 6 * SIZE(X) + movapd %xmm2, 6 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movapd 8 * SIZE(Y), %xmm1 + movapd 8 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 8 * SIZE(X) + movapd %xmm2, 8 * SIZE(Y) + + movapd 10 * SIZE(Y), %xmm1 + movapd 10 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 10 * SIZE(X) + movapd %xmm2, 10 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movapd 12 * SIZE(Y), %xmm1 + movapd 12 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + 
addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 12 * SIZE(X) + movapd %xmm2, 12 * SIZE(Y) + + movapd 14 * SIZE(Y), %xmm1 + movapd 14 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 14 * SIZE(X) + movapd %xmm2, 14 * SIZE(Y) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + + decl I + jg .L11 + ALIGN_3 + +.L14: + testl $7, N + jle .L999 + + testl $4, N + jle .L15 + + movapd 0 * SIZE(Y), %xmm1 + movapd 0 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 0 * SIZE(X) + movapd %xmm2, 0 * SIZE(Y) + + movapd 2 * SIZE(Y), %xmm1 + movapd 2 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 2 * SIZE(X) + movapd %xmm2, 2 * SIZE(Y) + + movapd 4 * SIZE(Y), %xmm1 + movapd 4 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 4 * SIZE(X) + movapd %xmm2, 4 * SIZE(Y) + + movapd 6 * SIZE(Y), %xmm1 + movapd 6 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 6 * SIZE(X) + movapd %xmm2, 6 * SIZE(Y) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L15: + testl $2, N + jle .L16 + + movapd 0 * SIZE(Y), %xmm1 + movapd 0 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 0 * SIZE(X) + movapd %xmm2, 0 * SIZE(Y) + + movapd 2 * SIZE(Y), %xmm1 + movapd 2 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 2 * SIZE(X) + movapd %xmm2, 2 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L16: + testl $1, N + jle .L999 + + movapd 0 * SIZE(Y), %xmm1 + movapd 0 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 0 * SIZE(X) + movapd %xmm2, 0 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L20: + movapd -1 * SIZE(Y), %xmm1 + + movl N, I + sarl $3, I + jle .L24 + ALIGN_3 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movapd 1 * SIZE(Y), %xmm4 + movapd 0 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm4, %xmm1 + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 0 * SIZE(X) + movlpd %xmm2, 0 * SIZE(Y) + movhpd %xmm2, 1 * SIZE(Y) + + movapd 3 * SIZE(Y), %xmm1 + movapd 2 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm1, %xmm4 + movapd %xmm4, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm4 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm4, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 2 * SIZE(X) + movlpd %xmm2, 2 * SIZE(Y) + movhpd %xmm2, 3 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) 
+#endif + + movapd 5 * SIZE(Y), %xmm4 + movapd 4 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm4, %xmm1 + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 4 * SIZE(X) + movlpd %xmm2, 4 * SIZE(Y) + movhpd %xmm2, 5 * SIZE(Y) + + movapd 7 * SIZE(Y), %xmm1 + movapd 6 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm1, %xmm4 + movapd %xmm4, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm4 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm4, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 6 * SIZE(X) + movlpd %xmm2, 6 * SIZE(Y) + movhpd %xmm2, 7 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movapd 9 * SIZE(Y), %xmm4 + movapd 8 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm4, %xmm1 + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 8 * SIZE(X) + movlpd %xmm2, 8 * SIZE(Y) + movhpd %xmm2, 9 * SIZE(Y) + + movapd 11 * SIZE(Y), %xmm1 + movapd 10 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm1, %xmm4 + movapd %xmm4, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm4 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm4, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 10 * SIZE(X) + movlpd %xmm2, 10 * SIZE(Y) + movhpd %xmm2, 11 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movapd 13 * SIZE(Y), %xmm4 + movapd 12 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm4, %xmm1 + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 12 * SIZE(X) + movlpd %xmm2, 12 * SIZE(Y) + movhpd %xmm2, 13 * SIZE(Y) + + movapd 15 * SIZE(Y), %xmm1 + movapd 14 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm1, %xmm4 + movapd %xmm4, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm4 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm4, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 14 * SIZE(X) + movlpd %xmm2, 14 * SIZE(Y) + movhpd %xmm2, 15 * SIZE(Y) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + decl I + jg .L21 + ALIGN_3 + +.L24: + testl $7, N + jle .L999 + + testl $4, N + jle .L25 + + movapd 1 * SIZE(Y), %xmm4 + movapd 0 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm4, %xmm1 + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 0 * SIZE(X) + movlpd %xmm2, 0 * SIZE(Y) + movhpd %xmm2, 1 * SIZE(Y) + + movapd 3 * SIZE(Y), %xmm1 + movapd 2 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm1, %xmm4 + movapd %xmm4, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm4 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm4, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 2 * SIZE(X) + movlpd %xmm2, 2 * SIZE(Y) + movhpd %xmm2, 3 * SIZE(Y) + + movapd 5 * SIZE(Y), %xmm4 + movapd 4 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm4, %xmm1 + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 4 * SIZE(X) + movlpd %xmm2, 4 * SIZE(Y) + movhpd %xmm2, 5 * SIZE(Y) + + movapd 7 * SIZE(Y), %xmm1 + movapd 6 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm1, %xmm4 + movapd %xmm4, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm4 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm4, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 6 * SIZE(X) + 
movlpd %xmm2, 6 * SIZE(Y) + movhpd %xmm2, 7 * SIZE(Y) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L25: + testl $2, N + jle .L26 + + movapd 1 * SIZE(Y), %xmm4 + movapd 0 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm4, %xmm1 + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 0 * SIZE(X) + movlpd %xmm2, 0 * SIZE(Y) + movhpd %xmm2, 1 * SIZE(Y) + + movapd 3 * SIZE(Y), %xmm1 + movapd 2 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm1, %xmm4 + movapd %xmm4, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm4 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm4, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 2 * SIZE(X) + movlpd %xmm2, 2 * SIZE(Y) + movhpd %xmm2, 3 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L26: + testl $1, N + jle .L999 + + movapd 1 * SIZE(Y), %xmm4 + movapd 0 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm4, %xmm1 + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 0 * SIZE(X) + movlpd %xmm2, 0 * SIZE(Y) + movhpd %xmm2, 1 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L30: + testl $SIZE, Y + jne .L40 + + movapd -1 * SIZE(X), %xmm0 + + movl N, I + sarl $3, I + jle .L34 + ALIGN_3 + +.L31: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movapd 1 * SIZE(X), %xmm4 + movapd 0 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm4, %xmm0 + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movlpd %xmm0, 0 * SIZE(X) + movhpd %xmm0, 1 * SIZE(X) + movapd %xmm2, 0 * SIZE(Y) + + movapd 3 * SIZE(X), %xmm0 + movapd 2 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm4 + movapd %xmm1, %xmm2 + movapd %xmm4, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm4 + subpd %xmm3, %xmm2 + + movlpd %xmm4, 2 * SIZE(X) + movhpd %xmm4, 3 * SIZE(X) + movapd %xmm2, 2 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movapd 5 * SIZE(X), %xmm4 + movapd 4 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm4, %xmm0 + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movlpd %xmm0, 4 * SIZE(X) + movhpd %xmm0, 5 * SIZE(X) + movapd %xmm2, 4 * SIZE(Y) + + movapd 7 * SIZE(X), %xmm0 + movapd 6 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm4 + movapd %xmm1, %xmm2 + movapd %xmm4, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm4 + subpd %xmm3, %xmm2 + + movlpd %xmm4, 6 * SIZE(X) + movhpd %xmm4, 7 * SIZE(X) + movapd %xmm2, 6 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movapd 9 * SIZE(X), %xmm4 + movapd 8 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm4, %xmm0 + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movlpd %xmm0, 8 * SIZE(X) + movhpd %xmm0, 9 * SIZE(X) + movapd %xmm2, 8 * SIZE(Y) + + movapd 11 * SIZE(X), %xmm0 + movapd 10 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm4 + movapd %xmm1, %xmm2 + movapd %xmm4, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm4 + subpd %xmm3, %xmm2 + + movlpd %xmm4, 10 * SIZE(X) + movhpd %xmm4, 11 * SIZE(X) + movapd 
%xmm2, 10 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movapd 13 * SIZE(X), %xmm4 + movapd 12 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm4, %xmm0 + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movlpd %xmm0, 12 * SIZE(X) + movhpd %xmm0, 13 * SIZE(X) + movapd %xmm2, 12 * SIZE(Y) + + movapd 15 * SIZE(X), %xmm0 + movapd 14 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm4 + movapd %xmm1, %xmm2 + movapd %xmm4, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm4 + subpd %xmm3, %xmm2 + + movlpd %xmm4, 14 * SIZE(X) + movhpd %xmm4, 15 * SIZE(X) + movapd %xmm2, 14 * SIZE(Y) + + addl $16 * SIZE, Y + addl $16 * SIZE, X + decl I + jg .L31 + ALIGN_3 + +.L34: + testl $7, N + jle .L999 + + testl $4, N + jle .L35 + + movapd 1 * SIZE(X), %xmm4 + movapd 0 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm4, %xmm0 + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movlpd %xmm0, 0 * SIZE(X) + movhpd %xmm0, 1 * SIZE(X) + movapd %xmm2, 0 * SIZE(Y) + + movapd 3 * SIZE(X), %xmm0 + movapd 2 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm4 + movapd %xmm1, %xmm2 + movapd %xmm4, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm4 + subpd %xmm3, %xmm2 + + movlpd %xmm4, 2 * SIZE(X) + movhpd %xmm4, 3 * SIZE(X) + movapd %xmm2, 2 * SIZE(Y) + + movapd 5 * SIZE(X), %xmm4 + movapd 4 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm4, %xmm0 + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movlpd %xmm0, 4 * SIZE(X) + movhpd %xmm0, 5 * SIZE(X) + movapd %xmm2, 4 * SIZE(Y) + + movapd 7 * SIZE(X), %xmm0 + movapd 6 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm4 + movapd %xmm1, %xmm2 + movapd %xmm4, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm4 + subpd %xmm3, %xmm2 + + movlpd %xmm4, 6 * SIZE(X) + movhpd %xmm4, 7 * SIZE(X) + movapd %xmm2, 6 * SIZE(Y) + + addl $8 * SIZE, Y + addl $8 * SIZE, X + ALIGN_3 + +.L35: + testl $2, N + jle .L36 + + movapd 1 * SIZE(X), %xmm4 + movapd 0 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm4, %xmm0 + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movlpd %xmm0, 0 * SIZE(X) + movhpd %xmm0, 1 * SIZE(X) + movapd %xmm2, 0 * SIZE(Y) + + movapd 3 * SIZE(X), %xmm0 + movapd 2 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm4 + movapd %xmm1, %xmm2 + movapd %xmm4, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm4 + subpd %xmm3, %xmm2 + + movlpd %xmm4, 2 * SIZE(X) + movhpd %xmm4, 3 * SIZE(X) + movapd %xmm2, 2 * SIZE(Y) + + addl $4 * SIZE, Y + addl $4 * SIZE, X + ALIGN_3 + +.L36: + testl $1, N + jle .L999 + + movapd 1 * SIZE(X), %xmm4 + movapd 0 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm4, %xmm0 + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movlpd %xmm0, 0 * SIZE(X) + movhpd %xmm0, 1 * SIZE(X) + movapd %xmm2, 0 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L40: + movsd 0 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulsd C, %xmm0 + mulsd S, %xmm1 + + 
mulsd C, %xmm2 + mulsd S, %xmm3 + + addsd %xmm1, %xmm0 + subsd %xmm3, %xmm2 + + movsd %xmm0, 0 * SIZE(X) + movsd %xmm2, 0 * SIZE(Y) + addl $1 * SIZE, Y + addl $1 * SIZE, X + + decl N + jle .L47 + + movl N, I + sarl $3, I + jle .L44 + ALIGN_3 + +.L41: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movapd 0 * SIZE(Y), %xmm1 + movapd 0 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 0 * SIZE(X) + movapd %xmm2, 0 * SIZE(Y) + + movapd 2 * SIZE(Y), %xmm1 + movapd 2 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 2 * SIZE(X) + movapd %xmm2, 2 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movapd 4 * SIZE(Y), %xmm1 + movapd 4 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 4 * SIZE(X) + movapd %xmm2, 4 * SIZE(Y) + + movapd 6 * SIZE(Y), %xmm1 + movapd 6 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 6 * SIZE(X) + movapd %xmm2, 6 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movapd 8 * SIZE(Y), %xmm1 + movapd 8 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 8 * SIZE(X) + movapd %xmm2, 8 * SIZE(Y) + + movapd 10 * SIZE(Y), %xmm1 + movapd 10 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 10 * SIZE(X) + movapd %xmm2, 10 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movapd 12 * SIZE(Y), %xmm1 + movapd 12 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 12 * SIZE(X) + movapd %xmm2, 12 * SIZE(Y) + + movapd 14 * SIZE(Y), %xmm1 + movapd 14 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 14 * SIZE(X) + movapd %xmm2, 14 * SIZE(Y) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + + decl I + jg .L41 + ALIGN_3 + +.L44: + testl $4, N + jle .L45 + + movapd 0 * SIZE(Y), %xmm1 + movapd 0 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 0 * SIZE(X) + movapd %xmm2, 0 * SIZE(Y) + + movapd 2 * SIZE(Y), %xmm1 + movapd 2 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 2 * SIZE(X) + movapd %xmm2, 2 * SIZE(Y) + + movapd 4 * SIZE(Y), %xmm1 + movapd 4 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + 
+ mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 4 * SIZE(X) + movapd %xmm2, 4 * SIZE(Y) + + movapd 6 * SIZE(Y), %xmm1 + movapd 6 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 6 * SIZE(X) + movapd %xmm2, 6 * SIZE(Y) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L45: + testl $2, N + jle .L46 + + movapd 0 * SIZE(Y), %xmm1 + movapd 0 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 0 * SIZE(X) + movapd %xmm2, 0 * SIZE(Y) + + movapd 2 * SIZE(Y), %xmm1 + movapd 2 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 2 * SIZE(X) + movapd %xmm2, 2 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L46: + testl $1, N + jle .L47 + + movapd 0 * SIZE(Y), %xmm1 + movapd 0 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 0 * SIZE(X) + movapd %xmm2, 0 * SIZE(Y) + + addl $2 * SIZE, Y + addl $2 * SIZE, X + ALIGN_3 + +.L47: + movsd 0 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulsd C, %xmm0 + mulsd S, %xmm1 + + mulsd C, %xmm2 + mulsd S, %xmm3 + + addsd %xmm1, %xmm0 + subsd %xmm3, %xmm2 + + movsd %xmm0, 0 * SIZE(X) + movsd %xmm2, 0 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L50: + movl N, I + sarl $2, I + jle .L55 + ALIGN_3 + +.L53: + movsd 0 * SIZE(Y), %xmm1 + movhpd 1 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + movhpd 1 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movlpd %xmm0, 0 * SIZE(X) + movhpd %xmm0, 1 * SIZE(X) + movlpd %xmm2, 0 * SIZE(Y) + movhpd %xmm2, 1 * SIZE(Y) + + addl INCX, X + addl INCY, Y + + movsd 0 * SIZE(Y), %xmm1 + movhpd 1 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + movhpd 1 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movlpd %xmm0, 0 * SIZE(X) + movhpd %xmm0, 1 * SIZE(X) + movlpd %xmm2, 0 * SIZE(Y) + movhpd %xmm2, 1 * SIZE(Y) + + addl INCX, X + addl INCY, Y + + movsd 0 * SIZE(Y), %xmm1 + movhpd 1 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + movhpd 1 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movlpd %xmm0, 0 * SIZE(X) + movhpd %xmm0, 1 * SIZE(X) + movlpd %xmm2, 0 * SIZE(Y) + movhpd %xmm2, 1 * SIZE(Y) + + addl INCX, X + addl INCY, Y + + movsd 0 * SIZE(Y), %xmm1 + movhpd 1 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + movhpd 1 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movlpd %xmm0, 0 * SIZE(X) + movhpd %xmm0, 1 * SIZE(X) + movlpd %xmm2, 0 * SIZE(Y) + movhpd %xmm2, 1 * SIZE(Y) + + addl INCX, X + addl INCY, Y + 
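+/* end of the 4x-unrolled strided rotation body: four complex
+   elements of X and Y have been updated in place as
+   x <- C*x + S*y, y <- C*y - S*x                                */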
+ decl I + jg .L53 + ALIGN_3 + +.L55: + movl N, I + andl $3, I + jle .L999 + ALIGN_3 + +.L56: + movsd 0 * SIZE(Y), %xmm1 + movhpd 1 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + movhpd 1 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movlpd %xmm0, 0 * SIZE(X) + movhpd %xmm0, 1 * SIZE(X) + movlpd %xmm2, 0 * SIZE(Y) + movhpd %xmm2, 1 * SIZE(Y) + + addl INCX, X + addl INCY, Y + + decl I + jg .L56 + ALIGN_3 + +.L999: + popl %ebx + popl %esi + popl %edi + + ret + + EPILOGUE diff --git a/kernel/x86/zscal.S b/kernel/x86/zscal.S new file mode 100644 index 0000000000..7505cea1a9 --- /dev/null +++ b/kernel/x86/zscal.S @@ -0,0 +1,318 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 8 + +#define STACK_N 4 + STACK(%esp) +#ifdef XDOUBLE +#define ALPHA_R 16 + STACK(%esp) +#define ALPHA_I 32 + STACK(%esp) +#define STACK_X 48 + STACK(%esp) +#define STACK_INCX 52 + STACK(%esp) +#elif defined(DOUBLE) +#define ALPHA_R 16 + STACK(%esp) +#define ALPHA_I 24 + STACK(%esp) +#define STACK_X 32 + STACK(%esp) +#define STACK_INCX 36 + STACK(%esp) +#else +#define ALPHA_R 16 + STACK(%esp) +#define ALPHA_I 20 + STACK(%esp) +#define STACK_X 24 + STACK(%esp) +#define STACK_INCX 28 + STACK(%esp) +#endif + +#define N %esi +#define X %edx +#define INCX %ebx + +#define I %ecx + + PROLOGUE + + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + movl STACK_N, N + movl STACK_X, X + movl STACK_INCX, INCX + + sall $ZBASE_SHIFT, INCX + + FLD ALPHA_R + FLD ALPHA_I + + testl N, N + jle .L999 + + fld %st(1) + fabs + fld %st(1) + fabs + faddp %st, %st(1) + + fldz + fcomip %st(1), %st + ffreep %st(0) + jne .L30 + + EMMS + + pxor %mm0, %mm0 + + cmpl $2 * SIZE, INCX + jne .L20 + + movl N, I + sarl $2, I + jle .L15 + ALIGN_4 + +.L12: +#ifdef XDOUBLE + movq %mm0, 0(X) + movq %mm0, 8(X) + movq %mm0, 16(X) + movq %mm0, 24(X) + movq %mm0, 32(X) + movq %mm0, 40(X) + movq %mm0, 48(X) + movq %mm0, 56(X) + movq %mm0, 64(X) + movq %mm0, 72(X) + movq %mm0, 80(X) + movq %mm0, 88(X) + movq %mm0, 96(X) + movq %mm0, 104(X) + movq %mm0, 112(X) + movq %mm0, 120(X) +#elif defined(DOUBLE) + movq %mm0, 0(X) + movq %mm0, 8(X) + movq %mm0, 16(X) + movq %mm0, 24(X) + movq %mm0, 32(X) + movq %mm0, 40(X) + movq %mm0, 48(X) + movq %mm0, 56(X) +#else + movq %mm0, 0(X) + movq %mm0, 8(X) + movq %mm0, 16(X) + movq %mm0, 24(X) +#endif + + addl $8 * SIZE, X + decl I + jg .L12 + ALIGN_3 + +.L15: + movl N, I + andl $3, I + jle .L18 + ALIGN_2 + +.L16: +#ifdef XDOUBLE + movq %mm0, 0(X) + movq %mm0, 8(X) + movq %mm0, 16(X) + movq %mm0, 24(X) +#elif defined(DOUBLE) + movq %mm0, 0(X) + movq %mm0, 8(X) +#else + movq %mm0, 0(X) +#endif + + addl $2 * SIZE, X + decl I + jg .L16 + +.L18: + EMMS + + xorl %eax, %eax + popl %ebx + popl %esi + ret + ALIGN_2 + +.L20: + movl N, I + sarl $2, I + jle .L25 + ALIGN_3 + +.L22: +#ifdef XDOUBLE + movq %mm0, 0(X) + movq %mm0, 8(X) + movq %mm0, 16(X) + movq %mm0, 24(X) + addl INCX, X + + movq %mm0, 0(X) + movq %mm0, 8(X) + movq %mm0, 16(X) + movq %mm0, 24(X) + addl INCX, X + + movq %mm0, 0(X) + movq %mm0, 8(X) + movq %mm0, 16(X) + movq %mm0, 24(X) + addl INCX, X + + movq %mm0, 0(X) + movq %mm0, 8(X) + movq %mm0, 16(X) + movq %mm0, 24(X) + addl INCX, X + +#elif defined(DOUBLE) + movq %mm0, 0(X) + movq %mm0, 8(X) + addl INCX, X + + movq %mm0, 0(X) + movq %mm0, 8(X) + addl INCX, X + + movq %mm0, 0(X) + movq %mm0, 8(X) + addl INCX, X + + movq %mm0, 0(X) + movq %mm0, 8(X) + addl INCX, X +#else + movq %mm0, 0(X) + addl INCX, X + + movq %mm0, 0(X) + addl INCX, X + + movq %mm0, 0(X) + addl INCX, X + + movq %mm0, 0(X) + addl INCX, X +#endif + + decl I + jg .L22 + ALIGN_3 + +.L25: + movl N, I + andl $3, I + jle .L28 + ALIGN_3 + +.L26: +#ifdef XDOUBLE + movq %mm0, 0(X) + movq %mm0, 8(X) + movq %mm0, 16(X) + movq %mm0, 24(X) + addl INCX, X +#elif defined(DOUBLE) + movq %mm0, 0(X) + movq %mm0, 8(X) + addl INCX, X +#else + movq %mm0, 0(X) + addl INCX, X +#endif + + decl I + jg .L26 + +.L28: + EMMS + + xorl %eax, %eax + popl %ebx + popl %esi + ret + ALIGN_3 + +.L30: + movl N, I + ALIGN_2 + +.L32: + FLD 0 * SIZE(X) + fmul %st(1),%st 
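+/* st0 = x_re * alpha_i; alpha_i (st1) and alpha_r (st2) remain on
+   the x87 stack while the imaginary and real parts of alpha * x
+   are accumulated                                                */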
+ FLD 1 * SIZE(X) + fmul %st(3),%st + faddp %st,%st(1) + + FLD 0 * SIZE(X) + fmul %st(3),%st + FLD 1 * SIZE(X) + fmul %st(3),%st + fsubrp %st,%st(1) + + FST 0 * SIZE(X) + FST 1 * SIZE(X) + addl INCX, X + decl I + jg .L32 + ALIGN_2 + +.L999: + ffreep %st(0) + ffreep %st(0) + + xorl %eax,%eax + popl %ebx + popl %esi + ret + + EPILOGUE diff --git a/kernel/x86/zscal_sse.S b/kernel/x86/zscal_sse.S new file mode 100644 index 0000000000..849d787f6c --- /dev/null +++ b/kernel/x86/zscal_sse.S @@ -0,0 +1,1389 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_ALPHA_R 16 + STACK + ARGS(%esp) +#define STACK_ALPHA_I 20 + STACK + ARGS(%esp) +#define STACK_X 24 + STACK + ARGS(%esp) +#define STACK_INCX 28 + STACK + ARGS(%esp) + +#define M %ebx +#define X %ecx +#define INCX %edx +#define I %esi +#define XX %edi +#define FLAG %ebp + +#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) +#define USE_PSHUFD +#else +#define USE_PSHUFD_HALF +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + + movss STACK_ALPHA_R, %xmm0 + movss STACK_ALPHA_I, %xmm1 + + sall $ZBASE_SHIFT, INCX + xor FLAG, FLAG + + testl M, M + jle .L999 + + xorps %xmm7, %xmm7 + comiss %xmm0, %xmm7 + jne .L100 # Alpha_r != ZERO + + comiss %xmm1, %xmm7 + jne .L100 # Alpha_i != ZERO + +/* Alpha == ZERO */ + cmpl $2 * SIZE, INCX + jne .L50 + +/* INCX == 1 */ + cmpl $3, M + jle .L13 + + testl $4, X + je .L05 + movss %xmm7, 0 * SIZE(X) + addl $SIZE, X + movl $1, FLAG + decl M + ALIGN_3 + +.L05: + testl $8, X + je .L06 + + movlps %xmm7, 0 * SIZE(X) + addl $2 * SIZE, X + subl $1, M + ALIGN_3 +.L06: + + movl M, I # rcx = n + sarl $3, I + jle .L12 + ALIGN_4 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm7, 0 * SIZE(X) + movaps %xmm7, 4 * SIZE(X) + movaps %xmm7, 8 * SIZE(X) + movaps %xmm7, 12 * SIZE(X) + addl $16 * SIZE, X + decl I + jg .L11 + ALIGN_4 + +.L12: + testl $7, M + je .L19 + testl $4, M + je .L13 + + movaps %xmm7, 0 * SIZE(X) + movaps %xmm7, 4 * SIZE(X) + addl $8 * SIZE, X + ALIGN_3 + +.L13: + testl $2, M + je .L14 + + movlps %xmm7, 0 * SIZE(X) + movhps %xmm7, 2 * SIZE(X) + addl $4 * SIZE, X + ALIGN_3 + +.L14: + testl $1, M + je .L19 + + movlps %xmm7, 0 * SIZE(X) + addl $2 * SIZE, X + ALIGN_3 + +.L19: + testl $1, FLAG + je .L999 + + movss %xmm7, 0 * SIZE(X) + jmp .L999 + ALIGN_4 + +/* incx != 1 */ +.L50: + movl M, I # rcx = n + sarl $2, I + jle .L52 + ALIGN_4 + +.L51: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd %xmm7, 0 * SIZE(X) + addl INCX, X + movsd %xmm7, 0 * SIZE(X) + addl INCX, X + movsd %xmm7, 0 * SIZE(X) + addl INCX, X + movsd %xmm7, 0 * SIZE(X) + addl INCX, X + decl I + jg .L51 + ALIGN_4 + +.L52: + testl $2, M + je .L53 + + movsd %xmm7, 0 * SIZE(X) + addl INCX, X + movsd %xmm7, 0 * SIZE(X) + addl INCX, X + ALIGN_3 + +.L53: + testl $1, M + je .L999 + + movsd %xmm7, 0 * SIZE(X) + jmp .L999 + ALIGN_4 + +/* Alpha != ZERO */ + +.L100: + testl $SIZE, X + jne .L130 + + cmpl $2 * SIZE, INCX + jne .L120 + + movaps %xmm0, %xmm6 + shufps $0, %xmm6, %xmm6 + shufps $0, %xmm1, %xmm1 + subps %xmm1, %xmm7 + unpcklps %xmm1, %xmm7 + + subl $-32 * SIZE, X + + testl $2 * SIZE, X + je .L105 + + movsd -32 * SIZE(X), %xmm0 + + PSHUFD2( $0xb1, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + + movlps %xmm0, -32 * SIZE(X) + addl $2 * SIZE, X + decl M + jle .L999 + ALIGN_3 + +.L105: + movl M, I + sarl $4, I + jle .L115 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + movaps -20 * SIZE(X), %xmm3 + + decl I + jle .L112 + ALIGN_4 + +.L111: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + PSHUFD2( $0xb1, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps 
%xmm0, -32 * SIZE(X) + movaps -16 * SIZE(X), %xmm0 + + PSHUFD2( $0xb1, %xmm1, %xmm5) + mulps %xmm6, %xmm1 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + movaps %xmm1, -28 * SIZE(X) + movaps -12 * SIZE(X), %xmm1 + + PSHUFD2( $0xb1, %xmm2, %xmm5) + mulps %xmm6, %xmm2 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm2 + movaps %xmm2, -24 * SIZE(X) + movaps -8 * SIZE(X), %xmm2 + + PSHUFD2( $0xb1, %xmm3, %xmm5) + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm3 + movaps %xmm3, -20 * SIZE(X) + movaps -4 * SIZE(X), %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + PSHUFD2( $0xb1, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps %xmm0, -16 * SIZE(X) + movaps 0 * SIZE(X), %xmm0 + + PSHUFD2( $0xb1, %xmm1, %xmm5) + mulps %xmm6, %xmm1 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + movaps %xmm1, -12 * SIZE(X) + movaps 4 * SIZE(X), %xmm1 + + PSHUFD2( $0xb1, %xmm2, %xmm5) + mulps %xmm6, %xmm2 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm2 + movaps %xmm2, -8 * SIZE(X) + movaps 8 * SIZE(X), %xmm2 + + PSHUFD2( $0xb1, %xmm3, %xmm5) + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm3 + movaps %xmm3, -4 * SIZE(X) + movaps 12 * SIZE(X), %xmm3 + + subl $-32 * SIZE, X + decl I + jg .L111 + ALIGN_4 + +.L112: + PSHUFD2( $0xb1, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps %xmm0, -32 * SIZE(X) + movaps -16 * SIZE(X), %xmm0 + + PSHUFD2( $0xb1, %xmm1, %xmm5) + mulps %xmm6, %xmm1 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + movaps %xmm1, -28 * SIZE(X) + movaps -12 * SIZE(X), %xmm1 + + PSHUFD2( $0xb1, %xmm2, %xmm5) + mulps %xmm6, %xmm2 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm2 + movaps %xmm2, -24 * SIZE(X) + movaps -8 * SIZE(X), %xmm2 + + PSHUFD2( $0xb1, %xmm3, %xmm5) + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm3 + movaps %xmm3, -20 * SIZE(X) + movaps -4 * SIZE(X), %xmm3 + + PSHUFD2( $0xb1, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps %xmm0, -16 * SIZE(X) + + PSHUFD2( $0xb1, %xmm1, %xmm5) + mulps %xmm6, %xmm1 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + movaps %xmm1, -12 * SIZE(X) + + PSHUFD2( $0xb1, %xmm2, %xmm5) + mulps %xmm6, %xmm2 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm2 + movaps %xmm2, -8 * SIZE(X) + + PSHUFD2( $0xb1, %xmm3, %xmm5) + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm3 + movaps %xmm3, -4 * SIZE(X) + + subl $-32 * SIZE, X + ALIGN_4 + +.L115: + testl $8, M + je .L116 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + + PSHUFD2( $0xb1, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps %xmm0, -32 * SIZE(X) + + PSHUFD2( $0xb1, %xmm1, %xmm5) + mulps %xmm6, %xmm1 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + movaps %xmm1, -28 * SIZE(X) + + movaps -24 * SIZE(X), %xmm2 + movaps -20 * SIZE(X), %xmm3 + + PSHUFD2( $0xb1, %xmm2, %xmm5) + mulps %xmm6, %xmm2 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm2 + movaps %xmm2, -24 * SIZE(X) + + PSHUFD2( $0xb1, %xmm3, %xmm5) + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm3 + movaps %xmm3, -20 * SIZE(X) + + addl $16 * SIZE, X + ALIGN_3 + +.L116: + testl $4, M + je .L117 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + + PSHUFD2( $0xb1, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps %xmm0, -32 * SIZE(X) + + PSHUFD2( $0xb1, %xmm1, %xmm5) + mulps %xmm6, %xmm1 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + movaps %xmm1, -28 * SIZE(X) + + addl $8 * SIZE, X + ALIGN_3 + +.L117: + testl $2, M + je 
.L118 + + movaps -32 * SIZE(X), %xmm0 + + PSHUFD2( $0xb1, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movaps %xmm0, -32 * SIZE(X) + + addl $4 * SIZE, X + ALIGN_3 + +.L118: + testl $1, M + je .L999 + + movsd -32 * SIZE(X), %xmm0 + + PSHUFD2( $0xb1, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + + movlps %xmm0, -32 * SIZE(X) + jmp .L999 + ALIGN_3 + +.L120: + PSHUFD2($0, %xmm0, %xmm6) + PSHUFD2($0, %xmm1, %xmm1) + subps %xmm1, %xmm7 + unpcklps %xmm1, %xmm7 + + movl X, XX + + movl M, I + sarl $3, I + jle .L125 + + movsd (X), %xmm0 + addl INCX, X + movhps (X), %xmm0 + addl INCX, X + + movsd (X), %xmm1 + addl INCX, X + movhps (X), %xmm1 + addl INCX, X + + movsd (X), %xmm2 + addl INCX, X + movhps (X), %xmm2 + addl INCX, X + + movsd (X), %xmm3 + addl INCX, X + movhps (X), %xmm3 + addl INCX, X + + decl I + jle .L122 + ALIGN_4 + +.L121: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + PSHUFD2( $0xb1, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + + movlps %xmm0, (XX) + addl INCX, XX + movhps %xmm0, (XX) + addl INCX, XX + + movsd (X), %xmm0 + addl INCX, X + movhps (X), %xmm0 + addl INCX, X + + PSHUFD2( $0xb1, %xmm1, %xmm5) + mulps %xmm6, %xmm1 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + + movlps %xmm1, (XX) + addl INCX, XX + movhps %xmm1, (XX) + addl INCX, XX + + movsd (X), %xmm1 + addl INCX, X + movhps (X), %xmm1 + addl INCX, X + + PSHUFD2( $0xb1, %xmm2, %xmm5) + mulps %xmm6, %xmm2 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm2 + + movlps %xmm2, (XX) + addl INCX, XX + movhps %xmm2, (XX) + addl INCX, XX + + movsd (X), %xmm2 + addl INCX, X + movhps (X), %xmm2 + addl INCX, X + + PSHUFD2( $0xb1, %xmm3, %xmm5) + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm3 + + movlps %xmm3, (XX) + addl INCX, XX + movhps %xmm3, (XX) + addl INCX, XX + + movsd (X), %xmm3 + addl INCX, X + movhps (X), %xmm3 + addl INCX, X + + decl I + jg .L121 + ALIGN_4 + +.L122: + PSHUFD2( $0xb1, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + + movlps %xmm0, (XX) + addl INCX, XX + movhps %xmm0, (XX) + addl INCX, XX + + PSHUFD2( $0xb1, %xmm1, %xmm5) + mulps %xmm6, %xmm1 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + + movlps %xmm1, (XX) + addl INCX, XX + movhps %xmm1, (XX) + addl INCX, XX + + PSHUFD2( $0xb1, %xmm2, %xmm5) + mulps %xmm6, %xmm2 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm2 + + movlps %xmm2, (XX) + addl INCX, XX + movhps %xmm2, (XX) + addl INCX, XX + + PSHUFD2( $0xb1, %xmm3, %xmm5) + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm3 + + movlps %xmm3, (XX) + addl INCX, XX + movhps %xmm3, (XX) + addl INCX, XX + ALIGN_4 + +.L125: + testl $4, M + je .L127 + + movsd (X), %xmm0 + addl INCX, X + movhps (X), %xmm0 + addl INCX, X + + PSHUFD2( $0xb1, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + + movlps %xmm0, (XX) + addl INCX, XX + movhps %xmm0, (XX) + addl INCX, XX + + movsd (X), %xmm1 + addl INCX, X + movhps (X), %xmm1 + addl INCX, X + + PSHUFD2( $0xb1, %xmm1, %xmm5) + mulps %xmm6, %xmm1 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + + movlps %xmm1, (XX) + addl INCX, XX + movhps %xmm1, (XX) + addl INCX, XX + ALIGN_3 + +.L127: + testl $2, M + je .L128 + + movsd (X), %xmm0 + addl INCX, X + movhps (X), %xmm0 + addl INCX, X + + PSHUFD2( $0xb1, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + + movlps %xmm0, (XX) + addl INCX, XX + movhps %xmm0, (XX) + addl INCX, XX + ALIGN_3 + +.L128: + testl $1, M + je .L999 + 
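+/* one remaining complex element on the strided path: x = alpha * x */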
+ movsd (X), %xmm0 + + PSHUFD2( $0xb1, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + + movlps %xmm0, (XX) + jmp .L999 + ALIGN_3 + +.L130: + cmpl $2 * SIZE, INCX + jne .L120 + +#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) + + PSHUFD2($0, %xmm0, %xmm6) + PSHUFD2($0, %xmm1, %xmm1) + subps %xmm1, %xmm7 + unpcklps %xmm1, %xmm7 + + subl $-31 * SIZE, X + + testl $2 * SIZE, X + je .L130x + + movsd -31 * SIZE(X), %xmm0 + + PSHUFD2( $0xb1, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + + movlps %xmm0, -31 * SIZE(X) + addl $2 * SIZE, X + decl M + jle .L999 + ALIGN_3 + +.L130x: + shufps $0xb1, %xmm7, %xmm7 + + movaps -32 * SIZE(X), %xmm0 + movaps %xmm0, %xmm4 + + movl M, I + sarl $4, I + jle .L135 + + movaps -28 * SIZE(X), %xmm1 + + + decl I + jle .L132 + ALIGN_4 + +.L131: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm1, %xmm0 + PSHUFD2($0x1b, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + + movaps %xmm0, %xmm2 + movss %xmm4, %xmm0 + movaps %xmm0, -32 * SIZE(X) + + movaps -24 * SIZE(X), %xmm0 + + movss %xmm0, %xmm1 + PSHUFD2($0x1b, %xmm1, %xmm5) + mulps %xmm6, %xmm1 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + + movaps %xmm1, %xmm4 + movss %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(X) + + movaps -20 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + PSHUFD2($0x1b, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + + movaps %xmm0, %xmm2 + movss %xmm4, %xmm0 + movaps %xmm0, -24 * SIZE(X) + + movaps -16 * SIZE(X), %xmm0 + + movss %xmm0, %xmm1 + PSHUFD2($0x1b, %xmm1, %xmm5) + mulps %xmm6, %xmm1 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + + movaps %xmm1, %xmm4 + movss %xmm2, %xmm1 + movaps %xmm1, -20 * SIZE(X) + + movaps -12 * SIZE(X), %xmm1 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm1, %xmm0 + PSHUFD2($0x1b, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + + movaps %xmm0, %xmm2 + movss %xmm4, %xmm0 + movaps %xmm0, -16 * SIZE(X) + + movaps -8 * SIZE(X), %xmm0 + + movss %xmm0, %xmm1 + PSHUFD2($0x1b, %xmm1, %xmm5) + mulps %xmm6, %xmm1 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + + movaps %xmm1, %xmm4 + movss %xmm2, %xmm1 + movaps %xmm1, -12 * SIZE(X) + + movaps -4 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + PSHUFD2($0x1b, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + + movaps %xmm0, %xmm2 + movss %xmm4, %xmm0 + movaps %xmm0, -8 * SIZE(X) + + movaps 0 * SIZE(X), %xmm0 + + movss %xmm0, %xmm1 + PSHUFD2($0x1b, %xmm1, %xmm5) + mulps %xmm6, %xmm1 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + + movaps %xmm1, %xmm4 + movss %xmm2, %xmm1 + movaps %xmm1, -4 * SIZE(X) + + movaps 4 * SIZE(X), %xmm1 + + subl $-32 * SIZE, X + decl I + jg .L131 + ALIGN_4 + +.L132: + movss %xmm1, %xmm0 + PSHUFD2($0x1b, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + + movaps %xmm0, %xmm2 + movss %xmm4, %xmm0 + movaps %xmm0, -32 * SIZE(X) + + movaps -24 * SIZE(X), %xmm0 + + movss %xmm0, %xmm1 + PSHUFD2($0x1b, %xmm1, %xmm5) + mulps %xmm6, %xmm1 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + + movaps %xmm1, %xmm4 + movss %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(X) + + movaps -20 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + PSHUFD2($0x1b, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + + movaps %xmm0, %xmm2 + movss %xmm4, %xmm0 + movaps %xmm0, -24 * SIZE(X) + + movaps -16 * SIZE(X), %xmm0 + + movss %xmm0, %xmm1 + PSHUFD2($0x1b, %xmm1, 
%xmm5) + mulps %xmm6, %xmm1 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + + movaps %xmm1, %xmm4 + movss %xmm2, %xmm1 + movaps %xmm1, -20 * SIZE(X) + + movaps -12 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + PSHUFD2($0x1b, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + + movaps %xmm0, %xmm2 + movss %xmm4, %xmm0 + movaps %xmm0, -16 * SIZE(X) + + movaps -8 * SIZE(X), %xmm0 + + movss %xmm0, %xmm1 + PSHUFD2($0x1b, %xmm1, %xmm5) + mulps %xmm6, %xmm1 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + + movaps %xmm1, %xmm4 + movss %xmm2, %xmm1 + movaps %xmm1, -12 * SIZE(X) + + movaps -4 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + PSHUFD2($0x1b, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + + movaps %xmm0, %xmm2 + movss %xmm4, %xmm0 + movaps %xmm0, -8 * SIZE(X) + + movaps 0 * SIZE(X), %xmm0 + + movss %xmm0, %xmm1 + PSHUFD2($0x1b, %xmm1, %xmm5) + mulps %xmm6, %xmm1 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + + movaps %xmm1, %xmm4 + movss %xmm2, %xmm1 + movaps %xmm1, -4 * SIZE(X) + + subl $-32 * SIZE, X + ALIGN_4 + +.L135: + testl $8, M + je .L136 + + movaps -28 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + PSHUFD2($0x1b, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + + movaps %xmm0, %xmm2 + movss %xmm4, %xmm0 + movaps %xmm0, -32 * SIZE(X) + + movaps -24 * SIZE(X), %xmm0 + + movss %xmm0, %xmm1 + PSHUFD2($0x1b, %xmm1, %xmm5) + mulps %xmm6, %xmm1 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + + movaps %xmm1, %xmm4 + movss %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(X) + + movaps -20 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + PSHUFD2($0x1b, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + + movaps %xmm0, %xmm2 + movss %xmm4, %xmm0 + movaps %xmm0, -24 * SIZE(X) + + movaps -16 * SIZE(X), %xmm0 + + movss %xmm0, %xmm1 + PSHUFD2($0x1b, %xmm1, %xmm5) + mulps %xmm6, %xmm1 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + + movaps %xmm1, %xmm4 + movss %xmm2, %xmm1 + movaps %xmm1, -20 * SIZE(X) + + addl $16 * SIZE, X + ALIGN_3 + +.L136: + testl $4, M + je .L137 + + movaps -28 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + PSHUFD2($0x1b, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + + movaps %xmm0, %xmm2 + movss %xmm4, %xmm0 + movaps %xmm0, -32 * SIZE(X) + + movaps -24 * SIZE(X), %xmm0 + + movss %xmm0, %xmm1 + PSHUFD2($0x1b, %xmm1, %xmm5) + mulps %xmm6, %xmm1 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + + movaps %xmm1, %xmm4 + movss %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(X) + + addl $8 * SIZE, X + ALIGN_3 + +.L137: + testl $2, M + je .L138 + + movaps -28 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + PSHUFD2($0x1b, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + + movaps %xmm0, %xmm2 + movss %xmm4, %xmm0 + movaps %xmm0, -32 * SIZE(X) + movaps %xmm2, %xmm4 + movaps %xmm1, %xmm0 + + addl $4 * SIZE, X + ALIGN_3 + +.L138: + movss %xmm4, -32 * SIZE(X) + + testl $1, M + je .L999 + + PSHUFD2( $0x1b, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + + PSHUFD1( $0x39, %xmm0) + + movlps %xmm0, -31 * SIZE(X) + jmp .L999 + ALIGN_3 + + +#else + + PSHUFD2($0, %xmm0, %xmm6) + PSHUFD2($0, %xmm1, %xmm1) + subps %xmm1, %xmm7 + unpcklps %xmm1, %xmm7 + + subl $-32 * SIZE, X + + testl $2 * SIZE, X + je .L130x + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd -32 * SIZE(X), %xmm0 + + PSHUFD2( $0xb1, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + + movlps %xmm0, -32 * SIZE(X) + addl $2 * SIZE, X + decl M + jle .L999 + ALIGN_3 
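+
+/* main loop for INCX == 1 with X only 4-byte aligned: each pair of
+   single-precision complex elements x is scaled in place as
+   alpha * x = xmm6 * x + xmm7 * pshufd(0xb1, x), where
+   xmm6 = (ar, ar, ar, ar) and xmm7 = (-ai, ai, -ai, ai); loads and
+   stores use movsd/movhps because X is not 16-byte aligned        */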
+ +.L130x: + movl M, I + sarl $4, I + jle .L135 + + movsd -32 * SIZE(X), %xmm0 + movhps -30 * SIZE(X), %xmm0 + movsd -28 * SIZE(X), %xmm1 + movhps -26 * SIZE(X), %xmm1 + movsd -24 * SIZE(X), %xmm2 + movhps -22 * SIZE(X), %xmm2 + movsd -20 * SIZE(X), %xmm3 + movhps -18 * SIZE(X), %xmm3 + + decl I + jle .L132 + ALIGN_4 + +.L131: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + PSHUFD2( $0xb1, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movlps %xmm0, -32 * SIZE(X) + movhps %xmm0, -30 * SIZE(X) + movsd -16 * SIZE(X), %xmm0 + movhps -14 * SIZE(X), %xmm0 + + PSHUFD2( $0xb1, %xmm1, %xmm5) + mulps %xmm6, %xmm1 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + movlps %xmm1, -28 * SIZE(X) + movhps %xmm1, -26 * SIZE(X) + movsd -12 * SIZE(X), %xmm1 + movhps -10 * SIZE(X), %xmm1 + + PSHUFD2( $0xb1, %xmm2, %xmm5) + mulps %xmm6, %xmm2 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm2 + movlps %xmm2, -24 * SIZE(X) + movhps %xmm2, -22 * SIZE(X) + movsd -8 * SIZE(X), %xmm2 + movhps -6 * SIZE(X), %xmm2 + + PSHUFD2( $0xb1, %xmm3, %xmm5) + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm3 + movlps %xmm3, -20 * SIZE(X) + movhps %xmm3, -18 * SIZE(X) + movsd -4 * SIZE(X), %xmm3 + movhps -2 * SIZE(X), %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + PSHUFD2( $0xb1, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movlps %xmm0, -16 * SIZE(X) + movhps %xmm0, -14 * SIZE(X) + movsd 0 * SIZE(X), %xmm0 + movhps 2 * SIZE(X), %xmm0 + + PSHUFD2( $0xb1, %xmm1, %xmm5) + mulps %xmm6, %xmm1 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + movlps %xmm1, -12 * SIZE(X) + movhps %xmm1, -10 * SIZE(X) + movsd 4 * SIZE(X), %xmm1 + movhps 6 * SIZE(X), %xmm1 + + PSHUFD2( $0xb1, %xmm2, %xmm5) + mulps %xmm6, %xmm2 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm2 + movlps %xmm2, -8 * SIZE(X) + movhps %xmm2, -6 * SIZE(X) + movsd 8 * SIZE(X), %xmm2 + movhps 10 * SIZE(X), %xmm2 + + PSHUFD2( $0xb1, %xmm3, %xmm5) + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm3 + movlps %xmm3, -4 * SIZE(X) + movhps %xmm3, -2 * SIZE(X) + movsd 12 * SIZE(X), %xmm3 + movhps 14 * SIZE(X), %xmm3 + + subl $-32 * SIZE, X + decl I + jg .L131 + ALIGN_4 + +.L132: + PSHUFD2( $0xb1, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movlps %xmm0, -32 * SIZE(X) + movhps %xmm0, -30 * SIZE(X) + movsd -16 * SIZE(X), %xmm0 + movhps -14 * SIZE(X), %xmm0 + + PSHUFD2( $0xb1, %xmm1, %xmm5) + mulps %xmm6, %xmm1 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + movlps %xmm1, -28 * SIZE(X) + movhps %xmm1, -26 * SIZE(X) + movsd -12 * SIZE(X), %xmm1 + movhps -10 * SIZE(X), %xmm1 + + PSHUFD2( $0xb1, %xmm2, %xmm5) + mulps %xmm6, %xmm2 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm2 + movlps %xmm2, -24 * SIZE(X) + movhps %xmm2, -22 * SIZE(X) + movsd -8 * SIZE(X), %xmm2 + movhps -6 * SIZE(X), %xmm2 + + PSHUFD2( $0xb1, %xmm3, %xmm5) + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm3 + movlps %xmm3, -20 * SIZE(X) + movhps %xmm3, -18 * SIZE(X) + movsd -4 * SIZE(X), %xmm3 + movhps -2 * SIZE(X), %xmm3 + + PSHUFD2( $0xb1, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movlps %xmm0, -16 * SIZE(X) + movhps %xmm0, -14 * SIZE(X) + + PSHUFD2( $0xb1, %xmm1, %xmm5) + mulps %xmm6, %xmm1 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + movlps %xmm1, -12 * SIZE(X) + movhps %xmm1, -10 * SIZE(X) + + PSHUFD2( $0xb1, %xmm2, %xmm5) + mulps %xmm6, %xmm2 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm2 + movlps %xmm2, -8 * SIZE(X) + movhps 
%xmm2, -6 * SIZE(X) + + PSHUFD2( $0xb1, %xmm3, %xmm5) + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm3 + movlps %xmm3, -4 * SIZE(X) + movhps %xmm3, -2 * SIZE(X) + + subl $-32 * SIZE, X + ALIGN_4 + +.L135: + testl $8, M + je .L136 + + movsd -32 * SIZE(X), %xmm0 + movhps -30 * SIZE(X), %xmm0 + + PSHUFD2( $0xb1, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movlps %xmm0, -32 * SIZE(X) + movhps %xmm0, -30 * SIZE(X) + + movsd -28 * SIZE(X), %xmm1 + movhps -26 * SIZE(X), %xmm1 + + PSHUFD2( $0xb1, %xmm1, %xmm5) + mulps %xmm6, %xmm1 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + movlps %xmm1, -28 * SIZE(X) + movhps %xmm1, -26 * SIZE(X) + + movsd -24 * SIZE(X), %xmm2 + movhps -22 * SIZE(X), %xmm2 + + PSHUFD2( $0xb1, %xmm2, %xmm5) + mulps %xmm6, %xmm2 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm2 + movlps %xmm2, -24 * SIZE(X) + movhps %xmm2, -22 * SIZE(X) + + movsd -20 * SIZE(X), %xmm3 + movhps -18 * SIZE(X), %xmm3 + + PSHUFD2( $0xb1, %xmm3, %xmm5) + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm3 + movlps %xmm3, -20 * SIZE(X) + movhps %xmm3, -18 * SIZE(X) + + addl $16 * SIZE, X + ALIGN_3 + +.L136: + testl $4, M + je .L137 + + movsd -32 * SIZE(X), %xmm0 + movhps -30 * SIZE(X), %xmm0 + movsd -28 * SIZE(X), %xmm1 + movhps -26 * SIZE(X), %xmm1 + + PSHUFD2( $0xb1, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movlps %xmm0, -32 * SIZE(X) + movhps %xmm0, -30 * SIZE(X) + + PSHUFD2( $0xb1, %xmm1, %xmm5) + mulps %xmm6, %xmm1 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm1 + movlps %xmm1, -28 * SIZE(X) + movhps %xmm1, -26 * SIZE(X) + + addl $8 * SIZE, X + ALIGN_3 + +.L137: + testl $2, M + je .L138 + + movsd -32 * SIZE(X), %xmm0 + movhps -30 * SIZE(X), %xmm0 + + PSHUFD2( $0xb1, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + movlps %xmm0, -32 * SIZE(X) + movhps %xmm0, -30 * SIZE(X) + + addl $4 * SIZE, X + ALIGN_3 + +.L138: + testl $1, M + je .L999 + + movsd -32 * SIZE(X), %xmm0 + + PSHUFD2( $0xb1, %xmm0, %xmm5) + mulps %xmm6, %xmm0 + mulps %xmm7, %xmm5 + addps %xmm5, %xmm0 + + movlps %xmm0, -32 * SIZE(X) + ALIGN_3 +#endif + +.L999: + xorl %eax, %eax + popl %ebp + popl %ebx + popl %esi + popl %edi + + ret + + EPILOGUE diff --git a/kernel/x86/zscal_sse2.S b/kernel/x86/zscal_sse2.S new file mode 100644 index 0000000000..5b1da61e6b --- /dev/null +++ b/kernel/x86/zscal_sse2.S @@ -0,0 +1,1745 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_ALPHA_R 16 + STACK + ARGS(%esp) +#define STACK_ALPHA_I 24 + STACK + ARGS(%esp) +#define STACK_X 32 + STACK + ARGS(%esp) +#define STACK_INCX 36 + STACK + ARGS(%esp) + +#define M %ebx +#define X %ecx +#define INCX %edx +#define I %esi +#define XX %edi +#define FLAG %ebp + +#include "l1param.h" + +#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) +#define USE_PSHUFD +#else +#define USE_PSHUFD_HALF +#endif + + +#define xmm8 xmm0 +#define xmm9 xmm1 +#define xmm10 xmm2 +#define xmm11 xmm3 +#define xmm12 xmm4 +#define xmm13 xmm5 +#define xmm14 xmm6 +#define xmm15 xmm7 + + + PROLOGUE + PROFCODE + + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp + + movl STACK_M, M + movl STACK_X, X + movl STACK_INCX, INCX + + movsd STACK_ALPHA_R, %xmm0 + movsd STACK_ALPHA_I, %xmm1 + + sall $ZBASE_SHIFT, INCX + xor FLAG, FLAG + + testl M, M + jle .L999 + + xorps %xmm7, %xmm7 + comisd %xmm0, %xmm7 + jne .L100 + + comisd %xmm1, %xmm7 + jne .L100 + +/* Alpha == ZERO */ + cmpl $2 * SIZE, INCX + jne .L20 + +/* INCX == 1 */ + testl $SIZE, X + je .L05 + + movsd %xmm7, 0 * SIZE(X) + addl $SIZE, X + movl $1, FLAG + decl M + jle .L19 + ALIGN_3 +.L05: + + movl M, I # rcx = n + sarl $3, I + jle .L12 + ALIGN_4 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm7, 0 * SIZE(X) + movaps %xmm7, 2 * SIZE(X) + movaps %xmm7, 4 * SIZE(X) + movaps %xmm7, 6 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps %xmm7, 8 * SIZE(X) + movaps %xmm7, 10 * SIZE(X) + movaps %xmm7, 12 * SIZE(X) + movaps %xmm7, 14 * SIZE(X) + + addl $16 * SIZE, X + decl I + jg .L11 + ALIGN_4 + +.L12: + testl $4, M + je .L13 + + movaps %xmm7, 0 * SIZE(X) + movaps %xmm7, 2 * SIZE(X) + movaps %xmm7, 4 * SIZE(X) + movaps %xmm7, 6 * SIZE(X) + addl $8 * SIZE, X + ALIGN_3 + +.L13: + testl $2, M + je .L14 + + movaps %xmm7, 0 * SIZE(X) + movaps %xmm7, 2 * SIZE(X) + addl $4 * SIZE, X + ALIGN_3 + +.L14: + testl $1, M + je .L19 + movaps %xmm7, 0 * SIZE(X) + addl $2 * SIZE, X + ALIGN_3 + +.L19: + testl $1, FLAG + je .L999 + + movsd %xmm7, 0 * SIZE(X) + jmp .L999 + ALIGN_4 + +/* incx != 1 */ +.L20: + testl $SIZE, X + jne .L30 + +/* Aligned Mode */ + movl M, I # rcx = n + sarl $2, I + jle .L22 + ALIGN_4 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm7, (X) + addl INCX, X + movaps %xmm7, (X) + addl INCX, X + movaps %xmm7, (X) + addl INCX, X + movaps %xmm7, (X) + addl INCX, X + decl I + 
jg .L21 + ALIGN_4 + +.L22: + testl $3, M + je .L999 + + testl $2, M + je .L23 + + movaps %xmm7, (X) + addl INCX, X + movaps %xmm7, (X) + addl INCX, X + ALIGN_3 + +.L23: + testl $1, M + je .L999 + + movaps %xmm7, (X) + jmp .L999 + ALIGN_4 + + +/* Unaligned Mode */ +.L30: + movl M, I # rcx = n + sarl $2, I + jle .L32 + ALIGN_4 + +.L31: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movlps %xmm7, 0 * SIZE(X) + movlps %xmm7, 1 * SIZE(X) + addl INCX, X + movlps %xmm7, 0 * SIZE(X) + movlps %xmm7, 1 * SIZE(X) + addl INCX, X + movlps %xmm7, 0 * SIZE(X) + movlps %xmm7, 1 * SIZE(X) + addl INCX, X + movlps %xmm7, 0 * SIZE(X) + movlps %xmm7, 1 * SIZE(X) + addl INCX, X + decl I + jg .L31 + ALIGN_4 + +.L32: + testl $3, M + je .L999 + + testl $2, M + je .L33 + + movlps %xmm7, 0 * SIZE(X) + movlps %xmm7, 1 * SIZE(X) + addl INCX, X + movlps %xmm7, 0 * SIZE(X) + movlps %xmm7, 1 * SIZE(X) + addl INCX, X + ALIGN_3 + +.L33: + testl $1, M + je .L999 + + movlps %xmm7, 0 * SIZE(X) + movlps %xmm7, 1 * SIZE(X) + jmp .L999 + ALIGN_4 + +/* Alpha != ZERO */ +.L100: + testl $SIZE, X + jne .L200 + +#ifdef HAVE_SSE3 + movddup %xmm0, %xmm6 +#else + pshufd $0x44, %xmm0, %xmm6 +#endif + + xorps %xmm7, %xmm7 + subsd %xmm1, %xmm7 + movlhps %xmm1, %xmm7 + + cmpl $2 * SIZE, INCX + jne .L120 + + subl $-16 * SIZE, X + + movl M, I + sarl $3, I + jle .L115 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + movaps -12 * SIZE(X), %xmm2 + movaps -10 * SIZE(X), %xmm3 + + decl I + jle .L112 + ALIGN_4 + +.L111: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + +#if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) + pshufd $0x4e, %xmm0, %xmm5 +#else + movsd -15 * SIZE(X), %xmm5 + movhps -16 * SIZE(X), %xmm5 +#endif + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + movaps %xmm0, -16 * SIZE(X) + movaps -8 * SIZE(X), %xmm0 + +#ifdef USE_PSHUFD + pshufd $0x4e, %xmm1, %xmm5 +#else + movsd -13 * SIZE(X), %xmm5 + movhps -14 * SIZE(X), %xmm5 +#endif + mulpd %xmm6, %xmm1 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movaps %xmm1, -14 * SIZE(X) + movaps -6 * SIZE(X), %xmm1 + +#if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) + pshufd $0x4e, %xmm2, %xmm5 +#else + movsd -11 * SIZE(X), %xmm5 + movhps -12 * SIZE(X), %xmm5 +#endif + mulpd %xmm6, %xmm2 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + movaps %xmm2, -12 * SIZE(X) + movaps -4 * SIZE(X), %xmm2 + +#ifdef USE_PSHUFD + pshufd $0x4e, %xmm3, %xmm5 +#else + movsd -9 * SIZE(X), %xmm5 + movhps -10 * SIZE(X), %xmm5 +#endif + mulpd %xmm6, %xmm3 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm3 + movaps %xmm3, -10 * SIZE(X) + movaps -2 * SIZE(X), %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + +#if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) + pshufd $0x4e, %xmm0, %xmm5 +#else + movsd -7 * SIZE(X), %xmm5 + movhps -8 * SIZE(X), %xmm5 +#endif + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + movaps %xmm0, -8 * SIZE(X) + movaps 0 * SIZE(X), %xmm0 + +#ifdef USE_PSHUFD + pshufd $0x4e, %xmm1, %xmm5 +#else + movsd -5 * SIZE(X), %xmm5 + movhps -6 * SIZE(X), %xmm5 +#endif + mulpd %xmm6, %xmm1 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movaps %xmm1, -6 * SIZE(X) + movaps 2 * SIZE(X), %xmm1 + +#if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) + pshufd $0x4e, %xmm2, %xmm5 +#else + movsd -3 * SIZE(X), %xmm5 + movhps -4 * SIZE(X), %xmm5 +#endif + mulpd %xmm6, %xmm2 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + movaps %xmm2, -4 * SIZE(X) + movaps 4 * SIZE(X), %xmm2 + +#ifdef USE_PSHUFD + pshufd 
$0x4e, %xmm3, %xmm5 +#else + movsd -1 * SIZE(X), %xmm5 + movhps -2 * SIZE(X), %xmm5 +#endif + mulpd %xmm6, %xmm3 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm3 + movaps %xmm3, -2 * SIZE(X) + movaps 6 * SIZE(X), %xmm3 + + subl $-16 * SIZE, X + decl I + jg .L111 + ALIGN_4 + +.L112: +#if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) + pshufd $0x4e, %xmm0, %xmm5 +#else + movsd -15 * SIZE(X), %xmm5 + movhps -16 * SIZE(X), %xmm5 +#endif + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + movaps %xmm0, -16 * SIZE(X) + movaps -8 * SIZE(X), %xmm0 + +#ifdef USE_PSHUFD + pshufd $0x4e, %xmm1, %xmm5 +#else + movsd -13 * SIZE(X), %xmm5 + movhps -14 * SIZE(X), %xmm5 +#endif + mulpd %xmm6, %xmm1 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movaps %xmm1, -14 * SIZE(X) + movaps -6 * SIZE(X), %xmm1 + +#if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) + pshufd $0x4e, %xmm2, %xmm5 +#else + movsd -11 * SIZE(X), %xmm5 + movhps -12 * SIZE(X), %xmm5 +#endif + mulpd %xmm6, %xmm2 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + movaps %xmm2, -12 * SIZE(X) + movaps -4 * SIZE(X), %xmm2 + +#ifdef USE_PSHUFD + pshufd $0x4e, %xmm3, %xmm5 +#else + movsd -9 * SIZE(X), %xmm5 + movhps -10 * SIZE(X), %xmm5 +#endif + mulpd %xmm6, %xmm3 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm3 + movaps %xmm3, -10 * SIZE(X) + movaps -2 * SIZE(X), %xmm3 + +#if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) + pshufd $0x4e, %xmm0, %xmm5 +#else + movsd -7 * SIZE(X), %xmm5 + movhps -8 * SIZE(X), %xmm5 +#endif + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + movaps %xmm0, -8 * SIZE(X) + +#ifdef USE_PSHUFD + pshufd $0x4e, %xmm1, %xmm5 +#else + movsd -5 * SIZE(X), %xmm5 + movhps -6 * SIZE(X), %xmm5 +#endif + mulpd %xmm6, %xmm1 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movaps %xmm1, -6 * SIZE(X) + +#if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) + pshufd $0x4e, %xmm2, %xmm5 +#else + movsd -3 * SIZE(X), %xmm5 + movhps -4 * SIZE(X), %xmm5 +#endif + mulpd %xmm6, %xmm2 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + movaps %xmm2, -4 * SIZE(X) + +#ifdef USE_PSHUFD + pshufd $0x4e, %xmm3, %xmm5 +#else + movsd -1 * SIZE(X), %xmm5 + movhps -2 * SIZE(X), %xmm5 +#endif + mulpd %xmm6, %xmm3 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm3 + movaps %xmm3, -2 * SIZE(X) + + subl $-16 * SIZE, X + ALIGN_3 + +.L115: + testl $7, M + je .L999 + + testl $4, M + je .L116 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm0, %xmm5 + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + movaps %xmm0, -16 * SIZE(X) + + pshufd $0x4e, %xmm1, %xmm5 + mulpd %xmm6, %xmm1 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movaps %xmm1, -14 * SIZE(X) + + movaps -12 * SIZE(X), %xmm2 + movaps -10 * SIZE(X), %xmm3 + + pshufd $0x4e, %xmm2, %xmm5 + mulpd %xmm6, %xmm2 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + movaps %xmm2, -12 * SIZE(X) + + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm6, %xmm3 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm3 + movaps %xmm3, -10 * SIZE(X) + + addl $8 * SIZE, X + ALIGN_3 + +.L116: + testl $2, M + je .L117 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm0, %xmm5 + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + movaps %xmm0, -16 * SIZE(X) + + pshufd $0x4e, %xmm1, %xmm5 + mulpd %xmm6, %xmm1 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movaps %xmm1, -14 * SIZE(X) + + addl $4 * SIZE, X + ALIGN_3 + +.L117: + testl $1, M + je .L999 + + movaps -16 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm0, %xmm5 + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + + movaps %xmm0, 
-16 * SIZE(X) + jmp .L999 + ALIGN_3 + +.L120: + movl X, XX + + movl M, I + sarl $3, I + jle .L125 + + movaps (X), %xmm0 + addl INCX, X + movaps (X), %xmm1 + addl INCX, X + movaps (X), %xmm2 + addl INCX, X + movaps (X), %xmm3 + addl INCX, X + + decl I + jle .L122 + ALIGN_4 + +.L121: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm0, %xmm5 + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + movaps %xmm0, (XX) + addl INCX, XX + movaps (X), %xmm0 + addl INCX, X + + pshufd $0x4e, %xmm1, %xmm5 + mulpd %xmm6, %xmm1 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movaps %xmm1, (XX) + addl INCX, XX + movaps (X), %xmm1 + addl INCX, X + + pshufd $0x4e, %xmm2, %xmm5 + mulpd %xmm6, %xmm2 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + movaps %xmm2, (XX) + addl INCX, XX + movaps (X), %xmm2 + addl INCX, X + + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm6, %xmm3 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm3 + movaps %xmm3, (XX) + addl INCX, XX + movaps (X), %xmm3 + addl INCX, X + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm0, %xmm5 + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + movaps %xmm0, (XX) + addl INCX, XX + movaps (X), %xmm0 + addl INCX, X + + pshufd $0x4e, %xmm1, %xmm5 + mulpd %xmm6, %xmm1 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movaps %xmm1, (XX) + addl INCX, XX + movaps (X), %xmm1 + addl INCX, X + + pshufd $0x4e, %xmm2, %xmm5 + mulpd %xmm6, %xmm2 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + movaps %xmm2, (XX) + addl INCX, XX + movaps (X), %xmm2 + addl INCX, X + + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm6, %xmm3 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm3 + movaps %xmm3, (XX) + addl INCX, XX + movaps (X), %xmm3 + addl INCX, X + + decl I + jg .L121 + ALIGN_4 + +.L122: + pshufd $0x4e, %xmm0, %xmm5 + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + movaps %xmm0, (XX) + addl INCX, XX + movaps (X), %xmm0 + addl INCX, X + + pshufd $0x4e, %xmm1, %xmm5 + mulpd %xmm6, %xmm1 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movaps %xmm1, (XX) + addl INCX, XX + movaps (X), %xmm1 + addl INCX, X + + pshufd $0x4e, %xmm2, %xmm5 + mulpd %xmm6, %xmm2 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + movaps %xmm2, (XX) + addl INCX, XX + movaps (X), %xmm2 + addl INCX, X + + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm6, %xmm3 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm3 + movaps %xmm3, (XX) + addl INCX, XX + movaps (X), %xmm3 + addl INCX, X + + pshufd $0x4e, %xmm0, %xmm5 + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + movaps %xmm0, (XX) + addl INCX, XX + + pshufd $0x4e, %xmm1, %xmm5 + mulpd %xmm6, %xmm1 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movaps %xmm1, (XX) + addl INCX, XX + + pshufd $0x4e, %xmm2, %xmm5 + mulpd %xmm6, %xmm2 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + movaps %xmm2, (XX) + addl INCX, XX + + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm6, %xmm3 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm3 + movaps %xmm3, (XX) + addl INCX, XX + ALIGN_3 + +.L125: + testl $7, M + je .L999 + + testl $4, M + je .L126 + + movaps (X), %xmm0 + addl INCX, X + movaps (X), %xmm1 + addl INCX, X + + movaps (X), %xmm2 + addl INCX, X + movaps (X), %xmm3 + addl INCX, X + + pshufd $0x4e, %xmm0, %xmm5 + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + movaps %xmm0, (XX) + addl INCX, XX + + pshufd $0x4e, %xmm1, %xmm5 + mulpd %xmm6, %xmm1 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movaps %xmm1, (XX) + addl INCX, XX + + pshufd $0x4e, %xmm2, %xmm5 + mulpd %xmm6, %xmm2 + mulpd %xmm7, %xmm5 + addpd %xmm5, 
%xmm2 + movaps %xmm2, (XX) + addl INCX, XX + + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm6, %xmm3 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm3 + movaps %xmm3, (XX) + addl INCX, XX + ALIGN_3 + +.L126: + testl $2, M + je .L127 + + movaps (X), %xmm0 + addl INCX, X + movaps (X), %xmm1 + addl INCX, X + + pshufd $0x4e, %xmm0, %xmm5 + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + movaps %xmm0, (XX) + addl INCX, XX + + pshufd $0x4e, %xmm1, %xmm5 + mulpd %xmm6, %xmm1 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movaps %xmm1, (XX) + addl INCX, XX + ALIGN_3 + +.L127: + testl $1, M + je .L999 + + movaps (X), %xmm0 + + pshufd $0x4e, %xmm0, %xmm5 + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + + movaps %xmm0, (XX) + jmp .L999 + ALIGN_3 + +.L200: + cmpl $2 * SIZE, INCX + jne .L220 + +#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) + +#ifdef HAVE_SSE3 + movddup %xmm0, %xmm6 +#else + pshufd $0x44, %xmm0, %xmm6 +#endif + pxor %xmm7, %xmm7 + subsd %xmm1, %xmm7 + movlhps %xmm1, %xmm7 + shufpd $1, %xmm7, %xmm7 + + movhps 0 * SIZE(X), %xmm0 + movaps 1 * SIZE(X), %xmm1 + subl $-16 * SIZE, X + + unpckhpd %xmm0, %xmm0 + mulsd %xmm6, %xmm0 + movaps %xmm1, %xmm5 + mulsd %xmm7, %xmm5 + subsd %xmm5, %xmm0 + movlps %xmm0, -16 * SIZE(X) + + decl M + + movl M, I + sarl $3, I + jle .L205 + + movaps -13 * SIZE(X), %xmm2 + movaps -11 * SIZE(X), %xmm3 + + decl I + jle .L202 + ALIGN_4 + +.L201: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm1, %xmm5 + SHUFPD_1 %xmm2, %xmm0 + mulpd %xmm6, %xmm5 + mulpd %xmm7, %xmm0 + addpd %xmm5, %xmm0 + movaps %xmm0, -15 * SIZE(X) + movaps -9 * SIZE(X), %xmm0 + + movaps %xmm2, %xmm5 + SHUFPD_1 %xmm3, %xmm1 + mulpd %xmm6, %xmm5 + mulpd %xmm7, %xmm1 + addpd %xmm5, %xmm1 + movaps %xmm1, -13 * SIZE(X) + movaps -7 * SIZE(X), %xmm1 + + movaps %xmm3, %xmm5 + SHUFPD_1 %xmm0, %xmm2 + mulpd %xmm6, %xmm5 + mulpd %xmm7, %xmm2 + addpd %xmm5, %xmm2 + movaps %xmm2, -11 * SIZE(X) + movaps -5 * SIZE(X), %xmm2 + + movaps %xmm0, %xmm5 + SHUFPD_1 %xmm1, %xmm3 + mulpd %xmm6, %xmm5 + mulpd %xmm7, %xmm3 + addpd %xmm5, %xmm3 + movaps %xmm3, -9 * SIZE(X) + movaps -3 * SIZE(X), %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps %xmm1, %xmm5 + SHUFPD_1 %xmm2, %xmm0 + mulpd %xmm6, %xmm5 + mulpd %xmm7, %xmm0 + addpd %xmm5, %xmm0 + movaps %xmm0, -7 * SIZE(X) + movaps -1 * SIZE(X), %xmm0 + + movaps %xmm2, %xmm5 + SHUFPD_1 %xmm3, %xmm1 + mulpd %xmm6, %xmm5 + mulpd %xmm7, %xmm1 + addpd %xmm5, %xmm1 + movaps %xmm1, -5 * SIZE(X) + movaps 1 * SIZE(X), %xmm1 + + movaps %xmm3, %xmm5 + SHUFPD_1 %xmm0, %xmm2 + mulpd %xmm6, %xmm5 + mulpd %xmm7, %xmm2 + addpd %xmm5, %xmm2 + movaps %xmm2, -3 * SIZE(X) + movaps 3 * SIZE(X), %xmm2 + + movaps %xmm0, %xmm5 + SHUFPD_1 %xmm1, %xmm3 + mulpd %xmm6, %xmm5 + mulpd %xmm7, %xmm3 + addpd %xmm5, %xmm3 + movaps %xmm3, -1 * SIZE(X) + movaps 5 * SIZE(X), %xmm3 + + subl $-16 * SIZE, X + decl I + jg .L201 + ALIGN_4 + +.L202: + movaps %xmm1, %xmm5 + SHUFPD_1 %xmm2, %xmm0 + mulpd %xmm6, %xmm5 + mulpd %xmm7, %xmm0 + addpd %xmm5, %xmm0 + movaps %xmm0, -15 * SIZE(X) + movaps -9 * SIZE(X), %xmm0 + + movaps %xmm2, %xmm5 + SHUFPD_1 %xmm3, %xmm1 + mulpd %xmm6, %xmm5 + mulpd %xmm7, %xmm1 + addpd %xmm5, %xmm1 + movaps %xmm1, -13 * SIZE(X) + movaps -7 * SIZE(X), %xmm1 + + movaps %xmm3, %xmm5 + SHUFPD_1 %xmm0, %xmm2 + mulpd %xmm6, %xmm5 + mulpd %xmm7, %xmm2 + addpd %xmm5, %xmm2 + movaps %xmm2, -11 * SIZE(X) + movaps -5 * SIZE(X), %xmm2 + + movaps %xmm0, %xmm5 + SHUFPD_1 %xmm1, %xmm3 + mulpd %xmm6, %xmm5 + 
mulpd %xmm7, %xmm3 + addpd %xmm5, %xmm3 + movaps %xmm3, -9 * SIZE(X) + movaps -3 * SIZE(X), %xmm3 + + movaps %xmm1, %xmm5 + SHUFPD_1 %xmm2, %xmm0 + mulpd %xmm6, %xmm5 + mulpd %xmm7, %xmm0 + addpd %xmm5, %xmm0 + movaps %xmm0, -7 * SIZE(X) + movaps -1 * SIZE(X), %xmm0 + + movaps %xmm2, %xmm5 + SHUFPD_1 %xmm3, %xmm1 + mulpd %xmm6, %xmm5 + mulpd %xmm7, %xmm1 + addpd %xmm5, %xmm1 + movaps %xmm1, -5 * SIZE(X) + movaps 1 * SIZE(X), %xmm1 + + movaps %xmm3, %xmm5 + SHUFPD_1 %xmm0, %xmm2 + mulpd %xmm6, %xmm5 + mulpd %xmm7, %xmm2 + addpd %xmm5, %xmm2 + movaps %xmm2, -3 * SIZE(X) + + movaps %xmm0, %xmm5 + SHUFPD_1 %xmm1, %xmm3 + mulpd %xmm6, %xmm5 + mulpd %xmm7, %xmm3 + addpd %xmm5, %xmm3 + movaps %xmm3, -1 * SIZE(X) + + subl $-16 * SIZE, X + ALIGN_3 + +.L205: + testl $4, M + je .L206 + + movaps -13 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm5 + SHUFPD_1 %xmm2, %xmm0 + mulpd %xmm6, %xmm5 + mulpd %xmm7, %xmm0 + addpd %xmm5, %xmm0 + movaps %xmm0, -15 * SIZE(X) + + movaps -11 * SIZE(X), %xmm3 + + movaps %xmm2, %xmm5 + SHUFPD_1 %xmm3, %xmm1 + mulpd %xmm6, %xmm5 + mulpd %xmm7, %xmm1 + addpd %xmm5, %xmm1 + movaps %xmm1, -13 * SIZE(X) + + movaps -9 * SIZE(X), %xmm0 + + movaps %xmm3, %xmm5 + SHUFPD_1 %xmm0, %xmm2 + mulpd %xmm6, %xmm5 + mulpd %xmm7, %xmm2 + addpd %xmm5, %xmm2 + movaps %xmm2, -11 * SIZE(X) + + movaps -7 * SIZE(X), %xmm1 + + movaps %xmm0, %xmm5 + SHUFPD_1 %xmm1, %xmm3 + mulpd %xmm6, %xmm5 + mulpd %xmm7, %xmm3 + addpd %xmm5, %xmm3 + movaps %xmm3, -9 * SIZE(X) + + addl $8 * SIZE, X + ALIGN_3 + +.L206: + testl $2, M + je .L207 + + movaps -13 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm5 + SHUFPD_1 %xmm2, %xmm0 + + mulpd %xmm6, %xmm5 + mulpd %xmm7, %xmm0 + addpd %xmm5, %xmm0 + movaps %xmm0, -15 * SIZE(X) + + movaps -11 * SIZE(X), %xmm3 + + movaps %xmm2, %xmm5 + SHUFPD_1 %xmm3, %xmm1 + + mulpd %xmm6, %xmm5 + mulpd %xmm7, %xmm1 + addpd %xmm5, %xmm1 + movaps %xmm1, -13 * SIZE(X) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addl $4 * SIZE, X + ALIGN_3 + +.L207: + testl $1, M + je .L208 + + movaps -13 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm5 + SHUFPD_1 %xmm2, %xmm0 + + mulpd %xmm6, %xmm5 + mulpd %xmm7, %xmm0 + addpd %xmm5, %xmm0 + movaps %xmm0, -15 * SIZE(X) + + movaps %xmm1, %xmm0 + movaps %xmm2, %xmm1 + addl $2 * SIZE, X + ALIGN_3 + +.L208: + unpckhpd %xmm0, %xmm0 + mulsd %xmm6, %xmm1 + mulsd %xmm7, %xmm0 + addsd %xmm1, %xmm0 + movlps %xmm0, -15 * SIZE(X) + jmp .L999 + ALIGN_3 + +#else + +#ifdef HAVE_SSE3 + movddup %xmm0, %xmm6 +#else + pshufd $0x44, %xmm0, %xmm6 +#endif + pxor %xmm7, %xmm7 + subsd %xmm1, %xmm7 + movlhps %xmm1, %xmm7 + + subl $-16 * SIZE, X + + movl M, I + sarl $3, I + jle .L205 + + movsd -16 * SIZE(X), %xmm0 + movhps -15 * SIZE(X), %xmm0 + movsd -14 * SIZE(X), %xmm1 + movhps -13 * SIZE(X), %xmm1 + movsd -12 * SIZE(X), %xmm2 + movhps -11 * SIZE(X), %xmm2 + movsd -10 * SIZE(X), %xmm3 + movhps -9 * SIZE(X), %xmm3 + + decl I + jle .L202 + ALIGN_4 + +.L201: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm0, %xmm5 + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + movlps %xmm0, -16 * SIZE(X) + movhps %xmm0, -15 * SIZE(X) + movsd -8 * SIZE(X), %xmm0 + movhps -7 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm1, %xmm5 + mulpd %xmm6, %xmm1 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movlps %xmm1, -14 * SIZE(X) + movhps %xmm1, -13 * SIZE(X) + movsd -6 * SIZE(X), %xmm1 + movhps -5 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm2, %xmm5 + mulpd %xmm6, %xmm2 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + movlps %xmm2, -12 * SIZE(X) + movhps %xmm2, -11 * SIZE(X) 
+ movsd -4 * SIZE(X), %xmm2 + movhps -3 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm6, %xmm3 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm3 + movlps %xmm3, -10 * SIZE(X) + movhps %xmm3, -9 * SIZE(X) + movsd -2 * SIZE(X), %xmm3 + movhps -1 * SIZE(X), %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm0, %xmm5 + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + movlps %xmm0, -8 * SIZE(X) + movhps %xmm0, -7 * SIZE(X) + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm1, %xmm5 + mulpd %xmm6, %xmm1 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movlps %xmm1, -6 * SIZE(X) + movhps %xmm1, -5 * SIZE(X) + movsd 2 * SIZE(X), %xmm1 + movhps 3 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm2, %xmm5 + mulpd %xmm6, %xmm2 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + movlps %xmm2, -4 * SIZE(X) + movhps %xmm2, -3 * SIZE(X) + movsd 4 * SIZE(X), %xmm2 + movhps 5 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm6, %xmm3 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm3 + movlps %xmm3, -2 * SIZE(X) + movhps %xmm3, -1 * SIZE(X) + movsd 6 * SIZE(X), %xmm3 + movhps 7 * SIZE(X), %xmm3 + + subl $-16 * SIZE, X + decl I + jg .L201 + ALIGN_4 + +.L202: + pshufd $0x4e, %xmm0, %xmm5 + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + movlps %xmm0, -16 * SIZE(X) + movhps %xmm0, -15 * SIZE(X) + movsd -8 * SIZE(X), %xmm0 + movhps -7 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm1, %xmm5 + mulpd %xmm6, %xmm1 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movlps %xmm1, -14 * SIZE(X) + movhps %xmm1, -13 * SIZE(X) + movsd -6 * SIZE(X), %xmm1 + movhps -5 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm2, %xmm5 + mulpd %xmm6, %xmm2 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + movlps %xmm2, -12 * SIZE(X) + movhps %xmm2, -11 * SIZE(X) + movsd -4 * SIZE(X), %xmm2 + movhps -3 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm6, %xmm3 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm3 + movlps %xmm3, -10 * SIZE(X) + movhps %xmm3, -9 * SIZE(X) + movsd -2 * SIZE(X), %xmm3 + movhps -1 * SIZE(X), %xmm3 + + pshufd $0x4e, %xmm0, %xmm5 + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + movlps %xmm0, -8 * SIZE(X) + movhps %xmm0, -7 * SIZE(X) + + pshufd $0x4e, %xmm1, %xmm5 + mulpd %xmm6, %xmm1 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movlps %xmm1, -6 * SIZE(X) + movhps %xmm1, -5 * SIZE(X) + + pshufd $0x4e, %xmm2, %xmm5 + mulpd %xmm6, %xmm2 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + movlps %xmm2, -4 * SIZE(X) + movhps %xmm2, -3 * SIZE(X) + + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm6, %xmm3 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm3 + movlps %xmm3, -2 * SIZE(X) + movhps %xmm3, -1 * SIZE(X) + + subl $-16 * SIZE, X + ALIGN_3 + +.L205: + testl $7, M + je .L999 + + testl $4, M + je .L206 + + movsd -16 * SIZE(X), %xmm0 + movhps -15 * SIZE(X), %xmm0 + movsd -14 * SIZE(X), %xmm1 + movhps -13 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm0, %xmm5 + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + movlps %xmm0, -16 * SIZE(X) + movhps %xmm0, -15 * SIZE(X) + + pshufd $0x4e, %xmm1, %xmm5 + mulpd %xmm6, %xmm1 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movlps %xmm1, -14 * SIZE(X) + movhps %xmm1, -13 * SIZE(X) + + movsd -12 * SIZE(X), %xmm2 + movhps -11 * SIZE(X), %xmm2 + movsd -10 * SIZE(X), %xmm3 + movhps -9 * SIZE(X), %xmm3 + + pshufd $0x4e, %xmm2, %xmm5 + mulpd %xmm6, %xmm2 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + movlps %xmm2, -12 * SIZE(X) + movhps %xmm2, -11 * SIZE(X) + + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm6, %xmm3 + mulpd %xmm7, %xmm5 + 
addpd %xmm5, %xmm3 + movlps %xmm3, -10 * SIZE(X) + movhps %xmm3, -9 * SIZE(X) + + addl $8 * SIZE, X + ALIGN_3 + +.L206: + testl $2, M + je .L207 + + movsd -16 * SIZE(X), %xmm0 + movhps -15 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm0, %xmm5 + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + movlps %xmm0, -16 * SIZE(X) + movhps %xmm0, -15 * SIZE(X) + + movsd -14 * SIZE(X), %xmm1 + movhps -13 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm1, %xmm5 + mulpd %xmm6, %xmm1 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movlps %xmm1, -14 * SIZE(X) + movhps %xmm1, -13 * SIZE(X) + + addl $4 * SIZE, X + ALIGN_3 + +.L207: + testl $1, M + je .L999 + + movsd -16 * SIZE(X), %xmm0 + movhps -15 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm0, %xmm5 + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + + movlps %xmm0, -16 * SIZE(X) + movhps %xmm0, -15 * SIZE(X) + jmp .L999 + ALIGN_3 + +#endif + +.L220: +#ifdef HAVE_SSE3 + movddup %xmm0, %xmm6 +#else + pshufd $0x44, %xmm0, %xmm6 +#endif + pxor %xmm7, %xmm7 + subsd %xmm1, %xmm7 + movlhps %xmm1, %xmm7 + + movl X, XX + + movl M, I + sarl $3, I + jle .L225 + + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + addl INCX, X + movsd 0 * SIZE(X), %xmm1 + movhps 1 * SIZE(X), %xmm1 + addl INCX, X + movsd 0 * SIZE(X), %xmm2 + movhps 1 * SIZE(X), %xmm2 + addl INCX, X + movsd 0 * SIZE(X), %xmm3 + movhps 1 * SIZE(X), %xmm3 + addl INCX, X + + decl I + jle .L222 + ALIGN_4 + +.L221: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm0, %xmm5 + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + movlps %xmm0, 0 * SIZE(XX) + movhps %xmm0, 1 * SIZE(XX) + addl INCX, XX + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + addl INCX, X + + pshufd $0x4e, %xmm1, %xmm5 + mulpd %xmm6, %xmm1 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movlps %xmm1, 0 * SIZE(XX) + movhps %xmm1, 1 * SIZE(XX) + addl INCX, XX + movsd 0 * SIZE(X), %xmm1 + movhps 1 * SIZE(X), %xmm1 + addl INCX, X + + pshufd $0x4e, %xmm2, %xmm5 + mulpd %xmm6, %xmm2 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + movlps %xmm2, 0 * SIZE(XX) + movhps %xmm2, 1 * SIZE(XX) + addl INCX, XX + movsd 0 * SIZE(X), %xmm2 + movhps 1 * SIZE(X), %xmm2 + addl INCX, X + + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm6, %xmm3 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm3 + movlps %xmm3, 0 * SIZE(XX) + movhps %xmm3, 1 * SIZE(XX) + addl INCX, XX + movsd 0 * SIZE(X), %xmm3 + movhps 1 * SIZE(X), %xmm3 + addl INCX, X + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm0, %xmm5 + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + movlps %xmm0, 0 * SIZE(XX) + movhps %xmm0, 1 * SIZE(XX) + addl INCX, XX + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + addl INCX, X + + pshufd $0x4e, %xmm1, %xmm5 + mulpd %xmm6, %xmm1 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movlps %xmm1, 0 * SIZE(XX) + movhps %xmm1, 1 * SIZE(XX) + addl INCX, XX + movsd 0 * SIZE(X), %xmm1 + movhps 1 * SIZE(X), %xmm1 + addl INCX, X + + pshufd $0x4e, %xmm2, %xmm5 + mulpd %xmm6, %xmm2 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + movlps %xmm2, 0 * SIZE(XX) + movhps %xmm2, 1 * SIZE(XX) + addl INCX, XX + movsd 0 * SIZE(X), %xmm2 + movhps 1 * SIZE(X), %xmm2 + addl INCX, X + + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm6, %xmm3 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm3 + movlps %xmm3, 0 * SIZE(XX) + movhps %xmm3, 1 * SIZE(XX) + addl INCX, XX + movsd 0 * SIZE(X), %xmm3 + movhps 1 * SIZE(X), %xmm3 + addl INCX, X + + decl I + jg .L221 + ALIGN_4 + +.L222: + pshufd $0x4e, %xmm0, %xmm5 + 
mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + movlps %xmm0, 0 * SIZE(XX) + movhps %xmm0, 1 * SIZE(XX) + addl INCX, XX + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + addl INCX, X + + pshufd $0x4e, %xmm1, %xmm5 + mulpd %xmm6, %xmm1 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movlps %xmm1, 0 * SIZE(XX) + movhps %xmm1, 1 * SIZE(XX) + addl INCX, XX + movsd 0 * SIZE(X), %xmm1 + movhps 1 * SIZE(X), %xmm1 + addl INCX, X + + pshufd $0x4e, %xmm2, %xmm5 + mulpd %xmm6, %xmm2 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + movlps %xmm2, 0 * SIZE(XX) + movhps %xmm2, 1 * SIZE(XX) + addl INCX, XX + movsd 0 * SIZE(X), %xmm2 + movhps 1 * SIZE(X), %xmm2 + addl INCX, X + + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm6, %xmm3 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm3 + movlps %xmm3, 0 * SIZE(XX) + movhps %xmm3, 1 * SIZE(XX) + addl INCX, XX + movsd 0 * SIZE(X), %xmm3 + movhps 1 * SIZE(X), %xmm3 + addl INCX, X + + pshufd $0x4e, %xmm0, %xmm5 + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + movlps %xmm0, 0 * SIZE(XX) + movhps %xmm0, 1 * SIZE(XX) + addl INCX, XX + + pshufd $0x4e, %xmm1, %xmm5 + mulpd %xmm6, %xmm1 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movlps %xmm1, 0 * SIZE(XX) + movhps %xmm1, 1 * SIZE(XX) + addl INCX, XX + + pshufd $0x4e, %xmm2, %xmm5 + mulpd %xmm6, %xmm2 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + movlps %xmm2, 0 * SIZE(XX) + movhps %xmm2, 1 * SIZE(XX) + addl INCX, XX + + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm6, %xmm3 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm3 + movlps %xmm3, 0 * SIZE(XX) + movhps %xmm3, 1 * SIZE(XX) + addl INCX, XX + ALIGN_3 + +.L225: + testl $7, M + je .L999 + + testl $4, M + je .L226 + + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + addl INCX, X + + pshufd $0x4e, %xmm0, %xmm5 + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + movlps %xmm0, 0 * SIZE(XX) + movhps %xmm0, 1 * SIZE(XX) + addl INCX, XX + + movsd 0 * SIZE(X), %xmm1 + movhps 1 * SIZE(X), %xmm1 + addl INCX, X + + pshufd $0x4e, %xmm1, %xmm5 + mulpd %xmm6, %xmm1 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movlps %xmm1, 0 * SIZE(XX) + movhps %xmm1, 1 * SIZE(XX) + addl INCX, XX + + movsd 0 * SIZE(X), %xmm2 + movhps 1 * SIZE(X), %xmm2 + addl INCX, X + + pshufd $0x4e, %xmm2, %xmm5 + mulpd %xmm6, %xmm2 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + movlps %xmm2, 0 * SIZE(XX) + movhps %xmm2, 1 * SIZE(XX) + addl INCX, XX + + movsd 0 * SIZE(X), %xmm3 + movhps 1 * SIZE(X), %xmm3 + addl INCX, X + + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm6, %xmm3 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm3 + movlps %xmm3, 0 * SIZE(XX) + movhps %xmm3, 1 * SIZE(XX) + addl INCX, XX + ALIGN_3 + +.L226: + testl $2, M + je .L227 + + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + addl INCX, X + + pshufd $0x4e, %xmm0, %xmm5 + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + movlps %xmm0, 0 * SIZE(XX) + movhps %xmm0, 1 * SIZE(XX) + addl INCX, XX + + movsd 0 * SIZE(X), %xmm1 + movhps 1 * SIZE(X), %xmm1 + addl INCX, X + + pshufd $0x4e, %xmm1, %xmm5 + mulpd %xmm6, %xmm1 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm1 + movlps %xmm1, 0 * SIZE(XX) + movhps %xmm1, 1 * SIZE(XX) + addl INCX, XX + ALIGN_3 + +.L227: + testl $1, M + je .L999 + + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm0, %xmm5 + mulpd %xmm6, %xmm0 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm0 + + movlps %xmm0, 0 * SIZE(XX) + movhps %xmm0, 1 * SIZE(XX) + ALIGN_3 + +.L999: + xorl %eax, %eax + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE + diff --git a/kernel/x86/zswap.S 
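The kernel that ends above scales a double-precision complex vector in place by a complex factor (ZSCAL-style): the scale factor is kept broadcast in xmm6/xmm7 so that one pshufd (or the movsd/movhps fallback) plus two mulpd and an addpd form the complex product per element, with separate unit-stride and strided (.L120/.L220) paths. As a reference sketch only, not part of the imported sources, the same operation in plain C; dcomplex and zscal_ref are illustrative names, not identifiers from the patch:

/* Reference sketch, not from the patch: x[k] := alpha * x[k] for a
 * double-precision complex vector, which is what the SSE2 loops above
 * compute.  dcomplex and zscal_ref are illustrative names only.
 */
#include <stddef.h>

typedef struct { double r, i; } dcomplex;

static void zscal_ref(size_t n, dcomplex alpha, dcomplex *x, ptrdiff_t incx)
{
    for (size_t k = 0; k < n; k++, x += incx) {
        double xr = x->r, xi = x->i;
        x->r = alpha.r * xr - alpha.i * xi;  /* real part of alpha*x      */
        x->i = alpha.r * xi + alpha.i * xr;  /* imaginary part of alpha*x */
    }
}

The assembly unrolls this loop eight complex elements at a time and falls back to the 4/2/1-element cleanup blocks (.L115-.L117, .L125-.L127, .L205-.L207) for the remainder.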
b/kernel/x86/zswap.S new file mode 100644 index 0000000000..ca4660f448 --- /dev/null +++ b/kernel/x86/zswap.S @@ -0,0 +1,248 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define N 4 + STACK + ARGS(%esp) +#ifdef XDOUBLE +#define X 48 + STACK + ARGS(%esp) +#define INCX 52 + STACK + ARGS(%esp) +#define Y 56 + STACK + ARGS(%esp) +#define INCY 60 + STACK + ARGS(%esp) +#elif defined(DOUBLE) +#define X 32 + STACK + ARGS(%esp) +#define INCX 36 + STACK + ARGS(%esp) +#define Y 40 + STACK + ARGS(%esp) +#define INCY 44 + STACK + ARGS(%esp) +#else +#define X 24 + STACK + ARGS(%esp) +#define INCX 28 + STACK + ARGS(%esp) +#define Y 32 + STACK + ARGS(%esp) +#define INCY 36 + STACK + ARGS(%esp) +#endif + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) + EMMS +#endif + + movl N, %edx + movl X, %esi + movl Y, %edi + movl INCX, %ebx + movl INCY, %ecx + + sall $ZBASE_SHIFT, %ebx + sall $ZBASE_SHIFT, %ecx + + cmpl $2 * SIZE, %ebx + jne .L14 + cmpl $2 * SIZE, %ecx + jne .L14 + + movl %edx, %eax + sarl $1, %eax + jle .L15 + ALIGN_3 + +.L16: +#if defined(DOUBLE) || defined(XDOUBLE) + FLD 3 * SIZE(%esi) + FLD 2 * SIZE(%esi) + FLD 1 * SIZE(%esi) + FLD 0 * SIZE(%esi) + FLD 3 * SIZE(%edi) + FLD 2 * SIZE(%edi) + FLD 1 * SIZE(%edi) + FLD 0 * SIZE(%edi) + + FST 0 * SIZE(%esi) + FST 1 * SIZE(%esi) + FST 2 * SIZE(%esi) + FST 3 * SIZE(%esi) + FST 0 * SIZE(%edi) + FST 1 * SIZE(%edi) + FST 2 * SIZE(%edi) + FST 3 * SIZE(%edi) +#else + fldl 2 * SIZE(%esi) + fldl 0 * SIZE(%esi) + fldl 2 * SIZE(%edi) + fldl 0 * SIZE(%edi) + + fstpl 0 * SIZE(%esi) + fstpl 2 * SIZE(%esi) + fstpl 0 * SIZE(%edi) + fstpl 2 * SIZE(%edi) +#endif + addl $4 * SIZE, %esi + addl $4 * SIZE, %edi + decl %eax + jg .L16 + ALIGN_3 + +.L15: + movl %edx, %eax + andl $1, %eax + jle .L27 + ALIGN_3 + +.L22: +#if defined(DOUBLE) || defined(XDOUBLE) + FLD 1 * SIZE(%esi) + FLD 0 * SIZE(%esi) + FLD 1 * SIZE(%edi) + FLD 0 * SIZE(%edi) + FST 0 * SIZE(%esi) + FST 1 * SIZE(%esi) + FST 0 * SIZE(%edi) + FST 1 * SIZE(%edi) +#else + fldl 0 * SIZE(%esi) + fldl 0 * SIZE(%edi) + fstpl 0 * SIZE(%esi) + fstpl 0 * SIZE(%edi) +#endif + + jmp .L27 + ALIGN_3 + +/* INCX != 1 or INCY != 1 */ + +.L14: + movl %edx, %eax + sarl $1, %eax + jle .L28 + ALIGN_2 + +.L29: +#if defined(DOUBLE) || defined(XDOUBLE) + FLD 1 * SIZE(%esi) + FLD 0 * SIZE(%esi) + addl %ebx, %esi + FLD 1 * SIZE(%esi) + FLD 0 * SIZE(%esi) + + FLD 1 * SIZE(%edi) + FLD 0 * SIZE(%edi) + addl %ecx, %edi + FLD 1 * SIZE(%edi) + FLD 0 * SIZE(%edi) + + FST 0 * SIZE(%esi) + FST 1 * SIZE(%esi) + subl %ebx, %esi + FST 0 * SIZE(%esi) + FST 1 * SIZE(%esi) + leal (%esi, %ebx, 2), %esi + + FST 0 * SIZE(%edi) + FST 1 * SIZE(%edi) + subl %ecx, %edi + FST 0 * SIZE(%edi) + FST 1 * SIZE(%edi) + leal (%edi, %ecx, 2), %edi +#else + fldl 0 * SIZE(%esi) + addl %ebx, %esi + fldl 0 * SIZE(%esi) + + fldl 0 * SIZE(%edi) + addl %ecx, %edi + fldl 0 * SIZE(%edi) + + fstpl 0 * SIZE(%esi) + subl %ebx, %esi + fstpl 0 * SIZE(%esi) + leal (%esi, %ebx, 2), %esi + + fstpl 0 * SIZE(%edi) + subl %ecx, %edi + fstpl 0 * SIZE(%edi) + leal (%edi, %ecx, 2), %edi +#endif + + decl %eax + jg .L29 + ALIGN_3 + +.L28: + movl %edx, %eax + andl $1, %eax + jle .L27 + ALIGN_3 + +.L35: +#if defined(DOUBLE) || defined(XDOUBLE) + FLD 1 * SIZE(%esi) + FLD 0 * SIZE(%esi) + FLD 1 * SIZE(%edi) + FLD 0 * SIZE(%edi) + FST 0 * SIZE(%esi) + FST 1 * SIZE(%esi) + FST 0 * SIZE(%edi) + FST 1 * SIZE(%edi) +#else + fldl 0 * SIZE(%esi) + fldl 0 * SIZE(%edi) + fstpl 0 * SIZE(%esi) + fstpl 0 * SIZE(%edi) +#endif + 
ALIGN_3 + +.L27: + xorl %eax,%eax + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/zswap_sse.S b/kernel/x86/zswap_sse.S new file mode 100644 index 0000000000..24d0001669 --- /dev/null +++ b/kernel/x86/zswap_sse.S @@ -0,0 +1,1112 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
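kernel/x86/zswap.S above is the generic x87 implementation of the complex vector swap: the main loop exchanges two complex elements of X and Y per iteration through the FPU stack (.L16), with a separate path for non-unit strides (.L29) and a one-element tail. For orientation only, a minimal C sketch of the semantics it implements; zswap_ref and dcomplex are illustrative names, not identifiers from the patch:

/* Reference sketch, not part of the imported sources: the element-wise
 * exchange performed by kernel/x86/zswap.S.
 */
#include <stddef.h>

typedef struct { double r, i; } dcomplex;

static void zswap_ref(size_t n, dcomplex *x, ptrdiff_t incx,
                                dcomplex *y, ptrdiff_t incy)
{
    for (size_t k = 0; k < n; k++, x += incx, y += incy) {
        dcomplex t = *x;   /* swap one complex element per iteration;   */
        *x = *y;           /* the assembly unrolls this by two and adds */
        *y = t;            /* a separate path for non-unit strides.     */
    }
}

The SSE kernels that follow implement the same exchange but add write prefetching and alignment fix-up so that all 16-byte loads and stores stay aligned.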
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 24 + STACK + ARGS(%esp) +#define STACK_INCX 28 + STACK + ARGS(%esp) +#define STACK_Y 32 + STACK + ARGS(%esp) +#define STACK_INCY 36 + STACK + ARGS(%esp) + +#define M %edx +#define X %esi +#define Y %edi +#define INCX %ebx +#define INCY %ecx + +#include "l1param.h" + + PROLOGUE + PROFCODE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + movl STACK_M, M + movl STACK_X, X + movl STACK_Y, Y + movl STACK_INCX, INCX + movl STACK_INCY, INCY + + sall $ZBASE_SHIFT, INCX + sall $ZBASE_SHIFT, INCY + + testl M, M + jle .L19 + + cmpl $2 * SIZE, INCX + jne .L50 + cmpl $2 * SIZE, INCY + jne .L50 + + addl M, M + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + + cmpl $3, M + jle .L16 + + testl $SIZE, Y + je .L05 + + movss -32 * SIZE(X), %xmm0 + movss -32 * SIZE(Y), %xmm1 + + movss %xmm1, -32 * SIZE(X) + movss %xmm0, -32 * SIZE(Y) + + addl $1 * SIZE, X + addl $1 * SIZE, Y + decl M + ALIGN_3 + +.L05: + testl $2 * SIZE, Y + je .L10 + + movsd -32 * SIZE(X), %xmm0 + movsd -32 * SIZE(Y), %xmm1 + + movlps %xmm1, -32 * SIZE(X) + movlps %xmm0, -32 * SIZE(Y) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + subl $2, M + jle .L19 + ALIGN_3 + +.L10: + cmpl $3, M + jle .L16 + + testl $2 * SIZE, X + jne .L30 + + testl $1 * SIZE, X + jne .L20 + + movl M, %eax + sarl $5, %eax + jle .L13 + ALIGN_3 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -32 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -32 * SIZE(X) + + movaps -28 * SIZE(X), %xmm0 + movaps -28 * SIZE(Y), %xmm1 + + movaps %xmm0, -28 * SIZE(Y) + movaps %xmm1, -28 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -24 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movaps %xmm0, -24 * SIZE(Y) + movaps %xmm1, -24 * SIZE(X) + + movaps -20 * SIZE(X), %xmm0 + movaps -20 * SIZE(Y), %xmm1 + + movaps %xmm0, -20 * SIZE(Y) + movaps %xmm1, -20 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + movaps -12 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + movaps %xmm0, -12 * SIZE(Y) + movaps %xmm1, -12 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -8 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + movaps %xmm0, -8 * SIZE(Y) + movaps %xmm1, -8 * SIZE(X) + + movaps -4 * SIZE(X), %xmm0 + movaps -4 * SIZE(Y), %xmm1 + + movaps %xmm0, -4 * SIZE(Y) + movaps %xmm1, -4 * SIZE(X) + + subl $-32 * SIZE, Y + subl $-32 * SIZE, X + + decl %eax + jg .L11 + ALIGN_3 + +.L13: + testl $16, M + jle .L14 + + movaps -32 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -32 * SIZE(X) + + movaps -28 * SIZE(X), %xmm0 + movaps -28 * SIZE(Y), %xmm1 + + movaps %xmm0, -28 * SIZE(Y) + movaps %xmm1, -28 * SIZE(X) + + movaps -24 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movaps %xmm0, -24 * SIZE(Y) + movaps %xmm1, -24 * SIZE(X) + + movaps -20 * SIZE(X), %xmm0 + movaps -20 * SIZE(Y), %xmm1 + + movaps %xmm0, -20 * SIZE(Y) + movaps %xmm1, -20 * SIZE(X) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L14: + testl $8, M + jle .L15 + + movaps -32 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + 
movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -32 * SIZE(X) + + movaps -28 * SIZE(X), %xmm0 + movaps -28 * SIZE(Y), %xmm1 + + movaps %xmm0, -28 * SIZE(Y) + movaps %xmm1, -28 * SIZE(X) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L15: + testl $4, M + jle .L16 + + movaps -32 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -32 * SIZE(X) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L16: + testl $2, M + jle .L17 + + movsd -32 * SIZE(X), %xmm0 + movsd -32 * SIZE(Y), %xmm1 + + movlps %xmm1, -32 * SIZE(X) + addl $2 * SIZE, X + movlps %xmm0, -32 * SIZE(Y) + addl $2 * SIZE, Y + ALIGN_3 + +.L17: + testl $1, M + jle .L19 + + movss -32 * SIZE(X), %xmm0 + movss -32 * SIZE(Y), %xmm1 + + movss %xmm1, -32 * SIZE(X) + movss %xmm0, -32 * SIZE(Y) + ALIGN_3 + +.L19: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_3 + +.L20: + movaps -33 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + movss %xmm1, -32 * SIZE(X) + PSHUFD2($0x39, %xmm1, %xmm3) + movlps %xmm3, -31 * SIZE(X) + + subl $3, M + + movl M, %eax + sarl $5, %eax + jle .L23 + ALIGN_4 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -29 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -29 * SIZE(X) + + movaps -25 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -25 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -21 * SIZE(X), %xmm2 + movaps -20 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -24 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -21 * SIZE(X) + + movaps -17 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -20 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -17 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -13 * SIZE(X), %xmm2 + movaps -12 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -13 * SIZE(X) + + movaps -9 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -9 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -5 * SIZE(X), %xmm2 + movaps -4 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -8 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -5 * SIZE(X) + + movaps -1 * SIZE(X), %xmm0 + movaps 0 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -4 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -1 * SIZE(X) + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + + decl %eax + jg .L21 + ALIGN_3 + +.L23: + testl $16, M + jle .L24 + + movaps -29 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -29 * SIZE(X) + + movaps 
-25 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -25 * SIZE(X) + + movaps -21 * SIZE(X), %xmm2 + movaps -20 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -24 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -21 * SIZE(X) + + movaps -17 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -20 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -17 * SIZE(X) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L24: + testl $8, M + jle .L25 + + movaps -29 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -29 * SIZE(X) + + movaps -25 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -25 * SIZE(X) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L25: + testl $4, M + jle .L26 + + movaps -29 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -29 * SIZE(X) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L26: + PSHUFD2($0x39, %xmm0, %xmm2) + PSHUFD1($0xff, %xmm0) + + movlps %xmm2, -32 * SIZE(Y) + movss %xmm0, -30 * SIZE(Y) + + testl $2, M + jle .L27 + + movsd -29 * SIZE(X), %xmm0 + movsd -29 * SIZE(Y), %xmm1 + + movlps %xmm0, -29 * SIZE(Y) + movlps %xmm1, -29 * SIZE(X) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L27: + testl $1, M + jle .L29 + + movss -29 * SIZE(X), %xmm0 + movss -29 * SIZE(Y), %xmm1 + + movss %xmm0, -29 * SIZE(Y) + movss %xmm1, -29 * SIZE(X) + ALIGN_3 + +.L29: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_3 + +.L30: + testl $1 * SIZE, X + jne .L40 + + movhps -32 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + movlps %xmm1, -32 * SIZE(X) + subl $2, M + + movl M, %eax + sarl $5, %eax + jle .L33 + ALIGN_4 + +.L31: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -30 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -30 * SIZE(X) + + movaps -26 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -26 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -22 * SIZE(X), %xmm2 + movaps -20 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -24 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -22 * SIZE(X) + + movaps -18 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -20 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -18 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -14 * SIZE(X), %xmm2 + movaps -12 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -14 * SIZE(X) + + movaps -10 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + 
movaps %xmm2, -12 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -10 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -6 * SIZE(X), %xmm2 + movaps -4 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -8 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -6 * SIZE(X) + + movaps -2 * SIZE(X), %xmm0 + movaps 0 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -4 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -2 * SIZE(X) + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + + decl %eax + jg .L31 + ALIGN_3 + +.L33: + testl $16, M + jle .L34 + + movaps -30 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -30 * SIZE(X) + + movaps -26 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -26 * SIZE(X) + + movaps -22 * SIZE(X), %xmm2 + movaps -20 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -24 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -22 * SIZE(X) + + movaps -18 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -20 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -18 * SIZE(X) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L34: + testl $8, M + jle .L35 + + movaps -30 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -30 * SIZE(X) + + movaps -26 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -26 * SIZE(X) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L35: + testl $4, M + jle .L36 + + movaps -30 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -30 * SIZE(X) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L36: + movhps %xmm0, -32 * SIZE(Y) + + testl $2, M + jle .L37 + + movsd -30 * SIZE(X), %xmm0 + movsd -30 * SIZE(Y), %xmm1 + + movlps %xmm0, -30 * SIZE(Y) + movlps %xmm1, -30 * SIZE(X) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L37: + testl $1, M + jle .L39 + + movss -30 * SIZE(X), %xmm0 + movss -30 * SIZE(Y), %xmm1 + + movss %xmm0, -30 * SIZE(Y) + movss %xmm1, -30 * SIZE(X) + ALIGN_3 + +.L39: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_3 + +.L40: + movaps -35 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + movss %xmm1, -32 * SIZE(X) + + subl $3, M + + movl M, %eax + sarl $5, %eax + jle .L43 + ALIGN_4 + +.L41: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -31 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -31 * SIZE(X) + + movaps -27 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -27 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -23 * SIZE(X), %xmm2 + movaps -20 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -24 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -23 * 
SIZE(X) + + movaps -19 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -20 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -19 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -15 * SIZE(X), %xmm2 + movaps -12 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -15 * SIZE(X) + + movaps -11 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -11 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -7 * SIZE(X), %xmm2 + movaps -4 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -8 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -7 * SIZE(X) + + movaps -3 * SIZE(X), %xmm0 + movaps 0 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -4 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -3 * SIZE(X) + + subl $-32 * SIZE, X + subl $-32 * SIZE, Y + + decl %eax + jg .L41 + ALIGN_3 + +.L43: + testl $16, M + jle .L44 + + movaps -31 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -31 * SIZE(X) + + movaps -27 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -27 * SIZE(X) + + movaps -23 * SIZE(X), %xmm2 + movaps -20 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -24 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -23 * SIZE(X) + + movaps -19 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -20 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -19 * SIZE(X) + + addl $16 * SIZE, X + addl $16 * SIZE, Y + ALIGN_3 + +.L44: + testl $8, M + jle .L45 + + movaps -31 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -31 * SIZE(X) + + movaps -27 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -27 * SIZE(X) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L45: + testl $4, M + jle .L46 + + movaps -31 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -31 * SIZE(X) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L46: + movsd -31 * SIZE(X), %xmm2 + + PSHUFD2($0x39, %xmm1, %xmm1) + movlps %xmm1, -31 * SIZE(X) + + PSHUFD1($0xff, %xmm0) + + movss %xmm0, -32 * SIZE(Y) + movlps %xmm2, -31 * SIZE(Y) + + addl $3 * SIZE, X + addl $3 * SIZE, Y + + testl $2, M + jle .L47 + + movsd -32 * SIZE(X), %xmm0 + movsd -32 * SIZE(Y), %xmm1 + + movlps 
%xmm0, -32 * SIZE(Y) + movlps %xmm1, -32 * SIZE(X) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L47: + testl $1, M + jle .L49 + + movss -32 * SIZE(X), %xmm0 + movss -32 * SIZE(Y), %xmm1 + + movss %xmm0, -32 * SIZE(Y) + movss %xmm1, -32 * SIZE(X) + ALIGN_3 + +.L49: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_3 + +.L50: + movl M, %eax + sarl $2, %eax + jle .L55 + ALIGN_3 + +.L51: + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movlps %xmm1, (X) + addl INCX, X + movlps %xmm0, (Y) + addl INCY, Y + + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movlps %xmm1, (X) + addl INCX, X + movlps %xmm0, (Y) + addl INCY, Y + + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movlps %xmm1, (X) + addl INCX, X + movlps %xmm0, (Y) + addl INCY, Y + + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movlps %xmm1, (X) + addl INCX, X + movlps %xmm0, (Y) + addl INCY, Y + + decl %eax + jg .L51 + ALIGN_3 + +.L55: + movl M, %eax + andl $3, %eax + jle .L57 + ALIGN_3 + +.L56: + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movlps %xmm1, (X) + addl INCX, X + movlps %xmm0, (Y) + addl INCY, Y + + decl %eax + jg .L56 + ALIGN_3 + +.L57: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/zswap_sse2.S b/kernel/x86/zswap_sse2.S new file mode 100644 index 0000000000..d900ea547d --- /dev/null +++ b/kernel/x86/zswap_sse2.S @@ -0,0 +1,978 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
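A recurring pattern in kernel/x86/zswap_sse.S above (the movss/shufps pairs and the SHUFPD_1 combines, used again in the double-precision zswap_sse2.S kernel whose body follows) is how misalignment between X and Y is handled: after a short scalar prologue, the loop keeps the previously loaded aligned vector and stitches its tail together with the head of the next aligned load, so every movaps stays aligned even though the two streams are offset from each other. The sketch below illustrates that idea in isolation with SSE2 intrinsics; copy_offset_by_one is a hypothetical helper, double precision for brevity, and it shows only the stitching, not the swap itself:

/* Illustration of the aligned-load-plus-shuffle idea used above; a
 * hypothetical helper, not code from the patch.  Copies n doubles from
 * src, where src sits 8 bytes past a 16-byte boundary, into a
 * 16-byte-aligned dst without any unaligned vector access.
 * Preconditions: n even, n >= 2, src[-1] readable (in the kernels the
 * element in that slot has already been handled by the prologue).
 */
#include <emmintrin.h>
#include <stddef.h>

static void copy_offset_by_one(double *dst, const double *src, size_t n)
{
    __m128d prev = _mm_load_pd(src - 1);          /* {src[-1], src[0]}, aligned */
    size_t k = 0;
    for (; k + 2 < n; k += 2) {
        __m128d next = _mm_load_pd(src + k + 1);  /* next aligned pair          */
        /* {prev[1], next[0]} are the two logically consecutive elements */
        _mm_store_pd(dst + k, _mm_shuffle_pd(prev, next, 1));
        prev = next;
    }
    /* last pair: load only src[n-1] so nothing past the end is read */
    __m128d tail = _mm_load_sd(src + n - 1);      /* {src[n-1], 0.0}            */
    _mm_store_pd(dst + k, _mm_shuffle_pd(prev, tail, 1));
}

In the single-precision kernel the same stitching is done one float at a time with movss plus shufps $0x39 / $0x93 rotations, since the offset there can be any multiple of 4 bytes.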
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_X 32 + STACK + ARGS(%esp) +#define STACK_INCX 36 + STACK + ARGS(%esp) +#define STACK_Y 40 + STACK + ARGS(%esp) +#define STACK_INCY 44 + STACK + ARGS(%esp) + +#define M %edx +#define X %esi +#define Y %edi +#define INCX %ebx +#define INCY %ecx + +#include "l1param.h" + + PROLOGUE + PROFCODE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + movl STACK_M, M + movl STACK_X, X + movl STACK_Y, Y + movl STACK_INCX, INCX + movl STACK_INCY, INCY + + sall $ZBASE_SHIFT, INCX + sall $ZBASE_SHIFT, INCY + + testl M, M + jle .L19 + + cmpl $2 * SIZE, INCX + jne .L50 + cmpl $2 * SIZE, INCY + jne .L50 + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + + testl $SIZE, Y + jne .L30 + + testl $SIZE, X + jne .L20 + + movl M, %eax + sarl $3, %eax + jle .L13 + ALIGN_3 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + movaps -14 * SIZE(X), %xmm0 + movaps -14 * SIZE(Y), %xmm1 + + movaps %xmm0, -14 * SIZE(Y) + movaps %xmm1, -14 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -12 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + movaps %xmm0, -12 * SIZE(Y) + movaps %xmm1, -12 * SIZE(X) + + movaps -10 * SIZE(X), %xmm0 + movaps -10 * SIZE(Y), %xmm1 + + movaps %xmm0, -10 * SIZE(Y) + movaps %xmm1, -10 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -8 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + movaps %xmm0, -8 * SIZE(Y) + movaps %xmm1, -8 * SIZE(X) + + movaps -6 * SIZE(X), %xmm0 + movaps -6 * SIZE(Y), %xmm1 + + movaps %xmm0, -6 * SIZE(Y) + movaps %xmm1, -6 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -4 * SIZE(X), %xmm0 + movaps -4 * SIZE(Y), %xmm1 + + movaps %xmm0, -4 * SIZE(Y) + movaps %xmm1, -4 * SIZE(X) + + movaps -2 * SIZE(X), %xmm0 + movaps -2 * SIZE(Y), %xmm1 + + movaps %xmm0, -2 * SIZE(Y) + movaps %xmm1, -2 * SIZE(X) + + subl $-16 * SIZE, Y + subl $-16 * SIZE, X + + decl %eax + jg .L11 + ALIGN_3 + +.L13: + testl $4, M + jle .L14 + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + movaps -14 * SIZE(X), %xmm0 + movaps -14 * SIZE(Y), %xmm1 + + movaps %xmm0, -14 * SIZE(Y) + movaps %xmm1, -14 * SIZE(X) + + movaps -12 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + movaps %xmm0, -12 * SIZE(Y) + movaps %xmm1, -12 * SIZE(X) + + movaps -10 * SIZE(X), %xmm0 + movaps -10 * SIZE(Y), %xmm1 + + movaps %xmm0, -10 * SIZE(Y) + movaps %xmm1, -10 * SIZE(X) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L14: + testl $2, M + jle .L15 + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + movaps -14 * SIZE(X), %xmm0 + movaps -14 * SIZE(Y), %xmm1 + + movaps %xmm0, -14 * SIZE(Y) + movaps %xmm1, -14 * SIZE(X) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L15: + testl $1, M + jle .L19 + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L19: + xorl %eax,%eax + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_3 + 
+.L20: + movhps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movlps %xmm1, -16 * SIZE(X) + decl M + jle .L29 + + movl M, %eax + sarl $3, %eax + jle .L23 + ALIGN_4 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -15 * SIZE(X), %xmm2 + movaps -14 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -15 * SIZE(X) + + movaps -13 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -14 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -13 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -11 * SIZE(X), %xmm2 + movaps -10 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -12 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -11 * SIZE(X) + + movaps -9 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -10 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -9 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -7 * SIZE(X), %xmm2 + movaps -6 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -8 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -7 * SIZE(X) + + movaps -5 * SIZE(X), %xmm0 + movaps -4 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -6 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -5 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -3 * SIZE(X), %xmm2 + movaps -2 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -4 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -3 * SIZE(X) + + movaps -1 * SIZE(X), %xmm0 + movaps 0 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -2 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -1 * SIZE(X) + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + + decl %eax + jg .L21 + ALIGN_3 + +.L23: + testl $4, M + jle .L24 + + movaps -15 * SIZE(X), %xmm2 + movaps -14 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -15 * SIZE(X) + + movaps -13 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -14 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -13 * SIZE(X) + + movaps -11 * SIZE(X), %xmm2 + movaps -10 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -12 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -11 * SIZE(X) + + movaps -9 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -10 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -9 * SIZE(X) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L24: + testl $2, M + jle .L25 + + movaps -15 * SIZE(X), %xmm2 + movaps -14 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -15 * SIZE(X) + + movaps -13 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -14 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -13 * SIZE(X) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L25: + testl $1, M + jle .L29 + + movaps -15 * SIZE(X), %xmm2 + movaps -14 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -15 * SIZE(X) + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L29: + movaps -15 * SIZE(X), %xmm2 + + movhps %xmm1, -15 * SIZE(X) + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + popl 
%ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_3 + +.L30: + testl $SIZE, X + jne .L40 + + movhps -16 * SIZE(Y), %xmm0 + movaps -16 * SIZE(X), %xmm1 + + movlps %xmm1, -16 * SIZE(Y) + decl M + jle .L39 + + movl M, %eax + sarl $3, %eax + jle .L33 + ALIGN_4 + +.L31: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -15 * SIZE(Y), %xmm2 + movaps -14 * SIZE(X), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(X) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -15 * SIZE(Y) + + movaps -13 * SIZE(Y), %xmm0 + movaps -12 * SIZE(X), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -14 * SIZE(X) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -13 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -11 * SIZE(Y), %xmm2 + movaps -10 * SIZE(X), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -12 * SIZE(X) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -11 * SIZE(Y) + + movaps -9 * SIZE(Y), %xmm0 + movaps -8 * SIZE(X), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -10 * SIZE(X) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -9 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -7 * SIZE(Y), %xmm2 + movaps -6 * SIZE(X), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -8 * SIZE(X) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -7 * SIZE(Y) + + movaps -5 * SIZE(Y), %xmm0 + movaps -4 * SIZE(X), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -6 * SIZE(X) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -5 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -3 * SIZE(Y), %xmm2 + movaps -2 * SIZE(X), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -4 * SIZE(X) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -3 * SIZE(Y) + + movaps -1 * SIZE(Y), %xmm0 + movaps 0 * SIZE(X), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -2 * SIZE(X) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -1 * SIZE(Y) + + subl $-16 * SIZE, X + subl $-16 * SIZE, Y + + decl %eax + jg .L31 + ALIGN_3 + +.L33: + testl $4, M + jle .L34 + + movaps -15 * SIZE(Y), %xmm2 + movaps -14 * SIZE(X), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(X) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -15 * SIZE(Y) + + movaps -13 * SIZE(Y), %xmm0 + movaps -12 * SIZE(X), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -14 * SIZE(X) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -13 * SIZE(Y) + + movaps -11 * SIZE(Y), %xmm2 + movaps -10 * SIZE(X), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -12 * SIZE(X) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -11 * SIZE(Y) + + movaps -9 * SIZE(Y), %xmm0 + movaps -8 * SIZE(X), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -10 * SIZE(X) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -9 * SIZE(Y) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L34: + testl $2, M + jle .L35 + + movaps -15 * SIZE(Y), %xmm2 + movaps -14 * SIZE(X), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(X) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -15 * SIZE(Y) + + movaps -13 * SIZE(Y), %xmm0 + movaps -12 * SIZE(X), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -14 * SIZE(X) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -13 * SIZE(Y) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L35: + testl $1, M + jle .L39 + + movaps -15 * SIZE(Y), %xmm2 + movaps -14 * SIZE(X), %xmm3 + + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -15 * SIZE(Y) + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(X) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L39: + movaps -15 * SIZE(Y), %xmm2 + + 
movhps %xmm1, -15 * SIZE(Y) + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(X) + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_3 + +.L40: + movsd -16 * SIZE(X), %xmm0 + movsd -16 * SIZE(Y), %xmm1 + + movlps %xmm0, -16 * SIZE(Y) + movlps %xmm1, -16 * SIZE(X) + + addl $SIZE, X + addl $SIZE, Y + decl M + jle .L49 + + movl M, %eax + sarl $3, %eax + jle .L43 + ALIGN_3 + +.L41: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + movaps -14 * SIZE(X), %xmm0 + movaps -14 * SIZE(Y), %xmm1 + + movaps %xmm0, -14 * SIZE(Y) + movaps %xmm1, -14 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -12 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + movaps %xmm0, -12 * SIZE(Y) + movaps %xmm1, -12 * SIZE(X) + + movaps -10 * SIZE(X), %xmm0 + movaps -10 * SIZE(Y), %xmm1 + + movaps %xmm0, -10 * SIZE(Y) + movaps %xmm1, -10 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -8 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + movaps %xmm0, -8 * SIZE(Y) + movaps %xmm1, -8 * SIZE(X) + + movaps -6 * SIZE(X), %xmm0 + movaps -6 * SIZE(Y), %xmm1 + + movaps %xmm0, -6 * SIZE(Y) + movaps %xmm1, -6 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -4 * SIZE(X), %xmm0 + movaps -4 * SIZE(Y), %xmm1 + + movaps %xmm0, -4 * SIZE(Y) + movaps %xmm1, -4 * SIZE(X) + + movaps -2 * SIZE(X), %xmm0 + movaps -2 * SIZE(Y), %xmm1 + + movaps %xmm0, -2 * SIZE(Y) + movaps %xmm1, -2 * SIZE(X) + + subl $-16 * SIZE, Y + subl $-16 * SIZE, X + + decl %eax + jg .L41 + ALIGN_3 + +.L43: + testl $4, M + jle .L44 + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + movaps -14 * SIZE(X), %xmm0 + movaps -14 * SIZE(Y), %xmm1 + + movaps %xmm0, -14 * SIZE(Y) + movaps %xmm1, -14 * SIZE(X) + + movaps -12 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + movaps %xmm0, -12 * SIZE(Y) + movaps %xmm1, -12 * SIZE(X) + + movaps -10 * SIZE(X), %xmm0 + movaps -10 * SIZE(Y), %xmm1 + + movaps %xmm0, -10 * SIZE(Y) + movaps %xmm1, -10 * SIZE(X) + + addl $8 * SIZE, X + addl $8 * SIZE, Y + ALIGN_3 + +.L44: + testl $2, M + jle .L45 + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + movaps -14 * SIZE(X), %xmm0 + movaps -14 * SIZE(Y), %xmm1 + + movaps %xmm0, -14 * SIZE(Y) + movaps %xmm1, -14 * SIZE(X) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + ALIGN_3 + +.L45: + testl $1, M + jle .L49 + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + addl $2 * SIZE, X + addl $2 * SIZE, Y + ALIGN_3 + +.L49: + movsd -16 * SIZE(X), %xmm0 + movsd -16 * SIZE(Y), %xmm1 + + movlps %xmm0, -16 * SIZE(Y) + movlps %xmm1, -16 * SIZE(X) + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_3 + +.L50: + testl $SIZE, X + jne .L60 + testl $SIZE, Y + jne .L60 + + movl M, %eax + sarl $2, %eax + jle .L55 + ALIGN_3 + +.L51: + movaps (X), %xmm0 + movaps (Y), %xmm1 + + movaps %xmm1, (X) + addl INCX, X + movaps %xmm0, (Y) + addl INCY, Y + + movaps (X), %xmm0 + movaps (Y), %xmm1 + + movaps %xmm1, (X) + addl INCX, X + movaps %xmm0, (Y) + addl INCY, Y + + movaps (X), %xmm0 + movaps (Y), %xmm1 + + movaps %xmm1, (X) + addl INCX, X + movaps %xmm0, (Y) + addl INCY, Y + + movaps (X), 
%xmm0 + movaps (Y), %xmm1 + + movaps %xmm1, (X) + addl INCX, X + movaps %xmm0, (Y) + addl INCY, Y + + decl %eax + jg .L51 + ALIGN_3 + +.L55: + movl M, %eax + andl $3, %eax + jle .L57 + ALIGN_3 + +.L56: + movaps (X), %xmm0 + movaps (Y), %xmm1 + + movaps %xmm1, (X) + addl INCX, X + movaps %xmm0, (Y) + addl INCY, Y + + decl %eax + jg .L56 + ALIGN_3 + +.L57: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + ALIGN_3 + +.L60: + movl M, %eax + sarl $2, %eax + jle .L65 + ALIGN_3 + +.L61: + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + movsd 0 * SIZE(Y), %xmm1 + movhps 1 * SIZE(Y), %xmm1 + + movlps %xmm1, 0 * SIZE(X) + movhps %xmm1, 1 * SIZE(X) + addl INCX, X + movlps %xmm0, 0 * SIZE(Y) + movhps %xmm0, 1 * SIZE(Y) + addl INCY, Y + + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + movsd 0 * SIZE(Y), %xmm1 + movhps 1 * SIZE(Y), %xmm1 + + movlps %xmm1, 0 * SIZE(X) + movhps %xmm1, 1 * SIZE(X) + addl INCX, X + movlps %xmm0, 0 * SIZE(Y) + movhps %xmm0, 1 * SIZE(Y) + addl INCY, Y + + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + movsd 0 * SIZE(Y), %xmm1 + movhps 1 * SIZE(Y), %xmm1 + + movlps %xmm1, 0 * SIZE(X) + movhps %xmm1, 1 * SIZE(X) + addl INCX, X + movlps %xmm0, 0 * SIZE(Y) + movhps %xmm0, 1 * SIZE(Y) + addl INCY, Y + + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + movsd 0 * SIZE(Y), %xmm1 + movhps 1 * SIZE(Y), %xmm1 + + movlps %xmm1, 0 * SIZE(X) + movhps %xmm1, 1 * SIZE(X) + addl INCX, X + movlps %xmm0, 0 * SIZE(Y) + movhps %xmm0, 1 * SIZE(Y) + addl INCY, Y + + decl %eax + jg .L61 + ALIGN_3 + +.L65: + movl M, %eax + andl $3, %eax + jle .L67 + ALIGN_3 + +.L66: + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + movsd 0 * SIZE(Y), %xmm1 + movhps 1 * SIZE(Y), %xmm1 + + movlps %xmm1, 0 * SIZE(X) + movhps %xmm1, 1 * SIZE(X) + addl INCX, X + movlps %xmm0, 0 * SIZE(Y) + movhps %xmm0, 1 * SIZE(Y) + addl INCY, Y + + decl %eax + jg .L66 + ALIGN_3 + +.L67: + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/ztrsm_kernel_LN_2x1_core2.S b/kernel/x86/ztrsm_kernel_LN_2x1_core2.S new file mode 100644 index 0000000000..1d3107a419 --- /dev/null +++ b/kernel/x86/ztrsm_kernel_LN_2x1_core2.S @@ -0,0 +1,1057 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define PREFETCHSIZE (8 * 4) + +#if !defined(HAVE_SSE2) || !defined(HAVE_MMX) +#error You have to check your configuration. +#endif + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_ALPHA_R 16 + STACK + ARGS(%esi) +#define STACK_ALPHA_I 24 + STACK + ARGS(%esi) +#define STACK_A 32 + STACK + ARGS(%esi) +#define STACK_B 36 + STACK + ARGS(%esi) +#define STACK_C 40 + STACK + ARGS(%esi) +#define STACK_LDC 44 + STACK + ARGS(%esi) +#define STACK_OFFT 48 + STACK + ARGS(%esi) + +#define POSINV 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define AORIG 56(%esp) +#define BORIG 60(%esp) +#define BUFFER 128(%esp) + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#define B %edi +#define LDC %ebp +#define AA %edx +#define BB %ecx +#define CO1 %esi + +#define ADD1 addpd +#define ADD2 addpd + +#ifndef CONJ +#define NN +#else +#if defined(LN) || defined(LT) +#define CN +#else +#define NC +#endif +#endif + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp # align stack + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movd STACK_M, %mm0 + movl STACK_N, %eax + movd STACK_K, %mm1 + movd STACK_A, %mm2 + movl STACK_B, B + movd STACK_C, %mm3 + movl STACK_LDC, LDC + movd STACK_OFFT, %mm4 + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 # Generate mask + pxor %xmm2, %xmm2 + + movsd %xmm2, 0 + POSINV + movsd %xmm7, 8 + POSINV + + movd %mm1, K + movl %eax, N + movd %mm0, M + movd %mm2, A + movd %mm3, C + movl %esi, OLD_STACK + movd %mm4, OFFSET + movd %mm4, KK + + sall $ZBASE_SHIFT, LDC + + subl $-16 * SIZE, A + subl $-16 * SIZE, B + +#ifdef LN + movl M, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + sall $ZBASE_SHIFT, %eax + imull K, %eax + addl %eax, B + + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + movl %eax, J # j = n + testl %eax, %eax + jle .L999 + ALIGN_2 + +.L01: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal 16 * SIZE + BUFFER, BB + +#ifdef RT + movl K, %eax + sall 
$ZBASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $ZBASE_SHIFT, %eax + addl %eax, B + leal (BB, %eax, 2), BB +#endif + +#if defined(LT) + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + jle .L03 + ALIGN_2 + +.L02: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + movddup -14 * SIZE(B), %xmm2 + movddup -13 * SIZE(B), %xmm3 + movddup -12 * SIZE(B), %xmm4 + movddup -11 * SIZE(B), %xmm5 + movddup -10 * SIZE(B), %xmm6 + movddup -9 * SIZE(B), %xmm7 + + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm1, -14 * SIZE(BB) + movapd %xmm2, -12 * SIZE(BB) + movapd %xmm3, -10 * SIZE(BB) + movapd %xmm4, -8 * SIZE(BB) + movapd %xmm5, -6 * SIZE(BB) + movapd %xmm6, -4 * SIZE(BB) + movapd %xmm7, -2 * SIZE(BB) + + addl $ 8 * SIZE, B + subl $-16 * SIZE, BB + decl %eax + jne .L02 + ALIGN_2 + +.L03: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax + BRANCH + jle .L05 + ALIGN_2 + +.L04: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm1, -14 * SIZE(BB) + + addl $ 2 * SIZE, B + addl $ 4 * SIZE, BB + decl %eax + jne .L04 + ALIGN_4 + +.L05: +#if defined(LT) || defined(RN) + movl A, %eax + movl %eax, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + subl LDC, C +#endif + + movl C, CO1 + +#ifndef RT + addl LDC, C +#endif + + movl M, %ebx + testl $1, %ebx + je .L50 + +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal 16 * SIZE + BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movapd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -8 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movapd -8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + jle .L52 + +.L51: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BB), %xmm0 + ADD1 %xmm1, %xmm4 + movapd -12 * SIZE(BB), %xmm1 + ADD2 %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm1 + mulpd -10 * SIZE(BB), %xmm0 + ADD1 %xmm1, %xmm6 + movapd 0 * SIZE(BB), %xmm1 + ADD2 %xmm0, %xmm7 + movapd -12 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd -6 * SIZE(BB), %xmm0 + ADD1 %xmm3, %xmm4 + movapd -4 * SIZE(BB), %xmm3 + ADD2 %xmm0, %xmm5 + movapd -10 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd -2 * SIZE(BB), %xmm0 + ADD1 %xmm3, %xmm6 + movapd 8 * SIZE(BB), %xmm3 + ADD2 %xmm0, %xmm7 + movapd 0 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm1 + mulpd 2 * SIZE(BB), %xmm2 + ADD1 %xmm1, %xmm4 + movapd 4 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + movapd -6 * SIZE(AA), %xmm2 + mulpd %xmm2, %xmm1 + mulpd 6 * SIZE(BB), %xmm2 + ADD1 %xmm1, %xmm6 + movapd 16 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm7 + movapd -4 * SIZE(AA), %xmm2 + mulpd %xmm2, %xmm3 + mulpd 10 * SIZE(BB), %xmm2 + ADD1 %xmm3, %xmm4 + movapd 12 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm5 + movapd -2 * SIZE(AA), %xmm2 + mulpd %xmm2, %xmm3 + mulpd 14 * SIZE(BB), %xmm2 + ADD1 %xmm3, %xmm6 + movapd 24 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + movapd 8 * SIZE(AA), %xmm2 + + subl $-16 * SIZE, AA + addl $ 32 * SIZE, BB + decl %eax # l-- + jg .L51 + ALIGN_2 + 
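+# (editorial note, not part of the imported source) .L52/.L53 below consume the
+# k % 8 leftover iterations one complex element at a time (AA advances 2*SIZE,
+# BB advances 4*SIZE per step); .L54 then folds the paired accumulators
+# (xmm6 into xmm4, xmm7 into xmm5) before the POSINV/SHUFPD step combines the
+# real and imaginary parts.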
+.L52: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # l = (k & 3) + jle .L54 + ALIGN_2 + +.L53: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BB), %xmm0 + ADD1 %xmm1, %xmm4 + movapd -12 * SIZE(BB), %xmm1 + ADD2 %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax # l-- + jg .L53 + +.L54: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal 16 * SIZE + BUFFER, BB + + sall $ZBASE_SHIFT, %eax + addl %eax, AA + addl %eax, B + leal (BB, %eax, 2), BB +#endif + + movapd POSINV, %xmm1 + + SHUFPD_1 %xmm5, %xmm5 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm1, %xmm5 +#else + xorpd %xmm1, %xmm4 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm5, %xmm4 +#else + addpd %xmm5, %xmm4 +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm5 + + subpd %xmm4, %xmm5 +#else + movapd -16 * SIZE(AA), %xmm5 + + subpd %xmm4, %xmm5 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm1, %xmm1 +#endif + +#ifdef LN + movddup -16 * SIZE(AA), %xmm2 + movddup -15 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef LT + movddup -16 * SIZE(AA), %xmm2 + movddup -15 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm2 + movddup -15 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef RT + movddup -16 * SIZE(B), %xmm2 + movddup -15 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + movsd %xmm5, 0 * SIZE(CO1) + movhpd %xmm5, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm5, -16 * SIZE(B) + + movddup %xmm5, %xmm4 + unpckhpd %xmm5, %xmm5 + + movapd %xmm4, -16 * SIZE(BB) + movapd %xmm5, -14 * SIZE(BB) +#else + movapd %xmm5, -16 * SIZE(AA) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L50: + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L99 + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal 16 * SIZE + BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movapd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -8 * SIZE(AA), %xmm3 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#ifdef LN + prefetchnta -4 * SIZE(CO1) +#else + prefetchnta 4 * 
SIZE(CO1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + ADD1 %xmm1, %xmm4 + movapd -14 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + ADD2 %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm1 + movapd -12 * SIZE(AA), %xmm0 + ADD1 %xmm2, %xmm6 + ADD2 %xmm1, %xmm7 + + movapd -12 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + ADD1 %xmm1, %xmm4 + movapd -10 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + ADD2 %xmm0, %xmm5 + movapd -10 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm1 + movapd 0 * SIZE(AA), %xmm0 + ADD1 %xmm2, %xmm6 + ADD2 %xmm1, %xmm7 + + movapd -8 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm3, %xmm1 + ADD1 %xmm1, %xmm4 + movapd -6 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm3 + ADD2 %xmm3, %xmm5 + movapd -6 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm2 + mulpd %xmm3, %xmm1 + movapd -4 * SIZE(AA), %xmm3 + ADD1 %xmm2, %xmm6 + ADD2 %xmm1, %xmm7 + + movapd -4 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm3, %xmm1 + ADD1 %xmm1, %xmm4 + movapd -2 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm3 + ADD2 %xmm3, %xmm5 + movapd -2 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm2 + mulpd %xmm3, %xmm1 + movapd 8 * SIZE(AA), %xmm3 + ADD1 %xmm2, %xmm6 + ADD2 %xmm1, %xmm7 + movapd 0 * SIZE(BB), %xmm1 + + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + ADD1 %xmm1, %xmm4 + movapd 2 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + ADD2 %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm1 + movapd 4 * SIZE(AA), %xmm0 + ADD1 %xmm2, %xmm6 + ADD2 %xmm1, %xmm7 + + movapd 4 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + ADD1 %xmm1, %xmm4 + movapd 6 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + ADD2 %xmm0, %xmm5 + movapd 6 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm1 + movapd 16 * SIZE(AA), %xmm0 + ADD1 %xmm2, %xmm6 + ADD2 %xmm1, %xmm7 + + movapd 8 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm3, %xmm1 + ADD1 %xmm1, %xmm4 + movapd 10 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm3 + ADD2 %xmm3, %xmm5 + movapd 10 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm2 + mulpd %xmm3, %xmm1 + ADD1 %xmm2, %xmm6 + movapd 12 * SIZE(AA), %xmm3 + ADD2 %xmm1, %xmm7 + + movapd 12 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm3, %xmm1 + ADD1 %xmm1, %xmm4 + movapd 14 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm3 + ADD2 %xmm3, %xmm5 + movapd 14 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm2 + mulpd %xmm3, %xmm1 + subl $-32 * SIZE, BB + movapd 24 * SIZE(AA), %xmm3 + subl $-32 * SIZE, AA + ADD1 %xmm2, %xmm6 + ADD2 %xmm1, %xmm7 + movapd -16 * SIZE(BB), %xmm1 + + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + +.L16: + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + ADD1 %xmm1, %xmm4 + movapd -14 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm3 + mulpd %xmm0, %xmm1 + movapd -14 * SIZE(AA), %xmm0 + ADD2 %xmm1, %xmm5 + movapd -12 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + ADD1 %xmm2, %xmm6 + mulpd %xmm0, %xmm3 + movapd -12 * SIZE(AA), %xmm0 + ADD2 %xmm3, %xmm7 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L14: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal 16 * SIZE + BUFFER, BB + + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 2), AA + addl %eax, B + leal (BB, %eax, 2), BB +#endif + + 
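+# (editorial note, not part of the imported source) The section below combines the
+# accumulators into two complex values: SHUFPD_1 swaps the real/imaginary halves of
+# xmm5/xmm7, the xor with the POSINV mask flips the sign of one component according
+# to the conjugation case, and the results are subtracted from the packed panel.
+# The LN/LT/RN/RT blocks then perform the small triangular solve using
+# movddup-broadcast diagonal data and store the solution to C and back into the
+# packed operand (B/BB or AA).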
movapd POSINV, %xmm1 + + SHUFPD_1 %xmm5, %xmm5 + SHUFPD_1 %xmm7, %xmm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm1, %xmm5 + xorpd %xmm1, %xmm7 +#else + xorpd %xmm1, %xmm4 + xorpd %xmm1, %xmm6 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 +#else + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm5 + movapd -14 * SIZE(B), %xmm7 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#else + movapd -16 * SIZE(AA), %xmm5 + movapd -14 * SIZE(AA), %xmm7 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm1, %xmm1 +#endif + +#ifdef LN + movddup -10 * SIZE(AA), %xmm2 + movddup -9 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm6, %xmm7 + + movddup -12 * SIZE(AA), %xmm2 + movddup -11 * SIZE(AA), %xmm3 + + movapd %xmm7, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm4 + mulpd %xmm3, %xmm6 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm5 + + movddup -16 * SIZE(AA), %xmm2 + movddup -15 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef LT + movddup -16 * SIZE(AA), %xmm2 + movddup -15 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 + + movddup -14 * SIZE(AA), %xmm2 + movddup -13 * SIZE(AA), %xmm3 + + movapd %xmm5, %xmm4 + pshufd $0x4e, %xmm5, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm4 + mulpd %xmm3, %xmm6 + + subpd %xmm4, %xmm7 + subpd %xmm6, %xmm7 + + movddup -10 * SIZE(AA), %xmm2 + movddup -9 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm6, %xmm7 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm2 + movddup -15 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm4 + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm4, %xmm5 + addpd %xmm6, %xmm7 +#endif + +#ifdef RT + movddup -16 * SIZE(B), %xmm2 + movddup -15 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm4 + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm4, %xmm5 + addpd %xmm6, %xmm7 +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + + movsd %xmm5, 0 * SIZE(CO1) + movhpd %xmm5, 1 * SIZE(CO1) + movsd %xmm7, 2 * SIZE(CO1) + movhpd %xmm7, 3 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm5, -16 * SIZE(B) + movapd %xmm7, -14 * SIZE(B) + + movddup %xmm5, %xmm4 + unpckhpd %xmm5, %xmm5 + movddup %xmm7, %xmm6 + unpckhpd %xmm7, %xmm7 + + movapd %xmm4, -16 * SIZE(BB) + movapd %xmm5, -14 * SIZE(BB) + movapd %xmm6, -12 * SIZE(BB) + movapd %xmm7, -10 * SIZE(BB) +#else + movapd %xmm5, -16 * SIZE(AA) + movapd %xmm7, -14 * SIZE(AA) + +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK 
+#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L10 + + +.L99: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + + decl J # j -- + jg .L01 + +.L999: + movl OLD_STACK, %esp + + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/ztrsm_kernel_LN_2x1_sse2.S b/kernel/x86/ztrsm_kernel_LN_2x1_sse2.S new file mode 100644 index 0000000000..7aef336961 --- /dev/null +++ b/kernel/x86/ztrsm_kernel_LN_2x1_sse2.S @@ -0,0 +1,1163 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define PREFETCHSIZE (8 * 4) + +#if !defined(HAVE_SSE2) || !defined(HAVE_MMX) +#error You have to check your configuration. 
+#endif + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_ALPHA_R 16 + STACK + ARGS(%esi) +#define STACK_ALPHA_I 24 + STACK + ARGS(%esi) +#define STACK_A 32 + STACK + ARGS(%esi) +#define STACK_B 36 + STACK + ARGS(%esi) +#define STACK_C 40 + STACK + ARGS(%esi) +#define STACK_LDC 44 + STACK + ARGS(%esi) +#define STACK_OFFT 48 + STACK + ARGS(%esi) + +#define POSINV 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define AORIG 56(%esp) +#define BORIG 60(%esp) +#define BUFFER 128(%esp) + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#define B %edi +#define LDC %ebp +#define AA %edx +#define BB %ecx +#define CO1 %esi + +#define KERNEL1(address) \ + movq (PREFETCHSIZE + 0) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 0 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 2 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 4 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulpd %xmm0, %xmm2; \ + mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 6 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 16 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + movq (PREFETCHSIZE + 8) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm1, %xmm3; \ + mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 8 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 10 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 12 * SIZE + (address) * SIZE(AA), %xmm1 + +#define KERNEL4(address) \ + mulpd %xmm1, %xmm3; \ + mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 14 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 24 * SIZE + (address) * SIZE(AA), %xmm1 + +#define KERNEL5(address) \ + movq (PREFETCHSIZE + 16) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 18 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 20 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL6(address) \ + 
mulpd %xmm0, %xmm2; \ + mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 22 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 32 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 32 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL7(address) \ + movq (PREFETCHSIZE + 24) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm1, %xmm3; \ + mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 26 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 28 * SIZE + (address) * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulpd %xmm1, %xmm3; \ + mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 30 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 40 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 40 * SIZE + (address) * SIZE(AA), %xmm1 + +#ifndef CONJ +#define NN +#else +#if defined(LN) || defined(LT) +#define CN +#else +#define NC +#endif +#endif + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp # align stack + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movd STACK_M, %mm0 + movl STACK_N, %eax + movd STACK_K, %mm1 + movd STACK_A, %mm2 + movl STACK_B, B + movd STACK_C, %mm3 + movl STACK_LDC, LDC + movd STACK_OFFT, %mm4 + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 # Generate mask + pxor %xmm2, %xmm2 + + movsd %xmm2, 0 + POSINV + movsd %xmm7, 8 + POSINV + + movd %mm1, K + movl %eax, N + movd %mm0, M + movd %mm2, A + movd %mm3, C + movl %esi, OLD_STACK + movd %mm4, OFFSET + movd %mm4, KK + + sall $ZBASE_SHIFT, LDC + +#ifdef LN + movl M, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + sall $ZBASE_SHIFT, %eax + imull K, %eax + addl %eax, B + + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + movl %eax, J # j = n + testl %eax, %eax + jle .L999 + ALIGN_2 + +.L01: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, BB + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $ZBASE_SHIFT, %eax + addl %eax, B + leal (BB, %eax, 2), BB +#endif + +#if defined(LT) + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + jle .L03 + ALIGN_2 + +.L02: + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm2 + movsd 3 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm4 + movsd 5 * SIZE(B), %xmm5 + movsd 6 * SIZE(B), %xmm6 + movsd 7 * SIZE(B), %xmm7 + + unpcklpd %xmm0, %xmm0 + unpcklpd %xmm1, %xmm1 + 
unpcklpd %xmm2, %xmm2 + unpcklpd %xmm3, %xmm3 + unpcklpd %xmm4, %xmm4 + unpcklpd %xmm5, %xmm5 + unpcklpd %xmm6, %xmm6 + unpcklpd %xmm7, %xmm7 + + movapd %xmm0, 0 * SIZE(BB) + movapd %xmm1, 2 * SIZE(BB) + movapd %xmm2, 4 * SIZE(BB) + movapd %xmm3, 6 * SIZE(BB) + movapd %xmm4, 8 * SIZE(BB) + movapd %xmm5, 10 * SIZE(BB) + movapd %xmm6, 12 * SIZE(BB) + movapd %xmm7, 14 * SIZE(BB) + + prefetcht0 104 * SIZE(B) + + addl $ 8 * SIZE, B + addl $16 * SIZE, BB + decl %eax + jne .L02 + ALIGN_2 + +.L03: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax + BRANCH + jle .L05 + ALIGN_2 + +.L04: + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + + unpcklpd %xmm0, %xmm0 + unpcklpd %xmm1, %xmm1 + + movapd %xmm0, 0 * SIZE(BB) + movapd %xmm1, 2 * SIZE(BB) + + addl $ 2 * SIZE, B + addl $ 4 * SIZE, BB + decl %eax + jne .L04 + ALIGN_4 + +.L05: +#if defined(LT) || defined(RN) + movl A, %eax + movl %eax, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + subl LDC, C +#endif + + movl C, CO1 + +#ifndef RT + addl LDC, C +#endif + + movl M, %ebx + testl $1, %ebx + je .L50 + +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, %ecx + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movapd 0 * SIZE(BB), %xmm1 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax # l = (k >> 2) + jle .L52 + +.L51: + mulpd %xmm0, %xmm1 + movapd 2 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm4 + movapd 16 * SIZE(BB), %xmm1 + + mulpd %xmm0, %xmm3 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm5 + movapd 4 * SIZE(BB), %xmm3 + + mulpd %xmm0, %xmm3 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + + addpd %xmm0, %xmm5 + movapd 4 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 10 * SIZE(BB), %xmm0 + + addpd %xmm2, %xmm4 + addpd %xmm0, %xmm5 + movapd 6 * SIZE(AA), %xmm0 + + movapd 12 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movapd 24 * SIZE(BB), %xmm2 + + mulpd 14 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm5 + movapd 8 * SIZE(AA), %xmm0 + + addl $ 8 * SIZE, AA # aoffset += 2 + addl $16 * SIZE, BB # boffset1 += 4 + + decl %eax # l-- + jg .L51 + ALIGN_2 + +.L52: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax # l = (k & 3) + jle .L54 + ALIGN_2 + +.L53: + movapd 0 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movapd 2 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA # aoffset += 2 + addl $4 * SIZE, BB # boffset1 += 4 + decl %eax # l-- + jg .L53 + +.L54: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ZBASE_SHIFT, %eax + addl %eax, AA + addl %eax, B + leal (BB, %eax, 2), BB +#endif + + movapd POSINV, %xmm1 + + SHUFPD_1 %xmm5, %xmm5 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm1, %xmm5 +#else + xorpd %xmm1, %xmm4 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + 
defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm5, %xmm4 +#else + addpd %xmm5, %xmm4 +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm5 + + subpd %xmm4, %xmm5 +#else + movapd 0 * SIZE(AA), %xmm5 + + subpd %xmm4, %xmm5 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm1, %xmm1 +#endif + +#ifdef LN + movsd 0 * SIZE(AA), %xmm2 + movhpd 0 * SIZE(AA), %xmm2 + movsd 1 * SIZE(AA), %xmm3 + movhpd 1 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef LT + movsd 0 * SIZE(AA), %xmm2 + movhpd 0 * SIZE(AA), %xmm2 + movsd 1 * SIZE(AA), %xmm3 + movhpd 1 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef RN + movsd 0 * SIZE(B), %xmm2 + movhpd 0 * SIZE(B), %xmm2 + movsd 1 * SIZE(B), %xmm3 + movhpd 1 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef RT + movsd 0 * SIZE(B), %xmm2 + movhpd 0 * SIZE(B), %xmm2 + movsd 1 * SIZE(B), %xmm3 + movhpd 1 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + movsd %xmm5, 0 * SIZE(CO1) + movhpd %xmm5, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm5, 0 * SIZE(B) + + movsd %xmm5, 0 * SIZE(BB) + movsd %xmm5, 1 * SIZE(BB) + movhpd %xmm5, 2 * SIZE(BB) + movhpd %xmm5, 3 * SIZE(BB) +#else + movapd %xmm5, 0 * SIZE(AA) + +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L50: + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L99 + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#ifdef LN + prefetchnta -4 * SIZE(CO1) +#else + prefetchnta 4 * SIZE(CO1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $-8, %eax + NOBRANCH + je .L12 + sall $3, %eax + +.L1X: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + cmpl $64 * 1, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 1) + KERNEL2(32 * 1) + KERNEL3(32 * 1) + KERNEL4(32 * 1) + KERNEL5(32 * 1) + KERNEL6(32 * 1) + KERNEL7(32 * 1) + KERNEL8(32 * 1) + cmpl $64 * 2, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 2) + KERNEL2(32 * 2) + KERNEL3(32 * 2) + KERNEL4(32 * 2) + KERNEL5(32 * 2) + KERNEL6(32 * 2) + KERNEL7(32 * 2) + KERNEL8(32 * 2) + cmpl $64 * 3, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 3) + KERNEL2(32 * 3) + KERNEL3(32 * 3) + KERNEL4(32 * 
3) + KERNEL5(32 * 3) + KERNEL6(32 * 3) + KERNEL7(32 * 3) + KERNEL8(32 * 3) + cmpl $64 * 4, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 4) + KERNEL2(32 * 4) + KERNEL3(32 * 4) + KERNEL4(32 * 4) + KERNEL5(32 * 4) + KERNEL6(32 * 4) + KERNEL7(32 * 4) + KERNEL8(32 * 4) + cmpl $64 * 5, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 5) + KERNEL2(32 * 5) + KERNEL3(32 * 5) + KERNEL4(32 * 5) + KERNEL5(32 * 5) + KERNEL6(32 * 5) + KERNEL7(32 * 5) + KERNEL8(32 * 5) + cmpl $64 * 6, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 6) + KERNEL2(32 * 6) + KERNEL3(32 * 6) + KERNEL4(32 * 6) + KERNEL5(32 * 6) + KERNEL6(32 * 6) + KERNEL7(32 * 6) + KERNEL8(32 * 6) + cmpl $64 * 7, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 7) + KERNEL2(32 * 7) + KERNEL3(32 * 7) + KERNEL4(32 * 7) + KERNEL5(32 * 7) + KERNEL6(32 * 7) + KERNEL7(32 * 7) + KERNEL8(32 * 7) + + addl $64 * 4 * SIZE, AA + addl $64 * 4 * SIZE, BB + subl $64 * 8, %eax + BRANCH + jg .L1X + +.L11: + leal (BB, %eax, 4), BB + leal (AA, %eax, 4), AA + +.L12: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + +.L13: + movapd 2 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movapd 0 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm1 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm5 + movapd 2 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movapd 4 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm1 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm7 + + addl $4 * SIZE, AA # aoffset += 8 + addl $4 * SIZE, BB # boffset1 += 8 + subl $1, %eax + jg .L13 + +.L14: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 2), AA + addl %eax, B + leal (BB, %eax, 2), BB +#endif + + movapd POSINV, %xmm1 + + SHUFPD_1 %xmm5, %xmm5 + SHUFPD_1 %xmm7, %xmm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm1, %xmm5 + xorpd %xmm1, %xmm7 +#else + xorpd %xmm1, %xmm4 + xorpd %xmm1, %xmm6 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 +#else + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm5 + movapd 2 * SIZE(B), %xmm7 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#else + movapd 0 * SIZE(AA), %xmm5 + movapd 2 * SIZE(AA), %xmm7 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm1, %xmm1 +#endif + +#ifdef LN + movsd 6 * SIZE(AA), %xmm2 + movhpd 6 * SIZE(AA), %xmm2 + movsd 7 * SIZE(AA), %xmm3 + movhpd 7 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm6, %xmm7 + + movsd 4 * SIZE(AA), %xmm2 + movhpd 4 * SIZE(AA), %xmm2 + movsd 5 * SIZE(AA), %xmm3 + movhpd 5 * SIZE(AA), %xmm3 + + movapd %xmm7, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm4 + mulpd %xmm3, %xmm6 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm5 + + movsd 0 * SIZE(AA), %xmm2 + movhpd 0 * SIZE(AA), %xmm2 + movsd 1 * SIZE(AA), %xmm3 + movhpd 1 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef LT + movsd 0 * SIZE(AA), %xmm2 + movhpd 0 * SIZE(AA), %xmm2 + movsd 1 * SIZE(AA), %xmm3 + 
movhpd 1 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 + + movsd 2 * SIZE(AA), %xmm2 + movhpd 2 * SIZE(AA), %xmm2 + movsd 3 * SIZE(AA), %xmm3 + movhpd 3 * SIZE(AA), %xmm3 + + movapd %xmm5, %xmm4 + pshufd $0x4e, %xmm5, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm4 + mulpd %xmm3, %xmm6 + + subpd %xmm4, %xmm7 + subpd %xmm6, %xmm7 + + movsd 6 * SIZE(AA), %xmm2 + movhpd 6 * SIZE(AA), %xmm2 + movsd 7 * SIZE(AA), %xmm3 + movhpd 7 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm6, %xmm7 +#endif + +#ifdef RN + movsd 0 * SIZE(B), %xmm2 + movhpd 0 * SIZE(B), %xmm2 + movsd 1 * SIZE(B), %xmm3 + movhpd 1 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm4 + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm4, %xmm5 + addpd %xmm6, %xmm7 +#endif + +#ifdef RT + movsd 0 * SIZE(B), %xmm2 + movhpd 0 * SIZE(B), %xmm2 + movsd 1 * SIZE(B), %xmm3 + movhpd 1 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm4 + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm4, %xmm5 + addpd %xmm6, %xmm7 +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + + movsd %xmm5, 0 * SIZE(CO1) + movhpd %xmm5, 1 * SIZE(CO1) + movsd %xmm7, 2 * SIZE(CO1) + movhpd %xmm7, 3 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm5, 0 * SIZE(B) + movapd %xmm7, 2 * SIZE(B) + + movsd %xmm5, 0 * SIZE(BB) + movsd %xmm5, 1 * SIZE(BB) + movhpd %xmm5, 2 * SIZE(BB) + movhpd %xmm5, 3 * SIZE(BB) + movsd %xmm7, 4 * SIZE(BB) + movsd %xmm7, 5 * SIZE(BB) + movhpd %xmm7, 6 * SIZE(BB) + movhpd %xmm7, 7 * SIZE(BB) +#else + movapd %xmm5, 0 * SIZE(AA) + movapd %xmm7, 2 * SIZE(AA) + +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L10 + +.L99: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + + decl J # j -- + jg .L01 + +.L999: + movl OLD_STACK, %esp + + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S b/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S new file mode 100644 index 0000000000..e5949aa6e9 --- /dev/null +++ b/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S @@ -0,0 +1,1966 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define A 24 + STACK + ARGS(%esp) +#define ARG_B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define ARG_LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) +#define AORIG 12 + STACK(%esp) + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#ifdef NEHALEM +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#ifdef ATOM +#define PREFETCH prefetcht0 +#define PREFETCHSIZE 84 +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (16 * 2) +#endif + +#define B %edi +#define LDC %ebp +#define AA %edx +#define BB %ecx +#define CO1 %esi + +#define ADD1 addps +#define ADD2 addps + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + movl OFFSET, %eax +#ifdef RN + negl %eax +#endif + movl %eax, KK + + movl M, %ebx + testl %ebx, %ebx + jle .L999 + + subl $-32 * SIZE, A + subl $-32 * SIZE, B + + sall $ZBASE_SHIFT, LDC + +#ifdef LN + movl M, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + sall $ZBASE_SHIFT, %eax + imull K, %eax + addl %eax, B + + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + movl %eax, J + sarl $1, J + jle .L100 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movl A, %eax + movl %eax, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + 
movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + andl $1, %ebx + jle .L30 + +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L42 + ALIGN_4 + +.L41: + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -30 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -28 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -26 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -24 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -12 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -22 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -8 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -20 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -4 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -18 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps 0 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -16 * SIZE(AA), 
%xmm0 + + subl $-16 * SIZE, AA + subl $-32 * SIZE, BB + decl %eax + jne .L41 + ALIGN_4 + +.L42: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L44 + ALIGN_4 + +.L43: + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -30 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L43 + ALIGN_4 + +.L44: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), BB +#endif + + addps %xmm2, %xmm6 + addps %xmm3, %xmm7 + + pshufd $0xb1, %xmm5, %xmm5 + pcmpeqb %xmm0, %xmm0 + pshufd $0xb1, %xmm7, %xmm7 + psllq $63, %xmm0 + +#ifndef CONJ + shufps $0xb1, %xmm0, %xmm0 + + pxor %xmm0, %xmm5 + pxor %xmm0, %xmm7 +#else +#if defined(LN) || defined(LT) + pxor %xmm0, %xmm4 + pxor %xmm0, %xmm6 +#else + pxor %xmm0, %xmm5 + pxor %xmm0, %xmm7 +#endif +#endif + + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + +#if defined(LN) || defined(LT) + unpcklpd %xmm6, %xmm4 + + movaps -32 * SIZE(BB), %xmm2 + + subps %xmm4, %xmm2 +#else + movsd -32 * SIZE(AA), %xmm1 + movsd -30 * SIZE(AA), %xmm5 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm5 +#endif + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 +#endif + +#ifdef RN + movaps -32 * SIZE(BB), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm2 + + subps %xmm3, %xmm5 + subps %xmm2, %xmm5 + + movaps -28 * SIZE(BB), %xmm4 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + + addps %xmm3, %xmm5 +#endif + +#ifdef RT + movaps -28 * SIZE(BB), %xmm4 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + + addps %xmm3, %xmm5 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm2 + + subps %xmm3, %xmm1 + subps %xmm2, %xmm1 + + movaps -32 * SIZE(BB), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + 
xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, -32 * SIZE(BB) + + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 0 * SIZE(CO1, LDC) +#else + movlps %xmm1, -32 * SIZE(AA) + movlps %xmm5, -30 * SIZE(AA) + + movlps %xmm1, 0 * SIZE(CO1) + movlps %xmm5, 0 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L30: + movl M, %ebx + sarl $1, %ebx + jle .L99 + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + +#ifdef LN + pxor %xmm4, %xmm4 + prefetcht0 -4 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 -4 * SIZE(CO1, LDC) + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 +#else + pxor %xmm4, %xmm4 + prefetcht0 3 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 3 * SIZE(CO1, LDC) + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L11: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -24 * SIZE(AA), %xmm0 + + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(AA), %xmm0 + + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -16 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) + + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -12 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 
+ pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -8 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -4 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + ADD2 %xmm2, %xmm7 + subl $-32 * SIZE, BB + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + subl $-32 * SIZE, AA + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -32 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -32 * SIZE(AA), %xmm0 + + decl %eax + jne .L11 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + ALIGN_4 + +.L13: + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L13 + ALIGN_4 + +.L14: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), BB +#endif + + ADD2 %xmm2, %xmm7 + pcmpeqb %xmm0, %xmm0 + ADD1 %xmm3, %xmm6 + psllq $63, %xmm0 + +#ifndef CONJ + pxor %xmm0, %xmm4 + pxor %xmm0, %xmm6 + + shufps $0xb1, %xmm0, %xmm0 +#else +#if defined(LN) || defined(LT) + pxor %xmm0, %xmm5 + pxor %xmm0, %xmm7 +#else + pshufd $0xb1, %xmm0, %xmm1 + + pxor %xmm1, %xmm5 + pxor %xmm1, %xmm7 +#endif +#endif + + haddps %xmm5, %xmm4 + haddps %xmm7, %xmm6 + + shufps $0xd8, %xmm4, %xmm4 + shufps $0xd8, %xmm6, %xmm6 + + movaps %xmm4, %xmm5 + shufps $0xe4, %xmm6, %xmm4 + shufps $0xe4, %xmm5, %xmm6 + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm5 + unpcklpd %xmm6, %xmm4 + unpckhpd %xmm6, %xmm5 + + movaps -32 * SIZE(BB), %xmm2 + movaps -28 * SIZE(BB), %xmm3 + + subps %xmm4, %xmm2 + subps %xmm5, %xmm3 +#else + movaps -32 * SIZE(AA), %xmm1 + movaps -28 * SIZE(AA), %xmm5 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm5 +#endif + +#ifdef LN + movaps -28 * SIZE(AA), %xmm5 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm0, %xmm3 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm3 + addps %xmm4, %xmm3 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm1 + subps %xmm4, %xmm2 + subps %xmm1, %xmm2 + + movaps -32 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 +#endif + +#ifdef LT + movaps -32 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + 
+#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm1 + subps %xmm4, %xmm3 + subps %xmm1, %xmm3 + + movaps -28 * SIZE(AA), %xmm5 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm0, %xmm3 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm3 + addps %xmm4, %xmm3 +#endif + +#ifdef RN + movaps -32 * SIZE(BB), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm2 + + subps %xmm3, %xmm5 + subps %xmm2, %xmm5 + + movaps -28 * SIZE(BB), %xmm4 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + + addps %xmm3, %xmm5 +#endif + +#ifdef RT + movaps -28 * SIZE(BB), %xmm4 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + + addps %xmm3, %xmm5 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm2 + + subps %xmm3, %xmm1 + subps %xmm2, %xmm1 + + movaps -32 * SIZE(BB), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, -32 * SIZE(BB) + movaps %xmm3, -28 * SIZE(BB) + + movlps %xmm2, 0 * SIZE(CO1) + movlps %xmm3, 2 * SIZE(CO1) + movhps %xmm2, 0 * SIZE(CO1, LDC) + movhps %xmm3, 2 * SIZE(CO1, LDC) +#else + movaps %xmm1, -32 * SIZE(AA) + movaps %xmm5, -28 * SIZE(AA) + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + + movlps %xmm5, 0 * SIZE(CO1, LDC) + movhps %xmm5, 2 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx + jg .L10 + ALIGN_4 + +.L99: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK 
+#endif + + decl J # j -- + jg .L01 + ALIGN_4 + +.L100: + movl N, %eax + andl $1, %eax + jle .L999 + +#if defined(LT) || defined(RN) + movl A, %eax + movl %eax, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, B +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + andl $1, %ebx + jle .L130 + +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movsd -32 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L142 + ALIGN_4 + +.L141: + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -30 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -28 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -26 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -26 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -24 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -22 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -22 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -20 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -18 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -18 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -16 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -16 * SIZE(AA), %xmm0 + + subl $-16 * SIZE, AA + subl $-16 * SIZE, BB + + decl %eax + jne .L141 + ALIGN_4 + +.L142: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L144 + ALIGN_4 + +.L143: + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -30 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L143 + ALIGN_4 + +.L144: +#if defined(LN) || defined(RT) + movl KK, %eax + subl $1, %eax + + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, 
%eax, 1), BB +#endif + + addps %xmm2, %xmm4 + addps %xmm3, %xmm5 + + pshufd $0xb1, %xmm5, %xmm5 + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#ifndef CONJ + shufps $0xb1, %xmm0, %xmm0 + + pxor %xmm0, %xmm5 +#else +#if defined(LN) || defined(LT) + pxor %xmm0, %xmm4 +#else + pxor %xmm0, %xmm5 +#endif +#endif + + addps %xmm5, %xmm4 + +#if defined(LN) || defined(LT) + movsd -32 * SIZE(BB), %xmm2 + + subps %xmm4, %xmm2 +#else + movsd -32 * SIZE(AA), %xmm1 + + subps %xmm4, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 +#endif + +#if defined(RN) || defined(RT) + movaps -32 * SIZE(BB), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, -32 * SIZE(BB) + + movlps %xmm2, 0 * SIZE(CO1) +#else + movlps %xmm1, -32 * SIZE(AA) + + movlps %xmm1, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 1), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L130: + movl M, %ebx + sarl $1, %ebx + jle .L149 + ALIGN_4 + +.L110: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movsd -32 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + movhps -30 * SIZE(BB), %xmm1 + pxor %xmm4, %xmm4 +#ifdef LN + prefetcht0 -4 * SIZE(CO1) +#else + prefetcht0 3 * SIZE(CO1) +#endif + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L112 + ALIGN_4 + +.L111: + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -28 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -24 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -20 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -16 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -12 * 
SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -8 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -4 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps 0 * SIZE(AA), %xmm0 + + subl $-32 * SIZE, AA + subl $-16 * SIZE, BB + + decl %eax + jne .L111 + ALIGN_4 + +.L112: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L114 + ALIGN_4 + +.L113: + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -28 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L113 + ALIGN_4 + +.L114: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), BB +#endif + + addps %xmm2, %xmm4 + addps %xmm3, %xmm5 + + pshufd $0xb1, %xmm5, %xmm5 + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#ifndef CONJ + shufps $0xb1, %xmm0, %xmm0 + + pxor %xmm0, %xmm5 +#else +#if defined(LN) || defined(LT) + pxor %xmm0, %xmm4 +#else + pxor %xmm0, %xmm5 +#endif +#endif + + addps %xmm5, %xmm4 + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm5 + unpcklpd %xmm6, %xmm4 + unpckhpd %xmm6, %xmm5 + + movsd -32 * SIZE(BB), %xmm2 + movsd -30 * SIZE(BB), %xmm3 + + subps %xmm4, %xmm2 + subps %xmm5, %xmm3 +#else + movaps -32 * SIZE(AA), %xmm1 + + subps %xmm4, %xmm1 +#endif + +#ifdef LN + movaps -28 * SIZE(AA), %xmm5 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm0, %xmm3 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm3 + addps %xmm4, %xmm3 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm1 + subps %xmm4, %xmm2 + subps %xmm1, %xmm2 + + movaps -32 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 +#endif + +#ifdef LT + movaps -32 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm1 + subps %xmm4, %xmm3 + subps %xmm1, %xmm3 + + movaps -28 * SIZE(AA), %xmm5 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, 
%xmm3 + +#ifndef CONJ + xorps %xmm0, %xmm3 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm3 + addps %xmm4, %xmm3 +#endif + +#if defined(RN) || defined(RT) + movaps -32 * SIZE(BB), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, -32 * SIZE(BB) + movlps %xmm3, -30 * SIZE(BB) + + movlps %xmm2, 0 * SIZE(CO1) + movlps %xmm3, 2 * SIZE(CO1) +#else + movaps %xmm1, -32 * SIZE(AA) + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 1), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L110 + ALIGN_4 + +.L149: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/ztrsm_kernel_LN_2x2_sse.S b/kernel/x86/ztrsm_kernel_LN_2x2_sse.S new file mode 100644 index 0000000000..f77a06d6cd --- /dev/null +++ b/kernel/x86/ztrsm_kernel_LN_2x2_sse.S @@ -0,0 +1,2201 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_A 24 + STACK + ARGS(%esi) +#define STACK_B 28 + STACK + ARGS(%esi) +#define STACK_C 32 + STACK + ARGS(%esi) +#define STACK_LDC 36 + STACK + ARGS(%esi) +#define STACK_OFFT 40 + STACK + ARGS(%esi) + +#define POSINV 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 48(%esp) +#define KK 52(%esp) +#define KKK 56(%esp) +#define AORIG 60(%esp) +#define BORIG 64(%esp) +#define BUFFER 128(%esp) + +#define B %edi +#define LDC %ebp +#define AA %edx +#define BB %ecx +#define CO1 %esi + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#if defined(OPTERON) || defined(BARCELONA) +#define PREFETCHSIZE (16 * 10 + 8) +#define WPREFETCHSIZE 112 +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#endif + +#if defined(PENTIUM4) || defined(PENTIUMM) +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 168 +#define PREFETCHW prefetcht0 +#endif + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 168 +#define PREFETCHW prefetcht0 +#endif + +#if defined(OPTERON) || !defined(HAVE_SSE2) +#define movsd movlps +#endif + +#ifdef HAVE_SSE2 +#define xorps pxor +#endif + +#define KERNEL1(address) \ + mulps %xmm0, %xmm2; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + addps %xmm2, %xmm4; \ + movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL4(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 60 * SIZE + 
(address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL5(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL6(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL7(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1; + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp # align stack + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movl STACK_M, %ebx + movl STACK_N, %eax + movl STACK_K, %ecx + movl STACK_A, %edx + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK + + movl STACK_B, %edi + movl STACK_C, %ebx + movss STACK_OFFT, %xmm4 + + xorps %xmm7, %xmm7 + pcmpeqb %xmm7, %xmm7 + pslld $31, %xmm7 + xorps %xmm2, %xmm2 + +#ifndef CONJ + movss %xmm7, 0 + POSINV + movss %xmm2, 4 + POSINV + movss %xmm7, 8 + POSINV + movss %xmm2, 12 + POSINV +#else + movss %xmm2, 0 + POSINV + movss %xmm7, 4 + POSINV + movss %xmm2, 8 + POSINV + movss %xmm7, 12 + POSINV +#endif + + EMMS + + movl %ebx, C + movl STACK_LDC, LDC + + movss %xmm4, OFFSET + movss %xmm4, KK + + sall $ZBASE_SHIFT, LDC + +#ifdef LN + movl M, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + sall $ZBASE_SHIFT, %eax + imull K, %eax + addl %eax, B + + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + movl %eax, J + sarl $1, J + jle .L100 + ALIGN_4 + +.L01: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal 
BUFFER, %ecx + +#ifdef RT + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $1 + ZBASE_SHIFT, %eax + addl %eax, B + leal (BB, %eax, 4), BB +#endif + +#if defined(LT) + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $1, %eax + jle .L03 + ALIGN_4 + +.L02: + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm4, 16 * SIZE(BB) + movaps %xmm5, 20 * SIZE(BB) + movaps %xmm6, 24 * SIZE(BB) + movaps %xmm7, 28 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $32 * SIZE, BB + + decl %eax + jne .L02 + ALIGN_4 + +.L03: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $1, %eax + BRANCH + jle .L05 + ALIGN_4 + +.L04: + movaps 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + + addl $ 4 * SIZE, B + ALIGN_4 + +.L05: +#if defined(LT) || defined(RN) + movl A, %eax + movl %eax, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + + movl C, CO1 + +#ifndef RT + addl %eax, C +#endif + + movl M, %ebx + andl $1, %ebx + jle .L30 + ALIGN_4 + +.L40: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB # boffset1 = boffset + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $3 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 8 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L42 + ALIGN_4 + +.L41: + mulps %xmm0, %xmm2 + prefetcht1 (PREFETCHSIZE + 0) * SIZE(AA) + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movsd 2 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + mulps 28 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 48 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movsd 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + mulps 44 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movsd 6 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + addps 
%xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + mulps 60 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 80 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movsd 16 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) +#endif + addps %xmm2, %xmm4 + movaps 68 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm5 + movaps 72 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + mulps 76 * SIZE(BB), %xmm1 + addps %xmm2, %xmm6 + movaps 96 * SIZE(BB), %xmm2 + addps %xmm1, %xmm7 + movsd 10 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 84 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movaps 88 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + mulps 92 * SIZE(BB), %xmm1 + addps %xmm3, %xmm6 + movaps 112 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movsd 12 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 100 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm5 + movaps 104 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + mulps 108 * SIZE(BB), %xmm1 + addps %xmm2, %xmm6 + movaps 128 * SIZE(BB), %xmm2 + addps %xmm1, %xmm7 + movsd 14 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 116 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movaps 120 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + mulps 124 * SIZE(BB), %xmm1 + addps %xmm3, %xmm6 + movaps 144 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movsd 24 * SIZE(AA), %xmm1 + addl $ 16 * SIZE, AA + addl $128 * SIZE, BB + decl %eax + jne .L41 + ALIGN_4 + +.L42: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L44 + ALIGN_4 + +.L43: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movsd 2 * SIZE(AA), %xmm0 + + addl $ 2 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L43 + ALIGN_4 + +.L44: + movaps POSINV, %xmm0 + + shufps $0xb1, %xmm5, %xmm5 + shufps $0xb1, %xmm7, %xmm7 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm0, %xmm5 + xorps %xmm0, %xmm7 +#else + xorps %xmm0, %xmm4 + xorps %xmm0, %xmm6 +#endif +#else + xorps %xmm0, %xmm5 + xorps %xmm0, %xmm7 +#endif + + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + unpcklpd %xmm6, %xmm4 + + movaps 0 * SIZE(B), %xmm2 + + subps %xmm4, %xmm2 +#else +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(AA), %xmm1 +#ifdef movsd + xorps %xmm5, %xmm5 +#endif + movsd 2 * SIZE(AA), %xmm5 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm5 +#endif + +#if defined(LN) || defined(LT) + movaps 0 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + 
pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm2 + + subps %xmm3, %xmm5 + subps %xmm2, %xmm5 + + movaps 4 * SIZE(B), %xmm4 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + + addps %xmm3, %xmm5 +#endif + +#ifdef RT + movaps 4 * SIZE(B), %xmm4 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + + addps %xmm3, %xmm5 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm2 + + subps %xmm3, %xmm1 + subps %xmm2, %xmm1 + + movaps 0 * SIZE(B), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, 0 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm5 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm5, 12 * SIZE(BB) + + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 0 * SIZE(CO1, LDC) +#else + movlps %xmm1, 0 * SIZE(AA) + movlps %xmm5, 2 * SIZE(AA) + + movlps %xmm1, 0 * SIZE(CO1) + movlps %xmm5, 0 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L30: + movl M, %ebx + sarl $1, %ebx + jle .L99 + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB # boffset1 = boffset + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $3 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 + movaps 16 * SIZE(AA), %xmm1 + xorps %xmm5, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm7, %xmm7 + + PREFETCHW -4 * SIZE(CO1) + PREFETCHW -4 * SIZE(CO1, LDC) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L11: + KERNEL1(0 * 16) + KERNEL2(0 * 16) + KERNEL3(0 * 16) + KERNEL4(0 * 16) + KERNEL5(0 * 16) + KERNEL6(0 * 16) + KERNEL7(0 * 16) + KERNEL8(0 * 16) 
+ + addl $ 32 * SIZE, AA + addl $128 * SIZE, BB + decl %eax + jne .L11 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + ALIGN_4 + +.L13: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 4 * SIZE(AA), %xmm0 + + addl $ 4 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L13 + ALIGN_4 + +.L14: + movaps POSINV, %xmm0 + + shufps $0xb1, %xmm5, %xmm5 + shufps $0xb1, %xmm7, %xmm7 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm0, %xmm5 + xorps %xmm0, %xmm7 +#else + xorps %xmm0, %xmm4 + xorps %xmm0, %xmm6 +#endif +#else + xorps %xmm0, %xmm5 + xorps %xmm0, %xmm7 +#endif + + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm5 + unpcklpd %xmm6, %xmm4 + unpckhpd %xmm6, %xmm5 + + movaps 0 * SIZE(B), %xmm2 + movaps 4 * SIZE(B), %xmm3 + + subps %xmm4, %xmm2 + subps %xmm5, %xmm3 +#else + movaps 0 * SIZE(AA), %xmm1 + movaps 4 * SIZE(AA), %xmm5 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm5 +#endif + +#ifdef LN + movaps 4 * SIZE(AA), %xmm5 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm0, %xmm3 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm3 + addps %xmm4, %xmm3 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm1 + subps %xmm4, %xmm2 + subps %xmm1, %xmm2 + + movaps 0 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 +#endif + +#ifdef LT + movaps 0 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm1 + subps %xmm4, %xmm3 + subps %xmm1, %xmm3 + + movaps 4 * SIZE(AA), %xmm5 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm0, %xmm3 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm3 + addps %xmm4, %xmm3 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps 
%xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm2 + + subps %xmm3, %xmm5 + subps %xmm2, %xmm5 + + movaps 4 * SIZE(B), %xmm4 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + + addps %xmm3, %xmm5 +#endif + +#ifdef RT + movaps 4 * SIZE(B), %xmm4 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + + addps %xmm3, %xmm5 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm2 + + subps %xmm3, %xmm1 + subps %xmm2, %xmm1 + + movaps 0 * SIZE(B), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, 0 * SIZE(B) + movaps %xmm3, 4 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm5 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm5, 12 * SIZE(BB) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm4 + pshufd $0xff, %xmm3, %xmm5 + + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm1, 20 * SIZE(BB) + movaps %xmm4, 24 * SIZE(BB) + movaps %xmm5, 28 * SIZE(BB) + + movlps %xmm2, 0 * SIZE(CO1) + movlps %xmm3, 2 * SIZE(CO1) + movhps %xmm2, 0 * SIZE(CO1, LDC) + movhps %xmm3, 2 * SIZE(CO1, LDC) +#else + movaps %xmm1, 0 * SIZE(AA) + movaps %xmm5, 4 * SIZE(AA) + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + + movlps %xmm5, 0 * SIZE(CO1, LDC) + movhps %xmm5, 2 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $8 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx + jg .L10 + ALIGN_4 + +.L99: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_4 + +.L100: + movl N, %eax + andl $1, %eax + jle .L999 + ALIGN_4 + +.L101: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, %ecx + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $ZBASE_SHIFT, %eax + addl %eax, B + 
leal (BB, %eax, 4), BB +#endif + +#if defined(LT) + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + jle .L103 + ALIGN_4 + +.L102: + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm4, 16 * SIZE(BB) + movaps %xmm5, 20 * SIZE(BB) + movaps %xmm6, 24 * SIZE(BB) + movaps %xmm7, 28 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $32 * SIZE, BB + decl %eax + jne .L102 + ALIGN_4 + +.L103: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax + BRANCH + jle .L105 + ALIGN_4 + +.L104: +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + + addl $ 2 * SIZE, %edi + addl $ 8 * SIZE, %ecx + decl %eax + jne .L104 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movl A, %eax + movl %eax, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + subl LDC, C +#endif + + movl C, CO1 + +#ifndef RT + addl LDC, C +#endif + + movl M, %ebx + andl $1, %ebx + jle .L130 + +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB # boffset1 = boffset + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 8 * SIZE(AA), %xmm1 + xorps %xmm5, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L142 + ALIGN_4 + +.L141: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movaps 44 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 64 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm1, 
%xmm3 + addps %xmm3, %xmm6 + movaps 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 80 * SIZE(BB), %xmm3 + + addl $ 16 * SIZE, AA + addl $ 64 * SIZE, BB + decl %eax + jne .L141 + ALIGN_4 + +.L142: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L144 + ALIGN_4 + +.L143: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movsd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L143 + ALIGN_4 + +.L144: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + movaps POSINV, %xmm0 + + shufps $0xb1, %xmm5, %xmm5 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm4 +#endif +#else + xorps %xmm0, %xmm5 +#endif + + addps %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax + subl $1, %eax + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ZBASE_SHIFT, %eax + addl %eax, AA + addl %eax, B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd 0 * SIZE(B), %xmm2 + + subps %xmm4, %xmm2 +#else +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(AA), %xmm1 + + subps %xmm4, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movaps 0 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 +#endif + +#if defined(RN) || defined(RT) + movaps 0 * SIZE(B), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + + movlps %xmm2, 0 * SIZE(CO1) +#else + movlps %xmm1, 0 * SIZE(AA) + + movlps %xmm1, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L130: + movl M, %ebx + sarl $1, %ebx + jle .L149 + ALIGN_4 + +.L110: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB # boffset1 = boffset + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movaps 0 * SIZE(AA), %xmm0 + movaps 16 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + + PREFETCHW -4 * SIZE(CO1) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L112 + ALIGN_4 + 
+.L111: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 8 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movaps 12 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movaps 32 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movaps 20 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movaps 44 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movaps 24 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 64 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 28 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 48 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 80 * SIZE(BB), %xmm3 + + addl $ 32 * SIZE, AA + addl $ 64 * SIZE, BB + decl %eax + jne .L111 + ALIGN_4 + +.L112: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L114 + ALIGN_4 + +.L113: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + + addl $ 4 * SIZE, AA + addl $ 8 * SIZE, BB + decl %eax + jg .L113 + ALIGN_4 + +.L114: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + movaps POSINV, %xmm0 + + shufps $0xb1, %xmm5, %xmm5 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm4 +#endif +#else + xorps %xmm0, %xmm5 +#endif + + addps %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm5 + unpcklpd %xmm6, %xmm4 + unpckhpd %xmm6, %xmm5 + +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd 0 * SIZE(B), %xmm2 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd 2 * SIZE(B), %xmm3 + + subps %xmm4, %xmm2 + subps %xmm5, %xmm3 +#else + movaps 0 * SIZE(AA), %xmm1 + + subps %xmm4, %xmm1 +#endif + +#ifdef LN + movaps 4 * SIZE(AA), %xmm5 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm0, %xmm3 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm3 + addps %xmm4, %xmm3 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm1 + subps %xmm4, %xmm2 + subps %xmm1, %xmm2 + + movaps 0 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + 
xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 +#endif + +#ifdef LT + movaps 0 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm1 + subps %xmm4, %xmm3 + subps %xmm1, %xmm3 + + movaps 4 * SIZE(AA), %xmm5 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm0, %xmm3 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm3 + addps %xmm4, %xmm3 +#endif + +#if defined(RN) || defined(RT) + movaps 0 * SIZE(B), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(B) + movlps %xmm3, 2 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + + movaps %xmm0, 8 * SIZE(BB) + movaps %xmm1, 12 * SIZE(BB) + + movlps %xmm2, 0 * SIZE(CO1) + movlps %xmm3, 2 * SIZE(CO1) +#else + movaps %xmm1, 0 * SIZE(AA) + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L110 + ALIGN_4 + +.L149: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L999: + EMMS + + movl OLD_STACK, %esp + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/ztrsm_kernel_LN_4x1_sse.S b/kernel/x86/ztrsm_kernel_LN_4x1_sse.S new file mode 100644 index 0000000000..877a3ba4f1 --- /dev/null +++ b/kernel/x86/ztrsm_kernel_LN_4x1_sse.S @@ -0,0 +1,1893 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if !defined(HAVE_SSE) || !defined(HAVE_MMX) +#error You have to check your configuration. +#endif + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_A 24 + STACK + ARGS(%esi) +#define STACK_B 28 + STACK + ARGS(%esi) +#define STACK_C 32 + STACK + ARGS(%esi) +#define STACK_LDC 36 + STACK + ARGS(%esi) +#define STACK_OFFT 40 + STACK + ARGS(%esi) + +#define POSINV 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 48(%esp) +#define KK 52(%esp) +#define KKK 56(%esp) +#define AORIG 60(%esp) +#define BORIG 64(%esp) +#define BUFFER 128(%esp) + +#define B %edi +#define LDC %ebp +#define AA %edx +#define BB %ecx +#define CO1 %esi + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#if !defined(HAVE_SSE2) || defined(OPTERON) +#define movsd movlps +#endif + +#ifdef HAVE_SSE2 +#define xorps pxor +#endif + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp # align stack + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movl STACK_M, %ebx + movl STACK_N, %eax + movl STACK_K, %ecx + movl STACK_A, %edx + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK + + movl STACK_B, %edi + movl STACK_C, %ebx + movss STACK_OFFT, %xmm4 + +#ifndef CONJ + movl $0x80000000, 0 + POSINV + movl $0x00000000, 4 + POSINV + movl $0x80000000, 8 + POSINV + movl $0x00000000, 12 + POSINV +#else + movl $0x00000000, 0 + POSINV + movl $0x80000000, 4 + POSINV + movl $0x00000000, 8 + POSINV + movl $0x80000000, 12 + POSINV +#endif + + movl %ebx, C + movl STACK_LDC, LDC + + movss %xmm4, OFFSET + movss %xmm4, KK + + sall $ZBASE_SHIFT, LDC + +#ifdef LN + movl M, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax 
+ sall $ZBASE_SHIFT, %eax + imull K, %eax + addl %eax, B + + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + movl %eax, J # j = n + testl %eax, %eax + jle .L999 + +.L01: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, BB + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $ZBASE_SHIFT, %eax + addl %eax, B + leal (BB, %eax, 4), BB +#endif + +#if defined(LT) + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + jle .L03 + +.L02: + movss 0 * SIZE(B), %xmm0 + movss 1 * SIZE(B), %xmm1 + movss 2 * SIZE(B), %xmm2 + movss 3 * SIZE(B), %xmm3 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + + movss 4 * SIZE(B), %xmm0 + movss 5 * SIZE(B), %xmm1 + movss 6 * SIZE(B), %xmm2 + movss 7 * SIZE(B), %xmm3 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm1, 20 * SIZE(BB) + movaps %xmm2, 24 * SIZE(BB) + movaps %xmm3, 28 * SIZE(BB) + + prefetcht0 104 * SIZE(B) + + addl $ 8 * SIZE, B + addl $32 * SIZE, BB + decl %eax + jne .L02 + +.L03: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax + BRANCH + jle .L05 + +.L04: + movss 0 * SIZE(B), %xmm0 + movss 1 * SIZE(B), %xmm1 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + + addl $2 * SIZE, B + addl $8 * SIZE, BB + decl %eax + jne .L04 + ALIGN_4 + +.L05: +#if defined(LT) || defined(RN) + movl A, %eax + movl %eax, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + + movl M, %ebx + testl $1, %ebx + jle .L50 + +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 8 * SIZE(BB), %xmm3 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 8 * SIZE(AA), %xmm1 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L72 + ALIGN_4 + +.L71: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 16 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 12 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 20 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 
* SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 40 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 48 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 44 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 52 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 64 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 72 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L71 + ALIGN_2 + +.L72: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + je .L74 + +.L73: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA # aoffset += 8 + addl $8 * SIZE, BB # boffset1 += 8 + decl %eax + jg .L73 + +.L74: + movaps POSINV, %xmm0 + + shufps $0xb1, %xmm5, %xmm5 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm4 +#endif +#else + xorps %xmm0, %xmm5 +#endif + + addps %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax + subl $1, %eax + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#ifdef movsd + xorps %xmm5, %xmm5 +#endif +#if defined(LN) || defined(LT) + movsd 0 * SIZE(B), %xmm5 +#else + movsd 0 * SIZE(AA), %xmm5 +#endif + + subps %xmm4, %xmm5 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif +#if defined(LN) || defined(LT) + movsd 0 * SIZE(AA), %xmm1 +#else + movsd 0 * SIZE(B), %xmm1 +#endif + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm5, %xmm3 + shufps $0xa0, %xmm3, %xmm3 + shufps $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps POSINV, %xmm5 +#else + xorps POSINV, %xmm3 +#endif + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm5 + + addps %xmm3, %xmm5 + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm5, 0 * SIZE(B) + + movaps %xmm5, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm5, %xmm1 + shufps $0x55, %xmm1, %xmm1 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) +#else + movlps %xmm5, 0 * SIZE(AA) +#endif + + movlps %xmm5, 0 * SIZE(CO1) + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L50: + movl M, %ebx + testl $2, %ebx + jle .L70 + +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, 
%xmm5 + movaps 8 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L52 + ALIGN_4 + +.L51: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movaps 16 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm2 + mulps 20 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 12 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 28 * SIZE(BB), %xmm1 + addps %xmm3, %xmm6 + movaps 40 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 24 * SIZE(AA), %xmm1 + mulps %xmm0, %xmm2 + mulps 36 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 48 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 20 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps 44 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 56 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm2 + mulps 52 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 28 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + addps %xmm3, %xmm6 + movaps 72 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 40 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L51 + ALIGN_4 + +.L52: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L54 + +.L53: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA # aoffset += 8 + addl $8 * SIZE, BB # boffset1 += 8 + decl %eax + jg .L53 + +.L54: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + movaps POSINV, %xmm0 + + shufps $0xb1, %xmm5, %xmm5 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm4 +#endif +#else + xorps %xmm0, %xmm5 +#endif + + addps %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(B), %xmm5 + movhps 2 * SIZE(B), %xmm5 +#else + movaps 0 * SIZE(AA), %xmm5 +#endif + + subps %xmm4, %xmm5 + +#if defined(LN) || defined(LT) + movhlps %xmm5, %xmm4 +#endif + +#ifdef LN +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 6 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm4, %xmm3 + shufps $0xa0, %xmm3, %xmm3 + shufps $0xf5, %xmm4, %xmm4 + +#ifndef CONJ + xorps POSINV, %xmm4 +#else + xorps POSINV, %xmm3 +#endif + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm4 + + addps %xmm3, %xmm4 + + movsd 4 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm4, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm4, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm5 + subps %xmm3, %xmm5 + + + movsd 0 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, 
%xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm5, %xmm3 + shufps $0xa0, %xmm3, %xmm3 + shufps $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps POSINV, %xmm5 +#else + xorps POSINV, %xmm3 +#endif + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm5 + + addps %xmm3, %xmm5 +#endif + +#ifdef LT +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm5, %xmm3 + shufps $0xa0, %xmm3, %xmm3 + shufps $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps POSINV, %xmm5 +#else + xorps POSINV, %xmm3 +#endif + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm5 + + addps %xmm3, %xmm5 + + movsd 2 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm5, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm5, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm4 + subps %xmm3, %xmm4 + + movsd 6 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm4, %xmm3 + shufps $0xa0, %xmm3, %xmm3 + shufps $0xf5, %xmm4, %xmm4 + +#ifndef CONJ + xorps POSINV, %xmm4 +#else + xorps POSINV, %xmm3 +#endif + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm4 + + addps %xmm3, %xmm4 +#endif + +#if defined(RN) || defined(RT) + movsd 0 * SIZE(B), %xmm1 + movhps 2 * SIZE(B), %xmm1 + + movaps %xmm1, %xmm2 + shufps $0x44, %xmm2, %xmm2 + movaps %xmm1, %xmm3 + shufps $0x11, %xmm2, %xmm3 + + movaps %xmm5, %xmm4 + shufps $0xa0, %xmm4, %xmm4 + shufps $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm2, %xmm4 + mulps %xmm3, %xmm5 + + addps %xmm4, %xmm5 +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlhps %xmm4, %xmm5 + + movsd %xmm5, 0 * SIZE(B) + movhps %xmm5, 2 * SIZE(B) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + pshufd $0xaa, %xmm5, %xmm2 + pshufd $0xff, %xmm5, %xmm3 +#else + movaps %xmm5, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm5, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm5, %xmm2 + shufps $0xaa, %xmm2, %xmm2 + movaps %xmm5, %xmm3 + shufps $0xff, %xmm3, %xmm3 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) +#else + movaps %xmm5, 0 * SIZE(AA) +#endif + + movsd %xmm5, 0 * SIZE(CO1) + movhps %xmm5, 2 * SIZE(CO1) + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L70: + movl M, %ebx + sarl $2, %ebx + jle .L99 + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $2 + ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $2 + ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 8 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps 
%xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + prefetcht0 8 * SIZE(CO1) + je .L12 + ALIGN_4 + +#define PREFETCHSIZE 48 + +.L11: +#ifdef CORE_KATMAI + prefetcht0 PREFETCHSIZE * SIZE(AA) +#endif + + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 0 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 16 * SIZE(AA), %xmm0 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) +#endif + + mulps %xmm1, %xmm3 + mulps 12 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 8 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 12 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 12 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 24 * SIZE(AA), %xmm1 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) +#endif + + mulps %xmm0, %xmm2 + mulps 20 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 16 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 20 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 20 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 24) * SIZE(AA) +#endif + + mulps %xmm1, %xmm3 + mulps 28 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 24 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 28 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 28 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 40 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 40 * SIZE(AA), %xmm1 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 32) * SIZE(AA) +#endif + + mulps %xmm0, %xmm2 + mulps 36 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 32 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 36 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 36 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 48 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 48 * SIZE(AA), %xmm0 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 40) * SIZE(AA) +#endif + + mulps %xmm1, %xmm3 + mulps 44 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 40 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 44 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 44 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 56 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 56 * SIZE(AA), %xmm1 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 48) * SIZE(AA) +#endif + + mulps %xmm0, %xmm2 + mulps 52 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 48 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 52 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 52 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 64 * SIZE(AA), %xmm0 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 56) * SIZE(AA) +#endif + + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 56 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 60 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 72 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 72 * SIZE(AA), %xmm1 + + addl $64 * SIZE, BB + addl $64 * SIZE, AA + decl %eax + jne .L11 + +.L12: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + +.L13: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps 
%xmm2, %xmm4 + movaps 0 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 8 * SIZE(AA), %xmm0 + + addl $8 * SIZE, AA # aoffset += 8 + addl $8 * SIZE, BB # boffset1 += 8 + + decl %eax + jg .L13 + +.L14: + movaps POSINV, %xmm0 + + shufps $0xb1, %xmm5, %xmm5 + shufps $0xb1, %xmm7, %xmm7 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm0, %xmm5 + xorps %xmm0, %xmm7 +#else + xorps %xmm0, %xmm4 + xorps %xmm0, %xmm6 +#endif +#else + xorps %xmm0, %xmm5 + xorps %xmm0, %xmm7 +#endif + + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(B), %xmm5 + movhps 2 * SIZE(B), %xmm5 + movsd 4 * SIZE(B), %xmm7 + movhps 6 * SIZE(B), %xmm7 +#else + movaps 0 * SIZE(AA), %xmm5 + movaps 4 * SIZE(AA), %xmm7 +#endif + + subps %xmm4, %xmm5 + subps %xmm6, %xmm7 + +#if defined(LN) || defined(LT) + movhlps %xmm5, %xmm4 + movhlps %xmm7, %xmm6 +#endif + +#ifdef LN +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 30 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm6, %xmm3 + shufps $0xa0, %xmm3, %xmm3 + shufps $0xf5, %xmm6, %xmm6 + +#ifndef CONJ + xorps POSINV, %xmm6 +#else + xorps POSINV, %xmm3 +#endif + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm6 + + addps %xmm3, %xmm6 + + movsd 28 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm6, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm6, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm7 + subps %xmm3, %xmm7 + + movsd 26 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm6, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm6, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm4 + subps %xmm3, %xmm4 + + movsd 24 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm6, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm6, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm5 + subps %xmm3, %xmm5 + + movsd 20 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm7, %xmm3 + shufps $0xa0, %xmm3, %xmm3 + shufps $0xf5, %xmm7, %xmm7 + +#ifndef CONJ + xorps POSINV, %xmm7 +#else + xorps POSINV, %xmm3 +#endif + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm7 + + addps %xmm3, %xmm7 + + movsd 18 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm7, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm7, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm4 + subps %xmm3, %xmm4 + + 
movsd 16 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm7, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm7, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm5 + subps %xmm3, %xmm5 + + movsd 10 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm4, %xmm3 + shufps $0xa0, %xmm3, %xmm3 + shufps $0xf5, %xmm4, %xmm4 + +#ifndef CONJ + xorps POSINV, %xmm4 +#else + xorps POSINV, %xmm3 +#endif + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm4 + + addps %xmm3, %xmm4 + + movsd 8 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm4, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm4, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm5 + subps %xmm3, %xmm5 + + movsd 0 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm5, %xmm3 + shufps $0xa0, %xmm3, %xmm3 + shufps $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps POSINV, %xmm5 +#else + xorps POSINV, %xmm3 +#endif + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm5 + + addps %xmm3, %xmm5 +#endif + +#ifdef LT + movsd 0 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm5, %xmm3 + shufps $0xa0, %xmm3, %xmm3 + shufps $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps POSINV, %xmm5 +#else + xorps POSINV, %xmm3 +#endif + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm5 + + addps %xmm3, %xmm5 + + movsd 2 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm5, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm5, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm4 + subps %xmm3, %xmm4 + + movsd 4 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm5, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm5, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm7 + subps %xmm3, %xmm7 + + movsd 6 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm5, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm5, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm6 + subps %xmm3, %xmm6 + + movsd 10 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm4, %xmm3 + shufps $0xa0, %xmm3, %xmm3 + shufps $0xf5, %xmm4, %xmm4 + +#ifndef CONJ + xorps POSINV, %xmm4 +#else + xorps POSINV, %xmm3 +#endif + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm4 + + addps %xmm3, %xmm4 + + movsd 12 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm4, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm4, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + 
mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm7 + subps %xmm3, %xmm7 + + movsd 14 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm4, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm4, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm6 + subps %xmm3, %xmm6 + + movsd 20 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm7, %xmm3 + shufps $0xa0, %xmm3, %xmm3 + shufps $0xf5, %xmm7, %xmm7 + +#ifndef CONJ + xorps POSINV, %xmm7 +#else + xorps POSINV, %xmm3 +#endif + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm7 + + addps %xmm3, %xmm7 + + movsd 22 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm7, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm7, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm6 + subps %xmm3, %xmm6 + + movsd 30 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm6, %xmm3 + shufps $0xa0, %xmm3, %xmm3 + shufps $0xf5, %xmm6, %xmm6 + +#ifndef CONJ + xorps POSINV, %xmm6 +#else + xorps POSINV, %xmm3 +#endif + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm6 + + addps %xmm3, %xmm6 +#endif + +#if defined(RN) || defined(RT) + movsd 0 * SIZE(B), %xmm1 + movhps 2 * SIZE(B), %xmm1 + +#ifdef HAVE_SSE2 + pshufd $0x44, %xmm1, %xmm2 + pshufd $0x11, %xmm1, %xmm3 + + pshufd $0xa0, %xmm5, %xmm4 + pshufd $0xf5, %xmm5, %xmm5 + pshufd $0xa0, %xmm7, %xmm6 + pshufd $0xf5, %xmm7, %xmm7 +#else + movaps %xmm1, %xmm2 + shufps $0x44, %xmm2, %xmm2 + movaps %xmm1, %xmm3 + shufps $0x11, %xmm3, %xmm3 + + movaps %xmm5, %xmm4 + shufps $0xa0, %xmm4, %xmm4 + shufps $0xf5, %xmm5, %xmm5 + movaps %xmm7, %xmm6 + shufps $0xa0, %xmm6, %xmm6 + shufps $0xf5, %xmm7, %xmm7 +#endif + +#ifndef CONJ + xorps %xmm0, %xmm5 + xorps %xmm0, %xmm7 +#else + xorps %xmm0, %xmm4 + xorps %xmm0, %xmm6 +#endif + + mulps %xmm2, %xmm4 + mulps %xmm3, %xmm5 + mulps %xmm2, %xmm6 + mulps %xmm3, %xmm7 + + addps %xmm4, %xmm5 + addps %xmm6, %xmm7 +#endif + +#ifdef LN + subl $8 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlhps %xmm4, %xmm5 + movlhps %xmm6, %xmm7 + + movsd %xmm5, 0 * SIZE(B) + movhps %xmm5, 2 * SIZE(B) + movsd %xmm7, 4 * SIZE(B) + movhps %xmm7, 6 * SIZE(B) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + pshufd $0xaa, %xmm5, %xmm2 + pshufd $0xff, %xmm5, %xmm3 +#else + movaps %xmm5, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm5, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm5, %xmm2 + shufps $0xaa, %xmm2, %xmm2 + movaps %xmm5, %xmm3 + shufps $0xff, %xmm3, %xmm3 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm7, %xmm0 + pshufd $0x55, %xmm7, %xmm1 + pshufd $0xaa, %xmm7, %xmm2 + pshufd $0xff, %xmm7, %xmm3 +#else + movaps %xmm7, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm7, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm7, %xmm2 + shufps $0xaa, %xmm2, %xmm2 + movaps %xmm7, %xmm3 + shufps $0xff, %xmm3, %xmm3 +#endif + + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm1, 20 * SIZE(BB) + movaps %xmm2, 24 * SIZE(BB) + movaps %xmm3, 28 * SIZE(BB) +#else + movaps %xmm5, 0 
* SIZE(AA) + movaps %xmm7, 4 * SIZE(AA) +#endif + + movlps %xmm5, 0 * SIZE(CO1) + movhps %xmm5, 2 * SIZE(CO1) + movlps %xmm7, 4 * SIZE(CO1) + movhps %xmm7, 6 * SIZE(CO1) + +#ifndef LN + addl $8 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $2 + ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $8 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L10 + ALIGN_2 + +.L99: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_2 + +.L999: + movl OLD_STACK, %esp + + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/ztrsm_kernel_LT_1x1.S b/kernel/x86/ztrsm_kernel_LT_1x1.S new file mode 100644 index 0000000000..5b13a54b88 --- /dev/null +++ b/kernel/x86/ztrsm_kernel_LT_1x1.S @@ -0,0 +1,493 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define KK 0 + STACK(%esp) +#define KKK 4 + STACK(%esp) +#define AORIG 8 + STACK(%esp) + +#define STACK_M 4 + STACK + ARGS(%esp) +#define STACK_N 8 + STACK + ARGS(%esp) +#define STACK_K 12 + STACK + ARGS(%esp) +#ifdef DOUBLE +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 24 + STACK + ARGS(%esp) +#define STACK_A 32 + STACK + ARGS(%esp) +#define STACK_B 36 + STACK + ARGS(%esp) +#define STACK_C 40 + STACK + ARGS(%esp) +#define STACK_LDC 44 + STACK + ARGS(%esp) +#define OFFSET 48 + STACK + ARGS(%esp) +#else +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 20 + STACK + ARGS(%esp) +#define STACK_A 24 + STACK + ARGS(%esp) +#define STACK_B 28 + STACK + ARGS(%esp) +#define STACK_C 32 + STACK + ARGS(%esp) +#define STACK_LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) +#endif + + PROLOGUE + + subl $ARGS, %esp + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#define M %esi +#define K %edi + +#define A %ebx +#define B %ecx +#define C %edx +#define LDC %ebp + + movl STACK_K, K + movl STACK_LDC, LDC + sall $ZBASE_SHIFT, LDC + +#ifdef LN + movl STACK_M, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, STACK_C + imull K, %eax + addl %eax, STACK_A +#endif + +#ifdef RT + movl STACK_N, %eax + sall $ZBASE_SHIFT, %eax + imull K, %eax + addl %eax, STACK_B + + movl STACK_N, %eax + imull LDC, %eax + addl %eax, STACK_C +#endif + +#ifdef RN + movl OFFSET, %eax + negl %eax + movl %eax, KK +#endif + +#ifdef RT + movl STACK_N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + cmpl $0, STACK_N + jle .L29 + cmpl $0, STACK_M + jle .L29 + ALIGN_4 + +.L30: +#if defined(LT) || defined(RN) + movl STACK_A, A +#else + movl STACK_A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, STACK_B +#endif + +#ifdef RT + subl LDC, STACK_C +#endif + movl STACK_C, C +#ifndef RT + addl LDC, STACK_C +#endif + + movl STACK_M, M + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + ALIGN_4 + +.L34: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $ZBASE_SHIFT, %eax + movl AORIG, A + movl STACK_B, B + addl %eax, A + addl %eax, B +#else + movl STACK_B, B +#endif + + fldz + fldz + fldz + fldz + + FLD 4 * SIZE(B) # B5 + FLD 4 * SIZE(A) # A5 + FLD 0 * SIZE(B) # B0 + FLD 0 * SIZE(A) # A0 + +#ifdef HAVE_SSE + prefetcht2 2 * SIZE(C) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L37 + ALIGN_4 + +#define PREFETCH_OFFSET 40 + +.L38: +#ifdef HAVE_SSE + prefetchnta (PREFETCH_OFFSET) * SIZE(B) +#if (L2_SIZE == 524288) + prefetcht0 (PREFETCH_OFFSET) * SIZE(A) +#endif +#endif + fmul %st, %st(1) + FMUL 1 * SIZE(B) + fxch %st(1) + faddp %st, %st(5) + FLD 0 * SIZE(B) + fxch %st(1) + faddp %st, %st(4) + FLD 1 * SIZE(A) + fmul %st, %st(1) + FMUL 1 * SIZE(B) + fxch %st(1) + faddp %st, %st(7) + FLD 2 * SIZE(B) + fxch %st(1) + faddp %st, %st(6) + FLD 2 * SIZE(A) + + fmul %st, %st(1) + FMUL 3 * SIZE(B) + fxch %st(1) + faddp %st, %st(5) + FLD 2 * SIZE(B) + fxch %st(1) + faddp %st, %st(4) + FLD 3 * SIZE(A) + fmul %st, %st(1) + FMUL 3 * SIZE(B) + fxch %st(1) + faddp %st, %st(7) + FLD 8 * SIZE(B) + fxch %st(1) + faddp %st, %st(6) + FLD 8 * SIZE(A) + fxch %st(2) + +#ifdef 
HAVE_SSE +#ifdef DOUBLE + prefetchnta (PREFETCH_OFFSET + 4) * SIZE(B) +#if (L2_SIZE == 524288) + prefetcht0 (PREFETCH_OFFSET + 4) * SIZE(A) +#endif +#endif +#endif + + fmul %st, %st(3) + FMUL 5 * SIZE(B) + fxch %st(3) + faddp %st, %st(5) + FLD 4 * SIZE(B) + fxch %st(3) + faddp %st, %st(4) + FLD 5 * SIZE(A) + fmul %st, %st(3) + FMUL 5 * SIZE(B) + fxch %st(3) + faddp %st, %st(7) + FLD 6 * SIZE(B) + fxch %st(3) + faddp %st, %st(6) + FLD 6 * SIZE(A) + + fmul %st, %st(3) + FMUL 7 * SIZE(B) + fxch %st(3) + faddp %st, %st(5) + FLD 6 * SIZE(B) + fxch %st(3) + faddp %st, %st(4) + FLD 7 * SIZE(A) + fmul %st, %st(3) + FMUL 7 * SIZE(B) + fxch %st(3) + faddp %st, %st(7) + FLD 12 * SIZE(B) + fxch %st(3) + faddp %st, %st(6) + FLD 12 * SIZE(A) + fxch %st(2) + + subl $-8 * SIZE, B + subl $-8 * SIZE, A + decl %eax + jg .L38 + ALIGN_4 + +.L37: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax + jle .L43 + ALIGN_2 + +.L54: + fmul %st, %st(1) + FMUL 1 * SIZE(B) + fxch %st(1) + faddp %st, %st(5) + + FLD 0 * SIZE(B) + fxch %st(1) + faddp %st, %st(4) + + FLD 1 * SIZE(A) + fmul %st, %st(1) + FMUL 1 * SIZE(B) + fxch %st(1) + faddp %st, %st(7) + FLD 2 * SIZE(B) + fxch %st(1) + faddp %st, %st(6) + FLD 2 * SIZE(A) + + addl $2 * SIZE, A + addl $2 * SIZE, B + decl %eax + jg .L54 + ALIGN_3 + +.L43: + ffreep %st(0) + ffreep %st(0) + ffreep %st(0) + ffreep %st(0) + +#if defined(LN) || defined(LT) +#ifndef CONJ + faddp %st, %st(3) # ctemp3 += ctemp4 + fsubp %st, %st(1) # ctemp1 += ctemp2 +#else + fsubp %st, %st(3) # ctemp1 += ctemp2 + faddp %st, %st(1) # ctemp3 += ctemp4 +#endif +#endif + +#if defined(RN) || defined(RT) +#ifndef CONJ + faddp %st, %st(3) # ctemp3 += ctemp4 + fsubp %st, %st(1) # ctemp1 += ctemp2 +#else + fsubrp %st, %st(3) # ctemp1 += ctemp2 + faddp %st, %st(1) # ctemp3 += ctemp4 +#endif +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + subl $1, %eax + sall $ZBASE_SHIFT, %eax + + movl AORIG, A + movl STACK_B, B + addl %eax, A + addl %eax, B +#endif + +#if defined(LN) || defined(LT) + FLD 0 * SIZE(B) + fsubp %st, %st(1) + FLD 1 * SIZE(B) + fsubp %st, %st(2) +#else + FLD 0 * SIZE(A) + fsubp %st, %st(1) + FLD 1 * SIZE(A) + fsubp %st, %st(2) +#endif + +#if defined(LN) || defined(LT) + FLD 0 * SIZE(A) + fmul %st(1), %st + FLD 0 * SIZE(A) + fmul %st(3), %st + FLD 1 * SIZE(A) + fmulp %st, %st(3) + FLD 1 * SIZE(A) + fmulp %st, %st(4) +#endif + +#if defined(RN) || defined(RT) + FLD 0 * SIZE(B) + fmul %st(1), %st + FLD 0 * SIZE(B) + fmul %st(3), %st + FLD 1 * SIZE(B) + fmulp %st, %st(3) + FLD 1 * SIZE(B) + fmulp %st, %st(4) +#endif + +#ifndef CONJ + faddp %st, %st(2) + fsubp %st, %st(2) +#else + fsubp %st, %st(2) + faddp %st, %st(2) +#endif + +#ifdef LN + subl $2 * SIZE, C +#endif + +#if defined(LN) || defined(LT) + FSTU 1 * SIZE(B) + fxch %st(1) + FSTU 0 * SIZE(B) +#else + FSTU 1 * SIZE(A) + fxch %st(1) + FSTU 0 * SIZE(A) +#endif + FST 0 * SIZE(C) + FST 1 * SIZE(C) + +#ifndef LN + addl $2 * SIZE, C +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, A + addl %eax, B +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl M + jg .L34 + ALIGN_2 + +.L33: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, STACK_B +#endif +#if defined(LT) || defined(RN) + movl B, STACK_B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + + decl STACK_N + jg 
.L30 + ALIGN_2 + +.L29: + popl %ebx + popl %esi + popl %edi + popl %ebp + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/ztrsm_kernel_LT_1x1_atom.S b/kernel/x86/ztrsm_kernel_LT_1x1_atom.S new file mode 100644 index 0000000000..bc0d03e94e --- /dev/null +++ b/kernel/x86/ztrsm_kernel_LT_1x1_atom.S @@ -0,0 +1,453 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 24 + STACK + ARGS(%esp) +#define A 32 + STACK + ARGS(%esp) +#define ARG_B 36 + STACK + ARGS(%esp) +#define C 40 + STACK + ARGS(%esp) +#define ARG_LDC 44 + STACK + ARGS(%esp) +#define OFFSET 48 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) +#define AORIG 12 + STACK(%esp) + +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 8 + 3) + +#ifndef CONJ +#define ADDSD1 addsd +#define ADDSD2 addsd +#define ADDSD3 addsd +#define ADDSD4 subsd + +#elif defined(LN) || defined(LT) +#define ADDSD1 addsd +#define ADDSD2 addsd +#define ADDSD3 subsd +#define ADDSD4 addsd +#else +#define ADDSD1 addsd +#define ADDSD2 subsd +#define ADDSD3 addsd +#define ADDSD4 addsd +#endif + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi +#define CO1 %esi + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + movl OFFSET, %eax +#ifdef RN + negl %eax +#endif + movl %eax, KK + + sall $ZBASE_SHIFT, LDC + +#ifdef LN + movl M, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + sall $ZBASE_SHIFT, %eax + imull K, %eax + addl %eax, B + + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + testl %eax, %eax + movl %eax, J # j = n + jle .L999 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, B +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + testl %ebx, %ebx + jle .L99 + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + + xorps %xmm4, %xmm4 + prefetcht0 1 * SIZE(CO1) + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + ADDSD3 %xmm2, %xmm6 + movsd 1 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 0 * SIZE(BB), %xmm0 + ADDSD4 %xmm3, %xmm7 + mulsd 1 * SIZE(BB), %xmm1 + + ADDSD1 %xmm0, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 0 * SIZE(BB), %xmm2 + ADDSD2 %xmm1, %xmm5 + mulsd 1 * SIZE(BB), %xmm3 + + ADDSD3 %xmm2, %xmm6 + movsd 3 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 2 * SIZE(BB), %xmm0 + ADDSD4 %xmm3, %xmm7 + mulsd 3 * SIZE(BB), %xmm1 + + ADDSD1 %xmm0, %xmm4 + movsd 4 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 2 * SIZE(BB), %xmm2 + ADDSD2 %xmm1, %xmm5 + mulsd 3 * SIZE(BB), %xmm3 + + ADDSD3 %xmm2, %xmm6 + movsd 5 * 
SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 4 * SIZE(BB), %xmm0 + ADDSD4 %xmm3, %xmm7 + mulsd 5 * SIZE(BB), %xmm1 + + ADDSD1 %xmm0, %xmm4 + movsd 6 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 4 * SIZE(BB), %xmm2 + ADDSD2 %xmm1, %xmm5 + mulsd 5 * SIZE(BB), %xmm3 + + ADDSD3 %xmm2, %xmm6 + movsd 7 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 6 * SIZE(BB), %xmm0 + ADDSD4 %xmm3, %xmm7 + mulsd 7 * SIZE(BB), %xmm1 + + ADDSD1 %xmm0, %xmm4 + movsd 8 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 6 * SIZE(BB), %xmm2 + ADDSD2 %xmm1, %xmm5 + mulsd 7 * SIZE(BB), %xmm3 + + addl $8 * SIZE, BB + addl $8 * SIZE, AA + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + ADDSD3 %xmm2, %xmm6 + movsd 1 * SIZE(AA), %xmm2 + movaps %xmm0, %xmm1 + mulsd 0 * SIZE(BB), %xmm0 + ADDSD4 %xmm3, %xmm7 + mulsd 1 * SIZE(BB), %xmm1 + + ADDSD1 %xmm0, %xmm4 + movsd 2 * SIZE(AA), %xmm0 + movaps %xmm2, %xmm3 + mulsd 0 * SIZE(BB), %xmm2 + ADDSD2 %xmm1, %xmm5 + mulsd 1 * SIZE(BB), %xmm3 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: + ADDSD3 %xmm2, %xmm6 + ADDSD4 %xmm3, %xmm7 + + addsd %xmm7, %xmm4 + addsd %xmm5, %xmm6 + +#if defined(LN) || defined(RT) + movl KK, %eax + subl $1, %eax + + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BB), %xmm0 + movsd 1 * SIZE(BB), %xmm1 +#else + movsd 0 * SIZE(AA), %xmm0 + movsd 1 * SIZE(AA), %xmm1 +#endif + + subsd %xmm4, %xmm0 + subsd %xmm6, %xmm1 + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(AA), %xmm6 + movaps %xmm0, %xmm5 + movsd 1 * SIZE(AA), %xmm7 + movaps %xmm1, %xmm4 + + mulsd %xmm6, %xmm0 + mulsd %xmm6, %xmm1 + mulsd %xmm7, %xmm5 + mulsd %xmm7, %xmm4 + + ADDSD4 %xmm4, %xmm0 + ADDSD3 %xmm5, %xmm1 +#endif + +#if defined(RN) || defined(RT) + movsd 0 * SIZE(BB), %xmm6 + movaps %xmm0, %xmm5 + movsd 1 * SIZE(BB), %xmm7 + movaps %xmm1, %xmm4 + + mulsd %xmm6, %xmm0 + mulsd %xmm6, %xmm1 + mulsd %xmm7, %xmm5 + mulsd %xmm7, %xmm4 + + ADDSD4 %xmm4, %xmm0 + ADDSD2 %xmm5, %xmm1 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BB) + movsd %xmm1, 1 * SIZE(BB) +#else + movsd %xmm0, 0 * SIZE(AA) + movsd %xmm1, 1 * SIZE(AA) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA + addl %eax, BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L10 + ALIGN_4 + +.L99: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S b/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S new file mode 100644 index 0000000000..b01498f783 --- /dev/null +++ b/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S @@ -0,0 +1,969 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. 
*/ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 24 + STACK + ARGS(%esp) +#define A 32 + STACK + ARGS(%esp) +#define ARG_B 36 + STACK + ARGS(%esp) +#define C 40 + STACK + ARGS(%esp) +#define ARG_LDC 44 + STACK + ARGS(%esp) +#define OFFSET 48 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) +#define AORIG 12 + STACK(%esp) + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#ifdef NEHALEM +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 2) +#endif + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi +#define CO1 %esi + +#define ADD1 addpd +#define ADD2 addpd + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + movl OFFSET, %eax +#ifdef RN + negl %eax +#endif + movl %eax, KK + + movl M, %ebx + testl %ebx, %ebx + jle .L999 + + subl $-16 * SIZE, A + subl $-16 * SIZE, B + + sall $ZBASE_SHIFT, LDC + +#ifdef LN + movl M, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + sall $ZBASE_SHIFT, %eax + imull K, %eax + addl %eax, B + + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + sarl $1, %eax + movl %eax, J # j = n + jle .L100 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movl 
A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -16 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + +#ifdef LN + pxor %xmm4, %xmm4 + prefetcht0 -2 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 -2 * SIZE(CO1, LDC) +#else + pxor %xmm4, %xmm4 + prefetcht0 1 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 1 * SIZE(CO1, LDC) +#endif + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + ADD1 %xmm3, %xmm6 + movaps -14 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm1, %xmm4 + movaps -12 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + + ADD1 %xmm3, %xmm6 + movaps -10 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm1, %xmm4 + movaps -8 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + ADD1 %xmm3, %xmm6 + movaps -6 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm1, %xmm4 + movaps -4 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + + ADD1 %xmm3, %xmm6 + movaps -2 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm1, %xmm4 + movaps 0 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) + + ADD1 %xmm3, %xmm6 + movaps 2 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm1, %xmm4 + movaps 4 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -6 * SIZE(AA), %xmm0 + + ADD1 %xmm3, %xmm6 + movaps 6 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm1, %xmm4 + movaps 8 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + ADD1 %xmm3, %xmm6 + movaps 10 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm1, %xmm4 + movaps 12 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -2 * SIZE(AA), %xmm0 + + ADD1 %xmm3, 
%xmm6 + movaps 14 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm1, %xmm4 + movaps 16 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + subl $-32 * SIZE, BB + mulpd %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + + subl $-16 * SIZE, AA + + subl $1, %eax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_4 + +.L16: + ADD1 %xmm3, %xmm6 + movaps -14 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm1, %xmm4 + movaps -12 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + + movaps -14 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), BB +#endif + + ADD1 %xmm3, %xmm6 + pcmpeqb %xmm1, %xmm1 + ADD2 %xmm2, %xmm7 + psllq $63, %xmm1 + +#ifndef CONJ + pshufd $0x40, %xmm1, %xmm0 + shufps $0x04, %xmm1, %xmm1 + + pxor %xmm0, %xmm4 + pxor %xmm0, %xmm6 +#else +#if defined(LN) || defined(LT) + pshufd $0x40, %xmm1, %xmm0 +#else + pshufd $0x04, %xmm1, %xmm0 +#endif + shufps $0x40, %xmm1, %xmm1 + + pxor %xmm0, %xmm5 + pxor %xmm0, %xmm7 +#endif + + haddpd %xmm5, %xmm4 + haddpd %xmm7, %xmm6 + + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BB), %xmm5 + movapd -14 * SIZE(BB), %xmm7 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#else + movapd -16 * SIZE(AA), %xmm5 + movapd -14 * SIZE(AA), %xmm7 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#endif + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AA), %xmm2 + movddup -15 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm4 + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm4, %xmm5 + addpd %xmm6, %xmm7 +#endif + +#ifdef RN + movddup -16 * SIZE(BB), %xmm2 + movddup -15 * SIZE(BB), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 + + movddup -14 * SIZE(BB), %xmm2 + movddup -13 * SIZE(BB), %xmm3 + + movapd %xmm5, %xmm4 + pshufd $0x4e, %xmm5, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm4 + mulpd %xmm3, %xmm6 + + subpd %xmm4, %xmm7 + subpd %xmm6, %xmm7 + + movddup -10 * SIZE(BB), %xmm2 + movddup -9 * SIZE(BB), %xmm3 + + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm6, %xmm7 +#endif + +#ifdef RT + movddup -10 * SIZE(BB), %xmm2 + movddup -9 * SIZE(BB), %xmm3 + + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm6, %xmm7 + + movddup -12 * SIZE(BB), %xmm2 + movddup -11 * SIZE(BB), %xmm3 + + movapd %xmm7, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm4 + mulpd %xmm3, %xmm6 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm5 + + movddup -16 * SIZE(BB), %xmm2 + movddup -15 * SIZE(BB), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + movlpd %xmm5, 0 * SIZE(CO1) + movhpd %xmm5, 1 * 
SIZE(CO1) + + movlpd %xmm7, 0 * SIZE(CO1, LDC) + movhpd %xmm7, 1 * SIZE(CO1, LDC) + +#if defined(LN) || defined(LT) + movapd %xmm5, -16 * SIZE(BB) + movapd %xmm7, -14 * SIZE(BB) +#else + movapd %xmm5, -16 * SIZE(AA) + movapd %xmm7, -14 * SIZE(AA) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L10 + ALIGN_4 + +.L99: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_4 + +.L100: + movl N, %eax + testl $1, %eax + jle .L999 + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, B +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + ALIGN_4 + +L110: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -16 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + + pxor %xmm4, %xmm4 +#ifdef LN + prefetcht0 -2 * SIZE(CO1) +#else + prefetcht0 1 * SIZE(CO1) +#endif + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je L115 + ALIGN_4 + +L112: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + + ADD1 %xmm1, %xmm4 + movaps -14 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + ADD1 %xmm1, %xmm6 + movaps -12 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm7 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + + ADD1 %xmm1, %xmm4 + movaps -10 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + ADD1 %xmm1, %xmm6 + movaps -8 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm7 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -6 * SIZE(AA), %xmm0 + + ADD1 %xmm1, %xmm4 + movaps -6 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + ADD1 %xmm1, %xmm6 + movaps -4 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm7 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -2 * SIZE(AA), %xmm0 + + ADD1 %xmm1, %xmm4 + movaps -2 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + + ADD1 %xmm1, %xmm6 + movaps 0 * SIZE(BB), %xmm1 + ADD2 
%xmm2, %xmm7 + + subl $-16 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne L112 + ALIGN_4 + +L115: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je L118 + ALIGN_4 + +L116: + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + + ADD1 %xmm1, %xmm4 + movaps -14 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg L116 + ALIGN_4 + +L118: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), BB +#endif + + addpd %xmm6, %xmm4 + pcmpeqb %xmm1, %xmm1 + addpd %xmm7, %xmm5 + psllq $63, %xmm1 + +#ifndef CONJ + pshufd $0x40, %xmm1, %xmm0 + shufps $0x04, %xmm1, %xmm1 + + pxor %xmm0, %xmm4 +#else +#if defined(LN) || defined(LT) + pshufd $0x40, %xmm1, %xmm0 +#else + pshufd $0x04, %xmm1, %xmm0 +#endif + shufps $0x40, %xmm1, %xmm1 + + pxor %xmm0, %xmm5 +#endif + + haddpd %xmm5, %xmm4 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BB), %xmm5 + subpd %xmm4, %xmm5 +#else + movapd -16 * SIZE(AA), %xmm5 + subpd %xmm4, %xmm5 +#endif + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AA), %xmm2 + movddup -15 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#if defined(RN) || defined(RT) + movddup -16 * SIZE(BB), %xmm2 + movddup -15 * SIZE(BB), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + movlpd %xmm5, 0 * SIZE(CO1) + movhpd %xmm5, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm5, -16 * SIZE(BB) +#else + movapd %xmm5, -16 * SIZE(AA) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA + addl %eax, BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg L110 + +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/ztrsm_kernel_LT_1x2_sse2.S b/kernel/x86/ztrsm_kernel_LT_1x2_sse2.S new file mode 100644 index 0000000000..fdeecc7939 --- /dev/null +++ b/kernel/x86/ztrsm_kernel_LT_1x2_sse2.S @@ -0,0 +1,1328 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_ALPHA_R 16 + STACK + ARGS(%esi) +#define STACK_ALPHA_I 24 + STACK + ARGS(%esi) +#define STACK_A 32 + STACK + ARGS(%esi) +#define STACK_B 36 + STACK + ARGS(%esi) +#define STACK_C 40 + STACK + ARGS(%esi) +#define STACK_LDC 44 + STACK + ARGS(%esi) +#define STACK_OFFT 48 + STACK + ARGS(%esi) + +#define POSINV 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define AORIG 56(%esp) +#define BORIG 60(%esp) +#define BUFFER 128(%esp) + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#if defined(OPTERON) || defined(BARCELONA) +#define PREFETCH prefetch +#else +#define PREFETCH prefetcht0 +#endif + +#define PREFETCHSIZE (8 * 10 + 4) + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi +#define CO1 %esi + +#ifndef CONJ +#define NN +#else +#if defined(LN) || defined(LT) +#define CN +#else +#define NC +#endif +#endif + +#define KERNEL1(address) \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm4; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + movapd 2 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 6 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 16 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 10 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 12 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + mulpd 14 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm3, %xmm6; \ + movapd 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm0, %xmm7; \ + movapd 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm4; \ + movapd 18 * SIZE + 
(address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 20 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 22 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL4(address) \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 26 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 28 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + mulpd 30 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm3, %xmm6; \ + movapd 40 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm0, %xmm7; \ + movapd 16 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL5(address) \ + PREFETCH (PREFETCHSIZE + 8) * SIZE + (address) * 1 * SIZE(AA); \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm4; \ + movapd 34 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + mulpd 38 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm2, %xmm6; \ + movapd 48 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm1, %xmm7; \ + movapd 10 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL6(address) \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 42 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 44 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + mulpd 46 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 12 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL7(address) \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm4; \ + movapd 50 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 52 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + mulpd 54 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm2, %xmm6; \ + movapd 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm1, %xmm7; \ + movapd 14 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 58 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 60 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + mulpd 62 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 72 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp # align stack + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movl STACK_M, %ebx + movl STACK_N, %eax + movl STACK_K, %ecx + movl STACK_A, %edx + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK + + movl STACK_B, B + movl STACK_C, %ebx + movss STACK_OFFT, %xmm4 + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 # Generate mask + pxor %xmm2, %xmm2 + + movlpd %xmm2, 0 + POSINV + movlpd %xmm7, 8 + POSINV + + movl %ebx, C + movl STACK_LDC, LDC + + movss %xmm4, OFFSET + movss %xmm4, KK + + sall 
$ZBASE_SHIFT, LDC + +#ifdef LN + movl M, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + sall $ZBASE_SHIFT, %eax + imull K, %eax + addl %eax, B + + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + sarl $1, %eax + movl %eax, J # j = n + jle .L100 + ALIGN_4 + +.L01: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, BB + +#ifdef RT + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $1 + ZBASE_SHIFT, %eax + addl %eax, B + leal (BB, %eax, 2), BB +#endif + +#if defined(LT) + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $1, %eax + jle .L03 + ALIGN_4 + +.L02: + prefetchnta 56 * SIZE(B) + + movlpd 0 * SIZE(B), %xmm0 + movlpd 1 * SIZE(B), %xmm1 + movlpd 2 * SIZE(B), %xmm2 + movlpd 3 * SIZE(B), %xmm3 + movlpd 4 * SIZE(B), %xmm4 + movlpd 5 * SIZE(B), %xmm5 + movlpd 6 * SIZE(B), %xmm6 + movlpd 7 * SIZE(B), %xmm7 + + movlpd %xmm0, 0 * SIZE(BB) + movlpd %xmm0, 1 * SIZE(BB) + movlpd %xmm1, 2 * SIZE(BB) + movlpd %xmm1, 3 * SIZE(BB) + movlpd %xmm2, 4 * SIZE(BB) + movlpd %xmm2, 5 * SIZE(BB) + movlpd %xmm3, 6 * SIZE(BB) + movlpd %xmm3, 7 * SIZE(BB) + movlpd %xmm4, 8 * SIZE(BB) + movlpd %xmm4, 9 * SIZE(BB) + movlpd %xmm5, 10 * SIZE(BB) + movlpd %xmm5, 11 * SIZE(BB) + movlpd %xmm6, 12 * SIZE(BB) + movlpd %xmm6, 13 * SIZE(BB) + movlpd %xmm7, 14 * SIZE(BB) + movlpd %xmm7, 15 * SIZE(BB) + + addl $ 8 * SIZE, B + subl $-16 * SIZE, BB + + decl %eax + jne .L02 + ALIGN_4 + +.L03: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $1, %eax + BRANCH + jle .L05 + + movlpd 0 * SIZE(B), %xmm0 + movlpd 1 * SIZE(B), %xmm1 + movlpd 2 * SIZE(B), %xmm2 + movlpd 3 * SIZE(B), %xmm3 + + movlpd %xmm0, 0 * SIZE(BB) + movlpd %xmm0, 1 * SIZE(BB) + movlpd %xmm1, 2 * SIZE(BB) + movlpd %xmm1, 3 * SIZE(BB) + movlpd %xmm2, 4 * SIZE(BB) + movlpd %xmm2, 5 * SIZE(BB) + movlpd %xmm3, 6 * SIZE(BB) + movlpd %xmm3, 7 * SIZE(BB) + + addl $4 * SIZE, B + ALIGN_4 + +.L05: +#if defined(LT) || defined(RN) + movl A, %eax + movl %eax, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + leal (, LDC, 2), %eax + subl %eax, C +#endif + + movl C, CO1 + +#ifndef RT + leal (, LDC, 2), %eax + addl %eax, C +#endif + + movl M, %ebx + testl %ebx, %ebx + jle .L100 + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifdef LN + prefetchw -2 * SIZE(CO1) + prefetchw -2 * SIZE(CO1, LDC) +#else + prefetchw 2 * SIZE(CO1) + prefetchw 2 * SIZE(CO1, LDC) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + +#if 1 + andl $-8, %eax + sall $4, %eax + je .L15 +.L1X: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + 
KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + cmpl $128 * 1, %eax + jle .L12 + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + cmpl $128 * 2, %eax + jle .L12 + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + cmpl $128 * 3, %eax + jle .L12 + KERNEL1(16 * 3) + KERNEL2(16 * 3) + KERNEL3(16 * 3) + KERNEL4(16 * 3) + KERNEL5(16 * 3) + KERNEL6(16 * 3) + KERNEL7(16 * 3) + KERNEL8(16 * 3) + cmpl $128 * 4, %eax + jle .L12 + KERNEL1(16 * 4) + KERNEL2(16 * 4) + KERNEL3(16 * 4) + KERNEL4(16 * 4) + KERNEL5(16 * 4) + KERNEL6(16 * 4) + KERNEL7(16 * 4) + KERNEL8(16 * 4) + cmpl $128 * 5, %eax + jle .L12 + KERNEL1(16 * 5) + KERNEL2(16 * 5) + KERNEL3(16 * 5) + KERNEL4(16 * 5) + KERNEL5(16 * 5) + KERNEL6(16 * 5) + KERNEL7(16 * 5) + KERNEL8(16 * 5) + cmpl $128 * 6, %eax + jle .L12 + KERNEL1(16 * 6) + KERNEL2(16 * 6) + KERNEL3(16 * 6) + KERNEL4(16 * 6) + KERNEL5(16 * 6) + KERNEL6(16 * 6) + KERNEL7(16 * 6) + KERNEL8(16 * 6) + cmpl $128 * 7, %eax + jle .L12 + KERNEL1(16 * 7) + KERNEL2(16 * 7) + KERNEL3(16 * 7) + KERNEL4(16 * 7) + KERNEL5(16 * 7) + KERNEL6(16 * 7) + KERNEL7(16 * 7) + KERNEL8(16 * 7) + + addl $128 * 4 * SIZE, BB + addl $128 * 1 * SIZE, AA + subl $128 * 8, %eax + jg .L1X + jmp .L15 + +.L12: + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB + ALIGN_4 +#else + + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + addl $64 * SIZE, BB + addl $16 * SIZE, AA + decl %eax + jne .L11 + ALIGN_4 +#endif + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + ALIGN_4 + +.L13: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movapd 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm5 + movapd 4 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 8 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L13 + ALIGN_4 + +.L14: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ZBASE_SHIFT, %eax + addl %eax, AA + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + + movapd POSINV, %xmm1 + + SHUFPD_1 %xmm5, %xmm5 + SHUFPD_1 %xmm7, %xmm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm1, %xmm5 + xorpd %xmm1, %xmm7 +#else + xorpd %xmm1, %xmm4 + xorpd %xmm1, %xmm6 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 +#else + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm5 + movapd 2 * SIZE(B), %xmm7 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#else + movapd 0 * SIZE(AA), %xmm5 + movapd 2 * SIZE(AA), %xmm7 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm1, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movlpd 0 * SIZE(AA), %xmm2 + movhpd 0 * SIZE(AA), %xmm2 + movlpd 1 * SIZE(AA), %xmm3 + movhpd 1 * SIZE(AA), %xmm3 + + 
pshufd $0x4e, %xmm5, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm4 + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm4, %xmm5 + addpd %xmm6, %xmm7 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm2 + movhpd 0 * SIZE(B), %xmm2 + movlpd 1 * SIZE(B), %xmm3 + movhpd 1 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 + + movlpd 2 * SIZE(B), %xmm2 + movhpd 2 * SIZE(B), %xmm2 + movlpd 3 * SIZE(B), %xmm3 + movhpd 3 * SIZE(B), %xmm3 + + movapd %xmm5, %xmm4 + pshufd $0x4e, %xmm5, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm4 + mulpd %xmm3, %xmm6 + + subpd %xmm4, %xmm7 + subpd %xmm6, %xmm7 + + movlpd 6 * SIZE(B), %xmm2 + movhpd 6 * SIZE(B), %xmm2 + movlpd 7 * SIZE(B), %xmm3 + movhpd 7 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm6, %xmm7 +#endif + +#ifdef RT + movlpd 6 * SIZE(B), %xmm2 + movhpd 6 * SIZE(B), %xmm2 + movlpd 7 * SIZE(B), %xmm3 + movhpd 7 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm6, %xmm7 + + movlpd 4 * SIZE(B), %xmm2 + movhpd 4 * SIZE(B), %xmm2 + movlpd 5 * SIZE(B), %xmm3 + movhpd 5 * SIZE(B), %xmm3 + + movapd %xmm7, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm4 + mulpd %xmm3, %xmm6 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm5 + + movlpd 0 * SIZE(B), %xmm2 + movhpd 0 * SIZE(B), %xmm2 + movlpd 1 * SIZE(B), %xmm3 + movhpd 1 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + movlpd %xmm5, 0 * SIZE(CO1) + movhpd %xmm5, 1 * SIZE(CO1) + + movlpd %xmm7, 0 * SIZE(CO1, LDC) + movhpd %xmm7, 1 * SIZE(CO1, LDC) + +#if defined(LN) || defined(LT) + movapd %xmm5, 0 * SIZE(B) + movapd %xmm7, 2 * SIZE(B) + + movlpd %xmm5, 0 * SIZE(BB) + movlpd %xmm5, 1 * SIZE(BB) + movhpd %xmm5, 2 * SIZE(BB) + movhpd %xmm5, 3 * SIZE(BB) + movlpd %xmm7, 4 * SIZE(BB) + movlpd %xmm7, 5 * SIZE(BB) + movhpd %xmm7, 6 * SIZE(BB) + movhpd %xmm7, 7 * SIZE(BB) +#else + movapd %xmm5, 0 * SIZE(AA) + movapd %xmm7, 2 * SIZE(AA) + +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L10 + ALIGN_4 + +.L99: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_4 + +.L100: + movl N, %eax + andl $1, %eax + jle .L500 + ALIGN_4 + +.L101: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, BB + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $ZBASE_SHIFT, %eax + addl %eax, B + leal (BB, %eax, 2), BB +#endif + +#if defined(LT) + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || 
defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + jle .L103 + ALIGN_4 + +.L102: + prefetchnta 56 * SIZE(B) + + movlpd 0 * SIZE(B), %xmm0 + movlpd 1 * SIZE(B), %xmm1 + movlpd 2 * SIZE(B), %xmm2 + movlpd 3 * SIZE(B), %xmm3 + movlpd 4 * SIZE(B), %xmm4 + movlpd 5 * SIZE(B), %xmm5 + movlpd 6 * SIZE(B), %xmm6 + movlpd 7 * SIZE(B), %xmm7 + + movlpd %xmm0, 0 * SIZE(BB) + movlpd %xmm0, 1 * SIZE(BB) + movlpd %xmm1, 2 * SIZE(BB) + movlpd %xmm1, 3 * SIZE(BB) + movlpd %xmm2, 4 * SIZE(BB) + movlpd %xmm2, 5 * SIZE(BB) + movlpd %xmm3, 6 * SIZE(BB) + movlpd %xmm3, 7 * SIZE(BB) + movlpd %xmm4, 8 * SIZE(BB) + movlpd %xmm4, 9 * SIZE(BB) + movlpd %xmm5, 10 * SIZE(BB) + movlpd %xmm5, 11 * SIZE(BB) + movlpd %xmm6, 12 * SIZE(BB) + movlpd %xmm6, 13 * SIZE(BB) + movlpd %xmm7, 14 * SIZE(BB) + movlpd %xmm7, 15 * SIZE(BB) + + addl $ 8 * SIZE, B + subl $-16 * SIZE, BB + decl %eax + jne .L102 + ALIGN_4 + +.L103: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax + BRANCH + jle .L105 + ALIGN_4 + +.L104: + movlpd 0 * SIZE(B), %xmm0 + movlpd 1 * SIZE(B), %xmm1 + + movlpd %xmm0, 0 * SIZE(BB) + movlpd %xmm0, 1 * SIZE(BB) + movlpd %xmm1, 2 * SIZE(BB) + movlpd %xmm1, 3 * SIZE(BB) + + addl $2 * SIZE, B + addl $4 * SIZE, BB + decl %eax + jne .L104 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movl A, %eax + movl %eax, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + subl LDC, C +#endif + + movl C, CO1 + +#ifndef RT + addl LDC, C +#endif + + movl M, %ebx + testl %ebx, %ebx + jle .L199 + ALIGN_4 + +.L110: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $ZBASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movapd 0 * SIZE(AA), %xmm0 + movapd 8 * SIZE(AA), %xmm1 + movapd 0 * SIZE(BB), %xmm2 + movapd 8 * SIZE(BB), %xmm3 + +#ifdef LN + prefetchw -2 * SIZE(CO1) +#else + prefetchw 2 * SIZE(CO1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L112 + ALIGN_4 + +.L111: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movapd 4 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 10 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd 12 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movapd 6 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 14 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm7 + movapd 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd 18 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + movapd 20 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movapd 10 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm2 + mulpd 22 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + movapd 32 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm7 + movapd 12 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 26 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm4 + movapd 28 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movapd 14 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 30 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm6 
+ movapd 40 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm7 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L111 + ALIGN_4 + +.L112: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L114 + ALIGN_4 + +.L113: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L113 + ALIGN_4 + +.L114: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ZBASE_SHIFT, %eax + addl %eax, AA + addl %eax, B + leal (BB, %eax, 2), BB +#endif + + movapd POSINV, %xmm1 + + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + SHUFPD_1 %xmm5, %xmm5 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm1, %xmm5 +#else + xorpd %xmm1, %xmm4 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm5, %xmm4 +#else + addpd %xmm5, %xmm4 +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm5 + subpd %xmm4, %xmm5 +#else + movapd 0 * SIZE(AA), %xmm5 + subpd %xmm4, %xmm5 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm1, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movlpd 0 * SIZE(AA), %xmm2 + movhpd 0 * SIZE(AA), %xmm2 + movlpd 1 * SIZE(AA), %xmm3 + movhpd 1 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm2 + movhpd 0 * SIZE(B), %xmm2 + movlpd 1 * SIZE(B), %xmm3 + movhpd 1 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef RT + movlpd 0 * SIZE(B), %xmm2 + movhpd 0 * SIZE(B), %xmm2 + movlpd 1 * SIZE(B), %xmm3 + movhpd 1 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + movlpd %xmm5, 0 * SIZE(CO1) + movhpd %xmm5, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm5, 0 * SIZE(B) + + movlpd %xmm5, 0 * SIZE(BB) + movlpd %xmm5, 1 * SIZE(BB) + movhpd %xmm5, 2 * SIZE(BB) + movhpd %xmm5, 3 * SIZE(BB) +#else + movapd %xmm5, 0 * SIZE(AA) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L110 + ALIGN_4 + +.L199: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L500: + movl OLD_STACK, %esp + + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/ztrsm_kernel_LT_1x2_sse3.S b/kernel/x86/ztrsm_kernel_LT_1x2_sse3.S new file mode 100644 index 0000000000..29103bad29 --- 
/dev/null +++ b/kernel/x86/ztrsm_kernel_LT_1x2_sse3.S @@ -0,0 +1,965 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 24 + STACK + ARGS(%esp) +#define A 32 + STACK + ARGS(%esp) +#define ARG_B 36 + STACK + ARGS(%esp) +#define C 40 + STACK + ARGS(%esp) +#define ARG_LDC 44 + STACK + ARGS(%esp) +#define OFFSET 48 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) +#define AORIG 12 + STACK(%esp) + +#ifdef PENTIUM4 +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#ifdef PENTIUMM +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi +#define CO1 %esi + +#define ADDSUB addpd + +#define KERNEL1(address) \ + mulpd %xmm0, %xmm2; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + addpd %xmm2, %xmm4; \ + movddup 1 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + ADDSUB %xmm2, %xmm5; \ + movddup 2 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm6; \ + movddup 3 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + movapd 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ + ADDSUB %xmm2, %xmm7; \ + movddup 4 * SIZE + (address) * 2 * SIZE(BB), %xmm2 + +#define KERNEL2(address) \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm4; \ + movddup 5 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + ADDSUB %xmm2, %xmm5; \ + movddup 6 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm6; \ + movddup 7 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + movapd 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ + ADDSUB %xmm2, %xmm7; \ + movddup 16 * SIZE + (address) * 2 * SIZE(BB), %xmm2 + +#define KERNEL3(address) \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm4; \ + movddup 9 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + ADDSUB %xmm3, %xmm5; \ + movddup 10 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm6; \ + movddup 11 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + movapd 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ + ADDSUB %xmm3, %xmm7; \ + movddup 12 * SIZE + (address) * 2 * SIZE(BB), %xmm3 + +#define KERNEL4(address) \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm4; \ + movddup 13 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + ADDSUB %xmm3, %xmm5; \ + movddup 14 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm6; \ + movddup 15 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + movapd 16 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ + ADDSUB %xmm3, %xmm7; \ + movddup 24 * SIZE + (address) * 2 * SIZE(BB), %xmm3 + +#define KERNEL5(address) \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm4; \ + movddup 17 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + ADDSUB %xmm2, %xmm5; \ + movddup 18 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm6; \ + movddup 19 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + movapd 10 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ + ADDSUB %xmm2, 
%xmm7; \ + movddup 20 * SIZE + (address) * 2 * SIZE(BB), %xmm2 + +#define KERNEL6(address) \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm4; \ + movddup 21 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + ADDSUB %xmm2, %xmm5; \ + movddup 22 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm6; \ + movddup 23 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + movapd 12 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ + ADDSUB %xmm2, %xmm7 + +#define KERNEL7(address) \ + movddup 32 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm4; \ + movddup 25 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + ADDSUB %xmm3, %xmm5; \ + movddup 26 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm6; \ + movddup 27 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + movapd 14 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ + ADDSUB %xmm3, %xmm7; \ + movddup 28 * SIZE + (address) * 2 * SIZE(BB), %xmm3 + +#define KERNEL8(address) \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm4; \ + movddup 29 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + ADDSUB %xmm3, %xmm5; \ + movddup 30 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm6; \ + movddup 31 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + movapd 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ + ADDSUB %xmm3, %xmm7; \ + movddup 40 * SIZE + (address) * 2 * SIZE(BB), %xmm3 + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + movl OFFSET, %eax +#ifdef RN + negl %eax +#endif + movl %eax, KK + + sall $ZBASE_SHIFT, LDC + +#ifdef LN + movl M, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + sall $ZBASE_SHIFT, %eax + imull K, %eax + addl %eax, B + + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + sarl $1, %eax + movl %eax, J # j = n + jle .L100 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + testl %ebx, %ebx + jle .L100 + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movddup 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movddup 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifdef LN + prefetchnta -2 * SIZE(CO1) + prefetchnta -2 * SIZE(CO1, LDC, 1) +#else + prefetchnta 2 * SIZE(CO1) + prefetchnta 2 * SIZE(CO1, LDC, 1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L12 + ALIGN_4 + +.L11: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + 
KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + addl $32 * SIZE, BB + addl $16 * SIZE, AA + decl %eax + jne .L11 + ALIGN_4 + +.L12: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + ALIGN_4 + +.L13: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + ADDSUB %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 3 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + ADDSUB %xmm2, %xmm7 + movddup 4 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L13 + ALIGN_4 + +.L14: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), BB +#endif + + pcmpeqb %xmm1, %xmm1 + psllq $63, %xmm1 + + shufps $0x40, %xmm1, %xmm1 + + SHUFPD_1 %xmm5, %xmm5 + SHUFPD_1 %xmm7, %xmm7 + +#ifndef CONJ + xorpd %xmm1, %xmm5 + xorpd %xmm1, %xmm7 + + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 +#else +#if defined(LN) || defined(LT) + xorpd %xmm1, %xmm4 + xorpd %xmm1, %xmm6 +#else + xorpd %xmm1, %xmm5 + xorpd %xmm1, %xmm7 +#endif + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BB), %xmm5 + movapd 2 * SIZE(BB), %xmm7 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#else + movapd 0 * SIZE(AA), %xmm5 + movapd 2 * SIZE(AA), %xmm7 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm1, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movddup 0 * SIZE(AA), %xmm2 + movddup 1 * SIZE(AA), %xmm3 + + movapd %xmm5, %xmm4 + movapd %xmm7, %xmm6 + + SHUFPD_1 %xmm4, %xmm4 + SHUFPD_1 %xmm6, %xmm6 + + xorpd %xmm1, %xmm4 + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm4, %xmm5 + addpd %xmm6, %xmm7 +#endif + +#ifdef RN + movddup 0 * SIZE(BB), %xmm2 + movddup 1 * SIZE(BB), %xmm3 + + movapd %xmm5, %xmm4 + SHUFPD_1 %xmm4, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 + + movddup 2 * SIZE(BB), %xmm2 + movddup 3 * SIZE(BB), %xmm3 + + movapd %xmm5, %xmm4 + movapd %xmm5, %xmm6 + SHUFPD_1 %xmm6, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm4 + mulpd %xmm3, %xmm6 + + subpd %xmm4, %xmm7 + subpd %xmm6, %xmm7 + + movddup 6 * SIZE(BB), %xmm2 + movddup 7 * SIZE(BB), %xmm3 + + movapd %xmm7, %xmm6 + SHUFPD_1 %xmm6, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm6, %xmm7 +#endif + +#ifdef RT + movddup 6 * SIZE(BB), %xmm2 + movddup 7 * SIZE(BB), %xmm3 + + movapd %xmm7, %xmm6 + SHUFPD_1 %xmm6, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm6, %xmm7 + + movddup 4 * SIZE(BB), %xmm2 + movddup 5 * SIZE(BB), %xmm3 + + movapd %xmm7, %xmm4 + movapd %xmm7, %xmm6 + SHUFPD_1 %xmm6, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm4 + mulpd %xmm3, %xmm6 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm5 + + movddup 0 * SIZE(BB), %xmm2 + movddup 1 * SIZE(BB), %xmm3 + + movapd %xmm5, %xmm4 + SHUFPD_1 %xmm4, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + movlpd %xmm5, 0 * SIZE(CO1) + movhpd %xmm5, 1 * SIZE(CO1) + + movlpd %xmm7, 0 * SIZE(CO1, LDC) + movhpd %xmm7, 1 * SIZE(CO1, 
LDC) + +#if defined(LN) || defined(LT) + movapd %xmm5, 0 * SIZE(BB) + movapd %xmm7, 2 * SIZE(BB) +#else + movapd %xmm5, 0 * SIZE(AA) + movapd %xmm7, 2 * SIZE(AA) + +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L10 + ALIGN_4 + +.L99: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_4 + +.L100: + movl N, %eax + testl $1, %eax + jle .L500 + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, B +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + testl %ebx, %ebx + jle .L500 + ALIGN_4 + +L110: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movddup 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movddup 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifdef LN + prefetchnta -2 * SIZE(CO1) +#else + prefetchnta 2 * SIZE(CO1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je L112 + ALIGN_4 + +L111: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + ADDSUB %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 3 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 4 * SIZE(AA), %xmm0 + ADDSUB %xmm2, %xmm7 + movddup 4 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 5 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 6 * SIZE(AA), %xmm0 + ADDSUB %xmm2, %xmm5 + movddup 6 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 7 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 16 * SIZE(AA), %xmm0 + ADDSUB %xmm2, %xmm7 + movddup 16 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movddup 9 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 10 * SIZE(AA), %xmm1 + ADDSUB %xmm3, %xmm5 + movddup 10 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm6 + movddup 11 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 12 * SIZE(AA), %xmm1 + ADDSUB %xmm3, %xmm7 + movddup 12 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movddup 13 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 14 * SIZE(AA), %xmm1 + ADDSUB %xmm3, %xmm5 + movddup 14 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm6 + movddup 15 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 24 * SIZE(AA), %xmm1 + ADDSUB %xmm3, %xmm7 + movddup 24 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne 
L111 + ALIGN_4 + +L112: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je L114 + ALIGN_4 + +L113: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + ADDSUB %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg L113 + ALIGN_4 + +L114: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), BB +#endif + + pcmpeqb %xmm1, %xmm1 + psllq $63, %xmm1 + + shufps $0x40, %xmm1, %xmm1 + + SHUFPD_1 %xmm5, %xmm5 + +#ifndef CONJ + xorpd %xmm1, %xmm5 + + subpd %xmm5, %xmm4 +#else +#if defined(LN) || defined(LT) + xorpd %xmm1, %xmm4 +#else + xorpd %xmm1, %xmm5 +#endif + addpd %xmm5, %xmm4 +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BB), %xmm5 + subpd %xmm4, %xmm5 +#else + movapd 0 * SIZE(AA), %xmm5 + subpd %xmm4, %xmm5 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm1, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movddup 0 * SIZE(AA), %xmm2 + movddup 1 * SIZE(AA), %xmm3 + + movapd %xmm5, %xmm4 + SHUFPD_1 %xmm4, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#if defined(RN) || defined(RT) + movddup 0 * SIZE(BB), %xmm2 + movddup 1 * SIZE(BB), %xmm3 + + movapd %xmm5, %xmm4 + SHUFPD_1 %xmm4, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + movlpd %xmm5, 0 * SIZE(CO1) + movhpd %xmm5, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm5, 0 * SIZE(BB) +#else + movapd %xmm5, 0 * SIZE(AA) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA + addl %eax, BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg L110 + +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L500: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/ztrsm_kernel_LT_2x1_core2.S b/kernel/x86/ztrsm_kernel_LT_2x1_core2.S new file mode 100644 index 0000000000..467465430b --- /dev/null +++ b/kernel/x86/ztrsm_kernel_LT_2x1_core2.S @@ -0,0 +1,1056 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define PREFETCHSIZE (8 * 4) + +#if !defined(HAVE_SSE2) || !defined(HAVE_MMX) +#error You have to check your configuration. +#endif + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_ALPHA_R 16 + STACK + ARGS(%esi) +#define STACK_ALPHA_I 24 + STACK + ARGS(%esi) +#define STACK_A 32 + STACK + ARGS(%esi) +#define STACK_B 36 + STACK + ARGS(%esi) +#define STACK_C 40 + STACK + ARGS(%esi) +#define STACK_LDC 44 + STACK + ARGS(%esi) +#define STACK_OFFT 48 + STACK + ARGS(%esi) + +#define POSINV 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define AORIG 56(%esp) +#define BORIG 60(%esp) +#define BUFFER 128(%esp) + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#define B %edi +#define LDC %ebp +#define AA %edx +#define BB %ecx +#define CO1 %esi + +#define ADD1 addpd +#define ADD2 addpd + +#ifndef CONJ +#define NN +#else +#if defined(LN) || defined(LT) +#define CN +#else +#define NC +#endif +#endif + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp # align stack + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movd STACK_M, %mm0 + movl STACK_N, %eax + movd STACK_K, %mm1 + movd STACK_A, %mm2 + movl STACK_B, B + movd STACK_C, %mm3 + movl STACK_LDC, LDC + movd STACK_OFFT, %mm4 + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 # Generate mask + pxor %xmm2, %xmm2 + + movsd %xmm2, 0 + POSINV + movsd %xmm7, 8 + POSINV + + movd %mm1, K + movl %eax, N + movd %mm0, M + movd %mm2, A + movd %mm3, C + movl %esi, OLD_STACK + movd %mm4, OFFSET + movd %mm4, KK + + sall $ZBASE_SHIFT, LDC + + subl $-16 * SIZE, A + subl $-16 * SIZE, B + +#ifdef LN + movl M, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + sall $ZBASE_SHIFT, %eax + imull K, %eax + addl %eax, B + + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, 
%eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + movl %eax, J # j = n + testl %eax, %eax + jle .L999 + ALIGN_2 + +.L01: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal 16 * SIZE + BUFFER, BB + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $ZBASE_SHIFT, %eax + addl %eax, B + leal (BB, %eax, 2), BB +#endif + +#if defined(LT) + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + jle .L03 + ALIGN_2 + +.L02: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + movddup -14 * SIZE(B), %xmm2 + movddup -13 * SIZE(B), %xmm3 + movddup -12 * SIZE(B), %xmm4 + movddup -11 * SIZE(B), %xmm5 + movddup -10 * SIZE(B), %xmm6 + movddup -9 * SIZE(B), %xmm7 + + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm1, -14 * SIZE(BB) + movapd %xmm2, -12 * SIZE(BB) + movapd %xmm3, -10 * SIZE(BB) + movapd %xmm4, -8 * SIZE(BB) + movapd %xmm5, -6 * SIZE(BB) + movapd %xmm6, -4 * SIZE(BB) + movapd %xmm7, -2 * SIZE(BB) + + addl $ 8 * SIZE, B + subl $-16 * SIZE, BB + decl %eax + jne .L02 + ALIGN_2 + +.L03: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax + BRANCH + jle .L05 + ALIGN_2 + +.L04: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + + movapd %xmm0, -16 * SIZE(BB) + movapd %xmm1, -14 * SIZE(BB) + + addl $ 2 * SIZE, B + addl $ 4 * SIZE, BB + decl %eax + jne .L04 + ALIGN_4 + +.L05: +#if defined(LT) || defined(RN) + movl A, %eax + movl %eax, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + subl LDC, C +#endif + + movl C, CO1 + +#ifndef RT + addl LDC, C +#endif + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L50 + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal 16 * SIZE + BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movapd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -8 * SIZE(AA), %xmm3 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#ifdef LN + prefetchnta -4 * SIZE(CO1) +#else + prefetchnta 4 * SIZE(CO1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + ADD1 %xmm1, %xmm4 + movapd -14 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + ADD2 %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm1 + movapd -12 * SIZE(AA), %xmm0 + ADD1 %xmm2, %xmm6 + ADD2 %xmm1, %xmm7 + + movapd -12 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + ADD1 %xmm1, %xmm4 + movapd -10 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + ADD2 %xmm0, %xmm5 + movapd -10 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm1 + movapd 0 * SIZE(AA), %xmm0 + ADD1 %xmm2, %xmm6 + ADD2 %xmm1, %xmm7 + + movapd -8 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm3, %xmm1 + ADD1 %xmm1, %xmm4 + movapd -6 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm3 + ADD2 %xmm3, %xmm5 + movapd -6 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm2 + mulpd %xmm3, %xmm1 + movapd -4 * SIZE(AA), %xmm3 + ADD1 %xmm2, %xmm6 + ADD2 %xmm1, %xmm7 + + movapd -4 
* SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm3, %xmm1 + ADD1 %xmm1, %xmm4 + movapd -2 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm3 + ADD2 %xmm3, %xmm5 + movapd -2 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm2 + mulpd %xmm3, %xmm1 + movapd 8 * SIZE(AA), %xmm3 + ADD1 %xmm2, %xmm6 + ADD2 %xmm1, %xmm7 + movapd 0 * SIZE(BB), %xmm1 + + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + ADD1 %xmm1, %xmm4 + movapd 2 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + ADD2 %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm1 + movapd 4 * SIZE(AA), %xmm0 + ADD1 %xmm2, %xmm6 + ADD2 %xmm1, %xmm7 + + movapd 4 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + ADD1 %xmm1, %xmm4 + movapd 6 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm0 + ADD2 %xmm0, %xmm5 + movapd 6 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm1 + movapd 16 * SIZE(AA), %xmm0 + ADD1 %xmm2, %xmm6 + ADD2 %xmm1, %xmm7 + + movapd 8 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm3, %xmm1 + ADD1 %xmm1, %xmm4 + movapd 10 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm3 + ADD2 %xmm3, %xmm5 + movapd 10 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm2 + mulpd %xmm3, %xmm1 + ADD1 %xmm2, %xmm6 + movapd 12 * SIZE(AA), %xmm3 + ADD2 %xmm1, %xmm7 + + movapd 12 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm2 + mulpd %xmm3, %xmm1 + ADD1 %xmm1, %xmm4 + movapd 14 * SIZE(BB), %xmm1 + mulpd %xmm1, %xmm3 + ADD2 %xmm3, %xmm5 + movapd 14 * SIZE(AA), %xmm3 + mulpd %xmm3, %xmm2 + mulpd %xmm3, %xmm1 + subl $-32 * SIZE, BB + movapd 24 * SIZE(AA), %xmm3 + subl $-32 * SIZE, AA + ADD1 %xmm2, %xmm6 + ADD2 %xmm1, %xmm7 + movapd -16 * SIZE(BB), %xmm1 + + decl %eax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + +.L16: + movapd %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + ADD1 %xmm1, %xmm4 + movapd -14 * SIZE(BB), %xmm1 + movapd %xmm1, %xmm3 + mulpd %xmm0, %xmm1 + movapd -14 * SIZE(AA), %xmm0 + ADD2 %xmm1, %xmm5 + movapd -12 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + ADD1 %xmm2, %xmm6 + mulpd %xmm0, %xmm3 + movapd -12 * SIZE(AA), %xmm0 + ADD2 %xmm3, %xmm7 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L14: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal 16 * SIZE + BUFFER, BB + + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 2), AA + addl %eax, B + leal (BB, %eax, 2), BB +#endif + + movapd POSINV, %xmm1 + + SHUFPD_1 %xmm5, %xmm5 + SHUFPD_1 %xmm7, %xmm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm1, %xmm5 + xorpd %xmm1, %xmm7 +#else + xorpd %xmm1, %xmm4 + xorpd %xmm1, %xmm6 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 +#else + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm5 + movapd -14 * SIZE(B), %xmm7 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#else + movapd -16 * SIZE(AA), %xmm5 + movapd -14 * SIZE(AA), %xmm7 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm1, %xmm1 +#endif + +#ifdef LN + movddup -10 * SIZE(AA), %xmm2 + movddup -9 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm6, %xmm7 + + movddup -12 * SIZE(AA), %xmm2 + movddup 
-11 * SIZE(AA), %xmm3 + + movapd %xmm7, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm4 + mulpd %xmm3, %xmm6 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm5 + + movddup -16 * SIZE(AA), %xmm2 + movddup -15 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef LT + movddup -16 * SIZE(AA), %xmm2 + movddup -15 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 + + movddup -14 * SIZE(AA), %xmm2 + movddup -13 * SIZE(AA), %xmm3 + + movapd %xmm5, %xmm4 + pshufd $0x4e, %xmm5, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm4 + mulpd %xmm3, %xmm6 + + subpd %xmm4, %xmm7 + subpd %xmm6, %xmm7 + + movddup -10 * SIZE(AA), %xmm2 + movddup -9 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm6, %xmm7 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm2 + movddup -15 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm4 + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm4, %xmm5 + addpd %xmm6, %xmm7 +#endif + +#ifdef RT + movddup -16 * SIZE(B), %xmm2 + movddup -15 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm4 + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm4, %xmm5 + addpd %xmm6, %xmm7 +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + + movsd %xmm5, 0 * SIZE(CO1) + movhpd %xmm5, 1 * SIZE(CO1) + movsd %xmm7, 2 * SIZE(CO1) + movhpd %xmm7, 3 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm5, -16 * SIZE(B) + movapd %xmm7, -14 * SIZE(B) + + movddup %xmm5, %xmm4 + unpckhpd %xmm5, %xmm5 + movddup %xmm7, %xmm6 + unpckhpd %xmm7, %xmm7 + + movapd %xmm4, -16 * SIZE(BB) + movapd %xmm5, -14 * SIZE(BB) + movapd %xmm6, -12 * SIZE(BB) + movapd %xmm7, -10 * SIZE(BB) +#else + movapd %xmm5, -16 * SIZE(AA) + movapd %xmm7, -14 * SIZE(AA) + +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L10 + +.L50: + movl M, %ebx + testl $1, %ebx + je .L99 + +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal 16 * SIZE + BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movapd -16 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd -16 * SIZE(BB), %xmm1 + pxor %xmm5, %xmm5 + movapd -8 * SIZE(AA), %xmm2 + pxor %xmm6, %xmm6 + movapd -8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + jle .L52 + +.L51: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BB), %xmm0 + ADD1 %xmm1, %xmm4 + movapd -12 * SIZE(BB), %xmm1 + ADD2 %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm1 + mulpd -10 * SIZE(BB), 
%xmm0 + ADD1 %xmm1, %xmm6 + movapd 0 * SIZE(BB), %xmm1 + ADD2 %xmm0, %xmm7 + movapd -12 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd -6 * SIZE(BB), %xmm0 + ADD1 %xmm3, %xmm4 + movapd -4 * SIZE(BB), %xmm3 + ADD2 %xmm0, %xmm5 + movapd -10 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd -2 * SIZE(BB), %xmm0 + ADD1 %xmm3, %xmm6 + movapd 8 * SIZE(BB), %xmm3 + ADD2 %xmm0, %xmm7 + movapd 0 * SIZE(AA), %xmm0 + mulpd %xmm2, %xmm1 + mulpd 2 * SIZE(BB), %xmm2 + ADD1 %xmm1, %xmm4 + movapd 4 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + movapd -6 * SIZE(AA), %xmm2 + mulpd %xmm2, %xmm1 + mulpd 6 * SIZE(BB), %xmm2 + ADD1 %xmm1, %xmm6 + movapd 16 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm7 + movapd -4 * SIZE(AA), %xmm2 + mulpd %xmm2, %xmm3 + mulpd 10 * SIZE(BB), %xmm2 + ADD1 %xmm3, %xmm4 + movapd 12 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm5 + movapd -2 * SIZE(AA), %xmm2 + mulpd %xmm2, %xmm3 + mulpd 14 * SIZE(BB), %xmm2 + ADD1 %xmm3, %xmm6 + movapd 24 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + movapd 8 * SIZE(AA), %xmm2 + + subl $-16 * SIZE, AA + addl $ 32 * SIZE, BB + decl %eax # l-- + jg .L51 + ALIGN_2 + +.L52: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # l = (k & 3) + jle .L54 + ALIGN_2 + +.L53: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BB), %xmm0 + ADD1 %xmm1, %xmm4 + movapd -12 * SIZE(BB), %xmm1 + ADD2 %xmm0, %xmm5 + movapd -14 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax # l-- + jg .L53 + +.L54: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal 16 * SIZE + BUFFER, BB + + sall $ZBASE_SHIFT, %eax + addl %eax, AA + addl %eax, B + leal (BB, %eax, 2), BB +#endif + + movapd POSINV, %xmm1 + + SHUFPD_1 %xmm5, %xmm5 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm1, %xmm5 +#else + xorpd %xmm1, %xmm4 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm5, %xmm4 +#else + addpd %xmm5, %xmm4 +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm5 + + subpd %xmm4, %xmm5 +#else + movapd -16 * SIZE(AA), %xmm5 + + subpd %xmm4, %xmm5 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm1, %xmm1 +#endif + +#ifdef LN + movddup -16 * SIZE(AA), %xmm2 + movddup -15 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef LT + movddup -16 * SIZE(AA), %xmm2 + movddup -15 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm2 + movddup -15 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef RT + movddup -16 * SIZE(B), %xmm2 + movddup -15 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + movsd %xmm5, 0 * SIZE(CO1) + movhpd %xmm5, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm5, -16 * SIZE(B) + + movddup %xmm5, %xmm4 + unpckhpd %xmm5, %xmm5 + + movapd %xmm4, -16 * SIZE(BB) + movapd %xmm5, -14 * SIZE(BB) +#else + movapd %xmm5, -16 * SIZE(AA) 
+#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L99: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + + decl J # j -- + jg .L01 + +.L999: + movl OLD_STACK, %esp + + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/ztrsm_kernel_LT_2x1_sse2.S b/kernel/x86/ztrsm_kernel_LT_2x1_sse2.S new file mode 100644 index 0000000000..77f30264dc --- /dev/null +++ b/kernel/x86/ztrsm_kernel_LT_2x1_sse2.S @@ -0,0 +1,1164 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define PREFETCHSIZE (8 * 4) + +#if !defined(HAVE_SSE2) || !defined(HAVE_MMX) +#error You have to check your configuration. 
+#endif + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_ALPHA_R 16 + STACK + ARGS(%esi) +#define STACK_ALPHA_I 24 + STACK + ARGS(%esi) +#define STACK_A 32 + STACK + ARGS(%esi) +#define STACK_B 36 + STACK + ARGS(%esi) +#define STACK_C 40 + STACK + ARGS(%esi) +#define STACK_LDC 44 + STACK + ARGS(%esi) +#define STACK_OFFT 48 + STACK + ARGS(%esi) + +#define POSINV 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define AORIG 56(%esp) +#define BORIG 60(%esp) +#define BUFFER 128(%esp) + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#define B %edi +#define LDC %ebp +#define AA %edx +#define BB %ecx +#define CO1 %esi + +#define KERNEL1(address) \ + movq (PREFETCHSIZE + 0) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 0 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 2 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 2 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 4 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulpd %xmm0, %xmm2; \ + mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 4 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 6 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 6 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 16 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + movq (PREFETCHSIZE + 8) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm1, %xmm3; \ + mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 8 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 10 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 10 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 12 * SIZE + (address) * SIZE(AA), %xmm1 + +#define KERNEL4(address) \ + mulpd %xmm1, %xmm3; \ + mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 12 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 14 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 14 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 24 * SIZE + (address) * SIZE(AA), %xmm1 + +#define KERNEL5(address) \ + movq (PREFETCHSIZE + 16) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 16 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 18 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 18 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 20 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL6(address) \ + 
mulpd %xmm0, %xmm2; \ + mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm4; \ + movapd 20 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm5; \ + movapd 22 * SIZE + (address) * SIZE(AA), %xmm0; \ + mulpd %xmm0, %xmm2; \ + mulpd 22 * SIZE + (address) * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 32 * SIZE + (address) * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 32 * SIZE + (address) * SIZE(AA), %xmm0 + +#define KERNEL7(address) \ + movq (PREFETCHSIZE + 24) * SIZE + (address) * SIZE(AA), %mm2; \ + mulpd %xmm1, %xmm3; \ + mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 24 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 26 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 26 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 28 * SIZE + (address) * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulpd %xmm1, %xmm3; \ + mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm4; \ + movapd 28 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm5; \ + movapd 30 * SIZE + (address) * SIZE(AA), %xmm1; \ + mulpd %xmm1, %xmm3; \ + mulpd 30 * SIZE + (address) * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 40 * SIZE + (address) * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 40 * SIZE + (address) * SIZE(AA), %xmm1 + +#ifndef CONJ +#define NN +#else +#if defined(LN) || defined(LT) +#define CN +#else +#define NC +#endif +#endif + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + EMMS + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp # align stack + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movd STACK_M, %mm0 + movl STACK_N, %eax + movd STACK_K, %mm1 + movd STACK_A, %mm2 + movl STACK_B, B + movd STACK_C, %mm3 + movl STACK_LDC, LDC + movd STACK_OFFT, %mm4 + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 # Generate mask + pxor %xmm2, %xmm2 + + movsd %xmm2, 0 + POSINV + movsd %xmm7, 8 + POSINV + + movd %mm1, K + movl %eax, N + movd %mm0, M + movd %mm2, A + movd %mm3, C + movl %esi, OLD_STACK + movd %mm4, OFFSET + movd %mm4, KK + + sall $ZBASE_SHIFT, LDC + +#ifdef LN + movl M, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + sall $ZBASE_SHIFT, %eax + imull K, %eax + addl %eax, B + + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + movl %eax, J # j = n + testl %eax, %eax + jle .L999 + ALIGN_2 + +.L01: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, BB + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $ZBASE_SHIFT, %eax + addl %eax, B + leal (BB, %eax, 2), BB +#endif + +#if defined(LT) + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + jle .L03 + ALIGN_2 + +.L02: + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm2 + movsd 3 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm4 + movsd 5 * SIZE(B), %xmm5 + movsd 6 * SIZE(B), %xmm6 + movsd 7 * SIZE(B), %xmm7 + + unpcklpd %xmm0, %xmm0 + unpcklpd %xmm1, %xmm1 + 
unpcklpd %xmm2, %xmm2 + unpcklpd %xmm3, %xmm3 + unpcklpd %xmm4, %xmm4 + unpcklpd %xmm5, %xmm5 + unpcklpd %xmm6, %xmm6 + unpcklpd %xmm7, %xmm7 + + movapd %xmm0, 0 * SIZE(BB) + movapd %xmm1, 2 * SIZE(BB) + movapd %xmm2, 4 * SIZE(BB) + movapd %xmm3, 6 * SIZE(BB) + movapd %xmm4, 8 * SIZE(BB) + movapd %xmm5, 10 * SIZE(BB) + movapd %xmm6, 12 * SIZE(BB) + movapd %xmm7, 14 * SIZE(BB) + + prefetcht0 104 * SIZE(B) + + addl $ 8 * SIZE, B + addl $16 * SIZE, BB + decl %eax + jne .L02 + ALIGN_2 + +.L03: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax + BRANCH + jle .L05 + ALIGN_2 + +.L04: + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + + unpcklpd %xmm0, %xmm0 + unpcklpd %xmm1, %xmm1 + + movapd %xmm0, 0 * SIZE(BB) + movapd %xmm1, 2 * SIZE(BB) + + addl $ 2 * SIZE, B + addl $ 4 * SIZE, BB + decl %eax + jne .L04 + ALIGN_4 + +.L05: +#if defined(LT) || defined(RN) + movl A, %eax + movl %eax, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + subl LDC, C +#endif + + movl C, CO1 + +#ifndef RT + addl LDC, C +#endif + + movl M, %ebx + sarl $1, %ebx # i = (m >> 2) + jle .L50 + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm7, %xmm7 + +#ifdef LN + prefetchnta -4 * SIZE(CO1) +#else + prefetchnta 4 * SIZE(CO1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $-8, %eax + NOBRANCH + je .L12 + sall $3, %eax + +.L1X: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + cmpl $64 * 1, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 1) + KERNEL2(32 * 1) + KERNEL3(32 * 1) + KERNEL4(32 * 1) + KERNEL5(32 * 1) + KERNEL6(32 * 1) + KERNEL7(32 * 1) + KERNEL8(32 * 1) + cmpl $64 * 2, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 2) + KERNEL2(32 * 2) + KERNEL3(32 * 2) + KERNEL4(32 * 2) + KERNEL5(32 * 2) + KERNEL6(32 * 2) + KERNEL7(32 * 2) + KERNEL8(32 * 2) + cmpl $64 * 3, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 3) + KERNEL2(32 * 3) + KERNEL3(32 * 3) + KERNEL4(32 * 3) + KERNEL5(32 * 3) + KERNEL6(32 * 3) + KERNEL7(32 * 3) + KERNEL8(32 * 3) + cmpl $64 * 4, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 4) + KERNEL2(32 * 4) + KERNEL3(32 * 4) + KERNEL4(32 * 4) + KERNEL5(32 * 4) + KERNEL6(32 * 4) + KERNEL7(32 * 4) + KERNEL8(32 * 4) + cmpl $64 * 5, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 5) + KERNEL2(32 * 5) + KERNEL3(32 * 5) + KERNEL4(32 * 5) + KERNEL5(32 * 5) + KERNEL6(32 * 5) + KERNEL7(32 * 5) + KERNEL8(32 * 5) + cmpl $64 * 6, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 6) + KERNEL2(32 * 6) + KERNEL3(32 * 6) + KERNEL4(32 * 6) + KERNEL5(32 * 6) + KERNEL6(32 * 6) + KERNEL7(32 * 6) + KERNEL8(32 * 6) + cmpl $64 * 7, %eax + NOBRANCH + jle .L11 + KERNEL1(32 * 7) + KERNEL2(32 * 7) + KERNEL3(32 * 7) + KERNEL4(32 * 7) + KERNEL5(32 * 7) + KERNEL6(32 * 7) + KERNEL7(32 * 7) + KERNEL8(32 * 7) + + addl $64 * 4 * SIZE, AA + addl $64 * 4 * SIZE, BB + subl $64 * 8, %eax + BRANCH + jg .L1X + +.L11: + leal (BB, %eax, 4), BB + leal (AA, 
%eax, 4), AA + +.L12: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + +.L13: + movapd 2 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movapd 0 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm1 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm5 + movapd 2 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movapd 4 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm1 + movapd 4 * SIZE(AA), %xmm0 + addpd %xmm1, %xmm7 + + addl $4 * SIZE, AA # aoffset += 8 + addl $4 * SIZE, BB # boffset1 += 8 + subl $1, %eax + jg .L13 + +.L14: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 2), AA + addl %eax, B + leal (BB, %eax, 2), BB +#endif + + movapd POSINV, %xmm1 + + SHUFPD_1 %xmm5, %xmm5 + SHUFPD_1 %xmm7, %xmm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm1, %xmm5 + xorpd %xmm1, %xmm7 +#else + xorpd %xmm1, %xmm4 + xorpd %xmm1, %xmm6 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 +#else + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm5 + movapd 2 * SIZE(B), %xmm7 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#else + movapd 0 * SIZE(AA), %xmm5 + movapd 2 * SIZE(AA), %xmm7 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm1, %xmm1 +#endif + + +#ifdef LN + movsd 6 * SIZE(AA), %xmm2 + movhpd 6 * SIZE(AA), %xmm2 + movsd 7 * SIZE(AA), %xmm3 + movhpd 7 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm6, %xmm7 + + movsd 4 * SIZE(AA), %xmm2 + movhpd 4 * SIZE(AA), %xmm2 + movsd 5 * SIZE(AA), %xmm3 + movhpd 5 * SIZE(AA), %xmm3 + + movapd %xmm7, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm4 + mulpd %xmm3, %xmm6 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm5 + + movsd 0 * SIZE(AA), %xmm2 + movhpd 0 * SIZE(AA), %xmm2 + movsd 1 * SIZE(AA), %xmm3 + movhpd 1 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef LT + movsd 0 * SIZE(AA), %xmm2 + movhpd 0 * SIZE(AA), %xmm2 + movsd 1 * SIZE(AA), %xmm3 + movhpd 1 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 + + movsd 2 * SIZE(AA), %xmm2 + movhpd 2 * SIZE(AA), %xmm2 + movsd 3 * SIZE(AA), %xmm3 + movhpd 3 * SIZE(AA), %xmm3 + + movapd %xmm5, %xmm4 + pshufd $0x4e, %xmm5, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm4 + mulpd %xmm3, %xmm6 + + subpd %xmm4, %xmm7 + subpd %xmm6, %xmm7 + + movsd 6 * SIZE(AA), %xmm2 + movhpd 6 * SIZE(AA), %xmm2 + movsd 7 * SIZE(AA), %xmm3 + movhpd 7 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm6, %xmm7 +#endif + +#ifdef RN + movsd 0 * SIZE(B), %xmm2 + movhpd 0 * SIZE(B), %xmm2 + movsd 1 * SIZE(B), %xmm3 + movhpd 1 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm4 + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + mulpd %xmm2, %xmm7 + 
mulpd %xmm3, %xmm6 + + addpd %xmm4, %xmm5 + addpd %xmm6, %xmm7 +#endif + +#ifdef RT + movsd 0 * SIZE(B), %xmm2 + movhpd 0 * SIZE(B), %xmm2 + movsd 1 * SIZE(B), %xmm3 + movhpd 1 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm4 + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm4, %xmm5 + addpd %xmm6, %xmm7 +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + + movsd %xmm5, 0 * SIZE(CO1) + movhpd %xmm5, 1 * SIZE(CO1) + movsd %xmm7, 2 * SIZE(CO1) + movhpd %xmm7, 3 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm5, 0 * SIZE(B) + movapd %xmm7, 2 * SIZE(B) + + movsd %xmm5, 0 * SIZE(BB) + movsd %xmm5, 1 * SIZE(BB) + movhpd %xmm5, 2 * SIZE(BB) + movhpd %xmm5, 3 * SIZE(BB) + movsd %xmm7, 4 * SIZE(BB) + movsd %xmm7, 5 * SIZE(BB) + movhpd %xmm7, 6 * SIZE(BB) + movhpd %xmm7, 7 * SIZE(BB) +#else + movapd %xmm5, 0 * SIZE(AA) + movapd %xmm7, 2 * SIZE(AA) + +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L10 + +.L50: + movl M, %ebx + testl $1, %ebx + je .L99 + +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, %ecx + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movapd 0 * SIZE(BB), %xmm1 + pxor %xmm4, %xmm4 + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm5, %xmm5 + movapd 8 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax # l = (k >> 2) + jle .L52 + +.L51: + mulpd %xmm0, %xmm1 + movapd 2 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm4 + movapd 16 * SIZE(BB), %xmm1 + + mulpd %xmm0, %xmm3 + movapd 2 * SIZE(AA), %xmm0 + addpd %xmm3, %xmm5 + movapd 4 * SIZE(BB), %xmm3 + + mulpd %xmm0, %xmm3 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + + addpd %xmm0, %xmm5 + movapd 4 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 10 * SIZE(BB), %xmm0 + + addpd %xmm2, %xmm4 + addpd %xmm0, %xmm5 + movapd 6 * SIZE(AA), %xmm0 + + movapd 12 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movapd 24 * SIZE(BB), %xmm2 + + mulpd 14 * SIZE(BB), %xmm0 + addpd %xmm0, %xmm5 + movapd 8 * SIZE(AA), %xmm0 + + addl $ 8 * SIZE, AA # aoffset += 2 + addl $16 * SIZE, BB # boffset1 += 4 + + decl %eax # l-- + jg .L51 + ALIGN_2 + +.L52: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax # l = (k & 3) + jle .L54 + ALIGN_2 + +.L53: + movapd 0 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm4 + movapd 2 * SIZE(BB), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA # aoffset += 2 + addl $4 * SIZE, BB # boffset1 += 4 + decl %eax # l-- + jg .L53 + +.L54: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ZBASE_SHIFT, %eax + addl %eax, AA + addl %eax, B + leal (BB, 
%eax, 2), BB +#endif + + movapd POSINV, %xmm1 + + SHUFPD_1 %xmm5, %xmm5 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm1, %xmm5 +#else + xorpd %xmm1, %xmm4 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm5, %xmm4 +#else + addpd %xmm5, %xmm4 +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm5 + + subpd %xmm4, %xmm5 +#else + movapd 0 * SIZE(AA), %xmm5 + + subpd %xmm4, %xmm5 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm1, %xmm1 +#endif + +#ifdef LN + movsd 0 * SIZE(AA), %xmm2 + movhpd 0 * SIZE(AA), %xmm2 + movsd 1 * SIZE(AA), %xmm3 + movhpd 1 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef LT + movsd 0 * SIZE(AA), %xmm2 + movhpd 0 * SIZE(AA), %xmm2 + movsd 1 * SIZE(AA), %xmm3 + movhpd 1 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef RN + movsd 0 * SIZE(B), %xmm2 + movhpd 0 * SIZE(B), %xmm2 + movsd 1 * SIZE(B), %xmm3 + movhpd 1 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef RT + movsd 0 * SIZE(B), %xmm2 + movhpd 0 * SIZE(B), %xmm2 + movsd 1 * SIZE(B), %xmm3 + movhpd 1 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + movsd %xmm5, 0 * SIZE(CO1) + movhpd %xmm5, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm5, 0 * SIZE(B) + + movsd %xmm5, 0 * SIZE(BB) + movsd %xmm5, 1 * SIZE(BB) + movhpd %xmm5, 2 * SIZE(BB) + movhpd %xmm5, 3 * SIZE(BB) +#else + movapd %xmm5, 0 * SIZE(AA) + +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L99: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + + decl J # j -- + jg .L01 + +.L999: + movl OLD_STACK, %esp + + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S b/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S new file mode 100644 index 0000000000..3668ee2bbf --- /dev/null +++ b/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S @@ -0,0 +1,1966 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define A 24 + STACK + ARGS(%esp) +#define ARG_B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define ARG_LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) +#define AORIG 12 + STACK(%esp) + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#ifdef NEHALEM +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#ifdef ATOM +#define PREFETCH prefetcht0 +#define PREFETCHSIZE 84 +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (16 * 2) +#endif + +#define B %edi +#define LDC %ebp +#define AA %edx +#define BB %ecx +#define CO1 %esi + +#define ADD1 addps +#define ADD2 addps + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + movl OFFSET, %eax +#ifdef RN + negl %eax +#endif + movl %eax, KK + + movl M, %ebx + testl %ebx, %ebx + jle .L999 + + subl $-32 * SIZE, A + subl $-32 * SIZE, B + + sall $ZBASE_SHIFT, LDC + +#ifdef LN + movl M, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + sall $ZBASE_SHIFT, %eax + imull K, %eax + addl %eax, B + + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + movl %eax, J + sarl $1, J + jle .L100 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movl A, %eax + movl %eax, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + 
movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $1, %ebx + jle .L30 + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + +#ifdef LN + pxor %xmm4, %xmm4 + prefetcht0 -4 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 -4 * SIZE(CO1, LDC) + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 +#else + pxor %xmm4, %xmm4 + prefetcht0 3 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 3 * SIZE(CO1, LDC) + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L11: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -24 * SIZE(AA), %xmm0 + + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(AA), %xmm0 + + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -16 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) + + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -12 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -8 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -4 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + ADD2 %xmm2, %xmm7 + subl $-32 * SIZE, BB + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + subl $-32 * SIZE, AA + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, 
%xmm4 + movaps -32 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -32 * SIZE(AA), %xmm0 + + decl %eax + jne .L11 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + ALIGN_4 + +.L13: + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L13 + ALIGN_4 + +.L14: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), BB +#endif + + ADD2 %xmm2, %xmm7 + pcmpeqb %xmm0, %xmm0 + ADD1 %xmm3, %xmm6 + psllq $63, %xmm0 + +#ifndef CONJ + pxor %xmm0, %xmm4 + pxor %xmm0, %xmm6 + + shufps $0xb1, %xmm0, %xmm0 +#else +#if defined(LN) || defined(LT) + pxor %xmm0, %xmm5 + pxor %xmm0, %xmm7 +#else + pshufd $0xb1, %xmm0, %xmm1 + + pxor %xmm1, %xmm5 + pxor %xmm1, %xmm7 +#endif +#endif + + haddps %xmm5, %xmm4 + haddps %xmm7, %xmm6 + + shufps $0xd8, %xmm4, %xmm4 + shufps $0xd8, %xmm6, %xmm6 + + movaps %xmm4, %xmm5 + shufps $0xe4, %xmm6, %xmm4 + shufps $0xe4, %xmm5, %xmm6 + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm5 + unpcklpd %xmm6, %xmm4 + unpckhpd %xmm6, %xmm5 + + movaps -32 * SIZE(BB), %xmm2 + movaps -28 * SIZE(BB), %xmm3 + + subps %xmm4, %xmm2 + subps %xmm5, %xmm3 +#else + movaps -32 * SIZE(AA), %xmm1 + movaps -28 * SIZE(AA), %xmm5 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm5 +#endif + +#ifdef LN + movaps -28 * SIZE(AA), %xmm5 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm0, %xmm3 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm3 + addps %xmm4, %xmm3 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm1 + subps %xmm4, %xmm2 + subps %xmm1, %xmm2 + + movaps -32 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 +#endif + +#ifdef LT + movaps -32 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm1 + subps %xmm4, %xmm3 + subps %xmm1, %xmm3 + + movaps -28 * SIZE(AA), %xmm5 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm0, %xmm3 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm3 + addps %xmm4, %xmm3 +#endif + +#ifdef RN + movaps -32 * 
SIZE(BB), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm2 + + subps %xmm3, %xmm5 + subps %xmm2, %xmm5 + + movaps -28 * SIZE(BB), %xmm4 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + + addps %xmm3, %xmm5 +#endif + +#ifdef RT + movaps -28 * SIZE(BB), %xmm4 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + + addps %xmm3, %xmm5 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm2 + + subps %xmm3, %xmm1 + subps %xmm2, %xmm1 + + movaps -32 * SIZE(BB), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, -32 * SIZE(BB) + movaps %xmm3, -28 * SIZE(BB) + + movlps %xmm2, 0 * SIZE(CO1) + movlps %xmm3, 2 * SIZE(CO1) + movhps %xmm2, 0 * SIZE(CO1, LDC) + movhps %xmm3, 2 * SIZE(CO1, LDC) +#else + movaps %xmm1, -32 * SIZE(AA) + movaps %xmm5, -28 * SIZE(AA) + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + + movlps %xmm5, 0 * SIZE(CO1, LDC) + movhps %xmm5, 2 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx + jg .L10 + ALIGN_4 + +.L30: + movl M, %ebx + andl $1, %ebx + jle .L99 + +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L42 + ALIGN_4 + +.L41: + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, 
%xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -30 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -28 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -26 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -24 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -12 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -22 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -8 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -20 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -4 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -18 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps 0 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -16 * SIZE(AA), %xmm0 + + subl $-16 * SIZE, AA + subl $-32 * SIZE, BB + decl %eax + jne .L41 + ALIGN_4 + +.L42: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L44 + ALIGN_4 + +.L43: + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -30 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L43 + ALIGN_4 + +.L44: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), BB +#endif + + addps %xmm2, %xmm6 + addps %xmm3, %xmm7 + + pshufd $0xb1, %xmm5, %xmm5 + pcmpeqb 
%xmm0, %xmm0 + pshufd $0xb1, %xmm7, %xmm7 + psllq $63, %xmm0 + +#ifndef CONJ + shufps $0xb1, %xmm0, %xmm0 + + pxor %xmm0, %xmm5 + pxor %xmm0, %xmm7 +#else +#if defined(LN) || defined(LT) + pxor %xmm0, %xmm4 + pxor %xmm0, %xmm6 +#else + pxor %xmm0, %xmm5 + pxor %xmm0, %xmm7 +#endif +#endif + + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + +#if defined(LN) || defined(LT) + unpcklpd %xmm6, %xmm4 + + movaps -32 * SIZE(BB), %xmm2 + + subps %xmm4, %xmm2 +#else + movsd -32 * SIZE(AA), %xmm1 + movsd -30 * SIZE(AA), %xmm5 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm5 +#endif + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 +#endif + +#ifdef RN + movaps -32 * SIZE(BB), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm2 + + subps %xmm3, %xmm5 + subps %xmm2, %xmm5 + + movaps -28 * SIZE(BB), %xmm4 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + + addps %xmm3, %xmm5 +#endif + +#ifdef RT + movaps -28 * SIZE(BB), %xmm4 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + + addps %xmm3, %xmm5 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm2 + + subps %xmm3, %xmm1 + subps %xmm2, %xmm1 + + movaps -32 * SIZE(BB), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, -32 * SIZE(BB) + + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 0 * SIZE(CO1, LDC) +#else + movlps %xmm1, -32 * SIZE(AA) + movlps %xmm5, -30 * SIZE(AA) + + movlps %xmm1, 0 * SIZE(CO1) + movlps %xmm5, 0 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L99: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK 
+#endif + + decl J # j -- + jg .L01 + ALIGN_4 + +.L100: + movl N, %eax + andl $1, %eax + jle .L999 + +#if defined(LT) || defined(RN) + movl A, %eax + movl %eax, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, B +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $1, %ebx + jle .L130 + ALIGN_4 + +.L110: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movsd -32 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + movhps -30 * SIZE(BB), %xmm1 + pxor %xmm4, %xmm4 +#ifdef LN + prefetcht0 -4 * SIZE(CO1) +#else + prefetcht0 3 * SIZE(CO1) +#endif + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L112 + ALIGN_4 + +.L111: + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -28 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -24 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -20 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -16 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -12 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -8 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -4 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps 0 * SIZE(AA), %xmm0 + + subl $-32 * SIZE, AA + subl $-16 * SIZE, BB + + decl %eax + jne .L111 + ALIGN_4 + +.L112: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L114 + ALIGN_4 + +.L113: + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -28 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L113 + ALIGN_4 + +.L114: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax 
+#else + subl $1, %eax +#endif + + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), BB +#endif + + addps %xmm2, %xmm4 + addps %xmm3, %xmm5 + + pshufd $0xb1, %xmm5, %xmm5 + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#ifndef CONJ + shufps $0xb1, %xmm0, %xmm0 + + pxor %xmm0, %xmm5 +#else +#if defined(LN) || defined(LT) + pxor %xmm0, %xmm4 +#else + pxor %xmm0, %xmm5 +#endif +#endif + + addps %xmm5, %xmm4 + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm5 + unpcklpd %xmm6, %xmm4 + unpckhpd %xmm6, %xmm5 + + movsd -32 * SIZE(BB), %xmm2 + movsd -30 * SIZE(BB), %xmm3 + + subps %xmm4, %xmm2 + subps %xmm5, %xmm3 +#else + movaps -32 * SIZE(AA), %xmm1 + + subps %xmm4, %xmm1 +#endif + +#ifdef LN + movaps -28 * SIZE(AA), %xmm5 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm0, %xmm3 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm3 + addps %xmm4, %xmm3 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm1 + subps %xmm4, %xmm2 + subps %xmm1, %xmm2 + + movaps -32 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 +#endif + +#ifdef LT + movaps -32 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm1 + subps %xmm4, %xmm3 + subps %xmm1, %xmm3 + + movaps -28 * SIZE(AA), %xmm5 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm0, %xmm3 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm3 + addps %xmm4, %xmm3 +#endif + +#if defined(RN) || defined(RT) + movaps -32 * SIZE(BB), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, -32 * SIZE(BB) + movlps %xmm3, -30 * SIZE(BB) + + movlps %xmm2, 0 * SIZE(CO1) + movlps %xmm3, 2 * SIZE(CO1) +#else + movaps %xmm1, -32 * SIZE(AA) + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 1), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L110 + ALIGN_4 + +.L130: + movl M, %ebx + andl $1, %ebx 
+ jle .L149 + +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movsd -32 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L142 + ALIGN_4 + +.L141: + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -30 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -28 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -26 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -26 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -24 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -22 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -22 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -20 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -18 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -18 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -16 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -16 * SIZE(AA), %xmm0 + + subl $-16 * SIZE, AA + subl $-16 * SIZE, BB + + decl %eax + jne .L141 + ALIGN_4 + +.L142: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L144 + ALIGN_4 + +.L143: + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -30 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L143 + ALIGN_4 + +.L144: +#if defined(LN) || defined(RT) + movl KK, %eax + subl $1, %eax + + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), BB +#endif + + addps %xmm2, %xmm4 + addps %xmm3, %xmm5 + + pshufd $0xb1, %xmm5, %xmm5 + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#ifndef CONJ + shufps $0xb1, %xmm0, %xmm0 + + pxor %xmm0, %xmm5 +#else +#if defined(LN) || defined(LT) + pxor %xmm0, %xmm4 +#else + pxor %xmm0, %xmm5 +#endif +#endif + + addps %xmm5, %xmm4 + +#if defined(LN) || defined(LT) + movsd -32 * SIZE(BB), %xmm2 + + subps %xmm4, %xmm2 +#else + movsd -32 * SIZE(AA), %xmm1 + + subps %xmm4, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movaps -32 * 
SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 +#endif + +#if defined(RN) || defined(RT) + movaps -32 * SIZE(BB), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, -32 * SIZE(BB) + + movlps %xmm2, 0 * SIZE(CO1) +#else + movlps %xmm1, -32 * SIZE(AA) + + movlps %xmm1, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 1), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L149: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/ztrsm_kernel_LT_2x2_sse.S b/kernel/x86/ztrsm_kernel_LT_2x2_sse.S new file mode 100644 index 0000000000..84d40ddecb --- /dev/null +++ b/kernel/x86/ztrsm_kernel_LT_2x2_sse.S @@ -0,0 +1,2201 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_A 24 + STACK + ARGS(%esi) +#define STACK_B 28 + STACK + ARGS(%esi) +#define STACK_C 32 + STACK + ARGS(%esi) +#define STACK_LDC 36 + STACK + ARGS(%esi) +#define STACK_OFFT 40 + STACK + ARGS(%esi) + +#define POSINV 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 48(%esp) +#define KK 52(%esp) +#define KKK 56(%esp) +#define AORIG 60(%esp) +#define BORIG 64(%esp) +#define BUFFER 128(%esp) + +#define B %edi +#define LDC %ebp +#define AA %edx +#define BB %ecx +#define CO1 %esi + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#if defined(OPTERON) || defined(BARCELONA) +#define PREFETCHSIZE (16 * 10 + 8) +#define WPREFETCHSIZE 112 +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#endif + +#if defined(PENTIUM4) || defined(PENTIUMM) +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 168 +#define PREFETCHW prefetcht0 +#endif + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 168 +#define PREFETCHW prefetcht0 +#endif + +#if defined(OPTERON) || !defined(HAVE_SSE2) +#define movsd movlps +#endif + +#ifdef HAVE_SSE2 +#define xorps pxor +#endif + +#define KERNEL1(address) \ + mulps %xmm0, %xmm2; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + addps %xmm2, %xmm4; \ + movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL4(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 60 * SIZE + 
(address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL5(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL6(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL7(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1; + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp # align stack + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movl STACK_M, %ebx + movl STACK_N, %eax + movl STACK_K, %ecx + movl STACK_A, %edx + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK + + movl STACK_B, %edi + movl STACK_C, %ebx + movss STACK_OFFT, %xmm4 + + xorps %xmm7, %xmm7 + pcmpeqb %xmm7, %xmm7 + pslld $31, %xmm7 + xorps %xmm2, %xmm2 + +#ifndef CONJ + movss %xmm7, 0 + POSINV + movss %xmm2, 4 + POSINV + movss %xmm7, 8 + POSINV + movss %xmm2, 12 + POSINV +#else + movss %xmm2, 0 + POSINV + movss %xmm7, 4 + POSINV + movss %xmm2, 8 + POSINV + movss %xmm7, 12 + POSINV +#endif + + EMMS + + movl %ebx, C + movl STACK_LDC, LDC + + movss %xmm4, OFFSET + movss %xmm4, KK + + sall $ZBASE_SHIFT, LDC + +#ifdef LN + movl M, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + sall $ZBASE_SHIFT, %eax + imull K, %eax + addl %eax, B + + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + movl %eax, J + sarl $1, J + jle .L100 + ALIGN_4 + +.L01: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal 
BUFFER, %ecx + +#ifdef RT + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $1 + ZBASE_SHIFT, %eax + addl %eax, B + leal (BB, %eax, 4), BB +#endif + +#if defined(LT) + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $1, %eax + jle .L03 + ALIGN_4 + +.L02: + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm4, 16 * SIZE(BB) + movaps %xmm5, 20 * SIZE(BB) + movaps %xmm6, 24 * SIZE(BB) + movaps %xmm7, 28 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $32 * SIZE, BB + + decl %eax + jne .L02 + ALIGN_4 + +.L03: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $1, %eax + BRANCH + jle .L05 + ALIGN_4 + +.L04: + movaps 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + + addl $ 4 * SIZE, B + ALIGN_4 + +.L05: +#if defined(LT) || defined(RN) + movl A, %eax + movl %eax, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + + movl C, CO1 + +#ifndef RT + addl %eax, C +#endif + + movl M, %ebx + sarl $1, %ebx + jle .L30 + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB # boffset1 = boffset + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $3 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 + movaps 16 * SIZE(AA), %xmm1 + xorps %xmm5, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm7, %xmm7 + + PREFETCHW 3 * SIZE(CO1) + PREFETCHW 3 * SIZE(CO1, LDC) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L11: + KERNEL1(0 * 16) + KERNEL2(0 * 16) + KERNEL3(0 * 16) + KERNEL4(0 * 16) + KERNEL5(0 * 16) + KERNEL6(0 * 16) + KERNEL7(0 * 16) + KERNEL8(0 * 16) + + addl $ 32 * SIZE, AA + addl $128 * SIZE, BB + decl %eax + jne .L11 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + ALIGN_4 + +.L13: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 4 * SIZE(AA), %xmm0 + + addl $ 4 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L13 + ALIGN_4 + +.L14: + movaps POSINV, %xmm0 + + shufps $0xb1, %xmm5, %xmm5 + shufps $0xb1, %xmm7, %xmm7 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm0, %xmm5 + xorps %xmm0, %xmm7 +#else + xorps %xmm0, 
%xmm4 + xorps %xmm0, %xmm6 +#endif +#else + xorps %xmm0, %xmm5 + xorps %xmm0, %xmm7 +#endif + + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm5 + unpcklpd %xmm6, %xmm4 + unpckhpd %xmm6, %xmm5 + + movaps 0 * SIZE(B), %xmm2 + movaps 4 * SIZE(B), %xmm3 + + subps %xmm4, %xmm2 + subps %xmm5, %xmm3 +#else + movaps 0 * SIZE(AA), %xmm1 + movaps 4 * SIZE(AA), %xmm5 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm5 +#endif + +#ifdef LN + movaps 4 * SIZE(AA), %xmm5 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm0, %xmm3 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm3 + addps %xmm4, %xmm3 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm1 + subps %xmm4, %xmm2 + subps %xmm1, %xmm2 + + movaps 0 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 +#endif + +#ifdef LT + movaps 0 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm1 + subps %xmm4, %xmm3 + subps %xmm1, %xmm3 + + movaps 4 * SIZE(AA), %xmm5 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm0, %xmm3 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm3 + addps %xmm4, %xmm3 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm2 + + subps %xmm3, %xmm5 + subps %xmm2, %xmm5 + + movaps 4 * SIZE(B), %xmm4 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + + addps %xmm3, %xmm5 +#endif + +#ifdef RT + movaps 4 * SIZE(B), %xmm4 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm0, 
%xmm5 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + + addps %xmm3, %xmm5 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm2 + + subps %xmm3, %xmm1 + subps %xmm2, %xmm1 + + movaps 0 * SIZE(B), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, 0 * SIZE(B) + movaps %xmm3, 4 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm5 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm5, 12 * SIZE(BB) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm4 + pshufd $0xff, %xmm3, %xmm5 + + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm1, 20 * SIZE(BB) + movaps %xmm4, 24 * SIZE(BB) + movaps %xmm5, 28 * SIZE(BB) + + movlps %xmm2, 0 * SIZE(CO1) + movlps %xmm3, 2 * SIZE(CO1) + movhps %xmm2, 0 * SIZE(CO1, LDC) + movhps %xmm3, 2 * SIZE(CO1, LDC) +#else + movaps %xmm1, 0 * SIZE(AA) + movaps %xmm5, 4 * SIZE(AA) + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + + movlps %xmm5, 0 * SIZE(CO1, LDC) + movhps %xmm5, 2 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $8 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx + jg .L10 + ALIGN_4 + +.L30: + movl M, %ebx + andl $1, %ebx + jle .L99 + ALIGN_4 + +.L40: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB # boffset1 = boffset + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $3 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 8 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L42 + ALIGN_4 + +.L41: + mulps %xmm0, %xmm2 + prefetcht1 (PREFETCHSIZE + 0) * SIZE(AA) + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movsd 2 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + mulps 28 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 48 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movsd 4 * SIZE(AA), %xmm0 + 
mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + mulps 44 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movsd 6 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + mulps 60 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 80 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movsd 16 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) +#endif + addps %xmm2, %xmm4 + movaps 68 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm5 + movaps 72 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + mulps 76 * SIZE(BB), %xmm1 + addps %xmm2, %xmm6 + movaps 96 * SIZE(BB), %xmm2 + addps %xmm1, %xmm7 + movsd 10 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 84 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movaps 88 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + mulps 92 * SIZE(BB), %xmm1 + addps %xmm3, %xmm6 + movaps 112 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movsd 12 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 100 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm5 + movaps 104 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + mulps 108 * SIZE(BB), %xmm1 + addps %xmm2, %xmm6 + movaps 128 * SIZE(BB), %xmm2 + addps %xmm1, %xmm7 + movsd 14 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 116 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movaps 120 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + mulps 124 * SIZE(BB), %xmm1 + addps %xmm3, %xmm6 + movaps 144 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movsd 24 * SIZE(AA), %xmm1 + addl $ 16 * SIZE, AA + addl $128 * SIZE, BB + decl %eax + jne .L41 + ALIGN_4 + +.L42: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L44 + ALIGN_4 + +.L43: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movsd 2 * SIZE(AA), %xmm0 + + addl $ 2 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L43 + ALIGN_4 + +.L44: + movaps POSINV, %xmm0 + + shufps $0xb1, %xmm5, %xmm5 + shufps $0xb1, %xmm7, %xmm7 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm0, %xmm5 + xorps %xmm0, %xmm7 +#else + xorps %xmm0, %xmm4 + xorps %xmm0, %xmm6 +#endif +#else + xorps %xmm0, %xmm5 + xorps %xmm0, %xmm7 +#endif + + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + unpcklpd %xmm6, %xmm4 + + movaps 0 * SIZE(B), %xmm2 + + subps %xmm4, %xmm2 +#else +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(AA), %xmm1 +#ifdef movsd + xorps %xmm5, %xmm5 +#endif + movsd 2 * SIZE(AA), %xmm5 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm5 +#endif + +#if defined(LN) || defined(LT) + movaps 0 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, 
%xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm2 + + subps %xmm3, %xmm5 + subps %xmm2, %xmm5 + + movaps 4 * SIZE(B), %xmm4 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + + addps %xmm3, %xmm5 +#endif + +#ifdef RT + movaps 4 * SIZE(B), %xmm4 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + + addps %xmm3, %xmm5 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm2 + + subps %xmm3, %xmm1 + subps %xmm2, %xmm1 + + movaps 0 * SIZE(B), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, 0 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm5 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm5, 12 * SIZE(BB) + + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 0 * SIZE(CO1, LDC) +#else + movlps %xmm1, 0 * SIZE(AA) + movlps %xmm5, 2 * SIZE(AA) + + movlps %xmm1, 0 * SIZE(CO1) + movlps %xmm5, 0 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L99: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_4 + +.L100: + movl N, %eax + andl $1, %eax + jle .L999 + ALIGN_4 + +.L101: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, %ecx + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $ZBASE_SHIFT, %eax + addl %eax, B + 
leal (BB, %eax, 4), BB +#endif + +#if defined(LT) + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + jle .L103 + ALIGN_4 + +.L102: + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm4, 16 * SIZE(BB) + movaps %xmm5, 20 * SIZE(BB) + movaps %xmm6, 24 * SIZE(BB) + movaps %xmm7, 28 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $32 * SIZE, BB + decl %eax + jne .L102 + ALIGN_4 + +.L103: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax + BRANCH + jle .L105 + ALIGN_4 + +.L104: +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + + addl $ 2 * SIZE, %edi + addl $ 8 * SIZE, %ecx + decl %eax + jne .L104 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movl A, %eax + movl %eax, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + subl LDC, C +#endif + + movl C, CO1 + +#ifndef RT + addl LDC, C +#endif + + movl M, %ebx + sarl $1, %ebx + jle .L130 + ALIGN_4 + +.L110: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB # boffset1 = boffset + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movaps 0 * SIZE(AA), %xmm0 + movaps 16 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + + PREFETCHW 3 * SIZE(CO1) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L112 + ALIGN_4 + +.L111: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 8 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movaps 12 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movaps 32 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movaps 20 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movaps 44 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movaps 24 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 64 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 28 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, 
%xmm6 + movaps 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 48 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 80 * SIZE(BB), %xmm3 + + addl $ 32 * SIZE, AA + addl $ 64 * SIZE, BB + decl %eax + jne .L111 + ALIGN_4 + +.L112: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L114 + ALIGN_4 + +.L113: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + + addl $ 4 * SIZE, AA + addl $ 8 * SIZE, BB + decl %eax + jg .L113 + ALIGN_4 + +.L114: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + movaps POSINV, %xmm0 + + shufps $0xb1, %xmm5, %xmm5 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm4 +#endif +#else + xorps %xmm0, %xmm5 +#endif + + addps %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm5 + unpcklpd %xmm6, %xmm4 + unpckhpd %xmm6, %xmm5 + +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd 0 * SIZE(B), %xmm2 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd 2 * SIZE(B), %xmm3 + + subps %xmm4, %xmm2 + subps %xmm5, %xmm3 +#else + movaps 0 * SIZE(AA), %xmm1 + + subps %xmm4, %xmm1 +#endif + +#ifdef LN + movaps 4 * SIZE(AA), %xmm5 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm0, %xmm3 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm3 + addps %xmm4, %xmm3 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm1 + subps %xmm4, %xmm2 + subps %xmm1, %xmm2 + + movaps 0 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 +#endif + +#ifdef LT + movaps 0 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm1 + subps %xmm4, %xmm3 + subps %xmm1, %xmm3 + + movaps 4 * SIZE(AA), %xmm5 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm0, %xmm3 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm3 + addps %xmm4, %xmm3 +#endif + +#if defined(RN) || defined(RT) + movaps 0 * SIZE(B), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + 
mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(B) + movlps %xmm3, 2 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + + movaps %xmm0, 8 * SIZE(BB) + movaps %xmm1, 12 * SIZE(BB) + + movlps %xmm2, 0 * SIZE(CO1) + movlps %xmm3, 2 * SIZE(CO1) +#else + movaps %xmm1, 0 * SIZE(AA) + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L110 + ALIGN_4 + +.L130: + movl M, %ebx + andl $1, %ebx + jle .L149 + +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB # boffset1 = boffset + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 8 * SIZE(AA), %xmm1 + xorps %xmm5, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L142 + ALIGN_4 + +.L141: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movaps 44 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 64 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 80 * SIZE(BB), %xmm3 + + addl $ 16 * SIZE, AA + addl $ 64 * SIZE, BB + decl %eax + jne .L141 + ALIGN_4 + +.L142: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L144 + ALIGN_4 + +.L143: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + 
addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movsd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L143 + ALIGN_4 + +.L144: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + movaps POSINV, %xmm0 + + shufps $0xb1, %xmm5, %xmm5 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm4 +#endif +#else + xorps %xmm0, %xmm5 +#endif + + addps %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax + subl $1, %eax + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ZBASE_SHIFT, %eax + addl %eax, AA + addl %eax, B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd 0 * SIZE(B), %xmm2 + + subps %xmm4, %xmm2 +#else +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(AA), %xmm1 + + subps %xmm4, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movaps 0 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 +#endif + +#if defined(RN) || defined(RT) + movaps 0 * SIZE(B), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + + movlps %xmm2, 0 * SIZE(CO1) +#else + movlps %xmm1, 0 * SIZE(AA) + + movlps %xmm1, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L149: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L999: + EMMS + + movl OLD_STACK, %esp + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/ztrsm_kernel_LT_4x1_sse.S b/kernel/x86/ztrsm_kernel_LT_4x1_sse.S new file mode 100644 index 0000000000..4f324bced6 --- /dev/null +++ b/kernel/x86/ztrsm_kernel_LT_4x1_sse.S @@ -0,0 +1,1898 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if !defined(HAVE_SSE) || !defined(HAVE_MMX) +#error You have to check your configuration. +#endif + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_A 24 + STACK + ARGS(%esi) +#define STACK_B 28 + STACK + ARGS(%esi) +#define STACK_C 32 + STACK + ARGS(%esi) +#define STACK_LDC 36 + STACK + ARGS(%esi) +#define STACK_OFFT 40 + STACK + ARGS(%esi) + +#define POSINV 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 48(%esp) +#define KK 52(%esp) +#define KKK 56(%esp) +#define AORIG 60(%esp) +#define BORIG 64(%esp) +#define BUFFER 128(%esp) + +#define B %edi +#define LDC %ebp +#define AA %edx +#define BB %ecx +#define CO1 %esi + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#if !defined(HAVE_SSE2) || defined(OPTERON) +#define movsd movlps +#endif + +#ifdef HAVE_SSE2 +#define xorps pxor +#endif + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp # align stack + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movl STACK_M, %ebx + movl STACK_N, %eax + movl STACK_K, %ecx + movl STACK_A, %edx + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK + + movl STACK_B, %edi + movl STACK_C, %ebx + movss STACK_OFFT, %xmm4 + +#ifndef CONJ + movl $0x80000000, 0 + POSINV + movl $0x00000000, 4 + POSINV + movl $0x80000000, 8 + POSINV + movl $0x00000000, 12 + POSINV +#else + movl $0x00000000, 0 + POSINV + movl $0x80000000, 4 + POSINV + movl $0x00000000, 8 + POSINV + movl $0x80000000, 12 + POSINV +#endif + + movl %ebx, C + movl STACK_LDC, LDC + + movss %xmm4, OFFSET + movss %xmm4, KK + + sall $ZBASE_SHIFT, LDC + +#ifdef LN + movl M, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax 
+ sall $ZBASE_SHIFT, %eax + imull K, %eax + addl %eax, B + + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + movl %eax, J # j = n + testl %eax, %eax + jle .L999 + +.L01: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, BB + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $ZBASE_SHIFT, %eax + addl %eax, B + leal (BB, %eax, 4), BB +#endif + +#if defined(LT) + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + jle .L03 + +.L02: + movss 0 * SIZE(B), %xmm0 + movss 1 * SIZE(B), %xmm1 + movss 2 * SIZE(B), %xmm2 + movss 3 * SIZE(B), %xmm3 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + + movss 4 * SIZE(B), %xmm0 + movss 5 * SIZE(B), %xmm1 + movss 6 * SIZE(B), %xmm2 + movss 7 * SIZE(B), %xmm3 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm1, 20 * SIZE(BB) + movaps %xmm2, 24 * SIZE(BB) + movaps %xmm3, 28 * SIZE(BB) + + prefetcht0 104 * SIZE(B) + + addl $ 8 * SIZE, B + addl $32 * SIZE, BB + decl %eax + jne .L02 + +.L03: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax + BRANCH + jle .L05 + +.L04: + movss 0 * SIZE(B), %xmm0 + movss 1 * SIZE(B), %xmm1 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + + addl $2 * SIZE, B + addl $8 * SIZE, BB + decl %eax + jne .L04 + ALIGN_4 + +.L05: +#if defined(LT) || defined(RN) + movl A, %eax + movl %eax, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + + movl M, %ebx + sarl $2, %ebx + jle .L50 + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $2 + ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $2 + ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 8 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + prefetcht0 8 * SIZE(CO1) + je .L12 + ALIGN_4 + +#define PREFETCHSIZE 48 + +.L11: +#ifdef CORE_KATMAI + prefetcht0 PREFETCHSIZE * SIZE(AA) +#endif + + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 0 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 16 * SIZE(AA), %xmm0 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) +#endif + + mulps %xmm1, %xmm3 + mulps 12 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 8 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 12 * 
SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 12 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 24 * SIZE(AA), %xmm1 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) +#endif + + mulps %xmm0, %xmm2 + mulps 20 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 16 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 20 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 20 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 24) * SIZE(AA) +#endif + + mulps %xmm1, %xmm3 + mulps 28 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 24 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 28 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 28 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 40 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 40 * SIZE(AA), %xmm1 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 32) * SIZE(AA) +#endif + + mulps %xmm0, %xmm2 + mulps 36 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 32 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 36 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 36 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 48 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 48 * SIZE(AA), %xmm0 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 40) * SIZE(AA) +#endif + + mulps %xmm1, %xmm3 + mulps 44 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 40 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 44 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 44 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 56 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 56 * SIZE(AA), %xmm1 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 48) * SIZE(AA) +#endif + + mulps %xmm0, %xmm2 + mulps 52 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 48 * SIZE(BB), %xmm2 + + addps %xmm0, %xmm5 + movaps 52 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 52 * SIZE(BB), %xmm0 + + addps %xmm2, %xmm6 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 64 * SIZE(AA), %xmm0 + +#ifdef CORE_KATMAI + prefetcht0 (PREFETCHSIZE + 56) * SIZE(AA) +#endif + + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + addps %xmm3, %xmm4 + movaps 56 * SIZE(BB), %xmm3 + + addps %xmm1, %xmm5 + movaps 60 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + + addps %xmm3, %xmm6 + movaps 72 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 72 * SIZE(AA), %xmm1 + + addl $64 * SIZE, BB + addl $64 * SIZE, AA + decl %eax + jne .L11 + +.L12: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + +.L13: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 0 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 8 * SIZE(AA), %xmm0 + + addl $8 * SIZE, AA # aoffset += 8 + addl $8 * SIZE, BB # boffset1 += 8 + + decl %eax + jg .L13 + +.L14: + movaps POSINV, %xmm0 + + shufps $0xb1, %xmm5, %xmm5 + shufps $0xb1, %xmm7, %xmm7 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm0, %xmm5 + xorps %xmm0, %xmm7 +#else + xorps %xmm0, %xmm4 + xorps %xmm0, %xmm6 +#endif +#else + xorps %xmm0, %xmm5 + xorps %xmm0, %xmm7 +#endif + + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $4, %eax +#else + subl $1, %eax +#endif + + 
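+	/* LN/RT path: reposition AA, B and the expanded buffer BB at the current */
+	/* diagonal block, using the offset left in %eax (KK-4 for LN, KK-1 for RT) */
+	/* scaled by the complex element size. */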
movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 4), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(B), %xmm5 + movhps 2 * SIZE(B), %xmm5 + movsd 4 * SIZE(B), %xmm7 + movhps 6 * SIZE(B), %xmm7 +#else + movaps 0 * SIZE(AA), %xmm5 + movaps 4 * SIZE(AA), %xmm7 +#endif + + subps %xmm4, %xmm5 + subps %xmm6, %xmm7 + +#if defined(LN) || defined(LT) + movhlps %xmm5, %xmm4 + movhlps %xmm7, %xmm6 +#endif + +#ifdef LN +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 30 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm6, %xmm3 + shufps $0xa0, %xmm3, %xmm3 + shufps $0xf5, %xmm6, %xmm6 + +#ifndef CONJ + xorps POSINV, %xmm6 +#else + xorps POSINV, %xmm3 +#endif + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm6 + + addps %xmm3, %xmm6 + + movsd 28 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm6, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm6, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm7 + subps %xmm3, %xmm7 + + movsd 26 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm6, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm6, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm4 + subps %xmm3, %xmm4 + + movsd 24 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm6, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm6, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm5 + subps %xmm3, %xmm5 + + movsd 20 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm7, %xmm3 + shufps $0xa0, %xmm3, %xmm3 + shufps $0xf5, %xmm7, %xmm7 + +#ifndef CONJ + xorps POSINV, %xmm7 +#else + xorps POSINV, %xmm3 +#endif + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm7 + + addps %xmm3, %xmm7 + + movsd 18 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm7, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm7, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm4 + subps %xmm3, %xmm4 + + movsd 16 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm7, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm7, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm5 + subps %xmm3, %xmm5 + + movsd 10 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm4, %xmm3 + shufps $0xa0, %xmm3, %xmm3 + shufps $0xf5, %xmm4, %xmm4 + +#ifndef CONJ + xorps POSINV, %xmm4 +#else + xorps POSINV, %xmm3 +#endif + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm4 + + addps %xmm3, %xmm4 + + movsd 8 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 
+ shufps $0x11, %xmm1, %xmm1 + + movaps %xmm4, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm4, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm5 + subps %xmm3, %xmm5 + + movsd 0 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm5, %xmm3 + shufps $0xa0, %xmm3, %xmm3 + shufps $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps POSINV, %xmm5 +#else + xorps POSINV, %xmm3 +#endif + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm5 + + addps %xmm3, %xmm5 +#endif + +#ifdef LT +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm5, %xmm3 + shufps $0xa0, %xmm3, %xmm3 + shufps $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps POSINV, %xmm5 +#else + xorps POSINV, %xmm3 +#endif + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm5 + + addps %xmm3, %xmm5 + + movsd 2 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm5, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm5, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm4 + subps %xmm3, %xmm4 + + movsd 4 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm5, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm5, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm7 + subps %xmm3, %xmm7 + + movsd 6 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm5, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm5, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm6 + subps %xmm3, %xmm6 + + movsd 10 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm4, %xmm3 + shufps $0xa0, %xmm3, %xmm3 + shufps $0xf5, %xmm4, %xmm4 + +#ifndef CONJ + xorps POSINV, %xmm4 +#else + xorps POSINV, %xmm3 +#endif + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm4 + + addps %xmm3, %xmm4 + + movsd 12 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm4, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm4, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm7 + subps %xmm3, %xmm7 + + movsd 14 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm4, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm4, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm6 + subps %xmm3, %xmm6 + + movsd 20 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm7, %xmm3 + shufps $0xa0, %xmm3, %xmm3 + shufps $0xf5, %xmm7, %xmm7 + +#ifndef CONJ + xorps POSINV, %xmm7 +#else + xorps POSINV, %xmm3 +#endif + + mulps %xmm0, 
%xmm3 + mulps %xmm1, %xmm7 + + addps %xmm3, %xmm7 + + movsd 22 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm7, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm7, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm6 + subps %xmm3, %xmm6 + + movsd 30 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm6, %xmm3 + shufps $0xa0, %xmm3, %xmm3 + shufps $0xf5, %xmm6, %xmm6 + +#ifndef CONJ + xorps POSINV, %xmm6 +#else + xorps POSINV, %xmm3 +#endif + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm6 + + addps %xmm3, %xmm6 +#endif + +#if defined(RN) || defined(RT) + movsd 0 * SIZE(B), %xmm1 + movhps 2 * SIZE(B), %xmm1 + +#ifdef HAVE_SSE2 + pshufd $0x44, %xmm1, %xmm2 + pshufd $0x11, %xmm1, %xmm3 + + pshufd $0xa0, %xmm5, %xmm4 + pshufd $0xf5, %xmm5, %xmm5 + pshufd $0xa0, %xmm7, %xmm6 + pshufd $0xf5, %xmm7, %xmm7 +#else + movaps %xmm1, %xmm2 + shufps $0x44, %xmm2, %xmm2 + movaps %xmm1, %xmm3 + shufps $0x11, %xmm3, %xmm3 + + movaps %xmm5, %xmm4 + shufps $0xa0, %xmm4, %xmm4 + shufps $0xf5, %xmm5, %xmm5 + movaps %xmm7, %xmm6 + shufps $0xa0, %xmm6, %xmm6 + shufps $0xf5, %xmm7, %xmm7 +#endif + +#ifndef CONJ + xorps %xmm0, %xmm5 + xorps %xmm0, %xmm7 +#else + xorps %xmm0, %xmm4 + xorps %xmm0, %xmm6 +#endif + + mulps %xmm2, %xmm4 + mulps %xmm3, %xmm5 + mulps %xmm2, %xmm6 + mulps %xmm3, %xmm7 + + addps %xmm4, %xmm5 + addps %xmm6, %xmm7 +#endif + +#ifdef LN + subl $8 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlhps %xmm4, %xmm5 + movlhps %xmm6, %xmm7 + + movlps %xmm5, 0 * SIZE(B) + movhps %xmm5, 2 * SIZE(B) + movlps %xmm7, 4 * SIZE(B) + movhps %xmm7, 6 * SIZE(B) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + pshufd $0xaa, %xmm5, %xmm2 + pshufd $0xff, %xmm5, %xmm3 +#else + movaps %xmm5, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm5, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm5, %xmm2 + shufps $0xaa, %xmm2, %xmm2 + movaps %xmm5, %xmm3 + shufps $0xff, %xmm3, %xmm3 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm7, %xmm0 + pshufd $0x55, %xmm7, %xmm1 + pshufd $0xaa, %xmm7, %xmm2 + pshufd $0xff, %xmm7, %xmm3 +#else + movaps %xmm7, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm7, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm7, %xmm2 + shufps $0xaa, %xmm2, %xmm2 + movaps %xmm7, %xmm3 + shufps $0xff, %xmm3, %xmm3 +#endif + + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm1, 20 * SIZE(BB) + movaps %xmm2, 24 * SIZE(BB) + movaps %xmm3, 28 * SIZE(BB) +#else + movaps %xmm5, 0 * SIZE(AA) + movaps %xmm7, 4 * SIZE(AA) +#endif + + movlps %xmm5, 0 * SIZE(CO1) + movhps %xmm5, 2 * SIZE(CO1) + movlps %xmm7, 4 * SIZE(CO1) + movhps %xmm7, 6 * SIZE(CO1) + +#ifndef LN + addl $8 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $2 + ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $8 * SIZE, B +#endif +#endif + +#ifdef LN + subl $4, KK + movl BORIG, B +#endif + +#ifdef LT + addl $4, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $2 + ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L10 + ALIGN_2 + +.L50: + movl M, %ebx + testl $2, %ebx + jle .L70 + +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if 
defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm4, %xmm4 + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 8 * SIZE(BB), %xmm3 + xorps %xmm6, %xmm6 + movaps 8 * SIZE(AA), %xmm1 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L52 + ALIGN_4 + +.L51: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 24 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movaps 16 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm2 + mulps 20 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 12 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 28 * SIZE(BB), %xmm1 + addps %xmm3, %xmm6 + movaps 40 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 24 * SIZE(AA), %xmm1 + mulps %xmm0, %xmm2 + mulps 36 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 48 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 20 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + mulps 44 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 56 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movaps 32 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm2 + mulps 52 * SIZE(BB), %xmm1 + addps %xmm2, %xmm4 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm1, %xmm5 + movaps 28 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + mulps 60 * SIZE(BB), %xmm1 + addps %xmm3, %xmm6 + movaps 72 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movaps 40 * SIZE(AA), %xmm1 + + addl $32 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L51 + ALIGN_4 + +.L52: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L54 + +.L53: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA # aoffset += 8 + addl $8 * SIZE, BB # boffset1 += 8 + decl %eax + jg .L53 + +.L54: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + movaps POSINV, %xmm0 + + shufps $0xb1, %xmm5, %xmm5 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm4 +#endif +#else + xorps %xmm0, %xmm5 +#endif + + addps %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(B), %xmm5 + movhps 2 * SIZE(B), %xmm5 +#else + movaps 0 * SIZE(AA), %xmm5 +#endif + + subps %xmm4, %xmm5 + +#if defined(LN) || defined(LT) + movhlps %xmm5, %xmm4 +#endif + +#ifdef LN +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 6 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm4, %xmm3 + shufps $0xa0, %xmm3, %xmm3 + shufps $0xf5, %xmm4, %xmm4 + +#ifndef CONJ + xorps POSINV, %xmm4 +#else + xorps POSINV, %xmm3 +#endif + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm4 + + addps %xmm3, %xmm4 + + movsd 4 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, 
%xmm1, %xmm1 + + movaps %xmm4, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm4, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm5 + subps %xmm3, %xmm5 + + + movsd 0 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm5, %xmm3 + shufps $0xa0, %xmm3, %xmm3 + shufps $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps POSINV, %xmm5 +#else + xorps POSINV, %xmm3 +#endif + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm5 + + addps %xmm3, %xmm5 +#endif + +#ifdef LT +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm5, %xmm3 + shufps $0xa0, %xmm3, %xmm3 + shufps $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps POSINV, %xmm5 +#else + xorps POSINV, %xmm3 +#endif + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm5 + + addps %xmm3, %xmm5 + + movsd 2 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm5, %xmm2 + shufps $0xa0, %xmm2, %xmm2 + movaps %xmm5, %xmm3 + shufps $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps POSINV, %xmm3 +#else + xorps POSINV, %xmm2 +#endif + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + subps %xmm2, %xmm4 + subps %xmm3, %xmm4 + + movsd 6 * SIZE(AA), %xmm1 + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm4, %xmm3 + shufps $0xa0, %xmm3, %xmm3 + shufps $0xf5, %xmm4, %xmm4 + +#ifndef CONJ + xorps POSINV, %xmm4 +#else + xorps POSINV, %xmm3 +#endif + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm4 + + addps %xmm3, %xmm4 +#endif + +#if defined(RN) || defined(RT) + movsd 0 * SIZE(B), %xmm1 + movhps 2 * SIZE(B), %xmm1 + + movaps %xmm1, %xmm2 + shufps $0x44, %xmm2, %xmm2 + movaps %xmm1, %xmm3 + shufps $0x11, %xmm2, %xmm3 + + movaps %xmm5, %xmm4 + shufps $0xa0, %xmm4, %xmm4 + shufps $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm2, %xmm4 + mulps %xmm3, %xmm5 + + addps %xmm4, %xmm5 +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlhps %xmm4, %xmm5 + + movlps %xmm5, 0 * SIZE(B) + movhps %xmm5, 2 * SIZE(B) + +#ifdef HAVE_SSE2 + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + pshufd $0xaa, %xmm5, %xmm2 + pshufd $0xff, %xmm5, %xmm3 +#else + movaps %xmm5, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm5, %xmm1 + shufps $0x55, %xmm1, %xmm1 + movaps %xmm5, %xmm2 + shufps $0xaa, %xmm2, %xmm2 + movaps %xmm5, %xmm3 + shufps $0xff, %xmm3, %xmm3 +#endif + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) +#else + movaps %xmm5, 0 * SIZE(AA) +#endif + + movlps %xmm5, 0 * SIZE(CO1) + movhps %xmm5, 2 * SIZE(CO1) + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L70: + movl M, %ebx + testl $1, %ebx + jle .L99 + +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $ZBASE_SHIFT, %eax + addl 
%eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + + movaps 0 * SIZE(BB), %xmm2 + + xorps %xmm4, %xmm4 +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm5, %xmm5 + movaps 8 * SIZE(BB), %xmm3 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 8 * SIZE(AA), %xmm1 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L72 + ALIGN_4 + +.L71: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 16 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 12 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 20 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 40 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 48 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 44 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 52 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 64 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 72 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $64 * SIZE, BB + decl %eax + jne .L71 + ALIGN_2 + +.L72: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax + je .L74 + +.L73: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA # aoffset += 8 + addl $8 * SIZE, BB # boffset1 += 8 + decl %eax + jg .L73 + +.L74: + movaps POSINV, %xmm0 + + shufps $0xb1, %xmm5, %xmm5 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm4 +#endif +#else + xorps %xmm0, %xmm5 +#endif + + addps %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax + subl $1, %eax + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#ifdef movsd + xorps %xmm5, %xmm5 +#endif +#if defined(LN) || defined(LT) + movsd 0 * SIZE(B), %xmm5 +#else + movsd 0 * SIZE(AA), %xmm5 +#endif + subps %xmm4, %xmm5 + +#ifdef movsd + xorps %xmm1, %xmm1 +#endif +#if defined(LN) || defined(LT) + movsd 0 * SIZE(AA), %xmm1 +#else + movsd 0 * SIZE(B), %xmm1 +#endif + + movaps %xmm1, %xmm0 + shufps $0x44, %xmm0, %xmm0 + shufps $0x11, %xmm1, %xmm1 + + movaps %xmm5, %xmm3 + shufps $0xa0, %xmm3, %xmm3 + shufps $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps POSINV, %xmm5 +#else + xorps POSINV, %xmm3 +#endif + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm5 + + addps %xmm3, %xmm5 + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + 
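+	/* Write the solved 1x1 complex result back into the packed B panel and */
+	/* refresh its broadcast copy in BB (LN/LT); the alternate branch below */
+	/* stores it back into the packed A panel instead. */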
movlps %xmm5, 0 * SIZE(B) + + movaps %xmm5, %xmm0 + shufps $0x00, %xmm0, %xmm0 + movaps %xmm5, %xmm1 + shufps $0x55, %xmm1, %xmm1 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) +#else + movlps %xmm5, 0 * SIZE(AA) +#endif + + movlps %xmm5, 0 * SIZE(CO1) + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_2 + +.L99: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_2 + +.L999: + movl OLD_STACK, %esp + + EMMS + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S b/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S new file mode 100644 index 0000000000..13064166f0 --- /dev/null +++ b/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S @@ -0,0 +1,969 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 24 + STACK + ARGS(%esp) +#define A 32 + STACK + ARGS(%esp) +#define ARG_B 36 + STACK + ARGS(%esp) +#define C 40 + STACK + ARGS(%esp) +#define ARG_LDC 44 + STACK + ARGS(%esp) +#define OFFSET 48 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) +#define AORIG 12 + STACK(%esp) + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#ifdef NEHALEM +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 2) +#endif + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi +#define CO1 %esi + +#define ADD1 addpd +#define ADD2 addpd + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + movl OFFSET, %eax +#ifdef RN + negl %eax +#endif + movl %eax, KK + + movl M, %ebx + testl %ebx, %ebx + jle .L999 + + subl $-16 * SIZE, A + subl $-16 * SIZE, B + + sall $ZBASE_SHIFT, LDC + +#ifdef LN + movl M, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + sall $ZBASE_SHIFT, %eax + imull K, %eax + addl %eax, B + + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + testl $1, %eax + jle .L100 + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, B +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + ALIGN_4 + +L110: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -16 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + + pxor %xmm4, %xmm4 +#ifdef LN + prefetcht0 -2 * SIZE(CO1) +#else + prefetcht0 1 * SIZE(CO1) +#endif + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je L115 + ALIGN_4 + +L112: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + + ADD1 %xmm1, %xmm4 + movaps -14 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + ADD1 %xmm1, %xmm6 + movaps -12 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm7 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + + ADD1 %xmm1, %xmm4 + movaps -10 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), 
%xmm0 + + ADD1 %xmm1, %xmm6 + movaps -8 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm7 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -6 * SIZE(AA), %xmm0 + + ADD1 %xmm1, %xmm4 + movaps -6 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + ADD1 %xmm1, %xmm6 + movaps -4 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm7 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -2 * SIZE(AA), %xmm0 + + ADD1 %xmm1, %xmm4 + movaps -2 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + + ADD1 %xmm1, %xmm6 + movaps 0 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm7 + + subl $-16 * SIZE, AA + subl $-16 * SIZE, BB + + subl $1, %eax + jne L112 + ALIGN_4 + +L115: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je L118 + ALIGN_4 + +L116: + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + + ADD1 %xmm1, %xmm4 + movaps -14 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg L116 + ALIGN_4 + +L118: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), BB +#endif + + addpd %xmm6, %xmm4 + pcmpeqb %xmm1, %xmm1 + addpd %xmm7, %xmm5 + psllq $63, %xmm1 + +#ifndef CONJ + pshufd $0x40, %xmm1, %xmm0 + shufps $0x04, %xmm1, %xmm1 + + pxor %xmm0, %xmm4 +#else +#if defined(LN) || defined(LT) + pshufd $0x40, %xmm1, %xmm0 +#else + pshufd $0x04, %xmm1, %xmm0 +#endif + shufps $0x40, %xmm1, %xmm1 + + pxor %xmm0, %xmm5 +#endif + + haddpd %xmm5, %xmm4 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BB), %xmm5 + subpd %xmm4, %xmm5 +#else + movapd -16 * SIZE(AA), %xmm5 + subpd %xmm4, %xmm5 +#endif + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AA), %xmm2 + movddup -15 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#if defined(RN) || defined(RT) + movddup -16 * SIZE(BB), %xmm2 + movddup -15 * SIZE(BB), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + movlpd %xmm5, 0 * SIZE(CO1) + movhpd %xmm5, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm5, -16 * SIZE(BB) +#else + movapd %xmm5, -16 * SIZE(AA) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA + addl %eax, BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg L110 + +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L100: + movl N, %eax + sarl $1, %eax + movl %eax, J # j = n + jle .L999 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $1 + ZBASE_SHIFT, 
%eax + subl %eax, B +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -16 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -16 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + +#ifdef LN + pxor %xmm4, %xmm4 + prefetcht0 -2 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 -2 * SIZE(CO1, LDC) +#else + pxor %xmm4, %xmm4 + prefetcht0 1 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 1 * SIZE(CO1, LDC) +#endif + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + ADD1 %xmm3, %xmm6 + movaps -14 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm1, %xmm4 + movaps -12 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AA), %xmm0 + + ADD1 %xmm3, %xmm6 + movaps -10 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm1, %xmm4 + movaps -8 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + ADD1 %xmm3, %xmm6 + movaps -6 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm1, %xmm4 + movaps -4 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AA), %xmm0 + + ADD1 %xmm3, %xmm6 + movaps -2 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm1, %xmm4 + movaps 0 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) + + ADD1 %xmm3, %xmm6 + movaps 2 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm1, %xmm4 + movaps 4 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -6 * SIZE(AA), %xmm0 + + ADD1 %xmm3, %xmm6 + movaps 6 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm1, %xmm4 + movaps 8 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + ADD1 %xmm3, %xmm6 + movaps 10 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm1, %xmm4 + movaps 12 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + movaps -2 * SIZE(AA), %xmm0 + + ADD1 %xmm3, %xmm6 + movaps 14 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + 
mulpd %xmm0, %xmm2 + + ADD1 %xmm1, %xmm4 + movaps 16 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + subl $-32 * SIZE, BB + mulpd %xmm0, %xmm2 + movaps 0 * SIZE(AA), %xmm0 + + subl $-16 * SIZE, AA + + subl $1, %eax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L18 + ALIGN_4 + +.L16: + ADD1 %xmm3, %xmm6 + movaps -14 * SIZE(BB), %xmm3 + ADD2 %xmm2, %xmm7 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm1, %xmm4 + movaps -12 * SIZE(BB), %xmm1 + ADD2 %xmm2, %xmm5 + pshufd $0x4e, %xmm3, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm2 + + movaps -14 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), BB +#endif + + ADD1 %xmm3, %xmm6 + pcmpeqb %xmm1, %xmm1 + ADD2 %xmm2, %xmm7 + psllq $63, %xmm1 + +#ifndef CONJ + pshufd $0x40, %xmm1, %xmm0 + shufps $0x04, %xmm1, %xmm1 + + pxor %xmm0, %xmm4 + pxor %xmm0, %xmm6 +#else +#if defined(LN) || defined(LT) + pshufd $0x40, %xmm1, %xmm0 +#else + pshufd $0x04, %xmm1, %xmm0 +#endif + shufps $0x40, %xmm1, %xmm1 + + pxor %xmm0, %xmm5 + pxor %xmm0, %xmm7 +#endif + + haddpd %xmm5, %xmm4 + haddpd %xmm7, %xmm6 + + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BB), %xmm5 + movapd -14 * SIZE(BB), %xmm7 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#else + movapd -16 * SIZE(AA), %xmm5 + movapd -14 * SIZE(AA), %xmm7 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#endif + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AA), %xmm2 + movddup -15 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm4 + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm4, %xmm5 + addpd %xmm6, %xmm7 +#endif + +#ifdef RN + movddup -16 * SIZE(BB), %xmm2 + movddup -15 * SIZE(BB), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 + + movddup -14 * SIZE(BB), %xmm2 + movddup -13 * SIZE(BB), %xmm3 + + movapd %xmm5, %xmm4 + pshufd $0x4e, %xmm5, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm4 + mulpd %xmm3, %xmm6 + + subpd %xmm4, %xmm7 + subpd %xmm6, %xmm7 + + movddup -10 * SIZE(BB), %xmm2 + movddup -9 * SIZE(BB), %xmm3 + + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm6, %xmm7 +#endif + +#ifdef RT + movddup -10 * SIZE(BB), %xmm2 + movddup -9 * SIZE(BB), %xmm3 + + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm6, %xmm7 + + movddup -12 * SIZE(BB), %xmm2 + movddup -11 * SIZE(BB), %xmm3 + + movapd %xmm7, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm4 + mulpd %xmm3, %xmm6 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm5 + + movddup -16 * SIZE(BB), %xmm2 + movddup -15 * SIZE(BB), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + movlpd %xmm5, 0 * SIZE(CO1) + movhpd %xmm5, 1 * SIZE(CO1) + + movlpd %xmm7, 0 * SIZE(CO1, LDC) + movhpd %xmm7, 1 * SIZE(CO1, LDC) + +#if defined(LN) || defined(LT) 
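+	/* After writing C, save the solved pair back into the packed B panel */
+	/* (LN/LT) or the packed A panel (RN/RT) so later updates in this sweep */
+	/* see the solved values. */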
+ movapd %xmm5, -16 * SIZE(BB) + movapd %xmm7, -14 * SIZE(BB) +#else + movapd %xmm5, -16 * SIZE(AA) + movapd %xmm7, -14 * SIZE(AA) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L10 + ALIGN_4 + +.L99: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/ztrsm_kernel_RT_1x2_sse2.S b/kernel/x86/ztrsm_kernel_RT_1x2_sse2.S new file mode 100644 index 0000000000..8824868133 --- /dev/null +++ b/kernel/x86/ztrsm_kernel_RT_1x2_sse2.S @@ -0,0 +1,1325 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_ALPHA_R 16 + STACK + ARGS(%esi) +#define STACK_ALPHA_I 24 + STACK + ARGS(%esi) +#define STACK_A 32 + STACK + ARGS(%esi) +#define STACK_B 36 + STACK + ARGS(%esi) +#define STACK_C 40 + STACK + ARGS(%esi) +#define STACK_LDC 44 + STACK + ARGS(%esi) +#define STACK_OFFT 48 + STACK + ARGS(%esi) + +#define POSINV 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 44(%esp) +#define KK 48(%esp) +#define KKK 52(%esp) +#define AORIG 56(%esp) +#define BORIG 60(%esp) +#define BUFFER 128(%esp) + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#if defined(OPTERON) || defined(BARCELONA) +#define PREFETCH prefetch +#else +#define PREFETCH prefetcht0 +#endif + +#define PREFETCHSIZE (8 * 10 + 4) + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi +#define CO1 %esi + +#ifndef CONJ +#define NN +#else +#if defined(LN) || defined(LT) +#define CN +#else +#define NC +#endif +#endif + +#define KERNEL1(address) \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm4; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + movapd 2 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 6 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 16 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 10 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 12 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + mulpd 14 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm3, %xmm6; \ + movapd 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm0, %xmm7; \ + movapd 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm4; \ + movapd 18 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 20 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + mulpd 22 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm2, %xmm6; \ + movapd 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm0, %xmm7; \ + movapd 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL4(address) \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 26 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 28 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + mulpd 30 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addpd %xmm3, %xmm6; \ + movapd 40 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm0, %xmm7; \ + movapd 16 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL5(address) \ + PREFETCH (PREFETCHSIZE + 8) * SIZE + (address) * 1 * SIZE(AA); \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm4; \ + movapd 34 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 36 * SIZE + (address) * 4 * 
SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + mulpd 38 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm2, %xmm6; \ + movapd 48 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm1, %xmm7; \ + movapd 10 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL6(address) \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 42 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 44 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + mulpd 46 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 12 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL7(address) \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm4; \ + movapd 50 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm5; \ + movapd 52 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + mulpd 54 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm2, %xmm6; \ + movapd 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addpd %xmm1, %xmm7; \ + movapd 14 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm4; \ + movapd 58 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm5; \ + movapd 60 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + mulpd 62 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addpd %xmm3, %xmm6; \ + movapd 72 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addpd %xmm1, %xmm7; \ + movapd 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp # align stack + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movl STACK_M, %ebx + movl STACK_N, %eax + movl STACK_K, %ecx + movl STACK_A, %edx + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK + + movl STACK_B, B + movl STACK_C, %ebx + movss STACK_OFFT, %xmm4 + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 # Generate mask + pxor %xmm2, %xmm2 + + movlpd %xmm2, 0 + POSINV + movlpd %xmm7, 8 + POSINV + + movl %ebx, C + movl STACK_LDC, LDC + + movss %xmm4, OFFSET + movss %xmm4, KK + + sall $ZBASE_SHIFT, LDC + +#ifdef LN + movl M, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + sall $ZBASE_SHIFT, %eax + imull K, %eax + addl %eax, B + + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + andl $1, %eax + jle .L100 + ALIGN_4 + +.L101: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, BB + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $ZBASE_SHIFT, %eax + addl %eax, B + leal (BB, %eax, 2), BB +#endif + +#if defined(LT) + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + jle .L103 + ALIGN_4 + +.L102: + prefetchnta 56 * SIZE(B) + + movlpd 0 * SIZE(B), %xmm0 + movlpd 1 * SIZE(B), %xmm1 + movlpd 2 * SIZE(B), %xmm2 + movlpd 3 * SIZE(B), %xmm3 + movlpd 4 * SIZE(B), %xmm4 + movlpd 5 * SIZE(B), %xmm5 + 
movlpd 6 * SIZE(B), %xmm6 + movlpd 7 * SIZE(B), %xmm7 + + movlpd %xmm0, 0 * SIZE(BB) + movlpd %xmm0, 1 * SIZE(BB) + movlpd %xmm1, 2 * SIZE(BB) + movlpd %xmm1, 3 * SIZE(BB) + movlpd %xmm2, 4 * SIZE(BB) + movlpd %xmm2, 5 * SIZE(BB) + movlpd %xmm3, 6 * SIZE(BB) + movlpd %xmm3, 7 * SIZE(BB) + movlpd %xmm4, 8 * SIZE(BB) + movlpd %xmm4, 9 * SIZE(BB) + movlpd %xmm5, 10 * SIZE(BB) + movlpd %xmm5, 11 * SIZE(BB) + movlpd %xmm6, 12 * SIZE(BB) + movlpd %xmm6, 13 * SIZE(BB) + movlpd %xmm7, 14 * SIZE(BB) + movlpd %xmm7, 15 * SIZE(BB) + + addl $ 8 * SIZE, B + subl $-16 * SIZE, BB + decl %eax + jne .L102 + ALIGN_4 + +.L103: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax + BRANCH + jle .L105 + ALIGN_4 + +.L104: + movlpd 0 * SIZE(B), %xmm0 + movlpd 1 * SIZE(B), %xmm1 + + movlpd %xmm0, 0 * SIZE(BB) + movlpd %xmm0, 1 * SIZE(BB) + movlpd %xmm1, 2 * SIZE(BB) + movlpd %xmm1, 3 * SIZE(BB) + + addl $2 * SIZE, B + addl $4 * SIZE, BB + decl %eax + jne .L104 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movl A, %eax + movl %eax, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + subl LDC, C +#endif + + movl C, CO1 + +#ifndef RT + addl LDC, C +#endif + + movl M, %ebx + testl %ebx, %ebx + jle .L199 + ALIGN_4 + +.L110: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $ZBASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + movapd 0 * SIZE(AA), %xmm0 + movapd 8 * SIZE(AA), %xmm1 + movapd 0 * SIZE(BB), %xmm2 + movapd 8 * SIZE(BB), %xmm3 + +#ifdef LN + prefetchw -2 * SIZE(CO1) +#else + prefetchw 2 * SIZE(CO1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L112 + ALIGN_4 + +.L111: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm2 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 16 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movapd 4 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 10 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm4 + movapd 12 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm5 + movapd 6 * SIZE(AA), %xmm0 + mulpd %xmm0, %xmm3 + mulpd 14 * SIZE(BB), %xmm0 + addpd %xmm3, %xmm6 + movapd 24 * SIZE(BB), %xmm3 + addpd %xmm0, %xmm7 + movapd 16 * SIZE(AA), %xmm0 + mulpd %xmm1, %xmm2 + mulpd 18 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm4 + movapd 20 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm5 + movapd 10 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm2 + mulpd 22 * SIZE(BB), %xmm1 + addpd %xmm2, %xmm6 + movapd 32 * SIZE(BB), %xmm2 + addpd %xmm1, %xmm7 + movapd 12 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 26 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm4 + movapd 28 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm5 + movapd 14 * SIZE(AA), %xmm1 + mulpd %xmm1, %xmm3 + mulpd 30 * SIZE(BB), %xmm1 + addpd %xmm3, %xmm6 + movapd 40 * SIZE(BB), %xmm3 + addpd %xmm1, %xmm7 + movapd 24 * SIZE(AA), %xmm1 + + addl $16 * SIZE, AA + addl $32 * SIZE, BB + decl %eax + jne .L111 + ALIGN_4 + +.L112: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L114 + 
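+	/* .L113: drain the remaining k iterations (k mod 8) one complex element at a time. */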
ALIGN_4 + +.L113: + mulpd %xmm0, %xmm2 + mulpd 2 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm4 + movapd 4 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm5 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L113 + ALIGN_4 + +.L114: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ZBASE_SHIFT, %eax + addl %eax, AA + addl %eax, B + leal (BB, %eax, 2), BB +#endif + + movapd POSINV, %xmm1 + + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + + SHUFPD_1 %xmm5, %xmm5 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm1, %xmm5 +#else + xorpd %xmm1, %xmm4 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm5, %xmm4 +#else + addpd %xmm5, %xmm4 +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm5 + subpd %xmm4, %xmm5 +#else + movapd 0 * SIZE(AA), %xmm5 + subpd %xmm4, %xmm5 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm1, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movlpd 0 * SIZE(AA), %xmm2 + movhpd 0 * SIZE(AA), %xmm2 + movlpd 1 * SIZE(AA), %xmm3 + movhpd 1 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm2 + movhpd 0 * SIZE(B), %xmm2 + movlpd 1 * SIZE(B), %xmm3 + movhpd 1 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef RT + movlpd 0 * SIZE(B), %xmm2 + movhpd 0 * SIZE(B), %xmm2 + movlpd 1 * SIZE(B), %xmm3 + movhpd 1 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + movlpd %xmm5, 0 * SIZE(CO1) + movhpd %xmm5, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm5, 0 * SIZE(B) + + movlpd %xmm5, 0 * SIZE(BB) + movlpd %xmm5, 1 * SIZE(BB) + movhpd %xmm5, 2 * SIZE(BB) + movhpd %xmm5, 3 * SIZE(BB) +#else + movapd %xmm5, 0 * SIZE(AA) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L110 + ALIGN_4 + +.L199: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L100: + movl N, %eax + sarl $1, %eax + movl %eax, J # j = n + jle .L500 + ALIGN_4 + +.L01: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, BB + +#ifdef RT + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $1 + ZBASE_SHIFT, %eax + addl %eax, B + leal (BB, %eax, 2), BB +#endif + +#if defined(LT) + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl 
$1, %eax + jle .L03 + ALIGN_4 + +.L02: + prefetchnta 56 * SIZE(B) + + movlpd 0 * SIZE(B), %xmm0 + movlpd 1 * SIZE(B), %xmm1 + movlpd 2 * SIZE(B), %xmm2 + movlpd 3 * SIZE(B), %xmm3 + movlpd 4 * SIZE(B), %xmm4 + movlpd 5 * SIZE(B), %xmm5 + movlpd 6 * SIZE(B), %xmm6 + movlpd 7 * SIZE(B), %xmm7 + + movlpd %xmm0, 0 * SIZE(BB) + movlpd %xmm0, 1 * SIZE(BB) + movlpd %xmm1, 2 * SIZE(BB) + movlpd %xmm1, 3 * SIZE(BB) + movlpd %xmm2, 4 * SIZE(BB) + movlpd %xmm2, 5 * SIZE(BB) + movlpd %xmm3, 6 * SIZE(BB) + movlpd %xmm3, 7 * SIZE(BB) + movlpd %xmm4, 8 * SIZE(BB) + movlpd %xmm4, 9 * SIZE(BB) + movlpd %xmm5, 10 * SIZE(BB) + movlpd %xmm5, 11 * SIZE(BB) + movlpd %xmm6, 12 * SIZE(BB) + movlpd %xmm6, 13 * SIZE(BB) + movlpd %xmm7, 14 * SIZE(BB) + movlpd %xmm7, 15 * SIZE(BB) + + addl $ 8 * SIZE, B + subl $-16 * SIZE, BB + + decl %eax + jne .L02 + ALIGN_4 + +.L03: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $1, %eax + BRANCH + jle .L05 + + movlpd 0 * SIZE(B), %xmm0 + movlpd 1 * SIZE(B), %xmm1 + movlpd 2 * SIZE(B), %xmm2 + movlpd 3 * SIZE(B), %xmm3 + + movlpd %xmm0, 0 * SIZE(BB) + movlpd %xmm0, 1 * SIZE(BB) + movlpd %xmm1, 2 * SIZE(BB) + movlpd %xmm1, 3 * SIZE(BB) + movlpd %xmm2, 4 * SIZE(BB) + movlpd %xmm2, 5 * SIZE(BB) + movlpd %xmm3, 6 * SIZE(BB) + movlpd %xmm3, 7 * SIZE(BB) + + addl $4 * SIZE, B + ALIGN_4 + +.L05: +#if defined(LT) || defined(RN) + movl A, %eax + movl %eax, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + leal (, LDC, 2), %eax + subl %eax, C +#endif + + movl C, CO1 + +#ifndef RT + leal (, LDC, 2), %eax + addl %eax, C +#endif + + movl M, %ebx + testl %ebx, %ebx + jle .L100 + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + leal (BB, %eax, 2), BB +#endif + + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movapd 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movapd 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifdef LN + prefetchw -2 * SIZE(CO1) + prefetchw -2 * SIZE(CO1, LDC) +#else + prefetchw 2 * SIZE(CO1) + prefetchw 2 * SIZE(CO1, LDC) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + +#if 1 + andl $-8, %eax + sall $4, %eax + je .L15 +.L1X: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + cmpl $128 * 1, %eax + jle .L12 + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + cmpl $128 * 2, %eax + jle .L12 + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + cmpl $128 * 3, %eax + jle .L12 + KERNEL1(16 * 3) + KERNEL2(16 * 3) + KERNEL3(16 * 3) + KERNEL4(16 * 3) + KERNEL5(16 * 3) + KERNEL6(16 * 3) + KERNEL7(16 * 3) + KERNEL8(16 * 3) + cmpl $128 * 4, %eax + jle .L12 + KERNEL1(16 * 4) + KERNEL2(16 * 4) + KERNEL3(16 * 4) + KERNEL4(16 * 4) + KERNEL5(16 * 4) + KERNEL6(16 * 4) + KERNEL7(16 * 4) + KERNEL8(16 * 4) + cmpl $128 * 5, %eax + jle .L12 + KERNEL1(16 * 5) + KERNEL2(16 * 5) + KERNEL3(16 * 5) + KERNEL4(16 * 5) + KERNEL5(16 * 5) + KERNEL6(16 * 5) + KERNEL7(16 * 5) + 
KERNEL8(16 * 5) + cmpl $128 * 6, %eax + jle .L12 + KERNEL1(16 * 6) + KERNEL2(16 * 6) + KERNEL3(16 * 6) + KERNEL4(16 * 6) + KERNEL5(16 * 6) + KERNEL6(16 * 6) + KERNEL7(16 * 6) + KERNEL8(16 * 6) + cmpl $128 * 7, %eax + jle .L12 + KERNEL1(16 * 7) + KERNEL2(16 * 7) + KERNEL3(16 * 7) + KERNEL4(16 * 7) + KERNEL5(16 * 7) + KERNEL6(16 * 7) + KERNEL7(16 * 7) + KERNEL8(16 * 7) + + addl $128 * 4 * SIZE, BB + addl $128 * 1 * SIZE, AA + subl $128 * 8, %eax + jg .L1X + jmp .L15 + +.L12: + leal (AA, %eax, 1), AA + leal (BB, %eax, 4), BB + ALIGN_4 +#else + + sarl $3, %eax + je .L15 + ALIGN_4 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + addl $64 * SIZE, BB + addl $16 * SIZE, AA + decl %eax + jne .L11 + ALIGN_4 +#endif + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + ALIGN_4 + +.L13: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movapd 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm5 + movapd 4 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + mulpd 6 * SIZE(BB), %xmm0 + addpd %xmm2, %xmm6 + movapd 8 * SIZE(BB), %xmm2 + addpd %xmm0, %xmm7 + movapd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L13 + ALIGN_4 + +.L14: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ZBASE_SHIFT, %eax + addl %eax, AA + leal (B, %eax, 2), B + leal (BB, %eax, 4), BB +#endif + + movapd POSINV, %xmm1 + + SHUFPD_1 %xmm5, %xmm5 + SHUFPD_1 %xmm7, %xmm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm1, %xmm5 + xorpd %xmm1, %xmm7 +#else + xorpd %xmm1, %xmm4 + xorpd %xmm1, %xmm6 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 +#else + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm5 + movapd 2 * SIZE(B), %xmm7 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#else + movapd 0 * SIZE(AA), %xmm5 + movapd 2 * SIZE(AA), %xmm7 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm1, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movlpd 0 * SIZE(AA), %xmm2 + movhpd 0 * SIZE(AA), %xmm2 + movlpd 1 * SIZE(AA), %xmm3 + movhpd 1 * SIZE(AA), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm4 + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm4, %xmm5 + addpd %xmm6, %xmm7 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm2 + movhpd 0 * SIZE(B), %xmm2 + movlpd 1 * SIZE(B), %xmm3 + movhpd 1 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 + + movlpd 2 * SIZE(B), %xmm2 + movhpd 2 * SIZE(B), %xmm2 + movlpd 3 * SIZE(B), %xmm3 + movhpd 3 * SIZE(B), %xmm3 + + movapd %xmm5, %xmm4 + pshufd $0x4e, %xmm5, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm4 + mulpd %xmm3, %xmm6 + + subpd %xmm4, %xmm7 + subpd %xmm6, %xmm7 + + movlpd 6 * SIZE(B), %xmm2 + movhpd 6 * SIZE(B), %xmm2 + movlpd 7 * SIZE(B), %xmm3 + movhpd 7 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd 
%xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm6, %xmm7 +#endif + +#ifdef RT + movlpd 6 * SIZE(B), %xmm2 + movhpd 6 * SIZE(B), %xmm2 + movlpd 7 * SIZE(B), %xmm3 + movhpd 7 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm6, %xmm7 + + movlpd 4 * SIZE(B), %xmm2 + movhpd 4 * SIZE(B), %xmm2 + movlpd 5 * SIZE(B), %xmm3 + movhpd 5 * SIZE(B), %xmm3 + + movapd %xmm7, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm4 + mulpd %xmm3, %xmm6 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm5 + + movlpd 0 * SIZE(B), %xmm2 + movhpd 0 * SIZE(B), %xmm2 + movlpd 1 * SIZE(B), %xmm3 + movhpd 1 * SIZE(B), %xmm3 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + movlpd %xmm5, 0 * SIZE(CO1) + movhpd %xmm5, 1 * SIZE(CO1) + + movlpd %xmm7, 0 * SIZE(CO1, LDC) + movhpd %xmm7, 1 * SIZE(CO1, LDC) + +#if defined(LN) || defined(LT) + movapd %xmm5, 0 * SIZE(B) + movapd %xmm7, 2 * SIZE(B) + + movlpd %xmm5, 0 * SIZE(BB) + movlpd %xmm5, 1 * SIZE(BB) + movhpd %xmm5, 2 * SIZE(BB) + movhpd %xmm5, 3 * SIZE(BB) + movlpd %xmm7, 4 * SIZE(BB) + movlpd %xmm7, 5 * SIZE(BB) + movhpd %xmm7, 6 * SIZE(BB) + movhpd %xmm7, 7 * SIZE(BB) +#else + movapd %xmm5, 0 * SIZE(AA) + movapd %xmm7, 2 * SIZE(AA) + +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L10 + ALIGN_4 + +.L99: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_4 + + +.L500: + movl OLD_STACK, %esp + + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86/ztrsm_kernel_RT_1x2_sse3.S b/kernel/x86/ztrsm_kernel_RT_1x2_sse3.S new file mode 100644 index 0000000000..8b7bf6bf7b --- /dev/null +++ b/kernel/x86/ztrsm_kernel_RT_1x2_sse3.S @@ -0,0 +1,965 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define ALPHA_R 16 + STACK + ARGS(%esp) +#define ALPHA_I 24 + STACK + ARGS(%esp) +#define A 32 + STACK + ARGS(%esp) +#define ARG_B 36 + STACK + ARGS(%esp) +#define C 40 + STACK + ARGS(%esp) +#define ARG_LDC 44 + STACK + ARGS(%esp) +#define OFFSET 48 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) +#define AORIG 12 + STACK(%esp) + +#ifdef PENTIUM4 +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#ifdef PENTIUMM +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#define AA %edx +#define BB %ecx +#define LDC %ebp +#define B %edi +#define CO1 %esi + +#define ADDSUB addpd + +#define KERNEL1(address) \ + mulpd %xmm0, %xmm2; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + addpd %xmm2, %xmm4; \ + movddup 1 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + ADDSUB %xmm2, %xmm5; \ + movddup 2 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm6; \ + movddup 3 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + movapd 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ + ADDSUB %xmm2, %xmm7; \ + movddup 4 * SIZE + (address) * 2 * SIZE(BB), %xmm2 + +#define KERNEL2(address) \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm4; \ + movddup 5 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + ADDSUB %xmm2, %xmm5; \ + movddup 6 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + addpd %xmm2, %xmm6; \ + movddup 7 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm0, %xmm2; \ + movapd 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ + ADDSUB %xmm2, %xmm7; \ + movddup 16 * SIZE + (address) * 2 * SIZE(BB), %xmm2 + +#define KERNEL3(address) \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm4; \ + movddup 9 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + ADDSUB %xmm3, %xmm5; \ + movddup 10 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm6; \ + movddup 11 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + movapd 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ + ADDSUB %xmm3, %xmm7; \ + movddup 12 * SIZE + (address) * 2 * SIZE(BB), %xmm3 + +#define KERNEL4(address) \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm4; \ + movddup 13 * SIZE 
+ (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + ADDSUB %xmm3, %xmm5; \ + movddup 14 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + addpd %xmm3, %xmm6; \ + movddup 15 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm0, %xmm3; \ + movapd 16 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ + ADDSUB %xmm3, %xmm7; \ + movddup 24 * SIZE + (address) * 2 * SIZE(BB), %xmm3 + +#define KERNEL5(address) \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm4; \ + movddup 17 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + ADDSUB %xmm2, %xmm5; \ + movddup 18 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm6; \ + movddup 19 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + movapd 10 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ + ADDSUB %xmm2, %xmm7; \ + movddup 20 * SIZE + (address) * 2 * SIZE(BB), %xmm2 + +#define KERNEL6(address) \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm4; \ + movddup 21 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + ADDSUB %xmm2, %xmm5; \ + movddup 22 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + addpd %xmm2, %xmm6; \ + movddup 23 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm2; \ + movapd 12 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ + ADDSUB %xmm2, %xmm7 + +#define KERNEL7(address) \ + movddup 32 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm4; \ + movddup 25 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + ADDSUB %xmm3, %xmm5; \ + movddup 26 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm6; \ + movddup 27 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + movapd 14 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ + ADDSUB %xmm3, %xmm7; \ + movddup 28 * SIZE + (address) * 2 * SIZE(BB), %xmm3 + +#define KERNEL8(address) \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm4; \ + movddup 29 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + ADDSUB %xmm3, %xmm5; \ + movddup 30 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + addpd %xmm3, %xmm6; \ + movddup 31 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ + mulpd %xmm1, %xmm3; \ + movapd 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ + ADDSUB %xmm3, %xmm7; \ + movddup 40 * SIZE + (address) * 2 * SIZE(BB), %xmm3 + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + movl OFFSET, %eax +#ifdef RN + negl %eax +#endif + movl %eax, KK + + sall $ZBASE_SHIFT, LDC + +#ifdef LN + movl M, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + sall $ZBASE_SHIFT, %eax + imull K, %eax + addl %eax, B + + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + testl $1, %eax + jle .L100 + +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, B +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + testl %ebx, %ebx + jle .L500 + ALIGN_4 + +L110: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG 
+#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movddup 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movddup 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifdef LN + prefetchnta -2 * SIZE(CO1) +#else + prefetchnta 2 * SIZE(CO1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je L112 + ALIGN_4 + +L111: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + ADDSUB %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 3 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 4 * SIZE(AA), %xmm0 + ADDSUB %xmm2, %xmm7 + movddup 4 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 5 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 6 * SIZE(AA), %xmm0 + ADDSUB %xmm2, %xmm5 + movddup 6 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 7 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 16 * SIZE(AA), %xmm0 + ADDSUB %xmm2, %xmm7 + movddup 16 * SIZE(BB), %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movddup 9 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 10 * SIZE(AA), %xmm1 + ADDSUB %xmm3, %xmm5 + movddup 10 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm6 + movddup 11 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 12 * SIZE(AA), %xmm1 + ADDSUB %xmm3, %xmm7 + movddup 12 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm4 + movddup 13 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 14 * SIZE(AA), %xmm1 + ADDSUB %xmm3, %xmm5 + movddup 14 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + addpd %xmm3, %xmm6 + movddup 15 * SIZE(BB), %xmm3 + mulpd %xmm1, %xmm3 + movapd 24 * SIZE(AA), %xmm1 + ADDSUB %xmm3, %xmm7 + movddup 24 * SIZE(BB), %xmm3 + + addl $16 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jne L111 + ALIGN_4 + +L112: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je L114 + ALIGN_4 + +L113: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + ADDSUB %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg L113 + ALIGN_4 + +L114: + addpd %xmm6, %xmm4 + addpd %xmm7, %xmm5 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), BB +#endif + + pcmpeqb %xmm1, %xmm1 + psllq $63, %xmm1 + + shufps $0x40, %xmm1, %xmm1 + + SHUFPD_1 %xmm5, %xmm5 + +#ifndef CONJ + xorpd %xmm1, %xmm5 + + subpd %xmm5, %xmm4 +#else +#if defined(LN) || defined(LT) + xorpd %xmm1, %xmm4 +#else + xorpd %xmm1, %xmm5 +#endif + addpd %xmm5, %xmm4 +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BB), %xmm5 + subpd %xmm4, %xmm5 +#else + movapd 0 * SIZE(AA), %xmm5 + subpd %xmm4, %xmm5 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm1, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movddup 0 * SIZE(AA), %xmm2 + movddup 1 * SIZE(AA), %xmm3 + + movapd %xmm5, %xmm4 + SHUFPD_1 %xmm4, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd 
%xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#if defined(RN) || defined(RT) + movddup 0 * SIZE(BB), %xmm2 + movddup 1 * SIZE(BB), %xmm3 + + movapd %xmm5, %xmm4 + SHUFPD_1 %xmm4, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + movlpd %xmm5, 0 * SIZE(CO1) + movhpd %xmm5, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm5, 0 * SIZE(BB) +#else + movapd %xmm5, 0 * SIZE(AA) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA + addl %eax, BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg L110 + +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L100: + movl N, %eax + sarl $1, %eax + movl %eax, J # j = n + jle .L500 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movl A, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + testl %ebx, %ebx + jle .L500 + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movapd 0 * SIZE(AA), %xmm0 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(AA), %xmm1 + pxor %xmm5, %xmm5 + movddup 0 * SIZE(BB), %xmm2 + pxor %xmm6, %xmm6 + movddup 8 * SIZE(BB), %xmm3 + pxor %xmm7, %xmm7 + +#ifdef LN + prefetcht0 -2 * SIZE(CO1) + prefetcht0 -2 * SIZE(CO1, LDC, 1) +#else + prefetchnta 2 * SIZE(CO1) + prefetchnta 2 * SIZE(CO1, LDC, 1) +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L12 + ALIGN_4 + +.L11: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + addl $32 * SIZE, BB + addl $16 * SIZE, AA + decl %eax + jne .L11 + ALIGN_4 + +.L12: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + ALIGN_4 + +.L13: + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm4 + movddup 1 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + ADDSUB %xmm2, %xmm5 + movddup 2 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm6 + movddup 3 * SIZE(BB), %xmm2 + mulpd %xmm0, %xmm2 + movapd 2 * SIZE(AA), %xmm0 + ADDSUB %xmm2, %xmm7 + movddup 4 * SIZE(BB), %xmm2 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L13 + ALIGN_4 + +.L14: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), BB +#endif + + pcmpeqb %xmm1, %xmm1 + psllq $63, %xmm1 + + shufps $0x40, %xmm1, %xmm1 + + SHUFPD_1 %xmm5, 
%xmm5 + SHUFPD_1 %xmm7, %xmm7 + +#ifndef CONJ + xorpd %xmm1, %xmm5 + xorpd %xmm1, %xmm7 + + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 +#else +#if defined(LN) || defined(LT) + xorpd %xmm1, %xmm4 + xorpd %xmm1, %xmm6 +#else + xorpd %xmm1, %xmm5 + xorpd %xmm1, %xmm7 +#endif + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BB), %xmm5 + movapd 2 * SIZE(BB), %xmm7 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#else + movapd 0 * SIZE(AA), %xmm5 + movapd 2 * SIZE(AA), %xmm7 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm1, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movddup 0 * SIZE(AA), %xmm2 + movddup 1 * SIZE(AA), %xmm3 + + movapd %xmm5, %xmm4 + movapd %xmm7, %xmm6 + + SHUFPD_1 %xmm4, %xmm4 + SHUFPD_1 %xmm6, %xmm6 + + xorpd %xmm1, %xmm4 + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm4, %xmm5 + addpd %xmm6, %xmm7 +#endif + +#ifdef RN + movddup 0 * SIZE(BB), %xmm2 + movddup 1 * SIZE(BB), %xmm3 + + movapd %xmm5, %xmm4 + SHUFPD_1 %xmm4, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 + + movddup 2 * SIZE(BB), %xmm2 + movddup 3 * SIZE(BB), %xmm3 + + movapd %xmm5, %xmm4 + movapd %xmm5, %xmm6 + SHUFPD_1 %xmm6, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm4 + mulpd %xmm3, %xmm6 + + subpd %xmm4, %xmm7 + subpd %xmm6, %xmm7 + + movddup 6 * SIZE(BB), %xmm2 + movddup 7 * SIZE(BB), %xmm3 + + movapd %xmm7, %xmm6 + SHUFPD_1 %xmm6, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm6, %xmm7 +#endif + +#ifdef RT + movddup 6 * SIZE(BB), %xmm2 + movddup 7 * SIZE(BB), %xmm3 + + movapd %xmm7, %xmm6 + SHUFPD_1 %xmm6, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm7 + mulpd %xmm3, %xmm6 + + addpd %xmm6, %xmm7 + + movddup 4 * SIZE(BB), %xmm2 + movddup 5 * SIZE(BB), %xmm3 + + movapd %xmm7, %xmm4 + movapd %xmm7, %xmm6 + SHUFPD_1 %xmm6, %xmm6 + + xorpd %xmm1, %xmm6 + + mulpd %xmm2, %xmm4 + mulpd %xmm3, %xmm6 + + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm5 + + movddup 0 * SIZE(BB), %xmm2 + movddup 1 * SIZE(BB), %xmm3 + + movapd %xmm5, %xmm4 + SHUFPD_1 %xmm4, %xmm4 + + xorpd %xmm1, %xmm4 + + mulpd %xmm2, %xmm5 + mulpd %xmm3, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + + movlpd %xmm5, 0 * SIZE(CO1) + movhpd %xmm5, 1 * SIZE(CO1) + + movlpd %xmm7, 0 * SIZE(CO1, LDC) + movhpd %xmm7, 1 * SIZE(CO1, LDC) + +#if defined(LN) || defined(LT) + movapd %xmm5, 0 * SIZE(BB) + movapd %xmm7, 2 * SIZE(BB) +#else + movapd %xmm5, 0 * SIZE(AA) + movapd %xmm7, 2 * SIZE(AA) + +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L10 + ALIGN_4 + +.L99: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_4 + +.L500: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S b/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S new file mode 100644 index 0000000000..ebff425c02 --- /dev/null +++ 
b/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S @@ -0,0 +1,1966 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 16 + +#define M 4 + STACK + ARGS(%esp) +#define N 8 + STACK + ARGS(%esp) +#define K 12 + STACK + ARGS(%esp) +#define A 24 + STACK + ARGS(%esp) +#define ARG_B 28 + STACK + ARGS(%esp) +#define C 32 + STACK + ARGS(%esp) +#define ARG_LDC 36 + STACK + ARGS(%esp) +#define OFFSET 40 + STACK + ARGS(%esp) + +#define J 0 + STACK(%esp) +#define KK 4 + STACK(%esp) +#define KKK 8 + STACK(%esp) +#define AORIG 12 + STACK(%esp) + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#ifdef NEHALEM +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 84 +#endif + +#ifdef ATOM +#define PREFETCH prefetcht0 +#define PREFETCHSIZE 84 +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (16 * 2) +#endif + +#define B %edi +#define LDC %ebp +#define AA %edx +#define BB %ecx +#define CO1 %esi + +#define ADD1 addps +#define ADD2 addps + + PROLOGUE + + subl $ARGS, %esp + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl ARG_B, B + movl ARG_LDC, LDC + movl OFFSET, %eax +#ifdef RN + negl %eax +#endif + movl %eax, KK + + movl M, %ebx + testl %ebx, %ebx + jle .L999 + + subl $-32 * SIZE, A + subl $-32 * SIZE, B + + sall $ZBASE_SHIFT, LDC + +#ifdef LN + movl M, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + sall $ZBASE_SHIFT, %eax + imull K, %eax + addl %eax, B + + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + andl $1, %eax + jle .L100 + +#if defined(LT) || defined(RN) + movl A, %eax + movl %eax, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, B +#endif + +#ifdef RT + subl LDC, C +#endif + movl C, CO1 +#ifndef RT + addl LDC, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $1, %ebx + jle .L130 + ALIGN_4 + +.L110: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movsd -32 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + movhps -30 * SIZE(BB), %xmm1 + pxor %xmm4, %xmm4 +#ifdef LN + prefetcht0 -4 * SIZE(CO1) +#else + prefetcht0 3 * SIZE(CO1) +#endif + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L112 + ALIGN_4 + +.L111: + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -28 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -24 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, 
%xmm3 + movaps -20 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -16 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -12 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -8 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + movaps -4 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps 0 * SIZE(AA), %xmm0 + + subl $-32 * SIZE, AA + subl $-16 * SIZE, BB + + decl %eax + jne .L111 + ALIGN_4 + +.L112: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L114 + ALIGN_4 + +.L113: + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movaps -28 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L113 + ALIGN_4 + +.L114: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), BB +#endif + + addps %xmm2, %xmm4 + addps %xmm3, %xmm5 + + pshufd $0xb1, %xmm5, %xmm5 + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#ifndef CONJ + shufps $0xb1, %xmm0, %xmm0 + + pxor %xmm0, %xmm5 +#else +#if defined(LN) || defined(LT) + pxor %xmm0, %xmm4 +#else + pxor %xmm0, %xmm5 +#endif +#endif + + addps %xmm5, %xmm4 + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm5 + unpcklpd %xmm6, %xmm4 + unpckhpd %xmm6, %xmm5 + + movsd -32 * SIZE(BB), %xmm2 + movsd -30 * SIZE(BB), %xmm3 + + subps %xmm4, %xmm2 + subps %xmm5, %xmm3 +#else + movaps -32 * SIZE(AA), %xmm1 + + subps %xmm4, %xmm1 +#endif + +#ifdef LN + movaps -28 * SIZE(AA), %xmm5 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm0, %xmm3 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm3 + addps %xmm4, %xmm3 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm1 + subps %xmm4, %xmm2 + subps %xmm1, %xmm2 + + movaps -32 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 +#endif + +#ifdef LT + movaps -32 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + 
addps %xmm4, %xmm2 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm1 + subps %xmm4, %xmm3 + subps %xmm1, %xmm3 + + movaps -28 * SIZE(AA), %xmm5 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm0, %xmm3 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm3 + addps %xmm4, %xmm3 +#endif + +#if defined(RN) || defined(RT) + movaps -32 * SIZE(BB), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, -32 * SIZE(BB) + movlps %xmm3, -30 * SIZE(BB) + + movlps %xmm2, 0 * SIZE(CO1) + movlps %xmm3, 2 * SIZE(CO1) +#else + movaps %xmm1, -32 * SIZE(AA) + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 1), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L110 + ALIGN_4 + +.L130: + movl M, %ebx + andl $1, %ebx + jle .L149 + +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movsd -32 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L142 + ALIGN_4 + +.L141: + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -30 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -28 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -26 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -26 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -24 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -22 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -22 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -20 * SIZE(BB), %xmm1 + mulps %xmm0, 
%xmm3 + movsd -20 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -18 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -18 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -16 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -16 * SIZE(AA), %xmm0 + + subl $-16 * SIZE, AA + subl $-16 * SIZE, BB + + decl %eax + jne .L141 + ALIGN_4 + +.L142: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L144 + ALIGN_4 + +.L143: + addps %xmm2, %xmm4 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0x55, %xmm1, %xmm3 + movsd -30 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -30 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $2 * SIZE, BB + decl %eax + jg .L143 + ALIGN_4 + +.L144: +#if defined(LN) || defined(RT) + movl KK, %eax + subl $1, %eax + + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 1), BB +#endif + + addps %xmm2, %xmm4 + addps %xmm3, %xmm5 + + pshufd $0xb1, %xmm5, %xmm5 + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#ifndef CONJ + shufps $0xb1, %xmm0, %xmm0 + + pxor %xmm0, %xmm5 +#else +#if defined(LN) || defined(LT) + pxor %xmm0, %xmm4 +#else + pxor %xmm0, %xmm5 +#endif +#endif + + addps %xmm5, %xmm4 + +#if defined(LN) || defined(LT) + movsd -32 * SIZE(BB), %xmm2 + + subps %xmm4, %xmm2 +#else + movsd -32 * SIZE(AA), %xmm1 + + subps %xmm4, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 +#endif + +#if defined(RN) || defined(RT) + movaps -32 * SIZE(BB), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, -32 * SIZE(BB) + + movlps %xmm2, 0 * SIZE(CO1) +#else + movlps %xmm1, -32 * SIZE(AA) + + movlps %xmm1, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 1), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L149: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L100: + movl N, %eax + movl %eax, J + sarl $1, J + jle .L999 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movl A, %eax + movl %eax, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, B +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + movl C, CO1 +#ifndef RT + addl %eax, C +#endif + +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + +#ifdef LT + 
movl OFFSET, %eax + movl %eax, KK +#endif + + movl M, %ebx + sarl $1, %ebx + jle .L30 + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + +#ifdef LN + pxor %xmm4, %xmm4 + prefetcht0 -4 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 -4 * SIZE(CO1, LDC) + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 +#else + pxor %xmm4, %xmm4 + prefetcht0 3 * SIZE(CO1) + pxor %xmm5, %xmm5 + prefetcht0 3 * SIZE(CO1, LDC) + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L11: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -24 * SIZE(AA), %xmm0 + + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(AA), %xmm0 + + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -16 * SIZE(AA), %xmm0 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) + + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -12 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -12 * SIZE(AA), %xmm0 + + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -8 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -8 * SIZE(AA), %xmm0 + + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -4 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -4 * SIZE(AA), %xmm0 + + ADD2 %xmm2, %xmm7 + subl $-32 * SIZE, BB + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + subl $-32 * SIZE, AA + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, 
%xmm4 + movaps -32 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -32 * SIZE(AA), %xmm0 + + decl %eax + jne .L11 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + ALIGN_4 + +.L13: + ADD2 %xmm2, %xmm7 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm6 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD2 %xmm2, %xmm5 + pshufd $0xb1, %xmm3, %xmm2 + mulps %xmm0, %xmm3 + ADD1 %xmm1, %xmm4 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AA), %xmm0 + + addl $4 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L13 + ALIGN_4 + +.L14: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), BB +#endif + + ADD2 %xmm2, %xmm7 + pcmpeqb %xmm0, %xmm0 + ADD1 %xmm3, %xmm6 + psllq $63, %xmm0 + +#ifndef CONJ + pxor %xmm0, %xmm4 + pxor %xmm0, %xmm6 + + shufps $0xb1, %xmm0, %xmm0 +#else +#if defined(LN) || defined(LT) + pxor %xmm0, %xmm5 + pxor %xmm0, %xmm7 +#else + pshufd $0xb1, %xmm0, %xmm1 + + pxor %xmm1, %xmm5 + pxor %xmm1, %xmm7 +#endif +#endif + + haddps %xmm5, %xmm4 + haddps %xmm7, %xmm6 + + shufps $0xd8, %xmm4, %xmm4 + shufps $0xd8, %xmm6, %xmm6 + + movaps %xmm4, %xmm5 + shufps $0xe4, %xmm6, %xmm4 + shufps $0xe4, %xmm5, %xmm6 + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm5 + unpcklpd %xmm6, %xmm4 + unpckhpd %xmm6, %xmm5 + + movaps -32 * SIZE(BB), %xmm2 + movaps -28 * SIZE(BB), %xmm3 + + subps %xmm4, %xmm2 + subps %xmm5, %xmm3 +#else + movaps -32 * SIZE(AA), %xmm1 + movaps -28 * SIZE(AA), %xmm5 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm5 +#endif + +#ifdef LN + movaps -28 * SIZE(AA), %xmm5 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm0, %xmm3 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm3 + addps %xmm4, %xmm3 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm1 + subps %xmm4, %xmm2 + subps %xmm1, %xmm2 + + movaps -32 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 +#endif + +#ifdef LT + movaps -32 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm1 + subps %xmm4, %xmm3 + subps %xmm1, %xmm3 + + movaps -28 * SIZE(AA), %xmm5 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm0, %xmm3 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm3 + addps %xmm4, %xmm3 +#endif + +#ifdef RN + movaps -32 * 
SIZE(BB), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm2 + + subps %xmm3, %xmm5 + subps %xmm2, %xmm5 + + movaps -28 * SIZE(BB), %xmm4 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + + addps %xmm3, %xmm5 +#endif + +#ifdef RT + movaps -28 * SIZE(BB), %xmm4 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + + addps %xmm3, %xmm5 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm2 + + subps %xmm3, %xmm1 + subps %xmm2, %xmm1 + + movaps -32 * SIZE(BB), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, -32 * SIZE(BB) + movaps %xmm3, -28 * SIZE(BB) + + movlps %xmm2, 0 * SIZE(CO1) + movlps %xmm3, 2 * SIZE(CO1) + movhps %xmm2, 0 * SIZE(CO1, LDC) + movhps %xmm3, 2 * SIZE(CO1, LDC) +#else + movaps %xmm1, -32 * SIZE(AA) + movaps %xmm5, -28 * SIZE(AA) + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + + movlps %xmm5, 0 * SIZE(CO1, LDC) + movhps %xmm5, 2 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $2, KK +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx + jg .L10 + ALIGN_4 + +.L30: + movl M, %ebx + andl $1, %ebx + jle .L99 + +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + movl B, BB + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movsd -32 * SIZE(AA), %xmm0 + pxor %xmm2, %xmm2 + movaps -32 * SIZE(BB), %xmm1 + pxor %xmm3, %xmm3 + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L42 + ALIGN_4 + +.L41: + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, 
%xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -30 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -24 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -28 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -20 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -26 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -16 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -24 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -12 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -22 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -8 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -20 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -4 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -18 * SIZE(AA), %xmm0 + + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps 0 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -16 * SIZE(AA), %xmm0 + + subl $-16 * SIZE, AA + subl $-32 * SIZE, BB + decl %eax + jne .L41 + ALIGN_4 + +.L42: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L44 + ALIGN_4 + +.L43: + addps %xmm2, %xmm6 + pshufd $0x00, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm7 + pshufd $0x55, %xmm1, %xmm3 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm4 + pshufd $0xaa, %xmm1, %xmm2 + mulps %xmm0, %xmm2 + addps %xmm3, %xmm5 + pshufd $0xff, %xmm1, %xmm3 + movaps -28 * SIZE(BB), %xmm1 + mulps %xmm0, %xmm3 + movsd -30 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $4 * SIZE, BB + decl %eax + jg .L43 + ALIGN_4 + +.L44: +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), BB +#endif + + addps %xmm2, %xmm6 + addps %xmm3, %xmm7 + + pshufd $0xb1, %xmm5, %xmm5 + pcmpeqb 
%xmm0, %xmm0 + pshufd $0xb1, %xmm7, %xmm7 + psllq $63, %xmm0 + +#ifndef CONJ + shufps $0xb1, %xmm0, %xmm0 + + pxor %xmm0, %xmm5 + pxor %xmm0, %xmm7 +#else +#if defined(LN) || defined(LT) + pxor %xmm0, %xmm4 + pxor %xmm0, %xmm6 +#else + pxor %xmm0, %xmm5 + pxor %xmm0, %xmm7 +#endif +#endif + + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + +#if defined(LN) || defined(LT) + unpcklpd %xmm6, %xmm4 + + movaps -32 * SIZE(BB), %xmm2 + + subps %xmm4, %xmm2 +#else + movsd -32 * SIZE(AA), %xmm1 + movsd -30 * SIZE(AA), %xmm5 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm5 +#endif + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 +#endif + +#ifdef RN + movaps -32 * SIZE(BB), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm2 + + subps %xmm3, %xmm5 + subps %xmm2, %xmm5 + + movaps -28 * SIZE(BB), %xmm4 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + + addps %xmm3, %xmm5 +#endif + +#ifdef RT + movaps -28 * SIZE(BB), %xmm4 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + + addps %xmm3, %xmm5 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm2 + + subps %xmm3, %xmm1 + subps %xmm2, %xmm1 + + movaps -32 * SIZE(BB), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, -32 * SIZE(BB) + + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 0 * SIZE(CO1, LDC) +#else + movlps %xmm1, -32 * SIZE(AA) + movlps %xmm5, -30 * SIZE(AA) + + movlps %xmm1, 0 * SIZE(CO1) + movlps %xmm5, 0 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (BB, %eax, 2), BB +#endif + +#ifdef LN + subl $1, KK +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L99: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl BB, B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK 
+#endif + + decl J # j -- + jg .L01 + ALIGN_4 + +.L999: + popl %ebx + popl %esi + popl %edi + popl %ebp + + addl $ARGS, %esp + ret + + EPILOGUE diff --git a/kernel/x86/ztrsm_kernel_RT_2x2_sse.S b/kernel/x86/ztrsm_kernel_RT_2x2_sse.S new file mode 100644 index 0000000000..bce0b0252e --- /dev/null +++ b/kernel/x86/ztrsm_kernel_RT_2x2_sse.S @@ -0,0 +1,2202 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 16 +#define ARGS 0 + +#define STACK_M 4 + STACK + ARGS(%esi) +#define STACK_N 8 + STACK + ARGS(%esi) +#define STACK_K 12 + STACK + ARGS(%esi) +#define STACK_A 24 + STACK + ARGS(%esi) +#define STACK_B 28 + STACK + ARGS(%esi) +#define STACK_C 32 + STACK + ARGS(%esi) +#define STACK_LDC 36 + STACK + ARGS(%esi) +#define STACK_OFFT 40 + STACK + ARGS(%esi) + +#define POSINV 0(%esp) +#define K 16(%esp) +#define N 20(%esp) +#define M 24(%esp) +#define A 28(%esp) +#define C 32(%esp) +#define J 36(%esp) +#define OLD_STACK 40(%esp) +#define OFFSET 48(%esp) +#define KK 52(%esp) +#define KKK 56(%esp) +#define AORIG 60(%esp) +#define BORIG 64(%esp) +#define BUFFER 128(%esp) + +#define B %edi +#define LDC %ebp +#define AA %edx +#define BB %ecx +#define CO1 %esi + +#define STACK_ALIGN 4096 +#define STACK_OFFSET 1024 + +#if defined(OPTERON) || defined(BARCELONA) +#define PREFETCHSIZE (16 * 10 + 8) +#define WPREFETCHSIZE 112 +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#endif + +#if defined(PENTIUM4) || defined(PENTIUMM) +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 168 +#define PREFETCHW prefetcht0 +#endif + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht1 +#define PREFETCHSIZE 168 +#define PREFETCHW prefetcht0 +#endif + +#if defined(OPTERON) || !defined(HAVE_SSE2) +#define movsd movlps +#endif + +#ifdef HAVE_SSE2 +#define xorps pxor +#endif + +#define KERNEL1(address) \ + mulps %xmm0, %xmm2; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ + addps %xmm2, %xmm4; \ + movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL2(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL3(address) \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm0, %xmm2; \ + mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm2, %xmm6; \ + movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm0, %xmm7; \ + movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL4(address) \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm0, %xmm3; \ + mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \ + addps %xmm3, %xmm6; \ + movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm0, %xmm7; \ + movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0 + +#define KERNEL5(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, 
%xmm4; \ + movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL6(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL7(address) \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm4; \ + movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + addps %xmm2, %xmm5; \ + movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + mulps %xmm1, %xmm2; \ + mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm2, %xmm6; \ + movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \ + addps %xmm1, %xmm7; \ + movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1 + +#define KERNEL8(address) \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm4; \ + movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + addps %xmm3, %xmm5; \ + movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + mulps %xmm1, %xmm3; \ + mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \ + addps %xmm3, %xmm6; \ + movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \ + addps %xmm1, %xmm7; \ + movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1; + + PROLOGUE + + pushl %ebp + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl %esp, %esi # save old stack + + subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp + andl $-STACK_ALIGN, %esp # align stack + addl $STACK_OFFSET, %esp + + STACK_TOUCHING + + movl STACK_M, %ebx + movl STACK_N, %eax + movl STACK_K, %ecx + movl STACK_A, %edx + + movl %ebx, M + movl %eax, N + movl %ecx, K + movl %edx, A + movl %esi, OLD_STACK + + movl STACK_B, %edi + movl STACK_C, %ebx + movss STACK_OFFT, %xmm4 + + xorps %xmm7, %xmm7 + pcmpeqb %xmm7, %xmm7 + pslld $31, %xmm7 + xorps %xmm2, %xmm2 + +#ifndef CONJ + movss %xmm7, 0 + POSINV + movss %xmm2, 4 + POSINV + movss %xmm7, 8 + POSINV + movss %xmm2, 12 + POSINV +#else + movss %xmm2, 0 + POSINV + movss %xmm7, 4 + POSINV + movss %xmm2, 8 + POSINV + movss %xmm7, 12 + POSINV +#endif + + EMMS + + movl %ebx, C + movl STACK_LDC, LDC + + movss %xmm4, OFFSET + movss %xmm4, KK + + sall $ZBASE_SHIFT, LDC + +#ifdef LN + movl M, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, C + imull K, %eax + addl %eax, A +#endif + +#ifdef RT + movl N, %eax + sall $ZBASE_SHIFT, %eax + imull K, %eax + addl %eax, B + + movl N, %eax + imull LDC, %eax + addl %eax, C +#endif + +#ifdef RN + negl KK +#endif + +#ifdef RT + movl N, %eax + subl OFFSET, %eax + movl %eax, KK +#endif + + movl N, %eax + andl $1, %eax + jle .L100 + ALIGN_4 + +.L101: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, %ecx + +#ifdef RT + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $ZBASE_SHIFT, %eax + addl %eax, B + leal (BB, %eax, 4), BB +#endif + +#if defined(LT) + movl OFFSET, %eax + movl 
%eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $2, %eax + jle .L103 + ALIGN_4 + +.L102: + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm4, 16 * SIZE(BB) + movaps %xmm5, 20 * SIZE(BB) + movaps %xmm6, 24 * SIZE(BB) + movaps %xmm7, 28 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $32 * SIZE, BB + decl %eax + jne .L102 + ALIGN_4 + +.L103: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $3, %eax + BRANCH + jle .L105 + ALIGN_4 + +.L104: +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + + addl $ 2 * SIZE, %edi + addl $ 8 * SIZE, %ecx + decl %eax + jne .L104 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movl A, %eax + movl %eax, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + +#ifdef RT + subl LDC, C +#endif + + movl C, CO1 + +#ifndef RT + addl LDC, C +#endif + + movl M, %ebx + sarl $1, %ebx + jle .L130 + ALIGN_4 + +.L110: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB # boffset1 = boffset + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movaps 0 * SIZE(AA), %xmm0 + movaps 16 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + + PREFETCHW 3 * SIZE(CO1) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L112 + ALIGN_4 + +.L111: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movaps 8 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movaps 12 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movaps 32 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movaps 20 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movaps 44 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movaps 24 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 64 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 28 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movaps 48 * 
SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 80 * SIZE(BB), %xmm3 + + addl $ 32 * SIZE, AA + addl $ 64 * SIZE, BB + decl %eax + jne .L111 + ALIGN_4 + +.L112: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L114 + ALIGN_4 + +.L113: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + movaps 4 * SIZE(AA), %xmm0 + + addl $ 4 * SIZE, AA + addl $ 8 * SIZE, BB + decl %eax + jg .L113 + ALIGN_4 + +.L114: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + movaps POSINV, %xmm0 + + shufps $0xb1, %xmm5, %xmm5 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm4 +#endif +#else + xorps %xmm0, %xmm5 +#endif + + addps %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $1, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 1), B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm5 + unpcklpd %xmm6, %xmm4 + unpckhpd %xmm6, %xmm5 + +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd 0 * SIZE(B), %xmm2 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd 2 * SIZE(B), %xmm3 + + subps %xmm4, %xmm2 + subps %xmm5, %xmm3 +#else + movaps 0 * SIZE(AA), %xmm1 + + subps %xmm4, %xmm1 +#endif + +#ifdef LN + movaps 4 * SIZE(AA), %xmm5 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm0, %xmm3 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm3 + addps %xmm4, %xmm3 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm1 + subps %xmm4, %xmm2 + subps %xmm1, %xmm2 + + movaps 0 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 +#endif + +#ifdef LT + movaps 0 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm1 + subps %xmm4, %xmm3 + subps %xmm1, %xmm3 + + movaps 4 * SIZE(AA), %xmm5 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm0, %xmm3 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm3 + addps %xmm4, %xmm3 +#endif + +#if defined(RN) || defined(RT) + movaps 0 * SIZE(B), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 +#endif + +#ifdef LN + subl $4 
* SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(B) + movlps %xmm3, 2 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + + movaps %xmm0, 8 * SIZE(BB) + movaps %xmm1, 12 * SIZE(BB) + + movlps %xmm2, 0 * SIZE(CO1) + movlps %xmm3, 2 * SIZE(CO1) +#else + movaps %xmm1, 0 * SIZE(AA) + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx # i -- + jg .L110 + ALIGN_4 + +.L130: + movl M, %ebx + andl $1, %ebx + jle .L149 + +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB # boffset1 = boffset + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $2 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 8 * SIZE(AA), %xmm1 + xorps %xmm5, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L142 + ALIGN_4 + +.L141: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 2 * SIZE(AA), %xmm0 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm6 + movaps 12 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + movsd 4 * SIZE(AA), %xmm0 + addps %xmm2, %xmm7 + movaps 32 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 6 * SIZE(AA), %xmm0 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm6 + movaps 28 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + movsd 16 * SIZE(AA), %xmm0 + addps %xmm3, %xmm7 + movaps 48 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 10 * SIZE(AA), %xmm1 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm6 + movaps 44 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + movsd 12 * SIZE(AA), %xmm1 + addps %xmm2, %xmm7 + movaps 64 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 14 * SIZE(AA), %xmm1 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm6 + movaps 60 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + movsd 24 * SIZE(AA), %xmm1 + addps %xmm3, %xmm7 + movaps 80 * SIZE(BB), %xmm3 + + addl $ 16 * SIZE, AA + addl $ 64 * SIZE, BB + decl %eax + jne .L141 + ALIGN_4 + +.L142: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L144 + ALIGN_4 + +.L143: + mulps %xmm0, %xmm2 + mulps 4 * SIZE(BB), %xmm0 + addps %xmm2, %xmm4 + movaps 8 * SIZE(BB), %xmm2 + addps %xmm0, %xmm5 + 
movsd 2 * SIZE(AA), %xmm0 + + addl $2 * SIZE, AA + addl $8 * SIZE, BB + decl %eax + jg .L143 + ALIGN_4 + +.L144: + addps %xmm6, %xmm4 + addps %xmm7, %xmm5 + + movaps POSINV, %xmm0 + + shufps $0xb1, %xmm5, %xmm5 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm4 +#endif +#else + xorps %xmm0, %xmm5 +#endif + + addps %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movl KK, %eax + subl $1, %eax + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ZBASE_SHIFT, %eax + addl %eax, AA + addl %eax, B + leal (BB, %eax, 4), BB +#endif + +#if defined(LN) || defined(LT) +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd 0 * SIZE(B), %xmm2 + + subps %xmm4, %xmm2 +#else +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(AA), %xmm1 + + subps %xmm4, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movaps 0 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 +#endif + +#if defined(RN) || defined(RT) + movaps 0 * SIZE(B), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + + movlps %xmm2, 0 * SIZE(CO1) +#else + movlps %xmm1, 0 * SIZE(AA) + + movlps %xmm1, 0 * SIZE(CO1) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $2 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L149: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#ifdef RN + addl $1, KK +#endif + +#ifdef RT + subl $1, KK +#endif + ALIGN_4 + +.L100: + movl N, %eax + movl %eax, J + sarl $1, J + jle .L999 + ALIGN_4 + +.L01: +#ifdef LN + movl OFFSET, %eax + addl M, %eax + movl %eax, KK +#endif + + leal BUFFER, %ecx + +#ifdef RT + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, B +#endif + +#if defined(LN) || defined(RT) + movl KK, %eax + movl B, BORIG + sall $1 + ZBASE_SHIFT, %eax + addl %eax, B + leal (BB, %eax, 4), BB +#endif + +#if defined(LT) + movl OFFSET, %eax + movl %eax, KK +#endif + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $1, %eax + jle .L03 + ALIGN_4 + +.L02: + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm4, 16 * SIZE(BB) + movaps %xmm5, 20 * 
SIZE(BB) + movaps %xmm6, 24 * SIZE(BB) + movaps %xmm7, 28 * SIZE(BB) + + addl $ 8 * SIZE, B + addl $32 * SIZE, BB + + decl %eax + jne .L02 + ALIGN_4 + +.L03: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $1, %eax + BRANCH + jle .L05 + ALIGN_4 + +.L04: + movaps 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm2, 8 * SIZE(BB) + movaps %xmm3, 12 * SIZE(BB) + + addl $ 4 * SIZE, B + ALIGN_4 + +.L05: +#if defined(LT) || defined(RN) + movl A, %eax + movl %eax, AA +#else + movl A, %eax + movl %eax, AORIG +#endif + + leal (, LDC, 2), %eax + +#ifdef RT + subl %eax, C +#endif + + movl C, CO1 + +#ifndef RT + addl %eax, C +#endif + + movl M, %ebx + sarl $1, %ebx + jle .L30 + ALIGN_4 + +.L10: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB # boffset1 = boffset + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $3 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + movaps 0 * SIZE(AA), %xmm0 + xorps %xmm4, %xmm4 + movaps 16 * SIZE(AA), %xmm1 + xorps %xmm5, %xmm5 + movaps 0 * SIZE(BB), %xmm2 + xorps %xmm6, %xmm6 + movaps 16 * SIZE(BB), %xmm3 + xorps %xmm7, %xmm7 + + PREFETCHW 3 * SIZE(CO1) + PREFETCHW 3 * SIZE(CO1, LDC) + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L15 + ALIGN_4 + +.L11: + KERNEL1(0 * 16) + KERNEL2(0 * 16) + KERNEL3(0 * 16) + KERNEL4(0 * 16) + KERNEL5(0 * 16) + KERNEL6(0 * 16) + KERNEL7(0 * 16) + KERNEL8(0 * 16) + + addl $ 32 * SIZE, AA + addl $128 * SIZE, BB + decl %eax + jne .L11 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L14 + ALIGN_4 + +.L13: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movaps 4 * SIZE(AA), %xmm0 + + addl $ 4 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L13 + ALIGN_4 + +.L14: + movaps POSINV, %xmm0 + + shufps $0xb1, %xmm5, %xmm5 + shufps $0xb1, %xmm7, %xmm7 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm0, %xmm5 + xorps %xmm0, %xmm7 +#else + xorps %xmm0, %xmm4 + xorps %xmm0, %xmm6 +#endif +#else + xorps %xmm0, %xmm5 + xorps %xmm0, %xmm7 +#endif + + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $2, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 2), AA + leal (B, %eax, 2), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + movaps %xmm4, %xmm5 + unpcklpd %xmm6, %xmm4 + unpckhpd %xmm6, %xmm5 + + movaps 0 * SIZE(B), %xmm2 + movaps 4 * SIZE(B), %xmm3 + + subps %xmm4, %xmm2 + subps %xmm5, %xmm3 +#else + movaps 0 * SIZE(AA), %xmm1 + movaps 4 * SIZE(AA), %xmm5 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm5 +#endif + +#ifdef LN + movaps 4 * SIZE(AA), %xmm5 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ 
+ xorps %xmm0, %xmm3 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm3 + addps %xmm4, %xmm3 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm1 + subps %xmm4, %xmm2 + subps %xmm1, %xmm2 + + movaps 0 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 +#endif + +#ifdef LT + movaps 0 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm1 + subps %xmm4, %xmm3 + subps %xmm1, %xmm3 + + movaps 4 * SIZE(AA), %xmm5 + + pshufd $0xee, %xmm5, %xmm6 + pshufd $0xbb, %xmm5, %xmm7 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm0, %xmm3 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm3 + addps %xmm4, %xmm3 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm2 + + subps %xmm3, %xmm5 + subps %xmm2, %xmm5 + + movaps 4 * SIZE(B), %xmm4 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + + addps %xmm3, %xmm5 +#endif + +#ifdef RT + movaps 4 * SIZE(B), %xmm4 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + + addps %xmm3, %xmm5 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm2 + + subps %xmm3, %xmm1 + subps %xmm2, %xmm1 + + movaps 0 * SIZE(B), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 +#endif + +#ifdef LN + subl $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, 0 * SIZE(B) + movaps %xmm3, 4 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, 
%xmm4 + pshufd $0xff, %xmm2, %xmm5 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm5, 12 * SIZE(BB) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm4 + pshufd $0xff, %xmm3, %xmm5 + + movaps %xmm0, 16 * SIZE(BB) + movaps %xmm1, 20 * SIZE(BB) + movaps %xmm4, 24 * SIZE(BB) + movaps %xmm5, 28 * SIZE(BB) + + movlps %xmm2, 0 * SIZE(CO1) + movlps %xmm3, 2 * SIZE(CO1) + movhps %xmm2, 0 * SIZE(CO1, LDC) + movhps %xmm3, 2 * SIZE(CO1, LDC) +#else + movaps %xmm1, 0 * SIZE(AA) + movaps %xmm5, 4 * SIZE(AA) + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + + movlps %xmm5, 0 * SIZE(CO1, LDC) + movhps %xmm5, 2 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $8 * SIZE, B +#endif +#endif + +#ifdef LN + subl $2, KK + movl BORIG, B +#endif + +#ifdef LT + addl $2, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $1 + ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + + decl %ebx + jg .L10 + ALIGN_4 + +.L30: + movl M, %ebx + andl $1, %ebx + jle .L99 + ALIGN_4 + +.L40: +#ifdef LN + movl K, %eax + sall $ZBASE_SHIFT, %eax + subl %eax, AORIG +#endif + +#if defined(LN) || defined(RT) + movl AORIG, %eax + movl %eax, AA + + movl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#endif + + leal BUFFER, BB # boffset1 = boffset + +#if defined(LN) || defined(RT) + movl KK, %eax + sall $3 + ZBASE_SHIFT, %eax + addl %eax, BB +#endif + + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(AA), %xmm0 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 8 * SIZE(AA), %xmm1 + movaps 0 * SIZE(BB), %xmm2 + movaps 16 * SIZE(BB), %xmm3 + +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + sarl $3, %eax + je .L42 + ALIGN_4 + +.L41: + mulps %xmm0, %xmm2 + prefetcht1 (PREFETCHSIZE + 0) * SIZE(AA) + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 32 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movsd 2 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 20 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm5 + movaps 24 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + mulps 28 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 48 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movsd 4 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 36 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 40 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + mulps 44 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 64 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movsd 6 * SIZE(AA), %xmm0 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm4 + movaps 52 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm5 + movaps 56 * SIZE(BB), %xmm3 + mulps %xmm0, %xmm3 + mulps 60 * SIZE(BB), %xmm0 + addps %xmm3, %xmm6 + movaps 80 * SIZE(BB), %xmm3 + addps %xmm0, %xmm7 + movsd 16 * SIZE(AA), %xmm0 + mulps %xmm1, %xmm2 +#if defined(OPTERON) || defined(BARCELONA) + prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) +#endif + addps %xmm2, %xmm4 + movaps 68 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm5 + movaps 72 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + mulps 76 * SIZE(BB), %xmm1 + addps %xmm2, %xmm6 + movaps 
96 * SIZE(BB), %xmm2 + addps %xmm1, %xmm7 + movsd 10 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 84 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movaps 88 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + mulps 92 * SIZE(BB), %xmm1 + addps %xmm3, %xmm6 + movaps 112 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movsd 12 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm4 + movaps 100 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm5 + movaps 104 * SIZE(BB), %xmm2 + mulps %xmm1, %xmm2 + mulps 108 * SIZE(BB), %xmm1 + addps %xmm2, %xmm6 + movaps 128 * SIZE(BB), %xmm2 + addps %xmm1, %xmm7 + movsd 14 * SIZE(AA), %xmm1 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm4 + movaps 116 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + addps %xmm3, %xmm5 + movaps 120 * SIZE(BB), %xmm3 + mulps %xmm1, %xmm3 + mulps 124 * SIZE(BB), %xmm1 + addps %xmm3, %xmm6 + movaps 144 * SIZE(BB), %xmm3 + addps %xmm1, %xmm7 + movsd 24 * SIZE(AA), %xmm1 + addl $ 16 * SIZE, AA + addl $128 * SIZE, BB + decl %eax + jne .L41 + ALIGN_4 + +.L42: +#if defined(LT) || defined(RN) + movl KK, %eax +#else + movl K, %eax + subl KK, %eax +#endif + andl $7, %eax # if (k & 1) + BRANCH + je .L44 + ALIGN_4 + +.L43: + mulps %xmm0, %xmm2 + addps %xmm2, %xmm4 + movaps 4 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + addps %xmm2, %xmm5 + movaps 8 * SIZE(BB), %xmm2 + mulps %xmm0, %xmm2 + mulps 12 * SIZE(BB), %xmm0 + addps %xmm2, %xmm6 + movaps 16 * SIZE(BB), %xmm2 + addps %xmm0, %xmm7 + movsd 2 * SIZE(AA), %xmm0 + + addl $ 2 * SIZE, AA + addl $16 * SIZE, BB + decl %eax + jg .L43 + ALIGN_4 + +.L44: + movaps POSINV, %xmm0 + + shufps $0xb1, %xmm5, %xmm5 + shufps $0xb1, %xmm7, %xmm7 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm0, %xmm5 + xorps %xmm0, %xmm7 +#else + xorps %xmm0, %xmm4 + xorps %xmm0, %xmm6 +#endif +#else + xorps %xmm0, %xmm5 + xorps %xmm0, %xmm7 +#endif + + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + +#if defined(LN) || defined(RT) + movl KK, %eax +#ifdef LN + subl $1, %eax +#else + subl $2, %eax +#endif + + movl AORIG, AA + movl BORIG, B + leal BUFFER, BB + + sall $ZBASE_SHIFT, %eax + leal (AA, %eax, 1), AA + leal (B, %eax, 2), B + leal (BB, %eax, 8), BB +#endif + +#if defined(LN) || defined(LT) + unpcklpd %xmm6, %xmm4 + + movaps 0 * SIZE(B), %xmm2 + + subps %xmm4, %xmm2 +#else +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(AA), %xmm1 +#ifdef movsd + xorps %xmm5, %xmm5 +#endif + movsd 2 * SIZE(AA), %xmm5 + + subps %xmm4, %xmm1 + subps %xmm6, %xmm5 +#endif + +#if defined(LN) || defined(LT) + movaps 0 * SIZE(AA), %xmm5 + + pshufd $0x44, %xmm5, %xmm6 + pshufd $0x11, %xmm5, %xmm7 + + pshufd $0xa0, %xmm2, %xmm4 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm4 +#endif + + mulps %xmm6, %xmm4 + mulps %xmm7, %xmm2 + addps %xmm4, %xmm2 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm2 + + subps %xmm3, %xmm5 + subps %xmm2, %xmm5 + + movaps 4 * SIZE(B), %xmm4 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, 
%xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + + addps %xmm3, %xmm5 +#endif + +#ifdef RT + movaps 4 * SIZE(B), %xmm4 + + pshufd $0xee, %xmm4, %xmm6 + pshufd $0xbb, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm0, %xmm5 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm5 + + addps %xmm3, %xmm5 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm5, %xmm3 + pshufd $0xf5, %xmm5, %xmm2 + +#ifndef CONJ + xorps %xmm0, %xmm2 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm2 + + subps %xmm3, %xmm1 + subps %xmm2, %xmm1 + + movaps 0 * SIZE(B), %xmm4 + + pshufd $0x44, %xmm4, %xmm6 + pshufd $0x11, %xmm4, %xmm7 + + pshufd $0xa0, %xmm1, %xmm3 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm0, %xmm1 +#else + xorps %xmm0, %xmm3 +#endif + + mulps %xmm6, %xmm3 + mulps %xmm7, %xmm1 + + addps %xmm3, %xmm1 +#endif + +#ifdef LN + subl $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, 0 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm5 + + movaps %xmm0, 0 * SIZE(BB) + movaps %xmm1, 4 * SIZE(BB) + movaps %xmm4, 8 * SIZE(BB) + movaps %xmm5, 12 * SIZE(BB) + + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 0 * SIZE(CO1, LDC) +#else + movlps %xmm1, 0 * SIZE(AA) + movlps %xmm5, 2 * SIZE(AA) + + movlps %xmm1, 0 * SIZE(CO1) + movlps %xmm5, 0 * SIZE(CO1, LDC) +#endif + +#ifndef LN + addl $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $ZBASE_SHIFT, %eax + addl %eax, AA +#ifdef LT + addl $4 * SIZE, B +#endif +#endif + +#ifdef LN + subl $1, KK + movl BORIG, B +#endif + +#ifdef LT + addl $1, KK +#endif + +#ifdef RT + movl K, %eax + movl BORIG, B + sall $ZBASE_SHIFT, %eax + addl %eax, AORIG +#endif + ALIGN_4 + +.L99: +#ifdef LN + movl K, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#if defined(LT) || defined(RN) + movl K, %eax + subl KK, %eax + sall $1 + ZBASE_SHIFT, %eax + addl %eax, B +#endif + +#ifdef RN + addl $2, KK +#endif + +#ifdef RT + subl $2, KK +#endif + + decl J # j -- + jg .L01 + ALIGN_4 + + +.L999: + EMMS + + movl OLD_STACK, %esp + popl %ebx + popl %esi + popl %edi + popl %ebp + ret + + EPILOGUE diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL new file mode 100644 index 0000000000..3d980f98fc --- /dev/null +++ b/kernel/x86_64/KERNEL @@ -0,0 +1,456 @@ +ifndef SAMAXKERNEL +SAMAXKERNEL = amax_sse.S +endif + +ifndef DAMAXKERNEL +DAMAXKERNEL = amax_sse2.S +endif + +ifndef QAMAXKERNEL +QAMAXKERNEL = amax.S +endif + +ifndef CAMAXKERNEL +CAMAXKERNEL = zamax_sse.S +endif + +ifndef ZAMAXKERNEL +ZAMAXKERNEL = zamax_sse2.S +endif + +ifndef XAMAXKERNEL +XAMAXKERNEL = zamax.S +endif + +ifndef SASUMKERNEL +SASUMKERNEL = asum_sse.S +endif + +ifndef DASUMKERNEL +DASUMKERNEL = asum_sse2.S +endif + +ifndef CASUMKERNEL +CASUMKERNEL = zasum_sse.S +endif + +ifndef ZASUMKERNEL +ZASUMKERNEL = zasum_sse2.S +endif + +ifndef QASUMKERNEL +QASUMKERNEL = asum.S +endif + +ifndef XASUMKERNEL +XASUMKERNEL = zasum.S +endif + +ifndef SAMINKERNEL +SAMINKERNEL = amax_sse.S +endif + +ifndef DAMINKERNEL +DAMINKERNEL = amax_sse2.S +endif + +ifndef QAMINKERNEL +QAMINKERNEL = amax.S +endif + +ifndef CAMINKERNEL +CAMINKERNEL = zamax_sse.S +endif + +ifndef ZAMINKERNEL +ZAMINKERNEL = zamax_sse2.S +endif + +ifndef XAMINKERNEL +XAMINKERNEL = zamax.S +endif + 
+ifndef SAXPYKERNEL +SAXPYKERNEL = axpy_sse.S +endif + +ifndef DAXPYKERNEL +DAXPYKERNEL = axpy_sse2.S +endif + +ifndef CAXPYKERNEL +CAXPYKERNEL = zaxpy_sse.S +endif + +ifndef ZAXPYKERNEL +ZAXPYKERNEL = zaxpy_sse2.S +endif + +ifndef QAXPYKERNEL +QAXPYKERNEL = axpy.S +endif + +ifndef XAXPYKERNEL +XAXPYKERNEL = zaxpy.S +endif + +ifndef SCOPYKERNEL +SCOPYKERNEL = copy_sse.S +endif + +ifndef DCOPYKERNEL +DCOPYKERNEL = copy_sse2.S +endif + +ifndef CCOPYKERNEL +CCOPYKERNEL = zcopy_sse.S +endif + +ifndef ZCOPYKERNEL +ZCOPYKERNEL = zcopy_sse2.S +endif + +ifndef QCOPYKERNEL +QCOPYKERNEL = copy.S +endif + +ifndef XCOPYKERNEL +XCOPYKERNEL = zcopy.S +endif + +ifndef SDOTKERNEL +SDOTKERNEL = dot_sse.S +endif + +ifndef DDOTKERNEL +DDOTKERNEL = dot_sse2.S +endif + +ifndef CDOTKERNEL +CDOTKERNEL = zdot_sse.S +endif + +ifndef ZDOTKERNEL +ZDOTKERNEL = zdot_sse2.S +endif + +ifndef QDOTKERNEL +QDOTKERNEL = dot.S +endif + +ifndef XDOTKERNEL +XDOTKERNEL = zdot.S +endif + +ifndef ISAMAXKERNEL +ISAMAXKERNEL = iamax_sse.S +endif + +ifndef IDAMAXKERNEL +IDAMAXKERNEL = iamax_sse2.S +endif + +ifndef IQAMAXKERNEL +IQAMAXKERNEL = iamax.S +endif + +ifndef ICAMAXKERNEL +ICAMAXKERNEL = izamax_sse.S +endif + +ifndef IZAMAXKERNEL +IZAMAXKERNEL = izamax_sse2.S +endif + +ifndef IXAMAXKERNEL +IXAMAXKERNEL = izamax.S +endif + +ifndef ISAMINKERNEL +ISAMINKERNEL = iamax_sse.S +endif + +ifndef IDAMINKERNEL +IDAMINKERNEL = iamax_sse2.S +endif + +ifndef IQAMINKERNEL +IQAMINKERNEL = iamax.S +endif + +ifndef ICAMINKERNEL +ICAMINKERNEL = izamax_sse.S +endif + +ifndef IZAMINKERNEL +IZAMINKERNEL = izamax_sse2.S +endif + +ifndef IXAMINKERNEL +IXAMINKERNEL = izamax.S +endif + +ifndef ISMAXKERNEL +ISMAXKERNEL = iamax_sse.S +endif + +ifndef IDMAXKERNEL +IDMAXKERNEL = iamax_sse2.S +endif + +ifndef IQMAXKERNEL +IQMAXKERNEL = iamax.S +endif + +ifndef ISMINKERNEL +ISMINKERNEL = iamax_sse.S +endif + +ifndef IDMINKERNEL +IDMINKERNEL = iamax_sse2.S +endif + +ifndef IQMINKERNEL +IQMINKERNEL = iamax.S +endif + +ifndef SMAXKERNEL +SMAXKERNEL = amax_sse.S +endif + +ifndef DMAXKERNEL +DMAXKERNEL = amax_sse2.S +endif + +ifndef QMAXKERNEL +QMAXKERNEL = amax.S +endif + +ifndef SMINKERNEL +SMINKERNEL = amax_sse.S +endif + +ifndef DMINKERNEL +DMINKERNEL = amax_sse2.S +endif + +ifndef QMINKERNEL +QMINKERNEL = amax.S +endif + +ifndef SNRM2KERNEL +SNRM2KERNEL = nrm2_sse.S +endif + +ifndef DNRM2KERNEL +DNRM2KERNEL = nrm2.S +endif + +ifndef QNRM2KERNEL +QNRM2KERNEL = nrm2.S +endif + +ifndef CNRM2KERNEL +CNRM2KERNEL = znrm2_sse.S +endif + +ifndef ZNRM2KERNEL +ZNRM2KERNEL = znrm2.S +endif + +ifndef XNRM2KERNEL +XNRM2KERNEL = znrm2.S +endif + +ifndef SROTKERNEL +SROTKERNEL = rot_sse.S +endif + +ifndef DROTKERNEL +DROTKERNEL = rot_sse2.S +endif + +ifndef QROTKERNEL +QROTKERNEL = rot.S +endif + +ifndef CROTKERNEL +CROTKERNEL = zrot_sse.S +endif + +ifndef ZROTKERNEL +ZROTKERNEL = zrot_sse2.S +endif + +ifndef XROTKERNEL +XROTKERNEL = zrot.S +endif + +ifndef SSCALKERNEL +SSCALKERNEL = scal_sse.S +endif + +ifndef DSCALKERNEL +DSCALKERNEL = scal_sse2.S +endif + +ifndef CSCALKERNEL +CSCALKERNEL = zscal_sse.S +endif + +ifndef ZSCALKERNEL +ZSCALKERNEL = zscal_sse2.S +endif + +ifndef ASCALKERNEL +QSCALKERNEL = scal.S +endif + +ifndef XSCALKERNEL +XSCALKERNEL = zscal.S +endif + +ifndef SSWAPKERNEL +SSWAPKERNEL = swap_sse.S +endif + +ifndef DSWAPKERNEL +DSWAPKERNEL = swap_sse2.S +endif + +ifndef CSWAPKERNEL +CSWAPKERNEL = zswap_sse.S +endif + +ifndef ZSWAPKERNEL +ZSWAPKERNEL = zswap_sse2.S +endif + +ifndef QSWAPKERNEL +QSWAPKERNEL = swap.S +endif + +ifndef XSWAPKERNEL 
+XSWAPKERNEL = zswap.S +endif + +ifndef SSYMV_U_KERNEL +SSYMV_U_KERNEL = symv_U_sse.S +endif + +ifndef SSYMV_L_KERNEL +SSYMV_L_KERNEL = symv_L_sse.S +endif + +ifndef DSYMV_U_KERNEL +DSYMV_U_KERNEL = symv_U_sse2.S +endif + +ifndef DSYMV_L_KERNEL +DSYMV_L_KERNEL = symv_L_sse2.S +endif + +ifndef ZSYMV_U_KERNEL +ZSYMV_U_KERNEL = zsymv_U_sse2.S +endif + +ifndef ZSYMV_L_KERNEL +ZSYMV_L_KERNEL = zsymv_L_sse2.S +endif + +ifndef ZHEMV_U_KERNEL +ZHEMV_U_KERNEL = zsymv_U_sse2.S +endif + +ifndef ZHEMV_L_KERNEL +ZHEMV_L_KERNEL = zsymv_L_sse2.S +endif + +GEMVDEP = ../l2param.h + +ifndef SGEMVNKERNEL +SGEMVNKERNEL = sgemv_n.S +endif + +ifndef SGEMVTKERNEL +SGEMVTKERNEL = sgemv_t.S +endif + +ifndef DGEMVNKERNEL +DGEMVNKERNEL = dgemv_n.S +endif + +ifndef DGEMVTKERNEL +DGEMVTKERNEL = dgemv_t.S +endif + +ifndef CGEMVNKERNEL +CGEMVNKERNEL = cgemv_n.S +endif + +ifndef CGEMVTKERNEL +CGEMVTKERNEL = cgemv_t.S +endif + +ifndef ZGEMVNKERNEL +ZGEMVNKERNEL = zgemv_n.S +endif + +ifndef ZGEMVTKERNEL +ZGEMVTKERNEL = zgemv_t.S +endif + +ifndef QGEMVNKERNEL +QGEMVNKERNEL = qgemv_n.S +endif + +ifndef QGEMVTKERNEL +QGEMVTKERNEL = qgemv_t.S +endif + +ifndef XGEMVNKERNEL +XGEMVNKERNEL = xgemv_n.S +endif + +ifndef XGEMVTKERNEL +XGEMVTKERNEL = xgemv_t.S +endif + +QGEMMKERNEL = qgemm_kernel_2x2.S +QGEMMINCOPY = +QGEMMITCOPY = +QGEMMONCOPY = ../generic/gemm_ncopy_2.c +QGEMMOTCOPY = ../generic/gemm_tcopy_2.c +QGEMMINCOPYOBJ = +QGEMMITCOPYOBJ = +QGEMMONCOPYOBJ = qgemm_oncopy$(TSUFFIX).$(SUFFIX) +QGEMMOTCOPYOBJ = qgemm_otcopy$(TSUFFIX).$(SUFFIX) + +XGEMMKERNEL = xgemm_kernel_1x1.S +XGEMMINCOPY = +XGEMMITCOPY = +XGEMMONCOPY = ../generic/zgemm_ncopy_1.c +XGEMMOTCOPY = ../generic/zgemm_tcopy_1.c +XGEMMINCOPYOBJ = +XGEMMITCOPYOBJ = +XGEMMONCOPYOBJ = xgemm_oncopy$(TSUFFIX).$(SUFFIX) +XGEMMOTCOPYOBJ = xgemm_otcopy$(TSUFFIX).$(SUFFIX) + +SGEMM_BETA = gemm_beta.S +DGEMM_BETA = gemm_beta.S +CGEMM_BETA = zgemm_beta.S +ZGEMM_BETA = zgemm_beta.S +QGEMM_BETA = ../generic/gemm_beta.c +XGEMM_BETA = ../generic/zgemm_beta.c + +QTRSMKERNEL_LN = qtrsm_kernel_LN_2x2.S +QTRSMKERNEL_LT = qtrsm_kernel_LT_2x2.S +QTRSMKERNEL_RN = qtrsm_kernel_LT_2x2.S +QTRSMKERNEL_RT = qtrsm_kernel_RT_2x2.S + +XTRSMKERNEL_LN = xtrsm_kernel_LT_1x1.S +XTRSMKERNEL_LT = xtrsm_kernel_LT_1x1.S +XTRSMKERNEL_RN = xtrsm_kernel_LT_1x1.S +XTRSMKERNEL_RT = xtrsm_kernel_LT_1x1.S + +XGEMM3MKERNEL = xgemm3m_kernel_2x2.S diff --git a/kernel/x86_64/KERNEL.ATOM b/kernel/x86_64/KERNEL.ATOM new file mode 100644 index 0000000000..cfbd05a628 --- /dev/null +++ b/kernel/x86_64/KERNEL.ATOM @@ -0,0 +1,85 @@ +DAMAXKERNEL = amax_atom.S +ZAMAXKERNEL = zamax_atom.S + +DAMINKERNEL = amax_atom.S +ZAMINKERNEL = zamax_atom.S + +DASUMKERNEL = asum_atom.S +ZASUMKERNEL = zasum_atom.S + +DAXPYKERNEL = axpy_atom.S +ZAXPYKERNEL = zaxpy_atom.S + +DDOTKERNEL = dot_atom.S +ZDOTKERNEL = zdot_atom.S + +DMAXKERNEL = amax_atom.S +DMINKERNEL = amax_atom.S + +DSCALKERNEL = scal_atom.S +ZSCALKERNEL = zscal_atom.S + +DGEMVNKERNEL = dgemv_n_atom.S +DGEMVTKERNEL = dgemv_t_atom.S +ZGEMVNKERNEL = zgemv_n_atom.S +ZGEMVTKERNEL = zgemv_t_atom.S + +SGEMMKERNEL = gemm_kernel_8x4_penryn.S +SGEMMINCOPY = ../generic/gemm_ncopy_8.c +SGEMMITCOPY = ../generic/gemm_tcopy_8.c +SGEMMONCOPY = gemm_ncopy_4.S +SGEMMOTCOPY = gemm_tcopy_4.S +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_4x2_atom.S +DGEMMINCOPY = gemm_ncopy_4.S +DGEMMITCOPY = gemm_tcopy_4.S 
+DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_4x2_penryn.S +CGEMMINCOPY = ../generic/zgemm_ncopy_4.c +CGEMMITCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPY = zgemm_ncopy_2.S +CGEMMOTCOPY = zgemm_tcopy_2.S +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_2x1_atom.S +ZGEMMINCOPY = zgemm_ncopy_2.S +ZGEMMITCOPY = zgemm_tcopy_2.S +ZGEMMONCOPY = ../generic/zgemm_ncopy_1.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_1.c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S +STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S +STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S +STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S + +DTRSMKERNEL_LN = trsm_kernel_LN_4x2_atom.S +DTRSMKERNEL_LT = trsm_kernel_LT_4x2_atom.S +DTRSMKERNEL_RN = trsm_kernel_LT_4x2_atom.S +DTRSMKERNEL_RT = trsm_kernel_RT_4x2_atom.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S +CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x1_atom.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x1_atom.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x1_atom.S +ZTRSMKERNEL_RT = ztrsm_kernel_LT_2x1_atom.S + +CGEMM3MKERNEL = zgemm3m_kernel_8x4_core2.S +ZGEMM3MKERNEL = zgemm3m_kernel_4x2_atom.S diff --git a/kernel/x86_64/KERNEL.BARCELONA b/kernel/x86_64/KERNEL.BARCELONA new file mode 100644 index 0000000000..051a52286b --- /dev/null +++ b/kernel/x86_64/KERNEL.BARCELONA @@ -0,0 +1,62 @@ +ZGEMVNKERNEL = zgemv_n_dup.S +ZGEMVTKERNEL = zgemv_t_dup.S + +SGEMMKERNEL = gemm_kernel_8x4_barcelona.S +SGEMMINCOPY = ../generic/gemm_ncopy_8.c +SGEMMITCOPY = ../generic/gemm_tcopy_8.c +SGEMMONCOPY = gemm_ncopy_4_opteron.S +SGEMMOTCOPY = gemm_tcopy_4_opteron.S +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_4x4_barcelona.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = gemm_ncopy_4_opteron.S +DGEMMOTCOPY = gemm_tcopy_4_opteron.S +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_4x2_barcelona.S +CGEMMINCOPY = ../generic/zgemm_ncopy_4.c +CGEMMITCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPY = zgemm_ncopy_2.S +CGEMMOTCOPY = zgemm_tcopy_2.S +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_2x2_barcelona.S +ZGEMMINCOPY = +ZGEMMITCOPY = +ZGEMMONCOPY = zgemm_ncopy_2.S +ZGEMMOTCOPY = zgemm_tcopy_2.S +ZGEMMINCOPYOBJ = +ZGEMMITCOPYOBJ = +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S +STRSMKERNEL_LT = 
trsm_kernel_LT_8x4_sse.S +STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S +STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S + +DTRSMKERNEL_LN = trsm_kernel_LN_4x4_barcelona.S +DTRSMKERNEL_LT = trsm_kernel_LT_4x4_barcelona.S +DTRSMKERNEL_RN = trsm_kernel_LT_4x4_barcelona.S +DTRSMKERNEL_RT = trsm_kernel_RT_4x4_barcelona.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S +CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse2.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse2.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse2.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse2.S + +CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S +ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S diff --git a/kernel/x86_64/KERNEL.CORE2 b/kernel/x86_64/KERNEL.CORE2 new file mode 100644 index 0000000000..8a07e80845 --- /dev/null +++ b/kernel/x86_64/KERNEL.CORE2 @@ -0,0 +1,60 @@ +SGEMMKERNEL = gemm_kernel_8x4_core2.S +SGEMMINCOPY = ../generic/gemm_ncopy_8.c +SGEMMITCOPY = ../generic/gemm_tcopy_8.c +SGEMMONCOPY = gemm_ncopy_4.S +SGEMMOTCOPY = gemm_tcopy_4.S +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_4x4_core2.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = gemm_ncopy_4.S +DGEMMOTCOPY = gemm_tcopy_4.S +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_4x2_core2.S +CGEMMINCOPY = ../generic/zgemm_ncopy_4.c +CGEMMITCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPY = zgemm_ncopy_2.S +CGEMMOTCOPY = zgemm_tcopy_2.S +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_2x2_core2.S +ZGEMMINCOPY = +ZGEMMITCOPY = +ZGEMMONCOPY = zgemm_ncopy_2.S +ZGEMMOTCOPY = zgemm_tcopy_2.S +ZGEMMINCOPYOBJ = +ZGEMMITCOPYOBJ = +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S +STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S +STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S +STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S + +DTRSMKERNEL_LN = trsm_kernel_LN_4x4_core2.S +DTRSMKERNEL_LT = trsm_kernel_LT_4x4_core2.S +DTRSMKERNEL_RN = trsm_kernel_LT_4x4_core2.S +DTRSMKERNEL_RT = trsm_kernel_RT_4x4_core2.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S +CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_core2.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_core2.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_core2.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_core2.S + +CGEMM3MKERNEL = zgemm3m_kernel_8x4_core2.S +ZGEMM3MKERNEL = zgemm3m_kernel_4x4_core2.S + diff --git a/kernel/x86_64/KERNEL.DUNNINGTON b/kernel/x86_64/KERNEL.DUNNINGTON new file mode 100644 index 0000000000..b96daa03fd --- /dev/null +++ b/kernel/x86_64/KERNEL.DUNNINGTON @@ -0,0 +1,59 @@ +SGEMMKERNEL = gemm_kernel_8x4_penryn.S +SGEMMINCOPY = ../generic/gemm_ncopy_8.c +SGEMMITCOPY = ../generic/gemm_tcopy_8.c +SGEMMONCOPY = gemm_ncopy_4.S +SGEMMOTCOPY = gemm_tcopy_4.S +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = 
sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_4x4_penryn.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = gemm_ncopy_4.S +DGEMMOTCOPY = gemm_tcopy_4.S +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_4x2_penryn.S +CGEMMINCOPY = ../generic/zgemm_ncopy_4.c +CGEMMITCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPY = zgemm_ncopy_2.S +CGEMMOTCOPY = zgemm_tcopy_2.S +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_2x2_penryn.S +ZGEMMINCOPY = +ZGEMMITCOPY = +ZGEMMONCOPY = zgemm_ncopy_2.S +ZGEMMOTCOPY = zgemm_tcopy_2.S +ZGEMMINCOPYOBJ = +ZGEMMITCOPYOBJ = +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S +STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S +STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S +STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S + +DTRSMKERNEL_LN = trsm_kernel_LN_4x4_penryn.S +DTRSMKERNEL_LT = trsm_kernel_LT_4x4_penryn.S +DTRSMKERNEL_RN = trsm_kernel_LT_4x4_penryn.S +DTRSMKERNEL_RT = trsm_kernel_RT_4x4_penryn.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S +CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_penryn.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_penryn.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_penryn.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_penryn.S + +CGEMM3MKERNEL = zgemm3m_kernel_8x4_penryn.S +ZGEMM3MKERNEL = zgemm3m_kernel_4x4_penryn.S diff --git a/kernel/x86_64/KERNEL.NANO b/kernel/x86_64/KERNEL.NANO new file mode 100644 index 0000000000..0b771a4518 --- /dev/null +++ b/kernel/x86_64/KERNEL.NANO @@ -0,0 +1,59 @@ +SGEMMKERNEL = gemm_kernel_4x8_nano.S +SGEMMINCOPY = gemm_ncopy_4.S +SGEMMITCOPY = gemm_tcopy_4.S +SGEMMONCOPY = ../generic/gemm_ncopy_8.c +SGEMMOTCOPY = ../generic/gemm_tcopy_8.c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_4x4_penryn.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = gemm_ncopy_4.S +DGEMMOTCOPY = gemm_tcopy_4.S +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_2x4_nehalem.S +CGEMMINCOPY = zgemm_ncopy_2.S +CGEMMITCOPY = zgemm_tcopy_2.S +CGEMMONCOPY = ../generic/zgemm_ncopy_4.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_2x2_penryn.S +ZGEMMINCOPY = +ZGEMMITCOPY = +ZGEMMONCOPY = zgemm_ncopy_2.S +ZGEMMOTCOPY = zgemm_tcopy_2.S +ZGEMMINCOPYOBJ = +ZGEMMITCOPYOBJ = +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_4x8_nehalem.S +STRSMKERNEL_LT = trsm_kernel_LT_4x8_nehalem.S +STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S +STRSMKERNEL_RT = 
trsm_kernel_RT_4x8_nehalem.S + +DTRSMKERNEL_LN = trsm_kernel_LN_4x4_penryn.S +DTRSMKERNEL_LT = trsm_kernel_LT_4x4_penryn.S +DTRSMKERNEL_RN = trsm_kernel_LT_4x4_penryn.S +DTRSMKERNEL_RT = trsm_kernel_RT_4x4_penryn.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_2x4_nehalem.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_2x4_nehalem.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_2x4_nehalem.S +CTRSMKERNEL_RT = ztrsm_kernel_RT_2x4_nehalem.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_penryn.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_penryn.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_penryn.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_penryn.S + +CGEMM3MKERNEL = zgemm3m_kernel_8x4_core2.S +ZGEMM3MKERNEL = zgemm3m_kernel_4x4_core2.S diff --git a/kernel/x86_64/KERNEL.NEHALEM b/kernel/x86_64/KERNEL.NEHALEM new file mode 100644 index 0000000000..58a883243f --- /dev/null +++ b/kernel/x86_64/KERNEL.NEHALEM @@ -0,0 +1,59 @@ +SGEMMKERNEL = gemm_kernel_4x8_nehalem.S +SGEMMINCOPY = gemm_ncopy_4.S +SGEMMITCOPY = gemm_tcopy_4.S +SGEMMONCOPY = ../generic/gemm_ncopy_8.c +SGEMMOTCOPY = ../generic/gemm_tcopy_8.c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_2x8_nehalem.S +DGEMMINCOPY = dgemm_ncopy_2.S +DGEMMITCOPY = dgemm_tcopy_2.S +DGEMMONCOPY = dgemm_ncopy_8.S +DGEMMOTCOPY = dgemm_tcopy_8.S +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_2x4_nehalem.S +CGEMMINCOPY = zgemm_ncopy_2.S +CGEMMITCOPY = zgemm_tcopy_2.S +CGEMMONCOPY = ../generic/zgemm_ncopy_4.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_1x4_nehalem.S +ZGEMMINCOPY = zgemm_ncopy_1.S +ZGEMMITCOPY = zgemm_tcopy_1.S +ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_4x8_nehalem.S +STRSMKERNEL_LT = trsm_kernel_LT_4x8_nehalem.S +STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S +STRSMKERNEL_RT = trsm_kernel_RT_4x8_nehalem.S + +DTRSMKERNEL_LN = trsm_kernel_LN_2x8_nehalem.S +DTRSMKERNEL_LT = trsm_kernel_LT_2x8_nehalem.S +DTRSMKERNEL_RN = trsm_kernel_LT_2x8_nehalem.S +DTRSMKERNEL_RT = trsm_kernel_RT_2x8_nehalem.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_2x4_nehalem.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_2x4_nehalem.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_2x4_nehalem.S +CTRSMKERNEL_RT = ztrsm_kernel_RT_2x4_nehalem.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x4_nehalem.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x4_nehalem.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x4_nehalem.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x4_nehalem.S + +CGEMM3MKERNEL = zgemm3m_kernel_4x8_nehalem.S +ZGEMM3MKERNEL = zgemm3m_kernel_2x8_nehalem.S diff --git a/kernel/x86_64/KERNEL.OPTERON b/kernel/x86_64/KERNEL.OPTERON new file mode 100644 index 0000000000..27fb785986 --- /dev/null +++ b/kernel/x86_64/KERNEL.OPTERON @@ -0,0 +1,59 @@ +SGEMMKERNEL = gemm_kernel_8x4_sse.S +SGEMMINCOPY = ../generic/gemm_ncopy_8.c 
+SGEMMITCOPY = ../generic/gemm_tcopy_8.c +SGEMMONCOPY = gemm_ncopy_4_opteron.S +SGEMMOTCOPY = gemm_tcopy_4_opteron.S +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_4x4_sse2.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = gemm_ncopy_4_opteron.S +DGEMMOTCOPY = gemm_tcopy_4_opteron.S +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_4x2_sse.S +CGEMMINCOPY = ../generic/zgemm_ncopy_4.c +CGEMMITCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPY = zgemm_ncopy_2.S +CGEMMOTCOPY = zgemm_tcopy_2.S +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_2x2_sse2.S +ZGEMMINCOPY = +ZGEMMITCOPY = +ZGEMMONCOPY = zgemm_ncopy_2.S +ZGEMMOTCOPY = zgemm_tcopy_2.S +ZGEMMINCOPYOBJ = +ZGEMMITCOPYOBJ = +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S +STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S +STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S +STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S + +DTRSMKERNEL_LN = trsm_kernel_LN_4x4_sse2.S +DTRSMKERNEL_LT = trsm_kernel_LT_4x4_sse2.S +DTRSMKERNEL_RN = trsm_kernel_LT_4x4_sse2.S +DTRSMKERNEL_RT = trsm_kernel_RT_4x4_sse2.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S +CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse2.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse2.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse2.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse2.S + +CGEMM3MKERNEL = zgemm3m_kernel_8x4_sse.S +ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse2.S diff --git a/kernel/x86_64/KERNEL.OPTERON_SSE3 b/kernel/x86_64/KERNEL.OPTERON_SSE3 new file mode 100644 index 0000000000..565daf3662 --- /dev/null +++ b/kernel/x86_64/KERNEL.OPTERON_SSE3 @@ -0,0 +1,62 @@ +ZGEMVNKERNEL = zgemv_n_dup.S +ZGEMVTKERNEL = zgemv_t_dup.S + +SGEMMKERNEL = gemm_kernel_8x4_sse.S +SGEMMINCOPY = ../generic/gemm_ncopy_8.c +SGEMMITCOPY = ../generic/gemm_tcopy_8.c +SGEMMONCOPY = gemm_ncopy_4_opteron.S +SGEMMOTCOPY = gemm_tcopy_4_opteron.S +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_4x4_sse2.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = gemm_ncopy_4_opteron.S +DGEMMOTCOPY = gemm_tcopy_4_opteron.S +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_4x2_sse.S +CGEMMINCOPY = ../generic/zgemm_ncopy_4.c +CGEMMITCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPY = zgemm_ncopy_2.S +CGEMMOTCOPY = zgemm_tcopy_2.S +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_2x2_sse2.S +ZGEMMINCOPY = +ZGEMMITCOPY = +ZGEMMONCOPY = zgemm_ncopy_2.S +ZGEMMOTCOPY = zgemm_tcopy_2.S +ZGEMMINCOPYOBJ = 
+ZGEMMITCOPYOBJ = +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S +STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S +STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S +STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S + +DTRSMKERNEL_LN = trsm_kernel_LN_4x4_sse2.S +DTRSMKERNEL_LT = trsm_kernel_LT_4x4_sse2.S +DTRSMKERNEL_RN = trsm_kernel_LT_4x4_sse2.S +DTRSMKERNEL_RT = trsm_kernel_RT_4x4_sse2.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S +CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse2.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse2.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse2.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse2.S + +CGEMM3MKERNEL = zgemm3m_kernel_8x4_sse.S +ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse2.S diff --git a/kernel/x86_64/KERNEL.PENRYN b/kernel/x86_64/KERNEL.PENRYN new file mode 100644 index 0000000000..b96daa03fd --- /dev/null +++ b/kernel/x86_64/KERNEL.PENRYN @@ -0,0 +1,59 @@ +SGEMMKERNEL = gemm_kernel_8x4_penryn.S +SGEMMINCOPY = ../generic/gemm_ncopy_8.c +SGEMMITCOPY = ../generic/gemm_tcopy_8.c +SGEMMONCOPY = gemm_ncopy_4.S +SGEMMOTCOPY = gemm_tcopy_4.S +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_4x4_penryn.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = gemm_ncopy_4.S +DGEMMOTCOPY = gemm_tcopy_4.S +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_4x2_penryn.S +CGEMMINCOPY = ../generic/zgemm_ncopy_4.c +CGEMMITCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPY = zgemm_ncopy_2.S +CGEMMOTCOPY = zgemm_tcopy_2.S +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_2x2_penryn.S +ZGEMMINCOPY = +ZGEMMITCOPY = +ZGEMMONCOPY = zgemm_ncopy_2.S +ZGEMMOTCOPY = zgemm_tcopy_2.S +ZGEMMINCOPYOBJ = +ZGEMMITCOPYOBJ = +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S +STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S +STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S +STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S + +DTRSMKERNEL_LN = trsm_kernel_LN_4x4_penryn.S +DTRSMKERNEL_LT = trsm_kernel_LT_4x4_penryn.S +DTRSMKERNEL_RN = trsm_kernel_LT_4x4_penryn.S +DTRSMKERNEL_RT = trsm_kernel_RT_4x4_penryn.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S +CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_penryn.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_penryn.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_penryn.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_penryn.S + +CGEMM3MKERNEL = zgemm3m_kernel_8x4_penryn.S +ZGEMM3MKERNEL = zgemm3m_kernel_4x4_penryn.S diff --git a/kernel/x86_64/KERNEL.PRESCOTT b/kernel/x86_64/KERNEL.PRESCOTT new file mode 100644 index 0000000000..e155531906 --- /dev/null +++ b/kernel/x86_64/KERNEL.PRESCOTT @@ -0,0 +1,63 @@ +ZGEMVNKERNEL = zgemv_n_dup.S +ZGEMVTKERNEL = zgemv_t_dup.S + +SGEMMKERNEL = gemm_kernel_8x4_sse3.S +SGEMMINCOPY = 
../generic/gemm_ncopy_8.c +SGEMMITCOPY = ../generic/gemm_tcopy_8.c +SGEMMONCOPY = gemm_ncopy_4.S +SGEMMOTCOPY = gemm_tcopy_4.S +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_4x4_sse3.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = gemm_ncopy_4.S +DGEMMOTCOPY = gemm_tcopy_4.S +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_4x2_sse3.S +CGEMMINCOPY = ../generic/zgemm_ncopy_4.c +CGEMMITCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPY = zgemm_ncopy_2.S +CGEMMOTCOPY = zgemm_tcopy_2.S +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_2x2_sse3.S +ZGEMMINCOPY = +ZGEMMITCOPY = +ZGEMMONCOPY = zgemm_ncopy_2.S +ZGEMMOTCOPY = zgemm_tcopy_2.S +ZGEMMINCOPYOBJ = +ZGEMMITCOPYOBJ = +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S +STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S +STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S +STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S + +DTRSMKERNEL_LN = trsm_kernel_LN_4x4_sse3.S +DTRSMKERNEL_LT = trsm_kernel_LT_4x4_sse3.S +DTRSMKERNEL_RN = trsm_kernel_LT_4x4_sse3.S +DTRSMKERNEL_RT = trsm_kernel_RT_4x4_sse3.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S +CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse3.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse3.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse3.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse3.S + +CGEMM3MKERNEL = zgemm3m_kernel_8x4_sse3.S +ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse3.S + diff --git a/kernel/x86_64/Makefile b/kernel/x86_64/Makefile new file mode 100644 index 0000000000..efae70d7b7 --- /dev/null +++ b/kernel/x86_64/Makefile @@ -0,0 +1,2 @@ +clean :: + diff --git a/kernel/x86_64/amax.S b/kernel/x86_64/amax.S new file mode 100644 index 0000000000..d096d883c8 --- /dev/null +++ b/kernel/x86_64/amax.S @@ -0,0 +1,307 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 +#define X ARG2 +#define INCX ARG3 +#define I %rax + +#ifndef USE_MIN +#define FMOV fcmovbe +#else +#define FMOV fcmovnbe +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + + salq $BASE_SHIFT, INCX + + fldz + + testq M, M + jle .L999 + testq INCX, INCX + jle .L999 + + ffreep %st + + FLD (X) +#ifdef USE_ABS + fabs +#endif + addq INCX, X + decq M + jle .L999 + + cmpq $SIZE, INCX + jne .L40 + + movq M, I + sarq $3, I + jle .L20 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + FLD 1 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + FLD 2 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + FLD 3 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + FLD 4 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + FLD 5 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + FLD 6 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + FLD 7 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + addq $8 * SIZE, X + + decq I + jg .L10 + ALIGN_4 + +.L20: + movq M, I + andq $7, I + jle .L999 + ALIGN_4 + + +.L21: + FLD 0 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + addq $1 * SIZE, X + decq I + jg .L21 + jmp .L999 + ALIGN_4 + +.L40: + movq M, I + sarq $3, I + jle .L60 + ALIGN_4 + +.L50: + FLD 0 * SIZE(X) + addq INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + FLD 0 * SIZE(X) + addq INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + FLD 0 * SIZE(X) + addq INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + FLD 0 * SIZE(X) + addq INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + FLD 0 * SIZE(X) + addq INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + FLD 0 * SIZE(X) + addq INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + FLD 0 * SIZE(X) + addq INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV 
%st(1), %st(0) + fxch %st(1) + ffreep %st + + FLD 0 * SIZE(X) + addq INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + decq I + jg .L50 + ALIGN_4 + +.L60: + movq M, I + andq $7, I + jle .L999 + ALIGN_4 + + +.L61: + FLD 0 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + addq INCX, X + decq I + jg .L61 + ALIGN_4 + +.L999: + ret + + EPILOGUE diff --git a/kernel/x86_64/amax_atom.S b/kernel/x86_64/amax_atom.S new file mode 100644 index 0000000000..fa7b9a3662 --- /dev/null +++ b/kernel/x86_64/amax_atom.S @@ -0,0 +1,460 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ + +#define I %rax + +#ifdef USE_MIN +#define maxsd minsd +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + + SAVEREGISTERS + + xorps %xmm0, %xmm0 + leaq (, INCX, SIZE), INCX + + testq M, M + jle .L999 + + testq INCX, INCX + jle .L999 + +#ifdef USE_ABS + pcmpeqb %xmm15, %xmm15 + psrlq $1, %xmm15 +#endif + + movsd (X), %xmm0 + addq INCX, X + +#ifdef USE_ABS + andps %xmm15, %xmm0 +#endif + decq M + jle .L999 + + movaps %xmm0, %xmm1 + movaps %xmm0, %xmm2 + movaps %xmm0, %xmm3 + + cmpq $SIZE, INCX + jne .L20 + + movq M, I + sarq $3, I + jle .L15 + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + movsd 2 * SIZE(X), %xmm6 + movsd 3 * SIZE(X), %xmm7 + + movsd 4 * SIZE(X), %xmm8 + movsd 5 * SIZE(X), %xmm9 + movsd 6 * SIZE(X), %xmm10 + movsd 7 * SIZE(X), %xmm11 + + decq I + jle .L13 + ALIGN_4 + +.L12: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxsd %xmm4, %xmm1 + movsd 8 * SIZE(X), %xmm4 + +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxsd %xmm5, %xmm2 + movsd 9 * SIZE(X), %xmm5 + +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxsd %xmm6, %xmm1 + movsd 10 * SIZE(X), %xmm6 + +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxsd %xmm7, %xmm2 + movsd 11 * SIZE(X), %xmm7 + +#ifdef USE_ABS + andps %xmm15, %xmm8 +#endif + maxsd %xmm8, %xmm1 + movsd 12 * SIZE(X), %xmm8 + +#ifdef USE_ABS + andps %xmm15, %xmm9 +#endif + maxsd %xmm9, %xmm2 + movsd 13 * SIZE(X), %xmm9 + +#ifdef USE_ABS + andps %xmm15, %xmm10 +#endif + maxsd %xmm10, %xmm1 + movsd 14 * SIZE(X), %xmm10 + +#ifdef USE_ABS + andps %xmm15, %xmm11 +#endif + maxsd %xmm11, %xmm2 + movsd 15 * SIZE(X), %xmm11 + + addq $8 * SIZE, X + decq I + jg .L12 + ALIGN_4 + +.L13: +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxsd %xmm4, %xmm0 +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxsd %xmm5, %xmm1 +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxsd %xmm6, %xmm2 +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxsd %xmm7, %xmm3 + +#ifdef USE_ABS + andps %xmm15, %xmm8 +#endif + maxsd %xmm8, %xmm0 +#ifdef USE_ABS + andps %xmm15, %xmm9 +#endif + maxsd %xmm9, %xmm1 +#ifdef USE_ABS + andps %xmm15, %xmm10 +#endif + maxsd %xmm10, %xmm2 +#ifdef USE_ABS + andps %xmm15, %xmm11 +#endif + maxsd %xmm11, %xmm3 + + addq $8 * SIZE, X + ALIGN_4 + +.L15: + testq $4, M + jle .L17 + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + movsd 2 * SIZE(X), %xmm6 + movsd 3 * SIZE(X), %xmm7 + +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxsd %xmm4, %xmm0 +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxsd %xmm5, %xmm1 +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxsd %xmm6, %xmm2 +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxsd %xmm7, %xmm3 + + addq $4 * SIZE, X + ALIGN_3 + +.L17: + testq $2, M + jle .L18 + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxsd %xmm4, %xmm1 +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxsd %xmm5, %xmm2 + addq $2 * SIZE, X + ALIGN_3 + +.L18: + testq $1, M + jle .L998 + + movsd 0 * SIZE(X), %xmm4 +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxsd %xmm4, %xmm3 + jmp .L998 + ALIGN_3 + +.L20: + movq M, I + sarq $3, I + jle .L25 + + movsd (X), %xmm4 + addq INCX, X + movsd (X), %xmm5 + addq INCX, X + movsd (X), %xmm6 + addq INCX, X + movsd (X), %xmm7 + addq INCX, X + + movsd 
(X), %xmm8 + addq INCX, X + movsd (X), %xmm9 + addq INCX, X + movsd (X), %xmm10 + addq INCX, X + movsd (X), %xmm11 + + decq I + jle .L23 + ALIGN_4 + +.L22: +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + addq INCX, X + maxsd %xmm4, %xmm1 + movsd (X), %xmm4 + +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + addq INCX, X + maxsd %xmm5, %xmm2 + movsd (X), %xmm5 + +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + addq INCX, X + maxsd %xmm6, %xmm1 + movsd (X), %xmm6 + +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + addq INCX, X + maxsd %xmm7, %xmm2 + movsd (X), %xmm7 + +#ifdef USE_ABS + andps %xmm15, %xmm8 +#endif + addq INCX, X + maxsd %xmm8, %xmm1 + movsd (X), %xmm8 + +#ifdef USE_ABS + andps %xmm15, %xmm9 +#endif + addq INCX, X + maxsd %xmm9, %xmm2 + movsd (X), %xmm9 + +#ifdef USE_ABS + andps %xmm15, %xmm10 +#endif + addq INCX, X + maxsd %xmm10, %xmm1 + movsd (X), %xmm10 + +#ifdef USE_ABS + andps %xmm15, %xmm11 +#endif + addq INCX, X + maxsd %xmm11, %xmm2 + movsd (X), %xmm11 + + decq I + jg .L22 + ALIGN_4 + +.L23: +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + addq INCX, X + maxsd %xmm4, %xmm0 +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxsd %xmm5, %xmm1 +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxsd %xmm6, %xmm2 +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxsd %xmm7, %xmm3 + +#ifdef USE_ABS + andps %xmm15, %xmm8 +#endif + maxsd %xmm8, %xmm0 +#ifdef USE_ABS + andps %xmm15, %xmm9 +#endif + maxsd %xmm9, %xmm1 +#ifdef USE_ABS + andps %xmm15, %xmm10 +#endif + maxsd %xmm10, %xmm2 +#ifdef USE_ABS + andps %xmm15, %xmm11 +#endif + maxsd %xmm11, %xmm3 + ALIGN_4 + +.L25: + testq $4, M + jle .L27 + + movsd (X), %xmm4 + addq INCX, X + movsd (X), %xmm5 + addq INCX, X + movsd (X), %xmm6 + addq INCX, X + movsd (X), %xmm7 + addq INCX, X + +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxsd %xmm4, %xmm0 +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxsd %xmm5, %xmm1 +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxsd %xmm6, %xmm2 +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxsd %xmm7, %xmm3 + ALIGN_3 + +.L27: + testq $2, M + jle .L28 + + movsd (X), %xmm4 + addq INCX, X + movsd (X), %xmm5 + addq INCX, X + +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxsd %xmm4, %xmm1 +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxsd %xmm5, %xmm2 + ALIGN_3 + +.L28: + testq $1, M + jle .L998 + + movsd (X), %xmm4 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxsd %xmm4, %xmm3 + ALIGN_3 + +.L998: + maxsd %xmm1, %xmm0 + maxsd %xmm3, %xmm2 + maxsd %xmm2, %xmm0 + ALIGN_4 + +.L999: + + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/amax_sse.S b/kernel/x86_64/amax_sse.S new file mode 100644 index 0000000000..22b8b16d28 --- /dev/null +++ b/kernel/x86_64/amax_sse.S @@ -0,0 +1,475 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ + +#define I %rax + +#ifdef USE_MIN +#define maxps minps +#define maxss minss +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + + SAVEREGISTERS + + xorps %xmm0, %xmm0 + leaq (, INCX, SIZE), INCX + + testq M, M + jle .L999 + +#ifdef USE_ABS + pcmpeqb %xmm15, %xmm15 + psrld $1, %xmm15 +#endif + + movss (X), %xmm0 + shufps $0, %xmm0, %xmm0 +#ifdef USE_ABS + andps %xmm15, %xmm0 +#endif + movaps %xmm0, %xmm1 + movaps %xmm0, %xmm2 + movaps %xmm0, %xmm3 + addq INCX, X + decq M + jle .L999 + + cmpq $SIZE, INCX + jne .L40 + + subq $-32 * SIZE, X + + cmpq $3, M + jle .L17 + + testq $SIZE, X + je .L05 + + movss -32 * SIZE(X), %xmm1 + shufps $0, %xmm1, %xmm1 +#ifdef USE_ABS + andps %xmm15, %xmm1 +#endif + decq M + addq $SIZE, X + ALIGN_3 + +.L05: + testq $2 * SIZE, X + je .L06 + + movsd -32 * SIZE(X), %xmm2 + unpcklps %xmm2, %xmm2 +#ifdef USE_ABS + andps %xmm15, %xmm2 +#endif + subq $2, M + addq $2 * SIZE, X + ALIGN_3 + +.L06: + movq M, I + sarq $5, I + jle .L15 + + movaps -32 * SIZE(X), %xmm4 + movaps -28 * SIZE(X), %xmm5 + movaps -24 * SIZE(X), %xmm6 + movaps -20 * SIZE(X), %xmm7 + + decq I + jle .L12 + ALIGN_4 + +.L11: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxps %xmm4, %xmm0 + movaps -16 * SIZE(X), %xmm4 + +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxps %xmm5, %xmm1 + movaps -12 * SIZE(X), %xmm5 + +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxps %xmm6, %xmm2 + movaps -8 * SIZE(X), %xmm6 + +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxps %xmm7, %xmm3 + movaps -4 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxps %xmm4, %xmm0 + movaps 0 * SIZE(X), %xmm4 + +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxps %xmm5, %xmm1 + movaps 4 * SIZE(X), %xmm5 + +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxps %xmm6, %xmm2 + movaps 8 * SIZE(X), %xmm6 + +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxps %xmm7, %xmm3 + movaps 12 * SIZE(X), %xmm7 + + subq $-32 * SIZE, X + decq I + jg .L11 + ALIGN_4 + +.L12: +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxps %xmm4, %xmm0 + 
movaps -16 * SIZE(X), %xmm4 + +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxps %xmm5, %xmm1 + movaps -12 * SIZE(X), %xmm5 + +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxps %xmm6, %xmm2 + movaps -8 * SIZE(X), %xmm6 + +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxps %xmm7, %xmm3 + movaps -4 * SIZE(X), %xmm7 + +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxps %xmm4, %xmm0 + +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxps %xmm5, %xmm1 + +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxps %xmm6, %xmm2 + +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxps %xmm7, %xmm3 + + subq $-32 * SIZE, X + ALIGN_3 + + +.L15: + testq $16, M + je .L16 + + movaps -32 * SIZE(X), %xmm4 +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxps %xmm4, %xmm0 + + movaps -28 * SIZE(X), %xmm5 +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxps %xmm5, %xmm1 + + movaps -24 * SIZE(X), %xmm6 +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxps %xmm6, %xmm2 + + movaps -20 * SIZE(X), %xmm7 +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxps %xmm7, %xmm3 + + addq $16 * SIZE, X + ALIGN_3 + +.L16: + testq $8, M + je .L17 + + movaps -32 * SIZE(X), %xmm4 +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxps %xmm4, %xmm0 + + movaps -28 * SIZE(X), %xmm5 +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxps %xmm5, %xmm1 + addq $8 * SIZE, X + ALIGN_3 + +.L17: + testq $4, M + je .L18 + + movaps -32 * SIZE(X), %xmm4 +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxps %xmm4, %xmm2 + addq $4 * SIZE, X + ALIGN_3 + +.L18: + testq $2, M + je .L19 + + movsd -32 * SIZE(X), %xmm4 + unpcklps %xmm4, %xmm4 +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxps %xmm4, %xmm3 + addq $2 * SIZE, X + ALIGN_3 + +.L19: + testq $1, M + je .L998 + + movss -32 * SIZE(X), %xmm4 +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxss %xmm4, %xmm0 + jmp .L998 + ALIGN_3 + +.L40: + movq M, I + sarq $3, I + jle .L45 + ALIGN_4 + +.L41: + movss (X), %xmm4 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxss %xmm4, %xmm0 + + movss (X), %xmm5 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxss %xmm5, %xmm1 + + movss (X), %xmm6 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxss %xmm6, %xmm2 + + movss (X), %xmm7 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxss %xmm7, %xmm3 + + movss (X), %xmm4 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxss %xmm4, %xmm0 + + movss (X), %xmm5 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxss %xmm5, %xmm1 + + movss (X), %xmm6 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxss %xmm6, %xmm2 + + movss (X), %xmm7 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxss %xmm7, %xmm3 + + decq I + jg .L41 + ALIGN_4 + +.L45: + testq $4, M + je .L46 + + movss (X), %xmm4 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxss %xmm4, %xmm0 + + movss (X), %xmm5 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxss %xmm5, %xmm1 + + movss (X), %xmm6 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxss %xmm6, %xmm2 + + movss (X), %xmm7 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxss %xmm7, %xmm3 + ALIGN_3 + +.L46: + testq $2, M + je .L47 + + movss (X), %xmm4 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxss %xmm4, %xmm0 + + movss (X), %xmm5 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxss %xmm5, %xmm1 + ALIGN_3 + +.L47: + testq $1, M + je .L998 + + movss (X), %xmm4 + addq INCX, X 
+#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxss %xmm4, %xmm2 + ALIGN_4 + +.L998: + maxps %xmm1, %xmm0 + maxps %xmm3, %xmm2 + maxps %xmm2, %xmm0 + movaps %xmm0, %xmm1 + movhlps %xmm0, %xmm0 + maxps %xmm1, %xmm0 + movaps %xmm0, %xmm1 + shufps $1, %xmm0, %xmm0 + maxss %xmm1, %xmm0 + ALIGN_4 + +.L999: + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/amax_sse2.S b/kernel/x86_64/amax_sse2.S new file mode 100644 index 0000000000..033e8e1768 --- /dev/null +++ b/kernel/x86_64/amax_sse2.S @@ -0,0 +1,498 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ + +#define I %rax + +#ifdef USE_MIN +#define maxpd minpd +#define maxsd minsd +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + + SAVEREGISTERS + + xorps %xmm0, %xmm0 + leaq (, INCX, SIZE), INCX + + testq M, M + jle .L999 + +#ifdef USE_ABS + pcmpeqb %xmm15, %xmm15 + psrlq $1, %xmm15 +#endif + + movsd (X), %xmm0 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm0 +#endif + unpcklpd %xmm0, %xmm0 + movaps %xmm0, %xmm1 + movaps %xmm0, %xmm2 + movaps %xmm0, %xmm3 + decq M + jle .L999 + + cmpq $SIZE, INCX + jne .L40 + + subq $-16 * SIZE, X + + testq $SIZE, X + je .L05 + + movsd -16 * SIZE(X), %xmm4 +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + unpcklpd %xmm4, %xmm4 + maxpd %xmm4, %xmm3 + addq $SIZE, X + decq M + jle .L998 + ALIGN_3 + +.L05: + movq M, I + sarq $4, I + jle .L15 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + movaps -12 * SIZE(X), %xmm6 + movaps -10 * SIZE(X), %xmm7 + + decq I + jle .L12 + ALIGN_4 + +.L11: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxpd %xmm4, %xmm0 + movaps -8 * SIZE(X), %xmm4 + +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxpd %xmm5, %xmm1 + movaps -6 * SIZE(X), %xmm5 + +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxpd %xmm6, %xmm2 + movaps -4 * SIZE(X), %xmm6 + +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxpd %xmm7, %xmm3 + movaps -2 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxpd %xmm4, %xmm0 + movaps 0 * SIZE(X), %xmm4 + +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxpd %xmm5, %xmm1 + movaps 2 * SIZE(X), %xmm5 + +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxpd %xmm6, %xmm2 + movaps 4 * SIZE(X), %xmm6 + +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxpd %xmm7, %xmm3 + movaps 6 * SIZE(X), %xmm7 + + subq $-16 * SIZE, X + decq I + jg .L11 + ALIGN_4 + +.L12: +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxpd %xmm4, %xmm0 + movaps -8 * SIZE(X), %xmm4 + +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxpd %xmm5, %xmm1 + movaps -6 * SIZE(X), %xmm5 + +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxpd %xmm6, %xmm2 + movaps -4 * SIZE(X), %xmm6 + +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxpd %xmm7, %xmm3 + movaps -2 * SIZE(X), %xmm7 + +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxpd %xmm4, %xmm0 + +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxpd %xmm5, %xmm1 + +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxpd %xmm6, %xmm2 + +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxpd %xmm7, %xmm3 + + subq $-16 * SIZE, X + ALIGN_4 + +.L15: + testq $8, M + jle .L16 + + movaps -16 * SIZE(X), %xmm4 +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movaps -14 * SIZE(X), %xmm5 +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxpd %xmm5, %xmm1 + + movaps -12 * SIZE(X), %xmm6 +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxpd %xmm6, %xmm2 + + movaps -10 * SIZE(X), %xmm7 +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxpd %xmm7, %xmm3 + addq $8 * SIZE, X + ALIGN_3 + +.L16: + testq $4, M + jle .L17 + + movaps -16 * SIZE(X), %xmm4 +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movaps -14 * SIZE(X), %xmm5 +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxpd %xmm5, 
%xmm1 + + addq $4 * SIZE, X + ALIGN_3 + +.L17: + testq $2, M + jle .L18 + + movaps -16 * SIZE(X), %xmm4 +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxpd %xmm4, %xmm2 + addq $2 * SIZE, X + ALIGN_3 + +.L18: + testq $1, M + jle .L998 + + movsd -16 * SIZE(X), %xmm4 + unpcklpd %xmm4, %xmm4 +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxpd %xmm4, %xmm3 + jmp .L998 + ALIGN_3 + +.L40: + movq M, I + sarq $4, I + jle .L45 + ALIGN_4 + +.L41: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd (X), %xmm4 + addq INCX, X + movhps (X), %xmm4 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movsd (X), %xmm5 + addq INCX, X + movhps (X), %xmm5 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxpd %xmm5, %xmm1 + + movsd (X), %xmm6 + addq INCX, X + movhps (X), %xmm6 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxpd %xmm6, %xmm2 + + movsd (X), %xmm7 + addq INCX, X + movhps (X), %xmm7 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxpd %xmm7, %xmm3 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movsd (X), %xmm4 + addq INCX, X + movhps (X), %xmm4 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movsd (X), %xmm5 + addq INCX, X + movhps (X), %xmm5 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxpd %xmm5, %xmm1 + + movsd (X), %xmm6 + addq INCX, X + movhps (X), %xmm6 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxpd %xmm6, %xmm2 + + movsd (X), %xmm7 + addq INCX, X + movhps (X), %xmm7 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxpd %xmm7, %xmm3 + + decq I + jg .L41 + ALIGN_4 + +.L45: + andq $15, M + jle .L998 + + testq $8, M + je .L46 + + movsd (X), %xmm4 + addq INCX, X + movhps (X), %xmm4 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movsd (X), %xmm5 + addq INCX, X + movhps (X), %xmm5 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxpd %xmm5, %xmm1 + + movsd (X), %xmm6 + addq INCX, X + movhps (X), %xmm6 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxpd %xmm6, %xmm2 + + movsd (X), %xmm7 + addq INCX, X + movhps (X), %xmm7 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxpd %xmm7, %xmm3 + ALIGN_3 + +.L46: + testq $4, M + je .L47 + + movsd (X), %xmm4 + addq INCX, X + movhps (X), %xmm4 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movsd (X), %xmm5 + addq INCX, X + movhps (X), %xmm5 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxpd %xmm5, %xmm1 + ALIGN_3 + +.L47: + testq $2, M + je .L48 + + movsd (X), %xmm6 + addq INCX, X + movhps (X), %xmm6 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxpd %xmm6, %xmm2 + ALIGN_3 + +.L48: + testq $1, M + je .L998 + + movsd (X), %xmm7 + unpcklpd %xmm7, %xmm7 +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxpd %xmm7, %xmm3 + ALIGN_4 + +.L998: + maxpd %xmm1, %xmm0 + maxpd %xmm3, %xmm2 + maxpd %xmm2, %xmm0 + movaps %xmm0, %xmm1 + unpckhpd %xmm0, %xmm0 + maxsd %xmm1, %xmm0 + ALIGN_4 + +.L999: + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/asum.S b/kernel/x86_64/asum.S new file mode 100644 index 0000000000..13c6f4fa28 --- /dev/null +++ b/kernel/x86_64/asum.S @@ -0,0 +1,197 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. 
*/ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 +#define X ARG2 +#define INCX ARG3 + +#define I %rax + +#include "l1param.h" + + PROLOGUE + PROFCODE + + fldz + testq M, M + jle .L999 + testq INCX, INCX + jle .L999 + + salq $BASE_SHIFT, INCX + + fldz + fldz + fldz + cmpq $SIZE, INCX + jne .L40 + + movq M, I + sarq $3, I + jle .L20 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + FLD 2 * SIZE(X) + fabs + FLD 3 * SIZE(X) + fabs + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD 4 * SIZE(X) + fabs + FLD 5 * SIZE(X) + fabs + FLD 6 * SIZE(X) + fabs + FLD 7 * SIZE(X) + fabs + + addq $8 * SIZE, X + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decq I + jg .L10 + ALIGN_4 + +.L20: + andq $7, M + jle .L998 + ALIGN_4 + +.L21: + FLD (X) + fabs + faddp %st,%st(1) + addq $1 * SIZE, X + decq M + jg .L21 + jmp .L998 + ALIGN_4 + +.L40: + movq M, I + sarq $3, I + jle .L60 + ALIGN_4 + +.L50: + FLD (X) + addq INCX, X + fabs + FLD (X) + addq INCX, X + fabs + FLD (X) + addq INCX, X + fabs + FLD (X) + addq INCX, X + fabs + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD (X) + addq INCX, X + fabs + FLD (X) + addq INCX, X + fabs + FLD (X) + addq INCX, X + fabs + FLD (X) + addq INCX, X + fabs + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decq I + jg .L50 + ALIGN_4 + +.L60: + andq $7, M + jle .L998 + ALIGN_4 + + +.L61: + FLD (X) + addq INCX, X + fabs + faddp %st,%st(1) + decq M + jg .L61 + ALIGN_4 + +.L998: + faddp %st,%st(2) + faddp %st,%st(1) + faddp %st,%st(1) + ALIGN_4 + +.L999: + ret 
+ + EPILOGUE diff --git a/kernel/x86_64/asum_atom.S b/kernel/x86_64/asum_atom.S new file mode 100644 index 0000000000..b6ea65f01d --- /dev/null +++ b/kernel/x86_64/asum_atom.S @@ -0,0 +1,433 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ + +#define I %rax + +#include "l1param.h" + + PROLOGUE + PROFCODE + + SAVEREGISTERS + + xorps %xmm0, %xmm0 + testq M, M + jle .L999 + testq INCX, INCX + jle .L999 + + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + + pcmpeqb %xmm15, %xmm15 + psrlq $1, %xmm15 + + salq $BASE_SHIFT, INCX + xorps %xmm13, %xmm13 + + cmpq $SIZE, INCX + jne .L20 + + testq $SIZE, X + je .L05 + + movsd (X), %xmm0 + addq $SIZE, X + andps %xmm15, %xmm0 + decq M + jle .L999 + ALIGN_3 + +.L05: + subq $-16 * SIZE, X + + movq M, I + sarq $4, I + jle .L12 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + movaps -12 * SIZE(X), %xmm6 + movaps -10 * SIZE(X), %xmm7 + + movaps -8 * SIZE(X), %xmm8 + movaps -6 * SIZE(X), %xmm9 + movaps -4 * SIZE(X), %xmm10 + movaps -2 * SIZE(X), %xmm11 + + decq I + jle .L11 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + andps %xmm15, %xmm4 + addsd %xmm13, %xmm3 + pshufd $0x4e, %xmm4, %xmm12 + addsd %xmm4, %xmm0 + movaps 0 * SIZE(X), %xmm4 + + andps %xmm15, %xmm5 + addsd %xmm12, %xmm1 + pshufd $0x4e, %xmm5, %xmm13 + addsd %xmm5, %xmm2 + movaps 2 * SIZE(X), %xmm5 + + andps %xmm15, %xmm6 + addsd %xmm13, %xmm3 + pshufd $0x4e, %xmm6, %xmm12 + addsd %xmm6, %xmm0 + movaps 4 * SIZE(X), %xmm6 + + andps %xmm15, %xmm7 + addsd %xmm12, %xmm1 + pshufd $0x4e, %xmm7, %xmm13 + addsd %xmm7, %xmm2 + movaps 6 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + andps %xmm15, %xmm8 + addsd %xmm13, %xmm3 + pshufd $0x4e, %xmm8, %xmm12 + addsd %xmm8, %xmm0 + movaps 8 * SIZE(X), %xmm8 + + andps %xmm15, %xmm9 + addsd %xmm12, %xmm1 + pshufd $0x4e, %xmm9, %xmm13 + addsd %xmm9, %xmm2 + movaps 10 * SIZE(X), %xmm9 + + andps %xmm15, %xmm10 + addsd %xmm13, %xmm3 + pshufd $0x4e, %xmm10, %xmm12 + addsd %xmm10, %xmm0 + movaps 12 * SIZE(X), %xmm10 + + andps %xmm15, %xmm11 + addsd %xmm12, %xmm1 + pshufd $0x4e, %xmm11, %xmm13 + addsd %xmm11, %xmm2 + movaps 14 * SIZE(X), %xmm11 + + subq $-16 * SIZE, X + decq I + jg .L10 + ALIGN_4 + +.L11: + andps %xmm15, %xmm4 + addsd %xmm13, %xmm3 + pshufd $0x4e, %xmm4, %xmm12 + addsd %xmm4, %xmm0 + + andps %xmm15, %xmm5 + addsd %xmm12, %xmm1 + pshufd $0x4e, %xmm5, %xmm13 + addsd %xmm5, %xmm2 + + andps %xmm15, %xmm6 + addsd %xmm13, %xmm3 + pshufd $0x4e, %xmm6, %xmm12 + addsd %xmm6, %xmm0 + + andps %xmm15, %xmm7 + addsd %xmm12, %xmm1 + pshufd $0x4e, %xmm7, %xmm13 + addsd %xmm7, %xmm2 + + andps %xmm15, %xmm8 + addsd %xmm13, %xmm3 + pshufd $0x4e, %xmm8, %xmm12 + addsd %xmm8, %xmm0 + + andps %xmm15, %xmm9 + addsd %xmm12, %xmm1 + pshufd $0x4e, %xmm9, %xmm13 + addsd %xmm9, %xmm2 + + andps %xmm15, %xmm10 + addsd %xmm13, %xmm3 + pshufd $0x4e, %xmm10, %xmm12 + addsd %xmm10, %xmm0 + + andps %xmm15, %xmm11 + addsd %xmm12, %xmm1 + pshufd $0x4e, %xmm11, %xmm13 + addsd %xmm11, %xmm2 + + addsd %xmm13, %xmm3 + subq $-16 * SIZE, X + ALIGN_3 + +.L12: + andq $15, M + jle .L998 + + testq $8, M + je .L13 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + movaps -12 * SIZE(X), %xmm6 + movaps -10 * SIZE(X), %xmm7 + addq $8 * SIZE, X + + andps %xmm15, %xmm4 + pshufd $0x4e, %xmm4, %xmm12 + addsd %xmm4, %xmm0 + andps %xmm15, %xmm5 + addsd %xmm12, %xmm1 + pshufd $0x4e, %xmm5, %xmm13 + addsd %xmm5, %xmm2 + addsd %xmm13, %xmm3 + andps %xmm15, %xmm6 + pshufd $0x4e, %xmm6, %xmm12 
+ addsd %xmm6, %xmm0 + andps %xmm15, %xmm7 + addsd %xmm12, %xmm1 + pshufd $0x4e, %xmm7, %xmm13 + addsd %xmm7, %xmm2 + addsd %xmm13, %xmm3 + ALIGN_3 + +.L13: + testq $4, M + je .L14 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + addq $4 * SIZE, X + + andps %xmm15, %xmm4 + pshufd $0x4e, %xmm4, %xmm12 + addsd %xmm4, %xmm0 + andps %xmm15, %xmm5 + addsd %xmm12, %xmm1 + pshufd $0x4e, %xmm5, %xmm13 + addsd %xmm5, %xmm2 + addsd %xmm13, %xmm3 + ALIGN_3 + +.L14: + testq $2, M + je .L15 + + movaps -16 * SIZE(X), %xmm4 + addq $2 * SIZE, X + andps %xmm15, %xmm4 + + pshufd $0x4e, %xmm4, %xmm5 + addsd %xmm4, %xmm2 + addsd %xmm5, %xmm3 + ALIGN_3 + +.L15: + testq $1, M + je .L998 + + movsd -16 * SIZE(X), %xmm4 + andps %xmm15, %xmm4 + addsd %xmm4, %xmm0 + jmp .L998 + ALIGN_3 + +.L20: + movq M, I + sarq $3, I + jle .L25 + + movsd (X), %xmm4 + addq INCX, X + movsd (X), %xmm5 + addq INCX, X + movsd (X), %xmm6 + addq INCX, X + movsd (X), %xmm7 + addq INCX, X + + movsd (X), %xmm8 + addq INCX, X + movsd (X), %xmm9 + addq INCX, X + movsd (X), %xmm10 + addq INCX, X + movsd (X), %xmm11 + + decq I + jle .L23 + ALIGN_4 + +.L22: + andps %xmm15, %xmm4 + addq INCX, X + addsd %xmm4, %xmm0 + movsd (X), %xmm4 + andps %xmm15, %xmm5 + addq INCX, X + addsd %xmm5, %xmm1 + movsd (X), %xmm5 + andps %xmm15, %xmm6 + addq INCX, X + addsd %xmm6, %xmm2 + movsd (X), %xmm6 + andps %xmm15, %xmm7 + addq INCX, X + addsd %xmm7, %xmm3 + movsd (X), %xmm7 + + andps %xmm15, %xmm8 + addq INCX, X + addsd %xmm8, %xmm0 + movsd (X), %xmm8 + andps %xmm15, %xmm9 + addq INCX, X + addsd %xmm9, %xmm1 + movsd (X), %xmm9 + andps %xmm15, %xmm10 + addq INCX, X + addsd %xmm10, %xmm2 + movsd (X), %xmm10 + andps %xmm15, %xmm11 + addq INCX, X + addsd %xmm11, %xmm3 + movsd (X), %xmm11 + + decq I + jg .L22 + ALIGN_4 + +.L23: + andps %xmm15, %xmm4 + addq INCX, X + addsd %xmm4, %xmm0 + andps %xmm15, %xmm5 + addsd %xmm5, %xmm1 + andps %xmm15, %xmm6 + addsd %xmm6, %xmm2 + andps %xmm15, %xmm7 + addsd %xmm7, %xmm3 + + andps %xmm15, %xmm8 + addsd %xmm8, %xmm0 + andps %xmm15, %xmm9 + addsd %xmm9, %xmm1 + andps %xmm15, %xmm10 + addsd %xmm10, %xmm2 + andps %xmm15, %xmm11 + addsd %xmm11, %xmm3 + ALIGN_3 + +.L25: + andq $7, M + jle .L998 + + testq $4, M + je .L26 + + movsd (X), %xmm4 + addq INCX, X + movsd (X), %xmm5 + addq INCX, X + movsd (X), %xmm6 + andps %xmm15, %xmm4 + addsd %xmm4, %xmm0 + addq INCX, X + movsd (X), %xmm7 + andps %xmm15, %xmm5 + addsd %xmm5, %xmm1 + addq INCX, X + + andps %xmm15, %xmm6 + addsd %xmm6, %xmm2 + andps %xmm15, %xmm7 + addsd %xmm7, %xmm3 + ALIGN_3 + +.L26: + testq $2, M + je .L27 + + movsd (X), %xmm4 + addq INCX, X + movsd (X), %xmm5 + addq INCX, X + + andps %xmm15, %xmm4 + andps %xmm15, %xmm5 + + addsd %xmm4, %xmm0 + addsd %xmm5, %xmm1 + ALIGN_3 + +.L27: + testq $1, M + je .L998 + + movsd (X), %xmm4 + andps %xmm15, %xmm4 + addsd %xmm4, %xmm0 + ALIGN_3 + +.L998: + addsd %xmm1, %xmm0 + addsd %xmm3, %xmm2 + addsd %xmm2, %xmm0 + ALIGN_4 + +.L999: + RESTOREREGISTERS + + ret + + EPILOGUE + diff --git a/kernel/x86_64/asum_sse.S b/kernel/x86_64/asum_sse.S new file mode 100644 index 0000000000..840e1939da --- /dev/null +++ b/kernel/x86_64/asum_sse.S @@ -0,0 +1,345 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ + +#define I %rax + +#include "l1param.h" + + PROLOGUE + PROFCODE + + SAVEREGISTERS + + xorps %xmm0, %xmm0 + testq M, M + jle .L999 + testq INCX, INCX + jle .L999 + + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + + pcmpeqb %xmm15, %xmm15 + psrld $1, %xmm15 + + leaq (, INCX, SIZE), INCX + + cmpq $SIZE, INCX + jne .L100 + + subq $-32 * SIZE, X + + cmpq $3, M + jle .L18 + + testq $4, X + je .L05 + movss -32 * SIZE(X), %xmm0 + andps %xmm15, %xmm0 + addq $SIZE, X + decq M + jle .L998 + ALIGN_3 + +.L05: + testq $8, X + je .L10 + + movsd -32 * SIZE(X), %xmm1 + andps %xmm15, %xmm1 + addq $2 * SIZE, X + subq $2, M + jle .L998 + ALIGN_3 + +.L10: + movq M, I + sarq $5, I + jle .L14 + + movaps -32 * SIZE(X), %xmm4 + movaps -28 * SIZE(X), %xmm5 + movaps -24 * SIZE(X), %xmm6 + movaps -20 * SIZE(X), %xmm7 + + movaps -16 * SIZE(X), %xmm8 + movaps -12 * SIZE(X), %xmm9 + movaps -8 * SIZE(X), %xmm10 + movaps -4 * SIZE(X), %xmm11 + decq I + jle .L12 + ALIGN_3 + +.L11: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + andps %xmm15, %xmm4 + addps %xmm4, %xmm0 + movaps 0 * SIZE(X), %xmm4 + + andps %xmm15, %xmm5 + addps %xmm5, %xmm1 + movaps 4 * SIZE(X), %xmm5 + + andps %xmm15, %xmm6 + addps %xmm6, %xmm2 + movaps 8 * SIZE(X), %xmm6 + + andps %xmm15, %xmm7 + addps %xmm7, %xmm3 + movaps 12 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + andps %xmm15, %xmm8 + addps %xmm8, %xmm0 + movaps 16 * SIZE(X), %xmm8 + + andps %xmm15, %xmm9 + addps %xmm9, %xmm1 + movaps 20 * SIZE(X), %xmm9 + + andps %xmm15, %xmm10 + addps %xmm10, %xmm2 + movaps 24 * SIZE(X), %xmm10 + + andps %xmm15, %xmm11 + addps %xmm11, %xmm3 + movaps 28 * SIZE(X), %xmm11 + + subq $-32 * SIZE, X + decq I + jg .L11 + ALIGN_3 + +.L12: + andps %xmm15, %xmm4 + addps 
%xmm4, %xmm0 + andps %xmm15, %xmm5 + addps %xmm5, %xmm1 + + andps %xmm15, %xmm6 + addps %xmm6, %xmm2 + andps %xmm15, %xmm7 + addps %xmm7, %xmm3 + + andps %xmm15, %xmm8 + addps %xmm8, %xmm0 + andps %xmm15, %xmm9 + addps %xmm9, %xmm1 + + andps %xmm15, %xmm10 + addps %xmm10, %xmm2 + andps %xmm15, %xmm11 + addps %xmm11, %xmm3 + + subq $-32 * SIZE, X + ALIGN_3 + +.L14: + testq $16, M + je .L16 + + movaps -32 * SIZE(X), %xmm4 + andps %xmm15, %xmm4 + addps %xmm4, %xmm0 + + movaps -28 * SIZE(X), %xmm5 + andps %xmm15, %xmm5 + addps %xmm5, %xmm1 + + movaps -24 * SIZE(X), %xmm4 + andps %xmm15, %xmm4 + addps %xmm4, %xmm0 + + movaps -20 * SIZE(X), %xmm5 + andps %xmm15, %xmm5 + addps %xmm5, %xmm1 + + addq $16 * SIZE, X + ALIGN_3 + +.L16: + testq $8, M + je .L17 + + movaps -32 * SIZE(X), %xmm4 + andps %xmm15, %xmm4 + addps %xmm4, %xmm0 + + movaps -28 * SIZE(X), %xmm5 + andps %xmm15, %xmm5 + addps %xmm5, %xmm1 + + addq $8 * SIZE, X + ALIGN_3 + +.L17: + testq $4, M + je .L18 + + movaps -32 * SIZE(X), %xmm6 + andps %xmm15, %xmm6 + addps %xmm6, %xmm2 + addq $4 * SIZE, X + ALIGN_3 + +.L18: + testq $2, M + je .L19 + +#ifdef movsd + xorps %xmm7, %xmm7 +#endif + movsd -32 * SIZE(X), %xmm7 + andps %xmm15, %xmm7 + addps %xmm7, %xmm3 + addq $2 * SIZE, X + ALIGN_3 + +.L19: + testq $1, M + je .L998 + + movss -32 * SIZE(X), %xmm6 + andps %xmm15, %xmm6 + addps %xmm6, %xmm2 + jmp .L998 + ALIGN_4 + +.L100: + movq M, I + sarq $3, I + jle .L105 + ALIGN_4 + +.L101: + movss 0 * SIZE(X), %xmm4 + addq INCX, X + andps %xmm15, %xmm4 + addss %xmm4, %xmm0 + + movss 0 * SIZE(X), %xmm5 + addq INCX, X + andps %xmm15, %xmm5 + addss %xmm5, %xmm1 + + movss 0 * SIZE(X), %xmm6 + addq INCX, X + andps %xmm15, %xmm6 + addss %xmm6, %xmm2 + + movss 0 * SIZE(X), %xmm7 + addq INCX, X + andps %xmm15, %xmm7 + addss %xmm7, %xmm3 + + movss 0 * SIZE(X), %xmm8 + addq INCX, X + andps %xmm15, %xmm8 + addss %xmm8, %xmm0 + + movss 0 * SIZE(X), %xmm4 + addq INCX, X + andps %xmm15, %xmm4 + addss %xmm4, %xmm1 + + movss 0 * SIZE(X), %xmm5 + addq INCX, X + andps %xmm15, %xmm5 + addss %xmm5, %xmm2 + + movss 0 * SIZE(X), %xmm6 + addq INCX, X + andps %xmm15, %xmm6 + addss %xmm6, %xmm3 + + decq I + jg .L101 + ALIGN_4 + +.L105: + andq $7, M + jle .L998 + ALIGN_4 + +.L106: + movss 0 * SIZE(X), %xmm4 + andps %xmm15, %xmm4 + addps %xmm4, %xmm0 + addq INCX, X + decq M + jg .L106 + ALIGN_4 + +.L998: + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + addps %xmm2, %xmm0 + +#ifndef HAVE_SSE3 + movhlps %xmm0, %xmm1 + addps %xmm1, %xmm0 + + movaps %xmm0, %xmm1 + shufps $1, %xmm0, %xmm0 + addss %xmm1, %xmm0 +#else + haddps %xmm0, %xmm0 + haddps %xmm0, %xmm0 +#endif + ALIGN_4 + +.L999: + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/asum_sse2.S b/kernel/x86_64/asum_sse2.S new file mode 100644 index 0000000000..7286fc093e --- /dev/null +++ b/kernel/x86_64/asum_sse2.S @@ -0,0 +1,311 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ + +#define I %rax + +#include "l1param.h" + + PROLOGUE + PROFCODE + + SAVEREGISTERS + + xorps %xmm0, %xmm0 + testq M, M + jle .L999 + testq INCX, INCX + jle .L999 + + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + + pcmpeqb %xmm15, %xmm15 + psrlq $1, %xmm15 + + salq $BASE_SHIFT, INCX + + subq $-16 * SIZE, X + + cmpq $SIZE, INCX + jne .L40 + + testq $SIZE, X + je .L05 + + movsd -16 * SIZE(X), %xmm0 + addq $SIZE, X + + andps %xmm15, %xmm0 + subq $1, M + jle .L999 + ALIGN_3 + +.L05: + movq M, I + sarq $4, I + jle .L20 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + movaps -12 * SIZE(X), %xmm6 + movaps -10 * SIZE(X), %xmm7 + + movaps -8 * SIZE(X), %xmm8 + movaps -6 * SIZE(X), %xmm9 + movaps -4 * SIZE(X), %xmm10 + movaps -2 * SIZE(X), %xmm11 + + decq I + jle .L11 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + andps %xmm15, %xmm4 + addpd %xmm4, %xmm0 + movaps 0 * SIZE(X), %xmm4 + + andps %xmm15, %xmm5 + addpd %xmm5, %xmm1 + movaps 2 * SIZE(X), %xmm5 + + andps %xmm15, %xmm6 + addpd %xmm6, %xmm2 + movaps 4 * SIZE(X), %xmm6 + + andps %xmm15, %xmm7 + addpd %xmm7, %xmm3 + movaps 6 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + andps %xmm15, %xmm8 + addpd %xmm8, %xmm0 + movaps 8 * SIZE(X), %xmm8 + + andps %xmm15, %xmm9 + addpd %xmm9, %xmm1 + movaps 10 * SIZE(X), %xmm9 + + andps %xmm15, %xmm10 + addpd %xmm10, %xmm2 + movaps 12 * SIZE(X), %xmm10 + + andps %xmm15, %xmm11 + addpd %xmm11, %xmm3 + movaps 14 * SIZE(X), %xmm11 + + subq $-16 * SIZE, X + decq I + jg .L10 + ALIGN_4 + +.L11: + andps %xmm15, %xmm4 + andps %xmm15, %xmm5 + andps %xmm15, %xmm6 + andps %xmm15, %xmm7 + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm1 + addpd %xmm6, %xmm2 + addpd %xmm7, %xmm3 + + andps %xmm15, %xmm8 + andps %xmm15, %xmm9 + andps %xmm15, %xmm10 + andps %xmm15, %xmm11 + + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm1 + addpd %xmm10, %xmm2 + addpd %xmm11, 
%xmm3 + + subq $-16 * SIZE, X + ALIGN_3 + +.L20: + andq $15, M + jle .L998 + + testq $8, M + je .L21 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + movaps -12 * SIZE(X), %xmm6 + movaps -10 * SIZE(X), %xmm7 + + andps %xmm15, %xmm4 + andps %xmm15, %xmm5 + andps %xmm15, %xmm6 + andps %xmm15, %xmm7 + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm1 + addpd %xmm6, %xmm2 + addpd %xmm7, %xmm3 + addq $8 * SIZE, X + ALIGN_3 + +.L21: + testq $4, M + je .L22 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + + andps %xmm15, %xmm4 + andps %xmm15, %xmm5 + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm1 + + addq $4 * SIZE, X + ALIGN_3 + +.L22: + testq $2, M + je .L23 + + movaps -16 * SIZE(X), %xmm6 + andps %xmm15, %xmm6 + addpd %xmm6, %xmm3 + addq $2 * SIZE, X + +.L23: + testq $1, M + je .L998 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -16 * SIZE(X), %xmm4 + andps %xmm15, %xmm4 + addsd %xmm4, %xmm0 + jmp .L998 + ALIGN_3 + +.L40: + movq M, I + sarq $3, I + jle .L60 + ALIGN_4 + +.L50: + movsd -16 * SIZE(X), %xmm4 + addq INCX, X + movhpd -16 * SIZE(X), %xmm4 + addq INCX, X + andps %xmm15, %xmm4 + addpd %xmm4, %xmm0 + + movsd -16 * SIZE(X), %xmm5 + addq INCX, X + movhpd -16 * SIZE(X), %xmm5 + addq INCX, X + andps %xmm15, %xmm5 + addpd %xmm5, %xmm1 + + movsd -16 * SIZE(X), %xmm6 + addq INCX, X + movhpd -16 * SIZE(X), %xmm6 + addq INCX, X + andps %xmm15, %xmm6 + addpd %xmm6, %xmm2 + + movsd -16 * SIZE(X), %xmm7 + addq INCX, X + movhpd -16 * SIZE(X), %xmm7 + addq INCX, X + andps %xmm15, %xmm7 + addpd %xmm7, %xmm3 + + decq I + jg .L50 + ALIGN_4 + +.L60: +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + andq $7, M + jle .L998 + ALIGN_4 + +.L61: + movsd -16 * SIZE(X), %xmm4 + andps %xmm15, %xmm4 + addpd %xmm4, %xmm0 + addq INCX, X + decq M + jg .L61 + ALIGN_4 + +.L998: + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + ALIGN_4 + +.L999: +#ifndef HAVE_SSE3 + movhlps %xmm0, %xmm1 + addsd %xmm1, %xmm0 +#else + haddpd %xmm0, %xmm0 +#endif + + RESTOREREGISTERS + + ret + + EPILOGUE + diff --git a/kernel/x86_64/axpy.S b/kernel/x86_64/axpy.S new file mode 100644 index 0000000000..478cc88e89 --- /dev/null +++ b/kernel/x86_64/axpy.S @@ -0,0 +1,224 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
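
For reference (not part of the imported code): the asum kernels above all compute the sum of absolute values of a strided vector. They build an all-ones mask with pcmpeqb and shift out the sign bit with psrlq (double) or psrld (single), so andps acts as a branch-free fabs; four partial sums are kept in xmm0-xmm3 and reduced at .L998, and n <= 0 or incx <= 0 returns zero via .L999. A minimal C sketch of that contract, using the illustrative name ref_dasum (not part of the patch), would be:

#include <math.h>

/* Sketch of the *asum contract: sum of |x[i]| over n elements with
 * stride incx.  ref_dasum is an illustrative name, not part of the patch. */
static double ref_dasum(long n, const double *x, long incx)
{
    double s0 = 0.0, s1 = 0.0, s2 = 0.0, s3 = 0.0;
    long i;

    if (n <= 0 || incx <= 0) return 0.0;   /* kernels return 0 in these cases */

    if (incx == 1) {
        for (i = 0; i + 4 <= n; i += 4) {  /* four accumulators, like xmm0..xmm3 */
            s0 += fabs(x[i    ]);
            s1 += fabs(x[i + 1]);
            s2 += fabs(x[i + 2]);
            s3 += fabs(x[i + 3]);
        }
        for (; i < n; i++) s0 += fabs(x[i]);
    } else {
        for (i = 0; i < n; i++) s0 += fabs(x[i * incx]);
    }
    return (s0 + s1) + (s2 + s3);          /* final reduction, as at .L998 */
}
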
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG4 /* rsi */ +#define INCX ARG5 /* rdx */ +#define Y ARG6 /* rcx */ +#define INCY ARG2 /* r8 */ + +#define ALPHA 8(%rsp) + +#include "l1param.h" + + PROLOGUE + PROFCODE + + movq 24(%rsp), INCY + + FLD ALPHA + + salq $BASE_SHIFT, INCX + salq $BASE_SHIFT, INCY + + testq M, M + jle .L40 + + cmpq $SIZE, INCX + jne .L14 + cmpq $SIZE, INCY + jne .L14 + + movq M, %rax + sarq $3, %rax + jle .L15 + ALIGN_3 + +#define PRESIZE 33 + +.L16: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) + fmul %st(1),%st + FLD 0 * SIZE(Y) + faddp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 1 * SIZE(X) + fmul %st(1),%st + FLD 1 * SIZE(Y) + faddp %st, %st(1) + FST 1 * SIZE(Y) + + FLD 2 * SIZE(X) + fmul %st(1),%st + FLD 2 * SIZE(Y) + faddp %st, %st(1) + FST 2 * SIZE(Y) + + FLD 3 * SIZE(X) + fmul %st(1),%st + FLD 3 * SIZE(Y) + faddp %st, %st(1) + FST 3 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + FLD 4 * SIZE(X) + fmul %st(1),%st + FLD 4 * SIZE(Y) + faddp %st, %st(1) + FST 4 * SIZE(Y) + + FLD 5 * SIZE(X) + fmul %st(1),%st + FLD 5 * SIZE(Y) + faddp %st, %st(1) + FST 5 * SIZE(Y) + + FLD 6 * SIZE(X) + fmul %st(1),%st + FLD 6 * SIZE(Y) + faddp %st, %st(1) + FST 6 * SIZE(Y) + + FLD 7 * SIZE(X) + fmul %st(1),%st + FLD 7 * SIZE(Y) + faddp %st, %st(1) + FST 7 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + decq %rax + jg .L16 + ALIGN_3 + +.L15: + movq M, %rax + andq $7, %rax + jle .L40 + ALIGN_3 + +.L22: + FLD 0 * SIZE(X) + fmul %st(1),%st + FLD 0 * SIZE(Y) + faddp %st, %st(1) + FST 0 * SIZE(Y) + addq $SIZE, X + addq $SIZE, Y + decq %rax + jg .L22 + jmp .L40 + ALIGN_3 + +.L14: + movq M, %rax + sarq $2, %rax + jle .L28 + ALIGN_3 + +.L29: + FLD (X) + fmul %st(1),%st + FLD (Y) + faddp %st, %st(1) + FST (Y) + addq INCX, X + addq INCY, Y + + FLD (X) + fmul %st(1),%st + FLD (Y) + faddp %st, %st(1) + FST (Y) + addq INCX, X + addq INCY, Y + + FLD (X) + fmul %st(1),%st + FLD (Y) + faddp %st, %st(1) + FST (Y) + addq INCX, X + addq INCY, Y + + FLD (X) + fmul %st(1),%st + FLD (Y) + faddp %st, %st(1) + FST (Y) + addq INCX, X + addq INCY, Y + + decq %rax + jg .L29 + ALIGN_3 + +.L28: + movq M, %rax + andq $3, %rax + jle .L40 + ALIGN_3 + +.L35: + FLD (X) + fmul %st(1),%st + FLD (Y) + faddp %st, %st(1) + FST (Y) + addq INCX, X + addq INCY, Y + + decq %rax + jg .L35 + +.L40: + ffreep %st(0) + ret + + EPILOGUE diff --git a/kernel/x86_64/axpy_atom.S b/kernel/x86_64/axpy_atom.S new file mode 100644 index 0000000000..a786329e47 --- /dev/null +++ b/kernel/x86_64/axpy_atom.S @@ 
-0,0 +1,555 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef WINDOWS_ABI +#define M ARG1 +#define X ARG4 +#define INCX ARG5 +#define Y ARG6 +#define INCY ARG2 +#else +#define M ARG1 +#define X ARG2 +#define INCX ARG3 +#define Y ARG4 +#define INCY %r10 +#endif + +#define YY %r11 +#define ALPHA %xmm15 + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifndef WINDOWS_ABI +#ifndef XDOUBLE + movq 8(%rsp), INCY +#else + movq 24(%rsp), INCY +#endif + movaps %xmm0, ALPHA +#else + movaps %xmm3, ALPHA + + movq 40(%rsp), X + movq 48(%rsp), INCX + movq 56(%rsp), Y + movq 64(%rsp), INCY +#endif + + SAVEREGISTERS + + leaq (, INCX, SIZE), INCX + leaq (, INCY, SIZE), INCY + + testq M, M + jle .L29 + + cmpq $SIZE, INCX + jne .L20 + cmpq $SIZE, INCY + jne .L20 + + movq M, %rax + sarq $3, %rax + jle .L13 + + movsd 0 * SIZE(X), %xmm0 + movsd 1 * SIZE(X), %xmm1 + movsd 2 * SIZE(X), %xmm2 + movsd 3 * SIZE(X), %xmm3 + + movsd 0 * SIZE(Y), %xmm4 + movsd 1 * SIZE(Y), %xmm5 + movsd 2 * SIZE(Y), %xmm6 + movsd 3 * SIZE(Y), %xmm7 + + movsd 4 * SIZE(X), %xmm8 + mulsd ALPHA, %xmm0 + movsd 5 * SIZE(X), %xmm9 + mulsd ALPHA, %xmm1 + movsd 6 * SIZE(X), %xmm10 + mulsd ALPHA, %xmm2 + movsd 7 * SIZE(X), %xmm11 + mulsd ALPHA, %xmm3 + + decq %rax + jle .L12 + ALIGN_3 + +.L11: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + addsd %xmm4, %xmm0 + movsd 4 * SIZE(Y), %xmm4 + addsd %xmm5, %xmm1 + movsd 5 * SIZE(Y), %xmm5 + addsd %xmm6, %xmm2 + movsd 6 * SIZE(Y), %xmm6 + addsd %xmm7, %xmm3 + movsd 7 * SIZE(Y), %xmm7 + + movsd %xmm0, 0 * SIZE(Y) + mulsd ALPHA, %xmm8 + movsd 8 * SIZE(X), %xmm0 + + movsd %xmm1, 1 * SIZE(Y) + mulsd ALPHA, %xmm9 + movsd 9 * SIZE(X), %xmm1 + + movsd %xmm2, 2 * SIZE(Y) + mulsd ALPHA, %xmm10 + movsd 10 * SIZE(X), %xmm2 + + movsd %xmm3, 3 * SIZE(Y) + mulsd ALPHA, %xmm11 + movsd 11 * SIZE(X), %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + addsd %xmm4, %xmm8 + movsd 8 * SIZE(Y), %xmm4 + addsd %xmm5, %xmm9 + movsd 9 * SIZE(Y), %xmm5 + addsd %xmm6, %xmm10 + movsd 10 * SIZE(Y), %xmm6 + addsd %xmm7, %xmm11 + movsd 11 * SIZE(Y), %xmm7 + + movsd %xmm8, 4 * SIZE(Y) + mulsd ALPHA, %xmm0 + movsd 12 * SIZE(X), %xmm8 + + movsd %xmm9, 5 * SIZE(Y) + mulsd ALPHA, %xmm1 + movsd 13 * SIZE(X), %xmm9 + + movsd %xmm10, 6 * SIZE(Y) + mulsd ALPHA, %xmm2 + movsd 14 * SIZE(X), %xmm10 + + movsd %xmm11, 7 * SIZE(Y) + mulsd ALPHA, %xmm3 + movsd 15 * SIZE(X), %xmm11 + + addq $8 * SIZE, Y + addq $8 * SIZE, X + decq %rax + jg .L11 + ALIGN_3 + +.L12: + addsd %xmm4, %xmm0 + movsd 4 * SIZE(Y), %xmm4 + addsd %xmm5, %xmm1 + movsd 5 * SIZE(Y), %xmm5 + addsd %xmm6, %xmm2 + movsd 6 * SIZE(Y), %xmm6 + addsd %xmm7, %xmm3 + movsd 7 * SIZE(Y), %xmm7 + + movsd %xmm0, 0 * SIZE(Y) + mulsd ALPHA, %xmm8 + movsd %xmm1, 1 * SIZE(Y) + mulsd ALPHA, %xmm9 + movsd %xmm2, 2 * SIZE(Y) + mulsd ALPHA, %xmm10 + movsd %xmm3, 3 * SIZE(Y) + mulsd ALPHA, %xmm11 + + addsd %xmm4, %xmm8 + addsd %xmm5, %xmm9 + addsd %xmm6, %xmm10 + addsd %xmm7, %xmm11 + + movsd %xmm8, 4 * SIZE(Y) + movsd %xmm9, 5 * SIZE(Y) + movsd %xmm10, 6 * SIZE(Y) + movsd %xmm11, 7 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L13: + movq M, %rax + andq $4, %rax + jle .L15 + ALIGN_3 + + movsd 0 * SIZE(X), %xmm0 + movsd 1 * SIZE(X), %xmm1 + movsd 2 * SIZE(X), %xmm2 + movsd 3 * SIZE(X), %xmm3 + + movsd 0 * SIZE(Y), %xmm4 + mulsd ALPHA, %xmm0 + movsd 1 * SIZE(Y), %xmm5 + mulsd ALPHA, %xmm1 + movsd 2 * SIZE(Y), %xmm6 + mulsd ALPHA, %xmm2 + movsd 3 * SIZE(Y), 
%xmm7 + mulsd ALPHA, %xmm3 + + addsd %xmm4, %xmm0 + addsd %xmm5, %xmm1 + addsd %xmm6, %xmm2 + addsd %xmm7, %xmm3 + + movsd %xmm0, 0 * SIZE(Y) + movsd %xmm1, 1 * SIZE(Y) + movsd %xmm2, 2 * SIZE(Y) + movsd %xmm3, 3 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L15: + movq M, %rax + andq $2, %rax + jle .L16 + ALIGN_3 + + movsd 0 * SIZE(X), %xmm0 + movsd 0 * SIZE(Y), %xmm4 + movsd 1 * SIZE(X), %xmm1 + movsd 1 * SIZE(Y), %xmm5 + + mulsd ALPHA, %xmm0 + mulsd ALPHA, %xmm1 + addsd %xmm4, %xmm0 + addsd %xmm5, %xmm1 + + movsd %xmm0, 0 * SIZE(Y) + movsd %xmm1, 1 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L16: + movq M, %rax + andq $1, %rax + jle .L19 + ALIGN_3 + + movsd 0 * SIZE(X), %xmm0 + mulsd ALPHA, %xmm0 + addsd 0 * SIZE(Y), %xmm0 + + movsd %xmm0, 0 * SIZE(Y) + addq $SIZE, Y + ALIGN_3 + +.L19: + xorq %rax,%rax + + RESTOREREGISTERS + + ret + ALIGN_3 + +.L20: + movq Y, YY + + movq M, %rax + sarq $3, %rax + jle .L23 + + movsd (X), %xmm0 + addq INCX, X + movsd (X), %xmm1 + addq INCX, X + movsd (X), %xmm2 + addq INCX, X + movsd (X), %xmm3 + addq INCX, X + + movsd (Y), %xmm4 + addq INCY, Y + movsd (Y), %xmm5 + addq INCY, Y + movsd (Y), %xmm6 + addq INCY, Y + movsd (Y), %xmm7 + addq INCY, Y + + movsd (X), %xmm8 + addq INCX, X + mulsd ALPHA, %xmm0 + movsd (X), %xmm9 + addq INCX, X + mulsd ALPHA, %xmm1 + movsd (X), %xmm10 + addq INCX, X + mulsd ALPHA, %xmm2 + movsd (X), %xmm11 + addq INCX, X + mulsd ALPHA, %xmm3 + + decq %rax + jle .L22 + ALIGN_3 + +.L21: + addsd %xmm4, %xmm0 + movsd (Y), %xmm4 + addq INCY, Y + addsd %xmm5, %xmm1 + movsd (Y), %xmm5 + addq INCY, Y + + addsd %xmm6, %xmm2 + movsd (Y), %xmm6 + addq INCY, Y + addsd %xmm7, %xmm3 + movsd (Y), %xmm7 + addq INCY, Y + + movsd %xmm0, (YY) + addq INCY, YY + movsd (X), %xmm0 + addq INCX, X + mulsd ALPHA, %xmm8 + + movsd %xmm1, (YY) + addq INCY, YY + movsd (X), %xmm1 + addq INCX, X + mulsd ALPHA, %xmm9 + + movsd %xmm2, (YY) + addq INCY, YY + movsd (X), %xmm2 + addq INCX, X + mulsd ALPHA, %xmm10 + + movsd %xmm3, (YY) + addq INCY, YY + movsd (X), %xmm3 + addq INCX, X + mulsd ALPHA, %xmm11 + + addsd %xmm4, %xmm8 + movsd (Y), %xmm4 + addq INCY, Y + addsd %xmm5, %xmm9 + movsd (Y), %xmm5 + addq INCY, Y + + addsd %xmm6, %xmm10 + movsd (Y), %xmm6 + addq INCY, Y + addsd %xmm7, %xmm11 + movsd (Y), %xmm7 + addq INCY, Y + + movsd %xmm8, (YY) + addq INCY, YY + movsd (X), %xmm8 + addq INCX, X + mulsd ALPHA, %xmm0 + + movsd %xmm9, (YY) + addq INCY, YY + movsd (X), %xmm9 + addq INCX, X + mulsd ALPHA, %xmm1 + + movsd %xmm10, (YY) + addq INCY, YY + movsd (X), %xmm10 + addq INCX, X + mulsd ALPHA, %xmm2 + + movsd %xmm11, (YY) + addq INCY, YY + movsd (X), %xmm11 + addq INCX, X + mulsd ALPHA, %xmm3 + + decq %rax + jg .L21 + ALIGN_3 + +.L22: + addsd %xmm4, %xmm0 + movsd (Y), %xmm4 + addq INCY, Y + addsd %xmm5, %xmm1 + movsd (Y), %xmm5 + addq INCY, Y + addsd %xmm6, %xmm2 + movsd (Y), %xmm6 + addq INCY, Y + addsd %xmm7, %xmm3 + movsd (Y), %xmm7 + addq INCY, Y + + movsd %xmm0, (YY) + addq INCY, YY + mulsd ALPHA, %xmm8 + + movsd %xmm1, (YY) + addq INCY, YY + mulsd ALPHA, %xmm9 + + movsd %xmm2, (YY) + addq INCY, YY + mulsd ALPHA, %xmm10 + + movsd %xmm3, (YY) + addq INCY, YY + mulsd ALPHA, %xmm11 + + addsd %xmm4, %xmm8 + addsd %xmm5, %xmm9 + addsd %xmm6, %xmm10 + addsd %xmm7, %xmm11 + + movsd %xmm8, (YY) + addq INCY, YY + movsd %xmm9, (YY) + addq INCY, YY + movsd %xmm10, (YY) + addq INCY, YY + movsd %xmm11, (YY) + addq INCY, YY + ALIGN_3 + +.L23: + movq M, %rax + andq $4, %rax + jle .L25 + ALIGN_3 + + movsd (X), %xmm0 + addq INCX, X + 
movsd (Y), %xmm4 + addq INCY, Y + movsd (X), %xmm1 + addq INCX, X + movsd (Y), %xmm5 + addq INCY, Y + + movsd (X), %xmm2 + addq INCX, X + mulsd ALPHA, %xmm0 + movsd (Y), %xmm6 + addq INCY, Y + mulsd ALPHA, %xmm1 + movsd (X), %xmm3 + addq INCX, X + mulsd ALPHA, %xmm2 + movsd (Y), %xmm7 + addq INCY, Y + mulsd ALPHA, %xmm3 + + addsd %xmm4, %xmm0 + addsd %xmm5, %xmm1 + addsd %xmm6, %xmm2 + addsd %xmm7, %xmm3 + + movsd %xmm0, (YY) + addq INCY, YY + movsd %xmm1, (YY) + addq INCY, YY + movsd %xmm2, (YY) + addq INCY, YY + movsd %xmm3, (YY) + addq INCY, YY + ALIGN_3 + +.L25: + movq M, %rax + andq $2, %rax + jle .L26 + ALIGN_3 + + movsd (X), %xmm0 + addq INCX, X + movsd (Y), %xmm4 + addq INCY, Y + movsd (X), %xmm1 + addq INCX, X + movsd (Y), %xmm5 + addq INCY, Y + + mulsd ALPHA, %xmm0 + mulsd ALPHA, %xmm1 + addsd %xmm4, %xmm0 + addsd %xmm5, %xmm1 + + movsd %xmm0, (YY) + addq INCY, YY + movsd %xmm1, (YY) + addq INCY, YY + ALIGN_3 + +.L26: + movq M, %rax + andq $1, %rax + jle .L29 + ALIGN_3 + + movsd (X), %xmm0 + mulsd ALPHA, %xmm0 + addsd (Y), %xmm0 + + movsd %xmm0, (YY) + addq $SIZE, Y + ALIGN_3 + +.L29: + xorq %rax, %rax + + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/axpy_sse.S b/kernel/x86_64/axpy_sse.S new file mode 100644 index 0000000000..23c2ec54e6 --- /dev/null +++ b/kernel/x86_64/axpy_sse.S @@ -0,0 +1,1576 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
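
For reference (not part of the imported code): axpy.S (x87) and axpy_atom.S above both implement the axpy contract y := alpha * x + y over strided vectors; the Atom variant processes eight elements per iteration with scalar SSE2 operations, software-pipelining the loads and multiplies for the next group. A minimal C sketch of that contract, using the illustrative name ref_daxpy (not part of the patch), would be:

/* Sketch of the daxpy contract: y[i] += alpha * x[i] for n elements with
 * strides incx and incy.  ref_daxpy is an illustrative name, not part of
 * the patch; negative-stride adjustment is left to the interface layer,
 * as in the kernels above. */
static void ref_daxpy(long n, double alpha,
                      const double *x, long incx, double *y, long incy)
{
    long i;

    if (n <= 0) return;

    if (incx == 1 && incy == 1) {
        for (i = 0; i + 4 <= n; i += 4) {  /* unrolled contiguous loop */
            y[i    ] += alpha * x[i    ];
            y[i + 1] += alpha * x[i + 1];
            y[i + 2] += alpha * x[i + 2];
            y[i + 3] += alpha * x[i + 3];
        }
        for (; i < n; i++) y[i] += alpha * x[i];
    } else {                               /* general strided path */
        for (i = 0; i < n; i++) y[i * incy] += alpha * x[i * incx];
    }
}
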
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef WINDOWS_ABI +#define M ARG1 +#define X ARG4 +#define INCX ARG5 +#define Y ARG6 +#define INCY ARG2 +#else +#define M ARG1 +#define X ARG2 +#define INCX ARG3 +#define Y ARG4 +#define INCY %r10 +#endif + +#define YY %r11 +#define ALPHA %xmm15 + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifndef WINDOWS_ABI +#ifndef XDOUBLE + movq 8(%rsp), INCY +#else + movq 24(%rsp), INCY +#endif + movaps %xmm0, ALPHA +#else + movaps %xmm3, ALPHA + + movq 40(%rsp), X + movq 48(%rsp), INCX + movq 56(%rsp), Y + movq 64(%rsp), INCY +#endif + + SAVEREGISTERS + + shufps $0, ALPHA, ALPHA + + leaq (, INCX, SIZE), INCX + leaq (, INCY, SIZE), INCY + + testq M, M + jle .L19 + + cmpq $SIZE, INCX + jne .L50 + cmpq $SIZE, INCY + jne .L50 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + + cmpq $3, M + jle .L16 + + testq $SIZE, Y + je .L00 + + movss -32 * SIZE(X), %xmm0 + mulss ALPHA, %xmm0 + addss -32 * SIZE(Y), %xmm0 + movss %xmm0, -32 * SIZE(Y) + addq $1 * SIZE, X + addq $1 * SIZE, Y + decq M + jle .L19 + ALIGN_3 + +.L00: + testq $SIZE * 2, Y + je .L10 + + movsd -32 * SIZE(X), %xmm0 + movsd -32 * SIZE(Y), %xmm4 + mulps ALPHA, %xmm0 + addps %xmm4, %xmm0 + movsd %xmm0, -32 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + subq $2, M + jle .L19 + ALIGN_3 + +.L10: + testq $SIZE * 3, X + jne .L20 + + movq M, %rax + sarq $5, %rax + jle .L13 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + movaps -20 * SIZE(X), %xmm3 + + decq %rax + jle .L12 + ALIGN_4 + +.L11: + movaps -16 * SIZE(X), %xmm4 + movaps -12 * SIZE(X), %xmm5 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -8 * SIZE(X), %xmm6 + movaps -4 * SIZE(X), %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movaps 0 * SIZE(X), %xmm0 + movaps 4 * SIZE(X), %xmm1 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + mulps ALPHA, %xmm4 + addps -16 * SIZE(Y), %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + mulps ALPHA, %xmm5 + addps -12 * SIZE(Y), %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + movaps 8 * SIZE(X), %xmm2 + movaps 12 * SIZE(X), %xmm3 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + mulps ALPHA, %xmm6 + addps -8 * SIZE(Y), %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + mulps ALPHA, %xmm7 + addps -4 * SIZE(Y), %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + decq %rax + jg .L11 + ALIGN_3 + +.L12: + movaps -16 * SIZE(X), %xmm4 + movaps -12 * SIZE(X), %xmm5 + + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -8 * SIZE(X), %xmm6 + movaps -4 * SIZE(X), %xmm7 + + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + mulps ALPHA, %xmm4 + addps -16 * SIZE(Y), %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + mulps ALPHA, %xmm5 + addps -12 * SIZE(Y), %xmm5 + movaps %xmm5, 
-12 * SIZE(Y) + + mulps ALPHA, %xmm6 + addps -8 * SIZE(Y), %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + mulps ALPHA, %xmm7 + addps -4 * SIZE(Y), %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L13: + movq M, %rax + andq $16, %rax + jle .L14 + ALIGN_3 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + movaps -20 * SIZE(X), %xmm3 + + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, -24 * SIZE(Y) + movaps %xmm3, -20 * SIZE(Y) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L14: + movq M, %rax + andq $8, %rax + jle .L15 + ALIGN_3 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L15: + movq M, %rax + andq $4, %rax + jle .L16 + ALIGN_3 + + movaps -32 * SIZE(X), %xmm0 + + mulps ALPHA, %xmm0 + + addps -32 * SIZE(Y), %xmm0 + + movaps %xmm0, -32 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L16: + movq M, %rax + andq $2, %rax + jle .L17 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + movsd -32 * SIZE(Y), %xmm4 + + mulps ALPHA, %xmm0 + addps %xmm4, %xmm0 + + movsd %xmm0, -32 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L17: + movq M, %rax + andq $1, %rax + jle .L19 + ALIGN_3 + + movss -32 * SIZE(X), %xmm0 + mulss ALPHA, %xmm0 + addss -32 * SIZE(Y), %xmm0 + + movss %xmm0, -32 * SIZE(Y) + ALIGN_3 + +.L19: + xorq %rax,%rax + + RESTOREREGISTERS + + ret + ALIGN_3 + +.L20: + +#ifdef ALIGNED_ACCESS + + testq $SIZE, X + jne .L30 + + movhps -32 * SIZE(X), %xmm0 + + movq M, %rax + sarq $5, %rax + jle .L23 + + movaps -30 * SIZE(X), %xmm1 + movaps -26 * SIZE(X), %xmm2 + movaps -22 * SIZE(X), %xmm3 + movaps -18 * SIZE(X), %xmm4 + + decq %rax + jle .L22 + ALIGN_4 + +.L21: + movaps -14 * SIZE(X), %xmm5 + movaps -10 * SIZE(X), %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + SHUFPD_1 %xmm1, %xmm0 + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + SHUFPD_1 %xmm2, %xmm1 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -6 * SIZE(X), %xmm7 + movaps -2 * SIZE(X), %xmm0 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + SHUFPD_1 %xmm3, %xmm2 + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + SHUFPD_1 %xmm4, %xmm3 + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movaps 2 * SIZE(X), %xmm1 + movaps 6 * SIZE(X), %xmm2 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + SHUFPD_1 %xmm5, %xmm4 + mulps ALPHA, %xmm4 + addps -16 * SIZE(Y), %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + SHUFPD_1 %xmm6, %xmm5 + mulps ALPHA, %xmm5 + addps -12 * SIZE(Y), %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + movaps 10 * SIZE(X), %xmm3 + movaps 14 * SIZE(X), %xmm4 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + SHUFPD_1 %xmm7, %xmm6 + mulps ALPHA, %xmm6 + addps -8 * SIZE(Y), %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + SHUFPD_1 %xmm0, %xmm7 + mulps ALPHA, %xmm7 + addps -4 * 
SIZE(Y), %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + decq %rax + jg .L21 + ALIGN_3 + +.L22: + movaps -14 * SIZE(X), %xmm5 + movaps -10 * SIZE(X), %xmm6 + + SHUFPD_1 %xmm1, %xmm0 + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + SHUFPD_1 %xmm2, %xmm1 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -6 * SIZE(X), %xmm7 + movaps -2 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm3, %xmm2 + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + SHUFPD_1 %xmm4, %xmm3 + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + SHUFPD_1 %xmm5, %xmm4 + mulps ALPHA, %xmm4 + addps -16 * SIZE(Y), %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + SHUFPD_1 %xmm6, %xmm5 + mulps ALPHA, %xmm5 + addps -12 * SIZE(Y), %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + SHUFPD_1 %xmm7, %xmm6 + mulps ALPHA, %xmm6 + addps -8 * SIZE(Y), %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + SHUFPD_1 %xmm0, %xmm7 + mulps ALPHA, %xmm7 + addps -4 * SIZE(Y), %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L23: + movq M, %rax + andq $16, %rax + jle .L24 + ALIGN_3 + + movaps -30 * SIZE(X), %xmm1 + movaps -26 * SIZE(X), %xmm2 + movaps -22 * SIZE(X), %xmm3 + movaps -18 * SIZE(X), %xmm4 + + SHUFPD_1 %xmm1, %xmm0 + SHUFPD_1 %xmm2, %xmm1 + SHUFPD_1 %xmm3, %xmm2 + SHUFPD_1 %xmm4, %xmm3 + + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, -24 * SIZE(Y) + movaps %xmm3, -20 * SIZE(Y) + + movaps %xmm4, %xmm0 + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L24: + movq M, %rax + andq $8, %rax + jle .L25 + ALIGN_3 + + movaps -30 * SIZE(X), %xmm1 + movaps -26 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm1, %xmm0 + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + SHUFPD_1 %xmm2, %xmm1 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, %xmm0 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L25: + movq M, %rax + andq $4, %rax + jle .L26 + ALIGN_3 + + movaps -30 * SIZE(X), %xmm1 + + SHUFPD_1 %xmm1, %xmm0 + mulps ALPHA, %xmm0 + + addps -32 * SIZE(Y), %xmm0 + + movaps %xmm0, -32 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L26: + movq M, %rax + andq $2, %rax + jle .L27 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + movsd -32 * SIZE(Y), %xmm4 + + mulps ALPHA, %xmm0 + addps %xmm4, %xmm0 + + movsd %xmm0, -32 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L27: + movq M, %rax + andq $1, %rax + jle .L29 + ALIGN_3 + + movss -32 * SIZE(X), %xmm0 + mulss ALPHA, %xmm0 + addss -32 * SIZE(Y), %xmm0 + + movss %xmm0, -32 * SIZE(Y) + addq $SIZE, Y + ALIGN_3 + +.L29: + xorq %rax,%rax + + RESTOREREGISTERS + + ret + ALIGN_3 + +.L30: + testq $2 * SIZE, X + jne .L40 + + movaps -33 * SIZE(X), %xmm0 + + movq M, %rax + sarq $5, %rax + jle .L33 + + movaps -29 * SIZE(X), %xmm1 + movaps -25 * SIZE(X), %xmm2 + movaps -21 * SIZE(X), %xmm3 + movaps -17 * SIZE(X), %xmm4 + + decq %rax + jle .L32 + ALIGN_4 + +.L31: + movaps -13 * SIZE(X), %xmm5 + movaps -9 * SIZE(X), %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) 
+ + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -5 * SIZE(X), %xmm7 + movaps -1 * SIZE(X), %xmm0 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm3, %xmm2 + SHUFPS_39 %xmm2, %xmm2 + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm4, %xmm3 + SHUFPS_39 %xmm3, %xmm3 + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movaps 3 * SIZE(X), %xmm1 + movaps 7 * SIZE(X), %xmm2 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm5, %xmm4 + SHUFPS_39 %xmm4, %xmm4 + mulps ALPHA, %xmm4 + addps -16 * SIZE(Y), %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + movss %xmm6, %xmm5 + SHUFPS_39 %xmm5, %xmm5 + mulps ALPHA, %xmm5 + addps -12 * SIZE(Y), %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + movaps 11 * SIZE(X), %xmm3 + movaps 15 * SIZE(X), %xmm4 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm7, %xmm6 + SHUFPS_39 %xmm6, %xmm6 + mulps ALPHA, %xmm6 + addps -8 * SIZE(Y), %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + movss %xmm0, %xmm7 + SHUFPS_39 %xmm7, %xmm7 + mulps ALPHA, %xmm7 + addps -4 * SIZE(Y), %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + decq %rax + jg .L31 + ALIGN_3 + +.L32: + movaps -13 * SIZE(X), %xmm5 + movaps -9 * SIZE(X), %xmm6 + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -5 * SIZE(X), %xmm7 + movaps -1 * SIZE(X), %xmm0 + + movss %xmm3, %xmm2 + SHUFPS_39 %xmm2, %xmm2 + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm4, %xmm3 + SHUFPS_39 %xmm3, %xmm3 + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movss %xmm5, %xmm4 + SHUFPS_39 %xmm4, %xmm4 + mulps ALPHA, %xmm4 + addps -16 * SIZE(Y), %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + movss %xmm6, %xmm5 + SHUFPS_39 %xmm5, %xmm5 + mulps ALPHA, %xmm5 + addps -12 * SIZE(Y), %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + movss %xmm7, %xmm6 + SHUFPS_39 %xmm6, %xmm6 + mulps ALPHA, %xmm6 + addps -8 * SIZE(Y), %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + movss %xmm0, %xmm7 + SHUFPS_39 %xmm7, %xmm7 + mulps ALPHA, %xmm7 + addps -4 * SIZE(Y), %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L33: + movq M, %rax + andq $16, %rax + jle .L34 + ALIGN_3 + + movaps -29 * SIZE(X), %xmm1 + movaps -25 * SIZE(X), %xmm2 + movaps -21 * SIZE(X), %xmm3 + movaps -17 * SIZE(X), %xmm4 + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + + movss %xmm3, %xmm2 + SHUFPS_39 %xmm2, %xmm2 + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + + movss %xmm4, %xmm3 + SHUFPS_39 %xmm3, %xmm3 + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, -24 * SIZE(Y) + movaps %xmm3, -20 * SIZE(Y) + + movaps %xmm4, %xmm0 + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L34: + movq M, %rax + andq $8, %rax + jle .L35 + ALIGN_3 + + movaps -29 * SIZE(X), %xmm1 + movaps -25 * SIZE(X), %xmm2 + + movss %xmm1, %xmm0 + 
SHUFPS_39 %xmm0, %xmm0 + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, %xmm0 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L35: + movq M, %rax + andq $4, %rax + jle .L36 + ALIGN_3 + + movaps -29 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + mulps ALPHA, %xmm0 + + addps -32 * SIZE(Y), %xmm0 + + movaps %xmm0, -32 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L36: + movq M, %rax + andq $2, %rax + jle .L37 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + movsd -32 * SIZE(Y), %xmm4 + + mulps ALPHA, %xmm0 + addps %xmm4, %xmm0 + + movsd %xmm0, -32 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L37: + movq M, %rax + andq $1, %rax + jle .L39 + ALIGN_3 + + movss -32 * SIZE(X), %xmm0 + mulss ALPHA, %xmm0 + addss -32 * SIZE(Y), %xmm0 + + movss %xmm0, -32 * SIZE(Y) + addq $SIZE, Y + ALIGN_3 + +.L39: + xorq %rax,%rax + + RESTOREREGISTERS + + ret + ALIGN_3 + +.L40: + movaps -35 * SIZE(X), %xmm0 + + movq M, %rax + sarq $5, %rax + jle .L43 + + movaps -31 * SIZE(X), %xmm1 + movaps -27 * SIZE(X), %xmm2 + movaps -23 * SIZE(X), %xmm3 + movaps -19 * SIZE(X), %xmm4 + + decq %rax + jle .L42 + ALIGN_4 + +.L41: + movaps -15 * SIZE(X), %xmm5 + movaps -11 * SIZE(X), %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -7 * SIZE(X), %xmm7 + movaps -3 * SIZE(X), %xmm0 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movaps 1 * SIZE(X), %xmm1 + movaps 5 * SIZE(X), %xmm2 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + mulps ALPHA, %xmm4 + addps -16 * SIZE(Y), %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + mulps ALPHA, %xmm5 + addps -12 * SIZE(Y), %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + movaps 9 * SIZE(X), %xmm3 + movaps 13 * SIZE(X), %xmm4 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + mulps ALPHA, %xmm6 + addps -8 * SIZE(Y), %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + movss %xmm0, %xmm7 + shufps $0x93, %xmm0, %xmm7 + mulps ALPHA, %xmm7 + addps -4 * SIZE(Y), %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + decq %rax + jg .L41 + ALIGN_3 + +.L42: + movaps -15 * SIZE(X), %xmm5 + movaps -11 * SIZE(X), %xmm6 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -7 * SIZE(X), %xmm7 + movaps -3 * SIZE(X), %xmm0 + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), 
%xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + mulps ALPHA, %xmm4 + addps -16 * SIZE(Y), %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + mulps ALPHA, %xmm5 + addps -12 * SIZE(Y), %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + mulps ALPHA, %xmm6 + addps -8 * SIZE(Y), %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + movss %xmm0, %xmm7 + shufps $0x93, %xmm0, %xmm7 + mulps ALPHA, %xmm7 + addps -4 * SIZE(Y), %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L43: + movq M, %rax + andq $16, %rax + jle .L44 + ALIGN_3 + + movaps -31 * SIZE(X), %xmm1 + movaps -27 * SIZE(X), %xmm2 + movaps -23 * SIZE(X), %xmm3 + movaps -19 * SIZE(X), %xmm4 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, -24 * SIZE(Y) + movaps %xmm3, -20 * SIZE(Y) + + movaps %xmm4, %xmm0 + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L44: + movq M, %rax + andq $8, %rax + jle .L45 + ALIGN_3 + + movaps -31 * SIZE(X), %xmm1 + movaps -27 * SIZE(X), %xmm2 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, %xmm0 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L45: + movq M, %rax + andq $4, %rax + jle .L46 + ALIGN_3 + + movaps -31 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + mulps ALPHA, %xmm0 + + addps -32 * SIZE(Y), %xmm0 + + movaps %xmm0, -32 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L46: + movq M, %rax + andq $2, %rax + jle .L47 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + movsd -32 * SIZE(Y), %xmm4 + + mulps ALPHA, %xmm0 + addps %xmm4, %xmm0 + + movsd %xmm0, -32 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L47: + movq M, %rax + andq $1, %rax + jle .L49 + ALIGN_3 + + movss -32 * SIZE(X), %xmm0 + mulss ALPHA, %xmm0 + addss -32 * SIZE(Y), %xmm0 + + movss %xmm0, -32 * SIZE(Y) + addq $SIZE, Y + ALIGN_3 + +.L49: + xorq %rax,%rax + + RESTOREREGISTERS + + ret + +#else + + movq M, %rax + sarq $5, %rax + jle .L23 + + movsd -32 * SIZE(X), %xmm0 + movhps -30 * SIZE(X), %xmm0 + movsd -28 * SIZE(X), %xmm1 + movhps -26 * SIZE(X), %xmm1 + movsd -24 * SIZE(X), %xmm2 + movhps -22 * SIZE(X), %xmm2 + movsd -20 * SIZE(X), %xmm3 + movhps -18 * SIZE(X), %xmm3 + + decq %rax + jle .L22 + ALIGN_4 + +.L21: + movsd -16 * SIZE(X), %xmm4 + movhps -14 * SIZE(X), %xmm4 + movsd -12 * SIZE(X), %xmm5 + movhps -10 * SIZE(X), %xmm5 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movsd -8 * SIZE(X), %xmm6 + movhps -6 * SIZE(X), %xmm6 + movsd -4 * SIZE(X), 
%xmm7 + movhps -2 * SIZE(X), %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movsd 0 * SIZE(X), %xmm0 + movhps 2 * SIZE(X), %xmm0 + movsd 4 * SIZE(X), %xmm1 + movhps 6 * SIZE(X), %xmm1 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + mulps ALPHA, %xmm4 + addps -16 * SIZE(Y), %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + mulps ALPHA, %xmm5 + addps -12 * SIZE(Y), %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + movsd 8 * SIZE(X), %xmm2 + movhps 10 * SIZE(X), %xmm2 + movsd 12 * SIZE(X), %xmm3 + movhps 14 * SIZE(X), %xmm3 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + mulps ALPHA, %xmm6 + addps -8 * SIZE(Y), %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + mulps ALPHA, %xmm7 + addps -4 * SIZE(Y), %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + decq %rax + jg .L21 + ALIGN_3 + +.L22: + movsd -16 * SIZE(X), %xmm4 + movhps -14 * SIZE(X), %xmm4 + movsd -12 * SIZE(X), %xmm5 + movhps -10 * SIZE(X), %xmm5 + + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movsd -8 * SIZE(X), %xmm6 + movhps -6 * SIZE(X), %xmm6 + movsd -4 * SIZE(X), %xmm7 + movhps -2 * SIZE(X), %xmm7 + + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + mulps ALPHA, %xmm4 + addps -16 * SIZE(Y), %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + mulps ALPHA, %xmm5 + addps -12 * SIZE(Y), %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + mulps ALPHA, %xmm6 + addps -8 * SIZE(Y), %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + mulps ALPHA, %xmm7 + addps -4 * SIZE(Y), %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L23: + movq M, %rax + andq $16, %rax + jle .L24 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + movhps -30 * SIZE(X), %xmm0 + movsd -28 * SIZE(X), %xmm1 + movhps -26 * SIZE(X), %xmm1 + + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movsd -24 * SIZE(X), %xmm2 + movhps -22 * SIZE(X), %xmm2 + movsd -20 * SIZE(X), %xmm3 + movhps -18 * SIZE(X), %xmm3 + + mulps ALPHA, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + mulps ALPHA, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L24: + movq M, %rax + andq $8, %rax + jle .L25 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + movhps -30 * SIZE(X), %xmm0 + movsd -28 * SIZE(X), %xmm1 + movhps -26 * SIZE(X), %xmm1 + + mulps ALPHA, %xmm0 + addps -32 * SIZE(Y), %xmm0 + mulps ALPHA, %xmm1 + addps -28 * SIZE(Y), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L25: + movq M, %rax + andq $4, %rax + jle .L26 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + movhps -30 * SIZE(X), %xmm0 + + mulps ALPHA, %xmm0 + + addps -32 * SIZE(Y), %xmm0 + + movaps %xmm0, -32 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L26: + movq M, %rax + andq $2, %rax + jle .L27 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + movsd -32 * SIZE(Y), %xmm4 + + mulps ALPHA, %xmm0 + addps %xmm4, 
%xmm0 + + movsd %xmm0, -32 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L27: + movq M, %rax + andq $1, %rax + jle .L29 + ALIGN_3 + + movss -32 * SIZE(X), %xmm0 + mulss ALPHA, %xmm0 + addss -32 * SIZE(Y), %xmm0 + + movss %xmm0, -32 * SIZE(Y) + addq $SIZE, Y + ALIGN_3 + +.L29: + xorq %rax,%rax + + RESTOREREGISTERS + + ret +#endif + ALIGN_3 + + +.L50: + movq M, %rax + movq Y, YY + sarq $3, %rax + jle .L55 + ALIGN_3 + +.L51: + movss (X), %xmm0 + addq INCX, X + mulss ALPHA, %xmm0 + movss (YY), %xmm6 + addq INCY, YY + addss %xmm6, %xmm0 + + movss (X), %xmm1 + addq INCX, X + mulss ALPHA, %xmm1 + movss (YY), %xmm6 + addq INCY, YY + addss %xmm6, %xmm1 + + movss (X), %xmm2 + addq INCX, X + mulss ALPHA, %xmm2 + movss (YY), %xmm6 + addq INCY, YY + addss %xmm6, %xmm2 + + movss (X), %xmm3 + addq INCX, X + mulss ALPHA, %xmm3 + movss (YY), %xmm6 + addq INCY, YY + addss %xmm6, %xmm3 + + movss %xmm0, (Y) + addq INCY, Y + movss %xmm1, (Y) + addq INCY, Y + movss %xmm2, (Y) + addq INCY, Y + movss %xmm3, (Y) + addq INCY, Y + + movss (X), %xmm0 + addq INCX, X + mulss ALPHA, %xmm0 + movss (YY), %xmm6 + addq INCY, YY + addss %xmm6, %xmm0 + + movss (X), %xmm1 + addq INCX, X + mulss ALPHA, %xmm1 + movss (YY), %xmm6 + addq INCY, YY + addss %xmm6, %xmm1 + + movss (X), %xmm2 + addq INCX, X + mulss ALPHA, %xmm2 + movss (YY), %xmm6 + addq INCY, YY + addss %xmm6, %xmm2 + + movss (X), %xmm3 + addq INCX, X + mulss ALPHA, %xmm3 + movss (YY), %xmm6 + addq INCY, YY + addss %xmm6, %xmm3 + + movss %xmm0, (Y) + addq INCY, Y + movss %xmm1, (Y) + addq INCY, Y + movss %xmm2, (Y) + addq INCY, Y + movss %xmm3, (Y) + addq INCY, Y + + decq %rax + jg .L51 + ALIGN_3 + +.L55: + movq M, %rax + andq $7, %rax + jle .L59 + ALIGN_3 + +.L56: + movss (X), %xmm0 + addq INCX, X + mulss ALPHA, %xmm0 + movss (Y), %xmm6 + addss %xmm6, %xmm0 + movss %xmm0, (Y) + addq INCY, Y + decq %rax + jg .L56 + ALIGN_3 + +.L59: + xorq %rax,%rax + + RESTOREREGISTERS + + ret + ALIGN_3 + + + EPILOGUE diff --git a/kernel/x86_64/axpy_sse2.S b/kernel/x86_64/axpy_sse2.S new file mode 100644 index 0000000000..5546029177 --- /dev/null +++ b/kernel/x86_64/axpy_sse2.S @@ -0,0 +1,906 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef WINDOWS_ABI +#define M ARG1 +#define X ARG4 +#define INCX ARG5 +#define Y ARG6 +#define INCY ARG2 +#else +#define M ARG1 +#define X ARG2 +#define INCX ARG3 +#define Y ARG4 +#define INCY %r10 +#endif + +#define YY %r11 +#define ALPHA %xmm15 + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifndef WINDOWS_ABI +#ifndef XDOUBLE + movq 8(%rsp), INCY +#else + movq 24(%rsp), INCY +#endif + movaps %xmm0, ALPHA +#else + movaps %xmm3, ALPHA + + movq 40(%rsp), X + movq 48(%rsp), INCX + movq 56(%rsp), Y + movq 64(%rsp), INCY +#endif + + SAVEREGISTERS + + unpcklpd ALPHA, ALPHA + + leaq (, INCX, SIZE), INCX + leaq (, INCY, SIZE), INCY + + testq M, M + jle .L47 + + cmpq $SIZE, INCX + jne .L40 + cmpq $SIZE, INCY + jne .L40 + + testq $SIZE, Y + je .L10 + + movsd (X), %xmm0 + mulsd ALPHA, %xmm0 + addsd (Y), %xmm0 + movsd %xmm0, (Y) + addq $1 * SIZE, X + addq $1 * SIZE, Y + decq M + jle .L19 + ALIGN_4 + +.L10: + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + + testq $SIZE, X + jne .L20 + + movq M, %rax + sarq $4, %rax + jle .L13 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + movaps -12 * SIZE(X), %xmm2 + movaps -10 * SIZE(X), %xmm3 + + decq %rax + jle .L12 + ALIGN_3 + +.L11: + movaps -8 * SIZE(X), %xmm4 + movaps -6 * SIZE(X), %xmm5 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + mulpd ALPHA, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + mulpd ALPHA, %xmm1 + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + movaps -4 * SIZE(X), %xmm6 + movaps -2 * SIZE(X), %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + mulpd ALPHA, %xmm2 + addpd -12 * SIZE(Y), %xmm2 + movaps %xmm2, -12 * SIZE(Y) + + mulpd ALPHA, %xmm3 + addpd -10 * SIZE(Y), %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + movaps 0 * SIZE(X), %xmm0 + movaps 2 * SIZE(X), %xmm1 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + mulpd ALPHA, %xmm4 + addpd -8 * SIZE(Y), %xmm4 + movaps %xmm4, -8 * SIZE(Y) + + mulpd ALPHA, %xmm5 + addpd -6 * SIZE(Y), %xmm5 + movaps %xmm5, -6 * SIZE(Y) + + movaps 4 * SIZE(X), %xmm2 + movaps 6 * SIZE(X), %xmm3 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + mulpd ALPHA, %xmm6 + addpd -4 * SIZE(Y), %xmm6 + movaps %xmm6, -4 * SIZE(Y) + + mulpd ALPHA, %xmm7 + addpd -2 * SIZE(Y), %xmm7 + movaps %xmm7, -2 * SIZE(Y) + + subq $-16 * SIZE, Y + subq $-16 * SIZE, X + decq %rax + jg .L11 + ALIGN_3 + +.L12: + movaps -8 * SIZE(X), %xmm4 + movaps -6 * SIZE(X), %xmm5 + + 
mulpd ALPHA, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + mulpd ALPHA, %xmm1 + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + movaps -4 * SIZE(X), %xmm6 + movaps -2 * SIZE(X), %xmm7 + + mulpd ALPHA, %xmm2 + addpd -12 * SIZE(Y), %xmm2 + movaps %xmm2, -12 * SIZE(Y) + + mulpd ALPHA, %xmm3 + addpd -10 * SIZE(Y), %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + mulpd ALPHA, %xmm4 + addpd -8 * SIZE(Y), %xmm4 + movaps %xmm4, -8 * SIZE(Y) + + mulpd ALPHA, %xmm5 + addpd -6 * SIZE(Y), %xmm5 + movaps %xmm5, -6 * SIZE(Y) + + mulpd ALPHA, %xmm6 + addpd -4 * SIZE(Y), %xmm6 + movaps %xmm6, -4 * SIZE(Y) + + mulpd ALPHA, %xmm7 + addpd -2 * SIZE(Y), %xmm7 + movaps %xmm7, -2 * SIZE(Y) + + subq $-16 * SIZE, Y + subq $-16 * SIZE, X + ALIGN_3 + +.L13: + movq M, %rax + andq $8, %rax + jle .L14 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + movaps -12 * SIZE(X), %xmm2 + movaps -10 * SIZE(X), %xmm3 + + mulpd ALPHA, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + mulpd ALPHA, %xmm1 + addpd -14 * SIZE(Y), %xmm1 + mulpd ALPHA, %xmm2 + addpd -12 * SIZE(Y), %xmm2 + mulpd ALPHA, %xmm3 + addpd -10 * SIZE(Y), %xmm3 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -14 * SIZE(Y) + movaps %xmm2, -12 * SIZE(Y) + movaps %xmm3, -10 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L14: + movq M, %rax + andq $4, %rax + jle .L15 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + + mulpd ALPHA, %xmm0 + mulpd ALPHA, %xmm1 + + addpd -16 * SIZE(Y), %xmm0 + addpd -14 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -14 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L15: + movq M, %rax + andq $2, %rax + jle .L16 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + mulpd ALPHA, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L16: + movq M, %rax + andq $1, %rax + jle .L19 + ALIGN_3 + + movsd -16 * SIZE(X), %xmm0 + mulsd ALPHA, %xmm0 + addsd -16 * SIZE(Y), %xmm0 + + movsd %xmm0, -16 * SIZE(Y) + ALIGN_3 + +.L19: + xorq %rax,%rax + + RESTOREREGISTERS + + ret + ALIGN_3 + +.L20: +#ifdef ALIGNED_ACCESS + + movhps -16 * SIZE(X), %xmm0 + + movq M, %rax + sarq $4, %rax + jle .L23 + + movaps -15 * SIZE(X), %xmm1 + movaps -13 * SIZE(X), %xmm2 + movaps -11 * SIZE(X), %xmm3 + + decq %rax + jle .L22 + ALIGN_4 + +.L21: + movaps -9 * SIZE(X), %xmm4 + movaps -7 * SIZE(X), %xmm5 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + SHUFPD_1 %xmm1, %xmm0 + mulpd ALPHA, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + SHUFPD_1 %xmm2, %xmm1 + mulpd ALPHA, %xmm1 + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + movaps -5 * SIZE(X), %xmm6 + movaps -3 * SIZE(X), %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + SHUFPD_1 %xmm3, %xmm2 + mulpd ALPHA, %xmm2 + addpd -12 * SIZE(Y), %xmm2 + movaps %xmm2, -12 * SIZE(Y) + + SHUFPD_1 %xmm4, %xmm3 + mulpd ALPHA, %xmm3 + addpd -10 * SIZE(Y), %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + movaps -1 * SIZE(X), %xmm0 + movaps 1 * SIZE(X), %xmm1 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + SHUFPD_1 %xmm5, %xmm4 + mulpd ALPHA, %xmm4 + addpd -8 * SIZE(Y), %xmm4 + movaps %xmm4, -8 * SIZE(Y) + + SHUFPD_1 %xmm6, %xmm5 + mulpd ALPHA, %xmm5 + addpd -6 * SIZE(Y), %xmm5 + movaps %xmm5, -6 * SIZE(Y) + + movaps 3 * SIZE(X), %xmm2 + movaps 5 * SIZE(X), %xmm3 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH 
(PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + SHUFPD_1 %xmm7, %xmm6 + mulpd ALPHA, %xmm6 + addpd -4 * SIZE(Y), %xmm6 + movaps %xmm6, -4 * SIZE(Y) + + SHUFPD_1 %xmm0, %xmm7 + mulpd ALPHA, %xmm7 + addpd -2 * SIZE(Y), %xmm7 + movaps %xmm7, -2 * SIZE(Y) + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + decq %rax + jg .L21 + ALIGN_3 + +.L22: + movaps -9 * SIZE(X), %xmm4 + movaps -7 * SIZE(X), %xmm5 + + SHUFPD_1 %xmm1, %xmm0 + mulpd ALPHA, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps -1 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm2, %xmm1 + mulpd ALPHA, %xmm1 + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + movaps -5 * SIZE(X), %xmm6 + movaps -3 * SIZE(X), %xmm7 + + SHUFPD_1 %xmm3, %xmm2 + mulpd ALPHA, %xmm2 + addpd -12 * SIZE(Y), %xmm2 + movaps %xmm2, -12 * SIZE(Y) + + SHUFPD_1 %xmm4, %xmm3 + mulpd ALPHA, %xmm3 + addpd -10 * SIZE(Y), %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + SHUFPD_1 %xmm5, %xmm4 + mulpd ALPHA, %xmm4 + addpd -8 * SIZE(Y), %xmm4 + movaps %xmm4, -8 * SIZE(Y) + + SHUFPD_1 %xmm6, %xmm5 + mulpd ALPHA, %xmm5 + addpd -6 * SIZE(Y), %xmm5 + movaps %xmm5, -6 * SIZE(Y) + + SHUFPD_1 %xmm7, %xmm6 + mulpd ALPHA, %xmm6 + addpd -4 * SIZE(Y), %xmm6 + movaps %xmm6, -4 * SIZE(Y) + + SHUFPD_1 %xmm0, %xmm7 + mulpd ALPHA, %xmm7 + addpd -2 * SIZE(Y), %xmm7 + movaps %xmm7, -2 * SIZE(Y) + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + ALIGN_3 + +.L23: + movq M, %rax + andq $8, %rax + jle .L24 + ALIGN_3 + + movaps -15 * SIZE(X), %xmm1 + movaps -13 * SIZE(X), %xmm2 + movaps -11 * SIZE(X), %xmm3 + movaps -9 * SIZE(X), %xmm8 + + SHUFPD_1 %xmm1, %xmm0 + mulpd ALPHA, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + SHUFPD_1 %xmm2, %xmm1 + mulpd ALPHA, %xmm1 + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + SHUFPD_1 %xmm3, %xmm2 + mulpd ALPHA, %xmm2 + addpd -12 * SIZE(Y), %xmm2 + movaps %xmm2, -12 * SIZE(Y) + + SHUFPD_1 %xmm8, %xmm3 + mulpd ALPHA, %xmm3 + addpd -10 * SIZE(Y), %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + movaps %xmm8, %xmm0 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L24: + movq M, %rax + andq $4, %rax + jle .L25 + ALIGN_3 + + movaps -15 * SIZE(X), %xmm1 + movaps -13 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm1, %xmm0 + SHUFPD_1 %xmm2, %xmm1 + + mulpd ALPHA, %xmm0 + mulpd ALPHA, %xmm1 + + addpd -16 * SIZE(Y), %xmm0 + addpd -14 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -14 * SIZE(Y) + movaps %xmm2, %xmm0 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L25: + movq M, %rax + andq $2, %rax + jle .L26 + ALIGN_3 + + movaps -15 * SIZE(X), %xmm1 + SHUFPD_1 %xmm1, %xmm0 + mulpd ALPHA, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + + movaps %xmm0, -16 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L26: + movq M, %rax + andq $1, %rax + jle .L29 + ALIGN_3 + + movsd -16 * SIZE(X), %xmm0 + mulsd ALPHA, %xmm0 + addsd -16 * SIZE(Y), %xmm0 + + movsd %xmm0, -16 * SIZE(Y) + ALIGN_3 + +.L29: + xorq %rax,%rax + + RESTOREREGISTERS + + ret + ALIGN_3 + +#else + movq M, %rax + sarq $4, %rax + jle .L23 + + movsd -16 * SIZE(X), %xmm0 + movhps -15 * SIZE(X), %xmm0 + movsd -14 * SIZE(X), %xmm1 + movhps -13 * SIZE(X), %xmm1 + movsd -12 * SIZE(X), %xmm2 + movhps -11 * SIZE(X), %xmm2 + movsd -10 * SIZE(X), %xmm3 + movhps -9 * SIZE(X), %xmm3 + + decq %rax + jle .L22 + ALIGN_3 + +.L21: + movsd -8 * SIZE(X), %xmm4 + movhps -7 * SIZE(X), %xmm4 + movsd -6 * SIZE(X), %xmm5 + movhps -5 * SIZE(X), %xmm5 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + mulpd ALPHA, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + 
movaps %xmm0, -16 * SIZE(Y) + + mulpd ALPHA, %xmm1 + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + movsd -4 * SIZE(X), %xmm6 + movhps -3 * SIZE(X), %xmm6 + movsd -2 * SIZE(X), %xmm7 + movhps -1 * SIZE(X), %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + mulpd ALPHA, %xmm2 + addpd -12 * SIZE(Y), %xmm2 + movaps %xmm2, -12 * SIZE(Y) + + mulpd ALPHA, %xmm3 + addpd -10 * SIZE(Y), %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + movsd 2 * SIZE(X), %xmm1 + movhps 3 * SIZE(X), %xmm1 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + mulpd ALPHA, %xmm4 + addpd -8 * SIZE(Y), %xmm4 + movaps %xmm4, -8 * SIZE(Y) + + mulpd ALPHA, %xmm5 + addpd -6 * SIZE(Y), %xmm5 + movaps %xmm5, -6 * SIZE(Y) + + movsd 4 * SIZE(X), %xmm2 + movhps 5 * SIZE(X), %xmm2 + movsd 6 * SIZE(X), %xmm3 + movhps 7 * SIZE(X), %xmm3 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + mulpd ALPHA, %xmm6 + addpd -4 * SIZE(Y), %xmm6 + movaps %xmm6, -4 * SIZE(Y) + + mulpd ALPHA, %xmm7 + addpd -2 * SIZE(Y), %xmm7 + movaps %xmm7, -2 * SIZE(Y) + + subq $-16 * SIZE, Y + subq $-16 * SIZE, X + decq %rax + jg .L21 + ALIGN_3 + +.L22: + movsd -8 * SIZE(X), %xmm4 + movhps -7 * SIZE(X), %xmm4 + movsd -6 * SIZE(X), %xmm5 + movhps -5 * SIZE(X), %xmm5 + + mulpd ALPHA, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + mulpd ALPHA, %xmm1 + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + movsd -4 * SIZE(X), %xmm6 + movhps -3 * SIZE(X), %xmm6 + movsd -2 * SIZE(X), %xmm7 + movhps -1 * SIZE(X), %xmm7 + + mulpd ALPHA, %xmm2 + addpd -12 * SIZE(Y), %xmm2 + movaps %xmm2, -12 * SIZE(Y) + + mulpd ALPHA, %xmm3 + addpd -10 * SIZE(Y), %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + mulpd ALPHA, %xmm4 + addpd -8 * SIZE(Y), %xmm4 + movaps %xmm4, -8 * SIZE(Y) + + mulpd ALPHA, %xmm5 + addpd -6 * SIZE(Y), %xmm5 + movaps %xmm5, -6 * SIZE(Y) + + mulpd ALPHA, %xmm6 + addpd -4 * SIZE(Y), %xmm6 + movaps %xmm6, -4 * SIZE(Y) + + mulpd ALPHA, %xmm7 + addpd -2 * SIZE(Y), %xmm7 + movaps %xmm7, -2 * SIZE(Y) + + subq $-16 * SIZE, Y + subq $-16 * SIZE, X + ALIGN_3 + +.L23: + movq M, %rax + andq $8, %rax + jle .L24 + ALIGN_3 + + movsd -16 * SIZE(X), %xmm0 + movhps -15 * SIZE(X), %xmm0 + movsd -14 * SIZE(X), %xmm1 + movhps -13 * SIZE(X), %xmm1 + movsd -12 * SIZE(X), %xmm2 + movhps -11 * SIZE(X), %xmm2 + movsd -10 * SIZE(X), %xmm3 + movhps -9 * SIZE(X), %xmm3 + + mulpd ALPHA, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + mulpd ALPHA, %xmm1 + addpd -14 * SIZE(Y), %xmm1 + mulpd ALPHA, %xmm2 + addpd -12 * SIZE(Y), %xmm2 + mulpd ALPHA, %xmm3 + addpd -10 * SIZE(Y), %xmm3 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -14 * SIZE(Y) + movaps %xmm2, -12 * SIZE(Y) + movaps %xmm3, -10 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L24: + movq M, %rax + andq $4, %rax + jle .L25 + ALIGN_3 + + movsd -16 * SIZE(X), %xmm0 + movhps -15 * SIZE(X), %xmm0 + movsd -14 * SIZE(X), %xmm1 + movhps -13 * SIZE(X), %xmm1 + + mulpd ALPHA, %xmm0 + mulpd ALPHA, %xmm1 + + addpd -16 * SIZE(Y), %xmm0 + addpd -14 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -14 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L25: + movq M, %rax + andq $2, %rax + jle .L26 + ALIGN_3 + + movsd -16 * SIZE(X), %xmm0 + movhps -15 * SIZE(X), %xmm0 + mulpd ALPHA, %xmm0 + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + 
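Both of these kernels (axpy_sse.S for single precision, axpy_sse2.S for double precision) compute the same AXPY update, y := alpha*x + y: the unit-stride paths peel leading elements so that stores to y can use aligned movaps, run an unrolled main loop (32 floats or 16 doubles per iteration, with PREFETCH/PREFETCHW hints), finish the remainder with the andq $16/$8/$4/$2/$1 blocks, and the movss/shufps $0x93 (or SHUFPD_1) blocks splice consecutive aligned loads of x back together when x is offset from y; the .L40/.L50 branches instead walk x and y through INCX/INCY. A minimal scalar C sketch of the operation, for reference only (the function name and signature are illustrative and not part of this patch):

    #include <stddef.h>

    /* Reference form of the kernels above: y[i*incy] += alpha * x[i*incx].
       The assembly additionally aligns y, unrolls the loop and prefetches;
       none of that changes the arithmetic below. */
    static void axpy_ref(ptrdiff_t m, double alpha,
                         const double *x, ptrdiff_t incx,
                         double *y, ptrdiff_t incy)
    {
        if (m <= 0) return;                 /* matches the early testq M, M exit */
        for (ptrdiff_t i = 0; i < m; i++)
            y[i * incy] += alpha * x[i * incx];
    }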
ALIGN_3 + +.L26: + movq M, %rax + andq $1, %rax + jle .L29 + ALIGN_3 + + movsd -16 * SIZE(X), %xmm0 + mulsd ALPHA, %xmm0 + addsd -16 * SIZE(Y), %xmm0 + + movsd %xmm0, -16 * SIZE(Y) + ALIGN_3 + +.L29: + xorq %rax,%rax + + RESTOREREGISTERS + + ret + ALIGN_3 +#endif + +.L40: + movq Y, YY + movq M, %rax + sarq $3, %rax + jle .L45 + ALIGN_3 + +.L41: + movsd 0 * SIZE(X), %xmm0 + addq INCX, X + movhpd 0 * SIZE(X), %xmm0 + addq INCX, X + mulpd ALPHA, %xmm0 + + movsd 0 * SIZE(YY), %xmm6 + addq INCY, YY + movhpd 0 * SIZE(YY), %xmm6 + addq INCY, YY + addpd %xmm6, %xmm0 + + movsd 0 * SIZE(X), %xmm1 + addq INCX, X + movhpd 0 * SIZE(X), %xmm1 + addq INCX, X + mulpd ALPHA, %xmm1 + + movsd 0 * SIZE(YY), %xmm6 + addq INCY, YY + movhpd 0 * SIZE(YY), %xmm6 + addq INCY, YY + addpd %xmm6, %xmm1 + + movsd 0 * SIZE(X), %xmm2 + addq INCX, X + movhpd 0 * SIZE(X), %xmm2 + addq INCX, X + mulpd ALPHA, %xmm2 + + movsd 0 * SIZE(YY), %xmm6 + addq INCY, YY + movhpd 0 * SIZE(YY), %xmm6 + addq INCY, YY + addpd %xmm6, %xmm2 + + movsd 0 * SIZE(X), %xmm3 + addq INCX, X + movhpd 0 * SIZE(X), %xmm3 + addq INCX, X + mulpd ALPHA, %xmm3 + + movsd 0 * SIZE(YY), %xmm6 + addq INCY, YY + movhpd 0 * SIZE(YY), %xmm6 + addq INCY, YY + addpd %xmm6, %xmm3 + + movsd %xmm0, 0 * SIZE(Y) + addq INCY, Y + movhpd %xmm0, 0 * SIZE(Y) + addq INCY, Y + movsd %xmm1, 0 * SIZE(Y) + addq INCY, Y + movhpd %xmm1, 0 * SIZE(Y) + addq INCY, Y + movsd %xmm2, 0 * SIZE(Y) + addq INCY, Y + movhpd %xmm2, 0 * SIZE(Y) + addq INCY, Y + movsd %xmm3, 0 * SIZE(Y) + addq INCY, Y + movhpd %xmm3, 0 * SIZE(Y) + addq INCY, Y + + decq %rax + jg .L41 + ALIGN_3 + +.L45: + movq M, %rax + andq $7, %rax + jle .L47 + ALIGN_3 + +.L46: + movsd (X), %xmm0 + addq INCX, X + mulsd %xmm15, %xmm0 + addsd (Y), %xmm0 + movsd %xmm0, (Y) + addq INCY, Y + decq %rax + jg .L46 + ALIGN_3 + +.L47: + xorq %rax, %rax + + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/builtin_stinit.S b/kernel/x86_64/builtin_stinit.S new file mode 100644 index 0000000000..c05a1c5471 --- /dev/null +++ b/kernel/x86_64/builtin_stinit.S @@ -0,0 +1,61 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + + PROLOGUE + PROFCODE + + cmpq $4096, %rax + jle .L999 + ALIGN_3 + +.L01: + subq $4096, %rax + subq $4096, %rsp + movq $0, (%rsp) + cmpq $4096, %rax + jg .L01 + ALIGN_3 + +.L999: + subq %rax, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/cabs.S b/kernel/x86_64/cabs.S new file mode 100644 index 0000000000..0b1a911857 --- /dev/null +++ b/kernel/x86_64/cabs.S @@ -0,0 +1,70 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + + PROLOGUE + PROFCODE + +#ifdef DOUBLE + movsd 0 * SIZE(ARG1), %xmm0 + movsd 1 * SIZE(ARG1), %xmm1 + pcmpeqb %xmm4, %xmm4 + + psrlq $1, %xmm4 + andpd %xmm4, %xmm0 + andpd %xmm4, %xmm1 + addpd %xmm1, %xmm0 +#else + movss 0 * SIZE(ARG1), %xmm0 + movss 1 * SIZE(ARG1), %xmm1 + pcmpeqb %xmm4, %xmm4 + + psrld $1, %xmm4 + andps %xmm4, %xmm0 + andps %xmm4, %xmm1 + addps %xmm1, %xmm0 +#endif + +#if !defined(DOUBLE) && defined(NEED_F2CCONV) + cvtss2sd %xmm0, %xmm0 +#endif + ret + + EPILOGUE diff --git a/kernel/x86_64/cgemv_n.S b/kernel/x86_64/cgemv_n.S new file mode 100644 index 0000000000..77e9b3d966 --- /dev/null +++ b/kernel/x86_64/cgemv_n.S @@ -0,0 +1,4302 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "l2param.h" + +#if GEMV_UNROLL < 2 +#undef GEMV_UNROLL +#define GEMV_UNROLL 2 +#endif + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_INCX 8 + STACKSIZE(%rsp) +#define OLD_Y 16 + STACKSIZE(%rsp) +#define OLD_INCY 24 + STACKSIZE(%rsp) +#define OLD_BUFFER 32 + STACKSIZE(%rsp) +#define ALPHA 48 (%rsp) + +#define M %rdi +#define N %rsi +#define A %rcx +#define LDA %r8 +#define X %r9 +#define INCX %rdx +#define Y %rbp +#define INCY %r10 + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_LDA 56 + STACKSIZE(%rsp) +#define OLD_X 64 + STACKSIZE(%rsp) +#define OLD_INCX 72 + STACKSIZE(%rsp) +#define OLD_Y 80 + STACKSIZE(%rsp) +#define OLD_INCY 88 + STACKSIZE(%rsp) +#define OLD_BUFFER 96 + STACKSIZE(%rsp) +#define ALPHA 224 (%rsp) + +#define M %rcx +#define N %rdx +#define A %r8 +#define LDA %r9 +#define X %rdi +#define INCX %rsi +#define Y %rbp +#define INCY %r10 + +#endif + +#define I %rax +#define A1 %r11 +#define A2 %r12 + +#define Y1 %r13 +#define BUFFER %r14 + +#ifdef ALIGNED_ACCESS +#define MM %r15 +#else +#define MM M +#endif + +#undef SUBPS + +#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) +#define SUBPS subps +#else +#define SUBPS addps +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq OLD_A, A + movq OLD_LDA, LDA + movq OLD_X, X + + movaps %xmm3, %xmm0 + movss OLD_ALPHA_I, %xmm1 +#endif + + movq OLD_INCX, INCX + movq OLD_Y, Y + movq OLD_INCY, INCY + movq OLD_BUFFER, BUFFER + + salq $ZBASE_SHIFT, LDA + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + + unpcklps %xmm1, %xmm0 + + movlps %xmm0, ALPHA + + testq M, M + jle .L999 + testq N, N + jle .L999 + ALIGN_3 + + subq $-32 * SIZE, A + + movq BUFFER, Y1 + + pxor %xmm4, %xmm4 + + movq M, %rax + addq $8, %rax + sarq $3, %rax + ALIGN_3 + +.L01: + movaps %xmm4, 0 * SIZE(Y1) + movaps %xmm4, 4 * SIZE(Y1) + movaps %xmm4, 8 * SIZE(Y1) + movaps %xmm4, 12 * SIZE(Y1) + + subq $-16 * SIZE, Y1 + decq %rax + jg .L01 + ALIGN_3 + +.L10: +#ifdef ALIGNED_ACCESS + movq M, MM + + movq A, %rax + andq $4 * SIZE - 1, %rax + leaq 2 * SIZE(BUFFER), A1 + leaq -1(M), A2 + + cmpq $2 * SIZE, %rax + cmovge A1, BUFFER + cmovge A2, MM + + testq $SIZE, A + jne .L200 + + testq $2 * SIZE, LDA + jne .L100 +#endif + +#if GEMV_UNROLL >= 4 + cmpq $4, N + jl .L20 + ALIGN_3 + +.L11: + subq $4, N + + leaq 32 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 2), A2 + leaq (A, LDA, 4), A + + movsd (X), %xmm9 + addq INCX, X + movsd (X), %xmm11 + addq INCX, X + movsd (X), %xmm13 + addq INCX, X + movsd (X), %xmm15 + addq INCX, X + +#ifdef HAVE_SSE3 + movddup ALPHA, %xmm6 +#else + movsd ALPHA, %xmm6 + unpcklpd %xmm6, %xmm6 +#endif + + pshufd $0xb1, %xmm6, %xmm5 + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + + pshufd $0x00, %xmm9, %xmm8 + pshufd $0x55, %xmm9, %xmm9 + pshufd $0x00, %xmm11, %xmm10 + pshufd $0x55, %xmm11, %xmm11 + pshufd $0x00, %xmm13, %xmm12 + pshufd $0x55, %xmm13, 
%xmm13 + pshufd $0x00, %xmm15, %xmm14 + pshufd $0x55, %xmm15, %xmm15 + +#ifndef XCONJ + xorps %xmm7, %xmm9 + xorps %xmm7, %xmm11 + xorps %xmm7, %xmm13 + xorps %xmm7, %xmm15 +#else + xorps %xmm7, %xmm8 + xorps %xmm7, %xmm10 + xorps %xmm7, %xmm12 + xorps %xmm7, %xmm14 +#endif + + mulps %xmm6, %xmm8 + mulps %xmm5, %xmm9 + mulps %xmm6, %xmm10 + mulps %xmm5, %xmm11 + mulps %xmm6, %xmm12 + mulps %xmm5, %xmm13 + mulps %xmm6, %xmm14 + mulps %xmm5, %xmm15 + +#ifndef XCONJ + subps %xmm9, %xmm8 + subps %xmm11, %xmm10 + subps %xmm13, %xmm12 + subps %xmm15, %xmm14 +#else + addps %xmm9, %xmm8 + addps %xmm11, %xmm10 + addps %xmm13, %xmm12 + addps %xmm15, %xmm14 +#endif + + pshufd $0x55, %xmm8, %xmm9 + pshufd $0x00, %xmm8, %xmm8 + pshufd $0x55, %xmm10, %xmm11 + pshufd $0x00, %xmm10, %xmm10 + pshufd $0x55, %xmm12, %xmm13 + pshufd $0x00, %xmm12, %xmm12 + pshufd $0x55, %xmm14, %xmm15 + pshufd $0x00, %xmm14, %xmm14 + +#ifndef CONJ + xorps %xmm7, %xmm9 + xorps %xmm7, %xmm11 + xorps %xmm7, %xmm13 + xorps %xmm7, %xmm15 +#else + xorps %xmm7, %xmm8 + xorps %xmm7, %xmm10 + xorps %xmm7, %xmm12 + xorps %xmm7, %xmm14 +#endif + +#ifdef ALIGNED_ACCESS + cmpq M, MM + je .L1X + + movsd -32 * SIZE(A1), %xmm4 + movsd -32 * SIZE(A1, LDA), %xmm6 + + movsd -32 * SIZE(Y1), %xmm0 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + movsd -32 * SIZE(A2), %xmm4 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm9, %xmm5 + SUBPS %xmm5, %xmm0 + + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + movsd -32 * SIZE(A2, LDA), %xmm6 + mulps %xmm11, %xmm7 + SUBPS %xmm7, %xmm0 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 +.L1X: +#endif + + movaps -32 * SIZE(Y1), %xmm0 + movaps -28 * SIZE(Y1), %xmm1 + movaps -24 * SIZE(Y1), %xmm2 + movaps -20 * SIZE(Y1), %xmm3 + + movq MM, I + sarq $3, I + jle .L15 + + MOVUPS_A1(-32 * SIZE, A1, %xmm4) + MOVUPS_A1(-28 * SIZE, A1, %xmm6) + + decq I + jle .L14 + ALIGN_3 + +.L13: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1) +#endif + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A1(-24 * SIZE, A1, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm8, %xmm6 + addps %xmm6, %xmm1 + MOVUPS_A1(-20 * SIZE, A1, %xmm6) + + mulps %xmm9, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm9, %xmm7 + SUBPS %xmm7, %xmm1 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm2 + MOVUPS_A2(-32 * SIZE, A1, LDA, 1, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm8, %xmm6 + addps %xmm6, %xmm3 + MOVUPS_A2(-28 * SIZE, A1, LDA, 1, %xmm6) + + mulps %xmm9, %xmm5 + SUBPS %xmm5, %xmm2 + mulps %xmm9, %xmm7 + SUBPS %xmm7, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA) +#endif + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm10, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A2(-24 * SIZE, A1, LDA, 1, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm1 + MOVUPS_A2(-20 * SIZE, A1, LDA, 1, %xmm6) + + mulps %xmm11, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm11, %xmm7 + SUBPS %xmm7, %xmm1 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm10, %xmm4 + addps %xmm4, %xmm2 + MOVUPS_A1(-32 * SIZE, A2, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm3 + MOVUPS_A1(-28 * SIZE, A2, %xmm6) + + mulps %xmm11, %xmm5 + SUBPS %xmm5, %xmm2 + mulps 
%xmm11, %xmm7 + SUBPS %xmm7, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2) +#endif + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A1(-24 * SIZE, A2, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm1 + MOVUPS_A1(-20 * SIZE, A2, %xmm6) + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm2 + MOVUPS_A2(-32 * SIZE, A2, LDA, 1, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm3 + MOVUPS_A2(-28 * SIZE, A2, LDA, 1, %xmm6) + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm2 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA) +#endif + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm14, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A2(-24 * SIZE, A2, LDA, 1, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm1 + MOVUPS_A2(-20 * SIZE, A2, LDA, 1, %xmm6) + + mulps %xmm15, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm1 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm14, %xmm4 + addps %xmm4, %xmm2 + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm3 + MOVUPS_A1(-12 * SIZE, A1, %xmm6) + + mulps %xmm15, %xmm5 + SUBPS %xmm5, %xmm2 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) - 128 + PREOFFSET(Y1) +#endif + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + movaps %xmm2, -24 * SIZE(Y1) + movaps %xmm3, -20 * SIZE(Y1) + + movaps -16 * SIZE(Y1), %xmm0 + movaps -12 * SIZE(Y1), %xmm1 + movaps -8 * SIZE(Y1), %xmm2 + movaps -4 * SIZE(Y1), %xmm3 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L13 + ALIGN_3 + +.L14: + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A1(-24 * SIZE, A1, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm8, %xmm6 + addps %xmm6, %xmm1 + MOVUPS_A1(-20 * SIZE, A1, %xmm6) + + mulps %xmm9, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm9, %xmm7 + SUBPS %xmm7, %xmm1 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm2 + MOVUPS_A2(-32 * SIZE, A1, LDA, 1, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm8, %xmm6 + addps %xmm6, %xmm3 + MOVUPS_A2(-28 * SIZE, A1, LDA, 1, %xmm6) + + mulps %xmm9, %xmm5 + SUBPS %xmm5, %xmm2 + mulps %xmm9, %xmm7 + SUBPS %xmm7, %xmm3 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm10, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A2(-24 * SIZE, A1, LDA, 1, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm1 + MOVUPS_A2(-20 * SIZE, A1, LDA, 1, %xmm6) + + mulps %xmm11, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm11, %xmm7 + SUBPS %xmm7, %xmm1 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm10, %xmm4 + addps %xmm4, %xmm2 + MOVUPS_A1(-32 * SIZE, A2, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm3 + MOVUPS_A1(-28 * SIZE, A2, %xmm6) + + mulps %xmm11, %xmm5 + SUBPS %xmm5, %xmm2 + mulps %xmm11, %xmm7 + SUBPS %xmm7, %xmm3 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A1(-24 * SIZE, A2, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm1 + MOVUPS_A1(-20 * SIZE, A2, %xmm6) + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm2 + MOVUPS_A2(-32 * SIZE, A2, LDA, 1, %xmm4) + 
pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm3 + MOVUPS_A2(-28 * SIZE, A2, LDA, 1, %xmm6) + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm2 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm3 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm14, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A2(-24 * SIZE, A2, LDA, 1, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm1 + MOVUPS_A2(-20 * SIZE, A2, LDA, 1, %xmm6) + + mulps %xmm15, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm1 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm14, %xmm4 + addps %xmm4, %xmm2 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm3 + + mulps %xmm15, %xmm5 + SUBPS %xmm5, %xmm2 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm3 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + movaps %xmm2, -24 * SIZE(Y1) + movaps %xmm3, -20 * SIZE(Y1) + + movaps -16 * SIZE(Y1), %xmm0 + movaps -12 * SIZE(Y1), %xmm1 + movaps -8 * SIZE(Y1), %xmm2 + movaps -4 * SIZE(Y1), %xmm3 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + ALIGN_3 + +.L15: + testq $4, MM + je .L17 + + MOVUPS_A1(-32 * SIZE, A1, %xmm4) + MOVUPS_A1(-28 * SIZE, A1, %xmm6) + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm8, %xmm6 + addps %xmm6, %xmm1 + + mulps %xmm9, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm9, %xmm7 + SUBPS %xmm7, %xmm1 + + MOVUPS_A2(-32 * SIZE, A1, LDA, 1, %xmm4) + MOVUPS_A2(-28 * SIZE, A1, LDA, 1, %xmm6) + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm10, %xmm4 + addps %xmm4, %xmm0 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm1 + + mulps %xmm11, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm11, %xmm7 + SUBPS %xmm7, %xmm1 + + MOVUPS_A1(-32 * SIZE, A2, %xmm4) + MOVUPS_A1(-28 * SIZE, A2, %xmm6) + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm1 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + MOVUPS_A2(-32 * SIZE, A2, LDA, 1, %xmm4) + MOVUPS_A2(-28 * SIZE, A2, LDA, 1, %xmm6) + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm14, %xmm4 + addps %xmm4, %xmm0 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm1 + + mulps %xmm15, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm1 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, Y1 + ALIGN_3 + +.L17: + testq $2, MM + je .L18 + + MOVUPS_A1(-32 * SIZE, A1, %xmm4) + MOVUPS_A2(-32 * SIZE, A1, LDA, 1, %xmm6) + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A1(-32 * SIZE, A2, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm9, %xmm5 + SUBPS %xmm5, %xmm0 + + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + MOVUPS_A2(-32 * SIZE, A2, LDA, 1, %xmm6) + mulps %xmm11, %xmm7 + SUBPS %xmm7, %xmm0 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm0 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, %xmm0 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L18: + testq $1, MM + je .L19 + + movsd -32 * SIZE(A1), %xmm4 + movsd -32 * SIZE(A1, LDA), %xmm6 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + movsd -32 * SIZE(A2), %xmm4 + 
pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm9, %xmm5 + SUBPS %xmm5, %xmm0 + + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + movsd -32 * SIZE(A2, LDA), %xmm6 + mulps %xmm11, %xmm7 + SUBPS %xmm7, %xmm0 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + ALIGN_3 + +.L19: + cmpq $4, N + jge .L11 + ALIGN_3 + +.L20: +#endif + + cmpq $2, N + jl .L30 + +#if GEMV_UNROLL == 2 + ALIGN_3 + +.L21: +#endif + + subq $2, N + + leaq 32 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 1), A2 + leaq (A, LDA, 2), A + + movsd (X), %xmm13 + addq INCX, X + movsd (X), %xmm15 + addq INCX, X + +#ifdef HAVE_SSE3 + movddup ALPHA, %xmm8 +#else + movsd ALPHA, %xmm8 + unpcklpd %xmm8, %xmm8 +#endif + + pshufd $0xb1, %xmm8, %xmm9 + + pcmpeqb %xmm11, %xmm11 + psllq $63, %xmm11 + + pshufd $0x00, %xmm13, %xmm12 + pshufd $0x55, %xmm13, %xmm13 + pshufd $0x00, %xmm15, %xmm14 + pshufd $0x55, %xmm15, %xmm15 + +#ifndef XCONJ + xorps %xmm11, %xmm13 + xorps %xmm11, %xmm15 +#else + xorps %xmm11, %xmm12 + xorps %xmm11, %xmm14 +#endif + + mulps %xmm8, %xmm12 + mulps %xmm9, %xmm13 + mulps %xmm8, %xmm14 + mulps %xmm9, %xmm15 + +#ifndef XCONJ + subps %xmm13, %xmm12 + subps %xmm15, %xmm14 +#else + addps %xmm13, %xmm12 + addps %xmm15, %xmm14 +#endif + + pshufd $0x55, %xmm12, %xmm13 + pshufd $0x00, %xmm12, %xmm12 + pshufd $0x55, %xmm14, %xmm15 + pshufd $0x00, %xmm14, %xmm14 + +#ifndef CONJ + xorps %xmm11, %xmm13 + xorps %xmm11, %xmm15 +#else + xorps %xmm11, %xmm12 + xorps %xmm11, %xmm14 +#endif + +#ifdef ALIGNED_ACCESS + cmpq M, MM + je .L2X + + movsd -32 * SIZE(A1), %xmm4 + movsd -32 * SIZE(A2), %xmm6 + + movsd -32 * SIZE(Y1), %xmm0 + + pshufd $0xb1, %xmm4, %xmm5 + pshufd $0xb1, %xmm6, %xmm7 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 +.L2X: +#endif + + movaps -32 * SIZE(Y1), %xmm0 + movaps -28 * SIZE(Y1), %xmm1 + movaps -24 * SIZE(Y1), %xmm2 + movaps -20 * SIZE(Y1), %xmm3 + ALIGN_3 + + movq MM, I + sarq $3, I + jle .L25 + + MOVUPS_A1(-32 * SIZE, A1, %xmm4) + MOVUPS_A1(-28 * SIZE, A1, %xmm6) + MOVUPS_A1(-24 * SIZE, A1, %xmm8) + MOVUPS_A1(-20 * SIZE, A1, %xmm10) + + decq I + jle .L24 + ALIGN_3 + +.L23: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) +#endif + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A1(-32 * SIZE, A2, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm1 + MOVUPS_A1(-28 * SIZE, A2, %xmm6) + + pshufd $0xb1, %xmm8, %xmm9 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm2 + MOVUPS_A1(-24 * SIZE, A2, %xmm8) + pshufd $0xb1, %xmm10, %xmm11 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm3 + MOVUPS_A1(-20 * SIZE, A2, %xmm10) + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + mulps %xmm13, %xmm9 + SUBPS %xmm9, %xmm2 + mulps %xmm13, %xmm11 + SUBPS %xmm11, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) +#endif + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm14, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm1 + MOVUPS_A1(-12 * SIZE, A1, %xmm6) + + pshufd $0xb1, %xmm8, %xmm9 + 
mulps %xmm14, %xmm8 + addps %xmm8, %xmm2 + MOVUPS_A1( -8 * SIZE, A1, %xmm8) + pshufd $0xb1, %xmm10, %xmm11 + mulps %xmm14, %xmm10 + addps %xmm10, %xmm3 + MOVUPS_A1( -4 * SIZE, A1, %xmm10) + + mulps %xmm15, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm1 + + mulps %xmm15, %xmm9 + SUBPS %xmm9, %xmm2 + mulps %xmm15, %xmm11 + SUBPS %xmm11, %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 2 - 128 + PREOFFSET(Y1) +#endif + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + movaps %xmm2, -24 * SIZE(Y1) + movaps %xmm3, -20 * SIZE(Y1) + + movaps -16 * SIZE(Y1), %xmm0 + movaps -12 * SIZE(Y1), %xmm1 + movaps -8 * SIZE(Y1), %xmm2 + movaps -4 * SIZE(Y1), %xmm3 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L23 + ALIGN_3 + +.L24: + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A1(-32 * SIZE, A2, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm1 + MOVUPS_A1(-28 * SIZE, A2, %xmm6) + + pshufd $0xb1, %xmm8, %xmm9 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm2 + MOVUPS_A1(-24 * SIZE, A2, %xmm8) + pshufd $0xb1, %xmm10, %xmm11 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm3 + MOVUPS_A1(-20 * SIZE, A2, %xmm10) + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + mulps %xmm13, %xmm9 + SUBPS %xmm9, %xmm2 + mulps %xmm13, %xmm11 + SUBPS %xmm11, %xmm3 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm14, %xmm4 + addps %xmm4, %xmm0 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm1 + + pshufd $0xb1, %xmm8, %xmm9 + mulps %xmm14, %xmm8 + addps %xmm8, %xmm2 + pshufd $0xb1, %xmm10, %xmm11 + mulps %xmm14, %xmm10 + addps %xmm10, %xmm3 + + mulps %xmm15, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm1 + + mulps %xmm15, %xmm9 + SUBPS %xmm9, %xmm2 + mulps %xmm15, %xmm11 + SUBPS %xmm11, %xmm3 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + movaps %xmm2, -24 * SIZE(Y1) + movaps %xmm3, -20 * SIZE(Y1) + + movaps -16 * SIZE(Y1), %xmm0 + movaps -12 * SIZE(Y1), %xmm1 + movaps -8 * SIZE(Y1), %xmm2 + movaps -4 * SIZE(Y1), %xmm3 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + ALIGN_3 + +.L25: + testq $4, MM + je .L27 + + MOVUPS_A1(-32 * SIZE, A1, %xmm4) + MOVUPS_A1(-28 * SIZE, A1, %xmm6) + MOVUPS_A1(-32 * SIZE, A2, %xmm8) + MOVUPS_A1(-28 * SIZE, A2, %xmm10) + + pshufd $0xb1, %xmm4, %xmm5 + pshufd $0xb1, %xmm6, %xmm7 + pshufd $0xb1, %xmm8, %xmm9 + pshufd $0xb1, %xmm10, %xmm11 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm1 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + mulps %xmm14, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm14, %xmm10 + addps %xmm10, %xmm1 + + mulps %xmm15, %xmm9 + SUBPS %xmm9, %xmm0 + mulps %xmm15, %xmm11 + SUBPS %xmm11, %xmm1 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, Y1 + ALIGN_3 + +.L27: + testq $2, MM + je .L28 + + MOVUPS_A1(-32 * SIZE, A1, %xmm4) + MOVUPS_A1(-32 * SIZE, A2, %xmm6) + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm0 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, %xmm0 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + 
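The recurring pshufd $0xb1 / mulps / SUBPS triplets above are packed complex multiply-accumulates: each even-numbered weight register holds the real part of the alpha-scaled x[j] broadcast to every lane, its odd partner holds the imaginary part with alternating signs (the psllq $63 mask), and SUBPS expands to subps or addps depending on CONJ/XCONJ. The net per-element effect is an accumulation of w * a, with sign changes for the conjugated variants. A scalar sketch, using illustrative names that do not appear in the assembly:

    /* Scalar equivalent of one pshufd/mulps/SUBPS update: acc += w * a, where
       w = alpha * x[j] was formed once per column.  conjugate_a only sketches
       the kind of sign change the CONJ/XCONJ builds introduce; the exact
       mapping to those macros lives in the sign-mask setup above. */
    typedef struct { float re, im; } cfloat;

    static cfloat cmadd(cfloat acc, cfloat w, cfloat a, int conjugate_a)
    {
        float ai = conjugate_a ? -a.im : a.im;
        acc.re += w.re * a.re - w.im * ai;
        acc.im += w.re * ai   + w.im * a.re;
        return acc;
    }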
+.L28: + testq $1, MM +#if GEMV_UNROLL == 2 + je .L29 +#else + je .L30 +#endif + + movsd -32 * SIZE(A1), %xmm4 + movsd -32 * SIZE(A2), %xmm6 + + pshufd $0xb1, %xmm4, %xmm5 + pshufd $0xb1, %xmm6, %xmm7 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + +#if GEMV_UNROLL == 2 + ALIGN_3 + +.L29: + cmpq $2, N + jge .L21 +#endif + ALIGN_3 + +.L30: + cmpq $1, N + jl .L990 + + leaq 32 * SIZE(BUFFER), Y1 + movq A, A1 + + movsd (X), %xmm13 + addq INCX, X + +#ifdef HAVE_SSE3 + movddup ALPHA, %xmm8 +#else + movsd ALPHA, %xmm8 + unpcklpd %xmm8, %xmm8 +#endif + + pshufd $0xb1, %xmm8, %xmm9 + + pcmpeqb %xmm11, %xmm11 + psllq $63, %xmm11 + + pshufd $0x00, %xmm13, %xmm12 + pshufd $0x55, %xmm13, %xmm13 + +#ifndef XCONJ + xorps %xmm11, %xmm13 +#else + xorps %xmm11, %xmm12 +#endif + + mulps %xmm8, %xmm12 + mulps %xmm9, %xmm13 + +#ifndef XCONJ + subps %xmm13, %xmm12 +#else + addps %xmm13, %xmm12 +#endif + + pshufd $0x55, %xmm12, %xmm13 + pshufd $0x00, %xmm12, %xmm12 + +#ifndef CONJ + xorps %xmm11, %xmm13 +#else + xorps %xmm11, %xmm12 +#endif + +#ifdef ALIGNED_ACCESS + cmpq M, MM + je .L3X + + movsd -32 * SIZE(A1), %xmm4 + movsd -32 * SIZE(Y1), %xmm0 + + pshufd $0xb1, %xmm4, %xmm5 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, Y1 + ALIGN_3 +.L3X: +#endif + + movaps -32 * SIZE(Y1), %xmm0 + movaps -28 * SIZE(Y1), %xmm1 + movaps -24 * SIZE(Y1), %xmm2 + movaps -20 * SIZE(Y1), %xmm3 + ALIGN_3 + + movq MM, I + sarq $3, I + jle .L35 + + MOVUPS_A1(-32 * SIZE, A1, %xmm4) + MOVUPS_A1(-28 * SIZE, A1, %xmm6) + MOVUPS_A1(-24 * SIZE, A1, %xmm8) + MOVUPS_A1(-20 * SIZE, A1, %xmm10) + + decq I + jle .L34 + ALIGN_3 + +.L33: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) +#endif + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm1 + MOVUPS_A1(-12 * SIZE, A1, %xmm6) + + pshufd $0xb1, %xmm8, %xmm9 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm2 + MOVUPS_A1( -8 * SIZE, A1, %xmm8) + pshufd $0xb1, %xmm10, %xmm11 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm3 + MOVUPS_A1( -4 * SIZE, A1, %xmm10) + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + mulps %xmm13, %xmm9 + SUBPS %xmm9, %xmm2 + mulps %xmm13, %xmm11 + SUBPS %xmm11, %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 4 - 128 + PREOFFSET(Y1) +#endif + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + movaps %xmm2, -24 * SIZE(Y1) + movaps %xmm3, -20 * SIZE(Y1) + + movaps -16 * SIZE(Y1), %xmm0 + movaps -12 * SIZE(Y1), %xmm1 + movaps -8 * SIZE(Y1), %xmm2 + movaps -4 * SIZE(Y1), %xmm3 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L33 + ALIGN_3 + +.L34: + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm1 + + pshufd $0xb1, %xmm8, %xmm9 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm2 + pshufd $0xb1, %xmm10, %xmm11 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm3 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + mulps %xmm13, %xmm9 + SUBPS %xmm9, %xmm2 + mulps %xmm13, %xmm11 + SUBPS %xmm11, %xmm3 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, 
-28 * SIZE(Y1) + movaps %xmm2, -24 * SIZE(Y1) + movaps %xmm3, -20 * SIZE(Y1) + + movaps -16 * SIZE(Y1), %xmm0 + movaps -12 * SIZE(Y1), %xmm1 + movaps -8 * SIZE(Y1), %xmm2 + movaps -4 * SIZE(Y1), %xmm3 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, Y1 + ALIGN_3 + +.L35: + testq $4, MM + je .L37 + + MOVUPS_A1(-32 * SIZE, A1, %xmm4) + MOVUPS_A1(-28 * SIZE, A1, %xmm6) + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm1 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addq $8 * SIZE, A1 + addq $8 * SIZE, Y1 + ALIGN_3 + +.L37: + testq $2, MM + je .L38 + + MOVUPS_A1(-32 * SIZE, A1, %xmm4) + + pshufd $0xb1, %xmm4, %xmm5 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, %xmm0 + + addq $4 * SIZE, A1 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L38: + testq $1, MM + je .L990 + + movsd -32 * SIZE(A1), %xmm4 + + pshufd $0xb1, %xmm4, %xmm5 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + +#ifdef ALIGNED_ACCESS + jmp .L990 + ALIGN_3 + +.L100: +#if GEMV_UNROLL >= 4 + cmpq $4, N + jl .L110 + ALIGN_3 + +.L101: + subq $4, N + + leaq 32 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 2), A2 + leaq (A, LDA, 4), A + + movsd (X), %xmm9 + addq INCX, X + movsd (X), %xmm11 + addq INCX, X + movsd (X), %xmm13 + addq INCX, X + movsd (X), %xmm15 + addq INCX, X + +#ifdef HAVE_SSE3 + movddup ALPHA, %xmm6 +#else + movsd ALPHA, %xmm6 + unpcklpd %xmm6, %xmm6 +#endif + + pshufd $0xb1, %xmm6, %xmm5 + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + + pshufd $0x00, %xmm9, %xmm8 + pshufd $0x55, %xmm9, %xmm9 + pshufd $0x00, %xmm11, %xmm10 + pshufd $0x55, %xmm11, %xmm11 + pshufd $0x00, %xmm13, %xmm12 + pshufd $0x55, %xmm13, %xmm13 + pshufd $0x00, %xmm15, %xmm14 + pshufd $0x55, %xmm15, %xmm15 + +#ifndef XCONJ + xorps %xmm7, %xmm9 + xorps %xmm7, %xmm11 + xorps %xmm7, %xmm13 + xorps %xmm7, %xmm15 +#else + xorps %xmm7, %xmm8 + xorps %xmm7, %xmm10 + xorps %xmm7, %xmm12 + xorps %xmm7, %xmm14 +#endif + + mulps %xmm6, %xmm8 + mulps %xmm5, %xmm9 + mulps %xmm6, %xmm10 + mulps %xmm5, %xmm11 + mulps %xmm6, %xmm12 + mulps %xmm5, %xmm13 + mulps %xmm6, %xmm14 + mulps %xmm5, %xmm15 + +#ifndef XCONJ + subps %xmm9, %xmm8 + subps %xmm11, %xmm10 + subps %xmm13, %xmm12 + subps %xmm15, %xmm14 +#else + addps %xmm9, %xmm8 + addps %xmm11, %xmm10 + addps %xmm13, %xmm12 + addps %xmm15, %xmm14 +#endif + + pshufd $0x55, %xmm8, %xmm9 + pshufd $0x00, %xmm8, %xmm8 + pshufd $0x55, %xmm10, %xmm11 + pshufd $0x00, %xmm10, %xmm10 + pshufd $0x55, %xmm12, %xmm13 + pshufd $0x00, %xmm12, %xmm12 + pshufd $0x55, %xmm14, %xmm15 + pshufd $0x00, %xmm14, %xmm14 + +#ifndef CONJ + xorps %xmm7, %xmm9 + xorps %xmm7, %xmm11 + xorps %xmm7, %xmm13 + xorps %xmm7, %xmm15 +#else + xorps %xmm7, %xmm8 + xorps %xmm7, %xmm10 + xorps %xmm7, %xmm12 + xorps %xmm7, %xmm14 +#endif + +#ifdef ALIGNED_ACCESS + cmpq M, MM + je .L10X + + movsd -32 * SIZE(A1), %xmm4 + movsd -32 * SIZE(A1, LDA), %xmm6 + + movsd -32 * SIZE(Y1), %xmm0 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + movsd -32 * SIZE(A2), %xmm4 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm9, %xmm5 + SUBPS %xmm5, %xmm0 + + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + movsd -32 * SIZE(A2, LDA), %xmm6 + mulps %xmm11, %xmm7 + SUBPS %xmm7, 
%xmm0 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 +.L10X: +#endif + + movaps -32 * SIZE(Y1), %xmm0 + movaps -28 * SIZE(Y1), %xmm1 + movaps -24 * SIZE(Y1), %xmm2 + movaps -20 * SIZE(Y1), %xmm3 + + movq MM, I + sarq $3, I + jle .L105 + + MOVUPS_A1(-32 * SIZE, A1, %xmm4) + MOVUPS_A1(-28 * SIZE, A1, %xmm6) + + decq I + jle .L104 + ALIGN_3 + +.L103: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1) +#endif + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A1(-24 * SIZE, A1, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm8, %xmm6 + addps %xmm6, %xmm1 + MOVUPS_A1(-20 * SIZE, A1, %xmm6) + + mulps %xmm9, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm9, %xmm7 + SUBPS %xmm7, %xmm1 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm2 + movsd -32 * SIZE(A1, LDA), %xmm4 + movhps -30 * SIZE(A1, LDA), %xmm4 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm8, %xmm6 + addps %xmm6, %xmm3 + movsd -28 * SIZE(A1, LDA), %xmm6 + movhps -26 * SIZE(A1, LDA), %xmm6 + + mulps %xmm9, %xmm5 + SUBPS %xmm5, %xmm2 + mulps %xmm9, %xmm7 + SUBPS %xmm7, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA) +#endif + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm10, %xmm4 + addps %xmm4, %xmm0 + movsd -24 * SIZE(A1, LDA), %xmm4 + movhps -22 * SIZE(A1, LDA), %xmm4 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm1 + movsd -20 * SIZE(A1, LDA), %xmm6 + movhps -18 * SIZE(A1, LDA), %xmm6 + + mulps %xmm11, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm11, %xmm7 + SUBPS %xmm7, %xmm1 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm10, %xmm4 + addps %xmm4, %xmm2 + MOVUPS_A1(-32 * SIZE, A2, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm3 + MOVUPS_A1(-28 * SIZE, A2, %xmm6) + + mulps %xmm11, %xmm5 + SUBPS %xmm5, %xmm2 + mulps %xmm11, %xmm7 + SUBPS %xmm7, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2) +#endif + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A1(-24 * SIZE, A2, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm1 + MOVUPS_A1(-20 * SIZE, A2, %xmm6) + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm2 + movsd -32 * SIZE(A2, LDA), %xmm4 + movhps -30 * SIZE(A2, LDA), %xmm4 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm3 + movsd -28 * SIZE(A2, LDA), %xmm6 + movhps -26 * SIZE(A2, LDA), %xmm6 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm2 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA) +#endif + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm14, %xmm4 + addps %xmm4, %xmm0 + movsd -24 * SIZE(A2, LDA), %xmm4 + movhps -22 * SIZE(A2, LDA), %xmm4 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm1 + movsd -20 * SIZE(A2, LDA), %xmm6 + movhps -18 * SIZE(A2, LDA), %xmm6 + + mulps %xmm15, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm1 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm14, %xmm4 + addps %xmm4, %xmm2 + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm3 + 
MOVUPS_A1(-12 * SIZE, A1, %xmm6) + + mulps %xmm15, %xmm5 + SUBPS %xmm5, %xmm2 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) - 128 + PREOFFSET(Y1) +#endif + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + movaps %xmm2, -24 * SIZE(Y1) + movaps %xmm3, -20 * SIZE(Y1) + + movaps -16 * SIZE(Y1), %xmm0 + movaps -12 * SIZE(Y1), %xmm1 + movaps -8 * SIZE(Y1), %xmm2 + movaps -4 * SIZE(Y1), %xmm3 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L103 + ALIGN_3 + +.L104: + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A1(-24 * SIZE, A1, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm8, %xmm6 + addps %xmm6, %xmm1 + MOVUPS_A1(-20 * SIZE, A1, %xmm6) + + mulps %xmm9, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm9, %xmm7 + SUBPS %xmm7, %xmm1 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm2 + movsd -32 * SIZE(A1, LDA), %xmm4 + movhps -30 * SIZE(A1, LDA), %xmm4 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm8, %xmm6 + addps %xmm6, %xmm3 + movsd -28 * SIZE(A1, LDA), %xmm6 + movhps -26 * SIZE(A1, LDA), %xmm6 + + mulps %xmm9, %xmm5 + SUBPS %xmm5, %xmm2 + mulps %xmm9, %xmm7 + SUBPS %xmm7, %xmm3 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm10, %xmm4 + addps %xmm4, %xmm0 + movsd -24 * SIZE(A1, LDA), %xmm4 + movhps -22 * SIZE(A1, LDA), %xmm4 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm1 + movsd -20 * SIZE(A1, LDA), %xmm6 + movhps -18 * SIZE(A1, LDA), %xmm6 + + mulps %xmm11, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm11, %xmm7 + SUBPS %xmm7, %xmm1 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm10, %xmm4 + addps %xmm4, %xmm2 + MOVUPS_A1(-32 * SIZE, A2, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm3 + MOVUPS_A1(-28 * SIZE, A2, %xmm6) + + mulps %xmm11, %xmm5 + SUBPS %xmm5, %xmm2 + mulps %xmm11, %xmm7 + SUBPS %xmm7, %xmm3 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A1(-24 * SIZE, A2, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm1 + MOVUPS_A1(-20 * SIZE, A2, %xmm6) + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm2 + movsd -32 * SIZE(A2, LDA), %xmm4 + movhps -30 * SIZE(A2, LDA), %xmm4 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm3 + movsd -28 * SIZE(A2, LDA), %xmm6 + movhps -26 * SIZE(A2, LDA), %xmm6 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm2 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm3 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm14, %xmm4 + addps %xmm4, %xmm0 + movsd -24 * SIZE(A2, LDA), %xmm4 + movhps -22 * SIZE(A2, LDA), %xmm4 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm1 + movsd -20 * SIZE(A2, LDA), %xmm6 + movhps -18 * SIZE(A2, LDA), %xmm6 + + mulps %xmm15, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm1 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm14, %xmm4 + addps %xmm4, %xmm2 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm3 + + mulps %xmm15, %xmm5 + SUBPS %xmm5, %xmm2 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm3 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + movaps %xmm2, -24 * SIZE(Y1) + movaps %xmm3, -20 * SIZE(Y1) + + movaps -16 * SIZE(Y1), %xmm0 + movaps -12 * SIZE(Y1), %xmm1 + movaps -8 * SIZE(Y1), %xmm2 + movaps -4 * SIZE(Y1), %xmm3 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + ALIGN_3 + +.L105: 
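+/* Editor's note (not in the upstream source): .L105 below is the tail of the four-column pass — the leftover (MM & 7) complex elements are finished in blocks of 4 (.L105), 2 (.L107) and 1 (.L108) before looping back for the next group of columns. */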
+ testq $4, MM + je .L107 + + MOVUPS_A1(-32 * SIZE, A1, %xmm4) + MOVUPS_A1(-28 * SIZE, A1, %xmm6) + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + movsd -32 * SIZE(A1, LDA), %xmm4 + movhps -30 * SIZE(A1, LDA), %xmm4 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm8, %xmm6 + addps %xmm6, %xmm1 + movsd -28 * SIZE(A1, LDA), %xmm6 + movhps -26 * SIZE(A1, LDA), %xmm6 + + mulps %xmm9, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm9, %xmm7 + SUBPS %xmm7, %xmm1 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm10, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A1(-32 * SIZE, A2, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm1 + MOVUPS_A1(-28 * SIZE, A2, %xmm6) + + mulps %xmm11, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm11, %xmm7 + SUBPS %xmm7, %xmm1 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movsd -32 * SIZE(A2, LDA), %xmm4 + movhps -30 * SIZE(A2, LDA), %xmm4 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm1 + movsd -28 * SIZE(A2, LDA), %xmm6 + movhps -26 * SIZE(A2, LDA), %xmm6 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm14, %xmm4 + addps %xmm4, %xmm0 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm1 + + mulps %xmm15, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm1 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, Y1 + ALIGN_3 + +.L107: + testq $2, MM + je .L108 + + MOVUPS_A1(-32 * SIZE, A1, %xmm4) + movsd -32 * SIZE(A1, LDA), %xmm6 + movhps -30 * SIZE(A1, LDA), %xmm6 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A1(-32 * SIZE, A2, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm9, %xmm5 + SUBPS %xmm5, %xmm0 + + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + movsd -32 * SIZE(A2, LDA), %xmm6 + movhps -30 * SIZE(A2, LDA), %xmm6 + mulps %xmm11, %xmm7 + SUBPS %xmm7, %xmm0 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm0 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, %xmm0 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L108: + testq $1, MM + je .L109 + + movsd -32 * SIZE(A1), %xmm4 + movsd -32 * SIZE(A1, LDA), %xmm6 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + movsd -32 * SIZE(A2), %xmm4 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm9, %xmm5 + SUBPS %xmm5, %xmm0 + + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + movsd -32 * SIZE(A2, LDA), %xmm6 + mulps %xmm11, %xmm7 + SUBPS %xmm7, %xmm0 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + ALIGN_3 + +.L109: + cmpq $4, N + jge .L101 + ALIGN_3 + +.L110: +#endif + +#if GEMV_UNROLL >= 2 + + cmpq $2, N + jl .L120 + +#if GEMV_UNROLL == 2 + ALIGN_3 + +.L111: +#endif + + subq $2, N + + leaq 32 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 1), A2 + leaq (A, LDA, 2), A + + movsd (X), %xmm13 + addq INCX, X + movsd (X), %xmm15 + addq INCX, X + +#ifdef HAVE_SSE3 + movddup ALPHA, %xmm8 +#else + movsd ALPHA, %xmm8 + unpcklpd %xmm8, %xmm8 +#endif + + pshufd $0xb1, 
%xmm8, %xmm9 + + pcmpeqb %xmm11, %xmm11 + psllq $63, %xmm11 + + pshufd $0x00, %xmm13, %xmm12 + pshufd $0x55, %xmm13, %xmm13 + pshufd $0x00, %xmm15, %xmm14 + pshufd $0x55, %xmm15, %xmm15 + +#ifndef XCONJ + xorps %xmm11, %xmm13 + xorps %xmm11, %xmm15 +#else + xorps %xmm11, %xmm12 + xorps %xmm11, %xmm14 +#endif + + mulps %xmm8, %xmm12 + mulps %xmm9, %xmm13 + mulps %xmm8, %xmm14 + mulps %xmm9, %xmm15 + +#ifndef XCONJ + subps %xmm13, %xmm12 + subps %xmm15, %xmm14 +#else + addps %xmm13, %xmm12 + addps %xmm15, %xmm14 +#endif + + pshufd $0x55, %xmm12, %xmm13 + pshufd $0x00, %xmm12, %xmm12 + pshufd $0x55, %xmm14, %xmm15 + pshufd $0x00, %xmm14, %xmm14 + +#ifndef CONJ + xorps %xmm11, %xmm13 + xorps %xmm11, %xmm15 +#else + xorps %xmm11, %xmm12 + xorps %xmm11, %xmm14 +#endif + +#ifdef ALIGNED_ACCESS + cmpq M, MM + je .L11X + + movsd -32 * SIZE(A1), %xmm4 + movsd -32 * SIZE(A2), %xmm6 + + movsd -32 * SIZE(Y1), %xmm0 + + pshufd $0xb1, %xmm4, %xmm5 + pshufd $0xb1, %xmm6, %xmm7 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 +.L11X: +#endif + + movaps -32 * SIZE(Y1), %xmm0 + movaps -28 * SIZE(Y1), %xmm1 + movaps -24 * SIZE(Y1), %xmm2 + movaps -20 * SIZE(Y1), %xmm3 + ALIGN_3 + + movq MM, I + sarq $3, I + jle .L115 + + MOVUPS_A1(-32 * SIZE, A1, %xmm4) + MOVUPS_A1(-28 * SIZE, A1, %xmm6) + MOVUPS_A1(-24 * SIZE, A1, %xmm8) + MOVUPS_A1(-20 * SIZE, A1, %xmm10) + + decq I + jle .L114 + ALIGN_3 + +.L113: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) +#endif + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movsd -32 * SIZE(A2), %xmm4 + movhps -30 * SIZE(A2), %xmm4 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm1 + movsd -28 * SIZE(A2), %xmm6 + movhps -26 * SIZE(A2), %xmm6 + + pshufd $0xb1, %xmm8, %xmm9 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm2 + movsd -24 * SIZE(A2), %xmm8 + movhps -22 * SIZE(A2), %xmm8 + pshufd $0xb1, %xmm10, %xmm11 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm3 + movsd -20 * SIZE(A2), %xmm10 + movhps -18 * SIZE(A2), %xmm10 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + mulps %xmm13, %xmm9 + SUBPS %xmm9, %xmm2 + mulps %xmm13, %xmm11 + SUBPS %xmm11, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) +#endif + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm14, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm1 + MOVUPS_A1(-12 * SIZE, A1, %xmm6) + + pshufd $0xb1, %xmm8, %xmm9 + mulps %xmm14, %xmm8 + addps %xmm8, %xmm2 + MOVUPS_A1( -8 * SIZE, A1, %xmm8) + pshufd $0xb1, %xmm10, %xmm11 + mulps %xmm14, %xmm10 + addps %xmm10, %xmm3 + MOVUPS_A1( -4 * SIZE, A1, %xmm10) + + mulps %xmm15, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm1 + + mulps %xmm15, %xmm9 + SUBPS %xmm9, %xmm2 + mulps %xmm15, %xmm11 + SUBPS %xmm11, %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 2 - 128 + PREOFFSET(Y1) +#endif + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + movaps %xmm2, -24 * SIZE(Y1) + movaps %xmm3, -20 * SIZE(Y1) + + movaps -16 * SIZE(Y1), %xmm0 + movaps -12 * SIZE(Y1), %xmm1 + movaps -8 * SIZE(Y1), %xmm2 + movaps -4 * SIZE(Y1), %xmm3 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + + subq $1, I + BRANCH 
+ jg .L113 + ALIGN_3 + +.L114: + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movsd -32 * SIZE(A2), %xmm4 + movhps -30 * SIZE(A2), %xmm4 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm1 + movsd -28 * SIZE(A2), %xmm6 + movhps -26 * SIZE(A2), %xmm6 + + pshufd $0xb1, %xmm8, %xmm9 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm2 + movsd -24 * SIZE(A2), %xmm8 + movhps -22 * SIZE(A2), %xmm8 + pshufd $0xb1, %xmm10, %xmm11 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm3 + movsd -20 * SIZE(A2), %xmm10 + movhps -18 * SIZE(A2), %xmm10 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + mulps %xmm13, %xmm9 + SUBPS %xmm9, %xmm2 + mulps %xmm13, %xmm11 + SUBPS %xmm11, %xmm3 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm14, %xmm4 + addps %xmm4, %xmm0 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm1 + + pshufd $0xb1, %xmm8, %xmm9 + mulps %xmm14, %xmm8 + addps %xmm8, %xmm2 + pshufd $0xb1, %xmm10, %xmm11 + mulps %xmm14, %xmm10 + addps %xmm10, %xmm3 + + mulps %xmm15, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm1 + + mulps %xmm15, %xmm9 + SUBPS %xmm9, %xmm2 + mulps %xmm15, %xmm11 + SUBPS %xmm11, %xmm3 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + movaps %xmm2, -24 * SIZE(Y1) + movaps %xmm3, -20 * SIZE(Y1) + + movaps -16 * SIZE(Y1), %xmm0 + movaps -12 * SIZE(Y1), %xmm1 + movaps -8 * SIZE(Y1), %xmm2 + movaps -4 * SIZE(Y1), %xmm3 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + ALIGN_3 + +.L115: + testq $4, MM + je .L117 + + MOVUPS_A1(-32 * SIZE, A1, %xmm4) + MOVUPS_A1(-28 * SIZE, A1, %xmm6) + movsd -32 * SIZE(A2), %xmm8 + movhps -30 * SIZE(A2), %xmm8 + movsd -28 * SIZE(A2), %xmm10 + movhps -26 * SIZE(A2), %xmm10 + + pshufd $0xb1, %xmm4, %xmm5 + pshufd $0xb1, %xmm6, %xmm7 + pshufd $0xb1, %xmm8, %xmm9 + pshufd $0xb1, %xmm10, %xmm11 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm1 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + mulps %xmm14, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm14, %xmm10 + addps %xmm10, %xmm1 + + mulps %xmm15, %xmm9 + SUBPS %xmm9, %xmm0 + mulps %xmm15, %xmm11 + SUBPS %xmm11, %xmm1 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, Y1 + ALIGN_3 + +.L117: + testq $2, MM + je .L118 + + MOVUPS_A1(-32 * SIZE, A1, %xmm4) + movsd -32 * SIZE(A2), %xmm6 + movhps -30 * SIZE(A2), %xmm6 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm0 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, %xmm0 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L118: + testq $1, MM +#if GEMV_UNROLL == 2 + je .L119 +#else + je .L120 +#endif + + movsd -32 * SIZE(A1), %xmm4 + movsd -32 * SIZE(A2), %xmm6 + + pshufd $0xb1, %xmm4, %xmm5 + pshufd $0xb1, %xmm6, %xmm7 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + +#if GEMV_UNROLL == 2 + ALIGN_3 + +.L119: + cmpq $2, N + jge .L111 +#endif + ALIGN_3 + +.L120: +#endif + + cmpq $1, N + jl .L990 + + leaq 32 * SIZE(BUFFER), Y1 + movq A, A1 + + movsd (X), 
%xmm13 + addq INCX, X + +#ifdef HAVE_SSE3 + movddup ALPHA, %xmm8 +#else + movsd ALPHA, %xmm8 + unpcklpd %xmm8, %xmm8 +#endif + + pshufd $0xb1, %xmm8, %xmm9 + + pcmpeqb %xmm11, %xmm11 + psllq $63, %xmm11 + + pshufd $0x00, %xmm13, %xmm12 + pshufd $0x55, %xmm13, %xmm13 + +#ifndef XCONJ + xorps %xmm11, %xmm13 +#else + xorps %xmm11, %xmm12 +#endif + + mulps %xmm8, %xmm12 + mulps %xmm9, %xmm13 + +#ifndef XCONJ + subps %xmm13, %xmm12 +#else + addps %xmm13, %xmm12 +#endif + + pshufd $0x55, %xmm12, %xmm13 + pshufd $0x00, %xmm12, %xmm12 + +#ifndef CONJ + xorps %xmm11, %xmm13 +#else + xorps %xmm11, %xmm12 +#endif + +#ifdef ALIGNED_ACCESS + cmpq M, MM + je .L12X + + movsd -32 * SIZE(A1), %xmm4 + movsd -32 * SIZE(Y1), %xmm0 + + pshufd $0xb1, %xmm4, %xmm5 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, Y1 + ALIGN_3 +.L12X: +#endif + + movaps -32 * SIZE(Y1), %xmm0 + movaps -28 * SIZE(Y1), %xmm1 + movaps -24 * SIZE(Y1), %xmm2 + movaps -20 * SIZE(Y1), %xmm3 + ALIGN_3 + + movq MM, I + sarq $3, I + jle .L125 + + MOVUPS_A1(-32 * SIZE, A1, %xmm4) + MOVUPS_A1(-28 * SIZE, A1, %xmm6) + MOVUPS_A1(-24 * SIZE, A1, %xmm8) + MOVUPS_A1(-20 * SIZE, A1, %xmm10) + + decq I + jle .L124 + ALIGN_3 + +.L123: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) +#endif + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm1 + MOVUPS_A1(-12 * SIZE, A1, %xmm6) + + pshufd $0xb1, %xmm8, %xmm9 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm2 + MOVUPS_A1( -8 * SIZE, A1, %xmm8) + pshufd $0xb1, %xmm10, %xmm11 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm3 + MOVUPS_A1( -4 * SIZE, A1, %xmm10) + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + mulps %xmm13, %xmm9 + SUBPS %xmm9, %xmm2 + mulps %xmm13, %xmm11 + SUBPS %xmm11, %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 4 - 128 + PREOFFSET(Y1) +#endif + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + movaps %xmm2, -24 * SIZE(Y1) + movaps %xmm3, -20 * SIZE(Y1) + + movaps -16 * SIZE(Y1), %xmm0 + movaps -12 * SIZE(Y1), %xmm1 + movaps -8 * SIZE(Y1), %xmm2 + movaps -4 * SIZE(Y1), %xmm3 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L123 + ALIGN_3 + +.L124: + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm1 + + pshufd $0xb1, %xmm8, %xmm9 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm2 + pshufd $0xb1, %xmm10, %xmm11 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm3 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + mulps %xmm13, %xmm9 + SUBPS %xmm9, %xmm2 + mulps %xmm13, %xmm11 + SUBPS %xmm11, %xmm3 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + movaps %xmm2, -24 * SIZE(Y1) + movaps %xmm3, -20 * SIZE(Y1) + + movaps -16 * SIZE(Y1), %xmm0 + movaps -12 * SIZE(Y1), %xmm1 + movaps -8 * SIZE(Y1), %xmm2 + movaps -4 * SIZE(Y1), %xmm3 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, Y1 + ALIGN_3 + +.L125: + testq $4, MM + je .L127 + + MOVUPS_A1(-32 * SIZE, A1, %xmm4) + MOVUPS_A1(-28 * SIZE, A1, %xmm6) + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm1 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 
+ SUBPS %xmm7, %xmm1 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addq $8 * SIZE, A1 + addq $8 * SIZE, Y1 + ALIGN_3 + +.L127: + testq $2, MM + je .L128 + + MOVUPS_A1(-32 * SIZE, A1, %xmm4) + + pshufd $0xb1, %xmm4, %xmm5 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, %xmm0 + + addq $4 * SIZE, A1 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L128: + testq $1, MM + je .L990 + + movsd -32 * SIZE(A1), %xmm4 + + pshufd $0xb1, %xmm4, %xmm5 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + + jmp .L990 + ALIGN_3 + +.L200: + testq $2 * SIZE, LDA + jne .L300 + + cmpq $2, N + jl .L210 + ALIGN_3 + +.L201: + subq $2, N + + leaq 32 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 1), A2 + leaq (A, LDA, 2), A + + movsd (X), %xmm13 + addq INCX, X + movsd (X), %xmm15 + addq INCX, X + +#ifdef HAVE_SSE3 + movddup ALPHA, %xmm8 +#else + movsd ALPHA, %xmm8 + unpcklpd %xmm8, %xmm8 +#endif + + pshufd $0xb1, %xmm8, %xmm9 + + pcmpeqb %xmm11, %xmm11 + psllq $63, %xmm11 + + pshufd $0x00, %xmm13, %xmm12 + pshufd $0x55, %xmm13, %xmm13 + pshufd $0x00, %xmm15, %xmm14 + pshufd $0x55, %xmm15, %xmm15 + +#ifndef XCONJ + xorps %xmm11, %xmm13 + xorps %xmm11, %xmm15 +#else + xorps %xmm11, %xmm12 + xorps %xmm11, %xmm14 +#endif + + mulps %xmm8, %xmm12 + mulps %xmm9, %xmm13 + mulps %xmm8, %xmm14 + mulps %xmm9, %xmm15 + +#ifndef XCONJ + subps %xmm13, %xmm12 + subps %xmm15, %xmm14 +#else + addps %xmm13, %xmm12 + addps %xmm15, %xmm14 +#endif + + pshufd $0x55, %xmm12, %xmm13 + pshufd $0x00, %xmm12, %xmm12 + pshufd $0x55, %xmm14, %xmm15 + pshufd $0x00, %xmm14, %xmm14 + +#ifndef CONJ + xorps %xmm11, %xmm13 + xorps %xmm11, %xmm15 +#else + xorps %xmm11, %xmm12 + xorps %xmm11, %xmm14 +#endif + +#ifdef ALIGNED_ACCESS + cmpq M, MM + je .L20X + + movsd -32 * SIZE(A1), %xmm4 + movsd -32 * SIZE(A2), %xmm6 + + movsd -32 * SIZE(Y1), %xmm0 + + pshufd $0xb1, %xmm4, %xmm5 + pshufd $0xb1, %xmm6, %xmm7 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 +.L20X: +#endif + + movaps -33 * SIZE(A1), %xmm4 + movaps -33 * SIZE(A2), %xmm6 + + movaps -32 * SIZE(Y1), %xmm0 + movaps -28 * SIZE(Y1), %xmm1 + movaps -24 * SIZE(Y1), %xmm2 + movaps -20 * SIZE(Y1), %xmm3 + + movq MM, I + sarq $3, I + jle .L205 + + movaps -29 * SIZE(A1), %xmm8 + movaps -25 * SIZE(A1), %xmm9 + movaps -21 * SIZE(A1), %xmm10 + + decq I + jle .L204 + ALIGN_3 + +.L203: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) +#endif + + movss %xmm8, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -17 * SIZE(A1), %xmm4 + + movss %xmm9, %xmm8 + shufps $0x39, %xmm8, %xmm8 + pshufd $0xb1, %xmm8, %xmm7 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm1 + movaps -29 * SIZE(A2), %xmm8 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + movss %xmm10, %xmm9 + shufps $0x39, %xmm9, %xmm9 + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + movaps -25 * SIZE(A2), %xmm9 + + movss %xmm4, %xmm10 + shufps $0x39, %xmm10, %xmm10 + pshufd $0xb1, %xmm10, %xmm7 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm3 + movaps -21 * SIZE(A2), %xmm10 
+ + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm2 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) +#endif + + movss %xmm8, %xmm6 + shufps $0x39, %xmm6, %xmm6 + pshufd $0xb1, %xmm6, %xmm5 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + movaps -17 * SIZE(A2), %xmm6 + + movss %xmm9, %xmm8 + shufps $0x39, %xmm8, %xmm8 + pshufd $0xb1, %xmm8, %xmm7 + mulps %xmm14, %xmm8 + addps %xmm8, %xmm1 + movaps -13 * SIZE(A1), %xmm8 + + mulps %xmm15, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm1 + + movss %xmm10, %xmm9 + shufps $0x39, %xmm9, %xmm9 + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm14, %xmm9 + addps %xmm9, %xmm2 + movaps -9 * SIZE(A1), %xmm9 + + movss %xmm6, %xmm10 + shufps $0x39, %xmm10, %xmm10 + pshufd $0xb1, %xmm10, %xmm7 + mulps %xmm14, %xmm10 + addps %xmm10, %xmm3 + movaps -5 * SIZE(A1), %xmm10 + + mulps %xmm15, %xmm5 + SUBPS %xmm5, %xmm2 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 2 - 128 + PREOFFSET(Y1) +#endif + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + movaps %xmm2, -24 * SIZE(Y1) + movaps %xmm3, -20 * SIZE(Y1) + + movaps -16 * SIZE(Y1), %xmm0 + movaps -12 * SIZE(Y1), %xmm1 + movaps -8 * SIZE(Y1), %xmm2 + movaps -4 * SIZE(Y1), %xmm3 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L203 + ALIGN_3 + +.L204: + movss %xmm8, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -17 * SIZE(A1), %xmm4 + + movss %xmm9, %xmm8 + shufps $0x39, %xmm8, %xmm8 + pshufd $0xb1, %xmm8, %xmm7 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm1 + movaps -29 * SIZE(A2), %xmm8 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + movss %xmm10, %xmm9 + shufps $0x39, %xmm9, %xmm9 + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + movaps -25 * SIZE(A2), %xmm9 + + movss %xmm4, %xmm10 + shufps $0x39, %xmm10, %xmm10 + pshufd $0xb1, %xmm10, %xmm7 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm3 + movaps -21 * SIZE(A2), %xmm10 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm2 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm3 + + movss %xmm8, %xmm6 + shufps $0x39, %xmm6, %xmm6 + pshufd $0xb1, %xmm6, %xmm5 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + movaps -17 * SIZE(A2), %xmm6 + + movss %xmm9, %xmm8 + shufps $0x39, %xmm8, %xmm8 + pshufd $0xb1, %xmm8, %xmm7 + mulps %xmm14, %xmm8 + addps %xmm8, %xmm1 + + mulps %xmm15, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm1 + + movss %xmm10, %xmm9 + shufps $0x39, %xmm9, %xmm9 + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm14, %xmm9 + addps %xmm9, %xmm2 + + movss %xmm6, %xmm10 + shufps $0x39, %xmm10, %xmm10 + pshufd $0xb1, %xmm10, %xmm7 + mulps %xmm14, %xmm10 + addps %xmm10, %xmm3 + + mulps %xmm15, %xmm5 + SUBPS %xmm5, %xmm2 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm3 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + movaps %xmm2, -24 * SIZE(Y1) + movaps %xmm3, -20 * SIZE(Y1) + + movaps -16 * SIZE(Y1), %xmm0 + movaps -12 * SIZE(Y1), %xmm1 + movaps -8 * SIZE(Y1), %xmm2 + movaps -4 * SIZE(Y1), %xmm3 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + ALIGN_3 + +.L205: + testq $4, MM + je .L207 + + movaps -29 * SIZE(A1), %xmm8 + movaps -25 * SIZE(A1), %xmm9 + movaps -29 * SIZE(A2), %xmm10 + movaps -25 * SIZE(A2), %xmm11 + + movss %xmm8, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, 
%xmm0 + + movss %xmm9, %xmm8 + shufps $0x39, %xmm8, %xmm8 + pshufd $0xb1, %xmm8, %xmm7 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm1 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + movss %xmm10, %xmm6 + shufps $0x39, %xmm6, %xmm6 + pshufd $0xb1, %xmm6, %xmm5 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + + movss %xmm11, %xmm10 + shufps $0x39, %xmm10, %xmm10 + pshufd $0xb1, %xmm10, %xmm7 + mulps %xmm14, %xmm10 + addps %xmm10, %xmm1 + + mulps %xmm15, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm1 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + + movaps %xmm9, %xmm4 + movaps %xmm11, %xmm6 + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, Y1 + ALIGN_3 + +.L207: + testq $2, MM + je .L208 + + movaps -29 * SIZE(A1), %xmm8 + movaps -29 * SIZE(A2), %xmm9 + + movss %xmm8, %xmm4 + shufps $0x39, %xmm4, %xmm4 + movss %xmm9, %xmm6 + shufps $0x39, %xmm6, %xmm6 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm0 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, %xmm0 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L208: + testq $1, MM + je .L209 + + movsd -32 * SIZE(A1), %xmm4 + movsd -32 * SIZE(A2), %xmm6 + + pshufd $0xb1, %xmm4, %xmm5 + pshufd $0xb1, %xmm6, %xmm7 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + ALIGN_3 + +.L209: + cmpq $2, N + jge .L201 + ALIGN_3 + +.L210: + cmpq $1, N + jl .L990 + + leaq 32 * SIZE(BUFFER), Y1 + movq A, A1 + + movsd (X), %xmm13 + addq INCX, X + +#ifdef HAVE_SSE3 + movddup ALPHA, %xmm8 +#else + movsd ALPHA, %xmm8 + unpcklpd %xmm8, %xmm8 +#endif + + pshufd $0xb1, %xmm8, %xmm9 + + pcmpeqb %xmm11, %xmm11 + psllq $63, %xmm11 + + pshufd $0x00, %xmm13, %xmm12 + pshufd $0x55, %xmm13, %xmm13 + +#ifndef XCONJ + xorps %xmm11, %xmm13 +#else + xorps %xmm11, %xmm12 +#endif + + mulps %xmm8, %xmm12 + mulps %xmm9, %xmm13 + +#ifndef XCONJ + subps %xmm13, %xmm12 +#else + addps %xmm13, %xmm12 +#endif + + pshufd $0x55, %xmm12, %xmm13 + pshufd $0x00, %xmm12, %xmm12 + +#ifndef CONJ + xorps %xmm11, %xmm13 +#else + xorps %xmm11, %xmm12 +#endif + +#ifdef ALIGNED_ACCESS + cmpq M, MM + je .L21X + + movsd -32 * SIZE(A1), %xmm4 + movsd -32 * SIZE(Y1), %xmm0 + + pshufd $0xb1, %xmm4, %xmm5 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, Y1 + ALIGN_3 +.L21X: +#endif + + movaps -33 * SIZE(A1), %xmm4 + + movaps -32 * SIZE(Y1), %xmm0 + movaps -28 * SIZE(Y1), %xmm1 + movaps -24 * SIZE(Y1), %xmm2 + movaps -20 * SIZE(Y1), %xmm3 + + movq MM, I + sarq $3, I + jle .L215 + + movaps -29 * SIZE(A1), %xmm6 + movaps -25 * SIZE(A1), %xmm8 + movaps -21 * SIZE(A1), %xmm10 + + decq I + jle .L214 + ALIGN_3 + +.L213: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) +#endif + + movss %xmm6, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -17 * SIZE(A1), %xmm4 + + movss %xmm8, %xmm6 + shufps $0x39, %xmm6, %xmm6 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm1 + movaps -13 * SIZE(A1), %xmm6 + + movss 
%xmm10, %xmm8 + shufps $0x39, %xmm8, %xmm8 + pshufd $0xb1, %xmm8, %xmm9 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm2 + movaps -9 * SIZE(A1), %xmm8 + + movss %xmm4, %xmm10 + shufps $0x39, %xmm10, %xmm10 + pshufd $0xb1, %xmm10, %xmm11 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm3 + movaps -5 * SIZE(A1), %xmm10 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + mulps %xmm13, %xmm9 + SUBPS %xmm9, %xmm2 + mulps %xmm13, %xmm11 + SUBPS %xmm11, %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 4 - 128 + PREOFFSET(Y1) +#endif + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + movaps %xmm2, -24 * SIZE(Y1) + movaps %xmm3, -20 * SIZE(Y1) + + movaps -16 * SIZE(Y1), %xmm0 + movaps -12 * SIZE(Y1), %xmm1 + movaps -8 * SIZE(Y1), %xmm2 + movaps -4 * SIZE(Y1), %xmm3 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L213 + ALIGN_3 + +.L214: + movss %xmm6, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -17 * SIZE(A1), %xmm4 + + movss %xmm8, %xmm6 + shufps $0x39, %xmm6, %xmm6 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm1 + + movss %xmm10, %xmm8 + shufps $0x39, %xmm8, %xmm8 + pshufd $0xb1, %xmm8, %xmm9 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm2 + + movss %xmm4, %xmm10 + shufps $0x39, %xmm10, %xmm10 + pshufd $0xb1, %xmm10, %xmm11 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm3 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + mulps %xmm13, %xmm9 + SUBPS %xmm9, %xmm2 + mulps %xmm13, %xmm11 + SUBPS %xmm11, %xmm3 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + movaps %xmm2, -24 * SIZE(Y1) + movaps %xmm3, -20 * SIZE(Y1) + + movaps -16 * SIZE(Y1), %xmm0 + movaps -12 * SIZE(Y1), %xmm1 + movaps -8 * SIZE(Y1), %xmm2 + movaps -4 * SIZE(Y1), %xmm3 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, Y1 + ALIGN_3 + +.L215: + testq $4, MM + je .L217 + + movaps -29 * SIZE(A1), %xmm6 + movaps -25 * SIZE(A1), %xmm8 + + movss %xmm6, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + + movss %xmm8, %xmm6 + shufps $0x39, %xmm6, %xmm6 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm1 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + movaps %xmm8, %xmm4 + + addq $8 * SIZE, A1 + addq $8 * SIZE, Y1 + ALIGN_3 + +.L217: + testq $2, MM + je .L218 + + movaps -29 * SIZE(A1), %xmm6 + + movss %xmm6, %xmm4 + shufps $0x39, %xmm4, %xmm4 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, %xmm0 + + addq $4 * SIZE, A1 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L218: + testq $1, MM + je .L990 + + movsd -32 * SIZE(A1), %xmm4 + + pshufd $0xb1, %xmm4, %xmm5 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + jmp .L990 + ALIGN_3 + +.L300: + cmpq $2, N + jl .L310 + ALIGN_3 + +.L301: + subq $2, N + + leaq 32 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 1), A2 + leaq (A, LDA, 2), A + + movsd (X), %xmm13 + addq INCX, X + movsd (X), %xmm15 + addq INCX, X + +#ifdef HAVE_SSE3 + movddup ALPHA, %xmm8 +#else + movsd ALPHA, %xmm8 + unpcklpd %xmm8, %xmm8 +#endif + + pshufd $0xb1, %xmm8, %xmm9 + + pcmpeqb %xmm11, 
%xmm11 + psllq $63, %xmm11 + + pshufd $0x00, %xmm13, %xmm12 + pshufd $0x55, %xmm13, %xmm13 + pshufd $0x00, %xmm15, %xmm14 + pshufd $0x55, %xmm15, %xmm15 + +#ifndef XCONJ + xorps %xmm11, %xmm13 + xorps %xmm11, %xmm15 +#else + xorps %xmm11, %xmm12 + xorps %xmm11, %xmm14 +#endif + + mulps %xmm8, %xmm12 + mulps %xmm9, %xmm13 + mulps %xmm8, %xmm14 + mulps %xmm9, %xmm15 + +#ifndef XCONJ + subps %xmm13, %xmm12 + subps %xmm15, %xmm14 +#else + addps %xmm13, %xmm12 + addps %xmm15, %xmm14 +#endif + + pshufd $0x55, %xmm12, %xmm13 + pshufd $0x00, %xmm12, %xmm12 + pshufd $0x55, %xmm14, %xmm15 + pshufd $0x00, %xmm14, %xmm14 + +#ifndef CONJ + xorps %xmm11, %xmm13 + xorps %xmm11, %xmm15 +#else + xorps %xmm11, %xmm12 + xorps %xmm11, %xmm14 +#endif + +#ifdef ALIGNED_ACCESS + cmpq M, MM + je .L30X + + movsd -32 * SIZE(A1), %xmm4 + movsd -32 * SIZE(A2), %xmm6 + + movsd -32 * SIZE(Y1), %xmm0 + + pshufd $0xb1, %xmm4, %xmm5 + pshufd $0xb1, %xmm6, %xmm7 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 +.L30X: +#endif + + movaps -33 * SIZE(A1), %xmm4 + movaps -35 * SIZE(A2), %xmm6 + + movaps -32 * SIZE(Y1), %xmm0 + movaps -28 * SIZE(Y1), %xmm1 + movaps -24 * SIZE(Y1), %xmm2 + movaps -20 * SIZE(Y1), %xmm3 + + movq MM, I + sarq $3, I + jle .L305 + + movaps -29 * SIZE(A1), %xmm8 + movaps -25 * SIZE(A1), %xmm9 + movaps -21 * SIZE(A1), %xmm10 + + decq I + jle .L304 + ALIGN_3 + +.L303: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) +#endif + + movss %xmm8, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -17 * SIZE(A1), %xmm4 + + movss %xmm9, %xmm8 + shufps $0x39, %xmm8, %xmm8 + pshufd $0xb1, %xmm8, %xmm7 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm1 + movaps -31 * SIZE(A2), %xmm8 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + movss %xmm10, %xmm9 + shufps $0x39, %xmm9, %xmm9 + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + movaps -27 * SIZE(A2), %xmm9 + + movss %xmm4, %xmm10 + shufps $0x39, %xmm10, %xmm10 + pshufd $0xb1, %xmm10, %xmm7 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm3 + movaps -23 * SIZE(A2), %xmm10 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm2 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) +#endif + + movss %xmm8, %xmm6 + shufps $0x93, %xmm8, %xmm6 + pshufd $0xb1, %xmm6, %xmm5 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + movaps -19 * SIZE(A2), %xmm6 + + movss %xmm9, %xmm8 + shufps $0x93, %xmm9, %xmm8 + pshufd $0xb1, %xmm8, %xmm7 + mulps %xmm14, %xmm8 + addps %xmm8, %xmm1 + movaps -13 * SIZE(A1), %xmm8 + + mulps %xmm15, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm1 + + movss %xmm10, %xmm9 + shufps $0x93, %xmm10, %xmm9 + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm14, %xmm9 + addps %xmm9, %xmm2 + movaps -9 * SIZE(A1), %xmm9 + + movss %xmm6, %xmm10 + shufps $0x93, %xmm6, %xmm10 + pshufd $0xb1, %xmm10, %xmm7 + mulps %xmm14, %xmm10 + addps %xmm10, %xmm3 + movaps -5 * SIZE(A1), %xmm10 + + mulps %xmm15, %xmm5 + SUBPS %xmm5, %xmm2 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 2 - 128 + PREOFFSET(Y1) +#endif + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + movaps %xmm2, -24 * SIZE(Y1) + 
movaps %xmm3, -20 * SIZE(Y1) + + movaps -16 * SIZE(Y1), %xmm0 + movaps -12 * SIZE(Y1), %xmm1 + movaps -8 * SIZE(Y1), %xmm2 + movaps -4 * SIZE(Y1), %xmm3 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L303 + ALIGN_3 + +.L304: + movss %xmm8, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -17 * SIZE(A1), %xmm4 + + movss %xmm9, %xmm8 + shufps $0x39, %xmm8, %xmm8 + pshufd $0xb1, %xmm8, %xmm7 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm1 + movaps -31 * SIZE(A2), %xmm8 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + movss %xmm10, %xmm9 + shufps $0x39, %xmm9, %xmm9 + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + movaps -27 * SIZE(A2), %xmm9 + + movss %xmm4, %xmm10 + shufps $0x39, %xmm10, %xmm10 + pshufd $0xb1, %xmm10, %xmm7 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm3 + movaps -23 * SIZE(A2), %xmm10 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm2 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm3 + + movss %xmm8, %xmm6 + shufps $0x93, %xmm8, %xmm6 + pshufd $0xb1, %xmm6, %xmm5 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + movaps -19 * SIZE(A2), %xmm6 + + movss %xmm9, %xmm8 + shufps $0x93, %xmm9, %xmm8 + pshufd $0xb1, %xmm8, %xmm7 + mulps %xmm14, %xmm8 + addps %xmm8, %xmm1 + + mulps %xmm15, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm1 + + movss %xmm10, %xmm9 + shufps $0x93, %xmm10, %xmm9 + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm14, %xmm9 + addps %xmm9, %xmm2 + + movss %xmm6, %xmm10 + shufps $0x93, %xmm6, %xmm10 + pshufd $0xb1, %xmm10, %xmm7 + mulps %xmm14, %xmm10 + addps %xmm10, %xmm3 + + mulps %xmm15, %xmm5 + SUBPS %xmm5, %xmm2 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm3 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + movaps %xmm2, -24 * SIZE(Y1) + movaps %xmm3, -20 * SIZE(Y1) + + movaps -16 * SIZE(Y1), %xmm0 + movaps -12 * SIZE(Y1), %xmm1 + movaps -8 * SIZE(Y1), %xmm2 + movaps -4 * SIZE(Y1), %xmm3 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + ALIGN_3 + +.L305: + testq $4, MM + je .L307 + + movaps -29 * SIZE(A1), %xmm8 + movaps -25 * SIZE(A1), %xmm9 + movaps -31 * SIZE(A2), %xmm10 + movaps -27 * SIZE(A2), %xmm11 + + movss %xmm8, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + + movss %xmm9, %xmm8 + shufps $0x39, %xmm8, %xmm8 + pshufd $0xb1, %xmm8, %xmm7 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm1 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + movss %xmm10, %xmm6 + shufps $0x93, %xmm10, %xmm6 + pshufd $0xb1, %xmm6, %xmm5 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + + movss %xmm11, %xmm10 + shufps $0x93, %xmm11, %xmm10 + pshufd $0xb1, %xmm10, %xmm7 + mulps %xmm14, %xmm10 + addps %xmm10, %xmm1 + + mulps %xmm15, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm1 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + + movaps %xmm9, %xmm4 + movaps %xmm11, %xmm6 + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, Y1 + ALIGN_3 + +.L307: + testq $2, MM + je .L308 + + movaps -29 * SIZE(A1), %xmm8 + movaps -31 * SIZE(A2), %xmm9 + + movss %xmm8, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + + movss %xmm9, %xmm6 + shufps $0x93, %xmm9, %xmm6 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + mulps 
%xmm14, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm0 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, %xmm0 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L308: + testq $1, MM + je .L309 + + movsd -32 * SIZE(A1), %xmm4 + movsd -32 * SIZE(A2), %xmm6 + + pshufd $0xb1, %xmm4, %xmm5 + pshufd $0xb1, %xmm6, %xmm7 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm15, %xmm7 + SUBPS %xmm7, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + ALIGN_3 + +.L309: + cmpq $2, N + jge .L301 + ALIGN_3 + +.L310: + cmpq $1, N + jl .L990 + + leaq 32 * SIZE(BUFFER), Y1 + movq A, A1 + + movsd (X), %xmm13 + addq INCX, X + +#ifdef HAVE_SSE3 + movddup ALPHA, %xmm8 +#else + movsd ALPHA, %xmm8 + unpcklpd %xmm8, %xmm8 +#endif + + pshufd $0xb1, %xmm8, %xmm9 + + pcmpeqb %xmm11, %xmm11 + psllq $63, %xmm11 + + pshufd $0x00, %xmm13, %xmm12 + pshufd $0x55, %xmm13, %xmm13 + +#ifndef XCONJ + xorps %xmm11, %xmm13 +#else + xorps %xmm11, %xmm12 +#endif + + mulps %xmm8, %xmm12 + mulps %xmm9, %xmm13 + +#ifndef XCONJ + subps %xmm13, %xmm12 +#else + addps %xmm13, %xmm12 +#endif + + pshufd $0x55, %xmm12, %xmm13 + pshufd $0x00, %xmm12, %xmm12 + +#ifndef CONJ + xorps %xmm11, %xmm13 +#else + xorps %xmm11, %xmm12 +#endif + +#ifdef ALIGNED_ACCESS + cmpq M, MM + je .L31X + + movsd -32 * SIZE(A1), %xmm4 + movsd -32 * SIZE(Y1), %xmm0 + + pshufd $0xb1, %xmm4, %xmm5 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, Y1 + ALIGN_3 +.L31X: +#endif + + movaps -33 * SIZE(A1), %xmm4 + + movaps -32 * SIZE(Y1), %xmm0 + movaps -28 * SIZE(Y1), %xmm1 + movaps -24 * SIZE(Y1), %xmm2 + movaps -20 * SIZE(Y1), %xmm3 + + movq MM, I + sarq $3, I + jle .L315 + + movaps -29 * SIZE(A1), %xmm6 + movaps -25 * SIZE(A1), %xmm8 + movaps -21 * SIZE(A1), %xmm10 + + decq I + jle .L314 + ALIGN_3 + +.L313: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) +#endif + + movss %xmm6, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -17 * SIZE(A1), %xmm4 + + movss %xmm8, %xmm6 + shufps $0x39, %xmm6, %xmm6 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm1 + movaps -13 * SIZE(A1), %xmm6 + + movss %xmm10, %xmm8 + shufps $0x39, %xmm8, %xmm8 + pshufd $0xb1, %xmm8, %xmm9 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm2 + movaps -9 * SIZE(A1), %xmm8 + + movss %xmm4, %xmm10 + shufps $0x39, %xmm10, %xmm10 + pshufd $0xb1, %xmm10, %xmm11 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm3 + movaps -5 * SIZE(A1), %xmm10 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + mulps %xmm13, %xmm9 + SUBPS %xmm9, %xmm2 + mulps %xmm13, %xmm11 + SUBPS %xmm11, %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 4 - 128 + PREOFFSET(Y1) +#endif + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + movaps %xmm2, -24 * SIZE(Y1) + movaps %xmm3, -20 * SIZE(Y1) + + movaps -16 * SIZE(Y1), %xmm0 + movaps -12 * SIZE(Y1), %xmm1 + movaps -8 * SIZE(Y1), %xmm2 + movaps -4 * SIZE(Y1), %xmm3 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L313 + ALIGN_3 + +.L314: + movss %xmm6, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -17 * SIZE(A1), %xmm4 + + movss %xmm8, %xmm6 + shufps $0x39, %xmm6, %xmm6 + 
pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm1 + + movss %xmm10, %xmm8 + shufps $0x39, %xmm8, %xmm8 + pshufd $0xb1, %xmm8, %xmm9 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm2 + + movss %xmm4, %xmm10 + shufps $0x39, %xmm10, %xmm10 + pshufd $0xb1, %xmm10, %xmm11 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm3 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + mulps %xmm13, %xmm9 + SUBPS %xmm9, %xmm2 + mulps %xmm13, %xmm11 + SUBPS %xmm11, %xmm3 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + movaps %xmm2, -24 * SIZE(Y1) + movaps %xmm3, -20 * SIZE(Y1) + + movaps -16 * SIZE(Y1), %xmm0 + movaps -12 * SIZE(Y1), %xmm1 + movaps -8 * SIZE(Y1), %xmm2 + movaps -4 * SIZE(Y1), %xmm3 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, Y1 + ALIGN_3 + +.L315: + testq $4, MM + je .L317 + + movaps -29 * SIZE(A1), %xmm6 + movaps -25 * SIZE(A1), %xmm8 + + movss %xmm6, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + + movss %xmm8, %xmm6 + shufps $0x39, %xmm6, %xmm6 + pshufd $0xb1, %xmm6, %xmm7 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm1 + + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm1 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, -28 * SIZE(Y1) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + movaps %xmm8, %xmm4 + + addq $8 * SIZE, A1 + addq $8 * SIZE, Y1 + ALIGN_3 + +.L317: + testq $2, MM + je .L318 + + movaps -29 * SIZE(A1), %xmm6 + + movss %xmm6, %xmm4 + shufps $0x39, %xmm4, %xmm4 + + pshufd $0xb1, %xmm4, %xmm5 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + movaps %xmm0, -32 * SIZE(Y1) + movaps %xmm1, %xmm0 + + addq $4 * SIZE, A1 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L318: + testq $1, MM + je .L990 + + movsd -32 * SIZE(A1), %xmm4 + + pshufd $0xb1, %xmm4, %xmm5 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + SUBPS %xmm5, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) +#endif + ALIGN_3 + +.L990: + movq Y, Y1 + +#ifdef ALIGNED_ACCESS + cmpq M, MM + je .L991 + + movsd (Y), %xmm0 + addq INCY, Y + + movsd (BUFFER), %xmm1 + addq $2 * SIZE, BUFFER + + addps %xmm1, %xmm0 + + movlps %xmm0, (Y1) + addq INCY, Y1 + ALIGN_3 + +.L991: +#endif + movq MM, %rax + sarq $3, %rax + jle .L994 + ALIGN_3 + +.L992: + movsd (Y), %xmm0 + addq INCY, Y + movhps (Y), %xmm0 + addq INCY, Y + + movsd (Y), %xmm1 + addq INCY, Y + movhps (Y), %xmm1 + addq INCY, Y + + movsd (Y), %xmm2 + addq INCY, Y + movhps (Y), %xmm2 + addq INCY, Y + + movsd (Y), %xmm3 + addq INCY, Y + movhps (Y), %xmm3 + addq INCY, Y + + addps 0 * SIZE(BUFFER), %xmm0 + addps 4 * SIZE(BUFFER), %xmm1 + addps 8 * SIZE(BUFFER), %xmm2 + addps 12 * SIZE(BUFFER), %xmm3 + + movlps %xmm0, (Y1) + addq INCY, Y1 + movhps %xmm0, (Y1) + addq INCY, Y1 + + movlps %xmm1, (Y1) + addq INCY, Y1 + movhps %xmm1, (Y1) + addq INCY, Y1 + + movlps %xmm2, (Y1) + addq INCY, Y1 + movhps %xmm2, (Y1) + addq INCY, Y1 + + movlps %xmm3, (Y1) + addq INCY, Y1 + movhps %xmm3, (Y1) + addq INCY, Y1 + + addq $16 * SIZE, BUFFER + decq %rax + jg .L992 + ALIGN_3 + +.L994: + testq $7, MM + jle .L999 + + testq $4, MM + jle .L995 + + movsd (Y), %xmm0 + addq INCY, Y + movhps (Y), %xmm0 + addq INCY, Y + + movsd (Y), %xmm1 + addq INCY, Y + movhps (Y), %xmm1 + addq INCY, Y + + addps 0 * SIZE(BUFFER), %xmm0 + addps 4 * SIZE(BUFFER), %xmm1 + + movlps %xmm0, (Y1) + addq INCY, Y1 + movhps %xmm0, (Y1) + addq INCY, Y1 + + movlps %xmm1, (Y1) + addq INCY, Y1 + movhps %xmm1, (Y1) + addq INCY, Y1 + + addq $8 * 
SIZE, BUFFER + ALIGN_3 + +.L995: + testq $2, MM + jle .L996 + + movsd (Y), %xmm0 + addq INCY, Y + movhps (Y), %xmm0 + addq INCY, Y + + addps 0 * SIZE(BUFFER), %xmm0 + + movlps %xmm0, (Y1) + addq INCY, Y1 + movhps %xmm0, (Y1) + addq INCY, Y1 + + addq $4 * SIZE, BUFFER + ALIGN_3 + +.L996: + testq $1, MM + jle .L999 + + movsd (Y), %xmm0 + + addps 0 * SIZE(BUFFER), %xmm0 + + movlps %xmm0, (Y1) + ALIGN_3 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/cgemv_t.S b/kernel/x86_64/cgemv_t.S new file mode 100644 index 0000000000..c268e4f594 --- /dev/null +++ b/kernel/x86_64/cgemv_t.S @@ -0,0 +1,4378 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "l2param.h" + +#if GEMV_UNROLL < 2 +#undef GEMV_UNROLL +#define GEMV_UNROLL 2 +#endif + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_INCX 8 + STACKSIZE(%rsp) +#define OLD_Y 16 + STACKSIZE(%rsp) +#define OLD_INCY 24 + STACKSIZE(%rsp) +#define OLD_BUFFER 32 + STACKSIZE(%rsp) +#define ALPHA 48 (%rsp) + +#define M %rdi +#define N %rsi +#define A %rcx +#define LDA %r8 +#define X %r9 +#define INCX %rdx +#define Y %rbp +#define INCY %r10 + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_LDA 56 + STACKSIZE(%rsp) +#define OLD_X 64 + STACKSIZE(%rsp) +#define OLD_INCX 72 + STACKSIZE(%rsp) +#define OLD_Y 80 + STACKSIZE(%rsp) +#define OLD_INCY 88 + STACKSIZE(%rsp) +#define OLD_BUFFER 96 + STACKSIZE(%rsp) +#define ALPHA 224 (%rsp) + +#define M %rcx +#define N %rdx +#define A %r8 +#define LDA %r9 +#define X %rdi +#define INCX %rsi +#define Y %rbp +#define INCY %r10 + +#endif + +#define I %rax +#define A1 %r11 +#define A2 %r12 + +#define X1 %rbx +#define Y1 %r13 +#define BUFFER %r14 + +#ifdef ALIGNED_ACCESS +#define MM %r15 +#else +#define MM M +#endif + +#undef SUBPS + +#ifndef CONJ +#define SUBPS addps +#else +#define SUBPS subps +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq OLD_A, A + movq OLD_LDA, LDA + movq OLD_X, X + + movaps %xmm3, %xmm0 + movss OLD_ALPHA_I, %xmm1 +#endif + + movq OLD_INCX, INCX + movq OLD_Y, Y + movq OLD_INCY, INCY + movq OLD_BUFFER, BUFFER + + salq $ZBASE_SHIFT, LDA + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + + unpcklps %xmm1, %xmm0 + + movlps %xmm0, ALPHA + + testq M, M + jle .L999 + testq N, N + jle .L999 + + subq $-32 * SIZE, A + + movq BUFFER, X1 + +#ifdef ALIGNED_ACCESS + movq M, MM + movq A, %rax + andq $4 * SIZE - 1, %rax + cmpq $2 * SIZE, %rax + + jl .L0X + + movsd (X), %xmm0 + addq INCX, X + movlps %xmm0, 2 * SIZE(X1) + + addq $2 * SIZE, BUFFER + addq $4 * SIZE, X1 + decq MM + +.L0X: +#endif + + movq MM, I + sarq $3, I + jle .L05 + ALIGN_4 + +.L02: + movsd (X), %xmm0 + addq INCX, X + movhps (X), %xmm0 + addq INCX, X + + movsd (X), %xmm1 + addq INCX, X + movhps (X), %xmm1 + addq INCX, X + + movsd (X), %xmm2 + addq INCX, X + movhps (X), %xmm2 + addq INCX, X + + movsd (X), %xmm3 + addq INCX, X + movhps (X), %xmm3 + addq INCX, X + + movaps %xmm0, 0 * SIZE(X1) + movaps %xmm1, 4 * SIZE(X1) + movaps %xmm2, 8 * SIZE(X1) + movaps %xmm3, 12 * SIZE(X1) + + addq $16 * SIZE, X1 + decq I + jg .L02 + ALIGN_4 + +.L05: + movq MM, I + andq $7, I + jle .L10 + ALIGN_2 + +.L06: + movsd (X), %xmm0 + addq INCX, X + movlps %xmm0, 0 * SIZE(X1) + addq $2 * SIZE, X1 + decq I + jg .L06 + ALIGN_4 + +.L10: + movq Y, Y1 + +#ifdef ALIGNED_ACCESS + testq $SIZE, A + jne .L200 + + testq $2 * SIZE, LDA + jne .L100 +#endif + +#if GEMV_UNROLL >= 4 + cmpq $4, N + jl .L20 + ALIGN_3 + +.L11: + subq $4, N + + leaq 32 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA, 2), A2 + leaq (A1, LDA, 4), A + + xorps %xmm0, 
%xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + +#ifdef ALIGNED_ACCESS + cmpq M, MM + je .L1X + +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd -32 * SIZE(A1), %xmm8 +#ifdef movsd + xorps %xmm9, %xmm9 +#endif + movsd -32 * SIZE(A1, LDA), %xmm9 +#ifdef movsd + xorps %xmm10, %xmm10 +#endif + movsd -32 * SIZE(A2), %xmm10 +#ifdef movsd + xorps %xmm11, %xmm11 +#endif + movsd -32 * SIZE(A2, LDA), %xmm11 + +#ifdef movsd + xorps %xmm12, %xmm12 +#endif + movsd -32 * SIZE(X1), %xmm12 + + pshufd $0xb1, %xmm8, %xmm14 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + pshufd $0xb1, %xmm9, %xmm15 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + mulps %xmm12, %xmm15 + SUBPS %xmm15, %xmm3 + + pshufd $0xb1, %xmm10, %xmm14 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm4 + pshufd $0xb1, %xmm11, %xmm15 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm6 + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm5 + mulps %xmm12, %xmm15 + SUBPS %xmm15, %xmm7 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_3 +.L1X: +#endif + + movaps -32 * SIZE(X1), %xmm12 + movaps -28 * SIZE(X1), %xmm13 + +#ifdef PREFETCHW + PREFETCHW 7 * SIZE(Y1) +#endif + + movq MM, I + sarq $3, I + jle .L15 + + MOVUPS_A1(-32 * SIZE, A1, %xmm8) + MOVUPS_A2(-32 * SIZE, A1, LDA, 1, %xmm9) + MOVUPS_A1(-32 * SIZE, A2, %xmm10) + MOVUPS_A2(-32 * SIZE, A2, LDA, 1, %xmm11) + + decq I + jle .L14 + ALIGN_3 + +.L13: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1) +#endif + + pshufd $0xb1, %xmm8, %xmm14 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1(-28 * SIZE, A1, %xmm8) + pshufd $0xb1, %xmm9, %xmm15 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + MOVUPS_A2(-28 * SIZE, A1, LDA, 1, %xmm9) + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + mulps %xmm12, %xmm15 + SUBPS %xmm15, %xmm3 + + pshufd $0xb1, %xmm10, %xmm14 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm4 + MOVUPS_A1(-28 * SIZE, A2, %xmm10) + pshufd $0xb1, %xmm11, %xmm15 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm6 + MOVUPS_A2(-28 * SIZE, A2, LDA, 1, %xmm11) + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm5 + mulps %xmm12, %xmm15 + movaps -24 * SIZE(X1), %xmm12 + SUBPS %xmm15, %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA) +#endif + + pshufd $0xb1, %xmm8, %xmm14 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1(-24 * SIZE, A1, %xmm8) + pshufd $0xb1, %xmm9, %xmm15 + mulps %xmm13, %xmm9 + addps %xmm9, %xmm2 + MOVUPS_A2(-24 * SIZE, A1, LDA, 1, %xmm9) + + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm1 + mulps %xmm13, %xmm15 + SUBPS %xmm15, %xmm3 + + pshufd $0xb1, %xmm10, %xmm14 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm4 + MOVUPS_A1(-24 * SIZE, A2, %xmm10) + pshufd $0xb1, %xmm11, %xmm15 + mulps %xmm13, %xmm11 + addps %xmm11, %xmm6 + MOVUPS_A2(-24 * SIZE, A2, LDA, 1, %xmm11) + + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm5 + mulps %xmm13, %xmm15 + movaps -20 * SIZE(X1), %xmm13 + SUBPS %xmm15, %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2) +#endif + + pshufd $0xb1, %xmm8, %xmm14 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1(-20 * SIZE, A1, %xmm8) + pshufd $0xb1, %xmm9, %xmm15 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + MOVUPS_A2(-20 * SIZE, A1, LDA, 1, %xmm9) + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + mulps %xmm12, %xmm15 + SUBPS %xmm15, %xmm3 + + pshufd $0xb1, %xmm10, %xmm14 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm4 + MOVUPS_A1(-20 * SIZE, A2, %xmm10) + pshufd $0xb1, 
%xmm11, %xmm15 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm6 + MOVUPS_A2(-20 * SIZE, A2, LDA, 1, %xmm11) + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm5 + mulps %xmm12, %xmm15 + movaps -16 * SIZE(X1), %xmm12 + SUBPS %xmm15, %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA) +#endif + + pshufd $0xb1, %xmm8, %xmm14 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + pshufd $0xb1, %xmm9, %xmm15 + mulps %xmm13, %xmm9 + addps %xmm9, %xmm2 + MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm9) + + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm1 + mulps %xmm13, %xmm15 + SUBPS %xmm15, %xmm3 + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(X1) +#endif + + pshufd $0xb1, %xmm10, %xmm14 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm4 + MOVUPS_A1(-16 * SIZE, A2, %xmm10) + pshufd $0xb1, %xmm11, %xmm15 + mulps %xmm13, %xmm11 + addps %xmm11, %xmm6 + MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm11) + + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm5 + mulps %xmm13, %xmm15 + movaps -12 * SIZE(X1), %xmm13 + SUBPS %xmm15, %xmm7 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, X1 + + subq $1, I + BRANCH + jg .L13 + ALIGN_3 + +.L14: + pshufd $0xb1, %xmm8, %xmm14 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1(-28 * SIZE, A1, %xmm8) + pshufd $0xb1, %xmm9, %xmm15 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + MOVUPS_A2(-28 * SIZE, A1, LDA, 1, %xmm9) + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + mulps %xmm12, %xmm15 + SUBPS %xmm15, %xmm3 + + pshufd $0xb1, %xmm10, %xmm14 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm4 + MOVUPS_A1(-28 * SIZE, A2, %xmm10) + pshufd $0xb1, %xmm11, %xmm15 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm6 + MOVUPS_A2(-28 * SIZE, A2, LDA, 1, %xmm11) + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm5 + mulps %xmm12, %xmm15 + movaps -24 * SIZE(X1), %xmm12 + SUBPS %xmm15, %xmm7 + + pshufd $0xb1, %xmm8, %xmm14 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1(-24 * SIZE, A1, %xmm8) + pshufd $0xb1, %xmm9, %xmm15 + mulps %xmm13, %xmm9 + addps %xmm9, %xmm2 + MOVUPS_A2(-24 * SIZE, A1, LDA, 1, %xmm9) + + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm1 + mulps %xmm13, %xmm15 + SUBPS %xmm15, %xmm3 + + pshufd $0xb1, %xmm10, %xmm14 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm4 + MOVUPS_A1(-24 * SIZE, A2, %xmm10) + pshufd $0xb1, %xmm11, %xmm15 + mulps %xmm13, %xmm11 + addps %xmm11, %xmm6 + MOVUPS_A2(-24 * SIZE, A2, LDA, 1, %xmm11) + + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm5 + mulps %xmm13, %xmm15 + movaps -20 * SIZE(X1), %xmm13 + SUBPS %xmm15, %xmm7 + + pshufd $0xb1, %xmm8, %xmm14 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1(-20 * SIZE, A1, %xmm8) + pshufd $0xb1, %xmm9, %xmm15 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + MOVUPS_A2(-20 * SIZE, A1, LDA, 1, %xmm9) + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + mulps %xmm12, %xmm15 + SUBPS %xmm15, %xmm3 + + pshufd $0xb1, %xmm10, %xmm14 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm4 + MOVUPS_A1(-20 * SIZE, A2, %xmm10) + pshufd $0xb1, %xmm11, %xmm15 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm6 + MOVUPS_A2(-20 * SIZE, A2, LDA, 1, %xmm11) + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm5 + mulps %xmm12, %xmm15 + movaps -16 * SIZE(X1), %xmm12 + SUBPS %xmm15, %xmm7 + + pshufd $0xb1, %xmm8, %xmm14 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + pshufd $0xb1, %xmm9, %xmm15 + mulps %xmm13, %xmm9 + addps %xmm9, %xmm2 + + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm1 + mulps %xmm13, %xmm15 + SUBPS %xmm15, %xmm3 + + pshufd $0xb1, %xmm10, %xmm14 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm4 + pshufd $0xb1, %xmm11, 
%xmm15 + mulps %xmm13, %xmm11 + addps %xmm11, %xmm6 + + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm5 + mulps %xmm13, %xmm15 + movaps -12 * SIZE(X1), %xmm13 + SUBPS %xmm15, %xmm7 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, X1 + ALIGN_3 + +.L15: + testq $4, MM + je .L17 + + MOVUPS_A1(-32 * SIZE, A1, %xmm8) + MOVUPS_A2(-32 * SIZE, A1, LDA, 1, %xmm9) + MOVUPS_A1(-32 * SIZE, A2, %xmm10) + MOVUPS_A2(-32 * SIZE, A2, LDA, 1, %xmm11) + + pshufd $0xb1, %xmm8, %xmm14 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1(-28 * SIZE, A1, %xmm8) + pshufd $0xb1, %xmm9, %xmm15 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + MOVUPS_A2(-28 * SIZE, A1, LDA, 1, %xmm9) + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + mulps %xmm12, %xmm15 + SUBPS %xmm15, %xmm3 + + pshufd $0xb1, %xmm10, %xmm14 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm4 + MOVUPS_A1(-28 * SIZE, A2, %xmm10) + pshufd $0xb1, %xmm11, %xmm15 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm6 + MOVUPS_A2(-28 * SIZE, A2, LDA, 1, %xmm11) + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm5 + mulps %xmm12, %xmm15 + movaps -24 * SIZE(X1), %xmm12 + SUBPS %xmm15, %xmm7 + + pshufd $0xb1, %xmm8, %xmm14 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + pshufd $0xb1, %xmm9, %xmm15 + mulps %xmm13, %xmm9 + addps %xmm9, %xmm2 + + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm1 + mulps %xmm13, %xmm15 + SUBPS %xmm15, %xmm3 + + pshufd $0xb1, %xmm10, %xmm14 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm4 + pshufd $0xb1, %xmm11, %xmm15 + mulps %xmm13, %xmm11 + addps %xmm11, %xmm6 + + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm5 + mulps %xmm13, %xmm15 + movaps -20 * SIZE(X1), %xmm13 + SUBPS %xmm15, %xmm7 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + ALIGN_3 + +.L17: + testq $2, MM + je .L18 + + MOVUPS_A1(-32 * SIZE, A1, %xmm8) + MOVUPS_A2(-32 * SIZE, A1, LDA, 1, %xmm9) + MOVUPS_A1(-32 * SIZE, A2, %xmm10) + MOVUPS_A2(-32 * SIZE, A2, LDA, 1, %xmm11) + + pshufd $0xb1, %xmm8, %xmm14 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + pshufd $0xb1, %xmm9, %xmm15 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + mulps %xmm12, %xmm15 + SUBPS %xmm15, %xmm3 + + pshufd $0xb1, %xmm10, %xmm14 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm4 + pshufd $0xb1, %xmm11, %xmm15 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm6 + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm5 + mulps %xmm12, %xmm15 + SUBPS %xmm15, %xmm7 + + movaps %xmm13, %xmm12 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + ALIGN_3 + +.L18: + testq $1, MM + je .L19 + +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd -32 * SIZE(A1), %xmm8 +#ifdef movsd + xorps %xmm9, %xmm9 +#endif + movsd -32 * SIZE(A1, LDA), %xmm9 +#ifdef movsd + xorps %xmm10, %xmm10 +#endif + movsd -32 * SIZE(A2), %xmm10 +#ifdef movsd + xorps %xmm11, %xmm11 +#endif + movsd -32 * SIZE(A2, LDA), %xmm11 + + pshufd $0xb1, %xmm8, %xmm14 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + pshufd $0xb1, %xmm9, %xmm15 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + mulps %xmm12, %xmm15 + SUBPS %xmm15, %xmm3 + + pshufd $0xb1, %xmm10, %xmm14 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm4 + pshufd $0xb1, %xmm11, %xmm15 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm6 + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm5 + mulps %xmm12, %xmm15 + SUBPS %xmm15, %xmm7 + ALIGN_3 + +.L19: + pcmpeqb %xmm11, %xmm11 + psllq $63, %xmm11 + +#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) + xorps %xmm11, %xmm0 + xorps %xmm11, %xmm2 + xorps %xmm11, %xmm4 + xorps %xmm11, %xmm6 +#else + 
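+ /* %xmm11 now holds 0x8000000000000000 in each quadword, i.e. only the sign bit of the second float of every complex pair, so xorps with it negates that component.  In this branch (exactly one of CONJ/XCONJ defined) the flip is applied to the odd accumulators, which hold the products of the re/im-swapped values, instead of %xmm0/%xmm2/%xmm4/%xmm6. */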
xorps %xmm11, %xmm1 + xorps %xmm11, %xmm3 + xorps %xmm11, %xmm5 + xorps %xmm11, %xmm7 +#endif + +#ifdef HAVE_SSE3 + haddps %xmm1, %xmm0 + haddps %xmm3, %xmm2 + haddps %xmm2, %xmm0 + + haddps %xmm5, %xmm4 + haddps %xmm7, %xmm6 + haddps %xmm6, %xmm4 +#else + + movaps %xmm0, %xmm8 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm8 + + movaps %xmm2, %xmm9 + unpcklps %xmm3, %xmm2 + unpckhps %xmm3, %xmm9 + + movaps %xmm4, %xmm10 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm10 + + movaps %xmm6, %xmm12 + unpcklps %xmm7, %xmm6 + unpckhps %xmm7, %xmm12 + + addps %xmm8, %xmm0 + addps %xmm9, %xmm2 + addps %xmm10, %xmm4 + addps %xmm12, %xmm6 + + movhlps %xmm0, %xmm1 + movhlps %xmm2, %xmm3 + movhlps %xmm4, %xmm5 + movhlps %xmm6, %xmm7 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + + movlhps %xmm2, %xmm0 + movlhps %xmm6, %xmm4 +#endif + + pshufd $0xb1, %xmm0, %xmm1 + pshufd $0xb1, %xmm4, %xmm5 + +#ifdef HAVE_SSE3 + movddup ALPHA, %xmm15 +#else + movsd ALPHA, %xmm15 + pshufd $0x44, %xmm15, %xmm15 +#endif + + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm1 + mulps %xmm15, %xmm4 + mulps %xmm15, %xmm5 + + xorps %xmm11, %xmm0 + xorps %xmm11, %xmm4 + +#ifdef HAVE_SSE3 + haddps %xmm1, %xmm0 + haddps %xmm5, %xmm4 +#else + movaps %xmm0, %xmm2 + shufps $0x88, %xmm1, %xmm0 + shufps $0xdd, %xmm1, %xmm2 + + movaps %xmm4, %xmm6 + shufps $0x88, %xmm5, %xmm4 + shufps $0xdd, %xmm5, %xmm6 + + addps %xmm2, %xmm0 + addps %xmm6, %xmm4 +#endif + + movsd (Y), %xmm2 + addq INCY, Y + movhps (Y), %xmm2 + addq INCY, Y + movsd (Y), %xmm6 + addq INCY, Y + movhps (Y), %xmm6 + addq INCY, Y + + shufps $0xd8, %xmm0, %xmm0 + shufps $0xd8, %xmm4, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm6, %xmm4 + + movlps %xmm0, (Y1) + addq INCY, Y1 + movhps %xmm0, (Y1) + addq INCY, Y1 + movlps %xmm4, (Y1) + addq INCY, Y1 + movhps %xmm4, (Y1) + addq INCY, Y1 + + cmpq $4, N + jge .L11 + ALIGN_3 + +.L20: +#endif + + cmpq $2, N + jl .L30 +#if GEMV_UNROLL == 2 + ALIGN_3 + +.L21: +#endif + subq $2, N + + leaq 32 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA), A2 + leaq (A1, LDA, 2), A + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#ifdef ALIGNED_ACCESS + cmpq M, MM + je .L2X + +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd -32 * SIZE(A1), %xmm8 +#ifdef movsd + xorps %xmm9, %xmm9 +#endif + movsd -32 * SIZE(A2), %xmm9 + +#ifdef movsd + xorps %xmm12, %xmm12 +#endif + movsd -32 * SIZE(X1), %xmm12 + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm12, %xmm5 + SUBPS %xmm5, %xmm3 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_3 +.L2X: +#endif + + movaps -32 * SIZE(X1), %xmm12 + movaps -28 * SIZE(X1), %xmm13 + +#if (GEMV_UNROLL == 2) && defined(PREFETCHW) + PREFETCHW 3 * SIZE(Y1) +#endif + + movq MM, I + sarq $3, I + jle .L25 + + MOVUPS_A1(-32 * SIZE, A1, %xmm8) + MOVUPS_A1(-32 * SIZE, A2, %xmm9) + MOVUPS_A1(-28 * SIZE, A1, %xmm10) + MOVUPS_A1(-28 * SIZE, A2, %xmm11) + + decq I + jle .L24 + ALIGN_3 + +.L23: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) +#endif + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1(-24 * SIZE, A1, %xmm8) + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + MOVUPS_A1(-24 * SIZE, A2, %xmm9) + mulps %xmm12, %xmm5 + SUBPS %xmm5, %xmm3 + + movaps -24 * SIZE(X1), 
%xmm12 + + pshufd $0xb1, %xmm10, %xmm6 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm0 + MOVUPS_A1(-20 * SIZE, A1, %xmm10) + mulps %xmm13, %xmm6 + SUBPS %xmm6, %xmm1 + + pshufd $0xb1, %xmm11, %xmm7 + mulps %xmm13, %xmm11 + addps %xmm11, %xmm2 + MOVUPS_A1(-20 * SIZE, A2, %xmm11) + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm3 + + movaps -20 * SIZE(X1), %xmm13 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) +#endif + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + MOVUPS_A1(-16 * SIZE, A2, %xmm9) + mulps %xmm12, %xmm5 + SUBPS %xmm5, %xmm3 + + movaps -16 * SIZE(X1), %xmm12 + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1) +#endif + + pshufd $0xb1, %xmm10, %xmm6 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm0 + MOVUPS_A1(-12 * SIZE, A1, %xmm10) + mulps %xmm13, %xmm6 + SUBPS %xmm6, %xmm1 + + pshufd $0xb1, %xmm11, %xmm7 + mulps %xmm13, %xmm11 + addps %xmm11, %xmm2 + MOVUPS_A1(-12 * SIZE, A2, %xmm11) + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm3 + + movaps -12 * SIZE(X1), %xmm13 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, X1 + + subq $1, I + BRANCH + jg .L23 + ALIGN_3 + +.L24: + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1(-24 * SIZE, A1, %xmm8) + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + MOVUPS_A1(-24 * SIZE, A2, %xmm9) + mulps %xmm12, %xmm5 + SUBPS %xmm5, %xmm3 + + movaps -24 * SIZE(X1), %xmm12 + + pshufd $0xb1, %xmm10, %xmm6 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm0 + MOVUPS_A1(-20 * SIZE, A1, %xmm10) + mulps %xmm13, %xmm6 + SUBPS %xmm6, %xmm1 + + pshufd $0xb1, %xmm11, %xmm7 + mulps %xmm13, %xmm11 + addps %xmm11, %xmm2 + MOVUPS_A1(-20 * SIZE, A2, %xmm11) + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm3 + + movaps -20 * SIZE(X1), %xmm13 + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm12, %xmm5 + SUBPS %xmm5, %xmm3 + + movaps -16 * SIZE(X1), %xmm12 + + pshufd $0xb1, %xmm10, %xmm6 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm0 + mulps %xmm13, %xmm6 + SUBPS %xmm6, %xmm1 + + pshufd $0xb1, %xmm11, %xmm7 + mulps %xmm13, %xmm11 + addps %xmm11, %xmm2 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm3 + + movaps -12 * SIZE(X1), %xmm13 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, X1 + ALIGN_3 + +.L25: + testq $4, MM + je .L27 + + MOVUPS_A1(-32 * SIZE, A1, %xmm8) + MOVUPS_A1(-32 * SIZE, A2, %xmm9) + MOVUPS_A1(-28 * SIZE, A1, %xmm10) + MOVUPS_A1(-28 * SIZE, A2, %xmm11) + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm12, %xmm5 + SUBPS %xmm5, %xmm3 + + movaps -24 * SIZE(X1), %xmm12 + + pshufd $0xb1, %xmm10, %xmm6 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm0 + mulps %xmm13, %xmm6 + SUBPS %xmm6, %xmm1 + + pshufd $0xb1, %xmm11, %xmm7 + mulps %xmm13, %xmm11 + addps %xmm11, %xmm2 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm3 + + movaps -20 * SIZE(X1), %xmm13 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + ALIGN_3 + +.L27: + testq $2, MM + je .L28 + + MOVUPS_A1(-32 * SIZE, A1, %xmm8) + MOVUPS_A1(-32 * SIZE, A2, %xmm9) + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, 
%xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm12, %xmm5 + SUBPS %xmm5, %xmm3 + + movaps %xmm13, %xmm12 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + ALIGN_3 + +.L28: + testq $1, MM + je .L29 + +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd -32 * SIZE(A1), %xmm8 +#ifdef movsd + xorps %xmm9, %xmm9 +#endif + movsd -32 * SIZE(A2), %xmm9 + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm12, %xmm5 + SUBPS %xmm5, %xmm3 + ALIGN_3 + +.L29: + pcmpeqb %xmm5, %xmm5 + psllq $63, %xmm5 + +#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) + xorps %xmm5, %xmm0 + xorps %xmm5, %xmm2 +#else + xorps %xmm5, %xmm1 + xorps %xmm5, %xmm3 +#endif + +#ifdef HAVE_SSE3 + haddps %xmm1, %xmm0 + haddps %xmm3, %xmm2 + haddps %xmm2, %xmm0 +#else + movaps %xmm0, %xmm8 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm8 + + movaps %xmm2, %xmm4 + unpcklps %xmm3, %xmm2 + unpckhps %xmm3, %xmm4 + + addps %xmm8, %xmm0 + addps %xmm4, %xmm2 + + movhlps %xmm0, %xmm1 + movhlps %xmm2, %xmm3 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + movlhps %xmm2, %xmm0 +#endif + + pshufd $0xb1, %xmm0, %xmm1 + +#ifdef HAVE_SSE3 + movddup ALPHA, %xmm15 +#else + movsd ALPHA, %xmm15 + pshufd $0x44, %xmm15, %xmm15 +#endif + + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm1 + + xorps %xmm5, %xmm0 + +#ifdef HAVE_SSE3 + haddps %xmm1, %xmm0 +#else + movaps %xmm0, %xmm2 + shufps $0x88, %xmm1, %xmm0 + shufps $0xdd, %xmm1, %xmm2 + + addps %xmm2, %xmm0 +#endif + + movsd (Y), %xmm12 + addq INCY, Y + movhps (Y), %xmm12 + addq INCY, Y + + shufps $0xd8, %xmm0, %xmm0 + + addps %xmm12, %xmm0 + + movlps %xmm0, (Y1) + addq INCY, Y1 + movhps %xmm0, (Y1) + addq INCY, Y1 + +#if GEMV_UNROLL == 2 + cmpq $2, N + jge .L21 +#endif + ALIGN_3 + +.L30: + cmpq $1, N + jl .L999 + + leaq 32 * SIZE(BUFFER), X1 + + movq A, A1 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + +#ifdef ALIGNED_ACCESS + cmpq M, MM + je .L3X + +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd -32 * SIZE(A1), %xmm8 +#ifdef movsd + xorps %xmm12, %xmm12 +#endif + movsd -32 * SIZE(X1), %xmm12 + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + addq $2 * SIZE, A1 + addq $2 * SIZE, X1 + ALIGN_3 +.L3X: +#endif + + movaps -32 * SIZE(X1), %xmm12 + movaps -28 * SIZE(X1), %xmm13 + + movq MM, I + sarq $3, I + jle .L35 + + MOVUPS_A1(-32 * SIZE, A1, %xmm8) + MOVUPS_A1(-28 * SIZE, A1, %xmm10) + + decq I + jle .L34 + ALIGN_3 + +.L33: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) +#endif + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1(-24 * SIZE, A1, %xmm8) + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + movaps -24 * SIZE(X1), %xmm12 + + pshufd $0xb1, %xmm10, %xmm6 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm0 + MOVUPS_A1(-20 * SIZE, A1, %xmm10) + mulps %xmm13, %xmm6 + SUBPS %xmm6, %xmm1 + + movaps -20 * SIZE(X1), %xmm13 + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + movaps -16 * SIZE(X1), %xmm12 + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1) +#endif + + pshufd $0xb1, %xmm10, %xmm6 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm0 + MOVUPS_A1(-12 * SIZE, A1, %xmm10) + mulps %xmm13, %xmm6 + 
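+ /* Two accumulators per column: %xmm0 collects the element-wise a*x products, %xmm1 the products against the pshufd($0xb1)-swapped copy.  SUBPS is a macro (presumably defined near the top of this file as addps or subps, depending on the CONJ/XCONJ setting); real and imaginary parts are recombined in the .L39 epilogue. */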
SUBPS %xmm6, %xmm1 + + movaps -12 * SIZE(X1), %xmm13 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, X1 + + subq $1, I + BRANCH + jg .L33 + ALIGN_3 + +.L34: + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1(-24 * SIZE, A1, %xmm8) + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + movaps -24 * SIZE(X1), %xmm12 + + pshufd $0xb1, %xmm10, %xmm6 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm0 + MOVUPS_A1(-20 * SIZE, A1, %xmm10) + mulps %xmm13, %xmm6 + SUBPS %xmm6, %xmm1 + + movaps -20 * SIZE(X1), %xmm13 + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + movaps -16 * SIZE(X1), %xmm12 + + pshufd $0xb1, %xmm10, %xmm6 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm0 + mulps %xmm13, %xmm6 + SUBPS %xmm6, %xmm1 + + movaps -12 * SIZE(X1), %xmm13 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, X1 + ALIGN_3 + +.L35: + testq $4, MM + je .L37 + + MOVUPS_A1(-32 * SIZE, A1, %xmm8) + MOVUPS_A1(-28 * SIZE, A1, %xmm10) + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + movaps -24 * SIZE(X1), %xmm12 + + pshufd $0xb1, %xmm10, %xmm6 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm0 + mulps %xmm13, %xmm6 + SUBPS %xmm6, %xmm1 + + movaps -20 * SIZE(X1), %xmm13 + + addq $8 * SIZE, A1 + ALIGN_3 + +.L37: + testq $2, MM + je .L38 + + MOVUPS_A1(-32 * SIZE, A1, %xmm8) + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + movaps %xmm13, %xmm12 + + addq $4 * SIZE, A1 + ALIGN_3 + +.L38: + testq $1, MM + je .L39 + +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd -32 * SIZE(A1), %xmm8 + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + ALIGN_3 + +.L39: + pcmpeqb %xmm5, %xmm5 + psllq $63, %xmm5 + +#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) + xorps %xmm5, %xmm0 +#else + xorps %xmm5, %xmm1 +#endif + +#ifdef HAVE_SSE3 + haddps %xmm1, %xmm0 + haddps %xmm0, %xmm0 +#else + movaps %xmm0, %xmm8 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm8 + + addps %xmm8, %xmm0 + + movhlps %xmm0, %xmm1 + + addps %xmm1, %xmm0 +#endif + + pshufd $0xb1, %xmm0, %xmm1 + +#ifdef HAVE_SSE3 + movddup ALPHA, %xmm15 +#else + movsd ALPHA, %xmm15 + pshufd $0x44, %xmm15, %xmm15 +#endif + + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm1 + + xorps %xmm5, %xmm0 + +#ifdef HAVE_SSE3 + haddps %xmm1, %xmm0 +#else + movaps %xmm0, %xmm2 + shufps $0x88, %xmm1, %xmm0 + shufps $0xdd, %xmm1, %xmm2 + + addps %xmm2, %xmm0 +#endif + + movsd (Y), %xmm12 + addq INCY, Y + + shufps $0xd8, %xmm0, %xmm0 + + addps %xmm12, %xmm0 + + movlps %xmm0, (Y1) + addq INCY, Y1 +#ifdef ALIGNED_ACCESS + jmp .L999 + ALIGN_3 + +.L100: + +#if GEMV_UNROLL >= 4 + cmpq $4, N + jl .L110 + ALIGN_3 + +.L101: + subq $4, N + + leaq 32 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA, 2), A2 + leaq (A1, LDA, 4), A + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + +#ifdef ALIGNED_ACCESS + cmpq M, MM + je .L10X + +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd -32 * SIZE(A1), %xmm8 +#ifdef movsd + xorps %xmm9, %xmm9 +#endif + movsd -32 * SIZE(A1, LDA), %xmm9 +#ifdef movsd + xorps %xmm10, %xmm10 +#endif + movsd -32 * SIZE(A2), %xmm10 +#ifdef movsd + xorps %xmm11, %xmm11 +#endif + movsd -32 * SIZE(A2, LDA), %xmm11 + +#ifdef movsd + xorps %xmm12, %xmm12 +#endif + movsd -32 * 
SIZE(X1), %xmm12 + + pshufd $0xb1, %xmm8, %xmm14 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + pshufd $0xb1, %xmm9, %xmm15 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + mulps %xmm12, %xmm15 + SUBPS %xmm15, %xmm3 + + pshufd $0xb1, %xmm10, %xmm14 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm4 + pshufd $0xb1, %xmm11, %xmm15 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm6 + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm5 + mulps %xmm12, %xmm15 + SUBPS %xmm15, %xmm7 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_3 +.L10X: +#endif + + movaps -32 * SIZE(X1), %xmm12 + movaps -28 * SIZE(X1), %xmm13 + +#ifdef PREFETCHW + PREFETCHW 7 * SIZE(Y1) +#endif + + movq MM, I + sarq $3, I + jle .L105 + + movaps -32 * SIZE(A1), %xmm8 + movsd -32 * SIZE(A1, LDA), %xmm9 + movhps -30 * SIZE(A1, LDA), %xmm9 + + movaps -32 * SIZE(A2), %xmm10 + movsd -32 * SIZE(A2, LDA), %xmm11 + movhps -30 * SIZE(A2, LDA), %xmm11 + + decq I + jle .L104 + ALIGN_3 + +.L103: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1) +#endif + + pshufd $0xb1, %xmm8, %xmm14 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + movaps -28 * SIZE(A1), %xmm8 + pshufd $0xb1, %xmm9, %xmm15 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + movsd -28 * SIZE(A1, LDA), %xmm9 + movhps -26 * SIZE(A1, LDA), %xmm9 + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + mulps %xmm12, %xmm15 + SUBPS %xmm15, %xmm3 + + pshufd $0xb1, %xmm10, %xmm14 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm4 + movaps -28 * SIZE(A2), %xmm10 + pshufd $0xb1, %xmm11, %xmm15 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm6 + movsd -28 * SIZE(A2, LDA), %xmm11 + movhps -26 * SIZE(A2, LDA), %xmm11 + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm5 + mulps %xmm12, %xmm15 + movaps -24 * SIZE(X1), %xmm12 + SUBPS %xmm15, %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA) +#endif + + pshufd $0xb1, %xmm8, %xmm14 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movaps -24 * SIZE(A1), %xmm8 + pshufd $0xb1, %xmm9, %xmm15 + mulps %xmm13, %xmm9 + addps %xmm9, %xmm2 + movsd -24 * SIZE(A1, LDA), %xmm9 + movhps -22 * SIZE(A1, LDA), %xmm9 + + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm1 + mulps %xmm13, %xmm15 + SUBPS %xmm15, %xmm3 + + pshufd $0xb1, %xmm10, %xmm14 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm4 + movaps -24 * SIZE(A2), %xmm10 + pshufd $0xb1, %xmm11, %xmm15 + mulps %xmm13, %xmm11 + addps %xmm11, %xmm6 + movsd -24 * SIZE(A2, LDA), %xmm11 + movhps -22 * SIZE(A2, LDA), %xmm11 + + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm5 + mulps %xmm13, %xmm15 + movaps -20 * SIZE(X1), %xmm13 + SUBPS %xmm15, %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2) +#endif + + pshufd $0xb1, %xmm8, %xmm14 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + movaps -20 * SIZE(A1), %xmm8 + pshufd $0xb1, %xmm9, %xmm15 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + movsd -20 * SIZE(A1, LDA), %xmm9 + movhps -18 * SIZE(A1, LDA), %xmm9 + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + mulps %xmm12, %xmm15 + SUBPS %xmm15, %xmm3 + + pshufd $0xb1, %xmm10, %xmm14 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm4 + movaps -20 * SIZE(A2), %xmm10 + pshufd $0xb1, %xmm11, %xmm15 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm6 + movsd -20 * SIZE(A2, LDA), %xmm11 + movhps -18 * SIZE(A2, LDA), %xmm11 + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm5 + mulps %xmm12, %xmm15 + movaps -16 * SIZE(X1), %xmm12 + SUBPS %xmm15, %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA) +#endif + + pshufd $0xb1, %xmm8, %xmm14 + mulps %xmm13, %xmm8 + addps 
%xmm8, %xmm0 + movaps -16 * SIZE(A1), %xmm8 + pshufd $0xb1, %xmm9, %xmm15 + mulps %xmm13, %xmm9 + addps %xmm9, %xmm2 + movsd -16 * SIZE(A1, LDA), %xmm9 + movhps -14 * SIZE(A1, LDA), %xmm9 + + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm1 + mulps %xmm13, %xmm15 + SUBPS %xmm15, %xmm3 + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(X1) +#endif + + pshufd $0xb1, %xmm10, %xmm14 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm4 + movaps -16 * SIZE(A2), %xmm10 + pshufd $0xb1, %xmm11, %xmm15 + mulps %xmm13, %xmm11 + addps %xmm11, %xmm6 + movsd -16 * SIZE(A2, LDA), %xmm11 + movhps -14 * SIZE(A2, LDA), %xmm11 + + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm5 + mulps %xmm13, %xmm15 + movaps -12 * SIZE(X1), %xmm13 + SUBPS %xmm15, %xmm7 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, X1 + + subq $1, I + BRANCH + jg .L103 + ALIGN_3 + +.L104: + pshufd $0xb1, %xmm8, %xmm14 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + movaps -28 * SIZE(A1), %xmm8 + pshufd $0xb1, %xmm9, %xmm15 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + movsd -28 * SIZE(A1, LDA), %xmm9 + movhps -26 * SIZE(A1, LDA), %xmm9 + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + mulps %xmm12, %xmm15 + SUBPS %xmm15, %xmm3 + + pshufd $0xb1, %xmm10, %xmm14 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm4 + movaps -28 * SIZE(A2), %xmm10 + pshufd $0xb1, %xmm11, %xmm15 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm6 + movsd -28 * SIZE(A2, LDA), %xmm11 + movhps -26 * SIZE(A2, LDA), %xmm11 + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm5 + mulps %xmm12, %xmm15 + movaps -24 * SIZE(X1), %xmm12 + SUBPS %xmm15, %xmm7 + + pshufd $0xb1, %xmm8, %xmm14 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movaps -24 * SIZE(A1), %xmm8 + pshufd $0xb1, %xmm9, %xmm15 + mulps %xmm13, %xmm9 + addps %xmm9, %xmm2 + movsd -24 * SIZE(A1, LDA), %xmm9 + movhps -22 * SIZE(A1, LDA), %xmm9 + + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm1 + mulps %xmm13, %xmm15 + SUBPS %xmm15, %xmm3 + + pshufd $0xb1, %xmm10, %xmm14 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm4 + movaps -24 * SIZE(A2), %xmm10 + pshufd $0xb1, %xmm11, %xmm15 + mulps %xmm13, %xmm11 + addps %xmm11, %xmm6 + movsd -24 * SIZE(A2, LDA), %xmm11 + movhps -22 * SIZE(A2, LDA), %xmm11 + + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm5 + mulps %xmm13, %xmm15 + movaps -20 * SIZE(X1), %xmm13 + SUBPS %xmm15, %xmm7 + + pshufd $0xb1, %xmm8, %xmm14 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + movaps -20 * SIZE(A1), %xmm8 + pshufd $0xb1, %xmm9, %xmm15 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + movsd -20 * SIZE(A1, LDA), %xmm9 + movhps -18 * SIZE(A1, LDA), %xmm9 + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + mulps %xmm12, %xmm15 + SUBPS %xmm15, %xmm3 + + pshufd $0xb1, %xmm10, %xmm14 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm4 + movaps -20 * SIZE(A2), %xmm10 + pshufd $0xb1, %xmm11, %xmm15 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm6 + movsd -20 * SIZE(A2, LDA), %xmm11 + movhps -18 * SIZE(A2, LDA), %xmm11 + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm5 + mulps %xmm12, %xmm15 + movaps -16 * SIZE(X1), %xmm12 + SUBPS %xmm15, %xmm7 + + pshufd $0xb1, %xmm8, %xmm14 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + pshufd $0xb1, %xmm9, %xmm15 + mulps %xmm13, %xmm9 + addps %xmm9, %xmm2 + + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm1 + mulps %xmm13, %xmm15 + SUBPS %xmm15, %xmm3 + + pshufd $0xb1, %xmm10, %xmm14 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm4 + pshufd $0xb1, %xmm11, %xmm15 + mulps %xmm13, %xmm11 + addps %xmm11, %xmm6 + + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm5 + mulps %xmm13, %xmm15 + movaps -12 * SIZE(X1), %xmm13 + SUBPS %xmm15, 
%xmm7 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, X1 + ALIGN_3 + +.L105: + testq $4, MM + je .L107 + + movaps -32 * SIZE(A1), %xmm8 + movsd -32 * SIZE(A1, LDA), %xmm9 + movhps -30 * SIZE(A1, LDA), %xmm9 + + movaps -32 * SIZE(A2), %xmm10 + movsd -32 * SIZE(A2, LDA), %xmm11 + movhps -30 * SIZE(A2, LDA), %xmm11 + + pshufd $0xb1, %xmm8, %xmm14 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + movaps -28 * SIZE(A1), %xmm8 + pshufd $0xb1, %xmm9, %xmm15 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + movsd -28 * SIZE(A1, LDA), %xmm9 + movhps -26 * SIZE(A1, LDA), %xmm9 + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + mulps %xmm12, %xmm15 + SUBPS %xmm15, %xmm3 + + pshufd $0xb1, %xmm10, %xmm14 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm4 + movaps -28 * SIZE(A2), %xmm10 + pshufd $0xb1, %xmm11, %xmm15 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm6 + movsd -28 * SIZE(A2, LDA), %xmm11 + movhps -26 * SIZE(A2, LDA), %xmm11 + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm5 + mulps %xmm12, %xmm15 + movaps -24 * SIZE(X1), %xmm12 + SUBPS %xmm15, %xmm7 + + pshufd $0xb1, %xmm8, %xmm14 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + pshufd $0xb1, %xmm9, %xmm15 + mulps %xmm13, %xmm9 + addps %xmm9, %xmm2 + + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm1 + mulps %xmm13, %xmm15 + SUBPS %xmm15, %xmm3 + + pshufd $0xb1, %xmm10, %xmm14 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm4 + pshufd $0xb1, %xmm11, %xmm15 + mulps %xmm13, %xmm11 + addps %xmm11, %xmm6 + + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm5 + mulps %xmm13, %xmm15 + movaps -20 * SIZE(X1), %xmm13 + SUBPS %xmm15, %xmm7 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + ALIGN_3 + +.L107: + testq $2, MM + je .L108 + + movaps -32 * SIZE(A1), %xmm8 + movsd -32 * SIZE(A1, LDA), %xmm9 + movhps -30 * SIZE(A1, LDA), %xmm9 + + movaps -32 * SIZE(A2), %xmm10 + movsd -32 * SIZE(A2, LDA), %xmm11 + movhps -30 * SIZE(A2, LDA), %xmm11 + + pshufd $0xb1, %xmm8, %xmm14 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + pshufd $0xb1, %xmm9, %xmm15 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + mulps %xmm12, %xmm15 + SUBPS %xmm15, %xmm3 + + pshufd $0xb1, %xmm10, %xmm14 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm4 + pshufd $0xb1, %xmm11, %xmm15 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm6 + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm5 + mulps %xmm12, %xmm15 + SUBPS %xmm15, %xmm7 + + movaps %xmm13, %xmm12 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + ALIGN_3 + +.L108: + testq $1, MM + je .L109 + +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd -32 * SIZE(A1), %xmm8 +#ifdef movsd + xorps %xmm9, %xmm9 +#endif + movsd -32 * SIZE(A1, LDA), %xmm9 +#ifdef movsd + xorps %xmm10, %xmm10 +#endif + movsd -32 * SIZE(A2), %xmm10 +#ifdef movsd + xorps %xmm11, %xmm11 +#endif + movsd -32 * SIZE(A2, LDA), %xmm11 + + pshufd $0xb1, %xmm8, %xmm14 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + pshufd $0xb1, %xmm9, %xmm15 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + mulps %xmm12, %xmm15 + SUBPS %xmm15, %xmm3 + + pshufd $0xb1, %xmm10, %xmm14 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm4 + pshufd $0xb1, %xmm11, %xmm15 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm6 + + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm5 + mulps %xmm12, %xmm15 + SUBPS %xmm15, %xmm7 + ALIGN_3 + +.L109: + pcmpeqb %xmm11, %xmm11 + psllq $63, %xmm11 + +#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) + xorps %xmm11, %xmm0 + xorps %xmm11, %xmm2 + xorps %xmm11, %xmm4 + xorps %xmm11, %xmm6 +#else + xorps %xmm11, 
%xmm1 + xorps %xmm11, %xmm3 + xorps %xmm11, %xmm5 + xorps %xmm11, %xmm7 +#endif + +#ifdef HAVE_SSE3 + haddps %xmm1, %xmm0 + haddps %xmm3, %xmm2 + haddps %xmm2, %xmm0 + + haddps %xmm5, %xmm4 + haddps %xmm7, %xmm6 + haddps %xmm6, %xmm4 +#else + + movaps %xmm0, %xmm8 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm8 + + movaps %xmm2, %xmm9 + unpcklps %xmm3, %xmm2 + unpckhps %xmm3, %xmm9 + + movaps %xmm4, %xmm10 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm10 + + movaps %xmm6, %xmm11 + unpcklps %xmm7, %xmm6 + unpckhps %xmm7, %xmm11 + + addps %xmm8, %xmm0 + addps %xmm9, %xmm2 + addps %xmm10, %xmm4 + addps %xmm11, %xmm6 + + movhlps %xmm0, %xmm1 + movhlps %xmm2, %xmm3 + movhlps %xmm4, %xmm5 + movhlps %xmm6, %xmm7 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + + movlhps %xmm2, %xmm0 + movlhps %xmm6, %xmm4 +#endif + + pshufd $0xb1, %xmm0, %xmm1 + pshufd $0xb1, %xmm4, %xmm5 + +#ifdef HAVE_SSE3 + movddup ALPHA, %xmm15 +#else + movsd ALPHA, %xmm15 + pshufd $0x44, %xmm15, %xmm15 +#endif + + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm1 + mulps %xmm15, %xmm4 + mulps %xmm15, %xmm5 + + xorps %xmm11, %xmm0 + xorps %xmm11, %xmm4 + +#ifdef HAVE_SSE3 + haddps %xmm1, %xmm0 + haddps %xmm5, %xmm4 +#else + movaps %xmm0, %xmm2 + shufps $0x88, %xmm1, %xmm0 + shufps $0xdd, %xmm1, %xmm2 + + movaps %xmm4, %xmm6 + shufps $0x88, %xmm5, %xmm4 + shufps $0xdd, %xmm5, %xmm6 + + addps %xmm2, %xmm0 + addps %xmm6, %xmm4 +#endif + + movsd (Y), %xmm2 + addq INCY, Y + movhps (Y), %xmm2 + addq INCY, Y + movsd (Y), %xmm6 + addq INCY, Y + movhps (Y), %xmm6 + addq INCY, Y + + shufps $0xd8, %xmm0, %xmm0 + shufps $0xd8, %xmm4, %xmm4 + + addps %xmm2, %xmm0 + addps %xmm6, %xmm4 + + movlps %xmm0, (Y1) + addq INCY, Y1 + movhps %xmm0, (Y1) + addq INCY, Y1 + movlps %xmm4, (Y1) + addq INCY, Y1 + movhps %xmm4, (Y1) + addq INCY, Y1 + + cmpq $4, N + jge .L101 + ALIGN_3 + +.L110: +#endif + + cmpq $2, N + jl .L120 +#if GEMV_UNROLL == 2 + ALIGN_3 + +.L111: +#endif + subq $2, N + + leaq 32 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA), A2 + leaq (A1, LDA, 2), A + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#ifdef ALIGNED_ACCESS + cmpq M, MM + je .L11X + +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd -32 * SIZE(A1), %xmm8 +#ifdef movsd + xorps %xmm9, %xmm9 +#endif + movsd -32 * SIZE(A2), %xmm9 + +#ifdef movsd + xorps %xmm12, %xmm12 +#endif + movsd -32 * SIZE(X1), %xmm12 + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm12, %xmm5 + SUBPS %xmm5, %xmm3 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_3 +.L11X: +#endif + + movaps -32 * SIZE(X1), %xmm12 + movaps -28 * SIZE(X1), %xmm13 + +#if (GEMV_UNROLL == 2) && defined(PREFETCHW) + PREFETCHW 3 * SIZE(Y1) +#endif + + movq MM, I + sarq $3, I + jle .L115 + + movaps -32 * SIZE(A1), %xmm8 + movsd -32 * SIZE(A2), %xmm9 + movhps -30 * SIZE(A2), %xmm9 + + movaps -28 * SIZE(A1), %xmm10 + movsd -28 * SIZE(A2), %xmm11 + movhps -26 * SIZE(A2), %xmm11 + + decq I + jle .L114 + ALIGN_3 + +.L113: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) +#endif + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + movaps -24 * SIZE(A1), %xmm8 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + movsd -24 * SIZE(A2), %xmm9 + movhps -22 * SIZE(A2), %xmm9 + mulps 
%xmm12, %xmm5 + SUBPS %xmm5, %xmm3 + + movaps -24 * SIZE(X1), %xmm12 + + pshufd $0xb1, %xmm10, %xmm6 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm0 + movaps -20 * SIZE(A1), %xmm10 + mulps %xmm13, %xmm6 + SUBPS %xmm6, %xmm1 + + pshufd $0xb1, %xmm11, %xmm7 + mulps %xmm13, %xmm11 + addps %xmm11, %xmm2 + movsd -20 * SIZE(A2), %xmm11 + movhps -18 * SIZE(A2), %xmm11 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm3 + + movaps -20 * SIZE(X1), %xmm13 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) +#endif + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + movaps -16 * SIZE(A1), %xmm8 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + movsd -16 * SIZE(A2), %xmm9 + movhps -14 * SIZE(A2), %xmm9 + mulps %xmm12, %xmm5 + SUBPS %xmm5, %xmm3 + + movaps -16 * SIZE(X1), %xmm12 + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1) +#endif + + pshufd $0xb1, %xmm10, %xmm6 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm0 + movaps -12 * SIZE(A1), %xmm10 + mulps %xmm13, %xmm6 + SUBPS %xmm6, %xmm1 + + pshufd $0xb1, %xmm11, %xmm7 + mulps %xmm13, %xmm11 + addps %xmm11, %xmm2 + movsd -12 * SIZE(A2), %xmm11 + movhps -10 * SIZE(A2), %xmm11 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm3 + + movaps -12 * SIZE(X1), %xmm13 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, X1 + + subq $1, I + BRANCH + jg .L113 + ALIGN_3 + +.L114: + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + movaps -24 * SIZE(A1), %xmm8 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + movsd -24 * SIZE(A2), %xmm9 + movhps -22 * SIZE(A2), %xmm9 + mulps %xmm12, %xmm5 + SUBPS %xmm5, %xmm3 + + movaps -24 * SIZE(X1), %xmm12 + + pshufd $0xb1, %xmm10, %xmm6 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm0 + movaps -20 * SIZE(A1), %xmm10 + mulps %xmm13, %xmm6 + SUBPS %xmm6, %xmm1 + + pshufd $0xb1, %xmm11, %xmm7 + mulps %xmm13, %xmm11 + addps %xmm11, %xmm2 + movsd -20 * SIZE(A2), %xmm11 + movhps -18 * SIZE(A2), %xmm11 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm3 + + movaps -20 * SIZE(X1), %xmm13 + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm12, %xmm5 + SUBPS %xmm5, %xmm3 + + movaps -16 * SIZE(X1), %xmm12 + + pshufd $0xb1, %xmm10, %xmm6 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm0 + mulps %xmm13, %xmm6 + SUBPS %xmm6, %xmm1 + + pshufd $0xb1, %xmm11, %xmm7 + mulps %xmm13, %xmm11 + addps %xmm11, %xmm2 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm3 + + movaps -12 * SIZE(X1), %xmm13 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, X1 + ALIGN_3 + +.L115: + testq $4, MM + je .L117 + + movaps -32 * SIZE(A1), %xmm8 + movsd -32 * SIZE(A2), %xmm9 + movhps -30 * SIZE(A2), %xmm9 + + movaps -28 * SIZE(A1), %xmm10 + movsd -28 * SIZE(A2), %xmm11 + movhps -26 * SIZE(A2), %xmm11 + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm12, %xmm5 + SUBPS %xmm5, %xmm3 + + movaps -24 * SIZE(X1), %xmm12 + + pshufd $0xb1, %xmm10, %xmm6 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm0 + mulps %xmm13, %xmm6 + SUBPS %xmm6, %xmm1 + + pshufd $0xb1, %xmm11, %xmm7 + mulps %xmm13, %xmm11 + addps %xmm11, %xmm2 + mulps %xmm13, %xmm7 + SUBPS %xmm7, %xmm3 + + movaps -20 * 
SIZE(X1), %xmm13 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + ALIGN_3 + +.L117: + testq $2, MM + je .L118 + + movaps -32 * SIZE(A1), %xmm8 + movsd -32 * SIZE(A2), %xmm9 + movhps -30 * SIZE(A2), %xmm9 + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm12, %xmm5 + SUBPS %xmm5, %xmm3 + + movaps %xmm13, %xmm12 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + ALIGN_3 + +.L118: + testq $1, MM + je .L119 + +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd -32 * SIZE(A1), %xmm8 +#ifdef movsd + xorps %xmm9, %xmm9 +#endif + movsd -32 * SIZE(A2), %xmm9 + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm12, %xmm5 + SUBPS %xmm5, %xmm3 + ALIGN_3 + +.L119: + pcmpeqb %xmm5, %xmm5 + psllq $63, %xmm5 + +#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) + xorps %xmm5, %xmm0 + xorps %xmm5, %xmm2 +#else + xorps %xmm5, %xmm1 + xorps %xmm5, %xmm3 +#endif + +#ifdef HAVE_SSE3 + haddps %xmm1, %xmm0 + haddps %xmm3, %xmm2 + haddps %xmm2, %xmm0 +#else + movaps %xmm0, %xmm8 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm8 + + movaps %xmm2, %xmm4 + unpcklps %xmm3, %xmm2 + unpckhps %xmm3, %xmm4 + + addps %xmm8, %xmm0 + addps %xmm4, %xmm2 + + movhlps %xmm0, %xmm1 + movhlps %xmm2, %xmm3 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + movlhps %xmm2, %xmm0 +#endif + + pshufd $0xb1, %xmm0, %xmm1 + +#ifdef HAVE_SSE3 + movddup ALPHA, %xmm15 +#else + movsd ALPHA, %xmm15 + pshufd $0x44, %xmm15, %xmm15 +#endif + + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm1 + + xorps %xmm5, %xmm0 + +#ifdef HAVE_SSE3 + haddps %xmm1, %xmm0 +#else + movaps %xmm0, %xmm2 + shufps $0x88, %xmm1, %xmm0 + shufps $0xdd, %xmm1, %xmm2 + + addps %xmm2, %xmm0 +#endif + + movsd (Y), %xmm12 + addq INCY, Y + movhps (Y), %xmm12 + addq INCY, Y + + shufps $0xd8, %xmm0, %xmm0 + + addps %xmm12, %xmm0 + + movlps %xmm0, (Y1) + addq INCY, Y1 + movhps %xmm0, (Y1) + addq INCY, Y1 + +#if GEMV_UNROLL == 2 + cmpq $2, N + jge .L111 +#endif + ALIGN_3 + +.L120: + cmpq $1, N + jl .L999 + + leaq 32 * SIZE(BUFFER), X1 + + movq A, A1 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + +#ifdef ALIGNED_ACCESS + cmpq M, MM + je .L12X + +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd -32 * SIZE(A1), %xmm8 +#ifdef movsd + xorps %xmm12, %xmm12 +#endif + movsd -32 * SIZE(X1), %xmm12 + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + addq $2 * SIZE, A1 + addq $2 * SIZE, X1 + ALIGN_3 +.L12X: +#endif + + movaps -32 * SIZE(X1), %xmm12 + movaps -28 * SIZE(X1), %xmm13 + + movq MM, I + sarq $3, I + jle .L125 + + MOVUPS_A1(-32 * SIZE, A1, %xmm8) + MOVUPS_A1(-28 * SIZE, A1, %xmm10) + + decq I + jle .L124 + ALIGN_3 + +.L123: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) +#endif + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1(-24 * SIZE, A1, %xmm8) + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + movaps -24 * SIZE(X1), %xmm12 + + pshufd $0xb1, %xmm10, %xmm6 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm0 + MOVUPS_A1(-20 * SIZE, A1, %xmm10) + mulps %xmm13, %xmm6 + SUBPS %xmm6, %xmm1 + + movaps -20 * SIZE(X1), %xmm13 + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + mulps %xmm12, %xmm4 + SUBPS %xmm4, 
%xmm1 + + movaps -16 * SIZE(X1), %xmm12 + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1) +#endif + + pshufd $0xb1, %xmm10, %xmm6 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm0 + MOVUPS_A1(-12 * SIZE, A1, %xmm10) + mulps %xmm13, %xmm6 + SUBPS %xmm6, %xmm1 + + movaps -12 * SIZE(X1), %xmm13 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, X1 + + subq $1, I + BRANCH + jg .L123 + ALIGN_3 + +.L124: + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1(-24 * SIZE, A1, %xmm8) + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + movaps -24 * SIZE(X1), %xmm12 + + pshufd $0xb1, %xmm10, %xmm6 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm0 + MOVUPS_A1(-20 * SIZE, A1, %xmm10) + mulps %xmm13, %xmm6 + SUBPS %xmm6, %xmm1 + + movaps -20 * SIZE(X1), %xmm13 + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + movaps -16 * SIZE(X1), %xmm12 + + pshufd $0xb1, %xmm10, %xmm6 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm0 + mulps %xmm13, %xmm6 + SUBPS %xmm6, %xmm1 + + movaps -12 * SIZE(X1), %xmm13 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, X1 + ALIGN_3 + +.L125: + testq $4, MM + je .L127 + + MOVUPS_A1(-32 * SIZE, A1, %xmm8) + MOVUPS_A1(-28 * SIZE, A1, %xmm10) + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + movaps -24 * SIZE(X1), %xmm12 + + pshufd $0xb1, %xmm10, %xmm6 + mulps %xmm13, %xmm10 + addps %xmm10, %xmm0 + mulps %xmm13, %xmm6 + SUBPS %xmm6, %xmm1 + + movaps -20 * SIZE(X1), %xmm13 + + addq $8 * SIZE, A1 + ALIGN_3 + +.L127: + testq $2, MM + je .L128 + + MOVUPS_A1(-32 * SIZE, A1, %xmm8) + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + movaps %xmm13, %xmm12 + + addq $4 * SIZE, A1 + ALIGN_3 + +.L128: + testq $1, MM + je .L129 + +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd -32 * SIZE(A1), %xmm8 + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + ALIGN_3 + +.L129: + pcmpeqb %xmm5, %xmm5 + psllq $63, %xmm5 + +#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) + xorps %xmm5, %xmm0 +#else + xorps %xmm5, %xmm1 +#endif + +#ifdef HAVE_SSE3 + haddps %xmm1, %xmm0 + haddps %xmm0, %xmm0 +#else + movaps %xmm0, %xmm8 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm8 + + addps %xmm8, %xmm0 + + movhlps %xmm0, %xmm1 + + addps %xmm1, %xmm0 +#endif + + pshufd $0xb1, %xmm0, %xmm1 + +#ifdef HAVE_SSE3 + movddup ALPHA, %xmm15 +#else + movsd ALPHA, %xmm15 + pshufd $0x44, %xmm15, %xmm15 +#endif + + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm1 + + xorps %xmm5, %xmm0 + +#ifdef HAVE_SSE3 + haddps %xmm1, %xmm0 +#else + movaps %xmm0, %xmm2 + shufps $0x88, %xmm1, %xmm0 + shufps $0xdd, %xmm1, %xmm2 + + addps %xmm2, %xmm0 +#endif + + movsd (Y), %xmm12 + addq INCY, Y + + shufps $0xd8, %xmm0, %xmm0 + + addps %xmm12, %xmm0 + + movlps %xmm0, (Y1) + addq INCY, Y1 + jmp .L999 + ALIGN_3 + + +.L200: + testq $2 * SIZE, LDA + jne .L300 + + cmpq $2, N + jl .L210 + ALIGN_3 + +.L201: + subq $2, N + + leaq 32 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA), A2 + leaq (A1, LDA, 2), A + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#ifdef ALIGNED_ACCESS + cmpq M, MM + je .L20X + +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd -32 * SIZE(A1), %xmm8 +#ifdef movsd + xorps %xmm9, %xmm9 +#endif + movsd -32 * SIZE(A2), %xmm9 + +#ifdef movsd + xorps %xmm12, %xmm12 +#endif + movsd 
-32 * SIZE(X1), %xmm12 + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm12, %xmm5 + SUBPS %xmm5, %xmm3 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_3 +.L20X: +#endif + + movaps -33 * SIZE(A1), %xmm4 + movaps -33 * SIZE(A2), %xmm5 + + movaps -32 * SIZE(X1), %xmm12 + movaps -28 * SIZE(X1), %xmm13 + +#ifdef PREFETCHW + PREFETCHW 3 * SIZE(Y1) +#endif + + movq MM, I + sarq $3, I + jle .L205 + + movaps -29 * SIZE(A1), %xmm6 + movaps -29 * SIZE(A2), %xmm7 + + decq I + jle .L204 + ALIGN_3 + +.L203: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) +#endif + + movss %xmm6, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm14 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -25 * SIZE(A1), %xmm4 + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + + movss %xmm7, %xmm5 + shufps $0x39, %xmm5, %xmm5 + pshufd $0xb1, %xmm5, %xmm14 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm2 + movaps -25 * SIZE(A2), %xmm5 + mulps %xmm12, %xmm14 + movaps -24 * SIZE(X1), %xmm12 + SUBPS %xmm14, %xmm3 + + movss %xmm4, %xmm6 + shufps $0x39, %xmm6, %xmm6 + pshufd $0xb1, %xmm6, %xmm14 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm0 + movaps -21 * SIZE(A1), %xmm6 + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm1 + + movss %xmm5, %xmm7 + shufps $0x39, %xmm7, %xmm7 + pshufd $0xb1, %xmm7, %xmm14 + mulps %xmm13, %xmm7 + addps %xmm7, %xmm2 + movaps -21 * SIZE(A2), %xmm7 + mulps %xmm13, %xmm14 + movaps -20 * SIZE(X1), %xmm13 + SUBPS %xmm14, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) +#endif + + movss %xmm6, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm14 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -17 * SIZE(A1), %xmm4 + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + + movss %xmm7, %xmm5 + shufps $0x39, %xmm5, %xmm5 + pshufd $0xb1, %xmm5, %xmm14 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm2 + movaps -17 * SIZE(A2), %xmm5 + mulps %xmm12, %xmm14 + movaps -16 * SIZE(X1), %xmm12 + SUBPS %xmm14, %xmm3 + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1) +#endif + + movss %xmm4, %xmm6 + shufps $0x39, %xmm6, %xmm6 + pshufd $0xb1, %xmm6, %xmm14 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm0 + movaps -13 * SIZE(A1), %xmm6 + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm1 + + movss %xmm5, %xmm7 + shufps $0x39, %xmm7, %xmm7 + pshufd $0xb1, %xmm7, %xmm14 + mulps %xmm13, %xmm7 + addps %xmm7, %xmm2 + movaps -13 * SIZE(A2), %xmm7 + mulps %xmm13, %xmm14 + movaps -12 * SIZE(X1), %xmm13 + SUBPS %xmm14, %xmm3 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, X1 + + subq $1, I + BRANCH + jg .L203 + ALIGN_3 + +.L204: + movss %xmm6, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm14 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -25 * SIZE(A1), %xmm4 + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + + movss %xmm7, %xmm5 + shufps $0x39, %xmm5, %xmm5 + pshufd $0xb1, %xmm5, %xmm14 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm2 + movaps -25 * SIZE(A2), %xmm5 + mulps %xmm12, %xmm14 + movaps -24 * SIZE(X1), %xmm12 + SUBPS %xmm14, %xmm3 + + movss %xmm4, %xmm6 + shufps $0x39, %xmm6, %xmm6 + pshufd $0xb1, %xmm6, %xmm14 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm0 + movaps -21 * SIZE(A1), %xmm6 + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm1 + + movss %xmm5, %xmm7 + shufps $0x39, %xmm7, %xmm7 + pshufd $0xb1, %xmm7, %xmm14 + mulps %xmm13, %xmm7 + addps %xmm7, %xmm2 + movaps -21 * 
SIZE(A2), %xmm7 + mulps %xmm13, %xmm14 + movaps -20 * SIZE(X1), %xmm13 + SUBPS %xmm14, %xmm3 + + movss %xmm6, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm14 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -17 * SIZE(A1), %xmm4 + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + + movss %xmm7, %xmm5 + shufps $0x39, %xmm5, %xmm5 + pshufd $0xb1, %xmm5, %xmm14 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm2 + movaps -17 * SIZE(A2), %xmm5 + mulps %xmm12, %xmm14 + movaps -16 * SIZE(X1), %xmm12 + SUBPS %xmm14, %xmm3 + + movss %xmm4, %xmm6 + shufps $0x39, %xmm6, %xmm6 + pshufd $0xb1, %xmm6, %xmm14 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm1 + + movss %xmm5, %xmm7 + shufps $0x39, %xmm7, %xmm7 + pshufd $0xb1, %xmm7, %xmm14 + mulps %xmm13, %xmm7 + addps %xmm7, %xmm2 + mulps %xmm13, %xmm14 + movaps -12 * SIZE(X1), %xmm13 + SUBPS %xmm14, %xmm3 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, X1 + ALIGN_3 + +.L205: + testq $4, MM + je .L207 + + movaps -29 * SIZE(A1), %xmm6 + + movss %xmm6, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm14 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + + movaps -29 * SIZE(A2), %xmm7 + + movss %xmm7, %xmm5 + shufps $0x39, %xmm5, %xmm5 + pshufd $0xb1, %xmm5, %xmm14 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm2 + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm3 + + movaps -25 * SIZE(A1), %xmm8 + + movss %xmm8, %xmm6 + shufps $0x39, %xmm6, %xmm6 + pshufd $0xb1, %xmm6, %xmm14 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm1 + + movaps -25 * SIZE(A2), %xmm9 + + movss %xmm9, %xmm7 + shufps $0x39, %xmm7, %xmm7 + pshufd $0xb1, %xmm7, %xmm14 + mulps %xmm13, %xmm7 + addps %xmm7, %xmm2 + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm3 + + movaps %xmm8, %xmm4 + movaps %xmm9, %xmm5 + + movaps -24 * SIZE(X1), %xmm12 + movaps -20 * SIZE(X1), %xmm13 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + ALIGN_3 + +.L207: + testq $2, MM + je .L208 + + movaps -29 * SIZE(A1), %xmm6 + movaps -29 * SIZE(A2), %xmm7 + + movss %xmm6, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm14 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + + movss %xmm7, %xmm5 + shufps $0x39, %xmm5, %xmm5 + pshufd $0xb1, %xmm5, %xmm14 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm2 + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm3 + + movaps %xmm6, %xmm4 + movaps %xmm7, %xmm5 + movaps %xmm13, %xmm12 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + ALIGN_3 + +.L208: + testq $1, MM + je .L209 + +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd -32 * SIZE(A1), %xmm8 +#ifdef movsd + xorps %xmm9, %xmm9 +#endif + movsd -32 * SIZE(A2), %xmm9 + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm12, %xmm5 + SUBPS %xmm5, %xmm3 + ALIGN_3 + +.L209: + pcmpeqb %xmm5, %xmm5 + psllq $63, %xmm5 + +#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) + xorps %xmm5, %xmm0 + xorps %xmm5, %xmm2 +#else + xorps %xmm5, %xmm1 + xorps %xmm5, %xmm3 +#endif + +#ifdef HAVE_SSE3 + haddps %xmm1, %xmm0 + haddps %xmm3, %xmm2 + haddps %xmm2, %xmm0 +#else + movaps %xmm0, %xmm8 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm8 + + movaps %xmm2, %xmm4 + unpcklps %xmm3, %xmm2 + unpckhps %xmm3, %xmm4 + + addps %xmm8, %xmm0 + addps %xmm4, %xmm2 + + movhlps %xmm0, %xmm1 + movhlps %xmm2, %xmm3 + + addps 
%xmm1, %xmm0 + addps %xmm3, %xmm2 + movlhps %xmm2, %xmm0 +#endif + + pshufd $0xb1, %xmm0, %xmm1 + +#ifdef HAVE_SSE3 + movddup ALPHA, %xmm15 +#else + movsd ALPHA, %xmm15 + pshufd $0x44, %xmm15, %xmm15 +#endif + + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm1 + + xorps %xmm5, %xmm0 + +#ifdef HAVE_SSE3 + haddps %xmm1, %xmm0 +#else + movaps %xmm0, %xmm2 + shufps $0x88, %xmm1, %xmm0 + shufps $0xdd, %xmm1, %xmm2 + + addps %xmm2, %xmm0 +#endif + + movsd (Y), %xmm12 + addq INCY, Y + movhps (Y), %xmm12 + addq INCY, Y + + shufps $0xd8, %xmm0, %xmm0 + + addps %xmm12, %xmm0 + + movlps %xmm0, (Y1) + addq INCY, Y1 + movhps %xmm0, (Y1) + addq INCY, Y1 + + cmpq $2, N + jge .L201 + ALIGN_3 + +.L210: + cmpq $1, N + jl .L999 + + leaq 32 * SIZE(BUFFER), X1 + + movq A, A1 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + +#ifdef ALIGNED_ACCESS + cmpq M, MM + je .L21X + +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd -32 * SIZE(A1), %xmm8 +#ifdef movsd + xorps %xmm12, %xmm12 +#endif + movsd -32 * SIZE(X1), %xmm12 + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + addq $2 * SIZE, A1 + addq $2 * SIZE, X1 + ALIGN_3 +.L21X: +#endif + + movaps -33 * SIZE(A1), %xmm4 + + movaps -32 * SIZE(X1), %xmm12 + movaps -28 * SIZE(X1), %xmm13 + + movq MM, I + sarq $3, I + jle .L215 + + movaps -29 * SIZE(A1), %xmm5 + movaps -25 * SIZE(A1), %xmm6 + movaps -21 * SIZE(A1), %xmm7 + + decq I + jle .L214 + ALIGN_3 + +.L213: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) +#endif + + movss %xmm5, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm14 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -17 * SIZE(A1), %xmm4 + mulps %xmm12, %xmm14 + movaps -24 * SIZE(X1), %xmm12 + SUBPS %xmm14, %xmm1 + + movss %xmm6, %xmm5 + shufps $0x39, %xmm5, %xmm5 + pshufd $0xb1, %xmm5, %xmm15 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm0 + movaps -13 * SIZE(A1), %xmm5 + mulps %xmm13, %xmm15 + movaps -20 * SIZE(X1), %xmm13 + SUBPS %xmm15, %xmm1 + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1) +#endif + + movss %xmm7, %xmm6 + shufps $0x39, %xmm6, %xmm6 + pshufd $0xb1, %xmm6, %xmm14 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm0 + movaps -9 * SIZE(A1), %xmm6 + mulps %xmm12, %xmm14 + movaps -16 * SIZE(X1), %xmm12 + SUBPS %xmm14, %xmm1 + + movss %xmm4, %xmm7 + shufps $0x39, %xmm7, %xmm7 + pshufd $0xb1, %xmm7, %xmm15 + mulps %xmm13, %xmm7 + addps %xmm7, %xmm0 + movaps -5 * SIZE(A1), %xmm7 + mulps %xmm13, %xmm15 + movaps -12 * SIZE(X1), %xmm13 + SUBPS %xmm15, %xmm1 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, X1 + + subq $1, I + BRANCH + jg .L213 + ALIGN_3 + +.L214: + movss %xmm5, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm14 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -17 * SIZE(A1), %xmm4 + mulps %xmm12, %xmm14 + movaps -24 * SIZE(X1), %xmm12 + SUBPS %xmm14, %xmm1 + + movss %xmm6, %xmm5 + shufps $0x39, %xmm5, %xmm5 + pshufd $0xb1, %xmm5, %xmm15 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm13, %xmm15 + movaps -20 * SIZE(X1), %xmm13 + SUBPS %xmm15, %xmm1 + + movss %xmm7, %xmm6 + shufps $0x39, %xmm6, %xmm6 + pshufd $0xb1, %xmm6, %xmm14 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm12, %xmm14 + movaps -16 * SIZE(X1), %xmm12 + SUBPS %xmm14, %xmm1 + + movss %xmm4, %xmm7 + shufps $0x39, %xmm7, %xmm7 + pshufd $0xb1, %xmm7, %xmm15 + mulps %xmm13, %xmm7 + addps %xmm7, %xmm0 + mulps %xmm13, %xmm15 + movaps -12 * SIZE(X1), %xmm13 + SUBPS %xmm15, %xmm1 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, X1 + ALIGN_3 + 
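+ /* The unrolled loop above handles eight complex elements per pass (sarq $3, I); .L215, .L217 and .L218 below pick up the remaining 4, 2 and 1 elements indicated by the low bits of MM. */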
+.L215: + testq $4, MM + je .L217 + + movaps -29 * SIZE(A1), %xmm5 + movaps -25 * SIZE(A1), %xmm6 + + movss %xmm5, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm14 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + + movss %xmm6, %xmm5 + shufps $0x39, %xmm5, %xmm5 + pshufd $0xb1, %xmm5, %xmm15 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm13, %xmm15 + SUBPS %xmm15, %xmm1 + + movaps -24 * SIZE(X1), %xmm12 + movaps -20 * SIZE(X1), %xmm13 + movaps %xmm6, %xmm4 + + addq $8 * SIZE, A1 + ALIGN_3 + +.L217: + testq $2, MM + je .L218 + + movaps -29 * SIZE(A1), %xmm5 + + movss %xmm5, %xmm4 + shufps $0x39, %xmm4, %xmm4 + + pshufd $0xb1, %xmm4, %xmm14 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + + movaps %xmm13, %xmm12 + + addq $4 * SIZE, A1 + ALIGN_3 + +.L218: + testq $1, MM + je .L219 + +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd -32 * SIZE(A1), %xmm8 + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + ALIGN_3 + +.L219: + pcmpeqb %xmm5, %xmm5 + psllq $63, %xmm5 + +#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) + xorps %xmm5, %xmm0 +#else + xorps %xmm5, %xmm1 +#endif + +#ifdef HAVE_SSE3 + haddps %xmm1, %xmm0 + haddps %xmm0, %xmm0 +#else + movaps %xmm0, %xmm8 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm8 + + addps %xmm8, %xmm0 + + movhlps %xmm0, %xmm1 + + addps %xmm1, %xmm0 +#endif + + pshufd $0xb1, %xmm0, %xmm1 + +#ifdef HAVE_SSE3 + movddup ALPHA, %xmm15 +#else + movsd ALPHA, %xmm15 + pshufd $0x44, %xmm15, %xmm15 +#endif + + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm1 + + xorps %xmm5, %xmm0 + +#ifdef HAVE_SSE3 + haddps %xmm1, %xmm0 +#else + movaps %xmm0, %xmm2 + shufps $0x88, %xmm1, %xmm0 + shufps $0xdd, %xmm1, %xmm2 + + addps %xmm2, %xmm0 +#endif + + movsd (Y), %xmm12 + addq INCY, Y + + shufps $0xd8, %xmm0, %xmm0 + + addps %xmm12, %xmm0 + + movlps %xmm0, (Y1) + addq INCY, Y1 + jmp .L999 + +.L300: + cmpq $2, N + jl .L310 + ALIGN_3 + +.L301: + subq $2, N + + leaq 32 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA), A2 + leaq (A1, LDA, 2), A + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#ifdef ALIGNED_ACCESS + cmpq M, MM + je .L30X + +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd -32 * SIZE(A1), %xmm8 +#ifdef movsd + xorps %xmm9, %xmm9 +#endif + movsd -32 * SIZE(A2), %xmm9 + +#ifdef movsd + xorps %xmm12, %xmm12 +#endif + movsd -32 * SIZE(X1), %xmm12 + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm12, %xmm5 + SUBPS %xmm5, %xmm3 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_3 +.L30X: +#endif + + movaps -33 * SIZE(A1), %xmm4 + movaps -35 * SIZE(A2), %xmm5 + + movaps -32 * SIZE(X1), %xmm12 + movaps -28 * SIZE(X1), %xmm13 + +#ifdef PREFETCHW + PREFETCHW 3 * SIZE(Y1) +#endif + + movq MM, I + sarq $3, I + jle .L305 + + movaps -29 * SIZE(A1), %xmm6 + movaps -31 * SIZE(A2), %xmm7 + + decq I + jle .L304 + ALIGN_3 + +.L303: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) +#endif + + movss %xmm6, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm14 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -25 * SIZE(A1), %xmm4 + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + + movss %xmm7, %xmm5 + shufps $0x93, %xmm7, %xmm5 + pshufd $0xb1, %xmm5, %xmm14 
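+ /* In this branch the columns are not 16-byte aligned (the -33 * SIZE and -35 * SIZE base loads suggest offsets of one and three floats past an aligned address), so movss plus shufps ($0x39 for the one-float case, the two-register $0x93 form for the three-float case) stitch two aligned movaps loads into four consecutive column entries. */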
+ mulps %xmm12, %xmm5 + addps %xmm5, %xmm2 + movaps -27 * SIZE(A2), %xmm5 + mulps %xmm12, %xmm14 + movaps -24 * SIZE(X1), %xmm12 + SUBPS %xmm14, %xmm3 + + movss %xmm4, %xmm6 + shufps $0x39, %xmm6, %xmm6 + pshufd $0xb1, %xmm6, %xmm14 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm0 + movaps -21 * SIZE(A1), %xmm6 + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm1 + + movss %xmm5, %xmm7 + shufps $0x93, %xmm5, %xmm7 + pshufd $0xb1, %xmm7, %xmm14 + mulps %xmm13, %xmm7 + addps %xmm7, %xmm2 + movaps -23 * SIZE(A2), %xmm7 + mulps %xmm13, %xmm14 + movaps -20 * SIZE(X1), %xmm13 + SUBPS %xmm14, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) +#endif + + movss %xmm6, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm14 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -17 * SIZE(A1), %xmm4 + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + + movss %xmm7, %xmm5 + shufps $0x93, %xmm7, %xmm5 + pshufd $0xb1, %xmm5, %xmm14 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm2 + movaps -19 * SIZE(A2), %xmm5 + mulps %xmm12, %xmm14 + movaps -16 * SIZE(X1), %xmm12 + SUBPS %xmm14, %xmm3 + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1) +#endif + + movss %xmm4, %xmm6 + shufps $0x39, %xmm6, %xmm6 + pshufd $0xb1, %xmm6, %xmm14 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm0 + movaps -13 * SIZE(A1), %xmm6 + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm1 + + movss %xmm5, %xmm7 + shufps $0x93, %xmm5, %xmm7 + pshufd $0xb1, %xmm7, %xmm14 + mulps %xmm13, %xmm7 + addps %xmm7, %xmm2 + movaps -15 * SIZE(A2), %xmm7 + mulps %xmm13, %xmm14 + movaps -12 * SIZE(X1), %xmm13 + SUBPS %xmm14, %xmm3 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, X1 + + subq $1, I + BRANCH + jg .L303 + ALIGN_3 + +.L304: + movss %xmm6, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm14 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -25 * SIZE(A1), %xmm4 + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + + movss %xmm7, %xmm5 + shufps $0x93, %xmm7, %xmm5 + pshufd $0xb1, %xmm5, %xmm14 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm2 + movaps -27 * SIZE(A2), %xmm5 + mulps %xmm12, %xmm14 + movaps -24 * SIZE(X1), %xmm12 + SUBPS %xmm14, %xmm3 + + movss %xmm4, %xmm6 + shufps $0x39, %xmm6, %xmm6 + pshufd $0xb1, %xmm6, %xmm14 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm0 + movaps -21 * SIZE(A1), %xmm6 + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm1 + + movss %xmm5, %xmm7 + shufps $0x93, %xmm5, %xmm7 + pshufd $0xb1, %xmm7, %xmm14 + mulps %xmm13, %xmm7 + addps %xmm7, %xmm2 + movaps -23 * SIZE(A2), %xmm7 + mulps %xmm13, %xmm14 + movaps -20 * SIZE(X1), %xmm13 + SUBPS %xmm14, %xmm3 + + movss %xmm6, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm14 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -17 * SIZE(A1), %xmm4 + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + + movss %xmm7, %xmm5 + shufps $0x93, %xmm7, %xmm5 + pshufd $0xb1, %xmm5, %xmm14 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm2 + movaps -19 * SIZE(A2), %xmm5 + mulps %xmm12, %xmm14 + movaps -16 * SIZE(X1), %xmm12 + SUBPS %xmm14, %xmm3 + + movss %xmm4, %xmm6 + shufps $0x39, %xmm6, %xmm6 + pshufd $0xb1, %xmm6, %xmm14 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm1 + + movss %xmm5, %xmm7 + shufps $0x93, %xmm5, %xmm7 + pshufd $0xb1, %xmm7, %xmm14 + mulps %xmm13, %xmm7 + addps %xmm7, %xmm2 + mulps %xmm13, %xmm14 + movaps -12 * SIZE(X1), %xmm13 + SUBPS %xmm14, %xmm3 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, X1 + ALIGN_3 + +.L305: + testq $4, MM + je .L307 + + movaps -29 * SIZE(A1), 
%xmm6 + + movss %xmm6, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm14 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + + movaps -31 * SIZE(A2), %xmm7 + + movss %xmm7, %xmm5 + shufps $0x93, %xmm7, %xmm5 + pshufd $0xb1, %xmm5, %xmm14 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm2 + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm3 + + movaps -25 * SIZE(A1), %xmm8 + + movss %xmm8, %xmm6 + shufps $0x39, %xmm6, %xmm6 + pshufd $0xb1, %xmm6, %xmm14 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm1 + + movaps -27 * SIZE(A2), %xmm9 + + movss %xmm9, %xmm7 + shufps $0x93, %xmm9, %xmm7 + pshufd $0xb1, %xmm7, %xmm14 + mulps %xmm13, %xmm7 + addps %xmm7, %xmm2 + mulps %xmm13, %xmm14 + SUBPS %xmm14, %xmm3 + + movaps %xmm8, %xmm4 + movaps %xmm9, %xmm5 + + movaps -24 * SIZE(X1), %xmm12 + movaps -20 * SIZE(X1), %xmm13 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + ALIGN_3 + +.L307: + testq $2, MM + je .L308 + + movaps -29 * SIZE(A1), %xmm6 + movaps -31 * SIZE(A2), %xmm7 + + movss %xmm6, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm14 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + + movss %xmm7, %xmm5 + shufps $0x93, %xmm7, %xmm5 + pshufd $0xb1, %xmm5, %xmm14 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm2 + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm3 + + movaps %xmm6, %xmm4 + movaps %xmm7, %xmm5 + movaps %xmm13, %xmm12 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + ALIGN_3 + +.L308: + testq $1, MM + je .L309 + +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd -32 * SIZE(A1), %xmm8 +#ifdef movsd + xorps %xmm9, %xmm9 +#endif + movsd -32 * SIZE(A2), %xmm9 + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + pshufd $0xb1, %xmm9, %xmm5 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm12, %xmm5 + SUBPS %xmm5, %xmm3 + ALIGN_3 + +.L309: + pcmpeqb %xmm5, %xmm5 + psllq $63, %xmm5 + +#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) + xorps %xmm5, %xmm0 + xorps %xmm5, %xmm2 +#else + xorps %xmm5, %xmm1 + xorps %xmm5, %xmm3 +#endif + +#ifdef HAVE_SSE3 + haddps %xmm1, %xmm0 + haddps %xmm3, %xmm2 + haddps %xmm2, %xmm0 +#else + movaps %xmm0, %xmm8 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm8 + + movaps %xmm2, %xmm4 + unpcklps %xmm3, %xmm2 + unpckhps %xmm3, %xmm4 + + addps %xmm8, %xmm0 + addps %xmm4, %xmm2 + + movhlps %xmm0, %xmm1 + movhlps %xmm2, %xmm3 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + movlhps %xmm2, %xmm0 +#endif + + pshufd $0xb1, %xmm0, %xmm1 + +#ifdef HAVE_SSE3 + movddup ALPHA, %xmm15 +#else + movsd ALPHA, %xmm15 + pshufd $0x44, %xmm15, %xmm15 +#endif + + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm1 + + xorps %xmm5, %xmm0 + +#ifdef HAVE_SSE3 + haddps %xmm1, %xmm0 +#else + movaps %xmm0, %xmm2 + shufps $0x88, %xmm1, %xmm0 + shufps $0xdd, %xmm1, %xmm2 + + addps %xmm2, %xmm0 +#endif + + movsd (Y), %xmm12 + addq INCY, Y + movhps (Y), %xmm12 + addq INCY, Y + + shufps $0xd8, %xmm0, %xmm0 + + addps %xmm12, %xmm0 + + movlps %xmm0, (Y1) + addq INCY, Y1 + movhps %xmm0, (Y1) + addq INCY, Y1 + + cmpq $2, N + jge .L301 + ALIGN_3 + +.L310: + cmpq $1, N + jl .L999 + + leaq 32 * SIZE(BUFFER), X1 + + movq A, A1 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + +#ifdef ALIGNED_ACCESS + cmpq M, MM + je .L31X + +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd -32 * SIZE(A1), %xmm8 +#ifdef movsd + xorps %xmm12, %xmm12 +#endif + movsd -32 * SIZE(X1), %xmm12 + + pshufd $0xb1, %xmm8, %xmm4 + mulps 
%xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + + addq $2 * SIZE, A1 + addq $2 * SIZE, X1 + ALIGN_3 +.L31X: +#endif + + movaps -33 * SIZE(A1), %xmm4 + + movaps -32 * SIZE(X1), %xmm12 + movaps -28 * SIZE(X1), %xmm13 + + movq MM, I + sarq $3, I + jle .L315 + + movaps -29 * SIZE(A1), %xmm5 + movaps -25 * SIZE(A1), %xmm6 + movaps -21 * SIZE(A1), %xmm7 + + decq I + jle .L314 + ALIGN_3 + +.L313: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) +#endif + + movss %xmm5, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm14 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -17 * SIZE(A1), %xmm4 + mulps %xmm12, %xmm14 + movaps -24 * SIZE(X1), %xmm12 + SUBPS %xmm14, %xmm1 + + movss %xmm6, %xmm5 + shufps $0x39, %xmm5, %xmm5 + pshufd $0xb1, %xmm5, %xmm15 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm0 + movaps -13 * SIZE(A1), %xmm5 + mulps %xmm13, %xmm15 + movaps -20 * SIZE(X1), %xmm13 + SUBPS %xmm15, %xmm1 + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1) +#endif + + movss %xmm7, %xmm6 + shufps $0x39, %xmm6, %xmm6 + pshufd $0xb1, %xmm6, %xmm14 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm0 + movaps -9 * SIZE(A1), %xmm6 + mulps %xmm12, %xmm14 + movaps -16 * SIZE(X1), %xmm12 + SUBPS %xmm14, %xmm1 + + movss %xmm4, %xmm7 + shufps $0x39, %xmm7, %xmm7 + pshufd $0xb1, %xmm7, %xmm15 + mulps %xmm13, %xmm7 + addps %xmm7, %xmm0 + movaps -5 * SIZE(A1), %xmm7 + mulps %xmm13, %xmm15 + movaps -12 * SIZE(X1), %xmm13 + SUBPS %xmm15, %xmm1 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, X1 + + subq $1, I + BRANCH + jg .L313 + ALIGN_3 + +.L314: + movss %xmm5, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm14 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -17 * SIZE(A1), %xmm4 + mulps %xmm12, %xmm14 + movaps -24 * SIZE(X1), %xmm12 + SUBPS %xmm14, %xmm1 + + movss %xmm6, %xmm5 + shufps $0x39, %xmm5, %xmm5 + pshufd $0xb1, %xmm5, %xmm15 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm13, %xmm15 + movaps -20 * SIZE(X1), %xmm13 + SUBPS %xmm15, %xmm1 + + movss %xmm7, %xmm6 + shufps $0x39, %xmm6, %xmm6 + pshufd $0xb1, %xmm6, %xmm14 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm12, %xmm14 + movaps -16 * SIZE(X1), %xmm12 + SUBPS %xmm14, %xmm1 + + movss %xmm4, %xmm7 + shufps $0x39, %xmm7, %xmm7 + pshufd $0xb1, %xmm7, %xmm15 + mulps %xmm13, %xmm7 + addps %xmm7, %xmm0 + mulps %xmm13, %xmm15 + movaps -12 * SIZE(X1), %xmm13 + SUBPS %xmm15, %xmm1 + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, X1 + ALIGN_3 + +.L315: + testq $4, MM + je .L317 + + movaps -29 * SIZE(A1), %xmm5 + movaps -25 * SIZE(A1), %xmm6 + + movss %xmm5, %xmm4 + shufps $0x39, %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm14 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + + movss %xmm6, %xmm5 + shufps $0x39, %xmm5, %xmm5 + pshufd $0xb1, %xmm5, %xmm15 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm13, %xmm15 + SUBPS %xmm15, %xmm1 + + movaps -24 * SIZE(X1), %xmm12 + movaps -20 * SIZE(X1), %xmm13 + movaps %xmm6, %xmm4 + + addq $8 * SIZE, A1 + ALIGN_3 + +.L317: + testq $2, MM + je .L318 + + movaps -29 * SIZE(A1), %xmm5 + + movss %xmm5, %xmm4 + shufps $0x39, %xmm4, %xmm4 + + pshufd $0xb1, %xmm4, %xmm14 + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm12, %xmm14 + SUBPS %xmm14, %xmm1 + + movaps %xmm13, %xmm12 + + addq $4 * SIZE, A1 + ALIGN_3 + +.L318: + testq $1, MM + je .L319 + +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd -32 * SIZE(A1), %xmm8 + + pshufd $0xb1, %xmm8, %xmm4 + mulps %xmm12, %xmm8 + addps %xmm8, 
%xmm0 + mulps %xmm12, %xmm4 + SUBPS %xmm4, %xmm1 + ALIGN_3 + +.L319: + pcmpeqb %xmm5, %xmm5 + psllq $63, %xmm5 + +#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) + xorps %xmm5, %xmm0 +#else + xorps %xmm5, %xmm1 +#endif + +#ifdef HAVE_SSE3 + haddps %xmm1, %xmm0 + haddps %xmm0, %xmm0 +#else + movaps %xmm0, %xmm8 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm8 + + addps %xmm8, %xmm0 + + movhlps %xmm0, %xmm1 + + addps %xmm1, %xmm0 +#endif + + pshufd $0xb1, %xmm0, %xmm1 + +#ifdef HAVE_SSE3 + movddup ALPHA, %xmm15 +#else + movsd ALPHA, %xmm15 + pshufd $0x44, %xmm15, %xmm15 +#endif + + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm1 + + xorps %xmm5, %xmm0 + +#ifdef HAVE_SSE3 + haddps %xmm1, %xmm0 +#else + movaps %xmm0, %xmm2 + shufps $0x88, %xmm1, %xmm0 + shufps $0xdd, %xmm1, %xmm2 + + addps %xmm2, %xmm0 +#endif + + movsd (Y), %xmm12 + addq INCY, Y + + shufps $0xd8, %xmm0, %xmm0 + + addps %xmm12, %xmm0 + + movlps %xmm0, (Y1) + addq INCY, Y1 +#endif + ALIGN_3 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/copy.S b/kernel/x86_64/copy.S new file mode 100644 index 0000000000..bb66d10195 --- /dev/null +++ b/kernel/x86_64/copy.S @@ -0,0 +1,366 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ +#define Y ARG4 /* rcx */ +#ifndef WINDOWS_ABI +#define INCY ARG5 /* r8 */ +#define FLAG ARG6 +#else +#define INCY %r10 +#define FLAG %r11 +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + movq 40(%rsp), INCY +#endif + + EMMS + + testq N, N # if m == 0 goto End + jle .L999 + + salq $BASE_SHIFT, INCX + salq $BASE_SHIFT, INCY + + cmpq $SIZE, INCX # if incx != 1 + jne .L100 + cmpq $SIZE, INCY # if incy != 1 + jne .L100 + + movq N, %rax # i = m + sarq $3, %rax + jle .L20 + ALIGN_2 + +.L11: +#ifdef XDOUBLE + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movq 0(X), %mm0 + movq 8(X), %mm1 + + movq %mm0, 0(Y) + movq %mm1, 8(Y) + + movq 16(X), %mm2 + movq 24(X), %mm3 + + movq %mm2, 16(Y) + movq %mm3, 24(Y) + + movq 32(X), %mm4 + movq 40(X), %mm5 + + movq %mm4, 32(Y) + movq %mm5, 40(Y) + + movq 48(X), %mm6 + movq 56(X), %mm7 + + movq %mm6, 48(Y) + movq %mm7, 56(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movq 64(X), %mm0 + movq 72(X), %mm1 + + movq %mm0, 64(Y) + movq %mm1, 72(Y) + + movq 80(X), %mm2 + movq 88(X), %mm3 + + movq %mm2, 80(Y) + movq %mm3, 88(Y) + + movq 96(X), %mm4 + movq 104(X), %mm5 + + movq %mm4, 96(Y) + movq %mm5, 104(Y) + + movq 112(X), %mm6 + movq 120(X), %mm7 + + movq %mm6, 112(Y) + movq %mm7, 120(Y) +#elif defined(DOUBLE) + + movq 0(X), %mm0 + movq 8(X), %mm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movq %mm0, 0(Y) + movq %mm1, 8(Y) + + movq 16(X), %mm2 + movq 24(X), %mm3 + + movq %mm2, 16(Y) + movq %mm3, 24(Y) + + movq 32(X), %mm4 + movq 40(X), %mm5 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movq %mm4, 32(Y) + movq %mm5, 40(Y) + + movq 48(X), %mm6 + movq 56(X), %mm7 + + movq %mm6, 48(Y) + movq %mm7, 56(Y) +#else + movq 0 * SIZE(X), %mm0 + movq 2 * SIZE(X), %mm2 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movq %mm0, 0 * SIZE(Y) + movq %mm2, 2 * SIZE(Y) + + movq 4 * SIZE(X), %mm4 + movq 6 * SIZE(X), %mm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movq %mm4, 4 * SIZE(Y) + movq %mm6, 6 * SIZE(Y) +#endif + + addq $8 * SIZE, X + addq $8 * SIZE, Y + decq %rax + jg .L11 + ALIGN_2 + +.L20: + movq N, %rax + andq $7, %rax + jle .L99 + ALIGN_2 + +.L21: +#ifdef XDOUBLE + movq 0(X), %mm0 + movq 8(X), %mm1 + movq %mm0, 0(Y) + movq %mm1, 8(Y) +#else + MOVQ (X), %mm0 + MOVQ %mm0, (Y) +#endif + + addq $SIZE, X + addq $SIZE, Y + decq %rax + jg .L21 + +.L99: + xorq %rax,%rax + EMMS + ret + ALIGN_3 + +.L100: + movq N, %rax + sarq $3, %rax + jle .L120 + ALIGN_2 + +.L111: +#ifdef XDOUBLE + movq 0(X), %mm0 + movq 8(X), %mm1 + addq INCX, X + + movq 0(X), %mm2 + movq 8(X), %mm3 + addq INCX, X + + movq 0(X), %mm4 + movq 8(X), %mm5 + addq INCX, X + + movq 0(X), %mm6 + movq 8(X), %mm7 + addq INCX, X + + movq %mm0, 0(Y) + movq %mm1, 8(Y) + addq INCY, Y + + movq %mm2, 0(Y) + movq %mm3, 8(Y) + addq INCY, Y + + movq %mm4, 0(Y) + movq %mm5, 8(Y) + addq INCY, Y + + movq %mm6, 0(Y) + movq %mm7, 8(Y) + addq INCY, Y + + movq 0(X), %mm0 + movq 8(X), %mm1 + addq INCX, X + + movq 0(X), %mm2 + movq 8(X), %mm3 + addq INCX, X + + movq 0(X), %mm4 + movq 8(X), %mm5 + addq INCX, X + + movq 0(X), %mm6 + movq 8(X), %mm7 + addq INCX, X + + movq %mm0, 0(Y) + movq %mm1, 8(Y) + addq INCY, Y + + movq %mm2, 
0(Y) + movq %mm3, 8(Y) + addq INCY, Y + + movq %mm4, 0(Y) + movq %mm5, 8(Y) + addq INCY, Y + + movq %mm6, 0(Y) + movq %mm7, 8(Y) + addq INCY, Y +#else + MOVQ (X), %mm0 + addq INCX, X + MOVQ (X), %mm1 + addq INCX, X + MOVQ (X), %mm2 + addq INCX, X + MOVQ (X), %mm3 + addq INCX, X + MOVQ (X), %mm4 + addq INCX, X + MOVQ (X), %mm5 + addq INCX, X + MOVQ (X), %mm6 + addq INCX, X + MOVQ (X), %mm7 + addq INCX, X + + MOVQ %mm0, (Y) + addq INCY, Y + MOVQ %mm1, (Y) + addq INCY, Y + MOVQ %mm2, (Y) + addq INCY, Y + MOVQ %mm3, (Y) + addq INCY, Y + MOVQ %mm4, (Y) + addq INCY, Y + MOVQ %mm5, (Y) + addq INCY, Y + MOVQ %mm6, (Y) + addq INCY, Y + MOVQ %mm7, (Y) + addq INCY, Y +#endif + + decq %rax + jg .L111 + +.L120: + movq N, %rax + andq $7, %rax + jle .L999 + ALIGN_2 + +.L121: +#ifdef XDOUBLE + movq 0(X), %mm0 + movq 8(X), %mm1 + movq %mm0, 0(Y) + movq %mm1, 8(Y) +#else + MOVQ (X), %mm0 + MOVQ %mm0, (Y) +#endif + addq INCX, X + addq INCY, Y + + decq %rax + jg .L121 + +.L999: + xorq %rax,%rax + EMMS + ret + + EPILOGUE + diff --git a/kernel/x86_64/copy_sse.S b/kernel/x86_64/copy_sse.S new file mode 100644 index 0000000000..e949172931 --- /dev/null +++ b/kernel/x86_64/copy_sse.S @@ -0,0 +1,959 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ +#define Y ARG4 /* rcx */ +#ifndef WINDOWS_ABI +#define INCY ARG5 /* r8 */ +#else +#define INCY %r10 +#endif + +#include "l1param.h" + +#ifdef OPTERON +#define LOAD(OFFSET, ADDR, REG) xorps REG, REG; addps OFFSET(ADDR), REG +#else +#define LOAD(OFFSET, ADDR, REG) movaps OFFSET(ADDR), REG +#endif + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + movq 40(%rsp), INCY +#endif + + SAVEREGISTERS + + leaq (, INCX, SIZE), INCX + leaq (, INCY, SIZE), INCY + + cmpq $SIZE, INCX + jne .L50 + cmpq $SIZE, INCY + jne .L50 + + cmpq $3, M + jle .L55 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + + testq $SIZE, Y + je .L05 + + movss -32 * SIZE(X), %xmm0 + movss %xmm0, -32 * SIZE(Y) + addq $1 * SIZE, X + addq $1 * SIZE, Y + decq M + ALIGN_4 + +.L05: + testq $2 * SIZE, Y + je .L10 + + movsd -32 * SIZE(X), %xmm0 + movlps %xmm0, -32 * SIZE(Y) + addq $2 * SIZE, X + addq $2 * SIZE, Y + subq $2, M + jle .L19 + ALIGN_4 + +.L10: + testq $3 * SIZE, X + jne .L20 + + movq M, %rax + sarq $5, %rax + jle .L13 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + movaps -20 * SIZE(X), %xmm3 + movaps -16 * SIZE(X), %xmm4 + movaps -12 * SIZE(X), %xmm5 + movaps -8 * SIZE(X), %xmm6 + movaps -4 * SIZE(X), %xmm7 + + decq %rax + jle .L12 + ALIGN_3 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps %xmm0, -32 * SIZE(Y) + LOAD( 0 * SIZE, X, %xmm0) + movaps %xmm1, -28 * SIZE(Y) + LOAD( 4 * SIZE, X, %xmm1) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm2, -24 * SIZE(Y) + LOAD( 8 * SIZE, X, %xmm2) + movaps %xmm3, -20 * SIZE(Y) + LOAD(12 * SIZE, X, %xmm3) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps %xmm4,-16 * SIZE(Y) + LOAD(16 * SIZE, X, %xmm4) + movaps %xmm5,-12 * SIZE(Y) + LOAD(20 * SIZE, X, %xmm5) + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps %xmm6, -8 * SIZE(Y) + LOAD(24 * SIZE, X, %xmm6) + movaps %xmm7, -4 * SIZE(Y) + LOAD(28 * SIZE, X, %xmm7) + + subq $-32 * SIZE, Y + subq $-32 * SIZE, X + decq %rax + jg .L11 + ALIGN_3 + +.L12: + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, -24 * SIZE(Y) + movaps %xmm3, -20 * SIZE(Y) + movaps %xmm4, -16 * SIZE(Y) + movaps %xmm5, -12 * SIZE(Y) + movaps %xmm6, -8 * SIZE(Y) + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, Y + subq $-32 * SIZE, X + ALIGN_3 + +.L13: + testq $16, M + jle .L14 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + movaps -20 * SIZE(X), %xmm3 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, -24 * SIZE(Y) + movaps %xmm3, -20 * SIZE(Y) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L14: + testq $8, M + jle .L15 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L15: + testq $4, M + jle .L16 + + movaps -32 * SIZE(X), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L16: + testq $2, M + jle .L17 + + movsd -32 * SIZE(X), %xmm0 + movlps %xmm0, -32 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L17: + testq $1, M + jle .L19 + + movss 
-32 * SIZE(X), %xmm0 + movss %xmm0, -32 * SIZE(Y) + ALIGN_3 + +.L19: + xorq %rax,%rax + + RESTOREREGISTERS + + ret + ALIGN_3 + + +.L20: + testq $SIZE, X + jne .L30 + + movhps -32 * SIZE(X), %xmm0 + + movq M, %rax + sarq $5, %rax + jle .L23 + + movaps -30 * SIZE(X), %xmm1 + movaps -26 * SIZE(X), %xmm2 + movaps -22 * SIZE(X), %xmm3 + movaps -18 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + movaps -10 * SIZE(X), %xmm6 + movaps -6 * SIZE(X), %xmm7 + + decq %rax + jle .L22 + ALIGN_4 + +.L21: + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + shufps $0x4e, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -2 * SIZE(X), %xmm0 + + shufps $0x4e, %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps 2 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + shufps $0x4e, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps 6 * SIZE(X), %xmm2 + + shufps $0x4e, %xmm4, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps 10 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + shufps $0x4e, %xmm5, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + movaps 14 * SIZE(X), %xmm4 + + shufps $0x4e, %xmm6, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + movaps 18 * SIZE(X), %xmm5 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + shufps $0x4e, %xmm7, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + movaps 22 * SIZE(X), %xmm6 + + shufps $0x4e, %xmm0, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + movaps 26 * SIZE(X), %xmm7 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + decq %rax + jg .L21 + ALIGN_3 + +.L22: + shufps $0x4e, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -2 * SIZE(X), %xmm0 + + shufps $0x4e, %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + shufps $0x4e, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + shufps $0x4e, %xmm4, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + shufps $0x4e, %xmm5, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + shufps $0x4e, %xmm6, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + shufps $0x4e, %xmm7, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + shufps $0x4e, %xmm0, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L23: + testq $16, M + jle .L24 + ALIGN_3 + + movaps -30 * SIZE(X), %xmm1 + movaps -26 * SIZE(X), %xmm2 + movaps -22 * SIZE(X), %xmm3 + movaps -18 * SIZE(X), %xmm4 + + shufps $0x4e, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + shufps $0x4e, %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + shufps $0x4e, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + shufps $0x4e, %xmm4, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movaps %xmm4, %xmm0 + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L24: + testq $8, M + jle .L25 + ALIGN_3 + + movaps -30 * SIZE(X), %xmm1 + movaps -26 * SIZE(X), %xmm2 + + shufps $0x4e, %xmm1, %xmm0 + shufps $0x4e, %xmm2, %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, %xmm0 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L25: + testq $4, M + jle .L26 + ALIGN_3 + + movaps -30 * SIZE(X), %xmm1 + shufps $0x4e, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L26: + testq $2, M + jle .L27 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + + movsd %xmm0, -32 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L27: + testq $1, M + jle .L29 + ALIGN_3 + + movss -32 * SIZE(X), %xmm0 + movss %xmm0, -32 * SIZE(Y) + addq $SIZE, Y + ALIGN_3 + +.L29: + xorq %rax,%rax + + RESTOREREGISTERS + + ret + ALIGN_3 + +.L30: + 
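/* X still misaligned: fall through when X is one element past a 16-byte boundary (each aligned store is rebuilt from two neighbouring aligned loads with movss + shufps $0x39); the three-element case is handled at .L40 */ +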
testq $2 * SIZE, X + jne .L40 + + movaps -33 * SIZE(X), %xmm0 + + movq M, %rax + sarq $5, %rax + jle .L33 + + movaps -29 * SIZE(X), %xmm1 + movaps -25 * SIZE(X), %xmm2 + movaps -21 * SIZE(X), %xmm3 + movaps -17 * SIZE(X), %xmm4 + movaps -13 * SIZE(X), %xmm5 + movaps -9 * SIZE(X), %xmm6 + movaps -5 * SIZE(X), %xmm7 + + decq %rax + jle .L32 + ALIGN_4 + +.L31: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm1, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -1 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps 3 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm3, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps 7 * SIZE(X), %xmm2 + + movss %xmm4, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps 11 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm5, %xmm4 + shufps $0x39, %xmm4, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + movaps 15 * SIZE(X), %xmm4 + + movss %xmm6, %xmm5 + shufps $0x39, %xmm5, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + movaps 19 * SIZE(X), %xmm5 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm7, %xmm6 + shufps $0x39, %xmm6, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + movaps 23 * SIZE(X), %xmm6 + + movss %xmm0, %xmm7 + shufps $0x39, %xmm7, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + movaps 27 * SIZE(X), %xmm7 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + decq %rax + jg .L31 + ALIGN_3 + +.L32: + movss %xmm1, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -1 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movss %xmm3, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm4, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movss %xmm5, %xmm4 + shufps $0x39, %xmm4, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + movss %xmm6, %xmm5 + shufps $0x39, %xmm5, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + movss %xmm7, %xmm6 + shufps $0x39, %xmm6, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + movss %xmm0, %xmm7 + shufps $0x39, %xmm7, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L33: + testq $16, M + jle .L34 + ALIGN_3 + + movaps -29 * SIZE(X), %xmm1 + movaps -25 * SIZE(X), %xmm2 + movaps -21 * SIZE(X), %xmm3 + movaps -17 * SIZE(X), %xmm4 + + movss %xmm1, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movss %xmm3, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm4, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movaps %xmm4, %xmm0 + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L34: + testq $8, M + jle .L35 + ALIGN_3 + + movaps -29 * SIZE(X), %xmm1 + movaps -25 * SIZE(X), %xmm2 + + movss %xmm1, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, %xmm0 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L35: + testq $4, M + jle .L36 + ALIGN_3 + + movaps -29 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + shufps $0x39, %xmm0, %xmm0 + + movaps %xmm0, -32 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L36: 
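+ /* tail: copy the last two elements with an 8-byte move, then a possible final element with a 4-byte move */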
+ testq $2, M + jle .L37 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + movsd %xmm0, -32 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L37: + testq $1, M + jle .L39 + ALIGN_3 + + movss -32 * SIZE(X), %xmm0 + movss %xmm0, -32 * SIZE(Y) + addq $SIZE, Y + ALIGN_3 + +.L39: + xorq %rax,%rax + + RESTOREREGISTERS + + ret + ALIGN_3 + +.L40: + movaps -35 * SIZE(X), %xmm0 + + movq M, %rax + sarq $5, %rax + jle .L43 + + movaps -31 * SIZE(X), %xmm1 + movaps -27 * SIZE(X), %xmm2 + movaps -23 * SIZE(X), %xmm3 + movaps -19 * SIZE(X), %xmm4 + movaps -15 * SIZE(X), %xmm5 + movaps -11 * SIZE(X), %xmm6 + movaps -7 * SIZE(X), %xmm7 + + decq %rax + jle .L42 + ALIGN_4 + +.L41: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -3 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps 1 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps 5 * SIZE(X), %xmm2 + + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps 9 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + movaps 13 * SIZE(X), %xmm4 + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + movaps 17 * SIZE(X), %xmm5 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + movaps 21 * SIZE(X), %xmm6 + + movss %xmm0, %xmm7 + shufps $0x93, %xmm0, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + movaps 25 * SIZE(X), %xmm7 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + decq %rax + jg .L41 + ALIGN_3 + +.L42: + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -3 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + movss %xmm0, %xmm7 + shufps $0x93, %xmm0, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L43: + testq $16, M + jle .L44 + ALIGN_3 + + movaps -31 * SIZE(X), %xmm1 + movaps -27 * SIZE(X), %xmm2 + movaps -23 * SIZE(X), %xmm3 + movaps -19 * SIZE(X), %xmm4 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movaps %xmm4, %xmm0 + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L44: + testq $8, M + jle .L45 + ALIGN_3 + + movaps -31 * SIZE(X), %xmm1 + movaps -27 * SIZE(X), %xmm2 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + 
movaps %xmm1, -28 * SIZE(Y) + + movaps %xmm2, %xmm0 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L45: + testq $4, M + jle .L46 + ALIGN_3 + + movaps -31 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + + movaps %xmm0, -32 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L46: + testq $2, M + jle .L47 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + movsd %xmm0, -32 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L47: + testq $1, M + jle .L49 + ALIGN_3 + + movss -32 * SIZE(X), %xmm0 + movss %xmm0, -32 * SIZE(Y) + addq $SIZE, Y + ALIGN_3 + +.L49: + xorq %rax,%rax + + RESTOREREGISTERS + + ret + ALIGN_4 + +.L50: + movq M, %rax + sarq $3, %rax + jle .L55 + ALIGN_3 + +.L51: + movss (X), %xmm0 + addq INCX, X + movss (X), %xmm1 + addq INCX, X + movss (X), %xmm2 + addq INCX, X + movss (X), %xmm3 + addq INCX, X + movss (X), %xmm4 + addq INCX, X + movss (X), %xmm5 + addq INCX, X + movss (X), %xmm6 + addq INCX, X + movss (X), %xmm7 + addq INCX, X + + movss %xmm0, (Y) + addq INCY, Y + movss %xmm1, (Y) + addq INCY, Y + movss %xmm2, (Y) + addq INCY, Y + movss %xmm3, (Y) + addq INCY, Y + movss %xmm4, (Y) + addq INCY, Y + movss %xmm5, (Y) + addq INCY, Y + movss %xmm6, (Y) + addq INCY, Y + movss %xmm7, (Y) + addq INCY, Y + + decq %rax + jg .L51 + ALIGN_3 + +.L55: + movq M, %rax + andq $7, %rax + jle .L57 + ALIGN_3 + +.L56: + movss (X), %xmm0 + addq INCX, X + movss %xmm0, (Y) + addq INCY, Y + decq %rax + jg .L56 + ALIGN_3 + +.L57: + xorq %rax, %rax + + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/copy_sse2.S b/kernel/x86_64/copy_sse2.S new file mode 100644 index 0000000000..200daafd90 --- /dev/null +++ b/kernel/x86_64/copy_sse2.S @@ -0,0 +1,650 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ +#define Y ARG4 /* rcx */ +#ifndef WINDOWS_ABI +#define INCY ARG5 /* r8 */ +#else +#define INCY %r10 +#endif + +#include "l1param.h" + +#ifdef OPTERON +#define LOAD(OFFSET, ADDR, REG) xorps REG, REG; addpd OFFSET(ADDR), REG +#else +#define LOAD(OFFSET, ADDR, REG) movaps OFFSET(ADDR), REG +#endif + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + movq 40(%rsp), INCY +#endif + + SAVEREGISTERS + + leaq (, INCX, SIZE), INCX + leaq (, INCY, SIZE), INCY + + cmpq $SIZE, INCX + jne .L40 + cmpq $SIZE, INCY + jne .L40 + +#ifdef ALIGNED_ACCESS + testq $SIZE, Y +#else + testq $SIZE, X +#endif + je .L10 + + movsd (X), %xmm0 + movsd %xmm0, (Y) + addq $1 * SIZE, X + addq $1 * SIZE, Y + decq M + jle .L19 + ALIGN_4 + +.L10: + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + +#ifdef ALIGNED_ACCESS + testq $SIZE, X +#else + testq $SIZE, Y +#endif + jne .L20 + + movq M, %rax + sarq $4, %rax + jle .L13 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + movaps -12 * SIZE(X), %xmm2 + movaps -10 * SIZE(X), %xmm3 + movaps -8 * SIZE(X), %xmm4 + movaps -6 * SIZE(X), %xmm5 + movaps -4 * SIZE(X), %xmm6 + movaps -2 * SIZE(X), %xmm7 + + decq %rax + jle .L12 + ALIGN_3 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps %xmm0, -16 * SIZE(Y) + LOAD( 0 * SIZE, X, %xmm0) + movaps %xmm1, -14 * SIZE(Y) + LOAD( 2 * SIZE, X, %xmm1) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm2, -12 * SIZE(Y) + LOAD( 4 * SIZE, X, %xmm2) + movaps %xmm3, -10 * SIZE(Y) + LOAD( 6 * SIZE, X, %xmm3) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps %xmm4, -8 * SIZE(Y) + LOAD( 8 * SIZE, X, %xmm4) + movaps %xmm5, -6 * SIZE(Y) + LOAD(10 * SIZE, X, %xmm5) + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps %xmm6, -4 * SIZE(Y) + LOAD(12 * SIZE, X, %xmm6) + movaps %xmm7, -2 * SIZE(Y) + LOAD(14 * SIZE, X, %xmm7) + + subq $-16 * SIZE, Y + subq $-16 * SIZE, X + decq %rax + jg .L11 + ALIGN_3 + +.L12: + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -14 * SIZE(Y) + movaps %xmm2, -12 * SIZE(Y) + movaps %xmm3, -10 * SIZE(Y) + movaps %xmm4, -8 * SIZE(Y) + movaps %xmm5, -6 * SIZE(Y) + movaps %xmm6, -4 * SIZE(Y) + movaps %xmm7, -2 * SIZE(Y) + + subq $-16 * SIZE, Y + subq $-16 * SIZE, X + ALIGN_3 + +.L13: + testq $8, M + jle .L14 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + movaps -12 * SIZE(X), %xmm2 + movaps -10 * SIZE(X), %xmm3 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -14 * SIZE(Y) + movaps %xmm2, -12 * SIZE(Y) + movaps %xmm3, -10 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L14: + testq $4, M + jle .L15 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -14 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L15: + testq $2, M + jle .L16 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L16: 
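+ /* M odd: copy the one remaining element */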
+ testq $1, M + jle .L19 + ALIGN_3 + + movsd -16 * SIZE(X), %xmm0 + movsd %xmm0, -16 * SIZE(Y) + ALIGN_3 + +.L19: + xorq %rax,%rax + + RESTOREREGISTERS + + ret + ALIGN_3 + +.L20: +#ifdef ALIGNED_ACCESS + + movhps -16 * SIZE(X), %xmm0 + + movq M, %rax + sarq $4, %rax + jle .L23 + + movaps -15 * SIZE(X), %xmm1 + movaps -13 * SIZE(X), %xmm2 + movaps -11 * SIZE(X), %xmm3 + movaps -9 * SIZE(X), %xmm4 + movaps -7 * SIZE(X), %xmm5 + movaps -5 * SIZE(X), %xmm6 + movaps -3 * SIZE(X), %xmm7 + + decq %rax + jle .L22 + ALIGN_4 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + SHUFPD_1 %xmm1, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + LOAD(-1 * SIZE, X, %xmm0) + + SHUFPD_1 %xmm2, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + LOAD( 1 * SIZE, X, %xmm1) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + SHUFPD_1 %xmm3, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + LOAD( 3 * SIZE, X, %xmm2) + + SHUFPD_1 %xmm4, %xmm3 + movaps %xmm3, -10 * SIZE(Y) + LOAD( 5 * SIZE, X, %xmm3) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + SHUFPD_1 %xmm5, %xmm4 + movaps %xmm4, -8 * SIZE(Y) + LOAD( 7 * SIZE, X, %xmm4) + + SHUFPD_1 %xmm6, %xmm5 + movaps %xmm5, -6 * SIZE(Y) + LOAD( 9 * SIZE, X, %xmm5) + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + SHUFPD_1 %xmm7, %xmm6 + movaps %xmm6, -4 * SIZE(Y) + LOAD(11 * SIZE, X, %xmm6) + + SHUFPD_1 %xmm0, %xmm7 + movaps %xmm7, -2 * SIZE(Y) + LOAD(13 * SIZE, X, %xmm7) + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + decq %rax + jg .L21 + ALIGN_3 + +.L22: + SHUFPD_1 %xmm1, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + LOAD(-1 * SIZE, X, %xmm0) + + SHUFPD_1 %xmm2, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + SHUFPD_1 %xmm3, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + SHUFPD_1 %xmm4, %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + SHUFPD_1 %xmm5, %xmm4 + movaps %xmm4, -8 * SIZE(Y) + SHUFPD_1 %xmm6, %xmm5 + movaps %xmm5, -6 * SIZE(Y) + + SHUFPD_1 %xmm7, %xmm6 + movaps %xmm6, -4 * SIZE(Y) + SHUFPD_1 %xmm0, %xmm7 + movaps %xmm7, -2 * SIZE(Y) + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + ALIGN_3 + +.L23: + testq $8, M + jle .L24 + ALIGN_3 + + movaps -15 * SIZE(X), %xmm1 + movaps -13 * SIZE(X), %xmm2 + movaps -11 * SIZE(X), %xmm3 + movaps -9 * SIZE(X), %xmm8 + + SHUFPD_1 %xmm1, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + SHUFPD_1 %xmm2, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + SHUFPD_1 %xmm3, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + + SHUFPD_1 %xmm8, %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + movaps %xmm8, %xmm0 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L24: + testq $4, M + jle .L25 + ALIGN_3 + + movaps -15 * SIZE(X), %xmm1 + movaps -13 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm1, %xmm0 + SHUFPD_1 %xmm2, %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -14 * SIZE(Y) + movaps %xmm2, %xmm0 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L25: + testq $2, M + jle .L26 + ALIGN_3 + + movaps -15 * SIZE(X), %xmm1 + SHUFPD_1 %xmm1, %xmm0 + + movaps %xmm0, -16 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L26: + testq $1, M + jle .L29 + ALIGN_3 + + movsd -16 * SIZE(X), %xmm0 + movsd %xmm0, -16 * SIZE(Y) + ALIGN_3 + +.L29: + xorq %rax,%rax + + RESTOREREGISTERS + + ret + ALIGN_3 + +#else + + movq M, %rax + sarq $4, %rax + jle .L23 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + movaps -12 * SIZE(X), %xmm2 + movaps -10 * SIZE(X), %xmm3 + movaps -8 * SIZE(X), %xmm4 + movaps -6 * SIZE(X), %xmm5 + movaps -4 * SIZE(X), %xmm6 + movaps -2 * 
SIZE(X), %xmm7 + + decq %rax + jle .L22 + ALIGN_3 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movlps %xmm0, -16 * SIZE(Y) + movhps %xmm0, -15 * SIZE(Y) + LOAD( 0 * SIZE, X, %xmm0) + movlps %xmm1, -14 * SIZE(Y) + movhps %xmm1, -13 * SIZE(Y) + LOAD( 2 * SIZE, X, %xmm1) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movlps %xmm2, -12 * SIZE(Y) + movhps %xmm2, -11 * SIZE(Y) + LOAD( 4 * SIZE, X, %xmm2) + movlps %xmm3, -10 * SIZE(Y) + movhps %xmm3, -9 * SIZE(Y) + LOAD( 6 * SIZE, X, %xmm3) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movlps %xmm4, -8 * SIZE(Y) + movhps %xmm4, -7 * SIZE(Y) + LOAD( 8 * SIZE, X, %xmm4) + movlps %xmm5, -6 * SIZE(Y) + movhps %xmm5, -5 * SIZE(Y) + LOAD(10 * SIZE, X, %xmm5) + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movlps %xmm6, -4 * SIZE(Y) + movhps %xmm6, -3 * SIZE(Y) + LOAD(12 * SIZE, X, %xmm6) + movlps %xmm7, -2 * SIZE(Y) + movhps %xmm7, -1 * SIZE(Y) + LOAD(14 * SIZE, X, %xmm7) + + subq $-16 * SIZE, Y + subq $-16 * SIZE, X + decq %rax + jg .L21 + ALIGN_3 + +.L22: + movlps %xmm0, -16 * SIZE(Y) + movhps %xmm0, -15 * SIZE(Y) + movlps %xmm1, -14 * SIZE(Y) + movhps %xmm1, -13 * SIZE(Y) + movlps %xmm2, -12 * SIZE(Y) + movhps %xmm2, -11 * SIZE(Y) + movlps %xmm3, -10 * SIZE(Y) + movhps %xmm3, -9 * SIZE(Y) + movlps %xmm4, -8 * SIZE(Y) + movhps %xmm4, -7 * SIZE(Y) + movlps %xmm5, -6 * SIZE(Y) + movhps %xmm5, -5 * SIZE(Y) + movlps %xmm6, -4 * SIZE(Y) + movhps %xmm6, -3 * SIZE(Y) + movlps %xmm7, -2 * SIZE(Y) + movhps %xmm7, -1 * SIZE(Y) + + subq $-16 * SIZE, Y + subq $-16 * SIZE, X + ALIGN_3 + +.L23: + testq $8, M + jle .L24 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movlps %xmm0, -16 * SIZE(Y) + movhps %xmm0, -15 * SIZE(Y) + movaps -14 * SIZE(X), %xmm1 + movlps %xmm1, -14 * SIZE(Y) + movhps %xmm1, -13 * SIZE(Y) + movaps -12 * SIZE(X), %xmm2 + movlps %xmm2, -12 * SIZE(Y) + movhps %xmm2, -11 * SIZE(Y) + movaps -10 * SIZE(X), %xmm3 + movlps %xmm3, -10 * SIZE(Y) + movhps %xmm3, -9 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L24: + testq $4, M + jle .L25 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movlps %xmm0, -16 * SIZE(Y) + movhps %xmm0, -15 * SIZE(Y) + movaps -14 * SIZE(X), %xmm1 + movlps %xmm1, -14 * SIZE(Y) + movhps %xmm1, -13 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L25: + testq $2, M + jle .L26 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movlps %xmm0, -16 * SIZE(Y) + movhps %xmm0, -15 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L26: + testq $1, M + jle .L29 + ALIGN_3 + + movsd -16 * SIZE(X), %xmm0 + movsd %xmm0, -16 * SIZE(Y) + ALIGN_3 + +.L29: + xorq %rax,%rax + + RESTOREREGISTERS + + ret + ALIGN_3 + +#endif + +.L40: + movq M, %rax + sarq $3, %rax + jle .L45 + ALIGN_3 + +.L41: + movsd (X), %xmm0 + addq INCX, X + movhps (X), %xmm0 + addq INCX, X + movsd (X), %xmm1 + addq INCX, X + movhps (X), %xmm1 + addq INCX, X + movsd (X), %xmm2 + addq INCX, X + movhps (X), %xmm2 + addq INCX, X + movsd (X), %xmm3 + addq INCX, X + movhps (X), %xmm3 + addq INCX, X + + movlps %xmm0, (Y) + addq INCY, Y + movhps %xmm0, (Y) + addq INCY, Y + movlps %xmm1, (Y) + addq INCY, Y + movhps %xmm1, (Y) + addq INCY, Y + movlps %xmm2, (Y) + addq INCY, Y + movhps %xmm2, (Y) + addq INCY, Y + movlps %xmm3, (Y) + addq INCY, Y + movhps %xmm3, (Y) + addq INCY, Y + + decq %rax + jg .L41 + ALIGN_3 + +.L45: + movq M, %rax + andq $7, %rax + jle .L47 + 
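/* copy the remaining M % 8 elements one at a time through the strides INCX and INCY */ +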
ALIGN_3 + +.L46: + movsd (X), %xmm0 + addq INCX, X + movlps %xmm0, (Y) + addq INCY, Y + decq %rax + jg .L46 + ALIGN_3 + +.L47: + xorq %rax, %rax + + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/dgemm_ncopy_2.S b/kernel/x86_64/dgemm_ncopy_2.S new file mode 100644 index 0000000000..2724cfe92b --- /dev/null +++ b/kernel/x86_64/dgemm_ncopy_2.S @@ -0,0 +1,597 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef NEHALEM +#define PREFETCHSIZE 16 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#ifndef MOVAPS +#define MOVAPS movaps +#endif + +#ifndef WINDOWS_ABI + +#define M ARG1 /* rdi */ +#define N ARG2 /* rsi */ +#define A ARG3 /* rdx */ +#define LDA ARG4 /* rcx */ +#define B ARG5 /* r8 */ + +#define I %r9 + +#else + +#define STACKSIZE 256 + +#define M ARG1 /* rcx */ +#define N ARG2 /* rdx */ +#define A ARG3 /* r8 */ +#define LDA ARG4 /* r9 */ +#define OLD_B 40 + 32 + STACKSIZE(%rsp) + +#define B %r14 +#define I %r15 + +#endif + +#define J %r10 +#define AO1 %r11 +#define AO2 %r12 +#define MM %r13 + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + pushq %r15 + pushq %r14 +#endif + pushq %r13 + pushq %r12 + +#ifdef WINDOWS_ABI + subq $STACKSIZE, %rsp + + movups %xmm6, 0(%rsp) + + movq OLD_B, B +#endif + + leaq (,LDA, SIZE), LDA + subq $-16 * SIZE, B + + movq M, MM + leaq -1(M), %rax + testq $SIZE, A + cmovne %rax, MM + + testq $SIZE, LDA + jne .L50 + + movq N, J + sarq $1, J + jle .L30 + ALIGN_4 + +.L21: + movq A, AO1 + leaq (A, LDA), AO2 + leaq (A, LDA, 2), A + + testq $SIZE, A + je .L22 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO2), %xmm1 + + unpcklpd %xmm1, %xmm0 + + movaps %xmm0, -16 * SIZE(B) + + addq $1 * SIZE, AO1 + addq $1 * SIZE, AO2 + subq $-2 * SIZE, B + ALIGN_3 + +.L22: + movq MM, I + sarq $3, I + jle .L24 + ALIGN_4 + +.L23: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 2 * SIZE(AO1) +#endif + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 0 * SIZE(AO2), %xmm1 + MOVAPS 2 * SIZE(AO1), %xmm2 + MOVAPS 2 * SIZE(AO2), %xmm3 + + movaps %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + movaps %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + + unpckhpd %xmm1, %xmm4 + unpckhpd %xmm3, %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) +#endif + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm4, -14 * SIZE(B) + movaps %xmm2, -12 * SIZE(B) + movaps %xmm6, -10 * SIZE(B) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 2 * SIZE(AO2) +#endif + + MOVAPS 4 * SIZE(AO1), %xmm0 + MOVAPS 4 * SIZE(AO2), %xmm1 + MOVAPS 6 * SIZE(AO1), %xmm2 + MOVAPS 6 * SIZE(AO2), %xmm3 + + movaps %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm4 + + movaps %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 8) * SIZE(B) +#endif + + movaps %xmm0, -8 * SIZE(B) + movaps %xmm4, -6 * SIZE(B) + movaps %xmm2, -4 * SIZE(B) + movaps %xmm6, -2 * SIZE(B) + + addq $8 * SIZE, AO1 + addq $8 * SIZE, AO2 + subq $-16 * SIZE, B + + decq I + jg .L23 + ALIGN_4 + +.L24: + testq $4, MM + jle .L26 + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 0 * SIZE(AO2), %xmm1 + MOVAPS 2 * SIZE(AO1), %xmm2 + MOVAPS 2 * SIZE(AO2), %xmm3 + + movaps %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm4 + + movaps %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm6 + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm4, -14 * SIZE(B) + movaps %xmm2, -12 * SIZE(B) + movaps %xmm6, -10 * SIZE(B) + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + subq $-8 * SIZE, B + ALIGN_4 + +.L26: + testq $2, MM + jle .L28 + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 0 * SIZE(AO2), %xmm1 + + movaps %xmm0, %xmm2 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm2 + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm2, -14 * SIZE(B) + + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + subq $-4 * SIZE, B + ALIGN_4 + +.L28: + testq $1, MM + jle .L29 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO2), %xmm1 
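+ /* last remaining row: pack one element from each of the two columns into B */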
+ + unpcklpd %xmm1, %xmm0 + + movaps %xmm0, -16 * SIZE(B) + subq $-2 * SIZE, B + ALIGN_4 + +.L29: + decq J + jg .L21 + ALIGN_4 + +.L30: + testq $1, N + jle .L999 + +.L30x: + movq A, AO1 + + testq $SIZE, A + jne .L35 + + movq M, I + sarq $3, I + jle .L32 + ALIGN_4 + +.L31: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 4 * SIZE(AO1) +#endif + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 2 * SIZE(AO1), %xmm1 + MOVAPS 4 * SIZE(AO1), %xmm2 + MOVAPS 6 * SIZE(AO1), %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) +#endif + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm1, -14 * SIZE(B) + movaps %xmm2, -12 * SIZE(B) + movaps %xmm3, -10 * SIZE(B) + + addq $8 * SIZE, AO1 + addq $8 * SIZE, B + + decq I + jg .L31 + ALIGN_4 + +.L32: + testq $4, M + jle .L33 + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 2 * SIZE(AO1), %xmm1 + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm1, -14 * SIZE(B) + + addq $4 * SIZE, AO1 + subq $-4 * SIZE, B + ALIGN_4 + +.L33: + testq $2, M + jle .L34 + + MOVAPS 0 * SIZE(AO1), %xmm0 + + movaps %xmm0, -16 * SIZE(B) + + addq $2 * SIZE, AO1 + subq $-2 * SIZE, B + ALIGN_4 + +.L34: + testq $1, M + jle .L999 + + movsd 0 * SIZE(AO1), %xmm0 + movlpd %xmm0, -16 * SIZE(B) + jmp .L999 + ALIGN_4 + +.L35: + movaps -1 * SIZE(AO1), %xmm0 + + movq M, I + sarq $3, I + jle .L36 + ALIGN_4 + +.L36: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 4 * SIZE(AO1) +#endif + + MOVAPS 1 * SIZE(AO1), %xmm1 + MOVAPS 3 * SIZE(AO1), %xmm2 + MOVAPS 5 * SIZE(AO1), %xmm3 + MOVAPS 7 * SIZE(AO1), %xmm4 + + shufpd $1, %xmm1, %xmm0 + shufpd $1, %xmm2, %xmm1 + shufpd $1, %xmm3, %xmm2 + shufpd $1, %xmm4, %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) +#endif + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm1, -14 * SIZE(B) + movaps %xmm2, -12 * SIZE(B) + movaps %xmm3, -10 * SIZE(B) + + movaps %xmm4, %xmm0 + + addq $8 * SIZE, AO1 + subq $-8 * SIZE, B + + decq I + jg .L36 + ALIGN_4 + +.L37: + testq $4, M + jle .L38 + + MOVAPS 1 * SIZE(AO1), %xmm1 + MOVAPS 3 * SIZE(AO1), %xmm2 + + shufpd $1, %xmm1, %xmm0 + shufpd $1, %xmm2, %xmm1 + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm1, -14 * SIZE(B) + + movaps %xmm2, %xmm0 + + addq $4 * SIZE, AO1 + addq $4 * SIZE, B + ALIGN_4 + +.L38: + testq $2, M + jle .L39 + + MOVAPS 1 * SIZE(AO1), %xmm1 + + shufpd $1, %xmm1, %xmm0 + + movaps %xmm0, -16 * SIZE(B) + + movaps %xmm1, %xmm0 + + addq $2 * SIZE, AO1 + subq $-2 * SIZE, B + ALIGN_4 + +.L39: + testq $1, M + jle .L999 + + movhpd %xmm0, -16 * SIZE(B) + jmp .L999 + ALIGN_4 + +.L50: + movq N, J + sarq $1, J + jle .L30 + ALIGN_4 + +.L61: + movq A, AO1 + leaq (A, LDA), AO2 + leaq (A, LDA, 2), A + + testq $SIZE, A + je .L62 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO2), %xmm1 + + unpcklpd %xmm1, %xmm0 + + movaps %xmm0, -16 * SIZE(B) + + addq $1 * SIZE, AO1 + addq $1 * SIZE, AO2 + subq $-2 * SIZE, B + ALIGN_3 + +.L62: + MOVAPS -1 * SIZE(AO2), %xmm5 + + movq MM, I + sarq $3, I + jle .L64 + ALIGN_4 + +.L63: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 2 * SIZE(AO1) +#endif + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 1 * SIZE(AO2), %xmm1 + MOVAPS 2 * SIZE(AO1), %xmm2 + MOVAPS 3 * SIZE(AO2), %xmm3 + + movsd %xmm0, %xmm5 + shufpd $1, %xmm1, %xmm0 + movsd %xmm2, %xmm1 + shufpd $1, %xmm3, %xmm2 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) +#endif + + movaps %xmm5, -16 * SIZE(B) + movaps %xmm0, -14 * SIZE(B) + movaps %xmm1, -12 * SIZE(B) + movaps %xmm2, -10 * SIZE(B) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 2 * SIZE(AO2) +#endif + + MOVAPS 4 * SIZE(AO1), %xmm0 + MOVAPS 5 * SIZE(AO2), %xmm1 + MOVAPS 6 * 
SIZE(AO1), %xmm2 + MOVAPS 7 * SIZE(AO2), %xmm5 + + movsd %xmm0, %xmm3 + shufpd $1, %xmm1, %xmm0 + movsd %xmm2, %xmm1 + shufpd $1, %xmm5, %xmm2 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) +#endif + + movaps %xmm3, -8 * SIZE(B) + movaps %xmm0, -6 * SIZE(B) + movaps %xmm1, -4 * SIZE(B) + movaps %xmm2, -2 * SIZE(B) + + addq $8 * SIZE, AO1 + addq $8 * SIZE, AO2 + subq $-16 * SIZE, B + + decq I + jg .L63 + ALIGN_4 + +.L64: + testq $4, MM + jle .L66 + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 1 * SIZE(AO2), %xmm1 + MOVAPS 2 * SIZE(AO1), %xmm2 + MOVAPS 3 * SIZE(AO2), %xmm3 + + movsd %xmm0, %xmm5 + shufpd $1, %xmm1, %xmm0 + movsd %xmm2, %xmm1 + shufpd $1, %xmm3, %xmm2 + + movaps %xmm5, -16 * SIZE(B) + movaps %xmm0, -14 * SIZE(B) + movaps %xmm1, -12 * SIZE(B) + movaps %xmm2, -10 * SIZE(B) + + movaps %xmm3, %xmm5 + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + subq $-8 * SIZE, B + ALIGN_4 + +.L66: + testq $2, MM + jle .L68 + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 1 * SIZE(AO2), %xmm1 + + movsd %xmm0, %xmm5 + shufpd $1, %xmm1, %xmm0 + + movaps %xmm5, -16 * SIZE(B) + movaps %xmm0, -14 * SIZE(B) + + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + subq $-4 * SIZE, B + ALIGN_4 + +.L68: + testq $1, MM + jle .L69 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO2), %xmm1 + + unpcklpd %xmm1, %xmm0 + + movaps %xmm0, -16 * SIZE(B) + subq $-2 * SIZE, B + ALIGN_4 + +.L69: + decq J + jg .L61 + + testq $1, N + jne .L30 + ALIGN_4 + +.L999: +#ifdef WINDOWS_ABI + movups 0(%rsp), %xmm6 + + addq $STACKSIZE, %rsp +#endif + + popq %r12 + popq %r13 + +#ifdef WINDOWS_ABI + popq %r14 + popq %r15 +#endif + ret + + EPILOGUE diff --git a/kernel/x86_64/dgemm_ncopy_4.S b/kernel/x86_64/dgemm_ncopy_4.S new file mode 100644 index 0000000000..52115bd4d9 --- /dev/null +++ b/kernel/x86_64/dgemm_ncopy_4.S @@ -0,0 +1,1237 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if defined(PENTIUM4) || defined(GENERIC) +#define PREFETCHSIZE 16 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#define PREFETCHSIZE 16 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#ifdef ATOM +#define PREFETCHSIZE 16 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#ifdef NANO +#define PREFETCHSIZE 16 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#ifdef OPTERON +#define PREFETCHSIZE 16 +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#endif + +#ifdef GENERIC +#define PREFETCHSIZE 16 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#ifndef WINDOWS_ABI + +#define M ARG1 /* rdi */ +#define N ARG2 /* rsi */ +#define A ARG3 /* rdx */ +#define LDA ARG4 /* rcx */ +#define B ARG5 /* r8 */ + +#define I %r9 + +#else + +#define STACKSIZE 256 + +#define M ARG1 /* rcx */ +#define N ARG2 /* rdx */ +#define A ARG3 /* r8 */ +#define LDA ARG4 /* r9 */ +#define OLD_B 40 + 32 + STACKSIZE(%rsp) + +#define B %r14 +#define I %r15 + +#endif + +#define J %r10 +#define AO1 %r11 +#define AO2 %r12 +#define MM %r13 + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + pushq %r15 + pushq %r14 +#endif + pushq %r13 + pushq %r12 + +#ifdef WINDOWS_ABI + subq $STACKSIZE, %rsp + + movups %xmm6, 0(%rsp) + movups %xmm7, 16(%rsp) + + movq OLD_B, B +#endif + + leaq (,LDA, SIZE), LDA + subq $-16 * SIZE, B + + movq M, MM + leaq -1(M), %rax + testq $SIZE, A + cmovne %rax, MM + + testq $SIZE, LDA + jne .L50 + + movq N, J + sarq $2, J + jle .L20 + ALIGN_4 + +.L11: + movq A, AO1 + leaq (A, LDA, 2), AO2 + leaq (A, LDA, 4), A + + testq $SIZE, A + je .L12 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO1, LDA), %xmm1 + movsd 0 * SIZE(AO2), %xmm2 + movsd 0 * SIZE(AO2, LDA), %xmm3 + + unpcklpd %xmm1, %xmm0 + unpcklpd %xmm3, %xmm2 + + movapd %xmm0, -16 * SIZE(B) + movapd %xmm2, -14 * SIZE(B) + + addq $1 * SIZE, AO1 + addq $1 * SIZE, AO2 + subq $-4 * SIZE, B + ALIGN_3 + +.L12: + movq MM, I + sarq $3, I + jle .L14 + ALIGN_4 + +.L13: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO1) +#endif + + movapd 0 * SIZE(AO1), %xmm0 + movapd 0 * SIZE(AO1, LDA), %xmm1 + movapd 0 * SIZE(AO2), %xmm2 + movapd 0 * SIZE(AO2, LDA), %xmm3 + + movapd %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + movapd %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + + unpckhpd %xmm1, %xmm4 + unpckhpd %xmm3, %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) +#endif + + movapd %xmm0, -16 * SIZE(B) + movapd %xmm2, -14 * SIZE(B) + movapd %xmm4, -12 * SIZE(B) + movapd %xmm6, -10 * SIZE(B) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO1, LDA) +#endif + + movapd 2 * SIZE(AO1), %xmm0 + movapd 2 * SIZE(AO1, LDA), %xmm1 + movapd 2 * SIZE(AO2), %xmm2 + movapd 2 * SIZE(AO2, LDA), %xmm3 + + movapd %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + movapd %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + + unpckhpd %xmm1, %xmm4 + unpckhpd %xmm3, %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 8) * SIZE(B) +#endif + + movapd %xmm0, -8 * SIZE(B) + movapd %xmm2, -6 * SIZE(B) + movapd %xmm4, -4 * SIZE(B) + movapd %xmm6, 
-2 * SIZE(B) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO2) +#endif + + movapd 4 * SIZE(AO1), %xmm0 + movapd 4 * SIZE(AO1, LDA), %xmm1 + movapd 4 * SIZE(AO2), %xmm2 + movapd 4 * SIZE(AO2, LDA), %xmm3 + + movapd %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + movapd %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + + unpckhpd %xmm1, %xmm4 + unpckhpd %xmm3, %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 16) * SIZE(B) +#endif + + movapd %xmm0, 0 * SIZE(B) + movapd %xmm2, 2 * SIZE(B) + movapd %xmm4, 4 * SIZE(B) + movapd %xmm6, 6 * SIZE(B) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO2, LDA) +#endif + + movapd 6 * SIZE(AO1), %xmm0 + movapd 6 * SIZE(AO1, LDA), %xmm1 + movapd 6 * SIZE(AO2), %xmm2 + movapd 6 * SIZE(AO2, LDA), %xmm3 + + movapd %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + movapd %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + + unpckhpd %xmm1, %xmm4 + unpckhpd %xmm3, %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 24) * SIZE(B) +#endif + + movapd %xmm0, 8 * SIZE(B) + movapd %xmm2, 10 * SIZE(B) + movapd %xmm4, 12 * SIZE(B) + movapd %xmm6, 14 * SIZE(B) + + addq $8 * SIZE, AO1 + addq $8 * SIZE, AO2 + subq $-32 * SIZE, B + + decq I + jg .L13 + ALIGN_4 + +.L14: + testq $4, MM + jle .L16 + + movapd 0 * SIZE(AO1), %xmm0 + movapd 0 * SIZE(AO1, LDA), %xmm1 + movapd 0 * SIZE(AO2), %xmm2 + movapd 0 * SIZE(AO2, LDA), %xmm3 + + movapd %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + movapd %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + + unpckhpd %xmm1, %xmm4 + unpckhpd %xmm3, %xmm6 + + movapd %xmm0, -16 * SIZE(B) + movapd %xmm2, -14 * SIZE(B) + movapd %xmm4, -12 * SIZE(B) + movapd %xmm6, -10 * SIZE(B) + + movapd 2 * SIZE(AO1), %xmm0 + movapd 2 * SIZE(AO1, LDA), %xmm1 + movapd 2 * SIZE(AO2), %xmm2 + movapd 2 * SIZE(AO2, LDA), %xmm3 + + movapd %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + movapd %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + + unpckhpd %xmm1, %xmm4 + unpckhpd %xmm3, %xmm6 + + movapd %xmm0, -8 * SIZE(B) + movapd %xmm2, -6 * SIZE(B) + movapd %xmm4, -4 * SIZE(B) + movapd %xmm6, -2 * SIZE(B) + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + subq $-16 * SIZE, B + ALIGN_4 + +.L16: + testq $2, MM + jle .L18 + + movapd 0 * SIZE(AO1), %xmm0 + movapd 0 * SIZE(AO1, LDA), %xmm1 + movapd 0 * SIZE(AO2), %xmm2 + movapd 0 * SIZE(AO2, LDA), %xmm3 + + movapd %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + movapd %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + + unpckhpd %xmm1, %xmm4 + unpckhpd %xmm3, %xmm6 + + movapd %xmm0, -16 * SIZE(B) + movapd %xmm2, -14 * SIZE(B) + movapd %xmm4, -12 * SIZE(B) + movapd %xmm6, -10 * SIZE(B) + + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + subq $-8 * SIZE, B + ALIGN_4 + +.L18: + testq $1, MM + jle .L19 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO1, LDA), %xmm1 + movsd 0 * SIZE(AO2), %xmm2 + movsd 0 * SIZE(AO2, LDA), %xmm3 + + unpcklpd %xmm1, %xmm0 + unpcklpd %xmm3, %xmm2 + + movapd %xmm0, -16 * SIZE(B) + movapd %xmm2, -14 * SIZE(B) + subq $-4 * SIZE, B + ALIGN_4 + +.L19: + decq J + jg .L11 + ALIGN_4 + +.L20: + testq $2, N + jle .L30 + + movq A, AO1 + leaq (A, LDA), AO2 + leaq (A, LDA, 2), A + + testq $SIZE, A + je .L22 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO2), %xmm1 + + unpcklpd %xmm1, %xmm0 + + movapd %xmm0, -16 * SIZE(B) + + addq $1 * SIZE, AO1 + addq $1 * SIZE, AO2 + subq $-2 * SIZE, B + ALIGN_3 + +.L22: + movq MM, I + sarq $3, I + jle .L24 + ALIGN_4 + +.L23: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 2 * SIZE(AO1) +#endif + + movapd 0 * SIZE(AO1), %xmm0 + movapd 0 * SIZE(AO2), %xmm1 + movapd 2 * SIZE(AO1), %xmm2 + movapd 2 * SIZE(AO2), %xmm3 + + movapd %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + 
movapd %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + + unpckhpd %xmm1, %xmm4 + unpckhpd %xmm3, %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) +#endif + + movapd %xmm0, -16 * SIZE(B) + movapd %xmm4, -14 * SIZE(B) + movapd %xmm2, -12 * SIZE(B) + movapd %xmm6, -10 * SIZE(B) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 2 * SIZE(AO2) +#endif + + movapd 4 * SIZE(AO1), %xmm0 + movapd 4 * SIZE(AO2), %xmm1 + movapd 6 * SIZE(AO1), %xmm2 + movapd 6 * SIZE(AO2), %xmm3 + + movapd %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + movapd %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + + unpckhpd %xmm1, %xmm4 + unpckhpd %xmm3, %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 8) * SIZE(B) +#endif + + movapd %xmm0, -8 * SIZE(B) + movapd %xmm4, -6 * SIZE(B) + movapd %xmm2, -4 * SIZE(B) + movapd %xmm6, -2 * SIZE(B) + + addq $8 * SIZE, AO1 + addq $8 * SIZE, AO2 + subq $-16 * SIZE, B + + decq I + jg .L23 + ALIGN_4 + +.L24: + testq $4, MM + jle .L26 + + movapd 0 * SIZE(AO1), %xmm0 + movapd 0 * SIZE(AO2), %xmm1 + movapd 2 * SIZE(AO1), %xmm2 + movapd 2 * SIZE(AO2), %xmm3 + + movapd %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm4 + + movapd %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm6 + + movapd %xmm0, -16 * SIZE(B) + movapd %xmm4, -14 * SIZE(B) + movapd %xmm2, -12 * SIZE(B) + movapd %xmm6, -10 * SIZE(B) + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + subq $-8 * SIZE, B + ALIGN_4 + +.L26: + testq $2, MM + jle .L28 + + movapd 0 * SIZE(AO1), %xmm0 + movapd 0 * SIZE(AO2), %xmm1 + + movapd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm2 + + movapd %xmm0, -16 * SIZE(B) + movapd %xmm2, -14 * SIZE(B) + + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + subq $-4 * SIZE, B + ALIGN_4 + +.L28: + testq $1, MM + jle .L30 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO2), %xmm1 + + unpcklpd %xmm1, %xmm0 + + movapd %xmm0, -16 * SIZE(B) + subq $-2 * SIZE, B + ALIGN_4 + +.L30: + testq $1, N + jle .L999 + + movq A, AO1 + + testq $SIZE, A + jne .L35 + + movq MM, I + sarq $3, I + jle .L32 + ALIGN_4 + +.L31: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 4 * SIZE(AO1) +#endif + + movapd 0 * SIZE(AO1), %xmm0 + movapd 2 * SIZE(AO1), %xmm1 + movapd 4 * SIZE(AO1), %xmm2 + movapd 6 * SIZE(AO1), %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) +#endif + + movapd %xmm0, -16 * SIZE(B) + movapd %xmm1, -14 * SIZE(B) + movapd %xmm2, -12 * SIZE(B) + movapd %xmm3, -10 * SIZE(B) + + addq $8 * SIZE, AO1 + subq $-8 * SIZE, B + + decq I + jg .L31 + ALIGN_4 + +.L32: + testq $4, MM + jle .L33 + + movapd 0 * SIZE(AO1), %xmm0 + movapd 2 * SIZE(AO1), %xmm1 + + movapd %xmm0, -16 * SIZE(B) + movapd %xmm1, -14 * SIZE(B) + + addq $4 * SIZE, AO1 + subq $-4 * SIZE, B + ALIGN_4 + +.L33: + testq $2, MM + jle .L34 + + movapd 0 * SIZE(AO1), %xmm0 + + movapd %xmm0, -16 * SIZE(B) + + addq $2 * SIZE, AO1 + subq $-2 * SIZE, B + ALIGN_4 + +.L34: + testq $1, MM + jle .L999 + + movsd 0 * SIZE(AO1), %xmm0 + + movlpd %xmm0, -16 * SIZE(B) + jmp .L999 + ALIGN_4 + +.L35: + movapd -1 * SIZE(AO1), %xmm0 + + movq MM, I + sarq $3, I + jle .L36 + ALIGN_4 + +.L36: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 4 * SIZE(AO1) +#endif + + movapd 1 * SIZE(AO1), %xmm1 + movapd 3 * SIZE(AO1), %xmm2 + movapd 5 * SIZE(AO1), %xmm3 + movapd 7 * SIZE(AO1), %xmm4 + + shufpd $1, %xmm1, %xmm0 + shufpd $1, %xmm2, %xmm1 + shufpd $1, %xmm3, %xmm2 + shufpd $1, %xmm4, %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) +#endif + + movapd %xmm0, -16 * SIZE(B) + movapd %xmm1, -14 * SIZE(B) + movapd %xmm2, -12 * SIZE(B) + movapd 
%xmm3, -10 * SIZE(B) + + movapd %xmm4, %xmm0 + + addq $8 * SIZE, AO1 + subq $-8 * SIZE, B + + decq I + jg .L36 + ALIGN_4 + +.L37: + testq $4, MM + jle .L38 + + movapd 1 * SIZE(AO1), %xmm1 + movapd 3 * SIZE(AO1), %xmm2 + + shufpd $1, %xmm1, %xmm0 + shufpd $1, %xmm2, %xmm1 + + movapd %xmm0, -16 * SIZE(B) + movapd %xmm1, -14 * SIZE(B) + + movapd %xmm2, %xmm0 + + addq $4 * SIZE, AO1 + addq $4 * SIZE, B + ALIGN_4 + +.L38: + testq $2, MM + jle .L39 + + movapd 1 * SIZE(AO1), %xmm1 + + shufpd $1, %xmm1, %xmm0 + + movapd %xmm0, -16 * SIZE(B) + + movapd %xmm1, %xmm0 + + addq $2 * SIZE, AO1 + subq $-2 * SIZE, B + ALIGN_4 + +.L39: + testq $1, MM + jle .L999 + + shufpd $1, %xmm0, %xmm0 + + movlpd %xmm0, -16 * SIZE(B) + jmp .L999 + ALIGN_4 + +.L50: + movq N, J + sarq $2, J + jle .L60 + ALIGN_4 + +.L51: + movq A, AO1 + leaq (A, LDA, 2), AO2 + leaq (A, LDA, 4), A + + testq $SIZE, A + je .L52 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO1, LDA), %xmm1 + movsd 0 * SIZE(AO2), %xmm2 + movsd 0 * SIZE(AO2, LDA), %xmm3 + + unpcklpd %xmm1, %xmm0 + unpcklpd %xmm3, %xmm2 + + movapd %xmm0, -16 * SIZE(B) + movapd %xmm2, -14 * SIZE(B) + + addq $1 * SIZE, AO1 + addq $1 * SIZE, AO2 + subq $-4 * SIZE, B + ALIGN_3 + +.L52: + movapd -1 * SIZE(AO1, LDA), %xmm5 + movapd -1 * SIZE(AO2, LDA), %xmm7 + + movq MM, I + sarq $3, I + jle .L54 + ALIGN_4 + +.L53: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO1) +#endif + + movapd 0 * SIZE(AO1), %xmm0 + movapd 1 * SIZE(AO1, LDA), %xmm1 + movapd 0 * SIZE(AO2), %xmm2 + movapd 1 * SIZE(AO2, LDA), %xmm3 + + movsd %xmm0, %xmm5 + movsd %xmm2, %xmm7 + shufpd $1, %xmm1, %xmm0 + shufpd $1, %xmm3, %xmm2 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) +#endif + + movapd %xmm5, -16 * SIZE(B) + movapd %xmm7, -14 * SIZE(B) + movapd %xmm0, -12 * SIZE(B) + movapd %xmm2, -10 * SIZE(B) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO1, LDA) +#endif + + movapd 2 * SIZE(AO1), %xmm0 + movapd 3 * SIZE(AO1, LDA), %xmm5 + movapd 2 * SIZE(AO2), %xmm2 + movapd 3 * SIZE(AO2, LDA), %xmm7 + + movsd %xmm0, %xmm1 + movsd %xmm2, %xmm3 + shufpd $1, %xmm5, %xmm0 + shufpd $1, %xmm7, %xmm2 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 8) * SIZE(B) +#endif + + movapd %xmm1, -8 * SIZE(B) + movapd %xmm3, -6 * SIZE(B) + movapd %xmm0, -4 * SIZE(B) + movapd %xmm2, -2 * SIZE(B) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO2) +#endif + + movapd 4 * SIZE(AO1), %xmm0 + movapd 5 * SIZE(AO1, LDA), %xmm1 + movapd 4 * SIZE(AO2), %xmm2 + movapd 5 * SIZE(AO2, LDA), %xmm3 + + movsd %xmm0, %xmm5 + movsd %xmm2, %xmm7 + shufpd $1, %xmm1, %xmm0 + shufpd $1, %xmm3, %xmm2 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 16) * SIZE(B) +#endif + + movapd %xmm5, 0 * SIZE(B) + movapd %xmm7, 2 * SIZE(B) + movapd %xmm0, 4 * SIZE(B) + movapd %xmm2, 6 * SIZE(B) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO2, LDA) +#endif + + movapd 6 * SIZE(AO1), %xmm0 + movapd 7 * SIZE(AO1, LDA), %xmm5 + movapd 6 * SIZE(AO2), %xmm2 + movapd 7 * SIZE(AO2, LDA), %xmm7 + + movsd %xmm0, %xmm1 + movsd %xmm2, %xmm3 + shufpd $1, %xmm5, %xmm0 + shufpd $1, %xmm7, %xmm2 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 24) * SIZE(B) +#endif + + movapd %xmm1, 8 * SIZE(B) + movapd %xmm3, 10 * SIZE(B) + movapd %xmm0, 12 * SIZE(B) + movapd %xmm2, 14 * SIZE(B) + + addq $8 * SIZE, AO1 + addq $8 * SIZE, AO2 + subq $-32 * SIZE, B + + decq I + jg .L53 + ALIGN_4 + +.L54: + testq $4, MM + jle .L56 + + movapd 0 * SIZE(AO1), %xmm0 + movapd 1 * SIZE(AO1, LDA), %xmm1 + movapd 0 * SIZE(AO2), %xmm2 + movapd 1 * SIZE(AO2, LDA), %xmm3 + + 
movsd %xmm0, %xmm5 + shufpd $1, %xmm1, %xmm0 + movsd %xmm2, %xmm7 + shufpd $1, %xmm3, %xmm2 + + movapd %xmm5, -16 * SIZE(B) + movapd %xmm7, -14 * SIZE(B) + movapd %xmm0, -12 * SIZE(B) + movapd %xmm2, -10 * SIZE(B) + + movapd 2 * SIZE(AO1), %xmm0 + movapd 3 * SIZE(AO1, LDA), %xmm5 + movapd 2 * SIZE(AO2), %xmm2 + movapd 3 * SIZE(AO2, LDA), %xmm7 + + movsd %xmm0, %xmm1 + shufpd $1, %xmm5, %xmm0 + movsd %xmm2, %xmm3 + shufpd $1, %xmm7, %xmm2 + + movapd %xmm1, -8 * SIZE(B) + movapd %xmm3, -6 * SIZE(B) + movapd %xmm0, -4 * SIZE(B) + movapd %xmm2, -2 * SIZE(B) + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + subq $-16 * SIZE, B + ALIGN_4 + +.L56: + testq $2, MM + jle .L58 + + movapd 0 * SIZE(AO1), %xmm0 + movapd 1 * SIZE(AO1, LDA), %xmm1 + movapd 0 * SIZE(AO2), %xmm2 + movapd 1 * SIZE(AO2, LDA), %xmm3 + + movsd %xmm0, %xmm5 + movsd %xmm2, %xmm7 + shufpd $1, %xmm1, %xmm0 + shufpd $1, %xmm3, %xmm2 + + movapd %xmm5, -16 * SIZE(B) + movapd %xmm7, -14 * SIZE(B) + movapd %xmm0, -12 * SIZE(B) + movapd %xmm2, -10 * SIZE(B) + + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + subq $-8 * SIZE, B + ALIGN_4 + +.L58: + testq $1, MM + jle .L59 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO1, LDA), %xmm1 + movsd 0 * SIZE(AO2), %xmm2 + movsd 0 * SIZE(AO2, LDA), %xmm3 + + unpcklpd %xmm1, %xmm0 + unpcklpd %xmm3, %xmm2 + + movapd %xmm0, -16 * SIZE(B) + movapd %xmm2, -14 * SIZE(B) + subq $-4 * SIZE, B + ALIGN_4 + +.L59: + decq J + jg .L51 + ALIGN_4 + +.L60: + testq $2, N + jle .L70 + + movq A, AO1 + leaq (A, LDA), AO2 + leaq (A, LDA, 2), A + + testq $SIZE, A + je .L62 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO2), %xmm1 + + unpcklpd %xmm1, %xmm0 + + movapd %xmm0, -16 * SIZE(B) + + addq $1 * SIZE, AO1 + addq $1 * SIZE, AO2 + subq $-2 * SIZE, B + ALIGN_3 + +.L62: + movapd -1 * SIZE(AO2), %xmm5 + + movq MM, I + sarq $3, I + jle .L64 + ALIGN_4 + +.L63: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 2 * SIZE(AO1) +#endif + + movapd 0 * SIZE(AO1), %xmm0 + movapd 1 * SIZE(AO2), %xmm1 + movapd 2 * SIZE(AO1), %xmm2 + movapd 3 * SIZE(AO2), %xmm3 + + movsd %xmm0, %xmm5 + shufpd $1, %xmm1, %xmm0 + movsd %xmm2, %xmm1 + shufpd $1, %xmm3, %xmm2 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) +#endif + + movapd %xmm5, -16 * SIZE(B) + movapd %xmm0, -14 * SIZE(B) + movapd %xmm1, -12 * SIZE(B) + movapd %xmm2, -10 * SIZE(B) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 2 * SIZE(AO2) +#endif + + movapd 4 * SIZE(AO1), %xmm0 + movapd 5 * SIZE(AO2), %xmm1 + movapd 6 * SIZE(AO1), %xmm2 + movapd 7 * SIZE(AO2), %xmm5 + + movsd %xmm0, %xmm3 + shufpd $1, %xmm1, %xmm0 + movsd %xmm2, %xmm1 + shufpd $1, %xmm5, %xmm2 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) +#endif + + movapd %xmm3, -8 * SIZE(B) + movapd %xmm0, -6 * SIZE(B) + movapd %xmm1, -4 * SIZE(B) + movapd %xmm2, -2 * SIZE(B) + + addq $8 * SIZE, AO1 + addq $8 * SIZE, AO2 + subq $-16 * SIZE, B + + decq I + jg .L63 + ALIGN_4 + +.L64: + testq $4, MM + jle .L66 + + movapd 0 * SIZE(AO1), %xmm0 + movapd 1 * SIZE(AO2), %xmm1 + movapd 2 * SIZE(AO1), %xmm2 + movapd 3 * SIZE(AO2), %xmm3 + + movsd %xmm0, %xmm5 + shufpd $1, %xmm1, %xmm0 + movsd %xmm2, %xmm1 + shufpd $1, %xmm3, %xmm2 + + movapd %xmm5, -16 * SIZE(B) + movapd %xmm0, -14 * SIZE(B) + movapd %xmm1, -12 * SIZE(B) + movapd %xmm2, -10 * SIZE(B) + + movaps %xmm3, %xmm5 + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + subq $-8 * SIZE, B + ALIGN_4 + +.L66: + testq $2, MM + jle .L68 + + movapd 0 * SIZE(AO1), %xmm0 + movapd 1 * SIZE(AO2), %xmm1 + + movsd %xmm0, %xmm5 + shufpd $1, %xmm1, %xmm0 + + movapd %xmm5, 
-16 * SIZE(B) + movapd %xmm0, -14 * SIZE(B) + + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + subq $-4 * SIZE, B + ALIGN_4 + +.L68: + testq $1, MM + jle .L70 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO2), %xmm1 + + unpcklpd %xmm1, %xmm0 + + movapd %xmm0, -16 * SIZE(B) + subq $-2 * SIZE, B + ALIGN_4 + +.L70: + testq $1, N + jle .L999 + + movq A, AO1 + + testq $SIZE, A + jne .L75 + + movq MM, I + sarq $3, I + jle .L72 + ALIGN_4 + +.L71: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 4 * SIZE(AO1) +#endif + + movapd 0 * SIZE(AO1), %xmm0 + movapd 2 * SIZE(AO1), %xmm2 + movapd 4 * SIZE(AO1), %xmm4 + movapd 6 * SIZE(AO1), %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) +#endif + + movapd %xmm0, -16 * SIZE(B) + movapd %xmm2, -14 * SIZE(B) + movapd %xmm4, -12 * SIZE(B) + movapd %xmm6, -10 * SIZE(B) + + addq $8 * SIZE, AO1 + subq $-8 * SIZE, B + + decq I + jg .L71 + ALIGN_4 + +.L72: + testq $4, MM + jle .L73 + + movapd 0 * SIZE(AO1), %xmm0 + movapd 2 * SIZE(AO1), %xmm2 + + movapd %xmm0, -16 * SIZE(B) + movapd %xmm2, -14 * SIZE(B) + + addq $4 * SIZE, AO1 + subq $-4 * SIZE, B + ALIGN_4 + +.L73: + testq $2, MM + jle .L74 + + movapd 0 * SIZE(AO1), %xmm0 + + movapd %xmm0, -16 * SIZE(B) + + addq $2 * SIZE, AO1 + subq $-2 * SIZE, B + ALIGN_4 + +.L74: + testq $1, MM + jle .L999 + + movsd 0 * SIZE(AO1), %xmm0 + + movlpd %xmm0, -16 * SIZE(B) + jmp .L999 + ALIGN_4 + +.L75: + movapd -1 * SIZE(AO1), %xmm0 + + movq MM, I + sarq $3, I + jle .L76 + ALIGN_4 + +.L76: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 4 * SIZE(AO1) +#endif + + movapd 1 * SIZE(AO1), %xmm1 + movapd 3 * SIZE(AO1), %xmm2 + movapd 5 * SIZE(AO1), %xmm3 + movapd 7 * SIZE(AO1), %xmm4 + + shufpd $1, %xmm1, %xmm0 + shufpd $1, %xmm2, %xmm1 + shufpd $1, %xmm3, %xmm2 + shufpd $1, %xmm4, %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) +#endif + + movapd %xmm0, -16 * SIZE(B) + movapd %xmm1, -14 * SIZE(B) + movapd %xmm2, -12 * SIZE(B) + movapd %xmm3, -10 * SIZE(B) + + movapd %xmm4, %xmm0 + + addq $8 * SIZE, AO1 + subq $-8 * SIZE, B + + decq I + jg .L76 + ALIGN_4 + +.L77: + testq $4, MM + jle .L78 + + movapd 1 * SIZE(AO1), %xmm1 + movapd 3 * SIZE(AO1), %xmm2 + + shufpd $1, %xmm1, %xmm0 + shufpd $1, %xmm2, %xmm1 + + movapd %xmm0, -16 * SIZE(B) + movapd %xmm1, -14 * SIZE(B) + + movapd %xmm2, %xmm0 + + addq $4 * SIZE, AO1 + addq $4 * SIZE, B + ALIGN_4 + +.L78: + testq $2, MM + jle .L79 + + movapd 1 * SIZE(AO1), %xmm1 + + shufpd $1, %xmm1, %xmm0 + + movapd %xmm0, -16 * SIZE(B) + + movapd %xmm1, %xmm0 + + addq $2 * SIZE, AO1 + subq $-2 * SIZE, B + ALIGN_4 + +.L79: + testq $1, MM + jle .L999 + + shufpd $1, %xmm0, %xmm0 + + movlpd %xmm0, -16 * SIZE(B) + ALIGN_4 + +.L999: +#ifdef WINDOWS_ABI + movups 0(%rsp), %xmm6 + movups 16(%rsp), %xmm7 + + addq $STACKSIZE, %rsp +#endif + + popq %r12 + popq %r13 + +#ifdef WINDOWS_ABI + popq %r14 + popq %r15 +#endif + ret + + EPILOGUE diff --git a/kernel/x86_64/dgemm_ncopy_8.S b/kernel/x86_64/dgemm_ncopy_8.S new file mode 100644 index 0000000000..5d3627230e --- /dev/null +++ b/kernel/x86_64/dgemm_ncopy_8.S @@ -0,0 +1,2002 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef NEHALEM +#define PREFETCHSIZE 12 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#ifndef MOVAPS +#define MOVAPS movaps +#endif + +#ifndef WINDOWS_ABI + +#define M ARG1 /* rdi */ +#define N ARG2 /* rsi */ +#define A ARG3 /* rdx */ +#define LDA ARG4 /* rcx */ +#define B ARG5 /* r8 */ + +#define AO1 %r9 +#define AO2 %r10 +#define LDA3 %r11 +#define J %r12 +#define MM %r13 + +#else + +#define STACKSIZE 128 + +#define M ARG1 /* rcx */ +#define N ARG2 /* rdx */ +#define A ARG3 /* r8 */ +#define LDA ARG4 /* r9 */ +#define OLD_B 40 + 32 + STACKSIZE(%rsp) + +#define B %r15 + +#define AO1 %r10 +#define AO2 %r11 +#define LDA3 %r12 +#define J %r13 +#define MM %r14 + +#endif + +#define I %rax + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + pushq %r15 + pushq %r14 +#endif + pushq %r13 + pushq %r12 + +#ifdef WINDOWS_ABI + subq $STACKSIZE, %rsp + + movups %xmm6, 0(%rsp) + movups %xmm7, 16(%rsp) + movups %xmm8, 32(%rsp) + movups %xmm9, 48(%rsp) + movups %xmm10, 64(%rsp) + movups %xmm11, 80(%rsp) + movups %xmm12, 96(%rsp) + + movq OLD_B, B +#endif + + leaq (,LDA, SIZE), LDA + leaq (LDA, LDA, 2), LDA3 + subq $-16 * SIZE, B + + movq M, MM + leaq -1(M), %rax + testq $SIZE, A + cmovne %rax, MM + + testq $SIZE, LDA + jne .L50 + + movq N, J + sarq $3, J + jle .L20 + ALIGN_4 + +.L11: + movq A, AO1 + leaq (A, LDA, 4), AO2 + leaq (A, LDA, 8), A + + testq $SIZE, A + je .L12 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO1, LDA), %xmm1 + movsd 0 * SIZE(AO1, LDA, 2), %xmm2 + movsd 0 * SIZE(AO1, LDA3), %xmm3 + + movsd 0 * SIZE(AO2), %xmm4 + movsd 0 * SIZE(AO2, LDA), %xmm5 + movsd 0 * SIZE(AO2, LDA, 2), %xmm6 + movsd 0 * SIZE(AO2, LDA3), %xmm7 + + unpcklpd %xmm1, %xmm0 + unpcklpd %xmm3, %xmm2 + unpcklpd %xmm5, %xmm4 + unpcklpd %xmm7, %xmm6 + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm2, -14 * SIZE(B) + movaps %xmm4, -12 * SIZE(B) + movaps %xmm6, -10 * SIZE(B) + + addq $1 * SIZE, AO1 + 
addq $1 * SIZE, AO2 + subq $-8 * SIZE, B + ALIGN_3 + +.L12: + movq MM, I + sarq $3, I + jle .L14 + ALIGN_4 + +.L13: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO1) +#endif + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 0 * SIZE(AO1, LDA), %xmm1 + MOVAPS 0 * SIZE(AO1, LDA, 2), %xmm2 + MOVAPS 0 * SIZE(AO1, LDA3), %xmm3 + + movaps %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + movaps %xmm2, %xmm9 + unpcklpd %xmm3, %xmm2 + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO1, LDA) +#endif + + MOVAPS 0 * SIZE(AO2), %xmm4 + MOVAPS 0 * SIZE(AO2, LDA), %xmm5 + MOVAPS 0 * SIZE(AO2, LDA, 2), %xmm6 + MOVAPS 0 * SIZE(AO2, LDA3), %xmm7 + + movaps %xmm4, %xmm10 + unpcklpd %xmm5, %xmm4 + movaps %xmm6, %xmm11 + unpcklpd %xmm7, %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 0) * SIZE(B) +#endif + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm2, -14 * SIZE(B) + movaps %xmm4, -12 * SIZE(B) + movaps %xmm6, -10 * SIZE(B) + + unpckhpd %xmm1, %xmm8 + unpckhpd %xmm3, %xmm9 + unpckhpd %xmm5, %xmm10 + unpckhpd %xmm7, %xmm11 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 8) * SIZE(B) +#endif + + movaps %xmm8, -8 * SIZE(B) + movaps %xmm9, -6 * SIZE(B) + movaps %xmm10, -4 * SIZE(B) + movaps %xmm11, -2 * SIZE(B) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO1, LDA, 2) +#endif + + MOVAPS 2 * SIZE(AO1), %xmm0 + MOVAPS 2 * SIZE(AO1, LDA), %xmm1 + MOVAPS 2 * SIZE(AO1, LDA, 2), %xmm2 + MOVAPS 2 * SIZE(AO1, LDA3), %xmm3 + + movaps %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + movaps %xmm2, %xmm9 + unpcklpd %xmm3, %xmm2 + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO1, LDA3) +#endif + + MOVAPS 2 * SIZE(AO2), %xmm4 + MOVAPS 2 * SIZE(AO2, LDA), %xmm5 + MOVAPS 2 * SIZE(AO2, LDA, 2), %xmm6 + MOVAPS 2 * SIZE(AO2, LDA3), %xmm7 + + movaps %xmm4, %xmm10 + unpcklpd %xmm5, %xmm4 + movaps %xmm6, %xmm11 + unpcklpd %xmm7, %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 16) * SIZE(B) +#endif + + movaps %xmm0, 0 * SIZE(B) + movaps %xmm2, 2 * SIZE(B) + movaps %xmm4, 4 * SIZE(B) + movaps %xmm6, 6 * SIZE(B) + + unpckhpd %xmm1, %xmm8 + unpckhpd %xmm3, %xmm9 + unpckhpd %xmm5, %xmm10 + unpckhpd %xmm7, %xmm11 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 24) * SIZE(B) +#endif + + movaps %xmm8, 8 * SIZE(B) + movaps %xmm9, 10 * SIZE(B) + movaps %xmm10, 12 * SIZE(B) + movaps %xmm11, 14 * SIZE(B) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO2) +#endif + + MOVAPS 4 * SIZE(AO1), %xmm0 + MOVAPS 4 * SIZE(AO1, LDA), %xmm1 + MOVAPS 4 * SIZE(AO1, LDA, 2), %xmm2 + MOVAPS 4 * SIZE(AO1, LDA3), %xmm3 + + movaps %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + movaps %xmm2, %xmm9 + unpcklpd %xmm3, %xmm2 + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO2, LDA) +#endif + + MOVAPS 4 * SIZE(AO2), %xmm4 + MOVAPS 4 * SIZE(AO2, LDA), %xmm5 + MOVAPS 4 * SIZE(AO2, LDA, 2), %xmm6 + MOVAPS 4 * SIZE(AO2, LDA3), %xmm7 + + movaps %xmm4, %xmm10 + unpcklpd %xmm5, %xmm4 + movaps %xmm6, %xmm11 + unpcklpd %xmm7, %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 32) * SIZE(B) +#endif + + movaps %xmm0, 16 * SIZE(B) + movaps %xmm2, 18 * SIZE(B) + movaps %xmm4, 20 * SIZE(B) + movaps %xmm6, 22 * SIZE(B) + + unpckhpd %xmm1, %xmm8 + unpckhpd %xmm3, %xmm9 + unpckhpd %xmm5, %xmm10 + unpckhpd %xmm7, %xmm11 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 40) * SIZE(B) +#endif + + movaps %xmm8, 24 * SIZE(B) + movaps %xmm9, 26 * SIZE(B) + movaps %xmm10, 28 * SIZE(B) + movaps %xmm11, 30 * SIZE(B) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO2, LDA, 2) +#endif + + MOVAPS 6 * SIZE(AO1), %xmm0 + MOVAPS 6 * SIZE(AO1, LDA), %xmm1 + MOVAPS 6 * SIZE(AO1, LDA, 
2), %xmm2 + MOVAPS 6 * SIZE(AO1, LDA3), %xmm3 + + movaps %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + movaps %xmm2, %xmm9 + unpcklpd %xmm3, %xmm2 + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO2, LDA3) +#endif + + MOVAPS 6 * SIZE(AO2), %xmm4 + MOVAPS 6 * SIZE(AO2, LDA), %xmm5 + MOVAPS 6 * SIZE(AO2, LDA, 2), %xmm6 + MOVAPS 6 * SIZE(AO2, LDA3), %xmm7 + + movaps %xmm4, %xmm10 + unpcklpd %xmm5, %xmm4 + movaps %xmm6, %xmm11 + unpcklpd %xmm7, %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 48) * SIZE(B) +#endif + + movaps %xmm0, 32 * SIZE(B) + movaps %xmm2, 34 * SIZE(B) + movaps %xmm4, 36 * SIZE(B) + movaps %xmm6, 38 * SIZE(B) + + unpckhpd %xmm1, %xmm8 + unpckhpd %xmm3, %xmm9 + unpckhpd %xmm5, %xmm10 + unpckhpd %xmm7, %xmm11 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 56) * SIZE(B) +#endif + + movaps %xmm8, 40 * SIZE(B) + movaps %xmm9, 42 * SIZE(B) + movaps %xmm10, 44 * SIZE(B) + movaps %xmm11, 46 * SIZE(B) + + addq $8 * SIZE, AO1 + addq $8 * SIZE, AO2 + subq $-64 * SIZE, B + + decq I + jg .L13 + ALIGN_4 + +.L14: + testq $4, MM + jle .L16 + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 0 * SIZE(AO1, LDA), %xmm1 + MOVAPS 0 * SIZE(AO1, LDA, 2), %xmm2 + MOVAPS 0 * SIZE(AO1, LDA3), %xmm3 + + MOVAPS 0 * SIZE(AO2), %xmm4 + MOVAPS 0 * SIZE(AO2, LDA), %xmm5 + MOVAPS 0 * SIZE(AO2, LDA, 2), %xmm6 + MOVAPS 0 * SIZE(AO2, LDA3), %xmm7 + + movaps %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + movaps %xmm2, %xmm9 + unpcklpd %xmm3, %xmm2 + + movaps %xmm4, %xmm10 + unpcklpd %xmm5, %xmm4 + movaps %xmm6, %xmm11 + unpcklpd %xmm7, %xmm6 + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm2, -14 * SIZE(B) + movaps %xmm4, -12 * SIZE(B) + movaps %xmm6, -10 * SIZE(B) + + unpckhpd %xmm1, %xmm8 + unpckhpd %xmm3, %xmm9 + unpckhpd %xmm5, %xmm10 + unpckhpd %xmm7, %xmm11 + + movaps %xmm8, -8 * SIZE(B) + movaps %xmm9, -6 * SIZE(B) + movaps %xmm10, -4 * SIZE(B) + movaps %xmm11, -2 * SIZE(B) + + MOVAPS 2 * SIZE(AO1), %xmm0 + MOVAPS 2 * SIZE(AO1, LDA), %xmm1 + MOVAPS 2 * SIZE(AO1, LDA, 2), %xmm2 + MOVAPS 2 * SIZE(AO1, LDA3), %xmm3 + + MOVAPS 2 * SIZE(AO2), %xmm4 + MOVAPS 2 * SIZE(AO2, LDA), %xmm5 + MOVAPS 2 * SIZE(AO2, LDA, 2), %xmm6 + MOVAPS 2 * SIZE(AO2, LDA3), %xmm7 + + movaps %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + movaps %xmm2, %xmm9 + unpcklpd %xmm3, %xmm2 + + movaps %xmm4, %xmm10 + unpcklpd %xmm5, %xmm4 + movaps %xmm6, %xmm11 + unpcklpd %xmm7, %xmm6 + + movaps %xmm0, 0 * SIZE(B) + movaps %xmm2, 2 * SIZE(B) + movaps %xmm4, 4 * SIZE(B) + movaps %xmm6, 6 * SIZE(B) + + unpckhpd %xmm1, %xmm8 + unpckhpd %xmm3, %xmm9 + unpckhpd %xmm5, %xmm10 + unpckhpd %xmm7, %xmm11 + + movaps %xmm8, 8 * SIZE(B) + movaps %xmm9, 10 * SIZE(B) + movaps %xmm10, 12 * SIZE(B) + movaps %xmm11, 14 * SIZE(B) + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + subq $-32 * SIZE, B + ALIGN_4 + +.L16: + testq $2, MM + jle .L18 + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 0 * SIZE(AO1, LDA), %xmm1 + MOVAPS 0 * SIZE(AO1, LDA, 2), %xmm2 + MOVAPS 0 * SIZE(AO1, LDA3), %xmm3 + + MOVAPS 0 * SIZE(AO2), %xmm4 + MOVAPS 0 * SIZE(AO2, LDA), %xmm5 + MOVAPS 0 * SIZE(AO2, LDA, 2), %xmm6 + MOVAPS 0 * SIZE(AO2, LDA3), %xmm7 + + movaps %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + movaps %xmm2, %xmm9 + unpcklpd %xmm3, %xmm2 + + movaps %xmm4, %xmm10 + unpcklpd %xmm5, %xmm4 + movaps %xmm6, %xmm11 + unpcklpd %xmm7, %xmm6 + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm2, -14 * SIZE(B) + movaps %xmm4, -12 * SIZE(B) + movaps %xmm6, -10 * SIZE(B) + + unpckhpd %xmm1, %xmm8 + unpckhpd %xmm3, %xmm9 + unpckhpd %xmm5, %xmm10 + unpckhpd %xmm7, %xmm11 + + movaps %xmm8, -8 * SIZE(B) + movaps %xmm9, -6 * SIZE(B) + 
movaps %xmm10, -4 * SIZE(B) + movaps %xmm11, -2 * SIZE(B) + + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + subq $-16 * SIZE, B + ALIGN_4 + +.L18: + testq $1, MM + jle .L19 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO1, LDA), %xmm1 + movsd 0 * SIZE(AO1, LDA, 2), %xmm2 + movsd 0 * SIZE(AO1, LDA3), %xmm3 + + movsd 0 * SIZE(AO2), %xmm4 + movsd 0 * SIZE(AO2, LDA), %xmm5 + movsd 0 * SIZE(AO2, LDA, 2), %xmm6 + movsd 0 * SIZE(AO2, LDA3), %xmm7 + + unpcklpd %xmm1, %xmm0 + unpcklpd %xmm3, %xmm2 + unpcklpd %xmm5, %xmm4 + unpcklpd %xmm7, %xmm6 + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm2, -14 * SIZE(B) + movaps %xmm4, -12 * SIZE(B) + movaps %xmm6, -10 * SIZE(B) + + subq $-8 * SIZE, B + ALIGN_4 + +.L19: + decq J + jg .L11 + ALIGN_4 + +.L20: + testq $4, N + jle .L30 + + movq A, AO1 + leaq (A, LDA, 2), AO2 + leaq (A, LDA, 4), A + + testq $SIZE, A + je .L22 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO1, LDA), %xmm1 + movsd 0 * SIZE(AO2), %xmm2 + movsd 0 * SIZE(AO2, LDA), %xmm3 + + unpcklpd %xmm1, %xmm0 + unpcklpd %xmm3, %xmm2 + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm2, -14 * SIZE(B) + + addq $1 * SIZE, AO1 + addq $1 * SIZE, AO2 + subq $-4 * SIZE, B + ALIGN_3 + +.L22: + movq MM, I + sarq $3, I + jle .L24 + ALIGN_4 + +.L23: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 2 * SIZE(AO1) +#endif + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 0 * SIZE(AO1, LDA), %xmm1 + MOVAPS 0 * SIZE(AO2), %xmm2 + MOVAPS 0 * SIZE(AO2, LDA), %xmm3 + + movaps %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + movaps %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + + unpckhpd %xmm1, %xmm4 + unpckhpd %xmm3, %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 0) * SIZE(B) +#endif + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm2, -14 * SIZE(B) + movaps %xmm4, -12 * SIZE(B) + movaps %xmm6, -10 * SIZE(B) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 2 * SIZE(AO1, LDA) +#endif + + MOVAPS 2 * SIZE(AO1), %xmm0 + MOVAPS 2 * SIZE(AO1, LDA), %xmm1 + MOVAPS 2 * SIZE(AO2), %xmm2 + MOVAPS 2 * SIZE(AO2, LDA), %xmm3 + + movaps %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + movaps %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + + unpckhpd %xmm1, %xmm4 + unpckhpd %xmm3, %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 8) * SIZE(B) +#endif + + movaps %xmm0, -8 * SIZE(B) + movaps %xmm2, -6 * SIZE(B) + movaps %xmm4, -4 * SIZE(B) + movaps %xmm6, -2 * SIZE(B) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 2 * SIZE(AO2) +#endif + + MOVAPS 4 * SIZE(AO1), %xmm0 + MOVAPS 4 * SIZE(AO1, LDA), %xmm1 + MOVAPS 4 * SIZE(AO2), %xmm2 + MOVAPS 4 * SIZE(AO2, LDA), %xmm3 + + movaps %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + movaps %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + + unpckhpd %xmm1, %xmm4 + unpckhpd %xmm3, %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 16) * SIZE(B) +#endif + + movaps %xmm0, 0 * SIZE(B) + movaps %xmm2, 2 * SIZE(B) + movaps %xmm4, 4 * SIZE(B) + movaps %xmm6, 6 * SIZE(B) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 2 * SIZE(AO2, LDA) +#endif + + MOVAPS 6 * SIZE(AO1), %xmm0 + MOVAPS 6 * SIZE(AO1, LDA), %xmm1 + MOVAPS 6 * SIZE(AO2), %xmm2 + MOVAPS 6 * SIZE(AO2, LDA), %xmm3 + + movaps %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + movaps %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + + unpckhpd %xmm1, %xmm4 + unpckhpd %xmm3, %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 24) * SIZE(B) +#endif + + movaps %xmm0, 8 * SIZE(B) + movaps %xmm2, 10 * SIZE(B) + movaps %xmm4, 12 * SIZE(B) + movaps %xmm6, 14 * SIZE(B) + + addq $8 * SIZE, AO1 + addq $8 * SIZE, AO2 + subq $-32 * SIZE, B + + decq I + jg .L23 + ALIGN_4 + +.L24: + testq $4, MM + jle .L26 + + MOVAPS 0 * SIZE(AO1), %xmm0 
+ MOVAPS 0 * SIZE(AO1, LDA), %xmm1 + MOVAPS 0 * SIZE(AO2), %xmm2 + MOVAPS 0 * SIZE(AO2, LDA), %xmm3 + + movaps %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + movaps %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + + unpckhpd %xmm1, %xmm4 + unpckhpd %xmm3, %xmm6 + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm2, -14 * SIZE(B) + movaps %xmm4, -12 * SIZE(B) + movaps %xmm6, -10 * SIZE(B) + + MOVAPS 2 * SIZE(AO1), %xmm0 + MOVAPS 2 * SIZE(AO1, LDA), %xmm1 + MOVAPS 2 * SIZE(AO2), %xmm2 + MOVAPS 2 * SIZE(AO2, LDA), %xmm3 + + movaps %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + movaps %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + + unpckhpd %xmm1, %xmm4 + unpckhpd %xmm3, %xmm6 + + movaps %xmm0, -8 * SIZE(B) + movaps %xmm2, -6 * SIZE(B) + movaps %xmm4, -4 * SIZE(B) + movaps %xmm6, -2 * SIZE(B) + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + subq $-16 * SIZE, B + ALIGN_4 + +.L26: + testq $2, MM + jle .L28 + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 0 * SIZE(AO1, LDA), %xmm1 + MOVAPS 0 * SIZE(AO2), %xmm2 + MOVAPS 0 * SIZE(AO2, LDA), %xmm3 + + movaps %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + movaps %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + + unpckhpd %xmm1, %xmm4 + unpckhpd %xmm3, %xmm6 + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm2, -14 * SIZE(B) + movaps %xmm4, -12 * SIZE(B) + movaps %xmm6, -10 * SIZE(B) + + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + subq $-8 * SIZE, B + ALIGN_4 + +.L28: + testq $1, MM + jle .L30 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO1, LDA), %xmm1 + movsd 0 * SIZE(AO2), %xmm2 + movsd 0 * SIZE(AO2, LDA), %xmm3 + + unpcklpd %xmm1, %xmm0 + unpcklpd %xmm3, %xmm2 + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm2, -14 * SIZE(B) + subq $-4 * SIZE, B + ALIGN_4 + +.L30: + testq $2, N + jle .L40 + + movq A, AO1 + leaq (A, LDA), AO2 + leaq (A, LDA, 2), A + + testq $SIZE, A + je .L32 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO2), %xmm1 + + unpcklpd %xmm1, %xmm0 + + movaps %xmm0, -16 * SIZE(B) + + addq $1 * SIZE, AO1 + addq $1 * SIZE, AO2 + subq $-2 * SIZE, B + ALIGN_3 + +.L32: + movq MM, I + sarq $3, I + jle .L34 + ALIGN_4 + +.L33: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 4 * SIZE(AO1) +#endif + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 0 * SIZE(AO2), %xmm1 + MOVAPS 2 * SIZE(AO1), %xmm2 + MOVAPS 2 * SIZE(AO2), %xmm3 + + movaps %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + movaps %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + + unpckhpd %xmm1, %xmm4 + unpckhpd %xmm3, %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 0) * SIZE(B) +#endif + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm4, -14 * SIZE(B) + movaps %xmm2, -12 * SIZE(B) + movaps %xmm6, -10 * SIZE(B) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 4 * SIZE(AO2) +#endif + + MOVAPS 4 * SIZE(AO1), %xmm0 + MOVAPS 4 * SIZE(AO2), %xmm1 + MOVAPS 6 * SIZE(AO1), %xmm2 + MOVAPS 6 * SIZE(AO2), %xmm3 + + movaps %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + movaps %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + + unpckhpd %xmm1, %xmm4 + unpckhpd %xmm3, %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 8) * SIZE(B) +#endif + + movaps %xmm0, -8 * SIZE(B) + movaps %xmm4, -6 * SIZE(B) + movaps %xmm2, -4 * SIZE(B) + movaps %xmm6, -2 * SIZE(B) + + addq $8 * SIZE, AO1 + addq $8 * SIZE, AO2 + subq $-16 * SIZE, B + + decq I + jg .L33 + ALIGN_4 + +.L34: + testq $4, MM + jle .L36 + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 0 * SIZE(AO2), %xmm1 + MOVAPS 2 * SIZE(AO1), %xmm2 + MOVAPS 2 * SIZE(AO2), %xmm3 + + movaps %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm4 + + movaps %xmm2, %xmm6 + unpcklpd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm6 + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm4, -14 * SIZE(B) + 
movaps %xmm2, -12 * SIZE(B) + movaps %xmm6, -10 * SIZE(B) + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + subq $-8 * SIZE, B + ALIGN_4 + +.L36: + testq $2, MM + jle .L38 + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 0 * SIZE(AO2), %xmm1 + + movaps %xmm0, %xmm2 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm2 + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm2, -14 * SIZE(B) + + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + subq $-4 * SIZE, B + ALIGN_4 + +.L38: + testq $1, MM + jle .L40 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO2), %xmm1 + + unpcklpd %xmm1, %xmm0 + + movaps %xmm0, -16 * SIZE(B) + subq $-2 * SIZE, B + ALIGN_4 + +.L40: + testq $1, N + jle .L999 + + movq A, AO1 + + testq $SIZE, A + jne .L45 + + movq MM, I + sarq $3, I + jle .L42 + ALIGN_4 + +.L41: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 8 * SIZE(AO1) +#endif + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 2 * SIZE(AO1), %xmm1 + MOVAPS 4 * SIZE(AO1), %xmm2 + MOVAPS 6 * SIZE(AO1), %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 0) * SIZE(B) +#endif + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm1, -14 * SIZE(B) + movaps %xmm2, -12 * SIZE(B) + movaps %xmm3, -10 * SIZE(B) + + addq $8 * SIZE, AO1 + subq $-8 * SIZE, B + + decq I + jg .L41 + ALIGN_4 + +.L42: + testq $4, MM + jle .L43 + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 2 * SIZE(AO1), %xmm1 + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm1, -14 * SIZE(B) + + addq $4 * SIZE, AO1 + subq $-4 * SIZE, B + ALIGN_4 + +.L43: + testq $2, MM + jle .L44 + + MOVAPS 0 * SIZE(AO1), %xmm0 + + movaps %xmm0, -16 * SIZE(B) + + addq $2 * SIZE, AO1 + subq $-2 * SIZE, B + ALIGN_4 + +.L44: + testq $1, MM + jle .L999 + + movsd 0 * SIZE(AO1), %xmm0 + + movlpd %xmm0, -16 * SIZE(B) + jmp .L999 + ALIGN_4 + +.L45: + MOVAPS -1 * SIZE(AO1), %xmm0 + + movq M, I + sarq $3, I + jle .L46 + ALIGN_4 + +.L46: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 8 * SIZE(AO1) +#endif + + MOVAPS 1 * SIZE(AO1), %xmm1 + MOVAPS 3 * SIZE(AO1), %xmm2 + MOVAPS 5 * SIZE(AO1), %xmm3 + MOVAPS 7 * SIZE(AO1), %xmm4 + + shufpd $1, %xmm1, %xmm0 + shufpd $1, %xmm2, %xmm1 + shufpd $1, %xmm3, %xmm2 + shufpd $1, %xmm4, %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 0) * SIZE(B) +#endif + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm1, -14 * SIZE(B) + movaps %xmm2, -12 * SIZE(B) + movaps %xmm3, -10 * SIZE(B) + + movaps %xmm4, %xmm0 + + addq $8 * SIZE, AO1 + subq $-8 * SIZE, B + + decq I + jg .L46 + ALIGN_4 + +.L47: + testq $4, M + jle .L48 + + MOVAPS 1 * SIZE(AO1), %xmm1 + MOVAPS 3 * SIZE(AO1), %xmm2 + + shufpd $1, %xmm1, %xmm0 + shufpd $1, %xmm2, %xmm1 + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm1, -14 * SIZE(B) + + movaps %xmm2, %xmm0 + + addq $4 * SIZE, AO1 + addq $4 * SIZE, B + ALIGN_4 + +.L48: + testq $2, M + jle .L49 + + MOVAPS 1 * SIZE(AO1), %xmm1 + + shufpd $1, %xmm1, %xmm0 + + movaps %xmm0, -16 * SIZE(B) + + movaps %xmm1, %xmm0 + + addq $2 * SIZE, AO1 + subq $-2 * SIZE, B + ALIGN_4 + +.L49: + testq $1, M + jle .L999 + + shufpd $1, %xmm0, %xmm0 + + movlpd %xmm0, -16 * SIZE(B) + jmp .L999 + ALIGN_4 + +.L50: + movq N, J + sarq $3, J + jle .L60 + ALIGN_4 + +.L51: + movq A, AO1 + leaq (A, LDA, 4), AO2 + leaq (A, LDA, 8), A + + testq $SIZE, A + je .L52 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO1, LDA), %xmm1 + movsd 0 * SIZE(AO1, LDA, 2), %xmm2 + movsd 0 * SIZE(AO1, LDA3), %xmm3 + movsd 0 * SIZE(AO2), %xmm4 + movsd 0 * SIZE(AO2, LDA), %xmm5 + movsd 0 * SIZE(AO2, LDA, 2), %xmm6 + movsd 0 * SIZE(AO2, LDA3), %xmm7 + + unpcklpd %xmm1, %xmm0 + unpcklpd %xmm3, %xmm2 + unpcklpd %xmm5, %xmm4 + unpcklpd %xmm7, %xmm6 + + movaps %xmm0, -16 
* SIZE(B) + movaps %xmm2, -14 * SIZE(B) + movaps %xmm4, -12 * SIZE(B) + movaps %xmm6, -10 * SIZE(B) + + addq $1 * SIZE, AO1 + addq $1 * SIZE, AO2 + subq $-8 * SIZE, B + ALIGN_3 + +.L52: + MOVAPS -1 * SIZE(AO1, LDA), %xmm9 + MOVAPS -1 * SIZE(AO1, LDA3), %xmm10 + MOVAPS -1 * SIZE(AO2, LDA), %xmm11 + MOVAPS -1 * SIZE(AO2, LDA3), %xmm12 + + movq MM, I + sarq $3, I + jle .L54 + ALIGN_4 + +.L53: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO1) +#endif + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 1 * SIZE(AO1, LDA), %xmm1 + MOVAPS 0 * SIZE(AO1, LDA, 2), %xmm2 + MOVAPS 1 * SIZE(AO1, LDA3), %xmm3 + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO1, LDA) +#endif + + MOVAPS 0 * SIZE(AO2), %xmm4 + MOVAPS 1 * SIZE(AO2, LDA), %xmm5 + MOVAPS 0 * SIZE(AO2, LDA, 2), %xmm6 + MOVAPS 1 * SIZE(AO2, LDA3), %xmm7 + + movsd %xmm0, %xmm9 + movsd %xmm2, %xmm10 + movsd %xmm4, %xmm11 + movsd %xmm6, %xmm12 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 0) * SIZE(B) +#endif + + movaps %xmm9, -16 * SIZE(B) + movaps %xmm10, -14 * SIZE(B) + movaps %xmm11, -12 * SIZE(B) + movaps %xmm12, -10 * SIZE(B) + + shufpd $1, %xmm1, %xmm0 + shufpd $1, %xmm3, %xmm2 + shufpd $1, %xmm5, %xmm4 + shufpd $1, %xmm7, %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 8) * SIZE(B) +#endif + + movaps %xmm0, -8 * SIZE(B) + movaps %xmm2, -6 * SIZE(B) + movaps %xmm4, -4 * SIZE(B) + movaps %xmm6, -2 * SIZE(B) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO1, LDA, 2) +#endif + + MOVAPS 2 * SIZE(AO1), %xmm0 + MOVAPS 3 * SIZE(AO1, LDA), %xmm9 + MOVAPS 2 * SIZE(AO1, LDA, 2), %xmm2 + MOVAPS 3 * SIZE(AO1, LDA3), %xmm10 + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO1, LDA3) +#endif + + MOVAPS 2 * SIZE(AO2), %xmm4 + MOVAPS 3 * SIZE(AO2, LDA), %xmm11 + MOVAPS 2 * SIZE(AO2, LDA, 2), %xmm6 + MOVAPS 3 * SIZE(AO2, LDA3), %xmm12 + + movsd %xmm0, %xmm1 + movsd %xmm2, %xmm3 + movsd %xmm4, %xmm5 + movsd %xmm6, %xmm7 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 16) * SIZE(B) +#endif + + movaps %xmm1, 0 * SIZE(B) + movaps %xmm3, 2 * SIZE(B) + movaps %xmm5, 4 * SIZE(B) + movaps %xmm7, 6 * SIZE(B) + + shufpd $1, %xmm9, %xmm0 + shufpd $1, %xmm10, %xmm2 + shufpd $1, %xmm11, %xmm4 + shufpd $1, %xmm12, %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 24) * SIZE(B) +#endif + + movaps %xmm0, 8 * SIZE(B) + movaps %xmm2, 10 * SIZE(B) + movaps %xmm4, 12 * SIZE(B) + movaps %xmm6, 14 * SIZE(B) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO2) +#endif + + MOVAPS 4 * SIZE(AO1), %xmm0 + MOVAPS 5 * SIZE(AO1, LDA), %xmm1 + MOVAPS 4 * SIZE(AO1, LDA, 2), %xmm2 + MOVAPS 5 * SIZE(AO1, LDA3), %xmm3 + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO2, LDA) +#endif + + MOVAPS 4 * SIZE(AO2), %xmm4 + MOVAPS 5 * SIZE(AO2, LDA), %xmm5 + MOVAPS 4 * SIZE(AO2, LDA, 2), %xmm6 + MOVAPS 5 * SIZE(AO2, LDA3), %xmm7 + + movsd %xmm0, %xmm9 + movsd %xmm2, %xmm10 + movsd %xmm4, %xmm11 + movsd %xmm6, %xmm12 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 32) * SIZE(B) +#endif + + movaps %xmm9, 16 * SIZE(B) + movaps %xmm10, 18 * SIZE(B) + movaps %xmm11, 20 * SIZE(B) + movaps %xmm12, 22 * SIZE(B) + + shufpd $1, %xmm1, %xmm0 + shufpd $1, %xmm3, %xmm2 + shufpd $1, %xmm5, %xmm4 + shufpd $1, %xmm7, %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 8) * SIZE(B) +#endif + + movaps %xmm0, 24 * SIZE(B) + movaps %xmm2, 26 * SIZE(B) + movaps %xmm4, 28 * SIZE(B) + movaps %xmm6, 30 * SIZE(B) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO2, LDA, 2) +#endif + + MOVAPS 6 * SIZE(AO1), %xmm0 + MOVAPS 7 * SIZE(AO1, LDA), %xmm9 + MOVAPS 6 * SIZE(AO1, LDA, 2), 
%xmm2 + MOVAPS 7 * SIZE(AO1, LDA3), %xmm10 + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO2, LDA3) +#endif + + MOVAPS 6 * SIZE(AO2), %xmm4 + MOVAPS 7 * SIZE(AO2, LDA), %xmm11 + MOVAPS 6 * SIZE(AO2, LDA, 2), %xmm6 + MOVAPS 7 * SIZE(AO2, LDA3), %xmm12 + + movsd %xmm0, %xmm1 + movsd %xmm2, %xmm3 + movsd %xmm4, %xmm5 + movsd %xmm6, %xmm7 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 40) * SIZE(B) +#endif + + movaps %xmm1, 32 * SIZE(B) + movaps %xmm3, 34 * SIZE(B) + movaps %xmm5, 36 * SIZE(B) + movaps %xmm7, 38 * SIZE(B) + + shufpd $1, %xmm9, %xmm0 + shufpd $1, %xmm10, %xmm2 + shufpd $1, %xmm11, %xmm4 + shufpd $1, %xmm12, %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 48) * SIZE(B) +#endif + movaps %xmm0, 40 * SIZE(B) + movaps %xmm2, 42 * SIZE(B) + movaps %xmm4, 44 * SIZE(B) + movaps %xmm6, 46 * SIZE(B) + + addq $8 * SIZE, AO1 + addq $8 * SIZE, AO2 + subq $-64 * SIZE, B + + decq I + jg .L53 + ALIGN_4 + +.L54: + testq $4, MM + jle .L56 + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 1 * SIZE(AO1, LDA), %xmm1 + MOVAPS 0 * SIZE(AO1, LDA, 2), %xmm2 + MOVAPS 1 * SIZE(AO1, LDA3), %xmm3 + MOVAPS 0 * SIZE(AO2), %xmm4 + MOVAPS 1 * SIZE(AO2, LDA), %xmm5 + MOVAPS 0 * SIZE(AO2, LDA, 2), %xmm6 + MOVAPS 1 * SIZE(AO2, LDA3), %xmm7 + + movsd %xmm0, %xmm9 + movsd %xmm2, %xmm10 + movsd %xmm4, %xmm11 + movsd %xmm6, %xmm12 + + movaps %xmm9, -16 * SIZE(B) + movaps %xmm10, -14 * SIZE(B) + movaps %xmm11, -12 * SIZE(B) + movaps %xmm12, -10 * SIZE(B) + + shufpd $1, %xmm1, %xmm0 + shufpd $1, %xmm3, %xmm2 + shufpd $1, %xmm5, %xmm4 + shufpd $1, %xmm7, %xmm6 + + movaps %xmm0, -8 * SIZE(B) + movaps %xmm2, -6 * SIZE(B) + movaps %xmm4, -4 * SIZE(B) + movaps %xmm6, -2 * SIZE(B) + + MOVAPS 2 * SIZE(AO1), %xmm0 + MOVAPS 3 * SIZE(AO1, LDA), %xmm9 + MOVAPS 2 * SIZE(AO1, LDA, 2), %xmm2 + MOVAPS 3 * SIZE(AO1, LDA3), %xmm10 + MOVAPS 2 * SIZE(AO2), %xmm4 + MOVAPS 3 * SIZE(AO2, LDA), %xmm11 + MOVAPS 2 * SIZE(AO2, LDA, 2), %xmm6 + MOVAPS 3 * SIZE(AO2, LDA3), %xmm12 + + movsd %xmm0, %xmm1 + movsd %xmm2, %xmm3 + movsd %xmm4, %xmm5 + movsd %xmm6, %xmm7 + + movaps %xmm1, 0 * SIZE(B) + movaps %xmm3, 2 * SIZE(B) + movaps %xmm5, 4 * SIZE(B) + movaps %xmm7, 6 * SIZE(B) + + shufpd $1, %xmm9, %xmm0 + shufpd $1, %xmm10, %xmm2 + shufpd $1, %xmm11, %xmm4 + shufpd $1, %xmm12, %xmm6 + + movaps %xmm0, 8 * SIZE(B) + movaps %xmm2, 10 * SIZE(B) + movaps %xmm4, 12 * SIZE(B) + movaps %xmm6, 14 * SIZE(B) + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + subq $-32 * SIZE, B + ALIGN_4 + +.L56: + testq $2, MM + jle .L58 + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 1 * SIZE(AO1, LDA), %xmm1 + MOVAPS 0 * SIZE(AO1, LDA, 2), %xmm2 + MOVAPS 1 * SIZE(AO1, LDA3), %xmm3 + MOVAPS 0 * SIZE(AO2), %xmm4 + MOVAPS 1 * SIZE(AO2, LDA), %xmm5 + MOVAPS 0 * SIZE(AO2, LDA, 2), %xmm6 + MOVAPS 1 * SIZE(AO2, LDA3), %xmm7 + + movsd %xmm0, %xmm9 + movsd %xmm2, %xmm10 + movsd %xmm4, %xmm11 + movsd %xmm6, %xmm12 + + movaps %xmm9, -16 * SIZE(B) + movaps %xmm10, -14 * SIZE(B) + movaps %xmm11, -12 * SIZE(B) + movaps %xmm12, -10 * SIZE(B) + + shufpd $1, %xmm1, %xmm0 + shufpd $1, %xmm3, %xmm2 + shufpd $1, %xmm5, %xmm4 + shufpd $1, %xmm7, %xmm6 + + movaps %xmm0, -8 * SIZE(B) + movaps %xmm2, -6 * SIZE(B) + movaps %xmm4, -4 * SIZE(B) + movaps %xmm6, -2 * SIZE(B) + + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + subq $-16 * SIZE, B + ALIGN_4 + +.L58: + testq $1, MM + jle .L59 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO1, LDA), %xmm1 + movsd 0 * SIZE(AO1, LDA, 2), %xmm2 + movsd 0 * SIZE(AO1, LDA3), %xmm3 + movsd 0 * SIZE(AO2), %xmm4 + movsd 0 * SIZE(AO2, LDA), %xmm5 + movsd 0 * 
SIZE(AO2, LDA, 2), %xmm6 + movsd 0 * SIZE(AO2, LDA3), %xmm7 + + unpcklpd %xmm1, %xmm0 + unpcklpd %xmm3, %xmm2 + unpcklpd %xmm5, %xmm4 + unpcklpd %xmm7, %xmm6 + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm2, -14 * SIZE(B) + movaps %xmm4, -12 * SIZE(B) + movaps %xmm6, -10 * SIZE(B) + + subq $-8 * SIZE, B + ALIGN_4 + +.L59: + decq J + jg .L51 + ALIGN_4 + +.L60: + testq $4, N + jle .L70 + + movq A, AO1 + leaq (A, LDA, 2), AO2 + leaq (A, LDA, 4), A + + testq $SIZE, A + je .L62 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO1, LDA), %xmm1 + movsd 0 * SIZE(AO2), %xmm2 + movsd 0 * SIZE(AO2, LDA), %xmm3 + + unpcklpd %xmm1, %xmm0 + unpcklpd %xmm3, %xmm2 + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm2, -14 * SIZE(B) + + addq $1 * SIZE, AO1 + addq $1 * SIZE, AO2 + subq $-4 * SIZE, B + ALIGN_3 + +.L62: + movaps -1 * SIZE(AO1, LDA), %xmm5 + movaps -1 * SIZE(AO2, LDA), %xmm7 + + movq MM, I + sarq $3, I + jle .L64 + ALIGN_4 + +.L63: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 2 * SIZE(AO1) +#endif + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 1 * SIZE(AO1, LDA), %xmm1 + MOVAPS 0 * SIZE(AO2), %xmm2 + MOVAPS 1 * SIZE(AO2, LDA), %xmm3 + + movsd %xmm0, %xmm5 + movsd %xmm2, %xmm7 + shufpd $1, %xmm1, %xmm0 + shufpd $1, %xmm3, %xmm2 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 0) * SIZE(B) +#endif + + movaps %xmm5, -16 * SIZE(B) + movaps %xmm7, -14 * SIZE(B) + movaps %xmm0, -12 * SIZE(B) + movaps %xmm2, -10 * SIZE(B) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 2 * SIZE(AO1, LDA) +#endif + + MOVAPS 2 * SIZE(AO1), %xmm0 + MOVAPS 3 * SIZE(AO1, LDA), %xmm5 + MOVAPS 2 * SIZE(AO2), %xmm2 + MOVAPS 3 * SIZE(AO2, LDA), %xmm7 + + movsd %xmm0, %xmm1 + movsd %xmm2, %xmm3 + shufpd $1, %xmm5, %xmm0 + shufpd $1, %xmm7, %xmm2 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 8) * SIZE(B) +#endif + + movaps %xmm1, -8 * SIZE(B) + movaps %xmm3, -6 * SIZE(B) + movaps %xmm0, -4 * SIZE(B) + movaps %xmm2, -2 * SIZE(B) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 2 * SIZE(AO2) +#endif + + MOVAPS 4 * SIZE(AO1), %xmm0 + MOVAPS 5 * SIZE(AO1, LDA), %xmm1 + MOVAPS 4 * SIZE(AO2), %xmm2 + MOVAPS 5 * SIZE(AO2, LDA), %xmm3 + + movsd %xmm0, %xmm5 + movsd %xmm2, %xmm7 + shufpd $1, %xmm1, %xmm0 + shufpd $1, %xmm3, %xmm2 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 16) * SIZE(B) +#endif + + movaps %xmm5, 0 * SIZE(B) + movaps %xmm7, 2 * SIZE(B) + movaps %xmm0, 4 * SIZE(B) + movaps %xmm2, 6 * SIZE(B) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 2 * SIZE(AO2, LDA) +#endif + + MOVAPS 6 * SIZE(AO1), %xmm0 + MOVAPS 7 * SIZE(AO1, LDA), %xmm5 + MOVAPS 6 * SIZE(AO2), %xmm2 + MOVAPS 7 * SIZE(AO2, LDA), %xmm7 + + movsd %xmm0, %xmm1 + movsd %xmm2, %xmm3 + shufpd $1, %xmm5, %xmm0 + shufpd $1, %xmm7, %xmm2 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 24) * SIZE(B) +#endif + + movaps %xmm1, 8 * SIZE(B) + movaps %xmm3, 10 * SIZE(B) + movaps %xmm0, 12 * SIZE(B) + movaps %xmm2, 14 * SIZE(B) + + addq $8 * SIZE, AO1 + addq $8 * SIZE, AO2 + subq $-32 * SIZE, B + + decq I + jg .L63 + ALIGN_4 + +.L64: + testq $4, MM + jle .L66 + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 1 * SIZE(AO1, LDA), %xmm1 + MOVAPS 0 * SIZE(AO2), %xmm2 + MOVAPS 1 * SIZE(AO2, LDA), %xmm3 + + movsd %xmm0, %xmm5 + shufpd $1, %xmm1, %xmm0 + movsd %xmm2, %xmm7 + shufpd $1, %xmm3, %xmm2 + + movaps %xmm5, -16 * SIZE(B) + movaps %xmm7, -14 * SIZE(B) + movaps %xmm0, -12 * SIZE(B) + movaps %xmm2, -10 * SIZE(B) + + MOVAPS 2 * SIZE(AO1), %xmm0 + MOVAPS 3 * SIZE(AO1, LDA), %xmm5 + MOVAPS 2 * SIZE(AO2), %xmm2 + MOVAPS 3 * SIZE(AO2, LDA), %xmm7 + + movsd %xmm0, %xmm1 + shufpd $1, %xmm5, %xmm0 + 
movsd %xmm2, %xmm3 + shufpd $1, %xmm7, %xmm2 + + movaps %xmm1, -8 * SIZE(B) + movaps %xmm3, -6 * SIZE(B) + movaps %xmm0, -4 * SIZE(B) + movaps %xmm2, -2 * SIZE(B) + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + subq $-16 * SIZE, B + ALIGN_4 + +.L66: + testq $2, MM + jle .L68 + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 1 * SIZE(AO1, LDA), %xmm1 + MOVAPS 0 * SIZE(AO2), %xmm2 + MOVAPS 1 * SIZE(AO2, LDA), %xmm3 + + movsd %xmm0, %xmm5 + movsd %xmm2, %xmm7 + shufpd $1, %xmm1, %xmm0 + shufpd $1, %xmm3, %xmm2 + + movaps %xmm5, -16 * SIZE(B) + movaps %xmm7, -14 * SIZE(B) + movaps %xmm0, -12 * SIZE(B) + movaps %xmm2, -10 * SIZE(B) + + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + subq $-8 * SIZE, B + ALIGN_4 + +.L68: + testq $1, MM + jle .L70 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO1, LDA), %xmm1 + movsd 0 * SIZE(AO2), %xmm2 + movsd 0 * SIZE(AO2, LDA), %xmm3 + + unpcklpd %xmm1, %xmm0 + unpcklpd %xmm3, %xmm2 + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm2, -14 * SIZE(B) + subq $-4 * SIZE, B + ALIGN_4 + +.L70: + testq $2, N + jle .L80 + + movq A, AO1 + leaq (A, LDA), AO2 + leaq (A, LDA, 2), A + + testq $SIZE, A + je .L72 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO2), %xmm1 + + unpcklpd %xmm1, %xmm0 + + movaps %xmm0, -16 * SIZE(B) + + addq $1 * SIZE, AO1 + addq $1 * SIZE, AO2 + subq $-2 * SIZE, B + ALIGN_3 + +.L72: + MOVAPS -1 * SIZE(AO2), %xmm5 + + movq MM, I + sarq $3, I + jle .L74 + ALIGN_4 + +.L73: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 4 * SIZE(AO1) +#endif + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 1 * SIZE(AO2), %xmm1 + MOVAPS 2 * SIZE(AO1), %xmm2 + MOVAPS 3 * SIZE(AO2), %xmm3 + + movsd %xmm0, %xmm5 + shufpd $1, %xmm1, %xmm0 + movsd %xmm2, %xmm1 + shufpd $1, %xmm3, %xmm2 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 0) * SIZE(B) +#endif + + movaps %xmm5, -16 * SIZE(B) + movaps %xmm0, -14 * SIZE(B) + movaps %xmm1, -12 * SIZE(B) + movaps %xmm2, -10 * SIZE(B) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 4 * SIZE(AO2) +#endif + + MOVAPS 4 * SIZE(AO1), %xmm0 + MOVAPS 5 * SIZE(AO2), %xmm1 + MOVAPS 6 * SIZE(AO1), %xmm2 + MOVAPS 7 * SIZE(AO2), %xmm5 + + movsd %xmm0, %xmm3 + shufpd $1, %xmm1, %xmm0 + movsd %xmm2, %xmm1 + shufpd $1, %xmm5, %xmm2 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 8) * SIZE(B) +#endif + + movaps %xmm3, -8 * SIZE(B) + movaps %xmm0, -6 * SIZE(B) + movaps %xmm1, -4 * SIZE(B) + movaps %xmm2, -2 * SIZE(B) + + addq $8 * SIZE, AO1 + addq $8 * SIZE, AO2 + subq $-16 * SIZE, B + + decq I + jg .L73 + ALIGN_4 + +.L74: + testq $4, MM + jle .L76 + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 1 * SIZE(AO2), %xmm1 + MOVAPS 2 * SIZE(AO1), %xmm2 + MOVAPS 3 * SIZE(AO2), %xmm3 + + movsd %xmm0, %xmm5 + shufpd $1, %xmm1, %xmm0 + movsd %xmm2, %xmm1 + shufpd $1, %xmm3, %xmm2 + + movaps %xmm5, -16 * SIZE(B) + movaps %xmm0, -14 * SIZE(B) + movaps %xmm1, -12 * SIZE(B) + movaps %xmm2, -10 * SIZE(B) + + movaps %xmm3, %xmm5 + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + subq $-8 * SIZE, B + ALIGN_4 + +.L76: + testq $2, MM + jle .L78 + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 1 * SIZE(AO2), %xmm1 + + movsd %xmm0, %xmm5 + shufpd $1, %xmm1, %xmm0 + + movaps %xmm5, -16 * SIZE(B) + movaps %xmm0, -14 * SIZE(B) + + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + subq $-4 * SIZE, B + ALIGN_4 + +.L78: + testq $1, MM + jle .L80 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO2), %xmm1 + + unpcklpd %xmm1, %xmm0 + + movaps %xmm0, -16 * SIZE(B) + subq $-2 * SIZE, B + ALIGN_4 + +.L80: + testq $1, N + jle .L999 + + movq A, AO1 + + testq $SIZE, A + jne .L85 + + movq MM, I + sarq $3, I + jle .L82 + ALIGN_4 + 
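For reference, the ncopy kernels in this file pack A (column-major, leading dimension LDA) into B as column panels whose width halves over the leftover columns (8, 4, 2, 1); within a panel the elements of each row are written contiguously, which is what the unpcklpd/unpckhpd and shufpd sequences above implement for the aligned and misaligned cases. A minimal C sketch of that packed layout, with illustrative names only, ignoring the 16-byte-alignment special casing (MM and the shufpd paths) done by the assembly:

#include <stddef.h>

/* Reference layout of a width-`panel` ncopy: for each panel of columns,
   row i of the panel is written contiguously into b, panel after panel. */
static void gemm_ncopy_ref(size_t m, size_t n, const double *a, size_t lda,
                           double *b, size_t panel)
{
    size_t j = 0;
    for (size_t w = panel; w > 0; w >>= 1) {      /* 8, 4, 2, 1 here      */
        while (n - j >= w) {
            for (size_t i = 0; i < m; i++)        /* one row of the panel */
                for (size_t k = 0; k < w; k++)    /* w columns stored     */
                    *b++ = a[i + (j + k) * lda];  /* back to back in B    */
            j += w;
        }
    }
}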
+.L81: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 8 * SIZE(AO1) +#endif + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 2 * SIZE(AO1), %xmm2 + MOVAPS 4 * SIZE(AO1), %xmm4 + MOVAPS 6 * SIZE(AO1), %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 0) * SIZE(B) +#endif + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm2, -14 * SIZE(B) + movaps %xmm4, -12 * SIZE(B) + movaps %xmm6, -10 * SIZE(B) + + addq $8 * SIZE, AO1 + subq $-8 * SIZE, B + + decq I + jg .L81 + ALIGN_4 + +.L82: + testq $4, MM + jle .L83 + + MOVAPS 0 * SIZE(AO1), %xmm0 + MOVAPS 2 * SIZE(AO1), %xmm2 + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm2, -14 * SIZE(B) + + addq $4 * SIZE, AO1 + subq $-4 * SIZE, B + ALIGN_4 + +.L83: + testq $2, MM + jle .L84 + + MOVAPS 0 * SIZE(AO1), %xmm0 + + movaps %xmm0, -16 * SIZE(B) + + addq $2 * SIZE, AO1 + subq $-2 * SIZE, B + ALIGN_4 + +.L84: + testq $1, MM + jle .L999 + + movsd 0 * SIZE(AO1), %xmm0 + + movlpd %xmm0, -16 * SIZE(B) + jmp .L999 + ALIGN_4 + +.L85: + MOVAPS -1 * SIZE(AO1), %xmm0 + + movq M, I + sarq $3, I + jle .L86 + ALIGN_4 + +.L86: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 8 * SIZE(AO1) +#endif + + MOVAPS 1 * SIZE(AO1), %xmm1 + MOVAPS 3 * SIZE(AO1), %xmm2 + MOVAPS 5 * SIZE(AO1), %xmm3 + MOVAPS 7 * SIZE(AO1), %xmm4 + + shufpd $1, %xmm1, %xmm0 + shufpd $1, %xmm2, %xmm1 + shufpd $1, %xmm3, %xmm2 + shufpd $1, %xmm4, %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 8 + 0) * SIZE(B) +#endif + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm1, -14 * SIZE(B) + movaps %xmm2, -12 * SIZE(B) + movaps %xmm3, -10 * SIZE(B) + + movaps %xmm4, %xmm0 + + addq $8 * SIZE, AO1 + subq $-8 * SIZE, B + + decq I + jg .L86 + ALIGN_4 + +.L87: + testq $4, M + jle .L88 + + MOVAPS 1 * SIZE(AO1), %xmm1 + MOVAPS 3 * SIZE(AO1), %xmm2 + + shufpd $1, %xmm1, %xmm0 + shufpd $1, %xmm2, %xmm1 + + movaps %xmm0, -16 * SIZE(B) + movaps %xmm1, -14 * SIZE(B) + + movaps %xmm2, %xmm0 + + addq $4 * SIZE, AO1 + addq $4 * SIZE, B + ALIGN_4 + +.L88: + testq $2, M + jle .L89 + + MOVAPS 1 * SIZE(AO1), %xmm1 + + shufpd $1, %xmm1, %xmm0 + + movaps %xmm0, -16 * SIZE(B) + + movaps %xmm1, %xmm0 + + addq $2 * SIZE, AO1 + subq $-2 * SIZE, B + ALIGN_4 + +.L89: + testq $1, M + jle .L999 + + shufpd $1, %xmm0, %xmm0 + + movlpd %xmm0, -16 * SIZE(B) + ALIGN_4 + +.L999: +#ifdef WINDOWS_ABI + movups 0(%rsp), %xmm6 + movups 16(%rsp), %xmm7 + movups 32(%rsp), %xmm8 + movups 48(%rsp), %xmm9 + movups 64(%rsp), %xmm10 + movups 80(%rsp), %xmm11 + movups 96(%rsp), %xmm12 + + addq $STACKSIZE, %rsp +#endif + + popq %r12 + popq %r13 + +#ifdef WINDOWS_ABI + popq %r14 + popq %r15 +#endif + ret + + EPILOGUE diff --git a/kernel/x86_64/dgemm_tcopy_2.S b/kernel/x86_64/dgemm_tcopy_2.S new file mode 100644 index 0000000000..06e59991d1 --- /dev/null +++ b/kernel/x86_64/dgemm_tcopy_2.S @@ -0,0 +1,334 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if defined(PENTIUM4) || defined(GENERIC) +#define PREFETCHSIZE 16 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#ifdef NEHALEM +#define PREFETCHSIZE 12 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define MOVUPS_A movups +#endif + +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCHSIZE 16 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#ifdef OPTERON +#define PREFETCHSIZE 16 +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#endif + +#ifdef MOVUPS_A +#define MOVUPS_A1(OFF, ADDR, REGS) MOVUPS_A OFF(ADDR), REGS +#define MOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) MOVUPS_A OFF(ADDR, BASE, SCALE), REGS +#else +#define MOVUPS_A1(OFF, ADDR, REGS) movsd OFF(ADDR), REGS; movhps OFF + 8(ADDR), REGS +#define MOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) movsd OFF(ADDR, BASE, SCALE), REGS; movhps OFF + 8(ADDR, BASE, SCALE), REGS +#endif + +#ifndef WINDOWS_ABI + +#define N ARG1 /* rsi */ +#define M ARG2 /* rdi */ +#define A ARG3 /* rdx */ +#define LDA ARG4 /* rcx */ +#define B ARG5 /* r8 */ + +#define AO1 %r9 +#define AO2 %r10 +#define LDA3 %r11 +#define M8 %r12 + +#else + +#define N ARG1 /* rdx */ +#define M ARG2 /* rcx */ +#define A ARG3 /* r8 */ +#define LDA ARG4 /* r9 */ +#define OLD_B 40 + 40(%rsp) + +#define B %r12 + +#define AO1 %rsi +#define AO2 %rdi +#define LDA3 %r10 +#define M8 %r11 +#endif + +#define I %rax +#define B0 %rbp +#define B3 %r13 + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + pushq %rdi + pushq %rsi +#endif + + pushq %r12 + pushq %r13 + pushq %rbp + +#ifdef WINDOWS_ABI + movq OLD_B, B +#endif + + subq $-16 * SIZE, B + + movq M, B3 + andq $-2, B3 + imulq N, B3 + + leaq (B, B3, SIZE), B3 + + leaq (,LDA, SIZE), LDA + leaq (LDA, LDA, 2), LDA3 + + leaq (, N, SIZE), M8 + + cmpq $2, N + jl .L40 + ALIGN_4 + +.L31: + subq $2, N + + movq A, AO1 + leaq (A, LDA), AO2 + leaq (A, LDA, 2), A + + movq B, B0 + addq $4 * SIZE, B + + movq M, I + sarq $3, I + jle .L34 + ALIGN_4 + +.L33: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 2 * SIZE(AO1) +#endif + + MOVUPS_A1(0 * SIZE, AO1, %xmm0) + MOVUPS_A1(2 * SIZE, AO1, %xmm1) + MOVUPS_A1(0 * SIZE, AO2, %xmm2) + MOVUPS_A1(2 * SIZE, AO2, %xmm3) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) +#endif + + movaps %xmm0, -16 * 
SIZE(B0) + movaps %xmm2, -14 * SIZE(B0) + movaps %xmm1, -16 * SIZE(B0, M8, 2) + movaps %xmm3, -14 * SIZE(B0, M8, 2) + + leaq (B0, M8, 4), B0 + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 2 * SIZE(AO2) +#endif + + MOVUPS_A1(4 * SIZE, AO1, %xmm0) + MOVUPS_A1(6 * SIZE, AO1, %xmm1) + MOVUPS_A1(4 * SIZE, AO2, %xmm2) + MOVUPS_A1(6 * SIZE, AO2, %xmm3) + + movaps %xmm0, -16 * SIZE(B0) + movaps %xmm2, -14 * SIZE(B0) + movaps %xmm1, -16 * SIZE(B0, M8, 2) + movaps %xmm3, -14 * SIZE(B0, M8, 2) + + leaq (B0, M8, 4), B0 + + addq $8 * SIZE, AO1 + addq $8 * SIZE, AO2 + + decq I + jg .L33 + ALIGN_4 + +.L34: + testq $4, M + jle .L36 + + MOVUPS_A1(0 * SIZE, AO1, %xmm0) + MOVUPS_A1(2 * SIZE, AO1, %xmm1) + MOVUPS_A1(0 * SIZE, AO2, %xmm2) + MOVUPS_A1(2 * SIZE, AO2, %xmm3) + + movaps %xmm0, -16 * SIZE(B0) + movaps %xmm2, -14 * SIZE(B0) + movaps %xmm1, -16 * SIZE(B0, M8, 2) + movaps %xmm3, -14 * SIZE(B0, M8, 2) + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + leaq (B0, M8, 4), B0 + ALIGN_4 + +.L36: + testq $2, M + jle .L38 + + MOVUPS_A1(0 * SIZE, AO1, %xmm0) + MOVUPS_A1(0 * SIZE, AO2, %xmm1) + + movaps %xmm0, -16 * SIZE(B0) + movaps %xmm1, -14 * SIZE(B0) + + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + leaq (B0, M8, 2), B0 + ALIGN_4 + +.L38: + testq $1, M + jle .L39 + + movsd 0 * SIZE(AO1), %xmm0 + movhpd 0 * SIZE(AO2), %xmm0 + + movaps %xmm0, -16 * SIZE(B3) + subq $-2 * SIZE, B3 + ALIGN_4 + +.L39: + cmpq $2, N + jge .L31 + ALIGN_4 + +.L40: + cmpq $1, N + jl .L999 + + movq A, AO1 + movq B, B0 + + movq M, I + sarq $3, I + jle .L44 + ALIGN_4 + +.L43: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 4 * SIZE(AO1) +#endif + + MOVUPS_A1(0 * SIZE, AO1, %xmm0) + MOVUPS_A1(2 * SIZE, AO1, %xmm1) + MOVUPS_A1(4 * SIZE, AO1, %xmm2) + MOVUPS_A1(6 * SIZE, AO1, %xmm3) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) +#endif + + addq $8 * SIZE, AO1 + + movaps %xmm0, -16 * SIZE(B0) + movaps %xmm1, -16 * SIZE(B0, M8, 2) + leaq (B0, M8, 4), B0 + movaps %xmm2, -16 * SIZE(B0) + movaps %xmm3, -16 * SIZE(B0, M8, 2) + leaq (B0, M8, 4), B0 + + decq I + jg .L43 + ALIGN_4 + +.L44: + testq $4, M + jle .L45 + + MOVUPS_A1(0 * SIZE, AO1, %xmm0) + MOVUPS_A1(2 * SIZE, AO1, %xmm1) + + addq $4 * SIZE, AO1 + + movaps %xmm0, -16 * SIZE(B0) + movaps %xmm1, -16 * SIZE(B0, M8, 2) + leaq (B0, M8, 4), B0 + ALIGN_4 + +.L45: + testq $2, M + jle .L46 + + MOVUPS_A1(0 * SIZE, AO1, %xmm0) + + movaps %xmm0, -16 * SIZE(B0) + + addq $2 * SIZE, AO1 + ALIGN_4 + +.L46: + testq $1, M + jle .L999 + + movsd 0 * SIZE(AO1), %xmm0 + + movlpd %xmm0, -16 * SIZE(B3) + ALIGN_4 + +.L999: + popq %rbp + popq %r13 + popq %r12 + +#ifdef WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + + EPILOGUE diff --git a/kernel/x86_64/dgemm_tcopy_4.S b/kernel/x86_64/dgemm_tcopy_4.S new file mode 100644 index 0000000000..8b81c41c0d --- /dev/null +++ b/kernel/x86_64/dgemm_tcopy_4.S @@ -0,0 +1,516 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if defined(PENTIUM4) || defined(GENERIC) +#define PREFETCHSIZE 16 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#ifdef NEHALEM +#define PREFETCHSIZE 12 +#define PREFETCH prefetcht0 +#define MOVUPS_A movups +#endif + +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCHSIZE 16 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#ifdef OPTERON +#define PREFETCHSIZE 16 +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#endif + +#ifdef MOVUPS_A +#define MOVUPS_A1(OFF, ADDR, REGS) MOVUPS_A OFF(ADDR), REGS +#define MOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) MOVUPS_A OFF(ADDR, BASE, SCALE), REGS +#else +#define MOVUPS_A1(OFF, ADDR, REGS) movsd OFF(ADDR), REGS; movhps OFF + 8(ADDR), REGS +#define MOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) movsd OFF(ADDR, BASE, SCALE), REGS; movhps OFF + 8(ADDR, BASE, SCALE), REGS +#endif + +#ifndef WINDOWS_ABI + +#define N ARG1 /* rsi */ +#define M ARG2 /* rdi */ +#define A ARG3 /* rdx */ +#define LDA ARG4 /* rcx */ +#define B ARG5 /* r8 */ + +#define AO1 %r9 +#define AO2 %r10 +#define LDA3 %r11 +#define M8 %r12 + +#else + +#define STACKSIZE 256 + +#define N ARG1 /* rdx */ +#define M ARG2 /* rcx */ +#define A ARG3 /* r8 */ +#define LDA ARG4 /* r9 */ +#define OLD_B 64 + 32 + STACKSIZE(%rsp) + +#define B %r12 + +#define AO1 %rsi +#define AO2 %rdi +#define LDA3 %r10 +#define M8 %r11 +#endif + +#define I %rax + +#define B0 %rbp +#define B2 %r14 +#define B3 %r15 + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + pushq %rdi + pushq %rsi +#endif + + pushq %r15 + pushq %r14 + pushq %r13 + pushq %r12 + pushq %rbp + +#ifdef WINDOWS_ABI + movq OLD_B, B +#endif + + subq $-16 * SIZE, B + + movq M, B2 + movq M, B3 + + andq $-4, B2 + andq $-2, B3 + + imulq N, B2 + imulq N, B3 + + leaq (B, B2, SIZE), B2 + leaq (B, B3, SIZE), B3 + + leaq (,LDA, SIZE), LDA + leaq (LDA, LDA, 2), LDA3 + + leaq (, N, SIZE), M8 + + cmpq $4, N + jl .L30 + ALIGN_4 + +.L21: + subq $4, N + + movq A, AO1 + leaq (A, LDA, 2), AO2 + leaq (A, LDA, 4), A + + movq B, B0 + addq $16 * SIZE, B + + 
movq M, I + sarq $3, I + jle .L24 + ALIGN_4 + +.L23: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO1) +#endif + + MOVUPS_A1(0 * SIZE, AO1, %xmm0) + MOVUPS_A1(2 * SIZE, AO1, %xmm1) + MOVUPS_A1(4 * SIZE, AO1, %xmm2) + MOVUPS_A1(6 * SIZE, AO1, %xmm3) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) +#endif + + movaps %xmm0, -16 * SIZE(B0) + movaps %xmm1, -14 * SIZE(B0) + movaps %xmm2, -16 * SIZE(B0, M8, 4) + movaps %xmm3, -14 * SIZE(B0, M8, 4) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO1, LDA) +#endif + + MOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm0) + MOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm1) + MOVUPS_A2(4 * SIZE, AO1, LDA, 1, %xmm2) + MOVUPS_A2(6 * SIZE, AO1, LDA, 1, %xmm3) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 8) * SIZE(B) +#endif + + movaps %xmm0, -12 * SIZE(B0) + movaps %xmm1, -10 * SIZE(B0) + movaps %xmm2, -12 * SIZE(B0, M8, 4) + movaps %xmm3, -10 * SIZE(B0, M8, 4) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO2) +#endif + + MOVUPS_A1(0 * SIZE, AO2, %xmm0) + MOVUPS_A1(2 * SIZE, AO2, %xmm1) + MOVUPS_A1(4 * SIZE, AO2, %xmm2) + MOVUPS_A1(6 * SIZE, AO2, %xmm3) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 16) * SIZE(B) +#endif + + movaps %xmm0, -8 * SIZE(B0) + movaps %xmm1, -6 * SIZE(B0) + movaps %xmm2, -8 * SIZE(B0, M8, 4) + movaps %xmm3, -6 * SIZE(B0, M8, 4) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO2, LDA) +#endif + + MOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm0) + MOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm1) + MOVUPS_A2(4 * SIZE, AO2, LDA, 1, %xmm2) + MOVUPS_A2(6 * SIZE, AO2, LDA, 1, %xmm3) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 24) * SIZE(B) +#endif + + movaps %xmm0, -4 * SIZE(B0) + movaps %xmm1, -2 * SIZE(B0) + movaps %xmm2, -4 * SIZE(B0, M8, 4) + movaps %xmm3, -2 * SIZE(B0, M8, 4) + + addq $8 * SIZE, AO1 + addq $8 * SIZE, AO2 + leaq (B0, M8, 8), B0 + + decq I + jg .L23 + ALIGN_4 + +.L24: + testq $4, M + jle .L26 + + MOVUPS_A1(0 * SIZE, AO1, %xmm0) + MOVUPS_A1(2 * SIZE, AO1, %xmm1) + MOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm2) + MOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm3) + + movaps %xmm0, -16 * SIZE(B0) + movaps %xmm1, -14 * SIZE(B0) + movaps %xmm2, -12 * SIZE(B0) + movaps %xmm3, -10 * SIZE(B0) + + MOVUPS_A1(0 * SIZE, AO2, %xmm0) + MOVUPS_A1(2 * SIZE, AO2, %xmm1) + MOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm2) + MOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm3) + + movaps %xmm0, -8 * SIZE(B0) + movaps %xmm1, -6 * SIZE(B0) + movaps %xmm2, -4 * SIZE(B0) + movaps %xmm3, -2 * SIZE(B0) + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + leaq (B0, M8, 4), B0 + ALIGN_4 + +.L26: + testq $2, M + jle .L28 + + MOVUPS_A1(0 * SIZE, AO1, %xmm0) + MOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm1) + MOVUPS_A1(0 * SIZE, AO2, %xmm2) + MOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm3) + + movaps %xmm0, -16 * SIZE(B2) + movaps %xmm1, -14 * SIZE(B2) + movaps %xmm2, -12 * SIZE(B2) + movaps %xmm3, -10 * SIZE(B2) + + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + subq $-8 * SIZE, B2 + ALIGN_4 + +.L28: + testq $1, M + jle .L29 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO1, LDA), %xmm1 + movsd 0 * SIZE(AO2), %xmm2 + movsd 0 * SIZE(AO2, LDA), %xmm3 + + unpcklpd %xmm1, %xmm0 + unpcklpd %xmm3, %xmm2 + + movaps %xmm0, -16 * SIZE(B3) + movaps %xmm2, -14 * SIZE(B3) + subq $-4 * SIZE, B3 + ALIGN_4 + +.L29: + cmpq $4, N + jge .L21 + ALIGN_4 + +.L30: + cmpq $2, N + jl .L40 + + subq $2, N + + movq A, AO1 + leaq (A, LDA), AO2 + leaq (A, LDA, 2), A + + movq B, B0 + addq $8 * SIZE, B + + movq M, I + sarq $3, I + jle .L34 + ALIGN_4 + +.L33: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 2 * SIZE(AO1) +#endif + + 
MOVUPS_A1(0 * SIZE, AO1, %xmm0) + MOVUPS_A1(2 * SIZE, AO1, %xmm1) + MOVUPS_A1(4 * SIZE, AO1, %xmm2) + MOVUPS_A1(6 * SIZE, AO1, %xmm3) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) +#endif + + movaps %xmm0, -16 * SIZE(B0) + movaps %xmm1, -14 * SIZE(B0) + movaps %xmm2, -16 * SIZE(B0, M8, 4) + movaps %xmm3, -14 * SIZE(B0, M8, 4) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 2 * SIZE(AO2) +#endif + + MOVUPS_A1(0 * SIZE, AO2, %xmm0) + MOVUPS_A1(2 * SIZE, AO2, %xmm1) + MOVUPS_A1(4 * SIZE, AO2, %xmm2) + MOVUPS_A1(6 * SIZE, AO2, %xmm3) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 8) * SIZE(B) +#endif + + movaps %xmm0, -12 * SIZE(B0) + movaps %xmm1, -10 * SIZE(B0) + movaps %xmm2, -12 * SIZE(B0, M8, 4) + movaps %xmm3, -10 * SIZE(B0, M8, 4) + + addq $8 * SIZE, AO1 + addq $8 * SIZE, AO2 + leaq (B0, M8, 8), B0 + + decq I + jg .L33 + ALIGN_4 + +.L34: + testq $4, M + jle .L36 + + MOVUPS_A1(0 * SIZE, AO1, %xmm0) + MOVUPS_A1(2 * SIZE, AO1, %xmm1) + MOVUPS_A1(0 * SIZE, AO2, %xmm2) + MOVUPS_A1(2 * SIZE, AO2, %xmm3) + + movaps %xmm0, -16 * SIZE(B0) + movaps %xmm1, -14 * SIZE(B0) + movaps %xmm2, -12 * SIZE(B0) + movaps %xmm3, -10 * SIZE(B0) + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + leaq (B0, M8, 4), B0 + ALIGN_4 + +.L36: + testq $2, M + jle .L38 + + MOVUPS_A1(0 * SIZE, AO1, %xmm0) + MOVUPS_A1(0 * SIZE, AO2, %xmm1) + + movaps %xmm0, -16 * SIZE(B2) + movaps %xmm1, -14 * SIZE(B2) + + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + subq $-4 * SIZE, B2 + ALIGN_4 + +.L38: + testq $1, M + jle .L40 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO2), %xmm1 + + unpcklpd %xmm1, %xmm0 + + movaps %xmm0, -16 * SIZE(B3) + subq $-2 * SIZE, B3 + ALIGN_4 + +.L40: + cmpq $1, N + jl .L999 + + movq A, AO1 + + movq B, B0 + + movq M, I + sarq $3, I + jle .L44 + ALIGN_4 + +.L43: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 4 * SIZE(AO1) +#endif + + MOVUPS_A1(0 * SIZE, AO1, %xmm0) + MOVUPS_A1(2 * SIZE, AO1, %xmm1) + MOVUPS_A1(4 * SIZE, AO1, %xmm2) + MOVUPS_A1(6 * SIZE, AO1, %xmm3) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE * 4 + 0) * SIZE(B) +#endif + + movaps %xmm0, -16 * SIZE(B0) + movaps %xmm1, -14 * SIZE(B0) + movaps %xmm2, -16 * SIZE(B0, M8, 4) + movaps %xmm3, -14 * SIZE(B0, M8, 4) + + addq $8 * SIZE, AO1 + leaq (B0, M8, 8), B0 + + decq I + jg .L43 + ALIGN_4 + +.L44: + testq $4, M + jle .L45 + + MOVUPS_A1(0 * SIZE, AO1, %xmm0) + MOVUPS_A1(2 * SIZE, AO1, %xmm1) + + movaps %xmm0, -16 * SIZE(B0) + movaps %xmm1, -14 * SIZE(B0) + + addq $4 * SIZE, AO1 + leaq (B0, M8, 4), B0 + ALIGN_4 + +.L45: + testq $2, M + jle .L46 + + MOVUPS_A1(0 * SIZE, AO1, %xmm0) + + movaps %xmm0, -16 * SIZE(B2) + + addq $2 * SIZE, AO1 + subq $-2 * SIZE, B2 + ALIGN_4 + +.L46: + testq $1, M + jle .L999 + + movsd 0 * SIZE(AO1), %xmm0 + + movlpd %xmm0, -16 * SIZE(B3) + jmp .L999 + ALIGN_4 + +.L999: + popq %rbp + popq %r12 + popq %r13 + popq %r14 + popq %r15 + +#ifdef WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + + EPILOGUE diff --git a/kernel/x86_64/dgemm_tcopy_8.S b/kernel/x86_64/dgemm_tcopy_8.S new file mode 100644 index 0000000000..9760337146 --- /dev/null +++ b/kernel/x86_64/dgemm_tcopy_8.S @@ -0,0 +1,780 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef NEHALEM +#define PREFETCHSIZE 16 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define MOVUPS_A movups +#endif + +#ifdef MOVUPS_A +#define MOVUPS_A1(OFF, ADDR, REGS) MOVUPS_A OFF(ADDR), REGS +#define MOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) MOVUPS_A OFF(ADDR, BASE, SCALE), REGS +#else +#define MOVUPS_A1(OFF, ADDR, REGS) movsd OFF(ADDR), REGS; movhps OFF + 8(ADDR), REGS +#define MOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) movsd OFF(ADDR, BASE, SCALE), REGS; movhps OFF + 8(ADDR, BASE, SCALE), REGS +#endif + +#ifndef WINDOWS_ABI + +#define N ARG1 /* rsi */ +#define M ARG2 /* rdi */ +#define A ARG3 /* rdx */ +#define LDA ARG4 /* rcx */ +#define B ARG5 /* r8 */ + +#define AO1 %r9 +#define AO2 %r10 +#define LDA3 %r11 +#define M8 %r12 + +#else + +#define N ARG1 /* rdx */ +#define M ARG2 /* rcx */ +#define A ARG3 /* r8 */ +#define LDA ARG4 /* r9 */ +#define OLD_B 40 + 56(%rsp) + +#define B %r12 + +#define AO1 %rsi +#define AO2 %rdi +#define LDA3 %r10 +#define M8 %r11 +#endif + +#define I %rax + +#define B0 %rbp +#define B1 %r13 +#define B2 %r14 +#define B3 %r15 + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + pushq %rdi + pushq %rsi +#endif + + pushq %r15 + pushq %r14 + pushq %r13 + pushq %r12 + pushq %rbp + +#ifdef WINDOWS_ABI + movq OLD_B, B +#endif + + subq $-16 * SIZE, B + + movq M, B1 + movq M, B2 + movq M, B3 + + andq $-8, B1 + andq $-4, B2 + andq $-2, B3 + + imulq N, B1 + imulq N, B2 + imulq N, B3 + + leaq (B, B1, SIZE), B1 + leaq (B, B2, SIZE), B2 + leaq (B, B3, SIZE), B3 + + leaq (,LDA, SIZE), LDA + leaq (LDA, LDA, 2), LDA3 + + leaq (, N, SIZE), M8 + + cmpq $8, N + jl .L20 + ALIGN_4 + +.L11: + subq $8, N + + movq A, AO1 + leaq (A, LDA, 4), AO2 + leaq (A, LDA, 8), A + + movq B, B0 + addq $64 * SIZE, B + + movq M, I + sarq $3, I + jle .L14 + ALIGN_4 + +.L13: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO1) +#endif + + MOVUPS_A1(0 * SIZE, AO1, %xmm0) + MOVUPS_A1(2 * 
SIZE, AO1, %xmm1) + MOVUPS_A1(4 * SIZE, AO1, %xmm2) + MOVUPS_A1(6 * SIZE, AO1, %xmm3) + +#ifdef PREFETCHW + PREFETCHW 48 * SIZE(B0) +#endif + + movaps %xmm0, -16 * SIZE(B0) + movaps %xmm1, -14 * SIZE(B0) + movaps %xmm2, -12 * SIZE(B0) + movaps %xmm3, -10 * SIZE(B0) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO1, LDA) +#endif + + MOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm0) + MOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm1) + MOVUPS_A2(4 * SIZE, AO1, LDA, 1, %xmm2) + MOVUPS_A2(6 * SIZE, AO1, LDA, 1, %xmm3) + +#ifdef PREFETCHW + PREFETCHW 56 * SIZE(B0) +#endif + + movaps %xmm0, -8 * SIZE(B0) + movaps %xmm1, -6 * SIZE(B0) + movaps %xmm2, -4 * SIZE(B0) + movaps %xmm3, -2 * SIZE(B0) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO1, LDA, 2) +#endif + + MOVUPS_A2(0 * SIZE, AO1, LDA, 2, %xmm0) + MOVUPS_A2(2 * SIZE, AO1, LDA, 2, %xmm1) + MOVUPS_A2(4 * SIZE, AO1, LDA, 2, %xmm2) + MOVUPS_A2(6 * SIZE, AO1, LDA, 2, %xmm3) + +#ifdef PREFETCHW + PREFETCHW 64 * SIZE(B0) +#endif + + movaps %xmm0, 0 * SIZE(B0) + movaps %xmm1, 2 * SIZE(B0) + movaps %xmm2, 4 * SIZE(B0) + movaps %xmm3, 6 * SIZE(B0) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO1, LDA3) +#endif + + MOVUPS_A2(0 * SIZE, AO1, LDA3, 1, %xmm0) + MOVUPS_A2(2 * SIZE, AO1, LDA3, 1, %xmm1) + MOVUPS_A2(4 * SIZE, AO1, LDA3, 1, %xmm2) + MOVUPS_A2(6 * SIZE, AO1, LDA3, 1, %xmm3) + +#ifdef PREFETCHW + PREFETCHW 72 * SIZE(B0) +#endif + + movaps %xmm0, 8 * SIZE(B0) + movaps %xmm1, 10 * SIZE(B0) + movaps %xmm2, 12 * SIZE(B0) + movaps %xmm3, 14 * SIZE(B0) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO2) +#endif + + MOVUPS_A1(0 * SIZE, AO2, %xmm0) + MOVUPS_A1(2 * SIZE, AO2, %xmm1) + MOVUPS_A1(4 * SIZE, AO2, %xmm2) + MOVUPS_A1(6 * SIZE, AO2, %xmm3) + +#ifdef PREFETCHW + PREFETCHW 80 * SIZE(B0) +#endif + + movaps %xmm0, 16 * SIZE(B0) + movaps %xmm1, 18 * SIZE(B0) + movaps %xmm2, 20 * SIZE(B0) + movaps %xmm3, 22 * SIZE(B0) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO2, LDA) +#endif + + MOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm0) + MOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm1) + MOVUPS_A2(4 * SIZE, AO2, LDA, 1, %xmm2) + MOVUPS_A2(6 * SIZE, AO2, LDA, 1, %xmm3) + +#ifdef PREFETCHW + PREFETCHW 88 * SIZE(B0) +#endif + + movaps %xmm0, 24 * SIZE(B0) + movaps %xmm1, 26 * SIZE(B0) + movaps %xmm2, 28 * SIZE(B0) + movaps %xmm3, 30 * SIZE(B0) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO2, LDA, 2) +#endif + + MOVUPS_A2(0 * SIZE, AO2, LDA, 2, %xmm0) + MOVUPS_A2(2 * SIZE, AO2, LDA, 2, %xmm1) + MOVUPS_A2(4 * SIZE, AO2, LDA, 2, %xmm2) + MOVUPS_A2(6 * SIZE, AO2, LDA, 2, %xmm3) + +#ifdef PREFETCHW + PREFETCHW 96 * SIZE(B0) +#endif + + movaps %xmm0, 32 * SIZE(B0) + movaps %xmm1, 34 * SIZE(B0) + movaps %xmm2, 36 * SIZE(B0) + movaps %xmm3, 38 * SIZE(B0) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO2, LDA3) +#endif + + MOVUPS_A2(0 * SIZE, AO2, LDA3, 1, %xmm0) + MOVUPS_A2(2 * SIZE, AO2, LDA3, 1, %xmm1) + MOVUPS_A2(4 * SIZE, AO2, LDA3, 1, %xmm2) + MOVUPS_A2(6 * SIZE, AO2, LDA3, 1, %xmm3) + +#ifdef PREFETCHW + PREFETCHW 104 * SIZE(B0) +#endif + + movaps %xmm0, 40 * SIZE(B0) + movaps %xmm1, 42 * SIZE(B0) + movaps %xmm2, 44 * SIZE(B0) + movaps %xmm3, 46 * SIZE(B0) + + addq $8 * SIZE, AO1 + addq $8 * SIZE, AO2 + leaq (B0, M8, 8), B0 + + decq I + jg .L13 + ALIGN_4 + +.L14: + testq $4, M + jle .L16 + + MOVUPS_A1(0 * SIZE, AO1, %xmm0) + MOVUPS_A1(2 * SIZE, AO1, %xmm1) + MOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm2) + MOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm3) + + movaps %xmm0, -16 * SIZE(B1) + movaps %xmm1, -14 * SIZE(B1) + movaps %xmm2, -12 * SIZE(B1) + movaps %xmm3, -10 * SIZE(B1) + 
+ MOVUPS_A2(0 * SIZE, AO1, LDA, 2, %xmm0) + MOVUPS_A2(2 * SIZE, AO1, LDA, 2, %xmm1) + MOVUPS_A2(0 * SIZE, AO1, LDA3, 1, %xmm2) + MOVUPS_A2(2 * SIZE, AO1, LDA3, 1, %xmm3) + + movaps %xmm0, -8 * SIZE(B1) + movaps %xmm1, -6 * SIZE(B1) + movaps %xmm2, -4 * SIZE(B1) + movaps %xmm3, -2 * SIZE(B1) + + MOVUPS_A1(0 * SIZE, AO2, %xmm0) + MOVUPS_A1(2 * SIZE, AO2, %xmm1) + MOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm2) + MOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm3) + + movaps %xmm0, 0 * SIZE(B1) + movaps %xmm1, 2 * SIZE(B1) + movaps %xmm2, 4 * SIZE(B1) + movaps %xmm3, 6 * SIZE(B1) + + MOVUPS_A2(0 * SIZE, AO2, LDA, 2, %xmm0) + MOVUPS_A2(2 * SIZE, AO2, LDA, 2, %xmm1) + MOVUPS_A2(0 * SIZE, AO2, LDA3, 1, %xmm2) + MOVUPS_A2(2 * SIZE, AO2, LDA3, 1, %xmm3) + + movaps %xmm0, 8 * SIZE(B1) + movaps %xmm1, 10 * SIZE(B1) + movaps %xmm2, 12 * SIZE(B1) + movaps %xmm3, 14 * SIZE(B1) + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + subq $-32 * SIZE, B1 + ALIGN_4 + +.L16: + testq $2, M + jle .L18 + + MOVUPS_A1(0 * SIZE, AO1, %xmm0) + MOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm1) + MOVUPS_A2(0 * SIZE, AO1, LDA, 2, %xmm2) + MOVUPS_A2(0 * SIZE, AO1, LDA3, 1, %xmm3) + + movaps %xmm0, -16 * SIZE(B2) + movaps %xmm1, -14 * SIZE(B2) + movaps %xmm2, -12 * SIZE(B2) + movaps %xmm3, -10 * SIZE(B2) + + MOVUPS_A1(0 * SIZE, AO2, %xmm0) + MOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm1) + MOVUPS_A2(0 * SIZE, AO2, LDA, 2, %xmm2) + MOVUPS_A2(0 * SIZE, AO2, LDA3, 1, %xmm3) + + movaps %xmm0, -8 * SIZE(B2) + movaps %xmm1, -6 * SIZE(B2) + movaps %xmm2, -4 * SIZE(B2) + movaps %xmm3, -2 * SIZE(B2) + + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + subq $-16 * SIZE, B2 + ALIGN_4 + +.L18: + testq $1, M + jle .L19 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO1, LDA), %xmm1 + movsd 0 * SIZE(AO1, LDA, 2), %xmm2 + movsd 0 * SIZE(AO1, LDA3), %xmm3 + + unpcklpd %xmm1, %xmm0 + unpcklpd %xmm3, %xmm2 + + movaps %xmm0, -16 * SIZE(B3) + movaps %xmm2, -14 * SIZE(B3) + + movsd 0 * SIZE(AO2), %xmm0 + movsd 0 * SIZE(AO2, LDA), %xmm1 + movsd 0 * SIZE(AO2, LDA, 2), %xmm2 + movsd 0 * SIZE(AO2, LDA3), %xmm3 + + unpcklpd %xmm1, %xmm0 + unpcklpd %xmm3, %xmm2 + + movaps %xmm0, -12 * SIZE(B3) + movaps %xmm2, -10 * SIZE(B3) + + subq $-8 * SIZE, B3 + ALIGN_4 + +.L19: + cmpq $8, N + jge .L11 + ALIGN_4 + +.L20: + cmpq $4, N + jl .L30 + + subq $4, N + + movq A, AO1 + leaq (A, LDA, 2), AO2 + leaq (A, LDA, 4), A + + movq B, B0 + addq $32 * SIZE, B + + movq M, I + sarq $3, I + jle .L24 + ALIGN_4 + +.L23: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO1) +#endif + + MOVUPS_A1(0 * SIZE, AO1, %xmm0) + MOVUPS_A1(2 * SIZE, AO1, %xmm1) + MOVUPS_A1(4 * SIZE, AO1, %xmm2) + MOVUPS_A1(6 * SIZE, AO1, %xmm3) + +#ifdef PREFETCHW + PREFETCHW 16 * SIZE(B0) +#endif + + movaps %xmm0, -16 * SIZE(B0) + movaps %xmm1, -14 * SIZE(B0) + movaps %xmm2, -12 * SIZE(B0) + movaps %xmm3, -10 * SIZE(B0) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO1, LDA) +#endif + + MOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm0) + MOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm1) + MOVUPS_A2(4 * SIZE, AO1, LDA, 1, %xmm2) + MOVUPS_A2(6 * SIZE, AO1, LDA, 1, %xmm3) + +#ifdef PREFETCHW + PREFETCHW 24 * SIZE(B0) +#endif + + movaps %xmm0, -8 * SIZE(B0) + movaps %xmm1, -6 * SIZE(B0) + movaps %xmm2, -4 * SIZE(B0) + movaps %xmm3, -2 * SIZE(B0) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO1, LDA, 2) +#endif + + MOVUPS_A1(0 * SIZE, AO2, %xmm0) + MOVUPS_A1(2 * SIZE, AO2, %xmm1) + MOVUPS_A1(4 * SIZE, AO2, %xmm2) + MOVUPS_A1(6 * SIZE, AO2, %xmm3) + +#ifdef PREFETCHW + PREFETCHW 32 * SIZE(B0) +#endif + + movaps %xmm0, 0 * SIZE(B0) + movaps %xmm1, 2 * SIZE(B0) 
+ movaps %xmm2, 4 * SIZE(B0) + movaps %xmm3, 6 * SIZE(B0) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO1, LDA3) +#endif + + MOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm0) + MOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm1) + MOVUPS_A2(4 * SIZE, AO2, LDA, 1, %xmm2) + MOVUPS_A2(6 * SIZE, AO2, LDA, 1, %xmm3) + +#ifdef PREFETCHW + PREFETCHW 40 * SIZE(B0) +#endif + + movaps %xmm0, 8 * SIZE(B0) + movaps %xmm1, 10 * SIZE(B0) + movaps %xmm2, 12 * SIZE(B0) + movaps %xmm3, 14 * SIZE(B0) + + addq $8 * SIZE, AO1 + addq $8 * SIZE, AO2 + leaq (B0, M8, 8), B0 + + decq I + jg .L23 + ALIGN_4 + +.L24: + testq $4, M + jle .L26 + + MOVUPS_A1(0 * SIZE, AO1, %xmm0) + MOVUPS_A1(2 * SIZE, AO1, %xmm1) + MOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm2) + MOVUPS_A2(2 * SIZE, AO1, LDA, 1, %xmm3) + + movaps %xmm0, -16 * SIZE(B1) + movaps %xmm1, -14 * SIZE(B1) + movaps %xmm2, -12 * SIZE(B1) + movaps %xmm3, -10 * SIZE(B1) + + MOVUPS_A1(0 * SIZE, AO2, %xmm0) + MOVUPS_A1(2 * SIZE, AO2, %xmm1) + MOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm2) + MOVUPS_A2(2 * SIZE, AO2, LDA, 1, %xmm3) + + movaps %xmm0, -8 * SIZE(B1) + movaps %xmm1, -6 * SIZE(B1) + movaps %xmm2, -4 * SIZE(B1) + movaps %xmm3, -2 * SIZE(B1) + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + subq $-16 * SIZE, B1 + ALIGN_4 + +.L26: + testq $2, M + jle .L28 + + MOVUPS_A1(0 * SIZE, AO1, %xmm0) + MOVUPS_A2(0 * SIZE, AO1, LDA, 1, %xmm1) + MOVUPS_A1(0 * SIZE, AO2, %xmm2) + MOVUPS_A2(0 * SIZE, AO2, LDA, 1, %xmm3) + + movaps %xmm0, -16 * SIZE(B2) + movaps %xmm1, -14 * SIZE(B2) + movaps %xmm2, -12 * SIZE(B2) + movaps %xmm3, -10 * SIZE(B2) + + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + subq $-8 * SIZE, B2 + ALIGN_4 + +.L28: + testq $1, M + jle .L30 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO1, LDA), %xmm1 + movsd 0 * SIZE(AO2), %xmm2 + movsd 0 * SIZE(AO2, LDA), %xmm3 + + unpcklpd %xmm1, %xmm0 + unpcklpd %xmm3, %xmm2 + + movaps %xmm0, -16 * SIZE(B3) + movaps %xmm2, -14 * SIZE(B3) + subq $-4 * SIZE, B3 + ALIGN_4 + +.L30: + cmpq $2, N + jl .L40 + + subq $2, N + + movq A, AO1 + leaq (A, LDA), AO2 + leaq (A, LDA, 2), A + + movq B, B0 + addq $16 * SIZE, B + + movq M, I + sarq $3, I + jle .L34 + ALIGN_4 + +.L33: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO1) +#endif + + MOVUPS_A1(0 * SIZE, AO1, %xmm0) + MOVUPS_A1(2 * SIZE, AO1, %xmm1) + MOVUPS_A1(4 * SIZE, AO1, %xmm2) + MOVUPS_A1(6 * SIZE, AO1, %xmm3) + +#ifdef PREFETCHW + PREFETCHW 0 * SIZE(B0) +#endif + + movaps %xmm0, -16 * SIZE(B0) + movaps %xmm1, -14 * SIZE(B0) + movaps %xmm2, -12 * SIZE(B0) + movaps %xmm3, -10 * SIZE(B0) + +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * SIZE(AO2) +#endif + + MOVUPS_A1(0 * SIZE, AO2, %xmm0) + MOVUPS_A1(2 * SIZE, AO2, %xmm1) + MOVUPS_A1(4 * SIZE, AO2, %xmm2) + MOVUPS_A1(6 * SIZE, AO2, %xmm3) + +#ifdef PREFETCHW + PREFETCHW 8 * SIZE(B0) +#endif + + movaps %xmm0, -8 * SIZE(B0) + movaps %xmm1, -6 * SIZE(B0) + movaps %xmm2, -4 * SIZE(B0) + movaps %xmm3, -2 * SIZE(B0) + + addq $8 * SIZE, AO1 + addq $8 * SIZE, AO2 + leaq (B0, M8, 8), B0 + + decq I + jg .L33 + ALIGN_4 + +.L34: + testq $4, M + jle .L36 + + MOVUPS_A1(0 * SIZE, AO1, %xmm0) + MOVUPS_A1(2 * SIZE, AO1, %xmm1) + MOVUPS_A1(0 * SIZE, AO2, %xmm2) + MOVUPS_A1(2 * SIZE, AO2, %xmm3) + + movaps %xmm0, -16 * SIZE(B1) + movaps %xmm1, -14 * SIZE(B1) + movaps %xmm2, -12 * SIZE(B1) + movaps %xmm3, -10 * SIZE(B1) + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + subq $-8 * SIZE, B1 + ALIGN_4 + +.L36: + testq $2, M + jle .L38 + + MOVUPS_A1(0 * SIZE, AO1, %xmm0) + MOVUPS_A1(0 * SIZE, AO2, %xmm1) + + movaps %xmm0, -16 * SIZE(B2) + movaps %xmm1, -14 * SIZE(B2) + + addq $2 * 
SIZE, AO1 + addq $2 * SIZE, AO2 + subq $-4 * SIZE, B2 + ALIGN_4 + +.L38: + testq $1, M + jle .L40 + + movsd 0 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO2), %xmm1 + + unpcklpd %xmm1, %xmm0 + + movaps %xmm0, -16 * SIZE(B3) + subq $-2 * SIZE, B3 + ALIGN_4 + +.L40: + cmpq $1, N + jl .L999 + + movq A, AO1 + + movq B, B0 + + movq M, I + sarq $3, I + jle .L44 + ALIGN_4 + +.L43: +#ifdef PREFETCH + PREFETCH PREFETCHSIZE * 8 * SIZE(AO1) +#endif + + MOVUPS_A1(0 * SIZE, AO1, %xmm0) + MOVUPS_A1(2 * SIZE, AO1, %xmm1) + MOVUPS_A1(4 * SIZE, AO1, %xmm2) + MOVUPS_A1(6 * SIZE, AO1, %xmm3) + +#ifdef PREFETCHW + PREFETCHW -8 * SIZE(B0) +#endif + + movaps %xmm0, -16 * SIZE(B0) + movaps %xmm1, -14 * SIZE(B0) + movaps %xmm2, -12 * SIZE(B0) + movaps %xmm3, -10 * SIZE(B0) + + addq $8 * SIZE, AO1 + leaq (B0, M8, 8), B0 + + decq I + jg .L43 + ALIGN_4 + +.L44: + testq $4, M + jle .L45 + + MOVUPS_A1(0 * SIZE, AO1, %xmm0) + MOVUPS_A1(2 * SIZE, AO1, %xmm1) + + movaps %xmm0, -16 * SIZE(B1) + movaps %xmm1, -14 * SIZE(B1) + + addq $4 * SIZE, AO1 + subq $-4 * SIZE, B1 + ALIGN_4 + +.L45: + testq $2, M + jle .L46 + + MOVUPS_A1(0 * SIZE, AO1, %xmm0) + + movaps %xmm0, -16 * SIZE(B2) + + addq $2 * SIZE, AO1 + subq $-2 * SIZE, B2 + ALIGN_4 + +.L46: + testq $1, M + jle .L999 + + movsd 0 * SIZE(AO1), %xmm0 + + movlpd %xmm0, -16 * SIZE(B3) + jmp .L999 + ALIGN_4 + +.L999: + popq %rbp + popq %r12 + popq %r13 + popq %r14 + popq %r15 + +#ifdef WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + + EPILOGUE diff --git a/kernel/x86_64/dgemv_n.S b/kernel/x86_64/dgemv_n.S new file mode 100644 index 0000000000..3c3cdfb07e --- /dev/null +++ b/kernel/x86_64/dgemv_n.S @@ -0,0 +1,2843 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "l2param.h" + +#if GEMV_UNROLL < 2 +#undef GEMV_UNROLL +#define GEMV_UNROLL 2 +#endif + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_A %rcx +#define OLD_LDA %r8 +#define STACK_INCX 8 + STACKSIZE(%rsp) +#define STACK_Y 16 + STACKSIZE(%rsp) +#define STACK_INCY 24 + STACKSIZE(%rsp) +#define STACK_BUFFER 32 + STACKSIZE(%rsp) +#define ALPHA 48 (%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_M %rcx +#define OLD_N %rdx +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_LDA 48 + STACKSIZE(%rsp) +#define OLD_X 56 + STACKSIZE(%rsp) +#define STACK_INCX 64 + STACKSIZE(%rsp) +#define STACK_Y 72 + STACKSIZE(%rsp) +#define STACK_INCY 80 + STACKSIZE(%rsp) +#define STACK_BUFFER 88 + STACKSIZE(%rsp) +#define ALPHA 224 (%rsp) + +#endif + +#define LDA %r8 +#define X %r9 + +#define INCX %rsi +#define INCY %rdi + +#define M %r10 +#define N %r11 +#define A %r12 +#define Y %r14 +#define BUFFER %r13 + +#define I %rax +#define A1 %rbx +#define A2 %rcx +#define LDA3 %rdx +#define Y1 %rbp + +#ifdef ALIGNED_ACCESS +#define MM %r15 +#else +#define MM M +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq OLD_M, M + movq OLD_N, N + movq OLD_A, A + movq OLD_LDA, LDA + movq OLD_X, X +#else + movq OLD_M, M + movq OLD_N, N + movq OLD_A, A + movq OLD_LDA, LDA +#endif + + movq STACK_INCX, INCX + movq STACK_Y, Y + movq STACK_INCY, INCY + movq STACK_BUFFER, BUFFER + +#ifndef WINDOWS_ABI + movsd %xmm0, ALPHA +#else + movsd %xmm3, ALPHA +#endif + + leaq -1(INCY), %rax + + leaq (,INCX, SIZE), INCX + leaq (,INCY, SIZE), INCY + leaq (,LDA, SIZE), LDA + + leaq (LDA, LDA, 2), LDA3 + + subq $-16 * SIZE, A + +#ifdef ALIGNED_ACCESS + leaq -1 (M), MM + testq $SIZE, A + cmoveq M, MM +#endif + + testq N, N # if n <= 0 goto END + jle .L999 + testq M, M # if n <= 0 goto END + jle .L999 + +#if !defined(COPY_FORCE) && !defined(ALIGNED_ACCESS) +#ifndef NOCOPY_UNALIGNED + movq Y, Y1 + andq $0xf, Y1 + orq Y1, %rax +#endif + testq %rax, %rax + cmoveq Y, BUFFER + je .L10 +#endif + + movq BUFFER, Y1 + + pxor %xmm4, %xmm4 + + movq M, %rax + addq $16, %rax + sarq $4, %rax + ALIGN_3 + +.L01: + movapd %xmm4, 0 * SIZE(Y1) + movapd %xmm4, 2 * SIZE(Y1) + movapd %xmm4, 4 * SIZE(Y1) + movapd %xmm4, 6 * SIZE(Y1) + movapd %xmm4, 8 * SIZE(Y1) + movapd %xmm4, 10 * SIZE(Y1) + movapd %xmm4, 12 * SIZE(Y1) + movapd %xmm4, 14 * SIZE(Y1) + subq $-16 * SIZE, Y1 + decq %rax + jg .L01 + ALIGN_3 + +.L10: + +#ifdef ALIGNED_ACCESS + leaq SIZE(BUFFER), %rax + testq $SIZE, A + cmovne %rax, BUFFER + + testq $SIZE, LDA + jne .L50 +#endif + +#if GEMV_UNROLL >= 8 + + cmpq $8, N + jl .L20 + ALIGN_3 + +.L11: + subq $8, N + + leaq 16 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 4), A2 + leaq (A, LDA, 8), A + +#ifdef HAVE_SSE3 + movddup (X), %xmm8 + addq INCX, X + movddup (X), %xmm9 + addq INCX, X + movddup (X), %xmm10 + addq INCX, X + movddup (X), %xmm11 + addq INCX, X + movddup (X), %xmm12 + addq INCX, X + movddup (X), %xmm13 + 
addq INCX, X + movddup (X), %xmm14 + addq INCX, X + movddup (X), %xmm15 + addq INCX, X + + movddup ALPHA, %xmm0 +#else + movsd (X), %xmm8 + unpcklpd %xmm8, %xmm8 + addq INCX, X + movsd (X), %xmm9 + unpcklpd %xmm9, %xmm9 + addq INCX, X + movsd (X), %xmm10 + unpcklpd %xmm10, %xmm10 + addq INCX, X + movsd (X), %xmm11 + unpcklpd %xmm11, %xmm11 + addq INCX, X + movsd (X), %xmm12 + unpcklpd %xmm12, %xmm12 + addq INCX, X + movsd (X), %xmm13 + unpcklpd %xmm13, %xmm13 + addq INCX, X + movsd (X), %xmm14 + unpcklpd %xmm14, %xmm14 + addq INCX, X + movsd (X), %xmm15 + unpcklpd %xmm15, %xmm15 + addq INCX, X + + movsd ALPHA, %xmm0 + unpcklpd %xmm0, %xmm0 +#endif + + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 + mulpd %xmm0, %xmm12 + mulpd %xmm0, %xmm13 + mulpd %xmm0, %xmm14 + mulpd %xmm0, %xmm15 + +#ifdef ALIGNED_ACCESS + testq $SIZE, A + je .L1X + + movsd -16 * SIZE(A1), %xmm4 + movsd -16 * SIZE(A1, LDA), %xmm5 + movsd -16 * SIZE(A1, LDA, 2), %xmm6 + movsd -16 * SIZE(A1, LDA3), %xmm7 + + movsd -16 * SIZE(Y1), %xmm0 + + mulsd %xmm8, %xmm4 + addsd %xmm4, %xmm0 + movsd -16 * SIZE(A2), %xmm4 + mulsd %xmm9, %xmm5 + addsd %xmm5, %xmm0 + movsd -16 * SIZE(A2, LDA), %xmm5 + mulsd %xmm10, %xmm6 + addsd %xmm6, %xmm0 + movsd -16 * SIZE(A2, LDA, 2), %xmm6 + mulsd %xmm11, %xmm7 + addsd %xmm7, %xmm0 + movsd -16 * SIZE(A2, LDA3), %xmm7 + + mulsd %xmm12, %xmm4 + addsd %xmm4, %xmm0 + mulsd %xmm13, %xmm5 + addsd %xmm5, %xmm0 + mulsd %xmm14, %xmm6 + addsd %xmm6, %xmm0 + mulsd %xmm15, %xmm7 + addsd %xmm7, %xmm0 + + movsd %xmm0, -16 * SIZE(Y1) + + addq $SIZE, A1 + addq $SIZE, A2 + addq $SIZE, Y1 + ALIGN_3 + +.L1X: +#endif + + movq MM, I + sarq $3, I + jle .L15 + + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + MOVUPS_A1(-14 * SIZE, A1, %xmm5) + MOVUPS_A1(-12 * SIZE, A1, %xmm6) + MOVUPS_A1(-10 * SIZE, A1, %xmm7) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-10 * SIZE, Y1, %xmm3) + + decq I + jle .L14 + ALIGN_3 + +.L13: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1) +#endif + + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4) + mulpd %xmm8, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm5) + + mulpd %xmm8, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm6) + mulpd %xmm8, %xmm7 + addpd %xmm7, %xmm3 + MOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm7) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA, 1) +#endif + + mulpd %xmm9, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A2(-16 * SIZE, A1, LDA, 2, %xmm4) + mulpd %xmm9, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_A2(-14 * SIZE, A1, LDA, 2, %xmm5) + + mulpd %xmm9, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A2(-12 * SIZE, A1, LDA, 2, %xmm6) + mulpd %xmm9, %xmm7 + addpd %xmm7, %xmm3 + MOVUPS_A2(-10 * SIZE, A1, LDA, 2, %xmm7) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA, 2) +#endif + + mulpd %xmm10, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A2(-16 * SIZE, A1, LDA3, 1, %xmm4) + mulpd %xmm10, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_A2(-14 * SIZE, A1, LDA3, 1, %xmm5) + + mulpd %xmm10, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A2(-12 * SIZE, A1, LDA3, 1, %xmm6) + mulpd %xmm10, %xmm7 + addpd %xmm7, %xmm3 + MOVUPS_A2(-10 * SIZE, A1, LDA3, 1, %xmm7) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA3) +#endif + + mulpd %xmm11, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-16 * SIZE, A2, %xmm4) + mulpd %xmm11, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_A1(-14 * SIZE, A2, %xmm5) + + mulpd 
%xmm11, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A1(-12 * SIZE, A2, %xmm6) + mulpd %xmm11, %xmm7 + addpd %xmm7, %xmm3 + MOVUPS_A1(-10 * SIZE, A2, %xmm7) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2) +#endif + + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4) + mulpd %xmm12, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm5) + + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm6) + mulpd %xmm12, %xmm7 + addpd %xmm7, %xmm3 + MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm7) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA, 1) +#endif + + mulpd %xmm13, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A2(-16 * SIZE, A2, LDA, 2, %xmm4) + mulpd %xmm13, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_A2(-14 * SIZE, A2, LDA, 2, %xmm5) + + mulpd %xmm13, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A2(-12 * SIZE, A2, LDA, 2, %xmm6) + mulpd %xmm13, %xmm7 + addpd %xmm7, %xmm3 + MOVUPS_A2(-10 * SIZE, A2, LDA, 2, %xmm7) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA, 2) +#endif + + mulpd %xmm14, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A2(-16 * SIZE, A2, LDA3, 1, %xmm4) + mulpd %xmm14, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_A2(-14 * SIZE, A2, LDA3, 1, %xmm5) + + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A2(-12 * SIZE, A2, LDA3, 1, %xmm6) + mulpd %xmm14, %xmm7 + addpd %xmm7, %xmm3 + MOVUPS_A2(-10 * SIZE, A2, LDA3, 1, %xmm7) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA3) +#endif + + mulpd %xmm15, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1( -8 * SIZE, A1, %xmm4) + mulpd %xmm15, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_A1( -6 * SIZE, A1, %xmm5) + + mulpd %xmm15, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A1( -4 * SIZE, A1, %xmm6) + mulpd %xmm15, %xmm7 + addpd %xmm7, %xmm3 + MOVUPS_A1( -2 * SIZE, A1, %xmm7) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) + MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L13 + ALIGN_3 + +.L14: + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4) + mulpd %xmm8, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm5) + + mulpd %xmm8, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm6) + mulpd %xmm8, %xmm7 + addpd %xmm7, %xmm3 + MOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm7) + + mulpd %xmm9, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A2(-16 * SIZE, A1, LDA, 2, %xmm4) + mulpd %xmm9, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_A2(-14 * SIZE, A1, LDA, 2, %xmm5) + + mulpd %xmm9, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A2(-12 * SIZE, A1, LDA, 2, %xmm6) + mulpd %xmm9, %xmm7 + addpd %xmm7, %xmm3 + MOVUPS_A2(-10 * SIZE, A1, LDA, 2, %xmm7) + + mulpd %xmm10, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A2(-16 * SIZE, A1, LDA3, 1, %xmm4) + mulpd %xmm10, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_A2(-14 * SIZE, A1, LDA3, 1, %xmm5) + + mulpd %xmm10, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A2(-12 * SIZE, A1, LDA3, 1, %xmm6) + mulpd %xmm10, %xmm7 + addpd %xmm7, %xmm3 + MOVUPS_A2(-10 * SIZE, A1, LDA3, 1, %xmm7) + + mulpd %xmm11, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-16 * SIZE, A2, %xmm4) + mulpd %xmm11, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_A1(-14 * SIZE, A2, %xmm5) + + mulpd %xmm11, %xmm6 + addpd 
%xmm6, %xmm2 + MOVUPS_A1(-12 * SIZE, A2, %xmm6) + mulpd %xmm11, %xmm7 + addpd %xmm7, %xmm3 + MOVUPS_A1(-10 * SIZE, A2, %xmm7) + + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4) + mulpd %xmm12, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm5) + + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm6) + mulpd %xmm12, %xmm7 + addpd %xmm7, %xmm3 + MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm7) + + mulpd %xmm13, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A2(-16 * SIZE, A2, LDA, 2, %xmm4) + mulpd %xmm13, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_A2(-14 * SIZE, A2, LDA, 2, %xmm5) + + mulpd %xmm13, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A2(-12 * SIZE, A2, LDA, 2, %xmm6) + mulpd %xmm13, %xmm7 + addpd %xmm7, %xmm3 + MOVUPS_A2(-10 * SIZE, A2, LDA, 2, %xmm7) + + mulpd %xmm14, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A2(-16 * SIZE, A2, LDA3, 1, %xmm4) + mulpd %xmm14, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_A2(-14 * SIZE, A2, LDA3, 1, %xmm5) + + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A2(-12 * SIZE, A2, LDA3, 1, %xmm6) + mulpd %xmm14, %xmm7 + addpd %xmm7, %xmm3 + MOVUPS_A2(-10 * SIZE, A2, LDA3, 1, %xmm7) + + mulpd %xmm15, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + mulpd %xmm15, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + + mulpd %xmm15, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + mulpd %xmm15, %xmm7 + addpd %xmm7, %xmm3 + MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + ALIGN_3 + +.L15: + testq $4, MM + je .L16 + + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + MOVUPS_A1(-14 * SIZE, A1, %xmm5) + MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm6) + MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm7) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-14 * SIZE, Y1, %xmm1) + + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A2(-16 * SIZE, A1, LDA, 2, %xmm4) + mulpd %xmm8, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_A2(-14 * SIZE, A1, LDA, 2, %xmm5) + + mulpd %xmm9, %xmm6 + addpd %xmm6, %xmm0 + MOVUPS_A2(-16 * SIZE, A1, LDA3, 1, %xmm6) + mulpd %xmm9, %xmm7 + addpd %xmm7, %xmm1 + MOVUPS_A2(-14 * SIZE, A1, LDA3, 1, %xmm7) + + mulpd %xmm10, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-16 * SIZE, A2, %xmm4) + mulpd %xmm10, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_A1(-14 * SIZE, A2, %xmm5) + + mulpd %xmm11, %xmm6 + addpd %xmm6, %xmm0 + MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm6) + mulpd %xmm11, %xmm7 + addpd %xmm7, %xmm1 + MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm7) + + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A2(-16 * SIZE, A2, LDA, 2, %xmm4) + mulpd %xmm12, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_A2(-14 * SIZE, A2, LDA, 2, %xmm5) + + mulpd %xmm13, %xmm6 + addpd %xmm6, %xmm0 + MOVUPS_A2(-16 * SIZE, A2, LDA3, 1, %xmm6) + mulpd %xmm13, %xmm7 + addpd %xmm7, %xmm1 + MOVUPS_A2(-14 * SIZE, A2, LDA3, 1, %xmm7) + + mulpd %xmm14, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm14, %xmm5 + addpd %xmm5, %xmm1 + + mulpd %xmm15, %xmm6 + addpd %xmm6, %xmm0 + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + mulpd %xmm15, %xmm7 + addpd %xmm7, %xmm1 + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L16: + testq $2, MM + je .L17 + + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm5) + MOVUPS_A2(-16 * SIZE, A1, LDA, 2, %xmm6) + MOVUPS_A2(-16 * SIZE, A1, LDA3, 1, %xmm7) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-16 * SIZE, A2, %xmm4) + mulpd %xmm9, %xmm5 + addpd %xmm5, %xmm0 + 
MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm5) + + mulpd %xmm10, %xmm6 + addpd %xmm6, %xmm0 + MOVUPS_A2(-16 * SIZE, A2, LDA, 2, %xmm6) + mulpd %xmm11, %xmm7 + addpd %xmm7, %xmm0 + MOVUPS_A2(-16 * SIZE, A2, LDA3, 1, %xmm7) + + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm13, %xmm5 + addpd %xmm5, %xmm0 + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm15, %xmm7 + addpd %xmm7, %xmm0 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L17: + testq $1, MM + je .L18 + + movsd -16 * SIZE(A1), %xmm4 + movsd -16 * SIZE(A1, LDA), %xmm5 + movsd -16 * SIZE(A1, LDA, 2), %xmm6 + movsd -16 * SIZE(A1, LDA3), %xmm7 + + movsd -16 * SIZE(Y1), %xmm0 + + mulsd %xmm8, %xmm4 + addsd %xmm4, %xmm0 + movsd -16 * SIZE(A2), %xmm4 + mulsd %xmm9, %xmm5 + addsd %xmm5, %xmm0 + movsd -16 * SIZE(A2, LDA), %xmm5 + mulsd %xmm10, %xmm6 + addsd %xmm6, %xmm0 + movsd -16 * SIZE(A2, LDA, 2), %xmm6 + mulsd %xmm11, %xmm7 + addsd %xmm7, %xmm0 + movsd -16 * SIZE(A2, LDA3), %xmm7 + + mulsd %xmm12, %xmm4 + addsd %xmm4, %xmm0 + mulsd %xmm13, %xmm5 + addsd %xmm5, %xmm0 + mulsd %xmm14, %xmm6 + addsd %xmm6, %xmm0 + mulsd %xmm15, %xmm7 + addsd %xmm7, %xmm0 + + movsd %xmm0, -16 * SIZE(Y1) + ALIGN_3 + +.L18: + cmpq $8, N + jge .L11 + ALIGN_3 + +.L20: +#endif + +#if GEMV_UNROLL >= 4 + + cmpq $4, N + jl .L30 + +#if GEMV_UNROLL == 4 + ALIGN_3 + +.L21: +#endif + + subq $4, N + + leaq 16 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 2), A2 + leaq (A, LDA, 4), A + +#ifdef HAVE_SSE3 + movddup (X), %xmm12 + addq INCX, X + movddup (X), %xmm13 + addq INCX, X + movddup (X), %xmm14 + addq INCX, X + movddup (X), %xmm15 + addq INCX, X + + movddup ALPHA, %xmm0 +#else + movsd (X), %xmm12 + unpcklpd %xmm12, %xmm12 + addq INCX, X + movsd (X), %xmm13 + unpcklpd %xmm13, %xmm13 + addq INCX, X + movsd (X), %xmm14 + unpcklpd %xmm14, %xmm14 + addq INCX, X + movsd (X), %xmm15 + unpcklpd %xmm15, %xmm15 + addq INCX, X + + movsd ALPHA, %xmm0 + unpcklpd %xmm0, %xmm0 +#endif + + mulpd %xmm0, %xmm12 + mulpd %xmm0, %xmm13 + mulpd %xmm0, %xmm14 + mulpd %xmm0, %xmm15 + +#ifdef ALIGNED_ACCESS + testq $SIZE, A + je .L2X + + movsd -16 * SIZE(A1), %xmm4 + movsd -16 * SIZE(A1, LDA), %xmm5 + movsd -16 * SIZE(A2), %xmm6 + movsd -16 * SIZE(A2, LDA), %xmm7 + + movsd -16 * SIZE(Y1), %xmm0 + + mulsd %xmm12, %xmm4 + addsd %xmm4, %xmm0 + mulsd %xmm13, %xmm5 + addsd %xmm5, %xmm0 + mulsd %xmm14, %xmm6 + addsd %xmm6, %xmm0 + mulsd %xmm15, %xmm7 + addsd %xmm7, %xmm0 + + movsd %xmm0, -16 * SIZE(Y1) + + addq $SIZE, A1 + addq $SIZE, A2 + addq $SIZE, Y1 + ALIGN_3 + +.L2X: +#endif + + movq MM, I + sarq $3, I + jle .L25 + + MOVUPS_A1(-16 * SIZE, A1, %xmm0) + MOVUPS_A1(-14 * SIZE, A1, %xmm1) + MOVUPS_A1(-12 * SIZE, A1, %xmm2) + MOVUPS_A1(-10 * SIZE, A1, %xmm3) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm8) + MOVUPS_YL1(-14 * SIZE, Y1, %xmm9) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm10) + MOVUPS_YL1(-10 * SIZE, Y1, %xmm11) + + MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4) + MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm5) + MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm6) + MOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm7) + + decq I + jle .L24 + ALIGN_3 + +.L23: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) +#endif + + mulpd %xmm12, %xmm0 + addpd %xmm0, %xmm8 + MOVUPS_A1(-16 * SIZE, A2, %xmm0) + mulpd %xmm12, %xmm1 + addpd %xmm1, %xmm9 + MOVUPS_A1(-14 * SIZE, A2, %xmm1) + + mulpd %xmm12, %xmm2 + addpd %xmm2, %xmm10 + MOVUPS_A1(-12 * SIZE, A2, %xmm2) + mulpd %xmm12, %xmm3 + addpd %xmm3, %xmm11 + MOVUPS_A1(-10 * SIZE, A2, %xmm3) + +#ifdef PREFETCH + 
PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1, LDA) +#endif + + mulpd %xmm13, %xmm4 + addpd %xmm4, %xmm8 + MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4) + mulpd %xmm13, %xmm5 + addpd %xmm5, %xmm9 + MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm5) + + mulpd %xmm13, %xmm6 + addpd %xmm6, %xmm10 + MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm6) + mulpd %xmm13, %xmm7 + addpd %xmm7, %xmm11 + MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm7) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) +#endif + + mulpd %xmm14, %xmm0 + addpd %xmm0, %xmm8 + MOVUPS_A1( -8 * SIZE, A1, %xmm0) + mulpd %xmm14, %xmm1 + addpd %xmm1, %xmm9 + MOVUPS_A1( -6 * SIZE, A1, %xmm1) + + mulpd %xmm14, %xmm2 + addpd %xmm2, %xmm10 + MOVUPS_A1( -4 * SIZE, A1, %xmm2) + mulpd %xmm14, %xmm3 + addpd %xmm3, %xmm11 + MOVUPS_A1( -2 * SIZE, A1, %xmm3) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2, LDA) +#endif + + mulpd %xmm15, %xmm4 + addpd %xmm4, %xmm8 + MOVUPS_A2( -8 * SIZE, A1, LDA, 1, %xmm4) + mulpd %xmm15, %xmm5 + addpd %xmm5, %xmm9 + MOVUPS_A2( -6 * SIZE, A1, LDA, 1, %xmm5) + + mulpd %xmm15, %xmm6 + addpd %xmm6, %xmm10 + MOVUPS_A2( -4 * SIZE, A1, LDA, 1, %xmm6) + mulpd %xmm15, %xmm7 + addpd %xmm7, %xmm11 + MOVUPS_A2( -2 * SIZE, A1, LDA, 1, %xmm7) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 2 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm8) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm9) + MOVUPS_YS1(-12 * SIZE, Y1, %xmm10) + MOVUPS_YS1(-10 * SIZE, Y1, %xmm11) + + MOVUPS_YL1( -8 * SIZE, Y1, %xmm8) + MOVUPS_YL1( -6 * SIZE, Y1, %xmm9) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm10) + MOVUPS_YL1( -2 * SIZE, Y1, %xmm11) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L23 + ALIGN_3 + +.L24: + mulpd %xmm12, %xmm0 + addpd %xmm0, %xmm8 + MOVUPS_A1(-16 * SIZE, A2, %xmm0) + mulpd %xmm12, %xmm1 + addpd %xmm1, %xmm9 + MOVUPS_A1(-14 * SIZE, A2, %xmm1) + + mulpd %xmm12, %xmm2 + addpd %xmm2, %xmm10 + MOVUPS_A1(-12 * SIZE, A2, %xmm2) + mulpd %xmm12, %xmm3 + addpd %xmm3, %xmm11 + MOVUPS_A1(-10 * SIZE, A2, %xmm3) + + mulpd %xmm13, %xmm4 + addpd %xmm4, %xmm8 + MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4) + mulpd %xmm13, %xmm5 + addpd %xmm5, %xmm9 + MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm5) + + mulpd %xmm13, %xmm6 + addpd %xmm6, %xmm10 + MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm6) + mulpd %xmm13, %xmm7 + addpd %xmm7, %xmm11 + MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm7) + + mulpd %xmm14, %xmm0 + addpd %xmm0, %xmm8 + mulpd %xmm14, %xmm1 + addpd %xmm1, %xmm9 + + mulpd %xmm14, %xmm2 + addpd %xmm2, %xmm10 + mulpd %xmm14, %xmm3 + addpd %xmm3, %xmm11 + + mulpd %xmm15, %xmm4 + addpd %xmm4, %xmm8 + MOVUPS_YS1(-16 * SIZE, Y1, %xmm8) + mulpd %xmm15, %xmm5 + addpd %xmm5, %xmm9 + MOVUPS_YS1(-14 * SIZE, Y1, %xmm9) + + mulpd %xmm15, %xmm6 + addpd %xmm6, %xmm10 + MOVUPS_YS1(-12 * SIZE, Y1, %xmm10) + mulpd %xmm15, %xmm7 + addpd %xmm7, %xmm11 + MOVUPS_YS1(-10 * SIZE, Y1, %xmm11) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + ALIGN_3 + +.L25: + testq $4, MM + je .L26 + + MOVUPS_A1(-16 * SIZE, A1, %xmm0) + MOVUPS_A1(-14 * SIZE, A1, %xmm1) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm8) + MOVUPS_YL1(-14 * SIZE, Y1, %xmm9) + + mulpd %xmm12, %xmm0 + addpd %xmm0, %xmm8 + mulpd %xmm12, %xmm1 + addpd %xmm1, %xmm9 + + MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4) + MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm5) + + mulpd %xmm13, %xmm4 + addpd %xmm4, %xmm8 + mulpd %xmm13, %xmm5 + addpd %xmm5, %xmm9 + + MOVUPS_A1(-16 * SIZE, A2, %xmm0) + MOVUPS_A1(-14 * SIZE, A2, %xmm1) + + mulpd %xmm14, %xmm0 + addpd %xmm0, %xmm8 + 
mulpd %xmm14, %xmm1 + addpd %xmm1, %xmm9 + + MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4) + MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm5) + + mulpd %xmm15, %xmm4 + addpd %xmm4, %xmm8 + mulpd %xmm15, %xmm5 + addpd %xmm5, %xmm9 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm8) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm9) + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L26: + testq $2, MM + je .L27 + + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm9) + MOVUPS_A1(-16 * SIZE, A2, %xmm10) + MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm11) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm0 + mulpd %xmm14, %xmm10 + addpd %xmm10, %xmm0 + mulpd %xmm15, %xmm11 + addpd %xmm11, %xmm0 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L27: + testq $1, MM +#if GEMV_UNROLL == 4 + je .L28 +#else + je .L30 +#endif + + movsd -16 * SIZE(Y1), %xmm0 + + movsd -16 * SIZE(A1), %xmm8 + movsd -16 * SIZE(A1, LDA), %xmm9 + movsd -16 * SIZE(A2), %xmm10 + movsd -16 * SIZE(A2, LDA), %xmm11 + + mulsd %xmm12, %xmm8 + addsd %xmm8, %xmm0 + mulsd %xmm13, %xmm9 + addsd %xmm9, %xmm0 + mulsd %xmm14, %xmm10 + addsd %xmm10, %xmm0 + mulsd %xmm15, %xmm11 + addsd %xmm11, %xmm0 + + movsd %xmm0, -16 * SIZE(Y1) + ALIGN_3 + +#if GEMV_UNROLL == 4 +.L28: + cmpq $4, N + jge .L21 + ALIGN_3 + +#endif + +.L30: +#endif + +#if GEMV_UNROLL >= 2 + + cmpq $2, N + jl .L40 + +#if GEMV_UNROLL == 2 + ALIGN_3 + +.L31: +#endif + + subq $2, N + + leaq 16 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA), A2 + leaq (A, LDA, 2), A + +#ifdef HAVE_SSE3 + movddup (X), %xmm12 + addq INCX, X + movddup (X), %xmm13 + addq INCX, X + + movddup ALPHA, %xmm0 +#else + movsd (X), %xmm12 + unpcklpd %xmm12, %xmm12 + addq INCX, X + movsd (X), %xmm13 + unpcklpd %xmm13, %xmm13 + addq INCX, X + + movsd ALPHA, %xmm0 + unpcklpd %xmm0, %xmm0 +#endif + + mulpd %xmm0, %xmm12 + mulpd %xmm0, %xmm13 + +#ifdef ALIGNED_ACCESS + testq $SIZE, A + je .L3X + + movsd -16 * SIZE(A1), %xmm4 + movsd -16 * SIZE(A2), %xmm5 + + movsd -16 * SIZE(Y1), %xmm0 + + mulsd %xmm12, %xmm4 + addsd %xmm4, %xmm0 + mulsd %xmm13, %xmm5 + addsd %xmm5, %xmm0 + + movsd %xmm0, -16 * SIZE(Y1) + + addq $SIZE, A1 + addq $SIZE, A2 + addq $SIZE, Y1 + ALIGN_3 + +.L3X: +#endif + + movq MM, I + sarq $3, I + jle .L35 + + MOVUPS_A1(-16 * SIZE, A1, %xmm0) + MOVUPS_A1(-14 * SIZE, A1, %xmm1) + MOVUPS_A1(-12 * SIZE, A1, %xmm2) + MOVUPS_A1(-10 * SIZE, A1, %xmm3) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm8) + MOVUPS_YL1(-14 * SIZE, Y1, %xmm9) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm10) + MOVUPS_YL1(-10 * SIZE, Y1, %xmm11) + + MOVUPS_A1(-16 * SIZE, A2, %xmm4) + MOVUPS_A1(-14 * SIZE, A2, %xmm5) + MOVUPS_A1(-12 * SIZE, A2, %xmm6) + MOVUPS_A1(-10 * SIZE, A2, %xmm7) + + decq I + jle .L34 + ALIGN_3 + +.L33: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) +#endif + + mulpd %xmm12, %xmm0 + addpd %xmm0, %xmm8 + MOVUPS_A1( -8 * SIZE, A1, %xmm0) + mulpd %xmm12, %xmm1 + addpd %xmm1, %xmm9 + MOVUPS_A1( -6 * SIZE, A1, %xmm1) + + mulpd %xmm12, %xmm2 + addpd %xmm2, %xmm10 + MOVUPS_A1( -4 * SIZE, A1, %xmm2) + mulpd %xmm12, %xmm3 + addpd %xmm3, %xmm11 + MOVUPS_A1( -2 * SIZE, A1, %xmm3) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A2) +#endif + + mulpd %xmm13, %xmm4 + addpd %xmm4, %xmm8 + MOVUPS_A1( -8 * SIZE, A2, %xmm4) + mulpd %xmm13, %xmm5 + addpd %xmm5, %xmm9 + MOVUPS_A1( -6 * SIZE, A2, %xmm5) + + mulpd %xmm13, %xmm6 + addpd %xmm6, %xmm10 + MOVUPS_A1( -4 * 
SIZE, A2, %xmm6) + mulpd %xmm13, %xmm7 + addpd %xmm7, %xmm11 + MOVUPS_A1( -2 * SIZE, A2, %xmm7) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 4 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm8) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm9) + MOVUPS_YS1(-12 * SIZE, Y1, %xmm10) + MOVUPS_YS1(-10 * SIZE, Y1, %xmm11) + + MOVUPS_YL1( -8 * SIZE, Y1, %xmm8) + MOVUPS_YL1( -6 * SIZE, Y1, %xmm9) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm10) + MOVUPS_YL1( -2 * SIZE, Y1, %xmm11) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L33 + ALIGN_3 + +.L34: + mulpd %xmm12, %xmm0 + addpd %xmm0, %xmm8 + mulpd %xmm12, %xmm1 + addpd %xmm1, %xmm9 + mulpd %xmm12, %xmm2 + addpd %xmm2, %xmm10 + mulpd %xmm12, %xmm3 + addpd %xmm3, %xmm11 + + mulpd %xmm13, %xmm4 + addpd %xmm4, %xmm8 + MOVUPS_YS1(-16 * SIZE, Y1, %xmm8) + mulpd %xmm13, %xmm5 + addpd %xmm5, %xmm9 + MOVUPS_YS1(-14 * SIZE, Y1, %xmm9) + mulpd %xmm13, %xmm6 + addpd %xmm6, %xmm10 + MOVUPS_YS1(-12 * SIZE, Y1, %xmm10) + mulpd %xmm13, %xmm7 + addpd %xmm7, %xmm11 + MOVUPS_YS1(-10 * SIZE, Y1, %xmm11) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + ALIGN_3 + +.L35: + testq $4, MM + je .L36 + + + MOVUPS_A1(-16 * SIZE, A1, %xmm0) + MOVUPS_A1(-14 * SIZE, A1, %xmm1) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm8) + MOVUPS_YL1(-14 * SIZE, Y1, %xmm9) + + mulpd %xmm12, %xmm0 + addpd %xmm0, %xmm8 + mulpd %xmm12, %xmm1 + addpd %xmm1, %xmm9 + + MOVUPS_A1(-16 * SIZE, A2, %xmm4) + MOVUPS_A1(-14 * SIZE, A2, %xmm5) + + mulpd %xmm13, %xmm4 + addpd %xmm4, %xmm8 + MOVUPS_YS1(-16 * SIZE, Y1, %xmm8) + mulpd %xmm13, %xmm5 + addpd %xmm5, %xmm9 + MOVUPS_YS1(-14 * SIZE, Y1, %xmm9) + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L36: + testq $2, MM + je .L37 + + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + MOVUPS_A1(-16 * SIZE, A2, %xmm9) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm0 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L37: + testq $1, MM +#if GEMV_UNROLL == 2 + je .L38 +#else + je .L40 +#endif + + movsd -16 * SIZE(Y1), %xmm0 + + movsd -16 * SIZE(A1), %xmm8 + movsd -16 * SIZE(A2), %xmm9 + + mulsd %xmm12, %xmm8 + addsd %xmm8, %xmm0 + mulsd %xmm13, %xmm9 + addsd %xmm9, %xmm0 + + movsd %xmm0, -16 * SIZE(Y1) + ALIGN_3 + +#if GEMV_UNROLL == 2 +.L38: + cmpq $2, N + jge .L31 + ALIGN_3 + +#endif + +.L40: + cmpq $1, N + jl .L900 +#endif + + leaq 16 * SIZE(BUFFER), Y1 + movq A, A1 + +#ifdef HAVE_SSE3 + movddup (X), %xmm12 + addq INCX, X + + movddup ALPHA, %xmm0 +#else + movsd (X), %xmm12 + unpcklpd %xmm12, %xmm12 + addq INCX, X + + movsd ALPHA, %xmm0 + unpcklpd %xmm0, %xmm0 +#endif + + mulpd %xmm0, %xmm12 + +#ifdef ALIGNED_ACCESS + testq $SIZE, A + je .L4X + + movsd -16 * SIZE(A1), %xmm4 + movsd -16 * SIZE(Y1), %xmm0 + + mulsd %xmm12, %xmm4 + addsd %xmm4, %xmm0 + + movsd %xmm0, -16 * SIZE(Y1) + + addq $SIZE, A1 + addq $SIZE, Y1 + ALIGN_3 + +.L4X: +#endif + + movq MM, I + sarq $3, I + jle .L45 + + MOVUPS_A1(-16 * SIZE, A1, %xmm0) + MOVUPS_A1(-14 * SIZE, A1, %xmm1) + MOVUPS_A1(-12 * SIZE, A1, %xmm2) + MOVUPS_A1(-10 * SIZE, A1, %xmm3) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm8) + MOVUPS_YL1(-14 * SIZE, Y1, %xmm9) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm10) + MOVUPS_YL1(-10 * SIZE, Y1, %xmm11) + + decq I + jle .L44 + ALIGN_3 + +.L43: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(A1) +#endif + + mulpd %xmm12, %xmm0 + addpd %xmm0, %xmm8 + MOVUPS_A1( -8 * SIZE, 
A1, %xmm0) + mulpd %xmm12, %xmm1 + addpd %xmm1, %xmm9 + MOVUPS_A1( -6 * SIZE, A1, %xmm1) + + mulpd %xmm12, %xmm2 + addpd %xmm2, %xmm10 + MOVUPS_A1( -4 * SIZE, A1, %xmm2) + mulpd %xmm12, %xmm3 + addpd %xmm3, %xmm11 + MOVUPS_A1( -2 * SIZE, A1, %xmm3) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 8 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm8) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm9) + MOVUPS_YS1(-12 * SIZE, Y1, %xmm10) + MOVUPS_YS1(-10 * SIZE, Y1, %xmm11) + + MOVUPS_YL1( -8 * SIZE, Y1, %xmm8) + MOVUPS_YL1( -6 * SIZE, Y1, %xmm9) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm10) + MOVUPS_YL1( -2 * SIZE, Y1, %xmm11) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L43 + ALIGN_3 + +.L44: + mulpd %xmm12, %xmm0 + addpd %xmm0, %xmm8 + MOVUPS_YS1(-16 * SIZE, Y1, %xmm8) + mulpd %xmm12, %xmm1 + addpd %xmm1, %xmm9 + MOVUPS_YS1(-14 * SIZE, Y1, %xmm9) + mulpd %xmm12, %xmm2 + addpd %xmm2, %xmm10 + MOVUPS_YS1(-12 * SIZE, Y1, %xmm10) + mulpd %xmm12, %xmm3 + addpd %xmm3, %xmm11 + MOVUPS_YS1(-10 * SIZE, Y1, %xmm11) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, Y1 + ALIGN_3 + +.L45: + testq $4, MM + je .L46 + + MOVUPS_A1(-16 * SIZE, A1, %xmm0) + MOVUPS_A1(-14 * SIZE, A1, %xmm1) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm8) + MOVUPS_YL1(-14 * SIZE, Y1, %xmm9) + + mulpd %xmm12, %xmm0 + addpd %xmm0, %xmm8 + MOVUPS_YS1(-16 * SIZE, Y1, %xmm8) + mulpd %xmm12, %xmm1 + addpd %xmm1, %xmm9 + MOVUPS_YS1(-14 * SIZE, Y1, %xmm9) + + addq $4 * SIZE, A1 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L46: + testq $2, MM + je .L47 + + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + + addq $2 * SIZE, A1 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L47: + testq $1, MM + je .L900 + + movsd -16 * SIZE(Y1), %xmm0 + movsd -16 * SIZE(A1), %xmm8 + + mulsd %xmm12, %xmm8 + addsd %xmm8, %xmm0 + + movsd %xmm0, -16 * SIZE(Y1) + ALIGN_3 + +#ifdef ALIGNED_ACCESS + jmp .L900 + ALIGN_3 + +.L50: +#if GEMV_UNROLL >= 4 + + cmpq $4, N + jl .L60 + ALIGN_3 + +.L51: + + subq $4, N + + leaq 16 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 2), A2 + leaq (A, LDA, 4), A + +#ifdef HAVE_SSE3 + movddup (X), %xmm12 + addq INCX, X + movddup (X), %xmm13 + addq INCX, X + movddup (X), %xmm14 + addq INCX, X + movddup (X), %xmm15 + addq INCX, X + + movddup ALPHA, %xmm0 +#else + movsd (X), %xmm12 + unpcklpd %xmm12, %xmm12 + addq INCX, X + movsd (X), %xmm13 + unpcklpd %xmm13, %xmm13 + addq INCX, X + movsd (X), %xmm14 + unpcklpd %xmm14, %xmm14 + addq INCX, X + movsd (X), %xmm15 + unpcklpd %xmm15, %xmm15 + addq INCX, X + + movsd ALPHA, %xmm0 + unpcklpd %xmm0, %xmm0 +#endif + + mulpd %xmm0, %xmm12 + mulpd %xmm0, %xmm13 + mulpd %xmm0, %xmm14 + mulpd %xmm0, %xmm15 + + testq $SIZE, A + je .L5X + + movsd -16 * SIZE(A1), %xmm4 + movsd -16 * SIZE(A1, LDA), %xmm5 + movsd -16 * SIZE(A2), %xmm6 + movsd -16 * SIZE(A2, LDA), %xmm7 + + movsd -16 * SIZE(Y1), %xmm0 + + mulsd %xmm12, %xmm4 + addsd %xmm4, %xmm0 + mulsd %xmm13, %xmm5 + addsd %xmm5, %xmm0 + mulsd %xmm14, %xmm6 + addsd %xmm6, %xmm0 + mulsd %xmm15, %xmm7 + addsd %xmm7, %xmm0 + + movsd %xmm0, -16 * SIZE(Y1) + + addq $SIZE, A1 + addq $SIZE, A2 + addq $SIZE, Y1 + ALIGN_3 + +.L5X: + movhpd -16 * SIZE(A1, LDA), %xmm8 + movhpd -16 * SIZE(A2, LDA), %xmm9 + + movq MM, I + sarq $3, I + jle .L55 + + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + MOVUPS_A1(-14 * SIZE, A1, %xmm5) + MOVUPS_A1(-12 * SIZE, A1, %xmm6) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm2) + 
MOVUPS_YL1(-10 * SIZE, Y1, %xmm3) + + decq I + jle .L54 + ALIGN_3 + +.L53: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) +#endif + + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-10 * SIZE, A1, %xmm7) + mulpd %xmm12, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm4) + + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm5) + mulpd %xmm12, %xmm7 + addpd %xmm7, %xmm3 + MOVUPS_A2(-11 * SIZE, A1, LDA, 1, %xmm6) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET + 8(A1, LDA) +#endif + + shufpd $1, %xmm4, %xmm8 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A2( -9 * SIZE, A1, LDA, 1, %xmm8) + shufpd $1, %xmm5, %xmm4 + mulpd %xmm13, %xmm4 + addpd %xmm4, %xmm1 + MOVUPS_A1(-16 * SIZE, A2, %xmm4) + + shufpd $1, %xmm6, %xmm5 + mulpd %xmm13, %xmm5 + addpd %xmm5, %xmm2 + MOVUPS_A1(-14 * SIZE, A2, %xmm5) + shufpd $1, %xmm8, %xmm6 + mulpd %xmm13, %xmm6 + addpd %xmm6, %xmm3 + MOVUPS_A1(-12 * SIZE, A2, %xmm6) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) +#endif + + mulpd %xmm14, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-10 * SIZE, A2, %xmm7) + mulpd %xmm14, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm4) + + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm5) + mulpd %xmm14, %xmm7 + addpd %xmm7, %xmm3 + MOVUPS_A2(-11 * SIZE, A2, LDA, 1, %xmm6) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET + 8(A2, LDA) +#endif + + shufpd $1, %xmm4, %xmm9 + mulpd %xmm15, %xmm9 + addpd %xmm9, %xmm0 + MOVUPS_A2( -9 * SIZE, A2, LDA, 1, %xmm9) + shufpd $1, %xmm5, %xmm4 + mulpd %xmm15, %xmm4 + addpd %xmm4, %xmm1 + MOVUPS_A1( -8 * SIZE, A1, %xmm4) + + shufpd $1, %xmm6, %xmm5 + mulpd %xmm15, %xmm5 + addpd %xmm5, %xmm2 + MOVUPS_A1( -6 * SIZE, A1, %xmm5) + shufpd $1, %xmm9, %xmm6 + mulpd %xmm15, %xmm6 + addpd %xmm6, %xmm3 + MOVUPS_A1( -4 * SIZE, A1, %xmm6) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 2 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) + MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L53 + ALIGN_3 + +.L54: + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-10 * SIZE, A1, %xmm7) + mulpd %xmm12, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm4) + + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm5) + mulpd %xmm12, %xmm7 + addpd %xmm7, %xmm3 + MOVUPS_A2(-11 * SIZE, A1, LDA, 1, %xmm6) + + shufpd $1, %xmm4, %xmm8 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A2( -9 * SIZE, A1, LDA, 1, %xmm8) + shufpd $1, %xmm5, %xmm4 + mulpd %xmm13, %xmm4 + addpd %xmm4, %xmm1 + MOVUPS_A1(-16 * SIZE, A2, %xmm4) + + shufpd $1, %xmm6, %xmm5 + mulpd %xmm13, %xmm5 + addpd %xmm5, %xmm2 + MOVUPS_A1(-14 * SIZE, A2, %xmm5) + shufpd $1, %xmm8, %xmm6 + mulpd %xmm13, %xmm6 + addpd %xmm6, %xmm3 + MOVUPS_A1(-12 * SIZE, A2, %xmm6) + + mulpd %xmm14, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-10 * SIZE, A2, %xmm7) + mulpd %xmm14, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm4) + + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm5) + mulpd %xmm14, %xmm7 + addpd %xmm7, %xmm3 + MOVUPS_A2(-11 * SIZE, A2, LDA, 1, %xmm6) + + shufpd 
$1, %xmm4, %xmm9 + mulpd %xmm15, %xmm9 + addpd %xmm9, %xmm0 + MOVUPS_A2( -9 * SIZE, A2, LDA, 1, %xmm9) + + shufpd $1, %xmm5, %xmm4 + mulpd %xmm15, %xmm4 + addpd %xmm4, %xmm1 + shufpd $1, %xmm6, %xmm5 + mulpd %xmm15, %xmm5 + addpd %xmm5, %xmm2 + shufpd $1, %xmm9, %xmm6 + mulpd %xmm15, %xmm6 + addpd %xmm6, %xmm3 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + ALIGN_3 + +.L55: + testq $4, MM + je .L56 + + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + MOVUPS_A1(-14 * SIZE, A1, %xmm5) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-14 * SIZE, Y1, %xmm1) + + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm12, %xmm5 + addpd %xmm5, %xmm1 + + MOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm6) + MOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm7) + + shufpd $1, %xmm6, %xmm8 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + movaps %xmm7, %xmm8 + shufpd $1, %xmm7, %xmm6 + mulpd %xmm13, %xmm6 + addpd %xmm6, %xmm1 + + MOVUPS_A1(-16 * SIZE, A2, %xmm4) + MOVUPS_A1(-14 * SIZE, A2, %xmm5) + + mulpd %xmm14, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm14, %xmm5 + addpd %xmm5, %xmm1 + + MOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm6) + MOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm7) + + shufpd $1, %xmm6, %xmm9 + mulpd %xmm15, %xmm9 + addpd %xmm9, %xmm0 + movaps %xmm7, %xmm9 + shufpd $1, %xmm7, %xmm6 + mulpd %xmm15, %xmm6 + addpd %xmm6, %xmm1 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L56: + testq $2, MM + je .L57 + + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + MOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm5) + MOVUPS_A1(-16 * SIZE, A2, %xmm6) + MOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm7) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + shufpd $1, %xmm5, %xmm8 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + movaps %xmm5, %xmm8 + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm0 + shufpd $1, %xmm7, %xmm9 + mulpd %xmm15, %xmm9 + addpd %xmm9, %xmm0 + movaps %xmm7, %xmm9 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L57: + testq $1, MM + je .L58 + + movsd -16 * SIZE(Y1), %xmm0 + + movsd -16 * SIZE(A1), %xmm4 + shufpd $1, %xmm8, %xmm8 + movsd -16 * SIZE(A2), %xmm6 + shufpd $1, %xmm9, %xmm9 + + mulsd %xmm12, %xmm4 + addsd %xmm4, %xmm0 + mulsd %xmm13, %xmm8 + addsd %xmm8, %xmm0 + mulsd %xmm14, %xmm6 + addsd %xmm6, %xmm0 + mulsd %xmm15, %xmm9 + addsd %xmm9, %xmm0 + + movsd %xmm0, -16 * SIZE(Y1) + ALIGN_3 + +.L58: + cmpq $4, N + jge .L51 + ALIGN_3 + +.L60: +#endif + +#if GEMV_UNROLL >= 2 + + cmpq $2, N + jl .L70 + +#if GEMV_UNROLL == 2 + ALIGN_3 + +.L61: +#endif + + subq $2, N + + leaq 16 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA), A2 + leaq (A, LDA, 2), A + +#ifdef HAVE_SSE3 + movddup (X), %xmm12 + addq INCX, X + movddup (X), %xmm13 + addq INCX, X + + movddup ALPHA, %xmm0 +#else + movsd (X), %xmm12 + unpcklpd %xmm12, %xmm12 + addq INCX, X + movsd (X), %xmm13 + unpcklpd %xmm13, %xmm13 + addq INCX, X + + movsd ALPHA, %xmm0 + unpcklpd %xmm0, %xmm0 +#endif + + mulpd %xmm0, %xmm12 + mulpd %xmm0, %xmm13 + + testq $SIZE, A + je .L6X + + movsd -16 * SIZE(A1), %xmm4 + movsd -16 * SIZE(A2), %xmm5 + + movsd -16 * SIZE(Y1), %xmm0 + + mulsd %xmm12, %xmm4 + addsd %xmm4, %xmm0 + mulsd %xmm13, %xmm5 + addsd %xmm5, %xmm0 + + movsd %xmm0, -16 * SIZE(Y1) + + addq $SIZE, A1 + addq $SIZE, A2 + addq $SIZE, Y1 + ALIGN_3 + +.L6X: + 
movhpd -16 * SIZE(A2), %xmm8 + + movq MM, I + sarq $3, I + jle .L65 + + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + MOVUPS_A1(-14 * SIZE, A1, %xmm5) + MOVUPS_A1(-12 * SIZE, A1, %xmm6) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-10 * SIZE, Y1, %xmm3) + + decq I + jle .L64 + ALIGN_3 + +.L63: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) +#endif + + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-10 * SIZE, A1, %xmm7) + mulpd %xmm12, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_A1(-15 * SIZE, A2, %xmm4) + + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A1(-13 * SIZE, A2, %xmm5) + mulpd %xmm12, %xmm7 + addpd %xmm7, %xmm3 + MOVUPS_A1(-11 * SIZE, A2, %xmm6) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET + 8(A2) +#endif + + shufpd $1, %xmm4, %xmm8 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1( -9 * SIZE, A2, %xmm8) + shufpd $1, %xmm5, %xmm4 + mulpd %xmm13, %xmm4 + addpd %xmm4, %xmm1 + MOVUPS_A1( -8 * SIZE, A1, %xmm4) + + shufpd $1, %xmm6, %xmm5 + mulpd %xmm13, %xmm5 + addpd %xmm5, %xmm2 + MOVUPS_A1( -6 * SIZE, A1, %xmm5) + shufpd $1, %xmm8, %xmm6 + mulpd %xmm13, %xmm6 + addpd %xmm6, %xmm3 + MOVUPS_A1( -4 * SIZE, A1, %xmm6) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 4 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) + MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L63 + ALIGN_3 + +.L64: + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-10 * SIZE, A1, %xmm7) + mulpd %xmm12, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_A1(-15 * SIZE, A2, %xmm4) + + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A1(-13 * SIZE, A2, %xmm5) + mulpd %xmm12, %xmm7 + addpd %xmm7, %xmm3 + MOVUPS_A1(-11 * SIZE, A2, %xmm6) + + shufpd $1, %xmm4, %xmm8 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1( -9 * SIZE, A2, %xmm8) + shufpd $1, %xmm5, %xmm4 + mulpd %xmm13, %xmm4 + addpd %xmm4, %xmm1 + + shufpd $1, %xmm6, %xmm5 + mulpd %xmm13, %xmm5 + addpd %xmm5, %xmm2 + shufpd $1, %xmm8, %xmm6 + mulpd %xmm13, %xmm6 + addpd %xmm6, %xmm3 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + ALIGN_3 + +.L65: + testq $4, MM + je .L66 + + + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + MOVUPS_A1(-14 * SIZE, A1, %xmm5) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-14 * SIZE, Y1, %xmm1) + + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm12, %xmm5 + addpd %xmm5, %xmm1 + + MOVUPS_A1(-15 * SIZE, A2, %xmm6) + MOVUPS_A1(-13 * SIZE, A2, %xmm7) + + shufpd $1, %xmm6, %xmm8 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + movaps %xmm7, %xmm8 + shufpd $1, %xmm7, %xmm6 + mulpd %xmm13, %xmm6 + addpd %xmm6, %xmm1 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L66: + testq $2, MM + je .L67 + + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + MOVUPS_A1(-15 * SIZE, A2, %xmm5) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + shufpd $1, %xmm5, %xmm8 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + movaps %xmm5, %xmm8 + + 
MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L67: + testq $1, MM +#if GEMV_UNROLL == 2 + je .L68 +#else + je .L70 +#endif + + movsd -16 * SIZE(Y1), %xmm0 + + movsd -16 * SIZE(A1), %xmm4 + shufpd $1, %xmm8, %xmm8 + + mulsd %xmm12, %xmm4 + addsd %xmm4, %xmm0 + mulsd %xmm13, %xmm8 + addsd %xmm8, %xmm0 + + movsd %xmm0, -16 * SIZE(Y1) + ALIGN_3 + +#if GEMV_UNROLL == 2 +.L68: + cmpq $2, N + jge .L61 + ALIGN_3 + +#endif + +.L70: + cmpq $1, N + jl .L900 + +#endif + + leaq 16 * SIZE(BUFFER), Y1 + movq A, A1 + +#ifdef HAVE_SSE3 + movddup (X), %xmm12 + addq INCX, X + + movddup ALPHA, %xmm0 +#else + movsd (X), %xmm12 + unpcklpd %xmm12, %xmm12 + addq INCX, X + + movsd ALPHA, %xmm0 + unpcklpd %xmm0, %xmm0 +#endif + + mulpd %xmm0, %xmm12 + + testq $SIZE, A + je .L7X + + movsd -16 * SIZE(A1), %xmm4 + movsd -16 * SIZE(Y1), %xmm0 + + mulsd %xmm12, %xmm4 + addsd %xmm4, %xmm0 + + movsd %xmm0, -16 * SIZE(Y1) + + addq $SIZE, A1 + addq $SIZE, Y1 + ALIGN_3 + +.L7X: + + movq MM, I + sarq $3, I + jle .L75 + + MOVUPS_A1(-16 * SIZE, A1, %xmm0) + MOVUPS_A1(-14 * SIZE, A1, %xmm1) + MOVUPS_A1(-12 * SIZE, A1, %xmm2) + MOVUPS_A1(-10 * SIZE, A1, %xmm3) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm8) + MOVUPS_YL1(-14 * SIZE, Y1, %xmm9) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm10) + MOVUPS_YL1(-10 * SIZE, Y1, %xmm11) + + decq I + jle .L74 + ALIGN_3 + +.L73: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(A1) +#endif + + mulpd %xmm12, %xmm0 + addpd %xmm0, %xmm8 + MOVUPS_A1( -8 * SIZE, A1, %xmm0) + mulpd %xmm12, %xmm1 + addpd %xmm1, %xmm9 + MOVUPS_A1( -6 * SIZE, A1, %xmm1) + + mulpd %xmm12, %xmm2 + addpd %xmm2, %xmm10 + MOVUPS_A1( -4 * SIZE, A1, %xmm2) + mulpd %xmm12, %xmm3 + addpd %xmm3, %xmm11 + MOVUPS_A1( -2 * SIZE, A1, %xmm3) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 8 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm8) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm9) + MOVUPS_YS1(-12 * SIZE, Y1, %xmm10) + MOVUPS_YS1(-10 * SIZE, Y1, %xmm11) + + MOVUPS_YL1( -8 * SIZE, Y1, %xmm8) + MOVUPS_YL1( -6 * SIZE, Y1, %xmm9) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm10) + MOVUPS_YL1( -2 * SIZE, Y1, %xmm11) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L73 + ALIGN_3 + +.L74: + mulpd %xmm12, %xmm0 + addpd %xmm0, %xmm8 + MOVUPS_YS1(-16 * SIZE, Y1, %xmm8) + mulpd %xmm12, %xmm1 + addpd %xmm1, %xmm9 + MOVUPS_YS1(-14 * SIZE, Y1, %xmm9) + mulpd %xmm12, %xmm2 + addpd %xmm2, %xmm10 + MOVUPS_YS1(-12 * SIZE, Y1, %xmm10) + mulpd %xmm12, %xmm3 + addpd %xmm3, %xmm11 + MOVUPS_YS1(-10 * SIZE, Y1, %xmm11) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, Y1 + ALIGN_3 + +.L75: + testq $4, MM + je .L76 + + MOVUPS_A1(-16 * SIZE, A1, %xmm0) + MOVUPS_A1(-14 * SIZE, A1, %xmm1) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm8) + MOVUPS_YL1(-14 * SIZE, Y1, %xmm9) + + mulpd %xmm12, %xmm0 + addpd %xmm0, %xmm8 + MOVUPS_YS1(-16 * SIZE, Y1, %xmm8) + mulpd %xmm12, %xmm1 + addpd %xmm1, %xmm9 + MOVUPS_YS1(-14 * SIZE, Y1, %xmm9) + + addq $4 * SIZE, A1 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L76: + testq $2, MM + je .L77 + + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + + addq $2 * SIZE, A1 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L77: + testq $1, MM + je .L900 + + movsd -16 * SIZE(Y1), %xmm0 + movsd -16 * SIZE(A1), %xmm8 + + mulsd %xmm12, %xmm8 + addsd %xmm8, %xmm0 + + movsd %xmm0, -16 * SIZE(Y1) +#endif + ALIGN_3 + + +.L900: +#ifndef COPY_FORCE + cmpq Y, BUFFER + je .L999 +#endif + + cmpq 
$SIZE, INCY + jne .L950 + + testq $SIZE, Y + je .L910 + + movsd (Y), %xmm0 + addsd (BUFFER), %xmm0 + movsd %xmm0, (Y) + + addq $SIZE, Y + addq $SIZE, BUFFER + + decq M + jle .L999 + ALIGN_4 + +.L910: + testq $SIZE, BUFFER + jne .L920 + + movq M, %rax + sarq $3, %rax + jle .L914 + ALIGN_3 + +.L912: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 4 + PREOFFSET(Y) +#endif + + movapd 0 * SIZE(Y), %xmm0 + movapd 2 * SIZE(Y), %xmm1 + movapd 4 * SIZE(Y), %xmm2 + movapd 6 * SIZE(Y), %xmm3 + + movapd 0 * SIZE(BUFFER), %xmm4 + movapd 2 * SIZE(BUFFER), %xmm5 + movapd 4 * SIZE(BUFFER), %xmm6 + movapd 6 * SIZE(BUFFER), %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 + PREOFFSET(BUFFER) +#endif + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm1 + addpd %xmm6, %xmm2 + addpd %xmm7, %xmm3 + + movapd %xmm0, 0 * SIZE(Y) + movapd %xmm1, 2 * SIZE(Y) + movapd %xmm2, 4 * SIZE(Y) + movapd %xmm3, 6 * SIZE(Y) + + addq $8 * SIZE, Y + addq $8 * SIZE, BUFFER + + decq %rax + jg .L912 + ALIGN_3 + +.L914: + testq $7, M + jle .L999 + + testq $4, M + jle .L915 + + movapd 0 * SIZE(Y), %xmm0 + movapd 2 * SIZE(Y), %xmm1 + + movapd 0 * SIZE(BUFFER), %xmm4 + movapd 2 * SIZE(BUFFER), %xmm5 + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm1 + + movapd %xmm0, 0 * SIZE(Y) + movapd %xmm1, 2 * SIZE(Y) + + addq $4 * SIZE, Y + addq $4 * SIZE, BUFFER + ALIGN_3 + +.L915: + testq $2, M + jle .L916 + + movapd (Y), %xmm0 + + movapd (BUFFER), %xmm4 + + addpd %xmm4, %xmm0 + + movapd %xmm0, (Y) + + addq $2 * SIZE, Y + addq $2 * SIZE, BUFFER + ALIGN_3 + +.L916: + testq $1, M + jle .L999 + + movsd (Y), %xmm0 + + movsd 0 * SIZE(BUFFER), %xmm4 + + addsd %xmm4, %xmm0 + + movlpd %xmm0, (Y) + ALIGN_3 + + jmp .L999 + ALIGN_4 + +.L920: + movapd -1 * SIZE(BUFFER), %xmm4 + + movq M, %rax + sarq $3, %rax + jle .L924 + ALIGN_3 + +.L922: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 4 + PREOFFSET(Y) +#endif + + movapd 0 * SIZE(Y), %xmm0 + movapd 2 * SIZE(Y), %xmm1 + movapd 4 * SIZE(Y), %xmm2 + movapd 6 * SIZE(Y), %xmm3 + + movapd 1 * SIZE(BUFFER), %xmm5 + movapd 3 * SIZE(BUFFER), %xmm6 + movapd 5 * SIZE(BUFFER), %xmm7 + movapd 7 * SIZE(BUFFER), %xmm8 + + shufpd $1, %xmm5, %xmm4 + shufpd $1, %xmm6, %xmm5 + shufpd $1, %xmm7, %xmm6 + shufpd $1, %xmm8, %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 + PREOFFSET(BUFFER) +#endif + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm1 + addpd %xmm6, %xmm2 + addpd %xmm7, %xmm3 + + movapd %xmm0, 0 * SIZE(Y) + movapd %xmm1, 2 * SIZE(Y) + movapd %xmm2, 4 * SIZE(Y) + movapd %xmm3, 6 * SIZE(Y) + + movapd %xmm8, %xmm4 + + addq $8 * SIZE, Y + addq $8 * SIZE, BUFFER + + decq %rax + jg .L922 + ALIGN_3 + +.L924: + testq $7, M + jle .L999 + + testq $4, M + jle .L925 + + movapd 0 * SIZE(Y), %xmm0 + movapd 2 * SIZE(Y), %xmm1 + + movapd 1 * SIZE(BUFFER), %xmm5 + movapd 3 * SIZE(BUFFER), %xmm6 + + shufpd $1, %xmm5, %xmm4 + shufpd $1, %xmm6, %xmm5 + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm1 + + movapd %xmm0, 0 * SIZE(Y) + movapd %xmm1, 2 * SIZE(Y) + + movapd %xmm6, %xmm4 + + addq $4 * SIZE, Y + addq $4 * SIZE, BUFFER + ALIGN_3 + +.L925: + testq $2, M + jle .L926 + + movapd (Y), %xmm0 + + movapd 1 * SIZE(BUFFER), %xmm5 + + shufpd $1, %xmm5, %xmm4 + + addpd %xmm4, %xmm0 + + movapd %xmm0, (Y) + + movaps %xmm5, %xmm4 + + addq $2 * SIZE, Y + addq $2 * SIZE, BUFFER + ALIGN_3 + +.L926: + testq $1, M + jle .L999 + + movsd (Y), %xmm0 + + shufpd $1, %xmm4, %xmm4 + + addsd %xmm4, %xmm0 + + movlpd %xmm0, (Y) + ALIGN_3 + + jmp .L999 + ALIGN_4 + +.L950: + testq $SIZE, BUFFER + je .L960 + + movsd (Y), %xmm0 + addsd (BUFFER), %xmm0 + movsd %xmm0, (Y) + + 
addq INCY, Y + addq $SIZE, BUFFER + + decq M + jle .L999 + ALIGN_4 + +.L960: + movq Y, Y1 + + movq M, %rax + sarq $3, %rax + jle .L964 + ALIGN_3 + +.L962: + movsd (Y), %xmm0 + addq INCY, Y + movhpd (Y), %xmm0 + addq INCY, Y + + movapd 0 * SIZE(BUFFER), %xmm4 + + movsd (Y), %xmm1 + addq INCY, Y + movhpd (Y), %xmm1 + addq INCY, Y + + movapd 2 * SIZE(BUFFER), %xmm5 + + movsd (Y), %xmm2 + addq INCY, Y + movhpd (Y), %xmm2 + addq INCY, Y + + movapd 4 * SIZE(BUFFER), %xmm6 + + addpd %xmm4, %xmm0 + + movsd (Y), %xmm3 + addq INCY, Y + movhpd (Y), %xmm3 + addq INCY, Y + + movapd 6 * SIZE(BUFFER), %xmm7 + + addpd %xmm5, %xmm1 + + movlpd %xmm0, (Y1) + addq INCY, Y1 + movhpd %xmm0, (Y1) + addq INCY, Y1 + + addpd %xmm6, %xmm2 + + movlpd %xmm1, (Y1) + addq INCY, Y1 + movhpd %xmm1, (Y1) + addq INCY, Y1 + + addpd %xmm7, %xmm3 + + movlpd %xmm2, (Y1) + addq INCY, Y1 + movhpd %xmm2, (Y1) + addq INCY, Y1 + movlpd %xmm3, (Y1) + addq INCY, Y1 + movhpd %xmm3, (Y1) + addq INCY, Y1 + + addq $8 * SIZE, BUFFER + decq %rax + jg .L962 + ALIGN_3 + +.L964: + testq $7, M + jle .L999 + + testq $4, M + jle .L965 + + movsd (Y), %xmm0 + addq INCY, Y + movhpd (Y), %xmm0 + addq INCY, Y + + movapd 0 * SIZE(BUFFER), %xmm4 + + movsd (Y), %xmm1 + addq INCY, Y + movhpd (Y), %xmm1 + addq INCY, Y + + movapd 2 * SIZE(BUFFER), %xmm5 + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm1 + + movlpd %xmm0, (Y1) + addq INCY, Y1 + movhpd %xmm0, (Y1) + addq INCY, Y1 + movlpd %xmm1, (Y1) + addq INCY, Y1 + movhpd %xmm1, (Y1) + addq INCY, Y1 + + addq $4 * SIZE, BUFFER + ALIGN_3 + +.L965: + testq $2, M + jle .L966 + + movsd (Y), %xmm0 + addq INCY, Y + movhpd (Y), %xmm0 + addq INCY, Y + + movapd 0 * SIZE(BUFFER), %xmm4 + + addpd %xmm4, %xmm0 + + movlpd %xmm0, (Y1) + addq INCY, Y1 + movhpd %xmm0, (Y1) + addq INCY, Y1 + + addq $2 * SIZE, BUFFER + ALIGN_3 + +.L966: + testq $1, M + jle .L999 + + movsd (Y), %xmm0 + + movsd 0 * SIZE(BUFFER), %xmm4 + + addsd %xmm4, %xmm0 + + movlpd %xmm0, (Y1) + ALIGN_3 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + + ret + EPILOGUE diff --git a/kernel/x86_64/dgemv_n_atom.S b/kernel/x86_64/dgemv_n_atom.S new file mode 100644 index 0000000000..27a763a6b5 --- /dev/null +++ b/kernel/x86_64/dgemv_n_atom.S @@ -0,0 +1,788 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "l2param.h" + +#define PREFETCH prefetchnta +#define PREFETCHW prefetcht0 +#define PREFETCH_SIZE (8 * 6) + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_INCX 8 + STACKSIZE(%rsp) +#define OLD_Y 16 + STACKSIZE(%rsp) +#define OLD_INCY 24 + STACKSIZE(%rsp) +#define OLD_BUFFER 32 + STACKSIZE(%rsp) +#define STACK_ALPHA 48 (%rsp) + +#define M %rdi +#define N %rsi +#define A %rcx +#define LDA %r8 +#define X %r9 +#define INCX %rdx +#define Y %rbp +#define INCY %r10 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_LDA 48 + STACKSIZE(%rsp) +#define OLD_X 56 + STACKSIZE(%rsp) +#define OLD_INCX 64 + STACKSIZE(%rsp) +#define OLD_Y 72 + STACKSIZE(%rsp) +#define OLD_INCY 80 + STACKSIZE(%rsp) +#define OLD_BUFFER 88 + STACKSIZE(%rsp) + +#define STACK_ALPHA 224 (%rsp) + +#define M %rcx +#define N %rdx +#define A %r8 +#define LDA %r9 +#define X %rdi +#define INCX %rsi +#define Y %rbp +#define INCY %r10 + +#endif + +#define I %rax +#define J %r11 +#define A1 %r12 +#define A2 %r13 +#define Y1 %r14 +#define BUFFER %r15 +#define MM %rbx + +#define ALPHA %xmm15 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq OLD_A, A + movq OLD_LDA, LDA + movq OLD_X, X +#endif + movq OLD_INCX, INCX + movq OLD_Y, Y + movq OLD_INCY, INCY + movq OLD_BUFFER, BUFFER + +#ifndef WINDOWS_ABI + movsd %xmm0, STACK_ALPHA +#else + movsd %xmm3, STACK_ALPHA +#endif + + leaq (,INCX, SIZE), INCX + leaq (,INCY, SIZE), INCY + leaq (,LDA, SIZE), LDA + + testq N, N + jle .L999 + testq M, M + jle .L999 + + cmpq $SIZE, INCY + cmoveq Y, BUFFER + je .L10 + + movq BUFFER, Y1 + xorps %xmm4, %xmm4 + + movq M, %rax + addq $7, %rax + sarq $3, %rax + ALIGN_3 + +.L01: + movsd %xmm4, 0 * SIZE(Y1) + movsd %xmm4, 1 * SIZE(Y1) + movsd %xmm4, 2 * SIZE(Y1) + movsd %xmm4, 3 * SIZE(Y1) + movsd %xmm4, 4 * SIZE(Y1) + movsd 
%xmm4, 5 * SIZE(Y1) + movsd %xmm4, 6 * SIZE(Y1) + movsd %xmm4, 7 * SIZE(Y1) + + addq $8 * SIZE, Y1 + decq %rax + jg .L01 + ALIGN_3 + +.L10: + movq N, J + sarq $1, J + jle .L20 + ALIGN_3 + +.L11: + movq BUFFER, Y1 + + movq A, A1 + leaq (A, LDA, 1), A2 + leaq (A, LDA, 2), A + + movsd STACK_ALPHA, %xmm0 + + movsd (X), %xmm14 + addq INCX, X + movsd (X), %xmm15 + addq INCX, X + + mulsd %xmm0, %xmm14 + mulsd %xmm0, %xmm15 + + movq M, I + sarq $3, I + jle .L15 + + movsd 0 * SIZE(A1), %xmm0 + movsd 1 * SIZE(A1), %xmm1 + movsd 2 * SIZE(A1), %xmm2 + movsd 3 * SIZE(A1), %xmm3 + + movsd 0 * SIZE(A2), %xmm4 + movsd 1 * SIZE(A2), %xmm5 + movsd 2 * SIZE(A2), %xmm6 + movsd 3 * SIZE(A2), %xmm7 + + movsd 0 * SIZE(Y1), %xmm8 + mulsd %xmm14, %xmm0 + movsd 1 * SIZE(Y1), %xmm9 + mulsd %xmm14, %xmm1 + movsd 2 * SIZE(Y1), %xmm10 + mulsd %xmm14, %xmm2 + movsd 3 * SIZE(Y1), %xmm11 + mulsd %xmm14, %xmm3 + + decq I + jle .L14 + ALIGN_3 + +.L13: + PREFETCH PREFETCH_SIZE * SIZE(A1) + mulsd %xmm15, %xmm4 + PREFETCH PREFETCH_SIZE * SIZE(A2) + addsd %xmm0, %xmm8 + movsd 4 * SIZE(A1), %xmm0 + + mulsd %xmm15, %xmm5 + addsd %xmm1, %xmm9 + movsd 5 * SIZE(A1), %xmm1 + + mulsd %xmm15, %xmm6 + addsd %xmm2, %xmm10 + movsd 6 * SIZE(A1), %xmm2 + + mulsd %xmm15, %xmm7 + addsd %xmm3, %xmm11 + movsd 7 * SIZE(A1), %xmm3 + + addsd %xmm4, %xmm8 + mulsd %xmm14, %xmm0 + movsd 4 * SIZE(A2), %xmm4 + + addsd %xmm5, %xmm9 + mulsd %xmm14, %xmm1 + movsd 5 * SIZE(A2), %xmm5 + + addsd %xmm6, %xmm10 + mulsd %xmm14, %xmm2 + movsd 6 * SIZE(A2), %xmm6 + + addsd %xmm7, %xmm11 + mulsd %xmm14, %xmm3 + movsd 7 * SIZE(A2), %xmm7 + + movsd %xmm8, 0 * SIZE(Y1) + movsd 4 * SIZE(Y1), %xmm8 + movsd %xmm9, 1 * SIZE(Y1) + movsd 5 * SIZE(Y1), %xmm9 + + movsd %xmm10, 2 * SIZE(Y1) + movsd 6 * SIZE(Y1), %xmm10 + movsd %xmm11, 3 * SIZE(Y1) + movsd 7 * SIZE(Y1), %xmm11 + + mulsd %xmm15, %xmm4 + addsd %xmm0, %xmm8 + movsd 8 * SIZE(A1), %xmm0 + + mulsd %xmm15, %xmm5 + addsd %xmm1, %xmm9 + movsd 9 * SIZE(A1), %xmm1 + + mulsd %xmm15, %xmm6 + addsd %xmm2, %xmm10 + movsd 10 * SIZE(A1), %xmm2 + + mulsd %xmm15, %xmm7 + addq $8 * SIZE, A2 + addsd %xmm3, %xmm11 + movsd 11 * SIZE(A1), %xmm3 + + mulsd %xmm14, %xmm0 + addsd %xmm4, %xmm8 + movsd 0 * SIZE(A2), %xmm4 + + mulsd %xmm14, %xmm1 + addq $8 * SIZE, Y1 + addsd %xmm5, %xmm9 + movsd 1 * SIZE(A2), %xmm5 + + mulsd %xmm14, %xmm2 + addq $8 * SIZE, A1 + addsd %xmm6, %xmm10 + movsd 2 * SIZE(A2), %xmm6 + + mulsd %xmm14, %xmm3 + decq I + addsd %xmm7, %xmm11 + movsd 3 * SIZE(A2), %xmm7 + + movsd %xmm8, -4 * SIZE(Y1) + movsd 0 * SIZE(Y1), %xmm8 + movsd %xmm9, -3 * SIZE(Y1) + movsd 1 * SIZE(Y1), %xmm9 + + movsd %xmm10,-2 * SIZE(Y1) + movsd 2 * SIZE(Y1), %xmm10 + movsd %xmm11,-1 * SIZE(Y1) + movsd 3 * SIZE(Y1), %xmm11 + jg .L13 + ALIGN_3 + +.L14: + mulsd %xmm15, %xmm4 + addsd %xmm0, %xmm8 + movsd 4 * SIZE(A1), %xmm0 + + mulsd %xmm15, %xmm5 + addsd %xmm1, %xmm9 + movsd 5 * SIZE(A1), %xmm1 + + mulsd %xmm15, %xmm6 + addsd %xmm2, %xmm10 + movsd 6 * SIZE(A1), %xmm2 + + mulsd %xmm15, %xmm7 + addsd %xmm3, %xmm11 + movsd 7 * SIZE(A1), %xmm3 + + addsd %xmm4, %xmm8 + mulsd %xmm14, %xmm0 + movsd 4 * SIZE(A2), %xmm4 + + addsd %xmm5, %xmm9 + mulsd %xmm14, %xmm1 + movsd 5 * SIZE(A2), %xmm5 + + addsd %xmm6, %xmm10 + mulsd %xmm14, %xmm2 + movsd 6 * SIZE(A2), %xmm6 + + addsd %xmm7, %xmm11 + mulsd %xmm14, %xmm3 + movsd 7 * SIZE(A2), %xmm7 + + movsd %xmm8, 0 * SIZE(Y1) + movsd 4 * SIZE(Y1), %xmm8 + movsd %xmm9, 1 * SIZE(Y1) + movsd 5 * SIZE(Y1), %xmm9 + + movsd %xmm10, 2 * SIZE(Y1) + movsd 6 * SIZE(Y1), %xmm10 + movsd %xmm11, 3 * SIZE(Y1) + movsd 7 * 
SIZE(Y1), %xmm11 + + mulsd %xmm15, %xmm4 + addsd %xmm0, %xmm8 + + mulsd %xmm15, %xmm5 + addsd %xmm1, %xmm9 + + mulsd %xmm15, %xmm6 + addsd %xmm2, %xmm10 + + mulsd %xmm15, %xmm7 + addq $8 * SIZE, A2 + addsd %xmm3, %xmm11 + + mulsd %xmm14, %xmm0 + addsd %xmm4, %xmm8 + + mulsd %xmm14, %xmm1 + addq $8 * SIZE, Y1 + addsd %xmm5, %xmm9 + + mulsd %xmm14, %xmm2 + addq $8 * SIZE, A1 + addsd %xmm6, %xmm10 + + mulsd %xmm14, %xmm3 + addsd %xmm7, %xmm11 + + movsd %xmm8, -4 * SIZE(Y1) + movsd %xmm9, -3 * SIZE(Y1) + movsd %xmm10,-2 * SIZE(Y1) + movsd %xmm11,-1 * SIZE(Y1) + ALIGN_3 + +.L15: + testq $4, M + je .L17 + + movsd 0 * SIZE(A1), %xmm0 + movsd 1 * SIZE(A1), %xmm1 + movsd 2 * SIZE(A1), %xmm2 + movsd 3 * SIZE(A1), %xmm3 + + movsd 0 * SIZE(A2), %xmm4 + movsd 1 * SIZE(A2), %xmm5 + movsd 2 * SIZE(A2), %xmm6 + movsd 3 * SIZE(A2), %xmm7 + + movsd 0 * SIZE(Y1), %xmm8 + mulsd %xmm14, %xmm0 + movsd 1 * SIZE(Y1), %xmm9 + mulsd %xmm14, %xmm1 + movsd 2 * SIZE(Y1), %xmm10 + mulsd %xmm14, %xmm2 + movsd 3 * SIZE(Y1), %xmm11 + mulsd %xmm14, %xmm3 + + mulsd %xmm15, %xmm4 + addsd %xmm0, %xmm8 + mulsd %xmm15, %xmm5 + addsd %xmm1, %xmm9 + mulsd %xmm15, %xmm6 + addsd %xmm2, %xmm10 + mulsd %xmm15, %xmm7 + addsd %xmm3, %xmm11 + + addsd %xmm4, %xmm8 + addsd %xmm5, %xmm9 + addsd %xmm6, %xmm10 + addsd %xmm7, %xmm11 + + movsd %xmm8, 0 * SIZE(Y1) + movsd %xmm9, 1 * SIZE(Y1) + movsd %xmm10, 2 * SIZE(Y1) + movsd %xmm11, 3 * SIZE(Y1) + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L17: + testq $2, M + je .L18 + + movsd 0 * SIZE(A1), %xmm0 + movsd 1 * SIZE(A1), %xmm1 + + movsd 0 * SIZE(A2), %xmm4 + movsd 1 * SIZE(A2), %xmm5 + + mulsd %xmm14, %xmm0 + movsd 0 * SIZE(Y1), %xmm8 + mulsd %xmm14, %xmm1 + movsd 1 * SIZE(Y1), %xmm9 + mulsd %xmm15, %xmm4 + mulsd %xmm15, %xmm5 + + addsd %xmm0, %xmm8 + addsd %xmm1, %xmm9 + addsd %xmm4, %xmm8 + addsd %xmm5, %xmm9 + + movsd %xmm8, 0 * SIZE(Y1) + movsd %xmm9, 1 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L18: + testq $1, M + je .L19 + + movsd 0 * SIZE(Y1), %xmm8 + + movsd 0 * SIZE(A1), %xmm0 + movsd 0 * SIZE(A2), %xmm4 + + mulsd %xmm14, %xmm0 + mulsd %xmm15, %xmm4 + + addsd %xmm0, %xmm8 + addsd %xmm4, %xmm8 + + movsd %xmm8, 0 * SIZE(Y1) + ALIGN_3 + +.L19: + decq J + jg .L11 + ALIGN_3 + +.L20: + testq $1, N + je .L990 + + movq BUFFER, Y1 + movq A, A1 + + movsd (X), %xmm14 + mulsd STACK_ALPHA, %xmm14 + + movq M, I + sarq $3, I + jle .L25 + + movsd 0 * SIZE(A1), %xmm0 + movsd 1 * SIZE(A1), %xmm1 + movsd 2 * SIZE(A1), %xmm2 + movsd 3 * SIZE(A1), %xmm3 + + movsd 4 * SIZE(A1), %xmm4 + movsd 5 * SIZE(A1), %xmm5 + movsd 6 * SIZE(A1), %xmm6 + movsd 7 * SIZE(A1), %xmm7 + + movsd 0 * SIZE(Y1), %xmm8 + mulsd %xmm14, %xmm0 + movsd 1 * SIZE(Y1), %xmm9 + mulsd %xmm14, %xmm1 + movsd 2 * SIZE(Y1), %xmm10 + mulsd %xmm14, %xmm2 + movsd 3 * SIZE(Y1), %xmm11 + mulsd %xmm14, %xmm3 + + decq I + jle .L24 + ALIGN_3 + +.L23: + PREFETCH PREFETCH_SIZE * SIZE(A1) + + addsd %xmm0, %xmm8 + movsd 8 * SIZE(A1), %xmm0 + addsd %xmm1, %xmm9 + movsd 9 * SIZE(A1), %xmm1 + addsd %xmm2, %xmm10 + movsd 10 * SIZE(A1), %xmm2 + addsd %xmm3, %xmm11 + movsd 11 * SIZE(A1), %xmm3 + + movsd %xmm8, 0 * SIZE(Y1) + movsd 4 * SIZE(Y1), %xmm8 + mulsd %xmm14, %xmm4 + movsd %xmm9, 1 * SIZE(Y1) + movsd 5 * SIZE(Y1), %xmm9 + mulsd %xmm14, %xmm5 + + movsd %xmm10, 2 * SIZE(Y1) + movsd 6 * SIZE(Y1), %xmm10 + mulsd %xmm14, %xmm6 + + movsd %xmm11, 3 * SIZE(Y1) + movsd 7 * SIZE(Y1), %xmm11 + mulsd %xmm14, %xmm7 + + addsd %xmm4, %xmm8 + movsd 12 * SIZE(A1), %xmm4 + addsd %xmm5, 
%xmm9 + movsd 13 * SIZE(A1), %xmm5 + addsd %xmm6, %xmm10 + movsd 14 * SIZE(A1), %xmm6 + addsd %xmm7, %xmm11 + movsd 15 * SIZE(A1), %xmm7 + + movsd %xmm8, 4 * SIZE(Y1) + movsd 8 * SIZE(Y1), %xmm8 + mulsd %xmm14, %xmm0 + movsd %xmm9, 5 * SIZE(Y1) + movsd 9 * SIZE(Y1), %xmm9 + mulsd %xmm14, %xmm1 + movsd %xmm10, 6 * SIZE(Y1) + movsd 10 * SIZE(Y1), %xmm10 + mulsd %xmm14, %xmm2 + movsd %xmm11, 7 * SIZE(Y1) + movsd 11 * SIZE(Y1), %xmm11 + mulsd %xmm14, %xmm3 + + addq $8 * SIZE, Y1 + addq $8 * SIZE, A1 + + decq I + jg .L23 + ALIGN_3 + +.L24: + addsd %xmm0, %xmm8 + addsd %xmm1, %xmm9 + addsd %xmm2, %xmm10 + addsd %xmm3, %xmm11 + + mulsd %xmm14, %xmm4 + movsd %xmm8, 0 * SIZE(Y1) + movsd 4 * SIZE(Y1), %xmm8 + + mulsd %xmm14, %xmm5 + movsd %xmm9, 1 * SIZE(Y1) + movsd 5 * SIZE(Y1), %xmm9 + + mulsd %xmm14, %xmm6 + movsd %xmm10, 2 * SIZE(Y1) + movsd 6 * SIZE(Y1), %xmm10 + + mulsd %xmm14, %xmm7 + movsd %xmm11, 3 * SIZE(Y1) + movsd 7 * SIZE(Y1), %xmm11 + + addsd %xmm4, %xmm8 + addsd %xmm5, %xmm9 + addsd %xmm6, %xmm10 + addsd %xmm7, %xmm11 + + movsd %xmm8, 4 * SIZE(Y1) + movsd %xmm9, 5 * SIZE(Y1) + movsd %xmm10, 6 * SIZE(Y1) + movsd %xmm11, 7 * SIZE(Y1) + + addq $8 * SIZE, Y1 + addq $8 * SIZE, A1 + ALIGN_3 + +.L25: + testq $4, M + je .L27 + + movsd 0 * SIZE(A1), %xmm0 + movsd 1 * SIZE(A1), %xmm1 + movsd 2 * SIZE(A1), %xmm2 + movsd 3 * SIZE(A1), %xmm3 + + movsd 0 * SIZE(Y1), %xmm8 + mulsd %xmm14, %xmm0 + movsd 1 * SIZE(Y1), %xmm9 + mulsd %xmm14, %xmm1 + movsd 2 * SIZE(Y1), %xmm10 + mulsd %xmm14, %xmm2 + movsd 3 * SIZE(Y1), %xmm11 + mulsd %xmm14, %xmm3 + + addsd %xmm0, %xmm8 + addsd %xmm1, %xmm9 + addsd %xmm2, %xmm10 + addsd %xmm3, %xmm11 + + movsd %xmm8, 0 * SIZE(Y1) + movsd %xmm9, 1 * SIZE(Y1) + movsd %xmm10, 2 * SIZE(Y1) + movsd %xmm11, 3 * SIZE(Y1) + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L27: + testq $2, M + je .L28 + + movsd 0 * SIZE(A1), %xmm0 + movsd 1 * SIZE(A1), %xmm1 + + mulsd %xmm14, %xmm0 + movsd 0 * SIZE(Y1), %xmm8 + mulsd %xmm14, %xmm1 + movsd 1 * SIZE(Y1), %xmm9 + + addsd %xmm0, %xmm8 + addsd %xmm1, %xmm9 + + movsd %xmm8, 0 * SIZE(Y1) + movsd %xmm9, 1 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L28: + testq $1, M + je .L990 + + movsd 0 * SIZE(Y1), %xmm8 + movsd 0 * SIZE(A1), %xmm0 + + mulsd %xmm14, %xmm0 + addsd %xmm0, %xmm8 + + movsd %xmm8, 0 * SIZE(Y1) + ALIGN_3 + +.L990: + cmpq $SIZE, INCY + je .L999 + + movq Y, Y1 + + movq M, %rax + sarq $2, %rax + jle .L994 + ALIGN_3 + +.L992: + movsd (Y), %xmm0 + addq INCY, Y + movsd (Y), %xmm1 + addq INCY, Y + movsd (Y), %xmm2 + addq INCY, Y + movsd (Y), %xmm3 + addq INCY, Y + + addsd 0 * SIZE(BUFFER), %xmm0 + addsd 1 * SIZE(BUFFER), %xmm1 + addsd 2 * SIZE(BUFFER), %xmm2 + addsd 3 * SIZE(BUFFER), %xmm3 + addq $4 * SIZE, BUFFER + + movsd %xmm0, (Y1) + addq INCY, Y1 + movsd %xmm1, (Y1) + addq INCY, Y1 + movsd %xmm2, (Y1) + addq INCY, Y1 + movsd %xmm3, (Y1) + addq INCY, Y1 + + decq %rax + jg .L992 + ALIGN_3 + +.L994: + testq $2, M + jle .L996 + + movsd (Y), %xmm0 + addq INCY, Y + movsd (Y), %xmm1 + addq INCY, Y + + addsd 0 * SIZE(BUFFER), %xmm0 + addsd 1 * SIZE(BUFFER), %xmm1 + addq $2 * SIZE, BUFFER + + movsd %xmm0, (Y1) + addq INCY, Y1 + movsd %xmm1, (Y1) + addq INCY, Y1 + ALIGN_3 + +.L996: + testq $1, M + jle .L999 + + movsd (Y), %xmm0 + + addsd (BUFFER), %xmm0 + + movsd %xmm0, (Y1) + ALIGN_3 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), 
%rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + EPILOGUE diff --git a/kernel/x86_64/dgemv_t.S b/kernel/x86_64/dgemv_t.S new file mode 100644 index 0000000000..071920723d --- /dev/null +++ b/kernel/x86_64/dgemv_t.S @@ -0,0 +1,2490 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "l2param.h" + +#if GEMV_UNROLL < 2 +#undef GEMV_UNROLL +#define GEMV_UNROLL 2 +#endif + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_A %rcx +#define OLD_LDA %r8 +#define STACK_INCX 8 + STACKSIZE(%rsp) +#define STACK_Y 16 + STACKSIZE(%rsp) +#define STACK_INCY 24 + STACKSIZE(%rsp) +#define STACK_BUFFER 32 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_M %rcx +#define OLD_N %rdx +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_LDA 48 + STACKSIZE(%rsp) +#define OLD_X 56 + STACKSIZE(%rsp) +#define STACK_INCX 64 + STACKSIZE(%rsp) +#define STACK_Y 72 + STACKSIZE(%rsp) +#define STACK_INCY 80 + STACKSIZE(%rsp) +#define STACK_BUFFER 88 + STACKSIZE(%rsp) + +#endif + +#define LDA %r8 +#define X %r9 + +#define INCX %rsi +#define INCY %rdi + +#define M %r10 +#define N %r11 +#define A %r12 +#define Y %r14 +#define BUFFER %r13 + +#define I %rax +#define A1 %rbx +#define A2 %rcx +#define LDA3 %rdx +#define Y1 %rbp +#define X1 %r15 + +#ifdef ALIGNED_ACCESS +#define MM INCX +#else +#define MM M +#endif + +#define ALPHA %xmm15 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq OLD_M, M + movq OLD_N, N + movq OLD_A, A + movq OLD_LDA, LDA + movq OLD_X, X +#else + movq OLD_M, M + movq OLD_N, N + movq OLD_A, A + movq OLD_LDA, LDA +#endif + + movq STACK_INCX, INCX + movq STACK_Y, Y + movq STACK_INCY, INCY + movq STACK_BUFFER, BUFFER + + leaq -1(INCX), %rax + + leaq (,LDA, SIZE), LDA + leaq (,INCX, SIZE), INCX + leaq (,INCY, SIZE), INCY + + leaq (LDA, LDA, 2), LDA3 + + subq $-16 * SIZE, A + +#ifdef HAVE_SSE3 +#ifndef WINDOWS_ABI + movddup %xmm0, ALPHA +#else + movddup %xmm3, ALPHA +#endif +#else +#ifndef WINDOWS_ABI + movapd %xmm0, ALPHA +#else + movapd %xmm3, ALPHA +#endif + unpcklpd ALPHA, ALPHA +#endif + + testq M, M + jle .L999 + testq N, N + jle .L999 + + movq BUFFER, X1 + +#ifdef ALIGNED_ACCESS + testq $SIZE, A + je .L01 + + movsd (X), %xmm0 + addq INCX, X + + movsd %xmm0, 1 * SIZE(BUFFER) + addq $1 * SIZE, BUFFER + addq $2 * SIZE, X1 + decq M + jle .L10 + ALIGN_4 + +.L01: +#endif + + movq M, I + sarq $3, I + jle .L05 + ALIGN_4 + +.L02: + movsd (X), %xmm0 + addq INCX, X + movhpd (X), %xmm0 + addq INCX, X + + movsd (X), %xmm1 + addq INCX, X + movhpd (X), %xmm1 + addq INCX, X + + movsd (X), %xmm2 + addq INCX, X + movhpd (X), %xmm2 + addq INCX, X + + movsd (X), %xmm3 + addq INCX, X + movhpd (X), %xmm3 + addq INCX, X + + movapd %xmm0, 0 * SIZE(X1) + movapd %xmm1, 2 * SIZE(X1) + movapd %xmm2, 4 * SIZE(X1) + movapd %xmm3, 6 * SIZE(X1) + + addq $8 * SIZE, X1 + decq I + jg .L02 + ALIGN_4 + +.L05: + movq M, I + andq $7, I + jle .L10 + ALIGN_2 + +.L06: + movsd (X), %xmm0 + addq INCX, X + movsd %xmm0, 0 * SIZE(X1) + addq $SIZE, X1 + decq I + jg .L06 + ALIGN_4 + +.L10: + movq Y, Y1 + +#ifdef ALIGNED_ACCESS + testq $SIZE, LDA + jne .L50 +#endif + +#if GEMV_UNROLL >= 8 + cmpq $8, N + jl .L20 + ALIGN_3 + +.L11: + subq $8, N + + leaq 16 * SIZE(BUFFER), X1 + + movq A, 
A1 + leaq (A1, LDA, 4), A2 + leaq (A1, LDA, 8), A + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + +#ifdef PREFETCHW + PREFETCHW 7 * SIZE(Y1) +#endif + +#ifdef ALIGNED_ACCESS + testq $SIZE, A + je .L1X + + movsd -16 * SIZE(X1), %xmm12 + + movsd -16 * SIZE(A1), %xmm8 + mulsd %xmm12, %xmm8 + addsd %xmm8, %xmm0 + movsd -16 * SIZE(A1, LDA), %xmm9 + mulsd %xmm12, %xmm9 + addsd %xmm9, %xmm1 + movsd -16 * SIZE(A1, LDA, 2), %xmm10 + mulsd %xmm12, %xmm10 + addsd %xmm10, %xmm2 + movsd -16 * SIZE(A1, LDA3), %xmm11 + mulsd %xmm12, %xmm11 + addsd %xmm11, %xmm3 + movsd -16 * SIZE(A2), %xmm8 + mulsd %xmm12, %xmm8 + addsd %xmm8, %xmm4 + movsd -16 * SIZE(A2, LDA), %xmm9 + mulsd %xmm12, %xmm9 + addsd %xmm9, %xmm5 + movsd -16 * SIZE(A2, LDA, 2), %xmm10 + mulsd %xmm12, %xmm10 + addsd %xmm10, %xmm6 + movsd -16 * SIZE(A2, LDA3), %xmm11 + mulsd %xmm12, %xmm11 + addsd %xmm11, %xmm7 + + addq $SIZE, A1 + addq $SIZE, A2 + addq $SIZE, X1 + ALIGN_3 + +.L1X: +#endif + + movq M, I + sarq $3, I + jle .L15 + + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm9) + MOVUPS_A2(-16 * SIZE, A1, LDA, 2, %xmm10) + MOVUPS_A2(-16 * SIZE, A1, LDA3, 1, %xmm11) + + MOVUPS_XL1(-16 * SIZE, X1, %xmm12) + MOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + decq I + jle .L13 + ALIGN_4 + +.L12: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1) +#endif + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-16 * SIZE, A2, %xmm8) + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm9) + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A2(-16 * SIZE, A2, LDA, 2, %xmm10) + mulpd %xmm12, %xmm11 + addpd %xmm11, %xmm3 + MOVUPS_A2(-16 * SIZE, A2, LDA3, 1, %xmm11) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA) +#endif + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm4 + MOVUPS_A1(-14 * SIZE, A1, %xmm8) + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm5 + MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm9) + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm6 + MOVUPS_A2(-14 * SIZE, A1, LDA, 2, %xmm10) + mulpd %xmm12, %xmm11 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + addpd %xmm11, %xmm7 + MOVUPS_A2(-14 * SIZE, A1, LDA3, 1, %xmm11) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA, 2) +#endif + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-14 * SIZE, A2, %xmm8) + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm1 + MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm9) + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A2(-14 * SIZE, A2, LDA, 2, %xmm10) + mulpd %xmm13, %xmm11 + addpd %xmm11, %xmm3 + MOVUPS_A2(-14 * SIZE, A2, LDA3, 1, %xmm11) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA3) +#endif + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm4 + MOVUPS_A1(-12 * SIZE, A1, %xmm8) + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm5 + MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm9) + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm6 + MOVUPS_A2(-12 * SIZE, A1, LDA, 2, %xmm10) + mulpd %xmm13, %xmm11 + MOVUPS_XL1(-10 * SIZE, X1, %xmm13) + addpd %xmm11, %xmm7 + MOVUPS_A2(-12 * SIZE, A1, LDA3, 1, %xmm11) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2) +#endif + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-12 * SIZE, A2, %xmm8) + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm9) + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A2(-12 * SIZE, A2, LDA, 2, %xmm10) + mulpd %xmm12, %xmm11 + addpd %xmm11, %xmm3 + 
MOVUPS_A2(-12 * SIZE, A2, LDA3, 1, %xmm11) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA) +#endif + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm4 + MOVUPS_A1(-10 * SIZE, A1, %xmm8) + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm5 + MOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm9) + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm6 + MOVUPS_A2(-10 * SIZE, A1, LDA, 2, %xmm10) + mulpd %xmm12, %xmm11 + MOVUPS_XL1(-8 * SIZE, X1, %xmm12) + addpd %xmm11, %xmm7 + MOVUPS_A2(-10 * SIZE, A1, LDA3, 1, %xmm11) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA, 2) +#endif + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-10 * SIZE, A2, %xmm8) + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm1 + MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm9) + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A2(-10 * SIZE, A2, LDA, 2, %xmm10) + mulpd %xmm13, %xmm11 + addpd %xmm11, %xmm3 + MOVUPS_A2(-10 * SIZE, A2, LDA3, 1, %xmm11) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA3) +#endif + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm4 + MOVUPS_A1(-8 * SIZE, A1, %xmm8) + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm5 + MOVUPS_A2(-8 * SIZE, A1, LDA, 1, %xmm9) + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(X1) +#endif + + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm6 + MOVUPS_A2(-8 * SIZE, A1, LDA, 2, %xmm10) + mulpd %xmm13, %xmm11 + MOVUPS_XL1(-6 * SIZE, X1, %xmm13) + addpd %xmm11, %xmm7 + MOVUPS_A2(-8 * SIZE, A1, LDA3, 1, %xmm11) + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + + decq I + jg .L12 + ALIGN_4 + +.L13: + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-16 * SIZE, A2, %xmm8) + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm9) + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A2(-16 * SIZE, A2, LDA, 2, %xmm10) + mulpd %xmm12, %xmm11 + addpd %xmm11, %xmm3 + MOVUPS_A2(-16 * SIZE, A2, LDA3, 1, %xmm11) + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm4 + MOVUPS_A1(-14 * SIZE, A1, %xmm8) + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm5 + MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm9) + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm6 + MOVUPS_A2(-14 * SIZE, A1, LDA, 2, %xmm10) + mulpd %xmm12, %xmm11 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + addpd %xmm11, %xmm7 + MOVUPS_A2(-14 * SIZE, A1, LDA3, 1, %xmm11) + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-14 * SIZE, A2, %xmm8) + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm1 + MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm9) + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A2(-14 * SIZE, A2, LDA, 2, %xmm10) + mulpd %xmm13, %xmm11 + addpd %xmm11, %xmm3 + MOVUPS_A2(-14 * SIZE, A2, LDA3, 1, %xmm11) + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm4 + MOVUPS_A1(-12 * SIZE, A1, %xmm8) + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm5 + MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm9) + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm6 + MOVUPS_A2(-12 * SIZE, A1, LDA, 2, %xmm10) + mulpd %xmm13, %xmm11 + MOVUPS_XL1(-10 * SIZE, X1, %xmm13) + addpd %xmm11, %xmm7 + MOVUPS_A2(-12 * SIZE, A1, LDA3, 1, %xmm11) + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-12 * SIZE, A2, %xmm8) + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm9) + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A2(-12 * SIZE, A2, LDA, 2, %xmm10) + mulpd %xmm12, %xmm11 + addpd %xmm11, %xmm3 + MOVUPS_A2(-12 * SIZE, A2, LDA3, 1, %xmm11) + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm4 + MOVUPS_A1(-10 * SIZE, A1, %xmm8) + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm5 + MOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm9) + mulpd %xmm12, 
%xmm10 + addpd %xmm10, %xmm6 + MOVUPS_A2(-10 * SIZE, A1, LDA, 2, %xmm10) + mulpd %xmm12, %xmm11 + addpd %xmm11, %xmm7 + MOVUPS_A2(-10 * SIZE, A1, LDA3, 1, %xmm11) + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-10 * SIZE, A2, %xmm8) + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm1 + MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm9) + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A2(-10 * SIZE, A2, LDA, 2, %xmm10) + mulpd %xmm13, %xmm11 + addpd %xmm11, %xmm3 + MOVUPS_A2(-10 * SIZE, A2, LDA3, 1, %xmm11) + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm4 + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm5 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm6 + mulpd %xmm13, %xmm11 + addpd %xmm11, %xmm7 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + ALIGN_4 + +.L15: + testq $4, M + jle .L16 + + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm9) + MOVUPS_A2(-16 * SIZE, A1, LDA, 2, %xmm10) + MOVUPS_A2(-16 * SIZE, A1, LDA3, 1, %xmm11) + + MOVUPS_XL1(-16 * SIZE, X1, %xmm12) + MOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-16 * SIZE, A2, %xmm8) + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm9) + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A2(-16 * SIZE, A2, LDA, 2, %xmm10) + mulpd %xmm12, %xmm11 + addpd %xmm11, %xmm3 + MOVUPS_A2(-16 * SIZE, A2, LDA3, 1, %xmm11) + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm4 + MOVUPS_A1(-14 * SIZE, A1, %xmm8) + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm5 + MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm9) + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm6 + MOVUPS_A2(-14 * SIZE, A1, LDA, 2, %xmm10) + mulpd %xmm12, %xmm11 + addpd %xmm11, %xmm7 + MOVUPS_A2(-14 * SIZE, A1, LDA3, 1, %xmm11) + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-14 * SIZE, A2, %xmm8) + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm1 + MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm9) + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A2(-14 * SIZE, A2, LDA, 2, %xmm10) + mulpd %xmm13, %xmm11 + addpd %xmm11, %xmm3 + MOVUPS_A2(-14 * SIZE, A2, LDA3, 1, %xmm11) + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm4 + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm5 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm6 + mulpd %xmm13, %xmm11 + addpd %xmm11, %xmm7 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, X1 + ALIGN_4 + +.L16: + testq $2, M + jle .L17 + + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm9) + MOVUPS_A2(-16 * SIZE, A1, LDA, 2, %xmm10) + MOVUPS_A2(-16 * SIZE, A1, LDA3, 1, %xmm11) + + MOVUPS_XL1(-16 * SIZE, X1, %xmm12) + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-16 * SIZE, A2, %xmm8) + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm9) + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A2(-16 * SIZE, A2, LDA, 2, %xmm10) + mulpd %xmm12, %xmm11 + addpd %xmm11, %xmm3 + MOVUPS_A2(-16 * SIZE, A2, LDA3, 1, %xmm11) + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm4 + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm5 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm6 + mulpd %xmm12, %xmm11 + addpd %xmm11, %xmm7 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_4 + +.L17: + testq $1, M + je .L18 + + movsd -16 * SIZE(X1), %xmm12 + + movsd -16 * SIZE(A1), %xmm8 + mulsd %xmm12, %xmm8 + addsd %xmm8, %xmm0 + movsd -16 * SIZE(A1, LDA), %xmm9 + mulsd %xmm12, %xmm9 + addsd %xmm9, %xmm1 + movsd -16 * SIZE(A1, LDA, 2), %xmm10 + mulsd %xmm12, %xmm10 + addsd %xmm10, %xmm2 + movsd -16 * SIZE(A1, LDA3), %xmm11 + mulsd %xmm12, %xmm11 + addsd %xmm11, %xmm3 + 
movsd -16 * SIZE(A2), %xmm8 + mulsd %xmm12, %xmm8 + addsd %xmm8, %xmm4 + movsd -16 * SIZE(A2, LDA), %xmm9 + mulsd %xmm12, %xmm9 + addsd %xmm9, %xmm5 + movsd -16 * SIZE(A2, LDA, 2), %xmm10 + mulsd %xmm12, %xmm10 + addsd %xmm10, %xmm6 + movsd -16 * SIZE(A2, LDA3), %xmm11 + mulsd %xmm12, %xmm11 + addsd %xmm11, %xmm7 + ALIGN_4 + +.L18: +#ifdef HAVE_SSE3 + haddpd %xmm1, %xmm0 + haddpd %xmm3, %xmm2 + haddpd %xmm5, %xmm4 + haddpd %xmm7, %xmm6 +#else + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd %xmm2, %xmm9 + unpcklpd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm9 + + movapd %xmm4, %xmm10 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm10 + + movapd %xmm6, %xmm11 + unpcklpd %xmm7, %xmm6 + unpckhpd %xmm7, %xmm11 + + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm2 + addpd %xmm10, %xmm4 + addpd %xmm11, %xmm6 +#endif + + mulpd ALPHA, %xmm0 + mulpd ALPHA, %xmm2 + mulpd ALPHA, %xmm4 + mulpd ALPHA, %xmm6 + + cmpq $SIZE, INCY + jne .L19 + + movsd 0 * SIZE(Y), %xmm8 + movhpd 1 * SIZE(Y), %xmm8 + movsd 2 * SIZE(Y), %xmm9 + movhpd 3 * SIZE(Y), %xmm9 + movsd 4 * SIZE(Y), %xmm10 + movhpd 5 * SIZE(Y), %xmm10 + movsd 6 * SIZE(Y), %xmm11 + movhpd 7 * SIZE(Y), %xmm11 + addq $8 * SIZE, Y + + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm2 + addpd %xmm10, %xmm4 + addpd %xmm11, %xmm6 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + movlpd %xmm2, 2 * SIZE(Y1) + movhpd %xmm2, 3 * SIZE(Y1) + movlpd %xmm4, 4 * SIZE(Y1) + movhpd %xmm4, 5 * SIZE(Y1) + movlpd %xmm6, 6 * SIZE(Y1) + movhpd %xmm6, 7 * SIZE(Y1) + addq $8 * SIZE, Y1 + + cmpq $8, N + jge .L11 + jmp .L20 + ALIGN_4 + +.L19: + movsd (Y), %xmm8 + addq INCY, Y + movhpd (Y), %xmm8 + addq INCY, Y + movsd (Y), %xmm9 + addq INCY, Y + movhpd (Y), %xmm9 + addq INCY, Y + movsd (Y), %xmm10 + addq INCY, Y + movhpd (Y), %xmm10 + addq INCY, Y + movsd (Y), %xmm11 + addq INCY, Y + movhpd (Y), %xmm11 + addq INCY, Y + + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm2 + addpd %xmm10, %xmm4 + addpd %xmm11, %xmm6 + + movlpd %xmm0, (Y1) + addq INCY, Y1 + movhpd %xmm0, (Y1) + addq INCY, Y1 + movlpd %xmm2, (Y1) + addq INCY, Y1 + movhpd %xmm2, (Y1) + addq INCY, Y1 + movlpd %xmm4, (Y1) + addq INCY, Y1 + movhpd %xmm4, (Y1) + addq INCY, Y1 + movlpd %xmm6, (Y1) + addq INCY, Y1 + movhpd %xmm6, (Y1) + addq INCY, Y1 + + cmpq $8, N + jge .L11 + ALIGN_4 + +.L20: +#endif + +#if GEMV_UNROLL >= 4 + + cmpq $4, N + jl .L30 + +#if GEMV_UNROLL == 4 + ALIGN_3 + +.L21: +#endif + + subq $4, N + + leaq 16 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA, 2), A2 + leaq (A1, LDA, 4), A + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#if (GEMV_UNROLL == 4 ) && defined(PREFETCHW) + PREFETCHW 3 * SIZE(Y1) +#endif + +#ifdef ALIGNED_ACCESS + testq $SIZE, A + je .L2X + + movsd -16 * SIZE(X1), %xmm12 + + movsd -16 * SIZE(A1), %xmm8 + mulsd %xmm12, %xmm8 + addsd %xmm8, %xmm0 + movsd -16 * SIZE(A1, LDA), %xmm9 + mulsd %xmm12, %xmm9 + addsd %xmm9, %xmm1 + movsd -16 * SIZE(A2), %xmm10 + mulsd %xmm12, %xmm10 + addsd %xmm10, %xmm2 + movsd -16 * SIZE(A2, LDA), %xmm11 + mulsd %xmm12, %xmm11 + addsd %xmm11, %xmm3 + + addq $SIZE, A1 + addq $SIZE, A2 + addq $SIZE, X1 + ALIGN_3 + +.L2X: +#endif + + movq M, I + sarq $3, I + jle .L25 + + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm9) + MOVUPS_A1(-16 * SIZE, A2, %xmm10) + MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm11) + + MOVUPS_XL1(-16 * SIZE, X1, %xmm12) + MOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + decq I + jle .L23 + ALIGN_4 + +.L22: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) 
+#endif + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-14 * SIZE, A1, %xmm8) + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm9) + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A1(-14 * SIZE, A2, %xmm10) + mulpd %xmm12, %xmm11 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + addpd %xmm11, %xmm3 + MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm11) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1, LDA) +#endif + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-12 * SIZE, A1, %xmm8) + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm1 + MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm9) + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A1(-12 * SIZE, A2, %xmm10) + mulpd %xmm13, %xmm11 + MOVUPS_XL1(-10 * SIZE, X1, %xmm13) + addpd %xmm11, %xmm3 + MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm11) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) +#endif + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-10 * SIZE, A1, %xmm8) + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + MOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm9) + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A1(-10 * SIZE, A2, %xmm10) + mulpd %xmm12, %xmm11 + MOVUPS_XL1( -8 * SIZE, X1, %xmm12) + addpd %xmm11, %xmm3 + MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm11) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2, LDA) +#endif + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1( -8 * SIZE, A1, %xmm8) + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm1 + MOVUPS_A2( -8 * SIZE, A1, LDA, 1, %xmm9) + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1) +#endif + + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A1( -8 * SIZE, A2, %xmm10) + mulpd %xmm13, %xmm11 + MOVUPS_XL1( -6 * SIZE, X1, %xmm13) + addpd %xmm11, %xmm3 + MOVUPS_A2( -8 * SIZE, A2, LDA, 1, %xmm11) + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + + decq I + jg .L22 + ALIGN_4 + +.L23: + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-14 * SIZE, A1, %xmm8) + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm9) + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A1(-14 * SIZE, A2, %xmm10) + mulpd %xmm12, %xmm11 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + addpd %xmm11, %xmm3 + MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm11) + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-12 * SIZE, A1, %xmm8) + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm1 + MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm9) + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A1(-12 * SIZE, A2, %xmm10) + mulpd %xmm13, %xmm11 + MOVUPS_XL1(-10 * SIZE, X1, %xmm13) + addpd %xmm11, %xmm3 + MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm11) + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-10 * SIZE, A1, %xmm8) + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + MOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm9) + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A1(-10 * SIZE, A2, %xmm10) + mulpd %xmm12, %xmm11 + addpd %xmm11, %xmm3 + MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm11) + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm1 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + mulpd %xmm13, %xmm11 + addpd %xmm11, %xmm3 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + ALIGN_4 + +.L25: + testq $4, M + jle .L26 + + MOVUPS_XL1(-16 * SIZE, X1, %xmm12) + + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm9) + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + MOVUPS_A1(-16 * SIZE, A2, %xmm10) + mulpd 
%xmm12, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm11) + mulpd %xmm12, %xmm11 + addpd %xmm11, %xmm3 + + MOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + MOVUPS_A1(-14 * SIZE, A1, %xmm8) + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm9) + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm1 + MOVUPS_A1(-14 * SIZE, A2, %xmm10) + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm11) + mulpd %xmm13, %xmm11 + addpd %xmm11, %xmm3 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, X1 + ALIGN_4 + +.L26: + testq $2, M + jle .L27 + + MOVUPS_XL1(-16 * SIZE, X1, %xmm12) + + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm9) + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + MOVUPS_A1(-16 * SIZE, A2, %xmm10) + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm11) + mulpd %xmm12, %xmm11 + addpd %xmm11, %xmm3 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_4 + +.L27: + testq $1, M + je .L28 + + movsd -16 * SIZE(X1), %xmm12 + + movsd -16 * SIZE(A1), %xmm8 + mulsd %xmm12, %xmm8 + addsd %xmm8, %xmm0 + movsd -16 * SIZE(A1, LDA), %xmm9 + mulsd %xmm12, %xmm9 + addsd %xmm9, %xmm1 + movsd -16 * SIZE(A2), %xmm10 + mulsd %xmm12, %xmm10 + addsd %xmm10, %xmm2 + movsd -16 * SIZE(A2, LDA), %xmm11 + mulsd %xmm12, %xmm11 + addsd %xmm11, %xmm3 + ALIGN_4 + +.L28: +#ifdef HAVE_SSE3 + haddpd %xmm1, %xmm0 + haddpd %xmm3, %xmm2 +#else + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd %xmm2, %xmm9 + unpcklpd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm9 + + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm2 +#endif + + mulpd ALPHA, %xmm0 + mulpd ALPHA, %xmm2 + + cmpq $SIZE, INCY + jne .L29 + + movsd 0 * SIZE(Y), %xmm4 + movhpd 1 * SIZE(Y), %xmm4 + movsd 2 * SIZE(Y), %xmm5 + movhpd 3 * SIZE(Y), %xmm5 + addq $4 * SIZE, Y + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm2 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + movlpd %xmm2, 2 * SIZE(Y1) + movhpd %xmm2, 3 * SIZE(Y1) + addq $4 * SIZE, Y1 + +#if GEMV_UNROLL == 4 + cmpq $4, N + jge .L21 +#endif + jmp .L30 + ALIGN_4 + +.L29: + movsd (Y), %xmm4 + addq INCY, Y + movhpd (Y), %xmm4 + addq INCY, Y + movsd (Y), %xmm5 + addq INCY, Y + movhpd (Y), %xmm5 + addq INCY, Y + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm2 + + movlpd %xmm0, (Y1) + addq INCY, Y1 + movhpd %xmm0, (Y1) + addq INCY, Y1 + movlpd %xmm2, (Y1) + addq INCY, Y1 + movhpd %xmm2, (Y1) + addq INCY, Y1 + +#if GEMV_UNROLL == 4 + cmpq $4, N + jge .L21 +#endif + ALIGN_4 + +.L30: +#endif + +#if GEMV_UNROLL >= 2 + + cmpq $2, N + jl .L40 + +#if GEMV_UNROLL == 2 + ALIGN_3 + +.L31: +#endif + + subq $2, N + + leaq 16 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA), A2 + leaq (A1, LDA, 2), A + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#if (GEMV_UNROLL == 2 ) && defined(PREFETCHW) + PREFETCHW 2 * SIZE(Y1) +#endif + +#ifdef ALIGNED_ACCESS + testq $SIZE, A + je .L3X + + movsd -16 * SIZE(X1), %xmm12 + + movsd -16 * SIZE(A1), %xmm8 + mulsd %xmm12, %xmm8 + addsd %xmm8, %xmm0 + movsd -16 * SIZE(A2), %xmm9 + mulsd %xmm12, %xmm9 + addsd %xmm9, %xmm1 + + addq $SIZE, A1 + addq $SIZE, A2 + addq $SIZE, X1 + ALIGN_3 + +.L3X: +#endif + + movq M, I + sarq $3, I + jle .L35 + + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + MOVUPS_A1(-16 * SIZE, A2, %xmm9) + MOVUPS_A1(-14 * SIZE, A1, %xmm10) + MOVUPS_A1(-14 * SIZE, A2, %xmm11) + + MOVUPS_XL1(-16 * SIZE, X1, %xmm12) + MOVUPS_XL1(-14 * SIZE, X1, %xmm13) 
+ + decq I + jle .L33 + ALIGN_4 + +.L32: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) +#endif + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-12 * SIZE, A1, %xmm8) + mulpd %xmm12, %xmm9 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + addpd %xmm9, %xmm1 + MOVUPS_A1(-12 * SIZE, A2, %xmm9) + + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A1(-10 * SIZE, A1, %xmm10) + mulpd %xmm13, %xmm11 + MOVUPS_XL1(-10 * SIZE, X1, %xmm13) + addpd %xmm11, %xmm3 + MOVUPS_A1(-10 * SIZE, A2, %xmm11) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A2) +#endif + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1( -8 * SIZE, A1, %xmm8) + mulpd %xmm12, %xmm9 + MOVUPS_XL1( -8 * SIZE, X1, %xmm12) + addpd %xmm9, %xmm1 + MOVUPS_A1( -8 * SIZE, A2, %xmm9) + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1) +#endif + + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A1( -6 * SIZE, A1, %xmm10) + mulpd %xmm13, %xmm11 + MOVUPS_XL1( -6 * SIZE, X1, %xmm13) + addpd %xmm11, %xmm3 + MOVUPS_A1( -6 * SIZE, A2, %xmm11) + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + + decq I + jg .L32 + ALIGN_4 + +.L33: + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-12 * SIZE, A1, %xmm8) + mulpd %xmm12, %xmm9 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + addpd %xmm9, %xmm1 + MOVUPS_A1(-12 * SIZE, A2, %xmm9) + + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A1(-10 * SIZE, A1, %xmm10) + mulpd %xmm13, %xmm11 + MOVUPS_XL1(-10 * SIZE, X1, %xmm13) + addpd %xmm11, %xmm3 + MOVUPS_A1(-10 * SIZE, A2, %xmm11) + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + mulpd %xmm13, %xmm11 + addpd %xmm11, %xmm3 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + ALIGN_4 + +.L35: + testq $4, M + jle .L36 + + MOVUPS_XL1(-16 * SIZE, X1, %xmm12) + + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-16 * SIZE, A2, %xmm9) + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + + MOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + MOVUPS_A1(-14 * SIZE, A1, %xmm10) + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A1(-14 * SIZE, A2, %xmm11) + mulpd %xmm13, %xmm11 + addpd %xmm11, %xmm3 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, X1 + ALIGN_4 + +.L36: + testq $2, M + jle .L37 + + MOVUPS_XL1(-16 * SIZE, X1, %xmm12) + + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-16 * SIZE, A2, %xmm9) + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_4 + +.L37: + testq $1, M + je .L38 + + movsd -16 * SIZE(X1), %xmm12 + + movsd -16 * SIZE(A1), %xmm8 + mulsd %xmm12, %xmm8 + addsd %xmm8, %xmm0 + movsd -16 * SIZE(A2), %xmm9 + mulsd %xmm12, %xmm9 + addsd %xmm9, %xmm1 + ALIGN_4 + +.L38: + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + +#ifdef HAVE_SSE3 + haddpd %xmm1, %xmm0 +#else + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + addpd %xmm8, %xmm0 +#endif + + mulpd ALPHA, %xmm0 + + movsd (Y), %xmm4 + addq INCY, Y + movhpd (Y), %xmm4 + addq INCY, Y + + addpd %xmm4, %xmm0 + + movlpd %xmm0, (Y1) + addq INCY, Y1 + movhpd %xmm0, (Y1) + addq INCY, Y1 + +#if GEMV_UNROLL == 2 + cmpq $2, N + jge .L31 +#endif + ALIGN_4 + +.L40: + cmpq $1, N + jl .L999 + +#endif + + leaq 16 * SIZE(BUFFER), X1 + + movq A, A1 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#ifdef ALIGNED_ACCESS + testq $SIZE, A + je .L4X + + 
movsd -16 * SIZE(X1), %xmm12 + movsd -16 * SIZE(A1), %xmm8 + mulsd %xmm12, %xmm8 + addsd %xmm8, %xmm0 + + addq $SIZE, A1 + addq $SIZE, X1 + ALIGN_3 + +.L4X: +#endif + + movq M, I + sarq $3, I + jle .L45 + + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + MOVUPS_A1(-14 * SIZE, A1, %xmm9) + MOVUPS_A1(-12 * SIZE, A1, %xmm10) + MOVUPS_A1(-10 * SIZE, A1, %xmm11) + + MOVUPS_XL1(-16 * SIZE, X1, %xmm12) + MOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + decq I + jle .L43 + ALIGN_4 + +.L42: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(A1) +#endif + + mulpd %xmm12, %xmm8 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + addpd %xmm8, %xmm0 + MOVUPS_A1( -8 * SIZE, A1, %xmm8) + mulpd %xmm13, %xmm9 + MOVUPS_XL1(-10 * SIZE, X1, %xmm13) + addpd %xmm9, %xmm2 + MOVUPS_A1( -6 * SIZE, A1, %xmm9) + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(X1) +#endif + + mulpd %xmm12, %xmm10 + MOVUPS_XL1( -8 * SIZE, X1, %xmm12) + addpd %xmm10, %xmm0 + MOVUPS_A1( -4 * SIZE, A1, %xmm10) + mulpd %xmm13, %xmm11 + MOVUPS_XL1( -6 * SIZE, X1, %xmm13) + addpd %xmm11, %xmm2 + MOVUPS_A1( -2 * SIZE, A1, %xmm11) + + addq $8 * SIZE, A1 + addq $8 * SIZE, X1 + + decq I + jg .L42 + ALIGN_4 + +.L43: + mulpd %xmm12, %xmm8 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + addpd %xmm8, %xmm0 + mulpd %xmm13, %xmm9 + MOVUPS_XL1(-10 * SIZE, X1, %xmm13) + addpd %xmm9, %xmm2 + + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm0 + mulpd %xmm13, %xmm11 + addpd %xmm11, %xmm2 + + addq $8 * SIZE, A1 + addq $8 * SIZE, X1 + ALIGN_4 + +.L45: + testq $4, M + jle .L46 + + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + MOVUPS_A1(-14 * SIZE, A1, %xmm9) + + MOVUPS_XL1(-16 * SIZE, X1, %xmm12) + MOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm2 + + addq $4 * SIZE, A1 + addq $4 * SIZE, X1 + ALIGN_4 + +.L46: + testq $2, M + jle .L47 + + MOVUPS_XL1(-16 * SIZE, X1, %xmm12) + + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + + addq $2 * SIZE, A1 + addq $2 * SIZE, X1 + ALIGN_4 + +.L47: + testq $1, M + je .L48 + + movsd -16 * SIZE(X1), %xmm12 + + movsd -16 * SIZE(A1), %xmm8 + mulsd %xmm12, %xmm8 + addsd %xmm8, %xmm0 + ALIGN_4 + +.L48: + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + + addpd %xmm1, %xmm0 + +#ifdef HAVE_SSE3 + haddpd %xmm1, %xmm0 +#else + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + addsd %xmm8, %xmm0 +#endif + + mulsd ALPHA, %xmm0 + + movsd (Y), %xmm4 + addq INCY, Y + + addsd %xmm4, %xmm0 + + movlpd %xmm0, (Y1) + addq INCY, Y1 + +#ifdef ALIGNED_ACCESS + jmp .L999 + ALIGN_4 + +.L50: +#if GEMV_UNROLL >= 4 + + cmpq $4, N + jl .L60 + ALIGN_3 + +.L51: + subq $4, N + + leaq 16 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA, 2), A2 + leaq (A1, LDA, 4), A + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#ifdef PREFETCHW + PREFETCHW 3 * SIZE(Y1) +#endif + +#ifdef ALIGNED_ACCESS + testq $SIZE, A + je .L5X + + movsd -16 * SIZE(X1), %xmm12 + + movsd -16 * SIZE(A1), %xmm4 + mulsd %xmm12, %xmm4 + addsd %xmm4, %xmm0 + movsd -16 * SIZE(A1, LDA), %xmm5 + mulsd %xmm12, %xmm5 + addsd %xmm5, %xmm1 + movsd -16 * SIZE(A2), %xmm6 + mulsd %xmm12, %xmm6 + addsd %xmm6, %xmm2 + movsd -16 * SIZE(A2, LDA), %xmm7 + mulsd %xmm12, %xmm7 + addsd %xmm7, %xmm3 + + addq $SIZE, A1 + addq $SIZE, A2 + addq $SIZE, X1 + ALIGN_3 + +.L5X: +#endif + + movhpd -16 * SIZE(A1, LDA), %xmm8 + movhpd -16 * SIZE(A2, LDA), %xmm9 + + movq M, I + sarq $3, I + jle .L55 + + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + MOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm5) + MOVUPS_A1(-16 * 
SIZE, A2, %xmm6) + MOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm7) + + MOVUPS_XL1(-16 * SIZE, X1, %xmm12) + MOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + decq I + jle .L53 + ALIGN_4 + +.L52: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) +#endif + + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-14 * SIZE, A1, %xmm4) + shufpd $1, %xmm5, %xmm8 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm1 + MOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm8) + + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A1(-14 * SIZE, A2, %xmm6) + shufpd $1, %xmm7, %xmm9 + mulpd %xmm12, %xmm9 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + addpd %xmm9, %xmm3 + MOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm9) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET + 8(A1, LDA) +#endif + mulpd %xmm13, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-12 * SIZE, A1, %xmm4) + shufpd $1, %xmm8, %xmm5 + mulpd %xmm13, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_A2(-11 * SIZE, A1, LDA, 1, %xmm5) + + mulpd %xmm13, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A1(-12 * SIZE, A2, %xmm6) + shufpd $1, %xmm9, %xmm7 + mulpd %xmm13, %xmm7 + MOVUPS_XL1(-10 * SIZE, X1, %xmm13) + addpd %xmm7, %xmm3 + MOVUPS_A2(-11 * SIZE, A2, LDA, 1, %xmm7) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) +#endif + + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-10 * SIZE, A1, %xmm4) + shufpd $1, %xmm5, %xmm8 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm1 + MOVUPS_A2( -9 * SIZE, A1, LDA, 1, %xmm8) + + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A1(-10 * SIZE, A2, %xmm6) + shufpd $1, %xmm7, %xmm9 + mulpd %xmm12, %xmm9 + MOVUPS_XL1(-8 * SIZE, X1, %xmm12) + addpd %xmm9, %xmm3 + MOVUPS_A2( -9 * SIZE, A2, LDA, 1, %xmm9) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET + 8(A2, LDA) +#endif + + mulpd %xmm13, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-8 * SIZE, A1, %xmm4) + shufpd $1, %xmm8, %xmm5 + mulpd %xmm13, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_A2(-7 * SIZE, A1, LDA, 1, %xmm5) + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET + 8(X1) +#endif + + mulpd %xmm13, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A1(-8 * SIZE, A2, %xmm6) + shufpd $1, %xmm9, %xmm7 + mulpd %xmm13, %xmm7 + MOVUPS_XL1(-6 * SIZE, X1, %xmm13) + addpd %xmm7, %xmm3 + MOVUPS_A2(-7 * SIZE, A2, LDA, 1, %xmm7) + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + + decq I + jg .L52 + ALIGN_4 + +.L53: + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-14 * SIZE, A1, %xmm4) + shufpd $1, %xmm5, %xmm8 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm1 + MOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm8) + + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A1(-14 * SIZE, A2, %xmm6) + shufpd $1, %xmm7, %xmm9 + mulpd %xmm12, %xmm9 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + addpd %xmm9, %xmm3 + MOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm9) + + mulpd %xmm13, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-12 * SIZE, A1, %xmm4) + shufpd $1, %xmm8, %xmm5 + mulpd %xmm13, %xmm5 + addpd %xmm5, %xmm1 + MOVUPS_A2(-11 * SIZE, A1, LDA, 1, %xmm5) + mulpd %xmm13, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A1(-12 * SIZE, A2, %xmm6) + shufpd $1, %xmm9, %xmm7 + mulpd %xmm13, %xmm7 + MOVUPS_XL1(-10 * SIZE, X1, %xmm13) + addpd %xmm7, %xmm3 + MOVUPS_A2(-11 * SIZE, A2, LDA, 1, %xmm7) + + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-10 * SIZE, A1, %xmm4) + shufpd $1, %xmm5, %xmm8 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm1 + MOVUPS_A2( -9 * SIZE, A1, LDA, 1, %xmm8) + + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A1(-10 * SIZE, A2, %xmm6) + shufpd $1, %xmm7, %xmm9 + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm3 + 
MOVUPS_A2( -9 * SIZE, A2, LDA, 1, %xmm9) + + mulpd %xmm13, %xmm4 + addpd %xmm4, %xmm0 + shufpd $1, %xmm8, %xmm5 + mulpd %xmm13, %xmm5 + addpd %xmm5, %xmm1 + + mulpd %xmm13, %xmm6 + addpd %xmm6, %xmm2 + shufpd $1, %xmm9, %xmm7 + mulpd %xmm13, %xmm7 + addpd %xmm7, %xmm3 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + ALIGN_4 + +.L55: + testq $4, M + jle .L56 + + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + MOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm5) + MOVUPS_A1(-16 * SIZE, A2, %xmm6) + MOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm7) + + MOVUPS_XL1(-16 * SIZE, X1, %xmm12) + MOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-14 * SIZE, A1, %xmm4) + + shufpd $1, %xmm5, %xmm8 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm1 + MOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm8) + + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A1(-14 * SIZE, A2, %xmm6) + + shufpd $1, %xmm7, %xmm9 + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm3 + MOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm9) + + mulpd %xmm13, %xmm4 + addpd %xmm4, %xmm0 + shufpd $1, %xmm8, %xmm5 + mulpd %xmm13, %xmm5 + addpd %xmm5, %xmm1 + mulpd %xmm13, %xmm6 + addpd %xmm6, %xmm2 + shufpd $1, %xmm9, %xmm7 + mulpd %xmm13, %xmm7 + addpd %xmm7, %xmm3 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, X1 + ALIGN_4 + +.L56: + testq $2, M + jle .L57 + + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + MOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm5) + MOVUPS_A1(-16 * SIZE, A2, %xmm6) + MOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm7) + + MOVUPS_XL1(-16 * SIZE, X1, %xmm12) + + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + shufpd $1, %xmm5, %xmm8 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm1 + movaps %xmm5, %xmm8 + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm2 + shufpd $1, %xmm7, %xmm9 + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm3 + movaps %xmm7, %xmm9 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_4 + +.L57: + testq $1, M + je .L58 + + movsd -16 * SIZE(X1), %xmm12 + + movsd -16 * SIZE(A1), %xmm4 + mulsd %xmm12, %xmm4 + addsd %xmm4, %xmm0 + shufpd $1, %xmm8, %xmm8 + mulsd %xmm12, %xmm8 + addsd %xmm8, %xmm1 + movsd -16 * SIZE(A2), %xmm6 + mulsd %xmm12, %xmm6 + addsd %xmm6, %xmm2 + shufpd $1, %xmm9, %xmm9 + mulsd %xmm12, %xmm9 + addsd %xmm9, %xmm3 + ALIGN_4 + +.L58: +#ifdef HAVE_SSE3 + haddpd %xmm1, %xmm0 + haddpd %xmm3, %xmm2 +#else + movapd %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm4 + + movapd %xmm2, %xmm5 + unpcklpd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm5 + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm2 +#endif + + mulpd ALPHA, %xmm0 + mulpd ALPHA, %xmm2 + + cmpq $SIZE, INCY + jne .L59 + + movsd 0 * SIZE(Y), %xmm4 + movhpd 1 * SIZE(Y), %xmm4 + movsd 2 * SIZE(Y), %xmm5 + movhpd 3 * SIZE(Y), %xmm5 + addq $4 * SIZE, Y + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm2 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + movlpd %xmm2, 2 * SIZE(Y1) + movhpd %xmm2, 3 * SIZE(Y1) + addq $4 * SIZE, Y1 + + cmpq $4, N + jge .L51 + jmp .L60 + ALIGN_4 + +.L59: + movsd (Y), %xmm4 + addq INCY, Y + movhpd (Y), %xmm4 + addq INCY, Y + movsd (Y), %xmm5 + addq INCY, Y + movhpd (Y), %xmm5 + addq INCY, Y + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm2 + + movlpd %xmm0, (Y1) + addq INCY, Y1 + movhpd %xmm0, (Y1) + addq INCY, Y1 + movlpd %xmm2, (Y1) + addq INCY, Y1 + movhpd %xmm2, (Y1) + addq INCY, Y1 + cmpq $4, N + jge .L51 + ALIGN_4 + +.L60: +#endif + +#if GEMV_UNROLL >= 2 + + cmpq $2, N + jl .L70 + +#if GEMV_UNROLL == 2 + ALIGN_3 + +.L61: +#endif + + subq $2, N + + leaq 16 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA), A2 + leaq (A1, LDA, 2), A + + 
xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#if (GEMV_UNROLL == 2 ) && defined(PREFETCHW) + PREFETCHW 2 * SIZE(Y1) +#endif + +#ifdef ALIGNED_ACCESS + testq $SIZE, A + je .L6X + + movsd -16 * SIZE(X1), %xmm12 + + movsd -16 * SIZE(A1), %xmm4 + mulsd %xmm12, %xmm4 + addsd %xmm4, %xmm0 + movsd -16 * SIZE(A2), %xmm5 + mulsd %xmm12, %xmm5 + addsd %xmm5, %xmm1 + + addq $SIZE, A1 + addq $SIZE, A2 + addq $SIZE, X1 + ALIGN_3 + +.L6X: +#endif + + movhpd -16 * SIZE(A2), %xmm8 + + movq M, I + sarq $3, I + jle .L65 + + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + MOVUPS_A1(-15 * SIZE, A2, %xmm5) + MOVUPS_A1(-14 * SIZE, A1, %xmm6) + MOVUPS_A1(-13 * SIZE, A2, %xmm7) + + MOVUPS_XL1(-16 * SIZE, X1, %xmm12) + MOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + decq I + jle .L63 + ALIGN_4 + +.L62: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) +#endif + + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-12 * SIZE, A1, %xmm4) + shufpd $1, %xmm5, %xmm8 + mulpd %xmm12, %xmm8 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + addpd %xmm8, %xmm1 + MOVUPS_A1(-11 * SIZE, A2, %xmm9) + + mulpd %xmm13, %xmm6 + addpd %xmm6, %xmm0 + MOVUPS_A1(-10 * SIZE, A1, %xmm6) + shufpd $1, %xmm7, %xmm5 + mulpd %xmm13, %xmm5 + MOVUPS_XL1(-10 * SIZE, X1, %xmm13) + addpd %xmm5, %xmm1 + MOVUPS_A1( -9 * SIZE, A2, %xmm8) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET + 8(A2) +#endif + + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-8 * SIZE, A1, %xmm4) + shufpd $1, %xmm9, %xmm7 + mulpd %xmm12, %xmm7 + MOVUPS_XL1(-8 * SIZE, X1, %xmm12) + addpd %xmm7, %xmm1 + MOVUPS_A1(-7 * SIZE, A2, %xmm5) + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET + 8(X1) +#endif + + mulpd %xmm13, %xmm6 + addpd %xmm6, %xmm0 + MOVUPS_A1(-6 * SIZE, A1, %xmm6) + shufpd $1, %xmm8, %xmm9 + mulpd %xmm13, %xmm9 + MOVUPS_XL1(-6 * SIZE, X1, %xmm13) + addpd %xmm9, %xmm1 + MOVUPS_A1(-5 * SIZE, A2, %xmm7) + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + + decq I + jg .L62 + ALIGN_4 + +.L63: + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-12 * SIZE, A1, %xmm4) + shufpd $1, %xmm5, %xmm8 + mulpd %xmm12, %xmm8 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + addpd %xmm8, %xmm1 + MOVUPS_A1(-11 * SIZE, A2, %xmm9) + + mulpd %xmm13, %xmm6 + addpd %xmm6, %xmm0 + MOVUPS_A1(-10 * SIZE, A1, %xmm6) + shufpd $1, %xmm7, %xmm5 + mulpd %xmm13, %xmm5 + MOVUPS_XL1(-10 * SIZE, X1, %xmm13) + addpd %xmm5, %xmm1 + MOVUPS_A1( -9 * SIZE, A2, %xmm8) + + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + shufpd $1, %xmm9, %xmm7 + mulpd %xmm12, %xmm7 + addpd %xmm7, %xmm1 + + mulpd %xmm13, %xmm6 + addpd %xmm6, %xmm0 + shufpd $1, %xmm8, %xmm9 + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm1 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + ALIGN_4 + +.L65: + testq $4, M + jle .L66 + + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + MOVUPS_A1(-15 * SIZE, A2, %xmm5) + MOVUPS_A1(-14 * SIZE, A1, %xmm6) + MOVUPS_A1(-13 * SIZE, A2, %xmm7) + + MOVUPS_XL1(-16 * SIZE, X1, %xmm12) + MOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + shufpd $1, %xmm5, %xmm8 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm1 + + mulpd %xmm13, %xmm6 + addpd %xmm6, %xmm0 + shufpd $1, %xmm7, %xmm5 + movaps %xmm7, %xmm8 + mulpd %xmm13, %xmm5 + addpd %xmm5, %xmm1 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, X1 + ALIGN_4 + +.L66: + testq $2, M + jle .L67 + + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + MOVUPS_A1(-15 * SIZE, A2, %xmm5) + + MOVUPS_XL1(-16 * SIZE, X1, %xmm12) + + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + shufpd 
$1, %xmm5, %xmm8 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm1 + movaps %xmm5, %xmm8 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_4 + +.L67: + testq $1, M + je .L68 + + movsd -16 * SIZE(X1), %xmm12 + + movsd -16 * SIZE(A1), %xmm4 + mulsd %xmm12, %xmm4 + addsd %xmm4, %xmm0 + shufpd $1, %xmm8, %xmm8 + mulsd %xmm12, %xmm8 + addsd %xmm8, %xmm1 + ALIGN_4 + +.L68: + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + +#ifdef HAVE_SSE3 + haddpd %xmm1, %xmm0 +#else + movapd %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm4 + + addpd %xmm4, %xmm0 +#endif + + mulpd ALPHA, %xmm0 + + movsd (Y), %xmm4 + addq INCY, Y + movhpd (Y), %xmm4 + addq INCY, Y + + addpd %xmm4, %xmm0 + + movlpd %xmm0, (Y1) + addq INCY, Y1 + movhpd %xmm0, (Y1) + addq INCY, Y1 + +#if GEMV_UNROLL == 2 + cmpq $2, N + jge .L61 +#endif + ALIGN_4 + +.L70: + cmpq $1, N + jl .L999 + +#endif + + leaq 16 * SIZE(BUFFER), X1 + + movq A, A1 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#ifdef ALIGNED_ACCESS + testq $SIZE, A + je .L7X + + movsd -16 * SIZE(X1), %xmm12 + + movsd -16 * SIZE(A1), %xmm4 + mulsd %xmm12, %xmm4 + addsd %xmm4, %xmm0 + + addq $SIZE, A1 + addq $SIZE, X1 + ALIGN_3 + +.L7X: +#endif + movq M, I + sarq $3, I + jle .L75 + + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + MOVUPS_A1(-14 * SIZE, A1, %xmm5) + MOVUPS_A1(-12 * SIZE, A1, %xmm6) + MOVUPS_A1(-10 * SIZE, A1, %xmm7) + + MOVUPS_XL1(-16 * SIZE, X1, %xmm12) + MOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + decq I + jle .L73 + ALIGN_4 + +.L72: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(A1) +#endif + + mulpd %xmm12, %xmm4 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + addpd %xmm4, %xmm0 + MOVUPS_A1( -8 * SIZE, A1, %xmm4) + mulpd %xmm13, %xmm5 + MOVUPS_XL1(-10 * SIZE, X1, %xmm13) + addpd %xmm5, %xmm2 + MOVUPS_A1( -6 * SIZE, A1, %xmm5) + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(X1) +#endif + + mulpd %xmm12, %xmm6 + MOVUPS_XL1( -8 * SIZE, X1, %xmm12) + addpd %xmm6, %xmm0 + MOVUPS_A1( -4 * SIZE, A1, %xmm6) + mulpd %xmm13, %xmm7 + MOVUPS_XL1( -6 * SIZE, X1, %xmm13) + addpd %xmm7, %xmm2 + MOVUPS_A1( -2 * SIZE, A1, %xmm7) + + addq $8 * SIZE, A1 + addq $8 * SIZE, X1 + + decq I + jg .L72 + ALIGN_4 + +.L73: + mulpd %xmm12, %xmm4 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + addpd %xmm4, %xmm0 + mulpd %xmm13, %xmm5 + MOVUPS_XL1(-10 * SIZE, X1, %xmm13) + addpd %xmm5, %xmm2 + + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm13, %xmm7 + addpd %xmm7, %xmm2 + + addq $8 * SIZE, A1 + addq $8 * SIZE, X1 + ALIGN_4 + +.L75: + testq $4, M + jle .L76 + + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + MOVUPS_A1(-14 * SIZE, A1, %xmm5) + + MOVUPS_XL1(-16 * SIZE, X1, %xmm12) + MOVUPS_XL1(-14 * SIZE, X1, %xmm13) + + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm13, %xmm5 + addpd %xmm5, %xmm2 + + addq $4 * SIZE, A1 + addq $4 * SIZE, X1 + ALIGN_4 + +.L76: + testq $2, M + jle .L77 + + MOVUPS_XL1(-16 * SIZE, X1, %xmm12) + + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + + addq $2 * SIZE, A1 + addq $2 * SIZE, X1 + ALIGN_4 + +.L77: + testq $1, M + je .L78 + + movsd -16 * SIZE(X1), %xmm12 + + movsd -16 * SIZE(A1), %xmm4 + mulsd %xmm12, %xmm4 + addsd %xmm4, %xmm0 + ALIGN_4 + +.L78: + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + + addpd %xmm1, %xmm0 + +#ifdef HAVE_SSE3 + haddpd %xmm1, %xmm0 +#else + movapd %xmm0, %xmm4 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm4 + + addsd %xmm4, %xmm0 +#endif + + mulsd ALPHA, %xmm0 + + movsd (Y), %xmm4 + addq INCY, Y + + addsd %xmm4, %xmm0 + + movlpd %xmm0, (Y1) + addq 
INCY, Y1 +#endif + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + ALIGN_4 + + EPILOGUE diff --git a/kernel/x86_64/dgemv_t_atom.S b/kernel/x86_64/dgemv_t_atom.S new file mode 100644 index 0000000000..246bdd3e40 --- /dev/null +++ b/kernel/x86_64/dgemv_t_atom.S @@ -0,0 +1,686 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
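Note: the kernel that ends above implements the transposed double-precision GEMV update y := alpha * A^T * x + y. Each column of A is reduced against the (buffered) x vector, the packed partial sums are collapsed with haddpd (or unpcklpd/unpckhpd when SSE3 is unavailable), scaled by ALPHA and added into Y, with separate store paths for INCY == 1 and strided Y. As a plain C reference for the same operation -- a sketch only, not the optimized kernel; the name dgemv_t_ref is illustrative and not part of the patch -- assuming column-major A with leading dimension lda and element strides incx/incy:

/* y[j*incy] += alpha * sum_i a[i + j*lda] * x[i*incx], j = 0..n-1 */
static void dgemv_t_ref(long m, long n, double alpha,
                        const double *a, long lda,
                        const double *x, long incx,
                        double *y, long incy)
{
    for (long j = 0; j < n; j++) {
        double sum = 0.0;
        for (long i = 0; i < m; i++)
            sum += a[i + j * lda] * x[i * incx];
        y[j * incy] += alpha * sum;
    }
}

The file introduced next, kernel/x86_64/dgemv_t_atom.S, performs the same operation with scalar SSE2 loads and multiplies (movsd/mulsd/addsd).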
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "l2param.h" + +#define PREFETCH prefetchnta +#define PREFETCHW prefetcht0 +#define PREFETCH_SIZE (8 * 6) + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_INCX 8 + STACKSIZE(%rsp) +#define OLD_Y 16 + STACKSIZE(%rsp) +#define OLD_INCY 24 + STACKSIZE(%rsp) +#define OLD_BUFFER 32 + STACKSIZE(%rsp) + +#define M %rdi +#define N %rsi +#define A %rcx +#define LDA %r8 +#define X %r9 +#define INCX %rdx +#define Y %rbp +#define INCY %r10 +#define BUFFER %rbx + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_LDA 48 + STACKSIZE(%rsp) +#define OLD_X 56 + STACKSIZE(%rsp) +#define OLD_INCX 64 + STACKSIZE(%rsp) +#define OLD_Y 72 + STACKSIZE(%rsp) +#define OLD_INCY 80 + STACKSIZE(%rsp) +#define OLD_BUFFER 88 + STACKSIZE(%rsp) + +#define M %rcx +#define N %rdx +#define A %rdi +#define LDA %r8 +#define X %r9 +#define INCX %rsi +#define Y %rbp +#define INCY %r10 +#define BUFFER %rbx + +#endif + +#define I %rax +#define J %r11 +#define A1 %r12 +#define A2 %r13 +#define X1 %r14 +#define Y1 %r15 + +#define ALPHA %xmm3 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq OLD_A, A + movq OLD_LDA, LDA + movq OLD_X, X +#endif + + movq OLD_INCX, INCX + movq OLD_Y, Y + movq OLD_INCY, INCY + movq OLD_BUFFER, BUFFER + + leaq (,INCX, SIZE), INCX + leaq (,INCY, SIZE), INCY + leaq (, LDA, SIZE), LDA + +#ifndef WINDOWS_ABI + movapd %xmm0, ALPHA +#endif + + movq Y, Y1 + + testq M, M + jle .L999 + testq N, N + jle .L999 + + cmpq $SIZE, INCX + cmoveq X, BUFFER + je .L10 + + movq BUFFER, X1 + + movq M, I + sarq $3, I + jle .L05 + ALIGN_3 + +.L02: + movsd (X), %xmm0 + addq INCX, X + movsd (X), %xmm1 + addq INCX, X + + movsd (X), %xmm2 + addq INCX, X + movsd (X), %xmm8 + addq INCX, X + + movsd (X), %xmm4 + addq INCX, X + movsd (X), %xmm5 + addq INCX, X + + movsd (X), %xmm6 + addq INCX, X + movsd (X), %xmm7 + addq INCX, X + + movsd %xmm0, 0 * SIZE(X1) + movsd %xmm1, 1 * SIZE(X1) + movsd %xmm2, 2 * SIZE(X1) + movsd %xmm8, 3 * SIZE(X1) + movsd %xmm4, 4 * SIZE(X1) + movsd %xmm5, 5 * SIZE(X1) + movsd %xmm6, 6 * SIZE(X1) + movsd %xmm7, 7 * SIZE(X1) + + addq $8 * SIZE, X1 + decq I + jg .L02 + ALIGN_3 + +.L05: + movq M, I + andq $7, I + jle .L10 + ALIGN_3 + +.L06: + movsd (X), %xmm0 + addq INCX, X + + movsd %xmm0, (X1) + addq $SIZE, X1 + + decq I + jg .L06 + ALIGN_3 + +.L10: + movq N, J + sarq $1, J + jle .L20 + ALIGN_3 + +.L11: + movq A, A1 + leaq (A, LDA, 1), A2 + leaq (A, LDA, 2), A + + movq BUFFER, X1 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + PREFETCHW 1 * SIZE(X1) + + movq M, I + sarq $3, I + jle .L14 + + movsd 0 * SIZE(X1), %xmm4 + movsd 0 * SIZE(A1), %xmm8 + movsd 0 * SIZE(A2), %xmm12 + + movsd 1 * SIZE(X1), %xmm5 + movsd 1 * SIZE(A1), %xmm9 + movsd 1 * SIZE(A2), %xmm13 + + movsd 2 * SIZE(X1), %xmm6 + movsd 2 * SIZE(A1), %xmm10 + movsd 2 * SIZE(A2), %xmm14 + + movsd 3 * SIZE(X1), %xmm7 + mulsd %xmm4, %xmm8 + movsd 3 * SIZE(A1), %xmm11 + mulsd %xmm4, %xmm12 + movsd 4 * SIZE(X1), %xmm4 + + mulsd 
%xmm5, %xmm9 + movsd 3 * SIZE(A2), %xmm15 + mulsd %xmm5, %xmm13 + movsd 5 * SIZE(X1), %xmm5 + + decq I + jle .L13 + ALIGN_3 + +.L12: + PREFETCH PREFETCH_SIZE * SIZE(A1) + addsd %xmm8, %xmm0 + PREFETCH PREFETCH_SIZE * SIZE(A2) + mulsd %xmm6, %xmm10 + movsd 4 * SIZE(A1), %xmm8 + addsd %xmm12, %xmm1 + movsd 4 * SIZE(A2), %xmm12 + mulsd %xmm6, %xmm14 + movsd 6 * SIZE(X1), %xmm6 + + addsd %xmm9, %xmm0 + movsd 5 * SIZE(A1), %xmm9 + mulsd %xmm7, %xmm11 + addsd %xmm13, %xmm1 + movsd 5 * SIZE(A2), %xmm13 + mulsd %xmm7, %xmm15 + movsd 7 * SIZE(X1), %xmm7 + + addsd %xmm10, %xmm0 + movsd 6 * SIZE(A1), %xmm10 + mulsd %xmm4, %xmm8 + addsd %xmm14, %xmm1 + movsd 6 * SIZE(A2), %xmm14 + mulsd %xmm4, %xmm12 + movsd 8 * SIZE(X1), %xmm4 + + addsd %xmm11, %xmm0 + movsd 7 * SIZE(A1), %xmm11 + mulsd %xmm5, %xmm9 + addsd %xmm15, %xmm1 + movsd 7 * SIZE(A2), %xmm15 + mulsd %xmm5, %xmm13 + movsd 9 * SIZE(X1), %xmm5 + + addsd %xmm8, %xmm0 + movsd 8 * SIZE(A1), %xmm8 + mulsd %xmm6, %xmm10 + addq $8 * SIZE, X1 + addsd %xmm12, %xmm1 + movsd 8 * SIZE(A2), %xmm12 + mulsd %xmm6, %xmm14 + movsd 2 * SIZE(X1), %xmm6 + + addsd %xmm9, %xmm0 + movsd 9 * SIZE(A1), %xmm9 + mulsd %xmm7, %xmm11 + addq $8 * SIZE, A2 + addsd %xmm13, %xmm1 + movsd 1 * SIZE(A2), %xmm13 + mulsd %xmm7, %xmm15 + movsd 3 * SIZE(X1), %xmm7 + + addsd %xmm10, %xmm0 + movsd 10 * SIZE(A1), %xmm10 + mulsd %xmm4, %xmm8 + addq $8 * SIZE, A1 + addsd %xmm14, %xmm1 + movsd 2 * SIZE(A2), %xmm14 + mulsd %xmm4, %xmm12 + movsd 4 * SIZE(X1), %xmm4 + + addsd %xmm11, %xmm0 + movsd 3 * SIZE(A1), %xmm11 + mulsd %xmm5, %xmm9 + decq I + addsd %xmm15, %xmm1 + movsd 3 * SIZE(A2), %xmm15 + mulsd %xmm5, %xmm13 + movsd 5 * SIZE(X1), %xmm5 + + jg .L12 + ALIGN_3 + +.L13: + addsd %xmm8, %xmm0 + movsd 4 * SIZE(A1), %xmm8 + mulsd %xmm6, %xmm10 + addsd %xmm12, %xmm1 + movsd 4 * SIZE(A2), %xmm12 + mulsd %xmm6, %xmm14 + movsd 6 * SIZE(X1), %xmm6 + + addsd %xmm9, %xmm0 + movsd 5 * SIZE(A1), %xmm9 + mulsd %xmm7, %xmm11 + addsd %xmm13, %xmm1 + movsd 5 * SIZE(A2), %xmm13 + mulsd %xmm7, %xmm15 + movsd 7 * SIZE(X1), %xmm7 + + addsd %xmm10, %xmm0 + movsd 6 * SIZE(A1), %xmm10 + mulsd %xmm4, %xmm8 + addsd %xmm14, %xmm1 + movsd 6 * SIZE(A2), %xmm14 + mulsd %xmm4, %xmm12 + + addsd %xmm11, %xmm0 + movsd 7 * SIZE(A1), %xmm11 + mulsd %xmm5, %xmm9 + addsd %xmm15, %xmm1 + movsd 7 * SIZE(A2), %xmm15 + mulsd %xmm5, %xmm13 + + addsd %xmm8, %xmm0 + mulsd %xmm6, %xmm10 + addsd %xmm12, %xmm1 + mulsd %xmm6, %xmm14 + + addsd %xmm9, %xmm0 + mulsd %xmm7, %xmm11 + addsd %xmm13, %xmm1 + mulsd %xmm7, %xmm15 + + addsd %xmm10, %xmm0 + addq $8 * SIZE, A1 + addsd %xmm14, %xmm1 + addq $8 * SIZE, A2 + addsd %xmm11, %xmm0 + addq $8 * SIZE, X1 + addsd %xmm15, %xmm1 + ALIGN_4 + +.L14: + testq $4, M + je .L16 + + movsd 0 * SIZE(X1), %xmm4 + movsd 0 * SIZE(A1), %xmm8 + movsd 0 * SIZE(A2), %xmm12 + + movsd 1 * SIZE(X1), %xmm5 + movsd 1 * SIZE(A1), %xmm9 + movsd 1 * SIZE(A2), %xmm13 + + movsd 2 * SIZE(X1), %xmm6 + movsd 2 * SIZE(A1), %xmm10 + movsd 2 * SIZE(A2), %xmm14 + + movsd 3 * SIZE(X1), %xmm7 + movsd 3 * SIZE(A1), %xmm11 + movsd 3 * SIZE(A2), %xmm15 + + mulsd %xmm4, %xmm8 + mulsd %xmm4, %xmm12 + mulsd %xmm5, %xmm9 + mulsd %xmm5, %xmm13 + + addsd %xmm8, %xmm0 + addsd %xmm12, %xmm1 + addsd %xmm9, %xmm0 + addsd %xmm13, %xmm1 + + mulsd %xmm6, %xmm10 + mulsd %xmm6, %xmm14 + mulsd %xmm7, %xmm11 + mulsd %xmm7, %xmm15 + + addsd %xmm10, %xmm0 + addsd %xmm14, %xmm1 + addsd %xmm11, %xmm0 + addsd %xmm15, %xmm1 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, X1 + ALIGN_4 + +.L16: + testq $2, M + je .L17 + + movsd 0 * SIZE(X1), 
%xmm4 + movsd 0 * SIZE(A1), %xmm8 + movsd 0 * SIZE(A2), %xmm12 + + movsd 1 * SIZE(X1), %xmm5 + movsd 1 * SIZE(A1), %xmm9 + movsd 1 * SIZE(A2), %xmm13 + + mulsd %xmm4, %xmm8 + mulsd %xmm4, %xmm12 + mulsd %xmm5, %xmm9 + mulsd %xmm5, %xmm13 + + addsd %xmm8, %xmm0 + addsd %xmm12, %xmm1 + addsd %xmm9, %xmm0 + addsd %xmm13, %xmm1 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + + ALIGN_4 + +.L17: + testq $1, M + je .L19 + + movsd 0 * SIZE(X1), %xmm4 + + movsd 0 * SIZE(A1), %xmm8 + movsd 0 * SIZE(A2), %xmm12 + + mulsd %xmm4, %xmm8 + mulsd %xmm4, %xmm12 + + addsd %xmm8, %xmm0 + addsd %xmm12, %xmm1 + ALIGN_4 + +.L19: + mulsd ALPHA, %xmm0 + addsd (Y), %xmm0 + addq INCY, Y + + mulsd ALPHA, %xmm1 + addsd (Y), %xmm1 + addq INCY, Y + + movsd %xmm0, (Y1) + addq INCY, Y1 + movsd %xmm1, (Y1) + addq INCY, Y1 + + decq J + jg .L11 + ALIGN_3 + +.L20: + testq $1, N + jle .L999 + + movq A, A1 + movq BUFFER, X1 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + + movq M, I + sarq $3, I + jle .L24 + + movsd 0 * SIZE(X1), %xmm4 + movsd 0 * SIZE(A1), %xmm8 + movsd 1 * SIZE(X1), %xmm5 + movsd 1 * SIZE(A1), %xmm9 + + movsd 2 * SIZE(X1), %xmm6 + movsd 2 * SIZE(A1), %xmm10 + movsd 3 * SIZE(X1), %xmm7 + movsd 3 * SIZE(A1), %xmm11 + + mulsd %xmm4, %xmm8 + movsd 4 * SIZE(X1), %xmm4 + mulsd %xmm5, %xmm9 + movsd 5 * SIZE(X1), %xmm5 + mulsd %xmm6, %xmm10 + movsd 6 * SIZE(X1), %xmm6 + mulsd %xmm7, %xmm11 + movsd 7 * SIZE(X1), %xmm7 + + decq I + jle .L23 + ALIGN_3 + +.L22: + PREFETCH PREFETCH_SIZE * SIZE(A1) + addsd %xmm8, %xmm0 + movsd 4 * SIZE(A1), %xmm8 + addsd %xmm9, %xmm0 + movsd 5 * SIZE(A1), %xmm9 + addsd %xmm10, %xmm0 + movsd 6 * SIZE(A1), %xmm10 + addsd %xmm11, %xmm0 + movsd 7 * SIZE(A1), %xmm11 + + mulsd %xmm4, %xmm8 + movsd 8 * SIZE(X1), %xmm4 + mulsd %xmm5, %xmm9 + movsd 9 * SIZE(X1), %xmm5 + mulsd %xmm6, %xmm10 + movsd 10 * SIZE(X1), %xmm6 + mulsd %xmm7, %xmm11 + movsd 11 * SIZE(X1), %xmm7 + + addsd %xmm8, %xmm0 + movsd 8 * SIZE(A1), %xmm8 + addsd %xmm9, %xmm1 + movsd 9 * SIZE(A1), %xmm9 + addsd %xmm10, %xmm1 + movsd 10 * SIZE(A1), %xmm10 + addsd %xmm11, %xmm0 + movsd 11 * SIZE(A1), %xmm11 + + mulsd %xmm4, %xmm8 + movsd 12 * SIZE(X1), %xmm4 + mulsd %xmm5, %xmm9 + movsd 13 * SIZE(X1), %xmm5 + mulsd %xmm6, %xmm10 + movsd 14 * SIZE(X1), %xmm6 + mulsd %xmm7, %xmm11 + movsd 15 * SIZE(X1), %xmm7 + + addq $8 * SIZE, A1 + addq $8 * SIZE, X1 + decq I + jg .L22 + ALIGN_3 + +.L23: + addsd %xmm8, %xmm0 + movsd 4 * SIZE(A1), %xmm8 + addsd %xmm9, %xmm1 + movsd 5 * SIZE(A1), %xmm9 + addsd %xmm10, %xmm0 + movsd 6 * SIZE(A1), %xmm10 + addsd %xmm11, %xmm1 + movsd 7 * SIZE(A1), %xmm11 + + mulsd %xmm4, %xmm8 + mulsd %xmm5, %xmm9 + mulsd %xmm6, %xmm10 + mulsd %xmm7, %xmm11 + + addsd %xmm8, %xmm0 + addsd %xmm9, %xmm1 + addsd %xmm10, %xmm0 + addq $8 * SIZE, A1 + addsd %xmm11, %xmm1 + addq $8 * SIZE, X1 + ALIGN_4 + +.L24: + testq $4, M + je .L26 + + movsd 0 * SIZE(X1), %xmm4 + movsd 0 * SIZE(A1), %xmm8 + movsd 1 * SIZE(X1), %xmm5 + movsd 1 * SIZE(A1), %xmm9 + + movsd 2 * SIZE(X1), %xmm6 + movsd 2 * SIZE(A1), %xmm10 + movsd 3 * SIZE(X1), %xmm7 + movsd 3 * SIZE(A1), %xmm11 + + mulsd %xmm4, %xmm8 + mulsd %xmm5, %xmm9 + mulsd %xmm6, %xmm10 + mulsd %xmm7, %xmm11 + + addsd %xmm8, %xmm0 + addsd %xmm9, %xmm1 + addsd %xmm10, %xmm0 + addq $4 * SIZE, A1 + addsd %xmm11, %xmm1 + addq $4 * SIZE, X1 + ALIGN_4 + +.L26: + testq $2, M + je .L27 + + movsd 0 * SIZE(X1), %xmm4 + movsd 0 * SIZE(A1), %xmm8 + movsd 1 * SIZE(X1), %xmm5 + movsd 1 * SIZE(A1), %xmm9 + + mulsd %xmm4, %xmm8 + mulsd %xmm5, %xmm9 + addsd %xmm8, %xmm0 + addq $2 * SIZE, A1 + 
addsd %xmm9, %xmm1 + addq $2 * SIZE, X1 + ALIGN_4 + +.L27: + testq $1, M + je .L29 + + movsd 0 * SIZE(X1), %xmm4 + movsd 0 * SIZE(A1), %xmm8 + + mulsd %xmm4, %xmm8 + addsd %xmm8, %xmm0 + ALIGN_4 + +.L29: + addsd %xmm1, %xmm0 + + mulsd ALPHA, %xmm0 + + addsd (Y), %xmm0 + movsd %xmm0, (Y1) + ALIGN_3 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + ALIGN_3 + + EPILOGUE diff --git a/kernel/x86_64/dot.S b/kernel/x86_64/dot.S new file mode 100644 index 0000000000..e63d9cd893 --- /dev/null +++ b/kernel/x86_64/dot.S @@ -0,0 +1,184 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
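Note: dgemv_t_atom.S above follows the same y := alpha * A^T * x + y scheme as the kernel before it, but entirely with scalar SSE2 arithmetic. When INCX is not one, x is first packed into the contiguous BUFFER (the .L02/.L06 loops; otherwise BUFFER is simply pointed at X via cmoveq), and the main .L11 loop then reduces two columns of A per pass into two independent accumulators. A C sketch of that strategy -- names are illustrative, and the packing buffer is assumed to hold at least m elements:

static void dgemv_t_atom_sketch(long m, long n, double alpha,
                                const double *a, long lda,
                                const double *x, long incx,
                                double *y, long incy, double *buffer)
{
    const double *xp = x;
    if (incx != 1) {                  /* pack x, as .L02/.L06 do */
        for (long i = 0; i < m; i++)
            buffer[i] = x[i * incx];
        xp = buffer;
    }
    long j = 0;
    for (; j + 1 < n; j += 2) {       /* two columns per pass (.L11) */
        double s0 = 0.0, s1 = 0.0;
        for (long i = 0; i < m; i++) {
            s0 += a[i + (j + 0) * lda] * xp[i];
            s1 += a[i + (j + 1) * lda] * xp[i];
        }
        y[(j + 0) * incy] += alpha * s0;
        y[(j + 1) * incy] += alpha * s1;
    }
    if (j < n) {                      /* leftover single column (.L20) */
        double s0 = 0.0;
        for (long i = 0; i < m; i++)
            s0 += a[i + j * lda] * xp[i];
        y[j * incy] += alpha * s0;
    }
}

Keeping two column accumulators live while the .L12 body interleaves eight multiply/add pairs per iteration presumably helps hide the scalar mulsd/addsd latency on the target core.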
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ +#define Y ARG4 /* rcx */ +#define INCY ARG5 /* r8 */ + +#include "l1param.h" + + PROLOGUE + PROFCODE + + salq $BASE_SHIFT, INCX + salq $BASE_SHIFT, INCY + + fldz + fldz + fldz + fldz + + cmpq $SIZE, INCX + jne .L14 + cmpq $SIZE, INCY + jne .L14 + + movq N, %rax + sarq $2, %rax + jle .L15 + ALIGN_3 + +.L16: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) + FLD 0 * SIZE(Y) + fmulp %st, %st(1) + faddp %st,%st(1) + FLD 1 * SIZE(X) + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st,%st(2) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + FLD 2 * SIZE(X) + FLD 2 * SIZE(Y) + fmulp %st, %st(1) + faddp %st,%st(3) + FLD 3 * SIZE(X) + FLD 3 * SIZE(Y) + fmulp %st, %st(1) + faddp %st,%st(4) + addq $4 * SIZE, X + addq $4 * SIZE, Y + decq %rax + jg .L16 + ALIGN_3 + +.L15: + movq N, %rax + andq $3, %rax + jle .L27 + ALIGN_3 + +.L22: + FLD (X) + addq $SIZE, X + FLD (Y) + fmulp %st, %st(1) + addq $SIZE, Y + faddp %st,%st(1) + decq %rax + jg .L22 + + jmp .L27 + ALIGN_3 + +.L14: + movq N, %rax + sarq $2, %rax + jle .L30 + ALIGN_3 + +.L31: + FLD (X) + addq INCX, X + FLD (Y) + fmulp %st, %st(1) + addq INCY, Y + faddp %st,%st(1) + + FLD (X) + addq INCX, X + FLD (Y) + fmulp %st, %st(1) + addq INCY, Y + faddp %st,%st(2) + + FLD (X) + addq INCX, X + FLD (Y) + fmulp %st, %st(1) + addq INCY, Y + faddp %st,%st(3) + + FLD (X) + addq INCX, X + FLD (Y) + fmulp %st, %st(1) + addq INCY, Y + faddp %st,%st(4) + + decq %rax + jg .L31 + ALIGN_3 + +.L30: + movq N, %rax + andq $3, %rax + jle .L27 + ALIGN_3 + +.L37: + FLD (X) + addq INCX, X + FLD (Y) + fmulp %st, %st(1) + addq INCY, Y + faddp %st, %st(1) + decq %rax + jg .L37 + ALIGN_3 + +.L27: + faddp %st,%st(2) + faddp %st,%st(2) + faddp %st,%st(1) + ret + + EPILOGUE diff --git a/kernel/x86_64/dot_atom.S b/kernel/x86_64/dot_atom.S new file mode 100644 index 0000000000..bc67b28d33 --- /dev/null +++ b/kernel/x86_64/dot_atom.S @@ -0,0 +1,299 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ +#define Y ARG4 /* rcx */ +#ifndef WINDOWS_ABI +#define INCY ARG5 /* r8 */ +#else +#define INCY %r10 +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + movq 40(%rsp), INCY +#endif + + SAVEREGISTERS + + leaq (, INCX, SIZE), INCX + pxor %xmm0, %xmm0 + leaq (, INCY, SIZE), INCY + pxor %xmm1, %xmm1 + + pxor %xmm2, %xmm2 + cmpq $0, N + pxor %xmm3, %xmm3 + jle .L999 + + cmpq $SIZE, INCX + jne .L50 + cmpq $SIZE, INCY + jne .L50 + + movq N, %rax + sarq $3, %rax + jle .L14 + + movsd 0 * SIZE(X), %xmm4 + movsd 0 * SIZE(Y), %xmm8 + movsd 1 * SIZE(X), %xmm5 + movsd 1 * SIZE(Y), %xmm9 + + movsd 2 * SIZE(X), %xmm6 + mulsd %xmm8, %xmm4 + movsd 2 * SIZE(Y), %xmm10 + mulsd %xmm9, %xmm5 + movsd 3 * SIZE(X), %xmm7 + mulsd %xmm10, %xmm6 + movsd 3 * SIZE(Y), %xmm11 + mulsd %xmm11, %xmm7 + + decq %rax + jle .L12 + + ALIGN_3 + +.L11: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + addsd %xmm4, %xmm0 + movsd 4 * SIZE(X), %xmm4 + addsd %xmm5, %xmm1 + movsd 4 * SIZE(Y), %xmm8 + addsd %xmm6, %xmm2 + movsd 5 * SIZE(X), %xmm5 + addsd %xmm7, %xmm3 + movsd 5 * SIZE(Y), %xmm9 + + movsd 6 * SIZE(X), %xmm6 + mulsd %xmm8, %xmm4 + movsd 6 * SIZE(Y), %xmm10 + mulsd %xmm9, %xmm5 + movsd 7 * SIZE(X), %xmm7 + mulsd %xmm10, %xmm6 + movsd 7 * SIZE(Y), %xmm11 + mulsd %xmm11, %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + addsd %xmm4, %xmm0 + movsd 8 * SIZE(X), %xmm4 + addsd %xmm5, %xmm1 + movsd 8 * SIZE(Y), %xmm8 + addsd %xmm6, %xmm2 + movsd 9 * SIZE(X), %xmm5 + addsd %xmm7, %xmm3 + movsd 9 * SIZE(Y), %xmm9 + + movsd 10 * SIZE(X), %xmm6 + mulsd %xmm8, %xmm4 + movsd 10 * SIZE(Y), %xmm10 + mulsd %xmm9, %xmm5 + movsd 11 * SIZE(X), %xmm7 + mulsd %xmm10, %xmm6 + movsd 11 * SIZE(Y), %xmm11 + mulsd %xmm11, %xmm7 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + + decq %rax + jg .L11 + ALIGN_3 + +.L12: + addsd %xmm4, %xmm0 + movsd 4 * SIZE(X), %xmm4 + addsd %xmm5, %xmm1 + movsd 4 * SIZE(Y), %xmm8 + addsd %xmm6, %xmm2 + movsd 5 * SIZE(X), %xmm5 + addsd %xmm7, %xmm3 + movsd 5 * SIZE(Y), %xmm9 + + movsd 6 * SIZE(X), %xmm6 + mulsd %xmm8, %xmm4 + movsd 6 * SIZE(Y), %xmm10 + mulsd %xmm9, %xmm5 + movsd 7 * SIZE(X), %xmm7 + mulsd %xmm10, %xmm6 + movsd 7 * SIZE(Y), %xmm11 + mulsd %xmm11, %xmm7 + + addsd %xmm4, %xmm0 + addsd %xmm5, %xmm1 + addsd %xmm6, %xmm2 + addsd %xmm7, %xmm3 + + addq $ 8 * SIZE, X + addq $ 8 * SIZE, Y + ALIGN_3 + +.L14: + testq $7, N + jle .L999 + + testq $4, N + jle .L16 + + movsd 0 * SIZE(X), %xmm4 + movsd 0 * 
SIZE(Y), %xmm8 + movsd 1 * SIZE(X), %xmm5 + movsd 1 * SIZE(Y), %xmm9 + + movsd 2 * SIZE(X), %xmm6 + mulsd %xmm8, %xmm4 + movsd 2 * SIZE(Y), %xmm10 + mulsd %xmm9, %xmm5 + movsd 3 * SIZE(X), %xmm7 + mulsd %xmm10, %xmm6 + movsd 3 * SIZE(Y), %xmm11 + mulsd %xmm11, %xmm7 + + addsd %xmm4, %xmm0 + addsd %xmm5, %xmm1 + addsd %xmm6, %xmm2 + addsd %xmm7, %xmm3 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L16: + testq $2, N + jle .L17 + + movsd 0 * SIZE(X), %xmm4 + movsd 0 * SIZE(Y), %xmm8 + movsd 1 * SIZE(X), %xmm5 + movsd 1 * SIZE(Y), %xmm9 + + mulsd %xmm8, %xmm4 + mulsd %xmm9, %xmm5 + addsd %xmm4, %xmm0 + addsd %xmm5, %xmm1 + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L17: + testq $1, N + jle .L999 + + movsd 0 * SIZE(X), %xmm4 + movsd 0 * SIZE(Y), %xmm8 + + mulsd %xmm8, %xmm4 + addsd %xmm4, %xmm0 + jmp .L999 + ALIGN_3 + +.L50: + movq N, %rax + sarq $2, %rax + jle .L55 + ALIGN_3 + +.L53: + movsd 0 * SIZE(X), %xmm4 + addq INCX, X + mulsd 0 * SIZE(Y), %xmm4 + addq INCY, Y + movsd 0 * SIZE(X), %xmm5 + addq INCX, X + mulsd 0 * SIZE(Y), %xmm5 + addq INCY, Y + movsd 0 * SIZE(X), %xmm6 + addq INCX, X + mulsd 0 * SIZE(Y), %xmm6 + addq INCY, Y + movsd 0 * SIZE(X), %xmm7 + addq INCX, X + mulsd 0 * SIZE(Y), %xmm7 + addq INCY, Y + + addsd %xmm4, %xmm0 + addsd %xmm5, %xmm1 + addsd %xmm6, %xmm2 + addsd %xmm7, %xmm3 + + decq %rax + jg .L53 + ALIGN_3 + +.L55: + movq N, %rax + andq $3, %rax + jle .L999 + ALIGN_3 + +.L56: + movsd 0 * SIZE(X), %xmm4 + addq INCX, X + mulsd 0 * SIZE(Y), %xmm4 + addq INCY, Y + addsd %xmm4, %xmm0 + decq %rax + jg .L56 + ALIGN_3 + +.L999: + addsd %xmm1, %xmm0 + addsd %xmm3, %xmm2 + addsd %xmm2, %xmm0 + + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/dot_sse.S b/kernel/x86_64/dot_sse.S new file mode 100644 index 0000000000..cc866a9c54 --- /dev/null +++ b/kernel/x86_64/dot_sse.S @@ -0,0 +1,1293 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
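Note: the two files above, kernel/x86_64/dot.S (x87) and kernel/x86_64/dot_atom.S (scalar SSE2), both compute the plain dot product sum_i x[i] * y[i]. Each keeps four independent partial sums (four x87 stack slots in dot.S, xmm0-xmm3 in dot_atom.S) that are combined only after the loops, and each has a fast path for INCX == INCY == 1 (unrolled by four in dot.S, by eight in dot_atom.S) plus a generic strided path. A minimal C reference of that accumulation pattern -- illustrative only, the function name is not part of the patch:

/* Four independent accumulators keep the add dependency chains short;
 * they are summed pairwise at the end, as the kernels do. */
static double ddot_ref(long n, const double *x, long incx,
                       const double *y, long incy)
{
    double s0 = 0.0, s1 = 0.0, s2 = 0.0, s3 = 0.0;
    long i = 0;
    if (incx == 1 && incy == 1) {
        for (; i + 3 < n; i += 4) {
            s0 += x[i + 0] * y[i + 0];
            s1 += x[i + 1] * y[i + 1];
            s2 += x[i + 2] * y[i + 2];
            s3 += x[i + 3] * y[i + 3];
        }
        for (; i < n; i++)
            s0 += x[i] * y[i];
    } else {
        for (; i < n; i++)
            s0 += x[i * incx] * y[i * incy];
    }
    return (s0 + s1) + (s2 + s3);
}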
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ +#define Y ARG4 /* rcx */ +#ifndef WINDOWS_ABI +#define INCY ARG5 /* r8 */ +#else +#define INCY %r10 +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + movq 40(%rsp), INCY +#endif + + SAVEREGISTERS + + leaq (, INCX, SIZE), INCX + leaq (, INCY, SIZE), INCY + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + + cmpq $0, N + jle .L999 + + cmpq $SIZE, INCX + jne .L50 + cmpq $SIZE, INCY + jne .L50 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + + cmpq $3, N + jle .L17 + + testq $SIZE, Y + je .L05 + + movss -32 * SIZE(X), %xmm0 + mulss -32 * SIZE(Y), %xmm0 + addq $1 * SIZE, X + addq $1 * SIZE, Y + decq N + ALIGN_2 + +.L05: + testq $2 * SIZE, Y + je .L10 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X), %xmm4 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(Y), %xmm1 + mulps %xmm4, %xmm1 + addq $2 * SIZE, X + addq $2 * SIZE, Y + subq $2, N + jle .L999 + ALIGN_2 + +.L10: +#ifdef ALIGNED_ACCESS + testq $2 * SIZE, X + jne .L30 + + testq $SIZE, X + jne .L20 +#else + testq $3 * SIZE, X + jne .L20 +#endif + + movq N, %rax + sarq $5, %rax + jle .L14 + + movaps -32 * SIZE(X), %xmm4 + movaps -28 * SIZE(X), %xmm5 + movaps -24 * SIZE(X), %xmm6 + movaps -20 * SIZE(X), %xmm7 + + movaps -16 * SIZE(X), %xmm8 + movaps -12 * SIZE(X), %xmm9 + movaps -8 * SIZE(X), %xmm10 + movaps -4 * SIZE(X), %xmm11 + + decq %rax + jle .L12 + + ALIGN_3 + +.L11: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps 0 * SIZE(X), %xmm4 + + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + movaps 4 * SIZE(X), %xmm5 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + movaps 8 * SIZE(X), %xmm6 + + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + movaps 12 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + mulps -16 * SIZE(Y), %xmm8 + addps %xmm8, %xmm0 + movaps 16 * SIZE(X), %xmm8 + + mulps -12 * SIZE(Y), %xmm9 + addps %xmm9, %xmm1 + movaps 20 * SIZE(X), %xmm9 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + mulps -8 * SIZE(Y), %xmm10 + addps %xmm10, %xmm2 + movaps 24 * SIZE(X), %xmm10 + + mulps -4 * SIZE(Y), %xmm11 + addps %xmm11, %xmm3 + movaps 28 * SIZE(X), %xmm11 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + + decq %rax + jg .L11 + ALIGN_3 + +.L12: + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + + mulps -16 * SIZE(Y), %xmm8 + addps %xmm8, %xmm0 + mulps -12 * SIZE(Y), %xmm9 + addps %xmm9, %xmm1 + mulps -8 * SIZE(Y), %xmm10 + addps %xmm10, %xmm2 + mulps -4 * SIZE(Y), %xmm11 + addps %xmm11, %xmm3 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L14: + testq $31, N + jle .L999 + + testq $16, N + jle .L15 + + movaps -32 * SIZE(X), %xmm4 + movaps -28 * 
SIZE(X), %xmm5 + movaps -24 * SIZE(X), %xmm6 + movaps -20 * SIZE(X), %xmm7 + + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L15: + testq $8, N + jle .L16 + + movaps -32 * SIZE(X), %xmm4 + movaps -28 * SIZE(X), %xmm5 + + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L16: + testq $4, N + jle .L17 + + movaps -32 * SIZE(X), %xmm4 + mulps -32 * SIZE(Y), %xmm4 + + addps %xmm4, %xmm2 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L17: + testq $2, N + jle .L18 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X), %xmm4 +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd -32 * SIZE(Y), %xmm8 + + mulps %xmm8, %xmm4 + addps %xmm4, %xmm3 + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L18: + testq $1, N + jle .L999 + + movss -32 * SIZE(X), %xmm4 + mulss -32 * SIZE(Y), %xmm4 + addss %xmm4, %xmm0 + jmp .L999 + ALIGN_3 + +.L20: +#ifdef ALIGNED_ACCESS + + movaps -33 * SIZE(X), %xmm4 + addq $3 * SIZE, X + + movq N, %rax + sarq $5, %rax + jle .L24 + + movaps -32 * SIZE(X), %xmm5 + movaps -28 * SIZE(X), %xmm6 + movaps -24 * SIZE(X), %xmm7 + + movaps -20 * SIZE(X), %xmm8 + movaps -16 * SIZE(X), %xmm9 + movaps -12 * SIZE(X), %xmm10 + movaps -8 * SIZE(X), %xmm11 + + decq %rax + jle .L22 + + ALIGN_3 + +.L21: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm5, %xmm4 + pshufd $0x39, %xmm4, %xmm4 + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps -4 * SIZE(X), %xmm4 + + movss %xmm6, %xmm5 + pshufd $0x39, %xmm5, %xmm5 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + movaps 0 * SIZE(X), %xmm5 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm7, %xmm6 + pshufd $0x39, %xmm6, %xmm6 + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + movaps 4 * SIZE(X), %xmm6 + + movss %xmm8, %xmm7 + pshufd $0x39, %xmm7, %xmm7 + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + movaps 8 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm9, %xmm8 + pshufd $0x39, %xmm8, %xmm8 + mulps -16 * SIZE(Y), %xmm8 + addps %xmm8, %xmm0 + movaps 12 * SIZE(X), %xmm8 + + movss %xmm10, %xmm9 + pshufd $0x39, %xmm9, %xmm9 + mulps -12 * SIZE(Y), %xmm9 + addps %xmm9, %xmm1 + movaps 16 * SIZE(X), %xmm9 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm11, %xmm10 + pshufd $0x39, %xmm10, %xmm10 + mulps -8 * SIZE(Y), %xmm10 + addps %xmm10, %xmm2 + movaps 20 * SIZE(X), %xmm10 + + movss %xmm4, %xmm11 + pshufd $0x39, %xmm11, %xmm11 + mulps -4 * SIZE(Y), %xmm11 + addps %xmm11, %xmm3 + movaps 24 * SIZE(X), %xmm11 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + + decq %rax + jg .L21 + ALIGN_3 + +.L22: + movss %xmm5, %xmm4 + pshufd $0x39, %xmm4, %xmm4 + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps -4 * SIZE(X), %xmm4 + + movss %xmm6, %xmm5 + pshufd $0x39, %xmm5, %xmm5 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + + movss %xmm7, %xmm6 + pshufd $0x39, %xmm6, %xmm6 + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + + movss %xmm8, %xmm7 + pshufd $0x39, %xmm7, %xmm7 + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + + movss %xmm9, %xmm8 + pshufd $0x39, %xmm8, %xmm8 + mulps -16 * SIZE(Y), 
%xmm8 + addps %xmm8, %xmm0 + + movss %xmm10, %xmm9 + pshufd $0x39, %xmm9, %xmm9 + mulps -12 * SIZE(Y), %xmm9 + addps %xmm9, %xmm1 + + movss %xmm11, %xmm10 + pshufd $0x39, %xmm10, %xmm10 + mulps -8 * SIZE(Y), %xmm10 + addps %xmm10, %xmm2 + + movss %xmm4, %xmm11 + pshufd $0x39, %xmm11, %xmm11 + mulps -4 * SIZE(Y), %xmm11 + addps %xmm11, %xmm3 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L24: + testq $31, N + jle .L999 + + testq $16, N + jle .L25 + + movaps -32 * SIZE(X), %xmm5 + movaps -28 * SIZE(X), %xmm6 + movaps -24 * SIZE(X), %xmm7 + + movss %xmm5, %xmm4 + pshufd $0x39, %xmm4, %xmm4 + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(X), %xmm4 + + movss %xmm6, %xmm5 + pshufd $0x39, %xmm5, %xmm5 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + + movss %xmm7, %xmm6 + pshufd $0x39, %xmm6, %xmm6 + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + + movss %xmm4, %xmm7 + pshufd $0x39, %xmm7, %xmm7 + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L25: + testq $8, N + jle .L26 + + movaps -32 * SIZE(X), %xmm5 + movaps -28 * SIZE(X), %xmm6 + + movss %xmm5, %xmm4 + pshufd $0x39, %xmm4, %xmm4 + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + + movss %xmm6, %xmm5 + pshufd $0x39, %xmm5, %xmm5 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + + movaps %xmm6, %xmm4 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L26: + testq $4, N + jle .L27 + + movaps -32 * SIZE(X), %xmm5 + movss %xmm5, %xmm4 + pshufd $0x39, %xmm4, %xmm4 + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm2 + movaps %xmm5, %xmm4 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L27: + testq $2, N + jle .L28 + +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd -32 * SIZE(Y), %xmm8 + + pshufd $0x39, %xmm4, %xmm5 + + mulps %xmm8, %xmm5 + addps %xmm5, %xmm3 + movhlps %xmm4, %xmm4 + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L28: + testq $1, N + jle .L999 + + pshufd $0x39, %xmm4, %xmm4 + mulss -32 * SIZE(Y), %xmm4 + addss %xmm4, %xmm0 + jmp .L999 + ALIGN_3 + +.L30: + testq $SIZE, X + jne .L40 + + movhps -32 * SIZE(X), %xmm4 + addq $2 * SIZE, X + + movq N, %rax + sarq $5, %rax + jle .L34 + + movaps -32 * SIZE(X), %xmm5 + movaps -28 * SIZE(X), %xmm6 + movaps -24 * SIZE(X), %xmm7 + movaps -20 * SIZE(X), %xmm8 + + movaps -16 * SIZE(X), %xmm9 + movaps -12 * SIZE(X), %xmm10 + movaps -8 * SIZE(X), %xmm11 + + decq %rax + jle .L32 + + ALIGN_3 + +.L31: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + SHUFPD_1 %xmm5, %xmm4 + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps -4 * SIZE(X), %xmm4 + + SHUFPD_1 %xmm6, %xmm5 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + movaps 0 * SIZE(X), %xmm5 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + SHUFPD_1 %xmm7, %xmm6 + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + movaps 4 * SIZE(X), %xmm6 + + SHUFPD_1 %xmm8, %xmm7 + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + movaps 8 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + SHUFPD_1 %xmm9, %xmm8 + mulps -16 * SIZE(Y), %xmm8 + addps %xmm8, %xmm0 + movaps 12 * SIZE(X), %xmm8 + + SHUFPD_1 %xmm10, %xmm9 + mulps -12 * SIZE(Y), %xmm9 + addps %xmm9, %xmm1 + movaps 16 * SIZE(X), %xmm9 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + SHUFPD_1 %xmm11, %xmm10 + mulps -8 * SIZE(Y), %xmm10 + addps %xmm10, %xmm2 + movaps 20 * SIZE(X), %xmm10 + + SHUFPD_1 %xmm4, 
%xmm11 + mulps -4 * SIZE(Y), %xmm11 + addps %xmm11, %xmm3 + movaps 24 * SIZE(X), %xmm11 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + + decq %rax + jg .L31 + ALIGN_3 + +.L32: + SHUFPD_1 %xmm5, %xmm4 + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps -4 * SIZE(X), %xmm4 + + SHUFPD_1 %xmm6, %xmm5 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + + SHUFPD_1 %xmm7, %xmm6 + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + + SHUFPD_1 %xmm8, %xmm7 + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + + SHUFPD_1 %xmm9, %xmm8 + mulps -16 * SIZE(Y), %xmm8 + addps %xmm8, %xmm0 + + SHUFPD_1 %xmm10, %xmm9 + mulps -12 * SIZE(Y), %xmm9 + addps %xmm9, %xmm1 + + SHUFPD_1 %xmm11, %xmm10 + mulps -8 * SIZE(Y), %xmm10 + addps %xmm10, %xmm2 + + SHUFPD_1 %xmm4, %xmm11 + mulps -4 * SIZE(Y), %xmm11 + addps %xmm11, %xmm3 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L34: + testq $31, N + jle .L999 + + testq $16, N + jle .L35 + + movaps -32 * SIZE(X), %xmm5 + movaps -28 * SIZE(X), %xmm6 + movaps -24 * SIZE(X), %xmm7 + + SHUFPD_1 %xmm5, %xmm4 + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(X), %xmm4 + + SHUFPD_1 %xmm6, %xmm5 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + + SHUFPD_1 %xmm7, %xmm6 + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + + SHUFPD_1 %xmm4, %xmm7 + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L35: + testq $8, N + jle .L36 + + movaps -32 * SIZE(X), %xmm5 + movaps -28 * SIZE(X), %xmm6 + + SHUFPD_1 %xmm5, %xmm4 + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + SHUFPD_1 %xmm6, %xmm5 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + movapd %xmm6, %xmm4 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L36: + testq $4, N + jle .L37 + + movaps -32 * SIZE(X), %xmm5 + + SHUFPD_1 %xmm5, %xmm4 + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps %xmm5, %xmm4 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L37: + testq $2, N + jle .L38 + + xorps %xmm5, %xmm5 + movhlps %xmm4, %xmm5 + + mulps -32 * SIZE(Y), %xmm5 + addps %xmm5, %xmm0 + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L38: + testq $1, N + jle .L999 + + movss -34 * SIZE(X), %xmm4 + mulss -32 * SIZE(Y), %xmm4 + addss %xmm4, %xmm0 + jmp .L999 + ALIGN_3 + +.L40: + movaps -35 * SIZE(X), %xmm4 + addq $SIZE, X + + movq N, %rax + sarq $5, %rax + jle .L44 + + movaps -32 * SIZE(X), %xmm5 + movaps -28 * SIZE(X), %xmm6 + movaps -24 * SIZE(X), %xmm7 + + movaps -20 * SIZE(X), %xmm8 + movaps -16 * SIZE(X), %xmm9 + movaps -12 * SIZE(X), %xmm10 + movaps -8 * SIZE(X), %xmm11 + + decq %rax + jle .L42 + + ALIGN_3 + +.L41: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps -4 * SIZE(X), %xmm4 + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + movaps 0 * SIZE(X), %xmm5 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + movaps 4 * SIZE(X), %xmm6 + + movss %xmm8, %xmm7 + shufps $0x93, %xmm8, %xmm7 + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + movaps 8 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm9, %xmm8 + shufps $0x93, %xmm9, %xmm8 + mulps -16 * SIZE(Y), %xmm8 + addps %xmm8, %xmm0 + movaps 12 * SIZE(X), %xmm8 + + movss %xmm10, %xmm9 
+ shufps $0x93, %xmm10, %xmm9 + mulps -12 * SIZE(Y), %xmm9 + addps %xmm9, %xmm1 + movaps 16 * SIZE(X), %xmm9 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm11, %xmm10 + shufps $0x93, %xmm11, %xmm10 + mulps -8 * SIZE(Y), %xmm10 + addps %xmm10, %xmm2 + movaps 20 * SIZE(X), %xmm10 + + movss %xmm4, %xmm11 + shufps $0x93, %xmm4, %xmm11 + mulps -4 * SIZE(Y), %xmm11 + addps %xmm11, %xmm3 + movaps 24 * SIZE(X), %xmm11 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + + decq %rax + jg .L41 + ALIGN_3 + +.L42: + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps -4 * SIZE(X), %xmm4 + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + + movss %xmm8, %xmm7 + shufps $0x93, %xmm8, %xmm7 + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + + movss %xmm9, %xmm8 + shufps $0x93, %xmm9, %xmm8 + mulps -16 * SIZE(Y), %xmm8 + addps %xmm8, %xmm0 + + movss %xmm10, %xmm9 + shufps $0x93, %xmm10, %xmm9 + mulps -12 * SIZE(Y), %xmm9 + addps %xmm9, %xmm1 + + movss %xmm11, %xmm10 + shufps $0x93, %xmm11, %xmm10 + mulps -8 * SIZE(Y), %xmm10 + addps %xmm10, %xmm2 + + movss %xmm4, %xmm11 + shufps $0x93, %xmm4, %xmm11 + mulps -4 * SIZE(Y), %xmm11 + addps %xmm11, %xmm3 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L44: + testq $31, N + jle .L999 + + testq $16, N + jle .L45 + + movaps -32 * SIZE(X), %xmm5 + movaps -28 * SIZE(X), %xmm6 + movaps -24 * SIZE(X), %xmm7 + + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(X), %xmm4 + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + + movss %xmm4, %xmm7 + shufps $0x93, %xmm4, %xmm7 + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L45: + testq $8, N + jle .L46 + + movaps -32 * SIZE(X), %xmm5 + movaps -28 * SIZE(X), %xmm6 + + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + + movaps %xmm6, %xmm4 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L46: + testq $4, N + jle .L47 + + movaps -32 * SIZE(X), %xmm5 + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm2 + movaps %xmm5, %xmm4 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L47: + testq $2, N + jle .L48 + + movaps -32 * SIZE(X), %xmm5 +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd -32 * SIZE(Y), %xmm8 + + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + + mulps %xmm8, %xmm4 + addps %xmm4, %xmm3 + movlhps %xmm5, %xmm4 + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L48: + testq $1, N + jle .L999 + + pshufd $0x93, %xmm4, %xmm4 + mulss -32 * SIZE(Y), %xmm4 + addss %xmm4, %xmm0 + jmp .L999 + ALIGN_4 + +#else + movq N, %rax + sarq $5, %rax + jle .L24 + + movlps -32 * SIZE(X), %xmm4 + movhps -30 * SIZE(X), %xmm4 + movlps -28 * SIZE(X), %xmm5 + movhps -26 * SIZE(X), %xmm5 + movlps -24 * SIZE(X), %xmm6 + movhps -22 * SIZE(X), %xmm6 + movlps -20 * SIZE(X), %xmm7 + movhps -18 * SIZE(X), %xmm7 + + movlps -16 * SIZE(X), %xmm8 + movhps -14 * SIZE(X), %xmm8 + movlps 
-12 * SIZE(X), %xmm9 + movhps -10 * SIZE(X), %xmm9 + movlps -8 * SIZE(X), %xmm10 + movhps -6 * SIZE(X), %xmm10 + movlps -4 * SIZE(X), %xmm11 + movhps -2 * SIZE(X), %xmm11 + + decq %rax + jle .L22 + + ALIGN_3 + +.L21: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + movlps 0 * SIZE(X), %xmm4 + movhps 2 * SIZE(X), %xmm4 + + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + movlps 4 * SIZE(X), %xmm5 + movhps 6 * SIZE(X), %xmm5 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + movlps 8 * SIZE(X), %xmm6 + movhps 10 * SIZE(X), %xmm6 + + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + movlps 12 * SIZE(X), %xmm7 + movhps 14 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + mulps -16 * SIZE(Y), %xmm8 + addps %xmm8, %xmm0 + movlps 16 * SIZE(X), %xmm8 + movhps 18 * SIZE(X), %xmm8 + + mulps -12 * SIZE(Y), %xmm9 + addps %xmm9, %xmm1 + movlps 20 * SIZE(X), %xmm9 + movhps 22 * SIZE(X), %xmm9 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + mulps -8 * SIZE(Y), %xmm10 + addps %xmm10, %xmm2 + movlps 24 * SIZE(X), %xmm10 + movhps 26 * SIZE(X), %xmm10 + + mulps -4 * SIZE(Y), %xmm11 + addps %xmm11, %xmm3 + movlps 28 * SIZE(X), %xmm11 + movhps 30 * SIZE(X), %xmm11 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + + decq %rax + jg .L21 + ALIGN_3 + +.L22: + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + + mulps -16 * SIZE(Y), %xmm8 + addps %xmm8, %xmm0 + mulps -12 * SIZE(Y), %xmm9 + addps %xmm9, %xmm1 + mulps -8 * SIZE(Y), %xmm10 + addps %xmm10, %xmm2 + mulps -4 * SIZE(Y), %xmm11 + addps %xmm11, %xmm3 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L24: + testq $31, N + jle .L999 + + testq $16, N + jle .L25 + + movlps -32 * SIZE(X), %xmm4 + movhps -30 * SIZE(X), %xmm4 + movlps -28 * SIZE(X), %xmm5 + movhps -26 * SIZE(X), %xmm5 + movlps -24 * SIZE(X), %xmm6 + movhps -22 * SIZE(X), %xmm6 + movlps -20 * SIZE(X), %xmm7 + movhps -18 * SIZE(X), %xmm7 + + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + mulps -24 * SIZE(Y), %xmm6 + addps %xmm6, %xmm2 + mulps -20 * SIZE(Y), %xmm7 + addps %xmm7, %xmm3 + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L25: + testq $8, N + jle .L26 + + movlps -32 * SIZE(X), %xmm4 + movhps -30 * SIZE(X), %xmm4 + movlps -28 * SIZE(X), %xmm5 + movhps -26 * SIZE(X), %xmm5 + + mulps -32 * SIZE(Y), %xmm4 + addps %xmm4, %xmm0 + mulps -28 * SIZE(Y), %xmm5 + addps %xmm5, %xmm1 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L26: + testq $4, N + jle .L27 + + movlps -32 * SIZE(X), %xmm4 + movhps -30 * SIZE(X), %xmm4 + mulps -32 * SIZE(Y), %xmm4 + + addps %xmm4, %xmm2 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L27: + testq $2, N + jle .L28 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X), %xmm4 +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd -32 * SIZE(Y), %xmm8 + + mulps %xmm8, %xmm4 + addps %xmm4, %xmm3 + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L28: + testq $1, N + jle .L999 + + movss -32 * SIZE(X), %xmm4 + mulss -32 * SIZE(Y), %xmm4 + addss %xmm4, %xmm0 + jmp .L999 + ALIGN_3 +#endif + + +.L50: + movq N, %rax + sarq $2, %rax + jle .L55 + ALIGN_3 + 
+.L53: + movss 0 * SIZE(X), %xmm4 + addq INCX, X + mulss 0 * SIZE(Y), %xmm4 + addq INCY, Y + movss 0 * SIZE(X), %xmm5 + addq INCX, X + mulss 0 * SIZE(Y), %xmm5 + addq INCY, Y + movss 0 * SIZE(X), %xmm6 + addq INCX, X + mulss 0 * SIZE(Y), %xmm6 + addq INCY, Y + movss 0 * SIZE(X), %xmm7 + addq INCX, X + mulss 0 * SIZE(Y), %xmm7 + addq INCY, Y + + addss %xmm4, %xmm0 + addss %xmm5, %xmm1 + addss %xmm6, %xmm2 + addss %xmm7, %xmm3 + + decq %rax + jg .L53 + ALIGN_3 + +.L55: + movq N, %rax + andq $3, %rax + jle .L999 + ALIGN_3 + +.L56: + movss 0 * SIZE(X), %xmm4 + addq INCX, X + mulss 0 * SIZE(Y), %xmm4 + addq INCY, Y + addss %xmm4, %xmm0 + decq %rax + jg .L56 + ALIGN_3 + +.L999: + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + addps %xmm2, %xmm0 + +#ifndef HAVE_SSE3 + movhlps %xmm0, %xmm1 + addps %xmm1, %xmm0 + + movaps %xmm0, %xmm1 + shufps $1, %xmm0, %xmm0 + addss %xmm1, %xmm0 +#else + haddps %xmm0, %xmm0 + haddps %xmm0, %xmm0 +#endif + + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/dot_sse2.S b/kernel/x86_64/dot_sse2.S new file mode 100644 index 0000000000..875bf4e8b1 --- /dev/null +++ b/kernel/x86_64/dot_sse2.S @@ -0,0 +1,714 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ +#define Y ARG4 /* rcx */ +#ifndef WINDOWS_ABI +#define INCY ARG5 /* r8 */ +#else +#define INCY %r10 +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + movq 40(%rsp), INCY +#endif + + SAVEREGISTERS + + leaq (, INCX, SIZE), INCX + leaq (, INCY, SIZE), INCY + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + + cmpq $0, N + jle .L999 + + cmpq $SIZE, INCX + jne .L50 + cmpq $SIZE, INCY + jne .L50 + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + + testq $SIZE, Y + je .L10 + + movsd -16 * SIZE(X), %xmm0 + mulsd -16 * SIZE(Y), %xmm0 + addq $1 * SIZE, X + addq $1 * SIZE, Y + decq N + ALIGN_2 + +.L10: + testq $SIZE, X + jne .L20 + + movq N, %rax + sarq $4, %rax + jle .L14 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + movaps -12 * SIZE(X), %xmm6 + movaps -10 * SIZE(X), %xmm7 + + movaps -8 * SIZE(X), %xmm8 + movaps -6 * SIZE(X), %xmm9 + movaps -4 * SIZE(X), %xmm10 + movaps -2 * SIZE(X), %xmm11 + + decq %rax + jle .L12 + + ALIGN_3 + +.L11: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + movaps 0 * SIZE(X), %xmm4 + + mulpd -14 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + movaps 2 * SIZE(X), %xmm5 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + mulpd -12 * SIZE(Y), %xmm6 + addpd %xmm6, %xmm2 + movaps 4 * SIZE(X), %xmm6 + + mulpd -10 * SIZE(Y), %xmm7 + addpd %xmm7, %xmm3 + movaps 6 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + mulpd -8 * SIZE(Y), %xmm8 + addpd %xmm8, %xmm0 + movaps 8 * SIZE(X), %xmm8 + + mulpd -6 * SIZE(Y), %xmm9 + addpd %xmm9, %xmm1 + movaps 10 * SIZE(X), %xmm9 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + mulpd -4 * SIZE(Y), %xmm10 + addpd %xmm10, %xmm2 + movaps 12 * SIZE(X), %xmm10 + + mulpd -2 * SIZE(Y), %xmm11 + addpd %xmm11, %xmm3 + movaps 14 * SIZE(X), %xmm11 + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + + decq %rax + jg .L11 + ALIGN_3 + +.L12: + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + mulpd -14 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + mulpd -12 * SIZE(Y), %xmm6 + addpd %xmm6, %xmm2 + mulpd -10 * SIZE(Y), %xmm7 + addpd %xmm7, %xmm3 + + mulpd -8 * SIZE(Y), %xmm8 + addpd %xmm8, %xmm0 + mulpd -6 * SIZE(Y), %xmm9 + addpd %xmm9, %xmm1 + mulpd -4 * SIZE(Y), %xmm10 + addpd %xmm10, %xmm2 + mulpd -2 * SIZE(Y), %xmm11 + addpd %xmm11, %xmm3 + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + ALIGN_3 + +.L14: + testq $15, N + jle .L999 + + testq $8, N + jle .L15 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + movaps -12 * SIZE(X), %xmm6 + movaps -10 * SIZE(X), %xmm7 + + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + mulpd -14 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + mulpd -12 * SIZE(Y), %xmm6 + addpd %xmm6, %xmm2 + mulpd -10 * SIZE(Y), %xmm7 + addpd %xmm7, %xmm3 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L15: + testq $4, N + jle .L16 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + mulpd -14 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L16: + testq $2, N + jle .L17 + + movaps -16 * SIZE(X), %xmm4 + + mulpd -16 * SIZE(Y), %xmm4 + addpd 
%xmm4, %xmm0 + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L17: + testq $1, N + jle .L999 + + movsd -16 * SIZE(X), %xmm4 + mulsd -16 * SIZE(Y), %xmm4 + addsd %xmm4, %xmm0 + jmp .L999 + ALIGN_3 + +.L20: + +#ifdef ALIGNED_ACCESS + + movhps -16 * SIZE(X), %xmm4 + addq $SIZE, X + + movq N, %rax + sarq $4, %rax + jle .L24 + + movaps -16 * SIZE(X), %xmm5 + movaps -14 * SIZE(X), %xmm6 + movaps -12 * SIZE(X), %xmm7 + movaps -10 * SIZE(X), %xmm8 + + movaps -8 * SIZE(X), %xmm9 + movaps -6 * SIZE(X), %xmm10 + movaps -4 * SIZE(X), %xmm11 + + decq %rax + jle .L22 + + ALIGN_3 + +.L21: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + SHUFPD_1 %xmm5, %xmm4 + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + movaps -2 * SIZE(X), %xmm4 + + SHUFPD_1 %xmm6, %xmm5 + mulpd -14 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + movaps 0 * SIZE(X), %xmm5 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + SHUFPD_1 %xmm7, %xmm6 + mulpd -12 * SIZE(Y), %xmm6 + addpd %xmm6, %xmm2 + movaps 2 * SIZE(X), %xmm6 + + SHUFPD_1 %xmm8, %xmm7 + mulpd -10 * SIZE(Y), %xmm7 + addpd %xmm7, %xmm3 + movaps 4 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + SHUFPD_1 %xmm9, %xmm8 + mulpd -8 * SIZE(Y), %xmm8 + addpd %xmm8, %xmm0 + movaps 6 * SIZE(X), %xmm8 + + SHUFPD_1 %xmm10, %xmm9 + mulpd -6 * SIZE(Y), %xmm9 + addpd %xmm9, %xmm1 + movaps 8 * SIZE(X), %xmm9 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + SHUFPD_1 %xmm11, %xmm10 + mulpd -4 * SIZE(Y), %xmm10 + addpd %xmm10, %xmm2 + movaps 10 * SIZE(X), %xmm10 + + SHUFPD_1 %xmm4, %xmm11 + mulpd -2 * SIZE(Y), %xmm11 + addpd %xmm11, %xmm3 + movaps 12 * SIZE(X), %xmm11 + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + + decq %rax + jg .L21 + ALIGN_3 + +.L22: + SHUFPD_1 %xmm5, %xmm4 + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + movaps -2 * SIZE(X), %xmm4 + + SHUFPD_1 %xmm6, %xmm5 + mulpd -14 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + + SHUFPD_1 %xmm7, %xmm6 + mulpd -12 * SIZE(Y), %xmm6 + addpd %xmm6, %xmm2 + + SHUFPD_1 %xmm8, %xmm7 + mulpd -10 * SIZE(Y), %xmm7 + addpd %xmm7, %xmm3 + + SHUFPD_1 %xmm9, %xmm8 + mulpd -8 * SIZE(Y), %xmm8 + addpd %xmm8, %xmm0 + + SHUFPD_1 %xmm10, %xmm9 + mulpd -6 * SIZE(Y), %xmm9 + addpd %xmm9, %xmm1 + + SHUFPD_1 %xmm11, %xmm10 + mulpd -4 * SIZE(Y), %xmm10 + addpd %xmm10, %xmm2 + + SHUFPD_1 %xmm4, %xmm11 + mulpd -2 * SIZE(Y), %xmm11 + addpd %xmm11, %xmm3 + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + ALIGN_3 + +.L24: + testq $15, N + jle .L999 + + testq $8, N + jle .L25 + + movaps -16 * SIZE(X), %xmm5 + movaps -14 * SIZE(X), %xmm6 + movaps -12 * SIZE(X), %xmm7 + + SHUFPD_1 %xmm5, %xmm4 + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + movaps -10 * SIZE(X), %xmm4 + + SHUFPD_1 %xmm6, %xmm5 + mulpd -14 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + + SHUFPD_1 %xmm7, %xmm6 + mulpd -12 * SIZE(Y), %xmm6 + addpd %xmm6, %xmm2 + + SHUFPD_1 %xmm4, %xmm7 + mulpd -10 * SIZE(Y), %xmm7 + addpd %xmm7, %xmm3 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L25: + testq $4, N + jle .L26 + + movaps -16 * SIZE(X), %xmm5 + movaps -14 * SIZE(X), %xmm6 + + SHUFPD_1 %xmm5, %xmm4 + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + SHUFPD_1 %xmm6, %xmm5 + mulpd -14 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + movapd %xmm6, %xmm4 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L26: + testq $2, N + jle .L27 + + movaps -16 * SIZE(X), %xmm5 + + SHUFPD_1 %xmm5, %xmm4 + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, 
%xmm0 + movapd %xmm5, %xmm4 + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L27: + testq $1, N + jle .L999 + + SHUFPD_1 %xmm4, %xmm4 + mulsd -16 * SIZE(Y), %xmm4 + addsd %xmm4, %xmm0 + jmp .L999 + ALIGN_3 + +#else + + movq N, %rax + sarq $4, %rax + jle .L24 + + movlps -16 * SIZE(X), %xmm4 + movhps -15 * SIZE(X), %xmm4 + movlps -14 * SIZE(X), %xmm5 + movhps -13 * SIZE(X), %xmm5 + movlps -12 * SIZE(X), %xmm6 + movhps -11 * SIZE(X), %xmm6 + movlps -10 * SIZE(X), %xmm7 + movhps -9 * SIZE(X), %xmm7 + + movlps -8 * SIZE(X), %xmm8 + movhps -7 * SIZE(X), %xmm8 + movlps -6 * SIZE(X), %xmm9 + movhps -5 * SIZE(X), %xmm9 + movlps -4 * SIZE(X), %xmm10 + movhps -3 * SIZE(X), %xmm10 + movlps -2 * SIZE(X), %xmm11 + movhps -1 * SIZE(X), %xmm11 + + decq %rax + jle .L22 + + ALIGN_3 + +.L21: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + movlps 0 * SIZE(X), %xmm4 + movhps 1 * SIZE(X), %xmm4 + + mulpd -14 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + movlps 2 * SIZE(X), %xmm5 + movhps 3 * SIZE(X), %xmm5 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + mulpd -12 * SIZE(Y), %xmm6 + addpd %xmm6, %xmm2 + movlps 4 * SIZE(X), %xmm6 + movhps 5 * SIZE(X), %xmm6 + + mulpd -10 * SIZE(Y), %xmm7 + addpd %xmm7, %xmm3 + movlps 6 * SIZE(X), %xmm7 + movhps 7 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + mulpd -8 * SIZE(Y), %xmm8 + addpd %xmm8, %xmm0 + movlps 8 * SIZE(X), %xmm8 + movhps 9 * SIZE(X), %xmm8 + + mulpd -6 * SIZE(Y), %xmm9 + addpd %xmm9, %xmm1 + movlps 10 * SIZE(X), %xmm9 + movhps 11 * SIZE(X), %xmm9 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + mulpd -4 * SIZE(Y), %xmm10 + addpd %xmm10, %xmm2 + movlps 12 * SIZE(X), %xmm10 + movhps 13 * SIZE(X), %xmm10 + + mulpd -2 * SIZE(Y), %xmm11 + addpd %xmm11, %xmm3 + movlps 14 * SIZE(X), %xmm11 + movhps 15 * SIZE(X), %xmm11 + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + + decq %rax + jg .L21 + ALIGN_3 + +.L22: + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + mulpd -14 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + mulpd -12 * SIZE(Y), %xmm6 + addpd %xmm6, %xmm2 + mulpd -10 * SIZE(Y), %xmm7 + addpd %xmm7, %xmm3 + + mulpd -8 * SIZE(Y), %xmm8 + addpd %xmm8, %xmm0 + mulpd -6 * SIZE(Y), %xmm9 + addpd %xmm9, %xmm1 + mulpd -4 * SIZE(Y), %xmm10 + addpd %xmm10, %xmm2 + mulpd -2 * SIZE(Y), %xmm11 + addpd %xmm11, %xmm3 + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + ALIGN_3 + +.L24: + testq $15, N + jle .L999 + + testq $8, N + jle .L25 + + movlps -16 * SIZE(X), %xmm4 + movhps -15 * SIZE(X), %xmm4 + movlps -14 * SIZE(X), %xmm5 + movhps -13 * SIZE(X), %xmm5 + movlps -12 * SIZE(X), %xmm6 + movhps -11 * SIZE(X), %xmm6 + movlps -10 * SIZE(X), %xmm7 + movhps -9 * SIZE(X), %xmm7 + + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + mulpd -14 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + mulpd -12 * SIZE(Y), %xmm6 + addpd %xmm6, %xmm2 + mulpd -10 * SIZE(Y), %xmm7 + addpd %xmm7, %xmm3 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L25: + testq $4, N + jle .L26 + + movlps -16 * SIZE(X), %xmm4 + movhps -15 * SIZE(X), %xmm4 + movlps -14 * SIZE(X), %xmm5 + movhps -13 * SIZE(X), %xmm5 + + mulpd -16 * SIZE(Y), %xmm4 + addpd %xmm4, %xmm0 + mulpd -14 * SIZE(Y), %xmm5 + addpd %xmm5, %xmm1 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L26: + testq $2, N + jle .L27 + + movlps -16 * SIZE(X), %xmm4 + movhps -15 * SIZE(X), %xmm4 + + mulpd -16 * SIZE(Y), %xmm4 + 
addpd %xmm4, %xmm0 + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L27: + testq $1, N + jle .L999 + + movsd -16 * SIZE(X), %xmm4 + mulsd -16 * SIZE(Y), %xmm4 + addsd %xmm4, %xmm0 + jmp .L999 + ALIGN_3 +#endif + +.L50: + movq N, %rax + sarq $2, %rax + jle .L55 + ALIGN_3 + +.L53: + movsd 0 * SIZE(X), %xmm4 + addq INCX, X + mulsd 0 * SIZE(Y), %xmm4 + addq INCY, Y + movsd 0 * SIZE(X), %xmm5 + addq INCX, X + mulsd 0 * SIZE(Y), %xmm5 + addq INCY, Y + movsd 0 * SIZE(X), %xmm6 + addq INCX, X + mulsd 0 * SIZE(Y), %xmm6 + addq INCY, Y + movsd 0 * SIZE(X), %xmm7 + addq INCX, X + mulsd 0 * SIZE(Y), %xmm7 + addq INCY, Y + + addsd %xmm4, %xmm0 + addsd %xmm5, %xmm1 + addsd %xmm6, %xmm2 + addsd %xmm7, %xmm3 + + decq %rax + jg .L53 + ALIGN_3 + +.L55: + movq N, %rax + andq $3, %rax + jle .L999 + ALIGN_3 + +.L56: + movsd 0 * SIZE(X), %xmm4 + addq INCX, X + mulsd 0 * SIZE(Y), %xmm4 + addq INCY, Y + addsd %xmm4, %xmm0 + decq %rax + jg .L56 + ALIGN_3 + +.L999: + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + +#ifndef HAVE_SSE3 + pshufd $0xe, %xmm0, %xmm1 + addsd %xmm1, %xmm0 +#else + haddpd %xmm0, %xmm0 +#endif + + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/gemm_beta.S b/kernel/x86_64/gemm_beta.S new file mode 100644 index 0000000000..461df50e06 --- /dev/null +++ b/kernel/x86_64/gemm_beta.S @@ -0,0 +1,239 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef WINDOWS_ABI + +#define M ARG1 +#define N ARG2 +#define C ARG3 +#define LDC ARG4 +#define C1 ARG5 + +#define STACK_C 16(%rsp) +#define STACK_LDC 24(%rsp) + +#else + +#define STACKSIZE 256 + +#define M ARG1 +#define N ARG2 +#define C ARG3 +#define LDC ARG4 +#define C1 %r10 + +#define STACK_C 72 + STACKSIZE(%rsp) +#define STACK_LDC 80 + STACKSIZE(%rsp) + +#endif + +#define I %rax + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + subq $STACKSIZE, %rsp + + movups %xmm6, 0(%rsp) + movups %xmm7, 16(%rsp) + movups %xmm8, 32(%rsp) + movups %xmm9, 48(%rsp) + movups %xmm10, 64(%rsp) + movups %xmm11, 80(%rsp) + movups %xmm12, 96(%rsp) + movups %xmm13, 112(%rsp) + movups %xmm14, 128(%rsp) + movups %xmm15, 144(%rsp) + + movaps %xmm3, %xmm0 +#endif + + movq STACK_C, C + movq STACK_LDC, LDC + + pxor %xmm1, %xmm1 + + test M, M + jle .L999 + test N, N + jle .L999 + +#ifdef DOUBLE + ucomisd %xmm1, %xmm0 +#else + ucomiss %xmm1, %xmm0 +#endif + jne .L201 + ALIGN_4 + +.L101: + movq C, C1 + leaq (C, LDC, SIZE), C + + movq M, I + sarq $3, I + jle .L103 + ALIGN_4 + +.L102: +#ifdef OPTERON + prefetchw 32 * SIZE(C1) +#endif + + MOVSD %xmm0, 0 * SIZE(C1) + MOVSD %xmm0, 1 * SIZE(C1) + MOVSD %xmm0, 2 * SIZE(C1) + MOVSD %xmm0, 3 * SIZE(C1) + MOVSD %xmm0, 4 * SIZE(C1) + MOVSD %xmm0, 5 * SIZE(C1) + MOVSD %xmm0, 6 * SIZE(C1) + MOVSD %xmm0, 7 * SIZE(C1) + addq $8 * SIZE, C1 + decq I + jg .L102 + ALIGN_4 + +.L103: + movq M, I + andq $7, I + jle .L105 + ALIGN_4 + +.L104: + MOVSD %xmm0, 0 * SIZE(C1) + addq $SIZE, C1 + decq I + jg .L104 + ALIGN_4 + +.L105: + decq N + jg .L101 + jmp .L999 + ALIGN_3 + +.L201: + movq C, C1 # c_offset = c + leaq (C, LDC, SIZE), C # c += ldc + movq M, I + sarq $3, I + jle .L203 + ALIGN_4 + +.L202: +#ifdef OPTERON + prefetchw 32 * SIZE(C1) +#endif + + MOVSD 0 * SIZE(C1), %xmm8 + MOVSD 1 * SIZE(C1), %xmm9 + MOVSD 2 * SIZE(C1), %xmm10 + MOVSD 3 * SIZE(C1), %xmm11 + MOVSD 4 * SIZE(C1), %xmm12 + MOVSD 5 * SIZE(C1), %xmm13 + MOVSD 6 * SIZE(C1), %xmm14 + MOVSD 7 * SIZE(C1), %xmm15 + + MULSD %xmm0, %xmm8 + MULSD %xmm0, %xmm9 + MULSD %xmm0, %xmm10 + MULSD %xmm0, %xmm11 + MULSD %xmm0, %xmm12 + MULSD %xmm0, %xmm13 + MULSD %xmm0, %xmm14 + MULSD %xmm0, %xmm15 + + MOVSD %xmm8, 0 * SIZE(C1) + MOVSD %xmm9, 1 * SIZE(C1) + MOVSD %xmm10, 2 * SIZE(C1) + MOVSD %xmm11, 3 * SIZE(C1) + MOVSD %xmm12, 4 * SIZE(C1) + MOVSD %xmm13, 5 * SIZE(C1) + MOVSD %xmm14, 6 * SIZE(C1) + MOVSD %xmm15, 7 * SIZE(C1) + + addq $8 * SIZE, C1 + decq I + jg .L202 + ALIGN_4 + +.L203: + movq M, I + andq $7, I + jle .L205 + ALIGN_4 + +.L204: + MOVSD 0 * SIZE(C1), %xmm8 + MULSD %xmm0, %xmm8 + MOVSD %xmm8, 0 * SIZE(C1) + addq $SIZE, C1 + decq I + jg .L204 + ALIGN_4 + +.L205: + decq N + jg .L201 + ALIGN_3 + +.L999: + xorq %rax, %rax + +#ifdef WINDOWS_ABI + movups 0(%rsp), %xmm6 + movups 16(%rsp), %xmm7 + movups 32(%rsp), %xmm8 + movups 48(%rsp), %xmm9 + movups 64(%rsp), %xmm10 + movups 80(%rsp), %xmm11 + movups 96(%rsp), %xmm12 + movups 112(%rsp), %xmm13 + movups 128(%rsp), %xmm14 + movups 144(%rsp), %xmm15 + + addq $STACKSIZE, %rsp +#endif + + ret + + EPILOGUE diff --git a/kernel/x86_64/gemm_kernel_2x8_nehalem.S b/kernel/x86_64/gemm_kernel_2x8_nehalem.S new file mode 100644 index 0000000000..24e66d730b --- /dev/null +++ b/kernel/x86_64/gemm_kernel_2x8_nehalem.S @@ -0,0 +1,1849 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. 
*/ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define BB %r12 + +#define INC32 %rdx + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define ALPHA 48(%rsp) +#define J 56(%rsp) +#define OFFSET 64(%rsp) +#define KK 72(%rsp) +#define KKK 80(%rsp) + +#else + +#define STACKSIZE 512 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#define ALPHA 224(%rsp) +#define J 232(%rsp) +#define OFFSET 240(%rsp) +#define KK 248(%rsp) +#define KKK 256(%rsp) + +#endif + +#define PREFETCHSIZE 4 +#define PREFETCH prefetcht0 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movq OLD_OFFSET, %r11 +#endif + movaps %xmm3, %xmm0 + +#else + movq OLD_LDC, LDC +#ifdef 
TRMMKERNEL + movq OLD_OFFSET, %r11 +#endif + +#endif + + movlps %xmm0, ALPHA + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + leaq (, LDC, SIZE), LDC + +#ifdef TRMMKERNEL + movq %r11, OFFSET +#ifndef LEFT + negq %r11 +#endif + movq %r11, KK +#endif + + movq N, J + sarq $3, J + NOBRANCH + jle .L30 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + leaq (C, LDC, 4), CO2 + movq A, AO + + movq K, %rax + salq $BASE_SHIFT + 3, %rax + leaq (B, %rax), BB + + movq M, I + sarq $1, I + NOBRANCH + jle .L20 + ALIGN_4 + +.L11: + prefetcht2 -16 * SIZE(BB) + subq $-8 * SIZE, BB + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + PADDING + xorps %xmm4, %xmm4 + + leaq (LDC, LDC, 2), %rax + + PADDING + xorps %xmm8, %xmm8 + prefetcht0 1 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht0 3 * SIZE(CO1, LDC, 1) + PADDING + xorps %xmm10, %xmm10 + prefetcht0 1 * SIZE(CO1, LDC, 2) + PADDING + xorps %xmm11, %xmm11 + prefetcht0 3 * SIZE(CO1, %rax, 1) + + movaps -16 * SIZE(AO), %xmm0 + + PADDING + xorps %xmm12, %xmm12 + prefetcht0 1 * SIZE(CO2) + xorps %xmm13, %xmm13 + prefetcht0 3 * SIZE(CO2, LDC, 1) + xorps %xmm14, %xmm14 + prefetcht0 1 * SIZE(CO2, LDC, 2) + xorps %xmm15, %xmm15 + prefetcht0 3 * SIZE(CO2, %rax, 1) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $8, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm1, %xmm12 + movaps -16 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm13 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm14 + movaps -14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + addpd %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -10 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + movaps -14 * SIZE(AO), %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + addpd %xmm1, %xmm12 + movaps -8 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm13 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm5, %xmm1 + mulpd %xmm5, %xmm2 + + addpd %xmm3, %xmm14 + movaps -6 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm5, %xmm3 + mulpd %xmm5, %xmm4 + + addpd %xmm1, %xmm8 + movaps -4 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm5, %xmm1 + mulpd %xmm5, %xmm2 + + addpd %xmm3, %xmm10 + movaps -2 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm5, %xmm3 + mulpd %xmm5, %xmm4 + + addpd %xmm1, %xmm12 + movaps 0 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm13 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm14 + movaps 2 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + addpd 
%xmm1, %xmm8 + movaps 4 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps 6 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + movaps -10 * SIZE(AO), %xmm5 + mulpd %xmm0, %xmm4 + + addpd %xmm1, %xmm12 + movaps 8 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm13 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm5, %xmm1 + mulpd %xmm5, %xmm2 + + addpd %xmm3, %xmm14 + movaps 10 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm5, %xmm3 + PADDING; + mulpd %xmm5, %xmm4 + + addpd %xmm1, %xmm8 + movaps 12 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm5, %xmm1 + PADDING; + mulpd %xmm5, %xmm2 + + addpd %xmm3, %xmm10 + movaps 14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm5, %xmm3 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm5, %xmm4 + + subq $-32 * SIZE, BO + subq $-8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + addpd %xmm1, %xmm12 + movaps -16 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm13 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm14 + movaps -14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + addpd %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -10 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_4 + +.L18: + addpd %xmm1, %xmm12 + movaps %xmm8, %xmm0 + shufpd $2, %xmm9, %xmm8 + mulpd %xmm7, %xmm8 + shufpd $2, %xmm0, %xmm9 + mulpd %xmm7, %xmm9 + + addpd %xmm2, %xmm13 + movaps %xmm10, %xmm0 + shufpd $2, %xmm11, %xmm10 + mulpd %xmm7, %xmm10 + shufpd $2, %xmm0, %xmm11 + mulpd %xmm7, %xmm11 + + addpd %xmm3, %xmm14 + movaps %xmm12, %xmm0 + shufpd $2, %xmm13, %xmm12 + mulpd %xmm7, %xmm12 + shufpd $2, %xmm0, %xmm13 + mulpd %xmm7, %xmm13 + + addpd %xmm4, %xmm15 + movaps %xmm14, %xmm0 + shufpd $2, %xmm15, %xmm14 + mulpd %xmm7, %xmm14 + shufpd $2, %xmm0, %xmm15 + mulpd %xmm7, %xmm15 + + movq CO1, %rax + orq LDC, %rax + testq $15, %rax + NOBRANCH + jne .L18x + + leaq (LDC, LDC, 2), %rax + +#ifndef TRMMKERNEL + movups (CO1), %xmm0 + movups (CO1, LDC, 1), %xmm1 + movups (CO1, LDC, 2), %xmm2 + movups (CO1, %rax, 1), %xmm3 + + movups (CO2), %xmm4 + movups (CO2, LDC, 1), %xmm5 + movups (CO2, LDC, 2), %xmm6 + movups (CO2, %rax, 1), %xmm7 + + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm9 + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm11 + + addpd %xmm4, %xmm12 + addpd %xmm5, %xmm13 + addpd %xmm6, %xmm14 + addpd %xmm7, %xmm15 +#endif + + movaps %xmm8, (CO1) + movaps %xmm9, (CO1, LDC, 1) + movaps %xmm10, (CO1, LDC, 2) + movaps %xmm11, (CO1, %rax, 1) + + movaps %xmm12, (CO2) + movaps %xmm13, (CO2, LDC, 1) + movaps %xmm14, (CO2, LDC, 2) + movaps %xmm15, (CO2, %rax, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + +#if 
defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 + decq I + BRANCH + jg .L11 + jmp .L20 + ALIGN_4 + +.L18x: + leaq (LDC, LDC, 2), %rax + +#ifndef TRMMKERNEL + movups (CO1), %xmm0 + movups (CO1, LDC, 1), %xmm1 + movups (CO1, LDC, 2), %xmm2 + movups (CO1, %rax, 1), %xmm3 + movups (CO2), %xmm4 + movups (CO2, LDC, 1), %xmm5 + movups (CO2, LDC, 2), %xmm6 + movups (CO2, %rax, 1), %xmm7 + + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm9 + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm11 + addpd %xmm4, %xmm12 + addpd %xmm5, %xmm13 + addpd %xmm6, %xmm14 + addpd %xmm7, %xmm15 +#endif + + movups %xmm8, (CO1) + movups %xmm9, (CO1, LDC, 1) + movups %xmm10, (CO1, LDC, 2) + movups %xmm11, (CO1, %rax, 1) + + movups %xmm12, (CO2) + movups %xmm13, (CO2, LDC, 1) + movups %xmm14, (CO2, LDC, 2) + movups %xmm15, (CO2, %rax, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 + decq I + BRANCH + jg .L11 + ALIGN_4 + +.L20: + testq $1, M + BRANCH + jle .L29 + ALIGN_4 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $8, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_3 + +.L22: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movaps -12 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps -10 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps -8 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -6 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movaps -4 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps -2 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -14 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps 0 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps 2 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movaps 4 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps 6 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -13 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps 8 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps 10 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movaps 12 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps 14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -12 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps 16 * SIZE(BO), %xmm1 + + subq $ -4 * SIZE, AO + subq $-32 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L22 + ALIGN_3 + +.L25: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax 
+#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_3 + +.L26: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movaps -12 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps -10 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps -8 * SIZE(BO), %xmm1 + + addq $1 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_4 + +.L28: + mulpd %xmm7, %xmm8 + mulpd %xmm7, %xmm9 + mulpd %xmm7, %xmm10 + mulpd %xmm7, %xmm11 + + leaq (LDC, LDC, 2), %rax + +#ifndef TRMMKERNEL + movsd (CO1), %xmm0 + movhps (CO1, LDC, 1), %xmm0 + movsd (CO1, LDC, 2), %xmm1 + movhps (CO1, %rax, 1), %xmm1 + movsd (CO2), %xmm2 + movhps (CO2, LDC, 1), %xmm2 + movsd (CO2, LDC, 2), %xmm3 + movhps (CO2, %rax, 1), %xmm3 + + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm9 + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm11 +#endif + + movsd %xmm8, (CO1) + movhps %xmm8, (CO1, LDC, 1) + movsd %xmm9, (CO1, LDC, 2) + movhps %xmm9, (CO1, %rax, 1) + + movsd %xmm10, (CO2) + movhps %xmm10, (CO2, LDC, 1) + movsd %xmm11, (CO2, LDC, 2) + movhps %xmm11, (CO2, %rax, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $8, KK +#endif + + movq BO, B + + leaq (C, LDC, 8), C + + subq $1, J + BRANCH + jg .L01 + ALIGN_4 + +.L30: + testq $4, N + jle .L50 + ALIGN_4 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + leaq (C, LDC, 2), CO2 + movq A, AO + + movq M, I + sarq $1, I + NOBRANCH + jle .L40 + ALIGN_4 + +.L31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + xorps %xmm1, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht0 2 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht0 2 * SIZE(CO1, LDC, 1) + xorps %xmm10, %xmm10 + prefetcht0 2 * SIZE(CO2) + xorps %xmm11, %xmm11 + prefetcht0 2 * SIZE(CO2, LDC, 1) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L35 + ALIGN_3 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -10 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd 
%xmm0, %xmm4 + + movaps -12 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm8 + movaps -8 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -6 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -10 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm8 + movaps -4 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -2 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -8 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, AO + subq $-16 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L32 + ALIGN_3 + +.L35: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_3 + +.L36: + addpd %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L36 + ALIGN_4 + +.L38: + addpd %xmm1, %xmm8 + addpd %xmm2, %xmm9 + addpd %xmm3, %xmm10 + addpd %xmm4, %xmm11 + + movaps %xmm8, %xmm0 + shufpd $2, %xmm9, %xmm8 + mulpd %xmm7, %xmm8 + shufpd $2, %xmm0, %xmm9 + mulpd %xmm7, %xmm9 + + movaps %xmm10, %xmm0 + shufpd $2, %xmm11, %xmm10 + mulpd %xmm7, %xmm10 + shufpd $2, %xmm0, %xmm11 + mulpd %xmm7, %xmm11 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 1 * SIZE(CO1), %xmm0 + + movsd 0 * SIZE(CO1, LDC, 1), %xmm1 + movhps 1 * SIZE(CO1, LDC, 1), %xmm1 + + movsd 0 * SIZE(CO2), %xmm2 + movhps 1 * SIZE(CO2), %xmm2 + + movsd 0 * SIZE(CO2, LDC, 1), %xmm3 + movhps 1 * SIZE(CO2, LDC, 1), %xmm3 + + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm9 + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm11 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 0 * SIZE(CO1, LDC, 1) + movhps %xmm9, 1 * SIZE(CO1, LDC, 1) + + movsd %xmm10, 0 * SIZE(CO2) + movhps %xmm10, 1 * SIZE(CO2) + movsd %xmm11, 0 * SIZE(CO2, LDC, 1) + movhps %xmm11, 1 * SIZE(CO2, LDC, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 + decq I + BRANCH + jg .L31 + ALIGN_4 + +.L40: + testq $1, M + BRANCH + jle .L49 + ALIGN_4 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK 
+#endif + sarq $2, %rax + NOBRANCH + jle .L45 + ALIGN_3 + +.L42: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movaps -12 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps -10 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -14 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps -8 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -6 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -13 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movaps -4 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps -2 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -12 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps 0 * SIZE(BO), %xmm1 + + subq $ -4 * SIZE, AO + subq $-16 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L42 + ALIGN_3 + +.L45: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movaps -12 * SIZE(BO), %xmm1 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L46 + ALIGN_4 + +.L48: + addpd %xmm10, %xmm8 + mulpd %xmm7, %xmm8 + addpd %xmm11, %xmm9 + mulpd %xmm7, %xmm9 + +#ifndef TRMMKERNEL + movsd (CO1), %xmm0 + movhps (CO1, LDC, 1), %xmm0 + movsd (CO2), %xmm1 + movhps (CO2, LDC, 1), %xmm1 + + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm9 +#endif + + movsd %xmm8, (CO1) + movhps %xmm8, (CO1, LDC, 1) + movsd %xmm9, (CO2) + movhps %xmm9, (CO2, LDC, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L49: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK +#endif + + movq BO, B + + leaq (C, LDC, 4), C + ALIGN_4 + +.L50: + testq $2, N + jle .L70 + ALIGN_4 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + leaq (C, LDC), CO2 + movq A, AO + + movq M, I + sarq $1, I + NOBRANCH + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + + xorps %xmm1, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + + xorps %xmm8, %xmm8 + prefetcht0 2 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht0 2 * SIZE(CO2) + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L55 + ALIGN_3 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm10 + movaps -14 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm11 + pshufd $0x4e, %xmm1, 
%xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm10 + movaps -10 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm11 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, AO + subq $-8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L52 + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + ALIGN_3 + +.L55: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_3 + +.L56: + addpd %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L56 + ALIGN_4 + +.L58: + addpd %xmm1, %xmm8 + addpd %xmm2, %xmm9 + + movaps %xmm8, %xmm0 + shufpd $2, %xmm9, %xmm8 + mulpd %xmm7, %xmm8 + shufpd $2, %xmm0, %xmm9 + mulpd %xmm7, %xmm9 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 1 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm1 + movhps 1 * SIZE(CO2), %xmm1 + + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm9 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 0 * SIZE(CO2) + movhps %xmm9, 1 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 + decq I + BRANCH + jg .L51 + ALIGN_4 + +.L60: + testq $1, M + BRANCH + jle .L69 + ALIGN_4 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L65 + ALIGN_3 + +.L62: + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + movddup -14 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movaps -12 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + movddup -13 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm8 + movaps -10 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + movddup -12 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movaps -8 * SIZE(BO), %xmm1 + + subq $-4 * SIZE, AO + subq $-8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L62 + ALIGN_3 + +.L65: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_3 + +.L66: + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + + addq $1 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L66 + 
ALIGN_4 + +.L68: + addpd %xmm9, %xmm8 + mulpd %xmm7, %xmm8 + +#ifndef TRMMKERNEL + movsd (CO1), %xmm0 + movhps (CO2), %xmm0 + + addpd %xmm0, %xmm8 +#endif + + movsd %xmm8, (CO1) + movhps %xmm8, (CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L69: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + movq BO, B + + leaq (C, LDC, 2), C + ALIGN_4 + +.L70: + testq $1, N + jle .L999 + ALIGN_4 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + movq A, AO + + movq M, I + sarq $1, I + NOBRANCH + jle .L80 + ALIGN_4 + +.L71: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + + xorps %xmm1, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + + xorps %xmm8, %xmm8 + prefetcht0 2 * SIZE(CO1) + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L75 + ALIGN_3 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm1, %xmm8 + movddup -16 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm9 + movddup -15 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movaps -12 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm8 + movddup -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movaps -10 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm9 + movddup -13 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movaps -8 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, AO + subq $-4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L72 + + addpd %xmm9, %xmm8 + ALIGN_3 + +.L75: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_3 + +.L76: + addpd %xmm1, %xmm8 + movddup -16 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L76 + ALIGN_4 + +.L78: + addpd %xmm1, %xmm8 + mulpd %xmm7, %xmm8 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 1 * SIZE(CO1), %xmm0 + + addpd %xmm0, %xmm8 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 1 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 + decq I + BRANCH + jg .L71 + ALIGN_4 + +.L80: + testq $1, M + BRANCH + jle .L999 + ALIGN_4 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, 
%rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#ifndef TRMMKERNEL + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + xorps %xmm9, %xmm9 +#else + movsd -16 * SIZE(AO), %xmm0 + movhps -15 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movsd -16 * SIZE(BO), %xmm1 + movhps -15 * SIZE(BO), %xmm1 + xorps %xmm9, %xmm9 +#endif + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L85 + ALIGN_3 + +.L82: + mulpd %xmm0, %xmm1 +#ifndef TRMMKERNEL + movaps -14 * SIZE(AO), %xmm0 +#else + movsd -14 * SIZE(AO), %xmm0 + movhps -13 * SIZE(AO), %xmm0 +#endif + addpd %xmm1, %xmm8 +#ifndef TRMMKERNEL + movaps -14 * SIZE(BO), %xmm1 +#else + movsd -14 * SIZE(BO), %xmm1 + movhps -13 * SIZE(BO), %xmm1 +#endif + + mulpd %xmm0, %xmm1 +#ifndef TRMMKERNEL + movaps -12 * SIZE(AO), %xmm0 +#else + movsd -12 * SIZE(AO), %xmm0 + movhps -11 * SIZE(AO), %xmm0 +#endif + addpd %xmm1, %xmm9 +#ifndef TRMMKERNEL + movaps -12 * SIZE(BO), %xmm1 +#else + movsd -12 * SIZE(BO), %xmm1 + movhps -11 * SIZE(BO), %xmm1 +#endif + + subq $-4 * SIZE, AO + subq $-4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L82 + + addpd %xmm9, %xmm8 + ALIGN_3 + +.L85: + movsd ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L88 + ALIGN_3 + +.L86: + mulsd %xmm0, %xmm1 + movsd -15 * SIZE(AO), %xmm0 + addsd %xmm1, %xmm8 + movsd -15 * SIZE(BO), %xmm1 + + addq $1 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L86 + ALIGN_4 + +.L88: + haddpd %xmm8, %xmm8 + mulsd %xmm7, %xmm8 + +#ifndef TRMMKERNEL + movsd (CO1), %xmm0 + + addsd %xmm0, %xmm8 +#endif + + movsd %xmm8, (CO1) + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/gemm_kernel_4x2_atom.S b/kernel/x86_64/gemm_kernel_4x2_atom.S new file mode 100644 index 0000000000..47b16ceb93 --- /dev/null +++ b/kernel/x86_64/gemm_kernel_4x2_atom.S @@ -0,0 +1,1385 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %rdi +#define N %rsi +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define J %r12 +#define AO %r13 +#define BO %r14 +#define CO1 %r15 +#define CO2 %rbx +#define BB %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KKK 64(%rsp) +#define KK 72(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#define ALPHA 224(%rsp) +#define OFFSET 232(%rsp) +#define KK 240(%rsp) +#define KKK 248(%rsp) + +#endif + +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 8 + 3) + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, M + movq ARG2, N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm4 +#endif + movaps %xmm3, %xmm0 + +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm4 +#endif + +#endif + + movsd %xmm0, ALPHA + +#ifdef TRMMKERNEL + movsd %xmm4, OFFSET + movsd %xmm4, KK +#ifndef LEFT + negq KK +#endif +#endif + + leaq (, LDC, SIZE), LDC + + movq N, J + sarq $1, J + jle .L40 + ALIGN_4 + +.L10: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 + leaq (C, LDC, 2), C + + movq A, AO + + movq K, %rax + salq $BASE_SHIFT + 1, %rax + leaq (B, %rax), BB + + movq M, I + sarq $2, I + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + 
movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + + prefetcht0 0 * SIZE(BB) + subq $-8 * SIZE, BB + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd 1 * SIZE(AO), %xmm4 + xorps %xmm5, %xmm5 + movsd 2 * SIZE(AO), %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movsd 0 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + movsd 1 * SIZE(BO), %xmm3 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + + prefetcht0 3 * SIZE(CO1) + xorps %xmm12, %xmm12 + xorps %xmm13, %xmm13 + prefetcht0 3 * SIZE(CO2) + xorps %xmm14, %xmm14 + xorps %xmm15, %xmm15 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addsd %xmm2, %xmm13 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm7, %xmm14 + movsd 3 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + addsd %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm4, %xmm10 + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 2 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm12 + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm13 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm7, %xmm14 + movsd 7 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + addsd %xmm0, %xmm8 + movsd 8 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm4, %xmm10 + movsd 9 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 4 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm12 + movsd 10 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 5 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm13 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm7, %xmm14 + movsd 11 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + addsd %xmm0, %xmm8 + movsd 12 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm4, %xmm10 + movsd 13 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 6 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm12 + movsd 14 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 7 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm13 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm7, %xmm14 + movsd 15 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + subq $-16 * SIZE, AO + + addsd %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + addsd %xmm0, %xmm8 + movsd 0 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + addq $ 8 * SIZE, BO + + addsd %xmm4, %xmm10 + movsd 1 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + decq %rax + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 0 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm12 + movsd 2 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 1 * SIZE(BO), %xmm3 + 
+ jne .L12 + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + BRANCH + BRANCH + je .L19 + ALIGN_4 + +.L16: + addsd %xmm2, %xmm13 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm7, %xmm14 + movsd 3 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + addsd %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm4, %xmm10 + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 2 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm12 + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + decq %rax + BRANCH + jg .L16 + ALIGN_4 + +.L19: + movsd ALPHA, %xmm5 + + addsd %xmm2, %xmm13 + mulsd %xmm5, %xmm8 + addsd %xmm7, %xmm14 + mulsd %xmm5, %xmm10 + addsd %xmm6, %xmm15 + mulsd %xmm5, %xmm12 + mulsd %xmm5, %xmm14 + + mulsd %xmm5, %xmm9 + mulsd %xmm5, %xmm11 + mulsd %xmm5, %xmm13 + mulsd %xmm5, %xmm15 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addsd 0 * SIZE(CO1), %xmm8 + addsd 1 * SIZE(CO1), %xmm10 + addsd 2 * SIZE(CO1), %xmm12 + addsd 3 * SIZE(CO1), %xmm14 + + addsd 0 * SIZE(CO2), %xmm9 + addsd 1 * SIZE(CO2), %xmm11 + addsd 2 * SIZE(CO2), %xmm13 + addsd 3 * SIZE(CO2), %xmm15 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movsd %xmm10, 1 * SIZE(CO1) + movsd %xmm12, 2 * SIZE(CO1) + movsd %xmm14, 3 * SIZE(CO1) + + movsd %xmm9, 0 * SIZE(CO2) + movsd %xmm11, 1 * SIZE(CO2) + movsd %xmm13, 2 * SIZE(CO2) + movsd %xmm15, 3 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + + decq I # i -- + jg .L11 + ALIGN_4 + +.L20: + testq $2, M + jle .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd 1 * SIZE(AO), %xmm4 + xorps %xmm5, %xmm5 + movsd 2 * SIZE(AO), %xmm5 + xorps %xmm6, %xmm6 + movsd 3 * SIZE(AO), %xmm7 + + movsd 0 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + movsd 1 * SIZE(BO), %xmm3 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + + sarq $2, %rax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addsd %xmm2, %xmm9 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm6, %xmm11 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + movsd 2 * SIZE(BO), %xmm1 + + addsd %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm2 + + addsd %xmm4, %xmm10 + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 4 * 
SIZE(BO), %xmm1 + + addsd %xmm5, %xmm8 + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm2 + + addsd %xmm7, %xmm10 + movsd 7 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm6 + movsd 5 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm9 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm6, %xmm11 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + movsd 6 * SIZE(BO), %xmm1 + + addsd %xmm0, %xmm8 + movsd 8 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm2 + + addsd %xmm4, %xmm10 + movsd 9 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm6 + movsd 7 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 8 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm8 + movsd 10 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm2 + + addsd %xmm7, %xmm10 + movsd 11 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm6 + movsd 9 * SIZE(BO), %xmm3 + + addq $8 * SIZE, AO + addq $8 * SIZE, BO + + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movsd ALPHA, %xmm7 + + andq $3, %rax + BRANCH + BRANCH + je .L29 + ALIGN_4 + +.L26: + addsd %xmm2, %xmm9 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm6, %xmm11 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + movsd 2 * SIZE(BO), %xmm1 + + mulsd %xmm3, %xmm2 + addsd %xmm0, %xmm8 + movsd 2 * SIZE(AO), %xmm0 + + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + addsd %xmm4, %xmm10 + movsd 3 * SIZE(AO), %xmm4 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + decq %rax + BRANCH + jg .L26 + ALIGN_4 + +.L29: + addsd %xmm2, %xmm9 + mulsd %xmm7, %xmm8 + addsd %xmm6, %xmm11 + mulsd %xmm7, %xmm10 + mulsd %xmm7, %xmm9 + mulsd %xmm7, %xmm11 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addsd 0 * SIZE(CO1), %xmm8 + addsd 1 * SIZE(CO1), %xmm10 + + addsd 0 * SIZE(CO2), %xmm9 + addsd 1 * SIZE(CO2), %xmm11 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movsd %xmm10, 1 * SIZE(CO1) + + movsd %xmm9, 0 * SIZE(CO2) + movsd %xmm11, 1 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 + ALIGN_4 + +.L30: + testq $1, M + je .L39 + ALIGN_4 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm7, %xmm7 + movsd 1 * SIZE(AO), %xmm2 + xorps %xmm5, %xmm5 + + movsd 0 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + movsd 1 * SIZE(BO), %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + + sarq $2, %rax + je .L35 + ALIGN_4 + +.L32: + addsd %xmm5, %xmm8 + movsd 2 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addsd %xmm7, %xmm9 + movsd 3 * SIZE(BO), %xmm7 + mulsd %xmm0, %xmm3 + movsd 2 * SIZE(AO), %xmm0 + + addsd %xmm1, %xmm8 + movsd 4 * SIZE(BO), %xmm1 + mulsd %xmm2, %xmm5 + + addsd %xmm3, %xmm9 + movsd 5 * SIZE(BO), %xmm3 + mulsd %xmm2, %xmm7 + movsd 3 * SIZE(AO), 
%xmm2 + + addsd %xmm5, %xmm8 + movsd 6 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm1 + + addsd %xmm7, %xmm9 + movsd 7 * SIZE(BO), %xmm7 + mulsd %xmm0, %xmm3 + movsd 4 * SIZE(AO), %xmm0 + + addsd %xmm1, %xmm8 + movsd 8 * SIZE(BO), %xmm1 + mulsd %xmm2, %xmm5 + + addsd %xmm3, %xmm9 + movsd 9 * SIZE(BO), %xmm3 + mulsd %xmm2, %xmm7 + movsd 5 * SIZE(AO), %xmm2 + + addq $4 * SIZE, AO + addq $8 * SIZE, BO + + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + addsd %xmm5, %xmm8 + addsd %xmm7, %xmm9 + movsd ALPHA, %xmm7 + + andq $3, %rax + BRANCH + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulsd %xmm0, %xmm1 + addq $2 * SIZE, BO + mulsd %xmm0, %xmm3 + movsd 1 * SIZE(AO), %xmm0 + + addsd %xmm1, %xmm8 + movsd 0 * SIZE(BO), %xmm1 + addsd %xmm3, %xmm9 + movsd 1 * SIZE(BO), %xmm3 + + addq $1 * SIZE, AO + decq %rax + BRANCH + jg .L36 + ALIGN_4 + +.L38: + mulsd %xmm7, %xmm8 + mulsd %xmm7, %xmm9 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addsd 0 * SIZE(CO1), %xmm8 + addsd 0 * SIZE(CO2), %xmm9 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movsd %xmm9, 0 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + movq BO, B + decq J # j -- + jg .L10 + ALIGN_4 + +.L40: + testq $1, N + je .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + addq LDC, C + + movq A, AO + + movq M, I + sarq $2, I + jle .L50 + ALIGN_4 + +.L41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#endif + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm9, %xmm9 + movsd 1 * SIZE(AO), %xmm1 + xorps %xmm11, %xmm11 + movsd 2 * SIZE(AO), %xmm2 + xorps %xmm13, %xmm13 + movsd 3 * SIZE(AO), %xmm3 + xorps %xmm15, %xmm15 + + movsd 0 * SIZE(BO), %xmm4 + xorps %xmm8, %xmm8 + movsd 1 * SIZE(BO), %xmm5 + xorps %xmm10, %xmm10 + prefetcht0 3 * SIZE(CO1) + xorps %xmm12, %xmm12 + xorps %xmm14, %xmm14 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + + sarq $2, %rax + je .L45 + ALIGN_4 + +.L42: + addsd %xmm9, %xmm8 + movsd 4 * SIZE(AO), %xmm9 + mulsd %xmm4, %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addsd %xmm11, %xmm10 + movsd 5 * SIZE(AO), %xmm11 + mulsd %xmm4, %xmm1 + + addsd %xmm13, %xmm12 + movsd 6 * SIZE(AO), %xmm13 + mulsd %xmm4, %xmm2 + + addsd %xmm15, %xmm14 + movsd 7 * SIZE(AO), %xmm15 + mulsd %xmm4, %xmm3 + movsd 2 * SIZE(BO), %xmm4 + + addsd %xmm0, %xmm8 + movsd 8 * SIZE(AO), %xmm0 + mulsd %xmm5, %xmm9 + + addsd %xmm1, %xmm10 + movsd 9 * SIZE(AO), %xmm1 + mulsd %xmm5, %xmm11 + + addsd %xmm2, %xmm12 + movsd 10 * SIZE(AO), %xmm2 + mulsd %xmm5, %xmm13 + + addsd %xmm3, %xmm14 + movsd 11 * SIZE(AO), %xmm3 + mulsd %xmm5, %xmm15 + movsd 3 * SIZE(BO), %xmm5 + + addsd %xmm9, %xmm8 + movsd 12 * SIZE(AO), %xmm9 + mulsd %xmm4, %xmm0 + PREFETCH 
(PREFETCHSIZE + 8) * SIZE(AO) + + addsd %xmm11, %xmm10 + movsd 13 * SIZE(AO), %xmm11 + mulsd %xmm4, %xmm1 + + addsd %xmm13, %xmm12 + movsd 14 * SIZE(AO), %xmm13 + mulsd %xmm4, %xmm2 + + addsd %xmm15, %xmm14 + movsd 15 * SIZE(AO), %xmm15 + mulsd %xmm4, %xmm3 + movsd 4 * SIZE(BO), %xmm4 + subq $-16 * SIZE, AO + + addsd %xmm0, %xmm8 + movsd 0 * SIZE(AO), %xmm0 + mulsd %xmm5, %xmm9 + + addsd %xmm1, %xmm10 + movsd 1 * SIZE(AO), %xmm1 + mulsd %xmm5, %xmm11 + addq $ 4 * SIZE, BO + + addsd %xmm2, %xmm12 + movsd 2 * SIZE(AO), %xmm2 + mulsd %xmm5, %xmm13 + decq %rax + + addsd %xmm3, %xmm14 + movsd 3 * SIZE(AO), %xmm3 + mulsd %xmm5, %xmm15 + movsd 1 * SIZE(BO), %xmm5 + + jne .L42 + ALIGN_4 + +.L45: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + + movsd ALPHA, %xmm7 + + addsd %xmm9, %xmm8 + addsd %xmm11, %xmm10 + addsd %xmm13, %xmm12 + addsd %xmm15, %xmm14 + + andq $3, %rax + BRANCH + BRANCH + je .L49 + ALIGN_4 + +.L46: + mulsd %xmm4, %xmm0 + mulsd %xmm4, %xmm1 + mulsd %xmm4, %xmm2 + mulsd %xmm4, %xmm3 + movsd 1 * SIZE(BO), %xmm4 + + addsd %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + addsd %xmm1, %xmm10 + movsd 5 * SIZE(AO), %xmm1 + addsd %xmm2, %xmm12 + movsd 6 * SIZE(AO), %xmm2 + addsd %xmm3, %xmm14 + movsd 7 * SIZE(AO), %xmm3 + + addq $4 * SIZE, AO + addq $1 * SIZE, BO + decq %rax + BRANCH + jg .L46 + ALIGN_4 + +.L49: + mulsd %xmm7, %xmm8 + mulsd %xmm7, %xmm10 + mulsd %xmm7, %xmm12 + mulsd %xmm7, %xmm14 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addsd 0 * SIZE(CO1), %xmm8 + addsd 1 * SIZE(CO1), %xmm10 + addsd 2 * SIZE(CO1), %xmm12 + addsd 3 * SIZE(CO1), %xmm14 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movsd %xmm10, 1 * SIZE(CO1) + movsd %xmm12, 2 * SIZE(CO1) + movsd %xmm14, 3 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 + + decq I # i -- + jg .L41 + ALIGN_4 + +.L50: + testq $2, M + jle .L60 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd 1 * SIZE(AO), %xmm1 + xorps %xmm3, %xmm3 + + movsd 0 * SIZE(BO), %xmm4 + xorps %xmm8, %xmm8 + movsd 1 * SIZE(BO), %xmm5 + xorps %xmm10, %xmm10 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + + sarq $2, %rax + je .L55 + ALIGN_4 + +.L52: + addsd %xmm2, %xmm8 + movsd 2 * SIZE(AO), %xmm2 + mulsd %xmm4, %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addsd %xmm3, %xmm10 + movsd 3 * SIZE(AO), %xmm3 + mulsd %xmm4, %xmm1 + movsd 2 * SIZE(BO), %xmm4 + + addsd %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm5, %xmm2 + addq $8 * SIZE, AO + + addsd %xmm1, %xmm10 + movsd -3 * SIZE(AO), %xmm1 + mulsd %xmm5, %xmm3 + movsd 3 * SIZE(BO), %xmm5 + + addsd %xmm2, %xmm8 + movsd -2 * SIZE(AO), %xmm2 + mulsd %xmm4, %xmm0 + addq $4 * SIZE, BO + + addsd %xmm3, %xmm10 + movsd -1 * SIZE(AO), %xmm3 + mulsd %xmm4, %xmm1 + movsd 0 * SIZE(BO), 
%xmm4 + + addsd %xmm0, %xmm8 + movsd 0 * SIZE(AO), %xmm0 + mulsd %xmm5, %xmm2 + decq %rax + + addsd %xmm1, %xmm10 + movsd 1 * SIZE(AO), %xmm1 + mulsd %xmm5, %xmm3 + movsd 1 * SIZE(BO), %xmm5 + + jne .L52 + ALIGN_4 + +.L55: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movsd ALPHA, %xmm7 + + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm10 + + andq $3, %rax + BRANCH + BRANCH + je .L59 + ALIGN_4 + +.L56: + mulsd %xmm4, %xmm0 + mulsd %xmm4, %xmm1 + movsd 1 * SIZE(BO), %xmm4 + + addsd %xmm0, %xmm8 + movsd 2 * SIZE(AO), %xmm0 + addsd %xmm1, %xmm10 + movsd 3 * SIZE(AO), %xmm1 + + addq $2 * SIZE, AO + addq $1 * SIZE, BO + decq %rax + BRANCH + jg .L56 + ALIGN_4 + +.L59: + mulsd %xmm7, %xmm8 + mulsd %xmm7, %xmm10 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addsd 0 * SIZE(CO1), %xmm8 + addsd 1 * SIZE(CO1), %xmm10 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movsd %xmm10, 1 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 + ALIGN_4 + +.L60: + testq $1, M + je .L999 + ALIGN_4 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm5, %xmm5 + movsd 1 * SIZE(AO), %xmm2 + xorps %xmm7, %xmm7 + + movsd 0 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + movsd 1 * SIZE(BO), %xmm3 + xorps %xmm9, %xmm9 + movsd 2 * SIZE(AO), %xmm4 + movsd 3 * SIZE(AO), %xmm6 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + + sarq $2, %rax + je .L65 + ALIGN_4 + +.L62: + addsd %xmm5, %xmm8 + movsd 2 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm1 + movsd 4 * SIZE(AO), %xmm0 + + addsd %xmm7, %xmm9 + movsd 3 * SIZE(BO), %xmm7 + mulsd %xmm2, %xmm3 + movsd 5 * SIZE(AO), %xmm2 + + addsd %xmm1, %xmm8 + movsd 4 * SIZE(BO), %xmm1 + mulsd %xmm4, %xmm5 + movsd 6 * SIZE(AO), %xmm4 + + addsd %xmm3, %xmm9 + movsd 5 * SIZE(BO), %xmm3 + mulsd %xmm6, %xmm7 + movsd 7 * SIZE(AO), %xmm6 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + + decq %rax + jne .L62 + + addsd %xmm5, %xmm8 + addsd %xmm7, %xmm9 + ALIGN_4 + +.L65: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movsd ALPHA, %xmm7 + + andq $3, %rax + BRANCH + BRANCH + je .L68 + ALIGN_4 + +.L66: + movsd 0 * SIZE(AO), %xmm0 + movsd 0 * SIZE(BO), %xmm1 + + mulsd %xmm0, %xmm1 + addsd %xmm1, %xmm8 + + addq $1 * SIZE, AO + addq $1 * SIZE, BO + + decq %rax + BRANCH + jg .L66 + ALIGN_4 + +.L68: + addsd %xmm9, %xmm8 + + mulsd %xmm7, %xmm8 + +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + addsd 0 * SIZE(CO1), %xmm8 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/gemm_kernel_4x4_barcelona.S b/kernel/x86_64/gemm_kernel_4x4_barcelona.S new file mode 100644 index 0000000000..f7015c04f2 --- /dev/null +++ b/kernel/x86_64/gemm_kernel_4x4_barcelona.S @@ -0,0 +1,2093 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define CO2 %r12 +#define BB %rbp +#define J %rbx + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#define ALPHA 224(%rsp) +#define OFFSET 232(%rsp) +#define KK 240(%rsp) +#define KKK 248(%rsp) + +#endif + +#define movapd movaps +#define movupd movups + +#define KERNEL1(xx) \ + mulpd %xmm1, %xmm0 ;\ + addpd %xmm0, %xmm8 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ + movapd %xmm2, %xmm0 ;\ + addpd %xmm1, %xmm12 ;\ + movddup -14 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm0, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -13 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm0 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm10 ;\ + movapd -12 * SIZE(AO, %rax, 4), %xmm0 ;\ + addpd %xmm1, %xmm14 ;\ + movddup -12 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -11 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm0, %xmm2 + +#define KERNEL2(xx) \ + mulpd %xmm1, %xmm0 ;\ + addpd %xmm0, %xmm8 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ + movapd %xmm2, %xmm0 ;\ + addpd %xmm1, %xmm12 ;\ +/*A*/ movapd (AO, %rax, 4), %xmm6 ;\ + movddup -10 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm0, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -9 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm0 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm10 ;\ + addpd %xmm1, %xmm14 ;\ +/**/ movddup (BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -7 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm4, %xmm2 + +#define KERNEL3(xx) \ + mulpd %xmm5, %xmm4 ;\ + addpd %xmm4, %xmm8 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ + movapd %xmm2, %xmm4 ;\ + addpd %xmm5, %xmm12 ;\ + movddup -6 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm4, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -5 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm4 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm10 ;\ + movapd -4 * SIZE(AO, %rax, 4), %xmm4 ;\ + addpd %xmm5, %xmm14 ;\ + movddup -4 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -3 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm4, %xmm2 + +#define KERNEL4(xx) \ + mulpd %xmm5, %xmm4 ;\ + addpd %xmm4, %xmm8 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ + movapd %xmm2, %xmm4 ;\ + addpd %xmm5, %xmm12 ;\ +/*A*/ movapd 8 * SIZE(AO, %rax, 4), %xmm7 ;\ + movddup -2 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd 
%xmm2, %xmm9 ;\ + movapd %xmm4, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -1 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm4 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm10 ;\ + addpd %xmm5, %xmm14 ;\ +/**/ movddup 8 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 1 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm6, %xmm2 + +#define KERNEL5(xx) \ + mulpd %xmm1, %xmm6 ;\ + addpd %xmm6, %xmm8 ;\ + mulpd 2 * SIZE(AO, %rax, 4), %xmm1 ;\ + movapd %xmm2, %xmm6 ;\ + addpd %xmm1, %xmm12 ;\ + movddup 2 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm6, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup 3 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm6 ;\ + mulpd 2 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm6, %xmm10 ;\ + movapd 4 * SIZE(AO, %rax, 4), %xmm6 ;\ + addpd %xmm1, %xmm14 ;\ + movddup 4 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 5 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm6, %xmm2 + +#define KERNEL6(xx) \ + mulpd %xmm1, %xmm6 ;\ + addpd %xmm6, %xmm8 ;\ + mulpd 6 * SIZE(AO, %rax, 4), %xmm1 ;\ + movapd %xmm2, %xmm6 ;\ + addpd %xmm1, %xmm12 ;\ +/*A*/ movapd 16 * SIZE(AO, %rax, 4), %xmm0 ;\ + movddup 6 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm6, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup 7 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm6 ;\ + mulpd 6 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm6, %xmm10 ;\ + addpd %xmm1, %xmm14 ;\ +/**/ movddup 16 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 9 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm7, %xmm2 + +#define KERNEL7(xx) \ + mulpd %xmm5, %xmm7 ;\ + addpd %xmm7, %xmm8 ;\ + mulpd 10 * SIZE(AO, %rax, 4), %xmm5 ;\ + movapd %xmm2, %xmm7 ;\ + addpd %xmm5, %xmm12 ;\ + movddup 10 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm7, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup 11 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm7 ;\ + mulpd 10 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm7, %xmm10 ;\ + movapd 12 * SIZE(AO, %rax, 4), %xmm7 ;\ + addpd %xmm5, %xmm14 ;\ + movddup 12 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 13 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm7, %xmm2 + +#define KERNEL8(xx) \ + mulpd %xmm5, %xmm7 ;\ + addpd %xmm7, %xmm8 ;\ + mulpd 14 * SIZE(AO, %rax, 4), %xmm5 ;\ + movapd %xmm2, %xmm7 ;\ + addpd %xmm5, %xmm12 ;\ +/*A*/ movapd 24 * SIZE(AO, %rax, 4), %xmm4 ;\ + movddup 14 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm7, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup 15 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm7 ;\ + mulpd 14 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm7, %xmm10 ;\ + addpd %xmm5, %xmm14 ;\ +/**/ movddup 24 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 17 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm0, %xmm2 ;\ + addq $8 * SIZE, %rax 
;\ + +#define KERNEL_SUB1(xx) \ + mulpd %xmm1, %xmm0 ;\ + mulpd -14 * SIZE(AO), %xmm1 ;\ + addpd %xmm0, %xmm8 ;\ + movapd %xmm2, %xmm0 ;\ + addpd %xmm1, %xmm12 ;\ + movddup -14 * SIZE(BO), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -14 * SIZE(AO), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm0, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -13 * SIZE(BO), %xmm3 ;\ + mulpd %xmm1, %xmm0 ;\ + mulpd -14 * SIZE(AO), %xmm1 ;\ + addpd %xmm0, %xmm10 ;\ + movapd -12 * SIZE(AO), %xmm0 ;\ + addpd %xmm1, %xmm14 ;\ + movddup -12 * SIZE(BO), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -14 * SIZE(AO), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -11 * SIZE(BO), %xmm3 ;\ + movapd %xmm0, %xmm2 + +#define KERNEL_SUB2(xx) \ + mulpd %xmm1, %xmm0 ;\ + mulpd -10 * SIZE(AO), %xmm1 ;\ + addpd %xmm0, %xmm8 ;\ + movapd %xmm2, %xmm0 ;\ + addpd %xmm1, %xmm12 ;\ + movddup -10 * SIZE(BO), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -10 * SIZE(AO), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm0, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -9 * SIZE(BO), %xmm3 ;\ + mulpd %xmm1, %xmm0 ;\ + mulpd -10 * SIZE(AO), %xmm1 ;\ + addpd %xmm0, %xmm10 ;\ + movapd (AO), %xmm0 ;\ + addpd %xmm1, %xmm14 ;\ + movddup (BO), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -10 * SIZE(AO), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -7 * SIZE(BO), %xmm3 ;\ + movapd %xmm4, %xmm2 + +#define KERNEL_SUB3(xx) \ + mulpd %xmm5, %xmm4 ;\ + mulpd -6 * SIZE(AO), %xmm5 ;\ + addpd %xmm4, %xmm8 ;\ + movapd %xmm2, %xmm4 ;\ + addpd %xmm5, %xmm12 ;\ + movddup -6 * SIZE(BO), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -6 * SIZE(AO), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm4, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -5 * SIZE(BO), %xmm3 ;\ + mulpd %xmm5, %xmm4 ;\ + mulpd -6 * SIZE(AO), %xmm5 ;\ + addpd %xmm4, %xmm10 ;\ + movapd -4 * SIZE(AO), %xmm4 ;\ + addpd %xmm5, %xmm14 ;\ + movddup -4 * SIZE(BO), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -6 * SIZE(AO), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -3 * SIZE(BO), %xmm3 ;\ + movapd %xmm4, %xmm2 + +#define KERNEL_SUB4(xx) \ + mulpd %xmm5, %xmm4 ;\ + mulpd -2 * SIZE(AO), %xmm5 ;\ + addpd %xmm4, %xmm8 ;\ + movapd %xmm2, %xmm4 ;\ + addpd %xmm5, %xmm12 ;\ + movddup -2 * SIZE(BO), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -2 * SIZE(AO), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm4, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -1 * SIZE(BO), %xmm3 ;\ + mulpd %xmm5, %xmm4 ;\ + mulpd -2 * SIZE(AO), %xmm5 ;\ + addpd %xmm4, %xmm10 ;\ + addpd %xmm5, %xmm14 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -2 * SIZE(AO), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 1 * SIZE(BO), %xmm3 ;\ + movapd %xmm0, %xmm2 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + movaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq OLD_M, M + movq OLD_N, N 
+ + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + movsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + +#ifdef TRMMKERNEL + movsd %xmm12, OFFSET + movsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + movq N, J + sarq $2, J # j = (n >> 2) + jle .L40 + ALIGN_4 + +.L01: + movq C, CO1 # coffset1 = c + leaq (C, LDC, 2), CO2 # coffset2 = c + ldc + + leaq (C, LDC, 4), C # c += 4 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO # aoffset = a + + movq K, %rax + salq $BASE_SHIFT + 2, %rax + leaq (B, %rax), BB + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movddup -16 * SIZE(BO), %xmm1 + xorps %xmm9, %xmm9 + movddup -15 * SIZE(BO), %xmm3 + xorps %xmm10, %xmm10 + movapd -8 * SIZE(AO), %xmm4 + xorps %xmm11, %xmm11 + movddup -8 * SIZE(BO), %xmm5 + xorps %xmm12, %xmm12 + + prefetchw 3 * SIZE(CO1) + xorps %xmm13, %xmm13 + prefetchw 7 * SIZE(CO1, LDC) + xorps %xmm14, %xmm14 + prefetchw 3 * SIZE(CO2) + xorps %xmm15, %xmm15 + prefetchw 7 * SIZE(CO2, LDC) + movapd %xmm0, %xmm2 + + prefetch -16 * SIZE(BB) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO + negq %rax + NOBRANCH + je .L15 + ALIGN_4 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + jl .L12 + ALIGN_4 + +.L15: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + testq $4, %rax + je .L16 + ALIGN_4 + + KERNEL_SUB1(16 * 0) + KERNEL_SUB2(16 * 0) + KERNEL_SUB3(16 * 0) + KERNEL_SUB4(16 * 0) + + subq $-16 * SIZE, BO + subq $-16 * SIZE, AO + ALIGN_4 + +.L16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + 
je .L19 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO + negq %rax + ALIGN_4 + +.L17: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd %xmm2, %xmm0 + addpd %xmm1, %xmm12 + movddup -14 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm3, %xmm2 + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 + addpd %xmm2, %xmm9 + movapd %xmm0, %xmm2 + addpd %xmm3, %xmm13 + movddup -13 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm10 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm14 + movddup -12 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm3, %xmm2 + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 + addpd %xmm2, %xmm11 + addpd %xmm3, %xmm15 + movddup -11 * SIZE(BO, %rax, 4), %xmm3 + movapd %xmm0, %xmm2 + + addq $SIZE, %rax + jl .L17 + ALIGN_4 + +.L19: + prefetch -8 * SIZE(BB) + subq $-16 * SIZE, BB + +#ifndef TRMMKERNEL + movupd (CO1), %xmm0 + movupd 2 * SIZE(CO1), %xmm1 +#endif + + mulpd %xmm7, %xmm8 + mulpd %xmm7, %xmm12 +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm12 +#endif + + movsd %xmm8, (CO1) + movhps %xmm8, 1 * SIZE(CO1) + movsd %xmm12, 2 * SIZE(CO1) + movhps %xmm12, 3 * SIZE(CO1) + +#ifndef TRMMKERNEL + movupd (CO1, LDC), %xmm2 + movupd 2 * SIZE(CO1, LDC), %xmm3 +#endif + + mulpd %xmm7, %xmm9 + mulpd %xmm7, %xmm13 +#ifndef TRMMKERNEL + addpd %xmm2, %xmm9 + addpd %xmm3, %xmm13 +#endif + + movsd %xmm9, (CO1, LDC) + movhps %xmm9, 1 * SIZE(CO1, LDC) + movsd %xmm13, 2 * SIZE(CO1, LDC) + movhps %xmm13, 3 * SIZE(CO1, LDC) + +#ifndef TRMMKERNEL + movupd (CO2), %xmm0 + movupd 2 * SIZE(CO2), %xmm1 +#endif + + mulpd %xmm7, %xmm10 + mulpd %xmm7, %xmm14 +#ifndef TRMMKERNEL + addpd %xmm0, %xmm10 + addpd %xmm1, %xmm14 +#endif + + movsd %xmm10, (CO2) + movhps %xmm10, 1 * SIZE(CO2) + movsd %xmm14, 2 * SIZE(CO2) + movhps %xmm14, 3 * SIZE(CO2) + +#ifndef TRMMKERNEL + movupd (CO2, LDC), %xmm2 + movupd 2 * SIZE(CO2, LDC), %xmm3 +#endif + + mulpd %xmm7, %xmm11 + mulpd %xmm7, %xmm15 +#ifndef TRMMKERNEL + addpd %xmm2, %xmm11 + addpd %xmm3, %xmm15 +#endif + + movsd %xmm11, (CO2, LDC) + movhps %xmm11, 1 * SIZE(CO2, LDC) + movsd %xmm15, 2 * SIZE(CO2, LDC) + movhps %xmm15, 3 * SIZE(CO2, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + decq I # i -- + BRANCH + jg .L11 + ALIGN_4 + +.L20: + testq $3, M + je .L39 + + testq $2, M + je .L30 + ALIGN_4 + +.L21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movapd -12 * SIZE(AO), %xmm2 + xorps %xmm9, %xmm9 + movddup -16 * SIZE(BO), %xmm1 + xorps %xmm10, %xmm10 + movddup -15 * SIZE(BO), %xmm5 + xorps %xmm11, %xmm11 + movddup -8 * SIZE(BO), %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + 
leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO + negq %rax + NOBRANCH + je .L26 + ALIGN_4 + +.L22: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movddup -14 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + addpd %xmm5, %xmm9 + movddup -13 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movddup -12 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + movapd -14 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm5, %xmm11 + movddup -11 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movddup -10 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + addpd %xmm5, %xmm9 + movddup -9 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movddup (BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + movapd -8 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm5, %xmm11 + movddup -7 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm8 + movddup -6 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm2, %xmm5 + addpd %xmm5, %xmm9 + movddup -5 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm10 + movddup -4 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm2, %xmm5 + movapd -10 * SIZE(AO, %rax, 2), %xmm2 + addpd %xmm5, %xmm11 + movddup -3 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm8 + movddup -2 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm2, %xmm5 + addpd %xmm5, %xmm9 + movddup -1 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm10 + movddup 8 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm2, %xmm5 + movapd -4 * SIZE(AO, %rax, 2), %xmm2 + addpd %xmm5, %xmm11 + movddup 1 * SIZE(BO, %rax, 4), %xmm5 + + addq $4 * SIZE, %rax + BRANCH + jl .L22 + ALIGN_4 + +.L26: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L29 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO + negq %rax + ALIGN_4 + +.L27: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movddup -14 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + addpd %xmm5, %xmm9 + movddup -13 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movddup -12 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + movapd -14 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm5, %xmm11 + movddup -11 * SIZE(BO, %rax, 4), %xmm5 + + addq $SIZE, %rax + jl .L27 + ALIGN_4 + +.L29: +#ifndef TRMMKERNEL + movupd (CO1), %xmm0 + movupd (CO1, LDC), %xmm2 + movupd (CO2), %xmm4 + movupd (CO2, LDC), %xmm6 +#endif + + mulpd %xmm7, %xmm8 + mulpd %xmm7, %xmm9 + mulpd %xmm7, %xmm10 + mulpd %xmm7, %xmm11 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 + addpd %xmm2, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm6, %xmm11 +#endif + + movsd %xmm8, (CO1) + movhps %xmm8, 1 * SIZE(CO1) + movsd %xmm9, (CO1, LDC) + movhps %xmm9, 1 * SIZE(CO1, LDC) + + movsd %xmm10, (CO2) + movhps %xmm10, 1 * SIZE(CO2) + movsd %xmm11, (CO2, LDC) + movhps %xmm11, 1 * SIZE(CO2, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 + ALIGN_4 + +.L30: + testq $1, M + je .L39 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, 
%rax, 4), BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movddup -14 * SIZE(AO), %xmm2 + xorps %xmm9, %xmm9 + movddup -15 * SIZE(AO), %xmm4 + xorps %xmm10, %xmm10 + movapd -16 * SIZE(BO), %xmm1 + xorps %xmm11, %xmm11 + movapd -8 * SIZE(BO), %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO + negq %rax + NOBRANCH + je .L36 + ALIGN_4 + +.L32: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BO, %rax, 4), %xmm0 + addpd %xmm1, %xmm8 + movapd -12 * SIZE(BO, %rax, 4), %xmm1 + addpd %xmm0, %xmm9 + movddup -12 * SIZE(AO, %rax, 1), %xmm0 + mulpd %xmm4, %xmm1 + mulpd -10 * SIZE(BO, %rax, 4), %xmm4 + addpd %xmm1, %xmm10 + movapd (BO, %rax, 4), %xmm1 + addpd %xmm4, %xmm11 + movddup -11 * SIZE(AO, %rax, 1), %xmm4 + mulpd %xmm2, %xmm3 + mulpd -6 * SIZE(BO, %rax, 4), %xmm2 + addpd %xmm3, %xmm8 + movapd -4 * SIZE(BO, %rax, 4), %xmm3 + addpd %xmm2, %xmm9 + movddup -13 * SIZE(AO, %rax, 1), %xmm2 + mulpd %xmm2, %xmm3 + mulpd -2 * SIZE(BO, %rax, 4), %xmm2 + addpd %xmm3, %xmm10 + movapd 8 * SIZE(BO, %rax, 4), %xmm3 + addpd %xmm2, %xmm11 + movddup -10 * SIZE(AO, %rax, 1), %xmm2 + + addq $4 * SIZE, %rax + BRANCH + jl .L32 + ALIGN_4 + +.L36: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L38 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO + negq %rax + ALIGN_4 + +.L37: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BO, %rax, 4), %xmm0 + addpd %xmm1, %xmm8 + movapd -12 * SIZE(BO, %rax, 4), %xmm1 + addpd %xmm0, %xmm9 + movddup -15 * SIZE(AO, %rax, 1), %xmm0 + + addq $SIZE, %rax + jl .L37 + ALIGN_4 + +.L38: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#ifndef TRMMKERNEL + movsd (CO1), %xmm0 + movhps (CO1, LDC), %xmm0 + movsd (CO2), %xmm1 + movhps (CO2, LDC), %xmm1 +#endif + + mulpd %xmm7, %xmm8 + mulpd %xmm7, %xmm9 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm9 +#endif + + movsd %xmm8, (CO1) + movhps %xmm8, (CO1, LDC) + movsd %xmm9, (CO2) + movhps %xmm9, (CO2, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK +#endif + + movq BO, B + + decq J # j -- + jg .L01 + ALIGN_4 + +.L40: + testq $3, N + je .L999 + + testq $2, N + je .L80 + ALIGN_4 + +.L41: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + movq K, %rax + salq $BASE_SHIFT + 1, %rax + leaq (B, %rax), BB + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + + movddup -16 * SIZE(BO), %xmm1 + movddup -15 * SIZE(BO), %xmm5 + xorps %xmm8, 
%xmm8 + movddup -12 * SIZE(BO), %xmm3 + xorps %xmm9, %xmm9 + movapd -16 * SIZE(AO), %xmm0 + xorps %xmm12, %xmm12 + movapd -8 * SIZE(AO), %xmm4 + xorps %xmm13, %xmm13 + prefetchw 3 * SIZE(CO1) + movapd %xmm0, %xmm2 + prefetchw 3 * SIZE(CO2) + + prefetch -16 * SIZE(BB) + subq $-8 * SIZE, BB + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO + negq %rax + NOBRANCH + je .L56 + ALIGN_4 + +.L52: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm12 + movddup -14 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm5, %xmm2 + mulpd -14 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm9 + addpd %xmm5, %xmm13 + movddup -13 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm0, %xmm2 + mulpd %xmm1, %xmm0 + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd (AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm12 + movddup -8 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm5, %xmm2 + mulpd -10 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm9 + addpd %xmm5, %xmm13 + movddup -11 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm4, %xmm2 + mulpd %xmm3, %xmm4 + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 + addpd %xmm4, %xmm8 + movapd -4 * SIZE(AO, %rax, 4), %xmm4 + addpd %xmm3, %xmm12 + movddup -10 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm5, %xmm2 + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm9 + addpd %xmm5, %xmm13 + movddup -9 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm4, %xmm2 + mulpd %xmm3, %xmm4 + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 + addpd %xmm4, %xmm8 + movapd 8 * SIZE(AO, %rax, 4), %xmm4 + addpd %xmm3, %xmm12 + movddup -4 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm5, %xmm2 + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm9 + addpd %xmm5, %xmm13 + movddup -7 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm0, %xmm2 + + addq $4 * SIZE, %rax + BRANCH + jl .L52 + ALIGN_4 + +.L56: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L59 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO + negq %rax + ALIGN_4 + +.L57: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm12 + movddup -14 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm5, %xmm2 + mulpd -14 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm9 + addpd %xmm5, %xmm13 + movddup -13 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm0, %xmm2 + + addq $SIZE, %rax + jl .L57 + ALIGN_4 + +.L59: +#ifndef TRMMKERNEL + movupd (CO1), %xmm0 + movupd 2 * SIZE(CO1), %xmm1 + movupd (CO2), %xmm2 + movupd 2 * SIZE(CO2), %xmm3 +#endif + + mulpd %xmm7, %xmm8 + mulpd %xmm7, %xmm9 + mulpd %xmm7, %xmm12 + mulpd %xmm7, %xmm13 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm12 + addpd %xmm2, %xmm9 + addpd %xmm3, %xmm13 +#endif + + movsd %xmm8, (CO1) + movhps %xmm8, 1 * SIZE(CO1) + movsd %xmm12, 2 * SIZE(CO1) + movhps %xmm12, 3 * SIZE(CO1) + + movsd %xmm9, (CO2) + movhps %xmm9, 1 * SIZE(CO2) + movsd %xmm13, 2 * SIZE(CO2) + movhps %xmm13, 3 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + 
leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + decq I # i -- + jg .L51 + ALIGN_4 + +.L60: + testq $2, M + je .L70 + ALIGN_4 + +.L61: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movapd -12 * SIZE(AO), %xmm2 + xorps %xmm9, %xmm9 + movddup -16 * SIZE(BO), %xmm1 + xorps %xmm10, %xmm10 + movddup -15 * SIZE(BO), %xmm3 + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO + negq %rax + NOBRANCH + je .L66 + ALIGN_4 + +.L62: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movddup -14 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm0, %xmm3 + movapd -14 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm3, %xmm9 + movddup -13 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movddup -12 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm0, %xmm3 + movapd -8 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm3, %xmm11 + movddup -11 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm8 + movddup -10 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm2, %xmm3 + movapd -10 * SIZE(AO, %rax, 2), %xmm2 + addpd %xmm3, %xmm9 + movddup -9 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm10 + movddup -8 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm2, %xmm3 + movapd -4 * SIZE(AO, %rax, 2), %xmm2 + addpd %xmm3, %xmm11 + movddup -7 * SIZE(BO, %rax, 2), %xmm3 + + addq $4 * SIZE, %rax + BRANCH + jl .L62 + ALIGN_4 + +.L66: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L69 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO + negq %rax + ALIGN_4 + +.L67: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movddup -14 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm0, %xmm3 + movapd -14 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm3, %xmm9 + movddup -13 * SIZE(BO, %rax, 2), %xmm3 + + addq $SIZE, %rax + jl .L67 + ALIGN_4 + +.L69: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#ifndef TRMMKERNEL + movupd (CO1), %xmm0 + movupd (CO2), %xmm2 +#endif + + mulpd %xmm7, %xmm8 + mulpd %xmm7, %xmm9 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 + addpd %xmm2, %xmm9 +#endif + + movsd %xmm8, (CO1) + movhps %xmm8, 1 * SIZE(CO1) + movsd %xmm9, (CO2) + movhps %xmm9, 1 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 4 + addq $2 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L70: + testq $1, M + je .L79 + ALIGN_4 + +.L71: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO 
+#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movddup -15 * SIZE(AO), %xmm1 + xorps %xmm9, %xmm9 + movddup -14 * SIZE(AO), %xmm2 + xorps %xmm10, %xmm10 + movddup -13 * SIZE(AO), %xmm3 + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO + negq %rax + NOBRANCH + je .L76 + ALIGN_4 + +.L72: + mulpd -16 * SIZE(BO, %rax, 2), %xmm0 + addpd %xmm0, %xmm8 + movddup -12 * SIZE(AO, %rax, 1), %xmm0 + + mulpd -14 * SIZE(BO, %rax, 2), %xmm1 + addpd %xmm1, %xmm9 + movddup -11 * SIZE(AO, %rax, 1), %xmm1 + + mulpd -12 * SIZE(BO, %rax, 2), %xmm2 + addpd %xmm2, %xmm10 + movddup -10 * SIZE(AO, %rax, 1), %xmm2 + + mulpd -10 * SIZE(BO, %rax, 2), %xmm3 + addpd %xmm3, %xmm11 + movddup -9 * SIZE(AO, %rax, 1), %xmm3 + + addq $4 * SIZE, %rax + BRANCH + jl .L72 + ALIGN_4 + +.L76: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L78 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO + negq %rax + ALIGN_4 + +.L77: + mulpd -16 * SIZE(BO, %rax, 2), %xmm0 + addpd %xmm0, %xmm8 + movddup -15 * SIZE(AO, %rax, 1), %xmm0 + + addq $SIZE, %rax + jl .L77 + ALIGN_4 + +.L78: + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 + addpd %xmm10, %xmm8 + +#ifndef TRMMKERNEL + movsd (CO1), %xmm0 + movhps (CO2), %xmm0 +#endif + + mulpd %xmm7, %xmm8 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 +#endif + + movsd %xmm8, (CO1) + movhps %xmm8, (CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L79: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + movq BO, B + + leaq (C, LDC, 2), C + ALIGN_4 + +.L80: + testq $1, N + je .L999 + ALIGN_4 + +.L81: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 # coffset1 = c + movq A, AO # aoffset = a + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L100 + ALIGN_4 + +.L91: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#endif + + movapd -8 * SIZE(AO), %xmm2 + xorps %xmm8, %xmm8 + movapd -16 * SIZE(AO), %xmm0 + xorps %xmm9, %xmm9 + movddup -16 * SIZE(BO), %xmm1 + xorps %xmm12, %xmm12 + movddup -14 * SIZE(BO), %xmm3 + xorps %xmm13, %xmm13 + movddup -15 * SIZE(BO), %xmm5 + + prefetchw 3 * SIZE(CO1) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO + negq %rax + NOBRANCH + je .L96 + 
ALIGN_4 + +.L92: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm12 + movddup -12 * SIZE(BO, %rax, 1), %xmm1 + mulpd %xmm5, %xmm0 + mulpd -10 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm0, %xmm9 + movapd (AO, %rax, 4), %xmm0 + addpd %xmm5, %xmm13 + movddup -13 * SIZE(BO, %rax, 1), %xmm5 + mulpd %xmm3, %xmm2 + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 + addpd %xmm2, %xmm8 + movapd -4 * SIZE(AO, %rax, 4), %xmm2 + addpd %xmm3, %xmm12 + movddup -10 * SIZE(BO, %rax, 1), %xmm3 + mulpd %xmm5, %xmm2 + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm9 + movapd 8 * SIZE(AO, %rax, 4), %xmm2 + addpd %xmm5, %xmm13 + movddup -11 * SIZE(BO, %rax, 1), %xmm5 + + addq $4 * SIZE, %rax + BRANCH + jl .L92 + ALIGN_4 + +.L96: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L99 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO + negq %rax + ALIGN_4 + +.L97: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm12 + movddup -15 * SIZE(BO, %rax, 1), %xmm1 + + addq $SIZE, %rax + jl .L97 + ALIGN_4 + +.L99: + addpd %xmm9, %xmm8 + addpd %xmm13, %xmm12 + +#ifndef TRMMKERNEL + movupd (CO1), %xmm0 + movupd 2 * SIZE(CO1), %xmm1 +#endif + + mulpd %xmm7, %xmm8 + mulpd %xmm7, %xmm12 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm12 +#endif + + movsd %xmm8, (CO1) + movhps %xmm8, 1 * SIZE(CO1) + movsd %xmm12, 2 * SIZE(CO1) + movhps %xmm12, 3 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L91 + ALIGN_4 + +.L100: + testq $2, M + je .L110 + ALIGN_4 + +.L101: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + + movddup -16 * SIZE(BO), %xmm0 + xorps %xmm8, %xmm8 + movddup -15 * SIZE(BO), %xmm1 + xorps %xmm9, %xmm9 + movddup -14 * SIZE(BO), %xmm2 + xorps %xmm10, %xmm10 + movddup -13 * SIZE(BO), %xmm3 + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO + negq %rax + NOBRANCH + je .L106 + ALIGN_4 + +.L102: + mulpd -16 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm0, %xmm8 + movddup -12 * SIZE(BO, %rax, 1), %xmm0 + + mulpd -14 * SIZE(AO, %rax, 2), %xmm1 + addpd %xmm1, %xmm9 + movddup -11 * SIZE(BO, %rax, 1), %xmm1 + + mulpd -12 * SIZE(AO, %rax, 2), %xmm2 + addpd %xmm2, %xmm10 + movddup -10 * SIZE(BO, %rax, 1), %xmm2 + + mulpd -10 * SIZE(AO, %rax, 2), %xmm3 + addpd %xmm3, %xmm11 + movddup -9 * SIZE(BO, %rax, 1), %xmm3 + + addq $4 * SIZE, %rax + BRANCH + jl .L102 + ALIGN_4 + +.L106: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq 
KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L109 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO + negq %rax + ALIGN_4 + +.L107: + movddup -16 * SIZE(BO, %rax, 1), %xmm0 + mulpd -16 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm0, %xmm8 + + addq $SIZE, %rax + jl .L107 + ALIGN_4 + +.L109: + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 + addpd %xmm10, %xmm8 + +#ifndef TRMMKERNEL + movupd (CO1), %xmm0 +#endif + + mulpd %xmm7, %xmm8 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 +#endif + + movsd %xmm8, (CO1) + movhps %xmm8, 1 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 4 + + ALIGN_4 + +.L110: + testq $1, M + je .L999 + ALIGN_4 + +.L111: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movapd -14 * SIZE(AO), %xmm1 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO + negq %rax + NOBRANCH + je .L116 + ALIGN_4 + +.L112: + mulpd -16 * SIZE(BO, %rax, 1), %xmm0 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO, %rax, 1), %xmm0 + + mulpd -14 * SIZE(BO, %rax, 1), %xmm1 + addpd %xmm1, %xmm9 + movapd -10 * SIZE(AO, %rax, 1), %xmm1 + + addq $4 * SIZE, %rax + BRANCH + jl .L112 + ALIGN_4 + +.L116: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L118 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO + negq %rax + ALIGN_4 + +.L117: + mulsd -16 * SIZE(BO, %rax, 1), %xmm0 + addsd %xmm0, %xmm8 + movsd -15 * SIZE(AO, %rax, 1), %xmm0 + + addq $SIZE, %rax + jl .L117 + ALIGN_4 + +.L118: + addpd %xmm9, %xmm8 + haddpd %xmm8, %xmm8 + +#ifndef TRMMKERNEL + movsd (CO1), %xmm0 +#endif + + mulsd %xmm7, %xmm8 + +#ifndef TRMMKERNEL + addsd %xmm0, %xmm8 +#endif + + movsd %xmm8, (CO1) + ALIGN_4 + +.L999: + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/gemm_kernel_4x4_core2.S b/kernel/x86_64/gemm_kernel_4x4_core2.S new file mode 100644 index 0000000000..fa79fe0c56 --- /dev/null +++ b/kernel/x86_64/gemm_kernel_4x4_core2.S @@ -0,0 +1,2221 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. 
*/ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define ALPHA 0(%rsp) +#define J 16(%rsp) +#define OFFSET 24(%rsp) +#define KK 32(%rsp) +#define KKK 40(%rsp) +#define BUFFER 128(%rsp) + +#define PREFETCH_R (8 * 4 + 0) +#define PREFETCH_W (PREFETCH_R * 4) + +#define PREFETCHSIZE (8 * 13 + 5) +#define PREFETCH prefetcht0 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + movaps %xmm3, %xmm0 + +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + +#endif + + movq 
%rsp, %r15 # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + unpcklpd %xmm0, %xmm0 + movapd %xmm0, ALPHA + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + movq OLD_M, M + movq OLD_N, N + + leaq (, LDC, SIZE), LDC + +#ifdef TRMMKERNEL + movsd %xmm12, OFFSET + movsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq N, J + sarq $2, J + NOBRANCH + jle .L40 + ALIGN_4 + +.L01: +/* Copying to Sub Buffer */ + leaq 16 * SIZE + BUFFER, BO + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movapd -16 * SIZE(B), %xmm0 + movapd -8 * SIZE(B), %xmm4 + + movq K, %rax + sarq $2, %rax + NOBRANCH + jle .L05 + ALIGN_3 + +.L02: + prefetcht0 (PREFETCH_R + 0) * SIZE(B) + prefetcht0 (PREFETCH_R + 8) * SIZE(B) + + movapd -14 * SIZE(B), %xmm1 + movapd -12 * SIZE(B), %xmm2 + movapd -10 * SIZE(B), %xmm3 + movapd -6 * SIZE(B), %xmm5 + movapd -4 * SIZE(B), %xmm6 + movapd -2 * SIZE(B), %xmm7 + + movddup %xmm0, %xmm8 + movapd %xmm8, -16 * SIZE(BO) + unpckhpd %xmm0, %xmm0 + movapd %xmm0, -14 * SIZE(BO) + movapd 0 * SIZE(B), %xmm0 + + prefetcht0 (PREFETCH_W + 0) * SIZE(BO) + movddup %xmm1, %xmm9 + movapd %xmm9, -12 * SIZE(BO) + unpckhpd %xmm1, %xmm1 + movapd %xmm1, -10 * SIZE(BO) + movddup %xmm2, %xmm10 + movapd %xmm10, -8 * SIZE(BO) + + prefetcht0 (PREFETCH_W + 8) * SIZE(BO) + unpckhpd %xmm2, %xmm2 + movapd %xmm2, -6 * SIZE(BO) + movddup %xmm3, %xmm11 + movapd %xmm11, -4 * SIZE(BO) + unpckhpd %xmm3, %xmm3 + movapd %xmm3, -2 * SIZE(BO) + + prefetcht0 (PREFETCH_W + 16) * SIZE(BO) + + movddup %xmm4, %xmm12 + movapd %xmm12, 0 * SIZE(BO) + unpckhpd %xmm4, %xmm4 + movapd %xmm4, 2 * SIZE(BO) + movapd 8 * SIZE(B), %xmm4 + movddup %xmm5, %xmm13 + movapd %xmm13, 4 * SIZE(BO) + unpckhpd %xmm5, %xmm5 + movapd %xmm5, 6 * SIZE(BO) + + prefetcht0 (PREFETCH_W + 24) * SIZE(BO) + movddup %xmm6, %xmm14 + movapd %xmm14, 8 * SIZE(BO) + unpckhpd %xmm6, %xmm6 + movapd %xmm6, 10 * SIZE(BO) + movddup %xmm7, %xmm15 + movapd %xmm15, 12 * SIZE(BO) + unpckhpd %xmm7, %xmm7 + movapd %xmm7, 14 * SIZE(BO) + + subq $-32 * SIZE, BO + subq $-16 * SIZE, B + decq %rax + BRANCH + jne .L02 + ALIGN_3 + +.L05: + movq K, %rax + andq $3, %rax + BRANCH + BRANCH + jle .L10 + ALIGN_3 + +.L06: + movapd -14 * SIZE(B), %xmm1 + + movddup %xmm0, %xmm8 + unpckhpd %xmm0, %xmm0 + movddup %xmm1, %xmm9 + unpckhpd %xmm1, %xmm1 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm0, -14 * SIZE(BO) + movapd -12 * SIZE(B), %xmm0 + + movapd %xmm9, -12 * SIZE(BO) + movapd %xmm1, -10 * SIZE(BO) + + addq $4 * SIZE, B + addq $8 * SIZE, BO + decq %rax + BRANCH + jne .L06 + ALIGN_4 + +.L10: + leaq (PREFETCH_R + 0) * SIZE(B), BB + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + movq M, I + sarq $2, I # i = (m >> 2) + NOBRANCH + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 19 * SIZE + BUFFER, BO +#else + leaq 19 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + + prefetcht2 (BB) + + movaps -16 * SIZE(AO), %xmm0 + movaps -14 * SIZE(AO), %xmm1 + movaps -19 * SIZE(BO), %xmm6 + movaps -17 * SIZE(BO), %xmm7 + + pxor %xmm2, %xmm2 + prefetcht0 3 * SIZE(CO1) + pxor %xmm3, %xmm3 + pxor %xmm4, %xmm4 + prefetcht0 7 * SIZE(CO2) + pxor %xmm5, %xmm5 + + movapd %xmm2, %xmm8 + movapd %xmm2, %xmm9 + movapd %xmm2, %xmm10 + 
prefetcht0 3 * SIZE(CO1, LDC, 2) + movapd %xmm2, %xmm11 + + movapd %xmm2, %xmm12 + movapd %xmm2, %xmm13 + prefetcht0 7 * SIZE(CO2, LDC, 2) + movapd %xmm2, %xmm14 + movapd %xmm2, %xmm15 + subq $-16 * SIZE, BB + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L15 + ALIGN_4 + +.L12: + PADDING; + addpd %xmm2, %xmm10 + movaps -15 * SIZE(BO), %xmm2 + PADDING; + addpd %xmm3, %xmm14 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movaps %xmm6, %xmm3 + mulpd %xmm0, %xmm6 + mulpd %xmm1, %xmm3 + + addpd %xmm4, %xmm11 + movaps -13 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm15 + movaps %xmm7, %xmm5 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm5 + + addpd %xmm6, %xmm8 + movaps -11 * SIZE(BO), %xmm6 + addpd %xmm3, %xmm12 + movaps %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm7, %xmm9 + movaps -9 * SIZE(BO), %xmm7 + addpd %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + movaps -10 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm10 + movaps -7 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm14 + movaps %xmm6, %xmm3 + mulpd %xmm0, %xmm6 + mulpd %xmm1, %xmm3 + + addpd %xmm4, %xmm11 + movaps -5 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm15 + movaps %xmm7, %xmm5 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm5 + + addpd %xmm6, %xmm8 + movaps -3 * SIZE(BO), %xmm6 + addpd %xmm3, %xmm12 + movaps %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm7, %xmm9 + movaps -1 * SIZE(BO), %xmm7 + addpd %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + movaps -6 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm10 + movaps 1 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm14 + movaps %xmm6, %xmm3 + mulpd %xmm0, %xmm6 + mulpd %xmm1, %xmm3 + + addpd %xmm4, %xmm11 + movaps 3 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm15 + PADDING + movaps %xmm7, %xmm5 + mulpd %xmm1, %xmm5 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm0, %xmm7 + + addpd %xmm6, %xmm8 + movaps 5 * SIZE(BO), %xmm6 + addpd %xmm3, %xmm12 + movaps %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm7, %xmm9 + movaps 7 * SIZE(BO), %xmm7 + addpd %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movaps -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + movaps -2 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm10 + movaps 9 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm14 + movaps %xmm6, %xmm3 + mulpd %xmm0, %xmm6 + mulpd %xmm1, %xmm3 + + addpd %xmm4, %xmm11 + subq $-16 * SIZE, AO + movaps 11 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm15 + movaps %xmm7, %xmm5 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm5 + + addpd %xmm6, %xmm8 + movaps 13 * SIZE(BO), %xmm6 + addpd %xmm3, %xmm12 + movaps %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm7, %xmm9 + movaps 15 * SIZE(BO), %xmm7 + addpd %xmm5, %xmm13 + subq $-32 * SIZE, BO + movaps %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movaps -16 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + movaps -14 * SIZE(AO), %xmm1 + + subq $1, %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: + prefetcht2 -8 * SIZE(BB) + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + addpd %xmm2, %xmm10 + movaps -15 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm14 + movaps %xmm6, %xmm3 + mulpd %xmm0, %xmm6 + mulpd %xmm1, %xmm3 + + addpd %xmm4, %xmm11 
+ movaps -13 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm15 + movaps %xmm7, %xmm5 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm5 + + addpd %xmm6, %xmm8 + movaps -11 * SIZE(BO), %xmm6 + addpd %xmm3, %xmm12 + addq $4 * SIZE, AO + movaps %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm7, %xmm9 + movaps -9 * SIZE(BO), %xmm7 + addpd %xmm5, %xmm13 + addq $8 * SIZE, BO + movaps %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movaps -16 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + movaps -14 * SIZE(AO), %xmm1 + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_3 + +.L18: + movddup ALPHA, %xmm7 + + addpd %xmm2, %xmm10 + mulpd %xmm7, %xmm8 + addpd %xmm3, %xmm14 + mulpd %xmm7, %xmm12 + addpd %xmm4, %xmm11 + mulpd %xmm7, %xmm9 + addpd %xmm5, %xmm15 + mulpd %xmm7, %xmm13 + + mulpd %xmm7, %xmm10 + mulpd %xmm7, %xmm14 + mulpd %xmm7, %xmm11 + mulpd %xmm7, %xmm15 + + movq CO1, %rax + orq LDC, %rax + testq $15, %rax + NOBRANCH + jne .L18x + +#ifndef TRMMKERNEL + addpd 0 * SIZE(CO1), %xmm8 + addpd 2 * SIZE(CO1), %xmm12 + addpd 0 * SIZE(CO2), %xmm9 + addpd 2 * SIZE(CO2), %xmm13 + + addpd 0 * SIZE(CO1, LDC, 2), %xmm10 + addpd 2 * SIZE(CO1, LDC, 2), %xmm14 + addpd 0 * SIZE(CO2, LDC, 2), %xmm11 + addpd 2 * SIZE(CO2, LDC, 2), %xmm15 +#endif + + movapd %xmm8, 0 * SIZE(CO1) + movapd %xmm12, 2 * SIZE(CO1) + movapd %xmm9, 0 * SIZE(CO2) + movapd %xmm13, 2 * SIZE(CO2) + + movapd %xmm10, 0 * SIZE(CO1, LDC, 2) + movapd %xmm14, 2 * SIZE(CO1, LDC, 2) + movapd %xmm11, 0 * SIZE(CO2, LDC, 2) + movapd %xmm15, 2 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + decq I # i -- + BRANCH + jg .L11 + jmp .L20 + ALIGN_4 + +.L18x: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 + movsd 0 * SIZE(CO2), %xmm2 + movhpd 1 * SIZE(CO2), %xmm2 + movsd 2 * SIZE(CO2), %xmm3 + movhpd 3 * SIZE(CO2), %xmm3 + + movsd 0 * SIZE(CO1, LDC, 2), %xmm4 + movhpd 1 * SIZE(CO1, LDC, 2), %xmm4 + movsd 2 * SIZE(CO1, LDC, 2), %xmm5 + movhpd 3 * SIZE(CO1, LDC, 2), %xmm5 + movsd 0 * SIZE(CO2, LDC, 2), %xmm6 + movhpd 1 * SIZE(CO2, LDC, 2), %xmm6 + movsd 2 * SIZE(CO2, LDC, 2), %xmm7 + movhpd 3 * SIZE(CO2, LDC, 2), %xmm7 + + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm12 + addpd %xmm2, %xmm9 + addpd %xmm3, %xmm13 + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm14 + addpd %xmm6, %xmm11 + addpd %xmm7, %xmm15 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm12, 2 * SIZE(CO1) + movhpd %xmm12, 3 * SIZE(CO1) + movsd %xmm9, 0 * SIZE(CO2) + movhpd %xmm9, 1 * SIZE(CO2) + movsd %xmm13, 2 * SIZE(CO2) + movhpd %xmm13, 3 * SIZE(CO2) + + movsd %xmm10, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm10, 1 * SIZE(CO1, LDC, 2) + movsd %xmm14, 2 * SIZE(CO1, LDC, 2) + movhpd %xmm14, 3 * SIZE(CO1, LDC, 2) + movsd %xmm11, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm11, 1 * SIZE(CO2, LDC, 2) + movsd %xmm15, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 3 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + 
addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + decq I # i -- + BRANCH + jg .L11 + ALIGN_4 + +.L20: + testq $2, M + BRANCH + jle .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 16 * SIZE + BUFFER, BO +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + + pxor %xmm8, %xmm8 + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + + movapd %xmm8, %xmm2 + movapd %xmm9, %xmm3 + movapd %xmm10, %xmm4 + movapd %xmm11, %xmm5 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_4 + +.L21: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm2, %xmm8 + movapd -16 * SIZE(BO), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm3, %xmm9 + movapd -14 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm4, %xmm10 + movapd -12 * SIZE(BO), %xmm4 + mulpd %xmm0, %xmm4 + addpd %xmm5, %xmm11 + movapd -10 * SIZE(BO), %xmm5 + mulpd %xmm0, %xmm5 + movapd -14 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm8 + movapd -8 * SIZE(BO), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm3, %xmm9 + movapd -6 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm4, %xmm10 + movapd -4 * SIZE(BO), %xmm4 + mulpd %xmm0, %xmm4 + addpd %xmm5, %xmm11 + movapd -2 * SIZE(BO), %xmm5 + mulpd %xmm0, %xmm5 + movapd -12 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm8 + movapd 0 * SIZE(BO), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm3, %xmm9 + movapd 2 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm4, %xmm10 + movapd 4 * SIZE(BO), %xmm4 + mulpd %xmm0, %xmm4 + addpd %xmm5, %xmm11 + movapd 6 * SIZE(BO), %xmm5 + mulpd %xmm0, %xmm5 + movapd -10 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm8 + movapd 8 * SIZE(BO), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm3, %xmm9 + movapd 10 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm4, %xmm10 + movapd 12 * SIZE(BO), %xmm4 + mulpd %xmm0, %xmm4 + addpd %xmm5, %xmm11 + movapd 14 * SIZE(BO), %xmm5 + mulpd %xmm0, %xmm5 + movapd -8 * SIZE(AO), %xmm0 + + subq $ -8 * SIZE, AO + subq $-32 * SIZE, BO + subq $1, %rax + BRANCH + jg .L21 + ALIGN_4 + +.L25: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + addpd %xmm2, %xmm8 + movapd -16 * SIZE(BO), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm3, %xmm9 + movapd -14 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm4, %xmm10 + movapd -12 * SIZE(BO), %xmm4 + mulpd %xmm0, %xmm4 + addpd %xmm5, %xmm11 + movapd -10 * SIZE(BO), %xmm5 + mulpd %xmm0, %xmm5 + movapd -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L26 + ALIGN_4 + +.L28: + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm11 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm2 + movhpd 1 * SIZE(CO2), %xmm2 + + movsd 0 * SIZE(CO1, LDC, 2), %xmm4 + movhpd 1 * SIZE(CO1, LDC, 2), %xmm4 + movsd 0 * SIZE(CO2, LDC, 2), %xmm6 + movhpd 1 * SIZE(CO2, LDC, 2), %xmm6 +#endif + + mulpd %xmm7, %xmm8 + mulpd %xmm7, %xmm9 + mulpd %xmm7, %xmm10 + mulpd %xmm7, %xmm11 + +#ifndef 
TRMMKERNEL + addpd %xmm0, %xmm8 + addpd %xmm2, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm6, %xmm11 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 0 * SIZE(CO2) + movhpd %xmm9, 1 * SIZE(CO2) + + movsd %xmm10, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm10, 1 * SIZE(CO1, LDC, 2) + movsd %xmm11, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm11, 1 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 4 + addq $2 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L30: + testq $1, M + BRANCH + jle .L39 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 16 * SIZE + BUFFER, BO +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + + pxor %xmm8, %xmm8 + movsd -16 * SIZE(AO), %xmm0 + + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + + movapd %xmm8, %xmm2 + movapd %xmm9, %xmm3 + movapd %xmm10, %xmm4 + movapd %xmm11, %xmm5 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L35 + ALIGN_4 + +.L31: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addsd %xmm2, %xmm8 + movsd -16 * SIZE(BO), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm3, %xmm9 + movsd -14 * SIZE(BO), %xmm3 + mulsd %xmm0, %xmm3 + addsd %xmm4, %xmm10 + movsd -12 * SIZE(BO), %xmm4 + mulsd %xmm0, %xmm4 + addsd %xmm5, %xmm11 + movsd -10 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + movsd -15 * SIZE(AO), %xmm0 + + addsd %xmm2, %xmm8 + movsd -8 * SIZE(BO), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm3, %xmm9 + movsd -6 * SIZE(BO), %xmm3 + mulsd %xmm0, %xmm3 + addsd %xmm4, %xmm10 + movsd -4 * SIZE(BO), %xmm4 + mulsd %xmm0, %xmm4 + addsd %xmm5, %xmm11 + movsd -2 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + movsd -14 * SIZE(AO), %xmm0 + + addsd %xmm2, %xmm8 + movsd 0 * SIZE(BO), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm3, %xmm9 + movsd 2 * SIZE(BO), %xmm3 + mulsd %xmm0, %xmm3 + addsd %xmm4, %xmm10 + movsd 4 * SIZE(BO), %xmm4 + mulsd %xmm0, %xmm4 + addsd %xmm5, %xmm11 + movsd 6 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + movsd -13 * SIZE(AO), %xmm0 + + addsd %xmm2, %xmm8 + movsd 8 * SIZE(BO), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm3, %xmm9 + movsd 10 * SIZE(BO), %xmm3 + mulsd %xmm0, %xmm3 + addsd %xmm4, %xmm10 + movsd 12 * SIZE(BO), %xmm4 + mulsd %xmm0, %xmm4 + addsd %xmm5, %xmm11 + movsd 14 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + movsd -12 * SIZE(AO), %xmm0 + + subq $ -4 * SIZE, AO + subq $-32 * SIZE, BO + subq $1, %rax + BRANCH + jg .L31 + ALIGN_4 + +.L35: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + addsd %xmm2, %xmm8 + movsd -16 * SIZE(BO), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm3, %xmm9 + movsd -14 * SIZE(BO), %xmm3 + mulsd %xmm0, %xmm3 + addsd %xmm4, %xmm10 + movsd -12 * SIZE(BO), %xmm4 + mulsd %xmm0, %xmm4 + addsd %xmm5, %xmm11 + movsd -10 * SIZE(BO), %xmm5 + mulsd %xmm0, 
%xmm5 + movsd -15 * SIZE(AO), %xmm0 + + addq $1 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L36 + ALIGN_4 + +.L38: + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + addsd %xmm4, %xmm10 + addsd %xmm5, %xmm11 + + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm2 + movsd 0 * SIZE(CO1, LDC, 2), %xmm4 + movsd 0 * SIZE(CO2, LDC, 2), %xmm6 +#endif + + mulsd %xmm7, %xmm8 + mulsd %xmm7, %xmm9 + mulsd %xmm7, %xmm10 + mulsd %xmm7, %xmm11 + +#ifndef TRMMKERNEL + addsd %xmm0, %xmm8 + addsd %xmm2, %xmm9 + addsd %xmm4, %xmm10 + addsd %xmm6, %xmm11 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movsd %xmm9, 0 * SIZE(CO2) + movsd %xmm10, 0 * SIZE(CO1, LDC, 2) + movsd %xmm11, 0 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK +#endif + + leaq (C, LDC, 4), C + subq $1, J + BRANCH + jg .L01 + ALIGN_4 + +.L40: + testq $2, N + BRANCH + jle .L80 + ALIGN_4 + +.L41: +/* Copying to Sub Buffer */ + leaq BUFFER, BO + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq K, %rax + sarq $3, %rax + jle .L43 + + addq %rax, %rax + ALIGN_4 + +.L42: + movddup -16 * SIZE(B), %xmm8 + movddup -15 * SIZE(B), %xmm9 + movddup -14 * SIZE(B), %xmm10 + movddup -13 * SIZE(B), %xmm11 + movddup -12 * SIZE(B), %xmm12 + movddup -11 * SIZE(B), %xmm13 + movddup -10 * SIZE(B), %xmm14 + movddup -9 * SIZE(B), %xmm15 + + movapd %xmm8, 0 * SIZE(BO) + movapd %xmm9, 2 * SIZE(BO) + movapd %xmm10, 4 * SIZE(BO) + movapd %xmm11, 6 * SIZE(BO) + movapd %xmm12, 8 * SIZE(BO) + movapd %xmm13, 10 * SIZE(BO) + movapd %xmm14, 12 * SIZE(BO) + movapd %xmm15, 14 * SIZE(BO) + + addq $8 * SIZE, B + addq $16 * SIZE, BO + + subq $1, %rax + jne .L42 + ALIGN_4 + +.L43: + movq K, %rax + andq $7, %rax + BRANCH + jle .L45 + ALIGN_4 + +.L44: + movddup -16 * SIZE(B), %xmm8 + movddup -15 * SIZE(B), %xmm9 + + movapd %xmm8, 0 * SIZE(BO) + movapd %xmm9, 2 * SIZE(BO) + + addq $2 * SIZE, B + addq $4 * SIZE, BO + subq $1, %rax + jne .L44 + ALIGN_4 + +.L45: + movq C, CO1 + leaq (C, LDC, 1), CO2 + movq A, AO # aoffset = a + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L50: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 16 * SIZE + BUFFER, BO +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + + prefetcht0 3 * SIZE(CO1) + pxor %xmm12, %xmm12 + prefetcht0 3 * SIZE(CO2) + pxor %xmm13, %xmm13 + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + + movapd %xmm8, %xmm2 + movapd %xmm8, %xmm3 + movapd %xmm8, %xmm4 + movapd %xmm8, %xmm5 + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L55 + ALIGN_4 + +.L51: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm2, %xmm8 + movapd -16 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm12 + movapd 
%xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm9 + movapd -14 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm13 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movapd -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + movapd -10 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm8 + movapd -12 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm12 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm9 + movapd -10 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm13 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movapd -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + movapd -6 * SIZE(AO), %xmm1 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + + addpd %xmm2, %xmm8 + movapd -8 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm12 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm9 + movapd -6 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm13 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movapd -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + movapd -2 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm8 + movapd -4 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm12 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm9 + movapd -2 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm13 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movapd 0 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + movapd 2 * SIZE(AO), %xmm1 + + subq $-16 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jg .L51 + ALIGN_4 + +.L55: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L58 + ALIGN_4 + +.L56: + addpd %xmm2, %xmm8 + movapd -16 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm12 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm4, %xmm9 + movapd -14 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm13 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movapd -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + movapd -10 * SIZE(AO), %xmm1 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L56 + ALIGN_4 + +.L58: + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + addpd %xmm4, %xmm9 + addpd %xmm5, %xmm13 + + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 + movsd 0 * SIZE(CO2), %xmm2 + movhpd 1 * SIZE(CO2), %xmm2 + movsd 2 * SIZE(CO2), %xmm3 + movhpd 3 * SIZE(CO2), %xmm3 +#endif + + mulpd %xmm7, %xmm8 + mulpd %xmm7, %xmm9 + mulpd %xmm7, %xmm12 + mulpd %xmm7, %xmm13 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm12 + addpd %xmm2, %xmm9 + addpd %xmm3, %xmm13 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm12, 2 * SIZE(CO1) + movhpd %xmm12, 3 * SIZE(CO1) + movsd %xmm9, 0 * SIZE(CO2) + movhpd %xmm9, 1 * SIZE(CO2) + movsd %xmm13, 2 * SIZE(CO2) + movhpd %xmm13, 3 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + subq $1, I + jg .L50 + ALIGN_4 + +.L60: + testq $2, M + jle .L70 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 16 * SIZE + BUFFER, BO +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + pxor %xmm8, 
%xmm8 + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm9, %xmm9 + movapd -14 * SIZE(AO), %xmm1 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + + movapd %xmm8, %xmm2 + movapd %xmm8, %xmm3 + movapd %xmm8, %xmm4 + movapd %xmm8, %xmm5 + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L65 + ALIGN_4 + +.L61: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm2, %xmm8 + movapd -16 * SIZE(BO), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm3, %xmm9 + movapd -14 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + movapd -12 * SIZE(AO), %xmm0 + addpd %xmm4, %xmm10 + movapd -12 * SIZE(BO), %xmm4 + mulpd %xmm1, %xmm4 + addpd %xmm5, %xmm11 + movapd -10 * SIZE(BO), %xmm5 + mulpd %xmm1, %xmm5 + movapd -10 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm8 + movapd -8 * SIZE(BO), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm3, %xmm9 + movapd -6 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + movapd -8 * SIZE(AO), %xmm0 + addpd %xmm4, %xmm10 + movapd -4 * SIZE(BO), %xmm4 + mulpd %xmm1, %xmm4 + addpd %xmm5, %xmm11 + movapd -2 * SIZE(BO), %xmm5 + mulpd %xmm1, %xmm5 + movapd -6 * SIZE(AO), %xmm1 + + subq $ -8 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jg .L61 + ALIGN_4 + +.L65: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L68 + ALIGN_4 + +.L66: + addpd %xmm2, %xmm8 + movapd -16 * SIZE(BO), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm3, %xmm9 + movapd -14 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + movapd -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L66 + ALIGN_4 + +.L68: + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm11 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm2 + movhpd 1 * SIZE(CO2), %xmm2 +#endif + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + + mulpd %xmm7, %xmm8 + mulpd %xmm7, %xmm9 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 + addpd %xmm2, %xmm9 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 0 * SIZE(CO2) + movhpd %xmm9, 1 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 + ALIGN_4 + +.L70: + testq $1, M + jle .L79 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 16 * SIZE + BUFFER, BO +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + movsd -16 * SIZE(AO), %xmm0 + movsd -15 * SIZE(AO), %xmm1 + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + + movapd %xmm8, %xmm2 + movapd %xmm8, %xmm3 + movapd %xmm8, %xmm4 + movapd %xmm8, %xmm5 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq 
%rax, KKK +#endif + sarq $2, %rax + jle .L75 + ALIGN_4 + +.L71: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addsd %xmm2, %xmm8 + movsd -16 * SIZE(BO), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm3, %xmm9 + movsd -14 * SIZE(BO), %xmm3 + mulsd %xmm0, %xmm3 + movsd -14 * SIZE(AO), %xmm0 + addsd %xmm4, %xmm10 + movsd -12 * SIZE(BO), %xmm4 + mulsd %xmm1, %xmm4 + addsd %xmm5, %xmm11 + movsd -10 * SIZE(BO), %xmm5 + mulsd %xmm1, %xmm5 + movsd -13 * SIZE(AO), %xmm1 + + addsd %xmm2, %xmm8 + movsd -8 * SIZE(BO), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm3, %xmm9 + movsd -6 * SIZE(BO), %xmm3 + mulsd %xmm0, %xmm3 + movsd -12 * SIZE(AO), %xmm0 + addsd %xmm4, %xmm10 + movsd -4 * SIZE(BO), %xmm4 + mulsd %xmm1, %xmm4 + addsd %xmm5, %xmm11 + movsd -2 * SIZE(BO), %xmm5 + mulsd %xmm1, %xmm5 + movsd -11 * SIZE(AO), %xmm1 + + subq $ -4 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jg .L71 + ALIGN_4 + +.L75: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L78 + ALIGN_4 + +.L76: + addsd %xmm2, %xmm8 + movsd -16 * SIZE(BO), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm3, %xmm9 + movsd -14 * SIZE(BO), %xmm3 + mulsd %xmm0, %xmm3 + movsd -15 * SIZE(AO), %xmm0 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L76 + ALIGN_4 + +.L78: + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + addsd %xmm4, %xmm10 + addsd %xmm5, %xmm11 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm2 +#endif + + addsd %xmm10, %xmm8 + addsd %xmm11, %xmm9 + + mulsd %xmm7, %xmm8 + mulsd %xmm7, %xmm9 + +#ifndef TRMMKERNEL + addsd %xmm0, %xmm8 + addsd %xmm2, %xmm9 +#endif + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movsd %xmm9, 0 * SIZE(CO2) + ALIGN_4 + +.L79: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + leaq (C, LDC, 2), C + ALIGN_4 + +.L80: + testq $1, N + BRANCH + jle .L999 + ALIGN_4 + +.L81: +/* Copying to Sub Buffer */ + leaq BUFFER, BO + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq K, %rax + sarq $4, %rax + jle .L83 + + addq %rax, %rax + ALIGN_4 + +.L82: + movddup -16 * SIZE(B), %xmm8 + movddup -15 * SIZE(B), %xmm9 + movddup -14 * SIZE(B), %xmm10 + movddup -13 * SIZE(B), %xmm11 + movddup -12 * SIZE(B), %xmm12 + movddup -11 * SIZE(B), %xmm13 + movddup -10 * SIZE(B), %xmm14 + movddup -9 * SIZE(B), %xmm15 + + movapd %xmm8, 0 * SIZE(BO) + movapd %xmm9, 2 * SIZE(BO) + movapd %xmm10, 4 * SIZE(BO) + movapd %xmm11, 6 * SIZE(BO) + movapd %xmm12, 8 * SIZE(BO) + movapd %xmm13, 10 * SIZE(BO) + movapd %xmm14, 12 * SIZE(BO) + movapd %xmm15, 14 * SIZE(BO) + + addq $ 8 * SIZE, B + subq $-16 * SIZE, BO + subq $1, %rax + jne .L82 + ALIGN_4 + +.L83: + movq K, %rax + andq $15, %rax + BRANCH + jle .L85 + ALIGN_4 + +.L84: + movddup -16 * SIZE(B), %xmm8 + + movapd %xmm8, 0 * SIZE(BO) + + addq $1 * SIZE, B + addq $2 * SIZE, BO + subq $1, %rax + jne .L84 + ALIGN_4 + +.L85: + movq C, CO1 + movq A, AO + + movq M, I + sarq $2, I + jle .L100 + ALIGN_4 + +.L90: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 16 * SIZE + BUFFER, BO +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, 
%rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + movapd -16 * SIZE(BO), %xmm4 + pxor %xmm9, %xmm9 + movapd -14 * SIZE(BO), %xmm5 + pxor %xmm12, %xmm12 + movapd -12 * SIZE(BO), %xmm6 + pxor %xmm13, %xmm13 + movapd -10 * SIZE(BO), %xmm7 + + movapd %xmm8, %xmm0 + prefetcht0 3 * SIZE(CO1) + movapd %xmm8, %xmm1 + movapd %xmm8, %xmm2 + movapd %xmm8, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L95 + ALIGN_4 + +.L91: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm0, %xmm8 + movapd -16 * SIZE(AO), %xmm0 + mulpd %xmm4, %xmm0 + addpd %xmm1, %xmm12 + movapd -14 * SIZE(AO), %xmm1 + mulpd %xmm4, %xmm1 + movapd -8 * SIZE(BO), %xmm4 + addpd %xmm2, %xmm9 + movapd -12 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + addpd %xmm3, %xmm13 + movapd -10 * SIZE(AO), %xmm3 + mulpd %xmm5, %xmm3 + movapd -6 * SIZE(BO), %xmm5 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + addpd %xmm0, %xmm8 + movapd -8 * SIZE(AO), %xmm0 + mulpd %xmm6, %xmm0 + addpd %xmm1, %xmm12 + movapd -6 * SIZE(AO), %xmm1 + mulpd %xmm6, %xmm1 + movapd -4 * SIZE(BO), %xmm6 + addpd %xmm2, %xmm9 + movapd -4 * SIZE(AO), %xmm2 + mulpd %xmm7, %xmm2 + addpd %xmm3, %xmm13 + movapd -2 * SIZE(AO), %xmm3 + mulpd %xmm7, %xmm3 + movapd -2 * SIZE(BO), %xmm7 + + subq $-16 * SIZE, AO + subq $ -8 * SIZE, BO + subq $1, %rax + jg .L91 + ALIGN_4 + +.L95: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L98 + ALIGN_4 + +.L96: + addpd %xmm0, %xmm8 + movapd -16 * SIZE(AO), %xmm0 + mulpd %xmm4, %xmm0 + addpd %xmm1, %xmm12 + movapd -14 * SIZE(AO), %xmm1 + mulpd %xmm4, %xmm1 + movapd -14 * SIZE(BO), %xmm4 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + subq $1, %rax + jg .L96 + ALIGN_4 + +.L98: + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm12 + addpd %xmm2, %xmm9 + addpd %xmm3, %xmm13 + + addpd %xmm9, %xmm8 + addpd %xmm13, %xmm12 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 +#endif + + mulpd %xmm7, %xmm8 + mulpd %xmm7, %xmm12 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm12 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm12, 2 * SIZE(CO1) + movhpd %xmm12, 3 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 + subq $1, I + jg .L90 + ALIGN_4 + +.L100: + testq $2, M + jle .L110 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 16 * SIZE + BUFFER, BO +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + movapd -16 * SIZE(BO), %xmm4 + pxor %xmm9, %xmm9 + movapd -14 * SIZE(BO), %xmm5 + pxor %xmm10, %xmm10 + movapd -12 * SIZE(BO), %xmm6 + pxor %xmm11, %xmm11 + movapd -10 * SIZE(BO), %xmm7 + + movapd %xmm8, %xmm0 + movapd %xmm8, %xmm1 
+ movapd %xmm8, %xmm2 + movapd %xmm8, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L105 + ALIGN_4 + +.L101: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm0, %xmm8 + movapd -16 * SIZE(AO), %xmm0 + mulpd %xmm4, %xmm0 + movapd -8 * SIZE(BO), %xmm4 + addpd %xmm1, %xmm9 + movapd -14 * SIZE(AO), %xmm1 + mulpd %xmm5, %xmm1 + movapd -6 * SIZE(BO), %xmm5 + addpd %xmm2, %xmm10 + movapd -12 * SIZE(AO), %xmm2 + mulpd %xmm6, %xmm2 + movapd -4 * SIZE(BO), %xmm6 + addpd %xmm3, %xmm11 + movapd -10 * SIZE(AO), %xmm3 + mulpd %xmm7, %xmm3 + movapd -2 * SIZE(BO), %xmm7 + + subq $-8 * SIZE, AO + subq $-8 * SIZE, BO + subq $1, %rax + jg .L101 + ALIGN_4 + +.L105: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L108 + ALIGN_4 + +.L106: + addpd %xmm0, %xmm8 + movapd -16 * SIZE(AO), %xmm0 + mulpd %xmm4, %xmm0 + movapd -14 * SIZE(BO), %xmm4 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + subq $1, %rax + jg .L106 + ALIGN_4 + +.L108: + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm9 + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm11 + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + + addpd %xmm9, %xmm8 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 +#endif + + mulpd %xmm7, %xmm8 +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 + ALIGN_4 + +.L110: + testq $1, M + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 16 * SIZE + BUFFER, BO +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + movsd -16 * SIZE(BO), %xmm4 + pxor %xmm9, %xmm9 + movsd -14 * SIZE(BO), %xmm5 + pxor %xmm10, %xmm10 + movsd -12 * SIZE(BO), %xmm6 + pxor %xmm11, %xmm11 + movsd -10 * SIZE(BO), %xmm7 + + movapd %xmm8, %xmm0 + movapd %xmm8, %xmm1 + movapd %xmm8, %xmm2 + movapd %xmm8, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L115 + ALIGN_4 + +.L111: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm0, %xmm8 + movsd -16 * SIZE(AO), %xmm0 + mulpd %xmm4, %xmm0 + movsd -8 * SIZE(BO), %xmm4 + addpd %xmm1, %xmm9 + movsd -15 * SIZE(AO), %xmm1 + mulpd %xmm5, %xmm1 + movsd -6 * SIZE(BO), %xmm5 + addpd %xmm2, %xmm10 + movsd -14 * SIZE(AO), %xmm2 + mulpd %xmm6, %xmm2 + movsd -4 * SIZE(BO), %xmm6 + addpd %xmm3, %xmm11 + movsd -13 * SIZE(AO), %xmm3 + mulpd %xmm7, %xmm3 + movsd -2 * SIZE(BO), %xmm7 + + subq $-4 * SIZE, AO + subq $-8 * SIZE, BO + subq $1, %rax + jg .L111 + ALIGN_4 + +.L115: + movddup ALPHA, 
%xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L118 + ALIGN_4 + +.L116: + addsd %xmm0, %xmm8 + movsd -16 * SIZE(AO), %xmm0 + mulsd %xmm4, %xmm0 + movsd -14 * SIZE(BO), %xmm4 + + addq $1 * SIZE, AO + addq $2 * SIZE, BO + subq $1, %rax + jg .L116 + ALIGN_4 + +.L118: + addsd %xmm0, %xmm8 + addsd %xmm1, %xmm9 + addsd %xmm2, %xmm10 + addsd %xmm3, %xmm11 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 +#endif + addsd %xmm10, %xmm8 + addsd %xmm11, %xmm9 + + addsd %xmm9, %xmm8 + + mulsd %xmm7, %xmm8 +#ifndef TRMMKERNEL + addsd %xmm0, %xmm8 +#endif + movsd %xmm8, 0 * SIZE(CO1) + ALIGN_4 + +.L999: + movq %r15, %rsp + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/gemm_kernel_4x4_penryn.S b/kernel/x86_64/gemm_kernel_4x4_penryn.S new file mode 100644 index 0000000000..3179c7db75 --- /dev/null +++ b/kernel/x86_64/gemm_kernel_4x4_penryn.S @@ -0,0 +1,2072 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define BB %r12 + +#define PREA %rdx + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define ALPHA 48(%rsp) +#define J 56(%rsp) +#define OFFSET 64(%rsp) +#define KK 72(%rsp) +#define KKK 80(%rsp) + +#else + +#define STACKSIZE 512 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#define ALPHA 224(%rsp) +#define J 232(%rsp) +#define OFFSET 240(%rsp) +#define KK 248(%rsp) +#define KKK 256(%rsp) + +#endif + +#ifdef NANO +#define PREFETCHSIZE (8 * 2 + 4) +#define PREFETCHW prefetcht0 +#define PREFETCHB prefetcht0 +#endif + +#ifdef DUNNINGTON +#define PREFETCHSIZE (8 * 97 + 4) +#define PREFETCHB prefetcht2 +#endif + +#ifndef PREFETCH +#define PREFETCH prefetcht0 +#endif + +#ifndef PREFETCHW +#define PREFETCHW prefetcht2 +#endif + +#ifndef PREFETCHB +#define PREFETCHB prefetcht0 +#endif + +#ifndef PREFETCHSIZE +#define PREFETCHSIZE (8 * 17 + 4) +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movq OLD_OFFSET, %r11 +#endif + movaps %xmm3, %xmm0 + +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movq OLD_OFFSET, %r11 +#endif + +#endif + + movlps %xmm0, ALPHA + + subq $-16 * SIZE, A + subq $-17 * SIZE, B + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + leaq (, LDC, SIZE), LDC + +#ifdef TRMMKERNEL + movq %r11, OFFSET +#ifndef LEFT + negq %r11 +#endif + movq %r11, KK +#endif + + movq N, J + sarq $2, J + NOBRANCH + jle .L40 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 + movq A, AO + + movq K, %rax + salq $BASE_SHIFT + 2, %rax + leaq (B, %rax), BB + + movq M, I + sarq $2, I # i = (m >> 2) + NOBRANCH + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + xorpd %xmm3, %xmm3 + movaps -14 * SIZE(AO), %xmm1 + xorpd %xmm4, %xmm4 + movaps -17 * SIZE(BO), %xmm2 + + PREFETCHB -16 * SIZE(BB) + + xorpd %xmm5, %xmm5 + xorpd %xmm6, %xmm6 + + PREFETCHW 3 * SIZE(CO1) + movaps %xmm4, %xmm8 + movaps %xmm4, %xmm9 + PREFETCHW 7 * SIZE(CO2) + movaps %xmm4, %xmm10 + movaps %xmm4, %xmm11 + + PREFETCHW 3 * SIZE(CO1, 
LDC, 2) + movaps %xmm4, %xmm12 + movaps %xmm4, %xmm13 + PREFETCHW 7 * SIZE(CO2, LDC, 2) + movapd %xmm4, %xmm14 + movapd %xmm4, %xmm15 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm3, %xmm11 + movaps -15 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps -13 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm11 + movaps -11 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps -9 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -6 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm11 + movaps -7 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movapd %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movapd %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + PADDING + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + + addpd %xmm2, %xmm9 + movaps -5 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -2 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm11 + subq $-16 * SIZE, AO + movaps -3 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps -1 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + subq $-16 * SIZE, BO + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -16 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -14 * SIZE(AO), %xmm1 + + subq $1, %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: + PREFETCHB -8 * SIZE(BB) +#ifdef DUNNINGTON + PREFETCHB 0 * SIZE(BB) + PREFETCHB 8 * SIZE(BB) +#endif + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + addpd %xmm3, %xmm11 + movaps -15 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, 
%xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps -13 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_4 + +.L18: + movddup ALPHA, %xmm1 + +#ifndef DUNNINGTON + subq $-16 * SIZE, BB +#else + subq $-32 * SIZE, BB +#endif + + addpd %xmm3, %xmm11 + addpd %xmm4, %xmm15 + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + + movaps %xmm8, %xmm0 + movsd %xmm9, %xmm8 + mulpd %xmm1, %xmm8 + movsd %xmm0, %xmm9 + mulpd %xmm1, %xmm9 + + movaps %xmm10, %xmm0 + movsd %xmm11, %xmm10 + mulpd %xmm1, %xmm10 + movsd %xmm0, %xmm11 + mulpd %xmm1, %xmm11 + + movaps %xmm12, %xmm0 + movsd %xmm13, %xmm12 + mulpd %xmm1, %xmm12 + movsd %xmm0, %xmm13 + mulpd %xmm1, %xmm13 + + movaps %xmm14, %xmm0 + movsd %xmm15, %xmm14 + mulpd %xmm1, %xmm14 + movsd %xmm0, %xmm15 + mulpd %xmm1, %xmm15 + + movq CO1, %rax + orq LDC, %rax + testq $15, %rax + NOBRANCH + jne .L18x + +#ifndef TRMMKERNEL + addpd 0 * SIZE(CO1), %xmm8 + addpd 2 * SIZE(CO1), %xmm12 + addpd 0 * SIZE(CO2), %xmm9 + addpd 2 * SIZE(CO2), %xmm13 + + addpd 0 * SIZE(CO1, LDC, 2), %xmm10 + addpd 2 * SIZE(CO1, LDC, 2), %xmm14 + addpd 0 * SIZE(CO2, LDC, 2), %xmm11 + addpd 2 * SIZE(CO2, LDC, 2), %xmm15 +#endif + + movaps %xmm8, 0 * SIZE(CO1) + movaps %xmm12, 2 * SIZE(CO1) + movaps %xmm9, 0 * SIZE(CO2) + movaps %xmm13, 2 * SIZE(CO2) + + movaps %xmm10, 0 * SIZE(CO1, LDC, 2) + movaps %xmm14, 2 * SIZE(CO1, LDC, 2) + movaps %xmm11, 0 * SIZE(CO2, LDC, 2) + movaps %xmm15, 2 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + + decq I # i -- + BRANCH + jg .L11 + jmp .L20 + ALIGN_4 + +.L18x: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 + movsd 0 * SIZE(CO2), %xmm2 + movhpd 1 * SIZE(CO2), %xmm2 + movsd 2 * SIZE(CO2), %xmm3 + movhpd 3 * SIZE(CO2), %xmm3 + + movsd 0 * SIZE(CO1, LDC, 2), %xmm4 + movhpd 1 * SIZE(CO1, LDC, 2), %xmm4 + movsd 2 * SIZE(CO1, LDC, 2), %xmm5 + movhpd 3 * SIZE(CO1, LDC, 2), %xmm5 + movsd 0 * SIZE(CO2, LDC, 2), %xmm6 + movhpd 1 * SIZE(CO2, LDC, 2), %xmm6 + movsd 2 * SIZE(CO2, LDC, 2), %xmm7 + movhpd 3 * SIZE(CO2, LDC, 2), %xmm7 + + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm12 + addpd %xmm2, %xmm9 + addpd %xmm3, %xmm13 + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm14 + addpd %xmm6, %xmm11 + addpd %xmm7, %xmm15 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm12, 2 * SIZE(CO1) + movhpd %xmm12, 3 * SIZE(CO1) + movsd %xmm9, 0 * SIZE(CO2) + movhpd %xmm9, 1 * SIZE(CO2) + movsd %xmm13, 2 * SIZE(CO2) + movhpd %xmm13, 3 * SIZE(CO2) + + movsd %xmm10, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm10, 1 * SIZE(CO1, LDC, 2) + movsd %xmm14, 2 * SIZE(CO1, LDC, 2) + movhpd %xmm14, 3 * SIZE(CO1, LDC, 2) + movsd %xmm11, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm11, 1 * SIZE(CO2, LDC, 2) + movsd %xmm15, 2 * SIZE(CO2, LDC, 2) + 
movhpd %xmm15, 3 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + decq I # i -- + BRANCH + jg .L11 + ALIGN_4 + +.L20: + testq $2, M + BRANCH + jle .L30 + ALIGN_4 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + movaps -17 * SIZE(BO), %xmm2 + movaps -15 * SIZE(BO), %xmm3 + + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + + movaps %xmm3, %xmm8 + movaps %xmm3, %xmm9 + movaps %xmm3, %xmm10 + movaps %xmm3, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_4 + +.L22: + addpd %xmm3, %xmm11 + movaps -15 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd %xmm0, %xmm2 + addpd %xmm5, %xmm10 + mulpd %xmm0, %xmm7 + + addpd %xmm2, %xmm9 + movaps -13 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + addpd %xmm7, %xmm8 + mulpd %xmm0, %xmm5 + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm3, %xmm11 + movaps -11 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + addpd %xmm5, %xmm10 + mulpd %xmm0, %xmm7 + + addpd %xmm2, %xmm9 + movaps -9 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + addpd %xmm7, %xmm8 + mulpd %xmm0, %xmm5 + movaps -12 * SIZE(AO), %xmm0 + + addpd %xmm3, %xmm11 + movaps -7 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + addpd %xmm5, %xmm10 + mulpd %xmm0, %xmm7 + + addpd %xmm2, %xmm9 + movaps -5 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + addpd %xmm7, %xmm8 + mulpd %xmm0, %xmm5 + movaps -10 * SIZE(AO), %xmm0 + + addpd %xmm3, %xmm11 + movaps -3 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + addpd %xmm5, %xmm10 + mulpd %xmm0, %xmm7 + subq $ -8 * SIZE, AO + + addpd %xmm2, %xmm9 + movaps -1 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + addpd %xmm7, %xmm8 + mulpd %xmm0, %xmm5 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, BO + subq $1, %rax + BRANCH + jg .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + addpd %xmm3, %xmm11 + movaps -15 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + addpd %xmm5, %xmm10 + mulpd %xmm0, %xmm7 + + addpd %xmm2, %xmm9 + movaps -13 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + addpd %xmm7, %xmm8 + mulpd %xmm0, %xmm5 + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_4 + +.L28: + addpd %xmm3, %xmm11 + addpd %xmm5, %xmm10 + + movddup ALPHA, %xmm3 + + movaps %xmm8, %xmm0 + movsd %xmm9, %xmm8 + mulpd %xmm3, 
%xmm8 + movsd %xmm0, %xmm9 + mulpd %xmm3, %xmm9 + + movaps %xmm10, %xmm0 + movsd %xmm11, %xmm10 + mulpd %xmm3, %xmm10 + movsd %xmm0, %xmm11 + mulpd %xmm3, %xmm11 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm2 + movhpd 1 * SIZE(CO2), %xmm2 + + movsd 0 * SIZE(CO1, LDC, 2), %xmm4 + movhpd 1 * SIZE(CO1, LDC, 2), %xmm4 + movsd 0 * SIZE(CO2, LDC, 2), %xmm6 + movhpd 1 * SIZE(CO2, LDC, 2), %xmm6 + + addpd %xmm0, %xmm8 + addpd %xmm2, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm6, %xmm11 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 0 * SIZE(CO2) + movhpd %xmm9, 1 * SIZE(CO2) + + movsd %xmm10, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm10, 1 * SIZE(CO1, LDC, 2) + movsd %xmm11, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm11, 1 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 4 + addq $2 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L30: + testq $1, M + BRANCH + jle .L39 + ALIGN_4 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + addq %rax, AO + leaq (BO, %rax, 4), BO +#endif + + movsd -16 * SIZE(AO), %xmm0 + movaps -17 * SIZE(BO), %xmm2 + movaps -15 * SIZE(BO), %xmm3 + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movsd -15 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm8 + movaps -13 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm9 + movaps -11 * SIZE(BO), %xmm3 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movsd -14 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm10 + movaps -9 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm11 + movaps -7 * SIZE(BO), %xmm3 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movsd -13 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm8 + movaps -5 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm9 + movaps -3 * SIZE(BO), %xmm3 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movsd -12 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm10 + movaps -1 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm11 + movaps 1 * SIZE(BO), %xmm3 + + subq $ -4 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + BRANCH + jg .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movsd -15 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm8 + movaps -13 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm9 + movaps -11 * SIZE(BO), %xmm3 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L36 + ALIGN_4 + +.L38: + movddup ALPHA, %xmm3 + + addpd %xmm10, %xmm8 + 
addpd %xmm11, %xmm9 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhpd 0 * SIZE(CO2), %xmm0 + + movsd 0 * SIZE(CO1, LDC, 2), %xmm1 + movhpd 0 * SIZE(CO2, LDC, 2), %xmm1 +#endif + + mulpd %xmm3, %xmm8 + mulpd %xmm3, %xmm9 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm9 +#endif + + movlpd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 0 * SIZE(CO2) + + movlpd %xmm9, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm9, 0 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + addq %rax, AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK +#endif + + movq BO, B + + leaq (C, LDC, 4), C + + subq $1, J + BRANCH + jg .L01 + ALIGN_4 + +.L40: + testq $2, N + BRANCH + jle .L80 + + movq C, CO1 + leaq (C, LDC, 1), CO2 + movq A, AO + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq K, %rax + salq $BASE_SHIFT + 1, %rax + leaq (B, %rax), BB + + movq M, I + sarq $2, I # i = (m >> 2) + NOBRANCH + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + + PREFETCHB -16 * SIZE(BB) + subq $-4 * SIZE, BB + + movaps -16 * SIZE(AO), %xmm0 + movaps -14 * SIZE(AO), %xmm1 + movaps -17 * SIZE(BO), %xmm2 + + PREFETCHW 3 * SIZE(CO1) + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + PREFETCHW 3 * SIZE(CO2) + xorps %xmm12, %xmm12 + xorps %xmm13, %xmm13 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L55 + ALIGN_4 + +.L52: + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm9 + movaps -15 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -6 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm9 + movaps -13 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -2 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm9 + movaps -11 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps 0 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 2 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm9 + movaps -9 * SIZE(BO), %xmm2 + addpd 
%xmm4, %xmm13 + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + + subq $-16 * SIZE, AO + subq $ -8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L52 + ALIGN_4 + +.L55: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm9 + movaps -15 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L56 + ALIGN_4 + +.L58: + movddup ALPHA, %xmm3 + + movaps %xmm8, %xmm0 + movsd %xmm9, %xmm8 + mulpd %xmm3, %xmm8 + movsd %xmm0, %xmm9 + mulpd %xmm3, %xmm9 + + movaps %xmm12, %xmm0 + movsd %xmm13, %xmm12 + mulpd %xmm3, %xmm12 + movsd %xmm0, %xmm13 + mulpd %xmm3, %xmm13 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 + movsd 0 * SIZE(CO2), %xmm2 + movhpd 1 * SIZE(CO2), %xmm2 + movsd 2 * SIZE(CO2), %xmm3 + movhpd 3 * SIZE(CO2), %xmm3 + + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm12 + addpd %xmm2, %xmm9 + addpd %xmm3, %xmm13 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm12, 2 * SIZE(CO1) + movhpd %xmm12, 3 * SIZE(CO1) + movsd %xmm9, 0 * SIZE(CO2) + movhpd %xmm9, 1 * SIZE(CO2) + movsd %xmm13, 2 * SIZE(CO2) + movhpd %xmm13, 3 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + decq I + BRANCH + jg .L51 + ALIGN_4 + +.L60: + testq $2, M + BRANCH + jle .L70 + ALIGN_4 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + movaps -17 * SIZE(BO), %xmm2 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L65 + ALIGN_4 + +.L62: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm9 + addpd %xmm7, %xmm8 + movaps -15 * SIZE(BO), %xmm2 + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -12 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm11 + addpd %xmm7, %xmm10 + movaps -13 * SIZE(BO), %xmm2 + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -10 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm9 + addpd %xmm7, %xmm8 + movaps -11 * SIZE(BO), %xmm2 + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -8 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm11 + addpd %xmm7, %xmm10 + movaps -9 * 
SIZE(BO), %xmm2 + + subq $-8 * SIZE, AO + subq $-8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L62 + ALIGN_4 + +.L65: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm9 + addpd %xmm7, %xmm8 + movaps -15 * SIZE(BO), %xmm2 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L66 + ALIGN_4 + +.L68: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + + movddup ALPHA, %xmm3 + + movaps %xmm8, %xmm0 + movsd %xmm9, %xmm8 + mulpd %xmm3, %xmm8 + movsd %xmm0, %xmm9 + mulpd %xmm3, %xmm9 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm2 + movhpd 1 * SIZE(CO2), %xmm2 + + addpd %xmm0, %xmm8 + addpd %xmm2, %xmm9 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 0 * SIZE(CO2) + movhpd %xmm9, 1 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 + ALIGN_4 + +.L70: + testq $1, M + BRANCH + jle .L79 + ALIGN_4 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + addq %rax, AO + leaq (BO, %rax, 2), BO +#endif + + movsd -16 * SIZE(AO), %xmm0 + movaps -17 * SIZE(BO), %xmm2 + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L75 + ALIGN_4 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + movsd -15 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm8 + movaps -15 * SIZE(BO), %xmm2 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + movsd -14 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm9 + movaps -13 * SIZE(BO), %xmm2 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + movsd -13 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm8 + movaps -11 * SIZE(BO), %xmm2 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + movsd -12 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm9 + movaps -9 * SIZE(BO), %xmm2 + + subq $-4 * SIZE, AO + subq $-8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L72 + ALIGN_4 + +.L75: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + movsd -15 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm8 + movaps -15 * SIZE(BO), %xmm2 + + addq $1 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L76 + ALIGN_4 + +.L78: + movddup ALPHA, %xmm3 + + addpd %xmm9, %xmm8 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhpd 0 * SIZE(CO2), %xmm0 +#endif + + mulpd %xmm3, %xmm8 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 +#endif + + movlpd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 0 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + 
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + addq %rax, AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L79: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + leaq (C, LDC, 2), C + movq BO, B + ALIGN_4 + +.L80: + testq $1, N + BRANCH + jle .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + movq A, AO + + movq M, I + sarq $2, I # i = (m >> 2) + NOBRANCH + jle .L100 + ALIGN_4 + +.L91: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + addq %rax, BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + movaps -14 * SIZE(AO), %xmm1 + movsd -17 * SIZE(BO), %xmm2 + + PREFETCHW 3 * SIZE(CO1) + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + xorps %xmm12, %xmm12 + xorps %xmm13, %xmm13 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L95 + ALIGN_4 + +.L92: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x44, %xmm2, %xmm3 + pshufd $0x44, %xmm2, %xmm4 + movsd -16 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm4 + movaps -10 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm8 + addpd %xmm4, %xmm12 + + pshufd $0x44, %xmm2, %xmm3 + pshufd $0x44, %xmm2, %xmm4 + movsd -15 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm4 + movaps -6 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm8 + addpd %xmm4, %xmm12 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + + pshufd $0x44, %xmm2, %xmm3 + pshufd $0x44, %xmm2, %xmm4 + movsd -14 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm4 + movaps -2 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm8 + addpd %xmm4, %xmm12 + + pshufd $0x44, %xmm2, %xmm3 + pshufd $0x44, %xmm2, %xmm4 + movsd -13 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps 0 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm4 + movaps 2 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm8 + addpd %xmm4, %xmm12 + + subq $-16 * SIZE, AO + subq $ -4 * SIZE, BO + subq $1, %rax + BRANCH + jg .L92 + ALIGN_4 + +.L95: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L98 + ALIGN_4 + +.L96: + pshufd $0x44, %xmm2, %xmm3 + pshufd $0x44, %xmm2, %xmm4 + movsd -16 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm4 + movaps -10 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm8 + addpd %xmm4, %xmm12 + + addq $4 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L96 + ALIGN_4 + +.L98: + movddup ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 +#endif + + mulpd %xmm3, %xmm8 + mulpd %xmm3, %xmm12 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm12 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm12, 2 * SIZE(CO1) + movhpd %xmm12, 3 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + 
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + addq %rax, BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 + decq I + BRANCH + jg .L91 + ALIGN_4 + +.L100: + testq $2, M + BRANCH + jle .L110 + ALIGN_4 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + addq %rax, BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movaps -17 * SIZE(BO), %xmm2 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L105 + ALIGN_4 + +.L102: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x44, %xmm2, %xmm3 + movsd -16 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -14 * SIZE(AO), %xmm0 + addpd %xmm3, %xmm8 + + pshufd $0x44, %xmm2, %xmm3 + movsd -15 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -12 * SIZE(AO), %xmm0 + addpd %xmm3, %xmm9 + + pshufd $0x44, %xmm2, %xmm3 + movsd -14 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -10 * SIZE(AO), %xmm0 + addpd %xmm3, %xmm8 + + pshufd $0x44, %xmm2, %xmm3 + movsd -13 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -8 * SIZE(AO), %xmm0 + addpd %xmm3, %xmm9 + + subq $-8 * SIZE, AO + subq $-4 * SIZE, BO + subq $1, %rax + BRANCH + jg .L102 + ALIGN_4 + +.L105: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L108 + ALIGN_4 + +.L106: + pshufd $0x44, %xmm2, %xmm3 + movsd -16 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -14 * SIZE(AO), %xmm0 + addpd %xmm3, %xmm8 + + addq $2 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L106 + ALIGN_4 + +.L108: + addpd %xmm9, %xmm8 + + movddup ALPHA, %xmm3 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 +#endif + + mulpd %xmm3, %xmm8 + + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + addq %rax, BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 + ALIGN_4 + +.L110: + testq $1, M + BRANCH + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + addq %rax, AO + addq %rax, BO +#endif + + movsd -16 * SIZE(AO), %xmm0 + movsd -17 * SIZE(BO), %xmm2 + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L115 + ALIGN_4 + +.L112: + PREFETCH (PREFETCHSIZE + 0) * 
SIZE(AO) + + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + movsd -15 * SIZE(AO), %xmm0 + movsd -16 * SIZE(BO), %xmm2 + + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + movsd -14 * SIZE(AO), %xmm0 + movsd -15 * SIZE(BO), %xmm2 + + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + movsd -13 * SIZE(AO), %xmm0 + movsd -14 * SIZE(BO), %xmm2 + + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + movsd -12 * SIZE(AO), %xmm0 + movsd -13 * SIZE(BO), %xmm2 + + subq $-4 * SIZE, AO + subq $-4 * SIZE, BO + subq $1, %rax + BRANCH + jg .L112 + ALIGN_4 + +.L115: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + movsd -15 * SIZE(AO), %xmm0 + movsd -16 * SIZE(BO), %xmm2 + + addq $1 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L116 + ALIGN_4 + +.L118: + movddup ALPHA, %xmm3 + + addpd %xmm9, %xmm8 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 +#endif + + mulsd %xmm3, %xmm8 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 +#endif + + movlpd %xmm8, 0 * SIZE(CO1) + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/gemm_kernel_4x4_sse2.S b/kernel/x86_64/gemm_kernel_4x4_sse2.S new file mode 100644 index 0000000000..10601970fc --- /dev/null +++ b/kernel/x86_64/gemm_kernel_4x4_sse2.S @@ -0,0 +1,2707 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define CO2 %rbp +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define ALPHA 0(%rsp) +#define J 16(%rsp) +#define OFFSET 24(%rsp) +#define KK 32(%rsp) +#define KKK 40(%rsp) +#define BUFFER 256(%rsp) + +#ifdef OPTERON +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (8 * 9 + 4) +#define movsd movlps +#define movapd movaps +#endif + +#ifdef GENERIC +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (8 * 13 + 4) +#define movapd movaps +#endif + +#ifndef GENERIC +#define KERNEL1(xx) \ + mulpd %xmm0, %xmm1 ;\ + addpd %xmm1, %xmm8 ;\ + movaps -16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ + mulpd %xmm0, %xmm3 ;\ + addpd %xmm3, %xmm9 ;\ + movapd -14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulpd %xmm0, %xmm5 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\ + mulpd -10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\ + addpd %xmm5, %xmm10 ;\ + movapd -12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addpd %xmm0, %xmm11 ;\ + movapd -8 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0 + +#define KERNEL2(xx) \ + mulpd %xmm2, %xmm1 ;\ + addpd %xmm1, %xmm12 ;\ + movapd 0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ + mulpd %xmm2, %xmm3 ;\ + addpd %xmm3, %xmm13 ;\ + movapd -6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulpd %xmm2, %xmm5 ;\ + mulpd -10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\ + addpd %xmm5, %xmm14 ;\ + movapd -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addpd %xmm2, %xmm15 ;\ + movapd -6 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2 + +#define KERNEL3(xx) \ + mulpd %xmm4, %xmm7 ;\ + addpd %xmm7, %xmm8 ;\ + movapd -8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ + mulpd %xmm4, %xmm3 ;\ + addpd %xmm3, %xmm9 ;\ + movapd -6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulpd %xmm4, %xmm5 ;\ + mulpd -2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\ + addpd %xmm5, %xmm10 ;\ + movapd -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addpd %xmm4, %xmm11 ;\ + movapd -4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4 + +#define KERNEL4(xx) \ + mulpd %xmm6, %xmm7 ;\ + addpd %xmm7, %xmm12 ;\ + movapd 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ + mulpd %xmm6, %xmm3 ;\ + addpd %xmm3, %xmm13 ;\ + movapd 2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulpd %xmm6, %xmm5 ;\ + mulpd -2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\ + addpd %xmm5, %xmm14 ;\ + movapd 4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\ + addpd %xmm6, %xmm15 ;\ + movapd -2 * SIZE + 1 * (xx) * 
SIZE(AO, %rax, 4), %xmm6 + +#define KERNEL5(xx) \ + mulpd %xmm0, %xmm1 ;\ + addpd %xmm1, %xmm8 ;\ + movapd 0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ + mulpd %xmm0, %xmm3 ;\ + addpd %xmm3, %xmm9 ;\ + movapd 2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulpd %xmm0, %xmm5 ;\ + mulpd 6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\ + addpd %xmm5, %xmm10 ;\ + movapd 4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addpd %xmm0, %xmm11 ;\ + movapd 0 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0 + +#define KERNEL6(xx) \ + mulpd %xmm2, %xmm1 ;\ + addpd %xmm1, %xmm12 ;\ + movapd 16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ + mulpd %xmm2, %xmm3 ;\ + addpd %xmm3, %xmm13 ;\ + movapd 10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulpd %xmm2, %xmm5 ;\ + mulpd 6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\ + addpd %xmm5, %xmm14 ;\ + movapd 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addpd %xmm2, %xmm15 ;\ + movapd 2 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2 + +#define KERNEL7(xx) \ + mulpd %xmm4, %xmm7 ;\ + addpd %xmm7, %xmm8 ;\ + movapd 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ + mulpd %xmm4, %xmm3 ;\ + addpd %xmm3, %xmm9 ;\ + movapd 10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulpd %xmm4, %xmm5 ;\ + mulpd 14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\ + addpd %xmm5, %xmm10 ;\ + movapd 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addpd %xmm4, %xmm11 ;\ + movapd 4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4 + +#define KERNEL8(xx) \ + mulpd %xmm6, %xmm7 ;\ + addpd %xmm7, %xmm12 ;\ + movapd 24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ + mulpd %xmm6, %xmm3 ;\ + addpd %xmm3, %xmm13 ;\ + movapd 18 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulpd %xmm6, %xmm5 ;\ + mulpd 14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\ + addpd %xmm5, %xmm14 ;\ + movapd 20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addpd %xmm6, %xmm15 ;\ + movapd 6 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6 + +#else + +#define KERNEL1(xx) \ + mulpd %xmm0, %xmm1 ;\ + addpd %xmm1, %xmm8 ;\ + movapd -16 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ + mulpd %xmm0, %xmm3 ;\ + addpd %xmm3, %xmm9 ;\ + movapd -14 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulpd %xmm0, %xmm5 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\ + mulpd -10 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\ + addpd %xmm5, %xmm10 ;\ + movapd -12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addpd %xmm0, %xmm11 ;\ + movapd -8 * SIZE + 1 * (xx) * SIZE(AO), %xmm0 + +#define KERNEL2(xx) \ + mulpd %xmm2, %xmm1 ;\ + addpd %xmm1, %xmm12 ;\ + movapd 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ + mulpd %xmm2, %xmm3 ;\ + addpd %xmm3, %xmm13 ;\ + movapd -6 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulpd %xmm2, %xmm5 ;\ + mulpd -10 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\ + addpd %xmm5, %xmm14 ;\ + movapd -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addpd %xmm2, %xmm15 ;\ + movapd -6 * SIZE + 1 * (xx) * SIZE(AO), %xmm2 + +#define KERNEL3(xx) \ + mulpd %xmm4, %xmm7 ;\ + addpd %xmm7, %xmm8 ;\ + movapd -8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ + mulpd %xmm4, %xmm3 ;\ + addpd %xmm3, %xmm9 ;\ + movapd -6 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulpd %xmm4, %xmm5 ;\ + mulpd -2 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\ + addpd %xmm5, %xmm10 ;\ + movapd -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addpd %xmm4, %xmm11 ;\ + movapd -4 * SIZE + 1 * (xx) * SIZE(AO), %xmm4 + +#define KERNEL4(xx) \ + mulpd %xmm6, %xmm7 ;\ + addpd %xmm7, %xmm12 ;\ + movapd 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ + mulpd %xmm6, %xmm3 ;\ + addpd 
%xmm3, %xmm13 ;\ + movapd 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulpd %xmm6, %xmm5 ;\ + mulpd -2 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\ + addpd %xmm5, %xmm14 ;\ + movapd 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ;\ + addpd %xmm6, %xmm15 ;\ + movapd -2 * SIZE + 1 * (xx) * SIZE(AO), %xmm6 + +#define KERNEL5(xx) \ + mulpd %xmm0, %xmm1 ;\ + addpd %xmm1, %xmm8 ;\ + movapd 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ + mulpd %xmm0, %xmm3 ;\ + addpd %xmm3, %xmm9 ;\ + movapd 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulpd %xmm0, %xmm5 ;\ + mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\ + addpd %xmm5, %xmm10 ;\ + movapd 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addpd %xmm0, %xmm11 ;\ + movapd 0 * SIZE + 1 * (xx) * SIZE(AO), %xmm0 + +#define KERNEL6(xx) \ + mulpd %xmm2, %xmm1 ;\ + addpd %xmm1, %xmm12 ;\ + movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ + mulpd %xmm2, %xmm3 ;\ + addpd %xmm3, %xmm13 ;\ + movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulpd %xmm2, %xmm5 ;\ + mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\ + addpd %xmm5, %xmm14 ;\ + movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addpd %xmm2, %xmm15 ;\ + movapd 2 * SIZE + 1 * (xx) * SIZE(AO), %xmm2 + +#define KERNEL7(xx) \ + mulpd %xmm4, %xmm7 ;\ + addpd %xmm7, %xmm8 ;\ + movapd 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ + mulpd %xmm4, %xmm3 ;\ + addpd %xmm3, %xmm9 ;\ + movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulpd %xmm4, %xmm5 ;\ + mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\ + addpd %xmm5, %xmm10 ;\ + movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addpd %xmm4, %xmm11 ;\ + movapd 4 * SIZE + 1 * (xx) * SIZE(AO), %xmm4 + +#define KERNEL8(xx) \ + mulpd %xmm6, %xmm7 ;\ + addpd %xmm7, %xmm12 ;\ + movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ + mulpd %xmm6, %xmm3 ;\ + addpd %xmm3, %xmm13 ;\ + movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulpd %xmm6, %xmm5 ;\ + mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\ + addpd %xmm5, %xmm14 ;\ + movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addpd %xmm6, %xmm15 ;\ + movapd 6 * SIZE + 1 * (xx) * SIZE(AO), %xmm6 +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + movaps %xmm3, %xmm0 + +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + +#endif + + EMMS + + movq %rsp, %rbx # save old stack + subq $256 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movq OLD_M, M + movq OLD_N, N + + subq $-16 * SIZE, A + + unpcklpd %xmm0, %xmm0 + movapd %xmm0, ALPHA + + leaq (, LDC, SIZE), LDC + +#ifdef TRMMKERNEL + movsd %xmm12, OFFSET + movsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + movq N, J + sarq $2, J # j = (n >> 2) + jle .L40 + ALIGN_3 + +.L01: +/* Copying to Sub Buffer */ + leaq 16 * SIZE + BUFFER, BO + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + +#if defined(TRMMKERNEL) && 
defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq K, %rax + sarq $2, %rax + jle .L03 + ALIGN_3 + + +#define RPREFETCHSIZE (8 * 7 + 4) +#define WPREFETCHSIZE (8 * 8 + 4) + +.L02: + PREFETCH (RPREFETCHSIZE + 0) * SIZE(B) + + movq 0 * SIZE(B), %mm0 + movq %mm0, -16 * SIZE(BO) + movq %mm0, -15 * SIZE(BO) + movq 1 * SIZE(B), %mm1 + movq %mm1, -14 * SIZE(BO) + movq %mm1, -13 * SIZE(BO) + + movq 2 * SIZE(B), %mm2 + movq %mm2, -12 * SIZE(BO) + movq %mm2, -11 * SIZE(BO) + movq 3 * SIZE(B), %mm3 + movq %mm3, -10 * SIZE(BO) + movq %mm3, -9 * SIZE(BO) + + PREFETCHW (WPREFETCHSIZE + 0) * SIZE(BO) + + movq 4 * SIZE(B), %mm4 + movq %mm4, -8 * SIZE(BO) + movq %mm4, -7 * SIZE(BO) + movq 5 * SIZE(B), %mm5 + movq %mm5, -6 * SIZE(BO) + movq %mm5, -5 * SIZE(BO) + + PREFETCHW (WPREFETCHSIZE + 8) * SIZE(BO) + + movq 6 * SIZE(B), %mm6 + movq %mm6, -4 * SIZE(BO) + movq %mm6, -3 * SIZE(BO) + movq 7 * SIZE(B), %mm7 + movq %mm7, -2 * SIZE(BO) + movq %mm7, -1 * SIZE(BO) + + PREFETCH (RPREFETCHSIZE + 8) * SIZE(B) + + movq 8 * SIZE(B), %mm0 + movq %mm0, 0 * SIZE(BO) + movq %mm0, 1 * SIZE(BO) + movq 9 * SIZE(B), %mm1 + movq %mm1, 2 * SIZE(BO) + movq %mm1, 3 * SIZE(BO) + + movq 10 * SIZE(B), %mm2 + movq %mm2, 4 * SIZE(BO) + movq %mm2, 5 * SIZE(BO) + movq 11 * SIZE(B), %mm3 + movq %mm3, 6 * SIZE(BO) + movq %mm3, 7 * SIZE(BO) + + PREFETCHW (WPREFETCHSIZE + 16) * SIZE(BO) + + movq 12 * SIZE(B), %mm4 + movq %mm4, 8 * SIZE(BO) + movq %mm4, 9 * SIZE(BO) + movq 13 * SIZE(B), %mm5 + movq %mm5, 10 * SIZE(BO) + movq %mm5, 11 * SIZE(BO) + + PREFETCHW (WPREFETCHSIZE + 24) * SIZE(BO) + + movq 14 * SIZE(B), %mm6 + movq %mm6, 12 * SIZE(BO) + movq %mm6, 13 * SIZE(BO) + movq 15 * SIZE(B), %mm7 + movq %mm7, 14 * SIZE(BO) + movq %mm7, 15 * SIZE(BO) + + addq $ 32 * SIZE, BO + subq $-16 * SIZE, B + + subq $1, %rax + jne .L02 + ALIGN_3 + +.L03: + movq K, %rax + andq $3, %rax + BRANCH + jle .L10 + ALIGN_3 + +.L04: + movq 0 * SIZE(B), %mm0 + movq %mm0, -16 * SIZE(BO) + movq %mm0, -15 * SIZE(BO) + movq 1 * SIZE(B), %mm1 + movq %mm1, -14 * SIZE(BO) + movq %mm1, -13 * SIZE(BO) + + movq 2 * SIZE(B), %mm2 + movq %mm2, -12 * SIZE(BO) + movq %mm2, -11 * SIZE(BO) + movq 3 * SIZE(B), %mm3 + movq %mm3, -10 * SIZE(BO) + movq %mm3, -9 * SIZE(BO) + + addq $4 * SIZE, B + addq $8 * SIZE, BO + subq $1, %rax + jne .L04 + ALIGN_3 + +.L10: + movq A, AO # aoffset = a + + leaq (RPREFETCHSIZE + 0) * SIZE(B), BB + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L20 + ALIGN_3 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 16 * SIZE + BUFFER, BO +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm1 + pxor %xmm8, %xmm8 + movapd -14 * SIZE(AO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + pxor %xmm9, %xmm9 + movapd -12 * SIZE(AO), %xmm4 + movapd -12 * SIZE(BO), %xmm5 + pxor %xmm10, %xmm10 + movapd -10 * SIZE(AO), %xmm6 + movapd -8 * SIZE(BO), %xmm7 + pxor %xmm11, %xmm11 + + PREFETCHW 3 * SIZE(CO1) + pxor %xmm12, %xmm12 + PREFETCHW 7 * SIZE(CO2) + pxor %xmm13, %xmm13 + PREFETCHW 3 * SIZE(CO1, LDC, 2) + pxor %xmm14, %xmm14 + PREFETCHW 7 * SIZE(CO2, LDC, 2) + pxor %xmm15, %xmm15 + + PREFETCH 0 * SIZE(BB) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax 
+#ifdef LEFT + addq $4, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + +#ifndef GENERIC + andq $-8, %rax + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO + negq %rax + NOBRANCH + je .L15 + ALIGN_3 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + + addq $8 * SIZE, %rax + NOBRANCH + je .L15 + + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + + addq $8 * SIZE, %rax + NOBRANCH + je .L15 + + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + + addq $8 * SIZE, %rax + NOBRANCH + je .L15 + + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + + addq $8 * SIZE, %rax + NOBRANCH + je .L15 + + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + + addq $8 * SIZE, %rax + NOBRANCH + je .L15 + + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + + addq $8 * SIZE, %rax + NOBRANCH + je .L15 + + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + + addq $8 * SIZE, %rax + NOBRANCH + je .L15 + + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + + addq $8 * SIZE, %rax + BRANCH + jl .L12 + ALIGN_3 + +.L15: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + testq $4, %rax + je .L16 + xorq %rax, %rax + ALIGN_3 + + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + addq $32 * SIZE, BO + addq $16 * SIZE, AO + ALIGN_3 + +#else + sarq $2, %rax + NOBRANCH + jle .L16 + ALIGN_3 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + addq $ 32 * SIZE, BO + subq $-16 * SIZE, AO 
+ decq %rax + BRANCH + jg .L12 +#endif + +.L16: + movapd ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L19 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO + negq %rax + ALIGN_3 + +.L17: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movapd -14 * SIZE(BO, %rax, 8), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movapd -12 * SIZE(BO, %rax, 8), %xmm1 + mulpd %xmm0, %xmm1 + mulpd -10 * SIZE(BO, %rax, 8), %xmm0 + addpd %xmm1, %xmm10 + movapd -16 * SIZE(BO, %rax, 8), %xmm1 + addpd %xmm0, %xmm11 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm12 + movapd -14 * SIZE(BO, %rax, 8), %xmm1 + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm13 + movapd -12 * SIZE(BO, %rax, 8), %xmm1 + mulpd %xmm2, %xmm1 + mulpd -10 * SIZE(BO, %rax, 8), %xmm2 + addpd %xmm1, %xmm14 + movapd -8 * SIZE(BO, %rax, 8), %xmm1 + addpd %xmm2, %xmm15 + movapd -10 * SIZE(AO, %rax, 4), %xmm2 + + addq $SIZE, %rax + jl .L17 + ALIGN_3 + +.L19: + PREFETCH 8 * SIZE(BB) + subq $-12 * SIZE, BB + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 + + movsd 0 * SIZE(CO2), %xmm2 + movhpd 1 * SIZE(CO2), %xmm2 + movsd 2 * SIZE(CO2), %xmm3 + movhpd 3 * SIZE(CO2), %xmm3 +#endif + + mulpd %xmm7, %xmm8 + mulpd %xmm7, %xmm9 + mulpd %xmm7, %xmm10 + mulpd %xmm7, %xmm11 + + mulpd %xmm7, %xmm12 + mulpd %xmm7, %xmm13 + mulpd %xmm7, %xmm14 + mulpd %xmm7, %xmm15 + +#ifndef TRMMKERNEL + movlpd 0 * SIZE(CO1, LDC, 2), %xmm4 + movhpd 1 * SIZE(CO1, LDC, 2), %xmm4 + movlpd 2 * SIZE(CO1, LDC, 2), %xmm5 + movhpd 3 * SIZE(CO1, LDC, 2), %xmm5 + + movlpd 0 * SIZE(CO2, LDC, 2), %xmm6 + movhpd 1 * SIZE(CO2, LDC, 2), %xmm6 + movlpd 2 * SIZE(CO2, LDC, 2), %xmm7 + movhpd 3 * SIZE(CO2, LDC, 2), %xmm7 + + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm12 + addpd %xmm2, %xmm9 + addpd %xmm3, %xmm13 +#endif + + movlpd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movlpd %xmm12, 2 * SIZE(CO1) + movhpd %xmm12, 3 * SIZE(CO1) + + movlpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm9, 1 * SIZE(CO2) + movlpd %xmm13, 2 * SIZE(CO2) + movhpd %xmm13, 3 * SIZE(CO2) + +#ifndef TRMMKERNEL + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm14 + addpd %xmm6, %xmm11 + addpd %xmm7, %xmm15 +#endif + + movlpd %xmm10, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm10, 1 * SIZE(CO1, LDC, 2) + movlpd %xmm14, 2 * SIZE(CO1, LDC, 2) + movhpd %xmm14, 3 * SIZE(CO1, LDC, 2) + + movlpd %xmm11, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm11, 1 * SIZE(CO2, LDC, 2) + movlpd %xmm15, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 3 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + decq I # i -- + BRANCH + jg .L11 + ALIGN_3 + +.L20: + testq $3, M + je .L39 + + testq $2, M + je .L30 + ALIGN_3 + +.L21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd 0 * SIZE(BO), %xmm1 + pxor %xmm9, 
%xmm9 + movapd -8 * SIZE(AO), %xmm2 + pxor %xmm10, %xmm10 + movapd 8 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + + movapd 16 * SIZE(BO), %xmm5 + movapd 24 * SIZE(BO), %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L25 + ALIGN_3 + +.L22: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movapd 2 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movapd 4 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + mulpd 6 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm10 + movapd 32 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm11 + movapd -14 * SIZE(AO), %xmm0 + + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm8 + movapd 10 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm9 + movapd 12 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + mulpd 14 * SIZE(BO), %xmm0 + addpd %xmm3, %xmm10 + movapd 40 * SIZE(BO), %xmm3 + addpd %xmm0, %xmm11 + movapd -12 * SIZE(AO), %xmm0 + + mulpd %xmm0, %xmm5 + addpd %xmm5, %xmm8 + movapd 18 * SIZE(BO), %xmm5 + mulpd %xmm0, %xmm5 + addpd %xmm5, %xmm9 + movapd 20 * SIZE(BO), %xmm5 + mulpd %xmm0, %xmm5 + mulpd 22 * SIZE(BO), %xmm0 + addpd %xmm5, %xmm10 + movapd 48 * SIZE(BO), %xmm5 + addpd %xmm0, %xmm11 + movapd -10 * SIZE(AO), %xmm0 + + mulpd %xmm0, %xmm7 + addpd %xmm7, %xmm8 + movapd 26 * SIZE(BO), %xmm7 + mulpd %xmm0, %xmm7 + addpd %xmm7, %xmm9 + movapd 28 * SIZE(BO), %xmm7 + mulpd %xmm0, %xmm7 + mulpd 30 * SIZE(BO), %xmm0 + addpd %xmm7, %xmm10 + movapd 56 * SIZE(BO), %xmm7 + addpd %xmm0, %xmm11 + movapd 0 * SIZE(AO), %xmm0 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm8 + movapd 34 * SIZE(BO), %xmm1 + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm9 + movapd 36 * SIZE(BO), %xmm1 + mulpd %xmm2, %xmm1 + mulpd 38 * SIZE(BO), %xmm2 + addpd %xmm1, %xmm10 + movapd 64 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm11 + movapd -6 * SIZE(AO), %xmm2 + + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm8 + movapd 42 * SIZE(BO), %xmm3 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm9 + movapd 44 * SIZE(BO), %xmm3 + mulpd %xmm2, %xmm3 + mulpd 46 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm10 + movapd 72 * SIZE(BO), %xmm3 + addpd %xmm2, %xmm11 + movapd -4 * SIZE(AO), %xmm2 + + mulpd %xmm2, %xmm5 + addpd %xmm5, %xmm8 + movapd 50 * SIZE(BO), %xmm5 + mulpd %xmm2, %xmm5 + addpd %xmm5, %xmm9 + movapd 52 * SIZE(BO), %xmm5 + mulpd %xmm2, %xmm5 + mulpd 54 * SIZE(BO), %xmm2 + addpd %xmm5, %xmm10 + movapd 80 * SIZE(BO), %xmm5 + addpd %xmm2, %xmm11 + movapd -2 * SIZE(AO), %xmm2 + + mulpd %xmm2, %xmm7 + addpd %xmm7, %xmm8 + movapd 58 * SIZE(BO), %xmm7 + mulpd %xmm2, %xmm7 + addpd %xmm7, %xmm9 + movapd 60 * SIZE(BO), %xmm7 + mulpd %xmm2, %xmm7 + mulpd 62 * SIZE(BO), %xmm2 + addpd %xmm7, %xmm10 + movapd 88 * SIZE(BO), %xmm7 + addpd %xmm2, %xmm11 + movapd 8 * SIZE(AO), %xmm2 + + addq $16 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L22 + ALIGN_3 + +.L25: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movapd ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L29 + ALIGN_3 + +.L26: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movapd 2 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movapd 4 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + mulpd 6 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm10 + movapd 8 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm11 + movapd -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO # aoffset += 4 + addq 
$8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L26 + ALIGN_3 + +.L29: +#ifndef TRMMKERNEL + movlpd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movlpd 0 * SIZE(CO2), %xmm2 + movhpd 1 * SIZE(CO2), %xmm2 + + movlpd 0 * SIZE(CO1, LDC, 2), %xmm4 + movhpd 1 * SIZE(CO1, LDC, 2), %xmm4 + movlpd 0 * SIZE(CO2, LDC, 2), %xmm6 + movhpd 1 * SIZE(CO2, LDC, 2), %xmm6 +#endif + mulpd %xmm7, %xmm8 + mulpd %xmm7, %xmm9 + mulpd %xmm7, %xmm10 + mulpd %xmm7, %xmm11 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 + addpd %xmm2, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm6, %xmm11 +#endif + + movlpd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movlpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm9, 1 * SIZE(CO2) + movlpd %xmm10, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm10, 1 * SIZE(CO1, LDC, 2) + movlpd %xmm11, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm11, 1 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 4 + addq $2 * SIZE, CO2 # coffset += 4 + ALIGN_3 + +.L30: + testq $1, M + je .L39 + ALIGN_3 + +.L31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + + movsd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movsd 0 * SIZE(BO), %xmm1 + pxor %xmm9, %xmm9 + movsd -8 * SIZE(AO), %xmm2 + pxor %xmm10, %xmm10 + movsd 8 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + + movsd 16 * SIZE(BO), %xmm5 + movsd 24 * SIZE(BO), %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L35 + ALIGN_3 + +.L32: + mulsd %xmm0, %xmm1 + addsd %xmm1, %xmm8 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movsd 2 * SIZE(BO), %xmm1 + mulsd %xmm0, %xmm1 + addsd %xmm1, %xmm9 + movsd 4 * SIZE(BO), %xmm1 + mulsd %xmm0, %xmm1 + mulsd 6 * SIZE(BO), %xmm0 + addsd %xmm1, %xmm10 + movsd 32 * SIZE(BO), %xmm1 + addsd %xmm0, %xmm11 + movsd -15 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm8 + movsd 10 * SIZE(BO), %xmm3 + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm9 + movsd 12 * SIZE(BO), %xmm3 + mulsd %xmm0, %xmm3 + mulsd 14 * SIZE(BO), %xmm0 + addsd %xmm3, %xmm10 + movsd 40 * SIZE(BO), %xmm3 + addsd %xmm0, %xmm11 + movsd -14 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm5 + addsd %xmm5, %xmm8 + movsd 18 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + addsd %xmm5, %xmm9 + movsd 20 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + mulsd 22 * SIZE(BO), %xmm0 + addsd %xmm5, %xmm10 + movsd 48 * SIZE(BO), %xmm5 + addsd %xmm0, %xmm11 + movsd -13 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm7 + addsd %xmm7, %xmm8 + movsd 26 * SIZE(BO), %xmm7 + mulsd %xmm0, %xmm7 + addsd %xmm7, %xmm9 + movsd 28 * SIZE(BO), %xmm7 + mulsd %xmm0, %xmm7 + mulsd 30 * SIZE(BO), %xmm0 + addsd %xmm7, %xmm10 + movsd 56 * SIZE(BO), %xmm7 + addsd %xmm0, %xmm11 + movsd -12 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm1 + addsd %xmm1, %xmm8 + movsd 34 * SIZE(BO), %xmm1 + mulsd %xmm0, %xmm1 + addsd %xmm1, %xmm9 + movsd 36 * SIZE(BO), 
%xmm1 + mulsd %xmm0, %xmm1 + mulsd 38 * SIZE(BO), %xmm0 + addsd %xmm1, %xmm10 + movsd 64 * SIZE(BO), %xmm1 + addsd %xmm0, %xmm11 + movsd -11 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm8 + movsd 42 * SIZE(BO), %xmm3 + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm9 + movsd 44 * SIZE(BO), %xmm3 + mulsd %xmm0, %xmm3 + mulsd 46 * SIZE(BO), %xmm0 + addsd %xmm3, %xmm10 + movsd 72 * SIZE(BO), %xmm3 + addsd %xmm0, %xmm11 + movsd -10 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm5 + addsd %xmm5, %xmm8 + movsd 50 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + addsd %xmm5, %xmm9 + movsd 52 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + mulsd 54 * SIZE(BO), %xmm0 + addsd %xmm5, %xmm10 + movsd 80 * SIZE(BO), %xmm5 + addsd %xmm0, %xmm11 + movsd -9 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm7 + addsd %xmm7, %xmm8 + movsd 58 * SIZE(BO), %xmm7 + mulsd %xmm0, %xmm7 + addsd %xmm7, %xmm9 + movsd 60 * SIZE(BO), %xmm7 + mulsd %xmm0, %xmm7 + mulsd 62 * SIZE(BO), %xmm0 + addsd %xmm7, %xmm10 + movsd 88 * SIZE(BO), %xmm7 + addsd %xmm0, %xmm11 + movsd -8 * SIZE(AO), %xmm0 + + addq $ 8 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L32 + ALIGN_3 + +.L35: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movsd ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_3 + +.L36: + mulsd %xmm0, %xmm1 + addsd %xmm1, %xmm8 + movsd 2 * SIZE(BO), %xmm1 + mulsd %xmm0, %xmm1 + addsd %xmm1, %xmm9 + movsd 4 * SIZE(BO), %xmm1 + mulsd %xmm0, %xmm1 + mulsd 6 * SIZE(BO), %xmm0 + addsd %xmm1, %xmm10 + movsd 8 * SIZE(BO), %xmm1 + addsd %xmm0, %xmm11 + movsd -15 * SIZE(AO), %xmm0 + + addq $1 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L36 + ALIGN_3 + +.L38: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm2 + movsd 0 * SIZE(CO1, LDC, 2), %xmm4 + movsd 0 * SIZE(CO2, LDC, 2), %xmm6 +#endif + + mulsd %xmm7, %xmm8 + mulsd %xmm7, %xmm9 + mulsd %xmm7, %xmm10 + mulsd %xmm7, %xmm11 + +#ifndef TRMMKERNEL + addsd %xmm0, %xmm8 + addsd %xmm2, %xmm9 + addsd %xmm4, %xmm10 + addsd %xmm6, %xmm11 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movsd %xmm9, 0 * SIZE(CO2) + movsd %xmm10, 0 * SIZE(CO1, LDC, 2) + movsd %xmm11, 0 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_3 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + + leaq (C, LDC, 4), C # c += 4 * ldc + decq J # j -- + jg .L01 + ALIGN_3 + +.L40: + testq $3, N + je .L999 + + testq $2, N + je .L80 + ALIGN_4 + +.L41: +/* Copying to Sub Buffer */ + leaq BUFFER, BO + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq K, %rax + sarq $2, %rax + jle .L43 + ALIGN_3 + +.L42: + PREFETCH 56 * SIZE(B) + + movq 0 * SIZE(B), %mm0 + movq 1 * SIZE(B), %mm1 + movq 2 * SIZE(B), %mm2 + movq 3 * SIZE(B), %mm3 + movq 4 * SIZE(B), %mm4 + movq 5 * SIZE(B), %mm5 + movq 6 * SIZE(B), %mm6 + movq 7 * SIZE(B), %mm7 + + addq $ 8 * SIZE, B + addq $16 * SIZE, BO + + movq %mm0, -16 * SIZE(BO) + movq %mm0, -15 * SIZE(BO) + movq %mm1, -14 * SIZE(BO) + movq %mm1, -13 * SIZE(BO) + movq %mm2, -12 * SIZE(BO) + movq %mm2, -11 * SIZE(BO) + movq %mm3, -10 * SIZE(BO) + movq %mm3, -9 * SIZE(BO) + movq %mm4, -8 * SIZE(BO) + movq %mm4, -7 * SIZE(BO) + movq %mm5, -6 * SIZE(BO) + movq %mm5, -5 * SIZE(BO) + 
movq %mm6, -4 * SIZE(BO) + movq %mm6, -3 * SIZE(BO) + movq %mm7, -2 * SIZE(BO) + movq %mm7, -1 * SIZE(BO) + + decq %rax + jne .L42 + ALIGN_3 + +.L43: + movq K, %rax + andq $3, %rax + BRANCH + jle .L50 + ALIGN_3 + +.L44: + movq 0 * SIZE(B), %mm0 + movq 1 * SIZE(B), %mm1 + + movq %mm0, 0 * SIZE(BO) + movq %mm0, 1 * SIZE(BO) + movq %mm1, 2 * SIZE(BO) + movq %mm1, 3 * SIZE(BO) + + addq $2 * SIZE, B + addq $4 * SIZE, BO + decq %rax + jne .L44 + ALIGN_3 + +.L50: + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L60 + ALIGN_3 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd 0 * SIZE(BO), %xmm1 + pxor %xmm9, %xmm9 + movapd -8 * SIZE(AO), %xmm2 + pxor %xmm12, %xmm12 + movapd 8 * SIZE(BO), %xmm3 + pxor %xmm13, %xmm13 + + movapd 0 * SIZE(AO), %xmm4 + movapd 16 * SIZE(BO), %xmm5 + movapd 8 * SIZE(AO), %xmm6 + movapd 24 * SIZE(BO), %xmm7 + + PREFETCHW 4 * SIZE(CO1) + PREFETCHW 4 * SIZE(CO2) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L55 + ALIGN_3 + +.L52: + mulpd %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd 2 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm8 + movapd 0 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm9 + movapd -14 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd 2 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm12 + movapd 4 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm13 + movapd -12 * SIZE(AO), %xmm0 + + mulpd %xmm0, %xmm1 + mulpd 6 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm8 + movapd 4 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm9 + movapd -10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd 6 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm12 + movapd 32 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm13 + movapd 16 * SIZE(AO), %xmm0 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm2, %xmm3 + mulpd 10 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm8 + movapd 8 * SIZE(BO), %xmm3 + addpd %xmm2, %xmm9 + movapd -6 * SIZE(AO), %xmm2 + mulpd %xmm2, %xmm3 + mulpd 10 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm12 + movapd 12 * SIZE(BO), %xmm3 + addpd %xmm2, %xmm13 + movapd -4 * SIZE(AO), %xmm2 + + mulpd %xmm2, %xmm3 + mulpd 14 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm8 + movapd 12 * SIZE(BO), %xmm3 + addpd %xmm2, %xmm9 + movapd -2 * SIZE(AO), %xmm2 + mulpd %xmm2, %xmm3 + mulpd 14 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm12 + movapd 40 * SIZE(BO), %xmm3 + addpd %xmm2, %xmm13 + movapd 24 * SIZE(AO), %xmm2 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + mulpd %xmm4, %xmm5 + mulpd 18 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm8 + movapd 16 * SIZE(BO), %xmm5 + addpd %xmm4, %xmm9 + movapd 2 * SIZE(AO), %xmm4 + mulpd %xmm4, %xmm5 + mulpd 18 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm12 + movapd 20 * SIZE(BO), %xmm5 + addpd %xmm4, %xmm13 + movapd 4 * SIZE(AO), %xmm4 + + mulpd %xmm4, %xmm5 + mulpd 22 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm8 + movapd 20 * SIZE(BO), %xmm5 + addpd %xmm4, %xmm9 + movapd 6 * SIZE(AO), %xmm4 + mulpd %xmm4, %xmm5 + mulpd 22 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm12 + movapd 48 * SIZE(BO), %xmm5 + 
addpd %xmm4, %xmm13 + movapd 32 * SIZE(AO), %xmm4 + + PREFETCH (PREFETCHSIZE + 24) * SIZE(AO) + mulpd %xmm6, %xmm7 + mulpd 26 * SIZE(BO), %xmm6 + addpd %xmm7, %xmm8 + movapd 24 * SIZE(BO), %xmm7 + addpd %xmm6, %xmm9 + movapd 10 * SIZE(AO), %xmm6 + mulpd %xmm6, %xmm7 + mulpd 26 * SIZE(BO), %xmm6 + addpd %xmm7, %xmm12 + movapd 28 * SIZE(BO), %xmm7 + addpd %xmm6, %xmm13 + movapd 12 * SIZE(AO), %xmm6 + + mulpd %xmm6, %xmm7 + mulpd 30 * SIZE(BO), %xmm6 + addpd %xmm7, %xmm8 + movapd 28 * SIZE(BO), %xmm7 + addpd %xmm6, %xmm9 + movapd 14 * SIZE(AO), %xmm6 + mulpd %xmm6, %xmm7 + mulpd 30 * SIZE(BO), %xmm6 + addpd %xmm7, %xmm12 + movapd 56 * SIZE(BO), %xmm7 + addpd %xmm6, %xmm13 + movapd 40 * SIZE(AO), %xmm6 + + addq $32 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L52 + ALIGN_3 + +.L55: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movapd ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L59 + ALIGN_3 + +.L56: + movapd 0 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + mulpd 2 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm9 + movapd -14 * SIZE(AO), %xmm0 + movapd 0 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm12 + mulpd 2 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm13 + movapd -12 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L56 + ALIGN_3 + +.L59: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 + movsd 0 * SIZE(CO2), %xmm2 + movhpd 1 * SIZE(CO2), %xmm2 + movsd 2 * SIZE(CO2), %xmm3 + movhpd 3 * SIZE(CO2), %xmm3 +#endif + + mulpd %xmm7, %xmm8 + mulpd %xmm7, %xmm9 + mulpd %xmm7, %xmm12 + mulpd %xmm7, %xmm13 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm12 + addpd %xmm2, %xmm9 + addpd %xmm3, %xmm13 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm12, 2 * SIZE(CO1) + movhpd %xmm12, 3 * SIZE(CO1) + movsd %xmm9, 0 * SIZE(CO2) + movhpd %xmm9, 1 * SIZE(CO2) + movsd %xmm13, 2 * SIZE(CO2) + movhpd %xmm13, 3 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + decq I # i -- + jg .L51 + ALIGN_3 + +.L60: + testq $2, M + je .L70 + ALIGN_3 + +.L61: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd 0 * SIZE(BO), %xmm1 + pxor %xmm9, %xmm9 + movapd -8 * SIZE(AO), %xmm2 + pxor %xmm10, %xmm10 + movapd 8 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + + movapd 16 * SIZE(BO), %xmm5 + movapd 24 * SIZE(BO), %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L65 + ALIGN_3 + +.L62: + mulpd %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd 2 * SIZE(BO), %xmm0 + addpd 
%xmm1, %xmm8 + movapd 4 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm9 + movapd -14 * SIZE(AO), %xmm0 + + mulpd %xmm0, %xmm1 + mulpd 6 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm10 + movapd 32 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm11 + movapd -12 * SIZE(AO), %xmm0 + + mulpd %xmm0, %xmm3 + mulpd 10 * SIZE(BO), %xmm0 + addpd %xmm3, %xmm8 + movapd 12 * SIZE(BO), %xmm3 + addpd %xmm0, %xmm9 + movapd -10 * SIZE(AO), %xmm0 + + mulpd %xmm0, %xmm3 + mulpd 14 * SIZE(BO), %xmm0 + addpd %xmm3, %xmm10 + movapd 40 * SIZE(BO), %xmm3 + addpd %xmm0, %xmm11 + movapd 0 * SIZE(AO), %xmm0 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm2, %xmm5 + mulpd 18 * SIZE(BO), %xmm2 + addpd %xmm5, %xmm8 + movapd 20 * SIZE(BO), %xmm5 + addpd %xmm2, %xmm9 + movapd -6 * SIZE(AO), %xmm2 + + mulpd %xmm2, %xmm5 + mulpd 22 * SIZE(BO), %xmm2 + addpd %xmm5, %xmm10 + movapd 48 * SIZE(BO), %xmm5 + addpd %xmm2, %xmm11 + movapd -4 * SIZE(AO), %xmm2 + + mulpd %xmm2, %xmm7 + mulpd 26 * SIZE(BO), %xmm2 + addpd %xmm7, %xmm8 + movapd 28 * SIZE(BO), %xmm7 + addpd %xmm2, %xmm9 + movapd -2 * SIZE(AO), %xmm2 + + mulpd %xmm2, %xmm7 + mulpd 30 * SIZE(BO), %xmm2 + addpd %xmm7, %xmm10 + movapd 56 * SIZE(BO), %xmm7 + addpd %xmm2, %xmm11 + movapd 8 * SIZE(AO), %xmm2 + + addq $16 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L62 + ALIGN_3 + +.L65: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movapd ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L69 + ALIGN_3 + +.L66: + mulpd %xmm0, %xmm1 + mulpd 2 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm8 + movapd 4 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm9 + movapd -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L66 + ALIGN_3 + +.L69: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm2 + movhpd 1 * SIZE(CO2), %xmm2 +#endif + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + + mulpd %xmm7, %xmm8 + mulpd %xmm7, %xmm9 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 + addpd %xmm2, %xmm9 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 0 * SIZE(CO2) + movhpd %xmm9, 1 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 4 + addq $2 * SIZE, CO2 # coffset += 4 + ALIGN_3 + +.L70: + testq $1, M + je .L79 + ALIGN_3 + +.L71: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + movsd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movsd 0 * SIZE(BO), %xmm1 + pxor %xmm9, %xmm9 + movsd -12 * SIZE(AO), %xmm2 + pxor %xmm10, %xmm10 + movsd 8 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + + movsd 16 * SIZE(BO), %xmm5 + movsd 24 * SIZE(BO), %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L75 + ALIGN_3 + +.L72: + mulsd %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * 
SIZE(AO) + mulsd 2 * SIZE(BO), %xmm0 + addsd %xmm1, %xmm8 + movsd 4 * SIZE(BO), %xmm1 + addsd %xmm0, %xmm9 + movsd -15 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm1 + mulsd 6 * SIZE(BO), %xmm0 + addsd %xmm1, %xmm10 + movsd 32 * SIZE(BO), %xmm1 + addsd %xmm0, %xmm11 + movsd -14 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm3 + mulsd 10 * SIZE(BO), %xmm0 + addsd %xmm3, %xmm8 + movsd 12 * SIZE(BO), %xmm3 + addsd %xmm0, %xmm9 + movsd -13 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm3 + mulsd 14 * SIZE(BO), %xmm0 + addsd %xmm3, %xmm10 + movsd 40 * SIZE(BO), %xmm3 + addsd %xmm0, %xmm11 + movsd -8 * SIZE(AO), %xmm0 + + mulsd %xmm2, %xmm5 + mulsd 18 * SIZE(BO), %xmm2 + addsd %xmm5, %xmm8 + movsd 20 * SIZE(BO), %xmm5 + addsd %xmm2, %xmm9 + movsd -11 * SIZE(AO), %xmm2 + + mulsd %xmm2, %xmm5 + mulsd 22 * SIZE(BO), %xmm2 + addsd %xmm5, %xmm10 + movsd 48 * SIZE(BO), %xmm5 + addsd %xmm2, %xmm11 + movsd -10 * SIZE(AO), %xmm2 + + mulsd %xmm2, %xmm7 + mulsd 26 * SIZE(BO), %xmm2 + addsd %xmm7, %xmm8 + movsd 28 * SIZE(BO), %xmm7 + addsd %xmm2, %xmm9 + movsd -9 * SIZE(AO), %xmm2 + + mulsd %xmm2, %xmm7 + mulsd 30 * SIZE(BO), %xmm2 + addsd %xmm7, %xmm10 + movsd 56 * SIZE(BO), %xmm7 + addsd %xmm2, %xmm11 + movsd -4 * SIZE(AO), %xmm2 + + addq $ 8 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L72 + ALIGN_3 + +.L75: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movsd ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_3 + +.L76: + mulsd %xmm0, %xmm1 + mulsd 2 * SIZE(BO), %xmm0 + addsd %xmm1, %xmm8 + addsd %xmm0, %xmm9 + movsd -15 * SIZE(AO), %xmm0 + movsd 4 * SIZE(BO), %xmm1 + + addq $1 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L76 + ALIGN_3 + +.L78: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm2 +#endif + + addsd %xmm10, %xmm8 + addsd %xmm11, %xmm9 + + mulsd %xmm7, %xmm8 + mulsd %xmm7, %xmm9 + +#ifndef TRMMKERNEL + addsd %xmm0, %xmm8 + addsd %xmm2, %xmm9 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movsd %xmm9, 0 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_3 + +.L79: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + leaq (C, LDC, 2), C + ALIGN_3 + +.L80: + testq $1, N + je .L999 + ALIGN_4 + +.L81: +/* Copying to Sub Buffer */ + leaq BUFFER, BO + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq K, %rax + sarq $3, %rax + jle .L83 + ALIGN_3 + +.L82: + PREFETCH 56 * SIZE(B) + + movq 0 * SIZE(B), %mm0 + movq 1 * SIZE(B), %mm1 + movq 2 * SIZE(B), %mm2 + movq 3 * SIZE(B), %mm3 + movq 4 * SIZE(B), %mm4 + movq 5 * SIZE(B), %mm5 + movq 6 * SIZE(B), %mm6 + movq 7 * SIZE(B), %mm7 + + addq $ 8 * SIZE, B + addq $16 * SIZE, BO + + movq %mm0, -16 * SIZE(BO) + movq %mm0, -15 * SIZE(BO) + movq %mm1, -14 * SIZE(BO) + movq %mm1, -13 * SIZE(BO) + movq %mm2, -12 * SIZE(BO) + movq %mm2, -11 * SIZE(BO) + movq %mm3, -10 * SIZE(BO) + movq %mm3, -9 * SIZE(BO) + movq %mm4, -8 * SIZE(BO) + movq %mm4, -7 * SIZE(BO) + movq %mm5, -6 * SIZE(BO) + movq %mm5, -5 * SIZE(BO) + movq %mm6, -4 * SIZE(BO) + movq %mm6, -3 * SIZE(BO) + movq %mm7, -2 * SIZE(BO) + movq %mm7, -1 * SIZE(BO) + + decq %rax + jne .L82 + ALIGN_3 + +.L83: + movq K, %rax + andq $7, %rax + BRANCH + jle .L90 + ALIGN_3 + +.L84: + movq 0 * 
SIZE(B), %mm0 + + movq %mm0, 0 * SIZE(BO) + movq %mm0, 1 * SIZE(BO) + + addq $1 * SIZE, B + addq $2 * SIZE, BO + decq %rax + jne .L84 + ALIGN_3 + +.L90: + movq C, CO1 # coffset1 = c + movq A, AO # aoffset = a + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L100 + ALIGN_3 + +.L91: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd 0 * SIZE(BO), %xmm1 + pxor %xmm9, %xmm9 + movapd -8 * SIZE(AO), %xmm2 + pxor %xmm10, %xmm10 + movapd 8 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + + movapd 0 * SIZE(AO), %xmm4 + movapd 8 * SIZE(AO), %xmm6 + + PREFETCHW 4 * SIZE(CO1) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L95 + ALIGN_3 + +.L92: + mulpd %xmm1, %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd -14 * SIZE(AO), %xmm1 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movapd 2 * SIZE(BO), %xmm1 + mulpd %xmm1, %xmm0 + mulpd -10 * SIZE(AO), %xmm1 + addpd %xmm0, %xmm10 + movapd 16 * SIZE(AO), %xmm0 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + addpd %xmm1, %xmm11 + movapd 4 * SIZE(BO), %xmm1 + mulpd %xmm1, %xmm2 + mulpd -6 * SIZE(AO), %xmm1 + addpd %xmm2, %xmm8 + movapd -4 * SIZE(AO), %xmm2 + addpd %xmm1, %xmm9 + movapd 6 * SIZE(BO), %xmm1 + mulpd %xmm1, %xmm2 + mulpd -2 * SIZE(AO), %xmm1 + addpd %xmm2, %xmm10 + movapd 24 * SIZE(AO), %xmm2 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + addpd %xmm1, %xmm11 + movapd 16 * SIZE(BO), %xmm1 + mulpd %xmm3, %xmm4 + mulpd 2 * SIZE(AO), %xmm3 + addpd %xmm4, %xmm8 + movapd 4 * SIZE(AO), %xmm4 + addpd %xmm3, %xmm9 + movapd 10 * SIZE(BO), %xmm3 + mulpd %xmm3, %xmm4 + mulpd 6 * SIZE(AO), %xmm3 + addpd %xmm4, %xmm10 + movapd 32 * SIZE(AO), %xmm4 + PREFETCH (PREFETCHSIZE + 24) * SIZE(AO) + addpd %xmm3, %xmm11 + movapd 12 * SIZE(BO), %xmm3 + mulpd %xmm3, %xmm6 + mulpd 10 * SIZE(AO), %xmm3 + addpd %xmm6, %xmm8 + movapd 12 * SIZE(AO), %xmm6 + addpd %xmm3, %xmm9 + movapd 14 * SIZE(BO), %xmm3 + mulpd %xmm3, %xmm6 + mulpd 14 * SIZE(AO), %xmm3 + addpd %xmm6, %xmm10 + movapd 40 * SIZE(AO), %xmm6 + addpd %xmm3, %xmm11 + movapd 24 * SIZE(BO), %xmm3 + + addq $32 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L92 + ALIGN_3 + +.L95: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movapd ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L99 + ALIGN_3 + +.L96: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO), %xmm1 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movapd 2 * SIZE(BO), %xmm1 + + addq $4 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L96 + ALIGN_3 + +.L99: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 +#endif + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + + mulpd %xmm7, %xmm8 + mulpd %xmm7, %xmm9 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm9 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) + +#if 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L91 + ALIGN_3 + +.L100: + testq $2, M + je .L110 + ALIGN_3 + +.L101: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd 0 * SIZE(BO), %xmm1 + pxor %xmm9, %xmm9 + movapd -8 * SIZE(AO), %xmm2 + pxor %xmm10, %xmm10 + movapd 8 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L105 + ALIGN_3 + +.L102: + mulpd %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movapd -14 * SIZE(AO), %xmm0 + mulpd 2 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm8 + movapd 16 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm9 + movapd -12 * SIZE(AO), %xmm0 + mulpd 4 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm10 + movapd -10 * SIZE(AO), %xmm0 + mulpd 6 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm11 + movapd 0 * SIZE(AO), %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd %xmm2, %xmm3 + movapd -6 * SIZE(AO), %xmm2 + mulpd 10 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm8 + movapd 24 * SIZE(BO), %xmm3 + addpd %xmm2, %xmm9 + movapd -4 * SIZE(AO), %xmm2 + mulpd 12 * SIZE(BO), %xmm2 + addpd %xmm2, %xmm10 + movapd -2 * SIZE(AO), %xmm2 + mulpd 14 * SIZE(BO), %xmm2 + addpd %xmm2, %xmm11 + movapd 8 * SIZE(AO), %xmm2 + + addq $16 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L102 + ALIGN_3 + +.L105: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movapd ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L109 + ALIGN_3 + +.L106: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movapd -14 * SIZE(AO), %xmm0 + movapd 2 * SIZE(BO), %xmm1 + + addq $2 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L106 + ALIGN_3 + +.L109: + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 + addpd %xmm10, %xmm8 + + mulpd %xmm7, %xmm8 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + + addpd %xmm0, %xmm8 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + addq $2 * SIZE, CO1 # coffset += 4 + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + ALIGN_3 + +.L110: + testq $1, M + je .L999 + ALIGN_3 + +.L111: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + + movsd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movsd 0 * 
SIZE(BO), %xmm1 + pxor %xmm9, %xmm9 + movsd -12 * SIZE(AO), %xmm2 + pxor %xmm10, %xmm10 + movsd 8 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L115 + ALIGN_3 + +.L112: + mulsd %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movsd -15 * SIZE(AO), %xmm0 + addsd %xmm1, %xmm8 + movsd 16 * SIZE(BO), %xmm1 + mulsd 2 * SIZE(BO), %xmm0 + addsd %xmm0, %xmm9 + movsd -14 * SIZE(AO), %xmm0 + mulsd 4 * SIZE(BO), %xmm0 + addsd %xmm0, %xmm10 + movsd -13 * SIZE(AO), %xmm0 + mulsd 6 * SIZE(BO), %xmm0 + addsd %xmm0, %xmm11 + movsd -8 * SIZE(AO), %xmm0 + mulsd %xmm2, %xmm3 + movsd -11 * SIZE(AO), %xmm2 + addsd %xmm3, %xmm8 + movsd 24 * SIZE(BO), %xmm3 + mulsd 10 * SIZE(BO), %xmm2 + addsd %xmm2, %xmm9 + movsd -10 * SIZE(AO), %xmm2 + mulsd 12 * SIZE(BO), %xmm2 + addsd %xmm2, %xmm10 + movsd -9 * SIZE(AO), %xmm2 + mulsd 14 * SIZE(BO), %xmm2 + addsd %xmm2, %xmm11 + movsd -4 * SIZE(AO), %xmm2 + + addq $ 8 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L112 + ALIGN_3 + +.L115: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movsd ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_3 + +.L116: + mulsd %xmm0, %xmm1 + movsd -15 * SIZE(AO), %xmm0 + addsd %xmm1, %xmm8 + movsd 2 * SIZE(BO), %xmm1 + + addq $1 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L116 + ALIGN_3 + +.L118: + addsd %xmm10, %xmm8 + addsd %xmm11, %xmm9 + addsd %xmm9, %xmm8 + + mulsd %xmm7, %xmm8 +#ifndef TRMMKERNEL + addsd 0 * SIZE(CO1), %xmm8 +#endif + movsd %xmm8, 0 * SIZE(CO1) + ALIGN_3 + +.L999: + movq %rbx, %rsp + + EMMS + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/gemm_kernel_4x4_sse3.S b/kernel/x86_64/gemm_kernel_4x4_sse3.S new file mode 100644 index 0000000000..8cbe6ed168 --- /dev/null +++ b/kernel/x86_64/gemm_kernel_4x4_sse3.S @@ -0,0 +1,2561 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %rdi +#define N %rsi +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define J %r12 +#define AO %r13 +#define BO %r14 +#define CO1 %r15 +#define CO2 %rbx +#define BB %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KKK 64(%rsp) +#define KK 72(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#define ALPHA 224(%rsp) +#define OFFSET 232(%rsp) +#define KK 240(%rsp) +#define KKK 248(%rsp) + +#endif + +#define PREFETCH prefetcht1 +#define PREFETCHSIZE (16 * 12 + 3) +#define PREFETCH_R (4 * 4 + 0) + +#define KERNEL1(address) \ + mulpd %xmm8, %xmm9 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 2 * SIZE(AO);\ + addpd %xmm9, %xmm0;\ + movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm1;\ + movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm2;\ + movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 2 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + addpd %xmm9, %xmm3;\ + movddup 0 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL2(address) \ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm4;\ + movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm5;\ + movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm6;\ + movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 4 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + addpd %xmm9, %xmm7;\ + movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL3(address) \ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm0;\ + movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm1;\ + movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm2;\ + movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 6 * SIZE + 
(address) * 2 * SIZE(AO), %xmm8;\ + addpd %xmm9, %xmm3;\ + movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL4(address) \ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm4;\ + movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm5;\ + movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm6;\ + movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 32 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + addpd %xmm9, %xmm7;\ + movddup 32 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL5(address) \ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm0;\ + movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm1;\ + movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm2;\ + movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 10 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + addpd %xmm11, %xmm3;\ + movddup 8 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL6(address) \ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm4;\ + movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm5;\ + movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm6;\ + movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 12 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + addpd %xmm11, %xmm7;\ + movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL7(address) \ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm0;\ + movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm1;\ + movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm2;\ + movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 14 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + addpd %xmm11, %xmm3;\ + movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL8(address) \ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm4;\ + movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm5;\ + movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm6;\ + movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 40 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + addpd %xmm11, %xmm7;\ + movddup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL9(address) \ + mulpd %xmm12, %xmm13;\ + PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 2 * SIZE(AO);\ + addpd %xmm13, %xmm0;\ + movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm1;\ + movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm2;\ + movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 18 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + addpd %xmm13, %xmm3;\ + movddup 16 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL10(address) \ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm4;\ + movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm5;\ + movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm6;\ + movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + 
mulpd %xmm12, %xmm13;\ + movapd 20 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + addpd %xmm13, %xmm7;\ + movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL11(address) \ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm0;\ + movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm1;\ + movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm2;\ + movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 22 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + addpd %xmm13, %xmm3;\ + movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL12(address) \ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm4;\ + movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm5;\ + movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm6;\ + movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 48 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + addpd %xmm13, %xmm7;\ + movddup 48 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL13(address) \ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm0;\ + movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm1;\ + movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm2;\ + movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 26 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + addpd %xmm15, %xmm3;\ + movddup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL14(address) \ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm4;\ + movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm5;\ + movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm6;\ + movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 28 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + addpd %xmm15, %xmm7;\ + movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL15(address) \ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm0;\ + movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm1;\ + movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm2;\ + movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 30 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + addpd %xmm15, %xmm3;\ + movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL16(address) \ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm4;\ + movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm5;\ + movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm6;\ + movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 56 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + addpd %xmm15, %xmm7;\ + movddup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 
112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, M + movq ARG2, N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm4 +#endif + movaps %xmm3, %xmm0 + +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm4 +#endif + +#endif + + movsd %xmm0, ALPHA + +#ifdef TRMMKERNEL + movsd %xmm4, OFFSET + movsd %xmm4, KK +#ifndef LEFT + negq KK +#endif +#endif + + leaq (, LDC, SIZE), LDC + + movq N, J + sarq $2, J # j = (n >> 2) + jle .L40 + ALIGN_4 + +.L10: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + movq K, %rax + salq $BASE_SHIFT + 2, %rax + leaq (B, %rax), BB + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + + movapd 16 * SIZE(AO), %xmm12 + pxor %xmm4, %xmm4 + movddup 16 * SIZE(BO), %xmm13 + pxor %xmm5, %xmm5 + movapd 24 * SIZE(AO), %xmm14 + pxor %xmm6, %xmm6 + movddup 24 * SIZE(BO), %xmm15 + pxor %xmm7, %xmm7 + + prefetchnta 3 * SIZE(CO1) + prefetchnta 3 * SIZE(CO2) + prefetchnta 3 * SIZE(CO1, LDC, 2) + prefetchnta 3 * SIZE(CO2, LDC, 2) + + prefetcht0 0 * SIZE(BB) + subq $-8 * SIZE, BB + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + +#if 1 + andq $-8, %rax + salq $4, %rax + NOBRANCH + je .L15 + +.L1X: + KERNEL1 (16 * 0) + KERNEL2 (16 * 0) + KERNEL3 (16 * 0) + KERNEL4 (16 * 0) + KERNEL5 (16 * 0) + KERNEL6 (16 * 0) + KERNEL7 (16 * 0) + KERNEL8 (16 * 0) + KERNEL9 (16 * 0) + KERNEL10(16 * 0) + KERNEL11(16 * 0) + KERNEL12(16 * 0) + KERNEL13(16 * 0) + KERNEL14(16 * 0) + KERNEL15(16 * 0) + KERNEL16(16 * 0) + cmpq $128 * 1, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 1) + KERNEL2 (16 * 1) + KERNEL3 (16 * 1) + KERNEL4 (16 * 1) + KERNEL5 (16 * 1) + KERNEL6 (16 * 1) + KERNEL7 (16 * 1) + KERNEL8 (16 * 1) + KERNEL9 (16 * 1) + KERNEL10(16 * 1) + KERNEL11(16 * 1) + KERNEL12(16 * 1) + KERNEL13(16 * 1) + KERNEL14(16 * 1) + KERNEL15(16 * 1) + KERNEL16(16 * 1) + cmpq $128 * 2, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 2) + KERNEL2 (16 * 2) + KERNEL3 (16 * 2) + KERNEL4 (16 * 2) + KERNEL5 (16 * 2) + KERNEL6 (16 * 2) + KERNEL7 (16 * 2) + KERNEL8 (16 * 2) + KERNEL9 (16 * 2) + KERNEL10(16 * 2) + KERNEL11(16 * 2) + KERNEL12(16 * 2) + KERNEL13(16 * 2) + KERNEL14(16 * 2) + KERNEL15(16 * 2) + KERNEL16(16 * 2) + cmpq $128 * 3, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 3) + KERNEL2 (16 * 3) + KERNEL3 (16 * 3) + KERNEL4 (16 * 3) + KERNEL5 (16 * 3) + KERNEL6 (16 * 3) + KERNEL7 (16 * 3) + KERNEL8 (16 * 3) + KERNEL9 (16 * 3) + KERNEL10(16 * 3) + KERNEL11(16 * 3) + KERNEL12(16 * 3) + KERNEL13(16 * 3) + KERNEL14(16 * 3) + KERNEL15(16 * 3) + 
KERNEL16(16 * 3) + cmpq $128 * 4, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 4) + KERNEL2 (16 * 4) + KERNEL3 (16 * 4) + KERNEL4 (16 * 4) + KERNEL5 (16 * 4) + KERNEL6 (16 * 4) + KERNEL7 (16 * 4) + KERNEL8 (16 * 4) + KERNEL9 (16 * 4) + KERNEL10(16 * 4) + KERNEL11(16 * 4) + KERNEL12(16 * 4) + KERNEL13(16 * 4) + KERNEL14(16 * 4) + KERNEL15(16 * 4) + KERNEL16(16 * 4) + cmpq $128 * 5, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 5) + KERNEL2 (16 * 5) + KERNEL3 (16 * 5) + KERNEL4 (16 * 5) + KERNEL5 (16 * 5) + KERNEL6 (16 * 5) + KERNEL7 (16 * 5) + KERNEL8 (16 * 5) + KERNEL9 (16 * 5) + KERNEL10(16 * 5) + KERNEL11(16 * 5) + KERNEL12(16 * 5) + KERNEL13(16 * 5) + KERNEL14(16 * 5) + KERNEL15(16 * 5) + KERNEL16(16 * 5) + cmpq $128 * 6, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 6) + KERNEL2 (16 * 6) + KERNEL3 (16 * 6) + KERNEL4 (16 * 6) + KERNEL5 (16 * 6) + KERNEL6 (16 * 6) + KERNEL7 (16 * 6) + KERNEL8 (16 * 6) + KERNEL9 (16 * 6) + KERNEL10(16 * 6) + KERNEL11(16 * 6) + KERNEL12(16 * 6) + KERNEL13(16 * 6) + KERNEL14(16 * 6) + KERNEL15(16 * 6) + KERNEL16(16 * 6) + cmpq $128 * 7, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 7) + KERNEL2 (16 * 7) + KERNEL3 (16 * 7) + KERNEL4 (16 * 7) + KERNEL5 (16 * 7) + KERNEL6 (16 * 7) + KERNEL7 (16 * 7) + KERNEL8 (16 * 7) + KERNEL9 (16 * 7) + KERNEL10(16 * 7) + KERNEL11(16 * 7) + KERNEL12(16 * 7) + KERNEL13(16 * 7) + KERNEL14(16 * 7) + KERNEL15(16 * 7) + KERNEL16(16 * 7) + + addq $32 * 8 * SIZE, AO + addq $32 * 8 * SIZE, BO + subq $128 * 8, %rax + BRANCH + jg .L1X + +.L12: + leaq (AO, %rax, 2), AO # * 16 + leaq (BO, %rax, 2), BO # * 64 + +#else + sarq $3, %rax + je .L15 + ALIGN_4 + +.L12: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm5 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm6 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm7 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 6 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm5 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm6 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 32 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm7 + + movddup 32 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 10 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + + movddup 8 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm4 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm5 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm6 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 12 * 
SIZE(AO), %xmm10 + addpd %xmm11, %xmm7 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 14 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm4 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm5 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm6 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 40 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm7 + movddup 40 * SIZE(BO), %xmm11 + + mulpd %xmm12, %xmm13 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + addpd %xmm13, %xmm0 + movddup 17 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm1 + movddup 18 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm2 + movddup 19 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + movapd 18 * SIZE(AO), %xmm12 + addpd %xmm13, %xmm3 + + movddup 16 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm4 + movddup 17 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm5 + movddup 18 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm6 + movddup 19 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + movapd 20 * SIZE(AO), %xmm12 + addpd %xmm13, %xmm7 + + movddup 20 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm0 + movddup 21 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm1 + movddup 22 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm2 + movddup 23 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + movapd 22 * SIZE(AO), %xmm12 + addpd %xmm13, %xmm3 + + movddup 20 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm4 + movddup 21 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm5 + movddup 22 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm6 + movddup 23 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + movapd 48 * SIZE(AO), %xmm12 + addpd %xmm13, %xmm7 + movddup 48 * SIZE(BO), %xmm13 + + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm0 + movddup 25 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm1 + movddup 26 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm2 + movddup 27 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + movapd 26 * SIZE(AO), %xmm14 + addpd %xmm15, %xmm3 + + movddup 24 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm4 + movddup 25 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm5 + movddup 26 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm6 + movddup 27 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + movapd 28 * SIZE(AO), %xmm14 + addpd %xmm15, %xmm7 + + movddup 28 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm0 + movddup 29 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm1 + movddup 30 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm2 + movddup 31 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + movapd 30 * SIZE(AO), %xmm14 + addpd %xmm15, %xmm3 + + movddup 28 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm4 + movddup 29 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm5 + movddup 30 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm6 + movddup 31 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + movapd 56 * SIZE(AO), %xmm14 + addpd %xmm15, %xmm7 + movddup 56 * SIZE(BO), %xmm15 + + addq $32 * SIZE, BO + addq 
$32 * SIZE, AO + decq %rax + BRANCH + jne .L12 +#endif + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movddup ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + BRANCH + je .L19 + ALIGN_4 + +.L16: + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 0 * SIZE(BO), %xmm11 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm4 + movddup 1 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm5 + movddup 2 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm6 + movddup 3 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm7 + + addq $4 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + BRANCH + jg .L16 + ALIGN_4 + +.L19: + mulpd %xmm15, %xmm0 + mulpd %xmm15, %xmm4 + mulpd %xmm15, %xmm1 + mulpd %xmm15, %xmm5 + + testq $15, CO1 + NOBRANCH + jne .L19x + testq $15, LDC + NOBRANCH + jne .L19x + + mulpd %xmm15, %xmm2 + mulpd %xmm15, %xmm3 + mulpd %xmm15, %xmm6 + mulpd %xmm15, %xmm7 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addpd 0 * SIZE(CO1), %xmm0 + addpd 2 * SIZE(CO1), %xmm4 + addpd 0 * SIZE(CO2), %xmm1 + addpd 2 * SIZE(CO2), %xmm5 + + addpd 0 * SIZE(CO1, LDC, 2), %xmm2 + addpd 2 * SIZE(CO1, LDC, 2), %xmm6 + addpd 0 * SIZE(CO2, LDC, 2), %xmm3 + addpd 2 * SIZE(CO2, LDC, 2), %xmm7 +#endif + + movapd %xmm0, 0 * SIZE(CO1) + movapd %xmm4, 2 * SIZE(CO1) + movapd %xmm1, 0 * SIZE(CO2) + movapd %xmm5, 2 * SIZE(CO2) + + movapd %xmm2, 0 * SIZE(CO1, LDC, 2) + movapd %xmm6, 2 * SIZE(CO1, LDC, 2) + movapd %xmm3, 0 * SIZE(CO2, LDC, 2) + movapd %xmm7, 2 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + + decq I # i -- + jg .L11 + jmp .L20 + ALIGN_4 + +.L19x: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm8 + movhpd 1 * SIZE(CO1), %xmm8 + movsd 2 * SIZE(CO1), %xmm9 + movhpd 3 * SIZE(CO1), %xmm9 + + movsd 0 * SIZE(CO2), %xmm10 + movhpd 1 * SIZE(CO2), %xmm10 + movsd 2 * SIZE(CO2), %xmm11 + movhpd 3 * SIZE(CO2), %xmm11 + + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm4 + addpd %xmm10, %xmm1 + addpd %xmm11, %xmm5 +#endif + + mulpd %xmm15, %xmm2 + mulpd %xmm15, %xmm3 + mulpd %xmm15, %xmm6 + mulpd %xmm15, %xmm7 + +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1, LDC, 2), %xmm12 + movhpd 1 * SIZE(CO1, LDC, 2), %xmm12 + movsd 2 * SIZE(CO1, LDC, 2), %xmm13 + movhpd 3 * SIZE(CO1, LDC, 2), %xmm13 + + movsd 0 * SIZE(CO2, LDC, 2), %xmm14 + movhpd 1 * SIZE(CO2, LDC, 2), %xmm14 + movsd 2 * SIZE(CO2, LDC, 2), %xmm15 + movhpd 3 * SIZE(CO2, LDC, 2), %xmm15 + + addpd %xmm12, %xmm2 + addpd %xmm13, %xmm6 + addpd %xmm14, %xmm3 + addpd %xmm15, %xmm7 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm4, 2 * SIZE(CO1) + movhpd %xmm4, 3 * SIZE(CO1) + + movsd %xmm1, 0 * SIZE(CO2) + movhpd %xmm1, 1 * SIZE(CO2) + movsd %xmm5, 2 * SIZE(CO2) + movhpd %xmm5, 3 * SIZE(CO2) + + movsd %xmm2, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm2, 1 * SIZE(CO1, LDC, 2) + movsd %xmm6, 2 * SIZE(CO1, LDC, 2) + movhpd %xmm6, 3 * SIZE(CO1, LDC, 2) + + movsd %xmm3, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm3, 1 * SIZE(CO2, LDC, 2) + movsd %xmm7, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + + decq I # i -- + jg .L11 + ALIGN_4 + +.L20: + testq $2, M + BRANCH + je .L30 + ALIGN_4 + +.L21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L25 + ALIGN_4 + +.L22: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 16 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm2 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 6 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm2 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 
16 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm3 + movddup 24 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movddup 17 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm1 + movddup 18 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm2 + movddup 19 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 10 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm3 + movddup 20 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movddup 21 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm1 + movddup 22 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm2 + movddup 23 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 12 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm3 + movddup 32 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 25 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm1 + movddup 26 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 27 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 14 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + movddup 28 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 29 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm1 + movddup 30 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 31 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 24 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + movddup 40 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movddup ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L29 + ALIGN_4 + +.L26: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L26 + ALIGN_4 + +.L29: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm8 + movhpd 1 * SIZE(CO1), %xmm8 + movsd 0 * SIZE(CO2), %xmm10 + movhpd 1 * SIZE(CO2), %xmm10 + movsd 0 * SIZE(CO1, LDC, 2), %xmm12 + movhpd 1 * SIZE(CO1, LDC, 2), %xmm12 + movsd 0 * SIZE(CO2, LDC, 2), %xmm14 + movhpd 1 * SIZE(CO2, LDC, 2), %xmm14 +#endif + + mulpd %xmm15, %xmm0 + mulpd %xmm15, %xmm1 + mulpd %xmm15, %xmm2 + mulpd %xmm15, %xmm3 + +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + addpd %xmm8, %xmm0 + addpd %xmm10, %xmm1 + addpd %xmm12, %xmm2 + addpd %xmm14, %xmm3 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO2) + movhpd %xmm1, 1 * SIZE(CO2) + movsd %xmm2, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm2, 1 * SIZE(CO1, LDC, 2) + movsd %xmm3, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm3, 1 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 4 + addq $2 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L30: + testq $1, M + je .L39 + ALIGN_4 + +.L31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#endif + + movddup 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movddup 4 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movapd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L35 + ALIGN_4 + +.L32: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm9, %xmm0 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 1 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movapd 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movapd 16 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movapd 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movddup 3 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movapd 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movapd 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movddup 8 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movapd 24 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movapd 18 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movddup 5 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movapd 20 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movapd 22 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movddup 6 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movapd 32 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movapd 26 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movddup 7 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movapd 28 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movapd 30 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movddup 12 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movapd 40 * SIZE(BO), %xmm11 + + addq $ 8 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movddup ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 1 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movapd 4 * SIZE(BO), %xmm9 + + 
addq $1 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L36 + ALIGN_4 + +.L38: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm8 + movhpd 0 * SIZE(CO2), %xmm8 + movsd 0 * SIZE(CO1, LDC, 2), %xmm9 + movhpd 0 * SIZE(CO2, LDC, 2), %xmm9 +#endif + mulpd %xmm15, %xmm0 + mulpd %xmm15, %xmm1 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 0 * SIZE(CO2) + movsd %xmm1, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm1, 0 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + + leaq (C, LDC, 4), C # c += 4 * ldc + movq BO, B + decq J # j -- + jg .L10 + ALIGN_4 + +.L40: + testq $2, N + je .L80 + ALIGN_4 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + movq K, %rax + salq $BASE_SHIFT + 1, %rax + leaq (B, %rax), BB + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + + prefetcht0 0 * SIZE(BB) + subq $-4 * SIZE, BB + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm4, %xmm4 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm5, %xmm5 + +#ifdef HAVE_3DNOW + prefetchw 4 * SIZE(CO1) + prefetchw 4 * SIZE(CO2) +#else + prefetchnta 4 * SIZE(CO1) + prefetchnta 4 * SIZE(CO2) +#endif + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L55 + ALIGN_4 + +.L52: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm5 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 6 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 16 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm5 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 10 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm4 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 12 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm5 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + 
addpd %xmm9, %xmm0 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 14 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm4 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 40 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm5 + movddup 16 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm11 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + addpd %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 18 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movddup 8 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm4 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 20 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm5 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 22 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm4 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 24 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm5 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 26 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm4 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 28 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm5 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 30 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm4 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 32 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm5 + movddup 24 * SIZE(BO), %xmm11 + + addq $32 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L52 + ALIGN_4 + +.L55: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movddup ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L59 + ALIGN_4 + +.L56: + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 0 * SIZE(BO), %xmm11 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm4 + movddup 1 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm5 + + addq $4 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L56 + ALIGN_4 + +.L59: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm8 + movhpd 1 * SIZE(CO1), %xmm8 + movsd 2 * SIZE(CO1), %xmm9 + movhpd 3 * SIZE(CO1), %xmm9 + movsd 0 * SIZE(CO2), %xmm10 + movhpd 1 * SIZE(CO2), %xmm10 + movsd 2 * SIZE(CO2), %xmm11 + movhpd 3 * SIZE(CO2), %xmm11 +#endif + + mulpd %xmm15, %xmm0 + mulpd %xmm15, %xmm1 + mulpd %xmm15, %xmm4 + mulpd %xmm15, %xmm5 + +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm4 + addpd %xmm10, %xmm1 + addpd %xmm11, %xmm5 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm4, 2 * SIZE(CO1) + movhpd %xmm4, 3 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO2) + movhpd %xmm1, 1 * SIZE(CO2) + movsd %xmm5, 2 * SIZE(CO2) + movhpd %xmm5, 3 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + decq I # i -- + jg .L51 + ALIGN_4 + +.L60: + testq $2, M + je .L70 + ALIGN_4 + +.L61: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L65 + ALIGN_4 + +.L62: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 6 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 16 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 16 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 10 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 12 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 14 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 24 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + movddup 24 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L62 + ALIGN_4 + +.L65: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movddup ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L69 + ALIGN_4 + +.L66: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + + 
addq $2 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L66 + ALIGN_4 + +.L69: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm8 + movhpd 1 * SIZE(CO1), %xmm8 + movsd 0 * SIZE(CO2), %xmm10 + movhpd 1 * SIZE(CO2), %xmm10 +#endif + + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + + mulpd %xmm15, %xmm0 + mulpd %xmm15, %xmm1 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addpd %xmm8, %xmm0 + addpd %xmm10, %xmm1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO2) + movhpd %xmm1, 1 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + addq $2 * SIZE, CO1 # coffset += 4 + addq $2 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L70: + testq $1, M + je .L79 + ALIGN_4 + +.L71: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + + movddup 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movddup 4 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movapd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L75 + ALIGN_4 + +.L72: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movddup 1 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm0 + mulpd 2 * SIZE(BO), %xmm8 + movapd 16 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm1 + movddup 2 * SIZE(AO), %xmm8 + mulpd 4 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm2 + movddup 3 * SIZE(AO), %xmm8 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm3 + movddup 8 * SIZE(AO), %xmm8 + mulpd %xmm10, %xmm11 + movddup 5 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm0 + mulpd 10 * SIZE(BO), %xmm10 + movapd 24 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm1 + movddup 6 * SIZE(AO), %xmm10 + mulpd 12 * SIZE(BO), %xmm10 + addpd %xmm10, %xmm2 + movddup 7 * SIZE(AO), %xmm10 + mulpd 14 * SIZE(BO), %xmm10 + addpd %xmm10, %xmm3 + movddup 12 * SIZE(AO), %xmm10 + + addq $ 8 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L72 + ALIGN_4 + +.L75: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movddup ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulpd %xmm8, %xmm9 + movddup 1 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm0 + movapd 2 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L76 + ALIGN_4 + +.L78: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm8 + movhpd 0 * SIZE(CO2), %xmm8 +#endif + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + + mulpd %xmm15, %xmm0 +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + addpd %xmm8, %xmm0 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 0 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L79: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + leaq (C, LDC, 2), C + movq BO, B + ALIGN_4 + +.L80: + testq $1, N + je .L999 + ALIGN_4 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + movq A, AO + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L100 + ALIGN_4 + +.L91: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 4 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#ifdef HAVE_3DNOW + prefetchw 4 * SIZE(CO1) +#else + prefetchnta 4 * SIZE(CO1) +#endif + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L95 + ALIGN_4 + +.L92: + mulpd %xmm9, %xmm8 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd 2 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm0 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm8 + mulpd 6 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm2 + movapd 16 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm10 + mulpd 10 * SIZE(AO), %xmm9 + addpd %xmm10, %xmm0 + movapd 12 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm10 + mulpd 14 * SIZE(AO), %xmm9 + addpd %xmm10, %xmm2 + movapd 24 * SIZE(AO), %xmm10 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + addpd %xmm9, %xmm3 + movddup 8 * SIZE(BO), %xmm9 + mulpd %xmm11, %xmm8 + mulpd 18 * SIZE(AO), %xmm11 + addpd %xmm8, %xmm0 + movapd 20 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movddup 5 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm8 + mulpd 22 * SIZE(AO), %xmm11 + addpd %xmm8, %xmm2 + movapd 32 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm3 + movddup 6 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm10 + mulpd 26 * SIZE(AO), %xmm11 + addpd %xmm10, %xmm0 + movapd 28 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movddup 7 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm10 + mulpd 30 * SIZE(AO), %xmm11 + addpd %xmm10, %xmm2 + movapd 40 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + + addq $32 * SIZE, AO + addq $8 * SIZE, BO + decq %rax + jne .L92 + ALIGN_4 + +.L95: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movddup ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L99 + ALIGN_4 + +.L96: + mulpd %xmm9, %xmm8 + mulpd 2 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm0 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 1 * SIZE(BO), %xmm9 + + addq $4 * SIZE, AO # aoffset += 4 + addq $1 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L96 + ALIGN_4 + +.L99: +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm8 + movhpd 1 * SIZE(CO1), %xmm8 + movsd 2 * SIZE(CO1), %xmm9 + movhpd 3 * SIZE(CO1), %xmm9 +#endif + + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + + mulpd %xmm15, %xmm0 + mulpd %xmm15, %xmm1 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L91 + ALIGN_4 + +.L100: + testq $2, M + je .L110 + ALIGN_4 + +.L101: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 4 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L105 + ALIGN_4 + +.L102: + mulpd %xmm9, %xmm8 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movddup 1 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm0 + mulpd 2 * SIZE(AO), %xmm9 + movapd 16 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd 4 * SIZE(AO), %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd 6 * SIZE(AO), %xmm9 + addpd %xmm9, %xmm3 + movddup 8 * SIZE(BO), %xmm9 + mulpd %xmm11, %xmm10 + movddup 5 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm0 + mulpd 10 * SIZE(AO), %xmm11 + movapd 24 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movddup 6 * SIZE(BO), %xmm11 + mulpd 12 * SIZE(AO), %xmm11 + addpd %xmm11, %xmm2 + movddup 7 * SIZE(BO), %xmm11 + mulpd 14 * SIZE(AO), %xmm11 + addpd %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $ 8 * SIZE, BO + decq %rax + jne .L102 + ALIGN_4 + +.L105: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movddup ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L109 + ALIGN_4 + +.L106: + mulpd %xmm9, %xmm8 + movddup 1 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm0 + movapd 2 * SIZE(AO), %xmm8 + + addq $2 * SIZE, AO # aoffset += 4 + addq $1 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L106 + ALIGN_4 + +.L109: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm8 + movhpd 1 * SIZE(CO1), %xmm8 +#endif + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + + mulpd %xmm15, %xmm0 +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + addpd %xmm8, %xmm0 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 4 + ALIGN_4 + +.L110: + testq $1, M + je .L999 + ALIGN_4 + +.L111: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + + movsd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movsd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movsd 4 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movsd 4 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + + movapd 0 * SIZE(AO), %xmm9 + movapd 0 * SIZE(BO), %xmm8 + movapd 4 * SIZE(AO), %xmm11 + movapd 4 * SIZE(BO), %xmm10 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L115 + ALIGN_4 + +.L112: + mulpd %xmm9, %xmm8 + movapd 2 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm0 + mulpd 2 * SIZE(BO), %xmm9 + movapd 8 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm1 + movapd 8 * SIZE(AO), %xmm9 + mulpd %xmm11, %xmm10 + movapd 6 * SIZE(AO), %xmm11 + addpd %xmm10, %xmm0 + mulpd 6 * SIZE(BO), %xmm11 + movapd 12 * SIZE(BO), %xmm10 + addpd %xmm11, %xmm1 + movapd 12 * SIZE(AO), %xmm11 + + addq $8 * SIZE, AO + addq $8 * SIZE, BO + decq %rax + jne .L112 + ALIGN_4 + +.L115: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movddup ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulsd 0 * SIZE(BO), %xmm9 + addsd %xmm9, %xmm0 + movsd 1 * SIZE(AO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $1 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L116 + ALIGN_4 + +.L118: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm8 +#endif + + addpd %xmm1, %xmm0 + haddpd %xmm0, %xmm0 + mulsd %xmm15, %xmm0 +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addsd %xmm8, %xmm0 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/gemm_kernel_4x8_nano.S b/kernel/x86_64/gemm_kernel_4x8_nano.S new file mode 100644 index 0000000000..4d814053f5 --- /dev/null +++ b/kernel/x86_64/gemm_kernel_4x8_nano.S @@ -0,0 +1,2479 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi + +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define CO2 %rbp +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define ALPHA 0(%rsp) +#define J 16(%rsp) +#define OFFSET 24(%rsp) +#define KK 32(%rsp) +#define KKK 40(%rsp) +#define BUFFER 256(%rsp) + +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define RPREFETCHSIZE (16 * 4) +#define PREFETCHSIZE (16 * 8 + 8) + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm4 +#endif + movaps %xmm3, %xmm0 + +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm4 +#endif + +#endif + + movq %rsp, %rbx # save old stack + subq 
$128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movq OLD_M, M + movq OLD_N, N + + shufps $0, %xmm0, %xmm0 + movaps %xmm0, ALPHA + +#ifdef TRMMKERNEL + movsd %xmm4, OFFSET + movsd %xmm4, KK +#ifndef LEFT + negq KK +#endif +#endif + + subq $-32 * SIZE, A + + salq $BASE_SHIFT, LDC + + movq N, J + sarq $3, J + jle .L40 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + leaq 32 * SIZE + BUFFER, BO + + movaps 0 * SIZE(B), %xmm1 + movaps 4 * SIZE(B), %xmm3 + movaps 8 * SIZE(B), %xmm5 + movaps 12 * SIZE(B), %xmm7 + + movq K, %rax + sarq $1, %rax + jle .L03 + ALIGN_4 + +.L02: + PREFETCH (RPREFETCHSIZE + 0) * SIZE(B) + + pshufd $0x50, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(BO) + pshufd $0xfa, %xmm1, %xmm1 + movaps %xmm1, -28 * SIZE(BO) + + movaps 16 * SIZE(B), %xmm1 + + pshufd $0x50, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(BO) + pshufd $0xfa, %xmm3, %xmm3 + movaps %xmm3, -20 * SIZE(BO) + + movaps 20 * SIZE(B), %xmm3 + + pshufd $0x50, %xmm5, %xmm4 + movaps %xmm4, -16 * SIZE(BO) + pshufd $0xfa, %xmm5, %xmm5 + movaps %xmm5, -12 * SIZE(BO) + + movaps 24 * SIZE(B), %xmm5 + + pshufd $0x50, %xmm7, %xmm6 + movaps %xmm6, -8 * SIZE(BO) + pshufd $0xfa, %xmm7, %xmm7 + movaps %xmm7, -4 * SIZE(BO) + + movaps 28 * SIZE(B), %xmm7 + + addq $16 * SIZE, B + addq $32 * SIZE, BO + + decq %rax + jne .L02 + ALIGN_4 + +.L03: + movq K, %rax + andq $1, %rax + BRANCH + jle .L10 + + pshufd $0x50, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(BO) + pshufd $0xfa, %xmm1, %xmm1 + movaps %xmm1, -28 * SIZE(BO) + + pshufd $0x50, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(BO) + pshufd $0xfa, %xmm3, %xmm3 + movaps %xmm3, -20 * SIZE(BO) + + addq $ 8 * SIZE, B + subq $-16 * SIZE, BO + ALIGN_4 + +.L10: + movq C, CO1 + leaq (C, LDC, 4), CO2 + movq A, AO + + leaq (RPREFETCHSIZE + 0) * SIZE(B), BB + + movq M, I + sarq $2, I + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + salq $BASE_SHIFT + 1, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + + leaq (LDC, LDC, 2), %rax + + movaps -32 * SIZE(AO), %xmm0 + movaps -32 * SIZE(BO), %xmm1 + + pxor %xmm8, %xmm8 + PREFETCHW 3 * SIZE(CO1) + pxor %xmm9, %xmm9 + PREFETCHW 5 * SIZE(CO1, LDC, 1) + pxor %xmm10, %xmm10 + PREFETCHW 3 * SIZE(CO1, LDC, 2) + pxor %xmm11, %xmm11 + PREFETCHW 5 * SIZE(CO1, %rax) + + pxor %xmm12, %xmm12 + PREFETCHW 3 * SIZE(CO2) + pxor %xmm13, %xmm13 + PREFETCHW 5 * SIZE(CO2, LDC, 1) + pxor %xmm14, %xmm14 + PREFETCHW 3 * SIZE(CO2, LDC, 2) + pxor %xmm15, %xmm15 + PREFETCHW 5 * SIZE(CO2, %rax) + + PREFETCH -32 * SIZE(BB) + addq $16 * SIZE, BB + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $8, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L16 + ALIGN_3 + +.L12: + PREFETCH (PREFETCHSIZE + 0)(AO) + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm3 + movaps %xmm0, %xmm2 + movaps -16 * SIZE(AO), %xmm0 + addps %xmm3, %xmm9 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm10 + movaps -24 * SIZE(BO), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm11 + + pshufd $0x4e, %xmm1, 
%xmm3 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm12 + movaps -20 * SIZE(BO), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm13 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm14 + movaps -16 * SIZE(BO), %xmm1 + mulps %xmm2, %xmm3 + movaps -28 * SIZE(AO), %xmm2 + addps %xmm3, %xmm15 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm9 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm10 + movaps -8 * SIZE(BO), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm11 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm12 + movaps -4 * SIZE(BO), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm13 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm14 + movaps 0 * SIZE(BO), %xmm1 + mulps %xmm2, %xmm3 + movaps -24 * SIZE(AO), %xmm2 + addps %xmm3, %xmm15 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm8 + movaps 4 * SIZE(BO), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm9 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm10 + movaps 8 * SIZE(BO), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm11 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm12 + movaps 12 * SIZE(BO), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm13 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm14 + movaps 16 * SIZE(BO), %xmm1 + mulps %xmm2, %xmm3 + movaps -20 * SIZE(AO), %xmm2 + addps %xmm3, %xmm15 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm8 + movaps 20 * SIZE(BO), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm9 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm10 + movaps 24 * SIZE(BO), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm11 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm12 + movaps 28 * SIZE(BO), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm13 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm14 + movaps 32 * SIZE(BO), %xmm1 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm15 + + subq $-16 * SIZE, AO + addq $ 64 * SIZE, BO + decq %rax + BRANCH + jg .L12 + +.L16: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + je .L18 + ALIGN_4 + +.L17: + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm9 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm10 + movaps -24 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm11 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm12 + movaps -20 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm13 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm14 + movaps -16 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm3 + movaps -28 * SIZE(AO), %xmm0 + addps %xmm3, %xmm15 + + addq $ 4 * SIZE, AO + subq $-16 * SIZE, BO + decq %rax + jg .L17 + ALIGN_4 + +.L18: + leaq (LDC, LDC, 2), %rax + + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm9 + mulps %xmm7, %xmm10 + mulps %xmm7, %xmm11 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1, LDC, 1), %xmm0 + movsd 0 * SIZE(CO1, LDC, 1), %xmm1 + movhps 2 * SIZE(CO1), %xmm1 + + movsd 0 * SIZE(CO1, LDC, 2), %xmm2 + movhps 2 * SIZE(CO1, %rax), %xmm2 + movsd 0 * SIZE(CO1, %rax), %xmm3 + movhps 2 * SIZE(CO1, LDC, 2), %xmm3 + + addps %xmm0, %xmm8 + addps %xmm1, %xmm9 + addps %xmm2, %xmm10 + addps %xmm3, 
%xmm11 +#endif + + mulps %xmm7, %xmm12 + mulps %xmm7, %xmm13 + mulps %xmm7, %xmm14 + mulps %xmm7, %xmm15 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO2), %xmm4 + movhps 2 * SIZE(CO2, LDC, 1), %xmm4 + movsd 0 * SIZE(CO2, LDC, 1), %xmm5 + movhps 2 * SIZE(CO2), %xmm5 + + movsd 0 * SIZE(CO2, LDC, 2), %xmm6 + movhps 2 * SIZE(CO2, %rax), %xmm6 + movsd 0 * SIZE(CO2, %rax), %xmm7 + movhps 2 * SIZE(CO2, LDC, 2), %xmm7 + + addps %xmm4, %xmm12 + addps %xmm5, %xmm13 + addps %xmm6, %xmm14 + addps %xmm7, %xmm15 +#endif + + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1, LDC, 1) + movlps %xmm9, 0 * SIZE(CO1, LDC, 1) + movhps %xmm9, 2 * SIZE(CO1) + + movlps %xmm10, 0 * SIZE(CO1, LDC, 2) + movhps %xmm10, 2 * SIZE(CO1, %rax) + movlps %xmm11, 0 * SIZE(CO1, %rax) + movhps %xmm11, 2 * SIZE(CO1, LDC, 2) + + movlps %xmm12, 0 * SIZE(CO2) + movhps %xmm12, 2 * SIZE(CO2, LDC, 1) + movlps %xmm13, 0 * SIZE(CO2, LDC, 1) + movhps %xmm13, 2 * SIZE(CO2) + + movlps %xmm14, 0 * SIZE(CO2, LDC, 2) + movhps %xmm14, 2 * SIZE(CO2, %rax) + movlps %xmm15, 0 * SIZE(CO2, %rax) + movhps %xmm15, 2 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $BASE_SHIFT + 1, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + decq I + jg .L11 + ALIGN_4 + +.L20: + testq $2, M + je .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + salq $BASE_SHIFT + 1, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + + movddup -32 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $8, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L26 + ALIGN_3 + +.L22: + PREFETCH (PREFETCHSIZE + 0)(AO) + + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm9 + movaps -24 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + mulps -20 * SIZE(BO), %xmm0 + addps %xmm1, %xmm10 + movaps -16 * SIZE(BO), %xmm1 + addps %xmm0, %xmm11 + movddup -30 * SIZE(AO), %xmm0 + + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm9 + movaps -8 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + mulps -4 * SIZE(BO), %xmm0 + addps %xmm1, %xmm10 + movaps 0 * SIZE(BO), %xmm1 + addps %xmm0, %xmm11 + movddup -28 * SIZE(AO), %xmm0 + + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps 4 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm9 + movaps 8 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + mulps 12 * SIZE(BO), %xmm0 + addps %xmm1, %xmm10 + movaps 16 * SIZE(BO), %xmm1 + addps %xmm0, %xmm11 + movddup -26 * SIZE(AO), %xmm0 + + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps 20 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm9 + movaps 24 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + mulps 28 * SIZE(BO), %xmm0 + addps %xmm1, %xmm10 + movaps 32 * SIZE(BO), %xmm1 + addps %xmm0, %xmm11 + movddup 
-24 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + BRANCH + jg .L22 + +.L26: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + je .L28 + ALIGN_4 + +.L27: + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm9 + movaps -24 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + mulps -20 * SIZE(BO), %xmm0 + addps %xmm1, %xmm10 + movaps -16 * SIZE(BO), %xmm1 + addps %xmm0, %xmm11 + movddup -30 * SIZE(AO), %xmm0 + + addq $ 2 * SIZE, AO + subq $-16 * SIZE, BO + decq %rax + jg .L27 + ALIGN_4 + +.L28: + leaq (LDC, LDC, 2), %rax + + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm9 + mulps %xmm7, %xmm10 + mulps %xmm7, %xmm11 + +#ifndef TRMMKERNEL + movsd (CO1), %xmm0 + movhps (CO1, LDC, 1), %xmm0 + movsd (CO1, LDC, 2), %xmm1 + movhps (CO1, %rax), %xmm1 + + movsd (CO2), %xmm2 + movhps (CO2, LDC, 1), %xmm2 + movsd (CO2, LDC, 2), %xmm3 + movhps (CO2, %rax), %xmm3 + + addps %xmm0, %xmm8 + addps %xmm1, %xmm9 + addps %xmm2, %xmm10 + addps %xmm3, %xmm11 +#endif + + movlps %xmm8, (CO1) + movhps %xmm8, (CO1, LDC, 1) + + movlps %xmm9, (CO1, LDC, 2) + movhps %xmm9, (CO1, %rax) + + movlps %xmm10, (CO2) + movhps %xmm10, (CO2, LDC, 1) + + movlps %xmm11, (CO2, LDC, 2) + movhps %xmm11, (CO2, %rax) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $BASE_SHIFT + 1, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 + ALIGN_4 + +.L30: + testq $1, M + je .L39 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + addq %rax, %rax + leaq (BO, %rax, 8), BO +#endif + + movss -32 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $8, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L36 + ALIGN_3 + +.L32: + PREFETCH (PREFETCHSIZE + 0)(AO) + + shufps $0, %xmm0, %xmm0 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm9 + movaps -24 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + mulps -20 * SIZE(BO), %xmm0 + addps %xmm1, %xmm10 + movaps -16 * SIZE(BO), %xmm1 + addps %xmm0, %xmm11 + movss -31 * SIZE(AO), %xmm0 + + shufps $0, %xmm0, %xmm0 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm9 + movaps -8 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + mulps -4 * SIZE(BO), %xmm0 + addps %xmm1, %xmm10 + movaps 0 * SIZE(BO), %xmm1 + addps %xmm0, %xmm11 + movss -30 * SIZE(AO), %xmm0 + + shufps $0, %xmm0, %xmm0 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps 4 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm9 + movaps 8 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + mulps 12 * SIZE(BO), %xmm0 + addps %xmm1, %xmm10 + movaps 16 * SIZE(BO), %xmm1 + addps %xmm0, %xmm11 + movss -29 * 
SIZE(AO), %xmm0 + + shufps $0, %xmm0, %xmm0 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps 20 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm9 + movaps 24 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + mulps 28 * SIZE(BO), %xmm0 + addps %xmm1, %xmm10 + movaps 32 * SIZE(BO), %xmm1 + addps %xmm0, %xmm11 + movss -28 * SIZE(AO), %xmm0 + + subq $-4 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + BRANCH + jg .L32 + +.L36: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + je .L38 + ALIGN_4 + +.L37: + shufps $0, %xmm0, %xmm0 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm9 + movaps -24 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + mulps -20 * SIZE(BO), %xmm0 + addps %xmm1, %xmm10 + movaps -16 * SIZE(BO), %xmm1 + addps %xmm0, %xmm11 + movss -31 * SIZE(AO), %xmm0 + + addq $ 1 * SIZE, AO + subq $-16 * SIZE, BO + decq %rax + jg .L37 + ALIGN_4 + +.L38: + leaq (LDC, LDC, 2), %rax + + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm9 + mulps %xmm7, %xmm10 + mulps %xmm7, %xmm11 + + movhlps %xmm8, %xmm12 + movhlps %xmm9, %xmm13 + movhlps %xmm10, %xmm14 + movhlps %xmm11, %xmm15 + +#ifndef TRMMKERNEL + addss (CO1), %xmm8 + addss (CO1, LDC, 1), %xmm12 + addss (CO1, LDC, 2), %xmm9 + addss (CO1, %rax), %xmm13 + + addss (CO2), %xmm10 + addss (CO2, LDC, 1), %xmm14 + addss (CO2, LDC, 2), %xmm11 + addss (CO2, %rax), %xmm15 +#endif + + movss %xmm8, (CO1) + movss %xmm12, (CO1, LDC, 1) + + movss %xmm9, (CO1, LDC, 2) + movss %xmm13, (CO1, %rax) + + movss %xmm10, (CO2) + movss %xmm14, (CO2, LDC, 1) + + movss %xmm11, (CO2, LDC, 2) + movss %xmm15, (CO2, %rax) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + addq %rax, %rax + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $8, KK +#endif + leaq (C, LDC, 8), C + decq J + jg .L01 + ALIGN_4 + +.L40: + testq $4, N + jle .L80 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + leaq 32 * SIZE + BUFFER, BO + + movaps 0 * SIZE(B), %xmm1 + movaps 4 * SIZE(B), %xmm3 + movaps 8 * SIZE(B), %xmm5 + movaps 12 * SIZE(B), %xmm7 + + movq K, %rax + sarq $2, %rax + jle .L43 + ALIGN_4 + +.L42: + PREFETCH (RPREFETCHSIZE + 0) * SIZE(B) + + pshufd $0x50, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(BO) + pshufd $0xfa, %xmm1, %xmm1 + movaps %xmm1, -28 * SIZE(BO) + + movaps 16 * SIZE(B), %xmm1 + + pshufd $0x50, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(BO) + pshufd $0xfa, %xmm3, %xmm3 + movaps %xmm3, -20 * SIZE(BO) + + movaps 20 * SIZE(B), %xmm3 + + pshufd $0x50, %xmm5, %xmm4 + movaps %xmm4, -16 * SIZE(BO) + pshufd $0xfa, %xmm5, %xmm5 + movaps %xmm5, -12 * SIZE(BO) + + movaps 24 * SIZE(B), %xmm5 + + pshufd $0x50, %xmm7, %xmm6 + movaps %xmm6, -8 * SIZE(BO) + pshufd $0xfa, %xmm7, %xmm7 + movaps %xmm7, -4 * SIZE(BO) + + movaps 28 * SIZE(B), %xmm7 + + addq $16 * SIZE, B + addq $32 * SIZE, BO + + decq %rax + jne .L42 + ALIGN_4 + +.L43: + movq K, %rax + andq $3, %rax + BRANCH + jle .L50 + ALIGN_4 + +.L45: + pshufd $0x50, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(BO) + pshufd $0xfa, %xmm1, %xmm1 + movaps %xmm1, -28 * SIZE(BO) + movaps 4 * SIZE(B), %xmm1 + + addq $ 4 * SIZE, B + subq $-8 * SIZE, BO + decq %rax + jne .L45 + ALIGN_4 + +.L50: + movq C, CO1 + leaq (C, LDC, 2), CO2 + 
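+	# N&4 block: CO1 and CO2 each address a pair of output columns, LDC apart.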
movq A, AO + + movq M, I + sarq $2, I + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + salq $BASE_SHIFT + 1, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + movaps -32 * SIZE(BO), %xmm1 + + pxor %xmm8, %xmm8 + PREFETCHW 3 * SIZE(CO1) + pxor %xmm9, %xmm9 + PREFETCHW 5 * SIZE(CO1, LDC) + pxor %xmm10, %xmm10 + PREFETCHW 3 * SIZE(CO2) + pxor %xmm11, %xmm11 + PREFETCHW 5 * SIZE(CO2, LDC) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L56 + ALIGN_3 + +.L52: + PREFETCH (PREFETCHSIZE + 0)(AO) + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm9 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm10 + movaps -24 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm3 + movaps -28 * SIZE(AO), %xmm0 + addps %xmm3, %xmm11 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps -20 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm9 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm10 + movaps -16 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm3 + movaps -24 * SIZE(AO), %xmm0 + addps %xmm3, %xmm11 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm9 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm10 + movaps -8 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm3 + movaps -20 * SIZE(AO), %xmm0 + addps %xmm3, %xmm11 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps -4 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm9 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm10 + movaps 0 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm3 + movaps -16 * SIZE(AO), %xmm0 + addps %xmm3, %xmm11 + + subq $-16 * SIZE, AO + subq $-32 * SIZE, BO + decq %rax + BRANCH + jg .L52 + +.L56: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + je .L58 + ALIGN_4 + +.L57: + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm9 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm10 + movaps -24 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm3 + movaps -28 * SIZE(AO), %xmm0 + addps %xmm3, %xmm11 + + addq $ 4 * SIZE, AO + subq $-8 * SIZE, BO + decq %rax + jg .L57 + ALIGN_4 + +.L58: + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm9 + mulps %xmm7, %xmm10 + mulps %xmm7, %xmm11 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1, LDC), %xmm0 + movsd 0 * SIZE(CO1, LDC), %xmm1 + movhps 2 * SIZE(CO1), %xmm1 + + movsd 0 * SIZE(CO2), %xmm2 + movhps 2 * SIZE(CO2, LDC), %xmm2 + movsd 0 * SIZE(CO2, LDC), %xmm3 + movhps 2 * SIZE(CO2), %xmm3 + + addps %xmm0, %xmm8 + addps %xmm1, %xmm9 + addps %xmm2, %xmm10 + addps %xmm3, %xmm11 +#endif + + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1, LDC) + movlps %xmm9, 0 * SIZE(CO1, LDC) + movhps %xmm9, 2 * SIZE(CO1) + 
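+	# Columns 2-3 go through CO2; the movlps/movhps halves are crossed to undo the pshufd $0x4e pairing used while accumulating.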
+ movlps %xmm10, 0 * SIZE(CO2) + movhps %xmm10, 2 * SIZE(CO2, LDC) + movlps %xmm11, 0 * SIZE(CO2, LDC) + movhps %xmm11, 2 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $BASE_SHIFT + 1, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + decq I + jg .L51 + ALIGN_4 + +.L60: + testq $2, M + je .L70 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + salq $BASE_SHIFT + 1, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + movddup -32 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L66 + ALIGN_3 + +.L62: + PREFETCH (PREFETCHSIZE + 0)(AO) + + mulps %xmm0, %xmm1 + mulps -28 * SIZE(BO), %xmm0 + addps %xmm1, %xmm8 + movaps -24 * SIZE(BO), %xmm1 + addps %xmm0, %xmm9 + movddup -30 * SIZE(AO), %xmm0 + + mulps %xmm0, %xmm1 + mulps -20 * SIZE(BO), %xmm0 + addps %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + addps %xmm0, %xmm9 + movddup -28 * SIZE(AO), %xmm0 + + mulps %xmm0, %xmm1 + mulps -12 * SIZE(BO), %xmm0 + addps %xmm1, %xmm8 + movaps -8 * SIZE(BO), %xmm1 + addps %xmm0, %xmm9 + movddup -26 * SIZE(AO), %xmm0 + + mulps %xmm0, %xmm1 + mulps -4 * SIZE(BO), %xmm0 + addps %xmm1, %xmm8 + movaps 0 * SIZE(BO), %xmm1 + addps %xmm0, %xmm9 + movddup -24 * SIZE(AO), %xmm0 + + subq $ -8 * SIZE, AO + subq $-32 * SIZE, BO + decq %rax + BRANCH + jg .L62 + +.L66: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + je .L68 + ALIGN_4 + +.L67: + mulps %xmm0, %xmm1 + mulps -28 * SIZE(BO), %xmm0 + addps %xmm1, %xmm8 + movaps -24 * SIZE(BO), %xmm1 + addps %xmm0, %xmm9 + movddup -30 * SIZE(AO), %xmm0 + + addq $ 2 * SIZE, AO + subq $-8 * SIZE, BO + decq %rax + jg .L67 + ALIGN_4 + +.L68: + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm9 + +#ifndef TRMMKERNEL + movsd (CO1), %xmm0 + movhps (CO1, LDC), %xmm0 + + movsd (CO2), %xmm1 + movhps (CO2, LDC), %xmm1 + + addps %xmm0, %xmm8 + addps %xmm1, %xmm9 +#endif + + movlps %xmm8, (CO1) + movhps %xmm8, (CO1, LDC) + + movlps %xmm9, (CO2) + movhps %xmm9, (CO2, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $BASE_SHIFT + 1, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 + ALIGN_4 + +.L70: + testq $1, M + je .L79 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + + 
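+	# M&1 tail: one A element is broadcast (shufps $0) against the packed B pairs each iteration; movhlps splits the four column results before the store.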
movss -32 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + pxor %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L76 + ALIGN_3 + +.L72: + PREFETCH (PREFETCHSIZE + 0)(AO) + + shufps $0, %xmm0, %xmm0 + mulps %xmm0, %xmm1 + mulps -28 * SIZE(BO), %xmm0 + addps %xmm1, %xmm8 + movaps -24 * SIZE(BO), %xmm1 + addps %xmm0, %xmm9 + movss -31 * SIZE(AO), %xmm0 + + shufps $0, %xmm0, %xmm0 + mulps %xmm0, %xmm1 + mulps -20 * SIZE(BO), %xmm0 + addps %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + addps %xmm0, %xmm9 + movss -30 * SIZE(AO), %xmm0 + + shufps $0, %xmm0, %xmm0 + mulps %xmm0, %xmm1 + mulps -12 * SIZE(BO), %xmm0 + addps %xmm1, %xmm8 + movaps -8 * SIZE(BO), %xmm1 + addps %xmm0, %xmm9 + movss -29 * SIZE(AO), %xmm0 + + shufps $0, %xmm0, %xmm0 + mulps %xmm0, %xmm1 + mulps -4 * SIZE(BO), %xmm0 + addps %xmm1, %xmm8 + movaps 0 * SIZE(BO), %xmm1 + addps %xmm0, %xmm9 + movss -28 * SIZE(AO), %xmm0 + + subq $ -4 * SIZE, AO + subq $-32 * SIZE, BO + decq %rax + BRANCH + jg .L72 + +.L76: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + je .L78 + ALIGN_4 + +.L77: + shufps $0, %xmm0, %xmm0 + mulps %xmm0, %xmm1 + mulps -28 * SIZE(BO), %xmm0 + addps %xmm1, %xmm8 + movaps -24 * SIZE(BO), %xmm1 + addps %xmm0, %xmm9 + movss -31 * SIZE(AO), %xmm0 + + addq $ 1 * SIZE, AO + subq $-8 * SIZE, BO + decq %rax + jg .L77 + ALIGN_4 + +.L78: + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm9 + + movhlps %xmm8, %xmm10 + movhlps %xmm9, %xmm11 + +#ifndef TRMMKERNEL + addss (CO1), %xmm8 + addss (CO1, LDC), %xmm10 + addss (CO2), %xmm9 + addss (CO2, LDC), %xmm11 +#endif + + movss %xmm8, (CO1) + movss %xmm10, (CO1, LDC) + + movss %xmm9, (CO2) + movss %xmm11, (CO2, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L79: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + leaq (C, LDC, 4), C + ALIGN_4 + +.L80: + testq $2, N + jle .L120 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + leaq 32 * SIZE + BUFFER, BO + + movaps 0 * SIZE(B), %xmm1 + movaps 4 * SIZE(B), %xmm3 + + movq K, %rax + sarq $2, %rax + jle .L83 + ALIGN_4 + +.L82: + pshufd $0x50, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(BO) + pshufd $0xfa, %xmm1, %xmm1 + movaps %xmm1, -28 * SIZE(BO) + + movaps 8 * SIZE(B), %xmm1 + + pshufd $0x50, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(BO) + pshufd $0xfa, %xmm3, %xmm3 + movaps %xmm3, -20 * SIZE(BO) + + movaps 12 * SIZE(B), %xmm3 + + addq $ 8 * SIZE, B + subq $-16 * SIZE, BO + + decq %rax + jne .L82 + ALIGN_4 + +.L83: + movq K, %rax + andq $3, %rax + BRANCH + jle .L90 + ALIGN_4 + +.L85: + pshufd $0x50, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(BO) + movsd 2 * SIZE(B), %xmm1 + + addq $ 2 * SIZE, B + subq $-4 * SIZE, BO + decq %rax + jne .L85 + ALIGN_4 + +.L90: + movq C, CO1 + leaq (C, LDC), CO2 + movq A, AO + + movq M, I + sarq $2, I + jle .L100 + ALIGN_4 + +.L91: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + 
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + movaps -32 * SIZE(BO), %xmm1 + + pxor %xmm8, %xmm8 + PREFETCHW 3 * SIZE(CO1) + pxor %xmm9, %xmm9 + PREFETCHW 3 * SIZE(CO2) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L96 + ALIGN_3 + +.L92: + PREFETCH (PREFETCHSIZE + 0)(AO) + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm9 + movaps -28 * SIZE(AO), %xmm0 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps -24 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm9 + movaps -24 * SIZE(AO), %xmm0 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps -20 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm9 + movaps -20 * SIZE(AO), %xmm0 + + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm9 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, AO + subq $-16 * SIZE, BO + decq %rax + BRANCH + jg .L92 + +.L96: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + je .L98 + ALIGN_4 + +.L97: + pshufd $0x4e, %xmm1, %xmm3 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm9 + movaps -28 * SIZE(AO), %xmm0 + + addq $ 4 * SIZE, AO + subq $-4 * SIZE, BO + decq %rax + jg .L97 + ALIGN_4 + +.L98: + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm9 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO2), %xmm0 + movsd 0 * SIZE(CO2), %xmm1 + movhps 2 * SIZE(CO1), %xmm1 + + addps %xmm0, %xmm8 + addps %xmm1, %xmm9 +#endif + + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO2) + movlps %xmm9, 0 * SIZE(CO2) + movhps %xmm9, 2 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + decq I + jg .L91 + ALIGN_4 + +.L100: + testq $2, M + je .L110 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + movddup -32 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L106 + ALIGN_3 + +.L102: + PREFETCH (PREFETCHSIZE + 0)(AO) + + mulps %xmm0, %xmm1 + movddup -30 * SIZE(AO), 
%xmm0 + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + + mulps %xmm0, %xmm1 + movddup -28 * SIZE(AO), %xmm0 + addps %xmm1, %xmm8 + movaps -24 * SIZE(BO), %xmm1 + + mulps %xmm0, %xmm1 + movddup -26 * SIZE(AO), %xmm0 + addps %xmm1, %xmm8 + movaps -20 * SIZE(BO), %xmm1 + + mulps %xmm0, %xmm1 + movddup -24 * SIZE(AO), %xmm0 + addps %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + + subq $ -8 * SIZE, AO + subq $-16 * SIZE, BO + decq %rax + BRANCH + jg .L102 + +.L106: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + je .L108 + ALIGN_4 + +.L107: + mulps %xmm0, %xmm1 + movddup -30 * SIZE(AO), %xmm0 + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + + addq $ 2 * SIZE, AO + subq $-4 * SIZE, BO + decq %rax + jg .L107 + ALIGN_4 + +.L108: + mulps %xmm7, %xmm8 + +#ifndef TRMMKERNEL + movsd (CO1), %xmm0 + movhps (CO2), %xmm0 + + addps %xmm0, %xmm8 +#endif + + movlps %xmm8, (CO1) + movhps %xmm8, (CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 + ALIGN_4 + +.L110: + testq $1, M + je .L119 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + movss -32 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L116 + ALIGN_3 + +.L112: + PREFETCH (PREFETCHSIZE + 0)(AO) + + shufps $0, %xmm0, %xmm0 + mulps %xmm0, %xmm1 + movss -31 * SIZE(AO), %xmm0 + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + + shufps $0, %xmm0, %xmm0 + mulps %xmm0, %xmm1 + movss -30 * SIZE(AO), %xmm0 + addps %xmm1, %xmm8 + movaps -24 * SIZE(BO), %xmm1 + + shufps $0, %xmm0, %xmm0 + mulps %xmm0, %xmm1 + movss -29 * SIZE(AO), %xmm0 + addps %xmm1, %xmm8 + movaps -20 * SIZE(BO), %xmm1 + + shufps $0, %xmm0, %xmm0 + mulps %xmm0, %xmm1 + movss -28 * SIZE(AO), %xmm0 + addps %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + + subq $ -4 * SIZE, AO + subq $-16 * SIZE, BO + decq %rax + BRANCH + jg .L112 + +.L116: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + je .L118 + ALIGN_4 + +.L117: + shufps $0, %xmm0, %xmm0 + mulps %xmm0, %xmm1 + movss -31 * SIZE(AO), %xmm0 + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + + addq $ 1 * SIZE, AO + subq $-4 * SIZE, BO + decq %rax + jg .L117 + ALIGN_4 + +.L118: + mulps %xmm7, %xmm8 + movhlps %xmm8, %xmm9 + +#ifndef TRMMKERNEL + addss (CO1), %xmm8 + addss (CO2), %xmm9 +#endif + + movss %xmm8, (CO1) + movss %xmm9, (CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && 
defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L119: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + leaq (C, LDC, 2), C + ALIGN_4 + +.L120: + testq $1, N + jle .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + leaq 32 * SIZE + BUFFER, BO + + movsd 0 * SIZE(B), %xmm1 + movhps 2 * SIZE(B), %xmm1 + + movq K, %rax + sarq $2, %rax + jle .L123 + ALIGN_4 + +.L122: + pshufd $0x50, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(BO) + pshufd $0xfa, %xmm1, %xmm1 + movaps %xmm1, -28 * SIZE(BO) + + movsd 4 * SIZE(B), %xmm1 + movhps 6 * SIZE(B), %xmm1 + + addq $ 4 * SIZE, B + subq $-8 * SIZE, BO + + decq %rax + jne .L122 + ALIGN_4 + +.L123: + movq K, %rax + andq $3, %rax + BRANCH + jle .L130 + ALIGN_4 + +.L125: + pshufd $0x50, %xmm1, %xmm0 + movlps %xmm0, -32 * SIZE(BO) + movss 1 * SIZE(B), %xmm1 + + addq $ 1 * SIZE, B + subq $-2 * SIZE, BO + decq %rax + jne .L125 + ALIGN_4 + +.L130: + movq C, CO1 + movq A, AO + + movq M, I + sarq $2, I + jle .L140 + ALIGN_4 + +.L131: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + movddup -32 * SIZE(BO), %xmm1 + + pxor %xmm8, %xmm8 + PREFETCHW 3 * SIZE(CO1) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L136 + ALIGN_3 + +.L132: + PREFETCH (PREFETCHSIZE + 0)(AO) + + mulps %xmm0, %xmm1 + movaps -28 * SIZE(AO), %xmm0 + addps %xmm1, %xmm8 + movddup -30 * SIZE(BO), %xmm1 + + mulps %xmm0, %xmm1 + movaps -24 * SIZE(AO), %xmm0 + addps %xmm1, %xmm8 + movddup -28 * SIZE(BO), %xmm1 + + mulps %xmm0, %xmm1 + movaps -20 * SIZE(AO), %xmm0 + addps %xmm1, %xmm8 + movddup -26 * SIZE(BO), %xmm1 + + mulps %xmm0, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + addps %xmm1, %xmm8 + movddup -24 * SIZE(BO), %xmm1 + + subq $-16 * SIZE, AO + subq $ -8 * SIZE, BO + decq %rax + BRANCH + jg .L132 + +.L136: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + je .L138 + ALIGN_4 + +.L137: + mulps %xmm0, %xmm1 + movaps -28 * SIZE(AO), %xmm0 + addps %xmm1, %xmm8 + movddup -30 * SIZE(BO), %xmm1 + + addq $ 4 * SIZE, AO + subq $-2 * SIZE, BO + decq %rax + jg .L137 + ALIGN_4 + +.L138: + mulps %xmm7, %xmm8 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + + addps %xmm0, %xmm8 +#endif + + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 + decq I + jg .L131 + ALIGN_4 + +.L140: + testq $2, M + je .L150 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + salq $BASE_SHIFT, 
%rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + + movddup -32 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L146 + ALIGN_3 + +.L142: + PREFETCH (PREFETCHSIZE + 0)(AO) + + mulps %xmm0, %xmm1 + movddup -30 * SIZE(AO), %xmm0 + addps %xmm1, %xmm8 + movsd -30 * SIZE(BO), %xmm1 + + mulps %xmm0, %xmm1 + movddup -28 * SIZE(AO), %xmm0 + addps %xmm1, %xmm8 + movsd -28 * SIZE(BO), %xmm1 + + mulps %xmm0, %xmm1 + movddup -26 * SIZE(AO), %xmm0 + addps %xmm1, %xmm8 + movsd -26 * SIZE(BO), %xmm1 + + mulps %xmm0, %xmm1 + movddup -24 * SIZE(AO), %xmm0 + addps %xmm1, %xmm8 + movsd -24 * SIZE(BO), %xmm1 + + subq $-8 * SIZE, AO + subq $-8 * SIZE, BO + decq %rax + BRANCH + jg .L142 + +.L146: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + je .L148 + ALIGN_4 + +.L147: + mulps %xmm0, %xmm1 + movddup -30 * SIZE(AO), %xmm0 + addps %xmm1, %xmm8 + movsd -30 * SIZE(BO), %xmm1 + + addq $ 2 * SIZE, AO + subq $-2 * SIZE, BO + decq %rax + jg .L147 + ALIGN_4 + +.L148: + mulps %xmm7, %xmm8 + +#ifndef TRMMKERNEL + movsd (CO1), %xmm0 + + addps %xmm0, %xmm8 +#endif + + movlps %xmm8, (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 + ALIGN_4 + +.L150: + testq $1, M + je .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + + movss -32 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movss -32 * SIZE(BO), %xmm1 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L156 + ALIGN_3 + +.L152: + PREFETCH (PREFETCHSIZE + 0)(AO) + + mulss %xmm0, %xmm1 + movss -31 * SIZE(AO), %xmm0 + addss %xmm1, %xmm8 + movss -30 * SIZE(BO), %xmm1 + + mulss %xmm0, %xmm1 + movss -30 * SIZE(AO), %xmm0 + addss %xmm1, %xmm8 + movss -28 * SIZE(BO), %xmm1 + + mulss %xmm0, %xmm1 + movss -29 * SIZE(AO), %xmm0 + addss %xmm1, %xmm8 + movss -26 * SIZE(BO), %xmm1 + + mulss %xmm0, %xmm1 + movss -28 * SIZE(AO), %xmm0 + addss %xmm1, %xmm8 + movss -24 * SIZE(BO), %xmm1 + + subq $-4 * SIZE, AO + subq $-8 * SIZE, BO + decq %rax + BRANCH + jg .L152 + +.L156: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + je .L158 + ALIGN_4 + +.L157: + mulss %xmm0, %xmm1 + movss -31 * SIZE(AO), %xmm0 + addss %xmm1, %xmm8 + movss -30 * SIZE(BO), %xmm1 + + addq $1 * SIZE, AO + addq $2 * SIZE, BO + decq %rax + jg .L157 + ALIGN_4 + +.L158: + mulss %xmm7, %xmm8 + +#ifndef TRMMKERNEL + addss (CO1), %xmm8 +#endif + + movss 
%xmm8, (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L999: + movq %rbx, %rsp + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/gemm_kernel_4x8_nehalem.S b/kernel/x86_64/gemm_kernel_4x8_nehalem.S new file mode 100644 index 0000000000..5d02ac63d3 --- /dev/null +++ b/kernel/x86_64/gemm_kernel_4x8_nehalem.S @@ -0,0 +1,2397 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %rbp + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rdx +#define BB %r12 + +#define PREA %r10 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define ALPHA 48(%rsp) +#define J 56(%rsp) +#define OFFSET 64(%rsp) +#define KK 72(%rsp) +#define KKK 80(%rsp) + +#else + +#define STACKSIZE 512 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#define ALPHA 224(%rsp) +#define J 232(%rsp) +#define OFFSET 240(%rsp) +#define KK 248(%rsp) +#define KKK 256(%rsp) + +#endif + +#define PREFETCHSIZE 8 +#define PREFETCH prefetcht0 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movq OLD_OFFSET, %r11 +#endif + movaps %xmm3, %xmm0 + +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movq OLD_OFFSET, %r11 +#endif + +#endif + + unpcklps %xmm0, %xmm0 + movlps %xmm0, ALPHA + + subq $-32 * SIZE, A + subq $-32 * SIZE, B + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + salq $BASE_SHIFT, LDC + +#ifdef TRMMKERNEL + movq %r11, OFFSET +#ifndef LEFT + negq %r11 +#endif + movq %r11, KK +#endif + + movq N, J + sarq $3, J + NOBRANCH + jle .L40 + ALIGN_4 + +.L10: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + leaq (C, LDC, 4), CO2 + movq A, AO + + movq K, %rax + salq $BASE_SHIFT + 3, %rax + leaq (B, %rax), BB + + movq M, I + sarq $2, I + NOBRANCH + jle .L20 + ALIGN_4 + +.L11: + prefetcht2 -32 * SIZE(BB) + subq $-16 * SIZE, BB + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + + leaq (LDC, LDC, 2), %rax + + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + PADDING + xorps %xmm4, %xmm4 + + PADDING + xorps %xmm8, %xmm8 + prefetcht0 3 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht0 7 * SIZE(CO1, LDC, 1) + PADDING + xorps %xmm10, %xmm10 + prefetcht0 3 * SIZE(CO1, LDC, 2) + PADDING + xorps %xmm11, %xmm11 + prefetcht0 7 * SIZE(CO1, %rax, 1) + + movaps -32 * SIZE(AO), %xmm0 + + PADDING + xorps %xmm12, %xmm12 + prefetcht0 3 * SIZE(CO2) + xorps %xmm13, %xmm13 + prefetcht0 7 * SIZE(CO2, LDC, 1) + xorps %xmm14, %xmm14 + prefetcht0 3 * SIZE(CO2, LDC, 2) + xorps %xmm15, %xmm15 + prefetcht0 7 * SIZE(CO2, %rax, 1) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && 
!defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $8, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm1, %xmm12 + movaps -32 * SIZE(BO), %xmm1 + addps %xmm2, %xmm13 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + pshufd $0x39, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + movaps -28 * SIZE(AO), %xmm7 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + addps %xmm1, %xmm12 + movaps -24 * SIZE(BO), %xmm1 + addps %xmm2, %xmm13 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm7, %xmm1 + pshufd $0x39, %xmm2, %xmm5 + mulps %xmm7, %xmm2 + + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + pshufd $0x39, %xmm5, %xmm6 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm6 + + addps %xmm1, %xmm8 + movaps -20 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm7, %xmm1 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm7, %xmm2 + + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + mulps %xmm7, %xmm3 + mulps %xmm7, %xmm4 + + addps %xmm1, %xmm12 + movaps -16 * SIZE(BO), %xmm1 + addps %xmm2, %xmm13 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + pshufd $0x39, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + addps %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + movaps -20 * SIZE(AO), %xmm7 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + addps %xmm1, %xmm12 + movaps -8 * SIZE(BO), %xmm1 + addps %xmm2, %xmm13 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm7, %xmm1 + pshufd $0x39, %xmm2, %xmm5 + mulps %xmm7, %xmm2 + + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + pshufd $0x39, %xmm5, %xmm6 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm6 + + addps %xmm1, %xmm8 + movaps -4 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + subq $-32 * SIZE, BO + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm7, %xmm1 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm7, %xmm2 + + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm7, %xmm3 + movaps -16 * SIZE(AO), %xmm0 + mulps %xmm7, %xmm4 + + subq $-16 * SIZE, AO + decq %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + addps %xmm1, %xmm12 + movaps -32 * SIZE(BO), %xmm1 + addps %xmm2, %xmm13 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + pshufd $0x39, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + 
pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_3 + +.L18: + addps %xmm1, %xmm12 + addps %xmm2, %xmm13 + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + + movaps %xmm9, %xmm4 + shufps $0xd8, %xmm8, %xmm9 + shufps $0xd8, %xmm11, %xmm8 + shufps $0xd8, %xmm10, %xmm11 + shufps $0xd8, %xmm4, %xmm10 + + movaps %xmm8, %xmm4 + shufps $0xd8, %xmm10, %xmm8 + shufps $0xd8, %xmm4, %xmm10 + movaps %xmm9, %xmm5 + shufps $0xd8, %xmm11, %xmm9 + shufps $0xd8, %xmm5, %xmm11 + + movaps %xmm13, %xmm4 + shufps $0xd8, %xmm12, %xmm13 + shufps $0xd8, %xmm15, %xmm12 + shufps $0xd8, %xmm14, %xmm15 + shufps $0xd8, %xmm4, %xmm14 + + movaps %xmm12, %xmm4 + shufps $0xd8, %xmm14, %xmm12 + shufps $0xd8, %xmm4, %xmm14 + movaps %xmm13, %xmm5 + shufps $0xd8, %xmm15, %xmm13 + shufps $0xd8, %xmm5, %xmm15 + + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm9 + mulps %xmm7, %xmm10 + mulps %xmm7, %xmm11 + + mulps %xmm7, %xmm12 + mulps %xmm7, %xmm13 + mulps %xmm7, %xmm14 + mulps %xmm7, %xmm15 + + leaq (LDC, LDC, 2), %rax + +#ifndef TRMMKERNEL + movups (CO1), %xmm0 + movups (CO1, LDC, 1), %xmm1 + movups (CO1, LDC, 2), %xmm2 + movups (CO1, %rax, 1), %xmm3 + + movups (CO2), %xmm4 + movups (CO2, LDC, 1), %xmm5 + movups (CO2, LDC, 2), %xmm6 + movups (CO2, %rax, 1), %xmm7 + + addps %xmm0, %xmm8 + addps %xmm1, %xmm9 + addps %xmm2, %xmm10 + addps %xmm3, %xmm11 + addps %xmm4, %xmm12 + addps %xmm5, %xmm13 + addps %xmm6, %xmm14 + addps %xmm7, %xmm15 +#endif + + movups %xmm8, (CO1) + movups %xmm9, (CO1, LDC, 1) + movups %xmm10, (CO1, LDC, 2) + movups %xmm11, (CO1, %rax, 1) + + movups %xmm12, (CO2) + movups %xmm13, (CO2, LDC, 1) + movups %xmm14, (CO2, LDC, 2) + movups %xmm15, (CO2, %rax, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + decq I + BRANCH + jg .L11 + ALIGN_4 + +.L20: + testq $2, M + BRANCH + jle .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movaps -32 * SIZE(BO), %xmm5 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $8, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_3 + +.L22: + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + + addps %xmm3, %xmm10 + pshufd $0x50, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm5, %xmm4 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(BO), %xmm5 + + movddup -30 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps 
%xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(BO), %xmm5 + + addps %xmm3, %xmm10 + pshufd $0x50, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm5, %xmm4 + mulps %xmm0, %xmm4 + movaps -16 * SIZE(BO), %xmm5 + + movddup -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + mulps %xmm0, %xmm2 + movaps -12 * SIZE(BO), %xmm5 + + addps %xmm3, %xmm10 + pshufd $0x50, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm5, %xmm4 + mulps %xmm0, %xmm4 + movaps -8 * SIZE(BO), %xmm5 + + movddup -26 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + mulps %xmm0, %xmm2 + movaps -4 * SIZE(BO), %xmm5 + + addps %xmm3, %xmm10 + pshufd $0x50, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm5, %xmm4 + mulps %xmm0, %xmm4 + movaps 0 * SIZE(BO), %xmm5 + + movddup -24 * SIZE(AO), %xmm0 + + subq $-32 * SIZE, BO + subq $ -8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L22 + ALIGN_3 + +.L25: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_3 + +.L26: + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + + addps %xmm3, %xmm10 + pshufd $0x50, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm5, %xmm4 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(BO), %xmm5 + + movddup -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_3 + +.L28: + addps %xmm1, %xmm8 + addps %xmm2, %xmm9 + addps %xmm3, %xmm10 + addps %xmm4, %xmm11 + + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm9 + mulps %xmm7, %xmm10 + mulps %xmm7, %xmm11 + + leaq (LDC, LDC, 2), %rax + +#ifndef TRMMKERNEL + movsd (CO1), %xmm0 + movhps (CO1, LDC, 1), %xmm0 + movsd (CO1, LDC, 2), %xmm1 + movhps (CO1, %rax, 1), %xmm1 + + movsd (CO2), %xmm2 + movhps (CO2, LDC, 1), %xmm2 + movsd (CO2, LDC, 2), %xmm3 + movhps (CO2, %rax, 1), %xmm3 + + addps %xmm0, %xmm8 + addps %xmm1, %xmm9 + addps %xmm2, %xmm10 + addps %xmm3, %xmm11 +#endif + + movsd %xmm8, (CO1) + movhps %xmm8, (CO1, LDC, 1) + movsd %xmm9, (CO1, LDC, 2) + movhps %xmm9, (CO1, %rax, 1) + + movsd %xmm10, (CO2) + movhps %xmm10, (CO2, LDC, 1) + movsd %xmm11, (CO2, LDC, 2) + movhps %xmm11, (CO2, %rax, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 + ALIGN_4 + +.L30: + testq $1, M + BRANCH + jle .L39 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + + xorps %xmm2, %xmm2 + movsd -32 * SIZE(AO), %xmm0 + xorps %xmm3, %xmm3 + xorps %xmm8, %xmm8 + xorps %xmm12, %xmm12 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && 
defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $8, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L35 + ALIGN_3 + +.L32: + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movaps -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm3, %xmm12 + movaps -28 * SIZE(BO), %xmm3 + mulps %xmm1, %xmm3 + + pshufd $0x55, %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm2, %xmm8 + movaps -24 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm3, %xmm12 + movaps -20 * SIZE(BO), %xmm3 + mulps %xmm1, %xmm3 + + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movaps -16 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm3, %xmm12 + movaps -12 * SIZE(BO), %xmm3 + mulps %xmm1, %xmm3 + + pshufd $0x55, %xmm0, %xmm1 + movsd -28 * SIZE(AO), %xmm0 + addps %xmm2, %xmm8 + movaps -8 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm3, %xmm12 + movaps -4 * SIZE(BO), %xmm3 + mulps %xmm1, %xmm3 + + subq $-32 * SIZE, BO + subq $ -4 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L32 + ALIGN_3 + +.L35: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_3 + +.L36: + pshufd $0x00, %xmm0, %xmm1 + movss -31 * SIZE(AO), %xmm0 + addps %xmm2, %xmm8 + movaps -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm3, %xmm12 + movaps -28 * SIZE(BO), %xmm3 + mulps %xmm1, %xmm3 + + addq $1 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L36 + ALIGN_3 + +.L38: + addps %xmm2, %xmm8 + addps %xmm3, %xmm12 + + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm12 + + pshufd $0xff, %xmm8, %xmm11 + pshufd $0xaa, %xmm8, %xmm10 + pshufd $0x55, %xmm8, %xmm9 + pshufd $0x00, %xmm8, %xmm8 + + pshufd $0xff, %xmm12, %xmm15 + pshufd $0xaa, %xmm12, %xmm14 + pshufd $0x55, %xmm12, %xmm13 + pshufd $0x00, %xmm12, %xmm12 + + leaq (LDC, LDC, 2), %rax + +#ifndef TRMMKERNEL + addss (CO1), %xmm8 + addss (CO1, LDC, 1), %xmm9 + addss (CO1, LDC, 2), %xmm10 + addss (CO1, %rax, 1), %xmm11 + + addss (CO2), %xmm12 + addss (CO2, LDC, 1), %xmm13 + addss (CO2, LDC, 2), %xmm14 + addss (CO2, %rax, 1), %xmm15 +#endif + + movss %xmm8, (CO1) + movss %xmm9, (CO1, LDC, 1) + movss %xmm10, (CO1, LDC, 2) + movss %xmm11, (CO1, %rax, 1) + + movss %xmm12, (CO2) + movss %xmm13, (CO2, LDC, 1) + movss %xmm14, (CO2, LDC, 2) + movss %xmm15, (CO2, %rax, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $8, KK +#endif + + movq BO, B + + leaq (C, LDC, 8), C + + subq $1, J + BRANCH + jg .L10 + ALIGN_4 + +.L40: + testq $4, N + jle .L70 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + leaq (C, LDC, 2), CO2 + movq A, AO + + movq M, I + sarq $2, I + NOBRANCH + jle .L50 + ALIGN_4 + +.L41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 
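+	# Clear the accumulators (xmm8-xmm11) and prefetch the four destination columns of C.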
+ + xorps %xmm8, %xmm8 + prefetcht2 4 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht2 4 * SIZE(CO1, LDC, 1) + xorps %xmm10, %xmm10 + prefetcht2 4 * SIZE(CO2) + xorps %xmm11, %xmm11 + prefetcht2 4 * SIZE(CO2, LDC, 1) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L45 + ALIGN_3 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm1, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm10 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm4, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm10 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm4, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movaps -24 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm10 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm4, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -20 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movaps -20 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm10 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm4, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + BRANCH + jg .L42 + ALIGN_3 + +.L45: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + addps %xmm1, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm10 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm4, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L46 + ALIGN_3 + +.L48: + addps %xmm1, %xmm8 + addps %xmm2, %xmm9 + addps %xmm3, %xmm10 + addps %xmm4, %xmm11 + + movaps %xmm9, %xmm4 + shufps $0xd8, %xmm8, %xmm9 + shufps $0xd8, %xmm11, %xmm8 + shufps $0xd8, %xmm10, %xmm11 + shufps $0xd8, %xmm4, %xmm10 + + movaps %xmm8, %xmm4 + shufps $0xd8, %xmm10, %xmm8 + shufps $0xd8, %xmm4, %xmm10 + movaps %xmm9, %xmm5 + shufps $0xd8, %xmm11, %xmm9 + shufps $0xd8, %xmm5, %xmm11 + + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm9 + mulps %xmm7, %xmm10 + mulps %xmm7, %xmm11 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO1, LDC, 1), %xmm1 + movhps 2 * SIZE(CO1, LDC, 1), %xmm1 + + movsd 0 * SIZE(CO2), %xmm2 + movhps 2 * SIZE(CO2), %xmm2 + movsd 0 * SIZE(CO2, LDC, 1), %xmm3 + movhps 2 * SIZE(CO2, LDC, 1), %xmm3 + + addps %xmm0, %xmm8 + addps %xmm1, %xmm9 + addps %xmm2, %xmm10 + addps %xmm3, %xmm11 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movsd %xmm9, 0 * SIZE(CO1, LDC, 1) + movhps %xmm9, 2 * 
SIZE(CO1, LDC, 1) + + movsd %xmm10, 0 * SIZE(CO2) + movhps %xmm10, 2 * SIZE(CO2) + movsd %xmm11, 0 * SIZE(CO2, LDC, 1) + movhps %xmm11, 2 * SIZE(CO2, LDC, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + decq I + BRANCH + jg .L41 + ALIGN_4 + +.L50: + testq $2, M + BRANCH + jle .L60 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movaps -32 * SIZE(BO), %xmm5 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L55 + ALIGN_3 + +.L52: + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -30 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + movaps -24 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + movaps -20 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -26 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + movaps -16 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -24 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, BO + subq $ -8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L52 + ALIGN_3 + +.L55: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_3 + +.L56: + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L56 + ALIGN_3 + +.L58: + addps %xmm1, %xmm8 + addps %xmm2, %xmm9 + + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm9 + +#ifndef TRMMKERNEL + movsd (CO1), %xmm0 + movhps (CO1, LDC, 1), %xmm0 + movsd (CO2), %xmm1 + movhps (CO2, LDC, 1), %xmm1 + + addps %xmm0, %xmm8 + addps %xmm1, %xmm9 +#endif + + movsd %xmm8, (CO1) + movhps %xmm8, (CO1, LDC, 1) + movsd %xmm9, (CO2) + movhps %xmm9, (CO2, LDC, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 + ALIGN_4 + +.L60: + testq $1, M + BRANCH + jle .L69 
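+/* m & 1 remainder: multiply the last single row of A by the current 4-column panel of B */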
+ +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + xorps %xmm2, %xmm2 + movsd -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L65 + ALIGN_3 + +.L62: + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movaps -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x55, %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm2, %xmm9 + movaps -28 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movaps -24 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x55, %xmm0, %xmm1 + movsd -28 * SIZE(AO), %xmm0 + addps %xmm2, %xmm9 + movaps -20 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + subq $-16 * SIZE, BO + subq $ -4 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L62 + addps %xmm9, %xmm8 + ALIGN_3 + +.L65: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_3 + +.L66: + pshufd $0x00, %xmm0, %xmm1 + movss -31 * SIZE(AO), %xmm0 + addps %xmm2, %xmm8 + movaps -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L66 + ALIGN_3 + +.L68: + addps %xmm2, %xmm8 + mulps %xmm7, %xmm8 + + pshufd $0xff, %xmm8, %xmm11 + pshufd $0xaa, %xmm8, %xmm10 + pshufd $0x55, %xmm8, %xmm9 + pshufd $0x00, %xmm8, %xmm8 + +#ifndef TRMMKERNEL + addss (CO1), %xmm8 + addss (CO1, LDC, 1), %xmm9 + addss (CO2), %xmm10 + addss (CO2, LDC, 1), %xmm11 +#endif + + movss %xmm8, (CO1) + movss %xmm9, (CO1, LDC, 1) + movss %xmm10, (CO2) + movss %xmm11, (CO2, LDC, 1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + ALIGN_4 + +.L69: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK +#endif + + movq BO, B + + leaq (C, LDC, 4), C + ALIGN_4 + +.L70: + testq $2, N + jle .L100 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + leaq (C, LDC), CO2 + movq A, AO + + movq M, I + sarq $2, I + NOBRANCH + jle .L80 + ALIGN_4 + +.L71: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd -32 * SIZE(BO), %xmm3 + + xorps %xmm8, %xmm8 + prefetcht2 4 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht2 4 * SIZE(CO2) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif 
+ movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L75 + ALIGN_3 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0x55, %xmm3, %xmm2 + movsd -30 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0x55, %xmm3, %xmm2 + movsd -28 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm2 + movaps -24 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0x55, %xmm3, %xmm2 + movsd -26 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0x55, %xmm3, %xmm2 + movsd -24 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm2 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, AO + subq $ -8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L72 + ALIGN_3 + +.L75: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_3 + +.L76: + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0x55, %xmm3, %xmm2 + movsd -30 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L76 + ALIGN_3 + +.L78: + addps %xmm1, %xmm8 + addps %xmm2, %xmm9 + + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm9 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm1 + movhps 2 * SIZE(CO2), %xmm1 + + addps %xmm0, %xmm8 + addps %xmm1, %xmm9 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movsd %xmm9, 0 * SIZE(CO2) + movhps %xmm9, 2 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + decq I + BRANCH + jg .L71 + ALIGN_4 + +.L80: + testq $2, M + BRANCH + jle .L90 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd -32 * SIZE(BO), %xmm5 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L85 + ALIGN_3 + +.L82: + addps %xmm1, %xmm8 + movsd -32 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movddup -30 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movsd -30 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movddup -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movsd -28 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movddup -26 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movsd -26 * 
SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movddup -24 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, BO + subq $-8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L82 + ALIGN_3 + +.L85: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L88 + ALIGN_3 + +.L86: + addps %xmm1, %xmm8 + movsd -32 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movddup -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L86 + ALIGN_3 + +.L88: + addps %xmm1, %xmm8 + mulps %xmm7, %xmm8 + +#ifndef TRMMKERNEL + movsd (CO1), %xmm0 + movhps (CO2), %xmm0 + + addps %xmm0, %xmm8 +#endif + + movsd %xmm8, (CO1) + movhps %xmm8, (CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 + ALIGN_4 + +.L90: + testq $1, M + BRANCH + jle .L99 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + + xorps %xmm2, %xmm2 + movsd -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L95 + ALIGN_3 + +.L92: + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movsd -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x55, %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm2, %xmm9 + movsd -30 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movsd -28 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x55, %xmm0, %xmm1 + movsd -28 * SIZE(AO), %xmm0 + addps %xmm2, %xmm9 + movsd -26 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + subq $-4 * SIZE, AO + subq $-8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L92 + addps %xmm9, %xmm8 + ALIGN_3 + +.L95: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L98 + ALIGN_3 + +.L96: + pshufd $0x00, %xmm0, %xmm1 + movss -31 * SIZE(AO), %xmm0 + addps %xmm2, %xmm8 + movsd -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + addq $1 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L96 + ALIGN_3 + +.L98: + addps %xmm2, %xmm8 + mulps %xmm7, %xmm8 + + pshufd $0x55, %xmm8, %xmm9 + pshufd $0x00, %xmm8, %xmm8 + +#ifndef TRMMKERNEL + addss (CO1), %xmm8 + addss (CO2), %xmm9 +#endif + + movss %xmm8, (CO1) + movss %xmm9, (CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + ALIGN_4 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + movq BO, B + + leaq (C, LDC, 2), C + 
ALIGN_4 + +.L100: + testq $1, N + jle .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + movq A, AO + + movq M, I + sarq $2, I + NOBRANCH + jle .L110 + ALIGN_4 + +.L101: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + movsd -32 * SIZE(BO), %xmm3 + xorps %xmm8, %xmm8 + prefetcht2 4 * SIZE(CO1) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L105 + ALIGN_3 + +.L102: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + movss -31 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm1 + movaps -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + movss -30 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm1 + movaps -24 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + movss -29 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm1 + movaps -20 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + movss -28 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, AO + subq $ -4 * SIZE, BO + subq $1, %rax + BRANCH + jg .L102 + ALIGN_3 + +.L105: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L108 + ALIGN_3 + +.L106: + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + movss -31 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm1 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L106 + ALIGN_3 + +.L108: + addps %xmm1, %xmm8 + + mulps %xmm7, %xmm8 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + + addps %xmm0, %xmm8 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 + decq I + BRANCH + jg .L101 + ALIGN_4 + +.L110: + testq $2, M + BRANCH + jle .L120 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L115 + ALIGN_3 + +.L112: + addps %xmm1, %xmm8 + movss -32 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + + addps 
%xmm1, %xmm8 + movss -31 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movsd -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movss -30 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movsd -26 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movss -29 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movsd -24 * SIZE(AO), %xmm0 + + subq $-4 * SIZE, BO + subq $-8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L112 + ALIGN_3 + +.L115: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_3 + +.L116: + addps %xmm1, %xmm8 + movss -32 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L116 + ALIGN_3 + +.L118: + addps %xmm1, %xmm8 + mulps %xmm7, %xmm8 + +#ifndef TRMMKERNEL + movsd (CO1), %xmm0 + addps %xmm0, %xmm8 +#endif + + movsd %xmm8, (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 + ALIGN_4 + +.L120: + testq $1, M + BRANCH + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + + xorps %xmm2, %xmm2 + movss -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L125 + ALIGN_3 + +.L122: + addss %xmm2, %xmm8 + movss -32 * SIZE(BO), %xmm2 + mulss %xmm0, %xmm2 + movss -31 * SIZE(AO), %xmm0 + + addss %xmm2, %xmm8 + movss -31 * SIZE(BO), %xmm2 + mulss %xmm0, %xmm2 + movss -30 * SIZE(AO), %xmm0 + + addss %xmm2, %xmm8 + movss -30 * SIZE(BO), %xmm2 + mulss %xmm0, %xmm2 + movss -29 * SIZE(AO), %xmm0 + + addss %xmm2, %xmm8 + movss -29 * SIZE(BO), %xmm2 + mulss %xmm0, %xmm2 + movss -28 * SIZE(AO), %xmm0 + + subq $-4 * SIZE, AO + subq $-4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L122 + ALIGN_3 + +.L125: + movddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L128 + ALIGN_3 + +.L126: + addss %xmm2, %xmm8 + movss -32 * SIZE(BO), %xmm2 + mulss %xmm0, %xmm2 + movss -31 * SIZE(AO), %xmm0 + + addq $1 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L126 + ALIGN_3 + +.L128: + addps %xmm2, %xmm8 + + mulps %xmm7, %xmm8 + +#ifndef TRMMKERNEL + addss (CO1), %xmm8 +#endif + + movss %xmm8, (CO1) + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), 
%xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/gemm_kernel_8x4_barcelona.S b/kernel/x86_64/gemm_kernel_8x4_barcelona.S new file mode 100644 index 0000000000..b40c8bac7f --- /dev/null +++ b/kernel/x86_64/gemm_kernel_8x4_barcelona.S @@ -0,0 +1,3253 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define CO2 %r12 +#define BB %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define ALPHA 0(%rsp) +#define J 16(%rsp) +#define OFFSET 24(%rsp) +#define KK 32(%rsp) +#define KKK 40(%rsp) +#define BUFFER 128(%rsp) + +#define PREFETCH prefetch +#define PREFETCHSIZE (16 * 17 + 0) + +#define RPREFETCHSIZE (16 * 4 + 0) +#define WPREFETCHSIZE (16 * 9 + 0) + +#define KERNEL1(xx) \ + mulps %xmm1, %xmm0 ;\ + mulps -28 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm0, %xmm8 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO, %rax, 4) ;\ + movaps %xmm2, %xmm0 ;\ + addps %xmm1, %xmm12 ;\ + movaps -24 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -28 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm0, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps -20 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm1, %xmm0 ;\ + mulps -28 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm0, %xmm10 ;\ + movaps -24 * SIZE(AO, %rax, 4), %xmm0 ;\ + addps %xmm1, %xmm14 ;\ + movaps -16 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -28 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps -12 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm0, %xmm2 + +#define KERNEL2(xx) \ + mulps %xmm1, %xmm0 ;\ + mulps -20 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm0, %xmm8 ;\ + movaps %xmm2, %xmm0 ;\ + addps %xmm1, %xmm12 ;\ + movaps -8 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -20 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm0, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps -4 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm1, %xmm0 ;\ + mulps -20 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm0, %xmm10 ;\ + addps %xmm1, %xmm14 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -20 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 4 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm4, %xmm2 + +#define KERNEL3(xx) \ + mulps %xmm5, %xmm4 ;\ + mulps -12 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm4, %xmm8 ;\ + movaps 32 * SIZE(BO, %rax, 8), %xmm1 ;\ + movaps %xmm2, %xmm4 ;\ + addps %xmm5, %xmm12 ;\ + movaps 8 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -12 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm4, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps 12 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm5, %xmm4 ;\ + mulps -12 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm4, %xmm10 ;\ + movaps -8 * SIZE(AO, %rax, 4), %xmm4 ;\ + addps %xmm5, %xmm14 ;\ + movaps 16 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -12 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 20 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm4, %xmm2 + +#define KERNEL4(xx) \ + mulps %xmm5, %xmm4 ;\ + mulps -4 * SIZE(AO, %rax, 4), %xmm5 ;\ + movaps (AO, %rax, 4), %xmm6 ;\ + addps %xmm4, %xmm8 ;\ + movaps %xmm2, %xmm4 ;\ + addps %xmm5, %xmm12 ;\ + movaps 24 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -4 * SIZE(AO, %rax, 4), 
%xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm4, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps 28 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm5, %xmm4 ;\ + mulps -4 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm4, %xmm10 ;\ + addps %xmm5, %xmm14 ;\ + movaps 64 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -4 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 36 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm6, %xmm2 + +#define KERNEL5(xx) \ + mulps %xmm1, %xmm6 ;\ + mulps 4 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm6, %xmm8 ;\ + movaps %xmm2, %xmm6 ;\ + addps %xmm1, %xmm12 ;\ + movaps 40 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps 4 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps 16 * SIZE(AO, %rax, 4), %xmm7 ;\ + movaps %xmm6, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps 44 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm1, %xmm6 ;\ + mulps 4 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm6, %xmm10 ;\ + movaps 8 * SIZE(AO, %rax, 4), %xmm6 ;\ + addps %xmm1, %xmm14 ;\ + movaps 48 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps 4 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 52 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm6, %xmm2 + +#define KERNEL6(xx) \ + mulps %xmm1, %xmm6 ;\ + mulps 12 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm6, %xmm8 ;\ + movaps %xmm2, %xmm6 ;\ + addps %xmm1, %xmm12 ;\ + movaps 56 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps 12 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm6, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps 60 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm1, %xmm6 ;\ + mulps 12 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm6, %xmm10 ;\ + movaps 32 * SIZE(AO, %rax, 4), %xmm0 ;\ + addps %xmm1, %xmm14 ;\ + mulps %xmm3, %xmm2 ;\ + mulps 12 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 68 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm7, %xmm2 + +#define KERNEL7(xx) \ + mulps %xmm5, %xmm7 ;\ + mulps 20 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm7, %xmm8 ;\ + movaps 96 * SIZE(BO, %rax, 8), %xmm1 ;\ + movaps %xmm2, %xmm7 ;\ + addps %xmm5, %xmm12 ;\ + movaps 72 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps 20 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm7, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps 76 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm5, %xmm7 ;\ + mulps 20 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm7, %xmm10 ;\ + movaps 24 * SIZE(AO, %rax, 4), %xmm7 ;\ + addps %xmm5, %xmm14 ;\ + movaps 80 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps 20 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 84 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm7, %xmm2 + +#define KERNEL8(xx) \ + mulps %xmm5, %xmm7 ;\ + mulps 28 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm7, %xmm8 ;\ + movaps %xmm2, %xmm7 ;\ + addps %xmm5, %xmm12 ;\ + movaps 88 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps 28 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm7, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps 92 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm5, %xmm7 ;\ + mulps 28 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm7, %xmm10 ;\ + movaps 48 * SIZE(AO, %rax, 4), %xmm4 ;\ + addps %xmm5, %xmm14 ;\ + movaps 128 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps 28 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 100 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm0, 
%xmm2 ;\ + addq $16 * SIZE, %rax + +#define KERNEL_SUB1(xx) \ + mulps %xmm1, %xmm0 ;\ + mulps -28 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm0, %xmm8 ;\ + movaps %xmm2, %xmm0 ;\ + addps %xmm1, %xmm12 ;\ + movaps -24 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -28 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm0, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps -20 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm1, %xmm0 ;\ + mulps -28 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm0, %xmm10 ;\ + movaps -24 * SIZE(AO, %rax, 4), %xmm0 ;\ + addps %xmm1, %xmm14 ;\ + movaps -16 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -28 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps -12 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm0, %xmm2 + +#define KERNEL_SUB2(xx) \ + mulps %xmm1, %xmm0 ;\ + mulps -20 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm0, %xmm8 ;\ + movaps %xmm2, %xmm0 ;\ + addps %xmm1, %xmm12 ;\ + movaps -8 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -20 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm0, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps -4 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm1, %xmm0 ;\ + mulps -20 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm0, %xmm10 ;\ + movaps (AO, %rax, 4), %xmm0 ;\ + addps %xmm1, %xmm14 ;\ + movaps 32 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -20 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 4 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm4, %xmm2 + +#define KERNEL_SUB3(xx) \ + mulps %xmm5, %xmm4 ;\ + mulps -12 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm4, %xmm8 ;\ + movaps %xmm2, %xmm4 ;\ + addps %xmm5, %xmm12 ;\ + movaps 8 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -12 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm4, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps 12 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm5, %xmm4 ;\ + mulps -12 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm4, %xmm10 ;\ + movaps -8 * SIZE(AO, %rax, 4), %xmm4 ;\ + addps %xmm5, %xmm14 ;\ + movaps 16 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -12 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 20 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm4, %xmm2 + +#define KERNEL_SUB4(xx) \ + mulps %xmm5, %xmm4 ;\ + mulps -4 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm4, %xmm8 ;\ + movaps %xmm2, %xmm4 ;\ + addps %xmm5, %xmm12 ;\ + movaps 24 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -4 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm4, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps 28 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm5, %xmm4 ;\ + mulps -4 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm4, %xmm10 ;\ + addps %xmm5, %xmm14 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -4 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 36 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm0, %xmm2 + +#if defined(OS_LINUX) && defined(CORE_BARCELONA) && !defined(TRMMKERNEL) + .align 32768 +#endif + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 
160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + movaps %xmm3, %xmm0 + +#else + movq 72(%rsp), LDC +#ifdef TRMMKERNEL + movsd 80(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, %rbx # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movq OLD_M, M + movq OLD_N, N + + shufps $0, %xmm0, %xmm0 + movaps %xmm0, ALPHA + +#ifdef TRMMKERNEL + movsd %xmm12, OFFSET + movsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + subq $-32 * SIZE, A + + leaq (, LDC, SIZE), LDC + + movq N, J + sarq $2, J # j = (n >> 2) + jle .L50 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + + movq K, %rax + sarq $2, %rax + jle .L03 + ALIGN_4 + +.L02: + prefetch (RPREFETCHSIZE + 0) * SIZE(B) + + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + movaps 8 * SIZE(B), %xmm11 + movaps 12 * SIZE(B), %xmm15 + + prefetchw (WPREFETCHSIZE + 0) * SIZE(BO) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + prefetchw (WPREFETCHSIZE + 16) * SIZE(BO) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + prefetchw (WPREFETCHSIZE + 32) * SIZE(BO) + + pshufd $0x00, %xmm11, %xmm0 + pshufd $0x55, %xmm11, %xmm1 + pshufd $0xaa, %xmm11, %xmm2 + pshufd $0xff, %xmm11, %xmm3 + + prefetchw (WPREFETCHSIZE + 48) * SIZE(BO) + + pshufd $0x00, %xmm15, %xmm4 + pshufd $0x55, %xmm15, %xmm5 + pshufd $0xaa, %xmm15, %xmm6 + pshufd $0xff, %xmm15, %xmm7 + + movaps %xmm0, 32 * SIZE(BO) + movaps %xmm1, 36 * SIZE(BO) + movaps %xmm2, 40 * SIZE(BO) + movaps %xmm3, 44 * SIZE(BO) + movaps %xmm4, 48 * SIZE(BO) + movaps %xmm5, 52 * SIZE(BO) + movaps %xmm6, 56 * SIZE(BO) + movaps %xmm7, 60 * SIZE(BO) + + addq $16 * SIZE, B + addq $64 * SIZE, BO + + decq %rax + jne .L02 + ALIGN_4 + +.L03: + movq K, %rax + andq $3, %rax + BRANCH + jle .L10 + ALIGN_4 + +.L04: + movaps 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + + addq $ 4 * SIZE, B + addq $16 * SIZE, BO + decq %rax + jne .L04 + ALIGN_4 + +.L10: + movq C, CO1 + leaq (C, LDC, 1), CO2 + movq A, AO + + leaq (RPREFETCHSIZE + 0) * SIZE(B), BB + + movq M, I + sarq $3, I # i = (m >> 3) + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + movaps -32 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + movaps -28 * SIZE(BO), %xmm3 + xorps %xmm9, %xmm9 + movaps -16 * SIZE(AO), %xmm4 + xorps %xmm10, %xmm10 + movaps 0 * SIZE(BO), %xmm5 + xorps %xmm11, 
%xmm11 + + prefetch -20 * SIZE(BB) + + prefetchw 3 * SIZE(CO1) + xorps %xmm12, %xmm12 + prefetchw 7 * SIZE(CO2) + xorps %xmm13, %xmm13 + prefetchw 3 * SIZE(CO1, LDC, 2) + xorps %xmm14, %xmm14 + prefetchw 7 * SIZE(CO2, LDC, 2) + xorps %xmm15, %xmm15 + movaps %xmm0, %xmm2 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + andq $-8, %rax + + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO + negq %rax + NOBRANCH + je .L15 + ALIGN_3 + +.L12: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + NOBRANCH + je .L15 + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + NOBRANCH + je .L15 + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + NOBRANCH + je .L15 + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + NOBRANCH + je .L15 + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + NOBRANCH + je .L15 + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + NOBRANCH + je .L15 + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + NOBRANCH + je .L15 + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + BRANCH + jl .L12 + ALIGN_4 + +.L15: + prefetch 16 * SIZE(BB) + subq $-32 * SIZE, BB + + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + testq $4, %rax + je .L16 + xorq %rax, %rax + ALIGN_3 + + KERNEL_SUB1(32 * 0) + KERNEL_SUB2(32 * 0) + KERNEL_SUB3(32 * 0) + KERNEL_SUB4(32 * 0) + + addq $32 * SIZE, AO + addq $64 * SIZE, BO + ALIGN_3 + +.L16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L18 + + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO + negq %rax + ALIGN_4 + +.L17: + mulps %xmm1, %xmm0 + mulps -28 * SIZE(AO, %rax, 4), %xmm1 + addps %xmm0, %xmm8 + movaps %xmm2, %xmm0 + addps %xmm1, %xmm12 + movaps -24 * SIZE(BO, %rax, 8), %xmm1 + mulps %xmm3, %xmm2 + mulps -28 * SIZE(AO, %rax, 4), %xmm3 + addps %xmm2, %xmm9 + movaps %xmm0, %xmm2 + addps %xmm3, %xmm13 + movaps -20 * SIZE(BO, %rax, 8), %xmm3 + mulps %xmm1, %xmm0 + mulps -28 * SIZE(AO, %rax, 4), %xmm1 + addps %xmm0, %xmm10 + movaps -24 * SIZE(AO, %rax, 4), %xmm0 + addps %xmm1, %xmm14 + movaps -16 * SIZE(BO, %rax, 8), %xmm1 + mulps %xmm3, %xmm2 + mulps -28 * SIZE(AO, %rax, 4), %xmm3 + addps %xmm2, %xmm11 + addps %xmm3, %xmm15 + movaps -12 * SIZE(BO, %rax, 8), %xmm3 + movaps %xmm0, %xmm2 + + addq $SIZE * 2, %rax + jl .L17 + ALIGN_4 + +.L18: +#ifndef TRMMKERNEL + movups 0 * SIZE(CO1), %xmm0 + movups 4 * SIZE(CO1), %xmm1 + movups 0 * SIZE(CO2), %xmm2 + movups 4 * SIZE(CO2), %xmm3 +#endif + + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm9 + mulps %xmm7, %xmm10 + mulps 
%xmm7, %xmm11 + + mulps %xmm7, %xmm12 + mulps %xmm7, %xmm13 + mulps %xmm7, %xmm14 + mulps %xmm7, %xmm15 + +#ifndef TRMMKERNEL + movups 0 * SIZE(CO1, LDC, 2), %xmm4 + movups 4 * SIZE(CO1, LDC, 2), %xmm5 + movups 0 * SIZE(CO2, LDC, 2), %xmm6 + movups 4 * SIZE(CO2, LDC, 2), %xmm7 + + addps %xmm0, %xmm8 + addps %xmm1, %xmm12 + addps %xmm2, %xmm9 + addps %xmm3, %xmm13 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movsd %xmm12, 4 * SIZE(CO1) + movhps %xmm12, 6 * SIZE(CO1) + + movsd %xmm9, 0 * SIZE(CO2) + movhps %xmm9, 2 * SIZE(CO2) + movsd %xmm13, 4 * SIZE(CO2) + movhps %xmm13, 6 * SIZE(CO2) + +#ifndef TRMMKERNEL + addps %xmm4, %xmm10 + addps %xmm5, %xmm14 + addps %xmm6, %xmm11 + addps %xmm7, %xmm15 +#endif + + movsd %xmm10, 0 * SIZE(CO1, LDC, 2) + movhps %xmm10, 2 * SIZE(CO1, LDC, 2) + movsd %xmm14, 4 * SIZE(CO1, LDC, 2) + movhps %xmm14, 6 * SIZE(CO1, LDC, 2) + + movsd %xmm11, 0 * SIZE(CO2, LDC, 2) + movhps %xmm11, 2 * SIZE(CO2, LDC, 2) + movsd %xmm15, 4 * SIZE(CO2, LDC, 2) + movhps %xmm15, 6 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 4 + addq $8 * SIZE, CO2 # coffset += 4 + decq I # i -- + jg .L11 + ALIGN_4 + +.L20: + testq $4, M + je .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -16 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L25 + ALIGN_4 + +.L22: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps -28 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + mulps 28 * SIZE(BO), %xmm8 + addps %xmm11, %xmm2 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm8, %xmm3 + movaps -24 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm13 + addps %xmm13, %xmm0 + movaps 36 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm1 + movaps 40 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + mulps 44 * SIZE(BO), %xmm8 + addps %xmm13, %xmm2 + movaps 96 * SIZE(BO), %xmm13 + addps %xmm8, %xmm3 + movaps -20 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm15 + addps %xmm15, %xmm0 + movaps 52 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm1 + 
movaps 56 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + mulps 60 * SIZE(BO), %xmm8 + addps %xmm15, %xmm2 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm8, %xmm3 + movaps 0 * SIZE(AO), %xmm8 + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm10, %xmm9 + addps %xmm9, %xmm0 + movaps 68 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm1 + movaps 72 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + mulps 76 * SIZE(BO), %xmm10 + addps %xmm9, %xmm2 + movaps 128 * SIZE(BO), %xmm9 + addps %xmm10, %xmm3 + movaps -12 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movaps 84 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm1 + movaps 88 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + mulps 92 * SIZE(BO), %xmm10 + addps %xmm11, %xmm2 + movaps 144 * SIZE(BO), %xmm11 + addps %xmm10, %xmm3 + movaps -8 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movaps 100 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm1 + movaps 104 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + mulps 108 * SIZE(BO), %xmm10 + addps %xmm13, %xmm2 + movaps 160 * SIZE(BO), %xmm13 + addps %xmm10, %xmm3 + movaps -4 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movaps 116 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm1 + movaps 120 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + mulps 124 * SIZE(BO), %xmm10 + addps %xmm15, %xmm2 + movaps 176 * SIZE(BO), %xmm15 + addps %xmm10, %xmm3 + movaps 16 * SIZE(AO), %xmm10 + + addq $ 32 * SIZE, AO + addq $128 * SIZE, BO + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 16 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps -28 * SIZE(AO), %xmm8 + + addq $ 4 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L26 + ALIGN_4 + +.L28: + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm1 + mulps %xmm15, %xmm2 + mulps %xmm15, %xmm3 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 0 * SIZE(CO2), %xmm10 + movhps 2 * SIZE(CO2), %xmm10 + + movsd 0 * SIZE(CO1, LDC, 2), %xmm12 + movhps 2 * SIZE(CO1, LDC, 2), %xmm12 + movsd 0 * SIZE(CO2, LDC, 2), %xmm14 + movhps 2 * SIZE(CO2, LDC, 2), %xmm14 + + addps %xmm8, %xmm0 + addps %xmm10, %xmm1 + addps %xmm12, %xmm2 + addps %xmm14, %xmm3 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO2) + movhps %xmm1, 2 * SIZE(CO2) + + movsd %xmm2, 0 * SIZE(CO1, LDC, 2) + movhps %xmm2, 2 * SIZE(CO1, LDC, 2) + movsd %xmm3, 0 * SIZE(CO2, LDC, 2) + movhps %xmm3, 2 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L30: + testq $2, M + je .L40 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && 
!defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -24 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L35 + ALIGN_4 + +.L32: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movsd 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movsd 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movsd 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movsd 64 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movsd 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movsd 24 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movsd 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd -28 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movsd 80 * SIZE(BO), %xmm11 + + mulps %xmm8, %xmm13 + addps %xmm13, %xmm0 + movsd 36 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm1 + movsd 40 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm2 + movsd 44 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + movsd -26 * SIZE(AO), %xmm8 + addps %xmm13, %xmm3 + movsd 96 * SIZE(BO), %xmm13 + + mulps %xmm8, %xmm15 + addps %xmm15, %xmm0 + movsd 52 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm1 + movsd 56 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm2 + movsd 60 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + movsd -16 * SIZE(AO), %xmm8 + addps %xmm15, %xmm3 + movsd 112 * SIZE(BO), %xmm15 + + mulps %xmm10, %xmm9 + addps %xmm9, %xmm0 + movsd 68 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm1 + movsd 72 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm2 + movsd 76 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movsd -22 * SIZE(AO), %xmm10 + addps %xmm9, %xmm3 + movsd 128 * SIZE(BO), %xmm9 + + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movsd 84 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm1 + movsd 88 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm2 + movsd 92 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movsd -20 * SIZE(AO), %xmm10 + addps %xmm11, %xmm3 + movsd 144 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movsd 100 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm1 + movsd 104 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movsd 108 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd -18 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movsd 160 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movsd 116 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm1 + movsd 120 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movsd 124 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd -8 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movsd 176 * SIZE(BO), %xmm15 + + addq $ 16 * SIZE, AO + addq $128 * SIZE, BO + decq 
%rax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movsd 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movsd 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movsd 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movsd 16 * SIZE(BO), %xmm9 + + addq $ 2 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L36 + ALIGN_4 + +.L38: + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm1 + mulps %xmm15, %xmm2 + mulps %xmm15, %xmm3 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm8 + movsd 0 * SIZE(CO2), %xmm10 + movsd 0 * SIZE(CO1, LDC, 2), %xmm12 + movsd 0 * SIZE(CO2, LDC, 2), %xmm14 + + addps %xmm8, %xmm0 + addps %xmm10, %xmm1 + addps %xmm12, %xmm2 + addps %xmm14, %xmm3 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO2) + movsd %xmm2, 0 * SIZE(CO1, LDC, 2) + movsd %xmm3, 0 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 4 + addq $2 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L40: + testq $1, M + je .L49 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO + leaq (BO, %rax, 8), BO +#endif + + movss -32 * SIZE(AO), %xmm8 + movss -28 * SIZE(AO), %xmm10 + + movss 0 * SIZE(BO), %xmm9 + movss 16 * SIZE(BO), %xmm11 + movss 32 * SIZE(BO), %xmm13 + movss 48 * SIZE(BO), %xmm15 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L45 + ALIGN_4 + +.L42: + mulss %xmm8, %xmm9 + addss %xmm9, %xmm0 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movss 4 * SIZE(BO), %xmm9 + mulss %xmm8, %xmm9 + addss %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + mulss %xmm8, %xmm9 + addss %xmm9, %xmm2 + movss 12 * SIZE(BO), %xmm9 + mulss %xmm8, %xmm9 + movss -31 * SIZE(AO), %xmm8 + addss %xmm9, %xmm3 + movss 64 * SIZE(BO), %xmm9 + + mulss %xmm8, %xmm11 + addss %xmm11, %xmm0 + movss 20 * SIZE(BO), %xmm11 + mulss %xmm8, %xmm11 + addss %xmm11, %xmm1 + movss 24 * SIZE(BO), %xmm11 + mulss %xmm8, %xmm11 + addss %xmm11, %xmm2 + movss 28 * SIZE(BO), %xmm11 + mulss %xmm8, %xmm11 + movss -30 * SIZE(AO), %xmm8 + addss %xmm11, %xmm3 + movss 80 * SIZE(BO), %xmm11 + + mulss %xmm8, %xmm13 + addss %xmm13, %xmm0 + movss 36 * SIZE(BO), %xmm13 + mulss %xmm8, %xmm13 + addss %xmm13, %xmm1 + movss 40 * SIZE(BO), %xmm13 + mulss %xmm8, %xmm13 + addss %xmm13, %xmm2 + movss 44 * SIZE(BO), %xmm13 + mulss %xmm8, %xmm13 + movss -29 * SIZE(AO), %xmm8 + addss %xmm13, %xmm3 + movss 96 * SIZE(BO), %xmm13 + + mulss %xmm8, %xmm15 + 
addss %xmm15, %xmm0 + movss 52 * SIZE(BO), %xmm15 + mulss %xmm8, %xmm15 + addss %xmm15, %xmm1 + movss 56 * SIZE(BO), %xmm15 + mulss %xmm8, %xmm15 + addss %xmm15, %xmm2 + movss 60 * SIZE(BO), %xmm15 + mulss %xmm8, %xmm15 + movss -24 * SIZE(AO), %xmm8 + addss %xmm15, %xmm3 + movss 112 * SIZE(BO), %xmm15 + + mulss %xmm10, %xmm9 + addss %xmm9, %xmm0 + movss 68 * SIZE(BO), %xmm9 + mulss %xmm10, %xmm9 + addss %xmm9, %xmm1 + movss 72 * SIZE(BO), %xmm9 + mulss %xmm10, %xmm9 + addss %xmm9, %xmm2 + movss 76 * SIZE(BO), %xmm9 + mulss %xmm10, %xmm9 + movss -27 * SIZE(AO), %xmm10 + addss %xmm9, %xmm3 + movss 128 * SIZE(BO), %xmm9 + + mulss %xmm10, %xmm11 + addss %xmm11, %xmm0 + movss 84 * SIZE(BO), %xmm11 + mulss %xmm10, %xmm11 + addss %xmm11, %xmm1 + movss 88 * SIZE(BO), %xmm11 + mulss %xmm10, %xmm11 + addss %xmm11, %xmm2 + movss 92 * SIZE(BO), %xmm11 + mulss %xmm10, %xmm11 + movss -26 * SIZE(AO), %xmm10 + addss %xmm11, %xmm3 + movss 144 * SIZE(BO), %xmm11 + + mulss %xmm10, %xmm13 + addss %xmm13, %xmm0 + movss 100 * SIZE(BO), %xmm13 + mulss %xmm10, %xmm13 + addss %xmm13, %xmm1 + movss 104 * SIZE(BO), %xmm13 + mulss %xmm10, %xmm13 + addss %xmm13, %xmm2 + movss 108 * SIZE(BO), %xmm13 + mulss %xmm10, %xmm13 + movss -25 * SIZE(AO), %xmm10 + addss %xmm13, %xmm3 + movss 160 * SIZE(BO), %xmm13 + + mulss %xmm10, %xmm15 + addss %xmm15, %xmm0 + movss 116 * SIZE(BO), %xmm15 + mulss %xmm10, %xmm15 + addss %xmm15, %xmm1 + movss 120 * SIZE(BO), %xmm15 + mulss %xmm10, %xmm15 + addss %xmm15, %xmm2 + movss 124 * SIZE(BO), %xmm15 + mulss %xmm10, %xmm15 + movss -20 * SIZE(AO), %xmm10 + addss %xmm15, %xmm3 + movss 176 * SIZE(BO), %xmm15 + + addq $ 8 * SIZE, AO + addq $128 * SIZE, BO + decq %rax + jne .L42 + ALIGN_4 + +.L45: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L48 + ALIGN_4 + +.L46: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movss 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movss 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss -31 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movss 16 * SIZE(BO), %xmm9 + + addq $ 1 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L46 + ALIGN_4 + +.L48: + mulss %xmm15, %xmm0 + mulss %xmm15, %xmm1 + mulss %xmm15, %xmm2 + mulss %xmm15, %xmm3 + +#ifndef TRMMKERNEL + movss 0 * SIZE(CO1), %xmm8 + movss 0 * SIZE(CO2), %xmm10 + movss 0 * SIZE(CO1, LDC, 2), %xmm12 + movss 0 * SIZE(CO2, LDC, 2), %xmm14 + + addss %xmm8, %xmm0 + addss %xmm10, %xmm1 + addss %xmm12, %xmm2 + addss %xmm14, %xmm3 +#endif + + movss %xmm0, 0 * SIZE(CO1) + movss %xmm1, 0 * SIZE(CO2) + movss %xmm2, 0 * SIZE(CO1, LDC, 2) + movss %xmm3, 0 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L49: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + leaq (C, LDC, 4), C # c += 4 * ldc + decq J # j -- + jg .L01 + +.L50: + testq $2, N + je .L100 + +.L51: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + + movq K, %rax + sarq $2, %rax + jle .L53 + ALIGN_4 + +.L52: + prefetch (RPREFETCHSIZE + 0) * SIZE(B) + + 
movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + + prefetchw (WPREFETCHSIZE + 0) * SIZE(BO) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + prefetchw (WPREFETCHSIZE + 16) * SIZE(BO) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO + + decq %rax + jne .L52 + ALIGN_4 + +.L53: + movq K, %rax + andq $3, %rax + BRANCH + jle .L60 + ALIGN_4 + +.L54: + movsd 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + + addq $ 2 * SIZE, B + addq $ 8 * SIZE, BO + decq %rax + jne .L54 + ALIGN_4 + +.L60: + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + movq M, I + sarq $3, I # i = (m >> 3) + jle .L70 + ALIGN_4 + +.L61: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -16 * SIZE(AO), %xmm10 + movaps 0 * SIZE(AO), %xmm12 + movaps 16 * SIZE(AO), %xmm14 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + + prefetchw 4 * SIZE(CO1) + xorps %xmm4, %xmm4 + prefetchw 4 * SIZE(CO2) + xorps %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L65 + ALIGN_4 + +.L62: + mulps %xmm8, %xmm9 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 0 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps -28 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps -24 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps -20 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps 32 * SIZE(AO), %xmm8 + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm10, %xmm11 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm11, %xmm0 + movaps 16 * SIZE(BO), %xmm11 + addps %xmm10, %xmm1 + movaps -12 * SIZE(AO), %xmm10 + mulps %xmm10, %xmm11 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm11, %xmm4 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm10, %xmm5 + movaps -8 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm11 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm11, %xmm0 + 
movaps 24 * SIZE(BO), %xmm11 + addps %xmm10, %xmm1 + movaps -4 * SIZE(AO), %xmm10 + mulps %xmm10, %xmm11 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm11, %xmm4 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm10, %xmm5 + movaps 48 * SIZE(AO), %xmm10 + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) +#endif + mulps %xmm12, %xmm13 + mulps 36 * SIZE(BO), %xmm12 + addps %xmm13, %xmm0 + movaps 32 * SIZE(BO), %xmm13 + addps %xmm12, %xmm1 + movaps 4 * SIZE(AO), %xmm12 + mulps %xmm12, %xmm13 + mulps 36 * SIZE(BO), %xmm12 + addps %xmm13, %xmm4 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm12, %xmm5 + movaps 8 * SIZE(AO), %xmm12 + + mulps %xmm12, %xmm13 + mulps 44 * SIZE(BO), %xmm12 + addps %xmm13, %xmm0 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm12, %xmm1 + movaps 12 * SIZE(AO), %xmm12 + mulps %xmm12, %xmm13 + mulps 44 * SIZE(BO), %xmm12 + addps %xmm13, %xmm4 + movaps 96 * SIZE(BO), %xmm13 + addps %xmm12, %xmm5 + movaps 64 * SIZE(AO), %xmm12 + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) +#endif + mulps %xmm14, %xmm15 + mulps 52 * SIZE(BO), %xmm14 + addps %xmm15, %xmm0 + movaps 48 * SIZE(BO), %xmm15 + addps %xmm14, %xmm1 + movaps 20 * SIZE(AO), %xmm14 + mulps %xmm14, %xmm15 + mulps 52 * SIZE(BO), %xmm14 + addps %xmm15, %xmm4 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm14, %xmm5 + movaps 24 * SIZE(AO), %xmm14 + + mulps %xmm14, %xmm15 + mulps 60 * SIZE(BO), %xmm14 + addps %xmm15, %xmm0 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm14, %xmm1 + movaps 28 * SIZE(AO), %xmm14 + mulps %xmm14, %xmm15 + mulps 60 * SIZE(BO), %xmm14 + addps %xmm15, %xmm4 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm14, %xmm5 + movaps 80 * SIZE(AO), %xmm14 + + addq $64 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L62 + ALIGN_4 + +.L65: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 0 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps -28 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps -24 * SIZE(AO), %xmm8 + + addq $8 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L66 + ALIGN_4 + +.L68: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 4 * SIZE(CO1), %xmm9 + movhps 6 * SIZE(CO1), %xmm9 + + movsd 0 * SIZE(CO2), %xmm10 + movhps 2 * SIZE(CO2), %xmm10 + movsd 4 * SIZE(CO2), %xmm11 + movhps 6 * SIZE(CO2), %xmm11 +#endif + + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm4 + mulps %xmm15, %xmm1 + mulps %xmm15, %xmm5 + +#ifndef TRMMKERNEL + addps %xmm8, %xmm0 + addps %xmm9, %xmm4 + addps %xmm10, %xmm1 + addps %xmm11, %xmm5 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movsd %xmm4, 4 * SIZE(CO1) + movhps %xmm4, 6 * SIZE(CO1) + + movsd %xmm1, 0 * SIZE(CO2) + movhps %xmm1, 2 * SIZE(CO2) + movsd %xmm5, 4 * SIZE(CO2) + movhps %xmm5, 6 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 4 + addq $8 * SIZE, CO2 # coffset += 4 + decq I # i -- + jg .L61 + ALIGN_4 + 
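The block that ends here (.L61 through .L68) is the 8-row by 2-column register tile for the N & 2 case: the unrolled k-loop accumulates into xmm0/xmm1/xmm4/xmm5, the k remainder is finished one step at a time at .L66, and the epilogue at .L68 scales by ALPHA and, unless TRMMKERNEL is defined, adds the existing C values before storing. Control then falls through to progressively narrower row tails (.L70 for 4 rows, .L80 for 2, .L90 for 1). A hedged C sketch of that epilogue for one 8x2 tile, with illustrative names (store_tile_8x2, acc_c1, acc_c2):

    /* Sketch of the .L68 store path above: c = alpha*acc + c for the GEMM build,
       c = alpha*acc for the TRMM build (the C load/add is compiled out). */
    static void store_tile_8x2(float *co1, float *co2, float alpha,
                               const float acc_c1[8], const float acc_c2[8])
    {
        for (int i = 0; i < 8; i++) {
            float v0 = alpha * acc_c1[i];   /* mulps ALPHA, ...          */
            float v1 = alpha * acc_c2[i];
    #ifndef TRMMKERNEL
            v0 += co1[i];                   /* movsd/movhps loads of C   */
            v1 += co2[i];
    #endif
            co1[i] = v0;                    /* movsd/movhps stores to C  */
            co2[i] = v1;
        }
    }

Overwriting rather than accumulating in the TRMM build matches the B := alpha*op(A)*B definition, where the output tile is owned outright by the update.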
+.L70: + testq $4, M + je .L80 + + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -16 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L75 + ALIGN_4 + +.L72: + mulps %xmm8, %xmm9 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps -28 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps -24 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm11 + mulps 20 * SIZE(BO), %xmm8 + addps %xmm11, %xmm0 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm8, %xmm1 + movaps -20 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm11 + mulps 28 * SIZE(BO), %xmm8 + addps %xmm11, %xmm2 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm8, %xmm3 + movaps 0 * SIZE(AO), %xmm8 + + mulps %xmm10, %xmm13 + mulps 36 * SIZE(BO), %xmm10 + addps %xmm13, %xmm0 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm10, %xmm1 + movaps -12 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm13 + mulps 44 * SIZE(BO), %xmm10 + addps %xmm13, %xmm2 + movaps 96 * SIZE(BO), %xmm13 + addps %xmm10, %xmm3 + movaps -8 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm15 + mulps 52 * SIZE(BO), %xmm10 + addps %xmm15, %xmm0 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm10, %xmm1 + movaps -4 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm15 + mulps 60 * SIZE(BO), %xmm10 + addps %xmm15, %xmm2 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm10, %xmm3 + movaps 16 * SIZE(AO), %xmm10 + + addq $32 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L72 + ALIGN_4 + +.L75: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps -28 * SIZE(AO), %xmm8 + + addq $4 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L76 + ALIGN_4 + +.L78: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 0 * SIZE(CO2), %xmm10 + movhps 2 * SIZE(CO2), %xmm10 +#endif + + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm1 + +#ifndef TRMMKERNEL + addps %xmm8, %xmm0 + addps %xmm10, %xmm1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO2) + movhps %xmm1, 2 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq 
$4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L80: + testq $2, M + je .L90 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -24 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L85 + ALIGN_4 + +.L82: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movsd 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 8 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movsd 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd -28 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movsd 64 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movsd 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd -26 * SIZE(AO), %xmm8 + addps %xmm11, %xmm1 + movsd 24 * SIZE(BO), %xmm11 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movsd 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd -16 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movsd 80 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movsd 36 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd -22 * SIZE(AO), %xmm10 + addps %xmm13, %xmm1 + movsd 40 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movsd 44 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd -20 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movsd 96 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movsd 52 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd -18 * SIZE(AO), %xmm10 + addps %xmm15, %xmm1 + movsd 56 * SIZE(BO), %xmm15 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movsd 60 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd -8 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movsd 112 * SIZE(BO), %xmm15 + + addq $16 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L82 + ALIGN_4 + +.L85: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L88 + ALIGN_4 + +.L86: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movsd 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 8 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L86 + ALIGN_4 + +.L88: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm8 + movsd 0 * SIZE(CO2), %xmm10 +#endif + + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm1 + +#ifndef TRMMKERNEL + addps %xmm8, %xmm0 + addps %xmm10, %xmm1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && 
!defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 4 + addq $2 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L90: + testq $1, M + je .L99 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + + movss -32 * SIZE(AO), %xmm8 + movss -28 * SIZE(AO), %xmm10 + + movss 0 * SIZE(BO), %xmm9 + movss 16 * SIZE(BO), %xmm11 + movss 32 * SIZE(BO), %xmm13 + movss 48 * SIZE(BO), %xmm15 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L95 + ALIGN_4 + +.L92: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movss 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss -31 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movss 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movss 64 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movss 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movss -29 * SIZE(AO), %xmm8 + addps %xmm11, %xmm1 + movss 24 * SIZE(BO), %xmm11 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movss 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movss -24 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movss 80 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movss 36 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movss -27 * SIZE(AO), %xmm10 + addps %xmm13, %xmm1 + movss 40 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movss 44 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movss -26 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movss 96 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movss 52 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movss -25 * SIZE(AO), %xmm10 + addps %xmm15, %xmm1 + movss 56 * SIZE(BO), %xmm15 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movss 60 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movss -20 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movss 112 * SIZE(BO), %xmm15 + + addq $ 8 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L92 + ALIGN_4 + +.L95: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L98 + ALIGN_4 + +.L96: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movss 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss -31 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L96 + ALIGN_4 + +.L98: +#ifndef TRMMKERNEL + movss 0 * SIZE(CO1), %xmm8 + movss 0 * SIZE(CO2), %xmm10 +#endif + + addss %xmm2, %xmm0 + addss %xmm3, %xmm1 + mulss %xmm15, %xmm0 + mulss %xmm15, %xmm1 + +#ifndef TRMMKERNEL + addss %xmm8, %xmm0 + addss %xmm10, %xmm1 +#endif + + movss 
%xmm0, 0 * SIZE(CO1) + movss %xmm1, 0 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + leaq (C, LDC, 2), C # c += 4 * ldc + ALIGN_4 + + +.L100: + testq $1, N + je .L999 + +.L101: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + + movq K, %rax + sarq $3, %rax + jle .L103 + ALIGN_4 + + +.L102: + prefetch (RPREFETCHSIZE + 0) * SIZE(B) + + movups 0 * SIZE(B), %xmm3 + movups 4 * SIZE(B), %xmm7 + + prefetchw (WPREFETCHSIZE + 0) * SIZE(BO) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + prefetchw (WPREFETCHSIZE + 16) * SIZE(BO) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO + + decq %rax + jne .L102 + ALIGN_4 + +.L103: + movq K, %rax + andq $7, %rax + BRANCH + jle .L110 + ALIGN_4 + +.L104: + movss 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + + movaps %xmm0, 0 * SIZE(BO) + + addq $ 1 * SIZE, B + addq $ 4 * SIZE, BO + decq %rax + jne .L104 + ALIGN_4 + +.L110: + movq C, CO1 # coffset1 = c + movq A, AO # aoffset = a + + movq M, I + sarq $3, I # i = (m >> 3) + jle .L120 + ALIGN_4 + +.L111: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -16 * SIZE(AO), %xmm10 + movaps 0 * SIZE(AO), %xmm12 + movaps 16 * SIZE(AO), %xmm14 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + + prefetchw 4 * SIZE(CO1) + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L115 + ALIGN_4 + +.L112: + mulps %xmm9, %xmm8 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + + mulps -28 * SIZE(AO), %xmm9 + addps %xmm8, %xmm0 + movaps -24 * SIZE(AO), %xmm8 + addps %xmm9, %xmm4 + movaps 4 * SIZE(BO), %xmm9 + + mulps %xmm9, %xmm8 + mulps -20 * SIZE(AO), %xmm9 + addps %xmm8, %xmm0 + movaps 32 * SIZE(AO), %xmm8 + addps %xmm9, %xmm4 + movaps 8 * SIZE(BO), %xmm9 + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm9, %xmm10 + mulps -12 * SIZE(AO), %xmm9 + addps %xmm10, %xmm0 + movaps -8 * SIZE(AO), %xmm10 + addps %xmm9, %xmm4 + movaps 12 * 
SIZE(BO), %xmm9 + + mulps %xmm9, %xmm10 + mulps -4 * SIZE(AO), %xmm9 + addps %xmm10, %xmm0 + movaps 48 * SIZE(AO), %xmm10 + addps %xmm9, %xmm4 + movaps 32 * SIZE(BO), %xmm9 + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) +#endif + mulps %xmm11, %xmm12 + mulps 4 * SIZE(AO), %xmm11 + addps %xmm12, %xmm0 + movaps 8 * SIZE(AO), %xmm12 + addps %xmm11, %xmm4 + movaps 20 * SIZE(BO), %xmm11 + + mulps %xmm11, %xmm12 + mulps 12 * SIZE(AO), %xmm11 + addps %xmm12, %xmm0 + movaps 64 * SIZE(AO), %xmm12 + addps %xmm11, %xmm4 + movaps 24 * SIZE(BO), %xmm11 + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) +#endif + mulps %xmm11, %xmm14 + mulps 20 * SIZE(AO), %xmm11 + addps %xmm14, %xmm0 + movaps 24 * SIZE(AO), %xmm14 + addps %xmm11, %xmm4 + movaps 28 * SIZE(BO), %xmm11 + + mulps %xmm11, %xmm14 + mulps 28 * SIZE(AO), %xmm11 + addps %xmm14, %xmm0 + movaps 80 * SIZE(AO), %xmm14 + addps %xmm11, %xmm4 + movaps 48 * SIZE(BO), %xmm11 + + addq $64 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L112 + ALIGN_4 + +.L115: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulps %xmm9, %xmm8 + mulps -28 * SIZE(AO), %xmm9 + addps %xmm8, %xmm0 + movaps -24 * SIZE(AO), %xmm8 + addps %xmm9, %xmm4 + movaps 4 * SIZE(BO), %xmm9 + + addq $8 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L116 + ALIGN_4 + +.L118: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 4 * SIZE(CO1), %xmm9 + movhps 6 * SIZE(CO1), %xmm9 +#endif + + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm4 +#ifndef TRMMKERNEL + addps %xmm8, %xmm0 + addps %xmm9, %xmm4 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movsd %xmm4, 4 * SIZE(CO1) + movhps %xmm4, 6 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L111 + ALIGN_4 + +.L120: + testq $4, M + je .L130 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -16 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L125 + ALIGN_4 + +.L122: + mulps %xmm8, %xmm9 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps -28 * SIZE(AO), %xmm8 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 32 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps -24 * SIZE(AO), %xmm8 + mulps 8 * SIZE(BO), %xmm8 + addps %xmm8, %xmm2 
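The .L120/.L122 block running through here (it continues below) is the single-column (N & 1) path for a 4-row tail: each pass multiplies four A values against the broadcast column of B and keeps several independent accumulators (xmm0 through xmm3) that are only summed in the .L128 epilogue, which hides the latency of chained addps. As a reference, a scalar C sketch of what one such column pass computes, with an illustrative name (gemm_n1_m4) and two accumulators standing in for the four registers:

    /* Hedged sketch: a reads 4 values per k-step as the kernel accesses AO,
       b is the broadcast buffer with 4 copies of each B scalar per k-step. */
    static void gemm_n1_m4(long k, float alpha, const float *a,
                           const float *b, float *c)
    {
        float acc0[4] = {0}, acc1[4] = {0};
        long p = 0;
        for (; p + 1 < k; p += 2)             /* unrolled by two; .L122 unrolls by eight */
            for (int i = 0; i < 4; i++) {
                acc0[i] += a[4 * p + i]       * b[4 * p + i];
                acc1[i] += a[4 * (p + 1) + i] * b[4 * (p + 1) + i];
            }
        for (; p < k; p++)                    /* remainder, as in .L126 */
            for (int i = 0; i < 4; i++)
                acc0[i] += a[4 * p + i] * b[4 * p + i];
        for (int i = 0; i < 4; i++) {
            float v = alpha * (acc0[i] + acc1[i]);
    #ifndef TRMMKERNEL
            v += c[i];                        /* movsd/movhps load of CO1 at .L128 */
    #endif
            c[i] = v;
        }
    }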
+ movaps -20 * SIZE(AO), %xmm8 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm8, %xmm3 + movaps 0 * SIZE(AO), %xmm8 + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm10, %xmm11 + movaps -12 * SIZE(AO), %xmm10 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm11, %xmm0 + movaps 48 * SIZE(BO), %xmm11 + addps %xmm10, %xmm1 + movaps -8 * SIZE(AO), %xmm10 + mulps 24 * SIZE(BO), %xmm10 + addps %xmm10, %xmm2 + movaps -4 * SIZE(AO), %xmm10 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm10, %xmm3 + movaps 16 * SIZE(AO), %xmm10 + + addq $32 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L122 + ALIGN_4 + +.L125: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L128 + ALIGN_4 + +.L126: + mulps %xmm8, %xmm9 + movaps -28 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + + addq $4 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L126 + ALIGN_4 + +.L128: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 +#endif + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + addps %xmm2, %xmm0 + + mulps %xmm15, %xmm0 +#ifndef TRMMKERNEL + addps %xmm8, %xmm0 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + +.L130: + testq $2, M + je .L140 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -24 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L135 + ALIGN_4 + +.L132: + mulps %xmm8, %xmm9 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movsd -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movsd 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd -28 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 8 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + movsd -26 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movsd 12 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + movsd -16 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 32 * SIZE(BO), %xmm9 + + mulps %xmm10, %xmm11 + movsd -22 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movsd 20 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm11 + movsd -20 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movsd 24 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm11 + movsd -18 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movsd 28 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm11 + movsd -8 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movsd 48 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + 
addq $32 * SIZE, BO + decq %rax + jne .L132 + ALIGN_4 + +.L135: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L138 + ALIGN_4 + +.L136: + mulps %xmm8, %xmm9 + movsd -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movsd 4 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L136 + ALIGN_4 + +.L138: + addps %xmm1, %xmm0 + mulps %xmm15, %xmm0 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm8 + addps %xmm8, %xmm0 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 4 + ALIGN_4 + +.L140: + testq $1, M + je .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + movss -32 * SIZE(AO), %xmm8 + movss -28 * SIZE(AO), %xmm10 + + movss 0 * SIZE(BO), %xmm9 + movss 16 * SIZE(BO), %xmm11 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L145 + ALIGN_4 + +.L142: + mulss %xmm8, %xmm9 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movss -31 * SIZE(AO), %xmm8 + mulss 4 * SIZE(BO), %xmm8 + addss %xmm9, %xmm0 + movss 32 * SIZE(BO), %xmm9 + addss %xmm8, %xmm1 + movss -30 * SIZE(AO), %xmm8 + mulss 8 * SIZE(BO), %xmm8 + addss %xmm8, %xmm2 + movss -29 * SIZE(AO), %xmm8 + mulss 12 * SIZE(BO), %xmm8 + addss %xmm8, %xmm3 + movss -24 * SIZE(AO), %xmm8 + mulss %xmm10, %xmm11 + movss -27 * SIZE(AO), %xmm10 + mulss 20 * SIZE(BO), %xmm10 + addss %xmm11, %xmm0 + movss 48 * SIZE(BO), %xmm11 + addss %xmm10, %xmm1 + movss -26 * SIZE(AO), %xmm10 + mulss 24 * SIZE(BO), %xmm10 + addss %xmm10, %xmm2 + movss -25 * SIZE(AO), %xmm10 + mulss 28 * SIZE(BO), %xmm10 + addss %xmm10, %xmm3 + movss -20 * SIZE(AO), %xmm10 + + addq $ 8 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L142 + ALIGN_4 + +.L145: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movss ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L148 + ALIGN_4 + +.L146: + mulss %xmm8, %xmm9 + movss -31 * SIZE(AO), %xmm8 + addss %xmm9, %xmm0 + movss 4 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + decq %rax + jg .L146 + ALIGN_4 + +.L148: + addss %xmm1, %xmm0 + addss %xmm3, %xmm2 + addss %xmm2, %xmm0 + + mulss %xmm15, %xmm0 + +#ifndef TRMMKERNEL + movss 0 * SIZE(CO1), %xmm8 + addss %xmm8, %xmm0 +#endif + movss %xmm0, 0 * SIZE(CO1) + ALIGN_4 + +.L999: + movq %rbx, %rsp + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + 
movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/gemm_kernel_8x4_core2.S b/kernel/x86_64/gemm_kernel_8x4_core2.S new file mode 100644 index 0000000000..285d6441e6 --- /dev/null +++ b/kernel/x86_64/gemm_kernel_8x4_core2.S @@ -0,0 +1,2615 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define ALPHA 0(%rsp) +#define J 16(%rsp) +#define OFFSET 24(%rsp) +#define KK 32(%rsp) +#define KKK 40(%rsp) +#define BUFFER 128(%rsp) + +#define PREFETCH_R (16 * 4 + 0) +#define PREFETCH_W (PREFETCH_R * 2) + +#define PREFETCHSIZE (16 * 13 + 10) +#define PREFETCH prefetcht0 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + movaps %xmm3, %xmm0 + +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + +#endif + + movq %rsp, %r15 # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + shufps $0, %xmm0, %xmm0 + movaps %xmm0, ALPHA + + subq $-32 * SIZE, A + subq $-32 * SIZE, B + +#ifdef TRMMKERNEL + movsd %xmm12, OFFSET + movsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq OLD_M, M + movq OLD_N, N + + leaq (, LDC, SIZE), LDC + + movq N, J + sarq $2, J + jle .L50 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq 32 * SIZE + BUFFER, BO + + movaps -32 * SIZE(B), %xmm3 + + movq K, %rax + sarq $2, %rax + jle .L05 + ALIGN_4 + +.L02: + prefetcht0 (PREFETCH_R + 0) * SIZE(B) + movaps -28 * SIZE(B), %xmm7 + movaps -24 * SIZE(B), %xmm11 + movaps -20 * SIZE(B), %xmm15 + + prefetcht0 (PREFETCH_W + 0) * SIZE(BO) + pshufd $0x00, %xmm3, %xmm0 + movaps %xmm0, -32 * SIZE(BO) + pshufd $0x55, %xmm3, %xmm1 + movaps %xmm1, -28 * SIZE(BO) + pshufd $0xaa, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(BO) + pshufd $0xff, %xmm3, %xmm3 + movaps %xmm3, -20 * SIZE(BO) + + movaps -16 * SIZE(B), %xmm3 + + prefetcht0 (PREFETCH_W + 16) * SIZE(BO) + pshufd $0x00, %xmm7, %xmm4 + movaps %xmm4, -16 * SIZE(BO) + pshufd $0x55, %xmm7, %xmm5 + movaps %xmm5, -12 * SIZE(BO) + pshufd $0xaa, %xmm7, %xmm6 + movaps %xmm6, -8 * SIZE(BO) + pshufd $0xff, %xmm7, %xmm7 + movaps %xmm7, -4 * SIZE(BO) + + prefetcht0 (PREFETCH_W + 32) * SIZE(BO) + + pshufd $0x00, %xmm11, %xmm8 + movaps %xmm8, 0 * SIZE(BO) + pshufd $0x55, %xmm11, %xmm9 + movaps %xmm9, 4 * SIZE(BO) + pshufd $0xaa, %xmm11, %xmm10 + movaps %xmm10, 8 * SIZE(BO) + pshufd $0xff, %xmm11, %xmm11 + movaps %xmm11, 12 * SIZE(BO) + + prefetcht0 (PREFETCH_W + 
48) * SIZE(BO) + + pshufd $0x00, %xmm15, %xmm12 + movaps %xmm12, 16 * SIZE(BO) + pshufd $0x55, %xmm15, %xmm13 + movaps %xmm13, 20 * SIZE(BO) + pshufd $0xaa, %xmm15, %xmm14 + movaps %xmm14, 24 * SIZE(BO) + pshufd $0xff, %xmm15, %xmm15 + movaps %xmm15, 28 * SIZE(BO) + + subq $-16 * SIZE, B + subq $-64 * SIZE, BO + subq $1, %rax + jne .L02 + ALIGN_4 + +.L05: + movq K, %rax + andq $3, %rax + BRANCH + jle .L10 + ALIGN_4 + +.L06: + movaps -32 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, -32 * SIZE(BO) + movaps %xmm1, -28 * SIZE(BO) + movaps %xmm2, -24 * SIZE(BO) + movaps %xmm3, -20 * SIZE(BO) + + addq $ 4 * SIZE, B + addq $16 * SIZE, BO + subq $1, %rax + jne .L06 + ALIGN_4 + +.L10: + leaq (PREFETCH_R + 0) * SIZE(B), BB + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + movq M, I + sarq $3, I + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 40 * SIZE + BUFFER, BO +#else + leaq 40 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + + pxor %xmm8, %xmm8 + movaps -32 * SIZE(AO), %xmm0 + pxor %xmm9, %xmm9 + movaps -28 * SIZE(AO), %xmm1 + pxor %xmm10, %xmm10 + movaps -40 * SIZE(BO), %xmm6 + pxor %xmm11, %xmm11 + movaps -36 * SIZE(BO), %xmm7 + + prefetcht2 -32 * SIZE(BB) + + prefetcht0 7 * SIZE(CO1) + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 + prefetcht0 7 * SIZE(CO2) + pxor %xmm14, %xmm14 + pxor %xmm15, %xmm15 + + prefetcht0 7 * SIZE(CO1, LDC, 2) + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + prefetcht0 7 * SIZE(CO2, LDC, 2) + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + subq $-16 * SIZE, BB + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L15 + ALIGN_4 + +.L12: + addps %xmm2, %xmm10 + movaps -32 * SIZE(BO), %xmm2 + addps %xmm3, %xmm14 + PADDING; + movaps %xmm6, %xmm3 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + movaps -28 * SIZE(BO), %xmm4 + addps %xmm5, %xmm15 + movaps %xmm7, %xmm5 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm6, %xmm8 + movaps -24 * SIZE(BO), %xmm6 + addps %xmm3, %xmm12 + movaps %xmm2, %xmm3 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm7, %xmm9 + movaps -20 * SIZE(BO), %xmm7 + addps %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -20 * SIZE(AO), %xmm1 + + addps %xmm2, %xmm10 + movaps -16 * SIZE(BO), %xmm2 + addps %xmm3, %xmm14 + movaps %xmm6, %xmm3 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + movaps -12 * SIZE(BO), %xmm4 + addps %xmm5, %xmm15 + movaps %xmm7, %xmm5 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm6, %xmm8 + movaps -8 * SIZE(BO), %xmm6 + addps %xmm3, %xmm12 + movaps %xmm2, %xmm3 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm7, %xmm9 + movaps -4 * SIZE(BO), %xmm7 + addps %xmm5, %xmm13 + PADDING; + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -16 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -12 * SIZE(AO), %xmm1 + + addps %xmm2, %xmm10 + movaps 0 * SIZE(BO), 
%xmm2 + addps %xmm3, %xmm14 + PADDING; + movaps %xmm6, %xmm3 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + movaps 4 * SIZE(BO), %xmm4 + addps %xmm5, %xmm15 + movaps %xmm7, %xmm5 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm6, %xmm8 + movaps 8 * SIZE(BO), %xmm6 + addps %xmm3, %xmm12 + movaps %xmm2, %xmm3 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm7, %xmm9 + movaps 12 * SIZE(BO), %xmm7 + addps %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -8 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -4 * SIZE(AO), %xmm1 + + addps %xmm2, %xmm10 + movaps 16 * SIZE(BO), %xmm2 + addps %xmm3, %xmm14 + movaps %xmm6, %xmm3 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + movaps 20 * SIZE(BO), %xmm4 + addps %xmm5, %xmm15 + movaps %xmm7, %xmm5 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm6, %xmm8 + movaps 24 * SIZE(BO), %xmm6 + addps %xmm3, %xmm12 + subq $-32 * SIZE, AO + movaps %xmm2, %xmm3 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm7, %xmm9 + movaps 28 * SIZE(BO), %xmm7 + addps %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -32 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -28 * SIZE(AO), %xmm1 + + subq $-64 * SIZE, BO + subq $1, %rax + BRANCH + jg .L12 + ALIGN_4 + +.L15: + prefetcht2 -16 * SIZE(BB) + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_4 + +.L16: + addps %xmm2, %xmm10 + movaps -32 * SIZE(BO), %xmm2 + addps %xmm3, %xmm14 + movaps %xmm6, %xmm3 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + movaps -28 * SIZE(BO), %xmm4 + addps %xmm5, %xmm15 + movaps %xmm7, %xmm5 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm6, %xmm8 + movaps -24 * SIZE(BO), %xmm6 + addps %xmm3, %xmm12 + addq $8 * SIZE, AO + movaps %xmm2, %xmm3 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm7, %xmm9 + movaps -20 * SIZE(BO), %xmm7 + addps %xmm5, %xmm13 + addq $16 * SIZE, BO + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -32 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -28 * SIZE(AO), %xmm1 + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_4 + +.L18: + movaps ALPHA, %xmm7 + + addps %xmm2, %xmm10 + addps %xmm3, %xmm14 + addps %xmm4, %xmm11 + addps %xmm5, %xmm15 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm1 + movhps 6 * SIZE(CO1), %xmm1 + movsd 0 * SIZE(CO2), %xmm2 + movhps 2 * SIZE(CO2), %xmm2 + movsd 4 * SIZE(CO2), %xmm3 + movhps 6 * SIZE(CO2), %xmm3 +#endif + + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm9 + mulps %xmm7, %xmm10 + mulps %xmm7, %xmm11 + mulps %xmm7, %xmm12 + mulps %xmm7, %xmm13 + mulps %xmm7, %xmm14 + mulps %xmm7, %xmm15 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1, LDC, 2), %xmm4 + movhps 2 * SIZE(CO1, LDC, 2), %xmm4 + movsd 4 * SIZE(CO1, LDC, 2), %xmm5 + movhps 6 * SIZE(CO1, LDC, 2), %xmm5 + movsd 0 * SIZE(CO2, LDC, 2), %xmm6 + movhps 2 * SIZE(CO2, LDC, 2), %xmm6 + movsd 4 * SIZE(CO2, LDC, 2), %xmm7 + movhps 6 * SIZE(CO2, LDC, 2), %xmm7 + + addps %xmm0, %xmm8 + addps %xmm1, %xmm12 + addps %xmm2, %xmm9 + addps %xmm3, %xmm13 +#endif + + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movlps %xmm12, 4 * SIZE(CO1) + movhps %xmm12, 6 * SIZE(CO1) + movlps %xmm9, 0 * SIZE(CO2) + movhps %xmm9, 2 * SIZE(CO2) + movlps %xmm13, 4 * SIZE(CO2) + movhps %xmm13, 6 * SIZE(CO2) + +#ifndef TRMMKERNEL + addps %xmm4, %xmm10 + addps %xmm5, %xmm14 + addps %xmm6, %xmm11 + addps %xmm7, 
%xmm15 +#endif + + movlps %xmm10, 0 * SIZE(CO1, LDC, 2) + movhps %xmm10, 2 * SIZE(CO1, LDC, 2) + movlps %xmm14, 4 * SIZE(CO1, LDC, 2) + movhps %xmm14, 6 * SIZE(CO1, LDC, 2) + movlps %xmm11, 0 * SIZE(CO2, LDC, 2) + movhps %xmm11, 2 * SIZE(CO2, LDC, 2) + movlps %xmm15, 4 * SIZE(CO2, LDC, 2) + movhps %xmm15, 6 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 + addq $8 * SIZE, CO2 + subq $1, I + jg .L11 + ALIGN_4 + +.L20: + testq $4, M + jle .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L25 + ALIGN_4 + +.L21: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movaps -32 * SIZE(AO), %xmm0 + movaps -32 * SIZE(BO), %xmm2 + movaps -28 * SIZE(BO), %xmm3 + movaps -24 * SIZE(BO), %xmm4 + movaps -20 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + movaps -28 * SIZE(AO), %xmm0 + movaps -16 * SIZE(BO), %xmm2 + movaps -12 * SIZE(BO), %xmm3 + movaps -8 * SIZE(BO), %xmm4 + movaps -4 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + movaps -24 * SIZE(AO), %xmm0 + movaps 0 * SIZE(BO), %xmm2 + movaps 4 * SIZE(BO), %xmm3 + movaps 8 * SIZE(BO), %xmm4 + movaps 12 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + movaps -20 * SIZE(AO), %xmm0 + movaps 16 * SIZE(BO), %xmm2 + movaps 20 * SIZE(BO), %xmm3 + movaps 24 * SIZE(BO), %xmm4 + movaps 28 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + subq $-16 * SIZE, AO + subq $-64 * SIZE, BO + subq $1, %rax + jg .L21 + ALIGN_4 + +.L25: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L28 + ALIGN_4 + +.L26: + movaps -32 * SIZE(AO), %xmm0 + movaps -32 * SIZE(BO), %xmm2 + movaps -28 * SIZE(BO), %xmm3 + movaps -24 * SIZE(BO), %xmm4 + movaps -20 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + addq $ 4 * SIZE, AO + addq $16 * SIZE, BO + subq $1, %rax + jg .L26 + ALIGN_4 + +.L28: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + 
movhps 2 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm2 + movhps 2 * SIZE(CO2), %xmm2 + + movsd 0 * SIZE(CO1, LDC, 2), %xmm4 + movhps 2 * SIZE(CO1, LDC, 2), %xmm4 + movsd 0 * SIZE(CO2, LDC, 2), %xmm6 + movhps 2 * SIZE(CO2, LDC, 2), %xmm6 +#endif + + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm9 + mulps %xmm7, %xmm10 + mulps %xmm7, %xmm11 + +#ifndef TRMMKERNEL + addps %xmm0, %xmm8 + addps %xmm2, %xmm9 + addps %xmm4, %xmm10 + addps %xmm6, %xmm11 +#endif + + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movlps %xmm9, 0 * SIZE(CO2) + movhps %xmm9, 2 * SIZE(CO2) + + movlps %xmm10, 0 * SIZE(CO1, LDC, 2) + movhps %xmm10, 2 * SIZE(CO1, LDC, 2) + movlps %xmm11, 0 * SIZE(CO2, LDC, 2) + movhps %xmm11, 2 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + subq $1, I + ALIGN_4 + +.L30: + testq $2, M + jle .L40 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 + pxor %xmm14, %xmm14 + pxor %xmm15, %xmm15 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L35 + ALIGN_4 + +.L31: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movsd -32 * SIZE(AO), %xmm0 + movsd -32 * SIZE(BO), %xmm2 + movsd -28 * SIZE(BO), %xmm3 + movsd -24 * SIZE(BO), %xmm4 + movsd -20 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + movsd -30 * SIZE(AO), %xmm0 + movsd -16 * SIZE(BO), %xmm2 + movsd -12 * SIZE(BO), %xmm3 + movsd -8 * SIZE(BO), %xmm4 + movsd -4 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + movsd -28 * SIZE(AO), %xmm0 + movsd 0 * SIZE(BO), %xmm2 + movsd 4 * SIZE(BO), %xmm3 + movsd 8 * SIZE(BO), %xmm4 + movsd 12 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + movsd -26 * SIZE(AO), %xmm0 + movsd 16 * SIZE(BO), %xmm2 + movsd 20 * SIZE(BO), %xmm3 + movsd 24 * SIZE(BO), %xmm4 + movsd 28 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + subq $ -8 * SIZE, AO + subq $-64 * SIZE, BO + subq $1, %rax + jg .L31 + ALIGN_4 + +.L35: + movsd ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L38 + ALIGN_4 + +.L36: + movsd 
-32 * SIZE(AO), %xmm0 + movsd -32 * SIZE(BO), %xmm2 + movsd -28 * SIZE(BO), %xmm3 + movsd -24 * SIZE(BO), %xmm4 + movsd -20 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + addq $ 2 * SIZE, AO + addq $16 * SIZE, BO + subq $1, %rax + jg .L36 + ALIGN_4 + +.L38: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm2 + movsd 0 * SIZE(CO1, LDC, 2), %xmm4 + movsd 0 * SIZE(CO2, LDC, 2), %xmm6 +#endif + + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm9 + mulps %xmm7, %xmm10 + mulps %xmm7, %xmm11 + +#ifndef TRMMKERNEL + addps %xmm0, %xmm8 + addps %xmm2, %xmm9 + addps %xmm4, %xmm10 + addps %xmm6, %xmm11 +#endif + + movlps %xmm8, 0 * SIZE(CO1) + movlps %xmm9, 0 * SIZE(CO2) + movlps %xmm10, 0 * SIZE(CO1, LDC, 2) + movlps %xmm11, 0 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 + ALIGN_4 + +.L40: + testq $1, M + jle .L49 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO + leaq (BO, %rax, 8), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 + pxor %xmm14, %xmm14 + pxor %xmm15, %xmm15 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L45 + ALIGN_4 + +.L41: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movss -32 * SIZE(AO), %xmm0 + movss -32 * SIZE(BO), %xmm2 + movss -28 * SIZE(BO), %xmm3 + movss -24 * SIZE(BO), %xmm4 + movss -20 * SIZE(BO), %xmm5 + + mulss %xmm0, %xmm2 + mulss %xmm0, %xmm3 + mulss %xmm0, %xmm4 + mulss %xmm0, %xmm5 + + addss %xmm2, %xmm8 + addss %xmm3, %xmm9 + addss %xmm4, %xmm10 + addss %xmm5, %xmm11 + + movss -31 * SIZE(AO), %xmm0 + movss -16 * SIZE(BO), %xmm2 + movss -12 * SIZE(BO), %xmm3 + movss -8 * SIZE(BO), %xmm4 + movss -4 * SIZE(BO), %xmm5 + + mulss %xmm0, %xmm2 + mulss %xmm0, %xmm3 + mulss %xmm0, %xmm4 + mulss %xmm0, %xmm5 + + addss %xmm2, %xmm8 + addss %xmm3, %xmm9 + addss %xmm4, %xmm10 + addss %xmm5, %xmm11 + + movss -30 * SIZE(AO), %xmm0 + movss 0 * SIZE(BO), %xmm2 + movss 4 * SIZE(BO), %xmm3 + movss 8 * SIZE(BO), %xmm4 + movss 12 * SIZE(BO), %xmm5 + + mulss %xmm0, %xmm2 + mulss %xmm0, %xmm3 + mulss %xmm0, %xmm4 + mulss %xmm0, %xmm5 + + addss %xmm2, %xmm8 + addss %xmm3, %xmm9 + addss %xmm4, %xmm10 + addss %xmm5, %xmm11 + + movss -29 * SIZE(AO), %xmm0 + movss 16 * SIZE(BO), %xmm2 + movss 20 * SIZE(BO), %xmm3 + movss 24 * SIZE(BO), %xmm4 + movss 28 * SIZE(BO), %xmm5 + + mulss %xmm0, %xmm2 + mulss %xmm0, %xmm3 + mulss %xmm0, %xmm4 + mulss %xmm0, %xmm5 + + addss %xmm2, %xmm8 + addss %xmm3, %xmm9 + addss %xmm4, %xmm10 + addss %xmm5, %xmm11 + + subq $ -4 * SIZE, AO + subq $-64 * SIZE, BO + subq $1, 
%rax + jg .L41 + ALIGN_4 + +.L45: + movss ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L48 + ALIGN_4 + +.L46: + movss -32 * SIZE(AO), %xmm0 + movss -32 * SIZE(BO), %xmm2 + movss -28 * SIZE(BO), %xmm3 + movss -24 * SIZE(BO), %xmm4 + movss -20 * SIZE(BO), %xmm5 + + mulss %xmm0, %xmm2 + mulss %xmm0, %xmm3 + mulss %xmm0, %xmm4 + mulss %xmm0, %xmm5 + + addss %xmm2, %xmm8 + addss %xmm3, %xmm9 + addss %xmm4, %xmm10 + addss %xmm5, %xmm11 + + addq $ 1 * SIZE, AO + addq $16 * SIZE, BO + subq $1, %rax + jg .L46 + ALIGN_4 + +.L48: +#ifndef TRMMKERNEL + movss 0 * SIZE(CO1), %xmm0 + movss 0 * SIZE(CO2), %xmm2 + movss 0 * SIZE(CO1, LDC, 2), %xmm4 + movss 0 * SIZE(CO2, LDC, 2), %xmm6 +#endif + + mulss %xmm7, %xmm8 + mulss %xmm7, %xmm9 + mulss %xmm7, %xmm10 + mulss %xmm7, %xmm11 + +#ifndef TRMMKERNEL + addss %xmm0, %xmm8 + addss %xmm2, %xmm9 + addss %xmm4, %xmm10 + addss %xmm6, %xmm11 +#endif + + movss %xmm8, 0 * SIZE(CO1) + movss %xmm9, 0 * SIZE(CO2) + movss %xmm10, 0 * SIZE(CO1, LDC, 2) + movss %xmm11, 0 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L49: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + + leaq (C, LDC, 4), C + subq $1, J + jg .L01 + ALIGN_4 + +.L50: + testq $2, N + jle .L100 + ALIGN_4 + +.L51: +/* Copying to Sub Buffer */ + leaq BUFFER, BO + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq K, %rax + sarq $3, %rax + jle .L53 + + addq %rax, %rax + ALIGN_4 + +.L52: + movaps -32 * SIZE(B), %xmm3 + movaps -28 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + prefetcht0 (PREFETCH_W + 0) * SIZE(BO) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + prefetcht0 (PREFETCH_W + 16) * SIZE(BO) + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO + + subq $1, %rax + jne .L52 + ALIGN_4 + +.L53: + movq K, %rax + andq $7, %rax + BRANCH + jle .L55 + ALIGN_4 + +.L54: + movss -32 * SIZE(B), %xmm8 + movss -31 * SIZE(B), %xmm9 + + shufps $0, %xmm8, %xmm8 + shufps $0, %xmm9, %xmm9 + + movaps %xmm8, 0 * SIZE(BO) + movaps %xmm9, 4 * SIZE(BO) + + addq $2 * SIZE, B + addq $8 * SIZE, BO + subq $1, %rax + jne .L54 + ALIGN_4 + +.L55: + movq C, CO1 + leaq (C, LDC, 1), CO2 + movq A, AO # aoffset = a + + movq M, I + sarq $3, I + jle .L70 + ALIGN_4 + +.L60: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + + prefetcht0 7 * SIZE(CO1) + pxor %xmm12, %xmm12 + prefetcht0 7 * SIZE(CO2) + pxor %xmm13, %xmm13 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && 
!defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L65 + ALIGN_4 + +.L61: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm1 + + movaps -32 * SIZE(BO), %xmm2 + movaps %xmm2, %xmm3 + movaps -28 * SIZE(BO), %xmm4 + movaps %xmm4, %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm12 + addps %xmm4, %xmm9 + addps %xmm5, %xmm13 + + movaps -24 * SIZE(AO), %xmm0 + movaps -20 * SIZE(AO), %xmm1 + + movaps -24 * SIZE(BO), %xmm2 + movaps %xmm2, %xmm3 + movaps -20 * SIZE(BO), %xmm4 + movaps %xmm4, %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm12 + addps %xmm4, %xmm9 + addps %xmm5, %xmm13 + + movaps -16 * SIZE(AO), %xmm0 + movaps -12 * SIZE(AO), %xmm1 + + movaps -16 * SIZE(BO), %xmm2 + movaps %xmm2, %xmm3 + movaps -12 * SIZE(BO), %xmm4 + movaps %xmm4, %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm12 + addps %xmm4, %xmm9 + addps %xmm5, %xmm13 + + movaps -8 * SIZE(AO), %xmm0 + movaps -4 * SIZE(AO), %xmm1 + + movaps -8 * SIZE(BO), %xmm2 + movaps %xmm2, %xmm3 + movaps -4 * SIZE(BO), %xmm4 + movaps %xmm4, %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm12 + addps %xmm4, %xmm9 + addps %xmm5, %xmm13 + + subq $-32 * SIZE, AO + subq $-32 * SIZE, BO + subq $1, %rax + jg .L61 + ALIGN_4 + +.L65: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L68 + ALIGN_4 + +.L66: + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm1 + + movaps -32 * SIZE(BO), %xmm2 + movaps %xmm2, %xmm3 + movaps -28 * SIZE(BO), %xmm4 + movaps %xmm4, %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm12 + addps %xmm4, %xmm9 + addps %xmm5, %xmm13 + + addq $8 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + jg .L66 + ALIGN_4 + +.L68: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm1 + movhps 6 * SIZE(CO1), %xmm1 + movsd 0 * SIZE(CO2), %xmm2 + movhps 2 * SIZE(CO2), %xmm2 + movsd 4 * SIZE(CO2), %xmm3 + movhps 6 * SIZE(CO2), %xmm3 +#endif + + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm9 + mulps %xmm7, %xmm12 + mulps %xmm7, %xmm13 + +#ifndef TRMMKERNEL + addps %xmm0, %xmm8 + addps %xmm1, %xmm12 + addps %xmm2, %xmm9 + addps %xmm3, %xmm13 +#endif + + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movlps %xmm12, 4 * SIZE(CO1) + movhps %xmm12, 6 * SIZE(CO1) + movlps %xmm9, 0 * SIZE(CO2) + movhps %xmm9, 2 * SIZE(CO2) + movlps %xmm13, 4 * SIZE(CO2) + movhps %xmm13, 6 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 + addq $8 * SIZE, CO2 + subq $1, I + jg .L60 + ALIGN_4 + +.L70: + testq $4, M + jle .L80 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && 
defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L75 + ALIGN_4 + +.L71: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm1 + movaps -32 * SIZE(BO), %xmm2 + movaps -28 * SIZE(BO), %xmm3 + movaps -24 * SIZE(BO), %xmm4 + movaps -20 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm4 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + movaps -24 * SIZE(AO), %xmm0 + movaps -20 * SIZE(AO), %xmm1 + movaps -16 * SIZE(BO), %xmm2 + movaps -12 * SIZE(BO), %xmm3 + movaps -8 * SIZE(BO), %xmm4 + movaps -4 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm4 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + subq $-16 * SIZE, AO + subq $-32 * SIZE, BO + subq $1, %rax + jg .L71 + ALIGN_4 + +.L75: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L78 + ALIGN_4 + +.L76: + movaps -32 * SIZE(AO), %xmm0 + movaps -32 * SIZE(BO), %xmm2 + movaps -28 * SIZE(BO), %xmm3 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + + addq $4 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + jg .L76 + ALIGN_4 + +.L78: + addps %xmm10, %xmm8 + addps %xmm11, %xmm9 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm2 + movhps 2 * SIZE(CO2), %xmm2 +#endif + + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm9 +#ifndef TRMMKERNEL + addps %xmm0, %xmm8 + addps %xmm2, %xmm9 +#endif + + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movlps %xmm9, 0 * SIZE(CO2) + movhps %xmm9, 2 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + ALIGN_4 + +.L80: + testq $2, M + jle .L90 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L85 + ALIGN_4 + +.L81: + PREFETCH (PREFETCHSIZE + 0) * 
SIZE(AO) + + movsd -32 * SIZE(AO), %xmm0 + movsd -30 * SIZE(AO), %xmm1 + movsd -32 * SIZE(BO), %xmm2 + movsd -28 * SIZE(BO), %xmm3 + movsd -24 * SIZE(BO), %xmm4 + movsd -20 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm4 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + movsd -28 * SIZE(AO), %xmm0 + movsd -26 * SIZE(AO), %xmm1 + movsd -16 * SIZE(BO), %xmm2 + movsd -12 * SIZE(BO), %xmm3 + movsd -8 * SIZE(BO), %xmm4 + movsd -4 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm4 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + subq $ -8 * SIZE, AO + subq $-32 * SIZE, BO + subq $1, %rax + jg .L81 + ALIGN_4 + +.L85: + movsd ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L88 + ALIGN_4 + +.L86: + movsd -32 * SIZE(AO), %xmm0 + movsd -32 * SIZE(BO), %xmm2 + movsd -28 * SIZE(BO), %xmm3 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + jg .L86 + ALIGN_4 + +.L88: + addps %xmm10, %xmm8 + addps %xmm11, %xmm9 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm2 +#endif + + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm9 + +#ifndef TRMMKERNEL + addps %xmm0, %xmm8 + addps %xmm2, %xmm9 +#endif + + movlps %xmm8, 0 * SIZE(CO1) + movlps %xmm9, 0 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 + ALIGN_4 + +.L90: + testq $1, M + jle .L99 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L95 + ALIGN_4 + +.L91: + + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movss -32 * SIZE(AO), %xmm0 + movss -31 * SIZE(AO), %xmm1 + movss -32 * SIZE(BO), %xmm2 + movss -28 * SIZE(BO), %xmm3 + movss -24 * SIZE(BO), %xmm4 + movss -20 * SIZE(BO), %xmm5 + + mulss %xmm0, %xmm2 + mulss %xmm0, %xmm3 + mulss %xmm1, %xmm4 + mulss %xmm1, %xmm5 + + addss %xmm2, %xmm8 + addss %xmm3, %xmm9 + addss %xmm4, %xmm10 + addss %xmm5, %xmm11 + + movss -30 * SIZE(AO), %xmm0 + movss -29 * SIZE(AO), %xmm1 + movss -16 * SIZE(BO), %xmm2 + movss -12 * SIZE(BO), %xmm3 + movss -8 * SIZE(BO), %xmm4 + movss -4 * SIZE(BO), %xmm5 + + mulss %xmm0, %xmm2 + mulss %xmm0, %xmm3 + mulss %xmm1, %xmm4 + mulss %xmm1, %xmm5 + + addss %xmm2, %xmm8 + addss %xmm3, %xmm9 + addss %xmm4, %xmm10 + addss %xmm5, %xmm11 + + subq $ -4 * SIZE, AO + subq $-32 * SIZE, BO + subq $1, %rax + jg .L91 + ALIGN_4 + +.L95: + movss ALPHA, %xmm7 + +#ifndef 
TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L98 + ALIGN_4 + +.L96: + movss -32 * SIZE(AO), %xmm0 + movss -32 * SIZE(BO), %xmm2 + movss -28 * SIZE(BO), %xmm3 + + mulss %xmm0, %xmm2 + mulss %xmm0, %xmm3 + + addss %xmm2, %xmm8 + addss %xmm3, %xmm9 + + addq $1 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + jg .L96 + ALIGN_4 + +.L98: + addss %xmm10, %xmm8 + addss %xmm11, %xmm9 + +#ifndef TRMMKERNEL + movss 0 * SIZE(CO1), %xmm0 + movss 0 * SIZE(CO2), %xmm2 +#endif + + mulss %xmm7, %xmm8 + mulss %xmm7, %xmm9 + +#ifndef TRMMKERNEL + addss %xmm0, %xmm8 + addss %xmm2, %xmm9 +#endif + + movss %xmm8, 0 * SIZE(CO1) + movss %xmm9, 0 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leaq (C, LDC, 2), C + ALIGN_4 + + + +.L100: + testq $1, N + jle .L999 + ALIGN_4 + +.L101: +/* Copying to Sub Buffer */ + leaq BUFFER, BO + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq K, %rax + sarq $4, %rax + jle .L103 + + addq %rax, %rax + ALIGN_4 + +.L102: + movss -32 * SIZE(B), %xmm0 + movss -31 * SIZE(B), %xmm1 + movss -30 * SIZE(B), %xmm2 + movss -29 * SIZE(B), %xmm3 + movss -28 * SIZE(B), %xmm4 + movss -27 * SIZE(B), %xmm5 + movss -26 * SIZE(B), %xmm6 + movss -25 * SIZE(B), %xmm7 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + shufps $0, %xmm4, %xmm4 + shufps $0, %xmm5, %xmm5 + shufps $0, %xmm6, %xmm6 + shufps $0, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + addq $ 8 * SIZE, B + subq $-32 * SIZE, BO + subq $1, %rax + jne .L102 + ALIGN_4 + +.L103: + movq K, %rax + andq $15, %rax + BRANCH + jle .L105 + ALIGN_4 + +.L104: + movss -32 * SIZE(B), %xmm8 + + shufps $0, %xmm8, %xmm8 + + movaps %xmm8, 0 * SIZE(BO) + + addq $1 * SIZE, B + addq $4 * SIZE, BO + subq $1, %rax + jne .L104 + ALIGN_4 + +.L105: + movq C, CO1 + movq A, AO + + movq M, I + sarq $3, I + jle .L120 + ALIGN_4 + +.L110: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + + prefetcht0 7 * SIZE(CO1) + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L115 + ALIGN_4 + +.L111: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm1 + movaps -32 * SIZE(BO), %xmm2 + movaps %xmm2, %xmm3 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + addps %xmm2, %xmm8 + addps %xmm3, %xmm12 + + movaps -24 * SIZE(AO), 
%xmm0 + movaps -20 * SIZE(AO), %xmm1 + movaps -28 * SIZE(BO), %xmm2 + movaps %xmm2, %xmm3 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + addps %xmm2, %xmm9 + addps %xmm3, %xmm13 + + movaps -16 * SIZE(AO), %xmm0 + movaps -12 * SIZE(AO), %xmm1 + movaps -24 * SIZE(BO), %xmm2 + movaps %xmm2, %xmm3 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + addps %xmm2, %xmm8 + addps %xmm3, %xmm12 + + movaps -8 * SIZE(AO), %xmm0 + movaps -4 * SIZE(AO), %xmm1 + movaps -20 * SIZE(BO), %xmm2 + movaps %xmm2, %xmm3 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + addps %xmm2, %xmm9 + addps %xmm3, %xmm13 + + subq $-32 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jg .L111 + ALIGN_4 + +.L115: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L118 + ALIGN_4 + +.L116: + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm1 + + movaps -32 * SIZE(BO), %xmm2 + movaps %xmm2, %xmm3 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + addps %xmm2, %xmm8 + addps %xmm3, %xmm12 + + addq $8 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L116 + ALIGN_4 + +.L118: + addps %xmm9, %xmm8 + addps %xmm13, %xmm12 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm1 + movhps 6 * SIZE(CO1), %xmm1 +#endif + + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm12 + +#ifndef TRMMKERNEL + addps %xmm0, %xmm8 + addps %xmm1, %xmm12 +#endif + + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movlps %xmm12, 4 * SIZE(CO1) + movhps %xmm12, 6 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 + subq $1, I + jg .L110 + ALIGN_4 + +.L120: + testq $4, M + jle .L130 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L125 + ALIGN_4 + +.L121: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm1 + movaps -32 * SIZE(BO), %xmm2 + movaps -28 * SIZE(BO), %xmm3 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + + movaps -24 * SIZE(AO), %xmm0 + movaps -20 * SIZE(AO), %xmm1 + movaps -24 * SIZE(BO), %xmm2 + movaps -20 * SIZE(BO), %xmm3 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm2, %xmm10 + addps %xmm3, %xmm11 + + subq $-16 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jg .L121 + ALIGN_4 + +.L125: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L128 + ALIGN_4 + +.L126: + movaps -32 * SIZE(AO), %xmm0 + movaps -32 * SIZE(BO), %xmm2 + + mulps %xmm0, %xmm2 + addps %xmm2, %xmm8 + + addq $4 * SIZE, AO + addq 
$4 * SIZE, BO + subq $1, %rax + jg .L126 + ALIGN_4 + +.L128: + addps %xmm10, %xmm8 + addps %xmm11, %xmm9 + + addps %xmm9, %xmm8 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 +#endif + + mulps %xmm7, %xmm8 +#ifndef TRMMKERNEL + addps %xmm0, %xmm8 +#endif + + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + ALIGN_4 + +.L130: + testq $2, M + jle .L140 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L135 + ALIGN_4 + +.L131: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movsd -32 * SIZE(AO), %xmm0 + movsd -30 * SIZE(AO), %xmm1 + movsd -32 * SIZE(BO), %xmm2 + movsd -28 * SIZE(BO), %xmm3 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + + movsd -28 * SIZE(AO), %xmm0 + movsd -26 * SIZE(AO), %xmm1 + movsd -24 * SIZE(BO), %xmm2 + movsd -20 * SIZE(BO), %xmm3 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + addps %xmm2, %xmm10 + addps %xmm3, %xmm11 + + subq $ -8 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jg .L131 + ALIGN_4 + +.L135: + movsd ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L138 + ALIGN_4 + +.L136: + movsd -32 * SIZE(AO), %xmm0 + movsd -32 * SIZE(BO), %xmm2 + + mulps %xmm0, %xmm2 + addps %xmm2, %xmm8 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L136 + ALIGN_4 + +.L138: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 +#endif + + addps %xmm10, %xmm8 + addps %xmm11, %xmm9 + + addps %xmm9, %xmm8 + + mulps %xmm7, %xmm8 + +#ifndef TRMMKERNEL + addps %xmm0, %xmm8 +#endif + movlps %xmm8, 0 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + addq $2 * SIZE, CO1 + ALIGN_4 + +.L140: + testq $1, M + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + 
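+/* TRMM path (sketch of intent, inferred from the surrounding code): the effective depth for this block is K - KK, cached in KKK and used as the trip count below */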
subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L145 + ALIGN_4 + +.L141: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movss -32 * SIZE(AO), %xmm0 + movss -31 * SIZE(AO), %xmm1 + movss -32 * SIZE(BO), %xmm2 + movss -28 * SIZE(BO), %xmm3 + + mulss %xmm0, %xmm2 + mulss %xmm1, %xmm3 + addss %xmm2, %xmm8 + addss %xmm3, %xmm9 + + movss -30 * SIZE(AO), %xmm0 + movss -29 * SIZE(AO), %xmm1 + movss -24 * SIZE(BO), %xmm2 + movss -20 * SIZE(BO), %xmm3 + + mulss %xmm0, %xmm2 + mulss %xmm1, %xmm3 + addss %xmm2, %xmm10 + addss %xmm3, %xmm11 + + subq $ -4 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jg .L141 + ALIGN_4 + +.L145: + movss ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L148 + ALIGN_4 + +.L146: + movss -32 * SIZE(AO), %xmm0 + movss -32 * SIZE(BO), %xmm2 + + mulss %xmm0, %xmm2 + addss %xmm2, %xmm8 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L146 + ALIGN_4 + +.L148: +#ifndef TRMMKERNEL + movss 0 * SIZE(CO1), %xmm0 +#endif + + addss %xmm10, %xmm8 + addss %xmm11, %xmm9 + addss %xmm9, %xmm8 + mulss %xmm7, %xmm8 + +#ifndef TRMMKERNEL + addss %xmm0, %xmm8 +#endif + + movss %xmm8, 0 * SIZE(CO1) + ALIGN_4 + +.L999: + movq %r15, %rsp + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/gemm_kernel_8x4_penryn.S b/kernel/x86_64/gemm_kernel_8x4_penryn.S new file mode 100644 index 0000000000..68ca5fc08a --- /dev/null +++ b/kernel/x86_64/gemm_kernel_8x4_penryn.S @@ -0,0 +1,2515 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define BB %r12 + +#define PREA %rdx + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define ALPHA 48(%rsp) +#define J 56(%rsp) +#define OFFSET 64(%rsp) +#define KK 72(%rsp) +#define KKK 80(%rsp) + +#else + +#define STACKSIZE 512 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#define ALPHA 224(%rsp) +#define J 232(%rsp) +#define OFFSET 240(%rsp) +#define KK 248(%rsp) +#define KKK 256(%rsp) + +#endif + +#define PREFETCHSIZE (8 * 17 + 4) +#define PREFETCH prefetcht0 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movq OLD_OFFSET, %r11 +#endif + movaps %xmm3, %xmm0 + +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movq OLD_OFFSET, %r11 +#endif + +#endif + + unpcklps %xmm0, %xmm0 + movlps %xmm0, ALPHA + + subq $-32 * SIZE, A + subq $-32 * SIZE, B + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + salq $BASE_SHIFT, LDC + +#ifdef TRMMKERNEL + movq %r11, OFFSET +#ifndef LEFT + negq %r11 +#endif + movq %r11, KK +#endif + + movq N, J + sarq $2, J + NOBRANCH + jle .L50 + ALIGN_4 + +.L10: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 + movq A, AO + + movq K, %rax + salq $BASE_SHIFT + 2, %rax + leaq (B, %rax), BB + + movq M, I + sarq $3, I + NOBRANCH + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, 
%rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 8), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + xorpd %xmm3, %xmm3 + movaps -28 * SIZE(AO), %xmm1 + xorpd %xmm4, %xmm4 + movaps -32 * SIZE(BO), %xmm2 + + xorpd %xmm5, %xmm5 + prefetcht0 -32 * SIZE(BB) + xorpd %xmm6, %xmm6 + + prefetcht2 7 * SIZE(CO1) + movapd %xmm4, %xmm8 + movapd %xmm4, %xmm9 + prefetcht2 7 * SIZE(CO2) + movapd %xmm4, %xmm10 + movapd %xmm4, %xmm11 + + prefetcht2 7 * SIZE(CO1, LDC, 2) + movapd %xmm4, %xmm12 + movaps %xmm4, %xmm13 + prefetcht2 7 * SIZE(CO2, LDC, 2) + movaps %xmm4, %xmm14 + movaps %xmm4, %xmm15 + + subq $-24 * SIZE, BB + + leaq (PREFETCHSIZE + 0) * SIZE(AO), PREA + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + PREFETCH -32 * SIZE(PREA) + addps %xmm6, %xmm10 + addps %xmm3, %xmm14 + movaps %xmm2, %xmm3 + pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + addps %xmm5, %xmm15 + movaps %xmm7, %xmm5 + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + movaps -28 * SIZE(BO), %xmm2 + addps %xmm3, %xmm12 + movaps %xmm6, %xmm3 + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm7, %xmm9 + addps %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -20 * SIZE(AO), %xmm1 + + addps %xmm6, %xmm10 + addps %xmm3, %xmm14 + movaps %xmm2, %xmm3 + pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + addps %xmm5, %xmm15 + movaps %xmm7, %xmm5 + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + movaps -24 * SIZE(BO), %xmm2 + addps %xmm3, %xmm12 + movaps %xmm6, %xmm3 + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm7, %xmm9 + addps %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -16 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -12 * SIZE(AO), %xmm1 + + addps %xmm6, %xmm10 + addps %xmm3, %xmm14 + movaps %xmm2, %xmm3 + pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + addps %xmm5, %xmm15 + PREFETCH -16 * SIZE(PREA) + movaps %xmm7, %xmm5 + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + movaps -20 * SIZE(BO), %xmm2 + addps %xmm3, %xmm12 + movaps %xmm6, %xmm3 + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm7, %xmm9 + addps %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -8 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -4 * SIZE(AO), %xmm1 + + addps %xmm6, %xmm10 + addps %xmm3, %xmm14 + movaps %xmm2, %xmm3 + pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + addps %xmm5, %xmm15 + movaps %xmm7, %xmm5 + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + movaps -16 * SIZE(BO), %xmm2 + addps %xmm3, %xmm12 + movaps %xmm6, %xmm3 + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm7, %xmm9 + addps %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps 0 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps 4 * SIZE(AO), %xmm1 + + addps %xmm6, 
%xmm10 + addps %xmm3, %xmm14 + PREFETCH 0 * SIZE(PREA) + movaps %xmm2, %xmm3 + pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + addps %xmm5, %xmm15 + movaps %xmm7, %xmm5 + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + movaps -12 * SIZE(BO), %xmm2 + addps %xmm3, %xmm12 + movaps %xmm6, %xmm3 + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm7, %xmm9 + addps %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps 8 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps 12 * SIZE(AO), %xmm1 + + addps %xmm6, %xmm10 + addps %xmm3, %xmm14 + movaps %xmm2, %xmm3 + pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + addps %xmm5, %xmm15 + movaps %xmm7, %xmm5 + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + movaps -8 * SIZE(BO), %xmm2 + addps %xmm3, %xmm12 + movaps %xmm6, %xmm3 + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm7, %xmm9 + addps %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps 16 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps 20 * SIZE(AO), %xmm1 + + addps %xmm6, %xmm10 + addps %xmm3, %xmm14 + PREFETCH 16 * SIZE(PREA) + movaps %xmm2, %xmm3 + pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + addps %xmm5, %xmm15 + movaps %xmm7, %xmm5 + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + movaps -4 * SIZE(BO), %xmm2 + addps %xmm3, %xmm12 + movaps %xmm6, %xmm3 + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm7, %xmm9 + addps %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps 24 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps 28 * SIZE(AO), %xmm1 + + addps %xmm6, %xmm10 + addps %xmm3, %xmm14 + movaps %xmm2, %xmm3 + pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + addps %xmm5, %xmm15 + movaps %xmm7, %xmm5 + subq $-64 * SIZE, AO + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + movaps 0 * SIZE(BO), %xmm2 + addps %xmm3, %xmm12 + movaps %xmm6, %xmm3 + subq $-32 * SIZE, BO + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm7, %xmm9 + addps %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -32 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -28 * SIZE(AO), %xmm1 + + subq $-64 * SIZE, PREA + + subq $1, %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: + prefetcht0 -16 * SIZE(BB) + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + addps %xmm6, %xmm10 + addps %xmm3, %xmm14 + movaps %xmm2, %xmm3 + pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + addps %xmm5, %xmm15 + movaps %xmm7, %xmm5 + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + movaps -28 * SIZE(BO), %xmm2 + addps %xmm3, %xmm12 + movaps %xmm6, %xmm3 + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm7, %xmm9 + addps %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -20 * SIZE(AO), %xmm1 + + addq $8 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_3 + +.L18: + addps %xmm6, %xmm10 + addps %xmm3, %xmm14 + 
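+/* finish folding the software-pipelined partial products (xmm3-xmm6) into the accumulators before scaling by ALPHA */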
addps %xmm4, %xmm11 + addps %xmm5, %xmm15 + + movddup ALPHA, %xmm3 + + movaps %xmm9, %xmm4 + shufps $0xd8, %xmm8, %xmm9 + shufps $0xd8, %xmm11, %xmm8 + shufps $0xd8, %xmm10, %xmm11 + shufps $0xd8, %xmm4, %xmm10 + + movaps %xmm8, %xmm4 + shufps $0xd8, %xmm10, %xmm8 + shufps $0xd8, %xmm4, %xmm10 + movaps %xmm9, %xmm5 + shufps $0xd8, %xmm11, %xmm9 + shufps $0xd8, %xmm5, %xmm11 + + movaps %xmm13, %xmm4 + shufps $0xd8, %xmm12, %xmm13 + shufps $0xd8, %xmm15, %xmm12 + shufps $0xd8, %xmm14, %xmm15 + shufps $0xd8, %xmm4, %xmm14 + + movaps %xmm12, %xmm4 + shufps $0xd8, %xmm14, %xmm12 + shufps $0xd8, %xmm4, %xmm14 + movaps %xmm13, %xmm5 + shufps $0xd8, %xmm15, %xmm13 + shufps $0xd8, %xmm5, %xmm15 + + mulps %xmm3, %xmm8 + mulps %xmm3, %xmm9 + mulps %xmm3, %xmm10 + mulps %xmm3, %xmm11 + + mulps %xmm3, %xmm12 + mulps %xmm3, %xmm13 + mulps %xmm3, %xmm14 + mulps %xmm3, %xmm15 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm1 + movhps 6 * SIZE(CO1), %xmm1 + movsd 0 * SIZE(CO2), %xmm2 + movhps 2 * SIZE(CO2), %xmm2 + movsd 4 * SIZE(CO2), %xmm3 + movhps 6 * SIZE(CO2), %xmm3 + + movsd 0 * SIZE(CO1, LDC, 2), %xmm4 + movhps 2 * SIZE(CO1, LDC, 2), %xmm4 + movsd 4 * SIZE(CO1, LDC, 2), %xmm5 + movhps 6 * SIZE(CO1, LDC, 2), %xmm5 + movsd 0 * SIZE(CO2, LDC, 2), %xmm6 + movhps 2 * SIZE(CO2, LDC, 2), %xmm6 + movsd 4 * SIZE(CO2, LDC, 2), %xmm7 + movhps 6 * SIZE(CO2, LDC, 2), %xmm7 + + addps %xmm0, %xmm8 + addps %xmm1, %xmm12 + addps %xmm2, %xmm9 + addps %xmm3, %xmm13 + addps %xmm4, %xmm10 + addps %xmm5, %xmm14 + addps %xmm6, %xmm11 + addps %xmm7, %xmm15 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movsd %xmm12, 4 * SIZE(CO1) + movhps %xmm12, 6 * SIZE(CO1) + movsd %xmm9, 0 * SIZE(CO2) + movhps %xmm9, 2 * SIZE(CO2) + movsd %xmm13, 4 * SIZE(CO2) + movhps %xmm13, 6 * SIZE(CO2) + + movsd %xmm10, 0 * SIZE(CO1, LDC, 2) + movhps %xmm10, 2 * SIZE(CO1, LDC, 2) + movsd %xmm14, 4 * SIZE(CO1, LDC, 2) + movhps %xmm14, 6 * SIZE(CO1, LDC, 2) + movsd %xmm11, 0 * SIZE(CO2, LDC, 2) + movhps %xmm11, 2 * SIZE(CO2, LDC, 2) + movsd %xmm15, 4 * SIZE(CO2, LDC, 2) + movhps %xmm15, 6 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 8), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 + addq $8 * SIZE, CO2 + decq I + BRANCH + jg .L11 + ALIGN_4 + +.L20: + testq $4, M + BRANCH + jle .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + movaps -32 * SIZE(BO), %xmm2 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + + movaps %xmm4, %xmm8 + movaps %xmm4, %xmm9 + movaps %xmm4, %xmm10 + movaps %xmm4, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_3 + +.L22: + addps %xmm6, %xmm10 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + 
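+/* 4x4 micro-kernel: pshufd $0x39 rotates the packed B quad so the same A vector meets all four B columns in turn */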
pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + addps %xmm4, %xmm11 + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + + addps %xmm2, %xmm8 + movaps -28 * SIZE(BO), %xmm2 + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + addps %xmm7, %xmm9 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + addps %xmm6, %xmm10 + pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + addps %xmm4, %xmm11 + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + + addps %xmm2, %xmm8 + movaps -24 * SIZE(BO), %xmm2 + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + addps %xmm7, %xmm9 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + + addps %xmm6, %xmm10 + pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + addps %xmm4, %xmm11 + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + + addps %xmm2, %xmm8 + movaps -20 * SIZE(BO), %xmm2 + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + addps %xmm7, %xmm9 + mulps %xmm0, %xmm4 + movaps -20 * SIZE(AO), %xmm0 + + addps %xmm6, %xmm10 + pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + addps %xmm4, %xmm11 + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + + addps %xmm2, %xmm8 + movaps -16 * SIZE(BO), %xmm2 + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + subq $-16 * SIZE, AO + addps %xmm7, %xmm9 + mulps %xmm0, %xmm4 + movaps -32 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L22 + ALIGN_3 + +.L25: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + BRANCH + je .L28 + ALIGN_3 + +.L26: + addps %xmm6, %xmm10 + pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + addps %xmm4, %xmm11 + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + + addps %xmm2, %xmm8 + movaps -28 * SIZE(BO), %xmm2 + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + addps %xmm7, %xmm9 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_3 + +.L28: + addps %xmm6, %xmm10 + addps %xmm4, %xmm11 + + movddup ALPHA, %xmm3 + + movaps %xmm9, %xmm4 + shufps $0xd8, %xmm8, %xmm9 + shufps $0xd8, %xmm11, %xmm8 + shufps $0xd8, %xmm10, %xmm11 + shufps $0xd8, %xmm4, %xmm10 + + movaps %xmm8, %xmm4 + shufps $0xd8, %xmm10, %xmm8 + shufps $0xd8, %xmm4, %xmm10 + movaps %xmm9, %xmm5 + shufps $0xd8, %xmm11, %xmm9 + shufps $0xd8, %xmm5, %xmm11 + + mulps %xmm3, %xmm8 + mulps %xmm3, %xmm9 + mulps %xmm3, %xmm10 + mulps %xmm3, %xmm11 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm2 + movhps 2 * SIZE(CO2), %xmm2 + + movsd 0 * SIZE(CO1, LDC, 2), %xmm4 + movhps 2 * SIZE(CO1, LDC, 2), %xmm4 + movsd 0 * SIZE(CO2, LDC, 2), %xmm6 + movhps 2 * SIZE(CO2, LDC, 2), %xmm6 + + addps %xmm0, %xmm8 + addps %xmm2, %xmm9 + addps %xmm4, %xmm10 + addps %xmm6, %xmm11 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movsd %xmm9, 0 * SIZE(CO2) + movhps %xmm9, 2 * SIZE(CO2) + + movsd %xmm10, 0 * SIZE(CO1, LDC, 2) + movhps %xmm10, 2 * SIZE(CO1, LDC, 2) + movsd %xmm11, 0 * SIZE(CO2, LDC, 2) + movhps %xmm11, 2 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + ALIGN_4 + +.L30: + testq $2, M + BRANCH + jle .L40 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + movaps -32 * SIZE(BO), %xmm2 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + + movaps %xmm4, %xmm8 + movaps %xmm4, %xmm9 + movaps %xmm4, %xmm10 + movaps %xmm4, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L35 + ALIGN_3 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x44, %xmm0, %xmm1 + addps %xmm3, %xmm8 + pshufd $0x50, %xmm2, %xmm3 + mulps %xmm1, %xmm3 + addps %xmm4, %xmm9 + pshufd $0xfa, %xmm2, %xmm4 + movaps -28 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm4 + + pshufd $0xee, %xmm0, %xmm1 + movaps -28 * SIZE(AO), %xmm0 + addps %xmm3, %xmm10 + pshufd $0x50, %xmm2, %xmm3 + mulps %xmm1, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm2, %xmm4 + movaps -24 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm4 + + pshufd $0x44, %xmm0, %xmm1 + addps %xmm3, %xmm8 + pshufd $0x50, %xmm2, %xmm3 + mulps %xmm1, %xmm3 + addps %xmm4, %xmm9 + pshufd $0xfa, %xmm2, %xmm4 + movaps -20 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm4 + + pshufd $0xee, %xmm0, %xmm1 + movaps -24 * SIZE(AO), %xmm0 + addps %xmm3, %xmm10 + pshufd $0x50, %xmm2, %xmm3 + mulps %xmm1, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm2, %xmm4 + movaps -16 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm4 + + subq $-8 * SIZE, AO + subq $-16 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L32 + ALIGN_3 + +.L35: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + BRANCH + je .L38 + ALIGN_3 + +.L36: + pshufd $0x44, %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm3, %xmm8 + pshufd $0x50, %xmm2, %xmm3 + mulps %xmm1, %xmm3 + addps %xmm4, %xmm9 + pshufd $0xfa, %xmm2, %xmm4 + movaps -28 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm4 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L36 + ALIGN_3 + +.L38: + movddup ALPHA, %xmm2 + + addps %xmm10, %xmm8 + addps %xmm11, %xmm9 + + addps %xmm3, %xmm8 + addps %xmm4, %xmm9 + + mulps %xmm2, %xmm8 + mulps %xmm2, %xmm9 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 0 * SIZE(CO2), %xmm0 + movsd 0 * SIZE(CO1, LDC, 2), %xmm1 + movhps 0 * SIZE(CO2, LDC, 2), %xmm1 + + addps %xmm0, %xmm8 + addps %xmm1, %xmm9 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 0 * SIZE(CO2) + movsd %xmm9, 0 * SIZE(CO1, LDC, 2) + movhps %xmm9, 0 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 + ALIGN_4 + +.L40: + testq $1, M + BRANCH + jle .L49 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + movsd -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movaps -32 * SIZE(BO), %xmm2 + 
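+/* M = 1 tail: each scalar A element is broadcast (pshufd $0x00) and multiplied against four packed B values per k step */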
xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L45 + ALIGN_3 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x00, %xmm0, %xmm1 + movss -31 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm8 + movaps -28 * SIZE(BO), %xmm2 + + pshufd $0x00, %xmm0, %xmm1 + movss -30 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm9 + movaps -24 * SIZE(BO), %xmm2 + + pshufd $0x00, %xmm0, %xmm1 + movss -29 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm8 + movaps -20 * SIZE(BO), %xmm2 + + pshufd $0x00, %xmm0, %xmm1 + movss -28 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm9 + movaps -16 * SIZE(BO), %xmm2 + + subq $ -4 * SIZE, AO + subq $-16 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L42 + ALIGN_3 + +.L45: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + BRANCH + je .L48 + ALIGN_3 + +.L46: + pshufd $0x00, %xmm0, %xmm1 + movss -31 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm8 + movaps -28 * SIZE(BO), %xmm2 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L46 + ALIGN_3 + +.L48: + movddup ALPHA, %xmm2 + addps %xmm9, %xmm8 + mulps %xmm2, %xmm8 + + pshufd $0xff, %xmm8, %xmm11 + pshufd $0xaa, %xmm8, %xmm10 + pshufd $0x55, %xmm8, %xmm9 + pshufd $0x00, %xmm8, %xmm8 + +#ifndef TRMMKERNEL + addss 0 * SIZE(CO1), %xmm8 + addss 0 * SIZE(CO2), %xmm9 + addss 0 * SIZE(CO1, LDC, 2), %xmm10 + addss 0 * SIZE(CO2, LDC, 2), %xmm11 +#endif + + movss %xmm8, 0 * SIZE(CO1) + movss %xmm9, 0 * SIZE(CO2) + movss %xmm10, 0 * SIZE(CO1, LDC, 2) + movss %xmm11, 0 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L49: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK +#endif + + movq BO, B + + leaq (C, LDC, 4), C + + subq $1, J + BRANCH + jg .L10 + ALIGN_4 + +.L50: + testq $2, N + jle .L90 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 + movq A, AO + + movq K, %rax + salq $BASE_SHIFT + 1, %rax + leaq (B, %rax), BB + + movq M, I + sarq $3, I + NOBRANCH + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 8), AO + leaq (BO, %rax, 2), BO +#endif + + prefetcht2 -32 * SIZE(BB) + subq $-8 * SIZE, BB + + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm3, %xmm3 + movaps -28 * SIZE(AO), %xmm1 + xorps %xmm4, %xmm4 + movaps -32 * SIZE(BO), %xmm2 + + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + + prefetcht0 7 * SIZE(CO1) + movaps %xmm4, %xmm8 + movaps %xmm4, %xmm9 + prefetcht0 7 * SIZE(CO2) + movaps %xmm4, %xmm10 + movaps %xmm4, %xmm11 + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, 
%rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L55 + ALIGN_3 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm3, %xmm8 + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm9 + pshufd $0x55, %xmm2, %xmm4 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + + addps %xmm5, %xmm10 + pshufd $0x00, %xmm2, %xmm5 + mulps %xmm1, %xmm5 + addps %xmm6, %xmm11 + pshufd $0x55, %xmm2, %xmm6 + mulps %xmm1, %xmm6 + movaps -20 * SIZE(AO), %xmm1 + + addps %xmm3, %xmm8 + pshufd $0xaa, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm9 + pshufd $0xff, %xmm2, %xmm4 + mulps %xmm0, %xmm4 + movaps -16 * SIZE(AO), %xmm0 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + + addps %xmm5, %xmm10 + pshufd $0xaa, %xmm2, %xmm5 + mulps %xmm1, %xmm5 + addps %xmm6, %xmm11 + pshufd $0xff, %xmm2, %xmm6 + movaps -28 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm6 + movaps -12 * SIZE(AO), %xmm1 + + addps %xmm3, %xmm8 + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm9 + pshufd $0x55, %xmm2, %xmm4 + mulps %xmm0, %xmm4 + movaps -8 * SIZE(AO), %xmm0 + + addps %xmm5, %xmm10 + pshufd $0x00, %xmm2, %xmm5 + mulps %xmm1, %xmm5 + addps %xmm6, %xmm11 + pshufd $0x55, %xmm2, %xmm6 + mulps %xmm1, %xmm6 + movaps -4 * SIZE(AO), %xmm1 + + addps %xmm3, %xmm8 + pshufd $0xaa, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm9 + pshufd $0xff, %xmm2, %xmm4 + mulps %xmm0, %xmm4 + movaps 0 * SIZE(AO), %xmm0 + + addps %xmm5, %xmm10 + pshufd $0xaa, %xmm2, %xmm5 + mulps %xmm1, %xmm5 + addps %xmm6, %xmm11 + pshufd $0xff, %xmm2, %xmm6 + movaps -24 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm6 + movaps 4 * SIZE(AO), %xmm1 + + subq $-32 * SIZE, AO + subq $ -8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L52 + ALIGN_3 + +.L55: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_3 + +.L56: + addps %xmm3, %xmm8 + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm9 + pshufd $0x55, %xmm2, %xmm4 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + + addps %xmm5, %xmm10 + pshufd $0x00, %xmm2, %xmm5 + mulps %xmm1, %xmm5 + addps %xmm6, %xmm11 + pshufd $0x55, %xmm2, %xmm6 + movsd -30 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm6 + movaps -20 * SIZE(AO), %xmm1 + + addq $8 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L56 + ALIGN_3 + +.L58: + movddup ALPHA, %xmm7 + + addps %xmm3, %xmm8 + addps %xmm4, %xmm9 + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm9 + mulps %xmm7, %xmm10 + mulps %xmm7, %xmm11 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm1 + movhps 6 * SIZE(CO1), %xmm1 + movsd 0 * SIZE(CO2), %xmm2 + movhps 2 * SIZE(CO2), %xmm2 + movsd 4 * SIZE(CO2), %xmm3 + movhps 6 * SIZE(CO2), %xmm3 + + addps %xmm0, %xmm8 + addps %xmm1, %xmm10 + addps %xmm2, %xmm9 + addps %xmm3, %xmm11 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movsd %xmm10, 4 * SIZE(CO1) + movhps %xmm10, 6 * SIZE(CO1) + movsd %xmm9, 0 * SIZE(CO2) + movhps %xmm9, 2 * SIZE(CO2) + movsd %xmm11, 4 * SIZE(CO2) + movhps %xmm11, 6 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 8), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 + addq $8 * SIZE, CO2 + decq I + 
BRANCH + jg .L51 + ALIGN_4 + +.L60: + testq $4, M + BRANCH + jle .L70 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + movaps -32 * SIZE(BO), %xmm2 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L65 + ALIGN_3 + +.L62: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm3, %xmm8 + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm9 + pshufd $0x55, %xmm2, %xmm4 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + addps %xmm3, %xmm10 + pshufd $0xaa, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xff, %xmm2, %xmm4 + movaps -28 * SIZE(BO), %xmm2 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + + addps %xmm3, %xmm8 + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm9 + pshufd $0x55, %xmm2, %xmm4 + mulps %xmm0, %xmm4 + movaps -20 * SIZE(AO), %xmm0 + + addps %xmm3, %xmm10 + pshufd $0xaa, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xff, %xmm2, %xmm4 + movaps -24 * SIZE(BO), %xmm2 + mulps %xmm0, %xmm4 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, AO + subq $ -8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L62 + ALIGN_3 + +.L65: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + BRANCH + je .L68 + ALIGN_3 + +.L66: + addps %xmm3, %xmm8 + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm9 + pshufd $0x55, %xmm2, %xmm4 + movsd -30 * SIZE(BO), %xmm2 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L66 + ALIGN_3 + +.L68: + movddup ALPHA, %xmm7 + + addps %xmm10, %xmm8 + addps %xmm11, %xmm9 + + addps %xmm3, %xmm8 + addps %xmm4, %xmm9 + + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm9 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm2 + movhps 2 * SIZE(CO2), %xmm2 + + addps %xmm0, %xmm8 + addps %xmm2, %xmm9 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movsd %xmm9, 0 * SIZE(CO2) + movhps %xmm9, 2 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + ALIGN_4 + +.L70: + testq $2, M + BRANCH + jle .L80 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm3, %xmm3 + movaps -32 * SIZE(BO), %xmm2 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#ifndef 
TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L75 + ALIGN_3 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm3, %xmm8 + pshufd $0x44, %xmm0, %xmm1 + pshufd $0x50, %xmm2, %xmm3 + mulps %xmm1, %xmm3 + + addps %xmm3, %xmm9 + pshufd $0xee, %xmm0, %xmm1 + movaps -28 * SIZE(AO), %xmm0 + pshufd $0xfa, %xmm2, %xmm3 + movaps -28 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm3, %xmm8 + pshufd $0x44, %xmm0, %xmm1 + pshufd $0x50, %xmm2, %xmm3 + mulps %xmm1, %xmm3 + + addps %xmm3, %xmm9 + pshufd $0xee, %xmm0, %xmm1 + movaps -24 * SIZE(AO), %xmm0 + pshufd $0xfa, %xmm2, %xmm3 + movaps -24 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm3 + + subq $-8 * SIZE, AO + subq $-8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L72 + ALIGN_3 + +.L75: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + BRANCH + je .L78 + ALIGN_3 + +.L76: + addps %xmm3, %xmm8 + pshufd $0x44, %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + pshufd $0x50, %xmm2, %xmm3 + movsd -30 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm3 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L76 + ALIGN_3 + +.L78: + movddup ALPHA, %xmm2 + + addps %xmm9, %xmm8 + addps %xmm3, %xmm8 + + mulps %xmm2, %xmm8 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 0 * SIZE(CO2), %xmm0 + + addps %xmm0, %xmm8 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 0 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 + ALIGN_4 + +.L80: + testq $1, M + BRANCH + jle .L89 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + + movsd -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movsd -32 * SIZE(BO), %xmm2 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L85 + ALIGN_3 + +.L82: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x00, %xmm0, %xmm1 + movss -31 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm8 + movsd -30 * SIZE(BO), %xmm2 + + pshufd $0x00, %xmm0, %xmm1 + movss -30 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm9 + movsd -28 * SIZE(BO), %xmm2 + + pshufd $0x00, %xmm0, %xmm1 + movss -29 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm8 + movsd -26 * SIZE(BO), %xmm2 + + pshufd $0x00, %xmm0, %xmm1 + movss -28 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm9 + movsd -24 * SIZE(BO), %xmm2 + + subq $-4 * SIZE, AO + subq $-8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L82 + ALIGN_3 + +.L85: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + 
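+/* K mod 4 leftover iterations are handled one element at a time in .L86 */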
BRANCH + je .L88 + ALIGN_3 + +.L86: + pshufd $0x00, %xmm0, %xmm1 + movss -31 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm8 + movsd -30 * SIZE(BO), %xmm2 + + addq $1 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L86 + ALIGN_3 + +.L88: + movddup ALPHA, %xmm2 + addps %xmm9, %xmm8 + mulps %xmm2, %xmm8 + + pshufd $0x55, %xmm8, %xmm9 + pshufd $0x00, %xmm8, %xmm8 + +#ifndef TRMMKERNEL + addss 0 * SIZE(CO1), %xmm8 + addss 0 * SIZE(CO2), %xmm9 +#endif + + movss %xmm8, 0 * SIZE(CO1) + movss %xmm9, 0 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L89: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + movq BO, B + + leaq (C, LDC, 2), C + ALIGN_4 + +.L90: + testq $1, N + jle .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + movq A, AO + + movq M, I + sarq $3, I + NOBRANCH + jle .L100 + ALIGN_4 + +.L91: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 8), AO + leaq (BO, %rax, 1), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movaps -28 * SIZE(AO), %xmm1 + xorps %xmm9, %xmm9 + movsd -32 * SIZE(BO), %xmm2 + xorps %xmm10, %xmm10 + prefetcht0 7 * SIZE(CO1) + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L95 + ALIGN_3 + +.L92: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm3, %xmm0 + addps %xmm0, %xmm8 + movaps -24 * SIZE(AO), %xmm0 + mulps %xmm3, %xmm1 + addps %xmm1, %xmm9 + movaps -20 * SIZE(AO), %xmm1 + + pshufd $0x55, %xmm2, %xmm3 + movsd -30 * SIZE(BO), %xmm2 + mulps %xmm3, %xmm0 + addps %xmm0, %xmm10 + movaps -16 * SIZE(AO), %xmm0 + mulps %xmm3, %xmm1 + addps %xmm1, %xmm11 + movaps -12 * SIZE(AO), %xmm1 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm3, %xmm0 + addps %xmm0, %xmm8 + movaps -8 * SIZE(AO), %xmm0 + mulps %xmm3, %xmm1 + addps %xmm1, %xmm9 + movaps -4 * SIZE(AO), %xmm1 + + pshufd $0x55, %xmm2, %xmm3 + movsd -28 * SIZE(BO), %xmm2 + mulps %xmm3, %xmm0 + addps %xmm0, %xmm10 + movaps 0 * SIZE(AO), %xmm0 + mulps %xmm3, %xmm1 + addps %xmm1, %xmm11 + movaps 4 * SIZE(AO), %xmm1 + + subq $-32 * SIZE, AO + subq $ -4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L92 + ALIGN_3 + +.L95: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L98 + ALIGN_3 + +.L96: + pshufd $0x00, %xmm2, %xmm3 + movss -31 * SIZE(BO), %xmm2 + mulps %xmm3, %xmm0 + addps %xmm0, %xmm8 + movaps -24 * SIZE(AO), %xmm0 + mulps %xmm3, %xmm1 + addps %xmm1, %xmm9 + movaps -20 * SIZE(AO), %xmm1 + + addq $8 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L96 + ALIGN_3 + +.L98: + movddup ALPHA, %xmm7 + + addps %xmm10, %xmm8 + addps %xmm11, %xmm9 + + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm9 
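+/* GEMM path below: reload the existing C columns and add them to the alpha-scaled accumulators before storing; when built as the TRMM kernel the C load/add is compiled out and the products are written directly. */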
+ +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm1 + movhps 6 * SIZE(CO1), %xmm1 + + addps %xmm0, %xmm8 + addps %xmm1, %xmm9 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movsd %xmm9, 4 * SIZE(CO1) + movhps %xmm9, 6 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 8), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 + decq I + BRANCH + jg .L91 + ALIGN_4 + +.L100: + testq $4, M + BRANCH + jle .L110 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movsd -32 * SIZE(BO), %xmm2 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L105 + ALIGN_3 + +.L102: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + movaps -28 * SIZE(AO), %xmm0 + addps %xmm3, %xmm8 + + pshufd $0x55, %xmm2, %xmm3 + movsd -30 * SIZE(BO), %xmm2 + mulps %xmm0, %xmm3 + movaps -24 * SIZE(AO), %xmm0 + addps %xmm3, %xmm9 + + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + movaps -20 * SIZE(AO), %xmm0 + addps %xmm3, %xmm8 + + pshufd $0x55, %xmm2, %xmm3 + movsd -28 * SIZE(BO), %xmm2 + mulps %xmm0, %xmm3 + movaps -16 * SIZE(AO), %xmm0 + addps %xmm3, %xmm9 + + subq $-16 * SIZE, AO + subq $ -4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L102 + ALIGN_3 + +.L105: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + BRANCH + je .L108 + ALIGN_3 + +.L106: + pshufd $0x00, %xmm2, %xmm3 + movss -31 * SIZE(BO), %xmm2 + mulps %xmm0, %xmm3 + movaps -28 * SIZE(AO), %xmm0 + addps %xmm3, %xmm8 + + addq $4 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L106 + ALIGN_3 + +.L108: + movddup ALPHA, %xmm7 + + addps %xmm9, %xmm8 + mulps %xmm7, %xmm8 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + + addps %xmm0, %xmm8 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 + ALIGN_4 + +.L110: + testq $2, M + BRANCH + jle .L120 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm3, %xmm3 + movsd -32 * SIZE(BO), %xmm2 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif 
(defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L115 + ALIGN_3 + +.L112: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm3, %xmm8 + + pshufd $0x55, %xmm2, %xmm3 + movsd -30 * SIZE(BO), %xmm2 + mulps %xmm0, %xmm3 + movsd -28 * SIZE(AO), %xmm0 + addps %xmm3, %xmm8 + + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + movsd -26 * SIZE(AO), %xmm0 + addps %xmm3, %xmm8 + + pshufd $0x55, %xmm2, %xmm3 + movsd -28 * SIZE(BO), %xmm2 + mulps %xmm0, %xmm3 + movsd -24 * SIZE(AO), %xmm0 + addps %xmm3, %xmm8 + + subq $-8 * SIZE, AO + subq $-4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L112 + ALIGN_3 + +.L115: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + BRANCH + je .L118 + ALIGN_3 + +.L116: + pshufd $0x00, %xmm2, %xmm3 + movss -31 * SIZE(BO), %xmm2 + mulps %xmm0, %xmm3 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm3, %xmm8 + + addq $2 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L116 + ALIGN_3 + +.L118: + movddup ALPHA, %xmm2 + + mulps %xmm2, %xmm8 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + + addps %xmm0, %xmm8 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 + ALIGN_4 + +.L120: + testq $1, M + BRANCH + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + + movss -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movss -32 * SIZE(BO), %xmm2 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L125 + ALIGN_3 + +.L122: + mulss %xmm0, %xmm2 + movss -31 * SIZE(AO), %xmm0 + addss %xmm2, %xmm8 + movss -31 * SIZE(BO), %xmm2 + + mulss %xmm0, %xmm2 + movss -30 * SIZE(AO), %xmm0 + addss %xmm2, %xmm9 + movss -30 * SIZE(BO), %xmm2 + + mulss %xmm0, %xmm2 + movss -29 * SIZE(AO), %xmm0 + addss %xmm2, %xmm8 + movss -29 * SIZE(BO), %xmm2 + + mulss %xmm0, %xmm2 + movss -28 * SIZE(AO), %xmm0 + addss %xmm2, %xmm9 + movss -28 * SIZE(BO), %xmm2 + + subq $-4 * SIZE, AO + subq $-4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L122 + ALIGN_3 + +.L125: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + BRANCH + je .L128 + ALIGN_3 + +.L126: + mulss %xmm0, %xmm2 + movss -31 * SIZE(AO), %xmm0 + addss %xmm2, %xmm8 + movss -31 * SIZE(BO), %xmm2 + + addq $1 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L126 + ALIGN_3 + +.L128: + movss ALPHA, %xmm2 + addss %xmm9, %xmm8 + mulss %xmm2, %xmm8 + +#ifndef TRMMKERNEL + addss 0 * SIZE(CO1), %xmm8 +#endif + + movss %xmm8, 0 * SIZE(CO1) + 
ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/gemm_kernel_8x4_sse.S b/kernel/x86_64/gemm_kernel_8x4_sse.S new file mode 100644 index 0000000000..218cb047c8 --- /dev/null +++ b/kernel/x86_64/gemm_kernel_8x4_sse.S @@ -0,0 +1,3446 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi + +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define CO2 %rbp +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define ALPHA 0(%rsp) +#define J 16(%rsp) +#define OFFSET 24(%rsp) +#define KK 32(%rsp) +#define KKK 40(%rsp) +#define BUFFER 256(%rsp) + +#ifdef OPTERON +#define movsd movlps +#endif + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 9 + 8) +#endif + +#if defined(GENERIC) || defined(NANO) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 5 + 8) +#endif + +#define RPREFETCHSIZE (8 * 7 + 4) +#define WPREFETCHSIZE (8 * 8 + 4) + +#ifndef GENERIC +#define KERNEL1(xx) \ + mulps %xmm0, %xmm1 ;\ + addps %xmm1, %xmm8 ;\ + movaps -32 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm0, %xmm3 ;\ + addps %xmm3, %xmm9 ;\ + movaps -28 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm0, %xmm5 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\ + mulps -20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\ + addps %xmm5, %xmm10 ;\ + movaps -24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addps %xmm0, %xmm11 ;\ + movaps -16 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0 + +#define KERNEL2(xx) \ + mulps %xmm2, %xmm1 ;\ + addps %xmm1, %xmm12 ;\ + movaps 0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm2, %xmm3 ;\ + addps %xmm3, %xmm13 ;\ + movaps -12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm2, %xmm5 ;\ + mulps -20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\ + addps %xmm5, %xmm14 ;\ + movaps -8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addps %xmm2, %xmm15 ;\ + movaps -12 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2 + +#define KERNEL3(xx) \ + mulps %xmm4, %xmm7 ;\ + addps %xmm7, %xmm8 ;\ + movaps -16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ + mulps %xmm4, %xmm3 ;\ + addps %xmm3, %xmm9 ;\ + movaps -12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm4, %xmm5 ;\ + mulps -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\ + addps %xmm5, %xmm10 ;\ + movaps -8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addps %xmm4, %xmm11 ;\ + movaps -8 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4 + +#define KERNEL4(xx) \ + mulps %xmm6, %xmm7 ;\ + addps %xmm7, %xmm12 ;\ + movaps 16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ + mulps %xmm6, %xmm3 ;\ + addps %xmm3, %xmm13 ;\ + movaps 4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm6, %xmm5 ;\ + mulps -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\ + addps %xmm5, %xmm14 ;\ + movaps 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + PREFETCH (PREFETCHSIZE + 16) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\ + addps %xmm6, %xmm15 ;\ + movaps -4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6 + +#define KERNEL5(xx) \ + mulps %xmm0, %xmm1 ;\ + addps %xmm1, %xmm8 ;\ + movaps 0 * SIZE + 2 * (xx) * SIZE(BO, 
%rax, 8), %xmm1 ;\ + mulps %xmm0, %xmm3 ;\ + addps %xmm3, %xmm9 ;\ + movaps 4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm0, %xmm5 ;\ + mulps 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\ + addps %xmm5, %xmm10 ;\ + movaps 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addps %xmm0, %xmm11 ;\ + movaps 0 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0 + +#define KERNEL6(xx) \ + mulps %xmm2, %xmm1 ;\ + addps %xmm1, %xmm12 ;\ + movaps 32 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm2, %xmm3 ;\ + addps %xmm3, %xmm13 ;\ + movaps 20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm2, %xmm5 ;\ + mulps 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\ + addps %xmm5, %xmm14 ;\ + movaps 24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addps %xmm2, %xmm15 ;\ + movaps 4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2 + +#define KERNEL7(xx) \ + mulps %xmm4, %xmm7 ;\ + addps %xmm7, %xmm8 ;\ + movaps 16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ + mulps %xmm4, %xmm3 ;\ + addps %xmm3, %xmm9 ;\ + movaps 20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm4, %xmm5 ;\ + mulps 28 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\ + addps %xmm5, %xmm10 ;\ + movaps 24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addps %xmm4, %xmm11 ;\ + movaps 8 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4 + +#define KERNEL8(xx) \ + mulps %xmm6, %xmm7 ;\ + addps %xmm7, %xmm12 ;\ + movaps 48 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ + mulps %xmm6, %xmm3 ;\ + addps %xmm3, %xmm13 ;\ + movaps 36 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm6, %xmm5 ;\ + mulps 28 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\ + addps %xmm5, %xmm14 ;\ + movaps 40 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addps %xmm6, %xmm15 ;\ + movaps 12 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6 + +#else +#define KERNEL1(xx) \ + mulps %xmm0, %xmm1 ;\ + addps %xmm1, %xmm8 ;\ + movaps -32 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ + mulps %xmm0, %xmm3 ;\ + addps %xmm3, %xmm9 ;\ + movaps -28 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulps %xmm0, %xmm5 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\ + mulps -20 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\ + addps %xmm5, %xmm10 ;\ + movaps -24 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addps %xmm0, %xmm11 ;\ + movaps -16 * SIZE + 1 * (xx) * SIZE(AO), %xmm0 + +#define KERNEL2(xx) \ + mulps %xmm2, %xmm1 ;\ + addps %xmm1, %xmm12 ;\ + movaps 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ + mulps %xmm2, %xmm3 ;\ + addps %xmm3, %xmm13 ;\ + movaps -12 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulps %xmm2, %xmm5 ;\ + mulps -20 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\ + addps %xmm5, %xmm14 ;\ + movaps -8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addps %xmm2, %xmm15 ;\ + movaps -12 * SIZE + 1 * (xx) * SIZE(AO), %xmm2 ;\ + +#define KERNEL3(xx) \ + mulps %xmm4, %xmm7 ;\ + addps %xmm7, %xmm8 ;\ + movaps -16 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ + mulps %xmm4, %xmm3 ;\ + addps %xmm3, %xmm9 ;\ + movaps -12 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulps %xmm4, %xmm5 ;\ + mulps -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\ + addps %xmm5, %xmm10 ;\ + movaps -8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addps %xmm4, %xmm11 ;\ + movaps -8 * SIZE + 1 * (xx) * SIZE(AO), %xmm4 + +#define KERNEL4(xx) \ + mulps %xmm6, %xmm7 ;\ + addps %xmm7, %xmm12 ;\ + movaps 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ + mulps %xmm6, %xmm3 ;\ + addps %xmm3, %xmm13 ;\ + movaps 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulps %xmm6, %xmm5 ;\ + mulps -4 * SIZE + 2 * (xx) * 
SIZE(BO), %xmm6 ;\ + addps %xmm5, %xmm14 ;\ + movaps 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addps %xmm6, %xmm15 ;\ + movaps -4 * SIZE + 1 * (xx) * SIZE(AO), %xmm6 + +#define KERNEL5(xx) \ + mulps %xmm0, %xmm1 ;\ + PREFETCH (PREFETCHSIZE + 16) * SIZE + 1 * (xx) * SIZE(AO) ;\ + addps %xmm1, %xmm8 ;\ + movaps 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ + mulps %xmm0, %xmm3 ;\ + addps %xmm3, %xmm9 ;\ + movaps 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulps %xmm0, %xmm5 ;\ + mulps 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\ + addps %xmm5, %xmm10 ;\ + movaps 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addps %xmm0, %xmm11 ;\ + movaps 0 * SIZE + 1 * (xx) * SIZE(AO), %xmm0 + +#define KERNEL6(xx) \ + mulps %xmm2, %xmm1 ;\ + addps %xmm1, %xmm12 ;\ + movaps 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ + mulps %xmm2, %xmm3 ;\ + addps %xmm3, %xmm13 ;\ + movaps 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulps %xmm2, %xmm5 ;\ + mulps 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\ + addps %xmm5, %xmm14 ;\ + movaps 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addps %xmm2, %xmm15 ;\ + movaps 4 * SIZE + 1 * (xx) * SIZE(AO), %xmm2 + +#define KERNEL7(xx) \ + mulps %xmm4, %xmm7 ;\ + addps %xmm7, %xmm8 ;\ + movaps 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ + mulps %xmm4, %xmm3 ;\ + addps %xmm3, %xmm9 ;\ + movaps 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulps %xmm4, %xmm5 ;\ + mulps 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\ + addps %xmm5, %xmm10 ;\ + movaps 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addps %xmm4, %xmm11 ;\ + movaps 8 * SIZE + 1 * (xx) * SIZE(AO), %xmm4 + +#define KERNEL8(xx) \ + mulps %xmm6, %xmm7 ;\ + addps %xmm7, %xmm12 ;\ + movaps 48 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ + mulps %xmm6, %xmm3 ;\ + addps %xmm3, %xmm13 ;\ + movaps 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulps %xmm6, %xmm5 ;\ + mulps 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\ + addps %xmm5, %xmm14 ;\ + movaps 40 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addps %xmm6, %xmm15 ;\ + movaps 12 * SIZE + 1 * (xx) * SIZE(AO), %xmm6 + +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm4 +#endif + movaps %xmm3, %xmm0 + +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm4 +#endif + +#endif + + EMMS + + movq %rsp, %rbx # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movq OLD_M, M + movq OLD_N, N + + shufps $0, %xmm0, %xmm0 + movaps %xmm0, ALPHA + +#ifdef TRMMKERNEL + movsd %xmm4, OFFSET + movsd %xmm4, KK +#ifndef LEFT + negq KK +#endif +#endif + + subq $-32 * SIZE, A + + leaq (, LDC, SIZE), LDC + + movq N, J + sarq $2, J # j = (n >> 2) + jle .L50 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + + movd 0 * SIZE(B), %mm0 + + movq K, %rax + sarq $2, %rax + jle .L03 + + addq %rax, %rax + ALIGN_4 + +.L02: + PREFETCH (RPREFETCHSIZE + 0) * SIZE(B) + + movd 
1 * SIZE(B), %mm1 + movd 2 * SIZE(B), %mm2 + movd 3 * SIZE(B), %mm3 + movd 4 * SIZE(B), %mm4 + movd 5 * SIZE(B), %mm5 + movd 6 * SIZE(B), %mm6 + movd 7 * SIZE(B), %mm7 + + PREFETCHW (WPREFETCHSIZE + 0) * SIZE(BO) + + punpckldq %mm0, %mm0 + movq %mm0, 0 * SIZE(BO) + movq %mm0, 2 * SIZE(BO) + punpckldq %mm1, %mm1 + movd 8 * SIZE(B), %mm0 + movq %mm1, 4 * SIZE(BO) + movq %mm1, 6 * SIZE(BO) + punpckldq %mm2, %mm2 + movq %mm2, 8 * SIZE(BO) + movq %mm2, 10 * SIZE(BO) + punpckldq %mm3, %mm3 + movq %mm3, 12 * SIZE(BO) + movq %mm3, 14 * SIZE(BO) + + PREFETCHW (WPREFETCHSIZE + 16) * SIZE(BO) + + punpckldq %mm4, %mm4 + movq %mm4, 16 * SIZE(BO) + movq %mm4, 18 * SIZE(BO) + punpckldq %mm5, %mm5 + movq %mm5, 20 * SIZE(BO) + movq %mm5, 22 * SIZE(BO) + punpckldq %mm6, %mm6 + movq %mm6, 24 * SIZE(BO) + movq %mm6, 26 * SIZE(BO) + punpckldq %mm7, %mm7 + movq %mm7, 28 * SIZE(BO) + movq %mm7, 30 * SIZE(BO) + + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO + + decq %rax + jne .L02 + ALIGN_4 + +.L03: + movq K, %rax + andq $3, %rax + BRANCH + jle .L10 + ALIGN_4 + +.L04: + movd 0 * SIZE(B), %mm0 + movd 1 * SIZE(B), %mm1 + movd 2 * SIZE(B), %mm2 + movd 3 * SIZE(B), %mm3 + + punpckldq %mm0, %mm0 + punpckldq %mm1, %mm1 + punpckldq %mm2, %mm2 + punpckldq %mm3, %mm3 + + movq %mm0, 0 * SIZE(BO) + movq %mm0, 2 * SIZE(BO) + movq %mm1, 4 * SIZE(BO) + movq %mm1, 6 * SIZE(BO) + movq %mm2, 8 * SIZE(BO) + movq %mm2, 10 * SIZE(BO) + movq %mm3, 12 * SIZE(BO) + movq %mm3, 14 * SIZE(BO) + + addq $ 4 * SIZE, B + addq $16 * SIZE, BO + decq %rax + jne .L04 + ALIGN_4 + +.L10: + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + leaq (RPREFETCHSIZE + 0) * SIZE(B), BB + + movq M, I + sarq $3, I # i = (m >> 3) + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + movaps -32 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + movaps -28 * SIZE(AO), %xmm2 + movaps -28 * SIZE(BO), %xmm3 + xorps %xmm9, %xmm9 + movaps -24 * SIZE(AO), %xmm4 + movaps -24 * SIZE(BO), %xmm5 + xorps %xmm10, %xmm10 + movaps -20 * SIZE(AO), %xmm6 + movaps -16 * SIZE(BO), %xmm7 + xorps %xmm11, %xmm11 + + PREFETCHW 7 * SIZE(CO1) + xorps %xmm12, %xmm12 + PREFETCHW 15 * SIZE(CO2) + xorps %xmm13, %xmm13 + PREFETCHW 7 * SIZE(CO1, LDC, 2) + xorps %xmm14, %xmm14 + PREFETCHW 15 * SIZE(CO2, LDC, 2) + xorps %xmm15, %xmm15 + PREFETCH -32 * SIZE(BB) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif +#ifndef GENERIC + andq $-8, %rax + + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO + negq %rax + NOBRANCH + je .L15 + ALIGN_3 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + + addq $16 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 
* 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + + addq $16 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + + addq $16 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + + addq $16 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + + addq $16 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + + addq $16 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + + addq $16 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + + addq $16 * SIZE, %rax + BRANCH + jl .L12 + ALIGN_3 + +.L15: + PREFETCH -16 * SIZE(BB) + subq $-16 * SIZE, BB + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + testq $4, %rax + je .L16 + xorq %rax, %rax + ALIGN_3 + + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + addq $64 * SIZE, BO + addq $32 * SIZE, AO + ALIGN_3 +#else + sarq $2, %rax + NOBRANCH + jle .L16 + ALIGN_3 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + addq $ 64 * SIZE, BO + subq $-32 * SIZE, AO + decq %rax + BRANCH + jg .L12 +#endif + +.L16: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L18 + + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO + negq %rax + ALIGN_4 + +.L17: + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO, %rax, 8), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm9 + movaps -24 * SIZE(BO, %rax, 8), %xmm1 + mulps %xmm0, %xmm1 + mulps -20 * SIZE(BO, %rax, 8), %xmm0 + addps %xmm1, %xmm10 + movaps -32 * SIZE(BO, %rax, 8), %xmm1 + addps %xmm0, %xmm11 + movaps -24 * SIZE(AO, %rax, 4), %xmm0 + mulps %xmm2, 
%xmm1 + addps %xmm1, %xmm12 + movaps -28 * SIZE(BO, %rax, 8), %xmm1 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm13 + movaps -24 * SIZE(BO, %rax, 8), %xmm1 + mulps %xmm2, %xmm1 + mulps -20 * SIZE(BO, %rax, 8), %xmm2 + addps %xmm1, %xmm14 + movaps -16 * SIZE(BO, %rax, 8), %xmm1 + addps %xmm2, %xmm15 + movaps -20 * SIZE(AO, %rax, 4), %xmm2 + + addq $SIZE * 2, %rax + jl .L17 + ALIGN_4 + +.L18: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm1 + movhps 6 * SIZE(CO1), %xmm1 + + movsd 0 * SIZE(CO2), %xmm2 + movhps 2 * SIZE(CO2), %xmm2 + movsd 4 * SIZE(CO2), %xmm3 + movhps 6 * SIZE(CO2), %xmm3 +#endif + + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm9 + mulps %xmm7, %xmm10 + mulps %xmm7, %xmm11 + + mulps %xmm7, %xmm12 + mulps %xmm7, %xmm13 + mulps %xmm7, %xmm14 + mulps %xmm7, %xmm15 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1, LDC, 2), %xmm4 + movhps 2 * SIZE(CO1, LDC, 2), %xmm4 + movsd 4 * SIZE(CO1, LDC, 2), %xmm5 + movhps 6 * SIZE(CO1, LDC, 2), %xmm5 + + movsd 0 * SIZE(CO2, LDC, 2), %xmm6 + movhps 2 * SIZE(CO2, LDC, 2), %xmm6 + movsd 4 * SIZE(CO2, LDC, 2), %xmm7 + movhps 6 * SIZE(CO2, LDC, 2), %xmm7 + + addps %xmm0, %xmm8 + addps %xmm1, %xmm12 + addps %xmm2, %xmm9 + addps %xmm3, %xmm13 +#endif + + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movlps %xmm12, 4 * SIZE(CO1) + movhps %xmm12, 6 * SIZE(CO1) + + movlps %xmm9, 0 * SIZE(CO2) + movhps %xmm9, 2 * SIZE(CO2) + movlps %xmm13, 4 * SIZE(CO2) + movhps %xmm13, 6 * SIZE(CO2) + +#ifndef TRMMKERNEL + addps %xmm4, %xmm10 + addps %xmm5, %xmm14 + addps %xmm6, %xmm11 + addps %xmm7, %xmm15 +#endif + + movlps %xmm10, 0 * SIZE(CO1, LDC, 2) + movhps %xmm10, 2 * SIZE(CO1, LDC, 2) + movlps %xmm14, 4 * SIZE(CO1, LDC, 2) + movhps %xmm14, 6 * SIZE(CO1, LDC, 2) + + movlps %xmm11, 0 * SIZE(CO2, LDC, 2) + movhps %xmm11, 2 * SIZE(CO2, LDC, 2) + movlps %xmm15, 4 * SIZE(CO2, LDC, 2) + movhps %xmm15, 6 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 4 + addq $8 * SIZE, CO2 # coffset += 4 + decq I # i -- + jg .L11 + ALIGN_4 + +.L20: + testq $4, M + je .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -16 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L25 + ALIGN_4 + +.L22: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps 
%xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps -28 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + mulps 28 * SIZE(BO), %xmm8 + addps %xmm11, %xmm2 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm8, %xmm3 + movaps -24 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm13 + addps %xmm13, %xmm0 + movaps 36 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm1 + movaps 40 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + mulps 44 * SIZE(BO), %xmm8 + addps %xmm13, %xmm2 + movaps 96 * SIZE(BO), %xmm13 + addps %xmm8, %xmm3 + movaps -20 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm15 + addps %xmm15, %xmm0 + movaps 52 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm1 + movaps 56 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + mulps 60 * SIZE(BO), %xmm8 + addps %xmm15, %xmm2 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm8, %xmm3 + movaps 0 * SIZE(AO), %xmm8 + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm10, %xmm9 + addps %xmm9, %xmm0 + movaps 68 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm1 + movaps 72 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + mulps 76 * SIZE(BO), %xmm10 + addps %xmm9, %xmm2 + movaps 128 * SIZE(BO), %xmm9 + addps %xmm10, %xmm3 + movaps -12 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movaps 84 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm1 + movaps 88 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + mulps 92 * SIZE(BO), %xmm10 + addps %xmm11, %xmm2 + movaps 144 * SIZE(BO), %xmm11 + addps %xmm10, %xmm3 + movaps -8 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movaps 100 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm1 + movaps 104 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + mulps 108 * SIZE(BO), %xmm10 + addps %xmm13, %xmm2 + movaps 160 * SIZE(BO), %xmm13 + addps %xmm10, %xmm3 + movaps -4 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movaps 116 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm1 + movaps 120 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + mulps 124 * SIZE(BO), %xmm10 + addps %xmm15, %xmm2 + movaps 176 * SIZE(BO), %xmm15 + addps %xmm10, %xmm3 + movaps 16 * SIZE(AO), %xmm10 + + addq $ 32 * SIZE, AO + addq $128 * SIZE, BO + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 16 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps -28 * SIZE(AO), %xmm8 + + addq $ 4 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L26 + ALIGN_4 + +.L28: + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm1 + mulps %xmm15, %xmm2 + mulps %xmm15, %xmm3 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 0 * SIZE(CO2), %xmm10 + movhps 2 * SIZE(CO2), %xmm10 + + movsd 0 * SIZE(CO1, LDC, 2), %xmm12 + movhps 2 * SIZE(CO1, LDC, 2), %xmm12 + movsd 0 * SIZE(CO2, LDC, 2), %xmm14 + movhps 2 * SIZE(CO2, LDC, 2), %xmm14 + + addps %xmm8, %xmm0 + addps %xmm10, %xmm1 + addps %xmm12, %xmm2 + 
addps %xmm14, %xmm3 +#endif + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO2) + movhps %xmm1, 2 * SIZE(CO2) + + movlps %xmm2, 0 * SIZE(CO1, LDC, 2) + movhps %xmm2, 2 * SIZE(CO1, LDC, 2) + movlps %xmm3, 0 * SIZE(CO2, LDC, 2) + movhps %xmm3, 2 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L30: + testq $2, M + je .L40 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -24 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L35 + ALIGN_4 + +.L32: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movaps 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movaps 64 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movaps 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd -28 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movaps 80 * SIZE(BO), %xmm11 + + mulps %xmm8, %xmm13 + addps %xmm13, %xmm0 + movaps 36 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm1 + movaps 40 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm2 + movaps 44 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + movsd -26 * SIZE(AO), %xmm8 + addps %xmm13, %xmm3 + movaps 96 * SIZE(BO), %xmm13 + + mulps %xmm8, %xmm15 + addps %xmm15, %xmm0 + movaps 52 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm1 + movaps 56 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm2 + movaps 60 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + movsd -16 * SIZE(AO), %xmm8 + addps %xmm15, %xmm3 + movaps 112 * SIZE(BO), %xmm15 + + mulps %xmm10, %xmm9 + addps %xmm9, %xmm0 + movaps 68 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm1 + movaps 72 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm2 + movaps 76 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movsd -22 * SIZE(AO), %xmm10 + addps %xmm9, %xmm3 + movaps 128 * SIZE(BO), %xmm9 + + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movaps 84 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm1 + movaps 88 * SIZE(BO), %xmm11 + mulps 
%xmm10, %xmm11 + addps %xmm11, %xmm2 + movaps 92 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movsd -20 * SIZE(AO), %xmm10 + addps %xmm11, %xmm3 + movaps 144 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movaps 100 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm1 + movaps 104 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movaps 108 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd -18 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movaps 160 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movaps 116 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm1 + movaps 120 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movaps 124 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd -8 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movaps 176 * SIZE(BO), %xmm15 + + addq $ 16 * SIZE, AO + addq $128 * SIZE, BO + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movaps 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movaps 16 * SIZE(BO), %xmm9 + + addq $ 2 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L36 + ALIGN_4 + +.L38: + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm1 + mulps %xmm15, %xmm2 + mulps %xmm15, %xmm3 + +#ifndef TRMMKERNEL +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd 0 * SIZE(CO1), %xmm8 +#ifdef movsd + xorps %xmm10, %xmm10 +#endif + movsd 0 * SIZE(CO2), %xmm10 +#ifdef movsd + xorps %xmm12, %xmm12 +#endif + movsd 0 * SIZE(CO1, LDC, 2), %xmm12 +#ifdef movsd + xorps %xmm14, %xmm14 +#endif + movsd 0 * SIZE(CO2, LDC, 2), %xmm14 + + addps %xmm8, %xmm0 + addps %xmm10, %xmm1 + addps %xmm12, %xmm2 + addps %xmm14, %xmm3 +#endif + + movlps %xmm0, 0 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO2) + movlps %xmm2, 0 * SIZE(CO1, LDC, 2) + movlps %xmm3, 0 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 4 + addq $2 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L40: + testq $1, M + je .L49 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO + leaq (BO, %rax, 8), BO +#endif + + movss -32 * SIZE(AO), %xmm8 + movss -28 * SIZE(AO), %xmm10 + + movss 0 * SIZE(BO), %xmm9 + movss 16 * SIZE(BO), %xmm11 + movss 32 * SIZE(BO), %xmm13 + movss 48 * SIZE(BO), %xmm15 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L45 + ALIGN_4 + +.L42: + mulss 
%xmm8, %xmm9 + addss %xmm9, %xmm0 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movss 4 * SIZE(BO), %xmm9 + mulss %xmm8, %xmm9 + addss %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + mulss %xmm8, %xmm9 + addss %xmm9, %xmm2 + movss 12 * SIZE(BO), %xmm9 + mulss %xmm8, %xmm9 + movss -31 * SIZE(AO), %xmm8 + addss %xmm9, %xmm3 + movss 64 * SIZE(BO), %xmm9 + + mulss %xmm8, %xmm11 + addss %xmm11, %xmm0 + movss 20 * SIZE(BO), %xmm11 + mulss %xmm8, %xmm11 + addss %xmm11, %xmm1 + movss 24 * SIZE(BO), %xmm11 + mulss %xmm8, %xmm11 + addss %xmm11, %xmm2 + movss 28 * SIZE(BO), %xmm11 + mulss %xmm8, %xmm11 + movss -30 * SIZE(AO), %xmm8 + addss %xmm11, %xmm3 + movss 80 * SIZE(BO), %xmm11 + + mulss %xmm8, %xmm13 + addss %xmm13, %xmm0 + movss 36 * SIZE(BO), %xmm13 + mulss %xmm8, %xmm13 + addss %xmm13, %xmm1 + movss 40 * SIZE(BO), %xmm13 + mulss %xmm8, %xmm13 + addss %xmm13, %xmm2 + movss 44 * SIZE(BO), %xmm13 + mulss %xmm8, %xmm13 + movss -29 * SIZE(AO), %xmm8 + addss %xmm13, %xmm3 + movss 96 * SIZE(BO), %xmm13 + + mulss %xmm8, %xmm15 + addss %xmm15, %xmm0 + movss 52 * SIZE(BO), %xmm15 + mulss %xmm8, %xmm15 + addss %xmm15, %xmm1 + movss 56 * SIZE(BO), %xmm15 + mulss %xmm8, %xmm15 + addss %xmm15, %xmm2 + movss 60 * SIZE(BO), %xmm15 + mulss %xmm8, %xmm15 + movss -24 * SIZE(AO), %xmm8 + addss %xmm15, %xmm3 + movss 112 * SIZE(BO), %xmm15 + + mulss %xmm10, %xmm9 + addss %xmm9, %xmm0 + movss 68 * SIZE(BO), %xmm9 + mulss %xmm10, %xmm9 + addss %xmm9, %xmm1 + movss 72 * SIZE(BO), %xmm9 + mulss %xmm10, %xmm9 + addss %xmm9, %xmm2 + movss 76 * SIZE(BO), %xmm9 + mulss %xmm10, %xmm9 + movss -27 * SIZE(AO), %xmm10 + addss %xmm9, %xmm3 + movss 128 * SIZE(BO), %xmm9 + + mulss %xmm10, %xmm11 + addss %xmm11, %xmm0 + movss 84 * SIZE(BO), %xmm11 + mulss %xmm10, %xmm11 + addss %xmm11, %xmm1 + movss 88 * SIZE(BO), %xmm11 + mulss %xmm10, %xmm11 + addss %xmm11, %xmm2 + movss 92 * SIZE(BO), %xmm11 + mulss %xmm10, %xmm11 + movss -26 * SIZE(AO), %xmm10 + addss %xmm11, %xmm3 + movss 144 * SIZE(BO), %xmm11 + + mulss %xmm10, %xmm13 + addss %xmm13, %xmm0 + movss 100 * SIZE(BO), %xmm13 + mulss %xmm10, %xmm13 + addss %xmm13, %xmm1 + movss 104 * SIZE(BO), %xmm13 + mulss %xmm10, %xmm13 + addss %xmm13, %xmm2 + movss 108 * SIZE(BO), %xmm13 + mulss %xmm10, %xmm13 + movss -25 * SIZE(AO), %xmm10 + addss %xmm13, %xmm3 + movss 160 * SIZE(BO), %xmm13 + + mulss %xmm10, %xmm15 + addss %xmm15, %xmm0 + movss 116 * SIZE(BO), %xmm15 + mulss %xmm10, %xmm15 + addss %xmm15, %xmm1 + movss 120 * SIZE(BO), %xmm15 + mulss %xmm10, %xmm15 + addss %xmm15, %xmm2 + movss 124 * SIZE(BO), %xmm15 + mulss %xmm10, %xmm15 + movss -20 * SIZE(AO), %xmm10 + addss %xmm15, %xmm3 + movss 176 * SIZE(BO), %xmm15 + + addq $ 8 * SIZE, AO + addq $128 * SIZE, BO + decq %rax + jne .L42 + ALIGN_4 + +.L45: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L48 + ALIGN_4 + +.L46: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movss 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movss 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss -31 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movss 16 * SIZE(BO), %xmm9 + + addq $ 1 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L46 + ALIGN_4 + +.L48: + mulss %xmm15, %xmm0 + mulss %xmm15, %xmm1 + mulss %xmm15, %xmm2 + mulss %xmm15, %xmm3 + +#ifndef TRMMKERNEL + movss 0 * SIZE(CO1), %xmm8 + 
movss 0 * SIZE(CO2), %xmm10 + movss 0 * SIZE(CO1, LDC, 2), %xmm12 + movss 0 * SIZE(CO2, LDC, 2), %xmm14 + + addss %xmm8, %xmm0 + addss %xmm10, %xmm1 + addss %xmm12, %xmm2 + addss %xmm14, %xmm3 +#endif + + movss %xmm0, 0 * SIZE(CO1) + movss %xmm1, 0 * SIZE(CO2) + movss %xmm2, 0 * SIZE(CO1, LDC, 2) + movss %xmm3, 0 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L49: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + leaq (C, LDC, 4), C # c += 4 * ldc + decq J # j -- + jg .L01 + +.L50: + testq $2, N + je .L100 + +.L51: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + + movq K, %rax + sarq $2, %rax + jle .L53 + ALIGN_4 + +.L52: +#if defined(PENTIUM4) || defined(GENERIC) + movss 0 * SIZE(B), %xmm0 + movss 1 * SIZE(B), %xmm1 + movss 2 * SIZE(B), %xmm2 + movss 3 * SIZE(B), %xmm3 + movss 4 * SIZE(B), %xmm4 + movss 5 * SIZE(B), %xmm5 + movss 6 * SIZE(B), %xmm6 + movss 7 * SIZE(B), %xmm7 + + PREFETCH 32 * SIZE(B) + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + shufps $0, %xmm4, %xmm4 + shufps $0, %xmm5, %xmm5 + shufps $0, %xmm6, %xmm6 + shufps $0, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO +#endif + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH 32 * SIZE(B) + + movd 0 * SIZE(B), %mm0 + movd 1 * SIZE(B), %mm1 + movd 2 * SIZE(B), %mm2 + movd 3 * SIZE(B), %mm3 + movd 4 * SIZE(B), %mm4 + movd 5 * SIZE(B), %mm5 + movd 6 * SIZE(B), %mm6 + movd 7 * SIZE(B), %mm7 + + punpckldq %mm0, %mm0 + punpckldq %mm1, %mm1 + punpckldq %mm2, %mm2 + punpckldq %mm3, %mm3 + punpckldq %mm4, %mm4 + punpckldq %mm5, %mm5 + punpckldq %mm6, %mm6 + punpckldq %mm7, %mm7 + + movq %mm0, 0 * SIZE(BO) + movq %mm0, 2 * SIZE(BO) + movq %mm1, 4 * SIZE(BO) + movq %mm1, 6 * SIZE(BO) + movq %mm2, 8 * SIZE(BO) + movq %mm2, 10 * SIZE(BO) + movq %mm3, 12 * SIZE(BO) + movq %mm3, 14 * SIZE(BO) + movq %mm4, 16 * SIZE(BO) + movq %mm4, 18 * SIZE(BO) + movq %mm5, 20 * SIZE(BO) + movq %mm5, 22 * SIZE(BO) + movq %mm6, 24 * SIZE(BO) + movq %mm6, 26 * SIZE(BO) + movq %mm7, 28 * SIZE(BO) + movq %mm7, 30 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO +#endif + + decq %rax + jne .L52 + ALIGN_4 + +.L53: + movq K, %rax + andq $3, %rax + BRANCH + jle .L60 + ALIGN_4 + +.L54: +#if defined(PENTIUM4) || defined(GENERIC) + movss 0 * SIZE(B), %xmm0 + movss 1 * SIZE(B), %xmm1 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) +#endif + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + movd 0 * SIZE(B), %mm0 + movd 1 * SIZE(B), %mm1 + + punpckldq %mm0, %mm0 + punpckldq %mm1, %mm1 + + movq %mm0, 0 * SIZE(BO) + movq %mm0, 2 * SIZE(BO) + movq %mm1, 4 * SIZE(BO) + movq %mm1, 6 * SIZE(BO) +#endif + + addq $ 2 * SIZE, B + addq $ 8 * SIZE, BO + decq %rax + jne .L54 + ALIGN_4 + +.L60: + movq C, CO1 # coffset1 = c + leaq (C, LDC, 
1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + movq M, I + sarq $3, I # i = (m >> 3) + jle .L70 + ALIGN_4 + +.L61: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -16 * SIZE(AO), %xmm10 + movaps 0 * SIZE(AO), %xmm12 + movaps 16 * SIZE(AO), %xmm14 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + + PREFETCHW 7 * SIZE(CO1) + xorps %xmm4, %xmm4 + PREFETCHW 7 * SIZE(CO2) + xorps %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L65 + ALIGN_4 + +.L62: + mulps %xmm8, %xmm9 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 0 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps -28 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps -24 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps -20 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps 32 * SIZE(AO), %xmm8 + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm10, %xmm11 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm11, %xmm0 + movaps 16 * SIZE(BO), %xmm11 + addps %xmm10, %xmm1 + movaps -12 * SIZE(AO), %xmm10 + mulps %xmm10, %xmm11 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm11, %xmm4 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm10, %xmm5 + movaps -8 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm11 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm11, %xmm0 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm10, %xmm1 + movaps -4 * SIZE(AO), %xmm10 + mulps %xmm10, %xmm11 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm11, %xmm4 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm10, %xmm5 + movaps 48 * SIZE(AO), %xmm10 + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) +#endif + mulps %xmm12, %xmm13 + mulps 36 * SIZE(BO), %xmm12 + addps %xmm13, %xmm0 + movaps 32 * SIZE(BO), %xmm13 + addps %xmm12, %xmm1 + movaps 4 * SIZE(AO), %xmm12 + mulps %xmm12, %xmm13 + mulps 36 * SIZE(BO), %xmm12 + addps %xmm13, %xmm4 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm12, %xmm5 + movaps 8 * SIZE(AO), %xmm12 + + mulps %xmm12, %xmm13 + mulps 44 * SIZE(BO), %xmm12 + addps %xmm13, %xmm0 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm12, %xmm1 + movaps 12 * SIZE(AO), %xmm12 + mulps %xmm12, %xmm13 + mulps 44 * SIZE(BO), %xmm12 + addps %xmm13, %xmm4 + movaps 96 * SIZE(BO), %xmm13 + addps %xmm12, %xmm5 + movaps 64 * SIZE(AO), %xmm12 + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) +#endif + mulps %xmm14, %xmm15 + mulps 52 * SIZE(BO), %xmm14 + addps 
%xmm15, %xmm0 + movaps 48 * SIZE(BO), %xmm15 + addps %xmm14, %xmm1 + movaps 20 * SIZE(AO), %xmm14 + mulps %xmm14, %xmm15 + mulps 52 * SIZE(BO), %xmm14 + addps %xmm15, %xmm4 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm14, %xmm5 + movaps 24 * SIZE(AO), %xmm14 + + mulps %xmm14, %xmm15 + mulps 60 * SIZE(BO), %xmm14 + addps %xmm15, %xmm0 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm14, %xmm1 + movaps 28 * SIZE(AO), %xmm14 + mulps %xmm14, %xmm15 + mulps 60 * SIZE(BO), %xmm14 + addps %xmm15, %xmm4 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm14, %xmm5 + movaps 80 * SIZE(AO), %xmm14 + + addq $64 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L62 + ALIGN_4 + +.L65: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 0 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps -28 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps -24 * SIZE(AO), %xmm8 + + addq $8 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L66 + ALIGN_4 + +.L68: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 4 * SIZE(CO1), %xmm9 + movhps 6 * SIZE(CO1), %xmm9 + + movsd 0 * SIZE(CO2), %xmm10 + movhps 2 * SIZE(CO2), %xmm10 + movsd 4 * SIZE(CO2), %xmm11 + movhps 6 * SIZE(CO2), %xmm11 +#endif + + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm4 + mulps %xmm15, %xmm1 + mulps %xmm15, %xmm5 + +#ifndef TRMMKERNEL + addps %xmm8, %xmm0 + addps %xmm9, %xmm4 + addps %xmm10, %xmm1 + addps %xmm11, %xmm5 +#endif + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm4, 4 * SIZE(CO1) + movhps %xmm4, 6 * SIZE(CO1) + + movlps %xmm1, 0 * SIZE(CO2) + movhps %xmm1, 2 * SIZE(CO2) + movlps %xmm5, 4 * SIZE(CO2) + movhps %xmm5, 6 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 4 + addq $8 * SIZE, CO2 # coffset += 4 + decq I # i -- + jg .L61 + ALIGN_4 + +.L70: + testq $4, M + je .L80 + + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -16 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L75 + ALIGN_4 + +.L72: + mulps %xmm8, %xmm9 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 
+ movaps -28 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps -24 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm11 + mulps 20 * SIZE(BO), %xmm8 + addps %xmm11, %xmm0 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm8, %xmm1 + movaps -20 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm11 + mulps 28 * SIZE(BO), %xmm8 + addps %xmm11, %xmm2 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm8, %xmm3 + movaps 0 * SIZE(AO), %xmm8 + + mulps %xmm10, %xmm13 + mulps 36 * SIZE(BO), %xmm10 + addps %xmm13, %xmm0 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm10, %xmm1 + movaps -12 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm13 + mulps 44 * SIZE(BO), %xmm10 + addps %xmm13, %xmm2 + movaps 96 * SIZE(BO), %xmm13 + addps %xmm10, %xmm3 + movaps -8 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm15 + mulps 52 * SIZE(BO), %xmm10 + addps %xmm15, %xmm0 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm10, %xmm1 + movaps -4 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm15 + mulps 60 * SIZE(BO), %xmm10 + addps %xmm15, %xmm2 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm10, %xmm3 + movaps 16 * SIZE(AO), %xmm10 + + addq $32 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L72 + ALIGN_4 + +.L75: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps -28 * SIZE(AO), %xmm8 + + addq $4 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L76 + ALIGN_4 + +.L78: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 0 * SIZE(CO2), %xmm10 + movhps 2 * SIZE(CO2), %xmm10 +#endif + + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm1 + +#ifndef TRMMKERNEL + addps %xmm8, %xmm0 + addps %xmm10, %xmm1 +#endif + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO2) + movhps %xmm1, 2 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L80: + testq $2, M + je .L90 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -24 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L85 + ALIGN_4 + +.L82: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 
0) * SIZE(AO) +#endif + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movaps 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd -28 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movaps 64 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd -26 * SIZE(AO), %xmm8 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movaps 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd -16 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movaps 80 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movaps 36 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd -22 * SIZE(AO), %xmm10 + addps %xmm13, %xmm1 + movaps 40 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movaps 44 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd -20 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movaps 96 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movaps 52 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd -18 * SIZE(AO), %xmm10 + addps %xmm15, %xmm1 + movaps 56 * SIZE(BO), %xmm15 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movaps 60 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd -8 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movaps 112 * SIZE(BO), %xmm15 + + addq $16 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L82 + ALIGN_4 + +.L85: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L88 + ALIGN_4 + +.L86: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L86 + ALIGN_4 + +.L88: +#ifndef TRMMKERNEL +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd 0 * SIZE(CO1), %xmm8 +#ifdef movsd + xorps %xmm10, %xmm10 +#endif + movsd 0 * SIZE(CO2), %xmm10 +#endif + + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm1 + +#ifndef TRMMKERNEL + addps %xmm8, %xmm0 + addps %xmm10, %xmm1 +#endif + + movlps %xmm0, 0 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 4 + addq $2 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L90: + testq $1, M + je .L99 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + + movss -32 * SIZE(AO), %xmm8 + movss -28 * SIZE(AO), %xmm10 + + movss 0 * SIZE(BO), %xmm9 + movss 16 * SIZE(BO), %xmm11 + movss 32 * SIZE(BO), %xmm13 + movss 48 * SIZE(BO), %xmm15 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, 
%rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L95 + ALIGN_4 + +.L92: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movss 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss -31 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movss 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movss 64 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movss 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movss -29 * SIZE(AO), %xmm8 + addps %xmm11, %xmm1 + movss 24 * SIZE(BO), %xmm11 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movss 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movss -24 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movss 80 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movss 36 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movss -27 * SIZE(AO), %xmm10 + addps %xmm13, %xmm1 + movss 40 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movss 44 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movss -26 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movss 96 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movss 52 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movss -25 * SIZE(AO), %xmm10 + addps %xmm15, %xmm1 + movss 56 * SIZE(BO), %xmm15 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movss 60 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movss -20 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movss 112 * SIZE(BO), %xmm15 + + addq $ 8 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L92 + ALIGN_4 + +.L95: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L98 + ALIGN_4 + +.L96: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movss 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss -31 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L96 + ALIGN_4 + +.L98: +#ifndef TRMMKERNEL + movss 0 * SIZE(CO1), %xmm8 + movss 0 * SIZE(CO2), %xmm10 +#endif + + addss %xmm2, %xmm0 + addss %xmm3, %xmm1 + mulss %xmm15, %xmm0 + mulss %xmm15, %xmm1 + +#ifndef TRMMKERNEL + addss %xmm8, %xmm0 + addss %xmm10, %xmm1 +#endif + + movss %xmm0, 0 * SIZE(CO1) + movss %xmm1, 0 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + leaq (C, LDC, 2), C # c += 4 * ldc + ALIGN_4 + + +.L100: + testq $1, N + je .L999 + +.L101: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + + movq K, %rax + sarq $3, %rax + jle .L103 + ALIGN_4 + + +.L102: +#if defined(PENTIUM4) || defined(GENERIC) + movss 0 * SIZE(B), %xmm0 + movss 1 * SIZE(B), %xmm1 + movss 2 * SIZE(B), %xmm2 + movss 3 * SIZE(B), %xmm3 + movss 4 * SIZE(B), %xmm4 + movss 5 * SIZE(B), %xmm5 + movss 6 * SIZE(B), %xmm6 + movss 7 * SIZE(B), %xmm7 + + PREFETCH 32 * SIZE(B) + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, 
%xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + shufps $0, %xmm4, %xmm4 + shufps $0, %xmm5, %xmm5 + shufps $0, %xmm6, %xmm6 + shufps $0, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO +#endif + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH 32 * SIZE(B) + + movd 0 * SIZE(B), %mm0 + movd 1 * SIZE(B), %mm1 + movd 2 * SIZE(B), %mm2 + movd 3 * SIZE(B), %mm3 + movd 4 * SIZE(B), %mm4 + movd 5 * SIZE(B), %mm5 + movd 6 * SIZE(B), %mm6 + movd 7 * SIZE(B), %mm7 + + punpckldq %mm0, %mm0 + punpckldq %mm1, %mm1 + punpckldq %mm2, %mm2 + punpckldq %mm3, %mm3 + punpckldq %mm4, %mm4 + punpckldq %mm5, %mm5 + punpckldq %mm6, %mm6 + punpckldq %mm7, %mm7 + + movq %mm0, 0 * SIZE(BO) + movq %mm0, 2 * SIZE(BO) + movq %mm1, 4 * SIZE(BO) + movq %mm1, 6 * SIZE(BO) + movq %mm2, 8 * SIZE(BO) + movq %mm2, 10 * SIZE(BO) + movq %mm3, 12 * SIZE(BO) + movq %mm3, 14 * SIZE(BO) + movq %mm4, 16 * SIZE(BO) + movq %mm4, 18 * SIZE(BO) + movq %mm5, 20 * SIZE(BO) + movq %mm5, 22 * SIZE(BO) + movq %mm6, 24 * SIZE(BO) + movq %mm6, 26 * SIZE(BO) + movq %mm7, 28 * SIZE(BO) + movq %mm7, 30 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO +#endif + + decq %rax + jne .L102 + ALIGN_4 + +.L103: + movq K, %rax + andq $7, %rax + BRANCH + jle .L110 + ALIGN_4 + +.L104: +#if defined(PENTIUM4) || defined(GENERIC) + movss 0 * SIZE(B), %xmm0 + shufps $0, %xmm0, %xmm0 + movaps %xmm0, 0 * SIZE(BO) +#endif + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + movd 0 * SIZE(B), %mm0 + punpckldq %mm0, %mm0 + movq %mm0, 0 * SIZE(BO) + movq %mm0, 2 * SIZE(BO) +#endif + + addq $ 1 * SIZE, B + addq $ 4 * SIZE, BO + decq %rax + jne .L104 + ALIGN_4 + +.L110: + movq C, CO1 # coffset1 = c + movq A, AO # aoffset = a + + movq M, I + sarq $3, I # i = (m >> 3) + jle .L120 + ALIGN_4 + +.L111: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -16 * SIZE(AO), %xmm10 + movaps 0 * SIZE(AO), %xmm12 + movaps 16 * SIZE(AO), %xmm14 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + + PREFETCHW 7 * SIZE(CO1) + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L115 + ALIGN_4 + +.L112: + mulps %xmm9, %xmm8 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + + mulps -28 * SIZE(AO), %xmm9 + addps %xmm8, %xmm0 + movaps -24 * SIZE(AO), %xmm8 + addps %xmm9, %xmm4 + movaps 4 * SIZE(BO), %xmm9 + + mulps %xmm9, %xmm8 + mulps -20 * SIZE(AO), %xmm9 + addps %xmm8, %xmm0 + movaps 32 * SIZE(AO), %xmm8 + addps %xmm9, %xmm4 + movaps 8 * SIZE(BO), %xmm9 + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH 
(PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm9, %xmm10 + mulps -12 * SIZE(AO), %xmm9 + addps %xmm10, %xmm0 + movaps -8 * SIZE(AO), %xmm10 + addps %xmm9, %xmm4 + movaps 12 * SIZE(BO), %xmm9 + + mulps %xmm9, %xmm10 + mulps -4 * SIZE(AO), %xmm9 + addps %xmm10, %xmm0 + movaps 48 * SIZE(AO), %xmm10 + addps %xmm9, %xmm4 + movaps 32 * SIZE(BO), %xmm9 + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) +#endif + mulps %xmm11, %xmm12 + mulps 4 * SIZE(AO), %xmm11 + addps %xmm12, %xmm0 + movaps 8 * SIZE(AO), %xmm12 + addps %xmm11, %xmm4 + movaps 20 * SIZE(BO), %xmm11 + + mulps %xmm11, %xmm12 + mulps 12 * SIZE(AO), %xmm11 + addps %xmm12, %xmm0 + movaps 64 * SIZE(AO), %xmm12 + addps %xmm11, %xmm4 + movaps 24 * SIZE(BO), %xmm11 + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) +#endif + mulps %xmm11, %xmm14 + mulps 20 * SIZE(AO), %xmm11 + addps %xmm14, %xmm0 + movaps 24 * SIZE(AO), %xmm14 + addps %xmm11, %xmm4 + movaps 28 * SIZE(BO), %xmm11 + + mulps %xmm11, %xmm14 + mulps 28 * SIZE(AO), %xmm11 + addps %xmm14, %xmm0 + movaps 80 * SIZE(AO), %xmm14 + addps %xmm11, %xmm4 + movaps 48 * SIZE(BO), %xmm11 + + addq $64 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L112 + ALIGN_4 + +.L115: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulps %xmm9, %xmm8 + mulps -28 * SIZE(AO), %xmm9 + addps %xmm8, %xmm0 + movaps -24 * SIZE(AO), %xmm8 + addps %xmm9, %xmm4 + movaps 4 * SIZE(BO), %xmm9 + + addq $8 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L116 + ALIGN_4 + +.L118: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 4 * SIZE(CO1), %xmm9 + movhps 6 * SIZE(CO1), %xmm9 +#endif + + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm4 +#ifndef TRMMKERNEL + addps %xmm8, %xmm0 + addps %xmm9, %xmm4 +#endif + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm4, 4 * SIZE(CO1) + movhps %xmm4, 6 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L111 + ALIGN_4 + +.L120: + testq $4, M + je .L130 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -16 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L125 + ALIGN_4 + +.L122: + mulps %xmm8, %xmm9 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps -28 * SIZE(AO), 
%xmm8 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 32 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps -24 * SIZE(AO), %xmm8 + mulps 8 * SIZE(BO), %xmm8 + addps %xmm8, %xmm2 + movaps -20 * SIZE(AO), %xmm8 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm8, %xmm3 + movaps 0 * SIZE(AO), %xmm8 + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm10, %xmm11 + movaps -12 * SIZE(AO), %xmm10 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm11, %xmm0 + movaps 48 * SIZE(BO), %xmm11 + addps %xmm10, %xmm1 + movaps -8 * SIZE(AO), %xmm10 + mulps 24 * SIZE(BO), %xmm10 + addps %xmm10, %xmm2 + movaps -4 * SIZE(AO), %xmm10 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm10, %xmm3 + movaps 16 * SIZE(AO), %xmm10 + + addq $32 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L122 + ALIGN_4 + +.L125: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L128 + ALIGN_4 + +.L126: + mulps %xmm8, %xmm9 + movaps -28 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + + addq $4 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L126 + ALIGN_4 + +.L128: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 +#endif + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + addps %xmm2, %xmm0 + + mulps %xmm15, %xmm0 +#ifndef TRMMKERNEL + addps %xmm8, %xmm0 +#endif + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + +.L130: + testq $2, M + je .L140 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -24 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L135 + ALIGN_4 + +.L132: + mulps %xmm8, %xmm9 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movsd -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd -28 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + movsd -26 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movaps 12 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + movsd -16 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movaps 32 * SIZE(BO), %xmm9 + + mulps %xmm10, %xmm11 + movsd -22 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm11 + movsd -20 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm11 + movsd -18 * SIZE(AO), 
%xmm10 + addps %xmm11, %xmm0 + movaps 28 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm11 + movsd -8 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movaps 48 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L132 + ALIGN_4 + +.L135: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L138 + ALIGN_4 + +.L136: + mulps %xmm8, %xmm9 + movsd -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L136 + ALIGN_4 + +.L138: + addps %xmm1, %xmm0 + mulps %xmm15, %xmm0 + +#ifndef TRMMKERNEL +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd 0 * SIZE(CO1), %xmm8 + addps %xmm8, %xmm0 +#endif + + movlps %xmm0, 0 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 4 + ALIGN_4 + +.L140: + testq $1, M + je .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + movss -32 * SIZE(AO), %xmm8 + movss -28 * SIZE(AO), %xmm10 + + movss 0 * SIZE(BO), %xmm9 + movss 16 * SIZE(BO), %xmm11 + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L145 + ALIGN_4 + +.L142: + mulss %xmm8, %xmm9 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movss -31 * SIZE(AO), %xmm8 + mulss 4 * SIZE(BO), %xmm8 + addss %xmm9, %xmm0 + movss 32 * SIZE(BO), %xmm9 + addss %xmm8, %xmm1 + movss -30 * SIZE(AO), %xmm8 + mulss 8 * SIZE(BO), %xmm8 + addss %xmm8, %xmm2 + movss -29 * SIZE(AO), %xmm8 + mulss 12 * SIZE(BO), %xmm8 + addss %xmm8, %xmm3 + movss -24 * SIZE(AO), %xmm8 + mulss %xmm10, %xmm11 + movss -27 * SIZE(AO), %xmm10 + mulss 20 * SIZE(BO), %xmm10 + addss %xmm11, %xmm0 + movss 48 * SIZE(BO), %xmm11 + addss %xmm10, %xmm1 + movss -26 * SIZE(AO), %xmm10 + mulss 24 * SIZE(BO), %xmm10 + addss %xmm10, %xmm2 + movss -25 * SIZE(AO), %xmm10 + mulss 28 * SIZE(BO), %xmm10 + addss %xmm10, %xmm3 + movss -20 * SIZE(AO), %xmm10 + + addq $ 8 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L142 + ALIGN_4 + +.L145: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movss ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L148 + ALIGN_4 + +.L146: + mulss %xmm8, %xmm9 + movss -31 * SIZE(AO), %xmm8 + addss %xmm9, %xmm0 + movss 4 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + decq %rax + jg .L146 + ALIGN_4 + +.L148: + addss %xmm1, %xmm0 + addss %xmm3, %xmm2 + addss %xmm2, %xmm0 + + mulss %xmm15, %xmm0 + +#ifndef TRMMKERNEL + movss 0 * SIZE(CO1), %xmm8 + addss %xmm8, %xmm0 +#endif + movss %xmm0, 0 * SIZE(CO1) + ALIGN_4 + +.L999: + movq %rbx, %rsp + + EMMS + + 
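+	/* Descriptive note (not in the original source): function epilogue.
+	   The callee-saved integer registers are restored below (plus
+	   rdi/rsi and xmm6-xmm15 under WINDOWS_ABI), the stack frame is
+	   released, and the kernel returns. */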
movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/gemm_kernel_8x4_sse3.S b/kernel/x86_64/gemm_kernel_8x4_sse3.S new file mode 100644 index 0000000000..c7954fefa0 --- /dev/null +++ b/kernel/x86_64/gemm_kernel_8x4_sse3.S @@ -0,0 +1,3022 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %rdi +#define N %rsi +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %r12 +#define BO %r13 +#define CO1 %r14 +#define CO2 %r15 +#define BB %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define ALPHA 0(%rsp) +#define J 16(%rsp) +#define OFFSET 24(%rsp) +#define KK 32(%rsp) +#define KKK 40(%rsp) +#define BUFFER 128(%rsp) + +#define PREFETCH prefetcht0 +#define PREFETCHSIZE 320 + +#define KERNEL1(address) \ + mulps %xmm8, %xmm9; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * SIZE(AO); \ + addps %xmm9, %xmm0; \ + movshdup 0 * SIZE + (address) * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm1; \ + movsldup 4 * SIZE + (address) * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm2; \ + movshdup 4 * SIZE + (address) * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + movaps 4 * SIZE + (address) * SIZE(AO), %xmm8; \ + addps %xmm9, %xmm3; \ + movsldup 0 * SIZE + (address) * SIZE(BO), %xmm9 + +#define KERNEL2(address) \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm4; \ + movshdup 0 * SIZE + (address) * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm5; \ + movsldup 4 * SIZE + (address) * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm6; \ + movshdup 4 * SIZE + (address) * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + movaps 8 * SIZE + (address) * SIZE(AO), %xmm8; \ + addps %xmm9, %xmm7; \ + movsldup 8 * SIZE + (address) * SIZE(BO), %xmm9 + +#define KERNEL3(address) \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm0; \ + movshdup 8 * SIZE + (address) * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm1; \ + movsldup 12 * SIZE + (address) * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm2; \ + movshdup 12 * SIZE + (address) * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + movaps 12 * SIZE + (address) * SIZE(AO), %xmm8; \ + addps %xmm9, %xmm3; \ + movsldup 8 * SIZE + (address) * SIZE(BO), %xmm9 + +#define KERNEL4(address) \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm4; \ + movshdup 8 * SIZE + (address) * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm5; \ + movsldup 12 * SIZE + (address) * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm6; \ + movshdup 12 * SIZE + (address) * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + movaps 64 * SIZE + (address) * SIZE(AO), %xmm8; \ + addps %xmm9, %xmm7; \ + movsldup 64 * SIZE + (address) * SIZE(BO), %xmm9 + +#define KERNEL5(address) \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm0; \ + movshdup 16 * SIZE + (address) * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm1; \ + movsldup 20 * SIZE + (address) * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm2; \ + movshdup 20 * SIZE + (address) * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + movaps 20 * SIZE + (address) * SIZE(AO), %xmm10; \ + addps %xmm11, %xmm3; \ + movsldup 16 * SIZE + (address) * SIZE(BO), %xmm11 + +#define KERNEL6(address) \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm4; \ + movshdup 16 * SIZE + (address) * SIZE(BO), %xmm11; \ + mulps %xmm10, 
%xmm11; \ + addps %xmm11, %xmm5; \ + movsldup 20 * SIZE + (address) * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm6; \ + movshdup 20 * SIZE + (address) * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + movaps 24 * SIZE + (address) * SIZE(AO), %xmm10; \ + addps %xmm11, %xmm7; \ + movsldup 24 * SIZE + (address) * SIZE(BO), %xmm11 + +#define KERNEL7(address) \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm0; \ + movshdup 24 * SIZE + (address) * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm1; \ + movsldup 28 * SIZE + (address) * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm2; \ + movshdup 28 * SIZE + (address) * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + movaps 28 * SIZE + (address) * SIZE(AO), %xmm10; \ + addps %xmm11, %xmm3; \ + movsldup 24 * SIZE + (address) * SIZE(BO), %xmm11 + +#define KERNEL8(address) \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm4; \ + movshdup 24 * SIZE + (address) * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm5; \ + movsldup 28 * SIZE + (address) * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm6; \ + movshdup 28 * SIZE + (address) * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + movaps 80 * SIZE + (address) * SIZE(AO), %xmm10; \ + addps %xmm11, %xmm7; \ + movsldup 80 * SIZE + (address) * SIZE(BO), %xmm11 + +#define KERNEL9(address) \ + mulps %xmm12, %xmm13; \ + PREFETCH (PREFETCHSIZE + 32) * SIZE + (address) * SIZE(AO); \ + addps %xmm13, %xmm0; \ + movshdup 32 * SIZE + (address) * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm1; \ + movsldup 36 * SIZE + (address) * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm2; \ + movshdup 36 * SIZE + (address) * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + movaps 36 * SIZE + (address) * SIZE(AO), %xmm12; \ + addps %xmm13, %xmm3; \ + movsldup 32 * SIZE + (address) * SIZE(BO), %xmm13 + +#define KERNEL10(address) \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm4; \ + movshdup 32 * SIZE + (address) * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm5; \ + movsldup 36 * SIZE + (address) * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm6; \ + movshdup 36 * SIZE + (address) * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + movaps 40 * SIZE + (address) * SIZE(AO), %xmm12; \ + addps %xmm13, %xmm7; \ + movsldup 40 * SIZE + (address) * SIZE(BO), %xmm13 + +#define KERNEL11(address) \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm0; \ + movshdup 40 * SIZE + (address) * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm1; \ + movsldup 44 * SIZE + (address) * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm2; \ + movshdup 44 * SIZE + (address) * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + movaps 44 * SIZE + (address) * SIZE(AO), %xmm12; \ + addps %xmm13, %xmm3; \ + movsldup 40 * SIZE + (address) * SIZE(BO), %xmm13 + +#define KERNEL12(address) \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm4; \ + movshdup 40 * SIZE + (address) * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm5; \ + movsldup 44 * SIZE + (address) * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm6; \ + movshdup 44 * SIZE + (address) * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + movaps 96 * SIZE + (address) * SIZE(AO), %xmm12; \ + addps %xmm13, %xmm7; \ + movsldup 96 * SIZE + (address) * SIZE(BO), %xmm13 + +#define KERNEL13(address) \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm0; \ + movshdup 48 * SIZE + 
(address) * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm1; \ + movsldup 52 * SIZE + (address) * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm2; \ + movshdup 52 * SIZE + (address) * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + movaps 52 * SIZE + (address) * SIZE(AO), %xmm14; \ + addps %xmm15, %xmm3; \ + movsldup 48 * SIZE + (address) * SIZE(BO), %xmm15 + +#define KERNEL14(address) \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm4; \ + movshdup 48 * SIZE + (address) * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm5; \ + movsldup 52 * SIZE + (address) * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm6; \ + movshdup 52 * SIZE + (address) * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + movaps 56 * SIZE + (address) * SIZE(AO), %xmm14; \ + addps %xmm15, %xmm7; \ + movsldup 56 * SIZE + (address) * SIZE(BO), %xmm15 + +#define KERNEL15(address) \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm0; \ + movshdup 56 * SIZE + (address) * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm1; \ + movsldup 60 * SIZE + (address) * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm2; \ + movshdup 60 * SIZE + (address) * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + movaps 60 * SIZE + (address) * SIZE(AO), %xmm14; \ + addps %xmm15, %xmm3; \ + movsldup 56 * SIZE + (address) * SIZE(BO), %xmm15 + +#define KERNEL16(address) \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm4; \ + movshdup 56 * SIZE + (address) * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm5; \ + movsldup 60 * SIZE + (address) * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm6; \ + movshdup 60 * SIZE + (address) * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + movaps 112 * SIZE + (address) * SIZE(AO), %xmm14; \ + addps %xmm15, %xmm7; \ + movsldup 112 * SIZE + (address) * SIZE(BO), %xmm15 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, M + movq ARG2, N + movq ARG3, K + + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm4 +#endif + movaps %xmm3, %xmm0 + +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm4 +#endif + +#endif + + movq %rsp, %rbx # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + shufps $0, %xmm0, %xmm0 + movaps %xmm0, ALPHA + +#ifdef TRMMKERNEL + movsd %xmm4, OFFSET + movsd %xmm4, KK +#ifndef LEFT + negq KK +#endif +#endif + + leaq (, LDC, SIZE), LDC + + movq N, J + sarq $2, J # j = (n >> 2) + jle .L50 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + + movq K, %rax + sarq $2, %rax + jle .L03 + ALIGN_4 + +.L02: + movddup 0 * SIZE(B), %xmm0 + movddup 2 * SIZE(B), %xmm1 + movddup 4 * SIZE(B), %xmm2 + movddup 6 * SIZE(B), %xmm3 + movddup 8 * SIZE(B), %xmm4 + movddup 10 * SIZE(B), %xmm5 + movddup 12 * SIZE(B), %xmm6 + movddup 14 * SIZE(B), %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps 
%xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + prefetcht1 128 * SIZE(BO) + prefetcht0 112 * SIZE(B) + + addq $16 * SIZE, B + addq $32 * SIZE, BO + + decq %rax + jne .L02 + ALIGN_4 + +.L03: + movq K, %rax + andq $3, %rax + BRANCH + jle .L10 + ALIGN_4 + +.L04: + movddup 0 * SIZE(B), %xmm0 + movddup 2 * SIZE(B), %xmm1 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + + addq $4 * SIZE, B + addq $8 * SIZE, BO + decq %rax + jne .L04 + ALIGN_4 + +.L10: + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + leaq 112 * SIZE(B), BB + + movq M, I + sarq $3, I # i = (m >> 3) + jle .L20 + ALIGN_4 + +.L11: + prefetcht0 0 * SIZE(BB) + prefetcht0 8 * SIZE(BB) + subq $-16 * SIZE, BB + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + movaps 32 * SIZE(AO), %xmm12 + movaps 48 * SIZE(AO), %xmm14 + + movsldup 0 * SIZE(BO), %xmm9 + movsldup 16 * SIZE(BO), %xmm11 + movsldup 32 * SIZE(BO), %xmm13 + movsldup 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + + prefetchnta 8 * SIZE(CO1) + pxor %xmm4, %xmm4 + prefetchnta 8 * SIZE(CO2) + pxor %xmm5, %xmm5 + prefetchnta 8 * SIZE(CO1, LDC, 2) + pxor %xmm6, %xmm6 + prefetchnta 8 * SIZE(CO2, LDC, 2) + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + +#if 1 + andq $-8, %rax + salq $4, %rax + je .L15 + +.L1X: + KERNEL1 (64 * 0) + KERNEL2 (64 * 0) + KERNEL3 (64 * 0) + KERNEL4 (64 * 0) + KERNEL5 (64 * 0) + KERNEL6 (64 * 0) + KERNEL7 (64 * 0) + KERNEL8 (64 * 0) + KERNEL9 (64 * 0) + KERNEL10(64 * 0) + KERNEL11(64 * 0) + KERNEL12(64 * 0) + KERNEL13(64 * 0) + KERNEL14(64 * 0) + KERNEL15(64 * 0) + KERNEL16(64 * 0) + cmpq $128 * 1, %rax + NOBRANCH + jle .L12 + KERNEL1 (64 * 1) + KERNEL2 (64 * 1) + KERNEL3 (64 * 1) + KERNEL4 (64 * 1) + KERNEL5 (64 * 1) + KERNEL6 (64 * 1) + KERNEL7 (64 * 1) + KERNEL8 (64 * 1) + KERNEL9 (64 * 1) + KERNEL10(64 * 1) + KERNEL11(64 * 1) + KERNEL12(64 * 1) + KERNEL13(64 * 1) + KERNEL14(64 * 1) + KERNEL15(64 * 1) + KERNEL16(64 * 1) + cmpq $128 * 2, %rax + NOBRANCH + jle .L12 + KERNEL1 (64 * 2) + KERNEL2 (64 * 2) + KERNEL3 (64 * 2) + KERNEL4 (64 * 2) + KERNEL5 (64 * 2) + KERNEL6 (64 * 2) + KERNEL7 (64 * 2) + KERNEL8 (64 * 2) + KERNEL9 (64 * 2) + KERNEL10(64 * 2) + KERNEL11(64 * 2) + KERNEL12(64 * 2) + KERNEL13(64 * 2) + KERNEL14(64 * 2) + KERNEL15(64 * 2) + KERNEL16(64 * 2) + cmpq $128 * 3, %rax + NOBRANCH + jle .L12 + KERNEL1 (64 * 3) + KERNEL2 (64 * 3) + KERNEL3 (64 * 3) + KERNEL4 (64 * 3) + KERNEL5 (64 * 3) + KERNEL6 (64 * 3) + KERNEL7 (64 * 3) + KERNEL8 (64 * 3) + KERNEL9 (64 * 3) + KERNEL10(64 * 3) + KERNEL11(64 * 3) + KERNEL12(64 * 3) + KERNEL13(64 * 3) + KERNEL14(64 * 3) + KERNEL15(64 * 3) + KERNEL16(64 * 3) + cmpq $128 * 4, %rax + NOBRANCH + jle .L12 + KERNEL1 (64 * 4) + KERNEL2 (64 * 4) + KERNEL3 (64 * 4) + KERNEL4 (64 * 4) + 
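+	/* Descriptive note (not in the original source): the .L1X sequence
+	   continues here. KERNEL1-KERNEL16 expand to the software-pipelined
+	   multiply-accumulate blocks defined above; each group of sixteen
+	   covers eight k iterations before the remaining count is checked
+	   against the next 128-step threshold. */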
KERNEL5 (64 * 4) + KERNEL6 (64 * 4) + KERNEL7 (64 * 4) + KERNEL8 (64 * 4) + KERNEL9 (64 * 4) + KERNEL10(64 * 4) + KERNEL11(64 * 4) + KERNEL12(64 * 4) + KERNEL13(64 * 4) + KERNEL14(64 * 4) + KERNEL15(64 * 4) + KERNEL16(64 * 4) + cmpq $128 * 5, %rax + NOBRANCH + jle .L12 + KERNEL1 (64 * 5) + KERNEL2 (64 * 5) + KERNEL3 (64 * 5) + KERNEL4 (64 * 5) + KERNEL5 (64 * 5) + KERNEL6 (64 * 5) + KERNEL7 (64 * 5) + KERNEL8 (64 * 5) + KERNEL9 (64 * 5) + KERNEL10(64 * 5) + KERNEL11(64 * 5) + KERNEL12(64 * 5) + KERNEL13(64 * 5) + KERNEL14(64 * 5) + KERNEL15(64 * 5) + KERNEL16(64 * 5) + cmpq $128 * 6, %rax + NOBRANCH + jle .L12 + KERNEL1 (64 * 6) + KERNEL2 (64 * 6) + KERNEL3 (64 * 6) + KERNEL4 (64 * 6) + KERNEL5 (64 * 6) + KERNEL6 (64 * 6) + KERNEL7 (64 * 6) + KERNEL8 (64 * 6) + KERNEL9 (64 * 6) + KERNEL10(64 * 6) + KERNEL11(64 * 6) + KERNEL12(64 * 6) + KERNEL13(64 * 6) + KERNEL14(64 * 6) + KERNEL15(64 * 6) + KERNEL16(64 * 6) + cmpq $128 * 7, %rax + NOBRANCH + jle .L12 + KERNEL1 (64 * 7) + KERNEL2 (64 * 7) + KERNEL3 (64 * 7) + KERNEL4 (64 * 7) + KERNEL5 (64 * 7) + KERNEL6 (64 * 7) + KERNEL7 (64 * 7) + KERNEL8 (64 * 7) + KERNEL9 (64 * 7) + KERNEL10(64 * 7) + KERNEL11(64 * 7) + KERNEL12(64 * 7) + KERNEL13(64 * 7) + KERNEL14(64 * 7) + KERNEL15(64 * 7) + KERNEL16(64 * 7) + + addq $64 * 8 * SIZE, AO + addq $64 * 8 * SIZE, BO + subq $128 * 8, %rax + jg .L1X + +.L12: + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#else + sarq $3, %rax + je .L15 + ALIGN_4 + +.L12: + KERNEL1 (64 * 0) + KERNEL2 (64 * 0) + KERNEL3 (64 * 0) + KERNEL4 (64 * 0) + KERNEL5 (64 * 0) + KERNEL6 (64 * 0) + KERNEL7 (64 * 0) + KERNEL8 (64 * 0) + KERNEL9 (64 * 0) + KERNEL10(64 * 0) + KERNEL11(64 * 0) + KERNEL12(64 * 0) + KERNEL13(64 * 0) + KERNEL14(64 * 0) + KERNEL15(64 * 0) + KERNEL16(64 * 0) + + addq $64 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L12 +#endif + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_4 + +.L16: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movsldup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movshdup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movsldup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm4 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm5 + movsldup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm6 + movshdup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 8 * SIZE(AO), %xmm8 + addps %xmm9, %xmm7 + movsldup 8 * SIZE(BO), %xmm9 + + addq $8 * SIZE, AO + addq $8 * SIZE, BO + decq %rax + jg .L16 + ALIGN_4 + +.L18: +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm8 + mulps %xmm15, %xmm0 + movhps 2 * SIZE(CO1), %xmm8 + mulps %xmm15, %xmm1 + movsd 4 * SIZE(CO1), %xmm9 + mulps %xmm15, %xmm2 + movhps 6 * SIZE(CO1), %xmm9 + mulps %xmm15, %xmm3 + movsd 0 * SIZE(CO2), %xmm10 + mulps %xmm15, %xmm4 + movhps 2 * SIZE(CO2), %xmm10 + mulps %xmm15, %xmm5 + movsd 4 * SIZE(CO2), %xmm11 + mulps %xmm15, %xmm6 + movhps 6 * SIZE(CO2), %xmm11 + mulps %xmm15, %xmm7 + + movsd 0 * SIZE(CO1, LDC, 2), %xmm12 + movhps 2 * SIZE(CO1, LDC, 2), %xmm12 + movsd 4 * SIZE(CO1, LDC, 2), %xmm13 + movhps 6 * SIZE(CO1, LDC, 2), %xmm13 + movsd 0 * SIZE(CO2, LDC, 2), %xmm14 + movhps 2 * SIZE(CO2, LDC, 2), %xmm14 + movsd 4 * SIZE(CO2, LDC, 2), %xmm15 + movhps 6 * SIZE(CO2, LDC, 2), %xmm15 + + addps %xmm8, %xmm0 + addps %xmm9, %xmm4 + addps %xmm10, %xmm1 + addps %xmm11, %xmm5 + addps %xmm12, %xmm2 + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + addps %xmm13, %xmm6 + movsd %xmm4, 4 * SIZE(CO1) + movhps %xmm4, 6 * SIZE(CO1) + addps %xmm14, %xmm3 + movsd %xmm1, 0 * SIZE(CO2) + movhps %xmm1, 2 * SIZE(CO2) + addps %xmm15, %xmm7 + movsd %xmm5, 4 * SIZE(CO2) + movhps %xmm5, 6 * SIZE(CO2) +#else + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm1 + mulps %xmm15, %xmm2 + mulps %xmm15, %xmm3 + mulps %xmm15, %xmm4 + mulps %xmm15, %xmm5 + mulps %xmm15, %xmm6 + mulps %xmm15, %xmm7 + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movsd %xmm4, 4 * SIZE(CO1) + movhps %xmm4, 6 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO2) + movhps %xmm1, 2 * SIZE(CO2) + movsd %xmm5, 4 * SIZE(CO2) + movhps %xmm5, 6 * SIZE(CO2) +#endif + + movsd %xmm2, 0 * SIZE(CO1, LDC, 2) + movhps %xmm2, 2 * SIZE(CO1, LDC, 2) + movsd %xmm6, 4 * SIZE(CO1, LDC, 2) + movhps %xmm6, 6 * SIZE(CO1, LDC, 2) + + movsd %xmm3, 0 * SIZE(CO2, LDC, 2) + movhps %xmm3, 2 * SIZE(CO2, LDC, 2) + movsd %xmm7, 4 * SIZE(CO2, LDC, 2) + movhps %xmm7, 6 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 4 + addq $8 * SIZE, CO2 # coffset += 4 + decq I # i -- + jg .L11 + ALIGN_4 + +.L20: + testq $4, M + je .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + + movsldup 0 * SIZE(BO), %xmm9 + movsldup 16 * SIZE(BO), %xmm11 + movsldup 32 * SIZE(BO), %xmm13 + movsldup 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L25 + ALIGN_4 + +.L22: + mulps %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addps %xmm9, %xmm0 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movsldup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movshdup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + 
movaps 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movsldup 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movshdup 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movsldup 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movshdup 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 8 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movsldup 64 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movshdup 16 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movsldup 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movshdup 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movaps 12 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movsldup 24 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movshdup 24 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movsldup 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movshdup 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movaps 32 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movsldup 80 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movshdup 32 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm1 + movsldup 36 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movshdup 36 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movaps 20 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movsldup 40 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movshdup 40 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm1 + movsldup 44 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movshdup 44 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movaps 24 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movsldup 96 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movshdup 48 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm1 + movsldup 52 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movshdup 52 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movaps 28 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movsldup 56 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movshdup 56 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm1 + movsldup 60 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movshdup 60 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movaps 48 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movsldup 112 * SIZE(BO), %xmm15 + + addq $32 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movsldup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movshdup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movsldup 8 * SIZE(BO), %xmm9 + + addq $4 * SIZE, AO + addq $8 * SIZE, BO + decq %rax + jg .L26 + ALIGN_4 + +.L28: +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 0 * SIZE(CO2), %xmm10 + movhps 2 * SIZE(CO2), %xmm10 + + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm1 + mulps %xmm15, %xmm2 + mulps %xmm15, %xmm3 + + movsd 0 * SIZE(CO1, LDC, 2), %xmm12 + movhps 2 * SIZE(CO1, LDC, 2), %xmm12 + movsd 0 * SIZE(CO2, LDC, 2), %xmm14 + movhps 2 * SIZE(CO2, LDC, 2), %xmm14 + + addps %xmm8, %xmm0 + addps %xmm10, %xmm1 + addps %xmm12, %xmm2 + addps %xmm14, %xmm3 +#else + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm1 + mulps %xmm15, %xmm2 + mulps %xmm15, %xmm3 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO2) + movhps %xmm1, 2 * SIZE(CO2) + + movsd %xmm2, 0 * SIZE(CO1, LDC, 2) + movhps %xmm2, 2 * SIZE(CO1, LDC, 2) + movsd %xmm3, 0 * SIZE(CO2, LDC, 2) + movhps %xmm3, 2 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L30: + testq $2, M + je .L40 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + movddup 0 * SIZE(AO), %xmm8 + movddup 8 * SIZE(AO), %xmm10 + movsd 0 * SIZE(BO), %xmm9 + movsd 32 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L35 + ALIGN_4 + +.L32: + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addps %xmm9, %xmm0 + movsd 4 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + movddup 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 8 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movsd 12 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + movddup 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movsd 16 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movsd 20 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + movddup 6 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 24 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movsd 28 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + movddup 16 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + movsd 64 * SIZE(BO), %xmm9 + addps %xmm11, %xmm0 + movsd 36 * SIZE(BO), %xmm11 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + movddup 10 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movsd 40 * SIZE(BO), %xmm11 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm2 + movsd 44 * SIZE(BO), %xmm11 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + movddup 12 * SIZE(AO), %xmm10 + addps %xmm11, %xmm3 + 
movsd 48 * SIZE(BO), %xmm11 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movsd 52 * SIZE(BO), %xmm11 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + movddup 14 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movsd 56 * SIZE(BO), %xmm11 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm2 + movsd 60 * SIZE(BO), %xmm11 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + movddup 24 * SIZE(AO), %xmm10 + addps %xmm11, %xmm3 + movsd 96 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movsd 4 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + movddup 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 8 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + decq %rax + jg .L36 + ALIGN_4 + +.L38: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm8 + movhps 0 * SIZE(CO2), %xmm8 + movsd 0 * SIZE(CO1, LDC, 2), %xmm9 + movhps 0 * SIZE(CO2, LDC, 2), %xmm9 +#endif + + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm1 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addps %xmm8, %xmm0 + addps %xmm9, %xmm1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 0 * SIZE(CO2) + movsd %xmm1, 0 * SIZE(CO1, LDC, 2) + movhps %xmm1, 0 * SIZE(CO2, LDC, 2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 4 + addq $2 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L40: + testq $1, M + je .L49 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + + movss 0 * SIZE(AO), %xmm8 + movss 4 * SIZE(AO), %xmm10 + movsd 0 * SIZE(BO), %xmm9 + movsd 32 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L45 + ALIGN_4 + +.L42: + shufps $0, %xmm8, %xmm8 + movhps 4 * SIZE(BO), %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulps %xmm8, %xmm9 + movss 1 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movsd 8 * SIZE(BO), %xmm9 + shufps $0, %xmm8, %xmm8 + movhps 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 16 * SIZE(BO), %xmm9 + shufps $0, %xmm8, %xmm8 + movhps 20 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss 3 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movsd 24 * SIZE(BO), %xmm9 + shufps $0, %xmm8, %xmm8 + movhps 28 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss 8 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 64 * SIZE(BO), %xmm9 + shufps $0, %xmm10, %xmm10 + movhps 36 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movss 5 
* SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movsd 40 * SIZE(BO), %xmm11 + shufps $0, %xmm10, %xmm10 + movhps 44 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movss 6 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movsd 48 * SIZE(BO), %xmm11 + shufps $0, %xmm10, %xmm10 + movhps 52 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movss 7 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movsd 56 * SIZE(BO), %xmm11 + shufps $0, %xmm10, %xmm10 + movhps 60 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movss 12 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movsd 96 * SIZE(BO), %xmm11 + + addq $ 8 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L42 + ALIGN_4 + +.L45: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L48 + ALIGN_4 + +.L46: + shufps $0, %xmm8, %xmm8 + movhps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss 1 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movsd 8 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L46 + ALIGN_4 + +.L48: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movss 0 * SIZE(CO1), %xmm8 + movss 0 * SIZE(CO2), %xmm9 + movss 0 * SIZE(CO1, LDC, 2), %xmm10 + movss 0 * SIZE(CO2, LDC, 2), %xmm11 +#endif + + addps %xmm1, %xmm0 + + mulps %xmm15, %xmm0 + + movhlps %xmm0, %xmm1 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addss %xmm0, %xmm8 + psrlq $32, %xmm0 + addss %xmm0, %xmm9 + addss %xmm1, %xmm10 + psrlq $32, %xmm1 + addss %xmm1, %xmm11 + + movss %xmm8, 0 * SIZE(CO1) + movss %xmm9, 0 * SIZE(CO2) + movss %xmm10, 0 * SIZE(CO1, LDC, 2) + movss %xmm11, 0 * SIZE(CO2, LDC, 2) +#else + movss %xmm0, 0 * SIZE(CO1) + psrlq $32, %xmm0 + movss %xmm0, 0 * SIZE(CO2) + movss %xmm1, 0 * SIZE(CO1, LDC, 2) + psrlq $32, %xmm1 + movss %xmm1, 0 * SIZE(CO2, LDC, 2) +#endif + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L49: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + leaq (C, LDC, 4), C # c += 4 * ldc + decq J # j -- + jg .L01 + +.L50: + testq $2, N + je .L100 + +.L51: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + + movq K, %rax + sarq $3, %rax + jle .L53 + ALIGN_4 + +.L52: + movddup 0 * SIZE(B), %xmm0 + movddup 2 * SIZE(B), %xmm1 + movddup 4 * SIZE(B), %xmm2 + movddup 6 * SIZE(B), %xmm3 + movddup 8 * SIZE(B), %xmm4 + movddup 10 * SIZE(B), %xmm5 + movddup 12 * SIZE(B), %xmm6 + movddup 14 * SIZE(B), %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + prefetcht1 128 * SIZE(BO) + prefetcht0 112 * SIZE(B) + + addq $16 * SIZE, B + addq $32 * SIZE, BO + + decq %rax + jne .L52 + ALIGN_4 + +.L53: + movq K, %rax + andq $7, %rax + BRANCH + jle .L60 + ALIGN_4 + +.L54: + movddup 0 * SIZE(B), %xmm0 + movaps %xmm0, 0 * SIZE(BO) + + addq $ 2 * SIZE, B + addq $ 4 * SIZE, BO + decq %rax + jne .L54 + ALIGN_4 + +.L60: + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + movq M, I + sarq $3, I # i = (m >> 3) + jle .L70 + ALIGN_4 + +.L61: +#if 
!defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + movaps 32 * SIZE(AO), %xmm12 + movaps 48 * SIZE(AO), %xmm14 + + movsldup 0 * SIZE(BO), %xmm9 + movsldup 16 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + prefetcht2 4 * SIZE(CO1) + pxor %xmm4, %xmm4 + prefetcht2 4 * SIZE(CO2) + pxor %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L65 + ALIGN_4 + +.L62: + mulps %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addps %xmm9, %xmm0 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsldup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm4 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 8 * SIZE(AO), %xmm8 + addps %xmm9, %xmm5 + movsldup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movshdup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 12 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsldup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm4 + movshdup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 64 * SIZE(AO), %xmm8 + addps %xmm9, %xmm5 + movsldup 8 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm0 + movshdup 8 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movaps 20 * SIZE(AO), %xmm10 + addps %xmm9, %xmm1 + movsldup 8 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm4 + movshdup 8 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movaps 24 * SIZE(AO), %xmm10 + addps %xmm9, %xmm5 + movsldup 12 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm0 + movshdup 12 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movaps 28 * SIZE(AO), %xmm10 + addps %xmm9, %xmm1 + movsldup 12 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm4 + movshdup 12 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movaps 80 * SIZE(AO), %xmm10 + addps %xmm9, %xmm5 + movsldup 32 * SIZE(BO), %xmm9 + mulps %xmm12, %xmm11 + PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) + addps %xmm11, %xmm0 + movshdup 16 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm11 + movaps 36 * SIZE(AO), %xmm12 + addps %xmm11, %xmm1 + movsldup 16 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm4 + movshdup 16 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm11 + movaps 40 * SIZE(AO), %xmm12 + addps %xmm11, %xmm5 + movsldup 20 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm0 + movshdup 20 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm11 + movaps 44 * SIZE(AO), %xmm12 + addps %xmm11, %xmm1 + movsldup 20 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm4 + movshdup 20 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm11 + movaps 96 * SIZE(AO), %xmm12 + addps %xmm11, %xmm5 + movsldup 24 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + addps %xmm11, %xmm0 + movshdup 24 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + movaps 52 * SIZE(AO), %xmm14 + addps %xmm11, %xmm1 + movsldup 24 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + addps %xmm11, %xmm4 + movshdup 24 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + movaps 56 * SIZE(AO), %xmm14 + addps %xmm11, %xmm5 + movsldup 28 * 
SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + addps %xmm11, %xmm0 + movshdup 28 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + movaps 60 * SIZE(AO), %xmm14 + addps %xmm11, %xmm1 + movsldup 28 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + addps %xmm11, %xmm4 + movshdup 28 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + movaps 112 * SIZE(AO), %xmm14 + addps %xmm11, %xmm5 + movsldup 48 * SIZE(BO), %xmm11 + + addq $64 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L62 + ALIGN_4 + +.L65: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsldup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm4 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 8 * SIZE(AO), %xmm8 + addps %xmm9, %xmm5 + movsldup 4 * SIZE(BO), %xmm9 + + addq $8 * SIZE, AO + addq $4 * SIZE, BO + decq %rax + jg .L66 + ALIGN_4 + +.L68: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 4 * SIZE(CO1), %xmm9 + movhps 6 * SIZE(CO1), %xmm9 + + movsd 0 * SIZE(CO2), %xmm10 + movhps 2 * SIZE(CO2), %xmm10 + movsd 4 * SIZE(CO2), %xmm11 + movhps 6 * SIZE(CO2), %xmm11 +#endif + + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm4 + mulps %xmm15, %xmm1 + mulps %xmm15, %xmm5 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addps %xmm8, %xmm0 + addps %xmm9, %xmm4 + addps %xmm10, %xmm1 + addps %xmm11, %xmm5 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movsd %xmm4, 4 * SIZE(CO1) + movhps %xmm4, 6 * SIZE(CO1) + + movsd %xmm1, 0 * SIZE(CO2) + movhps %xmm1, 2 * SIZE(CO2) + movsd %xmm5, 4 * SIZE(CO2) + movhps %xmm5, 6 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 4 + addq $8 * SIZE, CO2 # coffset += 4 + decq I # i -- + jg .L61 + ALIGN_4 + +.L70: + testq $4, M + je .L80 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movsldup 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(AO), %xmm10 + movsldup 16 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L75 + ALIGN_4 + +.L72: + mulps %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addps %xmm9, %xmm0 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsldup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movshdup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 8 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movsldup 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps 
%xmm9, %xmm0 + movshdup 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 12 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsldup 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movshdup 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 32 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movsldup 32 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movshdup 16 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movaps 20 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movsldup 20 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm2 + movshdup 20 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movaps 24 * SIZE(AO), %xmm10 + addps %xmm11, %xmm3 + movsldup 24 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movshdup 24 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movaps 28 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movsldup 28 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm2 + movshdup 28 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movaps 48 * SIZE(AO), %xmm10 + addps %xmm11, %xmm3 + movsldup 48 * SIZE(BO), %xmm11 + + addq $32 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L72 + ALIGN_4 + +.L75: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsldup 4 * SIZE(BO), %xmm9 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + decq %rax + jg .L76 + ALIGN_4 + +.L78: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 0 * SIZE(CO2), %xmm10 + movhps 2 * SIZE(CO2), %xmm10 +#endif + + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm1 + +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + addps %xmm8, %xmm0 + addps %xmm10, %xmm1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO2) + movhps %xmm1, 2 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L80: + testq $2, M + je .L90 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + + movddup 0 * SIZE(AO), %xmm8 + movddup 8 * SIZE(AO), %xmm10 + movsd 0 * SIZE(BO), %xmm9 + movsd 16 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L85 + ALIGN_4 + +.L82: + shufps $0x50, %xmm9, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulps %xmm8, %xmm9 + movddup 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movsd 4 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + movddup 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 8 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + movddup 6 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movsd 12 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + movddup 16 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 32 * SIZE(BO), %xmm9 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + movddup 10 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movsd 20 * SIZE(BO), %xmm11 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + movddup 12 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movsd 24 * SIZE(BO), %xmm11 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + movddup 14 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movsd 28 * SIZE(BO), %xmm11 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + movddup 24 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movsd 48 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L82 + ALIGN_4 + +.L85: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L88 + ALIGN_4 + +.L86: + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + movddup 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movsd 4 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + decq %rax + jg .L86 + ALIGN_4 + +.L88: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm8 + movhps 0 * SIZE(CO2), %xmm8 +#endif + + addps %xmm1, %xmm0 + mulps %xmm15, %xmm0 +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + addps %xmm8, %xmm0 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 0 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 4 + addq $2 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L90: + testq $1, M + je .L99 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + movss 0 * SIZE(AO), %xmm8 + movss 4 * SIZE(AO), %xmm10 + movsd 0 * SIZE(BO), %xmm9 + movsd 16 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L95 + ALIGN_4 + +.L92: + shufps $0, %xmm8, %xmm8 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulps %xmm8, %xmm9 + movss 1 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movsd 4 * SIZE(BO), %xmm9 + shufps $0, %xmm8, %xmm8 + mulps %xmm8, %xmm9 + movss 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 8 * SIZE(BO), %xmm9 + shufps $0, %xmm8, %xmm8 + mulps %xmm8, %xmm9 + movss 3 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movsd 12 * SIZE(BO), %xmm9 + shufps $0, %xmm8, %xmm8 + mulps %xmm8, %xmm9 + movss 8 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 32 * SIZE(BO), %xmm9 + shufps $0, %xmm10, %xmm10 + mulps %xmm10, %xmm11 + movss 5 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movsd 20 * SIZE(BO), %xmm11 + shufps $0, %xmm10, %xmm10 + mulps %xmm10, %xmm11 + movss 6 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movsd 24 * SIZE(BO), %xmm11 + shufps $0, %xmm10, %xmm10 + mulps %xmm10, %xmm11 + movss 7 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movsd 28 * SIZE(BO), %xmm11 + shufps $0, %xmm10, %xmm10 + mulps %xmm10, %xmm11 + movss 12 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movsd 48 * SIZE(BO), %xmm11 + + addq $ 8 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L92 + ALIGN_4 + +.L95: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L98 + ALIGN_4 + +.L96: + shufps $0, %xmm8, %xmm8 + mulps %xmm8, %xmm9 + movss 1 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movsd 4 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + decq %rax + jg .L96 + ALIGN_4 + +.L98: +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + movss 0 * SIZE(CO1), %xmm8 + movss 0 * SIZE(CO2), %xmm9 + + addps %xmm1, %xmm0 + mulps %xmm15, %xmm0 + addss %xmm0, %xmm8 + psrlq $32, %xmm0 + addss %xmm0, %xmm9 + + movss %xmm8, 0 * SIZE(CO1) + movss %xmm9, 0 * SIZE(CO2) +#else + addps %xmm1, %xmm0 + mulps %xmm15, %xmm0 + + movss %xmm0, 0 * SIZE(CO1) + psrlq $32, %xmm0 + movss %xmm0, 0 * SIZE(CO2) +#endif + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + leaq (C, LDC, 2), C # c += 4 * ldc + ALIGN_4 + +.L100: + testq $1, N + je .L999 + +.L101: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + + movq K, %rax + sarq $3, %rax + jle .L103 + ALIGN_4 + + +.L102: + movss 0 * SIZE(B), %xmm0 + movss 1 * SIZE(B), %xmm1 + movss 2 * SIZE(B), %xmm2 + movss 3 * SIZE(B), %xmm3 + movss 4 * SIZE(B), %xmm4 + movss 5 * SIZE(B), %xmm5 + movss 6 * SIZE(B), %xmm6 + movss 7 * SIZE(B), %xmm7 + + movss %xmm0, 0 * SIZE(BO) + movss %xmm0, 1 * SIZE(BO) + movss %xmm1, 2 * SIZE(BO) + movss %xmm1, 3 * SIZE(BO) + movss %xmm2, 4 * SIZE(BO) + movss %xmm2, 5 * SIZE(BO) + movss %xmm3, 6 * SIZE(BO) + movss %xmm3, 7 * SIZE(BO) + movss %xmm4, 8 * SIZE(BO) + movss %xmm4, 9 * SIZE(BO) + movss %xmm5, 10 * SIZE(BO) + movss %xmm5, 11 * SIZE(BO) + movss %xmm6, 12 * SIZE(BO) + movss %xmm6, 13 * SIZE(BO) + movss %xmm7, 14 * SIZE(BO) + movss %xmm7, 15 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $16 * SIZE, BO + + decq %rax + jne .L102 + ALIGN_4 + +.L103: + movq K, %rax + andq $7, %rax + BRANCH + jle .L110 + ALIGN_4 + +.L104: + movss 0 * SIZE(B), %xmm0 + movss %xmm0, 0 * SIZE(BO) + movss %xmm0, 1 * SIZE(BO) + + addq $ 1 * SIZE, B + addq $ 2 * SIZE, BO + decq %rax + jne .L104 + ALIGN_4 + +.L110: + movq C, CO1 # coffset1 = c + movq A, AO # aoffset = a + + movq M, I + sarq $3, I # i = (m >> 3) + jle .L120 + ALIGN_4 + +.L111: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + movaps 32 * SIZE(AO), %xmm12 + movaps 48 * SIZE(AO), %xmm14 + + movddup 0 * SIZE(BO), %xmm9 + movddup 8 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + prefetchnta 8 * SIZE(CO1) + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L115 + ALIGN_4 + +.L112: + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addps %xmm9, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 8 * SIZE(AO), %xmm8 + addps %xmm9, %xmm4 + movddup 2 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 12 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 64 * SIZE(AO), %xmm8 + addps 
%xmm9, %xmm5 + movddup 4 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movaps 20 * SIZE(AO), %xmm10 + addps %xmm9, %xmm0 + movddup 4 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movaps 24 * SIZE(AO), %xmm10 + addps %xmm9, %xmm4 + movddup 6 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movaps 28 * SIZE(AO), %xmm10 + addps %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movaps 80 * SIZE(AO), %xmm10 + addps %xmm9, %xmm5 + PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) + movddup 8 * SIZE(BO), %xmm9 + mulps %xmm12, %xmm9 + movaps 36 * SIZE(AO), %xmm12 + addps %xmm9, %xmm0 + movddup 16 * SIZE(BO), %xmm9 + mulps %xmm12, %xmm11 + movaps 40 * SIZE(AO), %xmm12 + addps %xmm11, %xmm4 + movddup 10 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm11 + movaps 44 * SIZE(AO), %xmm12 + addps %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm11 + movaps 96 * SIZE(AO), %xmm12 + addps %xmm11, %xmm5 + movddup 12 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + movaps 52 * SIZE(AO), %xmm14 + addps %xmm11, %xmm0 + movddup 12 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + movaps 56 * SIZE(AO), %xmm14 + addps %xmm11, %xmm4 + movddup 14 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + movaps 60 * SIZE(AO), %xmm14 + addps %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + movaps 112 * SIZE(AO), %xmm14 + addps %xmm11, %xmm5 + movddup 24 * SIZE(BO), %xmm11 + + addq $64 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L112 + ALIGN_4 + +.L115: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 8 * SIZE(AO), %xmm8 + addps %xmm9, %xmm4 + movddup 2 * SIZE(BO), %xmm9 + + addq $8 * SIZE, AO + addq $2 * SIZE, BO + decq %rax + jg .L116 + ALIGN_4 + +.L118: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 4 * SIZE(CO1), %xmm9 + movhps 6 * SIZE(CO1), %xmm9 +#endif + + addps %xmm1, %xmm0 + addps %xmm5, %xmm4 + + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm4 +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + addps %xmm8, %xmm0 + addps %xmm9, %xmm4 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movsd %xmm4, 4 * SIZE(CO1) + movhps %xmm4, 6 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L111 + ALIGN_4 + +.L120: + testq $4, M + je .L130 + + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + + movaps 0 * SIZE(AO), %xmm8 + movddup 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(AO), %xmm10 + movddup 8 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L125 + ALIGN_4 + +.L122: + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addps %xmm9, %xmm0 + movddup 2 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 8 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movddup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 12 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movddup 6 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 32 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movddup 16 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm11 + movaps 20 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movddup 10 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movaps 24 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movddup 12 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movaps 28 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movddup 14 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movaps 48 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movddup 24 * SIZE(BO), %xmm11 + + addq $32 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L122 + ALIGN_4 + +.L125: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L128 + ALIGN_4 + +.L126: + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movddup 2 * SIZE(BO), %xmm9 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + decq %rax + jg .L126 + ALIGN_4 + +.L128: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 +#endif + + addps %xmm1, %xmm0 + mulps %xmm15, %xmm0 + +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + addps %xmm8, %xmm0 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + +.L130: + testq $2, M + je .L140 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(AO), %xmm10 + movaps 16 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $4, %rax + je .L135 + ALIGN_4 + +.L132: + mulps %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movaps 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm8, %xmm1 + movaps 8 * SIZE(AO), %xmm8 + mulps 8 * SIZE(BO), %xmm8 + addps %xmm8, %xmm2 + movaps 12 * SIZE(AO), %xmm8 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm8, %xmm3 + movaps 32 * SIZE(AO), %xmm8 + movaps 32 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm11 + movaps 20 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movaps 48 * SIZE(BO), %xmm11 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm10, %xmm1 + movaps 24 * SIZE(AO), %xmm10 + mulps 24 * SIZE(BO), %xmm10 + addps %xmm10, %xmm2 + movaps 28 * SIZE(AO), %xmm10 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm10, %xmm3 + movaps 48 * SIZE(AO), %xmm10 + + addq $32 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L132 + ALIGN_4 + +.L135: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $15, %rax # if (k & 1) + BRANCH + je .L138 + ALIGN_4 + +.L136: + movsd 0 * SIZE(AO), %xmm8 + movsd 0 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + decq %rax + jg .L136 + ALIGN_4 + +.L138: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm8 +#endif + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + addps %xmm2, %xmm0 + + movhlps %xmm0, %xmm1 + addps %xmm1, %xmm0 + + mulps %xmm15, %xmm0 +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + addps %xmm8, %xmm0 +#endif + movsd %xmm0, 0 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 4 + ALIGN_4 + +.L140: + testq $1, M + je .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + + movss 0 * SIZE(AO), %xmm8 + movss 4 * SIZE(AO), %xmm10 + movss 0 * SIZE(BO), %xmm9 + movss 8 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L145 + ALIGN_4 + +.L142: + mulss %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movss 1 * SIZE(AO), %xmm8 + mulss 2 * SIZE(BO), %xmm8 + addss %xmm9, %xmm0 + movss 16 * SIZE(BO), %xmm9 + addss %xmm8, %xmm1 + movss 2 * SIZE(AO), %xmm8 + mulss 4 * SIZE(BO), %xmm8 + addss %xmm8, %xmm2 + movss 3 * SIZE(AO), %xmm8 + mulss 6 * SIZE(BO), %xmm8 + addss %xmm8, %xmm3 + movss 8 * SIZE(AO), %xmm8 + mulss %xmm10, %xmm11 + movss 5 * SIZE(AO), %xmm10 + mulss 10 * SIZE(BO), %xmm10 + addss %xmm11, %xmm0 + movss 24 * SIZE(BO), %xmm11 + addss %xmm10, %xmm1 + movss 6 * SIZE(AO), %xmm10 + mulss 12 * SIZE(BO), %xmm10 + addss %xmm10, %xmm2 + movss 7 * SIZE(AO), %xmm10 + mulss 14 * SIZE(BO), %xmm10 + addss %xmm10, %xmm3 + movss 12 * SIZE(AO), %xmm10 + + addq $ 8 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L142 + ALIGN_4 + +.L145: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movss ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L148 + ALIGN_4 + +.L146: + movss 0 * SIZE(AO), %xmm8 + movss 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + + addq $1 * SIZE, AO + addq $2 * SIZE, BO + decq %rax + jg .L146 + ALIGN_4 + +.L148: +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movss 0 * SIZE(CO1), %xmm8 +#endif + addss %xmm1, %xmm0 + addss %xmm3, %xmm2 + addss %xmm2, %xmm0 + + mulss %xmm15, %xmm0 +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + addss %xmm8, %xmm0 +#endif + movss %xmm0, 0 * SIZE(CO1) + ALIGN_4 + +.L999: + movq %rbx, %rsp + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/gemm_ncopy_2.S b/kernel/x86_64/gemm_ncopy_2.S new file mode 100644 index 0000000000..72c2b9d203 --- /dev/null +++ b/kernel/x86_64/gemm_ncopy_2.S @@ -0,0 +1,290 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
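For readers tracing the sgemm kernel that ends above: each of its remainder branches follows the same pattern, an unrolled K loop, a short scalar remainder loop, then a store phase that scales the accumulators by ALPHA and, only when neither TRMMKERNEL nor BETAZERO is defined, reads and adds the existing C values before writing back. A minimal C sketch of that store convention, with illustrative names (store_tile, acc and skip_c_load are not from the source; float becomes double in the DOUBLE build):

    /* Hypothetical reference for the store phase of the kernel above.
     * acc[] holds the K-loop accumulators for an m x n tile of C.       */
    static void store_tile(float *c, long ldc, long m, long n,
                           float alpha, const float *acc, int skip_c_load)
    {
        for (long j = 0; j < n; j++)
            for (long i = 0; i < m; i++) {
                float v = alpha * acc[i + j * m];
                if (!skip_c_load)            /* !TRMMKERNEL && !BETAZERO path */
                    v += c[i + j * ldc];     /* read-modify-write of C        */
                c[i + j * ldc] = v;
            }
    }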
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if defined(NEHALEM) +#define RPREFETCHSIZE 12 +#define WPREFETCHSIZE (RPREFETCHSIZE * 2) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#ifndef WINDOWS_ABI + +#define M ARG1 /* rdi */ +#define N ARG2 /* rsi */ +#define A ARG3 /* rdx */ +#define LDA ARG4 /* rcx */ +#define B ARG5 /* r8 */ + +#define I %r9 + +#else + +#define STACKSIZE 256 + +#define M ARG1 /* rcx */ +#define N ARG2 /* rdx */ +#define A ARG3 /* r8 */ +#define LDA ARG4 /* r9 */ +#define OLD_B 40 + 32 + STACKSIZE(%rsp) + +#define B %r14 +#define I %r15 + +#endif + +#define J %r10 +#define AO1 %r11 +#define AO2 %r12 +#define AO3 %r13 +#define AO4 %rax + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + pushq %r15 + pushq %r14 +#endif + pushq %r13 + pushq %r12 + +#ifdef WINDOWS_ABI + subq $STACKSIZE, %rsp + + movups %xmm6, 0(%rsp) + movups %xmm7, 16(%rsp) + movups %xmm8, 32(%rsp) + movups %xmm9, 48(%rsp) + movups %xmm10, 64(%rsp) + movups %xmm11, 80(%rsp) + movups %xmm12, 96(%rsp) + movups %xmm13, 112(%rsp) + movups %xmm14, 128(%rsp) + movups %xmm15, 144(%rsp) + + movq OLD_B, B +#endif + + leaq (,LDA, SIZE), LDA # Scaling + + movq N, J + sarq $1, J + jle .L20 + ALIGN_4 + +.L12: + movq A, AO1 + leaq (A, LDA), AO2 + leaq (A, LDA, 2), A + + movq M, I + sarq $2, I + jle .L14 + ALIGN_4 + +.L13: +#ifndef DOUBLE + movss 0 * SIZE(AO1), %xmm0 + movss 0 * SIZE(AO2), %xmm1 + movss 1 * SIZE(AO1), %xmm2 + movss 1 * SIZE(AO2), %xmm3 + movss 2 * SIZE(AO1), %xmm4 + movss 2 * SIZE(AO2), %xmm5 + movss 3 * SIZE(AO1), %xmm6 + movss 3 * SIZE(AO2), %xmm7 + + movss %xmm0, 0 * SIZE(B) + movss %xmm1, 1 * SIZE(B) + movss %xmm2, 2 * SIZE(B) + movss %xmm3, 3 * SIZE(B) + movss %xmm4, 4 * SIZE(B) + movss %xmm5, 5 * SIZE(B) + movss %xmm6, 6 * SIZE(B) + movss %xmm7, 7 * SIZE(B) +#else + PREFETCH RPREFETCHSIZE * SIZE(AO1) + + movsd 0 * SIZE(AO1), %xmm0 + movhpd 0 * SIZE(AO2), %xmm0 + movsd 1 * SIZE(AO1), %xmm1 + movhpd 1 * SIZE(AO2), %xmm1 + + PREFETCH RPREFETCHSIZE * SIZE(AO2) + + movsd 2 * SIZE(AO1), %xmm2 + movhpd 2 * SIZE(AO2), %xmm2 + movsd 3 * SIZE(AO1), %xmm3 + movhpd 3 * SIZE(AO2), %xmm3 + + PREFETCHW WPREFETCHSIZE * SIZE(B) + + movapd %xmm0, 0 * SIZE(B) + movapd %xmm1, 2 * SIZE(B) + movapd %xmm2, 4 * SIZE(B) + movapd %xmm3, 6 * SIZE(B) +#endif + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + subq $-8 * SIZE, B + decq I + jg .L13 + ALIGN_4 + +.L14: + movq M, I + andq $3, I + jle .L16 + ALIGN_4 + +.L15: +#ifndef DOUBLE + movss 0 * SIZE(AO1), %xmm0 + movss 0 * SIZE(AO2), %xmm1 + + movss %xmm0, 0 * SIZE(B) + movss %xmm1, 1 * SIZE(B) +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 0 * SIZE(AO2), %xmm0 + movapd %xmm0, 0 * SIZE(B) +#endif + + addq $SIZE, AO1 + addq $SIZE, AO2 + addq $2 * SIZE, B + decq I + jg .L15 + ALIGN_4 + +.L16: + decq J + jg .L12 + ALIGN_4 + +.L20: + testq $1, N + jle .L999 + + movq A, AO1 + + movq M, I + sarq $2, I + jle .L34 + ALIGN_4 + +.L33: +#ifndef DOUBLE + movss 0 * SIZE(AO1), %xmm0 + movss 1 * SIZE(AO1), %xmm1 + movss 2 * SIZE(AO1), %xmm2 + movss 3 * SIZE(AO1), %xmm3 + + movss %xmm0, 0 * SIZE(B) + movss %xmm1, 1 * SIZE(B) + movss %xmm2, 2 * SIZE(B) + movss %xmm3, 3 * SIZE(B) +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 1 * SIZE(AO1), %xmm0 + + movsd 2 * SIZE(AO1), %xmm1 + movhpd 3 * SIZE(AO1), %xmm1 + + movapd %xmm0, 0 * SIZE(B) + movapd %xmm1, 2 * SIZE(B) +#endif + + addq $4 * SIZE, AO1 + subq $-4 * SIZE, B + decq I + jg .L33 + ALIGN_4 + +.L34: + movq M, I + andq $3, I + jle .L999 + ALIGN_4 + 
+.L35: +#ifndef DOUBLE + movss 0 * SIZE(AO1), %xmm0 + movss %xmm0, 0 * SIZE(B) +#else + movsd 0 * SIZE(AO1), %xmm0 + movsd %xmm0, 0 * SIZE(B) +#endif + + addq $SIZE, AO1 + addq $1 * SIZE, B + decq I + jg .L35 + ALIGN_4 + + +.L999: +#ifdef WINDOWS_ABI + movups 0(%rsp), %xmm6 + movups 16(%rsp), %xmm7 + movups 32(%rsp), %xmm8 + movups 48(%rsp), %xmm9 + movups 64(%rsp), %xmm10 + movups 80(%rsp), %xmm11 + movups 96(%rsp), %xmm12 + movups 112(%rsp), %xmm13 + movups 128(%rsp), %xmm14 + movups 144(%rsp), %xmm15 + + addq $STACKSIZE, %rsp +#endif + + popq %r12 + popq %r13 + +#ifdef WINDOWS_ABI + popq %r14 + popq %r15 +#endif + ret + + EPILOGUE diff --git a/kernel/x86_64/gemm_ncopy_4.S b/kernel/x86_64/gemm_ncopy_4.S new file mode 100644 index 0000000000..a04542f6af --- /dev/null +++ b/kernel/x86_64/gemm_ncopy_4.S @@ -0,0 +1,470 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
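The gemm_ncopy_2.S routine that ends above packs two lda-strided vectors of A at a time into the contiguous buffer B, interleaving element i of the first with element i of the second, and finishes with a plain copy when N is odd. A rough C equivalent under those assumptions (gemm_ncopy_2_ref is an illustrative name; float stands in for the DOUBLE build's double):

    /* Sketch of the 2-wide packing performed by gemm_ncopy_2.S. */
    static void gemm_ncopy_2_ref(long m, long n,
                                 const float *a, long lda, float *b)
    {
        long j = 0;
        for (; j + 1 < n; j += 2) {              /* pairs of vectors   */
            const float *a0 = a + (j    ) * lda;
            const float *a1 = a + (j + 1) * lda;
            for (long i = 0; i < m; i++) {
                *b++ = a0[i];                    /* vector j           */
                *b++ = a1[i];                    /* vector j + 1       */
            }
        }
        if (n & 1) {                             /* leftover vector    */
            const float *a0 = a + j * lda;
            for (long i = 0; i < m; i++)
                *b++ = a0[i];
        }
    }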
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if defined(PENTIUM4) || defined(GENERIC) +#define RPREFETCHSIZE 16 +#define WPREFETCHSIZE (RPREFETCHSIZE * 4) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#define RPREFETCHSIZE 12 +#define WPREFETCHSIZE (RPREFETCHSIZE * 4) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht2 +#endif + +#ifdef ATOM +#define RPREFETCHSIZE 16 +#define WPREFETCHSIZE (RPREFETCHSIZE * 4) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#ifdef NANO +#define RPREFETCHSIZE 16 +#define WPREFETCHSIZE (RPREFETCHSIZE * 4) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#ifdef BARCELONA +#define RPREFETCHSIZE 16 +#define WPREFETCHSIZE (RPREFETCHSIZE * 4) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#endif + +#ifdef GENERIC +#define RPREFETCHSIZE 16 +#define WPREFETCHSIZE (RPREFETCHSIZE * 4) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#ifndef WINDOWS_ABI + +#define M ARG1 /* rdi */ +#define N ARG2 /* rsi */ +#define A ARG3 /* rdx */ +#define LDA ARG4 /* rcx */ +#define B ARG5 /* r8 */ + +#define I %r9 + +#else + +#define STACKSIZE 256 + +#define M ARG1 /* rcx */ +#define N ARG2 /* rdx */ +#define A ARG3 /* r8 */ +#define LDA ARG4 /* r9 */ +#define OLD_B 40 + 32 + STACKSIZE(%rsp) + +#define B %r14 +#define I %r15 + +#endif + +#define J %r10 +#define AO1 %r11 +#define AO2 %r12 +#define AO3 %r13 +#define AO4 %rax + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + pushq %r15 + pushq %r14 +#endif + pushq %r13 + pushq %r12 + +#ifdef WINDOWS_ABI + subq $STACKSIZE, %rsp + + movups %xmm6, 0(%rsp) + movups %xmm7, 16(%rsp) + movups %xmm8, 32(%rsp) + movups %xmm9, 48(%rsp) + movups %xmm10, 64(%rsp) + movups %xmm11, 80(%rsp) + movups %xmm12, 96(%rsp) + movups %xmm13, 112(%rsp) + movups %xmm14, 128(%rsp) + movups %xmm15, 144(%rsp) + + movq OLD_B, B +#endif + + leaq (,LDA, SIZE), LDA # Scaling + + movq N, J + sarq $2, J + jle .L20 + ALIGN_4 + +.L12: + movq A, AO1 + leaq (A, LDA), AO2 + leaq (A, LDA, 2), AO3 + leaq (AO2, LDA, 2), AO4 + leaq (A, LDA, 4), A + + movq M, I + sarq $2, I + jle .L14 + ALIGN_4 + +.L13: +#ifndef DOUBLE + movss 0 * SIZE(AO1), %xmm0 + movss 0 * SIZE(AO2), %xmm1 + movss 0 * SIZE(AO3), %xmm2 + movss 0 * SIZE(AO4), %xmm3 + + movss 1 * SIZE(AO1), %xmm4 + movss 1 * SIZE(AO2), %xmm5 + movss 1 * SIZE(AO3), %xmm6 + movss 1 * SIZE(AO4), %xmm7 + + movss 2 * SIZE(AO1), %xmm8 + movss 2 * SIZE(AO2), %xmm9 + movss 2 * SIZE(AO3), %xmm10 + movss 2 * SIZE(AO4), %xmm11 + + movss 3 * SIZE(AO1), %xmm12 + movss 3 * SIZE(AO2), %xmm13 + movss 3 * SIZE(AO3), %xmm14 + movss 3 * SIZE(AO4), %xmm15 + + movss %xmm0, 0 * SIZE(B) + movss %xmm1, 1 * SIZE(B) + movss %xmm2, 2 * SIZE(B) + movss %xmm3, 3 * SIZE(B) + movss %xmm4, 4 * SIZE(B) + movss %xmm5, 5 * SIZE(B) + movss %xmm6, 6 * SIZE(B) + movss %xmm7, 7 * SIZE(B) + + PREFETCH RPREFETCHSIZE * SIZE(AO1) + PREFETCH RPREFETCHSIZE * SIZE(AO2) + PREFETCH RPREFETCHSIZE * SIZE(AO3) + PREFETCH RPREFETCHSIZE * SIZE(AO4) + + PREFETCHW WPREFETCHSIZE * SIZE(B) + + movss %xmm8, 8 * SIZE(B) + movss %xmm9, 9 * SIZE(B) + movss %xmm10, 10 * SIZE(B) + movss %xmm11, 11 * SIZE(B) + movss %xmm12, 12 * SIZE(B) + movss %xmm13, 13 * SIZE(B) + movss %xmm14, 14 * SIZE(B) + movss %xmm15, 15 * SIZE(B) +#else + PREFETCH RPREFETCHSIZE * SIZE(AO1) + movsd 0 * SIZE(AO1), %xmm0 + movhpd 0 * SIZE(AO2), %xmm0 + movsd 1 * 
SIZE(AO1), %xmm2 + movhpd 1 * SIZE(AO2), %xmm2 + PREFETCH RPREFETCHSIZE * SIZE(AO2) + movsd 2 * SIZE(AO1), %xmm4 + movhpd 2 * SIZE(AO2), %xmm4 + movsd 3 * SIZE(AO1), %xmm6 + movhpd 3 * SIZE(AO2), %xmm6 + + PREFETCH RPREFETCHSIZE * SIZE(AO3) + movsd 0 * SIZE(AO3), %xmm1 + movhpd 0 * SIZE(AO4), %xmm1 + movsd 1 * SIZE(AO3), %xmm3 + movhpd 1 * SIZE(AO4), %xmm3 + PREFETCH RPREFETCHSIZE * SIZE(AO4) + movsd 2 * SIZE(AO3), %xmm5 + movhpd 2 * SIZE(AO4), %xmm5 + movsd 3 * SIZE(AO3), %xmm7 + movhpd 3 * SIZE(AO4), %xmm7 + + PREFETCHW WPREFETCHSIZE * SIZE(B) + movapd %xmm0, 0 * SIZE(B) + movapd %xmm1, 2 * SIZE(B) + movapd %xmm2, 4 * SIZE(B) + movapd %xmm3, 6 * SIZE(B) +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) + PREFETCHW (WPREFETCHSIZE + 8) * SIZE(B) +#endif + movapd %xmm4, 8 * SIZE(B) + movapd %xmm5, 10 * SIZE(B) + movapd %xmm6, 12 * SIZE(B) + movapd %xmm7, 14 * SIZE(B) +#endif + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + addq $4 * SIZE, AO3 + addq $4 * SIZE, AO4 + + subq $-16 * SIZE, B + decq I + jg .L13 + ALIGN_4 + +.L14: + movq M, I + andq $3, I + jle .L16 + ALIGN_4 + +.L15: +#ifndef DOUBLE + movss 0 * SIZE(AO1), %xmm0 + movss 0 * SIZE(AO2), %xmm1 + movss 0 * SIZE(AO3), %xmm2 + movss 0 * SIZE(AO4), %xmm3 + + movss %xmm0, 0 * SIZE(B) + movss %xmm1, 1 * SIZE(B) + movss %xmm2, 2 * SIZE(B) + movss %xmm3, 3 * SIZE(B) +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 0 * SIZE(AO2), %xmm0 + movsd 0 * SIZE(AO3), %xmm1 + movhpd 0 * SIZE(AO4), %xmm1 + + movapd %xmm0, 0 * SIZE(B) + movapd %xmm1, 2 * SIZE(B) +#endif + + addq $SIZE, AO1 + addq $SIZE, AO2 + addq $SIZE, AO3 + addq $SIZE, AO4 + addq $4 * SIZE, B + decq I + jg .L15 + ALIGN_4 + +.L16: + decq J + jg .L12 + ALIGN_4 + +.L20: + testq $2, N + jle .L30 + + movq A, AO1 + leaq (A, LDA), AO2 + leaq (A, LDA, 2), A + + movq M, I + sarq $2, I + jle .L24 + ALIGN_4 + +.L23: +#ifndef DOUBLE + movss 0 * SIZE(AO1), %xmm0 + movss 0 * SIZE(AO2), %xmm1 + movss 1 * SIZE(AO1), %xmm2 + movss 1 * SIZE(AO2), %xmm3 + movss 2 * SIZE(AO1), %xmm4 + movss 2 * SIZE(AO2), %xmm5 + movss 3 * SIZE(AO1), %xmm6 + movss 3 * SIZE(AO2), %xmm7 + + movss %xmm0, 0 * SIZE(B) + movss %xmm1, 1 * SIZE(B) + movss %xmm2, 2 * SIZE(B) + movss %xmm3, 3 * SIZE(B) + movss %xmm4, 4 * SIZE(B) + movss %xmm5, 5 * SIZE(B) + movss %xmm6, 6 * SIZE(B) + movss %xmm7, 7 * SIZE(B) + +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 0 * SIZE(AO2), %xmm0 + movsd 1 * SIZE(AO1), %xmm1 + movhpd 1 * SIZE(AO2), %xmm1 + + movsd 2 * SIZE(AO1), %xmm2 + movhpd 2 * SIZE(AO2), %xmm2 + movsd 3 * SIZE(AO1), %xmm3 + movhpd 3 * SIZE(AO2), %xmm3 + + movapd %xmm0, 0 * SIZE(B) + movapd %xmm1, 2 * SIZE(B) + movapd %xmm2, 4 * SIZE(B) + movapd %xmm3, 6 * SIZE(B) +#endif + + PREFETCH RPREFETCHSIZE * SIZE(AO1) + PREFETCH RPREFETCHSIZE * SIZE(AO2) + + PREFETCHW WPREFETCHSIZE * SIZE(B) + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + subq $-8 * SIZE, B + decq I + jg .L23 + ALIGN_4 + +.L24: + movq M, I + andq $3, I + jle .L30 + ALIGN_4 + +.L25: +#ifndef DOUBLE + movss 0 * SIZE(AO1), %xmm0 + movss 0 * SIZE(AO2), %xmm1 + + movss %xmm0, 0 * SIZE(B) + movss %xmm1, 1 * SIZE(B) +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 0 * SIZE(AO2), %xmm0 + + movapd %xmm0, 0 * SIZE(B) +#endif + + addq $SIZE, AO1 + addq $SIZE, AO2 + addq $2 * SIZE, B + decq I + jg .L25 + ALIGN_4 + +.L30: + testq $1, N + jle .L999 + + movq A, AO1 + + movq M, I + sarq $2, I + jle .L34 + ALIGN_4 + +.L33: +#ifndef DOUBLE + movss 0 * SIZE(AO1), %xmm0 + movss 1 * SIZE(AO1), %xmm1 + movss 2 * SIZE(AO1), %xmm2 + movss 3 * SIZE(AO1), %xmm3 + + movss %xmm0, 0 * SIZE(B) + 
movss %xmm1, 1 * SIZE(B) + movss %xmm2, 2 * SIZE(B) + movss %xmm3, 3 * SIZE(B) +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 1 * SIZE(AO1), %xmm0 + + movsd 2 * SIZE(AO1), %xmm1 + movhpd 3 * SIZE(AO1), %xmm1 + + movapd %xmm0, 0 * SIZE(B) + movapd %xmm1, 2 * SIZE(B) +#endif + + addq $4 * SIZE, AO1 + subq $-4 * SIZE, B + decq I + jg .L33 + ALIGN_4 + +.L34: + movq M, I + andq $3, I + jle .L999 + ALIGN_4 + +.L35: +#ifndef DOUBLE + movss 0 * SIZE(AO1), %xmm0 + movss %xmm0, 0 * SIZE(B) +#else + movsd 0 * SIZE(AO1), %xmm0 + movsd %xmm0, 0 * SIZE(B) +#endif + + addq $SIZE, AO1 + addq $1 * SIZE, B + decq I + jg .L35 + ALIGN_4 + + +.L999: +#ifdef WINDOWS_ABI + movups 0(%rsp), %xmm6 + movups 16(%rsp), %xmm7 + movups 32(%rsp), %xmm8 + movups 48(%rsp), %xmm9 + movups 64(%rsp), %xmm10 + movups 80(%rsp), %xmm11 + movups 96(%rsp), %xmm12 + movups 112(%rsp), %xmm13 + movups 128(%rsp), %xmm14 + movups 144(%rsp), %xmm15 + + addq $STACKSIZE, %rsp +#endif + + popq %r12 + popq %r13 + +#ifdef WINDOWS_ABI + popq %r14 + popq %r15 +#endif + ret + + EPILOGUE diff --git a/kernel/x86_64/gemm_ncopy_4_opteron.S b/kernel/x86_64/gemm_ncopy_4_opteron.S new file mode 100644 index 0000000000..edde7e2c1b --- /dev/null +++ b/kernel/x86_64/gemm_ncopy_4_opteron.S @@ -0,0 +1,388 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
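gemm_ncopy_4.S, which ends above, widens the same packing to four lda-strided vectors per pass and chooses RPREFETCHSIZE/WPREFETCHSIZE and the prefetch flavour (prefetcht0, prefetcht2, prefetch, prefetchw) per CPU, issuing prefetches a fixed distance ahead of the read and write streams; the Opteron-specific copy that follows does the same with its own distances and MMX moves. In portable C the idea might look like the sketch below; the distances are illustrative rather than the tuned values from these files, and __builtin_prefetch is a GCC/Clang extension:

    #define RPREFETCH_DIST 16   /* elements ahead on the read stream  (illustrative) */
    #define WPREFETCH_DIST 64   /* elements ahead on the write stream (illustrative) */

    /* Inner step of a 4-wide packing pass with software prefetch. */
    static void pack4_with_prefetch(long m, const float *a0, const float *a1,
                                    const float *a2, const float *a3, float *b)
    {
        for (long i = 0; i < m; i++) {
            __builtin_prefetch(a0 + i + RPREFETCH_DIST, 0, 3);    /* like PREFETCH  */
            __builtin_prefetch(b + 4 * i + WPREFETCH_DIST, 1, 3); /* like PREFETCHW */
            b[4 * i + 0] = a0[i];
            b[4 * i + 1] = a1[i];
            b[4 * i + 2] = a2[i];
            b[4 * i + 3] = a3[i];
        }
    }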
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if defined(BARCELONA) || defined(SHANGHAI) +#define RPREFETCHSIZE (12 + 4) +#define WPREFETCHSIZE (48 + 4) +#define MOVNTQ MOVQ +#else +#define RPREFETCHSIZE (12 + 4) +#define WPREFETCHSIZE (24 + 4) +#define MOVNTQ MOVQ +#endif + +#ifndef WINDOWS_ABI + +#define M ARG1 /* rdi */ +#define N ARG2 /* rsi */ +#define A ARG3 /* rdx */ +#define LDA ARG4 /* rcx */ +#define B ARG5 /* r8 */ + +#define I %r9 +#else + +#define STACKSIZE 256 + +#define M ARG1 /* rcx */ +#define N ARG2 /* rdx */ +#define A ARG3 /* r8 */ +#define LDA ARG4 /* r9 */ +#define OLD_B 40 + 32 + STACKSIZE(%rsp) + +#define B %r14 +#define I %r15 + +#endif + +#define J %r10 +#define AO1 %r11 +#define AO2 %r12 +#define AO3 %r13 +#define AO4 %rax + +#if defined(BARCELONA) || defined(SHANGHAI) +#define RPREFETCH prefetch +#else +#define RPREFETCH prefetch +#endif + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + pushq %r15 + pushq %r14 +#endif + pushq %r13 + pushq %r12 + +#ifdef WINDOWS_ABI + subq $STACKSIZE, %rsp + + movups %xmm6, 0(%rsp) + movups %xmm7, 16(%rsp) + movups %xmm8, 32(%rsp) + movups %xmm9, 48(%rsp) + movups %xmm10, 64(%rsp) + movups %xmm11, 80(%rsp) + movups %xmm12, 96(%rsp) + movups %xmm13, 112(%rsp) + movups %xmm14, 128(%rsp) + movups %xmm15, 144(%rsp) + + movq OLD_B, B +#endif + + EMMS + + leaq (,LDA, SIZE), LDA # Scaling + + movq N, J + sarq $2, J + jle .L20 + ALIGN_4 + +.L11: +#if 0 + movq A, AO1 + leaq (A, LDA, 1), AO2 + leaq (A, LDA, 2), AO3 + leaq (AO2, LDA, 2), AO4 + + movq M, I + sarq $4, I + jle .L13 + ALIGN_4 + +.L12: + MOVQ 0 * SIZE(AO1), %mm0 + addq $8 * SIZE, AO1 + MOVQ 0 * SIZE(AO2), %mm1 + addq $8 * SIZE, AO2 + MOVQ 0 * SIZE(AO3), %mm2 + addq $8 * SIZE, AO3 + MOVQ 0 * SIZE(AO4), %mm3 + addq $8 * SIZE, AO4 + + decq I + jg .L12 + ALIGN_4 + +.L13: +#endif + + movq A, AO1 + leaq (A, LDA), AO2 + leaq (A, LDA, 2), AO3 + leaq (AO2, LDA, 2), AO4 + leaq (A, LDA, 4), A + + movq M, I + sarq $2, I + jle .L15 + ALIGN_4 + +.L14: + RPREFETCH (RPREFETCHSIZE) * SIZE(AO1) + + MOVQ 0 * SIZE(AO1), %mm0 + MOVNTQ %mm0, 0 * SIZE(B) + MOVQ 0 * SIZE(AO2), %mm1 + MOVNTQ %mm1, 1 * SIZE(B) + + RPREFETCH (RPREFETCHSIZE) * SIZE(AO2) + + MOVQ 0 * SIZE(AO3), %mm2 + MOVNTQ %mm2, 2 * SIZE(B) + MOVQ 0 * SIZE(AO4), %mm3 + MOVNTQ %mm3, 3 * SIZE(B) + + prefetchw (WPREFETCHSIZE + 0) * SIZE(B) + MOVQ 1 * SIZE(AO1), %mm4 + MOVNTQ %mm4, 4 * SIZE(B) + MOVQ 1 * SIZE(AO2), %mm5 + MOVNTQ %mm5, 5 * SIZE(B) + MOVQ 1 * SIZE(AO3), %mm6 + MOVNTQ %mm6, 6 * SIZE(B) + MOVQ 1 * SIZE(AO4), %mm7 + MOVNTQ %mm7, 7 * SIZE(B) + + RPREFETCH (RPREFETCHSIZE) * SIZE(AO3) + + MOVQ 2 * SIZE(AO1), %mm0 + MOVNTQ %mm0, 8 * SIZE(B) + MOVQ 2 * SIZE(AO2), %mm1 + MOVNTQ %mm1, 9 * SIZE(B) + + RPREFETCH (RPREFETCHSIZE) * SIZE(AO4) + + MOVQ 2 * SIZE(AO3), %mm2 + MOVNTQ %mm2, 10 * SIZE(B) + MOVQ 2 * SIZE(AO4), %mm3 + MOVNTQ %mm3, 11 * SIZE(B) + + prefetchw (WPREFETCHSIZE + 8) * SIZE(B) + MOVQ 3 * SIZE(AO1), %mm4 + MOVNTQ %mm4, 12 * SIZE(B) + MOVQ 3 * SIZE(AO2), %mm5 + MOVNTQ %mm5, 13 * SIZE(B) + MOVQ 3 * SIZE(AO3), %mm6 + MOVNTQ %mm6, 14 * SIZE(B) + MOVQ 3 * SIZE(AO4), %mm7 + MOVNTQ %mm7, 15 * SIZE(B) + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + addq $4 * SIZE, AO3 + addq $4 * SIZE, AO4 + + subq $-16 * SIZE, B + decq I + jg .L14 + ALIGN_4 + +.L15: + movq M, I + andq $3, I + jle .L17 + ALIGN_4 + +.L16: + MOVQ 0 * SIZE(AO1), %mm0 + MOVQ 0 * SIZE(AO2), %mm1 + MOVQ 0 * SIZE(AO3), %mm2 + MOVQ 0 * SIZE(AO4), %mm3 + + MOVNTQ %mm0, 0 * SIZE(B) + MOVNTQ %mm1, 1 * 
SIZE(B) + MOVNTQ %mm2, 2 * SIZE(B) + MOVNTQ %mm3, 3 * SIZE(B) + + addq $SIZE, AO1 + addq $SIZE, AO2 + addq $SIZE, AO3 + addq $SIZE, AO4 + addq $4 * SIZE, B + decq I + jg .L16 + ALIGN_4 + +.L17: + decq J + jg .L11 + ALIGN_4 + +.L20: + testq $2, N + jle .L30 + + movq A, AO1 + leaq (A, LDA), AO2 + leaq (A, LDA, 2), A + + movq M, I + sarq $2, I + jle .L24 + ALIGN_4 + +.L23: + prefetch (RPREFETCHSIZE) * SIZE(AO1) + MOVQ 0 * SIZE(AO1), %mm0 + prefetch (RPREFETCHSIZE) * SIZE(AO2) + MOVQ 0 * SIZE(AO2), %mm1 + MOVQ 1 * SIZE(AO1), %mm2 + MOVQ 1 * SIZE(AO2), %mm3 + MOVQ 2 * SIZE(AO1), %mm4 + MOVQ 2 * SIZE(AO2), %mm5 + MOVQ 3 * SIZE(AO1), %mm6 + MOVQ 3 * SIZE(AO2), %mm7 + + prefetchw (WPREFETCHSIZE + 0) * SIZE(B) + + MOVNTQ %mm0, 0 * SIZE(B) + MOVNTQ %mm1, 1 * SIZE(B) + MOVNTQ %mm2, 2 * SIZE(B) + MOVNTQ %mm3, 3 * SIZE(B) + MOVNTQ %mm4, 4 * SIZE(B) + MOVNTQ %mm5, 5 * SIZE(B) + MOVNTQ %mm6, 6 * SIZE(B) + MOVNTQ %mm7, 7 * SIZE(B) + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + subq $-8 * SIZE, B + decq I + jg .L23 + ALIGN_4 + +.L24: + movq M, I + andq $3, I + jle .L30 + ALIGN_4 + +.L25: + MOVQ 0 * SIZE(AO1), %mm0 + MOVQ 0 * SIZE(AO2), %mm1 + + MOVNTQ %mm0, 0 * SIZE(B) + MOVNTQ %mm1, 1 * SIZE(B) + + addq $SIZE, AO1 + addq $SIZE, AO2 + addq $2 * SIZE, B + decq I + jg .L25 + ALIGN_4 + +.L30: + testq $1, N + jle .L999 + + movq A, AO1 + + movq M, I + sarq $2, I + jle .L34 + ALIGN_4 + +.L33: + MOVQ 0 * SIZE(AO1), %mm0 + MOVQ 1 * SIZE(AO1), %mm1 + MOVQ 2 * SIZE(AO1), %mm2 + MOVQ 3 * SIZE(AO1), %mm3 + + MOVNTQ %mm0, 0 * SIZE(B) + MOVNTQ %mm1, 1 * SIZE(B) + MOVNTQ %mm2, 2 * SIZE(B) + MOVNTQ %mm3, 3 * SIZE(B) + + addq $4 * SIZE, AO1 + subq $-4 * SIZE, B + decq I + jg .L33 + ALIGN_4 + +.L34: + movq M, I + andq $3, I + jle .L999 + ALIGN_4 + +.L35: + MOVQ 0 * SIZE(AO1), %mm0 + addq $SIZE, AO1 + + MOVNTQ %mm0, 0 * SIZE(B) + addq $1 * SIZE, B + decq I + jg .L35 + ALIGN_4 + + +.L999: + EMMS + +#ifdef WINDOWS_ABI + movups 0(%rsp), %xmm6 + movups 16(%rsp), %xmm7 + movups 32(%rsp), %xmm8 + movups 48(%rsp), %xmm9 + movups 64(%rsp), %xmm10 + movups 80(%rsp), %xmm11 + movups 96(%rsp), %xmm12 + movups 112(%rsp), %xmm13 + movups 128(%rsp), %xmm14 + movups 144(%rsp), %xmm15 + + addq $STACKSIZE, %rsp +#endif + + popq %r12 + popq %r13 +#ifdef WINDOWS_ABI + popq %r14 + popq %r15 +#endif + ret + + EPILOGUE diff --git a/kernel/x86_64/gemm_tcopy_2.S b/kernel/x86_64/gemm_tcopy_2.S new file mode 100644 index 0000000000..8bfaca2651 --- /dev/null +++ b/kernel/x86_64/gemm_tcopy_2.S @@ -0,0 +1,276 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if defined(NEHALEM) +#define RPREFETCHSIZE 12 +#define WPREFETCHSIZE (RPREFETCHSIZE * 2) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#ifndef WINDOWS_ABI + +#define M ARG1 /* rdi */ +#define N ARG2 /* rsi */ +#define A ARG3 /* rdx */ +#define LDA ARG4 /* rcx */ +#define B ARG5 /* r8 */ + +#define I %r10 +#define J %rbp + +#define AO1 %r9 +#define AO2 %r15 +#define AO3 %r11 +#define AO4 %r14 +#define BO1 %r13 +#define M8 %rbx +#define BO %rax + +#else + +#define STACKSIZE 256 + +#define M ARG1 /* rcx */ +#define N ARG2 /* rdx */ +#define A ARG3 /* r8 */ +#define LDA ARG4 /* r9 */ +#define OLD_B 40 + 64 + STACKSIZE(%rsp) + +#define B %rdi + +#define I %r10 +#define J %r11 + +#define AO1 %r12 +#define AO2 %r13 +#define AO3 %r14 +#define AO4 %r15 + +#define BO1 %rsi +#define M8 %rbp +#define BO %rax + +#endif + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + pushq %rdi + pushq %rsi +#endif + pushq %r15 + pushq %r14 + pushq %r13 + pushq %r12 + pushq %rbp + pushq %rbx + +#ifdef WINDOWS_ABI + subq $STACKSIZE, %rsp + + movups %xmm6, 0(%rsp) + movups %xmm7, 16(%rsp) + movups %xmm8, 32(%rsp) + movups %xmm9, 48(%rsp) + movups %xmm10, 64(%rsp) + movups %xmm11, 80(%rsp) + movups %xmm12, 96(%rsp) + movups %xmm13, 112(%rsp) + movups %xmm14, 128(%rsp) + movups %xmm15, 144(%rsp) + + movq OLD_B, B +#endif + + movq N, %rax + andq $-2, %rax + imulq M, %rax + + leaq (B, %rax, SIZE), BO1 + + leaq (, LDA, SIZE), LDA + leaq (, M, SIZE), M8 + + movq M, J + sarq $1, J + jle .L20 + ALIGN_4 + +.L11: + movq A, AO1 + leaq (A, LDA ), AO2 + leaq (A, LDA, 2), A + + movq B, BO + addq $4 * SIZE, B + + movq N, I + sarq $1, I + jle .L14 + ALIGN_4 + +.L12: +#ifndef DOUBLE + movlps 0 * SIZE(AO1), %xmm0 + movhps 0 * SIZE(AO2), %xmm0 + + movaps %xmm0, 0 * SIZE(BO) +#else + PREFETCH RPREFETCHSIZE * SIZE(AO1) + movsd 0 * SIZE(AO1), %xmm0 + movhpd 1 * SIZE(AO1), %xmm0 + PREFETCH RPREFETCHSIZE * SIZE(AO2) + movsd 0 * SIZE(AO2), %xmm1 + movhpd 1 * SIZE(AO2), %xmm1 + + PREFETCHW WPREFETCHSIZE * SIZE(BO) + movapd %xmm0, 0 * SIZE(BO) + movapd %xmm1, 2 * SIZE(BO) +#endif + + leaq (BO, M8, 2), BO + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + decq I + jg .L12 + ALIGN_4 + +.L14: + testq $1, N + jle .L19 + +#ifndef DOUBLE + movss 0 * SIZE(AO1), %xmm0 + movss 0 * SIZE(AO2), %xmm1 + + movss %xmm0, 0 * SIZE(BO1) + movss %xmm1, 1 * SIZE(BO1) +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 0 * SIZE(AO2), %xmm0 + + movapd %xmm0, 0 * SIZE(BO1) +#endif + + addq $2 * SIZE, BO1 + ALIGN_4 + +.L19: + decq J + jg .L11 + ALIGN_4 + +.L20: + testq $1, M + jle .L999 + ALIGN_4 + 
+.L31: + movq A, AO1 + movq B, BO + + movq N, I + sarq $1, I + jle .L33 + ALIGN_4 + +.L32: +#ifndef DOUBLE + movsd 0 * SIZE(AO1), %xmm0 + movsd %xmm0, 0 * SIZE(BO) +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 1 * SIZE(AO1), %xmm0 + movapd %xmm0, 0 * SIZE(BO) +#endif + + addq $2 * SIZE, AO1 + leaq (BO, M8, 2), BO + decq I + jg .L32 + ALIGN_4 + +.L33: + testq $1, N + jle .L999 + +#ifndef DOUBLE + movss 0 * SIZE(AO1), %xmm0 + movss %xmm0, 0 * SIZE(BO1) +#else + movsd 0 * SIZE(AO1), %xmm0 + movsd %xmm0, 0 * SIZE(BO1) +#endif + addq $1 * SIZE, BO1 + ALIGN_4 + +.L999: +#ifdef WINDOWS_ABI + movups 0(%rsp), %xmm6 + movups 16(%rsp), %xmm7 + movups 32(%rsp), %xmm8 + movups 48(%rsp), %xmm9 + movups 64(%rsp), %xmm10 + movups 80(%rsp), %xmm11 + movups 96(%rsp), %xmm12 + movups 112(%rsp), %xmm13 + movups 128(%rsp), %xmm14 + movups 144(%rsp), %xmm15 + + addq $STACKSIZE, %rsp +#endif + + popq %rbx + popq %rbp + popq %r12 + popq %r13 + popq %r14 + popq %r15 +#ifdef WINDOWS_ABI + popq %rsi + popq %rdi +#endif + + ret + + EPILOGUE diff --git a/kernel/x86_64/gemm_tcopy_4.S b/kernel/x86_64/gemm_tcopy_4.S new file mode 100644 index 0000000000..877969ff5e --- /dev/null +++ b/kernel/x86_64/gemm_tcopy_4.S @@ -0,0 +1,544 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
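The gemm_tcopy_2.S kernel whose EPILOGUE appears above packs a two-panel slice of A into the contiguous buffer B in the tile order the level-3 kernels consume: each 2x2 tile is stored contiguously, consecutive tiles of a panel pair are strided by 2*m in B, and the odd n element of each panel pair is staged at B + m*(n & ~1). A minimal portable sketch of that ordering, read off the assembly (the function name, the const qualifiers and the exact row/column roles of m and n are assumptions, not taken from this patch; FLOAT normally comes from common.h):

#ifndef FLOAT
#define FLOAT double                    /* float unless -DDOUBLE; stand-in for common.h */
#endif

/* Hypothetical reference for the 2-wide transpose copy above. */
int gemm_tcopy_2_ref(long m, long n, const FLOAT *a, long lda, FLOAT *b)
{
  const FLOAT *a1, *a2;
  FLOAT *bo;
  FLOAT *b1 = b + m * (n & ~1L);        /* staging area for the odd n element */
  long i, j;

  for (j = 0; j < (m >> 1); j++) {      /* two lda-panels of a per pass       */
    a1 = a;  a2 = a + lda;  a += 2 * lda;
    bo = b;  b += 4;                    /* each pass owns the next 4-slot     */
    for (i = 0; i < (n >> 1); i++) {    /* 2x2 tiles, strided by 2*m in b     */
      bo[0] = a1[0];  bo[1] = a1[1];
      bo[2] = a2[0];  bo[3] = a2[1];
      a1 += 2;  a2 += 2;  bo += 2 * m;
    }
    if (n & 1) {                        /* leftover element of each panel     */
      b1[0] = *a1;  b1[1] = *a2;  b1 += 2;
    }
  }

  if (m & 1) {                          /* leftover single panel (.L31 above) */
    a1 = a;  bo = b;
    for (i = 0; i < (n >> 1); i++) {
      bo[0] = a1[0];  bo[1] = a1[1];
      a1 += 2;  bo += 2 * m;
    }
    if (n & 1) *b1 = *a1;
  }
  return 0;
}

The assembly reaches the same effect with movlps/movhps pair loads in single precision and movsd/movhpd in double precision, adding PREFETCH/PREFETCHW hints at the RPREFETCHSIZE/WPREFETCHSIZE distances chosen per CPU at the top of the file.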
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if defined(PENTIUM4) || defined(GENERIC) +#define RPREFETCHSIZE 16 +#define WPREFETCHSIZE (RPREFETCHSIZE * 4) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#define RPREFETCHSIZE 12 +#define WPREFETCHSIZE (RPREFETCHSIZE * 4) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht2 +#endif + +#ifdef ATOM +#define RPREFETCHSIZE 16 +#define WPREFETCHSIZE (RPREFETCHSIZE * 4) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#ifdef NANO +#define RPREFETCHSIZE 8 +#define WPREFETCHSIZE (RPREFETCHSIZE * 4) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#ifdef BARCELONA +#define RPREFETCHSIZE 8 +#define WPREFETCHSIZE (RPREFETCHSIZE * 4) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#endif + +#ifdef GENERIC +#define RPREFETCHSIZE 16 +#define WPREFETCHSIZE (RPREFETCHSIZE * 4) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#ifndef WINDOWS_ABI + +#define M ARG1 /* rdi */ +#define N ARG2 /* rsi */ +#define A ARG3 /* rdx */ +#define LDA ARG4 /* rcx */ +#define B ARG5 /* r8 */ + +#define I %r10 +#define J %rbp + +#define AO1 %r9 +#define AO2 %r15 +#define AO3 %r11 +#define AO4 %r14 +#define BO1 %r13 +#define BO2 %r12 +#define M8 %rbx +#define BO %rax + +#else + +#define STACKSIZE 256 + +#define M ARG1 /* rcx */ +#define N ARG2 /* rdx */ +#define A ARG3 /* r8 */ +#define LDA ARG4 /* r9 */ +#define OLD_B 40 + 64 + STACKSIZE(%rsp) + +#define B %rdi + +#define I %r10 +#define J %r11 + +#define AO1 %r12 +#define AO2 %r13 +#define AO3 %r14 +#define AO4 %r15 + +#define BO1 %rsi +#define BO2 %rbx +#define M8 %rbp +#define BO %rax + +#endif + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + pushq %rdi + pushq %rsi +#endif + pushq %r15 + pushq %r14 + pushq %r13 + pushq %r12 + pushq %rbp + pushq %rbx + +#ifdef WINDOWS_ABI + subq $STACKSIZE, %rsp + + movups %xmm6, 0(%rsp) + movups %xmm7, 16(%rsp) + movups %xmm8, 32(%rsp) + movups %xmm9, 48(%rsp) + movups %xmm10, 64(%rsp) + movups %xmm11, 80(%rsp) + movups %xmm12, 96(%rsp) + movups %xmm13, 112(%rsp) + movups %xmm14, 128(%rsp) + movups %xmm15, 144(%rsp) + + movq OLD_B, B +#endif + + movq N, %rax + movq N, %rbx + andq $-4, %rax + andq $-2, %rbx + imulq M, %rax + imulq M, %rbx + + leaq (B, %rax, SIZE), BO1 + leaq (B, %rbx, SIZE), BO2 + + leaq (, LDA, SIZE), LDA + leaq (, M, SIZE), M8 + + movq M, J + sarq $2, J + jle .L20 + ALIGN_4 + +.L11: + movq A, AO1 + leaq (A, LDA ), AO2 + leaq (A, LDA, 2), AO3 + leaq (AO2, LDA, 2), AO4 + leaq (A, LDA, 4), A + + movq B, BO + addq $16 * SIZE, B + + movq N, I + sarq $2, I + jle .L13 + ALIGN_4 + +.L12: +#ifndef DOUBLE + movlps 0 * SIZE(AO1), %xmm0 + movhps 2 * SIZE(AO1), %xmm0 + movlps 0 * SIZE(AO2), %xmm1 + movhps 2 * SIZE(AO2), %xmm1 + + movlps 0 * SIZE(AO3), %xmm2 + movhps 2 * SIZE(AO3), %xmm2 + movlps 0 * SIZE(AO4), %xmm3 + movhps 2 * SIZE(AO4), %xmm3 + +#if defined(PENTIUM4) || defined(GENERIC) + PREFETCH RPREFETCHSIZE * SIZE(AO1) + PREFETCH RPREFETCHSIZE * SIZE(AO2) + PREFETCH RPREFETCHSIZE * SIZE(AO3) + PREFETCH RPREFETCHSIZE * SIZE(AO4) + + PREFETCHW WPREFETCHSIZE * SIZE(BO) +#endif + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) +#else + + PREFETCH RPREFETCHSIZE * SIZE(AO1) + movsd 0 * SIZE(AO1), %xmm0 + movhpd 1 * SIZE(AO1), %xmm0 + movsd 2 * SIZE(AO1), %xmm1 
+ movhpd 3 * SIZE(AO1), %xmm1 + + PREFETCH RPREFETCHSIZE * SIZE(AO2) + movsd 0 * SIZE(AO2), %xmm2 + movhpd 1 * SIZE(AO2), %xmm2 + movsd 2 * SIZE(AO2), %xmm3 + movhpd 3 * SIZE(AO2), %xmm3 + + PREFETCH RPREFETCHSIZE * SIZE(AO3) + movsd 0 * SIZE(AO3), %xmm4 + movhpd 1 * SIZE(AO3), %xmm4 + movsd 2 * SIZE(AO3), %xmm5 + movhpd 3 * SIZE(AO3), %xmm5 + + PREFETCH RPREFETCHSIZE * SIZE(AO4) + movsd 0 * SIZE(AO4), %xmm6 + movhpd 1 * SIZE(AO4), %xmm6 + movsd 2 * SIZE(AO4), %xmm7 + movhpd 3 * SIZE(AO4), %xmm7 + + PREFETCHW WPREFETCHSIZE * SIZE(BO) + movapd %xmm0, 0 * SIZE(BO) + movapd %xmm1, 2 * SIZE(BO) + movapd %xmm2, 4 * SIZE(BO) + movapd %xmm3, 6 * SIZE(BO) + +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) + PREFETCHW (WPREFETCHSIZE + 8) * SIZE(B) +#endif + movapd %xmm4, 8 * SIZE(BO) + movapd %xmm5, 10 * SIZE(BO) + movapd %xmm6, 12 * SIZE(BO) + movapd %xmm7, 14 * SIZE(BO) +#endif + + leaq (BO, M8, 4), BO + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + addq $4 * SIZE, AO3 + addq $4 * SIZE, AO4 + decq I + jg .L12 + ALIGN_4 + +.L13: + testq $2, N + jle .L14 + +#ifndef DOUBLE + movlps 0 * SIZE(AO1), %xmm0 + movhps 0 * SIZE(AO2), %xmm0 + + movlps 0 * SIZE(AO3), %xmm1 + movhps 0 * SIZE(AO4), %xmm1 + + movaps %xmm0, 0 * SIZE(BO1) + movaps %xmm1, 4 * SIZE(BO1) +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 1 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO2), %xmm1 + movhpd 1 * SIZE(AO2), %xmm1 + + movsd 0 * SIZE(AO3), %xmm2 + movhpd 1 * SIZE(AO3), %xmm2 + movsd 0 * SIZE(AO4), %xmm3 + movhpd 1 * SIZE(AO4), %xmm3 + + movapd %xmm0, 0 * SIZE(BO1) + movapd %xmm1, 2 * SIZE(BO1) + movapd %xmm2, 4 * SIZE(BO1) + movapd %xmm3, 6 * SIZE(BO1) +#endif + + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + addq $2 * SIZE, AO3 + addq $2 * SIZE, AO4 + addq $8 * SIZE, BO1 + ALIGN_4 + +.L14: + testq $1, N + jle .L19 + +#ifndef DOUBLE + movss 0 * SIZE(AO1), %xmm0 + movss 0 * SIZE(AO2), %xmm1 + movss 0 * SIZE(AO3), %xmm2 + movss 0 * SIZE(AO4), %xmm3 + + movss %xmm0, 0 * SIZE(BO2) + movss %xmm1, 1 * SIZE(BO2) + movss %xmm2, 2 * SIZE(BO2) + movss %xmm3, 3 * SIZE(BO2) +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 0 * SIZE(AO2), %xmm0 + movsd 0 * SIZE(AO3), %xmm1 + movhpd 0 * SIZE(AO4), %xmm1 + + movapd %xmm0, 0 * SIZE(BO2) + movapd %xmm1, 2 * SIZE(BO2) +#endif + + addq $4 * SIZE, BO2 + ALIGN_4 + +.L19: + decq J + jg .L11 + ALIGN_4 + +.L20: + testq $2, M + jle .L30 + ALIGN_4 + +.L21: + movq A, AO1 + leaq (A, LDA ), AO2 + leaq (A, LDA, 2), A + + movq B, BO + addq $8 * SIZE, B + + movq N, I + sarq $2, I + jle .L23 + ALIGN_4 + +.L22: +#ifndef DOUBLE + movlps 0 * SIZE(AO1), %xmm0 + movhps 2 * SIZE(AO1), %xmm0 + + movlps 0 * SIZE(AO2), %xmm1 + movhps 2 * SIZE(AO2), %xmm1 + +#if defined(PENTIUM4) || defined(GENERIC) + PREFETCH RPREFETCHSIZE * SIZE(AO1) + PREFETCH RPREFETCHSIZE * SIZE(AO2) + PREFETCHW WPREFETCHSIZE * SIZE(BO) +#endif + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 1 * SIZE(AO1), %xmm0 + movsd 2 * SIZE(AO1), %xmm1 + movhpd 3 * SIZE(AO1), %xmm1 + + movsd 0 * SIZE(AO2), %xmm2 + movhpd 1 * SIZE(AO2), %xmm2 + movsd 2 * SIZE(AO2), %xmm3 + movhpd 3 * SIZE(AO2), %xmm3 + +#if defined(PENTIUM4) || defined(GENERIC) + PREFETCH RPREFETCHSIZE * SIZE(AO1) + PREFETCH RPREFETCHSIZE * SIZE(AO2) + PREFETCHW WPREFETCHSIZE * SIZE(BO) +#endif + + movapd %xmm0, 0 * SIZE(BO) + movapd %xmm1, 2 * SIZE(BO) + movapd %xmm2, 4 * SIZE(BO) + movapd %xmm3, 6 * SIZE(BO) +#endif + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + leaq (BO, M8, 4), BO + decq I + jg .L22 + ALIGN_4 + +.L23: + testq $2, N + jle 
.L24 + +#ifndef DOUBLE + movlps 0 * SIZE(AO1), %xmm0 + movhps 0 * SIZE(AO2), %xmm0 + + movaps %xmm0, 0 * SIZE(BO1) +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 1 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO2), %xmm1 + movhpd 1 * SIZE(AO2), %xmm1 + + movapd %xmm0, 0 * SIZE(BO1) + movapd %xmm1, 2 * SIZE(BO1) +#endif + + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + addq $4 * SIZE, BO1 + ALIGN_4 + +.L24: + testq $1, N + jle .L30 + +#ifndef DOUBLE + movss 0 * SIZE(AO1), %xmm0 + movss 0 * SIZE(AO2), %xmm1 + + movss %xmm0, 0 * SIZE(BO2) + movss %xmm1, 1 * SIZE(BO2) +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 0 * SIZE(AO2), %xmm0 + + movapd %xmm0, 0 * SIZE(BO2) +#endif + addq $2 * SIZE, BO2 + ALIGN_4 + +.L30: + testq $1, M + jle .L999 + ALIGN_4 + +.L31: + movq A, AO1 + movq B, BO + + movq N, I + sarq $2, I + jle .L33 + ALIGN_4 + +.L32: +#ifndef DOUBLE + movlps 0 * SIZE(AO1), %xmm0 + movhps 2 * SIZE(AO1), %xmm0 + + movaps %xmm0, 0 * SIZE(BO) +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 1 * SIZE(AO1), %xmm0 + movsd 2 * SIZE(AO1), %xmm1 + movhpd 3 * SIZE(AO1), %xmm1 + + movapd %xmm0, 0 * SIZE(BO) + movapd %xmm1, 2 * SIZE(BO) +#endif + + addq $4 * SIZE, AO1 + leaq (BO, M8, 4), BO + decq I + jg .L32 + ALIGN_4 + +.L33: + testq $2, N + jle .L34 + +#ifndef DOUBLE + movlps 0 * SIZE(AO1), %xmm0 + + movlps %xmm0, 0 * SIZE(BO1) +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 1 * SIZE(AO1), %xmm0 + + movapd %xmm0, 0 * SIZE(BO1) +#endif + + addq $2 * SIZE, AO1 + addq $2 * SIZE, BO1 + ALIGN_4 + +.L34: + testq $1, N + jle .L999 + +#ifndef DOUBLE + movss 0 * SIZE(AO1), %xmm0 + movss %xmm0, 0 * SIZE(BO2) +#else + movsd 0 * SIZE(AO1), %xmm0 + movsd %xmm0, 0 * SIZE(BO2) +#endif + addq $1 * SIZE, BO2 + ALIGN_4 + +.L999: +#ifdef WINDOWS_ABI + movups 0(%rsp), %xmm6 + movups 16(%rsp), %xmm7 + movups 32(%rsp), %xmm8 + movups 48(%rsp), %xmm9 + movups 64(%rsp), %xmm10 + movups 80(%rsp), %xmm11 + movups 96(%rsp), %xmm12 + movups 112(%rsp), %xmm13 + movups 128(%rsp), %xmm14 + movups 144(%rsp), %xmm15 + + addq $STACKSIZE, %rsp +#endif + + popq %rbx + popq %rbp + popq %r12 + popq %r13 + popq %r14 + popq %r15 +#ifdef WINDOWS_ABI + popq %rsi + popq %rdi +#endif + + ret + + EPILOGUE diff --git a/kernel/x86_64/gemm_tcopy_4_opteron.S b/kernel/x86_64/gemm_tcopy_4_opteron.S new file mode 100644 index 0000000000..459eeb8c50 --- /dev/null +++ b/kernel/x86_64/gemm_tcopy_4_opteron.S @@ -0,0 +1,476 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#if defined(BARCELONA) || defined(SHANGHAI) +#define RPREFETCHSIZE (12 + 4) +#define WPREFETCHSIZE (12 + 4) +#define MOVNTQ MOVQ +#else +#define RPREFETCHSIZE (12 + 4) +#define WPREFETCHSIZE (12 + 4) +#define MOVNTQ MOVQ +#endif + +#ifndef WINDOWS_ABI + +#define M ARG1 /* rdi */ +#define N ARG2 /* rsi */ +#define A ARG3 /* rdx */ +#define LDA ARG4 /* rcx */ +#define B ARG5 /* r8 */ + +#define I %r10 +#define J %rbp + +#define AO1 %r9 +#define AO2 %r15 +#define AO3 %r11 +#define AO4 %r14 +#define BO1 %r13 +#define BO2 %r12 +#define M8 %rbx +#define BO %rax + +#else + +#define STACKSIZE 256 + +#define M ARG1 /* rcx */ +#define N ARG2 /* rdx */ +#define A ARG3 /* r8 */ +#define LDA ARG4 /* r9 */ +#define OLD_B 40 + 64 + STACKSIZE(%rsp) + +#define B %rdi + +#define I %r10 +#define J %r11 + +#define AO1 %r12 +#define AO2 %r13 +#define AO3 %r14 +#define AO4 %r15 + +#define BO1 %rsi +#define BO2 %rbx +#define M8 %rbp +#define BO %rax + +#endif + +#if defined(BARCELONA) || defined(SHANGHAI) +#define RPREFETCH prefetch +#else +#define RPREFETCH prefetch +#endif + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + pushq %rdi + pushq %rsi +#endif + pushq %r15 + pushq %r14 + pushq %r13 + pushq %r12 + pushq %rbp + pushq %rbx + +#ifdef WINDOWS_ABI + subq $STACKSIZE, %rsp + + movups %xmm6, 0(%rsp) + movups %xmm7, 16(%rsp) + movups %xmm8, 32(%rsp) + movups %xmm9, 48(%rsp) + movups %xmm10, 64(%rsp) + movups %xmm11, 80(%rsp) + movups %xmm12, 96(%rsp) + movups %xmm13, 112(%rsp) + movups %xmm14, 128(%rsp) + movups %xmm15, 144(%rsp) + + movq OLD_B, B +#endif + + movq N, %rax + movq N, %rbx + andq $-4, %rax + andq $-2, %rbx + imulq M, %rax + imulq M, %rbx + + EMMS + + leaq (B, %rax, SIZE), BO1 + leaq (B, %rbx, SIZE), BO2 + + leaq (, LDA, SIZE), LDA + leaq (, M, SIZE), M8 + movq M, J + sarq $2, J + jle .L20 + ALIGN_4 + +.L11: +#if 0 + movq A, AO1 + leaq (A, LDA, 1), AO2 + leaq (A, LDA, 2), AO3 + leaq (AO2, LDA, 2), AO4 + + movq N, I + sarq $3, I + jle .L13 + ALIGN_4 + +.L12: + MOVQ 0 * SIZE(AO1), %mm0 + addq $8 * SIZE, AO1 + MOVQ 0 * SIZE(AO2), %mm1 + addq $8 * SIZE, AO2 + MOVQ 0 * SIZE(AO3), %mm2 + addq $8 * SIZE, AO3 + MOVQ 0 * SIZE(AO4), %mm3 + addq $8 * SIZE, AO4 + + decq I + jg .L12 + ALIGN_4 + +.L13: +#endif + + movq A, AO1 + leaq (A, LDA ), AO2 + leaq (A, LDA, 2), AO3 + leaq (AO2, LDA, 2), AO4 + leaq (A, LDA, 4), A + + movq B, BO + addq $16 * SIZE, B + + movq N, I + sarq $2, I + jle .L15 + ALIGN_4 + +.L14: + + RPREFETCH (RPREFETCHSIZE) * SIZE(AO1) + + MOVQ 0 * SIZE(AO1), %mm0 + MOVNTQ %mm0, 0 * SIZE(BO) + MOVQ 1 * SIZE(AO1), %mm1 + MOVNTQ %mm1, 1 * 
SIZE(BO) + + RPREFETCH (RPREFETCHSIZE) * SIZE(AO2) + + MOVQ 2 * SIZE(AO1), %mm2 + MOVNTQ %mm2, 2 * SIZE(BO) + MOVQ 3 * SIZE(AO1), %mm3 + MOVNTQ %mm3, 3 * SIZE(BO) + + prefetchw (WPREFETCHSIZE + 0) * SIZE(B) + MOVQ 0 * SIZE(AO2), %mm4 + MOVNTQ %mm4, 4 * SIZE(BO) + MOVQ 1 * SIZE(AO2), %mm5 + MOVNTQ %mm5, 5 * SIZE(BO) + MOVQ 2 * SIZE(AO2), %mm6 + MOVNTQ %mm6, 6 * SIZE(BO) + MOVQ 3 * SIZE(AO2), %mm7 + MOVNTQ %mm7, 7 * SIZE(BO) + + + RPREFETCH (RPREFETCHSIZE) * SIZE(AO3) + + MOVQ 0 * SIZE(AO3), %mm0 + MOVNTQ %mm0, 8 * SIZE(BO) + MOVQ 1 * SIZE(AO3), %mm1 + MOVNTQ %mm1, 9 * SIZE(BO) + + RPREFETCH (RPREFETCHSIZE) * SIZE(AO4) + + MOVQ 2 * SIZE(AO3), %mm2 + MOVNTQ %mm2, 10 * SIZE(BO) + MOVQ 3 * SIZE(AO3), %mm3 + MOVNTQ %mm3, 11 * SIZE(BO) + + prefetchw (WPREFETCHSIZE + 8) * SIZE(B) + MOVQ 0 * SIZE(AO4), %mm4 + MOVNTQ %mm4, 12 * SIZE(BO) + MOVQ 1 * SIZE(AO4), %mm5 + MOVNTQ %mm5, 13 * SIZE(BO) + MOVQ 2 * SIZE(AO4), %mm6 + MOVNTQ %mm6, 14 * SIZE(BO) + MOVQ 3 * SIZE(AO4), %mm7 + MOVNTQ %mm7, 15 * SIZE(BO) + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + addq $4 * SIZE, AO3 + addq $4 * SIZE, AO4 + + leaq (BO, M8, 4), BO + decq I + jg .L14 + ALIGN_4 + +.L15: + testq $2, N + jle .L16 + + MOVQ 0 * SIZE(AO1), %mm0 + MOVQ 1 * SIZE(AO1), %mm1 + MOVQ 0 * SIZE(AO2), %mm2 + MOVQ 1 * SIZE(AO2), %mm3 + + MOVQ 0 * SIZE(AO3), %mm4 + MOVQ 1 * SIZE(AO3), %mm5 + MOVQ 0 * SIZE(AO4), %mm6 + MOVQ 1 * SIZE(AO4), %mm7 + + MOVNTQ %mm0, 0 * SIZE(BO1) + MOVNTQ %mm1, 1 * SIZE(BO1) + MOVNTQ %mm2, 2 * SIZE(BO1) + MOVNTQ %mm3, 3 * SIZE(BO1) + MOVNTQ %mm4, 4 * SIZE(BO1) + MOVNTQ %mm5, 5 * SIZE(BO1) + MOVNTQ %mm6, 6 * SIZE(BO1) + MOVNTQ %mm7, 7 * SIZE(BO1) + + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + addq $2 * SIZE, AO3 + addq $2 * SIZE, AO4 + addq $8 * SIZE, BO1 + ALIGN_4 + +.L16: + testq $1, N + jle .L19 + + MOVQ 0 * SIZE(AO1), %mm0 + MOVQ 0 * SIZE(AO2), %mm1 + MOVQ 0 * SIZE(AO3), %mm2 + MOVQ 0 * SIZE(AO4), %mm3 + + MOVNTQ %mm0, 0 * SIZE(BO2) + MOVNTQ %mm1, 1 * SIZE(BO2) + MOVNTQ %mm2, 2 * SIZE(BO2) + MOVNTQ %mm3, 3 * SIZE(BO2) + + addq $4 * SIZE, BO2 + ALIGN_4 + +.L19: + decq J + jg .L11 + ALIGN_4 + +.L20: + testq $2, M + jle .L30 + ALIGN_4 + +.L21: + movq A, AO1 + leaq (A, LDA ), AO2 + leaq (A, LDA, 2), A + + movq B, BO + addq $8 * SIZE, B + + movq N, I + sarq $2, I + jle .L23 + ALIGN_4 + +.L22: + RPREFETCH (RPREFETCHSIZE) * SIZE(AO1) + MOVQ 0 * SIZE(AO1), %mm0 + MOVQ 1 * SIZE(AO1), %mm1 + MOVQ 2 * SIZE(AO1), %mm2 + MOVQ 3 * SIZE(AO1), %mm3 + + RPREFETCH (RPREFETCHSIZE) * SIZE(AO2) + MOVQ 0 * SIZE(AO2), %mm4 + MOVQ 1 * SIZE(AO2), %mm5 + MOVQ 2 * SIZE(AO2), %mm6 + MOVQ 3 * SIZE(AO2), %mm7 + + prefetchw (WPREFETCHSIZE + 0) * SIZE(B) + MOVNTQ %mm0, 0 * SIZE(BO) + MOVNTQ %mm1, 1 * SIZE(BO) + MOVNTQ %mm2, 2 * SIZE(BO) + MOVNTQ %mm3, 3 * SIZE(BO) + MOVNTQ %mm4, 4 * SIZE(BO) + MOVNTQ %mm5, 5 * SIZE(BO) + MOVNTQ %mm6, 6 * SIZE(BO) + MOVNTQ %mm7, 7 * SIZE(BO) + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + leaq (BO, M8, 4), BO + decq I + jg .L22 + ALIGN_4 + +.L23: + testq $2, N + jle .L24 + + MOVQ 0 * SIZE(AO1), %mm0 + MOVQ 1 * SIZE(AO1), %mm1 + MOVQ 0 * SIZE(AO2), %mm2 + MOVQ 1 * SIZE(AO2), %mm3 + + MOVNTQ %mm0, 0 * SIZE(BO1) + MOVNTQ %mm1, 1 * SIZE(BO1) + MOVNTQ %mm2, 2 * SIZE(BO1) + MOVNTQ %mm3, 3 * SIZE(BO1) + + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + addq $4 * SIZE, BO1 + ALIGN_4 + +.L24: + testq $1, N + jle .L30 + + MOVQ 0 * SIZE(AO1), %mm0 + MOVQ 0 * SIZE(AO2), %mm1 + + MOVNTQ %mm0, 0 * SIZE(BO2) + MOVNTQ %mm1, 1 * SIZE(BO2) + + addq $2 * SIZE, BO2 + ALIGN_4 + +.L30: + testq $1, M + jle .L999 + ALIGN_4 + +.L31: + movq A, AO1 
+ movq B, BO + + movq N, I + sarq $2, I + jle .L33 + ALIGN_4 + +.L32: + MOVQ 0 * SIZE(AO1), %mm0 + MOVQ 1 * SIZE(AO1), %mm1 + MOVQ 2 * SIZE(AO1), %mm2 + MOVQ 3 * SIZE(AO1), %mm3 + + MOVNTQ %mm0, 0 * SIZE(BO) + MOVNTQ %mm1, 1 * SIZE(BO) + MOVNTQ %mm2, 2 * SIZE(BO) + MOVNTQ %mm3, 3 * SIZE(BO) + + addq $4 * SIZE, AO1 + leaq (BO, M8, 4), BO + decq I + jg .L32 + ALIGN_4 + +.L33: + testq $2, N + jle .L34 + + MOVQ 0 * SIZE(AO1), %mm0 + MOVQ 1 * SIZE(AO1), %mm1 + + MOVNTQ %mm0, 0 * SIZE(BO1) + MOVNTQ %mm1, 1 * SIZE(BO1) + + addq $2 * SIZE, AO1 + addq $2 * SIZE, BO1 + ALIGN_4 + +.L34: + testq $1, N + jle .L999 + + MOVQ 0 * SIZE(AO1), %mm0 + MOVNTQ %mm0, 0 * SIZE(BO2) + + addq $1 * SIZE, BO2 + ALIGN_4 + +.L999: + EMMS + +#ifdef WINDOWS_ABI + movups 0(%rsp), %xmm6 + movups 16(%rsp), %xmm7 + movups 32(%rsp), %xmm8 + movups 48(%rsp), %xmm9 + movups 64(%rsp), %xmm10 + movups 80(%rsp), %xmm11 + movups 96(%rsp), %xmm12 + movups 112(%rsp), %xmm13 + movups 128(%rsp), %xmm14 + movups 144(%rsp), %xmm15 + + addq $STACKSIZE, %rsp +#endif + + popq %rbx + popq %rbp + popq %r12 + popq %r13 + popq %r14 + popq %r15 +#ifdef WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + + EPILOGUE diff --git a/kernel/x86_64/iamax.S b/kernel/x86_64/iamax.S new file mode 100644 index 0000000000..27637c53d6 --- /dev/null +++ b/kernel/x86_64/iamax.S @@ -0,0 +1,352 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
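Both 4-wide transpose-copy variants above (the SSE gemm_tcopy_4.S and the MMX gemm_tcopy_4_opteron.S for Barcelona/Shanghai) emit the same layout: 4x4 tiles strided by 4*m in B, a staging area at B + m*(n & ~3) for the n%4 >= 2 leftovers and another at B + m*(n & ~1) for a final odd element. Below is a hedged sketch of the main loop only, with the m%4 tails (the .L20/.L30 paths) omitted for brevity; the names are assumptions:

#ifndef FLOAT
#define FLOAT double                    /* stand-in for the definition in common.h */
#endif

/* Sketch of the j-loop only; the m % 4 leftovers follow the same
   pattern with two and then one panel pointer.                    */
void gemm_tcopy_4_main(long m, long n, const FLOAT *a, long lda, FLOAT *b)
{
  const FLOAT *a1, *a2, *a3, *a4;
  FLOAT *bo;
  FLOAT *b1 = b + m * (n & ~3L);        /* staging area for n % 4 >= 2 */
  FLOAT *b2 = b + m * (n & ~1L);        /* staging area for odd n      */
  long i, j;

  for (j = 0; j < (m >> 2); j++) {
    a1 = a;  a2 = a + lda;  a3 = a + 2 * lda;  a4 = a + 3 * lda;
    a += 4 * lda;
    bo = b;  b += 16;                   /* this pass's first 4x4 tile  */

    for (i = 0; i < (n >> 2); i++) {    /* 4x4 tiles, strided by 4*m   */
      bo[ 0] = a1[0];  bo[ 1] = a1[1];  bo[ 2] = a1[2];  bo[ 3] = a1[3];
      bo[ 4] = a2[0];  bo[ 5] = a2[1];  bo[ 6] = a2[2];  bo[ 7] = a2[3];
      bo[ 8] = a3[0];  bo[ 9] = a3[1];  bo[10] = a3[2];  bo[11] = a3[3];
      bo[12] = a4[0];  bo[13] = a4[1];  bo[14] = a4[2];  bo[15] = a4[3];
      a1 += 4;  a2 += 4;  a3 += 4;  a4 += 4;
      bo += 4 * m;
    }
    if (n & 2) {                        /* 4x2 leftover tile           */
      b1[0] = a1[0];  b1[1] = a1[1];  b1[2] = a2[0];  b1[3] = a2[1];
      b1[4] = a3[0];  b1[5] = a3[1];  b1[6] = a4[0];  b1[7] = a4[1];
      a1 += 2;  a2 += 2;  a3 += 2;  a4 += 2;
      b1 += 8;
    }
    if (n & 1) {                        /* final odd element per panel */
      b2[0] = *a1;  b2[1] = *a2;  b2[2] = *a3;  b2[3] = *a4;
      b2 += 4;
    }
  }
}

The two assembly versions differ only in how they move the data: the generic file uses 128-bit SSE loads and stores with per-CPU PREFETCH/PREFETCHW hints, while the Opteron file moves 64 bits at a time through the MMX registers and brackets the routine with EMMS.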
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 +#define X ARG2 +#define INCX ARG3 + +#define RET %rax +#define I ARG4 +#define NUM %r10 + +#ifndef USE_MIN +#define FMOV fcmovbe +#define IMOV cmovnbe +#else +#define FMOV fcmovnbe +#define IMOV cmovb +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + + salq $BASE_SHIFT, INCX + + fldz + xorq RET, RET + + testq M, M + jle .L999 + testq INCX, INCX + jle .L999 + + ffreep %st + movq $2, NUM + movq $1, RET + + FLD (X) +#ifdef USE_ABS + fabs +#endif + addq INCX, X + decq M + jle .L999 + + cmpq $SIZE, INCX + jne .L40 + + movq M, I + sarq $3, I + jle .L20 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + incq NUM + + FLD 1 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + incq NUM + + FLD 2 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + incq NUM + + FLD 3 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + incq NUM + + FLD 4 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + incq NUM + + FLD 5 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + incq NUM + + FLD 6 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + incq NUM + + FLD 7 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + incq NUM + + addq $8 * SIZE, X + + decq I + jg .L10 + ALIGN_4 + +.L20: + movq M, I + andq $7, I + jle .L999 + ALIGN_4 + + +.L21: + FLD 0 * SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + + addq $1 * SIZE, X + incq NUM + decq I + jg .L21 + jmp .L999 + ALIGN_4 + +.L40: + movq M, I + sarq $3, I + jle .L60 + ALIGN_4 + +.L50: + FLD 0 * SIZE(X) + addq INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + incq NUM + + FLD 0 * SIZE(X) + addq INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + incq NUM + + FLD 0 * SIZE(X) + addq INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + incq NUM + + FLD 0 * SIZE(X) + addq INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + incq NUM + + FLD 0 * SIZE(X) + addq INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + incq NUM + + FLD 0 * SIZE(X) + addq INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + incq NUM + + FLD 0 * SIZE(X) + addq INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + incq NUM + + FLD 0 * SIZE(X) + addq INCX, X +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + incq NUM + + decq I + jg .L50 + ALIGN_4 + +.L60: + movq M, I + andq $7, I + jle .L999 + ALIGN_4 + + +.L61: + FLD 0 * 
SIZE(X) +#ifdef USE_ABS + fabs +#endif + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + incq NUM + + addq INCX, X + decq I + jg .L61 + ALIGN_4 + +.L999: + ffreep %st + ret + + EPILOGUE diff --git a/kernel/x86_64/iamax_sse.S b/kernel/x86_64/iamax_sse.S new file mode 100644 index 0000000000..8b7de07f22 --- /dev/null +++ b/kernel/x86_64/iamax_sse.S @@ -0,0 +1,1020 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
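The x87 kernel that ends above computes i?amax (and, with USE_MIN/USE_ABS, the amin and non-absolute variants) in a single pass: the running extreme lives on the FPU stack, each new |x[i]| is compared with fcomi, fcmovbe/fcmovnbe keeps the better value, and cmovnbe/cmovb copies the running 1-based position NUM into RET only on a strict improvement, so ties keep the earliest index. A scalar reference of that behaviour for the absolute-maximum build (function and macro names are assumptions; FLOAT normally comes from common.h):

#include <math.h>

#ifndef FLOAT
#define FLOAT double
#endif
#define FABS(x) fabs(x)                 /* the USE_ABS build; the plain max kernel omits it */

/* Returns the 1-based index of the first element with the largest |x[i]|,
   or 0 when n <= 0 or incx <= 0, matching the early exits above.         */
long iamax_ref(long n, const FLOAT *x, long incx)
{
  long i, ret;
  FLOAT best, v;

  if (n <= 0 || incx <= 0) return 0;

  ret  = 1;
  best = FABS(x[0]);
  x += incx;

  for (i = 2; i <= n; i++) {            /* NUM starts at 2 in the assembly    */
    v = FABS(*x);
    if (v > best) {                     /* strict '>' keeps the first maximum */
      best = v;
      ret  = i;
    }
    x += incx;
  }
  return ret;
}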
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ + +#define RET %rax +#define I ARG4 +#define XX %r10 +#define MM %r11 + +#ifdef USE_MIN +#define maxps minps +#define maxss minss +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + + SAVEREGISTERS + + pxor %xmm0, %xmm0 /* Return Value(Float) */ + xor RET, RET /* Return Value(Int) */ + testq M, M + jle .L999 + leaq (, INCX, SIZE), INCX + testq INCX, INCX + jle .L999 + + movq M, MM + movq X, XX + +#ifdef USE_ABS + pcmpeqb %xmm15, %xmm15 + psrld $1, %xmm15 /* Generate USE_ABS */ +#endif + + movss (X), %xmm0 + addq INCX, X + decq M + shufps $0, %xmm0, %xmm0 +#ifdef USE_ABS + andps %xmm15, %xmm0 +#endif + movaps %xmm0, %xmm1 + movaps %xmm0, %xmm2 + movaps %xmm0, %xmm3 /* Generating "seed value" */ + cmpq $SIZE, INCX + jne .L80 /* Incx != 1 goto L80 */ + +/* Analigned Check */ + testq $3, X /* 00000011 */ + jne .L30 /* Purely Unaligned Mode */ + + cmpq $8, M + jle .L30 /* if M <= 8 goto Unaligned mode */ + + testq $4, X /* bit test 000100 */ + je .L05 + + movss 0 * SIZE(X), %xmm4 +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxss %xmm4, %xmm0 + decq M + addq $SIZE, X + ALIGN_3 + +.L05: + testq $8, X + je .L06 + + movsd 0 * SIZE(X), %xmm4 + unpcklps %xmm4, %xmm4 +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxps %xmm4, %xmm1 + subq $2, M + addq $2 * SIZE, X + ALIGN_3 + +.L06: + movq M, I + sarq $4, I + jle .L15 + ALIGN_4 + +.L11: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps 0 * SIZE(X), %xmm4 +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxps %xmm4, %xmm0 + + movaps 4 * SIZE(X), %xmm5 +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxps %xmm5, %xmm1 + + movaps 8 * SIZE(X), %xmm6 +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxps %xmm6, %xmm2 + + movaps 12 * SIZE(X), %xmm7 +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxps %xmm7, %xmm3 + + addq $16 * SIZE, X + decq I + jg .L11 + ALIGN_4 + +.L15: + andq $15, M + jle .L20 + + testq $8, M + je .L16 + + movaps 0 * SIZE(X), %xmm4 +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxps %xmm4, %xmm0 + + movaps 4 * SIZE(X), %xmm5 +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxps %xmm5, %xmm1 + addq $8 * SIZE, X + ALIGN_3 + +.L16: + testq $4, M + je .L17 + + movaps 0 * SIZE(X), %xmm6 +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxps %xmm6, %xmm2 + addq $4 * SIZE, X + ALIGN_3 + +.L17: + testq $2, M + je .L18 + + movsd 0 * SIZE(X), %xmm7 + unpcklps %xmm7, %xmm7 +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxps %xmm7, %xmm3 + addq $2 * SIZE, X + +.L18: + testq $1, M + je .L20 + + movss 0 * SIZE(X), %xmm4 +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxss %xmm4, %xmm0 + ALIGN_3 + +.L20: + movq XX, X + movq MM, M + + maxps %xmm1, %xmm0 + maxps %xmm3, %xmm2 + maxps %xmm2, %xmm0 + movaps %xmm0, %xmm1 + movhlps %xmm0, %xmm0 + maxps %xmm1, %xmm0 + movaps %xmm0, %xmm1 + shufps $1, %xmm0, %xmm0 + maxss %xmm1, %xmm0 + shufps $0, %xmm0, %xmm0 + + testq $4, X + je .L21 + + movss 0 * SIZE(X), %xmm1 + + decq M + addq $SIZE, X + +#ifdef USE_ABS + andps %xmm15, %xmm1 +#endif + incq RET + comiss %xmm0, %xmm1 + je .L999 + ALIGN_3 + +.L21: + testq $8, X + je .L22 + + movss 0 * SIZE(X), %xmm1 + movss 1 * SIZE(X), %xmm2 + + subq $2, M + addq $2 * SIZE, X + +#ifdef USE_ABS + andps %xmm15, %xmm1 + andps %xmm15, %xmm2 +#endif + incq RET + comiss %xmm0, %xmm1 + je .L999 + incq RET + comiss %xmm0, %xmm2 + je .L999 + 
ALIGN_3 + +.L22: + movq M, I + sarq $3, I + jle .L25 + ALIGN_4 + +.L23: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps 0 * SIZE(X), %xmm1 +#ifdef USE_ABS + andps %xmm15, %xmm1 +#endif + cmpeqps %xmm0, %xmm1 + + movaps 4 * SIZE(X), %xmm3 +#ifdef USE_ABS + andps %xmm15, %xmm3 +#endif + cmpeqps %xmm0, %xmm3 + + orps %xmm3, %xmm1 +#ifndef C_SUN + movmskps %xmm1, %r11 +#else + .long 0xd9500f4c +#endif + testq $15, %r11 + jne .L24 + + addq $8 * SIZE, X + addq $8, RET + decq I + jg .L23 + jmp .L25 + ALIGN_3 + +.L24: + movss 0 * SIZE(X), %xmm1 + movss 1 * SIZE(X), %xmm2 + movss 2 * SIZE(X), %xmm3 + movss 3 * SIZE(X), %xmm4 + movss 4 * SIZE(X), %xmm5 + movss 5 * SIZE(X), %xmm6 + movss 6 * SIZE(X), %xmm7 + movss 7 * SIZE(X), %xmm8 +#ifdef USE_ABS + andps %xmm15, %xmm1 + andps %xmm15, %xmm2 + andps %xmm15, %xmm3 + andps %xmm15, %xmm4 + andps %xmm15, %xmm5 + andps %xmm15, %xmm6 + andps %xmm15, %xmm7 + andps %xmm15, %xmm8 +#endif + + addq $8 * SIZE, X + + incq RET + comiss %xmm0, %xmm1 + je .L999 + incq RET + comiss %xmm0, %xmm2 + je .L999 + incq RET + comiss %xmm0, %xmm3 + je .L999 + incq RET + comiss %xmm0, %xmm4 + je .L999 + incq RET + comiss %xmm0, %xmm5 + je .L999 + incq RET + comiss %xmm0, %xmm6 + je .L999 + incq RET + comiss %xmm0, %xmm7 + je .L999 + incq RET + jmp .L999 + ALIGN_4 + +.L25: + testq $4, M + je .L26 + + movss 0 * SIZE(X), %xmm1 + movss 1 * SIZE(X), %xmm2 + movss 2 * SIZE(X), %xmm3 + movss 3 * SIZE(X), %xmm4 +#ifdef USE_ABS + andps %xmm15, %xmm1 + andps %xmm15, %xmm2 + andps %xmm15, %xmm3 + andps %xmm15, %xmm4 +#endif + addq $4 * SIZE, X + incq RET + comiss %xmm0, %xmm1 + je .L999 + incq RET + comiss %xmm0, %xmm2 + je .L999 + incq RET + comiss %xmm0, %xmm3 + je .L999 + incq RET + comiss %xmm0, %xmm4 + je .L999 + ALIGN_3 + +.L26: + testq $2, M + je .L27 + + movss 0 * SIZE(X), %xmm1 + movss 1 * SIZE(X), %xmm2 +#ifdef USE_ABS + andps %xmm15, %xmm1 + andps %xmm15, %xmm2 +#endif + addq $2 * SIZE, X + incq RET + comiss %xmm0, %xmm1 + je .L999 + incq RET + comiss %xmm0, %xmm2 + je .L999 + ALIGN_3 + +.L27: + incq RET + jmp .L999 + ALIGN_3 + +/* Unaligned Mode */ +.L30: + movq M, I + sarq $4, I + jle .L35 + ALIGN_4 + +.L31: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd 0 * SIZE(X), %xmm4 + movhps 2 * SIZE(X), %xmm4 +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxps %xmm4, %xmm0 + + movsd 4 * SIZE(X), %xmm5 + movhps 6 * SIZE(X), %xmm5 +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxps %xmm5, %xmm1 + + movsd 8 * SIZE(X), %xmm6 + movhps 10 * SIZE(X), %xmm6 +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxps %xmm6, %xmm2 + + movsd 12 * SIZE(X), %xmm7 + movhps 14 * SIZE(X), %xmm7 +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxps %xmm7, %xmm3 + + addq $16 * SIZE, X + decq I + jg .L31 + ALIGN_4 + +.L35: + andq $15, M + jle .L40 + + testq $8, M + je .L36 + + movsd 0 * SIZE(X), %xmm4 + movhps 2 * SIZE(X), %xmm4 +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxps %xmm4, %xmm0 + + movsd 4 * SIZE(X), %xmm5 + movhps 6 * SIZE(X), %xmm5 +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxps %xmm5, %xmm1 + + addq $8 * SIZE, X + ALIGN_3 + +.L36: + testq $4, M + je .L37 + + movsd 0 * SIZE(X), %xmm6 + movhps 2 * SIZE(X), %xmm6 +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxps %xmm6, %xmm2 + addq $4 * SIZE, X + ALIGN_3 + +.L37: + testq $2, M + je .L38 + + movsd 0 * SIZE(X), %xmm7 + unpcklps %xmm7, %xmm7 +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxps %xmm7, %xmm3 + addq $2 * SIZE, X + +.L38: + testq $1, M + je .L40 + + 
movss 0 * SIZE(X), %xmm4 +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxss %xmm4, %xmm0 + jmp .L40 + ALIGN_4 + +.L40: + movq XX, X + movq MM, M + + maxps %xmm1, %xmm0 + maxps %xmm3, %xmm2 + maxps %xmm2, %xmm0 + movaps %xmm0, %xmm1 + movhlps %xmm0, %xmm0 + maxps %xmm1, %xmm0 + movaps %xmm0, %xmm1 + shufps $1, %xmm0, %xmm0 + maxss %xmm1, %xmm0 + shufps $0, %xmm0, %xmm0 + + movq M, I + sarq $3, I + jle .L45 + ALIGN_4 + +.L43: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd 0 * SIZE(X), %xmm1 + movhps 2 * SIZE(X), %xmm1 +#ifdef USE_ABS + andps %xmm15, %xmm1 +#endif + cmpeqps %xmm0, %xmm1 + + movsd 4 * SIZE(X), %xmm3 + movhps 6 * SIZE(X), %xmm3 +#ifdef USE_ABS + andps %xmm15, %xmm3 +#endif + cmpeqps %xmm0, %xmm3 + + orps %xmm3, %xmm1 +#ifndef C_SUN + movmskps %xmm1, %r11 +#else + .long 0xd9500f4c +#endif + testq $15, %r11 + jne .L44 + + addq $8 * SIZE, X + addq $8, RET + decq I + jg .L43 + jmp .L45 + ALIGN_3 + +.L44: + movss 0 * SIZE(X), %xmm1 + movss 1 * SIZE(X), %xmm2 + movss 2 * SIZE(X), %xmm3 + movss 3 * SIZE(X), %xmm4 + movss 4 * SIZE(X), %xmm5 + movss 5 * SIZE(X), %xmm6 + movss 6 * SIZE(X), %xmm7 + movss 7 * SIZE(X), %xmm8 +#ifdef USE_ABS + andps %xmm15, %xmm1 + andps %xmm15, %xmm2 + andps %xmm15, %xmm3 + andps %xmm15, %xmm4 + andps %xmm15, %xmm5 + andps %xmm15, %xmm6 + andps %xmm15, %xmm7 + andps %xmm15, %xmm8 +#endif + + addq $8 * SIZE, X + + incq RET + comiss %xmm0, %xmm1 + je .L999 + incq RET + comiss %xmm0, %xmm2 + je .L999 + incq RET + comiss %xmm0, %xmm3 + je .L999 + incq RET + comiss %xmm0, %xmm4 + je .L999 + incq RET + comiss %xmm0, %xmm5 + je .L999 + incq RET + comiss %xmm0, %xmm6 + je .L999 + incq RET + comiss %xmm0, %xmm7 + je .L999 + incq RET + jmp .L999 + ALIGN_4 + +.L45: + testq $4, M + je .L46 + + movss 0 * SIZE(X), %xmm1 + movss 1 * SIZE(X), %xmm2 + movss 2 * SIZE(X), %xmm3 + movss 3 * SIZE(X), %xmm4 +#ifdef USE_ABS + andps %xmm15, %xmm1 + andps %xmm15, %xmm2 + andps %xmm15, %xmm3 + andps %xmm15, %xmm4 +#endif + addq $4 * SIZE, X + incq RET + comiss %xmm0, %xmm1 + je .L999 + incq RET + comiss %xmm0, %xmm2 + je .L999 + incq RET + comiss %xmm0, %xmm3 + je .L999 + incq RET + comiss %xmm0, %xmm4 + je .L999 + ALIGN_3 + +.L46: + testq $2, M + je .L47 + + movss 0 * SIZE(X), %xmm1 + movss 1 * SIZE(X), %xmm2 +#ifdef USE_ABS + andps %xmm15, %xmm1 + andps %xmm15, %xmm2 +#endif + addq $2 * SIZE, X + incq RET + comiss %xmm0, %xmm1 + je .L999 + incq RET + comiss %xmm0, %xmm2 + je .L999 + ALIGN_3 + +.L47: + incq RET + jmp .L999 + ALIGN_3 + +.L80: + movq M, I + sarq $3, I + jle .L85 + ALIGN_4 + +.L81: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss 0 * SIZE(X), %xmm4 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxss %xmm4, %xmm0 + + movss 0 * SIZE(X), %xmm5 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxss %xmm5, %xmm1 + + movss 0 * SIZE(X), %xmm6 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxss %xmm6, %xmm2 + + movss 0 * SIZE(X), %xmm7 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxss %xmm7, %xmm3 + + movss 0 * SIZE(X), %xmm4 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxss %xmm4, %xmm0 + + movss 0 * SIZE(X), %xmm5 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxss %xmm5, %xmm1 + + movss 0 * SIZE(X), %xmm6 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxss %xmm6, %xmm2 + + movss 0 * SIZE(X), %xmm7 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxss %xmm7, %xmm3 + + decq I 
+ jg .L81 + ALIGN_4 + +.L85: + andq $7, M + jle .L90 + + testq $4, M + je .L86 + + movss 0 * SIZE(X), %xmm4 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxss %xmm4, %xmm0 + + movss 0 * SIZE(X), %xmm5 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxss %xmm5, %xmm1 + + movss 0 * SIZE(X), %xmm6 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxss %xmm6, %xmm2 + + movss 0 * SIZE(X), %xmm7 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + maxss %xmm7, %xmm3 + ALIGN_3 + +.L86: + testq $2, M + je .L87 + + movss 0 * SIZE(X), %xmm4 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + maxss %xmm4, %xmm0 + + movss 0 * SIZE(X), %xmm5 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + maxss %xmm5, %xmm1 + ALIGN_3 + +.L87: + testq $1, M + je .L90 + + movss 0 * SIZE(X), %xmm6 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + maxss %xmm6, %xmm2 + ALIGN_4 + +.L90: + movq XX, X + movq MM, M + + maxss %xmm1, %xmm0 + maxss %xmm3, %xmm2 + maxss %xmm2, %xmm0 + shufps $0, %xmm0, %xmm0 + + movq M, I + sarq $3, I + jle .L95 + ALIGN_4 + +.L93: + movss 0 * SIZE(X), %xmm1 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm1 +#endif + cmpeqss %xmm0, %xmm1 + + movss 0 * SIZE(X), %xmm2 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm2 +#endif + cmpeqss %xmm0, %xmm2 + + movss 0 * SIZE(X), %xmm3 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm3 +#endif + cmpeqss %xmm0, %xmm3 + + movss 0 * SIZE(X), %xmm4 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm4 +#endif + cmpeqss %xmm0, %xmm4 + + movss 0 * SIZE(X), %xmm5 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm5 +#endif + cmpeqps %xmm0, %xmm5 + + movss 0 * SIZE(X), %xmm6 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm6 +#endif + cmpeqss %xmm0, %xmm6 + + movss 0 * SIZE(X), %xmm7 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm7 +#endif + cmpeqss %xmm0, %xmm7 + + movss 0 * SIZE(X), %xmm8 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm8 +#endif + cmpeqss %xmm0, %xmm8 + + orps %xmm2, %xmm1 + orps %xmm4, %xmm3 + orps %xmm6, %xmm5 + orps %xmm8, %xmm7 + orps %xmm3, %xmm1 + orps %xmm7, %xmm5 + orps %xmm5, %xmm1 + +#ifndef C_SUN + movmskps %xmm1, %r11 +#else + .long 0xd9500f4c +#endif + testq $15, %r11 + jne .L94 + + addq $8, RET + decq I + jg .L93 + jmp .L95 + ALIGN_3 + +.L94: + subq INCX, X + movss 0 * SIZE(X), %xmm8 + subq INCX, X + movss 0 * SIZE(X), %xmm7 + subq INCX, X + movss 0 * SIZE(X), %xmm6 + subq INCX, X + movss 0 * SIZE(X), %xmm5 + subq INCX, X + movss 0 * SIZE(X), %xmm4 + subq INCX, X + movss 0 * SIZE(X), %xmm3 + subq INCX, X + movss 0 * SIZE(X), %xmm2 + subq INCX, X + movss 0 * SIZE(X), %xmm1 +#ifdef USE_ABS + andps %xmm15, %xmm1 + andps %xmm15, %xmm2 + andps %xmm15, %xmm3 + andps %xmm15, %xmm4 + andps %xmm15, %xmm5 + andps %xmm15, %xmm6 + andps %xmm15, %xmm7 + andps %xmm15, %xmm8 +#endif + incq RET + comiss %xmm0, %xmm1 + je .L999 + incq RET + comiss %xmm0, %xmm2 + je .L999 + incq RET + comiss %xmm0, %xmm3 + je .L999 + incq RET + comiss %xmm0, %xmm4 + je .L999 + incq RET + comiss %xmm0, %xmm5 + je .L999 + incq RET + comiss %xmm0, %xmm6 + je .L999 + incq RET + comiss %xmm0, %xmm7 + je .L999 + incq RET + jmp .L999 + ALIGN_4 + +.L95: + testq $4, M + je .L96 + + movss 0 * SIZE(X), %xmm1 + addq INCX, X + movss 0 * SIZE(X), %xmm2 + addq INCX, X + movss 0 * SIZE(X), %xmm3 + addq INCX, X + movss 0 * SIZE(X), %xmm4 + addq INCX, X + +#ifdef USE_ABS + andps %xmm15, %xmm1 + andps %xmm15, %xmm2 + andps %xmm15, %xmm3 + andps %xmm15, %xmm4 +#endif + incq RET + 
comiss %xmm0, %xmm1 + je .L999 + incq RET + comiss %xmm0, %xmm2 + je .L999 + incq RET + comiss %xmm0, %xmm3 + je .L999 + incq RET + comiss %xmm0, %xmm4 + je .L999 + ALIGN_3 + +.L96: + testq $2, M + je .L97 + + movss 0 * SIZE(X), %xmm1 + addq INCX, X + movss 0 * SIZE(X), %xmm2 + addq INCX, X +#ifdef USE_ABS + andps %xmm15, %xmm1 + andps %xmm15, %xmm2 +#endif + incq RET + comiss %xmm0, %xmm1 + je .L999 + incq RET + comiss %xmm0, %xmm2 + je .L999 + ALIGN_3 + +.L97: + incq RET + ALIGN_3 + +.L999: + RESTOREREGISTERS + + ret + + EPILOGUE + diff --git a/kernel/x86_64/iamax_sse2.S b/kernel/x86_64/iamax_sse2.S new file mode 100644 index 0000000000..c17a81ab90 --- /dev/null +++ b/kernel/x86_64/iamax_sse2.S @@ -0,0 +1,1136 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
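iamax_sse.S above and the iamax_sse2.S kernel that follows replace the single-pass x87 scheme with two vectorized passes: pass one reduces the whole vector to the extreme absolute value with maxps/maxpd (in the USE_ABS build the sign bit is cleared by the all-ones-shifted-right-by-one mask kept in %xmm15), and pass two restarts from the saved X/M (the XX/MM copies) and rescans eight elements per iteration with cmpeqps/cmpeqpd plus movmskps, dropping to scalar comiss/comisd compares only inside the first block that reports a hit. A scalar model of the two-pass idea (names are assumptions):

#include <math.h>

#ifndef FLOAT
#define FLOAT double
#endif
#define FABS(x) fabs(x)

/* Two passes: reduce to the extreme value, then locate its first index.
   The vector kernels run pass one sixteen elements per iteration and
   pass two eight per iteration, but the result is the same.            */
long iamax_twopass_ref(long n, const FLOAT *x, long incx)
{
  long i;
  FLOAT best, v;

  if (n <= 0 || incx <= 0) return 0;

  best = FABS(x[0]);                    /* pass 1: value-only reduction     */
  for (i = 1; i < n; i++) {
    v = FABS(x[i * incx]);
    if (v > best) best = v;
  }

  for (i = 0; i < n; i++)               /* pass 2: first index that matches */
    if (FABS(x[i * incx]) == best) return i + 1;

  return n;                             /* not reached with exact compares  */
}

Splitting the value search from the index search is what lets the hot loop stay in packed maxps/maxpd: tracking the index inside a SIMD max would presumably need extra compare and blend work per element, whereas the second pass stops at the first block containing the maximum.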
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ + +#define RET %rax +#define I ARG4 +#define XX %r10 +#define MM %r11 + +#ifdef USE_MIN +#define maxpd minpd +#define maxsd minsd +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + + SAVEREGISTERS + + pxor %xmm0, %xmm0 + xor RET, RET + testq M, M + jle .L999 + leaq (, INCX, SIZE), INCX + testq INCX, INCX + jle .L999 + + movq M, MM + movq X, XX + +#ifdef USE_ABS + pcmpeqb %xmm15, %xmm15 + psrlq $1, %xmm15 +#endif + + movsd (X), %xmm0 + addq INCX, X + decq M +#ifdef USE_ABS + andpd %xmm15, %xmm0 +#endif + unpcklpd %xmm0, %xmm0 + movapd %xmm0, %xmm1 + movapd %xmm0, %xmm2 + movapd %xmm0, %xmm3 + cmpq $SIZE, INCX + jne .L80 + +/* Analigned Check */ + cmpq $7, M + jle .L50 + + testq $7, X + jne .L50 # Purely Unaligned Mode + + testq $15, X # Checking for 128bit align + je .L05 + + movsd 0 * SIZE(X), %xmm4 +#ifdef USE_ABS + andpd %xmm15, %xmm4 +#endif + unpcklpd %xmm4, %xmm4 + maxpd %xmm4, %xmm3 + decq M + addq $SIZE, X + ALIGN_3 + +.L05: + movq M, I + sarq $4, I + jle .L15 + ALIGN_4 + +.L11: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movapd 0 * SIZE(X), %xmm4 +#ifdef USE_ABS + andpd %xmm15, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movapd 2 * SIZE(X), %xmm5 +#ifdef USE_ABS + andpd %xmm15, %xmm5 +#endif + maxpd %xmm5, %xmm1 + + movapd 4 * SIZE(X), %xmm6 +#ifdef USE_ABS + andpd %xmm15, %xmm6 +#endif + maxpd %xmm6, %xmm2 + + movapd 6 * SIZE(X), %xmm7 +#ifdef USE_ABS + andpd %xmm15, %xmm7 +#endif + maxpd %xmm7, %xmm3 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movapd 8 * SIZE(X), %xmm4 +#ifdef USE_ABS + andpd %xmm15, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movapd 10 * SIZE(X), %xmm5 +#ifdef USE_ABS + andpd %xmm15, %xmm5 +#endif + maxpd %xmm5, %xmm1 + + movapd 12 * SIZE(X), %xmm6 +#ifdef USE_ABS + andpd %xmm15, %xmm6 +#endif + maxpd %xmm6, %xmm2 + + movapd 14 * SIZE(X), %xmm7 +#ifdef USE_ABS + andpd %xmm15, %xmm7 +#endif + maxpd %xmm7, %xmm3 + + addq $16 * SIZE, X + decq I + jg .L11 + ALIGN_4 + +.L15: + andq $15, M + jle .L20 + + testq $8, M + je .L16 + + movapd 0 * SIZE(X), %xmm4 +#ifdef USE_ABS + andpd %xmm15, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movapd 2 * SIZE(X), %xmm5 +#ifdef USE_ABS + andpd %xmm15, %xmm5 +#endif + maxpd %xmm5, %xmm1 + + movapd 4 * SIZE(X), %xmm6 +#ifdef USE_ABS + andpd %xmm15, %xmm6 +#endif + maxpd %xmm6, %xmm2 + + movapd 6 * SIZE(X), %xmm7 +#ifdef USE_ABS + andpd %xmm15, %xmm7 +#endif + maxpd %xmm7, %xmm3 + addq $8 * SIZE, X + ALIGN_3 + +.L16: + testq $4, M + je .L17 + + movapd 0 * SIZE(X), %xmm4 +#ifdef USE_ABS + andpd %xmm15, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movapd 2 * SIZE(X), %xmm5 +#ifdef USE_ABS + andpd %xmm15, %xmm5 +#endif + maxpd %xmm5, %xmm1 + addq $4 * SIZE, X + ALIGN_3 + +.L17: + testq $2, M + je .L18 + + movapd 0 * SIZE(X), %xmm6 +#ifdef USE_ABS + andpd %xmm15, %xmm6 +#endif + maxpd %xmm6, %xmm2 + addq $2 * SIZE, X + +.L18: + testq $1, M + je .L20 + + movsd 0 * SIZE(X), %xmm7 +#ifdef USE_ABS + andpd %xmm15, %xmm7 +#endif + unpcklpd %xmm7, %xmm7 + maxpd %xmm7, %xmm3 + ALIGN_3 + +/* Finding Index */ +.L20: + movq XX, X + movq MM, M + + maxpd %xmm1, %xmm0 + maxpd %xmm3, %xmm2 + maxpd %xmm2, %xmm0 + movapd %xmm0, %xmm1 + unpckhpd %xmm0, %xmm0 + maxsd %xmm1, %xmm0 + unpcklpd %xmm0, %xmm0 + ALIGN_3 + + testq $15, X # Checking for 128bit align + je .L21 + + movsd 0 * 
SIZE(X), %xmm1 +#ifdef USE_ABS + andpd %xmm15, %xmm1 +#endif + incq RET + comisd %xmm0, %xmm1 + je .L999 + addq $SIZE, X + decq M + ALIGN_3 + +.L21: + movq M, I + sarq $3, I + jle .L25 + ALIGN_4 + +.L22: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movapd 0 * SIZE(X), %xmm1 +#ifdef USE_ABS + andpd %xmm15, %xmm1 +#endif + cmpeqpd %xmm0, %xmm1 + + movapd 2 * SIZE(X), %xmm3 +#ifdef USE_ABS + andpd %xmm15, %xmm3 +#endif + cmpeqpd %xmm0, %xmm3 + + movapd 4 * SIZE(X), %xmm5 +#ifdef USE_ABS + andpd %xmm15, %xmm5 +#endif + cmpeqpd %xmm0, %xmm5 + + movapd 6 * SIZE(X), %xmm7 +#ifdef USE_ABS + andpd %xmm15, %xmm7 +#endif + cmpeqpd %xmm0, %xmm7 + + orpd %xmm3, %xmm1 + orpd %xmm7, %xmm5 + orpd %xmm5, %xmm1 +#ifndef C_SUN + movmskpd %xmm1, %r11 +#else + .byte 0x66 + .long 0xd9500f4c +#endif + testq $3, %r11 + jne .L23 + + addq $8 * SIZE, X + addq $8, RET + decq I + jg .L22 + jmp .L25 + ALIGN_4 + +.L23: + movsd 0 * SIZE(X), %xmm1 + movsd 1 * SIZE(X), %xmm2 + movsd 2 * SIZE(X), %xmm3 + movsd 3 * SIZE(X), %xmm4 + movsd 4 * SIZE(X), %xmm5 + movsd 5 * SIZE(X), %xmm6 + movsd 6 * SIZE(X), %xmm7 + movsd 7 * SIZE(X), %xmm8 +#ifdef USE_ABS + andpd %xmm15, %xmm1 + andpd %xmm15, %xmm2 + andpd %xmm15, %xmm3 + andpd %xmm15, %xmm4 + andpd %xmm15, %xmm5 + andpd %xmm15, %xmm6 + andpd %xmm15, %xmm7 + andpd %xmm15, %xmm8 +#endif + + addq $8 * SIZE, X + + incq RET + comisd %xmm0, %xmm1 + je .L999 + incq RET + comisd %xmm0, %xmm2 + je .L999 + incq RET + comisd %xmm0, %xmm3 + je .L999 + incq RET + comisd %xmm0, %xmm4 + je .L999 + incq RET + comisd %xmm0, %xmm5 + je .L999 + incq RET + comisd %xmm0, %xmm6 + je .L999 + incq RET + comisd %xmm0, %xmm7 + je .L999 + incq RET + jmp .L999 + ALIGN_3 + +.L25: + testq $4, M + je .L27 + + movsd 0 * SIZE(X), %xmm1 + movsd 1 * SIZE(X), %xmm2 + movsd 2 * SIZE(X), %xmm3 + movsd 3 * SIZE(X), %xmm4 +#ifdef USE_ABS + andpd %xmm15, %xmm1 + andpd %xmm15, %xmm2 + andpd %xmm15, %xmm3 + andpd %xmm15, %xmm4 +#endif + addq $4 * SIZE, X + incq RET + comisd %xmm0, %xmm1 + je .L999 + incq RET + comisd %xmm0, %xmm2 + je .L999 + incq RET + comisd %xmm0, %xmm3 + je .L999 + incq RET + comisd %xmm0, %xmm4 + je .L999 + ALIGN_3 + +.L27: + testq $2, M + je .L28 + + movsd 0 * SIZE(X), %xmm1 + movsd 1 * SIZE(X), %xmm2 +#ifdef USE_ABS + andpd %xmm15, %xmm1 + andpd %xmm15, %xmm2 +#endif + addq $2 * SIZE, X + incq RET + comisd %xmm0, %xmm1 + je .L999 + incq RET + comisd %xmm0, %xmm2 + je .L999 + ALIGN_3 + +.L28: + incq RET + jmp .L999 + ALIGN_3 + +/* Unaligned Mode */ +.L50: + movq M, I + sarq $4, I + jle .L55 + ALIGN_4 + +.L51: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd 0 * SIZE(X), %xmm4 + movhpd 1 * SIZE(X), %xmm4 +#ifdef USE_ABS + andpd %xmm15, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movsd 2 * SIZE(X), %xmm5 + movhpd 3 * SIZE(X), %xmm5 +#ifdef USE_ABS + andpd %xmm15, %xmm5 +#endif + maxpd %xmm5, %xmm1 + + movsd 4 * SIZE(X), %xmm6 + movhpd 5 * SIZE(X), %xmm6 +#ifdef USE_ABS + andpd %xmm15, %xmm6 +#endif + maxpd %xmm6, %xmm2 + + movsd 6 * SIZE(X), %xmm7 + movhpd 7 * SIZE(X), %xmm7 +#ifdef USE_ABS + andpd %xmm15, %xmm7 +#endif + maxpd %xmm7, %xmm3 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movsd 8 * SIZE(X), %xmm4 + movhpd 9 * SIZE(X), %xmm4 +#ifdef USE_ABS + andpd %xmm15, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movsd 10 * SIZE(X), %xmm5 + movhpd 11 * SIZE(X), %xmm5 +#ifdef USE_ABS + andpd %xmm15, %xmm5 +#endif + maxpd %xmm5, %xmm1 + + movsd 12 * SIZE(X), %xmm6 + movhpd 13 * SIZE(X), %xmm6 
+#ifdef USE_ABS + andpd %xmm15, %xmm6 +#endif + maxpd %xmm6, %xmm2 + + movsd 14 * SIZE(X), %xmm7 + movhpd 15 * SIZE(X), %xmm7 +#ifdef USE_ABS + andpd %xmm15, %xmm7 +#endif + maxpd %xmm7, %xmm3 + + addq $16 * SIZE, X + decq I + jg .L51 + ALIGN_4 + +.L55: + andq $15, M + jle .L60 + + testq $8, M + je .L56 + + movsd 0 * SIZE(X), %xmm4 + movhpd 1 * SIZE(X), %xmm4 +#ifdef USE_ABS + andpd %xmm15, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movsd 2 * SIZE(X), %xmm5 + movhpd 3 * SIZE(X), %xmm5 +#ifdef USE_ABS + andpd %xmm15, %xmm5 +#endif + maxpd %xmm5, %xmm1 + + movsd 4 * SIZE(X), %xmm6 + movhpd 5 * SIZE(X), %xmm6 +#ifdef USE_ABS + andpd %xmm15, %xmm6 +#endif + maxpd %xmm6, %xmm2 + + movsd 6 * SIZE(X), %xmm7 + movhpd 7 * SIZE(X), %xmm7 +#ifdef USE_ABS + andpd %xmm15, %xmm7 +#endif + maxpd %xmm7, %xmm3 + + addq $8 * SIZE, X + ALIGN_3 + +.L56: + testq $4, M + je .L57 + + movsd 0 * SIZE(X), %xmm4 + movhpd 1 * SIZE(X), %xmm4 +#ifdef USE_ABS + andpd %xmm15, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movsd 2 * SIZE(X), %xmm5 + movhpd 3 * SIZE(X), %xmm5 +#ifdef USE_ABS + andpd %xmm15, %xmm5 +#endif + maxpd %xmm5, %xmm1 + addq $4 * SIZE, X + ALIGN_3 + +.L57: + testq $2, M + je .L58 + + movsd 0 * SIZE(X), %xmm6 + movhpd 1 * SIZE(X), %xmm6 +#ifdef USE_ABS + andpd %xmm15, %xmm6 +#endif + maxpd %xmm6, %xmm2 + addq $2 * SIZE, X + +.L58: + testq $1, M + je .L60 + + movsd 0 * SIZE(X), %xmm7 + unpcklpd %xmm7, %xmm7 +#ifdef USE_ABS + andpd %xmm15, %xmm7 +#endif + maxpd %xmm7, %xmm3 + ALIGN_3 + +.L60: + movq XX, X + movq MM, M + + maxpd %xmm1, %xmm0 + maxpd %xmm3, %xmm2 + maxpd %xmm2, %xmm0 + movapd %xmm0, %xmm1 + unpckhpd %xmm0, %xmm0 + maxsd %xmm1, %xmm0 + unpcklpd %xmm0, %xmm0 + + movq M, I + sarq $3, I + jle .L65 + ALIGN_4 + +.L62: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd 0 * SIZE(X), %xmm1 + movhpd 1 * SIZE(X), %xmm1 +#ifdef USE_ABS + andpd %xmm15, %xmm1 +#endif + cmpeqpd %xmm0, %xmm1 + + movsd 2 * SIZE(X), %xmm3 + movhpd 3 * SIZE(X), %xmm3 +#ifdef USE_ABS + andpd %xmm15, %xmm3 +#endif + cmpeqpd %xmm0, %xmm3 + + movsd 4 * SIZE(X), %xmm5 + movhpd 5 * SIZE(X), %xmm5 +#ifdef USE_ABS + andpd %xmm15, %xmm5 +#endif + cmpeqpd %xmm0, %xmm5 + + movsd 6 * SIZE(X), %xmm7 + movhpd 7 * SIZE(X), %xmm7 +#ifdef USE_ABS + andpd %xmm15, %xmm7 +#endif + cmpeqpd %xmm0, %xmm7 + + orpd %xmm3, %xmm1 + orpd %xmm7, %xmm5 + orpd %xmm5, %xmm1 +#ifndef C_SUN + movmskpd %xmm1, %r11 +#else + .byte 0x66 + .long 0xd9500f4c +#endif + testq $3, %r11 + jne .L63 + + addq $8 * SIZE, X + addq $8, RET + decq I + jg .L62 + jmp .L65 + ALIGN_4 + +.L63: + movsd 0 * SIZE(X), %xmm1 + movsd 1 * SIZE(X), %xmm2 + movsd 2 * SIZE(X), %xmm3 + movsd 3 * SIZE(X), %xmm4 + movsd 4 * SIZE(X), %xmm5 + movsd 5 * SIZE(X), %xmm6 + movsd 6 * SIZE(X), %xmm7 + movsd 7 * SIZE(X), %xmm8 +#ifdef USE_ABS + andpd %xmm15, %xmm1 + andpd %xmm15, %xmm2 + andpd %xmm15, %xmm3 + andpd %xmm15, %xmm4 + andpd %xmm15, %xmm5 + andpd %xmm15, %xmm6 + andpd %xmm15, %xmm7 + andpd %xmm15, %xmm8 +#endif + + addq $8 * SIZE, X + + incq RET + comisd %xmm0, %xmm1 + je .L999 + incq RET + comisd %xmm0, %xmm2 + je .L999 + incq RET + comisd %xmm0, %xmm3 + je .L999 + incq RET + comisd %xmm0, %xmm4 + je .L999 + incq RET + comisd %xmm0, %xmm5 + je .L999 + incq RET + comisd %xmm0, %xmm6 + je .L999 + incq RET + comisd %xmm0, %xmm7 + je .L999 + incq RET + jmp .L999 + ALIGN_3 + +.L65: + testq $4, M + je .L67 + + movsd 0 * SIZE(X), %xmm1 + movsd 1 * SIZE(X), %xmm2 + movsd 2 * SIZE(X), %xmm3 + movsd 3 * SIZE(X), %xmm4 +#ifdef USE_ABS + andpd %xmm15, %xmm1 + andpd %xmm15, %xmm2 + 
andpd %xmm15, %xmm3 + andpd %xmm15, %xmm4 +#endif + addq $4 * SIZE, X + incq RET + comisd %xmm0, %xmm1 + je .L999 + incq RET + comisd %xmm0, %xmm2 + je .L999 + incq RET + comisd %xmm0, %xmm3 + je .L999 + incq RET + comisd %xmm0, %xmm4 + je .L999 + ALIGN_3 + +.L67: + testq $2, M + je .L68 + + movsd 0 * SIZE(X), %xmm1 + movsd 1 * SIZE(X), %xmm2 +#ifdef USE_ABS + andpd %xmm15, %xmm1 + andpd %xmm15, %xmm2 +#endif + addq $2 * SIZE, X + incq RET + comisd %xmm0, %xmm1 + je .L999 + incq RET + comisd %xmm0, %xmm2 + je .L999 + ALIGN_3 + +.L68: + incq RET + jmp .L999 + ALIGN_4 + +.L80: + movq M, I + sarq $4, I + jle .L85 + ALIGN_4 + +.L81: + movsd 0 * SIZE(X), %xmm4 + addq INCX, X + movhpd 0 * SIZE(X), %xmm4 + addq INCX, X +#ifdef USE_ABS + andpd %xmm15, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movsd 0 * SIZE(X), %xmm5 + addq INCX, X + movhpd 0 * SIZE(X), %xmm5 + addq INCX, X +#ifdef USE_ABS + andpd %xmm15, %xmm5 +#endif + maxpd %xmm5, %xmm1 + + movsd 0 * SIZE(X), %xmm6 + addq INCX, X + movhpd 0 * SIZE(X), %xmm6 + addq INCX, X +#ifdef USE_ABS + andpd %xmm15, %xmm6 +#endif + maxpd %xmm6, %xmm2 + + movsd 0 * SIZE(X), %xmm7 + addq INCX, X + movhpd 0 * SIZE(X), %xmm7 + addq INCX, X +#ifdef USE_ABS + andpd %xmm15, %xmm7 +#endif + maxpd %xmm7, %xmm3 + + movsd 0 * SIZE(X), %xmm4 + addq INCX, X + movhpd 0 * SIZE(X), %xmm4 + addq INCX, X +#ifdef USE_ABS + andpd %xmm15, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movsd 0 * SIZE(X), %xmm5 + addq INCX, X + movhpd 0 * SIZE(X), %xmm5 + addq INCX, X +#ifdef USE_ABS + andpd %xmm15, %xmm5 +#endif + maxpd %xmm5, %xmm1 + + movsd 0 * SIZE(X), %xmm6 + addq INCX, X + movhpd 0 * SIZE(X), %xmm6 + addq INCX, X +#ifdef USE_ABS + andpd %xmm15, %xmm6 +#endif + maxpd %xmm6, %xmm2 + + movsd 0 * SIZE(X), %xmm7 + addq INCX, X + movhpd 0 * SIZE(X), %xmm7 + addq INCX, X +#ifdef USE_ABS + andpd %xmm15, %xmm7 +#endif + maxpd %xmm7, %xmm3 + + decq I + jg .L81 + ALIGN_4 + +.L85: + andq $15, M + jle .L90 + + testq $8, M + je .L86 + + movsd 0 * SIZE(X), %xmm4 + addq INCX, X + movhpd 0 * SIZE(X), %xmm4 + addq INCX, X +#ifdef USE_ABS + andpd %xmm15, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movsd 0 * SIZE(X), %xmm5 + addq INCX, X + movhpd 0 * SIZE(X), %xmm5 + addq INCX, X +#ifdef USE_ABS + andpd %xmm15, %xmm5 +#endif + maxpd %xmm5, %xmm1 + + movsd 0 * SIZE(X), %xmm6 + addq INCX, X + movhpd 0 * SIZE(X), %xmm6 + addq INCX, X +#ifdef USE_ABS + andpd %xmm15, %xmm6 +#endif + maxpd %xmm6, %xmm2 + + movsd 0 * SIZE(X), %xmm7 + addq INCX, X + movhpd 0 * SIZE(X), %xmm7 + addq INCX, X +#ifdef USE_ABS + andpd %xmm15, %xmm7 +#endif + maxpd %xmm7, %xmm3 + ALIGN_3 + +.L86: + testq $4, M + je .L87 + + movsd 0 * SIZE(X), %xmm4 + addq INCX, X + movhpd 0 * SIZE(X), %xmm4 + addq INCX, X +#ifdef USE_ABS + andpd %xmm15, %xmm4 +#endif + maxpd %xmm4, %xmm0 + + movsd 0 * SIZE(X), %xmm5 + addq INCX, X + movhpd 0 * SIZE(X), %xmm5 + addq INCX, X +#ifdef USE_ABS + andpd %xmm15, %xmm5 +#endif + maxpd %xmm5, %xmm1 + ALIGN_3 + +.L87: + testq $2, M + je .L88 + + movsd 0 * SIZE(X), %xmm6 + addq INCX, X + movhpd 0 * SIZE(X), %xmm6 + addq INCX, X +#ifdef USE_ABS + andpd %xmm15, %xmm6 +#endif + maxpd %xmm6, %xmm2 + ALIGN_3 + +.L88: + testq $1, M + je .L90 + + movsd 0 * SIZE(X), %xmm7 + unpcklpd %xmm7, %xmm7 +#ifdef USE_ABS + andpd %xmm15, %xmm7 +#endif + maxpd %xmm7, %xmm3 + + maxpd %xmm1, %xmm0 + maxpd %xmm3, %xmm2 + maxpd %xmm2, %xmm0 + movapd %xmm0, %xmm1 + unpckhpd %xmm0, %xmm0 + maxsd %xmm1, %xmm0 + ALIGN_4 + +.L90: + movq XX, X + movq MM, M + + maxpd %xmm1, %xmm0 + maxpd %xmm3, %xmm2 + maxpd %xmm2, %xmm0 + movapd %xmm0, %xmm1 + 
unpckhpd %xmm0, %xmm0 + maxsd %xmm1, %xmm0 + unpcklpd %xmm0, %xmm0 + + movq M, I + sarq $3, I + jle .L95 + ALIGN_4 + +.L92: + movsd 0 * SIZE(X), %xmm1 + addq INCX, X + movhpd 0 * SIZE(X), %xmm1 + addq INCX, X +#ifdef USE_ABS + andpd %xmm15, %xmm1 +#endif + cmpeqpd %xmm0, %xmm1 + + movsd 0 * SIZE(X), %xmm3 + addq INCX, X + movhpd 0 * SIZE(X), %xmm3 + addq INCX, X +#ifdef USE_ABS + andpd %xmm15, %xmm3 +#endif + cmpeqpd %xmm0, %xmm3 + + movsd 0 * SIZE(X), %xmm5 + addq INCX, X + movhpd 0 * SIZE(X), %xmm5 + addq INCX, X +#ifdef USE_ABS + andpd %xmm15, %xmm5 +#endif + cmpeqpd %xmm0, %xmm5 + + movsd 0 * SIZE(X), %xmm7 + addq INCX, X + movhpd 0 * SIZE(X), %xmm7 + addq INCX, X +#ifdef USE_ABS + andpd %xmm15, %xmm7 +#endif + cmpeqpd %xmm0, %xmm7 + + orpd %xmm3, %xmm1 + orpd %xmm7, %xmm5 + orpd %xmm5, %xmm1 +#ifndef C_SUN + movmskpd %xmm1, %r11 +#else + .byte 0x66 + .long 0xd9500f4c +#endif + testq $3, %r11 + jne .L93 + + addq $8, RET + decq I + jg .L92 + jmp .L95 + ALIGN_4 + +.L93: + subq INCX, X + movsd 0 * SIZE(X), %xmm8 + subq INCX, X + movsd 0 * SIZE(X), %xmm7 + subq INCX, X + movsd 0 * SIZE(X), %xmm6 + subq INCX, X + movsd 0 * SIZE(X), %xmm5 + subq INCX, X + movsd 0 * SIZE(X), %xmm4 + subq INCX, X + movsd 0 * SIZE(X), %xmm3 + subq INCX, X + movsd 0 * SIZE(X), %xmm2 + subq INCX, X + movsd 0 * SIZE(X), %xmm1 +#ifdef USE_ABS + andpd %xmm15, %xmm1 + andpd %xmm15, %xmm2 + andpd %xmm15, %xmm3 + andpd %xmm15, %xmm4 + andpd %xmm15, %xmm5 + andpd %xmm15, %xmm6 + andpd %xmm15, %xmm7 + andpd %xmm15, %xmm8 +#endif + + addq $8 * SIZE, X + + incq RET + comisd %xmm0, %xmm1 + je .L999 + incq RET + comisd %xmm0, %xmm2 + je .L999 + incq RET + comisd %xmm0, %xmm3 + je .L999 + incq RET + comisd %xmm0, %xmm4 + je .L999 + incq RET + comisd %xmm0, %xmm5 + je .L999 + incq RET + comisd %xmm0, %xmm6 + je .L999 + incq RET + comisd %xmm0, %xmm7 + je .L999 + incq RET + jmp .L999 + ALIGN_3 + +.L95: + testq $4, M + je .L97 + + movsd 0 * SIZE(X), %xmm1 + addq INCX, X + movsd 0 * SIZE(X), %xmm2 + addq INCX, X + movsd 0 * SIZE(X), %xmm3 + addq INCX, X + movsd 0 * SIZE(X), %xmm4 + addq INCX, X +#ifdef USE_ABS + andpd %xmm15, %xmm1 + andpd %xmm15, %xmm2 + andpd %xmm15, %xmm3 + andpd %xmm15, %xmm4 +#endif + incq RET + comisd %xmm0, %xmm1 + je .L999 + incq RET + comisd %xmm0, %xmm2 + je .L999 + incq RET + comisd %xmm0, %xmm3 + je .L999 + incq RET + comisd %xmm0, %xmm4 + je .L999 + ALIGN_3 + +.L97: + testq $2, M + je .L98 + + movsd 0 * SIZE(X), %xmm1 + addq INCX, X + movsd 0 * SIZE(X), %xmm2 + addq INCX, X +#ifdef USE_ABS + andpd %xmm15, %xmm1 + andpd %xmm15, %xmm2 +#endif + incq RET + comisd %xmm0, %xmm1 + je .L999 + incq RET + comisd %xmm0, %xmm2 + je .L999 + ALIGN_3 + +.L98: + incq RET + ALIGN_3 + +.L999: + RESTOREREGISTERS + + ret + + EPILOGUE + diff --git a/kernel/x86_64/izamax.S b/kernel/x86_64/izamax.S new file mode 100644 index 0000000000..a77b06df9b --- /dev/null +++ b/kernel/x86_64/izamax.S @@ -0,0 +1,270 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 +#define X ARG2 +#define INCX ARG3 + +#define I ARG4 +#define NUM %r10 +#define RET %rax + +#ifndef USE_MIN +#define FMOV fcmovbe +#define IMOV cmovnbe +#else +#define FMOV fcmovnb +#define IMOV cmovb +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + + salq $ZBASE_SHIFT, INCX + + fldz + xorq RET, RET + + testq M, M + jle .L999 + testq INCX, INCX + jle .L999 + + ffreep %st + movq $2, NUM + movq $1, RET + + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + faddp %st, %st(1) + addq INCX, X + decq M + jle .L999 + + cmpq $2 * SIZE, INCX + jne .L40 + + movq M, I + sarq $2, I + jle .L20 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + faddp %st, %st(1) + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + incq NUM + + FLD 2 * SIZE(X) + fabs + FLD 3 * SIZE(X) + fabs + faddp %st, %st(1) + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + incq NUM + + FLD 4 * SIZE(X) + fabs + FLD 5 * SIZE(X) + fabs + faddp %st, %st(1) + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + incq NUM + + FLD 6 * SIZE(X) + fabs + FLD 7 * SIZE(X) + fabs + faddp %st, %st(1) + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + incq NUM + + addq $8 * SIZE, X + + decq I + jg .L10 + ALIGN_4 + +.L20: + movq M, I + andq $3, I + jle .L999 + ALIGN_4 + +.L21: + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + faddp %st, %st(1) + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + incq NUM + + addq $2 * SIZE, X + decq I + jg .L21 + jmp .L999 + ALIGN_4 + +.L40: + movq M, I + sarq $2, I + jle .L60 + ALIGN_4 + +.L50: + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + addq INCX, X + faddp %st, %st(1) + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + incq NUM + + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + addq INCX, X + faddp %st, %st(1) + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + incq NUM + + FLD 0 * 
SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + addq INCX, X + faddp %st, %st(1) + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + incq NUM + + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + addq INCX, X + faddp %st, %st(1) + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + incq NUM + + decq I + jg .L50 + ALIGN_4 + +.L60: + movq M, I + andq $3, I + jle .L999 + ALIGN_4 + + +.L61: + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + faddp %st, %st(1) + fcomi + FMOV %st(1), %st(0) + IMOV NUM, RET + fxch %st(1) + ffreep %st + incq NUM + + addq INCX, X + decq I + jg .L61 + ALIGN_4 + +.L999: + ffreep %st + ret + + EPILOGUE diff --git a/kernel/x86_64/izamax_sse.S b/kernel/x86_64/izamax_sse.S new file mode 100644 index 0000000000..2dfeb93ea3 --- /dev/null +++ b/kernel/x86_64/izamax_sse.S @@ -0,0 +1,554 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ + +#define RET %rax +#define I ARG4 +#define XX %r10 +#define MM %r11 + +#ifdef USE_MIN +#define maxps minps +#define maxss minss +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + + SAVEREGISTERS + + pxor %xmm0, %xmm0 + xor RET, RET + testq M, M + jle .L999 + testq INCX, INCX + jle .L999 + + salq $ZBASE_SHIFT, INCX + movq M, MM + movq X, XX + + pcmpeqb %xmm15, %xmm15 + psrld $1, %xmm15 + + movss 0 * SIZE(X), %xmm0 + movss 1 * SIZE(X), %xmm1 + addq INCX, X + decq M + andps %xmm15, %xmm0 + andps %xmm15, %xmm1 + addps %xmm1, %xmm0 + shufps $0, %xmm0, %xmm0 + movaps %xmm0, %xmm1 + cmpq $2 * SIZE, INCX + jne .L70 + +.L30: + movq M, I + sarq $3, I + jle .L35 + ALIGN_4 + +.L31: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd 0 * SIZE(X), %xmm4 + movhps 2 * SIZE(X), %xmm4 + movsd 4 * SIZE(X), %xmm5 + movhps 6 * SIZE(X), %xmm5 + + movaps %xmm4, %xmm6 + + shufps $0x88, %xmm5, %xmm4 + shufps $0xdd, %xmm5, %xmm6 + + andps %xmm15, %xmm4 + andps %xmm15, %xmm6 + addps %xmm6, %xmm4 + maxps %xmm4, %xmm0 + + movsd 8 * SIZE(X), %xmm7 + movhps 10 * SIZE(X), %xmm7 + movsd 12 * SIZE(X), %xmm8 + movhps 14 * SIZE(X), %xmm8 + movaps %xmm7, %xmm9 + + shufps $0x88, %xmm8, %xmm7 + shufps $0xdd, %xmm8, %xmm9 + + andps %xmm15, %xmm7 + andps %xmm15, %xmm9 + addps %xmm9, %xmm7 + maxps %xmm7, %xmm0 + + addq $16 * SIZE, X + decq I + jg .L31 + ALIGN_4 + +.L35: + andq $7, M + jle .L40 + + testq $4, M + je .L36 + + movsd 0 * SIZE(X), %xmm4 + movhps 2 * SIZE(X), %xmm4 + movsd 4 * SIZE(X), %xmm5 + movhps 6 * SIZE(X), %xmm5 + movaps %xmm4, %xmm6 + + shufps $0x88, %xmm5, %xmm4 + shufps $0xdd, %xmm5, %xmm6 + + andps %xmm15, %xmm4 + andps %xmm15, %xmm6 + addps %xmm6, %xmm4 + maxps %xmm4, %xmm0 + + addq $8 * SIZE, X + ALIGN_3 + +.L36: + testq $2, M + je .L37 + + movss 0 * SIZE(X), %xmm4 + movss 1 * SIZE(X), %xmm5 + movss 2 * SIZE(X), %xmm6 + movss 3 * SIZE(X), %xmm7 + andps %xmm15, %xmm4 + andps %xmm15, %xmm5 + andps %xmm15, %xmm6 + andps %xmm15, %xmm7 + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + maxss %xmm4, %xmm0 + maxss %xmm6, %xmm1 + addq $4 * SIZE, X + ALIGN_3 + +.L37: + testq $1, M + je .L40 + + movss 0 * SIZE(X), %xmm4 + movss 1 * SIZE(X), %xmm5 + andps %xmm15, %xmm4 + andps %xmm15, %xmm5 + addps %xmm5, %xmm4 + maxss %xmm4, %xmm0 + ALIGN_4 + +.L40: + movq XX, X + movq MM, M + + maxps %xmm1, %xmm0 + movaps %xmm0, %xmm1 + movhlps %xmm0, %xmm0 + maxps %xmm1, %xmm0 + movaps %xmm0, %xmm1 + shufps $1, %xmm0, %xmm0 + maxss %xmm1, %xmm0 + shufps $0, %xmm0, %xmm0 + + movq M, I + sarq $2, I + jle .L45 + ALIGN_4 + +.L41: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd 0 * SIZE(X), %xmm1 + movhps 2 * SIZE(X), %xmm1 + movsd 4 * SIZE(X), %xmm2 + movhps 6 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm3 + + shufps $0x88, %xmm2, %xmm1 + shufps $0xdd, %xmm2, %xmm3 + + andps %xmm15, %xmm1 + andps %xmm15, %xmm3 + addps %xmm3, %xmm1 + + cmpeqps %xmm0, %xmm1 +#ifndef C_SUN + movmskps %xmm1, %r11 +#else + .long 0xd9500f4c +#endif + testq $15, %r11 + jne .L43 + + addq $8 * SIZE, X + addq $4, RET + decq I + jg .L41 + jmp .L45 + ALIGN_4 + +.L43: + movss 0 * SIZE(X), %xmm1 + movss 1 * SIZE(X), %xmm2 + movss 2 * SIZE(X), %xmm3 + movss 3 * SIZE(X), %xmm4 + movss 4 * SIZE(X), %xmm5 + movss 5 * SIZE(X), %xmm6 + movss 6 * SIZE(X), %xmm7 + movss 7 * SIZE(X), %xmm8 + addq $8 * SIZE, X + + andps 
%xmm15, %xmm1 + andps %xmm15, %xmm2 + andps %xmm15, %xmm3 + andps %xmm15, %xmm4 + andps %xmm15, %xmm5 + andps %xmm15, %xmm6 + andps %xmm15, %xmm7 + andps %xmm15, %xmm8 + + addps %xmm2, %xmm1 + addps %xmm4, %xmm3 + addps %xmm6, %xmm5 + addps %xmm8, %xmm7 + + incq RET + comiss %xmm0, %xmm1 + je .L999 + incq RET + comiss %xmm0, %xmm3 + je .L999 + incq RET + comiss %xmm0, %xmm5 + je .L999 + incq RET + comiss %xmm0, %xmm7 + je .L999 + ALIGN_3 + +.L45: + testq $2, M + je .L47 + + movss 0 * SIZE(X), %xmm1 + movss 1 * SIZE(X), %xmm2 + movss 2 * SIZE(X), %xmm3 + movss 3 * SIZE(X), %xmm4 + addq $4 * SIZE, X + + andps %xmm15, %xmm1 + andps %xmm15, %xmm2 + andps %xmm15, %xmm3 + andps %xmm15, %xmm4 + addps %xmm2, %xmm1 + addps %xmm4, %xmm3 + + incq RET + comiss %xmm0, %xmm1 + je .L999 + incq RET + comiss %xmm0, %xmm3 + je .L999 + ALIGN_3 + +.L47: + incq RET + jmp .L999 + ALIGN_3 + +.L70: + movq M, I + sarq $3, I + jle .L75 + ALIGN_4 + +.L71: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd 0 * SIZE(X), %xmm4 + addq INCX, X + movhps 0 * SIZE(X), %xmm4 + addq INCX, X + movsd 0 * SIZE(X), %xmm5 + addq INCX, X + movhps 0 * SIZE(X), %xmm5 + addq INCX, X + + movaps %xmm4, %xmm6 + + shufps $0x88, %xmm5, %xmm4 + shufps $0xdd, %xmm5, %xmm6 + + andps %xmm15, %xmm4 + andps %xmm15, %xmm6 + addps %xmm6, %xmm4 + maxps %xmm4, %xmm0 + + movsd 0 * SIZE(X), %xmm7 + addq INCX, X + movhps 0 * SIZE(X), %xmm7 + addq INCX, X + movsd 0 * SIZE(X), %xmm8 + addq INCX, X + movhps 0 * SIZE(X), %xmm8 + addq INCX, X + movaps %xmm7, %xmm9 + + shufps $0x88, %xmm8, %xmm7 + shufps $0xdd, %xmm8, %xmm9 + + andps %xmm15, %xmm7 + andps %xmm15, %xmm9 + addps %xmm9, %xmm7 + maxps %xmm7, %xmm0 + + decq I + jg .L71 + ALIGN_4 + +.L75: + andq $7, M + jle .L80 + + testq $4, M + je .L76 + + movsd 0 * SIZE(X), %xmm4 + addq INCX, X + movhps 0 * SIZE(X), %xmm4 + addq INCX, X + movsd 0 * SIZE(X), %xmm5 + addq INCX, X + movhps 0 * SIZE(X), %xmm5 + addq INCX, X + movaps %xmm4, %xmm6 + + shufps $0x88, %xmm5, %xmm4 + shufps $0xdd, %xmm5, %xmm6 + + andps %xmm15, %xmm4 + andps %xmm15, %xmm6 + addps %xmm6, %xmm4 + maxps %xmm4, %xmm0 + ALIGN_3 + +.L76: + testq $2, M + je .L77 + + movss 0 * SIZE(X), %xmm4 + movss 1 * SIZE(X), %xmm5 + addq INCX, X + movss 0 * SIZE(X), %xmm6 + movss 1 * SIZE(X), %xmm7 + addq INCX, X + andps %xmm15, %xmm4 + andps %xmm15, %xmm5 + andps %xmm15, %xmm6 + andps %xmm15, %xmm7 + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + maxss %xmm4, %xmm0 + maxss %xmm6, %xmm1 + ALIGN_3 + +.L77: + testq $1, M + je .L80 + + movss 0 * SIZE(X), %xmm4 + movss 1 * SIZE(X), %xmm5 + andps %xmm15, %xmm4 + andps %xmm15, %xmm5 + addps %xmm5, %xmm4 + maxss %xmm4, %xmm0 + ALIGN_4 + +.L80: + movq XX, X + movq MM, M + + maxps %xmm1, %xmm0 + movaps %xmm0, %xmm1 + movhlps %xmm0, %xmm0 + maxps %xmm1, %xmm0 + movaps %xmm0, %xmm1 + shufps $1, %xmm0, %xmm0 + maxss %xmm1, %xmm0 + shufps $0, %xmm0, %xmm0 + + movq M, I + sarq $2, I + jle .L85 + ALIGN_4 + +.L81: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd 0 * SIZE(X), %xmm1 + addq INCX, X + movhps 0 * SIZE(X), %xmm1 + addq INCX, X + movsd 0 * SIZE(X), %xmm2 + addq INCX, X + movhps 0 * SIZE(X), %xmm2 + addq INCX, X + + movaps %xmm1, %xmm3 + + shufps $0x88, %xmm2, %xmm1 + shufps $0xdd, %xmm2, %xmm3 + + andps %xmm15, %xmm1 + andps %xmm15, %xmm3 + addps %xmm3, %xmm1 + + cmpeqps %xmm0, %xmm1 +#ifndef C_SUN + movmskps %xmm1, %r11 +#else + .long 0xd9500f4c +#endif + testq $15, %r11 + jne .L83 + + addq $4, RET + decq I + jg .L81 + jmp .L85 + ALIGN_4 + +.L83: + subq INCX, 
X + movss 0 * SIZE(X), %xmm7 + movss 1 * SIZE(X), %xmm8 + subq INCX, X + movss 0 * SIZE(X), %xmm5 + movss 1 * SIZE(X), %xmm6 + subq INCX, X + movss 0 * SIZE(X), %xmm3 + movss 1 * SIZE(X), %xmm4 + subq INCX, X + movss 0 * SIZE(X), %xmm1 + movss 1 * SIZE(X), %xmm2 + + andps %xmm15, %xmm1 + andps %xmm15, %xmm2 + andps %xmm15, %xmm3 + andps %xmm15, %xmm4 + andps %xmm15, %xmm5 + andps %xmm15, %xmm6 + andps %xmm15, %xmm7 + andps %xmm15, %xmm8 + + addps %xmm2, %xmm1 + addps %xmm4, %xmm3 + addps %xmm6, %xmm5 + addps %xmm8, %xmm7 + + incq RET + comiss %xmm0, %xmm1 + je .L999 + incq RET + comiss %xmm0, %xmm3 + je .L999 + incq RET + comiss %xmm0, %xmm5 + je .L999 + incq RET + comiss %xmm0, %xmm7 + je .L999 + ALIGN_3 + +.L85: + testq $2, M + je .L87 + + movss 0 * SIZE(X), %xmm1 + movss 1 * SIZE(X), %xmm2 + addq INCX, X + movss 0 * SIZE(X), %xmm3 + movss 1 * SIZE(X), %xmm4 + addq INCX, X + + andps %xmm15, %xmm1 + andps %xmm15, %xmm2 + andps %xmm15, %xmm3 + andps %xmm15, %xmm4 + addps %xmm2, %xmm1 + addps %xmm4, %xmm3 + + incq RET + comiss %xmm0, %xmm1 + je .L999 + incq RET + comiss %xmm0, %xmm3 + je .L999 + ALIGN_3 + +.L87: + incq RET + ALIGN_4 + +.L999: + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/izamax_sse2.S b/kernel/x86_64/izamax_sse2.S new file mode 100644 index 0000000000..4e66e5338b --- /dev/null +++ b/kernel/x86_64/izamax_sse2.S @@ -0,0 +1,597 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
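The complex index kernels in this patch (izamax.S and its SSE/SSE2 variants) rank elements by |Re| + |Im| rather than by the true modulus, which is the convention of the reference BLAS i?amax for complex data; the fabs/fadd and andpd/addpd sequences above implement exactly that sum. A scalar C sketch of the double-complex case (hypothetical helper, interleaved (re, im) pairs, positive increment assumed):

    #include <math.h>

    /* 1-based index of the element maximizing |Re| + |Im|; 0 on empty input. */
    static long ref_izamax(long n, const double *x, long incx) {
        if (n <= 0 || incx <= 0) return 0;
        long best = 1;
        double maxval = fabs(x[0]) + fabs(x[1]);
        for (long i = 1; i < n; i++) {
            const double *p = x + 2 * i * incx;  /* two doubles per complex element */
            double v = fabs(p[0]) + fabs(p[1]);
            if (v > maxval) { maxval = v; best = i + 1; }
        }
        return best;
    }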
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ + +#define RET %rax +#define I ARG4 +#define XX %r10 +#define MM %r11 + +#ifdef USE_MIN +#define maxpd minpd +#define maxsd minsd +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + + SAVEREGISTERS + + pxor %xmm0, %xmm0 + xor RET, RET + testq M, M + jle .L999 + testq INCX, INCX + jle .L999 + + salq $ZBASE_SHIFT, INCX + movq M, MM + movq X, XX + + pcmpeqb %xmm15, %xmm15 + psrlq $1, %xmm15 + + movsd 0 * SIZE(X), %xmm0 + movsd 1 * SIZE(X), %xmm1 + addq INCX, X + decq M + andpd %xmm15, %xmm0 + andpd %xmm15, %xmm1 + addpd %xmm1, %xmm0 + unpcklpd %xmm0, %xmm0 + movapd %xmm0, %xmm1 + movapd %xmm0, %xmm2 + movapd %xmm0, %xmm3 + cmpq $2 * SIZE, INCX + jne .L60 + + movq M, I + sarq $3, I + jle .L25 + ALIGN_4 + +.L21: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + movhpd 2 * SIZE(X), %xmm4 + movhpd 3 * SIZE(X), %xmm5 + movsd 4 * SIZE(X), %xmm6 + movsd 5 * SIZE(X), %xmm7 + movhpd 6 * SIZE(X), %xmm6 + movhpd 7 * SIZE(X), %xmm7 + + andpd %xmm15, %xmm4 + andpd %xmm15, %xmm5 + andpd %xmm15, %xmm6 + andpd %xmm15, %xmm7 + + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + maxpd %xmm4, %xmm0 + maxpd %xmm6, %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movsd 8 * SIZE(X), %xmm4 + movsd 9 * SIZE(X), %xmm5 + movhpd 10 * SIZE(X), %xmm4 + movhpd 11 * SIZE(X), %xmm5 + movsd 12 * SIZE(X), %xmm6 + movsd 13 * SIZE(X), %xmm7 + movhpd 14 * SIZE(X), %xmm6 + movhpd 15 * SIZE(X), %xmm7 + + andpd %xmm15, %xmm4 + andpd %xmm15, %xmm5 + andpd %xmm15, %xmm6 + andpd %xmm15, %xmm7 + + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + maxpd %xmm4, %xmm2 + maxpd %xmm6, %xmm3 + + addq $16 * SIZE, X + decq I + jg .L21 + ALIGN_4 + +.L25: + andq $7, M + jle .L30 + + testq $4, M + je .L26 + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + movhpd 2 * SIZE(X), %xmm4 + movhpd 3 * SIZE(X), %xmm5 + movsd 4 * SIZE(X), %xmm6 + movsd 5 * SIZE(X), %xmm7 + movhpd 6 * SIZE(X), %xmm6 + movhpd 7 * SIZE(X), %xmm7 + + andpd %xmm15, %xmm4 + andpd %xmm15, %xmm5 + andpd %xmm15, %xmm6 + andpd %xmm15, %xmm7 + + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + maxpd %xmm4, %xmm0 + maxpd %xmm6, %xmm1 + + addq $8 * SIZE, X + ALIGN_3 + +.L26: + testq $2, M + je .L27 + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + movhpd 2 * SIZE(X), %xmm4 + movhpd 3 * SIZE(X), %xmm5 + addq $4 * SIZE, X + + andpd %xmm15, %xmm4 + andpd %xmm15, %xmm5 + addpd %xmm5, %xmm4 + maxpd %xmm4, %xmm0 + ALIGN_3 + +.L27: + testq $1, M + je .L30 + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + andpd %xmm15, %xmm4 + andpd %xmm15, %xmm5 + addpd %xmm5, %xmm4 + maxsd %xmm4, %xmm2 + ALIGN_4 + +.L30: + movq XX, X + movq MM, M + + maxpd %xmm1, %xmm0 + maxpd %xmm3, %xmm2 + maxpd %xmm2, %xmm0 + movapd %xmm0, %xmm1 + unpckhpd %xmm0, %xmm0 + maxsd %xmm1, %xmm0 + unpcklpd %xmm0, %xmm0 + + movq M, I + sarq $2, I + jle .L35 + ALIGN_4 + +.L31: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd 0 * SIZE(X), %xmm1 + movsd 1 * SIZE(X), %xmm2 + movhpd 2 * SIZE(X), %xmm1 + movhpd 3 * SIZE(X), %xmm2 + movsd 4 * SIZE(X), %xmm3 + movsd 5 * SIZE(X), %xmm4 + movhpd 6 * SIZE(X), %xmm3 + movhpd 7 * SIZE(X), %xmm4 + + andpd %xmm15, %xmm1 + andpd %xmm15, %xmm2 + andpd %xmm15, %xmm3 + andpd %xmm15, %xmm4 + + addpd %xmm2, %xmm1 + addpd %xmm4, %xmm3 + + cmpeqpd %xmm0, 
%xmm1 + cmpeqpd %xmm0, %xmm3 + + orpd %xmm3, %xmm1 +#ifndef C_SUN + movmskpd %xmm1, %r11 +#else + .byte 0x66 + .long 0xd9500f4c +#endif + testq $3, %r11 + jne .L33 + + addq $8 * SIZE, X + addq $4, RET + decq I + jg .L31 + jmp .L35 + ALIGN_4 + +.L33: + movsd 0 * SIZE(X), %xmm1 + movsd 1 * SIZE(X), %xmm2 + movsd 2 * SIZE(X), %xmm3 + movsd 3 * SIZE(X), %xmm4 + movsd 4 * SIZE(X), %xmm5 + movsd 5 * SIZE(X), %xmm6 + movsd 6 * SIZE(X), %xmm7 + movsd 7 * SIZE(X), %xmm8 + addq $8 * SIZE, X + + andpd %xmm15, %xmm1 + andpd %xmm15, %xmm2 + andpd %xmm15, %xmm3 + andpd %xmm15, %xmm4 + andpd %xmm15, %xmm5 + andpd %xmm15, %xmm6 + andpd %xmm15, %xmm7 + andpd %xmm15, %xmm8 + + addpd %xmm2, %xmm1 + addpd %xmm4, %xmm3 + addpd %xmm6, %xmm5 + addpd %xmm8, %xmm7 + + incq RET + comisd %xmm0, %xmm1 + je .L999 + incq RET + comisd %xmm0, %xmm3 + je .L999 + incq RET + comisd %xmm0, %xmm5 + je .L999 + incq RET + comisd %xmm0, %xmm7 + je .L999 + ALIGN_3 + +.L35: + testq $2, M + je .L36 + + movsd 0 * SIZE(X), %xmm1 + movsd 1 * SIZE(X), %xmm2 + movsd 2 * SIZE(X), %xmm3 + movsd 3 * SIZE(X), %xmm4 + addq $4 * SIZE, X + + andpd %xmm15, %xmm1 + andpd %xmm15, %xmm2 + andpd %xmm15, %xmm3 + andpd %xmm15, %xmm4 + + addpd %xmm2, %xmm1 + addpd %xmm4, %xmm3 + incq RET + comisd %xmm0, %xmm1 + je .L999 + incq RET + comisd %xmm0, %xmm3 + je .L999 + ALIGN_3 + +.L36: + incq RET + jmp .L999 + ALIGN_3 + +.L60: + movq M, I + sarq $3, I + jle .L65 + ALIGN_4 + +.L61: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + addq INCX, X + movhpd 0 * SIZE(X), %xmm4 + movhpd 1 * SIZE(X), %xmm5 + addq INCX, X + movsd 0 * SIZE(X), %xmm6 + movsd 1 * SIZE(X), %xmm7 + addq INCX, X + movhpd 0 * SIZE(X), %xmm6 + movhpd 1 * SIZE(X), %xmm7 + addq INCX, X + + andpd %xmm15, %xmm4 + andpd %xmm15, %xmm5 + andpd %xmm15, %xmm6 + andpd %xmm15, %xmm7 + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + maxpd %xmm4, %xmm0 + maxpd %xmm6, %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + addq INCX, X + movhpd 0 * SIZE(X), %xmm4 + movhpd 1 * SIZE(X), %xmm5 + addq INCX, X + movsd 0 * SIZE(X), %xmm6 + movsd 1 * SIZE(X), %xmm7 + addq INCX, X + movhpd 0 * SIZE(X), %xmm6 + movhpd 1 * SIZE(X), %xmm7 + addq INCX, X + + andpd %xmm15, %xmm4 + andpd %xmm15, %xmm5 + andpd %xmm15, %xmm6 + andpd %xmm15, %xmm7 + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + maxpd %xmm4, %xmm2 + maxpd %xmm6, %xmm3 + + decq I + jg .L61 + ALIGN_4 + +.L65: + andq $7, M + jle .L70 + + testq $4, M + je .L66 + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + addq INCX, X + movhpd 0 * SIZE(X), %xmm4 + movhpd 1 * SIZE(X), %xmm5 + addq INCX, X + movsd 0 * SIZE(X), %xmm6 + movsd 1 * SIZE(X), %xmm7 + addq INCX, X + movhpd 0 * SIZE(X), %xmm6 + movhpd 1 * SIZE(X), %xmm7 + addq INCX, X + + andpd %xmm15, %xmm4 + andpd %xmm15, %xmm5 + andpd %xmm15, %xmm6 + andpd %xmm15, %xmm7 + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + maxpd %xmm4, %xmm0 + maxpd %xmm6, %xmm1 + ALIGN_3 + +.L66: + testq $2, M + je .L67 + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + addq INCX, X + movhpd 0 * SIZE(X), %xmm4 + movhpd 1 * SIZE(X), %xmm5 + addq INCX, X + andpd %xmm15, %xmm4 + andpd %xmm15, %xmm5 + addpd %xmm5, %xmm4 + maxpd %xmm4, %xmm2 + ALIGN_3 + +.L67: + testq $1, M + je .L70 + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + andpd %xmm15, %xmm4 + andpd %xmm15, %xmm5 + addpd %xmm5, %xmm4 + maxsd %xmm4, %xmm3 + ALIGN_3 + +.L70: + movq XX, X + movq MM, M + + maxpd 
%xmm1, %xmm0 + maxpd %xmm3, %xmm2 + maxpd %xmm2, %xmm0 + movapd %xmm0, %xmm1 + unpckhpd %xmm0, %xmm0 + maxsd %xmm1, %xmm0 + unpcklpd %xmm0, %xmm0 + + movq M, I + sarq $2, I + jle .L75 + ALIGN_4 + +.L71: +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + prefetch PREFETCHSIZE * SIZE(X) +#endif + +#ifdef PENTIUM4 + prefetchnta PREFETCHSIZE * SIZE(X) +#endif + + movsd 0 * SIZE(X), %xmm1 + movsd 1 * SIZE(X), %xmm2 + addq INCX, X + movhpd 0 * SIZE(X), %xmm1 + movhpd 1 * SIZE(X), %xmm2 + addq INCX, X + movsd 0 * SIZE(X), %xmm3 + movsd 1 * SIZE(X), %xmm4 + addq INCX, X + movhpd 0 * SIZE(X), %xmm3 + movhpd 1 * SIZE(X), %xmm4 + addq INCX, X + + andpd %xmm15, %xmm1 + andpd %xmm15, %xmm2 + andpd %xmm15, %xmm3 + andpd %xmm15, %xmm4 + + addpd %xmm2, %xmm1 + addpd %xmm4, %xmm3 + + cmpeqpd %xmm0, %xmm1 + cmpeqpd %xmm0, %xmm3 + + orpd %xmm3, %xmm1 +#ifndef C_SUN + movmskpd %xmm1, %r11 +#else + .byte 0x66 + .long 0xd9500f4c +#endif + testq $3, %r11 + jne .L73 + + addq $4, RET + decq I + jg .L71 + jmp .L75 + ALIGN_4 + +.L73: + subq INCX, X + movsd 0 * SIZE(X), %xmm7 + movsd 1 * SIZE(X), %xmm8 + subq INCX, X + movsd 0 * SIZE(X), %xmm5 + movsd 1 * SIZE(X), %xmm6 + subq INCX, X + movsd 0 * SIZE(X), %xmm3 + movsd 1 * SIZE(X), %xmm4 + subq INCX, X + movsd 0 * SIZE(X), %xmm1 + movsd 1 * SIZE(X), %xmm2 + + andpd %xmm15, %xmm1 + andpd %xmm15, %xmm2 + andpd %xmm15, %xmm3 + andpd %xmm15, %xmm4 + andpd %xmm15, %xmm5 + andpd %xmm15, %xmm6 + andpd %xmm15, %xmm7 + andpd %xmm15, %xmm8 + + addpd %xmm2, %xmm1 + addpd %xmm4, %xmm3 + addpd %xmm6, %xmm5 + addpd %xmm8, %xmm7 + + incq RET + comisd %xmm0, %xmm1 + je .L999 + incq RET + comisd %xmm0, %xmm3 + je .L999 + incq RET + comisd %xmm0, %xmm5 + je .L999 + incq RET + comisd %xmm0, %xmm7 + je .L999 + ALIGN_3 + +.L75: + testq $2, M + je .L76 + + movsd 0 * SIZE(X), %xmm1 + movsd 1 * SIZE(X), %xmm2 + addq INCX, X + movsd 0 * SIZE(X), %xmm3 + movsd 1 * SIZE(X), %xmm4 + addq INCX, X + + andpd %xmm15, %xmm1 + andpd %xmm15, %xmm2 + andpd %xmm15, %xmm3 + andpd %xmm15, %xmm4 + + addpd %xmm2, %xmm1 + addpd %xmm4, %xmm3 + incq RET + comisd %xmm0, %xmm1 + je .L999 + incq RET + comisd %xmm0, %xmm3 + je .L999 + ALIGN_3 + +.L76: + incq RET + ALIGN_4 + +.L999: + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/lsame.S b/kernel/x86_64/lsame.S new file mode 100644 index 0000000000..8b1ca10db5 --- /dev/null +++ b/kernel/x86_64/lsame.S @@ -0,0 +1,72 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define X ARG1 /* rdi */ +#define Y ARG2 /* rsi */ +#define XX ARG3 +#define YY ARG4 + + PROLOGUE + PROFCODE + + movzbq (X), X + movzbq (Y), Y + + andq $255, X + andq $255, Y + + leaq -32(X), XX + leaq -32(Y), YY + + cmpq $97, X + cmovge XX, X + + cmpq $97,Y + cmovge YY, Y + + movq $0, %rax + movq $1, %r8 + + cmpq X, Y + cmoveq %r8, %rax + ret + + EPILOGUE diff --git a/kernel/x86_64/mcount.S b/kernel/x86_64/mcount.S new file mode 100644 index 0000000000..2770e3295b --- /dev/null +++ b/kernel/x86_64/mcount.S @@ -0,0 +1,46 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
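The lsame.S routine above compares only the first character of its two Fortran CHARACTER arguments, folding lowercase ASCII to uppercase by subtracting 32 whenever the code is at least 97 ('a'), and returns 1 on a match and 0 otherwise. Equivalent C semantics (an illustrative helper, not the exported symbol):

    #include <ctype.h>

    /* Case-insensitive comparison of the first character of two strings. */
    static int ref_lsame(const char *ca, const char *cb) {
        return toupper((unsigned char)*ca) == toupper((unsigned char)*cb);
    }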
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + + PROLOGUE + + jmp _mcount + + EPILOGUE diff --git a/kernel/x86_64/nrm2.S b/kernel/x86_64/nrm2.S new file mode 100644 index 0000000000..d375e8e60d --- /dev/null +++ b/kernel/x86_64/nrm2.S @@ -0,0 +1,206 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ + +#define I %rax + +#include "l1param.h" + + PROLOGUE + PROFCODE + + fldz + testq M, M + jle .L999 + testq INCX, INCX + jle .L999 + + salq $BASE_SHIFT, INCX + + fldz + fldz + fldz + cmpq $SIZE, INCX + jne .L40 + + movq M, I + sarq $3, I + jle .L20 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) + fmul %st(0), %st + FLD 1 * SIZE(X) + fmul %st(0), %st + FLD 2 * SIZE(X) + fmul %st(0), %st + FLD 3 * SIZE(X) + fmul %st(0), %st + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD 4 * SIZE(X) + fmul %st(0), %st + FLD 5 * SIZE(X) + fmul %st(0), %st + FLD 6 * SIZE(X) + fmul %st(0), %st + FLD 7 * SIZE(X) + fmul %st(0), %st + + addq $8 * SIZE, X + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decq I + jg .L10 + ALIGN_4 + +.L20: + andq $7, M + jle .L998 + ALIGN_4 + + +.L21: + FLD (X) + fmul %st(0), %st + faddp %st,%st(1) + addq $1 * SIZE, X + decq M + jg .L21 + jmp .L998 + ALIGN_4 + +.L40: + movq M, I + sarq $3, I + jle .L60 + ALIGN_4 + +.L50: + FLD (X) + addq INCX, X + fmul %st(0), %st + FLD (X) + addq INCX, X + fmul %st(0), %st + FLD (X) + addq INCX, X + fmul %st(0), %st + FLD (X) + addq INCX, X + fmul %st(0), %st + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD (X) + addq INCX, X + fmul %st(0), %st + FLD (X) + addq INCX, X + fmul %st(0), %st + FLD (X) + addq INCX, X + fmul %st(0), %st + FLD (X) + addq INCX, X + fmul %st(0), %st + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decq I + jg .L50 + ALIGN_4 + +.L60: + andq $7, M + jle .L998 + ALIGN_4 + + +.L61: + FLD (X) + addq INCX, X + fmul %st(0), %st + faddp %st,%st(1) + decq M + jg .L61 + ALIGN_4 + +.L998: + faddp %st,%st(2) + faddp %st,%st(1) + faddp %st,%st(1) + ALIGN_4 + +.L999: + fsqrt +#ifndef XDOUBLE + sub $2 * SIZE, %rsp + FST (%rsp) + MOVSD (%rsp), %xmm0 + add $2 * SIZE, %rsp +#endif + ret + + EPILOGUE + diff --git a/kernel/x86_64/nrm2_sse.S b/kernel/x86_64/nrm2_sse.S new file mode 100644 index 0000000000..37762abcbe --- /dev/null +++ b/kernel/x86_64/nrm2_sse.S @@ -0,0 +1,316 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
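The x87 nrm2 kernel that follows computes the plain sum of squares in four independent partial sums on the floating-point stack, folds them at the end, and takes a single fsqrt; there is no scaling pass of the kind used by the netlib reference nrm2, presumably because the extended range of the x87 registers makes intermediate overflow unlikely for realistic inputs. A scalar sketch of the same scheme (hypothetical helper, element-stride incx, multiple accumulators mirroring the unrolled loop):

    #include <math.h>

    static double ref_nrm2(long n, const double *x, long incx) {
        double s0 = 0.0, s1 = 0.0, s2 = 0.0, s3 = 0.0;
        long i = 0;
        if (n <= 0 || incx <= 0) return 0.0;
        for (; i + 3 < n; i += 4) {            /* four partial sums, as in the kernel */
            s0 += x[(i + 0) * incx] * x[(i + 0) * incx];
            s1 += x[(i + 1) * incx] * x[(i + 1) * incx];
            s2 += x[(i + 2) * incx] * x[(i + 2) * incx];
            s3 += x[(i + 3) * incx] * x[(i + 3) * incx];
        }
        for (; i < n; i++)
            s0 += x[i * incx] * x[i * incx];
        return sqrt(s0 + s1 + s2 + s3);
    }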
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ + +#define I %rax + +#include "l1param.h" + + PROLOGUE + PROFCODE + + SAVEREGISTERS + + pxor %xmm0, %xmm0 + testq M, M + jle .L999 + pxor %xmm1, %xmm1 + testq INCX, INCX + jle .L999 + + pxor %xmm2, %xmm2 + leaq (, INCX, SIZE), INCX + pxor %xmm3, %xmm3 + cmpq $SIZE, INCX + jne .L40 + + testq $SIZE, X + je .L05 + + movss 0 * SIZE(X), %xmm4 + cvtss2sd %xmm4, %xmm6 + mulsd %xmm6, %xmm6 + addsd %xmm6, %xmm3 + addq INCX, X + decq M + jle .L998 + ALIGN_3 + +.L05: + movq M, I + sarq $3, I + jle .L14 + + movsd 0 * SIZE(X), %xmm4 + movsd 2 * SIZE(X), %xmm5 + movsd 4 * SIZE(X), %xmm6 + movsd 6 * SIZE(X), %xmm7 + addq $8 * SIZE, X + decq I + jle .L12 + ALIGN_3 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + cvtps2pd %xmm4, %xmm8 + cvtps2pd %xmm5, %xmm9 + cvtps2pd %xmm6, %xmm10 + cvtps2pd %xmm7, %xmm11 + + movsd 0 * SIZE(X), %xmm4 + movsd 2 * SIZE(X), %xmm5 + movsd 4 * SIZE(X), %xmm6 + movsd 6 * SIZE(X), %xmm7 + + mulpd %xmm8, %xmm8 + mulpd %xmm9, %xmm9 + mulpd %xmm10, %xmm10 + mulpd %xmm11, %xmm11 + + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm1 + addpd %xmm10, %xmm2 + addpd %xmm11, %xmm3 + + addq $8 * SIZE, X + decq I + jg .L10 + ALIGN_3 + +.L12: + cvtps2pd %xmm4, %xmm8 + cvtps2pd %xmm5, %xmm9 + cvtps2pd %xmm6, %xmm10 + cvtps2pd %xmm7, %xmm11 + + mulpd %xmm8, %xmm8 + mulpd %xmm9, %xmm9 + mulpd %xmm10, %xmm10 + mulpd %xmm11, %xmm11 + + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm1 + addpd %xmm10, %xmm2 + addpd %xmm11, %xmm3 + ALIGN_3 + + +.L14: + testq $4, M + je .L15 + + movsd 0 * SIZE(X), %xmm4 + movsd 2 * SIZE(X), %xmm5 + cvtps2pd %xmm4, %xmm6 + cvtps2pd %xmm5, %xmm7 + mulpd %xmm6, %xmm6 + mulpd %xmm7, %xmm7 + addpd %xmm6, %xmm0 + addpd %xmm7, %xmm1 + addq $4 * SIZE, X + ALIGN_3 + +.L15: + testq $2, M + je .L16 + + movsd 0 * SIZE(X), %xmm4 + cvtps2pd %xmm4, %xmm6 + mulpd %xmm6, %xmm6 + addpd %xmm6, %xmm2 + addq $2 * SIZE, X + ALIGN_3 + +.L16: + testq $1, M + je .L998 + + movss 0 * SIZE(X), %xmm4 + cvtss2sd %xmm4, %xmm6 + mulsd %xmm6, %xmm6 + addsd %xmm6, %xmm3 + jmp .L998 + ALIGN_4 + +.L40: + movq M, I + sarq $3, I + jle .L44 + ALIGN_4 + +.L41: + movss (X), %xmm4 + addq INCX, X + movss (X), %xmm5 + addq INCX, X + movss (X), %xmm6 + addq INCX, X + movss (X), %xmm7 + addq INCX, X + movss (X), %xmm8 + addq INCX, X + movss (X), %xmm9 + addq INCX, X + movss (X), %xmm10 + addq INCX, X + movss (X), %xmm11 + addq INCX, X + + cvtss2sd %xmm4, %xmm4 + cvtss2sd %xmm5, %xmm5 + cvtss2sd %xmm6, %xmm6 + cvtss2sd %xmm7, %xmm7 + cvtss2sd %xmm8, %xmm8 + 
cvtss2sd %xmm9, %xmm9 + cvtss2sd %xmm10, %xmm10 + cvtss2sd %xmm11, %xmm11 + + mulsd %xmm4, %xmm4 + mulsd %xmm5, %xmm5 + mulsd %xmm6, %xmm6 + mulsd %xmm7, %xmm7 + + addsd %xmm4, %xmm0 + addsd %xmm5, %xmm1 + addsd %xmm6, %xmm2 + addsd %xmm7, %xmm3 + + mulsd %xmm8, %xmm8 + mulsd %xmm9, %xmm9 + mulsd %xmm10, %xmm10 + mulsd %xmm11, %xmm11 + + addsd %xmm8, %xmm0 + addsd %xmm9, %xmm1 + addsd %xmm10, %xmm2 + addsd %xmm11, %xmm3 + + decq I + jg .L41 + ALIGN_3 + +.L44: + testq $4, M + je .L45 + + movss (X), %xmm4 + addq INCX, X + movss (X), %xmm5 + addq INCX, X + movss (X), %xmm6 + addq INCX, X + movss (X), %xmm7 + addq INCX, X + + cvtss2sd %xmm4, %xmm8 + cvtss2sd %xmm5, %xmm9 + cvtss2sd %xmm6, %xmm10 + cvtss2sd %xmm7, %xmm11 + + mulsd %xmm8, %xmm8 + mulsd %xmm9, %xmm9 + mulsd %xmm10, %xmm10 + mulsd %xmm11, %xmm11 + + addsd %xmm8, %xmm0 + addsd %xmm9, %xmm1 + addsd %xmm10, %xmm2 + addsd %xmm11, %xmm3 + ALIGN_3 + +.L45: + testq $2, M + je .L46 + + movss (X), %xmm4 + addq INCX, X + movss (X), %xmm5 + addq INCX, X + + cvtss2sd %xmm4, %xmm6 + cvtss2sd %xmm5, %xmm7 + mulsd %xmm6, %xmm6 + mulsd %xmm7, %xmm7 + addsd %xmm6, %xmm1 + addsd %xmm7, %xmm2 + ALIGN_3 + +.L46: + testq $1, M + je .L998 + + movss (X), %xmm4 + cvtss2sd %xmm4, %xmm6 + mulsd %xmm6, %xmm6 + addsd %xmm6, %xmm3 + ALIGN_4 + +.L998: + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + +#ifndef HAVE_SSE3 + movapd %xmm0, %xmm1 + unpckhpd %xmm0, %xmm0 + addsd %xmm1, %xmm0 +#else + haddpd %xmm0, %xmm0 +#endif + ALIGN_4 + +.L999: + sqrtsd %xmm0, %xmm0 + + cvtsd2ss %xmm0, %xmm0 + + RESTOREREGISTERS + + ret + + EPILOGUE + diff --git a/kernel/x86_64/qconjg.S b/kernel/x86_64/qconjg.S new file mode 100644 index 0000000000..49ca766491 --- /dev/null +++ b/kernel/x86_64/qconjg.S @@ -0,0 +1,54 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
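The single-precision SSE path above widens each element to double before squaring (cvtss2sd / cvtps2pd), accumulates the squares in double precision, and converts back to float only after the final square root, which keeps the accumulation error at double-precision levels. A scalar sketch of that scheme (illustrative helper name):

    #include <math.h>

    static float ref_snrm2(long n, const float *x, long incx) {
        double sum = 0.0;
        if (n <= 0 || incx <= 0) return 0.0f;
        for (long i = 0; i < n; i++) {
            double v = (double)x[i * incx];   /* widen before squaring */
            sum += v * v;
        }
        return (float)sqrt(sum);
    }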
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + + PROLOGUE + PROFCODE + + fldz + FLD 1 * SIZE(ARG1) + fsubrp %st, %st(1) + FLD 0 * SIZE(ARG1) + + FST 0 * SIZE(ARG2) + FST 1 * SIZE(ARG2) + ret + + EPILOGUE diff --git a/kernel/x86_64/qdot.S b/kernel/x86_64/qdot.S new file mode 100644 index 0000000000..c958fc57d9 --- /dev/null +++ b/kernel/x86_64/qdot.S @@ -0,0 +1,208 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define STACK_N 4 + STACK + ARGS(%esp) +#define STACK_X 8 + STACK + ARGS(%esp) +#define STACK_INCX 12 + STACK + ARGS(%esp) +#define STACK_Y 16 + STACK + ARGS(%esp) +#define STACK_INCY 20 + STACK + ARGS(%esp) + +#define N %ebx +#define X %esi +#define INCX %ecx +#define Y %edi +#define INCY %edx + +#include "l1param.h" + + PROLOGUE + + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + + movl STACK_N, N + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + + sall $BASE_SHIFT, INCX + sall $BASE_SHIFT, INCY + + fldz + fldz + fldz + fldz + + cmpl $SIZE, INCX + jne .L14 + cmpl $SIZE, INCY + jne .L14 + + movl N, %eax + sarl $2, %eax + jle .L15 + ALIGN_3 + +.L16: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) + FLD 0 * SIZE(Y) + fmulp %st, %st(1) + faddp %st,%st(1) + FLD 1 * SIZE(X) + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st,%st(2) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + FLD 2 * SIZE(X) + FLD 2 * SIZE(Y) + fmulp %st, %st(1) + faddp %st,%st(3) + FLD 3 * SIZE(X) + FLD 3 * SIZE(Y) + fmulp %st, %st(1) + faddp %st,%st(4) + addl $4 * SIZE, X + addl $4 * SIZE, Y + decl %eax + jg .L16 + ALIGN_3 + +.L15: + movl N, %eax + andl $3, %eax + jle .L27 + ALIGN_3 + +.L22: + FLD (X) + addl $SIZE, X + FLD (Y) + fmulp %st, %st(1) + addl $SIZE, Y + faddp %st,%st(1) + decl %eax + jg .L22 + + jmp .L27 + ALIGN_3 + +.L14: + movl N, %eax + sarl $2, %eax + jle .L30 + ALIGN_3 + +.L31: + FLD (X) + addl INCX, X + FLD (Y) + fmulp %st, %st(1) + addl INCY, Y + faddp %st,%st(1) + + FLD (X) + addl INCX, X + FLD (Y) + fmulp %st, %st(1) + addl INCY, Y + faddp %st,%st(2) + + FLD (X) + addl INCX, X + FLD (Y) + fmulp %st, %st(1) + addl INCY, Y + faddp %st,%st(3) + + FLD (X) + addl INCX, X + FLD (Y) + fmulp %st, %st(1) + addl INCY, Y + faddp %st,%st(4) + + decl %eax + jg .L31 + ALIGN_3 + +.L30: + movl N, %eax + andl $3, %eax + jle .L27 + ALIGN_3 + +.L37: + FLD (X) + addl INCX, X + FLD (Y) + fmulp %st, %st(1) + addl INCY, Y + faddp %st, %st(1) + decl %eax + jg .L37 + ALIGN_3 + +.L27: + faddp %st,%st(2) + faddp %st,%st(2) + faddp %st,%st(1) + + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86_64/qgemm_kernel_2x2.S b/kernel/x86_64/qgemm_kernel_2x2.S new file mode 100644 index 0000000000..9db145b9fa --- /dev/null +++ b/kernel/x86_64/qgemm_kernel_2x2.S @@ -0,0 +1,810 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
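The qdot kernel above accumulates the extended-precision dot product in four interleaved x87 partial sums (faddp into %st(1)..%st(4)) and folds them before returning. An equivalent scalar sketch using long double (illustrative helper, positive increments assumed for brevity):

    static long double ref_qdot(long n, const long double *x, long incx,
                                const long double *y, long incy) {
        long double s0 = 0, s1 = 0, s2 = 0, s3 = 0;
        long i = 0;
        if (n <= 0) return 0;
        for (; i + 3 < n; i += 4) {            /* four-way unrolling, as in .L16/.L31 */
            s0 += x[(i + 0) * incx] * y[(i + 0) * incy];
            s1 += x[(i + 1) * incx] * y[(i + 1) * incy];
            s2 += x[(i + 2) * incx] * y[(i + 2) * incy];
            s3 += x[(i + 3) * incx] * y[(i + 3) * incy];
        }
        for (; i < n; i++)
            s0 += x[i * incx] * y[i * incy];
        return s0 + s1 + s2 + s3;
    }

Keeping several independent accumulators hides the latency of the x87 add unit; the same idea recurs in the nrm2 kernels earlier in this patch.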
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 +#define N ARG2 +#define K ARG3 +#define A ARG4 +#define B ARG5 +#define C ARG6 +#define LDC %r10 + +#define I %r12 +#define J %r13 +#define AO %r14 +#define BO %r15 +#define CO %rbp + +#define KK %r11 +#define KKK 48(%rsp) + +#define STACKSIZE 64 + +#define ALPHA 8 + STACKSIZE(%rsp) +#define OFFSET 32 + STACKSIZE(%rsp) + +#ifdef OPTERON +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#else +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#define PREFETCHSIZE (5 + 4 * 10) + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + movq 24 + STACKSIZE(%rsp), LDC + +#if defined(TRMMKERNEL) && !defined(LEFT) + movq OFFSET, %rax + negq %rax + movq %rax, KK +#endif + + addq $8 * SIZE, A + addq $8 * SIZE, B + + salq $BASE_SHIFT, LDC + + movq N, %rax + sarq $1, %rax + movq %rax, J + je .L30 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO + + movq C, CO + leaq (, LDC, 2), %rax + addq %rax, C + + movq M, I + sarq $1, I + je .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + + fldz + fldz + fldz + fldz + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(CO) + prefetchw 2 * SIZE(CO, LDC, 1) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(CO) + prefetchnta 2 * SIZE(CO, LDC, 1) +#endif + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(3) + faddp %st, %st(3) 
+ + FLD -6 * SIZE(AO) + + FLD -6 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -5 * SIZE(BO) + fmul %st, %st(2) + + FLD -5 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(3) + faddp %st, %st(3) + + PREFETCH (PREFETCHSIZE + 4) * SIZE(AO) + + FLD -4 * SIZE(AO) + + FLD -4 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -3 * SIZE(BO) + fmul %st, %st(2) + + FLD -3 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(3) + faddp %st, %st(3) + + FLD -2 * SIZE(AO) + + FLD -2 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -1 * SIZE(BO) + fmul %st, %st(2) + + FLD -1 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(3) + faddp %st, %st(3) + + addq $8 * SIZE,AO + addq $8 * SIZE,BO + + decq %rax + jne .L12 + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + and $3, %rax + je .L18 + ALIGN_4 + +.L16: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(3) + faddp %st, %st(3) + + addq $2 * SIZE,AO + addq $2 * SIZE,BO + + decq %rax + jne .L16 + ALIGN_4 + +.L18: +#ifndef TRMMKERNEL + FLD ALPHA + + fmul %st, %st(1) + fmul %st, %st(2) + fmul %st, %st(3) + fmulp %st, %st(4) + + FLD 0 * SIZE(CO) + faddp %st, %st(1) + FST 0 * SIZE(CO) + + FLD 1 * SIZE(CO) + faddp %st, %st(1) + FST 1 * SIZE(CO) + + FLD 0 * SIZE(CO, LDC) + faddp %st, %st(1) + FST 0 * SIZE(CO, LDC) + + FLD 1 * SIZE(CO, LDC) + faddp %st, %st(1) + FST 1 * SIZE(CO, LDC) +#else + FST 0 * SIZE(CO) + FST 1 * SIZE(CO) + FST 0 * SIZE(CO, LDC) + FST 1 * SIZE(CO, LDC) +#endif + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO + decq I + jne .L11 + ALIGN_4 + +.L20: + movq M, %rax + andq $1, %rax + je .L29 + ALIGN_4 + +.L21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq ( B, %rax, 2), BO +#endif + + fldz + fldz + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + + FLD -6 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -5 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -6 * SIZE(AO) + + FLD -4 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -3 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -5 * SIZE(AO) + + FLD -2 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -1 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addq $4 * SIZE,AO + addq $8 * SIZE,BO + + 
decq %rax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + and $3, %rax + je .L28 + ALIGN_4 + +.L26: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addq $1 * SIZE,AO + addq $2 * SIZE,BO + + decq %rax + jne .L26 + ALIGN_4 + +.L28: +#ifndef TRMMKERNEL + FLD ALPHA + + fmul %st, %st(1) + fmulp %st, %st(2) + + FLD 0 * SIZE(CO) + faddp %st, %st(1) + FST 0 * SIZE(CO) + + FLD 0 * SIZE(CO, LDC) + faddp %st, %st(1) + FST 0 * SIZE(CO, LDC) +#else + FST 0 * SIZE(CO) + FST 0 * SIZE(CO, LDC) +#endif + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO + ALIGN_4 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + movq BO, B + decq J + jne .L01 + ALIGN_4 + +.L30: + movq N, %rax + testq $1, %rax + je .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO + + movq C, CO + addq LDC, C + + movq M, I + sarq $1, I + je .L40 + ALIGN_4 + +.L31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq ( B, %rax, 1), BO +#endif + + fldz + fldz + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(CO) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(CO) +#endif + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + je .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(BO) + FLD -8 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + FLD -6 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -5 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -6 * SIZE(BO) + FLD -4 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -3 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -5 * SIZE(BO) + FLD -2 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -1 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addq $8 * SIZE,AO + addq $4 * SIZE,BO + + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + and $3, %rax + je .L38 + ALIGN_4 + +.L36: + FLD -8 * SIZE(BO) + + FLD -8 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addq $2 * SIZE,AO + addq $1 * SIZE,BO + + decq %rax + jne .L36 + ALIGN_4 + +.L38: +#ifndef TRMMKERNEL + FLD ALPHA + + fmul %st, %st(1) + fmulp %st, %st(2) + + FLD 0 * SIZE(CO) + faddp %st, %st(1) + FST 0 * SIZE(CO) + + FLD 1 * SIZE(CO) + faddp %st, %st(1) + FST 1 * SIZE(CO) +#else + FST 0 * SIZE(CO) + FST 1 * SIZE(CO) +#endif + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $BASE_SHIFT, %rax + leaq 
(AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO + decq I + jne .L31 + ALIGN_4 + +.L40: + movq M, %rax + andq $1, %rax + je .L49 + ALIGN_4 + +.L41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq ( B, %rax, 1), BO +#endif + + fldz + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + je .L45 + ALIGN_4 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + FLD -8 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -7 * SIZE(AO) + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -6 * SIZE(AO) + FLD -6 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -5 * SIZE(AO) + FLD -5 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + addq $4 * SIZE,AO + addq $4 * SIZE,BO + + decq %rax + jne .L42 + ALIGN_4 + +.L45: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + and $3, %rax + je .L48 + ALIGN_4 + +.L46: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + addq $1 * SIZE,AO + addq $1 * SIZE,BO + + decq %rax + jne .L46 + ALIGN_4 + +.L48: +#ifndef TRMMKERNEL + FLD ALPHA + + fmulp %st, %st(1) + + FLD 0 * SIZE(CO) + faddp %st, %st(1) + FST 0 * SIZE(CO) +#else + FST 0 * SIZE(CO) +#endif + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO + ALIGN_4 + +.L49: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $1, KK +#endif + + movq BO, B + ALIGN_4 + +.L999: + EMMS + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/qgemv_n.S b/kernel/x86_64/qgemv_n.S new file mode 100644 index 0000000000..28415ecb1a --- /dev/null +++ b/kernel/x86_64/qgemv_n.S @@ -0,0 +1,410 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "l2param.h" + +#define P 32 + +#define STACKSIZE 80 + +#define ALPHA 8 + STACKSIZE(%rsp) +#define OLD_INCX 24 + STACKSIZE(%rsp) +#define OLD_Y 32 + STACKSIZE(%rsp) +#define OLD_INCY 40 + STACKSIZE(%rsp) +#define BUFFER 48 + STACKSIZE(%rsp) + +#define PLDA_M 56 (%rsp) +#define IS 64 (%rsp) + +#define M %rdi +#define N %rsi +#define A %rcx +#define LDA %r8 +#define X %r9 +#define INCX %rdx +#define Y %rbp +#define INCY %r10 + +#define TEMP %rax +#define I %rax +#define J %r11 +#define A1 %r12 +#define X1 %r13 +#define Y1 %r14 +#define XP %r15 +/* #define BUFFER %r15 */ +#define MIN_N %rbx + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + movq OLD_INCX, INCX + movq OLD_Y, Y + movq OLD_INCY, INCY + + FLD ALPHA + + salq $BASE_SHIFT, INCX + salq $BASE_SHIFT, INCY + + movq $0, IS + + test M, M + jle .L79 # goto END + test N, N + jle .L79 # goto END + + movq LDA, %rax + imulq $P, %rax # P * lda + subq M ,%rax # P * lda - m + salq $BASE_SHIFT, %rax + movq %rax, PLDA_M + + salq $BASE_SHIFT, LDA + ALIGN_2 + +.L32: + movq $P, %rax + movq N, MIN_N + subq IS, MIN_N + cmpq %rax, MIN_N + cmovg %rax, MIN_N + + movq IS, XP + salq $BASE_SHIFT, XP + leaq (X,XP, 1), XP + + cmpq $SIZE, INCX + je .L34 # if incx == 1 goto L34 + + movq BUFFER, XP + movq XP, X1 + + movq MIN_N, I + sarq $2,I + jle .L35 + ALIGN_2 + +.L36: + FLD (X) + addq INCX,X + FLD (X) + addq INCX,X + FLD (X) + addq INCX,X + FLD (X) + addq INCX,X + + FST 3 * SIZE(X1) + FST 2 * SIZE(X1) + FST 1 * SIZE(X1) + FST 0 * SIZE(X1) + + addq $4 * SIZE, X1 + decq I + jg .L36 + ALIGN_3 + +.L35: + movq MIN_N, I + andq $3, I + jle .L34 + ALIGN_2 + +.L42: + FLD (X) + addq INCX, X + FST (X1) + addq $SIZE, X1 + decq I + jg .L42 + ALIGN_3 + +/* Main Routine */ +.L34: + movq Y, Y1 + movq M, J + sarq $2, J + jle .L47 + ALIGN_2 + +.L48: + movq A, A1 # a_offset = a + fldz + addq $4 * SIZE, A # a += 4 + fldz + movq XP, X1 # b_offset = xp + fldz + movq MIN_N, I # i = min_n + fldz + FLD (X1) # bt1 = b_offset + sarq $1, I + jle .L51 + ALIGN_2 + +.L80: + FLD 0 * SIZE(A1) # at1 = *(a_offset + 0) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + FLD 1 * SIZE(A1) # at1 = *(a_offset + 1) + + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(3) # ct2 += at1 + FLD 2 * SIZE(A1) # at1 = *(a_offset + 2) + + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + FLD 3 * SIZE(A1) # bt1 *= *(a_offset + 3) + + fmulp %st, %st(1) + faddp %st, %st(4) # ct4 += at1 + FLD 1 
* SIZE(X1) # bt1 = b_offset + + addq LDA, A1 # a_offset += lda + FLD 0 * SIZE(A1) # at1 = *(a_offset + 0) + + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + FLD 1 * SIZE(A1) # at1 = *(a_offset + 1) + + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(3) # ct2 += at1 + FLD 2 * SIZE(A1) # at1 = *(a_offset + 2) + + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + FLD 3 * SIZE(A1) # bt1 *= *(a_offset + 3) + + fmulp %st, %st(1) + addq LDA, A1 + faddp %st, %st(4) # ct4 += at1 + + FLD 2 * SIZE(X1) # bt1 = b_offset + addq $2 * SIZE, X1 # b_offset += 2 + + decq I + jg .L80 + +.L51: + movq MIN_N, I + andq $1, I + je .L57 + + FLD 0 * SIZE(A1) # at1 = *(a_offset + 0) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + + FLD 1 * SIZE(A1) # at1 = *(a_offset + 1) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(3) # ct2 += at1 + + FLD 2 * SIZE(A1) # at1 = *(a_offset + 2) + fmul %st(1), %st # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + + FLD 3 * SIZE(A1) # bt1 *= *(a_offset + 3) + fmulp %st, %st(1) + faddp %st, %st(4) # ct4 += at1 + fldz + ALIGN_2 + +.L57: + ffreep %st(0) + + fxch %st(4) + fmul %st, %st(4) + fmul %st, %st(1) + fmul %st, %st(2) + fmul %st, %st(3) + fxch %st(4) + + FLD (Y1) + faddp %st, %st(1) + FST (Y1) + addq INCY, Y1 + + FLD (Y1) + faddp %st, %st(1) + FST (Y1) + addq INCY, Y1 + + FLD (Y1) + faddp %st, %st(1) + FST (Y1) + addq INCY, Y1 + + FLD (Y1) + faddp %st, %st(1) + FST (Y1) + addq INCY, Y1 + + decq J # j -- + jg .L48 + ALIGN_3 + +.L47: + movq M, J + andq $3, J # j = (m & 3) + jle .L60 + ALIGN_2 + +.L61: + movq A, A1 # a_offset = a + fldz + addq $SIZE, A # a++ + fldz + movq XP, X1 + fldz + fldz + movq MIN_N, I + sarq $3, I + jle .L64 + ALIGN_2 + +.L65: + FLD 0 * SIZE(X1) + FLD (A1) + fmulp %st, %st(1) + faddp %st, %st(1) + addq LDA, A1 + + FLD 1 * SIZE(X1) + FLD (A1) + fmulp %st, %st(1) + faddp %st, %st(2) + addq LDA ,A1 + + FLD 2 * SIZE(X1) + FLD (A1) + fmulp %st, %st(1) + faddp %st, %st(3) + addq LDA, A1 + + FLD 3 * SIZE(X1) + FLD (A1) + fmulp %st, %st(1) + faddp %st, %st(4) + addq LDA, A1 + + FLD 4 * SIZE(X1) + FLD (A1) + fmulp %st, %st(1) + faddp %st,%st(1) + addq LDA, A1 + + FLD 5 * SIZE(X1) + FLD (A1) + fmulp %st, %st(1) + faddp %st, %st(2) + addq LDA, A1 + + FLD 6 * SIZE(X1) + FLD (A1) + fmulp %st, %st(1) + faddp %st,%st(3) + addq LDA, A1 + + FLD 7 * SIZE(X1) + FLD (A1) + fmulp %st, %st(1) + faddp %st,%st(4) + addq LDA, A1 + + addq $8 * SIZE, X1 + decq I + jg .L65 + +.L64: + movq MIN_N,I + andq $7, I + jle .L70 + ALIGN_2 + +.L71: + FLD (X1) + addq $SIZE, X1 + FLD (A1) + fmulp %st, %st(1) + addq LDA, A1 # a_offset += lda + faddp %st, %st(1) + decq I + jg .L71 + ALIGN_2 + +.L70: + faddp %st, %st(1) + faddp %st, %st(1) + faddp %st, %st(1) + + fmul %st(1), %st + FLD (Y1) + faddp %st, %st(1) + FST (Y1) + addq INCY, Y1 + decq J + jg .L61 + +.L60: + addq PLDA_M, A + addq $P, IS + cmpq N, IS + jl .L32 + +.L79: + EMMS + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + addq $STACKSIZE, %rsp + ret + EPILOGUE diff --git a/kernel/x86_64/qgemv_t.S b/kernel/x86_64/qgemv_t.S new file mode 100644 index 0000000000..9402f21a9c --- /dev/null +++ b/kernel/x86_64/qgemv_t.S @@ -0,0 +1,466 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "l2param.h" + +#define STACKSIZE 80 +#define P 4096 + +#define ALPHA 8 + STACKSIZE(%rsp) +#define OLD_INCX 24 + STACKSIZE(%rsp) +#define OLD_Y 32 + STACKSIZE(%rsp) +#define OLD_INCY 40 + STACKSIZE(%rsp) +#define BUFFER 48 + STACKSIZE(%rsp) + +#define NLDA 56 (%rsp) +#define IS 64 (%rsp) +#define XP 72 (%rsp) + +#define M %rdi +#define N %rsi +#define A %rcx +#define LDA %r8 +#define X %r9 +#define INCX %rdx +#define Y %rbp +#define INCY %r10 + +#define TEMP %rax +#define I %rax +#define J %r11 +#define A1 %r12 +#define A2 %r15 +#define X1 %r13 +#define Y1 %r14 +#define MIN_M %rbx + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + movq OLD_INCX, INCX + movq OLD_Y, Y + movq OLD_INCY, INCY + + FLD ALPHA + + salq $BASE_SHIFT, INCX + salq $BASE_SHIFT, INCY + + movq $0, IS + + test M, M + jle .L79 # goto END + test N, N + jle .L79 # goto END + + movq N, %rax + imulq LDA, %rax + movq $P, NLDA + subq %rax, NLDA + salq $BASE_SHIFT, NLDA + + salq $BASE_SHIFT, LDA + ALIGN_2 + +.L32: + movq $P, %rax + movq M, MIN_M + subq IS , MIN_M + cmpq %rax, MIN_M + cmovg %rax, MIN_M + + movq IS, X1 + salq $BASE_SHIFT, X1 + leaq (X,X1, 1), X1 + + movq X1, XP + + cmpq $SIZE, INCX + je .L34 + + movq BUFFER, X1 + movq X1, XP + + movq MIN_M, I + sarq $2, I + jle .L35 + ALIGN_3 + +.L36: + FLD (X) + addq INCX, X + FST 0 * SIZE(X1) + + FLD (X) + addq INCX, X + FST 1 * SIZE(X1) + + FLD (X) + addq INCX, X + FST 2 * SIZE(X1) + + FLD (X) + addq INCX, X + FST 3 * SIZE(X1) + + addq $4 * SIZE, X1 + decq I + jg .L36 + ALIGN_3 + +.L35: + movq MIN_M, I + andq $3,I + jle .L34 + ALIGN_2 + +.L42: + FLD (X) + 
addq INCX, X + FST (X1) + addq $SIZE, X1 + decq I + jg .L42 + ALIGN_3 + +/* Main Routine */ + +.L34: + movq Y, Y1 # coffset = y + + movq N, J + sarq $2, J + jle .L47 + ALIGN_3 + +.L48: + movq A, A1 + leaq (A, LDA), A2 + leaq (A, LDA, 4), A + + fldz + fldz + fldz + fldz + + movq XP, X1 + FLD (X1) + + movq MIN_M, I + sarq $2,I + jle .L51 + ALIGN_3 + +.L80: + FLD 0 * SIZE(A1) # at = *(a_offset + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + + faddp %st,%st(2) # ct1 += at1 + FLD 0 * SIZE(A2) # at1 = *(a_offset2 + 0 * lda) + + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(3) # ct2 += at1 + + FLD 0 * SIZE(A1, LDA, 2) # at = *(a_offset + 2 * lda) + fmul %st(1),%st + + faddp %st,%st(4) + FLD 0 * SIZE(A2, LDA, 2) # at1 = *(a_offset2 + 2 * lda) + fmulp %st, %st(1) + + faddp %st,%st(4) + FLD 1 * SIZE(X1) + FLD 1 * SIZE(A1) # at = *(a_offset + 0 * lda) + + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(2) # ct1 += at1 + FLD 1 * SIZE(A2) # at1 = *(a_offset2 + 0 * lda) + + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(3) # ct2 += at1 + FLD 1 * SIZE(A1, LDA, 2) # at = *(a_offset + 2 * lda) + + fmul %st(1),%st + faddp %st,%st(4) + FLD 1 * SIZE(A2, LDA, 2) # at1 = *(a_offset2 + 2 * lda) + + fmulp %st, %st(1) + faddp %st,%st(4) + FLD 2 * SIZE(X1) + + FLD 2 * SIZE(A1) # at = *(a_offset + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(2) # ct1 += at1 + + FLD 2 * SIZE(A2) # at1 = *(a_offset2 + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(3) # ct2 += at1 + + FLD 2 * SIZE(A1, LDA, 2) # at = *(a_offset + 2 * lda) + fmul %st(1),%st + faddp %st,%st(4) + + FLD 2 * SIZE(A2, LDA, 2) # at1 = *(a_offset2 + 2 * lda) + fmulp %st, %st(1) + faddp %st,%st(4) + + FLD 3 * SIZE(X1) + FLD 3 * SIZE(A1) # at = *(a_offset + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + + faddp %st,%st(2) # ct1 += at1 + FLD 3 * SIZE(A2) # at1 = *(a_offset2 + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + + faddp %st,%st(3) # ct2 += at1 + FLD 3 * SIZE(A1, LDA, 2) # at = *(a_offset + 2 * lda) + fmul %st(1),%st + + faddp %st,%st(4) + FLD 3 * SIZE(A2, LDA, 2) # at1 = *(a_offset2 + 2 * lda) + fmulp %st, %st(1) + + addq $4 * SIZE, A1 + faddp %st,%st(4) + addq $4 * SIZE, A2 + + FLD 4 * SIZE(X1) + addq $4 * SIZE, X1 + + decq I + jg .L80 + ALIGN_3 + +.L51: + movq MIN_M, I + andq $3, I + je .L81 + ALIGN_3 + +.L52: + FLD (A1) # at = *(a_offset + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(2) # ct1 += at1 + + FLD (A2) # at1 = *(a_offset2 + 0 * lda) + fmul %st(1),%st # at1 *= bt1 + faddp %st,%st(3) # ct2 += at1 + + FLD (A1, LDA, 2) # at = *(a_offset + 2 * lda) + fmul %st(1),%st + faddp %st,%st(4) + + FLD (A2, LDA, 2) # at1 = *(a_offset2 + 2 * lda) + fmulp %st, %st(1) + faddp %st,%st(4) + FLD 1 * SIZE(X1) + + addq $SIZE, A1 + addq $SIZE, A2 + addq $SIZE, X1 + decq I + jg .L52 + ALIGN_3 + +.L81: + ffreep %st(0) + + fxch %st(4) + fmul %st, %st(4) + fmul %st, %st(1) + fmul %st, %st(2) + fmul %st, %st(3) + fxch %st(4) + + FLD (Y1) + faddp %st, %st(1) + FST (Y1) + addq INCY, Y1 + + FLD (Y1) + faddp %st, %st(1) + FST (Y1) + addq INCY, Y1 + + FLD (Y1) + faddp %st, %st(1) + FST (Y1) + addq INCY, Y1 + + FLD (Y1) + faddp %st, %st(1) + FST (Y1) + addq INCY, Y1 + + decq J + jg .L48 + ALIGN_3 + +.L47: + movq N, J + andq $3, J + jle .L60 + ALIGN_2 + +.L61: + movq A, A1 # a_offset = a + fldz # ct1 = ZERO + fldz # ct1 = ZERO + + addq LDA, A + fldz # ct1 = ZERO + fldz # ct1 = ZERO + + movq XP, X1 + + movq MIN_M, I + sarq $3,I + jle .L64 + ALIGN_3 + +.L65: + FLD 0 * SIZE(X1) + FLD 0 * SIZE(A1) + fmulp %st, %st(1) + faddp %st,%st(1) + + FLD 1 * SIZE(X1) + FLD 1 * 
SIZE(A1) + fmulp %st, %st(1) + faddp %st,%st(2) + + FLD 2 * SIZE(X1) + FLD 2 * SIZE(A1) + fmulp %st, %st(1) + faddp %st,%st(3) + + FLD 3 * SIZE(X1) + FLD 3 * SIZE(A1) + fmulp %st, %st(1) + faddp %st,%st(4) + + FLD 4 * SIZE(X1) + FLD 4 * SIZE(A1) + fmulp %st, %st(1) + faddp %st,%st(1) + + FLD 5 * SIZE(X1) + FLD 5 * SIZE(A1) + fmulp %st, %st(1) + faddp %st,%st(2) + + FLD 6 * SIZE(X1) + FLD 6 * SIZE(A1) + fmulp %st, %st(1) + faddp %st,%st(3) + + FLD 7 * SIZE(X1) + FLD 7 * SIZE(A1) + fmulp %st, %st(1) + faddp %st,%st(4) + + addq $8 * SIZE, X1 + addq $8 * SIZE, A1 + + decq I + jg .L65 + ALIGN_3 + +.L64: + movq MIN_M, I + andq $7, I + jle .L70 + ALIGN_3 + +.L71: + FLD (X1) + FLD (A1) + fmulp %st, %st(1) + faddp %st,%st(1) + + addq $SIZE, X1 + addq $SIZE, A1 + decq I + jg .L71 + ALIGN_3 + +.L70: + faddp %st, %st(1) + faddp %st, %st(1) + faddp %st, %st(1) + + fmul %st(1),%st + FLD (Y1) + faddp %st, %st(1) + FST (Y1) + addq INCY, Y1 + decq J + jg .L61 + ALIGN_3 + +.L60: + addq NLDA, A + + addq $P, IS + cmpq M, IS + jl .L32 + ALIGN_3 + +.L79: + EMMS + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + addq $STACKSIZE, %rsp + ret + EPILOGUE diff --git a/kernel/x86_64/qtrsm_kernel_LN_2x2.S b/kernel/x86_64/qtrsm_kernel_LN_2x2.S new file mode 100644 index 0000000000..7093ebae5f --- /dev/null +++ b/kernel/x86_64/qtrsm_kernel_LN_2x2.S @@ -0,0 +1,1234 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 +#define N ARG2 +#define K ARG3 +#define A ARG4 +#define B ARG5 +#define C ARG6 +#define LDC %r10 + +#define I %r12 +#define J %r13 +#define AO %r14 +#define BO %r15 +#define CO %rbp + +#define KK %r11 +#define AORIG 48(%rsp) + +#define STACKSIZE 64 + +#define ALPHA 8 + STACKSIZE(%rsp) +#define OFFSET 32 + STACKSIZE(%rsp) + +#ifdef OPTERON +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#else +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#define PREFETCHSIZE (5 + 4 * 10) + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + movq 24 + STACKSIZE(%rsp), LDC + +#if defined(TRMMKERNEL) && !defined(LEFT) + movq OFFSET, %rax + negq %rax + movq %rax, KK +#endif + + addq $8 * SIZE, A + addq $8 * SIZE, B + + salq $BASE_SHIFT, LDC + +#ifdef LN + movq M, %rax + salq $BASE_SHIFT, %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + movq N, %rax + salq $BASE_SHIFT, %rax + imulq K, %rax + addq %rax, B + + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + movq OFFSET, %rax + negq %rax + movq %rax, KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + movq N, %rax + sarq $1, %rax + movq %rax, J + je .L30 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, %rax + movq %rax, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B +#endif + + lea (, LDC, 2), %rax + +#ifdef RT + subq %rax, C +#endif + movq C, CO +#ifndef RT + addq %rax, C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, %rax + andq $1, %rax + je .L20 + ALIGN_4 + +.L21: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $BASE_SHIFT, %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + fldz + fldz + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + + FLD -6 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -5 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -6 * SIZE(AO) + + FLD -4 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -3 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -5 * SIZE(AO) + + FLD -2 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -1 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addq $4 * SIZE,AO + addq $8 * SIZE,BO + + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + and $3, %rax + je .L28 + ALIGN_4 + +.L26: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addq $1 * SIZE,AO + addq $2 * SIZE,BO + + decq %rax + jne .L26 + ALIGN_4 + +.L28: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax 
+#endif + + salq $BASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(BO) + fsubp %st, %st(1) + FLD -7 * SIZE(BO) + fsubp %st, %st(2) +#else + FLD -8 * SIZE(AO) + fsubp %st, %st(1) + FLD -7 * SIZE(AO) + fsubp %st, %st(2) +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(AO) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef RN + FLD -8 * SIZE(BO) + fmulp %st, %st(1) + + FLD -7 * SIZE(BO) + fmul %st(1), %st + + fsubrp %st, %st(2) + + FLD -5 * SIZE(BO) + fmulp %st, %st(2) +#endif + +#ifdef RT + FLD -5 * SIZE(BO) + fmulp %st, %st(2) + + FLD -6 * SIZE(BO) + fmul %st(2), %st + + fsubrp %st, %st(1) + + FLD -8 * SIZE(BO) + fmulp %st, %st(1) +#endif + +#ifdef LN + subq $1 * SIZE, CO +#endif + +#if defined(LN) || defined(LT) + fld %st + FST -8 * SIZE(BO) + fxch %st(1) + fld %st + FST -7 * SIZE(BO) +#else + fld %st + FST -8 * SIZE(AO) + fxch %st(1) + fld %st + FST -7 * SIZE(AO) +#endif + + FST 0 * SIZE(CO, LDC) + FST 0 * SIZE(CO) + +#ifndef LN + addq $1 * SIZE, CO +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L20: + movq M, I + sarq $1, I + je .L29 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $BASE_SHIFT, %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + fldz + fldz + fldz + fldz + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(CO) + prefetchw 2 * SIZE(CO, LDC, 1) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(CO) + prefetchnta 2 * SIZE(CO, LDC, 1) +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + FLD -6 * SIZE(AO) + + FLD -6 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -5 * SIZE(BO) + fmul %st, %st(2) + + FLD -5 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + PREFETCH (PREFETCHSIZE + 4) * SIZE(AO) + + FLD -4 * SIZE(AO) + + FLD -4 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -3 * SIZE(BO) + fmul %st, %st(2) + + FLD -3 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + FLD -2 * SIZE(AO) + + FLD -2 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -1 * SIZE(BO) + fmul %st, %st(2) + + FLD -1 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + addq $8 * SIZE,AO + addq $8 * SIZE,BO + + decq %rax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + and $3, %rax + je .L18 + ALIGN_4 + +.L16: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * 
SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + addq $2 * SIZE,AO + addq $2 * SIZE,BO + + decq %rax + jne .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + salq $BASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(BO) + fsubp %st, %st(1) + FLD -7 * SIZE(BO) + fsubp %st, %st(2) + FLD -6 * SIZE(BO) + fsubp %st, %st(3) + FLD -5 * SIZE(BO) + fsubp %st, %st(4) +#else + FLD -8 * SIZE(AO) + fsubp %st, %st(1) + FLD -7 * SIZE(AO) + fsubp %st, %st(3) + FLD -6 * SIZE(AO) + fsubp %st, %st(2) + FLD -5 * SIZE(AO) + fsubp %st, %st(4) +#endif + +#ifdef LN + FLD -5 * SIZE(AO) + fmul %st, %st(3) + fmulp %st, %st(4) + + FLD -6 * SIZE(AO) + fmul %st(3), %st + FLD -6 * SIZE(AO) + fmul %st(5), %st + + fsubrp %st, %st(3) + fsubrp %st, %st(1) + + FLD -8 * SIZE(AO) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef LT + FLD -8 * SIZE(AO) + fmul %st, %st(1) + fmulp %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st(1), %st + FLD -7 * SIZE(AO) + fmul %st(3), %st + + fsubrp %st, %st(5) + fsubrp %st, %st(3) + + FLD -5 * SIZE(AO) + fmul %st, %st(3) + fmulp %st, %st(4) +#endif + +#ifdef RN + FLD -8 * SIZE(BO) + fmul %st, %st(1) + fmulp %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st(1), %st + FLD -7 * SIZE(BO) + fmul %st(4), %st + + fsubrp %st, %st(5) + fsubrp %st, %st(2) + + FLD -5 * SIZE(BO) + fmul %st, %st(2) + fmulp %st, %st(4) +#endif + +#ifdef RT + FLD -5 * SIZE(BO) + fmul %st, %st(2) + fmulp %st, %st(4) + + FLD -6 * SIZE(BO) + fmul %st(2), %st + FLD -6 * SIZE(BO) + fmul %st(5), %st + + fsubrp %st, %st(4) + fsubrp %st, %st(1) + + FLD -8 * SIZE(BO) + fmul %st, %st(1) + fmulp %st, %st(3) +#endif + +#ifdef LN + subq $2 * SIZE, CO +#endif + +#if defined(LN) || defined(LT) + fld %st + FST -8 * SIZE(BO) + fxch %st(1) + fld %st + FST -7 * SIZE(BO) + fxch %st(2) + fld %st + FST -6 * SIZE(BO) + fxch %st(3) + fld %st + FST -5 * SIZE(BO) + + FST 1 * SIZE(CO, LDC) + FST 0 * SIZE(CO) + FST 0 * SIZE(CO, LDC) + FST 1 * SIZE(CO) +#else + fld %st + FST -8 * SIZE(AO) + fxch %st(2) + fld %st + FST -7 * SIZE(AO) + fxch %st(1) + fld %st + FST -6 * SIZE(AO) + fxch %st(3) + fld %st + FST -5 * SIZE(AO) + + FST 1 * SIZE(CO, LDC) + FST 1 * SIZE(CO) + FST 0 * SIZE(CO) + FST 0 * SIZE(CO, LDC) +#endif + +#ifndef LN + addq $2 * SIZE, CO +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + jne .L11 + ALIGN_4 + +.L29: +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + + decq J + jne .L01 + ALIGN_4 + +.L30: + movq N, %rax + testq $1, %rax + je .L999 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, %rax + movq %rax, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, B +#endif + +#ifdef RT + subq LDC, C +#endif + movq C, CO +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, %rax + andq $1, %rax + je .L40 + ALIGN_4 + +.L41: 
+#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $BASE_SHIFT, %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + fldz + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L45 + ALIGN_4 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + FLD -8 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -7 * SIZE(AO) + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -6 * SIZE(AO) + FLD -6 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -5 * SIZE(AO) + FLD -5 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + addq $4 * SIZE,AO + addq $4 * SIZE,BO + + decq %rax + jne .L42 + ALIGN_4 + +.L45: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + and $3, %rax + je .L48 + ALIGN_4 + +.L46: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + addq $1 * SIZE,AO + addq $1 * SIZE,BO + + decq %rax + jne .L46 + ALIGN_4 + +.L48: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + + salq $BASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(BO) + fsubp %st, %st(1) +#else + FLD -8 * SIZE(AO) + fsubp %st, %st(1) +#endif + +#ifdef LN + FLD -8 * SIZE(AO) + fmulp %st, %st(1) +#endif + +#ifdef LT + FLD -8 * SIZE(AO) + fmulp %st, %st(1) +#endif + +#ifdef RN + FLD -8 * SIZE(BO) + fmulp %st, %st(1) +#endif + +#ifdef RT + FLD -8 * SIZE(BO) + fmulp %st, %st(1) +#endif + +#ifdef LN + subq $1 * SIZE, CO +#endif + +#if defined(LN) || defined(LT) + fld %st + FST -8 * SIZE(BO) +#else + fld %st + FST -8 * SIZE(AO) +#endif + + FST 0 * SIZE(CO) + +#ifndef LN + addq $1 * SIZE, CO +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L40: + movq M, I + sarq $1, I + je .L49 + ALIGN_4 + +.L31: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $BASE_SHIFT, %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + fldz + fldz + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(CO) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(CO) +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(BO) + FLD -8 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + FLD -6 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -5 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -6 * SIZE(BO) + FLD -4 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -3 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -5 * SIZE(BO) + FLD -2 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -1 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addq $8 * SIZE,AO + addq $4 * SIZE,BO + + decq %rax + jne .L32 + ALIGN_4 + +.L35: 
+#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + and $3, %rax + je .L38 + ALIGN_4 + +.L36: + FLD -8 * SIZE(BO) + + FLD -8 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addq $2 * SIZE,AO + addq $1 * SIZE,BO + + decq %rax + jne .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + salq $BASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(BO) + fsubp %st, %st(1) + FLD -7 * SIZE(BO) + fsubp %st, %st(2) +#else + FLD -8 * SIZE(AO) + fsubp %st, %st(1) + FLD -7 * SIZE(AO) + fsubp %st, %st(2) +#endif + +#ifdef LN + FLD -5 * SIZE(AO) + fmulp %st, %st(2) + + FLD -6 * SIZE(AO) + fmul %st(2), %st + + fsubrp %st, %st(1) + FLD -8 * SIZE(AO) + fmulp %st, %st(1) +#endif + +#ifdef LT + FLD -8 * SIZE(AO) + fmulp %st, %st(1) + + FLD -7 * SIZE(AO) + fmul %st(1), %st + + fsubrp %st, %st(2) + + FLD -5 * SIZE(AO) + fmulp %st, %st(2) +#endif + +#ifdef RN + FLD -8 * SIZE(BO) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef RT + FLD -8 * SIZE(BO) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef LN + subq $2 * SIZE, CO +#endif + +#if defined(LN) || defined(LT) + fld %st + FST -8 * SIZE(BO) + fxch %st(1) + fld %st + FST -7 * SIZE(BO) +#else + fld %st + FST -8 * SIZE(AO) + fxch %st(1) + fld %st + FST -7 * SIZE(AO) +#endif + + FST 1 * SIZE(CO) + FST 0 * SIZE(CO) + +#ifndef LN + addq $2 * SIZE, CO +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + jne .L31 + ALIGN_4 + +.L49: +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + leaq (B, %rax, 1), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/qtrsm_kernel_LT_2x2.S b/kernel/x86_64/qtrsm_kernel_LT_2x2.S new file mode 100644 index 0000000000..d2a05a11ec --- /dev/null +++ b/kernel/x86_64/qtrsm_kernel_LT_2x2.S @@ -0,0 +1,1234 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 +#define N ARG2 +#define K ARG3 +#define A ARG4 +#define B ARG5 +#define C ARG6 +#define LDC %r10 + +#define I %r12 +#define J %r13 +#define AO %r14 +#define BO %r15 +#define CO %rbp + +#define KK %r11 +#define AORIG 48(%rsp) + +#define STACKSIZE 64 + +#define ALPHA 8 + STACKSIZE(%rsp) +#define OFFSET 32 + STACKSIZE(%rsp) + +#ifdef OPTERON +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#else +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#define PREFETCHSIZE (5 + 4 * 10) + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + movq 24 + STACKSIZE(%rsp), LDC + +#if defined(TRMMKERNEL) && !defined(LEFT) + movq OFFSET, %rax + negq %rax + movq %rax, KK +#endif + + addq $8 * SIZE, A + addq $8 * SIZE, B + + salq $BASE_SHIFT, LDC + +#ifdef LN + movq M, %rax + salq $BASE_SHIFT, %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + movq N, %rax + salq $BASE_SHIFT, %rax + imulq K, %rax + addq %rax, B + + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + movq OFFSET, %rax + negq %rax + movq %rax, KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + movq N, %rax + sarq $1, %rax + movq %rax, J + je .L30 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, %rax + movq %rax, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B +#endif + + lea (, LDC, 2), %rax + +#ifdef RT + subq %rax, C +#endif + movq C, CO +#ifndef RT + addq %rax, C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $1, I + je .L20 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $BASE_SHIFT, %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + fldz + fldz + fldz + fldz + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(CO) + prefetchw 2 * SIZE(CO, LDC, 1) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(CO) + prefetchnta 2 * SIZE(CO, LDC, 1) 
+#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + FLD -6 * SIZE(AO) + + FLD -6 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -5 * SIZE(BO) + fmul %st, %st(2) + + FLD -5 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + PREFETCH (PREFETCHSIZE + 4) * SIZE(AO) + + FLD -4 * SIZE(AO) + + FLD -4 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -3 * SIZE(BO) + fmul %st, %st(2) + + FLD -3 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + FLD -2 * SIZE(AO) + + FLD -2 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -1 * SIZE(BO) + fmul %st, %st(2) + + FLD -1 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + addq $8 * SIZE,AO + addq $8 * SIZE,BO + + decq %rax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + and $3, %rax + je .L18 + ALIGN_4 + +.L16: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + addq $2 * SIZE,AO + addq $2 * SIZE,BO + + decq %rax + jne .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + salq $BASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(BO) + fsubp %st, %st(1) + FLD -7 * SIZE(BO) + fsubp %st, %st(2) + FLD -6 * SIZE(BO) + fsubp %st, %st(3) + FLD -5 * SIZE(BO) + fsubp %st, %st(4) +#else + FLD -8 * SIZE(AO) + fsubp %st, %st(1) + FLD -7 * SIZE(AO) + fsubp %st, %st(3) + FLD -6 * SIZE(AO) + fsubp %st, %st(2) + FLD -5 * SIZE(AO) + fsubp %st, %st(4) +#endif + +#ifdef LN + FLD -5 * SIZE(AO) + fmul %st, %st(3) + fmulp %st, %st(4) + + FLD -6 * SIZE(AO) + fmul %st(3), %st + FLD -6 * SIZE(AO) + fmul %st(5), %st + + fsubrp %st, %st(3) + fsubrp %st, %st(1) + + FLD -8 * SIZE(AO) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef LT + FLD -8 * SIZE(AO) + fmul %st, %st(1) + fmulp %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st(1), %st + FLD -7 * SIZE(AO) + fmul %st(3), %st + + fsubrp %st, %st(5) + fsubrp %st, %st(3) + + FLD -5 * SIZE(AO) + fmul %st, %st(3) + fmulp %st, %st(4) +#endif + +#ifdef RN + FLD -8 * SIZE(BO) + fmul %st, %st(1) + fmulp %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st(1), %st + FLD -7 * SIZE(BO) + fmul %st(4), %st + + fsubrp %st, %st(5) + fsubrp %st, %st(2) + + FLD -5 * SIZE(BO) + fmul %st, %st(2) + fmulp %st, %st(4) +#endif + +#ifdef RT + FLD -5 * SIZE(BO) + fmul %st, %st(2) + fmulp %st, %st(4) + + FLD -6 * SIZE(BO) + fmul %st(2), %st + FLD -6 * SIZE(BO) + fmul %st(5), %st + + fsubrp %st, %st(4) + fsubrp %st, %st(1) + + FLD -8 * SIZE(BO) + fmul %st, %st(1) + fmulp %st, %st(3) +#endif + +#ifdef LN + subq $2 * SIZE, CO +#endif + +#if defined(LN) || defined(LT) 
+ fld %st + FST -8 * SIZE(BO) + fxch %st(1) + fld %st + FST -7 * SIZE(BO) + fxch %st(2) + fld %st + FST -6 * SIZE(BO) + fxch %st(3) + fld %st + FST -5 * SIZE(BO) + + FST 1 * SIZE(CO, LDC) + FST 0 * SIZE(CO) + FST 0 * SIZE(CO, LDC) + FST 1 * SIZE(CO) +#else + fld %st + FST -8 * SIZE(AO) + fxch %st(2) + fld %st + FST -7 * SIZE(AO) + fxch %st(1) + fld %st + FST -6 * SIZE(AO) + fxch %st(3) + fld %st + FST -5 * SIZE(AO) + + FST 1 * SIZE(CO, LDC) + FST 1 * SIZE(CO) + FST 0 * SIZE(CO) + FST 0 * SIZE(CO, LDC) +#endif + +#ifndef LN + addq $2 * SIZE, CO +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + jne .L11 + ALIGN_4 + +.L20: + movq M, %rax + andq $1, %rax + je .L29 + ALIGN_4 + +.L21: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $BASE_SHIFT, %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + fldz + fldz + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + + FLD -6 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -5 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -6 * SIZE(AO) + + FLD -4 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -3 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -5 * SIZE(AO) + + FLD -2 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -1 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addq $4 * SIZE,AO + addq $8 * SIZE,BO + + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + and $3, %rax + je .L28 + ALIGN_4 + +.L26: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addq $1 * SIZE,AO + addq $2 * SIZE,BO + + decq %rax + jne .L26 + ALIGN_4 + +.L28: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + salq $BASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(BO) + fsubp %st, %st(1) + FLD -7 * SIZE(BO) + fsubp %st, %st(2) +#else + FLD -8 * SIZE(AO) + fsubp %st, %st(1) + FLD -7 * SIZE(AO) + fsubp %st, %st(2) +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(AO) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef RN + FLD -8 * SIZE(BO) + fmulp %st, %st(1) + + FLD -7 * SIZE(BO) + fmul %st(1), %st + + fsubrp %st, %st(2) + + FLD -5 * SIZE(BO) + fmulp %st, %st(2) +#endif + +#ifdef RT + FLD -5 * SIZE(BO) + fmulp %st, %st(2) + + FLD -6 * SIZE(BO) + fmul %st(2), %st + + fsubrp %st, %st(1) + + FLD -8 * SIZE(BO) + fmulp %st, %st(1) +#endif + +#ifdef LN + subq $1 * SIZE, CO +#endif + +#if defined(LN) || defined(LT) + fld %st + FST -8 * SIZE(BO) + fxch %st(1) + fld %st + FST -7 * SIZE(BO) +#else + fld %st + FST -8 * SIZE(AO) + fxch %st(1) + fld %st + FST -7 * SIZE(AO) +#endif + + FST 0 * 
SIZE(CO, LDC) + FST 0 * SIZE(CO) + +#ifndef LN + addq $1 * SIZE, CO +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L29: +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + + decq J + jne .L01 + ALIGN_4 + +.L30: + movq N, %rax + testq $1, %rax + je .L999 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, %rax + movq %rax, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, B +#endif + +#ifdef RT + subq LDC, C +#endif + movq C, CO +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $1, I + je .L40 + ALIGN_4 + +.L31: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $BASE_SHIFT, %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + fldz + fldz + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(CO) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(CO) +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(BO) + FLD -8 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + FLD -6 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -5 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -6 * SIZE(BO) + FLD -4 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -3 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -5 * SIZE(BO) + FLD -2 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -1 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addq $8 * SIZE,AO + addq $4 * SIZE,BO + + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + and $3, %rax + je .L38 + ALIGN_4 + +.L36: + FLD -8 * SIZE(BO) + + FLD -8 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addq $2 * SIZE,AO + addq $1 * SIZE,BO + + decq %rax + jne .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + salq $BASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(BO) + fsubp %st, %st(1) + FLD -7 * SIZE(BO) + fsubp %st, %st(2) +#else + FLD -8 * SIZE(AO) + fsubp %st, %st(1) + FLD -7 * SIZE(AO) + fsubp %st, %st(2) +#endif + +#ifdef LN + FLD -5 * SIZE(AO) + fmulp %st, %st(2) + + FLD -6 * SIZE(AO) + fmul %st(2), %st + + fsubrp %st, %st(1) + FLD -8 * SIZE(AO) + fmulp %st, %st(1) +#endif + +#ifdef LT + FLD -8 * SIZE(AO) + fmulp %st, %st(1) + + FLD -7 * SIZE(AO) + fmul %st(1), %st + + fsubrp %st, %st(2) + + FLD -5 * SIZE(AO) + fmulp %st, %st(2) +#endif + +#ifdef RN + FLD -8 * SIZE(BO) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + 
+#ifdef RT + FLD -8 * SIZE(BO) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef LN + subq $2 * SIZE, CO +#endif + +#if defined(LN) || defined(LT) + fld %st + FST -8 * SIZE(BO) + fxch %st(1) + fld %st + FST -7 * SIZE(BO) +#else + fld %st + FST -8 * SIZE(AO) + fxch %st(1) + fld %st + FST -7 * SIZE(AO) +#endif + + FST 1 * SIZE(CO) + FST 0 * SIZE(CO) + +#ifndef LN + addq $2 * SIZE, CO +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + jne .L31 + ALIGN_4 + +.L40: + movq M, %rax + andq $1, %rax + je .L49 + ALIGN_4 + +.L41: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $BASE_SHIFT, %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + fldz + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L45 + ALIGN_4 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + FLD -8 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -7 * SIZE(AO) + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -6 * SIZE(AO) + FLD -6 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -5 * SIZE(AO) + FLD -5 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + addq $4 * SIZE,AO + addq $4 * SIZE,BO + + decq %rax + jne .L42 + ALIGN_4 + +.L45: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + and $3, %rax + je .L48 + ALIGN_4 + +.L46: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + addq $1 * SIZE,AO + addq $1 * SIZE,BO + + decq %rax + jne .L46 + ALIGN_4 + +.L48: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + + salq $BASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(BO) + fsubp %st, %st(1) +#else + FLD -8 * SIZE(AO) + fsubp %st, %st(1) +#endif + +#ifdef LN + FLD -8 * SIZE(AO) + fmulp %st, %st(1) +#endif + +#ifdef LT + FLD -8 * SIZE(AO) + fmulp %st, %st(1) +#endif + +#ifdef RN + FLD -8 * SIZE(BO) + fmulp %st, %st(1) +#endif + +#ifdef RT + FLD -8 * SIZE(BO) + fmulp %st, %st(1) +#endif + +#ifdef LN + subq $1 * SIZE, CO +#endif + +#if defined(LN) || defined(LT) + fld %st + FST -8 * SIZE(BO) +#else + fld %st + FST -8 * SIZE(AO) +#endif + + FST 0 * SIZE(CO) + +#ifndef LN + addq $1 * SIZE, CO +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L49: +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + leaq (B, %rax, 1), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/qtrsm_kernel_RT_2x2.S 
b/kernel/x86_64/qtrsm_kernel_RT_2x2.S new file mode 100644 index 0000000000..288aa07789 --- /dev/null +++ b/kernel/x86_64/qtrsm_kernel_RT_2x2.S @@ -0,0 +1,1234 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 +#define N ARG2 +#define K ARG3 +#define A ARG4 +#define B ARG5 +#define C ARG6 +#define LDC %r10 + +#define I %r12 +#define J %r13 +#define AO %r14 +#define BO %r15 +#define CO %rbp + +#define KK %r11 +#define AORIG 48(%rsp) + +#define STACKSIZE 64 + +#define ALPHA 8 + STACKSIZE(%rsp) +#define OFFSET 32 + STACKSIZE(%rsp) + +#ifdef OPTERON +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#else +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#define PREFETCHSIZE (5 + 4 * 10) + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + movq 24 + STACKSIZE(%rsp), LDC + +#if defined(TRMMKERNEL) && !defined(LEFT) + movq OFFSET, %rax + negq %rax + movq %rax, KK +#endif + + addq $8 * SIZE, A + addq $8 * SIZE, B + + salq $BASE_SHIFT, LDC + +#ifdef LN + movq M, %rax + salq $BASE_SHIFT, %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + movq N, %rax + salq $BASE_SHIFT, %rax + imulq K, %rax + addq %rax, B + + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + movq OFFSET, %rax + negq %rax + movq %rax, KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + movq N, %rax + testq $1, %rax + je .L30 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, %rax + movq %rax, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, B +#endif + +#ifdef RT + subq LDC, C +#endif + movq C, CO +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $1, I + je .L40 + ALIGN_4 + +.L31: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $BASE_SHIFT, %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + fldz + fldz + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(CO) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(CO) +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(BO) + FLD -8 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + FLD -6 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -5 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -6 * SIZE(BO) + FLD -4 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -3 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -5 * SIZE(BO) + FLD -2 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -1 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addq $8 * SIZE,AO + addq $4 * SIZE,BO + + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + and $3, %rax + je .L38 + ALIGN_4 + +.L36: + FLD -8 * SIZE(BO) + + FLD -8 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addq $2 * SIZE,AO + addq $1 * SIZE,BO + + decq %rax + jne .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + 
subq $2, %rax +#else + subq $1, %rax +#endif + + salq $BASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(BO) + fsubp %st, %st(1) + FLD -7 * SIZE(BO) + fsubp %st, %st(2) +#else + FLD -8 * SIZE(AO) + fsubp %st, %st(1) + FLD -7 * SIZE(AO) + fsubp %st, %st(2) +#endif + +#ifdef LN + FLD -5 * SIZE(AO) + fmulp %st, %st(2) + + FLD -6 * SIZE(AO) + fmul %st(2), %st + + fsubrp %st, %st(1) + FLD -8 * SIZE(AO) + fmulp %st, %st(1) +#endif + +#ifdef LT + FLD -8 * SIZE(AO) + fmulp %st, %st(1) + + FLD -7 * SIZE(AO) + fmul %st(1), %st + + fsubrp %st, %st(2) + + FLD -5 * SIZE(AO) + fmulp %st, %st(2) +#endif + +#ifdef RN + FLD -8 * SIZE(BO) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef RT + FLD -8 * SIZE(BO) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef LN + subq $2 * SIZE, CO +#endif + +#if defined(LN) || defined(LT) + fld %st + FST -8 * SIZE(BO) + fxch %st(1) + fld %st + FST -7 * SIZE(BO) +#else + fld %st + FST -8 * SIZE(AO) + fxch %st(1) + fld %st + FST -7 * SIZE(AO) +#endif + + FST 1 * SIZE(CO) + FST 0 * SIZE(CO) + +#ifndef LN + addq $2 * SIZE, CO +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + jne .L31 + ALIGN_4 + +.L40: + movq M, %rax + andq $1, %rax + je .L49 + ALIGN_4 + +.L41: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $BASE_SHIFT, %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + fldz + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L45 + ALIGN_4 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + FLD -8 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -7 * SIZE(AO) + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -6 * SIZE(AO) + FLD -6 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -5 * SIZE(AO) + FLD -5 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + addq $4 * SIZE,AO + addq $4 * SIZE,BO + + decq %rax + jne .L42 + ALIGN_4 + +.L45: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + and $3, %rax + je .L48 + ALIGN_4 + +.L46: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + addq $1 * SIZE,AO + addq $1 * SIZE,BO + + decq %rax + jne .L46 + ALIGN_4 + +.L48: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + + salq $BASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(BO) + fsubp %st, %st(1) +#else + FLD -8 * SIZE(AO) + fsubp %st, %st(1) +#endif + +#ifdef LN + FLD -8 * SIZE(AO) + fmulp %st, %st(1) +#endif + +#ifdef LT + FLD -8 * SIZE(AO) + fmulp %st, %st(1) +#endif + +#ifdef RN + FLD -8 * SIZE(BO) + fmulp %st, %st(1) +#endif + +#ifdef RT + FLD -8 * SIZE(BO) + fmulp %st, %st(1) +#endif + +#ifdef LN + subq $1 * SIZE, CO +#endif + +#if defined(LN) || defined(LT) + fld %st + FST -8 * SIZE(BO) +#else + fld %st + FST -8 * SIZE(AO) +#endif + + FST 0 * SIZE(CO) + +#ifndef LN + addq $1 * SIZE, CO +#endif + +#if defined(LT) || 
defined(RN) + movq K, %rax + subq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L49: +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + leaq (B, %rax, 1), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L30: + movq N, %rax + sarq $1, %rax + movq %rax, J + je .L999 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, %rax + movq %rax, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B +#endif + + lea (, LDC, 2), %rax + +#ifdef RT + subq %rax, C +#endif + movq C, CO +#ifndef RT + addq %rax, C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $1, I + je .L20 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $BASE_SHIFT, %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + fldz + fldz + fldz + fldz + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(CO) + prefetchw 2 * SIZE(CO, LDC, 1) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(CO) + prefetchnta 2 * SIZE(CO, LDC, 1) +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + FLD -6 * SIZE(AO) + + FLD -6 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -5 * SIZE(BO) + fmul %st, %st(2) + + FLD -5 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + PREFETCH (PREFETCHSIZE + 4) * SIZE(AO) + + FLD -4 * SIZE(AO) + + FLD -4 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -3 * SIZE(BO) + fmul %st, %st(2) + + FLD -3 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + FLD -2 * SIZE(AO) + + FLD -2 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -1 * SIZE(BO) + fmul %st, %st(2) + + FLD -1 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + addq $8 * SIZE,AO + addq $8 * SIZE,BO + + decq %rax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + and $3, %rax + je .L18 + ALIGN_4 + +.L16: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(4) + faddp %st, %st(2) + + addq $2 * SIZE,AO + addq $2 * SIZE,BO + + decq %rax + jne .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + salq $BASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, 
%rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(BO) + fsubp %st, %st(1) + FLD -7 * SIZE(BO) + fsubp %st, %st(2) + FLD -6 * SIZE(BO) + fsubp %st, %st(3) + FLD -5 * SIZE(BO) + fsubp %st, %st(4) +#else + FLD -8 * SIZE(AO) + fsubp %st, %st(1) + FLD -7 * SIZE(AO) + fsubp %st, %st(3) + FLD -6 * SIZE(AO) + fsubp %st, %st(2) + FLD -5 * SIZE(AO) + fsubp %st, %st(4) +#endif + +#ifdef LN + FLD -5 * SIZE(AO) + fmul %st, %st(3) + fmulp %st, %st(4) + + FLD -6 * SIZE(AO) + fmul %st(3), %st + FLD -6 * SIZE(AO) + fmul %st(5), %st + + fsubrp %st, %st(3) + fsubrp %st, %st(1) + + FLD -8 * SIZE(AO) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef LT + FLD -8 * SIZE(AO) + fmul %st, %st(1) + fmulp %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st(1), %st + FLD -7 * SIZE(AO) + fmul %st(3), %st + + fsubrp %st, %st(5) + fsubrp %st, %st(3) + + FLD -5 * SIZE(AO) + fmul %st, %st(3) + fmulp %st, %st(4) +#endif + +#ifdef RN + FLD -8 * SIZE(BO) + fmul %st, %st(1) + fmulp %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st(1), %st + FLD -7 * SIZE(BO) + fmul %st(4), %st + + fsubrp %st, %st(5) + fsubrp %st, %st(2) + + FLD -5 * SIZE(BO) + fmul %st, %st(2) + fmulp %st, %st(4) +#endif + +#ifdef RT + FLD -5 * SIZE(BO) + fmul %st, %st(2) + fmulp %st, %st(4) + + FLD -6 * SIZE(BO) + fmul %st(2), %st + FLD -6 * SIZE(BO) + fmul %st(5), %st + + fsubrp %st, %st(4) + fsubrp %st, %st(1) + + FLD -8 * SIZE(BO) + fmul %st, %st(1) + fmulp %st, %st(3) +#endif + +#ifdef LN + subq $2 * SIZE, CO +#endif + +#if defined(LN) || defined(LT) + fld %st + FST -8 * SIZE(BO) + fxch %st(1) + fld %st + FST -7 * SIZE(BO) + fxch %st(2) + fld %st + FST -6 * SIZE(BO) + fxch %st(3) + fld %st + FST -5 * SIZE(BO) + + FST 1 * SIZE(CO, LDC) + FST 0 * SIZE(CO) + FST 0 * SIZE(CO, LDC) + FST 1 * SIZE(CO) +#else + fld %st + FST -8 * SIZE(AO) + fxch %st(2) + fld %st + FST -7 * SIZE(AO) + fxch %st(1) + fld %st + FST -6 * SIZE(AO) + fxch %st(3) + fld %st + FST -5 * SIZE(AO) + + FST 1 * SIZE(CO, LDC) + FST 1 * SIZE(CO) + FST 0 * SIZE(CO) + FST 0 * SIZE(CO, LDC) +#endif + +#ifndef LN + addq $2 * SIZE, CO +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + jne .L11 + ALIGN_4 + +.L20: + movq M, %rax + andq $1, %rax + je .L29 + ALIGN_4 + +.L21: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $BASE_SHIFT, %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + fldz + fldz + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + + FLD -6 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -5 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -6 * SIZE(AO) + + FLD -4 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -3 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -5 * SIZE(AO) + + FLD -2 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -1 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addq $4 * SIZE,AO + addq $8 * 
SIZE,BO + + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + and $3, %rax + je .L28 + ALIGN_4 + +.L26: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addq $1 * SIZE,AO + addq $2 * SIZE,BO + + decq %rax + jne .L26 + ALIGN_4 + +.L28: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + salq $BASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(BO) + fsubp %st, %st(1) + FLD -7 * SIZE(BO) + fsubp %st, %st(2) +#else + FLD -8 * SIZE(AO) + fsubp %st, %st(1) + FLD -7 * SIZE(AO) + fsubp %st, %st(2) +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(AO) + fmul %st, %st(1) + fmulp %st, %st(2) +#endif + +#ifdef RN + FLD -8 * SIZE(BO) + fmulp %st, %st(1) + + FLD -7 * SIZE(BO) + fmul %st(1), %st + + fsubrp %st, %st(2) + + FLD -5 * SIZE(BO) + fmulp %st, %st(2) +#endif + +#ifdef RT + FLD -5 * SIZE(BO) + fmulp %st, %st(2) + + FLD -6 * SIZE(BO) + fmul %st(2), %st + + fsubrp %st, %st(1) + + FLD -8 * SIZE(BO) + fmulp %st, %st(1) +#endif + +#ifdef LN + subq $1 * SIZE, CO +#endif + +#if defined(LN) || defined(LT) + fld %st + FST -8 * SIZE(BO) + fxch %st(1) + fld %st + FST -7 * SIZE(BO) +#else + fld %st + FST -8 * SIZE(AO) + fxch %st(1) + fld %st + FST -7 * SIZE(AO) +#endif + + FST 0 * SIZE(CO, LDC) + FST 0 * SIZE(CO) + +#ifndef LN + addq $1 * SIZE, CO +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L29: +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + + decq J + jne .L01 + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/rot.S b/kernel/x86_64/rot.S new file mode 100644 index 0000000000..05e5aebb31 --- /dev/null +++ b/kernel/x86_64/rot.S @@ -0,0 +1,348 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N ARG1 +#define X ARG2 +#define INCX ARG3 +#define Y ARG4 +#ifndef WINDOWS_ABI +#define INCY ARG5 /* r8 */ +#else +#define INCY %r10 +#endif + +#define I %rax + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + movq 48(%rsp), INCY + FLD 72(%rsp) + FLD 56(%rsp) +#else + FLD 24(%rsp) + FLD 8(%rsp) +#endif + + salq $BASE_SHIFT, INCX + salq $BASE_SHIFT, INCY + + testq N, N + jle .L999 + + cmpq $SIZE, INCX + jne .L50 + cmpq $SIZE, INCY + jne .L50 + + movq N, I + sarq $2, I + jle .L15 + ALIGN_4 + +.L10: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) + FLD 0 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 0 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 1 * SIZE(X) + FLD 1 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 1 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 1 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + FLD 2 * SIZE(X) + FLD 2 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 2 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 2 * SIZE(Y) + + FLD 3 * SIZE(X) + FLD 3 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 3 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 3 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + + decq I + jg .L10 + ALIGN_4 + +.L15: + movq N, I + andq $3, I + jle .L999 + ALIGN_4 + +.L16: + FLD 0 * SIZE(X) + FLD 0 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 0 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 0 * SIZE(Y) + + addq $SIZE, X + addq $SIZE, Y + + decq I + jg .L16 + jmp .L999 + ALIGN_4 + +.L50: + movq N, I + sarq $2, I + jle .L55 + ALIGN_4 + +.L51: + FLD 0 * SIZE(X) + FLD 0 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 0 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 0 * SIZE(Y) + + addq INCX, X + addq INCY, Y + + FLD 0 * SIZE(X) + FLD 0 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 0 * SIZE(X) + + 
fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 0 * SIZE(Y) + + addq INCX, X + addq INCY, Y + + FLD 0 * SIZE(X) + FLD 0 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 0 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 0 * SIZE(Y) + + addq INCX, X + addq INCY, Y + + FLD 0 * SIZE(X) + FLD 0 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 0 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 0 * SIZE(Y) + + addq INCX, X + addq INCY, Y + + decq I + jg .L51 + ALIGN_4 + +.L55: + movq N, I + andq $3, I + jle .L999 + ALIGN_4 + +.L56: + FLD 0 * SIZE(X) + FLD 0 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 0 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 0 * SIZE(Y) + + addq INCX, X + addq INCY, Y + + decq I + jg .L56 + ALIGN_4 + + +.L999: + ffreep %st + ffreep %st + ret + + EPILOGUE diff --git a/kernel/x86_64/rot_sse.S b/kernel/x86_64/rot_sse.S new file mode 100644 index 0000000000..cb7e1b3171 --- /dev/null +++ b/kernel/x86_64/rot_sse.S @@ -0,0 +1,1090 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ +#define Y ARG4 /* rcx */ +#ifndef WINDOWS_ABI +#define INCY ARG5 /* r8 */ +#else +#define INCY %r10 +#endif + +#define C %xmm14 +#define S %xmm15 + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + movq 40(%rsp), INCY + movss 48(%rsp), %xmm0 + movss 56(%rsp), %xmm1 +#endif + + SAVEREGISTERS + + leaq (, INCX, SIZE), INCX + leaq (, INCY, SIZE), INCY + + pshufd $0x0, %xmm0, C + pshufd $0x0, %xmm1, S + + cmpq $0, N + jle .L999 + + cmpq $SIZE, INCX + jne .L50 + cmpq $SIZE, INCY + jne .L50 + + testq $SIZE, X + je .L05 + + movss 0 * SIZE(Y), %xmm1 + movss 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulss C, %xmm0 + mulss S, %xmm1 + + mulss C, %xmm2 + mulss S, %xmm3 + + addss %xmm1, %xmm0 + subss %xmm3, %xmm2 + + movss %xmm0, 0 * SIZE(X) + movss %xmm2, 0 * SIZE(Y) + + addq $1 * SIZE, X + addq $1 * SIZE, Y + decq N + jle .L999 + +.L05: + testq $2 * SIZE, X + je .L10 + + cmpq $1, N + je .L17 + + movsd 0 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + subq $2, N + jle .L999 + ALIGN_2 + +.L10: + testq $3 * SIZE, Y + jne .L20 + + movq N, %rax + sarq $5, %rax + jle .L14 + + movaps 0 * SIZE(Y), %xmm1 + movaps 4 * SIZE(Y), %xmm3 + movaps 8 * SIZE(Y), %xmm9 + movaps 12 * SIZE(Y), %xmm11 + + movaps 0 * SIZE(X), %xmm0 + movaps 4 * SIZE(X), %xmm2 + movaps 8 * SIZE(X), %xmm8 + movaps 12 * SIZE(X), %xmm10 + + decq %rax + jle .L12 + ALIGN_3 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm1, %xmm4 + mulps S, %xmm1 + movaps %xmm3, %xmm6 + mulps S, %xmm3 + movaps %xmm0, %xmm5 + mulps C, %xmm0 + movaps %xmm2, %xmm7 + mulps C, %xmm2 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + movaps 16 * SIZE(Y), %xmm1 + addps %xmm3, %xmm2 + movaps 20 * SIZE(Y), %xmm3 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps %xmm0, 0 * SIZE(X) + movaps 16 * SIZE(X), %xmm0 + movaps %xmm2, 4 * SIZE(X) + movaps 20 * SIZE(X), %xmm2 + movaps %xmm4, 0 * SIZE(Y) + movaps %xmm6, 4 * SIZE(Y) + + movaps %xmm9, %xmm4 + mulps S, %xmm9 + movaps %xmm8, %xmm5 + mulps C, %xmm8 + movaps %xmm11, %xmm6 + mulps S, %xmm11 + movaps %xmm10, %xmm7 + mulps C, %xmm10 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm9, %xmm8 + movaps 24 * SIZE(Y), %xmm9 + addps %xmm11, %xmm10 + movaps 28 * SIZE(Y), %xmm11 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm8, 8 * SIZE(X) + movaps 24 * SIZE(X), %xmm8 + movaps %xmm10,12 * SIZE(X) + movaps 28 * SIZE(X), %xmm10 + movaps %xmm4, 8 * SIZE(Y) + movaps %xmm6, 12 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps %xmm1, %xmm4 + mulps S, %xmm1 + movaps %xmm3, %xmm6 + mulps S, %xmm3 + movaps %xmm0, %xmm5 + mulps C, %xmm0 + movaps %xmm2, %xmm7 + mulps C, %xmm2 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + movaps 32 * SIZE(Y), %xmm1 + addps %xmm3, %xmm2 + movaps 36 * SIZE(Y), %xmm3 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + 
movaps %xmm0, 16 * SIZE(X) + movaps 32 * SIZE(X), %xmm0 + movaps %xmm2, 20 * SIZE(X) + movaps 36 * SIZE(X), %xmm2 + movaps %xmm4, 16 * SIZE(Y) + movaps %xmm6, 20 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps %xmm9, %xmm4 + mulps S, %xmm9 + movaps %xmm8, %xmm5 + mulps C, %xmm8 + movaps %xmm11, %xmm6 + mulps S, %xmm11 + movaps %xmm10, %xmm7 + mulps C, %xmm10 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm9, %xmm8 + movaps 40 * SIZE(Y), %xmm9 + addps %xmm11, %xmm10 + movaps 44 * SIZE(Y), %xmm11 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm8, 24 * SIZE(X) + movaps 40 * SIZE(X), %xmm8 + movaps %xmm10, 28 * SIZE(X) + movaps 44 * SIZE(X), %xmm10 + movaps %xmm4, 24 * SIZE(Y) + movaps %xmm6, 28 * SIZE(Y) + + addq $32 * SIZE, X + addq $32 * SIZE, Y + + decq %rax + jg .L11 + ALIGN_3 + +.L12: + movaps %xmm1, %xmm4 + mulps S, %xmm1 + movaps %xmm3, %xmm6 + mulps S, %xmm3 + movaps %xmm0, %xmm5 + mulps C, %xmm0 + movaps %xmm2, %xmm7 + mulps C, %xmm2 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + movaps 16 * SIZE(Y), %xmm1 + addps %xmm3, %xmm2 + movaps 20 * SIZE(Y), %xmm3 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm0, 0 * SIZE(X) + movaps 16 * SIZE(X), %xmm0 + movaps %xmm2, 4 * SIZE(X) + movaps 20 * SIZE(X), %xmm2 + + movaps %xmm4, 0 * SIZE(Y) + movaps %xmm6, 4 * SIZE(Y) + + movaps %xmm9, %xmm4 + mulps S, %xmm9 + movaps %xmm8, %xmm5 + mulps C, %xmm8 + movaps %xmm11, %xmm6 + mulps S, %xmm11 + movaps %xmm10, %xmm7 + mulps C, %xmm10 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm9, %xmm8 + movaps 24 * SIZE(Y), %xmm9 + addps %xmm11, %xmm10 + movaps 28 * SIZE(Y), %xmm11 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm8, 8 * SIZE(X) + movaps 24 * SIZE(X), %xmm8 + movaps %xmm10,12 * SIZE(X) + movaps 28 * SIZE(X), %xmm10 + movaps %xmm4, 8 * SIZE(Y) + movaps %xmm6, 12 * SIZE(Y) + + movaps %xmm1, %xmm4 + mulps S, %xmm1 + movaps %xmm3, %xmm6 + mulps S, %xmm3 + movaps %xmm0, %xmm5 + mulps C, %xmm0 + movaps %xmm2, %xmm7 + mulps C, %xmm2 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm0, 16 * SIZE(X) + movaps %xmm2, 20 * SIZE(X) + movaps %xmm4, 16 * SIZE(Y) + movaps %xmm6, 20 * SIZE(Y) + + movaps %xmm9, %xmm4 + mulps S, %xmm9 + movaps %xmm8, %xmm5 + mulps C, %xmm8 + movaps %xmm11, %xmm6 + mulps S, %xmm11 + movaps %xmm10, %xmm7 + mulps C, %xmm10 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm9, %xmm8 + addps %xmm11, %xmm10 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm8, 24 * SIZE(X) + movaps %xmm10, 28 * SIZE(X) + movaps %xmm4, 24 * SIZE(Y) + movaps %xmm6, 28 * SIZE(Y) + + addq $32 * SIZE, X + addq $32 * SIZE, Y + ALIGN_3 + +.L14: + testq $31, N + jle .L999 + + testq $16, N + jle .L15 + + movaps 0 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + movaps 4 * SIZE(Y), %xmm3 + movaps 4 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm3, %xmm6 + movaps %xmm2, %xmm7 + + mulps C, %xmm0 + mulps S, %xmm1 + mulps C, %xmm2 + mulps S, %xmm3 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm0, 0 * SIZE(X) + movaps %xmm2, 4 * SIZE(X) + movaps %xmm4, 0 * SIZE(Y) + movaps %xmm6, 4 * SIZE(Y) + + movaps 8 * SIZE(Y), %xmm1 + 
movaps 8 * SIZE(X), %xmm0 + movaps 12 * SIZE(Y), %xmm3 + movaps 12 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm3, %xmm6 + movaps %xmm2, %xmm7 + + mulps C, %xmm0 + mulps S, %xmm1 + mulps C, %xmm2 + mulps S, %xmm3 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm0, 8 * SIZE(X) + movaps %xmm2, 12 * SIZE(X) + movaps %xmm4, 8 * SIZE(Y) + movaps %xmm6, 12 * SIZE(Y) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L15: + testq $8, N + jle .L16 + + movaps 0 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + movaps 4 * SIZE(Y), %xmm3 + movaps 4 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm3, %xmm6 + movaps %xmm2, %xmm7 + + mulps C, %xmm0 + mulps S, %xmm1 + mulps C, %xmm2 + mulps S, %xmm3 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm0, 0 * SIZE(X) + movaps %xmm2, 4 * SIZE(X) + movaps %xmm4, 0 * SIZE(Y) + movaps %xmm6, 4 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L16: + testq $4, N + jle .L17 + + movaps 0 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 0 * SIZE(X) + movaps %xmm2, 0 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L17: + testq $2, N + jle .L18 + + movsd 0 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L18: + testq $1, N + jle .L999 + + movss 0 * SIZE(Y), %xmm1 + movss 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulss C, %xmm0 + mulss S, %xmm1 + + mulss C, %xmm2 + mulss S, %xmm3 + + addss %xmm1, %xmm0 + subss %xmm3, %xmm2 + + movss %xmm0, 0 * SIZE(X) + movss %xmm2, 0 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L20: + movq N, %rax + sarq $5, %rax + jle .L24 + ALIGN_3 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movsd 4 * SIZE(Y), %xmm3 + movhps 6 * SIZE(Y), %xmm3 + movaps 0 * SIZE(X), %xmm0 + movaps 4 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm3, %xmm6 + movaps %xmm2, %xmm7 + + mulps C, %xmm0 + mulps S, %xmm1 + mulps C, %xmm2 + mulps S, %xmm3 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm0, 0 * SIZE(X) + movaps %xmm2, 4 * SIZE(X) + movlps %xmm4, 0 * SIZE(Y) + movhps %xmm4, 2 * SIZE(Y) + movlps %xmm6, 4 * SIZE(Y) + movhps %xmm6, 6 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movsd 8 * SIZE(Y), %xmm1 + movhps 10 * SIZE(Y), %xmm1 + movsd 12 * SIZE(Y), %xmm3 + movhps 14 * SIZE(Y), %xmm3 + movaps 8 * SIZE(X), %xmm0 + movaps 12 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm3, %xmm6 + movaps %xmm2, %xmm7 + + mulps C, %xmm0 + mulps S, %xmm1 + mulps C, %xmm2 + mulps S, %xmm3 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + 
addps %xmm3, %xmm2 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm0, 8 * SIZE(X) + movaps %xmm2, 12 * SIZE(X) + movlps %xmm4, 8 * SIZE(Y) + movhps %xmm4, 10 * SIZE(Y) + movlps %xmm6, 12 * SIZE(Y) + movhps %xmm6, 14 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movsd 16 * SIZE(Y), %xmm1 + movhps 18 * SIZE(Y), %xmm1 + movsd 20 * SIZE(Y), %xmm3 + movhps 22 * SIZE(Y), %xmm3 + movaps 16 * SIZE(X), %xmm0 + movaps 20 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm3, %xmm6 + movaps %xmm2, %xmm7 + + mulps C, %xmm0 + mulps S, %xmm1 + mulps C, %xmm2 + mulps S, %xmm3 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm0, 16 * SIZE(X) + movaps %xmm2, 20 * SIZE(X) + movlps %xmm4, 16 * SIZE(Y) + movhps %xmm4, 18 * SIZE(Y) + movlps %xmm6, 20 * SIZE(Y) + movhps %xmm6, 22 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movsd 24 * SIZE(Y), %xmm1 + movhps 26 * SIZE(Y), %xmm1 + movsd 28 * SIZE(Y), %xmm3 + movhps 30 * SIZE(Y), %xmm3 + movaps 24 * SIZE(X), %xmm0 + movaps 28 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm3, %xmm6 + movaps %xmm2, %xmm7 + + mulps C, %xmm0 + mulps S, %xmm1 + mulps C, %xmm2 + mulps S, %xmm3 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm0, 24 * SIZE(X) + movaps %xmm2, 28 * SIZE(X) + movlps %xmm4, 24 * SIZE(Y) + movhps %xmm4, 26 * SIZE(Y) + movlps %xmm6, 28 * SIZE(Y) + movhps %xmm6, 30 * SIZE(Y) + + addq $32 * SIZE, X + addq $32 * SIZE, Y + decq %rax + jg .L21 + ALIGN_3 + +.L24: + testq $31, N + jle .L999 + + testq $16, N + jle .L25 + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movsd 4 * SIZE(Y), %xmm3 + movhps 6 * SIZE(Y), %xmm3 + movaps 0 * SIZE(X), %xmm0 + movaps 4 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm3, %xmm6 + movaps %xmm2, %xmm7 + + mulps C, %xmm0 + mulps S, %xmm1 + mulps C, %xmm2 + mulps S, %xmm3 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm0, 0 * SIZE(X) + movaps %xmm2, 4 * SIZE(X) + movlps %xmm4, 0 * SIZE(Y) + movhps %xmm4, 2 * SIZE(Y) + movlps %xmm6, 4 * SIZE(Y) + movhps %xmm6, 6 * SIZE(Y) + + movsd 8 * SIZE(Y), %xmm1 + movhps 10 * SIZE(Y), %xmm1 + movsd 12 * SIZE(Y), %xmm3 + movhps 14 * SIZE(Y), %xmm3 + movaps 8 * SIZE(X), %xmm0 + movaps 12 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm3, %xmm6 + movaps %xmm2, %xmm7 + + mulps C, %xmm0 + mulps S, %xmm1 + mulps C, %xmm2 + mulps S, %xmm3 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm0, 8 * SIZE(X) + movaps %xmm2, 12 * SIZE(X) + movlps %xmm4, 8 * SIZE(Y) + movhps %xmm4, 10 * SIZE(Y) + movlps %xmm6, 12 * SIZE(Y) + movhps %xmm6, 14 * SIZE(Y) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L25: + testq $8, N + jle .L26 + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movsd 4 * SIZE(Y), %xmm3 + movhps 6 * SIZE(Y), %xmm3 + movaps 0 * SIZE(X), %xmm0 + movaps 4 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm3, %xmm6 + movaps %xmm2, %xmm7 + + mulps C, %xmm0 + mulps S, %xmm1 
+ mulps C, %xmm2 + mulps S, %xmm3 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm0, 0 * SIZE(X) + movaps %xmm2, 4 * SIZE(X) + movlps %xmm4, 0 * SIZE(Y) + movhps %xmm4, 2 * SIZE(Y) + movlps %xmm6, 4 * SIZE(Y) + movhps %xmm6, 6 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + + +.L26: + testq $4, N + jle .L27 + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + movhps %xmm2, 2 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L27: + testq $2, N + jle .L28 + + movsd 0 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L28: + testq $1, N + jle .L999 + + movss 0 * SIZE(Y), %xmm1 + movss 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulss C, %xmm0 + mulss S, %xmm1 + + mulss C, %xmm2 + mulss S, %xmm3 + + addss %xmm1, %xmm0 + subss %xmm3, %xmm2 + + movss %xmm0, 0 * SIZE(X) + movss %xmm2, 0 * SIZE(Y) + jmp .L999 + ALIGN_3 + + +.L50: + movq N, %rax + sarq $2, %rax + jle .L55 + ALIGN_3 + +.L53: + movss (Y), %xmm1 + movss (X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulss C, %xmm0 + mulss S, %xmm1 + + mulss C, %xmm2 + mulss S, %xmm3 + + addss %xmm1, %xmm0 + subss %xmm3, %xmm2 + + movss %xmm0, (X) + movss %xmm2, (Y) + + addq INCX, X + addq INCY, Y + + movss (Y), %xmm1 + movss (X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulss C, %xmm0 + mulss S, %xmm1 + + mulss C, %xmm2 + mulss S, %xmm3 + + addss %xmm1, %xmm0 + subss %xmm3, %xmm2 + + movss %xmm0, (X) + movss %xmm2, (Y) + + addq INCX, X + addq INCY, Y + + movss (Y), %xmm1 + movss (X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulss C, %xmm0 + mulss S, %xmm1 + + mulss C, %xmm2 + mulss S, %xmm3 + + addss %xmm1, %xmm0 + subss %xmm3, %xmm2 + + movss %xmm0, (X) + movss %xmm2, (Y) + + addq INCX, X + addq INCY, Y + + movss (Y), %xmm1 + movss (X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulss C, %xmm0 + mulss S, %xmm1 + + mulss C, %xmm2 + mulss S, %xmm3 + + addss %xmm1, %xmm0 + subss %xmm3, %xmm2 + + movss %xmm0, (X) + movss %xmm2, (Y) + + addq INCX, X + addq INCY, Y + + decq %rax + jg .L53 + ALIGN_3 + +.L55: + movq N, %rax + andq $3, %rax + jle .L999 + ALIGN_3 + +.L56: + movss (Y), %xmm1 + movss (X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulss C, %xmm0 + mulss S, %xmm1 + + mulss C, %xmm2 + mulss S, %xmm3 + + addss %xmm1, %xmm0 + subss %xmm3, %xmm2 + + movss %xmm0, (X) + movss %xmm2, (Y) + + addq INCX, X + addq INCY, Y + + decq %rax + jg .L56 + ALIGN_3 + +.L999: + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/rot_sse2.S b/kernel/x86_64/rot_sse2.S new file mode 100644 index 0000000000..5055547073 --- /dev/null +++ b/kernel/x86_64/rot_sse2.S @@ -0,0 +1,986 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ +#define Y ARG4 /* rcx */ +#ifndef WINDOWS_ABI +#define INCY ARG5 /* r8 */ +#else +#define INCY %r10 +#endif + +#define C %xmm14 +#define S %xmm15 + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + movq 40(%rsp), INCY + movsd 48(%rsp), %xmm0 + movsd 56(%rsp), %xmm1 +#endif + + SAVEREGISTERS + + leaq (, INCX, SIZE), INCX + leaq (, INCY, SIZE), INCY + + pshufd $0x44, %xmm0, C + pshufd $0x44, %xmm1, S + + cmpq $0, N + jle .L999 + + cmpq $SIZE, INCX + jne .L50 + cmpq $SIZE, INCY + jne .L50 + + testq $SIZE, X + je .L10 + + movsd 0 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulsd C, %xmm0 + mulsd S, %xmm1 + + mulsd C, %xmm2 + mulsd S, %xmm3 + + addsd %xmm1, %xmm0 + subsd %xmm3, %xmm2 + + movsd %xmm0, 0 * SIZE(X) + movsd %xmm2, 0 * SIZE(Y) + + addq $1 * SIZE, X + addq $1 * SIZE, Y + decq N + jle .L999 + ALIGN_2 + +.L10: + testq $SIZE, Y + jne .L20 + + movq N, %rax + sarq $4, %rax + jle .L14 + + movaps 0 * SIZE(Y), %xmm1 + movaps 2 * SIZE(Y), %xmm3 + movaps 4 * SIZE(Y), %xmm9 + movaps 6 * SIZE(Y), %xmm11 + + movaps 0 * SIZE(X), %xmm0 + movaps 2 * SIZE(X), %xmm2 + movaps 4 * SIZE(X), %xmm8 + movaps 6 * SIZE(X), %xmm10 + + decq %rax + jle .L12 + ALIGN_3 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm1, %xmm4 + mulpd S, %xmm1 + movaps %xmm3, %xmm6 + mulpd S, %xmm3 + movaps %xmm0, %xmm5 + mulpd C, %xmm0 + movaps %xmm2, %xmm7 + mulpd C, %xmm2 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + movaps 8 * SIZE(Y), %xmm1 + addpd %xmm3, %xmm2 + movaps 10 * SIZE(Y), %xmm3 + subpd 
%xmm5, %xmm4 + subpd %xmm7, %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps %xmm0, 0 * SIZE(X) + movaps 8 * SIZE(X), %xmm0 + movaps %xmm2, 2 * SIZE(X) + movaps 10 * SIZE(X), %xmm2 + movaps %xmm4, 0 * SIZE(Y) + movaps %xmm6, 2 * SIZE(Y) + + movaps %xmm9, %xmm4 + mulpd S, %xmm9 + movaps %xmm8, %xmm5 + mulpd C, %xmm8 + movaps %xmm11, %xmm6 + mulpd S, %xmm11 + movaps %xmm10, %xmm7 + mulpd C, %xmm10 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm9, %xmm8 + movaps 12 * SIZE(Y), %xmm9 + addpd %xmm11, %xmm10 + movaps 14 * SIZE(Y), %xmm11 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movaps %xmm8, 4 * SIZE(X) + movaps 12 * SIZE(X), %xmm8 + movaps %xmm10,6 * SIZE(X) + movaps 14 * SIZE(X), %xmm10 + movaps %xmm4, 4 * SIZE(Y) + movaps %xmm6, 6 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps %xmm1, %xmm4 + mulpd S, %xmm1 + movaps %xmm3, %xmm6 + mulpd S, %xmm3 + movaps %xmm0, %xmm5 + mulpd C, %xmm0 + movaps %xmm2, %xmm7 + mulpd C, %xmm2 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + movaps 16 * SIZE(Y), %xmm1 + addpd %xmm3, %xmm2 + movaps 18 * SIZE(Y), %xmm3 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movaps %xmm0, 8 * SIZE(X) + movaps 16 * SIZE(X), %xmm0 + movaps %xmm2, 10 * SIZE(X) + movaps 18 * SIZE(X), %xmm2 + movaps %xmm4, 8 * SIZE(Y) + movaps %xmm6, 10 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps %xmm9, %xmm4 + mulpd S, %xmm9 + movaps %xmm8, %xmm5 + mulpd C, %xmm8 + movaps %xmm11, %xmm6 + mulpd S, %xmm11 + movaps %xmm10, %xmm7 + mulpd C, %xmm10 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm9, %xmm8 + movaps 20 * SIZE(Y), %xmm9 + addpd %xmm11, %xmm10 + movaps 22 * SIZE(Y), %xmm11 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movaps %xmm8, 12 * SIZE(X) + movaps 20 * SIZE(X), %xmm8 + movaps %xmm10, 14 * SIZE(X) + movaps 22 * SIZE(X), %xmm10 + movaps %xmm4, 12 * SIZE(Y) + movaps %xmm6, 14 * SIZE(Y) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + + decq %rax + jg .L11 + ALIGN_3 + +.L12: + movaps %xmm1, %xmm4 + mulpd S, %xmm1 + movaps %xmm3, %xmm6 + mulpd S, %xmm3 + movaps %xmm0, %xmm5 + mulpd C, %xmm0 + movaps %xmm2, %xmm7 + mulpd C, %xmm2 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + movaps 8 * SIZE(Y), %xmm1 + addpd %xmm3, %xmm2 + movaps 10 * SIZE(Y), %xmm3 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movaps %xmm0, 0 * SIZE(X) + movaps 8 * SIZE(X), %xmm0 + movaps %xmm2, 2 * SIZE(X) + movaps 10 * SIZE(X), %xmm2 + + movaps %xmm4, 0 * SIZE(Y) + movaps %xmm6, 2 * SIZE(Y) + + movaps %xmm9, %xmm4 + mulpd S, %xmm9 + movaps %xmm8, %xmm5 + mulpd C, %xmm8 + movaps %xmm11, %xmm6 + mulpd S, %xmm11 + movaps %xmm10, %xmm7 + mulpd C, %xmm10 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm9, %xmm8 + movaps 12 * SIZE(Y), %xmm9 + addpd %xmm11, %xmm10 + movaps 14 * SIZE(Y), %xmm11 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movaps %xmm8, 4 * SIZE(X) + movaps 12 * SIZE(X), %xmm8 + movaps %xmm10,6 * SIZE(X) + movaps 14 * SIZE(X), %xmm10 + movaps %xmm4, 4 * SIZE(Y) + movaps %xmm6, 6 * SIZE(Y) + + movaps %xmm1, %xmm4 + mulpd S, %xmm1 + movaps %xmm3, %xmm6 + mulpd S, %xmm3 + movaps %xmm0, %xmm5 + mulpd C, %xmm0 + movaps %xmm2, %xmm7 + mulpd C, %xmm2 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + subpd %xmm5, 
%xmm4 + subpd %xmm7, %xmm6 + + movaps %xmm0, 8 * SIZE(X) + movaps %xmm2, 10 * SIZE(X) + movaps %xmm4, 8 * SIZE(Y) + movaps %xmm6, 10 * SIZE(Y) + + movaps %xmm9, %xmm4 + mulpd S, %xmm9 + movaps %xmm8, %xmm5 + mulpd C, %xmm8 + movaps %xmm11, %xmm6 + mulpd S, %xmm11 + movaps %xmm10, %xmm7 + mulpd C, %xmm10 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movaps %xmm8, 12 * SIZE(X) + movaps %xmm10, 14 * SIZE(X) + movaps %xmm4, 12 * SIZE(Y) + movaps %xmm6, 14 * SIZE(Y) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + + +.L14: + testq $15, N + jle .L999 + + testq $8, N + jle .L15 + + movaps 0 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + movaps 2 * SIZE(Y), %xmm3 + movaps 2 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm3, %xmm6 + movaps %xmm2, %xmm7 + + mulpd C, %xmm0 + mulpd S, %xmm1 + mulpd C, %xmm2 + mulpd S, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movaps %xmm0, 0 * SIZE(X) + movaps %xmm2, 2 * SIZE(X) + movaps %xmm4, 0 * SIZE(Y) + movaps %xmm6, 2 * SIZE(Y) + + movaps 4 * SIZE(Y), %xmm1 + movaps 4 * SIZE(X), %xmm0 + movaps 6 * SIZE(Y), %xmm3 + movaps 6 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm3, %xmm6 + movaps %xmm2, %xmm7 + + mulpd C, %xmm0 + mulpd S, %xmm1 + mulpd C, %xmm2 + mulpd S, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movaps %xmm0, 4 * SIZE(X) + movaps %xmm2, 6 * SIZE(X) + movaps %xmm4, 4 * SIZE(Y) + movaps %xmm6, 6 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L15: + testq $4, N + jle .L16 + + movaps 0 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + movaps 2 * SIZE(Y), %xmm3 + movaps 2 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm3, %xmm6 + movaps %xmm2, %xmm7 + + mulpd C, %xmm0 + mulpd S, %xmm1 + mulpd C, %xmm2 + mulpd S, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movaps %xmm0, 0 * SIZE(X) + movaps %xmm2, 2 * SIZE(X) + movaps %xmm4, 0 * SIZE(Y) + movaps %xmm6, 2 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L16: + testq $2, N + jle .L17 + + movaps 0 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movaps %xmm0, 0 * SIZE(X) + movaps %xmm2, 0 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L17: + testq $1, N + jle .L999 + + movsd 0 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulsd C, %xmm0 + mulsd S, %xmm1 + + mulsd C, %xmm2 + mulsd S, %xmm3 + + addsd %xmm1, %xmm0 + subsd %xmm3, %xmm2 + + movsd %xmm0, 0 * SIZE(X) + movsd %xmm2, 0 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L20: + movaps -1 * SIZE(Y), %xmm1 + + movq N, %rax + sarq $4, %rax + jle .L24 + ALIGN_3 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps 1 * SIZE(Y), %xmm3 + movaps 3 * SIZE(Y), %xmm8 + movaps 0 * SIZE(X), %xmm0 + movaps 2 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm3, %xmm1 + SHUFPD_1 %xmm8, %xmm3 + + movaps %xmm1, %xmm4 + movaps %xmm0, %xmm5 + 
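/* [editor's reference note -- not part of the imported GotoBLAS2 file]
   This kernel applies the BLAS plane rotation to two double-precision
   vectors:  x[i] = c*x[i] + s*y[i];  y[i] = c*y[i] - s*x[i].  In this
   .L21 path Y is 8 bytes off 16-byte alignment, so each pair of y values
   is reassembled from two aligned loads with SHUFPD above before the
   mulpd/addpd/subpd block below applies the rotation.  A minimal C sketch
   of the same update, assuming unit strides and no unrolling (rot_ref is
   an illustrative name, not a symbol exported by this library):

       static void rot_ref(long n, double *x, double *y, double c, double s)
       {
           for (long i = 0; i < n; i++) {
               double xi = x[i], yi = y[i];  // read both before writing
               x[i] = c * xi + s * yi;
               y[i] = c * yi - s * xi;
           }
       }
*/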
movaps %xmm3, %xmm6 + movaps %xmm2, %xmm7 + + mulpd C, %xmm0 + mulpd S, %xmm1 + mulpd C, %xmm2 + mulpd S, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movaps %xmm0, 0 * SIZE(X) + movaps %xmm2, 2 * SIZE(X) + + movlpd %xmm4, 0 * SIZE(Y) + movhps %xmm4, 1 * SIZE(Y) + movlpd %xmm6, 2 * SIZE(Y) + movhps %xmm6, 3 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps 5 * SIZE(Y), %xmm9 + movaps 7 * SIZE(Y), %xmm1 + movaps 4 * SIZE(X), %xmm0 + movaps 6 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm9, %xmm8 + SHUFPD_1 %xmm1, %xmm9 + + movaps %xmm8, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm9, %xmm6 + movaps %xmm2, %xmm7 + + mulpd C, %xmm0 + mulpd S, %xmm8 + mulpd C, %xmm2 + mulpd S, %xmm9 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movaps %xmm0, 4 * SIZE(X) + movaps %xmm2, 6 * SIZE(X) + movlpd %xmm4, 4 * SIZE(Y) + movhps %xmm4, 5 * SIZE(Y) + movlpd %xmm6, 6 * SIZE(Y) + movhps %xmm6, 7 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps 9 * SIZE(Y), %xmm3 + movaps 11 * SIZE(Y), %xmm8 + movaps 8 * SIZE(X), %xmm0 + movaps 10 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm3, %xmm1 + SHUFPD_1 %xmm8, %xmm3 + + movaps %xmm1, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm3, %xmm6 + movaps %xmm2, %xmm7 + + mulpd C, %xmm0 + mulpd S, %xmm1 + mulpd C, %xmm2 + mulpd S, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movaps %xmm0, 8 * SIZE(X) + movaps %xmm2, 10 * SIZE(X) + movlpd %xmm4, 8 * SIZE(Y) + movhps %xmm4, 9 * SIZE(Y) + movlpd %xmm6, 10 * SIZE(Y) + movhps %xmm6, 11 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps 13 * SIZE(Y), %xmm9 + movaps 15 * SIZE(Y), %xmm1 + movaps 12 * SIZE(X), %xmm0 + movaps 14 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm9, %xmm8 + SHUFPD_1 %xmm1, %xmm9 + + movaps %xmm8, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm9, %xmm6 + movaps %xmm2, %xmm7 + + mulpd C, %xmm0 + mulpd S, %xmm8 + mulpd C, %xmm2 + mulpd S, %xmm9 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movaps %xmm0, 12 * SIZE(X) + movaps %xmm2, 14 * SIZE(X) + movlpd %xmm4, 12 * SIZE(Y) + movhps %xmm4, 13 * SIZE(Y) + movlpd %xmm6, 14 * SIZE(Y) + movhps %xmm6, 15 * SIZE(Y) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + decq %rax + jg .L21 + ALIGN_3 + +.L24: + testq $15, N + jle .L999 + + testq $8, N + jle .L25 + + movaps 1 * SIZE(Y), %xmm3 + movaps 3 * SIZE(Y), %xmm8 + movaps 0 * SIZE(X), %xmm0 + movaps 2 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm3, %xmm1 + SHUFPD_1 %xmm8, %xmm3 + + movaps %xmm1, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm3, %xmm6 + movaps %xmm2, %xmm7 + + mulpd C, %xmm0 + mulpd S, %xmm1 + mulpd C, %xmm2 + mulpd S, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movaps %xmm0, 0 * SIZE(X) + movaps %xmm2, 2 * SIZE(X) + movlpd %xmm4, 0 * SIZE(Y) + movhps %xmm4, 1 * SIZE(Y) + movlpd %xmm6, 2 * SIZE(Y) + movhps %xmm6, 3 * SIZE(Y) + + movaps 5 * SIZE(Y), %xmm9 + movaps 7 * SIZE(Y), %xmm1 + movaps 4 * SIZE(X), %xmm0 + movaps 6 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm9, 
%xmm8 + SHUFPD_1 %xmm1, %xmm9 + + movaps %xmm8, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm9, %xmm6 + movaps %xmm2, %xmm7 + + mulpd C, %xmm0 + mulpd S, %xmm8 + mulpd C, %xmm2 + mulpd S, %xmm9 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movaps %xmm0, 4 * SIZE(X) + movaps %xmm2, 6 * SIZE(X) + movlpd %xmm4, 4 * SIZE(Y) + movhps %xmm4, 5 * SIZE(Y) + movlpd %xmm6, 6 * SIZE(Y) + movhps %xmm6, 7 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L25: + testq $4, N + jle .L26 + + movaps 1 * SIZE(Y), %xmm3 + movaps 3 * SIZE(Y), %xmm8 + movaps 0 * SIZE(X), %xmm0 + movaps 2 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm3, %xmm1 + SHUFPD_1 %xmm8, %xmm3 + + movaps %xmm1, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm3, %xmm6 + movaps %xmm2, %xmm7 + + mulpd C, %xmm0 + mulpd S, %xmm1 + mulpd C, %xmm2 + mulpd S, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movaps %xmm0, 0 * SIZE(X) + movaps %xmm2, 2 * SIZE(X) + movlpd %xmm4, 0 * SIZE(Y) + movhps %xmm4, 1 * SIZE(Y) + movlpd %xmm6, 2 * SIZE(Y) + movhps %xmm6, 3 * SIZE(Y) + movaps %xmm8, %xmm1 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L26: + testq $2, N + jle .L27 + + movaps 1 * SIZE(Y), %xmm4 + movaps 0 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm4, %xmm1 + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movaps %xmm0, 0 * SIZE(X) + movlpd %xmm2, 0 * SIZE(Y) + movhps %xmm2, 1 * SIZE(Y) + movaps %xmm4, %xmm1 + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L27: + testq $1, N + jle .L999 + + unpckhpd %xmm1, %xmm1 + movsd 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulsd C, %xmm0 + mulsd S, %xmm1 + + mulsd C, %xmm2 + mulsd S, %xmm3 + + addsd %xmm1, %xmm0 + subsd %xmm3, %xmm2 + + movsd %xmm0, 0 * SIZE(X) + movsd %xmm2, 0 * SIZE(Y) + jmp .L999 + ALIGN_3 + + +.L50: + movq N, %rax + sarq $2, %rax + jle .L55 + ALIGN_3 + +.L53: + movsd (Y), %xmm1 + movhps (Y, INCY), %xmm1 + movsd (X), %xmm0 + movhps (X, INCX), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movlpd %xmm0, (X) + movhps %xmm0, (X, INCX) + movlpd %xmm2, (Y) + movhps %xmm2, (Y, INCY) + + leaq (X, INCX, 2), X + leaq (Y, INCY, 2), Y + + movsd (Y), %xmm1 + movhps (Y, INCY), %xmm1 + movsd (X), %xmm0 + movhps (X, INCX), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movlpd %xmm0, (X) + movhps %xmm0, (X, INCX) + movlpd %xmm2, (Y) + movhps %xmm2, (Y, INCY) + + leaq (X, INCX, 2), X + leaq (Y, INCY, 2), Y + + decq %rax + jg .L53 + ALIGN_3 + +.L55: + movq N, %rax + andq $3, %rax + jle .L999 + ALIGN_3 + +.L56: + movsd (Y), %xmm1 + movsd (X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulsd C, %xmm0 + mulsd S, %xmm1 + + mulsd C, %xmm2 + mulsd S, %xmm3 + + addsd %xmm1, %xmm0 + subsd %xmm3, %xmm2 + + movsd %xmm0, (X) + movsd %xmm2, (Y) + + addq INCX, X + addq INCY, Y + + decq %rax + jg .L56 + ALIGN_3 + +.L999: + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/scal.S b/kernel/x86_64/scal.S new file mode 100644 index 0000000000..1f8e4d4447 --- /dev/null +++ 
b/kernel/x86_64/scal.S @@ -0,0 +1,302 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 +#define X ARG4 +#define INCX ARG5 + +#define I %rax + +#include "l1param.h" + + PROLOGUE + PROFCODE + + FLD 8(%rsp) + + ftst + fnstsw %ax + andb $68, %ah + je .L300 + +/* Alpha == ZERO */ + cmpq $1, INCX + jne .L104 + + movq M, I + sarq $3, I + jle .L102 + ALIGN_4 + +.L101: + fld %st + FST 0 * SIZE(X) + fld %st + FST 1 * SIZE(X) + fld %st + FST 2 * SIZE(X) + fld %st + FST 3 * SIZE(X) + fld %st + FST 4 * SIZE(X) + fld %st + FST 5 * SIZE(X) + fld %st + FST 6 * SIZE(X) + fld %st + FST 7 * SIZE(X) + + addq $8 * SIZE, X + decq I + jg .L101 + ALIGN_4 + +.L102: + movq M, I + andq $7, I + jle .L999 + ALIGN_4 + +.L103: + fld %st + FST 0 * SIZE(X) + + addq $SIZE, X + decq I + jg .L103 + jmp .L999 + ALIGN_4 + +.L104: + salq $BASE_SHIFT, INCX + + movq M, I + sarq $3, I + jle .L106 + ALIGN_4 + +.L105: + fld %st + FST 0 * SIZE(X) + addq INCX, X + fld %st + FST 0 * SIZE(X) + addq INCX, X + fld %st + FST 0 * SIZE(X) + addq INCX, X + fld %st + FST 0 * SIZE(X) + addq INCX, X + fld %st + FST 0 * SIZE(X) + addq INCX, X + fld %st + FST 0 * SIZE(X) + addq INCX, X + fld %st + FST 0 * SIZE(X) + addq INCX, X + fld %st + FST 0 * SIZE(X) + addq INCX, X + + decq I + jg .L105 + ALIGN_4 + +.L106: + movq M, I + andq $7, I + jle .L999 + ALIGN_4 + +.L107: + fld %st + FST 0 * SIZE(X) + addq INCX, X + decq I + jg .L107 + jmp .L999 + ALIGN_4 + +/* Alpha != ZERO */ + +.L300: + cmpq $1,INCX + jne .L304 + + movq M, I + sarq $3, I + jle .L302 + ALIGN_4 + +.L301: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) + fmul %st(1), %st + FST 0 * SIZE(X) + + FLD 1 * SIZE(X) + fmul %st(1), %st + FST 1 * SIZE(X) + + FLD 2 * SIZE(X) + fmul %st(1), %st + FST 2 * SIZE(X) + + FLD 3 * SIZE(X) + fmul %st(1), %st + FST 3 * SIZE(X) + + FLD 4 * SIZE(X) + fmul %st(1), %st + FST 4 * SIZE(X) + + FLD 5 * SIZE(X) + fmul %st(1), %st + FST 5 * SIZE(X) + + FLD 6 * SIZE(X) + fmul %st(1), %st + FST 6 * SIZE(X) + + FLD 7 * SIZE(X) + fmul %st(1), %st + FST 7 * SIZE(X) + + addq $8 * SIZE, X + decq I + jg .L301 + ALIGN_4 + +.L302: + movq M, I + andq $7, I + jle .L999 + ALIGN_4 + +.L303: + FLD 0 * SIZE(X) + fmul %st(1), %st + FST 0 * SIZE(X) + addq $SIZE, X + decq I + jg .L303 + jmp .L999 + ALIGN_4 + +.L304: + salq $BASE_SHIFT, INCX + + movq M, I + sarq $3, I + jle .L306 + ALIGN_4 + +.L305: + FLD 0 * SIZE(X) + fmul %st(1), %st + FST 0 * SIZE(X) + addq INCX, X + + FLD 0 * SIZE(X) + fmul %st(1), %st + FST 0 * SIZE(X) + addq INCX, X + + FLD 0 * SIZE(X) + fmul %st(1), %st + FST 0 * SIZE(X) + addq INCX, X + + FLD 0 * SIZE(X) + fmul %st(1), %st + FST 0 * SIZE(X) + addq INCX, X + + FLD 0 * SIZE(X) + fmul %st(1), %st + FST 0 * SIZE(X) + addq INCX, X + + FLD 0 * SIZE(X) + fmul %st(1), %st + FST 0 * SIZE(X) + addq INCX, X + + FLD 0 * SIZE(X) + fmul %st(1), %st + FST 0 * SIZE(X) + addq INCX, X + + FLD 0 * SIZE(X) + fmul %st(1), %st + FST 0 * SIZE(X) + addq INCX, X + + decq I + jg .L305 + ALIGN_4 + +.L306: + movq M, I + andq $7, I + jle .L999 + ALIGN_4 + +.L307: + FLD 0 * SIZE(X) + fmul %st(1), %st + FST 0 * SIZE(X) + addq INCX, X + decq I + jg .L307 + ALIGN_4 + +.L999: + ffreep %st(0) + ret + + EPILOGUE diff --git a/kernel/x86_64/scal_atom.S b/kernel/x86_64/scal_atom.S new file mode 100644 index 0000000000..ecc687c028 --- /dev/null +++ b/kernel/x86_64/scal_atom.S @@ -0,0 +1,446 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of 
Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef WINDOWS_ABI +#define M ARG1 +#define X ARG4 +#define INCX ARG5 +#else +#define M ARG1 +#define X ARG2 +#define INCX ARG3 +#endif + +#define XX %r10 +#define I %rax + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + movq 40(%rsp), X + movq 48(%rsp), INCX + + movaps %xmm3, %xmm0 +#endif + + SAVEREGISTERS + + testq M, M + jle .L999 + + pxor %xmm1, %xmm1 + lea (, INCX, SIZE), INCX + comisd %xmm0, %xmm1 + jne .L100 + +/* Alpha == ZERO */ + cmpq $SIZE, INCX + jne .L50 + + movq M, I + sarq $3, I + jle .L12 + ALIGN_4 + +.L11: + movsd %xmm1, 0 * SIZE(X) + movsd %xmm1, 1 * SIZE(X) + movsd %xmm1, 2 * SIZE(X) + movsd %xmm1, 3 * SIZE(X) + + movsd %xmm1, 4 * SIZE(X) + movsd %xmm1, 5 * SIZE(X) + movsd %xmm1, 6 * SIZE(X) + movsd %xmm1, 7 * SIZE(X) + + addq $8 * SIZE, X + decq I + jg .L11 + ALIGN_4 + +.L12: + testq $4, M + je .L14 + + movsd %xmm1, 0 * SIZE(X) + movsd %xmm1, 1 * SIZE(X) + movsd %xmm1, 2 * SIZE(X) + movsd %xmm1, 3 * SIZE(X) + + addq $4 * SIZE, X + ALIGN_3 + +.L14: + testq $2, M + je .L15 + + movsd %xmm1, 0 * SIZE(X) + movsd %xmm1, 1 * SIZE(X) + + addq $2 * SIZE, X + ALIGN_3 + +.L15: + testq $1, M + je .L999 + + movsd %xmm1, 0 * SIZE(X) + jmp .L999 + ALIGN_4 + +.L50: + movq M, I + sarq $3, I + jle .L52 + ALIGN_4 + +.L51: + movsd %xmm1, 0 * SIZE(X) + addq INCX, X + movsd %xmm1, 0 * SIZE(X) + addq INCX, X + movsd %xmm1, 0 * SIZE(X) + addq INCX, X + movsd %xmm1, 0 * SIZE(X) + addq INCX, X + movsd %xmm1, 0 * SIZE(X) + addq INCX, X + movsd %xmm1, 0 * SIZE(X) + addq INCX, X + movsd %xmm1, 0 * SIZE(X) + addq INCX, X + movsd %xmm1, 0 * SIZE(X) + addq INCX, X + + decq I + jg .L51 + ALIGN_4 + +.L52: + testq $7, M + je .L999 + + testq $4, M + 
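/* [editor's reference note -- not part of the imported GotoBLAS2 file]
   This is the alpha == 0, non-unit-stride path of the SCAL kernel: x is
   simply overwritten with zeros and never read.  The je below skips the
   4-element block when bit 2 of M is clear; the M & 7 elements left over
   from the unrolled loop are cleared in chunks of 4, 2 and 1 selected by
   bit tests on M.  A minimal C sketch of the operation this family of
   scal kernels implements (double precision shown, no unrolling; scal_ref
   is an illustrative name, not a symbol exported by this library):

       static void scal_ref(long n, double alpha, double *x, long incx)
       {
           for (long i = 0; i < n; i++)
               x[i * incx] = (alpha == 0.0)
                           ? 0.0                    // store only, no read
                           : alpha * x[i * incx];
       }
*/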
je .L53 + + movsd %xmm1, 0 * SIZE(X) + addq INCX, X + movsd %xmm1, 0 * SIZE(X) + addq INCX, X + movsd %xmm1, 0 * SIZE(X) + addq INCX, X + movsd %xmm1, 0 * SIZE(X) + addq INCX, X + ALIGN_3 + +.L53: + testq $2, M + je .L54 + + movsd %xmm1, 0 * SIZE(X) + addq INCX, X + movsd %xmm1, 0 * SIZE(X) + addq INCX, X + ALIGN_3 + +.L54: + testq $1, M + je .L999 + + movsd %xmm1, 0 * SIZE(X) + jmp .L999 + ALIGN_4 + +/* Alpha != ZERO */ + +.L100: + cmpq $SIZE, INCX + jne .L150 + + unpcklpd %xmm0, %xmm0 + + movq M, I + sarq $3, I + jle .L113 + + movsd 0 * SIZE(X), %xmm1 + movsd 1 * SIZE(X), %xmm2 + movsd 2 * SIZE(X), %xmm3 + movsd 3 * SIZE(X), %xmm4 + movsd 4 * SIZE(X), %xmm5 + movsd 5 * SIZE(X), %xmm6 + movsd 6 * SIZE(X), %xmm7 + movsd 7 * SIZE(X), %xmm8 + + mulsd %xmm0, %xmm1 + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + mulsd %xmm0, %xmm4 + + decq I + jle .L112 + ALIGN_4 + +.L111: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd %xmm1, 0 * SIZE(X) + movsd %xmm2, 1 * SIZE(X) + movsd %xmm3, 2 * SIZE(X) + movsd %xmm4, 3 * SIZE(X) + + movsd 8 * SIZE(X), %xmm1 + mulsd %xmm0, %xmm5 + movsd 9 * SIZE(X), %xmm2 + mulsd %xmm0, %xmm6 + movsd 10 * SIZE(X), %xmm3 + mulsd %xmm0, %xmm7 + movsd 11 * SIZE(X), %xmm4 + mulsd %xmm0, %xmm8 + + movsd %xmm5, 4 * SIZE(X) + movsd %xmm6, 5 * SIZE(X) + movsd %xmm7, 6 * SIZE(X) + movsd %xmm8, 7 * SIZE(X) + + movsd 12 * SIZE(X), %xmm5 + mulsd %xmm0, %xmm1 + movsd 13 * SIZE(X), %xmm6 + mulsd %xmm0, %xmm2 + movsd 14 * SIZE(X), %xmm7 + mulsd %xmm0, %xmm3 + movsd 15 * SIZE(X), %xmm8 + mulsd %xmm0, %xmm4 + + addq $8 * SIZE, X + decq I + jg .L111 + ALIGN_4 + +.L112: + movsd %xmm1, 0 * SIZE(X) + mulsd %xmm0, %xmm5 + movsd %xmm2, 1 * SIZE(X) + mulsd %xmm0, %xmm6 + movsd %xmm3, 2 * SIZE(X) + mulsd %xmm0, %xmm7 + movsd %xmm4, 3 * SIZE(X) + mulsd %xmm0, %xmm8 + + movsd %xmm5, 4 * SIZE(X) + movsd %xmm6, 5 * SIZE(X) + movsd %xmm7, 6 * SIZE(X) + movsd %xmm8, 7 * SIZE(X) + addq $8 * SIZE, X + ALIGN_3 + +.L113: + testq $4, M + je .L115 + + movsd 0 * SIZE(X), %xmm1 + movsd 1 * SIZE(X), %xmm2 + movsd 2 * SIZE(X), %xmm3 + movsd 3 * SIZE(X), %xmm4 + + mulsd %xmm0, %xmm1 + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + mulsd %xmm0, %xmm4 + + movsd %xmm1, 0 * SIZE(X) + movsd %xmm2, 1 * SIZE(X) + movsd %xmm3, 2 * SIZE(X) + movsd %xmm4, 3 * SIZE(X) + + addq $4 * SIZE, X + ALIGN_3 + +.L115: + testq $2, M + je .L116 + + movsd 0 * SIZE(X), %xmm1 + movsd 1 * SIZE(X), %xmm2 + + mulsd %xmm0, %xmm1 + mulsd %xmm0, %xmm2 + + movsd %xmm1, 0 * SIZE(X) + movsd %xmm2, 1 * SIZE(X) + + addq $2 * SIZE, X + ALIGN_3 + +.L116: + testq $1, M + je .L999 + + movsd 0 * SIZE(X), %xmm1 + mulsd %xmm0, %xmm1 + movsd %xmm1, 0 * SIZE(X) + jmp .L999 + ALIGN_3 + +/* incx != 1 */ + +.L150: + movq X, XX + movq M, I # rcx = n + sarq $3, I # (n >> 3) + jle .L152 + ALIGN_4 + +.L151: + movsd 0 * SIZE(X), %xmm1 + addq INCX, X + movsd 0 * SIZE(X), %xmm2 + addq INCX, X + mulsd %xmm0, %xmm1 + movsd 0 * SIZE(X), %xmm3 + addq INCX, X + mulsd %xmm0, %xmm2 + movsd 0 * SIZE(X), %xmm4 + addq INCX, X + mulsd %xmm0, %xmm3 + movsd 0 * SIZE(X), %xmm5 + addq INCX, X + mulsd %xmm0, %xmm4 + movsd 0 * SIZE(X), %xmm6 + addq INCX, X + mulsd %xmm0, %xmm5 + movsd 0 * SIZE(X), %xmm7 + addq INCX, X + mulsd %xmm0, %xmm6 + movsd 0 * SIZE(X), %xmm8 + addq INCX, X + mulsd %xmm0, %xmm7 + + movsd %xmm1, 0 * SIZE(XX) + addq INCX, XX + mulsd %xmm0, %xmm8 + movsd %xmm2, 0 * SIZE(XX) + addq INCX, XX + movsd %xmm3, 0 * SIZE(XX) + addq INCX, XX + movsd %xmm4, 0 * SIZE(XX) + addq INCX, XX + movsd %xmm5, 0 * SIZE(XX) + addq INCX, XX + movsd %xmm6, 0 * 
SIZE(XX) + addq INCX, XX + movsd %xmm7, 0 * SIZE(XX) + addq INCX, XX + movsd %xmm8, 0 * SIZE(XX) + addq INCX, XX + decq I + jg .L151 + ALIGN_4 + +.L152: + testq $7, M + je .L999 + + testq $4, M + je .L153 + + movsd 0 * SIZE(X), %xmm1 + addq INCX, X + movsd 0 * SIZE(X), %xmm2 + addq INCX, X + mulsd %xmm0, %xmm1 + movsd 0 * SIZE(X), %xmm3 + addq INCX, X + mulsd %xmm0, %xmm2 + movsd 0 * SIZE(X), %xmm4 + addq INCX, X + mulsd %xmm0, %xmm3 + + movsd %xmm1, 0 * SIZE(XX) + addq INCX, XX + mulsd %xmm0, %xmm4 + movsd %xmm2, 0 * SIZE(XX) + addq INCX, XX + movsd %xmm3, 0 * SIZE(XX) + addq INCX, XX + movsd %xmm4, 0 * SIZE(XX) + addq INCX, XX + ALIGN_3 + +.L153: + testq $2, M + je .L154 + + movsd 0 * SIZE(X), %xmm1 + addq INCX, X + mulsd %xmm0, %xmm1 + movsd 0 * SIZE(X), %xmm2 + addq INCX, X + mulsd %xmm0, %xmm2 + + movsd %xmm1, 0 * SIZE(XX) + addq INCX, XX + movsd %xmm2, 0 * SIZE(XX) + addq INCX, XX + ALIGN_3 + +.L154: + testq $1, M + je .L999 + + movsd 0 * SIZE(X), %xmm1 + mulsd %xmm0, %xmm1 + movsd %xmm1, 0 * SIZE(X) + ALIGN_4 + +.L999: + xorq %rax, %rax + + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/scal_sse.S b/kernel/x86_64/scal_sse.S new file mode 100644 index 0000000000..323e8b9ddd --- /dev/null +++ b/kernel/x86_64/scal_sse.S @@ -0,0 +1,612 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef WINDOWS_ABI +#define M ARG1 +#define X ARG4 +#define INCX ARG5 +#else +#define M ARG1 +#define X ARG2 +#define INCX ARG3 +#endif + +#define XX %r10 +#define I %rax + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + movq 40(%rsp), X + movq 48(%rsp), INCX + + movaps %xmm3, %xmm0 +#endif + + SAVEREGISTERS + + testq M, M + jle .L999 + + lea (, INCX, SIZE), INCX + + pxor %xmm1, %xmm1 + comiss %xmm0, %xmm1 + shufps $0, %xmm0, %xmm0 + + jne .L100 # Alpha != ZERO + +/* Alpha == ZERO */ + cmpq $SIZE, INCX + jne .L50 + +/* INCX == 1 */ + cmpq $3, M + jle .L14 + + testq $4, X # aligned for double word? + je .L05 + + movss %xmm1, 0 * SIZE(X) + addq $SIZE, X + decq M + jle .L999 + ALIGN_3 + +.L05: + testq $8, X # aligned for quad word? + je .L06 + + movsd %xmm1, 0 * SIZE(X) + addq $2 * SIZE, X + subq $2, M + jle .L999 + ALIGN_3 + +.L06: + movq M, I + sarq $4, I + jle .L12 + ALIGN_4 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm1, 0 * SIZE(X) + movaps %xmm1, 4 * SIZE(X) + movaps %xmm1, 8 * SIZE(X) + movaps %xmm1, 12 * SIZE(X) + addq $16 * SIZE, X + decq I + jg .L11 + ALIGN_4 + +.L12: + testq $15, M + je .L999 + testq $8, M + je .L13 + + movaps %xmm1, 0 * SIZE(X) + movaps %xmm1, 4 * SIZE(X) + addq $8 * SIZE, X + ALIGN_3 + +.L13: + testq $4, M + je .L14 + + movaps %xmm1, 0 * SIZE(X) + addq $4 * SIZE, X + ALIGN_3 + +.L14: + testq $2, M + je .L15 + + movsd %xmm1, 0 * SIZE(X) + addq $2 * SIZE, X + ALIGN_3 + +.L15: + testq $1, M + je .L999 + + movss %xmm1, 0 * SIZE(X) + jmp .L999 + ALIGN_4 + +/* incx != 1 */ +.L50: + movq M, I # rcx = n + sarq $3, I # (n >> 3) + jle .L52 + ALIGN_4 + +.L51: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm1, (X) + addq INCX, X + movss %xmm1, (X) + addq INCX, X + movss %xmm1, (X) + addq INCX, X + movss %xmm1, (X) + addq INCX, X + movss %xmm1, (X) + addq INCX, X + movss %xmm1, (X) + addq INCX, X + movss %xmm1, (X) + addq INCX, X + movss %xmm1, (X) + addq INCX, X + + decq I + jg .L51 + ALIGN_4 + +.L52: + testq $7, M + je .L999 + + testq $4, M + je .L53 + + movss %xmm1, (X) + addq INCX, X + movss %xmm1, (X) + addq INCX, X + movss %xmm1, (X) + addq INCX, X + movss %xmm1, (X) + addq INCX, X + ALIGN_3 + +.L53: + testq $2, M + je .L54 + + movss %xmm1, (X) + addq INCX, X + movss %xmm1, (X) + addq INCX, X + ALIGN_3 + +.L54: + testq $1, M + je .L999 + + movss %xmm1, (X) + jmp .L999 + ALIGN_4 + +/* Alpha != ZERO */ + +.L100: + cmpq $SIZE, INCX + jne .L150 + + subq $-32 * SIZE, X + + cmpq $3, M + jle .L116 + + testq $SIZE, X + je .L105 + + movss -32 * SIZE(X), %xmm1 + mulss %xmm0, %xmm1 + movss %xmm1, -32 * SIZE(X) + addq $SIZE, X + decq M + jle .L999 + ALIGN_3 + +.L105: + testq $2 * SIZE, X + je .L110 + + movsd -32 * SIZE(X), %xmm1 + mulps %xmm0, %xmm1 + movsd %xmm1, -32 * SIZE(X) + addq $2 * SIZE, X + subq $2, M + jle .L999 + ALIGN_3 + +.L110: + movq M, I + sarq $5, I + jle .L113 + +#if defined(BARCELONA) || defined(SHANGHAI) + + movaps %xmm0, %xmm1 + mulps -32 * SIZE(X), %xmm1 + movaps %xmm0, %xmm2 + mulps -28 * SIZE(X), %xmm2 + movaps %xmm0, %xmm3 + mulps -24 * SIZE(X), %xmm3 + movaps %xmm0, %xmm4 + mulps -20 * SIZE(X), %xmm4 + movaps %xmm0, %xmm5 + mulps -16 * SIZE(X), %xmm5 + movaps %xmm0, %xmm6 + mulps -12 * SIZE(X), %xmm6 + movaps %xmm0, %xmm7 + mulps -8 * SIZE(X), %xmm7 + movaps %xmm0, %xmm8 + mulps -4 * SIZE(X), %xmm8 + + decq I + jle .L112 + 
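/* [editor's note -- not part of the imported GotoBLAS2 file]
   Single-precision SCAL, unit stride, alpha != 0, Barcelona/Shanghai
   variant: alpha is copied into a temporary register and multiplied
   directly from memory (movaps reg + mulps mem) rather than loading x
   first.  The products for the first 32 floats were just computed above;
   the .L111 loop below is software-pipelined, storing the 32 results of
   the previous step while computing the next 32, and .L112 then drains
   the final batch. */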
ALIGN_4 + +.L111: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm1, -32 * SIZE(X) + movaps %xmm2, -28 * SIZE(X) + movaps %xmm3, -24 * SIZE(X) + movaps %xmm4, -20 * SIZE(X) + + movaps %xmm0, %xmm1 + mulps 0 * SIZE(X), %xmm1 + movaps %xmm0, %xmm2 + mulps 4 * SIZE(X), %xmm2 + movaps %xmm0, %xmm3 + mulps 8 * SIZE(X), %xmm3 + movaps %xmm0, %xmm4 + mulps 12 * SIZE(X), %xmm4 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps %xmm5, -16 * SIZE(X) + movaps %xmm6, -12 * SIZE(X) + movaps %xmm7, -8 * SIZE(X) + movaps %xmm8, -4 * SIZE(X) + + movaps %xmm0, %xmm5 + mulps 16 * SIZE(X), %xmm5 + movaps %xmm0, %xmm6 + mulps 20 * SIZE(X), %xmm6 + movaps %xmm0, %xmm7 + mulps 24 * SIZE(X), %xmm7 + movaps %xmm0, %xmm8 + mulps 28 * SIZE(X), %xmm8 + + subq $-32 * SIZE, X + decq I + jg .L111 + ALIGN_4 + +.L112: + movaps %xmm1, -32 * SIZE(X) + movaps %xmm2, -28 * SIZE(X) + movaps %xmm3, -24 * SIZE(X) + movaps %xmm4, -20 * SIZE(X) + + movaps %xmm5, -16 * SIZE(X) + movaps %xmm6, -12 * SIZE(X) + movaps %xmm7, -8 * SIZE(X) + movaps %xmm8, -4 * SIZE(X) + +#else + + movaps -32 * SIZE(X), %xmm1 + movaps -28 * SIZE(X), %xmm2 + movaps -24 * SIZE(X), %xmm3 + movaps -20 * SIZE(X), %xmm4 + movaps -16 * SIZE(X), %xmm5 + movaps -12 * SIZE(X), %xmm6 + movaps -8 * SIZE(X), %xmm7 + movaps -4 * SIZE(X), %xmm8 + decq I + jle .L112 + ALIGN_4 + +.L111: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + mulps %xmm0, %xmm1 + movaps %xmm1, -32 * SIZE(X) + movaps 0 * SIZE(X), %xmm1 + mulps %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(X) + movaps 4 * SIZE(X), %xmm2 + mulps %xmm0, %xmm3 + movaps %xmm3, -24 * SIZE(X) + movaps 8 * SIZE(X), %xmm3 + mulps %xmm0, %xmm4 + movaps %xmm4, -20 * SIZE(X) + movaps 12 * SIZE(X), %xmm4 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + mulps %xmm0, %xmm5 + movaps %xmm5, -16 * SIZE(X) + movaps 16 * SIZE(X), %xmm5 + mulps %xmm0, %xmm6 + movaps %xmm6, -12 * SIZE(X) + movaps 20 * SIZE(X), %xmm6 + mulps %xmm0, %xmm7 + movaps %xmm7, -8 * SIZE(X) + movaps 24 * SIZE(X), %xmm7 + mulps %xmm0, %xmm8 + movaps %xmm8, -4 * SIZE(X) + movaps 28 * SIZE(X), %xmm8 + + subq $-32 * SIZE, X + decq I + jg .L111 + ALIGN_4 + +.L112: + mulps %xmm0, %xmm1 + movaps %xmm1, -32 * SIZE(X) + mulps %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(X) + mulps %xmm0, %xmm3 + movaps %xmm3, -24 * SIZE(X) + mulps %xmm0, %xmm4 + movaps %xmm4, -20 * SIZE(X) + + mulps %xmm0, %xmm5 + movaps %xmm5, -16 * SIZE(X) + mulps %xmm0, %xmm6 + movaps %xmm6, -12 * SIZE(X) + mulps %xmm0, %xmm7 + movaps %xmm7, -8 * SIZE(X) + mulps %xmm0, %xmm8 + movaps %xmm8, -4 * SIZE(X) + +#endif + + subq $-32 * SIZE, X + ALIGN_3 + +.L113: + testq $31, M + je .L999 + + testq $16, M + je .L114 + + movaps -32 * SIZE(X), %xmm1 + movaps -28 * SIZE(X), %xmm3 + movaps -24 * SIZE(X), %xmm5 + movaps -20 * SIZE(X), %xmm7 + + mulps %xmm0, %xmm1 + movaps %xmm1, -32 * SIZE(X) + mulps %xmm0, %xmm3 + movaps %xmm3, -28 * SIZE(X) + mulps %xmm0, %xmm5 + movaps %xmm5, -24 * SIZE(X) + mulps %xmm0, %xmm7 + movaps %xmm7, -20 * SIZE(X) + + addq $16 * SIZE, X + ALIGN_3 + +.L114: + testq $8, M + je .L115 + + movaps -32 * SIZE(X), %xmm1 + movaps -28 * SIZE(X), %xmm3 + + mulps %xmm0, %xmm1 + movaps %xmm1, -32 * SIZE(X) + mulps %xmm0, %xmm3 + movaps %xmm3, -28 * SIZE(X) + addq $8 * SIZE, X + ALIGN_3 + +.L115: + testq $4, M + je .L116 + + movaps -32 * SIZE(X), %xmm1 + mulps %xmm0, %xmm1 + movaps %xmm1, -32 * SIZE(X) + addq $4 * SIZE, X + ALIGN_3 + +.L116: + testq $2, M + je .L117 + + movsd 
-32 * SIZE(X), %xmm1 + mulps %xmm0, %xmm1 + movsd %xmm1, -32 * SIZE(X) + addq $2 * SIZE, X + ALIGN_3 + +.L117: + testq $1, M + je .L999 + + movss -32 * SIZE(X), %xmm1 + mulss %xmm0, %xmm1 + movss %xmm1, -32 * SIZE(X) + jmp .L999 + ALIGN_3 + +/* incx != 1 */ + +.L150: + movq X, XX + movq M, I # rcx = n + sarq $3, I # (n >> 3) + jle .L152 + ALIGN_4 + +.L151: + movss (X), %xmm1 + addq INCX, X + movss (X), %xmm2 + addq INCX, X + movss (X), %xmm3 + addq INCX, X + movss (X), %xmm4 + addq INCX, X + movss (X), %xmm5 + addq INCX, X + movss (X), %xmm6 + addq INCX, X + movss (X), %xmm7 + addq INCX, X + movss (X), %xmm8 + addq INCX, X + + mulss %xmm0, %xmm1 + mulss %xmm0, %xmm2 + mulss %xmm0, %xmm3 + mulss %xmm0, %xmm4 + mulss %xmm0, %xmm5 + mulss %xmm0, %xmm6 + mulss %xmm0, %xmm7 + mulss %xmm0, %xmm8 + + movss %xmm1, (XX) + addq INCX, XX + movss %xmm2, (XX) + addq INCX, XX + movss %xmm3, (XX) + addq INCX, XX + movss %xmm4, (XX) + addq INCX, XX + movss %xmm5, (XX) + addq INCX, XX + movss %xmm6, (XX) + addq INCX, XX + movss %xmm7, (XX) + addq INCX, XX + movss %xmm8, (XX) + addq INCX, XX + decq I + jg .L151 + ALIGN_4 + +.L152: + testq $7, M + je .L999 + + testq $4, M + je .L153 + + movss (X), %xmm1 + addq INCX, X + movss (X), %xmm2 + addq INCX, X + movss (X), %xmm3 + addq INCX, X + movss (X), %xmm4 + addq INCX, X + + mulss %xmm0, %xmm1 + mulss %xmm0, %xmm2 + mulss %xmm0, %xmm3 + mulss %xmm0, %xmm4 + + movss %xmm1, (XX) + addq INCX, XX + movss %xmm2, (XX) + addq INCX, XX + movss %xmm3, (XX) + addq INCX, XX + movss %xmm4, (XX) + addq INCX, XX + ALIGN_3 + +.L153: + testq $2, M + je .L154 + + movss (X), %xmm1 + addq INCX, X + movss (X), %xmm2 + addq INCX, X + + mulss %xmm0, %xmm1 + mulss %xmm0, %xmm2 + + movss %xmm1, (XX) + addq INCX, XX + movss %xmm2, (XX) + addq INCX, XX + ALIGN_3 + +.L154: + testq $1, M + je .L999 + + movss (X), %xmm1 + mulss %xmm0, %xmm1 + movss %xmm1, (X) + ALIGN_4 + +.L999: + xorq %rax, %rax + + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/scal_sse2.S b/kernel/x86_64/scal_sse2.S new file mode 100644 index 0000000000..b0abb4533e --- /dev/null +++ b/kernel/x86_64/scal_sse2.S @@ -0,0 +1,588 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef WINDOWS_ABI +#define M ARG1 +#define X ARG4 +#define INCX ARG5 +#else +#define M ARG1 +#define X ARG2 +#define INCX ARG3 +#endif + +#define XX %r10 +#define I %rax + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + movq 40(%rsp), X + movq 48(%rsp), INCX + + movaps %xmm3, %xmm0 +#endif + + SAVEREGISTERS + + testq M, M + jle .L999 + + leaq (, INCX, SIZE), INCX + + xorps %xmm1, %xmm1 + comisd %xmm0, %xmm1 + jne .L100 # Alpha != ZERO + +/* Alpha == ZERO */ + cmpq $SIZE, INCX + jne .L50 + +/* INCX == 1 */ + testq $15, X # aligned for quad word? + je .L05 + + movsd %xmm1, 0 * SIZE(X) + addq $SIZE, X + decq M + jle .L999 + ALIGN_3 +.L05: + +/* Aligned Mode */ + movq M, I # rcx = n + sarq $4, I + jle .L12 + ALIGN_4 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm1, 0 * SIZE(X) + movaps %xmm1, 2 * SIZE(X) + movaps %xmm1, 4 * SIZE(X) + movaps %xmm1, 6 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps %xmm1, 8 * SIZE(X) + movaps %xmm1, 10 * SIZE(X) + movaps %xmm1, 12 * SIZE(X) + movaps %xmm1, 14 * SIZE(X) + + addq $16 * SIZE, X + decq I + jg .L11 + ALIGN_4 + +.L12: + testq $15, M + je .L999 + testq $8, M + je .L13 + + movaps %xmm1, 0 * SIZE(X) + movaps %xmm1, 2 * SIZE(X) + movaps %xmm1, 4 * SIZE(X) + movaps %xmm1, 6 * SIZE(X) + addq $8 * SIZE, X + ALIGN_3 + +.L13: + testq $4, M + je .L14 + + movaps %xmm1, 0 * SIZE(X) + movaps %xmm1, 2 * SIZE(X) + addq $4 * SIZE, X + ALIGN_3 + +.L14: + testq $2, M + je .L15 + + movaps %xmm1, 0 * SIZE(X) + addq $2 * SIZE, X + ALIGN_3 + +.L15: + testq $1, M + je .L999 + + movsd %xmm1, 0 * SIZE(X) + jmp .L999 + ALIGN_4 + +.L50: + movq M, I + sarq $3, I + jle .L52 + ALIGN_4 + +.L51: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd %xmm1, (X) + addq INCX, X + movsd %xmm1, (X) + addq INCX, X + movsd %xmm1, (X) + addq INCX, X + movsd %xmm1, (X) + addq INCX, X + movsd %xmm1, (X) + addq INCX, X + movsd %xmm1, (X) + addq INCX, X + movsd %xmm1, (X) + addq INCX, X + movsd %xmm1, (X) + addq INCX, X + + decq I + jg .L51 + ALIGN_4 + +.L52: + testq $7, M + je .L999 + + testq $4, M + je .L53 + + movsd %xmm1, (X) + addq INCX, X + movsd %xmm1, (X) + addq INCX, X + movsd %xmm1, (X) + addq INCX, X + movsd %xmm1, (X) + addq INCX, X + ALIGN_3 + +.L53: + testq $2, M + je .L54 + + movsd %xmm1, (X) + addq INCX, X + movsd %xmm1, (X) + addq INCX, X + ALIGN_3 + +.L54: + testq $1, M + je .L999 + + movsd %xmm1, (X) + jmp .L999 + ALIGN_4 + +/* Alpha != ZERO */ + +.L100: + unpcklpd %xmm0, 
%xmm0 + + cmpq $SIZE, INCX + jne .L150 + + testq $SIZE, X + je .L105 + + movsd 0 * SIZE(X), %xmm1 + mulsd %xmm0, %xmm1 + movsd %xmm1, 0 * SIZE(X) + addq $SIZE, X + decq M + jle .L999 + ALIGN_3 +.L105: + subq $-16 * SIZE, X + + movq M, I # rcx = n + sarq $4, I + jle .L113 + +#if defined(BARCELONA) || defined(SHANGHAI) + + movaps %xmm0, %xmm1 + mulpd -16 * SIZE(X), %xmm1 + movaps %xmm0, %xmm2 + mulpd -14 * SIZE(X), %xmm2 + movaps %xmm0, %xmm3 + mulpd -12 * SIZE(X), %xmm3 + movaps %xmm0, %xmm4 + mulpd -10 * SIZE(X), %xmm4 + movaps %xmm0, %xmm5 + mulpd -8 * SIZE(X), %xmm5 + movaps %xmm0, %xmm6 + mulpd -6 * SIZE(X), %xmm6 + movaps %xmm0, %xmm7 + mulpd -4 * SIZE(X), %xmm7 + movaps %xmm0, %xmm8 + mulpd -2 * SIZE(X), %xmm8 + + decq I + jle .L112 + ALIGN_4 + +.L111: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm1, -16 * SIZE(X) + movaps %xmm2, -14 * SIZE(X) + movaps %xmm3, -12 * SIZE(X) + movaps %xmm4, -10 * SIZE(X) + + movaps %xmm0, %xmm1 + mulpd 0 * SIZE(X), %xmm1 + movaps %xmm0, %xmm2 + mulpd 2 * SIZE(X), %xmm2 + movaps %xmm0, %xmm3 + mulpd 4 * SIZE(X), %xmm3 + movaps %xmm0, %xmm4 + mulpd 6 * SIZE(X), %xmm4 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps %xmm5, -8 * SIZE(X) + movaps %xmm6, -6 * SIZE(X) + movaps %xmm7, -4 * SIZE(X) + movaps %xmm8, -2 * SIZE(X) + + movaps %xmm0, %xmm5 + mulpd 8 * SIZE(X), %xmm5 + movaps %xmm0, %xmm6 + mulpd 10 * SIZE(X), %xmm6 + movaps %xmm0, %xmm7 + mulpd 12 * SIZE(X), %xmm7 + movaps %xmm0, %xmm8 + mulpd 14 * SIZE(X), %xmm8 + + subq $-16 * SIZE, X + decq I + jg .L111 + ALIGN_4 + +.L112: + movaps %xmm1, -16 * SIZE(X) + movaps %xmm2, -14 * SIZE(X) + movaps %xmm3, -12 * SIZE(X) + movaps %xmm4, -10 * SIZE(X) + movaps %xmm5, -8 * SIZE(X) + movaps %xmm6, -6 * SIZE(X) + movaps %xmm7, -4 * SIZE(X) + movaps %xmm8, -2 * SIZE(X) + +#else + movaps -16 * SIZE(X), %xmm1 + movaps -14 * SIZE(X), %xmm2 + movaps -12 * SIZE(X), %xmm3 + movaps -10 * SIZE(X), %xmm4 + movaps -8 * SIZE(X), %xmm5 + movaps -6 * SIZE(X), %xmm6 + movaps -4 * SIZE(X), %xmm7 + movaps -2 * SIZE(X), %xmm8 + + decq I + jle .L112 + ALIGN_4 + +.L111: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + mulpd %xmm0, %xmm1 + movaps %xmm1, -16 * SIZE(X) + movaps 0 * SIZE(X), %xmm1 + mulpd %xmm0, %xmm2 + movaps %xmm2, -14 * SIZE(X) + movaps 2 * SIZE(X), %xmm2 + + mulpd %xmm0, %xmm3 + movaps %xmm3, -12 * SIZE(X) + movaps 4 * SIZE(X), %xmm3 + mulpd %xmm0, %xmm4 + movaps %xmm4, -10 * SIZE(X) + movaps 6 * SIZE(X), %xmm4 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + mulpd %xmm0, %xmm5 + movaps %xmm5, -8 * SIZE(X) + movaps 8 * SIZE(X), %xmm5 + mulpd %xmm0, %xmm6 + movaps %xmm6, -6 * SIZE(X) + movaps 10 * SIZE(X), %xmm6 + + mulpd %xmm0, %xmm7 + movaps %xmm7, -4 * SIZE(X) + movaps 12 * SIZE(X), %xmm7 + mulpd %xmm0, %xmm8 + movaps %xmm8, -2 * SIZE(X) + movaps 14 * SIZE(X), %xmm8 + + subq $-16 * SIZE, X + decq I + jg .L111 + ALIGN_4 + +.L112: + mulpd %xmm0, %xmm1 + movaps %xmm1, -16 * SIZE(X) + mulpd %xmm0, %xmm2 + movaps %xmm2, -14 * SIZE(X) + mulpd %xmm0, %xmm3 + movaps %xmm3, -12 * SIZE(X) + mulpd %xmm0, %xmm4 + movaps %xmm4, -10 * SIZE(X) + + mulpd %xmm0, %xmm5 + movaps %xmm5, -8 * SIZE(X) + mulpd %xmm0, %xmm6 + movaps %xmm6, -6 * SIZE(X) + mulpd %xmm0, %xmm7 + movaps %xmm7, -4 * SIZE(X) + mulpd %xmm0, %xmm8 + movaps %xmm8, -2 * SIZE(X) +#endif + + subq $-16 * SIZE, X + ALIGN_3 + +.L113: + testq $15, M + je .L999 + + testq $8, M + je .L114 + + movaps -16 * SIZE(X), %xmm1 + movaps -14 * 
SIZE(X), %xmm2 + movaps -12 * SIZE(X), %xmm3 + movaps -10 * SIZE(X), %xmm4 + + mulpd %xmm0, %xmm1 + movaps %xmm1, -16 * SIZE(X) + mulpd %xmm0, %xmm2 + movaps %xmm2, -14 * SIZE(X) + mulpd %xmm0, %xmm3 + movaps %xmm3, -12 * SIZE(X) + mulpd %xmm0, %xmm4 + movaps %xmm4, -10 * SIZE(X) + addq $8 * SIZE, X + ALIGN_3 + +.L114: + testq $4, M + je .L115 + + movaps -16 * SIZE(X), %xmm1 + movaps -14 * SIZE(X), %xmm2 + + mulpd %xmm0, %xmm1 + movaps %xmm1, -16 * SIZE(X) + mulpd %xmm0, %xmm2 + movaps %xmm2, -14 * SIZE(X) + addq $4 * SIZE, X + ALIGN_3 + +.L115: + testq $2, M + je .L116 + + movaps -16 * SIZE(X), %xmm1 + mulpd %xmm0, %xmm1 + movaps %xmm1, -16 * SIZE(X) + addq $2 * SIZE, X + ALIGN_3 + +.L116: + testq $1, M + je .L999 + + movsd -16 * SIZE(X), %xmm1 + mulsd %xmm0, %xmm1 + movsd %xmm1, -16 * SIZE(X) + jmp .L999 + ALIGN_3 + +/* incx != 1 */ + +.L150: + movq X, XX + movq M, I # rcx = n + sarq $3, I # (n >> 3) + jle .L152 + ALIGN_4 + +.L151: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + movsd (X), %xmm1 + addq INCX, X + movsd (X), %xmm2 + addq INCX, X + movsd (X), %xmm3 + addq INCX, X + movsd (X), %xmm4 + addq INCX, X + movsd (X), %xmm5 + addq INCX, X + movsd (X), %xmm6 + addq INCX, X + movsd (X), %xmm7 + addq INCX, X + movsd (X), %xmm8 + addq INCX, X + + mulsd %xmm0, %xmm1 + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 + mulsd %xmm0, %xmm6 + mulsd %xmm0, %xmm7 + mulsd %xmm0, %xmm8 + + movsd %xmm1, (XX) + addq INCX, XX + movsd %xmm2, (XX) + addq INCX, XX + movsd %xmm3, (XX) + addq INCX, XX + movsd %xmm4, (XX) + addq INCX, XX + movsd %xmm5, (XX) + addq INCX, XX + movsd %xmm6, (XX) + addq INCX, XX + movsd %xmm7, (XX) + addq INCX, XX + movsd %xmm8, (XX) + addq INCX, XX + decq I + jg .L151 + ALIGN_4 + +.L152: + testq $7, M + je .L999 + + testq $4, M + je .L153 + + movsd (X), %xmm1 + addq INCX, X + movsd (X), %xmm2 + addq INCX, X + movsd (X), %xmm3 + addq INCX, X + movsd (X), %xmm4 + addq INCX, X + + mulsd %xmm0, %xmm1 + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + mulsd %xmm0, %xmm4 + + movsd %xmm1, (XX) + addq INCX, XX + movsd %xmm2, (XX) + addq INCX, XX + movsd %xmm3, (XX) + addq INCX, XX + movsd %xmm4, (XX) + addq INCX, XX + ALIGN_3 + +.L153: + testq $2, M + je .L154 + + movsd (X), %xmm1 + addq INCX, X + movsd (X), %xmm2 + addq INCX, X + + mulsd %xmm0, %xmm1 + mulsd %xmm0, %xmm2 + + movsd %xmm1, (XX) + addq INCX, XX + movsd %xmm2, (XX) + addq INCX, XX + ALIGN_3 + +.L154: + testq $1, M + je .L999 + + movsd (X), %xmm1 + mulsd %xmm0, %xmm1 + movsd %xmm1, (X) + ALIGN_4 + +.L999: + xorq %rax, %rax + + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/sgemv_n.S b/kernel/x86_64/sgemv_n.S new file mode 100644 index 0000000000..ead2420c4a --- /dev/null +++ b/kernel/x86_64/sgemv_n.S @@ -0,0 +1,6018 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "l2param.h" + +#if GEMV_UNROLL < 4 +#undef GEMV_UNROLL +#define GEMV_UNROLL 4 +#endif + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_A %rcx +#define OLD_LDA %r8 +#define STACK_INCX 8 + STACKSIZE(%rsp) +#define STACK_Y 16 + STACKSIZE(%rsp) +#define STACK_INCY 24 + STACKSIZE(%rsp) +#define STACK_BUFFER 32 + STACKSIZE(%rsp) +#define ALPHA 48 (%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_M %rcx +#define OLD_N %rdx +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_LDA 48 + STACKSIZE(%rsp) +#define OLD_X 56 + STACKSIZE(%rsp) +#define STACK_INCX 64 + STACKSIZE(%rsp) +#define STACK_Y 72 + STACKSIZE(%rsp) +#define STACK_INCY 80 + STACKSIZE(%rsp) +#define STACK_BUFFER 88 + STACKSIZE(%rsp) +#define ALPHA 224 (%rsp) + +#endif + +#define LDA %r8 +#define X %r9 + +#define INCX %rsi +#define INCY %rdi + +#define M %r10 +#define N %r11 +#define A %r12 +#define Y %r14 +#define BUFFER %r13 + +#define I %rax +#define A1 %rbx +#define A2 %rcx +#define LDA3 %rdx +#define Y1 %rbp + +#ifdef ALIGNED_ACCESS +#define MM %r15 +#else +#define MM M +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq OLD_M, M + movq OLD_N, N + movq OLD_A, A + movq OLD_LDA, LDA + movq OLD_X, X +#else + movq OLD_M, M + movq OLD_N, N + movq OLD_A, A + movq OLD_LDA, LDA +#endif + + movq STACK_INCX, INCX + movq STACK_Y, Y + movq STACK_INCY, INCY + movq STACK_BUFFER, BUFFER + +#ifndef WINDOWS_ABI + movss %xmm0, ALPHA +#else + movss %xmm3, ALPHA +#endif + + leaq (,INCX, SIZE), INCX + leaq (,INCY, SIZE), INCY + leaq (,LDA, SIZE), LDA + + leaq (LDA, LDA, 2), LDA3 + +#ifdef ALIGNED_ACCESS + movq M, MM + testq $4 * SIZE - 1, A + je .L0X + cmpq $3, M + jle .L0X + + movq A, MM + sarq $BASE_SHIFT, MM + andq $3, MM + subq $4, MM + addq M, MM + +.L0X: +#endif + + testq N, N # if n <= 0 
goto END + jle .L999 + testq M, M # if n <= 0 goto END + jle .L999 + + subq $-32 * SIZE, A + + movq BUFFER, Y1 + + pxor %xmm0, %xmm0 + + movq M, %rax +#ifdef ALIGNED_ACCESS + addq $19, %rax +#else + addq $16, %rax +#endif + sarq $4, %rax + ALIGN_3 + +.L01: + movaps %xmm0, 0 * SIZE(Y1) + movaps %xmm0, 4 * SIZE(Y1) + movaps %xmm0, 8 * SIZE(Y1) + movaps %xmm0, 12 * SIZE(Y1) + addq $16 * SIZE, Y1 + decq %rax + jg .L01 + ALIGN_3 + +.L10: +#ifdef ALIGNED_ACCESS + movq A, %rax + andq $4 * SIZE - 1, %rax + addq %rax, BUFFER + + testq $4 * SIZE - 1, LDA + jne .L100 +#endif + +#if GEMV_UNROLL >= 8 + + cmpq $8, N + jl .L20 + ALIGN_3 + +.L11: + subq $8, N + + leaq 32 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 4), A2 + leaq (A, LDA, 8), A + + movss (X), %xmm8 + addq INCX, X + movss (X), %xmm9 + addq INCX, X + movss (X), %xmm10 + addq INCX, X + movss (X), %xmm11 + addq INCX, X + movss (X), %xmm12 + addq INCX, X + movss (X), %xmm13 + addq INCX, X + movss (X), %xmm14 + addq INCX, X + movss (X), %xmm15 + addq INCX, X + + movss ALPHA, %xmm0 + + mulss %xmm0, %xmm8 + shufps $0, %xmm8, %xmm8 + mulss %xmm0, %xmm9 + shufps $0, %xmm9, %xmm9 + mulss %xmm0, %xmm10 + shufps $0, %xmm10, %xmm10 + mulss %xmm0, %xmm11 + shufps $0, %xmm11, %xmm11 + + mulss %xmm0, %xmm12 + shufps $0, %xmm12, %xmm12 + mulss %xmm0, %xmm13 + shufps $0, %xmm13, %xmm13 + mulss %xmm0, %xmm14 + shufps $0, %xmm14, %xmm14 + mulss %xmm0, %xmm15 + shufps $0, %xmm15, %xmm15 + +#ifdef ALIGNED_ACCESS + cmpq $3, M + jle .L17 + + testq $SIZE, A1 + je .L1X + + movss -32 * SIZE(A1), %xmm4 + movss -32 * SIZE(A1, LDA, 1), %xmm5 + movss -32 * SIZE(A1, LDA, 2), %xmm6 + movss -32 * SIZE(A1, LDA3, 1), %xmm7 + + movss -32 * SIZE(Y1), %xmm0 + + mulss %xmm8, %xmm4 + addss %xmm4, %xmm0 + movss -32 * SIZE(A2), %xmm4 + mulss %xmm9, %xmm5 + addss %xmm5, %xmm0 + movss -32 * SIZE(A2, LDA, 1), %xmm5 + mulss %xmm10, %xmm6 + addss %xmm6, %xmm0 + movss -32 * SIZE(A2, LDA, 2), %xmm6 + mulss %xmm11, %xmm7 + addss %xmm7, %xmm0 + movss -32 * SIZE(A2, LDA3, 1), %xmm7 + + mulss %xmm12, %xmm4 + addss %xmm4, %xmm0 + mulss %xmm13, %xmm5 + addss %xmm5, %xmm0 + mulss %xmm14, %xmm6 + addss %xmm6, %xmm0 + mulss %xmm15, %xmm7 + addss %xmm7, %xmm0 + + movss %xmm0, -32 * SIZE(Y1) + + addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, Y1 + ALIGN_3 + +.L1X: + testq $2 * SIZE, A1 + je .L1XX + + movsd -32 * SIZE(A1), %xmm4 + movsd -32 * SIZE(A1, LDA, 1), %xmm5 + movsd -32 * SIZE(A1, LDA, 2), %xmm6 + movsd -32 * SIZE(A1, LDA3, 1), %xmm7 + + movsd -32 * SIZE(Y1), %xmm0 + + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + movsd -32 * SIZE(A2), %xmm4 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + movsd -32 * SIZE(A2, LDA, 1), %xmm5 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + movsd -32 * SIZE(A2, LDA, 2), %xmm6 + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + movsd -32 * SIZE(A2, LDA3, 1), %xmm7 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm15, %xmm7 + addps %xmm7, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L1XX: +#endif + movq MM, I + sarq $4, I + jle .L15 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm4) + MOVUPS_A1 (-28 * SIZE, A1, %xmm5) + MOVUPS_A1 (-24 * SIZE, A1, %xmm6) + MOVUPS_A1 (-20 * SIZE, A1, %xmm7) + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-20 * SIZE, Y1, %xmm3) + + decq I + jle .L14 + ALIGN_3 + +.L13: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 
+ PREOFFSET(A1) +#endif + + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm4) + mulps %xmm8, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_A2 (-28 * SIZE, A1, LDA, 1, %xmm5) + mulps %xmm8, %xmm6 + addps %xmm6, %xmm2 + MOVUPS_A2 (-24 * SIZE, A1, LDA, 1, %xmm6) + mulps %xmm8, %xmm7 + addps %xmm7, %xmm3 + MOVUPS_A2 (-20 * SIZE, A1, LDA, 1, %xmm7) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA) +#endif + + mulps %xmm9, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A2 (-32 * SIZE, A1, LDA, 2, %xmm4) + mulps %xmm9, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_A2 (-28 * SIZE, A1, LDA, 2, %xmm5) + mulps %xmm9, %xmm6 + addps %xmm6, %xmm2 + MOVUPS_A2 (-24 * SIZE, A1, LDA, 2, %xmm6) + mulps %xmm9, %xmm7 + addps %xmm7, %xmm3 + MOVUPS_A2 (-20 * SIZE, A1, LDA, 2, %xmm7) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA, 2) +#endif + + mulps %xmm10, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A2 (-32 * SIZE, A1, LDA3, 1, %xmm4) + mulps %xmm10, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_A2 (-28 * SIZE, A1, LDA3, 1, %xmm5) + mulps %xmm10, %xmm6 + addps %xmm6, %xmm2 + MOVUPS_A2 (-24 * SIZE, A1, LDA3, 1, %xmm6) + mulps %xmm10, %xmm7 + addps %xmm7, %xmm3 + MOVUPS_A2 (-20 * SIZE, A1, LDA3, 1, %xmm7) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA3) +#endif + + mulps %xmm11, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A1 (-32 * SIZE, A2, %xmm4) + mulps %xmm11, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_A1 (-28 * SIZE, A2, %xmm5) + mulps %xmm11, %xmm6 + addps %xmm6, %xmm2 + MOVUPS_A1 (-24 * SIZE, A2, %xmm6) + mulps %xmm11, %xmm7 + addps %xmm7, %xmm3 + MOVUPS_A1 (-20 * SIZE, A2, %xmm7) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2) +#endif + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A2 (-32 * SIZE, A2, LDA, 1, %xmm4) + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_A2 (-28 * SIZE, A2, LDA, 1, %xmm5) + mulps %xmm12, %xmm6 + addps %xmm6, %xmm2 + MOVUPS_A2 (-24 * SIZE, A2, LDA, 1, %xmm6) + mulps %xmm12, %xmm7 + addps %xmm7, %xmm3 + MOVUPS_A2 (-20 * SIZE, A2, LDA, 1, %xmm7) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA) +#endif + + mulps %xmm13, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A2 (-32 * SIZE, A2, LDA, 2, %xmm4) + mulps %xmm13, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_A2 (-28 * SIZE, A2, LDA, 2, %xmm5) + mulps %xmm13, %xmm6 + addps %xmm6, %xmm2 + MOVUPS_A2 (-24 * SIZE, A2, LDA, 2, %xmm6) + mulps %xmm13, %xmm7 + addps %xmm7, %xmm3 + MOVUPS_A2 (-20 * SIZE, A2, LDA, 2, %xmm7) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA, 2) +#endif + + mulps %xmm14, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A2 (-32 * SIZE, A2, LDA3, 1, %xmm4) + mulps %xmm14, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_A2 (-28 * SIZE, A2, LDA3, 1, %xmm5) + mulps %xmm14, %xmm6 + addps %xmm6, %xmm2 + MOVUPS_A2 (-24 * SIZE, A2, LDA3, 1, %xmm6) + mulps %xmm14, %xmm7 + addps %xmm7, %xmm3 + MOVUPS_A2 (-20 * SIZE, A2, LDA3, 1, %xmm7) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA3) +#endif + + mulps %xmm15, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A1 (-16 * SIZE, A1, %xmm4) + mulps %xmm15, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_A1 (-12 * SIZE, A1, %xmm5) + mulps %xmm15, %xmm6 + addps %xmm6, %xmm2 + MOVUPS_A1 ( -8 * SIZE, A1, %xmm6) + mulps %xmm15, %xmm7 + addps %xmm7, %xmm3 + MOVUPS_A1 ( -4 * SIZE, A1, %xmm7) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + 
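/* [editor's reference note -- not part of the imported GotoBLAS2 file]
   GEMV_UNROLL >= 8 main loop (.L13): each pass updates 16 consecutive
   elements of the buffered result with the contributions of 8 columns,
   using the broadcast factors alpha*x[j] prepared in xmm8..xmm15.  The
   four stores above write the finished block back to the zero-initialized
   BUFFER set up at .L01; the loads below fetch the next 16 elements so the
   loop stays software-pipelined.  A mathematically equivalent C sketch,
   assuming column-major A, unit strides and no blocking (gemv_n_ref is an
   illustrative name, not a symbol exported by this library):

       static void gemv_n_ref(long m, long n, float alpha, const float *a,
                              long lda, const float *x, float *ybuf)
       {
           for (long j = 0; j < n; j++) {         // one column at a time
               float t = alpha * x[j];            // broadcast factor
               for (long i = 0; i < m; i++)       // stream down the column
                   ybuf[i] += t * a[i + j * lda];
           }
       }
*/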
MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -8 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L13 + ALIGN_3 + +.L14: + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm4) + mulps %xmm8, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_A2 (-28 * SIZE, A1, LDA, 1, %xmm5) + mulps %xmm8, %xmm6 + addps %xmm6, %xmm2 + MOVUPS_A2 (-24 * SIZE, A1, LDA, 1, %xmm6) + mulps %xmm8, %xmm7 + addps %xmm7, %xmm3 + MOVUPS_A2 (-20 * SIZE, A1, LDA, 1, %xmm7) + + mulps %xmm9, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A2 (-32 * SIZE, A1, LDA, 2, %xmm4) + mulps %xmm9, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_A2 (-28 * SIZE, A1, LDA, 2, %xmm5) + mulps %xmm9, %xmm6 + addps %xmm6, %xmm2 + MOVUPS_A2 (-24 * SIZE, A1, LDA, 2, %xmm6) + mulps %xmm9, %xmm7 + addps %xmm7, %xmm3 + MOVUPS_A2 (-20 * SIZE, A1, LDA, 2, %xmm7) + + mulps %xmm10, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A2 (-32 * SIZE, A1, LDA3, 1, %xmm4) + mulps %xmm10, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_A2 (-28 * SIZE, A1, LDA3, 1, %xmm5) + mulps %xmm10, %xmm6 + addps %xmm6, %xmm2 + MOVUPS_A2 (-24 * SIZE, A1, LDA3, 1, %xmm6) + mulps %xmm10, %xmm7 + addps %xmm7, %xmm3 + MOVUPS_A2 (-20 * SIZE, A1, LDA3, 1, %xmm7) + + mulps %xmm11, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A1 (-32 * SIZE, A2, %xmm4) + mulps %xmm11, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_A1 (-28 * SIZE, A2, %xmm5) + mulps %xmm11, %xmm6 + addps %xmm6, %xmm2 + MOVUPS_A1 (-24 * SIZE, A2, %xmm6) + mulps %xmm11, %xmm7 + addps %xmm7, %xmm3 + MOVUPS_A1 (-20 * SIZE, A2, %xmm7) + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A2 (-32 * SIZE, A2, LDA, 1, %xmm4) + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_A2 (-28 * SIZE, A2, LDA, 1, %xmm5) + mulps %xmm12, %xmm6 + addps %xmm6, %xmm2 + MOVUPS_A2 (-24 * SIZE, A2, LDA, 1, %xmm6) + mulps %xmm12, %xmm7 + addps %xmm7, %xmm3 + MOVUPS_A2 (-20 * SIZE, A2, LDA, 1, %xmm7) + + mulps %xmm13, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A2 (-32 * SIZE, A2, LDA, 2, %xmm4) + mulps %xmm13, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_A2 (-28 * SIZE, A2, LDA, 2, %xmm5) + mulps %xmm13, %xmm6 + addps %xmm6, %xmm2 + MOVUPS_A2 (-24 * SIZE, A2, LDA, 2, %xmm6) + mulps %xmm13, %xmm7 + addps %xmm7, %xmm3 + MOVUPS_A2 (-20 * SIZE, A2, LDA, 2, %xmm7) + + mulps %xmm14, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A2 (-32 * SIZE, A2, LDA3, 1, %xmm4) + mulps %xmm14, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_A2 (-28 * SIZE, A2, LDA3, 1, %xmm5) + mulps %xmm14, %xmm6 + addps %xmm6, %xmm2 + MOVUPS_A2 (-24 * SIZE, A2, LDA3, 1, %xmm6) + mulps %xmm14, %xmm7 + addps %xmm7, %xmm3 + MOVUPS_A2 (-20 * SIZE, A2, LDA3, 1, %xmm7) + + mulps %xmm15, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm15, %xmm5 + addps %xmm5, %xmm1 + mulps %xmm15, %xmm6 + addps %xmm6, %xmm2 + mulps %xmm15, %xmm7 + addps %xmm7, %xmm3 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + ALIGN_3 + +.L15: + testq $8, MM + je .L16 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm4) + MOVUPS_A1 (-28 * SIZE, A1, %xmm5) + + MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm6) + MOVUPS_A2 (-28 * SIZE, A1, LDA, 1, %xmm7) + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A2 (-32 * SIZE, A1, LDA, 2, %xmm4) + mulps %xmm8, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_A2 (-28 * SIZE, A1, LDA, 2, %xmm5) + + mulps %xmm9, %xmm6 + 
addps %xmm6, %xmm0 + MOVUPS_A2 (-32 * SIZE, A1, LDA3, 1, %xmm6) + mulps %xmm9, %xmm7 + addps %xmm7, %xmm1 + MOVUPS_A2 (-28 * SIZE, A1, LDA3, 1, %xmm7) + + mulps %xmm10, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A1 (-32 * SIZE, A2, %xmm4) + mulps %xmm10, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_A1 (-28 * SIZE, A2, %xmm5) + + mulps %xmm11, %xmm6 + addps %xmm6, %xmm0 + MOVUPS_A2 (-32 * SIZE, A2, LDA, 1, %xmm6) + mulps %xmm11, %xmm7 + addps %xmm7, %xmm1 + MOVUPS_A2 (-28 * SIZE, A2, LDA, 1, %xmm7) + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A2 (-32 * SIZE, A2, LDA, 2, %xmm4) + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_A2 (-28 * SIZE, A2, LDA, 2, %xmm5) + + mulps %xmm13, %xmm6 + addps %xmm6, %xmm0 + MOVUPS_A2 (-32 * SIZE, A2, LDA3, 1, %xmm6) + mulps %xmm13, %xmm7 + addps %xmm7, %xmm1 + MOVUPS_A2 (-28 * SIZE, A2, LDA3, 1, %xmm7) + + mulps %xmm14, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm14, %xmm5 + addps %xmm5, %xmm1 + + mulps %xmm15, %xmm6 + addps %xmm6, %xmm0 + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + mulps %xmm15, %xmm7 + addps %xmm7, %xmm1 + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, Y1 + ALIGN_3 + +.L16: + testq $4, MM + je .L17 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm4) + MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm5) + MOVUPS_A2 (-32 * SIZE, A1, LDA, 2, %xmm6) + MOVUPS_A2 (-32 * SIZE, A1, LDA3, 1, %xmm7) + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A1 (-32 * SIZE, A2, %xmm4) + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + MOVUPS_A2 (-32 * SIZE, A2, LDA, 1, %xmm5) + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + MOVUPS_A2 (-32 * SIZE, A2, LDA, 2, %xmm6) + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + MOVUPS_A2 (-32 * SIZE, A2, LDA3, 1, %xmm7) + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm15, %xmm7 + addps %xmm7, %xmm0 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L17: + testq $2, MM + je .L18 + + movsd -32 * SIZE(A1), %xmm4 + movsd -32 * SIZE(A1, LDA, 1), %xmm5 + movsd -32 * SIZE(A1, LDA, 2), %xmm6 + movsd -32 * SIZE(A1, LDA3, 1), %xmm7 + + movsd -32 * SIZE(Y1), %xmm0 + + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + movsd -32 * SIZE(A2), %xmm4 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + movsd -32 * SIZE(A2, LDA, 1), %xmm5 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + movsd -32 * SIZE(A2, LDA, 2), %xmm6 + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + movsd -32 * SIZE(A2, LDA3, 1), %xmm7 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm15, %xmm7 + addps %xmm7, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L18: + testq $1, MM + je .L19 + + movss -32 * SIZE(A1), %xmm4 + movss -32 * SIZE(A1, LDA, 1), %xmm5 + movss -32 * SIZE(A1, LDA, 2), %xmm6 + movss -32 * SIZE(A1, LDA3, 1), %xmm7 + + movss -32 * SIZE(Y1), %xmm0 + + mulss %xmm8, %xmm4 + addss %xmm4, %xmm0 + movss -32 * SIZE(A2), %xmm4 + mulss %xmm9, %xmm5 + addss %xmm5, %xmm0 + movss -32 * SIZE(A2, LDA, 1), %xmm5 + mulss %xmm10, %xmm6 + addss %xmm6, %xmm0 + movss -32 * SIZE(A2, LDA, 2), %xmm6 + mulss %xmm11, %xmm7 + addss %xmm7, %xmm0 + movss -32 * SIZE(A2, LDA3, 1), %xmm7 + + mulss %xmm12, %xmm4 + addss %xmm4, %xmm0 + mulss %xmm13, %xmm5 + addss %xmm5, %xmm0 + mulss %xmm14, %xmm6 + addss %xmm6, %xmm0 + mulss %xmm15, %xmm7 + addss %xmm7, %xmm0 + + 
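+	# last single row of the 8-column panel: write the accumulated scalar back to the y buffer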
movss %xmm0, -32 * SIZE(Y1) + ALIGN_3 + +.L19: + cmpq $8, N + jge .L11 + ALIGN_3 + +.L20: +#endif + + cmpq $4, N + jl .L30 + +#if GEMV_UNROLL == 4 + ALIGN_3 + +.L21: +#endif + subq $4, N + + leaq 32 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 2), A2 + leaq (A, LDA, 4), A + + movss (X), %xmm12 + addq INCX, X + movss (X), %xmm13 + addq INCX, X + movss (X), %xmm14 + addq INCX, X + movss (X), %xmm15 + addq INCX, X + + movss ALPHA, %xmm0 + + mulss %xmm0, %xmm12 + mulss %xmm0, %xmm13 + mulss %xmm0, %xmm14 + mulss %xmm0, %xmm15 + + shufps $0, %xmm12, %xmm12 + shufps $0, %xmm13, %xmm13 + shufps $0, %xmm14, %xmm14 + shufps $0, %xmm15, %xmm15 + +#ifdef ALIGNED_ACCESS + cmpq $3, M + jle .L27 + + testq $SIZE, A1 + je .L2X + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(A1, LDA), %xmm1 + movss -32 * SIZE(A2), %xmm2 + movss -32 * SIZE(A2, LDA), %xmm3 + + movss -32 * SIZE(Y1), %xmm8 + + mulss %xmm12, %xmm0 + addss %xmm0, %xmm8 + mulss %xmm13, %xmm1 + addss %xmm1, %xmm8 + mulss %xmm14, %xmm2 + addss %xmm2, %xmm8 + mulss %xmm15, %xmm3 + addss %xmm3, %xmm8 + + movss %xmm8, -32 * SIZE(Y1) + + addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, Y1 + ALIGN_3 + +.L2X: + testq $2 * SIZE, A1 + je .L2XX + + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(A1, LDA), %xmm1 + movsd -32 * SIZE(A2), %xmm2 + movsd -32 * SIZE(A2, LDA), %xmm3 + + movsd -32 * SIZE(Y1), %xmm8 + + mulps %xmm12, %xmm0 + addps %xmm0, %xmm8 + mulps %xmm13, %xmm1 + addps %xmm1, %xmm8 + mulps %xmm14, %xmm2 + addps %xmm2, %xmm8 + mulps %xmm15, %xmm3 + addps %xmm3, %xmm8 + + movlps %xmm8, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L2XX: +#endif + + movq MM, I + sarq $4, I + jle .L25 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm8) + MOVUPS_A1 (-28 * SIZE, A1, %xmm9) + MOVUPS_A1 (-24 * SIZE, A1, %xmm10) + MOVUPS_A1 (-20 * SIZE, A1, %xmm11) + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-20 * SIZE, Y1, %xmm3) + + MOVUPS_A2(-32 * SIZE, A1, LDA, 1, %xmm4) + MOVUPS_A2(-28 * SIZE, A1, LDA, 1, %xmm5) + MOVUPS_A2(-24 * SIZE, A1, LDA, 1, %xmm6) + MOVUPS_A2(-20 * SIZE, A1, LDA, 1, %xmm7) + + decq I + jle .L24 + ALIGN_3 + +.L23: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1(-32 * SIZE, A2, %xmm8) + mulps %xmm12, %xmm9 + addps %xmm9, %xmm1 + MOVUPS_A1(-28 * SIZE, A2, %xmm9) + mulps %xmm12, %xmm10 + addps %xmm10, %xmm2 + MOVUPS_A1(-24 * SIZE, A2, %xmm10) + mulps %xmm12, %xmm11 + addps %xmm11, %xmm3 + MOVUPS_A1(-20 * SIZE, A2, %xmm11) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1, LDA) +#endif + + mulps %xmm13, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A2(-32 * SIZE, A2, LDA, 1, %xmm4) + mulps %xmm13, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_A2(-28 * SIZE, A2, LDA, 1, %xmm5) + mulps %xmm13, %xmm6 + addps %xmm6, %xmm2 + MOVUPS_A2(-24 * SIZE, A2, LDA, 1, %xmm6) + mulps %xmm13, %xmm7 + addps %xmm7, %xmm3 + MOVUPS_A2(-20 * SIZE, A2, LDA, 1, %xmm7) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) +#endif + + mulps %xmm14, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1 (-16 * SIZE, A1, %xmm8) + mulps %xmm14, %xmm9 + addps %xmm9, %xmm1 + MOVUPS_A1 (-12 * SIZE, A1, %xmm9) + mulps %xmm14, %xmm10 + addps %xmm10, %xmm2 + MOVUPS_A1 ( -8 * SIZE, A1, %xmm10) + mulps %xmm14, %xmm11 + addps %xmm11, %xmm3 + MOVUPS_A1 ( -4 * SIZE, A1, %xmm11) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2, LDA) +#endif + + mulps %xmm15, 
%xmm4 + addps %xmm4, %xmm0 + MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4) + mulps %xmm15, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm5) + mulps %xmm15, %xmm6 + addps %xmm6, %xmm2 + MOVUPS_A2( -8 * SIZE, A1, LDA, 1, %xmm6) + mulps %xmm15, %xmm7 + addps %xmm7, %xmm3 + MOVUPS_A2( -4 * SIZE, A1, LDA, 1, %xmm7) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 2 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -8 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L23 + ALIGN_3 + +.L24: + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1(-32 * SIZE, A2, %xmm8) + mulps %xmm12, %xmm9 + addps %xmm9, %xmm1 + MOVUPS_A1(-28 * SIZE, A2, %xmm9) + mulps %xmm12, %xmm10 + addps %xmm10, %xmm2 + MOVUPS_A1(-24 * SIZE, A2, %xmm10) + mulps %xmm12, %xmm11 + addps %xmm11, %xmm3 + MOVUPS_A1(-20 * SIZE, A2, %xmm11) + + mulps %xmm13, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A2(-32 * SIZE, A2, LDA, 1, %xmm4) + mulps %xmm13, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_A2(-28 * SIZE, A2, LDA, 1, %xmm5) + mulps %xmm13, %xmm6 + addps %xmm6, %xmm2 + MOVUPS_A2(-24 * SIZE, A2, LDA, 1, %xmm6) + mulps %xmm13, %xmm7 + addps %xmm7, %xmm3 + MOVUPS_A2(-20 * SIZE, A2, LDA, 1, %xmm7) + + mulps %xmm14, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm14, %xmm9 + addps %xmm9, %xmm1 + mulps %xmm14, %xmm10 + addps %xmm10, %xmm2 + mulps %xmm14, %xmm11 + addps %xmm11, %xmm3 + + mulps %xmm15, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + mulps %xmm15, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + mulps %xmm15, %xmm6 + addps %xmm6, %xmm2 + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + mulps %xmm15, %xmm7 + addps %xmm7, %xmm3 + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + ALIGN_3 + +.L25: + testq $8, MM + je .L26 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm8) + MOVUPS_A1 (-28 * SIZE, A1, %xmm9) + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + + MOVUPS_A2(-32 * SIZE, A1, LDA, 1, %xmm4) + MOVUPS_A2(-28 * SIZE, A1, LDA, 1, %xmm5) + + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1(-32 * SIZE, A2, %xmm8) + mulps %xmm12, %xmm9 + addps %xmm9, %xmm1 + MOVUPS_A1(-28 * SIZE, A2, %xmm9) + + mulps %xmm13, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A2(-32 * SIZE, A2, LDA, 1, %xmm4) + mulps %xmm13, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_A2(-28 * SIZE, A2, LDA, 1, %xmm5) + mulps %xmm13, %xmm6 + addps %xmm6, %xmm2 + + mulps %xmm14, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm14, %xmm9 + addps %xmm9, %xmm1 + + mulps %xmm15, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + mulps %xmm15, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, Y1 + ALIGN_3 + +.L26: + testq $4, MM + je .L27 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm8) + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + + MOVUPS_A2(-32 * SIZE, A1, LDA, 1, %xmm4) + + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1(-32 * SIZE, A2, %xmm8) + + mulps %xmm13, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A2(-32 * SIZE, A2, LDA, 1, %xmm4) + + mulps %xmm14, %xmm8 + addps %xmm8, %xmm0 + + mulps %xmm15, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + 
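+	# .L27/.L28 below handle the 2-row and 1-row remainders of the 4-column panel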
ALIGN_3 + +.L27: + testq $2, MM + je .L28 + + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(A1, LDA), %xmm1 + movsd -32 * SIZE(A2), %xmm2 + movsd -32 * SIZE(A2, LDA), %xmm3 + + movsd -32 * SIZE(Y1), %xmm8 + + mulps %xmm12, %xmm0 + addps %xmm0, %xmm8 + mulps %xmm13, %xmm1 + addps %xmm1, %xmm8 + mulps %xmm14, %xmm2 + addps %xmm2, %xmm8 + mulps %xmm15, %xmm3 + addps %xmm3, %xmm8 + + movlps %xmm8, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L28: + testq $1, MM +#if GEMV_UNROLL == 4 + je .L29 +#else + je .L30 +#endif + + movss -32 * SIZE(Y1), %xmm8 + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(A1, LDA), %xmm1 + movss -32 * SIZE(A2), %xmm2 + movss -32 * SIZE(A2, LDA), %xmm3 + + mulss %xmm12, %xmm0 + addss %xmm0, %xmm8 + mulss %xmm13, %xmm1 + addss %xmm1, %xmm8 + mulss %xmm14, %xmm2 + addss %xmm2, %xmm8 + mulss %xmm15, %xmm3 + addss %xmm3, %xmm8 + + movss %xmm8, -32 * SIZE(Y1) + ALIGN_3 + +#if GEMV_UNROLL == 4 +.L29: + cmpq $4, N + jge .L21 +#endif + ALIGN_3 + +.L30: + testq N, N + jle .L990 + + cmpq $3, N + jne .L40 + + leaq 32 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 2), A2 + + movss (X), %xmm12 + addq INCX, X + movss (X), %xmm13 + addq INCX, X + movss (X), %xmm14 + addq INCX, X + + movss ALPHA, %xmm0 + + mulss %xmm0, %xmm12 + mulss %xmm0, %xmm13 + mulss %xmm0, %xmm14 + + shufps $0, %xmm12, %xmm12 + shufps $0, %xmm13, %xmm13 + shufps $0, %xmm14, %xmm14 + +#ifdef ALIGNED_ACCESS + cmpq $3, M + jle .L37 + + testq $SIZE, A1 + je .L3X + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(A1, LDA), %xmm1 + movss -32 * SIZE(A2), %xmm2 + + movss -32 * SIZE(Y1), %xmm8 + + mulss %xmm12, %xmm0 + addss %xmm0, %xmm8 + mulss %xmm13, %xmm1 + addss %xmm1, %xmm8 + mulss %xmm14, %xmm2 + addss %xmm2, %xmm8 + + movss %xmm8, -32 * SIZE(Y1) + + addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, Y1 + ALIGN_3 + +.L3X: + testq $2 * SIZE, A1 + je .L3XX + + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(A1, LDA), %xmm1 + movsd -32 * SIZE(A2), %xmm2 + + movsd -32 * SIZE(Y1), %xmm8 + + mulps %xmm12, %xmm0 + addps %xmm0, %xmm8 + mulps %xmm13, %xmm1 + addps %xmm1, %xmm8 + mulps %xmm14, %xmm2 + addps %xmm2, %xmm8 + + movlps %xmm8, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L3XX: +#endif + + movq MM, I + sarq $4, I + jle .L35 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm8) + MOVUPS_A1 (-28 * SIZE, A1, %xmm9) + MOVUPS_A1 (-24 * SIZE, A1, %xmm10) + MOVUPS_A1 (-20 * SIZE, A1, %xmm11) + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-20 * SIZE, Y1, %xmm3) + + MOVUPS_A2(-32 * SIZE, A1, LDA, 1, %xmm4) + MOVUPS_A2(-28 * SIZE, A1, LDA, 1, %xmm5) + MOVUPS_A2(-24 * SIZE, A1, LDA, 1, %xmm6) + MOVUPS_A2(-20 * SIZE, A1, LDA, 1, %xmm7) + + decq I + jle .L34 + ALIGN_3 + +.L33: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1(-32 * SIZE, A2, %xmm8) + mulps %xmm12, %xmm9 + addps %xmm9, %xmm1 + MOVUPS_A1(-28 * SIZE, A2, %xmm9) + mulps %xmm12, %xmm10 + addps %xmm10, %xmm2 + MOVUPS_A1(-24 * SIZE, A2, %xmm10) + mulps %xmm12, %xmm11 + addps %xmm11, %xmm3 + MOVUPS_A1(-20 * SIZE, A2, %xmm11) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1, LDA) +#endif + + mulps %xmm13, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4) + mulps %xmm13, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm5) + mulps %xmm13, %xmm6 + addps %xmm6, %xmm2 + 
MOVUPS_A2( -8 * SIZE, A1, LDA, 1, %xmm6) + mulps %xmm13, %xmm7 + addps %xmm7, %xmm3 + MOVUPS_A2( -4 * SIZE, A1, LDA, 1, %xmm7) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A2) +#endif + + mulps %xmm14, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1 (-16 * SIZE, A1, %xmm8) + mulps %xmm14, %xmm9 + addps %xmm9, %xmm1 + MOVUPS_A1 (-12 * SIZE, A1, %xmm9) + mulps %xmm14, %xmm10 + addps %xmm10, %xmm2 + MOVUPS_A1 ( -8 * SIZE, A1, %xmm10) + mulps %xmm14, %xmm11 + addps %xmm11, %xmm3 + MOVUPS_A1 ( -4 * SIZE, A1, %xmm11) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 3 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -8 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L33 + ALIGN_3 + +.L34: + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1(-32 * SIZE, A2, %xmm8) + mulps %xmm12, %xmm9 + addps %xmm9, %xmm1 + MOVUPS_A1(-28 * SIZE, A2, %xmm9) + mulps %xmm12, %xmm10 + addps %xmm10, %xmm2 + MOVUPS_A1(-24 * SIZE, A2, %xmm10) + mulps %xmm12, %xmm11 + addps %xmm11, %xmm3 + MOVUPS_A1(-20 * SIZE, A2, %xmm11) + + mulps %xmm13, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm1 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm2 + mulps %xmm13, %xmm7 + addps %xmm7, %xmm3 + + mulps %xmm14, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + mulps %xmm14, %xmm9 + addps %xmm9, %xmm1 + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + mulps %xmm14, %xmm10 + addps %xmm10, %xmm2 + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + mulps %xmm14, %xmm11 + addps %xmm11, %xmm3 + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + ALIGN_3 + +.L35: + testq $8, MM + je .L36 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm8) + MOVUPS_A1 (-28 * SIZE, A1, %xmm9) + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + + MOVUPS_A2(-32 * SIZE, A1, LDA, 1, %xmm4) + MOVUPS_A2(-28 * SIZE, A1, LDA, 1, %xmm5) + + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1(-32 * SIZE, A2, %xmm8) + mulps %xmm12, %xmm9 + addps %xmm9, %xmm1 + MOVUPS_A1(-28 * SIZE, A2, %xmm9) + + mulps %xmm13, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm1 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm2 + + mulps %xmm14, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm14, %xmm9 + addps %xmm9, %xmm1 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, Y1 + ALIGN_3 + +.L36: + testq $4, MM + je .L37 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm8) + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + + MOVUPS_A2(-32 * SIZE, A1, LDA, 1, %xmm4) + + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1(-32 * SIZE, A2, %xmm8) + + mulps %xmm13, %xmm4 + addps %xmm4, %xmm0 + + mulps %xmm14, %xmm8 + addps %xmm8, %xmm0 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L37: + testq $2, MM + je .L38 + + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(A1, LDA), %xmm1 + movsd -32 * SIZE(A2), %xmm2 + + movsd -32 * SIZE(Y1), %xmm8 + + mulps %xmm12, %xmm0 + addps %xmm0, %xmm8 + mulps %xmm13, %xmm1 + addps %xmm1, %xmm8 + mulps %xmm14, %xmm2 + addps %xmm2, %xmm8 + + movlps %xmm8, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + 
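+	# .L38 below handles the final single row of the 3-column case, then jumps to .L990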
ALIGN_3 + +.L38: + testq $1, MM + je .L990 + + movss -32 * SIZE(Y1), %xmm8 + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(A1, LDA), %xmm1 + movss -32 * SIZE(A2), %xmm2 + + mulss %xmm12, %xmm0 + addss %xmm0, %xmm8 + mulss %xmm13, %xmm1 + addss %xmm1, %xmm8 + mulss %xmm14, %xmm2 + addss %xmm2, %xmm8 + + movss %xmm8, -32 * SIZE(Y1) + jmp .L990 + ALIGN_3 + +.L40: + cmpq $2, N + jne .L50 + + leaq 32 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 1), A2 + leaq (A, LDA, 2), A + + movss (X), %xmm12 + addq INCX, X + movss (X), %xmm13 + addq INCX, X + + movss ALPHA, %xmm0 + + mulss %xmm0, %xmm12 + mulss %xmm0, %xmm13 + + shufps $0, %xmm12, %xmm12 + shufps $0, %xmm13, %xmm13 + +#ifdef ALIGNED_ACCESS + cmpq $3, M + jle .L47 + + testq $SIZE, A1 + je .L4X + + movss -32 * SIZE(Y1), %xmm8 + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(A2), %xmm1 + + mulss %xmm12, %xmm0 + addss %xmm0, %xmm8 + mulss %xmm13, %xmm1 + addss %xmm1, %xmm8 + + movss %xmm8, -32 * SIZE(Y1) + + addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, Y1 + ALIGN_3 + +.L4X: + testq $2 * SIZE, A1 + je .L4XX + + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(A2), %xmm1 + + movsd -32 * SIZE(Y1), %xmm8 + + mulps %xmm12, %xmm0 + addps %xmm0, %xmm8 + mulps %xmm13, %xmm1 + addps %xmm1, %xmm8 + + movlps %xmm8, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L4XX: +#endif + + movq MM, I + sarq $4, I + jle .L45 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm8) + MOVUPS_A1 (-28 * SIZE, A1, %xmm9) + MOVUPS_A1 (-24 * SIZE, A1, %xmm10) + MOVUPS_A1 (-20 * SIZE, A1, %xmm11) + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-20 * SIZE, Y1, %xmm3) + + MOVUPS_A1(-32 * SIZE, A2, %xmm4) + MOVUPS_A1(-28 * SIZE, A2, %xmm5) + MOVUPS_A1(-24 * SIZE, A2, %xmm6) + MOVUPS_A1(-20 * SIZE, A2, %xmm7) + + decq I + jle .L44 + ALIGN_3 + +.L43: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1 (-16 * SIZE, A1, %xmm8) + mulps %xmm12, %xmm9 + addps %xmm9, %xmm1 + MOVUPS_A1 (-12 * SIZE, A1, %xmm9) + mulps %xmm12, %xmm10 + addps %xmm10, %xmm2 + MOVUPS_A1 ( -8 * SIZE, A1, %xmm10) + mulps %xmm12, %xmm11 + addps %xmm11, %xmm3 + MOVUPS_A1 ( -4 * SIZE, A1, %xmm11) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A2) +#endif + + mulps %xmm13, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_A1(-16 * SIZE, A2, %xmm4) + mulps %xmm13, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_A1(-12 * SIZE, A2, %xmm5) + mulps %xmm13, %xmm6 + addps %xmm6, %xmm2 + MOVUPS_A1( -8 * SIZE, A2, %xmm6) + mulps %xmm13, %xmm7 + addps %xmm7, %xmm3 + MOVUPS_A1( -4 * SIZE, A2, %xmm7) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 4 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -8 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L43 + ALIGN_3 + +.L44: + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm1 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm2 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm3 + + mulps %xmm13, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + mulps %xmm13, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + mulps %xmm13, 
%xmm6 + addps %xmm6, %xmm2 + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + mulps %xmm13, %xmm7 + addps %xmm7, %xmm3 + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + ALIGN_3 + +.L45: + testq $8, MM + je .L46 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm8) + MOVUPS_A1 (-28 * SIZE, A1, %xmm9) + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1(-32 * SIZE, A2, %xmm4) + mulps %xmm12, %xmm9 + addps %xmm9, %xmm1 + MOVUPS_A1(-28 * SIZE, A2, %xmm5) + + mulps %xmm13, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + mulps %xmm13, %xmm5 + addps %xmm5, %xmm1 + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, Y1 + ALIGN_3 + +.L46: + testq $4, MM + je .L47 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm8) + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1(-32 * SIZE, A2, %xmm4) + mulps %xmm13, %xmm4 + addps %xmm4, %xmm0 + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L47: + testq $2, MM + je .L48 + + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(A2), %xmm1 + + movsd -32 * SIZE(Y1), %xmm8 + + mulps %xmm12, %xmm0 + addps %xmm0, %xmm8 + mulps %xmm13, %xmm1 + addps %xmm1, %xmm8 + + movlps %xmm8, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L48: + testq $1, MM + je .L990 + + movss -32 * SIZE(Y1), %xmm8 + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(A2), %xmm1 + + mulss %xmm12, %xmm0 + addss %xmm0, %xmm8 + mulss %xmm13, %xmm1 + addss %xmm1, %xmm8 + + movss %xmm8, -32 * SIZE(Y1) + jmp .L990 + ALIGN_3 + +.L50: + cmpq $1, N + jne .L990 + + leaq 32 * SIZE(BUFFER), Y1 + movq A, A1 + + movss (X), %xmm12 + + mulss ALPHA, %xmm12 + shufps $0, %xmm12, %xmm12 + +#ifdef ALIGNED_ACCESS + cmpq $3, M + jle .L57 + + testq $SIZE, A1 + je .L5X + + movss -32 * SIZE(Y1), %xmm8 + movss -32 * SIZE(A1), %xmm0 + + mulss %xmm12, %xmm0 + addss %xmm0, %xmm8 + + movss %xmm8, -32 * SIZE(Y1) + + addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, Y1 + ALIGN_3 + +.L5X: + testq $2 * SIZE, A1 + je .L5XX + + movsd -32 * SIZE(Y1), %xmm8 + movsd -32 * SIZE(A1), %xmm0 + + mulps %xmm12, %xmm0 + addps %xmm0, %xmm8 + + movlps %xmm8, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L5XX: +#endif + + movq MM, I + sarq $4, I + jle .L55 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm8) + MOVUPS_A1 (-28 * SIZE, A1, %xmm9) + MOVUPS_A1 (-24 * SIZE, A1, %xmm10) + MOVUPS_A1 (-20 * SIZE, A1, %xmm11) + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-20 * SIZE, Y1, %xmm3) + + decq I + jle .L54 + ALIGN_3 + +.L53: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_A1 (-16 * SIZE, A1, %xmm8) + mulps %xmm12, %xmm9 + addps %xmm9, %xmm1 + MOVUPS_A1 (-12 * SIZE, A1, %xmm9) + mulps %xmm12, %xmm10 + addps %xmm10, %xmm2 + MOVUPS_A1 ( -8 * SIZE, A1, %xmm10) + mulps %xmm12, %xmm11 + addps %xmm11, %xmm3 + MOVUPS_A1 ( -4 * SIZE, A1, %xmm11) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 8 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -8 * 
SIZE, Y1, %xmm2) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L53 + ALIGN_3 + +.L54: + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + mulps %xmm12, %xmm9 + addps %xmm9, %xmm1 + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + mulps %xmm12, %xmm10 + addps %xmm10, %xmm2 + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + mulps %xmm12, %xmm11 + addps %xmm11, %xmm3 + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, Y1 + ALIGN_3 + +.L55: + testq $8, MM + je .L56 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm8) + MOVUPS_A1 (-28 * SIZE, A1, %xmm9) + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + mulps %xmm12, %xmm9 + addps %xmm9, %xmm1 + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + + addq $8 * SIZE, A1 + addq $8 * SIZE, Y1 + ALIGN_3 + +.L56: + testq $4, MM + je .L57 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm8) + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + + addq $4 * SIZE, A1 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L57: + testq $2, MM + je .L58 + + movsd -32 * SIZE(Y1), %xmm8 + movsd -32 * SIZE(A1), %xmm0 + + mulps %xmm12, %xmm0 + addps %xmm0, %xmm8 + + movlps %xmm8, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L58: + testq $1, MM + je .L990 + + movss -32 * SIZE(Y1), %xmm8 + movss -32 * SIZE(A1), %xmm0 + + mulss %xmm12, %xmm0 + addss %xmm0, %xmm8 + + movss %xmm8, -32 * SIZE(Y1) + +#ifdef ALIGNED_ACCESS + jmp .L990 + ALIGN_3 + +.L100: + testq $2 * SIZE - 1, LDA + jne .L200 + + cmpq $4, N + jl .L110 + ALIGN_3 + +.L101: + subq $4, N + + leaq 32 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 2), A2 + leaq (A, LDA, 4), A + + movss (X), %xmm12 + addq INCX, X + movss (X), %xmm13 + addq INCX, X + movss (X), %xmm14 + addq INCX, X + movss (X), %xmm15 + addq INCX, X + + movss ALPHA, %xmm0 + + mulss %xmm0, %xmm12 + mulss %xmm0, %xmm13 + mulss %xmm0, %xmm14 + mulss %xmm0, %xmm15 + + shufps $0, %xmm12, %xmm12 + shufps $0, %xmm13, %xmm13 + shufps $0, %xmm14, %xmm14 + shufps $0, %xmm15, %xmm15 + + cmpq $3, M + jle .L107 + + testq $SIZE, A1 + je .L10X + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(A1, LDA), %xmm1 + movss -32 * SIZE(A2), %xmm2 + movss -32 * SIZE(A2, LDA), %xmm3 + + movss -32 * SIZE(Y1), %xmm8 + + mulss %xmm12, %xmm0 + addss %xmm0, %xmm8 + mulss %xmm13, %xmm1 + addss %xmm1, %xmm8 + mulss %xmm14, %xmm2 + addss %xmm2, %xmm8 + mulss %xmm15, %xmm3 + addss %xmm3, %xmm8 + + movss %xmm8, -32 * SIZE(Y1) + + addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, Y1 + ALIGN_3 + +.L10X: + testq $2 * SIZE, A1 + je .L10XX + + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(A1, LDA), %xmm1 + movsd -32 * SIZE(A2), %xmm2 + movsd -32 * SIZE(A2, LDA), %xmm3 + + movsd -32 * SIZE(Y1), %xmm8 + + mulps %xmm12, %xmm0 + addps %xmm0, %xmm8 + mulps %xmm13, %xmm1 + addps %xmm1, %xmm8 + mulps %xmm14, %xmm2 + addps %xmm2, %xmm8 + mulps %xmm15, %xmm3 + addps %xmm3, %xmm8 + + movlps %xmm8, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L10XX: + movhps -32 * SIZE(A1, LDA), %xmm8 + movhps -32 * SIZE(A2, LDA), %xmm9 + + movq MM, I + sarq $4, I + jle .L105 + + movaps -32 * SIZE(A1), %xmm4 + movaps -28 * SIZE(A1), %xmm5 + movaps -24 * SIZE(A1), %xmm6 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-20 * SIZE, Y1, %xmm3) + + decq I 
+ jle .L104 + ALIGN_3 + +.L103: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(A1), %xmm7 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + movaps -30 * SIZE(A1, LDA), %xmm4 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm2 + movaps -26 * SIZE(A1, LDA), %xmm5 + mulps %xmm12, %xmm7 + addps %xmm7, %xmm3 + movaps -22 * SIZE(A1, LDA), %xmm6 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1, LDA) +#endif + + shufps $0x4e, %xmm4, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movaps -18 * SIZE(A1, LDA), %xmm8 + shufps $0x4e, %xmm5, %xmm4 + mulps %xmm13, %xmm4 + addps %xmm4, %xmm1 + movaps -32 * SIZE(A2), %xmm4 + shufps $0x4e, %xmm6, %xmm5 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm2 + movaps -28 * SIZE(A2), %xmm5 + shufps $0x4e, %xmm8, %xmm6 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm3 + movaps -24 * SIZE(A2), %xmm6 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) +#endif + + mulps %xmm14, %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(A2), %xmm7 + mulps %xmm14, %xmm5 + addps %xmm5, %xmm1 + movaps -30 * SIZE(A2, LDA), %xmm4 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm2 + movaps -26 * SIZE(A2, LDA), %xmm5 + mulps %xmm14, %xmm7 + addps %xmm7, %xmm3 + movaps -22 * SIZE(A2, LDA), %xmm6 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2, LDA) +#endif + + shufps $0x4e, %xmm4, %xmm9 + mulps %xmm15, %xmm9 + addps %xmm9, %xmm0 + movaps -18 * SIZE(A2, LDA), %xmm9 + shufps $0x4e, %xmm5, %xmm4 + mulps %xmm15, %xmm4 + addps %xmm4, %xmm1 + movaps -16 * SIZE(A1), %xmm4 + shufps $0x4e, %xmm6, %xmm5 + mulps %xmm15, %xmm5 + addps %xmm5, %xmm2 + movaps -12 * SIZE(A1), %xmm5 + shufps $0x4e, %xmm9, %xmm6 + mulps %xmm15, %xmm6 + addps %xmm6, %xmm3 + movaps -8 * SIZE(A1), %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 2 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -8 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L103 + ALIGN_3 + +.L104: + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(A1), %xmm7 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + movaps -30 * SIZE(A1, LDA), %xmm4 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm2 + movaps -26 * SIZE(A1, LDA), %xmm5 + mulps %xmm12, %xmm7 + addps %xmm7, %xmm3 + movaps -22 * SIZE(A1, LDA), %xmm6 + + shufps $0x4e, %xmm4, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movaps -18 * SIZE(A1, LDA), %xmm8 + shufps $0x4e, %xmm5, %xmm4 + mulps %xmm13, %xmm4 + addps %xmm4, %xmm1 + movaps -32 * SIZE(A2), %xmm4 + shufps $0x4e, %xmm6, %xmm5 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm2 + movaps -28 * SIZE(A2), %xmm5 + shufps $0x4e, %xmm8, %xmm6 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm3 + movaps -24 * SIZE(A2), %xmm6 + + mulps %xmm14, %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(A2), %xmm7 + mulps %xmm14, %xmm5 + addps %xmm5, %xmm1 + movaps -30 * SIZE(A2, LDA), %xmm4 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm2 + movaps -26 * SIZE(A2, LDA), %xmm5 + mulps %xmm14, %xmm7 + addps %xmm7, %xmm3 + movaps -22 * SIZE(A2, LDA), %xmm6 + + shufps $0x4e, %xmm4, %xmm9 + mulps %xmm15, %xmm9 + addps %xmm9, %xmm0 + movaps -18 * SIZE(A2, LDA), %xmm9 + shufps $0x4e, %xmm5, %xmm4 + mulps %xmm15, %xmm4 + addps %xmm4, %xmm1 + shufps 
$0x4e, %xmm6, %xmm5 + mulps %xmm15, %xmm5 + addps %xmm5, %xmm2 + shufps $0x4e, %xmm9, %xmm6 + mulps %xmm15, %xmm6 + addps %xmm6, %xmm3 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + ALIGN_3 + +.L105: + testq $8, MM + je .L106 + + movaps -32 * SIZE(A1), %xmm4 + movaps -28 * SIZE(A1), %xmm5 + movaps -30 * SIZE(A1, LDA), %xmm6 + movaps -26 * SIZE(A1, LDA), %xmm7 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -32 * SIZE(A2), %xmm4 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + movaps -28 * SIZE(A2), %xmm10 + + shufps $0x4e, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movaps -30 * SIZE(A2, LDA), %xmm11 + shufps $0x4e, %xmm7, %xmm6 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm1 + movaps %xmm7, %xmm8 + movaps -26 * SIZE(A2, LDA), %xmm7 + + mulps %xmm14, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm14, %xmm10 + addps %xmm10, %xmm1 + + shufps $0x4e, %xmm11, %xmm9 + mulps %xmm15, %xmm9 + addps %xmm9, %xmm0 + shufps $0x4e, %xmm7, %xmm11 + mulps %xmm15, %xmm11 + addps %xmm11, %xmm1 + movaps %xmm7, %xmm9 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, Y1 + ALIGN_3 + +.L106: + testq $4, MM + je .L107 + + movaps -32 * SIZE(A1), %xmm4 + movaps -30 * SIZE(A1, LDA), %xmm5 + movaps -32 * SIZE(A2), %xmm6 + movaps -30 * SIZE(A2, LDA), %xmm7 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + shufps $0x4e, %xmm5, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + shufps $0x4e, %xmm7, %xmm9 + mulps %xmm15, %xmm9 + addps %xmm9, %xmm0 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L107: + testq $2, MM + je .L108 + + movsd -32 * SIZE(A1), %xmm4 + movsd -32 * SIZE(A1, LDA), %xmm5 + movsd -32 * SIZE(A2), %xmm6 + movsd -32 * SIZE(A2, LDA), %xmm7 + + movsd -32 * SIZE(Y1), %xmm0 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm15, %xmm7 + addps %xmm7, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L108: + testq $1, MM + je .L109 + + movss -32 * SIZE(Y1), %xmm0 + + movss -32 * SIZE(A1), %xmm4 + movss -32 * SIZE(A1, LDA), %xmm5 + movss -32 * SIZE(A2), %xmm6 + movss -32 * SIZE(A2, LDA), %xmm7 + + mulss %xmm12, %xmm4 + addss %xmm4, %xmm0 + mulss %xmm13, %xmm5 + addss %xmm5, %xmm0 + mulss %xmm14, %xmm6 + addss %xmm6, %xmm0 + mulss %xmm15, %xmm7 + addss %xmm7, %xmm0 + + movss %xmm0, -32 * SIZE(Y1) + ALIGN_3 + +.L109: + cmpq $4, N + jge .L101 + ALIGN_3 + +.L110: + testq N, N + jle .L990 + + cmpq $3, N + jne .L120 + + leaq 32 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 2), A2 + leaq (A, LDA, 4), A + + movss (X), %xmm12 + addq INCX, X + movss (X), %xmm13 + addq INCX, X + movss (X), %xmm14 + addq INCX, X + + movss ALPHA, %xmm0 + + mulss %xmm0, %xmm12 + mulss %xmm0, %xmm13 + mulss %xmm0, %xmm14 + + shufps $0, %xmm12, %xmm12 + shufps $0, %xmm13, %xmm13 + shufps $0, %xmm14, %xmm14 + + cmpq $3, M + jle .L117 + + testq $SIZE, A1 + je .L11X + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(A1, LDA), %xmm1 + movss -32 * SIZE(A2), %xmm2 + + movss -32 * SIZE(Y1), %xmm8 + + mulss %xmm12, %xmm0 
+ addss %xmm0, %xmm8 + mulss %xmm13, %xmm1 + addss %xmm1, %xmm8 + mulss %xmm14, %xmm2 + addss %xmm2, %xmm8 + + movss %xmm8, -32 * SIZE(Y1) + + addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, Y1 + ALIGN_3 + +.L11X: + testq $2 * SIZE, A1 + je .L11XX + + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(A1, LDA), %xmm1 + movsd -32 * SIZE(A2), %xmm2 + + movsd -32 * SIZE(Y1), %xmm8 + + mulps %xmm12, %xmm0 + addps %xmm0, %xmm8 + mulps %xmm13, %xmm1 + addps %xmm1, %xmm8 + mulps %xmm14, %xmm2 + addps %xmm2, %xmm8 + + movlps %xmm8, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L11XX: + movhps -32 * SIZE(A1, LDA), %xmm8 + movhps -32 * SIZE(A2, LDA), %xmm9 + + movq MM, I + sarq $4, I + jle .L115 + + movaps -32 * SIZE(A1), %xmm4 + movaps -28 * SIZE(A1), %xmm5 + movaps -24 * SIZE(A1), %xmm6 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-20 * SIZE, Y1, %xmm3) + + decq I + jle .L114 + ALIGN_3 + +.L113: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(A1), %xmm7 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + movaps -30 * SIZE(A1, LDA), %xmm4 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm2 + movaps -26 * SIZE(A1, LDA), %xmm5 + mulps %xmm12, %xmm7 + addps %xmm7, %xmm3 + movaps -22 * SIZE(A1, LDA), %xmm6 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1, LDA) +#endif + + shufps $0x4e, %xmm4, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movaps -18 * SIZE(A1, LDA), %xmm8 + shufps $0x4e, %xmm5, %xmm4 + mulps %xmm13, %xmm4 + addps %xmm4, %xmm1 + movaps -32 * SIZE(A2), %xmm4 + shufps $0x4e, %xmm6, %xmm5 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm2 + movaps -28 * SIZE(A2), %xmm5 + shufps $0x4e, %xmm8, %xmm6 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm3 + movaps -24 * SIZE(A2), %xmm6 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A2) +#endif + + mulps %xmm14, %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(A2), %xmm7 + mulps %xmm14, %xmm5 + addps %xmm5, %xmm1 + movaps -16 * SIZE(A1), %xmm4 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm2 + movaps -12 * SIZE(A1), %xmm5 + mulps %xmm14, %xmm7 + addps %xmm7, %xmm3 + movaps -8 * SIZE(A1), %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 3 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -8 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L113 + ALIGN_3 + +.L114: + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(A1), %xmm7 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + movaps -30 * SIZE(A1, LDA), %xmm4 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm2 + movaps -26 * SIZE(A1, LDA), %xmm5 + mulps %xmm12, %xmm7 + addps %xmm7, %xmm3 + movaps -22 * SIZE(A1, LDA), %xmm6 + + shufps $0x4e, %xmm4, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movaps -18 * SIZE(A1, LDA), %xmm8 + shufps $0x4e, %xmm5, %xmm4 + mulps %xmm13, %xmm4 + addps %xmm4, %xmm1 + movaps -32 * SIZE(A2), %xmm4 + shufps $0x4e, %xmm6, %xmm5 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm2 + movaps -28 * SIZE(A2), %xmm5 + shufps $0x4e, %xmm8, %xmm6 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm3 + movaps -24 * SIZE(A2), %xmm6 + + mulps %xmm14, %xmm4 
+ addps %xmm4, %xmm0 + movaps -20 * SIZE(A2), %xmm7 + mulps %xmm14, %xmm5 + addps %xmm5, %xmm1 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm2 + mulps %xmm14, %xmm7 + addps %xmm7, %xmm3 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + ALIGN_3 + +.L115: + testq $8, MM + je .L116 + + movaps -32 * SIZE(A1), %xmm4 + movaps -28 * SIZE(A1), %xmm5 + movaps -30 * SIZE(A1, LDA), %xmm6 + movaps -26 * SIZE(A1, LDA), %xmm7 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -32 * SIZE(A2), %xmm4 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + movaps -28 * SIZE(A2), %xmm10 + + shufps $0x4e, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + shufps $0x4e, %xmm7, %xmm6 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm1 + movaps %xmm7, %xmm8 + + mulps %xmm14, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm14, %xmm10 + addps %xmm10, %xmm1 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, Y1 + ALIGN_3 + +.L116: + testq $4, MM + je .L117 + + movaps -32 * SIZE(A1), %xmm4 + movaps -30 * SIZE(A1, LDA), %xmm5 + movaps -32 * SIZE(A2), %xmm6 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + shufps $0x4e, %xmm5, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L117: + testq $2, MM + je .L118 + + movsd -32 * SIZE(A1), %xmm4 + movsd -32 * SIZE(A1, LDA), %xmm5 + movsd -32 * SIZE(A2), %xmm6 + + movsd -32 * SIZE(Y1), %xmm0 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L118: + testq $1, MM + je .L990 + + movss -32 * SIZE(Y1), %xmm0 + + movss -32 * SIZE(A1), %xmm4 + movss -32 * SIZE(A1, LDA), %xmm5 + movss -32 * SIZE(A2), %xmm6 + + mulss %xmm12, %xmm4 + addss %xmm4, %xmm0 + mulss %xmm13, %xmm5 + addss %xmm5, %xmm0 + mulss %xmm14, %xmm6 + addss %xmm6, %xmm0 + + movss %xmm0, -32 * SIZE(Y1) + jmp .L990 + ALIGN_3 + +.L120: + cmpq $2, N + jl .L130 + + leaq 32 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 1), A2 + leaq (A, LDA, 2), A + + movss (X), %xmm12 + addq INCX, X + movss (X), %xmm13 + addq INCX, X + + movss ALPHA, %xmm0 + + mulss %xmm0, %xmm12 + mulss %xmm0, %xmm13 + + shufps $0, %xmm12, %xmm12 + shufps $0, %xmm13, %xmm13 + + cmpq $3, M + jle .L127 + + testq $SIZE, A1 + je .L12X + + movss -32 * SIZE(Y1), %xmm8 + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(A2), %xmm1 + + mulss %xmm12, %xmm0 + addss %xmm0, %xmm8 + mulss %xmm13, %xmm1 + addss %xmm1, %xmm8 + + movss %xmm8, -32 * SIZE(Y1) + + addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, Y1 + ALIGN_3 + +.L12X: + testq $2 * SIZE, A1 + je .L12XX + + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(A2), %xmm1 + + movsd -32 * SIZE(Y1), %xmm8 + + mulps %xmm12, %xmm0 + addps %xmm0, %xmm8 + mulps %xmm13, %xmm1 + addps %xmm1, %xmm8 + + movlps %xmm8, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L12XX: + movhps -32 * SIZE(A2), %xmm8 + + movq MM, I + sarq $4, I + jle .L125 + + movaps -32 * SIZE(A1), %xmm4 + movaps -28 * 
SIZE(A1), %xmm5 + movaps -24 * SIZE(A1), %xmm6 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-20 * SIZE, Y1, %xmm3) + + decq I + jle .L124 + ALIGN_3 + +.L123: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(A1), %xmm7 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + movaps -30 * SIZE(A1, LDA), %xmm4 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm2 + movaps -26 * SIZE(A1, LDA), %xmm5 + mulps %xmm12, %xmm7 + addps %xmm7, %xmm3 + movaps -22 * SIZE(A1, LDA), %xmm6 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A2) +#endif + + shufps $0x4e, %xmm4, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movaps -18 * SIZE(A1, LDA), %xmm8 + shufps $0x4e, %xmm5, %xmm4 + mulps %xmm13, %xmm4 + addps %xmm4, %xmm1 + movaps -16 * SIZE(A1), %xmm4 + shufps $0x4e, %xmm6, %xmm5 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm2 + movaps -12 * SIZE(A1), %xmm5 + shufps $0x4e, %xmm8, %xmm6 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm3 + movaps -8 * SIZE(A1), %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 4 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -8 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L123 + ALIGN_3 + +.L124: + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(A1), %xmm7 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + movaps -30 * SIZE(A1, LDA), %xmm4 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm2 + movaps -26 * SIZE(A1, LDA), %xmm5 + mulps %xmm12, %xmm7 + addps %xmm7, %xmm3 + movaps -22 * SIZE(A1, LDA), %xmm6 + + shufps $0x4e, %xmm4, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movaps -18 * SIZE(A1, LDA), %xmm8 + shufps $0x4e, %xmm5, %xmm4 + mulps %xmm13, %xmm4 + addps %xmm4, %xmm1 + shufps $0x4e, %xmm6, %xmm5 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm2 + shufps $0x4e, %xmm8, %xmm6 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm3 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + ALIGN_3 + +.L125: + testq $8, MM + je .L126 + + movaps -32 * SIZE(A1), %xmm4 + movaps -28 * SIZE(A1), %xmm5 + movaps -30 * SIZE(A2), %xmm6 + movaps -26 * SIZE(A2), %xmm7 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + + shufps $0x4e, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + shufps $0x4e, %xmm7, %xmm6 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm1 + movaps %xmm7, %xmm8 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, Y1 + ALIGN_3 + +.L126: + testq $4, MM + je .L127 + + movaps -32 * SIZE(A1), %xmm4 + movaps -30 * SIZE(A2), %xmm5 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + shufps $0x4e, %xmm5, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L127: + testq $2, MM + je .L128 + + movsd -32 * 
SIZE(A1), %xmm0 + movsd -32 * SIZE(A2), %xmm1 + + movsd -32 * SIZE(Y1), %xmm8 + + mulps %xmm12, %xmm0 + addps %xmm0, %xmm8 + mulps %xmm13, %xmm1 + addps %xmm1, %xmm8 + + movlps %xmm8, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L128: + testq $1, MM + je .L990 + + movss -32 * SIZE(Y1), %xmm8 + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(A2), %xmm1 + + mulss %xmm12, %xmm0 + addss %xmm0, %xmm8 + mulss %xmm13, %xmm1 + addss %xmm1, %xmm8 + + movss %xmm8, -32 * SIZE(Y1) + jmp .L990 + ALIGN_3 + +.L130: + cmpq $1, N + jne .L990 + + leaq 32 * SIZE(BUFFER), Y1 + movq A, A1 + + movss (X), %xmm12 + + mulss ALPHA, %xmm12 + shufps $0, %xmm12, %xmm12 + + cmpq $3, M + jle .L137 + + testq $SIZE, A1 + je .L13X + + movss -32 * SIZE(Y1), %xmm8 + movss -32 * SIZE(A1), %xmm0 + + mulss %xmm12, %xmm0 + addss %xmm0, %xmm8 + + movss %xmm8, -32 * SIZE(Y1) + + addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, Y1 + ALIGN_3 + +.L13X: + testq $2 * SIZE, A1 + je .L13XX + + movsd -32 * SIZE(Y1), %xmm8 + movsd -32 * SIZE(A1), %xmm0 + + mulps %xmm12, %xmm0 + addps %xmm0, %xmm8 + + movlps %xmm8, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L13XX: + movq MM, I + sarq $4, I + jle .L135 + + movaps -32 * SIZE(A1), %xmm8 + movaps -28 * SIZE(A1), %xmm9 + movaps -24 * SIZE(A1), %xmm10 + movaps -20 * SIZE(A1), %xmm11 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-20 * SIZE, Y1, %xmm3) + + decq I + jle .L134 + ALIGN_3 + +.L133: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + movaps -16 * SIZE(A1), %xmm8 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm1 + movaps -12 * SIZE(A1), %xmm9 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm2 + movaps -8 * SIZE(A1), %xmm10 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm3 + movaps -4 * SIZE(A1), %xmm11 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 8 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -8 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L133 + ALIGN_3 + +.L134: + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + mulps %xmm12, %xmm9 + addps %xmm9, %xmm1 + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + mulps %xmm12, %xmm10 + addps %xmm10, %xmm2 + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + mulps %xmm12, %xmm11 + addps %xmm11, %xmm3 + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, Y1 + ALIGN_3 + +.L135: + testq $8, MM + je .L136 + + movaps -32 * SIZE(A1), %xmm8 + movaps -28 * SIZE(A1), %xmm9 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + mulps %xmm12, %xmm9 + addps %xmm9, %xmm1 + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + + addq $8 * SIZE, A1 + addq $8 * SIZE, Y1 + ALIGN_3 + +.L136: + testq $4, MM + je .L137 + + movaps -32 * SIZE(A1), %xmm8 + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + + addq $4 * SIZE, A1 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L137: + testq $2, MM + je .L138 + + movsd -32 * SIZE(Y1), %xmm8 + movsd -32 * SIZE(A1), %xmm0 
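+	# 2-row tail of the single-column path: y[0:2] += (alpha * x) * a[0:2]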
+ + mulps %xmm12, %xmm0 + addps %xmm0, %xmm8 + + movlps %xmm8, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L138: + testq $1, MM + je .L990 + + movss -32 * SIZE(Y1), %xmm8 + movss -32 * SIZE(A1), %xmm0 + + mulss %xmm12, %xmm0 + addss %xmm0, %xmm8 + + movss %xmm8, -32 * SIZE(Y1) + jmp .L990 + ALIGN_3 + +.L200: + testq $2 * SIZE, LDA + jne .L300 + + cmpq $4, N + jl .L210 + ALIGN_3 + +.L201: + subq $4, N + + leaq 32 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 2), A2 + leaq (A, LDA, 4), A + + movss (X), %xmm12 + addq INCX, X + movss (X), %xmm13 + addq INCX, X + movss (X), %xmm14 + addq INCX, X + movss (X), %xmm15 + addq INCX, X + + movss ALPHA, %xmm0 + + mulss %xmm0, %xmm12 + mulss %xmm0, %xmm13 + mulss %xmm0, %xmm14 + mulss %xmm0, %xmm15 + + shufps $0, %xmm12, %xmm12 + shufps $0, %xmm13, %xmm13 + shufps $0, %xmm14, %xmm14 + shufps $0, %xmm15, %xmm15 + + cmpq $3, M + jle .L207 + + testq $SIZE, A1 + je .L20X + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(A1, LDA), %xmm1 + movss -32 * SIZE(A2), %xmm2 + movss -32 * SIZE(A2, LDA), %xmm3 + + movss -32 * SIZE(Y1), %xmm8 + + mulss %xmm12, %xmm0 + addss %xmm0, %xmm8 + mulss %xmm13, %xmm1 + addss %xmm1, %xmm8 + mulss %xmm14, %xmm2 + addss %xmm2, %xmm8 + mulss %xmm15, %xmm3 + addss %xmm3, %xmm8 + + movss %xmm8, -32 * SIZE(Y1) + + addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, Y1 + ALIGN_3 + +.L20X: + testq $2 * SIZE, A1 + je .L20XX + + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(A1, LDA), %xmm1 + movsd -32 * SIZE(A2), %xmm2 + movsd -32 * SIZE(A2, LDA), %xmm3 + + movsd -32 * SIZE(Y1), %xmm8 + + mulps %xmm12, %xmm0 + addps %xmm0, %xmm8 + mulps %xmm13, %xmm1 + addps %xmm1, %xmm8 + mulps %xmm14, %xmm2 + addps %xmm2, %xmm8 + mulps %xmm15, %xmm3 + addps %xmm3, %xmm8 + + movlps %xmm8, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L20XX: + movaps -33 * SIZE(A1, LDA), %xmm8 + movaps -34 * SIZE(A2), %xmm9 + movaps -35 * SIZE(A2, LDA), %xmm10 + + movq MM, I + sarq $4, I + jle .L205 + + movaps -32 * SIZE(A1), %xmm4 + movaps -28 * SIZE(A1), %xmm5 + movaps -24 * SIZE(A1), %xmm6 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-20 * SIZE, Y1, %xmm3) + + decq I + jle .L204 + ALIGN_3 + +.L203: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(A1), %xmm7 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + movaps -29 * SIZE(A1, LDA), %xmm4 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm2 + movaps -25 * SIZE(A1, LDA), %xmm5 + mulps %xmm12, %xmm7 + addps %xmm7, %xmm3 + movaps -21 * SIZE(A1, LDA), %xmm6 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1, LDA) +#endif + + movss %xmm4, %xmm8 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movaps -17 * SIZE(A1, LDA), %xmm8 + movss %xmm5, %xmm4 + shufps $0x39, %xmm4, %xmm4 + mulps %xmm13, %xmm4 + addps %xmm4, %xmm1 + movaps -30 * SIZE(A2), %xmm4 + movss %xmm6, %xmm5 + shufps $0x39, %xmm5, %xmm5 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm2 + movaps -26 * SIZE(A2), %xmm5 + movss %xmm8, %xmm6 + shufps $0x39, %xmm6, %xmm6 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm3 + movaps -22 * SIZE(A2), %xmm6 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) +#endif + + shufps $0x4e, %xmm4, %xmm9 + mulps %xmm14, %xmm9 + addps %xmm9, %xmm0 + movaps -18 * SIZE(A2), %xmm9 + shufps $0x4e, %xmm5, %xmm4 + mulps %xmm14, %xmm4 + addps %xmm4, 
%xmm1 + movaps -31 * SIZE(A2, LDA), %xmm4 + shufps $0x4e, %xmm6, %xmm5 + mulps %xmm14, %xmm5 + addps %xmm5, %xmm2 + movaps -27 * SIZE(A2, LDA), %xmm5 + shufps $0x4e, %xmm9, %xmm6 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm3 + movaps -23 * SIZE(A2, LDA), %xmm6 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2, LDA) +#endif + + movss %xmm4, %xmm10 + shufps $0x93, %xmm4, %xmm10 + mulps %xmm15, %xmm10 + addps %xmm10, %xmm0 + movaps -19 * SIZE(A2, LDA), %xmm10 + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + mulps %xmm15, %xmm4 + addps %xmm4, %xmm1 + movaps -16 * SIZE(A1), %xmm4 + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + mulps %xmm15, %xmm5 + addps %xmm5, %xmm2 + movaps -12 * SIZE(A1), %xmm5 + movss %xmm10, %xmm6 + shufps $0x93, %xmm10, %xmm6 + mulps %xmm15, %xmm6 + addps %xmm6, %xmm3 + movaps -8 * SIZE(A1), %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 2 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -8 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L203 + ALIGN_3 + +.L204: + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(A1), %xmm7 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + movaps -29 * SIZE(A1, LDA), %xmm4 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm2 + movaps -25 * SIZE(A1, LDA), %xmm5 + mulps %xmm12, %xmm7 + addps %xmm7, %xmm3 + movaps -21 * SIZE(A1, LDA), %xmm6 + + movss %xmm4, %xmm8 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movaps -17 * SIZE(A1, LDA), %xmm8 + movss %xmm5, %xmm4 + shufps $0x39, %xmm4, %xmm4 + mulps %xmm13, %xmm4 + addps %xmm4, %xmm1 + movaps -30 * SIZE(A2), %xmm4 + + movss %xmm6, %xmm5 + shufps $0x39, %xmm5, %xmm5 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm2 + movaps -26 * SIZE(A2), %xmm5 + movss %xmm8, %xmm6 + shufps $0x39, %xmm6, %xmm6 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm3 + movaps -22 * SIZE(A2), %xmm6 + + shufps $0x4e, %xmm4, %xmm9 + mulps %xmm14, %xmm9 + addps %xmm9, %xmm0 + movaps -18 * SIZE(A2), %xmm9 + shufps $0x4e, %xmm5, %xmm4 + mulps %xmm14, %xmm4 + addps %xmm4, %xmm1 + movaps -31 * SIZE(A2, LDA), %xmm4 + shufps $0x4e, %xmm6, %xmm5 + mulps %xmm14, %xmm5 + addps %xmm5, %xmm2 + movaps -27 * SIZE(A2, LDA), %xmm5 + shufps $0x4e, %xmm9, %xmm6 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm3 + movaps -23 * SIZE(A2, LDA), %xmm6 + + movss %xmm4, %xmm10 + shufps $0x93, %xmm4, %xmm10 + mulps %xmm15, %xmm10 + addps %xmm10, %xmm0 + movaps -19 * SIZE(A2, LDA), %xmm10 + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + mulps %xmm15, %xmm4 + addps %xmm4, %xmm1 + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + mulps %xmm15, %xmm5 + addps %xmm5, %xmm2 + movss %xmm10, %xmm6 + shufps $0x93, %xmm10, %xmm6 + mulps %xmm15, %xmm6 + addps %xmm6, %xmm3 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + ALIGN_3 + +.L205: + testq $8, MM + je .L206 + + movaps -32 * SIZE(A1), %xmm4 + movaps -28 * SIZE(A1), %xmm5 + movaps -29 * SIZE(A1, LDA), %xmm6 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -30 * SIZE(A2), %xmm4 + mulps %xmm12, %xmm5 + 
addps %xmm5, %xmm1 + movaps -26 * SIZE(A2), %xmm5 + + movss %xmm6, %xmm8 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movaps -25 * SIZE(A1, LDA), %xmm8 + movss %xmm8, %xmm6 + shufps $0x39, %xmm6, %xmm6 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm1 + movaps -31 * SIZE(A2, LDA), %xmm6 + + shufps $0x4e, %xmm4, %xmm9 + mulps %xmm14, %xmm9 + addps %xmm9, %xmm0 + movaps -27 * SIZE(A2, LDA), %xmm7 + + shufps $0x4e, %xmm5, %xmm4 + mulps %xmm14, %xmm4 + addps %xmm4, %xmm1 + movaps %xmm5, %xmm9 + + movss %xmm6, %xmm10 + shufps $0x93, %xmm6, %xmm10 + mulps %xmm15, %xmm10 + addps %xmm10, %xmm0 + + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + mulps %xmm15, %xmm6 + addps %xmm6, %xmm1 + movaps %xmm7, %xmm10 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, Y1 + ALIGN_3 + +.L206: + testq $4, MM + je .L207 + + movaps -32 * SIZE(A1), %xmm4 + movaps -29 * SIZE(A1, LDA), %xmm5 + movaps -30 * SIZE(A2), %xmm6 + movaps -31 * SIZE(A2, LDA), %xmm7 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movss %xmm5, %xmm8 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + + shufps $0x4e, %xmm6, %xmm9 + mulps %xmm14, %xmm9 + addps %xmm9, %xmm0 + movss %xmm7, %xmm10 + shufps $0x93, %xmm7, %xmm10 + mulps %xmm15, %xmm10 + addps %xmm10, %xmm0 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L207: + testq $2, MM + je .L208 + + movsd -32 * SIZE(A1), %xmm4 + movsd -32 * SIZE(A1, LDA), %xmm5 + movsd -32 * SIZE(A2), %xmm6 + movsd -32 * SIZE(A2, LDA), %xmm7 + + movsd -32 * SIZE(Y1), %xmm0 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm15, %xmm7 + addps %xmm7, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L208: + testq $1, MM + je .L209 + + movss -32 * SIZE(Y1), %xmm0 + + movss -32 * SIZE(A1), %xmm4 + movss -32 * SIZE(A1, LDA), %xmm5 + movss -32 * SIZE(A2), %xmm6 + movss -32 * SIZE(A2, LDA), %xmm7 + + mulss %xmm12, %xmm4 + addss %xmm4, %xmm0 + mulss %xmm13, %xmm5 + addss %xmm5, %xmm0 + mulss %xmm14, %xmm6 + addss %xmm6, %xmm0 + mulss %xmm15, %xmm7 + addss %xmm7, %xmm0 + + movss %xmm0, -32 * SIZE(Y1) + ALIGN_3 + +.L209: + cmpq $4, N + jge .L201 + ALIGN_3 + +.L210: + cmpq $3, N + jne .L220 + + leaq 32 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 2), A2 + leaq (A, LDA, 4), A + + movss (X), %xmm12 + addq INCX, X + movss (X), %xmm13 + addq INCX, X + movss (X), %xmm14 + addq INCX, X + + movss ALPHA, %xmm0 + + mulss %xmm0, %xmm12 + mulss %xmm0, %xmm13 + mulss %xmm0, %xmm14 + + shufps $0, %xmm12, %xmm12 + shufps $0, %xmm13, %xmm13 + shufps $0, %xmm14, %xmm14 + + cmpq $3, M + jle .L217 + + testq $SIZE, A1 + je .L21X + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(A1, LDA), %xmm1 + movss -32 * SIZE(A2), %xmm2 + + movss -32 * SIZE(Y1), %xmm8 + + mulss %xmm12, %xmm0 + addss %xmm0, %xmm8 + mulss %xmm13, %xmm1 + addss %xmm1, %xmm8 + mulss %xmm14, %xmm2 + addss %xmm2, %xmm8 + + movss %xmm8, -32 * SIZE(Y1) + + addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, Y1 + ALIGN_3 + +.L21X: + testq $2 * SIZE, A1 + je .L21XX + + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(A1, LDA), %xmm1 + movsd -32 * SIZE(A2), %xmm2 + + movsd -32 * SIZE(Y1), %xmm8 + + mulps %xmm12, %xmm0 + addps %xmm0, %xmm8 + mulps %xmm13, %xmm1 + addps %xmm1, %xmm8 + 
mulps %xmm14, %xmm2 + addps %xmm2, %xmm8 + + movlps %xmm8, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L21XX: + movaps -33 * SIZE(A1, LDA), %xmm8 + movaps -34 * SIZE(A2), %xmm9 + movaps -35 * SIZE(A2, LDA), %xmm10 + + movq MM, I + sarq $4, I + jle .L215 + + movaps -32 * SIZE(A1), %xmm4 + movaps -28 * SIZE(A1), %xmm5 + movaps -24 * SIZE(A1), %xmm6 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-20 * SIZE, Y1, %xmm3) + + decq I + jle .L214 + ALIGN_3 + +.L213: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(A1), %xmm7 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + movaps -29 * SIZE(A1, LDA), %xmm4 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm2 + movaps -25 * SIZE(A1, LDA), %xmm5 + mulps %xmm12, %xmm7 + addps %xmm7, %xmm3 + movaps -21 * SIZE(A1, LDA), %xmm6 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1, LDA) +#endif + + movss %xmm4, %xmm8 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movaps -17 * SIZE(A1, LDA), %xmm8 + movss %xmm5, %xmm4 + shufps $0x39, %xmm4, %xmm4 + mulps %xmm13, %xmm4 + addps %xmm4, %xmm1 + movaps -30 * SIZE(A2), %xmm4 + movss %xmm6, %xmm5 + shufps $0x39, %xmm5, %xmm5 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm2 + movaps -26 * SIZE(A2), %xmm5 + movss %xmm8, %xmm6 + shufps $0x39, %xmm6, %xmm6 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm3 + movaps -22 * SIZE(A2), %xmm6 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A2) +#endif + + shufps $0x4e, %xmm4, %xmm9 + mulps %xmm14, %xmm9 + addps %xmm9, %xmm0 + movaps -18 * SIZE(A2), %xmm9 + shufps $0x4e, %xmm5, %xmm4 + mulps %xmm14, %xmm4 + addps %xmm4, %xmm1 + movaps -16 * SIZE(A1), %xmm4 + shufps $0x4e, %xmm6, %xmm5 + mulps %xmm14, %xmm5 + addps %xmm5, %xmm2 + movaps -12 * SIZE(A1), %xmm5 + shufps $0x4e, %xmm9, %xmm6 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm3 + movaps -8 * SIZE(A1), %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 3 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -8 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L213 + ALIGN_3 + +.L214: + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(A1), %xmm7 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + movaps -29 * SIZE(A1, LDA), %xmm4 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm2 + movaps -25 * SIZE(A1, LDA), %xmm5 + mulps %xmm12, %xmm7 + addps %xmm7, %xmm3 + movaps -21 * SIZE(A1, LDA), %xmm6 + + movss %xmm4, %xmm8 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movaps -17 * SIZE(A1, LDA), %xmm8 + movss %xmm5, %xmm4 + shufps $0x39, %xmm4, %xmm4 + mulps %xmm13, %xmm4 + addps %xmm4, %xmm1 + movaps -30 * SIZE(A2), %xmm4 + + movss %xmm6, %xmm5 + shufps $0x39, %xmm5, %xmm5 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm2 + movaps -26 * SIZE(A2), %xmm5 + movss %xmm8, %xmm6 + shufps $0x39, %xmm6, %xmm6 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm3 + movaps -22 * SIZE(A2), %xmm6 + + shufps $0x4e, %xmm4, %xmm9 + mulps %xmm14, %xmm9 + addps %xmm9, %xmm0 + movaps -18 * SIZE(A2), %xmm9 + shufps $0x4e, %xmm5, %xmm4 + mulps %xmm14, %xmm4 + addps %xmm4, %xmm1 + 
shufps $0x4e, %xmm6, %xmm5 + mulps %xmm14, %xmm5 + addps %xmm5, %xmm2 + shufps $0x4e, %xmm9, %xmm6 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm3 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + ALIGN_3 + +.L215: + testq $8, MM + je .L216 + + movaps -32 * SIZE(A1), %xmm4 + movaps -28 * SIZE(A1), %xmm5 + movaps -29 * SIZE(A1, LDA), %xmm6 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -30 * SIZE(A2), %xmm4 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + movaps -26 * SIZE(A2), %xmm5 + + movss %xmm6, %xmm8 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movaps -25 * SIZE(A1, LDA), %xmm8 + movss %xmm8, %xmm6 + shufps $0x39, %xmm6, %xmm6 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm1 + + shufps $0x4e, %xmm4, %xmm9 + mulps %xmm14, %xmm9 + addps %xmm9, %xmm0 + + shufps $0x4e, %xmm5, %xmm4 + mulps %xmm14, %xmm4 + addps %xmm4, %xmm1 + movaps %xmm5, %xmm9 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, Y1 + ALIGN_3 + +.L216: + testq $4, MM + je .L217 + + movaps -32 * SIZE(A1), %xmm4 + movaps -29 * SIZE(A1, LDA), %xmm5 + movaps -30 * SIZE(A2), %xmm6 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movss %xmm5, %xmm8 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + + shufps $0x4e, %xmm6, %xmm9 + mulps %xmm14, %xmm9 + addps %xmm9, %xmm0 + movss %xmm7, %xmm10 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L217: + testq $2, MM + je .L218 + + movsd -32 * SIZE(A1), %xmm4 + movsd -32 * SIZE(A1, LDA), %xmm5 + movsd -32 * SIZE(A2), %xmm6 + + movsd -32 * SIZE(Y1), %xmm0 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L218: + testq $1, MM + je .L990 + + movss -32 * SIZE(Y1), %xmm0 + + movss -32 * SIZE(A1), %xmm4 + movss -32 * SIZE(A1, LDA), %xmm5 + movss -32 * SIZE(A2), %xmm6 + + mulss %xmm12, %xmm4 + addss %xmm4, %xmm0 + mulss %xmm13, %xmm5 + addss %xmm5, %xmm0 + mulss %xmm14, %xmm6 + addss %xmm6, %xmm0 + + movss %xmm0, -32 * SIZE(Y1) + jmp .L990 + ALIGN_4 + +.L220: + testq N, N + jle .L990 + + cmpq $2, N + jne .L230 + + leaq 32 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 1), A2 + leaq (A, LDA, 2), A + + movss (X), %xmm12 + addq INCX, X + movss (X), %xmm13 + addq INCX, X + + movss ALPHA, %xmm0 + + mulss %xmm0, %xmm12 + mulss %xmm0, %xmm13 + + shufps $0, %xmm12, %xmm12 + shufps $0, %xmm13, %xmm13 + + cmpq $3, M + jle .L227 + + testq $SIZE, A1 + je .L22X + + movss -32 * SIZE(Y1), %xmm9 + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(A2), %xmm1 + + mulss %xmm12, %xmm0 + addss %xmm0, %xmm9 + mulss %xmm13, %xmm1 + addss %xmm1, %xmm9 + + movss %xmm9, -32 * SIZE(Y1) + + addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, Y1 + ALIGN_3 + +.L22X: + testq $2 * SIZE, A1 + je .L22XX + + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(A2), %xmm1 + + movsd -32 * SIZE(Y1), %xmm9 + + mulps %xmm12, %xmm0 + addps %xmm0, %xmm9 + mulps %xmm13, %xmm1 + addps %xmm1, %xmm9 + + movlps %xmm9, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq 
$2 * SIZE, Y1 + ALIGN_3 + +.L22XX: + movaps -33 * SIZE(A1, LDA), %xmm8 + + movq MM, I + sarq $4, I + jle .L225 + + movaps -32 * SIZE(A1), %xmm4 + movaps -28 * SIZE(A1), %xmm5 + movaps -24 * SIZE(A1), %xmm6 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-20 * SIZE, Y1, %xmm3) + + decq I + jle .L224 + ALIGN_3 + +.L223: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(A1), %xmm7 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + movaps -29 * SIZE(A2), %xmm4 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm2 + movaps -25 * SIZE(A2), %xmm5 + mulps %xmm12, %xmm7 + addps %xmm7, %xmm3 + movaps -21 * SIZE(A2), %xmm6 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A2) +#endif + + movss %xmm4, %xmm8 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movaps -17 * SIZE(A2), %xmm8 + movss %xmm5, %xmm4 + shufps $0x39, %xmm4, %xmm4 + mulps %xmm13, %xmm4 + addps %xmm4, %xmm1 + movaps -16 * SIZE(A1), %xmm4 + + movss %xmm6, %xmm5 + shufps $0x39, %xmm5, %xmm5 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm2 + movaps -12 * SIZE(A1), %xmm5 + movss %xmm8, %xmm6 + shufps $0x39, %xmm6, %xmm6 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm3 + movaps -8 * SIZE(A1), %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 4 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -8 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L223 + ALIGN_3 + +.L224: + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(A1), %xmm7 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + movaps -29 * SIZE(A2), %xmm4 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm2 + movaps -25 * SIZE(A2), %xmm5 + mulps %xmm12, %xmm7 + addps %xmm7, %xmm3 + movaps -21 * SIZE(A2), %xmm6 + + movss %xmm4, %xmm8 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movaps -17 * SIZE(A2), %xmm8 + movss %xmm5, %xmm4 + shufps $0x39, %xmm4, %xmm4 + mulps %xmm13, %xmm4 + addps %xmm4, %xmm1 + + movss %xmm6, %xmm5 + shufps $0x39, %xmm5, %xmm5 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm2 + movss %xmm8, %xmm6 + shufps $0x39, %xmm6, %xmm6 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm3 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + ALIGN_3 + +.L225: + testq $8, MM + je .L226 + + movaps -32 * SIZE(A1), %xmm4 + movaps -28 * SIZE(A1), %xmm5 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -29 * SIZE(A2), %xmm6 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + movaps -25 * SIZE(A2), %xmm7 + + movss %xmm6, %xmm8 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + + movss %xmm7, %xmm6 + shufps $0x39, %xmm6, %xmm6 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm1 + movaps %xmm7, %xmm8 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, Y1 + ALIGN_3 + +.L226: + testq $4, MM + je .L227 + + movaps -32 * SIZE(A1), %xmm4 + movaps -29 * 
SIZE(A2), %xmm5 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movss %xmm5, %xmm8 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L227: + testq $2, MM + je .L228 + + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(A2), %xmm1 + + movsd -32 * SIZE(Y1), %xmm9 + + mulps %xmm12, %xmm0 + addps %xmm0, %xmm9 + mulps %xmm13, %xmm1 + addps %xmm1, %xmm9 + + movlps %xmm9, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L228: + testq $1, MM + je .L990 + + movss -32 * SIZE(Y1), %xmm9 + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(A2), %xmm1 + + mulss %xmm12, %xmm0 + addss %xmm0, %xmm9 + mulss %xmm13, %xmm1 + addss %xmm1, %xmm9 + + movss %xmm9, -32 * SIZE(Y1) + jmp .L990 + ALIGN_3 + +.L230: + cmpq $1, N + jne .L990 + + leaq 32 * SIZE(BUFFER), Y1 + movq A, A1 + + movss (X), %xmm12 + + mulss ALPHA, %xmm12 + shufps $0, %xmm12, %xmm12 + + cmpq $3, M + jle .L237 + + testq $SIZE, A1 + je .L23X + + movss -32 * SIZE(Y1), %xmm8 + movss -32 * SIZE(A1), %xmm0 + + mulss %xmm12, %xmm0 + addss %xmm0, %xmm8 + + movss %xmm8, -32 * SIZE(Y1) + + addq $1 * SIZE, A1 + addq $1 * SIZE, Y1 + ALIGN_3 + +.L23X: + testq $2 * SIZE, A1 + je .L23XX + + movsd -32 * SIZE(Y1), %xmm8 + movsd -32 * SIZE(A1), %xmm0 + + mulps %xmm12, %xmm0 + addps %xmm0, %xmm8 + + movlps %xmm8, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L23XX: + testq $2 * SIZE, A1 + jne .L230 + + movq MM, I + sarq $4, I + jle .L235 + + movaps -32 * SIZE(A1), %xmm8 + movaps -28 * SIZE(A1), %xmm9 + movaps -24 * SIZE(A1), %xmm10 + movaps -20 * SIZE(A1), %xmm11 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-20 * SIZE, Y1, %xmm3) + + decq I + jle .L234 + ALIGN_3 + +.L233: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + movaps -16 * SIZE(A1), %xmm8 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm1 + movaps -12 * SIZE(A1), %xmm9 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm2 + movaps -8 * SIZE(A1), %xmm10 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm3 + movaps -4 * SIZE(A1), %xmm11 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 8 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -8 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L233 + ALIGN_3 + +.L234: + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + mulps %xmm12, %xmm9 + addps %xmm9, %xmm1 + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + mulps %xmm12, %xmm10 + addps %xmm10, %xmm2 + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + mulps %xmm12, %xmm11 + addps %xmm11, %xmm3 + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, Y1 + ALIGN_3 + +.L235: + testq $8, MM + je .L236 + + movaps -32 * SIZE(A1), %xmm8 + movaps -28 * SIZE(A1), %xmm9 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + mulps %xmm12, %xmm9 + addps %xmm9, %xmm1 + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + + addq $8 * SIZE, A1 + addq $8 * SIZE, 
Y1 + ALIGN_3 + +.L236: + testq $4, MM + je .L237 + + movaps -32 * SIZE(A1), %xmm8 + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + + addq $4 * SIZE, A1 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L237: + testq $2, MM + je .L238 + + movsd -32 * SIZE(Y1), %xmm8 + movsd -32 * SIZE(A1), %xmm0 + + mulps %xmm12, %xmm0 + addps %xmm0, %xmm8 + + movlps %xmm8, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L238: + testq $1, MM + je .L990 + + movss -32 * SIZE(Y1), %xmm8 + movss -32 * SIZE(A1), %xmm0 + + mulss %xmm12, %xmm0 + addss %xmm0, %xmm8 + + movss %xmm8, -32 * SIZE(Y1) + jmp .L990 + ALIGN_4 + +.L300: + cmpq $4, N + jl .L310 + ALIGN_3 + +.L301: + subq $4, N + + leaq 32 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 2), A2 + leaq (A, LDA, 4), A + + movss (X), %xmm12 + addq INCX, X + movss (X), %xmm13 + addq INCX, X + movss (X), %xmm14 + addq INCX, X + movss (X), %xmm15 + addq INCX, X + + movss ALPHA, %xmm0 + + mulss %xmm0, %xmm12 + mulss %xmm0, %xmm13 + mulss %xmm0, %xmm14 + mulss %xmm0, %xmm15 + + shufps $0, %xmm12, %xmm12 + shufps $0, %xmm13, %xmm13 + shufps $0, %xmm14, %xmm14 + shufps $0, %xmm15, %xmm15 + + cmpq $3, M + jle .L307 + + testq $SIZE, A1 + je .L30X + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(A1, LDA), %xmm1 + movss -32 * SIZE(A2), %xmm2 + movss -32 * SIZE(A2, LDA), %xmm3 + + movss -32 * SIZE(Y1), %xmm8 + + mulss %xmm12, %xmm0 + addss %xmm0, %xmm8 + mulss %xmm13, %xmm1 + addss %xmm1, %xmm8 + mulss %xmm14, %xmm2 + addss %xmm2, %xmm8 + mulss %xmm15, %xmm3 + addss %xmm3, %xmm8 + + movss %xmm8, -32 * SIZE(Y1) + + addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, Y1 + ALIGN_3 + +.L30X: + testq $2 * SIZE, A1 + je .L30XX + + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(A1, LDA), %xmm1 + movsd -32 * SIZE(A2), %xmm2 + movsd -32 * SIZE(A2, LDA), %xmm3 + + movsd -32 * SIZE(Y1), %xmm8 + + mulps %xmm12, %xmm0 + addps %xmm0, %xmm8 + mulps %xmm13, %xmm1 + addps %xmm1, %xmm8 + mulps %xmm14, %xmm2 + addps %xmm2, %xmm8 + mulps %xmm15, %xmm3 + addps %xmm3, %xmm8 + + movlps %xmm8, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L30XX: + movaps -35 * SIZE(A1, LDA), %xmm8 + movaps -34 * SIZE(A2), %xmm9 + movaps -33 * SIZE(A2, LDA), %xmm10 + + movq MM, I + sarq $4, I + jle .L305 + + movaps -32 * SIZE(A1), %xmm4 + movaps -28 * SIZE(A1), %xmm5 + movaps -24 * SIZE(A1), %xmm6 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-20 * SIZE, Y1, %xmm3) + + decq I + jle .L304 + ALIGN_3 + +.L303: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(A1), %xmm7 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + movaps -31 * SIZE(A1, LDA), %xmm4 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm2 + movaps -27 * SIZE(A1, LDA), %xmm5 + mulps %xmm12, %xmm7 + addps %xmm7, %xmm3 + movaps -23 * SIZE(A1, LDA), %xmm6 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1, LDA) +#endif + + movss %xmm4, %xmm8 + shufps $0x93, %xmm4, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movaps -19 * SIZE(A1, LDA), %xmm8 + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + mulps %xmm13, %xmm4 + addps %xmm4, %xmm1 + movaps -30 * SIZE(A2), %xmm4 + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm2 + movaps -26 * SIZE(A2), %xmm5 + movss %xmm8, %xmm6 + shufps $0x93, %xmm8, %xmm6 + mulps 
%xmm13, %xmm6 + addps %xmm6, %xmm3 + movaps -22 * SIZE(A2), %xmm6 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) +#endif + + shufps $0x4e, %xmm4, %xmm9 + mulps %xmm14, %xmm9 + addps %xmm9, %xmm0 + movaps -18 * SIZE(A2), %xmm9 + shufps $0x4e, %xmm5, %xmm4 + mulps %xmm14, %xmm4 + addps %xmm4, %xmm1 + movaps -29 * SIZE(A2, LDA), %xmm4 + shufps $0x4e, %xmm6, %xmm5 + mulps %xmm14, %xmm5 + addps %xmm5, %xmm2 + movaps -25 * SIZE(A2, LDA), %xmm5 + shufps $0x4e, %xmm9, %xmm6 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm3 + movaps -21 * SIZE(A2, LDA), %xmm6 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2, LDA) +#endif + + movss %xmm4, %xmm10 + shufps $0x39, %xmm10, %xmm10 + mulps %xmm15, %xmm10 + addps %xmm10, %xmm0 + movaps -17 * SIZE(A2, LDA), %xmm10 + movss %xmm5, %xmm4 + shufps $0x39, %xmm4, %xmm4 + mulps %xmm15, %xmm4 + addps %xmm4, %xmm1 + movaps -16 * SIZE(A1), %xmm4 + movss %xmm6, %xmm5 + shufps $0x39, %xmm5, %xmm5 + mulps %xmm15, %xmm5 + addps %xmm5, %xmm2 + movaps -12 * SIZE(A1), %xmm5 + movss %xmm10, %xmm6 + shufps $0x39, %xmm6, %xmm6 + mulps %xmm15, %xmm6 + addps %xmm6, %xmm3 + movaps -8 * SIZE(A1), %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 2 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -8 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L303 + ALIGN_3 + +.L304: + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(A1), %xmm7 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + movaps -31 * SIZE(A1, LDA), %xmm4 + + mulps %xmm12, %xmm6 + addps %xmm6, %xmm2 + movaps -27 * SIZE(A1, LDA), %xmm5 + mulps %xmm12, %xmm7 + addps %xmm7, %xmm3 + movaps -23 * SIZE(A1, LDA), %xmm6 + + movss %xmm4, %xmm8 + shufps $0x93, %xmm4, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movaps -19 * SIZE(A1, LDA), %xmm8 + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + mulps %xmm13, %xmm4 + addps %xmm4, %xmm1 + movaps -30 * SIZE(A2), %xmm4 + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm2 + movaps -26 * SIZE(A2), %xmm5 + movss %xmm8, %xmm6 + shufps $0x93, %xmm8, %xmm6 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm3 + movaps -22 * SIZE(A2), %xmm6 + + shufps $0x4e, %xmm4, %xmm9 + mulps %xmm14, %xmm9 + addps %xmm9, %xmm0 + movaps -18 * SIZE(A2), %xmm9 + shufps $0x4e, %xmm5, %xmm4 + mulps %xmm14, %xmm4 + addps %xmm4, %xmm1 + movaps -29 * SIZE(A2, LDA), %xmm4 + shufps $0x4e, %xmm6, %xmm5 + mulps %xmm14, %xmm5 + addps %xmm5, %xmm2 + movaps -25 * SIZE(A2, LDA), %xmm5 + shufps $0x4e, %xmm9, %xmm6 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm3 + movaps -21 * SIZE(A2, LDA), %xmm6 + + movss %xmm4, %xmm10 + shufps $0x39, %xmm10, %xmm10 + mulps %xmm15, %xmm10 + addps %xmm10, %xmm0 + movaps -17 * SIZE(A2, LDA), %xmm10 + movss %xmm5, %xmm4 + shufps $0x39, %xmm4, %xmm4 + mulps %xmm15, %xmm4 + addps %xmm4, %xmm1 + movss %xmm6, %xmm5 + shufps $0x39, %xmm5, %xmm5 + mulps %xmm15, %xmm5 + addps %xmm5, %xmm2 + movss %xmm10, %xmm6 + shufps $0x39, %xmm6, %xmm6 + mulps %xmm15, %xmm6 + addps %xmm6, %xmm3 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + 
ALIGN_3 + +.L305: + testq $8, MM + je .L306 + + movaps -32 * SIZE(A1), %xmm4 + movaps -28 * SIZE(A1), %xmm5 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -31 * SIZE(A1, LDA), %xmm6 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + movaps -27 * SIZE(A1, LDA), %xmm7 + + movss %xmm6, %xmm8 + shufps $0x93, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movaps -30 * SIZE(A2), %xmm4 + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm1 + movaps %xmm7, %xmm8 + movaps -26 * SIZE(A2), %xmm5 + + shufps $0x4e, %xmm4, %xmm9 + mulps %xmm14, %xmm9 + addps %xmm9, %xmm0 + movaps -29 * SIZE(A2, LDA), %xmm6 + shufps $0x4e, %xmm5, %xmm4 + mulps %xmm14, %xmm4 + addps %xmm4, %xmm1 + movaps %xmm5, %xmm9 + movaps -25 * SIZE(A2, LDA), %xmm7 + + movss %xmm6, %xmm10 + shufps $0x39, %xmm10, %xmm10 + mulps %xmm15, %xmm10 + addps %xmm10, %xmm0 + movss %xmm7, %xmm6 + shufps $0x39, %xmm6, %xmm6 + mulps %xmm15, %xmm6 + addps %xmm6, %xmm1 + movaps %xmm7, %xmm10 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, Y1 + ALIGN_3 + +.L306: + testq $4, MM + je .L307 + + movaps -32 * SIZE(A1), %xmm4 + movaps -31 * SIZE(A1, LDA), %xmm5 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -30 * SIZE(A2), %xmm6 + movss %xmm5, %xmm8 + shufps $0x93, %xmm5, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movaps -29 * SIZE(A2, LDA), %xmm7 + + shufps $0x4e, %xmm6, %xmm9 + mulps %xmm14, %xmm9 + addps %xmm9, %xmm0 + movss %xmm7, %xmm10 + shufps $0x39, %xmm10, %xmm10 + mulps %xmm15, %xmm10 + addps %xmm10, %xmm0 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L307: + testq $2, MM + je .L308 + + movsd -32 * SIZE(A1), %xmm4 + movsd -32 * SIZE(A1, LDA), %xmm5 + movsd -32 * SIZE(A2), %xmm6 + movsd -32 * SIZE(A2, LDA), %xmm7 + + movsd -32 * SIZE(Y1), %xmm0 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm15, %xmm7 + addps %xmm7, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L308: + testq $1, MM + je .L309 + + movss -32 * SIZE(Y1), %xmm0 + + movss -32 * SIZE(A1), %xmm4 + movss -32 * SIZE(A1, LDA), %xmm5 + movss -32 * SIZE(A2), %xmm6 + movss -32 * SIZE(A2, LDA), %xmm7 + + mulss %xmm12, %xmm4 + addss %xmm4, %xmm0 + mulss %xmm13, %xmm5 + addss %xmm5, %xmm0 + mulss %xmm14, %xmm6 + addss %xmm6, %xmm0 + mulss %xmm15, %xmm7 + addss %xmm7, %xmm0 + + movss %xmm0, -32 * SIZE(Y1) + ALIGN_3 + +.L309: + cmpq $4, N + jge .L301 + ALIGN_3 + +.L310: + cmpq $3, N + jne .L320 + + leaq 32 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 2), A2 + leaq (A, LDA, 4), A + + movss (X), %xmm12 + addq INCX, X + movss (X), %xmm13 + addq INCX, X + movss (X), %xmm14 + addq INCX, X + + movss ALPHA, %xmm0 + + mulss %xmm0, %xmm12 + mulss %xmm0, %xmm13 + mulss %xmm0, %xmm14 + + shufps $0, %xmm12, %xmm12 + shufps $0, %xmm13, %xmm13 + shufps $0, %xmm14, %xmm14 + + cmpq $3, M + jle .L317 + + testq $SIZE, A1 + je .L31X + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(A1, LDA), %xmm1 + movss -32 * SIZE(A2), %xmm2 + + movss -32 * SIZE(Y1), %xmm8 + + mulss %xmm12, %xmm0 + addss %xmm0, %xmm8 + mulss %xmm13, %xmm1 + addss %xmm1, %xmm8 + mulss %xmm14, %xmm2 + addss %xmm2, %xmm8 + + movss %xmm8, -32 * SIZE(Y1) + 
+ addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, Y1 + ALIGN_3 + +.L31X: + testq $2 * SIZE, A1 + je .L31XX + + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(A1, LDA), %xmm1 + movsd -32 * SIZE(A2), %xmm2 + + movsd -32 * SIZE(Y1), %xmm8 + + mulps %xmm12, %xmm0 + addps %xmm0, %xmm8 + mulps %xmm13, %xmm1 + addps %xmm1, %xmm8 + mulps %xmm14, %xmm2 + addps %xmm2, %xmm8 + + movlps %xmm8, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L31XX: + movaps -35 * SIZE(A1, LDA), %xmm8 + movaps -34 * SIZE(A2), %xmm9 + movaps -33 * SIZE(A2, LDA), %xmm10 + + movq MM, I + sarq $4, I + jle .L315 + + movaps -32 * SIZE(A1), %xmm4 + movaps -28 * SIZE(A1), %xmm5 + movaps -24 * SIZE(A1), %xmm6 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-20 * SIZE, Y1, %xmm3) + + decq I + jle .L314 + ALIGN_3 + +.L313: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(A1), %xmm7 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + movaps -31 * SIZE(A1, LDA), %xmm4 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm2 + movaps -27 * SIZE(A1, LDA), %xmm5 + mulps %xmm12, %xmm7 + addps %xmm7, %xmm3 + movaps -23 * SIZE(A1, LDA), %xmm6 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1, LDA) +#endif + + movss %xmm4, %xmm8 + shufps $0x93, %xmm4, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movaps -19 * SIZE(A1, LDA), %xmm8 + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + mulps %xmm13, %xmm4 + addps %xmm4, %xmm1 + movaps -30 * SIZE(A2), %xmm4 + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm2 + movaps -26 * SIZE(A2), %xmm5 + movss %xmm8, %xmm6 + shufps $0x93, %xmm8, %xmm6 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm3 + movaps -22 * SIZE(A2), %xmm6 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A2) +#endif + + shufps $0x4e, %xmm4, %xmm9 + mulps %xmm14, %xmm9 + addps %xmm9, %xmm0 + movaps -18 * SIZE(A2), %xmm9 + shufps $0x4e, %xmm5, %xmm4 + mulps %xmm14, %xmm4 + addps %xmm4, %xmm1 + movaps -16 * SIZE(A1), %xmm4 + shufps $0x4e, %xmm6, %xmm5 + mulps %xmm14, %xmm5 + addps %xmm5, %xmm2 + movaps -12 * SIZE(A1), %xmm5 + shufps $0x4e, %xmm9, %xmm6 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm3 + movaps -8 * SIZE(A1), %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 3 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -8 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L313 + ALIGN_3 + +.L314: + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(A1), %xmm7 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + movaps -31 * SIZE(A1, LDA), %xmm4 + + mulps %xmm12, %xmm6 + addps %xmm6, %xmm2 + movaps -27 * SIZE(A1, LDA), %xmm5 + mulps %xmm12, %xmm7 + addps %xmm7, %xmm3 + movaps -23 * SIZE(A1, LDA), %xmm6 + + movss %xmm4, %xmm8 + shufps $0x93, %xmm4, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movaps -19 * SIZE(A1, LDA), %xmm8 + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + mulps %xmm13, %xmm4 + addps %xmm4, %xmm1 + movaps -30 * SIZE(A2), %xmm4 + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm2 
+ movaps -26 * SIZE(A2), %xmm5 + movss %xmm8, %xmm6 + shufps $0x93, %xmm8, %xmm6 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm3 + movaps -22 * SIZE(A2), %xmm6 + + shufps $0x4e, %xmm4, %xmm9 + mulps %xmm14, %xmm9 + addps %xmm9, %xmm0 + movaps -18 * SIZE(A2), %xmm9 + shufps $0x4e, %xmm5, %xmm4 + mulps %xmm14, %xmm4 + addps %xmm4, %xmm1 + shufps $0x4e, %xmm6, %xmm5 + mulps %xmm14, %xmm5 + addps %xmm5, %xmm2 + shufps $0x4e, %xmm9, %xmm6 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm3 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + ALIGN_3 + +.L315: + testq $8, MM + je .L316 + + movaps -32 * SIZE(A1), %xmm4 + movaps -28 * SIZE(A1), %xmm5 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -31 * SIZE(A1, LDA), %xmm6 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + movaps -27 * SIZE(A1, LDA), %xmm7 + + movss %xmm6, %xmm8 + shufps $0x93, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movaps -30 * SIZE(A2), %xmm4 + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm1 + movaps %xmm7, %xmm8 + movaps -26 * SIZE(A2), %xmm5 + + shufps $0x4e, %xmm4, %xmm9 + mulps %xmm14, %xmm9 + addps %xmm9, %xmm0 + shufps $0x4e, %xmm5, %xmm4 + mulps %xmm14, %xmm4 + addps %xmm4, %xmm1 + movaps %xmm5, %xmm9 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, Y1 + ALIGN_3 + +.L316: + testq $4, MM + je .L317 + + movaps -32 * SIZE(A1), %xmm4 + movaps -31 * SIZE(A1, LDA), %xmm5 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -30 * SIZE(A2), %xmm6 + movss %xmm5, %xmm8 + shufps $0x93, %xmm5, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + + shufps $0x4e, %xmm6, %xmm9 + mulps %xmm14, %xmm9 + addps %xmm9, %xmm0 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L317: + testq $2, MM + je .L318 + + movsd -32 * SIZE(A1), %xmm4 + movsd -32 * SIZE(A1, LDA), %xmm5 + movsd -32 * SIZE(A2), %xmm6 + + movsd -32 * SIZE(Y1), %xmm0 + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm14, %xmm6 + addps %xmm6, %xmm0 + + movlps %xmm0, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L318: + testq $1, MM + je .L990 + + movss -32 * SIZE(Y1), %xmm0 + + movss -32 * SIZE(A1), %xmm4 + movss -32 * SIZE(A1, LDA), %xmm5 + movss -32 * SIZE(A2), %xmm6 + + mulss %xmm12, %xmm4 + addss %xmm4, %xmm0 + mulss %xmm13, %xmm5 + addss %xmm5, %xmm0 + mulss %xmm14, %xmm6 + addss %xmm6, %xmm0 + + movss %xmm0, -32 * SIZE(Y1) + jmp .L990 + ALIGN_3 + +.L320: + cmpq $2, N + jne .L330 + + leaq 32 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 1), A2 + + movss (X), %xmm12 + addq INCX, X + movss (X), %xmm13 + addq INCX, X + + movss ALPHA, %xmm0 + + mulss %xmm0, %xmm12 + mulss %xmm0, %xmm13 + + shufps $0, %xmm12, %xmm12 + shufps $0, %xmm13, %xmm13 + + cmpq $3, M + jle .L327 + + testq $SIZE, A1 + je .L32X + + movss -32 * SIZE(Y1), %xmm9 + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(A2), %xmm1 + + mulss %xmm12, %xmm0 + addss %xmm0, %xmm9 + mulss %xmm13, %xmm1 + addss %xmm1, %xmm9 + + movss %xmm9, -32 * SIZE(Y1) + + addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, Y1 + ALIGN_3 + +.L32X: + testq $2 * 
SIZE, A1 + je .L32XX + + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(A2), %xmm1 + + movsd -32 * SIZE(Y1), %xmm8 + + mulps %xmm12, %xmm0 + addps %xmm0, %xmm8 + mulps %xmm13, %xmm1 + addps %xmm1, %xmm8 + + movlps %xmm8, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L32XX: + movaps -35 * SIZE(A1, LDA), %xmm8 + + movq MM, I + sarq $4, I + jle .L325 + + movaps -32 * SIZE(A1), %xmm4 + movaps -28 * SIZE(A1), %xmm5 + movaps -24 * SIZE(A1), %xmm6 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-20 * SIZE, Y1, %xmm3) + + decq I + jle .L324 + ALIGN_3 + +.L323: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(A1), %xmm7 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + movaps -31 * SIZE(A2), %xmm4 + mulps %xmm12, %xmm6 + addps %xmm6, %xmm2 + movaps -27 * SIZE(A2), %xmm5 + mulps %xmm12, %xmm7 + addps %xmm7, %xmm3 + movaps -23 * SIZE(A2), %xmm6 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A2) +#endif + + movss %xmm4, %xmm8 + shufps $0x93, %xmm4, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movaps -19 * SIZE(A2), %xmm8 + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + mulps %xmm13, %xmm4 + addps %xmm4, %xmm1 + movaps -16 * SIZE(A1), %xmm4 + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm2 + movaps -12 * SIZE(A1), %xmm5 + movss %xmm8, %xmm6 + shufps $0x93, %xmm8, %xmm6 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm3 + movaps -8 * SIZE(A1), %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 4 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -8 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L323 + ALIGN_3 + +.L324: + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -20 * SIZE(A1), %xmm7 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + movaps -31 * SIZE(A2), %xmm4 + + mulps %xmm12, %xmm6 + addps %xmm6, %xmm2 + movaps -27 * SIZE(A2), %xmm5 + mulps %xmm12, %xmm7 + addps %xmm7, %xmm3 + movaps -23 * SIZE(A2), %xmm6 + + movss %xmm4, %xmm8 + shufps $0x93, %xmm4, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movaps -19 * SIZE(A2), %xmm8 + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + mulps %xmm13, %xmm4 + addps %xmm4, %xmm1 + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + mulps %xmm13, %xmm5 + addps %xmm5, %xmm2 + movss %xmm8, %xmm6 + shufps $0x93, %xmm8, %xmm6 + mulps %xmm13, %xmm6 + addps %xmm6, %xmm3 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, A2 + subq $-16 * SIZE, Y1 + ALIGN_3 + +.L325: + testq $8, MM + je .L326 + + movaps -32 * SIZE(A1), %xmm4 + movaps -28 * SIZE(A1), %xmm5 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movaps -31 * SIZE(A2), %xmm6 + mulps %xmm12, %xmm5 + addps %xmm5, %xmm1 + movaps -27 * SIZE(A2), %xmm7 + + movss %xmm6, %xmm8 + shufps $0x93, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + mulps %xmm13, %xmm6 
+ addps %xmm6, %xmm1 + movaps %xmm7, %xmm8 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, Y1 + ALIGN_3 + +.L326: + testq $4, MM + je .L327 + + movaps -32 * SIZE(A1), %xmm4 + movaps -31 * SIZE(A2), %xmm5 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + + mulps %xmm12, %xmm4 + addps %xmm4, %xmm0 + movss %xmm5, %xmm8 + shufps $0x93, %xmm5, %xmm8 + mulps %xmm13, %xmm8 + addps %xmm8, %xmm0 + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L327: + testq $2, MM + je .L328 + + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(A2), %xmm1 + + movsd -32 * SIZE(Y1), %xmm8 + + mulps %xmm12, %xmm0 + addps %xmm0, %xmm8 + mulps %xmm13, %xmm1 + addps %xmm1, %xmm8 + + movlps %xmm8, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L328: + testq $1, MM + je .L990 + + movss -32 * SIZE(Y1), %xmm8 + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(A2), %xmm1 + + mulss %xmm12, %xmm0 + addss %xmm0, %xmm8 + mulss %xmm13, %xmm1 + addss %xmm1, %xmm8 + + movss %xmm8, -32 * SIZE(Y1) + jmp .L990 + ALIGN_3 + +.L330: + cmpq $1, N + jne .L990 + + leaq 32 * SIZE(BUFFER), Y1 + movq A, A1 + + movss (X), %xmm12 + + mulss ALPHA, %xmm12 + shufps $0, %xmm12, %xmm12 + + cmpq $3, M + jle .L337 + + testq $SIZE, A1 + je .L33X + + movss -32 * SIZE(Y1), %xmm8 + movss -32 * SIZE(A1), %xmm0 + + mulss %xmm12, %xmm0 + addss %xmm0, %xmm8 + + movss %xmm8, -32 * SIZE(Y1) + + addq $1 * SIZE, A1 + addq $1 * SIZE, Y1 + ALIGN_3 + +.L33X: + testq $2 * SIZE, A1 + je .L33XX + + movsd -32 * SIZE(Y1), %xmm8 + movsd -32 * SIZE(A1), %xmm0 + + mulps %xmm12, %xmm0 + addps %xmm0, %xmm8 + + movlps %xmm8, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L33XX: + movq MM, I + sarq $4, I + jle .L335 + + movaps -32 * SIZE(A1), %xmm8 + movaps -28 * SIZE(A1), %xmm9 + movaps -24 * SIZE(A1), %xmm10 + movaps -20 * SIZE(A1), %xmm11 + + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-20 * SIZE, Y1, %xmm3) + + decq I + jle .L334 + ALIGN_3 + +.L333: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + movaps -16 * SIZE(A1), %xmm8 + mulps %xmm12, %xmm9 + addps %xmm9, %xmm1 + movaps -12 * SIZE(A1), %xmm9 + mulps %xmm12, %xmm10 + addps %xmm10, %xmm2 + movaps -8 * SIZE(A1), %xmm10 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm3 + movaps -4 * SIZE(A1), %xmm11 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 8 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -8 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L333 + ALIGN_3 + +.L334: + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + mulps %xmm12, %xmm9 + addps %xmm9, %xmm1 + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + mulps %xmm12, %xmm10 + addps %xmm10, %xmm2 + MOVUPS_YS1(-24 * SIZE, Y1, %xmm2) + mulps %xmm12, %xmm11 + addps %xmm11, %xmm3 + MOVUPS_YS1(-20 * SIZE, Y1, %xmm3) + + subq $-16 * SIZE, A1 + subq $-16 * SIZE, Y1 + ALIGN_3 + +.L335: + testq $8, MM + je .L336 + + movaps -32 * SIZE(A1), %xmm8 + movaps -28 * SIZE(A1), %xmm9 + + MOVUPS_YL1(-32 * SIZE, Y1, 
%xmm0) + MOVUPS_YL1(-28 * SIZE, Y1, %xmm1) + + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + mulps %xmm12, %xmm9 + addps %xmm9, %xmm1 + MOVUPS_YS1(-28 * SIZE, Y1, %xmm1) + + addq $8 * SIZE, A1 + addq $8 * SIZE, Y1 + ALIGN_3 + +.L336: + testq $4, MM + je .L337 + + movaps -32 * SIZE(A1), %xmm8 + MOVUPS_YL1(-32 * SIZE, Y1, %xmm0) + + mulps %xmm12, %xmm8 + addps %xmm8, %xmm0 + MOVUPS_YS1(-32 * SIZE, Y1, %xmm0) + + addq $4 * SIZE, A1 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L337: + testq $2, MM + je .L338 + + movsd -32 * SIZE(Y1), %xmm8 + movsd -32 * SIZE(A1), %xmm0 + + mulps %xmm12, %xmm0 + addps %xmm0, %xmm8 + + movlps %xmm8, -32 * SIZE(Y1) + + addq $2 * SIZE, A1 + addq $2 * SIZE, Y1 + ALIGN_3 + +.L338: + testq $1, MM + je .L990 + + movss -32 * SIZE(Y1), %xmm8 + movss -32 * SIZE(A1), %xmm0 + + mulss %xmm12, %xmm0 + addss %xmm0, %xmm8 + + movss %xmm8, -32 * SIZE(Y1) + jmp .L990 +#endif + ALIGN_4 + + +.L990: + movq Y, Y1 + + movq M, %rax + sarq $3, %rax + jle .L994 + ALIGN_3 +.L992: + movsd 0 * SIZE(BUFFER), %xmm0 + movhps 2 * SIZE(BUFFER), %xmm0 + movsd 4 * SIZE(BUFFER), %xmm4 + movhps 6 * SIZE(BUFFER), %xmm4 + + pshufd $0x01, %xmm0, %xmm1 + pshufd $0x02, %xmm0, %xmm2 + pshufd $0x03, %xmm0, %xmm3 + + pshufd $0x01, %xmm4, %xmm5 + pshufd $0x02, %xmm4, %xmm6 + pshufd $0x03, %xmm4, %xmm7 + + addss (Y), %xmm0 + addq INCY, Y + addss (Y), %xmm1 + addq INCY, Y + addss (Y), %xmm2 + addq INCY, Y + addss (Y), %xmm3 + addq INCY, Y + addss (Y), %xmm4 + addq INCY, Y + addss (Y), %xmm5 + addq INCY, Y + addss (Y), %xmm6 + addq INCY, Y + addss (Y), %xmm7 + addq INCY, Y + + movss %xmm0, (Y1) + addq INCY, Y1 + movss %xmm1, (Y1) + addq INCY, Y1 + movss %xmm2, (Y1) + addq INCY, Y1 + movss %xmm3, (Y1) + addq INCY, Y1 + movss %xmm4, (Y1) + addq INCY, Y1 + movss %xmm5, (Y1) + addq INCY, Y1 + movss %xmm6, (Y1) + addq INCY, Y1 + movss %xmm7, (Y1) + addq INCY, Y1 + + addq $8 * SIZE, BUFFER + decq %rax + jg .L992 + ALIGN_3 + +.L994: + testq $7, M + jle .L999 + + testq $4, M + jle .L995 + + movsd 0 * SIZE(BUFFER), %xmm0 + movhps 2 * SIZE(BUFFER), %xmm0 + + pshufd $0x01, %xmm0, %xmm1 + pshufd $0x02, %xmm0, %xmm2 + pshufd $0x03, %xmm0, %xmm3 + + addss (Y), %xmm0 + addq INCY, Y + addss (Y), %xmm1 + addq INCY, Y + addss (Y), %xmm2 + addq INCY, Y + addss (Y), %xmm3 + addq INCY, Y + + movss %xmm0, (Y1) + addq INCY, Y1 + movss %xmm1, (Y1) + addq INCY, Y1 + movss %xmm2, (Y1) + addq INCY, Y1 + movss %xmm3, (Y1) + addq INCY, Y1 + + addq $4 * SIZE, BUFFER + ALIGN_3 + +.L995: + testq $2, M + jle .L996 + + movsd (BUFFER), %xmm0 + + pshufd $0x01, %xmm0, %xmm1 + + addss (Y), %xmm0 + addq INCY, Y + addss (Y), %xmm1 + addq INCY, Y + + movss %xmm0, (Y1) + addq INCY, Y1 + movss %xmm1, (Y1) + addq INCY, Y1 + + addq $2 * SIZE, BUFFER + ALIGN_3 + +.L996: + testq $1, M + jle .L999 + + movss (BUFFER), %xmm0 + + addss (Y), %xmm0 + + movss %xmm0, (Y1) + ALIGN_3 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + + ret + EPILOGUE diff --git a/kernel/x86_64/sgemv_t.S b/kernel/x86_64/sgemv_t.S new file mode 100644 index 0000000000..052ff1a794 --- /dev/null +++ 
b/kernel/x86_64/sgemv_t.S @@ -0,0 +1,6370 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "l2param.h" + +#if GEMV_UNROLL < 4 +#undef GEMV_UNROLL +#define GEMV_UNROLL 4 +#endif + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_A %rcx +#define OLD_LDA %r8 +#define STACK_INCX 8 + STACKSIZE(%rsp) +#define STACK_Y 16 + STACKSIZE(%rsp) +#define STACK_INCY 24 + STACKSIZE(%rsp) +#define STACK_BUFFER 32 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_M %rcx +#define OLD_N %rdx +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_LDA 48 + STACKSIZE(%rsp) +#define OLD_X 56 + STACKSIZE(%rsp) +#define STACK_INCX 64 + STACKSIZE(%rsp) +#define STACK_Y 72 + STACKSIZE(%rsp) +#define STACK_INCY 80 + STACKSIZE(%rsp) +#define STACK_BUFFER 88 + STACKSIZE(%rsp) + +#endif + +#define LDA %r8 +#define X %r9 + +#define INCX %rsi +#define INCY %rdi + +#define M %r10 +#define N %r11 +#define A %r12 +#define Y %r14 +#define BUFFER %r13 + +#define I %rax +#define A1 %rbx +#define A2 %rcx +#define LDA3 %rdx +#define X1 %rbp + +#define Y1 INCX + +#ifdef ALIGNED_ACCESS +#define MM %r15 +#else +#define MM M +#endif + +#define ALPHA %xmm7 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq OLD_M, M + movq OLD_N, N + movq OLD_A, A + movq OLD_LDA, LDA + movq OLD_X, X +#else + movq OLD_M, M + movq OLD_N, N + movq OLD_A, A + movq OLD_LDA, LDA +#endif + + movq STACK_INCX, INCX + movq STACK_Y, Y + movq STACK_INCY, INCY + movq STACK_BUFFER, BUFFER + +#ifndef WINDOWS_ABI + pshufd $0, %xmm0, ALPHA +#else + pshufd $0, %xmm3, ALPHA +#endif + + leaq (,INCX, SIZE), INCX + leaq (,INCY, SIZE), INCY + leaq (,LDA, SIZE), LDA + + leaq (LDA, LDA, 2), LDA3 + +#ifdef ALIGNED_ACCESS + movq M, MM + testq $4 * SIZE - 1, A + je .L0X + cmpq $3, M + jle .L0X + + movq A, MM + sarq $BASE_SHIFT, MM + andq $3, MM + subq $4, MM + addq M, MM + +.L0X: +#endif + + testq M, M + jle .L999 + testq N, N + jle .L999 + ALIGN_4 + + subq $-32 * SIZE, A + +#ifdef ALIGNED_ACCESS + movq A, %rax + andq $4 * SIZE - 1, %rax + addq %rax, BUFFER +#endif + + movq BUFFER, X1 + + movq M, I + sarq $3, I + jle .L05 + ALIGN_4 + +.L02: + movss (X), %xmm0 + addq INCX, X + movss (X), %xmm1 + addq INCX, X + + movss (X), %xmm2 + addq INCX, X + movss (X), %xmm3 + addq INCX, X + + movss (X), %xmm4 + addq INCX, X + movss (X), %xmm5 + addq INCX, X + + movss (X), %xmm6 + addq INCX, X + movss (X), %xmm8 + addq INCX, X + + movss %xmm0, 0 * SIZE(X1) + movss %xmm1, 1 * SIZE(X1) + movss %xmm2, 2 * SIZE(X1) + movss %xmm3, 3 * SIZE(X1) + movss %xmm4, 4 * SIZE(X1) + movss %xmm5, 5 * SIZE(X1) + movss %xmm6, 6 * SIZE(X1) + movss %xmm8, 7 * SIZE(X1) + + addq $8 * SIZE, X1 + decq I + jg .L02 + ALIGN_4 + +.L05: + movq M, I + andq $7, I + jle .L10 + ALIGN_2 + +.L06: + movss (X), %xmm0 + addq INCX, X + movss %xmm0, 0 * SIZE(X1) + addq $SIZE, X1 + decq I + jg .L06 + ALIGN_4 + +.L10: + movq Y, Y1 + +#ifdef ALIGNED_ACCESS + testq $4 * SIZE - 1, LDA + jne .L100 +#endif + +#if GEMV_UNROLL >= 8 + + cmpq $8, N + jl .L20 + ALIGN_3 + +.L11: + subq $8, N 
+ + leaq 32 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA, 4), A2 + leaq (A1, LDA, 8), A + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + xorps %xmm12, %xmm12 + xorps %xmm13, %xmm13 + xorps %xmm14, %xmm14 + xorps %xmm15, %xmm15 + +#ifdef ALIGNED_ACCESS + cmpq $3, M + jle .L17 + + testq $SIZE, A1 + je .L1X + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + movss -32 * SIZE(A1, LDA, 1), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm9 + movss -32 * SIZE(A1, LDA, 2), %xmm2 + mulss %xmm4, %xmm2 + addss %xmm2, %xmm10 + movss -32 * SIZE(A1, LDA3, 1), %xmm3 + mulss %xmm4, %xmm3 + addss %xmm3, %xmm11 + movss -32 * SIZE(A2), %xmm0 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm12 + movss -32 * SIZE(A2, LDA, 1), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm13 + movss -32 * SIZE(A2, LDA, 2), %xmm2 + mulss %xmm4, %xmm2 + addss %xmm2, %xmm14 + movss -32 * SIZE(A2, LDA3, 1), %xmm3 + mulss %xmm4, %xmm3 + addss %xmm3, %xmm15 + + addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, X1 + ALIGN_3 + +.L1X: + testq $2 * SIZE, A1 + je .L1XX + +#ifdef movsd + xorps %xmm0, %xmm0 + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(A1, LDA, 1), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd -32 * SIZE(A1, LDA, 2), %xmm2 + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd -32 * SIZE(A1, LDA3, 1), %xmm3 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm11 + movsd -32 * SIZE(A2), %xmm0 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm12 + movsd -32 * SIZE(A2, LDA, 1), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm13 + movsd -32 * SIZE(A2, LDA, 2), %xmm2 + mulps %xmm4, %xmm2 + addps %xmm2, %xmm14 + movsd -32 * SIZE(A2, LDA3, 1), %xmm3 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm15 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_3 + +.L1XX: +#endif + + MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) + MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) + +#ifdef PREFETCHW + PREFETCHW 8 * SIZE(Y1) +#endif + + movq MM, I + sarq $4, I + jle .L15 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm1) + MOVUPS_A2 (-32 * SIZE, A1, LDA, 2, %xmm2) + MOVUPS_A2 (-32 * SIZE, A1, LDA3, 1, %xmm3) + + decq I + jle .L13 + ALIGN_4 + +.L12: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1) +#endif + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-32 * SIZE, A2, %xmm0) + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-32 * SIZE, A2, LDA, 1, %xmm1) + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A2 (-32 * SIZE, A2, LDA, 2, %xmm2) + mulps %xmm4, %xmm3 + addps %xmm3, %xmm11 + MOVUPS_A2 (-32 * SIZE, A2, LDA3, 1, %xmm3) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA, 1) +#endif + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm12 + MOVUPS_A1 (-28 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm1 + addps %xmm1, %xmm13 + MOVUPS_A2 (-28 * SIZE, A1, LDA, 1, %xmm1) + mulps %xmm4, %xmm2 + addps %xmm2, %xmm14 + MOVUPS_A2 (-28 * SIZE, A1, LDA, 2, %xmm2) + mulps %xmm4, %xmm3 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm3, %xmm15 + MOVUPS_A2 (-28 * SIZE, A1, LDA3, 1, %xmm3) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA, 2) +#endif + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A2, %xmm0) + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + 
MOVUPS_A2 (-28 * SIZE, A2, LDA, 1, %xmm1) + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A2 (-28 * SIZE, A2, LDA, 2, %xmm2) + mulps %xmm5, %xmm3 + addps %xmm3, %xmm11 + MOVUPS_A2 (-28 * SIZE, A2, LDA3, 1, %xmm3) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA3) +#endif + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm12 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + mulps %xmm5, %xmm1 + addps %xmm1, %xmm13 + MOVUPS_A2 (-24 * SIZE, A1, LDA, 1, %xmm1) + mulps %xmm5, %xmm2 + addps %xmm2, %xmm14 + MOVUPS_A2 (-24 * SIZE, A1, LDA, 2, %xmm2) + mulps %xmm5, %xmm3 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm3, %xmm15 + MOVUPS_A2 (-24 * SIZE, A1, LDA3, 1, %xmm3) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2) +#endif + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A2, %xmm0) + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-24 * SIZE, A2, LDA, 1, %xmm1) + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A2 (-24 * SIZE, A2, LDA, 2, %xmm2) + mulps %xmm4, %xmm3 + addps %xmm3, %xmm11 + MOVUPS_A2 (-24 * SIZE, A2, LDA3, 1, %xmm3) + mulps %xmm4, %xmm0 + addps %xmm0, %xmm12 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA, 1) +#endif + + MOVUPS_A1 (-20 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm1 + addps %xmm1, %xmm13 + MOVUPS_A2 (-20 * SIZE, A1, LDA, 1, %xmm1) + mulps %xmm4, %xmm2 + addps %xmm2, %xmm14 + MOVUPS_A2 (-20 * SIZE, A1, LDA, 2, %xmm2) + mulps %xmm4, %xmm3 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm3, %xmm15 + MOVUPS_A2 (-20 * SIZE, A1, LDA3, 1, %xmm3) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA, 2) +#endif + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-20 * SIZE, A2, %xmm0) + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-20 * SIZE, A2, LDA, 1, %xmm1) + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A2 (-20 * SIZE, A2, LDA, 2, %xmm2) + mulps %xmm5, %xmm3 + addps %xmm3, %xmm11 + MOVUPS_A2 (-20 * SIZE, A2, LDA3, 1, %xmm3) + mulps %xmm5, %xmm0 + addps %xmm0, %xmm12 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA3) +#endif + + MOVUPS_A1 (-16 * SIZE, A1, %xmm0) + mulps %xmm5, %xmm1 + addps %xmm1, %xmm13 + MOVUPS_A2 (-16 * SIZE, A1, LDA, 1, %xmm1) + mulps %xmm5, %xmm2 + addps %xmm2, %xmm14 + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1) +#endif + + MOVUPS_A2 (-16 * SIZE, A1, LDA, 2, %xmm2) + mulps %xmm5, %xmm3 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm3, %xmm15 + MOVUPS_A2 (-16 * SIZE, A1, LDA3, 1, %xmm3) + + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + addq $16 * SIZE, X1 + + decq I + jg .L12 + ALIGN_4 + +.L13: + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-32 * SIZE, A2, %xmm0) + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-32 * SIZE, A2, LDA, 1, %xmm1) + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A2 (-32 * SIZE, A2, LDA, 2, %xmm2) + mulps %xmm4, %xmm3 + addps %xmm3, %xmm11 + MOVUPS_A2 (-32 * SIZE, A2, LDA3, 1, %xmm3) + mulps %xmm4, %xmm0 + addps %xmm0, %xmm12 + MOVUPS_A1 (-28 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm1 + addps %xmm1, %xmm13 + MOVUPS_A2 (-28 * SIZE, A1, LDA, 1, %xmm1) + mulps %xmm4, %xmm2 + addps %xmm2, %xmm14 + MOVUPS_A2 (-28 * SIZE, A1, LDA, 2, %xmm2) + mulps %xmm4, %xmm3 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm3, %xmm15 + MOVUPS_A2 (-28 * SIZE, A1, LDA3, 1, %xmm3) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A2, %xmm0) + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-28 * SIZE, A2, LDA, 1, %xmm1) + mulps %xmm5, %xmm2 + addps %xmm2, 
%xmm10 + MOVUPS_A2 (-28 * SIZE, A2, LDA, 2, %xmm2) + mulps %xmm5, %xmm3 + addps %xmm3, %xmm11 + MOVUPS_A2 (-28 * SIZE, A2, LDA3, 1, %xmm3) + mulps %xmm5, %xmm0 + addps %xmm0, %xmm12 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + mulps %xmm5, %xmm1 + addps %xmm1, %xmm13 + MOVUPS_A2 (-24 * SIZE, A1, LDA, 1, %xmm1) + mulps %xmm5, %xmm2 + addps %xmm2, %xmm14 + MOVUPS_A2 (-24 * SIZE, A1, LDA, 2, %xmm2) + mulps %xmm5, %xmm3 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm3, %xmm15 + MOVUPS_A2 (-24 * SIZE, A1, LDA3, 1, %xmm3) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A2, %xmm0) + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-24 * SIZE, A2, LDA, 1, %xmm1) + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A2 (-24 * SIZE, A2, LDA, 2, %xmm2) + mulps %xmm4, %xmm3 + addps %xmm3, %xmm11 + MOVUPS_A2 (-24 * SIZE, A2, LDA3, 1, %xmm3) + mulps %xmm4, %xmm0 + addps %xmm0, %xmm12 + MOVUPS_A1 (-20 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm1 + addps %xmm1, %xmm13 + MOVUPS_A2 (-20 * SIZE, A1, LDA, 1, %xmm1) + mulps %xmm4, %xmm2 + addps %xmm2, %xmm14 + MOVUPS_A2 (-20 * SIZE, A1, LDA, 2, %xmm2) + mulps %xmm4, %xmm3 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm3, %xmm15 + MOVUPS_A2 (-20 * SIZE, A1, LDA3, 1, %xmm3) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-20 * SIZE, A2, %xmm0) + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-20 * SIZE, A2, LDA, 1, %xmm1) + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A2 (-20 * SIZE, A2, LDA, 2, %xmm2) + mulps %xmm5, %xmm3 + addps %xmm3, %xmm11 + MOVUPS_A2 (-20 * SIZE, A2, LDA3, 1, %xmm3) + mulps %xmm5, %xmm0 + addps %xmm0, %xmm12 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm13 + mulps %xmm5, %xmm2 + addps %xmm2, %xmm14 + mulps %xmm5, %xmm3 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm3, %xmm15 + + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + addq $16 * SIZE, X1 + ALIGN_4 + +.L15: + testq $8, MM + jle .L16 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm1) + MOVUPS_A2 (-32 * SIZE, A1, LDA, 2, %xmm2) + MOVUPS_A2 (-32 * SIZE, A1, LDA3, 1, %xmm3) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-32 * SIZE, A2, %xmm0) + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-32 * SIZE, A2, LDA, 1, %xmm1) + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A2 (-32 * SIZE, A2, LDA, 2, %xmm2) + mulps %xmm4, %xmm3 + addps %xmm3, %xmm11 + MOVUPS_A2 (-32 * SIZE, A2, LDA3, 1, %xmm3) + mulps %xmm4, %xmm0 + addps %xmm0, %xmm12 + MOVUPS_A1 (-28 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm1 + addps %xmm1, %xmm13 + MOVUPS_A2 (-28 * SIZE, A1, LDA, 1, %xmm1) + mulps %xmm4, %xmm2 + addps %xmm2, %xmm14 + MOVUPS_A2 (-28 * SIZE, A1, LDA, 2, %xmm2) + mulps %xmm4, %xmm3 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm3, %xmm15 + MOVUPS_A2 (-28 * SIZE, A1, LDA3, 1, %xmm3) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A2, %xmm0) + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-28 * SIZE, A2, LDA, 1, %xmm1) + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A2 (-28 * SIZE, A2, LDA, 2, %xmm2) + mulps %xmm5, %xmm3 + addps %xmm3, %xmm11 + MOVUPS_A2 (-28 * SIZE, A2, LDA3, 1, %xmm3) + mulps %xmm5, %xmm0 + addps %xmm0, %xmm12 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm13 + mulps %xmm5, %xmm2 + addps %xmm2, %xmm14 + mulps %xmm5, %xmm3 + addps %xmm3, %xmm15 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + ALIGN_4 + +.L16: + testq $4, MM + jle .L17 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm1) + MOVUPS_A2 (-32 * SIZE, A1, LDA, 2, %xmm2) + MOVUPS_A2 (-32 
* SIZE, A1, LDA3, 1, %xmm3) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-32 * SIZE, A2, %xmm0) + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-32 * SIZE, A2, LDA, 1, %xmm1) + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A2 (-32 * SIZE, A2, LDA, 2, %xmm2) + mulps %xmm4, %xmm3 + addps %xmm3, %xmm11 + MOVUPS_A2 (-32 * SIZE, A2, LDA3, 1, %xmm3) + mulps %xmm4, %xmm0 + addps %xmm0, %xmm12 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm13 + mulps %xmm4, %xmm2 + addps %xmm2, %xmm14 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm15 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, X1 + ALIGN_4 + +.L17: + testq $2, MM + jle .L18 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd -32 * SIZE(A1), %xmm0 +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(A1, LDA, 1), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd -32 * SIZE(A1, LDA, 2), %xmm2 + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd -32 * SIZE(A1, LDA3, 1), %xmm3 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm11 + movsd -32 * SIZE(A2), %xmm0 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm12 + movsd -32 * SIZE(A2, LDA, 1), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm13 + movsd -32 * SIZE(A2, LDA, 2), %xmm2 + mulps %xmm4, %xmm2 + addps %xmm2, %xmm14 + movsd -32 * SIZE(A2, LDA3, 1), %xmm3 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm15 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_4 + +.L18: + testq $1, MM + jle .L19 + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + movss -32 * SIZE(A1, LDA, 1), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm9 + movss -32 * SIZE(A1, LDA, 2), %xmm2 + mulss %xmm4, %xmm2 + addss %xmm2, %xmm10 + movss -32 * SIZE(A1, LDA3, 1), %xmm3 + mulss %xmm4, %xmm3 + addss %xmm3, %xmm11 + movss -32 * SIZE(A2), %xmm0 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm12 + movss -32 * SIZE(A2, LDA, 1), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm13 + movss -32 * SIZE(A2, LDA, 2), %xmm2 + mulss %xmm4, %xmm2 + addss %xmm2, %xmm14 + movss -32 * SIZE(A2, LDA3, 1), %xmm3 + mulss %xmm4, %xmm3 + addss %xmm3, %xmm15 + ALIGN_4 + +.L19: +#ifdef HAVE_SSE3 + haddps %xmm9, %xmm8 + haddps %xmm11, %xmm10 + haddps %xmm10, %xmm8 + + pshufd $0x1, %xmm8, %xmm9 + pshufd $0x2, %xmm8, %xmm10 + pshufd $0x3, %xmm8, %xmm11 + + haddps %xmm13, %xmm12 + haddps %xmm15, %xmm14 + haddps %xmm14, %xmm12 + + pshufd $0x1, %xmm12, %xmm13 + pshufd $0x2, %xmm12, %xmm14 + pshufd $0x3, %xmm12, %xmm15 +#else + movaps %xmm8, %xmm0 + unpcklps %xmm9, %xmm8 + unpckhps %xmm9, %xmm0 + + movaps %xmm10, %xmm1 + unpcklps %xmm11, %xmm10 + unpckhps %xmm11, %xmm1 + + movaps %xmm8, %xmm9 + unpcklps %xmm10, %xmm8 + unpckhps %xmm10, %xmm9 + + movaps %xmm0, %xmm10 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm10 + + addps %xmm9, %xmm8 + addps %xmm0, %xmm10 + addps %xmm10, %xmm8 + + pshufd $0x2, %xmm8, %xmm9 + pshufd $0x1, %xmm8, %xmm10 + pshufd $0x3, %xmm8, %xmm11 + + movaps %xmm12, %xmm0 + unpcklps %xmm13, %xmm12 + unpckhps %xmm13, %xmm0 + + movaps %xmm14, %xmm1 + unpcklps %xmm15, %xmm14 + unpckhps %xmm15, %xmm1 + + movaps %xmm12, %xmm13 + unpcklps %xmm14, %xmm12 + unpckhps %xmm14, %xmm13 + + movaps %xmm0, %xmm14 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm14 + + addps %xmm13, %xmm12 + addps %xmm0, %xmm14 + addps %xmm14, %xmm12 + + pshufd $0x2, %xmm12, %xmm13 + pshufd $0x1, %xmm12, 
%xmm14 + pshufd $0x3, %xmm12, %xmm15 +#endif + + mulss ALPHA, %xmm8 + mulss ALPHA, %xmm9 + mulss ALPHA, %xmm10 + mulss ALPHA, %xmm11 + mulss ALPHA, %xmm12 + mulss ALPHA, %xmm13 + mulss ALPHA, %xmm14 + mulss ALPHA, %xmm15 + + addss (Y), %xmm8 + addq INCY, Y + addss (Y), %xmm9 + addq INCY, Y + addss (Y), %xmm10 + addq INCY, Y + addss (Y), %xmm11 + addq INCY, Y + addss (Y), %xmm12 + addq INCY, Y + addss (Y), %xmm13 + addq INCY, Y + addss (Y), %xmm14 + addq INCY, Y + addss (Y), %xmm15 + addq INCY, Y + + movss %xmm8, (Y1) + addq INCY, Y1 + movss %xmm9, (Y1) + addq INCY, Y1 + movss %xmm10, (Y1) + addq INCY, Y1 + movss %xmm11, (Y1) + addq INCY, Y1 + movss %xmm12, (Y1) + addq INCY, Y1 + movss %xmm13, (Y1) + addq INCY, Y1 + movss %xmm14, (Y1) + addq INCY, Y1 + movss %xmm15, (Y1) + addq INCY, Y1 + + cmpq $8, N + jge .L11 + ALIGN_4 + +.L20: +#endif + + cmpq $4, N + jl .L30 + +#if GEMV_UNROLL == 4 + ALIGN_3 + +.L21: +#endif + subq $4, N + + leaq 32 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA, 2), A2 + leaq (A1, LDA, 4), A + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#ifdef ALIGNED_ACCESS + cmpq $3, M + jle .L27 + + testq $SIZE, A1 + je .L2X + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + movss -32 * SIZE(A1, LDA), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm9 + movss -32 * SIZE(A2), %xmm2 + mulss %xmm4, %xmm2 + addss %xmm2, %xmm10 + movss -32 * SIZE(A2, LDA), %xmm3 + mulss %xmm4, %xmm3 + addss %xmm3, %xmm11 + + addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, X1 + ALIGN_3 + +.L2X: + testq $2 * SIZE, A1 + je .L2XX + +#ifdef movsd + xorps %xmm0, %xmm0 + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(A1, LDA), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd -32 * SIZE(A2), %xmm2 + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd -32 * SIZE(A2, LDA), %xmm3 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm11 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_3 + +.L2XX: +#endif + + MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) + MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) + +#if (GEMV_UNROLL == 4) && defined(PREFETCHW) + PREFETCHW 4 * SIZE(Y1) +#endif + + movq MM, I + sarq $4, I + jle .L25 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm1) + MOVUPS_A1 (-32 * SIZE, A2, %xmm2) + MOVUPS_A2 (-32 * SIZE, A2, LDA, 1, %xmm3) + + MOVUPS_A1 (-28 * SIZE, A1, %xmm12) + MOVUPS_A2 (-28 * SIZE, A1, LDA, 1, %xmm13) + MOVUPS_A1 (-28 * SIZE, A2, %xmm14) + MOVUPS_A2 (-28 * SIZE, A2, LDA, 1, %xmm15) + + decq I + jle .L23 + ALIGN_4 + +.L22: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-24 * SIZE, A1, LDA, 1, %xmm1) + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A1 (-24 * SIZE, A2, %xmm2) + mulps %xmm4, %xmm3 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm3, %xmm11 + MOVUPS_A2 (-24 * SIZE, A2, LDA, 1, %xmm3) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1, LDA) +#endif + + mulps %xmm5, %xmm12 + addps %xmm12, %xmm8 + MOVUPS_A1 (-20 * SIZE, A1, %xmm12) + mulps %xmm5, %xmm13 + addps %xmm13, %xmm9 + MOVUPS_A2 (-20 * SIZE, A1, LDA, 1, %xmm13) + mulps %xmm5, 
%xmm14 + addps %xmm14, %xmm10 + MOVUPS_A1 (-20 * SIZE, A2, %xmm14) + mulps %xmm5, %xmm15 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm15, %xmm11 + MOVUPS_A2 (-20 * SIZE, A2, LDA, 1, %xmm15) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) +#endif + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-16 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-16 * SIZE, A1, LDA, 1, %xmm1) + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A1 (-16 * SIZE, A2, %xmm2) + mulps %xmm4, %xmm3 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm3, %xmm11 + MOVUPS_A2 (-16 * SIZE, A2, LDA, 1, %xmm3) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2, LDA) +#endif + + mulps %xmm5, %xmm12 + addps %xmm12, %xmm8 + MOVUPS_A1 (-12 * SIZE, A1, %xmm12) + mulps %xmm5, %xmm13 + addps %xmm13, %xmm9 + MOVUPS_A2 (-12 * SIZE, A1, LDA, 1, %xmm13) + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1) +#endif + + mulps %xmm5, %xmm14 + addps %xmm14, %xmm10 + MOVUPS_A1 (-12 * SIZE, A2, %xmm14) + mulps %xmm5, %xmm15 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm15, %xmm11 + MOVUPS_A2 (-12 * SIZE, A2, LDA, 1, %xmm15) + + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + addq $16 * SIZE, X1 + + decq I + jg .L22 + ALIGN_4 + +.L23: + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-24 * SIZE, A1, LDA, 1, %xmm1) + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A1 (-24 * SIZE, A2, %xmm2) + mulps %xmm4, %xmm3 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm3, %xmm11 + MOVUPS_A2 (-24 * SIZE, A2, LDA, 1, %xmm3) + + mulps %xmm5, %xmm12 + addps %xmm12, %xmm8 + MOVUPS_A1 (-20 * SIZE, A1, %xmm12) + mulps %xmm5, %xmm13 + addps %xmm13, %xmm9 + MOVUPS_A2 (-20 * SIZE, A1, LDA, 1, %xmm13) + mulps %xmm5, %xmm14 + addps %xmm14, %xmm10 + MOVUPS_A1 (-20 * SIZE, A2, %xmm14) + mulps %xmm5, %xmm15 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm15, %xmm11 + MOVUPS_A2 (-20 * SIZE, A2, LDA, 1, %xmm15) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 + mulps %xmm4, %xmm3 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm3, %xmm11 + + mulps %xmm5, %xmm12 + addps %xmm12, %xmm8 + mulps %xmm5, %xmm13 + addps %xmm13, %xmm9 + mulps %xmm5, %xmm14 + addps %xmm14, %xmm10 + mulps %xmm5, %xmm15 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm15, %xmm11 + + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + addq $16 * SIZE, X1 + ALIGN_4 + +.L25: + testq $8, MM + jle .L26 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm1) + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A1 (-32 * SIZE, A2, %xmm2) + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A2 (-32 * SIZE, A2, LDA, 1, %xmm3) + mulps %xmm4, %xmm3 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm3, %xmm11 + + MOVUPS_A1 (-28 * SIZE, A1, %xmm12) + mulps %xmm5, %xmm12 + addps %xmm12, %xmm8 + MOVUPS_A2 (-28 * SIZE, A1, LDA, 1, %xmm13) + mulps %xmm5, %xmm13 + addps %xmm13, %xmm9 + MOVUPS_A1 (-28 * SIZE, A2, %xmm14) + mulps %xmm5, %xmm14 + addps %xmm14, %xmm10 + MOVUPS_A2 (-28 * SIZE, A2, LDA, 1, %xmm15) + mulps %xmm5, %xmm15 + addps %xmm15, %xmm11 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + ALIGN_4 + +.L26: + testq $4, MM + jle .L27 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm1) + mulps %xmm4, %xmm1 + 
addps %xmm1, %xmm9 + MOVUPS_A1 (-32 * SIZE, A2, %xmm2) + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A2 (-32 * SIZE, A2, LDA, 1, %xmm3) + mulps %xmm4, %xmm3 + addps %xmm3, %xmm11 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, X1 + ALIGN_4 + +.L27: + testq $2, MM + jle .L28 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd -32 * SIZE(A1), %xmm0 +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(A1, LDA), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd -32 * SIZE(A2), %xmm2 + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd -32 * SIZE(A2, LDA), %xmm3 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm11 + shufps $0xe, %xmm4, %xmm4 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_4 + +.L28: + testq $1, MM + jle .L29 + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + movss -32 * SIZE(A1, LDA), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm9 + movss -32 * SIZE(A2), %xmm2 + mulss %xmm4, %xmm2 + addss %xmm2, %xmm10 + movss -32 * SIZE(A2, LDA), %xmm3 + mulss %xmm4, %xmm3 + addss %xmm3, %xmm11 + ALIGN_4 + +.L29: +#ifdef HAVE_SSE3 + haddps %xmm9, %xmm8 + haddps %xmm11, %xmm10 + haddps %xmm10, %xmm8 + + pshufd $0x1, %xmm8, %xmm9 + pshufd $0x2, %xmm8, %xmm10 + pshufd $0x3, %xmm8, %xmm11 +#else + movaps %xmm8, %xmm0 + unpcklps %xmm9, %xmm8 + unpckhps %xmm9, %xmm0 + + movaps %xmm10, %xmm1 + unpcklps %xmm11, %xmm10 + unpckhps %xmm11, %xmm1 + + movaps %xmm8, %xmm9 + unpcklps %xmm10, %xmm8 + unpckhps %xmm10, %xmm9 + + movaps %xmm0, %xmm10 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm10 + + addps %xmm9, %xmm8 + addps %xmm0, %xmm10 + addps %xmm10, %xmm8 + + pshufd $0x2, %xmm8, %xmm9 + pshufd $0x1, %xmm8, %xmm10 + pshufd $0x3, %xmm8, %xmm11 +#endif + + mulss ALPHA, %xmm8 + mulss ALPHA, %xmm9 + mulss ALPHA, %xmm10 + mulss ALPHA, %xmm11 + + addss (Y), %xmm8 + addq INCY, Y + addss (Y), %xmm9 + addq INCY, Y + addss (Y), %xmm10 + addq INCY, Y + addss (Y), %xmm11 + addq INCY, Y + + movss %xmm8, (Y1) + addq INCY, Y1 + movss %xmm9, (Y1) + addq INCY, Y1 + movss %xmm10, (Y1) + addq INCY, Y1 + movss %xmm11, (Y1) + addq INCY, Y1 + +#if GEMV_UNROLL == 4 + cmpq $4, N + jge .L21 +#endif + ALIGN_4 + +.L30: + cmpq $3, N + jne .L40 + + leaq 32 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA, 2), A2 + leaq (A1, LDA, 4), A + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + +#ifdef ALIGNED_ACCESS + cmpq $3, M + jle .L37 + + testq $SIZE, A1 + je .L3X + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + movss -32 * SIZE(A1, LDA), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm9 + movss -32 * SIZE(A2), %xmm2 + mulss %xmm4, %xmm2 + addss %xmm2, %xmm10 + movss -32 * SIZE(A2, LDA), %xmm3 + mulss %xmm4, %xmm3 + addss %xmm3, %xmm11 + + addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, X1 + ALIGN_3 + +.L3X: + testq $2 * SIZE, A1 + je .L3XX + +#ifdef movsd + xorps %xmm0, %xmm0 + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(A1, LDA), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd -32 * SIZE(A2), %xmm2 + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 +#ifdef movsd + 
xorps %xmm3, %xmm3 +#endif + movsd -32 * SIZE(A2, LDA), %xmm3 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm11 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_3 + +.L3XX: +#endif + + MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) + MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) + +#if (GEMV_UNROLL == 4) && defined(PREFETCHW) + PREFETCHW 4 * SIZE(Y1) +#endif + + movq MM, I + sarq $4, I + jle .L35 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm1) + MOVUPS_A1 (-32 * SIZE, A2, %xmm2) + + MOVUPS_A1 (-28 * SIZE, A1, %xmm12) + MOVUPS_A2 (-28 * SIZE, A1, LDA, 1, %xmm13) + MOVUPS_A1 (-28 * SIZE, A2, %xmm14) + + decq I + jle .L33 + ALIGN_4 + +.L32: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-24 * SIZE, A1, LDA, 1, %xmm1) + mulps %xmm4, %xmm2 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm2, %xmm10 + MOVUPS_A1 (-24 * SIZE, A2, %xmm2) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1, LDA) +#endif + + mulps %xmm5, %xmm12 + addps %xmm12, %xmm8 + MOVUPS_A1 (-20 * SIZE, A1, %xmm12) + mulps %xmm5, %xmm13 + addps %xmm13, %xmm9 + MOVUPS_A2 (-20 * SIZE, A1, LDA, 1, %xmm13) + mulps %xmm5, %xmm14 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm14, %xmm10 + MOVUPS_A1 (-20 * SIZE, A2, %xmm14) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A2) +#endif + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-16 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-16 * SIZE, A1, LDA, 1, %xmm1) + mulps %xmm4, %xmm2 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm2, %xmm10 + MOVUPS_A1 (-16 * SIZE, A2, %xmm2) + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(X1) +#endif + + mulps %xmm5, %xmm12 + addps %xmm12, %xmm8 + MOVUPS_A1 (-12 * SIZE, A1, %xmm12) + mulps %xmm5, %xmm13 + addps %xmm13, %xmm9 + MOVUPS_A2 (-12 * SIZE, A1, LDA, 1, %xmm13) + mulps %xmm5, %xmm14 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm14, %xmm10 + MOVUPS_A1 (-12 * SIZE, A2, %xmm14) + + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + addq $16 * SIZE, X1 + + decq I + jg .L32 + ALIGN_4 + +.L33: + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-24 * SIZE, A1, LDA, 1, %xmm1) + mulps %xmm4, %xmm2 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm2, %xmm10 + MOVUPS_A1 (-24 * SIZE, A2, %xmm2) + + mulps %xmm5, %xmm12 + addps %xmm12, %xmm8 + MOVUPS_A1 (-20 * SIZE, A1, %xmm12) + mulps %xmm5, %xmm13 + addps %xmm13, %xmm9 + MOVUPS_A2 (-20 * SIZE, A1, LDA, 1, %xmm13) + mulps %xmm5, %xmm14 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm14, %xmm10 + MOVUPS_A1 (-20 * SIZE, A2, %xmm14) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + + mulps %xmm5, %xmm12 + addps %xmm12, %xmm8 + mulps %xmm5, %xmm13 + addps %xmm13, %xmm9 + mulps %xmm5, %xmm14 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm14, %xmm10 + + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + addq $16 * SIZE, X1 + ALIGN_4 + +.L35: + testq $8, MM + jle .L36 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm1) + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A1 (-32 * SIZE, A2, %xmm2) + mulps %xmm4, %xmm2 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm2, %xmm10 + 
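+/* second half of the 8-element remainder: the three column vectors at
+   -28 * SIZE are multiplied by the x quad loaded from -28 * SIZE(X1),
+   which is still held in xmm5 */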
+ MOVUPS_A1 (-28 * SIZE, A1, %xmm12) + mulps %xmm5, %xmm12 + addps %xmm12, %xmm8 + MOVUPS_A2 (-28 * SIZE, A1, LDA, 1, %xmm13) + mulps %xmm5, %xmm13 + addps %xmm13, %xmm9 + MOVUPS_A1 (-28 * SIZE, A2, %xmm14) + mulps %xmm5, %xmm14 + addps %xmm14, %xmm10 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + ALIGN_4 + +.L36: + testq $4, MM + jle .L37 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A2 (-32 * SIZE, A1, LDA, 1, %xmm1) + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A1 (-32 * SIZE, A2, %xmm2) + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, X1 + ALIGN_4 + +.L37: + testq $2, MM + jle .L38 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd -32 * SIZE(A1), %xmm0 +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(A1, LDA), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd -32 * SIZE(A2), %xmm2 + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_4 + +.L38: + testq $1, MM + jle .L39 + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + movss -32 * SIZE(A1, LDA), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm9 + movss -32 * SIZE(A2), %xmm2 + mulss %xmm4, %xmm2 + addss %xmm2, %xmm10 + ALIGN_4 + +.L39: +#ifdef HAVE_SSE3 + haddps %xmm9, %xmm8 + haddps %xmm11, %xmm10 + haddps %xmm10, %xmm8 + + pshufd $0x1, %xmm8, %xmm9 + pshufd $0x2, %xmm8, %xmm10 +#else + movaps %xmm8, %xmm0 + unpcklps %xmm9, %xmm8 + unpckhps %xmm9, %xmm0 + + movaps %xmm10, %xmm1 + unpcklps %xmm11, %xmm10 + unpckhps %xmm11, %xmm1 + + movaps %xmm8, %xmm9 + unpcklps %xmm10, %xmm8 + unpckhps %xmm10, %xmm9 + + movaps %xmm0, %xmm10 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm10 + + addps %xmm9, %xmm8 + addps %xmm0, %xmm10 + addps %xmm10, %xmm8 + + pshufd $0x2, %xmm8, %xmm9 + pshufd $0x1, %xmm8, %xmm10 +#endif + + mulss ALPHA, %xmm8 + mulss ALPHA, %xmm9 + mulss ALPHA, %xmm10 + + addss (Y), %xmm8 + addq INCY, Y + addss (Y), %xmm9 + addq INCY, Y + addss (Y), %xmm10 + addq INCY, Y + + movss %xmm8, (Y1) + addq INCY, Y1 + movss %xmm9, (Y1) + addq INCY, Y1 + movss %xmm10, (Y1) + addq INCY, Y1 + jmp .L999 + ALIGN_4 + +.L40: + cmpq $2, N + jne .L50 + + leaq 32 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA), A2 + leaq (A1, LDA, 2), A + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#ifdef ALIGNED_ACCESS + cmpq $3, M + jle .L47 + + testq $SIZE, A1 + je .L4X + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + movss -32 * SIZE(A2), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm9 + + addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, X1 + ALIGN_3 + +.L4X: + testq $2 * SIZE, A1 + je .L4XX + +#ifdef movsd + xorps %xmm0, %xmm0 + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(A2), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_3 + +.L4XX: +#endif + + MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) + MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) + + movq MM, I + sarq $4, I + jle .L45 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A1 (-32 * SIZE, A2, 
%xmm1) + MOVUPS_A1 (-28 * SIZE, A1, %xmm12) + MOVUPS_A1 (-28 * SIZE, A2, %xmm13) + + decq I + jle .L43 + ALIGN_4 + +.L42: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm1 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm1, %xmm9 + MOVUPS_A1 (-24 * SIZE, A2, %xmm1) + + mulps %xmm5, %xmm12 + addps %xmm12, %xmm8 + MOVUPS_A1 (-20 * SIZE, A1, %xmm12) + mulps %xmm5, %xmm13 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm13, %xmm9 + MOVUPS_A1 (-20 * SIZE, A2, %xmm13) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A2) +#endif + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-16 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm1 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm1, %xmm9 + MOVUPS_A1 (-16 * SIZE, A2, %xmm1) + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1) +#endif + + mulps %xmm5, %xmm12 + addps %xmm12, %xmm8 + MOVUPS_A1 (-12 * SIZE, A1, %xmm12) + mulps %xmm5, %xmm13 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm13, %xmm9 + MOVUPS_A1 (-12 * SIZE, A2, %xmm13) + + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + addq $16 * SIZE, X1 + + decq I + jg .L42 + ALIGN_4 + +.L43: + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm1 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm1, %xmm9 + MOVUPS_A1 (-24 * SIZE, A2, %xmm1) + + mulps %xmm5, %xmm12 + addps %xmm12, %xmm8 + MOVUPS_A1 (-20 * SIZE, A1, %xmm12) + mulps %xmm5, %xmm13 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm13, %xmm9 + MOVUPS_A1 (-20 * SIZE, A2, %xmm13) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + mulps %xmm4, %xmm1 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm1, %xmm9 + + mulps %xmm5, %xmm12 + addps %xmm12, %xmm8 + mulps %xmm5, %xmm13 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm13, %xmm9 + + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + addq $16 * SIZE, X1 + ALIGN_4 + +.L45: + testq $8, MM + jle .L46 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-32 * SIZE, A2, %xmm1) + mulps %xmm4, %xmm1 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm1, %xmm9 + + MOVUPS_A1 (-28 * SIZE, A1, %xmm12) + mulps %xmm5, %xmm12 + addps %xmm12, %xmm8 + MOVUPS_A1 (-28 * SIZE, A2, %xmm13) + mulps %xmm5, %xmm13 + addps %xmm13, %xmm9 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + ALIGN_4 + +.L46: + testq $4, MM + jle .L47 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-32 * SIZE, A2, %xmm1) + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, X1 + ALIGN_4 + +.L47: + testq $2, MM + jle .L48 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd -32 * SIZE(A1), %xmm0 +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(A2), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 + shufps $0xe, %xmm4, %xmm4 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_4 + +.L48: + testq $1, MM + jle .L49 + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + movss -32 * SIZE(A2), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm9 + ALIGN_4 + +.L49: +#ifdef HAVE_SSE3 + haddps %xmm9, %xmm8 + haddps %xmm8, %xmm8 +#else + movaps %xmm8, %xmm10 + unpcklps %xmm9, %xmm8 + unpckhps %xmm9, %xmm10 + + addps %xmm10, %xmm8 
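+/* fold the high half into the low half; after this xmm8[0] and xmm8[1]
+   hold the two column sums, matching the layout of the SSE3 haddps path */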
+ movhlps %xmm8, %xmm9 + addps %xmm9, %xmm8 +#endif + + pshufd $0x1, %xmm8, %xmm9 + + mulss ALPHA, %xmm8 + mulss ALPHA, %xmm9 + + addss (Y), %xmm8 + addq INCY, Y + addss (Y), %xmm9 + addq INCY, Y + + movss %xmm8, (Y1) + addq INCY, Y1 + movss %xmm9, (Y1) + addq INCY, Y1 + jmp .L999 + ALIGN_4 + +.L50: + cmpq $1, N + jne .L999 + + leaq 32 * SIZE(BUFFER), X1 + + movq A, A1 + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#ifdef ALIGNED_ACCESS + cmpq $3, M + jle .L57 + + testq $SIZE, A1 + je .L5X + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + + addq $1 * SIZE, A1 + addq $1 * SIZE, X1 + ALIGN_3 + +.L5X: + testq $2 * SIZE, A1 + je .L5XX + + +#ifdef movsd + xorps %xmm0, %xmm0 + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + shufps $0xe, %xmm4, %xmm4 + + addq $2 * SIZE, A1 + addq $2 * SIZE, X1 + ALIGN_3 + +.L5XX: +#endif + + MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) + MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) + + movq MM, I + sarq $4, I + jle .L55 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A1 (-28 * SIZE, A1, %xmm12) + + decq I + jle .L53 + ALIGN_4 + +.L52: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm4, %xmm0 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + + mulps %xmm5, %xmm12 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm12, %xmm9 + MOVUPS_A1 (-20 * SIZE, A1, %xmm12) + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(X1) +#endif + + mulps %xmm4, %xmm0 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm0, %xmm8 + MOVUPS_A1 (-16 * SIZE, A1, %xmm0) + + mulps %xmm5, %xmm12 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm12, %xmm9 + MOVUPS_A1 (-12 * SIZE, A1, %xmm12) + + addq $16 * SIZE, A1 + addq $16 * SIZE, X1 + + decq I + jg .L52 + ALIGN_4 + +.L53: + mulps %xmm4, %xmm0 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + + mulps %xmm5, %xmm12 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm12, %xmm9 + MOVUPS_A1 (-20 * SIZE, A1, %xmm12) + + mulps %xmm4, %xmm0 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm0, %xmm8 + + mulps %xmm5, %xmm12 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm12, %xmm9 + + addq $16 * SIZE, A1 + addq $16 * SIZE, X1 + ALIGN_4 + +.L55: + testq $8, MM + jle .L56 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm0 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm0, %xmm8 + + MOVUPS_A1 (-28 * SIZE, A1, %xmm12) + mulps %xmm5, %xmm12 + addps %xmm12, %xmm9 + + addq $8 * SIZE, A1 + addq $8 * SIZE, X1 + ALIGN_4 + +.L56: + testq $4, MM + jle .L57 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + + addq $4 * SIZE, A1 + addq $4 * SIZE, X1 + ALIGN_4 + +.L57: + testq $2, MM + jle .L58 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd -32 * SIZE(A1), %xmm0 +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + shufps $0xe, %xmm4, %xmm4 + + addq $2 * SIZE, A1 + addq $2 * SIZE, X1 + ALIGN_4 + +.L58: + testq $1, MM + jle .L59 + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + ALIGN_4 + +.L59: + addps %xmm9, %xmm8 + +#ifdef HAVE_SSE3 + haddps %xmm8, %xmm8 + haddps %xmm8, %xmm8 +#else + pshufd $1, %xmm8, %xmm9 + pshufd $2, %xmm8, %xmm10 + pshufd $3, %xmm8, %xmm11 + + addss %xmm9, %xmm8 + addss %xmm11, %xmm10 + addss %xmm10, %xmm8 +#endif + + mulss 
ALPHA, %xmm8 + + addss (Y), %xmm8 + movss %xmm8, (Y1) + +#ifdef ALIGNED_ACCESS + jmp .L999 + ALIGN_4 + +.L100: + testq $2 * SIZE - 1, LDA + jne .L200 + + cmpq $4, N + jl .L110 + ALIGN_3 + +.L101: + subq $4, N + + leaq 32 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA, 2), A2 + leaq (A1, LDA, 4), A + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + + cmpq $3, M + jle .L107 + + testq $SIZE, A1 + je .L10X + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + movss -32 * SIZE(A1, LDA), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm9 + movss -32 * SIZE(A2), %xmm2 + mulss %xmm4, %xmm2 + addss %xmm2, %xmm10 + movss -32 * SIZE(A2, LDA), %xmm3 + mulss %xmm4, %xmm3 + addss %xmm3, %xmm11 + + addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, X1 + ALIGN_3 + +.L10X: + testq $2 * SIZE, A1 + je .L10XX + +#ifdef movsd + xorps %xmm0, %xmm0 + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(A1, LDA), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd -32 * SIZE(A2), %xmm2 + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd -32 * SIZE(A2, LDA), %xmm3 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm11 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_3 + +.L10XX: + MOVUPS_A2 (-34 * SIZE, A1, LDA, 1, %xmm12) + MOVUPS_A2 (-34 * SIZE, A2, LDA, 1, %xmm13) + + MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) + MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) + +#ifdef PREFETCHW + PREFETCHW 4 * SIZE(Y1) +#endif + + movq MM, I + sarq $4, I + jle .L105 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A2 (-30 * SIZE, A1, LDA, 1, %xmm1) + MOVUPS_A1 (-32 * SIZE, A2, %xmm2) + MOVUPS_A2 (-30 * SIZE, A2, LDA, 1, %xmm3) + + decq I + jle .L103 + ALIGN_4 + +.L102: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A1, %xmm0) + shufps $0x4e, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-26 * SIZE, A1, LDA, 1, %xmm12) + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A1 (-28 * SIZE, A2, %xmm2) + shufps $0x4e, %xmm3, %xmm13 + mulps %xmm4, %xmm13 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm13, %xmm11 + MOVUPS_A2 (-26 * SIZE, A2, LDA, 1, %xmm13) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1, LDA) +#endif + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + shufps $0x4e, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-22 * SIZE, A1, LDA, 1, %xmm1) + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A1 (-24 * SIZE, A2, %xmm2) + shufps $0x4e, %xmm13, %xmm3 + mulps %xmm5, %xmm3 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm3, %xmm11 + MOVUPS_A2 (-22 * SIZE, A2, LDA, 1, %xmm3) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) +#endif + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-20 * SIZE, A1, %xmm0) + shufps $0x4e, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-18 * SIZE, A1, LDA, 1, %xmm12) + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A1 (-20 * SIZE, A2, %xmm2) + shufps $0x4e, %xmm3, %xmm13 + mulps %xmm4, %xmm13 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm13, %xmm11 + MOVUPS_A2 (-18 * SIZE, A2, LDA, 1, %xmm13) + +#ifdef PREFETCH + PREFETCH 
(PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2, LDA) +#endif + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-16 * SIZE, A1, %xmm0) + shufps $0x4e, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-14 * SIZE, A1, LDA, 1, %xmm1) + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1) +#endif + + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A1 (-16 * SIZE, A2, %xmm2) + shufps $0x4e, %xmm13, %xmm3 + mulps %xmm5, %xmm3 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm3, %xmm11 + MOVUPS_A2 (-14 * SIZE, A2, LDA, 1, %xmm3) + + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + addq $16 * SIZE, X1 + + decq I + jg .L102 + ALIGN_4 + +.L103: + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A1, %xmm0) + shufps $0x4e, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-26 * SIZE, A1, LDA, 1, %xmm12) + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A1 (-28 * SIZE, A2, %xmm2) + shufps $0x4e, %xmm3, %xmm13 + mulps %xmm4, %xmm13 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm13, %xmm11 + MOVUPS_A2 (-26 * SIZE, A2, LDA, 1, %xmm13) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + shufps $0x4e, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-22 * SIZE, A1, LDA, 1, %xmm1) + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A1 (-24 * SIZE, A2, %xmm2) + shufps $0x4e, %xmm13, %xmm3 + mulps %xmm5, %xmm3 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm3, %xmm11 + MOVUPS_A2 (-22 * SIZE, A2, LDA, 1, %xmm3) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-20 * SIZE, A1, %xmm0) + shufps $0x4e, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-18 * SIZE, A1, LDA, 1, %xmm12) + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A1 (-20 * SIZE, A2, %xmm2) + shufps $0x4e, %xmm3, %xmm13 + mulps %xmm4, %xmm13 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm13, %xmm11 + MOVUPS_A2 (-18 * SIZE, A2, LDA, 1, %xmm13) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + shufps $0x4e, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + shufps $0x4e, %xmm13, %xmm3 + mulps %xmm5, %xmm3 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm3, %xmm11 + + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + addq $16 * SIZE, X1 + ALIGN_4 + +.L105: + testq $8, MM + jle .L106 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A2 (-30 * SIZE, A1, LDA, 1, %xmm1) + MOVUPS_A1 (-32 * SIZE, A2, %xmm2) + MOVUPS_A2 (-30 * SIZE, A2, LDA, 1, %xmm3) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A1, %xmm0) + shufps $0x4e, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-26 * SIZE, A1, LDA, 1, %xmm12) + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A1 (-28 * SIZE, A2, %xmm2) + shufps $0x4e, %xmm3, %xmm13 + mulps %xmm4, %xmm13 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm13, %xmm11 + MOVUPS_A2 (-26 * SIZE, A2, LDA, 1, %xmm13) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + shufps $0x4e, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + shufps $0x4e, %xmm13, %xmm3 + mulps %xmm5, %xmm3 + addps %xmm3, %xmm11 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + ALIGN_4 + +.L106: + testq $4, MM + jle .L107 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A2 (-30 * SIZE, A1, LDA, 1, %xmm1) + shufps $0x4e, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + + MOVUPS_A1 (-32 * SIZE, A2, %xmm2) + mulps %xmm4, %xmm2 
+ addps %xmm2, %xmm10 + MOVUPS_A2 (-30 * SIZE, A2, LDA, 1, %xmm3) + shufps $0x4e, %xmm3, %xmm13 + mulps %xmm4, %xmm13 + addps %xmm13, %xmm11 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, X1 + ALIGN_4 + +.L107: + testq $2, MM + jle .L108 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd -32 * SIZE(A1), %xmm0 +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(A1, LDA), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd -32 * SIZE(A2), %xmm2 + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd -32 * SIZE(A2, LDA), %xmm3 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm11 + shufps $0xe, %xmm4, %xmm4 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_4 + +.L108: + testq $1, MM + jle .L109 + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + movss -32 * SIZE(A1, LDA), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm9 + movss -32 * SIZE(A2), %xmm2 + mulss %xmm4, %xmm2 + addss %xmm2, %xmm10 + movss -32 * SIZE(A2, LDA), %xmm3 + mulss %xmm4, %xmm3 + addss %xmm3, %xmm11 + ALIGN_4 + +.L109: +#ifdef HAVE_SSE3 + haddps %xmm9, %xmm8 + haddps %xmm11, %xmm10 + haddps %xmm10, %xmm8 + + pshufd $0x1, %xmm8, %xmm9 + pshufd $0x2, %xmm8, %xmm10 + pshufd $0x3, %xmm8, %xmm11 +#else + movaps %xmm8, %xmm0 + unpcklps %xmm9, %xmm8 + unpckhps %xmm9, %xmm0 + + movaps %xmm10, %xmm1 + unpcklps %xmm11, %xmm10 + unpckhps %xmm11, %xmm1 + + movaps %xmm8, %xmm9 + unpcklps %xmm10, %xmm8 + unpckhps %xmm10, %xmm9 + + movaps %xmm0, %xmm10 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm10 + + addps %xmm9, %xmm8 + addps %xmm0, %xmm10 + addps %xmm10, %xmm8 + + pshufd $0x2, %xmm8, %xmm9 + pshufd $0x1, %xmm8, %xmm10 + pshufd $0x3, %xmm8, %xmm11 +#endif + + mulss ALPHA, %xmm8 + mulss ALPHA, %xmm9 + mulss ALPHA, %xmm10 + mulss ALPHA, %xmm11 + + addss (Y), %xmm8 + addq INCY, Y + addss (Y), %xmm9 + addq INCY, Y + addss (Y), %xmm10 + addq INCY, Y + addss (Y), %xmm11 + addq INCY, Y + + movss %xmm8, (Y1) + addq INCY, Y1 + movss %xmm9, (Y1) + addq INCY, Y1 + movss %xmm10, (Y1) + addq INCY, Y1 + movss %xmm11, (Y1) + addq INCY, Y1 + + cmpq $4, N + jge .L101 + ALIGN_4 + +.L110: + cmpq $3, N + jne .L120 + + leaq 32 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA, 2), A2 + leaq (A1, LDA, 4), A + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + + cmpq $3, M + jle .L117 + + testq $SIZE, A1 + je .L11X + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + movss -32 * SIZE(A1, LDA), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm9 + movss -32 * SIZE(A2), %xmm2 + mulss %xmm4, %xmm2 + addss %xmm2, %xmm10 + + addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, X1 + ALIGN_3 + +.L11X: + testq $2 * SIZE, A1 + je .L11XX + +#ifdef movsd + xorps %xmm0, %xmm0 + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(A1, LDA), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd -32 * SIZE(A2), %xmm2 + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_3 + +.L11XX: + MOVUPS_A2 (-34 * SIZE, A1, LDA, 1, %xmm12) + + MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) + 
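+/* here LDA is a multiple of two floats (the .L100 test) but apparently
+   not of four, so columns at odd multiples of LDA sit 8 bytes off the
+   16-byte boundary; xmm12 keeps the previous aligned quad of that column
+   and the loop below rebuilds each needed vector with shufps $0x4e
+   (high half of the old load, low half of the new one) before the
+   multiply */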
MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) + + movq MM, I + sarq $4, I + jle .L115 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A2 (-30 * SIZE, A1, LDA, 1, %xmm1) + MOVUPS_A1 (-32 * SIZE, A2, %xmm2) + + decq I + jle .L113 + ALIGN_4 + +.L112: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A1, %xmm0) + shufps $0x4e, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-26 * SIZE, A1, LDA, 1, %xmm12) + mulps %xmm4, %xmm2 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm2, %xmm10 + MOVUPS_A1 (-28 * SIZE, A2, %xmm2) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1, LDA) +#endif + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + shufps $0x4e, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-22 * SIZE, A1, LDA, 1, %xmm1) + mulps %xmm5, %xmm2 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm2, %xmm10 + MOVUPS_A1 (-24 * SIZE, A2, %xmm2) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A2) +#endif + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-20 * SIZE, A1, %xmm0) + shufps $0x4e, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-18 * SIZE, A1, LDA, 1, %xmm12) + mulps %xmm4, %xmm2 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm2, %xmm10 + MOVUPS_A1 (-20 * SIZE, A2, %xmm2) + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(X1) +#endif + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-16 * SIZE, A1, %xmm0) + shufps $0x4e, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-14 * SIZE, A1, LDA, 1, %xmm1) + mulps %xmm5, %xmm2 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm2, %xmm10 + MOVUPS_A1 (-16 * SIZE, A2, %xmm2) + + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + addq $16 * SIZE, X1 + + decq I + jg .L112 + ALIGN_4 + +.L113: + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A1, %xmm0) + shufps $0x4e, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-26 * SIZE, A1, LDA, 1, %xmm12) + mulps %xmm4, %xmm2 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm2, %xmm10 + MOVUPS_A1 (-28 * SIZE, A2, %xmm2) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + shufps $0x4e, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-22 * SIZE, A1, LDA, 1, %xmm1) + mulps %xmm5, %xmm2 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm2, %xmm10 + MOVUPS_A1 (-24 * SIZE, A2, %xmm2) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-20 * SIZE, A1, %xmm0) + shufps $0x4e, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-18 * SIZE, A1, LDA, 1, %xmm12) + mulps %xmm4, %xmm2 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm2, %xmm10 + MOVUPS_A1 (-20 * SIZE, A2, %xmm2) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + shufps $0x4e, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + mulps %xmm5, %xmm2 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm2, %xmm10 + + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + addq $16 * SIZE, X1 + ALIGN_4 + +.L115: + testq $8, MM + jle .L116 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A2 (-30 * SIZE, A1, LDA, 1, %xmm1) + MOVUPS_A1 (-32 * SIZE, A2, %xmm2) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A1, %xmm0) + shufps $0x4e, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-26 * SIZE, A1, LDA, 1, %xmm12) + mulps %xmm4, %xmm2 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps 
%xmm2, %xmm10 + MOVUPS_A1 (-28 * SIZE, A2, %xmm2) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + shufps $0x4e, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + ALIGN_4 + +.L116: + testq $4, MM + jle .L117 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A2 (-30 * SIZE, A1, LDA, 1, %xmm1) + shufps $0x4e, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + + MOVUPS_A1 (-32 * SIZE, A2, %xmm2) + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, X1 + ALIGN_4 + +.L117: + testq $2, MM + jle .L118 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd -32 * SIZE(A1), %xmm0 +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(A1, LDA), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd -32 * SIZE(A2), %xmm2 + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_4 + +.L118: + testq $1, MM + jle .L119 + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + movss -32 * SIZE(A1, LDA), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm9 + movss -32 * SIZE(A2), %xmm2 + mulss %xmm4, %xmm2 + addss %xmm2, %xmm10 + ALIGN_4 + +.L119: +#ifdef HAVE_SSE3 + haddps %xmm9, %xmm8 + haddps %xmm11, %xmm10 + haddps %xmm10, %xmm8 + + pshufd $0x1, %xmm8, %xmm9 + pshufd $0x2, %xmm8, %xmm10 +#else + movaps %xmm8, %xmm0 + unpcklps %xmm9, %xmm8 + unpckhps %xmm9, %xmm0 + + movaps %xmm10, %xmm1 + unpcklps %xmm11, %xmm10 + unpckhps %xmm11, %xmm1 + + movaps %xmm8, %xmm9 + unpcklps %xmm10, %xmm8 + unpckhps %xmm10, %xmm9 + + movaps %xmm0, %xmm10 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm10 + + addps %xmm9, %xmm8 + addps %xmm0, %xmm10 + addps %xmm10, %xmm8 + + pshufd $0x2, %xmm8, %xmm9 + pshufd $0x1, %xmm8, %xmm10 +#endif + + mulss ALPHA, %xmm8 + mulss ALPHA, %xmm9 + mulss ALPHA, %xmm10 + + addss (Y), %xmm8 + addq INCY, Y + addss (Y), %xmm9 + addq INCY, Y + addss (Y), %xmm10 + + movss %xmm8, (Y1) + addq INCY, Y1 + movss %xmm9, (Y1) + addq INCY, Y1 + movss %xmm10, (Y1) + jmp .L999 + ALIGN_4 + +.L120: + cmpq $2, N + jne .L130 + + leaq 32 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA), A2 + leaq (A1, LDA, 2), A + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + + cmpq $3, M + jle .L127 + + testq $SIZE, A1 + je .L12X + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + movss -32 * SIZE(A2), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm9 + + addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, X1 + ALIGN_3 + +.L12X: + testq $2 * SIZE, A1 + je .L12XX + +#ifdef movsd + xorps %xmm0, %xmm0 + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(A2), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_3 + +.L12XX: + MOVUPS_A1 (-34 * SIZE, A2, %xmm12) + + MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) + MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) + + movq MM, I + sarq $4, I + jle .L125 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A1 (-30 * SIZE, A2, %xmm1) + + decq I + jle .L123 + ALIGN_4 + +.L122: +#ifdef 
PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A1, %xmm0) + shufps $0x4e, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm12, %xmm9 + MOVUPS_A1 (-26 * SIZE, A2, %xmm12) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + shufps $0x4e, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm1, %xmm9 + MOVUPS_A1 (-22 * SIZE, A2, %xmm1) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A2) +#endif + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-20 * SIZE, A1, %xmm0) + shufps $0x4e, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm12, %xmm9 + MOVUPS_A1 (-18 * SIZE, A2, %xmm12) + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1) +#endif + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-16 * SIZE, A1, %xmm0) + shufps $0x4e, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm1, %xmm9 + MOVUPS_A1 (-14 * SIZE, A2, %xmm1) + + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + addq $16 * SIZE, X1 + + decq I + jg .L122 + ALIGN_4 + +.L123: + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A1, %xmm0) + shufps $0x4e, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm12, %xmm9 + MOVUPS_A1 (-26 * SIZE, A2, %xmm12) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + shufps $0x4e, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm1, %xmm9 + MOVUPS_A1 (-22 * SIZE, A2, %xmm1) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-20 * SIZE, A1, %xmm0) + shufps $0x4e, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm12, %xmm9 + MOVUPS_A1 (-18 * SIZE, A2, %xmm12) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + shufps $0x4e, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm1, %xmm9 + + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + addq $16 * SIZE, X1 + ALIGN_4 + +.L125: + testq $8, MM + jle .L126 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A1 (-30 * SIZE, A2, %xmm1) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A1, %xmm0) + shufps $0x4e, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm12, %xmm9 + MOVUPS_A1 (-26 * SIZE, A2, %xmm12) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + shufps $0x4e, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + ALIGN_4 + +.L126: + testq $4, MM + jle .L127 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-30 * SIZE, A2, %xmm1) + shufps $0x4e, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, X1 + ALIGN_4 + +.L127: + testq $2, MM + jle .L128 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd -32 * SIZE(A1), %xmm0 +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(A2), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 + shufps $0xe, %xmm4, %xmm4 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_4 + +.L128: + testq $1, MM + jle .L129 + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + 
addss %xmm0, %xmm8 + movss -32 * SIZE(A2), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm9 + ALIGN_4 + +.L129: +#ifdef HAVE_SSE3 + haddps %xmm9, %xmm8 + haddps %xmm8, %xmm8 +#else + movaps %xmm8, %xmm10 + unpcklps %xmm9, %xmm8 + unpckhps %xmm9, %xmm10 + + addps %xmm10, %xmm8 + movhlps %xmm8, %xmm9 + addps %xmm9, %xmm8 +#endif + + pshufd $0x1, %xmm8, %xmm9 + + mulss ALPHA, %xmm8 + mulss ALPHA, %xmm9 + + addss (Y), %xmm8 + addq INCY, Y + addss (Y), %xmm9 + addq INCY, Y + + movss %xmm8, (Y1) + addq INCY, Y1 + movss %xmm9, (Y1) + addq INCY, Y1 + jmp .L999 + ALIGN_4 + +.L130: + cmpq $1, N + jne .L999 + + leaq 32 * SIZE(BUFFER), X1 + + movq A, A1 + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + + cmpq $3, M + jle .L137 + + testq $SIZE, A1 + je .L13X + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + + addq $1 * SIZE, A1 + addq $1 * SIZE, X1 + ALIGN_3 + +.L13X: + testq $2 * SIZE, A1 + je .L13XX + + +#ifdef movsd + xorps %xmm0, %xmm0 + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + shufps $0xe, %xmm4, %xmm4 + + addq $2 * SIZE, A1 + addq $2 * SIZE, X1 + ALIGN_3 + +.L13XX: + + MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) + MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) + + movq MM, I + sarq $4, I + jle .L135 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A1 (-28 * SIZE, A1, %xmm12) + + decq I + jle .L133 + ALIGN_4 + +.L132: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm4, %xmm0 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + + mulps %xmm5, %xmm12 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm12, %xmm9 + MOVUPS_A1 (-20 * SIZE, A1, %xmm12) + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(X1) +#endif + + mulps %xmm4, %xmm0 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm0, %xmm8 + MOVUPS_A1 (-16 * SIZE, A1, %xmm0) + + mulps %xmm5, %xmm12 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm12, %xmm9 + MOVUPS_A1 (-12 * SIZE, A1, %xmm12) + + addq $16 * SIZE, A1 + addq $16 * SIZE, X1 + + decq I + jg .L132 + ALIGN_4 + +.L133: + mulps %xmm4, %xmm0 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + + mulps %xmm5, %xmm12 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm12, %xmm9 + MOVUPS_A1 (-20 * SIZE, A1, %xmm12) + + mulps %xmm4, %xmm0 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm0, %xmm8 + + mulps %xmm5, %xmm12 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm12, %xmm9 + + addq $16 * SIZE, A1 + addq $16 * SIZE, X1 + ALIGN_4 + +.L135: + testq $8, MM + jle .L136 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm0 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm0, %xmm8 + + MOVUPS_A1 (-28 * SIZE, A1, %xmm12) + mulps %xmm5, %xmm12 + addps %xmm12, %xmm9 + + addq $8 * SIZE, A1 + addq $8 * SIZE, X1 + ALIGN_4 + +.L136: + testq $4, MM + jle .L137 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + + addq $4 * SIZE, A1 + addq $4 * SIZE, X1 + ALIGN_4 + +.L137: + testq $2, MM + jle .L138 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd -32 * SIZE(A1), %xmm0 +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + shufps $0xe, %xmm4, %xmm4 + + addq $2 * SIZE, A1 + addq $2 * SIZE, X1 + ALIGN_4 + +.L138: + testq $1, MM + jle .L139 + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + ALIGN_4 + 
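+/* .L139: tail of the single-column case: combine the two partial sums
+   (xmm8 += xmm9), reduce the four lanes horizontally (haddps with SSE3,
+   pshufd/addss otherwise), scale by ALPHA and accumulate into y.
+   The .L200 section that follows handles leading dimensions that are an
+   odd number of floats: successive columns fall at different offsets
+   within a 16-byte block, and each vector is rebuilt from aligned loads
+   with movss and shufps rotations ($0x39, $0x93) before the multiplies,
+   mirroring the shufps $0x4e scheme used above. */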
+.L139: + addps %xmm9, %xmm8 + +#ifdef HAVE_SSE3 + haddps %xmm8, %xmm8 + haddps %xmm8, %xmm8 +#else + pshufd $1, %xmm8, %xmm9 + pshufd $2, %xmm8, %xmm10 + pshufd $3, %xmm8, %xmm11 + + addss %xmm9, %xmm8 + addss %xmm11, %xmm10 + addss %xmm10, %xmm8 +#endif + + mulss ALPHA, %xmm8 + + addss (Y), %xmm8 + movss %xmm8, (Y1) + jmp .L999 + ALIGN_4 + +.L200: + testq $2 * SIZE, LDA + jne .L300 + + cmpq $4, N + jl .L210 + ALIGN_3 + +.L201: + subq $4, N + + leaq 32 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA, 2), A2 + leaq (A1, LDA, 4), A + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + + cmpq $3, M + jle .L207 + + testq $SIZE, A1 + je .L20X + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + movss -32 * SIZE(A1, LDA), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm9 + movss -32 * SIZE(A2), %xmm2 + mulss %xmm4, %xmm2 + addss %xmm2, %xmm10 + movss -32 * SIZE(A2, LDA), %xmm3 + mulss %xmm4, %xmm3 + addss %xmm3, %xmm11 + + addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, X1 + ALIGN_3 + +.L20X: + testq $2 * SIZE, A1 + je .L20XX + +#ifdef movsd + xorps %xmm0, %xmm0 + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(A1, LDA), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd -32 * SIZE(A2), %xmm2 + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd -32 * SIZE(A2, LDA), %xmm3 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm11 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_3 + +.L20XX: + movaps -33 * SIZE(A1, LDA), %xmm12 + movaps -34 * SIZE(A2), %xmm13 + movaps -35 * SIZE(A2, LDA), %xmm14 + + MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) + MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) + +#ifdef PREFETCHW + PREFETCHW 4 * SIZE(Y1) +#endif + + movq MM, I + sarq $4, I + jle .L205 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A2 (-29 * SIZE, A1, LDA, 1, %xmm1) + MOVUPS_A1 (-30 * SIZE, A2, %xmm2) + MOVUPS_A2 (-31 * SIZE, A2, LDA, 1, %xmm3) + + decq I + jle .L203 + ALIGN_4 + +.L202: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A1, %xmm0) + movss %xmm1, %xmm12 + shufps $0x39, %xmm12, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-25 * SIZE, A1, LDA, 1, %xmm12) + + shufps $0x4e, %xmm2, %xmm13 + mulps %xmm4, %xmm13 + addps %xmm13, %xmm10 + MOVUPS_A1 (-26 * SIZE, A2, %xmm13) + movss %xmm3, %xmm14 + shufps $0x93, %xmm3, %xmm14 + mulps %xmm4, %xmm14 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm14, %xmm11 + MOVUPS_A2 (-27 * SIZE, A2, LDA, 1, %xmm14) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1, LDA) +#endif + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + movss %xmm12, %xmm1 + shufps $0x39, %xmm1, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-21 * SIZE, A1, LDA, 1, %xmm1) + + shufps $0x4e, %xmm13, %xmm2 + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A1 (-22 * SIZE, A2, %xmm2) + movss %xmm14, %xmm3 + shufps $0x93, %xmm14, %xmm3 + mulps %xmm5, %xmm3 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm3, %xmm11 + MOVUPS_A2 (-23 * SIZE, A2, LDA, 1, %xmm3) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) +#endif + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-20 * 
SIZE, A1, %xmm0) + movss %xmm1, %xmm12 + shufps $0x39, %xmm12, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-17 * SIZE, A1, LDA, 1, %xmm12) + + shufps $0x4e, %xmm2, %xmm13 + mulps %xmm4, %xmm13 + addps %xmm13, %xmm10 + MOVUPS_A1 (-18 * SIZE, A2, %xmm13) + movss %xmm3, %xmm14 + shufps $0x93, %xmm3, %xmm14 + mulps %xmm4, %xmm14 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm14, %xmm11 + MOVUPS_A2 (-19 * SIZE, A2, LDA, 1, %xmm14) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2, LDA) +#endif + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-16 * SIZE, A1, %xmm0) + movss %xmm12, %xmm1 + shufps $0x39, %xmm1, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-13 * SIZE, A1, LDA, 1, %xmm1) + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1) +#endif + + shufps $0x4e, %xmm13, %xmm2 + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A1 (-14 * SIZE, A2, %xmm2) + movss %xmm14, %xmm3 + shufps $0x93, %xmm14, %xmm3 + mulps %xmm5, %xmm3 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm3, %xmm11 + MOVUPS_A2 (-15 * SIZE, A2, LDA, 1, %xmm3) + + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + addq $16 * SIZE, X1 + + decq I + jg .L202 + ALIGN_4 + +.L203: + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A1, %xmm0) + movss %xmm1, %xmm12 + shufps $0x39, %xmm12, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-25 * SIZE, A1, LDA, 1, %xmm12) + + shufps $0x4e, %xmm2, %xmm13 + mulps %xmm4, %xmm13 + addps %xmm13, %xmm10 + MOVUPS_A1 (-26 * SIZE, A2, %xmm13) + movss %xmm3, %xmm14 + shufps $0x93, %xmm3, %xmm14 + mulps %xmm4, %xmm14 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm14, %xmm11 + MOVUPS_A2 (-27 * SIZE, A2, LDA, 1, %xmm14) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + movss %xmm12, %xmm1 + shufps $0x39, %xmm1, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-21 * SIZE, A1, LDA, 1, %xmm1) + + shufps $0x4e, %xmm13, %xmm2 + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A1 (-22 * SIZE, A2, %xmm2) + movss %xmm14, %xmm3 + shufps $0x93, %xmm14, %xmm3 + mulps %xmm5, %xmm3 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm3, %xmm11 + MOVUPS_A2 (-23 * SIZE, A2, LDA, 1, %xmm3) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-20 * SIZE, A1, %xmm0) + movss %xmm1, %xmm12 + shufps $0x39, %xmm12, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-17 * SIZE, A1, LDA, 1, %xmm12) + + shufps $0x4e, %xmm2, %xmm13 + mulps %xmm4, %xmm13 + addps %xmm13, %xmm10 + MOVUPS_A1 (-18 * SIZE, A2, %xmm13) + movss %xmm3, %xmm14 + shufps $0x93, %xmm3, %xmm14 + mulps %xmm4, %xmm14 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm14, %xmm11 + MOVUPS_A2 (-19 * SIZE, A2, LDA, 1, %xmm14) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + movss %xmm12, %xmm1 + shufps $0x39, %xmm1, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + + shufps $0x4e, %xmm13, %xmm2 + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + movss %xmm14, %xmm3 + shufps $0x93, %xmm14, %xmm3 + mulps %xmm5, %xmm3 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm3, %xmm11 + + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + addq $16 * SIZE, X1 + ALIGN_4 + +.L205: + testq $8, MM + jle .L206 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A2 (-29 * SIZE, A1, LDA, 1, %xmm1) + MOVUPS_A1 (-30 * SIZE, A2, %xmm2) + MOVUPS_A2 (-31 * SIZE, A2, LDA, 1, %xmm3) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A1, %xmm0) + movss %xmm1, %xmm12 + shufps $0x39, %xmm12, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, 
%xmm9 + MOVUPS_A2 (-25 * SIZE, A1, LDA, 1, %xmm12) + + shufps $0x4e, %xmm2, %xmm13 + mulps %xmm4, %xmm13 + addps %xmm13, %xmm10 + MOVUPS_A1 (-26 * SIZE, A2, %xmm13) + movss %xmm3, %xmm14 + shufps $0x93, %xmm3, %xmm14 + mulps %xmm4, %xmm14 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm14, %xmm11 + MOVUPS_A2 (-27 * SIZE, A2, LDA, 1, %xmm14) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + movss %xmm12, %xmm1 + shufps $0x39, %xmm1, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + + shufps $0x4e, %xmm13, %xmm2 + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + movss %xmm14, %xmm3 + shufps $0x93, %xmm14, %xmm3 + mulps %xmm5, %xmm3 + addps %xmm3, %xmm11 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + ALIGN_4 + +.L206: + testq $4, MM + jle .L207 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A2 (-29 * SIZE, A1, LDA, 1, %xmm1) + MOVUPS_A1 (-30 * SIZE, A2, %xmm2) + MOVUPS_A2 (-31 * SIZE, A2, LDA, 1, %xmm3) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + movss %xmm1, %xmm12 + shufps $0x39, %xmm12, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + + shufps $0x4e, %xmm2, %xmm13 + mulps %xmm4, %xmm13 + addps %xmm13, %xmm10 + movss %xmm3, %xmm14 + shufps $0x93, %xmm3, %xmm14 + mulps %xmm4, %xmm14 + addps %xmm14, %xmm11 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, X1 + ALIGN_4 + +.L207: + testq $2, MM + jle .L208 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd -32 * SIZE(A1), %xmm0 +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(A1, LDA), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd -32 * SIZE(A2), %xmm2 + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd -32 * SIZE(A2, LDA), %xmm3 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm11 + shufps $0xe, %xmm4, %xmm4 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_4 + +.L208: + testq $1, MM + jle .L209 + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + movss -32 * SIZE(A1, LDA), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm9 + movss -32 * SIZE(A2), %xmm2 + mulss %xmm4, %xmm2 + addss %xmm2, %xmm10 + movss -32 * SIZE(A2, LDA), %xmm3 + mulss %xmm4, %xmm3 + addss %xmm3, %xmm11 + ALIGN_4 + +.L209: +#ifdef HAVE_SSE3 + haddps %xmm9, %xmm8 + haddps %xmm11, %xmm10 + haddps %xmm10, %xmm8 + + pshufd $0x1, %xmm8, %xmm9 + pshufd $0x2, %xmm8, %xmm10 + pshufd $0x3, %xmm8, %xmm11 +#else + movaps %xmm8, %xmm0 + unpcklps %xmm9, %xmm8 + unpckhps %xmm9, %xmm0 + + movaps %xmm10, %xmm1 + unpcklps %xmm11, %xmm10 + unpckhps %xmm11, %xmm1 + + movaps %xmm8, %xmm9 + unpcklps %xmm10, %xmm8 + unpckhps %xmm10, %xmm9 + + movaps %xmm0, %xmm10 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm10 + + addps %xmm9, %xmm8 + addps %xmm0, %xmm10 + addps %xmm10, %xmm8 + + pshufd $0x2, %xmm8, %xmm9 + pshufd $0x1, %xmm8, %xmm10 + pshufd $0x3, %xmm8, %xmm11 +#endif + + mulss ALPHA, %xmm8 + mulss ALPHA, %xmm9 + mulss ALPHA, %xmm10 + mulss ALPHA, %xmm11 + + addss (Y), %xmm8 + addq INCY, Y + addss (Y), %xmm9 + addq INCY, Y + addss (Y), %xmm10 + addq INCY, Y + addss (Y), %xmm11 + addq INCY, Y + + movss %xmm8, (Y1) + addq INCY, Y1 + movss %xmm9, (Y1) + addq INCY, Y1 + movss %xmm10, (Y1) + addq INCY, Y1 + movss %xmm11, (Y1) + addq INCY, Y1 + + cmpq $4, N + jge .L201 + ALIGN_4 + +.L210: + cmpq $3, N + jne .L220 + + leaq 32 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA, 2), A2 + leaq 
(A1, LDA, 4), A + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + + cmpq $3, M + jle .L217 + + testq $SIZE, A1 + je .L21X + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + movss -32 * SIZE(A1, LDA), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm9 + movss -32 * SIZE(A2), %xmm2 + mulss %xmm4, %xmm2 + addss %xmm2, %xmm10 + + addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, X1 + ALIGN_3 + +.L21X: + testq $2 * SIZE, A1 + je .L21XX + +#ifdef movsd + xorps %xmm0, %xmm0 + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(A1, LDA), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd -32 * SIZE(A2), %xmm2 + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_3 + +.L21XX: + movaps -33 * SIZE(A1, LDA), %xmm12 + movaps -34 * SIZE(A2), %xmm13 + + MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) + MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) + +#ifdef PREFETCHW + PREFETCHW 4 * SIZE(Y1) +#endif + + movq MM, I + sarq $4, I + jle .L215 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A2 (-29 * SIZE, A1, LDA, 1, %xmm1) + MOVUPS_A1 (-30 * SIZE, A2, %xmm2) + + decq I + jle .L213 + ALIGN_4 + +.L212: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A1, %xmm0) + movss %xmm1, %xmm12 + shufps $0x39, %xmm12, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-25 * SIZE, A1, LDA, 1, %xmm12) + + shufps $0x4e, %xmm2, %xmm13 + mulps %xmm4, %xmm13 + addps %xmm13, %xmm10 + MOVUPS_A1 (-26 * SIZE, A2, %xmm13) + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1, LDA) +#endif + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + movss %xmm12, %xmm1 + shufps $0x39, %xmm1, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-21 * SIZE, A1, LDA, 1, %xmm1) + + shufps $0x4e, %xmm13, %xmm2 + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A1 (-22 * SIZE, A2, %xmm2) + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A2) +#endif + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-20 * SIZE, A1, %xmm0) + movss %xmm1, %xmm12 + shufps $0x39, %xmm12, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-17 * SIZE, A1, LDA, 1, %xmm12) + shufps $0x4e, %xmm2, %xmm13 + mulps %xmm4, %xmm13 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm13, %xmm10 + MOVUPS_A1 (-18 * SIZE, A2, %xmm13) + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(X1) +#endif + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-16 * SIZE, A1, %xmm0) + movss %xmm12, %xmm1 + shufps $0x39, %xmm1, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-13 * SIZE, A1, LDA, 1, %xmm1) + shufps $0x4e, %xmm13, %xmm2 + mulps %xmm5, %xmm2 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm2, %xmm10 + MOVUPS_A1 (-14 * SIZE, A2, %xmm2) + + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + addq $16 * SIZE, X1 + + decq I + jg .L212 + ALIGN_4 + +.L213: + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A1, %xmm0) + movss %xmm1, %xmm12 + shufps $0x39, %xmm12, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-25 * SIZE, A1, LDA, 1, %xmm12) + + shufps $0x4e, %xmm2, 
%xmm13 + mulps %xmm4, %xmm13 + addps %xmm13, %xmm10 + MOVUPS_A1 (-26 * SIZE, A2, %xmm13) + movss %xmm3, %xmm14 + shufps $0x93, %xmm3, %xmm14 + mulps %xmm4, %xmm14 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm14, %xmm11 + MOVUPS_A2 (-27 * SIZE, A2, LDA, 1, %xmm14) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + movss %xmm12, %xmm1 + shufps $0x39, %xmm1, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-21 * SIZE, A1, LDA, 1, %xmm1) + + shufps $0x4e, %xmm13, %xmm2 + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A1 (-22 * SIZE, A2, %xmm2) + movss %xmm14, %xmm3 + shufps $0x93, %xmm14, %xmm3 + mulps %xmm5, %xmm3 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm3, %xmm11 + MOVUPS_A2 (-23 * SIZE, A2, LDA, 1, %xmm3) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-20 * SIZE, A1, %xmm0) + movss %xmm1, %xmm12 + shufps $0x39, %xmm12, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-17 * SIZE, A1, LDA, 1, %xmm12) + + shufps $0x4e, %xmm2, %xmm13 + mulps %xmm4, %xmm13 + addps %xmm13, %xmm10 + MOVUPS_A1 (-18 * SIZE, A2, %xmm13) + movss %xmm3, %xmm14 + shufps $0x93, %xmm3, %xmm14 + mulps %xmm4, %xmm14 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm14, %xmm11 + MOVUPS_A2 (-19 * SIZE, A2, LDA, 1, %xmm14) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + movss %xmm12, %xmm1 + shufps $0x39, %xmm1, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + + shufps $0x4e, %xmm13, %xmm2 + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + movss %xmm14, %xmm3 + shufps $0x93, %xmm14, %xmm3 + mulps %xmm5, %xmm3 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm3, %xmm11 + + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + addq $16 * SIZE, X1 + ALIGN_4 + +.L215: + testq $8, MM + jle .L216 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A2 (-29 * SIZE, A1, LDA, 1, %xmm1) + MOVUPS_A1 (-30 * SIZE, A2, %xmm2) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A1, %xmm0) + movss %xmm1, %xmm12 + shufps $0x39, %xmm12, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-25 * SIZE, A1, LDA, 1, %xmm12) + + shufps $0x4e, %xmm2, %xmm13 + mulps %xmm4, %xmm13 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm13, %xmm10 + MOVUPS_A1 (-26 * SIZE, A2, %xmm13) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + movss %xmm12, %xmm1 + shufps $0x39, %xmm1, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + + shufps $0x4e, %xmm13, %xmm2 + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + ALIGN_4 + +.L216: + testq $4, MM + jle .L217 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A2 (-29 * SIZE, A1, LDA, 1, %xmm1) + MOVUPS_A1 (-30 * SIZE, A2, %xmm2) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + movss %xmm1, %xmm12 + shufps $0x39, %xmm12, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + + shufps $0x4e, %xmm2, %xmm13 + mulps %xmm4, %xmm13 + addps %xmm13, %xmm10 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, X1 + ALIGN_4 + +.L217: + testq $2, MM + jle .L218 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd -32 * SIZE(A1), %xmm0 +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(A1, LDA), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd -32 * SIZE(A2), %xmm2 + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_4 + +.L218: + testq $1, MM + jle .L219 + + 
movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + movss -32 * SIZE(A1, LDA), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm9 + movss -32 * SIZE(A2), %xmm2 + mulss %xmm4, %xmm2 + addss %xmm2, %xmm10 + ALIGN_4 + +.L219: +#ifdef HAVE_SSE3 + haddps %xmm9, %xmm8 + haddps %xmm11, %xmm10 + haddps %xmm10, %xmm8 + + pshufd $0x1, %xmm8, %xmm9 + pshufd $0x2, %xmm8, %xmm10 +#else + movaps %xmm8, %xmm0 + unpcklps %xmm9, %xmm8 + unpckhps %xmm9, %xmm0 + + movaps %xmm10, %xmm1 + unpcklps %xmm11, %xmm10 + unpckhps %xmm11, %xmm1 + + movaps %xmm8, %xmm9 + unpcklps %xmm10, %xmm8 + unpckhps %xmm10, %xmm9 + + movaps %xmm0, %xmm10 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm10 + + addps %xmm9, %xmm8 + addps %xmm0, %xmm10 + addps %xmm10, %xmm8 + + pshufd $0x2, %xmm8, %xmm9 + pshufd $0x1, %xmm8, %xmm10 +#endif + + mulss ALPHA, %xmm8 + mulss ALPHA, %xmm9 + mulss ALPHA, %xmm10 + + addss (Y), %xmm8 + addq INCY, Y + addss (Y), %xmm9 + addq INCY, Y + addss (Y), %xmm10 + + movss %xmm8, (Y1) + addq INCY, Y1 + movss %xmm9, (Y1) + addq INCY, Y1 + movss %xmm10, (Y1) + jmp .L999 + ALIGN_4 + +.L220: + testq N, N + jle .L999 + + cmpq $2, N + jne .L230 + + leaq 32 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA), A2 + leaq (A1, LDA, 2), A + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + + cmpq $3, M + jle .L227 + + testq $SIZE, A1 + je .L22X + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + movss -32 * SIZE(A2), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm9 + + addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, X1 + ALIGN_3 + +.L22X: + testq $2 * SIZE, A1 + je .L22XX + +#ifdef movsd + xorps %xmm0, %xmm0 + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(A2), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_3 + +.L22XX: + movaps -33 * SIZE(A2), %xmm12 + + MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) + MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) + + movq MM, I + sarq $4, I + jle .L225 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A1 (-29 * SIZE, A2, %xmm1) + + decq I + jle .L223 + ALIGN_4 + +.L222: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A1, %xmm2) + movss %xmm1, %xmm12 + shufps $0x39, %xmm12, %xmm12 + mulps %xmm4, %xmm12 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm12, %xmm9 + MOVUPS_A1 (-25 * SIZE, A2, %xmm12) + + mulps %xmm5, %xmm2 + addps %xmm2, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + movss %xmm12, %xmm1 + shufps $0x39, %xmm1, %xmm1 + mulps %xmm5, %xmm1 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm1, %xmm9 + MOVUPS_A1 (-21 * SIZE, A2, %xmm1) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A2) +#endif + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-20 * SIZE, A1, %xmm2) + movss %xmm1, %xmm12 + shufps $0x39, %xmm12, %xmm12 + mulps %xmm4, %xmm12 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm12, %xmm9 + MOVUPS_A1 (-17 * SIZE, A2, %xmm12) + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1) +#endif + + mulps %xmm5, %xmm2 + addps %xmm2, %xmm8 + MOVUPS_A1 (-16 * SIZE, A1, %xmm0) + movss %xmm12, %xmm1 + shufps $0x39, %xmm1, %xmm1 + mulps %xmm5, %xmm1 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm1, %xmm9 + MOVUPS_A1 (-13 * SIZE, A2, %xmm1) + + addq $16 * 
SIZE, A1 + addq $16 * SIZE, A2 + addq $16 * SIZE, X1 + + decq I + jg .L222 + ALIGN_4 + +.L223: + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A1, %xmm2) + movss %xmm1, %xmm12 + shufps $0x39, %xmm12, %xmm12 + mulps %xmm4, %xmm12 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm12, %xmm9 + MOVUPS_A1 (-25 * SIZE, A2, %xmm12) + + mulps %xmm5, %xmm2 + addps %xmm2, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + movss %xmm12, %xmm1 + shufps $0x39, %xmm1, %xmm1 + mulps %xmm5, %xmm1 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm1, %xmm9 + MOVUPS_A1 (-21 * SIZE, A2, %xmm1) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-20 * SIZE, A1, %xmm2) + movss %xmm1, %xmm12 + shufps $0x39, %xmm12, %xmm12 + mulps %xmm4, %xmm12 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm12, %xmm9 + MOVUPS_A1 (-17 * SIZE, A2, %xmm12) + + mulps %xmm5, %xmm2 + addps %xmm2, %xmm8 + movss %xmm12, %xmm1 + shufps $0x39, %xmm1, %xmm1 + mulps %xmm5, %xmm1 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm1, %xmm9 + + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + addq $16 * SIZE, X1 + ALIGN_4 + +.L225: + testq $8, MM + jle .L226 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A1 (-29 * SIZE, A2, %xmm1) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A1, %xmm2) + movss %xmm1, %xmm12 + shufps $0x39, %xmm12, %xmm12 + mulps %xmm4, %xmm12 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm12, %xmm9 + MOVUPS_A1 (-25 * SIZE, A2, %xmm12) + + mulps %xmm5, %xmm2 + addps %xmm2, %xmm8 + movss %xmm12, %xmm1 + shufps $0x39, %xmm1, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + ALIGN_4 + +.L226: + testq $4, MM + jle .L227 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A1 (-29 * SIZE, A2, %xmm1) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + + movss %xmm1, %xmm12 + shufps $0x39, %xmm12, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, X1 + ALIGN_4 + +.L227: + testq $2, MM + jle .L228 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd -32 * SIZE(A1), %xmm0 +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(A2), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 + shufps $0xe, %xmm4, %xmm4 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_4 + +.L228: + testq $1, MM + jle .L229 + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + movss -32 * SIZE(A2), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm9 + ALIGN_4 + +.L229: +#ifdef HAVE_SSE3 + haddps %xmm9, %xmm8 + haddps %xmm8, %xmm8 +#else + movaps %xmm8, %xmm10 + unpcklps %xmm9, %xmm8 + unpckhps %xmm9, %xmm10 + + addps %xmm10, %xmm8 + movhlps %xmm8, %xmm9 + addps %xmm9, %xmm8 +#endif + + pshufd $0x1, %xmm8, %xmm9 + + mulss ALPHA, %xmm8 + mulss ALPHA, %xmm9 + + addss (Y), %xmm8 + addq INCY, Y + addss (Y), %xmm9 + addq INCY, Y + + movss %xmm8, (Y1) + addq INCY, Y1 + movss %xmm9, (Y1) + addq INCY, Y1 + jmp .L999 + ALIGN_4 + +.L230: + cmpq $1, N + jne .L999 + + leaq 32 * SIZE(BUFFER), X1 + + movq A, A1 + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + + cmpq $3, M + jle .L237 + + testq $SIZE, A1 + je .L23X + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + + addq $1 * SIZE, A1 + addq $1 * SIZE, X1 + ALIGN_3 + +.L23X: + testq $2 * SIZE, A1 + je .L23XX + +#ifdef movsd + xorps %xmm0, %xmm0 + 
xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + shufps $0xe, %xmm4, %xmm4 + + addq $2 * SIZE, A1 + addq $2 * SIZE, X1 + ALIGN_3 + +.L23XX: + MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) + MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) + + + movq MM, I + sarq $4, I + jle .L235 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A1 (-28 * SIZE, A1, %xmm12) + + decq I + jle .L233 + ALIGN_4 + +.L232: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm4, %xmm0 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + + mulps %xmm5, %xmm12 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm12, %xmm9 + MOVUPS_A1 (-20 * SIZE, A1, %xmm12) + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(X1) +#endif + + mulps %xmm4, %xmm0 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm0, %xmm8 + MOVUPS_A1 (-16 * SIZE, A1, %xmm0) + + mulps %xmm5, %xmm12 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm12, %xmm9 + MOVUPS_A1 (-12 * SIZE, A1, %xmm12) + + addq $16 * SIZE, A1 + addq $16 * SIZE, X1 + + decq I + jg .L232 + ALIGN_4 + +.L233: + mulps %xmm4, %xmm0 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + + mulps %xmm5, %xmm12 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm12, %xmm9 + MOVUPS_A1 (-20 * SIZE, A1, %xmm12) + + mulps %xmm4, %xmm0 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm0, %xmm8 + + mulps %xmm5, %xmm12 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm12, %xmm9 + + addq $16 * SIZE, A1 + addq $16 * SIZE, X1 + ALIGN_4 + +.L235: + testq $8, MM + jle .L236 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm0 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm0, %xmm8 + + MOVUPS_A1 (-28 * SIZE, A1, %xmm12) + mulps %xmm5, %xmm12 + addps %xmm12, %xmm9 + + addq $8 * SIZE, A1 + addq $8 * SIZE, X1 + ALIGN_4 + +.L236: + testq $4, MM + jle .L237 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + + addq $4 * SIZE, A1 + addq $4 * SIZE, X1 + ALIGN_4 + +.L237: + testq $2, MM + jle .L238 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd -32 * SIZE(A1), %xmm0 +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + shufps $0xe, %xmm4, %xmm4 + + addq $2 * SIZE, A1 + addq $2 * SIZE, X1 + ALIGN_4 + +.L238: + testq $1, MM + jle .L239 + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + ALIGN_4 + +.L239: + addps %xmm9, %xmm8 + +#ifdef HAVE_SSE3 + haddps %xmm8, %xmm8 + haddps %xmm8, %xmm8 +#else + pshufd $1, %xmm8, %xmm9 + pshufd $2, %xmm8, %xmm10 + pshufd $3, %xmm8, %xmm11 + + addss %xmm9, %xmm8 + addss %xmm11, %xmm10 + addss %xmm10, %xmm8 +#endif + + mulss ALPHA, %xmm8 + + addss (Y), %xmm8 + movss %xmm8, (Y1) + jmp .L999 + ALIGN_4 + +.L300: + cmpq $4, N + jl .L310 + ALIGN_3 + +.L301: + subq $4, N + + leaq 32 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA, 2), A2 + leaq (A1, LDA, 4), A + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + + cmpq $3, M + jle .L307 + + testq $SIZE, A1 + je .L30X + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + movss -32 * SIZE(A1, LDA), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm9 + movss -32 * SIZE(A2), %xmm2 + mulss %xmm4, %xmm2 + addss %xmm2, %xmm10 + movss -32 * SIZE(A2, LDA), %xmm3 + mulss %xmm4, %xmm3 + addss %xmm3, %xmm11 + + addq $1 * 
SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, X1 + ALIGN_3 + +.L30X: + testq $2 * SIZE, A1 + je .L30XX + +#ifdef movsd + xorps %xmm0, %xmm0 + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(A1, LDA), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd -32 * SIZE(A2), %xmm2 + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd -32 * SIZE(A2, LDA), %xmm3 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm11 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_3 + +.L30XX: + movaps -35 * SIZE(A1, LDA), %xmm12 + movaps -34 * SIZE(A2), %xmm13 + movaps -33 * SIZE(A2, LDA), %xmm14 + + MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) + MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) + +#ifdef PREFETCHW + PREFETCHW 4 * SIZE(Y1) +#endif + + movq MM, I + sarq $4, I + jle .L305 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A2 (-31 * SIZE, A1, LDA, 1, %xmm1) + MOVUPS_A1 (-30 * SIZE, A2, %xmm2) + MOVUPS_A2 (-29 * SIZE, A2, LDA, 1, %xmm3) + + decq I + jle .L303 + ALIGN_4 + +.L302: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A1, %xmm0) + movss %xmm1, %xmm12 + shufps $0x93, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-27 * SIZE, A1, LDA, 1, %xmm12) + + shufps $0x4e, %xmm2, %xmm13 + mulps %xmm4, %xmm13 + addps %xmm13, %xmm10 + MOVUPS_A1 (-26 * SIZE, A2, %xmm13) + movss %xmm3, %xmm14 + shufps $0x39, %xmm14, %xmm14 + mulps %xmm4, %xmm14 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm14, %xmm11 + MOVUPS_A2 (-25 * SIZE, A2, LDA, 1, %xmm14) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1, LDA) +#endif + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + movss %xmm12, %xmm1 + shufps $0x93, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-23 * SIZE, A1, LDA, 1, %xmm1) + + shufps $0x4e, %xmm13, %xmm2 + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A1 (-22 * SIZE, A2, %xmm2) + movss %xmm14, %xmm3 + shufps $0x39, %xmm3, %xmm3 + mulps %xmm5, %xmm3 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm3, %xmm11 + MOVUPS_A2 (-21 * SIZE, A2, LDA, 1, %xmm3) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) +#endif + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-20 * SIZE, A1, %xmm0) + movss %xmm1, %xmm12 + shufps $0x93, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-19 * SIZE, A1, LDA, 1, %xmm12) + + shufps $0x4e, %xmm2, %xmm13 + mulps %xmm4, %xmm13 + addps %xmm13, %xmm10 + MOVUPS_A1 (-18 * SIZE, A2, %xmm13) + movss %xmm3, %xmm14 + shufps $0x39, %xmm14, %xmm14 + mulps %xmm4, %xmm14 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm14, %xmm11 + MOVUPS_A2 (-17 * SIZE, A2, LDA, 1, %xmm14) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2, LDA) +#endif + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-16 * SIZE, A1, %xmm0) + movss %xmm12, %xmm1 + shufps $0x93, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-15 * SIZE, A1, LDA, 1, %xmm1) + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1) +#endif + + shufps $0x4e, %xmm13, %xmm2 + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A1 (-14 * SIZE, A2, %xmm2) + movss %xmm14, %xmm3 + shufps $0x39, %xmm3, %xmm3 + mulps %xmm5, %xmm3 + MOVUPS_XL1 (-12 * 
SIZE, X1, %xmm5) + addps %xmm3, %xmm11 + MOVUPS_A2 (-13 * SIZE, A2, LDA, 1, %xmm3) + + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + addq $16 * SIZE, X1 + + decq I + jg .L302 + ALIGN_4 + +.L303: + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A1, %xmm0) + movss %xmm1, %xmm12 + shufps $0x93, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-27 * SIZE, A1, LDA, 1, %xmm12) + + shufps $0x4e, %xmm2, %xmm13 + mulps %xmm4, %xmm13 + addps %xmm13, %xmm10 + MOVUPS_A1 (-26 * SIZE, A2, %xmm13) + movss %xmm3, %xmm14 + shufps $0x39, %xmm14, %xmm14 + mulps %xmm4, %xmm14 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm14, %xmm11 + MOVUPS_A2 (-25 * SIZE, A2, LDA, 1, %xmm14) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + movss %xmm12, %xmm1 + shufps $0x93, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-23 * SIZE, A1, LDA, 1, %xmm1) + + shufps $0x4e, %xmm13, %xmm2 + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A1 (-22 * SIZE, A2, %xmm2) + movss %xmm14, %xmm3 + shufps $0x39, %xmm3, %xmm3 + mulps %xmm5, %xmm3 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm3, %xmm11 + MOVUPS_A2 (-21 * SIZE, A2, LDA, 1, %xmm3) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-20 * SIZE, A1, %xmm0) + movss %xmm1, %xmm12 + shufps $0x93, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-19 * SIZE, A1, LDA, 1, %xmm12) + + shufps $0x4e, %xmm2, %xmm13 + mulps %xmm4, %xmm13 + addps %xmm13, %xmm10 + MOVUPS_A1 (-18 * SIZE, A2, %xmm13) + movss %xmm3, %xmm14 + shufps $0x39, %xmm14, %xmm14 + mulps %xmm4, %xmm14 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm14, %xmm11 + MOVUPS_A2 (-17 * SIZE, A2, LDA, 1, %xmm14) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + movss %xmm12, %xmm1 + shufps $0x93, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + + shufps $0x4e, %xmm13, %xmm2 + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + movss %xmm14, %xmm3 + shufps $0x39, %xmm3, %xmm3 + mulps %xmm5, %xmm3 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm3, %xmm11 + + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + addq $16 * SIZE, X1 + ALIGN_4 + +.L305: + testq $8, MM + jle .L306 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A2 (-31 * SIZE, A1, LDA, 1, %xmm1) + MOVUPS_A1 (-30 * SIZE, A2, %xmm2) + MOVUPS_A2 (-29 * SIZE, A2, LDA, 1, %xmm3) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A1, %xmm0) + movss %xmm1, %xmm12 + shufps $0x93, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-27 * SIZE, A1, LDA, 1, %xmm12) + + shufps $0x4e, %xmm2, %xmm13 + mulps %xmm4, %xmm13 + addps %xmm13, %xmm10 + MOVUPS_A1 (-26 * SIZE, A2, %xmm13) + movss %xmm3, %xmm14 + shufps $0x39, %xmm14, %xmm14 + mulps %xmm4, %xmm14 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm14, %xmm11 + MOVUPS_A2 (-25 * SIZE, A2, LDA, 1, %xmm14) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + movss %xmm12, %xmm1 + shufps $0x93, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + + shufps $0x4e, %xmm13, %xmm2 + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + movss %xmm14, %xmm3 + shufps $0x39, %xmm3, %xmm3 + mulps %xmm5, %xmm3 + addps %xmm3, %xmm11 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + ALIGN_4 + +.L306: + testq $4, MM + jle .L307 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A2 (-31 * SIZE, A1, LDA, 1, %xmm1) + MOVUPS_A1 (-30 * SIZE, A2, %xmm2) + MOVUPS_A2 (-29 * SIZE, A2, LDA, 1, %xmm3) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + movss %xmm1, %xmm12 + shufps $0x93, %xmm1, %xmm12 + mulps %xmm4, 
%xmm12 + addps %xmm12, %xmm9 + + shufps $0x4e, %xmm2, %xmm13 + mulps %xmm4, %xmm13 + addps %xmm13, %xmm10 + movss %xmm3, %xmm14 + shufps $0x39, %xmm14, %xmm14 + mulps %xmm4, %xmm14 + addps %xmm14, %xmm11 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, X1 + ALIGN_4 + +.L307: + testq $2, MM + jle .L308 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd -32 * SIZE(A1), %xmm0 +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(A1, LDA), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd -32 * SIZE(A2), %xmm2 + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd -32 * SIZE(A2, LDA), %xmm3 + mulps %xmm4, %xmm3 + addps %xmm3, %xmm11 + shufps $0xe, %xmm4, %xmm4 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_4 + +.L308: + testq $1, MM + jle .L309 + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + movss -32 * SIZE(A1, LDA), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm9 + movss -32 * SIZE(A2), %xmm2 + mulss %xmm4, %xmm2 + addss %xmm2, %xmm10 + movss -32 * SIZE(A2, LDA), %xmm3 + mulss %xmm4, %xmm3 + addss %xmm3, %xmm11 + ALIGN_4 + +.L309: +#ifdef HAVE_SSE3 + haddps %xmm9, %xmm8 + haddps %xmm11, %xmm10 + haddps %xmm10, %xmm8 + + pshufd $0x1, %xmm8, %xmm9 + pshufd $0x2, %xmm8, %xmm10 + pshufd $0x3, %xmm8, %xmm11 +#else + movaps %xmm8, %xmm0 + unpcklps %xmm9, %xmm8 + unpckhps %xmm9, %xmm0 + + movaps %xmm10, %xmm1 + unpcklps %xmm11, %xmm10 + unpckhps %xmm11, %xmm1 + + movaps %xmm8, %xmm9 + unpcklps %xmm10, %xmm8 + unpckhps %xmm10, %xmm9 + + movaps %xmm0, %xmm10 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm10 + + addps %xmm9, %xmm8 + addps %xmm0, %xmm10 + addps %xmm10, %xmm8 + + pshufd $0x2, %xmm8, %xmm9 + pshufd $0x1, %xmm8, %xmm10 + pshufd $0x3, %xmm8, %xmm11 +#endif + + mulss ALPHA, %xmm8 + mulss ALPHA, %xmm9 + mulss ALPHA, %xmm10 + mulss ALPHA, %xmm11 + + addss (Y), %xmm8 + addq INCY, Y + addss (Y), %xmm9 + addq INCY, Y + addss (Y), %xmm10 + addq INCY, Y + addss (Y), %xmm11 + addq INCY, Y + + movss %xmm8, (Y1) + addq INCY, Y1 + movss %xmm9, (Y1) + addq INCY, Y1 + movss %xmm10, (Y1) + addq INCY, Y1 + movss %xmm11, (Y1) + addq INCY, Y1 + + cmpq $4, N + jge .L301 + ALIGN_4 + +.L310: + testq N, N + jle .L999 + + cmpq $3, N + jne .L320 + + leaq 32 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA, 2), A2 + leaq (A1, LDA, 4), A + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + + cmpq $3, M + jle .L317 + + testq $SIZE, A1 + je .L31X + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + movss -32 * SIZE(A1, LDA), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm9 + movss -32 * SIZE(A2), %xmm2 + mulss %xmm4, %xmm2 + addss %xmm2, %xmm10 + + addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, X1 + ALIGN_3 + +.L31X: + testq $2 * SIZE, A1 + je .L31XX + +#ifdef movsd + xorps %xmm0, %xmm0 + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(A1, LDA), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd -32 * SIZE(A2), %xmm2 + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_3 + +.L31XX: + 
movaps -35 * SIZE(A1, LDA), %xmm12 + movaps -34 * SIZE(A2), %xmm13 + + MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) + MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) + + movq MM, I + sarq $4, I + jle .L315 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A2 (-31 * SIZE, A1, LDA, 1, %xmm1) + MOVUPS_A1 (-30 * SIZE, A2, %xmm2) + + decq I + jle .L313 + ALIGN_4 + +.L312: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A1, %xmm0) + movss %xmm1, %xmm12 + shufps $0x93, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-27 * SIZE, A1, LDA, 1, %xmm12) + + shufps $0x4e, %xmm2, %xmm13 + mulps %xmm4, %xmm13 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm13, %xmm10 + MOVUPS_A1 (-26 * SIZE, A2, %xmm13) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A1, LDA) +#endif + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + movss %xmm12, %xmm1 + shufps $0x93, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-23 * SIZE, A1, LDA, 1, %xmm1) + shufps $0x4e, %xmm13, %xmm2 + mulps %xmm5, %xmm2 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm2, %xmm10 + MOVUPS_A1 (-22 * SIZE, A2, %xmm2) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(A2) +#endif + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-20 * SIZE, A1, %xmm0) + movss %xmm1, %xmm12 + shufps $0x93, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-19 * SIZE, A1, LDA, 1, %xmm12) + shufps $0x4e, %xmm2, %xmm13 + mulps %xmm4, %xmm13 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm13, %xmm10 + MOVUPS_A1 (-18 * SIZE, A2, %xmm13) + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 3 - 128 + PREOFFSET(X1) +#endif + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-16 * SIZE, A1, %xmm0) + movss %xmm12, %xmm1 + shufps $0x93, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-15 * SIZE, A1, LDA, 1, %xmm1) + shufps $0x4e, %xmm13, %xmm2 + mulps %xmm5, %xmm2 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm2, %xmm10 + MOVUPS_A1 (-14 * SIZE, A2, %xmm2) + + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + addq $16 * SIZE, X1 + + decq I + jg .L312 + ALIGN_4 + +.L313: + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A1, %xmm0) + movss %xmm1, %xmm12 + shufps $0x93, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-27 * SIZE, A1, LDA, 1, %xmm12) + + shufps $0x4e, %xmm2, %xmm13 + mulps %xmm4, %xmm13 + addps %xmm13, %xmm10 + MOVUPS_A1 (-26 * SIZE, A2, %xmm13) + movss %xmm3, %xmm14 + shufps $0x39, %xmm14, %xmm14 + mulps %xmm4, %xmm14 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm14, %xmm11 + MOVUPS_A2 (-25 * SIZE, A2, LDA, 1, %xmm14) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + movss %xmm12, %xmm1 + shufps $0x93, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + MOVUPS_A2 (-23 * SIZE, A1, LDA, 1, %xmm1) + + shufps $0x4e, %xmm13, %xmm2 + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + MOVUPS_A1 (-22 * SIZE, A2, %xmm2) + movss %xmm14, %xmm3 + shufps $0x39, %xmm3, %xmm3 + mulps %xmm5, %xmm3 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm3, %xmm11 + MOVUPS_A2 (-21 * SIZE, A2, LDA, 1, %xmm3) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-20 * SIZE, A1, %xmm0) + movss %xmm1, %xmm12 + shufps $0x93, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-19 * SIZE, A1, LDA, 1, %xmm12) + + shufps $0x4e, %xmm2, %xmm13 + mulps %xmm4, %xmm13 + addps %xmm13, 
%xmm10 + MOVUPS_A1 (-18 * SIZE, A2, %xmm13) + movss %xmm3, %xmm14 + shufps $0x39, %xmm14, %xmm14 + mulps %xmm4, %xmm14 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm14, %xmm11 + MOVUPS_A2 (-17 * SIZE, A2, LDA, 1, %xmm14) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + movss %xmm12, %xmm1 + shufps $0x93, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + + shufps $0x4e, %xmm13, %xmm2 + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + movss %xmm14, %xmm3 + shufps $0x39, %xmm3, %xmm3 + mulps %xmm5, %xmm3 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm3, %xmm11 + + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + addq $16 * SIZE, X1 + ALIGN_4 + +.L315: + testq $8, MM + jle .L316 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A2 (-31 * SIZE, A1, LDA, 1, %xmm1) + MOVUPS_A1 (-30 * SIZE, A2, %xmm2) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A1, %xmm0) + movss %xmm1, %xmm12 + shufps $0x93, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + MOVUPS_A2 (-27 * SIZE, A1, LDA, 1, %xmm12) + + shufps $0x4e, %xmm2, %xmm13 + mulps %xmm4, %xmm13 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm13, %xmm10 + MOVUPS_A1 (-26 * SIZE, A2, %xmm13) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + movss %xmm12, %xmm1 + shufps $0x93, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + + shufps $0x4e, %xmm13, %xmm2 + mulps %xmm5, %xmm2 + addps %xmm2, %xmm10 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + ALIGN_4 + +.L316: + testq $4, MM + jle .L317 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A2 (-31 * SIZE, A1, LDA, 1, %xmm1) + MOVUPS_A1 (-30 * SIZE, A2, %xmm2) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + movss %xmm1, %xmm12 + shufps $0x93, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + + shufps $0x4e, %xmm2, %xmm13 + mulps %xmm4, %xmm13 + addps %xmm13, %xmm10 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, X1 + ALIGN_4 + +.L317: + testq $2, MM + jle .L318 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd -32 * SIZE(A1), %xmm0 +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(A1, LDA), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd -32 * SIZE(A2), %xmm2 + mulps %xmm4, %xmm2 + addps %xmm2, %xmm10 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_4 + +.L318: + testq $1, MM + jle .L319 + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + movss -32 * SIZE(A1, LDA), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm9 + movss -32 * SIZE(A2), %xmm2 + mulss %xmm4, %xmm2 + addss %xmm2, %xmm10 + ALIGN_4 + +.L319: +#ifdef HAVE_SSE3 + haddps %xmm9, %xmm8 + haddps %xmm11, %xmm10 + haddps %xmm10, %xmm8 + + pshufd $0x1, %xmm8, %xmm9 + pshufd $0x2, %xmm8, %xmm10 +#else + movaps %xmm8, %xmm0 + unpcklps %xmm9, %xmm8 + unpckhps %xmm9, %xmm0 + + movaps %xmm10, %xmm1 + unpcklps %xmm11, %xmm10 + unpckhps %xmm11, %xmm1 + + movaps %xmm8, %xmm9 + unpcklps %xmm10, %xmm8 + unpckhps %xmm10, %xmm9 + + movaps %xmm0, %xmm10 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm10 + + addps %xmm9, %xmm8 + addps %xmm0, %xmm10 + addps %xmm10, %xmm8 + + pshufd $0x2, %xmm8, %xmm9 + pshufd $0x1, %xmm8, %xmm10 +#endif + + mulss ALPHA, %xmm8 + mulss ALPHA, %xmm9 + mulss ALPHA, %xmm10 + + addss (Y), %xmm8 + addq INCY, Y + addss (Y), %xmm9 + addq INCY, Y + addss (Y), %xmm10 + + movss %xmm8, (Y1) + addq INCY, Y1 + movss %xmm9, (Y1) + 
addq INCY, Y1 + movss %xmm10, (Y1) + jmp .L999 + ALIGN_3 + +.L320: + cmpq $2, N + jne .L330 + + leaq 32 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA), A2 + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + + cmpq $3, M + jle .L327 + + testq $SIZE, A1 + je .L32X + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + movss -32 * SIZE(A2), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm9 + + addq $1 * SIZE, A1 + addq $1 * SIZE, A2 + addq $1 * SIZE, X1 + ALIGN_3 + +.L32X: + testq $2 * SIZE, A1 + je .L32XX + +#ifdef movsd + xorps %xmm0, %xmm0 + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(A2), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_3 + +.L32XX: + movaps -35 * SIZE(A2), %xmm12 + + MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) + MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) + + movq MM, I + sarq $4, I + jle .L325 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A1 (-31 * SIZE, A2, %xmm1) + + decq I + jle .L323 + ALIGN_4 + +.L322: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A1, %xmm0) + movss %xmm1, %xmm12 + shufps $0x93, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm12, %xmm9 + MOVUPS_A1 (-27 * SIZE, A2, %xmm12) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + movss %xmm12, %xmm1 + shufps $0x93, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm1, %xmm9 + MOVUPS_A1 (-23 * SIZE, A2, %xmm1) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A2) +#endif + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-20 * SIZE, A1, %xmm0) + movss %xmm1, %xmm12 + shufps $0x93, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm12, %xmm9 + MOVUPS_A1 (-19 * SIZE, A2, %xmm12) + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1) +#endif + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-16 * SIZE, A1, %xmm0) + movss %xmm12, %xmm1 + shufps $0x93, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm1, %xmm9 + MOVUPS_A1 (-15 * SIZE, A2, %xmm1) + + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + addq $16 * SIZE, X1 + + decq I + jg .L322 + ALIGN_4 + +.L323: + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A1, %xmm0) + movss %xmm1, %xmm12 + shufps $0x93, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm12, %xmm9 + MOVUPS_A1 (-27 * SIZE, A2, %xmm12) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + movss %xmm12, %xmm1 + shufps $0x93, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm1, %xmm9 + MOVUPS_A1 (-23 * SIZE, A2, %xmm1) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-20 * SIZE, A1, %xmm0) + movss %xmm1, %xmm12 + shufps $0x93, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm12, %xmm9 + MOVUPS_A1 (-19 * SIZE, A2, %xmm12) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + movss %xmm12, %xmm1 + shufps $0x93, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm1, %xmm9 + + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + addq $16 * SIZE, X1 + ALIGN_4 + +.L325: + testq $8, MM + 
jle .L326 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A1 (-31 * SIZE, A2, %xmm1) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + MOVUPS_A1 (-28 * SIZE, A1, %xmm0) + movss %xmm1, %xmm12 + shufps $0x93, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm12, %xmm9 + MOVUPS_A1 (-27 * SIZE, A2, %xmm12) + + mulps %xmm5, %xmm0 + addps %xmm0, %xmm8 + movss %xmm12, %xmm1 + shufps $0x93, %xmm12, %xmm1 + mulps %xmm5, %xmm1 + addps %xmm1, %xmm9 + + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + addq $8 * SIZE, X1 + ALIGN_4 + +.L326: + testq $4, MM + jle .L327 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A1 (-31 * SIZE, A2, %xmm1) + + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + movss %xmm1, %xmm12 + shufps $0x93, %xmm1, %xmm12 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm9 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, X1 + ALIGN_4 + +.L327: + testq $2, MM + jle .L328 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd -32 * SIZE(A1), %xmm0 +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(A2), %xmm1 + mulps %xmm4, %xmm1 + addps %xmm1, %xmm9 + shufps $0xe, %xmm4, %xmm4 + + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + addq $2 * SIZE, X1 + ALIGN_4 + +.L328: + testq $1, MM + jle .L329 + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + movss -32 * SIZE(A2), %xmm1 + mulss %xmm4, %xmm1 + addss %xmm1, %xmm9 + ALIGN_4 + +.L329: +#ifdef HAVE_SSE3 + haddps %xmm9, %xmm8 + haddps %xmm8, %xmm8 +#else + movaps %xmm8, %xmm10 + unpcklps %xmm9, %xmm8 + unpckhps %xmm9, %xmm10 + + addps %xmm10, %xmm8 + movhlps %xmm8, %xmm9 + addps %xmm9, %xmm8 +#endif + + pshufd $0x1, %xmm8, %xmm9 + + mulss ALPHA, %xmm8 + mulss ALPHA, %xmm9 + + addss (Y), %xmm8 + addq INCY, Y + addss (Y), %xmm9 + addq INCY, Y + + movss %xmm8, (Y1) + addq INCY, Y1 + movss %xmm9, (Y1) + addq INCY, Y1 + jmp .L999 + ALIGN_4 + +.L330: + cmpq $1, N + jne .L999 + + leaq 32 * SIZE(BUFFER), X1 + + movq A, A1 + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + + cmpq $3, M + jle .L337 + + testq $SIZE, A1 + je .L33X + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + + addq $1 * SIZE, A1 + addq $1 * SIZE, X1 + ALIGN_3 + +.L33X: + testq $2 * SIZE, A1 + je .L33XX + +#ifdef movsd + xorps %xmm0, %xmm0 + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(A1), %xmm0 + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + shufps $0xe, %xmm4, %xmm4 + + addq $2 * SIZE, A1 + addq $2 * SIZE, X1 + ALIGN_3 + +.L33XX: + + MOVUPS_XL1 (-32 * SIZE, X1, %xmm4) + MOVUPS_XL1 (-28 * SIZE, X1, %xmm5) + + movq MM, I + sarq $4, I + jle .L335 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + MOVUPS_A1 (-28 * SIZE, A1, %xmm12) + + decq I + jle .L333 + ALIGN_4 + +.L332: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(A1) +#endif + + mulps %xmm4, %xmm0 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + + mulps %xmm5, %xmm12 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm12, %xmm9 + MOVUPS_A1 (-20 * SIZE, A1, %xmm12) + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 8 - 128 + PREOFFSET(X1) +#endif + + mulps %xmm4, %xmm0 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm0, %xmm8 + MOVUPS_A1 (-16 * SIZE, A1, %xmm0) + + mulps %xmm5, %xmm12 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm12, %xmm9 + MOVUPS_A1 (-12 * SIZE, A1, %xmm12) + + addq $16 * SIZE, A1 + addq $16 
* SIZE, X1 + + decq I + jg .L332 + ALIGN_4 + +.L333: + mulps %xmm4, %xmm0 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm0, %xmm8 + MOVUPS_A1 (-24 * SIZE, A1, %xmm0) + + mulps %xmm5, %xmm12 + MOVUPS_XL1 (-20 * SIZE, X1, %xmm5) + addps %xmm12, %xmm9 + MOVUPS_A1 (-20 * SIZE, A1, %xmm12) + + mulps %xmm4, %xmm0 + MOVUPS_XL1 (-16 * SIZE, X1, %xmm4) + addps %xmm0, %xmm8 + + mulps %xmm5, %xmm12 + MOVUPS_XL1 (-12 * SIZE, X1, %xmm5) + addps %xmm12, %xmm9 + + addq $16 * SIZE, A1 + addq $16 * SIZE, X1 + ALIGN_4 + +.L335: + testq $8, MM + jle .L336 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm0 + MOVUPS_XL1 (-24 * SIZE, X1, %xmm4) + addps %xmm0, %xmm8 + + MOVUPS_A1 (-28 * SIZE, A1, %xmm12) + mulps %xmm5, %xmm12 + addps %xmm12, %xmm9 + + addq $8 * SIZE, A1 + addq $8 * SIZE, X1 + ALIGN_4 + +.L336: + testq $4, MM + jle .L337 + + MOVUPS_A1 (-32 * SIZE, A1, %xmm0) + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + + addq $4 * SIZE, A1 + addq $4 * SIZE, X1 + ALIGN_4 + +.L337: + testq $2, MM + jle .L338 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd -32 * SIZE(A1), %xmm0 +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X1), %xmm4 + mulps %xmm4, %xmm0 + addps %xmm0, %xmm8 + shufps $0xe, %xmm4, %xmm4 + + addq $2 * SIZE, A1 + addq $2 * SIZE, X1 + ALIGN_4 + +.L338: + testq $1, MM + jle .L339 + + movss -32 * SIZE(A1), %xmm0 + movss -32 * SIZE(X1), %xmm4 + mulss %xmm4, %xmm0 + addss %xmm0, %xmm8 + ALIGN_4 + +.L339: + addps %xmm9, %xmm8 + +#ifdef HAVE_SSE3 + haddps %xmm8, %xmm8 + haddps %xmm8, %xmm8 +#else + pshufd $1, %xmm8, %xmm9 + pshufd $2, %xmm8, %xmm10 + pshufd $3, %xmm8, %xmm11 + + addss %xmm9, %xmm8 + addss %xmm11, %xmm10 + addss %xmm10, %xmm8 +#endif + + mulss ALPHA, %xmm8 + + addss (Y), %xmm8 + movss %xmm8, (Y1) + + jmp .L999 +#endif + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + ALIGN_4 + + EPILOGUE diff --git a/kernel/x86_64/staticbuffer.S b/kernel/x86_64/staticbuffer.S new file mode 100644 index 0000000000..7bbd23d891 --- /dev/null +++ b/kernel/x86_64/staticbuffer.S @@ -0,0 +1,45 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef ALLOC_STATIC + .align 8 + .comm alloc_area, (NUM_BUFFERS * BUFFER_SIZE), 16384 +#endif diff --git a/kernel/x86_64/swap.S b/kernel/x86_64/swap.S new file mode 100644 index 0000000000..50a7fb5570 --- /dev/null +++ b/kernel/x86_64/swap.S @@ -0,0 +1,439 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef WINDOWS_ABI +#define N ARG1 +#define X ARG4 +#define INCX ARG5 +#define Y ARG6 +#define INCY ARG2 +#else +#define N ARG1 +#define X ARG2 +#define INCX ARG3 +#define Y ARG4 +#define INCY %rbx +#endif + +#define XX %r10 +#define YY %r11 + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifndef WINDOWS_ABI +#ifndef XDOUBLE + movq 8(%rsp), INCY +#else + movq 24(%rsp), INCY +#endif +#else + pushq %rbx + + movq 48(%rsp), X + movq 56(%rsp), INCX + movq 64(%rsp), Y + movq 72(%rsp), INCY +#endif + + EMMS + + salq $BASE_SHIFT, INCX + salq $BASE_SHIFT, INCY + + cmpq $SIZE, INCX + jne .L14 + cmpq $SIZE, INCY + jne .L14 + + movq N, %rax + sarq $3, %rax + jle .L15 + ALIGN_3 + +.L16: +#ifdef XDOUBLE + movq 0(X), %mm0 + movq 8(X), %mm1 + movq 16(X), %mm2 + movq 24(X), %mm3 + movq 0(Y), %mm4 + movq 8(Y), %mm5 + movq 16(Y), %mm6 + movq 24(Y), %mm7 + + movq %mm4, 0(X) + movq %mm5, 8(X) + movq %mm6, 16(X) + movq %mm7, 24(X) + movq %mm0, 0(Y) + movq %mm1, 8(Y) + movq %mm2, 16(Y) + movq %mm3, 24(Y) + + movq 32(X), %mm0 + movq 40(X), %mm1 + movq 48(X), %mm2 + movq 56(X), %mm3 + movq 32(Y), %mm4 + movq 40(Y), %mm5 + movq 48(Y), %mm6 + movq 56(Y), %mm7 + + movq %mm4, 32(X) + movq %mm5, 40(X) + movq %mm6, 48(X) + movq %mm7, 56(X) + movq %mm0, 32(Y) + movq %mm1, 40(Y) + movq %mm2, 48(Y) + movq %mm3, 56(Y) + + movq 64(X), %mm0 + movq 72(X), %mm1 + movq 80(X), %mm2 + movq 88(X), %mm3 + movq 64(Y), %mm4 + movq 72(Y), %mm5 + movq 80(Y), %mm6 + movq 88(Y), %mm7 + + movq %mm4, 64(X) + movq %mm5, 72(X) + movq %mm6, 80(X) + movq %mm7, 88(X) + movq %mm0, 64(Y) + movq %mm1, 72(Y) + movq %mm2, 80(Y) + movq %mm3, 88(Y) + + movq 96(X), %mm0 + movq 104(X), %mm1 + movq 112(X), %mm2 + movq 120(X), %mm3 + movq 96(Y), %mm4 + movq 104(Y), %mm5 + movq 112(Y), %mm6 + movq 120(Y), %mm7 + + movq %mm4, 96(X) + movq %mm5, 104(X) + movq %mm6, 112(X) + movq %mm7, 120(X) + movq %mm0, 96(Y) + movq %mm1, 104(Y) + movq %mm2, 112(Y) + movq %mm3, 120(Y) + +#elif defined(DOUBLE) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movq 0 * SIZE(X), %mm0 + movq 1 * SIZE(X), %mm1 + movq 2 * SIZE(X), %mm2 + movq 3 * SIZE(X), %mm3 + movq 0 * SIZE(Y), %mm4 + movq 1 * SIZE(Y), %mm5 + movq 2 * SIZE(Y), %mm6 + movq 3 * SIZE(Y), %mm7 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movq %mm4, 0 * SIZE(X) + movq %mm5, 1 * SIZE(X) + movq %mm6, 2 * SIZE(X) + movq %mm7, 3 * SIZE(X) + movq %mm0, 0 * SIZE(Y) + movq %mm1, 1 * SIZE(Y) + movq %mm2, 2 * SIZE(Y) + movq %mm3, 3 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movq 4 * SIZE(X), %mm0 + movq 5 * SIZE(X), %mm1 + movq 6 * SIZE(X), %mm2 + movq 7 * SIZE(X), %mm3 + movq 4 * SIZE(Y), %mm4 + movq 5 * SIZE(Y), %mm5 + movq 6 * SIZE(Y), %mm6 + movq 7 * SIZE(Y), %mm7 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movq %mm4, 4 * SIZE(X) + movq %mm5, 5 * SIZE(X) + movq %mm6, 6 * SIZE(X) + movq %mm7, 7 * SIZE(X) + movq %mm0, 4 * SIZE(Y) + movq %mm1, 5 * SIZE(Y) + movq %mm2, 6 * SIZE(Y) + movq %mm3, 7 * SIZE(Y) + +#else +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + movq 0 * SIZE(X), %mm0 + movq 2 * SIZE(X), %mm1 + movq 4 * SIZE(X), %mm2 + movq 6 * SIZE(X), %mm3 + movq 0 * SIZE(Y), %mm4 + movq 2 * SIZE(Y), %mm5 + movq 4 * SIZE(Y), %mm6 + movq 6 * SIZE(Y), %mm7 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movq 
%mm4, 0 * SIZE(X) + movq %mm5, 2 * SIZE(X) + movq %mm6, 4 * SIZE(X) + movq %mm7, 6 * SIZE(X) + + movq %mm0, 0 * SIZE(Y) + movq %mm1, 2 * SIZE(Y) + movq %mm2, 4 * SIZE(Y) + movq %mm3, 6 * SIZE(Y) +#endif + + addq $8 * SIZE, X + addq $8 * SIZE, Y + decq %rax + jg .L16 + ALIGN_3 + +.L15: + movq N, %rax + andq $7, %rax + jle .L27 + ALIGN_3 + +.L22: + +#ifdef XDOUBLE + movq 0(X), %mm0 + movq 8(X), %mm1 + movq 0(Y), %mm4 + movq 8(Y), %mm5 + + movq %mm4, 0(X) + movq %mm5, 8(X) + movq %mm0, 0(Y) + movq %mm1, 8(Y) +#else + MOVQ 0 * SIZE(X), %mm0 + MOVQ 0 * SIZE(Y), %mm4 + MOVQ %mm4, 0 * SIZE(X) + MOVQ %mm0, 0 * SIZE(Y) +#endif + + addq $SIZE, X + addq $SIZE, Y + decq %rax + jg .L22 + jmp .L27 + ALIGN_3 + +/* INCX != 1 or INCY != 1 */ + +.L14: + movq N, %rax + movq X, XX + movq Y, YY + sarq $2, %rax + jle .L28 + ALIGN_2 + +.L29: +#ifdef XDOUBLE + movq 0(X), %mm0 + movq 8(X), %mm1 + addq INCX, X + movq 0(Y), %mm4 + movq 8(Y), %mm5 + addq INCY, Y + + movq %mm4, 0(XX) + movq %mm5, 8(XX) + addq INCX, XX + movq %mm0, 0(YY) + movq %mm1, 8(YY) + addq INCY, YY + + movq 0(X), %mm0 + movq 8(X), %mm1 + addq INCX, X + movq 0(Y), %mm4 + movq 8(Y), %mm5 + addq INCY, Y + + movq %mm4, 0(XX) + movq %mm5, 8(XX) + addq INCX, XX + movq %mm0, 0(YY) + movq %mm1, 8(YY) + addq INCY, YY + + movq 0(X), %mm0 + movq 8(X), %mm1 + addq INCX, X + movq 0(Y), %mm4 + movq 8(Y), %mm5 + addq INCY, Y + + movq %mm4, 0(XX) + movq %mm5, 8(XX) + addq INCX, XX + movq %mm0, 0(YY) + movq %mm1, 8(YY) + addq INCY, YY + + movq 0(X), %mm0 + movq 8(X), %mm1 + addq INCX, X + movq 0(Y), %mm4 + movq 8(Y), %mm5 + addq INCY, Y + + movq %mm4, 0(XX) + movq %mm5, 8(XX) + addq INCX, XX + movq %mm0, 0(YY) + movq %mm1, 8(YY) + addq INCY, YY +#else + MOVQ (X), %mm0 + addq INCX, X + MOVQ (X), %mm1 + addq INCX, X + MOVQ (X), %mm2 + addq INCX, X + MOVQ (X), %mm3 + addq INCX, X + + MOVQ (Y), %mm4 + addq INCY, Y + MOVQ (Y), %mm5 + addq INCY, Y + MOVQ (Y), %mm6 + addq INCY, Y + MOVQ (Y), %mm7 + addq INCY, Y + + MOVQ %mm4, (XX) + addq INCX, XX + MOVQ %mm5, (XX) + addq INCX, XX + MOVQ %mm6, (XX) + addq INCX, XX + MOVQ %mm7, (XX) + addq INCX, XX + + MOVQ %mm0, (YY) + addq INCY, YY + MOVQ %mm1, (YY) + addq INCY, YY + MOVQ %mm2, (YY) + addq INCY, YY + MOVQ %mm3, (YY) + addq INCY, YY +#endif + + decq %rax + jg .L29 + ALIGN_3 + +.L28: + movq N, %rax + andq $3, %rax + jle .L27 + ALIGN_3 + +.L35: +#ifdef XDOUBLE + movq 0(X), %mm0 + movq 8(X), %mm1 + movq 0(Y), %mm4 + movq 8(Y), %mm5 + + movq %mm4, 0(X) + movq %mm5, 8(X) + movq %mm0, 0(Y) + movq %mm1, 8(Y) +#else + MOVQ (X), %mm0 + MOVQ (Y), %mm4 + + MOVQ %mm4, (X) + MOVQ %mm0, (Y) +#endif + addq INCX, X + addq INCY, Y + + decq %rax + jg .L35 + ALIGN_3 + +.L27: + EMMS + xorq %rax,%rax + +#ifdef WINDOWS_ABI + popq %rbx +#endif + + ret + + EPILOGUE + diff --git a/kernel/x86_64/swap_sse.S b/kernel/x86_64/swap_sse.S new file mode 100644 index 0000000000..5702870513 --- /dev/null +++ b/kernel/x86_64/swap_sse.S @@ -0,0 +1,1160 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef WINDOWS_ABI +#define M ARG1 +#define X ARG4 +#define INCX ARG5 +#define Y ARG6 +#define INCY ARG2 +#else +#define M ARG1 +#define X ARG2 +#define INCX ARG3 +#define Y ARG4 +#define INCY %rbx +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifndef WINDOWS_ABI + movq 8(%rsp), INCY +#else + pushq %rbx + + movq 48(%rsp), X + movq 56(%rsp), INCX + movq 64(%rsp), Y + movq 72(%rsp), INCY +#endif + + SAVEREGISTERS + + leaq (, INCX, SIZE), INCX + leaq (, INCY, SIZE), INCY + + cmpq $SIZE, INCX + jne .L50 + cmpq $SIZE, INCY + jne .L50 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + + cmpq $3, M + jle .L16 + + testq $SIZE, Y + je .L05 + + movss -32 * SIZE(X), %xmm0 + movss -32 * SIZE(Y), %xmm1 + + movss %xmm1, -32 * SIZE(X) + movss %xmm0, -32 * SIZE(Y) + + addq $1 * SIZE, X + addq $1 * SIZE, Y + decq M + ALIGN_3 + +.L05: + testq $2 * SIZE, Y + je .L10 + + movsd -32 * SIZE(X), %xmm0 + movsd -32 * SIZE(Y), %xmm1 + + movlps %xmm1, -32 * SIZE(X) + movlps %xmm0, -32 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + subq $2, M + jle .L19 + ALIGN_3 + +.L10: + cmpq $3, M + jle .L16 + + testq $2 * SIZE, X + jne .L30 + + testq $1 * SIZE, X + jne .L20 + + movq M, %rax + sarq $5, %rax + jle .L13 + ALIGN_3 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -32 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -32 * SIZE(X) + + movaps -28 * SIZE(X), %xmm0 + movaps -28 * SIZE(Y), %xmm1 + + movaps %xmm0, -28 * SIZE(Y) + movaps %xmm1, -28 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -24 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movaps %xmm0, -24 * SIZE(Y) + movaps %xmm1, -24 * SIZE(X) + + movaps -20 * SIZE(X), %xmm0 + movaps -20 * SIZE(Y), %xmm1 + + movaps %xmm0, -20 * SIZE(Y) + movaps %xmm1, -20 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + 
+ movaps -12 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + movaps %xmm0, -12 * SIZE(Y) + movaps %xmm1, -12 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -8 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + movaps %xmm0, -8 * SIZE(Y) + movaps %xmm1, -8 * SIZE(X) + + movaps -4 * SIZE(X), %xmm0 + movaps -4 * SIZE(Y), %xmm1 + + movaps %xmm0, -4 * SIZE(Y) + movaps %xmm1, -4 * SIZE(X) + + subq $-32 * SIZE, Y + subq $-32 * SIZE, X + + decq %rax + jg .L11 + ALIGN_3 + +.L13: + testq $16, M + jle .L14 + + movaps -32 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -32 * SIZE(X) + + movaps -28 * SIZE(X), %xmm0 + movaps -28 * SIZE(Y), %xmm1 + + movaps %xmm0, -28 * SIZE(Y) + movaps %xmm1, -28 * SIZE(X) + + movaps -24 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movaps %xmm0, -24 * SIZE(Y) + movaps %xmm1, -24 * SIZE(X) + + movaps -20 * SIZE(X), %xmm0 + movaps -20 * SIZE(Y), %xmm1 + + movaps %xmm0, -20 * SIZE(Y) + movaps %xmm1, -20 * SIZE(X) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L14: + testq $8, M + jle .L15 + + movaps -32 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -32 * SIZE(X) + + movaps -28 * SIZE(X), %xmm0 + movaps -28 * SIZE(Y), %xmm1 + + movaps %xmm0, -28 * SIZE(Y) + movaps %xmm1, -28 * SIZE(X) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L15: + testq $4, M + jle .L16 + + movaps -32 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -32 * SIZE(X) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L16: + testq $2, M + jle .L17 + + movsd -32 * SIZE(X), %xmm0 + movsd -32 * SIZE(Y), %xmm1 + + movlps %xmm1, -32 * SIZE(X) + addq $2 * SIZE, X + movlps %xmm0, -32 * SIZE(Y) + addq $2 * SIZE, Y + ALIGN_3 + +.L17: + testq $1, M + jle .L19 + + movss -32 * SIZE(X), %xmm0 + movss -32 * SIZE(Y), %xmm1 + + movss %xmm1, -32 * SIZE(X) + movss %xmm0, -32 * SIZE(Y) + ALIGN_3 + +.L19: + xorq %rax,%rax + + RESTOREREGISTERS + +#ifdef WINDOWS_ABI + popq %rbx +#endif + + ret + ALIGN_3 + +.L20: + movaps -33 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + movss %xmm1, -32 * SIZE(X) + pshufd $0x39, %xmm1, %xmm3 + movlps %xmm3, -31 * SIZE(X) + + subq $3, M + + movq M, %rax + sarq $5, %rax + jle .L23 + ALIGN_4 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -29 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -29 * SIZE(X) + + movaps -25 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -25 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -21 * SIZE(X), %xmm2 + movaps -20 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -24 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -21 * SIZE(X) + + movaps -17 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -20 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -17 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -13 * SIZE(X), %xmm2 + movaps -12 * SIZE(Y), %xmm3 + + movss 
%xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -13 * SIZE(X) + + movaps -9 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -9 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -5 * SIZE(X), %xmm2 + movaps -4 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -8 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -5 * SIZE(X) + + movaps -1 * SIZE(X), %xmm0 + movaps 0 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -4 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -1 * SIZE(X) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + + decq %rax + jg .L21 + ALIGN_3 + +.L23: + testq $16, M + jle .L24 + + movaps -29 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -29 * SIZE(X) + + movaps -25 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -25 * SIZE(X) + + movaps -21 * SIZE(X), %xmm2 + movaps -20 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -24 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -21 * SIZE(X) + + movaps -17 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -20 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -17 * SIZE(X) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L24: + testq $8, M + jle .L25 + + movaps -29 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -29 * SIZE(X) + + movaps -25 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -25 * SIZE(X) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L25: + testq $4, M + jle .L26 + + movaps -29 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -29 * SIZE(X) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L26: + pshufd $0x39, %xmm0, %xmm2 + pshufd $0xff, %xmm0, %xmm0 + + movlps %xmm2, -32 * SIZE(Y) + movss %xmm0, -30 * SIZE(Y) + + testq $2, M + jle .L27 + + movsd -29 * SIZE(X), %xmm0 + movsd -29 * SIZE(Y), %xmm1 + + movlps %xmm0, -29 * SIZE(Y) + movlps %xmm1, -29 * SIZE(X) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L27: + testq $1, M + jle .L29 + + movss -29 * SIZE(X), %xmm0 + movss -29 * SIZE(Y), %xmm1 + + movss %xmm0, -29 * SIZE(Y) + movss %xmm1, -29 * SIZE(X) + ALIGN_3 + +.L29: + xorq %rax,%rax + + RESTOREREGISTERS + +#ifdef WINDOWS_ABI + popq %rbx +#endif + + ret + ALIGN_3 + +.L30: + testq $1 * SIZE, X + jne .L40 + + movhps -32 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + 
movlps %xmm1, -32 * SIZE(X) + subq $2, M + + movq M, %rax + sarq $5, %rax + jle .L33 + ALIGN_4 + +.L31: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -30 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -30 * SIZE(X) + + movaps -26 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -26 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -22 * SIZE(X), %xmm2 + movaps -20 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -24 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -22 * SIZE(X) + + movaps -18 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -20 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -18 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -14 * SIZE(X), %xmm2 + movaps -12 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -14 * SIZE(X) + + movaps -10 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -10 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -6 * SIZE(X), %xmm2 + movaps -4 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -8 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -6 * SIZE(X) + + movaps -2 * SIZE(X), %xmm0 + movaps 0 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -4 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -2 * SIZE(X) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + + decq %rax + jg .L31 + ALIGN_3 + +.L33: + testq $16, M + jle .L34 + + movaps -30 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -30 * SIZE(X) + + movaps -26 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -26 * SIZE(X) + + movaps -22 * SIZE(X), %xmm2 + movaps -20 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -24 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -22 * SIZE(X) + + movaps -18 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -20 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -18 * SIZE(X) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L34: + testq $8, M + jle .L35 + + movaps -30 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -30 * SIZE(X) + + movaps -26 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -26 * SIZE(X) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L35: + testq $4, M + jle .L36 + + movaps -30 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -30 * SIZE(X) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L36: + movhps %xmm0, -32 * SIZE(Y) + + testq $2, M + jle .L37 + + movsd -30 * SIZE(X), %xmm0 + movsd -30 * SIZE(Y), %xmm1 + + movlps %xmm0, -30 * SIZE(Y) + movlps %xmm1, -30 * SIZE(X) + + addq 
$2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L37: + testq $1, M + jle .L39 + + movss -30 * SIZE(X), %xmm0 + movss -30 * SIZE(Y), %xmm1 + + movss %xmm0, -30 * SIZE(Y) + movss %xmm1, -30 * SIZE(X) + ALIGN_3 + +.L39: + xorq %rax,%rax + + RESTOREREGISTERS + +#ifdef WINDOWS_ABI + popq %rbx +#endif + + ret + ALIGN_3 + +.L40: + movaps -35 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + movss %xmm1, -32 * SIZE(X) + + subq $3, M + + movq M, %rax + sarq $5, %rax + jle .L43 + ALIGN_4 + +.L41: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -31 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -31 * SIZE(X) + + movaps -27 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -27 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -23 * SIZE(X), %xmm2 + movaps -20 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -24 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -23 * SIZE(X) + + movaps -19 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -20 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -19 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -15 * SIZE(X), %xmm2 + movaps -12 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -15 * SIZE(X) + + movaps -11 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -11 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -7 * SIZE(X), %xmm2 + movaps -4 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -8 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -7 * SIZE(X) + + movaps -3 * SIZE(X), %xmm0 + movaps 0 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -4 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -3 * SIZE(X) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + + decq %rax + jg .L41 + ALIGN_3 + +.L43: + testq $16, M + jle .L44 + + movaps -31 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -31 * SIZE(X) + + movaps -27 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -27 * SIZE(X) + + movaps -23 * SIZE(X), %xmm2 + movaps -20 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -24 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -23 * SIZE(X) + + movaps -19 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -20 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + 
movaps %xmm3, -19 * SIZE(X) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L44: + testq $8, M + jle .L45 + + movaps -31 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -31 * SIZE(X) + + movaps -27 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -27 * SIZE(X) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L45: + testq $4, M + jle .L46 + + movaps -31 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -31 * SIZE(X) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L46: + movsd -31 * SIZE(X), %xmm2 + + pshufd $0x39, %xmm1, %xmm1 + movlps %xmm1, -31 * SIZE(X) + + pshufd $0xff, %xmm0, %xmm0 + + movss %xmm0, -32 * SIZE(Y) + movlps %xmm2, -31 * SIZE(Y) + + addq $3 * SIZE, X + addq $3 * SIZE, Y + + testq $2, M + jle .L47 + + movsd -32 * SIZE(X), %xmm0 + movsd -32 * SIZE(Y), %xmm1 + + movlps %xmm0, -32 * SIZE(Y) + movlps %xmm1, -32 * SIZE(X) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L47: + testq $1, M + jle .L49 + + movss -32 * SIZE(X), %xmm0 + movss -32 * SIZE(Y), %xmm1 + + movss %xmm0, -32 * SIZE(Y) + movss %xmm1, -32 * SIZE(X) + ALIGN_3 + +.L49: + xorq %rax,%rax + + RESTOREREGISTERS + +#ifdef WINDOWS_ABI + popq %rbx +#endif + + ret + ALIGN_3 + +.L50: + movq M, %rax + sarq $3, %rax + jle .L55 + ALIGN_3 + +.L51: + movss (X), %xmm0 + movss (Y), %xmm1 + + movss %xmm1, (X) + addq INCX, X + movss %xmm0, (Y) + addq INCY, Y + + movss (X), %xmm0 + movss (Y), %xmm1 + + movss %xmm1, (X) + addq INCX, X + movss %xmm0, (Y) + addq INCY, Y + + movss (X), %xmm0 + movss (Y), %xmm1 + + movss %xmm1, (X) + addq INCX, X + movss %xmm0, (Y) + addq INCY, Y + + movss (X), %xmm0 + movss (Y), %xmm1 + + movss %xmm1, (X) + addq INCX, X + movss %xmm0, (Y) + addq INCY, Y + + movss (X), %xmm0 + movss (Y), %xmm1 + + movss %xmm1, (X) + addq INCX, X + movss %xmm0, (Y) + addq INCY, Y + + movss (X), %xmm0 + movss (Y), %xmm1 + + movss %xmm1, (X) + addq INCX, X + movss %xmm0, (Y) + addq INCY, Y + + movss (X), %xmm0 + movss (Y), %xmm1 + + movss %xmm1, (X) + addq INCX, X + movss %xmm0, (Y) + addq INCY, Y + + movss (X), %xmm0 + movss (Y), %xmm1 + + movss %xmm1, (X) + addq INCX, X + movss %xmm0, (Y) + addq INCY, Y + + decq %rax + jg .L51 + ALIGN_3 + +.L55: + movq M, %rax + andq $7, %rax + jle .L57 + ALIGN_3 + +.L56: + movss (X), %xmm0 + movss (Y), %xmm1 + + movss %xmm1, (X) + movss %xmm0, (Y) + + addq INCX, X + addq INCY, Y + decq %rax + jg .L56 + ALIGN_3 + +.L57: + xorq %rax, %rax + + RESTOREREGISTERS + +#ifdef WINDOWS_ABI + popq %rbx +#endif + + ret + + EPILOGUE diff --git a/kernel/x86_64/swap_sse2.S b/kernel/x86_64/swap_sse2.S new file mode 100644 index 0000000000..5f164197d5 --- /dev/null +++ b/kernel/x86_64/swap_sse2.S @@ -0,0 +1,585 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef WINDOWS_ABI +#define M ARG1 +#define X ARG4 +#define INCX ARG5 +#define Y ARG6 +#define INCY ARG2 +#else +#define M ARG1 +#define X ARG2 +#define INCX ARG3 +#define Y ARG4 +#define INCY %rbx +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifndef WINDOWS_ABI + movq 8(%rsp), INCY +#else + pushq %rbx + + movq 48(%rsp), X + movq 56(%rsp), INCX + movq 64(%rsp), Y + movq 72(%rsp), INCY +#endif + + SAVEREGISTERS + + leaq (, INCX, SIZE), INCX + leaq (, INCY, SIZE), INCY + + cmpq $SIZE, INCX + jne .L40 + cmpq $SIZE, INCY + jne .L40 + + testq $SIZE, Y + je .L10 + + movsd 0 * SIZE(X), %xmm0 + movsd 0 * SIZE(Y), %xmm8 + + movsd %xmm8, 0 * SIZE(X) + movsd %xmm0, 0 * SIZE(Y) + + addq $1 * SIZE, X + addq $1 * SIZE, Y + decq M + jle .L19 + ALIGN_4 + +.L10: + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + + testq $SIZE, X + jne .L20 + + movq M, %rax + sarq $4, %rax + jle .L13 + ALIGN_3 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + movaps -14 * SIZE(X), %xmm0 + movaps -14 * SIZE(Y), %xmm1 + + movaps %xmm0, -14 * SIZE(Y) + movaps %xmm1, -14 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -12 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + movaps %xmm0, -12 * SIZE(Y) + movaps %xmm1, -12 * SIZE(X) + + movaps -10 * SIZE(X), %xmm0 + movaps -10 * SIZE(Y), %xmm1 + + movaps %xmm0, -10 * SIZE(Y) + movaps %xmm1, -10 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -8 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + movaps %xmm0, -8 * SIZE(Y) + movaps %xmm1, -8 * SIZE(X) + + movaps -6 * SIZE(X), %xmm0 + movaps -6 * SIZE(Y), %xmm1 + + movaps %xmm0, -6 * SIZE(Y) + movaps %xmm1, -6 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - 
PREOFFSET(Y) +#endif + + movaps -4 * SIZE(X), %xmm0 + movaps -4 * SIZE(Y), %xmm1 + + movaps %xmm0, -4 * SIZE(Y) + movaps %xmm1, -4 * SIZE(X) + + movaps -2 * SIZE(X), %xmm0 + movaps -2 * SIZE(Y), %xmm1 + + movaps %xmm0, -2 * SIZE(Y) + movaps %xmm1, -2 * SIZE(X) + + subq $-16 * SIZE, Y + subq $-16 * SIZE, X + + decq %rax + jg .L11 + ALIGN_3 + +.L13: + testq $8, M + jle .L14 + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + movaps -14 * SIZE(X), %xmm0 + movaps -14 * SIZE(Y), %xmm1 + + movaps %xmm0, -14 * SIZE(Y) + movaps %xmm1, -14 * SIZE(X) + + movaps -12 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + movaps %xmm0, -12 * SIZE(Y) + movaps %xmm1, -12 * SIZE(X) + + movaps -10 * SIZE(X), %xmm0 + movaps -10 * SIZE(Y), %xmm1 + + movaps %xmm0, -10 * SIZE(Y) + movaps %xmm1, -10 * SIZE(X) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L14: + testq $4, M + jle .L15 + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + movaps -14 * SIZE(X), %xmm0 + movaps -14 * SIZE(Y), %xmm1 + + movaps %xmm0, -14 * SIZE(Y) + movaps %xmm1, -14 * SIZE(X) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L15: + testq $2, M + jle .L16 + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L16: + testq $1, M + jle .L19 + + movsd -16 * SIZE(X), %xmm0 + movsd -16 * SIZE(Y), %xmm1 + + movlps %xmm1, -16 * SIZE(X) + movlps %xmm0, -16 * SIZE(Y) + ALIGN_3 + +.L19: + xorq %rax,%rax + + RESTOREREGISTERS + +#ifdef WINDOWS_ABI + popq %rbx +#endif + + ret + ALIGN_3 + +.L20: + movhps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movlps %xmm1, -16 * SIZE(X) + decq M + jle .L29 + + movq M, %rax + sarq $4, %rax + jle .L23 + ALIGN_4 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -15 * SIZE(X), %xmm2 + movaps -14 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -15 * SIZE(X) + + movaps -13 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -14 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -13 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -11 * SIZE(X), %xmm2 + movaps -10 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -12 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -11 * SIZE(X) + + movaps -9 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -10 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -9 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -7 * SIZE(X), %xmm2 + movaps -6 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -8 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -7 * SIZE(X) + + movaps -5 * SIZE(X), %xmm0 + movaps -4 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -6 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -5 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -3 * SIZE(X), %xmm2 + movaps -2 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -4 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -3 * SIZE(X) + + movaps -1 * SIZE(X), %xmm0 + movaps 0 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -2 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + 
movaps %xmm3, -1 * SIZE(X) + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + + decq %rax + jg .L21 + ALIGN_3 + +.L23: + testq $8, M + jle .L24 + + movaps -15 * SIZE(X), %xmm2 + movaps -14 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -15 * SIZE(X) + + movaps -13 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -14 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -13 * SIZE(X) + + movaps -11 * SIZE(X), %xmm2 + movaps -10 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -12 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -11 * SIZE(X) + + movaps -9 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -10 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -9 * SIZE(X) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L24: + testq $4, M + jle .L25 + + movaps -15 * SIZE(X), %xmm2 + movaps -14 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -15 * SIZE(X) + + movaps -13 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -14 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -13 * SIZE(X) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L25: + testq $2, M + jle .L26 + + movaps -15 * SIZE(X), %xmm2 + movaps -14 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -15 * SIZE(X) + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L26: + testq $1, M + jle .L29 + + movhps %xmm0, -16 * SIZE(Y) + movhps -15 * SIZE(X), %xmm0 + movhps %xmm1, -15 * SIZE(X) + + addq $SIZE, X + addq $SIZE, Y + ALIGN_3 + +.L29: + movhps %xmm0, -16 * SIZE(Y) + + xorq %rax,%rax + + RESTOREREGISTERS + +#ifdef WINDOWS_ABI + popq %rbx +#endif + + ret + ALIGN_3 + +.L40: + movq M, %rax + sarq $3, %rax + jle .L45 + ALIGN_3 + +.L41: + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movsd %xmm1, (X) + addq INCX, X + movsd %xmm0, (Y) + addq INCY, Y + + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movsd %xmm1, (X) + addq INCX, X + movsd %xmm0, (Y) + addq INCY, Y + + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movsd %xmm1, (X) + addq INCX, X + movsd %xmm0, (Y) + addq INCY, Y + + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movsd %xmm1, (X) + addq INCX, X + movsd %xmm0, (Y) + addq INCY, Y + + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movsd %xmm1, (X) + addq INCX, X + movsd %xmm0, (Y) + addq INCY, Y + + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movsd %xmm1, (X) + addq INCX, X + movsd %xmm0, (Y) + addq INCY, Y + + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movsd %xmm1, (X) + addq INCX, X + movsd %xmm0, (Y) + addq INCY, Y + + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movsd %xmm1, (X) + addq INCX, X + movsd %xmm0, (Y) + addq INCY, Y + + decq %rax + jg .L41 + ALIGN_3 + +.L45: + movq M, %rax + andq $7, %rax + jle .L47 + ALIGN_3 + +.L46: + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movsd %xmm1, (X) + movsd %xmm0, (Y) + + addq INCX, X + addq INCY, Y + decq %rax + jg .L46 + ALIGN_3 + +.L47: + xorq %rax, %rax + + RESTOREREGISTERS + +#ifdef WINDOWS_ABI + popq %rbx +#endif + + ret + + EPILOGUE diff --git a/kernel/x86_64/symv_L_sse.S b/kernel/x86_64/symv_L_sse.S new file mode 100644 index 0000000000..901a5ad317 --- /dev/null +++ b/kernel/x86_64/symv_L_sse.S @@ -0,0 +1,1029 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. 
*/ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef ATOM +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 12) +#endif + +#ifdef CORE2 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 12) +#endif + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 12) +#endif + +#ifdef NEHALEM +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 12) +#endif + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 20) +#endif + +#ifdef OPTERON +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 8) +#define movsd movlps +#endif + +#if defined(BARCELONA) || defined(SHANGHAI) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 16) +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 24) +#endif + +#ifdef GENERIC +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 20) +#endif + +#ifndef WINDOWS_ABI + +#define STACKSIZE 80 + +#define OLD_Y 8 + STACKSIZE(%rsp) +#define OLD_INCY 16 + STACKSIZE(%rsp) +#define OLD_BUFFER 24 + STACKSIZE(%rsp) + +#define M ARG1 +#define N ARG2 +#define A ARG3 +#define LDA ARG4 +#define X ARG5 +#define INCX ARG6 + +#else + +#define STACKSIZE 256 + +#define OLD_LDA 40 + STACKSIZE(%rsp) +#define OLD_X 48 + STACKSIZE(%rsp) +#define OLD_INCX 56 + STACKSIZE(%rsp) +#define OLD_Y 64 + STACKSIZE(%rsp) +#define OLD_INCY 72 + STACKSIZE(%rsp) +#define OLD_BUFFER 80 + STACKSIZE(%rsp) + +#define M ARG1 +#define N ARG2 +#define A ARG4 
+#define LDA ARG3 +#define X %rdi +#define INCX %rsi +#endif + +#define Y %r10 +#define INCY %r11 +#define BUFFER %r12 + +#define TEMP %rax +#define I %rax +#define A1 %rbx +#define A2 %rbp +#define XX %r13 +#define YY %r14 +#define IS %r15 +#define NEW_X BUFFER +#define NEW_Y X + +#define ALPHA %xmm0 + +#define atemp1 %xmm0 +#define atemp2 %xmm1 +#define atemp3 %xmm2 +#define atemp4 %xmm3 + +#define xsum1 %xmm4 +#define xsum2 %xmm5 +#define xsum3 %xmm6 +#define xsum4 %xmm7 + +#define xtemp1 %xmm8 +#define xtemp2 %xmm9 +#define yy1 %xmm10 +#define xt1 %xmm11 + +#define a1 %xmm12 +#define a2 %xmm13 +#define a3 %xmm14 +#define a4 %xmm15 + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq OLD_LDA, LDA + movq OLD_X, X + movq OLD_INCX, INCX + + movaps %xmm2, %xmm0 +#endif + + movq OLD_Y, Y + movq OLD_INCY, INCY + movq OLD_BUFFER, BUFFER + + leaq (,INCX, SIZE), INCX + leaq (,INCY, SIZE), INCY + leaq (,LDA, SIZE), LDA + + testq M, M + jle .L999 + + shufps $0, ALPHA, ALPHA + + movq BUFFER, XX + + movq M, %rax + sarq $3, %rax + jle .L02 + ALIGN_3 + +.L01: + movss 0 * SIZE(X), %xmm1 + addq INCX, X + movss 0 * SIZE(X), %xmm2 + addq INCX, X + movss 0 * SIZE(X), %xmm3 + addq INCX, X + movss 0 * SIZE(X), %xmm4 + addq INCX, X + movss 0 * SIZE(X), %xmm5 + addq INCX, X + movss 0 * SIZE(X), %xmm6 + addq INCX, X + movss 0 * SIZE(X), %xmm7 + addq INCX, X + movss 0 * SIZE(X), %xmm8 + addq INCX, X + + mulss ALPHA, %xmm1 + mulss ALPHA, %xmm2 + mulss ALPHA, %xmm3 + mulss ALPHA, %xmm4 + mulss ALPHA, %xmm5 + mulss ALPHA, %xmm6 + mulss ALPHA, %xmm7 + mulss ALPHA, %xmm8 + + movss %xmm1, 0 * SIZE(XX) + movss %xmm2, 1 * SIZE(XX) + movss %xmm3, 2 * SIZE(XX) + movss %xmm4, 3 * SIZE(XX) + movss %xmm5, 4 * SIZE(XX) + movss %xmm6, 5 * SIZE(XX) + movss %xmm7, 6 * SIZE(XX) + movss %xmm8, 7 * SIZE(XX) + + addq $8 * SIZE, XX + decq %rax + jg .L01 + ALIGN_3 + +.L02: + movq M, %rax + andq $7, %rax + jle .L05 + ALIGN_3 + +.L03: + movss 0 * SIZE(X), %xmm1 + addq INCX, X + + mulss ALPHA, %xmm1 + + movss %xmm1, 0 * SIZE(XX) + + addq $1 * SIZE, XX + decq %rax + jg .L03 + ALIGN_3 + +.L05: + /* now we don't need original X */ + movq Y, NEW_Y + + addq $512, XX + andq $-512, XX + + cmpq $SIZE, INCY + je .L10 + + movq Y, YY + movq XX, NEW_Y + + movq M, %rax + sarq $3, %rax + jle .L07 + ALIGN_3 + +.L06: + movss 0 * SIZE(YY), %xmm0 + addq INCY, YY + movss 0 * SIZE(YY), %xmm1 + addq INCY, YY + movss 0 * SIZE(YY), %xmm2 + addq INCY, YY + movss 0 * SIZE(YY), %xmm3 + addq INCY, YY + movss 0 * SIZE(YY), %xmm4 + addq INCY, YY + movss 0 * SIZE(YY), %xmm5 + addq INCY, YY + movss 0 * SIZE(YY), %xmm6 + addq INCY, YY + movss 0 * SIZE(YY), %xmm7 + addq INCY, YY + + movss %xmm0, 0 * SIZE(XX) + movss %xmm1, 1 * SIZE(XX) + movss %xmm2, 2 * SIZE(XX) + movss %xmm3, 3 * SIZE(XX) + movss %xmm4, 4 * SIZE(XX) + movss %xmm5, 5 * SIZE(XX) + movss %xmm6, 6 * SIZE(XX) + movss %xmm7, 7 * SIZE(XX) + + addq $8 * SIZE, XX + decq %rax + jg .L06 + ALIGN_3 + +.L07: + movq M, %rax + andq $7, %rax + jle .L10 + ALIGN_3 + +.L08: + movss 0 * SIZE(YY), %xmm0 + addq INCY, YY + + movss %xmm0, 0 * SIZE(XX) + + addq $1 * 
SIZE, XX + decq %rax + jg .L08 + ALIGN_3 + +.L10: + xorq IS, IS # is = 0 + + cmpq $4, N + jl .L20 + ALIGN_3 + +.L11: + movq A, A1 + leaq (A, LDA, 2), A2 + leaq 4 * SIZE(A, LDA, 4), A + + leaq (NEW_X, IS, SIZE), XX + leaq 4 * SIZE(NEW_Y, IS, SIZE), YY + + movaps 0 * SIZE(XX), atemp4 + + movsd 0 * SIZE(A1), xsum1 + movhps 2 * SIZE(A1), xsum1 + mulps atemp4, xsum1 + + movss 1 * SIZE(A1), xsum2 + movss 1 * SIZE(A1, LDA, 1), a2 + movss 2 * SIZE(A1, LDA, 1), a3 + movss 3 * SIZE(A1, LDA, 1), a4 + unpcklps a3, xsum2 + unpcklps a4, a2 + unpcklps a2, xsum2 + mulps atemp4, xsum2 + + movss 2 * SIZE(A1), xsum3 + movss 2 * SIZE(A1, LDA, 1), a2 + movss 2 * SIZE(A2), a3 + movss 3 * SIZE(A2), a4 + unpcklps a3, xsum3 + unpcklps a4, a2 + unpcklps a2, xsum3 + mulps atemp4, xsum3 + + movss 3 * SIZE(A1), xsum4 + movss 3 * SIZE(A1, LDA, 1), a2 + movss 3 * SIZE(A2), a3 + movss 3 * SIZE(A2, LDA, 1), a4 + unpcklps a3, xsum4 + unpcklps a4, a2 + unpcklps a2, xsum4 + mulps atemp4, xsum4 + + pshufd $0x00, atemp4, atemp1 + pshufd $0x55, atemp4, atemp2 + pshufd $0xaa, atemp4, atemp3 + pshufd $0xff, atemp4, atemp4 + + movaps 4 * SIZE(XX), xtemp1 + movaps 8 * SIZE(XX), xtemp2 + + movsd 0 * SIZE(YY), yy1 + movhps 2 * SIZE(YY), yy1 + + movsd 4 * SIZE(A1), a1 + movhps 6 * SIZE(A1), a1 + movsd 4 * SIZE(A1, LDA, 1), a2 + movhps 6 * SIZE(A1, LDA, 1), a2 + movsd 4 * SIZE(A2), a3 + movhps 6 * SIZE(A2), a3 + movsd 4 * SIZE(A2, LDA, 1), a4 + movhps 6 * SIZE(A2, LDA, 1), a4 + + addq $4 * SIZE, XX + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + + movq M, I + subq IS, I + subq $4, I + sarq $4, I + jle .L14 + ALIGN_3 + +.L12: + movaps xtemp1, xt1 + mulps a1, xt1 + mulps atemp1, a1 + addps xt1, xsum1 + addps a1, yy1 + movsd 4 * SIZE(A1), a1 + movhps 6 * SIZE(A1), a1 + + PREFETCH PREFETCHSIZE(A1) + + movaps xtemp1, xt1 + mulps a2, xt1 + mulps atemp2, a2 + addps xt1, xsum2 + addps a2, yy1 + movsd 4 * SIZE(A1, LDA, 1), a2 + movhps 6 * SIZE(A1, LDA, 1), a2 + + movaps xtemp1, xt1 + mulps a3, xt1 + mulps atemp3, a3 + addps xt1, xsum3 + addps a3, yy1 + movsd 4 * SIZE(A2), a3 + movhps 6 * SIZE(A2), a3 + +#if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON) + PREFETCH PREFETCHSIZE(XX) +#endif + + movaps xtemp1, xt1 + movaps 8 * SIZE(XX), xtemp1 + mulps a4, xt1 + mulps atemp4, a4 + addps xt1, xsum4 + addps a4, yy1 + movsd 4 * SIZE(A2, LDA, 1), a4 + movhps 6 * SIZE(A2, LDA, 1), a4 + + movlps yy1, 0 * SIZE(YY) + movhps yy1, 2 * SIZE(YY) + movsd 4 * SIZE(YY), yy1 + movhps 6 * SIZE(YY), yy1 + + movaps xtemp2, xt1 + mulps a1, xt1 + mulps atemp1, a1 + addps xt1, xsum1 + addps a1, yy1 + movsd 8 * SIZE(A1), a1 + movhps 10 * SIZE(A1), a1 + + PREFETCH PREFETCHSIZE(A1, LDA, 1) + + movaps xtemp2, xt1 + mulps a2, xt1 + mulps atemp2, a2 + addps xt1, xsum2 + addps a2, yy1 + movsd 8 * SIZE(A1, LDA, 1), a2 + movhps 10 * SIZE(A1, LDA, 1), a2 + + movaps xtemp2, xt1 + mulps a3, xt1 + mulps atemp3, a3 + addps xt1, xsum3 + addps a3, yy1 + movsd 8 * SIZE(A2), a3 + movhps 10 * SIZE(A2), a3 + + movaps xtemp2, xt1 + movaps 12 * SIZE(XX), xtemp2 + mulps a4, xt1 + mulps atemp4, a4 + addps xt1, xsum4 + addps a4, yy1 + movsd 8 * SIZE(A2, LDA, 1), a4 + movhps 10 * SIZE(A2, LDA, 1), a4 + + movlps yy1, 4 * SIZE(YY) + movhps yy1, 6 * SIZE(YY) + movsd 8 * SIZE(YY), yy1 + movhps 10 * SIZE(YY), yy1 + + + movaps xtemp1, xt1 + mulps a1, xt1 + mulps atemp1, a1 + addps xt1, xsum1 + addps a1, yy1 + movsd 12 * SIZE(A1), a1 + movhps 14 * SIZE(A1), a1 + + PREFETCH PREFETCHSIZE(A2) + + movaps xtemp1, xt1 + mulps a2, xt1 + mulps atemp2, a2 + addps xt1, xsum2 + addps a2, yy1 + movsd 12 * 
SIZE(A1, LDA, 1), a2 + movhps 14 * SIZE(A1, LDA, 1), a2 + + movaps xtemp1, xt1 + mulps a3, xt1 + mulps atemp3, a3 + addps xt1, xsum3 + addps a3, yy1 + movsd 12 * SIZE(A2), a3 + movhps 14 * SIZE(A2), a3 + +#if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON) + PREFETCHW PREFETCHSIZE(YY) +#endif + + movaps xtemp1, xt1 + movaps 16 * SIZE(XX), xtemp1 + mulps a4, xt1 + mulps atemp4, a4 + addps xt1, xsum4 + addps a4, yy1 + movsd 12 * SIZE(A2, LDA, 1), a4 + movhps 14 * SIZE(A2, LDA, 1), a4 + + movlps yy1, 8 * SIZE(YY) + movhps yy1, 10 * SIZE(YY) + movsd 12 * SIZE(YY), yy1 + movhps 14 * SIZE(YY), yy1 + + movaps xtemp2, xt1 + mulps a1, xt1 + mulps atemp1, a1 + addps xt1, xsum1 + addps a1, yy1 + movsd 16 * SIZE(A1), a1 + movhps 18 * SIZE(A1), a1 + + PREFETCH PREFETCHSIZE(A2, LDA, 1) + + movaps xtemp2, xt1 + mulps a2, xt1 + mulps atemp2, a2 + addps xt1, xsum2 + addps a2, yy1 + movsd 16 * SIZE(A1, LDA, 1), a2 + movhps 18 * SIZE(A1, LDA, 1), a2 + + movaps xtemp2, xt1 + mulps a3, xt1 + mulps atemp3, a3 + addps xt1, xsum3 + addps a3, yy1 + movsd 16 * SIZE(A2), a3 + movhps 18 * SIZE(A2), a3 + + movaps xtemp2, xt1 + movaps 20 * SIZE(XX), xtemp2 + mulps a4, xt1 + mulps atemp4, a4 + addps xt1, xsum4 + addps a4, yy1 + movsd 16 * SIZE(A2, LDA, 1), a4 + movhps 18 * SIZE(A2, LDA, 1), a4 + + movlps yy1, 12 * SIZE(YY) + movhps yy1, 14 * SIZE(YY) + movsd 16 * SIZE(YY), yy1 + movhps 18 * SIZE(YY), yy1 + + addq $16 * SIZE, XX + addq $16 * SIZE, YY + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + + decq I + jg .L12 + ALIGN_3 + +.L14: + movq M, I + subq IS, I + subq $4, I + test $8, I + jle .L15 + + movaps xtemp1, xt1 + mulps a1, xt1 + mulps atemp1, a1 + addps xt1, xsum1 + addps a1, yy1 + movsd 4 * SIZE(A1), a1 + movhps 6 * SIZE(A1), a1 + + movaps xtemp1, xt1 + mulps a2, xt1 + mulps atemp2, a2 + addps xt1, xsum2 + addps a2, yy1 + movsd 4 * SIZE(A1, LDA, 1), a2 + movhps 6 * SIZE(A1, LDA, 1), a2 + + movaps xtemp1, xt1 + mulps a3, xt1 + mulps atemp3, a3 + addps xt1, xsum3 + addps a3, yy1 + movsd 4 * SIZE(A2), a3 + movhps 6 * SIZE(A2), a3 + + movaps xtemp1, xt1 + movaps 8 * SIZE(XX), xtemp1 + mulps a4, xt1 + mulps atemp4, a4 + addps xt1, xsum4 + addps a4, yy1 + movsd 4 * SIZE(A2, LDA, 1), a4 + movhps 6 * SIZE(A2, LDA, 1), a4 + + movlps yy1, 0 * SIZE(YY) + movhps yy1, 2 * SIZE(YY) + movsd 4 * SIZE(YY), yy1 + movhps 6 * SIZE(YY), yy1 + + movaps xtemp2, xt1 + mulps a1, xt1 + mulps atemp1, a1 + addps xt1, xsum1 + addps a1, yy1 + movsd 8 * SIZE(A1), a1 + movhps 10 * SIZE(A1), a1 + + movaps xtemp2, xt1 + mulps a2, xt1 + mulps atemp2, a2 + addps xt1, xsum2 + addps a2, yy1 + movsd 8 * SIZE(A1, LDA, 1), a2 + movhps 10 * SIZE(A1, LDA, 1), a2 + + movaps xtemp2, xt1 + mulps a3, xt1 + mulps atemp3, a3 + addps xt1, xsum3 + addps a3, yy1 + movsd 8 * SIZE(A2), a3 + movhps 10 * SIZE(A2), a3 + + movaps xtemp2, xt1 + movaps 12 * SIZE(XX), xtemp2 + mulps a4, xt1 + mulps atemp4, a4 + addps xt1, xsum4 + addps a4, yy1 + movsd 8 * SIZE(A2, LDA, 1), a4 + movhps 10 * SIZE(A2, LDA, 1), a4 + + movlps yy1, 4 * SIZE(YY) + movhps yy1, 6 * SIZE(YY) + movsd 8 * SIZE(YY), yy1 + movhps 10 * SIZE(YY), yy1 + + addq $8 * SIZE, XX + addq $8 * SIZE, YY + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + ALIGN_3 + +.L15: + test $4, I + jle .L17 + + movaps xtemp1, xt1 + mulps a1, xt1 + mulps atemp1, a1 + addps xt1, xsum1 + addps a1, yy1 + movsd 4 * SIZE(A1), a1 + + movaps xtemp1, xt1 + mulps a2, xt1 + mulps atemp2, a2 + addps xt1, xsum2 + addps a2, yy1 + movsd 4 * SIZE(A1, LDA, 1), a2 + + movaps xtemp1, xt1 + mulps a3, xt1 + mulps atemp3, a3 + addps xt1, xsum3 + 
addps a3, yy1 + movsd 4 * SIZE(A2), a3 + + movaps xtemp1, xt1 + movsd 4 * SIZE(XX), xtemp1 + mulps a4, xt1 + mulps atemp4, a4 + addps xt1, xsum4 + addps a4, yy1 + movsd 4 * SIZE(A2, LDA, 1), a4 + + movlps yy1, 0 * SIZE(YY) + movhps yy1, 2 * SIZE(YY) + movsd 4 * SIZE(YY), yy1 + + addq $4 * SIZE, XX + addq $4 * SIZE, YY + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + ALIGN_3 + +.L17: + testq $2, M + jle .L18 + + pxor xtemp2, xtemp2 + + movlhps xtemp2, a1 + movaps xtemp1, xt1 + mulps a1, xt1 + mulps atemp1, a1 + addps xt1, xsum1 + addps a1, yy1 + movss 2 * SIZE(A1), a1 + + movlhps xtemp2, a2 + movaps xtemp1, xt1 + mulps a2, xt1 + mulps atemp2, a2 + addps xt1, xsum2 + addps a2, yy1 + movss 2 * SIZE(A1, LDA, 1), a2 + + movlhps xtemp2, a3 + movaps xtemp1, xt1 + mulps a3, xt1 + mulps atemp3, a3 + addps xt1, xsum3 + addps a3, yy1 + movss 2 * SIZE(A2), a3 + + movlhps xtemp2, a4 + movaps xtemp1, xt1 + movss 2 * SIZE(XX), xtemp1 + mulps a4, xt1 + mulps atemp4, a4 + addps xt1, xsum4 + addps a4, yy1 + movss 2 * SIZE(A2, LDA, 1), a4 + + movlps yy1, 0 * SIZE(YY) + movss 2 * SIZE(YY), yy1 + + addq $2 * SIZE, XX + addq $2 * SIZE, YY + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + ALIGN_3 + +.L18: + testq $1, M + jle .L19 + + movss 0 * SIZE(XX), xtemp1 + + movss 0 * SIZE(YY), yy1 + + movss 0 * SIZE(A1), a1 + movss 0 * SIZE(A1, LDA, 1), a2 + movss 0 * SIZE(A2), a3 + movss 0 * SIZE(A2, LDA, 1), a4 + + movaps xtemp1, xt1 + mulss a1, xt1 + mulss atemp1, a1 + addss xt1, xsum1 + addss a1, yy1 + + movaps xtemp1, xt1 + mulss a2, xt1 + mulss atemp2, a2 + addss xt1, xsum2 + addss a2, yy1 + + movaps xtemp1, xt1 + mulss a3, xt1 + mulss atemp3, a3 + addss xt1, xsum3 + addss a3, yy1 + + movaps xtemp1, xt1 + mulss a4, xt1 + mulss atemp4, a4 + addss xt1, xsum4 + addss a4, yy1 + + movss yy1, 0 * SIZE(YY) + ALIGN_3 + +.L19: +#ifndef HAVE_SSE3 + movaps xsum1, xtemp1 + unpcklps xsum3, xsum1 + unpckhps xsum3, xtemp1 + + movaps xsum2, xtemp2 + unpcklps xsum4, xsum2 + unpckhps xsum4, xtemp2 + + movaps xsum1, xsum3 + unpcklps xsum2, xsum1 + unpckhps xsum2, xsum3 + + movaps xtemp1, xsum4 + unpcklps xtemp2, xtemp1 + unpckhps xtemp2, xsum4 + + addps xsum3, xsum1 + addps xtemp1, xsum4 + addps xsum4, xsum1 +#else + haddps xsum2, xsum1 + haddps xsum4, xsum3 + + haddps xsum3, xsum1 +#endif + + movsd 0 * SIZE(NEW_Y, IS, SIZE), yy1 + movhps 2 * SIZE(NEW_Y, IS, SIZE), yy1 + + addps xsum1, yy1 + + movsd yy1, 0 * SIZE(NEW_Y, IS, SIZE) + movhps yy1, 2 * SIZE(NEW_Y, IS, SIZE) + + addq $4, IS + + movq IS, I + addq $4, I + cmpq N, I + jle .L11 + ALIGN_3 + +.L20: + testq $2, N + jle .L30 + + movq A, A1 + leaq 2 * SIZE(A, LDA, 2), A + + movaps 0 * SIZE(NEW_X, IS, SIZE), atemp4 + +#if defined(OPTERON) + pxor xsum1, xsum1 +#endif + movsd 0 * SIZE(A1), xsum1 + mulps atemp4, xsum1 + + movss 1 * SIZE(A1), xsum2 + movss 1 * SIZE(A1, LDA, 1), a2 + unpcklps a2, xsum2 + mulps atemp4, xsum2 + + pshufd $0x00, atemp4, atemp1 + pshufd $0x55, atemp4, atemp2 + + testq $1, M + jle .L29 + + movss 2 * SIZE(A1), a1 + movss 2 * SIZE(A1, LDA, 1), a2 + movss 2 * SIZE(NEW_X, IS, SIZE), xtemp1 + movss 2 * SIZE(NEW_Y, IS, SIZE), yy1 + + movaps xtemp1, xt1 + mulss a1, xt1 + mulss atemp1, a1 + addss xt1, xsum1 + addps a1, yy1 + + movaps xtemp1, xt1 + mulss a2, xt1 + mulss atemp2, a2 + addss xt1, xsum2 + addss a2, yy1 + + movss yy1, 2 * SIZE(NEW_Y, IS, SIZE) + ALIGN_3 + +.L29: + +#ifndef HAVE_SSE3 + unpcklps xsum2, xsum1 + movhlps xsum1, xsum2 + addps xsum2, xsum1 +#else + haddps xsum2, xsum1 + haddps xsum1, xsum1 +#endif + + movsd 0 * SIZE(NEW_Y, IS, SIZE), yy1 + + addps xsum1, yy1 
+ + movlps yy1, 0 * SIZE(NEW_Y, IS, SIZE) + + addq $2, IS + ALIGN_3 + +.L30: + testq $1, N + jle .L990 + + movss 0 * SIZE(NEW_X, IS, SIZE), xsum1 + mulss 0 * SIZE(A), xsum1 + addss 0 * SIZE(NEW_Y, IS, SIZE), xsum1 + movss xsum1, 0 * SIZE(NEW_Y, IS, SIZE) + ALIGN_3 + +.L990: + cmpq $SIZE, INCY + je .L999 + + movq M, %rax + sarq $3, %rax + jle .L997 + ALIGN_3 + +.L996: + movss 0 * SIZE(NEW_Y), %xmm0 + movss 1 * SIZE(NEW_Y), %xmm1 + movss 2 * SIZE(NEW_Y), %xmm2 + movss 3 * SIZE(NEW_Y), %xmm3 + movss 4 * SIZE(NEW_Y), %xmm4 + movss 5 * SIZE(NEW_Y), %xmm5 + movss 6 * SIZE(NEW_Y), %xmm6 + movss 7 * SIZE(NEW_Y), %xmm7 + + movss %xmm0, 0 * SIZE(Y) + addq INCY, Y + movss %xmm1, 0 * SIZE(Y) + addq INCY, Y + movss %xmm2, 0 * SIZE(Y) + addq INCY, Y + movss %xmm3, 0 * SIZE(Y) + addq INCY, Y + movss %xmm4, 0 * SIZE(Y) + addq INCY, Y + movss %xmm5, 0 * SIZE(Y) + addq INCY, Y + movss %xmm6, 0 * SIZE(Y) + addq INCY, Y + movss %xmm7, 0 * SIZE(Y) + addq INCY, Y + + addq $8 * SIZE, NEW_Y + decq %rax + jg .L996 + ALIGN_3 + +.L997: + movq M, %rax + andq $7, %rax + jle .L999 + ALIGN_3 + +.L998: + movss 0 * SIZE(NEW_Y), %xmm0 + + movss %xmm0, 0 * SIZE(Y) + addq INCY, Y + + addq $1 * SIZE, NEW_Y + + decq %rax + jg .L998 + ALIGN_3 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + EPILOGUE diff --git a/kernel/x86_64/symv_L_sse2.S b/kernel/x86_64/symv_L_sse2.S new file mode 100644 index 0000000000..bfe7ebd690 --- /dev/null +++ b/kernel/x86_64/symv_L_sse2.S @@ -0,0 +1,978 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef ATOM +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 12) +#endif + +#ifdef CORE2 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 12) +#endif + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 12) +#endif + +#ifdef NEHALEM +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 12) +#endif + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 20) +#endif + +#ifdef OPTERON +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 8) +#define movsd movlpd +#endif + +#if defined(BARCELONA) || defined(SHANGHAI) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 16) +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (8 * 24) +#endif + +#ifdef GENERIC +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 20) +#endif + +#ifndef WINDOWS_ABI + +#define STACKSIZE 80 + +#define OLD_Y 8 + STACKSIZE(%rsp) +#define OLD_INCY 16 + STACKSIZE(%rsp) +#define OLD_BUFFER 24 + STACKSIZE(%rsp) + +#define M ARG1 +#define N ARG2 +#define A ARG3 +#define LDA ARG4 +#define X ARG5 +#define INCX ARG6 + +#else + +#define STACKSIZE 256 + +#define OLD_LDA 40 + STACKSIZE(%rsp) +#define OLD_X 48 + STACKSIZE(%rsp) +#define OLD_INCX 56 + STACKSIZE(%rsp) +#define OLD_Y 64 + STACKSIZE(%rsp) +#define OLD_INCY 72 + STACKSIZE(%rsp) +#define OLD_BUFFER 80 + STACKSIZE(%rsp) + +#define M ARG1 +#define N ARG2 +#define A ARG4 +#define LDA ARG3 +#define X %rdi +#define INCX %rsi + +#endif + +#define Y %r10 +#define INCY %r11 +#define BUFFER %r12 + +#define TEMP %rax +#define I %rax +#define A1 %rbx +#define A2 %rbp +#define XX %r13 +#define YY %r14 +#define IS %r15 +#define NEW_X BUFFER +#define NEW_Y X + +#define ALPHA %xmm0 + +#define xtemp1 %xmm0 +#define xtemp2 %xmm1 +#define yy1 %xmm2 +#define yy2 %xmm3 + +#define atemp1 %xmm4 +#define atemp2 %xmm5 +#define atemp3 %xmm6 +#define atemp4 %xmm7 + +#define xsum1 %xmm8 +#define xsum2 %xmm9 +#define xsum3 %xmm10 +#define xsum4 %xmm11 + +#define a1 %xmm12 +#define a2 %xmm13 +#define a3 %xmm14 +#define xt1 %xmm15 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq OLD_LDA, LDA + movq OLD_X, X + movq OLD_INCX, INCX + + movaps %xmm2, %xmm0 +#endif + + movq OLD_Y, Y + movq OLD_INCY, INCY + movq OLD_BUFFER, BUFFER + + leaq (,INCX, SIZE), INCX + leaq (,INCY, SIZE), INCY + leaq (,LDA, SIZE), LDA + + testq M, M + jle .L999 + + unpcklpd ALPHA, ALPHA + + movq BUFFER, XX + + movq 
M, %rax + sarq $3, %rax + jle .L02 + ALIGN_3 + +.L01: + movsd 0 * SIZE(X), %xmm1 + addq INCX, X + movhpd 0 * SIZE(X), %xmm1 + addq INCX, X + movsd 0 * SIZE(X), %xmm2 + addq INCX, X + movhpd 0 * SIZE(X), %xmm2 + addq INCX, X + movsd 0 * SIZE(X), %xmm3 + addq INCX, X + movhpd 0 * SIZE(X), %xmm3 + addq INCX, X + movsd 0 * SIZE(X), %xmm4 + addq INCX, X + movhpd 0 * SIZE(X), %xmm4 + addq INCX, X + + mulpd ALPHA, %xmm1 + mulpd ALPHA, %xmm2 + mulpd ALPHA, %xmm3 + mulpd ALPHA, %xmm4 + + movapd %xmm1, 0 * SIZE(XX) + movapd %xmm2, 2 * SIZE(XX) + movapd %xmm3, 4 * SIZE(XX) + movapd %xmm4, 6 * SIZE(XX) + + addq $8 * SIZE, XX + decq %rax + jg .L01 + ALIGN_3 + +.L02: + movq M, %rax + andq $7, %rax + jle .L05 + ALIGN_3 + +.L03: + movsd 0 * SIZE(X), %xmm1 + addq INCX, X + + mulsd ALPHA, %xmm1 + + movlpd %xmm1, 0 * SIZE(XX) + + addq $1 * SIZE, XX + decq %rax + jg .L03 + ALIGN_3 + +.L05: + /* now we don't need original X */ + movq Y, NEW_Y + + addq $512, XX + andq $-512, XX + + cmpq $SIZE, INCY + je .L10 + + movq Y, YY + movq XX, NEW_Y + + movq M, %rax + sarq $3, %rax + jle .L07 + ALIGN_3 + +.L06: + movsd 0 * SIZE(YY), %xmm0 + addq INCY, YY + movhpd 0 * SIZE(YY), %xmm0 + addq INCY, YY + movsd 0 * SIZE(YY), %xmm1 + addq INCY, YY + movhpd 0 * SIZE(YY), %xmm1 + addq INCY, YY + movsd 0 * SIZE(YY), %xmm2 + addq INCY, YY + movhpd 0 * SIZE(YY), %xmm2 + addq INCY, YY + movsd 0 * SIZE(YY), %xmm3 + addq INCY, YY + movhpd 0 * SIZE(YY), %xmm3 + addq INCY, YY + + movapd %xmm0, 0 * SIZE(XX) + movapd %xmm1, 2 * SIZE(XX) + movapd %xmm2, 4 * SIZE(XX) + movapd %xmm3, 6 * SIZE(XX) + + addq $8 * SIZE, XX + decq %rax + jg .L06 + ALIGN_3 + +.L07: + movq M, %rax + andq $7, %rax + jle .L10 + ALIGN_3 + +.L08: + movsd 0 * SIZE(YY), %xmm0 + addq INCY, YY + + movsd %xmm0, 0 * SIZE(XX) + + addq $1 * SIZE, XX + decq %rax + jg .L08 + ALIGN_3 + +.L10: + xorq IS, IS # is = 0 + + cmpq $4, N + jl .L20 + ALIGN_3 + +.L11: + movq A, A1 + leaq (A, LDA, 2), A2 + leaq 4 * SIZE(A, LDA, 4), A + + leaq (NEW_X, IS, SIZE), XX + leaq 4 * SIZE(NEW_Y, IS, SIZE), YY + + movapd 0 * SIZE(XX), atemp2 + movapd 2 * SIZE(XX), atemp4 + + movsd 0 * SIZE(A1), xsum1 + movhpd 1 * SIZE(A1), xsum1 + mulpd atemp2, xsum1 + + movsd 1 * SIZE(A1), xsum2 + movhpd 1 * SIZE(A1, LDA, 1), xsum2 + mulpd atemp2, xsum2 + + movsd 2 * SIZE(A1), xsum3 + movhpd 2 * SIZE(A1, LDA, 1), xsum3 + mulpd atemp2, xsum3 + + movsd 3 * SIZE(A1), xsum4 + movhpd 3 * SIZE(A1, LDA, 1), xsum4 + mulpd atemp2, xsum4 + + movsd 2 * SIZE(A1), a1 + movhpd 3 * SIZE(A1), a1 + mulpd atemp4, a1 + addpd a1, xsum1 + + movsd 2 * SIZE(A1, LDA, 1), a1 + movhpd 3 * SIZE(A1, LDA, 1), a1 + mulpd atemp4, a1 + addpd a1, xsum2 + + movsd 2 * SIZE(A2), a1 + movhpd 3 * SIZE(A2), a1 + mulpd atemp4, a1 + addpd a1, xsum3 + + movsd 3 * SIZE(A2), a1 + movhpd 3 * SIZE(A2, LDA, 1), a1 + mulpd atemp4, a1 + addpd a1, xsum4 + + movapd 4 * SIZE(XX), xtemp1 + movapd 6 * SIZE(XX), xtemp2 + + movsd 4 * SIZE(A1), a1 + movhpd 5 * SIZE(A1), a1 + movsd 6 * SIZE(A1), a2 + movhpd 7 * SIZE(A1), a2 + movsd 4 * SIZE(A1, LDA, 1), a3 + movhpd 5 * SIZE(A1, LDA, 1), a3 + + movsd 0 * SIZE(YY), yy1 + movhpd 1 * SIZE(YY), yy1 + movsd 2 * SIZE(YY), yy2 + movhpd 3 * SIZE(YY), yy2 + +#ifndef HAVE_SSE3 + movapd atemp2, atemp1 + unpcklpd atemp1, atemp1 + unpckhpd atemp2, atemp2 + movapd atemp4, atemp3 + unpcklpd atemp3, atemp3 + unpckhpd atemp4, atemp4 +#else + movddup atemp2, atemp1 + unpckhpd atemp2, atemp2 + movddup atemp4, atemp3 + unpckhpd atemp4, atemp4 +#endif + + addq $4 * SIZE, XX + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + + movq M, I + subq IS, I + 
subq $4, I + sarq $3, I + jle .L15 + ALIGN_3 + +.L12: + movapd xtemp1, xt1 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy1 + movsd 2 * SIZE(A1, LDA, 1), a1 + movhpd 3 * SIZE(A1, LDA, 1), a1 + + PREFETCH PREFETCHSIZE(A1) + + movapd xtemp2, xt1 + mulpd a2, xt1 + mulpd atemp1, a2 + addpd xt1, xsum1 + addpd a2, yy2 + movsd 0 * SIZE(A2), a2 + movhpd 1 * SIZE(A2), a2 + + movapd xtemp1, xt1 + mulpd a3, xt1 + mulpd atemp2, a3 + addpd xt1, xsum2 + addpd a3, yy1 + movsd 2 * SIZE(A2), a3 + movhpd 3 * SIZE(A2), a3 + +#if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON) + PREFETCH PREFETCHSIZE(XX) +#endif + + movapd xtemp2, xt1 + mulpd a1, xt1 + mulpd atemp2, a1 + addpd xt1, xsum2 + addpd a1, yy2 + movsd 0 * SIZE(A2, LDA, 1), a1 + movhpd 1 * SIZE(A2, LDA, 1), a1 + + movapd xtemp1, xt1 + mulpd a2, xt1 + mulpd atemp3, a2 + addpd xt1, xsum3 + addpd a2, yy1 + movsd 2 * SIZE(A2, LDA, 1), a2 + movhpd 3 * SIZE(A2, LDA, 1), a2 + + PREFETCH PREFETCHSIZE(A1, LDA, 1) + + movapd xtemp2, xt1 + mulpd a3, xt1 + mulpd atemp3, a3 + addpd xt1, xsum3 + addpd a3, yy2 + movsd 4 * SIZE(A1), a3 + movhpd 5 * SIZE(A1), a3 + + movapd xtemp1, xt1 + movapd 4 * SIZE(XX), xtemp1 + mulpd a1, xt1 + mulpd atemp4, a1 + addpd xt1, xsum4 + addpd a1, yy1 + movsd 6 * SIZE(A1), a1 + movhpd 7 * SIZE(A1), a1 + + movapd xtemp2, xt1 + movapd 6 * SIZE(XX), xtemp2 + mulpd a2, xt1 + mulpd atemp4, a2 + addpd xt1, xsum4 + addpd a2, yy2 + movsd 4 * SIZE(A1, LDA, 1), a2 + movhpd 5 * SIZE(A1, LDA, 1), a2 + + movsd yy1, 0 * SIZE(YY) + movhpd yy1, 1 * SIZE(YY) + movsd 4 * SIZE(YY), yy1 + movhpd 5 * SIZE(YY), yy1 + + movsd yy2, 2 * SIZE(YY) + movhpd yy2, 3 * SIZE(YY) + movsd 6 * SIZE(YY), yy2 + movhpd 7 * SIZE(YY), yy2 + + movapd xtemp1, xt1 + mulpd a3, xt1 + mulpd atemp1, a3 + addpd xt1, xsum1 + addpd a3, yy1 + movsd 6 * SIZE(A1, LDA, 1), a3 + movhpd 7 * SIZE(A1, LDA, 1), a3 + + PREFETCH PREFETCHSIZE(A2) + + movapd xtemp2, xt1 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy2 + movsd 4 * SIZE(A2), a1 + movhpd 5 * SIZE(A2), a1 + + movapd xtemp1, xt1 + mulpd a2, xt1 + mulpd atemp2, a2 + addpd xt1, xsum2 + addpd a2, yy1 + movsd 6 * SIZE(A2), a2 + movhpd 7 * SIZE(A2), a2 + +#if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON) + PREFETCHW PREFETCHSIZE(YY) +#endif + + movapd xtemp2, xt1 + mulpd a3, xt1 + mulpd atemp2, a3 + addpd xt1, xsum2 + addpd a3, yy2 + movsd 4 * SIZE(A2, LDA, 1), a3 + movhpd 5 * SIZE(A2, LDA, 1), a3 + + movapd xtemp1, xt1 + mulpd a1, xt1 + mulpd atemp3, a1 + addpd xt1, xsum3 + addpd a1, yy1 + movsd 6 * SIZE(A2, LDA, 1), a1 + movhpd 7 * SIZE(A2, LDA, 1), a1 + + PREFETCH PREFETCHSIZE(A2, LDA, 1) + + movapd xtemp2, xt1 + mulpd a2, xt1 + mulpd atemp3, a2 + addpd xt1, xsum3 + addpd a2, yy2 + movsd 10 * SIZE(A1), a2 + movhpd 11 * SIZE(A1), a2 + + movapd xtemp1, xt1 + movapd 8 * SIZE(XX), xtemp1 + mulpd a3, xt1 + mulpd atemp4, a3 + addpd xt1, xsum4 + addpd a3, yy1 + movsd 8 * SIZE(A1, LDA, 1), a3 + movhpd 9 * SIZE(A1, LDA, 1), a3 + + movapd xtemp2, xt1 + movapd 10 * SIZE(XX), xtemp2 + mulpd a1, xt1 + mulpd atemp4, a1 + addpd xt1, xsum4 + addpd a1, yy2 + movsd 8 * SIZE(A1), a1 + movhpd 9 * SIZE(A1), a1 + + movsd yy1, 4 * SIZE(YY) + movhpd yy1, 5 * SIZE(YY) + movsd 8 * SIZE(YY), yy1 + movhpd 9 * SIZE(YY), yy1 + + movsd yy2, 6 * SIZE(YY) + movhpd yy2, 7 * SIZE(YY) + movsd 10 * SIZE(YY), yy2 + movhpd 11 * SIZE(YY), yy2 + + addq $8 * SIZE, XX + addq $8 * SIZE, YY + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + + decq I + jg .L12 + ALIGN_3 + +.L15: + movq M, I + subq IS, I + subq $4, I + test 
$4, I + jle .L17 + + movapd xtemp1, xt1 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy1 + movsd 2 * SIZE(A1, LDA, 1), a1 + movhpd 3 * SIZE(A1, LDA, 1), a1 + + movapd xtemp2, xt1 + mulpd a2, xt1 + mulpd atemp1, a2 + addpd xt1, xsum1 + addpd a2, yy2 + movsd 0 * SIZE(A2), a2 + movhpd 1 * SIZE(A2), a2 + + movapd xtemp1, xt1 + mulpd a3, xt1 + mulpd atemp2, a3 + addpd xt1, xsum2 + addpd a3, yy1 + movsd 2 * SIZE(A2), a3 + movhpd 3 * SIZE(A2), a3 + + movapd xtemp2, xt1 + mulpd a1, xt1 + mulpd atemp2, a1 + addpd xt1, xsum2 + addpd a1, yy2 + movsd 0 * SIZE(A2, LDA, 1), a1 + movhpd 1 * SIZE(A2, LDA, 1), a1 + + movapd xtemp1, xt1 + mulpd a2, xt1 + mulpd atemp3, a2 + addpd xt1, xsum3 + addpd a2, yy1 + movsd 2 * SIZE(A2, LDA, 1), a2 + movhpd 3 * SIZE(A2, LDA, 1), a2 + + movapd xtemp2, xt1 + mulpd a3, xt1 + mulpd atemp3, a3 + addpd xt1, xsum3 + addpd a3, yy2 + movsd 4 * SIZE(A1, LDA, 1), a3 + movhpd 5 * SIZE(A1, LDA, 1), a3 + + movapd xtemp1, xt1 + movapd 4 * SIZE(XX), xtemp1 + mulpd a1, xt1 + mulpd atemp4, a1 + addpd xt1, xsum4 + addpd a1, yy1 + movsd 4 * SIZE(A1), a1 + movhpd 5 * SIZE(A1), a1 + + movapd xtemp2, xt1 + movapd 6 * SIZE(XX), xtemp2 + mulpd a2, xt1 + mulpd atemp4, a2 + addpd xt1, xsum4 + addpd a2, yy2 + movsd 6 * SIZE(A1), a2 + movhpd 7 * SIZE(A1), a2 + + movsd yy1, 0 * SIZE(YY) + movhpd yy1, 1 * SIZE(YY) + movsd 4 * SIZE(YY), yy1 + movhpd 5 * SIZE(YY), yy1 + + movsd yy2, 2 * SIZE(YY) + movhpd yy2, 3 * SIZE(YY) + movsd 6 * SIZE(YY), yy2 + movhpd 7 * SIZE(YY), yy2 + + addq $4 * SIZE, XX + addq $4 * SIZE, YY + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + ALIGN_3 + +.L17: + testq $2, M + jle .L18 + + movapd xtemp1, xt1 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy1 + movsd 0 * SIZE(A1, LDA, 1), a1 + movhpd 1 * SIZE(A1, LDA, 1), a1 + + movapd xtemp1, xt1 + mulpd a1, xt1 + mulpd atemp2, a1 + addpd xt1, xsum2 + addpd a1, yy1 + movsd 0 * SIZE(A2), a1 + movhpd 1 * SIZE(A2), a1 + + movapd xtemp1, xt1 + mulpd a1, xt1 + mulpd atemp3, a1 + addpd xt1, xsum3 + addpd a1, yy1 + movsd 0 * SIZE(A2, LDA, 1), a1 + movhpd 1 * SIZE(A2, LDA, 1), a1 + + movapd xtemp1, xt1 + movapd 2 * SIZE(XX), xtemp1 + mulpd a1, xt1 + mulpd atemp4, a1 + addpd xt1, xsum4 + addpd a1, yy1 + movsd 2 * SIZE(A1), a1 + + movsd yy1, 0 * SIZE(YY) + movhpd yy1, 1 * SIZE(YY) + movsd 2 * SIZE(YY), yy1 + + addq $2 * SIZE, XX + addq $2 * SIZE, YY + addq $2 * SIZE, A1 + addq $2 * SIZE, A2 + ALIGN_3 + +.L18: + testq $1, M + jle .L19 + + movapd xtemp1, xt1 + mulsd a1, xt1 + mulsd atemp1, a1 + addsd xt1, xsum1 + addpd a1, yy1 + movsd 0 * SIZE(A1, LDA, 1), a1 + + movapd xtemp1, xt1 + mulsd a1, xt1 + mulsd atemp2, a1 + addsd xt1, xsum2 + addsd a1, yy1 + movsd 0 * SIZE(A2), a1 + + movapd xtemp1, xt1 + mulsd a1, xt1 + mulsd atemp3, a1 + addsd xt1, xsum3 + addsd a1, yy1 + movsd 0 * SIZE(A2, LDA, 1), a1 + + movapd xtemp1, xt1 + mulsd a1, xt1 + mulsd atemp4, a1 + addsd xt1, xsum4 + addsd a1, yy1 + + movsd yy1, 0 * SIZE(YY) + ALIGN_3 + +.L19: +#ifndef HAVE_SSE3 + movapd xsum1, atemp1 + movapd xsum3, atemp3 + + unpcklpd xsum2, xsum1 + unpcklpd xsum4, xsum3 + + unpckhpd xsum2, atemp1 + unpckhpd xsum4, atemp3 + + addpd atemp1, xsum1 + addpd atemp3, xsum3 +#else + haddpd xsum2, xsum1 + haddpd xsum4, xsum3 +#endif + + movsd 0 * SIZE(NEW_Y, IS, SIZE), yy1 + movhpd 1 * SIZE(NEW_Y, IS, SIZE), yy1 + movsd 2 * SIZE(NEW_Y, IS, SIZE), yy2 + movhpd 3 * SIZE(NEW_Y, IS, SIZE), yy2 + + addpd xsum1, yy1 + addpd xsum3, yy2 + + movsd yy1, 0 * SIZE(NEW_Y, IS, SIZE) + movhpd yy1, 1 * SIZE(NEW_Y, IS, SIZE) + movsd yy2, 2 * SIZE(NEW_Y, 
IS, SIZE) + movhpd yy2, 3 * SIZE(NEW_Y, IS, SIZE) + + addq $4, IS + + movq IS, I + addq $4, I + cmpq N, I + jle .L11 + ALIGN_3 + +.L20: + testq $2, N + jle .L30 + + movq A, A1 + leaq 2 * SIZE(A, LDA, 2), A + + movapd 0 * SIZE(NEW_X, IS, SIZE), atemp2 + + movsd 0 * SIZE(A1), xsum1 + movhpd 1 * SIZE(A1), xsum1 + mulpd atemp2, xsum1 + + movsd 1 * SIZE(A1), xsum2 + movhpd 1 * SIZE(A1, LDA, 1), xsum2 + mulpd atemp2, xsum2 + +#ifndef HAVE_SSE3 + movapd atemp2, atemp1 + unpcklpd atemp1, atemp1 +#else + movddup atemp2, atemp1 +#endif + unpckhpd atemp2, atemp2 + + testq $1, M + jle .L29 + + movsd 2 * SIZE(A1), a1 + movsd 2 * SIZE(A1, LDA, 1), a2 + movsd 2 * SIZE(NEW_X, IS, SIZE), xtemp1 + movsd 2 * SIZE(NEW_Y, IS, SIZE), yy1 + + movapd xtemp1, xt1 + mulsd a1, xt1 + mulsd atemp1, a1 + addsd xt1, xsum1 + addpd a1, yy1 + + movapd xtemp1, xt1 + mulsd a2, xt1 + mulsd atemp2, a2 + addsd xt1, xsum2 + addsd a2, yy1 + + movsd yy1, 2 * SIZE(NEW_Y, IS, SIZE) + ALIGN_3 + +.L29: +#ifndef HAVE_SSE3 + movapd xsum1, atemp1 + unpcklpd xsum2, xsum1 + unpckhpd xsum2, atemp1 + addpd atemp1, xsum1 +#else + haddpd xsum2, xsum1 +#endif + + movsd 0 * SIZE(NEW_Y, IS, SIZE), yy1 + movhpd 1 * SIZE(NEW_Y, IS, SIZE), yy1 + + addpd xsum1, yy1 + + movsd yy1, 0 * SIZE(NEW_Y, IS, SIZE) + movhpd yy1, 1 * SIZE(NEW_Y, IS, SIZE) + + addq $2, IS + ALIGN_3 + +.L30: + testq $1, N + jle .L990 + + movsd 0 * SIZE(A), xsum1 + movsd 0 * SIZE(NEW_X, IS, SIZE), atemp1 + movsd 0 * SIZE(NEW_Y, IS, SIZE), yy1 + + mulsd atemp1, xsum1 + addsd xsum1, yy1 + movsd yy1, 0 * SIZE(NEW_Y, IS, SIZE) + ALIGN_3 + +.L990: + cmpq $SIZE, INCY + je .L999 + + movq M, %rax + sarq $3, %rax + jle .L997 + ALIGN_3 + +.L996: + movapd 0 * SIZE(NEW_Y), %xmm0 + movapd 2 * SIZE(NEW_Y), %xmm1 + movapd 4 * SIZE(NEW_Y), %xmm2 + movapd 6 * SIZE(NEW_Y), %xmm3 + + movsd %xmm0, 0 * SIZE(Y) + addq INCY, Y + movhpd %xmm0, 0 * SIZE(Y) + addq INCY, Y + movsd %xmm1, 0 * SIZE(Y) + addq INCY, Y + movhpd %xmm1, 0 * SIZE(Y) + addq INCY, Y + movsd %xmm2, 0 * SIZE(Y) + addq INCY, Y + movhpd %xmm2, 0 * SIZE(Y) + addq INCY, Y + movsd %xmm3, 0 * SIZE(Y) + addq INCY, Y + movhpd %xmm3, 0 * SIZE(Y) + addq INCY, Y + + addq $8 * SIZE, NEW_Y + decq %rax + jg .L996 + ALIGN_3 + +.L997: + movq M, %rax + andq $7, %rax + jle .L999 + ALIGN_3 + +.L998: + movsd 0 * SIZE(NEW_Y), %xmm0 + + movsd %xmm0, 0 * SIZE(Y) + addq INCY, Y + + addq $1 * SIZE, NEW_Y + + decq %rax + jg .L998 + ALIGN_3 + + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + EPILOGUE diff --git a/kernel/x86_64/symv_U_sse.S b/kernel/x86_64/symv_U_sse.S new file mode 100644 index 0000000000..2df76f1cbc --- /dev/null +++ b/kernel/x86_64/symv_U_sse.S @@ -0,0 +1,1059 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef ATOM +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 12) +#endif + +#ifdef CORE2 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 12) +#endif + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 12) +#endif + +#ifdef NEHALEM +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 12) +#endif + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 20) +#endif + +#ifdef OPTERON +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 8) +#define movsd movlps +#endif + +#if defined(BARCELONA) || defined(SHANGHAI) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 16) +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 24) +#endif + +#ifdef GENERIC +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 20) +#endif + +#ifndef WINDOWS_ABI + +#define STACKSIZE 80 + +#define OLD_Y 8 + STACKSIZE(%rsp) +#define OLD_INCY 16 + STACKSIZE(%rsp) +#define OLD_BUFFER 24 + STACKSIZE(%rsp) + +#define M ARG1 +#define IS ARG2 +#define A ARG3 +#define LDA ARG4 +#define X ARG5 +#define INCX ARG6 + +#else + +#define STACKSIZE 256 + +#define OLD_LDA 40 + STACKSIZE(%rsp) +#define OLD_X 48 + STACKSIZE(%rsp) +#define OLD_INCX 56 + STACKSIZE(%rsp) +#define OLD_Y 64 + STACKSIZE(%rsp) +#define OLD_INCY 72 + STACKSIZE(%rsp) +#define OLD_BUFFER 80 + STACKSIZE(%rsp) + +#define M ARG1 +#define IS ARG2 +#define A ARG4 +#define LDA ARG3 +#define X %rdi +#define INCX %rsi + +#endif + +#define Y %r10 +#define INCY %r11 +#define BUFFER %r12 + +#define TEMP %rax +#define I %rax +#define A1 %rbx +#define A2 %rbp +#define XX %r13 +#define YY 
%r14 +#define NEW_X BUFFER +#define NEW_Y X + +#define ALPHA %xmm0 + +#define atemp1 %xmm0 +#define atemp2 %xmm1 +#define atemp3 %xmm2 +#define atemp4 %xmm3 + +#define xsum1 %xmm4 +#define xsum2 %xmm5 +#define xsum3 %xmm6 +#define xsum4 %xmm7 + +#define xtemp1 %xmm8 +#define xtemp2 %xmm9 +#define yy1 %xmm10 +#define xt1 %xmm11 + +#define a1 %xmm12 +#define a2 %xmm13 +#define a3 %xmm14 +#define a4 %xmm15 + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq OLD_LDA, LDA + movq OLD_X, X + movq OLD_INCX, INCX + + movaps %xmm2, %xmm0 +#endif + + movq OLD_Y, Y + movq OLD_INCY, INCY + movq OLD_BUFFER, BUFFER + + leaq (,INCX, SIZE), INCX + leaq (,INCY, SIZE), INCY + leaq (,LDA, SIZE), LDA + + testq M, M + jle .L999 + + negq IS + addq M, IS + + movq IS, TEMP + imulq LDA, TEMP + addq TEMP, A + + shufps $0, ALPHA, ALPHA + + movq BUFFER, XX + + movq M, %rax + sarq $3, %rax + jle .L02 + ALIGN_3 + +.L01: + movss 0 * SIZE(X), %xmm1 + addq INCX, X + movss 0 * SIZE(X), %xmm2 + addq INCX, X + movss 0 * SIZE(X), %xmm3 + addq INCX, X + movss 0 * SIZE(X), %xmm4 + addq INCX, X + movss 0 * SIZE(X), %xmm5 + addq INCX, X + movss 0 * SIZE(X), %xmm6 + addq INCX, X + movss 0 * SIZE(X), %xmm7 + addq INCX, X + movss 0 * SIZE(X), %xmm8 + addq INCX, X + + mulss ALPHA, %xmm1 + mulss ALPHA, %xmm2 + mulss ALPHA, %xmm3 + mulss ALPHA, %xmm4 + mulss ALPHA, %xmm5 + mulss ALPHA, %xmm6 + mulss ALPHA, %xmm7 + mulss ALPHA, %xmm8 + + movss %xmm1, 0 * SIZE(XX) + movss %xmm2, 1 * SIZE(XX) + movss %xmm3, 2 * SIZE(XX) + movss %xmm4, 3 * SIZE(XX) + movss %xmm5, 4 * SIZE(XX) + movss %xmm6, 5 * SIZE(XX) + movss %xmm7, 6 * SIZE(XX) + movss %xmm8, 7 * SIZE(XX) + + addq $8 * SIZE, XX + decq %rax + jg .L01 + ALIGN_3 + +.L02: + movq M, %rax + andq $7, %rax + jle .L05 + ALIGN_3 + +.L03: + movss 0 * SIZE(X), %xmm1 + addq INCX, X + + mulss ALPHA, %xmm1 + + movss %xmm1, 0 * SIZE(XX) + + addq $1 * SIZE, XX + decq %rax + jg .L03 + ALIGN_3 + +.L05: + /* now we don't need original X */ + movq Y, NEW_Y + + addq $512, XX + andq $-512, XX + + cmpq $SIZE, INCY + je .L10 + + movq Y, YY + movq XX, NEW_Y + + movq M, %rax + sarq $3, %rax + jle .L07 + ALIGN_3 + +.L06: + movss 0 * SIZE(YY), %xmm0 + addq INCY, YY + movss 0 * SIZE(YY), %xmm1 + addq INCY, YY + movss 0 * SIZE(YY), %xmm2 + addq INCY, YY + movss 0 * SIZE(YY), %xmm3 + addq INCY, YY + movss 0 * SIZE(YY), %xmm4 + addq INCY, YY + movss 0 * SIZE(YY), %xmm5 + addq INCY, YY + movss 0 * SIZE(YY), %xmm6 + addq INCY, YY + movss 0 * SIZE(YY), %xmm7 + addq INCY, YY + + movss %xmm0, 0 * SIZE(XX) + movss %xmm1, 1 * SIZE(XX) + movss %xmm2, 2 * SIZE(XX) + movss %xmm3, 3 * SIZE(XX) + movss %xmm4, 4 * SIZE(XX) + movss %xmm5, 5 * SIZE(XX) + movss %xmm6, 6 * SIZE(XX) + movss %xmm7, 7 * SIZE(XX) + + addq $8 * SIZE, XX + decq %rax + jg .L06 + ALIGN_3 + +.L07: + movq M, %rax + andq $7, %rax + jle .L10 + ALIGN_3 + +.L08: + movss 0 * SIZE(YY), %xmm0 + addq INCY, YY + + movss %xmm0, 0 * SIZE(XX) + + addq $1 * SIZE, XX + decq %rax + jg .L08 + ALIGN_3 + +.L10: + movq IS, I + addq $4, I + cmpq M, I + jg .L20 + ALIGN_3 + +.L11: + movq A, A1 + leaq (A, LDA, 2), A2 + leaq 
(A, LDA, 4), A + + movaps 0 * SIZE(NEW_X, IS, SIZE), atemp4 + + pshufd $0x00, atemp4, atemp1 + pshufd $0x55, atemp4, atemp2 + pshufd $0xaa, atemp4, atemp3 + pshufd $0xff, atemp4, atemp4 + + pxor xsum1, xsum1 + pxor xsum2, xsum2 + pxor xsum3, xsum3 + pxor xsum4, xsum4 + + movaps 0 * SIZE(NEW_X), xtemp1 + movaps 4 * SIZE(NEW_X), xtemp2 + + movsd 0 * SIZE(A1), a1 + movhps 2 * SIZE(A1), a1 + movsd 0 * SIZE(A1, LDA, 1), a2 + movhps 2 * SIZE(A1, LDA, 1), a2 + movsd 0 * SIZE(A2), a3 + movhps 2 * SIZE(A2), a3 + movsd 0 * SIZE(A2, LDA, 1), a4 + movhps 2 * SIZE(A2, LDA, 1), a4 + + movsd 0 * SIZE(NEW_Y), yy1 + movhps 2 * SIZE(NEW_Y), yy1 + + movq NEW_X, XX + movq NEW_Y, YY + + movq IS, I + sarq $4, I + jle .L14 + ALIGN_3 + +.L12: + movaps xtemp1, xt1 + mulps a1, xt1 + mulps atemp1, a1 + addps xt1, xsum1 + addps a1, yy1 + movsd 4 * SIZE(A1), a1 + movhps 6 * SIZE(A1), a1 + + PREFETCH PREFETCHSIZE(A1) + + movaps xtemp1, xt1 + mulps a2, xt1 + mulps atemp2, a2 + addps xt1, xsum2 + addps a2, yy1 + movsd 4 * SIZE(A1, LDA, 1), a2 + movhps 6 * SIZE(A1, LDA, 1), a2 + + movaps xtemp1, xt1 + mulps a3, xt1 + mulps atemp3, a3 + addps xt1, xsum3 + addps a3, yy1 + movsd 4 * SIZE(A2), a3 + movhps 6 * SIZE(A2), a3 + +#if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON) + PREFETCH PREFETCHSIZE(XX) +#endif + + movaps xtemp1, xt1 + movaps 8 * SIZE(XX), xtemp1 + mulps a4, xt1 + mulps atemp4, a4 + addps xt1, xsum4 + addps a4, yy1 + movsd 4 * SIZE(A2, LDA, 1), a4 + movhps 6 * SIZE(A2, LDA, 1), a4 + + movlps yy1, 0 * SIZE(YY) + movhps yy1, 2 * SIZE(YY) + movsd 4 * SIZE(YY), yy1 + movhps 6 * SIZE(YY), yy1 + + movaps xtemp2, xt1 + mulps a1, xt1 + mulps atemp1, a1 + addps xt1, xsum1 + addps a1, yy1 + movsd 8 * SIZE(A1), a1 + movhps 10 * SIZE(A1), a1 + + PREFETCH PREFETCHSIZE(A1, LDA, 1) + + movaps xtemp2, xt1 + mulps a2, xt1 + mulps atemp2, a2 + addps xt1, xsum2 + addps a2, yy1 + movsd 8 * SIZE(A1, LDA, 1), a2 + movhps 10 * SIZE(A1, LDA, 1), a2 + + movaps xtemp2, xt1 + mulps a3, xt1 + mulps atemp3, a3 + addps xt1, xsum3 + addps a3, yy1 + movsd 8 * SIZE(A2), a3 + movhps 10 * SIZE(A2), a3 + + movaps xtemp2, xt1 + movaps 12 * SIZE(XX), xtemp2 + mulps a4, xt1 + mulps atemp4, a4 + addps xt1, xsum4 + addps a4, yy1 + movsd 8 * SIZE(A2, LDA, 1), a4 + movhps 10 * SIZE(A2, LDA, 1), a4 + + movlps yy1, 4 * SIZE(YY) + movhps yy1, 6 * SIZE(YY) + movsd 8 * SIZE(YY), yy1 + movhps 10 * SIZE(YY), yy1 + + + movaps xtemp1, xt1 + mulps a1, xt1 + mulps atemp1, a1 + addps xt1, xsum1 + addps a1, yy1 + movsd 12 * SIZE(A1), a1 + movhps 14 * SIZE(A1), a1 + + PREFETCH PREFETCHSIZE(A2) + + movaps xtemp1, xt1 + mulps a2, xt1 + mulps atemp2, a2 + addps xt1, xsum2 + addps a2, yy1 + movsd 12 * SIZE(A1, LDA, 1), a2 + movhps 14 * SIZE(A1, LDA, 1), a2 + + movaps xtemp1, xt1 + mulps a3, xt1 + mulps atemp3, a3 + addps xt1, xsum3 + addps a3, yy1 + movsd 12 * SIZE(A2), a3 + movhps 14 * SIZE(A2), a3 + +#if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON) + PREFETCHW PREFETCHSIZE(YY) +#endif + + movaps xtemp1, xt1 + movaps 16 * SIZE(XX), xtemp1 + mulps a4, xt1 + mulps atemp4, a4 + addps xt1, xsum4 + addps a4, yy1 + movsd 12 * SIZE(A2, LDA, 1), a4 + movhps 14 * SIZE(A2, LDA, 1), a4 + + movlps yy1, 8 * SIZE(YY) + movhps yy1, 10 * SIZE(YY) + movsd 12 * SIZE(YY), yy1 + movhps 14 * SIZE(YY), yy1 + + movaps xtemp2, xt1 + mulps a1, xt1 + mulps atemp1, a1 + addps xt1, xsum1 + addps a1, yy1 + movsd 16 * SIZE(A1), a1 + movhps 18 * SIZE(A1), a1 + + PREFETCH PREFETCHSIZE(A2, LDA, 1) + + movaps xtemp2, xt1 + mulps a2, xt1 + mulps atemp2, a2 + addps xt1, xsum2 
+ addps a2, yy1 + movsd 16 * SIZE(A1, LDA, 1), a2 + movhps 18 * SIZE(A1, LDA, 1), a2 + + movaps xtemp2, xt1 + mulps a3, xt1 + mulps atemp3, a3 + addps xt1, xsum3 + addps a3, yy1 + movsd 16 * SIZE(A2), a3 + movhps 18 * SIZE(A2), a3 + + movaps xtemp2, xt1 + movaps 20 * SIZE(XX), xtemp2 + mulps a4, xt1 + mulps atemp4, a4 + addps xt1, xsum4 + addps a4, yy1 + movsd 16 * SIZE(A2, LDA, 1), a4 + movhps 18 * SIZE(A2, LDA, 1), a4 + + movlps yy1, 12 * SIZE(YY) + movhps yy1, 14 * SIZE(YY) + movsd 16 * SIZE(YY), yy1 + movhps 18 * SIZE(YY), yy1 + + addq $16 * SIZE, XX + addq $16 * SIZE, YY + addq $16 * SIZE, A1 + addq $16 * SIZE, A2 + + decq I + jg .L12 + ALIGN_3 + +.L14: + testq $8, IS + jle .L15 + + movaps xtemp1, xt1 + mulps a1, xt1 + mulps atemp1, a1 + addps xt1, xsum1 + addps a1, yy1 + movsd 4 * SIZE(A1), a1 + movhps 6 * SIZE(A1), a1 + + movaps xtemp1, xt1 + mulps a2, xt1 + mulps atemp2, a2 + addps xt1, xsum2 + addps a2, yy1 + movsd 4 * SIZE(A1, LDA, 1), a2 + movhps 6 * SIZE(A1, LDA, 1), a2 + + movaps xtemp1, xt1 + mulps a3, xt1 + mulps atemp3, a3 + addps xt1, xsum3 + addps a3, yy1 + movsd 4 * SIZE(A2), a3 + movhps 6 * SIZE(A2), a3 + + movaps xtemp1, xt1 + movaps 8 * SIZE(XX), xtemp1 + mulps a4, xt1 + mulps atemp4, a4 + addps xt1, xsum4 + addps a4, yy1 + movsd 4 * SIZE(A2, LDA, 1), a4 + movhps 6 * SIZE(A2, LDA, 1), a4 + + movlps yy1, 0 * SIZE(YY) + movhps yy1, 2 * SIZE(YY) + movsd 4 * SIZE(YY), yy1 + movhps 6 * SIZE(YY), yy1 + + movaps xtemp2, xt1 + mulps a1, xt1 + mulps atemp1, a1 + addps xt1, xsum1 + addps a1, yy1 + movsd 8 * SIZE(A1), a1 + movhps 10 * SIZE(A1), a1 + + movaps xtemp2, xt1 + mulps a2, xt1 + mulps atemp2, a2 + addps xt1, xsum2 + addps a2, yy1 + movsd 8 * SIZE(A1, LDA, 1), a2 + movhps 10 * SIZE(A1, LDA, 1), a2 + + movaps xtemp2, xt1 + mulps a3, xt1 + mulps atemp3, a3 + addps xt1, xsum3 + addps a3, yy1 + movsd 8 * SIZE(A2), a3 + movhps 10 * SIZE(A2), a3 + + movaps xtemp2, xt1 + movaps 12 * SIZE(XX), xtemp2 + mulps a4, xt1 + mulps atemp4, a4 + addps xt1, xsum4 + addps a4, yy1 + movsd 8 * SIZE(A2, LDA, 1), a4 + movhps 10 * SIZE(A2, LDA, 1), a4 + + movlps yy1, 4 * SIZE(YY) + movhps yy1, 6 * SIZE(YY) + movsd 8 * SIZE(YY), yy1 + movhps 10 * SIZE(YY), yy1 + + addq $8 * SIZE, XX + addq $8 * SIZE, YY + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + ALIGN_3 + +.L15: + testq $4, IS + jle .L18 + + movaps xtemp1, xt1 + mulps a1, xt1 + mulps atemp1, a1 + addps xt1, xsum1 + addps a1, yy1 + + movaps xtemp1, xt1 + mulps a2, xt1 + mulps atemp2, a2 + addps xt1, xsum2 + addps a2, yy1 + + movaps xtemp1, xt1 + mulps a3, xt1 + mulps atemp3, a3 + addps xt1, xsum3 + addps a3, yy1 + + movaps xtemp1, xt1 + mulps a4, xt1 + mulps atemp4, a4 + addps xt1, xsum4 + addps a4, yy1 + + movlps yy1, 0 * SIZE(YY) + movhps yy1, 2 * SIZE(YY) + movsd 4 * SIZE(YY), yy1 + movhps 6 * SIZE(YY), yy1 + + addq $4 * SIZE, XX + addq $4 * SIZE, YY + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + ALIGN_3 + +.L18: + movaps 0 * SIZE(NEW_X, IS, SIZE), atemp1 + + movss 0 * SIZE(A1), a1 + movss 0 * SIZE(A1, LDA, 1), a2 + movss 0 * SIZE(A2), a3 + movss 0 * SIZE(A2, LDA, 1), a4 + + unpcklps a3, a1 + unpcklps a4, a2 + unpcklps a2, a1 + + mulps atemp1, a1 + addps a1, xsum1 + + movsd 0 * SIZE(A1, LDA, 1), a1 + movss 1 * SIZE(A2), a2 + movhps 1 * SIZE(A2, LDA, 1), a2 + + shufps $0x84, a2, a1 + + mulps atemp1, a1 + addps a1, xsum2 + + movsd 0 * SIZE(A2), a1 + movss 2 * SIZE(A2), a2 + movhps 2 * SIZE(A2, LDA, 1), a2 + + shufps $0x84, a2, a1 + + mulps atemp1, a1 + addps a1, xsum3 + + movsd 0 * SIZE(A2, LDA, 1), a1 + movhps 2 * SIZE(A2, LDA, 1), a1 + + mulps 
atemp1, a1 + addps a1, xsum4 + + +#ifndef HAVE_SSE3 + movaps xsum1, xtemp1 + unpcklps xsum3, xsum1 + unpckhps xsum3, xtemp1 + + movaps xsum2, xtemp2 + unpcklps xsum4, xsum2 + unpckhps xsum4, xtemp2 + + movaps xsum1, xsum3 + unpcklps xsum2, xsum1 + unpckhps xsum2, xsum3 + + movaps xtemp1, xsum4 + unpcklps xtemp2, xtemp1 + unpckhps xtemp2, xsum4 + + addps xsum3, xsum1 + addps xtemp1, xsum4 + addps xsum4, xsum1 +#else + haddps xsum2, xsum1 + haddps xsum4, xsum3 + + haddps xsum3, xsum1 +#endif + + addps xsum1, yy1 + + movlps yy1, 0 * SIZE(YY) + movhps yy1, 2 * SIZE(YY) + + addq $4, IS + + movq IS, I + addq $4, I + cmpq M, I + jle .L11 + ALIGN_3 + +.L20: + testq $2, M + jle .L30 + + movq A, A1 + leaq (A, LDA, 2), A + + movsd 0 * SIZE(NEW_X, IS, SIZE), atemp4 + + pshufd $0x00, atemp4, atemp1 + pshufd $0x55, atemp4, atemp2 + + pxor xsum1, xsum1 + pxor xsum2, xsum2 + + movaps 0 * SIZE(NEW_X), xtemp1 + + movsd 0 * SIZE(A1), a1 + movhps 2 * SIZE(A1), a1 + movsd 0 * SIZE(A1, LDA, 1), a2 + movhps 2 * SIZE(A1, LDA, 1), a2 + + movsd 0 * SIZE(NEW_Y), yy1 + movhps 2 * SIZE(NEW_Y), yy1 + + movq NEW_X, XX + movq NEW_Y, YY + + movq IS, I + sarq $2, I + jle .L28 + ALIGN_3 + +.L22: + movaps xtemp1, xt1 + mulps a1, xt1 + mulps atemp1, a1 + addps xt1, xsum1 + addps a1, yy1 + movsd 4 * SIZE(A1), a1 + movhps 6 * SIZE(A1), a1 + + movaps xtemp1, xt1 + movaps 4 * SIZE(XX), xtemp1 + mulps a2, xt1 + mulps atemp2, a2 + addps xt1, xsum2 + addps a2, yy1 + movsd 4 * SIZE(A1, LDA, 1), a2 + movhps 6 * SIZE(A1, LDA, 1), a2 + + movlps yy1, 0 * SIZE(YY) + movhps yy1, 2 * SIZE(YY) + movsd 4 * SIZE(YY), yy1 + movhps 6 * SIZE(YY), yy1 + + addq $4 * SIZE, XX + addq $4 * SIZE, YY + addq $4 * SIZE, A1 + + decq I + jg .L22 + ALIGN_3 + +.L28: + movsd 0 * SIZE(NEW_X, IS, SIZE), atemp1 + + movss 0 * SIZE(A1), a1 + movss 0 * SIZE(A1, LDA, 1), a2 + + unpcklps a2, a1 + + mulps atemp1, a1 + addps a1, xsum1 + + movsd 0 * SIZE(A1, LDA, 1), a1 + mulps atemp1, a1 + addps a1, xsum2 + +#ifndef HAVE_SSE3 + movhlps xsum1, xsum3 + movhlps xsum2, xsum4 + addps xsum3, xsum1 + addps xsum4, xsum2 + + unpcklps xsum2, xsum1 + movhlps xsum1, xsum2 + + addps xsum2, xsum1 +#else + haddps xsum2, xsum1 + haddps xsum1, xsum1 +#endif + + addps xsum1, yy1 + + movlps yy1, 0 * SIZE(YY) + + addq $2, IS + ALIGN_3 + +.L30: + testq $1, M + jle .L990 + + movq A, A1 + + movss 0 * SIZE(NEW_X, IS, SIZE), atemp1 + + pshufd $0x00, atemp1, atemp1 + + pxor xsum1, xsum1 + pxor xsum2, xsum2 + + movss 0 * SIZE(NEW_Y), yy1 + + movss 0 * SIZE(NEW_X), xtemp1 + movss 1 * SIZE(NEW_X), xtemp2 + + movss 0 * SIZE(A1), a1 + movss 1 * SIZE(A1), a2 + + movq NEW_X, XX + movq NEW_Y, YY + + movq IS, I + sarq $1, I + jle .L38 + ALIGN_3 + +.L32: + movaps xtemp1, xt1 + movss 2 * SIZE(XX), xtemp1 + mulps a1, xt1 + mulps atemp1, a1 + addps xt1, xsum1 + addps a1, yy1 + movss 2 * SIZE(A1), a1 + + movss yy1, 0 * SIZE(YY) + movss 1 * SIZE(YY), yy1 + + movaps xtemp2, xt1 + movss 3 * SIZE(XX), xtemp2 + mulps a2, xt1 + mulps atemp1, a2 + addps xt1, xsum1 + addps a2, yy1 + movss 3 * SIZE(A1), a2 + + movss yy1, 1 * SIZE(YY) + movss 2 * SIZE(YY), yy1 + + addq $2 * SIZE, XX + addq $2 * SIZE, YY + addq $2 * SIZE, A1 + + decq I + jg .L32 + ALIGN_3 + +.L38: + movsd 0 * SIZE(NEW_X, IS, SIZE), atemp1 + + movss 0 * SIZE(A1), a1 + mulss atemp1, a1 + addss a1, xsum1 + +#ifndef HAVE_SSE3 + movhlps xsum1, xsum3 + movhlps xsum2, xsum4 + addps xsum3, xsum1 + addps xsum4, xsum2 + + unpcklps xsum2, xsum1 + movhlps xsum1, xsum2 + + addps xsum2, xsum1 +#else + addss xsum2, xsum1 +#endif + + addss xsum1, yy1 + + movss yy1, 0 * 
SIZE(YY) + + addq $2, IS + ALIGN_3 + +.L990: + cmpq $SIZE, INCY + je .L999 + + movq M, %rax + sarq $3, %rax + jle .L997 + ALIGN_3 + +.L996: + movss 0 * SIZE(NEW_Y), %xmm0 + movss 1 * SIZE(NEW_Y), %xmm1 + movss 2 * SIZE(NEW_Y), %xmm2 + movss 3 * SIZE(NEW_Y), %xmm3 + movss 4 * SIZE(NEW_Y), %xmm4 + movss 5 * SIZE(NEW_Y), %xmm5 + movss 6 * SIZE(NEW_Y), %xmm6 + movss 7 * SIZE(NEW_Y), %xmm7 + + movss %xmm0, 0 * SIZE(Y) + addq INCY, Y + movss %xmm1, 0 * SIZE(Y) + addq INCY, Y + movss %xmm2, 0 * SIZE(Y) + addq INCY, Y + movss %xmm3, 0 * SIZE(Y) + addq INCY, Y + movss %xmm4, 0 * SIZE(Y) + addq INCY, Y + movss %xmm5, 0 * SIZE(Y) + addq INCY, Y + movss %xmm6, 0 * SIZE(Y) + addq INCY, Y + movss %xmm7, 0 * SIZE(Y) + addq INCY, Y + + addq $8 * SIZE, NEW_Y + decq %rax + jg .L996 + ALIGN_3 + +.L997: + movq M, %rax + andq $7, %rax + jle .L999 + ALIGN_3 + +.L998: + movss 0 * SIZE(NEW_Y), %xmm0 + + movss %xmm0, 0 * SIZE(Y) + addq INCY, Y + + addq $1 * SIZE, NEW_Y + + decq %rax + jg .L998 + ALIGN_3 + + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + EPILOGUE diff --git a/kernel/x86_64/symv_U_sse2.S b/kernel/x86_64/symv_U_sse2.S new file mode 100644 index 0000000000..bbba0b427d --- /dev/null +++ b/kernel/x86_64/symv_U_sse2.S @@ -0,0 +1,976 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef ATOM +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 12) +#endif + +#ifdef CORE2 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 12) +#endif + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 12) +#endif + +#ifdef NEHALEM +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 24) +#endif + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 20) +#endif + +#ifdef OPTERON +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 8) +#define movsd movlpd +#endif + +#if defined(BARCELONA) || defined(SHANGHAI) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 16) +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (8 * 24) +#endif + +#ifdef GENERIC +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 20) +#endif + +#ifndef WINDOWS_ABI + +#define STACKSIZE 80 + +#define OLD_Y 8 + STACKSIZE(%rsp) +#define OLD_INCY 16 + STACKSIZE(%rsp) +#define OLD_BUFFER 24 + STACKSIZE(%rsp) + +#define M ARG1 +#define IS ARG2 +#define A ARG3 +#define LDA ARG4 +#define X ARG5 +#define INCX ARG6 + +#else + +#define STACKSIZE 256 + +#define OLD_LDA 40 + STACKSIZE(%rsp) +#define OLD_X 48 + STACKSIZE(%rsp) +#define OLD_INCX 56 + STACKSIZE(%rsp) +#define OLD_Y 64 + STACKSIZE(%rsp) +#define OLD_INCY 72 + STACKSIZE(%rsp) +#define OLD_BUFFER 80 + STACKSIZE(%rsp) + +#define M ARG1 +#define IS ARG2 +#define A ARG4 +#define LDA ARG3 +#define X %rdi +#define INCX %rsi + +#endif + +#define Y %r10 +#define INCY %r11 +#define BUFFER %r12 + +#define TEMP %rax +#define I %rax +#define A1 %rbx +#define A2 %rbp +#define XX %r13 +#define YY %r14 +#define NEW_X BUFFER +#define NEW_Y X + +#define ALPHA %xmm0 + +#define xtemp1 %xmm0 +#define xtemp2 %xmm1 +#define yy1 %xmm2 +#define yy2 %xmm3 + +#define atemp1 %xmm4 +#define atemp2 %xmm5 +#define atemp3 %xmm6 +#define atemp4 %xmm7 + +#define xsum1 %xmm8 +#define xsum2 %xmm9 +#define xsum3 %xmm10 +#define xsum4 %xmm11 + +#define a1 %xmm12 +#define a2 %xmm13 +#define a3 %xmm14 +#define xt1 %xmm15 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq OLD_LDA, LDA + movq OLD_X, X + movq OLD_INCX, INCX + + movaps %xmm2, %xmm0 +#endif + + movq OLD_Y, Y + movq OLD_INCY, INCY + movq OLD_BUFFER, BUFFER + + leaq (,INCX, SIZE), INCX + leaq (,INCY, SIZE), INCY + leaq (,LDA, SIZE), LDA + + testq M, M + jle .L999 + + negq IS + addq M, IS + + movq IS, TEMP + imulq LDA, TEMP + addq 
TEMP, A + + unpcklpd ALPHA, ALPHA + + movq BUFFER, XX + + movq M, %rax + sarq $3, %rax + jle .L02 + ALIGN_3 + +.L01: + movsd 0 * SIZE(X), %xmm1 + addq INCX, X + movhpd 0 * SIZE(X), %xmm1 + addq INCX, X + movsd 0 * SIZE(X), %xmm2 + addq INCX, X + movhpd 0 * SIZE(X), %xmm2 + addq INCX, X + movsd 0 * SIZE(X), %xmm3 + addq INCX, X + movhpd 0 * SIZE(X), %xmm3 + addq INCX, X + movsd 0 * SIZE(X), %xmm4 + addq INCX, X + movhpd 0 * SIZE(X), %xmm4 + addq INCX, X + + mulpd ALPHA, %xmm1 + mulpd ALPHA, %xmm2 + mulpd ALPHA, %xmm3 + mulpd ALPHA, %xmm4 + + movapd %xmm1, 0 * SIZE(XX) + movapd %xmm2, 2 * SIZE(XX) + movapd %xmm3, 4 * SIZE(XX) + movapd %xmm4, 6 * SIZE(XX) + + addq $8 * SIZE, XX + decq %rax + jg .L01 + ALIGN_3 + +.L02: + movq M, %rax + andq $7, %rax + jle .L05 + ALIGN_3 + +.L03: + movsd 0 * SIZE(X), %xmm1 + addq INCX, X + + mulsd ALPHA, %xmm1 + + movlpd %xmm1, 0 * SIZE(XX) + + addq $1 * SIZE, XX + decq %rax + jg .L03 + ALIGN_3 + +.L05: + /* now we don't need original X */ + movq Y, NEW_Y + + addq $512, XX + andq $-512, XX + + cmpq $SIZE, INCY + je .L10 + + movq Y, YY + movq XX, NEW_Y + + movq M, %rax + sarq $3, %rax + jle .L07 + ALIGN_3 + +.L06: + movsd 0 * SIZE(YY), %xmm0 + addq INCY, YY + movhpd 0 * SIZE(YY), %xmm0 + addq INCY, YY + movsd 0 * SIZE(YY), %xmm1 + addq INCY, YY + movhpd 0 * SIZE(YY), %xmm1 + addq INCY, YY + movsd 0 * SIZE(YY), %xmm2 + addq INCY, YY + movhpd 0 * SIZE(YY), %xmm2 + addq INCY, YY + movsd 0 * SIZE(YY), %xmm3 + addq INCY, YY + movhpd 0 * SIZE(YY), %xmm3 + addq INCY, YY + + movapd %xmm0, 0 * SIZE(XX) + movapd %xmm1, 2 * SIZE(XX) + movapd %xmm2, 4 * SIZE(XX) + movapd %xmm3, 6 * SIZE(XX) + + addq $8 * SIZE, XX + decq %rax + jg .L06 + ALIGN_3 + +.L07: + movq M, %rax + andq $7, %rax + jle .L10 + ALIGN_3 + +.L08: + movsd 0 * SIZE(YY), %xmm0 + addq INCY, YY + + movsd %xmm0, 0 * SIZE(XX) + + addq $1 * SIZE, XX + decq %rax + jg .L08 + ALIGN_3 + +.L10: + movq IS, I + addq $4, I + cmpq M, I + jg .L20 + ALIGN_3 + +.L11: + movq A, A1 + leaq (A, LDA, 2), A2 + leaq (A, LDA, 4), A + +#ifdef HAVE_SSE3 + movddup 0 * SIZE(NEW_X, IS, SIZE), atemp1 + movddup 1 * SIZE(NEW_X, IS, SIZE), atemp2 + movddup 2 * SIZE(NEW_X, IS, SIZE), atemp3 + movddup 3 * SIZE(NEW_X, IS, SIZE), atemp4 +#else + movsd 0 * SIZE(NEW_X, IS, SIZE), atemp1 + movhpd 0 * SIZE(NEW_X, IS, SIZE), atemp1 + movsd 1 * SIZE(NEW_X, IS, SIZE), atemp2 + movhpd 1 * SIZE(NEW_X, IS, SIZE), atemp2 + movsd 2 * SIZE(NEW_X, IS, SIZE), atemp3 + movhpd 2 * SIZE(NEW_X, IS, SIZE), atemp3 + movsd 3 * SIZE(NEW_X, IS, SIZE), atemp4 + movhpd 3 * SIZE(NEW_X, IS, SIZE), atemp4 +#endif + + pxor xsum1, xsum1 + pxor xsum2, xsum2 + pxor xsum3, xsum3 + pxor xsum4, xsum4 + + movapd 0 * SIZE(NEW_X), xtemp1 + movapd 2 * SIZE(NEW_X), xtemp2 + + movsd 0 * SIZE(A1), a1 + movhpd 1 * SIZE(A1), a1 + movsd 2 * SIZE(A1), a2 + movhpd 3 * SIZE(A1), a2 + movsd 0 * SIZE(A1, LDA, 1), a3 + movhpd 1 * SIZE(A1, LDA, 1), a3 + + movsd 0 * SIZE(NEW_Y), yy1 + movhpd 1 * SIZE(NEW_Y), yy1 + movsd 2 * SIZE(NEW_Y), yy2 + movhpd 3 * SIZE(NEW_Y), yy2 + + movq NEW_X, XX + movq NEW_Y, YY + + movq IS, I + sarq $3, I + jle .L15 + ALIGN_3 + +.L12: + movapd xtemp1, xt1 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy1 + movsd 2 * SIZE(A1, LDA, 1), a1 + movhpd 3 * SIZE(A1, LDA, 1), a1 + + PREFETCH PREFETCHSIZE(A1) + + movapd xtemp2, xt1 + mulpd a2, xt1 + mulpd atemp1, a2 + addpd xt1, xsum1 + addpd a2, yy2 + movsd 0 * SIZE(A2), a2 + movhpd 1 * SIZE(A2), a2 + + movapd xtemp1, xt1 + mulpd a3, xt1 + mulpd atemp2, a3 + addpd xt1, xsum2 + addpd a3, yy1 + movsd 2 * SIZE(A2), 
a3 + movhpd 3 * SIZE(A2), a3 + +#if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON) + PREFETCH PREFETCHSIZE(XX) +#endif + + movapd xtemp2, xt1 + mulpd a1, xt1 + mulpd atemp2, a1 + addpd xt1, xsum2 + addpd a1, yy2 + movsd 0 * SIZE(A2, LDA, 1), a1 + movhpd 1 * SIZE(A2, LDA, 1), a1 + + movapd xtemp1, xt1 + mulpd a2, xt1 + mulpd atemp3, a2 + addpd xt1, xsum3 + addpd a2, yy1 + movsd 2 * SIZE(A2, LDA, 1), a2 + movhpd 3 * SIZE(A2, LDA, 1), a2 + + PREFETCH PREFETCHSIZE(A1, LDA, 1) + + movapd xtemp2, xt1 + mulpd a3, xt1 + mulpd atemp3, a3 + addpd xt1, xsum3 + addpd a3, yy2 + movsd 4 * SIZE(A1), a3 + movhpd 5 * SIZE(A1), a3 + + movapd xtemp1, xt1 + movapd 4 * SIZE(XX), xtemp1 + mulpd a1, xt1 + mulpd atemp4, a1 + addpd xt1, xsum4 + addpd a1, yy1 + movsd 6 * SIZE(A1), a1 + movhpd 7 * SIZE(A1), a1 + + movapd xtemp2, xt1 + movapd 6 * SIZE(XX), xtemp2 + mulpd a2, xt1 + mulpd atemp4, a2 + addpd xt1, xsum4 + addpd a2, yy2 + movsd 4 * SIZE(A1, LDA, 1), a2 + movhpd 5 * SIZE(A1, LDA, 1), a2 + + movsd yy1, 0 * SIZE(YY) + movhpd yy1, 1 * SIZE(YY) + movsd 4 * SIZE(YY), yy1 + movhpd 5 * SIZE(YY), yy1 + + movsd yy2, 2 * SIZE(YY) + movhpd yy2, 3 * SIZE(YY) + movsd 6 * SIZE(YY), yy2 + movhpd 7 * SIZE(YY), yy2 + + movapd xtemp1, xt1 + mulpd a3, xt1 + mulpd atemp1, a3 + addpd xt1, xsum1 + addpd a3, yy1 + movsd 6 * SIZE(A1, LDA, 1), a3 + movhpd 7 * SIZE(A1, LDA, 1), a3 + + PREFETCH PREFETCHSIZE(A2) + + movapd xtemp2, xt1 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy2 + movsd 4 * SIZE(A2), a1 + movhpd 5 * SIZE(A2), a1 + + movapd xtemp1, xt1 + mulpd a2, xt1 + mulpd atemp2, a2 + addpd xt1, xsum2 + addpd a2, yy1 + movsd 6 * SIZE(A2), a2 + movhpd 7 * SIZE(A2), a2 + +#if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON) + PREFETCHW PREFETCHSIZE(YY) +#endif + + movapd xtemp2, xt1 + mulpd a3, xt1 + mulpd atemp2, a3 + addpd xt1, xsum2 + addpd a3, yy2 + movsd 4 * SIZE(A2, LDA, 1), a3 + movhpd 5 * SIZE(A2, LDA, 1), a3 + + movapd xtemp1, xt1 + mulpd a1, xt1 + mulpd atemp3, a1 + addpd xt1, xsum3 + addpd a1, yy1 + movsd 6 * SIZE(A2, LDA, 1), a1 + movhpd 7 * SIZE(A2, LDA, 1), a1 + + PREFETCH PREFETCHSIZE(A2, LDA, 1) + + movapd xtemp2, xt1 + mulpd a2, xt1 + mulpd atemp3, a2 + addpd xt1, xsum3 + addpd a2, yy2 + movsd 10 * SIZE(A1), a2 + movhpd 11 * SIZE(A1), a2 + + movapd xtemp1, xt1 + movapd 8 * SIZE(XX), xtemp1 + mulpd a3, xt1 + mulpd atemp4, a3 + addpd xt1, xsum4 + addpd a3, yy1 + movsd 8 * SIZE(A1, LDA, 1), a3 + movhpd 9 * SIZE(A1, LDA, 1), a3 + + movapd xtemp2, xt1 + movapd 10 * SIZE(XX), xtemp2 + mulpd a1, xt1 + mulpd atemp4, a1 + addpd xt1, xsum4 + addpd a1, yy2 + movsd 8 * SIZE(A1), a1 + movhpd 9 * SIZE(A1), a1 + + movsd yy1, 4 * SIZE(YY) + movhpd yy1, 5 * SIZE(YY) + movsd 8 * SIZE(YY), yy1 + movhpd 9 * SIZE(YY), yy1 + + movsd yy2, 6 * SIZE(YY) + movhpd yy2, 7 * SIZE(YY) + movsd 10 * SIZE(YY), yy2 + movhpd 11 * SIZE(YY), yy2 + + addq $8 * SIZE, XX + addq $8 * SIZE, YY + addq $8 * SIZE, A1 + addq $8 * SIZE, A2 + + decq I + jg .L12 + ALIGN_3 + +.L15: + testq $4, IS + jle .L18 + + movapd xtemp1, xt1 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy1 + movsd 2 * SIZE(A1, LDA, 1), a1 + movhpd 3 * SIZE(A1, LDA, 1), a1 + + movapd xtemp2, xt1 + mulpd a2, xt1 + mulpd atemp1, a2 + addpd xt1, xsum1 + addpd a2, yy2 + movsd 0 * SIZE(A2), a2 + movhpd 1 * SIZE(A2), a2 + + movapd xtemp1, xt1 + mulpd a3, xt1 + mulpd atemp2, a3 + addpd xt1, xsum2 + addpd a3, yy1 + movsd 2 * SIZE(A2), a3 + movhpd 3 * SIZE(A2), a3 + + movapd xtemp2, xt1 + mulpd a1, xt1 + mulpd atemp2, a1 + addpd xt1, 
xsum2 + addpd a1, yy2 + movsd 0 * SIZE(A2, LDA, 1), a1 + movhpd 1 * SIZE(A2, LDA, 1), a1 + + movapd xtemp1, xt1 + mulpd a2, xt1 + mulpd atemp3, a2 + addpd xt1, xsum3 + addpd a2, yy1 + movsd 2 * SIZE(A2, LDA, 1), a2 + movhpd 3 * SIZE(A2, LDA, 1), a2 + + movapd xtemp2, xt1 + mulpd a3, xt1 + mulpd atemp3, a3 + addpd xt1, xsum3 + addpd a3, yy2 + + movapd xtemp1, xt1 + movapd 4 * SIZE(XX), xtemp1 + mulpd a1, xt1 + mulpd atemp4, a1 + addpd xt1, xsum4 + addpd a1, yy1 + + movapd xtemp2, xt1 + movapd 6 * SIZE(XX), xtemp2 + mulpd a2, xt1 + mulpd atemp4, a2 + addpd xt1, xsum4 + addpd a2, yy2 + + movsd yy1, 0 * SIZE(YY) + movhpd yy1, 1 * SIZE(YY) + movsd 4 * SIZE(YY), yy1 + movhpd 5 * SIZE(YY), yy1 + + movsd yy2, 2 * SIZE(YY) + movhpd yy2, 3 * SIZE(YY) + movsd 6 * SIZE(YY), yy2 + movhpd 7 * SIZE(YY), yy2 + + addq $4 * SIZE, XX + addq $4 * SIZE, YY + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + ALIGN_3 + +.L18: + unpckhpd atemp2, atemp1 + unpckhpd atemp4, atemp3 + + movsd 0 * SIZE(A1), a1 + movhpd 0 * SIZE(A1, LDA, 1), a1 + mulpd atemp1, a1 + addpd a1, xsum1 + + movsd 0 * SIZE(A1, LDA, 1), a1 + movhpd 1 * SIZE(A1, LDA, 1), a1 + mulpd atemp1, a1 + addpd a1, xsum2 + + movsd 0 * SIZE(A2), a1 + movhpd 1 * SIZE(A2), a1 + mulpd atemp1, a1 + addpd a1, xsum3 + + movsd 0 * SIZE(A2, LDA, 1), a1 + movhpd 1 * SIZE(A2, LDA, 1), a1 + mulpd atemp1, a1 + addpd a1, xsum4 + + movsd 0 * SIZE(A2), a1 + movhpd 0 * SIZE(A2, LDA, 1), a1 + mulpd atemp3, a1 + addpd a1, xsum1 + + movsd 1 * SIZE(A2), a1 + movhpd 1 * SIZE(A2, LDA, 1), a1 + mulpd atemp3, a1 + addpd a1, xsum2 + + movsd 2 * SIZE(A2), a1 + movhpd 2 * SIZE(A2, LDA, 1), a1 + mulpd atemp3, a1 + addpd a1, xsum3 + + movsd 2 * SIZE(A2, LDA, 1), a1 + movhpd 3 * SIZE(A2, LDA, 1), a1 + mulpd atemp3, a1 + addpd a1, xsum4 + +#ifndef HAVE_SSE3 + movapd xsum1, atemp1 + movapd xsum3, atemp3 + + unpcklpd xsum2, xsum1 + unpcklpd xsum4, xsum3 + + unpckhpd xsum2, atemp1 + unpckhpd xsum4, atemp3 + + addpd atemp1, xsum1 + addpd atemp3, xsum3 +#else + haddpd xsum2, xsum1 + haddpd xsum4, xsum3 +#endif + + addpd xsum1, yy1 + addpd xsum3, yy2 + + movsd yy1, 0 * SIZE(YY) + movhpd yy1, 1 * SIZE(YY) + movsd yy2, 2 * SIZE(YY) + movhpd yy2, 3 * SIZE(YY) + + addq $4, IS + + movq IS, I + addq $4, I + cmpq M, I + jle .L11 + ALIGN_3 + +.L20: + testq $2, M + je .L30 + ALIGN_3 + +.L21: + movq A, A1 + leaq (A, LDA, 2), A + +#ifdef HAVE_SSE3 + movddup 0 * SIZE(NEW_X, IS, SIZE), atemp1 + movddup 1 * SIZE(NEW_X, IS, SIZE), atemp2 +#else + movsd 0 * SIZE(NEW_X, IS, SIZE), atemp1 + movhpd 0 * SIZE(NEW_X, IS, SIZE), atemp1 + movsd 1 * SIZE(NEW_X, IS, SIZE), atemp2 + movhpd 1 * SIZE(NEW_X, IS, SIZE), atemp2 +#endif + + pxor xsum1, xsum1 + pxor xsum2, xsum2 + + movapd 0 * SIZE(NEW_X), xtemp1 + + movsd 0 * SIZE(NEW_Y), yy1 + movhpd 1 * SIZE(NEW_Y), yy1 + + movsd 0 * SIZE(A1), a1 + movhpd 1 * SIZE(A1), a1 + movsd 0 * SIZE(A1, LDA, 1), a2 + movhpd 1 * SIZE(A1, LDA, 1), a2 + + movq NEW_X, XX + movq NEW_Y, YY + + movq IS, I + sarq $1, I + jle .L28 + ALIGN_3 + +.L22: + movapd xtemp1, xt1 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy1 + movsd 2 * SIZE(A1), a1 + movhpd 3 * SIZE(A1), a1 + + movapd xtemp1, xt1 + movapd 2 * SIZE(XX), xtemp1 + mulpd a2, xt1 + mulpd atemp2, a2 + addpd xt1, xsum2 + addpd a2, yy1 + movsd 2 * SIZE(A1, LDA, 1), a2 + movhpd 3 * SIZE(A1, LDA, 1), a2 + + movsd yy1, 0 * SIZE(YY) + movhpd yy1, 1 * SIZE(YY) + movsd 2 * SIZE(YY), yy1 + movhpd 3 * SIZE(YY), yy1 + + addq $2 * SIZE, XX + addq $2 * SIZE, YY + addq $2 * SIZE, A1 + + decq I + jg .L22 + ALIGN_3 + +.L28: + unpckhpd 
atemp2, atemp1 + + movsd 0 * SIZE(A1), a1 + movhpd 0 * SIZE(A1, LDA, 1), a1 + mulpd atemp1, a1 + addpd a1, xsum1 + + movsd 0 * SIZE(A1, LDA, 1), a1 + movhpd 1 * SIZE(A1, LDA, 1), a1 + mulpd atemp1, a1 + addpd a1, xsum2 + +#ifndef HAVE_SSE3 + movapd xsum1, atemp1 + + unpcklpd xsum2, xsum1 + unpckhpd xsum2, atemp1 + + addpd atemp1, xsum1 +#else + haddpd xsum2, xsum1 +#endif + + addpd xsum1, yy1 + + movsd yy1, 0 * SIZE(YY) + movhpd yy1, 1 * SIZE(YY) + + addq $2, IS + ALIGN_3 + +.L30: + testq $1, M + je .L990 + ALIGN_3 + +.L31: + movq A, A1 + +#ifdef HAVE_SSE3 + movddup 0 * SIZE(NEW_X, IS, SIZE), atemp1 +#else + movsd 0 * SIZE(NEW_X, IS, SIZE), atemp1 + movhpd 0 * SIZE(NEW_X, IS, SIZE), atemp1 +#endif + + pxor xsum1, xsum1 + + movsd 0 * SIZE(NEW_X), xtemp1 + movsd 0 * SIZE(NEW_Y), yy1 + movsd 0 * SIZE(A1), a1 + + movq NEW_X, XX + movq NEW_Y, YY + + movq IS, I + testq I, I + jle .L38 + ALIGN_3 + +.L32: + movapd xtemp1, xt1 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy1 + movsd 1 * SIZE(A1), a1 + + movsd 1 * SIZE(XX), xtemp1 + + movsd yy1, 0 * SIZE(YY) + movsd 1 * SIZE(YY), yy1 + + addq $1 * SIZE, XX + addq $1 * SIZE, YY + addq $1 * SIZE, A1 + + decq I + jg .L32 + ALIGN_3 + +.L38: + movsd 0 * SIZE(A1), a1 + mulsd atemp1, a1 + addsd a1, xsum1 + + addsd xsum1, yy1 + + movsd yy1, 0 * SIZE(YY) + ALIGN_3 + +.L990: + cmpq $SIZE, INCY + je .L999 + + movq M, %rax + sarq $3, %rax + jle .L997 + ALIGN_3 + +.L996: + movapd 0 * SIZE(NEW_Y), %xmm0 + movapd 2 * SIZE(NEW_Y), %xmm1 + movapd 4 * SIZE(NEW_Y), %xmm2 + movapd 6 * SIZE(NEW_Y), %xmm3 + + movsd %xmm0, 0 * SIZE(Y) + addq INCY, Y + movhpd %xmm0, 0 * SIZE(Y) + addq INCY, Y + movsd %xmm1, 0 * SIZE(Y) + addq INCY, Y + movhpd %xmm1, 0 * SIZE(Y) + addq INCY, Y + movsd %xmm2, 0 * SIZE(Y) + addq INCY, Y + movhpd %xmm2, 0 * SIZE(Y) + addq INCY, Y + movsd %xmm3, 0 * SIZE(Y) + addq INCY, Y + movhpd %xmm3, 0 * SIZE(Y) + addq INCY, Y + + addq $8 * SIZE, NEW_Y + decq %rax + jg .L996 + ALIGN_3 + +.L997: + movq M, %rax + andq $7, %rax + jle .L999 + ALIGN_3 + +.L998: + movsd 0 * SIZE(NEW_Y), %xmm0 + + movsd %xmm0, 0 * SIZE(Y) + addq INCY, Y + + addq $1 * SIZE, NEW_Y + + decq %rax + jg .L998 + ALIGN_3 + + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + EPILOGUE diff --git a/kernel/x86_64/trsm_kernel_LN_2x8_nehalem.S b/kernel/x86_64/trsm_kernel_LN_2x8_nehalem.S new file mode 100644 index 0000000000..d70bede704 --- /dev/null +++ b/kernel/x86_64/trsm_kernel_LN_2x8_nehalem.S @@ -0,0 +1,3075 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define KK %rdx +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define J 56(%rsp) +#define KKK 64(%rsp) +#define AORIG 72(%rsp) + +#else + +#define STACKSIZE 512 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define J 232(%rsp) +#define KKK 240(%rsp) +#define AORIG 248(%rsp) + +#endif + +#define PREFETCHSIZE (8 * 1 - 4) +#define PREFETCH prefetcht0 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movaps %xmm3, %xmm0 +#endif + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + leaq (, LDC, SIZE), LDC + + movq KK, OFFSET + negq KK + +#ifdef LN + leaq (, M, SIZE), %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + leaq (, N, SIZE), %rax + imulq K, %rax + addq %rax, B + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + movq N, J + sarq $3, J + 
NOBRANCH + jle .L30 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $3 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 8), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 4), CO2 +#ifndef RT + leaq (C, LDC, 8), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq K, %rax + salq $BASE_SHIFT + 3, %rax + leaq (B, %rax), BB + + testq $1, M + BRANCH + jle .L20 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 8), BO +#else + movq B, BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_3 + +.L22: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movaps -12 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps -10 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps -8 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -6 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movaps -4 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps -2 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -14 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps 0 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps 2 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movaps 4 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps 6 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -13 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps 8 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps 10 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movaps 12 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps 14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -12 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps 16 * SIZE(BO), %xmm1 + + subq $ -4 * SIZE, AO + subq $-32 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L22 + ALIGN_3 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_3 + +.L26: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movaps -12 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps -10 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps -8 * SIZE(BO), %xmm1 + + addq $1 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_4 + +.L28: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $8, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm0 + movapd -14 * SIZE(BO), %xmm1 + movapd -12 * SIZE(BO), %xmm2 + movapd -10 * SIZE(BO), %xmm3 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -12 * 
SIZE(AO), %xmm2 + movapd -10 * SIZE(AO), %xmm3 +#endif + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm1 + subpd %xmm10, %xmm2 + subpd %xmm11, %xmm3 + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 +#endif + +#if defined(RN) || defined(RT) + pshufd $0xe, %xmm3, %xmm7 + movaps %xmm3, %xmm6 + pshufd $0xe, %xmm2, %xmm5 + movaps %xmm2, %xmm4 + pshufd $0xe, %xmm1, %xmm3 + movaps %xmm1, %xmm2 + pshufd $0xe, %xmm0, %xmm1 +#endif + +#ifdef RN + movsd -16 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm0 + movsd -15 * SIZE(BO), %xmm9 + mulsd %xmm0, %xmm9 + subsd %xmm9, %xmm1 + movsd -14 * SIZE(BO), %xmm10 + mulsd %xmm0, %xmm10 + subsd %xmm10, %xmm2 + movsd -13 * SIZE(BO), %xmm11 + mulsd %xmm0, %xmm11 + subsd %xmm11, %xmm3 + movsd -12 * SIZE(BO), %xmm12 + mulsd %xmm0, %xmm12 + subsd %xmm12, %xmm4 + movsd -11 * SIZE(BO), %xmm13 + mulsd %xmm0, %xmm13 + subsd %xmm13, %xmm5 + movsd -10 * SIZE(BO), %xmm14 + mulsd %xmm0, %xmm14 + subsd %xmm14, %xmm6 + movsd -9 * SIZE(BO), %xmm15 + mulsd %xmm0, %xmm15 + subsd %xmm15, %xmm7 + + movsd -7 * SIZE(BO), %xmm9 + mulsd %xmm9, %xmm1 + movsd -6 * SIZE(BO), %xmm10 + mulsd %xmm1, %xmm10 + subsd %xmm10, %xmm2 + movsd -5 * SIZE(BO), %xmm11 + mulsd %xmm1, %xmm11 + subsd %xmm11, %xmm3 + movsd -4 * SIZE(BO), %xmm12 + mulsd %xmm1, %xmm12 + subsd %xmm12, %xmm4 + movsd -3 * SIZE(BO), %xmm13 + mulsd %xmm1, %xmm13 + subsd %xmm13, %xmm5 + movsd -2 * SIZE(BO), %xmm14 + mulsd %xmm1, %xmm14 + subsd %xmm14, %xmm6 + movsd -1 * SIZE(BO), %xmm15 + mulsd %xmm1, %xmm15 + subsd %xmm15, %xmm7 + + movsd 2 * SIZE(BO), %xmm10 + mulsd %xmm10, %xmm2 + movsd 3 * SIZE(BO), %xmm11 + mulsd %xmm2, %xmm11 + subsd %xmm11, %xmm3 + movsd 4 * SIZE(BO), %xmm12 + mulsd %xmm2, %xmm12 + subsd %xmm12, %xmm4 + movsd 5 * SIZE(BO), %xmm13 + mulsd %xmm2, %xmm13 + subsd %xmm13, %xmm5 + movsd 6 * SIZE(BO), %xmm14 + mulsd %xmm2, %xmm14 + subsd %xmm14, %xmm6 + movsd 7 * SIZE(BO), %xmm15 + mulsd %xmm2, %xmm15 + subsd %xmm15, %xmm7 + + movsd 11 * SIZE(BO), %xmm11 + mulsd %xmm11, %xmm3 + movsd 12 * SIZE(BO), %xmm12 + mulsd %xmm3, %xmm12 + subsd %xmm12, %xmm4 + movsd 13 * SIZE(BO), %xmm13 + mulsd %xmm3, %xmm13 + subsd %xmm13, %xmm5 + movsd 14 * SIZE(BO), %xmm14 + mulsd %xmm3, %xmm14 + subsd %xmm14, %xmm6 + movsd 15 * SIZE(BO), %xmm15 + mulsd %xmm3, %xmm15 + subsd %xmm15, %xmm7 + + movsd 20 * SIZE(BO), %xmm12 + mulsd %xmm12, %xmm4 + movsd 21 * SIZE(BO), %xmm13 + mulsd %xmm4, %xmm13 + subsd %xmm13, %xmm5 + movsd 22 * SIZE(BO), %xmm14 + mulsd %xmm4, %xmm14 + subsd %xmm14, %xmm6 + movsd 23 * SIZE(BO), %xmm15 + mulsd %xmm4, %xmm15 + subsd %xmm15, %xmm7 + + movsd 29 * SIZE(BO), %xmm13 + mulsd %xmm13, %xmm5 + movsd 30 * SIZE(BO), %xmm14 + mulsd %xmm5, %xmm14 + subsd %xmm14, %xmm6 + movsd 31 * SIZE(BO), %xmm15 + mulsd %xmm5, %xmm15 + subsd %xmm15, %xmm7 + + movsd 38 * SIZE(BO), %xmm14 + mulsd %xmm14, %xmm6 + movsd 39 * SIZE(BO), %xmm15 + mulsd %xmm6, %xmm15 + subsd %xmm15, %xmm7 + + movsd 47 * SIZE(BO), %xmm15 + mulsd %xmm15, %xmm7 +#endif + +#ifdef RT + movsd 47 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm7 + movsd 46 * SIZE(BO), %xmm9 + mulsd %xmm7, %xmm9 + subsd %xmm9, %xmm6 + movsd 45 * SIZE(BO), %xmm10 + mulsd %xmm7, %xmm10 + subsd %xmm10, %xmm5 + movsd 44 * SIZE(BO), %xmm11 + mulsd %xmm7, %xmm11 + subsd %xmm11, %xmm4 + movsd 43 * SIZE(BO), %xmm12 + mulsd %xmm7, %xmm12 + subsd %xmm12, %xmm3 + movsd 42 * SIZE(BO), %xmm13 + mulsd %xmm7, %xmm13 + subsd %xmm13, %xmm2 + movsd 41 * SIZE(BO), %xmm14 + mulsd %xmm7, %xmm14 + subsd %xmm14, %xmm1 + movsd 40 * 
SIZE(BO), %xmm15 + mulsd %xmm7, %xmm15 + subsd %xmm15, %xmm0 + + movsd 38 * SIZE(BO), %xmm9 + mulsd %xmm9, %xmm6 + movsd 37 * SIZE(BO), %xmm10 + mulsd %xmm6, %xmm10 + subsd %xmm10, %xmm5 + movsd 36 * SIZE(BO), %xmm11 + mulsd %xmm6, %xmm11 + subsd %xmm11, %xmm4 + movsd 35 * SIZE(BO), %xmm12 + mulsd %xmm6, %xmm12 + subsd %xmm12, %xmm3 + movsd 34 * SIZE(BO), %xmm13 + mulsd %xmm6, %xmm13 + subsd %xmm13, %xmm2 + movsd 33 * SIZE(BO), %xmm14 + mulsd %xmm6, %xmm14 + subsd %xmm14, %xmm1 + movsd 32 * SIZE(BO), %xmm15 + mulsd %xmm6, %xmm15 + subsd %xmm15, %xmm0 + + movsd 29 * SIZE(BO), %xmm10 + mulsd %xmm10, %xmm5 + movsd 28 * SIZE(BO), %xmm11 + mulsd %xmm5, %xmm11 + subsd %xmm11, %xmm4 + movsd 27 * SIZE(BO), %xmm12 + mulsd %xmm5, %xmm12 + subsd %xmm12, %xmm3 + movsd 26 * SIZE(BO), %xmm13 + mulsd %xmm5, %xmm13 + subsd %xmm13, %xmm2 + movsd 25 * SIZE(BO), %xmm14 + mulsd %xmm5, %xmm14 + subsd %xmm14, %xmm1 + movsd 24 * SIZE(BO), %xmm15 + mulsd %xmm5, %xmm15 + subsd %xmm15, %xmm0 + + movsd 20 * SIZE(BO), %xmm11 + mulsd %xmm11, %xmm4 + movsd 19 * SIZE(BO), %xmm12 + mulsd %xmm4, %xmm12 + subsd %xmm12, %xmm3 + movsd 18 * SIZE(BO), %xmm13 + mulsd %xmm4, %xmm13 + subsd %xmm13, %xmm2 + movsd 17 * SIZE(BO), %xmm14 + mulsd %xmm4, %xmm14 + subsd %xmm14, %xmm1 + movsd 16 * SIZE(BO), %xmm15 + mulsd %xmm4, %xmm15 + subsd %xmm15, %xmm0 + + movsd 11 * SIZE(BO), %xmm12 + mulsd %xmm12, %xmm3 + movsd 10 * SIZE(BO), %xmm13 + mulsd %xmm3, %xmm13 + subsd %xmm13, %xmm2 + movsd 9 * SIZE(BO), %xmm14 + mulsd %xmm3, %xmm14 + subsd %xmm14, %xmm1 + movsd 8 * SIZE(BO), %xmm15 + mulsd %xmm3, %xmm15 + subsd %xmm15, %xmm0 + + movsd 2 * SIZE(BO), %xmm13 + mulsd %xmm13, %xmm2 + movsd 1 * SIZE(BO), %xmm14 + mulsd %xmm2, %xmm14 + subsd %xmm14, %xmm1 + movsd 0 * SIZE(BO), %xmm15 + mulsd %xmm2, %xmm15 + subsd %xmm15, %xmm0 + + movsd -7 * SIZE(BO), %xmm14 + mulsd %xmm14, %xmm1 + movsd -8 * SIZE(BO), %xmm15 + mulsd %xmm1, %xmm15 + subsd %xmm15, %xmm0 + + movsd -16 * SIZE(BO), %xmm15 + mulsd %xmm15, %xmm0 +#endif + +#if defined(RN) || defined(RT) + unpcklpd %xmm1, %xmm0 + movaps %xmm2, %xmm1 + unpcklpd %xmm3, %xmm1 + movaps %xmm4, %xmm2 + unpcklpd %xmm5, %xmm2 + movaps %xmm6, %xmm3 + unpcklpd %xmm7, %xmm3 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + leaq (LDC, LDC, 2), %rax + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 0 * SIZE(CO1, LDC, 1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 2) + movhps %xmm1, 0 * SIZE(CO1, %rax, 1) + movsd %xmm2, 0 * SIZE(CO2) + movhps %xmm2, 0 * SIZE(CO2, LDC, 1) + movsd %xmm3, 0 * SIZE(CO2, LDC, 2) + movhps %xmm3, 0 * SIZE(CO2, %rax, 1) + +#if defined(LN) || defined(LT) + movapd %xmm0, -16 * SIZE(BO) + movapd %xmm1, -14 * SIZE(BO) + movapd %xmm2, -12 * SIZE(BO) + movapd %xmm3, -10 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm1, -14 * SIZE(AO) + movapd %xmm2, -12 * SIZE(AO) + movapd %xmm3, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L20: + movq M, I + sarq $1, I + NOBRANCH + jle .L29 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 8), BO 
+#else + movq B, BO +#endif + + prefetcht0 -16 * SIZE(BB) + subq $-8 * SIZE, BB + + xorps %xmm1, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + leaq (LDC, LDC, 2), %rax + + xorps %xmm8, %xmm8 + prefetcht0 -2 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht0 -3 * SIZE(CO1, LDC, 1) + xorps %xmm10, %xmm10 + prefetcht0 -2 * SIZE(CO1, LDC, 2) + xorps %xmm11, %xmm11 + prefetcht0 -3 * SIZE(CO1, %rax, 1) + + xorps %xmm12, %xmm12 + prefetcht0 -2 * SIZE(CO2) + xorps %xmm13, %xmm13 + prefetcht0 -3 * SIZE(CO2, LDC, 1) + xorps %xmm14, %xmm14 + prefetcht0 -2 * SIZE(CO2, LDC, 2) + xorps %xmm15, %xmm15 + prefetcht0 -3 * SIZE(CO2, %rax, 1) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm1, %xmm12 + movaps -16 * SIZE(BO), %xmm6 + addpd %xmm2, %xmm13 + pshufd $0x4e, %xmm6, %xmm2 + mulpd %xmm0, %xmm6 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm14 + movaps -14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + addpd %xmm6, %xmm8 + movaps -12 * SIZE(BO), %xmm6 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm6, %xmm2 + mulpd %xmm0, %xmm6 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -10 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + addpd %xmm6, %xmm12 + movaps -8 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm13 + movaps -14 * SIZE(AO), %xmm5 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm5, %xmm1 + mulpd %xmm5, %xmm2 + + addpd %xmm3, %xmm14 + movaps -6 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm5, %xmm3 + mulpd %xmm5, %xmm4 + + addpd %xmm1, %xmm8 + movaps -4 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm5, %xmm1 + mulpd %xmm5, %xmm2 + + addpd %xmm3, %xmm10 + movaps -2 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm5, %xmm3 + mulpd %xmm5, %xmm4 + + addpd %xmm1, %xmm12 + movaps 0 * SIZE(BO), %xmm6 + addpd %xmm2, %xmm13 + pshufd $0x4e, %xmm6, %xmm2 + mulpd %xmm0, %xmm6 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm14 + movaps 2 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + addpd %xmm6, %xmm8 + movaps 4 * SIZE(BO), %xmm6 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm6, %xmm2 + mulpd %xmm0, %xmm6 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps 6 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + addpd %xmm6, %xmm12 + movaps 8 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm13 + movaps -10 * SIZE(AO), %xmm5 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm5, %xmm1 + mulpd %xmm5, %xmm2 + + addpd %xmm3, %xmm14 + movaps 10 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm5, %xmm3 + mulpd %xmm5, %xmm4 + + addpd %xmm1, %xmm8 + movaps 12 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm5, %xmm1 + mulpd %xmm5, %xmm2 + + addpd %xmm3, %xmm10 + movaps 14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm5, %xmm3 + mulpd %xmm5, %xmm4 + + addq $32 * SIZE, BO + subq $-8 * SIZE, AO + decq %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if 
(k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + addpd %xmm1, %xmm12 + movaps -16 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm13 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm14 + movaps -14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + addpd %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -10 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $8, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 8), BO +#endif + + addpd %xmm1, %xmm12 + addpd %xmm2, %xmm13 + addpd %xmm3, %xmm14 + addpd %xmm4, %xmm15 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm0 + shufpd $0, %xmm9, %xmm8 + shufpd $3, %xmm0, %xmm9 + + movaps %xmm10, %xmm0 + shufpd $0, %xmm11, %xmm10 + shufpd $3, %xmm0, %xmm11 + + movaps %xmm12, %xmm0 + shufpd $0, %xmm13, %xmm12 + shufpd $3, %xmm0, %xmm13 + + movaps %xmm14, %xmm0 + shufpd $0, %xmm15, %xmm14 + shufpd $3, %xmm0, %xmm15 + + movapd -16 * SIZE(BO), %xmm0 + movapd -14 * SIZE(BO), %xmm2 + movapd -12 * SIZE(BO), %xmm4 + movapd -10 * SIZE(BO), %xmm6 + movapd -8 * SIZE(BO), %xmm1 + movapd -6 * SIZE(BO), %xmm3 + movapd -4 * SIZE(BO), %xmm5 + movapd -2 * SIZE(BO), %xmm7 +#else + movaps %xmm8, %xmm0 + shufpd $2, %xmm9, %xmm8 + shufpd $2, %xmm0, %xmm9 + + movaps %xmm10, %xmm0 + shufpd $2, %xmm11, %xmm10 + shufpd $2, %xmm0, %xmm11 + + movaps %xmm12, %xmm0 + shufpd $2, %xmm13, %xmm12 + shufpd $2, %xmm0, %xmm13 + + movaps %xmm14, %xmm0 + shufpd $2, %xmm15, %xmm14 + shufpd $2, %xmm0, %xmm15 + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -12 * SIZE(AO), %xmm2 + movapd -10 * SIZE(AO), %xmm3 + + movapd -8 * SIZE(AO), %xmm4 + movapd -6 * SIZE(AO), %xmm5 + movapd -4 * SIZE(AO), %xmm6 + movapd -2 * SIZE(AO), %xmm7 +#endif + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm1 + subpd %xmm10, %xmm2 + subpd %xmm11, %xmm3 + subpd %xmm12, %xmm4 + subpd %xmm13, %xmm5 + subpd %xmm14, %xmm6 + subpd %xmm15, %xmm7 + +#ifdef LN + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 + mulpd %xmm8, %xmm5 + mulpd %xmm8, %xmm7 + + movddup -14 * SIZE(AO), %xmm12 + movapd %xmm12, %xmm13 + movapd %xmm12, %xmm14 + movapd %xmm12, %xmm15 + + mulpd %xmm1, %xmm12 + mulpd %xmm3, %xmm13 + mulpd %xmm5, %xmm14 + mulpd %xmm7, %xmm15 + + subpd %xmm12, %xmm0 + subpd %xmm13, %xmm2 + subpd %xmm14, %xmm4 + subpd %xmm15, %xmm6 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm4 + mulpd %xmm8, %xmm6 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm4 + mulpd %xmm8, %xmm6 + + movddup -15 * SIZE(AO), %xmm12 + movapd %xmm12, %xmm13 + movapd %xmm12, %xmm14 + movapd %xmm12, %xmm15 + + mulpd %xmm0, %xmm12 + mulpd %xmm2, %xmm13 + mulpd %xmm4, %xmm14 + mulpd %xmm6, %xmm15 + + subpd %xmm12, %xmm1 + subpd %xmm13, %xmm3 + subpd %xmm14, %xmm5 + subpd %xmm15, %xmm7 + + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 + mulpd %xmm8, %xmm5 + mulpd %xmm8, %xmm7 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd 
%xmm8, %xmm0 + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm1 + movddup -14 * SIZE(BO), %xmm10 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm2 + movddup -13 * SIZE(BO), %xmm11 + mulpd %xmm0, %xmm11 + subpd %xmm11, %xmm3 + movddup -12 * SIZE(BO), %xmm12 + mulpd %xmm0, %xmm12 + subpd %xmm12, %xmm4 + movddup -11 * SIZE(BO), %xmm13 + mulpd %xmm0, %xmm13 + subpd %xmm13, %xmm5 + movddup -10 * SIZE(BO), %xmm14 + mulpd %xmm0, %xmm14 + subpd %xmm14, %xmm6 + movddup -9 * SIZE(BO), %xmm15 + mulpd %xmm0, %xmm15 + subpd %xmm15, %xmm7 + + movddup -7 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm1 + movddup -6 * SIZE(BO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm2 + movddup -5 * SIZE(BO), %xmm11 + mulpd %xmm1, %xmm11 + subpd %xmm11, %xmm3 + movddup -4 * SIZE(BO), %xmm12 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm4 + movddup -3 * SIZE(BO), %xmm13 + mulpd %xmm1, %xmm13 + subpd %xmm13, %xmm5 + movddup -2 * SIZE(BO), %xmm14 + mulpd %xmm1, %xmm14 + subpd %xmm14, %xmm6 + movddup -1 * SIZE(BO), %xmm15 + mulpd %xmm1, %xmm15 + subpd %xmm15, %xmm7 + + movddup 2 * SIZE(BO), %xmm10 + mulpd %xmm10, %xmm2 + movddup 3 * SIZE(BO), %xmm11 + mulpd %xmm2, %xmm11 + subpd %xmm11, %xmm3 + movddup 4 * SIZE(BO), %xmm12 + mulpd %xmm2, %xmm12 + subpd %xmm12, %xmm4 + movddup 5 * SIZE(BO), %xmm13 + mulpd %xmm2, %xmm13 + subpd %xmm13, %xmm5 + movddup 6 * SIZE(BO), %xmm14 + mulpd %xmm2, %xmm14 + subpd %xmm14, %xmm6 + movddup 7 * SIZE(BO), %xmm15 + mulpd %xmm2, %xmm15 + subpd %xmm15, %xmm7 + + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm12 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm4 + movddup 13 * SIZE(BO), %xmm13 + mulpd %xmm3, %xmm13 + subpd %xmm13, %xmm5 + movddup 14 * SIZE(BO), %xmm14 + mulpd %xmm3, %xmm14 + subpd %xmm14, %xmm6 + movddup 15 * SIZE(BO), %xmm15 + mulpd %xmm3, %xmm15 + subpd %xmm15, %xmm7 + + movddup 20 * SIZE(BO), %xmm12 + mulpd %xmm12, %xmm4 + movddup 21 * SIZE(BO), %xmm13 + mulpd %xmm4, %xmm13 + subpd %xmm13, %xmm5 + movddup 22 * SIZE(BO), %xmm14 + mulpd %xmm4, %xmm14 + subpd %xmm14, %xmm6 + movddup 23 * SIZE(BO), %xmm15 + mulpd %xmm4, %xmm15 + subpd %xmm15, %xmm7 + + movddup 29 * SIZE(BO), %xmm13 + mulpd %xmm13, %xmm5 + movddup 30 * SIZE(BO), %xmm14 + mulpd %xmm5, %xmm14 + subpd %xmm14, %xmm6 + movddup 31 * SIZE(BO), %xmm15 + mulpd %xmm5, %xmm15 + subpd %xmm15, %xmm7 + + movddup 38 * SIZE(BO), %xmm14 + mulpd %xmm14, %xmm6 + movddup 39 * SIZE(BO), %xmm15 + mulpd %xmm6, %xmm15 + subpd %xmm15, %xmm7 + + movddup 47 * SIZE(BO), %xmm15 + mulpd %xmm15, %xmm7 +#endif + +#ifdef RT + movddup 47 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm7 + movddup 46 * SIZE(BO), %xmm9 + mulpd %xmm7, %xmm9 + subpd %xmm9, %xmm6 + movddup 45 * SIZE(BO), %xmm10 + mulpd %xmm7, %xmm10 + subpd %xmm10, %xmm5 + movddup 44 * SIZE(BO), %xmm11 + mulpd %xmm7, %xmm11 + subpd %xmm11, %xmm4 + movddup 43 * SIZE(BO), %xmm12 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm3 + movddup 42 * SIZE(BO), %xmm13 + mulpd %xmm7, %xmm13 + subpd %xmm13, %xmm2 + movddup 41 * SIZE(BO), %xmm14 + mulpd %xmm7, %xmm14 + subpd %xmm14, %xmm1 + movddup 40 * SIZE(BO), %xmm15 + mulpd %xmm7, %xmm15 + subpd %xmm15, %xmm0 + + movddup 38 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm6 + movddup 37 * SIZE(BO), %xmm10 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm5 + movddup 36 * SIZE(BO), %xmm11 + mulpd %xmm6, %xmm11 + subpd %xmm11, %xmm4 + movddup 35 * SIZE(BO), %xmm12 + mulpd %xmm6, %xmm12 + subpd %xmm12, %xmm3 + movddup 34 * SIZE(BO), %xmm13 + mulpd %xmm6, %xmm13 + subpd %xmm13, %xmm2 + movddup 33 * SIZE(BO), %xmm14 + mulpd %xmm6, %xmm14 + subpd 
%xmm14, %xmm1 + movddup 32 * SIZE(BO), %xmm15 + mulpd %xmm6, %xmm15 + subpd %xmm15, %xmm0 + + movddup 29 * SIZE(BO), %xmm10 + mulpd %xmm10, %xmm5 + movddup 28 * SIZE(BO), %xmm11 + mulpd %xmm5, %xmm11 + subpd %xmm11, %xmm4 + movddup 27 * SIZE(BO), %xmm12 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm3 + movddup 26 * SIZE(BO), %xmm13 + mulpd %xmm5, %xmm13 + subpd %xmm13, %xmm2 + movddup 25 * SIZE(BO), %xmm14 + mulpd %xmm5, %xmm14 + subpd %xmm14, %xmm1 + movddup 24 * SIZE(BO), %xmm15 + mulpd %xmm5, %xmm15 + subpd %xmm15, %xmm0 + + movddup 20 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm4 + movddup 19 * SIZE(BO), %xmm12 + mulpd %xmm4, %xmm12 + subpd %xmm12, %xmm3 + movddup 18 * SIZE(BO), %xmm13 + mulpd %xmm4, %xmm13 + subpd %xmm13, %xmm2 + movddup 17 * SIZE(BO), %xmm14 + mulpd %xmm4, %xmm14 + subpd %xmm14, %xmm1 + movddup 16 * SIZE(BO), %xmm15 + mulpd %xmm4, %xmm15 + subpd %xmm15, %xmm0 + + movddup 11 * SIZE(BO), %xmm12 + mulpd %xmm12, %xmm3 + movddup 10 * SIZE(BO), %xmm13 + mulpd %xmm3, %xmm13 + subpd %xmm13, %xmm2 + movddup 9 * SIZE(BO), %xmm14 + mulpd %xmm3, %xmm14 + subpd %xmm14, %xmm1 + movddup 8 * SIZE(BO), %xmm15 + mulpd %xmm3, %xmm15 + subpd %xmm15, %xmm0 + + movddup 2 * SIZE(BO), %xmm13 + mulpd %xmm13, %xmm2 + movddup 1 * SIZE(BO), %xmm14 + mulpd %xmm2, %xmm14 + subpd %xmm14, %xmm1 + movddup 0 * SIZE(BO), %xmm15 + mulpd %xmm2, %xmm15 + subpd %xmm15, %xmm0 + + movddup -7 * SIZE(BO), %xmm14 + mulpd %xmm14, %xmm1 + movddup -8 * SIZE(BO), %xmm15 + mulpd %xmm1, %xmm15 + subpd %xmm15, %xmm0 + + movddup -16 * SIZE(BO), %xmm15 + mulpd %xmm15, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, -16 * SIZE(BO) + movapd %xmm2, -14 * SIZE(BO) + movapd %xmm4, -12 * SIZE(BO) + movapd %xmm6, -10 * SIZE(BO) + movapd %xmm1, -8 * SIZE(BO) + movapd %xmm3, -6 * SIZE(BO) + movapd %xmm5, -4 * SIZE(BO) + movapd %xmm7, -2 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm1, -14 * SIZE(AO) + movapd %xmm2, -12 * SIZE(AO) + movapd %xmm3, -10 * SIZE(AO) + movapd %xmm4, -8 * SIZE(AO) + movapd %xmm5 , -6 * SIZE(AO) + movapd %xmm6, -4 * SIZE(AO) + movapd %xmm7, -2 * SIZE(AO) +#endif + + leaq (LDC, LDC, 2), %rax + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 1 * SIZE(CO1) + movhps %xmm0, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 1 * SIZE(CO1, LDC, 1) + + movsd %xmm2, 0 * SIZE(CO1, LDC, 2) + movsd %xmm3, 1 * SIZE(CO1, LDC, 2) + movhps %xmm2, 0 * SIZE(CO1, %rax, 1) + movhps %xmm3, 1 * SIZE(CO1, %rax, 1) + + movsd %xmm4, 0 * SIZE(CO2) + movsd %xmm5, 1 * SIZE(CO2) + movhps %xmm4, 0 * SIZE(CO2, LDC, 1) + movhps %xmm5, 1 * SIZE(CO2, LDC, 1) + + movsd %xmm6, 0 * SIZE(CO2, LDC, 2) + movsd %xmm7, 1 * SIZE(CO2, LDC, 2) + movhps %xmm6, 0 * SIZE(CO2, %rax, 1) + movhps %xmm7, 1 * SIZE(CO2, %rax, 1) +#else + movups %xmm0, 0 * SIZE(CO1) + movups %xmm1, 0 * SIZE(CO1, LDC, 1) + movups %xmm2, 0 * SIZE(CO1, LDC, 2) + movups %xmm3, 0 * SIZE(CO1, %rax, 1) + movups %xmm4, 0 * SIZE(CO2) + movups %xmm5, 0 * SIZE(CO2, LDC, 1) + movups %xmm6, 0 * SIZE(CO2, LDC, 2) + movups %xmm7, 0 * SIZE(CO2, %rax, 1) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L11 + ALIGN_4 + +.L29: +#ifdef LN + 
leaq (, K, SIZE), %rax + leaq (B, %rax, 8), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $8, KK +#endif + +#ifdef RT + subq $8, KK +#endif + + subq $1, J + BRANCH + jg .L01 + ALIGN_4 + +.L30: + testq $4, N + jle .L50 + ALIGN_4 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 2), CO2 +#ifndef RT + leaq (C, LDC, 4), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, M + BRANCH + jle .L40 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L45 + ALIGN_3 + +.L42: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movaps -12 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps -10 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -14 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps -8 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -6 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -13 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movaps -4 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps -2 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -12 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps 0 * SIZE(BO), %xmm1 + + subq $ -4 * SIZE, AO + subq $-16 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L42 + ALIGN_3 + +.L45: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movaps -12 * SIZE(BO), %xmm1 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L46 + ALIGN_4 + +.L48: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#endif + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm0 + movapd -14 * SIZE(BO), %xmm1 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 +#endif + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm1 + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 +#endif + +#if defined(RN) || defined(RT) + pshufd $0xe, %xmm1, %xmm3 + movaps %xmm1, %xmm2 + pshufd $0xe, %xmm0, %xmm1 +#endif + +#ifdef RN + movsd -16 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm0 + movsd -15 * SIZE(BO), %xmm9 + mulsd %xmm0, %xmm9 + subsd %xmm9, %xmm1 + movsd -14 * SIZE(BO), %xmm10 + mulsd %xmm0, %xmm10 + subsd %xmm10, %xmm2 + movsd -13 * SIZE(BO), %xmm11 + mulsd %xmm0, %xmm11 + subsd %xmm11, %xmm3 + + movsd -11 * 
SIZE(BO), %xmm9 + mulsd %xmm9, %xmm1 + movsd -10 * SIZE(BO), %xmm10 + mulsd %xmm1, %xmm10 + subsd %xmm10, %xmm2 + movsd -9 * SIZE(BO), %xmm11 + mulsd %xmm1, %xmm11 + subsd %xmm11, %xmm3 + + movsd -6 * SIZE(BO), %xmm10 + mulsd %xmm10, %xmm2 + movsd -5 * SIZE(BO), %xmm11 + mulsd %xmm2, %xmm11 + subsd %xmm11, %xmm3 + + movsd -1 * SIZE(BO), %xmm11 + mulsd %xmm11, %xmm3 +#endif + +#ifdef RT + movsd -1 * SIZE(BO), %xmm12 + mulsd %xmm12, %xmm3 + movsd -2 * SIZE(BO), %xmm13 + mulsd %xmm3, %xmm13 + subsd %xmm13, %xmm2 + movsd -3 * SIZE(BO), %xmm14 + mulsd %xmm3, %xmm14 + subsd %xmm14, %xmm1 + movsd -4 * SIZE(BO), %xmm15 + mulsd %xmm3, %xmm15 + subsd %xmm15, %xmm0 + + movsd -6 * SIZE(BO), %xmm13 + mulsd %xmm13, %xmm2 + movsd -7 * SIZE(BO), %xmm14 + mulsd %xmm2, %xmm14 + subsd %xmm14, %xmm1 + movsd -8 * SIZE(BO), %xmm15 + mulsd %xmm2, %xmm15 + subsd %xmm15, %xmm0 + + movsd -11 * SIZE(BO), %xmm14 + mulsd %xmm14, %xmm1 + movsd -12 * SIZE(BO), %xmm15 + mulsd %xmm1, %xmm15 + subsd %xmm15, %xmm0 + + movsd -16 * SIZE(BO), %xmm15 + mulsd %xmm15, %xmm0 +#endif + +#if defined(RN) || defined(RT) + unpcklpd %xmm1, %xmm0 + movaps %xmm2, %xmm1 + unpcklpd %xmm3, %xmm1 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 0 * SIZE(CO1, LDC, 1) + movsd %xmm1, 0 * SIZE(CO2) + movhps %xmm1, 0 * SIZE(CO2, LDC, 1) + +#if defined(LN) || defined(LT) + movapd %xmm0, -16 * SIZE(BO) + movapd %xmm1, -14 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm1, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L40: + movq M, I + sarq $1, I + NOBRANCH + jle .L49 + ALIGN_4 + +.L31: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht0 2 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht0 2 * SIZE(CO1, LDC, 1) + xorps %xmm10, %xmm10 + prefetcht0 2 * SIZE(CO2) + xorps %xmm11, %xmm11 + prefetcht0 2 * SIZE(CO2, LDC, 1) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L35 + ALIGN_3 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -10 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -12 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm8 + movaps -8 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + 
mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -6 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -10 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm8 + movaps -4 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -2 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -8 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, AO + subq $-16 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L32 + ALIGN_3 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_3 + +.L36: + addpd %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + + addpd %xmm1, %xmm8 + addpd %xmm2, %xmm9 + addpd %xmm3, %xmm10 + addpd %xmm4, %xmm11 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm0 + shufpd $0, %xmm9, %xmm8 + shufpd $3, %xmm0, %xmm9 + + movaps %xmm10, %xmm0 + shufpd $0, %xmm11, %xmm10 + shufpd $3, %xmm0, %xmm11 + + movapd -16 * SIZE(BO), %xmm0 + movapd -14 * SIZE(BO), %xmm2 + movapd -12 * SIZE(BO), %xmm1 + movapd -10 * SIZE(BO), %xmm3 +#else + movaps %xmm8, %xmm0 + shufpd $2, %xmm9, %xmm8 + shufpd $2, %xmm0, %xmm9 + + movaps %xmm10, %xmm0 + shufpd $2, %xmm11, %xmm10 + shufpd $2, %xmm0, %xmm11 + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -12 * SIZE(AO), %xmm2 + movapd -10 * SIZE(AO), %xmm3 +#endif + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm1 + subpd %xmm10, %xmm2 + subpd %xmm11, %xmm3 + +#ifdef LN + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 + + movddup -14 * SIZE(AO), %xmm12 + movapd %xmm12, %xmm13 + + mulpd %xmm1, %xmm12 + mulpd %xmm3, %xmm13 + + subpd %xmm12, %xmm0 + subpd %xmm13, %xmm2 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm2 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm2 + + movddup -15 * SIZE(AO), %xmm12 + movapd %xmm12, %xmm13 + + mulpd %xmm0, %xmm12 + mulpd %xmm2, %xmm13 + + subpd %xmm12, %xmm1 + subpd %xmm13, %xmm3 + + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm1 + movddup -14 * SIZE(BO), %xmm10 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm2 + movddup -13 * SIZE(BO), %xmm11 + mulpd %xmm0, %xmm11 + subpd %xmm11, %xmm3 + + movddup -11 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm1 + movddup -10 * SIZE(BO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm2 + movddup -9 * SIZE(BO), %xmm11 + mulpd %xmm1, %xmm11 + subpd %xmm11, %xmm3 + + movddup -6 * SIZE(BO), %xmm10 + mulpd %xmm10, %xmm2 + movddup -5 * SIZE(BO), %xmm11 + mulpd %xmm2, %xmm11 + subpd %xmm11, %xmm3 + + movddup -1 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm3 +#endif + 
+#ifdef RT + movddup -1 * SIZE(BO), %xmm12 + mulpd %xmm12, %xmm3 + movddup -2 * SIZE(BO), %xmm13 + mulpd %xmm3, %xmm13 + subpd %xmm13, %xmm2 + movddup -3 * SIZE(BO), %xmm14 + mulpd %xmm3, %xmm14 + subpd %xmm14, %xmm1 + movddup -4 * SIZE(BO), %xmm15 + mulpd %xmm3, %xmm15 + subpd %xmm15, %xmm0 + + movddup -6 * SIZE(BO), %xmm13 + mulpd %xmm13, %xmm2 + movddup -7 * SIZE(BO), %xmm14 + mulpd %xmm2, %xmm14 + subpd %xmm14, %xmm1 + movddup -8 * SIZE(BO), %xmm15 + mulpd %xmm2, %xmm15 + subpd %xmm15, %xmm0 + + movddup -11 * SIZE(BO), %xmm14 + mulpd %xmm14, %xmm1 + movddup -12 * SIZE(BO), %xmm15 + mulpd %xmm1, %xmm15 + subpd %xmm15, %xmm0 + + movddup -16 * SIZE(BO), %xmm15 + mulpd %xmm15, %xmm0 +#endif + + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + + leaq (LDC, LDC, 2), %rax + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 1 * SIZE(CO1) + movhps %xmm0, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 1 * SIZE(CO1, LDC, 1) + + movsd %xmm2, 0 * SIZE(CO2) + movsd %xmm3, 1 * SIZE(CO2) + movhps %xmm2, 0 * SIZE(CO2, LDC, 1) + movhps %xmm3, 1 * SIZE(CO2, LDC, 1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 1 * SIZE(CO1, LDC, 1) + + movsd %xmm2, 0 * SIZE(CO2) + movhps %xmm2, 1 * SIZE(CO2) + movsd %xmm3, 0 * SIZE(CO2, LDC, 1) + movhps %xmm3, 1 * SIZE(CO2, LDC, 1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, -16 * SIZE(BO) + movapd %xmm2, -14 * SIZE(BO) + movapd %xmm1, -12 * SIZE(BO) + movapd %xmm3, -10 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm1, -14 * SIZE(AO) + movapd %xmm2, -12 * SIZE(AO) + movapd %xmm3, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L31 + ALIGN_4 + +.L49: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + ALIGN_4 + +.L50: + testq $2, N + jle .L70 + ALIGN_4 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, M + BRANCH + jle .L60 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + xorps %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L65 + ALIGN_3 + +.L62: + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + movddup -14 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movaps -12 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + 
movddup -13 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm8 + movaps -10 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + movddup -12 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movaps -8 * SIZE(BO), %xmm1 + + subq $-4 * SIZE, AO + subq $-8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L62 + ALIGN_3 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_3 + +.L66: + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + + addq $1 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L66 + ALIGN_4 + +.L68: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + + addpd %xmm9, %xmm8 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm0 +#else + movapd -16 * SIZE(AO), %xmm0 +#endif + + subpd %xmm8, %xmm0 + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm0 +#endif + +#if defined(RN) || defined(RT) + pshufd $0xe, %xmm0, %xmm1 +#endif + +#ifdef RN + movsd -16 * SIZE(BO), %xmm10 + mulsd %xmm10, %xmm0 + movsd -15 * SIZE(BO), %xmm11 + mulsd %xmm0, %xmm11 + subsd %xmm11, %xmm1 + + movsd -13 * SIZE(BO), %xmm11 + mulsd %xmm11, %xmm1 +#endif + +#ifdef RT + movsd -13 * SIZE(BO), %xmm14 + mulsd %xmm14, %xmm1 + movsd -14 * SIZE(BO), %xmm15 + mulsd %xmm1, %xmm15 + subsd %xmm15, %xmm0 + + movsd -16 * SIZE(BO), %xmm15 + mulsd %xmm15, %xmm0 +#endif + +#if defined(RN) || defined(RT) + unpcklpd %xmm1, %xmm0 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 0 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movapd %xmm0, -16 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L60: + movq M, I + sarq $1, I + NOBRANCH + jle .L69 + ALIGN_4 + +.L51: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + + xorps %xmm8, %xmm8 + prefetcht0 2 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht0 2 * SIZE(CO2) + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L55 + ALIGN_3 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm10 + movaps -14 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm11 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AO), %xmm0 + 
+ addpd %xmm1, %xmm10 + movaps -10 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm11 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, AO + subq $-8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L52 + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + ALIGN_3 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_3 + +.L56: + addpd %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L56 + ALIGN_4 + +.L58: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + + addpd %xmm1, %xmm8 + addpd %xmm2, %xmm9 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm0 + shufpd $0, %xmm9, %xmm8 + shufpd $3, %xmm0, %xmm9 + + movapd -16 * SIZE(BO), %xmm0 + movapd -14 * SIZE(BO), %xmm1 +#else + movaps %xmm8, %xmm0 + shufpd $2, %xmm9, %xmm8 + shufpd $2, %xmm0, %xmm9 + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 +#endif + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm1 + +#ifdef LN + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + movddup -14 * SIZE(AO), %xmm12 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm0 + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm0 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm0 + movddup -15 * SIZE(AO), %xmm12 + mulpd %xmm0, %xmm12 + subpd %xmm12, %xmm1 + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm10 + mulpd %xmm10, %xmm0 + movddup -15 * SIZE(BO), %xmm11 + mulpd %xmm0, %xmm11 + subpd %xmm11, %xmm1 + + movddup -13 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm1 +#endif + +#ifdef RT + movddup -13 * SIZE(BO), %xmm14 + mulpd %xmm14, %xmm1 + movddup -14 * SIZE(BO), %xmm15 + mulpd %xmm1, %xmm15 + subpd %xmm15, %xmm0 + + movddup -16 * SIZE(BO), %xmm15 + mulpd %xmm15, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 1 * SIZE(CO1) + movhps %xmm0, 0 * SIZE(CO2) + movhps %xmm1, 1 * SIZE(CO2) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO2) + movhps %xmm1, 1 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, -16 * SIZE(BO) + movapd %xmm1, -14 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm1, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L51 + ALIGN_4 + +.L69: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L70: + testq $1, N + jle .L999 + ALIGN_4 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, B + + 
subq LDC, C +#endif + + movq C, CO1 +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, M + BRANCH + jle .L80 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movsd -16 * SIZE(AO), %xmm0 + movhps -15 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movsd -16 * SIZE(BO), %xmm1 + movhps -15 * SIZE(BO), %xmm1 + xorps %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L85 + ALIGN_3 + +.L82: + mulpd %xmm0, %xmm1 + movsd -14 * SIZE(AO), %xmm0 + movhps -13 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm8 + movsd -14 * SIZE(BO), %xmm1 + movhps -13 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + movsd -12 * SIZE(AO), %xmm0 + movhps -11 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movsd -12 * SIZE(BO), %xmm1 + movhps -11 * SIZE(BO), %xmm1 + + subq $-4 * SIZE, AO + subq $-4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L82 + + addpd %xmm9, %xmm8 + ALIGN_3 + +.L85: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L88 + ALIGN_3 + +.L86: + mulsd %xmm0, %xmm1 + movsd -15 * SIZE(AO), %xmm0 + addsd %xmm1, %xmm8 + movsd -15 * SIZE(BO), %xmm1 + + addq $1 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L86 + ALIGN_4 + +.L88: +#if defined(LN) || defined(RT) + movq KK, %rax + subq $1, %rax + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + + haddpd %xmm8, %xmm8 + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(BO), %xmm0 +#else + movsd -16 * SIZE(AO), %xmm0 +#endif + + subsd %xmm8, %xmm0 + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm0 +#endif + +#if defined(RN) || defined(RT) + movsd -16 * SIZE(BO), %xmm10 + mulsd %xmm10, %xmm0 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movsd %xmm0, -16 * SIZE(BO) +#else + movsd %xmm0, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + + +.L80: + movq M, I + sarq $1, I + NOBRANCH + jle .L89 + ALIGN_4 + +.L71: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + + xorps %xmm8, %xmm8 + prefetcht0 2 * SIZE(CO1) + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L75 + ALIGN_3 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm1, %xmm8 + movddup -16 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm9 + movddup -15 * 
SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movaps -12 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm8 + movddup -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movaps -10 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm9 + movddup -13 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movaps -8 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, AO + subq $-4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L72 + + addpd %xmm9, %xmm8 + ALIGN_3 + +.L75: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_3 + +.L76: + addpd %xmm1, %xmm8 + movddup -16 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L76 + ALIGN_4 + +.L78: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + + addpd %xmm1, %xmm8 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm0 +#else + movapd -16 * SIZE(AO), %xmm0 +#endif + + subpd %xmm8, %xmm0 + +#if defined(LN) || defined(LT) + pshufd $0xe, %xmm0, %xmm1 +#endif + +#ifdef LN + movsd -13 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm1 + movsd -14 * SIZE(AO), %xmm12 + mulsd %xmm1, %xmm12 + subsd %xmm12, %xmm0 + movsd -16 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm0 +#endif + +#ifdef LT + movsd -16 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm0 + movsd -15 * SIZE(AO), %xmm12 + mulsd %xmm0, %xmm12 + subsd %xmm12, %xmm1 + movsd -13 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm1 +#endif + +#if defined(LN) || defined(LT) + unpcklpd %xmm1, %xmm0 +#endif + +#if defined(RN) || defined(RT) + movddup -16 * SIZE(BO), %xmm10 + mulpd %xmm10, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm0, -16 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L71 + ALIGN_4 + +.L89: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 1), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/trsm_kernel_LN_4x2_atom.S b/kernel/x86_64/trsm_kernel_LN_4x2_atom.S new file mode 100644 index 0000000000..6ba2fc4bdb --- /dev/null +++ b/kernel/x86_64/trsm_kernel_LN_4x2_atom.S @@ -0,0 +1,2116 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %rdi +#define N %rsi +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %r13 +#define BO %r14 +#define CO1 %r15 +#define CO2 %rbx +#define KK %rbp +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define J 56(%rsp) +#define KKK 64(%rsp) +#define AORIG 72(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define J 232(%rsp) +#define KKK 240(%rsp) +#define AORIG 248(%rsp) + +#endif + +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 8 + 3) + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, M + movq ARG2, N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C +#endif + + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + movq KK, OFFSET + + leaq (, LDC, SIZE), LDC + +#ifdef LN + leaq (, M, SIZE), %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + leaq (, N, SIZE), %rax + imulq K, %rax + addq %rax, B + movq N, %rax + 
imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + movq N, J + sarq $1, J + jle .L40 + ALIGN_4 + +.L10: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + movq K, %rax + salq $BASE_SHIFT + 1, %rax + leaq (B, %rax), BB + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, M + je .L20 + +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm7, %xmm7 + movsd 1 * SIZE(AO), %xmm2 + xorps %xmm5, %xmm5 + + movsd 0 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + movsd 1 * SIZE(BO), %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L35 + ALIGN_4 + +.L32: + addsd %xmm5, %xmm8 + movsd 2 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addsd %xmm7, %xmm9 + movsd 3 * SIZE(BO), %xmm7 + mulsd %xmm0, %xmm3 + movsd 2 * SIZE(AO), %xmm0 + + addsd %xmm1, %xmm8 + movsd 4 * SIZE(BO), %xmm1 + mulsd %xmm2, %xmm5 + + addsd %xmm3, %xmm9 + movsd 5 * SIZE(BO), %xmm3 + mulsd %xmm2, %xmm7 + movsd 3 * SIZE(AO), %xmm2 + + addsd %xmm5, %xmm8 + movsd 6 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm1 + + addsd %xmm7, %xmm9 + movsd 7 * SIZE(BO), %xmm7 + mulsd %xmm0, %xmm3 + movsd 4 * SIZE(AO), %xmm0 + + addsd %xmm1, %xmm8 + movsd 8 * SIZE(BO), %xmm1 + mulsd %xmm2, %xmm5 + + addsd %xmm3, %xmm9 + movsd 9 * SIZE(BO), %xmm3 + mulsd %xmm2, %xmm7 + movsd 5 * SIZE(AO), %xmm2 + + addq $4 * SIZE, AO + addq $8 * SIZE, BO + + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + addsd %xmm5, %xmm8 + addsd %xmm7, %xmm9 + + andq $3, %rax + BRANCH + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulsd %xmm0, %xmm1 + addq $2 * SIZE, BO + mulsd %xmm0, %xmm3 + movsd 1 * SIZE(AO), %xmm0 + + addsd %xmm1, %xmm8 + movsd 0 * SIZE(BO), %xmm1 + addsd %xmm3, %xmm9 + movsd 1 * SIZE(BO), %xmm3 + + addq $1 * SIZE, AO + decq %rax + BRANCH + jg .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BO), %xmm0 + movsd 1 * SIZE(BO), %xmm1 + + subsd %xmm8, %xmm0 + subsd %xmm9, %xmm1 +#else + movsd 0 * SIZE(AO), %xmm0 + movsd 1 * SIZE(AO), %xmm1 + + subsd %xmm8, %xmm0 + subsd %xmm9, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm0 + mulsd %xmm8, %xmm1 +#endif + +#ifdef RN + movsd 0 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm0 + movsd 1 * SIZE(BO), %xmm9 + mulsd %xmm0, %xmm9 + movsd 3 * SIZE(BO), %xmm13 + subsd %xmm9, %xmm1 + mulsd %xmm13, %xmm1 +#endif + +#ifdef RT + movsd 3 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm1 + movsd 2 * SIZE(BO), %xmm9 + mulsd %xmm1, %xmm9 + movsd 0 * SIZE(BO), %xmm13 + subsd %xmm9, %xmm0 + mulsd %xmm13, %xmm0 +#endif + +#ifdef LN + subq $1 
* SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm1, 1 * SIZE(BO) +#else + movsd %xmm0, 0 * SIZE(AO) + movsd %xmm1, 1 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L20: + testq $2, M + BRANCH + je .L30 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd 1 * SIZE(AO), %xmm4 + xorps %xmm5, %xmm5 + movsd 2 * SIZE(AO), %xmm5 + xorps %xmm6, %xmm6 + movsd 3 * SIZE(AO), %xmm7 + + movsd 0 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + movsd 1 * SIZE(BO), %xmm3 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addsd %xmm2, %xmm9 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm6, %xmm11 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + movsd 2 * SIZE(BO), %xmm1 + + addsd %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm2 + + addsd %xmm4, %xmm10 + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 4 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm8 + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm2 + + addsd %xmm7, %xmm10 + movsd 7 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm6 + movsd 5 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm9 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm6, %xmm11 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + movsd 6 * SIZE(BO), %xmm1 + + addsd %xmm0, %xmm8 + movsd 8 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm2 + + addsd %xmm4, %xmm10 + movsd 9 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm6 + movsd 7 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 8 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm8 + movsd 10 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm2 + + addsd %xmm7, %xmm10 + movsd 11 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm6 + movsd 9 * SIZE(BO), %xmm3 + + addq $8 * SIZE, AO + addq $8 * SIZE, BO + + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + je .L29 + ALIGN_4 + +.L26: + addsd %xmm2, %xmm9 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm6, %xmm11 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + movsd 2 * SIZE(BO), %xmm1 + + mulsd %xmm3, %xmm2 + addsd %xmm0, %xmm8 + movsd 2 * SIZE(AO), %xmm0 + + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + addsd %xmm4, %xmm10 + movsd 3 * SIZE(AO), %xmm4 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + decq %rax + BRANCH + jg .L26 + ALIGN_4 + +.L29: + addsd %xmm2, %xmm9 + addsd %xmm6, %xmm11 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq 
$2, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BO), %xmm0 + movsd 1 * SIZE(BO), %xmm1 + movsd 2 * SIZE(BO), %xmm2 + movsd 3 * SIZE(BO), %xmm3 + + subsd %xmm8, %xmm0 + subsd %xmm9, %xmm1 + subsd %xmm10, %xmm2 + subsd %xmm11, %xmm3 +#else + movsd 0 * SIZE(AO), %xmm0 + movsd 1 * SIZE(AO), %xmm2 + movsd 2 * SIZE(AO), %xmm1 + movsd 3 * SIZE(AO), %xmm3 + + subsd %xmm8, %xmm0 + subsd %xmm10, %xmm2 + subsd %xmm9, %xmm1 + subsd %xmm11, %xmm3 +#endif + +#ifdef LN + movsd 3 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm2 + movsd 2 * SIZE(AO), %xmm9 + mulsd %xmm8, %xmm3 + movsd 0 * SIZE(AO), %xmm13 + + movaps %xmm9, %xmm10 + mulsd %xmm2, %xmm9 + mulsd %xmm3, %xmm10 + + subsd %xmm9, %xmm0 + subsd %xmm10, %xmm1 + + mulsd %xmm13, %xmm0 + mulsd %xmm13, %xmm1 +#endif + +#ifdef LT + movsd 0 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm0 + movsd 1 * SIZE(AO), %xmm9 + mulsd %xmm8, %xmm1 + movsd 3 * SIZE(AO), %xmm13 + + movaps %xmm9, %xmm10 + mulsd %xmm0, %xmm9 + mulsd %xmm1, %xmm10 + + subsd %xmm9, %xmm2 + subsd %xmm10, %xmm3 + + mulsd %xmm13, %xmm2 + mulsd %xmm13, %xmm3 +#endif + +#ifdef RN + movsd 0 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm0 + movsd 1 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm2 + movsd 3 * SIZE(BO), %xmm13 + + movaps %xmm9, %xmm10 + mulsd %xmm0, %xmm9 + mulsd %xmm2, %xmm10 + + subsd %xmm9, %xmm1 + subsd %xmm10, %xmm3 + + mulsd %xmm13, %xmm1 + mulsd %xmm13, %xmm3 +#endif + +#ifdef RT + movsd 3 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm1 + movsd 2 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm3 + movsd 0 * SIZE(BO), %xmm13 + + movaps %xmm9, %xmm10 + mulsd %xmm1, %xmm9 + mulsd %xmm3, %xmm10 + + subsd %xmm9, %xmm0 + subsd %xmm10, %xmm2 + + mulsd %xmm13, %xmm0 + mulsd %xmm13, %xmm2 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm2, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO2) + movsd %xmm3, 1 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm1, 1 * SIZE(BO) + movsd %xmm2, 2 * SIZE(BO) + movsd %xmm3, 3 * SIZE(BO) +#else + movsd %xmm0, 0 * SIZE(AO) + movsd %xmm2, 1 * SIZE(AO) + movsd %xmm1, 2 * SIZE(AO) + movsd %xmm3, 3 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + movq M, I + sarq $2, I + jle .L39 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + prefetcht0 0 * SIZE(BB) + subq $-8 * SIZE, BB + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd 1 * SIZE(AO), %xmm4 + xorps %xmm5, %xmm5 + movsd 2 * SIZE(AO), %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movsd 0 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + movsd 1 * SIZE(BO), %xmm3 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + + prefetcht0 3 * SIZE(CO1) + xorps %xmm12, %xmm12 + xorps %xmm13, %xmm13 + prefetcht0 3 * SIZE(CO2) + xorps %xmm14, %xmm14 + xorps %xmm15, %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq 
K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L15 + ALIGN_4 + +.L12: + addsd %xmm2, %xmm13 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm7, %xmm14 + movsd 3 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm15 + PREFETCH (PREFETCHSIZE + 0) * SIZE(BO) + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + addsd %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm4, %xmm10 + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 2 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm12 + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm13 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm7, %xmm14 + movsd 7 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + addsd %xmm0, %xmm8 + movsd 8 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm4, %xmm10 + movsd 9 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 4 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm12 + movsd 10 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 5 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm13 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm7, %xmm14 + movsd 11 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + addsd %xmm0, %xmm8 + movsd 12 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm4, %xmm10 + movsd 13 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 6 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm12 + movsd 14 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 7 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm13 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm7, %xmm14 + movsd 15 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + subq $-16 * SIZE, AO + + addsd %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + addsd %xmm0, %xmm8 + movsd 0 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + addq $ 8 * SIZE, BO + + addsd %xmm4, %xmm10 + movsd 1 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + decq %rax + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 0 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm12 + movsd 2 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 1 * SIZE(BO), %xmm3 + + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + je .L19 + ALIGN_4 + +.L16: + addsd %xmm2, %xmm13 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm7, %xmm14 + movsd 3 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + addsd %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm4, %xmm10 + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 2 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm12 + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + 
decq %rax + BRANCH + jg .L16 + ALIGN_4 + +.L19: + addsd %xmm2, %xmm13 + addsd %xmm7, %xmm14 + addsd %xmm6, %xmm15 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BO), %xmm0 + movsd 1 * SIZE(BO), %xmm1 + movsd 2 * SIZE(BO), %xmm2 + movsd 3 * SIZE(BO), %xmm3 + movsd 4 * SIZE(BO), %xmm4 + movsd 5 * SIZE(BO), %xmm5 + movsd 6 * SIZE(BO), %xmm6 + movsd 7 * SIZE(BO), %xmm7 + + subsd %xmm8, %xmm0 + subsd %xmm9, %xmm1 + subsd %xmm10, %xmm2 + subsd %xmm11, %xmm3 + subsd %xmm12, %xmm4 + subsd %xmm13, %xmm5 + subsd %xmm14, %xmm6 + subsd %xmm15, %xmm7 +#else + movsd 0 * SIZE(AO), %xmm0 + movsd 1 * SIZE(AO), %xmm2 + movsd 2 * SIZE(AO), %xmm4 + movsd 3 * SIZE(AO), %xmm6 + + movsd 4 * SIZE(AO), %xmm1 + movsd 5 * SIZE(AO), %xmm3 + movsd 6 * SIZE(AO), %xmm5 + movsd 7 * SIZE(AO), %xmm7 + + subsd %xmm8, %xmm0 + subsd %xmm10, %xmm2 + subsd %xmm12, %xmm4 + subsd %xmm14, %xmm6 + subsd %xmm9, %xmm1 + subsd %xmm11, %xmm3 + subsd %xmm13, %xmm5 + subsd %xmm15, %xmm7 +#endif + +#ifdef LN + movsd 15 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm6 + movsd 14 * SIZE(AO), %xmm9 + mulsd %xmm8, %xmm7 + movsd 13 * SIZE(AO), %xmm11 + + movaps %xmm9, %xmm10 + movsd 12 * SIZE(AO), %xmm13 + mulsd %xmm6, %xmm9 + movsd 10 * SIZE(AO), %xmm8 + mulsd %xmm7, %xmm10 + subsd %xmm9, %xmm4 + movsd 9 * SIZE(AO), %xmm9 + subsd %xmm10, %xmm5 + + movaps %xmm11, %xmm12 + mulsd %xmm6, %xmm11 + mulsd %xmm7, %xmm12 + subsd %xmm11, %xmm2 + movsd 8 * SIZE(AO), %xmm11 + subsd %xmm12, %xmm3 + + movaps %xmm13, %xmm14 + mulsd %xmm6, %xmm13 + mulsd %xmm7, %xmm14 + subsd %xmm13, %xmm0 + subsd %xmm14, %xmm1 + + mulsd %xmm8, %xmm4 + mulsd %xmm8, %xmm5 + movsd 5 * SIZE(AO), %xmm8 + + movaps %xmm9, %xmm10 + mulsd %xmm4, %xmm9 + mulsd %xmm5, %xmm10 + subsd %xmm9, %xmm2 + movsd 4 * SIZE(AO), %xmm9 + subsd %xmm10, %xmm3 + + movaps %xmm11, %xmm12 + mulsd %xmm4, %xmm11 + mulsd %xmm5, %xmm12 + subsd %xmm11, %xmm0 + movsd 0 * SIZE(AO), %xmm11 + subsd %xmm12, %xmm1 + + mulsd %xmm8, %xmm2 + mulsd %xmm8, %xmm3 + + movaps %xmm9, %xmm10 + mulsd %xmm2, %xmm9 + mulsd %xmm3, %xmm10 + subsd %xmm9, %xmm0 + subsd %xmm10, %xmm1 + + mulsd %xmm11, %xmm0 + mulsd %xmm11, %xmm1 +#endif + +#ifdef LT + movsd 0 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm0 + movsd 1 * SIZE(AO), %xmm9 + mulsd %xmm8, %xmm1 + + movsd 2 * SIZE(AO), %xmm11 + movaps %xmm9, %xmm10 + movsd 3 * SIZE(AO), %xmm13 + mulsd %xmm0, %xmm9 + movsd 5 * SIZE(AO), %xmm8 + mulsd %xmm1, %xmm10 + subsd %xmm9, %xmm2 + movsd 6 * SIZE(AO), %xmm9 + subsd %xmm10, %xmm3 + + movaps %xmm11, %xmm12 + mulsd %xmm0, %xmm11 + mulsd %xmm1, %xmm12 + subsd %xmm11, %xmm4 + movsd 7 * SIZE(AO), %xmm11 + subsd %xmm12, %xmm5 + + movaps %xmm13, %xmm14 + mulsd %xmm0, %xmm13 + mulsd %xmm1, %xmm14 + subsd %xmm13, %xmm6 + subsd %xmm14, %xmm7 + + mulsd %xmm8, %xmm2 + mulsd %xmm8, %xmm3 + movsd 10 * SIZE(AO), %xmm8 + + movaps %xmm9, %xmm10 + mulsd %xmm2, %xmm9 + mulsd %xmm3, %xmm10 + subsd %xmm9, %xmm4 + movsd 11 * SIZE(AO), %xmm9 + subsd %xmm10, %xmm5 + + movaps %xmm11, %xmm12 + mulsd %xmm2, %xmm11 + mulsd %xmm3, %xmm12 + subsd %xmm11, %xmm6 + subsd %xmm12, %xmm7 + + mulsd %xmm8, %xmm4 + mulsd %xmm8, %xmm5 + movsd 15 * SIZE(AO), %xmm8 + + movaps %xmm9, %xmm10 + mulsd %xmm4, %xmm9 + mulsd %xmm5, %xmm10 + subsd %xmm9, %xmm6 + subsd %xmm10, %xmm7 + + mulsd %xmm8, %xmm6 + mulsd %xmm8, %xmm7 +#endif + +#ifdef RN + movsd 0 * SIZE(BO), %xmm8 + mulsd %xmm8, 
%xmm0 + movsd 1 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm2 + movsd 3 * SIZE(BO), %xmm13 + mulsd %xmm8, %xmm4 + mulsd %xmm8, %xmm6 + + movaps %xmm9, %xmm10 + movaps %xmm9, %xmm11 + movaps %xmm9, %xmm12 + + mulsd %xmm0, %xmm9 + mulsd %xmm2, %xmm10 + mulsd %xmm4, %xmm11 + mulsd %xmm6, %xmm12 + + subsd %xmm9, %xmm1 + subsd %xmm10, %xmm3 + subsd %xmm11, %xmm5 + subsd %xmm12, %xmm7 + + mulsd %xmm13, %xmm1 + mulsd %xmm13, %xmm3 + mulsd %xmm13, %xmm5 + mulsd %xmm13, %xmm7 +#endif + +#ifdef RT + movsd 3 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm1 + movsd 2 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm3 + movsd 0 * SIZE(BO), %xmm13 + mulsd %xmm8, %xmm5 + mulsd %xmm8, %xmm7 + + movaps %xmm9, %xmm10 + movaps %xmm9, %xmm11 + movaps %xmm9, %xmm12 + + mulsd %xmm1, %xmm9 + mulsd %xmm3, %xmm10 + mulsd %xmm5, %xmm11 + mulsd %xmm7, %xmm12 + + subsd %xmm9, %xmm0 + subsd %xmm10, %xmm2 + subsd %xmm11, %xmm4 + subsd %xmm12, %xmm6 + + mulsd %xmm13, %xmm0 + mulsd %xmm13, %xmm2 + mulsd %xmm13, %xmm4 + mulsd %xmm13, %xmm6 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm2, 1 * SIZE(CO1) + movsd %xmm4, 2 * SIZE(CO1) + movsd %xmm6, 3 * SIZE(CO1) + + movsd %xmm1, 0 * SIZE(CO2) + movsd %xmm3, 1 * SIZE(CO2) + movsd %xmm5, 2 * SIZE(CO2) + movsd %xmm7, 3 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm1, 1 * SIZE(BO) + movsd %xmm2, 2 * SIZE(BO) + movsd %xmm3, 3 * SIZE(BO) + movsd %xmm4, 4 * SIZE(BO) + movsd %xmm5, 5 * SIZE(BO) + movsd %xmm6, 6 * SIZE(BO) + movsd %xmm7, 7 * SIZE(BO) +#else + movsd %xmm0, 0 * SIZE(AO) + movsd %xmm2, 1 * SIZE(AO) + movsd %xmm4, 2 * SIZE(AO) + movsd %xmm6, 3 * SIZE(AO) + movsd %xmm1, 4 * SIZE(AO) + movsd %xmm3, 5 * SIZE(AO) + movsd %xmm5, 6 * SIZE(AO) + movsd %xmm7, 7 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L11 + ALIGN_4 + +.L39: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + + decq J # j -- + jg .L10 + ALIGN_4 + +.L40: + testq $1, N + je .L999 + ALIGN_4 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + + movq C, CO1 +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, M + je .L50 + +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm5, %xmm5 + movsd 1 * SIZE(AO), %xmm2 + xorps %xmm7, %xmm7 + + movsd 0 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + movsd 1 * SIZE(BO), %xmm3 + xorps %xmm9, %xmm9 + movsd 2 * SIZE(AO), %xmm4 + movsd 3 * SIZE(AO), %xmm6 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L65 + ALIGN_4 + +.L62: + addsd %xmm5, %xmm8 + movsd 2 * 
SIZE(BO), %xmm5 + mulsd %xmm0, %xmm1 + movsd 4 * SIZE(AO), %xmm0 + + addsd %xmm7, %xmm9 + movsd 3 * SIZE(BO), %xmm7 + mulsd %xmm2, %xmm3 + movsd 5 * SIZE(AO), %xmm2 + + addsd %xmm1, %xmm8 + movsd 4 * SIZE(BO), %xmm1 + mulsd %xmm4, %xmm5 + movsd 6 * SIZE(AO), %xmm4 + + addsd %xmm3, %xmm9 + movsd 5 * SIZE(BO), %xmm3 + mulsd %xmm6, %xmm7 + movsd 7 * SIZE(AO), %xmm6 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + + decq %rax + jne .L62 + + addsd %xmm5, %xmm8 + addsd %xmm7, %xmm9 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + je .L68 + ALIGN_4 + +.L66: + movsd 0 * SIZE(AO), %xmm0 + movsd 0 * SIZE(BO), %xmm1 + + mulsd %xmm0, %xmm1 + addsd %xmm1, %xmm8 + + addq $1 * SIZE, AO + addq $1 * SIZE, BO + + decq %rax + BRANCH + jg .L66 + ALIGN_4 + +.L68: + addsd %xmm9, %xmm8 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BO), %xmm0 + subsd %xmm8, %xmm0 +#else + movsd 0 * SIZE(AO), %xmm0 + subsd %xmm8, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm0 +#endif + +#if defined(RN) || defined(RT) + movsd 0 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm0 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BO) +#else + movsd %xmm0, 0 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L50: + testq $2, M + je .L60 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd 1 * SIZE(AO), %xmm1 + xorps %xmm3, %xmm3 + + movsd 0 * SIZE(BO), %xmm4 + xorps %xmm8, %xmm8 + movsd 1 * SIZE(BO), %xmm5 + xorps %xmm10, %xmm10 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L55 + ALIGN_4 + +.L52: + addsd %xmm2, %xmm8 + movsd 2 * SIZE(AO), %xmm2 + mulsd %xmm4, %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addsd %xmm3, %xmm10 + movsd 3 * SIZE(AO), %xmm3 + mulsd %xmm4, %xmm1 + movsd 2 * SIZE(BO), %xmm4 + + addsd %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm5, %xmm2 + addq $8 * SIZE, AO + + addsd %xmm1, %xmm10 + movsd -3 * SIZE(AO), %xmm1 + mulsd %xmm5, %xmm3 + movsd 3 * SIZE(BO), %xmm5 + + addsd %xmm2, %xmm8 + movsd -2 * SIZE(AO), %xmm2 + mulsd %xmm4, %xmm0 + addq $4 * SIZE, BO + + addsd %xmm3, %xmm10 + movsd -1 * SIZE(AO), %xmm3 + mulsd %xmm4, %xmm1 + movsd 0 * SIZE(BO), %xmm4 + + addsd %xmm0, %xmm8 + movsd 0 * SIZE(AO), %xmm0 + mulsd %xmm5, %xmm2 + decq %rax + + addsd %xmm1, %xmm10 + movsd 1 * SIZE(AO), %xmm1 + mulsd %xmm5, %xmm3 + movsd 1 * SIZE(BO), %xmm5 + + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm10 + + andq $3, %rax + 
BRANCH + je .L59 + ALIGN_4 + +.L56: + mulsd %xmm4, %xmm0 + mulsd %xmm4, %xmm1 + movsd 1 * SIZE(BO), %xmm4 + + addsd %xmm0, %xmm8 + movsd 2 * SIZE(AO), %xmm0 + addsd %xmm1, %xmm10 + movsd 3 * SIZE(AO), %xmm1 + + addq $2 * SIZE, AO + addq $1 * SIZE, BO + decq %rax + BRANCH + jg .L56 + ALIGN_4 + +.L59: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BO), %xmm0 + movsd 1 * SIZE(BO), %xmm2 + + subsd %xmm8, %xmm0 + subsd %xmm10, %xmm2 +#else + movsd 0 * SIZE(AO), %xmm0 + movsd 1 * SIZE(AO), %xmm2 + + subsd %xmm8, %xmm0 + subsd %xmm10, %xmm2 +#endif + +#ifdef LN + movsd 3 * SIZE(AO), %xmm8 + movsd 2 * SIZE(AO), %xmm9 + movsd 0 * SIZE(AO), %xmm11 + mulsd %xmm8, %xmm2 + mulsd %xmm2, %xmm9 + subsd %xmm9, %xmm0 + mulsd %xmm11,%xmm0 +#endif + +#ifdef LT + movsd 0 * SIZE(AO), %xmm8 + movsd 1 * SIZE(AO), %xmm9 + movsd 3 * SIZE(AO), %xmm11 + mulsd %xmm8, %xmm0 + mulsd %xmm0, %xmm9 + subsd %xmm9, %xmm2 + mulsd %xmm11,%xmm2 +#endif + +#if defined(RN) || defined(RT) + movsd 0 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm0 + mulsd %xmm8, %xmm2 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm2, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm2, 1 * SIZE(BO) +#else + movsd %xmm0, 0 * SIZE(AO) + movsd %xmm2, 1 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L60: + movq M, I + sarq $2, I + jle .L69 + ALIGN_4 + +.L41: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm9, %xmm9 + movsd 1 * SIZE(AO), %xmm1 + xorps %xmm11, %xmm11 + movsd 2 * SIZE(AO), %xmm2 + xorps %xmm13, %xmm13 + movsd 3 * SIZE(AO), %xmm3 + xorps %xmm15, %xmm15 + + movsd 0 * SIZE(BO), %xmm4 + xorps %xmm8, %xmm8 + movsd 1 * SIZE(BO), %xmm5 + xorps %xmm10, %xmm10 + prefetcht0 3 * SIZE(CO1) + xorps %xmm12, %xmm12 + xorps %xmm14, %xmm14 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L45 + ALIGN_4 + +.L42: + addsd %xmm9, %xmm8 + movsd 4 * SIZE(AO), %xmm9 + mulsd %xmm4, %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addsd %xmm11, %xmm10 + movsd 5 * SIZE(AO), %xmm11 + mulsd %xmm4, %xmm1 + + addsd %xmm13, %xmm12 + movsd 6 * SIZE(AO), %xmm13 + mulsd %xmm4, %xmm2 + + addsd %xmm15, %xmm14 + movsd 7 * SIZE(AO), %xmm15 + mulsd %xmm4, %xmm3 + movsd 2 * SIZE(BO), %xmm4 + + addsd %xmm0, %xmm8 + movsd 8 * SIZE(AO), %xmm0 + mulsd %xmm5, %xmm9 + + addsd %xmm1, %xmm10 + movsd 9 * SIZE(AO), %xmm1 + mulsd %xmm5, %xmm11 + + addsd %xmm2, %xmm12 + movsd 10 * SIZE(AO), %xmm2 + mulsd %xmm5, %xmm13 + + addsd %xmm3, %xmm14 + movsd 11 * SIZE(AO), %xmm3 + mulsd %xmm5, %xmm15 + movsd 3 * SIZE(BO), %xmm5 + + addsd %xmm9, %xmm8 + movsd 12 * SIZE(AO), %xmm9 + mulsd %xmm4, %xmm0 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + + addsd %xmm11, %xmm10 + movsd 13 * 
SIZE(AO), %xmm11 + mulsd %xmm4, %xmm1 + + addsd %xmm13, %xmm12 + movsd 14 * SIZE(AO), %xmm13 + mulsd %xmm4, %xmm2 + + addsd %xmm15, %xmm14 + movsd 15 * SIZE(AO), %xmm15 + mulsd %xmm4, %xmm3 + movsd 4 * SIZE(BO), %xmm4 + subq $-16 * SIZE, AO + + addsd %xmm0, %xmm8 + movsd 0 * SIZE(AO), %xmm0 + mulsd %xmm5, %xmm9 + + addsd %xmm1, %xmm10 + movsd 1 * SIZE(AO), %xmm1 + mulsd %xmm5, %xmm11 + addq $ 4 * SIZE, BO + + addsd %xmm2, %xmm12 + movsd 2 * SIZE(AO), %xmm2 + mulsd %xmm5, %xmm13 + decq %rax + + addsd %xmm3, %xmm14 + movsd 3 * SIZE(AO), %xmm3 + mulsd %xmm5, %xmm15 + movsd 1 * SIZE(BO), %xmm5 + + jne .L42 + ALIGN_4 + +.L45: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + + addsd %xmm9, %xmm8 + addsd %xmm11, %xmm10 + addsd %xmm13, %xmm12 + addsd %xmm15, %xmm14 + + andq $3, %rax + BRANCH + BRANCH + je .L49 + ALIGN_4 + +.L46: + mulsd %xmm4, %xmm0 + mulsd %xmm4, %xmm1 + mulsd %xmm4, %xmm2 + mulsd %xmm4, %xmm3 + movsd 1 * SIZE(BO), %xmm4 + + addsd %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + addsd %xmm1, %xmm10 + movsd 5 * SIZE(AO), %xmm1 + addsd %xmm2, %xmm12 + movsd 6 * SIZE(AO), %xmm2 + addsd %xmm3, %xmm14 + movsd 7 * SIZE(AO), %xmm3 + + addq $4 * SIZE, AO + addq $1 * SIZE, BO + decq %rax + BRANCH + jg .L46 + ALIGN_4 + +.L49: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $1, %rax +#endif + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BO), %xmm0 + movsd 1 * SIZE(BO), %xmm2 + movsd 2 * SIZE(BO), %xmm4 + movsd 3 * SIZE(BO), %xmm6 + + subsd %xmm8, %xmm0 + subsd %xmm10, %xmm2 + subsd %xmm12, %xmm4 + subsd %xmm14, %xmm6 +#else + movsd 0 * SIZE(AO), %xmm0 + movsd 1 * SIZE(AO), %xmm2 + movsd 2 * SIZE(AO), %xmm4 + movsd 3 * SIZE(AO), %xmm6 + + subsd %xmm8, %xmm0 + subsd %xmm10, %xmm2 + subsd %xmm12, %xmm4 + subsd %xmm14, %xmm6 +#endif + +#ifdef LN + movsd 15 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm6 + movsd 14 * SIZE(AO), %xmm9 + mulsd %xmm6, %xmm9 + movsd 13 * SIZE(AO), %xmm11 + subsd %xmm9, %xmm4 + movsd 12 * SIZE(AO), %xmm13 + mulsd %xmm6, %xmm11 + movsd 10 * SIZE(AO), %xmm8 + subsd %xmm11, %xmm2 + movsd 9 * SIZE(AO), %xmm9 + mulsd %xmm6, %xmm13 + movsd 8 * SIZE(AO), %xmm11 + subsd %xmm13, %xmm0 + + mulsd %xmm8, %xmm4 + movsd 5 * SIZE(AO), %xmm8 + mulsd %xmm4, %xmm9 + subsd %xmm9, %xmm2 + movsd 4 * SIZE(AO), %xmm9 + mulsd %xmm4, %xmm11 + subsd %xmm11, %xmm0 + movsd 0 * SIZE(AO), %xmm11 + mulsd %xmm8, %xmm2 + mulsd %xmm2, %xmm9 + subsd %xmm9, %xmm0 + mulsd %xmm11, %xmm0 +#endif + +#ifdef LT + movsd 0 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm0 + movsd 1 * SIZE(AO), %xmm9 + mulsd %xmm0, %xmm9 + movsd 2 * SIZE(AO), %xmm11 + subsd %xmm9, %xmm2 + movsd 3 * SIZE(AO), %xmm13 + mulsd %xmm0, %xmm11 + movsd 5 * SIZE(AO), %xmm8 + subsd %xmm11, %xmm4 + movsd 6 * SIZE(AO), %xmm9 + mulsd %xmm0, %xmm13 + movsd 7 * SIZE(AO), %xmm11 + subsd %xmm13, %xmm6 + + mulsd %xmm8, %xmm2 + movsd 10 * SIZE(AO), %xmm8 + mulsd %xmm2, %xmm9 + subsd %xmm9, %xmm4 + movsd 11 * SIZE(AO), %xmm9 + mulsd %xmm2, %xmm11 + subsd %xmm11, %xmm6 + mulsd %xmm8, %xmm4 + movsd 15 * SIZE(AO), %xmm8 + mulsd %xmm4, %xmm9 + subsd %xmm9, %xmm6 + mulsd %xmm8, %xmm6 +#endif + +#if defined(RN) || defined(RT) + movsd 0 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm0 + mulsd %xmm8, %xmm2 + mulsd %xmm8, %xmm4 + mulsd %xmm8, %xmm6 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm2, 1 * SIZE(CO1) + movsd %xmm4, 2 * SIZE(CO1) + movsd %xmm6, 
3 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm2, 1 * SIZE(BO) + movsd %xmm4, 2 * SIZE(BO) + movsd %xmm6, 3 * SIZE(BO) +#else + movsd %xmm0, 0 * SIZE(AO) + movsd %xmm2, 1 * SIZE(AO) + movsd %xmm4, 2 * SIZE(AO) + movsd %xmm6, 3 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L41 + ALIGN_4 + +.L69: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 1), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_2 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/trsm_kernel_LN_4x4_barcelona.S b/kernel/x86_64/trsm_kernel_LN_4x4_barcelona.S new file mode 100644 index 0000000000..4cdaff30be --- /dev/null +++ b/kernel/x86_64/trsm_kernel_LN_4x4_barcelona.S @@ -0,0 +1,3390 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define CO2 %r12 +#define BB %rbp +#define J %rbx + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#define OFFSET 48(%rsp) +#define AORIG 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define AORIG 232(%rsp) +#define KK 240(%rsp) +#define KKK 248(%rsp) + +#endif + +#define PREFETCH prefetch +#define PREFETCHSIZE (8 * 7 + 0) + +#define movlpd movsd +#define movapd movups +#define movupd movups + +#define KERNEL1(xx) \ + mulpd %xmm1, %xmm0 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm8 ;\ + movapd %xmm2, %xmm0 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO, %rax, 4) ;\ + addpd %xmm1, %xmm12 ;\ + movddup -14 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm0, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -13 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm0 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm10 ;\ + movapd -12 * SIZE(AO, %rax, 4), %xmm0 ;\ + addpd %xmm1, %xmm14 ;\ + movddup -12 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -11 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm0, %xmm2 + +#define KERNEL2(xx) \ + mulpd %xmm1, %xmm0 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm8 ;\ + movapd %xmm2, %xmm0 ;\ + addpd %xmm1, %xmm12 ;\ + movddup -10 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm0, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -9 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm0 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm10 ;\ + addpd %xmm1, %xmm14 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -7 * SIZE(BO, %rax, 4), %xmm3 ;\ +/**/ movddup (BO, %rax, 4), %xmm1 ;\ + movapd %xmm4, %xmm2 + +#define KERNEL3(xx) \ + mulpd %xmm5, %xmm4 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm8 ;\ + movapd %xmm2, %xmm4 ;\ + addpd %xmm5, %xmm12 ;\ + movddup -6 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm4, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -5 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm4 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm10 ;\ + movapd -4 * SIZE(AO, %rax, 4), %xmm4 ;\ + addpd %xmm5, %xmm14 ;\ + movddup -4 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -3 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm4, %xmm2 + +#define KERNEL4(xx) \ + mulpd %xmm5, %xmm4 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm8 ;\ + movapd %xmm2, %xmm4 ;\ + addpd %xmm5, %xmm12 ;\ + movddup -2 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 
;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm4, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -1 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm4 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ +/***/ movapd (AO, %rax, 4), %xmm6 ;\ + addpd %xmm4, %xmm10 ;\ + addpd %xmm5, %xmm14 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 1 * SIZE(BO, %rax, 4), %xmm3 ;\ + movddup 8 * SIZE(BO, %rax, 4), %xmm5 ;\ + movapd %xmm6, %xmm2 + +#define KERNEL5(xx) \ + mulpd %xmm1, %xmm6 ;\ + mulpd 2 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm6, %xmm8 ;\ + movapd %xmm2, %xmm6 ;\ + addpd %xmm1, %xmm12 ;\ + movddup 2 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ +/**/ movapd 8 * SIZE(AO, %rax, 4), %xmm7 ;\ + movapd %xmm6, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup 3 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm6 ;\ + mulpd 2 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm6, %xmm10 ;\ + movapd 4 * SIZE(AO, %rax, 4), %xmm6 ;\ + addpd %xmm1, %xmm14 ;\ + movddup 4 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 5 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm6, %xmm2 + +#define KERNEL6(xx) \ + mulpd %xmm1, %xmm6 ;\ + mulpd 6 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm6, %xmm8 ;\ + movapd %xmm2, %xmm6 ;\ + addpd %xmm1, %xmm12 ;\ + movddup 6 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm6, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup 7 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm6 ;\ + mulpd 6 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm6, %xmm10 ;\ +/***/ movapd 16 * SIZE(AO, %rax, 4), %xmm0 ;\ + addpd %xmm1, %xmm14 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 9 * SIZE(BO, %rax, 4), %xmm3 ;\ + movddup 16 * SIZE(BO, %rax, 4), %xmm1 ;\ + movapd %xmm7, %xmm2 + +#define KERNEL7(xx) \ + mulpd %xmm5, %xmm7 ;\ + mulpd 10 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm7, %xmm8 ;\ + movapd %xmm2, %xmm7 ;\ + addpd %xmm5, %xmm12 ;\ + movddup 10 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm7, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup 11 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm7 ;\ + mulpd 10 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm7, %xmm10 ;\ + movapd 12 * SIZE(AO, %rax, 4), %xmm7 ;\ + addpd %xmm5, %xmm14 ;\ + movddup 12 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 13 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm7, %xmm2 + +#define KERNEL8(xx) \ + mulpd %xmm5, %xmm7 ;\ + mulpd 14 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm7, %xmm8 ;\ + movapd %xmm2, %xmm7 ;\ + addpd %xmm5, %xmm12 ;\ + movddup 14 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm7, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup 15 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm7 ;\ + mulpd 14 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm7, %xmm10 ;\ + addpd %xmm5, %xmm14 ;\ +/**/ movapd 24 * SIZE(AO, %rax, 4), %xmm4 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 17 * SIZE(BO, %rax, 4), 
%xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + movddup 24 * SIZE(BO, %rax, 4), %xmm5 ;\ + movapd %xmm0, %xmm2 ;\ + addq $8 * SIZE, %rax + +#define KERNEL_SUB1(xx) \ + mulpd %xmm1, %xmm0 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm8 ;\ + movapd %xmm2, %xmm0 ;\ + addpd %xmm1, %xmm12 ;\ + movddup -14 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm0, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -13 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm0 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm10 ;\ + movapd -12 * SIZE(AO, %rax, 4), %xmm0 ;\ + addpd %xmm1, %xmm14 ;\ + movddup -12 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -11 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm0, %xmm2 + +#define KERNEL_SUB2(xx) \ + mulpd %xmm1, %xmm0 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm8 ;\ + movapd %xmm2, %xmm0 ;\ + addpd %xmm1, %xmm12 ;\ + movddup -10 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm0, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -9 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm0 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm10 ;\ + movapd (AO, %rax, 4), %xmm0 ;\ + addpd %xmm1, %xmm14 ;\ + movddup (BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -7 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm4, %xmm2 + +#define KERNEL_SUB3(xx) \ + mulpd %xmm5, %xmm4 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm8 ;\ + movapd %xmm2, %xmm4 ;\ + addpd %xmm5, %xmm12 ;\ + movddup -6 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm4, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -5 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm4 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm10 ;\ + movapd -4 * SIZE(AO, %rax, 4), %xmm4 ;\ + addpd %xmm5, %xmm14 ;\ + movddup -4 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -3 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm4, %xmm2 + +#define KERNEL_SUB4(xx) \ + mulpd %xmm5, %xmm4 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm8 ;\ + movapd %xmm2, %xmm4 ;\ + addpd %xmm5, %xmm12 ;\ + movddup -2 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm4, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -1 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm4 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm10 ;\ + addpd %xmm5, %xmm14 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 1 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm0, %xmm2 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) 
+ movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + movsd OLD_OFFSET, %xmm12 +#else + movq STACKSIZE + 8(%rsp), LDC + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + + movq OLD_M, M + movq OLD_N, N + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + movsd %xmm12, OFFSET + movsd %xmm12, KK + + leaq (, LDC, SIZE), LDC + +#ifdef LN + leaq (, M, SIZE), %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + leaq (, N, SIZE), %rax + imulq K, %rax + addq %rax, B + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + movq N, J + sarq $2, J # j = (n >> 2) + jle .L40 + +.L01: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc +#ifndef RT + leaq (C, LDC, 4), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + movq K, %rax + salq $BASE_SHIFT + 2, %rax + leaq (B, %rax), BB + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, M + je .L20 + +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#endif + + movq B, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (BO, %rax, 4), BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movddup -14 * SIZE(AO), %xmm2 + pxor %xmm9, %xmm9 + movddup -15 * SIZE(AO), %xmm4 + pxor %xmm10, %xmm10 + movapd -16 * SIZE(BO), %xmm1 + pxor %xmm11, %xmm11 + movapd -8 * SIZE(BO), %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO + negq %rax + NOBRANCH + je .L36 + ALIGN_4 + +.L32: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BO, %rax, 4), %xmm0 + addpd %xmm1, %xmm8 + movapd -12 * SIZE(BO, %rax, 4), %xmm1 + addpd %xmm0, %xmm9 + movddup -12 * SIZE(AO, %rax, 1), %xmm0 + mulpd %xmm4, %xmm1 + mulpd -10 * SIZE(BO, %rax, 4), %xmm4 + addpd %xmm1, %xmm10 + movapd (BO, %rax, 4), %xmm1 + addpd %xmm4, %xmm11 + movddup -11 * SIZE(AO, %rax, 1), %xmm4 + mulpd %xmm2, %xmm3 + mulpd -6 * SIZE(BO, %rax, 4), %xmm2 + addpd %xmm3, %xmm8 + movapd -4 * SIZE(BO, %rax, 4), %xmm3 + addpd %xmm2, %xmm9 + movddup -13 * SIZE(AO, %rax, 1), %xmm2 + mulpd %xmm2, %xmm3 + mulpd -2 * SIZE(BO, %rax, 4), %xmm2 + addpd %xmm3, %xmm10 + movapd 8 * SIZE(BO, %rax, 4), %xmm3 + addpd %xmm2, %xmm11 + movddup -10 * SIZE(AO, %rax, 1), %xmm2 + + addq $4 * SIZE, %rax + BRANCH + jl .L32 + ALIGN_4 + +.L36: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L38 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO + negq %rax + ALIGN_4 + +.L37: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BO, %rax, 4), %xmm0 + addpd %xmm1, %xmm8 + movapd -12 * SIZE(BO, %rax, 4), %xmm1 + addpd %xmm0, %xmm9 + movddup -15 * SIZE(AO, %rax, 1), %xmm0 + + addq $SIZE, %rax + jl .L37 + ALIGN_4 + +.L38: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#if defined(LN) || defined(RT) 
+ movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + + subpd %xmm8, %xmm2 + subpd %xmm9, %xmm3 +#else + movapd -16 * SIZE(AO), %xmm2 + movapd -14 * SIZE(AO), %xmm3 + + subpd %xmm8, %xmm2 + subpd %xmm9, %xmm3 +#endif + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 +#endif + +#ifdef RN + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movapd %xmm3, %xmm1 + unpckhpd %xmm1, %xmm1 + + movsd -16 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm2 + + movsd -15 * SIZE(BO), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + movsd -14 * SIZE(BO), %xmm6 + mulsd %xmm2, %xmm6 + subsd %xmm6, %xmm3 + movsd -13 * SIZE(BO), %xmm7 + mulsd %xmm2, %xmm7 + subsd %xmm7, %xmm1 + + movsd -11 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd -10 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm3 + movsd -9 * SIZE(BO), %xmm6 + mulsd %xmm0, %xmm6 + subsd %xmm6, %xmm1 + + movsd -6 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm3 + + movsd -5 * SIZE(BO), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm1 + + movsd -1 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm1 + + unpcklpd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm3 +#endif + +#ifdef RT + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movapd %xmm3, %xmm1 + unpckhpd %xmm1, %xmm1 + + movsd -1 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm1 + + movsd -2 * SIZE(BO), %xmm5 + mulsd %xmm1, %xmm5 + subsd %xmm5, %xmm3 + movsd -3 * SIZE(BO), %xmm6 + mulsd %xmm1, %xmm6 + subsd %xmm6, %xmm0 + movsd -4 * SIZE(BO), %xmm7 + mulsd %xmm1, %xmm7 + subsd %xmm7, %xmm2 + + movsd -6 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm3 + + movsd -7 * SIZE(BO), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm0 + movsd -8 * SIZE(BO), %xmm6 + mulsd %xmm3, %xmm6 + subsd %xmm6, %xmm2 + + movsd -11 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd -12 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + + movsd -16 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm3 + +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO2) + movlpd %xmm3, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) +#else + movlpd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO2) + movlpd %xmm3, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, -16 * SIZE(BO) + movaps %xmm3, -14 * SIZE(BO) +#else + movaps %xmm2, -16 * SIZE(AO) + movaps %xmm3, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L20: + testq $2, M + je .L30 + ALIGN_4 + +.L21: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + movq B, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (BO, %rax, 4), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, 
%xmm8 + movapd -12 * SIZE(AO), %xmm2 + pxor %xmm9, %xmm9 + movddup -16 * SIZE(BO), %xmm1 + pxor %xmm10, %xmm10 + movddup -15 * SIZE(BO), %xmm5 + pxor %xmm11, %xmm11 + movddup -8 * SIZE(BO), %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO + negq %rax + NOBRANCH + je .L26 + ALIGN_4 + +.L22: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movddup -14 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + addpd %xmm5, %xmm9 + movddup -13 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movddup -12 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + movapd -14 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm5, %xmm11 + movddup -11 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movddup -10 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + addpd %xmm5, %xmm9 + movddup -9 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movddup (BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + movapd -8 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm5, %xmm11 + movddup -7 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm8 + movddup -6 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm2, %xmm5 + addpd %xmm5, %xmm9 + movddup -5 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm10 + movddup -4 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm2, %xmm5 + movapd -10 * SIZE(AO, %rax, 2), %xmm2 + addpd %xmm5, %xmm11 + movddup -3 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm8 + movddup -2 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm2, %xmm5 + addpd %xmm5, %xmm9 + movddup -1 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm10 + movddup 8 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm2, %xmm5 + movapd -4 * SIZE(AO, %rax, 2), %xmm2 + addpd %xmm5, %xmm11 + movddup 1 * SIZE(BO, %rax, 4), %xmm5 + + addq $4 * SIZE, %rax + BRANCH + jl .L22 + ALIGN_4 + +.L26: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L29 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO + negq %rax + ALIGN_4 + +.L27: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movddup -14 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + addpd %xmm5, %xmm9 + movddup -13 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movddup -12 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + movapd -14 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm5, %xmm11 + movddup -11 * SIZE(BO, %rax, 4), %xmm5 + + addq $SIZE, %rax + jl .L27 + ALIGN_4 + +.L29: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd %xmm10, %xmm2 + unpcklpd %xmm11, %xmm10 + unpckhpd %xmm11, %xmm2 + + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm11 + movapd -12 * SIZE(BO), %xmm13 + movapd -10 * SIZE(BO), %xmm15 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm0, %xmm13 + subpd %xmm2, %xmm15 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm2 + movapd -12 * SIZE(AO), %xmm4 + movapd -10 * SIZE(AO), %xmm6 + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm2 + subpd %xmm10, %xmm4 + subpd %xmm11, %xmm6 +#endif + +#ifdef LN + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd 
%xmm8, %xmm15 + + movddup -14 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + movddup -14 * SIZE(AO), %xmm10 + mulpd %xmm15, %xmm10 + subpd %xmm10, %xmm11 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 + + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm11, %xmm10 + subpd %xmm10, %xmm15 + + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + movddup -14 * SIZE(BO), %xmm10 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm4 + movddup -13 * SIZE(BO), %xmm11 + mulpd %xmm0, %xmm11 + subpd %xmm11, %xmm6 + + movddup -11 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + movddup -10 * SIZE(BO), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm4 + movddup -9 * SIZE(BO), %xmm10 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm6 + + movddup -6 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm4 + + movddup -5 * SIZE(BO), %xmm9 + mulpd %xmm4, %xmm9 + subpd %xmm9, %xmm6 + + movddup -1 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm6 +#endif + +#ifdef RT + movddup -1 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm6 + + movddup -2 * SIZE(BO), %xmm9 + mulpd %xmm6, %xmm9 + subpd %xmm9, %xmm4 + movddup -3 * SIZE(BO), %xmm10 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm2 + movddup -4 * SIZE(BO), %xmm11 + mulpd %xmm6, %xmm11 + subpd %xmm11, %xmm0 + + movddup -6 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm4 + movddup -7 * SIZE(BO), %xmm9 + mulpd %xmm4, %xmm9 + subpd %xmm9, %xmm2 + movddup -8 * SIZE(BO), %xmm10 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm0 + + movddup -11 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + movddup -12 * SIZE(BO), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm9, 0 * SIZE(CO1) + movlpd %xmm13, 1 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + + movlpd %xmm11, 0 * SIZE(CO1, LDC, 2) + movlpd %xmm15, 1 * SIZE(CO1, LDC, 2) + + movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) +#else + movlpd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + + movlpd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + + movlpd %xmm4, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) + + movlpd %xmm6, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm9, -16 * SIZE(BO) + movaps %xmm11, -14 * SIZE(BO) + movaps %xmm13, -12 * SIZE(BO) + movaps %xmm15, -10 * SIZE(BO) +#else + movaps %xmm0, -16 * SIZE(AO) + movaps %xmm2, -14 * SIZE(AO) + movaps %xmm4, -12 * SIZE(AO) + movaps %xmm6, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + movq M, I + sarq $2, I # i = (m >> 2) + jle .L39 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO 
+ leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + movq B, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (BO, %rax, 4), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + movddup -16 * SIZE(BO), %xmm1 + pxor %xmm8, %xmm8 + movddup -15 * SIZE(BO), %xmm3 + pxor %xmm9, %xmm9 + movapd -8 * SIZE(AO), %xmm4 + pxor %xmm10, %xmm10 + movddup -8 * SIZE(BO), %xmm5 + pxor %xmm11, %xmm11 + +#ifndef LN + prefetchw 3 * SIZE(CO1) + pxor %xmm12, %xmm12 + prefetchw 3 * SIZE(CO2) + pxor %xmm13, %xmm13 + prefetchw 3 * SIZE(CO1, LDC, 2) + pxor %xmm14, %xmm14 + prefetchw 3 * SIZE(CO2, LDC, 2) + pxor %xmm15, %xmm15 + movapd %xmm0, %xmm2 +#else + prefetchw -8 * SIZE(CO1) + pxor %xmm12, %xmm12 + prefetchw -8 * SIZE(CO2) + pxor %xmm13, %xmm13 + prefetchw -8 * SIZE(CO1, LDC, 2) + pxor %xmm14, %xmm14 + prefetchw -8 * SIZE(CO2, LDC, 2) + pxor %xmm15, %xmm15 + movapd %xmm0, %xmm2 +#endif + + prefetch -10 * SIZE(BB) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + + andq $-8, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO + negq %rax + NOBRANCH + je .L15 + ALIGN_4 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + BRANCH + jl .L12 + ALIGN_4 + +.L15: + prefetch 14 * SIZE(BB) + subq $-16 * SIZE, BB + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + testq $4, %rax + je .L16 + xorq %rax, %rax + ALIGN_4 + + KERNEL_SUB1(16 * 0) + KERNEL_SUB2(16 * 0) + KERNEL_SUB3(16 * 0) + KERNEL_SUB4(16 * 0) + + subq $-16 * SIZE, BO + subq $-16 * SIZE, AO + ALIGN_4 + +.L16: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L19 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO + negq %rax + ALIGN_4 + +.L17: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd %xmm2, %xmm0 + addpd %xmm1, %xmm12 + movddup -14 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm3, %xmm2 + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 + addpd %xmm2, %xmm9 + movapd %xmm0, %xmm2 + addpd %xmm3, %xmm13 + movddup -13 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm10 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm14 + movddup -12 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm3, %xmm2 + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 + addpd %xmm2, %xmm11 + addpd %xmm3, %xmm15 + movddup -11 * SIZE(BO, %rax, 4), %xmm3 + movapd %xmm0, %xmm2 + + addq $SIZE, %rax + jl .L17 + ALIGN_4 + +.L19: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd %xmm10, %xmm2 + unpcklpd %xmm11, %xmm10 + unpckhpd %xmm11, %xmm2 + + movapd %xmm12, %xmm4 + unpcklpd %xmm13, %xmm12 + unpckhpd %xmm13, %xmm4 + + movapd %xmm14, %xmm6 + unpcklpd %xmm15, %xmm14 + unpckhpd %xmm15, %xmm6 + + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm11 + movapd -12 * SIZE(BO), %xmm13 + movapd -10 * SIZE(BO), %xmm15 + movapd -8 * SIZE(BO), %xmm1 + movapd -6 * SIZE(BO), %xmm3 + movapd -4 * SIZE(BO), %xmm5 + movapd -2 * SIZE(BO), %xmm7 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + 
subpd %xmm0, %xmm13 + subpd %xmm2, %xmm15 + subpd %xmm12, %xmm1 + subpd %xmm14, %xmm3 + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -12 * SIZE(AO), %xmm2 + movapd -10 * SIZE(AO), %xmm3 + + movapd -8 * SIZE(AO), %xmm4 + movapd -6 * SIZE(AO), %xmm5 + movapd -4 * SIZE(AO), %xmm6 + movapd -2 * SIZE(AO), %xmm7 + + subpd %xmm8, %xmm0 + subpd %xmm12, %xmm1 + subpd %xmm9, %xmm2 + subpd %xmm13, %xmm3 + subpd %xmm10, %xmm4 + subpd %xmm14, %xmm5 + subpd %xmm11, %xmm6 + subpd %xmm15, %xmm7 +#endif + +#ifdef LN + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 + mulpd %xmm8, %xmm7 + + movddup -2 * SIZE(AO), %xmm10 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm1 + movddup -2 * SIZE(AO), %xmm10 + mulpd %xmm7, %xmm10 + subpd %xmm10, %xmm3 + + movddup -3 * SIZE(AO), %xmm12 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm13 + movddup -3 * SIZE(AO), %xmm12 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm15 + + movddup -4 * SIZE(AO), %xmm14 + mulpd %xmm5, %xmm14 + subpd %xmm14, %xmm9 + movddup -4 * SIZE(AO), %xmm14 + mulpd %xmm7, %xmm14 + subpd %xmm14, %xmm11 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 + + movddup -7 * SIZE(AO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm13 + movddup -7 * SIZE(AO), %xmm10 + mulpd %xmm3, %xmm10 + subpd %xmm10, %xmm15 + + movddup -8 * SIZE(AO), %xmm12 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm9 + movddup -8 * SIZE(AO), %xmm12 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm11 + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 + + movddup -12 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + movddup -12 * SIZE(AO), %xmm10 + mulpd %xmm15, %xmm10 + subpd %xmm10, %xmm11 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 + + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm11, %xmm10 + subpd %xmm10, %xmm15 + + movddup -14 * SIZE(AO), %xmm12 + mulpd %xmm9, %xmm12 + subpd %xmm12, %xmm1 + movddup -14 * SIZE(AO), %xmm12 + mulpd %xmm11, %xmm12 + subpd %xmm12, %xmm3 + + movddup -13 * SIZE(AO), %xmm14 + mulpd %xmm9, %xmm14 + subpd %xmm14, %xmm5 + movddup -13 * SIZE(AO), %xmm14 + mulpd %xmm11, %xmm14 + subpd %xmm14, %xmm7 + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 + + movddup -10 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm1 + movddup -10 * SIZE(AO), %xmm10 + mulpd %xmm15, %xmm10 + subpd %xmm10, %xmm3 + + movddup -9 * SIZE(AO), %xmm12 + mulpd %xmm13, %xmm12 + subpd %xmm12, %xmm5 + movddup -9 * SIZE(AO), %xmm12 + mulpd %xmm15, %xmm12 + subpd %xmm12, %xmm7 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 + + movddup -5 * SIZE(AO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm5 + movddup -5 * SIZE(AO), %xmm10 + mulpd %xmm3, %xmm10 + subpd %xmm10, %xmm7 + + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 + mulpd %xmm8, %xmm7 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 + + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm1, %xmm9 + subpd %xmm9, %xmm3 + + movddup -14 * SIZE(BO), %xmm10 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm4 + movddup -14 * SIZE(BO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm5 + + movddup -13 * SIZE(BO), %xmm11 + mulpd %xmm0, %xmm11 + subpd 
%xmm11, %xmm6 + movddup -13 * SIZE(BO), %xmm11 + mulpd %xmm1, %xmm11 + subpd %xmm11, %xmm7 + + movddup -11 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 + + movddup -10 * SIZE(BO), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm4 + movddup -10 * SIZE(BO), %xmm9 + mulpd %xmm3, %xmm9 + subpd %xmm9, %xmm5 + + movddup -9 * SIZE(BO), %xmm10 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm6 + movddup -9 * SIZE(BO), %xmm10 + mulpd %xmm3, %xmm10 + subpd %xmm10, %xmm7 + + movddup -6 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm4 + mulpd %xmm8, %xmm5 + + movddup -5 * SIZE(BO), %xmm9 + mulpd %xmm4, %xmm9 + subpd %xmm9, %xmm6 + movddup -5 * SIZE(BO), %xmm9 + mulpd %xmm5, %xmm9 + subpd %xmm9, %xmm7 + + movddup -1 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm6 + mulpd %xmm8, %xmm7 +#endif + +#ifdef RT + movddup -1 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm6 + mulpd %xmm8, %xmm7 + + movddup -2 * SIZE(BO), %xmm9 + mulpd %xmm6, %xmm9 + subpd %xmm9, %xmm4 + movddup -2 * SIZE(BO), %xmm9 + mulpd %xmm7, %xmm9 + subpd %xmm9, %xmm5 + + movddup -3 * SIZE(BO), %xmm10 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm2 + movddup -3 * SIZE(BO), %xmm10 + mulpd %xmm7, %xmm10 + subpd %xmm10, %xmm3 + + movddup -4 * SIZE(BO), %xmm11 + mulpd %xmm6, %xmm11 + subpd %xmm11, %xmm0 + movddup -4 * SIZE(BO), %xmm11 + mulpd %xmm7, %xmm11 + subpd %xmm11, %xmm1 + + movddup -6 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm4 + mulpd %xmm8, %xmm5 + + movddup -7 * SIZE(BO), %xmm9 + mulpd %xmm4, %xmm9 + subpd %xmm9, %xmm2 + movddup -7 * SIZE(BO), %xmm9 + mulpd %xmm5, %xmm9 + subpd %xmm9, %xmm3 + + movddup -8 * SIZE(BO), %xmm10 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm0 + movddup -8 * SIZE(BO), %xmm10 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm1 + + movddup -11 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 + + movddup -12 * SIZE(BO), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + movddup -12 * SIZE(BO), %xmm9 + mulpd %xmm3, %xmm9 + subpd %xmm9, %xmm1 + + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm9, 0 * SIZE(CO1) + movlpd %xmm13, 1 * SIZE(CO1) + movlpd %xmm1, 2 * SIZE(CO1) + movlpd %xmm5, 3 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + movhpd %xmm1, 2 * SIZE(CO2) + movhpd %xmm5, 3 * SIZE(CO2) + + movlpd %xmm11, 0 * SIZE(CO1, LDC, 2) + movlpd %xmm15, 1 * SIZE(CO1, LDC, 2) + movlpd %xmm3, 2 * SIZE(CO1, LDC, 2) + movlpd %xmm7, 3 * SIZE(CO1, LDC, 2) + + movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) + movhpd %xmm3, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) +#else + movlpd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movlpd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movlpd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + movlpd %xmm3, 2 * SIZE(CO2) + movhpd %xmm3, 3 * SIZE(CO2) + + movlpd %xmm4, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) + movlpd %xmm5, 2 * SIZE(CO1, LDC, 2) + movhpd %xmm5, 3 * SIZE(CO1, LDC, 2) + + movlpd %xmm6, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) + movlpd %xmm7, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm9, -16 * SIZE(BO) + movaps %xmm11, -14 * SIZE(BO) + movaps %xmm13, -12 * SIZE(BO) + movaps %xmm15, -10 * SIZE(BO) + movaps %xmm1, -8 * SIZE(BO) + movaps %xmm3, -6 * SIZE(BO) + movaps %xmm5, -4 * SIZE(BO) + movaps %xmm7, -2 * SIZE(BO) +#else + movaps %xmm0, -16 * SIZE(AO) + movaps 
%xmm1, -14 * SIZE(AO) + movaps %xmm2, -12 * SIZE(AO) + movaps %xmm3, -10 * SIZE(AO) + movaps %xmm4, -8 * SIZE(AO) + movaps %xmm5, -6 * SIZE(AO) + movaps %xmm6, -4 * SIZE(AO) + movaps %xmm7, -2 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L11 + ALIGN_4 + +.L39: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + + decq J # j -- + jg .L01 + ALIGN_4 + +.L40: + testq $2, N + je .L80 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, M + je .L60 + ALIGN_4 + +.L71: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#endif + + movq B, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 1), BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movddup -15 * SIZE(AO), %xmm1 + pxor %xmm9, %xmm9 + movddup -14 * SIZE(AO), %xmm2 + pxor %xmm10, %xmm10 + movddup -13 * SIZE(AO), %xmm3 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO + negq %rax + NOBRANCH + je .L76 + ALIGN_4 + +.L72: + mulpd -16 * SIZE(BO, %rax, 2), %xmm0 + addpd %xmm0, %xmm8 + movddup -12 * SIZE(AO, %rax, 1), %xmm0 + + mulpd -14 * SIZE(BO, %rax, 2), %xmm1 + addpd %xmm1, %xmm9 + movddup -11 * SIZE(AO, %rax, 1), %xmm1 + + mulpd -12 * SIZE(BO, %rax, 2), %xmm2 + addpd %xmm2, %xmm10 + movddup -10 * SIZE(AO, %rax, 1), %xmm2 + + mulpd -10 * SIZE(BO, %rax, 2), %xmm3 + addpd %xmm3, %xmm11 + movddup -9 * SIZE(AO, %rax, 1), %xmm3 + + addq $4 * SIZE, %rax + BRANCH + jl .L72 + ALIGN_4 + +.L76: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L78 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO + negq %rax + ALIGN_4 + +.L77: + mulpd -16 * SIZE(BO, %rax, 2), %xmm0 + addpd %xmm0, %xmm8 + movddup -15 * SIZE(AO, %rax, 1), %xmm0 + + addq $SIZE, %rax + jl .L77 + ALIGN_4 + +.L78: + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 + addpd %xmm10, %xmm8 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm2 +#else + movapd -16 * SIZE(AO), %xmm2 +#endif + + subpd %xmm8, %xmm2 + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm0 + + mulpd %xmm0, %xmm2 
+#endif + +#ifdef RN + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + mulsd -16 * SIZE(BO), %xmm2 + movsd -15 * SIZE(BO), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm0 + + mulsd -13 * SIZE(BO), %xmm0 + unpcklpd %xmm0, %xmm2 +#endif + +#ifdef RT + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + mulsd -13 * SIZE(BO), %xmm0 + + movlpd -14 * SIZE(BO), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm2 + + mulsd -16 * SIZE(BO), %xmm2 + unpcklpd %xmm0, %xmm2 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + movlpd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movaps %xmm2, -16 * SIZE(BO) +#else + movaps %xmm2, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L60: + testq $2, M + je .L70 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + movq B, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (BO, %rax, 2), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd -12 * SIZE(AO), %xmm2 + pxor %xmm9, %xmm9 + movddup -16 * SIZE(BO), %xmm1 + pxor %xmm10, %xmm10 + movddup -15 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO + negq %rax + NOBRANCH + je .L66 + ALIGN_4 + +.L62: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movddup -14 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm0, %xmm3 + movapd -14 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm3, %xmm9 + movddup -13 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movddup -12 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm0, %xmm3 + movapd -8 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm3, %xmm11 + movddup -11 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm8 + movddup -10 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm2, %xmm3 + movapd -10 * SIZE(AO, %rax, 2), %xmm2 + addpd %xmm3, %xmm9 + movddup -9 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm10 + movddup -8 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm2, %xmm3 + movapd -4 * SIZE(AO, %rax, 2), %xmm2 + addpd %xmm3, %xmm11 + movddup -7 * SIZE(BO, %rax, 2), %xmm3 + + addq $4 * SIZE, %rax + BRANCH + jl .L62 + ALIGN_4 + +.L66: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L69 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO + negq %rax + ALIGN_4 + +.L67: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movddup -14 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm0, %xmm3 + movapd -14 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm3, %xmm9 + movddup -13 * SIZE(BO, %rax, 2), %xmm3 + + addq $SIZE, %rax + jl .L67 + ALIGN_4 + +.L69: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + 
+#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm13 + + subpd %xmm8, %xmm9 + subpd %xmm0, %xmm13 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm2 + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm2 +#endif + + +#ifdef LN + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + + movddup -14 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + + movddup -13 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 +#endif + +#ifdef RT + movddup -13 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + + movddup -14 * SIZE(BO), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm9, 0 * SIZE(CO1) + movlpd %xmm13, 1 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) +#else + movlpd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + + movlpd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm9, -16 * SIZE(BO) + movaps %xmm13, -14 * SIZE(BO) +#else + movaps %xmm0, -16 * SIZE(AO) + movaps %xmm2, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L70: + movq M, I + sarq $2, I # i = (m >> 2) + jle .L79 + ALIGN_4 + +.L51: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + movq B, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (BO, %rax, 2), BO +#endif + + movddup -16 * SIZE(BO), %xmm1 + movddup -15 * SIZE(BO), %xmm5 + pxor %xmm8, %xmm8 + movddup -12 * SIZE(BO), %xmm3 + pxor %xmm9, %xmm9 + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm12, %xmm12 + movapd -8 * SIZE(AO), %xmm4 + pxor %xmm13, %xmm13 + +#ifndef LN + prefetchw 3 * SIZE(CO1) + movapd %xmm0, %xmm2 + prefetchw 3 * SIZE(CO2) +#else + prefetchw -8 * SIZE(CO1) + movapd %xmm0, %xmm2 + prefetchw -8 * SIZE(CO2) +#endif + + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO + negq %rax + NOBRANCH + je .L56 + ALIGN_4 + +.L52: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm12 + movddup -14 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm5, %xmm2 + mulpd -14 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm9 + addpd %xmm5, %xmm13 + movddup -13 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm0, %xmm2 + mulpd %xmm1, %xmm0 + mulpd -10 * SIZE(AO, 
%rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd (AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm12 + movddup -8 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm5, %xmm2 + mulpd -10 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm9 + addpd %xmm5, %xmm13 + movddup -11 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm4, %xmm2 + mulpd %xmm3, %xmm4 + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 + addpd %xmm4, %xmm8 + movapd -4 * SIZE(AO, %rax, 4), %xmm4 + addpd %xmm3, %xmm12 + movddup -10 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm5, %xmm2 + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm9 + addpd %xmm5, %xmm13 + movddup -9 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm4, %xmm2 + mulpd %xmm3, %xmm4 + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 + addpd %xmm4, %xmm8 + movapd 8 * SIZE(AO, %rax, 4), %xmm4 + addpd %xmm3, %xmm12 + movddup -4 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm5, %xmm2 + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm9 + addpd %xmm5, %xmm13 + movddup -7 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm0, %xmm2 + + addq $4 * SIZE, %rax + BRANCH + jl .L52 + ALIGN_4 + +.L56: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L59 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO + negq %rax + ALIGN_4 + +.L57: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm12 + movddup -14 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm5, %xmm2 + mulpd -14 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm9 + addpd %xmm5, %xmm13 + movddup -13 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm0, %xmm2 + + addq $SIZE, %rax + jl .L57 + ALIGN_4 + +.L59: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd %xmm12, %xmm4 + unpcklpd %xmm13, %xmm12 + unpckhpd %xmm13, %xmm4 + + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm13 + movapd -12 * SIZE(BO), %xmm1 + movapd -10 * SIZE(BO), %xmm5 + + subpd %xmm8, %xmm9 + subpd %xmm0, %xmm13 + subpd %xmm12, %xmm1 + subpd %xmm4, %xmm5 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -12 * SIZE(AO), %xmm2 + movapd -10 * SIZE(AO), %xmm3 + + subpd %xmm8, %xmm0 + subpd %xmm12, %xmm1 + subpd %xmm9, %xmm2 + subpd %xmm13, %xmm3 +#endif + +#ifdef LN + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 + movddup -2 * SIZE(AO), %xmm10 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm1 + movddup -3 * SIZE(AO), %xmm12 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm13 + movddup -4 * SIZE(AO), %xmm14 + mulpd %xmm5, %xmm14 + subpd %xmm14, %xmm9 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + movddup -7 * SIZE(AO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm13 + movddup -8 * SIZE(AO), %xmm12 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm9 + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + movddup -12 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + movddup -14 * SIZE(AO), %xmm12 + mulpd %xmm9, %xmm12 + subpd %xmm12, %xmm1 + movddup -13 * SIZE(AO), %xmm14 + mulpd %xmm9, %xmm14 + subpd %xmm14, %xmm5 + + + movddup -11 * SIZE(AO), 
%xmm8 + mulpd %xmm8, %xmm13 + + movddup -10 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm1 + movddup -9 * SIZE(AO), %xmm12 + mulpd %xmm13, %xmm12 + subpd %xmm12, %xmm5 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + movddup -5 * SIZE(AO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm5 + + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 + + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm1, %xmm9 + subpd %xmm9, %xmm3 + + movddup -13 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 +#endif + +#ifdef RT + movddup -13 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 + + movddup -14 * SIZE(BO), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + movddup -14 * SIZE(BO), %xmm9 + mulpd %xmm3, %xmm9 + subpd %xmm9, %xmm1 + + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm9, 0 * SIZE(CO1) + movlpd %xmm13, 1 * SIZE(CO1) + movlpd %xmm1, 2 * SIZE(CO1) + movlpd %xmm5, 3 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + movhpd %xmm1, 2 * SIZE(CO2) + movhpd %xmm5, 3 * SIZE(CO2) +#else + movlpd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movlpd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movlpd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + movlpd %xmm3, 2 * SIZE(CO2) + movhpd %xmm3, 3 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm9, -16 * SIZE(BO) + movaps %xmm13,-14 * SIZE(BO) + movaps %xmm1, -12 * SIZE(BO) + movaps %xmm5, -10 * SIZE(BO) +#else + movaps %xmm0, -16 * SIZE(AO) + movaps %xmm1, -14 * SIZE(AO) + movaps %xmm2, -12 * SIZE(AO) + movaps %xmm3, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L51 + ALIGN_4 + +.L79: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L80: + testq $1, N + je .L999 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + + movq C, CO1 # coffset1 = c +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, M + je .L100 + +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#endif + + movq B, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (BO, %rax, SIZE), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd -14 * SIZE(AO), %xmm1 + pxor %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq 
(BO, %rax, 1), BO + negq %rax + NOBRANCH + je .L116 + ALIGN_4 + +.L112: + mulpd -16 * SIZE(BO, %rax, 1), %xmm0 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO, %rax, 1), %xmm0 + + mulpd -14 * SIZE(BO, %rax, 1), %xmm1 + addpd %xmm1, %xmm9 + movapd -10 * SIZE(AO, %rax, 1), %xmm1 + + addq $4 * SIZE, %rax + BRANCH + jl .L112 + ALIGN_4 + +.L116: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L118 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO + negq %rax + ALIGN_4 + +.L117: + mulsd -16 * SIZE(BO, %rax, 1), %xmm0 + addsd %xmm0, %xmm8 + movsd -15 * SIZE(AO, %rax, 1), %xmm0 + + addq $SIZE, %rax + jl .L117 + ALIGN_4 + +.L118: + addpd %xmm9, %xmm8 + haddpd %xmm8, %xmm8 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(BO), %xmm10 + subsd %xmm8, %xmm10 +#else + movsd -16 * SIZE(AO), %xmm10 + subsd %xmm8, %xmm10 +#endif + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 +#endif + +#if defined(RN) || defined(RT) + movsd -16 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm10 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + + movsd %xmm10, 0 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movlpd %xmm10, -16 * SIZE(BO) +#else + movlpd %xmm10, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + addq %rax, AO + addq %rax, BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L100: + testq $2, M + je .L110 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + movq B, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (BO, %rax, SIZE), BO +#endif + + movddup -16 * SIZE(BO), %xmm0 + pxor %xmm8, %xmm8 + movddup -15 * SIZE(BO), %xmm1 + pxor %xmm9, %xmm9 + movddup -14 * SIZE(BO), %xmm2 + pxor %xmm10, %xmm10 + movddup -13 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO + negq %rax + NOBRANCH + je .L106 + ALIGN_4 + +.L102: + mulpd -16 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm0, %xmm8 + movddup -12 * SIZE(BO, %rax, 1), %xmm0 + + mulpd -14 * SIZE(AO, %rax, 2), %xmm1 + addpd %xmm1, %xmm9 + movddup -11 * SIZE(BO, %rax, 1), %xmm1 + + mulpd -12 * SIZE(AO, %rax, 2), %xmm2 + addpd %xmm2, %xmm10 + movddup -10 * SIZE(BO, %rax, 1), %xmm2 + + mulpd -10 * SIZE(AO, %rax, 2), %xmm3 + addpd %xmm3, %xmm11 + movddup -9 * SIZE(BO, %rax, 1), %xmm3 + + addq $4 * SIZE, %rax + BRANCH + jl .L102 + ALIGN_4 + +.L106: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L109 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO + negq %rax + ALIGN_4 + +.L107: + movddup -16 * SIZE(BO, %rax, 1), %xmm0 + mulpd -16 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm0, %xmm8 + + addq $SIZE, %rax + jl .L107 + ALIGN_4 + +.L109: + 
addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 + addpd %xmm10, %xmm8 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm10 + subpd %xmm8, %xmm10 +#else + movapd -16 * SIZE(AO), %xmm10 + subpd %xmm8, %xmm10 +#endif + +#ifdef LN + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movsd -13 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + movsd -14 * SIZE(AO), %xmm13 + mulsd %xmm8, %xmm13 + subsd %xmm13, %xmm10 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + unpcklpd %xmm8, %xmm10 +#endif + +#ifdef LT + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + movsd -15 * SIZE(AO), %xmm13 + mulsd %xmm10, %xmm13 + subsd %xmm13, %xmm8 + + movsd -13 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + unpcklpd %xmm8, %xmm10 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm10 +#endif + +#ifdef RT + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm10 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) +#else + movlpd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm10, -16 * SIZE(BO) +#else + movaps %xmm10, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + addq %rax, BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L110: + movq M, I + sarq $2, I # i = (m >> 2) + jle .L119 + ALIGN_4 + +.L91: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + movq B, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (BO, %rax, SIZE), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd -8 * SIZE(AO), %xmm2 + pxor %xmm9, %xmm9 + movddup -16 * SIZE(BO), %xmm1 + pxor %xmm10, %xmm10 + movddup -15 * SIZE(BO), %xmm5 + pxor %xmm11, %xmm11 + movddup -14 * SIZE(BO), %xmm3 + +#ifndef LN + prefetchw 3 * SIZE(CO1) +#else + prefetchw -8 * SIZE(CO1) +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO + negq %rax + NOBRANCH + je .L96 + ALIGN_4 + +.L92: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm9 + movddup -12 * SIZE(BO, %rax, 1), %xmm1 + mulpd %xmm5, %xmm0 + mulpd -10 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm0, %xmm10 + movapd (AO, %rax, 4), %xmm0 + addpd %xmm5, %xmm11 + movddup -13 * SIZE(BO, %rax, 1), %xmm5 + mulpd %xmm3, %xmm2 + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 + addpd %xmm2, %xmm8 + movapd -4 * SIZE(AO, %rax, 4), %xmm2 + addpd %xmm3, %xmm9 + movddup -10 * SIZE(BO, %rax, 1), %xmm3 + mulpd %xmm5, %xmm2 + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm10 + movapd 8 * SIZE(AO, %rax, 4), %xmm2 + addpd %xmm5, %xmm11 + movddup -11 * SIZE(BO, %rax, 1), %xmm5 + + addq $4 * SIZE, %rax + BRANCH + jl 
.L92 + ALIGN_4 + +.L96: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L99 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO + negq %rax + ALIGN_4 + +.L97: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm9 + movddup -15 * SIZE(BO, %rax, 1), %xmm1 + + addq $SIZE, %rax + jl .L97 + ALIGN_4 +.L99: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm10 + movapd -14 * SIZE(BO), %xmm11 + + subpd %xmm8, %xmm10 + subpd %xmm9, %xmm11 +#else + movapd -16 * SIZE(AO), %xmm10 + movapd -14 * SIZE(AO), %xmm11 + + subpd %xmm8, %xmm10 + subpd %xmm9, %xmm11 +#endif + +#ifdef LN + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movapd %xmm11, %xmm9 + unpckhpd %xmm9, %xmm9 + + movsd -1 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm9 + + movsd -2 * SIZE(AO), %xmm13 + mulsd %xmm9, %xmm13 + subsd %xmm13, %xmm11 + movsd -3 * SIZE(AO), %xmm14 + mulsd %xmm9, %xmm14 + subsd %xmm14, %xmm8 + movsd -4 * SIZE(AO), %xmm15 + mulsd %xmm9, %xmm15 + subsd %xmm15, %xmm10 + + movsd -6 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm11 + + movsd -7 * SIZE(AO), %xmm13 + mulsd %xmm11, %xmm13 + subsd %xmm13, %xmm8 + movsd -8 * SIZE(AO), %xmm14 + mulsd %xmm11, %xmm14 + subsd %xmm14, %xmm10 + + movsd -11 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + movsd -12 * SIZE(AO), %xmm13 + mulsd %xmm8, %xmm13 + subsd %xmm13, %xmm10 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + unpcklpd %xmm8, %xmm10 + unpcklpd %xmm9, %xmm11 +#endif + +#ifdef LT + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movapd %xmm11, %xmm9 + unpckhpd %xmm9, %xmm9 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + movsd -15 * SIZE(AO), %xmm13 + mulsd %xmm10, %xmm13 + subsd %xmm13, %xmm8 + movsd -14 * SIZE(AO), %xmm14 + mulsd %xmm10, %xmm14 + subsd %xmm14, %xmm11 + movsd -13 * SIZE(AO), %xmm15 + mulsd %xmm10, %xmm15 + subsd %xmm15, %xmm9 + + movsd -11 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + movsd -10 * SIZE(AO), %xmm13 + mulsd %xmm8, %xmm13 + subsd %xmm13, %xmm11 + movsd -9 * SIZE(AO), %xmm14 + mulsd %xmm8, %xmm14 + subsd %xmm14, %xmm9 + + movsd -6 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm11 + + movsd -5 * SIZE(AO), %xmm13 + mulsd %xmm11, %xmm13 + subsd %xmm13, %xmm9 + + movsd -1 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm9 + + unpcklpd %xmm8, %xmm10 + unpcklpd %xmm9, %xmm11 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm10 + mulpd %xmm8, %xmm11 +#endif + +#ifdef RT + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm10 + mulpd %xmm8, %xmm11 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + + movlpd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) + movlpd %xmm11, 2 * SIZE(CO1) + movhpd %xmm11, 3 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movaps %xmm10, -16 * SIZE(BO) + movaps %xmm11, -14 * SIZE(BO) +#else + movaps %xmm10, -16 * SIZE(AO) + movaps %xmm11, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + addq %rax, BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 
+ BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L91 + ALIGN_4 + +.L119: +#ifdef LN + leaq (B, K, SIZE), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L999: + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/trsm_kernel_LN_4x4_core2.S b/kernel/x86_64/trsm_kernel_LN_4x4_core2.S new file mode 100644 index 0000000000..fc5284ae56 --- /dev/null +++ b/kernel/x86_64/trsm_kernel_LN_4x4_core2.S @@ -0,0 +1,3739 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define J 0(%rsp) +#define OFFSET 8(%rsp) +#define KK 16(%rsp) +#define KKK 24(%rsp) +#define AORIG 32(%rsp) +#define BORIG 40(%rsp) +#define BUFFER 128(%rsp) + +#define PREFETCH_R (8 * 4 + 0) +#define PREFETCH_W (PREFETCH_R) + +#define PREFETCHSIZE (8 * 17 + 2) +#define PREFETCH prefetcht0 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C +#endif + + movq OLD_LDC, LDC + movq OLD_OFFSET, %rax + + movq %rsp, %r15 # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movq %rax, KK + movq %rax, OFFSET + + movq OLD_M, M + movq OLD_N, N + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + leaq (, LDC, SIZE), LDC + +#ifdef LN + leaq (, M, SIZE), %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + leaq (, N, SIZE), %rax + imulq K, %rax + addq %rax, B + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + movq N, J + sarq $2, J # j = (n >> 2) + jle .L40 + +.L01: +/* Copying to Sub Buffer */ + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq 16 * SIZE + BUFFER, BO + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + leaq (, %rax, SIZE), %rax + leaq (B, %rax, 4), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L03 + ALIGN_4 + +.L02: + prefetcht0 (PREFETCH_R + 0) * SIZE(B) + movapd -16 * SIZE(B), %xmm0 + movapd -14 * SIZE(B), %xmm1 + movapd -12 * SIZE(B), %xmm2 + movapd -10 * SIZE(B), %xmm3 + movapd -8 * SIZE(B), %xmm4 + movapd -6 * SIZE(B), %xmm5 + movapd -4 * SIZE(B), %xmm6 + movapd -2 * SIZE(B), %xmm7 + + prefetcht0 (PREFETCH_R + 8) * SIZE(B) + movddup %xmm0, %xmm8 + unpckhpd %xmm0, %xmm0 + movddup %xmm1, %xmm9 + unpckhpd %xmm1, %xmm1 + movddup %xmm2, %xmm10 + unpckhpd %xmm2, %xmm2 + movddup %xmm3, %xmm11 + unpckhpd %xmm3, %xmm3 + movddup %xmm4, %xmm12 + unpckhpd %xmm4, %xmm4 + movddup %xmm5, %xmm13 + unpckhpd %xmm5, %xmm5 + movddup %xmm6, %xmm14 + 
unpckhpd %xmm6, %xmm6 + movddup %xmm7, %xmm15 + unpckhpd %xmm7, %xmm7 + + prefetcht0 (PREFETCH_W + 0) * SIZE(BO) + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm0, -14 * SIZE(BO) + movapd %xmm9, -12 * SIZE(BO) + movapd %xmm1, -10 * SIZE(BO) + + prefetcht0 (PREFETCH_W + 8) * SIZE(BO) + movapd %xmm10, -8 * SIZE(BO) + movapd %xmm2, -6 * SIZE(BO) + movapd %xmm11, -4 * SIZE(BO) + movapd %xmm3, -2 * SIZE(BO) + + prefetcht0 (PREFETCH_W + 16) * SIZE(BO) + movapd %xmm12, 0 * SIZE(BO) + movapd %xmm4, 2 * SIZE(BO) + movapd %xmm13, 4 * SIZE(BO) + movapd %xmm5, 6 * SIZE(BO) + + prefetcht0 (PREFETCH_W + 24) * SIZE(BO) + movapd %xmm14, 8 * SIZE(BO) + movapd %xmm6, 10 * SIZE(BO) + movapd %xmm15, 12 * SIZE(BO) + movapd %xmm7, 14 * SIZE(BO) + + subq $-16 * SIZE, B + subq $-32 * SIZE, BO + subq $1, %rax + jne .L02 + ALIGN_4 + +.L03: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L10 + ALIGN_4 + +.L04: + movapd -16 * SIZE(B), %xmm0 + movapd -14 * SIZE(B), %xmm1 + + movddup %xmm0, %xmm8 + unpckhpd %xmm0, %xmm0 + movddup %xmm1, %xmm9 + unpckhpd %xmm1, %xmm1 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm0, -14 * SIZE(BO) + movapd %xmm9, -12 * SIZE(BO) + movapd %xmm1, -10 * SIZE(BO) + + addq $4 * SIZE, B + addq $8 * SIZE, BO + subq $1, %rax + jne .L04 + ALIGN_4 + +.L10: + leaq (PREFETCH_R + 0) * SIZE(B), BB + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc +#ifndef RT + leaq (C, LDC, 4), C +#endif + + testq $1, M + je .L20 + ALIGN_4 + +.L31: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movsd -16 * SIZE(AO), %xmm0 + movsd -16 * SIZE(BO), %xmm2 + movsd -14 * SIZE(BO), %xmm3 + movsd -12 * SIZE(BO), %xmm4 + movsd -10 * SIZE(BO), %xmm5 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 + + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + addsd %xmm4, %xmm10 + addsd %xmm5, %xmm11 + + movsd -15 * SIZE(AO), %xmm0 + movsd -8 * SIZE(BO), %xmm2 + movsd -6 * SIZE(BO), %xmm3 + movsd -4 * SIZE(BO), %xmm4 + movsd -2 * SIZE(BO), %xmm5 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 + + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + addsd %xmm4, %xmm10 + addsd %xmm5, %xmm11 + + movsd -14 * SIZE(AO), %xmm0 + movsd 0 * SIZE(BO), %xmm2 + movsd 2 * SIZE(BO), %xmm3 + movsd 4 * SIZE(BO), %xmm4 + movsd 6 * SIZE(BO), %xmm5 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 + + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + addsd %xmm4, %xmm10 + addsd %xmm5, %xmm11 + + movsd -13 * SIZE(AO), %xmm0 + movsd 8 * SIZE(BO), %xmm2 + movsd 10 * SIZE(BO), %xmm3 + movsd 12 * SIZE(BO), %xmm4 + movsd 14 * SIZE(BO), %xmm5 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 + + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + addsd 
%xmm4, %xmm10 + addsd %xmm5, %xmm11 + + subq $ -4 * SIZE, AO + subq $-32 * SIZE, BO + subq $1, %rax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + movsd -16 * SIZE(AO), %xmm0 + movsd -16 * SIZE(BO), %xmm2 + movsd -14 * SIZE(BO), %xmm3 + movsd -12 * SIZE(BO), %xmm4 + movsd -10 * SIZE(BO), %xmm5 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 + + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + addsd %xmm4, %xmm10 + addsd %xmm5, %xmm11 + + addq $1 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + jg .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(B), %xmm12 + movsd -15 * SIZE(B), %xmm13 + movsd -14 * SIZE(B), %xmm14 + movsd -13 * SIZE(B), %xmm15 +#else + movsd -16 * SIZE(AO), %xmm12 + movsd -15 * SIZE(AO), %xmm13 + movsd -14 * SIZE(AO), %xmm14 + movsd -13 * SIZE(AO), %xmm15 +#endif + + subsd %xmm8, %xmm12 + subsd %xmm9, %xmm13 + subsd %xmm10, %xmm14 + subsd %xmm11, %xmm15 + +#ifdef LN + movsd -16 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm12 + mulsd %xmm8, %xmm13 + mulsd %xmm8, %xmm14 + mulsd %xmm8, %xmm15 +#endif + +#ifdef LT + movsd -16 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm12 + mulsd %xmm8, %xmm13 + mulsd %xmm8, %xmm14 + mulsd %xmm8, %xmm15 +#endif + +#ifdef RN + mulsd -16 * SIZE(B), %xmm12 + movlpd -15 * SIZE(B), %xmm9 + mulsd %xmm12, %xmm9 + subsd %xmm9, %xmm13 + movlpd -14 * SIZE(B), %xmm10 + mulsd %xmm12, %xmm10 + subsd %xmm10, %xmm14 + movlpd -13 * SIZE(B), %xmm11 + mulsd %xmm12, %xmm11 + subsd %xmm11, %xmm15 + + mulsd -11 * SIZE(B), %xmm13 + movlpd -10 * SIZE(B), %xmm9 + mulsd %xmm13, %xmm9 + subsd %xmm9, %xmm14 + movlpd -9 * SIZE(B), %xmm10 + mulsd %xmm13, %xmm10 + subsd %xmm10, %xmm15 + + mulsd -6 * SIZE(B), %xmm14 + movlpd -5 * SIZE(B), %xmm9 + mulsd %xmm14, %xmm9 + subsd %xmm9, %xmm15 + + mulsd -1 * SIZE(B), %xmm15 +#endif + +#ifdef RT + mulsd -1 * SIZE(B), %xmm15 + + movlpd -2 * SIZE(B), %xmm9 + mulsd %xmm15, %xmm9 + subsd %xmm9, %xmm14 + movlpd -3 * SIZE(B), %xmm10 + mulsd %xmm15, %xmm10 + subsd %xmm10, %xmm13 + movlpd -4 * SIZE(B), %xmm11 + mulsd %xmm15, %xmm11 + subsd %xmm11, %xmm12 + + mulsd -6 * SIZE(B), %xmm14 + + movlpd -7 * SIZE(B), %xmm9 + mulsd %xmm14, %xmm9 + subsd %xmm9, %xmm13 + movlpd -8 * SIZE(B), %xmm10 + mulsd %xmm14, %xmm10 + subsd %xmm10, %xmm12 + + mulsd -11 * SIZE(B), %xmm13 + + movlpd -12 * SIZE(B), %xmm9 + mulsd %xmm13, %xmm9 + subsd %xmm9, %xmm12 + + mulsd -16 * SIZE(B), %xmm12 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + movsd %xmm12, 0 * SIZE(CO1) + movsd %xmm13, 0 * SIZE(CO2) + movsd %xmm14, 0 * SIZE(CO1, LDC, 2) + movsd %xmm15, 0 * SIZE(CO2, LDC, 2) + +#if defined(LN) || defined(LT) + movsd %xmm12, -16 * SIZE(B) + movsd %xmm13, -15 * SIZE(B) + movsd %xmm14, -14 * SIZE(B) + movsd %xmm15, -13 * SIZE(B) + + movsd %xmm12, -16 * SIZE(BO) + movsd %xmm12, -15 * SIZE(BO) + movsd %xmm13, -14 * SIZE(BO) + movsd %xmm13, -13 * SIZE(BO) + movsd %xmm14, -12 * SIZE(BO) + movsd %xmm14, -11 * SIZE(BO) + movsd %xmm15, -10 * SIZE(BO) + movsd %xmm15, -9 * SIZE(BO) +#else + movsd %xmm12, -16 * SIZE(AO) + movsd %xmm13, -15 * SIZE(AO) + movsd %xmm14, -14 * SIZE(AO) 
+ movsd %xmm15, -13 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L20: + testq $2, M + je .L30 + ALIGN_4 + +.L21: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + movapd -12 * SIZE(BO), %xmm4 + movapd -10 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm11 + + movapd -14 * SIZE(AO), %xmm0 + movapd -8 * SIZE(BO), %xmm2 + movapd -6 * SIZE(BO), %xmm3 + movapd -4 * SIZE(BO), %xmm4 + movapd -2 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm11 + + movapd -12 * SIZE(AO), %xmm0 + movapd 0 * SIZE(BO), %xmm2 + movapd 2 * SIZE(BO), %xmm3 + movapd 4 * SIZE(BO), %xmm4 + movapd 6 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm11 + + movapd -10 * SIZE(AO), %xmm0 + movapd 8 * SIZE(BO), %xmm2 + movapd 10 * SIZE(BO), %xmm3 + movapd 12 * SIZE(BO), %xmm4 + movapd 14 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm11 + + subq $ -8 * SIZE, AO + subq $-32 * SIZE, BO + subq $1, %rax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L29 + ALIGN_4 + +.L26: + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + movapd -12 * SIZE(BO), %xmm4 + movapd -10 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm11 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + jne .L26 + ALIGN_4 + +.L29: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd %xmm10, %xmm2 + unpcklpd %xmm11, 
%xmm10 + unpckhpd %xmm11, %xmm2 + + movapd -16 * SIZE(B), %xmm9 + movapd -14 * SIZE(B), %xmm11 + movapd -12 * SIZE(B), %xmm13 + movapd -10 * SIZE(B), %xmm15 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm0, %xmm13 + subpd %xmm2, %xmm15 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm2 + movapd -12 * SIZE(AO), %xmm4 + movapd -10 * SIZE(AO), %xmm6 + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm2 + subpd %xmm10, %xmm4 + subpd %xmm11, %xmm6 +#endif + +#ifdef LN + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 + + movddup -14 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + mulpd %xmm15, %xmm12 + subpd %xmm12, %xmm11 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 + + movddup -15 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + mulpd %xmm11, %xmm12 + subpd %xmm12, %xmm15 + + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm0 + + movddup -15 * SIZE(B), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + movddup -14 * SIZE(B), %xmm10 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm4 + movddup -13 * SIZE(B), %xmm11 + mulpd %xmm0, %xmm11 + subpd %xmm11, %xmm6 + + movddup -11 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm2 + movddup -10 * SIZE(B), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm4 + movddup -9 * SIZE(B), %xmm10 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm6 + + movddup -6 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm4 + + movddup -5 * SIZE(B), %xmm9 + mulpd %xmm4, %xmm9 + subpd %xmm9, %xmm6 + + movddup -1 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm6 +#endif + +#ifdef RT + movddup -1 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm6 + + movddup -2 * SIZE(B), %xmm9 + mulpd %xmm6, %xmm9 + subpd %xmm9, %xmm4 + movddup -3 * SIZE(B), %xmm10 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm2 + movddup -4 * SIZE(B), %xmm11 + mulpd %xmm6, %xmm11 + subpd %xmm11, %xmm0 + + movddup -6 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm4 + movddup -7 * SIZE(B), %xmm9 + mulpd %xmm4, %xmm9 + subpd %xmm9, %xmm2 + movddup -8 * SIZE(B), %xmm10 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm0 + + movddup -11 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm2 + movddup -12 * SIZE(B), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm13, 1 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + + movsd %xmm11, 0 * SIZE(CO1, LDC, 2) + movsd %xmm15, 1 * SIZE(CO1, LDC, 2) + + movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + + movsd %xmm4, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) + + movsd %xmm6, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(B) + movapd %xmm11, -14 * SIZE(B) + movapd %xmm13, -12 * SIZE(B) + movapd %xmm15, -10 * SIZE(B) + + movddup %xmm9, %xmm8 + SHUFPD_3 %xmm9, %xmm9 + movddup %xmm11, %xmm10 + SHUFPD_3 %xmm11, %xmm11 + movddup %xmm13, %xmm12 + SHUFPD_3 %xmm13, %xmm13 + movddup %xmm15, %xmm14 + SHUFPD_3 %xmm15, %xmm15 + + movapd %xmm8, -16 * SIZE(BO) + movapd 
%xmm9, -14 * SIZE(BO) + movapd %xmm10, -12 * SIZE(BO) + movapd %xmm11, -10 * SIZE(BO) + movapd %xmm12, -8 * SIZE(BO) + movapd %xmm13, -6 * SIZE(BO) + movapd %xmm14, -4 * SIZE(BO) + movapd %xmm15, -2 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm2, -14 * SIZE(AO) + movapd %xmm4, -12 * SIZE(AO) + movapd %xmm6, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + movq M, I + sarq $2, I # i = (m >> 2) + jle .L39 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + prefetcht2 0 * SIZE(BB) + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifdef LN + prefetcht2 -3 * SIZE(CO1) + pxor %xmm12, %xmm12 + prefetcht2 -3 * SIZE(CO2) + pxor %xmm13, %xmm13 + prefetcht2 -3 * SIZE(CO1, LDC, 2) + pxor %xmm14, %xmm14 + prefetcht2 -3 * SIZE(CO2, LDC, 2) + pxor %xmm15, %xmm15 +#else + prefetcht2 3 * SIZE(CO1) + pxor %xmm12, %xmm12 + prefetcht2 3 * SIZE(CO2) + pxor %xmm13, %xmm13 + prefetcht2 3 * SIZE(CO1, LDC, 2) + pxor %xmm14, %xmm14 + prefetcht2 3 * SIZE(CO2, LDC, 2) + pxor %xmm15, %xmm15 +#endif + + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + + subq $-8 * SIZE, BB + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm2, %xmm10 + movapd -16 * SIZE(AO), %xmm0 + addpd %xmm3, %xmm14 + movapd -16 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -14 * SIZE(AO), %xmm1 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm11 + movapd -14 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm15 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + movapd -12 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm9 + movapd -10 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm13 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movapd -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm14 + movapd -8 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -10 * SIZE(AO), %xmm1 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm11 + addpd %xmm5, %xmm15 + movapd -6 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + movapd -4 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm9 + addpd %xmm5, %xmm13 + movapd -2 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + movapd -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm14 + movapd 0 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -6 * SIZE(AO), %xmm1 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd 
%xmm4, %xmm11 + addpd %xmm5, %xmm15 + movapd 2 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + movapd 4 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm9 + addpd %xmm5, %xmm13 + movapd 6 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movapd -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm14 + movapd 8 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -2 * SIZE(AO), %xmm1 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm11 + addpd %xmm5, %xmm15 + movapd 10 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + addq $32 * SIZE, BO + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + movapd -20 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + subq $-16 * SIZE, AO + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm9 + addpd %xmm5, %xmm13 + movapd -18 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + subq $1, %rax + mulpd %xmm1, %xmm5 + + BRANCH + jg .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L19 + ALIGN_4 + +.L16: + movapd -16 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm10 + movapd -16 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm14 + movapd %xmm2, %xmm3 + movapd -14 * SIZE(AO), %xmm1 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm11 + movapd -14 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm15 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + movapd -12 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm12 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm9 + movapd -10 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm13 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addq $4 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + subq $1, %rax + BRANCH + jg .L16 + ALIGN_4 + +.L19: + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm14 + addpd %xmm4, %xmm11 + addpd %xmm5, %xmm15 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd %xmm10, %xmm2 + unpcklpd %xmm11, %xmm10 + unpckhpd %xmm11, %xmm2 + + movapd %xmm12, %xmm4 + unpcklpd %xmm13, %xmm12 + unpckhpd %xmm13, %xmm4 + + movapd %xmm14, %xmm6 + unpcklpd %xmm15, %xmm14 + unpckhpd %xmm15, %xmm6 + + movapd -16 * SIZE(B), %xmm9 + movapd -14 * SIZE(B), %xmm11 + movapd -12 * SIZE(B), %xmm13 + movapd -10 * SIZE(B), %xmm15 + movapd -8 * SIZE(B), %xmm1 + movapd -6 * SIZE(B), %xmm3 + movapd -4 * SIZE(B), %xmm5 + movapd -2 * SIZE(B), %xmm7 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm0, %xmm13 + subpd %xmm2, %xmm15 + subpd %xmm12, %xmm1 + subpd %xmm14, %xmm3 + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -12 * SIZE(AO), %xmm2 + movapd -10 * SIZE(AO), %xmm3 + + movapd -8 * SIZE(AO), %xmm4 + movapd -6 * SIZE(AO), %xmm5 + movapd -4 * SIZE(AO), %xmm6 + movapd -2 * SIZE(AO), %xmm7 + + subpd %xmm8, %xmm0 + subpd %xmm12, %xmm1 + subpd %xmm9, %xmm2 + subpd %xmm13, %xmm3 + subpd %xmm10, %xmm4 + subpd %xmm14, %xmm5 + subpd %xmm11, %xmm6 
+ subpd %xmm15, %xmm7 +#endif + +#ifdef LN + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 + mulpd %xmm8, %xmm7 + + movddup -2 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm1 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm3 + + movddup -3 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm13 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm15 + + movddup -4 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm9 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm11 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 + + movddup -7 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm13 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm15 + + movddup -8 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm9 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm11 + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 + + movddup -12 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + mulpd %xmm15, %xmm12 + subpd %xmm12, %xmm11 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 + + movddup -15 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + mulpd %xmm11, %xmm12 + subpd %xmm12, %xmm15 + + movddup -14 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm1 + mulpd %xmm11, %xmm12 + subpd %xmm12, %xmm3 + + movddup -13 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm5 + mulpd %xmm11, %xmm12 + subpd %xmm12, %xmm7 + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 + + movddup -10 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm1 + mulpd %xmm15, %xmm12 + subpd %xmm12, %xmm3 + + movddup -9 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm5 + mulpd %xmm15, %xmm12 + subpd %xmm12, %xmm7 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 + + movddup -5 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm5 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm7 + + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 + mulpd %xmm8, %xmm7 +#endif + + +#ifdef RN + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 + + movddup -15 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm2 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm3 + + movddup -14 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm4 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm5 + + movddup -13 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm6 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm7 + + movddup -11 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 + + movddup -10 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm4 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm5 + + movddup -9 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm6 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm7 + + movddup -6 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm4 + mulpd %xmm8, %xmm5 + + movddup -5 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm6 + mulpd %xmm5, %xmm12 + subpd 
%xmm12, %xmm7 + + movddup -1 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm6 + mulpd %xmm8, %xmm7 +#endif + +#ifdef RT + movddup -1 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm6 + mulpd %xmm8, %xmm7 + + movddup -2 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm4 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm5 + + movddup -3 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm2 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm3 + + movddup -4 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm0 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm1 + + movddup -6 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm4 + mulpd %xmm8, %xmm5 + + movddup -7 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm2 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm3 + + movddup -8 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm0 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm1 + + movddup -11 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 + + movddup -12 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm0 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm1 + + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm13, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movsd %xmm5, 3 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + movhpd %xmm1, 2 * SIZE(CO2) + movhpd %xmm5, 3 * SIZE(CO2) + + movsd %xmm11, 0 * SIZE(CO1, LDC, 2) + movsd %xmm15, 1 * SIZE(CO1, LDC, 2) + movsd %xmm3, 2 * SIZE(CO1, LDC, 2) + movsd %xmm7, 3 * SIZE(CO1, LDC, 2) + + movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) + movhpd %xmm3, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + movsd %xmm3, 2 * SIZE(CO2) + movhpd %xmm3, 3 * SIZE(CO2) + + movsd %xmm4, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) + movsd %xmm5, 2 * SIZE(CO1, LDC, 2) + movhpd %xmm5, 3 * SIZE(CO1, LDC, 2) + + movsd %xmm6, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) + movsd %xmm7, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(B) + movapd %xmm11, -14 * SIZE(B) + movapd %xmm13, -12 * SIZE(B) + movapd %xmm15, -10 * SIZE(B) + movapd %xmm1, -8 * SIZE(B) + movapd %xmm3, -6 * SIZE(B) + movapd %xmm5, -4 * SIZE(B) + movapd %xmm7, -2 * SIZE(B) + + movddup %xmm9, %xmm8 + SHUFPD_3 %xmm9, %xmm9 + movddup %xmm11, %xmm10 + SHUFPD_3 %xmm11, %xmm11 + movddup %xmm13, %xmm12 + SHUFPD_3 %xmm13, %xmm13 + movddup %xmm15, %xmm14 + SHUFPD_3 %xmm15, %xmm15 + movddup %xmm1, %xmm0 + SHUFPD_3 %xmm1, %xmm1 + movddup %xmm3, %xmm2 + SHUFPD_3 %xmm3, %xmm3 + movddup %xmm5, %xmm4 + SHUFPD_3 %xmm5, %xmm5 + movddup %xmm7, %xmm6 + SHUFPD_3 %xmm7, %xmm7 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) + movapd %xmm10, -12 * SIZE(BO) + movapd %xmm11, -10 * SIZE(BO) + movapd %xmm12, -8 * SIZE(BO) + movapd %xmm13, -6 * SIZE(BO) + movapd %xmm14, -4 * SIZE(BO) + movapd %xmm15, -2 * SIZE(BO) + movapd %xmm0, 0 * SIZE(BO) + movapd %xmm1, 2 * SIZE(BO) + movapd %xmm2, 4 * SIZE(BO) + movapd %xmm3, 6 * SIZE(BO) + movapd %xmm4, 8 * SIZE(BO) + 
movapd %xmm5, 10 * SIZE(BO) + movapd %xmm6, 12 * SIZE(BO) + movapd %xmm7, 14 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm1, -14 * SIZE(AO) + movapd %xmm2, -12 * SIZE(AO) + movapd %xmm3, -10 * SIZE(AO) + movapd %xmm4, -8 * SIZE(AO) + movapd %xmm5, -6 * SIZE(AO) + movapd %xmm6, -4 * SIZE(AO) + movapd %xmm7, -2 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $16 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L11 + ALIGN_4 + + +.L39: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + + decq J # j -- + jg .L01 + ALIGN_4 + +.L40: + testq $3, N + je .L999 + + testq $2, N + je .L80 + ALIGN_4 + +.L41: +/* Copying to Sub Buffer */ + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + leaq (, %rax, SIZE), %rax + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L43 + ALIGN_4 + +.L42: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + movddup -14 * SIZE(B), %xmm2 + movddup -13 * SIZE(B), %xmm3 + movddup -12 * SIZE(B), %xmm4 + movddup -11 * SIZE(B), %xmm5 + movddup -10 * SIZE(B), %xmm6 + movddup -9 * SIZE(B), %xmm7 + + movapd %xmm0, 0 * SIZE(BO) + movapd %xmm1, 2 * SIZE(BO) + movapd %xmm2, 4 * SIZE(BO) + movapd %xmm3, 6 * SIZE(BO) + movapd %xmm4, 8 * SIZE(BO) + movapd %xmm5, 10 * SIZE(BO) + movapd %xmm6, 12 * SIZE(BO) + movapd %xmm7, 14 * SIZE(BO) + + addq $8 * SIZE, B + addq $16 * SIZE, BO + subq $1, %rax + jne .L42 + ALIGN_4 + +.L43: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L50 + ALIGN_4 + +.L44: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + + movapd %xmm0, 0 * SIZE(BO) + movapd %xmm1, 2 * SIZE(BO) + + addq $2 * SIZE, B + addq $4 * SIZE, BO + decq %rax + jne .L44 + ALIGN_4 + +.L50: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc +#ifndef RT + leaq (C, LDC, 2), C +#endif + + testq $1, M + je .L60 + ALIGN_4 + +.L71: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L75 + ALIGN_4 + +.L72: + 
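+/* .L72: 1x2 micro-kernel k-loop, unrolled by 4: one row of A (M & 1 case) times two columns of the duplicated B buffer, accumulated with scalar SSE2 multiply-adds */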
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movsd -16 * SIZE(AO), %xmm0 + movsd -15 * SIZE(AO), %xmm1 + movsd -16 * SIZE(BO), %xmm2 + movsd -14 * SIZE(BO), %xmm3 + movsd -12 * SIZE(BO), %xmm4 + movsd -10 * SIZE(BO), %xmm5 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + mulsd %xmm1, %xmm4 + mulsd %xmm1, %xmm5 + + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + addsd %xmm4, %xmm10 + addsd %xmm5, %xmm11 + + movsd -14 * SIZE(AO), %xmm0 + movsd -13 * SIZE(AO), %xmm1 + movsd -8 * SIZE(BO), %xmm2 + movsd -6 * SIZE(BO), %xmm3 + movsd -4 * SIZE(BO), %xmm4 + movsd -2 * SIZE(BO), %xmm5 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + mulsd %xmm1, %xmm4 + mulsd %xmm1, %xmm5 + + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + addsd %xmm4, %xmm10 + addsd %xmm5, %xmm11 + + subq $ -4 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + movsd -16 * SIZE(AO), %xmm0 + movsd -16 * SIZE(BO), %xmm2 + movsd -14 * SIZE(BO), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L76 + ALIGN_4 + +.L78: + addsd %xmm10, %xmm8 + addsd %xmm11, %xmm9 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(B), %xmm12 + movsd -15 * SIZE(B), %xmm13 +#else + movsd -16 * SIZE(AO), %xmm12 + movsd -15 * SIZE(AO), %xmm13 +#endif + + subsd %xmm8, %xmm12 + subsd %xmm9, %xmm13 + +#ifdef LN + movsd -16 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm12 + mulsd %xmm8, %xmm13 +#endif + +#ifdef LT + movsd -16 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm12 + mulsd %xmm8, %xmm13 +#endif + +#ifdef RN + mulsd -16 * SIZE(B), %xmm12 + movsd -15 * SIZE(B), %xmm9 + mulsd %xmm12, %xmm9 + subsd %xmm9, %xmm13 + + mulsd -13 * SIZE(B), %xmm13 +#endif + +#ifdef RT + mulsd -13 * SIZE(B), %xmm13 + + movlpd -14 * SIZE(B), %xmm9 + mulsd %xmm13, %xmm9 + subsd %xmm9, %xmm12 + + mulsd -16 * SIZE(B), %xmm12 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + movsd %xmm12, 0 * SIZE(CO1) + movsd %xmm13, 0 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movsd %xmm12, -16 * SIZE(B) + movsd %xmm13, -15 * SIZE(B) + + movsd %xmm12, -16 * SIZE(BO) + movsd %xmm12, -15 * SIZE(BO) + movsd %xmm13, -14 * SIZE(BO) + movsd %xmm13, -13 * SIZE(BO) +#else + movsd %xmm12, -16 * SIZE(AO) + movsd %xmm13, -15 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $2 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L60: + testq $2, M + je .L70 + ALIGN_4 + +.L61: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq 
(BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L65 + ALIGN_4 + +.L62: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + movapd -12 * SIZE(BO), %xmm4 + movapd -10 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm11 + + movapd -12 * SIZE(AO), %xmm0 + movapd -10 * SIZE(AO), %xmm1 + movapd -8 * SIZE(BO), %xmm2 + movapd -6 * SIZE(BO), %xmm3 + movapd -4 * SIZE(BO), %xmm4 + movapd -2 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm11 + + subq $ -8 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jne .L62 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L69 + ALIGN_4 + +.L66: + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L66 + ALIGN_4 + +.L69: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd -16 * SIZE(B), %xmm9 + movapd -14 * SIZE(B), %xmm13 + + subpd %xmm8, %xmm9 + subpd %xmm0, %xmm13 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm2 + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm2 +#endif + + +#ifdef LN + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + + movddup -14 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm0 + + movddup -15 * SIZE(B), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + + movddup -13 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm2 +#endif + +#ifdef RT + movddup -13 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm2 + + movddup -14 * SIZE(B), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm13, 1 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(B) + movapd %xmm13, -14 * SIZE(B) + + movddup %xmm9, %xmm8 + SHUFPD_3 %xmm9, %xmm9 + movddup %xmm13, %xmm12 + 
SHUFPD_3 %xmm13, %xmm13 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) + movapd %xmm12, -12 * SIZE(BO) + movapd %xmm13, -10 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm2, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L70: + movq M, I + sarq $2, I # i = (m >> 2) + jle .L79 + ALIGN_4 + +.L51: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + +#ifdef LN + prefetcht2 -3 * SIZE(CO1) + pxor %xmm12, %xmm12 + prefetcht2 -3 * SIZE(CO2) + pxor %xmm13, %xmm13 +#else + prefetcht2 3 * SIZE(CO1) + pxor %xmm12, %xmm12 + prefetcht2 3 * SIZE(CO2) + pxor %xmm13, %xmm13 +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L55 + ALIGN_4 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + + movapd -16 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -14 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + addpd %xmm4, %xmm9 + addpd %xmm5, %xmm13 + + movapd -12 * SIZE(AO), %xmm0 + movapd -10 * SIZE(AO), %xmm1 + + movapd -12 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -10 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + addpd %xmm4, %xmm9 + addpd %xmm5, %xmm13 + + movapd -8 * SIZE(AO), %xmm0 + movapd -6 * SIZE(AO), %xmm1 + + movapd -8 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -6 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + addpd %xmm4, %xmm9 + addpd %xmm5, %xmm13 + + movapd -4 * SIZE(AO), %xmm0 + movapd -2 * SIZE(AO), %xmm1 + + movapd -4 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -2 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + addpd %xmm4, %xmm9 + addpd %xmm5, %xmm13 + + subq $-16 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L59 + ALIGN_4 + +.L56: + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + + movapd -16 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -14 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + addpd %xmm4, %xmm9 + addpd %xmm5, %xmm13 + + addq $4 * SIZE, AO + addq $4 * SIZE, 
BO + subq $1, %rax + jg .L56 + ALIGN_4 + +.L59: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd %xmm12, %xmm4 + unpcklpd %xmm13, %xmm12 + unpckhpd %xmm13, %xmm4 + + movapd -16 * SIZE(B), %xmm9 + movapd -14 * SIZE(B), %xmm13 + movapd -12 * SIZE(B), %xmm1 + movapd -10 * SIZE(B), %xmm5 + + subpd %xmm8, %xmm9 + subpd %xmm0, %xmm13 + subpd %xmm12, %xmm1 + subpd %xmm4, %xmm5 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -12 * SIZE(AO), %xmm2 + movapd -10 * SIZE(AO), %xmm3 + + subpd %xmm8, %xmm0 + subpd %xmm12, %xmm1 + subpd %xmm9, %xmm2 + subpd %xmm13, %xmm3 +#endif + +#ifdef LN + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 + movddup -2 * SIZE(AO), %xmm10 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm1 + movddup -3 * SIZE(AO), %xmm12 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm13 + movddup -4 * SIZE(AO), %xmm14 + mulpd %xmm5, %xmm14 + subpd %xmm14, %xmm9 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + movddup -7 * SIZE(AO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm13 + movddup -8 * SIZE(AO), %xmm12 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm9 + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + movddup -12 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + movddup -14 * SIZE(AO), %xmm12 + mulpd %xmm9, %xmm12 + subpd %xmm12, %xmm1 + movddup -13 * SIZE(AO), %xmm14 + mulpd %xmm9, %xmm14 + subpd %xmm14, %xmm5 + + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + + movddup -10 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm1 + movddup -9 * SIZE(AO), %xmm12 + mulpd %xmm13, %xmm12 + subpd %xmm12, %xmm5 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + movddup -5 * SIZE(AO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm5 + + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 + + movddup -15 * SIZE(B), %xmm9 + movapd %xmm9, %xmm10 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm3 + + movddup -13 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 +#endif + +#ifdef RT + movddup -13 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 + + movddup -14 * SIZE(B), %xmm9 + movapd %xmm9, %xmm10 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + mulpd %xmm3, %xmm10 + subpd %xmm10, %xmm1 + + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm13, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movsd %xmm5, 3 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + movhpd %xmm1, 2 * SIZE(CO2) + movhpd %xmm5, 3 * SIZE(CO2) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + movsd %xmm3, 2 * SIZE(CO2) + movhpd %xmm3, 3 * 
SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(B) + movapd %xmm13, -14 * SIZE(B) + movapd %xmm1, -12 * SIZE(B) + movapd %xmm5, -10 * SIZE(B) + + movddup %xmm9, %xmm8 + SHUFPD_3 %xmm9, %xmm9 + movddup %xmm13, %xmm12 + SHUFPD_3 %xmm13, %xmm13 + movddup %xmm1, %xmm0 + SHUFPD_3 %xmm1, %xmm1 + movddup %xmm5, %xmm4 + SHUFPD_3 %xmm5, %xmm5 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) + movapd %xmm12, -12 * SIZE(BO) + movapd %xmm13, -10 * SIZE(BO) + movapd %xmm0, -8 * SIZE(BO) + movapd %xmm1, -6 * SIZE(BO) + movapd %xmm4, -4 * SIZE(BO) + movapd %xmm5, -2 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm1, -14 * SIZE(AO) + movapd %xmm2, -12 * SIZE(AO) + movapd %xmm3, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L51 + ALIGN_4 + +.L79: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L80: + testq $1, N + je .L999 + ALIGN_4 + +.L81: +/* Copying to Sub Buffer */ + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + leaq (, %rax, SIZE), %rax + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + jle .L83 + ALIGN_4 + +.L82: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + movddup -14 * SIZE(B), %xmm2 + movddup -13 * SIZE(B), %xmm3 + movddup -12 * SIZE(B), %xmm4 + movddup -11 * SIZE(B), %xmm5 + movddup -10 * SIZE(B), %xmm6 + movddup -9 * SIZE(B), %xmm7 + + movapd %xmm0, 0 * SIZE(BO) + movapd %xmm1, 2 * SIZE(BO) + movapd %xmm2, 4 * SIZE(BO) + movapd %xmm3, 6 * SIZE(BO) + movapd %xmm4, 8 * SIZE(BO) + movapd %xmm5, 10 * SIZE(BO) + movapd %xmm6, 12 * SIZE(BO) + movapd %xmm7, 14 * SIZE(BO) + + addq $ 8 * SIZE, B + subq $-16 * SIZE, BO + subq $1, %rax + jne .L82 + ALIGN_4 + +.L83: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax + BRANCH + jle .L90 + ALIGN_4 + +.L84: + movddup -16 * SIZE(B), %xmm0 + + movapd %xmm0, 0 * SIZE(BO) + + addq $1 * SIZE, B + addq $2 * SIZE, BO + subq $1, %rax + jne .L84 + ALIGN_4 + +.L90: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + subq LDC, C +#endif + + movq C, CO1 # coffset1 = c +#ifndef RT + addq LDC, C +#endif + + testq $1, M + je .L100 + ALIGN_4 + +.L111: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor 
%xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L115 + ALIGN_4 + +.L112: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movsd -16 * SIZE(AO), %xmm0 + movsd -15 * SIZE(AO), %xmm1 + movsd -16 * SIZE(BO), %xmm2 + movsd -14 * SIZE(BO), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm1, %xmm3 + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + + movsd -14 * SIZE(AO), %xmm0 + movsd -13 * SIZE(AO), %xmm1 + movsd -12 * SIZE(BO), %xmm2 + movsd -10 * SIZE(BO), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm1, %xmm3 + addsd %xmm2, %xmm10 + addsd %xmm3, %xmm11 + + subq $-4 * SIZE, AO + subq $-8 * SIZE, BO + subq $1, %rax + jne .L112 + ALIGN_4 + +.L115: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + movsd -16 * SIZE(AO), %xmm0 + movsd -16 * SIZE(BO), %xmm2 + + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + + addq $1 * SIZE, AO + addq $2 * SIZE, BO + subq $1, %rax + jg .L116 + ALIGN_4 + +.L118: + addsd %xmm10, %xmm8 + addsd %xmm11, %xmm9 + addsd %xmm9, %xmm8 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(B), %xmm10 + subsd %xmm8, %xmm10 +#else + movsd -16 * SIZE(AO), %xmm10 + subsd %xmm8, %xmm10 +#endif + +#ifdef LN + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 +#endif + +#ifdef LT + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 +#endif + +#ifdef RN + movsd -16 * SIZE(B), %xmm8 + mulsd %xmm8, %xmm10 +#endif + +#ifdef RT + movsd -16 * SIZE(B), %xmm8 + mulsd %xmm8, %xmm10 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm10, 0 * SIZE(CO1) +#else + movsd %xmm10, 0 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movsd %xmm10, -16 * SIZE(B) + + movlpd %xmm10, -16 * SIZE(BO) + movlpd %xmm10, -15 * SIZE(BO) +#else + movsd %xmm10, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $1 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L100: + testq $2, M + je .L110 + ALIGN_4 + +.L101: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L105 + ALIGN_4 + +.L102: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + 
+ movapd -12 * SIZE(AO), %xmm0 + movapd -10 * SIZE(AO), %xmm1 + movapd -12 * SIZE(BO), %xmm2 + movapd -10 * SIZE(BO), %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm11 + + subq $-8 * SIZE, AO + subq $-8 * SIZE, BO + subq $1, %rax + jne .L102 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L109 + ALIGN_4 + +.L106: + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm8 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + subq $1, %rax + jg .L106 + ALIGN_4 + +.L109: + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 + addpd %xmm10, %xmm8 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm10 + subpd %xmm8, %xmm10 +#else + movapd -16 * SIZE(AO), %xmm10 + subpd %xmm8, %xmm10 +#endif + +#ifdef LN + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movsd -13 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + movsd -14 * SIZE(AO), %xmm13 + mulsd %xmm8, %xmm13 + subsd %xmm13, %xmm10 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + unpcklpd %xmm8, %xmm10 +#endif + +#ifdef LT + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + movsd -15 * SIZE(AO), %xmm13 + mulsd %xmm10, %xmm13 + subsd %xmm13, %xmm8 + + movsd -13 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + unpcklpd %xmm8, %xmm10 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm10 +#endif + +#ifdef RT + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm10 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) +#else + movsd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm10, -16 * SIZE(B) + + movddup %xmm10, %xmm8 + SHUFPD_3 %xmm10, %xmm10 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm10, -14 * SIZE(BO) +#else + movapd %xmm10, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $2 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L110: + movq M, I + sarq $2, I # i = (m >> 2) + jle .L119 + ALIGN_4 + +.L91: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 +#ifdef LN + prefetcht2 -3 * SIZE(CO1) +#else + prefetcht2 3 * SIZE(CO1) +#endif + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L95 + ALIGN_4 + +.L92: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 
+ movapd -14 * SIZE(AO), %xmm1 + movapd -16 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + + movapd -12 * SIZE(AO), %xmm0 + movapd -10 * SIZE(AO), %xmm1 + movapd -14 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm11 + + movapd -8 * SIZE(AO), %xmm0 + movapd -6 * SIZE(AO), %xmm1 + movapd -12 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + + movapd -4 * SIZE(AO), %xmm0 + movapd -2 * SIZE(AO), %xmm1 + movapd -10 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm11 + + subq $-16 * SIZE, AO + subq $ -8 * SIZE, BO + subq $1, %rax + jne .L92 + ALIGN_4 + +.L95: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L99 + ALIGN_4 + +.L96: + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + + movapd -16 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + subq $1, %rax + jg .L96 + ALIGN_4 + +.L99: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm10 + movapd -14 * SIZE(B), %xmm11 + + subpd %xmm8, %xmm10 + subpd %xmm9, %xmm11 +#else + movapd -16 * SIZE(AO), %xmm10 + movapd -14 * SIZE(AO), %xmm11 + + subpd %xmm8, %xmm10 + subpd %xmm9, %xmm11 +#endif + +#ifdef LN + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movapd %xmm11, %xmm9 + unpckhpd %xmm9, %xmm9 + + movsd -1 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm9 + + movsd -2 * SIZE(AO), %xmm13 + mulsd %xmm9, %xmm13 + subsd %xmm13, %xmm11 + movsd -3 * SIZE(AO), %xmm14 + mulsd %xmm9, %xmm14 + subsd %xmm14, %xmm8 + movsd -4 * SIZE(AO), %xmm15 + mulsd %xmm9, %xmm15 + subsd %xmm15, %xmm10 + + movsd -6 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm11 + + movsd -7 * SIZE(AO), %xmm13 + mulsd %xmm11, %xmm13 + subsd %xmm13, %xmm8 + movsd -8 * SIZE(AO), %xmm14 + mulsd %xmm11, %xmm14 + subsd %xmm14, %xmm10 + + movsd -11 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + movsd -12 * SIZE(AO), %xmm13 + mulsd %xmm8, %xmm13 + subsd %xmm13, %xmm10 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + unpcklpd %xmm8, %xmm10 + unpcklpd %xmm9, %xmm11 +#endif + +#ifdef LT + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movapd %xmm11, %xmm9 + unpckhpd %xmm9, %xmm9 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + movsd -15 * SIZE(AO), %xmm13 + mulsd %xmm10, %xmm13 + subsd %xmm13, %xmm8 + movsd -14 * SIZE(AO), %xmm14 + mulsd %xmm10, %xmm14 + subsd %xmm14, %xmm11 + movsd -13 * SIZE(AO), %xmm15 + mulsd %xmm10, %xmm15 + subsd %xmm15, %xmm9 + + movsd -11 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + movsd -10 * SIZE(AO), %xmm13 + mulsd %xmm8, %xmm13 + subsd %xmm13, %xmm11 + movsd -9 * SIZE(AO), %xmm14 + mulsd %xmm8, %xmm14 + subsd %xmm14, %xmm9 + + movsd -6 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm11 + + movsd -5 * SIZE(AO), %xmm13 + mulsd %xmm11, %xmm13 + subsd %xmm13, %xmm9 + + movsd -1 * SIZE(AO), %xmm12 + mulsd %xmm12, 
%xmm9 + + unpcklpd %xmm8, %xmm10 + unpcklpd %xmm9, %xmm11 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm10 + mulpd %xmm8, %xmm11 +#endif + +#ifdef RT + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm10 + mulpd %xmm8, %xmm11 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) + movsd %xmm11, 2 * SIZE(CO1) + movhpd %xmm11, 3 * SIZE(CO1) +#else + movsd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) + movsd %xmm11, 2 * SIZE(CO1) + movhpd %xmm11, 3 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm10, -16 * SIZE(B) + movapd %xmm11, -14 * SIZE(B) + + movddup %xmm10, %xmm8 + SHUFPD_3 %xmm10, %xmm10 + movddup %xmm11, %xmm9 + SHUFPD_3 %xmm11, %xmm11 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm10, -14 * SIZE(BO) + movapd %xmm9, -12 * SIZE(BO) + movapd %xmm11, -10 * SIZE(BO) +#else + movapd %xmm10, -16 * SIZE(AO) + movapd %xmm11, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L91 + ALIGN_4 + +.L119: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 1), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 1), B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + + +.L999: + movq %r15, %rsp + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/trsm_kernel_LN_4x4_penryn.S b/kernel/x86_64/trsm_kernel_LN_4x4_penryn.S new file mode 100644 index 0000000000..09f91220ac --- /dev/null +++ b/kernel/x86_64/trsm_kernel_LN_4x4_penryn.S @@ -0,0 +1,3425 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define KK %rdx +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define J 56(%rsp) +#define KKK 64(%rsp) +#define AORIG 72(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define J 232(%rsp) +#define KKK 240(%rsp) +#define AORIG 248(%rsp) + +#endif + +#define PREFETCH_R (8 * 4 + 0) +#define PREFETCHSIZE (8 * 21 + 6) +#define PREFETCH prefetcht0 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C +#endif + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + leaq (, LDC, SIZE), LDC + + movq KK, OFFSET + negq KK + +#ifdef LN + leaq (, M, SIZE), %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + leaq (, N, SIZE), %rax + imulq K, %rax + addq %rax, B + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + movq N, J + sarq $2, J + NOBRANCH + jle .L40 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 4), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + movq K, %rax + salq $BASE_SHIFT + 2, %rax + leaq (B, %rax), BB + +#ifdef LT + movq OFFSET, %rax + 
movq %rax, KK +#endif + + testq $1, M + BRANCH + jle .L20 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + movsd -16 * SIZE(AO), %xmm0 + movaps -16 * SIZE(BO), %xmm2 + movaps -14 * SIZE(BO), %xmm3 + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movsd -15 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm8 + movaps -12 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm9 + movaps -10 * SIZE(BO), %xmm3 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movsd -14 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm10 + movaps -8 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm11 + movaps -6 * SIZE(BO), %xmm3 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movsd -13 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm8 + movaps -4 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm9 + movaps -2 * SIZE(BO), %xmm3 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movsd -12 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm10 + movaps 0 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm11 + movaps 2 * SIZE(BO), %xmm3 + + subq $ -4 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + BRANCH + jg .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movsd -15 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm8 + movaps -12 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm9 + movaps -10 * SIZE(BO), %xmm3 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#endif + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#if defined(LN) || defined(LT) + movaps -16 * SIZE(BO), %xmm12 + movaps -14 * SIZE(BO), %xmm13 +#else + movaps -16 * SIZE(AO), %xmm12 + movaps -14 * SIZE(AO), %xmm13 +#endif + + subpd %xmm8, %xmm12 + subpd %xmm9, %xmm13 + +#if defined(RN) || defined(RT) + movhlps %xmm13, %xmm15 + movsd %xmm13, %xmm14 + movhlps %xmm12, %xmm13 + movsd %xmm12, %xmm12 +#endif + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm8 + + mulpd %xmm8, %xmm12 + mulpd %xmm8, %xmm13 +#endif + +#ifdef RN + mulsd -16 * SIZE(BO), %xmm12 + movlpd -15 * SIZE(BO), %xmm9 + mulsd %xmm12, %xmm9 + subsd %xmm9, %xmm13 + movlpd -14 * SIZE(BO), %xmm10 + mulsd %xmm12, %xmm10 + subsd %xmm10, %xmm14 + movlpd -13 * SIZE(BO), %xmm11 + mulsd %xmm12, %xmm11 + subsd %xmm11, %xmm15 + + mulsd -11 * SIZE(BO), %xmm13 + movlpd -10 * SIZE(BO), %xmm9 + mulsd %xmm13, %xmm9 + subsd %xmm9, %xmm14 + movlpd -9 * SIZE(BO), %xmm10 + mulsd %xmm13, %xmm10 + subsd %xmm10, %xmm15 + + mulsd -6 * SIZE(BO), %xmm14 + movlpd -5 * SIZE(BO), %xmm9 + mulsd %xmm14, %xmm9 + subsd %xmm9, %xmm15 + + mulsd -1 * SIZE(BO), %xmm15 +#endif + +#ifdef RT + mulsd -1 * SIZE(BO), %xmm15 + + movlpd -2 * SIZE(BO), %xmm9 + mulsd %xmm15, %xmm9 + subsd %xmm9, 
%xmm14 + movlpd -3 * SIZE(BO), %xmm10 + mulsd %xmm15, %xmm10 + subsd %xmm10, %xmm13 + movlpd -4 * SIZE(BO), %xmm11 + mulsd %xmm15, %xmm11 + subsd %xmm11, %xmm12 + + mulsd -6 * SIZE(BO), %xmm14 + + movlpd -7 * SIZE(BO), %xmm9 + mulsd %xmm14, %xmm9 + subsd %xmm9, %xmm13 + movlpd -8 * SIZE(BO), %xmm10 + mulsd %xmm14, %xmm10 + subsd %xmm10, %xmm12 + + mulsd -11 * SIZE(BO), %xmm13 + + movlpd -12 * SIZE(BO), %xmm9 + mulsd %xmm13, %xmm9 + subsd %xmm9, %xmm12 + + mulsd -16 * SIZE(BO), %xmm12 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm12, 0 * SIZE(CO1) + movhps %xmm12, 0 * SIZE(CO2) + movsd %xmm13, 0 * SIZE(CO1, LDC, 2) + movhps %xmm13, 0 * SIZE(CO2, LDC, 2) + + movaps %xmm12, -16 * SIZE(BO) + movaps %xmm13, -14 * SIZE(BO) +#else + movsd %xmm12, 0 * SIZE(CO1) + movsd %xmm13, 0 * SIZE(CO2) + movsd %xmm14, 0 * SIZE(CO1, LDC, 2) + movsd %xmm15, 0 * SIZE(CO2, LDC, 2) + + movsd %xmm12, -16 * SIZE(AO) + movsd %xmm13, -15 * SIZE(AO) + movsd %xmm14, -14 * SIZE(AO) + movsd %xmm15, -13 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L20: + testq $2, M + BRANCH + jle .L30 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + pxor %xmm3, %xmm3 + movaps -16 * SIZE(BO), %xmm2 + pxor %xmm5, %xmm5 + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_4 + +.L22: + addpd %xmm3, %xmm11 + movaps -14 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd %xmm0, %xmm2 + addpd %xmm5, %xmm10 + mulpd %xmm0, %xmm7 + + addpd %xmm2, %xmm9 + movaps -12 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + addpd %xmm7, %xmm8 + mulpd %xmm0, %xmm5 + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm3, %xmm11 + movaps -10 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + addpd %xmm5, %xmm10 + mulpd %xmm0, %xmm7 + + addpd %xmm2, %xmm9 + movaps -8 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + addpd %xmm7, %xmm8 + mulpd %xmm0, %xmm5 + movaps -12 * SIZE(AO), %xmm0 + + addpd %xmm3, %xmm11 + movaps -6 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + addpd %xmm5, %xmm10 + mulpd %xmm0, %xmm7 + + addpd %xmm2, %xmm9 + movaps -4 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + addpd %xmm7, %xmm8 + mulpd %xmm0, %xmm5 + movaps -10 * SIZE(AO), %xmm0 + + addpd %xmm3, %xmm11 + movaps -2 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + addpd %xmm5, %xmm10 + mulpd %xmm0, %xmm7 + subq $ -8 * SIZE, AO + + addpd %xmm2, %xmm9 + movaps 0 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + addpd %xmm7, %xmm8 + mulpd %xmm0, %xmm5 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, BO + subq $1, %rax + BRANCH + jg .L22 + ALIGN_4 + +.L25: +#if 
defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + addpd %xmm3, %xmm11 + movaps -14 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + addpd %xmm5, %xmm10 + mulpd %xmm0, %xmm7 + + addpd %xmm2, %xmm9 + movaps -12 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + addpd %xmm7, %xmm8 + mulpd %xmm0, %xmm5 + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_4 + +.L28: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + + addpd %xmm3, %xmm11 + addpd %xmm5, %xmm10 + + movapd %xmm8, %xmm0 + movsd %xmm9, %xmm8 + movsd %xmm0, %xmm9 + + movapd %xmm10, %xmm0 + movsd %xmm11, %xmm10 + movsd %xmm0, %xmm11 + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd %xmm10, %xmm2 + unpcklpd %xmm11, %xmm10 + unpckhpd %xmm11, %xmm2 + + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm11 + movapd -12 * SIZE(BO), %xmm13 + movapd -10 * SIZE(BO), %xmm15 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm0, %xmm13 + subpd %xmm2, %xmm15 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm2 + movapd -12 * SIZE(AO), %xmm4 + movapd -10 * SIZE(AO), %xmm6 + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm2 + subpd %xmm10, %xmm4 + subpd %xmm11, %xmm6 +#endif + +#ifdef LN + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 + + movddup -14 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + mulpd %xmm15, %xmm12 + subpd %xmm12, %xmm11 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 + + movddup -15 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + mulpd %xmm11, %xmm12 + subpd %xmm12, %xmm15 + + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + movddup -14 * SIZE(BO), %xmm10 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm4 + movddup -13 * SIZE(BO), %xmm11 + mulpd %xmm0, %xmm11 + subpd %xmm11, %xmm6 + + movddup -11 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + movddup -10 * SIZE(BO), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm4 + movddup -9 * SIZE(BO), %xmm10 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm6 + + movddup -6 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm4 + + movddup -5 * SIZE(BO), %xmm9 + mulpd %xmm4, %xmm9 + subpd %xmm9, %xmm6 + + movddup -1 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm6 +#endif + +#ifdef RT + movddup -1 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm6 + + movddup -2 * SIZE(BO), %xmm9 + mulpd %xmm6, %xmm9 + subpd %xmm9, %xmm4 + movddup -3 * SIZE(BO), %xmm10 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm2 + movddup -4 * SIZE(BO), %xmm11 + mulpd %xmm6, %xmm11 + subpd %xmm11, %xmm0 + + movddup -6 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm4 + movddup -7 * SIZE(BO), %xmm9 + mulpd %xmm4, %xmm9 + subpd %xmm9, %xmm2 + movddup -8 * SIZE(BO), %xmm10 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm0 + + movddup -11 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + movddup -12 * SIZE(BO), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, 
%xmm0 + + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm13, 1 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + + movsd %xmm11, 0 * SIZE(CO1, LDC, 2) + movsd %xmm15, 1 * SIZE(CO1, LDC, 2) + + movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + + movsd %xmm4, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) + + movsd %xmm6, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) + movapd %xmm11, -14 * SIZE(BO) + movapd %xmm13, -12 * SIZE(BO) + movapd %xmm15, -10 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm2, -14 * SIZE(AO) + movapd %xmm4, -12 * SIZE(AO) + movapd %xmm6, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + movq M, I + sarq $2, I # i = (m >> 2) + NOBRANCH + jle .L39 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + prefetcht2 -16 * SIZE(BB) + subq $-8 * SIZE, BB + + movaps -16 * SIZE(AO), %xmm0 + pxor %xmm3, %xmm3 + movaps -14 * SIZE(AO), %xmm1 + pxor %xmm4, %xmm4 + movaps -16 * SIZE(BO), %xmm2 + + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + +#ifdef LN + prefetcht0 -4 * SIZE(CO1) + movapd %xmm4, %xmm8 + movapd %xmm4, %xmm9 + prefetcht0 -4 * SIZE(CO2) + movapd %xmm4, %xmm10 + movapd %xmm4, %xmm11 + + prefetcht0 -4 * SIZE(CO1, LDC, 2) + movapd %xmm4, %xmm12 + movapd %xmm4, %xmm13 + prefetcht0 -4 * SIZE(CO2, LDC, 2) + movapd %xmm4, %xmm14 + movapd %xmm4, %xmm15 +#else + prefetcht0 3 * SIZE(CO1) + movapd %xmm4, %xmm8 + movapd %xmm4, %xmm9 + prefetcht0 3 * SIZE(CO2) + movapd %xmm4, %xmm10 + movapd %xmm4, %xmm11 + + prefetcht0 3 * SIZE(CO1, LDC, 2) + movapd %xmm4, %xmm12 + movapd %xmm4, %xmm13 + prefetcht0 3 * SIZE(CO2, LDC, 2) + movapd %xmm4, %xmm14 + movapd %xmm4, %xmm15 +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + addpd %xmm3, %xmm11 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movaps -14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps -12 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm11 + movaps -10 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + 
pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps -8 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -6 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm11 + movaps -6 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps -4 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -2 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm11 + movaps -2 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps 0 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps 0 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 2 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm11 + movaps 2 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps 4 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps 4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 6 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm11 + movaps 6 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps 8 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps 8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 10 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm11 + movaps 10 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + PREFETCH (PREFETCHSIZE + 24) * SIZE(AO) + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps 12 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd 
%xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps 12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 14 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm11 + movaps 14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps 16 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + subq $-32 * SIZE, AO + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -16 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -14 * SIZE(AO), %xmm1 + + subq $-32 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + addpd %xmm3, %xmm11 + movaps -14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps -12 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_3 + +.L18: +#if defined(LN) || defined(RT) + movq KK, %rax + subq $4, %rax + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#endif + + addpd %xmm3, %xmm11 + addpd %xmm4, %xmm15 + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + + movapd %xmm8, %xmm0 + movsd %xmm9, %xmm8 + movsd %xmm0, %xmm9 + + movapd %xmm10, %xmm0 + movsd %xmm11, %xmm10 + movsd %xmm0, %xmm11 + + movapd %xmm12, %xmm0 + movsd %xmm13, %xmm12 + movsd %xmm0, %xmm13 + + movapd %xmm14, %xmm0 + movsd %xmm15, %xmm14 + movsd %xmm0, %xmm15 + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd %xmm10, %xmm2 + unpcklpd %xmm11, %xmm10 + unpckhpd %xmm11, %xmm2 + + movapd %xmm12, %xmm4 + unpcklpd %xmm13, %xmm12 + unpckhpd %xmm13, %xmm4 + + movapd %xmm14, %xmm6 + unpcklpd %xmm15, %xmm14 + unpckhpd %xmm15, %xmm6 + + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm11 + movapd -12 * SIZE(BO), %xmm13 + movapd -10 * SIZE(BO), %xmm15 + movapd -8 * SIZE(BO), %xmm1 + movapd -6 * SIZE(BO), %xmm3 + movapd -4 * SIZE(BO), %xmm5 + movapd -2 * SIZE(BO), %xmm7 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm0, %xmm13 + subpd %xmm2, %xmm15 + subpd %xmm12, %xmm1 + subpd %xmm14, %xmm3 + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -12 * SIZE(AO), %xmm2 + movapd -10 * SIZE(AO), %xmm3 + + movapd -8 * SIZE(AO), %xmm4 + movapd -6 * SIZE(AO), %xmm5 + movapd -4 * SIZE(AO), %xmm6 + movapd -2 * SIZE(AO), %xmm7 + + subpd %xmm8, %xmm0 + subpd %xmm12, %xmm1 + subpd %xmm9, %xmm2 + subpd %xmm13, %xmm3 + subpd %xmm10, %xmm4 + subpd %xmm14, %xmm5 + subpd %xmm11, %xmm6 + subpd %xmm15, %xmm7 +#endif + +#ifdef LN + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 
+ mulpd %xmm8, %xmm7 + + movddup -2 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm1 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm3 + + movddup -3 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm13 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm15 + + movddup -4 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm9 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm11 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 + + movddup -7 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm13 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm15 + + movddup -8 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm9 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm11 + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 + + movddup -12 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + mulpd %xmm15, %xmm12 + subpd %xmm12, %xmm11 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 + + movddup -15 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + mulpd %xmm11, %xmm12 + subpd %xmm12, %xmm15 + + movddup -14 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm1 + mulpd %xmm11, %xmm12 + subpd %xmm12, %xmm3 + + movddup -13 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm5 + mulpd %xmm11, %xmm12 + subpd %xmm12, %xmm7 + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 + + movddup -10 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm1 + mulpd %xmm15, %xmm12 + subpd %xmm12, %xmm3 + + movddup -9 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm5 + mulpd %xmm15, %xmm12 + subpd %xmm12, %xmm7 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 + + movddup -5 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm5 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm7 + + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 + mulpd %xmm8, %xmm7 +#endif + + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 + + movddup -15 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm2 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm3 + + movddup -14 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm4 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm5 + + movddup -13 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm6 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm7 + + movddup -11 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 + + movddup -10 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm4 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm5 + + movddup -9 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm6 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm7 + + movddup -6 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm4 + mulpd %xmm8, %xmm5 + + movddup -5 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm6 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm7 + + movddup -1 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm6 + mulpd %xmm8, %xmm7 
+#endif + +#ifdef RT + movddup -1 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm6 + mulpd %xmm8, %xmm7 + + movddup -2 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm4 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm5 + + movddup -3 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm2 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm3 + + movddup -4 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm0 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm1 + + movddup -6 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm4 + mulpd %xmm8, %xmm5 + + movddup -7 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm2 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm3 + + movddup -8 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm0 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm1 + + movddup -11 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 + + movddup -12 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm0 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm1 + + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm13, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movsd %xmm5, 3 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + movhpd %xmm1, 2 * SIZE(CO2) + movhpd %xmm5, 3 * SIZE(CO2) + + movsd %xmm11, 0 * SIZE(CO1, LDC, 2) + movsd %xmm15, 1 * SIZE(CO1, LDC, 2) + movsd %xmm3, 2 * SIZE(CO1, LDC, 2) + movsd %xmm7, 3 * SIZE(CO1, LDC, 2) + + movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) + movhpd %xmm3, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + movsd %xmm3, 2 * SIZE(CO2) + movhpd %xmm3, 3 * SIZE(CO2) + + movsd %xmm4, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) + movsd %xmm5, 2 * SIZE(CO1, LDC, 2) + movhpd %xmm5, 3 * SIZE(CO1, LDC, 2) + + movsd %xmm6, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) + movsd %xmm7, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) + movapd %xmm11, -14 * SIZE(BO) + movapd %xmm13, -12 * SIZE(BO) + movapd %xmm15, -10 * SIZE(BO) + movapd %xmm1, -8 * SIZE(BO) + movapd %xmm3, -6 * SIZE(BO) + movapd %xmm5, -4 * SIZE(BO) + movapd %xmm7, -2 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm1, -14 * SIZE(AO) + movapd %xmm2, -12 * SIZE(AO) + movapd %xmm3, -10 * SIZE(AO) + movapd %xmm4, -8 * SIZE(AO) + movapd %xmm5, -6 * SIZE(AO) + movapd %xmm6, -4 * SIZE(AO) + movapd %xmm7, -2 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + BRANCH + jg .L11 + ALIGN_4 + +.L39: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, 
KK +#endif + + subq $1, J + BRANCH + jg .L01 + ALIGN_4 + +.L40: + testq $2, N + BRANCH + jle .L80 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + movq K, %rax + salq $BASE_SHIFT + 1, %rax + leaq (B, %rax), BB + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, M + BRANCH + jle .L60 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movsd -16 * SIZE(AO), %xmm0 + movaps -16 * SIZE(BO), %xmm2 + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L75 + ALIGN_4 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + movsd -15 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm8 + movaps -14 * SIZE(BO), %xmm2 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + movsd -14 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm9 + movaps -12 * SIZE(BO), %xmm2 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + movsd -13 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm8 + movaps -10 * SIZE(BO), %xmm2 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + movsd -12 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm9 + movaps -8 * SIZE(BO), %xmm2 + + subq $-4 * SIZE, AO + subq $-8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + movsd -15 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm8 + movaps -14 * SIZE(BO), %xmm2 + + addq $1 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L76 + ALIGN_4 + +.L78: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + + addpd %xmm9, %xmm8 + movhlps %xmm8, %xmm9 + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(BO), %xmm12 + movsd -15 * SIZE(BO), %xmm13 +#else + movsd -16 * SIZE(AO), %xmm12 + movsd -15 * SIZE(AO), %xmm13 +#endif + + subsd %xmm8, %xmm12 + subsd %xmm9, %xmm13 + +#ifdef LN + movsd -16 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm12 + mulsd %xmm8, %xmm13 +#endif + +#ifdef LT + movsd -16 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm12 + mulsd %xmm8, %xmm13 +#endif + +#ifdef RN + mulsd -16 * SIZE(BO), %xmm12 + movsd -15 * SIZE(BO), %xmm9 + mulsd %xmm12, %xmm9 + subsd %xmm9, %xmm13 + + mulsd -13 * SIZE(BO), %xmm13 +#endif + +#ifdef RT + mulsd -13 * SIZE(BO), %xmm13 + + movlpd -14 * SIZE(BO), %xmm9 + mulsd %xmm13, %xmm9 + subsd %xmm9, %xmm12 + + mulsd -16 * SIZE(BO), %xmm12 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + movsd %xmm12, 0 * SIZE(CO1) + movsd %xmm13, 0 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movsd %xmm12, -16 * SIZE(BO) + movsd %xmm13, -15 * SIZE(BO) +#else + movsd %xmm12, -16 * SIZE(AO) + movsd %xmm13, -15 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 
+#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L60: + testq $2, M + BRANCH + jle .L70 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + movaps -16 * SIZE(BO), %xmm2 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L65 + ALIGN_4 + +.L62: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm9 + addpd %xmm7, %xmm8 + movaps -14 * SIZE(BO), %xmm2 + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -12 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm11 + addpd %xmm7, %xmm10 + movaps -12 * SIZE(BO), %xmm2 + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -10 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm9 + addpd %xmm7, %xmm8 + movaps -10 * SIZE(BO), %xmm2 + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -8 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm11 + addpd %xmm7, %xmm10 + movaps -8 * SIZE(BO), %xmm2 + + subq $-8 * SIZE, AO + subq $-8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L62 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm9 + addpd %xmm7, %xmm8 + movaps -14 * SIZE(BO), %xmm2 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L66 + ALIGN_4 + +.L68: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + + movapd %xmm8, %xmm0 + movsd %xmm9, %xmm8 + movsd %xmm0, %xmm9 + + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm13 + + subpd %xmm8, %xmm9 + subpd %xmm0, %xmm13 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm2 + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm2 +#endif + +#ifdef LN + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + + movddup -14 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + + movddup -13 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 +#endif + +#ifdef RT + movddup -13 * SIZE(BO), 
%xmm8 + mulpd %xmm8, %xmm2 + + movddup -14 * SIZE(BO), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm13, 1 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) + movapd %xmm13, -14 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm2, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L70: + movq M, I + sarq $2, I # i = (m >> 2) + NOBRANCH + jle .L79 + ALIGN_4 + +.L51: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + prefetcht2 -16 * SIZE(BB) + subq $-4 * SIZE, BB + + + movaps -16 * SIZE(AO), %xmm0 + movaps -14 * SIZE(AO), %xmm1 + movaps -16 * SIZE(BO), %xmm2 + +#ifdef LN + prefetcht0 -4 * SIZE(CO1) + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + prefetcht0 -4 * SIZE(CO2) + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 +#else + prefetcht0 3 * SIZE(CO1) + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + prefetcht0 3 * SIZE(CO2) + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L55 + ALIGN_4 + +.L52: + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm9 + movaps -14 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -6 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm9 + movaps -12 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -2 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm9 + movaps -10 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps 0 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 2 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm9 + movaps -8 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + + subq $-16 * SIZE, AO + subq $ -8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L52 + 
ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm9 + movaps -14 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L56 + ALIGN_4 + +.L58: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + + movapd %xmm8, %xmm0 + movsd %xmm9, %xmm8 + movsd %xmm0, %xmm9 + + movapd %xmm12, %xmm0 + movsd %xmm13, %xmm12 + movsd %xmm0, %xmm13 + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd %xmm12, %xmm4 + unpcklpd %xmm13, %xmm12 + unpckhpd %xmm13, %xmm4 + + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm13 + movapd -12 * SIZE(BO), %xmm1 + movapd -10 * SIZE(BO), %xmm5 + + subpd %xmm8, %xmm9 + subpd %xmm0, %xmm13 + subpd %xmm12, %xmm1 + subpd %xmm4, %xmm5 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -12 * SIZE(AO), %xmm2 + movapd -10 * SIZE(AO), %xmm3 + + subpd %xmm8, %xmm0 + subpd %xmm12, %xmm1 + subpd %xmm9, %xmm2 + subpd %xmm13, %xmm3 +#endif + +#ifdef LN + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 + movddup -2 * SIZE(AO), %xmm10 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm1 + movddup -3 * SIZE(AO), %xmm12 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm13 + movddup -4 * SIZE(AO), %xmm14 + mulpd %xmm5, %xmm14 + subpd %xmm14, %xmm9 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + movddup -7 * SIZE(AO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm13 + movddup -8 * SIZE(AO), %xmm12 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm9 + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + movddup -12 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + movddup -14 * SIZE(AO), %xmm12 + mulpd %xmm9, %xmm12 + subpd %xmm12, %xmm1 + movddup -13 * SIZE(AO), %xmm14 + mulpd %xmm9, %xmm14 + subpd %xmm14, %xmm5 + + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + + movddup -10 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm1 + movddup -9 * SIZE(AO), %xmm12 + mulpd %xmm13, %xmm12 + subpd %xmm12, %xmm5 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + movddup -5 * SIZE(AO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm5 + + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 + + movddup -15 * SIZE(BO), %xmm9 + movapd %xmm9, %xmm10 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm3 + + movddup -13 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 +#endif + +#ifdef RT + movddup -13 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 + + movddup -14 * SIZE(BO), %xmm9 + movapd %xmm9, %xmm10 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + mulpd %xmm3, %xmm10 + subpd %xmm10, %xmm1 + + movddup -16 * 
SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm13, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movsd %xmm5, 3 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + movhpd %xmm1, 2 * SIZE(CO2) + movhpd %xmm5, 3 * SIZE(CO2) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + movsd %xmm3, 2 * SIZE(CO2) + movhpd %xmm3, 3 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) + movapd %xmm13, -14 * SIZE(BO) + movapd %xmm1, -12 * SIZE(BO) + movapd %xmm5, -10 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm1, -14 * SIZE(AO) + movapd %xmm2, -12 * SIZE(AO) + movapd %xmm3, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L51 + ALIGN_4 + +.L79: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L80: + testq $1, N + BRANCH + jle .L999 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + + movq C, CO1 +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, M + BRANCH + jle .L90 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movsd -16 * SIZE(AO), %xmm0 + movsd -16 * SIZE(BO), %xmm2 + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L115 + ALIGN_4 + +.L112: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + movsd -15 * SIZE(AO), %xmm0 + movsd -15 * SIZE(BO), %xmm2 + + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + movsd -14 * SIZE(AO), %xmm0 + movsd -14 * SIZE(BO), %xmm2 + + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + movsd -13 * SIZE(AO), %xmm0 + movsd -13 * SIZE(BO), %xmm2 + + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + movsd -12 * SIZE(AO), %xmm0 + movsd -12 * SIZE(BO), %xmm2 + + subq $-4 * SIZE, AO + subq $-4 * SIZE, BO + subq $1, %rax + BRANCH + jg .L112 + ALIGN_4 + +.L115: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + movsd -15 * SIZE(AO), %xmm0 + movsd -15 * SIZE(BO), %xmm2 + + addq $1 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L116 + ALIGN_4 + +.L118: +#if defined(LN) || defined(RT) + movq KK, %rax + subq $1, %rax + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO 
+ leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + + addpd %xmm9, %xmm8 + + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(BO), %xmm10 + subsd %xmm8, %xmm10 +#else + movsd -16 * SIZE(AO), %xmm10 + subsd %xmm8, %xmm10 +#endif + +#ifdef LN + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 +#endif + +#ifdef LT + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 +#endif + +#ifdef RN + movsd -16 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm10 +#endif + +#ifdef RT + movsd -16 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm10 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm10, 0 * SIZE(CO1) +#else + movsd %xmm10, 0 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movsd %xmm10, -16 * SIZE(BO) +#else + movsd %xmm10, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L90: + testq $2, M + BRANCH + jle .L110 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movsd -16 * SIZE(BO), %xmm2 + pxor %xmm9, %xmm9 + movhps -15 * SIZE(BO), %xmm2 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L105 + ALIGN_4 + +.L102: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x44, %xmm2, %xmm3 + movsd -15 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -14 * SIZE(AO), %xmm0 + addpd %xmm3, %xmm8 + + pshufd $0x44, %xmm2, %xmm3 + movsd -14 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -12 * SIZE(AO), %xmm0 + addpd %xmm3, %xmm9 + + pshufd $0x44, %xmm2, %xmm3 + movsd -13 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -10 * SIZE(AO), %xmm0 + addpd %xmm3, %xmm8 + + pshufd $0x44, %xmm2, %xmm3 + movsd -12 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -8 * SIZE(AO), %xmm0 + addpd %xmm3, %xmm9 + + subq $-8 * SIZE, AO + subq $-4 * SIZE, BO + subq $1, %rax + BRANCH + jg .L102 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L108 + ALIGN_4 + +.L106: + pshufd $0x44, %xmm2, %xmm3 + movsd -15 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -14 * SIZE(AO), %xmm0 + addpd %xmm3, %xmm8 + + addq $2 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L106 + ALIGN_4 + +.L108: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + + addpd %xmm9, %xmm8 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm10 + subpd %xmm8, %xmm10 +#else + movapd -16 * SIZE(AO), %xmm10 + subpd %xmm8, %xmm10 +#endif + +#ifdef LN + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movsd -13 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + movsd -14 * SIZE(AO), %xmm13 + mulsd %xmm8, %xmm13 + subsd %xmm13, %xmm10 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + unpcklpd %xmm8, %xmm10 +#endif + +#ifdef LT + movapd %xmm10, %xmm8 + 
unpckhpd %xmm8, %xmm8 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + movsd -15 * SIZE(AO), %xmm13 + mulsd %xmm10, %xmm13 + subsd %xmm13, %xmm8 + + movsd -13 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + unpcklpd %xmm8, %xmm10 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm10 +#endif + +#ifdef RT + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm10 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) +#else + movsd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm10, -16 * SIZE(BO) +#else + movapd %xmm10, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L110: + movq M, I + sarq $2, I # i = (m >> 2) + NOBRANCH + jle .L119 + ALIGN_4 + +.L91: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + movaps -14 * SIZE(AO), %xmm1 + movsd -16 * SIZE(BO), %xmm2 + +#ifdef LN + prefetcht0 -4 * SIZE(CO1) +#else + prefetcht0 3 * SIZE(CO1) +#endif + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L95 + ALIGN_4 + +.L92: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x44, %xmm2, %xmm3 + pshufd $0x44, %xmm2, %xmm4 + movsd -15 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm4 + movaps -10 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm8 + addpd %xmm4, %xmm12 + + pshufd $0x44, %xmm2, %xmm3 + pshufd $0x44, %xmm2, %xmm4 + movsd -14 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm4 + movaps -6 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm8 + addpd %xmm4, %xmm12 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + + pshufd $0x44, %xmm2, %xmm3 + pshufd $0x44, %xmm2, %xmm4 + movsd -13 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm4 + movaps -2 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm8 + addpd %xmm4, %xmm12 + + pshufd $0x44, %xmm2, %xmm3 + pshufd $0x44, %xmm2, %xmm4 + movsd -12 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps 0 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm4 + movaps 2 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm8 + addpd %xmm4, %xmm12 + + subq $-16 * SIZE, AO + subq $ -4 * SIZE, BO + subq $1, %rax + BRANCH + jg .L92 + ALIGN_4 + +.L95: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L98 + ALIGN_4 + +.L96: + pshufd $0x44, %xmm2, %xmm3 + pshufd $0x44, %xmm2, %xmm4 + movsd -15 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm4 + movaps -10 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm8 + addpd %xmm4, %xmm12 + + addq $4 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L96 + ALIGN_4 + +.L98: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, 
%rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#endif + + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm10 + movapd -14 * SIZE(BO), %xmm11 + + subpd %xmm8, %xmm10 + subpd %xmm12, %xmm11 +#else + movapd -16 * SIZE(AO), %xmm10 + movapd -14 * SIZE(AO), %xmm11 + + subpd %xmm8, %xmm10 + subpd %xmm12, %xmm11 +#endif + +#ifdef LN + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movapd %xmm11, %xmm9 + unpckhpd %xmm9, %xmm9 + + movsd -1 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm9 + + movsd -2 * SIZE(AO), %xmm13 + mulsd %xmm9, %xmm13 + subsd %xmm13, %xmm11 + movsd -3 * SIZE(AO), %xmm14 + mulsd %xmm9, %xmm14 + subsd %xmm14, %xmm8 + movsd -4 * SIZE(AO), %xmm15 + mulsd %xmm9, %xmm15 + subsd %xmm15, %xmm10 + + movsd -6 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm11 + + movsd -7 * SIZE(AO), %xmm13 + mulsd %xmm11, %xmm13 + subsd %xmm13, %xmm8 + movsd -8 * SIZE(AO), %xmm14 + mulsd %xmm11, %xmm14 + subsd %xmm14, %xmm10 + + movsd -11 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + movsd -12 * SIZE(AO), %xmm13 + mulsd %xmm8, %xmm13 + subsd %xmm13, %xmm10 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + unpcklpd %xmm8, %xmm10 + unpcklpd %xmm9, %xmm11 +#endif + +#ifdef LT + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movapd %xmm11, %xmm9 + unpckhpd %xmm9, %xmm9 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + movsd -15 * SIZE(AO), %xmm13 + mulsd %xmm10, %xmm13 + subsd %xmm13, %xmm8 + movsd -14 * SIZE(AO), %xmm14 + mulsd %xmm10, %xmm14 + subsd %xmm14, %xmm11 + movsd -13 * SIZE(AO), %xmm15 + mulsd %xmm10, %xmm15 + subsd %xmm15, %xmm9 + + movsd -11 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + movsd -10 * SIZE(AO), %xmm13 + mulsd %xmm8, %xmm13 + subsd %xmm13, %xmm11 + movsd -9 * SIZE(AO), %xmm14 + mulsd %xmm8, %xmm14 + subsd %xmm14, %xmm9 + + movsd -6 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm11 + + movsd -5 * SIZE(AO), %xmm13 + mulsd %xmm11, %xmm13 + subsd %xmm13, %xmm9 + + movsd -1 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm9 + + unpcklpd %xmm8, %xmm10 + unpcklpd %xmm9, %xmm11 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm10 + mulpd %xmm8, %xmm11 +#endif + +#ifdef RT + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm10 + mulpd %xmm8, %xmm11 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) + movsd %xmm11, 2 * SIZE(CO1) + movhpd %xmm11, 3 * SIZE(CO1) +#else + movsd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) + movsd %xmm11, 2 * SIZE(CO1) + movhpd %xmm11, 3 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm10, -16 * SIZE(BO) + movapd %xmm11, -14 * SIZE(BO) +#else + movapd %xmm10, -16 * SIZE(AO) + movapd %xmm11, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L91 + ALIGN_4 + +.L119: +#ifdef LN + leaq (B, K, SIZE), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + 
movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/trsm_kernel_LN_4x4_sse2.S b/kernel/x86_64/trsm_kernel_LN_4x4_sse2.S new file mode 100644 index 0000000000..ca0bfbdc55 --- /dev/null +++ b/kernel/x86_64/trsm_kernel_LN_4x4_sse2.S @@ -0,0 +1,4150 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %rdi +#define N %rsi +#define K %rdx +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define J %r12 +#define AO %r13 +#define BO %r14 +#define CO1 %r15 +#define CO2 %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define ALPHA 0(%rsp) +#define OFFSET 16(%rsp) +#define KK 24(%rsp) +#define KKK 32(%rsp) +#define AORIG 40(%rsp) +#define BORIG 48(%rsp) +#define BUFFER 128(%rsp) + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHNTA prefetchnta +#ifndef ALLOC_HUGETLB +#define PREFETCHSIZE (8 * 4 + 4) +#else +#define PREFETCHSIZE (8 * 2 + 4) +#endif +#endif + +#ifdef GENERIC +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHNTA prefetchnta +#define PREFETCHSIZE (8 * 4 + 4) +#endif + +#define KERNEL1(xx) \ + mulpd %xmm8, %xmm9 ;\ + addpd %xmm9, %xmm0 ;\ + movapd 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulpd %xmm8, %xmm11 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\ + addpd %xmm11, %xmm1 ;\ + movapd 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm8, %xmm13 ;\ + mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\ + addpd %xmm13, %xmm2 ;\ + movapd 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm8, %xmm3 ;\ + movapd 8 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 + +#define KERNEL2(xx) \ + mulpd %xmm10, %xmm9 ;\ + addpd %xmm9, %xmm4 ;\ + movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulpd %xmm10, %xmm11 ;\ + addpd %xmm11, %xmm5 ;\ + movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm10, %xmm13 ;\ + mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ + addpd %xmm13, %xmm6 ;\ + movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm10, %xmm7 ;\ + movapd 10 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 + +#define KERNEL3(xx) \ + mulpd %xmm12, %xmm15 ;\ + addpd %xmm15, %xmm0 ;\ + movapd 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulpd %xmm12, %xmm11 ;\ + addpd %xmm11, %xmm1 ;\ + movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm12, %xmm13 ;\ + mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\ + addpd %xmm13, %xmm2 ;\ + movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm12, %xmm3 ;\ + movapd 12 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 + +#define KERNEL4(xx) \ + mulpd %xmm14, %xmm15 ;\ + addpd %xmm15, %xmm4 ;\ + movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulpd %xmm14, %xmm11 ;\ + addpd %xmm11, %xmm5 ;\ + movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm14, %xmm13 ;\ + mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ + addpd %xmm13, %xmm6 ;\ + movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm14, %xmm7 ;\ + movapd 14 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 + +#define KERNEL5(xx) \ + mulpd %xmm8, %xmm9 ;\ + addpd %xmm9, %xmm0 ;\ + movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulpd %xmm8, %xmm11 ;\ + PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ;\ + addpd %xmm11, %xmm1 ;\ + movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm8, %xmm13 ;\ + mulpd 22 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\ + addpd %xmm13, %xmm2 ;\ + 
movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm8, %xmm3 ;\ + movapd 16 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 + +#define KERNEL6(xx) \ + mulpd %xmm10, %xmm9 ;\ + addpd %xmm9, %xmm4 ;\ + movapd 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulpd %xmm10, %xmm11 ;\ + addpd %xmm11, %xmm5 ;\ + movapd 26 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm10, %xmm13 ;\ + mulpd 22 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ + addpd %xmm13, %xmm6 ;\ + movapd 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm10, %xmm7 ;\ + movapd 18 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 + +#define KERNEL7(xx) \ + mulpd %xmm12, %xmm15 ;\ + addpd %xmm15, %xmm0 ;\ + movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulpd %xmm12, %xmm11 ;\ + addpd %xmm11, %xmm1 ;\ + movapd 26 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm12, %xmm13 ;\ + mulpd 30 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\ + addpd %xmm13, %xmm2 ;\ + movapd 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm12, %xmm3 ;\ + movapd 20 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 + +#define KERNEL8(xx) \ + mulpd %xmm14, %xmm15 ;\ + addpd %xmm15, %xmm4 ;\ + movapd 40 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulpd %xmm14, %xmm11 ;\ + addpd %xmm11, %xmm5 ;\ + movapd 34 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm14, %xmm13 ;\ + mulpd 30 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ + addpd %xmm13, %xmm6 ;\ + movapd 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm14, %xmm7 ;\ + movapd 22 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, M + movq ARG2, N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + movsd OLD_OFFSET, %xmm4 + + movaps %xmm3, %xmm0 + +#else + movq OLD_LDC, LDC + movsd OLD_OFFSET, %xmm4 + +#endif + + movq %rsp, %rbx # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movsd %xmm4, OFFSET + movsd %xmm4, KK + + leaq (, LDC, SIZE), LDC + +#ifdef LN + leaq (, M, SIZE), %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + leaq (, N, SIZE), %rax + imulq K, %rax + addq %rax, B + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + movq N, J + sarq $2, J # j = (n >> 2) + jle .L40 + +.L01: +/* Copying to Sub Buffer */ + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + leaq (, %rax, SIZE), %rax + leaq (B, %rax, 4), B + leaq (BO, %rax, 8), BO +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L03 + + addq %rax, %rax + ALIGN_4 + +.L02: + PREFETCHNTA 40 * SIZE(B) + + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm2 + movsd 3 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm4 + 
movsd 5 * SIZE(B), %xmm5 + movsd 6 * SIZE(B), %xmm6 + movsd 7 * SIZE(B), %xmm7 + + addq $16 * SIZE, BO + addq $ 8 * SIZE, B + + movsd %xmm0, -16 * SIZE(BO) + movsd %xmm0, -15 * SIZE(BO) + movsd %xmm1, -14 * SIZE(BO) + movsd %xmm1, -13 * SIZE(BO) + movsd %xmm2, -12 * SIZE(BO) + movsd %xmm2, -11 * SIZE(BO) + movsd %xmm3, -10 * SIZE(BO) + movsd %xmm3, -9 * SIZE(BO) + movsd %xmm4, -8 * SIZE(BO) + movsd %xmm4, -7 * SIZE(BO) + movsd %xmm5, -6 * SIZE(BO) + movsd %xmm5, -5 * SIZE(BO) + movsd %xmm6, -4 * SIZE(BO) + movsd %xmm6, -3 * SIZE(BO) + movsd %xmm7, -2 * SIZE(BO) + movsd %xmm7, -1 * SIZE(BO) + + decq %rax + jne .L02 + ALIGN_4 + +.L03: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L10 + ALIGN_4 + +.L04: + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm2 + movsd 3 * SIZE(B), %xmm3 + + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm0, 1 * SIZE(BO) + movsd %xmm1, 2 * SIZE(BO) + movsd %xmm1, 3 * SIZE(BO) + movsd %xmm2, 4 * SIZE(BO) + movsd %xmm2, 5 * SIZE(BO) + movsd %xmm3, 6 * SIZE(BO) + movsd %xmm3, 7 * SIZE(BO) + + addq $4 * SIZE, B + addq $8 * SIZE, BO + decq %rax + jne .L04 + ALIGN_4 + +.L10: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc +#ifndef RT + leaq (C, LDC, 4), C +#endif + + testq $1, M + je .L20 + ALIGN_4 + +.L31: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movsd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movsd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movsd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movsd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + + movsd 16 * SIZE(BO), %xmm13 + movsd 24 * SIZE(BO), %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L35 + ALIGN_4 + +.L32: + mulsd %xmm8, %xmm9 + addsd %xmm9, %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movsd 2 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm9 + addsd %xmm9, %xmm1 + movsd 4 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm9 + mulsd 6 * SIZE(BO), %xmm8 + addsd %xmm9, %xmm2 + movsd 32 * SIZE(BO), %xmm9 + addsd %xmm8, %xmm3 + movsd 1 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm11 + addsd %xmm11, %xmm0 + movsd 10 * SIZE(BO), %xmm11 + mulsd %xmm8, %xmm11 + addsd %xmm11, %xmm1 + movsd 12 * SIZE(BO), %xmm11 + mulsd %xmm8, %xmm11 + mulsd 14 * SIZE(BO), %xmm8 + addsd %xmm11, %xmm2 + movsd 40 * SIZE(BO), %xmm11 + addsd %xmm8, %xmm3 + movsd 2 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm13 + addsd %xmm13, %xmm0 + movsd 18 * SIZE(BO), %xmm13 + mulsd %xmm8, %xmm13 + addsd %xmm13, %xmm1 + movsd 20 * SIZE(BO), %xmm13 + mulsd %xmm8, %xmm13 + mulsd 22 * SIZE(BO), %xmm8 + addsd %xmm13, %xmm2 + movsd 48 * SIZE(BO), %xmm13 + addsd %xmm8, %xmm3 + movsd 3 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm15 + addsd %xmm15, %xmm0 + movsd 26 * SIZE(BO), %xmm15 + mulsd %xmm8, %xmm15 + addsd %xmm15, %xmm1 + movsd 28 * SIZE(BO), %xmm15 + mulsd %xmm8, %xmm15 + mulsd 30 * SIZE(BO), %xmm8 + addsd %xmm15, %xmm2 + movsd 56 * SIZE(BO), %xmm15 + addsd %xmm8, %xmm3 + movsd 4 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm9 + addsd %xmm9, %xmm0 + movsd 34 * SIZE(BO), 
%xmm9 + mulsd %xmm8, %xmm9 + addsd %xmm9, %xmm1 + movsd 36 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm9 + mulsd 38 * SIZE(BO), %xmm8 + addsd %xmm9, %xmm2 + movsd 64 * SIZE(BO), %xmm9 + addsd %xmm8, %xmm3 + movsd 5 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm11 + addsd %xmm11, %xmm0 + movsd 42 * SIZE(BO), %xmm11 + mulsd %xmm8, %xmm11 + addsd %xmm11, %xmm1 + movsd 44 * SIZE(BO), %xmm11 + mulsd %xmm8, %xmm11 + mulsd 46 * SIZE(BO), %xmm8 + addsd %xmm11, %xmm2 + movsd 72 * SIZE(BO), %xmm11 + addsd %xmm8, %xmm3 + movsd 6 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm13 + addsd %xmm13, %xmm0 + movsd 50 * SIZE(BO), %xmm13 + mulsd %xmm8, %xmm13 + addsd %xmm13, %xmm1 + movsd 52 * SIZE(BO), %xmm13 + mulsd %xmm8, %xmm13 + mulsd 54 * SIZE(BO), %xmm8 + addsd %xmm13, %xmm2 + movsd 80 * SIZE(BO), %xmm13 + addsd %xmm8, %xmm3 + movsd 7 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm15 + addsd %xmm15, %xmm0 + movsd 58 * SIZE(BO), %xmm15 + mulsd %xmm8, %xmm15 + addsd %xmm15, %xmm1 + movsd 60 * SIZE(BO), %xmm15 + mulsd %xmm8, %xmm15 + mulsd 62 * SIZE(BO), %xmm8 + addsd %xmm15, %xmm2 + movsd 88 * SIZE(BO), %xmm15 + addsd %xmm8, %xmm3 + movsd 8 * SIZE(AO), %xmm8 + + addq $ 8 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulsd %xmm8, %xmm9 + addsd %xmm9, %xmm0 + movsd 2 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm9 + addsd %xmm9, %xmm1 + movsd 4 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm9 + mulsd 6 * SIZE(BO), %xmm8 + addsd %xmm9, %xmm2 + movsd 8 * SIZE(BO), %xmm9 + addsd %xmm8, %xmm3 + movsd 1 * SIZE(AO), %xmm8 + + addq $1 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(B), %xmm4 + movsd 1 * SIZE(B), %xmm5 + movsd 2 * SIZE(B), %xmm6 + movsd 3 * SIZE(B), %xmm7 +#else + movsd 0 * SIZE(AO), %xmm4 + movsd 1 * SIZE(AO), %xmm5 + movsd 2 * SIZE(AO), %xmm6 + movsd 3 * SIZE(AO), %xmm7 +#endif + + subsd %xmm0, %xmm4 + subsd %xmm1, %xmm5 + subsd %xmm2, %xmm6 + subsd %xmm3, %xmm7 + +#ifdef LN + movsd 0 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 + mulsd %xmm0, %xmm6 + mulsd %xmm0, %xmm7 +#endif + +#ifdef LT + movsd 0 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 + mulsd %xmm0, %xmm6 + mulsd %xmm0, %xmm7 +#endif + +#ifdef RN + mulsd 0 * SIZE(B), %xmm4 + movlpd 1 * SIZE(B), %xmm1 + mulsd %xmm4, %xmm1 + subsd %xmm1, %xmm5 + movlpd 2 * SIZE(B), %xmm2 + mulsd %xmm4, %xmm2 + subsd %xmm2, %xmm6 + movlpd 3 * SIZE(B), %xmm3 + mulsd %xmm4, %xmm3 + subsd %xmm3, %xmm7 + + mulsd 5 * SIZE(B), %xmm5 + movlpd 6 * SIZE(B), %xmm1 + mulsd %xmm5, %xmm1 + subsd %xmm1, %xmm6 + movlpd 7 * SIZE(B), %xmm2 + mulsd %xmm5, %xmm2 + subsd %xmm2, %xmm7 + + mulsd 10 * SIZE(B), %xmm6 + movlpd 11 * SIZE(B), %xmm1 + mulsd %xmm6, %xmm1 + subsd %xmm1, %xmm7 + + mulsd 15 * SIZE(B), %xmm7 +#endif + +#ifdef RT + mulsd 15 * SIZE(B), %xmm7 + + movlpd 14 * SIZE(B), %xmm1 + mulsd %xmm7, %xmm1 + subsd %xmm1, %xmm6 + movlpd 13 * SIZE(B), %xmm2 + mulsd %xmm7, %xmm2 + subsd %xmm2, %xmm5 + movlpd 12 * SIZE(B), %xmm3 + mulsd %xmm7, %xmm3 + subsd %xmm3, %xmm4 + + mulsd 10 * SIZE(B), %xmm6 + + movlpd 9 * SIZE(B), %xmm1 + 
mulsd %xmm6, %xmm1 + subsd %xmm1, %xmm5 + movlpd 8 * SIZE(B), %xmm2 + mulsd %xmm6, %xmm2 + subsd %xmm2, %xmm4 + + mulsd 5 * SIZE(B), %xmm5 + + movlpd 4 * SIZE(B), %xmm1 + mulsd %xmm5, %xmm1 + subsd %xmm1, %xmm4 + + mulsd 0 * SIZE(B), %xmm4 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + movsd %xmm4, 0 * SIZE(CO1) + movsd %xmm5, 0 * SIZE(CO2) + movsd %xmm6, 0 * SIZE(CO1, LDC, 2) + movsd %xmm7, 0 * SIZE(CO2, LDC, 2) + +#if defined(LN) || defined(LT) + movsd %xmm4, 0 * SIZE(B) + movsd %xmm5, 1 * SIZE(B) + movsd %xmm6, 2 * SIZE(B) + movsd %xmm7, 3 * SIZE(B) + + movsd %xmm4, 0 * SIZE(BO) + movsd %xmm4, 1 * SIZE(BO) + movsd %xmm5, 2 * SIZE(BO) + movsd %xmm5, 3 * SIZE(BO) + movsd %xmm6, 4 * SIZE(BO) + movsd %xmm6, 5 * SIZE(BO) + movsd %xmm7, 6 * SIZE(BO) + movsd %xmm7, 7 * SIZE(BO) +#else + movsd %xmm4, 0 * SIZE(AO) + movsd %xmm5, 1 * SIZE(AO) + movsd %xmm6, 2 * SIZE(AO) + movsd %xmm7, 3 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L20: + testq $2, M + je .L30 + ALIGN_4 + +.L21: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movapd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + + movapd 16 * SIZE(BO), %xmm13 + movapd 24 * SIZE(BO), %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L25 + ALIGN_4 + +.L22: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm2 + movapd 32 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm3 + movapd 2 * SIZE(AO), %xmm8 + + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movapd 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm1 + movapd 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + mulpd 14 * SIZE(BO), %xmm8 + addpd %xmm11, %xmm2 + movapd 40 * SIZE(BO), %xmm11 + addpd %xmm8, %xmm3 + movapd 4 * SIZE(AO), %xmm8 + + mulpd %xmm8, %xmm13 + addpd %xmm13, %xmm0 + movapd 18 * SIZE(BO), %xmm13 + mulpd %xmm8, %xmm13 + addpd %xmm13, %xmm1 + movapd 20 * SIZE(BO), %xmm13 + mulpd %xmm8, %xmm13 + mulpd 22 * SIZE(BO), %xmm8 + addpd %xmm13, %xmm2 + movapd 48 * SIZE(BO), %xmm13 + addpd %xmm8, %xmm3 + movapd 6 * SIZE(AO), %xmm8 + + mulpd %xmm8, %xmm15 + addpd %xmm15, %xmm0 + movapd 26 * SIZE(BO), %xmm15 + mulpd %xmm8, %xmm15 + addpd %xmm15, %xmm1 + movapd 28 * SIZE(BO), %xmm15 + mulpd %xmm8, %xmm15 + mulpd 30 * SIZE(BO), %xmm8 + addpd %xmm15, %xmm2 + movapd 56 * SIZE(BO), %xmm15 + addpd %xmm8, %xmm3 + movapd 16 * SIZE(AO), %xmm8 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movapd 34 * SIZE(BO), %xmm9 + mulpd %xmm10, 
%xmm9 + addpd %xmm9, %xmm1 + movapd 36 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + mulpd 38 * SIZE(BO), %xmm10 + addpd %xmm9, %xmm2 + movapd 64 * SIZE(BO), %xmm9 + addpd %xmm10, %xmm3 + movapd 10 * SIZE(AO), %xmm10 + + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movapd 42 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm1 + movapd 44 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + mulpd 46 * SIZE(BO), %xmm10 + addpd %xmm11, %xmm2 + movapd 72 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm3 + movapd 12 * SIZE(AO), %xmm10 + + mulpd %xmm10, %xmm13 + addpd %xmm13, %xmm0 + movapd 50 * SIZE(BO), %xmm13 + mulpd %xmm10, %xmm13 + addpd %xmm13, %xmm1 + movapd 52 * SIZE(BO), %xmm13 + mulpd %xmm10, %xmm13 + mulpd 54 * SIZE(BO), %xmm10 + addpd %xmm13, %xmm2 + movapd 80 * SIZE(BO), %xmm13 + addpd %xmm10, %xmm3 + movapd 14 * SIZE(AO), %xmm10 + + mulpd %xmm10, %xmm15 + addpd %xmm15, %xmm0 + movapd 58 * SIZE(BO), %xmm15 + mulpd %xmm10, %xmm15 + addpd %xmm15, %xmm1 + movapd 60 * SIZE(BO), %xmm15 + mulpd %xmm10, %xmm15 + mulpd 62 * SIZE(BO), %xmm10 + addpd %xmm15, %xmm2 + movapd 88 * SIZE(BO), %xmm15 + addpd %xmm10, %xmm3 + movapd 24 * SIZE(AO), %xmm10 + + addq $16 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L29 + ALIGN_4 + +.L26: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm2 + movapd 8 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm3 + movapd 2 * SIZE(AO), %xmm8 + + addq $2 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L26 + ALIGN_4 + +.L29: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd %xmm2, %xmm10 + unpcklpd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm10 + + movapd 0 * SIZE(B), %xmm1 + movapd 2 * SIZE(B), %xmm3 + movapd 4 * SIZE(B), %xmm5 + movapd 6 * SIZE(B), %xmm7 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm3 + subpd %xmm8, %xmm5 + subpd %xmm10, %xmm7 +#else + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm10 + movapd 4 * SIZE(AO), %xmm12 + movapd 6 * SIZE(AO), %xmm14 + + subpd %xmm0, %xmm8 + subpd %xmm1, %xmm10 + subpd %xmm2, %xmm12 + subpd %xmm3, %xmm14 +#endif + +#ifdef LN + movlpd 3 * SIZE(AO), %xmm0 + movhpd 3 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + mulpd %xmm0, %xmm7 + + movlpd 2 * SIZE(AO), %xmm2 + movhpd 2 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm1 + movlpd 2 * SIZE(AO), %xmm2 + movhpd 2 * SIZE(AO), %xmm2 + mulpd %xmm7, %xmm2 + subpd %xmm2, %xmm3 + + movlpd 0 * SIZE(AO), %xmm0 + movhpd 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm3 +#endif + +#ifdef LT + movlpd 0 * SIZE(AO), %xmm0 + movhpd 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm3 + + movlpd 1 * SIZE(AO), %xmm2 + movhpd 1 * SIZE(AO), %xmm2 + mulpd %xmm1, %xmm2 + subpd %xmm2, %xmm5 + movlpd 1 * SIZE(AO), %xmm2 + movhpd 1 * SIZE(AO), %xmm2 + mulpd %xmm3, %xmm2 + subpd %xmm2, %xmm7 + + movlpd 3 * SIZE(AO), %xmm0 + movhpd 3 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + mulpd %xmm0, %xmm7 +#endif + +#ifdef RN + movlpd 0 
* SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm8 + + movlpd 1 * SIZE(B), %xmm1 + movhpd 1 * SIZE(B), %xmm1 + mulpd %xmm8, %xmm1 + subpd %xmm1, %xmm10 + movlpd 2 * SIZE(B), %xmm2 + movhpd 2 * SIZE(B), %xmm2 + mulpd %xmm8, %xmm2 + subpd %xmm2, %xmm12 + movlpd 3 * SIZE(B), %xmm3 + movhpd 3 * SIZE(B), %xmm3 + mulpd %xmm8, %xmm3 + subpd %xmm3, %xmm14 + + movlpd 5 * SIZE(B), %xmm0 + movhpd 5 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm10 + movlpd 6 * SIZE(B), %xmm1 + movhpd 6 * SIZE(B), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm12 + movlpd 7 * SIZE(B), %xmm2 + movhpd 7 * SIZE(B), %xmm2 + mulpd %xmm10, %xmm2 + subpd %xmm2, %xmm14 + + movlpd 10 * SIZE(B), %xmm0 + movhpd 10 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm12 + + movlpd 11 * SIZE(B), %xmm1 + movhpd 11 * SIZE(B), %xmm1 + mulpd %xmm12, %xmm1 + subpd %xmm1, %xmm14 + + movlpd 15 * SIZE(B), %xmm0 + movhpd 15 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm14 +#endif + +#ifdef RT + movlpd 15 * SIZE(B), %xmm0 + movhpd 15 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm14 + + movlpd 14 * SIZE(B), %xmm1 + movhpd 14 * SIZE(B), %xmm1 + mulpd %xmm14, %xmm1 + subpd %xmm1, %xmm12 + movlpd 13 * SIZE(B), %xmm2 + movhpd 13 * SIZE(B), %xmm2 + mulpd %xmm14, %xmm2 + subpd %xmm2, %xmm10 + movlpd 12 * SIZE(B), %xmm3 + movhpd 12 * SIZE(B), %xmm3 + mulpd %xmm14, %xmm3 + subpd %xmm3, %xmm8 + + movlpd 10 * SIZE(B), %xmm0 + movhpd 10 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm12 + movlpd 9 * SIZE(B), %xmm1 + movhpd 9 * SIZE(B), %xmm1 + mulpd %xmm12, %xmm1 + subpd %xmm1, %xmm10 + movlpd 8 * SIZE(B), %xmm2 + movhpd 8 * SIZE(B), %xmm2 + mulpd %xmm12, %xmm2 + subpd %xmm2, %xmm8 + + movlpd 5 * SIZE(B), %xmm0 + movhpd 5 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm10 + movlpd 4 * SIZE(B), %xmm1 + movhpd 4 * SIZE(B), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm8 + + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm8 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm1, 0 * SIZE(CO1) + movsd %xmm5, 1 * SIZE(CO1) + + movhpd %xmm1, 0 * SIZE(CO2) + movhpd %xmm5, 1 * SIZE(CO2) + + movsd %xmm3, 0 * SIZE(CO1, LDC, 2) + movsd %xmm7, 1 * SIZE(CO1, LDC, 2) + + movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 1 * SIZE(CO2, LDC, 2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) + + movsd %xmm12, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm12, 1 * SIZE(CO1, LDC, 2) + + movsd %xmm14, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm14, 1 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(B) + movapd %xmm3, 2 * SIZE(B) + movapd %xmm5, 4 * SIZE(B) + movapd %xmm7, 6 * SIZE(B) + + movlpd %xmm1, 0 * SIZE(BO) + movlpd %xmm1, 1 * SIZE(BO) + movhpd %xmm1, 2 * SIZE(BO) + movhpd %xmm1, 3 * SIZE(BO) + movlpd %xmm3, 4 * SIZE(BO) + movlpd %xmm3, 5 * SIZE(BO) + movhpd %xmm3, 6 * SIZE(BO) + movhpd %xmm3, 7 * SIZE(BO) + movlpd %xmm5, 8 * SIZE(BO) + movlpd %xmm5, 9 * SIZE(BO) + movhpd %xmm5, 10 * SIZE(BO) + movhpd %xmm5, 11 * SIZE(BO) + movlpd %xmm7, 12 * SIZE(BO) + movlpd %xmm7, 13 * SIZE(BO) + movhpd %xmm7, 14 * SIZE(BO) + movhpd %xmm7, 15 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm10, 2 * SIZE(AO) + movapd %xmm12, 4 * SIZE(AO) + movapd %xmm14, 6 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq 
BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + + +.L30: + movq M, I + sarq $2, I # i = (m >> 2) + jle .L39 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movapd 0 * SIZE(BO), %xmm9 + movapd 2 * SIZE(BO), %xmm11 + movapd 4 * SIZE(BO), %xmm13 + movapd 8 * SIZE(BO), %xmm15 + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 2 * SIZE(AO), %xmm10 + pxor %xmm1, %xmm1 + movapd 4 * SIZE(AO), %xmm12 + pxor %xmm2, %xmm2 + movapd 6 * SIZE(AO), %xmm14 + pxor %xmm3, %xmm3 + +#ifdef LN + PREFETCHW -4 * SIZE(CO1) + pxor %xmm4, %xmm4 + PREFETCHW -4 * SIZE(CO2) + pxor %xmm5, %xmm5 + PREFETCHW -4 * SIZE(CO1, LDC, 2) + pxor %xmm6, %xmm6 + PREFETCHW -4 * SIZE(CO2, LDC, 2) + pxor %xmm7, %xmm7 +#else + PREFETCHW 4 * SIZE(CO1) + pxor %xmm4, %xmm4 + PREFETCHW 4 * SIZE(CO2) + pxor %xmm5, %xmm5 + PREFETCHW 4 * SIZE(CO1, LDC, 2) + pxor %xmm6, %xmm6 + PREFETCHW 4 * SIZE(CO2, LDC, 2) + pxor %xmm7, %xmm7 +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-8, %rax + salq $4, %rax + je .L15 +.L1X: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + cmpq $64 * 2, %rax + jle .L12 + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + KERNEL1(16 * 3) + KERNEL2(16 * 3) + KERNEL3(16 * 3) + KERNEL4(16 * 3) + KERNEL5(16 * 3) + KERNEL6(16 * 3) + KERNEL7(16 * 3) + KERNEL8(16 * 3) + cmpq $64 * 4, %rax + jle .L12 + KERNEL1(16 * 4) + KERNEL2(16 * 4) + KERNEL3(16 * 4) + KERNEL4(16 * 4) + KERNEL5(16 * 4) + KERNEL6(16 * 4) + KERNEL7(16 * 4) + KERNEL8(16 * 4) + KERNEL1(16 * 5) + KERNEL2(16 * 5) + KERNEL3(16 * 5) + KERNEL4(16 * 5) + KERNEL5(16 * 5) + KERNEL6(16 * 5) + KERNEL7(16 * 5) + KERNEL8(16 * 5) + cmpq $64 * 6, %rax + jle .L12 + KERNEL1(16 * 6) + KERNEL2(16 * 6) + KERNEL3(16 * 6) + KERNEL4(16 * 6) + KERNEL5(16 * 6) + KERNEL6(16 * 6) + KERNEL7(16 * 6) + KERNEL8(16 * 6) + KERNEL1(16 * 7) + KERNEL2(16 * 7) + KERNEL3(16 * 7) + KERNEL4(16 * 7) + KERNEL5(16 * 7) + KERNEL6(16 * 7) + KERNEL7(16 * 7) + KERNEL8(16 * 7) + + addq $16 * 8 * SIZE, AO + addq $32 * 8 * SIZE, BO + subq $64 * 8, %rax + jg .L1X + +.L12: + leaq (AO, %rax, 2), AO # * 16 + leaq (BO, %rax, 4), BO # * 64 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L19 + ALIGN_4 + +.L16: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm2 + movapd 0 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm3 + movapd 4 * SIZE(AO), %xmm8 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm4 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm5 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + mulpd 6 * SIZE(BO), 
%xmm10 + addpd %xmm9, %xmm6 + movapd 8 * SIZE(BO), %xmm9 + addpd %xmm10, %xmm7 + movapd 6 * SIZE(AO), %xmm10 + + addq $4 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L16 + ALIGN_4 + +.L19: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd %xmm2, %xmm10 + unpcklpd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm10 + + movapd %xmm4, %xmm12 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm12 + + movapd %xmm6, %xmm14 + unpcklpd %xmm7, %xmm6 + unpckhpd %xmm7, %xmm14 + + movapd 0 * SIZE(B), %xmm1 + movapd 2 * SIZE(B), %xmm3 + movapd 4 * SIZE(B), %xmm5 + movapd 6 * SIZE(B), %xmm7 + movapd 8 * SIZE(B), %xmm9 + movapd 10 * SIZE(B), %xmm11 + movapd 12 * SIZE(B), %xmm13 + movapd 14 * SIZE(B), %xmm15 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm3 + subpd %xmm8, %xmm5 + subpd %xmm10, %xmm7 + subpd %xmm4, %xmm9 + subpd %xmm6, %xmm11 + subpd %xmm12, %xmm13 + subpd %xmm14, %xmm15 +#else + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm9 + movapd 4 * SIZE(AO), %xmm10 + movapd 6 * SIZE(AO), %xmm11 + + movapd 8 * SIZE(AO), %xmm12 + movapd 10 * SIZE(AO), %xmm13 + movapd 12 * SIZE(AO), %xmm14 + movapd 14 * SIZE(AO), %xmm15 + + subpd %xmm0, %xmm8 + subpd %xmm4, %xmm9 + subpd %xmm1, %xmm10 + subpd %xmm5, %xmm11 + subpd %xmm2, %xmm12 + subpd %xmm6, %xmm13 + subpd %xmm3, %xmm14 + subpd %xmm7, %xmm15 +#endif + +#ifdef LN + movlpd 15 * SIZE(AO), %xmm0 + movhpd 15 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm13 + mulpd %xmm0, %xmm15 + + movlpd 14 * SIZE(AO), %xmm2 + movhpd 14 * SIZE(AO), %xmm2 + mulpd %xmm13, %xmm2 + subpd %xmm2, %xmm9 + movlpd 14 * SIZE(AO), %xmm2 + movhpd 14 * SIZE(AO), %xmm2 + mulpd %xmm15, %xmm2 + subpd %xmm2, %xmm11 + + movlpd 13 * SIZE(AO), %xmm4 + movhpd 13 * SIZE(AO), %xmm4 + mulpd %xmm13, %xmm4 + subpd %xmm4, %xmm5 + movlpd 13 * SIZE(AO), %xmm4 + movhpd 13 * SIZE(AO), %xmm4 + mulpd %xmm15, %xmm4 + subpd %xmm4, %xmm7 + + movlpd 12 * SIZE(AO), %xmm6 + movhpd 12 * SIZE(AO), %xmm6 + mulpd %xmm13, %xmm6 + subpd %xmm6, %xmm1 + movlpd 12 * SIZE(AO), %xmm6 + movhpd 12 * SIZE(AO), %xmm6 + mulpd %xmm15, %xmm6 + subpd %xmm6, %xmm3 + + movlpd 10 * SIZE(AO), %xmm0 + movhpd 10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm9 + mulpd %xmm0, %xmm11 + + movlpd 9 * SIZE(AO), %xmm2 + movhpd 9 * SIZE(AO), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm5 + movlpd 9 * SIZE(AO), %xmm2 + movhpd 9 * SIZE(AO), %xmm2 + mulpd %xmm11, %xmm2 + subpd %xmm2, %xmm7 + + movlpd 8 * SIZE(AO), %xmm4 + movhpd 8 * SIZE(AO), %xmm4 + mulpd %xmm9, %xmm4 + subpd %xmm4, %xmm1 + movlpd 8 * SIZE(AO), %xmm4 + movhpd 8 * SIZE(AO), %xmm4 + mulpd %xmm11, %xmm4 + subpd %xmm4, %xmm3 + + movlpd 5 * SIZE(AO), %xmm0 + movhpd 5 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + mulpd %xmm0, %xmm7 + + movlpd 4 * SIZE(AO), %xmm2 + movhpd 4 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm1 + movlpd 4 * SIZE(AO), %xmm2 + movhpd 4 * SIZE(AO), %xmm2 + mulpd %xmm7, %xmm2 + subpd %xmm2, %xmm3 + + movlpd 0 * SIZE(AO), %xmm0 + movhpd 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm3 +#endif + +#ifdef LT + movlpd 0 * SIZE(AO), %xmm0 + movhpd 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm3 + + movlpd 1 * SIZE(AO), %xmm2 + movhpd 1 * SIZE(AO), %xmm2 + mulpd %xmm1, %xmm2 + subpd %xmm2, %xmm5 + + 
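+/* The movlpd/movhpd pairs below load the same scalar of the packed A factor
+   into both halves of an xmm register (SSE2 has no broadcast), so every
+   elimination step updates a pair of C columns at once. */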
movlpd 1 * SIZE(AO), %xmm2 + movhpd 1 * SIZE(AO), %xmm2 + mulpd %xmm3, %xmm2 + subpd %xmm2, %xmm7 + + movlpd 2 * SIZE(AO), %xmm4 + movhpd 2 * SIZE(AO), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm9 + movlpd 2 * SIZE(AO), %xmm4 + movhpd 2 * SIZE(AO), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm11 + + movlpd 3 * SIZE(AO), %xmm6 + movhpd 3 * SIZE(AO), %xmm6 + mulpd %xmm1, %xmm6 + subpd %xmm6, %xmm13 + movlpd 3 * SIZE(AO), %xmm6 + movhpd 3 * SIZE(AO), %xmm6 + mulpd %xmm3, %xmm6 + subpd %xmm6, %xmm15 + + movlpd 5 * SIZE(AO), %xmm0 + movhpd 5 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + mulpd %xmm0, %xmm7 + + movlpd 6 * SIZE(AO), %xmm2 + movhpd 6 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm9 + movlpd 6 * SIZE(AO), %xmm2 + movhpd 6 * SIZE(AO), %xmm2 + mulpd %xmm7, %xmm2 + subpd %xmm2, %xmm11 + + movlpd 7 * SIZE(AO), %xmm4 + movhpd 7 * SIZE(AO), %xmm4 + mulpd %xmm5, %xmm4 + subpd %xmm4, %xmm13 + movlpd 7 * SIZE(AO), %xmm4 + movhpd 7 * SIZE(AO), %xmm4 + mulpd %xmm7, %xmm4 + subpd %xmm4, %xmm15 + + movlpd 10 * SIZE(AO), %xmm0 + movhpd 10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm9 + mulpd %xmm0, %xmm11 + + movlpd 11 * SIZE(AO), %xmm2 + movhpd 11 * SIZE(AO), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm13 + movlpd 11 * SIZE(AO), %xmm2 + movhpd 11 * SIZE(AO), %xmm2 + mulpd %xmm11, %xmm2 + subpd %xmm2, %xmm15 + + movlpd 15 * SIZE(AO), %xmm0 + movhpd 15 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm13 + mulpd %xmm0, %xmm15 +#endif + + +#ifdef RN + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 + + movlpd 1 * SIZE(B), %xmm1 + movhpd 1 * SIZE(B), %xmm1 + mulpd %xmm8, %xmm1 + subpd %xmm1, %xmm10 + movlpd 1 * SIZE(B), %xmm1 + movhpd 1 * SIZE(B), %xmm1 + mulpd %xmm9, %xmm1 + subpd %xmm1, %xmm11 + + movlpd 2 * SIZE(B), %xmm2 + movhpd 2 * SIZE(B), %xmm2 + mulpd %xmm8, %xmm2 + subpd %xmm2, %xmm12 + movlpd 2 * SIZE(B), %xmm2 + movhpd 2 * SIZE(B), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm13 + + movlpd 3 * SIZE(B), %xmm3 + movhpd 3 * SIZE(B), %xmm3 + mulpd %xmm8, %xmm3 + subpd %xmm3, %xmm14 + movlpd 3 * SIZE(B), %xmm3 + movhpd 3 * SIZE(B), %xmm3 + mulpd %xmm9, %xmm3 + subpd %xmm3, %xmm15 + + movlpd 5 * SIZE(B), %xmm0 + movhpd 5 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 + + movlpd 6 * SIZE(B), %xmm1 + movhpd 6 * SIZE(B), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm12 + movlpd 6 * SIZE(B), %xmm1 + movhpd 6 * SIZE(B), %xmm1 + mulpd %xmm11, %xmm1 + subpd %xmm1, %xmm13 + + movlpd 7 * SIZE(B), %xmm2 + movhpd 7 * SIZE(B), %xmm2 + mulpd %xmm10, %xmm2 + subpd %xmm2, %xmm14 + movlpd 7 * SIZE(B), %xmm2 + movhpd 7 * SIZE(B), %xmm2 + mulpd %xmm11, %xmm2 + subpd %xmm2, %xmm15 + + movlpd 10 * SIZE(B), %xmm0 + movhpd 10 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm12 + mulpd %xmm0, %xmm13 + + movlpd 11 * SIZE(B), %xmm1 + movhpd 11 * SIZE(B), %xmm1 + mulpd %xmm12, %xmm1 + subpd %xmm1, %xmm14 + movlpd 11 * SIZE(B), %xmm1 + movhpd 11 * SIZE(B), %xmm1 + mulpd %xmm13, %xmm1 + subpd %xmm1, %xmm15 + + movlpd 15 * SIZE(B), %xmm0 + movhpd 15 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm14 + mulpd %xmm0, %xmm15 +#endif + +#ifdef RT + movlpd 15 * SIZE(B), %xmm0 + movhpd 15 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm14 + mulpd %xmm0, %xmm15 + + movlpd 14 * SIZE(B), %xmm1 + movhpd 14 * SIZE(B), %xmm1 + mulpd %xmm14, %xmm1 + subpd %xmm1, %xmm12 + movlpd 14 * SIZE(B), %xmm1 + movhpd 14 * SIZE(B), %xmm1 + mulpd %xmm15, %xmm1 + subpd %xmm1, %xmm13 + + movlpd 13 * SIZE(B), %xmm2 + movhpd 13 * SIZE(B), %xmm2 + mulpd %xmm14, %xmm2 + subpd %xmm2, %xmm10 + movlpd 13 * SIZE(B), %xmm2 + movhpd 13 * 
SIZE(B), %xmm2 + mulpd %xmm15, %xmm2 + subpd %xmm2, %xmm11 + + movlpd 12 * SIZE(B), %xmm3 + movhpd 12 * SIZE(B), %xmm3 + mulpd %xmm14, %xmm3 + subpd %xmm3, %xmm8 + movlpd 12 * SIZE(B), %xmm3 + movhpd 12 * SIZE(B), %xmm3 + mulpd %xmm15, %xmm3 + subpd %xmm3, %xmm9 + + movlpd 10 * SIZE(B), %xmm0 + movhpd 10 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm12 + mulpd %xmm0, %xmm13 + + movlpd 9 * SIZE(B), %xmm1 + movhpd 9 * SIZE(B), %xmm1 + mulpd %xmm12, %xmm1 + subpd %xmm1, %xmm10 + movlpd 9 * SIZE(B), %xmm1 + movhpd 9 * SIZE(B), %xmm1 + mulpd %xmm13, %xmm1 + subpd %xmm1, %xmm11 + + movlpd 8 * SIZE(B), %xmm2 + movhpd 8 * SIZE(B), %xmm2 + mulpd %xmm12, %xmm2 + subpd %xmm2, %xmm8 + movlpd 8 * SIZE(B), %xmm2 + movhpd 8 * SIZE(B), %xmm2 + mulpd %xmm13, %xmm2 + subpd %xmm2, %xmm9 + + movlpd 5 * SIZE(B), %xmm0 + movhpd 5 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 + + movlpd 4 * SIZE(B), %xmm1 + movhpd 4 * SIZE(B), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm8 + movlpd 4 * SIZE(B), %xmm1 + movhpd 4 * SIZE(B), %xmm1 + mulpd %xmm11, %xmm1 + subpd %xmm1, %xmm9 + + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm1, 0 * SIZE(CO1) + movsd %xmm5, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movsd %xmm13, 3 * SIZE(CO1) + + movhpd %xmm1, 0 * SIZE(CO2) + movhpd %xmm5, 1 * SIZE(CO2) + movhpd %xmm9, 2 * SIZE(CO2) + movhpd %xmm13, 3 * SIZE(CO2) + + movsd %xmm3, 0 * SIZE(CO1, LDC, 2) + movsd %xmm7, 1 * SIZE(CO1, LDC, 2) + movsd %xmm11, 2 * SIZE(CO1, LDC, 2) + movsd %xmm15, 3 * SIZE(CO1, LDC, 2) + + movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 1 * SIZE(CO2, LDC, 2) + movhpd %xmm11, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 3 * SIZE(CO2, LDC, 2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) + + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) + movsd %xmm11, 2 * SIZE(CO2) + movhpd %xmm11, 3 * SIZE(CO2) + + movsd %xmm12, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm12, 1 * SIZE(CO1, LDC, 2) + movsd %xmm13, 2 * SIZE(CO1, LDC, 2) + movhpd %xmm13, 3 * SIZE(CO1, LDC, 2) + + movsd %xmm14, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm14, 1 * SIZE(CO2, LDC, 2) + movsd %xmm15, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 3 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(B) + movapd %xmm3, 2 * SIZE(B) + movapd %xmm5, 4 * SIZE(B) + movapd %xmm7, 6 * SIZE(B) + movapd %xmm9, 8 * SIZE(B) + movapd %xmm11, 10 * SIZE(B) + movapd %xmm13, 12 * SIZE(B) + movapd %xmm15, 14 * SIZE(B) + + movlpd %xmm1, 0 * SIZE(BO) + movlpd %xmm1, 1 * SIZE(BO) + movhpd %xmm1, 2 * SIZE(BO) + movhpd %xmm1, 3 * SIZE(BO) + movlpd %xmm3, 4 * SIZE(BO) + movlpd %xmm3, 5 * SIZE(BO) + movhpd %xmm3, 6 * SIZE(BO) + movhpd %xmm3, 7 * SIZE(BO) + movlpd %xmm5, 8 * SIZE(BO) + movlpd %xmm5, 9 * SIZE(BO) + movhpd %xmm5, 10 * SIZE(BO) + movhpd %xmm5, 11 * SIZE(BO) + movlpd %xmm7, 12 * SIZE(BO) + movlpd %xmm7, 13 * SIZE(BO) + movhpd %xmm7, 14 * SIZE(BO) + movhpd %xmm7, 15 * SIZE(BO) + movlpd %xmm9, 16 * SIZE(BO) + movlpd %xmm9, 17 * SIZE(BO) + movhpd %xmm9, 18 * SIZE(BO) + movhpd %xmm9, 19 * SIZE(BO) + movlpd %xmm11, 20 * SIZE(BO) + movlpd %xmm11, 21 * SIZE(BO) + movhpd %xmm11, 22 * SIZE(BO) + movhpd %xmm11, 23 * SIZE(BO) + movlpd %xmm13, 24 * SIZE(BO) + movlpd %xmm13, 25 * SIZE(BO) + movhpd %xmm13, 26 * SIZE(BO) + movhpd %xmm13, 27 * SIZE(BO) + movlpd %xmm15, 28 * SIZE(BO) + movlpd %xmm15, 29 * 
SIZE(BO) + movhpd %xmm15, 30 * SIZE(BO) + movhpd %xmm15, 31 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm9, 2 * SIZE(AO) + movapd %xmm10, 4 * SIZE(AO) + movapd %xmm11, 6 * SIZE(AO) + movapd %xmm12, 8 * SIZE(AO) + movapd %xmm13, 10 * SIZE(AO) + movapd %xmm14, 12 * SIZE(AO) + movapd %xmm15, 14 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $16 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L11 + ALIGN_4 + +.L39: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + + decq J # j -- + jg .L01 + ALIGN_4 + +.L40: + testq $3, N + je .L999 + + testq $2, N + je .L80 + ALIGN_4 + +.L41: +/* Copying to Sub Buffer */ + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + leaq (, %rax, SIZE), %rax + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L43 + ALIGN_4 + +.L42: + PREFETCH 56 * SIZE(B) + + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm2 + movsd 3 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm4 + movsd 5 * SIZE(B), %xmm5 + movsd 6 * SIZE(B), %xmm6 + movsd 7 * SIZE(B), %xmm7 + + addq $ 8 * SIZE, B + addq $16 * SIZE, BO + + movsd %xmm0, -16 * SIZE(BO) + movsd %xmm0, -15 * SIZE(BO) + movsd %xmm1, -14 * SIZE(BO) + movsd %xmm1, -13 * SIZE(BO) + movsd %xmm2, -12 * SIZE(BO) + movsd %xmm2, -11 * SIZE(BO) + movsd %xmm3, -10 * SIZE(BO) + movsd %xmm3, -9 * SIZE(BO) + movsd %xmm4, -8 * SIZE(BO) + movsd %xmm4, -7 * SIZE(BO) + movsd %xmm5, -6 * SIZE(BO) + movsd %xmm5, -5 * SIZE(BO) + movsd %xmm6, -4 * SIZE(BO) + movsd %xmm6, -3 * SIZE(BO) + movsd %xmm7, -2 * SIZE(BO) + movsd %xmm7, -1 * SIZE(BO) + + decq %rax + jne .L42 + ALIGN_4 + +.L43: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L50 + ALIGN_4 + +.L44: + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm0, 1 * SIZE(BO) + movsd %xmm1, 2 * SIZE(BO) + movsd %xmm1, 3 * SIZE(BO) + + addq $2 * SIZE, B + addq $4 * SIZE, BO + decq %rax + jne .L44 + ALIGN_4 + +.L50: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc +#ifndef RT + leaq (C, LDC, 2), C +#endif + + testq $1, M + je .L60 + ALIGN_4 + +.L71: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), 
BO +#endif + + movsd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movsd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movsd 4 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movsd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + + movsd 16 * SIZE(BO), %xmm13 + movsd 24 * SIZE(BO), %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L75 + ALIGN_4 + +.L72: + mulsd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulsd 2 * SIZE(BO), %xmm8 + addsd %xmm9, %xmm0 + movsd 4 * SIZE(BO), %xmm9 + addsd %xmm8, %xmm1 + movsd 1 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm9 + mulsd 6 * SIZE(BO), %xmm8 + addsd %xmm9, %xmm2 + movsd 32 * SIZE(BO), %xmm9 + addsd %xmm8, %xmm3 + movsd 2 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm11 + mulsd 10 * SIZE(BO), %xmm8 + addsd %xmm11, %xmm0 + movsd 12 * SIZE(BO), %xmm11 + addsd %xmm8, %xmm1 + movsd 3 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm11 + mulsd 14 * SIZE(BO), %xmm8 + addsd %xmm11, %xmm2 + movsd 40 * SIZE(BO), %xmm11 + addsd %xmm8, %xmm3 + movsd 8 * SIZE(AO), %xmm8 + + mulsd %xmm10, %xmm13 + mulsd 18 * SIZE(BO), %xmm10 + addsd %xmm13, %xmm0 + movsd 20 * SIZE(BO), %xmm13 + addsd %xmm10, %xmm1 + movsd 5 * SIZE(AO), %xmm10 + + mulsd %xmm10, %xmm13 + mulsd 22 * SIZE(BO), %xmm10 + addsd %xmm13, %xmm2 + movsd 48 * SIZE(BO), %xmm13 + addsd %xmm10, %xmm3 + movsd 6 * SIZE(AO), %xmm10 + + mulsd %xmm10, %xmm15 + mulsd 26 * SIZE(BO), %xmm10 + addsd %xmm15, %xmm0 + movsd 28 * SIZE(BO), %xmm15 + addsd %xmm10, %xmm1 + movsd 7 * SIZE(AO), %xmm10 + + mulsd %xmm10, %xmm15 + mulsd 30 * SIZE(BO), %xmm10 + addsd %xmm15, %xmm2 + movsd 56 * SIZE(BO), %xmm15 + addsd %xmm10, %xmm3 + movsd 12 * SIZE(AO), %xmm10 + + addq $ 8 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulsd %xmm8, %xmm9 + mulsd 2 * SIZE(BO), %xmm8 + addsd %xmm9, %xmm0 + addsd %xmm8, %xmm1 + movsd 1 * SIZE(AO), %xmm8 + movsd 4 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L76 + ALIGN_4 + +.L78: + addsd %xmm2, %xmm0 + addsd %xmm3, %xmm1 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(B), %xmm4 + movsd 1 * SIZE(B), %xmm5 +#else + movsd 0 * SIZE(AO), %xmm4 + movsd 1 * SIZE(AO), %xmm5 +#endif + + subsd %xmm0, %xmm4 + subsd %xmm1, %xmm5 + +#ifdef LN + movsd 0 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 +#endif + +#ifdef LT + movsd 0 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 +#endif + +#ifdef RN + mulsd 0 * SIZE(B), %xmm4 + movsd 1 * SIZE(B), %xmm1 + mulsd %xmm4, %xmm1 + subsd %xmm1, %xmm5 + + mulsd 3 * SIZE(B), %xmm5 +#endif + +#ifdef RT + mulsd 3 * SIZE(B), %xmm5 + + movlpd 2 * SIZE(B), %xmm1 + mulsd %xmm5, %xmm1 + subsd %xmm1, %xmm4 + + mulsd 0 * SIZE(B), %xmm4 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + movsd %xmm4, 0 * SIZE(CO1) + movsd %xmm5, 0 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movsd %xmm4, 0 * SIZE(B) + movsd %xmm5, 1 * SIZE(B) + + movsd %xmm4, 0 * SIZE(BO) + movsd %xmm4, 1 * SIZE(BO) + movsd %xmm5, 2 * SIZE(BO) + movsd %xmm5, 3 * SIZE(BO) +#else + movsd %xmm4, 0 * 
SIZE(AO) + movsd %xmm5, 1 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $2 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L60: + testq $2, M + je .L70 + ALIGN_4 + +.L61: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movapd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + + movapd 16 * SIZE(BO), %xmm13 + movapd 24 * SIZE(BO), %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L65 + ALIGN_4 + +.L62: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm0 + movapd 4 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm1 + movapd 2 * SIZE(AO), %xmm8 + + mulpd %xmm8, %xmm9 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm2 + movapd 32 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm3 + movapd 4 * SIZE(AO), %xmm8 + + mulpd %xmm8, %xmm11 + mulpd 10 * SIZE(BO), %xmm8 + addpd %xmm11, %xmm0 + movapd 12 * SIZE(BO), %xmm11 + addpd %xmm8, %xmm1 + movapd 6 * SIZE(AO), %xmm8 + + mulpd %xmm8, %xmm11 + mulpd 14 * SIZE(BO), %xmm8 + addpd %xmm11, %xmm2 + movapd 40 * SIZE(BO), %xmm11 + addpd %xmm8, %xmm3 + movapd 16 * SIZE(AO), %xmm8 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm10, %xmm13 + mulpd 18 * SIZE(BO), %xmm10 + addpd %xmm13, %xmm0 + movapd 20 * SIZE(BO), %xmm13 + addpd %xmm10, %xmm1 + movapd 10 * SIZE(AO), %xmm10 + + mulpd %xmm10, %xmm13 + mulpd 22 * SIZE(BO), %xmm10 + addpd %xmm13, %xmm2 + movapd 48 * SIZE(BO), %xmm13 + addpd %xmm10, %xmm3 + movapd 12 * SIZE(AO), %xmm10 + + mulpd %xmm10, %xmm15 + mulpd 26 * SIZE(BO), %xmm10 + addpd %xmm15, %xmm0 + movapd 28 * SIZE(BO), %xmm15 + addpd %xmm10, %xmm1 + movapd 14 * SIZE(AO), %xmm10 + + mulpd %xmm10, %xmm15 + mulpd 30 * SIZE(BO), %xmm10 + addpd %xmm15, %xmm2 + movapd 56 * SIZE(BO), %xmm15 + addpd %xmm10, %xmm3 + movapd 24 * SIZE(AO), %xmm10 + + addq $16 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L62 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L69 + ALIGN_4 + +.L66: + mulpd %xmm8, %xmm9 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm0 + movapd 4 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm1 + movapd 2 * SIZE(AO), %xmm8 + + addq $2 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L66 + ALIGN_4 + +.L69: + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd 0 
* SIZE(B), %xmm1 + movapd 2 * SIZE(B), %xmm5 + + subpd %xmm0, %xmm1 + subpd %xmm8, %xmm5 +#else + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm10 + + subpd %xmm0, %xmm8 + subpd %xmm1, %xmm10 +#endif + + +#ifdef LN + movlpd 3 * SIZE(AO), %xmm0 + movhpd 3 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + + movlpd 2 * SIZE(AO), %xmm2 + movhpd 2 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm1 + + movlpd 0 * SIZE(AO), %xmm0 + movhpd 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 +#endif + +#ifdef LT + movlpd 0 * SIZE(AO), %xmm0 + movhpd 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + + movlpd 1 * SIZE(AO), %xmm2 + movhpd 1 * SIZE(AO), %xmm2 + mulpd %xmm1, %xmm2 + subpd %xmm2, %xmm5 + + movlpd 3 * SIZE(AO), %xmm0 + movhpd 3 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm8 + + movlpd 1 * SIZE(B), %xmm1 + movhpd 1 * SIZE(B), %xmm1 + mulpd %xmm8, %xmm1 + subpd %xmm1, %xmm10 + + movlpd 3 * SIZE(B), %xmm0 + movhpd 3 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm10 +#endif + +#ifdef RT + movlpd 3 * SIZE(B), %xmm0 + movhpd 3 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm10 + + movlpd 2 * SIZE(B), %xmm1 + movhpd 2 * SIZE(B), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm8 + + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm8 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm1, 0 * SIZE(CO1) + movsd %xmm5, 1 * SIZE(CO1) + + movhpd %xmm1, 0 * SIZE(CO2) + movhpd %xmm5, 1 * SIZE(CO2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(B) + movapd %xmm5, 2 * SIZE(B) + + movlpd %xmm1, 0 * SIZE(BO) + movlpd %xmm1, 1 * SIZE(BO) + movhpd %xmm1, 2 * SIZE(BO) + movhpd %xmm1, 3 * SIZE(BO) + movlpd %xmm5, 4 * SIZE(BO) + movlpd %xmm5, 5 * SIZE(BO) + movhpd %xmm5, 6 * SIZE(BO) + movhpd %xmm5, 7 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm10, 2 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L70: + movq M, I + sarq $2, I # i = (m >> 2) + jle .L79 + ALIGN_4 + +.L51: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(BO), %xmm11 + pxor %xmm5, %xmm5 + + movapd 16 * SIZE(AO), %xmm12 + movapd 16 * SIZE(BO), %xmm13 + movapd 24 * SIZE(AO), %xmm14 + movapd 24 * SIZE(BO), %xmm15 + +#ifdef LN + PREFETCHW -4 * SIZE(CO1) + PREFETCHW -4 * SIZE(CO2) +#else + PREFETCHW 4 * SIZE(CO1) + PREFETCHW 4 * SIZE(CO2) +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L55 + ALIGN_4 + +.L52: + mulpd %xmm8, 
%xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm1 + movapd 2 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm4 + movapd 4 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm5 + movapd 4 * SIZE(AO), %xmm8 + + mulpd %xmm8, %xmm9 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm0 + movapd 4 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm1 + movapd 6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm4 + movapd 32 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm5 + movapd 32 * SIZE(AO), %xmm8 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm10, %xmm11 + mulpd 10 * SIZE(BO), %xmm10 + addpd %xmm11, %xmm0 + movapd 8 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm1 + movapd 10 * SIZE(AO), %xmm10 + mulpd %xmm10, %xmm11 + mulpd 10 * SIZE(BO), %xmm10 + addpd %xmm11, %xmm4 + movapd 12 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm5 + movapd 12 * SIZE(AO), %xmm10 + + mulpd %xmm10, %xmm11 + mulpd 14 * SIZE(BO), %xmm10 + addpd %xmm11, %xmm0 + movapd 12 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm1 + movapd 14 * SIZE(AO), %xmm10 + mulpd %xmm10, %xmm11 + mulpd 14 * SIZE(BO), %xmm10 + addpd %xmm11, %xmm4 + movapd 40 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm5 + movapd 40 * SIZE(AO), %xmm10 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + mulpd %xmm12, %xmm13 + mulpd 18 * SIZE(BO), %xmm12 + addpd %xmm13, %xmm0 + movapd 16 * SIZE(BO), %xmm13 + addpd %xmm12, %xmm1 + movapd 18 * SIZE(AO), %xmm12 + mulpd %xmm12, %xmm13 + mulpd 18 * SIZE(BO), %xmm12 + addpd %xmm13, %xmm4 + movapd 20 * SIZE(BO), %xmm13 + addpd %xmm12, %xmm5 + movapd 20 * SIZE(AO), %xmm12 + + mulpd %xmm12, %xmm13 + mulpd 22 * SIZE(BO), %xmm12 + addpd %xmm13, %xmm0 + movapd 20 * SIZE(BO), %xmm13 + addpd %xmm12, %xmm1 + movapd 22 * SIZE(AO), %xmm12 + mulpd %xmm12, %xmm13 + mulpd 22 * SIZE(BO), %xmm12 + addpd %xmm13, %xmm4 + movapd 48 * SIZE(BO), %xmm13 + addpd %xmm12, %xmm5 + movapd 48 * SIZE(AO), %xmm12 + + PREFETCH (PREFETCHSIZE + 24) * SIZE(AO) + mulpd %xmm14, %xmm15 + mulpd 26 * SIZE(BO), %xmm14 + addpd %xmm15, %xmm0 + movapd 24 * SIZE(BO), %xmm15 + addpd %xmm14, %xmm1 + movapd 26 * SIZE(AO), %xmm14 + mulpd %xmm14, %xmm15 + mulpd 26 * SIZE(BO), %xmm14 + addpd %xmm15, %xmm4 + movapd 28 * SIZE(BO), %xmm15 + addpd %xmm14, %xmm5 + movapd 28 * SIZE(AO), %xmm14 + + mulpd %xmm14, %xmm15 + mulpd 30 * SIZE(BO), %xmm14 + addpd %xmm15, %xmm0 + movapd 28 * SIZE(BO), %xmm15 + addpd %xmm14, %xmm1 + movapd 30 * SIZE(AO), %xmm14 + mulpd %xmm14, %xmm15 + mulpd 30 * SIZE(BO), %xmm14 + addpd %xmm15, %xmm4 + movapd 56 * SIZE(BO), %xmm15 + addpd %xmm14, %xmm5 + movapd 56 * SIZE(AO), %xmm14 + + addq $32 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L59 + ALIGN_4 + +.L56: + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm1 + movapd 2 * SIZE(AO), %xmm8 + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm5 + movapd 4 * SIZE(AO), %xmm8 + + addq $4 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L56 + ALIGN_4 + +.L59: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq 
(B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd %xmm4, %xmm12 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm12 + + movapd 0 * SIZE(B), %xmm1 + movapd 2 * SIZE(B), %xmm5 + movapd 4 * SIZE(B), %xmm9 + movapd 6 * SIZE(B), %xmm13 + + subpd %xmm0, %xmm1 + subpd %xmm8, %xmm5 + subpd %xmm4, %xmm9 + subpd %xmm12, %xmm13 +#else + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm9 + movapd 4 * SIZE(AO), %xmm10 + movapd 6 * SIZE(AO), %xmm11 + + subpd %xmm0, %xmm8 + subpd %xmm4, %xmm9 + subpd %xmm1, %xmm10 + subpd %xmm5, %xmm11 +#endif + +#ifdef LN + movlpd 15 * SIZE(AO), %xmm0 + movhpd 15 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm13 + movlpd 14 * SIZE(AO), %xmm2 + movhpd 14 * SIZE(AO), %xmm2 + mulpd %xmm13, %xmm2 + subpd %xmm2, %xmm9 + movlpd 13 * SIZE(AO), %xmm4 + movhpd 13 * SIZE(AO), %xmm4 + mulpd %xmm13, %xmm4 + subpd %xmm4, %xmm5 + movlpd 12 * SIZE(AO), %xmm6 + movhpd 12 * SIZE(AO), %xmm6 + mulpd %xmm13, %xmm6 + subpd %xmm6, %xmm1 + + movlpd 10 * SIZE(AO), %xmm0 + movhpd 10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm9 + movlpd 9 * SIZE(AO), %xmm2 + movhpd 9 * SIZE(AO), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm5 + movlpd 8 * SIZE(AO), %xmm4 + movhpd 8 * SIZE(AO), %xmm4 + mulpd %xmm9, %xmm4 + subpd %xmm4, %xmm1 + + movlpd 5 * SIZE(AO), %xmm0 + movhpd 5 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + movlpd 4 * SIZE(AO), %xmm2 + movhpd 4 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm1 + + movlpd 0 * SIZE(AO), %xmm0 + movhpd 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 +#endif + +#ifdef LT + movlpd 0 * SIZE(AO), %xmm0 + movhpd 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + movlpd 1 * SIZE(AO), %xmm2 + movhpd 1 * SIZE(AO), %xmm2 + mulpd %xmm1, %xmm2 + subpd %xmm2, %xmm5 + movlpd 2 * SIZE(AO), %xmm4 + movhpd 2 * SIZE(AO), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm9 + movlpd 3 * SIZE(AO), %xmm6 + movhpd 3 * SIZE(AO), %xmm6 + mulpd %xmm1, %xmm6 + subpd %xmm6, %xmm13 + + + movlpd 5 * SIZE(AO), %xmm0 + movhpd 5 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + + movlpd 6 * SIZE(AO), %xmm2 + movhpd 6 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm9 + movlpd 7 * SIZE(AO), %xmm4 + movhpd 7 * SIZE(AO), %xmm4 + mulpd %xmm5, %xmm4 + subpd %xmm4, %xmm13 + + movlpd 10 * SIZE(AO), %xmm0 + movhpd 10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm9 + movlpd 11 * SIZE(AO), %xmm2 + movhpd 11 * SIZE(AO), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm13 + + movlpd 15 * SIZE(AO), %xmm0 + movhpd 15 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm13 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 + + movlpd 1 * SIZE(B), %xmm1 + movhpd 1 * SIZE(B), %xmm1 + mulpd %xmm8, %xmm1 + subpd %xmm1, %xmm10 + movlpd 1 * SIZE(B), %xmm1 + movhpd 1 * SIZE(B), %xmm1 + mulpd %xmm9, %xmm1 + subpd %xmm1, %xmm11 + + movlpd 3 * SIZE(B), %xmm0 + movhpd 3 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 +#endif + +#ifdef RT + movlpd 3 * SIZE(B), %xmm0 + movhpd 3 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 + + movlpd 2 * SIZE(B), %xmm1 + movhpd 2 * SIZE(B), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm8 + movlpd 2 * SIZE(B), %xmm1 + movhpd 2 * SIZE(B), %xmm1 + mulpd %xmm11, %xmm1 + subpd %xmm1, %xmm9 + + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm1, 0 * SIZE(CO1) + movsd %xmm5, 1 * 
SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movsd %xmm13, 3 * SIZE(CO1) + + movhpd %xmm1, 0 * SIZE(CO2) + movhpd %xmm5, 1 * SIZE(CO2) + movhpd %xmm9, 2 * SIZE(CO2) + movhpd %xmm13, 3 * SIZE(CO2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) + + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) + movsd %xmm11, 2 * SIZE(CO2) + movhpd %xmm11, 3 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(B) + movapd %xmm5, 2 * SIZE(B) + movapd %xmm9, 4 * SIZE(B) + movapd %xmm13, 6 * SIZE(B) + + movlpd %xmm1, 0 * SIZE(BO) + movlpd %xmm1, 1 * SIZE(BO) + movhpd %xmm1, 2 * SIZE(BO) + movhpd %xmm1, 3 * SIZE(BO) + movlpd %xmm5, 4 * SIZE(BO) + movlpd %xmm5, 5 * SIZE(BO) + movhpd %xmm5, 6 * SIZE(BO) + movhpd %xmm5, 7 * SIZE(BO) + movlpd %xmm9, 8 * SIZE(BO) + movlpd %xmm9, 9 * SIZE(BO) + movhpd %xmm9, 10 * SIZE(BO) + movhpd %xmm9, 11 * SIZE(BO) + movlpd %xmm13, 12 * SIZE(BO) + movlpd %xmm13, 13 * SIZE(BO) + movhpd %xmm13, 14 * SIZE(BO) + movhpd %xmm13, 15 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm9, 2 * SIZE(AO) + movapd %xmm10, 4 * SIZE(AO) + movapd %xmm11, 6 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L51 + ALIGN_4 + +.L79: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L80: + testq $1, N + je .L999 + ALIGN_4 + +.L81: +/* Copying to Sub Buffer */ + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + leaq (, %rax, SIZE), %rax + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + jle .L83 + ALIGN_4 + +.L82: + PREFETCH 56 * SIZE(B) + + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm2 + movsd 3 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm4 + movsd 5 * SIZE(B), %xmm5 + movsd 6 * SIZE(B), %xmm6 + movsd 7 * SIZE(B), %xmm7 + + addq $ 8 * SIZE, B + addq $16 * SIZE, BO + + movsd %xmm0, -16 * SIZE(BO) + movsd %xmm0, -15 * SIZE(BO) + movsd %xmm1, -14 * SIZE(BO) + movsd %xmm1, -13 * SIZE(BO) + movsd %xmm2, -12 * SIZE(BO) + movsd %xmm2, -11 * SIZE(BO) + movsd %xmm3, -10 * SIZE(BO) + movsd %xmm3, -9 * SIZE(BO) + movsd %xmm4, -8 * SIZE(BO) + movsd %xmm4, -7 * SIZE(BO) + movsd %xmm5, -6 * SIZE(BO) + movsd %xmm5, -5 * SIZE(BO) + movsd %xmm6, -4 * SIZE(BO) + movsd %xmm6, -3 * SIZE(BO) + movsd %xmm7, -2 * SIZE(BO) + movsd %xmm7, -1 * SIZE(BO) + + decq %rax + jne .L82 + ALIGN_4 + +.L83: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax + BRANCH + jle .L90 + ALIGN_4 + +.L84: + movsd 0 * SIZE(B), %xmm0 + + movsd %xmm0, 0 * SIZE(BO) + 
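+/* .L84 (tail of the copy loop): each value of the last B column is stored
+   twice, keeping BUFFER in the duplicated layout that the paired movapd
+   loads in the kernels below expect. */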
movsd %xmm0, 1 * SIZE(BO) + + addq $1 * SIZE, B + addq $2 * SIZE, BO + decq %rax + jne .L84 + ALIGN_4 + +.L90: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + subq LDC, C +#endif + + movq C, CO1 # coffset1 = c +#ifndef RT + addq LDC, C +#endif + + testq $1, M + je .L100 + ALIGN_4 + +.L111: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movsd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movsd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movsd 4 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movsd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L115 + ALIGN_4 + +.L112: + mulsd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movsd 1 * SIZE(AO), %xmm8 + addsd %xmm9, %xmm0 + movsd 16 * SIZE(BO), %xmm9 + mulsd 2 * SIZE(BO), %xmm8 + addsd %xmm8, %xmm1 + movsd 2 * SIZE(AO), %xmm8 + mulsd 4 * SIZE(BO), %xmm8 + addsd %xmm8, %xmm2 + movsd 3 * SIZE(AO), %xmm8 + mulsd 6 * SIZE(BO), %xmm8 + addsd %xmm8, %xmm3 + movsd 8 * SIZE(AO), %xmm8 + mulsd %xmm10, %xmm11 + movsd 5 * SIZE(AO), %xmm10 + addsd %xmm11, %xmm0 + movsd 24 * SIZE(BO), %xmm11 + mulsd 10 * SIZE(BO), %xmm10 + addsd %xmm10, %xmm1 + movsd 6 * SIZE(AO), %xmm10 + mulsd 12 * SIZE(BO), %xmm10 + addsd %xmm10, %xmm2 + movsd 7 * SIZE(AO), %xmm10 + mulsd 14 * SIZE(BO), %xmm10 + addsd %xmm10, %xmm3 + movsd 12 * SIZE(AO), %xmm10 + + addq $ 8 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L112 + ALIGN_4 + +.L115: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulsd %xmm8, %xmm9 + movsd 1 * SIZE(AO), %xmm8 + addsd %xmm9, %xmm0 + movsd 2 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L116 + ALIGN_4 + +.L118: + addsd %xmm2, %xmm0 + addsd %xmm3, %xmm1 + addsd %xmm1, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(B), %xmm2 + subsd %xmm0, %xmm2 +#else + movsd 0 * SIZE(AO), %xmm2 + subsd %xmm0, %xmm2 +#endif + +#ifdef LN + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 +#endif + +#ifdef LT + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 +#endif + +#ifdef RN + movsd 0 * SIZE(B), %xmm0 + mulsd %xmm0, %xmm2 +#endif + +#ifdef RT + movsd 0 * SIZE(B), %xmm0 + mulsd %xmm0, %xmm2 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) +#else + movsd %xmm2, 0 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(B) + + movlpd %xmm2, 0 * SIZE(BO) + movlpd %xmm2, 1 * SIZE(BO) +#else + movsd %xmm2, 0 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $1 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq 
$1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L100: + testq $2, M + je .L110 + ALIGN_4 + +.L101: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movapd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L105 + ALIGN_4 + +.L102: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movapd 2 * SIZE(AO), %xmm8 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm0 + movapd 16 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm1 + movapd 4 * SIZE(AO), %xmm8 + mulpd 4 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm2 + movapd 6 * SIZE(AO), %xmm8 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm3 + movapd 16 * SIZE(AO), %xmm8 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd %xmm10, %xmm11 + movapd 10 * SIZE(AO), %xmm10 + mulpd 10 * SIZE(BO), %xmm10 + addpd %xmm11, %xmm0 + movapd 24 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm1 + movapd 12 * SIZE(AO), %xmm10 + mulpd 12 * SIZE(BO), %xmm10 + addpd %xmm10, %xmm2 + movapd 14 * SIZE(AO), %xmm10 + mulpd 14 * SIZE(BO), %xmm10 + addpd %xmm10, %xmm3 + movapd 24 * SIZE(AO), %xmm10 + + addq $16 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L102 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L109 + ALIGN_4 + +.L106: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movapd 2 * SIZE(AO), %xmm8 + movapd 2 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L106 + ALIGN_4 + +.L109: + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm2 + subpd %xmm0, %xmm2 +#else + movapd 0 * SIZE(AO), %xmm2 + subpd %xmm0, %xmm2 +#endif + +#ifdef LN + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movsd 3 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 2 * SIZE(AO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm0, %xmm2 +#endif + +#ifdef LT + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 + + movsd 1 * SIZE(AO), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + + movsd 3 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm0 + + unpcklpd %xmm0, %xmm2 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef RT + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 1 * SIZE(CO1) +#else + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 1 * SIZE(CO1) +#endif + +#if defined(LN) || 
defined(LT) + movapd %xmm2, 0 * SIZE(B) + + movlpd %xmm2, 0 * SIZE(BO) + movlpd %xmm2, 1 * SIZE(BO) + movhpd %xmm2, 2 * SIZE(BO) + movhpd %xmm2, 3 * SIZE(BO) +#else + movapd %xmm2, 0 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $2 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L110: + movq M, I + sarq $2, I # i = (m >> 2) + jle .L119 + ALIGN_4 + +.L91: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movapd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + + movapd 16 * SIZE(AO), %xmm12 + movapd 24 * SIZE(AO), %xmm14 + +#ifdef LN + PREFETCHW -4 * SIZE(CO1) +#else + PREFETCHW 4 * SIZE(CO1) +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L95 + ALIGN_4 + +.L92: + mulpd %xmm9, %xmm8 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd 2 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm0 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm8 + mulpd 6 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm2 + movapd 32 * SIZE(AO), %xmm8 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + addpd %xmm9, %xmm3 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm10 + mulpd 10 * SIZE(AO), %xmm9 + addpd %xmm10, %xmm0 + movapd 12 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movapd 6 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm10 + mulpd 14 * SIZE(AO), %xmm9 + addpd %xmm10, %xmm2 + movapd 40 * SIZE(AO), %xmm10 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + addpd %xmm9, %xmm3 + movapd 16 * SIZE(BO), %xmm9 + mulpd %xmm11, %xmm12 + mulpd 18 * SIZE(AO), %xmm11 + addpd %xmm12, %xmm0 + movapd 20 * SIZE(AO), %xmm12 + addpd %xmm11, %xmm1 + movapd 10 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm12 + mulpd 22 * SIZE(AO), %xmm11 + addpd %xmm12, %xmm2 + movapd 48 * SIZE(AO), %xmm12 + PREFETCH (PREFETCHSIZE + 24) * SIZE(AO) + addpd %xmm11, %xmm3 + movapd 12 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm14 + mulpd 26 * SIZE(AO), %xmm11 + addpd %xmm14, %xmm0 + movapd 28 * SIZE(AO), %xmm14 + addpd %xmm11, %xmm1 + movapd 14 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm14 + mulpd 30 * SIZE(AO), %xmm11 + addpd %xmm14, %xmm2 + movapd 56 * SIZE(AO), %xmm14 + addpd %xmm11, %xmm3 + movapd 24 * SIZE(BO), %xmm11 + + addq $32 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L92 + ALIGN_4 + +.L95: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L99 + ALIGN_4 + +.L96: + mulpd %xmm9, %xmm8 + mulpd 2 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm0 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movapd 2 * SIZE(BO), %xmm9 + + addq $4 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L96 + ALIGN_4 + +.L99: + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, 
%rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm2 + movapd 2 * SIZE(B), %xmm3 + + subpd %xmm0, %xmm2 + subpd %xmm1, %xmm3 +#else + movapd 0 * SIZE(AO), %xmm2 + movapd 2 * SIZE(AO), %xmm3 + + subpd %xmm0, %xmm2 + subpd %xmm1, %xmm3 +#endif + +#ifdef LN + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movapd %xmm3, %xmm1 + unpckhpd %xmm1, %xmm1 + + movsd 15 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm1 + + movsd 14 * SIZE(AO), %xmm5 + mulsd %xmm1, %xmm5 + subsd %xmm5, %xmm3 + movsd 13 * SIZE(AO), %xmm6 + mulsd %xmm1, %xmm6 + subsd %xmm6, %xmm0 + movsd 12 * SIZE(AO), %xmm7 + mulsd %xmm1, %xmm7 + subsd %xmm7, %xmm2 + + movsd 10 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm3 + + movsd 9 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm0 + movsd 8 * SIZE(AO), %xmm6 + mulsd %xmm3, %xmm6 + subsd %xmm6, %xmm2 + + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 4 * SIZE(AO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm3 +#endif + +#ifdef LT + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movapd %xmm3, %xmm1 + unpckhpd %xmm1, %xmm1 + + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 + + movsd 1 * SIZE(AO), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + movsd 2 * SIZE(AO), %xmm6 + mulsd %xmm2, %xmm6 + subsd %xmm6, %xmm3 + movsd 3 * SIZE(AO), %xmm7 + mulsd %xmm2, %xmm7 + subsd %xmm7, %xmm1 + + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm3 + movsd 7 * SIZE(AO), %xmm6 + mulsd %xmm0, %xmm6 + subsd %xmm6, %xmm1 + + movsd 10 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm3 + + movsd 11 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm1 + + movsd 15 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm1 + + unpcklpd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm3 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 +#endif + +#ifdef RT + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 1 * SIZE(CO1) + movsd %xmm3, 2 * SIZE(CO1) + movhpd %xmm3, 3 * SIZE(CO1) +#else + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 1 * SIZE(CO1) + movsd %xmm3, 2 * SIZE(CO1) + movhpd %xmm3, 3 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(B) + movapd %xmm3, 2 * SIZE(B) + + movlpd %xmm2, 0 * SIZE(BO) + movlpd %xmm2, 1 * SIZE(BO) + movhpd %xmm2, 2 * SIZE(BO) + movhpd %xmm2, 3 * SIZE(BO) + movlpd %xmm3, 4 * SIZE(BO) + movlpd %xmm3, 5 * SIZE(BO) + movhpd %xmm3, 6 * SIZE(BO) + movhpd %xmm3, 7 * SIZE(BO) +#else + movapd %xmm2, 0 * SIZE(AO) + movapd %xmm3, 2 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L91 + ALIGN_4 + +.L119: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 1), B +#endif + +#if defined(LT) || defined(RN) 
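+	/* LT/RN: advance B by (K - KK) elements, past the rows of this single-column panel already consumed above */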
+ movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 1), B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L999: + movq %rbx, %rsp + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/trsm_kernel_LN_4x4_sse3.S b/kernel/x86_64/trsm_kernel_LN_4x4_sse3.S new file mode 100644 index 0000000000..66a5e40d3d --- /dev/null +++ b/kernel/x86_64/trsm_kernel_LN_4x4_sse3.S @@ -0,0 +1,3873 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %rdi +#define N %rsi +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %r13 +#define BO %r14 +#define CO1 %r15 +#define CO2 %rbx +#define KK %rbp +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define J 56(%rsp) +#define KKK 64(%rsp) +#define AORIG 72(%rsp) + +#else + +#define STACKSIZE 272 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define J 232(%rsp) +#define KKK 240(%rsp) +#define AORIG 248(%rsp) + +#endif + +#define PREFETCH prefetcht1 +#define PREFETCHSIZE (16 * 12 + 3) +#define PREFETCH_R (4 * 4 + 0) + +#define KERNEL1(address) \ + mulpd %xmm8, %xmm9 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 2 * SIZE(AO);\ + addpd %xmm9, %xmm0;\ + movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm1;\ + movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm2;\ + movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 2 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + addpd %xmm9, %xmm3;\ + movddup 0 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL2(address) \ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm4;\ + movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm5;\ + movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm6;\ + movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 4 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + addpd %xmm9, %xmm7;\ + movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL3(address) \ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm0;\ + movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm1;\ + movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm2;\ + movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 6 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + addpd %xmm9, %xmm3;\ + movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL4(address) \ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm4;\ + movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm5;\ + movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm6;\ + movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 32 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + addpd %xmm9, %xmm7;\ + movddup 32 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL5(address) \ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm0;\ + movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm1;\ + movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm2;\ + movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 10 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + addpd %xmm11, %xmm3;\ + movddup 8 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL6(address) \ 
+ mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm4;\ + movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm5;\ + movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm6;\ + movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 12 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + addpd %xmm11, %xmm7;\ + movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL7(address) \ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm0;\ + movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm1;\ + movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm2;\ + movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 14 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + addpd %xmm11, %xmm3;\ + movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL8(address) \ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm4;\ + movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm5;\ + movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm6;\ + movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 40 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + addpd %xmm11, %xmm7;\ + movddup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL9(address) \ + mulpd %xmm12, %xmm13;\ + PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 2 * SIZE(AO);\ + addpd %xmm13, %xmm0;\ + movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm1;\ + movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm2;\ + movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 18 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + addpd %xmm13, %xmm3;\ + movddup 16 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL10(address) \ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm4;\ + movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm5;\ + movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm6;\ + movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 20 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + addpd %xmm13, %xmm7;\ + movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL11(address) \ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm0;\ + movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm1;\ + movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm2;\ + movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 22 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + addpd %xmm13, %xmm3;\ + movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL12(address) \ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm4;\ + movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm5;\ + movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm6;\ + movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 48 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + addpd %xmm13, %xmm7;\ + movddup 48 * 
SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL13(address) \ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm0;\ + movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm1;\ + movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm2;\ + movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 26 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + addpd %xmm15, %xmm3;\ + movddup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL14(address) \ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm4;\ + movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm5;\ + movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm6;\ + movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 28 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + addpd %xmm15, %xmm7;\ + movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL15(address) \ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm0;\ + movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm1;\ + movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm2;\ + movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 30 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + addpd %xmm15, %xmm3;\ + movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL16(address) \ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm4;\ + movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm5;\ + movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm6;\ + movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 56 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + addpd %xmm15, %xmm7;\ + movddup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, M + movq ARG2, N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#endif + + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + movq KK, OFFSET + + leaq (, LDC, SIZE), LDC + +#ifdef LN + leaq (, M, SIZE), %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + leaq (, N, SIZE), %rax + imulq K, %rax + addq %rax, B + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + movq N, J + sarq $2, J # j = (n >> 2) + jle .L40 + ALIGN_4 + +.L10: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 4), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + movq K, 
%rax + salq $BASE_SHIFT + 2, %rax + leaq (B, %rax), BB + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, M + je .L30 + ALIGN_4 + +.L31: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + movddup 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movddup 4 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movapd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L35 + ALIGN_4 + +.L32: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm9, %xmm0 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 1 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movapd 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movapd 16 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movapd 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movddup 3 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movapd 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movapd 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movddup 8 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movapd 24 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movapd 18 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movddup 5 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movapd 20 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movapd 22 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movddup 6 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movapd 32 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movapd 26 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movddup 7 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movapd 28 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movapd 30 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movddup 12 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movapd 40 * SIZE(BO), %xmm11 + + addq $ 8 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 1 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movapd 4 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L36 + ALIGN_4 + +.L38: + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#endif + + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BO), %xmm2 + movapd 2 * SIZE(BO), %xmm3 + + subpd %xmm0, %xmm2 + subpd %xmm1, %xmm3 +#else + movapd 0 * SIZE(AO), %xmm2 + movapd 2 * SIZE(AO), %xmm3 + + subpd %xmm0, %xmm2 + subpd %xmm1, %xmm3 +#endif + +#ifdef LN + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 +#endif + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 +#endif + +#ifdef RN + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movapd %xmm3, %xmm1 + unpckhpd %xmm1, %xmm1 + + movsd 0 * SIZE(BO), %xmm4 + 
mulsd %xmm4, %xmm2 + + movsd 1 * SIZE(BO), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + movsd 2 * SIZE(BO), %xmm6 + mulsd %xmm2, %xmm6 + subsd %xmm6, %xmm3 + movsd 3 * SIZE(BO), %xmm7 + mulsd %xmm2, %xmm7 + subsd %xmm7, %xmm1 + + movsd 5 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 6 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm3 + movsd 7 * SIZE(BO), %xmm6 + mulsd %xmm0, %xmm6 + subsd %xmm6, %xmm1 + + movsd 10 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm3 + + movsd 11 * SIZE(BO), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm1 + + movsd 15 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm1 + + unpcklpd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm3 +#endif + +#ifdef RT + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movapd %xmm3, %xmm1 + unpckhpd %xmm1, %xmm1 + + movsd 15 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm1 + + movsd 14 * SIZE(BO), %xmm5 + mulsd %xmm1, %xmm5 + subsd %xmm5, %xmm3 + movsd 13 * SIZE(BO), %xmm6 + mulsd %xmm1, %xmm6 + subsd %xmm6, %xmm0 + movsd 12 * SIZE(BO), %xmm7 + mulsd %xmm1, %xmm7 + subsd %xmm7, %xmm2 + + movsd 10 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm3 + + movsd 9 * SIZE(BO), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm0 + movsd 8 * SIZE(BO), %xmm6 + mulsd %xmm3, %xmm6 + subsd %xmm6, %xmm2 + + movsd 5 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 4 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + + movsd 0 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm3 + +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO2) + movsd %xmm3, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) +#else + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO2) + movsd %xmm3, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(BO) + movapd %xmm3, 2 * SIZE(BO) +#else + movapd %xmm2, 0 * SIZE(AO) + movapd %xmm3, 2 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + testq $2, M + BRANCH + je .L20 + ALIGN_4 + +.L21: + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L25 + ALIGN_4 + +.L22: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + 
mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 16 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm2 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 6 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm2 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 16 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm3 + movddup 24 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movddup 17 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm1 + movddup 18 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm2 + movddup 19 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 10 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm3 + movddup 20 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movddup 21 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm1 + movddup 22 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm2 + movddup 23 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 12 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm3 + movddup 32 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 25 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm1 + movddup 26 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 27 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 14 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + movddup 28 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 29 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm1 + movddup 30 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 31 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 24 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + movddup 40 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L29 + ALIGN_4 + +.L26: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L26 + ALIGN_4 + +.L29: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd %xmm2, %xmm10 + unpcklpd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm10 + + movapd 0 * SIZE(BO), %xmm1 + movapd 2 * SIZE(BO), %xmm3 + movapd 4 * SIZE(BO), %xmm5 + movapd 6 * SIZE(BO), %xmm7 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm3 + subpd %xmm8, %xmm5 + subpd %xmm10, %xmm7 +#else + + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm10 + movapd 4 * SIZE(AO), %xmm12 + movapd 6 
* SIZE(AO), %xmm14 + + subpd %xmm0, %xmm8 + subpd %xmm1, %xmm10 + subpd %xmm2, %xmm12 + subpd %xmm3, %xmm14 +#endif + +#ifdef LN + movddup 3 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + mulpd %xmm0, %xmm7 + + movddup 2 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm1 + movddup 2 * SIZE(AO), %xmm2 + mulpd %xmm7, %xmm2 + subpd %xmm2, %xmm3 + + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm3 +#endif + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm3 + + movddup 1 * SIZE(AO), %xmm2 + mulpd %xmm1, %xmm2 + subpd %xmm2, %xmm5 + movddup 1 * SIZE(AO), %xmm2 + mulpd %xmm3, %xmm2 + subpd %xmm2, %xmm7 + + movddup 3 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + mulpd %xmm0, %xmm7 +#endif + +#ifdef RN + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm8 + + movddup 1 * SIZE(BO), %xmm1 + mulpd %xmm8, %xmm1 + subpd %xmm1, %xmm10 + movddup 2 * SIZE(BO), %xmm2 + mulpd %xmm8, %xmm2 + subpd %xmm2, %xmm12 + movddup 3 * SIZE(BO), %xmm3 + mulpd %xmm8, %xmm3 + subpd %xmm3, %xmm14 + + movddup 5 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm10 + movddup 6 * SIZE(BO), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm12 + movddup 7 * SIZE(BO), %xmm2 + mulpd %xmm10, %xmm2 + subpd %xmm2, %xmm14 + + movddup 10 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm12 + + movddup 11 * SIZE(BO), %xmm1 + mulpd %xmm12, %xmm1 + subpd %xmm1, %xmm14 + + movddup 15 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm14 +#endif + +#ifdef RT + movddup 15 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm14 + + movddup 14 * SIZE(BO), %xmm1 + mulpd %xmm14, %xmm1 + subpd %xmm1, %xmm12 + movddup 13 * SIZE(BO), %xmm2 + mulpd %xmm14, %xmm2 + subpd %xmm2, %xmm10 + movddup 12 * SIZE(BO), %xmm3 + mulpd %xmm14, %xmm3 + subpd %xmm3, %xmm8 + + movddup 10 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm12 + movddup 9 * SIZE(BO), %xmm1 + mulpd %xmm12, %xmm1 + subpd %xmm1, %xmm10 + movddup 8 * SIZE(BO), %xmm2 + mulpd %xmm12, %xmm2 + subpd %xmm2, %xmm8 + + movddup 5 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm10 + movddup 4 * SIZE(BO), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm8 + + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm8 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm1, 0 * SIZE(CO1) + movsd %xmm5, 1 * SIZE(CO1) + movhpd %xmm1, 0 * SIZE(CO2) + movhpd %xmm5, 1 * SIZE(CO2) + + movsd %xmm3, 0 * SIZE(CO1, LDC, 2) + movsd %xmm7, 1 * SIZE(CO1, LDC, 2) + movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 1 * SIZE(CO2, LDC, 2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) + + movsd %xmm12, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm12, 1 * SIZE(CO1, LDC, 2) + movsd %xmm14, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm14, 1 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(BO) + movapd %xmm3, 2 * SIZE(BO) + movapd %xmm5, 4 * SIZE(BO) + movapd %xmm7, 6 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm10, 2 * SIZE(AO) + movapd %xmm12, 4 * SIZE(AO) + movapd %xmm14, 6 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L20: + movq M, I + sarq $2, I # i = (m >> 2) + jle .L39 + ALIGN_4 + +.L11: + +#ifdef LN + movq K, %rax + salq $2 + 
BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + prefetcht0 0 * SIZE(BB) + subq $-8 * SIZE, BB + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + + movapd 16 * SIZE(AO), %xmm12 + movddup 16 * SIZE(BO), %xmm13 + movapd 24 * SIZE(AO), %xmm14 + movddup 24 * SIZE(BO), %xmm15 + +#ifdef LN + prefetchnta -4 * SIZE(CO1) + pxor %xmm4, %xmm4 + prefetchnta -4 * SIZE(CO2) + pxor %xmm5, %xmm5 + prefetchnta -4 * SIZE(CO1, LDC, 2) + pxor %xmm6, %xmm6 + prefetchnta -4 * SIZE(CO2, LDC, 2) + pxor %xmm7, %xmm7 +#else + prefetchnta 4 * SIZE(CO1) + pxor %xmm4, %xmm4 + prefetchnta 4 * SIZE(CO2) + pxor %xmm5, %xmm5 + prefetchnta 4 * SIZE(CO1, LDC, 2) + pxor %xmm6, %xmm6 + prefetchnta 4 * SIZE(CO2, LDC, 2) + pxor %xmm7, %xmm7 +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + +#if 1 + andq $-8, %rax + salq $4, %rax + je .L15 +.L1X: + KERNEL1 (16 * 0) + KERNEL2 (16 * 0) + KERNEL3 (16 * 0) + KERNEL4 (16 * 0) + KERNEL5 (16 * 0) + KERNEL6 (16 * 0) + KERNEL7 (16 * 0) + KERNEL8 (16 * 0) + KERNEL9 (16 * 0) + KERNEL10(16 * 0) + KERNEL11(16 * 0) + KERNEL12(16 * 0) + KERNEL13(16 * 0) + KERNEL14(16 * 0) + KERNEL15(16 * 0) + KERNEL16(16 * 0) + cmpq $128 * 1, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 1) + KERNEL2 (16 * 1) + KERNEL3 (16 * 1) + KERNEL4 (16 * 1) + KERNEL5 (16 * 1) + KERNEL6 (16 * 1) + KERNEL7 (16 * 1) + KERNEL8 (16 * 1) + KERNEL9 (16 * 1) + KERNEL10(16 * 1) + KERNEL11(16 * 1) + KERNEL12(16 * 1) + KERNEL13(16 * 1) + KERNEL14(16 * 1) + KERNEL15(16 * 1) + KERNEL16(16 * 1) + cmpq $128 * 2, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 2) + KERNEL2 (16 * 2) + KERNEL3 (16 * 2) + KERNEL4 (16 * 2) + KERNEL5 (16 * 2) + KERNEL6 (16 * 2) + KERNEL7 (16 * 2) + KERNEL8 (16 * 2) + KERNEL9 (16 * 2) + KERNEL10(16 * 2) + KERNEL11(16 * 2) + KERNEL12(16 * 2) + KERNEL13(16 * 2) + KERNEL14(16 * 2) + KERNEL15(16 * 2) + KERNEL16(16 * 2) + cmpq $128 * 3, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 3) + KERNEL2 (16 * 3) + KERNEL3 (16 * 3) + KERNEL4 (16 * 3) + KERNEL5 (16 * 3) + KERNEL6 (16 * 3) + KERNEL7 (16 * 3) + KERNEL8 (16 * 3) + KERNEL9 (16 * 3) + KERNEL10(16 * 3) + KERNEL11(16 * 3) + KERNEL12(16 * 3) + KERNEL13(16 * 3) + KERNEL14(16 * 3) + KERNEL15(16 * 3) + KERNEL16(16 * 3) + cmpq $128 * 4, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 4) + KERNEL2 (16 * 4) + KERNEL3 (16 * 4) + KERNEL4 (16 * 4) + KERNEL5 (16 * 4) + KERNEL6 (16 * 4) + KERNEL7 (16 * 4) + KERNEL8 (16 * 4) + KERNEL9 (16 * 4) + KERNEL10(16 * 4) + KERNEL11(16 * 4) + KERNEL12(16 * 4) + KERNEL13(16 * 4) + KERNEL14(16 * 4) + KERNEL15(16 * 4) + KERNEL16(16 * 4) + cmpq $128 * 5, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 5) + KERNEL2 (16 * 5) + KERNEL3 (16 * 5) + KERNEL4 (16 * 5) + KERNEL5 (16 * 5) + KERNEL6 (16 * 5) + KERNEL7 (16 * 5) + KERNEL8 (16 * 5) + KERNEL9 (16 * 5) + KERNEL10(16 * 5) + KERNEL11(16 * 5) + KERNEL12(16 * 5) + KERNEL13(16 * 5) + KERNEL14(16 * 5) + KERNEL15(16 * 5) + KERNEL16(16 * 5) + cmpq $128 * 6, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 6) + KERNEL2 (16 * 6) + KERNEL3 (16 * 6) + KERNEL4 (16 * 6) + KERNEL5 (16 * 6) + KERNEL6 (16 * 6) + KERNEL7 (16 * 6) + KERNEL8 (16 * 6) + KERNEL9 (16 * 6) + KERNEL10(16 * 6) + KERNEL11(16 * 6) + KERNEL12(16 * 6) + KERNEL13(16 * 6) + KERNEL14(16 * 6) + 
KERNEL15(16 * 6) + KERNEL16(16 * 6) + cmpq $128 * 7, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 7) + KERNEL2 (16 * 7) + KERNEL3 (16 * 7) + KERNEL4 (16 * 7) + KERNEL5 (16 * 7) + KERNEL6 (16 * 7) + KERNEL7 (16 * 7) + KERNEL8 (16 * 7) + KERNEL9 (16 * 7) + KERNEL10(16 * 7) + KERNEL11(16 * 7) + KERNEL12(16 * 7) + KERNEL13(16 * 7) + KERNEL14(16 * 7) + KERNEL15(16 * 7) + KERNEL16(16 * 7) + + addq $32 * 8 * SIZE, AO + addq $32 * 8 * SIZE, BO + subq $128 * 8, %rax + jg .L1X + +.L12: + leaq (AO, %rax, 2), AO # * 16 + leaq (BO, %rax, 2), BO # * 64 + +#else + sarq $3, %rax + je .L15 + ALIGN_4 + +.L12: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm5 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm6 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm7 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 6 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm5 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm6 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 32 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm7 + + movddup 32 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 10 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + + movddup 8 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm4 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm5 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm6 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 12 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm7 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 14 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm4 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm5 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm6 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 40 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm7 + movddup 40 * SIZE(BO), %xmm11 + + mulpd %xmm12, %xmm13 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + addpd %xmm13, %xmm0 + movddup 17 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm1 + movddup 18 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm2 + movddup 19 * SIZE(BO), %xmm13 + mulpd 
%xmm12, %xmm13 + movapd 18 * SIZE(AO), %xmm12 + addpd %xmm13, %xmm3 + + movddup 16 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm4 + movddup 17 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm5 + movddup 18 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm6 + movddup 19 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + movapd 20 * SIZE(AO), %xmm12 + addpd %xmm13, %xmm7 + + movddup 20 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm0 + movddup 21 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm1 + movddup 22 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm2 + movddup 23 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + movapd 22 * SIZE(AO), %xmm12 + addpd %xmm13, %xmm3 + + movddup 20 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm4 + movddup 21 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm5 + movddup 22 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm6 + movddup 23 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + movapd 48 * SIZE(AO), %xmm12 + addpd %xmm13, %xmm7 + movddup 48 * SIZE(BO), %xmm13 + + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm0 + movddup 25 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm1 + movddup 26 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm2 + movddup 27 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + movapd 26 * SIZE(AO), %xmm14 + addpd %xmm15, %xmm3 + + movddup 24 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm4 + movddup 25 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm5 + movddup 26 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm6 + movddup 27 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + movapd 28 * SIZE(AO), %xmm14 + addpd %xmm15, %xmm7 + + movddup 28 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm0 + movddup 29 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm1 + movddup 30 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm2 + movddup 31 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + movapd 30 * SIZE(AO), %xmm14 + addpd %xmm15, %xmm3 + + movddup 28 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm4 + movddup 29 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm5 + movddup 30 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm6 + movddup 31 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + movapd 56 * SIZE(AO), %xmm14 + addpd %xmm15, %xmm7 + movddup 56 * SIZE(BO), %xmm15 + + addq $32 * SIZE, BO + addq $32 * SIZE, AO + decq %rax + BRANCH + jne .L12 +#endif + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L19 + ALIGN_4 + +.L16: + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 0 * SIZE(BO), %xmm11 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm4 + movddup 1 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm5 + movddup 2 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm6 + movddup 3 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm7 + + addq $4 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L16 + ALIGN_4 + +.L19: + +#if defined(LN) || defined(RT) 
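+	/* LN/RT: recompute AO and BO from AORIG/B at offset (KK - 4) so both point at the current 4x4 block before the substitution below */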
+ movq KK, %rax + subq $4, %rax + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd %xmm2, %xmm10 + unpcklpd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm10 + + movapd %xmm4, %xmm12 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm12 + + movapd %xmm6, %xmm14 + unpcklpd %xmm7, %xmm6 + unpckhpd %xmm7, %xmm14 + + movapd 0 * SIZE(BO), %xmm1 + movapd 2 * SIZE(BO), %xmm3 + movapd 4 * SIZE(BO), %xmm5 + movapd 6 * SIZE(BO), %xmm7 + movapd 8 * SIZE(BO), %xmm9 + movapd 10 * SIZE(BO), %xmm11 + movapd 12 * SIZE(BO), %xmm13 + movapd 14 * SIZE(BO), %xmm15 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm3 + subpd %xmm8, %xmm5 + subpd %xmm10, %xmm7 + subpd %xmm4, %xmm9 + subpd %xmm6, %xmm11 + subpd %xmm12, %xmm13 + subpd %xmm14, %xmm15 +#else + + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm9 + movapd 4 * SIZE(AO), %xmm10 + movapd 6 * SIZE(AO), %xmm11 + + movapd 8 * SIZE(AO), %xmm12 + movapd 10 * SIZE(AO), %xmm13 + movapd 12 * SIZE(AO), %xmm14 + movapd 14 * SIZE(AO), %xmm15 + + subpd %xmm0, %xmm8 + subpd %xmm4, %xmm9 + subpd %xmm1, %xmm10 + subpd %xmm5, %xmm11 + subpd %xmm2, %xmm12 + subpd %xmm6, %xmm13 + subpd %xmm3, %xmm14 + subpd %xmm7, %xmm15 +#endif + + +#ifdef LN + movddup 15 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm13 + mulpd %xmm0, %xmm15 + + movddup 14 * SIZE(AO), %xmm2 + mulpd %xmm13, %xmm2 + subpd %xmm2, %xmm9 + movddup 14 * SIZE(AO), %xmm2 + mulpd %xmm15, %xmm2 + subpd %xmm2, %xmm11 + + movddup 13 * SIZE(AO), %xmm4 + mulpd %xmm13, %xmm4 + subpd %xmm4, %xmm5 + movddup 13 * SIZE(AO), %xmm4 + mulpd %xmm15, %xmm4 + subpd %xmm4, %xmm7 + + movddup 12 * SIZE(AO), %xmm6 + mulpd %xmm13, %xmm6 + subpd %xmm6, %xmm1 + movddup 12 * SIZE(AO), %xmm6 + mulpd %xmm15, %xmm6 + subpd %xmm6, %xmm3 + + movddup 10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm9 + mulpd %xmm0, %xmm11 + + movddup 9 * SIZE(AO), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm5 + movddup 9 * SIZE(AO), %xmm2 + mulpd %xmm11, %xmm2 + subpd %xmm2, %xmm7 + + movddup 8 * SIZE(AO), %xmm4 + mulpd %xmm9, %xmm4 + subpd %xmm4, %xmm1 + movddup 8 * SIZE(AO), %xmm4 + mulpd %xmm11, %xmm4 + subpd %xmm4, %xmm3 + + movddup 5 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + mulpd %xmm0, %xmm7 + + movddup 4 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm1 + movddup 4 * SIZE(AO), %xmm2 + mulpd %xmm7, %xmm2 + subpd %xmm2, %xmm3 + + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm3 +#endif + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm3 + + movddup 1 * SIZE(AO), %xmm2 + mulpd %xmm1, %xmm2 + subpd %xmm2, %xmm5 + movddup 1 * SIZE(AO), %xmm2 + mulpd %xmm3, %xmm2 + subpd %xmm2, %xmm7 + + movddup 2 * SIZE(AO), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm9 + movddup 2 * SIZE(AO), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm11 + + movddup 3 * SIZE(AO), %xmm6 + mulpd %xmm1, %xmm6 + subpd %xmm6, %xmm13 + movddup 3 * SIZE(AO), %xmm6 + mulpd %xmm3, %xmm6 + subpd %xmm6, %xmm15 + + movddup 5 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + mulpd %xmm0, %xmm7 + + movddup 6 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm9 + movddup 6 * SIZE(AO), %xmm2 + mulpd %xmm7, %xmm2 + subpd %xmm2, %xmm11 + + movddup 7 * SIZE(AO), %xmm4 + mulpd %xmm5, %xmm4 + subpd %xmm4, %xmm13 + movddup 7 * SIZE(AO), %xmm4 + mulpd %xmm7, %xmm4 + subpd %xmm4, %xmm15 + + movddup 10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm9 + mulpd %xmm0, %xmm11 + + movddup 11 * SIZE(AO), %xmm2 + mulpd %xmm9, %xmm2 
+ subpd %xmm2, %xmm13 + movddup 11 * SIZE(AO), %xmm2 + mulpd %xmm11, %xmm2 + subpd %xmm2, %xmm15 + + movddup 15 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm13 + mulpd %xmm0, %xmm15 +#endif + + +#ifdef RN + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 + + movddup 1 * SIZE(BO), %xmm1 + mulpd %xmm8, %xmm1 + subpd %xmm1, %xmm10 + movddup 1 * SIZE(BO), %xmm1 + mulpd %xmm9, %xmm1 + subpd %xmm1, %xmm11 + + movddup 2 * SIZE(BO), %xmm2 + mulpd %xmm8, %xmm2 + subpd %xmm2, %xmm12 + movddup 2 * SIZE(BO), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm13 + + movddup 3 * SIZE(BO), %xmm3 + mulpd %xmm8, %xmm3 + subpd %xmm3, %xmm14 + movddup 3 * SIZE(BO), %xmm3 + mulpd %xmm9, %xmm3 + subpd %xmm3, %xmm15 + + movddup 5 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 + + movddup 6 * SIZE(BO), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm12 + movddup 6 * SIZE(BO), %xmm1 + mulpd %xmm11, %xmm1 + subpd %xmm1, %xmm13 + + movddup 7 * SIZE(BO), %xmm2 + mulpd %xmm10, %xmm2 + subpd %xmm2, %xmm14 + movddup 7 * SIZE(BO), %xmm2 + mulpd %xmm11, %xmm2 + subpd %xmm2, %xmm15 + + movddup 10 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm12 + mulpd %xmm0, %xmm13 + + movddup 11 * SIZE(BO), %xmm1 + mulpd %xmm12, %xmm1 + subpd %xmm1, %xmm14 + movddup 11 * SIZE(BO), %xmm1 + mulpd %xmm13, %xmm1 + subpd %xmm1, %xmm15 + + movddup 15 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm14 + mulpd %xmm0, %xmm15 +#endif + +#ifdef RT + movddup 15 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm14 + mulpd %xmm0, %xmm15 + + movddup 14 * SIZE(BO), %xmm1 + mulpd %xmm14, %xmm1 + subpd %xmm1, %xmm12 + movddup 14 * SIZE(BO), %xmm1 + mulpd %xmm15, %xmm1 + subpd %xmm1, %xmm13 + + movddup 13 * SIZE(BO), %xmm2 + mulpd %xmm14, %xmm2 + subpd %xmm2, %xmm10 + movddup 13 * SIZE(BO), %xmm2 + mulpd %xmm15, %xmm2 + subpd %xmm2, %xmm11 + + movddup 12 * SIZE(BO), %xmm3 + mulpd %xmm14, %xmm3 + subpd %xmm3, %xmm8 + movddup 12 * SIZE(BO), %xmm3 + mulpd %xmm15, %xmm3 + subpd %xmm3, %xmm9 + + movddup 10 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm12 + mulpd %xmm0, %xmm13 + + movddup 9 * SIZE(BO), %xmm1 + mulpd %xmm12, %xmm1 + subpd %xmm1, %xmm10 + movddup 9 * SIZE(BO), %xmm1 + mulpd %xmm13, %xmm1 + subpd %xmm1, %xmm11 + + movddup 8 * SIZE(BO), %xmm2 + mulpd %xmm12, %xmm2 + subpd %xmm2, %xmm8 + movddup 8 * SIZE(BO), %xmm2 + mulpd %xmm13, %xmm2 + subpd %xmm2, %xmm9 + + movddup 5 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 + + movddup 4 * SIZE(BO), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm8 + movddup 4 * SIZE(BO), %xmm1 + mulpd %xmm11, %xmm1 + subpd %xmm1, %xmm9 + + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm1, 0 * SIZE(CO1) + movsd %xmm5, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movsd %xmm13, 3 * SIZE(CO1) + + movhpd %xmm1, 0 * SIZE(CO2) + movhpd %xmm5, 1 * SIZE(CO2) + movhpd %xmm9, 2 * SIZE(CO2) + movhpd %xmm13, 3 * SIZE(CO2) + + movsd %xmm3, 0 * SIZE(CO1, LDC, 2) + movsd %xmm7, 1 * SIZE(CO1, LDC, 2) + movsd %xmm11, 2 * SIZE(CO1, LDC, 2) + movsd %xmm15, 3 * SIZE(CO1, LDC, 2) + + movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 1 * SIZE(CO2, LDC, 2) + movhpd %xmm11, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 3 * SIZE(CO2, LDC, 2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) + + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) + movsd %xmm11, 2 * SIZE(CO2) + movhpd %xmm11, 3 * SIZE(CO2) + + movsd %xmm12, 0 * SIZE(CO1, 
LDC, 2) + movhpd %xmm12, 1 * SIZE(CO1, LDC, 2) + movsd %xmm13, 2 * SIZE(CO1, LDC, 2) + movhpd %xmm13, 3 * SIZE(CO1, LDC, 2) + + movsd %xmm14, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm14, 1 * SIZE(CO2, LDC, 2) + movsd %xmm15, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 3 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(BO) + movapd %xmm3, 2 * SIZE(BO) + movapd %xmm5, 4 * SIZE(BO) + movapd %xmm7, 6 * SIZE(BO) + movapd %xmm9, 8 * SIZE(BO) + movapd %xmm11, 10 * SIZE(BO) + movapd %xmm13, 12 * SIZE(BO) + movapd %xmm15, 14 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm9, 2 * SIZE(AO) + movapd %xmm10, 4 * SIZE(AO) + movapd %xmm11, 6 * SIZE(AO) + movapd %xmm12, 8 * SIZE(AO) + movapd %xmm13, 10 * SIZE(AO) + movapd %xmm14, 12 * SIZE(AO) + movapd %xmm15, 14 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L11 + ALIGN_4 + + +.L39: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + + decq J # j -- + jg .L10 + ALIGN_4 + +.L40: + testq $2, N + je .L80 + ALIGN_4 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + movq K, %rax + salq $BASE_SHIFT + 1, %rax + leaq (B, %rax), BB + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + + testq $1, M + je .L70 + ALIGN_4 + +.L71: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movddup 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movddup 4 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movapd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L75 + ALIGN_4 + +.L72: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movddup 1 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm0 + mulpd 2 * SIZE(BO), %xmm8 + movapd 16 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm1 + movddup 2 * SIZE(AO), %xmm8 + mulpd 4 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm2 + movddup 3 * SIZE(AO), %xmm8 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm3 + movddup 8 * SIZE(AO), %xmm8 + mulpd %xmm10, %xmm11 + movddup 5 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm0 + mulpd 10 * SIZE(BO), %xmm10 + movapd 24 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm1 + movddup 6 * SIZE(AO), %xmm10 + mulpd 12 * SIZE(BO), %xmm10 + addpd %xmm10, %xmm2 + movddup 7 * SIZE(AO), %xmm10 + mulpd 14 * SIZE(BO), %xmm10 + addpd %xmm10, %xmm3 + movddup 12 * SIZE(AO), %xmm10 + + addq $ 8 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq 
$7, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulpd %xmm8, %xmm9 + movddup 1 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm0 + movapd 2 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L76 + ALIGN_4 + +.L78: + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BO), %xmm2 + subpd %xmm0, %xmm2 +#else + movapd 0 * SIZE(AO), %xmm2 + subpd %xmm0, %xmm2 +#endif + +#ifdef LN + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef RN + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movsd 0 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm2 + + movsd 1 * SIZE(BO), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + + movsd 3 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm0 + + unpcklpd %xmm0, %xmm2 +#endif + +#ifdef RT + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movsd 3 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 2 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + + movsd 0 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm0, %xmm2 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO2) +#else + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(BO) +#else + movapd %xmm2, 0 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L70: + testq $2, M + je .L60 + ALIGN_4 + +.L61: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L65 + ALIGN_4 + +.L62: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 6 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 16 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 16 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 9 * SIZE(BO), 
%xmm11 + mulpd %xmm10, %xmm11 + movapd 10 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 12 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 14 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 24 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + movddup 24 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L62 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L69 + ALIGN_4 + +.L66: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L66 + ALIGN_4 + +.L69: + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd 0 * SIZE(BO), %xmm1 + movapd 2 * SIZE(BO), %xmm5 + + subpd %xmm0, %xmm1 + subpd %xmm8, %xmm5 +#else + + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm10 + + subpd %xmm0, %xmm8 + subpd %xmm1, %xmm10 +#endif + +#ifdef LN + movddup 3 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + movddup 2 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm1 + + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 +#endif + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + + movddup 1 * SIZE(AO), %xmm2 + mulpd %xmm1, %xmm2 + subpd %xmm2, %xmm5 + + movddup 3 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 +#endif + +#ifdef RN + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm8 + + movddup 1 * SIZE(BO), %xmm1 + mulpd %xmm8, %xmm1 + subpd %xmm1, %xmm10 + + movddup 3 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm10 +#endif + +#ifdef RT + movddup 3 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm10 + + movddup 2 * SIZE(BO), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm8 + + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm8 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm1, 0 * SIZE(CO1) + movsd %xmm5, 1 * SIZE(CO1) + movhpd %xmm1, 0 * SIZE(CO2) + movhpd %xmm5, 1 * SIZE(CO2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(BO) + movapd %xmm5, 2 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm10, 2 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L60: + movq M, I + sarq $2, I # i = (m >> 2) + jle .L79 + ALIGN_4 
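+	/* .L51: 4x2 micro-kernel — four rows of M against the two remaining columns handled by this N & 2 branch */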
+ +.L51: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + prefetcht0 0 * SIZE(BB) + subq $-4 * SIZE, BB + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm4, %xmm4 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm5, %xmm5 + +#ifdef LN + prefetchnta -4 * SIZE(CO1) + prefetchnta -4 * SIZE(CO2) +#else + prefetchnta 4 * SIZE(CO1) + prefetchnta 4 * SIZE(CO2) +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L55 + ALIGN_4 + +.L52: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm5 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 6 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 16 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm5 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 10 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm4 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 12 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm5 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 14 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm4 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 40 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm5 + movddup 16 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm11 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + addpd %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 18 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movddup 8 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm4 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 20 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm5 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 22 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm4 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 24 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm5 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 26 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm4 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 28 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm5 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 30 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 
+ mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm4 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 32 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm5 + movddup 24 * SIZE(BO), %xmm11 + + addq $32 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L59 + ALIGN_4 + +.L56: + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 0 * SIZE(BO), %xmm11 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm4 + movddup 1 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm5 + + addq $4 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L56 + ALIGN_4 + +.L59: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $2, %rax +#endif + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd %xmm4, %xmm12 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm12 + + movapd 0 * SIZE(BO), %xmm1 + movapd 2 * SIZE(BO), %xmm5 + movapd 4 * SIZE(BO), %xmm9 + movapd 6 * SIZE(BO), %xmm13 + + subpd %xmm0, %xmm1 + subpd %xmm8, %xmm5 + subpd %xmm4, %xmm9 + subpd %xmm12, %xmm13 +#else + + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm9 + movapd 4 * SIZE(AO), %xmm10 + movapd 6 * SIZE(AO), %xmm11 + + subpd %xmm0, %xmm8 + subpd %xmm4, %xmm9 + subpd %xmm1, %xmm10 + subpd %xmm5, %xmm11 +#endif + + +#ifdef LN + movddup 15 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm13 + + movddup 14 * SIZE(AO), %xmm2 + mulpd %xmm13, %xmm2 + subpd %xmm2, %xmm9 + movddup 13 * SIZE(AO), %xmm4 + mulpd %xmm13, %xmm4 + subpd %xmm4, %xmm5 + movddup 12 * SIZE(AO), %xmm6 + mulpd %xmm13, %xmm6 + subpd %xmm6, %xmm1 + + movddup 10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm9 + movddup 9 * SIZE(AO), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm5 + movddup 8 * SIZE(AO), %xmm4 + mulpd %xmm9, %xmm4 + subpd %xmm4, %xmm1 + + movddup 5 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + movddup 4 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm1 + + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 +#endif + + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + + movddup 1 * SIZE(AO), %xmm2 + mulpd %xmm1, %xmm2 + subpd %xmm2, %xmm5 + movddup 2 * SIZE(AO), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm9 + movddup 3 * SIZE(AO), %xmm6 + mulpd %xmm1, %xmm6 + subpd %xmm6, %xmm13 + + movddup 5 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + + movddup 6 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm9 + movddup 7 * SIZE(AO), %xmm4 + mulpd %xmm5, %xmm4 + subpd %xmm4, %xmm13 + + movddup 10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm9 + + movddup 11 * SIZE(AO), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm13 + + movddup 15 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm13 +#endif + +#ifdef RN + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 + + movddup 1 * SIZE(BO), %xmm1 + mulpd %xmm8, %xmm1 + subpd %xmm1, %xmm10 + movddup 1 * SIZE(BO), %xmm1 + mulpd %xmm9, %xmm1 + subpd %xmm1, %xmm11 + + movddup 3 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 +#endif + +#ifdef RT + movddup 3 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 + + movddup 2 * SIZE(BO), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm8 + 
movddup 2 * SIZE(BO), %xmm1 + mulpd %xmm11, %xmm1 + subpd %xmm1, %xmm9 + + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm1, 0 * SIZE(CO1) + movsd %xmm5, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movsd %xmm13, 3 * SIZE(CO1) + + movhpd %xmm1, 0 * SIZE(CO2) + movhpd %xmm5, 1 * SIZE(CO2) + movhpd %xmm9, 2 * SIZE(CO2) + movhpd %xmm13, 3 * SIZE(CO2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) + + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) + movsd %xmm11, 2 * SIZE(CO2) + movhpd %xmm11, 3 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(BO) + movapd %xmm5, 2 * SIZE(BO) + movapd %xmm9, 4 * SIZE(BO) + movapd %xmm13, 6 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm9, 2 * SIZE(AO) + movapd %xmm10, 4 * SIZE(AO) + movapd %xmm11, 6 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L51 + ALIGN_4 + + +.L79: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L80: + testq $1, N + je .L999 + ALIGN_4 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + + movq C, CO1 +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, M + je .L110 + ALIGN_4 + +.L111: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movsd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movsd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movsd 4 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movsd 4 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L115 + ALIGN_4 + +.L112: + mulpd %xmm9, %xmm8 + movapd 2 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm0 + mulpd 2 * SIZE(BO), %xmm9 + movapd 8 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm1 + movapd 8 * SIZE(AO), %xmm9 + mulpd %xmm11, %xmm10 + movapd 6 * SIZE(AO), %xmm11 + addpd %xmm10, %xmm0 + mulpd 6 * SIZE(BO), %xmm11 + movapd 12 * SIZE(BO), %xmm10 + addpd %xmm11, %xmm1 + movapd 12 * SIZE(AO), %xmm11 + + addq $8 * SIZE, AO + addq $8 * SIZE, BO + decq %rax + jne .L112 + ALIGN_4 + +.L115: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulsd 0 * SIZE(BO), %xmm9 + addsd %xmm9, %xmm0 + movsd 1 * SIZE(AO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $1 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L116 + ALIGN_4 + +.L118: + addpd %xmm1, %xmm0 + haddpd %xmm0, %xmm0 + +#if 
defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BO), %xmm2 + subsd %xmm0, %xmm2 +#else + movsd 0 * SIZE(AO), %xmm2 + subsd %xmm0, %xmm2 +#endif + +#ifdef LN + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 +#endif + +#ifdef LT + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 +#endif + +#ifdef RN + movsd 0 * SIZE(BO), %xmm0 + mulsd %xmm0, %xmm2 +#endif + +#ifdef RT + movsd 0 * SIZE(BO), %xmm0 + mulsd %xmm0, %xmm2 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) +#else + movsd %xmm2, 0 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(BO) +#else + movsd %xmm2, 0 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L110: + testq $2, M + je .L100 + ALIGN_4 + +.L101: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 4 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L105 + ALIGN_4 + +.L102: + mulpd %xmm9, %xmm8 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movddup 1 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm0 + mulpd 2 * SIZE(AO), %xmm9 + movapd 16 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd 4 * SIZE(AO), %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd 6 * SIZE(AO), %xmm9 + addpd %xmm9, %xmm3 + movddup 8 * SIZE(BO), %xmm9 + mulpd %xmm11, %xmm10 + movddup 5 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm0 + mulpd 10 * SIZE(AO), %xmm11 + movapd 24 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movddup 6 * SIZE(BO), %xmm11 + mulpd 12 * SIZE(AO), %xmm11 + addpd %xmm11, %xmm2 + movddup 7 * SIZE(BO), %xmm11 + mulpd 14 * SIZE(AO), %xmm11 + addpd %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $ 8 * SIZE, BO + decq %rax + jne .L102 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L109 + ALIGN_4 + +.L106: + mulpd %xmm9, %xmm8 + movddup 1 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm0 + movapd 2 * SIZE(AO), %xmm8 + + addq $2 * SIZE, AO # aoffset += 4 + addq $1 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L106 + ALIGN_4 + +.L109: + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BO), %xmm2 + subpd %xmm0, %xmm2 +#else + movapd 0 * SIZE(AO), %xmm2 + subpd %xmm0, %xmm2 +#endif + +#ifdef LN + movapd 
%xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movsd 3 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 2 * SIZE(AO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm0, %xmm2 +#endif + +#ifdef LT + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 + + movsd 1 * SIZE(AO), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + + movsd 3 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm0 + + unpcklpd %xmm0, %xmm2 +#endif + +#ifdef RN + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef RT + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 1 * SIZE(CO1) +#else + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 1 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(BO) +#else + movapd %xmm2, 0 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L100: + movq M, I + sarq $2, I # i = (m >> 2) + jle .L119 + ALIGN_4 + +.L91: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 4 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#ifdef LN + prefetchnta -4 * SIZE(CO1) +#else + prefetchnta 4 * SIZE(CO1) +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L95 + ALIGN_4 + +.L92: + mulpd %xmm9, %xmm8 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd 2 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm0 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm8 + mulpd 6 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm2 + movapd 16 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm10 + mulpd 10 * SIZE(AO), %xmm9 + addpd %xmm10, %xmm0 + movapd 12 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm10 + mulpd 14 * SIZE(AO), %xmm9 + addpd %xmm10, %xmm2 + movapd 24 * SIZE(AO), %xmm10 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + addpd %xmm9, %xmm3 + movddup 8 * SIZE(BO), %xmm9 + mulpd %xmm11, %xmm8 + mulpd 18 * SIZE(AO), %xmm11 + addpd %xmm8, %xmm0 + movapd 20 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movddup 5 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm8 + mulpd 22 * SIZE(AO), %xmm11 + addpd %xmm8, %xmm2 + movapd 32 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm3 + movddup 6 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm10 + mulpd 26 * SIZE(AO), %xmm11 + addpd %xmm10, %xmm0 + movapd 28 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movddup 7 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm10 + mulpd 30 * SIZE(AO), %xmm11 + addpd %xmm10, %xmm2 + movapd 40 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + + addq $32 * SIZE, AO + addq $8 * SIZE, BO + decq %rax + jne .L92 + ALIGN_4 + +.L95: +#if defined(LT) || defined(RN) + movq KK, %rax 
+#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L99 + ALIGN_4 + +.L96: + mulpd %xmm9, %xmm8 + mulpd 2 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm0 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 1 * SIZE(BO), %xmm9 + + addq $4 * SIZE, AO # aoffset += 4 + addq $1 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L96 + ALIGN_4 + +.L99: + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $1, %rax +#endif + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BO), %xmm2 + movapd 2 * SIZE(BO), %xmm3 + + subpd %xmm0, %xmm2 + subpd %xmm1, %xmm3 +#else + movapd 0 * SIZE(AO), %xmm2 + movapd 2 * SIZE(AO), %xmm3 + + subpd %xmm0, %xmm2 + subpd %xmm1, %xmm3 +#endif + +#ifdef LN + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movapd %xmm3, %xmm1 + unpckhpd %xmm1, %xmm1 + + movsd 15 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm1 + + movsd 14 * SIZE(AO), %xmm5 + mulsd %xmm1, %xmm5 + subsd %xmm5, %xmm3 + movsd 13 * SIZE(AO), %xmm6 + mulsd %xmm1, %xmm6 + subsd %xmm6, %xmm0 + movsd 12 * SIZE(AO), %xmm7 + mulsd %xmm1, %xmm7 + subsd %xmm7, %xmm2 + + movsd 10 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm3 + + movsd 9 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm0 + movsd 8 * SIZE(AO), %xmm6 + mulsd %xmm3, %xmm6 + subsd %xmm6, %xmm2 + + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 4 * SIZE(AO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm3 +#endif + +#ifdef LT + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movapd %xmm3, %xmm1 + unpckhpd %xmm1, %xmm1 + + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 + + movsd 1 * SIZE(AO), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + movsd 2 * SIZE(AO), %xmm6 + mulsd %xmm2, %xmm6 + subsd %xmm6, %xmm3 + movsd 3 * SIZE(AO), %xmm7 + mulsd %xmm2, %xmm7 + subsd %xmm7, %xmm1 + + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm3 + movsd 7 * SIZE(AO), %xmm6 + mulsd %xmm0, %xmm6 + subsd %xmm6, %xmm1 + + movsd 10 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm3 + + movsd 11 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm1 + + movsd 15 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm1 + + unpcklpd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm3 +#endif + +#ifdef RN + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 +#endif + +#ifdef RT + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 1 * SIZE(CO1) + movsd %xmm3, 2 * SIZE(CO1) + movhpd %xmm3, 3 * SIZE(CO1) +#else + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 1 * SIZE(CO1) + movsd %xmm3, 2 * SIZE(CO1) + movhpd %xmm3, 3 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(BO) + movapd %xmm3, 2 * SIZE(BO) +#else + movapd %xmm2, 0 * SIZE(AO) + movapd %xmm3, 2 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L91 + ALIGN_4 + 
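+/* .L119: wrap-up of the single-column (N & 1) pass. B and KK are updated for
+   the selected LN/LT/RN/RT variant, then .L999 restores the callee-saved
+   registers (and the Windows ABI xmm/rdi/rsi saves) and returns. */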
+.L119: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 1), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_2 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/trsm_kernel_LN_4x8_nehalem.S b/kernel/x86_64/trsm_kernel_LN_4x8_nehalem.S new file mode 100644 index 0000000000..28c2ca0514 --- /dev/null +++ b/kernel/x86_64/trsm_kernel_LN_4x8_nehalem.S @@ -0,0 +1,4847 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define KK %rdx +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define J 56(%rsp) +#define KKK 64(%rsp) +#define AORIG 72(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define J 232(%rsp) +#define KKK 240(%rsp) +#define AORIG 248(%rsp) + +#endif + +#define PREFETCHSIZE (16 * 1 + 4) +#define PREFETCH prefetcht0 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C +#endif + + subq $-32 * SIZE, A + subq $-32 * SIZE, B + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + leaq (, LDC, SIZE), LDC + + movq KK, OFFSET + negq KK + +#ifdef LN + leaq (, M, SIZE), %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + leaq (, N, SIZE), %rax + imulq K, %rax + addq %rax, B + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + movq N, J + sarq $3, J + NOBRANCH + jle .L40 + ALIGN_4 + +.L10: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $3 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 8), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 4), CO2 +#ifndef RT + leaq (C, LDC, 8), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq K, %rax + salq $BASE_SHIFT + 3, %rax + leaq (B, %rax), BB + + testq $1, M + BRANCH + jle .L20 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 8), BO +#else + movq B, BO +#endif + + xorps %xmm2, %xmm2 + movsd -32 * SIZE(AO), %xmm0 + xorps %xmm3, %xmm3 + xorps %xmm8, %xmm8 + xorps %xmm12, %xmm12 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L35 + ALIGN_3 + +.L32: + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movaps -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm3, %xmm12 + movaps -28 * SIZE(BO), %xmm3 + mulps %xmm1, %xmm3 + + pshufd $0x55, %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm2, %xmm8 + movaps -24 * SIZE(BO), %xmm2 + mulps %xmm1, 
%xmm2 + addps %xmm3, %xmm12 + movaps -20 * SIZE(BO), %xmm3 + mulps %xmm1, %xmm3 + + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movaps -16 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm3, %xmm12 + movaps -12 * SIZE(BO), %xmm3 + mulps %xmm1, %xmm3 + + pshufd $0x55, %xmm0, %xmm1 + movsd -28 * SIZE(AO), %xmm0 + addps %xmm2, %xmm8 + movaps -8 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm3, %xmm12 + movaps -4 * SIZE(BO), %xmm3 + mulps %xmm1, %xmm3 + + subq $-32 * SIZE, BO + subq $ -4 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L32 + ALIGN_3 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_3 + +.L36: + pshufd $0x00, %xmm0, %xmm1 + movss -31 * SIZE(AO), %xmm0 + addps %xmm2, %xmm8 + movaps -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm3, %xmm12 + movaps -28 * SIZE(BO), %xmm3 + mulps %xmm1, %xmm3 + + addq $1 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L36 + ALIGN_3 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $8, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 8), BO +#endif + + addps %xmm2, %xmm8 + addps %xmm3, %xmm12 + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(BO), %xmm0 + movaps -28 * SIZE(BO), %xmm4 + + subps %xmm8, %xmm0 + subps %xmm12, %xmm4 +#else + movsd -32 * SIZE(AO), %xmm0 + movhps -30 * SIZE(AO), %xmm0 + movsd -28 * SIZE(AO), %xmm4 + movhps -26 * SIZE(AO), %xmm4 + + subps %xmm8, %xmm0 + subps %xmm12, %xmm4 + + pshufd $0xff, %xmm0, %xmm3 + pshufd $0xaa, %xmm0, %xmm2 + pshufd $0x55, %xmm0, %xmm1 + pshufd $0x00, %xmm0, %xmm0 + + pshufd $0xff, %xmm4, %xmm7 + pshufd $0xaa, %xmm4, %xmm6 + pshufd $0x55, %xmm4, %xmm5 + pshufd $0x00, %xmm4, %xmm4 +#endif + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm4 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm3 + + movaps -28 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm7 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm3 + + movaps -20 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm7 + + movaps -16 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm3 + + movaps -12 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, 
%xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm7 + + movaps -8 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm3 + + movaps -4 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm7 + + movaps 4 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm4, %xmm15 + subss %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm4, %xmm15 + subss %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm4, %xmm15 + subss %xmm15, %xmm7 + + movaps 12 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm5, %xmm15 + subss %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm5, %xmm15 + subss %xmm15, %xmm7 + + movaps 20 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm6, %xmm15 + subss %xmm15, %xmm7 + + movaps 28 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm7 +#endif + +#ifdef RT + movaps 28 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm7 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm7, %xmm15 + subss %xmm15, %xmm6 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm7, %xmm15 + subss %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm7, %xmm15 + subss %xmm15, %xmm4 + + movaps 24 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm7, %xmm15 + subss %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm7, %xmm15 + subss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm7, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm7, %xmm15 + subss %xmm15, %xmm0 + + movaps 20 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm15, %xmm6 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm6, %xmm15 + subss %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm6, %xmm15 + subss %xmm15, %xmm4 + + movaps 16 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm6, %xmm15 + subss %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm6, %xmm15 + subss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm6, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm6, %xmm15 + subss %xmm15, %xmm0 + + movaps 12 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm5, %xmm15 + subss %xmm15, %xmm4 + + movaps 8 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm5, %xmm15 + subss %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm5, %xmm15 + subss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm5, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm5, %xmm15 + subss %xmm15, %xmm0 + + movaps 4 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm4 + + movaps 0 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm4, %xmm15 + subss %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm4, %xmm15 + subss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm4, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm4, %xmm15 + subss %xmm15, %xmm0 + + movaps -8 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss 
%xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm0 + + movaps -16 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm0 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm0 + + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + leaq (LDC, LDC, 2), %rax + +#if defined(LN) || defined(LT) + movaps %xmm0, -32 * SIZE(BO) + movaps %xmm4, -28 * SIZE(BO) + + pshufd $0xff, %xmm0, %xmm3 + pshufd $0xaa, %xmm0, %xmm2 + pshufd $0x55, %xmm0, %xmm1 + pshufd $0x00, %xmm0, %xmm0 + + pshufd $0xff, %xmm4, %xmm7 + pshufd $0xaa, %xmm4, %xmm6 + pshufd $0x55, %xmm4, %xmm5 + pshufd $0x00, %xmm4, %xmm4 +#else + unpcklps %xmm1, %xmm0 + unpcklps %xmm3, %xmm2 + unpcklps %xmm5, %xmm4 + unpcklps %xmm7, %xmm6 + + movlps %xmm0, -32 * SIZE(AO) + movlps %xmm2, -30 * SIZE(AO) + movlps %xmm4, -28 * SIZE(AO) + movlps %xmm6, -26 * SIZE(AO) +#endif + + movss %xmm0, (CO1) + movss %xmm1, (CO1, LDC, 1) + movss %xmm2, (CO1, LDC, 2) + movss %xmm3, (CO1, %rax, 1) + + movss %xmm4, (CO2) + movss %xmm5, (CO2, LDC, 1) + movss %xmm6, (CO2, LDC, 2) + movss %xmm7, (CO2, %rax, 1) + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L20: + testq $2, M + BRANCH + jle .L30 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 8), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movaps -32 * SIZE(BO), %xmm5 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_3 + +.L22: + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + + addps %xmm3, %xmm10 + pshufd $0x50, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm5, %xmm4 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(BO), %xmm5 + + movddup -30 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(BO), %xmm5 + + addps %xmm3, %xmm10 + pshufd $0x50, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm5, %xmm4 + mulps %xmm0, %xmm4 + movaps -16 * SIZE(BO), %xmm5 + + movddup -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd 
$0xfa, %xmm5, %xmm2 + mulps %xmm0, %xmm2 + movaps -12 * SIZE(BO), %xmm5 + + addps %xmm3, %xmm10 + pshufd $0x50, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm5, %xmm4 + mulps %xmm0, %xmm4 + movaps -8 * SIZE(BO), %xmm5 + + movddup -26 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + mulps %xmm0, %xmm2 + movaps -4 * SIZE(BO), %xmm5 + + addps %xmm3, %xmm10 + pshufd $0x50, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm5, %xmm4 + mulps %xmm0, %xmm4 + movaps 0 * SIZE(BO), %xmm5 + + movddup -24 * SIZE(AO), %xmm0 + + subq $-32 * SIZE, BO + subq $ -8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L22 + ALIGN_3 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_3 + +.L26: + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + + addps %xmm3, %xmm10 + pshufd $0x50, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm5, %xmm4 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(BO), %xmm5 + + movddup -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_3 + +.L28: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $8, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 8), BO +#endif + + addps %xmm1, %xmm8 + addps %xmm2, %xmm9 + addps %xmm3, %xmm10 + addps %xmm4, %xmm11 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm4 + shufps $0x88, %xmm9, %xmm8 + shufps $0xdd, %xmm9, %xmm4 + + movaps %xmm10, %xmm5 + shufps $0x88, %xmm11, %xmm10 + shufps $0xdd, %xmm11, %xmm5 + + movaps -32 * SIZE(BO), %xmm0 + movaps -28 * SIZE(BO), %xmm2 + movaps -24 * SIZE(BO), %xmm1 + movaps -20 * SIZE(BO), %xmm3 + + subps %xmm8, %xmm0 + subps %xmm4, %xmm1 + subps %xmm10, %xmm2 + subps %xmm5, %xmm3 +#else + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm2 + movaps -24 * SIZE(AO), %xmm4 + movaps -20 * SIZE(AO), %xmm6 + + subps %xmm8, %xmm0 + subps %xmm9, %xmm2 + subps %xmm10, %xmm4 + subps %xmm11, %xmm6 + + movhlps %xmm0, %xmm1 + movhlps %xmm2, %xmm3 + movhlps %xmm4, %xmm5 + movhlps %xmm6, %xmm7 +#endif + +#ifdef LN + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + mulps %xmm15, %xmm3 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm2 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm2 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm2 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm3 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + mulps %xmm15, %xmm3 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm3 + + movaps -28 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps 
%xmm0, %xmm15 + subps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm7 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm3 + + movaps -20 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm7 + + movaps -16 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm3 + + movaps -12 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm7 + + movaps -8 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + + movaps -4 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm7 + + movaps 4 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm7 + + movaps 12 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm7 + + movaps 20 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm7 + + movaps 28 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm7 +#endif + +#ifdef RT + movaps 28 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm7 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm6 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm4 + + movaps 24 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm0 + + movaps 20 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm6 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm4 + + movaps 16 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + 
mulps %xmm6, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm0 + + movaps 12 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm4 + + movaps 8 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm0 + + movaps 4 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm4 + + movaps 0 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm0 + + movaps -8 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm0 + + movaps -16 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm0 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + + leaq (LDC, LDC, 2), %rax + +#if defined(LN) || defined(LT) + movaps %xmm0, -32 * SIZE(BO) + movaps %xmm2, -28 * SIZE(BO) + movaps %xmm1, -24 * SIZE(BO) + movaps %xmm3, -20 * SIZE(BO) + + movaps %xmm0, %xmm4 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm4 + + movaps %xmm2, %xmm5 + unpcklps %xmm3, %xmm2 + unpckhps %xmm3, %xmm5 + + movsd %xmm0, (CO1) + movhps %xmm0, (CO1, LDC, 1) + movsd %xmm4, (CO1, LDC, 2) + movhps %xmm4, (CO1, %rax, 1) + + movsd %xmm2, (CO2) + movhps %xmm2, (CO2, LDC, 1) + movsd %xmm5, (CO2, LDC, 2) + movhps %xmm5, (CO2, %rax, 1) +#else + movlhps %xmm1, %xmm0 + movlhps %xmm3, %xmm2 + movlhps %xmm5, %xmm4 + movlhps %xmm7, %xmm6 + + movaps %xmm0, -32 * SIZE(AO) + movaps %xmm2, -28 * SIZE(AO) + movaps %xmm4, -24 * SIZE(AO) + movaps %xmm6, -20 * SIZE(AO) + + movsd %xmm0, (CO1) + movsd %xmm1, (CO1, LDC, 1) + movsd %xmm2, (CO1, LDC, 2) + movsd %xmm3, (CO1, %rax, 1) + + movsd %xmm4, (CO2) + movsd %xmm5, (CO2, LDC, 1) + movsd %xmm6, (CO2, LDC, 2) + movsd %xmm7, (CO2, %rax, 1) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + movq M, I + sarq $2, I + NOBRANCH + jle .L39 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, 
%rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 8), BO +#else + movq B, BO +#endif + + prefetchnta -32 * SIZE(BB) + subq $-16 * SIZE, BB + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + leaq (LDC, LDC, 2), %rax + + xorps %xmm8, %xmm8 + prefetcht2 -4 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht2 -4 * SIZE(CO1, LDC, 1) + xorps %xmm10, %xmm10 + prefetcht2 -4 * SIZE(CO1, LDC, 2) + xorps %xmm11, %xmm11 + prefetcht2 -4 * SIZE(CO1, %rax, 1) + + xorps %xmm12, %xmm12 + prefetcht2 -4 * SIZE(CO2) + xorps %xmm13, %xmm13 + prefetcht2 -4 * SIZE(CO2, LDC, 1) + xorps %xmm14, %xmm14 + prefetcht2 -4 * SIZE(CO2, LDC, 2) + xorps %xmm15, %xmm15 + prefetcht2 -4 * SIZE(CO2, %rax, 1) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm1, %xmm12 + movaps -32 * SIZE(BO), %xmm1 + addps %xmm2, %xmm13 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + pshufd $0x39, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm12 + movaps -24 * SIZE(BO), %xmm1 + addps %xmm2, %xmm13 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + pshufd $0x39, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + addps %xmm1, %xmm8 + movaps -20 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -24 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm12 + movaps -16 * SIZE(BO), %xmm1 + addps %xmm2, %xmm13 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + pshufd $0x39, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + addps %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -20 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm12 + movaps -8 * SIZE(BO), %xmm1 + addps %xmm2, %xmm13 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + pshufd $0x39, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + addps %xmm1, %xmm8 + movaps -4 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + subq $-32 * SIZE, BO + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, AO + subq $1, %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: 
+#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + addps %xmm1, %xmm12 + movaps -32 * SIZE(BO), %xmm1 + addps %xmm2, %xmm13 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + pshufd $0x39, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_3 + +.L18: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $8, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 8), BO +#endif + + addps %xmm1, %xmm12 + addps %xmm2, %xmm13 + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm4 + shufps $0x88, %xmm9, %xmm8 + movaps %xmm10, %xmm5 + shufps $0x88, %xmm11, %xmm10 + shufps $0xdd, %xmm11, %xmm4 + shufps $0xdd, %xmm9, %xmm5 + + movaps %xmm8, %xmm6 + shufps $0x88, %xmm10, %xmm8 + shufps $0xdd, %xmm6, %xmm10 + + movaps %xmm4, %xmm9 + movaps %xmm5, %xmm11 + shufps $0x22, %xmm5, %xmm9 + shufps $0x77, %xmm4, %xmm11 + + movaps %xmm12, %xmm4 + shufps $0x88, %xmm13, %xmm12 + movaps %xmm14, %xmm5 + shufps $0x88, %xmm15, %xmm14 + shufps $0xdd, %xmm15, %xmm4 + shufps $0xdd, %xmm13, %xmm5 + + movaps %xmm12, %xmm6 + shufps $0x88, %xmm14, %xmm12 + shufps $0xdd, %xmm6, %xmm14 + + movaps %xmm4, %xmm13 + movaps %xmm5, %xmm15 + shufps $0x22, %xmm5, %xmm13 + shufps $0x77, %xmm4, %xmm15 + + movaps -32 * SIZE(BO), %xmm0 + movaps -28 * SIZE(BO), %xmm4 + movaps -24 * SIZE(BO), %xmm1 + movaps -20 * SIZE(BO), %xmm5 + movaps -16 * SIZE(BO), %xmm2 + movaps -12 * SIZE(BO), %xmm6 + movaps -8 * SIZE(BO), %xmm3 + movaps -4 * SIZE(BO), %xmm7 + +#else + movaps %xmm9, %xmm4 + shufps $0xd8, %xmm8, %xmm9 + shufps $0xd8, %xmm11, %xmm8 + shufps $0xd8, %xmm10, %xmm11 + shufps $0xd8, %xmm4, %xmm10 + + movaps %xmm8, %xmm4 + shufps $0xd8, %xmm10, %xmm8 + shufps $0xd8, %xmm4, %xmm10 + movaps %xmm9, %xmm5 + shufps $0xd8, %xmm11, %xmm9 + shufps $0xd8, %xmm5, %xmm11 + + movaps %xmm13, %xmm4 + shufps $0xd8, %xmm12, %xmm13 + shufps $0xd8, %xmm15, %xmm12 + shufps $0xd8, %xmm14, %xmm15 + shufps $0xd8, %xmm4, %xmm14 + + movaps %xmm12, %xmm4 + shufps $0xd8, %xmm14, %xmm12 + shufps $0xd8, %xmm4, %xmm14 + movaps %xmm13, %xmm5 + shufps $0xd8, %xmm15, %xmm13 + shufps $0xd8, %xmm5, %xmm15 + + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm1 + movaps -24 * SIZE(AO), %xmm2 + movaps -20 * SIZE(AO), %xmm3 + movaps -16 * SIZE(AO), %xmm4 + movaps -12 * SIZE(AO), %xmm5 + movaps -8 * SIZE(AO), %xmm6 + movaps -4 * SIZE(AO), %xmm7 +#endif + + subps %xmm8, %xmm0 + subps %xmm9, %xmm1 + subps %xmm10, %xmm2 + subps %xmm11, %xmm3 + subps %xmm12, %xmm4 + subps %xmm13, %xmm5 + subps %xmm14, %xmm6 + subps %xmm15, %xmm7 + +#ifdef LN + movaps -20 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + mulps %xmm15, %xmm7 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm6 + pshufd $0x55, %xmm8, %xmm15 + mulps 
%xmm3, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm0 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm4 + + movaps -24 * SIZE(AO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + mulps %xmm15, %xmm6 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm0 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm4 + + movaps -28 * SIZE(AO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + mulps %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm4 + + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm4 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm7 + + movaps -28 * SIZE(AO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + mulps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm7 + + movaps -24 * SIZE(AO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + mulps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm7 + + movaps -20 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + mulps %xmm15, %xmm7 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm3 + + movaps -28 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm7 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm3 + + movaps -20 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + 
mulps %xmm1, %xmm15 + subps %xmm15, %xmm7 + + movaps -16 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm3 + + movaps -12 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm7 + + movaps -8 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + + movaps -4 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm7 + + movaps 4 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm7 + + movaps 12 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm7 + + movaps 20 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm7 + + movaps 28 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm7 +#endif + +#ifdef RT + movaps 28 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm7 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm6 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm4 + + movaps 24 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm0 + + movaps 20 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm6 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm4 + + movaps 16 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm0 + + movaps 12 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm4 + + movaps 8 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm0 + + movaps 4 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm4 + + movaps 0 * 
SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm0 + + movaps -8 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm0 + + movaps -16 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm0 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, -32 * SIZE(BO) + movaps %xmm4, -28 * SIZE(BO) + movaps %xmm1, -24 * SIZE(BO) + movaps %xmm5, -20 * SIZE(BO) + movaps %xmm2, -16 * SIZE(BO) + movaps %xmm6, -12 * SIZE(BO) + movaps %xmm3, -8 * SIZE(BO) + movaps %xmm7, -4 * SIZE(BO) + + movaps %xmm0, %xmm8 + shufps $0x88, %xmm1, %xmm0 + shufps $0xdd, %xmm8, %xmm1 + + movaps %xmm2, %xmm9 + shufps $0x88, %xmm3, %xmm2 + shufps $0xdd, %xmm9, %xmm3 + + movaps %xmm0, %xmm8 + shufps $0x88, %xmm2, %xmm0 + movaps %xmm1, %xmm9 + shufps $0x22, %xmm3, %xmm1 + shufps $0xdd, %xmm2, %xmm8 + movaps %xmm8, %xmm2 + shufps $0x77, %xmm3, %xmm9 + movaps %xmm9, %xmm3 + + movaps %xmm4, %xmm8 + shufps $0x88, %xmm5, %xmm4 + shufps $0xdd, %xmm8, %xmm5 + + movaps %xmm6, %xmm9 + shufps $0x88, %xmm7, %xmm6 + shufps $0xdd, %xmm9, %xmm7 + + movaps %xmm4, %xmm8 + shufps $0x88, %xmm6, %xmm4 + movaps %xmm5, %xmm9 + shufps $0x22, %xmm7, %xmm5 + shufps $0xdd, %xmm6, %xmm8 + movaps %xmm8, %xmm6 + shufps $0x77, %xmm7, %xmm9 + movaps %xmm9, %xmm7 + +#else + movaps %xmm0, -32 * SIZE(AO) + movaps %xmm1, -28 * SIZE(AO) + movaps %xmm2, -24 * SIZE(AO) + movaps %xmm3, -20 * SIZE(AO) + movaps %xmm4, -16 * SIZE(AO) + movaps %xmm5, -12 * SIZE(AO) + movaps %xmm6, -8 * SIZE(AO) + movaps %xmm7, -4 * SIZE(AO) +#endif + + leaq (LDC, LDC, 2), %rax + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 2 * SIZE(CO1, LDC, 1) + + movsd %xmm2, 0 * SIZE(CO1, LDC, 2) + movhps %xmm2, 2 * SIZE(CO1, LDC, 2) + movsd %xmm3, 0 * SIZE(CO1, %rax, 1) + movhps %xmm3, 2 * SIZE(CO1, %rax, 1) + + movsd %xmm4, 0 * SIZE(CO2) + movhps %xmm4, 2 * SIZE(CO2) + movsd %xmm5, 0 * SIZE(CO2, LDC, 1) + movhps %xmm5, 2 * SIZE(CO2, LDC, 1) + + movsd %xmm6, 0 * SIZE(CO2, LDC, 2) + movhps %xmm6, 2 * SIZE(CO2, LDC, 2) + movsd %xmm7, 0 * SIZE(CO2, %rax, 1) + movhps %xmm7, 2 * SIZE(CO2, %rax, 1) + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L11 + ALIGN_4 + +.L39: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 8), B 
+#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $8, KK +#endif + +#ifdef RT + subq $8, KK +#endif + + subq $1, J + BRANCH + jg .L10 + ALIGN_4 + +.L40: + testq $4, N + jle .L70 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 2), CO2 +#ifndef RT + leaq (C, LDC, 4), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, M + BRANCH + jle .L50 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + xorps %xmm2, %xmm2 + movsd -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L65 + ALIGN_3 + +.L62: + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movaps -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x55, %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm2, %xmm9 + movaps -28 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movaps -24 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x55, %xmm0, %xmm1 + movsd -28 * SIZE(AO), %xmm0 + addps %xmm2, %xmm9 + movaps -20 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + subq $-16 * SIZE, BO + subq $ -4 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L62 + addps %xmm9, %xmm8 + ALIGN_3 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_3 + +.L66: + pshufd $0x00, %xmm0, %xmm1 + movss -31 * SIZE(AO), %xmm0 + addps %xmm2, %xmm8 + movaps -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L66 + ALIGN_3 + +.L68: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#endif + + addps %xmm2, %xmm8 + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(BO), %xmm0 + + subps %xmm8, %xmm0 +#else + movsd -32 * SIZE(AO), %xmm0 + movhps -30 * SIZE(AO), %xmm0 + + subps %xmm8, %xmm0 + + pshufd $0xff, %xmm0, %xmm3 + pshufd $0xaa, %xmm0, %xmm2 + pshufd $0x55, %xmm0, %xmm1 + pshufd $0x00, %xmm0, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm3 + + movaps -28 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm3 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm3 + + movaps -20 * 
SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm3 +#endif + +#ifdef RT + movaps -20 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm0 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm0 + + movaps -28 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm0 + + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, -32 * SIZE(BO) + + pshufd $0xff, %xmm0, %xmm3 + pshufd $0xaa, %xmm0, %xmm2 + pshufd $0x55, %xmm0, %xmm1 + pshufd $0x00, %xmm0, %xmm0 +#else + unpcklps %xmm1, %xmm0 + unpcklps %xmm3, %xmm2 + + movlps %xmm0, -32 * SIZE(AO) + movlps %xmm2, -30 * SIZE(AO) +#endif + + movss %xmm0, (CO1) + movss %xmm1, (CO1, LDC, 1) + movss %xmm2, (CO2) + movss %xmm3, (CO2, LDC, 1) + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L50: + testq $2, M + BRANCH + jle .L60 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movaps -32 * SIZE(BO), %xmm5 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L55 + ALIGN_3 + +.L52: + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -30 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + movaps -24 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + movaps -20 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -26 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + movaps -16 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -24 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, BO + subq $ -8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L52 + ALIGN_3 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_3 + +.L56: + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 
+ movaps -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L56 + ALIGN_3 + +.L58: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + + addps %xmm1, %xmm8 + addps %xmm2, %xmm9 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm4 + shufps $0x88, %xmm9, %xmm8 + shufps $0xdd, %xmm9, %xmm4 + + movaps -32 * SIZE(BO), %xmm0 + movaps -28 * SIZE(BO), %xmm1 + + subps %xmm8, %xmm0 + subps %xmm4, %xmm1 +#else + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm2 + + subps %xmm8, %xmm0 + subps %xmm9, %xmm2 + + movhlps %xmm0, %xmm1 + movhlps %xmm2, %xmm3 +#endif + +#ifdef LN + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm1 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm3 + + movaps -28 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm3 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm3 + + movaps -20 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 +#endif + +#ifdef RT + movaps -20 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm0 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm0 + + movaps -28 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + + leaq (LDC, LDC, 2), %rax + +#if defined(LN) || defined(LT) + movaps %xmm0, -32 * SIZE(BO) + movaps %xmm1, -28 * SIZE(BO) + + movaps %xmm0, %xmm4 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm4 + + movsd %xmm0, (CO1) + movhps %xmm0, (CO1, LDC, 1) + movsd %xmm4, (CO2) + movhps %xmm4, (CO2, LDC, 1) +#else + movlhps %xmm1, %xmm0 + movlhps %xmm3, %xmm2 + + movaps %xmm0, -32 * SIZE(AO) + movaps %xmm2, -28 * SIZE(AO) + + movsd %xmm0, (CO1) + movsd %xmm1, (CO1, LDC, 1) + movsd %xmm2, (CO2) + movsd %xmm3, (CO2, LDC, 1) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq 
$2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L60: + movq M, I + sarq $2, I + NOBRANCH + jle .L69 + ALIGN_4 + +.L41: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht2 -4 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht2 -4 * SIZE(CO1, LDC, 1) + xorps %xmm10, %xmm10 + prefetcht2 -4 * SIZE(CO2) + xorps %xmm11, %xmm11 + prefetcht2 -4 * SIZE(CO2, LDC, 1) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L45 + ALIGN_3 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm1, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm10 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm4, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm10 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm4, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movaps -24 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm10 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm4, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -20 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movaps -20 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm10 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm4, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + BRANCH + jg .L42 + ALIGN_3 + +.L45: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + addps %xmm1, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm10 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm4, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L46 + ALIGN_3 + +.L48: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#endif + + addps %xmm1, %xmm8 + addps %xmm2, %xmm9 + addps %xmm3, %xmm10 + addps %xmm4, %xmm11 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm4 + shufps $0x88, %xmm9, 
%xmm8 + movaps %xmm10, %xmm5 + shufps $0x88, %xmm11, %xmm10 + shufps $0xdd, %xmm11, %xmm4 + shufps $0xdd, %xmm9, %xmm5 + + movaps %xmm8, %xmm6 + shufps $0x88, %xmm10, %xmm8 + shufps $0xdd, %xmm6, %xmm10 + + movaps %xmm4, %xmm9 + movaps %xmm5, %xmm11 + shufps $0x22, %xmm5, %xmm9 + shufps $0x77, %xmm4, %xmm11 + + movaps -32 * SIZE(BO), %xmm0 + movaps -28 * SIZE(BO), %xmm1 + movaps -24 * SIZE(BO), %xmm2 + movaps -20 * SIZE(BO), %xmm3 +#else + movaps %xmm9, %xmm4 + shufps $0xd8, %xmm8, %xmm9 + shufps $0xd8, %xmm11, %xmm8 + shufps $0xd8, %xmm10, %xmm11 + shufps $0xd8, %xmm4, %xmm10 + + movaps %xmm8, %xmm4 + shufps $0xd8, %xmm10, %xmm8 + shufps $0xd8, %xmm4, %xmm10 + movaps %xmm9, %xmm5 + shufps $0xd8, %xmm11, %xmm9 + shufps $0xd8, %xmm5, %xmm11 + + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm1 + movaps -24 * SIZE(AO), %xmm2 + movaps -20 * SIZE(AO), %xmm3 +#endif + + subps %xmm8, %xmm0 + subps %xmm9, %xmm1 + subps %xmm10, %xmm2 + subps %xmm11, %xmm3 + +#ifdef LN + movaps -20 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm0 + + movaps -24 * SIZE(AO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm0 + + movaps -28 * SIZE(AO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm3 + + movaps -28 * SIZE(AO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm3 + + movaps -24 * SIZE(AO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm3 + + movaps -20 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm3 + + movaps -28 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm3 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm3 + + movaps -20 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 +#endif + +#ifdef RT + movaps -20 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 
+ subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm0 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm0 + + movaps -28 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, -32 * SIZE(BO) + movaps %xmm1, -28 * SIZE(BO) + movaps %xmm2, -24 * SIZE(BO) + movaps %xmm3, -20 * SIZE(BO) + + movaps %xmm0, %xmm8 + shufps $0x88, %xmm1, %xmm0 + shufps $0xdd, %xmm8, %xmm1 + + movaps %xmm2, %xmm9 + shufps $0x88, %xmm3, %xmm2 + shufps $0xdd, %xmm9, %xmm3 + + movaps %xmm0, %xmm8 + shufps $0x88, %xmm2, %xmm0 + movaps %xmm1, %xmm9 + shufps $0x22, %xmm3, %xmm1 + shufps $0xdd, %xmm2, %xmm8 + movaps %xmm8, %xmm2 + shufps $0x77, %xmm3, %xmm9 + movaps %xmm9, %xmm3 +#else + movaps %xmm0, -32 * SIZE(AO) + movaps %xmm1, -28 * SIZE(AO) + movaps %xmm2, -24 * SIZE(AO) + movaps %xmm3, -20 * SIZE(AO) +#endif + + leaq (LDC, LDC, 2), %rax + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 2 * SIZE(CO1, LDC, 1) + + movsd %xmm2, 0 * SIZE(CO2) + movhps %xmm2, 2 * SIZE(CO2) + movsd %xmm3, 0 * SIZE(CO2, LDC, 1) + movhps %xmm3, 2 * SIZE(CO2, LDC, 1) + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L41 + ALIGN_4 + +.L69: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + ALIGN_4 + +.L70: + testq $2, N + jle .L100 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, M + BRANCH + jle .L80 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + xorps %xmm2, %xmm2 + movsd -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L95 + ALIGN_3 + +.L92: + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movsd -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x55, %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm2, %xmm9 + movsd -30 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x00, %xmm0, 
%xmm1 + addps %xmm2, %xmm8 + movsd -28 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x55, %xmm0, %xmm1 + movsd -28 * SIZE(AO), %xmm0 + addps %xmm2, %xmm9 + movsd -26 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + subq $-4 * SIZE, AO + subq $-8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L92 + addps %xmm9, %xmm8 + ALIGN_3 + +.L95: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L98 + ALIGN_3 + +.L96: + pshufd $0x00, %xmm0, %xmm1 + movss -31 * SIZE(AO), %xmm0 + addps %xmm2, %xmm8 + movsd -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + addq $1 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L96 + ALIGN_3 + +.L98: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + + addps %xmm2, %xmm8 + +#if defined(LN) || defined(LT) + movsd -32 * SIZE(BO), %xmm0 + + subps %xmm8, %xmm0 +#else + movsd -32 * SIZE(AO), %xmm0 + + subps %xmm8, %xmm0 +#endif + + pshufd $0x55, %xmm0, %xmm1 + pshufd $0x00, %xmm0, %xmm0 + +#if defined(LN) || defined(LT) + movss -32 * SIZE(AO), %xmm8 + + mulss %xmm8, %xmm0 + mulss %xmm8, %xmm1 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm1 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm1 +#endif + +#ifdef RT + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm0 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movss %xmm0, -32 * SIZE(BO) + movss %xmm1, -31 * SIZE(BO) +#else + movss %xmm0, -32 * SIZE(AO) + movss %xmm1, -31 * SIZE(AO) +#endif + + movss %xmm0, (CO1) + movss %xmm1, (CO2) + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L80: + testq $2, M + BRANCH + jle .L90 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd -32 * SIZE(BO), %xmm5 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L85 + ALIGN_3 + +.L82: + addps %xmm1, %xmm8 + movsd -32 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movddup -30 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movsd -30 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movddup -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movsd -28 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movddup -26 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movsd -26 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movddup -24 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, BO + 
subq $-8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L82 + ALIGN_3 + +.L85: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L88 + ALIGN_3 + +.L86: + addps %xmm1, %xmm8 + movsd -32 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movddup -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L86 + ALIGN_3 + +.L88: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + + addps %xmm1, %xmm8 + +#if defined(LN) || defined(LT) + pshufd $0xd8, %xmm8, %xmm8 + + movaps -32 * SIZE(BO), %xmm0 +#else + movaps -32 * SIZE(AO), %xmm0 +#endif + + subps %xmm8, %xmm0 + + movhlps %xmm0, %xmm1 + +#ifdef LN + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm1 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm1 +#endif + +#ifdef RT + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm0, -32 * SIZE(BO) + movlps %xmm1, -30 * SIZE(BO) + + unpcklps %xmm1, %xmm0 + + movlps %xmm0, (CO1) + movhps %xmm0, (CO2) +#else + movlps %xmm0, -32 * SIZE(AO) + movlps %xmm1, -30 * SIZE(AO) + + movsd %xmm0, (CO1) + movsd %xmm1, (CO2) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L90: + movq M, I + sarq $2, I + NOBRANCH + jle .L99 + ALIGN_4 + +.L71: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd -32 * SIZE(BO), %xmm3 + + xorps %xmm8, %xmm8 + prefetcht2 -4 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht2 -4 * SIZE(CO2) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L75 + ALIGN_3 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0x55, %xmm3, %xmm2 + movsd -30 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + mulps %xmm0, 
%xmm1 + addps %xmm2, %xmm9 + pshufd $0x55, %xmm3, %xmm2 + movsd -28 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm2 + movaps -24 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0x55, %xmm3, %xmm2 + movsd -26 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0x55, %xmm3, %xmm2 + movsd -24 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm2 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, AO + subq $ -8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L72 + ALIGN_3 + +.L75: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_3 + +.L76: + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0x55, %xmm3, %xmm2 + movsd -30 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L76 + ALIGN_3 + +.L78: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + + addps %xmm1, %xmm8 + addps %xmm2, %xmm9 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm4 + unpcklps %xmm9, %xmm8 + unpckhps %xmm9, %xmm4 + + movaps -32 * SIZE(BO), %xmm0 + movaps -28 * SIZE(BO), %xmm2 + + subps %xmm8, %xmm0 + subps %xmm4, %xmm2 + + movhlps %xmm0, %xmm1 + movhlps %xmm2, %xmm3 +#else + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm2 + + subps %xmm8, %xmm0 + subps %xmm9, %xmm2 +#endif + + +#ifdef LN + movaps -20 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm0 + + movaps -24 * SIZE(AO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm0 + + movaps -28 * SIZE(AO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm3 + + movaps -28 * SIZE(AO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm3 + + movaps -24 * SIZE(AO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm3 + + movaps -20 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm2 + + 
pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm2 +#endif + +#ifdef RT + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm0 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm0, -32 * SIZE(BO) + movlps %xmm1, -30 * SIZE(BO) + movlps %xmm2, -28 * SIZE(BO) + movlps %xmm3, -26 * SIZE(BO) + + unpcklps %xmm1, %xmm0 + unpcklps %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(CO1) + movlps %xmm2, 2 * SIZE(CO1) + movhps %xmm0, 0 * SIZE(CO2) + movhps %xmm2, 2 * SIZE(CO2) + +#else + movaps %xmm0, -32 * SIZE(AO) + movaps %xmm2, -28 * SIZE(AO) + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO2) + movhps %xmm2, 2 * SIZE(CO2) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L71 + ALIGN_4 + +.L99: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L100: + testq $1, N + jle .L999 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + + movq C, CO1 +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, M + BRANCH + jle .L110 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + xorps %xmm2, %xmm2 + movss -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L125 + ALIGN_3 + +.L122: + addss %xmm2, %xmm8 + movss -32 * SIZE(BO), %xmm2 + mulss %xmm0, %xmm2 + movss -31 * SIZE(AO), %xmm0 + + addss %xmm2, %xmm8 + movss -31 * SIZE(BO), %xmm2 + mulss %xmm0, %xmm2 + movss -30 * SIZE(AO), %xmm0 + + addss %xmm2, %xmm8 + movss -30 * SIZE(BO), %xmm2 + mulss %xmm0, %xmm2 + movss -29 * SIZE(AO), %xmm0 + + addss %xmm2, %xmm8 + movss -29 * SIZE(BO), %xmm2 + mulss %xmm0, %xmm2 + movss -28 * SIZE(AO), %xmm0 + + subq $-4 * SIZE, AO + subq $-4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L122 + ALIGN_3 + +.L125: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L128 + ALIGN_3 + +.L126: + addss %xmm2, %xmm8 + movss -32 * SIZE(BO), %xmm2 + mulss %xmm0, %xmm2 + movss -31 * SIZE(AO), %xmm0 + + addq $1 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L126 + ALIGN_3 + +.L128: +#if defined(LN) || defined(RT) + movq KK, %rax + subq $1, %rax + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + + addss %xmm2, %xmm8 + +#if defined(LN) || defined(LT) + movss -32 * SIZE(BO), %xmm0 + + 
subss %xmm8, %xmm0 +#else + movss -32 * SIZE(AO), %xmm0 + + subss %xmm8, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss -32 * SIZE(AO), %xmm8 +#endif + +#if defined(RN) || defined(RT) + movaps -32 * SIZE(BO), %xmm8 +#endif + + mulss %xmm8, %xmm0 + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm0, -32 * SIZE(BO) +#else + movss %xmm0, -32 * SIZE(AO) +#endif + + movss %xmm0, (CO1) + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L110: + testq $2, M + BRANCH + jle .L120 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L115 + ALIGN_3 + +.L112: + addps %xmm1, %xmm8 + movss -32 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movss -31 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movsd -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movss -30 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movsd -26 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movss -29 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movsd -24 * SIZE(AO), %xmm0 + + subq $-4 * SIZE, BO + subq $-8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L112 + ALIGN_3 + +.L115: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_3 + +.L116: + addps %xmm1, %xmm8 + movss -32 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L116 + ALIGN_3 + +.L118: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + + addps %xmm1, %xmm8 + +#if defined(LN) || defined(LT) + movsd -32 * SIZE(BO), %xmm0 + + subps %xmm8, %xmm0 + + pshufd $0x55, %xmm0, %xmm1 +#else + movsd -32 * SIZE(AO), %xmm0 + + subps %xmm8, %xmm0 +#endif + +#ifdef LN + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm0 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm1 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm1 +#endif + +#if defined(RN) || defined(RT) + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm1, %xmm0 + + movlps %xmm0, -32 * SIZE(BO) + + movlps %xmm0, 0 * SIZE(CO1) +#else + movlps %xmm0, -32 * SIZE(AO) + + movlps %xmm0, 
0 * SIZE(CO1) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L120: + movq M, I + sarq $2, I + NOBRANCH + jle .L129 + ALIGN_4 + +.L101: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + movsd -32 * SIZE(BO), %xmm3 + xorps %xmm8, %xmm8 + prefetcht2 -4 * SIZE(CO1) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L105 + ALIGN_3 + +.L102: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + movss -31 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm1 + movaps -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + movss -30 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm1 + movaps -24 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + movss -29 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm1 + movaps -20 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + movss -28 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, AO + subq $ -4 * SIZE, BO + subq $1, %rax + BRANCH + jg .L102 + ALIGN_3 + +.L105: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L108 + ALIGN_3 + +.L106: + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + movss -31 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm1 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L106 + ALIGN_3 + +.L108: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#endif + + addps %xmm1, %xmm8 + +#if defined(LN) || defined(LT) + movsd -32 * SIZE(BO), %xmm0 + movhps -30 * SIZE(BO), %xmm0 + + subps %xmm8, %xmm0 + + pshufd $0xff, %xmm0, %xmm3 + pshufd $0xaa, %xmm0, %xmm2 + pshufd $0x55, %xmm0, %xmm1 +#else + movaps -32 * SIZE(AO), %xmm0 + + subps %xmm8, %xmm0 +#endif + +#ifdef LN + movaps -20 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm0 + + movaps -24 * SIZE(AO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm0 + + movaps -28 * SIZE(AO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm0 + + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + 
subss %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm3 + + movaps -28 * SIZE(AO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm3 + + movaps -24 * SIZE(AO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm3 + + movaps -20 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm3 +#endif + +#if defined(RN) || defined(RT) + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm1, %xmm0 + unpcklps %xmm3, %xmm2 + + movlps %xmm0, -32 * SIZE(BO) + movlps %xmm2, -30 * SIZE(BO) + + movlps %xmm0, 0 * SIZE(CO1) + movlps %xmm2, 2 * SIZE(CO1) +#else + movaps %xmm0, -32 * SIZE(AO) + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L101 + ALIGN_4 + +.L129: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 1), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/trsm_kernel_LN_8x4_sse.S b/kernel/x86_64/trsm_kernel_LN_8x4_sse.S new file mode 100644 index 0000000000..513572ee96 --- /dev/null +++ b/kernel/x86_64/trsm_kernel_LN_8x4_sse.S @@ -0,0 +1,5950 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %rdi +#define N %rsi +#define K %rdx +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define J %r12 +#define AO %r13 +#define BO %r14 +#define CO1 %r15 +#define CO2 %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define ALPHA 0(%rsp) +#define OFFSET 16(%rsp) +#define KK 24(%rsp) +#define KKK 32(%rsp) +#define AORIG 40(%rsp) +#define BORIG 48(%rsp) +#define BUFFER 128(%rsp) + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#if defined(OPTERON) || defined(BARCELONA) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define movsd movlps +#endif + +#ifdef GENERIC +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#ifndef PREFETCH +#define PREFETCH prefetcht0 +#endif + +#ifndef PREFETCHW +#define PREFETCHW prefetcht0 +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + EMMS + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, M + movq ARG2, N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + movsd OLD_OFFSET, %xmm4 + + movaps %xmm3, %xmm0 + +#else + movq OLD_LDC, LDC + movsd OLD_OFFSET, %xmm4 + +#endif + + movq %rsp, %rbx # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movsd %xmm4, OFFSET + movsd %xmm4, KK + + leaq (, LDC, SIZE), LDC + +#ifdef LN + leaq (, M, SIZE), %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + leaq (, N, SIZE), %rax + imulq K, %rax + addq %rax, B + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + movq N, J + sarq $2, J # j = (n >> 2) + jle .L50 + +.L01: +/* Copying to Sub Buffer */ + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, 
KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + salq $2 + BASE_SHIFT, %rax + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L03 + ALIGN_4 + +.L02: + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + movaps 8 * SIZE(B), %xmm11 + movaps 12 * SIZE(B), %xmm15 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + pshufd $0x00, %xmm11, %xmm8 + pshufd $0x55, %xmm11, %xmm9 + pshufd $0xaa, %xmm11, %xmm10 + pshufd $0xff, %xmm11, %xmm11 + + pshufd $0x00, %xmm15, %xmm12 + pshufd $0x55, %xmm15, %xmm13 + pshufd $0xaa, %xmm15, %xmm14 + pshufd $0xff, %xmm15, %xmm15 + + movaps %xmm8, 32 * SIZE(BO) + movaps %xmm9, 36 * SIZE(BO) + movaps %xmm10, 40 * SIZE(BO) + movaps %xmm11, 44 * SIZE(BO) + movaps %xmm12, 48 * SIZE(BO) + movaps %xmm13, 52 * SIZE(BO) + movaps %xmm14, 56 * SIZE(BO) + movaps %xmm15, 60 * SIZE(BO) + + addq $16 * SIZE, B + addq $64 * SIZE, BO + + decq %rax + jne .L02 + ALIGN_4 + +.L03: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L10 + ALIGN_4 + +.L04: + movaps 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + + addq $ 4 * SIZE, B + addq $16 * SIZE, BO + decq %rax + jne .L04 + ALIGN_4 + +.L10: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc +#ifndef RT + leaq (C, LDC, 4), C +#endif + + testq $1, M + je .L20 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (AO, %rax, SIZE), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movss 0 * SIZE(AO), %xmm8 + movss 4 * SIZE(AO), %xmm10 + + movss 0 * SIZE(BO), %xmm9 + movss 16 * SIZE(BO), %xmm11 + movss 32 * SIZE(BO), %xmm13 + movss 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L45 + ALIGN_4 + +.L42: + mulss %xmm8, %xmm9 + addss %xmm9, %xmm0 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movss 4 * SIZE(BO), %xmm9 + mulss %xmm8, %xmm9 + addss %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + mulss %xmm8, %xmm9 + addss %xmm9, %xmm2 + movss 12 * SIZE(BO), %xmm9 + mulss %xmm8, %xmm9 + movss 1 * SIZE(AO), %xmm8 + addss %xmm9, %xmm3 + movss 64 * SIZE(BO), %xmm9 + + mulss 
%xmm8, %xmm11 + addss %xmm11, %xmm0 + movss 20 * SIZE(BO), %xmm11 + mulss %xmm8, %xmm11 + addss %xmm11, %xmm1 + movss 24 * SIZE(BO), %xmm11 + mulss %xmm8, %xmm11 + addss %xmm11, %xmm2 + movss 28 * SIZE(BO), %xmm11 + mulss %xmm8, %xmm11 + movss 2 * SIZE(AO), %xmm8 + addss %xmm11, %xmm3 + movss 80 * SIZE(BO), %xmm11 + + mulss %xmm8, %xmm13 + addss %xmm13, %xmm0 + movss 36 * SIZE(BO), %xmm13 + mulss %xmm8, %xmm13 + addss %xmm13, %xmm1 + movss 40 * SIZE(BO), %xmm13 + mulss %xmm8, %xmm13 + addss %xmm13, %xmm2 + movss 44 * SIZE(BO), %xmm13 + mulss %xmm8, %xmm13 + movss 3 * SIZE(AO), %xmm8 + addss %xmm13, %xmm3 + movss 96 * SIZE(BO), %xmm13 + + mulss %xmm8, %xmm15 + addss %xmm15, %xmm0 + movss 52 * SIZE(BO), %xmm15 + mulss %xmm8, %xmm15 + addss %xmm15, %xmm1 + movss 56 * SIZE(BO), %xmm15 + mulss %xmm8, %xmm15 + addss %xmm15, %xmm2 + movss 60 * SIZE(BO), %xmm15 + mulss %xmm8, %xmm15 + movss 8 * SIZE(AO), %xmm8 + addss %xmm15, %xmm3 + movss 112 * SIZE(BO), %xmm15 + + mulss %xmm10, %xmm9 + addss %xmm9, %xmm0 + movss 68 * SIZE(BO), %xmm9 + mulss %xmm10, %xmm9 + addss %xmm9, %xmm1 + movss 72 * SIZE(BO), %xmm9 + mulss %xmm10, %xmm9 + addss %xmm9, %xmm2 + movss 76 * SIZE(BO), %xmm9 + mulss %xmm10, %xmm9 + movss 5 * SIZE(AO), %xmm10 + addss %xmm9, %xmm3 + movss 128 * SIZE(BO), %xmm9 + + mulss %xmm10, %xmm11 + addss %xmm11, %xmm0 + movss 84 * SIZE(BO), %xmm11 + mulss %xmm10, %xmm11 + addss %xmm11, %xmm1 + movss 88 * SIZE(BO), %xmm11 + mulss %xmm10, %xmm11 + addss %xmm11, %xmm2 + movss 92 * SIZE(BO), %xmm11 + mulss %xmm10, %xmm11 + movss 6 * SIZE(AO), %xmm10 + addss %xmm11, %xmm3 + movss 144 * SIZE(BO), %xmm11 + + mulss %xmm10, %xmm13 + addss %xmm13, %xmm0 + movss 100 * SIZE(BO), %xmm13 + mulss %xmm10, %xmm13 + addss %xmm13, %xmm1 + movss 104 * SIZE(BO), %xmm13 + mulss %xmm10, %xmm13 + addss %xmm13, %xmm2 + movss 108 * SIZE(BO), %xmm13 + mulss %xmm10, %xmm13 + movss 7 * SIZE(AO), %xmm10 + addss %xmm13, %xmm3 + movss 160 * SIZE(BO), %xmm13 + + mulss %xmm10, %xmm15 + addss %xmm15, %xmm0 + movss 116 * SIZE(BO), %xmm15 + mulss %xmm10, %xmm15 + addss %xmm15, %xmm1 + movss 120 * SIZE(BO), %xmm15 + mulss %xmm10, %xmm15 + addss %xmm15, %xmm2 + movss 124 * SIZE(BO), %xmm15 + mulss %xmm10, %xmm15 + movss 12 * SIZE(AO), %xmm10 + addss %xmm15, %xmm3 + movss 176 * SIZE(BO), %xmm15 + + addq $ 8 * SIZE, AO + addq $128 * SIZE, BO + decq %rax + jne .L42 + ALIGN_4 + +.L45: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L48 + ALIGN_4 + +.L46: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movss 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movss 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss 1 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movss 16 * SIZE(BO), %xmm9 + + addq $ 1 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L46 + ALIGN_4 + +.L48: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), B + leaq (BO, %rax, 8), BO + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm2, %xmm0 + unpcklps %xmm3, %xmm1 + + unpcklps %xmm1, %xmm0 + + movapd 0 * SIZE(B), %xmm1 + subps %xmm0, %xmm1 +#else + movss 0 * SIZE(AO), %xmm8 + movss 1 * SIZE(AO), %xmm10 + movss 2 * SIZE(AO), %xmm12 + movss 3 * SIZE(AO), %xmm14 + + subss 
%xmm0, %xmm8 + subss %xmm1, %xmm10 + subss %xmm2, %xmm12 + subss %xmm3, %xmm14 +#endif + +#if defined(LN) || defined(LT) + movss 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulss %xmm2, %xmm8 + pshufd $0x55, %xmm0, %xmm2 + mulss %xmm8, %xmm2 + subss %xmm2, %xmm10 + pshufd $0xaa, %xmm0, %xmm2 + mulss %xmm8, %xmm2 + subss %xmm2, %xmm12 + pshufd $0xff, %xmm0, %xmm2 + mulss %xmm8, %xmm2 + subss %xmm2, %xmm14 + + movaps 4 * SIZE(B), %xmm0 + pshufd $0x55, %xmm0, %xmm2 + mulss %xmm2, %xmm10 + pshufd $0xaa, %xmm0, %xmm2 + mulss %xmm10, %xmm2 + subss %xmm2, %xmm12 + pshufd $0xff, %xmm0, %xmm2 + mulss %xmm10, %xmm2 + subss %xmm2, %xmm14 + + movaps 8 * SIZE(B), %xmm0 + pshufd $0xaa, %xmm0, %xmm2 + mulss %xmm2, %xmm12 + pshufd $0xff, %xmm0, %xmm2 + mulss %xmm12, %xmm2 + subss %xmm2, %xmm14 + + movaps 12 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulss %xmm2, %xmm14 +#endif + +#ifdef RT + movaps 12 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulss %xmm2, %xmm14 + pshufd $0xaa, %xmm0, %xmm2 + mulss %xmm14, %xmm2 + subss %xmm2, %xmm12 + pshufd $0x55, %xmm0, %xmm2 + mulss %xmm14, %xmm2 + subss %xmm2, %xmm10 + pshufd $0x00, %xmm0, %xmm2 + mulss %xmm14, %xmm2 + subss %xmm2, %xmm8 + + movaps 8 * SIZE(B), %xmm0 + pshufd $0xaa, %xmm0, %xmm2 + mulss %xmm2, %xmm12 + pshufd $0x55, %xmm0, %xmm2 + mulss %xmm12, %xmm2 + subss %xmm2, %xmm10 + pshufd $0x00, %xmm0, %xmm2 + mulss %xmm12, %xmm2 + subss %xmm2, %xmm8 + + movaps 4 * SIZE(B), %xmm0 + pshufd $0x55, %xmm0, %xmm2 + mulss %xmm2, %xmm10 + pshufd $0x00, %xmm0, %xmm2 + mulss %xmm10, %xmm2 + subss %xmm2, %xmm8 + + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulss %xmm2, %xmm8 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, 0 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + pshufd $0x55, %xmm1, %xmm3 + pshufd $0xaa, %xmm1, %xmm4 + pshufd $0xff, %xmm1, %xmm6 + movaps %xmm2, 0 * SIZE(BO) + movaps %xmm3, 4 * SIZE(BO) + movaps %xmm4, 8 * SIZE(BO) + movaps %xmm6, 12 * SIZE(BO) +#else + movss %xmm8, 0 * SIZE(AO) + movss %xmm10, 1 * SIZE(AO) + movss %xmm12, 2 * SIZE(AO) + movss %xmm14, 3 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm10, %xmm1 + unpckhps %xmm10, %xmm0 + + movaps %xmm5, %xmm7 + unpcklps %xmm11, %xmm5 + unpckhps %xmm11, %xmm7 + + movaps %xmm1, %xmm10 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm10 + + movaps %xmm0, %xmm11 + unpcklps %xmm7, %xmm0 + unpckhps %xmm7, %xmm11 + + movss %xmm1, 0 * SIZE(CO1) + movss %xmm10, 0 * SIZE(CO2) + movss %xmm0, 0 * SIZE(CO1, LDC, 2) + movss %xmm11, 0 * SIZE(CO2, LDC, 2) +#else + movss %xmm8, 0 * SIZE(CO1) + movss %xmm10, 0 * SIZE(CO2) + movss %xmm12, 0 * SIZE(CO1, LDC, 2) + movss %xmm14, 0 * SIZE(CO2, LDC, 2) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L20: + testq $2, M + je .L30 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, 
BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 8 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L35 + ALIGN_4 + +.L32: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movaps 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movaps 64 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movaps 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd 4 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movaps 80 * SIZE(BO), %xmm11 + + mulps %xmm8, %xmm13 + addps %xmm13, %xmm0 + movaps 36 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm1 + movaps 40 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm2 + movaps 44 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + movsd 6 * SIZE(AO), %xmm8 + addps %xmm13, %xmm3 + movaps 96 * SIZE(BO), %xmm13 + + mulps %xmm8, %xmm15 + addps %xmm15, %xmm0 + movaps 52 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm1 + movaps 56 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm2 + movaps 60 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + movsd 16 * SIZE(AO), %xmm8 + addps %xmm15, %xmm3 + movaps 112 * SIZE(BO), %xmm15 + + mulps %xmm10, %xmm9 + addps %xmm9, %xmm0 + movaps 68 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm1 + movaps 72 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm2 + movaps 76 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movsd 10 * SIZE(AO), %xmm10 + addps %xmm9, %xmm3 + movaps 128 * SIZE(BO), %xmm9 + + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movaps 84 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm1 + movaps 88 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm2 + movaps 92 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movsd 12 * SIZE(AO), %xmm10 + addps %xmm11, %xmm3 + movaps 144 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movaps 100 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm1 + movaps 104 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movaps 108 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd 14 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movaps 160 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movaps 116 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm1 + movaps 120 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movaps 124 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd 24 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movaps 176 * SIZE(BO), %xmm15 + + addq $ 16 * SIZE, AO + addq $128 * SIZE, BO + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulps %xmm8, %xmm9 + addps %xmm9, 
%xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movaps 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movaps 16 * SIZE(BO), %xmm9 + + addq $ 2 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $1 + BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm2, %xmm0 + unpcklps %xmm3, %xmm1 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movapd 0 * SIZE(B), %xmm1 + movapd 4 * SIZE(B), %xmm5 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm5 +#else +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd 0 * SIZE(AO), %xmm8 +#ifdef movsd + xorps %xmm10, %xmm10 +#endif + movsd 2 * SIZE(AO), %xmm10 +#ifdef movsd + xorps %xmm12, %xmm12 +#endif + movsd 4 * SIZE(AO), %xmm12 +#ifdef movsd + xorps %xmm14, %xmm14 +#endif + movsd 6 * SIZE(AO), %xmm14 + + subps %xmm0, %xmm8 + subps %xmm1, %xmm10 + subps %xmm2, %xmm12 + subps %xmm3, %xmm14 +#endif + +#ifdef LN + movaps 0 * SIZE(AO), %xmm6 + + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm1 + + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm5 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm5 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm10 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm12 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm14 + + movaps 4 * SIZE(B), %xmm0 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm12 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm14 + + movaps 8 * SIZE(B), %xmm0 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm2, %xmm12 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm12, %xmm2 + subps %xmm2, %xmm14 + + movaps 12 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm14 +#endif + +#ifdef RT + movaps 12 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm14 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm14, %xmm2 + subps %xmm2, %xmm12 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm14, %xmm2 + subps %xmm2, %xmm10 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm14, %xmm2 + subps %xmm2, %xmm8 + + movaps 8 * SIZE(B), %xmm0 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm2, %xmm12 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm12, %xmm2 + subps %xmm2, %xmm10 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm12, %xmm2 + subps %xmm2, %xmm8 + + movaps 4 * SIZE(B), %xmm0 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm8 + + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, 0 * SIZE(B) + movaps %xmm5, 4 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + pshufd $0x55, %xmm1, %xmm3 + pshufd $0xaa, 
%xmm1, %xmm4 + pshufd $0xff, %xmm1, %xmm6 + movaps %xmm2, 0 * SIZE(BO) + movaps %xmm3, 4 * SIZE(BO) + movaps %xmm4, 8 * SIZE(BO) + movaps %xmm6, 12 * SIZE(BO) + + pshufd $0x00, %xmm5, %xmm2 + pshufd $0x55, %xmm5, %xmm3 + pshufd $0xaa, %xmm5, %xmm4 + pshufd $0xff, %xmm5, %xmm6 + movaps %xmm2, 16 * SIZE(BO) + movaps %xmm3, 20 * SIZE(BO) + movaps %xmm4, 24 * SIZE(BO) + movaps %xmm6, 28 * SIZE(BO) +#else + movlps %xmm8, 0 * SIZE(AO) + movlps %xmm10, 2 * SIZE(AO) + movlps %xmm12, 4 * SIZE(AO) + movlps %xmm14, 6 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm10, %xmm1 + unpckhps %xmm10, %xmm0 + + movaps %xmm5, %xmm7 + unpcklps %xmm11, %xmm5 + unpckhps %xmm11, %xmm7 + + movaps %xmm1, %xmm10 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm10 + + movaps %xmm0, %xmm11 + unpcklps %xmm7, %xmm0 + unpckhps %xmm7, %xmm11 + + movlps %xmm1, 0 * SIZE(CO1) + movlps %xmm10, 0 * SIZE(CO2) + movlps %xmm0, 0 * SIZE(CO1, LDC, 2) + movlps %xmm11, 0 * SIZE(CO2, LDC, 2) +#else + movlps %xmm8, 0 * SIZE(CO1) + movlps %xmm10, 0 * SIZE(CO2) + movlps %xmm12, 0 * SIZE(CO1, LDC, 2) + movlps %xmm14, 0 * SIZE(CO2, LDC, 2) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + testq $4, M + je .L40 + +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L25 + ALIGN_4 + +.L22: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps 4 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + mulps 28 * SIZE(BO), %xmm8 + addps %xmm11, %xmm2 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm8, %xmm3 + movaps 8 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm13 + addps %xmm13, %xmm0 + movaps 36 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm1 + movaps 40 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + mulps 44 * SIZE(BO), %xmm8 + addps %xmm13, %xmm2 + movaps 96 * SIZE(BO), %xmm13 + addps %xmm8, %xmm3 + movaps 12 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm15 + addps %xmm15, %xmm0 + movaps 52 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm1 + movaps 56 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + mulps 60 * 
SIZE(BO), %xmm8 + addps %xmm15, %xmm2 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm8, %xmm3 + movaps 32 * SIZE(AO), %xmm8 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm10, %xmm9 + addps %xmm9, %xmm0 + movaps 68 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm1 + movaps 72 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + mulps 76 * SIZE(BO), %xmm10 + addps %xmm9, %xmm2 + movaps 128 * SIZE(BO), %xmm9 + addps %xmm10, %xmm3 + movaps 20 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movaps 84 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm1 + movaps 88 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + mulps 92 * SIZE(BO), %xmm10 + addps %xmm11, %xmm2 + movaps 144 * SIZE(BO), %xmm11 + addps %xmm10, %xmm3 + movaps 24 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movaps 100 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm1 + movaps 104 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + mulps 108 * SIZE(BO), %xmm10 + addps %xmm13, %xmm2 + movaps 160 * SIZE(BO), %xmm13 + addps %xmm10, %xmm3 + movaps 28 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movaps 116 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm1 + movaps 120 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + mulps 124 * SIZE(BO), %xmm10 + addps %xmm15, %xmm2 + movaps 176 * SIZE(BO), %xmm15 + addps %xmm10, %xmm3 + movaps 48 * SIZE(AO), %xmm10 + + addq $ 32 * SIZE, AO + addq $128 * SIZE, BO + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 16 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps 4 * SIZE(AO), %xmm8 + + addq $ 4 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L26 + ALIGN_4 + +.L28: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $2 + BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm8 + unpcklps %xmm2, %xmm0 + unpckhps %xmm2, %xmm8 + + movaps %xmm1, %xmm14 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm14 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movaps %xmm8, %xmm3 + unpcklps %xmm14, %xmm8 + unpckhps %xmm14, %xmm3 + + movaps 0 * SIZE(B), %xmm1 + movaps 4 * SIZE(B), %xmm5 + movaps 8 * SIZE(B), %xmm10 + movaps 12 * SIZE(B), %xmm11 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm5 + subps %xmm8, %xmm10 + subps %xmm3, %xmm11 +#else + movaps 0 * SIZE(AO), %xmm8 + movaps 4 * SIZE(AO), %xmm10 + movaps 8 * SIZE(AO), %xmm12 + movaps 12 * SIZE(AO), %xmm14 + + subps %xmm0, %xmm8 + subps %xmm1, %xmm10 + subps %xmm2, %xmm12 + subps %xmm3, %xmm14 +#endif + +#ifdef LN + movaps 12 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm1 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm10 + 
pshufd $0x55, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm1 + + movaps 4 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm1 + + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm11 + + movaps 4 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm11 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm11 + + movaps 12 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm11 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm10 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm12 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm14 + + movaps 4 * SIZE(B), %xmm0 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm12 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm14 + + movaps 8 * SIZE(B), %xmm0 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm2, %xmm12 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm12, %xmm2 + subps %xmm2, %xmm14 + + movaps 12 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm14 +#endif + +#ifdef RT + movaps 12 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm14 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm14, %xmm2 + subps %xmm2, %xmm12 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm14, %xmm2 + subps %xmm2, %xmm10 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm14, %xmm2 + subps %xmm2, %xmm8 + + movaps 8 * SIZE(B), %xmm0 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm2, %xmm12 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm12, %xmm2 + subps %xmm2, %xmm10 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm12, %xmm2 + subps %xmm2, %xmm8 + + movaps 4 * SIZE(B), %xmm0 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm8 + + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, 0 * SIZE(B) + movaps %xmm5, 4 * SIZE(B) + movaps %xmm10, 8 * SIZE(B) + movaps %xmm11, 12 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + pshufd $0x55, %xmm1, %xmm3 + pshufd $0xaa, %xmm1, %xmm4 + pshufd $0xff, %xmm1, %xmm6 + movaps %xmm2, 0 * SIZE(BO) + movaps %xmm3, 4 * SIZE(BO) + movaps %xmm4, 8 * SIZE(BO) + movaps %xmm6, 12 * SIZE(BO) + + pshufd $0x00, %xmm5, %xmm2 + pshufd $0x55, %xmm5, %xmm3 + pshufd $0xaa, %xmm5, %xmm4 + pshufd $0xff, %xmm5, %xmm6 + movaps %xmm2, 16 * SIZE(BO) + movaps %xmm3, 20 * SIZE(BO) + movaps %xmm4, 24 * SIZE(BO) + movaps %xmm6, 28 * SIZE(BO) + + pshufd $0x00, %xmm10, %xmm2 + pshufd $0x55, %xmm10, %xmm3 + pshufd $0xaa, %xmm10, %xmm4 + pshufd $0xff, %xmm10, %xmm6 + movaps %xmm2, 32 * 
SIZE(BO) + movaps %xmm3, 36 * SIZE(BO) + movaps %xmm4, 40 * SIZE(BO) + movaps %xmm6, 44 * SIZE(BO) + + pshufd $0x00, %xmm11, %xmm2 + pshufd $0x55, %xmm11, %xmm3 + pshufd $0xaa, %xmm11, %xmm4 + pshufd $0xff, %xmm11, %xmm6 + movaps %xmm2, 48 * SIZE(BO) + movaps %xmm3, 52 * SIZE(BO) + movaps %xmm4, 56 * SIZE(BO) + movaps %xmm6, 60 * SIZE(BO) +#else + movaps %xmm8, 0 * SIZE(AO) + movaps %xmm10, 4 * SIZE(AO) + movaps %xmm12, 8 * SIZE(AO) + movaps %xmm14, 12 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm10, %xmm1 + unpckhps %xmm10, %xmm0 + + movaps %xmm5, %xmm7 + unpcklps %xmm11, %xmm5 + unpckhps %xmm11, %xmm7 + + movaps %xmm1, %xmm10 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm10 + + movaps %xmm0, %xmm11 + unpcklps %xmm7, %xmm0 + unpckhps %xmm7, %xmm11 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm10, 0 * SIZE(CO2) + movhps %xmm10, 2 * SIZE(CO2) + + movlps %xmm0, 0 * SIZE(CO1, LDC, 2) + movhps %xmm0, 2 * SIZE(CO1, LDC, 2) + movlps %xmm11, 0 * SIZE(CO2, LDC, 2) + movhps %xmm11, 2 * SIZE(CO2, LDC, 2) +#else + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movlps %xmm10, 0 * SIZE(CO2) + movhps %xmm10, 2 * SIZE(CO2) + + movlps %xmm12, 0 * SIZE(CO1, LDC, 2) + movhps %xmm12, 2 * SIZE(CO1, LDC, 2) + movlps %xmm14, 0 * SIZE(CO2, LDC, 2) + movhps %xmm14, 2 * SIZE(CO2, LDC, 2) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $16 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L40: + movq M, I + sarq $3, I # i = (m >> 3) + jle .L49 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $3 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 8), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(BO), %xmm9 + movaps 4 * SIZE(BO), %xmm11 + movaps 8 * SIZE(BO), %xmm13 + movaps 16 * SIZE(BO), %xmm15 + + movaps 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movaps 4 * SIZE(AO), %xmm10 + pxor %xmm1, %xmm1 + movaps 8 * SIZE(AO), %xmm12 + pxor %xmm2, %xmm2 + movaps 12 * SIZE(AO), %xmm14 + pxor %xmm3, %xmm3 + + PREFETCHW -8 * SIZE(CO1) + pxor %xmm4, %xmm4 + PREFETCHW -8 * SIZE(CO2) + pxor %xmm5, %xmm5 + PREFETCHW -8 * SIZE(CO1, LDC, 2) + pxor %xmm6, %xmm6 + PREFETCHW -8 * SIZE(CO2, LDC, 2) + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L15 + ALIGN_4 + +.L12: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movaps 4 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm13 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm13, %xmm2 + movaps 8 * SIZE(BO), %xmm13 + addps %xmm8, %xmm3 + movaps 16 * SIZE(AO), %xmm8 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm4 + movaps 32 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm5 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm13 + mulps 12 * SIZE(BO), %xmm10 + addps %xmm13, %xmm6 + movaps 24 * SIZE(BO), %xmm13 + addps %xmm10, %xmm7 + movaps 20 * SIZE(AO), %xmm10 + mulps %xmm12, %xmm15 + addps %xmm15, %xmm0 + 
movaps 16 * SIZE(BO), %xmm15 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm1 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm13 + mulps 28 * SIZE(BO), %xmm12 + addps %xmm13, %xmm2 + movaps 24 * SIZE(BO), %xmm13 + addps %xmm12, %xmm3 + movaps 24 * SIZE(AO), %xmm12 + mulps %xmm14, %xmm15 + addps %xmm15, %xmm4 + movaps 48 * SIZE(BO), %xmm15 + mulps %xmm14, %xmm11 + addps %xmm11, %xmm5 + movaps 36 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm13 + mulps 28 * SIZE(BO), %xmm14 + addps %xmm13, %xmm6 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm14, %xmm7 + movaps 28 * SIZE(AO), %xmm14 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 32 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movaps 36 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm13 + mulps 44 * SIZE(BO), %xmm8 + addps %xmm13, %xmm2 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm8, %xmm3 + movaps 32 * SIZE(AO), %xmm8 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm4 + movaps 64 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm5 + movaps 52 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm13 + mulps 44 * SIZE(BO), %xmm10 + addps %xmm13, %xmm6 + movaps 56 * SIZE(BO), %xmm13 + addps %xmm10, %xmm7 + movaps 36 * SIZE(AO), %xmm10 + mulps %xmm12, %xmm15 + addps %xmm15, %xmm0 + movaps 48 * SIZE(BO), %xmm15 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm1 + movaps 52 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm13 + mulps 60 * SIZE(BO), %xmm12 + addps %xmm13, %xmm2 + movaps 56 * SIZE(BO), %xmm13 + addps %xmm12, %xmm3 + movaps 40 * SIZE(AO), %xmm12 + mulps %xmm14, %xmm15 + addps %xmm15, %xmm4 + movaps 80 * SIZE(BO), %xmm15 + mulps %xmm14, %xmm11 + addps %xmm11, %xmm5 + movaps 68 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm13 + mulps 60 * SIZE(BO), %xmm14 + addps %xmm13, %xmm6 + movaps 72 * SIZE(BO), %xmm13 + addps %xmm14, %xmm7 + movaps 44 * SIZE(AO), %xmm14 + + addq $32 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jg .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_4 +.L16: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 0 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps 8 * SIZE(AO), %xmm8 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm4 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm5 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + mulps 12 * SIZE(BO), %xmm10 + addps %xmm9, %xmm6 + movaps 16 * SIZE(BO), %xmm9 + addps %xmm10, %xmm7 + movaps 12 * SIZE(AO), %xmm10 + + addq $8 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jg .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $8, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $2 + BASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm8 + unpcklps %xmm2, %xmm0 + unpckhps %xmm2, %xmm8 + + movaps %xmm1, %xmm14 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm14 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movaps %xmm8, %xmm3 + unpcklps %xmm14, %xmm8 + unpckhps %xmm14, %xmm3 + + movaps %xmm4, %xmm9 + unpcklps %xmm6, %xmm4 + unpckhps %xmm6, %xmm9 + + movaps %xmm5, %xmm14 + unpcklps %xmm7, %xmm5 + unpckhps %xmm7, %xmm14 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps 
%xmm9, %xmm7 + unpcklps %xmm14, %xmm9 + unpckhps %xmm14, %xmm7 + + movaps 0 * SIZE(B), %xmm1 + movaps 4 * SIZE(B), %xmm5 + movaps 8 * SIZE(B), %xmm10 + movaps 12 * SIZE(B), %xmm11 + movaps 16 * SIZE(B), %xmm12 + movaps 20 * SIZE(B), %xmm13 + movaps 24 * SIZE(B), %xmm14 + movaps 28 * SIZE(B), %xmm15 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm5 + subps %xmm8, %xmm10 + subps %xmm3, %xmm11 + subps %xmm4, %xmm12 + subps %xmm6, %xmm13 + subps %xmm9, %xmm14 + subps %xmm7, %xmm15 +#else + movaps 0 * SIZE(AO), %xmm8 + movaps 4 * SIZE(AO), %xmm9 + movaps 8 * SIZE(AO), %xmm10 + movaps 12 * SIZE(AO), %xmm11 + + movaps 16 * SIZE(AO), %xmm12 + movaps 20 * SIZE(AO), %xmm13 + movaps 24 * SIZE(AO), %xmm14 + movaps 28 * SIZE(AO), %xmm15 + + subps %xmm0, %xmm8 + subps %xmm4, %xmm9 + subps %xmm1, %xmm10 + subps %xmm5, %xmm11 + subps %xmm2, %xmm12 + subps %xmm6, %xmm13 + subps %xmm3, %xmm14 + subps %xmm7, %xmm15 +#endif + +#ifdef LN + movaps 60 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm15 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm14 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm13 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm12 + + movaps 56 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm1 + + movaps 52 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm14 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm13 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm12 + + movaps 48 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm1 + + movaps 44 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm13 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm12 + + movaps 40 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm1 + + movaps 36 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm12 + + movaps 32 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm1 + + movaps 24 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm1 + + movaps 16 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm1 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0x55, 
%xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm1 + + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm6 + + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm11 + + movaps 4 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm15 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm11 + + movaps 12 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm15 + + movaps 16 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm11 + + movaps 20 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm15 + + movaps 24 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm11 + + movaps 28 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm15 + + movaps 36 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm12 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm15 + + movaps 44 * SIZE(AO), %xmm7 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm15 + + movaps 52 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm14 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm15 + + movaps 60 * SIZE(AO), %xmm7 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm8, %xmm15 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 + mulps %xmm2, %xmm9 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm10 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm9, %xmm2 + subps %xmm2, %xmm11 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm12 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm9, %xmm2 + subps %xmm2, %xmm13 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm14 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm9, %xmm2 
+ subps %xmm2, %xmm15 + + movaps 4 * SIZE(B), %xmm0 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + mulps %xmm2, %xmm11 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm12 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm11, %xmm2 + subps %xmm2, %xmm13 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm14 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm11, %xmm2 + subps %xmm2, %xmm15 + + movaps 8 * SIZE(B), %xmm0 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm2, %xmm12 + mulps %xmm2, %xmm13 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm12, %xmm2 + subps %xmm2, %xmm14 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm13, %xmm2 + subps %xmm2, %xmm15 + + movaps 12 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm14 + mulps %xmm2, %xmm15 +#endif + +#ifdef RT + movaps 12 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm14 + mulps %xmm2, %xmm15 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm14, %xmm2 + subps %xmm2, %xmm12 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm15, %xmm2 + subps %xmm2, %xmm13 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm14, %xmm2 + subps %xmm2, %xmm10 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm15, %xmm2 + subps %xmm2, %xmm11 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm14, %xmm2 + subps %xmm2, %xmm8 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm15, %xmm2 + subps %xmm2, %xmm9 + + movaps 8 * SIZE(B), %xmm0 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm2, %xmm12 + mulps %xmm2, %xmm13 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm12, %xmm2 + subps %xmm2, %xmm10 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm13, %xmm2 + subps %xmm2, %xmm11 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm12, %xmm2 + subps %xmm2, %xmm8 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm13, %xmm2 + subps %xmm2, %xmm9 + + movaps 4 * SIZE(B), %xmm0 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + mulps %xmm2, %xmm11 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm8 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm11, %xmm2 + subps %xmm2, %xmm9 + + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 + mulps %xmm2, %xmm9 +#endif + +#ifdef LN + subq $8 * SIZE, CO1 + subq $8 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, 0 * SIZE(B) + movaps %xmm5, 4 * SIZE(B) + movaps %xmm10, 8 * SIZE(B) + movaps %xmm11, 12 * SIZE(B) + movaps %xmm12, 16 * SIZE(B) + movaps %xmm13, 20 * SIZE(B) + movaps %xmm14, 24 * SIZE(B) + movaps %xmm15, 28 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + pshufd $0x55, %xmm1, %xmm3 + pshufd $0xaa, %xmm1, %xmm4 + pshufd $0xff, %xmm1, %xmm6 + movaps %xmm2, 0 * SIZE(BO) + movaps %xmm3, 4 * SIZE(BO) + movaps %xmm4, 8 * SIZE(BO) + movaps %xmm6, 12 * SIZE(BO) + + pshufd $0x00, %xmm5, %xmm2 + pshufd $0x55, %xmm5, %xmm3 + pshufd $0xaa, %xmm5, %xmm4 + pshufd $0xff, %xmm5, %xmm6 + movaps %xmm2, 16 * SIZE(BO) + movaps %xmm3, 20 * SIZE(BO) + movaps %xmm4, 24 * SIZE(BO) + movaps %xmm6, 28 * SIZE(BO) + + pshufd $0x00, %xmm10, %xmm2 + pshufd $0x55, %xmm10, %xmm3 + pshufd $0xaa, %xmm10, %xmm4 + pshufd $0xff, %xmm10, %xmm6 + movaps %xmm2, 32 * SIZE(BO) + movaps %xmm3, 36 * SIZE(BO) + movaps %xmm4, 40 * SIZE(BO) + movaps %xmm6, 44 * SIZE(BO) + + pshufd $0x00, %xmm11, %xmm2 + pshufd $0x55, %xmm11, %xmm3 + pshufd $0xaa, %xmm11, %xmm4 + pshufd $0xff, %xmm11, %xmm6 + movaps %xmm2, 48 * SIZE(BO) + movaps %xmm3, 52 * SIZE(BO) + movaps %xmm4, 56 * SIZE(BO) + movaps %xmm6, 60 * SIZE(BO) + + pshufd $0x00, %xmm12, %xmm2 + pshufd $0x55, %xmm12, %xmm3 + pshufd $0xaa, %xmm12, %xmm4 + pshufd $0xff, %xmm12, %xmm6 + movaps %xmm2, 64 * SIZE(BO) + movaps %xmm3, 68 * SIZE(BO) + 
movaps %xmm4, 72 * SIZE(BO) + movaps %xmm6, 76 * SIZE(BO) + + pshufd $0x00, %xmm13, %xmm2 + pshufd $0x55, %xmm13, %xmm3 + pshufd $0xaa, %xmm13, %xmm4 + pshufd $0xff, %xmm13, %xmm6 + movaps %xmm2, 80 * SIZE(BO) + movaps %xmm3, 84 * SIZE(BO) + movaps %xmm4, 88 * SIZE(BO) + movaps %xmm6, 92 * SIZE(BO) + + pshufd $0x00, %xmm14, %xmm2 + pshufd $0x55, %xmm14, %xmm3 + pshufd $0xaa, %xmm14, %xmm4 + pshufd $0xff, %xmm14, %xmm6 + movaps %xmm2, 96 * SIZE(BO) + movaps %xmm3, 100 * SIZE(BO) + movaps %xmm4, 104 * SIZE(BO) + movaps %xmm6, 108 * SIZE(BO) + + pshufd $0x00, %xmm15, %xmm2 + pshufd $0x55, %xmm15, %xmm3 + pshufd $0xaa, %xmm15, %xmm4 + pshufd $0xff, %xmm15, %xmm6 + movaps %xmm2, 112 * SIZE(BO) + movaps %xmm3, 116 * SIZE(BO) + movaps %xmm4, 120 * SIZE(BO) + movaps %xmm6, 124 * SIZE(BO) + +#else + movaps %xmm8, 0 * SIZE(AO) + movaps %xmm9, 4 * SIZE(AO) + movaps %xmm10, 8 * SIZE(AO) + movaps %xmm11, 12 * SIZE(AO) + movaps %xmm12, 16 * SIZE(AO) + movaps %xmm13, 20 * SIZE(AO) + movaps %xmm14, 24 * SIZE(AO) + movaps %xmm15, 28 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm10, %xmm1 + unpckhps %xmm10, %xmm0 + + movaps %xmm5, %xmm7 + unpcklps %xmm11, %xmm5 + unpckhps %xmm11, %xmm7 + + movaps %xmm1, %xmm10 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm10 + + movaps %xmm0, %xmm11 + unpcklps %xmm7, %xmm0 + unpckhps %xmm7, %xmm11 + + movaps %xmm12, %xmm2 + unpcklps %xmm14, %xmm12 + unpckhps %xmm14, %xmm2 + + movaps %xmm13, %xmm7 + unpcklps %xmm15, %xmm13 + unpckhps %xmm15, %xmm7 + + movaps %xmm12, %xmm14 + unpcklps %xmm13, %xmm12 + unpckhps %xmm13, %xmm14 + + movaps %xmm2, %xmm15 + unpcklps %xmm7, %xmm2 + unpckhps %xmm7, %xmm15 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm12, 4 * SIZE(CO1) + movhps %xmm12, 6 * SIZE(CO1) + + movlps %xmm10, 0 * SIZE(CO2) + movhps %xmm10, 2 * SIZE(CO2) + movlps %xmm14, 4 * SIZE(CO2) + movhps %xmm14, 6 * SIZE(CO2) + + movlps %xmm0, 0 * SIZE(CO1, LDC, 2) + movhps %xmm0, 2 * SIZE(CO1, LDC, 2) + movlps %xmm2, 4 * SIZE(CO1, LDC, 2) + movhps %xmm2, 6 * SIZE(CO1, LDC, 2) + + movlps %xmm11, 0 * SIZE(CO2, LDC, 2) + movhps %xmm11, 2 * SIZE(CO2, LDC, 2) + movlps %xmm15, 4 * SIZE(CO2, LDC, 2) + movhps %xmm15, 6 * SIZE(CO2, LDC, 2) +#else + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movlps %xmm9, 4 * SIZE(CO1) + movhps %xmm9, 6 * SIZE(CO1) + + movlps %xmm10, 0 * SIZE(CO2) + movhps %xmm10, 2 * SIZE(CO2) + movlps %xmm11, 4 * SIZE(CO2) + movhps %xmm11, 6 * SIZE(CO2) + + movlps %xmm12, 0 * SIZE(CO1, LDC, 2) + movhps %xmm12, 2 * SIZE(CO1, LDC, 2) + movlps %xmm13, 4 * SIZE(CO1, LDC, 2) + movhps %xmm13, 6 * SIZE(CO1, LDC, 2) + + movlps %xmm14, 0 * SIZE(CO2, LDC, 2) + movhps %xmm14, 2 * SIZE(CO2, LDC, 2) + movlps %xmm15, 4 * SIZE(CO2, LDC, 2) + movhps %xmm15, 6 * SIZE(CO2, LDC, 2) +#endif + +#ifndef LN + addq $8 * SIZE, CO1 + addq $8 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 8), AO +#ifdef LT + addq $32 * SIZE, B +#endif +#endif + +#ifdef LN + subq $8, KK + movq BORIG, B +#endif + +#ifdef LT + addq $8, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $3 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L11 + ALIGN_4 + +.L49: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + + decq 
J # j -- + jg .L01 + +.L50: + testq $2, N + je .L100 + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + salq $1 + BASE_SHIFT, %rax + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L53 + ALIGN_4 + +.L52: + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO + + decq %rax + jne .L52 + ALIGN_4 + +.L53: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L60 + ALIGN_4 + +.L54: + movsd 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + + addq $2 * SIZE, B + addq $8 * SIZE, BO + decq %rax + jne .L54 + ALIGN_4 + +.L60: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c +#ifndef RT + leaq (C, LDC, 2), C +#endif + + testq $1, M + je .L70 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (AO, %rax, SIZE), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movss 0 * SIZE(AO), %xmm8 + movss 4 * SIZE(AO), %xmm10 + + movss 0 * SIZE(BO), %xmm9 + movss 16 * SIZE(BO), %xmm11 + movss 32 * SIZE(BO), %xmm13 + movss 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L95 + ALIGN_4 + +.L92: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movss 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss 1 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movss 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movss 64 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movss 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movss 3 * SIZE(AO), %xmm8 + addps %xmm11, %xmm1 + movss 24 * SIZE(BO), %xmm11 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movss 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movss 8 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movss 80 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movss 36 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movss 5 * SIZE(AO), %xmm10 + addps %xmm13, %xmm1 + movss 40 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + 
movss 44 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movss 6 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movss 96 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movss 52 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movss 7 * SIZE(AO), %xmm10 + addps %xmm15, %xmm1 + movss 56 * SIZE(BO), %xmm15 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movss 60 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movss 12 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movss 112 * SIZE(BO), %xmm15 + + addq $ 8 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L92 + ALIGN_4 + +.L95: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L98 + ALIGN_4 + +.L96: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movss 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss 1 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L96 + ALIGN_4 + +.L98: + addss %xmm2, %xmm0 + addss %xmm3, %xmm1 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm1, %xmm0 + +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(B), %xmm1 + subps %xmm0, %xmm1 +#else + movss 0 * SIZE(AO), %xmm8 + movss 1 * SIZE(AO), %xmm10 + subss %xmm0, %xmm8 + subss %xmm1, %xmm10 +#endif + +#if defined(LN) || defined(LT) + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulss %xmm2, %xmm8 + pshufd $0x55, %xmm0, %xmm2 + mulss %xmm8, %xmm2 + subss %xmm2, %xmm10 + + pshufd $0xff, %xmm0, %xmm2 + mulss %xmm2, %xmm10 +#endif + +#ifdef RT + movaps 0 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulss %xmm2, %xmm10 + pshufd $0xaa, %xmm0, %xmm2 + mulss %xmm10, %xmm2 + subss %xmm2, %xmm8 + pshufd $0x00, %xmm0, %xmm2 + mulss %xmm2, %xmm8 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, 0 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + pshufd $0x55, %xmm1, %xmm3 + movaps %xmm2, 0 * SIZE(BO) + movaps %xmm3, 4 * SIZE(BO) +#else + movss %xmm8, 0 * SIZE(AO) + movss %xmm10, 1 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm10, %xmm1 + unpcklps %xmm11, %xmm5 + + movaps %xmm1, %xmm10 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm10 + + movss %xmm1, 0 * SIZE(CO1) + movss %xmm10, 0 * SIZE(CO1, LDC, 1) +#else + movss %xmm8, 0 * SIZE(CO1) + movss %xmm10, 0 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (AO, %rax, SIZE), AO +#ifdef LT + addq $ 2 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L70: + testq $2, M + je .L80 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * 
SIZE(AO), %xmm8 + movaps 8 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L85 + ALIGN_4 + +.L82: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movaps 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movaps 64 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd 6 * SIZE(AO), %xmm8 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movaps 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd 16 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movaps 80 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movaps 36 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd 10 * SIZE(AO), %xmm10 + addps %xmm13, %xmm1 + movaps 40 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movaps 44 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd 12 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movaps 96 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movaps 52 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd 14 * SIZE(AO), %xmm10 + addps %xmm15, %xmm1 + movaps 56 * SIZE(BO), %xmm15 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movaps 60 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd 24 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movaps 112 * SIZE(BO), %xmm15 + + addq $16 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L82 + ALIGN_4 + +.L85: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L88 + ALIGN_4 + +.L86: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L86 + ALIGN_4 + +.L88: + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $1 + BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm2, %xmm0 + unpcklps %xmm3, %xmm1 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(B), %xmm1 +#ifdef movsd + xorps %xmm5, %xmm5 +#endif + movsd 2 * SIZE(B), %xmm5 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm5 +#else +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd 0 * SIZE(AO), %xmm8 +#ifdef movsd + xorps %xmm10, %xmm10 +#endif + movsd 2 * SIZE(AO), %xmm10 + + subps %xmm0, %xmm8 + subps %xmm1, %xmm10 +#endif + +#ifdef LN + movaps 0 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm1 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 +#endif + 
+#ifdef LT + movaps 0 * SIZE(AO), %xmm6 + + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm5 + + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm5 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm10 + + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm10 +#endif + +#ifdef RT + movaps 0 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm8 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, 0 * SIZE(B) + movlps %xmm5, 2 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + pshufd $0x55, %xmm1, %xmm3 + movaps %xmm2, 0 * SIZE(BO) + movaps %xmm3, 4 * SIZE(BO) + + pshufd $0x00, %xmm5, %xmm2 + pshufd $0x55, %xmm5, %xmm3 + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) +#else + movlps %xmm8, 0 * SIZE(AO) + movlps %xmm10, 2 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm10, %xmm1 + unpcklps %xmm11, %xmm5 + + movaps %xmm1, %xmm10 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm10 + + movlps %xmm1, 0 * SIZE(CO1) + movlps %xmm10, 0 * SIZE(CO1, LDC, 1) +#else + movlps %xmm8, 0 * SIZE(CO1) + movlps %xmm10, 0 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $ 4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L80: + testq $4, M + je .L90 + +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L75 + ALIGN_4 + +.L72: + mulps %xmm8, %xmm9 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 4 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps 8 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm11 + mulps 20 * SIZE(BO), %xmm8 + addps %xmm11, %xmm0 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm8, %xmm1 + movaps 12 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm11 + mulps 28 * SIZE(BO), %xmm8 + addps %xmm11, %xmm2 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm8, %xmm3 + movaps 32 * SIZE(AO), %xmm8 + + mulps %xmm10, %xmm13 + mulps 36 * SIZE(BO), %xmm10 + addps %xmm13, %xmm0 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm10, %xmm1 + movaps 20 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm13 + mulps 44 * SIZE(BO), %xmm10 + 
addps %xmm13, %xmm2 + movaps 96 * SIZE(BO), %xmm13 + addps %xmm10, %xmm3 + movaps 24 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm15 + mulps 52 * SIZE(BO), %xmm10 + addps %xmm15, %xmm0 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm10, %xmm1 + movaps 28 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm15 + mulps 60 * SIZE(BO), %xmm10 + addps %xmm15, %xmm2 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm10, %xmm3 + movaps 48 * SIZE(AO), %xmm10 + + addq $32 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 4 * SIZE(AO), %xmm8 + + addq $4 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L76 + ALIGN_4 + +.L78: + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $1 + BASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm8 + unpcklps %xmm2, %xmm0 + unpckhps %xmm2, %xmm8 + + movaps %xmm1, %xmm14 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm14 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movaps %xmm8, %xmm3 + unpcklps %xmm14, %xmm8 + unpckhps %xmm14, %xmm3 + +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(B), %xmm1 +#ifdef movsd + xorps %xmm5, %xmm5 +#endif + movsd 2 * SIZE(B), %xmm5 +#ifdef movsd + xorps %xmm10, %xmm10 +#endif + movsd 4 * SIZE(B), %xmm10 +#ifdef movsd + xorps %xmm11, %xmm11 +#endif + movsd 6 * SIZE(B), %xmm11 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm5 + subps %xmm8, %xmm10 + subps %xmm3, %xmm11 +#else + movaps 0 * SIZE(AO), %xmm8 + movaps 4 * SIZE(AO), %xmm10 + + subps %xmm0, %xmm8 + subps %xmm1, %xmm10 +#endif + +#ifdef LN + movaps 12 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm1 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm1 + + movaps 4 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm1 + + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm6 + + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm11 + + movaps 4 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm11 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm11 + + movaps 
12 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm11 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm10 + + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm10 +#endif + +#ifdef RT + movaps 0 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm8 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, 0 * SIZE(B) + movlps %xmm5, 2 * SIZE(B) + movlps %xmm10, 4 * SIZE(B) + movlps %xmm11, 6 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + pshufd $0x55, %xmm1, %xmm3 + movaps %xmm2, 0 * SIZE(BO) + movaps %xmm3, 4 * SIZE(BO) + + pshufd $0x00, %xmm5, %xmm2 + pshufd $0x55, %xmm5, %xmm3 + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + + pshufd $0x00, %xmm10, %xmm2 + pshufd $0x55, %xmm10, %xmm3 + movaps %xmm2, 16 * SIZE(BO) + movaps %xmm3, 20 * SIZE(BO) + + pshufd $0x00, %xmm11, %xmm2 + pshufd $0x55, %xmm11, %xmm3 + movaps %xmm2, 24 * SIZE(BO) + movaps %xmm3, 28 * SIZE(BO) +#else + movaps %xmm8, 0 * SIZE(AO) + movaps %xmm10, 4 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm10, %xmm1 + unpcklps %xmm11, %xmm5 + + movaps %xmm1, %xmm10 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm10 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm10, 0 * SIZE(CO1, LDC, 1) + movhps %xmm10, 2 * SIZE(CO1, LDC, 1) +#else + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movlps %xmm10, 0 * SIZE(CO1, LDC, 1) + movhps %xmm10, 2 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $ 8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L90: + movq M, I + sarq $3, I # i = (m >> 3) + jle .L99 + ALIGN_4 + +.L61: +#ifdef LN + movq K, %rax + salq $3 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 8), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + movaps 32 * SIZE(AO), %xmm12 + movaps 48 * SIZE(AO), %xmm14 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + + PREFETCHW -8 * SIZE(CO1) + pxor %xmm4, %xmm4 + PREFETCHW -8 * SIZE(CO2) + pxor %xmm5, %xmm5 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L65 + ALIGN_4 + +.L62: + mulps %xmm8, %xmm9 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 0 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 4 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps 8 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 
* SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 12 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps 64 * SIZE(AO), %xmm8 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm10, %xmm11 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm11, %xmm0 + movaps 16 * SIZE(BO), %xmm11 + addps %xmm10, %xmm1 + movaps 20 * SIZE(AO), %xmm10 + mulps %xmm10, %xmm11 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm11, %xmm4 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm10, %xmm5 + movaps 24 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm11 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm11, %xmm0 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm10, %xmm1 + movaps 28 * SIZE(AO), %xmm10 + mulps %xmm10, %xmm11 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm11, %xmm4 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm10, %xmm5 + movaps 80 * SIZE(AO), %xmm10 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) +#endif + mulps %xmm12, %xmm13 + mulps 36 * SIZE(BO), %xmm12 + addps %xmm13, %xmm0 + movaps 32 * SIZE(BO), %xmm13 + addps %xmm12, %xmm1 + movaps 36 * SIZE(AO), %xmm12 + mulps %xmm12, %xmm13 + mulps 36 * SIZE(BO), %xmm12 + addps %xmm13, %xmm4 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm12, %xmm5 + movaps 40 * SIZE(AO), %xmm12 + + mulps %xmm12, %xmm13 + mulps 44 * SIZE(BO), %xmm12 + addps %xmm13, %xmm0 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm12, %xmm1 + movaps 44 * SIZE(AO), %xmm12 + mulps %xmm12, %xmm13 + mulps 44 * SIZE(BO), %xmm12 + addps %xmm13, %xmm4 + addps %xmm12, %xmm5 + movaps 96 * SIZE(BO), %xmm13 + movaps 96 * SIZE(AO), %xmm12 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) +#endif + mulps %xmm14, %xmm15 + mulps 52 * SIZE(BO), %xmm14 + addps %xmm15, %xmm0 + movaps 48 * SIZE(BO), %xmm15 + addps %xmm14, %xmm1 + movaps 52 * SIZE(AO), %xmm14 + mulps %xmm14, %xmm15 + mulps 52 * SIZE(BO), %xmm14 + addps %xmm15, %xmm4 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm14, %xmm5 + movaps 56 * SIZE(AO), %xmm14 + + mulps %xmm14, %xmm15 + mulps 60 * SIZE(BO), %xmm14 + addps %xmm15, %xmm0 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm14, %xmm1 + movaps 60 * SIZE(AO), %xmm14 + mulps %xmm14, %xmm15 + mulps 60 * SIZE(BO), %xmm14 + addps %xmm15, %xmm4 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm14, %xmm5 + movaps 112 * SIZE(AO), %xmm14 + + addq $64 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L62 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 0 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 4 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps 8 * SIZE(AO), %xmm8 + + addq $8 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L66 + ALIGN_4 + +.L68: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $8, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $1 + BASE_SHIFT, %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm8 + unpcklps %xmm2, %xmm0 + unpckhps %xmm2, %xmm8 + + movaps %xmm1, %xmm14 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm14 + + 
movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movaps %xmm8, %xmm3 + unpcklps %xmm14, %xmm8 + unpckhps %xmm14, %xmm3 + + movaps %xmm4, %xmm9 + unpcklps %xmm6, %xmm4 + unpckhps %xmm6, %xmm9 + + movaps %xmm5, %xmm14 + unpcklps %xmm7, %xmm5 + unpckhps %xmm7, %xmm14 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps %xmm9, %xmm7 + unpcklps %xmm14, %xmm9 + unpckhps %xmm14, %xmm7 + +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(B), %xmm1 +#ifdef movsd + xorps %xmm5, %xmm5 +#endif + movsd 2 * SIZE(B), %xmm5 +#ifdef movsd + xorps %xmm10, %xmm10 +#endif + movsd 4 * SIZE(B), %xmm10 +#ifdef movsd + xorps %xmm11, %xmm11 +#endif + movsd 6 * SIZE(B), %xmm11 +#ifdef movsd + xorps %xmm12, %xmm12 +#endif + movsd 8 * SIZE(B), %xmm12 +#ifdef movsd + xorps %xmm13, %xmm13 +#endif + movsd 10 * SIZE(B), %xmm13 +#ifdef movsd + xorps %xmm14, %xmm14 +#endif + movsd 12 * SIZE(B), %xmm14 +#ifdef movsd + xorps %xmm15, %xmm15 +#endif + movsd 14 * SIZE(B), %xmm15 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm5 + subps %xmm8, %xmm10 + subps %xmm3, %xmm11 + subps %xmm4, %xmm12 + subps %xmm6, %xmm13 + subps %xmm9, %xmm14 + subps %xmm7, %xmm15 +#else + movaps 0 * SIZE(AO), %xmm8 + movaps 4 * SIZE(AO), %xmm9 + movaps 8 * SIZE(AO), %xmm10 + movaps 12 * SIZE(AO), %xmm11 + + subps %xmm0, %xmm8 + subps %xmm4, %xmm9 + subps %xmm1, %xmm10 + subps %xmm5, %xmm11 +#endif + +#ifdef LN + movaps 60 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm15 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm14 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm13 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm12 + + movaps 56 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm1 + + movaps 52 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm14 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm13 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm12 + + movaps 48 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm1 + + movaps 44 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm13 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm12 + + movaps 40 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm1 + + movaps 36 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm12 + + movaps 32 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm1 + + movaps 24 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps 
%xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm1 + + movaps 16 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm1 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm1 + + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm6 + + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm11 + + movaps 4 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm15 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm11 + + movaps 12 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm15 + + movaps 16 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm11 + + movaps 20 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm15 + + movaps 24 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm11 + + movaps 28 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm15 + + movaps 36 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm12 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm15 + + movaps 44 * SIZE(AO), %xmm7 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm15 + + movaps 52 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm14 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm15 + + movaps 60 * SIZE(AO), %xmm7 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm8, %xmm15 +#endif + +#ifdef RN + 
movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 + mulps %xmm2, %xmm9 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm10 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm9, %xmm2 + subps %xmm2, %xmm11 + + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + mulps %xmm2, %xmm11 +#endif + +#ifdef RT + movaps 0 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + mulps %xmm2, %xmm11 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm8 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm11, %xmm2 + subps %xmm2, %xmm9 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 + mulps %xmm2, %xmm9 +#endif + +#ifdef LN + subq $8 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, 0 * SIZE(B) + movlps %xmm5, 2 * SIZE(B) + movlps %xmm10, 4 * SIZE(B) + movlps %xmm11, 6 * SIZE(B) + movlps %xmm12, 8 * SIZE(B) + movlps %xmm13, 10 * SIZE(B) + movlps %xmm14, 12 * SIZE(B) + movlps %xmm15, 14 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + pshufd $0x55, %xmm1, %xmm3 + movaps %xmm2, 0 * SIZE(BO) + movaps %xmm3, 4 * SIZE(BO) + + pshufd $0x00, %xmm5, %xmm2 + pshufd $0x55, %xmm5, %xmm3 + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + + pshufd $0x00, %xmm10, %xmm2 + pshufd $0x55, %xmm10, %xmm3 + movaps %xmm2, 16 * SIZE(BO) + movaps %xmm3, 20 * SIZE(BO) + + pshufd $0x00, %xmm11, %xmm2 + pshufd $0x55, %xmm11, %xmm3 + movaps %xmm2, 24 * SIZE(BO) + movaps %xmm3, 28 * SIZE(BO) + + pshufd $0x00, %xmm12, %xmm2 + pshufd $0x55, %xmm12, %xmm3 + movaps %xmm2, 32 * SIZE(BO) + movaps %xmm3, 36 * SIZE(BO) + + pshufd $0x00, %xmm13, %xmm2 + pshufd $0x55, %xmm13, %xmm3 + movaps %xmm2, 40 * SIZE(BO) + movaps %xmm3, 44 * SIZE(BO) + + pshufd $0x00, %xmm14, %xmm2 + pshufd $0x55, %xmm14, %xmm3 + movaps %xmm2, 48 * SIZE(BO) + movaps %xmm3, 52 * SIZE(BO) + + pshufd $0x00, %xmm15, %xmm2 + pshufd $0x55, %xmm15, %xmm3 + movaps %xmm2, 56 * SIZE(BO) + movaps %xmm3, 60 * SIZE(BO) +#else + movaps %xmm8, 0 * SIZE(AO) + movaps %xmm9, 4 * SIZE(AO) + movaps %xmm10, 8 * SIZE(AO) + movaps %xmm11, 12 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm10, %xmm1 + unpcklps %xmm11, %xmm5 + + movaps %xmm1, %xmm10 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm10 + + unpcklps %xmm14, %xmm12 + unpcklps %xmm15, %xmm13 + + movaps %xmm12, %xmm14 + unpcklps %xmm13, %xmm12 + unpckhps %xmm13, %xmm14 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm12, 4 * SIZE(CO1) + movhps %xmm12, 6 * SIZE(CO1) + + movlps %xmm10, 0 * SIZE(CO1, LDC, 1) + movhps %xmm10, 2 * SIZE(CO1, LDC, 1) + movlps %xmm14, 4 * SIZE(CO1, LDC, 1) + movhps %xmm14, 6 * SIZE(CO1, LDC, 1) +#else + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movlps %xmm9, 4 * SIZE(CO1) + movhps %xmm9, 6 * SIZE(CO1) + + movlps %xmm10, 0 * SIZE(CO1, LDC, 1) + movhps %xmm10, 2 * SIZE(CO1, LDC, 1) + movlps %xmm11, 4 * SIZE(CO1, LDC, 1) + movhps %xmm11, 6 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addq $8 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 8), AO +#ifdef LT + addq $16 * SIZE, B +#endif +#endif + +#ifdef LN + subq $8, KK + movq BORIG, B +#endif + +#ifdef LT + addq $8, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $3 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L61 + ALIGN_4 + +.L99: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, 
%rax, 2), B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L100: + testq $1, N + je .L999 + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + salq $BASE_SHIFT, %rax + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + jle .L103 + ALIGN_4 + +.L102: + movsd 0 * SIZE(B), %xmm3 + movhps 2 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm7 + movhps 6 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO + + decq %rax + jne .L102 + ALIGN_4 + +.L103: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax + BRANCH + jle .L110 + ALIGN_4 + +.L104: + movss 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + + movaps %xmm0, 0 * SIZE(BO) + + addq $ 1 * SIZE, B + addq $ 4 * SIZE, BO + decq %rax + jne .L104 + ALIGN_4 + +.L110: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + subq LDC, C +#endif + + movq C, CO1 # coffset1 = c +#ifndef RT + addq LDC, C +#endif + + testq $1, M + je .L120 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (AO, %rax, SIZE), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movss 0 * SIZE(AO), %xmm8 + movss 4 * SIZE(AO), %xmm10 + + movss 0 * SIZE(BO), %xmm9 + movss 16 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L145 + ALIGN_4 + +.L142: + mulss %xmm8, %xmm9 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movss 1 * SIZE(AO), %xmm8 + mulss 4 * SIZE(BO), %xmm8 + addss %xmm9, %xmm0 + movss 32 * SIZE(BO), %xmm9 + addss %xmm8, %xmm1 + movss 2 * SIZE(AO), %xmm8 + mulss 8 * SIZE(BO), %xmm8 + addss %xmm8, %xmm2 + movss 3 * SIZE(AO), %xmm8 + mulss 12 * SIZE(BO), %xmm8 + addss %xmm8, %xmm3 + movss 8 * SIZE(AO), %xmm8 + mulss %xmm10, %xmm11 + movss 5 * SIZE(AO), %xmm10 + mulss 20 * SIZE(BO), %xmm10 + addss %xmm11, %xmm0 + movss 48 * SIZE(BO), %xmm11 + addss %xmm10, %xmm1 + movss 6 * SIZE(AO), %xmm10 + mulss 24 * SIZE(BO), %xmm10 + addss %xmm10, %xmm2 + movss 7 * SIZE(AO), %xmm10 + mulss 28 * SIZE(BO), %xmm10 + addss %xmm10, %xmm3 + movss 12 * SIZE(AO), %xmm10 + + addq $ 8 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L142 + ALIGN_4 + +.L145: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L148 + ALIGN_4 + +.L146: + mulss 
%xmm8, %xmm9 + movss 1 * SIZE(AO), %xmm8 + addss %xmm9, %xmm0 + movss 4 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + decq %rax + jg .L146 + ALIGN_4 + +.L148: + addss %xmm1, %xmm0 + addss %xmm3, %xmm2 + addss %xmm2, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax + subq $1, %rax + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movss 0 * SIZE(B), %xmm1 + subss %xmm0, %xmm1 +#else + movss 0 * SIZE(AO), %xmm8 + subps %xmm0, %xmm8 +#endif + +#if defined(LN) || defined(LT) + mulss 0 * SIZE(AO), %xmm1 +#endif + +#if defined(RN) || defined(RT) + mulss 0 * SIZE(B), %xmm8 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + movaps %xmm2, 0 * SIZE(BO) +#else + movss %xmm8, 0 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(CO1) +#else + movss %xmm8, 0 * SIZE(CO1) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $1 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L120: + testq $2, M + je .L130 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 8 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L135 + ALIGN_4 + +.L132: + mulps %xmm8, %xmm9 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + movsd 6 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movaps 12 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + movsd 16 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movaps 32 * SIZE(BO), %xmm9 + + mulps %xmm10, %xmm11 + movsd 10 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm11 + movsd 12 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm11 + movsd 14 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movaps 28 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm11 + movsd 24 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movaps 48 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L132 + ALIGN_4 + +.L135: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L138 + ALIGN_4 + +.L136: + mulps %xmm8, %xmm9 + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq 
%rax + jg .L136 + ALIGN_4 + +.L138: + addps %xmm1, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm2, %xmm0 + unpcklps %xmm3, %xmm1 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movss 0 * SIZE(B), %xmm1 + movss 1 * SIZE(B), %xmm5 + + subss %xmm0, %xmm1 + subss %xmm2, %xmm5 +#else +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd 0 * SIZE(AO), %xmm8 + + subps %xmm0, %xmm8 +#endif + +#ifdef LN + movaps 0 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm1 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm8, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm8, %xmm1 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm5 + + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm8, %xmm5 +#endif + +#if defined(RN) || defined(RT) + movss 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(B) + movss %xmm5, 1 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + movaps %xmm2, 0 * SIZE(BO) + pshufd $0x00, %xmm5, %xmm2 + movaps %xmm2, 4 * SIZE(BO) +#else + movlps %xmm8, 0 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm10, %xmm1 + unpcklps %xmm11, %xmm5 + unpcklps %xmm5, %xmm1 + + movlps %xmm1, 0 * SIZE(CO1) +#else + movlps %xmm8, 0 * SIZE(CO1) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $2 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L130: + testq $4, M + je .L140 + +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L125 + ALIGN_4 + +.L122: + mulps %xmm8, %xmm9 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps 4 * SIZE(AO), %xmm8 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 32 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 8 * SIZE(AO), %xmm8 + mulps 8 * SIZE(BO), %xmm8 + addps %xmm8, %xmm2 + movaps 12 * SIZE(AO), %xmm8 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm8, %xmm3 + movaps 32 * SIZE(AO), %xmm8 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm10, %xmm11 + movaps 20 * SIZE(AO), %xmm10 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm11, %xmm0 + movaps 48 * SIZE(BO), %xmm11 + addps %xmm10, %xmm1 + 
movaps 24 * SIZE(AO), %xmm10 + mulps 24 * SIZE(BO), %xmm10 + addps %xmm10, %xmm2 + movaps 28 * SIZE(AO), %xmm10 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm10, %xmm3 + movaps 48 * SIZE(AO), %xmm10 + + addq $32 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L122 + ALIGN_4 + +.L125: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L128 + ALIGN_4 + +.L126: + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + + addq $4 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L126 + ALIGN_4 + +.L128: + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + addps %xmm2, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm8 + unpcklps %xmm2, %xmm0 + unpckhps %xmm2, %xmm8 + + movaps %xmm1, %xmm14 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm14 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movaps %xmm8, %xmm3 + unpcklps %xmm14, %xmm8 + unpckhps %xmm14, %xmm3 + + movss 0 * SIZE(B), %xmm1 + movss 1 * SIZE(B), %xmm5 + movss 2 * SIZE(B), %xmm10 + movss 3 * SIZE(B), %xmm11 + + subss %xmm0, %xmm1 + subss %xmm2, %xmm5 + subss %xmm8, %xmm10 + subss %xmm3, %xmm11 +#else + movaps 0 * SIZE(AO), %xmm8 + + subps %xmm0, %xmm8 +#endif + +#ifdef LN + movaps 12 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm1 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm1 + + movaps 4 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm1 + + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm8, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm6 + + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm8, %xmm1 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm11 + + movaps 4 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm11 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm11 + + movaps 12 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm8, %xmm11 +#endif + +#if defined(RN) || defined(RT) + movss 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(B) + movss %xmm5, 1 * SIZE(B) + movss %xmm10, 2 * SIZE(B) + movss %xmm11, 3 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + movaps %xmm2, 0 * SIZE(BO) + pshufd $0x00, %xmm5, %xmm2 + movaps 
%xmm2, 4 * SIZE(BO) + pshufd $0x00, %xmm10, %xmm2 + movaps %xmm2, 8 * SIZE(BO) + pshufd $0x00, %xmm11, %xmm2 + movaps %xmm2, 12 * SIZE(BO) +#else + movaps %xmm8, 0 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm10, %xmm1 + unpcklps %xmm11, %xmm5 + unpcklps %xmm5, %xmm1 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) +#else + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L140: + movq M, I + sarq $3, I # i = (m >> 3) + jle .L149 + ALIGN_4 + +.L111: +#ifdef LN + movq K, %rax + salq $3 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 8), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + movaps 32 * SIZE(AO), %xmm12 + movaps 48 * SIZE(AO), %xmm14 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + + PREFETCHW -8 * SIZE(CO1) + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L115 + ALIGN_4 + +.L112: + mulps %xmm9, %xmm8 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + + mulps 4 * SIZE(AO), %xmm9 + addps %xmm8, %xmm0 + movaps 8 * SIZE(AO), %xmm8 + addps %xmm9, %xmm4 + movaps 4 * SIZE(BO), %xmm9 + + mulps %xmm9, %xmm8 + mulps 12 * SIZE(AO), %xmm9 + addps %xmm8, %xmm0 + movaps 64 * SIZE(AO), %xmm8 + addps %xmm9, %xmm4 + movaps 8 * SIZE(BO), %xmm9 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm9, %xmm10 + mulps 20 * SIZE(AO), %xmm9 + addps %xmm10, %xmm0 + movaps 24 * SIZE(AO), %xmm10 + addps %xmm9, %xmm4 + movaps 12 * SIZE(BO), %xmm9 + + mulps %xmm9, %xmm10 + mulps 28 * SIZE(AO), %xmm9 + addps %xmm10, %xmm0 + movaps 80 * SIZE(AO), %xmm10 + addps %xmm9, %xmm4 + movaps 32 * SIZE(BO), %xmm9 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) +#endif + mulps %xmm11, %xmm12 + mulps 36 * SIZE(AO), %xmm11 + addps %xmm12, %xmm0 + movaps 40 * SIZE(AO), %xmm12 + addps %xmm11, %xmm4 + movaps 20 * SIZE(BO), %xmm11 + + mulps %xmm11, %xmm12 + mulps 44 * SIZE(AO), %xmm11 + addps %xmm12, %xmm0 + movaps 96 * SIZE(AO), %xmm12 + addps %xmm11, %xmm4 + movaps 24 * SIZE(BO), %xmm11 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) +#endif + mulps %xmm11, %xmm14 + mulps 52 * SIZE(AO), %xmm11 + addps %xmm14, %xmm0 + movaps 56 * SIZE(AO), %xmm14 + addps %xmm11, %xmm4 + movaps 28 * SIZE(BO), %xmm11 + + mulps %xmm11, %xmm14 + mulps 60 * SIZE(AO), %xmm11 + addps %xmm14, %xmm0 + movaps 112 * SIZE(AO), %xmm14 + addps %xmm11, %xmm4 + movaps 48 * SIZE(BO), %xmm11 + + addq $64 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L112 + ALIGN_4 + +.L115: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + 
movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulps %xmm9, %xmm8 + mulps 4 * SIZE(AO), %xmm9 + addps %xmm8, %xmm0 + movaps 8 * SIZE(AO), %xmm8 + addps %xmm9, %xmm4 + movaps 4 * SIZE(BO), %xmm9 + + addq $8 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L116 + ALIGN_4 + +.L118: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $8, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 8), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm8 + unpcklps %xmm2, %xmm0 + unpckhps %xmm2, %xmm8 + + movaps %xmm1, %xmm14 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm14 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movaps %xmm8, %xmm3 + unpcklps %xmm14, %xmm8 + unpckhps %xmm14, %xmm3 + + movaps %xmm4, %xmm9 + unpcklps %xmm6, %xmm4 + unpckhps %xmm6, %xmm9 + + movaps %xmm5, %xmm14 + unpcklps %xmm7, %xmm5 + unpckhps %xmm7, %xmm14 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps %xmm9, %xmm7 + unpcklps %xmm14, %xmm9 + unpckhps %xmm14, %xmm7 + + movss 0 * SIZE(B), %xmm1 + movss 1 * SIZE(B), %xmm5 + movss 2 * SIZE(B), %xmm10 + movss 3 * SIZE(B), %xmm11 + movss 4 * SIZE(B), %xmm12 + movss 5 * SIZE(B), %xmm13 + movss 6 * SIZE(B), %xmm14 + movss 7 * SIZE(B), %xmm15 + + subss %xmm0, %xmm1 + subss %xmm2, %xmm5 + subss %xmm8, %xmm10 + subss %xmm3, %xmm11 + subss %xmm4, %xmm12 + subss %xmm6, %xmm13 + subss %xmm9, %xmm14 + subss %xmm7, %xmm15 +#else + movaps 0 * SIZE(AO), %xmm8 + movaps 4 * SIZE(AO), %xmm9 + + subps %xmm0, %xmm8 + subps %xmm4, %xmm9 +#endif + +#ifdef LN + movaps 60 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm8, %xmm15 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm15, %xmm8 + subss %xmm8, %xmm14 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm15, %xmm8 + subss %xmm8, %xmm13 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm15, %xmm8 + subss %xmm8, %xmm12 + + movaps 56 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm15, %xmm8 + subss %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm15, %xmm8 + subss %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm15, %xmm8 + subss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm15, %xmm8 + subss %xmm8, %xmm1 + + movaps 52 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm8, %xmm14 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm14, %xmm8 + subss %xmm8, %xmm13 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm14, %xmm8 + subss %xmm8, %xmm12 + + movaps 48 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm14, %xmm8 + subss %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm14, %xmm8 + subss %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm14, %xmm8 + subss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm14, %xmm8 + subss %xmm8, %xmm1 + + movaps 44 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm8, %xmm13 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm13, %xmm8 + subss %xmm8, %xmm12 + + movaps 40 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm13, %xmm8 + subss %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm13, %xmm8 + subss %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm13, %xmm8 + subss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm13, %xmm8 + subss %xmm8, %xmm1 + + movaps 36 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm8, %xmm12 + + movaps 32 * SIZE(AO), %xmm6 + 
pshufd $0xff, %xmm6, %xmm8 + mulss %xmm12, %xmm8 + subss %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm12, %xmm8 + subss %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm12, %xmm8 + subss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm12, %xmm8 + subss %xmm8, %xmm1 + + movaps 24 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm1 + + movaps 16 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm1 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm1 + + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm8, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm6 + + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm8, %xmm1 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm11 + + movaps 4 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm15 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm11 + + movaps 12 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm15 + + movaps 16 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm11 + + movaps 20 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm15 + + movaps 24 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm8, %xmm11 + + movaps 28 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm15 + + movaps 36 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm8, %xmm12 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm12, %xmm8 + subss %xmm8, %xmm13 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm12, %xmm8 + subss %xmm8, %xmm14 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm12, %xmm8 + subss %xmm8, %xmm15 + + movaps 44 * SIZE(AO), %xmm7 + pshufd $0x55, %xmm7, %xmm8 + mulss %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulss %xmm13, %xmm8 + 
subss %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulss %xmm13, %xmm8 + subss %xmm8, %xmm15 + + movaps 52 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm8, %xmm14 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm14, %xmm8 + subss %xmm8, %xmm15 + + movaps 60 * SIZE(AO), %xmm7 + pshufd $0xff, %xmm7, %xmm8 + mulss %xmm8, %xmm15 +#endif + +#if defined(RN) || defined(RT) + movss 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 + mulps %xmm2, %xmm9 +#endif + +#ifdef LN + subq $8 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(B) + movss %xmm5, 1 * SIZE(B) + movss %xmm10, 2 * SIZE(B) + movss %xmm11, 3 * SIZE(B) + movss %xmm12, 4 * SIZE(B) + movss %xmm13, 5 * SIZE(B) + movss %xmm14, 6 * SIZE(B) + movss %xmm15, 7 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + movaps %xmm2, 0 * SIZE(BO) + pshufd $0x00, %xmm5, %xmm2 + movaps %xmm2, 4 * SIZE(BO) + pshufd $0x00, %xmm10, %xmm2 + movaps %xmm2, 8 * SIZE(BO) + pshufd $0x00, %xmm11, %xmm2 + movaps %xmm2, 12 * SIZE(BO) + + pshufd $0x00, %xmm12, %xmm2 + movaps %xmm2, 16 * SIZE(BO) + pshufd $0x00, %xmm13, %xmm2 + movaps %xmm2, 20 * SIZE(BO) + pshufd $0x00, %xmm14, %xmm2 + movaps %xmm2, 24 * SIZE(BO) + pshufd $0x00, %xmm15, %xmm2 + movaps %xmm2, 28 * SIZE(BO) +#else + movaps %xmm8, 0 * SIZE(AO) + movaps %xmm9, 4 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm10, %xmm1 + unpcklps %xmm11, %xmm5 + unpcklps %xmm5, %xmm1 + + unpcklps %xmm14, %xmm12 + unpcklps %xmm15, %xmm13 + unpcklps %xmm13, %xmm12 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm12, 4 * SIZE(CO1) + movhps %xmm12, 6 * SIZE(CO1) +#else + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movlps %xmm9, 4 * SIZE(CO1) + movhps %xmm9, 6 * SIZE(CO1) +#endif + +#ifndef LN + addq $8 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 8), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $8, KK + movq BORIG, B +#endif + +#ifdef LT + addq $8, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $3 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L111 + ALIGN_4 + +.L149: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 1), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 1), B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L999: + movq %rbx, %rsp + EMMS + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/trsm_kernel_LT_2x8_nehalem.S b/kernel/x86_64/trsm_kernel_LT_2x8_nehalem.S new file mode 100644 index 0000000000..b04299ab9b --- /dev/null +++ b/kernel/x86_64/trsm_kernel_LT_2x8_nehalem.S @@ -0,0 +1,3077 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define KK %rdx +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define J 56(%rsp) +#define KKK 64(%rsp) +#define AORIG 72(%rsp) + +#else + +#define STACKSIZE 512 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define J 232(%rsp) +#define KKK 240(%rsp) +#define AORIG 248(%rsp) + +#endif + +#define PREFETCHSIZE (8 * 1 - 4) +#define PREFETCH prefetcht0 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C +#endif + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + + leaq (, LDC, SIZE), LDC + + movq KK, OFFSET + 
negq KK + +#ifdef LN + leaq (, M, SIZE), %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + leaq (, N, SIZE), %rax + imulq K, %rax + addq %rax, B + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + movq N, J + sarq $3, J + NOBRANCH + jle .L30 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $3 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 8), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 4), CO2 +#ifndef RT + leaq (C, LDC, 8), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq K, %rax + salq $BASE_SHIFT + 3, %rax + leaq (B, %rax), BB + + movq M, I + sarq $1, I + NOBRANCH + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 8), BO +#else + movq B, BO +#endif + + prefetcht0 -16 * SIZE(BB) + subq $-8 * SIZE, BB + + xorps %xmm1, %xmm1 + movapd -16 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + leaq (LDC, LDC, 2), %rax + + xorps %xmm8, %xmm8 + prefetcht0 1 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht0 2 * SIZE(CO1, LDC, 1) + xorps %xmm10, %xmm10 + prefetcht0 1 * SIZE(CO1, LDC, 2) + xorps %xmm11, %xmm11 + prefetcht0 2 * SIZE(CO1, %rax, 1) + + xorps %xmm12, %xmm12 + prefetcht0 1 * SIZE(CO2) + xorps %xmm13, %xmm13 + prefetcht0 2 * SIZE(CO2, LDC, 1) + xorps %xmm14, %xmm14 + prefetcht0 1 * SIZE(CO2, LDC, 2) + xorps %xmm15, %xmm15 + prefetcht0 2 * SIZE(CO2, %rax, 1) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm1, %xmm12 + movaps -16 * SIZE(BO), %xmm6 + addpd %xmm2, %xmm13 + pshufd $0x4e, %xmm6, %xmm2 + mulpd %xmm0, %xmm6 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm14 + movaps -14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + addpd %xmm6, %xmm8 + movaps -12 * SIZE(BO), %xmm6 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm6, %xmm2 + mulpd %xmm0, %xmm6 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -10 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + addpd %xmm6, %xmm12 + movaps -8 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm13 + movaps -14 * SIZE(AO), %xmm5 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm5, %xmm1 + mulpd %xmm5, %xmm2 + + addpd %xmm3, %xmm14 + movaps -6 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm5, %xmm3 + mulpd %xmm5, %xmm4 + + addpd %xmm1, %xmm8 + movaps -4 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm5, %xmm1 + mulpd %xmm5, %xmm2 + + addpd %xmm3, %xmm10 + movaps -2 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm5, %xmm3 + mulpd %xmm5, %xmm4 + + addpd %xmm1, %xmm12 + movaps 0 * SIZE(BO), %xmm6 + addpd %xmm2, %xmm13 + pshufd $0x4e, %xmm6, %xmm2 + mulpd %xmm0, %xmm6 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm14 + movaps 2 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + addpd %xmm6, 
%xmm8 + movaps 4 * SIZE(BO), %xmm6 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm6, %xmm2 + mulpd %xmm0, %xmm6 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps 6 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + addpd %xmm6, %xmm12 + movaps 8 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm13 + movaps -10 * SIZE(AO), %xmm5 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm5, %xmm1 + mulpd %xmm5, %xmm2 + + addpd %xmm3, %xmm14 + movaps 10 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm5, %xmm3 + mulpd %xmm5, %xmm4 + + addpd %xmm1, %xmm8 + movaps 12 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm5, %xmm1 + mulpd %xmm5, %xmm2 + + addpd %xmm3, %xmm10 + movaps 14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm5, %xmm3 + mulpd %xmm5, %xmm4 + + addq $32 * SIZE, BO + subq $-8 * SIZE, AO + decq %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + addpd %xmm1, %xmm12 + movaps -16 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm13 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm14 + movaps -14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + addpd %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -10 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $8, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 8), BO +#endif + + addpd %xmm1, %xmm12 + addpd %xmm2, %xmm13 + addpd %xmm3, %xmm14 + addpd %xmm4, %xmm15 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm0 + shufpd $0, %xmm9, %xmm8 + shufpd $3, %xmm0, %xmm9 + + movaps %xmm10, %xmm0 + shufpd $0, %xmm11, %xmm10 + shufpd $3, %xmm0, %xmm11 + + movaps %xmm12, %xmm0 + shufpd $0, %xmm13, %xmm12 + shufpd $3, %xmm0, %xmm13 + + movaps %xmm14, %xmm0 + shufpd $0, %xmm15, %xmm14 + shufpd $3, %xmm0, %xmm15 + + movapd -16 * SIZE(BO), %xmm0 + movapd -14 * SIZE(BO), %xmm2 + movapd -12 * SIZE(BO), %xmm4 + movapd -10 * SIZE(BO), %xmm6 + movapd -8 * SIZE(BO), %xmm1 + movapd -6 * SIZE(BO), %xmm3 + movapd -4 * SIZE(BO), %xmm5 + movapd -2 * SIZE(BO), %xmm7 +#else + movaps %xmm8, %xmm0 + shufpd $2, %xmm9, %xmm8 + shufpd $2, %xmm0, %xmm9 + + movaps %xmm10, %xmm0 + shufpd $2, %xmm11, %xmm10 + shufpd $2, %xmm0, %xmm11 + + movaps %xmm12, %xmm0 + shufpd $2, %xmm13, %xmm12 + shufpd $2, %xmm0, %xmm13 + + movaps %xmm14, %xmm0 + shufpd $2, %xmm15, %xmm14 + shufpd $2, %xmm0, %xmm15 + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -12 * SIZE(AO), %xmm2 + movapd -10 * SIZE(AO), %xmm3 + + movapd -8 * SIZE(AO), %xmm4 + movapd -6 * SIZE(AO), %xmm5 + movapd -4 * SIZE(AO), %xmm6 + movapd -2 * SIZE(AO), %xmm7 +#endif + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm1 + subpd %xmm10, %xmm2 + subpd %xmm11, %xmm3 + subpd %xmm12, %xmm4 + subpd %xmm13, %xmm5 + subpd %xmm14, %xmm6 + subpd %xmm15, %xmm7 + 
+#ifdef LN + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 + mulpd %xmm8, %xmm5 + mulpd %xmm8, %xmm7 + + movddup -14 * SIZE(AO), %xmm12 + movapd %xmm12, %xmm13 + movapd %xmm12, %xmm14 + movapd %xmm12, %xmm15 + + mulpd %xmm1, %xmm12 + mulpd %xmm3, %xmm13 + mulpd %xmm5, %xmm14 + mulpd %xmm7, %xmm15 + + subpd %xmm12, %xmm0 + subpd %xmm13, %xmm2 + subpd %xmm14, %xmm4 + subpd %xmm15, %xmm6 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm4 + mulpd %xmm8, %xmm6 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm4 + mulpd %xmm8, %xmm6 + + movddup -15 * SIZE(AO), %xmm12 + movapd %xmm12, %xmm13 + movapd %xmm12, %xmm14 + movapd %xmm12, %xmm15 + + mulpd %xmm0, %xmm12 + mulpd %xmm2, %xmm13 + mulpd %xmm4, %xmm14 + mulpd %xmm6, %xmm15 + + subpd %xmm12, %xmm1 + subpd %xmm13, %xmm3 + subpd %xmm14, %xmm5 + subpd %xmm15, %xmm7 + + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 + mulpd %xmm8, %xmm5 + mulpd %xmm8, %xmm7 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm1 + movddup -14 * SIZE(BO), %xmm10 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm2 + movddup -13 * SIZE(BO), %xmm11 + mulpd %xmm0, %xmm11 + subpd %xmm11, %xmm3 + movddup -12 * SIZE(BO), %xmm12 + mulpd %xmm0, %xmm12 + subpd %xmm12, %xmm4 + movddup -11 * SIZE(BO), %xmm13 + mulpd %xmm0, %xmm13 + subpd %xmm13, %xmm5 + movddup -10 * SIZE(BO), %xmm14 + mulpd %xmm0, %xmm14 + subpd %xmm14, %xmm6 + movddup -9 * SIZE(BO), %xmm15 + mulpd %xmm0, %xmm15 + subpd %xmm15, %xmm7 + + movddup -7 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm1 + movddup -6 * SIZE(BO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm2 + movddup -5 * SIZE(BO), %xmm11 + mulpd %xmm1, %xmm11 + subpd %xmm11, %xmm3 + movddup -4 * SIZE(BO), %xmm12 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm4 + movddup -3 * SIZE(BO), %xmm13 + mulpd %xmm1, %xmm13 + subpd %xmm13, %xmm5 + movddup -2 * SIZE(BO), %xmm14 + mulpd %xmm1, %xmm14 + subpd %xmm14, %xmm6 + movddup -1 * SIZE(BO), %xmm15 + mulpd %xmm1, %xmm15 + subpd %xmm15, %xmm7 + + movddup 2 * SIZE(BO), %xmm10 + mulpd %xmm10, %xmm2 + movddup 3 * SIZE(BO), %xmm11 + mulpd %xmm2, %xmm11 + subpd %xmm11, %xmm3 + movddup 4 * SIZE(BO), %xmm12 + mulpd %xmm2, %xmm12 + subpd %xmm12, %xmm4 + movddup 5 * SIZE(BO), %xmm13 + mulpd %xmm2, %xmm13 + subpd %xmm13, %xmm5 + movddup 6 * SIZE(BO), %xmm14 + mulpd %xmm2, %xmm14 + subpd %xmm14, %xmm6 + movddup 7 * SIZE(BO), %xmm15 + mulpd %xmm2, %xmm15 + subpd %xmm15, %xmm7 + + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm12 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm4 + movddup 13 * SIZE(BO), %xmm13 + mulpd %xmm3, %xmm13 + subpd %xmm13, %xmm5 + movddup 14 * SIZE(BO), %xmm14 + mulpd %xmm3, %xmm14 + subpd %xmm14, %xmm6 + movddup 15 * SIZE(BO), %xmm15 + mulpd %xmm3, %xmm15 + subpd %xmm15, %xmm7 + + movddup 20 * SIZE(BO), %xmm12 + mulpd %xmm12, %xmm4 + movddup 21 * SIZE(BO), %xmm13 + mulpd %xmm4, %xmm13 + subpd %xmm13, %xmm5 + movddup 22 * SIZE(BO), %xmm14 + mulpd %xmm4, %xmm14 + subpd %xmm14, %xmm6 + movddup 23 * SIZE(BO), %xmm15 + mulpd %xmm4, %xmm15 + subpd %xmm15, %xmm7 + + movddup 29 * SIZE(BO), %xmm13 + mulpd %xmm13, %xmm5 + movddup 30 * SIZE(BO), %xmm14 + mulpd %xmm5, %xmm14 + subpd %xmm14, %xmm6 + movddup 31 * SIZE(BO), %xmm15 + mulpd %xmm5, %xmm15 + subpd %xmm15, %xmm7 + + movddup 38 * SIZE(BO), %xmm14 + mulpd %xmm14, %xmm6 + movddup 39 * 
SIZE(BO), %xmm15 + mulpd %xmm6, %xmm15 + subpd %xmm15, %xmm7 + + movddup 47 * SIZE(BO), %xmm15 + mulpd %xmm15, %xmm7 +#endif + +#ifdef RT + movddup 47 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm7 + movddup 46 * SIZE(BO), %xmm9 + mulpd %xmm7, %xmm9 + subpd %xmm9, %xmm6 + movddup 45 * SIZE(BO), %xmm10 + mulpd %xmm7, %xmm10 + subpd %xmm10, %xmm5 + movddup 44 * SIZE(BO), %xmm11 + mulpd %xmm7, %xmm11 + subpd %xmm11, %xmm4 + movddup 43 * SIZE(BO), %xmm12 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm3 + movddup 42 * SIZE(BO), %xmm13 + mulpd %xmm7, %xmm13 + subpd %xmm13, %xmm2 + movddup 41 * SIZE(BO), %xmm14 + mulpd %xmm7, %xmm14 + subpd %xmm14, %xmm1 + movddup 40 * SIZE(BO), %xmm15 + mulpd %xmm7, %xmm15 + subpd %xmm15, %xmm0 + + movddup 38 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm6 + movddup 37 * SIZE(BO), %xmm10 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm5 + movddup 36 * SIZE(BO), %xmm11 + mulpd %xmm6, %xmm11 + subpd %xmm11, %xmm4 + movddup 35 * SIZE(BO), %xmm12 + mulpd %xmm6, %xmm12 + subpd %xmm12, %xmm3 + movddup 34 * SIZE(BO), %xmm13 + mulpd %xmm6, %xmm13 + subpd %xmm13, %xmm2 + movddup 33 * SIZE(BO), %xmm14 + mulpd %xmm6, %xmm14 + subpd %xmm14, %xmm1 + movddup 32 * SIZE(BO), %xmm15 + mulpd %xmm6, %xmm15 + subpd %xmm15, %xmm0 + + movddup 29 * SIZE(BO), %xmm10 + mulpd %xmm10, %xmm5 + movddup 28 * SIZE(BO), %xmm11 + mulpd %xmm5, %xmm11 + subpd %xmm11, %xmm4 + movddup 27 * SIZE(BO), %xmm12 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm3 + movddup 26 * SIZE(BO), %xmm13 + mulpd %xmm5, %xmm13 + subpd %xmm13, %xmm2 + movddup 25 * SIZE(BO), %xmm14 + mulpd %xmm5, %xmm14 + subpd %xmm14, %xmm1 + movddup 24 * SIZE(BO), %xmm15 + mulpd %xmm5, %xmm15 + subpd %xmm15, %xmm0 + + movddup 20 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm4 + movddup 19 * SIZE(BO), %xmm12 + mulpd %xmm4, %xmm12 + subpd %xmm12, %xmm3 + movddup 18 * SIZE(BO), %xmm13 + mulpd %xmm4, %xmm13 + subpd %xmm13, %xmm2 + movddup 17 * SIZE(BO), %xmm14 + mulpd %xmm4, %xmm14 + subpd %xmm14, %xmm1 + movddup 16 * SIZE(BO), %xmm15 + mulpd %xmm4, %xmm15 + subpd %xmm15, %xmm0 + + movddup 11 * SIZE(BO), %xmm12 + mulpd %xmm12, %xmm3 + movddup 10 * SIZE(BO), %xmm13 + mulpd %xmm3, %xmm13 + subpd %xmm13, %xmm2 + movddup 9 * SIZE(BO), %xmm14 + mulpd %xmm3, %xmm14 + subpd %xmm14, %xmm1 + movddup 8 * SIZE(BO), %xmm15 + mulpd %xmm3, %xmm15 + subpd %xmm15, %xmm0 + + movddup 2 * SIZE(BO), %xmm13 + mulpd %xmm13, %xmm2 + movddup 1 * SIZE(BO), %xmm14 + mulpd %xmm2, %xmm14 + subpd %xmm14, %xmm1 + movddup 0 * SIZE(BO), %xmm15 + mulpd %xmm2, %xmm15 + subpd %xmm15, %xmm0 + + movddup -7 * SIZE(BO), %xmm14 + mulpd %xmm14, %xmm1 + movddup -8 * SIZE(BO), %xmm15 + mulpd %xmm1, %xmm15 + subpd %xmm15, %xmm0 + + movddup -16 * SIZE(BO), %xmm15 + mulpd %xmm15, %xmm0 +#endif + + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, -16 * SIZE(BO) + movapd %xmm2, -14 * SIZE(BO) + movapd %xmm4, -12 * SIZE(BO) + movapd %xmm6, -10 * SIZE(BO) + movapd %xmm1, -8 * SIZE(BO) + movapd %xmm3, -6 * SIZE(BO) + movapd %xmm5, -4 * SIZE(BO) + movapd %xmm7, -2 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm1, -14 * SIZE(AO) + movapd %xmm2, -12 * SIZE(AO) + movapd %xmm3, -10 * SIZE(AO) + movapd %xmm4, -8 * SIZE(AO) + movapd %xmm5 , -6 * SIZE(AO) + movapd %xmm6, -4 * SIZE(AO) + movapd %xmm7, -2 * SIZE(AO) +#endif + + leaq (LDC, LDC, 2), %rax + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 1 * SIZE(CO1) + movhps %xmm0, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 1 * SIZE(CO1, LDC, 1) + + movsd %xmm2, 0 * SIZE(CO1, LDC, 2) + 
movsd %xmm3, 1 * SIZE(CO1, LDC, 2) + movhps %xmm2, 0 * SIZE(CO1, %rax, 1) + movhps %xmm3, 1 * SIZE(CO1, %rax, 1) + + movsd %xmm4, 0 * SIZE(CO2) + movsd %xmm5, 1 * SIZE(CO2) + movhps %xmm4, 0 * SIZE(CO2, LDC, 1) + movhps %xmm5, 1 * SIZE(CO2, LDC, 1) + + movsd %xmm6, 0 * SIZE(CO2, LDC, 2) + movsd %xmm7, 1 * SIZE(CO2, LDC, 2) + movhps %xmm6, 0 * SIZE(CO2, %rax, 1) + movhps %xmm7, 1 * SIZE(CO2, %rax, 1) +#else + movups %xmm0, 0 * SIZE(CO1) + movups %xmm1, 0 * SIZE(CO1, LDC, 1) + movups %xmm2, 0 * SIZE(CO1, LDC, 2) + movups %xmm3, 0 * SIZE(CO1, %rax, 1) + movups %xmm4, 0 * SIZE(CO2) + movups %xmm5, 0 * SIZE(CO2, LDC, 1) + movups %xmm6, 0 * SIZE(CO2, LDC, 2) + movups %xmm7, 0 * SIZE(CO2, %rax, 1) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L11 + ALIGN_4 + +.L20: + testq $1, M + BRANCH + jle .L29 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 8), BO +#else + movq B, BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_3 + +.L22: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movaps -12 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps -10 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps -8 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -6 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movaps -4 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps -2 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -14 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps 0 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps 2 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movaps 4 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps 6 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -13 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps 8 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps 10 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movaps 12 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps 14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -12 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps 16 * SIZE(BO), %xmm1 + + subq $ -4 * SIZE, AO + subq $-32 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L22 + ALIGN_3 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_3 + +.L26: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movaps -12 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps -10 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), 
%xmm0 + addpd %xmm1, %xmm11 + movaps -8 * SIZE(BO), %xmm1 + + addq $1 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_4 + +.L28: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $8, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm0 + movapd -14 * SIZE(BO), %xmm1 + movapd -12 * SIZE(BO), %xmm2 + movapd -10 * SIZE(BO), %xmm3 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -12 * SIZE(AO), %xmm2 + movapd -10 * SIZE(AO), %xmm3 +#endif + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm1 + subpd %xmm10, %xmm2 + subpd %xmm11, %xmm3 + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 +#endif + +#if defined(RN) || defined(RT) + pshufd $0xe, %xmm3, %xmm7 + movaps %xmm3, %xmm6 + pshufd $0xe, %xmm2, %xmm5 + movaps %xmm2, %xmm4 + pshufd $0xe, %xmm1, %xmm3 + movaps %xmm1, %xmm2 + pshufd $0xe, %xmm0, %xmm1 +#endif + +#ifdef RN + movsd -16 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm0 + movsd -15 * SIZE(BO), %xmm9 + mulsd %xmm0, %xmm9 + subsd %xmm9, %xmm1 + movsd -14 * SIZE(BO), %xmm10 + mulsd %xmm0, %xmm10 + subsd %xmm10, %xmm2 + movsd -13 * SIZE(BO), %xmm11 + mulsd %xmm0, %xmm11 + subsd %xmm11, %xmm3 + movsd -12 * SIZE(BO), %xmm12 + mulsd %xmm0, %xmm12 + subsd %xmm12, %xmm4 + movsd -11 * SIZE(BO), %xmm13 + mulsd %xmm0, %xmm13 + subsd %xmm13, %xmm5 + movsd -10 * SIZE(BO), %xmm14 + mulsd %xmm0, %xmm14 + subsd %xmm14, %xmm6 + movsd -9 * SIZE(BO), %xmm15 + mulsd %xmm0, %xmm15 + subsd %xmm15, %xmm7 + + movsd -7 * SIZE(BO), %xmm9 + mulsd %xmm9, %xmm1 + movsd -6 * SIZE(BO), %xmm10 + mulsd %xmm1, %xmm10 + subsd %xmm10, %xmm2 + movsd -5 * SIZE(BO), %xmm11 + mulsd %xmm1, %xmm11 + subsd %xmm11, %xmm3 + movsd -4 * SIZE(BO), %xmm12 + mulsd %xmm1, %xmm12 + subsd %xmm12, %xmm4 + movsd -3 * SIZE(BO), %xmm13 + mulsd %xmm1, %xmm13 + subsd %xmm13, %xmm5 + movsd -2 * SIZE(BO), %xmm14 + mulsd %xmm1, %xmm14 + subsd %xmm14, %xmm6 + movsd -1 * SIZE(BO), %xmm15 + mulsd %xmm1, %xmm15 + subsd %xmm15, %xmm7 + + movsd 2 * SIZE(BO), %xmm10 + mulsd %xmm10, %xmm2 + movsd 3 * SIZE(BO), %xmm11 + mulsd %xmm2, %xmm11 + subsd %xmm11, %xmm3 + movsd 4 * SIZE(BO), %xmm12 + mulsd %xmm2, %xmm12 + subsd %xmm12, %xmm4 + movsd 5 * SIZE(BO), %xmm13 + mulsd %xmm2, %xmm13 + subsd %xmm13, %xmm5 + movsd 6 * SIZE(BO), %xmm14 + mulsd %xmm2, %xmm14 + subsd %xmm14, %xmm6 + movsd 7 * SIZE(BO), %xmm15 + mulsd %xmm2, %xmm15 + subsd %xmm15, %xmm7 + + movsd 11 * SIZE(BO), %xmm11 + mulsd %xmm11, %xmm3 + movsd 12 * SIZE(BO), %xmm12 + mulsd %xmm3, %xmm12 + subsd %xmm12, %xmm4 + movsd 13 * SIZE(BO), %xmm13 + mulsd %xmm3, %xmm13 + subsd %xmm13, %xmm5 + movsd 14 * SIZE(BO), %xmm14 + mulsd %xmm3, %xmm14 + subsd %xmm14, %xmm6 + movsd 15 * SIZE(BO), %xmm15 + mulsd %xmm3, %xmm15 + subsd %xmm15, %xmm7 + + movsd 20 * SIZE(BO), %xmm12 + mulsd %xmm12, %xmm4 + movsd 21 * SIZE(BO), %xmm13 + mulsd %xmm4, %xmm13 + subsd %xmm13, %xmm5 + movsd 22 * SIZE(BO), %xmm14 + mulsd %xmm4, %xmm14 + subsd %xmm14, %xmm6 + movsd 23 * SIZE(BO), %xmm15 + mulsd %xmm4, %xmm15 + subsd %xmm15, %xmm7 + + movsd 29 * SIZE(BO), %xmm13 + mulsd %xmm13, %xmm5 + movsd 30 * SIZE(BO), %xmm14 + mulsd %xmm5, %xmm14 + subsd %xmm14, %xmm6 + movsd 31 * SIZE(BO), %xmm15 + mulsd %xmm5, %xmm15 + subsd %xmm15, %xmm7 + + movsd 38 * SIZE(BO), %xmm14 + mulsd %xmm14, %xmm6 + movsd 39 * SIZE(BO), %xmm15 + mulsd 
%xmm6, %xmm15 + subsd %xmm15, %xmm7 + + movsd 47 * SIZE(BO), %xmm15 + mulsd %xmm15, %xmm7 +#endif + +#ifdef RT + movsd 47 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm7 + movsd 46 * SIZE(BO), %xmm9 + mulsd %xmm7, %xmm9 + subsd %xmm9, %xmm6 + movsd 45 * SIZE(BO), %xmm10 + mulsd %xmm7, %xmm10 + subsd %xmm10, %xmm5 + movsd 44 * SIZE(BO), %xmm11 + mulsd %xmm7, %xmm11 + subsd %xmm11, %xmm4 + movsd 43 * SIZE(BO), %xmm12 + mulsd %xmm7, %xmm12 + subsd %xmm12, %xmm3 + movsd 42 * SIZE(BO), %xmm13 + mulsd %xmm7, %xmm13 + subsd %xmm13, %xmm2 + movsd 41 * SIZE(BO), %xmm14 + mulsd %xmm7, %xmm14 + subsd %xmm14, %xmm1 + movsd 40 * SIZE(BO), %xmm15 + mulsd %xmm7, %xmm15 + subsd %xmm15, %xmm0 + + movsd 38 * SIZE(BO), %xmm9 + mulsd %xmm9, %xmm6 + movsd 37 * SIZE(BO), %xmm10 + mulsd %xmm6, %xmm10 + subsd %xmm10, %xmm5 + movsd 36 * SIZE(BO), %xmm11 + mulsd %xmm6, %xmm11 + subsd %xmm11, %xmm4 + movsd 35 * SIZE(BO), %xmm12 + mulsd %xmm6, %xmm12 + subsd %xmm12, %xmm3 + movsd 34 * SIZE(BO), %xmm13 + mulsd %xmm6, %xmm13 + subsd %xmm13, %xmm2 + movsd 33 * SIZE(BO), %xmm14 + mulsd %xmm6, %xmm14 + subsd %xmm14, %xmm1 + movsd 32 * SIZE(BO), %xmm15 + mulsd %xmm6, %xmm15 + subsd %xmm15, %xmm0 + + movsd 29 * SIZE(BO), %xmm10 + mulsd %xmm10, %xmm5 + movsd 28 * SIZE(BO), %xmm11 + mulsd %xmm5, %xmm11 + subsd %xmm11, %xmm4 + movsd 27 * SIZE(BO), %xmm12 + mulsd %xmm5, %xmm12 + subsd %xmm12, %xmm3 + movsd 26 * SIZE(BO), %xmm13 + mulsd %xmm5, %xmm13 + subsd %xmm13, %xmm2 + movsd 25 * SIZE(BO), %xmm14 + mulsd %xmm5, %xmm14 + subsd %xmm14, %xmm1 + movsd 24 * SIZE(BO), %xmm15 + mulsd %xmm5, %xmm15 + subsd %xmm15, %xmm0 + + movsd 20 * SIZE(BO), %xmm11 + mulsd %xmm11, %xmm4 + movsd 19 * SIZE(BO), %xmm12 + mulsd %xmm4, %xmm12 + subsd %xmm12, %xmm3 + movsd 18 * SIZE(BO), %xmm13 + mulsd %xmm4, %xmm13 + subsd %xmm13, %xmm2 + movsd 17 * SIZE(BO), %xmm14 + mulsd %xmm4, %xmm14 + subsd %xmm14, %xmm1 + movsd 16 * SIZE(BO), %xmm15 + mulsd %xmm4, %xmm15 + subsd %xmm15, %xmm0 + + movsd 11 * SIZE(BO), %xmm12 + mulsd %xmm12, %xmm3 + movsd 10 * SIZE(BO), %xmm13 + mulsd %xmm3, %xmm13 + subsd %xmm13, %xmm2 + movsd 9 * SIZE(BO), %xmm14 + mulsd %xmm3, %xmm14 + subsd %xmm14, %xmm1 + movsd 8 * SIZE(BO), %xmm15 + mulsd %xmm3, %xmm15 + subsd %xmm15, %xmm0 + + movsd 2 * SIZE(BO), %xmm13 + mulsd %xmm13, %xmm2 + movsd 1 * SIZE(BO), %xmm14 + mulsd %xmm2, %xmm14 + subsd %xmm14, %xmm1 + movsd 0 * SIZE(BO), %xmm15 + mulsd %xmm2, %xmm15 + subsd %xmm15, %xmm0 + + movsd -7 * SIZE(BO), %xmm14 + mulsd %xmm14, %xmm1 + movsd -8 * SIZE(BO), %xmm15 + mulsd %xmm1, %xmm15 + subsd %xmm15, %xmm0 + + movsd -16 * SIZE(BO), %xmm15 + mulsd %xmm15, %xmm0 +#endif + +#if defined(RN) || defined(RT) + unpcklpd %xmm1, %xmm0 + movaps %xmm2, %xmm1 + unpcklpd %xmm3, %xmm1 + movaps %xmm4, %xmm2 + unpcklpd %xmm5, %xmm2 + movaps %xmm6, %xmm3 + unpcklpd %xmm7, %xmm3 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + leaq (LDC, LDC, 2), %rax + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 0 * SIZE(CO1, LDC, 1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 2) + movhps %xmm1, 0 * SIZE(CO1, %rax, 1) + movsd %xmm2, 0 * SIZE(CO2) + movhps %xmm2, 0 * SIZE(CO2, LDC, 1) + movsd %xmm3, 0 * SIZE(CO2, LDC, 2) + movhps %xmm3, 0 * SIZE(CO2, %rax, 1) + +#if defined(LN) || defined(LT) + movapd %xmm0, -16 * SIZE(BO) + movapd %xmm1, -14 * SIZE(BO) + movapd %xmm2, -12 * SIZE(BO) + movapd %xmm3, -10 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm1, -14 * SIZE(AO) + movapd %xmm2, -12 * SIZE(AO) + movapd %xmm3, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 
+#endif + + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L29: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 8), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $8, KK +#endif + +#ifdef RT + subq $8, KK +#endif + + subq $1, J + BRANCH + jg .L01 + ALIGN_4 + +.L30: + testq $4, N + jle .L50 + ALIGN_4 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 2), CO2 +#ifndef RT + leaq (C, LDC, 4), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $1, I + NOBRANCH + jle .L40 + ALIGN_4 + +.L31: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht0 2 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht0 2 * SIZE(CO1, LDC, 1) + xorps %xmm10, %xmm10 + prefetcht0 2 * SIZE(CO2) + xorps %xmm11, %xmm11 + prefetcht0 2 * SIZE(CO2, LDC, 1) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L35 + ALIGN_3 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -10 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -12 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm8 + movaps -8 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -6 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -10 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm8 + movaps -4 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -2 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -8 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, AO + subq $-16 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L32 + ALIGN_3 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_3 + +.L36: + addpd %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 
+ mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + + addpd %xmm1, %xmm8 + addpd %xmm2, %xmm9 + addpd %xmm3, %xmm10 + addpd %xmm4, %xmm11 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm0 + shufpd $0, %xmm9, %xmm8 + shufpd $3, %xmm0, %xmm9 + + movaps %xmm10, %xmm0 + shufpd $0, %xmm11, %xmm10 + shufpd $3, %xmm0, %xmm11 + + movapd -16 * SIZE(BO), %xmm0 + movapd -14 * SIZE(BO), %xmm2 + movapd -12 * SIZE(BO), %xmm1 + movapd -10 * SIZE(BO), %xmm3 +#else + movaps %xmm8, %xmm0 + shufpd $2, %xmm9, %xmm8 + shufpd $2, %xmm0, %xmm9 + + movaps %xmm10, %xmm0 + shufpd $2, %xmm11, %xmm10 + shufpd $2, %xmm0, %xmm11 + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -12 * SIZE(AO), %xmm2 + movapd -10 * SIZE(AO), %xmm3 +#endif + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm1 + subpd %xmm10, %xmm2 + subpd %xmm11, %xmm3 + +#ifdef LN + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 + + movddup -14 * SIZE(AO), %xmm12 + movapd %xmm12, %xmm13 + + mulpd %xmm1, %xmm12 + mulpd %xmm3, %xmm13 + + subpd %xmm12, %xmm0 + subpd %xmm13, %xmm2 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm2 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm2 + + movddup -15 * SIZE(AO), %xmm12 + movapd %xmm12, %xmm13 + + mulpd %xmm0, %xmm12 + mulpd %xmm2, %xmm13 + + subpd %xmm12, %xmm1 + subpd %xmm13, %xmm3 + + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm1 + movddup -14 * SIZE(BO), %xmm10 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm2 + movddup -13 * SIZE(BO), %xmm11 + mulpd %xmm0, %xmm11 + subpd %xmm11, %xmm3 + + movddup -11 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm1 + movddup -10 * SIZE(BO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm2 + movddup -9 * SIZE(BO), %xmm11 + mulpd %xmm1, %xmm11 + subpd %xmm11, %xmm3 + + movddup -6 * SIZE(BO), %xmm10 + mulpd %xmm10, %xmm2 + movddup -5 * SIZE(BO), %xmm11 + mulpd %xmm2, %xmm11 + subpd %xmm11, %xmm3 + + movddup -1 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm3 +#endif + +#ifdef RT + movddup -1 * SIZE(BO), %xmm12 + mulpd %xmm12, %xmm3 + movddup -2 * SIZE(BO), %xmm13 + mulpd %xmm3, %xmm13 + subpd %xmm13, %xmm2 + movddup -3 * SIZE(BO), %xmm14 + mulpd %xmm3, %xmm14 + subpd %xmm14, %xmm1 + movddup -4 * SIZE(BO), %xmm15 + mulpd %xmm3, %xmm15 + subpd %xmm15, %xmm0 + + movddup -6 * SIZE(BO), %xmm13 + mulpd %xmm13, %xmm2 + movddup -7 * SIZE(BO), %xmm14 + mulpd %xmm2, %xmm14 + subpd %xmm14, %xmm1 + movddup -8 * SIZE(BO), %xmm15 + mulpd %xmm2, %xmm15 + subpd %xmm15, %xmm0 + + movddup -11 * SIZE(BO), %xmm14 + mulpd %xmm14, %xmm1 + movddup -12 * SIZE(BO), %xmm15 + mulpd %xmm1, %xmm15 + subpd %xmm15, %xmm0 + + movddup -16 * SIZE(BO), %xmm15 + mulpd %xmm15, %xmm0 +#endif + + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + + leaq (LDC, LDC, 2), %rax + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 1 * SIZE(CO1) + movhps %xmm0, 0 * 
SIZE(CO1, LDC, 1) + movhps %xmm1, 1 * SIZE(CO1, LDC, 1) + + movsd %xmm2, 0 * SIZE(CO2) + movsd %xmm3, 1 * SIZE(CO2) + movhps %xmm2, 0 * SIZE(CO2, LDC, 1) + movhps %xmm3, 1 * SIZE(CO2, LDC, 1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 1 * SIZE(CO1, LDC, 1) + + movsd %xmm2, 0 * SIZE(CO2) + movhps %xmm2, 1 * SIZE(CO2) + movsd %xmm3, 0 * SIZE(CO2, LDC, 1) + movhps %xmm3, 1 * SIZE(CO2, LDC, 1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, -16 * SIZE(BO) + movapd %xmm2, -14 * SIZE(BO) + movapd %xmm1, -12 * SIZE(BO) + movapd %xmm3, -10 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm1, -14 * SIZE(AO) + movapd %xmm2, -12 * SIZE(AO) + movapd %xmm3, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L31 + ALIGN_4 + +.L40: + testq $1, M + BRANCH + jle .L49 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L45 + ALIGN_3 + +.L42: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movaps -12 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps -10 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -14 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps -8 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -6 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -13 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movaps -4 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps -2 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -12 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps 0 * SIZE(BO), %xmm1 + + subq $ -4 * SIZE, AO + subq $-16 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L42 + ALIGN_3 + +.L45: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movaps -12 * SIZE(BO), %xmm1 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L46 + ALIGN_4 + +.L48: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#endif + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm0 + movapd -14 * SIZE(BO), %xmm1 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 +#endif + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm1 + +#if defined(LN) || 
defined(LT) + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 +#endif + +#if defined(RN) || defined(RT) + pshufd $0xe, %xmm1, %xmm3 + movaps %xmm1, %xmm2 + pshufd $0xe, %xmm0, %xmm1 +#endif + +#ifdef RN + movsd -16 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm0 + movsd -15 * SIZE(BO), %xmm9 + mulsd %xmm0, %xmm9 + subsd %xmm9, %xmm1 + movsd -14 * SIZE(BO), %xmm10 + mulsd %xmm0, %xmm10 + subsd %xmm10, %xmm2 + movsd -13 * SIZE(BO), %xmm11 + mulsd %xmm0, %xmm11 + subsd %xmm11, %xmm3 + + movsd -11 * SIZE(BO), %xmm9 + mulsd %xmm9, %xmm1 + movsd -10 * SIZE(BO), %xmm10 + mulsd %xmm1, %xmm10 + subsd %xmm10, %xmm2 + movsd -9 * SIZE(BO), %xmm11 + mulsd %xmm1, %xmm11 + subsd %xmm11, %xmm3 + + movsd -6 * SIZE(BO), %xmm10 + mulsd %xmm10, %xmm2 + movsd -5 * SIZE(BO), %xmm11 + mulsd %xmm2, %xmm11 + subsd %xmm11, %xmm3 + + movsd -1 * SIZE(BO), %xmm11 + mulsd %xmm11, %xmm3 +#endif + +#ifdef RT + movsd -1 * SIZE(BO), %xmm12 + mulsd %xmm12, %xmm3 + movsd -2 * SIZE(BO), %xmm13 + mulsd %xmm3, %xmm13 + subsd %xmm13, %xmm2 + movsd -3 * SIZE(BO), %xmm14 + mulsd %xmm3, %xmm14 + subsd %xmm14, %xmm1 + movsd -4 * SIZE(BO), %xmm15 + mulsd %xmm3, %xmm15 + subsd %xmm15, %xmm0 + + movsd -6 * SIZE(BO), %xmm13 + mulsd %xmm13, %xmm2 + movsd -7 * SIZE(BO), %xmm14 + mulsd %xmm2, %xmm14 + subsd %xmm14, %xmm1 + movsd -8 * SIZE(BO), %xmm15 + mulsd %xmm2, %xmm15 + subsd %xmm15, %xmm0 + + movsd -11 * SIZE(BO), %xmm14 + mulsd %xmm14, %xmm1 + movsd -12 * SIZE(BO), %xmm15 + mulsd %xmm1, %xmm15 + subsd %xmm15, %xmm0 + + movsd -16 * SIZE(BO), %xmm15 + mulsd %xmm15, %xmm0 +#endif + +#if defined(RN) || defined(RT) + unpcklpd %xmm1, %xmm0 + movaps %xmm2, %xmm1 + unpcklpd %xmm3, %xmm1 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 0 * SIZE(CO1, LDC, 1) + movsd %xmm1, 0 * SIZE(CO2) + movhps %xmm1, 0 * SIZE(CO2, LDC, 1) + +#if defined(LN) || defined(LT) + movapd %xmm0, -16 * SIZE(BO) + movapd %xmm1, -14 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm1, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L49: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + ALIGN_4 + +.L50: + testq $2, N + jle .L70 + ALIGN_4 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $1, I + NOBRANCH + jle .L60 + ALIGN_4 + +.L51: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + + xorps %xmm8, %xmm8 + prefetcht0 2 * SIZE(CO1) + xorps 
%xmm9, %xmm9 + prefetcht0 2 * SIZE(CO2) + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L55 + ALIGN_3 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm10 + movaps -14 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm11 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm10 + movaps -10 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm11 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, AO + subq $-8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L52 + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + ALIGN_3 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_3 + +.L56: + addpd %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L56 + ALIGN_4 + +.L58: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + + addpd %xmm1, %xmm8 + addpd %xmm2, %xmm9 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm0 + shufpd $0, %xmm9, %xmm8 + shufpd $3, %xmm0, %xmm9 + + movapd -16 * SIZE(BO), %xmm0 + movapd -14 * SIZE(BO), %xmm1 +#else + movaps %xmm8, %xmm0 + shufpd $2, %xmm9, %xmm8 + shufpd $2, %xmm0, %xmm9 + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 +#endif + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm1 + +#ifdef LN + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + movddup -14 * SIZE(AO), %xmm12 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm0 + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm0 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm0 + movddup -15 * SIZE(AO), %xmm12 + mulpd %xmm0, %xmm12 + subpd %xmm12, %xmm1 + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm10 + mulpd %xmm10, %xmm0 + movddup -15 * SIZE(BO), %xmm11 + mulpd %xmm0, %xmm11 + subpd %xmm11, %xmm1 + + movddup -13 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm1 +#endif + +#ifdef RT + movddup -13 * SIZE(BO), %xmm14 + mulpd %xmm14, %xmm1 + movddup -14 * SIZE(BO), %xmm15 + mulpd %xmm1, %xmm15 + subpd %xmm15, %xmm0 + + movddup -16 * SIZE(BO), %xmm15 + mulpd %xmm15, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 1 * SIZE(CO1) + movhps %xmm0, 0 * SIZE(CO2) + movhps %xmm1, 1 * SIZE(CO2) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO2) + movhps %xmm1, 1 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, -16 * SIZE(BO) + movapd %xmm1, -14 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm1, -14 * SIZE(AO) +#endif + 
+#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L51 + ALIGN_4 + +.L60: + testq $1, M + BRANCH + jle .L69 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + xorps %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L65 + ALIGN_3 + +.L62: + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + movddup -14 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movaps -12 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + movddup -13 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm8 + movaps -10 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + movddup -12 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movaps -8 * SIZE(BO), %xmm1 + + subq $-4 * SIZE, AO + subq $-8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L62 + ALIGN_3 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_3 + +.L66: + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + + addq $1 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L66 + ALIGN_4 + +.L68: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + + addpd %xmm9, %xmm8 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm0 +#else + movapd -16 * SIZE(AO), %xmm0 +#endif + + subpd %xmm8, %xmm0 + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm0 +#endif + +#if defined(RN) || defined(RT) + pshufd $0xe, %xmm0, %xmm1 +#endif + +#ifdef RN + movsd -16 * SIZE(BO), %xmm10 + mulsd %xmm10, %xmm0 + movsd -15 * SIZE(BO), %xmm11 + mulsd %xmm0, %xmm11 + subsd %xmm11, %xmm1 + + movsd -13 * SIZE(BO), %xmm11 + mulsd %xmm11, %xmm1 +#endif + +#ifdef RT + movsd -13 * SIZE(BO), %xmm14 + mulsd %xmm14, %xmm1 + movsd -14 * SIZE(BO), %xmm15 + mulsd %xmm1, %xmm15 + subsd %xmm15, %xmm0 + + movsd -16 * SIZE(BO), %xmm15 + mulsd %xmm15, %xmm0 +#endif + +#if defined(RN) || defined(RT) + unpcklpd %xmm1, %xmm0 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 0 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movapd %xmm0, -16 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L69: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), 
B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L70: + testq $1, N + jle .L999 + ALIGN_4 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + + movq C, CO1 +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $1, I + NOBRANCH + jle .L80 + ALIGN_4 + +.L71: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + + xorps %xmm8, %xmm8 + prefetcht0 2 * SIZE(CO1) + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L75 + ALIGN_3 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm1, %xmm8 + movddup -16 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm9 + movddup -15 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movaps -12 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm8 + movddup -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movaps -10 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm9 + movddup -13 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movaps -8 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, AO + subq $-4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L72 + + addpd %xmm9, %xmm8 + ALIGN_3 + +.L75: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_3 + +.L76: + addpd %xmm1, %xmm8 + movddup -16 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L76 + ALIGN_4 + +.L78: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + + addpd %xmm1, %xmm8 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm0 +#else + movapd -16 * SIZE(AO), %xmm0 +#endif + + subpd %xmm8, %xmm0 + +#if defined(LN) || defined(LT) + pshufd $0xe, %xmm0, %xmm1 +#endif + +#ifdef LN + movsd -13 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm1 + movsd -14 * SIZE(AO), %xmm12 + mulsd %xmm1, %xmm12 + subsd %xmm12, %xmm0 + movsd -16 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm0 +#endif + +#ifdef LT + movsd -16 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm0 + movsd -15 * SIZE(AO), %xmm12 + mulsd %xmm0, %xmm12 + subsd %xmm12, %xmm1 + movsd -13 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm1 +#endif + +#if defined(LN) || defined(LT) + unpcklpd %xmm1, %xmm0 +#endif + +#if defined(RN) || defined(RT) + movddup -16 * SIZE(BO), %xmm10 + mulpd %xmm10, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm0, -16 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, 
%rax, 1), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L71 + ALIGN_4 + +.L80: + testq $1, M + BRANCH + jle .L89 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movsd -16 * SIZE(AO), %xmm0 + movhps -15 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movsd -16 * SIZE(BO), %xmm1 + movhps -15 * SIZE(BO), %xmm1 + xorps %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L85 + ALIGN_3 + +.L82: + mulpd %xmm0, %xmm1 + movsd -14 * SIZE(AO), %xmm0 + movhps -13 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm8 + movsd -14 * SIZE(BO), %xmm1 + movhps -13 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + movsd -12 * SIZE(AO), %xmm0 + movhps -11 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movsd -12 * SIZE(BO), %xmm1 + movhps -11 * SIZE(BO), %xmm1 + + subq $-4 * SIZE, AO + subq $-4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L82 + + addpd %xmm9, %xmm8 + ALIGN_3 + +.L85: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L88 + ALIGN_3 + +.L86: + mulsd %xmm0, %xmm1 + movsd -15 * SIZE(AO), %xmm0 + addsd %xmm1, %xmm8 + movsd -15 * SIZE(BO), %xmm1 + + addq $1 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L86 + ALIGN_4 + +.L88: +#if defined(LN) || defined(RT) + movq KK, %rax + subq $1, %rax + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + + haddpd %xmm8, %xmm8 + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(BO), %xmm0 +#else + movsd -16 * SIZE(AO), %xmm0 +#endif + + subsd %xmm8, %xmm0 + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm0 +#endif + +#if defined(RN) || defined(RT) + movsd -16 * SIZE(BO), %xmm10 + mulsd %xmm10, %xmm0 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movsd %xmm0, -16 * SIZE(BO) +#else + movsd %xmm0, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L89: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 1), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/trsm_kernel_LT_4x2_atom.S b/kernel/x86_64/trsm_kernel_LT_4x2_atom.S new file mode 
100644 index 0000000000..c6ad0a2ccf --- /dev/null +++ b/kernel/x86_64/trsm_kernel_LT_4x2_atom.S @@ -0,0 +1,2116 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %rdi +#define N %rsi +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %r13 +#define BO %r14 +#define CO1 %r15 +#define CO2 %rbx +#define KK %rbp +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define J 56(%rsp) +#define KKK 64(%rsp) +#define AORIG 72(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define J 232(%rsp) +#define KKK 240(%rsp) +#define AORIG 248(%rsp) + +#endif + +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 8 + 3) + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, M + movq ARG2, N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C +#endif + + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + movq KK, OFFSET + + leaq (, LDC, SIZE), LDC + +#ifdef LN + leaq (, M, SIZE), %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + leaq (, N, SIZE), %rax + imulq K, %rax + addq %rax, B + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + movq N, J + sarq $1, J + jle .L40 + ALIGN_4 + +.L10: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + movq K, %rax + salq $BASE_SHIFT + 1, %rax + leaq (B, %rax), BB + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + prefetcht0 0 * SIZE(BB) + subq $-8 * SIZE, BB + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd 1 * SIZE(AO), %xmm4 + xorps %xmm5, %xmm5 + movsd 2 * SIZE(AO), %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movsd 0 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + movsd 1 * SIZE(BO), %xmm3 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + + prefetcht0 3 * SIZE(CO1) + xorps %xmm12, %xmm12 + xorps %xmm13, %xmm13 + prefetcht0 3 * SIZE(CO2) + xorps %xmm14, %xmm14 + xorps %xmm15, %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L15 + ALIGN_4 + +.L12: + addsd %xmm2, %xmm13 + PREFETCH (PREFETCHSIZE + 0) * 
SIZE(AO) + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm7, %xmm14 + movsd 3 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm15 + PREFETCH ((PREFETCHSIZE) >> 1 + 0) * SIZE(BO) + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + addsd %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm4, %xmm10 + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 2 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm12 + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm13 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm7, %xmm14 + movsd 7 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + addsd %xmm0, %xmm8 + movsd 8 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm4, %xmm10 + movsd 9 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 4 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm12 + movsd 10 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 5 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm13 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm7, %xmm14 + movsd 11 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + addsd %xmm0, %xmm8 + movsd 12 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm4, %xmm10 + movsd 13 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 6 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm12 + movsd 14 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 7 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm13 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm7, %xmm14 + movsd 15 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + subq $-16 * SIZE, AO + + addsd %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + addsd %xmm0, %xmm8 + movsd 0 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + addq $ 8 * SIZE, BO + + addsd %xmm4, %xmm10 + movsd 1 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + decq %rax + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 0 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm12 + movsd 2 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 1 * SIZE(BO), %xmm3 + + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + je .L19 + ALIGN_4 + +.L16: + addsd %xmm2, %xmm13 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm7, %xmm14 + movsd 3 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + addsd %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm4, %xmm10 + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 2 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm12 + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + decq %rax + BRANCH + jg .L16 + ALIGN_4 + +.L19: + addsd %xmm2, %xmm13 + addsd %xmm7, %xmm14 + addsd %xmm6, %xmm15 + +#if 
defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BO), %xmm0 + movsd 1 * SIZE(BO), %xmm1 + movsd 2 * SIZE(BO), %xmm2 + movsd 3 * SIZE(BO), %xmm3 + movsd 4 * SIZE(BO), %xmm4 + movsd 5 * SIZE(BO), %xmm5 + movsd 6 * SIZE(BO), %xmm6 + movsd 7 * SIZE(BO), %xmm7 + + subsd %xmm8, %xmm0 + subsd %xmm9, %xmm1 + subsd %xmm10, %xmm2 + subsd %xmm11, %xmm3 + subsd %xmm12, %xmm4 + subsd %xmm13, %xmm5 + subsd %xmm14, %xmm6 + subsd %xmm15, %xmm7 +#else + movsd 0 * SIZE(AO), %xmm0 + movsd 1 * SIZE(AO), %xmm2 + movsd 2 * SIZE(AO), %xmm4 + movsd 3 * SIZE(AO), %xmm6 + + movsd 4 * SIZE(AO), %xmm1 + movsd 5 * SIZE(AO), %xmm3 + movsd 6 * SIZE(AO), %xmm5 + movsd 7 * SIZE(AO), %xmm7 + + subsd %xmm8, %xmm0 + subsd %xmm10, %xmm2 + subsd %xmm12, %xmm4 + subsd %xmm14, %xmm6 + subsd %xmm9, %xmm1 + subsd %xmm11, %xmm3 + subsd %xmm13, %xmm5 + subsd %xmm15, %xmm7 +#endif + +#ifdef LN + movsd 15 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm6 + movsd 14 * SIZE(AO), %xmm9 + mulsd %xmm8, %xmm7 + movsd 13 * SIZE(AO), %xmm11 + + movaps %xmm9, %xmm10 + movsd 12 * SIZE(AO), %xmm13 + mulsd %xmm6, %xmm9 + movsd 10 * SIZE(AO), %xmm8 + mulsd %xmm7, %xmm10 + subsd %xmm9, %xmm4 + movsd 9 * SIZE(AO), %xmm9 + subsd %xmm10, %xmm5 + + movaps %xmm11, %xmm12 + mulsd %xmm6, %xmm11 + mulsd %xmm7, %xmm12 + subsd %xmm11, %xmm2 + movsd 8 * SIZE(AO), %xmm11 + subsd %xmm12, %xmm3 + + movaps %xmm13, %xmm14 + mulsd %xmm6, %xmm13 + mulsd %xmm7, %xmm14 + subsd %xmm13, %xmm0 + subsd %xmm14, %xmm1 + + mulsd %xmm8, %xmm4 + mulsd %xmm8, %xmm5 + movsd 5 * SIZE(AO), %xmm8 + + movaps %xmm9, %xmm10 + mulsd %xmm4, %xmm9 + mulsd %xmm5, %xmm10 + subsd %xmm9, %xmm2 + movsd 4 * SIZE(AO), %xmm9 + subsd %xmm10, %xmm3 + + movaps %xmm11, %xmm12 + mulsd %xmm4, %xmm11 + mulsd %xmm5, %xmm12 + subsd %xmm11, %xmm0 + movsd 0 * SIZE(AO), %xmm11 + subsd %xmm12, %xmm1 + + mulsd %xmm8, %xmm2 + mulsd %xmm8, %xmm3 + + movaps %xmm9, %xmm10 + mulsd %xmm2, %xmm9 + mulsd %xmm3, %xmm10 + subsd %xmm9, %xmm0 + subsd %xmm10, %xmm1 + + mulsd %xmm11, %xmm0 + mulsd %xmm11, %xmm1 +#endif + +#ifdef LT + movsd 0 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm0 + movsd 1 * SIZE(AO), %xmm9 + mulsd %xmm8, %xmm1 + + movsd 2 * SIZE(AO), %xmm11 + movaps %xmm9, %xmm10 + movsd 3 * SIZE(AO), %xmm13 + mulsd %xmm0, %xmm9 + movsd 5 * SIZE(AO), %xmm8 + mulsd %xmm1, %xmm10 + subsd %xmm9, %xmm2 + movsd 6 * SIZE(AO), %xmm9 + subsd %xmm10, %xmm3 + + movaps %xmm11, %xmm12 + mulsd %xmm0, %xmm11 + mulsd %xmm1, %xmm12 + subsd %xmm11, %xmm4 + movsd 7 * SIZE(AO), %xmm11 + subsd %xmm12, %xmm5 + + movaps %xmm13, %xmm14 + mulsd %xmm0, %xmm13 + mulsd %xmm1, %xmm14 + subsd %xmm13, %xmm6 + subsd %xmm14, %xmm7 + + mulsd %xmm8, %xmm2 + mulsd %xmm8, %xmm3 + movsd 10 * SIZE(AO), %xmm8 + + movaps %xmm9, %xmm10 + mulsd %xmm2, %xmm9 + mulsd %xmm3, %xmm10 + subsd %xmm9, %xmm4 + movsd 11 * SIZE(AO), %xmm9 + subsd %xmm10, %xmm5 + + movaps %xmm11, %xmm12 + mulsd %xmm2, %xmm11 + mulsd %xmm3, %xmm12 + subsd %xmm11, %xmm6 + subsd %xmm12, %xmm7 + + mulsd %xmm8, %xmm4 + mulsd %xmm8, %xmm5 + movsd 15 * SIZE(AO), %xmm8 + + movaps %xmm9, %xmm10 + mulsd %xmm4, %xmm9 + mulsd %xmm5, %xmm10 + subsd %xmm9, %xmm6 + subsd %xmm10, %xmm7 + + mulsd %xmm8, %xmm6 + mulsd %xmm8, %xmm7 +#endif + +#ifdef RN + movsd 0 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm0 + movsd 1 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm2 + movsd 3 * SIZE(BO), %xmm13 + mulsd %xmm8, %xmm4 + mulsd %xmm8, 
%xmm6 + + movaps %xmm9, %xmm10 + movaps %xmm9, %xmm11 + movaps %xmm9, %xmm12 + + mulsd %xmm0, %xmm9 + mulsd %xmm2, %xmm10 + mulsd %xmm4, %xmm11 + mulsd %xmm6, %xmm12 + + subsd %xmm9, %xmm1 + subsd %xmm10, %xmm3 + subsd %xmm11, %xmm5 + subsd %xmm12, %xmm7 + + mulsd %xmm13, %xmm1 + mulsd %xmm13, %xmm3 + mulsd %xmm13, %xmm5 + mulsd %xmm13, %xmm7 +#endif + +#ifdef RT + movsd 3 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm1 + movsd 2 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm3 + movsd 0 * SIZE(BO), %xmm13 + mulsd %xmm8, %xmm5 + mulsd %xmm8, %xmm7 + + movaps %xmm9, %xmm10 + movaps %xmm9, %xmm11 + movaps %xmm9, %xmm12 + + mulsd %xmm1, %xmm9 + mulsd %xmm3, %xmm10 + mulsd %xmm5, %xmm11 + mulsd %xmm7, %xmm12 + + subsd %xmm9, %xmm0 + subsd %xmm10, %xmm2 + subsd %xmm11, %xmm4 + subsd %xmm12, %xmm6 + + mulsd %xmm13, %xmm0 + mulsd %xmm13, %xmm2 + mulsd %xmm13, %xmm4 + mulsd %xmm13, %xmm6 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm2, 1 * SIZE(CO1) + movsd %xmm4, 2 * SIZE(CO1) + movsd %xmm6, 3 * SIZE(CO1) + + movsd %xmm1, 0 * SIZE(CO2) + movsd %xmm3, 1 * SIZE(CO2) + movsd %xmm5, 2 * SIZE(CO2) + movsd %xmm7, 3 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm1, 1 * SIZE(BO) + movsd %xmm2, 2 * SIZE(BO) + movsd %xmm3, 3 * SIZE(BO) + movsd %xmm4, 4 * SIZE(BO) + movsd %xmm5, 5 * SIZE(BO) + movsd %xmm6, 6 * SIZE(BO) + movsd %xmm7, 7 * SIZE(BO) +#else + movsd %xmm0, 0 * SIZE(AO) + movsd %xmm2, 1 * SIZE(AO) + movsd %xmm4, 2 * SIZE(AO) + movsd %xmm6, 3 * SIZE(AO) + movsd %xmm1, 4 * SIZE(AO) + movsd %xmm3, 5 * SIZE(AO) + movsd %xmm5, 6 * SIZE(AO) + movsd %xmm7, 7 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L11 + ALIGN_4 + +.L20: + testq $2, M + BRANCH + je .L30 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd 1 * SIZE(AO), %xmm4 + xorps %xmm5, %xmm5 + movsd 2 * SIZE(AO), %xmm5 + xorps %xmm6, %xmm6 + movsd 3 * SIZE(AO), %xmm7 + + movsd 0 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + movsd 1 * SIZE(BO), %xmm3 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addsd %xmm2, %xmm9 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm6, %xmm11 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + movsd 2 * SIZE(BO), %xmm1 + + addsd %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm2 + + addsd %xmm4, %xmm10 + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 4 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm8 + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm2 + + addsd %xmm7, %xmm10 + movsd 7 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm6 + movsd 5 * SIZE(BO), 
%xmm3 + + addsd %xmm2, %xmm9 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm6, %xmm11 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + movsd 6 * SIZE(BO), %xmm1 + + addsd %xmm0, %xmm8 + movsd 8 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm2 + + addsd %xmm4, %xmm10 + movsd 9 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm6 + movsd 7 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 8 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm8 + movsd 10 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm2 + + addsd %xmm7, %xmm10 + movsd 11 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm6 + movsd 9 * SIZE(BO), %xmm3 + + addq $8 * SIZE, AO + addq $8 * SIZE, BO + + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + je .L29 + ALIGN_4 + +.L26: + addsd %xmm2, %xmm9 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm6, %xmm11 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + movsd 2 * SIZE(BO), %xmm1 + + mulsd %xmm3, %xmm2 + addsd %xmm0, %xmm8 + movsd 2 * SIZE(AO), %xmm0 + + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + addsd %xmm4, %xmm10 + movsd 3 * SIZE(AO), %xmm4 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + decq %rax + BRANCH + jg .L26 + ALIGN_4 + +.L29: + addsd %xmm2, %xmm9 + addsd %xmm6, %xmm11 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BO), %xmm0 + movsd 1 * SIZE(BO), %xmm1 + movsd 2 * SIZE(BO), %xmm2 + movsd 3 * SIZE(BO), %xmm3 + + subsd %xmm8, %xmm0 + subsd %xmm9, %xmm1 + subsd %xmm10, %xmm2 + subsd %xmm11, %xmm3 +#else + movsd 0 * SIZE(AO), %xmm0 + movsd 1 * SIZE(AO), %xmm2 + movsd 2 * SIZE(AO), %xmm1 + movsd 3 * SIZE(AO), %xmm3 + + subsd %xmm8, %xmm0 + subsd %xmm10, %xmm2 + subsd %xmm9, %xmm1 + subsd %xmm11, %xmm3 +#endif + +#ifdef LN + movsd 3 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm2 + movsd 2 * SIZE(AO), %xmm9 + mulsd %xmm8, %xmm3 + movsd 0 * SIZE(AO), %xmm13 + + movaps %xmm9, %xmm10 + mulsd %xmm2, %xmm9 + mulsd %xmm3, %xmm10 + + subsd %xmm9, %xmm0 + subsd %xmm10, %xmm1 + + mulsd %xmm13, %xmm0 + mulsd %xmm13, %xmm1 +#endif + +#ifdef LT + movsd 0 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm0 + movsd 1 * SIZE(AO), %xmm9 + mulsd %xmm8, %xmm1 + movsd 3 * SIZE(AO), %xmm13 + + movaps %xmm9, %xmm10 + mulsd %xmm0, %xmm9 + mulsd %xmm1, %xmm10 + + subsd %xmm9, %xmm2 + subsd %xmm10, %xmm3 + + mulsd %xmm13, %xmm2 + mulsd %xmm13, %xmm3 +#endif + +#ifdef RN + movsd 0 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm0 + movsd 1 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm2 + movsd 3 * SIZE(BO), %xmm13 + + movaps %xmm9, %xmm10 + mulsd %xmm0, %xmm9 + mulsd %xmm2, %xmm10 + + subsd %xmm9, %xmm1 + subsd %xmm10, %xmm3 + + mulsd %xmm13, %xmm1 + mulsd %xmm13, %xmm3 +#endif + +#ifdef RT + movsd 3 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm1 + movsd 2 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm3 + movsd 0 * SIZE(BO), %xmm13 + + movaps %xmm9, %xmm10 + mulsd %xmm1, %xmm9 + mulsd %xmm3, %xmm10 + + subsd %xmm9, %xmm0 + subsd %xmm10, %xmm2 + + mulsd %xmm13, %xmm0 + mulsd %xmm13, %xmm2 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm2, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO2) + movsd %xmm3, 1 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm1, 1 * SIZE(BO) + movsd 
%xmm2, 2 * SIZE(BO) + movsd %xmm3, 3 * SIZE(BO) +#else + movsd %xmm0, 0 * SIZE(AO) + movsd %xmm2, 1 * SIZE(AO) + movsd %xmm1, 2 * SIZE(AO) + movsd %xmm3, 3 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + testq $1, M + je .L39 + +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm7, %xmm7 + movsd 1 * SIZE(AO), %xmm2 + xorps %xmm5, %xmm5 + + movsd 0 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + movsd 1 * SIZE(BO), %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L35 + ALIGN_4 + +.L32: + addsd %xmm5, %xmm8 + movsd 2 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addsd %xmm7, %xmm9 + movsd 3 * SIZE(BO), %xmm7 + mulsd %xmm0, %xmm3 + movsd 2 * SIZE(AO), %xmm0 + + addsd %xmm1, %xmm8 + movsd 4 * SIZE(BO), %xmm1 + mulsd %xmm2, %xmm5 + + addsd %xmm3, %xmm9 + movsd 5 * SIZE(BO), %xmm3 + mulsd %xmm2, %xmm7 + movsd 3 * SIZE(AO), %xmm2 + + addsd %xmm5, %xmm8 + movsd 6 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm1 + + addsd %xmm7, %xmm9 + movsd 7 * SIZE(BO), %xmm7 + mulsd %xmm0, %xmm3 + movsd 4 * SIZE(AO), %xmm0 + + addsd %xmm1, %xmm8 + movsd 8 * SIZE(BO), %xmm1 + mulsd %xmm2, %xmm5 + + addsd %xmm3, %xmm9 + movsd 9 * SIZE(BO), %xmm3 + mulsd %xmm2, %xmm7 + movsd 5 * SIZE(AO), %xmm2 + + addq $4 * SIZE, AO + addq $8 * SIZE, BO + + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + addsd %xmm5, %xmm8 + addsd %xmm7, %xmm9 + + andq $3, %rax + BRANCH + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulsd %xmm0, %xmm1 + addq $2 * SIZE, BO + mulsd %xmm0, %xmm3 + movsd 1 * SIZE(AO), %xmm0 + + addsd %xmm1, %xmm8 + movsd 0 * SIZE(BO), %xmm1 + addsd %xmm3, %xmm9 + movsd 1 * SIZE(BO), %xmm3 + + addq $1 * SIZE, AO + decq %rax + BRANCH + jg .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BO), %xmm0 + movsd 1 * SIZE(BO), %xmm1 + + subsd %xmm8, %xmm0 + subsd %xmm9, %xmm1 +#else + movsd 0 * SIZE(AO), %xmm0 + movsd 1 * SIZE(AO), %xmm1 + + subsd %xmm8, %xmm0 + subsd %xmm9, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm0 + mulsd %xmm8, %xmm1 +#endif + +#ifdef RN + movsd 0 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm0 + movsd 1 * SIZE(BO), %xmm9 + mulsd %xmm0, %xmm9 + movsd 3 * SIZE(BO), %xmm13 + subsd %xmm9, %xmm1 + mulsd %xmm13, %xmm1 +#endif + +#ifdef RT + movsd 3 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm1 + movsd 2 * SIZE(BO), %xmm9 + mulsd %xmm1, %xmm9 + movsd 0 * SIZE(BO), %xmm13 + subsd %xmm9, %xmm0 + mulsd %xmm13, %xmm0 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO2) + +#if 
defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm1, 1 * SIZE(BO) +#else + movsd %xmm0, 0 * SIZE(AO) + movsd %xmm1, 1 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L39: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + + decq J # j -- + jg .L10 + ALIGN_4 + +.L40: + testq $1, N + je .L999 + ALIGN_4 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + + movq C, CO1 +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I + jle .L50 + ALIGN_4 + +.L41: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm9, %xmm9 + movsd 1 * SIZE(AO), %xmm1 + xorps %xmm11, %xmm11 + movsd 2 * SIZE(AO), %xmm2 + xorps %xmm13, %xmm13 + movsd 3 * SIZE(AO), %xmm3 + xorps %xmm15, %xmm15 + + movsd 0 * SIZE(BO), %xmm4 + xorps %xmm8, %xmm8 + movsd 1 * SIZE(BO), %xmm5 + xorps %xmm10, %xmm10 + prefetcht0 3 * SIZE(CO1) + xorps %xmm12, %xmm12 + xorps %xmm14, %xmm14 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L45 + ALIGN_4 + +.L42: + addsd %xmm9, %xmm8 + movsd 4 * SIZE(AO), %xmm9 + mulsd %xmm4, %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addsd %xmm11, %xmm10 + movsd 5 * SIZE(AO), %xmm11 + mulsd %xmm4, %xmm1 + + addsd %xmm13, %xmm12 + movsd 6 * SIZE(AO), %xmm13 + mulsd %xmm4, %xmm2 + + addsd %xmm15, %xmm14 + movsd 7 * SIZE(AO), %xmm15 + mulsd %xmm4, %xmm3 + movsd 2 * SIZE(BO), %xmm4 + + addsd %xmm0, %xmm8 + movsd 8 * SIZE(AO), %xmm0 + mulsd %xmm5, %xmm9 + + addsd %xmm1, %xmm10 + movsd 9 * SIZE(AO), %xmm1 + mulsd %xmm5, %xmm11 + + addsd %xmm2, %xmm12 + movsd 10 * SIZE(AO), %xmm2 + mulsd %xmm5, %xmm13 + + addsd %xmm3, %xmm14 + movsd 11 * SIZE(AO), %xmm3 + mulsd %xmm5, %xmm15 + movsd 3 * SIZE(BO), %xmm5 + + addsd %xmm9, %xmm8 + movsd 12 * SIZE(AO), %xmm9 + mulsd %xmm4, %xmm0 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + + addsd %xmm11, %xmm10 + movsd 13 * SIZE(AO), %xmm11 + mulsd %xmm4, %xmm1 + + addsd %xmm13, %xmm12 + movsd 14 * SIZE(AO), %xmm13 + mulsd %xmm4, %xmm2 + + addsd %xmm15, %xmm14 + movsd 15 * SIZE(AO), %xmm15 + mulsd %xmm4, %xmm3 + movsd 4 * SIZE(BO), %xmm4 + subq $-16 * SIZE, AO + + addsd %xmm0, %xmm8 + movsd 0 * SIZE(AO), %xmm0 + mulsd %xmm5, %xmm9 + + addsd %xmm1, %xmm10 + movsd 1 * SIZE(AO), %xmm1 + mulsd %xmm5, %xmm11 + addq $ 4 * SIZE, BO + + addsd %xmm2, %xmm12 + movsd 2 * SIZE(AO), %xmm2 + mulsd %xmm5, %xmm13 + decq %rax + + addsd %xmm3, %xmm14 + movsd 3 * SIZE(AO), %xmm3 + mulsd %xmm5, %xmm15 + movsd 1 * SIZE(BO), %xmm5 + + jne .L42 + ALIGN_4 + +.L45: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax 
+#endif + + addsd %xmm9, %xmm8 + addsd %xmm11, %xmm10 + addsd %xmm13, %xmm12 + addsd %xmm15, %xmm14 + + andq $3, %rax + BRANCH + BRANCH + je .L49 + ALIGN_4 + +.L46: + mulsd %xmm4, %xmm0 + mulsd %xmm4, %xmm1 + mulsd %xmm4, %xmm2 + mulsd %xmm4, %xmm3 + movsd 1 * SIZE(BO), %xmm4 + + addsd %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + addsd %xmm1, %xmm10 + movsd 5 * SIZE(AO), %xmm1 + addsd %xmm2, %xmm12 + movsd 6 * SIZE(AO), %xmm2 + addsd %xmm3, %xmm14 + movsd 7 * SIZE(AO), %xmm3 + + addq $4 * SIZE, AO + addq $1 * SIZE, BO + decq %rax + BRANCH + jg .L46 + ALIGN_4 + +.L49: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $1, %rax +#endif + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BO), %xmm0 + movsd 1 * SIZE(BO), %xmm2 + movsd 2 * SIZE(BO), %xmm4 + movsd 3 * SIZE(BO), %xmm6 + + subsd %xmm8, %xmm0 + subsd %xmm10, %xmm2 + subsd %xmm12, %xmm4 + subsd %xmm14, %xmm6 +#else + movsd 0 * SIZE(AO), %xmm0 + movsd 1 * SIZE(AO), %xmm2 + movsd 2 * SIZE(AO), %xmm4 + movsd 3 * SIZE(AO), %xmm6 + + subsd %xmm8, %xmm0 + subsd %xmm10, %xmm2 + subsd %xmm12, %xmm4 + subsd %xmm14, %xmm6 +#endif + +#ifdef LN + movsd 15 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm6 + movsd 14 * SIZE(AO), %xmm9 + mulsd %xmm6, %xmm9 + movsd 13 * SIZE(AO), %xmm11 + subsd %xmm9, %xmm4 + movsd 12 * SIZE(AO), %xmm13 + mulsd %xmm6, %xmm11 + movsd 10 * SIZE(AO), %xmm8 + subsd %xmm11, %xmm2 + movsd 9 * SIZE(AO), %xmm9 + mulsd %xmm6, %xmm13 + movsd 8 * SIZE(AO), %xmm11 + subsd %xmm13, %xmm0 + + mulsd %xmm8, %xmm4 + movsd 5 * SIZE(AO), %xmm8 + mulsd %xmm4, %xmm9 + subsd %xmm9, %xmm2 + movsd 4 * SIZE(AO), %xmm9 + mulsd %xmm4, %xmm11 + subsd %xmm11, %xmm0 + movsd 0 * SIZE(AO), %xmm11 + mulsd %xmm8, %xmm2 + mulsd %xmm2, %xmm9 + subsd %xmm9, %xmm0 + mulsd %xmm11, %xmm0 +#endif + +#ifdef LT + movsd 0 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm0 + movsd 1 * SIZE(AO), %xmm9 + mulsd %xmm0, %xmm9 + movsd 2 * SIZE(AO), %xmm11 + subsd %xmm9, %xmm2 + movsd 3 * SIZE(AO), %xmm13 + mulsd %xmm0, %xmm11 + movsd 5 * SIZE(AO), %xmm8 + subsd %xmm11, %xmm4 + movsd 6 * SIZE(AO), %xmm9 + mulsd %xmm0, %xmm13 + movsd 7 * SIZE(AO), %xmm11 + subsd %xmm13, %xmm6 + + mulsd %xmm8, %xmm2 + movsd 10 * SIZE(AO), %xmm8 + mulsd %xmm2, %xmm9 + subsd %xmm9, %xmm4 + movsd 11 * SIZE(AO), %xmm9 + mulsd %xmm2, %xmm11 + subsd %xmm11, %xmm6 + mulsd %xmm8, %xmm4 + movsd 15 * SIZE(AO), %xmm8 + mulsd %xmm4, %xmm9 + subsd %xmm9, %xmm6 + mulsd %xmm8, %xmm6 +#endif + +#if defined(RN) || defined(RT) + movsd 0 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm0 + mulsd %xmm8, %xmm2 + mulsd %xmm8, %xmm4 + mulsd %xmm8, %xmm6 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm2, 1 * SIZE(CO1) + movsd %xmm4, 2 * SIZE(CO1) + movsd %xmm6, 3 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm2, 1 * SIZE(BO) + movsd %xmm4, 2 * SIZE(BO) + movsd %xmm6, 3 * SIZE(BO) +#else + movsd %xmm0, 0 * SIZE(AO) + movsd %xmm2, 1 * SIZE(AO) + movsd %xmm4, 2 * SIZE(AO) + movsd %xmm6, 3 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L41 + ALIGN_4 + +.L50: + testq $2, M + je .L60 + 
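+/* Reviewer note -- not part of the upstream GotoBLAS2 source: the block below
+ * handles the M&2 remainder tile of the single-column (N&1) panel.  It first
+ * accumulates the two row dot products into %xmm8/%xmm10, subtracts them from
+ * the packed panel values, and then performs the small triangular solve: a 2x2
+ * forward/backward substitution for LN/LT, or a single diagonal scale for
+ * RN/RT.  The diagonal entries of the packed panels are assumed to be stored
+ * pre-inverted (consistent with the other solves in this kernel), so the
+ * substitution multiplies (mulsd) instead of dividing.  Roughly, with
+ * illustrative names, the LN case computes:
+ *
+ *     c1 = (rhs1 - acc1) * inv_diag1;
+ *     c0 = (rhs0 - acc0 - offdiag * c1) * inv_diag0;
+ *
+ * and LT does the same substitution in the opposite order. */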
+#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd 1 * SIZE(AO), %xmm1 + xorps %xmm3, %xmm3 + + movsd 0 * SIZE(BO), %xmm4 + xorps %xmm8, %xmm8 + movsd 1 * SIZE(BO), %xmm5 + xorps %xmm10, %xmm10 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L55 + ALIGN_4 + +.L52: + addsd %xmm2, %xmm8 + movsd 2 * SIZE(AO), %xmm2 + mulsd %xmm4, %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addsd %xmm3, %xmm10 + movsd 3 * SIZE(AO), %xmm3 + mulsd %xmm4, %xmm1 + movsd 2 * SIZE(BO), %xmm4 + + addsd %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm5, %xmm2 + addq $8 * SIZE, AO + + addsd %xmm1, %xmm10 + movsd -3 * SIZE(AO), %xmm1 + mulsd %xmm5, %xmm3 + movsd 3 * SIZE(BO), %xmm5 + + addsd %xmm2, %xmm8 + movsd -2 * SIZE(AO), %xmm2 + mulsd %xmm4, %xmm0 + addq $4 * SIZE, BO + + addsd %xmm3, %xmm10 + movsd -1 * SIZE(AO), %xmm3 + mulsd %xmm4, %xmm1 + movsd 0 * SIZE(BO), %xmm4 + + addsd %xmm0, %xmm8 + movsd 0 * SIZE(AO), %xmm0 + mulsd %xmm5, %xmm2 + decq %rax + + addsd %xmm1, %xmm10 + movsd 1 * SIZE(AO), %xmm1 + mulsd %xmm5, %xmm3 + movsd 1 * SIZE(BO), %xmm5 + + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm10 + + andq $3, %rax + BRANCH + je .L59 + ALIGN_4 + +.L56: + mulsd %xmm4, %xmm0 + mulsd %xmm4, %xmm1 + movsd 1 * SIZE(BO), %xmm4 + + addsd %xmm0, %xmm8 + movsd 2 * SIZE(AO), %xmm0 + addsd %xmm1, %xmm10 + movsd 3 * SIZE(AO), %xmm1 + + addq $2 * SIZE, AO + addq $1 * SIZE, BO + decq %rax + BRANCH + jg .L56 + ALIGN_4 + +.L59: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BO), %xmm0 + movsd 1 * SIZE(BO), %xmm2 + + subsd %xmm8, %xmm0 + subsd %xmm10, %xmm2 +#else + movsd 0 * SIZE(AO), %xmm0 + movsd 1 * SIZE(AO), %xmm2 + + subsd %xmm8, %xmm0 + subsd %xmm10, %xmm2 +#endif + +#ifdef LN + movsd 3 * SIZE(AO), %xmm8 + movsd 2 * SIZE(AO), %xmm9 + movsd 0 * SIZE(AO), %xmm11 + mulsd %xmm8, %xmm2 + mulsd %xmm2, %xmm9 + subsd %xmm9, %xmm0 + mulsd %xmm11,%xmm0 +#endif + +#ifdef LT + movsd 0 * SIZE(AO), %xmm8 + movsd 1 * SIZE(AO), %xmm9 + movsd 3 * SIZE(AO), %xmm11 + mulsd %xmm8, %xmm0 + mulsd %xmm0, %xmm9 + subsd %xmm9, %xmm2 + mulsd %xmm11,%xmm2 +#endif + +#if defined(RN) || defined(RT) + movsd 0 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm0 + mulsd %xmm8, %xmm2 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm2, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm2, 1 * SIZE(BO) +#else + movsd %xmm0, 0 * SIZE(AO) + movsd %xmm2, 1 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L60: + testq $1, M + je .L69 + +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, 
AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm5, %xmm5 + movsd 1 * SIZE(AO), %xmm2 + xorps %xmm7, %xmm7 + + movsd 0 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + movsd 1 * SIZE(BO), %xmm3 + xorps %xmm9, %xmm9 + movsd 2 * SIZE(AO), %xmm4 + movsd 3 * SIZE(AO), %xmm6 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L65 + ALIGN_4 + +.L62: + addsd %xmm5, %xmm8 + movsd 2 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm1 + movsd 4 * SIZE(AO), %xmm0 + + addsd %xmm7, %xmm9 + movsd 3 * SIZE(BO), %xmm7 + mulsd %xmm2, %xmm3 + movsd 5 * SIZE(AO), %xmm2 + + addsd %xmm1, %xmm8 + movsd 4 * SIZE(BO), %xmm1 + mulsd %xmm4, %xmm5 + movsd 6 * SIZE(AO), %xmm4 + + addsd %xmm3, %xmm9 + movsd 5 * SIZE(BO), %xmm3 + mulsd %xmm6, %xmm7 + movsd 7 * SIZE(AO), %xmm6 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + + decq %rax + jne .L62 + + addsd %xmm5, %xmm8 + addsd %xmm7, %xmm9 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + je .L68 + ALIGN_4 + +.L66: + movsd 0 * SIZE(AO), %xmm0 + movsd 0 * SIZE(BO), %xmm1 + + mulsd %xmm0, %xmm1 + addsd %xmm1, %xmm8 + + addq $1 * SIZE, AO + addq $1 * SIZE, BO + + decq %rax + BRANCH + jg .L66 + ALIGN_4 + +.L68: + addsd %xmm9, %xmm8 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BO), %xmm0 + subsd %xmm8, %xmm0 +#else + movsd 0 * SIZE(AO), %xmm0 + subsd %xmm8, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm0 +#endif + +#if defined(RN) || defined(RT) + movsd 0 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm0 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BO) +#else + movsd %xmm0, 0 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L69: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 1), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_2 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/trsm_kernel_LT_4x4_barcelona.S b/kernel/x86_64/trsm_kernel_LT_4x4_barcelona.S new file mode 100644 index 0000000000..b133bcf4b8 --- /dev/null +++ b/kernel/x86_64/trsm_kernel_LT_4x4_barcelona.S @@ -0,0 +1,3396 @@ 
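+/* Reviewer note -- not part of the imported file: the new file below is the
+ * Barcelona-tuned (AMD Family 10h) variant of the double-precision 4x4 TRSM
+ * kernel.  It works on packed SSE2/SSE3 data (mulpd/addpd with movddup
+ * broadcasts) and unrolls the inner product loop eight-fold through the
+ * KERNEL1..KERNEL8 macros, in contrast to the scalar movsd/mulsd pipeline of
+ * the preceding kernel; as before, the LN/LT/RN/RT cases are selected with
+ * preprocessor defines when the file is assembled. */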
+/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define CO2 %r12 +#define BB %rbp +#define J %rbx + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#define OFFSET 48(%rsp) +#define AORIG 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define AORIG 232(%rsp) +#define KK 240(%rsp) +#define KKK 248(%rsp) + +#endif + +#define PREFETCH prefetch +#define PREFETCHSIZE (8 * 7 + 0) + +#define movlpd movsd +#define movapd movups +#define movupd movups + +#define KERNEL1(xx) \ + mulpd %xmm1, %xmm0 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm8 ;\ + movapd %xmm2, %xmm0 ;\ + addpd %xmm1, %xmm12 ;\ + movddup -14 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm0, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -13 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm0 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm10 ;\ + movapd -12 * SIZE(AO, %rax, 4), %xmm0 ;\ + addpd %xmm1, %xmm14 ;\ + movddup -12 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -11 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm0, %xmm2 + + +#define KERNEL2(xx) \ + mulpd %xmm1, %xmm0 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm8 ;\ + movapd %xmm2, %xmm0 ;\ +/**/ movapd (AO, %rax, 4), %xmm6 ;\ + addpd %xmm1, %xmm12 ;\ + movddup -10 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm0, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -9 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm0 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm10 ;\ + addpd %xmm1, %xmm14 ;\ + mulpd %xmm3, %xmm2 ;\ +/**/ movddup (BO, %rax, 4), %xmm1 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -7 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm4, %xmm2 + +#define KERNEL3(xx) \ + mulpd %xmm5, %xmm4 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm8 ;\ + movapd %xmm2, %xmm4 ;\ + addpd %xmm5, %xmm12 ;\ + movddup -6 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm4, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -5 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm4 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm10 ;\ + movapd -4 * SIZE(AO, %rax, 4), %xmm4 ;\ + addpd %xmm5, %xmm14 ;\ + movddup -4 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -3 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm4, %xmm2 + +#define KERNEL4(xx) \ + mulpd %xmm5, %xmm4 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm8 ;\ + movapd %xmm2, %xmm4 ;\ +/**/ movapd 8 * SIZE(AO, %rax, 4), %xmm7 ;\ + addpd %xmm5, %xmm12 ;\ + movddup -2 * SIZE(BO, %rax, 
4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm4, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -1 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm4 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm10 ;\ + addpd %xmm5, %xmm14 ;\ +/**/ movddup 8 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 1 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm6, %xmm2 + +#define KERNEL5(xx) \ + mulpd %xmm1, %xmm6 ;\ + mulpd 2 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm6, %xmm8 ;\ + movapd %xmm2, %xmm6 ;\ + addpd %xmm1, %xmm12 ;\ + movddup 2 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm6, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup 3 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm6 ;\ + mulpd 2 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm6, %xmm10 ;\ + movapd 4 * SIZE(AO, %rax, 4), %xmm6 ;\ + addpd %xmm1, %xmm14 ;\ + movddup 4 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 5 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm6, %xmm2 + +#define KERNEL6(xx) \ + mulpd %xmm1, %xmm6 ;\ + mulpd 6 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm6, %xmm8 ;\ + movapd %xmm2, %xmm6 ;\ +/***/ movapd 16 * SIZE(AO, %rax, 4), %xmm0 ;\ + addpd %xmm1, %xmm12 ;\ + movddup 6 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm6, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup 7 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm6 ;\ + mulpd 6 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm6, %xmm10 ;\ + addpd %xmm1, %xmm14 ;\ +/**/ movddup 16 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 9 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm7, %xmm2 + +#define KERNEL7(xx) \ + mulpd %xmm5, %xmm7 ;\ + mulpd 10 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm7, %xmm8 ;\ + movapd %xmm2, %xmm7 ;\ + addpd %xmm5, %xmm12 ;\ + movddup 10 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm7, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup 11 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm7 ;\ + mulpd 10 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm7, %xmm10 ;\ + movapd 12 * SIZE(AO, %rax, 4), %xmm7 ;\ + addpd %xmm5, %xmm14 ;\ + movddup 12 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 13 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm7, %xmm2 + +#define KERNEL8(xx) \ + mulpd %xmm5, %xmm7 ;\ + mulpd 14 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm7, %xmm8 ;\ + movapd %xmm2, %xmm7 ;\ +/**/ movapd 24 * SIZE(AO, %rax, 4), %xmm4 ;\ + addpd %xmm5, %xmm12 ;\ + movddup 14 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm7, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup 15 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm7 ;\ + mulpd 14 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm7, %xmm10 ;\ + addpd %xmm5, %xmm14 ;\ +/**/ movddup 24 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + 
movddup 17 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm0, %xmm2 ;\ + addq $8 * SIZE, %rax + +#define KERNEL_SUB1(xx) \ + mulpd %xmm1, %xmm0 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm8 ;\ + movapd %xmm2, %xmm0 ;\ + addpd %xmm1, %xmm12 ;\ + movddup -14 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm0, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -13 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm0 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm10 ;\ + movapd -12 * SIZE(AO, %rax, 4), %xmm0 ;\ + addpd %xmm1, %xmm14 ;\ + movddup -12 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -11 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm0, %xmm2 + +#define KERNEL_SUB2(xx) \ + mulpd %xmm1, %xmm0 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm8 ;\ + movapd %xmm2, %xmm0 ;\ + addpd %xmm1, %xmm12 ;\ + movddup -10 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm0, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -9 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm0 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm10 ;\ + movapd (AO, %rax, 4), %xmm0 ;\ + addpd %xmm1, %xmm14 ;\ + movddup (BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -7 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm4, %xmm2 + +#define KERNEL_SUB3(xx) \ + mulpd %xmm5, %xmm4 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm8 ;\ + movapd %xmm2, %xmm4 ;\ + addpd %xmm5, %xmm12 ;\ + movddup -6 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm4, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -5 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm4 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm10 ;\ + movapd -4 * SIZE(AO, %rax, 4), %xmm4 ;\ + addpd %xmm5, %xmm14 ;\ + movddup -4 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -3 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm4, %xmm2 + +#define KERNEL_SUB4(xx) \ + mulpd %xmm5, %xmm4 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm8 ;\ + movapd %xmm2, %xmm4 ;\ + addpd %xmm5, %xmm12 ;\ + movddup -2 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm4, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -1 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm4 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm10 ;\ + addpd %xmm5, %xmm14 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 1 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm0, %xmm2 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups 
%xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + movsd OLD_OFFSET, %xmm12 +#else + movq STACKSIZE + 8(%rsp), LDC + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + + movq OLD_M, M + movq OLD_N, N + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + movsd %xmm12, OFFSET + movsd %xmm12, KK + + leaq (, LDC, SIZE), LDC + +#ifdef LN + leaq (, M, SIZE), %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + leaq (, N, SIZE), %rax + imulq K, %rax + addq %rax, B + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + movq N, J + sarq $2, J # j = (n >> 2) + jle .L40 + +.L01: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc +#ifndef RT + leaq (C, LDC, 4), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + movq K, %rax + salq $BASE_SHIFT + 2, %rax + leaq (B, %rax), BB + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + movq B, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (BO, %rax, 4), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + movddup -16 * SIZE(BO), %xmm1 + pxor %xmm8, %xmm8 + movddup -15 * SIZE(BO), %xmm3 + pxor %xmm9, %xmm9 + movapd -8 * SIZE(AO), %xmm4 + pxor %xmm10, %xmm10 + movddup -8 * SIZE(BO), %xmm5 + pxor %xmm11, %xmm11 + +#ifndef LN + prefetchw 3 * SIZE(CO1) + pxor %xmm12, %xmm12 + prefetchw 7 * SIZE(CO2) + pxor %xmm13, %xmm13 + prefetchw 3 * SIZE(CO1, LDC, 2) + pxor %xmm14, %xmm14 + prefetchw 7 * SIZE(CO2, LDC, 2) + pxor %xmm15, %xmm15 + movapd %xmm0, %xmm2 +#else + prefetchw -8 * SIZE(CO1) + pxor %xmm12, %xmm12 + prefetchw -8 * SIZE(CO2) + pxor %xmm13, %xmm13 + prefetchw -8 * SIZE(CO1, LDC, 2) + pxor %xmm14, %xmm14 + prefetchw -8 * SIZE(CO2, LDC, 2) + pxor %xmm15, %xmm15 + movapd %xmm0, %xmm2 +#endif + + prefetch -16 * SIZE(BB) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + + andq $-8, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO + negq %rax + NOBRANCH + je .L15 + ALIGN_4 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + BRANCH + jl .L12 + ALIGN_4 + +.L15: + prefetch -8 * SIZE(BB) + subq $-16 * SIZE, BB + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + testq $4, %rax + je .L16 + xorq %rax, %rax + ALIGN_4 + + KERNEL_SUB1(16 * 0) + KERNEL_SUB2(16 * 0) + KERNEL_SUB3(16 * 0) + KERNEL_SUB4(16 * 0) + + subq $-16 * SIZE, BO + subq $-16 * SIZE, AO + ALIGN_4 + +.L16: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L19 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO + negq %rax + ALIGN_4 + +.L17: + mulpd %xmm1, %xmm0 + 
mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd %xmm2, %xmm0 + addpd %xmm1, %xmm12 + movddup -14 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm3, %xmm2 + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 + addpd %xmm2, %xmm9 + movapd %xmm0, %xmm2 + addpd %xmm3, %xmm13 + movddup -13 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm10 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm14 + movddup -12 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm3, %xmm2 + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 + addpd %xmm2, %xmm11 + addpd %xmm3, %xmm15 + movddup -11 * SIZE(BO, %rax, 4), %xmm3 + movapd %xmm0, %xmm2 + + addq $SIZE, %rax + jl .L17 + ALIGN_4 + +.L19: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd %xmm10, %xmm2 + unpcklpd %xmm11, %xmm10 + unpckhpd %xmm11, %xmm2 + + movapd %xmm12, %xmm4 + unpcklpd %xmm13, %xmm12 + unpckhpd %xmm13, %xmm4 + + movapd %xmm14, %xmm6 + unpcklpd %xmm15, %xmm14 + unpckhpd %xmm15, %xmm6 + + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm11 + movapd -12 * SIZE(BO), %xmm13 + movapd -10 * SIZE(BO), %xmm15 + movapd -8 * SIZE(BO), %xmm1 + movapd -6 * SIZE(BO), %xmm3 + movapd -4 * SIZE(BO), %xmm5 + movapd -2 * SIZE(BO), %xmm7 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm0, %xmm13 + subpd %xmm2, %xmm15 + subpd %xmm12, %xmm1 + subpd %xmm14, %xmm3 + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -12 * SIZE(AO), %xmm2 + movapd -10 * SIZE(AO), %xmm3 + + movapd -8 * SIZE(AO), %xmm4 + movapd -6 * SIZE(AO), %xmm5 + movapd -4 * SIZE(AO), %xmm6 + movapd -2 * SIZE(AO), %xmm7 + + subpd %xmm8, %xmm0 + subpd %xmm12, %xmm1 + subpd %xmm9, %xmm2 + subpd %xmm13, %xmm3 + subpd %xmm10, %xmm4 + subpd %xmm14, %xmm5 + subpd %xmm11, %xmm6 + subpd %xmm15, %xmm7 +#endif + +#ifdef LN + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 + mulpd %xmm8, %xmm7 + + movddup -2 * SIZE(AO), %xmm10 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm1 + movddup -2 * SIZE(AO), %xmm10 + mulpd %xmm7, %xmm10 + subpd %xmm10, %xmm3 + + movddup -3 * SIZE(AO), %xmm12 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm13 + movddup -3 * SIZE(AO), %xmm12 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm15 + + movddup -4 * SIZE(AO), %xmm14 + mulpd %xmm5, %xmm14 + subpd %xmm14, %xmm9 + movddup -4 * SIZE(AO), %xmm14 + mulpd %xmm7, %xmm14 + subpd %xmm14, %xmm11 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 + + movddup -7 * SIZE(AO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm13 + movddup -7 * SIZE(AO), %xmm10 + mulpd %xmm3, %xmm10 + subpd %xmm10, %xmm15 + + movddup -8 * SIZE(AO), %xmm12 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm9 + movddup -8 * SIZE(AO), %xmm12 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm11 + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 + + movddup -12 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + movddup -12 * SIZE(AO), %xmm10 + mulpd %xmm15, %xmm10 + subpd %xmm10, %xmm11 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 + + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + + 
movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm11, %xmm10 + subpd %xmm10, %xmm15 + + movddup -14 * SIZE(AO), %xmm12 + mulpd %xmm9, %xmm12 + subpd %xmm12, %xmm1 + movddup -14 * SIZE(AO), %xmm12 + mulpd %xmm11, %xmm12 + subpd %xmm12, %xmm3 + + movddup -13 * SIZE(AO), %xmm14 + mulpd %xmm9, %xmm14 + subpd %xmm14, %xmm5 + movddup -13 * SIZE(AO), %xmm14 + mulpd %xmm11, %xmm14 + subpd %xmm14, %xmm7 + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 + + movddup -10 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm1 + movddup -10 * SIZE(AO), %xmm10 + mulpd %xmm15, %xmm10 + subpd %xmm10, %xmm3 + + movddup -9 * SIZE(AO), %xmm12 + mulpd %xmm13, %xmm12 + subpd %xmm12, %xmm5 + movddup -9 * SIZE(AO), %xmm12 + mulpd %xmm15, %xmm12 + subpd %xmm12, %xmm7 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 + + movddup -5 * SIZE(AO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm5 + movddup -5 * SIZE(AO), %xmm10 + mulpd %xmm3, %xmm10 + subpd %xmm10, %xmm7 + + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 + mulpd %xmm8, %xmm7 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 + + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm1, %xmm9 + subpd %xmm9, %xmm3 + + movddup -14 * SIZE(BO), %xmm10 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm4 + movddup -14 * SIZE(BO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm5 + + movddup -13 * SIZE(BO), %xmm11 + mulpd %xmm0, %xmm11 + subpd %xmm11, %xmm6 + movddup -13 * SIZE(BO), %xmm11 + mulpd %xmm1, %xmm11 + subpd %xmm11, %xmm7 + + movddup -11 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 + + movddup -10 * SIZE(BO), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm4 + movddup -10 * SIZE(BO), %xmm9 + mulpd %xmm3, %xmm9 + subpd %xmm9, %xmm5 + + movddup -9 * SIZE(BO), %xmm10 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm6 + movddup -9 * SIZE(BO), %xmm10 + mulpd %xmm3, %xmm10 + subpd %xmm10, %xmm7 + + movddup -6 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm4 + mulpd %xmm8, %xmm5 + + movddup -5 * SIZE(BO), %xmm9 + mulpd %xmm4, %xmm9 + subpd %xmm9, %xmm6 + movddup -5 * SIZE(BO), %xmm9 + mulpd %xmm5, %xmm9 + subpd %xmm9, %xmm7 + + movddup -1 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm6 + mulpd %xmm8, %xmm7 +#endif + +#ifdef RT + movddup -1 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm6 + mulpd %xmm8, %xmm7 + + movddup -2 * SIZE(BO), %xmm9 + mulpd %xmm6, %xmm9 + subpd %xmm9, %xmm4 + movddup -2 * SIZE(BO), %xmm9 + mulpd %xmm7, %xmm9 + subpd %xmm9, %xmm5 + + movddup -3 * SIZE(BO), %xmm10 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm2 + movddup -3 * SIZE(BO), %xmm10 + mulpd %xmm7, %xmm10 + subpd %xmm10, %xmm3 + + movddup -4 * SIZE(BO), %xmm11 + mulpd %xmm6, %xmm11 + subpd %xmm11, %xmm0 + movddup -4 * SIZE(BO), %xmm11 + mulpd %xmm7, %xmm11 + subpd %xmm11, %xmm1 + + movddup -6 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm4 + mulpd %xmm8, %xmm5 + + movddup -7 * SIZE(BO), %xmm9 + mulpd %xmm4, %xmm9 + subpd %xmm9, %xmm2 + movddup -7 * SIZE(BO), %xmm9 + mulpd %xmm5, %xmm9 + subpd %xmm9, %xmm3 + + movddup -8 * SIZE(BO), %xmm10 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm0 + movddup -8 * SIZE(BO), %xmm10 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm1 + + movddup -11 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 + + movddup -12 * SIZE(BO), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + movddup -12 * SIZE(BO), %xmm9 + mulpd %xmm3, %xmm9 + subpd %xmm9, %xmm1 + + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 
+#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm9, 0 * SIZE(CO1) + movlpd %xmm13, 1 * SIZE(CO1) + movlpd %xmm1, 2 * SIZE(CO1) + movlpd %xmm5, 3 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + movhpd %xmm1, 2 * SIZE(CO2) + movhpd %xmm5, 3 * SIZE(CO2) + + movlpd %xmm11, 0 * SIZE(CO1, LDC, 2) + movlpd %xmm15, 1 * SIZE(CO1, LDC, 2) + movlpd %xmm3, 2 * SIZE(CO1, LDC, 2) + movlpd %xmm7, 3 * SIZE(CO1, LDC, 2) + + movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) + movhpd %xmm3, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) +#else + movlpd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movlpd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movlpd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + movlpd %xmm3, 2 * SIZE(CO2) + movhpd %xmm3, 3 * SIZE(CO2) + + movlpd %xmm4, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) + movlpd %xmm5, 2 * SIZE(CO1, LDC, 2) + movhpd %xmm5, 3 * SIZE(CO1, LDC, 2) + + movlpd %xmm6, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) + movlpd %xmm7, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm9, -16 * SIZE(BO) + movaps %xmm11, -14 * SIZE(BO) + movaps %xmm13, -12 * SIZE(BO) + movaps %xmm15, -10 * SIZE(BO) + movaps %xmm1, -8 * SIZE(BO) + movaps %xmm3, -6 * SIZE(BO) + movaps %xmm5, -4 * SIZE(BO) + movaps %xmm7, -2 * SIZE(BO) +#else + movaps %xmm0, -16 * SIZE(AO) + movaps %xmm1, -14 * SIZE(AO) + movaps %xmm2, -12 * SIZE(AO) + movaps %xmm3, -10 * SIZE(AO) + movaps %xmm4, -8 * SIZE(AO) + movaps %xmm5, -6 * SIZE(AO) + movaps %xmm6, -4 * SIZE(AO) + movaps %xmm7, -2 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L11 + ALIGN_4 + +.L20: + testq $3, M + je .L39 + + testq $2, M + je .L30 + ALIGN_4 + +.L21: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + movq B, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (BO, %rax, 4), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd -12 * SIZE(AO), %xmm2 + pxor %xmm9, %xmm9 + movddup -16 * SIZE(BO), %xmm1 + pxor %xmm10, %xmm10 + movddup -15 * SIZE(BO), %xmm5 + pxor %xmm11, %xmm11 + movddup -8 * SIZE(BO), %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO + negq %rax + NOBRANCH + je .L26 + ALIGN_4 + +.L22: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movddup -14 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + addpd %xmm5, %xmm9 + movddup -13 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movddup -12 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + movapd -14 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm5, %xmm11 + movddup -11 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movddup -10 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 
+ addpd %xmm5, %xmm9 + movddup -9 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movddup (BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + movapd -8 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm5, %xmm11 + movddup -7 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm8 + movddup -6 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm2, %xmm5 + addpd %xmm5, %xmm9 + movddup -5 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm10 + movddup -4 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm2, %xmm5 + movapd -10 * SIZE(AO, %rax, 2), %xmm2 + addpd %xmm5, %xmm11 + movddup -3 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm8 + movddup -2 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm2, %xmm5 + addpd %xmm5, %xmm9 + movddup -1 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm10 + movddup 8 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm2, %xmm5 + movapd -4 * SIZE(AO, %rax, 2), %xmm2 + addpd %xmm5, %xmm11 + movddup 1 * SIZE(BO, %rax, 4), %xmm5 + + addq $4 * SIZE, %rax + BRANCH + jl .L22 + ALIGN_4 + +.L26: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L29 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO + negq %rax + ALIGN_4 + +.L27: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movddup -14 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + addpd %xmm5, %xmm9 + movddup -13 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movddup -12 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + movapd -14 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm5, %xmm11 + movddup -11 * SIZE(BO, %rax, 4), %xmm5 + + addq $SIZE, %rax + jl .L27 + ALIGN_4 + +.L29: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd %xmm10, %xmm2 + unpcklpd %xmm11, %xmm10 + unpckhpd %xmm11, %xmm2 + + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm11 + movapd -12 * SIZE(BO), %xmm13 + movapd -10 * SIZE(BO), %xmm15 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm0, %xmm13 + subpd %xmm2, %xmm15 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm2 + movapd -12 * SIZE(AO), %xmm4 + movapd -10 * SIZE(AO), %xmm6 + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm2 + subpd %xmm10, %xmm4 + subpd %xmm11, %xmm6 +#endif + +#ifdef LN + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 + + movddup -14 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + movddup -14 * SIZE(AO), %xmm10 + mulpd %xmm15, %xmm10 + subpd %xmm10, %xmm11 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 + + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm11, %xmm10 + subpd %xmm10, %xmm15 + + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + movddup -14 * SIZE(BO), %xmm10 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm4 + movddup -13 * SIZE(BO), %xmm11 + mulpd %xmm0, %xmm11 + subpd %xmm11, %xmm6 + + movddup -11 * SIZE(BO), 
%xmm8 + mulpd %xmm8, %xmm2 + movddup -10 * SIZE(BO), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm4 + movddup -9 * SIZE(BO), %xmm10 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm6 + + movddup -6 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm4 + + movddup -5 * SIZE(BO), %xmm9 + mulpd %xmm4, %xmm9 + subpd %xmm9, %xmm6 + + movddup -1 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm6 +#endif + +#ifdef RT + movddup -1 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm6 + + movddup -2 * SIZE(BO), %xmm9 + mulpd %xmm6, %xmm9 + subpd %xmm9, %xmm4 + movddup -3 * SIZE(BO), %xmm10 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm2 + movddup -4 * SIZE(BO), %xmm11 + mulpd %xmm6, %xmm11 + subpd %xmm11, %xmm0 + + movddup -6 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm4 + movddup -7 * SIZE(BO), %xmm9 + mulpd %xmm4, %xmm9 + subpd %xmm9, %xmm2 + movddup -8 * SIZE(BO), %xmm10 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm0 + + movddup -11 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + movddup -12 * SIZE(BO), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm9, 0 * SIZE(CO1) + movlpd %xmm13, 1 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + + movlpd %xmm11, 0 * SIZE(CO1, LDC, 2) + movlpd %xmm15, 1 * SIZE(CO1, LDC, 2) + + movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) +#else + movlpd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + + movlpd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + + movlpd %xmm4, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) + + movlpd %xmm6, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm9, -16 * SIZE(BO) + movaps %xmm11, -14 * SIZE(BO) + movaps %xmm13, -12 * SIZE(BO) + movaps %xmm15, -10 * SIZE(BO) +#else + movaps %xmm0, -16 * SIZE(AO) + movaps %xmm2, -14 * SIZE(AO) + movaps %xmm4, -12 * SIZE(AO) + movaps %xmm6, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + testq $1, M + je .L39 + +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#endif + + movq B, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (BO, %rax, 4), BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movddup -14 * SIZE(AO), %xmm2 + pxor %xmm9, %xmm9 + movddup -15 * SIZE(AO), %xmm4 + pxor %xmm10, %xmm10 + movapd -16 * SIZE(BO), %xmm1 + pxor %xmm11, %xmm11 + movapd -8 * SIZE(BO), %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO + negq %rax + NOBRANCH + je .L36 + ALIGN_4 + +.L32: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BO, %rax, 4), %xmm0 + addpd %xmm1, %xmm8 + movapd -12 * SIZE(BO, %rax, 4), %xmm1 + addpd %xmm0, %xmm9 + movddup -12 * SIZE(AO, %rax, 1), %xmm0 + mulpd %xmm4, %xmm1 + mulpd -10 * SIZE(BO, %rax, 4), %xmm4 + addpd %xmm1, %xmm10 + movapd (BO, %rax, 
4), %xmm1 + addpd %xmm4, %xmm11 + movddup -11 * SIZE(AO, %rax, 1), %xmm4 + mulpd %xmm2, %xmm3 + mulpd -6 * SIZE(BO, %rax, 4), %xmm2 + addpd %xmm3, %xmm8 + movapd -4 * SIZE(BO, %rax, 4), %xmm3 + addpd %xmm2, %xmm9 + movddup -13 * SIZE(AO, %rax, 1), %xmm2 + mulpd %xmm2, %xmm3 + mulpd -2 * SIZE(BO, %rax, 4), %xmm2 + addpd %xmm3, %xmm10 + movapd 8 * SIZE(BO, %rax, 4), %xmm3 + addpd %xmm2, %xmm11 + movddup -10 * SIZE(AO, %rax, 1), %xmm2 + + addq $4 * SIZE, %rax + BRANCH + jl .L32 + ALIGN_4 + +.L36: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L38 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO + negq %rax + ALIGN_4 + +.L37: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BO, %rax, 4), %xmm0 + addpd %xmm1, %xmm8 + movapd -12 * SIZE(BO, %rax, 4), %xmm1 + addpd %xmm0, %xmm9 + movddup -15 * SIZE(AO, %rax, 1), %xmm0 + + addq $SIZE, %rax + jl .L37 + ALIGN_4 + +.L38: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + + subpd %xmm8, %xmm2 + subpd %xmm9, %xmm3 +#else + movapd -16 * SIZE(AO), %xmm2 + movapd -14 * SIZE(AO), %xmm3 + + subpd %xmm8, %xmm2 + subpd %xmm9, %xmm3 +#endif + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 +#endif + +#ifdef RN + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movapd %xmm3, %xmm1 + unpckhpd %xmm1, %xmm1 + + movsd -16 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm2 + + movsd -15 * SIZE(BO), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + movsd -14 * SIZE(BO), %xmm6 + mulsd %xmm2, %xmm6 + subsd %xmm6, %xmm3 + movsd -13 * SIZE(BO), %xmm7 + mulsd %xmm2, %xmm7 + subsd %xmm7, %xmm1 + + movsd -11 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd -10 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm3 + movsd -9 * SIZE(BO), %xmm6 + mulsd %xmm0, %xmm6 + subsd %xmm6, %xmm1 + + movsd -6 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm3 + + movsd -5 * SIZE(BO), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm1 + + movsd -1 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm1 + + unpcklpd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm3 +#endif + +#ifdef RT + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movapd %xmm3, %xmm1 + unpckhpd %xmm1, %xmm1 + + movsd -1 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm1 + + movsd -2 * SIZE(BO), %xmm5 + mulsd %xmm1, %xmm5 + subsd %xmm5, %xmm3 + movsd -3 * SIZE(BO), %xmm6 + mulsd %xmm1, %xmm6 + subsd %xmm6, %xmm0 + movsd -4 * SIZE(BO), %xmm7 + mulsd %xmm1, %xmm7 + subsd %xmm7, %xmm2 + + movsd -6 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm3 + + movsd -7 * SIZE(BO), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm0 + movsd -8 * SIZE(BO), %xmm6 + mulsd %xmm3, %xmm6 + subsd %xmm6, %xmm2 + + movsd -11 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd -12 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + + movsd -16 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm3 + +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO2) + movlpd %xmm3, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) +#else + movlpd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO2) + movlpd %xmm3, 0 * 
SIZE(CO1, LDC, 2) + movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, -16 * SIZE(BO) + movaps %xmm3, -14 * SIZE(BO) +#else + movaps %xmm2, -16 * SIZE(AO) + movaps %xmm3, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L39: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + + decq J # j -- + jg .L01 + ALIGN_4 + +.L40: + testq $2, N + je .L80 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + movq B, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (BO, %rax, 2), BO +#endif + + movddup -16 * SIZE(BO), %xmm1 + movddup -15 * SIZE(BO), %xmm5 + pxor %xmm8, %xmm8 + movddup -12 * SIZE(BO), %xmm3 + pxor %xmm9, %xmm9 + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm12, %xmm12 + movapd -8 * SIZE(AO), %xmm4 + pxor %xmm13, %xmm13 + +#ifndef LN + prefetchw 3 * SIZE(CO1) + movapd %xmm0, %xmm2 + prefetchw 3 * SIZE(CO2) +#else + prefetchw -8 * SIZE(CO1) + movapd %xmm0, %xmm2 + prefetchw -8 * SIZE(CO2) +#endif + + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO + negq %rax + NOBRANCH + je .L56 + ALIGN_4 + +.L52: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm12 + movddup -14 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm5, %xmm2 + mulpd -14 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm9 + addpd %xmm5, %xmm13 + movddup -13 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm0, %xmm2 + mulpd %xmm1, %xmm0 + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd (AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm12 + movddup -8 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm5, %xmm2 + mulpd -10 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm9 + addpd %xmm5, %xmm13 + movddup -11 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm4, %xmm2 + mulpd %xmm3, %xmm4 + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 + addpd %xmm4, %xmm8 + movapd -4 * SIZE(AO, %rax, 4), %xmm4 + addpd %xmm3, %xmm12 + movddup -10 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm5, %xmm2 + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm9 + addpd %xmm5, %xmm13 + movddup -9 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm4, %xmm2 + mulpd %xmm3, %xmm4 + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 + addpd %xmm4, %xmm8 + movapd 8 * SIZE(AO, 
%rax, 4), %xmm4 + addpd %xmm3, %xmm12 + movddup -4 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm5, %xmm2 + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm9 + addpd %xmm5, %xmm13 + movddup -7 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm0, %xmm2 + + addq $4 * SIZE, %rax + BRANCH + jl .L52 + ALIGN_4 + +.L56: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L59 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO + negq %rax + ALIGN_4 + +.L57: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm12 + movddup -14 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm5, %xmm2 + mulpd -14 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm9 + addpd %xmm5, %xmm13 + movddup -13 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm0, %xmm2 + + addq $SIZE, %rax + jl .L57 + ALIGN_4 + +.L59: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd %xmm12, %xmm4 + unpcklpd %xmm13, %xmm12 + unpckhpd %xmm13, %xmm4 + + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm13 + movapd -12 * SIZE(BO), %xmm1 + movapd -10 * SIZE(BO), %xmm5 + + subpd %xmm8, %xmm9 + subpd %xmm0, %xmm13 + subpd %xmm12, %xmm1 + subpd %xmm4, %xmm5 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -12 * SIZE(AO), %xmm2 + movapd -10 * SIZE(AO), %xmm3 + + subpd %xmm8, %xmm0 + subpd %xmm12, %xmm1 + subpd %xmm9, %xmm2 + subpd %xmm13, %xmm3 +#endif + +#ifdef LN + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 + movddup -2 * SIZE(AO), %xmm10 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm1 + movddup -3 * SIZE(AO), %xmm12 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm13 + movddup -4 * SIZE(AO), %xmm14 + mulpd %xmm5, %xmm14 + subpd %xmm14, %xmm9 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + movddup -7 * SIZE(AO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm13 + movddup -8 * SIZE(AO), %xmm12 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm9 + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + movddup -12 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + movddup -14 * SIZE(AO), %xmm12 + mulpd %xmm9, %xmm12 + subpd %xmm12, %xmm1 + movddup -13 * SIZE(AO), %xmm14 + mulpd %xmm9, %xmm14 + subpd %xmm14, %xmm5 + + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + + movddup -10 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm1 + movddup -9 * SIZE(AO), %xmm12 + mulpd %xmm13, %xmm12 + subpd %xmm12, %xmm5 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + movddup -5 * SIZE(AO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm5 + + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 + + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm1, %xmm9 + subpd %xmm9, %xmm3 + + movddup -13 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 +#endif + +#ifdef RT + movddup -13 * SIZE(BO), %xmm8 + 
mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 + + movddup -14 * SIZE(BO), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + movddup -14 * SIZE(BO), %xmm9 + mulpd %xmm3, %xmm9 + subpd %xmm9, %xmm1 + + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm9, 0 * SIZE(CO1) + movlpd %xmm13, 1 * SIZE(CO1) + movlpd %xmm1, 2 * SIZE(CO1) + movlpd %xmm5, 3 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + movhpd %xmm1, 2 * SIZE(CO2) + movhpd %xmm5, 3 * SIZE(CO2) +#else + movlpd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movlpd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movlpd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + movlpd %xmm3, 2 * SIZE(CO2) + movhpd %xmm3, 3 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm9, -16 * SIZE(BO) + movaps %xmm13,-14 * SIZE(BO) + movaps %xmm1, -12 * SIZE(BO) + movaps %xmm5, -10 * SIZE(BO) +#else + movaps %xmm0, -16 * SIZE(AO) + movaps %xmm1, -14 * SIZE(AO) + movaps %xmm2, -12 * SIZE(AO) + movaps %xmm3, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L51 + ALIGN_4 + +.L60: + testq $2, M + je .L70 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + movq B, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (BO, %rax, 2), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd -12 * SIZE(AO), %xmm2 + pxor %xmm9, %xmm9 + movddup -16 * SIZE(BO), %xmm1 + pxor %xmm10, %xmm10 + movddup -15 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO + negq %rax + NOBRANCH + je .L66 + ALIGN_4 + +.L62: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movddup -14 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm0, %xmm3 + movapd -14 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm3, %xmm9 + movddup -13 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movddup -12 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm0, %xmm3 + movapd -8 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm3, %xmm11 + movddup -11 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm8 + movddup -10 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm2, %xmm3 + movapd -10 * SIZE(AO, %rax, 2), %xmm2 + addpd %xmm3, %xmm9 + movddup -9 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm10 + movddup -8 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm2, %xmm3 + movapd -4 * SIZE(AO, %rax, 2), %xmm2 + addpd %xmm3, %xmm11 + movddup -7 * SIZE(BO, %rax, 2), %xmm3 + + addq $4 * SIZE, %rax + BRANCH + jl .L62 + ALIGN_4 + +.L66: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L69 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO + negq %rax + ALIGN_4 + +.L67: + mulpd %xmm0, 
%xmm1 + addpd %xmm1, %xmm8 + movddup -14 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm0, %xmm3 + movapd -14 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm3, %xmm9 + movddup -13 * SIZE(BO, %rax, 2), %xmm3 + + addq $SIZE, %rax + jl .L67 + ALIGN_4 + +.L69: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm13 + + subpd %xmm8, %xmm9 + subpd %xmm0, %xmm13 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm2 + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm2 +#endif + + +#ifdef LN + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + + movddup -14 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + + movddup -13 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 +#endif + +#ifdef RT + movddup -13 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + + movddup -14 * SIZE(BO), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm9, 0 * SIZE(CO1) + movlpd %xmm13, 1 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) +#else + movlpd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + + movlpd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm9, -16 * SIZE(BO) + movaps %xmm13, -14 * SIZE(BO) +#else + movaps %xmm0, -16 * SIZE(AO) + movaps %xmm2, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L70: + testq $1, M + je .L79 + ALIGN_4 + +.L71: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#endif + + movq B, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 1), BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movddup -15 * SIZE(AO), %xmm1 + pxor %xmm9, %xmm9 + movddup -14 * SIZE(AO), %xmm2 + pxor %xmm10, %xmm10 + movddup -13 * SIZE(AO), %xmm3 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO + negq %rax + NOBRANCH + je .L76 + ALIGN_4 + +.L72: + mulpd -16 * SIZE(BO, %rax, 2), %xmm0 + addpd %xmm0, %xmm8 + movddup -12 * SIZE(AO, %rax, 1), %xmm0 + + mulpd -14 * SIZE(BO, 
%rax, 2), %xmm1 + addpd %xmm1, %xmm9 + movddup -11 * SIZE(AO, %rax, 1), %xmm1 + + mulpd -12 * SIZE(BO, %rax, 2), %xmm2 + addpd %xmm2, %xmm10 + movddup -10 * SIZE(AO, %rax, 1), %xmm2 + + mulpd -10 * SIZE(BO, %rax, 2), %xmm3 + addpd %xmm3, %xmm11 + movddup -9 * SIZE(AO, %rax, 1), %xmm3 + + addq $4 * SIZE, %rax + BRANCH + jl .L72 + ALIGN_4 + +.L76: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L78 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO + negq %rax + ALIGN_4 + +.L77: + mulpd -16 * SIZE(BO, %rax, 2), %xmm0 + addpd %xmm0, %xmm8 + movddup -15 * SIZE(AO, %rax, 1), %xmm0 + + addq $SIZE, %rax + jl .L77 + ALIGN_4 + +.L78: + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 + addpd %xmm10, %xmm8 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm2 +#else + movapd -16 * SIZE(AO), %xmm2 +#endif + + subpd %xmm8, %xmm2 + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm0 + + mulpd %xmm0, %xmm2 +#endif + +#ifdef RN + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + mulsd -16 * SIZE(BO), %xmm2 + movsd -15 * SIZE(BO), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm0 + + mulsd -13 * SIZE(BO), %xmm0 + unpcklpd %xmm0, %xmm2 +#endif + +#ifdef RT + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + mulsd -13 * SIZE(BO), %xmm0 + + movlpd -14 * SIZE(BO), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm2 + + mulsd -16 * SIZE(BO), %xmm2 + unpcklpd %xmm0, %xmm2 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + movlpd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movaps %xmm2, -16 * SIZE(BO) +#else + movaps %xmm2, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L79: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L80: + testq $1, N + je .L999 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + + movq C, CO1 # coffset1 = c +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L100 + ALIGN_4 + +.L91: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + movq B, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (BO, %rax, SIZE), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd -8 * SIZE(AO), %xmm2 + pxor %xmm9, %xmm9 + movddup -16 * SIZE(BO), %xmm1 + pxor %xmm10, %xmm10 + movddup -15 * SIZE(BO), %xmm5 + pxor %xmm11, %xmm11 + movddup -14 * 
SIZE(BO), %xmm3 + +#ifndef LN + prefetchw 3 * SIZE(CO1) +#else + prefetchw -8 * SIZE(CO1) +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO + negq %rax + NOBRANCH + je .L96 + ALIGN_4 + +.L92: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm9 + movddup -12 * SIZE(BO, %rax, 1), %xmm1 + mulpd %xmm5, %xmm0 + mulpd -10 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm0, %xmm10 + movapd (AO, %rax, 4), %xmm0 + addpd %xmm5, %xmm11 + movddup -13 * SIZE(BO, %rax, 1), %xmm5 + mulpd %xmm3, %xmm2 + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 + addpd %xmm2, %xmm8 + movapd -4 * SIZE(AO, %rax, 4), %xmm2 + addpd %xmm3, %xmm9 + movddup -10 * SIZE(BO, %rax, 1), %xmm3 + mulpd %xmm5, %xmm2 + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm10 + movapd 8 * SIZE(AO, %rax, 4), %xmm2 + addpd %xmm5, %xmm11 + movddup -11 * SIZE(BO, %rax, 1), %xmm5 + + addq $4 * SIZE, %rax + BRANCH + jl .L92 + ALIGN_4 + +.L96: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L99 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO + negq %rax + ALIGN_4 + +.L97: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm9 + movddup -15 * SIZE(BO, %rax, 1), %xmm1 + + addq $SIZE, %rax + jl .L97 + ALIGN_4 +.L99: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm10 + movapd -14 * SIZE(BO), %xmm11 + + subpd %xmm8, %xmm10 + subpd %xmm9, %xmm11 +#else + movapd -16 * SIZE(AO), %xmm10 + movapd -14 * SIZE(AO), %xmm11 + + subpd %xmm8, %xmm10 + subpd %xmm9, %xmm11 +#endif + +#ifdef LN + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movapd %xmm11, %xmm9 + unpckhpd %xmm9, %xmm9 + + movsd -1 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm9 + + movsd -2 * SIZE(AO), %xmm13 + mulsd %xmm9, %xmm13 + subsd %xmm13, %xmm11 + movsd -3 * SIZE(AO), %xmm14 + mulsd %xmm9, %xmm14 + subsd %xmm14, %xmm8 + movsd -4 * SIZE(AO), %xmm15 + mulsd %xmm9, %xmm15 + subsd %xmm15, %xmm10 + + movsd -6 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm11 + + movsd -7 * SIZE(AO), %xmm13 + mulsd %xmm11, %xmm13 + subsd %xmm13, %xmm8 + movsd -8 * SIZE(AO), %xmm14 + mulsd %xmm11, %xmm14 + subsd %xmm14, %xmm10 + + movsd -11 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + movsd -12 * SIZE(AO), %xmm13 + mulsd %xmm8, %xmm13 + subsd %xmm13, %xmm10 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + unpcklpd %xmm8, %xmm10 + unpcklpd %xmm9, %xmm11 +#endif + +#ifdef LT + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movapd %xmm11, %xmm9 + unpckhpd %xmm9, %xmm9 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + movsd -15 * SIZE(AO), %xmm13 + mulsd %xmm10, %xmm13 + subsd %xmm13, %xmm8 + movsd -14 * SIZE(AO), %xmm14 + mulsd %xmm10, %xmm14 + subsd %xmm14, %xmm11 + movsd -13 * SIZE(AO), %xmm15 + mulsd %xmm10, %xmm15 + subsd %xmm15, %xmm9 + + movsd -11 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + movsd -10 * SIZE(AO), %xmm13 + mulsd %xmm8, %xmm13 + subsd %xmm13, %xmm11 + movsd -9 * SIZE(AO), %xmm14 + mulsd %xmm8, %xmm14 
+ subsd %xmm14, %xmm9 + + movsd -6 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm11 + + movsd -5 * SIZE(AO), %xmm13 + mulsd %xmm11, %xmm13 + subsd %xmm13, %xmm9 + + movsd -1 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm9 + + unpcklpd %xmm8, %xmm10 + unpcklpd %xmm9, %xmm11 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm10 + mulpd %xmm8, %xmm11 +#endif + +#ifdef RT + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm10 + mulpd %xmm8, %xmm11 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + + movlpd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) + movlpd %xmm11, 2 * SIZE(CO1) + movhpd %xmm11, 3 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movaps %xmm10, -16 * SIZE(BO) + movaps %xmm11, -14 * SIZE(BO) +#else + movaps %xmm10, -16 * SIZE(AO) + movaps %xmm11, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + addq %rax, BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L91 + ALIGN_4 + +.L100: + testq $2, M + je .L110 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + movq B, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (BO, %rax, SIZE), BO +#endif + + movddup -16 * SIZE(BO), %xmm0 + pxor %xmm8, %xmm8 + movddup -15 * SIZE(BO), %xmm1 + pxor %xmm9, %xmm9 + movddup -14 * SIZE(BO), %xmm2 + pxor %xmm10, %xmm10 + movddup -13 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO + negq %rax + NOBRANCH + je .L106 + ALIGN_4 + +.L102: + mulpd -16 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm0, %xmm8 + movddup -12 * SIZE(BO, %rax, 1), %xmm0 + + mulpd -14 * SIZE(AO, %rax, 2), %xmm1 + addpd %xmm1, %xmm9 + movddup -11 * SIZE(BO, %rax, 1), %xmm1 + + mulpd -12 * SIZE(AO, %rax, 2), %xmm2 + addpd %xmm2, %xmm10 + movddup -10 * SIZE(BO, %rax, 1), %xmm2 + + mulpd -10 * SIZE(AO, %rax, 2), %xmm3 + addpd %xmm3, %xmm11 + movddup -9 * SIZE(BO, %rax, 1), %xmm3 + + addq $4 * SIZE, %rax + BRANCH + jl .L102 + ALIGN_4 + +.L106: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L109 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO + negq %rax + ALIGN_4 + +.L107: + movddup -16 * SIZE(BO, %rax, 1), %xmm0 + mulpd -16 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm0, %xmm8 + + addq $SIZE, %rax + jl .L107 + ALIGN_4 + +.L109: + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 + addpd %xmm10, %xmm8 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm10 + subpd %xmm8, %xmm10 +#else + movapd -16 * SIZE(AO), %xmm10 + subpd %xmm8, %xmm10 +#endif + +#ifdef LN + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movsd -13 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + movsd -14 * SIZE(AO), %xmm13 + mulsd %xmm8, %xmm13 + subsd %xmm13, %xmm10 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + 
+ unpcklpd %xmm8, %xmm10 +#endif + +#ifdef LT + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + movsd -15 * SIZE(AO), %xmm13 + mulsd %xmm10, %xmm13 + subsd %xmm13, %xmm8 + + movsd -13 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + unpcklpd %xmm8, %xmm10 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm10 +#endif + +#ifdef RT + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm10 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) +#else + movlpd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm10, -16 * SIZE(BO) +#else + movaps %xmm10, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + addq %rax, BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L110: + testq $1, M + je .L119 + ALIGN_4 + +.L111: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#endif + + movq B, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (BO, %rax, SIZE), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd -14 * SIZE(AO), %xmm1 + pxor %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO + negq %rax + NOBRANCH + je .L116 + ALIGN_4 + +.L112: + mulpd -16 * SIZE(BO, %rax, 1), %xmm0 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO, %rax, 1), %xmm0 + + mulpd -14 * SIZE(BO, %rax, 1), %xmm1 + addpd %xmm1, %xmm9 + movapd -10 * SIZE(AO, %rax, 1), %xmm1 + + addq $4 * SIZE, %rax + BRANCH + jl .L112 + ALIGN_4 + +.L116: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L118 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO + negq %rax + ALIGN_4 + +.L117: + mulsd -16 * SIZE(BO, %rax, 1), %xmm0 + addsd %xmm0, %xmm8 + movsd -15 * SIZE(AO, %rax, 1), %xmm0 + + addq $SIZE, %rax + jl .L117 + ALIGN_4 + +.L118: + addpd %xmm9, %xmm8 + haddpd %xmm8, %xmm8 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(BO), %xmm10 + subsd %xmm8, %xmm10 +#else + movsd -16 * SIZE(AO), %xmm10 + subsd %xmm8, %xmm10 +#endif + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 +#endif + +#if defined(RN) || defined(RT) + movsd -16 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm10 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + + movsd %xmm10, 0 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movlpd %xmm10, -16 * SIZE(BO) +#else + movlpd %xmm10, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + addq %rax, AO + addq %rax, BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK 
+#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L119: +#ifdef LN + leaq (B, K, SIZE), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + + +.L999: + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/trsm_kernel_LT_4x4_core2.S b/kernel/x86_64/trsm_kernel_LT_4x4_core2.S new file mode 100644 index 0000000000..7864ec5501 --- /dev/null +++ b/kernel/x86_64/trsm_kernel_LT_4x4_core2.S @@ -0,0 +1,3730 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define J 0(%rsp) +#define OFFSET 8(%rsp) +#define KK 16(%rsp) +#define KKK 24(%rsp) +#define AORIG 32(%rsp) +#define BORIG 40(%rsp) +#define BUFFER 128(%rsp) + +#define PREFETCH_R (8 * 4 + 0) +#define PREFETCH_W (PREFETCH_R) + +#define PREFETCHSIZE (8 * 17 + 2) +#define PREFETCH prefetcht0 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C +#endif + + movq OLD_LDC, LDC + movq OLD_OFFSET, %rax + + movq %rsp, %r15 # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movq %rax, KK + movq %rax, OFFSET + + movq OLD_M, M + movq OLD_N, N + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + leaq (, LDC, SIZE), LDC + +#ifdef LN + leaq (, M, SIZE), %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + leaq (, N, SIZE), %rax + imulq K, %rax + addq %rax, B + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + movq N, J + sarq $2, J # j = (n >> 2) + jle .L40 + +.L01: +/* Copying to Sub Buffer */ + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq 16 * SIZE + BUFFER, BO + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + leaq (, %rax, SIZE), %rax + leaq (B, %rax, 4), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L03 + ALIGN_4 + +.L02: + prefetcht0 (PREFETCH_R + 0) * SIZE(B) + movapd -16 * SIZE(B), %xmm0 + movapd -14 * SIZE(B), %xmm1 + movapd -12 * SIZE(B), %xmm2 + movapd -10 * SIZE(B), %xmm3 + movapd -8 * SIZE(B), %xmm4 + movapd -6 * SIZE(B), %xmm5 + movapd -4 * SIZE(B), %xmm6 + movapd -2 * SIZE(B), %xmm7 + + prefetcht0 (PREFETCH_R + 8) * SIZE(B) + movddup %xmm0, %xmm8 + unpckhpd %xmm0, %xmm0 + movddup %xmm1, %xmm9 + unpckhpd %xmm1, %xmm1 + movddup %xmm2, %xmm10 + unpckhpd %xmm2, %xmm2 + movddup %xmm3, %xmm11 + unpckhpd %xmm3, %xmm3 + movddup %xmm4, %xmm12 + unpckhpd %xmm4, %xmm4 + movddup %xmm5, %xmm13 + unpckhpd %xmm5, %xmm5 + movddup %xmm6, %xmm14 + 
unpckhpd %xmm6, %xmm6 + movddup %xmm7, %xmm15 + unpckhpd %xmm7, %xmm7 + + prefetcht0 (PREFETCH_W + 0) * SIZE(BO) + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm0, -14 * SIZE(BO) + movapd %xmm9, -12 * SIZE(BO) + movapd %xmm1, -10 * SIZE(BO) + + prefetcht0 (PREFETCH_W + 8) * SIZE(BO) + movapd %xmm10, -8 * SIZE(BO) + movapd %xmm2, -6 * SIZE(BO) + movapd %xmm11, -4 * SIZE(BO) + movapd %xmm3, -2 * SIZE(BO) + + prefetcht0 (PREFETCH_W + 16) * SIZE(BO) + movapd %xmm12, 0 * SIZE(BO) + movapd %xmm4, 2 * SIZE(BO) + movapd %xmm13, 4 * SIZE(BO) + movapd %xmm5, 6 * SIZE(BO) + + prefetcht0 (PREFETCH_W + 24) * SIZE(BO) + movapd %xmm14, 8 * SIZE(BO) + movapd %xmm6, 10 * SIZE(BO) + movapd %xmm15, 12 * SIZE(BO) + movapd %xmm7, 14 * SIZE(BO) + + subq $-16 * SIZE, B + subq $-32 * SIZE, BO + subq $1, %rax + jne .L02 + ALIGN_4 + +.L03: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L10 + ALIGN_4 + +.L04: + movapd -16 * SIZE(B), %xmm0 + movapd -14 * SIZE(B), %xmm1 + + movddup %xmm0, %xmm8 + unpckhpd %xmm0, %xmm0 + movddup %xmm1, %xmm9 + unpckhpd %xmm1, %xmm1 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm0, -14 * SIZE(BO) + movapd %xmm9, -12 * SIZE(BO) + movapd %xmm1, -10 * SIZE(BO) + + addq $4 * SIZE, B + addq $8 * SIZE, BO + subq $1, %rax + jne .L04 + ALIGN_4 + +.L10: + leaq (PREFETCH_R + 0) * SIZE(B), BB + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc +#ifndef RT + leaq (C, LDC, 4), C +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + prefetcht2 0 * SIZE(BB) + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + + prefetcht2 3 * SIZE(CO1) + pxor %xmm12, %xmm12 + prefetcht2 3 * SIZE(CO2) + pxor %xmm13, %xmm13 + prefetcht2 3 * SIZE(CO1, LDC, 2) + pxor %xmm14, %xmm14 + prefetcht2 3 * SIZE(CO2, LDC, 2) + pxor %xmm15, %xmm15 + + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + + subq $-8 * SIZE, BB + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm2, %xmm10 + movapd -16 * SIZE(AO), %xmm0 + addpd %xmm3, %xmm14 + movapd -16 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -14 * SIZE(AO), %xmm1 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm11 + movapd -14 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm15 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + movapd -12 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm9 + movapd -10 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm13 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movapd -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm14 + movapd -8 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -10 * SIZE(AO), %xmm1 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm11 + addpd %xmm5, %xmm15 + movapd 
-6 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + movapd -4 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm9 + addpd %xmm5, %xmm13 + movapd -2 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + movapd -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm14 + movapd 0 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -6 * SIZE(AO), %xmm1 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm11 + addpd %xmm5, %xmm15 + movapd 2 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + movapd 4 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm9 + addpd %xmm5, %xmm13 + movapd 6 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movapd -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm14 + movapd 8 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -2 * SIZE(AO), %xmm1 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm11 + addpd %xmm5, %xmm15 + movapd 10 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + addq $32 * SIZE, BO + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + movapd -20 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + subq $-16 * SIZE, AO + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm9 + addpd %xmm5, %xmm13 + movapd -18 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + subq $1, %rax + mulpd %xmm1, %xmm5 + + BRANCH + jg .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L19 + ALIGN_4 + +.L16: + movapd -16 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm10 + movapd -16 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm14 + movapd %xmm2, %xmm3 + movapd -14 * SIZE(AO), %xmm1 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm11 + movapd -14 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm15 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + movapd -12 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm12 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm9 + movapd -10 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm13 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addq $4 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + subq $1, %rax + BRANCH + jg .L16 + ALIGN_4 + +.L19: + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm14 + addpd %xmm4, %xmm11 + addpd %xmm5, %xmm15 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd %xmm10, %xmm2 + unpcklpd %xmm11, %xmm10 + unpckhpd %xmm11, %xmm2 + + movapd %xmm12, %xmm4 + unpcklpd %xmm13, %xmm12 + unpckhpd %xmm13, %xmm4 + + movapd %xmm14, %xmm6 + unpcklpd %xmm15, %xmm14 + unpckhpd %xmm15, %xmm6 + + movapd -16 * SIZE(B), %xmm9 + movapd -14 * SIZE(B), %xmm11 + movapd -12 * SIZE(B), %xmm13 + movapd -10 * SIZE(B), %xmm15 + movapd -8 * SIZE(B), %xmm1 + movapd -6 * SIZE(B), %xmm3 + movapd -4 * SIZE(B), %xmm5 + 
movapd -2 * SIZE(B), %xmm7 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm0, %xmm13 + subpd %xmm2, %xmm15 + subpd %xmm12, %xmm1 + subpd %xmm14, %xmm3 + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -12 * SIZE(AO), %xmm2 + movapd -10 * SIZE(AO), %xmm3 + + movapd -8 * SIZE(AO), %xmm4 + movapd -6 * SIZE(AO), %xmm5 + movapd -4 * SIZE(AO), %xmm6 + movapd -2 * SIZE(AO), %xmm7 + + subpd %xmm8, %xmm0 + subpd %xmm12, %xmm1 + subpd %xmm9, %xmm2 + subpd %xmm13, %xmm3 + subpd %xmm10, %xmm4 + subpd %xmm14, %xmm5 + subpd %xmm11, %xmm6 + subpd %xmm15, %xmm7 +#endif + +#ifdef LN + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 + mulpd %xmm8, %xmm7 + + movddup -2 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm1 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm3 + + movddup -3 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm13 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm15 + + movddup -4 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm9 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm11 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 + + movddup -7 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm13 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm15 + + movddup -8 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm9 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm11 + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 + + movddup -12 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + mulpd %xmm15, %xmm12 + subpd %xmm12, %xmm11 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 + + movddup -15 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + mulpd %xmm11, %xmm12 + subpd %xmm12, %xmm15 + + movddup -14 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm1 + mulpd %xmm11, %xmm12 + subpd %xmm12, %xmm3 + + movddup -13 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm5 + mulpd %xmm11, %xmm12 + subpd %xmm12, %xmm7 + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 + + movddup -10 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm1 + mulpd %xmm15, %xmm12 + subpd %xmm12, %xmm3 + + movddup -9 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm5 + mulpd %xmm15, %xmm12 + subpd %xmm12, %xmm7 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 + + movddup -5 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm5 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm7 + + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 + mulpd %xmm8, %xmm7 +#endif + + +#ifdef RN + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 + + movddup -15 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm2 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm3 + + movddup -14 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm4 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm5 + + movddup -13 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm6 + mulpd 
%xmm1, %xmm12 + subpd %xmm12, %xmm7 + + movddup -11 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 + + movddup -10 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm4 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm5 + + movddup -9 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm6 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm7 + + movddup -6 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm4 + mulpd %xmm8, %xmm5 + + movddup -5 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm6 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm7 + + movddup -1 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm6 + mulpd %xmm8, %xmm7 +#endif + +#ifdef RT + movddup -1 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm6 + mulpd %xmm8, %xmm7 + + movddup -2 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm4 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm5 + + movddup -3 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm2 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm3 + + movddup -4 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm0 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm1 + + movddup -6 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm4 + mulpd %xmm8, %xmm5 + + movddup -7 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm2 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm3 + + movddup -8 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm0 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm1 + + movddup -11 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 + + movddup -12 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm0 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm1 + + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm13, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movsd %xmm5, 3 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + movhpd %xmm1, 2 * SIZE(CO2) + movhpd %xmm5, 3 * SIZE(CO2) + + movsd %xmm11, 0 * SIZE(CO1, LDC, 2) + movsd %xmm15, 1 * SIZE(CO1, LDC, 2) + movsd %xmm3, 2 * SIZE(CO1, LDC, 2) + movsd %xmm7, 3 * SIZE(CO1, LDC, 2) + + movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) + movhpd %xmm3, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + movsd %xmm3, 2 * SIZE(CO2) + movhpd %xmm3, 3 * SIZE(CO2) + + movsd %xmm4, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) + movsd %xmm5, 2 * SIZE(CO1, LDC, 2) + movhpd %xmm5, 3 * SIZE(CO1, LDC, 2) + + movsd %xmm6, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) + movsd %xmm7, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(B) + movapd %xmm11, -14 * SIZE(B) + movapd %xmm13, -12 * SIZE(B) + movapd %xmm15, -10 * SIZE(B) + movapd %xmm1, -8 * SIZE(B) + movapd %xmm3, -6 * SIZE(B) + movapd %xmm5, -4 * SIZE(B) + movapd %xmm7, -2 * SIZE(B) + + movddup %xmm9, %xmm8 + SHUFPD_3 %xmm9, %xmm9 + movddup %xmm11, %xmm10 + SHUFPD_3 %xmm11, %xmm11 + movddup %xmm13, %xmm12 + SHUFPD_3 %xmm13, %xmm13 + movddup %xmm15, %xmm14 + SHUFPD_3 
%xmm15, %xmm15 + movddup %xmm1, %xmm0 + SHUFPD_3 %xmm1, %xmm1 + movddup %xmm3, %xmm2 + SHUFPD_3 %xmm3, %xmm3 + movddup %xmm5, %xmm4 + SHUFPD_3 %xmm5, %xmm5 + movddup %xmm7, %xmm6 + SHUFPD_3 %xmm7, %xmm7 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) + movapd %xmm10, -12 * SIZE(BO) + movapd %xmm11, -10 * SIZE(BO) + movapd %xmm12, -8 * SIZE(BO) + movapd %xmm13, -6 * SIZE(BO) + movapd %xmm14, -4 * SIZE(BO) + movapd %xmm15, -2 * SIZE(BO) + movapd %xmm0, 0 * SIZE(BO) + movapd %xmm1, 2 * SIZE(BO) + movapd %xmm2, 4 * SIZE(BO) + movapd %xmm3, 6 * SIZE(BO) + movapd %xmm4, 8 * SIZE(BO) + movapd %xmm5, 10 * SIZE(BO) + movapd %xmm6, 12 * SIZE(BO) + movapd %xmm7, 14 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm1, -14 * SIZE(AO) + movapd %xmm2, -12 * SIZE(AO) + movapd %xmm3, -10 * SIZE(AO) + movapd %xmm4, -8 * SIZE(AO) + movapd %xmm5, -6 * SIZE(AO) + movapd %xmm6, -4 * SIZE(AO) + movapd %xmm7, -2 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $16 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L11 + ALIGN_4 + +.L20: + testq $3, M + je .L39 + + testq $2, M + je .L30 + ALIGN_4 + +.L21: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + movapd -12 * SIZE(BO), %xmm4 + movapd -10 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm11 + + movapd -14 * SIZE(AO), %xmm0 + movapd -8 * SIZE(BO), %xmm2 + movapd -6 * SIZE(BO), %xmm3 + movapd -4 * SIZE(BO), %xmm4 + movapd -2 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm11 + + movapd -12 * SIZE(AO), %xmm0 + movapd 0 * SIZE(BO), %xmm2 + movapd 2 * SIZE(BO), %xmm3 + movapd 4 * SIZE(BO), %xmm4 + movapd 6 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm11 + + movapd -10 * SIZE(AO), %xmm0 + movapd 8 * SIZE(BO), %xmm2 + movapd 10 * SIZE(BO), %xmm3 + movapd 12 * SIZE(BO), %xmm4 + movapd 14 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm11 + + subq $ -8 * SIZE, AO + subq $-32 * SIZE, BO + subq $1, %rax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax 
+#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L29 + ALIGN_4 + +.L26: + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + movapd -12 * SIZE(BO), %xmm4 + movapd -10 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm11 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + jne .L26 + ALIGN_4 + +.L29: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd %xmm10, %xmm2 + unpcklpd %xmm11, %xmm10 + unpckhpd %xmm11, %xmm2 + + movapd -16 * SIZE(B), %xmm9 + movapd -14 * SIZE(B), %xmm11 + movapd -12 * SIZE(B), %xmm13 + movapd -10 * SIZE(B), %xmm15 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm0, %xmm13 + subpd %xmm2, %xmm15 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm2 + movapd -12 * SIZE(AO), %xmm4 + movapd -10 * SIZE(AO), %xmm6 + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm2 + subpd %xmm10, %xmm4 + subpd %xmm11, %xmm6 +#endif + +#ifdef LN + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 + + movddup -14 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + mulpd %xmm15, %xmm12 + subpd %xmm12, %xmm11 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 + + movddup -15 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + mulpd %xmm11, %xmm12 + subpd %xmm12, %xmm15 + + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm0 + + movddup -15 * SIZE(B), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + movddup -14 * SIZE(B), %xmm10 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm4 + movddup -13 * SIZE(B), %xmm11 + mulpd %xmm0, %xmm11 + subpd %xmm11, %xmm6 + + movddup -11 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm2 + movddup -10 * SIZE(B), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm4 + movddup -9 * SIZE(B), %xmm10 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm6 + + movddup -6 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm4 + + movddup -5 * SIZE(B), %xmm9 + mulpd %xmm4, %xmm9 + subpd %xmm9, %xmm6 + + movddup -1 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm6 +#endif + +#ifdef RT + movddup -1 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm6 + + movddup -2 * SIZE(B), %xmm9 + mulpd %xmm6, %xmm9 + subpd %xmm9, %xmm4 + movddup -3 * SIZE(B), %xmm10 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm2 + movddup -4 * SIZE(B), %xmm11 + mulpd %xmm6, %xmm11 + subpd %xmm11, %xmm0 + + movddup -6 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm4 + movddup -7 * SIZE(B), %xmm9 + mulpd %xmm4, %xmm9 + subpd %xmm9, %xmm2 + movddup -8 * SIZE(B), %xmm10 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm0 + + movddup -11 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm2 + movddup -12 * SIZE(B), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd 
%xmm9, 0 * SIZE(CO1) + movsd %xmm13, 1 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + + movsd %xmm11, 0 * SIZE(CO1, LDC, 2) + movsd %xmm15, 1 * SIZE(CO1, LDC, 2) + + movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + + movsd %xmm4, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) + + movsd %xmm6, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(B) + movapd %xmm11, -14 * SIZE(B) + movapd %xmm13, -12 * SIZE(B) + movapd %xmm15, -10 * SIZE(B) + + movddup %xmm9, %xmm8 + SHUFPD_3 %xmm9, %xmm9 + movddup %xmm11, %xmm10 + SHUFPD_3 %xmm11, %xmm11 + movddup %xmm13, %xmm12 + SHUFPD_3 %xmm13, %xmm13 + movddup %xmm15, %xmm14 + SHUFPD_3 %xmm15, %xmm15 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) + movapd %xmm10, -12 * SIZE(BO) + movapd %xmm11, -10 * SIZE(BO) + movapd %xmm12, -8 * SIZE(BO) + movapd %xmm13, -6 * SIZE(BO) + movapd %xmm14, -4 * SIZE(BO) + movapd %xmm15, -2 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm2, -14 * SIZE(AO) + movapd %xmm4, -12 * SIZE(AO) + movapd %xmm6, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + testq $1, M + je .L39 + ALIGN_4 + +.L31: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movsd -16 * SIZE(AO), %xmm0 + movsd -16 * SIZE(BO), %xmm2 + movsd -14 * SIZE(BO), %xmm3 + movsd -12 * SIZE(BO), %xmm4 + movsd -10 * SIZE(BO), %xmm5 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 + + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + addsd %xmm4, %xmm10 + addsd %xmm5, %xmm11 + + movsd -15 * SIZE(AO), %xmm0 + movsd -8 * SIZE(BO), %xmm2 + movsd -6 * SIZE(BO), %xmm3 + movsd -4 * SIZE(BO), %xmm4 + movsd -2 * SIZE(BO), %xmm5 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 + + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + addsd %xmm4, %xmm10 + addsd %xmm5, %xmm11 + + movsd -14 * SIZE(AO), %xmm0 + movsd 0 * SIZE(BO), %xmm2 + movsd 2 * SIZE(BO), %xmm3 + movsd 4 * SIZE(BO), %xmm4 + movsd 6 * SIZE(BO), %xmm5 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 + + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + addsd %xmm4, %xmm10 + addsd %xmm5, %xmm11 + + movsd -13 * SIZE(AO), %xmm0 + movsd 8 * SIZE(BO), %xmm2 + movsd 10 * SIZE(BO), %xmm3 + movsd 12 * SIZE(BO), %xmm4 + movsd 14 * SIZE(BO), %xmm5 + + mulsd %xmm0, %xmm2 + mulsd 
%xmm0, %xmm3 + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 + + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + addsd %xmm4, %xmm10 + addsd %xmm5, %xmm11 + + subq $ -4 * SIZE, AO + subq $-32 * SIZE, BO + subq $1, %rax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + movsd -16 * SIZE(AO), %xmm0 + movsd -16 * SIZE(BO), %xmm2 + movsd -14 * SIZE(BO), %xmm3 + movsd -12 * SIZE(BO), %xmm4 + movsd -10 * SIZE(BO), %xmm5 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 + + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + addsd %xmm4, %xmm10 + addsd %xmm5, %xmm11 + + addq $1 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + jg .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(B), %xmm12 + movsd -15 * SIZE(B), %xmm13 + movsd -14 * SIZE(B), %xmm14 + movsd -13 * SIZE(B), %xmm15 +#else + movsd -16 * SIZE(AO), %xmm12 + movsd -15 * SIZE(AO), %xmm13 + movsd -14 * SIZE(AO), %xmm14 + movsd -13 * SIZE(AO), %xmm15 +#endif + + subsd %xmm8, %xmm12 + subsd %xmm9, %xmm13 + subsd %xmm10, %xmm14 + subsd %xmm11, %xmm15 + +#ifdef LN + movsd -16 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm12 + mulsd %xmm8, %xmm13 + mulsd %xmm8, %xmm14 + mulsd %xmm8, %xmm15 +#endif + +#ifdef LT + movsd -16 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm12 + mulsd %xmm8, %xmm13 + mulsd %xmm8, %xmm14 + mulsd %xmm8, %xmm15 +#endif + +#ifdef RN + mulsd -16 * SIZE(B), %xmm12 + movlpd -15 * SIZE(B), %xmm9 + mulsd %xmm12, %xmm9 + subsd %xmm9, %xmm13 + movlpd -14 * SIZE(B), %xmm10 + mulsd %xmm12, %xmm10 + subsd %xmm10, %xmm14 + movlpd -13 * SIZE(B), %xmm11 + mulsd %xmm12, %xmm11 + subsd %xmm11, %xmm15 + + mulsd -11 * SIZE(B), %xmm13 + movlpd -10 * SIZE(B), %xmm9 + mulsd %xmm13, %xmm9 + subsd %xmm9, %xmm14 + movlpd -9 * SIZE(B), %xmm10 + mulsd %xmm13, %xmm10 + subsd %xmm10, %xmm15 + + mulsd -6 * SIZE(B), %xmm14 + movlpd -5 * SIZE(B), %xmm9 + mulsd %xmm14, %xmm9 + subsd %xmm9, %xmm15 + + mulsd -1 * SIZE(B), %xmm15 +#endif + +#ifdef RT + mulsd -1 * SIZE(B), %xmm15 + + movlpd -2 * SIZE(B), %xmm9 + mulsd %xmm15, %xmm9 + subsd %xmm9, %xmm14 + movlpd -3 * SIZE(B), %xmm10 + mulsd %xmm15, %xmm10 + subsd %xmm10, %xmm13 + movlpd -4 * SIZE(B), %xmm11 + mulsd %xmm15, %xmm11 + subsd %xmm11, %xmm12 + + mulsd -6 * SIZE(B), %xmm14 + + movlpd -7 * SIZE(B), %xmm9 + mulsd %xmm14, %xmm9 + subsd %xmm9, %xmm13 + movlpd -8 * SIZE(B), %xmm10 + mulsd %xmm14, %xmm10 + subsd %xmm10, %xmm12 + + mulsd -11 * SIZE(B), %xmm13 + + movlpd -12 * SIZE(B), %xmm9 + mulsd %xmm13, %xmm9 + subsd %xmm9, %xmm12 + + mulsd -16 * SIZE(B), %xmm12 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + movsd %xmm12, 0 * SIZE(CO1) + movsd %xmm13, 0 * SIZE(CO2) + movsd %xmm14, 0 * SIZE(CO1, LDC, 2) + movsd %xmm15, 0 * SIZE(CO2, LDC, 2) + +#if defined(LN) || defined(LT) + movsd %xmm12, -16 * SIZE(B) + movsd %xmm13, -15 * SIZE(B) + movsd %xmm14, -14 * SIZE(B) + movsd %xmm15, -13 * SIZE(B) + + movsd %xmm12, -16 * SIZE(BO) + movsd %xmm12, -15 * SIZE(BO) + movsd %xmm13, -14 * SIZE(BO) + movsd %xmm13, -13 * SIZE(BO) + movsd %xmm14, -12 * SIZE(BO) + movsd %xmm14, -11 * SIZE(BO) + movsd %xmm15, -10 * SIZE(BO) + movsd %xmm15, -9 * 
SIZE(BO) +#else + movsd %xmm12, -16 * SIZE(AO) + movsd %xmm13, -15 * SIZE(AO) + movsd %xmm14, -14 * SIZE(AO) + movsd %xmm15, -13 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L39: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + + decq J # j -- + jg .L01 + ALIGN_4 + +.L40: + testq $3, N + je .L999 + + testq $2, N + je .L80 + ALIGN_4 + +.L41: +/* Copying to Sub Buffer */ + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + leaq (, %rax, SIZE), %rax + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L43 + ALIGN_4 + +.L42: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + movddup -14 * SIZE(B), %xmm2 + movddup -13 * SIZE(B), %xmm3 + movddup -12 * SIZE(B), %xmm4 + movddup -11 * SIZE(B), %xmm5 + movddup -10 * SIZE(B), %xmm6 + movddup -9 * SIZE(B), %xmm7 + + movapd %xmm0, 0 * SIZE(BO) + movapd %xmm1, 2 * SIZE(BO) + movapd %xmm2, 4 * SIZE(BO) + movapd %xmm3, 6 * SIZE(BO) + movapd %xmm4, 8 * SIZE(BO) + movapd %xmm5, 10 * SIZE(BO) + movapd %xmm6, 12 * SIZE(BO) + movapd %xmm7, 14 * SIZE(BO) + + addq $8 * SIZE, B + addq $16 * SIZE, BO + subq $1, %rax + jne .L42 + ALIGN_4 + +.L43: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L50 + ALIGN_4 + +.L44: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + + movapd %xmm0, 0 * SIZE(BO) + movapd %xmm1, 2 * SIZE(BO) + + addq $2 * SIZE, B + addq $4 * SIZE, BO + decq %rax + jne .L44 + ALIGN_4 + +.L50: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc +#ifndef RT + leaq (C, LDC, 2), C +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + +#ifdef LN + prefetcht2 -3 * SIZE(CO1) + pxor %xmm12, %xmm12 + prefetcht2 -3 * SIZE(CO2) + pxor %xmm13, %xmm13 +#else + prefetcht2 3 * SIZE(CO1) + pxor %xmm12, %xmm12 + prefetcht2 3 * SIZE(CO2) + pxor %xmm13, %xmm13 +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L55 + ALIGN_4 + +.L52: + PREFETCH (PREFETCHSIZE 
+ 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + + movapd -16 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -14 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + addpd %xmm4, %xmm9 + addpd %xmm5, %xmm13 + + movapd -12 * SIZE(AO), %xmm0 + movapd -10 * SIZE(AO), %xmm1 + + movapd -12 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -10 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + addpd %xmm4, %xmm9 + addpd %xmm5, %xmm13 + + movapd -8 * SIZE(AO), %xmm0 + movapd -6 * SIZE(AO), %xmm1 + + movapd -8 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -6 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + addpd %xmm4, %xmm9 + addpd %xmm5, %xmm13 + + movapd -4 * SIZE(AO), %xmm0 + movapd -2 * SIZE(AO), %xmm1 + + movapd -4 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -2 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + addpd %xmm4, %xmm9 + addpd %xmm5, %xmm13 + + subq $-16 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L59 + ALIGN_4 + +.L56: + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + + movapd -16 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -14 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + addpd %xmm4, %xmm9 + addpd %xmm5, %xmm13 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L56 + ALIGN_4 + +.L59: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd %xmm12, %xmm4 + unpcklpd %xmm13, %xmm12 + unpckhpd %xmm13, %xmm4 + + movapd -16 * SIZE(B), %xmm9 + movapd -14 * SIZE(B), %xmm13 + movapd -12 * SIZE(B), %xmm1 + movapd -10 * SIZE(B), %xmm5 + + subpd %xmm8, %xmm9 + subpd %xmm0, %xmm13 + subpd %xmm12, %xmm1 + subpd %xmm4, %xmm5 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -12 * SIZE(AO), %xmm2 + movapd -10 * SIZE(AO), %xmm3 + + subpd %xmm8, %xmm0 + subpd %xmm12, %xmm1 + subpd %xmm9, %xmm2 + subpd %xmm13, %xmm3 +#endif + +#ifdef LN + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 + movddup -2 * SIZE(AO), %xmm10 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm1 + movddup -3 * SIZE(AO), %xmm12 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm13 + movddup -4 * SIZE(AO), %xmm14 + mulpd %xmm5, %xmm14 + subpd %xmm14, %xmm9 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + movddup -7 * SIZE(AO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm13 + movddup -8 * SIZE(AO), %xmm12 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm9 + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + movddup -12 * 
SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + movddup -14 * SIZE(AO), %xmm12 + mulpd %xmm9, %xmm12 + subpd %xmm12, %xmm1 + movddup -13 * SIZE(AO), %xmm14 + mulpd %xmm9, %xmm14 + subpd %xmm14, %xmm5 + + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + + movddup -10 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm1 + movddup -9 * SIZE(AO), %xmm12 + mulpd %xmm13, %xmm12 + subpd %xmm12, %xmm5 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + movddup -5 * SIZE(AO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm5 + + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 + + movddup -15 * SIZE(B), %xmm9 + movapd %xmm9, %xmm10 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm3 + + movddup -13 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 +#endif + +#ifdef RT + movddup -13 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 + + movddup -14 * SIZE(B), %xmm9 + movapd %xmm9, %xmm10 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + mulpd %xmm3, %xmm10 + subpd %xmm10, %xmm1 + + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm13, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movsd %xmm5, 3 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + movhpd %xmm1, 2 * SIZE(CO2) + movhpd %xmm5, 3 * SIZE(CO2) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + movsd %xmm3, 2 * SIZE(CO2) + movhpd %xmm3, 3 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(B) + movapd %xmm13, -14 * SIZE(B) + movapd %xmm1, -12 * SIZE(B) + movapd %xmm5, -10 * SIZE(B) + + movddup %xmm9, %xmm8 + SHUFPD_3 %xmm9, %xmm9 + movddup %xmm13, %xmm12 + SHUFPD_3 %xmm13, %xmm13 + movddup %xmm1, %xmm0 + SHUFPD_3 %xmm1, %xmm1 + movddup %xmm5, %xmm4 + SHUFPD_3 %xmm5, %xmm5 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) + movapd %xmm12, -12 * SIZE(BO) + movapd %xmm13, -10 * SIZE(BO) + movapd %xmm0, -8 * SIZE(BO) + movapd %xmm1, -6 * SIZE(BO) + movapd %xmm4, -4 * SIZE(BO) + movapd %xmm5, -2 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm1, -14 * SIZE(AO) + movapd %xmm2, -12 * SIZE(AO) + movapd %xmm3, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L51 + ALIGN_4 + +.L60: + testq $2, M + je .L70 + ALIGN_4 + +.L61: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || 
defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L65 + ALIGN_4 + +.L62: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + movapd -12 * SIZE(BO), %xmm4 + movapd -10 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm11 + + movapd -12 * SIZE(AO), %xmm0 + movapd -10 * SIZE(AO), %xmm1 + movapd -8 * SIZE(BO), %xmm2 + movapd -6 * SIZE(BO), %xmm3 + movapd -4 * SIZE(BO), %xmm4 + movapd -2 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm11 + + subq $ -8 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jne .L62 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L69 + ALIGN_4 + +.L66: + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L66 + ALIGN_4 + +.L69: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd -16 * SIZE(B), %xmm9 + movapd -14 * SIZE(B), %xmm13 + + subpd %xmm8, %xmm9 + subpd %xmm0, %xmm13 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm2 + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm2 +#endif + + +#ifdef LN + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + + movddup -14 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm0 + + movddup -15 * SIZE(B), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + + movddup -13 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm2 +#endif + +#ifdef RT + movddup -13 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm2 + + movddup -14 * SIZE(B), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm13, 1 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(B) + movapd %xmm13, -14 * SIZE(B) + + movddup 
%xmm9, %xmm8 + SHUFPD_3 %xmm9, %xmm9 + movddup %xmm13, %xmm12 + SHUFPD_3 %xmm13, %xmm13 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) + movapd %xmm12, -12 * SIZE(BO) + movapd %xmm13, -10 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm2, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L70: + testq $1, M + je .L79 + ALIGN_4 + +.L71: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L75 + ALIGN_4 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movsd -16 * SIZE(AO), %xmm0 + movsd -15 * SIZE(AO), %xmm1 + movsd -16 * SIZE(BO), %xmm2 + movsd -14 * SIZE(BO), %xmm3 + movsd -12 * SIZE(BO), %xmm4 + movsd -10 * SIZE(BO), %xmm5 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + mulsd %xmm1, %xmm4 + mulsd %xmm1, %xmm5 + + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + addsd %xmm4, %xmm10 + addsd %xmm5, %xmm11 + + movsd -14 * SIZE(AO), %xmm0 + movsd -13 * SIZE(AO), %xmm1 + movsd -8 * SIZE(BO), %xmm2 + movsd -6 * SIZE(BO), %xmm3 + movsd -4 * SIZE(BO), %xmm4 + movsd -2 * SIZE(BO), %xmm5 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + mulsd %xmm1, %xmm4 + mulsd %xmm1, %xmm5 + + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + addsd %xmm4, %xmm10 + addsd %xmm5, %xmm11 + + subq $ -4 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + movsd -16 * SIZE(AO), %xmm0 + movsd -16 * SIZE(BO), %xmm2 + movsd -14 * SIZE(BO), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L76 + ALIGN_4 + +.L78: + addsd %xmm10, %xmm8 + addsd %xmm11, %xmm9 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(B), %xmm12 + movsd -15 * SIZE(B), %xmm13 +#else + movsd -16 * SIZE(AO), %xmm12 + movsd -15 * SIZE(AO), %xmm13 +#endif + + subsd %xmm8, %xmm12 + subsd %xmm9, %xmm13 + +#ifdef LN + movsd -16 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm12 + mulsd %xmm8, %xmm13 +#endif + +#ifdef LT + movsd -16 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm12 + mulsd %xmm8, %xmm13 +#endif + +#ifdef RN + mulsd -16 * SIZE(B), %xmm12 + movsd -15 * SIZE(B), %xmm9 + mulsd %xmm12, %xmm9 + subsd %xmm9, %xmm13 + + mulsd -13 * SIZE(B), %xmm13 +#endif + +#ifdef RT + mulsd -13 * SIZE(B), 
%xmm13 + + movlpd -14 * SIZE(B), %xmm9 + mulsd %xmm13, %xmm9 + subsd %xmm9, %xmm12 + + mulsd -16 * SIZE(B), %xmm12 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + movsd %xmm12, 0 * SIZE(CO1) + movsd %xmm13, 0 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movsd %xmm12, -16 * SIZE(B) + movsd %xmm13, -15 * SIZE(B) + + movsd %xmm12, -16 * SIZE(BO) + movsd %xmm12, -15 * SIZE(BO) + movsd %xmm13, -14 * SIZE(BO) + movsd %xmm13, -13 * SIZE(BO) +#else + movsd %xmm12, -16 * SIZE(AO) + movsd %xmm13, -15 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $2 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L79: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L80: + testq $1, N + je .L999 + ALIGN_4 + +.L81: +/* Copying to Sub Buffer */ + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + leaq (, %rax, SIZE), %rax + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + jle .L83 + ALIGN_4 + +.L82: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + movddup -14 * SIZE(B), %xmm2 + movddup -13 * SIZE(B), %xmm3 + movddup -12 * SIZE(B), %xmm4 + movddup -11 * SIZE(B), %xmm5 + movddup -10 * SIZE(B), %xmm6 + movddup -9 * SIZE(B), %xmm7 + + movapd %xmm0, 0 * SIZE(BO) + movapd %xmm1, 2 * SIZE(BO) + movapd %xmm2, 4 * SIZE(BO) + movapd %xmm3, 6 * SIZE(BO) + movapd %xmm4, 8 * SIZE(BO) + movapd %xmm5, 10 * SIZE(BO) + movapd %xmm6, 12 * SIZE(BO) + movapd %xmm7, 14 * SIZE(BO) + + addq $ 8 * SIZE, B + subq $-16 * SIZE, BO + subq $1, %rax + jne .L82 + ALIGN_4 + +.L83: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax + BRANCH + jle .L90 + ALIGN_4 + +.L84: + movddup -16 * SIZE(B), %xmm0 + + movapd %xmm0, 0 * SIZE(BO) + + addq $1 * SIZE, B + addq $2 * SIZE, BO + subq $1, %rax + jne .L84 + ALIGN_4 + +.L90: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + subq LDC, C +#endif + + movq C, CO1 # coffset1 = c +#ifndef RT + addq LDC, C +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L100 + ALIGN_4 + +.L91: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 +#ifdef LN + prefetcht2 -3 * SIZE(CO1) +#else + prefetcht2 3 * SIZE(CO1) +#endif + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax 
+#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L95 + ALIGN_4 + +.L92: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -16 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + + movapd -12 * SIZE(AO), %xmm0 + movapd -10 * SIZE(AO), %xmm1 + movapd -14 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm11 + + movapd -8 * SIZE(AO), %xmm0 + movapd -6 * SIZE(AO), %xmm1 + movapd -12 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + + movapd -4 * SIZE(AO), %xmm0 + movapd -2 * SIZE(AO), %xmm1 + movapd -10 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm11 + + subq $-16 * SIZE, AO + subq $ -8 * SIZE, BO + subq $1, %rax + jne .L92 + ALIGN_4 + +.L95: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L99 + ALIGN_4 + +.L96: + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + + movapd -16 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + subq $1, %rax + jg .L96 + ALIGN_4 + +.L99: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm10 + movapd -14 * SIZE(B), %xmm11 + + subpd %xmm8, %xmm10 + subpd %xmm9, %xmm11 +#else + movapd -16 * SIZE(AO), %xmm10 + movapd -14 * SIZE(AO), %xmm11 + + subpd %xmm8, %xmm10 + subpd %xmm9, %xmm11 +#endif + +#ifdef LN + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movapd %xmm11, %xmm9 + unpckhpd %xmm9, %xmm9 + + movsd -1 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm9 + + movsd -2 * SIZE(AO), %xmm13 + mulsd %xmm9, %xmm13 + subsd %xmm13, %xmm11 + movsd -3 * SIZE(AO), %xmm14 + mulsd %xmm9, %xmm14 + subsd %xmm14, %xmm8 + movsd -4 * SIZE(AO), %xmm15 + mulsd %xmm9, %xmm15 + subsd %xmm15, %xmm10 + + movsd -6 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm11 + + movsd -7 * SIZE(AO), %xmm13 + mulsd %xmm11, %xmm13 + subsd %xmm13, %xmm8 + movsd -8 * SIZE(AO), %xmm14 + mulsd %xmm11, %xmm14 + subsd %xmm14, %xmm10 + + movsd -11 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + movsd -12 * SIZE(AO), %xmm13 + mulsd %xmm8, %xmm13 + subsd %xmm13, %xmm10 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + unpcklpd %xmm8, %xmm10 + unpcklpd %xmm9, %xmm11 +#endif + +#ifdef LT + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movapd %xmm11, %xmm9 + unpckhpd %xmm9, %xmm9 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + movsd -15 * SIZE(AO), %xmm13 + mulsd %xmm10, %xmm13 + subsd %xmm13, %xmm8 + movsd -14 * SIZE(AO), %xmm14 + mulsd %xmm10, %xmm14 + subsd %xmm14, %xmm11 + movsd -13 * SIZE(AO), %xmm15 + mulsd %xmm10, %xmm15 + subsd %xmm15, %xmm9 + + movsd -11 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + movsd -10 * SIZE(AO), %xmm13 + mulsd %xmm8, %xmm13 + subsd %xmm13, %xmm11 + movsd -9 * SIZE(AO), %xmm14 + mulsd %xmm8, %xmm14 + subsd %xmm14, %xmm9 + + movsd -6 * 
SIZE(AO), %xmm12 + mulsd %xmm12, %xmm11 + + movsd -5 * SIZE(AO), %xmm13 + mulsd %xmm11, %xmm13 + subsd %xmm13, %xmm9 + + movsd -1 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm9 + + unpcklpd %xmm8, %xmm10 + unpcklpd %xmm9, %xmm11 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm10 + mulpd %xmm8, %xmm11 +#endif + +#ifdef RT + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm10 + mulpd %xmm8, %xmm11 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) + movsd %xmm11, 2 * SIZE(CO1) + movhpd %xmm11, 3 * SIZE(CO1) +#else + movsd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) + movsd %xmm11, 2 * SIZE(CO1) + movhpd %xmm11, 3 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm10, -16 * SIZE(B) + movapd %xmm11, -14 * SIZE(B) + + movddup %xmm10, %xmm8 + SHUFPD_3 %xmm10, %xmm10 + movddup %xmm11, %xmm9 + SHUFPD_3 %xmm11, %xmm11 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm10, -14 * SIZE(BO) + movapd %xmm9, -12 * SIZE(BO) + movapd %xmm11, -10 * SIZE(BO) +#else + movapd %xmm10, -16 * SIZE(AO) + movapd %xmm11, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L91 + ALIGN_4 + +.L100: + testq $2, M + je .L110 + ALIGN_4 + +.L101: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L105 + ALIGN_4 + +.L102: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + + movapd -12 * SIZE(AO), %xmm0 + movapd -10 * SIZE(AO), %xmm1 + movapd -12 * SIZE(BO), %xmm2 + movapd -10 * SIZE(BO), %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm11 + + subq $-8 * SIZE, AO + subq $-8 * SIZE, BO + subq $1, %rax + jne .L102 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L109 + ALIGN_4 + +.L106: + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm8 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + subq $1, %rax + jg .L106 + ALIGN_4 + +.L109: + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 + addpd %xmm10, %xmm8 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd 
-16 * SIZE(B), %xmm10 + subpd %xmm8, %xmm10 +#else + movapd -16 * SIZE(AO), %xmm10 + subpd %xmm8, %xmm10 +#endif + +#ifdef LN + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movsd -13 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + movsd -14 * SIZE(AO), %xmm13 + mulsd %xmm8, %xmm13 + subsd %xmm13, %xmm10 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + unpcklpd %xmm8, %xmm10 +#endif + +#ifdef LT + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + movsd -15 * SIZE(AO), %xmm13 + mulsd %xmm10, %xmm13 + subsd %xmm13, %xmm8 + + movsd -13 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + unpcklpd %xmm8, %xmm10 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm10 +#endif + +#ifdef RT + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm10 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) +#else + movsd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm10, -16 * SIZE(B) + + movddup %xmm10, %xmm8 + SHUFPD_3 %xmm10, %xmm10 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm10, -14 * SIZE(BO) +#else + movapd %xmm10, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $2 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L110: + testq $1, M + je .L119 + ALIGN_4 + +.L111: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L115 + ALIGN_4 + +.L112: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movsd -16 * SIZE(AO), %xmm0 + movsd -15 * SIZE(AO), %xmm1 + movsd -16 * SIZE(BO), %xmm2 + movsd -14 * SIZE(BO), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm1, %xmm3 + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + + movsd -14 * SIZE(AO), %xmm0 + movsd -13 * SIZE(AO), %xmm1 + movsd -12 * SIZE(BO), %xmm2 + movsd -10 * SIZE(BO), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm1, %xmm3 + addsd %xmm2, %xmm10 + addsd %xmm3, %xmm11 + + subq $-4 * SIZE, AO + subq $-8 * SIZE, BO + subq $1, %rax + jne .L112 + ALIGN_4 + +.L115: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + movsd -16 * SIZE(AO), %xmm0 + movsd -16 * SIZE(BO), %xmm2 + + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + + addq $1 * SIZE, AO + addq $2 * SIZE, BO + subq $1, %rax + jg .L116 + ALIGN_4 + +.L118: + addsd %xmm10, %xmm8 + addsd %xmm11, %xmm9 + addsd %xmm9, %xmm8 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), B + leaq (BO, 
%rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(B), %xmm10 + subsd %xmm8, %xmm10 +#else + movsd -16 * SIZE(AO), %xmm10 + subsd %xmm8, %xmm10 +#endif + +#ifdef LN + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 +#endif + +#ifdef LT + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 +#endif + +#ifdef RN + movsd -16 * SIZE(B), %xmm8 + mulsd %xmm8, %xmm10 +#endif + +#ifdef RT + movsd -16 * SIZE(B), %xmm8 + mulsd %xmm8, %xmm10 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm10, 0 * SIZE(CO1) +#else + movsd %xmm10, 0 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movsd %xmm10, -16 * SIZE(B) + + movlpd %xmm10, -16 * SIZE(BO) + movlpd %xmm10, -15 * SIZE(BO) +#else + movsd %xmm10, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $1 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L119: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 1), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 1), B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + + +.L999: + movq %r15, %rsp + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/trsm_kernel_LT_4x4_penryn.S b/kernel/x86_64/trsm_kernel_LT_4x4_penryn.S new file mode 100644 index 0000000000..77fc0c5c0d --- /dev/null +++ b/kernel/x86_64/trsm_kernel_LT_4x4_penryn.S @@ -0,0 +1,3424 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define KK %rdx +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define J 56(%rsp) +#define KKK 64(%rsp) +#define AORIG 72(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define J 232(%rsp) +#define KKK 240(%rsp) +#define AORIG 248(%rsp) + +#endif + +#define PREFETCH_R (8 * 4 + 0) +#define PREFETCHSIZE (8 * 21 + 6) +#define PREFETCH prefetcht0 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C +#endif + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + leaq (, LDC, SIZE), LDC + + movq KK, OFFSET + negq KK + +#ifdef LN + leaq (, M, SIZE), %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + leaq (, N, SIZE), %rax + imulq K, %rax + addq %rax, B + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + movq N, J + sarq $2, J + NOBRANCH + jle .L40 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 4), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + movq K, %rax + salq $BASE_SHIFT + 2, %rax + leaq (B, %rax), BB + +#ifdef LT + movq OFFSET, %rax + 
movq %rax, KK +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + NOBRANCH + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + prefetcht2 -16 * SIZE(BB) + subq $-8 * SIZE, BB + + movaps -16 * SIZE(AO), %xmm0 + pxor %xmm3, %xmm3 + movaps -14 * SIZE(AO), %xmm1 + pxor %xmm4, %xmm4 + movaps -16 * SIZE(BO), %xmm2 + + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + +#ifdef LN + prefetcht0 -4 * SIZE(CO1) + movapd %xmm4, %xmm8 + movapd %xmm4, %xmm9 + prefetcht0 -4 * SIZE(CO2) + movapd %xmm4, %xmm10 + movapd %xmm4, %xmm11 + + prefetcht0 -4 * SIZE(CO1, LDC, 2) + movapd %xmm4, %xmm12 + movapd %xmm4, %xmm13 + prefetcht0 -4 * SIZE(CO2, LDC, 2) + movapd %xmm4, %xmm14 + movapd %xmm4, %xmm15 +#else + prefetcht0 3 * SIZE(CO1) + movapd %xmm4, %xmm8 + movapd %xmm4, %xmm9 + prefetcht0 3 * SIZE(CO2) + movapd %xmm4, %xmm10 + movapd %xmm4, %xmm11 + + prefetcht0 3 * SIZE(CO1, LDC, 2) + movapd %xmm4, %xmm12 + movapd %xmm4, %xmm13 + prefetcht0 3 * SIZE(CO2, LDC, 2) + movapd %xmm4, %xmm14 + movapd %xmm4, %xmm15 +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + addpd %xmm3, %xmm11 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movaps -14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps -12 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm11 + movaps -10 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps -8 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -6 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm11 + movaps -6 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps -4 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -2 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm11 + movaps -2 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, 
%xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps 0 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps 0 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 2 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm11 + movaps 2 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps 4 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps 4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 6 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm11 + movaps 6 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps 8 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps 8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 10 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm11 + movaps 10 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + PREFETCH (PREFETCHSIZE + 24) * SIZE(AO) + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps 12 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps 12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 14 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm11 + movaps 14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps 16 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + subq $-32 * SIZE, AO + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -16 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -14 * SIZE(AO), %xmm1 + + subq $-32 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + addpd %xmm3, %xmm11 + movaps -14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps -12 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps 
%xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_3 + +.L18: +#if defined(LN) || defined(RT) + movq KK, %rax + subq $4, %rax + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#endif + + addpd %xmm3, %xmm11 + addpd %xmm4, %xmm15 + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + + movapd %xmm8, %xmm0 + movsd %xmm9, %xmm8 + movsd %xmm0, %xmm9 + + movapd %xmm10, %xmm0 + movsd %xmm11, %xmm10 + movsd %xmm0, %xmm11 + + movapd %xmm12, %xmm0 + movsd %xmm13, %xmm12 + movsd %xmm0, %xmm13 + + movapd %xmm14, %xmm0 + movsd %xmm15, %xmm14 + movsd %xmm0, %xmm15 + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd %xmm10, %xmm2 + unpcklpd %xmm11, %xmm10 + unpckhpd %xmm11, %xmm2 + + movapd %xmm12, %xmm4 + unpcklpd %xmm13, %xmm12 + unpckhpd %xmm13, %xmm4 + + movapd %xmm14, %xmm6 + unpcklpd %xmm15, %xmm14 + unpckhpd %xmm15, %xmm6 + + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm11 + movapd -12 * SIZE(BO), %xmm13 + movapd -10 * SIZE(BO), %xmm15 + movapd -8 * SIZE(BO), %xmm1 + movapd -6 * SIZE(BO), %xmm3 + movapd -4 * SIZE(BO), %xmm5 + movapd -2 * SIZE(BO), %xmm7 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm0, %xmm13 + subpd %xmm2, %xmm15 + subpd %xmm12, %xmm1 + subpd %xmm14, %xmm3 + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -12 * SIZE(AO), %xmm2 + movapd -10 * SIZE(AO), %xmm3 + + movapd -8 * SIZE(AO), %xmm4 + movapd -6 * SIZE(AO), %xmm5 + movapd -4 * SIZE(AO), %xmm6 + movapd -2 * SIZE(AO), %xmm7 + + subpd %xmm8, %xmm0 + subpd %xmm12, %xmm1 + subpd %xmm9, %xmm2 + subpd %xmm13, %xmm3 + subpd %xmm10, %xmm4 + subpd %xmm14, %xmm5 + subpd %xmm11, %xmm6 + subpd %xmm15, %xmm7 +#endif + +#ifdef LN + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 + mulpd %xmm8, %xmm7 + + movddup -2 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm1 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm3 + + movddup -3 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm13 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm15 + + movddup -4 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm9 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm11 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 + + movddup -7 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm13 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm15 + + movddup -8 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm9 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm11 + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 + + movddup -12 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + mulpd %xmm15, %xmm12 + subpd %xmm12, %xmm11 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 + + movddup -15 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + mulpd %xmm11, %xmm12 + subpd %xmm12, %xmm15 + + movddup -14 * 
SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm1 + mulpd %xmm11, %xmm12 + subpd %xmm12, %xmm3 + + movddup -13 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm5 + mulpd %xmm11, %xmm12 + subpd %xmm12, %xmm7 + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 + + movddup -10 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm1 + mulpd %xmm15, %xmm12 + subpd %xmm12, %xmm3 + + movddup -9 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm5 + mulpd %xmm15, %xmm12 + subpd %xmm12, %xmm7 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 + + movddup -5 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm5 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm7 + + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 + mulpd %xmm8, %xmm7 +#endif + + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 + + movddup -15 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm2 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm3 + + movddup -14 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm4 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm5 + + movddup -13 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm6 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm7 + + movddup -11 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 + + movddup -10 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm4 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm5 + + movddup -9 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm6 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm7 + + movddup -6 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm4 + mulpd %xmm8, %xmm5 + + movddup -5 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm6 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm7 + + movddup -1 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm6 + mulpd %xmm8, %xmm7 +#endif + +#ifdef RT + movddup -1 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm6 + mulpd %xmm8, %xmm7 + + movddup -2 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm4 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm5 + + movddup -3 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm2 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm3 + + movddup -4 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm0 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm1 + + movddup -6 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm4 + mulpd %xmm8, %xmm5 + + movddup -7 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm2 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm3 + + movddup -8 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm0 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm1 + + movddup -11 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 + + movddup -12 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm0 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm1 + + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm13, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movsd 
%xmm5, 3 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + movhpd %xmm1, 2 * SIZE(CO2) + movhpd %xmm5, 3 * SIZE(CO2) + + movsd %xmm11, 0 * SIZE(CO1, LDC, 2) + movsd %xmm15, 1 * SIZE(CO1, LDC, 2) + movsd %xmm3, 2 * SIZE(CO1, LDC, 2) + movsd %xmm7, 3 * SIZE(CO1, LDC, 2) + + movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) + movhpd %xmm3, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + movsd %xmm3, 2 * SIZE(CO2) + movhpd %xmm3, 3 * SIZE(CO2) + + movsd %xmm4, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) + movsd %xmm5, 2 * SIZE(CO1, LDC, 2) + movhpd %xmm5, 3 * SIZE(CO1, LDC, 2) + + movsd %xmm6, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) + movsd %xmm7, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) + movapd %xmm11, -14 * SIZE(BO) + movapd %xmm13, -12 * SIZE(BO) + movapd %xmm15, -10 * SIZE(BO) + movapd %xmm1, -8 * SIZE(BO) + movapd %xmm3, -6 * SIZE(BO) + movapd %xmm5, -4 * SIZE(BO) + movapd %xmm7, -2 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm1, -14 * SIZE(AO) + movapd %xmm2, -12 * SIZE(AO) + movapd %xmm3, -10 * SIZE(AO) + movapd %xmm4, -8 * SIZE(AO) + movapd %xmm5, -6 * SIZE(AO) + movapd %xmm6, -4 * SIZE(AO) + movapd %xmm7, -2 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + BRANCH + jg .L11 + ALIGN_4 + +.L20: + testq $2, M + BRANCH + jle .L30 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + pxor %xmm3, %xmm3 + movaps -16 * SIZE(BO), %xmm2 + pxor %xmm5, %xmm5 + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_4 + +.L22: + addpd %xmm3, %xmm11 + movaps -14 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd %xmm0, %xmm2 + addpd %xmm5, %xmm10 + mulpd %xmm0, %xmm7 + + addpd %xmm2, %xmm9 + movaps -12 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + addpd %xmm7, %xmm8 + mulpd %xmm0, %xmm5 + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm3, %xmm11 + movaps -10 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + addpd %xmm5, %xmm10 + mulpd %xmm0, %xmm7 + + addpd %xmm2, %xmm9 + movaps -8 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + addpd %xmm7, %xmm8 + mulpd %xmm0, %xmm5 + movaps -12 * SIZE(AO), %xmm0 + + addpd %xmm3, %xmm11 + movaps -6 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + addpd %xmm5, %xmm10 + mulpd %xmm0, %xmm7 + + addpd %xmm2, %xmm9 + movaps -4 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, 
%xmm3 + addpd %xmm7, %xmm8 + mulpd %xmm0, %xmm5 + movaps -10 * SIZE(AO), %xmm0 + + addpd %xmm3, %xmm11 + movaps -2 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + addpd %xmm5, %xmm10 + mulpd %xmm0, %xmm7 + subq $ -8 * SIZE, AO + + addpd %xmm2, %xmm9 + movaps 0 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + addpd %xmm7, %xmm8 + mulpd %xmm0, %xmm5 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, BO + subq $1, %rax + BRANCH + jg .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + addpd %xmm3, %xmm11 + movaps -14 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + addpd %xmm5, %xmm10 + mulpd %xmm0, %xmm7 + + addpd %xmm2, %xmm9 + movaps -12 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + addpd %xmm7, %xmm8 + mulpd %xmm0, %xmm5 + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_4 + +.L28: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + + addpd %xmm3, %xmm11 + addpd %xmm5, %xmm10 + + movapd %xmm8, %xmm0 + movsd %xmm9, %xmm8 + movsd %xmm0, %xmm9 + + movapd %xmm10, %xmm0 + movsd %xmm11, %xmm10 + movsd %xmm0, %xmm11 + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd %xmm10, %xmm2 + unpcklpd %xmm11, %xmm10 + unpckhpd %xmm11, %xmm2 + + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm11 + movapd -12 * SIZE(BO), %xmm13 + movapd -10 * SIZE(BO), %xmm15 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm0, %xmm13 + subpd %xmm2, %xmm15 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm2 + movapd -12 * SIZE(AO), %xmm4 + movapd -10 * SIZE(AO), %xmm6 + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm2 + subpd %xmm10, %xmm4 + subpd %xmm11, %xmm6 +#endif + +#ifdef LN + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 + + movddup -14 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + mulpd %xmm15, %xmm12 + subpd %xmm12, %xmm11 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 + + movddup -15 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + mulpd %xmm11, %xmm12 + subpd %xmm12, %xmm15 + + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + movddup -14 * SIZE(BO), %xmm10 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm4 + movddup -13 * SIZE(BO), %xmm11 + mulpd %xmm0, %xmm11 + subpd %xmm11, %xmm6 + + movddup -11 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + movddup -10 * SIZE(BO), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm4 + movddup -9 * SIZE(BO), %xmm10 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm6 + + movddup -6 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm4 + + movddup -5 * SIZE(BO), %xmm9 + mulpd %xmm4, %xmm9 + subpd %xmm9, %xmm6 + + movddup -1 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm6 +#endif + +#ifdef RT + movddup -1 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm6 + + movddup -2 * SIZE(BO), %xmm9 + mulpd 
%xmm6, %xmm9 + subpd %xmm9, %xmm4 + movddup -3 * SIZE(BO), %xmm10 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm2 + movddup -4 * SIZE(BO), %xmm11 + mulpd %xmm6, %xmm11 + subpd %xmm11, %xmm0 + + movddup -6 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm4 + movddup -7 * SIZE(BO), %xmm9 + mulpd %xmm4, %xmm9 + subpd %xmm9, %xmm2 + movddup -8 * SIZE(BO), %xmm10 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm0 + + movddup -11 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + movddup -12 * SIZE(BO), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm13, 1 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + + movsd %xmm11, 0 * SIZE(CO1, LDC, 2) + movsd %xmm15, 1 * SIZE(CO1, LDC, 2) + + movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + + movsd %xmm4, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) + + movsd %xmm6, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) + movapd %xmm11, -14 * SIZE(BO) + movapd %xmm13, -12 * SIZE(BO) + movapd %xmm15, -10 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm2, -14 * SIZE(AO) + movapd %xmm4, -12 * SIZE(AO) + movapd %xmm6, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + testq $1, M + BRANCH + jle .L39 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + movsd -16 * SIZE(AO), %xmm0 + movaps -16 * SIZE(BO), %xmm2 + movaps -14 * SIZE(BO), %xmm3 + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movsd -15 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm8 + movaps -12 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm9 + movaps -10 * SIZE(BO), %xmm3 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movsd -14 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm10 + movaps -8 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm11 + movaps -6 * SIZE(BO), %xmm3 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movsd -13 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm8 + movaps -4 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm9 + movaps -2 * SIZE(BO), %xmm3 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movsd -12 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm10 + movaps 0 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm11 + movaps 2 * SIZE(BO), %xmm3 + + subq $ -4 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + BRANCH + jg .L32 + ALIGN_4 + +.L35: +#if 
defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movsd -15 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm8 + movaps -12 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm9 + movaps -10 * SIZE(BO), %xmm3 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#endif + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#if defined(LN) || defined(LT) + movaps -16 * SIZE(BO), %xmm12 + movaps -14 * SIZE(BO), %xmm13 +#else + movaps -16 * SIZE(AO), %xmm12 + movaps -14 * SIZE(AO), %xmm13 +#endif + + subpd %xmm8, %xmm12 + subpd %xmm9, %xmm13 + +#if defined(RN) || defined(RT) + movhlps %xmm13, %xmm15 + movsd %xmm13, %xmm14 + movhlps %xmm12, %xmm13 + movsd %xmm12, %xmm12 +#endif + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm8 + + mulpd %xmm8, %xmm12 + mulpd %xmm8, %xmm13 +#endif + +#ifdef RN + mulsd -16 * SIZE(BO), %xmm12 + movlpd -15 * SIZE(BO), %xmm9 + mulsd %xmm12, %xmm9 + subsd %xmm9, %xmm13 + movlpd -14 * SIZE(BO), %xmm10 + mulsd %xmm12, %xmm10 + subsd %xmm10, %xmm14 + movlpd -13 * SIZE(BO), %xmm11 + mulsd %xmm12, %xmm11 + subsd %xmm11, %xmm15 + + mulsd -11 * SIZE(BO), %xmm13 + movlpd -10 * SIZE(BO), %xmm9 + mulsd %xmm13, %xmm9 + subsd %xmm9, %xmm14 + movlpd -9 * SIZE(BO), %xmm10 + mulsd %xmm13, %xmm10 + subsd %xmm10, %xmm15 + + mulsd -6 * SIZE(BO), %xmm14 + movlpd -5 * SIZE(BO), %xmm9 + mulsd %xmm14, %xmm9 + subsd %xmm9, %xmm15 + + mulsd -1 * SIZE(BO), %xmm15 +#endif + +#ifdef RT + mulsd -1 * SIZE(BO), %xmm15 + + movlpd -2 * SIZE(BO), %xmm9 + mulsd %xmm15, %xmm9 + subsd %xmm9, %xmm14 + movlpd -3 * SIZE(BO), %xmm10 + mulsd %xmm15, %xmm10 + subsd %xmm10, %xmm13 + movlpd -4 * SIZE(BO), %xmm11 + mulsd %xmm15, %xmm11 + subsd %xmm11, %xmm12 + + mulsd -6 * SIZE(BO), %xmm14 + + movlpd -7 * SIZE(BO), %xmm9 + mulsd %xmm14, %xmm9 + subsd %xmm9, %xmm13 + movlpd -8 * SIZE(BO), %xmm10 + mulsd %xmm14, %xmm10 + subsd %xmm10, %xmm12 + + mulsd -11 * SIZE(BO), %xmm13 + + movlpd -12 * SIZE(BO), %xmm9 + mulsd %xmm13, %xmm9 + subsd %xmm9, %xmm12 + + mulsd -16 * SIZE(BO), %xmm12 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm12, 0 * SIZE(CO1) + movhps %xmm12, 0 * SIZE(CO2) + movsd %xmm13, 0 * SIZE(CO1, LDC, 2) + movhps %xmm13, 0 * SIZE(CO2, LDC, 2) + + movaps %xmm12, -16 * SIZE(BO) + movaps %xmm13, -14 * SIZE(BO) +#else + movsd %xmm12, 0 * SIZE(CO1) + movsd %xmm13, 0 * SIZE(CO2) + movsd %xmm14, 0 * SIZE(CO1, LDC, 2) + movsd %xmm15, 0 * SIZE(CO2, LDC, 2) + + movsd %xmm12, -16 * SIZE(AO) + movsd %xmm13, -15 * SIZE(AO) + movsd %xmm14, -14 * SIZE(AO) + movsd %xmm15, -13 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L39: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq 
$4, KK +#endif + + subq $1, J + BRANCH + jg .L01 + ALIGN_4 + +.L40: + testq $2, N + BRANCH + jle .L80 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + movq K, %rax + salq $BASE_SHIFT + 1, %rax + leaq (B, %rax), BB + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + NOBRANCH + jle .L60 + ALIGN_4 + +.L51: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + prefetcht2 -16 * SIZE(BB) + subq $-4 * SIZE, BB + + movaps -16 * SIZE(AO), %xmm0 + movaps -14 * SIZE(AO), %xmm1 + movaps -16 * SIZE(BO), %xmm2 + +#ifdef LN + prefetcht0 -4 * SIZE(CO1) + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + prefetcht0 -4 * SIZE(CO2) + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 +#else + prefetcht0 3 * SIZE(CO1) + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + prefetcht0 3 * SIZE(CO2) + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L55 + ALIGN_4 + +.L52: + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm9 + movaps -14 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -6 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm9 + movaps -12 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -2 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm9 + movaps -10 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps 0 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 2 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm9 + movaps -8 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + + subq $-16 * SIZE, AO + subq $ -8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm9 + movaps -14 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + addpd %xmm7, %xmm8 + addpd 
%xmm6, %xmm12 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L56 + ALIGN_4 + +.L58: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + + movapd %xmm8, %xmm0 + movsd %xmm9, %xmm8 + movsd %xmm0, %xmm9 + + movapd %xmm12, %xmm0 + movsd %xmm13, %xmm12 + movsd %xmm0, %xmm13 + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd %xmm12, %xmm4 + unpcklpd %xmm13, %xmm12 + unpckhpd %xmm13, %xmm4 + + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm13 + movapd -12 * SIZE(BO), %xmm1 + movapd -10 * SIZE(BO), %xmm5 + + subpd %xmm8, %xmm9 + subpd %xmm0, %xmm13 + subpd %xmm12, %xmm1 + subpd %xmm4, %xmm5 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -12 * SIZE(AO), %xmm2 + movapd -10 * SIZE(AO), %xmm3 + + subpd %xmm8, %xmm0 + subpd %xmm12, %xmm1 + subpd %xmm9, %xmm2 + subpd %xmm13, %xmm3 +#endif + +#ifdef LN + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 + movddup -2 * SIZE(AO), %xmm10 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm1 + movddup -3 * SIZE(AO), %xmm12 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm13 + movddup -4 * SIZE(AO), %xmm14 + mulpd %xmm5, %xmm14 + subpd %xmm14, %xmm9 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + movddup -7 * SIZE(AO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm13 + movddup -8 * SIZE(AO), %xmm12 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm9 + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + movddup -12 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + movddup -14 * SIZE(AO), %xmm12 + mulpd %xmm9, %xmm12 + subpd %xmm12, %xmm1 + movddup -13 * SIZE(AO), %xmm14 + mulpd %xmm9, %xmm14 + subpd %xmm14, %xmm5 + + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + + movddup -10 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm1 + movddup -9 * SIZE(AO), %xmm12 + mulpd %xmm13, %xmm12 + subpd %xmm12, %xmm5 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + movddup -5 * SIZE(AO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm5 + + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 + + movddup -15 * SIZE(BO), %xmm9 + movapd %xmm9, %xmm10 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm3 + + movddup -13 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 +#endif + +#ifdef RT + movddup -13 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 + + movddup -14 * SIZE(BO), %xmm9 + movapd %xmm9, %xmm10 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + mulpd %xmm3, %xmm10 + subpd %xmm10, %xmm1 + + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm13, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movsd %xmm5, 3 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + movhpd %xmm1, 2 * SIZE(CO2) + movhpd %xmm5, 3 * SIZE(CO2) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + 
movhpd %xmm1, 3 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + movsd %xmm3, 2 * SIZE(CO2) + movhpd %xmm3, 3 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) + movapd %xmm13, -14 * SIZE(BO) + movapd %xmm1, -12 * SIZE(BO) + movapd %xmm5, -10 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm1, -14 * SIZE(AO) + movapd %xmm2, -12 * SIZE(AO) + movapd %xmm3, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L51 + ALIGN_4 + +.L60: + testq $2, M + BRANCH + jle .L70 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + movaps -16 * SIZE(BO), %xmm2 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L65 + ALIGN_4 + +.L62: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm9 + addpd %xmm7, %xmm8 + movaps -14 * SIZE(BO), %xmm2 + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -12 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm11 + addpd %xmm7, %xmm10 + movaps -12 * SIZE(BO), %xmm2 + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -10 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm9 + addpd %xmm7, %xmm8 + movaps -10 * SIZE(BO), %xmm2 + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -8 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm11 + addpd %xmm7, %xmm10 + movaps -8 * SIZE(BO), %xmm2 + + subq $-8 * SIZE, AO + subq $-8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L62 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm9 + addpd %xmm7, %xmm8 + movaps -14 * SIZE(BO), %xmm2 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L66 + ALIGN_4 + +.L68: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + + movapd %xmm8, %xmm0 + movsd %xmm9, %xmm8 + movsd %xmm0, %xmm9 + + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm13 + + subpd %xmm8, %xmm9 + subpd %xmm0, %xmm13 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm2 + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm2 +#endif + +#ifdef LN + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + + movddup -14 * SIZE(AO), %xmm10 + mulpd %xmm13, 
%xmm10 + subpd %xmm10, %xmm9 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + + movddup -13 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 +#endif + +#ifdef RT + movddup -13 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + + movddup -14 * SIZE(BO), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm13, 1 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) + movapd %xmm13, -14 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm2, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L70: + testq $1, M + BRANCH + jle .L79 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movsd -16 * SIZE(AO), %xmm0 + movaps -16 * SIZE(BO), %xmm2 + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L75 + ALIGN_4 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + movsd -15 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm8 + movaps -14 * SIZE(BO), %xmm2 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + movsd -14 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm9 + movaps -12 * SIZE(BO), %xmm2 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + movsd -13 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm8 + movaps -10 * SIZE(BO), %xmm2 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + movsd -12 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm9 + movaps -8 * SIZE(BO), %xmm2 + + subq $-4 * SIZE, AO + subq $-8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + movsd -15 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm8 + movaps -14 * SIZE(BO), %xmm2 + + addq $1 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L76 + ALIGN_4 + +.L78: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + + addpd %xmm9, %xmm8 + movhlps %xmm8, %xmm9 + +#if defined(LN) || defined(LT) + movsd 
-16 * SIZE(BO), %xmm12 + movsd -15 * SIZE(BO), %xmm13 +#else + movsd -16 * SIZE(AO), %xmm12 + movsd -15 * SIZE(AO), %xmm13 +#endif + + subsd %xmm8, %xmm12 + subsd %xmm9, %xmm13 + +#ifdef LN + movsd -16 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm12 + mulsd %xmm8, %xmm13 +#endif + +#ifdef LT + movsd -16 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm12 + mulsd %xmm8, %xmm13 +#endif + +#ifdef RN + mulsd -16 * SIZE(BO), %xmm12 + movsd -15 * SIZE(BO), %xmm9 + mulsd %xmm12, %xmm9 + subsd %xmm9, %xmm13 + + mulsd -13 * SIZE(BO), %xmm13 +#endif + +#ifdef RT + mulsd -13 * SIZE(BO), %xmm13 + + movlpd -14 * SIZE(BO), %xmm9 + mulsd %xmm13, %xmm9 + subsd %xmm9, %xmm12 + + mulsd -16 * SIZE(BO), %xmm12 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + movsd %xmm12, 0 * SIZE(CO1) + movsd %xmm13, 0 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movsd %xmm12, -16 * SIZE(BO) + movsd %xmm13, -15 * SIZE(BO) +#else + movsd %xmm12, -16 * SIZE(AO) + movsd %xmm13, -15 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L79: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L80: + testq $1, N + BRANCH + jle .L999 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + + movq C, CO1 +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + NOBRANCH + jle .L100 + ALIGN_4 + +.L91: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + movaps -14 * SIZE(AO), %xmm1 + movsd -16 * SIZE(BO), %xmm2 + +#ifdef LN + prefetcht0 -4 * SIZE(CO1) +#else + prefetcht0 3 * SIZE(CO1) +#endif + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L95 + ALIGN_4 + +.L92: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x44, %xmm2, %xmm3 + pshufd $0x44, %xmm2, %xmm4 + movsd -15 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm4 + movaps -10 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm8 + addpd %xmm4, %xmm12 + + pshufd $0x44, %xmm2, %xmm3 + pshufd $0x44, %xmm2, %xmm4 + movsd -14 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm4 + movaps -6 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm8 + addpd %xmm4, %xmm12 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + + pshufd $0x44, %xmm2, %xmm3 + pshufd $0x44, %xmm2, %xmm4 + movsd -13 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm4 + movaps -2 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm8 + addpd %xmm4, %xmm12 + + pshufd $0x44, %xmm2, 
%xmm3 + pshufd $0x44, %xmm2, %xmm4 + movsd -12 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps 0 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm4 + movaps 2 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm8 + addpd %xmm4, %xmm12 + + subq $-16 * SIZE, AO + subq $ -4 * SIZE, BO + subq $1, %rax + BRANCH + jg .L92 + ALIGN_4 + +.L95: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L98 + ALIGN_4 + +.L96: + pshufd $0x44, %xmm2, %xmm3 + pshufd $0x44, %xmm2, %xmm4 + movsd -15 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm4 + movaps -10 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm8 + addpd %xmm4, %xmm12 + + addq $4 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L96 + ALIGN_4 + +.L98: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#endif + + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm10 + movapd -14 * SIZE(BO), %xmm11 + + subpd %xmm8, %xmm10 + subpd %xmm12, %xmm11 +#else + movapd -16 * SIZE(AO), %xmm10 + movapd -14 * SIZE(AO), %xmm11 + + subpd %xmm8, %xmm10 + subpd %xmm12, %xmm11 +#endif + +#ifdef LN + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movapd %xmm11, %xmm9 + unpckhpd %xmm9, %xmm9 + + movsd -1 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm9 + + movsd -2 * SIZE(AO), %xmm13 + mulsd %xmm9, %xmm13 + subsd %xmm13, %xmm11 + movsd -3 * SIZE(AO), %xmm14 + mulsd %xmm9, %xmm14 + subsd %xmm14, %xmm8 + movsd -4 * SIZE(AO), %xmm15 + mulsd %xmm9, %xmm15 + subsd %xmm15, %xmm10 + + movsd -6 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm11 + + movsd -7 * SIZE(AO), %xmm13 + mulsd %xmm11, %xmm13 + subsd %xmm13, %xmm8 + movsd -8 * SIZE(AO), %xmm14 + mulsd %xmm11, %xmm14 + subsd %xmm14, %xmm10 + + movsd -11 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + movsd -12 * SIZE(AO), %xmm13 + mulsd %xmm8, %xmm13 + subsd %xmm13, %xmm10 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + unpcklpd %xmm8, %xmm10 + unpcklpd %xmm9, %xmm11 +#endif + +#ifdef LT + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movapd %xmm11, %xmm9 + unpckhpd %xmm9, %xmm9 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + movsd -15 * SIZE(AO), %xmm13 + mulsd %xmm10, %xmm13 + subsd %xmm13, %xmm8 + movsd -14 * SIZE(AO), %xmm14 + mulsd %xmm10, %xmm14 + subsd %xmm14, %xmm11 + movsd -13 * SIZE(AO), %xmm15 + mulsd %xmm10, %xmm15 + subsd %xmm15, %xmm9 + + movsd -11 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + movsd -10 * SIZE(AO), %xmm13 + mulsd %xmm8, %xmm13 + subsd %xmm13, %xmm11 + movsd -9 * SIZE(AO), %xmm14 + mulsd %xmm8, %xmm14 + subsd %xmm14, %xmm9 + + movsd -6 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm11 + + movsd -5 * SIZE(AO), %xmm13 + mulsd %xmm11, %xmm13 + subsd %xmm13, %xmm9 + + movsd -1 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm9 + + unpcklpd %xmm8, %xmm10 + unpcklpd %xmm9, %xmm11 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm10 + mulpd %xmm8, %xmm11 +#endif + +#ifdef RT + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm10 + mulpd %xmm8, %xmm11 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) + movsd %xmm11, 2 * SIZE(CO1) + movhpd %xmm11, 3 * SIZE(CO1) +#else + movsd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) + movsd %xmm11, 2 * SIZE(CO1) + movhpd %xmm11, 3 * SIZE(CO1) +#endif + +#if defined(LN) || 
defined(LT) + movapd %xmm10, -16 * SIZE(BO) + movapd %xmm11, -14 * SIZE(BO) +#else + movapd %xmm10, -16 * SIZE(AO) + movapd %xmm11, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L91 + ALIGN_4 + +.L100: + testq $2, M + BRANCH + jle .L110 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movsd -16 * SIZE(BO), %xmm2 + pxor %xmm9, %xmm9 + movhps -15 * SIZE(BO), %xmm2 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L105 + ALIGN_4 + +.L102: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x44, %xmm2, %xmm3 + movsd -15 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -14 * SIZE(AO), %xmm0 + addpd %xmm3, %xmm8 + + pshufd $0x44, %xmm2, %xmm3 + movsd -14 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -12 * SIZE(AO), %xmm0 + addpd %xmm3, %xmm9 + + pshufd $0x44, %xmm2, %xmm3 + movsd -13 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -10 * SIZE(AO), %xmm0 + addpd %xmm3, %xmm8 + + pshufd $0x44, %xmm2, %xmm3 + movsd -12 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -8 * SIZE(AO), %xmm0 + addpd %xmm3, %xmm9 + + subq $-8 * SIZE, AO + subq $-4 * SIZE, BO + subq $1, %rax + BRANCH + jg .L102 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L108 + ALIGN_4 + +.L106: + pshufd $0x44, %xmm2, %xmm3 + movsd -15 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -14 * SIZE(AO), %xmm0 + addpd %xmm3, %xmm8 + + addq $2 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L106 + ALIGN_4 + +.L108: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + + addpd %xmm9, %xmm8 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm10 + subpd %xmm8, %xmm10 +#else + movapd -16 * SIZE(AO), %xmm10 + subpd %xmm8, %xmm10 +#endif + +#ifdef LN + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movsd -13 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + movsd -14 * SIZE(AO), %xmm13 + mulsd %xmm8, %xmm13 + subsd %xmm13, %xmm10 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + unpcklpd %xmm8, %xmm10 +#endif + +#ifdef LT + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + movsd -15 * SIZE(AO), %xmm13 + mulsd %xmm10, %xmm13 + subsd %xmm13, %xmm8 + + movsd -13 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + unpcklpd %xmm8, %xmm10 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm10 +#endif + +#ifdef RT + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm10 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) +#else + movsd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) +#endif + 
+#if defined(LN) || defined(LT) + movapd %xmm10, -16 * SIZE(BO) +#else + movapd %xmm10, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L110: + testq $1, M + BRANCH + jle .L119 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movsd -16 * SIZE(AO), %xmm0 + movsd -16 * SIZE(BO), %xmm2 + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L115 + ALIGN_4 + +.L112: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + movsd -15 * SIZE(AO), %xmm0 + movsd -15 * SIZE(BO), %xmm2 + + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + movsd -14 * SIZE(AO), %xmm0 + movsd -14 * SIZE(BO), %xmm2 + + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + movsd -13 * SIZE(AO), %xmm0 + movsd -13 * SIZE(BO), %xmm2 + + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + movsd -12 * SIZE(AO), %xmm0 + movsd -12 * SIZE(BO), %xmm2 + + subq $-4 * SIZE, AO + subq $-4 * SIZE, BO + subq $1, %rax + BRANCH + jg .L112 + ALIGN_4 + +.L115: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + movsd -15 * SIZE(AO), %xmm0 + movsd -15 * SIZE(BO), %xmm2 + + addq $1 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L116 + ALIGN_4 + +.L118: +#if defined(LN) || defined(RT) + movq KK, %rax + subq $1, %rax + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + + addpd %xmm9, %xmm8 + + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(BO), %xmm10 + subsd %xmm8, %xmm10 +#else + movsd -16 * SIZE(AO), %xmm10 + subsd %xmm8, %xmm10 +#endif + +#ifdef LN + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 +#endif + +#ifdef LT + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 +#endif + +#ifdef RN + movsd -16 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm10 +#endif + +#ifdef RT + movsd -16 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm10 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm10, 0 * SIZE(CO1) +#else + movsd %xmm10, 0 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movsd %xmm10, -16 * SIZE(BO) +#else + movsd %xmm10, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L119: +#ifdef LN + leaq (B, K, SIZE), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef 
WINDOWS_ABI
+	movq	 48(%rsp), %rdi
+	movq	 56(%rsp), %rsi
+	movups	 64(%rsp), %xmm6
+	movups	 80(%rsp), %xmm7
+	movups	 96(%rsp), %xmm8
+	movups	112(%rsp), %xmm9
+	movups	128(%rsp), %xmm10
+	movups	144(%rsp), %xmm11
+	movups	160(%rsp), %xmm12
+	movups	176(%rsp), %xmm13
+	movups	192(%rsp), %xmm14
+	movups	208(%rsp), %xmm15
+#endif
+
+	addq	$STACKSIZE, %rsp
+	ret
+
+	EPILOGUE
diff --git a/kernel/x86_64/trsm_kernel_LT_4x4_sse2.S b/kernel/x86_64/trsm_kernel_LT_4x4_sse2.S
new file mode 100644
index 0000000000..d50c8d5010
--- /dev/null
+++ b/kernel/x86_64/trsm_kernel_LT_4x4_sse2.S
@@ -0,0 +1,4169 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %rdi +#define N %rsi +#define K %rdx +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define J %r12 +#define AO %r13 +#define BO %r14 +#define CO1 %r15 +#define CO2 %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define ALPHA 0(%rsp) +#define OFFSET 16(%rsp) +#define KK 24(%rsp) +#define KKK 32(%rsp) +#define AORIG 40(%rsp) +#define BORIG 48(%rsp) +#define BUFFER 128(%rsp) + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHNTA prefetchnta +#ifndef ALLOC_HUGETLB +#define PREFETCHSIZE (8 * 4 + 4) +#else +#define PREFETCHSIZE (8 * 2 + 4) +#endif +#endif + +#ifdef GENERIC +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHNTA prefetchnta +#define PREFETCHSIZE (8 * 4 + 4) +#endif + +#ifdef OPTERON +#define movsd movlpd +#endif + +#define KERNEL1(xx) \ + mulpd %xmm8, %xmm9 ;\ + addpd %xmm9, %xmm0 ;\ + movapd 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulpd %xmm8, %xmm11 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\ + addpd %xmm11, %xmm1 ;\ + movapd 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm8, %xmm13 ;\ + mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\ + addpd %xmm13, %xmm2 ;\ + movapd 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm8, %xmm3 ;\ + movapd 8 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 + +#define KERNEL2(xx) \ + mulpd %xmm10, %xmm9 ;\ + addpd %xmm9, %xmm4 ;\ + movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulpd %xmm10, %xmm11 ;\ + addpd %xmm11, %xmm5 ;\ + movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm10, %xmm13 ;\ + mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ + addpd %xmm13, %xmm6 ;\ + movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm10, %xmm7 ;\ + movapd 10 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 + +#define KERNEL3(xx) \ + mulpd %xmm12, %xmm15 ;\ + addpd %xmm15, %xmm0 ;\ + movapd 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulpd %xmm12, %xmm11 ;\ + addpd %xmm11, %xmm1 ;\ + movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm12, %xmm13 ;\ + mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\ + addpd %xmm13, %xmm2 ;\ + movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm12, %xmm3 ;\ + movapd 12 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 + +#define KERNEL4(xx) \ + mulpd %xmm14, %xmm15 ;\ + addpd %xmm15, %xmm4 ;\ + movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulpd %xmm14, %xmm11 ;\ + addpd %xmm11, %xmm5 ;\ + movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm14, %xmm13 ;\ + mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ + addpd %xmm13, %xmm6 ;\ + movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm14, %xmm7 ;\ + movapd 14 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 + +#define KERNEL5(xx) \ + mulpd %xmm8, %xmm9 ;\ + addpd %xmm9, %xmm0 ;\ + movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulpd %xmm8, %xmm11 ;\ + PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ;\ + addpd %xmm11, %xmm1 ;\ + movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm8, %xmm13 ;\ + mulpd 22 * SIZE + 2 * (xx) 
* SIZE(BO), %xmm8 ;\ + addpd %xmm13, %xmm2 ;\ + movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm8, %xmm3 ;\ + movapd 16 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 + +#define KERNEL6(xx) \ + mulpd %xmm10, %xmm9 ;\ + addpd %xmm9, %xmm4 ;\ + movapd 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulpd %xmm10, %xmm11 ;\ + addpd %xmm11, %xmm5 ;\ + movapd 26 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm10, %xmm13 ;\ + mulpd 22 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ + addpd %xmm13, %xmm6 ;\ + movapd 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm10, %xmm7 ;\ + movapd 18 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 + +#define KERNEL7(xx) \ + mulpd %xmm12, %xmm15 ;\ + addpd %xmm15, %xmm0 ;\ + movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulpd %xmm12, %xmm11 ;\ + addpd %xmm11, %xmm1 ;\ + movapd 26 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm12, %xmm13 ;\ + mulpd 30 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\ + addpd %xmm13, %xmm2 ;\ + movapd 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm12, %xmm3 ;\ + movapd 20 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 + +#define KERNEL8(xx) \ + mulpd %xmm14, %xmm15 ;\ + addpd %xmm15, %xmm4 ;\ + movapd 40 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulpd %xmm14, %xmm11 ;\ + addpd %xmm11, %xmm5 ;\ + movapd 34 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm14, %xmm13 ;\ + mulpd 30 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ + addpd %xmm13, %xmm6 ;\ + movapd 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm14, %xmm7 ;\ + movapd 22 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, M + movq ARG2, N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + movsd OLD_OFFSET, %xmm4 + + movaps %xmm3, %xmm0 + +#else + movq OLD_LDC, LDC + movsd OLD_OFFSET, %xmm4 + +#endif + + movq %rsp, %rbx # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movsd %xmm4, OFFSET + movsd %xmm4, KK + + leaq (, LDC, SIZE), LDC + +#ifdef LN + leaq (, M, SIZE), %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + leaq (, N, SIZE), %rax + imulq K, %rax + addq %rax, B + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + movq N, J + sarq $2, J # j = (n >> 2) + jle .L40 + +.L01: +/* Copying to Sub Buffer */ + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + leaq (, %rax, SIZE), %rax + leaq (B, %rax, 4), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L03 + + addq %rax, %rax + ALIGN_4 + +.L02: + PREFETCHNTA 40 * SIZE(B) + + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm2 + 
movsd 3 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm4 + movsd 5 * SIZE(B), %xmm5 + movsd 6 * SIZE(B), %xmm6 + movsd 7 * SIZE(B), %xmm7 + + addq $16 * SIZE, BO + addq $ 8 * SIZE, B + + movsd %xmm0, -16 * SIZE(BO) + movsd %xmm0, -15 * SIZE(BO) + movsd %xmm1, -14 * SIZE(BO) + movsd %xmm1, -13 * SIZE(BO) + movsd %xmm2, -12 * SIZE(BO) + movsd %xmm2, -11 * SIZE(BO) + movsd %xmm3, -10 * SIZE(BO) + movsd %xmm3, -9 * SIZE(BO) + movsd %xmm4, -8 * SIZE(BO) + movsd %xmm4, -7 * SIZE(BO) + movsd %xmm5, -6 * SIZE(BO) + movsd %xmm5, -5 * SIZE(BO) + movsd %xmm6, -4 * SIZE(BO) + movsd %xmm6, -3 * SIZE(BO) + movsd %xmm7, -2 * SIZE(BO) + movsd %xmm7, -1 * SIZE(BO) + + decq %rax + jne .L02 + ALIGN_4 + +.L03: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L10 + ALIGN_4 + +.L04: + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm2 + movsd 3 * SIZE(B), %xmm3 + + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm0, 1 * SIZE(BO) + movsd %xmm1, 2 * SIZE(BO) + movsd %xmm1, 3 * SIZE(BO) + movsd %xmm2, 4 * SIZE(BO) + movsd %xmm2, 5 * SIZE(BO) + movsd %xmm3, 6 * SIZE(BO) + movsd %xmm3, 7 * SIZE(BO) + + addq $4 * SIZE, B + addq $8 * SIZE, BO + decq %rax + jne .L04 + ALIGN_4 + +.L10: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc +#ifndef RT + leaq (C, LDC, 4), C +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movapd 0 * SIZE(BO), %xmm9 + movapd 2 * SIZE(BO), %xmm11 + movapd 4 * SIZE(BO), %xmm13 + movapd 8 * SIZE(BO), %xmm15 + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 2 * SIZE(AO), %xmm10 + pxor %xmm1, %xmm1 + movapd 4 * SIZE(AO), %xmm12 + pxor %xmm2, %xmm2 + movapd 6 * SIZE(AO), %xmm14 + pxor %xmm3, %xmm3 + + PREFETCHW 4 * SIZE(CO1) + pxor %xmm4, %xmm4 + PREFETCHW 4 * SIZE(CO2) + pxor %xmm5, %xmm5 + PREFETCHW 4 * SIZE(CO1, LDC, 2) + pxor %xmm6, %xmm6 + PREFETCHW 4 * SIZE(CO2, LDC, 2) + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + +#if 1 + andq $-8, %rax + salq $4, %rax + je .L15 +.L1X: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + cmpq $64 * 2, %rax + jle .L12 + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + KERNEL1(16 * 3) + KERNEL2(16 * 3) + KERNEL3(16 * 3) + KERNEL4(16 * 3) + KERNEL5(16 * 3) + KERNEL6(16 * 3) + KERNEL7(16 * 3) + KERNEL8(16 * 3) + cmpq $64 * 4, %rax + jle .L12 + KERNEL1(16 * 4) + KERNEL2(16 * 4) + KERNEL3(16 * 4) + KERNEL4(16 * 4) + KERNEL5(16 * 4) + KERNEL6(16 * 4) + KERNEL7(16 * 4) + KERNEL8(16 * 4) + KERNEL1(16 * 5) + KERNEL2(16 * 5) + KERNEL3(16 * 5) + KERNEL4(16 * 5) + KERNEL5(16 * 5) + KERNEL6(16 * 5) + KERNEL7(16 * 5) + KERNEL8(16 * 5) + cmpq $64 * 6, 
%rax + jle .L12 + KERNEL1(16 * 6) + KERNEL2(16 * 6) + KERNEL3(16 * 6) + KERNEL4(16 * 6) + KERNEL5(16 * 6) + KERNEL6(16 * 6) + KERNEL7(16 * 6) + KERNEL8(16 * 6) + KERNEL1(16 * 7) + KERNEL2(16 * 7) + KERNEL3(16 * 7) + KERNEL4(16 * 7) + KERNEL5(16 * 7) + KERNEL6(16 * 7) + KERNEL7(16 * 7) + KERNEL8(16 * 7) + + addq $16 * 8 * SIZE, AO + addq $32 * 8 * SIZE, BO + subq $64 * 8, %rax + jg .L1X + +.L12: + leaq (AO, %rax, 2), AO # * 16 + leaq (BO, %rax, 4), BO # * 64 + ALIGN_4 + +#else + sarq $3, %rax + je .L15 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + + addq $32 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jg .L12 + ALIGN_4 +#endif + + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L19 + ALIGN_4 + +.L16: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm2 + movapd 0 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm3 + movapd 4 * SIZE(AO), %xmm8 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm4 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm5 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + mulpd 6 * SIZE(BO), %xmm10 + addpd %xmm9, %xmm6 + movapd 8 * SIZE(BO), %xmm9 + addpd %xmm10, %xmm7 + movapd 6 * SIZE(AO), %xmm10 + + addq $4 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L16 + ALIGN_4 + +.L19: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd %xmm2, %xmm10 + unpcklpd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm10 + + movapd %xmm4, %xmm12 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm12 + + movapd %xmm6, %xmm14 + unpcklpd %xmm7, %xmm6 + unpckhpd %xmm7, %xmm14 + + movapd 0 * SIZE(B), %xmm1 + movapd 2 * SIZE(B), %xmm3 + movapd 4 * SIZE(B), %xmm5 + movapd 6 * SIZE(B), %xmm7 + movapd 8 * SIZE(B), %xmm9 + movapd 10 * SIZE(B), %xmm11 + movapd 12 * SIZE(B), %xmm13 + movapd 14 * SIZE(B), %xmm15 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm3 + subpd %xmm8, %xmm5 + subpd %xmm10, %xmm7 + subpd %xmm4, %xmm9 + subpd %xmm6, %xmm11 + subpd %xmm12, %xmm13 + subpd %xmm14, %xmm15 +#else + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm9 + movapd 4 * SIZE(AO), %xmm10 + movapd 6 * SIZE(AO), %xmm11 + + movapd 8 * SIZE(AO), %xmm12 + movapd 10 * SIZE(AO), %xmm13 + movapd 12 * SIZE(AO), %xmm14 + movapd 14 * SIZE(AO), %xmm15 + + subpd %xmm0, %xmm8 + subpd %xmm4, %xmm9 + subpd %xmm1, %xmm10 + subpd %xmm5, %xmm11 + subpd %xmm2, %xmm12 + subpd %xmm6, %xmm13 + subpd %xmm3, %xmm14 + subpd %xmm7, %xmm15 +#endif + +#ifdef LN + movlpd 15 * SIZE(AO), %xmm0 + movhpd 15 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm13 + mulpd %xmm0, %xmm15 + + movlpd 14 * SIZE(AO), %xmm2 + movhpd 14 * SIZE(AO), %xmm2 + mulpd %xmm13, %xmm2 + subpd %xmm2, %xmm9 + movlpd 14 * SIZE(AO), %xmm2 + movhpd 14 * SIZE(AO), %xmm2 + mulpd %xmm15, %xmm2 + subpd %xmm2, %xmm11 + + movlpd 13 * 
SIZE(AO), %xmm4 + movhpd 13 * SIZE(AO), %xmm4 + mulpd %xmm13, %xmm4 + subpd %xmm4, %xmm5 + movlpd 13 * SIZE(AO), %xmm4 + movhpd 13 * SIZE(AO), %xmm4 + mulpd %xmm15, %xmm4 + subpd %xmm4, %xmm7 + + movlpd 12 * SIZE(AO), %xmm6 + movhpd 12 * SIZE(AO), %xmm6 + mulpd %xmm13, %xmm6 + subpd %xmm6, %xmm1 + movlpd 12 * SIZE(AO), %xmm6 + movhpd 12 * SIZE(AO), %xmm6 + mulpd %xmm15, %xmm6 + subpd %xmm6, %xmm3 + + movlpd 10 * SIZE(AO), %xmm0 + movhpd 10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm9 + mulpd %xmm0, %xmm11 + + movlpd 9 * SIZE(AO), %xmm2 + movhpd 9 * SIZE(AO), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm5 + movlpd 9 * SIZE(AO), %xmm2 + movhpd 9 * SIZE(AO), %xmm2 + mulpd %xmm11, %xmm2 + subpd %xmm2, %xmm7 + + movlpd 8 * SIZE(AO), %xmm4 + movhpd 8 * SIZE(AO), %xmm4 + mulpd %xmm9, %xmm4 + subpd %xmm4, %xmm1 + movlpd 8 * SIZE(AO), %xmm4 + movhpd 8 * SIZE(AO), %xmm4 + mulpd %xmm11, %xmm4 + subpd %xmm4, %xmm3 + + movlpd 5 * SIZE(AO), %xmm0 + movhpd 5 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + mulpd %xmm0, %xmm7 + + movlpd 4 * SIZE(AO), %xmm2 + movhpd 4 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm1 + movlpd 4 * SIZE(AO), %xmm2 + movhpd 4 * SIZE(AO), %xmm2 + mulpd %xmm7, %xmm2 + subpd %xmm2, %xmm3 + + movlpd 0 * SIZE(AO), %xmm0 + movhpd 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm3 +#endif + +#ifdef LT + movlpd 0 * SIZE(AO), %xmm0 + movhpd 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm3 + + movlpd 1 * SIZE(AO), %xmm2 + movhpd 1 * SIZE(AO), %xmm2 + mulpd %xmm1, %xmm2 + subpd %xmm2, %xmm5 + + movlpd 1 * SIZE(AO), %xmm2 + movhpd 1 * SIZE(AO), %xmm2 + mulpd %xmm3, %xmm2 + subpd %xmm2, %xmm7 + + movlpd 2 * SIZE(AO), %xmm4 + movhpd 2 * SIZE(AO), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm9 + movlpd 2 * SIZE(AO), %xmm4 + movhpd 2 * SIZE(AO), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm11 + + movlpd 3 * SIZE(AO), %xmm6 + movhpd 3 * SIZE(AO), %xmm6 + mulpd %xmm1, %xmm6 + subpd %xmm6, %xmm13 + movlpd 3 * SIZE(AO), %xmm6 + movhpd 3 * SIZE(AO), %xmm6 + mulpd %xmm3, %xmm6 + subpd %xmm6, %xmm15 + + movlpd 5 * SIZE(AO), %xmm0 + movhpd 5 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + mulpd %xmm0, %xmm7 + + movlpd 6 * SIZE(AO), %xmm2 + movhpd 6 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm9 + movlpd 6 * SIZE(AO), %xmm2 + movhpd 6 * SIZE(AO), %xmm2 + mulpd %xmm7, %xmm2 + subpd %xmm2, %xmm11 + + movlpd 7 * SIZE(AO), %xmm4 + movhpd 7 * SIZE(AO), %xmm4 + mulpd %xmm5, %xmm4 + subpd %xmm4, %xmm13 + movlpd 7 * SIZE(AO), %xmm4 + movhpd 7 * SIZE(AO), %xmm4 + mulpd %xmm7, %xmm4 + subpd %xmm4, %xmm15 + + movlpd 10 * SIZE(AO), %xmm0 + movhpd 10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm9 + mulpd %xmm0, %xmm11 + + movlpd 11 * SIZE(AO), %xmm2 + movhpd 11 * SIZE(AO), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm13 + movlpd 11 * SIZE(AO), %xmm2 + movhpd 11 * SIZE(AO), %xmm2 + mulpd %xmm11, %xmm2 + subpd %xmm2, %xmm15 + + movlpd 15 * SIZE(AO), %xmm0 + movhpd 15 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm13 + mulpd %xmm0, %xmm15 +#endif + + +#ifdef RN + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 + + movlpd 1 * SIZE(B), %xmm1 + movhpd 1 * SIZE(B), %xmm1 + mulpd %xmm8, %xmm1 + subpd %xmm1, %xmm10 + movlpd 1 * SIZE(B), %xmm1 + movhpd 1 * SIZE(B), %xmm1 + mulpd %xmm9, %xmm1 + subpd %xmm1, %xmm11 + + movlpd 2 * SIZE(B), %xmm2 + movhpd 2 * SIZE(B), %xmm2 + mulpd %xmm8, %xmm2 + subpd %xmm2, %xmm12 + movlpd 2 * SIZE(B), %xmm2 + movhpd 2 * SIZE(B), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm13 + + movlpd 3 * SIZE(B), %xmm3 + movhpd 3 * SIZE(B), 
%xmm3 + mulpd %xmm8, %xmm3 + subpd %xmm3, %xmm14 + movlpd 3 * SIZE(B), %xmm3 + movhpd 3 * SIZE(B), %xmm3 + mulpd %xmm9, %xmm3 + subpd %xmm3, %xmm15 + + movlpd 5 * SIZE(B), %xmm0 + movhpd 5 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 + + movlpd 6 * SIZE(B), %xmm1 + movhpd 6 * SIZE(B), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm12 + movlpd 6 * SIZE(B), %xmm1 + movhpd 6 * SIZE(B), %xmm1 + mulpd %xmm11, %xmm1 + subpd %xmm1, %xmm13 + + movlpd 7 * SIZE(B), %xmm2 + movhpd 7 * SIZE(B), %xmm2 + mulpd %xmm10, %xmm2 + subpd %xmm2, %xmm14 + movlpd 7 * SIZE(B), %xmm2 + movhpd 7 * SIZE(B), %xmm2 + mulpd %xmm11, %xmm2 + subpd %xmm2, %xmm15 + + movlpd 10 * SIZE(B), %xmm0 + movhpd 10 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm12 + mulpd %xmm0, %xmm13 + + movlpd 11 * SIZE(B), %xmm1 + movhpd 11 * SIZE(B), %xmm1 + mulpd %xmm12, %xmm1 + subpd %xmm1, %xmm14 + movlpd 11 * SIZE(B), %xmm1 + movhpd 11 * SIZE(B), %xmm1 + mulpd %xmm13, %xmm1 + subpd %xmm1, %xmm15 + + movlpd 15 * SIZE(B), %xmm0 + movhpd 15 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm14 + mulpd %xmm0, %xmm15 +#endif + +#ifdef RT + movlpd 15 * SIZE(B), %xmm0 + movhpd 15 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm14 + mulpd %xmm0, %xmm15 + + movlpd 14 * SIZE(B), %xmm1 + movhpd 14 * SIZE(B), %xmm1 + mulpd %xmm14, %xmm1 + subpd %xmm1, %xmm12 + movlpd 14 * SIZE(B), %xmm1 + movhpd 14 * SIZE(B), %xmm1 + mulpd %xmm15, %xmm1 + subpd %xmm1, %xmm13 + + movlpd 13 * SIZE(B), %xmm2 + movhpd 13 * SIZE(B), %xmm2 + mulpd %xmm14, %xmm2 + subpd %xmm2, %xmm10 + movlpd 13 * SIZE(B), %xmm2 + movhpd 13 * SIZE(B), %xmm2 + mulpd %xmm15, %xmm2 + subpd %xmm2, %xmm11 + + movlpd 12 * SIZE(B), %xmm3 + movhpd 12 * SIZE(B), %xmm3 + mulpd %xmm14, %xmm3 + subpd %xmm3, %xmm8 + movlpd 12 * SIZE(B), %xmm3 + movhpd 12 * SIZE(B), %xmm3 + mulpd %xmm15, %xmm3 + subpd %xmm3, %xmm9 + + movlpd 10 * SIZE(B), %xmm0 + movhpd 10 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm12 + mulpd %xmm0, %xmm13 + + movlpd 9 * SIZE(B), %xmm1 + movhpd 9 * SIZE(B), %xmm1 + mulpd %xmm12, %xmm1 + subpd %xmm1, %xmm10 + movlpd 9 * SIZE(B), %xmm1 + movhpd 9 * SIZE(B), %xmm1 + mulpd %xmm13, %xmm1 + subpd %xmm1, %xmm11 + + movlpd 8 * SIZE(B), %xmm2 + movhpd 8 * SIZE(B), %xmm2 + mulpd %xmm12, %xmm2 + subpd %xmm2, %xmm8 + movlpd 8 * SIZE(B), %xmm2 + movhpd 8 * SIZE(B), %xmm2 + mulpd %xmm13, %xmm2 + subpd %xmm2, %xmm9 + + movlpd 5 * SIZE(B), %xmm0 + movhpd 5 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 + + movlpd 4 * SIZE(B), %xmm1 + movhpd 4 * SIZE(B), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm8 + movlpd 4 * SIZE(B), %xmm1 + movhpd 4 * SIZE(B), %xmm1 + mulpd %xmm11, %xmm1 + subpd %xmm1, %xmm9 + + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm1, 0 * SIZE(CO1) + movsd %xmm5, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movsd %xmm13, 3 * SIZE(CO1) + + movhpd %xmm1, 0 * SIZE(CO2) + movhpd %xmm5, 1 * SIZE(CO2) + movhpd %xmm9, 2 * SIZE(CO2) + movhpd %xmm13, 3 * SIZE(CO2) + + movsd %xmm3, 0 * SIZE(CO1, LDC, 2) + movsd %xmm7, 1 * SIZE(CO1, LDC, 2) + movsd %xmm11, 2 * SIZE(CO1, LDC, 2) + movsd %xmm15, 3 * SIZE(CO1, LDC, 2) + + movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 1 * SIZE(CO2, LDC, 2) + movhpd %xmm11, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 3 * SIZE(CO2, LDC, 2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) + + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * 
SIZE(CO2) + movsd %xmm11, 2 * SIZE(CO2) + movhpd %xmm11, 3 * SIZE(CO2) + + movsd %xmm12, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm12, 1 * SIZE(CO1, LDC, 2) + movsd %xmm13, 2 * SIZE(CO1, LDC, 2) + movhpd %xmm13, 3 * SIZE(CO1, LDC, 2) + + movsd %xmm14, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm14, 1 * SIZE(CO2, LDC, 2) + movsd %xmm15, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 3 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(B) + movapd %xmm3, 2 * SIZE(B) + movapd %xmm5, 4 * SIZE(B) + movapd %xmm7, 6 * SIZE(B) + movapd %xmm9, 8 * SIZE(B) + movapd %xmm11, 10 * SIZE(B) + movapd %xmm13, 12 * SIZE(B) + movapd %xmm15, 14 * SIZE(B) + + movlpd %xmm1, 0 * SIZE(BO) + movlpd %xmm1, 1 * SIZE(BO) + movhpd %xmm1, 2 * SIZE(BO) + movhpd %xmm1, 3 * SIZE(BO) + movlpd %xmm3, 4 * SIZE(BO) + movlpd %xmm3, 5 * SIZE(BO) + movhpd %xmm3, 6 * SIZE(BO) + movhpd %xmm3, 7 * SIZE(BO) + movlpd %xmm5, 8 * SIZE(BO) + movlpd %xmm5, 9 * SIZE(BO) + movhpd %xmm5, 10 * SIZE(BO) + movhpd %xmm5, 11 * SIZE(BO) + movlpd %xmm7, 12 * SIZE(BO) + movlpd %xmm7, 13 * SIZE(BO) + movhpd %xmm7, 14 * SIZE(BO) + movhpd %xmm7, 15 * SIZE(BO) + movlpd %xmm9, 16 * SIZE(BO) + movlpd %xmm9, 17 * SIZE(BO) + movhpd %xmm9, 18 * SIZE(BO) + movhpd %xmm9, 19 * SIZE(BO) + movlpd %xmm11, 20 * SIZE(BO) + movlpd %xmm11, 21 * SIZE(BO) + movhpd %xmm11, 22 * SIZE(BO) + movhpd %xmm11, 23 * SIZE(BO) + movlpd %xmm13, 24 * SIZE(BO) + movlpd %xmm13, 25 * SIZE(BO) + movhpd %xmm13, 26 * SIZE(BO) + movhpd %xmm13, 27 * SIZE(BO) + movlpd %xmm15, 28 * SIZE(BO) + movlpd %xmm15, 29 * SIZE(BO) + movhpd %xmm15, 30 * SIZE(BO) + movhpd %xmm15, 31 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm9, 2 * SIZE(AO) + movapd %xmm10, 4 * SIZE(AO) + movapd %xmm11, 6 * SIZE(AO) + movapd %xmm12, 8 * SIZE(AO) + movapd %xmm13, 10 * SIZE(AO) + movapd %xmm14, 12 * SIZE(AO) + movapd %xmm15, 14 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $16 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L11 + ALIGN_4 + +.L20: + testq $3, M + je .L39 + + testq $2, M + je .L30 + ALIGN_4 + +.L21: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movapd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + + movapd 16 * SIZE(BO), %xmm13 + movapd 24 * SIZE(BO), %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L25 + ALIGN_4 + +.L22: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm2 + movapd 32 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm3 + movapd 2 * SIZE(AO), %xmm8 + + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movapd 10 * 
SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm1 + movapd 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + mulpd 14 * SIZE(BO), %xmm8 + addpd %xmm11, %xmm2 + movapd 40 * SIZE(BO), %xmm11 + addpd %xmm8, %xmm3 + movapd 4 * SIZE(AO), %xmm8 + + mulpd %xmm8, %xmm13 + addpd %xmm13, %xmm0 + movapd 18 * SIZE(BO), %xmm13 + mulpd %xmm8, %xmm13 + addpd %xmm13, %xmm1 + movapd 20 * SIZE(BO), %xmm13 + mulpd %xmm8, %xmm13 + mulpd 22 * SIZE(BO), %xmm8 + addpd %xmm13, %xmm2 + movapd 48 * SIZE(BO), %xmm13 + addpd %xmm8, %xmm3 + movapd 6 * SIZE(AO), %xmm8 + + mulpd %xmm8, %xmm15 + addpd %xmm15, %xmm0 + movapd 26 * SIZE(BO), %xmm15 + mulpd %xmm8, %xmm15 + addpd %xmm15, %xmm1 + movapd 28 * SIZE(BO), %xmm15 + mulpd %xmm8, %xmm15 + mulpd 30 * SIZE(BO), %xmm8 + addpd %xmm15, %xmm2 + movapd 56 * SIZE(BO), %xmm15 + addpd %xmm8, %xmm3 + movapd 16 * SIZE(AO), %xmm8 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movapd 34 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm1 + movapd 36 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + mulpd 38 * SIZE(BO), %xmm10 + addpd %xmm9, %xmm2 + movapd 64 * SIZE(BO), %xmm9 + addpd %xmm10, %xmm3 + movapd 10 * SIZE(AO), %xmm10 + + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movapd 42 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm1 + movapd 44 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + mulpd 46 * SIZE(BO), %xmm10 + addpd %xmm11, %xmm2 + movapd 72 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm3 + movapd 12 * SIZE(AO), %xmm10 + + mulpd %xmm10, %xmm13 + addpd %xmm13, %xmm0 + movapd 50 * SIZE(BO), %xmm13 + mulpd %xmm10, %xmm13 + addpd %xmm13, %xmm1 + movapd 52 * SIZE(BO), %xmm13 + mulpd %xmm10, %xmm13 + mulpd 54 * SIZE(BO), %xmm10 + addpd %xmm13, %xmm2 + movapd 80 * SIZE(BO), %xmm13 + addpd %xmm10, %xmm3 + movapd 14 * SIZE(AO), %xmm10 + + mulpd %xmm10, %xmm15 + addpd %xmm15, %xmm0 + movapd 58 * SIZE(BO), %xmm15 + mulpd %xmm10, %xmm15 + addpd %xmm15, %xmm1 + movapd 60 * SIZE(BO), %xmm15 + mulpd %xmm10, %xmm15 + mulpd 62 * SIZE(BO), %xmm10 + addpd %xmm15, %xmm2 + movapd 88 * SIZE(BO), %xmm15 + addpd %xmm10, %xmm3 + movapd 24 * SIZE(AO), %xmm10 + + addq $16 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L29 + ALIGN_4 + +.L26: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm2 + movapd 8 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm3 + movapd 2 * SIZE(AO), %xmm8 + + addq $2 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L26 + ALIGN_4 + +.L29: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd %xmm2, %xmm10 + unpcklpd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm10 + + movapd 0 * SIZE(B), %xmm1 + movapd 2 * SIZE(B), %xmm3 + movapd 4 * SIZE(B), %xmm5 + movapd 6 * SIZE(B), %xmm7 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm3 + subpd %xmm8, %xmm5 + subpd %xmm10, %xmm7 +#else + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm10 + movapd 4 * SIZE(AO), %xmm12 + movapd 6 
* SIZE(AO), %xmm14 + + subpd %xmm0, %xmm8 + subpd %xmm1, %xmm10 + subpd %xmm2, %xmm12 + subpd %xmm3, %xmm14 +#endif + +#ifdef LN + movlpd 3 * SIZE(AO), %xmm0 + movhpd 3 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + mulpd %xmm0, %xmm7 + + movlpd 2 * SIZE(AO), %xmm2 + movhpd 2 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm1 + movlpd 2 * SIZE(AO), %xmm2 + movhpd 2 * SIZE(AO), %xmm2 + mulpd %xmm7, %xmm2 + subpd %xmm2, %xmm3 + + movlpd 0 * SIZE(AO), %xmm0 + movhpd 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm3 +#endif + +#ifdef LT + movlpd 0 * SIZE(AO), %xmm0 + movhpd 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm3 + + movlpd 1 * SIZE(AO), %xmm2 + movhpd 1 * SIZE(AO), %xmm2 + mulpd %xmm1, %xmm2 + subpd %xmm2, %xmm5 + movlpd 1 * SIZE(AO), %xmm2 + movhpd 1 * SIZE(AO), %xmm2 + mulpd %xmm3, %xmm2 + subpd %xmm2, %xmm7 + + movlpd 3 * SIZE(AO), %xmm0 + movhpd 3 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + mulpd %xmm0, %xmm7 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm8 + + movlpd 1 * SIZE(B), %xmm1 + movhpd 1 * SIZE(B), %xmm1 + mulpd %xmm8, %xmm1 + subpd %xmm1, %xmm10 + movlpd 2 * SIZE(B), %xmm2 + movhpd 2 * SIZE(B), %xmm2 + mulpd %xmm8, %xmm2 + subpd %xmm2, %xmm12 + movlpd 3 * SIZE(B), %xmm3 + movhpd 3 * SIZE(B), %xmm3 + mulpd %xmm8, %xmm3 + subpd %xmm3, %xmm14 + + movlpd 5 * SIZE(B), %xmm0 + movhpd 5 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm10 + movlpd 6 * SIZE(B), %xmm1 + movhpd 6 * SIZE(B), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm12 + movlpd 7 * SIZE(B), %xmm2 + movhpd 7 * SIZE(B), %xmm2 + mulpd %xmm10, %xmm2 + subpd %xmm2, %xmm14 + + movlpd 10 * SIZE(B), %xmm0 + movhpd 10 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm12 + + movlpd 11 * SIZE(B), %xmm1 + movhpd 11 * SIZE(B), %xmm1 + mulpd %xmm12, %xmm1 + subpd %xmm1, %xmm14 + + movlpd 15 * SIZE(B), %xmm0 + movhpd 15 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm14 +#endif + +#ifdef RT + movlpd 15 * SIZE(B), %xmm0 + movhpd 15 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm14 + + movlpd 14 * SIZE(B), %xmm1 + movhpd 14 * SIZE(B), %xmm1 + mulpd %xmm14, %xmm1 + subpd %xmm1, %xmm12 + movlpd 13 * SIZE(B), %xmm2 + movhpd 13 * SIZE(B), %xmm2 + mulpd %xmm14, %xmm2 + subpd %xmm2, %xmm10 + movlpd 12 * SIZE(B), %xmm3 + movhpd 12 * SIZE(B), %xmm3 + mulpd %xmm14, %xmm3 + subpd %xmm3, %xmm8 + + movlpd 10 * SIZE(B), %xmm0 + movhpd 10 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm12 + movlpd 9 * SIZE(B), %xmm1 + movhpd 9 * SIZE(B), %xmm1 + mulpd %xmm12, %xmm1 + subpd %xmm1, %xmm10 + movlpd 8 * SIZE(B), %xmm2 + movhpd 8 * SIZE(B), %xmm2 + mulpd %xmm12, %xmm2 + subpd %xmm2, %xmm8 + + movlpd 5 * SIZE(B), %xmm0 + movhpd 5 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm10 + movlpd 4 * SIZE(B), %xmm1 + movhpd 4 * SIZE(B), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm8 + + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm8 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm1, 0 * SIZE(CO1) + movsd %xmm5, 1 * SIZE(CO1) + + movhpd %xmm1, 0 * SIZE(CO2) + movhpd %xmm5, 1 * SIZE(CO2) + + movsd %xmm3, 0 * SIZE(CO1, LDC, 2) + movsd %xmm7, 1 * SIZE(CO1, LDC, 2) + + movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 1 * SIZE(CO2, LDC, 2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) + + movsd %xmm12, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm12, 1 * SIZE(CO1, LDC, 2) + + movsd %xmm14, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm14, 1 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || 
defined(LT) + movapd %xmm1, 0 * SIZE(B) + movapd %xmm3, 2 * SIZE(B) + movapd %xmm5, 4 * SIZE(B) + movapd %xmm7, 6 * SIZE(B) + + movlpd %xmm1, 0 * SIZE(BO) + movlpd %xmm1, 1 * SIZE(BO) + movhpd %xmm1, 2 * SIZE(BO) + movhpd %xmm1, 3 * SIZE(BO) + movlpd %xmm3, 4 * SIZE(BO) + movlpd %xmm3, 5 * SIZE(BO) + movhpd %xmm3, 6 * SIZE(BO) + movhpd %xmm3, 7 * SIZE(BO) + movlpd %xmm5, 8 * SIZE(BO) + movlpd %xmm5, 9 * SIZE(BO) + movhpd %xmm5, 10 * SIZE(BO) + movhpd %xmm5, 11 * SIZE(BO) + movlpd %xmm7, 12 * SIZE(BO) + movlpd %xmm7, 13 * SIZE(BO) + movhpd %xmm7, 14 * SIZE(BO) + movhpd %xmm7, 15 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm10, 2 * SIZE(AO) + movapd %xmm12, 4 * SIZE(AO) + movapd %xmm14, 6 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + testq $1, M + je .L39 + ALIGN_4 + +.L31: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movsd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movsd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movsd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movsd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + + movsd 16 * SIZE(BO), %xmm13 + movsd 24 * SIZE(BO), %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L35 + ALIGN_4 + +.L32: + mulsd %xmm8, %xmm9 + addsd %xmm9, %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movsd 2 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm9 + addsd %xmm9, %xmm1 + movsd 4 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm9 + mulsd 6 * SIZE(BO), %xmm8 + addsd %xmm9, %xmm2 + movsd 32 * SIZE(BO), %xmm9 + addsd %xmm8, %xmm3 + movsd 1 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm11 + addsd %xmm11, %xmm0 + movsd 10 * SIZE(BO), %xmm11 + mulsd %xmm8, %xmm11 + addsd %xmm11, %xmm1 + movsd 12 * SIZE(BO), %xmm11 + mulsd %xmm8, %xmm11 + mulsd 14 * SIZE(BO), %xmm8 + addsd %xmm11, %xmm2 + movsd 40 * SIZE(BO), %xmm11 + addsd %xmm8, %xmm3 + movsd 2 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm13 + addsd %xmm13, %xmm0 + movsd 18 * SIZE(BO), %xmm13 + mulsd %xmm8, %xmm13 + addsd %xmm13, %xmm1 + movsd 20 * SIZE(BO), %xmm13 + mulsd %xmm8, %xmm13 + mulsd 22 * SIZE(BO), %xmm8 + addsd %xmm13, %xmm2 + movsd 48 * SIZE(BO), %xmm13 + addsd %xmm8, %xmm3 + movsd 3 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm15 + addsd %xmm15, %xmm0 + movsd 26 * SIZE(BO), %xmm15 + mulsd %xmm8, %xmm15 + addsd %xmm15, %xmm1 + movsd 28 * SIZE(BO), %xmm15 + mulsd %xmm8, %xmm15 + mulsd 30 * SIZE(BO), %xmm8 + addsd %xmm15, %xmm2 + movsd 56 * SIZE(BO), %xmm15 + addsd %xmm8, %xmm3 + movsd 4 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm9 + addsd %xmm9, %xmm0 + movsd 34 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm9 + addsd %xmm9, %xmm1 + movsd 36 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm9 + mulsd 38 * SIZE(BO), %xmm8 + addsd %xmm9, %xmm2 + movsd 64 * SIZE(BO), %xmm9 + addsd %xmm8, %xmm3 + movsd 5 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm11 + addsd %xmm11, %xmm0 + movsd 42 * SIZE(BO), %xmm11 + 
mulsd %xmm8, %xmm11 + addsd %xmm11, %xmm1 + movsd 44 * SIZE(BO), %xmm11 + mulsd %xmm8, %xmm11 + mulsd 46 * SIZE(BO), %xmm8 + addsd %xmm11, %xmm2 + movsd 72 * SIZE(BO), %xmm11 + addsd %xmm8, %xmm3 + movsd 6 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm13 + addsd %xmm13, %xmm0 + movsd 50 * SIZE(BO), %xmm13 + mulsd %xmm8, %xmm13 + addsd %xmm13, %xmm1 + movsd 52 * SIZE(BO), %xmm13 + mulsd %xmm8, %xmm13 + mulsd 54 * SIZE(BO), %xmm8 + addsd %xmm13, %xmm2 + movsd 80 * SIZE(BO), %xmm13 + addsd %xmm8, %xmm3 + movsd 7 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm15 + addsd %xmm15, %xmm0 + movsd 58 * SIZE(BO), %xmm15 + mulsd %xmm8, %xmm15 + addsd %xmm15, %xmm1 + movsd 60 * SIZE(BO), %xmm15 + mulsd %xmm8, %xmm15 + mulsd 62 * SIZE(BO), %xmm8 + addsd %xmm15, %xmm2 + movsd 88 * SIZE(BO), %xmm15 + addsd %xmm8, %xmm3 + movsd 8 * SIZE(AO), %xmm8 + + addq $ 8 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulsd %xmm8, %xmm9 + addsd %xmm9, %xmm0 + movsd 2 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm9 + addsd %xmm9, %xmm1 + movsd 4 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm9 + mulsd 6 * SIZE(BO), %xmm8 + addsd %xmm9, %xmm2 + movsd 8 * SIZE(BO), %xmm9 + addsd %xmm8, %xmm3 + movsd 1 * SIZE(AO), %xmm8 + + addq $1 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(B), %xmm4 + movsd 1 * SIZE(B), %xmm5 + movsd 2 * SIZE(B), %xmm6 + movsd 3 * SIZE(B), %xmm7 +#else + movsd 0 * SIZE(AO), %xmm4 + movsd 1 * SIZE(AO), %xmm5 + movsd 2 * SIZE(AO), %xmm6 + movsd 3 * SIZE(AO), %xmm7 +#endif + + subsd %xmm0, %xmm4 + subsd %xmm1, %xmm5 + subsd %xmm2, %xmm6 + subsd %xmm3, %xmm7 + +#ifdef LN + movsd 0 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 + mulsd %xmm0, %xmm6 + mulsd %xmm0, %xmm7 +#endif + +#ifdef LT + movsd 0 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 + mulsd %xmm0, %xmm6 + mulsd %xmm0, %xmm7 +#endif + +#ifdef RN + mulsd 0 * SIZE(B), %xmm4 + movlpd 1 * SIZE(B), %xmm1 + mulsd %xmm4, %xmm1 + subsd %xmm1, %xmm5 + movlpd 2 * SIZE(B), %xmm2 + mulsd %xmm4, %xmm2 + subsd %xmm2, %xmm6 + movlpd 3 * SIZE(B), %xmm3 + mulsd %xmm4, %xmm3 + subsd %xmm3, %xmm7 + + mulsd 5 * SIZE(B), %xmm5 + movlpd 6 * SIZE(B), %xmm1 + mulsd %xmm5, %xmm1 + subsd %xmm1, %xmm6 + movlpd 7 * SIZE(B), %xmm2 + mulsd %xmm5, %xmm2 + subsd %xmm2, %xmm7 + + mulsd 10 * SIZE(B), %xmm6 + movlpd 11 * SIZE(B), %xmm1 + mulsd %xmm6, %xmm1 + subsd %xmm1, %xmm7 + + mulsd 15 * SIZE(B), %xmm7 +#endif + +#ifdef RT + mulsd 15 * SIZE(B), %xmm7 + + movlpd 14 * SIZE(B), %xmm1 + mulsd %xmm7, %xmm1 + subsd %xmm1, %xmm6 + movlpd 13 * SIZE(B), %xmm2 + mulsd %xmm7, %xmm2 + subsd %xmm2, %xmm5 + movlpd 12 * SIZE(B), %xmm3 + mulsd %xmm7, %xmm3 + subsd %xmm3, %xmm4 + + mulsd 10 * SIZE(B), %xmm6 + + movlpd 9 * SIZE(B), %xmm1 + mulsd %xmm6, %xmm1 + subsd %xmm1, %xmm5 + movlpd 8 * SIZE(B), %xmm2 + mulsd %xmm6, %xmm2 + subsd %xmm2, %xmm4 + + mulsd 5 * SIZE(B), %xmm5 + + movlpd 4 * SIZE(B), %xmm1 + mulsd %xmm5, %xmm1 + subsd %xmm1, %xmm4 + + mulsd 0 * SIZE(B), %xmm4 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, 
CO2 +#endif + + movsd %xmm4, 0 * SIZE(CO1) + movsd %xmm5, 0 * SIZE(CO2) + movsd %xmm6, 0 * SIZE(CO1, LDC, 2) + movsd %xmm7, 0 * SIZE(CO2, LDC, 2) + +#if defined(LN) || defined(LT) + movsd %xmm4, 0 * SIZE(B) + movsd %xmm5, 1 * SIZE(B) + movsd %xmm6, 2 * SIZE(B) + movsd %xmm7, 3 * SIZE(B) + + movsd %xmm4, 0 * SIZE(BO) + movsd %xmm4, 1 * SIZE(BO) + movsd %xmm5, 2 * SIZE(BO) + movsd %xmm5, 3 * SIZE(BO) + movsd %xmm6, 4 * SIZE(BO) + movsd %xmm6, 5 * SIZE(BO) + movsd %xmm7, 6 * SIZE(BO) + movsd %xmm7, 7 * SIZE(BO) +#else + movsd %xmm4, 0 * SIZE(AO) + movsd %xmm5, 1 * SIZE(AO) + movsd %xmm6, 2 * SIZE(AO) + movsd %xmm7, 3 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L39: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + + decq J # j -- + jg .L01 + ALIGN_4 + +.L40: + testq $3, N + je .L999 + + testq $2, N + je .L80 + ALIGN_4 + +.L41: +/* Copying to Sub Buffer */ + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + leaq (, %rax, SIZE), %rax + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L43 + ALIGN_4 + +.L42: + PREFETCH 56 * SIZE(B) + + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm2 + movsd 3 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm4 + movsd 5 * SIZE(B), %xmm5 + movsd 6 * SIZE(B), %xmm6 + movsd 7 * SIZE(B), %xmm7 + + addq $ 8 * SIZE, B + addq $16 * SIZE, BO + + movsd %xmm0, -16 * SIZE(BO) + movsd %xmm0, -15 * SIZE(BO) + movsd %xmm1, -14 * SIZE(BO) + movsd %xmm1, -13 * SIZE(BO) + movsd %xmm2, -12 * SIZE(BO) + movsd %xmm2, -11 * SIZE(BO) + movsd %xmm3, -10 * SIZE(BO) + movsd %xmm3, -9 * SIZE(BO) + movsd %xmm4, -8 * SIZE(BO) + movsd %xmm4, -7 * SIZE(BO) + movsd %xmm5, -6 * SIZE(BO) + movsd %xmm5, -5 * SIZE(BO) + movsd %xmm6, -4 * SIZE(BO) + movsd %xmm6, -3 * SIZE(BO) + movsd %xmm7, -2 * SIZE(BO) + movsd %xmm7, -1 * SIZE(BO) + + decq %rax + jne .L42 + ALIGN_4 + +.L43: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L50 + ALIGN_4 + +.L44: + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm0, 1 * SIZE(BO) + movsd %xmm1, 2 * SIZE(BO) + movsd %xmm1, 3 * SIZE(BO) + + addq $2 * SIZE, B + addq $4 * SIZE, BO + decq %rax + jne .L44 + ALIGN_4 + +.L50: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc +#ifndef RT + leaq (C, LDC, 2), C +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#ifdef LN + movq 
K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(BO), %xmm11 + pxor %xmm5, %xmm5 + + movapd 16 * SIZE(AO), %xmm12 + movapd 16 * SIZE(BO), %xmm13 + movapd 24 * SIZE(AO), %xmm14 + movapd 24 * SIZE(BO), %xmm15 + + PREFETCHW 4 * SIZE(CO1) + PREFETCHW 4 * SIZE(CO2) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L55 + ALIGN_4 + +.L52: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm1 + movapd 2 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm4 + movapd 4 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm5 + movapd 4 * SIZE(AO), %xmm8 + + mulpd %xmm8, %xmm9 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm0 + movapd 4 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm1 + movapd 6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm4 + movapd 32 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm5 + movapd 32 * SIZE(AO), %xmm8 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm10, %xmm11 + mulpd 10 * SIZE(BO), %xmm10 + addpd %xmm11, %xmm0 + movapd 8 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm1 + movapd 10 * SIZE(AO), %xmm10 + mulpd %xmm10, %xmm11 + mulpd 10 * SIZE(BO), %xmm10 + addpd %xmm11, %xmm4 + movapd 12 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm5 + movapd 12 * SIZE(AO), %xmm10 + + mulpd %xmm10, %xmm11 + mulpd 14 * SIZE(BO), %xmm10 + addpd %xmm11, %xmm0 + movapd 12 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm1 + movapd 14 * SIZE(AO), %xmm10 + mulpd %xmm10, %xmm11 + mulpd 14 * SIZE(BO), %xmm10 + addpd %xmm11, %xmm4 + movapd 40 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm5 + movapd 40 * SIZE(AO), %xmm10 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + mulpd %xmm12, %xmm13 + mulpd 18 * SIZE(BO), %xmm12 + addpd %xmm13, %xmm0 + movapd 16 * SIZE(BO), %xmm13 + addpd %xmm12, %xmm1 + movapd 18 * SIZE(AO), %xmm12 + mulpd %xmm12, %xmm13 + mulpd 18 * SIZE(BO), %xmm12 + addpd %xmm13, %xmm4 + movapd 20 * SIZE(BO), %xmm13 + addpd %xmm12, %xmm5 + movapd 20 * SIZE(AO), %xmm12 + + mulpd %xmm12, %xmm13 + mulpd 22 * SIZE(BO), %xmm12 + addpd %xmm13, %xmm0 + movapd 20 * SIZE(BO), %xmm13 + addpd %xmm12, %xmm1 + movapd 22 * SIZE(AO), %xmm12 + mulpd %xmm12, %xmm13 + mulpd 22 * SIZE(BO), %xmm12 + addpd %xmm13, %xmm4 + movapd 48 * SIZE(BO), %xmm13 + addpd %xmm12, %xmm5 + movapd 48 * SIZE(AO), %xmm12 + + PREFETCH (PREFETCHSIZE + 24) * SIZE(AO) + mulpd %xmm14, %xmm15 + mulpd 26 * SIZE(BO), %xmm14 + addpd %xmm15, %xmm0 + movapd 24 * SIZE(BO), %xmm15 + addpd %xmm14, %xmm1 + movapd 26 * SIZE(AO), %xmm14 + mulpd %xmm14, %xmm15 + mulpd 26 * SIZE(BO), %xmm14 + addpd %xmm15, %xmm4 + movapd 28 * SIZE(BO), %xmm15 + addpd %xmm14, %xmm5 + movapd 28 * SIZE(AO), %xmm14 + + mulpd %xmm14, %xmm15 + mulpd 30 * SIZE(BO), %xmm14 + addpd %xmm15, %xmm0 + movapd 28 * SIZE(BO), %xmm15 + addpd %xmm14, %xmm1 + movapd 30 * SIZE(AO), %xmm14 + mulpd %xmm14, %xmm15 + mulpd 30 * SIZE(BO), %xmm14 + addpd %xmm15, %xmm4 + movapd 56 * SIZE(BO), %xmm15 + addpd %xmm14, %xmm5 + movapd 56 * SIZE(AO), %xmm14 + + addq $32 * SIZE, AO 
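+	# end of one 8-way unrolled pass: A and the duplicated-B buffer each advance by 32 doubles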
+ addq $32 * SIZE, BO + decq %rax + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L59 + ALIGN_4 + +.L56: + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm1 + movapd 2 * SIZE(AO), %xmm8 + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm5 + movapd 4 * SIZE(AO), %xmm8 + + addq $4 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L56 + ALIGN_4 + +.L59: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd %xmm4, %xmm12 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm12 + + movapd 0 * SIZE(B), %xmm1 + movapd 2 * SIZE(B), %xmm5 + movapd 4 * SIZE(B), %xmm9 + movapd 6 * SIZE(B), %xmm13 + + subpd %xmm0, %xmm1 + subpd %xmm8, %xmm5 + subpd %xmm4, %xmm9 + subpd %xmm12, %xmm13 +#else + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm9 + movapd 4 * SIZE(AO), %xmm10 + movapd 6 * SIZE(AO), %xmm11 + + subpd %xmm0, %xmm8 + subpd %xmm4, %xmm9 + subpd %xmm1, %xmm10 + subpd %xmm5, %xmm11 +#endif + +#ifdef LN + movlpd 15 * SIZE(AO), %xmm0 + movhpd 15 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm13 + movlpd 14 * SIZE(AO), %xmm2 + movhpd 14 * SIZE(AO), %xmm2 + mulpd %xmm13, %xmm2 + subpd %xmm2, %xmm9 + movlpd 13 * SIZE(AO), %xmm4 + movhpd 13 * SIZE(AO), %xmm4 + mulpd %xmm13, %xmm4 + subpd %xmm4, %xmm5 + movlpd 12 * SIZE(AO), %xmm6 + movhpd 12 * SIZE(AO), %xmm6 + mulpd %xmm13, %xmm6 + subpd %xmm6, %xmm1 + + movlpd 10 * SIZE(AO), %xmm0 + movhpd 10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm9 + movlpd 9 * SIZE(AO), %xmm2 + movhpd 9 * SIZE(AO), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm5 + movlpd 8 * SIZE(AO), %xmm4 + movhpd 8 * SIZE(AO), %xmm4 + mulpd %xmm9, %xmm4 + subpd %xmm4, %xmm1 + + movlpd 5 * SIZE(AO), %xmm0 + movhpd 5 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + movlpd 4 * SIZE(AO), %xmm2 + movhpd 4 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm1 + + movlpd 0 * SIZE(AO), %xmm0 + movhpd 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 +#endif + +#ifdef LT + movlpd 0 * SIZE(AO), %xmm0 + movhpd 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + movlpd 1 * SIZE(AO), %xmm2 + movhpd 1 * SIZE(AO), %xmm2 + mulpd %xmm1, %xmm2 + subpd %xmm2, %xmm5 + movlpd 2 * SIZE(AO), %xmm4 + movhpd 2 * SIZE(AO), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm9 + movlpd 3 * SIZE(AO), %xmm6 + movhpd 3 * SIZE(AO), %xmm6 + mulpd %xmm1, %xmm6 + subpd %xmm6, %xmm13 + + + movlpd 5 * SIZE(AO), %xmm0 + movhpd 5 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + + movlpd 6 * SIZE(AO), %xmm2 + movhpd 6 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm9 + movlpd 7 * SIZE(AO), %xmm4 + movhpd 7 * SIZE(AO), %xmm4 + mulpd %xmm5, %xmm4 + subpd %xmm4, %xmm13 + + movlpd 10 * SIZE(AO), %xmm0 + movhpd 10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm9 + movlpd 11 * SIZE(AO), %xmm2 + movhpd 11 * SIZE(AO), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm13 + + movlpd 15 * SIZE(AO), %xmm0 + movhpd 15 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm13 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 + + movlpd 1 * 
SIZE(B), %xmm1 + movhpd 1 * SIZE(B), %xmm1 + mulpd %xmm8, %xmm1 + subpd %xmm1, %xmm10 + movlpd 1 * SIZE(B), %xmm1 + movhpd 1 * SIZE(B), %xmm1 + mulpd %xmm9, %xmm1 + subpd %xmm1, %xmm11 + + movlpd 3 * SIZE(B), %xmm0 + movhpd 3 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 +#endif + +#ifdef RT + movlpd 3 * SIZE(B), %xmm0 + movhpd 3 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 + + movlpd 2 * SIZE(B), %xmm1 + movhpd 2 * SIZE(B), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm8 + movlpd 2 * SIZE(B), %xmm1 + movhpd 2 * SIZE(B), %xmm1 + mulpd %xmm11, %xmm1 + subpd %xmm1, %xmm9 + + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm1, 0 * SIZE(CO1) + movsd %xmm5, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movsd %xmm13, 3 * SIZE(CO1) + + movhpd %xmm1, 0 * SIZE(CO2) + movhpd %xmm5, 1 * SIZE(CO2) + movhpd %xmm9, 2 * SIZE(CO2) + movhpd %xmm13, 3 * SIZE(CO2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) + + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) + movsd %xmm11, 2 * SIZE(CO2) + movhpd %xmm11, 3 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(B) + movapd %xmm5, 2 * SIZE(B) + movapd %xmm9, 4 * SIZE(B) + movapd %xmm13, 6 * SIZE(B) + + movlpd %xmm1, 0 * SIZE(BO) + movlpd %xmm1, 1 * SIZE(BO) + movhpd %xmm1, 2 * SIZE(BO) + movhpd %xmm1, 3 * SIZE(BO) + movlpd %xmm5, 4 * SIZE(BO) + movlpd %xmm5, 5 * SIZE(BO) + movhpd %xmm5, 6 * SIZE(BO) + movhpd %xmm5, 7 * SIZE(BO) + movlpd %xmm9, 8 * SIZE(BO) + movlpd %xmm9, 9 * SIZE(BO) + movhpd %xmm9, 10 * SIZE(BO) + movhpd %xmm9, 11 * SIZE(BO) + movlpd %xmm13, 12 * SIZE(BO) + movlpd %xmm13, 13 * SIZE(BO) + movhpd %xmm13, 14 * SIZE(BO) + movhpd %xmm13, 15 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm9, 2 * SIZE(AO) + movapd %xmm10, 4 * SIZE(AO) + movapd %xmm11, 6 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L51 + ALIGN_4 + +.L60: + testq $2, M + je .L70 + ALIGN_4 + +.L61: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movapd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + + movapd 16 * SIZE(BO), %xmm13 + movapd 24 * SIZE(BO), %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L65 + ALIGN_4 + +.L62: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm0 + movapd 4 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm1 + movapd 2 * SIZE(AO), %xmm8 + + mulpd %xmm8, %xmm9 + mulpd 6 * 
SIZE(BO), %xmm8 + addpd %xmm9, %xmm2 + movapd 32 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm3 + movapd 4 * SIZE(AO), %xmm8 + + mulpd %xmm8, %xmm11 + mulpd 10 * SIZE(BO), %xmm8 + addpd %xmm11, %xmm0 + movapd 12 * SIZE(BO), %xmm11 + addpd %xmm8, %xmm1 + movapd 6 * SIZE(AO), %xmm8 + + mulpd %xmm8, %xmm11 + mulpd 14 * SIZE(BO), %xmm8 + addpd %xmm11, %xmm2 + movapd 40 * SIZE(BO), %xmm11 + addpd %xmm8, %xmm3 + movapd 16 * SIZE(AO), %xmm8 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm10, %xmm13 + mulpd 18 * SIZE(BO), %xmm10 + addpd %xmm13, %xmm0 + movapd 20 * SIZE(BO), %xmm13 + addpd %xmm10, %xmm1 + movapd 10 * SIZE(AO), %xmm10 + + mulpd %xmm10, %xmm13 + mulpd 22 * SIZE(BO), %xmm10 + addpd %xmm13, %xmm2 + movapd 48 * SIZE(BO), %xmm13 + addpd %xmm10, %xmm3 + movapd 12 * SIZE(AO), %xmm10 + + mulpd %xmm10, %xmm15 + mulpd 26 * SIZE(BO), %xmm10 + addpd %xmm15, %xmm0 + movapd 28 * SIZE(BO), %xmm15 + addpd %xmm10, %xmm1 + movapd 14 * SIZE(AO), %xmm10 + + mulpd %xmm10, %xmm15 + mulpd 30 * SIZE(BO), %xmm10 + addpd %xmm15, %xmm2 + movapd 56 * SIZE(BO), %xmm15 + addpd %xmm10, %xmm3 + movapd 24 * SIZE(AO), %xmm10 + + addq $16 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L62 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L69 + ALIGN_4 + +.L66: + mulpd %xmm8, %xmm9 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm0 + movapd 4 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm1 + movapd 2 * SIZE(AO), %xmm8 + + addq $2 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L66 + ALIGN_4 + +.L69: + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd 0 * SIZE(B), %xmm1 + movapd 2 * SIZE(B), %xmm5 + + subpd %xmm0, %xmm1 + subpd %xmm8, %xmm5 +#else + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm10 + + subpd %xmm0, %xmm8 + subpd %xmm1, %xmm10 +#endif + + +#ifdef LN + movlpd 3 * SIZE(AO), %xmm0 + movhpd 3 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + + movlpd 2 * SIZE(AO), %xmm2 + movhpd 2 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm1 + + movlpd 0 * SIZE(AO), %xmm0 + movhpd 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 +#endif + +#ifdef LT + movlpd 0 * SIZE(AO), %xmm0 + movhpd 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + + movlpd 1 * SIZE(AO), %xmm2 + movhpd 1 * SIZE(AO), %xmm2 + mulpd %xmm1, %xmm2 + subpd %xmm2, %xmm5 + + movlpd 3 * SIZE(AO), %xmm0 + movhpd 3 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm8 + + movlpd 1 * SIZE(B), %xmm1 + movhpd 1 * SIZE(B), %xmm1 + mulpd %xmm8, %xmm1 + subpd %xmm1, %xmm10 + + movlpd 3 * SIZE(B), %xmm0 + movhpd 3 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm10 +#endif + +#ifdef RT + movlpd 3 * SIZE(B), %xmm0 + movhpd 3 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm10 + + movlpd 2 * SIZE(B), %xmm1 + movhpd 2 * SIZE(B), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm8 + + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm8 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm1, 0 * SIZE(CO1) + movsd %xmm5, 1 * SIZE(CO1) + + 
movhpd %xmm1, 0 * SIZE(CO2) + movhpd %xmm5, 1 * SIZE(CO2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(B) + movapd %xmm5, 2 * SIZE(B) + + movlpd %xmm1, 0 * SIZE(BO) + movlpd %xmm1, 1 * SIZE(BO) + movhpd %xmm1, 2 * SIZE(BO) + movhpd %xmm1, 3 * SIZE(BO) + movlpd %xmm5, 4 * SIZE(BO) + movlpd %xmm5, 5 * SIZE(BO) + movhpd %xmm5, 6 * SIZE(BO) + movhpd %xmm5, 7 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm10, 2 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L70: + testq $1, M + je .L79 + ALIGN_4 + +.L71: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movsd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movsd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movsd 4 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movsd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + + movsd 16 * SIZE(BO), %xmm13 + movsd 24 * SIZE(BO), %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L75 + ALIGN_4 + +.L72: + mulsd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulsd 2 * SIZE(BO), %xmm8 + addsd %xmm9, %xmm0 + movsd 4 * SIZE(BO), %xmm9 + addsd %xmm8, %xmm1 + movsd 1 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm9 + mulsd 6 * SIZE(BO), %xmm8 + addsd %xmm9, %xmm2 + movsd 32 * SIZE(BO), %xmm9 + addsd %xmm8, %xmm3 + movsd 2 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm11 + mulsd 10 * SIZE(BO), %xmm8 + addsd %xmm11, %xmm0 + movsd 12 * SIZE(BO), %xmm11 + addsd %xmm8, %xmm1 + movsd 3 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm11 + mulsd 14 * SIZE(BO), %xmm8 + addsd %xmm11, %xmm2 + movsd 40 * SIZE(BO), %xmm11 + addsd %xmm8, %xmm3 + movsd 8 * SIZE(AO), %xmm8 + + mulsd %xmm10, %xmm13 + mulsd 18 * SIZE(BO), %xmm10 + addsd %xmm13, %xmm0 + movsd 20 * SIZE(BO), %xmm13 + addsd %xmm10, %xmm1 + movsd 5 * SIZE(AO), %xmm10 + + mulsd %xmm10, %xmm13 + mulsd 22 * SIZE(BO), %xmm10 + addsd %xmm13, %xmm2 + movsd 48 * SIZE(BO), %xmm13 + addsd %xmm10, %xmm3 + movsd 6 * SIZE(AO), %xmm10 + + mulsd %xmm10, %xmm15 + mulsd 26 * SIZE(BO), %xmm10 + addsd %xmm15, %xmm0 + movsd 28 * SIZE(BO), %xmm15 + addsd %xmm10, %xmm1 + movsd 7 * SIZE(AO), %xmm10 + + mulsd %xmm10, %xmm15 + mulsd 30 * SIZE(BO), %xmm10 + addsd %xmm15, %xmm2 + movsd 56 * SIZE(BO), %xmm15 + addsd %xmm10, %xmm3 + movsd 12 * SIZE(AO), %xmm10 + + addq $ 8 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulsd %xmm8, %xmm9 + mulsd 2 * SIZE(BO), %xmm8 + addsd %xmm9, %xmm0 + addsd %xmm8, %xmm1 + movsd 1 * SIZE(AO), %xmm8 + movsd 4 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # 
boffset1 += 8 + decq %rax + jg .L76 + ALIGN_4 + +.L78: + addsd %xmm2, %xmm0 + addsd %xmm3, %xmm1 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(B), %xmm4 + movsd 1 * SIZE(B), %xmm5 +#else + movsd 0 * SIZE(AO), %xmm4 + movsd 1 * SIZE(AO), %xmm5 +#endif + + subsd %xmm0, %xmm4 + subsd %xmm1, %xmm5 + +#ifdef LN + movsd 0 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 +#endif + +#ifdef LT + movsd 0 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 +#endif + +#ifdef RN + mulsd 0 * SIZE(B), %xmm4 + movsd 1 * SIZE(B), %xmm1 + mulsd %xmm4, %xmm1 + subsd %xmm1, %xmm5 + + mulsd 3 * SIZE(B), %xmm5 +#endif + +#ifdef RT + mulsd 3 * SIZE(B), %xmm5 + + movlpd 2 * SIZE(B), %xmm1 + mulsd %xmm5, %xmm1 + subsd %xmm1, %xmm4 + + mulsd 0 * SIZE(B), %xmm4 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + movsd %xmm4, 0 * SIZE(CO1) + movsd %xmm5, 0 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movsd %xmm4, 0 * SIZE(B) + movsd %xmm5, 1 * SIZE(B) + + movsd %xmm4, 0 * SIZE(BO) + movsd %xmm4, 1 * SIZE(BO) + movsd %xmm5, 2 * SIZE(BO) + movsd %xmm5, 3 * SIZE(BO) +#else + movsd %xmm4, 0 * SIZE(AO) + movsd %xmm5, 1 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $2 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L79: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L80: + testq $1, N + je .L999 + ALIGN_4 + +.L81: +/* Copying to Sub Buffer */ + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + leaq (, %rax, SIZE), %rax + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + jle .L83 + ALIGN_4 + +.L82: + PREFETCH 56 * SIZE(B) + + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm2 + movsd 3 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm4 + movsd 5 * SIZE(B), %xmm5 + movsd 6 * SIZE(B), %xmm6 + movsd 7 * SIZE(B), %xmm7 + + addq $ 8 * SIZE, B + addq $16 * SIZE, BO + + movsd %xmm0, -16 * SIZE(BO) + movsd %xmm0, -15 * SIZE(BO) + movsd %xmm1, -14 * SIZE(BO) + movsd %xmm1, -13 * SIZE(BO) + movsd %xmm2, -12 * SIZE(BO) + movsd %xmm2, -11 * SIZE(BO) + movsd %xmm3, -10 * SIZE(BO) + movsd %xmm3, -9 * SIZE(BO) + movsd %xmm4, -8 * SIZE(BO) + movsd %xmm4, -7 * SIZE(BO) + movsd %xmm5, -6 * SIZE(BO) + movsd %xmm5, -5 * SIZE(BO) + movsd %xmm6, -4 * SIZE(BO) + movsd %xmm6, -3 * SIZE(BO) + movsd %xmm7, -2 * SIZE(BO) + movsd %xmm7, -1 * SIZE(BO) + + decq %rax + jne .L82 + ALIGN_4 
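+/* Tail of the single-column B copy: .L83 takes the k & 7 remainder and .L84
+   copies one value per iteration, duplicated into two adjacent BUFFER slots */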
+ +.L83: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax + BRANCH + jle .L90 + ALIGN_4 + +.L84: + movsd 0 * SIZE(B), %xmm0 + + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm0, 1 * SIZE(BO) + + addq $1 * SIZE, B + addq $2 * SIZE, BO + decq %rax + jne .L84 + ALIGN_4 + +.L90: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + subq LDC, C +#endif + + movq C, CO1 # coffset1 = c +#ifndef RT + addq LDC, C +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L100 + ALIGN_4 + +.L91: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movapd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + + movapd 16 * SIZE(AO), %xmm12 + movapd 24 * SIZE(AO), %xmm14 + + PREFETCHW 4 * SIZE(CO1) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L95 + ALIGN_4 + +.L92: + mulpd %xmm9, %xmm8 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd 2 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm0 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm8 + mulpd 6 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm2 + movapd 32 * SIZE(AO), %xmm8 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + addpd %xmm9, %xmm3 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm10 + mulpd 10 * SIZE(AO), %xmm9 + addpd %xmm10, %xmm0 + movapd 12 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movapd 6 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm10 + mulpd 14 * SIZE(AO), %xmm9 + addpd %xmm10, %xmm2 + movapd 40 * SIZE(AO), %xmm10 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + addpd %xmm9, %xmm3 + movapd 16 * SIZE(BO), %xmm9 + mulpd %xmm11, %xmm12 + mulpd 18 * SIZE(AO), %xmm11 + addpd %xmm12, %xmm0 + movapd 20 * SIZE(AO), %xmm12 + addpd %xmm11, %xmm1 + movapd 10 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm12 + mulpd 22 * SIZE(AO), %xmm11 + addpd %xmm12, %xmm2 + movapd 48 * SIZE(AO), %xmm12 + PREFETCH (PREFETCHSIZE + 24) * SIZE(AO) + addpd %xmm11, %xmm3 + movapd 12 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm14 + mulpd 26 * SIZE(AO), %xmm11 + addpd %xmm14, %xmm0 + movapd 28 * SIZE(AO), %xmm14 + addpd %xmm11, %xmm1 + movapd 14 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm14 + mulpd 30 * SIZE(AO), %xmm11 + addpd %xmm14, %xmm2 + movapd 56 * SIZE(AO), %xmm14 + addpd %xmm11, %xmm3 + movapd 24 * SIZE(BO), %xmm11 + + addq $32 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L92 + ALIGN_4 + +.L95: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L99 + ALIGN_4 + +.L96: + mulpd %xmm9, %xmm8 + mulpd 2 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm0 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movapd 2 * SIZE(BO), %xmm9 + + addq $4 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L96 + ALIGN_4 + +.L99: + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), 
B + leaq (BO, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm2 + movapd 2 * SIZE(B), %xmm3 + + subpd %xmm0, %xmm2 + subpd %xmm1, %xmm3 +#else + movapd 0 * SIZE(AO), %xmm2 + movapd 2 * SIZE(AO), %xmm3 + + subpd %xmm0, %xmm2 + subpd %xmm1, %xmm3 +#endif + +#ifdef LN + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movapd %xmm3, %xmm1 + unpckhpd %xmm1, %xmm1 + + movsd 15 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm1 + + movsd 14 * SIZE(AO), %xmm5 + mulsd %xmm1, %xmm5 + subsd %xmm5, %xmm3 + movsd 13 * SIZE(AO), %xmm6 + mulsd %xmm1, %xmm6 + subsd %xmm6, %xmm0 + movsd 12 * SIZE(AO), %xmm7 + mulsd %xmm1, %xmm7 + subsd %xmm7, %xmm2 + + movsd 10 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm3 + + movsd 9 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm0 + movsd 8 * SIZE(AO), %xmm6 + mulsd %xmm3, %xmm6 + subsd %xmm6, %xmm2 + + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 4 * SIZE(AO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm3 +#endif + +#ifdef LT + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movapd %xmm3, %xmm1 + unpckhpd %xmm1, %xmm1 + + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 + + movsd 1 * SIZE(AO), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + movsd 2 * SIZE(AO), %xmm6 + mulsd %xmm2, %xmm6 + subsd %xmm6, %xmm3 + movsd 3 * SIZE(AO), %xmm7 + mulsd %xmm2, %xmm7 + subsd %xmm7, %xmm1 + + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm3 + movsd 7 * SIZE(AO), %xmm6 + mulsd %xmm0, %xmm6 + subsd %xmm6, %xmm1 + + movsd 10 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm3 + + movsd 11 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm1 + + movsd 15 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm1 + + unpcklpd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm3 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 +#endif + +#ifdef RT + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 1 * SIZE(CO1) + movsd %xmm3, 2 * SIZE(CO1) + movhpd %xmm3, 3 * SIZE(CO1) +#else + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 1 * SIZE(CO1) + movsd %xmm3, 2 * SIZE(CO1) + movhpd %xmm3, 3 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(B) + movapd %xmm3, 2 * SIZE(B) + + movlpd %xmm2, 0 * SIZE(BO) + movlpd %xmm2, 1 * SIZE(BO) + movhpd %xmm2, 2 * SIZE(BO) + movhpd %xmm2, 3 * SIZE(BO) + movlpd %xmm3, 4 * SIZE(BO) + movlpd %xmm3, 5 * SIZE(BO) + movhpd %xmm3, 6 * SIZE(BO) + movhpd %xmm3, 7 * SIZE(BO) +#else + movapd %xmm2, 0 * SIZE(AO) + movapd %xmm3, 2 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L91 + ALIGN_4 + +.L100: + testq $2, M + je .L110 + ALIGN_4 + +.L101: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + leaq 
BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movapd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L105 + ALIGN_4 + +.L102: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movapd 2 * SIZE(AO), %xmm8 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm0 + movapd 16 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm1 + movapd 4 * SIZE(AO), %xmm8 + mulpd 4 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm2 + movapd 6 * SIZE(AO), %xmm8 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm3 + movapd 16 * SIZE(AO), %xmm8 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd %xmm10, %xmm11 + movapd 10 * SIZE(AO), %xmm10 + mulpd 10 * SIZE(BO), %xmm10 + addpd %xmm11, %xmm0 + movapd 24 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm1 + movapd 12 * SIZE(AO), %xmm10 + mulpd 12 * SIZE(BO), %xmm10 + addpd %xmm10, %xmm2 + movapd 14 * SIZE(AO), %xmm10 + mulpd 14 * SIZE(BO), %xmm10 + addpd %xmm10, %xmm3 + movapd 24 * SIZE(AO), %xmm10 + + addq $16 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L102 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L109 + ALIGN_4 + +.L106: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movapd 2 * SIZE(AO), %xmm8 + movapd 2 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L106 + ALIGN_4 + +.L109: + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm2 + subpd %xmm0, %xmm2 +#else + movapd 0 * SIZE(AO), %xmm2 + subpd %xmm0, %xmm2 +#endif + +#ifdef LN + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movsd 3 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 2 * SIZE(AO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm0, %xmm2 +#endif + +#ifdef LT + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 + + movsd 1 * SIZE(AO), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + + movsd 3 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm0 + + unpcklpd %xmm0, %xmm2 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef RT + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 1 * SIZE(CO1) +#else + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 1 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(B) + + movlpd %xmm2, 0 * SIZE(BO) + movlpd %xmm2, 1 * SIZE(BO) + movhpd %xmm2, 2 * SIZE(BO) + movhpd %xmm2, 3 * SIZE(BO) +#else + movapd %xmm2, 0 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $2 * SIZE, B 
+#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L110: + testq $1, M + je .L119 + ALIGN_4 + +.L111: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movsd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movsd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movsd 4 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movsd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L115 + ALIGN_4 + +.L112: + mulsd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movsd 1 * SIZE(AO), %xmm8 + addsd %xmm9, %xmm0 + movsd 16 * SIZE(BO), %xmm9 + mulsd 2 * SIZE(BO), %xmm8 + addsd %xmm8, %xmm1 + movsd 2 * SIZE(AO), %xmm8 + mulsd 4 * SIZE(BO), %xmm8 + addsd %xmm8, %xmm2 + movsd 3 * SIZE(AO), %xmm8 + mulsd 6 * SIZE(BO), %xmm8 + addsd %xmm8, %xmm3 + movsd 8 * SIZE(AO), %xmm8 + mulsd %xmm10, %xmm11 + movsd 5 * SIZE(AO), %xmm10 + addsd %xmm11, %xmm0 + movsd 24 * SIZE(BO), %xmm11 + mulsd 10 * SIZE(BO), %xmm10 + addsd %xmm10, %xmm1 + movsd 6 * SIZE(AO), %xmm10 + mulsd 12 * SIZE(BO), %xmm10 + addsd %xmm10, %xmm2 + movsd 7 * SIZE(AO), %xmm10 + mulsd 14 * SIZE(BO), %xmm10 + addsd %xmm10, %xmm3 + movsd 12 * SIZE(AO), %xmm10 + + addq $ 8 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L112 + ALIGN_4 + +.L115: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulsd %xmm8, %xmm9 + movsd 1 * SIZE(AO), %xmm8 + addsd %xmm9, %xmm0 + movsd 2 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L116 + ALIGN_4 + +.L118: + addsd %xmm2, %xmm0 + addsd %xmm3, %xmm1 + addsd %xmm1, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(B), %xmm2 + subsd %xmm0, %xmm2 +#else + movsd 0 * SIZE(AO), %xmm2 + subsd %xmm0, %xmm2 +#endif + +#ifdef LN + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 +#endif + +#ifdef LT + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 +#endif + +#ifdef RN + movsd 0 * SIZE(B), %xmm0 + mulsd %xmm0, %xmm2 +#endif + +#ifdef RT + movsd 0 * SIZE(B), %xmm0 + mulsd %xmm0, %xmm2 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) +#else + movsd %xmm2, 0 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(B) + + movlpd %xmm2, 0 * SIZE(BO) + movlpd %xmm2, 1 * SIZE(BO) +#else + movsd %xmm2, 0 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $1 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq 
$0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L119: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 1), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 1), B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + + +.L999: + movq %rbx, %rsp + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/trsm_kernel_LT_4x4_sse3.S b/kernel/x86_64/trsm_kernel_LT_4x4_sse3.S new file mode 100644 index 0000000000..266f44243e --- /dev/null +++ b/kernel/x86_64/trsm_kernel_LT_4x4_sse3.S @@ -0,0 +1,3856 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %rdi +#define N %rsi +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %r13 +#define BO %r14 +#define CO1 %r15 +#define CO2 %rbx +#define KK %rbp +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define J 56(%rsp) +#define KKK 64(%rsp) +#define AORIG 72(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define J 232(%rsp) +#define KKK 240(%rsp) +#define AORIG 248(%rsp) + +#endif + +#define PREFETCH prefetcht1 +#define PREFETCHSIZE (16 * 12 + 3) +#define PREFETCH_R (4 * 4 + 0) + +#define KERNEL1(address) \ + mulpd %xmm8, %xmm9 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 2 * SIZE(AO);\ + addpd %xmm9, %xmm0;\ + movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm1;\ + movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm2;\ + movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 2 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + addpd %xmm9, %xmm3;\ + movddup 0 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL2(address) \ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm4;\ + movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm5;\ + movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm6;\ + movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 4 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + addpd %xmm9, %xmm7;\ + movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL3(address) \ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm0;\ + movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm1;\ + movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm2;\ + movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 6 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + addpd %xmm9, %xmm3;\ + movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL4(address) \ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm4;\ + movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm5;\ + movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm6;\ + movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 32 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + addpd %xmm9, %xmm7;\ + movddup 32 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL5(address) \ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm0;\ + movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm1;\ + movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm2;\ + movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 10 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + addpd %xmm11, %xmm3;\ + movddup 8 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL6(address) \ 
+ mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm4;\ + movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm5;\ + movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm6;\ + movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 12 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + addpd %xmm11, %xmm7;\ + movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL7(address) \ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm0;\ + movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm1;\ + movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm2;\ + movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 14 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + addpd %xmm11, %xmm3;\ + movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL8(address) \ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm4;\ + movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm5;\ + movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm6;\ + movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 40 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + addpd %xmm11, %xmm7;\ + movddup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL9(address) \ + mulpd %xmm12, %xmm13;\ + PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 2 * SIZE(AO);\ + addpd %xmm13, %xmm0;\ + movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm1;\ + movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm2;\ + movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 18 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + addpd %xmm13, %xmm3;\ + movddup 16 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL10(address) \ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm4;\ + movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm5;\ + movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm6;\ + movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 20 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + addpd %xmm13, %xmm7;\ + movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL11(address) \ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm0;\ + movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm1;\ + movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm2;\ + movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 22 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + addpd %xmm13, %xmm3;\ + movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL12(address) \ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm4;\ + movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm5;\ + movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm6;\ + movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 48 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + addpd %xmm13, %xmm7;\ + movddup 48 * 
SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL13(address) \ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm0;\ + movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm1;\ + movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm2;\ + movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 26 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + addpd %xmm15, %xmm3;\ + movddup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL14(address) \ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm4;\ + movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm5;\ + movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm6;\ + movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 28 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + addpd %xmm15, %xmm7;\ + movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL15(address) \ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm0;\ + movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm1;\ + movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm2;\ + movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 30 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + addpd %xmm15, %xmm3;\ + movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL16(address) \ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm4;\ + movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm5;\ + movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm6;\ + movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 56 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + addpd %xmm15, %xmm7;\ + movddup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, M + movq ARG2, N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C +#endif + + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + movq KK, OFFSET + + leaq (, LDC, SIZE), LDC + +#ifdef LN + leaq (, M, SIZE), %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + leaq (, N, SIZE), %rax + imulq K, %rax + addq %rax, B + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + movq N, J + sarq $2, J # j = (n >> 2) + jle .L40 + ALIGN_4 + +.L10: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 4), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + movq K, %rax + salq 
$BASE_SHIFT + 2, %rax + leaq (B, %rax), BB + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + prefetcht0 0 * SIZE(BB) + subq $-8 * SIZE, BB + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + + movapd 16 * SIZE(AO), %xmm12 + movddup 16 * SIZE(BO), %xmm13 + movapd 24 * SIZE(AO), %xmm14 + movddup 24 * SIZE(BO), %xmm15 + + prefetchnta 4 * SIZE(CO1) + pxor %xmm4, %xmm4 + prefetchnta 4 * SIZE(CO2) + pxor %xmm5, %xmm5 + prefetchnta 4 * SIZE(CO1, LDC, 2) + pxor %xmm6, %xmm6 + prefetchnta 4 * SIZE(CO2, LDC, 2) + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + +#if 1 + andq $-8, %rax + salq $4, %rax + je .L15 +.L1X: + KERNEL1 (16 * 0) + KERNEL2 (16 * 0) + KERNEL3 (16 * 0) + KERNEL4 (16 * 0) + KERNEL5 (16 * 0) + KERNEL6 (16 * 0) + KERNEL7 (16 * 0) + KERNEL8 (16 * 0) + KERNEL9 (16 * 0) + KERNEL10(16 * 0) + KERNEL11(16 * 0) + KERNEL12(16 * 0) + KERNEL13(16 * 0) + KERNEL14(16 * 0) + KERNEL15(16 * 0) + KERNEL16(16 * 0) + cmpq $128 * 1, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 1) + KERNEL2 (16 * 1) + KERNEL3 (16 * 1) + KERNEL4 (16 * 1) + KERNEL5 (16 * 1) + KERNEL6 (16 * 1) + KERNEL7 (16 * 1) + KERNEL8 (16 * 1) + KERNEL9 (16 * 1) + KERNEL10(16 * 1) + KERNEL11(16 * 1) + KERNEL12(16 * 1) + KERNEL13(16 * 1) + KERNEL14(16 * 1) + KERNEL15(16 * 1) + KERNEL16(16 * 1) + cmpq $128 * 2, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 2) + KERNEL2 (16 * 2) + KERNEL3 (16 * 2) + KERNEL4 (16 * 2) + KERNEL5 (16 * 2) + KERNEL6 (16 * 2) + KERNEL7 (16 * 2) + KERNEL8 (16 * 2) + KERNEL9 (16 * 2) + KERNEL10(16 * 2) + KERNEL11(16 * 2) + KERNEL12(16 * 2) + KERNEL13(16 * 2) + KERNEL14(16 * 2) + KERNEL15(16 * 2) + KERNEL16(16 * 2) + cmpq $128 * 3, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 3) + KERNEL2 (16 * 3) + KERNEL3 (16 * 3) + KERNEL4 (16 * 3) + KERNEL5 (16 * 3) + KERNEL6 (16 * 3) + KERNEL7 (16 * 3) + KERNEL8 (16 * 3) + KERNEL9 (16 * 3) + KERNEL10(16 * 3) + KERNEL11(16 * 3) + KERNEL12(16 * 3) + KERNEL13(16 * 3) + KERNEL14(16 * 3) + KERNEL15(16 * 3) + KERNEL16(16 * 3) + cmpq $128 * 4, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 4) + KERNEL2 (16 * 4) + KERNEL3 (16 * 4) + KERNEL4 (16 * 4) + KERNEL5 (16 * 4) + KERNEL6 (16 * 4) + KERNEL7 (16 * 4) + KERNEL8 (16 * 4) + KERNEL9 (16 * 4) + KERNEL10(16 * 4) + KERNEL11(16 * 4) + KERNEL12(16 * 4) + KERNEL13(16 * 4) + KERNEL14(16 * 4) + KERNEL15(16 * 4) + KERNEL16(16 * 4) + cmpq $128 * 5, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 5) + KERNEL2 (16 * 5) + KERNEL3 (16 * 5) + KERNEL4 (16 * 5) + KERNEL5 (16 * 5) + KERNEL6 (16 * 5) + KERNEL7 (16 * 5) + KERNEL8 (16 * 5) + KERNEL9 (16 * 5) + KERNEL10(16 * 5) + KERNEL11(16 * 5) + KERNEL12(16 * 5) + KERNEL13(16 * 5) + KERNEL14(16 * 5) + KERNEL15(16 * 5) + KERNEL16(16 * 5) + cmpq $128 * 6, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 6) + KERNEL2 (16 * 6) + KERNEL3 (16 * 6) + KERNEL4 (16 * 6) + KERNEL5 (16 * 6) + KERNEL6 (16 * 6) + KERNEL7 (16 * 6) + KERNEL8 (16 * 6) + KERNEL9 (16 * 6) + KERNEL10(16 * 6) + KERNEL11(16 * 6) + KERNEL12(16 * 6) + KERNEL13(16 * 6) + KERNEL14(16 * 6) + KERNEL15(16 * 6) + 
KERNEL16(16 * 6) + cmpq $128 * 7, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 7) + KERNEL2 (16 * 7) + KERNEL3 (16 * 7) + KERNEL4 (16 * 7) + KERNEL5 (16 * 7) + KERNEL6 (16 * 7) + KERNEL7 (16 * 7) + KERNEL8 (16 * 7) + KERNEL9 (16 * 7) + KERNEL10(16 * 7) + KERNEL11(16 * 7) + KERNEL12(16 * 7) + KERNEL13(16 * 7) + KERNEL14(16 * 7) + KERNEL15(16 * 7) + KERNEL16(16 * 7) + + addq $32 * 8 * SIZE, AO + addq $32 * 8 * SIZE, BO + subq $128 * 8, %rax + jg .L1X + +.L12: + leaq (AO, %rax, 2), AO # * 16 + leaq (BO, %rax, 2), BO # * 64 +#else + sarq $3, %rax + je .L15 + ALIGN_4 + +.L12: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm5 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm6 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm7 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 6 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm5 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm6 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 32 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm7 + + movddup 32 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 10 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + + movddup 8 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm4 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm5 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm6 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 12 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm7 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 14 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm4 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm5 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm6 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 40 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm7 + movddup 40 * SIZE(BO), %xmm11 + + mulpd %xmm12, %xmm13 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + addpd %xmm13, %xmm0 + movddup 17 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm1 + movddup 18 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm2 + movddup 19 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + movapd 
18 * SIZE(AO), %xmm12 + addpd %xmm13, %xmm3 + + movddup 16 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm4 + movddup 17 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm5 + movddup 18 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm6 + movddup 19 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + movapd 20 * SIZE(AO), %xmm12 + addpd %xmm13, %xmm7 + + movddup 20 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm0 + movddup 21 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm1 + movddup 22 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm2 + movddup 23 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + movapd 22 * SIZE(AO), %xmm12 + addpd %xmm13, %xmm3 + + movddup 20 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm4 + movddup 21 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm5 + movddup 22 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm6 + movddup 23 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + movapd 48 * SIZE(AO), %xmm12 + addpd %xmm13, %xmm7 + movddup 48 * SIZE(BO), %xmm13 + + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm0 + movddup 25 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm1 + movddup 26 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm2 + movddup 27 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + movapd 26 * SIZE(AO), %xmm14 + addpd %xmm15, %xmm3 + + movddup 24 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm4 + movddup 25 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm5 + movddup 26 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm6 + movddup 27 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + movapd 28 * SIZE(AO), %xmm14 + addpd %xmm15, %xmm7 + + movddup 28 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm0 + movddup 29 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm1 + movddup 30 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm2 + movddup 31 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + movapd 30 * SIZE(AO), %xmm14 + addpd %xmm15, %xmm3 + + movddup 28 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm4 + movddup 29 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm5 + movddup 30 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm6 + movddup 31 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + movapd 56 * SIZE(AO), %xmm14 + addpd %xmm15, %xmm7 + movddup 56 * SIZE(BO), %xmm15 + + addq $32 * SIZE, BO + addq $32 * SIZE, AO + decq %rax + BRANCH + jne .L12 +#endif + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L19 + ALIGN_4 + +.L16: + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 0 * SIZE(BO), %xmm11 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm4 + movddup 1 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm5 + movddup 2 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm6 + movddup 3 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm7 + + addq $4 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L16 + ALIGN_4 + +.L19: +#if defined(LN) || defined(RT) + movq KK, %rax + subq $4, 
%rax + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd %xmm2, %xmm10 + unpcklpd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm10 + + movapd %xmm4, %xmm12 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm12 + + movapd %xmm6, %xmm14 + unpcklpd %xmm7, %xmm6 + unpckhpd %xmm7, %xmm14 + + movapd 0 * SIZE(BO), %xmm1 + movapd 2 * SIZE(BO), %xmm3 + movapd 4 * SIZE(BO), %xmm5 + movapd 6 * SIZE(BO), %xmm7 + movapd 8 * SIZE(BO), %xmm9 + movapd 10 * SIZE(BO), %xmm11 + movapd 12 * SIZE(BO), %xmm13 + movapd 14 * SIZE(BO), %xmm15 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm3 + subpd %xmm8, %xmm5 + subpd %xmm10, %xmm7 + subpd %xmm4, %xmm9 + subpd %xmm6, %xmm11 + subpd %xmm12, %xmm13 + subpd %xmm14, %xmm15 +#else + + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm9 + movapd 4 * SIZE(AO), %xmm10 + movapd 6 * SIZE(AO), %xmm11 + + movapd 8 * SIZE(AO), %xmm12 + movapd 10 * SIZE(AO), %xmm13 + movapd 12 * SIZE(AO), %xmm14 + movapd 14 * SIZE(AO), %xmm15 + + subpd %xmm0, %xmm8 + subpd %xmm4, %xmm9 + subpd %xmm1, %xmm10 + subpd %xmm5, %xmm11 + subpd %xmm2, %xmm12 + subpd %xmm6, %xmm13 + subpd %xmm3, %xmm14 + subpd %xmm7, %xmm15 +#endif + + +#ifdef LN + movddup 15 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm13 + mulpd %xmm0, %xmm15 + + movddup 14 * SIZE(AO), %xmm2 + mulpd %xmm13, %xmm2 + subpd %xmm2, %xmm9 + movddup 14 * SIZE(AO), %xmm2 + mulpd %xmm15, %xmm2 + subpd %xmm2, %xmm11 + + movddup 13 * SIZE(AO), %xmm4 + mulpd %xmm13, %xmm4 + subpd %xmm4, %xmm5 + movddup 13 * SIZE(AO), %xmm4 + mulpd %xmm15, %xmm4 + subpd %xmm4, %xmm7 + + movddup 12 * SIZE(AO), %xmm6 + mulpd %xmm13, %xmm6 + subpd %xmm6, %xmm1 + movddup 12 * SIZE(AO), %xmm6 + mulpd %xmm15, %xmm6 + subpd %xmm6, %xmm3 + + movddup 10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm9 + mulpd %xmm0, %xmm11 + + movddup 9 * SIZE(AO), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm5 + movddup 9 * SIZE(AO), %xmm2 + mulpd %xmm11, %xmm2 + subpd %xmm2, %xmm7 + + movddup 8 * SIZE(AO), %xmm4 + mulpd %xmm9, %xmm4 + subpd %xmm4, %xmm1 + movddup 8 * SIZE(AO), %xmm4 + mulpd %xmm11, %xmm4 + subpd %xmm4, %xmm3 + + movddup 5 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + mulpd %xmm0, %xmm7 + + movddup 4 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm1 + movddup 4 * SIZE(AO), %xmm2 + mulpd %xmm7, %xmm2 + subpd %xmm2, %xmm3 + + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm3 +#endif + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm3 + + movddup 1 * SIZE(AO), %xmm2 + mulpd %xmm1, %xmm2 + subpd %xmm2, %xmm5 + movddup 1 * SIZE(AO), %xmm2 + mulpd %xmm3, %xmm2 + subpd %xmm2, %xmm7 + + movddup 2 * SIZE(AO), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm9 + movddup 2 * SIZE(AO), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm11 + + movddup 3 * SIZE(AO), %xmm6 + mulpd %xmm1, %xmm6 + subpd %xmm6, %xmm13 + movddup 3 * SIZE(AO), %xmm6 + mulpd %xmm3, %xmm6 + subpd %xmm6, %xmm15 + + movddup 5 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + mulpd %xmm0, %xmm7 + + movddup 6 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm9 + movddup 6 * SIZE(AO), %xmm2 + mulpd %xmm7, %xmm2 + subpd %xmm2, %xmm11 + + movddup 7 * SIZE(AO), %xmm4 + mulpd %xmm5, %xmm4 + subpd %xmm4, %xmm13 + movddup 7 * SIZE(AO), %xmm4 + mulpd %xmm7, %xmm4 + subpd %xmm4, %xmm15 + + movddup 10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm9 + mulpd %xmm0, %xmm11 + + movddup 11 * SIZE(AO), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm13 + 
movddup 11 * SIZE(AO), %xmm2 + mulpd %xmm11, %xmm2 + subpd %xmm2, %xmm15 + + movddup 15 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm13 + mulpd %xmm0, %xmm15 +#endif + + +#ifdef RN + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 + + movddup 1 * SIZE(BO), %xmm1 + mulpd %xmm8, %xmm1 + subpd %xmm1, %xmm10 + movddup 1 * SIZE(BO), %xmm1 + mulpd %xmm9, %xmm1 + subpd %xmm1, %xmm11 + + movddup 2 * SIZE(BO), %xmm2 + mulpd %xmm8, %xmm2 + subpd %xmm2, %xmm12 + movddup 2 * SIZE(BO), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm13 + + movddup 3 * SIZE(BO), %xmm3 + mulpd %xmm8, %xmm3 + subpd %xmm3, %xmm14 + movddup 3 * SIZE(BO), %xmm3 + mulpd %xmm9, %xmm3 + subpd %xmm3, %xmm15 + + movddup 5 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 + + movddup 6 * SIZE(BO), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm12 + movddup 6 * SIZE(BO), %xmm1 + mulpd %xmm11, %xmm1 + subpd %xmm1, %xmm13 + + movddup 7 * SIZE(BO), %xmm2 + mulpd %xmm10, %xmm2 + subpd %xmm2, %xmm14 + movddup 7 * SIZE(BO), %xmm2 + mulpd %xmm11, %xmm2 + subpd %xmm2, %xmm15 + + movddup 10 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm12 + mulpd %xmm0, %xmm13 + + movddup 11 * SIZE(BO), %xmm1 + mulpd %xmm12, %xmm1 + subpd %xmm1, %xmm14 + movddup 11 * SIZE(BO), %xmm1 + mulpd %xmm13, %xmm1 + subpd %xmm1, %xmm15 + + movddup 15 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm14 + mulpd %xmm0, %xmm15 +#endif + +#ifdef RT + movddup 15 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm14 + mulpd %xmm0, %xmm15 + + movddup 14 * SIZE(BO), %xmm1 + mulpd %xmm14, %xmm1 + subpd %xmm1, %xmm12 + movddup 14 * SIZE(BO), %xmm1 + mulpd %xmm15, %xmm1 + subpd %xmm1, %xmm13 + + movddup 13 * SIZE(BO), %xmm2 + mulpd %xmm14, %xmm2 + subpd %xmm2, %xmm10 + movddup 13 * SIZE(BO), %xmm2 + mulpd %xmm15, %xmm2 + subpd %xmm2, %xmm11 + + movddup 12 * SIZE(BO), %xmm3 + mulpd %xmm14, %xmm3 + subpd %xmm3, %xmm8 + movddup 12 * SIZE(BO), %xmm3 + mulpd %xmm15, %xmm3 + subpd %xmm3, %xmm9 + + movddup 10 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm12 + mulpd %xmm0, %xmm13 + + movddup 9 * SIZE(BO), %xmm1 + mulpd %xmm12, %xmm1 + subpd %xmm1, %xmm10 + movddup 9 * SIZE(BO), %xmm1 + mulpd %xmm13, %xmm1 + subpd %xmm1, %xmm11 + + movddup 8 * SIZE(BO), %xmm2 + mulpd %xmm12, %xmm2 + subpd %xmm2, %xmm8 + movddup 8 * SIZE(BO), %xmm2 + mulpd %xmm13, %xmm2 + subpd %xmm2, %xmm9 + + movddup 5 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 + + movddup 4 * SIZE(BO), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm8 + movddup 4 * SIZE(BO), %xmm1 + mulpd %xmm11, %xmm1 + subpd %xmm1, %xmm9 + + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm1, 0 * SIZE(CO1) + movsd %xmm5, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movsd %xmm13, 3 * SIZE(CO1) + + movhpd %xmm1, 0 * SIZE(CO2) + movhpd %xmm5, 1 * SIZE(CO2) + movhpd %xmm9, 2 * SIZE(CO2) + movhpd %xmm13, 3 * SIZE(CO2) + + movsd %xmm3, 0 * SIZE(CO1, LDC, 2) + movsd %xmm7, 1 * SIZE(CO1, LDC, 2) + movsd %xmm11, 2 * SIZE(CO1, LDC, 2) + movsd %xmm15, 3 * SIZE(CO1, LDC, 2) + + movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 1 * SIZE(CO2, LDC, 2) + movhpd %xmm11, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 3 * SIZE(CO2, LDC, 2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) + + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) + movsd %xmm11, 2 * SIZE(CO2) + movhpd %xmm11, 3 * SIZE(CO2) + + movsd %xmm12, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm12, 
1 * SIZE(CO1, LDC, 2) + movsd %xmm13, 2 * SIZE(CO1, LDC, 2) + movhpd %xmm13, 3 * SIZE(CO1, LDC, 2) + + movsd %xmm14, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm14, 1 * SIZE(CO2, LDC, 2) + movsd %xmm15, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 3 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(BO) + movapd %xmm3, 2 * SIZE(BO) + movapd %xmm5, 4 * SIZE(BO) + movapd %xmm7, 6 * SIZE(BO) + movapd %xmm9, 8 * SIZE(BO) + movapd %xmm11, 10 * SIZE(BO) + movapd %xmm13, 12 * SIZE(BO) + movapd %xmm15, 14 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm9, 2 * SIZE(AO) + movapd %xmm10, 4 * SIZE(AO) + movapd %xmm11, 6 * SIZE(AO) + movapd %xmm12, 8 * SIZE(AO) + movapd %xmm13, 10 * SIZE(AO) + movapd %xmm14, 12 * SIZE(AO) + movapd %xmm15, 14 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L11 + ALIGN_4 + +.L20: + testq $2, M + BRANCH + je .L30 + ALIGN_4 + +.L21: + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L25 + ALIGN_4 + +.L22: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 16 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm2 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 6 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm2 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 16 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm3 + movddup 24 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movddup 17 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm1 + movddup 18 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm2 + movddup 19 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 10 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm3 + movddup 20 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + 
addpd %xmm9, %xmm0 + movddup 21 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm1 + movddup 22 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm2 + movddup 23 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 12 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm3 + movddup 32 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 25 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm1 + movddup 26 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 27 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 14 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + movddup 28 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 29 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm1 + movddup 30 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 31 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 24 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + movddup 40 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L29 + ALIGN_4 + +.L26: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L26 + ALIGN_4 + +.L29: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd %xmm2, %xmm10 + unpcklpd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm10 + + movapd 0 * SIZE(BO), %xmm1 + movapd 2 * SIZE(BO), %xmm3 + movapd 4 * SIZE(BO), %xmm5 + movapd 6 * SIZE(BO), %xmm7 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm3 + subpd %xmm8, %xmm5 + subpd %xmm10, %xmm7 +#else + + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm10 + movapd 4 * SIZE(AO), %xmm12 + movapd 6 * SIZE(AO), %xmm14 + + subpd %xmm0, %xmm8 + subpd %xmm1, %xmm10 + subpd %xmm2, %xmm12 + subpd %xmm3, %xmm14 +#endif + +#ifdef LN + movddup 3 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + mulpd %xmm0, %xmm7 + + movddup 2 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm1 + movddup 2 * SIZE(AO), %xmm2 + mulpd %xmm7, %xmm2 + subpd %xmm2, %xmm3 + + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm3 +#endif + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm3 + + movddup 1 * SIZE(AO), %xmm2 + mulpd %xmm1, %xmm2 + subpd %xmm2, %xmm5 + movddup 1 * SIZE(AO), %xmm2 + mulpd %xmm3, %xmm2 + subpd %xmm2, %xmm7 + + movddup 3 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + mulpd %xmm0, %xmm7 +#endif + +#ifdef RN + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm8 + + movddup 1 * SIZE(BO), %xmm1 + mulpd %xmm8, %xmm1 + subpd %xmm1, %xmm10 + movddup 2 * SIZE(BO), %xmm2 + mulpd %xmm8, %xmm2 + subpd %xmm2, %xmm12 + movddup 3 * SIZE(BO), %xmm3 + mulpd %xmm8, %xmm3 + subpd %xmm3, %xmm14 + + movddup 5 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm10 + movddup 6 * SIZE(BO), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm12 + movddup 7 * SIZE(BO), %xmm2 + 
mulpd %xmm10, %xmm2 + subpd %xmm2, %xmm14 + + movddup 10 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm12 + + movddup 11 * SIZE(BO), %xmm1 + mulpd %xmm12, %xmm1 + subpd %xmm1, %xmm14 + + movddup 15 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm14 +#endif + +#ifdef RT + movddup 15 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm14 + + movddup 14 * SIZE(BO), %xmm1 + mulpd %xmm14, %xmm1 + subpd %xmm1, %xmm12 + movddup 13 * SIZE(BO), %xmm2 + mulpd %xmm14, %xmm2 + subpd %xmm2, %xmm10 + movddup 12 * SIZE(BO), %xmm3 + mulpd %xmm14, %xmm3 + subpd %xmm3, %xmm8 + + movddup 10 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm12 + movddup 9 * SIZE(BO), %xmm1 + mulpd %xmm12, %xmm1 + subpd %xmm1, %xmm10 + movddup 8 * SIZE(BO), %xmm2 + mulpd %xmm12, %xmm2 + subpd %xmm2, %xmm8 + + movddup 5 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm10 + movddup 4 * SIZE(BO), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm8 + + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm8 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm1, 0 * SIZE(CO1) + movsd %xmm5, 1 * SIZE(CO1) + movhpd %xmm1, 0 * SIZE(CO2) + movhpd %xmm5, 1 * SIZE(CO2) + + movsd %xmm3, 0 * SIZE(CO1, LDC, 2) + movsd %xmm7, 1 * SIZE(CO1, LDC, 2) + movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 1 * SIZE(CO2, LDC, 2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) + + movsd %xmm12, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm12, 1 * SIZE(CO1, LDC, 2) + movsd %xmm14, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm14, 1 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(BO) + movapd %xmm3, 2 * SIZE(BO) + movapd %xmm5, 4 * SIZE(BO) + movapd %xmm7, 6 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm10, 2 * SIZE(AO) + movapd %xmm12, 4 * SIZE(AO) + movapd %xmm14, 6 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + testq $1, M + je .L39 + ALIGN_4 + +.L31: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + movddup 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movddup 4 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movapd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L35 + ALIGN_4 + +.L32: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm9, %xmm0 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 1 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movapd 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movapd 16 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movapd 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movddup 3 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movapd 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movapd 14 * SIZE(BO), %xmm11 + mulpd %xmm8, 
%xmm11 + movddup 8 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movapd 24 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movapd 18 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movddup 5 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movapd 20 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movapd 22 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movddup 6 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movapd 32 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movapd 26 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movddup 7 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movapd 28 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movapd 30 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movddup 12 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movapd 40 * SIZE(BO), %xmm11 + + addq $ 8 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 1 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movapd 4 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L36 + ALIGN_4 + +.L38: + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#endif + + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BO), %xmm2 + movapd 2 * SIZE(BO), %xmm3 + + subpd %xmm0, %xmm2 + subpd %xmm1, %xmm3 +#else + movapd 0 * SIZE(AO), %xmm2 + movapd 2 * SIZE(AO), %xmm3 + + subpd %xmm0, %xmm2 + subpd %xmm1, %xmm3 +#endif + +#ifdef LN + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 +#endif + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 +#endif + +#ifdef RN + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movapd %xmm3, %xmm1 + unpckhpd %xmm1, %xmm1 + + movsd 0 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm2 + + movsd 1 * SIZE(BO), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + movsd 2 * SIZE(BO), %xmm6 + mulsd %xmm2, %xmm6 + subsd %xmm6, %xmm3 + movsd 3 * SIZE(BO), %xmm7 + mulsd %xmm2, %xmm7 + subsd %xmm7, %xmm1 + + movsd 5 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 6 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm3 + movsd 7 * SIZE(BO), %xmm6 + mulsd %xmm0, %xmm6 + subsd %xmm6, %xmm1 + + movsd 10 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm3 + + movsd 11 * SIZE(BO), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm1 + + movsd 15 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm1 + + unpcklpd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm3 +#endif + +#ifdef RT + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movapd %xmm3, %xmm1 + unpckhpd %xmm1, %xmm1 + + movsd 15 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm1 + + movsd 14 * SIZE(BO), %xmm5 + mulsd %xmm1, %xmm5 + subsd %xmm5, %xmm3 + movsd 13 * SIZE(BO), %xmm6 + mulsd %xmm1, %xmm6 + subsd %xmm6, %xmm0 + movsd 12 * SIZE(BO), %xmm7 + mulsd %xmm1, %xmm7 + subsd %xmm7, %xmm2 + + movsd 10 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm3 + + movsd 9 * SIZE(BO), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm0 + movsd 8 * SIZE(BO), %xmm6 + mulsd %xmm3, %xmm6 + subsd %xmm6, %xmm2 + + movsd 5 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 4 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + + movsd 0 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd 
%xmm0, %xmm2 + unpcklpd %xmm1, %xmm3 + +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO2) + movsd %xmm3, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) +#else + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO2) + movsd %xmm3, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(BO) + movapd %xmm3, 2 * SIZE(BO) +#else + movapd %xmm2, 0 * SIZE(AO) + movapd %xmm3, 2 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L39: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + + decq J # j -- + jg .L10 + ALIGN_4 + +.L40: + testq $2, N + je .L80 + ALIGN_4 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + movq K, %rax + salq $BASE_SHIFT + 1, %rax + leaq (B, %rax), BB + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + prefetcht0 0 * SIZE(BB) + subq $-4 * SIZE, BB + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm4, %xmm4 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm5, %xmm5 + +#ifdef HAVE_3DNOW + prefetchw 4 * SIZE(CO1) + prefetchw 4 * SIZE(CO2) +#else + prefetchnta 4 * SIZE(CO1) + prefetchnta 4 * SIZE(CO2) +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L55 + ALIGN_4 + +.L52: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm5 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 6 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 16 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm5 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 10 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm4 + 
movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 12 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm5 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 14 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm4 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 40 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm5 + movddup 16 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm11 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + addpd %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 18 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movddup 8 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm4 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 20 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm5 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 22 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm4 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 24 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm5 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 26 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm4 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 28 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm5 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 30 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm4 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 32 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm5 + movddup 24 * SIZE(BO), %xmm11 + + addq $32 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L59 + ALIGN_4 + +.L56: + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 0 * SIZE(BO), %xmm11 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm4 + movddup 1 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm5 + + addq $4 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L56 + ALIGN_4 + +.L59: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $2, %rax +#endif + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd %xmm4, %xmm12 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm12 + + movapd 0 * SIZE(BO), %xmm1 + movapd 2 * SIZE(BO), %xmm5 + movapd 4 * SIZE(BO), %xmm9 + movapd 6 * SIZE(BO), %xmm13 + + subpd %xmm0, %xmm1 + subpd %xmm8, %xmm5 + subpd %xmm4, %xmm9 + subpd %xmm12, %xmm13 +#else + + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm9 + movapd 4 * SIZE(AO), %xmm10 + movapd 6 * SIZE(AO), %xmm11 + + subpd %xmm0, %xmm8 + subpd %xmm4, %xmm9 + subpd %xmm1, %xmm10 + subpd %xmm5, %xmm11 +#endif + + +#ifdef LN + movddup 15 * 
SIZE(AO), %xmm0 + mulpd %xmm0, %xmm13 + + movddup 14 * SIZE(AO), %xmm2 + mulpd %xmm13, %xmm2 + subpd %xmm2, %xmm9 + movddup 13 * SIZE(AO), %xmm4 + mulpd %xmm13, %xmm4 + subpd %xmm4, %xmm5 + movddup 12 * SIZE(AO), %xmm6 + mulpd %xmm13, %xmm6 + subpd %xmm6, %xmm1 + + movddup 10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm9 + movddup 9 * SIZE(AO), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm5 + movddup 8 * SIZE(AO), %xmm4 + mulpd %xmm9, %xmm4 + subpd %xmm4, %xmm1 + + movddup 5 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + movddup 4 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm1 + + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 +#endif + + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + + movddup 1 * SIZE(AO), %xmm2 + mulpd %xmm1, %xmm2 + subpd %xmm2, %xmm5 + movddup 2 * SIZE(AO), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm9 + movddup 3 * SIZE(AO), %xmm6 + mulpd %xmm1, %xmm6 + subpd %xmm6, %xmm13 + + movddup 5 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + + movddup 6 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm9 + movddup 7 * SIZE(AO), %xmm4 + mulpd %xmm5, %xmm4 + subpd %xmm4, %xmm13 + + movddup 10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm9 + + movddup 11 * SIZE(AO), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm13 + + movddup 15 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm13 +#endif + +#ifdef RN + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 + + movddup 1 * SIZE(BO), %xmm1 + mulpd %xmm8, %xmm1 + subpd %xmm1, %xmm10 + movddup 1 * SIZE(BO), %xmm1 + mulpd %xmm9, %xmm1 + subpd %xmm1, %xmm11 + + movddup 3 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 +#endif + +#ifdef RT + movddup 3 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 + + movddup 2 * SIZE(BO), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm8 + movddup 2 * SIZE(BO), %xmm1 + mulpd %xmm11, %xmm1 + subpd %xmm1, %xmm9 + + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm1, 0 * SIZE(CO1) + movsd %xmm5, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movsd %xmm13, 3 * SIZE(CO1) + + movhpd %xmm1, 0 * SIZE(CO2) + movhpd %xmm5, 1 * SIZE(CO2) + movhpd %xmm9, 2 * SIZE(CO2) + movhpd %xmm13, 3 * SIZE(CO2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) + + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) + movsd %xmm11, 2 * SIZE(CO2) + movhpd %xmm11, 3 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(BO) + movapd %xmm5, 2 * SIZE(BO) + movapd %xmm9, 4 * SIZE(BO) + movapd %xmm13, 6 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm9, 2 * SIZE(AO) + movapd %xmm10, 4 * SIZE(AO) + movapd %xmm11, 6 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L51 + ALIGN_4 + +.L60: + testq $2, M + je .L70 + ALIGN_4 + +.L61: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + 
+ movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L65 + ALIGN_4 + +.L62: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 6 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 16 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 16 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 10 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 12 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 14 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 24 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + movddup 24 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L62 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L69 + ALIGN_4 + +.L66: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L66 + ALIGN_4 + +.L69: + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd 0 * SIZE(BO), %xmm1 + movapd 2 * SIZE(BO), %xmm5 + + subpd %xmm0, %xmm1 + subpd %xmm8, %xmm5 +#else + + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm10 + + subpd %xmm0, %xmm8 + subpd %xmm1, %xmm10 +#endif + +#ifdef LN + movddup 3 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + movddup 2 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm1 + + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 +#endif + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + + movddup 1 * SIZE(AO), %xmm2 + mulpd %xmm1, %xmm2 + subpd %xmm2, %xmm5 + + movddup 3 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 +#endif + +#ifdef RN + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm8 + + movddup 1 * SIZE(BO), %xmm1 + mulpd %xmm8, %xmm1 + subpd %xmm1, %xmm10 + + movddup 3 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm10 +#endif + +#ifdef RT + movddup 
3 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm10 + + movddup 2 * SIZE(BO), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm8 + + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm8 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm1, 0 * SIZE(CO1) + movsd %xmm5, 1 * SIZE(CO1) + movhpd %xmm1, 0 * SIZE(CO2) + movhpd %xmm5, 1 * SIZE(CO2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(BO) + movapd %xmm5, 2 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm10, 2 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L70: + testq $1, M + je .L79 + ALIGN_4 + +.L71: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movddup 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movddup 4 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movapd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L75 + ALIGN_4 + +.L72: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movddup 1 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm0 + mulpd 2 * SIZE(BO), %xmm8 + movapd 16 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm1 + movddup 2 * SIZE(AO), %xmm8 + mulpd 4 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm2 + movddup 3 * SIZE(AO), %xmm8 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm3 + movddup 8 * SIZE(AO), %xmm8 + mulpd %xmm10, %xmm11 + movddup 5 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm0 + mulpd 10 * SIZE(BO), %xmm10 + movapd 24 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm1 + movddup 6 * SIZE(AO), %xmm10 + mulpd 12 * SIZE(BO), %xmm10 + addpd %xmm10, %xmm2 + movddup 7 * SIZE(AO), %xmm10 + mulpd 14 * SIZE(BO), %xmm10 + addpd %xmm10, %xmm3 + movddup 12 * SIZE(AO), %xmm10 + + addq $ 8 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulpd %xmm8, %xmm9 + movddup 1 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm0 + movapd 2 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L76 + ALIGN_4 + +.L78: + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BO), %xmm2 + subpd %xmm0, %xmm2 +#else + movapd 0 * SIZE(AO), %xmm2 + subpd %xmm0, %xmm2 +#endif + +#ifdef LN + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef RN + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + 
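+/* Descriptive note (not in the imported source): the two column results were split into scalars above; the loads below walk the 2x2 triangular block of B element by element, scale by the stored diagonal entries, eliminate the off-diagonal term, and repack with unpcklpd. The RT path that follows mirrors this in reverse order. */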
+ movsd 0 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm2 + + movsd 1 * SIZE(BO), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + + movsd 3 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm0 + + unpcklpd %xmm0, %xmm2 +#endif + +#ifdef RT + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movsd 3 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 2 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + + movsd 0 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm0, %xmm2 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO2) +#else + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(BO) +#else + movapd %xmm2, 0 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L79: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L80: + testq $1, N + je .L999 + ALIGN_4 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + + + movq C, CO1 +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L100 + ALIGN_4 + +.L91: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 4 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#ifdef HAVE_3DNOW + prefetchw 4 * SIZE(CO1) +#else + prefetchnta 4 * SIZE(CO1) +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L95 + ALIGN_4 + +.L92: + mulpd %xmm9, %xmm8 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd 2 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm0 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm8 + mulpd 6 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm2 + movapd 16 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm10 + mulpd 10 * SIZE(AO), %xmm9 + addpd %xmm10, %xmm0 + movapd 12 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm10 + mulpd 14 * SIZE(AO), %xmm9 + addpd %xmm10, %xmm2 + movapd 24 * SIZE(AO), %xmm10 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + addpd %xmm9, %xmm3 + movddup 8 * SIZE(BO), %xmm9 + mulpd %xmm11, %xmm8 + mulpd 18 * SIZE(AO), %xmm11 + addpd %xmm8, %xmm0 + movapd 20 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movddup 5 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm8 + mulpd 22 * SIZE(AO), %xmm11 + addpd %xmm8, %xmm2 + movapd 32 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm3 + movddup 6 * 
SIZE(BO), %xmm11 + mulpd %xmm11, %xmm10 + mulpd 26 * SIZE(AO), %xmm11 + addpd %xmm10, %xmm0 + movapd 28 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movddup 7 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm10 + mulpd 30 * SIZE(AO), %xmm11 + addpd %xmm10, %xmm2 + movapd 40 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + + addq $32 * SIZE, AO + addq $8 * SIZE, BO + decq %rax + jne .L92 + ALIGN_4 + +.L95: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L99 + ALIGN_4 + +.L96: + mulpd %xmm9, %xmm8 + mulpd 2 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm0 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 1 * SIZE(BO), %xmm9 + + addq $4 * SIZE, AO # aoffset += 4 + addq $1 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L96 + ALIGN_4 + +.L99: + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $1, %rax +#endif + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BO), %xmm2 + movapd 2 * SIZE(BO), %xmm3 + + subpd %xmm0, %xmm2 + subpd %xmm1, %xmm3 +#else + movapd 0 * SIZE(AO), %xmm2 + movapd 2 * SIZE(AO), %xmm3 + + subpd %xmm0, %xmm2 + subpd %xmm1, %xmm3 +#endif + +#ifdef LN + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movapd %xmm3, %xmm1 + unpckhpd %xmm1, %xmm1 + + movsd 15 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm1 + + movsd 14 * SIZE(AO), %xmm5 + mulsd %xmm1, %xmm5 + subsd %xmm5, %xmm3 + movsd 13 * SIZE(AO), %xmm6 + mulsd %xmm1, %xmm6 + subsd %xmm6, %xmm0 + movsd 12 * SIZE(AO), %xmm7 + mulsd %xmm1, %xmm7 + subsd %xmm7, %xmm2 + + movsd 10 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm3 + + movsd 9 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm0 + movsd 8 * SIZE(AO), %xmm6 + mulsd %xmm3, %xmm6 + subsd %xmm6, %xmm2 + + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 4 * SIZE(AO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm3 +#endif + +#ifdef LT + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movapd %xmm3, %xmm1 + unpckhpd %xmm1, %xmm1 + + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 + + movsd 1 * SIZE(AO), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + movsd 2 * SIZE(AO), %xmm6 + mulsd %xmm2, %xmm6 + subsd %xmm6, %xmm3 + movsd 3 * SIZE(AO), %xmm7 + mulsd %xmm2, %xmm7 + subsd %xmm7, %xmm1 + + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm3 + movsd 7 * SIZE(AO), %xmm6 + mulsd %xmm0, %xmm6 + subsd %xmm6, %xmm1 + + movsd 10 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm3 + + movsd 11 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm1 + + movsd 15 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm1 + + unpcklpd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm3 +#endif + +#ifdef RN + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 +#endif + +#ifdef RT + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 1 * SIZE(CO1) + movsd %xmm3, 2 * SIZE(CO1) + movhpd %xmm3, 3 * SIZE(CO1) +#else + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 1 * SIZE(CO1) + movsd %xmm3, 2 * SIZE(CO1) + movhpd %xmm3, 3 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(BO) + movapd %xmm3, 2 * 
SIZE(BO) +#else + movapd %xmm2, 0 * SIZE(AO) + movapd %xmm3, 2 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L91 + ALIGN_4 + +.L100: + testq $2, M + je .L110 + ALIGN_4 + +.L101: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 4 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L105 + ALIGN_4 + +.L102: + mulpd %xmm9, %xmm8 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movddup 1 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm0 + mulpd 2 * SIZE(AO), %xmm9 + movapd 16 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd 4 * SIZE(AO), %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd 6 * SIZE(AO), %xmm9 + addpd %xmm9, %xmm3 + movddup 8 * SIZE(BO), %xmm9 + mulpd %xmm11, %xmm10 + movddup 5 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm0 + mulpd 10 * SIZE(AO), %xmm11 + movapd 24 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movddup 6 * SIZE(BO), %xmm11 + mulpd 12 * SIZE(AO), %xmm11 + addpd %xmm11, %xmm2 + movddup 7 * SIZE(BO), %xmm11 + mulpd 14 * SIZE(AO), %xmm11 + addpd %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $ 8 * SIZE, BO + decq %rax + jne .L102 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L109 + ALIGN_4 + +.L106: + mulpd %xmm9, %xmm8 + movddup 1 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm0 + movapd 2 * SIZE(AO), %xmm8 + + addq $2 * SIZE, AO # aoffset += 4 + addq $1 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L106 + ALIGN_4 + +.L109: + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BO), %xmm2 + subpd %xmm0, %xmm2 +#else + movapd 0 * SIZE(AO), %xmm2 + subpd %xmm0, %xmm2 +#endif + +#ifdef LN + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movsd 3 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 2 * SIZE(AO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm0, %xmm2 +#endif + +#ifdef LT + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 + + movsd 1 * SIZE(AO), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + + movsd 3 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm0 + + unpcklpd %xmm0, %xmm2 +#endif + +#ifdef RN + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef RT + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 
1 * SIZE(CO1) +#else + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 1 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(BO) +#else + movapd %xmm2, 0 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L110: + testq $1, M + je .L119 + ALIGN_4 + +.L111: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movapd 0 * SIZE(AO), %xmm9 + pxor %xmm0, %xmm0 + movapd 0 * SIZE(BO), %xmm8 + pxor %xmm1, %xmm1 + movapd 4 * SIZE(AO), %xmm11 + pxor %xmm2, %xmm2 + movapd 4 * SIZE(BO), %xmm10 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L115 + ALIGN_4 + +.L112: + mulpd %xmm9, %xmm8 + movapd 2 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm0 + mulpd 2 * SIZE(BO), %xmm9 + movapd 8 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm1 + movapd 8 * SIZE(AO), %xmm9 + mulpd %xmm11, %xmm10 + movapd 6 * SIZE(AO), %xmm11 + addpd %xmm10, %xmm0 + mulpd 6 * SIZE(BO), %xmm11 + movapd 12 * SIZE(BO), %xmm10 + addpd %xmm11, %xmm1 + movapd 12 * SIZE(AO), %xmm11 + + addq $8 * SIZE, AO + addq $8 * SIZE, BO + decq %rax + jne .L112 + ALIGN_4 + +.L115: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulsd 0 * SIZE(BO), %xmm9 + addsd %xmm9, %xmm0 + movsd 1 * SIZE(AO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $1 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L116 + ALIGN_4 + +.L118: + addpd %xmm1, %xmm0 + haddpd %xmm0, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BO), %xmm2 + subsd %xmm0, %xmm2 +#else + movsd 0 * SIZE(AO), %xmm2 + subsd %xmm0, %xmm2 +#endif + +#ifdef LN + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 +#endif + +#ifdef LT + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 +#endif + +#ifdef RN + movsd 0 * SIZE(BO), %xmm0 + mulsd %xmm0, %xmm2 +#endif + +#ifdef RT + movsd 0 * SIZE(BO), %xmm0 + mulsd %xmm0, %xmm2 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) +#else + movsd %xmm2, 0 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(BO) +#else + movsd %xmm2, 0 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L119: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 1), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_2 + +.L999: + movq 0(%rsp), %rbx 
+ movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/trsm_kernel_LT_4x8_nehalem.S b/kernel/x86_64/trsm_kernel_LT_4x8_nehalem.S new file mode 100644 index 0000000000..917f8f9a5c --- /dev/null +++ b/kernel/x86_64/trsm_kernel_LT_4x8_nehalem.S @@ -0,0 +1,4847 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define KK %rdx +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define J 56(%rsp) +#define KKK 64(%rsp) +#define AORIG 72(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define J 232(%rsp) +#define KKK 240(%rsp) +#define AORIG 248(%rsp) + +#endif + +#define PREFETCHSIZE (16 * 1 + 4) +#define PREFETCH prefetcht0 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C +#endif + + subq $-32 * SIZE, A + subq $-32 * SIZE, B + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + leaq (, LDC, SIZE), LDC + + movq KK, OFFSET + negq KK + +#ifdef LN + leaq (, M, SIZE), %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + leaq (, N, SIZE), %rax + imulq K, %rax + addq %rax, B + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + movq N, J + sarq $3, J + NOBRANCH + jle .L40 + ALIGN_4 + +.L10: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $3 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 8), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 4), CO2 +#ifndef RT + leaq (C, LDC, 8), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq K, %rax + salq $BASE_SHIFT + 3, %rax + leaq (B, %rax), BB + + movq M, I + sarq $2, I + NOBRANCH + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 8), BO +#else + movq B, BO +#endif + + prefetchnta -32 * SIZE(BB) + subq $-16 * SIZE, BB + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + leaq (LDC, LDC, 2), %rax + + xorps %xmm8, %xmm8 + prefetcht2 4 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht2 4 * SIZE(CO1, LDC, 1) + xorps %xmm10, %xmm10 + prefetcht2 4 * SIZE(CO1, LDC, 2) + xorps %xmm11, %xmm11 + prefetcht2 4 * SIZE(CO1, %rax, 1) + + xorps %xmm12, %xmm12 + prefetcht2 4 * SIZE(CO2) + xorps %xmm13, %xmm13 + prefetcht2 4 * SIZE(CO2, LDC, 1) + xorps 
%xmm14, %xmm14 + prefetcht2 4 * SIZE(CO2, LDC, 2) + xorps %xmm15, %xmm15 + prefetcht2 4 * SIZE(CO2, %rax, 1) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm1, %xmm12 + movaps -32 * SIZE(BO), %xmm1 + addps %xmm2, %xmm13 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + pshufd $0x39, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm12 + movaps -24 * SIZE(BO), %xmm1 + addps %xmm2, %xmm13 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + pshufd $0x39, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + addps %xmm1, %xmm8 + movaps -20 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -24 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm12 + movaps -16 * SIZE(BO), %xmm1 + addps %xmm2, %xmm13 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + pshufd $0x39, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + addps %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -20 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm12 + movaps -8 * SIZE(BO), %xmm1 + addps %xmm2, %xmm13 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + pshufd $0x39, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + addps %xmm1, %xmm8 + movaps -4 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + subq $-32 * SIZE, BO + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, AO + subq $1, %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + addps %xmm1, %xmm12 + movaps -32 * SIZE(BO), %xmm1 + addps %xmm2, %xmm13 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + pshufd $0x39, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + 
pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_3 + +.L18: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $8, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 8), BO +#endif + + addps %xmm1, %xmm12 + addps %xmm2, %xmm13 + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm4 + shufps $0x88, %xmm9, %xmm8 + movaps %xmm10, %xmm5 + shufps $0x88, %xmm11, %xmm10 + shufps $0xdd, %xmm11, %xmm4 + shufps $0xdd, %xmm9, %xmm5 + + movaps %xmm8, %xmm6 + shufps $0x88, %xmm10, %xmm8 + shufps $0xdd, %xmm6, %xmm10 + + movaps %xmm4, %xmm9 + movaps %xmm5, %xmm11 + shufps $0x22, %xmm5, %xmm9 + shufps $0x77, %xmm4, %xmm11 + + movaps %xmm12, %xmm4 + shufps $0x88, %xmm13, %xmm12 + movaps %xmm14, %xmm5 + shufps $0x88, %xmm15, %xmm14 + shufps $0xdd, %xmm15, %xmm4 + shufps $0xdd, %xmm13, %xmm5 + + movaps %xmm12, %xmm6 + shufps $0x88, %xmm14, %xmm12 + shufps $0xdd, %xmm6, %xmm14 + + movaps %xmm4, %xmm13 + movaps %xmm5, %xmm15 + shufps $0x22, %xmm5, %xmm13 + shufps $0x77, %xmm4, %xmm15 + + movaps -32 * SIZE(BO), %xmm0 + movaps -28 * SIZE(BO), %xmm4 + movaps -24 * SIZE(BO), %xmm1 + movaps -20 * SIZE(BO), %xmm5 + movaps -16 * SIZE(BO), %xmm2 + movaps -12 * SIZE(BO), %xmm6 + movaps -8 * SIZE(BO), %xmm3 + movaps -4 * SIZE(BO), %xmm7 + +#else + movaps %xmm9, %xmm4 + shufps $0xd8, %xmm8, %xmm9 + shufps $0xd8, %xmm11, %xmm8 + shufps $0xd8, %xmm10, %xmm11 + shufps $0xd8, %xmm4, %xmm10 + + movaps %xmm8, %xmm4 + shufps $0xd8, %xmm10, %xmm8 + shufps $0xd8, %xmm4, %xmm10 + movaps %xmm9, %xmm5 + shufps $0xd8, %xmm11, %xmm9 + shufps $0xd8, %xmm5, %xmm11 + + movaps %xmm13, %xmm4 + shufps $0xd8, %xmm12, %xmm13 + shufps $0xd8, %xmm15, %xmm12 + shufps $0xd8, %xmm14, %xmm15 + shufps $0xd8, %xmm4, %xmm14 + + movaps %xmm12, %xmm4 + shufps $0xd8, %xmm14, %xmm12 + shufps $0xd8, %xmm4, %xmm14 + movaps %xmm13, %xmm5 + shufps $0xd8, %xmm15, %xmm13 + shufps $0xd8, %xmm5, %xmm15 + + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm1 + movaps -24 * SIZE(AO), %xmm2 + movaps -20 * SIZE(AO), %xmm3 + movaps -16 * SIZE(AO), %xmm4 + movaps -12 * SIZE(AO), %xmm5 + movaps -8 * SIZE(AO), %xmm6 + movaps -4 * SIZE(AO), %xmm7 +#endif + + subps %xmm8, %xmm0 + subps %xmm9, %xmm1 + subps %xmm10, %xmm2 + subps %xmm11, %xmm3 + subps %xmm12, %xmm4 + subps %xmm13, %xmm5 + subps %xmm14, %xmm6 + subps %xmm15, %xmm7 + +#ifdef LN + movaps -20 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + mulps %xmm15, %xmm7 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm6 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm0 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm4 + + movaps -24 * SIZE(AO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + mulps %xmm15, %xmm6 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm0 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm4 + + movaps -28 
* SIZE(AO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + mulps %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm4 + + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm4 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm7 + + movaps -28 * SIZE(AO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + mulps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm7 + + movaps -24 * SIZE(AO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + mulps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm7 + + movaps -20 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + mulps %xmm15, %xmm7 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm3 + + movaps -28 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm7 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm3 + + movaps -20 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm7 + + movaps -16 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm3 + + movaps -12 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm7 + + movaps -8 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + + movaps -4 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, 
%xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm7 + + movaps 4 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm7 + + movaps 12 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm7 + + movaps 20 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm7 + + movaps 28 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm7 +#endif + +#ifdef RT + movaps 28 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm7 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm6 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm4 + + movaps 24 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm0 + + movaps 20 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm6 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm4 + + movaps 16 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm0 + + movaps 12 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm4 + + movaps 8 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm0 + + movaps 4 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm4 + + movaps 0 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm0 + + movaps -8 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm0 + + movaps -16 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, 
%xmm15 + mulps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm0 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, -32 * SIZE(BO) + movaps %xmm4, -28 * SIZE(BO) + movaps %xmm1, -24 * SIZE(BO) + movaps %xmm5, -20 * SIZE(BO) + movaps %xmm2, -16 * SIZE(BO) + movaps %xmm6, -12 * SIZE(BO) + movaps %xmm3, -8 * SIZE(BO) + movaps %xmm7, -4 * SIZE(BO) + + movaps %xmm0, %xmm8 + shufps $0x88, %xmm1, %xmm0 + shufps $0xdd, %xmm8, %xmm1 + + movaps %xmm2, %xmm9 + shufps $0x88, %xmm3, %xmm2 + shufps $0xdd, %xmm9, %xmm3 + + movaps %xmm0, %xmm8 + shufps $0x88, %xmm2, %xmm0 + movaps %xmm1, %xmm9 + shufps $0x22, %xmm3, %xmm1 + shufps $0xdd, %xmm2, %xmm8 + movaps %xmm8, %xmm2 + shufps $0x77, %xmm3, %xmm9 + movaps %xmm9, %xmm3 + + movaps %xmm4, %xmm8 + shufps $0x88, %xmm5, %xmm4 + shufps $0xdd, %xmm8, %xmm5 + + movaps %xmm6, %xmm9 + shufps $0x88, %xmm7, %xmm6 + shufps $0xdd, %xmm9, %xmm7 + + movaps %xmm4, %xmm8 + shufps $0x88, %xmm6, %xmm4 + movaps %xmm5, %xmm9 + shufps $0x22, %xmm7, %xmm5 + shufps $0xdd, %xmm6, %xmm8 + movaps %xmm8, %xmm6 + shufps $0x77, %xmm7, %xmm9 + movaps %xmm9, %xmm7 + +#else + movaps %xmm0, -32 * SIZE(AO) + movaps %xmm1, -28 * SIZE(AO) + movaps %xmm2, -24 * SIZE(AO) + movaps %xmm3, -20 * SIZE(AO) + movaps %xmm4, -16 * SIZE(AO) + movaps %xmm5, -12 * SIZE(AO) + movaps %xmm6, -8 * SIZE(AO) + movaps %xmm7, -4 * SIZE(AO) +#endif + + leaq (LDC, LDC, 2), %rax + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 2 * SIZE(CO1, LDC, 1) + + movsd %xmm2, 0 * SIZE(CO1, LDC, 2) + movhps %xmm2, 2 * SIZE(CO1, LDC, 2) + movsd %xmm3, 0 * SIZE(CO1, %rax, 1) + movhps %xmm3, 2 * SIZE(CO1, %rax, 1) + + movsd %xmm4, 0 * SIZE(CO2) + movhps %xmm4, 2 * SIZE(CO2) + movsd %xmm5, 0 * SIZE(CO2, LDC, 1) + movhps %xmm5, 2 * SIZE(CO2, LDC, 1) + + movsd %xmm6, 0 * SIZE(CO2, LDC, 2) + movhps %xmm6, 2 * SIZE(CO2, LDC, 2) + movsd %xmm7, 0 * SIZE(CO2, %rax, 1) + movhps %xmm7, 2 * SIZE(CO2, %rax, 1) + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L11 + ALIGN_4 + +.L20: + testq $2, M + BRANCH + jle .L30 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 8), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movaps -32 * SIZE(BO), %xmm5 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_3 + +.L22: + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps 
%xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + + addps %xmm3, %xmm10 + pshufd $0x50, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm5, %xmm4 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(BO), %xmm5 + + movddup -30 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(BO), %xmm5 + + addps %xmm3, %xmm10 + pshufd $0x50, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm5, %xmm4 + mulps %xmm0, %xmm4 + movaps -16 * SIZE(BO), %xmm5 + + movddup -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + mulps %xmm0, %xmm2 + movaps -12 * SIZE(BO), %xmm5 + + addps %xmm3, %xmm10 + pshufd $0x50, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm5, %xmm4 + mulps %xmm0, %xmm4 + movaps -8 * SIZE(BO), %xmm5 + + movddup -26 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + mulps %xmm0, %xmm2 + movaps -4 * SIZE(BO), %xmm5 + + addps %xmm3, %xmm10 + pshufd $0x50, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm5, %xmm4 + mulps %xmm0, %xmm4 + movaps 0 * SIZE(BO), %xmm5 + + movddup -24 * SIZE(AO), %xmm0 + + subq $-32 * SIZE, BO + subq $ -8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L22 + ALIGN_3 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_3 + +.L26: + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + + addps %xmm3, %xmm10 + pshufd $0x50, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm5, %xmm4 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(BO), %xmm5 + + movddup -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_3 + +.L28: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $8, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 8), BO +#endif + + addps %xmm1, %xmm8 + addps %xmm2, %xmm9 + addps %xmm3, %xmm10 + addps %xmm4, %xmm11 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm4 + shufps $0x88, %xmm9, %xmm8 + shufps $0xdd, %xmm9, %xmm4 + + movaps %xmm10, %xmm5 + shufps $0x88, %xmm11, %xmm10 + shufps $0xdd, %xmm11, %xmm5 + + movaps -32 * SIZE(BO), %xmm0 + movaps -28 * SIZE(BO), %xmm2 + movaps -24 * SIZE(BO), %xmm1 + movaps -20 * SIZE(BO), %xmm3 + + subps %xmm8, %xmm0 + subps %xmm4, %xmm1 + subps %xmm10, %xmm2 + subps %xmm5, %xmm3 +#else + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm2 + movaps -24 * SIZE(AO), %xmm4 + movaps -20 * SIZE(AO), %xmm6 + + subps %xmm8, %xmm0 + subps %xmm9, %xmm2 + subps %xmm10, %xmm4 + subps %xmm11, %xmm6 + + movhlps %xmm0, %xmm1 + movhlps %xmm2, %xmm3 + movhlps %xmm4, %xmm5 + movhlps %xmm6, %xmm7 +#endif + +#ifdef LN + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + mulps %xmm15, %xmm3 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm2 + + pshufd $0x00, 
%xmm8, %xmm15 + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm2 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm2 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm3 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + mulps %xmm15, %xmm3 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm3 + + movaps -28 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm7 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm3 + + movaps -20 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm7 + + movaps -16 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm3 + + movaps -12 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm7 + + movaps -8 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + + movaps -4 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm7 + + movaps 4 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm7 + + movaps 12 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm7 + + movaps 20 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm7 + + movaps 28 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm7 +#endif + +#ifdef RT + movaps 28 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm7 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm6 + pshufd $0x55, %xmm8, %xmm15 + mulps 
%xmm7, %xmm15 + subps %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm4 + + movaps 24 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm0 + + movaps 20 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm6 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm4 + + movaps 16 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm0 + + movaps 12 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm4 + + movaps 8 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm0 + + movaps 4 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm4 + + movaps 0 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm0 + + movaps -8 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm0 + + movaps -16 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm0 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + + leaq (LDC, LDC, 2), %rax + +#if defined(LN) || defined(LT) + movaps %xmm0, -32 * SIZE(BO) + movaps %xmm2, -28 * SIZE(BO) + movaps %xmm1, -24 * SIZE(BO) + movaps %xmm3, -20 * SIZE(BO) + + movaps %xmm0, %xmm4 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm4 + + movaps %xmm2, %xmm5 + unpcklps %xmm3, %xmm2 + unpckhps %xmm3, %xmm5 + + movsd %xmm0, (CO1) + movhps %xmm0, (CO1, LDC, 1) + movsd %xmm4, (CO1, LDC, 2) + movhps %xmm4, (CO1, %rax, 1) + + movsd %xmm2, (CO2) + movhps %xmm2, (CO2, LDC, 1) + movsd %xmm5, (CO2, LDC, 2) + movhps %xmm5, (CO2, %rax, 1) +#else + movlhps %xmm1, %xmm0 + movlhps %xmm3, %xmm2 + movlhps %xmm5, %xmm4 + movlhps %xmm7, %xmm6 + + movaps %xmm0, -32 * SIZE(AO) + movaps %xmm2, -28 * SIZE(AO) + movaps %xmm4, -24 * SIZE(AO) + movaps %xmm6, -20 * SIZE(AO) + 
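+/* Descriptive note (not in the imported source): the solved 2x8 block was written back to the packed buffer above; the movsd stores below scatter it to the eight C columns addressed through CO1/CO2 and the LDC multiples. */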
+ movsd %xmm0, (CO1) + movsd %xmm1, (CO1, LDC, 1) + movsd %xmm2, (CO1, LDC, 2) + movsd %xmm3, (CO1, %rax, 1) + + movsd %xmm4, (CO2) + movsd %xmm5, (CO2, LDC, 1) + movsd %xmm6, (CO2, LDC, 2) + movsd %xmm7, (CO2, %rax, 1) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + testq $1, M + BRANCH + jle .L39 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 8), BO +#else + movq B, BO +#endif + + xorps %xmm2, %xmm2 + movsd -32 * SIZE(AO), %xmm0 + xorps %xmm3, %xmm3 + xorps %xmm8, %xmm8 + xorps %xmm12, %xmm12 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L35 + ALIGN_3 + +.L32: + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movaps -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm3, %xmm12 + movaps -28 * SIZE(BO), %xmm3 + mulps %xmm1, %xmm3 + + pshufd $0x55, %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm2, %xmm8 + movaps -24 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm3, %xmm12 + movaps -20 * SIZE(BO), %xmm3 + mulps %xmm1, %xmm3 + + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movaps -16 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm3, %xmm12 + movaps -12 * SIZE(BO), %xmm3 + mulps %xmm1, %xmm3 + + pshufd $0x55, %xmm0, %xmm1 + movsd -28 * SIZE(AO), %xmm0 + addps %xmm2, %xmm8 + movaps -8 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm3, %xmm12 + movaps -4 * SIZE(BO), %xmm3 + mulps %xmm1, %xmm3 + + subq $-32 * SIZE, BO + subq $ -4 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L32 + ALIGN_3 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_3 + +.L36: + pshufd $0x00, %xmm0, %xmm1 + movss -31 * SIZE(AO), %xmm0 + addps %xmm2, %xmm8 + movaps -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm3, %xmm12 + movaps -28 * SIZE(BO), %xmm3 + mulps %xmm1, %xmm3 + + addq $1 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L36 + ALIGN_3 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $8, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 8), BO +#endif + + addps %xmm2, %xmm8 + addps %xmm3, %xmm12 + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(BO), %xmm0 + movaps -28 * SIZE(BO), %xmm4 + + subps %xmm8, %xmm0 + subps %xmm12, %xmm4 +#else + movsd -32 * SIZE(AO), %xmm0 + movhps -30 * SIZE(AO), %xmm0 + movsd -28 * SIZE(AO), %xmm4 + movhps -26 * SIZE(AO), %xmm4 + + subps %xmm8, %xmm0 + subps %xmm12, %xmm4 + + pshufd $0xff, %xmm0, %xmm3 + pshufd $0xaa, %xmm0, %xmm2 + pshufd $0x55, %xmm0, %xmm1 + pshufd $0x00, %xmm0, %xmm0 + + pshufd $0xff, %xmm4, %xmm7 + pshufd $0xaa, %xmm4, %xmm6 + pshufd $0x55, %xmm4, %xmm5 + pshufd $0x00, %xmm4, %xmm4 +#endif + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm4 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 
+ mulss %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm3 + + movaps -28 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm7 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm3 + + movaps -20 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm7 + + movaps -16 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm3 + + movaps -12 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm7 + + movaps -8 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm3 + + movaps -4 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm7 + + movaps 4 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm4, %xmm15 + subss %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm4, %xmm15 + subss %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm4, %xmm15 + subss %xmm15, %xmm7 + + movaps 12 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm5, %xmm15 + subss %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm5, %xmm15 + subss %xmm15, %xmm7 + + movaps 20 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm6, %xmm15 + subss %xmm15, %xmm7 + + movaps 28 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm7 +#endif + +#ifdef RT + movaps 28 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm7 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm7, %xmm15 + subss %xmm15, %xmm6 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm7, %xmm15 + subss %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm7, %xmm15 + subss %xmm15, %xmm4 + + movaps 24 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm7, %xmm15 + subss %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm7, %xmm15 + subss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm7, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm7, %xmm15 + subss %xmm15, %xmm0 + + movaps 20 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, 
%xmm15 + mulss %xmm15, %xmm6 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm6, %xmm15 + subss %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm6, %xmm15 + subss %xmm15, %xmm4 + + movaps 16 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm6, %xmm15 + subss %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm6, %xmm15 + subss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm6, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm6, %xmm15 + subss %xmm15, %xmm0 + + movaps 12 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm5, %xmm15 + subss %xmm15, %xmm4 + + movaps 8 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm5, %xmm15 + subss %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm5, %xmm15 + subss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm5, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm5, %xmm15 + subss %xmm15, %xmm0 + + movaps 4 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm4 + + movaps 0 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm4, %xmm15 + subss %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm4, %xmm15 + subss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm4, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm4, %xmm15 + subss %xmm15, %xmm0 + + movaps -8 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm0 + + movaps -16 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm0 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm0 + + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + leaq (LDC, LDC, 2), %rax + +#if defined(LN) || defined(LT) + movaps %xmm0, -32 * SIZE(BO) + movaps %xmm4, -28 * SIZE(BO) + + pshufd $0xff, %xmm0, %xmm3 + pshufd $0xaa, %xmm0, %xmm2 + pshufd $0x55, %xmm0, %xmm1 + pshufd $0x00, %xmm0, %xmm0 + + pshufd $0xff, %xmm4, %xmm7 + pshufd $0xaa, %xmm4, %xmm6 + pshufd $0x55, %xmm4, %xmm5 + pshufd $0x00, %xmm4, %xmm4 +#else + unpcklps %xmm1, %xmm0 + unpcklps %xmm3, %xmm2 + unpcklps %xmm5, %xmm4 + unpcklps %xmm7, %xmm6 + + movlps %xmm0, -32 * SIZE(AO) + movlps %xmm2, -30 * SIZE(AO) + movlps %xmm4, -28 * SIZE(AO) + movlps %xmm6, -26 * SIZE(AO) +#endif + + movss %xmm0, (CO1) + movss %xmm1, (CO1, LDC, 1) + movss %xmm2, (CO1, LDC, 2) + movss %xmm3, (CO1, %rax, 1) + + movss %xmm4, (CO2) + movss %xmm5, (CO2, LDC, 1) + movss %xmm6, (CO2, LDC, 2) + movss %xmm7, (CO2, %rax, 1) + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L39: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 8), B +#endif +#if 
defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $8, KK +#endif + +#ifdef RT + subq $8, KK +#endif + + subq $1, J + BRANCH + jg .L10 + ALIGN_4 + +.L40: + testq $4, N + jle .L70 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 2), CO2 +#ifndef RT + leaq (C, LDC, 4), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I + NOBRANCH + jle .L50 + ALIGN_4 + +.L41: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht2 4 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht2 4 * SIZE(CO1, LDC, 1) + xorps %xmm10, %xmm10 + prefetcht2 4 * SIZE(CO2) + xorps %xmm11, %xmm11 + prefetcht2 4 * SIZE(CO2, LDC, 1) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L45 + ALIGN_3 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm1, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm10 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm4, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm10 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm4, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movaps -24 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm10 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm4, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -20 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movaps -20 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm10 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm4, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + BRANCH + jg .L42 + ALIGN_3 + +.L45: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + addps %xmm1, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm10 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm4, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L46 + ALIGN_3 + +.L48: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, 
SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#endif + + addps %xmm1, %xmm8 + addps %xmm2, %xmm9 + addps %xmm3, %xmm10 + addps %xmm4, %xmm11 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm4 + shufps $0x88, %xmm9, %xmm8 + movaps %xmm10, %xmm5 + shufps $0x88, %xmm11, %xmm10 + shufps $0xdd, %xmm11, %xmm4 + shufps $0xdd, %xmm9, %xmm5 + + movaps %xmm8, %xmm6 + shufps $0x88, %xmm10, %xmm8 + shufps $0xdd, %xmm6, %xmm10 + + movaps %xmm4, %xmm9 + movaps %xmm5, %xmm11 + shufps $0x22, %xmm5, %xmm9 + shufps $0x77, %xmm4, %xmm11 + + movaps -32 * SIZE(BO), %xmm0 + movaps -28 * SIZE(BO), %xmm1 + movaps -24 * SIZE(BO), %xmm2 + movaps -20 * SIZE(BO), %xmm3 +#else + movaps %xmm9, %xmm4 + shufps $0xd8, %xmm8, %xmm9 + shufps $0xd8, %xmm11, %xmm8 + shufps $0xd8, %xmm10, %xmm11 + shufps $0xd8, %xmm4, %xmm10 + + movaps %xmm8, %xmm4 + shufps $0xd8, %xmm10, %xmm8 + shufps $0xd8, %xmm4, %xmm10 + movaps %xmm9, %xmm5 + shufps $0xd8, %xmm11, %xmm9 + shufps $0xd8, %xmm5, %xmm11 + + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm1 + movaps -24 * SIZE(AO), %xmm2 + movaps -20 * SIZE(AO), %xmm3 +#endif + + subps %xmm8, %xmm0 + subps %xmm9, %xmm1 + subps %xmm10, %xmm2 + subps %xmm11, %xmm3 + +#ifdef LN + movaps -20 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm0 + + movaps -24 * SIZE(AO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm0 + + movaps -28 * SIZE(AO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm3 + + movaps -28 * SIZE(AO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm3 + + movaps -24 * SIZE(AO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm3 + + movaps -20 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm3 + + movaps -28 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm3 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps 
%xmm15, %xmm3 + + movaps -20 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 +#endif + +#ifdef RT + movaps -20 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm0 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm0 + + movaps -28 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, -32 * SIZE(BO) + movaps %xmm1, -28 * SIZE(BO) + movaps %xmm2, -24 * SIZE(BO) + movaps %xmm3, -20 * SIZE(BO) + + movaps %xmm0, %xmm8 + shufps $0x88, %xmm1, %xmm0 + shufps $0xdd, %xmm8, %xmm1 + + movaps %xmm2, %xmm9 + shufps $0x88, %xmm3, %xmm2 + shufps $0xdd, %xmm9, %xmm3 + + movaps %xmm0, %xmm8 + shufps $0x88, %xmm2, %xmm0 + movaps %xmm1, %xmm9 + shufps $0x22, %xmm3, %xmm1 + shufps $0xdd, %xmm2, %xmm8 + movaps %xmm8, %xmm2 + shufps $0x77, %xmm3, %xmm9 + movaps %xmm9, %xmm3 +#else + movaps %xmm0, -32 * SIZE(AO) + movaps %xmm1, -28 * SIZE(AO) + movaps %xmm2, -24 * SIZE(AO) + movaps %xmm3, -20 * SIZE(AO) +#endif + + leaq (LDC, LDC, 2), %rax + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 2 * SIZE(CO1, LDC, 1) + + movsd %xmm2, 0 * SIZE(CO2) + movhps %xmm2, 2 * SIZE(CO2) + movsd %xmm3, 0 * SIZE(CO2, LDC, 1) + movhps %xmm3, 2 * SIZE(CO2, LDC, 1) + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L41 + ALIGN_4 + +.L50: + testq $2, M + BRANCH + jle .L60 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movaps -32 * SIZE(BO), %xmm5 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L55 + ALIGN_3 + +.L52: + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -30 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + movaps -24 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + movaps -20 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + 
movddup -26 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + movaps -16 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -24 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, BO + subq $ -8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L52 + ALIGN_3 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_3 + +.L56: + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L56 + ALIGN_3 + +.L58: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + + addps %xmm1, %xmm8 + addps %xmm2, %xmm9 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm4 + shufps $0x88, %xmm9, %xmm8 + shufps $0xdd, %xmm9, %xmm4 + + movaps -32 * SIZE(BO), %xmm0 + movaps -28 * SIZE(BO), %xmm1 + + subps %xmm8, %xmm0 + subps %xmm4, %xmm1 +#else + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm2 + + subps %xmm8, %xmm0 + subps %xmm9, %xmm2 + + movhlps %xmm0, %xmm1 + movhlps %xmm2, %xmm3 +#endif + +#ifdef LN + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm1 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm3 + + movaps -28 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm3 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm3 + + movaps -20 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 +#endif + +#ifdef RT + movaps -20 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm0 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm0 + + movaps -28 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LN + 
subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + + leaq (LDC, LDC, 2), %rax + +#if defined(LN) || defined(LT) + movaps %xmm0, -32 * SIZE(BO) + movaps %xmm1, -28 * SIZE(BO) + + movaps %xmm0, %xmm4 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm4 + + movsd %xmm0, (CO1) + movhps %xmm0, (CO1, LDC, 1) + movsd %xmm4, (CO2) + movhps %xmm4, (CO2, LDC, 1) +#else + movlhps %xmm1, %xmm0 + movlhps %xmm3, %xmm2 + + movaps %xmm0, -32 * SIZE(AO) + movaps %xmm2, -28 * SIZE(AO) + + movsd %xmm0, (CO1) + movsd %xmm1, (CO1, LDC, 1) + movsd %xmm2, (CO2) + movsd %xmm3, (CO2, LDC, 1) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L60: + testq $1, M + BRANCH + jle .L69 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + xorps %xmm2, %xmm2 + movsd -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L65 + ALIGN_3 + +.L62: + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movaps -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x55, %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm2, %xmm9 + movaps -28 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movaps -24 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x55, %xmm0, %xmm1 + movsd -28 * SIZE(AO), %xmm0 + addps %xmm2, %xmm9 + movaps -20 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + subq $-16 * SIZE, BO + subq $ -4 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L62 + addps %xmm9, %xmm8 + ALIGN_3 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_3 + +.L66: + pshufd $0x00, %xmm0, %xmm1 + movss -31 * SIZE(AO), %xmm0 + addps %xmm2, %xmm8 + movaps -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L66 + ALIGN_3 + +.L68: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#endif + + addps %xmm2, %xmm8 + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(BO), %xmm0 + + subps %xmm8, %xmm0 +#else + movsd -32 * SIZE(AO), %xmm0 + movhps -30 * SIZE(AO), %xmm0 + + subps %xmm8, %xmm0 + + pshufd $0xff, %xmm0, %xmm3 + pshufd $0xaa, %xmm0, %xmm2 + pshufd $0x55, %xmm0, %xmm1 + pshufd $0x00, %xmm0, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm3 + + movaps -28 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulss 
%xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm3 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm3 + + movaps -20 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm3 +#endif + +#ifdef RT + movaps -20 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm0 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm0 + + movaps -28 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm0 + + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, -32 * SIZE(BO) + + pshufd $0xff, %xmm0, %xmm3 + pshufd $0xaa, %xmm0, %xmm2 + pshufd $0x55, %xmm0, %xmm1 + pshufd $0x00, %xmm0, %xmm0 +#else + unpcklps %xmm1, %xmm0 + unpcklps %xmm3, %xmm2 + + movlps %xmm0, -32 * SIZE(AO) + movlps %xmm2, -30 * SIZE(AO) +#endif + + movss %xmm0, (CO1) + movss %xmm1, (CO1, LDC, 1) + movss %xmm2, (CO2) + movss %xmm3, (CO2, LDC, 1) + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L69: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + ALIGN_4 + +.L70: + testq $2, N + jle .L100 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I + NOBRANCH + jle .L80 + ALIGN_4 + +.L71: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd -32 * SIZE(BO), %xmm3 + + xorps %xmm8, %xmm8 + prefetcht2 4 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht2 4 * SIZE(CO2) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L75 + ALIGN_3 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + 
pshufd $0x55, %xmm3, %xmm2 + movsd -30 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0x55, %xmm3, %xmm2 + movsd -28 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm2 + movaps -24 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0x55, %xmm3, %xmm2 + movsd -26 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0x55, %xmm3, %xmm2 + movsd -24 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm2 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, AO + subq $ -8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L72 + ALIGN_3 + +.L75: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_3 + +.L76: + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0x55, %xmm3, %xmm2 + movsd -30 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L76 + ALIGN_3 + +.L78: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + + addps %xmm1, %xmm8 + addps %xmm2, %xmm9 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm4 + unpcklps %xmm9, %xmm8 + unpckhps %xmm9, %xmm4 + + movaps -32 * SIZE(BO), %xmm0 + movaps -28 * SIZE(BO), %xmm2 + + subps %xmm8, %xmm0 + subps %xmm4, %xmm2 + + movhlps %xmm0, %xmm1 + movhlps %xmm2, %xmm3 +#else + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm2 + + subps %xmm8, %xmm0 + subps %xmm9, %xmm2 +#endif + + +#ifdef LN + movaps -20 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm0 + + movaps -24 * SIZE(AO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm0 + + movaps -28 * SIZE(AO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm3 + + movaps -28 * SIZE(AO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm3 + + movaps -24 * SIZE(AO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm3 + + movaps -20 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 +#endif + 
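+/* Right-side solve for the 4x2 tile: in the RN/RT builds xmm0/xmm2     */
+/* hold its two columns.  RN eliminates column 0 from column 1 in       */
+/* forward order, RT runs the same elimination in reverse; the          */
+/* diagonal of B is applied with mulps, so the packed diagonal          */
+/* entries are expected to hold reciprocals.                            */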
+#ifdef RN + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm2 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm2 +#endif + +#ifdef RT + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm0 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm0, -32 * SIZE(BO) + movlps %xmm1, -30 * SIZE(BO) + movlps %xmm2, -28 * SIZE(BO) + movlps %xmm3, -26 * SIZE(BO) + + unpcklps %xmm1, %xmm0 + unpcklps %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(CO1) + movlps %xmm2, 2 * SIZE(CO1) + movhps %xmm0, 0 * SIZE(CO2) + movhps %xmm2, 2 * SIZE(CO2) + +#else + movaps %xmm0, -32 * SIZE(AO) + movaps %xmm2, -28 * SIZE(AO) + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO2) + movhps %xmm2, 2 * SIZE(CO2) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L71 + ALIGN_4 + +.L80: + testq $2, M + BRANCH + jle .L90 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd -32 * SIZE(BO), %xmm5 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L85 + ALIGN_3 + +.L82: + addps %xmm1, %xmm8 + movsd -32 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movddup -30 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movsd -30 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movddup -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movsd -28 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movddup -26 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movsd -26 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movddup -24 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, BO + subq $-8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L82 + ALIGN_3 + +.L85: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L88 + ALIGN_3 + +.L86: + addps %xmm1, %xmm8 + movsd -32 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movddup -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L86 + ALIGN_3 + +.L88: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + + addps %xmm1, %xmm8 + +#if defined(LN) || defined(LT) + pshufd $0xd8, %xmm8, %xmm8 + + movaps -32 * SIZE(BO), %xmm0 +#else + movaps -32 * SIZE(AO), %xmm0 +#endif + + subps %xmm8, %xmm0 + + movhlps %xmm0, %xmm1 + +#ifdef LN + movaps -32 * SIZE(AO), %xmm8 + + 
pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm1 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm1 +#endif + +#ifdef RT + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm0, -32 * SIZE(BO) + movlps %xmm1, -30 * SIZE(BO) + + unpcklps %xmm1, %xmm0 + + movlps %xmm0, (CO1) + movhps %xmm0, (CO2) +#else + movlps %xmm0, -32 * SIZE(AO) + movlps %xmm1, -30 * SIZE(AO) + + movsd %xmm0, (CO1) + movsd %xmm1, (CO2) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L90: + testq $1, M + BRANCH + jle .L99 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + xorps %xmm2, %xmm2 + movsd -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L95 + ALIGN_3 + +.L92: + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movsd -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x55, %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm2, %xmm9 + movsd -30 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movsd -28 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x55, %xmm0, %xmm1 + movsd -28 * SIZE(AO), %xmm0 + addps %xmm2, %xmm9 + movsd -26 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + subq $-4 * SIZE, AO + subq $-8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L92 + addps %xmm9, %xmm8 + ALIGN_3 + +.L95: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L98 + ALIGN_3 + +.L96: + pshufd $0x00, %xmm0, %xmm1 + movss -31 * SIZE(AO), %xmm0 + addps %xmm2, %xmm8 + movsd -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + addq $1 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L96 + ALIGN_3 + +.L98: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + + addps %xmm2, %xmm8 + +#if defined(LN) || defined(LT) + movsd -32 * SIZE(BO), %xmm0 + + subps %xmm8, %xmm0 +#else + movsd -32 * SIZE(AO), %xmm0 + + subps %xmm8, %xmm0 +#endif + + pshufd $0x55, 
%xmm0, %xmm1 + pshufd $0x00, %xmm0, %xmm0 + +#if defined(LN) || defined(LT) + movss -32 * SIZE(AO), %xmm8 + + mulss %xmm8, %xmm0 + mulss %xmm8, %xmm1 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm1 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm1 +#endif + +#ifdef RT + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm0 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movss %xmm0, -32 * SIZE(BO) + movss %xmm1, -31 * SIZE(BO) +#else + movss %xmm0, -32 * SIZE(AO) + movss %xmm1, -31 * SIZE(AO) +#endif + + movss %xmm0, (CO1) + movss %xmm1, (CO2) + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L99: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L100: + testq $1, N + jle .L999 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + + movq C, CO1 +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I + NOBRANCH + jle .L110 + ALIGN_4 + +.L101: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + movsd -32 * SIZE(BO), %xmm3 + xorps %xmm8, %xmm8 + prefetcht2 4 * SIZE(CO1) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L105 + ALIGN_3 + +.L102: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + movss -31 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm1 + movaps -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + movss -30 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm1 + movaps -24 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + movss -29 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm1 + movaps -20 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + movss -28 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, AO + subq $ -4 * SIZE, BO + subq $1, %rax + BRANCH + jg .L102 + ALIGN_3 + +.L105: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L108 + ALIGN_3 + +.L106: + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + movss -31 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm1 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L106 + ALIGN_3 + 
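+/* .L108: solve and write back a 4x1 tile.  LN/LT substitute through    */
+/* the 4x4 triangular block of A one scalar at a time (LN bottom-up,    */
+/* LT top-down); with a single right-hand-side column, RN/RT only       */
+/* scale by the diagonal element of B, applied as a multiply.  The      */
+/* result is stored back to the packed panel and to C.                  */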
+.L108: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#endif + + addps %xmm1, %xmm8 + +#if defined(LN) || defined(LT) + movsd -32 * SIZE(BO), %xmm0 + movhps -30 * SIZE(BO), %xmm0 + + subps %xmm8, %xmm0 + + pshufd $0xff, %xmm0, %xmm3 + pshufd $0xaa, %xmm0, %xmm2 + pshufd $0x55, %xmm0, %xmm1 +#else + movaps -32 * SIZE(AO), %xmm0 + + subps %xmm8, %xmm0 +#endif + +#ifdef LN + movaps -20 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm0 + + movaps -24 * SIZE(AO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm0 + + movaps -28 * SIZE(AO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm0 + + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm3 + + movaps -28 * SIZE(AO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm3 + + movaps -24 * SIZE(AO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm3 + + movaps -20 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm3 +#endif + +#if defined(RN) || defined(RT) + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm1, %xmm0 + unpcklps %xmm3, %xmm2 + + movlps %xmm0, -32 * SIZE(BO) + movlps %xmm2, -30 * SIZE(BO) + + movlps %xmm0, 0 * SIZE(CO1) + movlps %xmm2, 2 * SIZE(CO1) +#else + movaps %xmm0, -32 * SIZE(AO) + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L101 + ALIGN_4 + +.L110: + testq $2, M + BRANCH + jle .L120 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L115 + ALIGN_3 + +.L112: + 
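+/* 2x1 inner loop, unrolled four deep in k: each step duplicates one    */
+/* scalar of B into the low lanes (movss + unpcklps), multiplies it by  */
+/* the A pair loaded with movddup, and folds the previous product into  */
+/* xmm8 at the top of the next step.                                    */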
addps %xmm1, %xmm8 + movss -32 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movss -31 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movsd -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movss -30 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movsd -26 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movss -29 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movsd -24 * SIZE(AO), %xmm0 + + subq $-4 * SIZE, BO + subq $-8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L112 + ALIGN_3 + +.L115: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_3 + +.L116: + addps %xmm1, %xmm8 + movss -32 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L116 + ALIGN_3 + +.L118: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + + addps %xmm1, %xmm8 + +#if defined(LN) || defined(LT) + movsd -32 * SIZE(BO), %xmm0 + + subps %xmm8, %xmm0 + + pshufd $0x55, %xmm0, %xmm1 +#else + movsd -32 * SIZE(AO), %xmm0 + + subps %xmm8, %xmm0 +#endif + +#ifdef LN + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm0 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm1 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm1 +#endif + +#if defined(RN) || defined(RT) + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm1, %xmm0 + + movlps %xmm0, -32 * SIZE(BO) + + movlps %xmm0, 0 * SIZE(CO1) +#else + movlps %xmm0, -32 * SIZE(AO) + + movlps %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L120: + testq $1, M + BRANCH + jle .L129 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + xorps %xmm2, %xmm2 + movss -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L125 + ALIGN_3 + +.L122: + addss %xmm2, %xmm8 + movss -32 * SIZE(BO), %xmm2 + mulss %xmm0, %xmm2 + movss -31 * SIZE(AO), %xmm0 + + addss %xmm2, %xmm8 + movss -31 * SIZE(BO), %xmm2 + mulss %xmm0, %xmm2 + movss -30 * SIZE(AO), %xmm0 + + addss %xmm2, %xmm8 + movss -30 * SIZE(BO), %xmm2 + mulss %xmm0, %xmm2 + movss -29 * SIZE(AO), %xmm0 + + addss %xmm2, %xmm8 + movss -29 * SIZE(BO), %xmm2 + mulss %xmm0, %xmm2 + movss -28 * SIZE(AO), 
%xmm0 + + subq $-4 * SIZE, AO + subq $-4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L122 + ALIGN_3 + +.L125: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L128 + ALIGN_3 + +.L126: + addss %xmm2, %xmm8 + movss -32 * SIZE(BO), %xmm2 + mulss %xmm0, %xmm2 + movss -31 * SIZE(AO), %xmm0 + + addq $1 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L126 + ALIGN_3 + +.L128: +#if defined(LN) || defined(RT) + movq KK, %rax + subq $1, %rax + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + + addss %xmm2, %xmm8 + +#if defined(LN) || defined(LT) + movss -32 * SIZE(BO), %xmm0 + + subss %xmm8, %xmm0 +#else + movss -32 * SIZE(AO), %xmm0 + + subss %xmm8, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss -32 * SIZE(AO), %xmm8 +#endif + +#if defined(RN) || defined(RT) + movaps -32 * SIZE(BO), %xmm8 +#endif + + mulss %xmm8, %xmm0 + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm0, -32 * SIZE(BO) +#else + movss %xmm0, -32 * SIZE(AO) +#endif + + movss %xmm0, (CO1) + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L129: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 1), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/trsm_kernel_LT_8x4_sse.S b/kernel/x86_64/trsm_kernel_LT_8x4_sse.S new file mode 100644 index 0000000000..526a78c576 --- /dev/null +++ b/kernel/x86_64/trsm_kernel_LT_8x4_sse.S @@ -0,0 +1,5949 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %rdi +#define N %rsi +#define K %rdx +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define J %r12 +#define AO %r13 +#define BO %r14 +#define CO1 %r15 +#define CO2 %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define ALPHA 0(%rsp) +#define OFFSET 16(%rsp) +#define KK 24(%rsp) +#define KKK 32(%rsp) +#define AORIG 40(%rsp) +#define BORIG 48(%rsp) +#define BUFFER 128(%rsp) + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#if defined(OPTERON) || defined(BARCELONA) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define movsd movlps +#endif + +#ifdef GENERIC +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#ifndef PREFETCH +#define PREFETCH prefetcht0 +#endif + +#ifndef PREFETCHW +#define PREFETCHW prefetcht0 +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + EMMS + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, M + movq ARG2, N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + movsd OLD_OFFSET, %xmm4 + + movaps %xmm3, %xmm0 + +#else + movq OLD_LDC, LDC + movsd OLD_OFFSET, %xmm4 + +#endif + + movq %rsp, %rbx # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movsd %xmm4, OFFSET + movsd %xmm4, KK + + leaq (, LDC, SIZE), LDC + +#ifdef LN + leaq (, M, SIZE), %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + leaq (, N, SIZE), %rax + imulq K, %rax + addq %rax, B + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + movq N, J + sarq $2, J # j = (n >> 2) + jle .L50 + +.L01: +/* Copying to Sub Buffer */ + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, 
KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + salq $2 + BASE_SHIFT, %rax + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L03 + ALIGN_4 + +.L02: + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + movaps 8 * SIZE(B), %xmm11 + movaps 12 * SIZE(B), %xmm15 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + pshufd $0x00, %xmm11, %xmm8 + pshufd $0x55, %xmm11, %xmm9 + pshufd $0xaa, %xmm11, %xmm10 + pshufd $0xff, %xmm11, %xmm11 + + pshufd $0x00, %xmm15, %xmm12 + pshufd $0x55, %xmm15, %xmm13 + pshufd $0xaa, %xmm15, %xmm14 + pshufd $0xff, %xmm15, %xmm15 + + movaps %xmm8, 32 * SIZE(BO) + movaps %xmm9, 36 * SIZE(BO) + movaps %xmm10, 40 * SIZE(BO) + movaps %xmm11, 44 * SIZE(BO) + movaps %xmm12, 48 * SIZE(BO) + movaps %xmm13, 52 * SIZE(BO) + movaps %xmm14, 56 * SIZE(BO) + movaps %xmm15, 60 * SIZE(BO) + + addq $16 * SIZE, B + addq $64 * SIZE, BO + + decq %rax + jne .L02 + ALIGN_4 + +.L03: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L10 + ALIGN_4 + +.L04: + movaps 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + + addq $ 4 * SIZE, B + addq $16 * SIZE, BO + decq %rax + jne .L04 + ALIGN_4 + +.L10: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc +#ifndef RT + leaq (C, LDC, 4), C +#endif + + movq M, I + sarq $3, I # i = (m >> 3) + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $3 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 8), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(BO), %xmm9 + movaps 4 * SIZE(BO), %xmm11 + movaps 8 * SIZE(BO), %xmm13 + movaps 16 * SIZE(BO), %xmm15 + + movaps 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movaps 4 * SIZE(AO), %xmm10 + pxor %xmm1, %xmm1 + movaps 8 * SIZE(AO), %xmm12 + pxor %xmm2, %xmm2 + movaps 12 * SIZE(AO), %xmm14 + pxor %xmm3, %xmm3 + + PREFETCHW 7 * SIZE(CO1) + pxor %xmm4, %xmm4 + PREFETCHW 7 * SIZE(CO2) + pxor %xmm5, %xmm5 + PREFETCHW 7 * SIZE(CO1, LDC, 2) + pxor %xmm6, %xmm6 + PREFETCHW 7 * SIZE(CO2, LDC, 2) + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L15 + ALIGN_4 + +.L12: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 0 * SIZE(BO), %xmm9 + mulps 
%xmm8, %xmm11 + addps %xmm11, %xmm1 + movaps 4 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm13 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm13, %xmm2 + movaps 8 * SIZE(BO), %xmm13 + addps %xmm8, %xmm3 + movaps 16 * SIZE(AO), %xmm8 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm4 + movaps 32 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm5 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm13 + mulps 12 * SIZE(BO), %xmm10 + addps %xmm13, %xmm6 + movaps 24 * SIZE(BO), %xmm13 + addps %xmm10, %xmm7 + movaps 20 * SIZE(AO), %xmm10 + mulps %xmm12, %xmm15 + addps %xmm15, %xmm0 + movaps 16 * SIZE(BO), %xmm15 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm1 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm13 + mulps 28 * SIZE(BO), %xmm12 + addps %xmm13, %xmm2 + movaps 24 * SIZE(BO), %xmm13 + addps %xmm12, %xmm3 + movaps 24 * SIZE(AO), %xmm12 + mulps %xmm14, %xmm15 + addps %xmm15, %xmm4 + movaps 48 * SIZE(BO), %xmm15 + mulps %xmm14, %xmm11 + addps %xmm11, %xmm5 + movaps 36 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm13 + mulps 28 * SIZE(BO), %xmm14 + addps %xmm13, %xmm6 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm14, %xmm7 + movaps 28 * SIZE(AO), %xmm14 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 32 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movaps 36 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm13 + mulps 44 * SIZE(BO), %xmm8 + addps %xmm13, %xmm2 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm8, %xmm3 + movaps 32 * SIZE(AO), %xmm8 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm4 + movaps 64 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm5 + movaps 52 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm13 + mulps 44 * SIZE(BO), %xmm10 + addps %xmm13, %xmm6 + movaps 56 * SIZE(BO), %xmm13 + addps %xmm10, %xmm7 + movaps 36 * SIZE(AO), %xmm10 + mulps %xmm12, %xmm15 + addps %xmm15, %xmm0 + movaps 48 * SIZE(BO), %xmm15 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm1 + movaps 52 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm13 + mulps 60 * SIZE(BO), %xmm12 + addps %xmm13, %xmm2 + movaps 56 * SIZE(BO), %xmm13 + addps %xmm12, %xmm3 + movaps 40 * SIZE(AO), %xmm12 + mulps %xmm14, %xmm15 + addps %xmm15, %xmm4 + movaps 80 * SIZE(BO), %xmm15 + mulps %xmm14, %xmm11 + addps %xmm11, %xmm5 + movaps 68 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm13 + mulps 60 * SIZE(BO), %xmm14 + addps %xmm13, %xmm6 + movaps 72 * SIZE(BO), %xmm13 + addps %xmm14, %xmm7 + movaps 44 * SIZE(AO), %xmm14 + + addq $32 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jg .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_4 +.L16: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 0 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps 8 * SIZE(AO), %xmm8 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm4 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm5 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + mulps 12 * SIZE(BO), %xmm10 + addps %xmm9, %xmm6 + movaps 16 * SIZE(BO), %xmm9 + addps %xmm10, %xmm7 + movaps 12 * SIZE(AO), %xmm10 + + addq $8 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jg .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $8, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $2 + BASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), B + leaq 
(BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm8 + unpcklps %xmm2, %xmm0 + unpckhps %xmm2, %xmm8 + + movaps %xmm1, %xmm14 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm14 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movaps %xmm8, %xmm3 + unpcklps %xmm14, %xmm8 + unpckhps %xmm14, %xmm3 + + movaps %xmm4, %xmm9 + unpcklps %xmm6, %xmm4 + unpckhps %xmm6, %xmm9 + + movaps %xmm5, %xmm14 + unpcklps %xmm7, %xmm5 + unpckhps %xmm7, %xmm14 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps %xmm9, %xmm7 + unpcklps %xmm14, %xmm9 + unpckhps %xmm14, %xmm7 + + movaps 0 * SIZE(B), %xmm1 + movaps 4 * SIZE(B), %xmm5 + movaps 8 * SIZE(B), %xmm10 + movaps 12 * SIZE(B), %xmm11 + movaps 16 * SIZE(B), %xmm12 + movaps 20 * SIZE(B), %xmm13 + movaps 24 * SIZE(B), %xmm14 + movaps 28 * SIZE(B), %xmm15 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm5 + subps %xmm8, %xmm10 + subps %xmm3, %xmm11 + subps %xmm4, %xmm12 + subps %xmm6, %xmm13 + subps %xmm9, %xmm14 + subps %xmm7, %xmm15 +#else + movaps 0 * SIZE(AO), %xmm8 + movaps 4 * SIZE(AO), %xmm9 + movaps 8 * SIZE(AO), %xmm10 + movaps 12 * SIZE(AO), %xmm11 + + movaps 16 * SIZE(AO), %xmm12 + movaps 20 * SIZE(AO), %xmm13 + movaps 24 * SIZE(AO), %xmm14 + movaps 28 * SIZE(AO), %xmm15 + + subps %xmm0, %xmm8 + subps %xmm4, %xmm9 + subps %xmm1, %xmm10 + subps %xmm5, %xmm11 + subps %xmm2, %xmm12 + subps %xmm6, %xmm13 + subps %xmm3, %xmm14 + subps %xmm7, %xmm15 +#endif + +#ifdef LN + movaps 60 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm15 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm14 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm13 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm12 + + movaps 56 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm1 + + movaps 52 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm14 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm13 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm12 + + movaps 48 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm1 + + movaps 44 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm13 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm12 + + movaps 40 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm1 + + movaps 36 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm12 + + movaps 32 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, 
%xmm1 + + movaps 24 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm1 + + movaps 16 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm1 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm1 + + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm6 + + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm11 + + movaps 4 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm15 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm11 + + movaps 12 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm15 + + movaps 16 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm11 + + movaps 20 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm15 + + movaps 24 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm11 + + movaps 28 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm15 + + movaps 36 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm12 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm15 + + movaps 44 * SIZE(AO), %xmm7 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm15 + + movaps 52 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm14 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm15 + + movaps 60 * SIZE(AO), %xmm7 + 
pshufd $0xff, %xmm7, %xmm8 + mulps %xmm8, %xmm15 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 + mulps %xmm2, %xmm9 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm10 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm9, %xmm2 + subps %xmm2, %xmm11 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm12 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm9, %xmm2 + subps %xmm2, %xmm13 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm14 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm9, %xmm2 + subps %xmm2, %xmm15 + + movaps 4 * SIZE(B), %xmm0 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + mulps %xmm2, %xmm11 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm12 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm11, %xmm2 + subps %xmm2, %xmm13 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm14 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm11, %xmm2 + subps %xmm2, %xmm15 + + movaps 8 * SIZE(B), %xmm0 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm2, %xmm12 + mulps %xmm2, %xmm13 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm12, %xmm2 + subps %xmm2, %xmm14 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm13, %xmm2 + subps %xmm2, %xmm15 + + movaps 12 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm14 + mulps %xmm2, %xmm15 +#endif + +#ifdef RT + movaps 12 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm14 + mulps %xmm2, %xmm15 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm14, %xmm2 + subps %xmm2, %xmm12 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm15, %xmm2 + subps %xmm2, %xmm13 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm14, %xmm2 + subps %xmm2, %xmm10 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm15, %xmm2 + subps %xmm2, %xmm11 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm14, %xmm2 + subps %xmm2, %xmm8 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm15, %xmm2 + subps %xmm2, %xmm9 + + movaps 8 * SIZE(B), %xmm0 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm2, %xmm12 + mulps %xmm2, %xmm13 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm12, %xmm2 + subps %xmm2, %xmm10 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm13, %xmm2 + subps %xmm2, %xmm11 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm12, %xmm2 + subps %xmm2, %xmm8 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm13, %xmm2 + subps %xmm2, %xmm9 + + movaps 4 * SIZE(B), %xmm0 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + mulps %xmm2, %xmm11 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm8 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm11, %xmm2 + subps %xmm2, %xmm9 + + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 + mulps %xmm2, %xmm9 +#endif + +#ifdef LN + subq $8 * SIZE, CO1 + subq $8 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, 0 * SIZE(B) + movaps %xmm5, 4 * SIZE(B) + movaps %xmm10, 8 * SIZE(B) + movaps %xmm11, 12 * SIZE(B) + movaps %xmm12, 16 * SIZE(B) + movaps %xmm13, 20 * SIZE(B) + movaps %xmm14, 24 * SIZE(B) + movaps %xmm15, 28 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + pshufd $0x55, %xmm1, %xmm3 + pshufd $0xaa, %xmm1, %xmm4 + pshufd $0xff, %xmm1, %xmm6 + movaps %xmm2, 0 * SIZE(BO) + movaps %xmm3, 4 * SIZE(BO) + movaps %xmm4, 8 * SIZE(BO) + movaps %xmm6, 12 * SIZE(BO) + + pshufd $0x00, %xmm5, %xmm2 + pshufd $0x55, %xmm5, %xmm3 + pshufd $0xaa, %xmm5, %xmm4 + pshufd $0xff, %xmm5, %xmm6 + movaps %xmm2, 16 * SIZE(BO) + movaps %xmm3, 20 * SIZE(BO) + movaps %xmm4, 24 * SIZE(BO) + movaps %xmm6, 28 * SIZE(BO) + + pshufd $0x00, %xmm10, %xmm2 + pshufd $0x55, %xmm10, %xmm3 + pshufd $0xaa, %xmm10, 
%xmm4 + pshufd $0xff, %xmm10, %xmm6 + movaps %xmm2, 32 * SIZE(BO) + movaps %xmm3, 36 * SIZE(BO) + movaps %xmm4, 40 * SIZE(BO) + movaps %xmm6, 44 * SIZE(BO) + + pshufd $0x00, %xmm11, %xmm2 + pshufd $0x55, %xmm11, %xmm3 + pshufd $0xaa, %xmm11, %xmm4 + pshufd $0xff, %xmm11, %xmm6 + movaps %xmm2, 48 * SIZE(BO) + movaps %xmm3, 52 * SIZE(BO) + movaps %xmm4, 56 * SIZE(BO) + movaps %xmm6, 60 * SIZE(BO) + + pshufd $0x00, %xmm12, %xmm2 + pshufd $0x55, %xmm12, %xmm3 + pshufd $0xaa, %xmm12, %xmm4 + pshufd $0xff, %xmm12, %xmm6 + movaps %xmm2, 64 * SIZE(BO) + movaps %xmm3, 68 * SIZE(BO) + movaps %xmm4, 72 * SIZE(BO) + movaps %xmm6, 76 * SIZE(BO) + + pshufd $0x00, %xmm13, %xmm2 + pshufd $0x55, %xmm13, %xmm3 + pshufd $0xaa, %xmm13, %xmm4 + pshufd $0xff, %xmm13, %xmm6 + movaps %xmm2, 80 * SIZE(BO) + movaps %xmm3, 84 * SIZE(BO) + movaps %xmm4, 88 * SIZE(BO) + movaps %xmm6, 92 * SIZE(BO) + + pshufd $0x00, %xmm14, %xmm2 + pshufd $0x55, %xmm14, %xmm3 + pshufd $0xaa, %xmm14, %xmm4 + pshufd $0xff, %xmm14, %xmm6 + movaps %xmm2, 96 * SIZE(BO) + movaps %xmm3, 100 * SIZE(BO) + movaps %xmm4, 104 * SIZE(BO) + movaps %xmm6, 108 * SIZE(BO) + + pshufd $0x00, %xmm15, %xmm2 + pshufd $0x55, %xmm15, %xmm3 + pshufd $0xaa, %xmm15, %xmm4 + pshufd $0xff, %xmm15, %xmm6 + movaps %xmm2, 112 * SIZE(BO) + movaps %xmm3, 116 * SIZE(BO) + movaps %xmm4, 120 * SIZE(BO) + movaps %xmm6, 124 * SIZE(BO) + +#else + movaps %xmm8, 0 * SIZE(AO) + movaps %xmm9, 4 * SIZE(AO) + movaps %xmm10, 8 * SIZE(AO) + movaps %xmm11, 12 * SIZE(AO) + movaps %xmm12, 16 * SIZE(AO) + movaps %xmm13, 20 * SIZE(AO) + movaps %xmm14, 24 * SIZE(AO) + movaps %xmm15, 28 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm10, %xmm1 + unpckhps %xmm10, %xmm0 + + movaps %xmm5, %xmm7 + unpcklps %xmm11, %xmm5 + unpckhps %xmm11, %xmm7 + + movaps %xmm1, %xmm10 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm10 + + movaps %xmm0, %xmm11 + unpcklps %xmm7, %xmm0 + unpckhps %xmm7, %xmm11 + + movaps %xmm12, %xmm2 + unpcklps %xmm14, %xmm12 + unpckhps %xmm14, %xmm2 + + movaps %xmm13, %xmm7 + unpcklps %xmm15, %xmm13 + unpckhps %xmm15, %xmm7 + + movaps %xmm12, %xmm14 + unpcklps %xmm13, %xmm12 + unpckhps %xmm13, %xmm14 + + movaps %xmm2, %xmm15 + unpcklps %xmm7, %xmm2 + unpckhps %xmm7, %xmm15 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm12, 4 * SIZE(CO1) + movhps %xmm12, 6 * SIZE(CO1) + + movlps %xmm10, 0 * SIZE(CO2) + movhps %xmm10, 2 * SIZE(CO2) + movlps %xmm14, 4 * SIZE(CO2) + movhps %xmm14, 6 * SIZE(CO2) + + movlps %xmm0, 0 * SIZE(CO1, LDC, 2) + movhps %xmm0, 2 * SIZE(CO1, LDC, 2) + movlps %xmm2, 4 * SIZE(CO1, LDC, 2) + movhps %xmm2, 6 * SIZE(CO1, LDC, 2) + + movlps %xmm11, 0 * SIZE(CO2, LDC, 2) + movhps %xmm11, 2 * SIZE(CO2, LDC, 2) + movlps %xmm15, 4 * SIZE(CO2, LDC, 2) + movhps %xmm15, 6 * SIZE(CO2, LDC, 2) +#else + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movlps %xmm9, 4 * SIZE(CO1) + movhps %xmm9, 6 * SIZE(CO1) + + movlps %xmm10, 0 * SIZE(CO2) + movhps %xmm10, 2 * SIZE(CO2) + movlps %xmm11, 4 * SIZE(CO2) + movhps %xmm11, 6 * SIZE(CO2) + + movlps %xmm12, 0 * SIZE(CO1, LDC, 2) + movhps %xmm12, 2 * SIZE(CO1, LDC, 2) + movlps %xmm13, 4 * SIZE(CO1, LDC, 2) + movhps %xmm13, 6 * SIZE(CO1, LDC, 2) + + movlps %xmm14, 0 * SIZE(CO2, LDC, 2) + movhps %xmm14, 2 * SIZE(CO2, LDC, 2) + movlps %xmm15, 4 * SIZE(CO2, LDC, 2) + movhps %xmm15, 6 * SIZE(CO2, LDC, 2) +#endif + +#ifndef LN + addq $8 * SIZE, CO1 + addq $8 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, 
SIZE), %rax + leaq (AO, %rax, 8), AO +#ifdef LT + addq $32 * SIZE, B +#endif +#endif + +#ifdef LN + subq $8, KK + movq BORIG, B +#endif + +#ifdef LT + addq $8, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $3 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L11 + ALIGN_4 + +.L20: + testq $4, M + je .L30 + +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L25 + ALIGN_4 + +.L22: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps 4 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + mulps 28 * SIZE(BO), %xmm8 + addps %xmm11, %xmm2 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm8, %xmm3 + movaps 8 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm13 + addps %xmm13, %xmm0 + movaps 36 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm1 + movaps 40 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + mulps 44 * SIZE(BO), %xmm8 + addps %xmm13, %xmm2 + movaps 96 * SIZE(BO), %xmm13 + addps %xmm8, %xmm3 + movaps 12 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm15 + addps %xmm15, %xmm0 + movaps 52 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm1 + movaps 56 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + mulps 60 * SIZE(BO), %xmm8 + addps %xmm15, %xmm2 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm8, %xmm3 + movaps 32 * SIZE(AO), %xmm8 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm10, %xmm9 + addps %xmm9, %xmm0 + movaps 68 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm1 + movaps 72 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + mulps 76 * SIZE(BO), %xmm10 + addps %xmm9, %xmm2 + movaps 128 * SIZE(BO), %xmm9 + addps %xmm10, %xmm3 + movaps 20 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movaps 84 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm1 + movaps 88 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + mulps 92 * SIZE(BO), %xmm10 + addps %xmm11, %xmm2 + movaps 144 * SIZE(BO), %xmm11 + addps %xmm10, %xmm3 + movaps 24 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movaps 100 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm1 + movaps 104 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + mulps 108 * SIZE(BO), %xmm10 + addps %xmm13, %xmm2 + movaps 160 * SIZE(BO), %xmm13 + addps %xmm10, %xmm3 + movaps 28 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movaps 116 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps 
%xmm15, %xmm1 + movaps 120 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + mulps 124 * SIZE(BO), %xmm10 + addps %xmm15, %xmm2 + movaps 176 * SIZE(BO), %xmm15 + addps %xmm10, %xmm3 + movaps 48 * SIZE(AO), %xmm10 + + addq $ 32 * SIZE, AO + addq $128 * SIZE, BO + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 16 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps 4 * SIZE(AO), %xmm8 + + addq $ 4 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L26 + ALIGN_4 + +.L28: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $2 + BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm8 + unpcklps %xmm2, %xmm0 + unpckhps %xmm2, %xmm8 + + movaps %xmm1, %xmm14 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm14 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movaps %xmm8, %xmm3 + unpcklps %xmm14, %xmm8 + unpckhps %xmm14, %xmm3 + + movaps 0 * SIZE(B), %xmm1 + movaps 4 * SIZE(B), %xmm5 + movaps 8 * SIZE(B), %xmm10 + movaps 12 * SIZE(B), %xmm11 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm5 + subps %xmm8, %xmm10 + subps %xmm3, %xmm11 +#else + movaps 0 * SIZE(AO), %xmm8 + movaps 4 * SIZE(AO), %xmm10 + movaps 8 * SIZE(AO), %xmm12 + movaps 12 * SIZE(AO), %xmm14 + + subps %xmm0, %xmm8 + subps %xmm1, %xmm10 + subps %xmm2, %xmm12 + subps %xmm3, %xmm14 +#endif + +#ifdef LN + movaps 12 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm1 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm1 + + movaps 4 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm1 + + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm11 + + movaps 4 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm11 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm11 + + movaps 12 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm11 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 + pshufd 
$0x55, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm10 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm12 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm14 + + movaps 4 * SIZE(B), %xmm0 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm12 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm14 + + movaps 8 * SIZE(B), %xmm0 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm2, %xmm12 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm12, %xmm2 + subps %xmm2, %xmm14 + + movaps 12 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm14 +#endif + +#ifdef RT + movaps 12 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm14 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm14, %xmm2 + subps %xmm2, %xmm12 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm14, %xmm2 + subps %xmm2, %xmm10 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm14, %xmm2 + subps %xmm2, %xmm8 + + movaps 8 * SIZE(B), %xmm0 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm2, %xmm12 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm12, %xmm2 + subps %xmm2, %xmm10 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm12, %xmm2 + subps %xmm2, %xmm8 + + movaps 4 * SIZE(B), %xmm0 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm8 + + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, 0 * SIZE(B) + movaps %xmm5, 4 * SIZE(B) + movaps %xmm10, 8 * SIZE(B) + movaps %xmm11, 12 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + pshufd $0x55, %xmm1, %xmm3 + pshufd $0xaa, %xmm1, %xmm4 + pshufd $0xff, %xmm1, %xmm6 + movaps %xmm2, 0 * SIZE(BO) + movaps %xmm3, 4 * SIZE(BO) + movaps %xmm4, 8 * SIZE(BO) + movaps %xmm6, 12 * SIZE(BO) + + pshufd $0x00, %xmm5, %xmm2 + pshufd $0x55, %xmm5, %xmm3 + pshufd $0xaa, %xmm5, %xmm4 + pshufd $0xff, %xmm5, %xmm6 + movaps %xmm2, 16 * SIZE(BO) + movaps %xmm3, 20 * SIZE(BO) + movaps %xmm4, 24 * SIZE(BO) + movaps %xmm6, 28 * SIZE(BO) + + pshufd $0x00, %xmm10, %xmm2 + pshufd $0x55, %xmm10, %xmm3 + pshufd $0xaa, %xmm10, %xmm4 + pshufd $0xff, %xmm10, %xmm6 + movaps %xmm2, 32 * SIZE(BO) + movaps %xmm3, 36 * SIZE(BO) + movaps %xmm4, 40 * SIZE(BO) + movaps %xmm6, 44 * SIZE(BO) + + pshufd $0x00, %xmm11, %xmm2 + pshufd $0x55, %xmm11, %xmm3 + pshufd $0xaa, %xmm11, %xmm4 + pshufd $0xff, %xmm11, %xmm6 + movaps %xmm2, 48 * SIZE(BO) + movaps %xmm3, 52 * SIZE(BO) + movaps %xmm4, 56 * SIZE(BO) + movaps %xmm6, 60 * SIZE(BO) +#else + movaps %xmm8, 0 * SIZE(AO) + movaps %xmm10, 4 * SIZE(AO) + movaps %xmm12, 8 * SIZE(AO) + movaps %xmm14, 12 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm10, %xmm1 + unpckhps %xmm10, %xmm0 + + movaps %xmm5, %xmm7 + unpcklps %xmm11, %xmm5 + unpckhps %xmm11, %xmm7 + + movaps %xmm1, %xmm10 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm10 + + movaps %xmm0, %xmm11 + unpcklps %xmm7, %xmm0 + unpckhps %xmm7, %xmm11 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm10, 0 * SIZE(CO2) + movhps %xmm10, 2 * SIZE(CO2) + + movlps %xmm0, 0 * SIZE(CO1, LDC, 2) + movhps %xmm0, 2 * SIZE(CO1, LDC, 2) + movlps %xmm11, 0 * SIZE(CO2, LDC, 2) + movhps %xmm11, 2 * SIZE(CO2, LDC, 2) +#else + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movlps %xmm10, 0 * SIZE(CO2) + movhps %xmm10, 2 * SIZE(CO2) + + movlps %xmm12, 0 * SIZE(CO1, LDC, 2) + movhps 
%xmm12, 2 * SIZE(CO1, LDC, 2) + movlps %xmm14, 0 * SIZE(CO2, LDC, 2) + movhps %xmm14, 2 * SIZE(CO2, LDC, 2) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $16 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + testq $2, M + je .L40 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 8 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L35 + ALIGN_4 + +.L32: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movaps 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movaps 64 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movaps 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd 4 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movaps 80 * SIZE(BO), %xmm11 + + mulps %xmm8, %xmm13 + addps %xmm13, %xmm0 + movaps 36 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm1 + movaps 40 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm2 + movaps 44 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + movsd 6 * SIZE(AO), %xmm8 + addps %xmm13, %xmm3 + movaps 96 * SIZE(BO), %xmm13 + + mulps %xmm8, %xmm15 + addps %xmm15, %xmm0 + movaps 52 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm1 + movaps 56 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm2 + movaps 60 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + movsd 16 * SIZE(AO), %xmm8 + addps %xmm15, %xmm3 + movaps 112 * SIZE(BO), %xmm15 + + mulps %xmm10, %xmm9 + addps %xmm9, %xmm0 + movaps 68 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm1 + movaps 72 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm2 + movaps 76 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movsd 10 * SIZE(AO), %xmm10 + addps %xmm9, %xmm3 + movaps 128 * SIZE(BO), %xmm9 + + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movaps 84 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm1 + movaps 88 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm2 + movaps 92 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movsd 12 * SIZE(AO), %xmm10 + addps %xmm11, %xmm3 + movaps 144 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movaps 100 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm1 + movaps 104 * SIZE(BO), %xmm13 + 
mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movaps 108 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd 14 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movaps 160 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movaps 116 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm1 + movaps 120 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movaps 124 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd 24 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movaps 176 * SIZE(BO), %xmm15 + + addq $ 16 * SIZE, AO + addq $128 * SIZE, BO + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movaps 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movaps 16 * SIZE(BO), %xmm9 + + addq $ 2 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $1 + BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm2, %xmm0 + unpcklps %xmm3, %xmm1 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movapd 0 * SIZE(B), %xmm1 + movapd 4 * SIZE(B), %xmm5 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm5 +#else +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd 0 * SIZE(AO), %xmm8 +#ifdef movsd + xorps %xmm10, %xmm10 +#endif + movsd 2 * SIZE(AO), %xmm10 +#ifdef movsd + xorps %xmm12, %xmm12 +#endif + movsd 4 * SIZE(AO), %xmm12 +#ifdef movsd + xorps %xmm14, %xmm14 +#endif + movsd 6 * SIZE(AO), %xmm14 + + subps %xmm0, %xmm8 + subps %xmm1, %xmm10 + subps %xmm2, %xmm12 + subps %xmm3, %xmm14 +#endif + +#ifdef LN + movaps 0 * SIZE(AO), %xmm6 + + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm1 + + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm5 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm5 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm10 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm12 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm14 + + movaps 4 * SIZE(B), %xmm0 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm12 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm14 + + movaps 8 * SIZE(B), %xmm0 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm2, %xmm12 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm12, %xmm2 + subps %xmm2, %xmm14 + + movaps 12 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm14 +#endif + +#ifdef RT + movaps 12 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm14 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm14, %xmm2 + subps %xmm2, %xmm12 + pshufd $0x55, %xmm0, %xmm2 + 
mulps %xmm14, %xmm2 + subps %xmm2, %xmm10 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm14, %xmm2 + subps %xmm2, %xmm8 + + movaps 8 * SIZE(B), %xmm0 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm2, %xmm12 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm12, %xmm2 + subps %xmm2, %xmm10 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm12, %xmm2 + subps %xmm2, %xmm8 + + movaps 4 * SIZE(B), %xmm0 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm8 + + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, 0 * SIZE(B) + movaps %xmm5, 4 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + pshufd $0x55, %xmm1, %xmm3 + pshufd $0xaa, %xmm1, %xmm4 + pshufd $0xff, %xmm1, %xmm6 + movaps %xmm2, 0 * SIZE(BO) + movaps %xmm3, 4 * SIZE(BO) + movaps %xmm4, 8 * SIZE(BO) + movaps %xmm6, 12 * SIZE(BO) + + pshufd $0x00, %xmm5, %xmm2 + pshufd $0x55, %xmm5, %xmm3 + pshufd $0xaa, %xmm5, %xmm4 + pshufd $0xff, %xmm5, %xmm6 + movaps %xmm2, 16 * SIZE(BO) + movaps %xmm3, 20 * SIZE(BO) + movaps %xmm4, 24 * SIZE(BO) + movaps %xmm6, 28 * SIZE(BO) +#else + movlps %xmm8, 0 * SIZE(AO) + movlps %xmm10, 2 * SIZE(AO) + movlps %xmm12, 4 * SIZE(AO) + movlps %xmm14, 6 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm10, %xmm1 + unpckhps %xmm10, %xmm0 + + movaps %xmm5, %xmm7 + unpcklps %xmm11, %xmm5 + unpckhps %xmm11, %xmm7 + + movaps %xmm1, %xmm10 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm10 + + movaps %xmm0, %xmm11 + unpcklps %xmm7, %xmm0 + unpckhps %xmm7, %xmm11 + + movlps %xmm1, 0 * SIZE(CO1) + movlps %xmm10, 0 * SIZE(CO2) + movlps %xmm0, 0 * SIZE(CO1, LDC, 2) + movlps %xmm11, 0 * SIZE(CO2, LDC, 2) +#else + movlps %xmm8, 0 * SIZE(CO1) + movlps %xmm10, 0 * SIZE(CO2) + movlps %xmm12, 0 * SIZE(CO1, LDC, 2) + movlps %xmm14, 0 * SIZE(CO2, LDC, 2) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L40: + testq $1, M + je .L49 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (AO, %rax, SIZE), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movss 0 * SIZE(AO), %xmm8 + movss 4 * SIZE(AO), %xmm10 + + movss 0 * SIZE(BO), %xmm9 + movss 16 * SIZE(BO), %xmm11 + movss 32 * SIZE(BO), %xmm13 + movss 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L45 + ALIGN_4 + +.L42: + mulss %xmm8, %xmm9 + addss %xmm9, %xmm0 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movss 4 * SIZE(BO), %xmm9 + mulss %xmm8, %xmm9 + addss %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + mulss %xmm8, %xmm9 + addss %xmm9, %xmm2 + movss 12 * SIZE(BO), %xmm9 + mulss %xmm8, %xmm9 + movss 1 * SIZE(AO), %xmm8 + addss %xmm9, %xmm3 + movss 64 * SIZE(BO), %xmm9 + + 
mulss %xmm8, %xmm11 + addss %xmm11, %xmm0 + movss 20 * SIZE(BO), %xmm11 + mulss %xmm8, %xmm11 + addss %xmm11, %xmm1 + movss 24 * SIZE(BO), %xmm11 + mulss %xmm8, %xmm11 + addss %xmm11, %xmm2 + movss 28 * SIZE(BO), %xmm11 + mulss %xmm8, %xmm11 + movss 2 * SIZE(AO), %xmm8 + addss %xmm11, %xmm3 + movss 80 * SIZE(BO), %xmm11 + + mulss %xmm8, %xmm13 + addss %xmm13, %xmm0 + movss 36 * SIZE(BO), %xmm13 + mulss %xmm8, %xmm13 + addss %xmm13, %xmm1 + movss 40 * SIZE(BO), %xmm13 + mulss %xmm8, %xmm13 + addss %xmm13, %xmm2 + movss 44 * SIZE(BO), %xmm13 + mulss %xmm8, %xmm13 + movss 3 * SIZE(AO), %xmm8 + addss %xmm13, %xmm3 + movss 96 * SIZE(BO), %xmm13 + + mulss %xmm8, %xmm15 + addss %xmm15, %xmm0 + movss 52 * SIZE(BO), %xmm15 + mulss %xmm8, %xmm15 + addss %xmm15, %xmm1 + movss 56 * SIZE(BO), %xmm15 + mulss %xmm8, %xmm15 + addss %xmm15, %xmm2 + movss 60 * SIZE(BO), %xmm15 + mulss %xmm8, %xmm15 + movss 8 * SIZE(AO), %xmm8 + addss %xmm15, %xmm3 + movss 112 * SIZE(BO), %xmm15 + + mulss %xmm10, %xmm9 + addss %xmm9, %xmm0 + movss 68 * SIZE(BO), %xmm9 + mulss %xmm10, %xmm9 + addss %xmm9, %xmm1 + movss 72 * SIZE(BO), %xmm9 + mulss %xmm10, %xmm9 + addss %xmm9, %xmm2 + movss 76 * SIZE(BO), %xmm9 + mulss %xmm10, %xmm9 + movss 5 * SIZE(AO), %xmm10 + addss %xmm9, %xmm3 + movss 128 * SIZE(BO), %xmm9 + + mulss %xmm10, %xmm11 + addss %xmm11, %xmm0 + movss 84 * SIZE(BO), %xmm11 + mulss %xmm10, %xmm11 + addss %xmm11, %xmm1 + movss 88 * SIZE(BO), %xmm11 + mulss %xmm10, %xmm11 + addss %xmm11, %xmm2 + movss 92 * SIZE(BO), %xmm11 + mulss %xmm10, %xmm11 + movss 6 * SIZE(AO), %xmm10 + addss %xmm11, %xmm3 + movss 144 * SIZE(BO), %xmm11 + + mulss %xmm10, %xmm13 + addss %xmm13, %xmm0 + movss 100 * SIZE(BO), %xmm13 + mulss %xmm10, %xmm13 + addss %xmm13, %xmm1 + movss 104 * SIZE(BO), %xmm13 + mulss %xmm10, %xmm13 + addss %xmm13, %xmm2 + movss 108 * SIZE(BO), %xmm13 + mulss %xmm10, %xmm13 + movss 7 * SIZE(AO), %xmm10 + addss %xmm13, %xmm3 + movss 160 * SIZE(BO), %xmm13 + + mulss %xmm10, %xmm15 + addss %xmm15, %xmm0 + movss 116 * SIZE(BO), %xmm15 + mulss %xmm10, %xmm15 + addss %xmm15, %xmm1 + movss 120 * SIZE(BO), %xmm15 + mulss %xmm10, %xmm15 + addss %xmm15, %xmm2 + movss 124 * SIZE(BO), %xmm15 + mulss %xmm10, %xmm15 + movss 12 * SIZE(AO), %xmm10 + addss %xmm15, %xmm3 + movss 176 * SIZE(BO), %xmm15 + + addq $ 8 * SIZE, AO + addq $128 * SIZE, BO + decq %rax + jne .L42 + ALIGN_4 + +.L45: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L48 + ALIGN_4 + +.L46: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movss 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movss 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss 1 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movss 16 * SIZE(BO), %xmm9 + + addq $ 1 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L46 + ALIGN_4 + +.L48: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), B + leaq (BO, %rax, 8), BO + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm2, %xmm0 + unpcklps %xmm3, %xmm1 + + unpcklps %xmm1, %xmm0 + + movapd 0 * SIZE(B), %xmm1 + subps %xmm0, %xmm1 +#else + movss 0 * SIZE(AO), %xmm8 + movss 1 * SIZE(AO), %xmm10 + movss 2 * SIZE(AO), %xmm12 + movss 3 * SIZE(AO), %xmm14 + + 
subss %xmm0, %xmm8 + subss %xmm1, %xmm10 + subss %xmm2, %xmm12 + subss %xmm3, %xmm14 +#endif + +#if defined(LN) || defined(LT) + movss 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulss %xmm2, %xmm8 + pshufd $0x55, %xmm0, %xmm2 + mulss %xmm8, %xmm2 + subss %xmm2, %xmm10 + pshufd $0xaa, %xmm0, %xmm2 + mulss %xmm8, %xmm2 + subss %xmm2, %xmm12 + pshufd $0xff, %xmm0, %xmm2 + mulss %xmm8, %xmm2 + subss %xmm2, %xmm14 + + movaps 4 * SIZE(B), %xmm0 + pshufd $0x55, %xmm0, %xmm2 + mulss %xmm2, %xmm10 + pshufd $0xaa, %xmm0, %xmm2 + mulss %xmm10, %xmm2 + subss %xmm2, %xmm12 + pshufd $0xff, %xmm0, %xmm2 + mulss %xmm10, %xmm2 + subss %xmm2, %xmm14 + + movaps 8 * SIZE(B), %xmm0 + pshufd $0xaa, %xmm0, %xmm2 + mulss %xmm2, %xmm12 + pshufd $0xff, %xmm0, %xmm2 + mulss %xmm12, %xmm2 + subss %xmm2, %xmm14 + + movaps 12 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulss %xmm2, %xmm14 +#endif + +#ifdef RT + movaps 12 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulss %xmm2, %xmm14 + pshufd $0xaa, %xmm0, %xmm2 + mulss %xmm14, %xmm2 + subss %xmm2, %xmm12 + pshufd $0x55, %xmm0, %xmm2 + mulss %xmm14, %xmm2 + subss %xmm2, %xmm10 + pshufd $0x00, %xmm0, %xmm2 + mulss %xmm14, %xmm2 + subss %xmm2, %xmm8 + + movaps 8 * SIZE(B), %xmm0 + pshufd $0xaa, %xmm0, %xmm2 + mulss %xmm2, %xmm12 + pshufd $0x55, %xmm0, %xmm2 + mulss %xmm12, %xmm2 + subss %xmm2, %xmm10 + pshufd $0x00, %xmm0, %xmm2 + mulss %xmm12, %xmm2 + subss %xmm2, %xmm8 + + movaps 4 * SIZE(B), %xmm0 + pshufd $0x55, %xmm0, %xmm2 + mulss %xmm2, %xmm10 + pshufd $0x00, %xmm0, %xmm2 + mulss %xmm10, %xmm2 + subss %xmm2, %xmm8 + + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulss %xmm2, %xmm8 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, 0 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + pshufd $0x55, %xmm1, %xmm3 + pshufd $0xaa, %xmm1, %xmm4 + pshufd $0xff, %xmm1, %xmm6 + movaps %xmm2, 0 * SIZE(BO) + movaps %xmm3, 4 * SIZE(BO) + movaps %xmm4, 8 * SIZE(BO) + movaps %xmm6, 12 * SIZE(BO) +#else + movss %xmm8, 0 * SIZE(AO) + movss %xmm10, 1 * SIZE(AO) + movss %xmm12, 2 * SIZE(AO) + movss %xmm14, 3 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm10, %xmm1 + unpckhps %xmm10, %xmm0 + + movaps %xmm5, %xmm7 + unpcklps %xmm11, %xmm5 + unpckhps %xmm11, %xmm7 + + movaps %xmm1, %xmm10 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm10 + + movaps %xmm0, %xmm11 + unpcklps %xmm7, %xmm0 + unpckhps %xmm7, %xmm11 + + movss %xmm1, 0 * SIZE(CO1) + movss %xmm10, 0 * SIZE(CO2) + movss %xmm0, 0 * SIZE(CO1, LDC, 2) + movss %xmm11, 0 * SIZE(CO2, LDC, 2) +#else + movss %xmm8, 0 * SIZE(CO1) + movss %xmm10, 0 * SIZE(CO2) + movss %xmm12, 0 * SIZE(CO1, LDC, 2) + movss %xmm14, 0 * SIZE(CO2, LDC, 2) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L49: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, 
KK +#endif + + decq J # j -- + jg .L01 + +.L50: + testq $2, N + je .L100 + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + salq $1 + BASE_SHIFT, %rax + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L53 + ALIGN_4 + +.L52: + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO + + decq %rax + jne .L52 + ALIGN_4 + +.L53: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L60 + ALIGN_4 + +.L54: + movsd 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + + addq $2 * SIZE, B + addq $8 * SIZE, BO + decq %rax + jne .L54 + ALIGN_4 + +.L60: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c +#ifndef RT + leaq (C, LDC, 2), C +#endif + + movq M, I + sarq $3, I # i = (m >> 3) + jle .L70 + ALIGN_4 + +.L61: +#ifdef LN + movq K, %rax + salq $3 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 8), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + movaps 32 * SIZE(AO), %xmm12 + movaps 48 * SIZE(AO), %xmm14 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + + PREFETCHW 4 * SIZE(CO1) + pxor %xmm4, %xmm4 + PREFETCHW 4 * SIZE(CO2) + pxor %xmm5, %xmm5 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L65 + ALIGN_4 + +.L62: + mulps %xmm8, %xmm9 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 0 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 4 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps 8 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 12 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps 64 * SIZE(AO), %xmm8 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps 
%xmm10, %xmm11 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm11, %xmm0 + movaps 16 * SIZE(BO), %xmm11 + addps %xmm10, %xmm1 + movaps 20 * SIZE(AO), %xmm10 + mulps %xmm10, %xmm11 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm11, %xmm4 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm10, %xmm5 + movaps 24 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm11 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm11, %xmm0 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm10, %xmm1 + movaps 28 * SIZE(AO), %xmm10 + mulps %xmm10, %xmm11 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm11, %xmm4 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm10, %xmm5 + movaps 80 * SIZE(AO), %xmm10 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) +#endif + mulps %xmm12, %xmm13 + mulps 36 * SIZE(BO), %xmm12 + addps %xmm13, %xmm0 + movaps 32 * SIZE(BO), %xmm13 + addps %xmm12, %xmm1 + movaps 36 * SIZE(AO), %xmm12 + mulps %xmm12, %xmm13 + mulps 36 * SIZE(BO), %xmm12 + addps %xmm13, %xmm4 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm12, %xmm5 + movaps 40 * SIZE(AO), %xmm12 + + mulps %xmm12, %xmm13 + mulps 44 * SIZE(BO), %xmm12 + addps %xmm13, %xmm0 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm12, %xmm1 + movaps 44 * SIZE(AO), %xmm12 + mulps %xmm12, %xmm13 + mulps 44 * SIZE(BO), %xmm12 + addps %xmm13, %xmm4 + addps %xmm12, %xmm5 + movaps 96 * SIZE(BO), %xmm13 + movaps 96 * SIZE(AO), %xmm12 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) +#endif + mulps %xmm14, %xmm15 + mulps 52 * SIZE(BO), %xmm14 + addps %xmm15, %xmm0 + movaps 48 * SIZE(BO), %xmm15 + addps %xmm14, %xmm1 + movaps 52 * SIZE(AO), %xmm14 + mulps %xmm14, %xmm15 + mulps 52 * SIZE(BO), %xmm14 + addps %xmm15, %xmm4 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm14, %xmm5 + movaps 56 * SIZE(AO), %xmm14 + + mulps %xmm14, %xmm15 + mulps 60 * SIZE(BO), %xmm14 + addps %xmm15, %xmm0 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm14, %xmm1 + movaps 60 * SIZE(AO), %xmm14 + mulps %xmm14, %xmm15 + mulps 60 * SIZE(BO), %xmm14 + addps %xmm15, %xmm4 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm14, %xmm5 + movaps 112 * SIZE(AO), %xmm14 + + addq $64 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L62 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 0 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 4 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps 8 * SIZE(AO), %xmm8 + + addq $8 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L66 + ALIGN_4 + +.L68: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $8, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $1 + BASE_SHIFT, %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm8 + unpcklps %xmm2, %xmm0 + unpckhps %xmm2, %xmm8 + + movaps %xmm1, %xmm14 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm14 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movaps %xmm8, %xmm3 + unpcklps %xmm14, %xmm8 + unpckhps %xmm14, %xmm3 + + movaps %xmm4, %xmm9 + unpcklps %xmm6, %xmm4 + unpckhps %xmm6, %xmm9 + + movaps %xmm5, %xmm14 + unpcklps %xmm7, %xmm5 + unpckhps %xmm7, %xmm14 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, 
%xmm4 + unpckhps %xmm5, %xmm6 + + movaps %xmm9, %xmm7 + unpcklps %xmm14, %xmm9 + unpckhps %xmm14, %xmm7 + +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(B), %xmm1 +#ifdef movsd + xorps %xmm5, %xmm5 +#endif + movsd 2 * SIZE(B), %xmm5 +#ifdef movsd + xorps %xmm10, %xmm10 +#endif + movsd 4 * SIZE(B), %xmm10 +#ifdef movsd + xorps %xmm11, %xmm11 +#endif + movsd 6 * SIZE(B), %xmm11 +#ifdef movsd + xorps %xmm12, %xmm12 +#endif + movsd 8 * SIZE(B), %xmm12 +#ifdef movsd + xorps %xmm13, %xmm13 +#endif + movsd 10 * SIZE(B), %xmm13 +#ifdef movsd + xorps %xmm14, %xmm14 +#endif + movsd 12 * SIZE(B), %xmm14 +#ifdef movsd + xorps %xmm15, %xmm15 +#endif + movsd 14 * SIZE(B), %xmm15 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm5 + subps %xmm8, %xmm10 + subps %xmm3, %xmm11 + subps %xmm4, %xmm12 + subps %xmm6, %xmm13 + subps %xmm9, %xmm14 + subps %xmm7, %xmm15 +#else + movaps 0 * SIZE(AO), %xmm8 + movaps 4 * SIZE(AO), %xmm9 + movaps 8 * SIZE(AO), %xmm10 + movaps 12 * SIZE(AO), %xmm11 + + subps %xmm0, %xmm8 + subps %xmm4, %xmm9 + subps %xmm1, %xmm10 + subps %xmm5, %xmm11 +#endif + +#ifdef LN + movaps 60 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm15 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm14 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm13 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm12 + + movaps 56 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm1 + + movaps 52 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm14 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm13 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm12 + + movaps 48 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm1 + + movaps 44 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm13 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm12 + + movaps 40 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm1 + + movaps 36 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm12 + + movaps 32 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm1 + + movaps 24 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm1 + + movaps 16 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm10 + pshufd $0x55, 
%xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm1 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm1 + + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm6 + + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm11 + + movaps 4 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm15 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm11 + + movaps 12 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm15 + + movaps 16 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm11 + + movaps 20 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm15 + + movaps 24 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm11 + + movaps 28 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm15 + + movaps 36 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm12 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm15 + + movaps 44 * SIZE(AO), %xmm7 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm15 + + movaps 52 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm14 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm15 + + movaps 60 * SIZE(AO), %xmm7 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm8, %xmm15 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 + mulps %xmm2, %xmm9 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm10 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm9, %xmm2 + subps %xmm2, %xmm11 + + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + mulps %xmm2, %xmm11 +#endif + 
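At this point the 8x2 tile of the right-hand side sits in xmm8–xmm11 (two columns of eight floats), and the RN branch just above, like the RT branch that follows, finishes the small triangular solve against the 2x2 diagonal block of B: RN substitutes forward over the columns, RT backward. The diagonal factors are only ever multiplied, never divided, which suggests the packing pass stores them already inverted. A minimal C sketch of the RN case, under that assumption and with illustrative names (solve_rn_8x2, ldx, and the row-packed u[] layout are not taken from the source):

    /* Sketch of X <- X * U^{-1} for one 8x2 tile, U a 2x2 upper-triangular
       block whose diagonal entries are assumed to be stored pre-inverted.
       x is column-major with leading dimension ldx; u is row-packed:
       u[0] = 1/u11, u[1] = u12, u[3] = 1/u22. */
    static void solve_rn_8x2(float *x, long ldx, const float *u)
    {
        for (long i = 0; i < 8; i++) {
            x[i + 0 * ldx] *= u[0];                  /* scale column 1           */
            x[i + 1 * ldx] -= u[1] * x[i + 0 * ldx]; /* eliminate u12 from col 2 */
            x[i + 1 * ldx] *= u[3];                  /* scale column 2           */
        }
    }

The RT branch below is the mirror image: it scales column 2 by the (assumed) inverted u22 first, subtracts the u21 contribution from column 1, then scales column 1, i.e. backward substitution over the same 2x2 block.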
+#ifdef RT + movaps 0 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + mulps %xmm2, %xmm11 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm8 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm11, %xmm2 + subps %xmm2, %xmm9 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 + mulps %xmm2, %xmm9 +#endif + +#ifdef LN + subq $8 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, 0 * SIZE(B) + movlps %xmm5, 2 * SIZE(B) + movlps %xmm10, 4 * SIZE(B) + movlps %xmm11, 6 * SIZE(B) + movlps %xmm12, 8 * SIZE(B) + movlps %xmm13, 10 * SIZE(B) + movlps %xmm14, 12 * SIZE(B) + movlps %xmm15, 14 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + pshufd $0x55, %xmm1, %xmm3 + movaps %xmm2, 0 * SIZE(BO) + movaps %xmm3, 4 * SIZE(BO) + + pshufd $0x00, %xmm5, %xmm2 + pshufd $0x55, %xmm5, %xmm3 + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + + pshufd $0x00, %xmm10, %xmm2 + pshufd $0x55, %xmm10, %xmm3 + movaps %xmm2, 16 * SIZE(BO) + movaps %xmm3, 20 * SIZE(BO) + + pshufd $0x00, %xmm11, %xmm2 + pshufd $0x55, %xmm11, %xmm3 + movaps %xmm2, 24 * SIZE(BO) + movaps %xmm3, 28 * SIZE(BO) + + pshufd $0x00, %xmm12, %xmm2 + pshufd $0x55, %xmm12, %xmm3 + movaps %xmm2, 32 * SIZE(BO) + movaps %xmm3, 36 * SIZE(BO) + + pshufd $0x00, %xmm13, %xmm2 + pshufd $0x55, %xmm13, %xmm3 + movaps %xmm2, 40 * SIZE(BO) + movaps %xmm3, 44 * SIZE(BO) + + pshufd $0x00, %xmm14, %xmm2 + pshufd $0x55, %xmm14, %xmm3 + movaps %xmm2, 48 * SIZE(BO) + movaps %xmm3, 52 * SIZE(BO) + + pshufd $0x00, %xmm15, %xmm2 + pshufd $0x55, %xmm15, %xmm3 + movaps %xmm2, 56 * SIZE(BO) + movaps %xmm3, 60 * SIZE(BO) +#else + movaps %xmm8, 0 * SIZE(AO) + movaps %xmm9, 4 * SIZE(AO) + movaps %xmm10, 8 * SIZE(AO) + movaps %xmm11, 12 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm10, %xmm1 + unpcklps %xmm11, %xmm5 + + movaps %xmm1, %xmm10 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm10 + + unpcklps %xmm14, %xmm12 + unpcklps %xmm15, %xmm13 + + movaps %xmm12, %xmm14 + unpcklps %xmm13, %xmm12 + unpckhps %xmm13, %xmm14 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm12, 4 * SIZE(CO1) + movhps %xmm12, 6 * SIZE(CO1) + + movlps %xmm10, 0 * SIZE(CO1, LDC, 1) + movhps %xmm10, 2 * SIZE(CO1, LDC, 1) + movlps %xmm14, 4 * SIZE(CO1, LDC, 1) + movhps %xmm14, 6 * SIZE(CO1, LDC, 1) +#else + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movlps %xmm9, 4 * SIZE(CO1) + movhps %xmm9, 6 * SIZE(CO1) + + movlps %xmm10, 0 * SIZE(CO1, LDC, 1) + movhps %xmm10, 2 * SIZE(CO1, LDC, 1) + movlps %xmm11, 4 * SIZE(CO1, LDC, 1) + movhps %xmm11, 6 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addq $8 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 8), AO +#ifdef LT + addq $16 * SIZE, B +#endif +#endif + +#ifdef LN + subq $8, KK + movq BORIG, B +#endif + +#ifdef LT + addq $8, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $3 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L61 + ALIGN_4 + +.L70: + testq $4, M + je .L80 + +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + 
movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L75 + ALIGN_4 + +.L72: + mulps %xmm8, %xmm9 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 4 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps 8 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm11 + mulps 20 * SIZE(BO), %xmm8 + addps %xmm11, %xmm0 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm8, %xmm1 + movaps 12 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm11 + mulps 28 * SIZE(BO), %xmm8 + addps %xmm11, %xmm2 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm8, %xmm3 + movaps 32 * SIZE(AO), %xmm8 + + mulps %xmm10, %xmm13 + mulps 36 * SIZE(BO), %xmm10 + addps %xmm13, %xmm0 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm10, %xmm1 + movaps 20 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm13 + mulps 44 * SIZE(BO), %xmm10 + addps %xmm13, %xmm2 + movaps 96 * SIZE(BO), %xmm13 + addps %xmm10, %xmm3 + movaps 24 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm15 + mulps 52 * SIZE(BO), %xmm10 + addps %xmm15, %xmm0 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm10, %xmm1 + movaps 28 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm15 + mulps 60 * SIZE(BO), %xmm10 + addps %xmm15, %xmm2 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm10, %xmm3 + movaps 48 * SIZE(AO), %xmm10 + + addq $32 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 4 * SIZE(AO), %xmm8 + + addq $4 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L76 + ALIGN_4 + +.L78: + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $1 + BASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm8 + unpcklps %xmm2, %xmm0 + unpckhps %xmm2, %xmm8 + + movaps %xmm1, %xmm14 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm14 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movaps %xmm8, %xmm3 + unpcklps %xmm14, %xmm8 + unpckhps %xmm14, %xmm3 + +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(B), %xmm1 +#ifdef movsd + xorps %xmm5, %xmm5 +#endif + movsd 2 * SIZE(B), %xmm5 +#ifdef movsd + xorps %xmm10, %xmm10 +#endif + movsd 4 * SIZE(B), %xmm10 +#ifdef movsd + xorps %xmm11, %xmm11 +#endif + movsd 6 * SIZE(B), %xmm11 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm5 + subps %xmm8, %xmm10 + subps %xmm3, %xmm11 +#else + movaps 0 * SIZE(AO), %xmm8 + movaps 4 * SIZE(AO), %xmm10 + + subps %xmm0, %xmm8 + subps %xmm1, %xmm10 +#endif + +#ifdef LN + movaps 12 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, 
%xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm1 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm1 + + movaps 4 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm1 + + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm6 + + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm11 + + movaps 4 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm11 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm11 + + movaps 12 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm11 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm10 + + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm10 +#endif + +#ifdef RT + movaps 0 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm8 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, 0 * SIZE(B) + movlps %xmm5, 2 * SIZE(B) + movlps %xmm10, 4 * SIZE(B) + movlps %xmm11, 6 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + pshufd $0x55, %xmm1, %xmm3 + movaps %xmm2, 0 * SIZE(BO) + movaps %xmm3, 4 * SIZE(BO) + + pshufd $0x00, %xmm5, %xmm2 + pshufd $0x55, %xmm5, %xmm3 + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + + pshufd $0x00, %xmm10, %xmm2 + pshufd $0x55, %xmm10, %xmm3 + movaps %xmm2, 16 * SIZE(BO) + movaps %xmm3, 20 * SIZE(BO) + + pshufd $0x00, %xmm11, %xmm2 + pshufd $0x55, %xmm11, %xmm3 + movaps %xmm2, 24 * SIZE(BO) + movaps %xmm3, 28 * SIZE(BO) +#else + movaps %xmm8, 0 * SIZE(AO) + movaps %xmm10, 4 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm10, %xmm1 + unpcklps %xmm11, %xmm5 + + movaps %xmm1, %xmm10 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm10 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm10, 0 * SIZE(CO1, LDC, 1) + movhps %xmm10, 2 * SIZE(CO1, LDC, 1) +#else + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movlps %xmm10, 0 * SIZE(CO1, LDC, 1) + movhps %xmm10, 2 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $ 8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L80: + testq $2, M + je .L90 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), 
%rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 8 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L85 + ALIGN_4 + +.L82: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movaps 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movaps 64 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd 6 * SIZE(AO), %xmm8 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movaps 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd 16 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movaps 80 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movaps 36 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd 10 * SIZE(AO), %xmm10 + addps %xmm13, %xmm1 + movaps 40 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movaps 44 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd 12 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movaps 96 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movaps 52 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd 14 * SIZE(AO), %xmm10 + addps %xmm15, %xmm1 + movaps 56 * SIZE(BO), %xmm15 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movaps 60 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd 24 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movaps 112 * SIZE(BO), %xmm15 + + addq $16 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L82 + ALIGN_4 + +.L85: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L88 + ALIGN_4 + +.L86: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L86 + ALIGN_4 + +.L88: + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $1 + BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm2, %xmm0 + unpcklps %xmm3, %xmm1 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(B), %xmm1 +#ifdef movsd + xorps %xmm5, %xmm5 +#endif + movsd 2 * SIZE(B), %xmm5 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm5 +#else +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd 0 * SIZE(AO), %xmm8 +#ifdef movsd + xorps %xmm10, %xmm10 +#endif + movsd 2 * SIZE(AO), %xmm10 + + subps %xmm0, %xmm8 + subps %xmm1, %xmm10 +#endif + +#ifdef LN + movaps 0 * SIZE(AO), %xmm6 
+ pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm1 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm6 + + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm5 + + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm5 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm10 + + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm10 +#endif + +#ifdef RT + movaps 0 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm8 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, 0 * SIZE(B) + movlps %xmm5, 2 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + pshufd $0x55, %xmm1, %xmm3 + movaps %xmm2, 0 * SIZE(BO) + movaps %xmm3, 4 * SIZE(BO) + + pshufd $0x00, %xmm5, %xmm2 + pshufd $0x55, %xmm5, %xmm3 + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) +#else + movlps %xmm8, 0 * SIZE(AO) + movlps %xmm10, 2 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm10, %xmm1 + unpcklps %xmm11, %xmm5 + + movaps %xmm1, %xmm10 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm10 + + movlps %xmm1, 0 * SIZE(CO1) + movlps %xmm10, 0 * SIZE(CO1, LDC, 1) +#else + movlps %xmm8, 0 * SIZE(CO1) + movlps %xmm10, 0 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $ 4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L90: + testq $1, M + je .L99 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (AO, %rax, SIZE), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movss 0 * SIZE(AO), %xmm8 + movss 4 * SIZE(AO), %xmm10 + + movss 0 * SIZE(BO), %xmm9 + movss 16 * SIZE(BO), %xmm11 + movss 32 * SIZE(BO), %xmm13 + movss 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L95 + ALIGN_4 + +.L92: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movss 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss 1 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movss 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movss 64 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movss 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movss 3 * SIZE(AO), %xmm8 + addps %xmm11, %xmm1 + movss 24 * SIZE(BO), %xmm11 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movss 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movss 8 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movss 80 * SIZE(BO), %xmm11 
+ + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movss 36 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movss 5 * SIZE(AO), %xmm10 + addps %xmm13, %xmm1 + movss 40 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movss 44 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movss 6 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movss 96 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movss 52 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movss 7 * SIZE(AO), %xmm10 + addps %xmm15, %xmm1 + movss 56 * SIZE(BO), %xmm15 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movss 60 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movss 12 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movss 112 * SIZE(BO), %xmm15 + + addq $ 8 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L92 + ALIGN_4 + +.L95: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L98 + ALIGN_4 + +.L96: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movss 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss 1 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L96 + ALIGN_4 + +.L98: + addss %xmm2, %xmm0 + addss %xmm3, %xmm1 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm1, %xmm0 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(B), %xmm1 + subps %xmm0, %xmm1 +#else + movss 0 * SIZE(AO), %xmm8 + movss 1 * SIZE(AO), %xmm10 + subss %xmm0, %xmm8 + subss %xmm1, %xmm10 +#endif + +#if defined(LN) || defined(LT) + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulss %xmm2, %xmm8 + pshufd $0x55, %xmm0, %xmm2 + mulss %xmm8, %xmm2 + subss %xmm2, %xmm10 + + pshufd $0xff, %xmm0, %xmm2 + mulss %xmm2, %xmm10 +#endif + +#ifdef RT + movaps 0 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulss %xmm2, %xmm10 + pshufd $0xaa, %xmm0, %xmm2 + mulss %xmm10, %xmm2 + subss %xmm2, %xmm8 + pshufd $0x00, %xmm0, %xmm2 + mulss %xmm2, %xmm8 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, 0 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + pshufd $0x55, %xmm1, %xmm3 + movaps %xmm2, 0 * SIZE(BO) + movaps %xmm3, 4 * SIZE(BO) +#else + movss %xmm8, 0 * SIZE(AO) + movss %xmm10, 1 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm10, %xmm1 + unpcklps %xmm11, %xmm5 + + movaps %xmm1, %xmm10 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm10 + + movss %xmm1, 0 * SIZE(CO1) + movss %xmm10, 0 * SIZE(CO1, LDC, 1) +#else + movss %xmm8, 0 * SIZE(CO1) + movss %xmm10, 0 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (AO, %rax, SIZE), AO +#ifdef LT + addq $ 2 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L99: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + 
leaq (B, %rax, 2), B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L100: + testq $1, N + je .L999 + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + salq $BASE_SHIFT, %rax + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + jle .L103 + ALIGN_4 + +.L102: + movsd 0 * SIZE(B), %xmm3 + movhps 2 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm7 + movhps 6 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO + + decq %rax + jne .L102 + ALIGN_4 + +.L103: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax + BRANCH + jle .L110 + ALIGN_4 + +.L104: + movss 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + + movaps %xmm0, 0 * SIZE(BO) + + addq $ 1 * SIZE, B + addq $ 4 * SIZE, BO + decq %rax + jne .L104 + ALIGN_4 + +.L110: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + subq LDC, C +#endif + + movq C, CO1 # coffset1 = c +#ifndef RT + addq LDC, C +#endif + + movq M, I + sarq $3, I # i = (m >> 3) + jle .L120 + ALIGN_4 + +.L111: +#ifdef LN + movq K, %rax + salq $3 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 8), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + movaps 32 * SIZE(AO), %xmm12 + movaps 48 * SIZE(AO), %xmm14 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + + PREFETCHW 4 * SIZE(CO1) + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L115 + ALIGN_4 + +.L112: + mulps %xmm9, %xmm8 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + + mulps 4 * SIZE(AO), %xmm9 + addps %xmm8, %xmm0 + movaps 8 * SIZE(AO), %xmm8 + addps %xmm9, %xmm4 + movaps 4 * SIZE(BO), %xmm9 + + mulps %xmm9, %xmm8 + mulps 12 * SIZE(AO), %xmm9 + addps %xmm8, %xmm0 + movaps 64 * SIZE(AO), %xmm8 + addps %xmm9, %xmm4 + movaps 8 * SIZE(BO), %xmm9 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm9, %xmm10 + mulps 20 * SIZE(AO), %xmm9 + addps %xmm10, %xmm0 + movaps 24 * SIZE(AO), %xmm10 + addps %xmm9, %xmm4 + movaps 12 * SIZE(BO), %xmm9 + + mulps %xmm9, %xmm10 + mulps 28 * SIZE(AO), %xmm9 + addps %xmm10, %xmm0 + movaps 80 * SIZE(AO), %xmm10 + addps %xmm9, %xmm4 + movaps 32 * 
SIZE(BO), %xmm9 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) +#endif + mulps %xmm11, %xmm12 + mulps 36 * SIZE(AO), %xmm11 + addps %xmm12, %xmm0 + movaps 40 * SIZE(AO), %xmm12 + addps %xmm11, %xmm4 + movaps 20 * SIZE(BO), %xmm11 + + mulps %xmm11, %xmm12 + mulps 44 * SIZE(AO), %xmm11 + addps %xmm12, %xmm0 + movaps 96 * SIZE(AO), %xmm12 + addps %xmm11, %xmm4 + movaps 24 * SIZE(BO), %xmm11 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) +#endif + mulps %xmm11, %xmm14 + mulps 52 * SIZE(AO), %xmm11 + addps %xmm14, %xmm0 + movaps 56 * SIZE(AO), %xmm14 + addps %xmm11, %xmm4 + movaps 28 * SIZE(BO), %xmm11 + + mulps %xmm11, %xmm14 + mulps 60 * SIZE(AO), %xmm11 + addps %xmm14, %xmm0 + movaps 112 * SIZE(AO), %xmm14 + addps %xmm11, %xmm4 + movaps 48 * SIZE(BO), %xmm11 + + addq $64 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L112 + ALIGN_4 + +.L115: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulps %xmm9, %xmm8 + mulps 4 * SIZE(AO), %xmm9 + addps %xmm8, %xmm0 + movaps 8 * SIZE(AO), %xmm8 + addps %xmm9, %xmm4 + movaps 4 * SIZE(BO), %xmm9 + + addq $8 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L116 + ALIGN_4 + +.L118: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $8, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 8), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm8 + unpcklps %xmm2, %xmm0 + unpckhps %xmm2, %xmm8 + + movaps %xmm1, %xmm14 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm14 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movaps %xmm8, %xmm3 + unpcklps %xmm14, %xmm8 + unpckhps %xmm14, %xmm3 + + movaps %xmm4, %xmm9 + unpcklps %xmm6, %xmm4 + unpckhps %xmm6, %xmm9 + + movaps %xmm5, %xmm14 + unpcklps %xmm7, %xmm5 + unpckhps %xmm7, %xmm14 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps %xmm9, %xmm7 + unpcklps %xmm14, %xmm9 + unpckhps %xmm14, %xmm7 + + movss 0 * SIZE(B), %xmm1 + movss 1 * SIZE(B), %xmm5 + movss 2 * SIZE(B), %xmm10 + movss 3 * SIZE(B), %xmm11 + movss 4 * SIZE(B), %xmm12 + movss 5 * SIZE(B), %xmm13 + movss 6 * SIZE(B), %xmm14 + movss 7 * SIZE(B), %xmm15 + + subss %xmm0, %xmm1 + subss %xmm2, %xmm5 + subss %xmm8, %xmm10 + subss %xmm3, %xmm11 + subss %xmm4, %xmm12 + subss %xmm6, %xmm13 + subss %xmm9, %xmm14 + subss %xmm7, %xmm15 +#else + movaps 0 * SIZE(AO), %xmm8 + movaps 4 * SIZE(AO), %xmm9 + + subps %xmm0, %xmm8 + subps %xmm4, %xmm9 +#endif + +#ifdef LN + movaps 60 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm8, %xmm15 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm15, %xmm8 + subss %xmm8, %xmm14 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm15, %xmm8 + subss %xmm8, %xmm13 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm15, %xmm8 + subss %xmm8, %xmm12 + + movaps 56 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm15, %xmm8 + subss %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm15, %xmm8 + subss %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm15, %xmm8 + subss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm15, %xmm8 + subss %xmm8, %xmm1 + + movaps 52 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm8, %xmm14 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm14, %xmm8 + subss 
%xmm8, %xmm13 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm14, %xmm8 + subss %xmm8, %xmm12 + + movaps 48 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm14, %xmm8 + subss %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm14, %xmm8 + subss %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm14, %xmm8 + subss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm14, %xmm8 + subss %xmm8, %xmm1 + + movaps 44 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm8, %xmm13 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm13, %xmm8 + subss %xmm8, %xmm12 + + movaps 40 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm13, %xmm8 + subss %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm13, %xmm8 + subss %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm13, %xmm8 + subss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm13, %xmm8 + subss %xmm8, %xmm1 + + movaps 36 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm8, %xmm12 + + movaps 32 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm12, %xmm8 + subss %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm12, %xmm8 + subss %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm12, %xmm8 + subss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm12, %xmm8 + subss %xmm8, %xmm1 + + movaps 24 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm1 + + movaps 16 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm1 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm1 + + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm8, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm6 + + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm8, %xmm1 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm11 + + movaps 4 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm15 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm11 + + movaps 12 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm15 + + movaps 16 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm11 + + movaps 20 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulss %xmm10, %xmm8 + subss 
%xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm15 + + movaps 24 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm8, %xmm11 + + movaps 28 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm15 + + movaps 36 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm8, %xmm12 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm12, %xmm8 + subss %xmm8, %xmm13 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm12, %xmm8 + subss %xmm8, %xmm14 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm12, %xmm8 + subss %xmm8, %xmm15 + + movaps 44 * SIZE(AO), %xmm7 + pshufd $0x55, %xmm7, %xmm8 + mulss %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulss %xmm13, %xmm8 + subss %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulss %xmm13, %xmm8 + subss %xmm8, %xmm15 + + movaps 52 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm8, %xmm14 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm14, %xmm8 + subss %xmm8, %xmm15 + + movaps 60 * SIZE(AO), %xmm7 + pshufd $0xff, %xmm7, %xmm8 + mulss %xmm8, %xmm15 +#endif + +#if defined(RN) || defined(RT) + movss 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 + mulps %xmm2, %xmm9 +#endif + +#ifdef LN + subq $8 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(B) + movss %xmm5, 1 * SIZE(B) + movss %xmm10, 2 * SIZE(B) + movss %xmm11, 3 * SIZE(B) + movss %xmm12, 4 * SIZE(B) + movss %xmm13, 5 * SIZE(B) + movss %xmm14, 6 * SIZE(B) + movss %xmm15, 7 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + movaps %xmm2, 0 * SIZE(BO) + pshufd $0x00, %xmm5, %xmm2 + movaps %xmm2, 4 * SIZE(BO) + pshufd $0x00, %xmm10, %xmm2 + movaps %xmm2, 8 * SIZE(BO) + pshufd $0x00, %xmm11, %xmm2 + movaps %xmm2, 12 * SIZE(BO) + + pshufd $0x00, %xmm12, %xmm2 + movaps %xmm2, 16 * SIZE(BO) + pshufd $0x00, %xmm13, %xmm2 + movaps %xmm2, 20 * SIZE(BO) + pshufd $0x00, %xmm14, %xmm2 + movaps %xmm2, 24 * SIZE(BO) + pshufd $0x00, %xmm15, %xmm2 + movaps %xmm2, 28 * SIZE(BO) +#else + movaps %xmm8, 0 * SIZE(AO) + movaps %xmm9, 4 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm10, %xmm1 + unpcklps %xmm11, %xmm5 + unpcklps %xmm5, %xmm1 + + unpcklps %xmm14, %xmm12 + unpcklps %xmm15, %xmm13 + unpcklps %xmm13, %xmm12 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm12, 4 * SIZE(CO1) + movhps %xmm12, 6 * SIZE(CO1) +#else + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movlps %xmm9, 4 * SIZE(CO1) + movhps %xmm9, 6 * SIZE(CO1) +#endif + +#ifndef LN + addq $8 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 8), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $8, KK + movq BORIG, B +#endif + +#ifdef LT + addq $8, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $3 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L111 + ALIGN_4 + +.L120: + testq $4, M + je .L130 + +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $BASE_SHIFT, 
%rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L125 + ALIGN_4 + +.L122: + mulps %xmm8, %xmm9 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps 4 * SIZE(AO), %xmm8 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 32 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 8 * SIZE(AO), %xmm8 + mulps 8 * SIZE(BO), %xmm8 + addps %xmm8, %xmm2 + movaps 12 * SIZE(AO), %xmm8 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm8, %xmm3 + movaps 32 * SIZE(AO), %xmm8 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm10, %xmm11 + movaps 20 * SIZE(AO), %xmm10 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm11, %xmm0 + movaps 48 * SIZE(BO), %xmm11 + addps %xmm10, %xmm1 + movaps 24 * SIZE(AO), %xmm10 + mulps 24 * SIZE(BO), %xmm10 + addps %xmm10, %xmm2 + movaps 28 * SIZE(AO), %xmm10 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm10, %xmm3 + movaps 48 * SIZE(AO), %xmm10 + + addq $32 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L122 + ALIGN_4 + +.L125: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L128 + ALIGN_4 + +.L126: + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + + addq $4 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L126 + ALIGN_4 + +.L128: + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + addps %xmm2, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm8 + unpcklps %xmm2, %xmm0 + unpckhps %xmm2, %xmm8 + + movaps %xmm1, %xmm14 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm14 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movaps %xmm8, %xmm3 + unpcklps %xmm14, %xmm8 + unpckhps %xmm14, %xmm3 + + movss 0 * SIZE(B), %xmm1 + movss 1 * SIZE(B), %xmm5 + movss 2 * SIZE(B), %xmm10 + movss 3 * SIZE(B), %xmm11 + + subss %xmm0, %xmm1 + subss %xmm2, %xmm5 + subss %xmm8, %xmm10 + subss %xmm3, %xmm11 +#else + movaps 0 * SIZE(AO), %xmm8 + + subps %xmm0, %xmm8 +#endif + +#ifdef LN + movaps 12 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm1 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm1 + + movaps 4 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm1 + + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm8, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm6 + + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm8, %xmm1 + pshufd 
$0x55, %xmm6, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm11 + + movaps 4 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm11 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm11 + + movaps 12 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm8, %xmm11 +#endif + +#if defined(RN) || defined(RT) + movss 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(B) + movss %xmm5, 1 * SIZE(B) + movss %xmm10, 2 * SIZE(B) + movss %xmm11, 3 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + movaps %xmm2, 0 * SIZE(BO) + pshufd $0x00, %xmm5, %xmm2 + movaps %xmm2, 4 * SIZE(BO) + pshufd $0x00, %xmm10, %xmm2 + movaps %xmm2, 8 * SIZE(BO) + pshufd $0x00, %xmm11, %xmm2 + movaps %xmm2, 12 * SIZE(BO) +#else + movaps %xmm8, 0 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm10, %xmm1 + unpcklps %xmm11, %xmm5 + unpcklps %xmm5, %xmm1 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) +#else + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L130: + testq $2, M + je .L140 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 8 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L135 + ALIGN_4 + +.L132: + mulps %xmm8, %xmm9 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + movsd 6 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movaps 12 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + movsd 16 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movaps 32 * SIZE(BO), %xmm9 + + mulps %xmm10, %xmm11 + movsd 10 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm11 + movsd 12 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm11 + movsd 14 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movaps 28 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm11 + movsd 24 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movaps 48 * 
SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L132 + ALIGN_4 + +.L135: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L138 + ALIGN_4 + +.L136: + mulps %xmm8, %xmm9 + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L136 + ALIGN_4 + +.L138: + addps %xmm1, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm2, %xmm0 + unpcklps %xmm3, %xmm1 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movss 0 * SIZE(B), %xmm1 + movss 1 * SIZE(B), %xmm5 + + subss %xmm0, %xmm1 + subss %xmm2, %xmm5 +#else +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd 0 * SIZE(AO), %xmm8 + + subps %xmm0, %xmm8 +#endif + +#ifdef LN + movaps 0 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm1 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm8, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm8, %xmm1 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm5 + + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm8, %xmm5 +#endif + +#if defined(RN) || defined(RT) + movss 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(B) + movss %xmm5, 1 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + movaps %xmm2, 0 * SIZE(BO) + pshufd $0x00, %xmm5, %xmm2 + movaps %xmm2, 4 * SIZE(BO) +#else + movlps %xmm8, 0 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm10, %xmm1 + unpcklps %xmm11, %xmm5 + unpcklps %xmm5, %xmm1 + + movlps %xmm1, 0 * SIZE(CO1) +#else + movlps %xmm8, 0 * SIZE(CO1) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $2 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L140: + testq $1, M + je .L149 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (AO, %rax, SIZE), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movss 0 * SIZE(AO), %xmm8 + movss 4 * SIZE(AO), %xmm10 + + movss 0 * SIZE(BO), %xmm9 + movss 16 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L145 + ALIGN_4 + +.L142: + mulss %xmm8, %xmm9 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movss 1 * SIZE(AO), %xmm8 + mulss 4 * SIZE(BO), %xmm8 + addss %xmm9, %xmm0 + movss 32 * SIZE(BO), %xmm9 + addss %xmm8, %xmm1 + movss 2 * SIZE(AO), %xmm8 + 
mulss 8 * SIZE(BO), %xmm8 + addss %xmm8, %xmm2 + movss 3 * SIZE(AO), %xmm8 + mulss 12 * SIZE(BO), %xmm8 + addss %xmm8, %xmm3 + movss 8 * SIZE(AO), %xmm8 + mulss %xmm10, %xmm11 + movss 5 * SIZE(AO), %xmm10 + mulss 20 * SIZE(BO), %xmm10 + addss %xmm11, %xmm0 + movss 48 * SIZE(BO), %xmm11 + addss %xmm10, %xmm1 + movss 6 * SIZE(AO), %xmm10 + mulss 24 * SIZE(BO), %xmm10 + addss %xmm10, %xmm2 + movss 7 * SIZE(AO), %xmm10 + mulss 28 * SIZE(BO), %xmm10 + addss %xmm10, %xmm3 + movss 12 * SIZE(AO), %xmm10 + + addq $ 8 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L142 + ALIGN_4 + +.L145: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L148 + ALIGN_4 + +.L146: + mulss %xmm8, %xmm9 + movss 1 * SIZE(AO), %xmm8 + addss %xmm9, %xmm0 + movss 4 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + decq %rax + jg .L146 + ALIGN_4 + +.L148: + addss %xmm1, %xmm0 + addss %xmm3, %xmm2 + addss %xmm2, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax + subq $1, %rax + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movss 0 * SIZE(B), %xmm1 + subss %xmm0, %xmm1 +#else + movss 0 * SIZE(AO), %xmm8 + subps %xmm0, %xmm8 +#endif + +#if defined(LN) || defined(LT) + mulss 0 * SIZE(AO), %xmm1 +#endif + +#if defined(RN) || defined(RT) + mulss 0 * SIZE(B), %xmm8 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + movaps %xmm2, 0 * SIZE(BO) +#else + movss %xmm8, 0 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(CO1) +#else + movss %xmm8, 0 * SIZE(CO1) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $1 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L149: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 1), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 1), B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L999: + movq %rbx, %rsp + EMMS + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/trsm_kernel_RT_2x8_nehalem.S b/kernel/x86_64/trsm_kernel_RT_2x8_nehalem.S new file mode 100644 index 0000000000..8c7f92fbda --- /dev/null +++ b/kernel/x86_64/trsm_kernel_RT_2x8_nehalem.S @@ -0,0 +1,3077 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define KK %rdx +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define J 56(%rsp) +#define KKK 64(%rsp) +#define AORIG 72(%rsp) + +#else + +#define STACKSIZE 512 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define J 232(%rsp) +#define KKK 240(%rsp) +#define AORIG 248(%rsp) + +#endif + +#define PREFETCHSIZE (8 * 1 - 4) +#define PREFETCH prefetcht0 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movaps %xmm3, %xmm0 +#endif + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + leaq (, LDC, SIZE), LDC + + 
movq KK, OFFSET + negq KK + +#ifdef LN + leaq (, M, SIZE), %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + leaq (, N, SIZE), %rax + imulq K, %rax + addq %rax, B + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, N + jle .L30 + ALIGN_4 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + + movq C, CO1 +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $1, I + NOBRANCH + jle .L80 + ALIGN_4 + +.L71: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + + xorps %xmm8, %xmm8 + prefetcht0 2 * SIZE(CO1) + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L75 + ALIGN_3 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm1, %xmm8 + movddup -16 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm9 + movddup -15 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movaps -12 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm8 + movddup -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movaps -10 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm9 + movddup -13 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movaps -8 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, AO + subq $-4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L72 + + addpd %xmm9, %xmm8 + ALIGN_3 + +.L75: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_3 + +.L76: + addpd %xmm1, %xmm8 + movddup -16 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L76 + ALIGN_4 + +.L78: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + + addpd %xmm1, %xmm8 + +#if defined(LN) || defined(LT) + movaps -16 * SIZE(BO), %xmm0 +#else + movaps -16 * SIZE(AO), %xmm0 +#endif + + subpd %xmm8, %xmm0 + +#if defined(LN) || defined(LT) + pshufd $0xe, %xmm0, %xmm1 +#endif + +#ifdef LN + movsd -13 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm1 + movsd -14 * SIZE(AO), %xmm12 + mulsd %xmm1, %xmm12 + subsd %xmm12, %xmm0 + movsd -16 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm0 +#endif + +#ifdef LT + movsd -16 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm0 + movsd -15 * SIZE(AO), %xmm12 + mulsd %xmm0, %xmm12 + subsd %xmm12, %xmm1 + movsd -13 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm1 +#endif + +#if defined(LN) || defined(LT) + unpcklpd %xmm1, %xmm0 +#endif + +#if defined(RN) || defined(RT) + movddup -16 * SIZE(BO), %xmm10 + mulpd %xmm10, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movaps %xmm0, -16 * SIZE(BO) +#else + movaps %xmm0, -16 * SIZE(AO) +#endif + +#ifndef LN 
+ addq $2 * SIZE, CO1 +#endif + + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L71 + ALIGN_4 + +.L80: + testq $1, M + BRANCH + jle .L89 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movsd -16 * SIZE(AO), %xmm0 + movhps -15 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movsd -16 * SIZE(BO), %xmm1 + movhps -15 * SIZE(BO), %xmm1 + xorps %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L85 + ALIGN_3 + +.L82: + mulpd %xmm0, %xmm1 + movsd -14 * SIZE(AO), %xmm0 + movhps -13 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm8 + movsd -14 * SIZE(BO), %xmm1 + movhps -13 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + movsd -12 * SIZE(AO), %xmm0 + movhps -11 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movsd -12 * SIZE(BO), %xmm1 + movhps -11 * SIZE(BO), %xmm1 + + subq $-4 * SIZE, AO + subq $-4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L82 + + addpd %xmm9, %xmm8 + ALIGN_3 + +.L85: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L88 + ALIGN_3 + +.L86: + mulsd %xmm0, %xmm1 + movsd -15 * SIZE(AO), %xmm0 + addsd %xmm1, %xmm8 + movsd -15 * SIZE(BO), %xmm1 + + addq $1 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L86 + ALIGN_4 + +.L88: +#if defined(LN) || defined(RT) + movq KK, %rax + subq $1, %rax + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + + haddpd %xmm8, %xmm8 + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(BO), %xmm0 +#else + movsd -16 * SIZE(AO), %xmm0 +#endif + + subsd %xmm8, %xmm0 + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm0 +#endif + +#if defined(RN) || defined(RT) + movsd -16 * SIZE(BO), %xmm10 + mulsd %xmm10, %xmm0 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movsd %xmm0, -16 * SIZE(BO) +#else + movsd %xmm0, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L89: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 1), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L30: + testq $2, N + jle .L50 + ALIGN_4 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq 
$1, I + NOBRANCH + jle .L60 + ALIGN_4 + +.L51: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + + xorps %xmm8, %xmm8 + prefetcht0 2 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht0 2 * SIZE(CO2) + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L55 + ALIGN_3 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm10 + movaps -14 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm11 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm10 + movaps -10 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm11 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, AO + subq $-8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L52 + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + ALIGN_3 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_3 + +.L56: + addpd %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L56 + ALIGN_4 + +.L58: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + + addpd %xmm1, %xmm8 + addpd %xmm2, %xmm9 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm0 + shufpd $0, %xmm9, %xmm8 + shufpd $3, %xmm0, %xmm9 + + movaps -16 * SIZE(BO), %xmm0 + movaps -14 * SIZE(BO), %xmm1 +#else + movaps %xmm8, %xmm0 + shufpd $2, %xmm9, %xmm8 + shufpd $2, %xmm0, %xmm9 + + movaps -16 * SIZE(AO), %xmm0 + movaps -14 * SIZE(AO), %xmm1 +#endif + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm1 + +#ifdef LN + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + movddup -14 * SIZE(AO), %xmm12 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm0 + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm0 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm0 + movddup -15 * SIZE(AO), %xmm12 + mulpd %xmm0, %xmm12 + subpd %xmm12, %xmm1 + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm10 + mulpd %xmm10, %xmm0 + movddup -15 * SIZE(BO), %xmm11 + mulpd %xmm0, %xmm11 + subpd %xmm11, %xmm1 + + movddup -13 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm1 +#endif + +#ifdef RT + movddup -13 * SIZE(BO), %xmm14 + mulpd %xmm14, %xmm1 + movddup -14 * SIZE(BO), %xmm15 + mulpd %xmm1, %xmm15 + subpd %xmm15, %xmm0 + + movddup -16 * SIZE(BO), %xmm15 + mulpd %xmm15, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || 
defined(LT) + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 1 * SIZE(CO1) + movhps %xmm0, 0 * SIZE(CO2) + movhps %xmm1, 1 * SIZE(CO2) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO2) + movhps %xmm1, 1 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, -16 * SIZE(BO) + movaps %xmm1, -14 * SIZE(BO) +#else + movaps %xmm0, -16 * SIZE(AO) + movaps %xmm1, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L51 + ALIGN_4 + +.L60: + testq $1, M + BRANCH + jle .L69 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + xorps %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L65 + ALIGN_3 + +.L62: + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + movddup -14 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movaps -12 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + movddup -13 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm8 + movaps -10 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + movddup -12 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movaps -8 * SIZE(BO), %xmm1 + + subq $-4 * SIZE, AO + subq $-8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L62 + ALIGN_3 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_3 + +.L66: + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + + addq $1 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L66 + ALIGN_4 + +.L68: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + + addpd %xmm9, %xmm8 + +#if defined(LN) || defined(LT) + movaps -16 * SIZE(BO), %xmm0 +#else + movaps -16 * SIZE(AO), %xmm0 +#endif + + subpd %xmm8, %xmm0 + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm0 +#endif + +#if defined(RN) || defined(RT) + pshufd $0xe, %xmm0, %xmm1 +#endif + +#ifdef RN + movsd -16 * SIZE(BO), %xmm10 + mulsd %xmm10, %xmm0 + movsd -15 * SIZE(BO), %xmm11 + mulsd %xmm0, %xmm11 + subsd %xmm11, %xmm1 + + movsd -13 * SIZE(BO), %xmm11 + mulsd %xmm11, %xmm1 +#endif + +#ifdef RT + movsd -13 * SIZE(BO), %xmm14 + mulsd %xmm14, %xmm1 + movsd -14 * SIZE(BO), %xmm15 + mulsd %xmm1, %xmm15 + subsd %xmm15, %xmm0 + + movsd -16 * SIZE(BO), %xmm15 + mulsd %xmm15, %xmm0 +#endif + +#if defined(RN) || defined(RT) + unpcklpd %xmm1, %xmm0 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 0 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movaps %xmm0, -16 * SIZE(BO) +#else + movaps %xmm0, -16 * SIZE(AO) +#endif + 
+#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L69: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L50: + testq $4, N + jle .L70 + ALIGN_4 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 2), CO2 +#ifndef RT + leaq (C, LDC, 4), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $1, I + NOBRANCH + jle .L40 + ALIGN_4 + +.L31: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht0 2 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht0 2 * SIZE(CO1, LDC, 1) + xorps %xmm10, %xmm10 + prefetcht0 2 * SIZE(CO2) + xorps %xmm11, %xmm11 + prefetcht0 2 * SIZE(CO2, LDC, 1) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L35 + ALIGN_3 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -10 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -12 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm8 + movaps -8 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -6 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -10 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm8 + movaps -4 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -2 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -8 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, AO + subq $-16 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L32 + ALIGN_3 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_3 + +.L36: + addpd %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + 
mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + + addpd %xmm1, %xmm8 + addpd %xmm2, %xmm9 + addpd %xmm3, %xmm10 + addpd %xmm4, %xmm11 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm0 + shufpd $0, %xmm9, %xmm8 + shufpd $3, %xmm0, %xmm9 + + movaps %xmm10, %xmm0 + shufpd $0, %xmm11, %xmm10 + shufpd $3, %xmm0, %xmm11 + + movaps -16 * SIZE(BO), %xmm0 + movaps -14 * SIZE(BO), %xmm2 + movaps -12 * SIZE(BO), %xmm1 + movaps -10 * SIZE(BO), %xmm3 +#else + movaps %xmm8, %xmm0 + shufpd $2, %xmm9, %xmm8 + shufpd $2, %xmm0, %xmm9 + + movaps %xmm10, %xmm0 + shufpd $2, %xmm11, %xmm10 + shufpd $2, %xmm0, %xmm11 + + movaps -16 * SIZE(AO), %xmm0 + movaps -14 * SIZE(AO), %xmm1 + movaps -12 * SIZE(AO), %xmm2 + movaps -10 * SIZE(AO), %xmm3 +#endif + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm1 + subpd %xmm10, %xmm2 + subpd %xmm11, %xmm3 + +#ifdef LN + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 + + movddup -14 * SIZE(AO), %xmm12 + movaps %xmm12, %xmm13 + + mulpd %xmm1, %xmm12 + mulpd %xmm3, %xmm13 + + subpd %xmm12, %xmm0 + subpd %xmm13, %xmm2 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm2 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm2 + + movddup -15 * SIZE(AO), %xmm12 + movaps %xmm12, %xmm13 + + mulpd %xmm0, %xmm12 + mulpd %xmm2, %xmm13 + + subpd %xmm12, %xmm1 + subpd %xmm13, %xmm3 + + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm1 + movddup -14 * SIZE(BO), %xmm10 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm2 + movddup -13 * SIZE(BO), %xmm11 + mulpd %xmm0, %xmm11 + subpd %xmm11, %xmm3 + + movddup -11 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm1 + movddup -10 * SIZE(BO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm2 + movddup -9 * SIZE(BO), %xmm11 + mulpd %xmm1, %xmm11 + subpd %xmm11, %xmm3 + + movddup -6 * SIZE(BO), %xmm10 + mulpd %xmm10, %xmm2 + movddup -5 * SIZE(BO), %xmm11 + mulpd %xmm2, %xmm11 + subpd %xmm11, %xmm3 + + movddup -1 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm3 +#endif + +#ifdef RT + movddup -1 * SIZE(BO), %xmm12 + mulpd %xmm12, %xmm3 + movddup -2 * SIZE(BO), %xmm13 + mulpd %xmm3, %xmm13 + subpd %xmm13, %xmm2 + movddup -3 * SIZE(BO), %xmm14 + mulpd %xmm3, %xmm14 + subpd %xmm14, %xmm1 + movddup -4 * SIZE(BO), %xmm15 + mulpd %xmm3, %xmm15 + subpd %xmm15, %xmm0 + + movddup -6 * SIZE(BO), %xmm13 + mulpd %xmm13, %xmm2 + movddup -7 * SIZE(BO), %xmm14 + mulpd %xmm2, %xmm14 + subpd %xmm14, %xmm1 + movddup -8 * SIZE(BO), %xmm15 + mulpd %xmm2, %xmm15 + subpd %xmm15, %xmm0 + + movddup -11 * SIZE(BO), %xmm14 + mulpd %xmm14, %xmm1 + movddup -12 * SIZE(BO), %xmm15 + mulpd %xmm1, %xmm15 + subpd %xmm15, %xmm0 + + movddup -16 * SIZE(BO), %xmm15 + mulpd %xmm15, %xmm0 +#endif + + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + + leaq (LDC, LDC, 2), %rax + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 1 * SIZE(CO1) + 
movhps %xmm0, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 1 * SIZE(CO1, LDC, 1) + + movsd %xmm2, 0 * SIZE(CO2) + movsd %xmm3, 1 * SIZE(CO2) + movhps %xmm2, 0 * SIZE(CO2, LDC, 1) + movhps %xmm3, 1 * SIZE(CO2, LDC, 1) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 1 * SIZE(CO1, LDC, 1) + + movsd %xmm2, 0 * SIZE(CO2) + movhps %xmm2, 1 * SIZE(CO2) + movsd %xmm3, 0 * SIZE(CO2, LDC, 1) + movhps %xmm3, 1 * SIZE(CO2, LDC, 1) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, -16 * SIZE(BO) + movaps %xmm2, -14 * SIZE(BO) + movaps %xmm1, -12 * SIZE(BO) + movaps %xmm3, -10 * SIZE(BO) +#else + movaps %xmm0, -16 * SIZE(AO) + movaps %xmm1, -14 * SIZE(AO) + movaps %xmm2, -12 * SIZE(AO) + movaps %xmm3, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L31 + ALIGN_4 + +.L40: + testq $1, M + BRANCH + jle .L49 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L45 + ALIGN_3 + +.L42: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movaps -12 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps -10 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -14 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps -8 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -6 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -13 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movaps -4 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps -2 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -12 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps 0 * SIZE(BO), %xmm1 + + subq $ -4 * SIZE, AO + subq $-16 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L42 + ALIGN_3 + +.L45: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movaps -12 * SIZE(BO), %xmm1 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L46 + ALIGN_4 + +.L48: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#endif + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#if defined(LN) || defined(LT) + movaps -16 * SIZE(BO), %xmm0 + movaps -14 * SIZE(BO), %xmm1 +#else + movaps -16 * SIZE(AO), %xmm0 + movaps -14 * SIZE(AO), %xmm1 +#endif + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm1 + 
+#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 +#endif + +#if defined(RN) || defined(RT) + pshufd $0xe, %xmm1, %xmm3 + movaps %xmm1, %xmm2 + pshufd $0xe, %xmm0, %xmm1 +#endif + +#ifdef RN + movsd -16 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm0 + movsd -15 * SIZE(BO), %xmm9 + mulsd %xmm0, %xmm9 + subsd %xmm9, %xmm1 + movsd -14 * SIZE(BO), %xmm10 + mulsd %xmm0, %xmm10 + subsd %xmm10, %xmm2 + movsd -13 * SIZE(BO), %xmm11 + mulsd %xmm0, %xmm11 + subsd %xmm11, %xmm3 + + movsd -11 * SIZE(BO), %xmm9 + mulsd %xmm9, %xmm1 + movsd -10 * SIZE(BO), %xmm10 + mulsd %xmm1, %xmm10 + subsd %xmm10, %xmm2 + movsd -9 * SIZE(BO), %xmm11 + mulsd %xmm1, %xmm11 + subsd %xmm11, %xmm3 + + movsd -6 * SIZE(BO), %xmm10 + mulsd %xmm10, %xmm2 + movsd -5 * SIZE(BO), %xmm11 + mulsd %xmm2, %xmm11 + subsd %xmm11, %xmm3 + + movsd -1 * SIZE(BO), %xmm11 + mulsd %xmm11, %xmm3 +#endif + +#ifdef RT + movsd -1 * SIZE(BO), %xmm12 + mulsd %xmm12, %xmm3 + movsd -2 * SIZE(BO), %xmm13 + mulsd %xmm3, %xmm13 + subsd %xmm13, %xmm2 + movsd -3 * SIZE(BO), %xmm14 + mulsd %xmm3, %xmm14 + subsd %xmm14, %xmm1 + movsd -4 * SIZE(BO), %xmm15 + mulsd %xmm3, %xmm15 + subsd %xmm15, %xmm0 + + movsd -6 * SIZE(BO), %xmm13 + mulsd %xmm13, %xmm2 + movsd -7 * SIZE(BO), %xmm14 + mulsd %xmm2, %xmm14 + subsd %xmm14, %xmm1 + movsd -8 * SIZE(BO), %xmm15 + mulsd %xmm2, %xmm15 + subsd %xmm15, %xmm0 + + movsd -11 * SIZE(BO), %xmm14 + mulsd %xmm14, %xmm1 + movsd -12 * SIZE(BO), %xmm15 + mulsd %xmm1, %xmm15 + subsd %xmm15, %xmm0 + + movsd -16 * SIZE(BO), %xmm15 + mulsd %xmm15, %xmm0 +#endif + +#if defined(RN) || defined(RT) + unpcklpd %xmm1, %xmm0 + movaps %xmm2, %xmm1 + unpcklpd %xmm3, %xmm1 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 0 * SIZE(CO1, LDC, 1) + movsd %xmm1, 0 * SIZE(CO2) + movhps %xmm1, 0 * SIZE(CO2, LDC, 1) + +#if defined(LN) || defined(LT) + movaps %xmm0, -16 * SIZE(BO) + movaps %xmm1, -14 * SIZE(BO) +#else + movaps %xmm0, -16 * SIZE(AO) + movaps %xmm1, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L49: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + ALIGN_4 + +.L70: + movq N, J + sarq $3, J + NOBRANCH + jle .L999 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $3 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 8), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 4), CO2 +#ifndef RT + leaq (C, LDC, 8), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq K, %rax + salq $BASE_SHIFT + 3, %rax + leaq (B, %rax), BB + + movq M, I + sarq $1, I + NOBRANCH + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 8), BO +#else + movq B, BO +#endif + + prefetcht0 
-16 * SIZE(BB) + subq $-8 * SIZE, BB + + xorps %xmm1, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + leaq (LDC, LDC, 2), %rax + + xorps %xmm8, %xmm8 + prefetcht0 1 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht0 2 * SIZE(CO1, LDC, 1) + xorps %xmm10, %xmm10 + prefetcht0 1 * SIZE(CO1, LDC, 2) + xorps %xmm11, %xmm11 + prefetcht0 2 * SIZE(CO1, %rax, 1) + + xorps %xmm12, %xmm12 + prefetcht0 1 * SIZE(CO2) + xorps %xmm13, %xmm13 + prefetcht0 2 * SIZE(CO2, LDC, 1) + xorps %xmm14, %xmm14 + prefetcht0 1 * SIZE(CO2, LDC, 2) + xorps %xmm15, %xmm15 + prefetcht0 2 * SIZE(CO2, %rax, 1) + + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm1, %xmm12 + movaps -16 * SIZE(BO), %xmm6 + addpd %xmm2, %xmm13 + pshufd $0x4e, %xmm6, %xmm2 + mulpd %xmm0, %xmm6 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm14 + movaps -14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + addpd %xmm6, %xmm8 + movaps -12 * SIZE(BO), %xmm6 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm6, %xmm2 + mulpd %xmm0, %xmm6 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -10 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + addpd %xmm6, %xmm12 + movaps -8 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm13 + movaps -14 * SIZE(AO), %xmm5 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm5, %xmm1 + mulpd %xmm5, %xmm2 + + addpd %xmm3, %xmm14 + movaps -6 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm5, %xmm3 + mulpd %xmm5, %xmm4 + + addpd %xmm1, %xmm8 + movaps -4 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm5, %xmm1 + mulpd %xmm5, %xmm2 + + addpd %xmm3, %xmm10 + movaps -2 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm5, %xmm3 + mulpd %xmm5, %xmm4 + + addpd %xmm1, %xmm12 + movaps 0 * SIZE(BO), %xmm6 + addpd %xmm2, %xmm13 + pshufd $0x4e, %xmm6, %xmm2 + mulpd %xmm0, %xmm6 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm14 + movaps 2 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + addpd %xmm6, %xmm8 + movaps 4 * SIZE(BO), %xmm6 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm6, %xmm2 + mulpd %xmm0, %xmm6 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps 6 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + addpd %xmm6, %xmm12 + movaps 8 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm13 + movaps -10 * SIZE(AO), %xmm5 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm5, %xmm1 + mulpd %xmm5, %xmm2 + + addpd %xmm3, %xmm14 + movaps 10 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm5, %xmm3 + mulpd %xmm5, %xmm4 + + addpd %xmm1, %xmm8 + movaps 12 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm5, %xmm1 + mulpd %xmm5, %xmm2 + + addpd %xmm3, %xmm10 + movaps 14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm5, %xmm3 + mulpd %xmm5, %xmm4 + + addq $32 * SIZE, BO + subq $-8 * SIZE, AO + decq %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + 
addpd %xmm1, %xmm12 + movaps -16 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm13 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm14 + movaps -14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + addpd %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -10 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $8, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 8), BO +#endif + + addpd %xmm1, %xmm12 + addpd %xmm2, %xmm13 + addpd %xmm3, %xmm14 + addpd %xmm4, %xmm15 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm0 + shufpd $0, %xmm9, %xmm8 + shufpd $3, %xmm0, %xmm9 + + movaps %xmm10, %xmm0 + shufpd $0, %xmm11, %xmm10 + shufpd $3, %xmm0, %xmm11 + + movaps %xmm12, %xmm0 + shufpd $0, %xmm13, %xmm12 + shufpd $3, %xmm0, %xmm13 + + movaps %xmm14, %xmm0 + shufpd $0, %xmm15, %xmm14 + shufpd $3, %xmm0, %xmm15 + + movaps -16 * SIZE(BO), %xmm0 + movaps -14 * SIZE(BO), %xmm2 + movaps -12 * SIZE(BO), %xmm4 + movaps -10 * SIZE(BO), %xmm6 + movaps -8 * SIZE(BO), %xmm1 + movaps -6 * SIZE(BO), %xmm3 + movaps -4 * SIZE(BO), %xmm5 + movaps -2 * SIZE(BO), %xmm7 +#else + movaps %xmm8, %xmm0 + shufpd $2, %xmm9, %xmm8 + shufpd $2, %xmm0, %xmm9 + + movaps %xmm10, %xmm0 + shufpd $2, %xmm11, %xmm10 + shufpd $2, %xmm0, %xmm11 + + movaps %xmm12, %xmm0 + shufpd $2, %xmm13, %xmm12 + shufpd $2, %xmm0, %xmm13 + + movaps %xmm14, %xmm0 + shufpd $2, %xmm15, %xmm14 + shufpd $2, %xmm0, %xmm15 + + movaps -16 * SIZE(AO), %xmm0 + movaps -14 * SIZE(AO), %xmm1 + movaps -12 * SIZE(AO), %xmm2 + movaps -10 * SIZE(AO), %xmm3 + + movaps -8 * SIZE(AO), %xmm4 + movaps -6 * SIZE(AO), %xmm5 + movaps -4 * SIZE(AO), %xmm6 + movaps -2 * SIZE(AO), %xmm7 +#endif + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm1 + subpd %xmm10, %xmm2 + subpd %xmm11, %xmm3 + subpd %xmm12, %xmm4 + subpd %xmm13, %xmm5 + subpd %xmm14, %xmm6 + subpd %xmm15, %xmm7 + +#ifdef LN + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 + mulpd %xmm8, %xmm5 + mulpd %xmm8, %xmm7 + + movddup -14 * SIZE(AO), %xmm12 + movaps %xmm12, %xmm13 + movaps %xmm12, %xmm14 + movaps %xmm12, %xmm15 + + mulpd %xmm1, %xmm12 + mulpd %xmm3, %xmm13 + mulpd %xmm5, %xmm14 + mulpd %xmm7, %xmm15 + + subpd %xmm12, %xmm0 + subpd %xmm13, %xmm2 + subpd %xmm14, %xmm4 + subpd %xmm15, %xmm6 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm4 + mulpd %xmm8, %xmm6 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm4 + mulpd %xmm8, %xmm6 + + movddup -15 * SIZE(AO), %xmm12 + movaps %xmm12, %xmm13 + movaps %xmm12, %xmm14 + movaps %xmm12, %xmm15 + + mulpd %xmm0, %xmm12 + mulpd %xmm2, %xmm13 + mulpd %xmm4, %xmm14 + mulpd %xmm6, %xmm15 + + subpd %xmm12, %xmm1 + subpd %xmm13, %xmm3 + subpd %xmm14, %xmm5 + subpd %xmm15, %xmm7 + + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 + mulpd %xmm8, %xmm5 + mulpd %xmm8, %xmm7 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + movddup -15 * SIZE(BO), %xmm9 + 
mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm1 + movddup -14 * SIZE(BO), %xmm10 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm2 + movddup -13 * SIZE(BO), %xmm11 + mulpd %xmm0, %xmm11 + subpd %xmm11, %xmm3 + movddup -12 * SIZE(BO), %xmm12 + mulpd %xmm0, %xmm12 + subpd %xmm12, %xmm4 + movddup -11 * SIZE(BO), %xmm13 + mulpd %xmm0, %xmm13 + subpd %xmm13, %xmm5 + movddup -10 * SIZE(BO), %xmm14 + mulpd %xmm0, %xmm14 + subpd %xmm14, %xmm6 + movddup -9 * SIZE(BO), %xmm15 + mulpd %xmm0, %xmm15 + subpd %xmm15, %xmm7 + + movddup -7 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm1 + movddup -6 * SIZE(BO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm2 + movddup -5 * SIZE(BO), %xmm11 + mulpd %xmm1, %xmm11 + subpd %xmm11, %xmm3 + movddup -4 * SIZE(BO), %xmm12 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm4 + movddup -3 * SIZE(BO), %xmm13 + mulpd %xmm1, %xmm13 + subpd %xmm13, %xmm5 + movddup -2 * SIZE(BO), %xmm14 + mulpd %xmm1, %xmm14 + subpd %xmm14, %xmm6 + movddup -1 * SIZE(BO), %xmm15 + mulpd %xmm1, %xmm15 + subpd %xmm15, %xmm7 + + movddup 2 * SIZE(BO), %xmm10 + mulpd %xmm10, %xmm2 + movddup 3 * SIZE(BO), %xmm11 + mulpd %xmm2, %xmm11 + subpd %xmm11, %xmm3 + movddup 4 * SIZE(BO), %xmm12 + mulpd %xmm2, %xmm12 + subpd %xmm12, %xmm4 + movddup 5 * SIZE(BO), %xmm13 + mulpd %xmm2, %xmm13 + subpd %xmm13, %xmm5 + movddup 6 * SIZE(BO), %xmm14 + mulpd %xmm2, %xmm14 + subpd %xmm14, %xmm6 + movddup 7 * SIZE(BO), %xmm15 + mulpd %xmm2, %xmm15 + subpd %xmm15, %xmm7 + + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm12 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm4 + movddup 13 * SIZE(BO), %xmm13 + mulpd %xmm3, %xmm13 + subpd %xmm13, %xmm5 + movddup 14 * SIZE(BO), %xmm14 + mulpd %xmm3, %xmm14 + subpd %xmm14, %xmm6 + movddup 15 * SIZE(BO), %xmm15 + mulpd %xmm3, %xmm15 + subpd %xmm15, %xmm7 + + movddup 20 * SIZE(BO), %xmm12 + mulpd %xmm12, %xmm4 + movddup 21 * SIZE(BO), %xmm13 + mulpd %xmm4, %xmm13 + subpd %xmm13, %xmm5 + movddup 22 * SIZE(BO), %xmm14 + mulpd %xmm4, %xmm14 + subpd %xmm14, %xmm6 + movddup 23 * SIZE(BO), %xmm15 + mulpd %xmm4, %xmm15 + subpd %xmm15, %xmm7 + + movddup 29 * SIZE(BO), %xmm13 + mulpd %xmm13, %xmm5 + movddup 30 * SIZE(BO), %xmm14 + mulpd %xmm5, %xmm14 + subpd %xmm14, %xmm6 + movddup 31 * SIZE(BO), %xmm15 + mulpd %xmm5, %xmm15 + subpd %xmm15, %xmm7 + + movddup 38 * SIZE(BO), %xmm14 + mulpd %xmm14, %xmm6 + movddup 39 * SIZE(BO), %xmm15 + mulpd %xmm6, %xmm15 + subpd %xmm15, %xmm7 + + movddup 47 * SIZE(BO), %xmm15 + mulpd %xmm15, %xmm7 +#endif + +#ifdef RT + movddup 47 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm7 + movddup 46 * SIZE(BO), %xmm9 + mulpd %xmm7, %xmm9 + subpd %xmm9, %xmm6 + movddup 45 * SIZE(BO), %xmm10 + mulpd %xmm7, %xmm10 + subpd %xmm10, %xmm5 + movddup 44 * SIZE(BO), %xmm11 + mulpd %xmm7, %xmm11 + subpd %xmm11, %xmm4 + movddup 43 * SIZE(BO), %xmm12 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm3 + movddup 42 * SIZE(BO), %xmm13 + mulpd %xmm7, %xmm13 + subpd %xmm13, %xmm2 + movddup 41 * SIZE(BO), %xmm14 + mulpd %xmm7, %xmm14 + subpd %xmm14, %xmm1 + movddup 40 * SIZE(BO), %xmm15 + mulpd %xmm7, %xmm15 + subpd %xmm15, %xmm0 + + movddup 38 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm6 + movddup 37 * SIZE(BO), %xmm10 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm5 + movddup 36 * SIZE(BO), %xmm11 + mulpd %xmm6, %xmm11 + subpd %xmm11, %xmm4 + movddup 35 * SIZE(BO), %xmm12 + mulpd %xmm6, %xmm12 + subpd %xmm12, %xmm3 + movddup 34 * SIZE(BO), %xmm13 + mulpd %xmm6, %xmm13 + subpd %xmm13, %xmm2 + movddup 33 * SIZE(BO), %xmm14 + mulpd %xmm6, %xmm14 + subpd %xmm14, %xmm1 + movddup 32 * SIZE(BO), %xmm15 + 
mulpd %xmm6, %xmm15 + subpd %xmm15, %xmm0 + + movddup 29 * SIZE(BO), %xmm10 + mulpd %xmm10, %xmm5 + movddup 28 * SIZE(BO), %xmm11 + mulpd %xmm5, %xmm11 + subpd %xmm11, %xmm4 + movddup 27 * SIZE(BO), %xmm12 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm3 + movddup 26 * SIZE(BO), %xmm13 + mulpd %xmm5, %xmm13 + subpd %xmm13, %xmm2 + movddup 25 * SIZE(BO), %xmm14 + mulpd %xmm5, %xmm14 + subpd %xmm14, %xmm1 + movddup 24 * SIZE(BO), %xmm15 + mulpd %xmm5, %xmm15 + subpd %xmm15, %xmm0 + + movddup 20 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm4 + movddup 19 * SIZE(BO), %xmm12 + mulpd %xmm4, %xmm12 + subpd %xmm12, %xmm3 + movddup 18 * SIZE(BO), %xmm13 + mulpd %xmm4, %xmm13 + subpd %xmm13, %xmm2 + movddup 17 * SIZE(BO), %xmm14 + mulpd %xmm4, %xmm14 + subpd %xmm14, %xmm1 + movddup 16 * SIZE(BO), %xmm15 + mulpd %xmm4, %xmm15 + subpd %xmm15, %xmm0 + + movddup 11 * SIZE(BO), %xmm12 + mulpd %xmm12, %xmm3 + movddup 10 * SIZE(BO), %xmm13 + mulpd %xmm3, %xmm13 + subpd %xmm13, %xmm2 + movddup 9 * SIZE(BO), %xmm14 + mulpd %xmm3, %xmm14 + subpd %xmm14, %xmm1 + movddup 8 * SIZE(BO), %xmm15 + mulpd %xmm3, %xmm15 + subpd %xmm15, %xmm0 + + movddup 2 * SIZE(BO), %xmm13 + mulpd %xmm13, %xmm2 + movddup 1 * SIZE(BO), %xmm14 + mulpd %xmm2, %xmm14 + subpd %xmm14, %xmm1 + movddup 0 * SIZE(BO), %xmm15 + mulpd %xmm2, %xmm15 + subpd %xmm15, %xmm0 + + movddup -7 * SIZE(BO), %xmm14 + mulpd %xmm14, %xmm1 + movddup -8 * SIZE(BO), %xmm15 + mulpd %xmm1, %xmm15 + subpd %xmm15, %xmm0 + + movddup -16 * SIZE(BO), %xmm15 + mulpd %xmm15, %xmm0 +#endif + + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + + leaq (LDC, LDC, 2), %rax + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 1 * SIZE(CO1) + movhps %xmm0, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 1 * SIZE(CO1, LDC, 1) + + movsd %xmm2, 0 * SIZE(CO1, LDC, 2) + movsd %xmm3, 1 * SIZE(CO1, LDC, 2) + movhps %xmm2, 0 * SIZE(CO1, %rax, 1) + movhps %xmm3, 1 * SIZE(CO1, %rax, 1) + + movsd %xmm4, 0 * SIZE(CO2) + movsd %xmm5, 1 * SIZE(CO2) + movhps %xmm4, 0 * SIZE(CO2, LDC, 1) + movhps %xmm5, 1 * SIZE(CO2, LDC, 1) + + movsd %xmm6, 0 * SIZE(CO2, LDC, 2) + movsd %xmm7, 1 * SIZE(CO2, LDC, 2) + movhps %xmm6, 0 * SIZE(CO2, %rax, 1) + movhps %xmm7, 1 * SIZE(CO2, %rax, 1) +#else + movups %xmm0, 0 * SIZE(CO1) + movups %xmm1, 0 * SIZE(CO1, LDC, 1) + movups %xmm2, 0 * SIZE(CO1, LDC, 2) + movups %xmm3, 0 * SIZE(CO1, %rax, 1) + movups %xmm4, 0 * SIZE(CO2) + movups %xmm5, 0 * SIZE(CO2, LDC, 1) + movups %xmm6, 0 * SIZE(CO2, LDC, 2) + movups %xmm7, 0 * SIZE(CO2, %rax, 1) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, -16 * SIZE(BO) + movaps %xmm2, -14 * SIZE(BO) + movaps %xmm4, -12 * SIZE(BO) + movaps %xmm6, -10 * SIZE(BO) + movaps %xmm1, -8 * SIZE(BO) + movaps %xmm3, -6 * SIZE(BO) + movaps %xmm5, -4 * SIZE(BO) + movaps %xmm7, -2 * SIZE(BO) +#else + movaps %xmm0, -16 * SIZE(AO) + movaps %xmm1, -14 * SIZE(AO) + movaps %xmm2, -12 * SIZE(AO) + movaps %xmm3, -10 * SIZE(AO) + movaps %xmm4, -8 * SIZE(AO) + movaps %xmm5 , -6 * SIZE(AO) + movaps %xmm6, -4 * SIZE(AO) + movaps %xmm7, -2 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L11 + ALIGN_4 + +.L20: + testq $1, M + BRANCH + jle .L29 + ALIGN_4 + +#ifdef LN + 
movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 8), BO +#else + movq B, BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_3 + +.L22: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movaps -12 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps -10 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps -8 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -6 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movaps -4 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps -2 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -14 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps 0 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps 2 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movaps 4 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps 6 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -13 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps 8 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps 10 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movaps 12 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps 14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -12 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps 16 * SIZE(BO), %xmm1 + + subq $ -4 * SIZE, AO + subq $-32 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L22 + ALIGN_3 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_3 + +.L26: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movaps -12 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps -10 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps -8 * SIZE(BO), %xmm1 + + addq $1 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_4 + +.L28: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $8, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + movaps -16 * SIZE(BO), %xmm0 + movaps -14 * SIZE(BO), %xmm1 + movaps -12 * SIZE(BO), %xmm2 + movaps -10 * SIZE(BO), %xmm3 +#else + movaps -16 * SIZE(AO), %xmm0 + movaps -14 * SIZE(AO), %xmm1 + movaps -12 * SIZE(AO), %xmm2 + movaps -10 * SIZE(AO), %xmm3 +#endif + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm1 + subpd %xmm10, %xmm2 + subpd %xmm11, %xmm3 + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 +#endif + +#if defined(RN) || defined(RT) + pshufd $0xe, %xmm3, %xmm7 + movaps %xmm3, %xmm6 + pshufd $0xe, %xmm2, %xmm5 + movaps %xmm2, %xmm4 + pshufd $0xe, %xmm1, %xmm3 + movaps %xmm1, %xmm2 + pshufd $0xe, %xmm0, %xmm1 +#endif + +#ifdef RN + movsd -16 * SIZE(BO), %xmm8 + mulsd 
%xmm8, %xmm0 + movsd -15 * SIZE(BO), %xmm9 + mulsd %xmm0, %xmm9 + subsd %xmm9, %xmm1 + movsd -14 * SIZE(BO), %xmm10 + mulsd %xmm0, %xmm10 + subsd %xmm10, %xmm2 + movsd -13 * SIZE(BO), %xmm11 + mulsd %xmm0, %xmm11 + subsd %xmm11, %xmm3 + movsd -12 * SIZE(BO), %xmm12 + mulsd %xmm0, %xmm12 + subsd %xmm12, %xmm4 + movsd -11 * SIZE(BO), %xmm13 + mulsd %xmm0, %xmm13 + subsd %xmm13, %xmm5 + movsd -10 * SIZE(BO), %xmm14 + mulsd %xmm0, %xmm14 + subsd %xmm14, %xmm6 + movsd -9 * SIZE(BO), %xmm15 + mulsd %xmm0, %xmm15 + subsd %xmm15, %xmm7 + + movsd -7 * SIZE(BO), %xmm9 + mulsd %xmm9, %xmm1 + movsd -6 * SIZE(BO), %xmm10 + mulsd %xmm1, %xmm10 + subsd %xmm10, %xmm2 + movsd -5 * SIZE(BO), %xmm11 + mulsd %xmm1, %xmm11 + subsd %xmm11, %xmm3 + movsd -4 * SIZE(BO), %xmm12 + mulsd %xmm1, %xmm12 + subsd %xmm12, %xmm4 + movsd -3 * SIZE(BO), %xmm13 + mulsd %xmm1, %xmm13 + subsd %xmm13, %xmm5 + movsd -2 * SIZE(BO), %xmm14 + mulsd %xmm1, %xmm14 + subsd %xmm14, %xmm6 + movsd -1 * SIZE(BO), %xmm15 + mulsd %xmm1, %xmm15 + subsd %xmm15, %xmm7 + + movsd 2 * SIZE(BO), %xmm10 + mulsd %xmm10, %xmm2 + movsd 3 * SIZE(BO), %xmm11 + mulsd %xmm2, %xmm11 + subsd %xmm11, %xmm3 + movsd 4 * SIZE(BO), %xmm12 + mulsd %xmm2, %xmm12 + subsd %xmm12, %xmm4 + movsd 5 * SIZE(BO), %xmm13 + mulsd %xmm2, %xmm13 + subsd %xmm13, %xmm5 + movsd 6 * SIZE(BO), %xmm14 + mulsd %xmm2, %xmm14 + subsd %xmm14, %xmm6 + movsd 7 * SIZE(BO), %xmm15 + mulsd %xmm2, %xmm15 + subsd %xmm15, %xmm7 + + movsd 11 * SIZE(BO), %xmm11 + mulsd %xmm11, %xmm3 + movsd 12 * SIZE(BO), %xmm12 + mulsd %xmm3, %xmm12 + subsd %xmm12, %xmm4 + movsd 13 * SIZE(BO), %xmm13 + mulsd %xmm3, %xmm13 + subsd %xmm13, %xmm5 + movsd 14 * SIZE(BO), %xmm14 + mulsd %xmm3, %xmm14 + subsd %xmm14, %xmm6 + movsd 15 * SIZE(BO), %xmm15 + mulsd %xmm3, %xmm15 + subsd %xmm15, %xmm7 + + movsd 20 * SIZE(BO), %xmm12 + mulsd %xmm12, %xmm4 + movsd 21 * SIZE(BO), %xmm13 + mulsd %xmm4, %xmm13 + subsd %xmm13, %xmm5 + movsd 22 * SIZE(BO), %xmm14 + mulsd %xmm4, %xmm14 + subsd %xmm14, %xmm6 + movsd 23 * SIZE(BO), %xmm15 + mulsd %xmm4, %xmm15 + subsd %xmm15, %xmm7 + + movsd 29 * SIZE(BO), %xmm13 + mulsd %xmm13, %xmm5 + movsd 30 * SIZE(BO), %xmm14 + mulsd %xmm5, %xmm14 + subsd %xmm14, %xmm6 + movsd 31 * SIZE(BO), %xmm15 + mulsd %xmm5, %xmm15 + subsd %xmm15, %xmm7 + + movsd 38 * SIZE(BO), %xmm14 + mulsd %xmm14, %xmm6 + movsd 39 * SIZE(BO), %xmm15 + mulsd %xmm6, %xmm15 + subsd %xmm15, %xmm7 + + movsd 47 * SIZE(BO), %xmm15 + mulsd %xmm15, %xmm7 +#endif + +#ifdef RT + movsd 47 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm7 + movsd 46 * SIZE(BO), %xmm9 + mulsd %xmm7, %xmm9 + subsd %xmm9, %xmm6 + movsd 45 * SIZE(BO), %xmm10 + mulsd %xmm7, %xmm10 + subsd %xmm10, %xmm5 + movsd 44 * SIZE(BO), %xmm11 + mulsd %xmm7, %xmm11 + subsd %xmm11, %xmm4 + movsd 43 * SIZE(BO), %xmm12 + mulsd %xmm7, %xmm12 + subsd %xmm12, %xmm3 + movsd 42 * SIZE(BO), %xmm13 + mulsd %xmm7, %xmm13 + subsd %xmm13, %xmm2 + movsd 41 * SIZE(BO), %xmm14 + mulsd %xmm7, %xmm14 + subsd %xmm14, %xmm1 + movsd 40 * SIZE(BO), %xmm15 + mulsd %xmm7, %xmm15 + subsd %xmm15, %xmm0 + + movsd 38 * SIZE(BO), %xmm9 + mulsd %xmm9, %xmm6 + movsd 37 * SIZE(BO), %xmm10 + mulsd %xmm6, %xmm10 + subsd %xmm10, %xmm5 + movsd 36 * SIZE(BO), %xmm11 + mulsd %xmm6, %xmm11 + subsd %xmm11, %xmm4 + movsd 35 * SIZE(BO), %xmm12 + mulsd %xmm6, %xmm12 + subsd %xmm12, %xmm3 + movsd 34 * SIZE(BO), %xmm13 + mulsd %xmm6, %xmm13 + subsd %xmm13, %xmm2 + movsd 33 * SIZE(BO), %xmm14 + mulsd %xmm6, %xmm14 + subsd %xmm14, %xmm1 + movsd 32 * SIZE(BO), %xmm15 + mulsd %xmm6, %xmm15 + subsd %xmm15, %xmm0 + + movsd 29 * 
SIZE(BO), %xmm10 + mulsd %xmm10, %xmm5 + movsd 28 * SIZE(BO), %xmm11 + mulsd %xmm5, %xmm11 + subsd %xmm11, %xmm4 + movsd 27 * SIZE(BO), %xmm12 + mulsd %xmm5, %xmm12 + subsd %xmm12, %xmm3 + movsd 26 * SIZE(BO), %xmm13 + mulsd %xmm5, %xmm13 + subsd %xmm13, %xmm2 + movsd 25 * SIZE(BO), %xmm14 + mulsd %xmm5, %xmm14 + subsd %xmm14, %xmm1 + movsd 24 * SIZE(BO), %xmm15 + mulsd %xmm5, %xmm15 + subsd %xmm15, %xmm0 + + movsd 20 * SIZE(BO), %xmm11 + mulsd %xmm11, %xmm4 + movsd 19 * SIZE(BO), %xmm12 + mulsd %xmm4, %xmm12 + subsd %xmm12, %xmm3 + movsd 18 * SIZE(BO), %xmm13 + mulsd %xmm4, %xmm13 + subsd %xmm13, %xmm2 + movsd 17 * SIZE(BO), %xmm14 + mulsd %xmm4, %xmm14 + subsd %xmm14, %xmm1 + movsd 16 * SIZE(BO), %xmm15 + mulsd %xmm4, %xmm15 + subsd %xmm15, %xmm0 + + movsd 11 * SIZE(BO), %xmm12 + mulsd %xmm12, %xmm3 + movsd 10 * SIZE(BO), %xmm13 + mulsd %xmm3, %xmm13 + subsd %xmm13, %xmm2 + movsd 9 * SIZE(BO), %xmm14 + mulsd %xmm3, %xmm14 + subsd %xmm14, %xmm1 + movsd 8 * SIZE(BO), %xmm15 + mulsd %xmm3, %xmm15 + subsd %xmm15, %xmm0 + + movsd 2 * SIZE(BO), %xmm13 + mulsd %xmm13, %xmm2 + movsd 1 * SIZE(BO), %xmm14 + mulsd %xmm2, %xmm14 + subsd %xmm14, %xmm1 + movsd 0 * SIZE(BO), %xmm15 + mulsd %xmm2, %xmm15 + subsd %xmm15, %xmm0 + + movsd -7 * SIZE(BO), %xmm14 + mulsd %xmm14, %xmm1 + movsd -8 * SIZE(BO), %xmm15 + mulsd %xmm1, %xmm15 + subsd %xmm15, %xmm0 + + movsd -16 * SIZE(BO), %xmm15 + mulsd %xmm15, %xmm0 +#endif + +#if defined(RN) || defined(RT) + unpcklpd %xmm1, %xmm0 + movaps %xmm2, %xmm1 + unpcklpd %xmm3, %xmm1 + movaps %xmm4, %xmm2 + unpcklpd %xmm5, %xmm2 + movaps %xmm6, %xmm3 + unpcklpd %xmm7, %xmm3 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + leaq (LDC, LDC, 2), %rax + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 0 * SIZE(CO1, LDC, 1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 2) + movhps %xmm1, 0 * SIZE(CO1, %rax, 1) + movsd %xmm2, 0 * SIZE(CO2) + movhps %xmm2, 0 * SIZE(CO2, LDC, 1) + movsd %xmm3, 0 * SIZE(CO2, LDC, 2) + movhps %xmm3, 0 * SIZE(CO2, %rax, 1) + +#if defined(LN) || defined(LT) + movaps %xmm0, -16 * SIZE(BO) + movaps %xmm1, -14 * SIZE(BO) + movaps %xmm2, -12 * SIZE(BO) + movaps %xmm3, -10 * SIZE(BO) +#else + movaps %xmm0, -16 * SIZE(AO) + movaps %xmm1, -14 * SIZE(AO) + movaps %xmm2, -12 * SIZE(AO) + movaps %xmm3, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L29: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 8), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $8, KK +#endif + +#ifdef RT + subq $8, KK +#endif + + subq $1, J + BRANCH + jg .L01 + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/trsm_kernel_RT_4x2_atom.S b/kernel/x86_64/trsm_kernel_RT_4x2_atom.S new file 
mode 100644 index 0000000000..ae49c38370 --- /dev/null +++ b/kernel/x86_64/trsm_kernel_RT_4x2_atom.S @@ -0,0 +1,2116 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %rdi +#define N %rsi +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %r13 +#define BO %r14 +#define CO1 %r15 +#define CO2 %rbx +#define KK %rbp +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define J 56(%rsp) +#define KKK 64(%rsp) +#define AORIG 72(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define J 232(%rsp) +#define KKK 240(%rsp) +#define AORIG 248(%rsp) + +#endif + +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 8 + 3) + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, M + movq ARG2, N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C +#endif + + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + movq KK, OFFSET + + leaq (, LDC, SIZE), LDC + +#ifdef LN + leaq (, M, SIZE), %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + leaq (, N, SIZE), %rax + imulq K, %rax + addq %rax, B + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, N + je .L40 + ALIGN_4 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + + movq C, CO1 +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I + jle .L50 + ALIGN_4 + +.L41: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm9, %xmm9 + movsd 1 * SIZE(AO), %xmm1 + xorps %xmm11, %xmm11 + movsd 2 * SIZE(AO), %xmm2 + xorps %xmm13, %xmm13 + movsd 3 * SIZE(AO), %xmm3 + xorps %xmm15, %xmm15 + + movsd 0 * SIZE(BO), %xmm4 + xorps %xmm8, %xmm8 + movsd 1 * SIZE(BO), %xmm5 + xorps %xmm10, %xmm10 + prefetcht0 3 * SIZE(CO1) + xorps %xmm12, %xmm12 + xorps %xmm14, %xmm14 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L45 + ALIGN_4 + +.L42: + addsd %xmm9, %xmm8 + movsd 4 * SIZE(AO), %xmm9 + mulsd %xmm4, %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addsd %xmm11, %xmm10 + movsd 5 * SIZE(AO), %xmm11 + mulsd %xmm4, %xmm1 + + addsd %xmm13, %xmm12 + movsd 6 * SIZE(AO), %xmm13 + mulsd %xmm4, %xmm2 + + addsd %xmm15, %xmm14 + movsd 7 * SIZE(AO), %xmm15 + mulsd %xmm4, %xmm3 + 
movsd 2 * SIZE(BO), %xmm4 + + addsd %xmm0, %xmm8 + movsd 8 * SIZE(AO), %xmm0 + mulsd %xmm5, %xmm9 + + addsd %xmm1, %xmm10 + movsd 9 * SIZE(AO), %xmm1 + mulsd %xmm5, %xmm11 + + addsd %xmm2, %xmm12 + movsd 10 * SIZE(AO), %xmm2 + mulsd %xmm5, %xmm13 + + addsd %xmm3, %xmm14 + movsd 11 * SIZE(AO), %xmm3 + mulsd %xmm5, %xmm15 + movsd 3 * SIZE(BO), %xmm5 + + addsd %xmm9, %xmm8 + movsd 12 * SIZE(AO), %xmm9 + mulsd %xmm4, %xmm0 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + + addsd %xmm11, %xmm10 + movsd 13 * SIZE(AO), %xmm11 + mulsd %xmm4, %xmm1 + + addsd %xmm13, %xmm12 + movsd 14 * SIZE(AO), %xmm13 + mulsd %xmm4, %xmm2 + + addsd %xmm15, %xmm14 + movsd 15 * SIZE(AO), %xmm15 + mulsd %xmm4, %xmm3 + movsd 4 * SIZE(BO), %xmm4 + subq $-16 * SIZE, AO + + addsd %xmm0, %xmm8 + movsd 0 * SIZE(AO), %xmm0 + mulsd %xmm5, %xmm9 + + addsd %xmm1, %xmm10 + movsd 1 * SIZE(AO), %xmm1 + mulsd %xmm5, %xmm11 + addq $ 4 * SIZE, BO + + addsd %xmm2, %xmm12 + movsd 2 * SIZE(AO), %xmm2 + mulsd %xmm5, %xmm13 + decq %rax + + addsd %xmm3, %xmm14 + movsd 3 * SIZE(AO), %xmm3 + mulsd %xmm5, %xmm15 + movsd 1 * SIZE(BO), %xmm5 + + jne .L42 + ALIGN_4 + +.L45: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + + addsd %xmm9, %xmm8 + addsd %xmm11, %xmm10 + addsd %xmm13, %xmm12 + addsd %xmm15, %xmm14 + + andq $3, %rax + BRANCH + BRANCH + je .L49 + ALIGN_4 + +.L46: + mulsd %xmm4, %xmm0 + mulsd %xmm4, %xmm1 + mulsd %xmm4, %xmm2 + mulsd %xmm4, %xmm3 + movsd 1 * SIZE(BO), %xmm4 + + addsd %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + addsd %xmm1, %xmm10 + movsd 5 * SIZE(AO), %xmm1 + addsd %xmm2, %xmm12 + movsd 6 * SIZE(AO), %xmm2 + addsd %xmm3, %xmm14 + movsd 7 * SIZE(AO), %xmm3 + + addq $4 * SIZE, AO + addq $1 * SIZE, BO + decq %rax + BRANCH + jg .L46 + ALIGN_4 + +.L49: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $1, %rax +#endif + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BO), %xmm0 + movsd 1 * SIZE(BO), %xmm2 + movsd 2 * SIZE(BO), %xmm4 + movsd 3 * SIZE(BO), %xmm6 + + subsd %xmm8, %xmm0 + subsd %xmm10, %xmm2 + subsd %xmm12, %xmm4 + subsd %xmm14, %xmm6 +#else + movsd 0 * SIZE(AO), %xmm0 + movsd 1 * SIZE(AO), %xmm2 + movsd 2 * SIZE(AO), %xmm4 + movsd 3 * SIZE(AO), %xmm6 + + subsd %xmm8, %xmm0 + subsd %xmm10, %xmm2 + subsd %xmm12, %xmm4 + subsd %xmm14, %xmm6 +#endif + +#ifdef LN + movsd 15 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm6 + movsd 14 * SIZE(AO), %xmm9 + mulsd %xmm6, %xmm9 + movsd 13 * SIZE(AO), %xmm11 + subsd %xmm9, %xmm4 + movsd 12 * SIZE(AO), %xmm13 + mulsd %xmm6, %xmm11 + movsd 10 * SIZE(AO), %xmm8 + subsd %xmm11, %xmm2 + movsd 9 * SIZE(AO), %xmm9 + mulsd %xmm6, %xmm13 + movsd 8 * SIZE(AO), %xmm11 + subsd %xmm13, %xmm0 + + mulsd %xmm8, %xmm4 + movsd 5 * SIZE(AO), %xmm8 + mulsd %xmm4, %xmm9 + subsd %xmm9, %xmm2 + movsd 4 * SIZE(AO), %xmm9 + mulsd %xmm4, %xmm11 + subsd %xmm11, %xmm0 + movsd 0 * SIZE(AO), %xmm11 + mulsd %xmm8, %xmm2 + mulsd %xmm2, %xmm9 + subsd %xmm9, %xmm0 + mulsd %xmm11, %xmm0 +#endif + +#ifdef LT + movsd 0 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm0 + movsd 1 * SIZE(AO), %xmm9 + mulsd %xmm0, %xmm9 + movsd 2 * SIZE(AO), %xmm11 + subsd %xmm9, %xmm2 + movsd 3 * SIZE(AO), %xmm13 + mulsd %xmm0, %xmm11 + movsd 5 * SIZE(AO), %xmm8 + subsd %xmm11, %xmm4 + movsd 6 * SIZE(AO), %xmm9 + mulsd %xmm0, %xmm13 + movsd 7 * SIZE(AO), %xmm11 + subsd %xmm13, %xmm6 + + mulsd %xmm8, %xmm2 + movsd 10 * SIZE(AO), %xmm8 + mulsd %xmm2, %xmm9 + subsd 
%xmm9, %xmm4 + movsd 11 * SIZE(AO), %xmm9 + mulsd %xmm2, %xmm11 + subsd %xmm11, %xmm6 + mulsd %xmm8, %xmm4 + movsd 15 * SIZE(AO), %xmm8 + mulsd %xmm4, %xmm9 + subsd %xmm9, %xmm6 + mulsd %xmm8, %xmm6 +#endif + +#if defined(RN) || defined(RT) + movsd 0 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm0 + mulsd %xmm8, %xmm2 + mulsd %xmm8, %xmm4 + mulsd %xmm8, %xmm6 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm2, 1 * SIZE(CO1) + movsd %xmm4, 2 * SIZE(CO1) + movsd %xmm6, 3 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm2, 1 * SIZE(BO) + movsd %xmm4, 2 * SIZE(BO) + movsd %xmm6, 3 * SIZE(BO) +#else + movsd %xmm0, 0 * SIZE(AO) + movsd %xmm2, 1 * SIZE(AO) + movsd %xmm4, 2 * SIZE(AO) + movsd %xmm6, 3 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L41 + ALIGN_4 + +.L50: + testq $2, M + je .L60 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd 1 * SIZE(AO), %xmm1 + xorps %xmm3, %xmm3 + + movsd 0 * SIZE(BO), %xmm4 + xorps %xmm8, %xmm8 + movsd 1 * SIZE(BO), %xmm5 + xorps %xmm10, %xmm10 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L55 + ALIGN_4 + +.L52: + addsd %xmm2, %xmm8 + movsd 2 * SIZE(AO), %xmm2 + mulsd %xmm4, %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addsd %xmm3, %xmm10 + movsd 3 * SIZE(AO), %xmm3 + mulsd %xmm4, %xmm1 + movsd 2 * SIZE(BO), %xmm4 + + addsd %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm5, %xmm2 + addq $8 * SIZE, AO + + addsd %xmm1, %xmm10 + movsd -3 * SIZE(AO), %xmm1 + mulsd %xmm5, %xmm3 + movsd 3 * SIZE(BO), %xmm5 + + addsd %xmm2, %xmm8 + movsd -2 * SIZE(AO), %xmm2 + mulsd %xmm4, %xmm0 + addq $4 * SIZE, BO + + addsd %xmm3, %xmm10 + movsd -1 * SIZE(AO), %xmm3 + mulsd %xmm4, %xmm1 + movsd 0 * SIZE(BO), %xmm4 + + addsd %xmm0, %xmm8 + movsd 0 * SIZE(AO), %xmm0 + mulsd %xmm5, %xmm2 + decq %rax + + addsd %xmm1, %xmm10 + movsd 1 * SIZE(AO), %xmm1 + mulsd %xmm5, %xmm3 + movsd 1 * SIZE(BO), %xmm5 + + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm10 + + andq $3, %rax + BRANCH + je .L59 + ALIGN_4 + +.L56: + mulsd %xmm4, %xmm0 + mulsd %xmm4, %xmm1 + movsd 1 * SIZE(BO), %xmm4 + + addsd %xmm0, %xmm8 + movsd 2 * SIZE(AO), %xmm0 + addsd %xmm1, %xmm10 + movsd 3 * SIZE(AO), %xmm1 + + addq $2 * SIZE, AO + addq $1 * SIZE, BO + decq %rax + BRANCH + jg .L56 + ALIGN_4 + +.L59: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BO), %xmm0 + movsd 1 * SIZE(BO), %xmm2 + + subsd %xmm8, %xmm0 + subsd %xmm10, %xmm2 +#else + movsd 0 * SIZE(AO), %xmm0 + movsd 1 * SIZE(AO), %xmm2 + + subsd %xmm8, %xmm0 + subsd %xmm10, %xmm2 +#endif + 
+#ifdef LN + movsd 3 * SIZE(AO), %xmm8 + movsd 2 * SIZE(AO), %xmm9 + movsd 0 * SIZE(AO), %xmm11 + mulsd %xmm8, %xmm2 + mulsd %xmm2, %xmm9 + subsd %xmm9, %xmm0 + mulsd %xmm11,%xmm0 +#endif + +#ifdef LT + movsd 0 * SIZE(AO), %xmm8 + movsd 1 * SIZE(AO), %xmm9 + movsd 3 * SIZE(AO), %xmm11 + mulsd %xmm8, %xmm0 + mulsd %xmm0, %xmm9 + subsd %xmm9, %xmm2 + mulsd %xmm11,%xmm2 +#endif + +#if defined(RN) || defined(RT) + movsd 0 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm0 + mulsd %xmm8, %xmm2 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm2, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm2, 1 * SIZE(BO) +#else + movsd %xmm0, 0 * SIZE(AO) + movsd %xmm2, 1 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L60: + testq $1, M + je .L69 + +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm5, %xmm5 + movsd 1 * SIZE(AO), %xmm2 + xorps %xmm7, %xmm7 + + movsd 0 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + movsd 1 * SIZE(BO), %xmm3 + xorps %xmm9, %xmm9 + movsd 2 * SIZE(AO), %xmm4 + movsd 3 * SIZE(AO), %xmm6 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L65 + ALIGN_4 + +.L62: + addsd %xmm5, %xmm8 + movsd 2 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm1 + movsd 4 * SIZE(AO), %xmm0 + + addsd %xmm7, %xmm9 + movsd 3 * SIZE(BO), %xmm7 + mulsd %xmm2, %xmm3 + movsd 5 * SIZE(AO), %xmm2 + + addsd %xmm1, %xmm8 + movsd 4 * SIZE(BO), %xmm1 + mulsd %xmm4, %xmm5 + movsd 6 * SIZE(AO), %xmm4 + + addsd %xmm3, %xmm9 + movsd 5 * SIZE(BO), %xmm3 + mulsd %xmm6, %xmm7 + movsd 7 * SIZE(AO), %xmm6 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + + decq %rax + jne .L62 + + addsd %xmm5, %xmm8 + addsd %xmm7, %xmm9 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + je .L68 + ALIGN_4 + +.L66: + movsd 0 * SIZE(AO), %xmm0 + movsd 0 * SIZE(BO), %xmm1 + + mulsd %xmm0, %xmm1 + addsd %xmm1, %xmm8 + + addq $1 * SIZE, AO + addq $1 * SIZE, BO + + decq %rax + BRANCH + jg .L66 + ALIGN_4 + +.L68: + addsd %xmm9, %xmm8 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BO), %xmm0 + subsd %xmm8, %xmm0 +#else + movsd 0 * SIZE(AO), %xmm0 + subsd %xmm8, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm0 +#endif + +#if defined(RN) || defined(RT) + movsd 0 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm0 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BO) +#else + movsd %xmm0, 0 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + 
leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L69: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 1), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_2 + +.L40: + movq N, J + sarq $1, J + jle .L999 + ALIGN_4 + +.L10: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + movq K, %rax + salq $BASE_SHIFT + 1, %rax + leaq (B, %rax), BB + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + prefetcht0 0 * SIZE(BB) + subq $-8 * SIZE, BB + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd 1 * SIZE(AO), %xmm4 + xorps %xmm5, %xmm5 + movsd 2 * SIZE(AO), %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movsd 0 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + movsd 1 * SIZE(BO), %xmm3 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + + prefetcht0 3 * SIZE(CO1) + xorps %xmm12, %xmm12 + xorps %xmm13, %xmm13 + prefetcht0 3 * SIZE(CO2) + xorps %xmm14, %xmm14 + xorps %xmm15, %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L15 + ALIGN_4 + +.L12: + addsd %xmm2, %xmm13 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm7, %xmm14 + movsd 3 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm15 + PREFETCH (PREFETCHSIZE + 0) * SIZE(BO) + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + addsd %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm4, %xmm10 + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 2 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm12 + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm13 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm7, %xmm14 + movsd 7 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + addsd %xmm0, %xmm8 + movsd 8 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm4, %xmm10 + movsd 9 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 4 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm12 + movsd 10 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 5 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm13 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm7, %xmm14 + movsd 11 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + addsd %xmm0, %xmm8 + movsd 12 * SIZE(AO), %xmm0 + mulsd 
%xmm3, %xmm6 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm4, %xmm10 + movsd 13 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 6 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm12 + movsd 14 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 7 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm13 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm7, %xmm14 + movsd 15 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + subq $-16 * SIZE, AO + + addsd %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + addsd %xmm0, %xmm8 + movsd 0 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + addq $ 8 * SIZE, BO + + addsd %xmm4, %xmm10 + movsd 1 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + decq %rax + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 0 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm12 + movsd 2 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 1 * SIZE(BO), %xmm3 + + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + je .L19 + ALIGN_4 + +.L16: + addsd %xmm2, %xmm13 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm7, %xmm14 + movsd 3 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + addsd %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm4, %xmm10 + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 2 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm12 + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + decq %rax + BRANCH + jg .L16 + ALIGN_4 + +.L19: + addsd %xmm2, %xmm13 + addsd %xmm7, %xmm14 + addsd %xmm6, %xmm15 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BO), %xmm0 + movsd 1 * SIZE(BO), %xmm1 + movsd 2 * SIZE(BO), %xmm2 + movsd 3 * SIZE(BO), %xmm3 + movsd 4 * SIZE(BO), %xmm4 + movsd 5 * SIZE(BO), %xmm5 + movsd 6 * SIZE(BO), %xmm6 + movsd 7 * SIZE(BO), %xmm7 + + subsd %xmm8, %xmm0 + subsd %xmm9, %xmm1 + subsd %xmm10, %xmm2 + subsd %xmm11, %xmm3 + subsd %xmm12, %xmm4 + subsd %xmm13, %xmm5 + subsd %xmm14, %xmm6 + subsd %xmm15, %xmm7 +#else + movsd 0 * SIZE(AO), %xmm0 + movsd 1 * SIZE(AO), %xmm2 + movsd 2 * SIZE(AO), %xmm4 + movsd 3 * SIZE(AO), %xmm6 + + movsd 4 * SIZE(AO), %xmm1 + movsd 5 * SIZE(AO), %xmm3 + movsd 6 * SIZE(AO), %xmm5 + movsd 7 * SIZE(AO), %xmm7 + + subsd %xmm8, %xmm0 + subsd %xmm10, %xmm2 + subsd %xmm12, %xmm4 + subsd %xmm14, %xmm6 + subsd %xmm9, %xmm1 + subsd %xmm11, %xmm3 + subsd %xmm13, %xmm5 + subsd %xmm15, %xmm7 +#endif + +#ifdef LN + movsd 15 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm6 + movsd 14 * SIZE(AO), %xmm9 + mulsd %xmm8, %xmm7 + movsd 13 * SIZE(AO), %xmm11 + + movaps %xmm9, %xmm10 + movsd 12 * SIZE(AO), %xmm13 + mulsd %xmm6, %xmm9 + movsd 10 * SIZE(AO), %xmm8 + mulsd %xmm7, %xmm10 + subsd %xmm9, %xmm4 + movsd 9 * SIZE(AO), %xmm9 + subsd %xmm10, %xmm5 + + movaps %xmm11, %xmm12 + mulsd %xmm6, %xmm11 + mulsd %xmm7, %xmm12 + subsd %xmm11, %xmm2 + movsd 8 * SIZE(AO), %xmm11 + subsd %xmm12, %xmm3 + + movaps %xmm13, %xmm14 + 
mulsd %xmm6, %xmm13 + mulsd %xmm7, %xmm14 + subsd %xmm13, %xmm0 + subsd %xmm14, %xmm1 + + mulsd %xmm8, %xmm4 + mulsd %xmm8, %xmm5 + movsd 5 * SIZE(AO), %xmm8 + + movaps %xmm9, %xmm10 + mulsd %xmm4, %xmm9 + mulsd %xmm5, %xmm10 + subsd %xmm9, %xmm2 + movsd 4 * SIZE(AO), %xmm9 + subsd %xmm10, %xmm3 + + movaps %xmm11, %xmm12 + mulsd %xmm4, %xmm11 + mulsd %xmm5, %xmm12 + subsd %xmm11, %xmm0 + movsd 0 * SIZE(AO), %xmm11 + subsd %xmm12, %xmm1 + + mulsd %xmm8, %xmm2 + mulsd %xmm8, %xmm3 + + movaps %xmm9, %xmm10 + mulsd %xmm2, %xmm9 + mulsd %xmm3, %xmm10 + subsd %xmm9, %xmm0 + subsd %xmm10, %xmm1 + + mulsd %xmm11, %xmm0 + mulsd %xmm11, %xmm1 +#endif + +#ifdef LT + movsd 0 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm0 + movsd 1 * SIZE(AO), %xmm9 + mulsd %xmm8, %xmm1 + + movsd 2 * SIZE(AO), %xmm11 + movaps %xmm9, %xmm10 + movsd 3 * SIZE(AO), %xmm13 + mulsd %xmm0, %xmm9 + movsd 5 * SIZE(AO), %xmm8 + mulsd %xmm1, %xmm10 + subsd %xmm9, %xmm2 + movsd 6 * SIZE(AO), %xmm9 + subsd %xmm10, %xmm3 + + movaps %xmm11, %xmm12 + mulsd %xmm0, %xmm11 + mulsd %xmm1, %xmm12 + subsd %xmm11, %xmm4 + movsd 7 * SIZE(AO), %xmm11 + subsd %xmm12, %xmm5 + + movaps %xmm13, %xmm14 + mulsd %xmm0, %xmm13 + mulsd %xmm1, %xmm14 + subsd %xmm13, %xmm6 + subsd %xmm14, %xmm7 + + mulsd %xmm8, %xmm2 + mulsd %xmm8, %xmm3 + movsd 10 * SIZE(AO), %xmm8 + + movaps %xmm9, %xmm10 + mulsd %xmm2, %xmm9 + mulsd %xmm3, %xmm10 + subsd %xmm9, %xmm4 + movsd 11 * SIZE(AO), %xmm9 + subsd %xmm10, %xmm5 + + movaps %xmm11, %xmm12 + mulsd %xmm2, %xmm11 + mulsd %xmm3, %xmm12 + subsd %xmm11, %xmm6 + subsd %xmm12, %xmm7 + + mulsd %xmm8, %xmm4 + mulsd %xmm8, %xmm5 + movsd 15 * SIZE(AO), %xmm8 + + movaps %xmm9, %xmm10 + mulsd %xmm4, %xmm9 + mulsd %xmm5, %xmm10 + subsd %xmm9, %xmm6 + subsd %xmm10, %xmm7 + + mulsd %xmm8, %xmm6 + mulsd %xmm8, %xmm7 +#endif + +#ifdef RN + movsd 0 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm0 + movsd 1 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm2 + movsd 3 * SIZE(BO), %xmm13 + mulsd %xmm8, %xmm4 + mulsd %xmm8, %xmm6 + + movaps %xmm9, %xmm10 + movaps %xmm9, %xmm11 + movaps %xmm9, %xmm12 + + mulsd %xmm0, %xmm9 + mulsd %xmm2, %xmm10 + mulsd %xmm4, %xmm11 + mulsd %xmm6, %xmm12 + + subsd %xmm9, %xmm1 + subsd %xmm10, %xmm3 + subsd %xmm11, %xmm5 + subsd %xmm12, %xmm7 + + mulsd %xmm13, %xmm1 + mulsd %xmm13, %xmm3 + mulsd %xmm13, %xmm5 + mulsd %xmm13, %xmm7 +#endif + +#ifdef RT + movsd 3 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm1 + movsd 2 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm3 + movsd 0 * SIZE(BO), %xmm13 + mulsd %xmm8, %xmm5 + mulsd %xmm8, %xmm7 + + movaps %xmm9, %xmm10 + movaps %xmm9, %xmm11 + movaps %xmm9, %xmm12 + + mulsd %xmm1, %xmm9 + mulsd %xmm3, %xmm10 + mulsd %xmm5, %xmm11 + mulsd %xmm7, %xmm12 + + subsd %xmm9, %xmm0 + subsd %xmm10, %xmm2 + subsd %xmm11, %xmm4 + subsd %xmm12, %xmm6 + + mulsd %xmm13, %xmm0 + mulsd %xmm13, %xmm2 + mulsd %xmm13, %xmm4 + mulsd %xmm13, %xmm6 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm2, 1 * SIZE(CO1) + movsd %xmm4, 2 * SIZE(CO1) + movsd %xmm6, 3 * SIZE(CO1) + + movsd %xmm1, 0 * SIZE(CO2) + movsd %xmm3, 1 * SIZE(CO2) + movsd %xmm5, 2 * SIZE(CO2) + movsd %xmm7, 3 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm1, 1 * SIZE(BO) + movsd %xmm2, 2 * SIZE(BO) + movsd %xmm3, 3 * SIZE(BO) + movsd %xmm4, 4 * SIZE(BO) + movsd %xmm5, 5 * SIZE(BO) + movsd %xmm6, 6 * SIZE(BO) + movsd %xmm7, 7 * SIZE(BO) +#else + movsd %xmm0, 0 * SIZE(AO) + movsd %xmm2, 1 * SIZE(AO) + movsd %xmm4, 2 * SIZE(AO) + movsd %xmm6, 3 * SIZE(AO) + movsd 
%xmm1, 4 * SIZE(AO) + movsd %xmm3, 5 * SIZE(AO) + movsd %xmm5, 6 * SIZE(AO) + movsd %xmm7, 7 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L11 + ALIGN_4 + +.L20: + testq $2, M + BRANCH + je .L30 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd 1 * SIZE(AO), %xmm4 + xorps %xmm5, %xmm5 + movsd 2 * SIZE(AO), %xmm5 + xorps %xmm6, %xmm6 + movsd 3 * SIZE(AO), %xmm7 + + movsd 0 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + movsd 1 * SIZE(BO), %xmm3 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addsd %xmm2, %xmm9 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm6, %xmm11 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + movsd 2 * SIZE(BO), %xmm1 + + addsd %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm2 + + addsd %xmm4, %xmm10 + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 4 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm8 + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm2 + + addsd %xmm7, %xmm10 + movsd 7 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm6 + movsd 5 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm9 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm6, %xmm11 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + movsd 6 * SIZE(BO), %xmm1 + + addsd %xmm0, %xmm8 + movsd 8 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm2 + + addsd %xmm4, %xmm10 + movsd 9 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm6 + movsd 7 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 8 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm8 + movsd 10 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm2 + + addsd %xmm7, %xmm10 + movsd 11 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm6 + movsd 9 * SIZE(BO), %xmm3 + + addq $8 * SIZE, AO + addq $8 * SIZE, BO + + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + je .L29 + ALIGN_4 + +.L26: + addsd %xmm2, %xmm9 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm6, %xmm11 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + movsd 2 * SIZE(BO), %xmm1 + + mulsd %xmm3, %xmm2 + addsd %xmm0, %xmm8 + movsd 2 * SIZE(AO), %xmm0 + + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + addsd %xmm4, %xmm10 + movsd 3 * SIZE(AO), %xmm4 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + decq %rax + BRANCH + jg .L26 + ALIGN_4 + +.L29: + addsd %xmm2, %xmm9 + addsd %xmm6, %xmm11 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, 
%rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BO), %xmm0 + movsd 1 * SIZE(BO), %xmm1 + movsd 2 * SIZE(BO), %xmm2 + movsd 3 * SIZE(BO), %xmm3 + + subsd %xmm8, %xmm0 + subsd %xmm9, %xmm1 + subsd %xmm10, %xmm2 + subsd %xmm11, %xmm3 +#else + movsd 0 * SIZE(AO), %xmm0 + movsd 1 * SIZE(AO), %xmm2 + movsd 2 * SIZE(AO), %xmm1 + movsd 3 * SIZE(AO), %xmm3 + + subsd %xmm8, %xmm0 + subsd %xmm10, %xmm2 + subsd %xmm9, %xmm1 + subsd %xmm11, %xmm3 +#endif + +#ifdef LN + movsd 3 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm2 + movsd 2 * SIZE(AO), %xmm9 + mulsd %xmm8, %xmm3 + movsd 0 * SIZE(AO), %xmm13 + + movaps %xmm9, %xmm10 + mulsd %xmm2, %xmm9 + mulsd %xmm3, %xmm10 + + subsd %xmm9, %xmm0 + subsd %xmm10, %xmm1 + + mulsd %xmm13, %xmm0 + mulsd %xmm13, %xmm1 +#endif + +#ifdef LT + movsd 0 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm0 + movsd 1 * SIZE(AO), %xmm9 + mulsd %xmm8, %xmm1 + movsd 3 * SIZE(AO), %xmm13 + + movaps %xmm9, %xmm10 + mulsd %xmm0, %xmm9 + mulsd %xmm1, %xmm10 + + subsd %xmm9, %xmm2 + subsd %xmm10, %xmm3 + + mulsd %xmm13, %xmm2 + mulsd %xmm13, %xmm3 +#endif + +#ifdef RN + movsd 0 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm0 + movsd 1 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm2 + movsd 3 * SIZE(BO), %xmm13 + + movaps %xmm9, %xmm10 + mulsd %xmm0, %xmm9 + mulsd %xmm2, %xmm10 + + subsd %xmm9, %xmm1 + subsd %xmm10, %xmm3 + + mulsd %xmm13, %xmm1 + mulsd %xmm13, %xmm3 +#endif + +#ifdef RT + movsd 3 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm1 + movsd 2 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm3 + movsd 0 * SIZE(BO), %xmm13 + + movaps %xmm9, %xmm10 + mulsd %xmm1, %xmm9 + mulsd %xmm3, %xmm10 + + subsd %xmm9, %xmm0 + subsd %xmm10, %xmm2 + + mulsd %xmm13, %xmm0 + mulsd %xmm13, %xmm2 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm2, 1 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO2) + movsd %xmm3, 1 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm1, 1 * SIZE(BO) + movsd %xmm2, 2 * SIZE(BO) + movsd %xmm3, 3 * SIZE(BO) +#else + movsd %xmm0, 0 * SIZE(AO) + movsd %xmm2, 1 * SIZE(AO) + movsd %xmm1, 2 * SIZE(AO) + movsd %xmm3, 3 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + testq $1, M + je .L39 + +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm7, %xmm7 + movsd 1 * SIZE(AO), %xmm2 + xorps %xmm5, %xmm5 + + movsd 0 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + movsd 1 * SIZE(BO), %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L35 + ALIGN_4 + +.L32: + addsd %xmm5, %xmm8 + movsd 2 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addsd %xmm7, %xmm9 + movsd 3 * SIZE(BO), %xmm7 + mulsd %xmm0, %xmm3 + movsd 2 * SIZE(AO), %xmm0 + + addsd %xmm1, %xmm8 + movsd 4 * SIZE(BO), %xmm1 + mulsd %xmm2, %xmm5 + + addsd %xmm3, %xmm9 + movsd 5 * SIZE(BO), %xmm3 + mulsd %xmm2, %xmm7 + movsd 3 * SIZE(AO), %xmm2 + + 
addsd %xmm5, %xmm8 + movsd 6 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm1 + + addsd %xmm7, %xmm9 + movsd 7 * SIZE(BO), %xmm7 + mulsd %xmm0, %xmm3 + movsd 4 * SIZE(AO), %xmm0 + + addsd %xmm1, %xmm8 + movsd 8 * SIZE(BO), %xmm1 + mulsd %xmm2, %xmm5 + + addsd %xmm3, %xmm9 + movsd 9 * SIZE(BO), %xmm3 + mulsd %xmm2, %xmm7 + movsd 5 * SIZE(AO), %xmm2 + + addq $4 * SIZE, AO + addq $8 * SIZE, BO + + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + addsd %xmm5, %xmm8 + addsd %xmm7, %xmm9 + + andq $3, %rax + BRANCH + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulsd %xmm0, %xmm1 + addq $2 * SIZE, BO + mulsd %xmm0, %xmm3 + movsd 1 * SIZE(AO), %xmm0 + + addsd %xmm1, %xmm8 + movsd 0 * SIZE(BO), %xmm1 + addsd %xmm3, %xmm9 + movsd 1 * SIZE(BO), %xmm3 + + addq $1 * SIZE, AO + decq %rax + BRANCH + jg .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BO), %xmm0 + movsd 1 * SIZE(BO), %xmm1 + + subsd %xmm8, %xmm0 + subsd %xmm9, %xmm1 +#else + movsd 0 * SIZE(AO), %xmm0 + movsd 1 * SIZE(AO), %xmm1 + + subsd %xmm8, %xmm0 + subsd %xmm9, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(AO), %xmm8 + mulsd %xmm8, %xmm0 + mulsd %xmm8, %xmm1 +#endif + +#ifdef RN + movsd 0 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm0 + movsd 1 * SIZE(BO), %xmm9 + mulsd %xmm0, %xmm9 + movsd 3 * SIZE(BO), %xmm13 + subsd %xmm9, %xmm1 + mulsd %xmm13, %xmm1 +#endif + +#ifdef RT + movsd 3 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm1 + movsd 2 * SIZE(BO), %xmm9 + mulsd %xmm1, %xmm9 + movsd 0 * SIZE(BO), %xmm13 + subsd %xmm9, %xmm0 + mulsd %xmm13, %xmm0 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm1, 1 * SIZE(BO) +#else + movsd %xmm0, 0 * SIZE(AO) + movsd %xmm1, 1 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L39: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + + decq J # j -- + jg .L10 + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/trsm_kernel_RT_4x4_barcelona.S b/kernel/x86_64/trsm_kernel_RT_4x4_barcelona.S new file mode 100644 index 0000000000..400f60ecb9 --- /dev/null +++ b/kernel/x86_64/trsm_kernel_RT_4x4_barcelona.S @@ -0,0 +1,3393 @@ 
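+/* trsm_kernel_RT_4x4_barcelona.S: appears to be the double-precision TRSM     */
+/* micro-kernel with 4x4 register blocking, presumably tuned for AMD's         */
+/* Barcelona (family 10h) cores; the LN/LT/RN/RT preprocessor conditionals     */
+/* select which side/transpose case the assembled object handles.              */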
+/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define CO2 %r12 +#define BB %rbp +#define J %rbx + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#define OFFSET 48(%rsp) +#define AORIG 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define AORIG 232(%rsp) +#define KK 240(%rsp) +#define KKK 248(%rsp) + +#endif + +#define PREFETCH prefetch +#define PREFETCHSIZE (8 * 7 + 0) + +#define movlpd movsd +#define movapd movups +#define movupd movups + +#define KERNEL1(xx) \ + mulpd %xmm1, %xmm0 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm8 ;\ + movapd %xmm2, %xmm0 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO, %rax, 4) ;\ + addpd %xmm1, %xmm12 ;\ + movddup -14 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm0, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -13 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm0 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm10 ;\ + movapd -12 * SIZE(AO, %rax, 4), %xmm0 ;\ + addpd %xmm1, %xmm14 ;\ + movddup -12 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -11 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm0, %xmm2 + +#define KERNEL2(xx) \ + mulpd %xmm1, %xmm0 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm8 ;\ + movapd %xmm2, %xmm0 ;\ + addpd %xmm1, %xmm12 ;\ + movddup -10 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm0, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -9 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm0 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm10 ;\ + addpd %xmm1, %xmm14 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -7 * SIZE(BO, %rax, 4), %xmm3 ;\ +/**/ movddup (BO, %rax, 4), %xmm1 ;\ + movapd %xmm4, %xmm2 + +#define KERNEL3(xx) \ + mulpd %xmm5, %xmm4 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm8 ;\ + movapd %xmm2, %xmm4 ;\ + addpd %xmm5, %xmm12 ;\ + movddup -6 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm4, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -5 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm4 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm10 ;\ + movapd -4 * SIZE(AO, %rax, 4), %xmm4 ;\ + addpd %xmm5, %xmm14 ;\ + movddup -4 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -3 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm4, %xmm2 + +#define KERNEL4(xx) \ + mulpd %xmm5, %xmm4 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm8 ;\ + movapd %xmm2, %xmm4 ;\ + addpd %xmm5, %xmm12 ;\ + movddup -2 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 
;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm4, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -1 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm4 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ +/***/ movapd (AO, %rax, 4), %xmm6 ;\ + addpd %xmm4, %xmm10 ;\ + addpd %xmm5, %xmm14 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 1 * SIZE(BO, %rax, 4), %xmm3 ;\ + movddup 8 * SIZE(BO, %rax, 4), %xmm5 ;\ + movapd %xmm6, %xmm2 + +#define KERNEL5(xx) \ + mulpd %xmm1, %xmm6 ;\ + mulpd 2 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm6, %xmm8 ;\ + movapd %xmm2, %xmm6 ;\ + addpd %xmm1, %xmm12 ;\ + movddup 2 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ +/**/ movapd 8 * SIZE(AO, %rax, 4), %xmm7 ;\ + movapd %xmm6, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup 3 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm6 ;\ + mulpd 2 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm6, %xmm10 ;\ + movapd 4 * SIZE(AO, %rax, 4), %xmm6 ;\ + addpd %xmm1, %xmm14 ;\ + movddup 4 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 5 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm6, %xmm2 + +#define KERNEL6(xx) \ + mulpd %xmm1, %xmm6 ;\ + mulpd 6 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm6, %xmm8 ;\ + movapd %xmm2, %xmm6 ;\ + addpd %xmm1, %xmm12 ;\ + movddup 6 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm6, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup 7 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm6 ;\ + mulpd 6 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm6, %xmm10 ;\ +/***/ movapd 16 * SIZE(AO, %rax, 4), %xmm0 ;\ + addpd %xmm1, %xmm14 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 9 * SIZE(BO, %rax, 4), %xmm3 ;\ + movddup 16 * SIZE(BO, %rax, 4), %xmm1 ;\ + movapd %xmm7, %xmm2 + +#define KERNEL7(xx) \ + mulpd %xmm5, %xmm7 ;\ + mulpd 10 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm7, %xmm8 ;\ + movapd %xmm2, %xmm7 ;\ + addpd %xmm5, %xmm12 ;\ + movddup 10 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm7, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup 11 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm7 ;\ + mulpd 10 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm7, %xmm10 ;\ + movapd 12 * SIZE(AO, %rax, 4), %xmm7 ;\ + addpd %xmm5, %xmm14 ;\ + movddup 12 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 13 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm7, %xmm2 + +#define KERNEL8(xx) \ + mulpd %xmm5, %xmm7 ;\ + mulpd 14 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm7, %xmm8 ;\ + movapd %xmm2, %xmm7 ;\ + addpd %xmm5, %xmm12 ;\ + movddup 14 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm7, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup 15 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm7 ;\ + mulpd 14 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm7, %xmm10 ;\ + addpd %xmm5, %xmm14 ;\ +/**/ movapd 24 * SIZE(AO, %rax, 4), %xmm4 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 17 * SIZE(BO, %rax, 4), 
%xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + movddup 24 * SIZE(BO, %rax, 4), %xmm5 ;\ + movapd %xmm0, %xmm2 ;\ + addq $8 * SIZE, %rax + +#define KERNEL_SUB1(xx) \ + mulpd %xmm1, %xmm0 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm8 ;\ + movapd %xmm2, %xmm0 ;\ + addpd %xmm1, %xmm12 ;\ + movddup -14 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm0, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -13 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm0 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm10 ;\ + movapd -12 * SIZE(AO, %rax, 4), %xmm0 ;\ + addpd %xmm1, %xmm14 ;\ + movddup -12 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -11 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm0, %xmm2 + +#define KERNEL_SUB2(xx) \ + mulpd %xmm1, %xmm0 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm8 ;\ + movapd %xmm2, %xmm0 ;\ + addpd %xmm1, %xmm12 ;\ + movddup -10 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm0, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -9 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm0 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm10 ;\ + movapd (AO, %rax, 4), %xmm0 ;\ + addpd %xmm1, %xmm14 ;\ + movddup (BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -7 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm4, %xmm2 + +#define KERNEL_SUB3(xx) \ + mulpd %xmm5, %xmm4 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm8 ;\ + movapd %xmm2, %xmm4 ;\ + addpd %xmm5, %xmm12 ;\ + movddup -6 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm4, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -5 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm4 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm10 ;\ + movapd -4 * SIZE(AO, %rax, 4), %xmm4 ;\ + addpd %xmm5, %xmm14 ;\ + movddup -4 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -3 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm4, %xmm2 + +#define KERNEL_SUB4(xx) \ + mulpd %xmm5, %xmm4 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm8 ;\ + movapd %xmm2, %xmm4 ;\ + addpd %xmm5, %xmm12 ;\ + movddup -2 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm4, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -1 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm4 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm10 ;\ + addpd %xmm5, %xmm14 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 1 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm0, %xmm2 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) 
+ movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + movsd OLD_OFFSET, %xmm12 +#else + movq STACKSIZE + 8(%rsp), LDC + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + + movq OLD_M, M + movq OLD_N, N + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + movsd %xmm12, OFFSET + movsd %xmm12, KK + + leaq (, LDC, SIZE), LDC + +#ifdef LN + leaq (, M, SIZE), %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + leaq (, N, SIZE), %rax + imulq K, %rax + addq %rax, B + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, N + je .L40 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + + movq C, CO1 # coffset1 = c +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L100 + ALIGN_4 + +.L91: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + movq B, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (BO, %rax, SIZE), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd -8 * SIZE(AO), %xmm2 + pxor %xmm9, %xmm9 + movddup -16 * SIZE(BO), %xmm1 + pxor %xmm10, %xmm10 + movddup -15 * SIZE(BO), %xmm5 + pxor %xmm11, %xmm11 + movddup -14 * SIZE(BO), %xmm3 + +#ifndef LN + prefetchw 3 * SIZE(CO1) +#else + prefetchw -8 * SIZE(CO1) +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO + negq %rax + NOBRANCH + je .L96 + ALIGN_4 + +.L92: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm9 + movddup -12 * SIZE(BO, %rax, 1), %xmm1 + mulpd %xmm5, %xmm0 + mulpd -10 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm0, %xmm10 + movapd (AO, %rax, 4), %xmm0 + addpd %xmm5, %xmm11 + movddup -13 * SIZE(BO, %rax, 1), %xmm5 + mulpd %xmm3, %xmm2 + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 + addpd %xmm2, %xmm8 + movapd -4 * SIZE(AO, %rax, 4), %xmm2 + addpd %xmm3, %xmm9 + movddup -10 * SIZE(BO, %rax, 1), %xmm3 + mulpd %xmm5, %xmm2 + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm10 + movapd 8 * SIZE(AO, %rax, 4), %xmm2 + addpd %xmm5, %xmm11 + movddup -11 * SIZE(BO, %rax, 1), %xmm5 + + addq $4 * SIZE, %rax + BRANCH + jl .L92 + ALIGN_4 + +.L96: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L99 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO + negq %rax + ALIGN_4 + +.L97: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm9 + movddup -15 * SIZE(BO, %rax, 1), %xmm1 + + addq $SIZE, %rax + jl .L97 + ALIGN_4 +.L99: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $1, %rax +#endif + + leaq 
(, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm10 + movapd -14 * SIZE(BO), %xmm11 + + subpd %xmm8, %xmm10 + subpd %xmm9, %xmm11 +#else + movapd -16 * SIZE(AO), %xmm10 + movapd -14 * SIZE(AO), %xmm11 + + subpd %xmm8, %xmm10 + subpd %xmm9, %xmm11 +#endif + +#ifdef LN + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movapd %xmm11, %xmm9 + unpckhpd %xmm9, %xmm9 + + movsd -1 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm9 + + movsd -2 * SIZE(AO), %xmm13 + mulsd %xmm9, %xmm13 + subsd %xmm13, %xmm11 + movsd -3 * SIZE(AO), %xmm14 + mulsd %xmm9, %xmm14 + subsd %xmm14, %xmm8 + movsd -4 * SIZE(AO), %xmm15 + mulsd %xmm9, %xmm15 + subsd %xmm15, %xmm10 + + movsd -6 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm11 + + movsd -7 * SIZE(AO), %xmm13 + mulsd %xmm11, %xmm13 + subsd %xmm13, %xmm8 + movsd -8 * SIZE(AO), %xmm14 + mulsd %xmm11, %xmm14 + subsd %xmm14, %xmm10 + + movsd -11 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + movsd -12 * SIZE(AO), %xmm13 + mulsd %xmm8, %xmm13 + subsd %xmm13, %xmm10 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + unpcklpd %xmm8, %xmm10 + unpcklpd %xmm9, %xmm11 +#endif + +#ifdef LT + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movapd %xmm11, %xmm9 + unpckhpd %xmm9, %xmm9 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + movsd -15 * SIZE(AO), %xmm13 + mulsd %xmm10, %xmm13 + subsd %xmm13, %xmm8 + movsd -14 * SIZE(AO), %xmm14 + mulsd %xmm10, %xmm14 + subsd %xmm14, %xmm11 + movsd -13 * SIZE(AO), %xmm15 + mulsd %xmm10, %xmm15 + subsd %xmm15, %xmm9 + + movsd -11 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + movsd -10 * SIZE(AO), %xmm13 + mulsd %xmm8, %xmm13 + subsd %xmm13, %xmm11 + movsd -9 * SIZE(AO), %xmm14 + mulsd %xmm8, %xmm14 + subsd %xmm14, %xmm9 + + movsd -6 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm11 + + movsd -5 * SIZE(AO), %xmm13 + mulsd %xmm11, %xmm13 + subsd %xmm13, %xmm9 + + movsd -1 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm9 + + unpcklpd %xmm8, %xmm10 + unpcklpd %xmm9, %xmm11 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm10 + mulpd %xmm8, %xmm11 +#endif + +#ifdef RT + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm10 + mulpd %xmm8, %xmm11 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + + movlpd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) + movlpd %xmm11, 2 * SIZE(CO1) + movhpd %xmm11, 3 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movaps %xmm10, -16 * SIZE(BO) + movaps %xmm11, -14 * SIZE(BO) +#else + movaps %xmm10, -16 * SIZE(AO) + movaps %xmm11, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + addq %rax, BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L91 + ALIGN_4 + +.L100: + testq $2, M + je .L110 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + movq B, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (BO, %rax, SIZE), BO +#endif + + movddup -16 * SIZE(BO), %xmm0 + pxor %xmm8, %xmm8 + movddup -15 * SIZE(BO), %xmm1 + pxor %xmm9, %xmm9 + movddup -14 * SIZE(BO), %xmm2 + pxor %xmm10, %xmm10 + movddup -13 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + +#if 
defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO + negq %rax + NOBRANCH + je .L106 + ALIGN_4 + +.L102: + mulpd -16 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm0, %xmm8 + movddup -12 * SIZE(BO, %rax, 1), %xmm0 + + mulpd -14 * SIZE(AO, %rax, 2), %xmm1 + addpd %xmm1, %xmm9 + movddup -11 * SIZE(BO, %rax, 1), %xmm1 + + mulpd -12 * SIZE(AO, %rax, 2), %xmm2 + addpd %xmm2, %xmm10 + movddup -10 * SIZE(BO, %rax, 1), %xmm2 + + mulpd -10 * SIZE(AO, %rax, 2), %xmm3 + addpd %xmm3, %xmm11 + movddup -9 * SIZE(BO, %rax, 1), %xmm3 + + addq $4 * SIZE, %rax + BRANCH + jl .L102 + ALIGN_4 + +.L106: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L109 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO + negq %rax + ALIGN_4 + +.L107: + movddup -16 * SIZE(BO, %rax, 1), %xmm0 + mulpd -16 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm0, %xmm8 + + addq $SIZE, %rax + jl .L107 + ALIGN_4 + +.L109: + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 + addpd %xmm10, %xmm8 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm10 + subpd %xmm8, %xmm10 +#else + movapd -16 * SIZE(AO), %xmm10 + subpd %xmm8, %xmm10 +#endif + +#ifdef LN + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movsd -13 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + movsd -14 * SIZE(AO), %xmm13 + mulsd %xmm8, %xmm13 + subsd %xmm13, %xmm10 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + unpcklpd %xmm8, %xmm10 +#endif + +#ifdef LT + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + movsd -15 * SIZE(AO), %xmm13 + mulsd %xmm10, %xmm13 + subsd %xmm13, %xmm8 + + movsd -13 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + unpcklpd %xmm8, %xmm10 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm10 +#endif + +#ifdef RT + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm10 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) +#else + movlpd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm10, -16 * SIZE(BO) +#else + movaps %xmm10, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + addq %rax, BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L110: + testq $1, M + je .L119 + +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#endif + + movq B, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (BO, %rax, SIZE), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd -14 * SIZE(AO), %xmm1 + pxor %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, 
%rax, 1), AO + leaq (BO, %rax, 1), BO + negq %rax + NOBRANCH + je .L116 + ALIGN_4 + +.L112: + mulpd -16 * SIZE(BO, %rax, 1), %xmm0 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO, %rax, 1), %xmm0 + + mulpd -14 * SIZE(BO, %rax, 1), %xmm1 + addpd %xmm1, %xmm9 + movapd -10 * SIZE(AO, %rax, 1), %xmm1 + + addq $4 * SIZE, %rax + BRANCH + jl .L112 + ALIGN_4 + +.L116: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L118 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO + negq %rax + ALIGN_4 + +.L117: + mulsd -16 * SIZE(BO, %rax, 1), %xmm0 + addsd %xmm0, %xmm8 + movsd -15 * SIZE(AO, %rax, 1), %xmm0 + + addq $SIZE, %rax + jl .L117 + ALIGN_4 + +.L118: + addpd %xmm9, %xmm8 + haddpd %xmm8, %xmm8 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(BO), %xmm10 + subsd %xmm8, %xmm10 +#else + movsd -16 * SIZE(AO), %xmm10 + subsd %xmm8, %xmm10 +#endif + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 +#endif + +#if defined(RN) || defined(RT) + movsd -16 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm10 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + + movsd %xmm10, 0 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movlpd %xmm10, -16 * SIZE(BO) +#else + movlpd %xmm10, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + addq %rax, AO + addq %rax, BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L119: +#ifdef LN + leaq (B, K, SIZE), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L40: + testq $2, N + je .L80 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + movq B, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (BO, %rax, 2), BO +#endif + + movddup -16 * SIZE(BO), %xmm1 + movddup -15 * SIZE(BO), %xmm5 + pxor %xmm8, %xmm8 + movddup -12 * SIZE(BO), %xmm3 + pxor %xmm9, %xmm9 + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm12, %xmm12 + movapd -8 * SIZE(AO), %xmm4 + pxor %xmm13, %xmm13 + +#ifndef LN + prefetchw 3 * SIZE(CO1) + movapd %xmm0, %xmm2 + prefetchw 5 * SIZE(CO2) +#else + prefetchw -4 * SIZE(CO1) + movapd %xmm0, %xmm2 + prefetchw -4 * SIZE(CO2) +#endif + + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), 
AO + leaq (BO, %rax, 2), BO + negq %rax + NOBRANCH + je .L56 + ALIGN_4 + +.L52: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm12 + movddup -14 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm5, %xmm2 + mulpd -14 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm9 + addpd %xmm5, %xmm13 + movddup -13 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm0, %xmm2 + mulpd %xmm1, %xmm0 + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd (AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm12 + movddup -8 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm5, %xmm2 + mulpd -10 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm9 + addpd %xmm5, %xmm13 + movddup -11 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm4, %xmm2 + mulpd %xmm3, %xmm4 + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 + addpd %xmm4, %xmm8 + movapd -4 * SIZE(AO, %rax, 4), %xmm4 + addpd %xmm3, %xmm12 + movddup -10 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm5, %xmm2 + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm9 + addpd %xmm5, %xmm13 + movddup -9 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm4, %xmm2 + mulpd %xmm3, %xmm4 + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 + addpd %xmm4, %xmm8 + movapd 8 * SIZE(AO, %rax, 4), %xmm4 + addpd %xmm3, %xmm12 + movddup -4 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm5, %xmm2 + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm9 + addpd %xmm5, %xmm13 + movddup -7 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm0, %xmm2 + + addq $4 * SIZE, %rax + BRANCH + jl .L52 + ALIGN_4 + +.L56: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L59 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO + negq %rax + ALIGN_4 + +.L57: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm12 + movddup -14 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm5, %xmm2 + mulpd -14 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm9 + addpd %xmm5, %xmm13 + movddup -13 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm0, %xmm2 + + addq $SIZE, %rax + jl .L57 + ALIGN_4 + +.L59: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd %xmm12, %xmm4 + unpcklpd %xmm13, %xmm12 + unpckhpd %xmm13, %xmm4 + + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm13 + movapd -12 * SIZE(BO), %xmm1 + movapd -10 * SIZE(BO), %xmm5 + + subpd %xmm8, %xmm9 + subpd %xmm0, %xmm13 + subpd %xmm12, %xmm1 + subpd %xmm4, %xmm5 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -12 * SIZE(AO), %xmm2 + movapd -10 * SIZE(AO), %xmm3 + + subpd %xmm8, %xmm0 + subpd %xmm12, %xmm1 + subpd %xmm9, %xmm2 + subpd %xmm13, %xmm3 +#endif + +#ifdef LN + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 + movddup -2 * SIZE(AO), %xmm10 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm1 + movddup -3 * SIZE(AO), %xmm12 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm13 + movddup -4 * SIZE(AO), %xmm14 + mulpd %xmm5, %xmm14 + subpd %xmm14, %xmm9 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + movddup -7 * SIZE(AO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm13 + movddup -8 * SIZE(AO), %xmm12 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm9 + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, 
%xmm13 + movddup -12 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + movddup -14 * SIZE(AO), %xmm12 + mulpd %xmm9, %xmm12 + subpd %xmm12, %xmm1 + movddup -13 * SIZE(AO), %xmm14 + mulpd %xmm9, %xmm14 + subpd %xmm14, %xmm5 + + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + + movddup -10 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm1 + movddup -9 * SIZE(AO), %xmm12 + mulpd %xmm13, %xmm12 + subpd %xmm12, %xmm5 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + movddup -5 * SIZE(AO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm5 + + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 + + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm1, %xmm9 + subpd %xmm9, %xmm3 + + movddup -13 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 +#endif + +#ifdef RT + movddup -13 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 + + movddup -14 * SIZE(BO), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + movddup -14 * SIZE(BO), %xmm9 + mulpd %xmm3, %xmm9 + subpd %xmm9, %xmm1 + + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm9, 0 * SIZE(CO1) + movlpd %xmm13, 1 * SIZE(CO1) + movlpd %xmm1, 2 * SIZE(CO1) + movlpd %xmm5, 3 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + movhpd %xmm1, 2 * SIZE(CO2) + movhpd %xmm5, 3 * SIZE(CO2) +#else + movlpd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movlpd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movlpd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + movlpd %xmm3, 2 * SIZE(CO2) + movhpd %xmm3, 3 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm9, -16 * SIZE(BO) + movaps %xmm13,-14 * SIZE(BO) + movaps %xmm1, -12 * SIZE(BO) + movaps %xmm5, -10 * SIZE(BO) +#else + movaps %xmm0, -16 * SIZE(AO) + movaps %xmm1, -14 * SIZE(AO) + movaps %xmm2, -12 * SIZE(AO) + movaps %xmm3, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L51 + ALIGN_4 + +.L60: + testq $2, M + je .L70 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + movq B, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (BO, %rax, 2), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd -12 * SIZE(AO), %xmm2 + pxor %xmm9, %xmm9 + movddup -16 * SIZE(BO), %xmm1 + pxor %xmm10, %xmm10 + movddup -15 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + 
leaq (BO, %rax, 2), BO + negq %rax + NOBRANCH + je .L66 + ALIGN_4 + +.L62: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movddup -14 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm0, %xmm3 + movapd -14 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm3, %xmm9 + movddup -13 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movddup -12 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm0, %xmm3 + movapd -8 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm3, %xmm11 + movddup -11 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm8 + movddup -10 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm2, %xmm3 + movapd -10 * SIZE(AO, %rax, 2), %xmm2 + addpd %xmm3, %xmm9 + movddup -9 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm10 + movddup -8 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm2, %xmm3 + movapd -4 * SIZE(AO, %rax, 2), %xmm2 + addpd %xmm3, %xmm11 + movddup -7 * SIZE(BO, %rax, 2), %xmm3 + + addq $4 * SIZE, %rax + BRANCH + jl .L62 + ALIGN_4 + +.L66: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L69 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO + negq %rax + ALIGN_4 + +.L67: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movddup -14 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm0, %xmm3 + movapd -14 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm3, %xmm9 + movddup -13 * SIZE(BO, %rax, 2), %xmm3 + + addq $SIZE, %rax + jl .L67 + ALIGN_4 + +.L69: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm13 + + subpd %xmm8, %xmm9 + subpd %xmm0, %xmm13 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm2 + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm2 +#endif + + +#ifdef LN + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + + movddup -14 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + + movddup -13 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 +#endif + +#ifdef RT + movddup -13 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + + movddup -14 * SIZE(BO), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm9, 0 * SIZE(CO1) + movlpd %xmm13, 1 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) +#else + movlpd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + + movlpd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm9, -16 * SIZE(BO) + movaps %xmm13, -14 * SIZE(BO) +#else + movaps %xmm0, -16 * SIZE(AO) + movaps %xmm2, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + 
leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L70: + testq $1, M + je .L79 + ALIGN_4 + +.L71: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#endif + + movq B, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 1), BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movddup -15 * SIZE(AO), %xmm1 + pxor %xmm9, %xmm9 + movddup -14 * SIZE(AO), %xmm2 + pxor %xmm10, %xmm10 + movddup -13 * SIZE(AO), %xmm3 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO + negq %rax + NOBRANCH + je .L76 + ALIGN_4 + +.L72: + mulpd -16 * SIZE(BO, %rax, 2), %xmm0 + addpd %xmm0, %xmm8 + movddup -12 * SIZE(AO, %rax, 1), %xmm0 + + mulpd -14 * SIZE(BO, %rax, 2), %xmm1 + addpd %xmm1, %xmm9 + movddup -11 * SIZE(AO, %rax, 1), %xmm1 + + mulpd -12 * SIZE(BO, %rax, 2), %xmm2 + addpd %xmm2, %xmm10 + movddup -10 * SIZE(AO, %rax, 1), %xmm2 + + mulpd -10 * SIZE(BO, %rax, 2), %xmm3 + addpd %xmm3, %xmm11 + movddup -9 * SIZE(AO, %rax, 1), %xmm3 + + addq $4 * SIZE, %rax + BRANCH + jl .L72 + ALIGN_4 + +.L76: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L78 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO + negq %rax + ALIGN_4 + +.L77: + mulpd -16 * SIZE(BO, %rax, 2), %xmm0 + addpd %xmm0, %xmm8 + movddup -15 * SIZE(AO, %rax, 1), %xmm0 + + addq $SIZE, %rax + jl .L77 + ALIGN_4 + +.L78: + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 + addpd %xmm10, %xmm8 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm2 +#else + movapd -16 * SIZE(AO), %xmm2 +#endif + + subpd %xmm8, %xmm2 + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm0 + + mulpd %xmm0, %xmm2 +#endif + +#ifdef RN + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + mulsd -16 * SIZE(BO), %xmm2 + movsd -15 * SIZE(BO), %xmm4 + mulsd %xmm2, %xmm4 + subsd %xmm4, %xmm0 + + mulsd -13 * SIZE(BO), %xmm0 + unpcklpd %xmm0, %xmm2 +#endif + +#ifdef RT + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + mulsd -13 * SIZE(BO), %xmm0 + + movlpd -14 * SIZE(BO), %xmm4 + mulsd %xmm0, %xmm4 + subsd %xmm4, %xmm2 + + mulsd -16 * SIZE(BO), %xmm2 + unpcklpd %xmm0, %xmm2 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + movlpd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movaps %xmm2, -16 * SIZE(BO) +#else + movaps %xmm2, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + 
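+/* Note (editorial, hedged): the block below appears to close the two-column
+   (N & 2) panel of this solve kernel.  At .L79 the B pointer is advanced past
+   the 2-wide panel under LN (B += 2 * K * SIZE) or set from BO under LT/RN,
+   and KK is adjusted by 2 (added for RN, subtracted for RT).  .L80 then enters
+   the main loop over four-column panels (J = N >> 2), repeating the same row
+   blocking seen above: 4-row blocks at .L11, 2-row blocks at .L21, and the
+   single-row tail at .L30. */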
+.L79: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L80: + movq N, J + sarq $2, J # j = (n >> 2) + jle .L999 + +.L01: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc +#ifndef RT + leaq (C, LDC, 4), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + movq K, %rax + salq $BASE_SHIFT + 2, %rax + movq B, BB + subq %rax, BB + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + movq B, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (BO, %rax, 4), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + movddup -16 * SIZE(BO), %xmm1 + pxor %xmm8, %xmm8 + movddup -15 * SIZE(BO), %xmm3 + pxor %xmm9, %xmm9 + movapd -8 * SIZE(AO), %xmm4 + pxor %xmm10, %xmm10 + movddup -8 * SIZE(BO), %xmm5 + pxor %xmm11, %xmm11 + +#ifndef LN + prefetchw 3 * SIZE(CO1) + pxor %xmm12, %xmm12 + prefetchw 5 * SIZE(CO2) + pxor %xmm13, %xmm13 + prefetchw 3 * SIZE(CO1, LDC, 2) + pxor %xmm14, %xmm14 + prefetchw 5 * SIZE(CO2, LDC, 2) + pxor %xmm15, %xmm15 + movapd %xmm0, %xmm2 +#else + prefetchw -8 * SIZE(CO1) + pxor %xmm12, %xmm12 + prefetchw -8 * SIZE(CO2) + pxor %xmm13, %xmm13 + prefetchw -8 * SIZE(CO1, LDC, 2) + pxor %xmm14, %xmm14 + prefetchw -8 * SIZE(CO2, LDC, 2) + pxor %xmm15, %xmm15 + movapd %xmm0, %xmm2 +#endif + + prefetch -16 * SIZE(BB) + prefetch -8 * SIZE(BB) + subq $-16 * SIZE, BB + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + + andq $-8, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO + negq %rax + NOBRANCH + je .L15 + ALIGN_4 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + BRANCH + jl .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + testq $4, %rax + je .L16 + xorq %rax, %rax + ALIGN_4 + + KERNEL_SUB1(16 * 0) + KERNEL_SUB2(16 * 0) + KERNEL_SUB3(16 * 0) + KERNEL_SUB4(16 * 0) + + subq $-16 * SIZE, BO + subq $-16 * SIZE, AO + ALIGN_4 + +.L16: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L19 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO + negq %rax + ALIGN_4 + +.L17: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd %xmm2, %xmm0 + addpd %xmm1, %xmm12 + movddup -14 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm3, %xmm2 + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 + addpd %xmm2, %xmm9 + movapd %xmm0, %xmm2 + addpd %xmm3, %xmm13 + movddup -13 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm10 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm14 + movddup -12 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm3, %xmm2 + mulpd -14 * 
SIZE(AO, %rax, 4), %xmm3 + addpd %xmm2, %xmm11 + addpd %xmm3, %xmm15 + movddup -11 * SIZE(BO, %rax, 4), %xmm3 + movapd %xmm0, %xmm2 + + addq $SIZE, %rax + jl .L17 + ALIGN_4 + +.L19: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd %xmm10, %xmm2 + unpcklpd %xmm11, %xmm10 + unpckhpd %xmm11, %xmm2 + + movapd %xmm12, %xmm4 + unpcklpd %xmm13, %xmm12 + unpckhpd %xmm13, %xmm4 + + movapd %xmm14, %xmm6 + unpcklpd %xmm15, %xmm14 + unpckhpd %xmm15, %xmm6 + + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm11 + movapd -12 * SIZE(BO), %xmm13 + movapd -10 * SIZE(BO), %xmm15 + movapd -8 * SIZE(BO), %xmm1 + movapd -6 * SIZE(BO), %xmm3 + movapd -4 * SIZE(BO), %xmm5 + movapd -2 * SIZE(BO), %xmm7 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm0, %xmm13 + subpd %xmm2, %xmm15 + subpd %xmm12, %xmm1 + subpd %xmm14, %xmm3 + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -12 * SIZE(AO), %xmm2 + movapd -10 * SIZE(AO), %xmm3 + + movapd -8 * SIZE(AO), %xmm4 + movapd -6 * SIZE(AO), %xmm5 + movapd -4 * SIZE(AO), %xmm6 + movapd -2 * SIZE(AO), %xmm7 + + subpd %xmm8, %xmm0 + subpd %xmm12, %xmm1 + subpd %xmm9, %xmm2 + subpd %xmm13, %xmm3 + subpd %xmm10, %xmm4 + subpd %xmm14, %xmm5 + subpd %xmm11, %xmm6 + subpd %xmm15, %xmm7 +#endif + +#ifdef LN + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 + mulpd %xmm8, %xmm7 + + movddup -2 * SIZE(AO), %xmm10 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm1 + movddup -2 * SIZE(AO), %xmm10 + mulpd %xmm7, %xmm10 + subpd %xmm10, %xmm3 + + movddup -3 * SIZE(AO), %xmm12 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm13 + movddup -3 * SIZE(AO), %xmm12 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm15 + + movddup -4 * SIZE(AO), %xmm14 + mulpd %xmm5, %xmm14 + subpd %xmm14, %xmm9 + movddup -4 * SIZE(AO), %xmm14 + mulpd %xmm7, %xmm14 + subpd %xmm14, %xmm11 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 + + movddup -7 * SIZE(AO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm13 + movddup -7 * SIZE(AO), %xmm10 + mulpd %xmm3, %xmm10 + subpd %xmm10, %xmm15 + + movddup -8 * SIZE(AO), %xmm12 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm9 + movddup -8 * SIZE(AO), %xmm12 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm11 + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 + + movddup -12 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + movddup -12 * SIZE(AO), %xmm10 + mulpd %xmm15, %xmm10 + subpd %xmm10, %xmm11 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 + + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm11, %xmm10 + subpd %xmm10, %xmm15 + + movddup -14 * SIZE(AO), %xmm12 + mulpd %xmm9, %xmm12 + subpd %xmm12, %xmm1 + movddup -14 * SIZE(AO), %xmm12 + mulpd %xmm11, %xmm12 + subpd %xmm12, %xmm3 + + movddup -13 * SIZE(AO), %xmm14 + mulpd %xmm9, %xmm14 + subpd %xmm14, %xmm5 + movddup -13 * SIZE(AO), %xmm14 + mulpd %xmm11, %xmm14 + subpd %xmm14, %xmm7 + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 + + movddup -10 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd 
%xmm10, %xmm1 + movddup -10 * SIZE(AO), %xmm10 + mulpd %xmm15, %xmm10 + subpd %xmm10, %xmm3 + + movddup -9 * SIZE(AO), %xmm12 + mulpd %xmm13, %xmm12 + subpd %xmm12, %xmm5 + movddup -9 * SIZE(AO), %xmm12 + mulpd %xmm15, %xmm12 + subpd %xmm12, %xmm7 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 + + movddup -5 * SIZE(AO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm5 + movddup -5 * SIZE(AO), %xmm10 + mulpd %xmm3, %xmm10 + subpd %xmm10, %xmm7 + + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 + mulpd %xmm8, %xmm7 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 + + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm1, %xmm9 + subpd %xmm9, %xmm3 + + movddup -14 * SIZE(BO), %xmm10 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm4 + movddup -14 * SIZE(BO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm5 + + movddup -13 * SIZE(BO), %xmm11 + mulpd %xmm0, %xmm11 + subpd %xmm11, %xmm6 + movddup -13 * SIZE(BO), %xmm11 + mulpd %xmm1, %xmm11 + subpd %xmm11, %xmm7 + + movddup -11 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 + + movddup -10 * SIZE(BO), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm4 + movddup -10 * SIZE(BO), %xmm9 + mulpd %xmm3, %xmm9 + subpd %xmm9, %xmm5 + + movddup -9 * SIZE(BO), %xmm10 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm6 + movddup -9 * SIZE(BO), %xmm10 + mulpd %xmm3, %xmm10 + subpd %xmm10, %xmm7 + + movddup -6 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm4 + mulpd %xmm8, %xmm5 + + movddup -5 * SIZE(BO), %xmm9 + mulpd %xmm4, %xmm9 + subpd %xmm9, %xmm6 + movddup -5 * SIZE(BO), %xmm9 + mulpd %xmm5, %xmm9 + subpd %xmm9, %xmm7 + + movddup -1 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm6 + mulpd %xmm8, %xmm7 +#endif + +#ifdef RT + movddup -1 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm6 + mulpd %xmm8, %xmm7 + + movddup -2 * SIZE(BO), %xmm9 + mulpd %xmm6, %xmm9 + subpd %xmm9, %xmm4 + movddup -2 * SIZE(BO), %xmm9 + mulpd %xmm7, %xmm9 + subpd %xmm9, %xmm5 + + movddup -3 * SIZE(BO), %xmm10 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm2 + movddup -3 * SIZE(BO), %xmm10 + mulpd %xmm7, %xmm10 + subpd %xmm10, %xmm3 + + movddup -4 * SIZE(BO), %xmm11 + mulpd %xmm6, %xmm11 + subpd %xmm11, %xmm0 + movddup -4 * SIZE(BO), %xmm11 + mulpd %xmm7, %xmm11 + subpd %xmm11, %xmm1 + + movddup -6 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm4 + mulpd %xmm8, %xmm5 + + movddup -7 * SIZE(BO), %xmm9 + mulpd %xmm4, %xmm9 + subpd %xmm9, %xmm2 + movddup -7 * SIZE(BO), %xmm9 + mulpd %xmm5, %xmm9 + subpd %xmm9, %xmm3 + + movddup -8 * SIZE(BO), %xmm10 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm0 + movddup -8 * SIZE(BO), %xmm10 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm1 + + movddup -11 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 + + movddup -12 * SIZE(BO), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + movddup -12 * SIZE(BO), %xmm9 + mulpd %xmm3, %xmm9 + subpd %xmm9, %xmm1 + + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm9, 0 * SIZE(CO1) + movlpd %xmm13, 1 * SIZE(CO1) + movlpd %xmm1, 2 * SIZE(CO1) + movlpd %xmm5, 3 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + movhpd %xmm1, 2 * SIZE(CO2) + movhpd %xmm5, 3 * SIZE(CO2) + + movlpd %xmm11, 0 * SIZE(CO1, LDC, 2) + movlpd %xmm15, 1 * SIZE(CO1, LDC, 2) + movlpd %xmm3, 2 * SIZE(CO1, LDC, 2) + movlpd %xmm7, 3 * SIZE(CO1, LDC, 2) + + movhpd %xmm11, 0 * 
SIZE(CO2, LDC, 2) + movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) + movhpd %xmm3, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) +#else + movlpd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movlpd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movlpd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + movlpd %xmm3, 2 * SIZE(CO2) + movhpd %xmm3, 3 * SIZE(CO2) + + movlpd %xmm4, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) + movlpd %xmm5, 2 * SIZE(CO1, LDC, 2) + movhpd %xmm5, 3 * SIZE(CO1, LDC, 2) + + movlpd %xmm6, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) + movlpd %xmm7, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm9, -16 * SIZE(BO) + movaps %xmm11, -14 * SIZE(BO) + movaps %xmm13, -12 * SIZE(BO) + movaps %xmm15, -10 * SIZE(BO) + movaps %xmm1, -8 * SIZE(BO) + movaps %xmm3, -6 * SIZE(BO) + movaps %xmm5, -4 * SIZE(BO) + movaps %xmm7, -2 * SIZE(BO) +#else + movaps %xmm0, -16 * SIZE(AO) + movaps %xmm1, -14 * SIZE(AO) + movaps %xmm2, -12 * SIZE(AO) + movaps %xmm3, -10 * SIZE(AO) + movaps %xmm4, -8 * SIZE(AO) + movaps %xmm5, -6 * SIZE(AO) + movaps %xmm6, -4 * SIZE(AO) + movaps %xmm7, -2 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L11 + ALIGN_4 + +.L20: + testq $3, M + je .L39 + + testq $2, M + je .L30 + ALIGN_4 + +.L21: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + movq B, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (BO, %rax, 4), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd -12 * SIZE(AO), %xmm2 + pxor %xmm9, %xmm9 + movddup -16 * SIZE(BO), %xmm1 + pxor %xmm10, %xmm10 + movddup -15 * SIZE(BO), %xmm5 + pxor %xmm11, %xmm11 + movddup -8 * SIZE(BO), %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO + negq %rax + NOBRANCH + je .L26 + ALIGN_4 + +.L22: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movddup -14 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + addpd %xmm5, %xmm9 + movddup -13 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movddup -12 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + movapd -14 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm5, %xmm11 + movddup -11 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movddup -10 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + addpd %xmm5, %xmm9 + movddup -9 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movddup (BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + movapd -8 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm5, %xmm11 + movddup -7 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm8 + movddup -6 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm2, %xmm5 + addpd %xmm5, %xmm9 + movddup -5 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm10 + movddup -4 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm2, %xmm5 + 
movapd -10 * SIZE(AO, %rax, 2), %xmm2 + addpd %xmm5, %xmm11 + movddup -3 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm8 + movddup -2 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm2, %xmm5 + addpd %xmm5, %xmm9 + movddup -1 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm10 + movddup 8 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm2, %xmm5 + movapd -4 * SIZE(AO, %rax, 2), %xmm2 + addpd %xmm5, %xmm11 + movddup 1 * SIZE(BO, %rax, 4), %xmm5 + + addq $4 * SIZE, %rax + BRANCH + jl .L22 + ALIGN_4 + +.L26: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L29 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO + negq %rax + ALIGN_4 + +.L27: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movddup -14 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + addpd %xmm5, %xmm9 + movddup -13 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movddup -12 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + movapd -14 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm5, %xmm11 + movddup -11 * SIZE(BO, %rax, 4), %xmm5 + + addq $SIZE, %rax + jl .L27 + ALIGN_4 + +.L29: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd %xmm10, %xmm2 + unpcklpd %xmm11, %xmm10 + unpckhpd %xmm11, %xmm2 + + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm11 + movapd -12 * SIZE(BO), %xmm13 + movapd -10 * SIZE(BO), %xmm15 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm0, %xmm13 + subpd %xmm2, %xmm15 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm2 + movapd -12 * SIZE(AO), %xmm4 + movapd -10 * SIZE(AO), %xmm6 + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm2 + subpd %xmm10, %xmm4 + subpd %xmm11, %xmm6 +#endif + +#ifdef LN + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 + + movddup -14 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + movddup -14 * SIZE(AO), %xmm10 + mulpd %xmm15, %xmm10 + subpd %xmm10, %xmm11 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 + + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm11, %xmm10 + subpd %xmm10, %xmm15 + + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + movddup -14 * SIZE(BO), %xmm10 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm4 + movddup -13 * SIZE(BO), %xmm11 + mulpd %xmm0, %xmm11 + subpd %xmm11, %xmm6 + + movddup -11 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + movddup -10 * SIZE(BO), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm4 + movddup -9 * SIZE(BO), %xmm10 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm6 + + movddup -6 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm4 + + movddup -5 * SIZE(BO), %xmm9 + mulpd %xmm4, %xmm9 + subpd %xmm9, %xmm6 + + movddup -1 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm6 +#endif + +#ifdef RT + movddup -1 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm6 + + movddup -2 * SIZE(BO), %xmm9 + mulpd %xmm6, %xmm9 + subpd %xmm9, %xmm4 + movddup -3 * 
SIZE(BO), %xmm10 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm2 + movddup -4 * SIZE(BO), %xmm11 + mulpd %xmm6, %xmm11 + subpd %xmm11, %xmm0 + + movddup -6 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm4 + movddup -7 * SIZE(BO), %xmm9 + mulpd %xmm4, %xmm9 + subpd %xmm9, %xmm2 + movddup -8 * SIZE(BO), %xmm10 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm0 + + movddup -11 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + movddup -12 * SIZE(BO), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm9, 0 * SIZE(CO1) + movlpd %xmm13, 1 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + + movlpd %xmm11, 0 * SIZE(CO1, LDC, 2) + movlpd %xmm15, 1 * SIZE(CO1, LDC, 2) + + movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) +#else + movlpd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + + movlpd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + + movlpd %xmm4, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) + + movlpd %xmm6, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm9, -16 * SIZE(BO) + movaps %xmm11, -14 * SIZE(BO) + movaps %xmm13, -12 * SIZE(BO) + movaps %xmm15, -10 * SIZE(BO) +#else + movaps %xmm0, -16 * SIZE(AO) + movaps %xmm2, -14 * SIZE(AO) + movaps %xmm4, -12 * SIZE(AO) + movaps %xmm6, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + testq $1, M + je .L39 + +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#endif + + movq B, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (BO, %rax, 4), BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movddup -14 * SIZE(AO), %xmm2 + pxor %xmm9, %xmm9 + movddup -15 * SIZE(AO), %xmm4 + pxor %xmm10, %xmm10 + movapd -16 * SIZE(BO), %xmm1 + pxor %xmm11, %xmm11 + movapd -8 * SIZE(BO), %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO + negq %rax + NOBRANCH + je .L36 + ALIGN_4 + +.L32: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BO, %rax, 4), %xmm0 + addpd %xmm1, %xmm8 + movapd -12 * SIZE(BO, %rax, 4), %xmm1 + addpd %xmm0, %xmm9 + movddup -12 * SIZE(AO, %rax, 1), %xmm0 + mulpd %xmm4, %xmm1 + mulpd -10 * SIZE(BO, %rax, 4), %xmm4 + addpd %xmm1, %xmm10 + movapd (BO, %rax, 4), %xmm1 + addpd %xmm4, %xmm11 + movddup -11 * SIZE(AO, %rax, 1), %xmm4 + mulpd %xmm2, %xmm3 + mulpd -6 * SIZE(BO, %rax, 4), %xmm2 + addpd %xmm3, %xmm8 + movapd -4 * SIZE(BO, %rax, 4), %xmm3 + addpd %xmm2, %xmm9 + movddup -13 * SIZE(AO, %rax, 1), %xmm2 + mulpd %xmm2, %xmm3 + mulpd -2 * SIZE(BO, %rax, 4), %xmm2 + addpd %xmm3, %xmm10 + movapd 8 * SIZE(BO, %rax, 4), %xmm3 + addpd %xmm2, %xmm11 + movddup -10 * SIZE(AO, %rax, 1), %xmm2 + + addq $4 * SIZE, %rax + BRANCH + jl .L32 + ALIGN_4 + +.L36: +#if defined(LT) || 
defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L38 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO + negq %rax + ALIGN_4 + +.L37: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BO, %rax, 4), %xmm0 + addpd %xmm1, %xmm8 + movapd -12 * SIZE(BO, %rax, 4), %xmm1 + addpd %xmm0, %xmm9 + movddup -15 * SIZE(AO, %rax, 1), %xmm0 + + addq $SIZE, %rax + jl .L37 + ALIGN_4 + +.L38: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + + subpd %xmm8, %xmm2 + subpd %xmm9, %xmm3 +#else + movapd -16 * SIZE(AO), %xmm2 + movapd -14 * SIZE(AO), %xmm3 + + subpd %xmm8, %xmm2 + subpd %xmm9, %xmm3 +#endif + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 +#endif + +#ifdef RN + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movapd %xmm3, %xmm1 + unpckhpd %xmm1, %xmm1 + + movsd -16 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm2 + + movsd -15 * SIZE(BO), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + movsd -14 * SIZE(BO), %xmm6 + mulsd %xmm2, %xmm6 + subsd %xmm6, %xmm3 + movsd -13 * SIZE(BO), %xmm7 + mulsd %xmm2, %xmm7 + subsd %xmm7, %xmm1 + + movsd -11 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd -10 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm3 + movsd -9 * SIZE(BO), %xmm6 + mulsd %xmm0, %xmm6 + subsd %xmm6, %xmm1 + + movsd -6 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm3 + + movsd -5 * SIZE(BO), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm1 + + movsd -1 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm1 + + unpcklpd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm3 +#endif + +#ifdef RT + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movapd %xmm3, %xmm1 + unpckhpd %xmm1, %xmm1 + + movsd -1 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm1 + + movsd -2 * SIZE(BO), %xmm5 + mulsd %xmm1, %xmm5 + subsd %xmm5, %xmm3 + movsd -3 * SIZE(BO), %xmm6 + mulsd %xmm1, %xmm6 + subsd %xmm6, %xmm0 + movsd -4 * SIZE(BO), %xmm7 + mulsd %xmm1, %xmm7 + subsd %xmm7, %xmm2 + + movsd -6 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm3 + + movsd -7 * SIZE(BO), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm0 + movsd -8 * SIZE(BO), %xmm6 + mulsd %xmm3, %xmm6 + subsd %xmm6, %xmm2 + + movsd -11 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd -12 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + + movsd -16 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm3 + +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movlpd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO2) + movlpd %xmm3, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) +#else + movlpd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO2) + movlpd %xmm3, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, -16 * SIZE(BO) + movaps %xmm3, -14 * SIZE(BO) +#else + movaps %xmm2, -16 * SIZE(AO) + movaps %xmm3, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK 
+#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L39: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + + decq J # j -- + jg .L01 + ALIGN_4 + +.L999: + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/trsm_kernel_RT_4x4_core2.S b/kernel/x86_64/trsm_kernel_RT_4x4_core2.S new file mode 100644 index 0000000000..89d07cef52 --- /dev/null +++ b/kernel/x86_64/trsm_kernel_RT_4x4_core2.S @@ -0,0 +1,3737 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define J 0(%rsp) +#define OFFSET 8(%rsp) +#define KK 16(%rsp) +#define KKK 24(%rsp) +#define AORIG 32(%rsp) +#define BORIG 40(%rsp) +#define BUFFER 128(%rsp) + +#define PREFETCH_R (8 * 4 + 0) +#define PREFETCH_W (PREFETCH_R) + +#define PREFETCHSIZE (8 * 17 + 2) +#define PREFETCH prefetcht0 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C +#endif + + movq OLD_LDC, LDC + movq OLD_OFFSET, %rax + + movq %rsp, %r15 # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movq %rax, KK + movq %rax, OFFSET + + movq OLD_M, M + movq OLD_N, N + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + leaq (, LDC, SIZE), LDC + +#ifdef LN + leaq (, M, SIZE), %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + leaq (, N, SIZE), %rax + imulq K, %rax + addq %rax, B + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, N + je .L40 + ALIGN_4 + +.L81: +/* Copying to Sub Buffer */ + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + leaq (, %rax, SIZE), %rax + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + jle .L83 + ALIGN_4 + +.L82: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + movddup -14 * SIZE(B), %xmm2 + movddup -13 * SIZE(B), %xmm3 + movddup -12 * SIZE(B), %xmm4 + movddup -11 * SIZE(B), %xmm5 + movddup -10 * SIZE(B), %xmm6 + movddup -9 * SIZE(B), %xmm7 + + movapd %xmm0, 0 * SIZE(BO) + movapd %xmm1, 2 * SIZE(BO) + movapd %xmm2, 4 * SIZE(BO) + movapd %xmm3, 6 * SIZE(BO) + movapd %xmm4, 8 * SIZE(BO) + movapd %xmm5, 10 * SIZE(BO) + movapd %xmm6, 12 * SIZE(BO) + movapd %xmm7, 14 * SIZE(BO) + + addq $ 8 * SIZE, B + subq $-16 * SIZE, BO + subq $1, %rax + jne .L82 + ALIGN_4 + +.L83: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax 
+#endif + andq $7, %rax + BRANCH + jle .L90 + ALIGN_4 + +.L84: + movddup -16 * SIZE(B), %xmm0 + + movapd %xmm0, 0 * SIZE(BO) + + addq $1 * SIZE, B + addq $2 * SIZE, BO + subq $1, %rax + jne .L84 + ALIGN_4 + +.L90: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + subq LDC, C +#endif + + movq C, CO1 # coffset1 = c +#ifndef RT + addq LDC, C +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L100 + ALIGN_4 + +.L91: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 +#ifdef LN + prefetcht2 -3 * SIZE(CO1) +#else + prefetcht2 3 * SIZE(CO1) +#endif + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L95 + ALIGN_4 + +.L92: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -16 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + + movapd -12 * SIZE(AO), %xmm0 + movapd -10 * SIZE(AO), %xmm1 + movapd -14 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm11 + + movapd -8 * SIZE(AO), %xmm0 + movapd -6 * SIZE(AO), %xmm1 + movapd -12 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + + movapd -4 * SIZE(AO), %xmm0 + movapd -2 * SIZE(AO), %xmm1 + movapd -10 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm11 + + subq $-16 * SIZE, AO + subq $ -8 * SIZE, BO + subq $1, %rax + jne .L92 + ALIGN_4 + +.L95: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L99 + ALIGN_4 + +.L96: + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + + movapd -16 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + subq $1, %rax + jg .L96 + ALIGN_4 + +.L99: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm10 + movapd -14 * SIZE(B), %xmm11 + + subpd %xmm8, %xmm10 + subpd %xmm9, %xmm11 +#else + movapd -16 * SIZE(AO), %xmm10 + movapd -14 * SIZE(AO), %xmm11 + + subpd %xmm8, %xmm10 + subpd %xmm9, %xmm11 +#endif + +#ifdef LN + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movapd %xmm11, %xmm9 + unpckhpd %xmm9, %xmm9 + + movsd -1 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm9 + + movsd -2 * SIZE(AO), %xmm13 + mulsd %xmm9, %xmm13 + subsd %xmm13, %xmm11 + movsd -3 * SIZE(AO), %xmm14 + mulsd %xmm9, %xmm14 + subsd %xmm14, %xmm8 + movsd -4 * SIZE(AO), %xmm15 + mulsd %xmm9, %xmm15 + subsd %xmm15, %xmm10 + + movsd -6 * SIZE(AO), %xmm12 + mulsd 
%xmm12, %xmm11 + + movsd -7 * SIZE(AO), %xmm13 + mulsd %xmm11, %xmm13 + subsd %xmm13, %xmm8 + movsd -8 * SIZE(AO), %xmm14 + mulsd %xmm11, %xmm14 + subsd %xmm14, %xmm10 + + movsd -11 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + movsd -12 * SIZE(AO), %xmm13 + mulsd %xmm8, %xmm13 + subsd %xmm13, %xmm10 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + unpcklpd %xmm8, %xmm10 + unpcklpd %xmm9, %xmm11 +#endif + +#ifdef LT + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movapd %xmm11, %xmm9 + unpckhpd %xmm9, %xmm9 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + movsd -15 * SIZE(AO), %xmm13 + mulsd %xmm10, %xmm13 + subsd %xmm13, %xmm8 + movsd -14 * SIZE(AO), %xmm14 + mulsd %xmm10, %xmm14 + subsd %xmm14, %xmm11 + movsd -13 * SIZE(AO), %xmm15 + mulsd %xmm10, %xmm15 + subsd %xmm15, %xmm9 + + movsd -11 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + movsd -10 * SIZE(AO), %xmm13 + mulsd %xmm8, %xmm13 + subsd %xmm13, %xmm11 + movsd -9 * SIZE(AO), %xmm14 + mulsd %xmm8, %xmm14 + subsd %xmm14, %xmm9 + + movsd -6 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm11 + + movsd -5 * SIZE(AO), %xmm13 + mulsd %xmm11, %xmm13 + subsd %xmm13, %xmm9 + + movsd -1 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm9 + + unpcklpd %xmm8, %xmm10 + unpcklpd %xmm9, %xmm11 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm10 + mulpd %xmm8, %xmm11 +#endif + +#ifdef RT + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm10 + mulpd %xmm8, %xmm11 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) + movsd %xmm11, 2 * SIZE(CO1) + movhpd %xmm11, 3 * SIZE(CO1) +#else + movsd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) + movsd %xmm11, 2 * SIZE(CO1) + movhpd %xmm11, 3 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm10, -16 * SIZE(B) + movapd %xmm11, -14 * SIZE(B) + + movddup %xmm10, %xmm8 + SHUFPD_3 %xmm10, %xmm10 + movddup %xmm11, %xmm9 + SHUFPD_3 %xmm11, %xmm11 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm10, -14 * SIZE(BO) + movapd %xmm9, -12 * SIZE(BO) + movapd %xmm11, -10 * SIZE(BO) +#else + movapd %xmm10, -16 * SIZE(AO) + movapd %xmm11, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L91 + ALIGN_4 + +.L100: + testq $2, M + je .L110 + ALIGN_4 + +.L101: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L105 + ALIGN_4 + +.L102: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + + movapd -12 * SIZE(AO), 
%xmm0 + movapd -10 * SIZE(AO), %xmm1 + movapd -12 * SIZE(BO), %xmm2 + movapd -10 * SIZE(BO), %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm11 + + subq $-8 * SIZE, AO + subq $-8 * SIZE, BO + subq $1, %rax + jne .L102 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L109 + ALIGN_4 + +.L106: + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm2 + addpd %xmm2, %xmm8 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + subq $1, %rax + jg .L106 + ALIGN_4 + +.L109: + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 + addpd %xmm10, %xmm8 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm10 + subpd %xmm8, %xmm10 +#else + movapd -16 * SIZE(AO), %xmm10 + subpd %xmm8, %xmm10 +#endif + +#ifdef LN + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movsd -13 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + movsd -14 * SIZE(AO), %xmm13 + mulsd %xmm8, %xmm13 + subsd %xmm13, %xmm10 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + unpcklpd %xmm8, %xmm10 +#endif + +#ifdef LT + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + movsd -15 * SIZE(AO), %xmm13 + mulsd %xmm10, %xmm13 + subsd %xmm13, %xmm8 + + movsd -13 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + unpcklpd %xmm8, %xmm10 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm10 +#endif + +#ifdef RT + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm10 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) +#else + movsd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm10, -16 * SIZE(B) + + movddup %xmm10, %xmm8 + SHUFPD_3 %xmm10, %xmm10 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm10, -14 * SIZE(BO) +#else + movapd %xmm10, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $2 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L110: + testq $1, M + je .L119 + ALIGN_4 + +.L111: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L115 + ALIGN_4 + +.L112: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movsd -16 * SIZE(AO), %xmm0 + movsd -15 * SIZE(AO), %xmm1 + movsd -16 * SIZE(BO), %xmm2 + movsd -14 * SIZE(BO), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm1, 
%xmm3 + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + + movsd -14 * SIZE(AO), %xmm0 + movsd -13 * SIZE(AO), %xmm1 + movsd -12 * SIZE(BO), %xmm2 + movsd -10 * SIZE(BO), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm1, %xmm3 + addsd %xmm2, %xmm10 + addsd %xmm3, %xmm11 + + subq $-4 * SIZE, AO + subq $-8 * SIZE, BO + subq $1, %rax + jne .L112 + ALIGN_4 + +.L115: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + movsd -16 * SIZE(AO), %xmm0 + movsd -16 * SIZE(BO), %xmm2 + + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + + addq $1 * SIZE, AO + addq $2 * SIZE, BO + subq $1, %rax + jg .L116 + ALIGN_4 + +.L118: + addsd %xmm10, %xmm8 + addsd %xmm11, %xmm9 + addsd %xmm9, %xmm8 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(B), %xmm10 + subsd %xmm8, %xmm10 +#else + movsd -16 * SIZE(AO), %xmm10 + subsd %xmm8, %xmm10 +#endif + +#ifdef LN + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 +#endif + +#ifdef LT + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 +#endif + +#ifdef RN + movsd -16 * SIZE(B), %xmm8 + mulsd %xmm8, %xmm10 +#endif + +#ifdef RT + movsd -16 * SIZE(B), %xmm8 + mulsd %xmm8, %xmm10 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm10, 0 * SIZE(CO1) +#else + movsd %xmm10, 0 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movsd %xmm10, -16 * SIZE(B) + + movlpd %xmm10, -16 * SIZE(BO) + movlpd %xmm10, -15 * SIZE(BO) +#else + movsd %xmm10, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $1 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L119: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 1), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 1), B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L40: + testq $2, N + je .L80 + ALIGN_4 + +.L41: +/* Copying to Sub Buffer */ + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + leaq (, %rax, SIZE), %rax + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L43 + ALIGN_4 + +.L42: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + movddup -14 * SIZE(B), %xmm2 + movddup -13 * SIZE(B), %xmm3 + movddup -12 * SIZE(B), %xmm4 + movddup -11 * SIZE(B), %xmm5 + movddup -10 * SIZE(B), %xmm6 + movddup -9 * SIZE(B), %xmm7 + + movapd %xmm0, 0 * SIZE(BO) + movapd %xmm1, 2 * SIZE(BO) + movapd %xmm2, 4 * SIZE(BO) + movapd %xmm3, 6 * SIZE(BO) + movapd %xmm4, 8 * SIZE(BO) + movapd %xmm5, 10 * SIZE(BO) + movapd 
%xmm6, 12 * SIZE(BO) + movapd %xmm7, 14 * SIZE(BO) + + addq $8 * SIZE, B + addq $16 * SIZE, BO + subq $1, %rax + jne .L42 + ALIGN_4 + +.L43: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L50 + ALIGN_4 + +.L44: + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + + movapd %xmm0, 0 * SIZE(BO) + movapd %xmm1, 2 * SIZE(BO) + + addq $2 * SIZE, B + addq $4 * SIZE, BO + decq %rax + jne .L44 + ALIGN_4 + +.L50: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc +#ifndef RT + leaq (C, LDC, 2), C +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + +#ifdef LN + prefetcht2 -3 * SIZE(CO1) + pxor %xmm12, %xmm12 + prefetcht2 -3 * SIZE(CO2) + pxor %xmm13, %xmm13 +#else + prefetcht2 3 * SIZE(CO1) + pxor %xmm12, %xmm12 + prefetcht2 3 * SIZE(CO2) + pxor %xmm13, %xmm13 +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L55 + ALIGN_4 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + + movapd -16 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -14 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + addpd %xmm4, %xmm9 + addpd %xmm5, %xmm13 + + movapd -12 * SIZE(AO), %xmm0 + movapd -10 * SIZE(AO), %xmm1 + + movapd -12 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -10 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + addpd %xmm4, %xmm9 + addpd %xmm5, %xmm13 + + movapd -8 * SIZE(AO), %xmm0 + movapd -6 * SIZE(AO), %xmm1 + + movapd -8 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -6 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + addpd %xmm4, %xmm9 + addpd %xmm5, %xmm13 + + movapd -4 * SIZE(AO), %xmm0 + movapd -2 * SIZE(AO), %xmm1 + + movapd -4 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -2 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + addpd %xmm4, %xmm9 + addpd %xmm5, %xmm13 + + subq $-16 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L59 + ALIGN_4 + +.L56: + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + + movapd -16 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -14 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, 
%xmm12 + addpd %xmm4, %xmm9 + addpd %xmm5, %xmm13 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L56 + ALIGN_4 + +.L59: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd %xmm12, %xmm4 + unpcklpd %xmm13, %xmm12 + unpckhpd %xmm13, %xmm4 + + movapd -16 * SIZE(B), %xmm9 + movapd -14 * SIZE(B), %xmm13 + movapd -12 * SIZE(B), %xmm1 + movapd -10 * SIZE(B), %xmm5 + + subpd %xmm8, %xmm9 + subpd %xmm0, %xmm13 + subpd %xmm12, %xmm1 + subpd %xmm4, %xmm5 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -12 * SIZE(AO), %xmm2 + movapd -10 * SIZE(AO), %xmm3 + + subpd %xmm8, %xmm0 + subpd %xmm12, %xmm1 + subpd %xmm9, %xmm2 + subpd %xmm13, %xmm3 +#endif + +#ifdef LN + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 + movddup -2 * SIZE(AO), %xmm10 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm1 + movddup -3 * SIZE(AO), %xmm12 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm13 + movddup -4 * SIZE(AO), %xmm14 + mulpd %xmm5, %xmm14 + subpd %xmm14, %xmm9 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + movddup -7 * SIZE(AO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm13 + movddup -8 * SIZE(AO), %xmm12 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm9 + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + movddup -12 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + movddup -14 * SIZE(AO), %xmm12 + mulpd %xmm9, %xmm12 + subpd %xmm12, %xmm1 + movddup -13 * SIZE(AO), %xmm14 + mulpd %xmm9, %xmm14 + subpd %xmm14, %xmm5 + + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + + movddup -10 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm1 + movddup -9 * SIZE(AO), %xmm12 + mulpd %xmm13, %xmm12 + subpd %xmm12, %xmm5 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + movddup -5 * SIZE(AO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm5 + + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 + + movddup -15 * SIZE(B), %xmm9 + movapd %xmm9, %xmm10 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm3 + + movddup -13 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 +#endif + +#ifdef RT + movddup -13 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 + + movddup -14 * SIZE(B), %xmm9 + movapd %xmm9, %xmm10 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + mulpd %xmm3, %xmm10 + subpd %xmm10, %xmm1 + + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm13, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movsd %xmm5, 3 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + movhpd %xmm1, 2 * SIZE(CO2) + movhpd %xmm5, 3 * SIZE(CO2) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd %xmm2, 0 * 
SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + movsd %xmm3, 2 * SIZE(CO2) + movhpd %xmm3, 3 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(B) + movapd %xmm13, -14 * SIZE(B) + movapd %xmm1, -12 * SIZE(B) + movapd %xmm5, -10 * SIZE(B) + + movddup %xmm9, %xmm8 + SHUFPD_3 %xmm9, %xmm9 + movddup %xmm13, %xmm12 + SHUFPD_3 %xmm13, %xmm13 + movddup %xmm1, %xmm0 + SHUFPD_3 %xmm1, %xmm1 + movddup %xmm5, %xmm4 + SHUFPD_3 %xmm5, %xmm5 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) + movapd %xmm12, -12 * SIZE(BO) + movapd %xmm13, -10 * SIZE(BO) + movapd %xmm0, -8 * SIZE(BO) + movapd %xmm1, -6 * SIZE(BO) + movapd %xmm4, -4 * SIZE(BO) + movapd %xmm5, -2 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm1, -14 * SIZE(AO) + movapd %xmm2, -12 * SIZE(AO) + movapd %xmm3, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L51 + ALIGN_4 + +.L60: + testq $2, M + je .L70 + ALIGN_4 + +.L61: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L65 + ALIGN_4 + +.L62: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + movapd -12 * SIZE(BO), %xmm4 + movapd -10 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm11 + + movapd -12 * SIZE(AO), %xmm0 + movapd -10 * SIZE(AO), %xmm1 + movapd -8 * SIZE(BO), %xmm2 + movapd -6 * SIZE(BO), %xmm3 + movapd -4 * SIZE(BO), %xmm4 + movapd -2 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm11 + + subq $ -8 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jne .L62 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L69 + ALIGN_4 + +.L66: + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L66 + ALIGN_4 + +.L69: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#if 
defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd -16 * SIZE(B), %xmm9 + movapd -14 * SIZE(B), %xmm13 + + subpd %xmm8, %xmm9 + subpd %xmm0, %xmm13 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm2 + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm2 +#endif + + +#ifdef LN + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + + movddup -14 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm0 + + movddup -15 * SIZE(B), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + + movddup -13 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm2 +#endif + +#ifdef RT + movddup -13 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm2 + + movddup -14 * SIZE(B), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm13, 1 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(B) + movapd %xmm13, -14 * SIZE(B) + + movddup %xmm9, %xmm8 + SHUFPD_3 %xmm9, %xmm9 + movddup %xmm13, %xmm12 + SHUFPD_3 %xmm13, %xmm13 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) + movapd %xmm12, -12 * SIZE(BO) + movapd %xmm13, -10 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm2, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L70: + testq $1, M + je .L79 + ALIGN_4 + +.L71: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L75 + ALIGN_4 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movsd -16 * SIZE(AO), %xmm0 + movsd -15 * SIZE(AO), %xmm1 + movsd -16 * SIZE(BO), %xmm2 + movsd -14 * SIZE(BO), %xmm3 + movsd -12 * SIZE(BO), %xmm4 + movsd -10 * SIZE(BO), %xmm5 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + mulsd %xmm1, %xmm4 + mulsd %xmm1, %xmm5 + + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + addsd %xmm4, %xmm10 + addsd %xmm5, %xmm11 + + movsd -14 * SIZE(AO), %xmm0 + movsd -13 * SIZE(AO), %xmm1 + movsd -8 * SIZE(BO), %xmm2 + movsd -6 * SIZE(BO), %xmm3 + movsd -4 * SIZE(BO), %xmm4 + movsd -2 * SIZE(BO), %xmm5 + + mulsd 
%xmm0, %xmm2 + mulsd %xmm0, %xmm3 + mulsd %xmm1, %xmm4 + mulsd %xmm1, %xmm5 + + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + addsd %xmm4, %xmm10 + addsd %xmm5, %xmm11 + + subq $ -4 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + movsd -16 * SIZE(AO), %xmm0 + movsd -16 * SIZE(BO), %xmm2 + movsd -14 * SIZE(BO), %xmm3 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L76 + ALIGN_4 + +.L78: + addsd %xmm10, %xmm8 + addsd %xmm11, %xmm9 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(B), %xmm12 + movsd -15 * SIZE(B), %xmm13 +#else + movsd -16 * SIZE(AO), %xmm12 + movsd -15 * SIZE(AO), %xmm13 +#endif + + subsd %xmm8, %xmm12 + subsd %xmm9, %xmm13 + +#ifdef LN + movsd -16 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm12 + mulsd %xmm8, %xmm13 +#endif + +#ifdef LT + movsd -16 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm12 + mulsd %xmm8, %xmm13 +#endif + +#ifdef RN + mulsd -16 * SIZE(B), %xmm12 + movsd -15 * SIZE(B), %xmm9 + mulsd %xmm12, %xmm9 + subsd %xmm9, %xmm13 + + mulsd -13 * SIZE(B), %xmm13 +#endif + +#ifdef RT + mulsd -13 * SIZE(B), %xmm13 + + movlpd -14 * SIZE(B), %xmm9 + mulsd %xmm13, %xmm9 + subsd %xmm9, %xmm12 + + mulsd -16 * SIZE(B), %xmm12 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + movsd %xmm12, 0 * SIZE(CO1) + movsd %xmm13, 0 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movsd %xmm12, -16 * SIZE(B) + movsd %xmm13, -15 * SIZE(B) + + movsd %xmm12, -16 * SIZE(BO) + movsd %xmm12, -15 * SIZE(BO) + movsd %xmm13, -14 * SIZE(BO) + movsd %xmm13, -13 * SIZE(BO) +#else + movsd %xmm12, -16 * SIZE(AO) + movsd %xmm13, -15 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $2 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L79: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L80: + movq N, J + sarq $2, J # j = (n >> 2) + jle .L999 + +.L01: +/* Copying to Sub Buffer */ + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq 16 * SIZE + BUFFER, BO + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + leaq (, %rax, SIZE), %rax + leaq (B, %rax, 4), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L03 + ALIGN_4 + +.L02: + prefetcht0 (PREFETCH_R + 0) * SIZE(B) + 
movapd -16 * SIZE(B), %xmm0 + movapd -14 * SIZE(B), %xmm1 + movapd -12 * SIZE(B), %xmm2 + movapd -10 * SIZE(B), %xmm3 + movapd -8 * SIZE(B), %xmm4 + movapd -6 * SIZE(B), %xmm5 + movapd -4 * SIZE(B), %xmm6 + movapd -2 * SIZE(B), %xmm7 + + prefetcht0 (PREFETCH_R + 8) * SIZE(B) + movddup %xmm0, %xmm8 + unpckhpd %xmm0, %xmm0 + movddup %xmm1, %xmm9 + unpckhpd %xmm1, %xmm1 + movddup %xmm2, %xmm10 + unpckhpd %xmm2, %xmm2 + movddup %xmm3, %xmm11 + unpckhpd %xmm3, %xmm3 + movddup %xmm4, %xmm12 + unpckhpd %xmm4, %xmm4 + movddup %xmm5, %xmm13 + unpckhpd %xmm5, %xmm5 + movddup %xmm6, %xmm14 + unpckhpd %xmm6, %xmm6 + movddup %xmm7, %xmm15 + unpckhpd %xmm7, %xmm7 + + prefetcht0 (PREFETCH_W + 0) * SIZE(BO) + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm0, -14 * SIZE(BO) + movapd %xmm9, -12 * SIZE(BO) + movapd %xmm1, -10 * SIZE(BO) + + prefetcht0 (PREFETCH_W + 8) * SIZE(BO) + movapd %xmm10, -8 * SIZE(BO) + movapd %xmm2, -6 * SIZE(BO) + movapd %xmm11, -4 * SIZE(BO) + movapd %xmm3, -2 * SIZE(BO) + + prefetcht0 (PREFETCH_W + 16) * SIZE(BO) + movapd %xmm12, 0 * SIZE(BO) + movapd %xmm4, 2 * SIZE(BO) + movapd %xmm13, 4 * SIZE(BO) + movapd %xmm5, 6 * SIZE(BO) + + prefetcht0 (PREFETCH_W + 24) * SIZE(BO) + movapd %xmm14, 8 * SIZE(BO) + movapd %xmm6, 10 * SIZE(BO) + movapd %xmm15, 12 * SIZE(BO) + movapd %xmm7, 14 * SIZE(BO) + + subq $-16 * SIZE, B + subq $-32 * SIZE, BO + subq $1, %rax + jne .L02 + ALIGN_4 + +.L03: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L10 + ALIGN_4 + +.L04: + movapd -16 * SIZE(B), %xmm0 + movapd -14 * SIZE(B), %xmm1 + + movddup %xmm0, %xmm8 + unpckhpd %xmm0, %xmm0 + movddup %xmm1, %xmm9 + unpckhpd %xmm1, %xmm1 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm0, -14 * SIZE(BO) + movapd %xmm9, -12 * SIZE(BO) + movapd %xmm1, -10 * SIZE(BO) + + addq $4 * SIZE, B + addq $8 * SIZE, BO + subq $1, %rax + jne .L04 + ALIGN_4 + +.L10: + leaq (PREFETCH_R + 0) * SIZE(B), BB + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc +#ifndef RT + leaq (C, LDC, 4), C +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + + prefetcht2 0 * SIZE(BB) + +#ifdef LN + prefetcht2 -3 * SIZE(CO1) + pxor %xmm12, %xmm12 + prefetcht2 -3 * SIZE(CO2) + pxor %xmm13, %xmm13 + prefetcht2 -3 * SIZE(CO1, LDC, 2) + pxor %xmm14, %xmm14 + prefetcht2 -3 * SIZE(CO2, LDC, 2) + pxor %xmm15, %xmm15 +#else + prefetcht2 3 * SIZE(CO1) + pxor %xmm12, %xmm12 + prefetcht2 3 * SIZE(CO2) + pxor %xmm13, %xmm13 + prefetcht2 3 * SIZE(CO1, LDC, 2) + pxor %xmm14, %xmm14 + prefetcht2 3 * SIZE(CO2, LDC, 2) + pxor %xmm15, %xmm15 +#endif + + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + + subq $-8 * SIZE, BB + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm2, %xmm10 + movapd -16 * 
SIZE(AO), %xmm0 + addpd %xmm3, %xmm14 + movapd -16 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -14 * SIZE(AO), %xmm1 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm11 + movapd -14 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm15 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + movapd -12 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm9 + movapd -10 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm13 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movapd -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm14 + movapd -8 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -10 * SIZE(AO), %xmm1 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm11 + addpd %xmm5, %xmm15 + movapd -6 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + movapd -4 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm9 + addpd %xmm5, %xmm13 + movapd -2 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + movapd -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm14 + movapd 0 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -6 * SIZE(AO), %xmm1 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm11 + addpd %xmm5, %xmm15 + movapd 2 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + movapd 4 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm9 + addpd %xmm5, %xmm13 + movapd 6 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movapd -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm14 + movapd 8 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -2 * SIZE(AO), %xmm1 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm11 + addpd %xmm5, %xmm15 + movapd 10 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + addq $32 * SIZE, BO + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + movapd -20 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + subq $-16 * SIZE, AO + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm9 + addpd %xmm5, %xmm13 + movapd -18 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + subq $1, %rax + mulpd %xmm1, %xmm5 + + BRANCH + jg .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L19 + ALIGN_4 + +.L16: + movapd -16 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm10 + movapd -16 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm14 + movapd %xmm2, %xmm3 + movapd -14 * SIZE(AO), %xmm1 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm11 + movapd -14 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm15 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm8 + movapd -12 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm12 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm9 + movapd -10 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm13 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addq $4 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + subq $1, %rax + BRANCH + jg .L16 + ALIGN_4 + +.L19: + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm14 + addpd %xmm4, %xmm11 + addpd %xmm5, 
%xmm15 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd %xmm10, %xmm2 + unpcklpd %xmm11, %xmm10 + unpckhpd %xmm11, %xmm2 + + movapd %xmm12, %xmm4 + unpcklpd %xmm13, %xmm12 + unpckhpd %xmm13, %xmm4 + + movapd %xmm14, %xmm6 + unpcklpd %xmm15, %xmm14 + unpckhpd %xmm15, %xmm6 + + movapd -16 * SIZE(B), %xmm9 + movapd -14 * SIZE(B), %xmm11 + movapd -12 * SIZE(B), %xmm13 + movapd -10 * SIZE(B), %xmm15 + movapd -8 * SIZE(B), %xmm1 + movapd -6 * SIZE(B), %xmm3 + movapd -4 * SIZE(B), %xmm5 + movapd -2 * SIZE(B), %xmm7 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm0, %xmm13 + subpd %xmm2, %xmm15 + subpd %xmm12, %xmm1 + subpd %xmm14, %xmm3 + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -12 * SIZE(AO), %xmm2 + movapd -10 * SIZE(AO), %xmm3 + + movapd -8 * SIZE(AO), %xmm4 + movapd -6 * SIZE(AO), %xmm5 + movapd -4 * SIZE(AO), %xmm6 + movapd -2 * SIZE(AO), %xmm7 + + subpd %xmm8, %xmm0 + subpd %xmm12, %xmm1 + subpd %xmm9, %xmm2 + subpd %xmm13, %xmm3 + subpd %xmm10, %xmm4 + subpd %xmm14, %xmm5 + subpd %xmm11, %xmm6 + subpd %xmm15, %xmm7 +#endif + +#ifdef LN + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 + mulpd %xmm8, %xmm7 + + movddup -2 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm1 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm3 + + movddup -3 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm13 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm15 + + movddup -4 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm9 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm11 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 + + movddup -7 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm13 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm15 + + movddup -8 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm9 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm11 + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 + + movddup -12 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + mulpd %xmm15, %xmm12 + subpd %xmm12, %xmm11 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 + + movddup -15 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + mulpd %xmm11, %xmm12 + subpd %xmm12, %xmm15 + + movddup -14 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm1 + mulpd %xmm11, %xmm12 + subpd %xmm12, %xmm3 + + movddup -13 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm5 + mulpd %xmm11, %xmm12 + subpd %xmm12, %xmm7 + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 + + movddup -10 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm1 + mulpd %xmm15, %xmm12 + subpd %xmm12, %xmm3 + + movddup -9 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm5 + 
mulpd %xmm15, %xmm12 + subpd %xmm12, %xmm7 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 + + movddup -5 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm5 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm7 + + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 + mulpd %xmm8, %xmm7 +#endif + + +#ifdef RN + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 + + movddup -15 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm2 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm3 + + movddup -14 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm4 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm5 + + movddup -13 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm6 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm7 + + movddup -11 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 + + movddup -10 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm4 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm5 + + movddup -9 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm6 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm7 + + movddup -6 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm4 + mulpd %xmm8, %xmm5 + + movddup -5 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm6 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm7 + + movddup -1 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm6 + mulpd %xmm8, %xmm7 +#endif + +#ifdef RT + movddup -1 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm6 + mulpd %xmm8, %xmm7 + + movddup -2 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm4 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm5 + + movddup -3 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm2 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm3 + + movddup -4 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm0 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm1 + + movddup -6 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm4 + mulpd %xmm8, %xmm5 + + movddup -7 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm2 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm3 + + movddup -8 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm0 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm1 + + movddup -11 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 + + movddup -12 * SIZE(B), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm0 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm1 + + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm13, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movsd %xmm5, 3 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + movhpd %xmm1, 2 * SIZE(CO2) + movhpd %xmm5, 3 * SIZE(CO2) + + movsd %xmm11, 0 * SIZE(CO1, LDC, 2) + movsd %xmm15, 1 * SIZE(CO1, LDC, 2) + movsd %xmm3, 2 * SIZE(CO1, LDC, 2) + movsd %xmm7, 3 * SIZE(CO1, LDC, 2) + + movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) + movhpd %xmm3, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * 
SIZE(CO2) + movsd %xmm3, 2 * SIZE(CO2) + movhpd %xmm3, 3 * SIZE(CO2) + + movsd %xmm4, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) + movsd %xmm5, 2 * SIZE(CO1, LDC, 2) + movhpd %xmm5, 3 * SIZE(CO1, LDC, 2) + + movsd %xmm6, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) + movsd %xmm7, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(B) + movapd %xmm11, -14 * SIZE(B) + movapd %xmm13, -12 * SIZE(B) + movapd %xmm15, -10 * SIZE(B) + movapd %xmm1, -8 * SIZE(B) + movapd %xmm3, -6 * SIZE(B) + movapd %xmm5, -4 * SIZE(B) + movapd %xmm7, -2 * SIZE(B) + + movddup %xmm9, %xmm8 + SHUFPD_3 %xmm9, %xmm9 + movddup %xmm11, %xmm10 + SHUFPD_3 %xmm11, %xmm11 + movddup %xmm13, %xmm12 + SHUFPD_3 %xmm13, %xmm13 + movddup %xmm15, %xmm14 + SHUFPD_3 %xmm15, %xmm15 + movddup %xmm1, %xmm0 + SHUFPD_3 %xmm1, %xmm1 + movddup %xmm3, %xmm2 + SHUFPD_3 %xmm3, %xmm3 + movddup %xmm5, %xmm4 + SHUFPD_3 %xmm5, %xmm5 + movddup %xmm7, %xmm6 + SHUFPD_3 %xmm7, %xmm7 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) + movapd %xmm10, -12 * SIZE(BO) + movapd %xmm11, -10 * SIZE(BO) + movapd %xmm12, -8 * SIZE(BO) + movapd %xmm13, -6 * SIZE(BO) + movapd %xmm14, -4 * SIZE(BO) + movapd %xmm15, -2 * SIZE(BO) + movapd %xmm0, 0 * SIZE(BO) + movapd %xmm1, 2 * SIZE(BO) + movapd %xmm2, 4 * SIZE(BO) + movapd %xmm3, 6 * SIZE(BO) + movapd %xmm4, 8 * SIZE(BO) + movapd %xmm5, 10 * SIZE(BO) + movapd %xmm6, 12 * SIZE(BO) + movapd %xmm7, 14 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm1, -14 * SIZE(AO) + movapd %xmm2, -12 * SIZE(AO) + movapd %xmm3, -10 * SIZE(AO) + movapd %xmm4, -8 * SIZE(AO) + movapd %xmm5, -6 * SIZE(AO) + movapd %xmm6, -4 * SIZE(AO) + movapd %xmm7, -2 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $16 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L11 + ALIGN_4 + +.L20: + testq $3, M + je .L39 + + testq $2, M + je .L30 + ALIGN_4 + +.L21: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + movapd -12 * SIZE(BO), %xmm4 + movapd -10 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm11 + + movapd -14 * SIZE(AO), %xmm0 + movapd -8 * SIZE(BO), %xmm2 + movapd -6 * SIZE(BO), %xmm3 + movapd -4 * SIZE(BO), %xmm4 + movapd -2 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, 
%xmm9 + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm11 + + movapd -12 * SIZE(AO), %xmm0 + movapd 0 * SIZE(BO), %xmm2 + movapd 2 * SIZE(BO), %xmm3 + movapd 4 * SIZE(BO), %xmm4 + movapd 6 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm11 + + movapd -10 * SIZE(AO), %xmm0 + movapd 8 * SIZE(BO), %xmm2 + movapd 10 * SIZE(BO), %xmm3 + movapd 12 * SIZE(BO), %xmm4 + movapd 14 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm11 + + subq $ -8 * SIZE, AO + subq $-32 * SIZE, BO + subq $1, %rax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L29 + ALIGN_4 + +.L26: + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + movapd -12 * SIZE(BO), %xmm4 + movapd -10 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm11 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + jne .L26 + ALIGN_4 + +.L29: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd %xmm10, %xmm2 + unpcklpd %xmm11, %xmm10 + unpckhpd %xmm11, %xmm2 + + movapd -16 * SIZE(B), %xmm9 + movapd -14 * SIZE(B), %xmm11 + movapd -12 * SIZE(B), %xmm13 + movapd -10 * SIZE(B), %xmm15 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm0, %xmm13 + subpd %xmm2, %xmm15 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm2 + movapd -12 * SIZE(AO), %xmm4 + movapd -10 * SIZE(AO), %xmm6 + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm2 + subpd %xmm10, %xmm4 + subpd %xmm11, %xmm6 +#endif + +#ifdef LN + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 + + movddup -14 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + mulpd %xmm15, %xmm12 + subpd %xmm12, %xmm11 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 + + movddup -15 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + mulpd %xmm11, %xmm12 + subpd %xmm12, %xmm15 + + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm0 + + movddup -15 * SIZE(B), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + movddup -14 * SIZE(B), %xmm10 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm4 + movddup -13 * SIZE(B), %xmm11 + mulpd %xmm0, %xmm11 + subpd %xmm11, %xmm6 + + movddup -11 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm2 + movddup -10 * SIZE(B), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm4 + movddup -9 * SIZE(B), %xmm10 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm6 + + movddup -6 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm4 + + movddup -5 * SIZE(B), %xmm9 + mulpd %xmm4, %xmm9 + subpd %xmm9, %xmm6 + + 
movddup -1 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm6 +#endif + +#ifdef RT + movddup -1 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm6 + + movddup -2 * SIZE(B), %xmm9 + mulpd %xmm6, %xmm9 + subpd %xmm9, %xmm4 + movddup -3 * SIZE(B), %xmm10 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm2 + movddup -4 * SIZE(B), %xmm11 + mulpd %xmm6, %xmm11 + subpd %xmm11, %xmm0 + + movddup -6 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm4 + movddup -7 * SIZE(B), %xmm9 + mulpd %xmm4, %xmm9 + subpd %xmm9, %xmm2 + movddup -8 * SIZE(B), %xmm10 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm0 + + movddup -11 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm2 + movddup -12 * SIZE(B), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + + movddup -16 * SIZE(B), %xmm8 + mulpd %xmm8, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm13, 1 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + + movsd %xmm11, 0 * SIZE(CO1, LDC, 2) + movsd %xmm15, 1 * SIZE(CO1, LDC, 2) + + movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + + movsd %xmm4, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) + + movsd %xmm6, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(B) + movapd %xmm11, -14 * SIZE(B) + movapd %xmm13, -12 * SIZE(B) + movapd %xmm15, -10 * SIZE(B) + + movddup %xmm9, %xmm8 + SHUFPD_3 %xmm9, %xmm9 + movddup %xmm11, %xmm10 + SHUFPD_3 %xmm11, %xmm11 + movddup %xmm13, %xmm12 + SHUFPD_3 %xmm13, %xmm13 + movddup %xmm15, %xmm14 + SHUFPD_3 %xmm15, %xmm15 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) + movapd %xmm10, -12 * SIZE(BO) + movapd %xmm11, -10 * SIZE(BO) + movapd %xmm12, -8 * SIZE(BO) + movapd %xmm13, -6 * SIZE(BO) + movapd %xmm14, -4 * SIZE(BO) + movapd %xmm15, -2 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm2, -14 * SIZE(AO) + movapd %xmm4, -12 * SIZE(AO) + movapd %xmm6, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + testq $1, M + je .L39 + ALIGN_4 + +.L31: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movsd -16 * SIZE(AO), %xmm0 + movsd -16 * SIZE(BO), %xmm2 + movsd -14 * SIZE(BO), %xmm3 + movsd -12 * SIZE(BO), %xmm4 + movsd -10 * SIZE(BO), %xmm5 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 + + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + addsd %xmm4, 
%xmm10 + addsd %xmm5, %xmm11 + + movsd -15 * SIZE(AO), %xmm0 + movsd -8 * SIZE(BO), %xmm2 + movsd -6 * SIZE(BO), %xmm3 + movsd -4 * SIZE(BO), %xmm4 + movsd -2 * SIZE(BO), %xmm5 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 + + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + addsd %xmm4, %xmm10 + addsd %xmm5, %xmm11 + + movsd -14 * SIZE(AO), %xmm0 + movsd 0 * SIZE(BO), %xmm2 + movsd 2 * SIZE(BO), %xmm3 + movsd 4 * SIZE(BO), %xmm4 + movsd 6 * SIZE(BO), %xmm5 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 + + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + addsd %xmm4, %xmm10 + addsd %xmm5, %xmm11 + + movsd -13 * SIZE(AO), %xmm0 + movsd 8 * SIZE(BO), %xmm2 + movsd 10 * SIZE(BO), %xmm3 + movsd 12 * SIZE(BO), %xmm4 + movsd 14 * SIZE(BO), %xmm5 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 + + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + addsd %xmm4, %xmm10 + addsd %xmm5, %xmm11 + + subq $ -4 * SIZE, AO + subq $-32 * SIZE, BO + subq $1, %rax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + movsd -16 * SIZE(AO), %xmm0 + movsd -16 * SIZE(BO), %xmm2 + movsd -14 * SIZE(BO), %xmm3 + movsd -12 * SIZE(BO), %xmm4 + movsd -10 * SIZE(BO), %xmm5 + + mulsd %xmm0, %xmm2 + mulsd %xmm0, %xmm3 + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 + + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + addsd %xmm4, %xmm10 + addsd %xmm5, %xmm11 + + addq $1 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + jg .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(B), %xmm12 + movsd -15 * SIZE(B), %xmm13 + movsd -14 * SIZE(B), %xmm14 + movsd -13 * SIZE(B), %xmm15 +#else + movsd -16 * SIZE(AO), %xmm12 + movsd -15 * SIZE(AO), %xmm13 + movsd -14 * SIZE(AO), %xmm14 + movsd -13 * SIZE(AO), %xmm15 +#endif + + subsd %xmm8, %xmm12 + subsd %xmm9, %xmm13 + subsd %xmm10, %xmm14 + subsd %xmm11, %xmm15 + +#ifdef LN + movsd -16 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm12 + mulsd %xmm8, %xmm13 + mulsd %xmm8, %xmm14 + mulsd %xmm8, %xmm15 +#endif + +#ifdef LT + movsd -16 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm12 + mulsd %xmm8, %xmm13 + mulsd %xmm8, %xmm14 + mulsd %xmm8, %xmm15 +#endif + +#ifdef RN + mulsd -16 * SIZE(B), %xmm12 + movlpd -15 * SIZE(B), %xmm9 + mulsd %xmm12, %xmm9 + subsd %xmm9, %xmm13 + movlpd -14 * SIZE(B), %xmm10 + mulsd %xmm12, %xmm10 + subsd %xmm10, %xmm14 + movlpd -13 * SIZE(B), %xmm11 + mulsd %xmm12, %xmm11 + subsd %xmm11, %xmm15 + + mulsd -11 * SIZE(B), %xmm13 + movlpd -10 * SIZE(B), %xmm9 + mulsd %xmm13, %xmm9 + subsd %xmm9, %xmm14 + movlpd -9 * SIZE(B), %xmm10 + mulsd %xmm13, %xmm10 + subsd %xmm10, %xmm15 + + mulsd -6 * SIZE(B), %xmm14 + movlpd -5 * SIZE(B), %xmm9 + mulsd %xmm14, %xmm9 + subsd %xmm9, %xmm15 + + mulsd -1 * SIZE(B), %xmm15 +#endif + +#ifdef RT + mulsd -1 * SIZE(B), %xmm15 + + movlpd -2 * SIZE(B), %xmm9 + mulsd %xmm15, %xmm9 + subsd %xmm9, %xmm14 + movlpd -3 * SIZE(B), %xmm10 + mulsd %xmm15, %xmm10 + subsd %xmm10, %xmm13 + movlpd -4 * SIZE(B), %xmm11 + mulsd %xmm15, %xmm11 + subsd %xmm11, %xmm12 + + mulsd -6 * SIZE(B), %xmm14 + + movlpd -7 * SIZE(B), %xmm9 + mulsd %xmm14, 
%xmm9 + subsd %xmm9, %xmm13 + movlpd -8 * SIZE(B), %xmm10 + mulsd %xmm14, %xmm10 + subsd %xmm10, %xmm12 + + mulsd -11 * SIZE(B), %xmm13 + + movlpd -12 * SIZE(B), %xmm9 + mulsd %xmm13, %xmm9 + subsd %xmm9, %xmm12 + + mulsd -16 * SIZE(B), %xmm12 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + movsd %xmm12, 0 * SIZE(CO1) + movsd %xmm13, 0 * SIZE(CO2) + movsd %xmm14, 0 * SIZE(CO1, LDC, 2) + movsd %xmm15, 0 * SIZE(CO2, LDC, 2) + +#if defined(LN) || defined(LT) + movsd %xmm12, -16 * SIZE(B) + movsd %xmm13, -15 * SIZE(B) + movsd %xmm14, -14 * SIZE(B) + movsd %xmm15, -13 * SIZE(B) + + movsd %xmm12, -16 * SIZE(BO) + movsd %xmm12, -15 * SIZE(BO) + movsd %xmm13, -14 * SIZE(BO) + movsd %xmm13, -13 * SIZE(BO) + movsd %xmm14, -12 * SIZE(BO) + movsd %xmm14, -11 * SIZE(BO) + movsd %xmm15, -10 * SIZE(BO) + movsd %xmm15, -9 * SIZE(BO) +#else + movsd %xmm12, -16 * SIZE(AO) + movsd %xmm13, -15 * SIZE(AO) + movsd %xmm14, -14 * SIZE(AO) + movsd %xmm15, -13 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L39: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + + decq J # j -- + jg .L01 + ALIGN_4 + +.L999: + movq %r15, %rsp + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/trsm_kernel_RT_4x4_penryn.S b/kernel/x86_64/trsm_kernel_RT_4x4_penryn.S new file mode 100644 index 0000000000..a575d4cb11 --- /dev/null +++ b/kernel/x86_64/trsm_kernel_RT_4x4_penryn.S @@ -0,0 +1,3426 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define KK %rdx +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define J 56(%rsp) +#define KKK 64(%rsp) +#define AORIG 72(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define J 232(%rsp) +#define KKK 240(%rsp) +#define AORIG 248(%rsp) + +#endif + +#define PREFETCH_R (8 * 4 + 0) +#define PREFETCHSIZE (8 * 21 + 6) +#define PREFETCH prefetcht0 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C +#endif + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + leaq (, LDC, SIZE), LDC + + movq KK, OFFSET + negq KK + +#ifdef LN + leaq (, M, SIZE), %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + leaq (, N, SIZE), %rax + imulq K, %rax + addq %rax, B + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, N + BRANCH + jle .L40 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + + movq C, CO1 +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + NOBRANCH + jle .L100 + ALIGN_4 + +.L91: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax 
+ subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + movaps -14 * SIZE(AO), %xmm1 + movsd -16 * SIZE(BO), %xmm2 + +#ifdef LN + prefetcht0 -4 * SIZE(CO1) +#else + prefetcht0 3 * SIZE(CO1) +#endif + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L95 + ALIGN_4 + +.L92: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x44, %xmm2, %xmm3 + pshufd $0x44, %xmm2, %xmm4 + movsd -15 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm4 + movaps -10 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm8 + addpd %xmm4, %xmm12 + + pshufd $0x44, %xmm2, %xmm3 + pshufd $0x44, %xmm2, %xmm4 + movsd -14 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm4 + movaps -6 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm8 + addpd %xmm4, %xmm12 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + + pshufd $0x44, %xmm2, %xmm3 + pshufd $0x44, %xmm2, %xmm4 + movsd -13 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm4 + movaps -2 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm8 + addpd %xmm4, %xmm12 + + pshufd $0x44, %xmm2, %xmm3 + pshufd $0x44, %xmm2, %xmm4 + movsd -12 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps 0 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm4 + movaps 2 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm8 + addpd %xmm4, %xmm12 + + subq $-16 * SIZE, AO + subq $ -4 * SIZE, BO + subq $1, %rax + BRANCH + jg .L92 + ALIGN_4 + +.L95: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L98 + ALIGN_4 + +.L96: + pshufd $0x44, %xmm2, %xmm3 + pshufd $0x44, %xmm2, %xmm4 + movsd -15 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm4 + movaps -10 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm8 + addpd %xmm4, %xmm12 + + addq $4 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L96 + ALIGN_4 + +.L98: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#endif + + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm10 + movapd -14 * SIZE(BO), %xmm11 + + subpd %xmm8, %xmm10 + subpd %xmm12, %xmm11 +#else + movapd -16 * SIZE(AO), %xmm10 + movapd -14 * SIZE(AO), %xmm11 + + subpd %xmm8, %xmm10 + subpd %xmm12, %xmm11 +#endif + +#ifdef LN + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movapd %xmm11, %xmm9 + unpckhpd %xmm9, %xmm9 + + movsd -1 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm9 + + movsd -2 * SIZE(AO), %xmm13 + mulsd %xmm9, %xmm13 + subsd %xmm13, %xmm11 + movsd -3 * SIZE(AO), %xmm14 + mulsd %xmm9, %xmm14 + subsd %xmm14, %xmm8 + movsd -4 * SIZE(AO), %xmm15 + mulsd %xmm9, %xmm15 + subsd %xmm15, %xmm10 + + movsd -6 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm11 + + movsd -7 * SIZE(AO), %xmm13 + mulsd %xmm11, %xmm13 + subsd %xmm13, %xmm8 + movsd -8 * SIZE(AO), %xmm14 + mulsd %xmm11, %xmm14 + subsd %xmm14, %xmm10 + + movsd -11 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + movsd -12 * SIZE(AO), %xmm13 + mulsd %xmm8, %xmm13 + subsd %xmm13, %xmm10 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + unpcklpd %xmm8, %xmm10 + 
unpcklpd %xmm9, %xmm11 +#endif + +#ifdef LT + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movapd %xmm11, %xmm9 + unpckhpd %xmm9, %xmm9 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + movsd -15 * SIZE(AO), %xmm13 + mulsd %xmm10, %xmm13 + subsd %xmm13, %xmm8 + movsd -14 * SIZE(AO), %xmm14 + mulsd %xmm10, %xmm14 + subsd %xmm14, %xmm11 + movsd -13 * SIZE(AO), %xmm15 + mulsd %xmm10, %xmm15 + subsd %xmm15, %xmm9 + + movsd -11 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + movsd -10 * SIZE(AO), %xmm13 + mulsd %xmm8, %xmm13 + subsd %xmm13, %xmm11 + movsd -9 * SIZE(AO), %xmm14 + mulsd %xmm8, %xmm14 + subsd %xmm14, %xmm9 + + movsd -6 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm11 + + movsd -5 * SIZE(AO), %xmm13 + mulsd %xmm11, %xmm13 + subsd %xmm13, %xmm9 + + movsd -1 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm9 + + unpcklpd %xmm8, %xmm10 + unpcklpd %xmm9, %xmm11 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm10 + mulpd %xmm8, %xmm11 +#endif + +#ifdef RT + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm10 + mulpd %xmm8, %xmm11 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) + movsd %xmm11, 2 * SIZE(CO1) + movhpd %xmm11, 3 * SIZE(CO1) +#else + movsd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) + movsd %xmm11, 2 * SIZE(CO1) + movhpd %xmm11, 3 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm10, -16 * SIZE(BO) + movapd %xmm11, -14 * SIZE(BO) +#else + movapd %xmm10, -16 * SIZE(AO) + movapd %xmm11, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L91 + ALIGN_4 + +.L100: + testq $2, M + BRANCH + jle .L110 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movsd -16 * SIZE(BO), %xmm2 + pxor %xmm9, %xmm9 + movhps -15 * SIZE(BO), %xmm2 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L105 + ALIGN_4 + +.L102: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x44, %xmm2, %xmm3 + movsd -15 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -14 * SIZE(AO), %xmm0 + addpd %xmm3, %xmm8 + + pshufd $0x44, %xmm2, %xmm3 + movsd -14 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -12 * SIZE(AO), %xmm0 + addpd %xmm3, %xmm9 + + pshufd $0x44, %xmm2, %xmm3 + movsd -13 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -10 * SIZE(AO), %xmm0 + addpd %xmm3, %xmm8 + + pshufd $0x44, %xmm2, %xmm3 + movsd -12 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -8 * SIZE(AO), %xmm0 + addpd %xmm3, %xmm9 + + subq $-8 * SIZE, AO + subq $-4 * SIZE, BO + subq $1, %rax + BRANCH + jg .L102 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L108 + ALIGN_4 + +.L106: + pshufd $0x44, %xmm2, %xmm3 + movsd -15 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -14 * SIZE(AO), %xmm0 + addpd %xmm3, 
%xmm8 + + addq $2 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L106 + ALIGN_4 + +.L108: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + + addpd %xmm9, %xmm8 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm10 + subpd %xmm8, %xmm10 +#else + movapd -16 * SIZE(AO), %xmm10 + subpd %xmm8, %xmm10 +#endif + +#ifdef LN + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movsd -13 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + movsd -14 * SIZE(AO), %xmm13 + mulsd %xmm8, %xmm13 + subsd %xmm13, %xmm10 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + unpcklpd %xmm8, %xmm10 +#endif + +#ifdef LT + movapd %xmm10, %xmm8 + unpckhpd %xmm8, %xmm8 + + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 + + movsd -15 * SIZE(AO), %xmm13 + mulsd %xmm10, %xmm13 + subsd %xmm13, %xmm8 + + movsd -13 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm8 + + unpcklpd %xmm8, %xmm10 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm10 +#endif + +#ifdef RT + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm10 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) +#else + movsd %xmm10, 0 * SIZE(CO1) + movhpd %xmm10, 1 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm10, -16 * SIZE(BO) +#else + movapd %xmm10, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L110: + testq $1, M + BRANCH + jle .L119 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movsd -16 * SIZE(AO), %xmm0 + movsd -16 * SIZE(BO), %xmm2 + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L115 + ALIGN_4 + +.L112: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + movsd -15 * SIZE(AO), %xmm0 + movsd -15 * SIZE(BO), %xmm2 + + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + movsd -14 * SIZE(AO), %xmm0 + movsd -14 * SIZE(BO), %xmm2 + + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + movsd -13 * SIZE(AO), %xmm0 + movsd -13 * SIZE(BO), %xmm2 + + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + movsd -12 * SIZE(AO), %xmm0 + movsd -12 * SIZE(BO), %xmm2 + + subq $-4 * SIZE, AO + subq $-4 * SIZE, BO + subq $1, %rax + BRANCH + jg .L112 + ALIGN_4 + +.L115: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + movsd -15 * SIZE(AO), %xmm0 + movsd -15 * SIZE(BO), %xmm2 + + addq $1 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L116 + ALIGN_4 + +.L118: +#if defined(LN) || defined(RT) + movq KK, %rax + subq $1, %rax + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO 
+#endif + + addpd %xmm9, %xmm8 + + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(BO), %xmm10 + subsd %xmm8, %xmm10 +#else + movsd -16 * SIZE(AO), %xmm10 + subsd %xmm8, %xmm10 +#endif + +#ifdef LN + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 +#endif + +#ifdef LT + movsd -16 * SIZE(AO), %xmm12 + mulsd %xmm12, %xmm10 +#endif + +#ifdef RN + movsd -16 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm10 +#endif + +#ifdef RT + movsd -16 * SIZE(BO), %xmm8 + mulsd %xmm8, %xmm10 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm10, 0 * SIZE(CO1) +#else + movsd %xmm10, 0 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movsd %xmm10, -16 * SIZE(BO) +#else + movsd %xmm10, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L119: +#ifdef LN + leaq (B, K, SIZE), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L40: + testq $2, N + BRANCH + jle .L80 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + movq K, %rax + salq $BASE_SHIFT + 1, %rax + movq B, BB + subq %rax, BB + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + NOBRANCH + jle .L60 + ALIGN_4 + +.L51: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + prefetcht2 -16 * SIZE(BB) + subq $-4 * SIZE, BB + + movaps -16 * SIZE(AO), %xmm0 + movaps -14 * SIZE(AO), %xmm1 + movaps -16 * SIZE(BO), %xmm2 + +#ifdef LN + prefetcht0 -4 * SIZE(CO1) + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + prefetcht0 -4 * SIZE(CO2) + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 +#else + prefetcht0 3 * SIZE(CO1) + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + prefetcht0 3 * SIZE(CO2) + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L55 + ALIGN_4 + +.L52: + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm9 + movaps -14 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -6 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm9 + movaps -12 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + + movaps %xmm2, %xmm4 + pshufd 
$0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -2 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm9 + movaps -10 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps 0 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 2 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm9 + movaps -8 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + + subq $-16 * SIZE, AO + subq $ -8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm9 + movaps -14 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L56 + ALIGN_4 + +.L58: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + + movapd %xmm8, %xmm0 + movsd %xmm9, %xmm8 + movsd %xmm0, %xmm9 + + movapd %xmm12, %xmm0 + movsd %xmm13, %xmm12 + movsd %xmm0, %xmm13 + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd %xmm12, %xmm4 + unpcklpd %xmm13, %xmm12 + unpckhpd %xmm13, %xmm4 + + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm13 + movapd -12 * SIZE(BO), %xmm1 + movapd -10 * SIZE(BO), %xmm5 + + subpd %xmm8, %xmm9 + subpd %xmm0, %xmm13 + subpd %xmm12, %xmm1 + subpd %xmm4, %xmm5 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -12 * SIZE(AO), %xmm2 + movapd -10 * SIZE(AO), %xmm3 + + subpd %xmm8, %xmm0 + subpd %xmm12, %xmm1 + subpd %xmm9, %xmm2 + subpd %xmm13, %xmm3 +#endif + +#ifdef LN + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 + movddup -2 * SIZE(AO), %xmm10 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm1 + movddup -3 * SIZE(AO), %xmm12 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm13 + movddup -4 * SIZE(AO), %xmm14 + mulpd %xmm5, %xmm14 + subpd %xmm14, %xmm9 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + movddup -7 * SIZE(AO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm13 + movddup -8 * SIZE(AO), %xmm12 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm9 + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + movddup -12 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + movddup -14 * SIZE(AO), %xmm12 + mulpd %xmm9, %xmm12 + subpd %xmm12, %xmm1 + movddup -13 * SIZE(AO), %xmm14 + mulpd %xmm9, %xmm14 + subpd %xmm14, %xmm5 + + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + + movddup -10 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm1 + movddup -9 * SIZE(AO), %xmm12 + mulpd %xmm13, %xmm12 + subpd %xmm12, %xmm5 + + movddup -6 * SIZE(AO), 
%xmm8 + mulpd %xmm8, %xmm1 + movddup -5 * SIZE(AO), %xmm10 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm5 + + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 + + movddup -15 * SIZE(BO), %xmm9 + movapd %xmm9, %xmm10 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm3 + + movddup -13 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 +#endif + +#ifdef RT + movddup -13 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 + + movddup -14 * SIZE(BO), %xmm9 + movapd %xmm9, %xmm10 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + mulpd %xmm3, %xmm10 + subpd %xmm10, %xmm1 + + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm13, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movsd %xmm5, 3 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + movhpd %xmm1, 2 * SIZE(CO2) + movhpd %xmm5, 3 * SIZE(CO2) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + movsd %xmm3, 2 * SIZE(CO2) + movhpd %xmm3, 3 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) + movapd %xmm13, -14 * SIZE(BO) + movapd %xmm1, -12 * SIZE(BO) + movapd %xmm5, -10 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm1, -14 * SIZE(AO) + movapd %xmm2, -12 * SIZE(AO) + movapd %xmm3, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L51 + ALIGN_4 + +.L60: + testq $2, M + BRANCH + jle .L70 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + movaps -16 * SIZE(BO), %xmm2 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L65 + ALIGN_4 + +.L62: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm9 + addpd %xmm7, %xmm8 + movaps -14 * SIZE(BO), %xmm2 + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -12 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm11 + addpd %xmm7, %xmm10 + movaps -12 * SIZE(BO), %xmm2 + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -10 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm9 + addpd %xmm7, %xmm8 + movaps -10 * SIZE(BO), %xmm2 + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -8 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm11 + addpd %xmm7, %xmm10 + movaps -8 * SIZE(BO), %xmm2 + + subq $-8 * SIZE, AO + subq $-8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L62 + ALIGN_4 + 
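+/* Descriptive comment (added for readability, not in the imported source):      */
+/* .L65 below handles the K & 3 leftover iterations of the 2x2 GEMM update loop, */
+/* then falls through to the triangular solve (LN/LT/RN/RT variants) and the     */
+/* write-back of this 2x2 sub-block of C.                                        */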
+.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm9 + addpd %xmm7, %xmm8 + movaps -14 * SIZE(BO), %xmm2 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L66 + ALIGN_4 + +.L68: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + + movapd %xmm8, %xmm0 + movsd %xmm9, %xmm8 + movsd %xmm0, %xmm9 + + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm13 + + subpd %xmm8, %xmm9 + subpd %xmm0, %xmm13 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm2 + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm2 +#endif + +#ifdef LN + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + + movddup -14 * SIZE(AO), %xmm10 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + + movddup -15 * SIZE(AO), %xmm10 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + + movddup -13 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 +#endif + +#ifdef RT + movddup -13 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + + movddup -14 * SIZE(BO), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm13, 1 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) + movapd %xmm13, -14 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm2, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L70: + testq $1, M + BRANCH + jle .L79 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movsd -16 * SIZE(AO), %xmm0 + movaps -16 * SIZE(BO), %xmm2 + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L75 + ALIGN_4 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + movsd -15 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm8 + 
movaps -14 * SIZE(BO), %xmm2 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + movsd -14 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm9 + movaps -12 * SIZE(BO), %xmm2 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + movsd -13 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm8 + movaps -10 * SIZE(BO), %xmm2 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + movsd -12 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm9 + movaps -8 * SIZE(BO), %xmm2 + + subq $-4 * SIZE, AO + subq $-8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + movsd -15 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm8 + movaps -14 * SIZE(BO), %xmm2 + + addq $1 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L76 + ALIGN_4 + +.L78: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + + addpd %xmm9, %xmm8 + movhlps %xmm8, %xmm9 + +#if defined(LN) || defined(LT) + movsd -16 * SIZE(BO), %xmm12 + movsd -15 * SIZE(BO), %xmm13 +#else + movsd -16 * SIZE(AO), %xmm12 + movsd -15 * SIZE(AO), %xmm13 +#endif + + subsd %xmm8, %xmm12 + subsd %xmm9, %xmm13 + +#ifdef LN + movsd -16 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm12 + mulsd %xmm8, %xmm13 +#endif + +#ifdef LT + movsd -16 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm12 + mulsd %xmm8, %xmm13 +#endif + +#ifdef RN + mulsd -16 * SIZE(BO), %xmm12 + movsd -15 * SIZE(BO), %xmm9 + mulsd %xmm12, %xmm9 + subsd %xmm9, %xmm13 + + mulsd -13 * SIZE(BO), %xmm13 +#endif + +#ifdef RT + mulsd -13 * SIZE(BO), %xmm13 + + movlpd -14 * SIZE(BO), %xmm9 + mulsd %xmm13, %xmm9 + subsd %xmm9, %xmm12 + + mulsd -16 * SIZE(BO), %xmm12 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + movsd %xmm12, 0 * SIZE(CO1) + movsd %xmm13, 0 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movsd %xmm12, -16 * SIZE(BO) + movsd %xmm13, -15 * SIZE(BO) +#else + movsd %xmm12, -16 * SIZE(AO) + movsd %xmm13, -15 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L79: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L80: + movq N, J + sarq $2, J + NOBRANCH + jle .L999 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 4), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + movq K, %rax + salq $BASE_SHIFT + 2, %rax + movq B, BB + subq %rax, BB + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + NOBRANCH + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || 
defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + prefetcht2 -16 * SIZE(BB) + subq $-8 * SIZE, BB + + movaps -16 * SIZE(AO), %xmm0 + pxor %xmm3, %xmm3 + movaps -14 * SIZE(AO), %xmm1 + pxor %xmm4, %xmm4 + movaps -16 * SIZE(BO), %xmm2 + + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + +#ifdef LN + prefetcht0 -4 * SIZE(CO1) + movapd %xmm4, %xmm8 + movapd %xmm4, %xmm9 + prefetcht0 -4 * SIZE(CO2) + movapd %xmm4, %xmm10 + movapd %xmm4, %xmm11 + + prefetcht0 -4 * SIZE(CO1, LDC, 2) + movapd %xmm4, %xmm12 + movapd %xmm4, %xmm13 + prefetcht0 -4 * SIZE(CO2, LDC, 2) + movapd %xmm4, %xmm14 + movapd %xmm4, %xmm15 +#else + prefetcht0 3 * SIZE(CO1) + movapd %xmm4, %xmm8 + movapd %xmm4, %xmm9 + prefetcht0 3 * SIZE(CO2) + movapd %xmm4, %xmm10 + movapd %xmm4, %xmm11 + + prefetcht0 3 * SIZE(CO1, LDC, 2) + movapd %xmm4, %xmm12 + movapd %xmm4, %xmm13 + prefetcht0 3 * SIZE(CO2, LDC, 2) + movapd %xmm4, %xmm14 + movapd %xmm4, %xmm15 +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + addpd %xmm3, %xmm11 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movaps -14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps -12 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm11 + movaps -10 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps -8 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -6 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm11 + movaps -6 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps -4 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -2 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm11 + movaps -2 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps 0 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, 
%xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps 0 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 2 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm11 + movaps 2 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps 4 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps 4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 6 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm11 + movaps 6 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps 8 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps 8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 10 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm11 + movaps 10 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + PREFETCH (PREFETCHSIZE + 24) * SIZE(AO) + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps 12 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps 12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 14 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm11 + movaps 14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps 16 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + subq $-32 * SIZE, AO + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -16 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -14 * SIZE(AO), %xmm1 + + subq $-32 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + addpd %xmm3, %xmm11 + movaps -14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps -12 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -12 * SIZE(AO), %xmm0 + mulpd 
%xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_3 + +.L18: +#if defined(LN) || defined(RT) + movq KK, %rax + subq $4, %rax + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#endif + + addpd %xmm3, %xmm11 + addpd %xmm4, %xmm15 + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + + movapd %xmm8, %xmm0 + movsd %xmm9, %xmm8 + movsd %xmm0, %xmm9 + + movapd %xmm10, %xmm0 + movsd %xmm11, %xmm10 + movsd %xmm0, %xmm11 + + movapd %xmm12, %xmm0 + movsd %xmm13, %xmm12 + movsd %xmm0, %xmm13 + + movapd %xmm14, %xmm0 + movsd %xmm15, %xmm14 + movsd %xmm0, %xmm15 + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd %xmm10, %xmm2 + unpcklpd %xmm11, %xmm10 + unpckhpd %xmm11, %xmm2 + + movapd %xmm12, %xmm4 + unpcklpd %xmm13, %xmm12 + unpckhpd %xmm13, %xmm4 + + movapd %xmm14, %xmm6 + unpcklpd %xmm15, %xmm14 + unpckhpd %xmm15, %xmm6 + + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm11 + movapd -12 * SIZE(BO), %xmm13 + movapd -10 * SIZE(BO), %xmm15 + movapd -8 * SIZE(BO), %xmm1 + movapd -6 * SIZE(BO), %xmm3 + movapd -4 * SIZE(BO), %xmm5 + movapd -2 * SIZE(BO), %xmm7 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm0, %xmm13 + subpd %xmm2, %xmm15 + subpd %xmm12, %xmm1 + subpd %xmm14, %xmm3 + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -12 * SIZE(AO), %xmm2 + movapd -10 * SIZE(AO), %xmm3 + + movapd -8 * SIZE(AO), %xmm4 + movapd -6 * SIZE(AO), %xmm5 + movapd -4 * SIZE(AO), %xmm6 + movapd -2 * SIZE(AO), %xmm7 + + subpd %xmm8, %xmm0 + subpd %xmm12, %xmm1 + subpd %xmm9, %xmm2 + subpd %xmm13, %xmm3 + subpd %xmm10, %xmm4 + subpd %xmm14, %xmm5 + subpd %xmm11, %xmm6 + subpd %xmm15, %xmm7 +#endif + +#ifdef LN + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 + mulpd %xmm8, %xmm7 + + movddup -2 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm1 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm3 + + movddup -3 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm13 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm15 + + movddup -4 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm5, %xmm10 + subpd %xmm10, %xmm9 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm11 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 + + movddup -7 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm13 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm15 + + movddup -8 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm9 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm11 + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 + + movddup -12 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + mulpd %xmm15, %xmm12 + subpd %xmm12, %xmm11 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 + + movddup -15 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + mulpd %xmm11, %xmm12 + subpd %xmm12, %xmm15 + + movddup -14 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm1 + mulpd %xmm11, %xmm12 + subpd %xmm12, %xmm3 + + movddup -13 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm9, %xmm10 + 
subpd %xmm10, %xmm5 + mulpd %xmm11, %xmm12 + subpd %xmm12, %xmm7 + + movddup -11 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 + + movddup -10 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm1 + mulpd %xmm15, %xmm12 + subpd %xmm12, %xmm3 + + movddup -9 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm5 + mulpd %xmm15, %xmm12 + subpd %xmm12, %xmm7 + + movddup -6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm1 + mulpd %xmm8, %xmm3 + + movddup -5 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm1, %xmm10 + subpd %xmm10, %xmm5 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm7 + + movddup -1 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm5 + mulpd %xmm8, %xmm7 +#endif + + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 + + movddup -15 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm2 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm3 + + movddup -14 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm4 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm5 + + movddup -13 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm6 + mulpd %xmm1, %xmm12 + subpd %xmm12, %xmm7 + + movddup -11 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 + + movddup -10 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm4 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm5 + + movddup -9 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm6 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm7 + + movddup -6 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm4 + mulpd %xmm8, %xmm5 + + movddup -5 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm6 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm7 + + movddup -1 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm6 + mulpd %xmm8, %xmm7 +#endif + +#ifdef RT + movddup -1 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm6 + mulpd %xmm8, %xmm7 + + movddup -2 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm4 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm5 + + movddup -3 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm2 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm3 + + movddup -4 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm0 + mulpd %xmm7, %xmm12 + subpd %xmm12, %xmm1 + + movddup -6 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm4 + mulpd %xmm8, %xmm5 + + movddup -7 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm2 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm3 + + movddup -8 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm0 + mulpd %xmm5, %xmm12 + subpd %xmm12, %xmm1 + + movddup -11 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + mulpd %xmm8, %xmm3 + + movddup -12 * SIZE(BO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm0 + mulpd %xmm3, %xmm12 + subpd %xmm12, %xmm1 + + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + mulpd %xmm8, %xmm1 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm13, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movsd %xmm5, 3 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + movhpd %xmm1, 2 * SIZE(CO2) + movhpd %xmm5, 3 * SIZE(CO2) + + movsd %xmm11, 0 * SIZE(CO1, LDC, 2) + movsd %xmm15, 1 * SIZE(CO1, 
LDC, 2) + movsd %xmm3, 2 * SIZE(CO1, LDC, 2) + movsd %xmm7, 3 * SIZE(CO1, LDC, 2) + + movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) + movhpd %xmm3, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + movsd %xmm3, 2 * SIZE(CO2) + movhpd %xmm3, 3 * SIZE(CO2) + + movsd %xmm4, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) + movsd %xmm5, 2 * SIZE(CO1, LDC, 2) + movhpd %xmm5, 3 * SIZE(CO1, LDC, 2) + + movsd %xmm6, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) + movsd %xmm7, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) + movapd %xmm11, -14 * SIZE(BO) + movapd %xmm13, -12 * SIZE(BO) + movapd %xmm15, -10 * SIZE(BO) + movapd %xmm1, -8 * SIZE(BO) + movapd %xmm3, -6 * SIZE(BO) + movapd %xmm5, -4 * SIZE(BO) + movapd %xmm7, -2 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm1, -14 * SIZE(AO) + movapd %xmm2, -12 * SIZE(AO) + movapd %xmm3, -10 * SIZE(AO) + movapd %xmm4, -8 * SIZE(AO) + movapd %xmm5, -6 * SIZE(AO) + movapd %xmm6, -4 * SIZE(AO) + movapd %xmm7, -2 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + BRANCH + jg .L11 + ALIGN_4 + +.L20: + testq $2, M + BRANCH + jle .L30 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + pxor %xmm3, %xmm3 + movaps -16 * SIZE(BO), %xmm2 + pxor %xmm5, %xmm5 + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_4 + +.L22: + addpd %xmm3, %xmm11 + movaps -14 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd %xmm0, %xmm2 + addpd %xmm5, %xmm10 + mulpd %xmm0, %xmm7 + + addpd %xmm2, %xmm9 + movaps -12 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + addpd %xmm7, %xmm8 + mulpd %xmm0, %xmm5 + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm3, %xmm11 + movaps -10 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + addpd %xmm5, %xmm10 + mulpd %xmm0, %xmm7 + + addpd %xmm2, %xmm9 + movaps -8 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + addpd %xmm7, %xmm8 + mulpd %xmm0, %xmm5 + movaps -12 * SIZE(AO), %xmm0 + + addpd %xmm3, %xmm11 + movaps -6 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + addpd %xmm5, %xmm10 + mulpd %xmm0, %xmm7 + + addpd %xmm2, %xmm9 + movaps -4 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + addpd %xmm7, %xmm8 + mulpd %xmm0, %xmm5 + movaps -10 * SIZE(AO), %xmm0 + + addpd %xmm3, %xmm11 + movaps -2 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + addpd %xmm5, %xmm10 + mulpd 
%xmm0, %xmm7 + subq $ -8 * SIZE, AO + + addpd %xmm2, %xmm9 + movaps 0 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + addpd %xmm7, %xmm8 + mulpd %xmm0, %xmm5 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, BO + subq $1, %rax + BRANCH + jg .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + addpd %xmm3, %xmm11 + movaps -14 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + addpd %xmm5, %xmm10 + mulpd %xmm0, %xmm7 + + addpd %xmm2, %xmm9 + movaps -12 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + addpd %xmm7, %xmm8 + mulpd %xmm0, %xmm5 + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_4 + +.L28: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + + addpd %xmm3, %xmm11 + addpd %xmm5, %xmm10 + + movapd %xmm8, %xmm0 + movsd %xmm9, %xmm8 + movsd %xmm0, %xmm9 + + movapd %xmm10, %xmm0 + movsd %xmm11, %xmm10 + movsd %xmm0, %xmm11 + +#if defined(LN) || defined(LT) + movapd %xmm8, %xmm0 + unpcklpd %xmm9, %xmm8 + unpckhpd %xmm9, %xmm0 + + movapd %xmm10, %xmm2 + unpcklpd %xmm11, %xmm10 + unpckhpd %xmm11, %xmm2 + + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm11 + movapd -12 * SIZE(BO), %xmm13 + movapd -10 * SIZE(BO), %xmm15 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm0, %xmm13 + subpd %xmm2, %xmm15 +#else + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm2 + movapd -12 * SIZE(AO), %xmm4 + movapd -10 * SIZE(AO), %xmm6 + + subpd %xmm8, %xmm0 + subpd %xmm9, %xmm2 + subpd %xmm10, %xmm4 + subpd %xmm11, %xmm6 +#endif + +#ifdef LN + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 + + movddup -14 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm13, %xmm10 + subpd %xmm10, %xmm9 + mulpd %xmm15, %xmm12 + subpd %xmm12, %xmm11 + + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd %xmm8, %xmm11 + + movddup -15 * SIZE(AO), %xmm10 + movapd %xmm10, %xmm12 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm13 + mulpd %xmm11, %xmm12 + subpd %xmm12, %xmm15 + + movddup -13 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm13 + mulpd %xmm8, %xmm15 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 + + movddup -15 * SIZE(BO), %xmm9 + mulpd %xmm0, %xmm9 + subpd %xmm9, %xmm2 + movddup -14 * SIZE(BO), %xmm10 + mulpd %xmm0, %xmm10 + subpd %xmm10, %xmm4 + movddup -13 * SIZE(BO), %xmm11 + mulpd %xmm0, %xmm11 + subpd %xmm11, %xmm6 + + movddup -11 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + movddup -10 * SIZE(BO), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm4 + movddup -9 * SIZE(BO), %xmm10 + mulpd %xmm2, %xmm10 + subpd %xmm10, %xmm6 + + movddup -6 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm4 + + movddup -5 * SIZE(BO), %xmm9 + mulpd %xmm4, %xmm9 + subpd %xmm9, %xmm6 + + movddup -1 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm6 +#endif + +#ifdef RT + movddup -1 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm6 + + movddup -2 * SIZE(BO), %xmm9 + mulpd %xmm6, %xmm9 + subpd %xmm9, %xmm4 + movddup -3 * SIZE(BO), %xmm10 + mulpd %xmm6, %xmm10 + subpd %xmm10, %xmm2 + movddup -4 * SIZE(BO), %xmm11 + mulpd %xmm6, %xmm11 + subpd %xmm11, %xmm0 + + movddup -6 * SIZE(BO), 
%xmm8 + mulpd %xmm8, %xmm4 + movddup -7 * SIZE(BO), %xmm9 + mulpd %xmm4, %xmm9 + subpd %xmm9, %xmm2 + movddup -8 * SIZE(BO), %xmm10 + mulpd %xmm4, %xmm10 + subpd %xmm10, %xmm0 + + movddup -11 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm2 + movddup -12 * SIZE(BO), %xmm9 + mulpd %xmm2, %xmm9 + subpd %xmm9, %xmm0 + + movddup -16 * SIZE(BO), %xmm8 + mulpd %xmm8, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm13, 1 * SIZE(CO1) + + movhpd %xmm9, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + + movsd %xmm11, 0 * SIZE(CO1, LDC, 2) + movsd %xmm15, 1 * SIZE(CO1, LDC, 2) + + movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) +#else + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + + movsd %xmm4, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) + + movsd %xmm6, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) + movapd %xmm11, -14 * SIZE(BO) + movapd %xmm13, -12 * SIZE(BO) + movapd %xmm15, -10 * SIZE(BO) +#else + movapd %xmm0, -16 * SIZE(AO) + movapd %xmm2, -14 * SIZE(AO) + movapd %xmm4, -12 * SIZE(AO) + movapd %xmm6, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + testq $1, M + BRANCH + jle .L39 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + movsd -16 * SIZE(AO), %xmm0 + movaps -16 * SIZE(BO), %xmm2 + movaps -14 * SIZE(BO), %xmm3 + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movsd -15 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm8 + movaps -12 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm9 + movaps -10 * SIZE(BO), %xmm3 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movsd -14 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm10 + movaps -8 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm11 + movaps -6 * SIZE(BO), %xmm3 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movsd -13 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm8 + movaps -4 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm9 + movaps -2 * SIZE(BO), %xmm3 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movsd -12 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm10 + movaps 0 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm11 + movaps 2 * SIZE(BO), %xmm3 + + subq $ -4 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + BRANCH + jg .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, 
%xmm3 + movsd -15 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm8 + movaps -12 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm9 + movaps -10 * SIZE(BO), %xmm3 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#endif + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#if defined(LN) || defined(LT) + movaps -16 * SIZE(BO), %xmm12 + movaps -14 * SIZE(BO), %xmm13 +#else + movaps -16 * SIZE(AO), %xmm12 + movaps -14 * SIZE(AO), %xmm13 +#endif + + subpd %xmm8, %xmm12 + subpd %xmm9, %xmm13 + +#if defined(RN) || defined(RT) + movhlps %xmm13, %xmm15 + movsd %xmm13, %xmm14 + movhlps %xmm12, %xmm13 + movsd %xmm12, %xmm12 +#endif + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm8 + + mulpd %xmm8, %xmm12 + mulpd %xmm8, %xmm13 +#endif + +#ifdef RN + mulsd -16 * SIZE(BO), %xmm12 + movlpd -15 * SIZE(BO), %xmm9 + mulsd %xmm12, %xmm9 + subsd %xmm9, %xmm13 + movlpd -14 * SIZE(BO), %xmm10 + mulsd %xmm12, %xmm10 + subsd %xmm10, %xmm14 + movlpd -13 * SIZE(BO), %xmm11 + mulsd %xmm12, %xmm11 + subsd %xmm11, %xmm15 + + mulsd -11 * SIZE(BO), %xmm13 + movlpd -10 * SIZE(BO), %xmm9 + mulsd %xmm13, %xmm9 + subsd %xmm9, %xmm14 + movlpd -9 * SIZE(BO), %xmm10 + mulsd %xmm13, %xmm10 + subsd %xmm10, %xmm15 + + mulsd -6 * SIZE(BO), %xmm14 + movlpd -5 * SIZE(BO), %xmm9 + mulsd %xmm14, %xmm9 + subsd %xmm9, %xmm15 + + mulsd -1 * SIZE(BO), %xmm15 +#endif + +#ifdef RT + mulsd -1 * SIZE(BO), %xmm15 + + movlpd -2 * SIZE(BO), %xmm9 + mulsd %xmm15, %xmm9 + subsd %xmm9, %xmm14 + movlpd -3 * SIZE(BO), %xmm10 + mulsd %xmm15, %xmm10 + subsd %xmm10, %xmm13 + movlpd -4 * SIZE(BO), %xmm11 + mulsd %xmm15, %xmm11 + subsd %xmm11, %xmm12 + + mulsd -6 * SIZE(BO), %xmm14 + + movlpd -7 * SIZE(BO), %xmm9 + mulsd %xmm14, %xmm9 + subsd %xmm9, %xmm13 + movlpd -8 * SIZE(BO), %xmm10 + mulsd %xmm14, %xmm10 + subsd %xmm10, %xmm12 + + mulsd -11 * SIZE(BO), %xmm13 + + movlpd -12 * SIZE(BO), %xmm9 + mulsd %xmm13, %xmm9 + subsd %xmm9, %xmm12 + + mulsd -16 * SIZE(BO), %xmm12 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm12, 0 * SIZE(CO1) + movhps %xmm12, 0 * SIZE(CO2) + movsd %xmm13, 0 * SIZE(CO1, LDC, 2) + movhps %xmm13, 0 * SIZE(CO2, LDC, 2) + + movaps %xmm12, -16 * SIZE(BO) + movaps %xmm13, -14 * SIZE(BO) +#else + movsd %xmm12, 0 * SIZE(CO1) + movsd %xmm13, 0 * SIZE(CO2) + movsd %xmm14, 0 * SIZE(CO1, LDC, 2) + movsd %xmm15, 0 * SIZE(CO2, LDC, 2) + + movsd %xmm12, -16 * SIZE(AO) + movsd %xmm13, -15 * SIZE(AO) + movsd %xmm14, -14 * SIZE(AO) + movsd %xmm15, -13 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L39: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + + subq $1, J + BRANCH + jg .L01 + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI 
+ movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/trsm_kernel_RT_4x4_sse2.S b/kernel/x86_64/trsm_kernel_RT_4x4_sse2.S new file mode 100644 index 0000000000..07c978ee97 --- /dev/null +++ b/kernel/x86_64/trsm_kernel_RT_4x4_sse2.S @@ -0,0 +1,4134 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %rdi +#define N %rsi +#define K %rdx +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define J %r12 +#define AO %r13 +#define BO %r14 +#define CO1 %r15 +#define CO2 %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define ALPHA 0(%rsp) +#define OFFSET 16(%rsp) +#define KK 24(%rsp) +#define KKK 32(%rsp) +#define AORIG 40(%rsp) +#define BORIG 48(%rsp) +#define BUFFER 128(%rsp) + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHNTA prefetchnta +#ifndef ALLOC_HUGETLB +#define PREFETCHSIZE (8 * 4 + 4) +#else +#define PREFETCHSIZE (8 * 2 + 4) +#endif +#endif + +#ifdef GENERIC +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHNTA prefetchnta +#define PREFETCHSIZE (8 * 4 + 4) +#endif + +#ifdef OPTERON +#define movsd movlpd +#endif + +#define KERNEL1(xx) \ + mulpd %xmm8, %xmm9 ;\ + addpd %xmm9, %xmm0 ;\ + movapd 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulpd %xmm8, %xmm11 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\ + addpd %xmm11, %xmm1 ;\ + movapd 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm8, %xmm13 ;\ + mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\ + addpd %xmm13, %xmm2 ;\ + movapd 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm8, %xmm3 ;\ + movapd 8 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 + +#define KERNEL2(xx) \ + mulpd %xmm10, %xmm9 ;\ + addpd %xmm9, %xmm4 ;\ + movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulpd %xmm10, %xmm11 ;\ + addpd %xmm11, %xmm5 ;\ + movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm10, %xmm13 ;\ + mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ + addpd %xmm13, %xmm6 ;\ + movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm10, %xmm7 ;\ + movapd 10 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 + +#define KERNEL3(xx) \ + mulpd %xmm12, %xmm15 ;\ + addpd %xmm15, %xmm0 ;\ + movapd 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulpd %xmm12, %xmm11 ;\ + addpd %xmm11, %xmm1 ;\ + movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm12, %xmm13 ;\ + mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\ + addpd %xmm13, %xmm2 ;\ + movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm12, %xmm3 ;\ + movapd 12 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 + +#define KERNEL4(xx) \ + mulpd %xmm14, %xmm15 ;\ + addpd %xmm15, %xmm4 ;\ + movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulpd %xmm14, %xmm11 ;\ + addpd %xmm11, %xmm5 ;\ + movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm14, %xmm13 ;\ + mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ + addpd %xmm13, %xmm6 ;\ + movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm14, %xmm7 ;\ + movapd 14 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 + +#define KERNEL5(xx) \ + mulpd %xmm8, %xmm9 ;\ + addpd %xmm9, %xmm0 ;\ + movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulpd %xmm8, %xmm11 ;\ + PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ;\ + addpd %xmm11, %xmm1 ;\ + movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm8, %xmm13 ;\ + mulpd 22 * SIZE + 2 * (xx) 
* SIZE(BO), %xmm8 ;\ + addpd %xmm13, %xmm2 ;\ + movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm8, %xmm3 ;\ + movapd 16 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 + +#define KERNEL6(xx) \ + mulpd %xmm10, %xmm9 ;\ + addpd %xmm9, %xmm4 ;\ + movapd 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulpd %xmm10, %xmm11 ;\ + addpd %xmm11, %xmm5 ;\ + movapd 26 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm10, %xmm13 ;\ + mulpd 22 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ + addpd %xmm13, %xmm6 ;\ + movapd 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm10, %xmm7 ;\ + movapd 18 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 + +#define KERNEL7(xx) \ + mulpd %xmm12, %xmm15 ;\ + addpd %xmm15, %xmm0 ;\ + movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulpd %xmm12, %xmm11 ;\ + addpd %xmm11, %xmm1 ;\ + movapd 26 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm12, %xmm13 ;\ + mulpd 30 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\ + addpd %xmm13, %xmm2 ;\ + movapd 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm12, %xmm3 ;\ + movapd 20 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 + +#define KERNEL8(xx) \ + mulpd %xmm14, %xmm15 ;\ + addpd %xmm15, %xmm4 ;\ + movapd 40 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulpd %xmm14, %xmm11 ;\ + addpd %xmm11, %xmm5 ;\ + movapd 34 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm14, %xmm13 ;\ + mulpd 30 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ + addpd %xmm13, %xmm6 ;\ + movapd 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm14, %xmm7 ;\ + movapd 22 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, M + movq ARG2, N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + movsd OLD_OFFSET, %xmm4 + + movaps %xmm3, %xmm0 + +#else + movq OLD_LDC, LDC + movsd OLD_OFFSET, %xmm4 + +#endif + + movq %rsp, %rbx # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movsd %xmm4, OFFSET + movsd %xmm4, KK + + leaq (, LDC, SIZE), LDC + +#ifdef LN + leaq (, M, SIZE), %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + leaq (, N, SIZE), %rax + imulq K, %rax + addq %rax, B + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, N + je .L40 + ALIGN_4 + +.L81: +/* Copying to Sub Buffer */ + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + leaq (, %rax, SIZE), %rax + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + jle .L83 + ALIGN_4 + +.L82: + PREFETCH 56 * SIZE(B) + + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm2 + movsd 3 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), 
%xmm4 + movsd 5 * SIZE(B), %xmm5 + movsd 6 * SIZE(B), %xmm6 + movsd 7 * SIZE(B), %xmm7 + + addq $ 8 * SIZE, B + addq $16 * SIZE, BO + + movsd %xmm0, -16 * SIZE(BO) + movsd %xmm0, -15 * SIZE(BO) + movsd %xmm1, -14 * SIZE(BO) + movsd %xmm1, -13 * SIZE(BO) + movsd %xmm2, -12 * SIZE(BO) + movsd %xmm2, -11 * SIZE(BO) + movsd %xmm3, -10 * SIZE(BO) + movsd %xmm3, -9 * SIZE(BO) + movsd %xmm4, -8 * SIZE(BO) + movsd %xmm4, -7 * SIZE(BO) + movsd %xmm5, -6 * SIZE(BO) + movsd %xmm5, -5 * SIZE(BO) + movsd %xmm6, -4 * SIZE(BO) + movsd %xmm6, -3 * SIZE(BO) + movsd %xmm7, -2 * SIZE(BO) + movsd %xmm7, -1 * SIZE(BO) + + decq %rax + jne .L82 + ALIGN_4 + +.L83: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax + BRANCH + jle .L90 + ALIGN_4 + +.L84: + movsd 0 * SIZE(B), %xmm0 + + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm0, 1 * SIZE(BO) + + addq $1 * SIZE, B + addq $2 * SIZE, BO + decq %rax + jne .L84 + ALIGN_4 + +.L90: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + subq LDC, C +#endif + + movq C, CO1 # coffset1 = c +#ifndef RT + addq LDC, C +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L100 + ALIGN_4 + +.L91: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movapd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + + movapd 16 * SIZE(AO), %xmm12 + movapd 24 * SIZE(AO), %xmm14 + + PREFETCHW 4 * SIZE(CO1) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L95 + ALIGN_4 + +.L92: + mulpd %xmm9, %xmm8 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd 2 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm0 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm8 + mulpd 6 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm2 + movapd 32 * SIZE(AO), %xmm8 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + addpd %xmm9, %xmm3 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm10 + mulpd 10 * SIZE(AO), %xmm9 + addpd %xmm10, %xmm0 + movapd 12 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movapd 6 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm10 + mulpd 14 * SIZE(AO), %xmm9 + addpd %xmm10, %xmm2 + movapd 40 * SIZE(AO), %xmm10 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + addpd %xmm9, %xmm3 + movapd 16 * SIZE(BO), %xmm9 + mulpd %xmm11, %xmm12 + mulpd 18 * SIZE(AO), %xmm11 + addpd %xmm12, %xmm0 + movapd 20 * SIZE(AO), %xmm12 + addpd %xmm11, %xmm1 + movapd 10 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm12 + mulpd 22 * SIZE(AO), %xmm11 + addpd %xmm12, %xmm2 + movapd 48 * SIZE(AO), %xmm12 + PREFETCH (PREFETCHSIZE + 24) * SIZE(AO) + addpd %xmm11, %xmm3 + movapd 12 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm14 + mulpd 26 * SIZE(AO), %xmm11 + addpd %xmm14, %xmm0 + movapd 28 * SIZE(AO), %xmm14 + addpd %xmm11, %xmm1 + movapd 14 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm14 + mulpd 30 * SIZE(AO), %xmm11 + addpd %xmm14, %xmm2 + movapd 56 * SIZE(AO), %xmm14 + addpd %xmm11, %xmm3 + movapd 24 * SIZE(BO), %xmm11 + + addq $32 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L92 + ALIGN_4 + +.L95: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, 
%rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L99 + ALIGN_4 + +.L96: + mulpd %xmm9, %xmm8 + mulpd 2 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm0 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movapd 2 * SIZE(BO), %xmm9 + + addq $4 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L96 + ALIGN_4 + +.L99: + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm2 + movapd 2 * SIZE(B), %xmm3 + + subpd %xmm0, %xmm2 + subpd %xmm1, %xmm3 +#else + movapd 0 * SIZE(AO), %xmm2 + movapd 2 * SIZE(AO), %xmm3 + + subpd %xmm0, %xmm2 + subpd %xmm1, %xmm3 +#endif + +#ifdef LN + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movapd %xmm3, %xmm1 + unpckhpd %xmm1, %xmm1 + + movsd 15 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm1 + + movsd 14 * SIZE(AO), %xmm5 + mulsd %xmm1, %xmm5 + subsd %xmm5, %xmm3 + movsd 13 * SIZE(AO), %xmm6 + mulsd %xmm1, %xmm6 + subsd %xmm6, %xmm0 + movsd 12 * SIZE(AO), %xmm7 + mulsd %xmm1, %xmm7 + subsd %xmm7, %xmm2 + + movsd 10 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm3 + + movsd 9 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm0 + movsd 8 * SIZE(AO), %xmm6 + mulsd %xmm3, %xmm6 + subsd %xmm6, %xmm2 + + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 4 * SIZE(AO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm3 +#endif + +#ifdef LT + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movapd %xmm3, %xmm1 + unpckhpd %xmm1, %xmm1 + + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 + + movsd 1 * SIZE(AO), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + movsd 2 * SIZE(AO), %xmm6 + mulsd %xmm2, %xmm6 + subsd %xmm6, %xmm3 + movsd 3 * SIZE(AO), %xmm7 + mulsd %xmm2, %xmm7 + subsd %xmm7, %xmm1 + + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm3 + movsd 7 * SIZE(AO), %xmm6 + mulsd %xmm0, %xmm6 + subsd %xmm6, %xmm1 + + movsd 10 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm3 + + movsd 11 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm1 + + movsd 15 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm1 + + unpcklpd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm3 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 +#endif + +#ifdef RT + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 1 * SIZE(CO1) + movsd %xmm3, 2 * SIZE(CO1) + movhpd %xmm3, 3 * SIZE(CO1) +#else + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 1 * SIZE(CO1) + movsd %xmm3, 2 * SIZE(CO1) + movhpd %xmm3, 3 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(B) + movapd %xmm3, 2 * SIZE(B) + + movlpd %xmm2, 0 * SIZE(BO) + movlpd %xmm2, 1 * SIZE(BO) + movhpd %xmm2, 2 * SIZE(BO) + movhpd %xmm2, 3 * SIZE(BO) + movlpd %xmm3, 4 * SIZE(BO) + movlpd %xmm3, 5 * SIZE(BO) + movhpd %xmm3, 6 * SIZE(BO) + movhpd %xmm3, 7 * SIZE(BO) +#else + movapd %xmm2, 0 * SIZE(AO) + movapd %xmm3, 2 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || 
defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L91 + ALIGN_4 + +.L100: + testq $2, M + je .L110 + ALIGN_4 + +.L101: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movapd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L105 + ALIGN_4 + +.L102: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movapd 2 * SIZE(AO), %xmm8 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm0 + movapd 16 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm1 + movapd 4 * SIZE(AO), %xmm8 + mulpd 4 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm2 + movapd 6 * SIZE(AO), %xmm8 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm3 + movapd 16 * SIZE(AO), %xmm8 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd %xmm10, %xmm11 + movapd 10 * SIZE(AO), %xmm10 + mulpd 10 * SIZE(BO), %xmm10 + addpd %xmm11, %xmm0 + movapd 24 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm1 + movapd 12 * SIZE(AO), %xmm10 + mulpd 12 * SIZE(BO), %xmm10 + addpd %xmm10, %xmm2 + movapd 14 * SIZE(AO), %xmm10 + mulpd 14 * SIZE(BO), %xmm10 + addpd %xmm10, %xmm3 + movapd 24 * SIZE(AO), %xmm10 + + addq $16 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L102 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L109 + ALIGN_4 + +.L106: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movapd 2 * SIZE(AO), %xmm8 + movapd 2 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L106 + ALIGN_4 + +.L109: + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm2 + subpd %xmm0, %xmm2 +#else + movapd 0 * SIZE(AO), %xmm2 + subpd %xmm0, %xmm2 +#endif + +#ifdef LN + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movsd 3 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 2 * SIZE(AO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm0, %xmm2 +#endif + +#ifdef LT + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 + + movsd 1 * SIZE(AO), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + + movsd 3 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm0 + + unpcklpd %xmm0, %xmm2 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef RT + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, 
%xmm2 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 1 * SIZE(CO1) +#else + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 1 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(B) + + movlpd %xmm2, 0 * SIZE(BO) + movlpd %xmm2, 1 * SIZE(BO) + movhpd %xmm2, 2 * SIZE(BO) + movhpd %xmm2, 3 * SIZE(BO) +#else + movapd %xmm2, 0 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $2 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L110: + testq $1, M + je .L119 + ALIGN_4 + +.L111: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movsd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movsd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movsd 4 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movsd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L115 + ALIGN_4 + +.L112: + mulsd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movsd 1 * SIZE(AO), %xmm8 + addsd %xmm9, %xmm0 + movsd 16 * SIZE(BO), %xmm9 + mulsd 2 * SIZE(BO), %xmm8 + addsd %xmm8, %xmm1 + movsd 2 * SIZE(AO), %xmm8 + mulsd 4 * SIZE(BO), %xmm8 + addsd %xmm8, %xmm2 + movsd 3 * SIZE(AO), %xmm8 + mulsd 6 * SIZE(BO), %xmm8 + addsd %xmm8, %xmm3 + movsd 8 * SIZE(AO), %xmm8 + mulsd %xmm10, %xmm11 + movsd 5 * SIZE(AO), %xmm10 + addsd %xmm11, %xmm0 + movsd 24 * SIZE(BO), %xmm11 + mulsd 10 * SIZE(BO), %xmm10 + addsd %xmm10, %xmm1 + movsd 6 * SIZE(AO), %xmm10 + mulsd 12 * SIZE(BO), %xmm10 + addsd %xmm10, %xmm2 + movsd 7 * SIZE(AO), %xmm10 + mulsd 14 * SIZE(BO), %xmm10 + addsd %xmm10, %xmm3 + movsd 12 * SIZE(AO), %xmm10 + + addq $ 8 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L112 + ALIGN_4 + +.L115: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulsd %xmm8, %xmm9 + movsd 1 * SIZE(AO), %xmm8 + addsd %xmm9, %xmm0 + movsd 2 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L116 + ALIGN_4 + +.L118: + addsd %xmm2, %xmm0 + addsd %xmm3, %xmm1 + addsd %xmm1, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(B), %xmm2 + subsd %xmm0, %xmm2 +#else + movsd 0 * SIZE(AO), %xmm2 + subsd %xmm0, %xmm2 +#endif + +#ifdef LN + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 +#endif + +#ifdef LT + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 +#endif + +#ifdef RN + movsd 0 * SIZE(B), %xmm0 + mulsd %xmm0, %xmm2 +#endif + +#ifdef RT + movsd 0 * SIZE(B), %xmm0 + mulsd %xmm0, %xmm2 +#endif + +#ifdef LN + subq 
$1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) +#else + movsd %xmm2, 0 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(B) + + movlpd %xmm2, 0 * SIZE(BO) + movlpd %xmm2, 1 * SIZE(BO) +#else + movsd %xmm2, 0 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $1 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L119: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 1), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 1), B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + + +.L40: + testq $2, N + je .L80 + ALIGN_4 + +.L41: +/* Copying to Sub Buffer */ + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + leaq (, %rax, SIZE), %rax + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L43 + ALIGN_4 + +.L42: + PREFETCH 56 * SIZE(B) + + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm2 + movsd 3 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm4 + movsd 5 * SIZE(B), %xmm5 + movsd 6 * SIZE(B), %xmm6 + movsd 7 * SIZE(B), %xmm7 + + addq $ 8 * SIZE, B + addq $16 * SIZE, BO + + movsd %xmm0, -16 * SIZE(BO) + movsd %xmm0, -15 * SIZE(BO) + movsd %xmm1, -14 * SIZE(BO) + movsd %xmm1, -13 * SIZE(BO) + movsd %xmm2, -12 * SIZE(BO) + movsd %xmm2, -11 * SIZE(BO) + movsd %xmm3, -10 * SIZE(BO) + movsd %xmm3, -9 * SIZE(BO) + movsd %xmm4, -8 * SIZE(BO) + movsd %xmm4, -7 * SIZE(BO) + movsd %xmm5, -6 * SIZE(BO) + movsd %xmm5, -5 * SIZE(BO) + movsd %xmm6, -4 * SIZE(BO) + movsd %xmm6, -3 * SIZE(BO) + movsd %xmm7, -2 * SIZE(BO) + movsd %xmm7, -1 * SIZE(BO) + + decq %rax + jne .L42 + ALIGN_4 + +.L43: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L50 + ALIGN_4 + +.L44: + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm0, 1 * SIZE(BO) + movsd %xmm1, 2 * SIZE(BO) + movsd %xmm1, 3 * SIZE(BO) + + addq $2 * SIZE, B + addq $4 * SIZE, BO + decq %rax + jne .L44 + ALIGN_4 + +.L50: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc +#ifndef RT + leaq (C, LDC, 2), C +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm1, 
%xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm4, %xmm4 + movapd 8 * SIZE(BO), %xmm11 + pxor %xmm5, %xmm5 + + movapd 16 * SIZE(AO), %xmm12 + movapd 16 * SIZE(BO), %xmm13 + movapd 24 * SIZE(AO), %xmm14 + movapd 24 * SIZE(BO), %xmm15 + + PREFETCHW 4 * SIZE(CO1) + PREFETCHW 4 * SIZE(CO2) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L55 + ALIGN_4 + +.L52: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm1 + movapd 2 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm4 + movapd 4 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm5 + movapd 4 * SIZE(AO), %xmm8 + + mulpd %xmm8, %xmm9 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm0 + movapd 4 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm1 + movapd 6 * SIZE(AO), %xmm8 + mulpd %xmm8, %xmm9 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm4 + movapd 32 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm5 + movapd 32 * SIZE(AO), %xmm8 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm10, %xmm11 + mulpd 10 * SIZE(BO), %xmm10 + addpd %xmm11, %xmm0 + movapd 8 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm1 + movapd 10 * SIZE(AO), %xmm10 + mulpd %xmm10, %xmm11 + mulpd 10 * SIZE(BO), %xmm10 + addpd %xmm11, %xmm4 + movapd 12 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm5 + movapd 12 * SIZE(AO), %xmm10 + + mulpd %xmm10, %xmm11 + mulpd 14 * SIZE(BO), %xmm10 + addpd %xmm11, %xmm0 + movapd 12 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm1 + movapd 14 * SIZE(AO), %xmm10 + mulpd %xmm10, %xmm11 + mulpd 14 * SIZE(BO), %xmm10 + addpd %xmm11, %xmm4 + movapd 40 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm5 + movapd 40 * SIZE(AO), %xmm10 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + mulpd %xmm12, %xmm13 + mulpd 18 * SIZE(BO), %xmm12 + addpd %xmm13, %xmm0 + movapd 16 * SIZE(BO), %xmm13 + addpd %xmm12, %xmm1 + movapd 18 * SIZE(AO), %xmm12 + mulpd %xmm12, %xmm13 + mulpd 18 * SIZE(BO), %xmm12 + addpd %xmm13, %xmm4 + movapd 20 * SIZE(BO), %xmm13 + addpd %xmm12, %xmm5 + movapd 20 * SIZE(AO), %xmm12 + + mulpd %xmm12, %xmm13 + mulpd 22 * SIZE(BO), %xmm12 + addpd %xmm13, %xmm0 + movapd 20 * SIZE(BO), %xmm13 + addpd %xmm12, %xmm1 + movapd 22 * SIZE(AO), %xmm12 + mulpd %xmm12, %xmm13 + mulpd 22 * SIZE(BO), %xmm12 + addpd %xmm13, %xmm4 + movapd 48 * SIZE(BO), %xmm13 + addpd %xmm12, %xmm5 + movapd 48 * SIZE(AO), %xmm12 + + PREFETCH (PREFETCHSIZE + 24) * SIZE(AO) + mulpd %xmm14, %xmm15 + mulpd 26 * SIZE(BO), %xmm14 + addpd %xmm15, %xmm0 + movapd 24 * SIZE(BO), %xmm15 + addpd %xmm14, %xmm1 + movapd 26 * SIZE(AO), %xmm14 + mulpd %xmm14, %xmm15 + mulpd 26 * SIZE(BO), %xmm14 + addpd %xmm15, %xmm4 + movapd 28 * SIZE(BO), %xmm15 + addpd %xmm14, %xmm5 + movapd 28 * SIZE(AO), %xmm14 + + mulpd %xmm14, %xmm15 + mulpd 30 * SIZE(BO), %xmm14 + addpd %xmm15, %xmm0 + movapd 28 * SIZE(BO), %xmm15 + addpd %xmm14, %xmm1 + movapd 30 * SIZE(AO), %xmm14 + mulpd %xmm14, %xmm15 + mulpd 30 * SIZE(BO), %xmm14 + addpd %xmm15, %xmm4 + movapd 56 * SIZE(BO), %xmm15 + addpd %xmm14, %xmm5 + movapd 56 * SIZE(AO), %xmm14 + + addq $32 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L59 + ALIGN_4 + +.L56: + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm1 + movapd 2 * SIZE(AO), %xmm8 + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, 
%xmm9 + addpd %xmm9, %xmm4 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm5 + movapd 4 * SIZE(AO), %xmm8 + + addq $4 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L56 + ALIGN_4 + +.L59: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd %xmm4, %xmm12 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm12 + + movapd 0 * SIZE(B), %xmm1 + movapd 2 * SIZE(B), %xmm5 + movapd 4 * SIZE(B), %xmm9 + movapd 6 * SIZE(B), %xmm13 + + subpd %xmm0, %xmm1 + subpd %xmm8, %xmm5 + subpd %xmm4, %xmm9 + subpd %xmm12, %xmm13 +#else + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm9 + movapd 4 * SIZE(AO), %xmm10 + movapd 6 * SIZE(AO), %xmm11 + + subpd %xmm0, %xmm8 + subpd %xmm4, %xmm9 + subpd %xmm1, %xmm10 + subpd %xmm5, %xmm11 +#endif + +#ifdef LN + movlpd 15 * SIZE(AO), %xmm0 + movhpd 15 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm13 + movlpd 14 * SIZE(AO), %xmm2 + movhpd 14 * SIZE(AO), %xmm2 + mulpd %xmm13, %xmm2 + subpd %xmm2, %xmm9 + movlpd 13 * SIZE(AO), %xmm4 + movhpd 13 * SIZE(AO), %xmm4 + mulpd %xmm13, %xmm4 + subpd %xmm4, %xmm5 + movlpd 12 * SIZE(AO), %xmm6 + movhpd 12 * SIZE(AO), %xmm6 + mulpd %xmm13, %xmm6 + subpd %xmm6, %xmm1 + + movlpd 10 * SIZE(AO), %xmm0 + movhpd 10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm9 + movlpd 9 * SIZE(AO), %xmm2 + movhpd 9 * SIZE(AO), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm5 + movlpd 8 * SIZE(AO), %xmm4 + movhpd 8 * SIZE(AO), %xmm4 + mulpd %xmm9, %xmm4 + subpd %xmm4, %xmm1 + + movlpd 5 * SIZE(AO), %xmm0 + movhpd 5 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + movlpd 4 * SIZE(AO), %xmm2 + movhpd 4 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm1 + + movlpd 0 * SIZE(AO), %xmm0 + movhpd 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 +#endif + +#ifdef LT + movlpd 0 * SIZE(AO), %xmm0 + movhpd 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + movlpd 1 * SIZE(AO), %xmm2 + movhpd 1 * SIZE(AO), %xmm2 + mulpd %xmm1, %xmm2 + subpd %xmm2, %xmm5 + movlpd 2 * SIZE(AO), %xmm4 + movhpd 2 * SIZE(AO), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm9 + movlpd 3 * SIZE(AO), %xmm6 + movhpd 3 * SIZE(AO), %xmm6 + mulpd %xmm1, %xmm6 + subpd %xmm6, %xmm13 + + + movlpd 5 * SIZE(AO), %xmm0 + movhpd 5 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + + movlpd 6 * SIZE(AO), %xmm2 + movhpd 6 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm9 + movlpd 7 * SIZE(AO), %xmm4 + movhpd 7 * SIZE(AO), %xmm4 + mulpd %xmm5, %xmm4 + subpd %xmm4, %xmm13 + + movlpd 10 * SIZE(AO), %xmm0 + movhpd 10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm9 + movlpd 11 * SIZE(AO), %xmm2 + movhpd 11 * SIZE(AO), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm13 + + movlpd 15 * SIZE(AO), %xmm0 + movhpd 15 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm13 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 + + movlpd 1 * SIZE(B), %xmm1 + movhpd 1 * SIZE(B), %xmm1 + mulpd %xmm8, %xmm1 + subpd %xmm1, %xmm10 + movlpd 1 * SIZE(B), %xmm1 + movhpd 1 * SIZE(B), %xmm1 + mulpd %xmm9, %xmm1 + subpd %xmm1, %xmm11 + + movlpd 3 * SIZE(B), %xmm0 + movhpd 3 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 +#endif + +#ifdef RT + movlpd 3 * SIZE(B), %xmm0 + movhpd 3 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 + + movlpd 2 
* SIZE(B), %xmm1 + movhpd 2 * SIZE(B), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm8 + movlpd 2 * SIZE(B), %xmm1 + movhpd 2 * SIZE(B), %xmm1 + mulpd %xmm11, %xmm1 + subpd %xmm1, %xmm9 + + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm1, 0 * SIZE(CO1) + movsd %xmm5, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movsd %xmm13, 3 * SIZE(CO1) + + movhpd %xmm1, 0 * SIZE(CO2) + movhpd %xmm5, 1 * SIZE(CO2) + movhpd %xmm9, 2 * SIZE(CO2) + movhpd %xmm13, 3 * SIZE(CO2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) + + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) + movsd %xmm11, 2 * SIZE(CO2) + movhpd %xmm11, 3 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(B) + movapd %xmm5, 2 * SIZE(B) + movapd %xmm9, 4 * SIZE(B) + movapd %xmm13, 6 * SIZE(B) + + movlpd %xmm1, 0 * SIZE(BO) + movlpd %xmm1, 1 * SIZE(BO) + movhpd %xmm1, 2 * SIZE(BO) + movhpd %xmm1, 3 * SIZE(BO) + movlpd %xmm5, 4 * SIZE(BO) + movlpd %xmm5, 5 * SIZE(BO) + movhpd %xmm5, 6 * SIZE(BO) + movhpd %xmm5, 7 * SIZE(BO) + movlpd %xmm9, 8 * SIZE(BO) + movlpd %xmm9, 9 * SIZE(BO) + movhpd %xmm9, 10 * SIZE(BO) + movhpd %xmm9, 11 * SIZE(BO) + movlpd %xmm13, 12 * SIZE(BO) + movlpd %xmm13, 13 * SIZE(BO) + movhpd %xmm13, 14 * SIZE(BO) + movhpd %xmm13, 15 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm9, 2 * SIZE(AO) + movapd %xmm10, 4 * SIZE(AO) + movapd %xmm11, 6 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L51 + ALIGN_4 + +.L60: + testq $2, M + je .L70 + ALIGN_4 + +.L61: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movapd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + + movapd 16 * SIZE(BO), %xmm13 + movapd 24 * SIZE(BO), %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L65 + ALIGN_4 + +.L62: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm0 + movapd 4 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm1 + movapd 2 * SIZE(AO), %xmm8 + + mulpd %xmm8, %xmm9 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm2 + movapd 32 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm3 + movapd 4 * SIZE(AO), %xmm8 + + mulpd %xmm8, %xmm11 + mulpd 10 * SIZE(BO), %xmm8 + addpd %xmm11, %xmm0 + movapd 12 * SIZE(BO), %xmm11 + addpd %xmm8, %xmm1 + movapd 6 * SIZE(AO), %xmm8 + + mulpd %xmm8, %xmm11 + mulpd 14 * SIZE(BO), %xmm8 + addpd %xmm11, %xmm2 + movapd 40 * SIZE(BO), %xmm11 + addpd %xmm8, %xmm3 + movapd 16 * SIZE(AO), %xmm8 
+ + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm10, %xmm13 + mulpd 18 * SIZE(BO), %xmm10 + addpd %xmm13, %xmm0 + movapd 20 * SIZE(BO), %xmm13 + addpd %xmm10, %xmm1 + movapd 10 * SIZE(AO), %xmm10 + + mulpd %xmm10, %xmm13 + mulpd 22 * SIZE(BO), %xmm10 + addpd %xmm13, %xmm2 + movapd 48 * SIZE(BO), %xmm13 + addpd %xmm10, %xmm3 + movapd 12 * SIZE(AO), %xmm10 + + mulpd %xmm10, %xmm15 + mulpd 26 * SIZE(BO), %xmm10 + addpd %xmm15, %xmm0 + movapd 28 * SIZE(BO), %xmm15 + addpd %xmm10, %xmm1 + movapd 14 * SIZE(AO), %xmm10 + + mulpd %xmm10, %xmm15 + mulpd 30 * SIZE(BO), %xmm10 + addpd %xmm15, %xmm2 + movapd 56 * SIZE(BO), %xmm15 + addpd %xmm10, %xmm3 + movapd 24 * SIZE(AO), %xmm10 + + addq $16 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L62 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L69 + ALIGN_4 + +.L66: + mulpd %xmm8, %xmm9 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm0 + movapd 4 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm1 + movapd 2 * SIZE(AO), %xmm8 + + addq $2 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L66 + ALIGN_4 + +.L69: + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd 0 * SIZE(B), %xmm1 + movapd 2 * SIZE(B), %xmm5 + + subpd %xmm0, %xmm1 + subpd %xmm8, %xmm5 +#else + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm10 + + subpd %xmm0, %xmm8 + subpd %xmm1, %xmm10 +#endif + + +#ifdef LN + movlpd 3 * SIZE(AO), %xmm0 + movhpd 3 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + + movlpd 2 * SIZE(AO), %xmm2 + movhpd 2 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm1 + + movlpd 0 * SIZE(AO), %xmm0 + movhpd 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 +#endif + +#ifdef LT + movlpd 0 * SIZE(AO), %xmm0 + movhpd 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + + movlpd 1 * SIZE(AO), %xmm2 + movhpd 1 * SIZE(AO), %xmm2 + mulpd %xmm1, %xmm2 + subpd %xmm2, %xmm5 + + movlpd 3 * SIZE(AO), %xmm0 + movhpd 3 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm8 + + movlpd 1 * SIZE(B), %xmm1 + movhpd 1 * SIZE(B), %xmm1 + mulpd %xmm8, %xmm1 + subpd %xmm1, %xmm10 + + movlpd 3 * SIZE(B), %xmm0 + movhpd 3 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm10 +#endif + +#ifdef RT + movlpd 3 * SIZE(B), %xmm0 + movhpd 3 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm10 + + movlpd 2 * SIZE(B), %xmm1 + movhpd 2 * SIZE(B), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm8 + + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm8 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm1, 0 * SIZE(CO1) + movsd %xmm5, 1 * SIZE(CO1) + + movhpd %xmm1, 0 * SIZE(CO2) + movhpd %xmm5, 1 * SIZE(CO2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(B) + movapd %xmm5, 2 * SIZE(B) + + movlpd %xmm1, 0 * SIZE(BO) + movlpd %xmm1, 1 * SIZE(BO) + movhpd %xmm1, 2 * SIZE(BO) + movhpd %xmm1, 3 * SIZE(BO) + movlpd %xmm5, 4 * 
SIZE(BO) + movlpd %xmm5, 5 * SIZE(BO) + movhpd %xmm5, 6 * SIZE(BO) + movhpd %xmm5, 7 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm10, 2 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L70: + testq $1, M + je .L79 + ALIGN_4 + +.L71: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movsd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movsd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movsd 4 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movsd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + + movsd 16 * SIZE(BO), %xmm13 + movsd 24 * SIZE(BO), %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L75 + ALIGN_4 + +.L72: + mulsd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulsd 2 * SIZE(BO), %xmm8 + addsd %xmm9, %xmm0 + movsd 4 * SIZE(BO), %xmm9 + addsd %xmm8, %xmm1 + movsd 1 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm9 + mulsd 6 * SIZE(BO), %xmm8 + addsd %xmm9, %xmm2 + movsd 32 * SIZE(BO), %xmm9 + addsd %xmm8, %xmm3 + movsd 2 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm11 + mulsd 10 * SIZE(BO), %xmm8 + addsd %xmm11, %xmm0 + movsd 12 * SIZE(BO), %xmm11 + addsd %xmm8, %xmm1 + movsd 3 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm11 + mulsd 14 * SIZE(BO), %xmm8 + addsd %xmm11, %xmm2 + movsd 40 * SIZE(BO), %xmm11 + addsd %xmm8, %xmm3 + movsd 8 * SIZE(AO), %xmm8 + + mulsd %xmm10, %xmm13 + mulsd 18 * SIZE(BO), %xmm10 + addsd %xmm13, %xmm0 + movsd 20 * SIZE(BO), %xmm13 + addsd %xmm10, %xmm1 + movsd 5 * SIZE(AO), %xmm10 + + mulsd %xmm10, %xmm13 + mulsd 22 * SIZE(BO), %xmm10 + addsd %xmm13, %xmm2 + movsd 48 * SIZE(BO), %xmm13 + addsd %xmm10, %xmm3 + movsd 6 * SIZE(AO), %xmm10 + + mulsd %xmm10, %xmm15 + mulsd 26 * SIZE(BO), %xmm10 + addsd %xmm15, %xmm0 + movsd 28 * SIZE(BO), %xmm15 + addsd %xmm10, %xmm1 + movsd 7 * SIZE(AO), %xmm10 + + mulsd %xmm10, %xmm15 + mulsd 30 * SIZE(BO), %xmm10 + addsd %xmm15, %xmm2 + movsd 56 * SIZE(BO), %xmm15 + addsd %xmm10, %xmm3 + movsd 12 * SIZE(AO), %xmm10 + + addq $ 8 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulsd %xmm8, %xmm9 + mulsd 2 * SIZE(BO), %xmm8 + addsd %xmm9, %xmm0 + addsd %xmm8, %xmm1 + movsd 1 * SIZE(AO), %xmm8 + movsd 4 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L76 + ALIGN_4 + +.L78: + addsd %xmm2, %xmm0 + addsd %xmm3, %xmm1 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(B), %xmm4 
+ movsd 1 * SIZE(B), %xmm5 +#else + movsd 0 * SIZE(AO), %xmm4 + movsd 1 * SIZE(AO), %xmm5 +#endif + + subsd %xmm0, %xmm4 + subsd %xmm1, %xmm5 + +#ifdef LN + movsd 0 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 +#endif + +#ifdef LT + movsd 0 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 +#endif + +#ifdef RN + mulsd 0 * SIZE(B), %xmm4 + movsd 1 * SIZE(B), %xmm1 + mulsd %xmm4, %xmm1 + subsd %xmm1, %xmm5 + + mulsd 3 * SIZE(B), %xmm5 +#endif + +#ifdef RT + mulsd 3 * SIZE(B), %xmm5 + + movlpd 2 * SIZE(B), %xmm1 + mulsd %xmm5, %xmm1 + subsd %xmm1, %xmm4 + + mulsd 0 * SIZE(B), %xmm4 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + movsd %xmm4, 0 * SIZE(CO1) + movsd %xmm5, 0 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movsd %xmm4, 0 * SIZE(B) + movsd %xmm5, 1 * SIZE(B) + + movsd %xmm4, 0 * SIZE(BO) + movsd %xmm4, 1 * SIZE(BO) + movsd %xmm5, 2 * SIZE(BO) + movsd %xmm5, 3 * SIZE(BO) +#else + movsd %xmm4, 0 * SIZE(AO) + movsd %xmm5, 1 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $2 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L79: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L80: + movq N, J + sarq $2, J # j = (n >> 2) + jle .L999 + +.L01: +/* Copying to Sub Buffer */ + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + leaq (, %rax, SIZE), %rax + leaq (B, %rax, 4), B + leaq (BO, %rax, 8), BO +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L03 + + addq %rax, %rax + ALIGN_4 + +.L02: + PREFETCHNTA 40 * SIZE(B) + + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm2 + movsd 3 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm4 + movsd 5 * SIZE(B), %xmm5 + movsd 6 * SIZE(B), %xmm6 + movsd 7 * SIZE(B), %xmm7 + + addq $16 * SIZE, BO + addq $ 8 * SIZE, B + + movsd %xmm0, -16 * SIZE(BO) + movsd %xmm0, -15 * SIZE(BO) + movsd %xmm1, -14 * SIZE(BO) + movsd %xmm1, -13 * SIZE(BO) + movsd %xmm2, -12 * SIZE(BO) + movsd %xmm2, -11 * SIZE(BO) + movsd %xmm3, -10 * SIZE(BO) + movsd %xmm3, -9 * SIZE(BO) + movsd %xmm4, -8 * SIZE(BO) + movsd %xmm4, -7 * SIZE(BO) + movsd %xmm5, -6 * SIZE(BO) + movsd %xmm5, -5 * SIZE(BO) + movsd %xmm6, -4 * SIZE(BO) + movsd %xmm6, -3 * SIZE(BO) + movsd %xmm7, -2 * SIZE(BO) + movsd %xmm7, -1 * SIZE(BO) + + decq %rax + jne .L02 + ALIGN_4 + +.L03: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L10 + ALIGN_4 + +.L04: + movsd 0 * SIZE(B), %xmm0 + movsd 1 * SIZE(B), %xmm1 + movsd 2 * SIZE(B), %xmm2 + movsd 3 * SIZE(B), %xmm3 + + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm0, 1 * SIZE(BO) + movsd %xmm1, 2 * SIZE(BO) + movsd %xmm1, 3 * SIZE(BO) + movsd 
%xmm2, 4 * SIZE(BO) + movsd %xmm2, 5 * SIZE(BO) + movsd %xmm3, 6 * SIZE(BO) + movsd %xmm3, 7 * SIZE(BO) + + addq $4 * SIZE, B + addq $8 * SIZE, BO + decq %rax + jne .L04 + ALIGN_4 + +.L10: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc +#ifndef RT + leaq (C, LDC, 4), C +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movapd 0 * SIZE(BO), %xmm9 + movapd 2 * SIZE(BO), %xmm11 + movapd 4 * SIZE(BO), %xmm13 + movapd 8 * SIZE(BO), %xmm15 + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 2 * SIZE(AO), %xmm10 + pxor %xmm1, %xmm1 + movapd 4 * SIZE(AO), %xmm12 + pxor %xmm2, %xmm2 + movapd 6 * SIZE(AO), %xmm14 + pxor %xmm3, %xmm3 + + PREFETCHW 4 * SIZE(CO1) + pxor %xmm4, %xmm4 + PREFETCHW 4 * SIZE(CO2) + pxor %xmm5, %xmm5 + PREFETCHW 4 * SIZE(CO1, LDC, 2) + pxor %xmm6, %xmm6 + PREFETCHW 4 * SIZE(CO2, LDC, 2) + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-8, %rax + salq $4, %rax + je .L15 +.L1X: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + cmpq $64 * 2, %rax + jle .L12 + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + KERNEL1(16 * 3) + KERNEL2(16 * 3) + KERNEL3(16 * 3) + KERNEL4(16 * 3) + KERNEL5(16 * 3) + KERNEL6(16 * 3) + KERNEL7(16 * 3) + KERNEL8(16 * 3) + cmpq $64 * 4, %rax + jle .L12 + KERNEL1(16 * 4) + KERNEL2(16 * 4) + KERNEL3(16 * 4) + KERNEL4(16 * 4) + KERNEL5(16 * 4) + KERNEL6(16 * 4) + KERNEL7(16 * 4) + KERNEL8(16 * 4) + KERNEL1(16 * 5) + KERNEL2(16 * 5) + KERNEL3(16 * 5) + KERNEL4(16 * 5) + KERNEL5(16 * 5) + KERNEL6(16 * 5) + KERNEL7(16 * 5) + KERNEL8(16 * 5) + cmpq $64 * 6, %rax + jle .L12 + KERNEL1(16 * 6) + KERNEL2(16 * 6) + KERNEL3(16 * 6) + KERNEL4(16 * 6) + KERNEL5(16 * 6) + KERNEL6(16 * 6) + KERNEL7(16 * 6) + KERNEL8(16 * 6) + KERNEL1(16 * 7) + KERNEL2(16 * 7) + KERNEL3(16 * 7) + KERNEL4(16 * 7) + KERNEL5(16 * 7) + KERNEL6(16 * 7) + KERNEL7(16 * 7) + KERNEL8(16 * 7) + + addq $16 * 8 * SIZE, AO + addq $32 * 8 * SIZE, BO + subq $64 * 8, %rax + jg .L1X + +.L12: + leaq (AO, %rax, 2), AO # * 16 + leaq (BO, %rax, 4), BO # * 64 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L19 + ALIGN_4 + +.L16: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm2 + movapd 0 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm3 + movapd 4 * SIZE(AO), %xmm8 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm4 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm5 + movapd 4 * SIZE(BO), %xmm9 + 
mulpd %xmm10, %xmm9 + mulpd 6 * SIZE(BO), %xmm10 + addpd %xmm9, %xmm6 + movapd 8 * SIZE(BO), %xmm9 + addpd %xmm10, %xmm7 + movapd 6 * SIZE(AO), %xmm10 + + addq $4 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L16 + ALIGN_4 + +.L19: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd %xmm2, %xmm10 + unpcklpd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm10 + + movapd %xmm4, %xmm12 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm12 + + movapd %xmm6, %xmm14 + unpcklpd %xmm7, %xmm6 + unpckhpd %xmm7, %xmm14 + + movapd 0 * SIZE(B), %xmm1 + movapd 2 * SIZE(B), %xmm3 + movapd 4 * SIZE(B), %xmm5 + movapd 6 * SIZE(B), %xmm7 + movapd 8 * SIZE(B), %xmm9 + movapd 10 * SIZE(B), %xmm11 + movapd 12 * SIZE(B), %xmm13 + movapd 14 * SIZE(B), %xmm15 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm3 + subpd %xmm8, %xmm5 + subpd %xmm10, %xmm7 + subpd %xmm4, %xmm9 + subpd %xmm6, %xmm11 + subpd %xmm12, %xmm13 + subpd %xmm14, %xmm15 +#else + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm9 + movapd 4 * SIZE(AO), %xmm10 + movapd 6 * SIZE(AO), %xmm11 + + movapd 8 * SIZE(AO), %xmm12 + movapd 10 * SIZE(AO), %xmm13 + movapd 12 * SIZE(AO), %xmm14 + movapd 14 * SIZE(AO), %xmm15 + + subpd %xmm0, %xmm8 + subpd %xmm4, %xmm9 + subpd %xmm1, %xmm10 + subpd %xmm5, %xmm11 + subpd %xmm2, %xmm12 + subpd %xmm6, %xmm13 + subpd %xmm3, %xmm14 + subpd %xmm7, %xmm15 +#endif + +#ifdef LN + movlpd 15 * SIZE(AO), %xmm0 + movhpd 15 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm13 + mulpd %xmm0, %xmm15 + + movlpd 14 * SIZE(AO), %xmm2 + movhpd 14 * SIZE(AO), %xmm2 + mulpd %xmm13, %xmm2 + subpd %xmm2, %xmm9 + movlpd 14 * SIZE(AO), %xmm2 + movhpd 14 * SIZE(AO), %xmm2 + mulpd %xmm15, %xmm2 + subpd %xmm2, %xmm11 + + movlpd 13 * SIZE(AO), %xmm4 + movhpd 13 * SIZE(AO), %xmm4 + mulpd %xmm13, %xmm4 + subpd %xmm4, %xmm5 + movlpd 13 * SIZE(AO), %xmm4 + movhpd 13 * SIZE(AO), %xmm4 + mulpd %xmm15, %xmm4 + subpd %xmm4, %xmm7 + + movlpd 12 * SIZE(AO), %xmm6 + movhpd 12 * SIZE(AO), %xmm6 + mulpd %xmm13, %xmm6 + subpd %xmm6, %xmm1 + movlpd 12 * SIZE(AO), %xmm6 + movhpd 12 * SIZE(AO), %xmm6 + mulpd %xmm15, %xmm6 + subpd %xmm6, %xmm3 + + movlpd 10 * SIZE(AO), %xmm0 + movhpd 10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm9 + mulpd %xmm0, %xmm11 + + movlpd 9 * SIZE(AO), %xmm2 + movhpd 9 * SIZE(AO), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm5 + movlpd 9 * SIZE(AO), %xmm2 + movhpd 9 * SIZE(AO), %xmm2 + mulpd %xmm11, %xmm2 + subpd %xmm2, %xmm7 + + movlpd 8 * SIZE(AO), %xmm4 + movhpd 8 * SIZE(AO), %xmm4 + mulpd %xmm9, %xmm4 + subpd %xmm4, %xmm1 + movlpd 8 * SIZE(AO), %xmm4 + movhpd 8 * SIZE(AO), %xmm4 + mulpd %xmm11, %xmm4 + subpd %xmm4, %xmm3 + + movlpd 5 * SIZE(AO), %xmm0 + movhpd 5 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + mulpd %xmm0, %xmm7 + + movlpd 4 * SIZE(AO), %xmm2 + movhpd 4 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm1 + movlpd 4 * SIZE(AO), %xmm2 + movhpd 4 * SIZE(AO), %xmm2 + mulpd %xmm7, %xmm2 + subpd %xmm2, %xmm3 + + movlpd 0 * SIZE(AO), %xmm0 + movhpd 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm3 +#endif + +#ifdef LT + movlpd 0 * SIZE(AO), %xmm0 + movhpd 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm3 + + movlpd 1 * SIZE(AO), %xmm2 + movhpd 1 * SIZE(AO), %xmm2 + mulpd 
%xmm1, %xmm2 + subpd %xmm2, %xmm5 + + movlpd 1 * SIZE(AO), %xmm2 + movhpd 1 * SIZE(AO), %xmm2 + mulpd %xmm3, %xmm2 + subpd %xmm2, %xmm7 + + movlpd 2 * SIZE(AO), %xmm4 + movhpd 2 * SIZE(AO), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm9 + movlpd 2 * SIZE(AO), %xmm4 + movhpd 2 * SIZE(AO), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm11 + + movlpd 3 * SIZE(AO), %xmm6 + movhpd 3 * SIZE(AO), %xmm6 + mulpd %xmm1, %xmm6 + subpd %xmm6, %xmm13 + movlpd 3 * SIZE(AO), %xmm6 + movhpd 3 * SIZE(AO), %xmm6 + mulpd %xmm3, %xmm6 + subpd %xmm6, %xmm15 + + movlpd 5 * SIZE(AO), %xmm0 + movhpd 5 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + mulpd %xmm0, %xmm7 + + movlpd 6 * SIZE(AO), %xmm2 + movhpd 6 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm9 + movlpd 6 * SIZE(AO), %xmm2 + movhpd 6 * SIZE(AO), %xmm2 + mulpd %xmm7, %xmm2 + subpd %xmm2, %xmm11 + + movlpd 7 * SIZE(AO), %xmm4 + movhpd 7 * SIZE(AO), %xmm4 + mulpd %xmm5, %xmm4 + subpd %xmm4, %xmm13 + movlpd 7 * SIZE(AO), %xmm4 + movhpd 7 * SIZE(AO), %xmm4 + mulpd %xmm7, %xmm4 + subpd %xmm4, %xmm15 + + movlpd 10 * SIZE(AO), %xmm0 + movhpd 10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm9 + mulpd %xmm0, %xmm11 + + movlpd 11 * SIZE(AO), %xmm2 + movhpd 11 * SIZE(AO), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm13 + movlpd 11 * SIZE(AO), %xmm2 + movhpd 11 * SIZE(AO), %xmm2 + mulpd %xmm11, %xmm2 + subpd %xmm2, %xmm15 + + movlpd 15 * SIZE(AO), %xmm0 + movhpd 15 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm13 + mulpd %xmm0, %xmm15 +#endif + + +#ifdef RN + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 + + movlpd 1 * SIZE(B), %xmm1 + movhpd 1 * SIZE(B), %xmm1 + mulpd %xmm8, %xmm1 + subpd %xmm1, %xmm10 + movlpd 1 * SIZE(B), %xmm1 + movhpd 1 * SIZE(B), %xmm1 + mulpd %xmm9, %xmm1 + subpd %xmm1, %xmm11 + + movlpd 2 * SIZE(B), %xmm2 + movhpd 2 * SIZE(B), %xmm2 + mulpd %xmm8, %xmm2 + subpd %xmm2, %xmm12 + movlpd 2 * SIZE(B), %xmm2 + movhpd 2 * SIZE(B), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm13 + + movlpd 3 * SIZE(B), %xmm3 + movhpd 3 * SIZE(B), %xmm3 + mulpd %xmm8, %xmm3 + subpd %xmm3, %xmm14 + movlpd 3 * SIZE(B), %xmm3 + movhpd 3 * SIZE(B), %xmm3 + mulpd %xmm9, %xmm3 + subpd %xmm3, %xmm15 + + movlpd 5 * SIZE(B), %xmm0 + movhpd 5 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 + + movlpd 6 * SIZE(B), %xmm1 + movhpd 6 * SIZE(B), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm12 + movlpd 6 * SIZE(B), %xmm1 + movhpd 6 * SIZE(B), %xmm1 + mulpd %xmm11, %xmm1 + subpd %xmm1, %xmm13 + + movlpd 7 * SIZE(B), %xmm2 + movhpd 7 * SIZE(B), %xmm2 + mulpd %xmm10, %xmm2 + subpd %xmm2, %xmm14 + movlpd 7 * SIZE(B), %xmm2 + movhpd 7 * SIZE(B), %xmm2 + mulpd %xmm11, %xmm2 + subpd %xmm2, %xmm15 + + movlpd 10 * SIZE(B), %xmm0 + movhpd 10 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm12 + mulpd %xmm0, %xmm13 + + movlpd 11 * SIZE(B), %xmm1 + movhpd 11 * SIZE(B), %xmm1 + mulpd %xmm12, %xmm1 + subpd %xmm1, %xmm14 + movlpd 11 * SIZE(B), %xmm1 + movhpd 11 * SIZE(B), %xmm1 + mulpd %xmm13, %xmm1 + subpd %xmm1, %xmm15 + + movlpd 15 * SIZE(B), %xmm0 + movhpd 15 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm14 + mulpd %xmm0, %xmm15 +#endif + +#ifdef RT + movlpd 15 * SIZE(B), %xmm0 + movhpd 15 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm14 + mulpd %xmm0, %xmm15 + + movlpd 14 * SIZE(B), %xmm1 + movhpd 14 * SIZE(B), %xmm1 + mulpd %xmm14, %xmm1 + subpd %xmm1, %xmm12 + movlpd 14 * SIZE(B), %xmm1 + movhpd 14 * SIZE(B), %xmm1 + mulpd %xmm15, %xmm1 + subpd %xmm1, %xmm13 + + movlpd 13 * SIZE(B), %xmm2 + movhpd 13 * SIZE(B), %xmm2 + mulpd %xmm14, %xmm2 + subpd %xmm2, %xmm10 + movlpd 
13 * SIZE(B), %xmm2 + movhpd 13 * SIZE(B), %xmm2 + mulpd %xmm15, %xmm2 + subpd %xmm2, %xmm11 + + movlpd 12 * SIZE(B), %xmm3 + movhpd 12 * SIZE(B), %xmm3 + mulpd %xmm14, %xmm3 + subpd %xmm3, %xmm8 + movlpd 12 * SIZE(B), %xmm3 + movhpd 12 * SIZE(B), %xmm3 + mulpd %xmm15, %xmm3 + subpd %xmm3, %xmm9 + + movlpd 10 * SIZE(B), %xmm0 + movhpd 10 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm12 + mulpd %xmm0, %xmm13 + + movlpd 9 * SIZE(B), %xmm1 + movhpd 9 * SIZE(B), %xmm1 + mulpd %xmm12, %xmm1 + subpd %xmm1, %xmm10 + movlpd 9 * SIZE(B), %xmm1 + movhpd 9 * SIZE(B), %xmm1 + mulpd %xmm13, %xmm1 + subpd %xmm1, %xmm11 + + movlpd 8 * SIZE(B), %xmm2 + movhpd 8 * SIZE(B), %xmm2 + mulpd %xmm12, %xmm2 + subpd %xmm2, %xmm8 + movlpd 8 * SIZE(B), %xmm2 + movhpd 8 * SIZE(B), %xmm2 + mulpd %xmm13, %xmm2 + subpd %xmm2, %xmm9 + + movlpd 5 * SIZE(B), %xmm0 + movhpd 5 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 + + movlpd 4 * SIZE(B), %xmm1 + movhpd 4 * SIZE(B), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm8 + movlpd 4 * SIZE(B), %xmm1 + movhpd 4 * SIZE(B), %xmm1 + mulpd %xmm11, %xmm1 + subpd %xmm1, %xmm9 + + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm1, 0 * SIZE(CO1) + movsd %xmm5, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movsd %xmm13, 3 * SIZE(CO1) + + movhpd %xmm1, 0 * SIZE(CO2) + movhpd %xmm5, 1 * SIZE(CO2) + movhpd %xmm9, 2 * SIZE(CO2) + movhpd %xmm13, 3 * SIZE(CO2) + + movsd %xmm3, 0 * SIZE(CO1, LDC, 2) + movsd %xmm7, 1 * SIZE(CO1, LDC, 2) + movsd %xmm11, 2 * SIZE(CO1, LDC, 2) + movsd %xmm15, 3 * SIZE(CO1, LDC, 2) + + movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 1 * SIZE(CO2, LDC, 2) + movhpd %xmm11, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 3 * SIZE(CO2, LDC, 2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) + + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) + movsd %xmm11, 2 * SIZE(CO2) + movhpd %xmm11, 3 * SIZE(CO2) + + movsd %xmm12, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm12, 1 * SIZE(CO1, LDC, 2) + movsd %xmm13, 2 * SIZE(CO1, LDC, 2) + movhpd %xmm13, 3 * SIZE(CO1, LDC, 2) + + movsd %xmm14, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm14, 1 * SIZE(CO2, LDC, 2) + movsd %xmm15, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 3 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(B) + movapd %xmm3, 2 * SIZE(B) + movapd %xmm5, 4 * SIZE(B) + movapd %xmm7, 6 * SIZE(B) + movapd %xmm9, 8 * SIZE(B) + movapd %xmm11, 10 * SIZE(B) + movapd %xmm13, 12 * SIZE(B) + movapd %xmm15, 14 * SIZE(B) + + movlpd %xmm1, 0 * SIZE(BO) + movlpd %xmm1, 1 * SIZE(BO) + movhpd %xmm1, 2 * SIZE(BO) + movhpd %xmm1, 3 * SIZE(BO) + movlpd %xmm3, 4 * SIZE(BO) + movlpd %xmm3, 5 * SIZE(BO) + movhpd %xmm3, 6 * SIZE(BO) + movhpd %xmm3, 7 * SIZE(BO) + movlpd %xmm5, 8 * SIZE(BO) + movlpd %xmm5, 9 * SIZE(BO) + movhpd %xmm5, 10 * SIZE(BO) + movhpd %xmm5, 11 * SIZE(BO) + movlpd %xmm7, 12 * SIZE(BO) + movlpd %xmm7, 13 * SIZE(BO) + movhpd %xmm7, 14 * SIZE(BO) + movhpd %xmm7, 15 * SIZE(BO) + movlpd %xmm9, 16 * SIZE(BO) + movlpd %xmm9, 17 * SIZE(BO) + movhpd %xmm9, 18 * SIZE(BO) + movhpd %xmm9, 19 * SIZE(BO) + movlpd %xmm11, 20 * SIZE(BO) + movlpd %xmm11, 21 * SIZE(BO) + movhpd %xmm11, 22 * SIZE(BO) + movhpd %xmm11, 23 * SIZE(BO) + movlpd %xmm13, 24 * SIZE(BO) + movlpd %xmm13, 25 * SIZE(BO) + movhpd %xmm13, 26 * SIZE(BO) + movhpd %xmm13, 27 * SIZE(BO) + movlpd %xmm15, 28 * 
SIZE(BO) + movlpd %xmm15, 29 * SIZE(BO) + movhpd %xmm15, 30 * SIZE(BO) + movhpd %xmm15, 31 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm9, 2 * SIZE(AO) + movapd %xmm10, 4 * SIZE(AO) + movapd %xmm11, 6 * SIZE(AO) + movapd %xmm12, 8 * SIZE(AO) + movapd %xmm13, 10 * SIZE(AO) + movapd %xmm14, 12 * SIZE(AO) + movapd %xmm15, 14 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $16 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L11 + ALIGN_4 + +.L20: + testq $3, M + je .L39 + + testq $2, M + je .L30 + ALIGN_4 + +.L21: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movapd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + + movapd 16 * SIZE(BO), %xmm13 + movapd 24 * SIZE(BO), %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L25 + ALIGN_4 + +.L22: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm2 + movapd 32 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm3 + movapd 2 * SIZE(AO), %xmm8 + + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movapd 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm1 + movapd 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + mulpd 14 * SIZE(BO), %xmm8 + addpd %xmm11, %xmm2 + movapd 40 * SIZE(BO), %xmm11 + addpd %xmm8, %xmm3 + movapd 4 * SIZE(AO), %xmm8 + + mulpd %xmm8, %xmm13 + addpd %xmm13, %xmm0 + movapd 18 * SIZE(BO), %xmm13 + mulpd %xmm8, %xmm13 + addpd %xmm13, %xmm1 + movapd 20 * SIZE(BO), %xmm13 + mulpd %xmm8, %xmm13 + mulpd 22 * SIZE(BO), %xmm8 + addpd %xmm13, %xmm2 + movapd 48 * SIZE(BO), %xmm13 + addpd %xmm8, %xmm3 + movapd 6 * SIZE(AO), %xmm8 + + mulpd %xmm8, %xmm15 + addpd %xmm15, %xmm0 + movapd 26 * SIZE(BO), %xmm15 + mulpd %xmm8, %xmm15 + addpd %xmm15, %xmm1 + movapd 28 * SIZE(BO), %xmm15 + mulpd %xmm8, %xmm15 + mulpd 30 * SIZE(BO), %xmm8 + addpd %xmm15, %xmm2 + movapd 56 * SIZE(BO), %xmm15 + addpd %xmm8, %xmm3 + movapd 16 * SIZE(AO), %xmm8 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movapd 34 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm1 + movapd 36 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + mulpd 38 * SIZE(BO), %xmm10 + addpd %xmm9, %xmm2 + movapd 64 * SIZE(BO), %xmm9 + addpd %xmm10, %xmm3 + movapd 10 * SIZE(AO), %xmm10 + + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movapd 42 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm1 + movapd 44 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + mulpd 46 * SIZE(BO), %xmm10 + addpd %xmm11, %xmm2 + movapd 72 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm3 + movapd 12 * SIZE(AO), %xmm10 + + 
mulpd %xmm10, %xmm13 + addpd %xmm13, %xmm0 + movapd 50 * SIZE(BO), %xmm13 + mulpd %xmm10, %xmm13 + addpd %xmm13, %xmm1 + movapd 52 * SIZE(BO), %xmm13 + mulpd %xmm10, %xmm13 + mulpd 54 * SIZE(BO), %xmm10 + addpd %xmm13, %xmm2 + movapd 80 * SIZE(BO), %xmm13 + addpd %xmm10, %xmm3 + movapd 14 * SIZE(AO), %xmm10 + + mulpd %xmm10, %xmm15 + addpd %xmm15, %xmm0 + movapd 58 * SIZE(BO), %xmm15 + mulpd %xmm10, %xmm15 + addpd %xmm15, %xmm1 + movapd 60 * SIZE(BO), %xmm15 + mulpd %xmm10, %xmm15 + mulpd 62 * SIZE(BO), %xmm10 + addpd %xmm15, %xmm2 + movapd 88 * SIZE(BO), %xmm15 + addpd %xmm10, %xmm3 + movapd 24 * SIZE(AO), %xmm10 + + addq $16 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L29 + ALIGN_4 + +.L26: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm2 + movapd 8 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm3 + movapd 2 * SIZE(AO), %xmm8 + + addq $2 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L26 + ALIGN_4 + +.L29: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd %xmm2, %xmm10 + unpcklpd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm10 + + movapd 0 * SIZE(B), %xmm1 + movapd 2 * SIZE(B), %xmm3 + movapd 4 * SIZE(B), %xmm5 + movapd 6 * SIZE(B), %xmm7 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm3 + subpd %xmm8, %xmm5 + subpd %xmm10, %xmm7 +#else + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm10 + movapd 4 * SIZE(AO), %xmm12 + movapd 6 * SIZE(AO), %xmm14 + + subpd %xmm0, %xmm8 + subpd %xmm1, %xmm10 + subpd %xmm2, %xmm12 + subpd %xmm3, %xmm14 +#endif + +#ifdef LN + movlpd 3 * SIZE(AO), %xmm0 + movhpd 3 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + mulpd %xmm0, %xmm7 + + movlpd 2 * SIZE(AO), %xmm2 + movhpd 2 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm1 + movlpd 2 * SIZE(AO), %xmm2 + movhpd 2 * SIZE(AO), %xmm2 + mulpd %xmm7, %xmm2 + subpd %xmm2, %xmm3 + + movlpd 0 * SIZE(AO), %xmm0 + movhpd 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm3 +#endif + +#ifdef LT + movlpd 0 * SIZE(AO), %xmm0 + movhpd 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm3 + + movlpd 1 * SIZE(AO), %xmm2 + movhpd 1 * SIZE(AO), %xmm2 + mulpd %xmm1, %xmm2 + subpd %xmm2, %xmm5 + movlpd 1 * SIZE(AO), %xmm2 + movhpd 1 * SIZE(AO), %xmm2 + mulpd %xmm3, %xmm2 + subpd %xmm2, %xmm7 + + movlpd 3 * SIZE(AO), %xmm0 + movhpd 3 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + mulpd %xmm0, %xmm7 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm8 + + movlpd 1 * SIZE(B), %xmm1 + movhpd 1 * SIZE(B), %xmm1 + mulpd %xmm8, %xmm1 + subpd %xmm1, %xmm10 + movlpd 2 * SIZE(B), %xmm2 + movhpd 2 * SIZE(B), %xmm2 + mulpd %xmm8, %xmm2 + subpd %xmm2, %xmm12 + movlpd 3 * SIZE(B), %xmm3 + movhpd 3 * SIZE(B), %xmm3 + mulpd %xmm8, %xmm3 + subpd %xmm3, %xmm14 + + movlpd 5 * SIZE(B), %xmm0 + movhpd 5 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm10 + movlpd 6 * SIZE(B), %xmm1 + movhpd 6 * SIZE(B), %xmm1 + mulpd %xmm10, %xmm1 + subpd 
%xmm1, %xmm12 + movlpd 7 * SIZE(B), %xmm2 + movhpd 7 * SIZE(B), %xmm2 + mulpd %xmm10, %xmm2 + subpd %xmm2, %xmm14 + + movlpd 10 * SIZE(B), %xmm0 + movhpd 10 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm12 + + movlpd 11 * SIZE(B), %xmm1 + movhpd 11 * SIZE(B), %xmm1 + mulpd %xmm12, %xmm1 + subpd %xmm1, %xmm14 + + movlpd 15 * SIZE(B), %xmm0 + movhpd 15 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm14 +#endif + +#ifdef RT + movlpd 15 * SIZE(B), %xmm0 + movhpd 15 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm14 + + movlpd 14 * SIZE(B), %xmm1 + movhpd 14 * SIZE(B), %xmm1 + mulpd %xmm14, %xmm1 + subpd %xmm1, %xmm12 + movlpd 13 * SIZE(B), %xmm2 + movhpd 13 * SIZE(B), %xmm2 + mulpd %xmm14, %xmm2 + subpd %xmm2, %xmm10 + movlpd 12 * SIZE(B), %xmm3 + movhpd 12 * SIZE(B), %xmm3 + mulpd %xmm14, %xmm3 + subpd %xmm3, %xmm8 + + movlpd 10 * SIZE(B), %xmm0 + movhpd 10 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm12 + movlpd 9 * SIZE(B), %xmm1 + movhpd 9 * SIZE(B), %xmm1 + mulpd %xmm12, %xmm1 + subpd %xmm1, %xmm10 + movlpd 8 * SIZE(B), %xmm2 + movhpd 8 * SIZE(B), %xmm2 + mulpd %xmm12, %xmm2 + subpd %xmm2, %xmm8 + + movlpd 5 * SIZE(B), %xmm0 + movhpd 5 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm10 + movlpd 4 * SIZE(B), %xmm1 + movhpd 4 * SIZE(B), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm8 + + movlpd 0 * SIZE(B), %xmm0 + movhpd 0 * SIZE(B), %xmm0 + mulpd %xmm0, %xmm8 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm1, 0 * SIZE(CO1) + movsd %xmm5, 1 * SIZE(CO1) + + movhpd %xmm1, 0 * SIZE(CO2) + movhpd %xmm5, 1 * SIZE(CO2) + + movsd %xmm3, 0 * SIZE(CO1, LDC, 2) + movsd %xmm7, 1 * SIZE(CO1, LDC, 2) + + movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 1 * SIZE(CO2, LDC, 2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) + + movsd %xmm12, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm12, 1 * SIZE(CO1, LDC, 2) + + movsd %xmm14, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm14, 1 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(B) + movapd %xmm3, 2 * SIZE(B) + movapd %xmm5, 4 * SIZE(B) + movapd %xmm7, 6 * SIZE(B) + + movlpd %xmm1, 0 * SIZE(BO) + movlpd %xmm1, 1 * SIZE(BO) + movhpd %xmm1, 2 * SIZE(BO) + movhpd %xmm1, 3 * SIZE(BO) + movlpd %xmm3, 4 * SIZE(BO) + movlpd %xmm3, 5 * SIZE(BO) + movhpd %xmm3, 6 * SIZE(BO) + movhpd %xmm3, 7 * SIZE(BO) + movlpd %xmm5, 8 * SIZE(BO) + movlpd %xmm5, 9 * SIZE(BO) + movhpd %xmm5, 10 * SIZE(BO) + movhpd %xmm5, 11 * SIZE(BO) + movlpd %xmm7, 12 * SIZE(BO) + movlpd %xmm7, 13 * SIZE(BO) + movhpd %xmm7, 14 * SIZE(BO) + movhpd %xmm7, 15 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm10, 2 * SIZE(AO) + movapd %xmm12, 4 * SIZE(AO) + movapd %xmm14, 6 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + testq $1, M + je .L39 + ALIGN_4 + +.L31: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq 
(BO, %rax, 2), BO +#endif + + movsd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movsd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movsd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movsd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + + movsd 16 * SIZE(BO), %xmm13 + movsd 24 * SIZE(BO), %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L35 + ALIGN_4 + +.L32: + mulsd %xmm8, %xmm9 + addsd %xmm9, %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movsd 2 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm9 + addsd %xmm9, %xmm1 + movsd 4 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm9 + mulsd 6 * SIZE(BO), %xmm8 + addsd %xmm9, %xmm2 + movsd 32 * SIZE(BO), %xmm9 + addsd %xmm8, %xmm3 + movsd 1 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm11 + addsd %xmm11, %xmm0 + movsd 10 * SIZE(BO), %xmm11 + mulsd %xmm8, %xmm11 + addsd %xmm11, %xmm1 + movsd 12 * SIZE(BO), %xmm11 + mulsd %xmm8, %xmm11 + mulsd 14 * SIZE(BO), %xmm8 + addsd %xmm11, %xmm2 + movsd 40 * SIZE(BO), %xmm11 + addsd %xmm8, %xmm3 + movsd 2 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm13 + addsd %xmm13, %xmm0 + movsd 18 * SIZE(BO), %xmm13 + mulsd %xmm8, %xmm13 + addsd %xmm13, %xmm1 + movsd 20 * SIZE(BO), %xmm13 + mulsd %xmm8, %xmm13 + mulsd 22 * SIZE(BO), %xmm8 + addsd %xmm13, %xmm2 + movsd 48 * SIZE(BO), %xmm13 + addsd %xmm8, %xmm3 + movsd 3 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm15 + addsd %xmm15, %xmm0 + movsd 26 * SIZE(BO), %xmm15 + mulsd %xmm8, %xmm15 + addsd %xmm15, %xmm1 + movsd 28 * SIZE(BO), %xmm15 + mulsd %xmm8, %xmm15 + mulsd 30 * SIZE(BO), %xmm8 + addsd %xmm15, %xmm2 + movsd 56 * SIZE(BO), %xmm15 + addsd %xmm8, %xmm3 + movsd 4 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm9 + addsd %xmm9, %xmm0 + movsd 34 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm9 + addsd %xmm9, %xmm1 + movsd 36 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm9 + mulsd 38 * SIZE(BO), %xmm8 + addsd %xmm9, %xmm2 + movsd 64 * SIZE(BO), %xmm9 + addsd %xmm8, %xmm3 + movsd 5 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm11 + addsd %xmm11, %xmm0 + movsd 42 * SIZE(BO), %xmm11 + mulsd %xmm8, %xmm11 + addsd %xmm11, %xmm1 + movsd 44 * SIZE(BO), %xmm11 + mulsd %xmm8, %xmm11 + mulsd 46 * SIZE(BO), %xmm8 + addsd %xmm11, %xmm2 + movsd 72 * SIZE(BO), %xmm11 + addsd %xmm8, %xmm3 + movsd 6 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm13 + addsd %xmm13, %xmm0 + movsd 50 * SIZE(BO), %xmm13 + mulsd %xmm8, %xmm13 + addsd %xmm13, %xmm1 + movsd 52 * SIZE(BO), %xmm13 + mulsd %xmm8, %xmm13 + mulsd 54 * SIZE(BO), %xmm8 + addsd %xmm13, %xmm2 + movsd 80 * SIZE(BO), %xmm13 + addsd %xmm8, %xmm3 + movsd 7 * SIZE(AO), %xmm8 + + mulsd %xmm8, %xmm15 + addsd %xmm15, %xmm0 + movsd 58 * SIZE(BO), %xmm15 + mulsd %xmm8, %xmm15 + addsd %xmm15, %xmm1 + movsd 60 * SIZE(BO), %xmm15 + mulsd %xmm8, %xmm15 + mulsd 62 * SIZE(BO), %xmm8 + addsd %xmm15, %xmm2 + movsd 88 * SIZE(BO), %xmm15 + addsd %xmm8, %xmm3 + movsd 8 * SIZE(AO), %xmm8 + + addq $ 8 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulsd %xmm8, %xmm9 + addsd %xmm9, %xmm0 + movsd 2 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm9 + addsd %xmm9, %xmm1 + movsd 4 * SIZE(BO), %xmm9 + mulsd %xmm8, %xmm9 + mulsd 6 * SIZE(BO), %xmm8 + addsd %xmm9, %xmm2 + movsd 8 * SIZE(BO), %xmm9 + addsd %xmm8, %xmm3 + movsd 1 * SIZE(AO), %xmm8 + + addq $1 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movq KK, 
%rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(B), %xmm4 + movsd 1 * SIZE(B), %xmm5 + movsd 2 * SIZE(B), %xmm6 + movsd 3 * SIZE(B), %xmm7 +#else + movsd 0 * SIZE(AO), %xmm4 + movsd 1 * SIZE(AO), %xmm5 + movsd 2 * SIZE(AO), %xmm6 + movsd 3 * SIZE(AO), %xmm7 +#endif + + subsd %xmm0, %xmm4 + subsd %xmm1, %xmm5 + subsd %xmm2, %xmm6 + subsd %xmm3, %xmm7 + +#ifdef LN + movsd 0 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 + mulsd %xmm0, %xmm6 + mulsd %xmm0, %xmm7 +#endif + +#ifdef LT + movsd 0 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm4 + mulsd %xmm0, %xmm5 + mulsd %xmm0, %xmm6 + mulsd %xmm0, %xmm7 +#endif + +#ifdef RN + mulsd 0 * SIZE(B), %xmm4 + movlpd 1 * SIZE(B), %xmm1 + mulsd %xmm4, %xmm1 + subsd %xmm1, %xmm5 + movlpd 2 * SIZE(B), %xmm2 + mulsd %xmm4, %xmm2 + subsd %xmm2, %xmm6 + movlpd 3 * SIZE(B), %xmm3 + mulsd %xmm4, %xmm3 + subsd %xmm3, %xmm7 + + mulsd 5 * SIZE(B), %xmm5 + movlpd 6 * SIZE(B), %xmm1 + mulsd %xmm5, %xmm1 + subsd %xmm1, %xmm6 + movlpd 7 * SIZE(B), %xmm2 + mulsd %xmm5, %xmm2 + subsd %xmm2, %xmm7 + + mulsd 10 * SIZE(B), %xmm6 + movlpd 11 * SIZE(B), %xmm1 + mulsd %xmm6, %xmm1 + subsd %xmm1, %xmm7 + + mulsd 15 * SIZE(B), %xmm7 +#endif + +#ifdef RT + mulsd 15 * SIZE(B), %xmm7 + + movlpd 14 * SIZE(B), %xmm1 + mulsd %xmm7, %xmm1 + subsd %xmm1, %xmm6 + movlpd 13 * SIZE(B), %xmm2 + mulsd %xmm7, %xmm2 + subsd %xmm2, %xmm5 + movlpd 12 * SIZE(B), %xmm3 + mulsd %xmm7, %xmm3 + subsd %xmm3, %xmm4 + + mulsd 10 * SIZE(B), %xmm6 + + movlpd 9 * SIZE(B), %xmm1 + mulsd %xmm6, %xmm1 + subsd %xmm1, %xmm5 + movlpd 8 * SIZE(B), %xmm2 + mulsd %xmm6, %xmm2 + subsd %xmm2, %xmm4 + + mulsd 5 * SIZE(B), %xmm5 + + movlpd 4 * SIZE(B), %xmm1 + mulsd %xmm5, %xmm1 + subsd %xmm1, %xmm4 + + mulsd 0 * SIZE(B), %xmm4 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + movsd %xmm4, 0 * SIZE(CO1) + movsd %xmm5, 0 * SIZE(CO2) + movsd %xmm6, 0 * SIZE(CO1, LDC, 2) + movsd %xmm7, 0 * SIZE(CO2, LDC, 2) + +#if defined(LN) || defined(LT) + movsd %xmm4, 0 * SIZE(B) + movsd %xmm5, 1 * SIZE(B) + movsd %xmm6, 2 * SIZE(B) + movsd %xmm7, 3 * SIZE(B) + + movsd %xmm4, 0 * SIZE(BO) + movsd %xmm4, 1 * SIZE(BO) + movsd %xmm5, 2 * SIZE(BO) + movsd %xmm5, 3 * SIZE(BO) + movsd %xmm6, 4 * SIZE(BO) + movsd %xmm6, 5 * SIZE(BO) + movsd %xmm7, 6 * SIZE(BO) + movsd %xmm7, 7 * SIZE(BO) +#else + movsd %xmm4, 0 * SIZE(AO) + movsd %xmm5, 1 * SIZE(AO) + movsd %xmm6, 2 * SIZE(AO) + movsd %xmm7, 3 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L39: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + + decq J # j -- + jg .L01 + ALIGN_4 + +.L999: + movq %rbx, %rsp + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 
40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/trsm_kernel_RT_4x4_sse3.S b/kernel/x86_64/trsm_kernel_RT_4x4_sse3.S new file mode 100644 index 0000000000..f0e8bf9a35 --- /dev/null +++ b/kernel/x86_64/trsm_kernel_RT_4x4_sse3.S @@ -0,0 +1,3844 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %rdi +#define N %rsi +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define J %r12 +#define AO %r13 +#define BO %r14 +#define CO1 %r15 +#define CO2 %rbx +#define KK %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define KKK 56(%rsp) +#define AORIG 64(%rsp) + +#else + +#define STACKSIZE 272 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define KKK 232(%rsp) +#define AORIG 240(%rsp) + +#endif + +#define PREFETCH prefetcht1 +#define PREFETCHSIZE (16 * 12 + 3) +#define PREFETCH_R (4 * 4 + 0) + +#define KERNEL1(address) \ + mulpd %xmm8, %xmm9 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 2 * SIZE(AO);\ + addpd %xmm9, %xmm0;\ + movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm1;\ + movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm2;\ + movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 2 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + addpd %xmm9, %xmm3;\ + movddup 0 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL2(address) \ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm4;\ + movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm5;\ + movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm6;\ + movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 4 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + addpd %xmm9, %xmm7;\ + movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL3(address) \ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm0;\ + movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm1;\ + movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm2;\ + movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 6 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + addpd %xmm9, %xmm3;\ + movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL4(address) \ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm4;\ + movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm5;\ + movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm6;\ + movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 32 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + addpd %xmm9, %xmm7;\ + movddup 32 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL5(address) \ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm0;\ + movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm1;\ + movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm2;\ + movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 10 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + addpd %xmm11, %xmm3;\ + movddup 8 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL6(address) \ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, 
%xmm4;\ + movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm5;\ + movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm6;\ + movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 12 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + addpd %xmm11, %xmm7;\ + movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL7(address) \ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm0;\ + movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm1;\ + movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm2;\ + movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 14 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + addpd %xmm11, %xmm3;\ + movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL8(address) \ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm4;\ + movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm5;\ + movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm6;\ + movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 40 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + addpd %xmm11, %xmm7;\ + movddup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL9(address) \ + mulpd %xmm12, %xmm13;\ + PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 2 * SIZE(AO);\ + addpd %xmm13, %xmm0;\ + movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm1;\ + movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm2;\ + movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 18 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + addpd %xmm13, %xmm3;\ + movddup 16 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL10(address) \ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm4;\ + movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm5;\ + movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm6;\ + movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 20 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + addpd %xmm13, %xmm7;\ + movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL11(address) \ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm0;\ + movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm1;\ + movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm2;\ + movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 22 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + addpd %xmm13, %xmm3;\ + movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL12(address) \ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm4;\ + movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm5;\ + movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm6;\ + movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 48 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + addpd %xmm13, %xmm7;\ + movddup 48 * SIZE + (address) * 2 * SIZE(BO), %xmm13 
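+
+/* KERNEL1-KERNEL16 unroll eight k-iterations of the 4x4 micro-kernel
+   update: each odd/even macro pair handles one k step, multiplying a
+   packed pair of A elements (%xmm8/%xmm10/%xmm12/%xmm14) by four
+   broadcast B values (movddup into %xmm9/%xmm11/%xmm13/%xmm15) and
+   accumulating into %xmm0-%xmm3 (rows 0-1) and %xmm4-%xmm7 (rows 2-3);
+   KERNEL1 and KERNEL9 also issue software prefetches on the A panel. */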
+ +#define KERNEL13(address) \ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm0;\ + movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm1;\ + movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm2;\ + movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 26 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + addpd %xmm15, %xmm3;\ + movddup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL14(address) \ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm4;\ + movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm5;\ + movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm6;\ + movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 28 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + addpd %xmm15, %xmm7;\ + movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL15(address) \ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm0;\ + movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm1;\ + movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm2;\ + movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 30 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + addpd %xmm15, %xmm3;\ + movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL16(address) \ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm4;\ + movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm5;\ + movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm6;\ + movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 56 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + addpd %xmm15, %xmm7;\ + movddup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, M + movq ARG2, N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C +#endif + + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + movq KK, OFFSET + + leaq (, LDC, SIZE), LDC + +#ifdef LN + leaq (, M, SIZE), %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + leaq (, N, SIZE), %rax + imulq K, %rax + addq %rax, B + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, N + je .L80 + ALIGN_4 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + + + movq C, CO1 +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L100 + ALIGN_4 + +.L91: +#ifdef LN + movq K, %rax + 
salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 4 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#ifdef HAVE_3DNOW + prefetchw 4 * SIZE(CO1) +#else + prefetchnta 4 * SIZE(CO1) +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L95 + ALIGN_4 + +.L92: + mulpd %xmm9, %xmm8 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd 2 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm0 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm8 + mulpd 6 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm2 + movapd 16 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm10 + mulpd 10 * SIZE(AO), %xmm9 + addpd %xmm10, %xmm0 + movapd 12 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm10 + mulpd 14 * SIZE(AO), %xmm9 + addpd %xmm10, %xmm2 + movapd 24 * SIZE(AO), %xmm10 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + addpd %xmm9, %xmm3 + movddup 8 * SIZE(BO), %xmm9 + mulpd %xmm11, %xmm8 + mulpd 18 * SIZE(AO), %xmm11 + addpd %xmm8, %xmm0 + movapd 20 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movddup 5 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm8 + mulpd 22 * SIZE(AO), %xmm11 + addpd %xmm8, %xmm2 + movapd 32 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm3 + movddup 6 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm10 + mulpd 26 * SIZE(AO), %xmm11 + addpd %xmm10, %xmm0 + movapd 28 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movddup 7 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm10 + mulpd 30 * SIZE(AO), %xmm11 + addpd %xmm10, %xmm2 + movapd 40 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + + addq $32 * SIZE, AO + addq $8 * SIZE, BO + decq %rax + jne .L92 + ALIGN_4 + +.L95: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L99 + ALIGN_4 + +.L96: + mulpd %xmm9, %xmm8 + mulpd 2 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm0 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 1 * SIZE(BO), %xmm9 + + addq $4 * SIZE, AO # aoffset += 4 + addq $1 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L96 + ALIGN_4 + +.L99: + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $1, %rax +#endif + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BO), %xmm2 + movapd 2 * SIZE(BO), %xmm3 + + subpd %xmm0, %xmm2 + subpd %xmm1, %xmm3 +#else + movapd 0 * SIZE(AO), %xmm2 + movapd 2 * SIZE(AO), %xmm3 + + subpd %xmm0, %xmm2 + subpd %xmm1, %xmm3 +#endif + +#ifdef LN + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movapd %xmm3, %xmm1 + unpckhpd %xmm1, %xmm1 + + movsd 15 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm1 + + movsd 14 * SIZE(AO), %xmm5 + mulsd %xmm1, %xmm5 + subsd %xmm5, %xmm3 + movsd 13 * SIZE(AO), %xmm6 + mulsd %xmm1, %xmm6 + subsd %xmm6, %xmm0 + movsd 12 * SIZE(AO), %xmm7 + mulsd %xmm1, %xmm7 + subsd %xmm7, %xmm2 + + movsd 10 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm3 + + movsd 9 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm0 + movsd 8 * SIZE(AO), %xmm6 + mulsd %xmm3, %xmm6 + 
subsd %xmm6, %xmm2 + + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 4 * SIZE(AO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm3 +#endif + +#ifdef LT + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movapd %xmm3, %xmm1 + unpckhpd %xmm1, %xmm1 + + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 + + movsd 1 * SIZE(AO), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + movsd 2 * SIZE(AO), %xmm6 + mulsd %xmm2, %xmm6 + subsd %xmm6, %xmm3 + movsd 3 * SIZE(AO), %xmm7 + mulsd %xmm2, %xmm7 + subsd %xmm7, %xmm1 + + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm3 + movsd 7 * SIZE(AO), %xmm6 + mulsd %xmm0, %xmm6 + subsd %xmm6, %xmm1 + + movsd 10 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm3 + + movsd 11 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm1 + + movsd 15 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm1 + + unpcklpd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm3 +#endif + +#ifdef RN + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 +#endif + +#ifdef RT + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 1 * SIZE(CO1) + movsd %xmm3, 2 * SIZE(CO1) + movhpd %xmm3, 3 * SIZE(CO1) +#else + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 1 * SIZE(CO1) + movsd %xmm3, 2 * SIZE(CO1) + movhpd %xmm3, 3 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(BO) + movapd %xmm3, 2 * SIZE(BO) +#else + movapd %xmm2, 0 * SIZE(AO) + movapd %xmm3, 2 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L91 + ALIGN_4 + +.L100: + testq $2, M + je .L110 + ALIGN_4 + +.L101: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 4 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L105 + ALIGN_4 + +.L102: + mulpd %xmm9, %xmm8 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movddup 1 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm0 + mulpd 2 * SIZE(AO), %xmm9 + movapd 16 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd 4 * SIZE(AO), %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd 6 * SIZE(AO), %xmm9 + addpd %xmm9, %xmm3 + movddup 8 * SIZE(BO), %xmm9 + mulpd %xmm11, %xmm10 + movddup 5 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm0 + mulpd 10 * SIZE(AO), %xmm11 + movapd 24 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movddup 6 * SIZE(BO), %xmm11 + mulpd 12 * SIZE(AO), %xmm11 + addpd %xmm11, %xmm2 + movddup 7 * SIZE(BO), %xmm11 + mulpd 14 * SIZE(AO), %xmm11 + addpd %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $ 8 * SIZE, BO + 
decq %rax + jne .L102 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L109 + ALIGN_4 + +.L106: + mulpd %xmm9, %xmm8 + movddup 1 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm0 + movapd 2 * SIZE(AO), %xmm8 + + addq $2 * SIZE, AO # aoffset += 4 + addq $1 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L106 + ALIGN_4 + +.L109: + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BO), %xmm2 + subpd %xmm0, %xmm2 +#else + movapd 0 * SIZE(AO), %xmm2 + subpd %xmm0, %xmm2 +#endif + +#ifdef LN + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movsd 3 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 2 * SIZE(AO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm0, %xmm2 +#endif + +#ifdef LT + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 + + movsd 1 * SIZE(AO), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + + movsd 3 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm0 + + unpcklpd %xmm0, %xmm2 +#endif + +#ifdef RN + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef RT + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 1 * SIZE(CO1) +#else + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 1 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(BO) +#else + movapd %xmm2, 0 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L110: + testq $1, M + je .L119 + ALIGN_4 + +.L111: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + movsd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movsd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movsd 4 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movsd 4 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L115 + ALIGN_4 + +.L112: + mulpd %xmm9, %xmm8 + movapd 2 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm0 + mulpd 2 * SIZE(BO), %xmm9 + movapd 8 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm1 + movapd 8 * SIZE(AO), %xmm9 + mulpd %xmm11, %xmm10 + movapd 6 * SIZE(AO), %xmm11 + addpd %xmm10, %xmm0 + mulpd 6 * SIZE(BO), %xmm11 + movapd 12 * SIZE(BO), %xmm10 + addpd %xmm11, %xmm1 + movapd 12 * SIZE(AO), %xmm11 + + addq $8 * SIZE, AO + addq $8 * SIZE, BO + decq %rax + jne .L112 + ALIGN_4 + +.L115: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulsd 0 * SIZE(BO), %xmm9 + addsd %xmm9, %xmm0 + movsd 1 * SIZE(AO), %xmm9 + + addq $1 
* SIZE, AO # aoffset += 4 + addq $1 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L116 + ALIGN_4 + +.L118: + addpd %xmm1, %xmm0 + haddpd %xmm0, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BO), %xmm2 + subsd %xmm0, %xmm2 +#else + movsd 0 * SIZE(AO), %xmm2 + subsd %xmm0, %xmm2 +#endif + +#ifdef LN + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 +#endif + +#ifdef LT + movsd 0 * SIZE(AO), %xmm4 + mulsd %xmm4, %xmm2 +#endif + +#ifdef RN + movsd 0 * SIZE(BO), %xmm0 + mulsd %xmm0, %xmm2 +#endif + +#ifdef RT + movsd 0 * SIZE(BO), %xmm0 + mulsd %xmm0, %xmm2 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) +#else + movsd %xmm2, 0 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(BO) +#else + movsd %xmm2, 0 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L119: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 1), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_2 + +.L80: + testq $2, N + je .L40 + ALIGN_4 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm4, %xmm4 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm5, %xmm5 + +#ifdef HAVE_3DNOW + prefetchw 4 * SIZE(CO1) + prefetchw 4 * SIZE(CO2) +#else + prefetchnta 4 * SIZE(CO1) + prefetchnta 4 * SIZE(CO2) +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L55 + ALIGN_4 + +.L52: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm5 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 6 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 16 * 
SIZE(AO), %xmm8 + addpd %xmm9, %xmm5 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 10 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm4 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 12 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm5 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 14 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm4 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 40 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm5 + movddup 16 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm11 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + addpd %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 18 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movddup 8 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm4 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 20 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm5 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 22 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm4 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 24 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm5 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 26 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm4 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 28 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm5 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 30 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm4 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 32 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm5 + movddup 24 * SIZE(BO), %xmm11 + + addq $32 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L59 + ALIGN_4 + +.L56: + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 0 * SIZE(BO), %xmm11 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm4 + movddup 1 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm5 + + addq $4 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L56 + ALIGN_4 + +.L59: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $2, %rax +#endif + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd %xmm4, %xmm12 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm12 + + movapd 0 * SIZE(BO), %xmm1 + movapd 2 * SIZE(BO), %xmm5 + movapd 4 * SIZE(BO), %xmm9 + movapd 6 * SIZE(BO), %xmm13 + + subpd %xmm0, %xmm1 + subpd %xmm8, %xmm5 + subpd 
%xmm4, %xmm9 + subpd %xmm12, %xmm13 +#else + + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm9 + movapd 4 * SIZE(AO), %xmm10 + movapd 6 * SIZE(AO), %xmm11 + + subpd %xmm0, %xmm8 + subpd %xmm4, %xmm9 + subpd %xmm1, %xmm10 + subpd %xmm5, %xmm11 +#endif + + +#ifdef LN + movddup 15 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm13 + + movddup 14 * SIZE(AO), %xmm2 + mulpd %xmm13, %xmm2 + subpd %xmm2, %xmm9 + movddup 13 * SIZE(AO), %xmm4 + mulpd %xmm13, %xmm4 + subpd %xmm4, %xmm5 + movddup 12 * SIZE(AO), %xmm6 + mulpd %xmm13, %xmm6 + subpd %xmm6, %xmm1 + + movddup 10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm9 + movddup 9 * SIZE(AO), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm5 + movddup 8 * SIZE(AO), %xmm4 + mulpd %xmm9, %xmm4 + subpd %xmm4, %xmm1 + + movddup 5 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + movddup 4 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm1 + + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 +#endif + + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + + movddup 1 * SIZE(AO), %xmm2 + mulpd %xmm1, %xmm2 + subpd %xmm2, %xmm5 + movddup 2 * SIZE(AO), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm9 + movddup 3 * SIZE(AO), %xmm6 + mulpd %xmm1, %xmm6 + subpd %xmm6, %xmm13 + + movddup 5 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + + movddup 6 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm9 + movddup 7 * SIZE(AO), %xmm4 + mulpd %xmm5, %xmm4 + subpd %xmm4, %xmm13 + + movddup 10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm9 + + movddup 11 * SIZE(AO), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm13 + + movddup 15 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm13 +#endif + +#ifdef RN + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 + + movddup 1 * SIZE(BO), %xmm1 + mulpd %xmm8, %xmm1 + subpd %xmm1, %xmm10 + movddup 1 * SIZE(BO), %xmm1 + mulpd %xmm9, %xmm1 + subpd %xmm1, %xmm11 + + movddup 3 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 +#endif + +#ifdef RT + movddup 3 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 + + movddup 2 * SIZE(BO), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm8 + movddup 2 * SIZE(BO), %xmm1 + mulpd %xmm11, %xmm1 + subpd %xmm1, %xmm9 + + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm1, 0 * SIZE(CO1) + movsd %xmm5, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movsd %xmm13, 3 * SIZE(CO1) + + movhpd %xmm1, 0 * SIZE(CO2) + movhpd %xmm5, 1 * SIZE(CO2) + movhpd %xmm9, 2 * SIZE(CO2) + movhpd %xmm13, 3 * SIZE(CO2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) + + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) + movsd %xmm11, 2 * SIZE(CO2) + movhpd %xmm11, 3 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(BO) + movapd %xmm5, 2 * SIZE(BO) + movapd %xmm9, 4 * SIZE(BO) + movapd %xmm13, 6 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm9, 2 * SIZE(AO) + movapd %xmm10, 4 * SIZE(AO) + movapd %xmm11, 6 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L51 + ALIGN_4 + +.L60: + testq 
$2, M + je .L70 + ALIGN_4 + +.L61: +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L65 + ALIGN_4 + +.L62: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 6 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 16 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 16 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 10 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 12 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 14 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 24 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + movddup 24 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L62 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L69 + ALIGN_4 + +.L66: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L66 + ALIGN_4 + +.L69: + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd 0 * SIZE(BO), %xmm1 + movapd 2 * SIZE(BO), %xmm5 + + subpd %xmm0, %xmm1 + subpd %xmm8, %xmm5 +#else + + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm10 + + subpd %xmm0, %xmm8 + subpd %xmm1, %xmm10 +#endif + +#ifdef LN + movddup 3 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + movddup 2 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm1 + + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 +#endif + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + + movddup 1 * SIZE(AO), %xmm2 + mulpd %xmm1, %xmm2 + subpd %xmm2, 
%xmm5 + + movddup 3 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 +#endif + +#ifdef RN + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm8 + + movddup 1 * SIZE(BO), %xmm1 + mulpd %xmm8, %xmm1 + subpd %xmm1, %xmm10 + + movddup 3 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm10 +#endif + +#ifdef RT + movddup 3 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm10 + + movddup 2 * SIZE(BO), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm8 + + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm8 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm1, 0 * SIZE(CO1) + movsd %xmm5, 1 * SIZE(CO1) + movhpd %xmm1, 0 * SIZE(CO2) + movhpd %xmm5, 1 * SIZE(CO2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(BO) + movapd %xmm5, 2 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm10, 2 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L70: + testq $1, M + je .L79 + ALIGN_4 + +.L71: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movddup 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movddup 4 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movapd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L75 + ALIGN_4 + +.L72: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movddup 1 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm0 + mulpd 2 * SIZE(BO), %xmm8 + movapd 16 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm1 + movddup 2 * SIZE(AO), %xmm8 + mulpd 4 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm2 + movddup 3 * SIZE(AO), %xmm8 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm3 + movddup 8 * SIZE(AO), %xmm8 + mulpd %xmm10, %xmm11 + movddup 5 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm0 + mulpd 10 * SIZE(BO), %xmm10 + movapd 24 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm1 + movddup 6 * SIZE(AO), %xmm10 + mulpd 12 * SIZE(BO), %xmm10 + addpd %xmm10, %xmm2 + movddup 7 * SIZE(AO), %xmm10 + mulpd 14 * SIZE(BO), %xmm10 + addpd %xmm10, %xmm3 + movddup 12 * SIZE(AO), %xmm10 + + addq $ 8 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulpd %xmm8, %xmm9 + movddup 1 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm0 + movapd 2 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L76 + ALIGN_4 + +.L78: + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BO), %xmm2 
+ subpd %xmm0, %xmm2 +#else + movapd 0 * SIZE(AO), %xmm2 + subpd %xmm0, %xmm2 +#endif + +#ifdef LN + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm2 +#endif + +#ifdef RN + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movsd 0 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm2 + + movsd 1 * SIZE(BO), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + + movsd 3 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm0 + + unpcklpd %xmm0, %xmm2 +#endif + +#ifdef RT + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movsd 3 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 2 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + + movsd 0 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm0, %xmm2 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO2) +#else + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(BO) +#else + movapd %xmm2, 0 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L79: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L40: + movq N, J + sarq $2, J # j = (n >> 2) + jle .L999 + ALIGN_4 + +.L10: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 4), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: + +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + + movapd 16 * SIZE(AO), %xmm12 + movddup 16 * SIZE(BO), %xmm13 + movapd 24 * SIZE(AO), %xmm14 + movddup 24 * SIZE(BO), %xmm15 + + prefetchnta 4 * SIZE(CO1) + pxor %xmm4, %xmm4 + prefetchnta 4 * SIZE(CO2) + pxor %xmm5, %xmm5 + prefetchnta 4 * SIZE(CO1, LDC, 2) + pxor %xmm6, %xmm6 + prefetchnta 4 * SIZE(CO2, LDC, 2) + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + +#if 1 + andq $-8, %rax + salq $4, %rax + je .L15 +.L1X: + KERNEL1 (16 * 0) + KERNEL2 (16 * 0) + KERNEL3 (16 * 0) + KERNEL4 (16 * 0) + KERNEL5 (16 * 0) + KERNEL6 (16 * 0) + KERNEL7 (16 * 0) + KERNEL8 (16 * 0) + KERNEL9 (16 * 0) + KERNEL10(16 * 0) + KERNEL11(16 * 0) + KERNEL12(16 * 0) + KERNEL13(16 * 0) + KERNEL14(16 * 0) + KERNEL15(16 * 0) + KERNEL16(16 * 0) + cmpq $128 * 1, %rax + NOBRANCH + jle .L12 + 
KERNEL1 (16 * 1) + KERNEL2 (16 * 1) + KERNEL3 (16 * 1) + KERNEL4 (16 * 1) + KERNEL5 (16 * 1) + KERNEL6 (16 * 1) + KERNEL7 (16 * 1) + KERNEL8 (16 * 1) + KERNEL9 (16 * 1) + KERNEL10(16 * 1) + KERNEL11(16 * 1) + KERNEL12(16 * 1) + KERNEL13(16 * 1) + KERNEL14(16 * 1) + KERNEL15(16 * 1) + KERNEL16(16 * 1) + cmpq $128 * 2, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 2) + KERNEL2 (16 * 2) + KERNEL3 (16 * 2) + KERNEL4 (16 * 2) + KERNEL5 (16 * 2) + KERNEL6 (16 * 2) + KERNEL7 (16 * 2) + KERNEL8 (16 * 2) + KERNEL9 (16 * 2) + KERNEL10(16 * 2) + KERNEL11(16 * 2) + KERNEL12(16 * 2) + KERNEL13(16 * 2) + KERNEL14(16 * 2) + KERNEL15(16 * 2) + KERNEL16(16 * 2) + cmpq $128 * 3, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 3) + KERNEL2 (16 * 3) + KERNEL3 (16 * 3) + KERNEL4 (16 * 3) + KERNEL5 (16 * 3) + KERNEL6 (16 * 3) + KERNEL7 (16 * 3) + KERNEL8 (16 * 3) + KERNEL9 (16 * 3) + KERNEL10(16 * 3) + KERNEL11(16 * 3) + KERNEL12(16 * 3) + KERNEL13(16 * 3) + KERNEL14(16 * 3) + KERNEL15(16 * 3) + KERNEL16(16 * 3) + cmpq $128 * 4, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 4) + KERNEL2 (16 * 4) + KERNEL3 (16 * 4) + KERNEL4 (16 * 4) + KERNEL5 (16 * 4) + KERNEL6 (16 * 4) + KERNEL7 (16 * 4) + KERNEL8 (16 * 4) + KERNEL9 (16 * 4) + KERNEL10(16 * 4) + KERNEL11(16 * 4) + KERNEL12(16 * 4) + KERNEL13(16 * 4) + KERNEL14(16 * 4) + KERNEL15(16 * 4) + KERNEL16(16 * 4) + cmpq $128 * 5, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 5) + KERNEL2 (16 * 5) + KERNEL3 (16 * 5) + KERNEL4 (16 * 5) + KERNEL5 (16 * 5) + KERNEL6 (16 * 5) + KERNEL7 (16 * 5) + KERNEL8 (16 * 5) + KERNEL9 (16 * 5) + KERNEL10(16 * 5) + KERNEL11(16 * 5) + KERNEL12(16 * 5) + KERNEL13(16 * 5) + KERNEL14(16 * 5) + KERNEL15(16 * 5) + KERNEL16(16 * 5) + cmpq $128 * 6, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 6) + KERNEL2 (16 * 6) + KERNEL3 (16 * 6) + KERNEL4 (16 * 6) + KERNEL5 (16 * 6) + KERNEL6 (16 * 6) + KERNEL7 (16 * 6) + KERNEL8 (16 * 6) + KERNEL9 (16 * 6) + KERNEL10(16 * 6) + KERNEL11(16 * 6) + KERNEL12(16 * 6) + KERNEL13(16 * 6) + KERNEL14(16 * 6) + KERNEL15(16 * 6) + KERNEL16(16 * 6) + cmpq $128 * 7, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 7) + KERNEL2 (16 * 7) + KERNEL3 (16 * 7) + KERNEL4 (16 * 7) + KERNEL5 (16 * 7) + KERNEL6 (16 * 7) + KERNEL7 (16 * 7) + KERNEL8 (16 * 7) + KERNEL9 (16 * 7) + KERNEL10(16 * 7) + KERNEL11(16 * 7) + KERNEL12(16 * 7) + KERNEL13(16 * 7) + KERNEL14(16 * 7) + KERNEL15(16 * 7) + KERNEL16(16 * 7) + + addq $32 * 8 * SIZE, AO + addq $32 * 8 * SIZE, BO + subq $128 * 8, %rax + jg .L1X + +.L12: + leaq (AO, %rax, 2), AO # * 16 + leaq (BO, %rax, 2), BO # * 64 + +#else + sarq $3, %rax + je .L15 + ALIGN_4 + +.L12: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm5 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm6 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm7 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 6 * SIZE(AO), %xmm8 + addpd 
%xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm5 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm6 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 32 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm7 + + movddup 32 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 10 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + + movddup 8 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm4 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm5 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm6 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 12 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm7 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 14 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm4 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm5 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm6 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 40 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm7 + movddup 40 * SIZE(BO), %xmm11 + + mulpd %xmm12, %xmm13 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + addpd %xmm13, %xmm0 + movddup 17 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm1 + movddup 18 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm2 + movddup 19 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + movapd 18 * SIZE(AO), %xmm12 + addpd %xmm13, %xmm3 + + movddup 16 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm4 + movddup 17 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm5 + movddup 18 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm6 + movddup 19 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + movapd 20 * SIZE(AO), %xmm12 + addpd %xmm13, %xmm7 + + movddup 20 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm0 + movddup 21 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm1 + movddup 22 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm2 + movddup 23 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + movapd 22 * SIZE(AO), %xmm12 + addpd %xmm13, %xmm3 + + movddup 20 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm4 + movddup 21 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm5 + movddup 22 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm6 + movddup 23 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + movapd 48 * SIZE(AO), %xmm12 + addpd %xmm13, %xmm7 + movddup 48 * SIZE(BO), %xmm13 + + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm0 + movddup 25 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm1 + movddup 26 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm2 + movddup 27 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + movapd 26 * SIZE(AO), %xmm14 + addpd %xmm15, %xmm3 + + movddup 24 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm4 + movddup 25 * SIZE(BO), %xmm15 + mulpd 
%xmm14, %xmm15 + addpd %xmm15, %xmm5 + movddup 26 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm6 + movddup 27 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + movapd 28 * SIZE(AO), %xmm14 + addpd %xmm15, %xmm7 + + movddup 28 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm0 + movddup 29 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm1 + movddup 30 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm2 + movddup 31 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + movapd 30 * SIZE(AO), %xmm14 + addpd %xmm15, %xmm3 + + movddup 28 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm4 + movddup 29 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm5 + movddup 30 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm6 + movddup 31 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + movapd 56 * SIZE(AO), %xmm14 + addpd %xmm15, %xmm7 + movddup 56 * SIZE(BO), %xmm15 + + addq $32 * SIZE, BO + addq $32 * SIZE, AO + decq %rax + BRANCH + jne .L12 +#endif + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L19 + ALIGN_4 + +.L16: + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 0 * SIZE(BO), %xmm11 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm4 + movddup 1 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm5 + movddup 2 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm6 + movddup 3 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm7 + + addq $4 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L16 + ALIGN_4 + +.L19: + +#if defined(LN) || defined(RT) + movq KK, %rax + subq $4, %rax + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd %xmm2, %xmm10 + unpcklpd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm10 + + movapd %xmm4, %xmm12 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm12 + + movapd %xmm6, %xmm14 + unpcklpd %xmm7, %xmm6 + unpckhpd %xmm7, %xmm14 + + movapd 0 * SIZE(BO), %xmm1 + movapd 2 * SIZE(BO), %xmm3 + movapd 4 * SIZE(BO), %xmm5 + movapd 6 * SIZE(BO), %xmm7 + movapd 8 * SIZE(BO), %xmm9 + movapd 10 * SIZE(BO), %xmm11 + movapd 12 * SIZE(BO), %xmm13 + movapd 14 * SIZE(BO), %xmm15 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm3 + subpd %xmm8, %xmm5 + subpd %xmm10, %xmm7 + subpd %xmm4, %xmm9 + subpd %xmm6, %xmm11 + subpd %xmm12, %xmm13 + subpd %xmm14, %xmm15 +#else + + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm9 + movapd 4 * SIZE(AO), %xmm10 + movapd 6 * SIZE(AO), %xmm11 + + movapd 8 * SIZE(AO), %xmm12 + movapd 10 * SIZE(AO), %xmm13 + movapd 12 * SIZE(AO), %xmm14 + movapd 14 * SIZE(AO), %xmm15 + + subpd %xmm0, %xmm8 + subpd %xmm4, %xmm9 + subpd %xmm1, %xmm10 + subpd %xmm5, %xmm11 + subpd %xmm2, %xmm12 + subpd %xmm6, %xmm13 + subpd %xmm3, %xmm14 + subpd %xmm7, %xmm15 +#endif + + +#ifdef LN + movddup 15 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm13 + mulpd %xmm0, %xmm15 + + movddup 14 * SIZE(AO), %xmm2 + mulpd %xmm13, %xmm2 + subpd %xmm2, %xmm9 + movddup 14 * SIZE(AO), %xmm2 + mulpd %xmm15, %xmm2 + subpd %xmm2, 
%xmm11 + + movddup 13 * SIZE(AO), %xmm4 + mulpd %xmm13, %xmm4 + subpd %xmm4, %xmm5 + movddup 13 * SIZE(AO), %xmm4 + mulpd %xmm15, %xmm4 + subpd %xmm4, %xmm7 + + movddup 12 * SIZE(AO), %xmm6 + mulpd %xmm13, %xmm6 + subpd %xmm6, %xmm1 + movddup 12 * SIZE(AO), %xmm6 + mulpd %xmm15, %xmm6 + subpd %xmm6, %xmm3 + + movddup 10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm9 + mulpd %xmm0, %xmm11 + + movddup 9 * SIZE(AO), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm5 + movddup 9 * SIZE(AO), %xmm2 + mulpd %xmm11, %xmm2 + subpd %xmm2, %xmm7 + + movddup 8 * SIZE(AO), %xmm4 + mulpd %xmm9, %xmm4 + subpd %xmm4, %xmm1 + movddup 8 * SIZE(AO), %xmm4 + mulpd %xmm11, %xmm4 + subpd %xmm4, %xmm3 + + movddup 5 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + mulpd %xmm0, %xmm7 + + movddup 4 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm1 + movddup 4 * SIZE(AO), %xmm2 + mulpd %xmm7, %xmm2 + subpd %xmm2, %xmm3 + + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm3 +#endif + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm3 + + movddup 1 * SIZE(AO), %xmm2 + mulpd %xmm1, %xmm2 + subpd %xmm2, %xmm5 + movddup 1 * SIZE(AO), %xmm2 + mulpd %xmm3, %xmm2 + subpd %xmm2, %xmm7 + + movddup 2 * SIZE(AO), %xmm4 + mulpd %xmm1, %xmm4 + subpd %xmm4, %xmm9 + movddup 2 * SIZE(AO), %xmm4 + mulpd %xmm3, %xmm4 + subpd %xmm4, %xmm11 + + movddup 3 * SIZE(AO), %xmm6 + mulpd %xmm1, %xmm6 + subpd %xmm6, %xmm13 + movddup 3 * SIZE(AO), %xmm6 + mulpd %xmm3, %xmm6 + subpd %xmm6, %xmm15 + + movddup 5 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + mulpd %xmm0, %xmm7 + + movddup 6 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm9 + movddup 6 * SIZE(AO), %xmm2 + mulpd %xmm7, %xmm2 + subpd %xmm2, %xmm11 + + movddup 7 * SIZE(AO), %xmm4 + mulpd %xmm5, %xmm4 + subpd %xmm4, %xmm13 + movddup 7 * SIZE(AO), %xmm4 + mulpd %xmm7, %xmm4 + subpd %xmm4, %xmm15 + + movddup 10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm9 + mulpd %xmm0, %xmm11 + + movddup 11 * SIZE(AO), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm13 + movddup 11 * SIZE(AO), %xmm2 + mulpd %xmm11, %xmm2 + subpd %xmm2, %xmm15 + + movddup 15 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm13 + mulpd %xmm0, %xmm15 +#endif + + +#ifdef RN + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 + + movddup 1 * SIZE(BO), %xmm1 + mulpd %xmm8, %xmm1 + subpd %xmm1, %xmm10 + movddup 1 * SIZE(BO), %xmm1 + mulpd %xmm9, %xmm1 + subpd %xmm1, %xmm11 + + movddup 2 * SIZE(BO), %xmm2 + mulpd %xmm8, %xmm2 + subpd %xmm2, %xmm12 + movddup 2 * SIZE(BO), %xmm2 + mulpd %xmm9, %xmm2 + subpd %xmm2, %xmm13 + + movddup 3 * SIZE(BO), %xmm3 + mulpd %xmm8, %xmm3 + subpd %xmm3, %xmm14 + movddup 3 * SIZE(BO), %xmm3 + mulpd %xmm9, %xmm3 + subpd %xmm3, %xmm15 + + movddup 5 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 + + movddup 6 * SIZE(BO), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm12 + movddup 6 * SIZE(BO), %xmm1 + mulpd %xmm11, %xmm1 + subpd %xmm1, %xmm13 + + movddup 7 * SIZE(BO), %xmm2 + mulpd %xmm10, %xmm2 + subpd %xmm2, %xmm14 + movddup 7 * SIZE(BO), %xmm2 + mulpd %xmm11, %xmm2 + subpd %xmm2, %xmm15 + + movddup 10 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm12 + mulpd %xmm0, %xmm13 + + movddup 11 * SIZE(BO), %xmm1 + mulpd %xmm12, %xmm1 + subpd %xmm1, %xmm14 + movddup 11 * SIZE(BO), %xmm1 + mulpd %xmm13, %xmm1 + subpd %xmm1, %xmm15 + + movddup 15 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm14 + mulpd %xmm0, %xmm15 +#endif + +#ifdef RT + movddup 15 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm14 + mulpd %xmm0, %xmm15 + + movddup 14 * SIZE(BO), %xmm1 + mulpd %xmm14, %xmm1 + 
subpd %xmm1, %xmm12 + movddup 14 * SIZE(BO), %xmm1 + mulpd %xmm15, %xmm1 + subpd %xmm1, %xmm13 + + movddup 13 * SIZE(BO), %xmm2 + mulpd %xmm14, %xmm2 + subpd %xmm2, %xmm10 + movddup 13 * SIZE(BO), %xmm2 + mulpd %xmm15, %xmm2 + subpd %xmm2, %xmm11 + + movddup 12 * SIZE(BO), %xmm3 + mulpd %xmm14, %xmm3 + subpd %xmm3, %xmm8 + movddup 12 * SIZE(BO), %xmm3 + mulpd %xmm15, %xmm3 + subpd %xmm3, %xmm9 + + movddup 10 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm12 + mulpd %xmm0, %xmm13 + + movddup 9 * SIZE(BO), %xmm1 + mulpd %xmm12, %xmm1 + subpd %xmm1, %xmm10 + movddup 9 * SIZE(BO), %xmm1 + mulpd %xmm13, %xmm1 + subpd %xmm1, %xmm11 + + movddup 8 * SIZE(BO), %xmm2 + mulpd %xmm12, %xmm2 + subpd %xmm2, %xmm8 + movddup 8 * SIZE(BO), %xmm2 + mulpd %xmm13, %xmm2 + subpd %xmm2, %xmm9 + + movddup 5 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 + + movddup 4 * SIZE(BO), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm8 + movddup 4 * SIZE(BO), %xmm1 + mulpd %xmm11, %xmm1 + subpd %xmm1, %xmm9 + + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm1, 0 * SIZE(CO1) + movsd %xmm5, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movsd %xmm13, 3 * SIZE(CO1) + + movhpd %xmm1, 0 * SIZE(CO2) + movhpd %xmm5, 1 * SIZE(CO2) + movhpd %xmm9, 2 * SIZE(CO2) + movhpd %xmm13, 3 * SIZE(CO2) + + movsd %xmm3, 0 * SIZE(CO1, LDC, 2) + movsd %xmm7, 1 * SIZE(CO1, LDC, 2) + movsd %xmm11, 2 * SIZE(CO1, LDC, 2) + movsd %xmm15, 3 * SIZE(CO1, LDC, 2) + + movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 1 * SIZE(CO2, LDC, 2) + movhpd %xmm11, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 3 * SIZE(CO2, LDC, 2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) + + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) + movsd %xmm11, 2 * SIZE(CO2) + movhpd %xmm11, 3 * SIZE(CO2) + + movsd %xmm12, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm12, 1 * SIZE(CO1, LDC, 2) + movsd %xmm13, 2 * SIZE(CO1, LDC, 2) + movhpd %xmm13, 3 * SIZE(CO1, LDC, 2) + + movsd %xmm14, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm14, 1 * SIZE(CO2, LDC, 2) + movsd %xmm15, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm15, 3 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(BO) + movapd %xmm3, 2 * SIZE(BO) + movapd %xmm5, 4 * SIZE(BO) + movapd %xmm7, 6 * SIZE(BO) + movapd %xmm9, 8 * SIZE(BO) + movapd %xmm11, 10 * SIZE(BO) + movapd %xmm13, 12 * SIZE(BO) + movapd %xmm15, 14 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm9, 2 * SIZE(AO) + movapd %xmm10, 4 * SIZE(AO) + movapd %xmm11, 6 * SIZE(AO) + movapd %xmm12, 8 * SIZE(AO) + movapd %xmm13, 10 * SIZE(AO) + movapd %xmm14, 12 * SIZE(AO) + movapd %xmm15, 14 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L11 + ALIGN_4 + +.L20: + testq $2, M + BRANCH + je .L30 + ALIGN_4 + +.L21: + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + 
movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L25 + ALIGN_4 + +.L22: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 16 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm2 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 6 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm2 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 16 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm3 + movddup 24 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movddup 17 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm1 + movddup 18 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm2 + movddup 19 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 10 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm3 + movddup 20 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movddup 21 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm1 + movddup 22 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm2 + movddup 23 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 12 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm3 + movddup 32 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 25 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm1 + movddup 26 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 27 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 14 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + movddup 28 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 29 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm1 + movddup 30 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 31 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 24 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + movddup 40 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L29 + ALIGN_4 + +.L26: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + 
movddup 4 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L26 + ALIGN_4 + +.L29: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd %xmm2, %xmm10 + unpcklpd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm10 + + movapd 0 * SIZE(BO), %xmm1 + movapd 2 * SIZE(BO), %xmm3 + movapd 4 * SIZE(BO), %xmm5 + movapd 6 * SIZE(BO), %xmm7 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm3 + subpd %xmm8, %xmm5 + subpd %xmm10, %xmm7 +#else + + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm10 + movapd 4 * SIZE(AO), %xmm12 + movapd 6 * SIZE(AO), %xmm14 + + subpd %xmm0, %xmm8 + subpd %xmm1, %xmm10 + subpd %xmm2, %xmm12 + subpd %xmm3, %xmm14 +#endif + +#ifdef LN + movddup 3 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + mulpd %xmm0, %xmm7 + + movddup 2 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + subpd %xmm2, %xmm1 + movddup 2 * SIZE(AO), %xmm2 + mulpd %xmm7, %xmm2 + subpd %xmm2, %xmm3 + + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm3 +#endif + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm3 + + movddup 1 * SIZE(AO), %xmm2 + mulpd %xmm1, %xmm2 + subpd %xmm2, %xmm5 + movddup 1 * SIZE(AO), %xmm2 + mulpd %xmm3, %xmm2 + subpd %xmm2, %xmm7 + + movddup 3 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm5 + mulpd %xmm0, %xmm7 +#endif + +#ifdef RN + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm8 + + movddup 1 * SIZE(BO), %xmm1 + mulpd %xmm8, %xmm1 + subpd %xmm1, %xmm10 + movddup 2 * SIZE(BO), %xmm2 + mulpd %xmm8, %xmm2 + subpd %xmm2, %xmm12 + movddup 3 * SIZE(BO), %xmm3 + mulpd %xmm8, %xmm3 + subpd %xmm3, %xmm14 + + movddup 5 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm10 + movddup 6 * SIZE(BO), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm12 + movddup 7 * SIZE(BO), %xmm2 + mulpd %xmm10, %xmm2 + subpd %xmm2, %xmm14 + + movddup 10 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm12 + + movddup 11 * SIZE(BO), %xmm1 + mulpd %xmm12, %xmm1 + subpd %xmm1, %xmm14 + + movddup 15 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm14 +#endif + +#ifdef RT + movddup 15 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm14 + + movddup 14 * SIZE(BO), %xmm1 + mulpd %xmm14, %xmm1 + subpd %xmm1, %xmm12 + movddup 13 * SIZE(BO), %xmm2 + mulpd %xmm14, %xmm2 + subpd %xmm2, %xmm10 + movddup 12 * SIZE(BO), %xmm3 + mulpd %xmm14, %xmm3 + subpd %xmm3, %xmm8 + + movddup 10 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm12 + movddup 9 * SIZE(BO), %xmm1 + mulpd %xmm12, %xmm1 + subpd %xmm1, %xmm10 + movddup 8 * SIZE(BO), %xmm2 + mulpd %xmm12, %xmm2 + subpd %xmm2, %xmm8 + + movddup 5 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm10 + movddup 4 * SIZE(BO), %xmm1 + mulpd %xmm10, %xmm1 + subpd %xmm1, %xmm8 + + movddup 0 * SIZE(BO), %xmm0 + mulpd %xmm0, %xmm8 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm1, 0 * SIZE(CO1) + movsd %xmm5, 1 * SIZE(CO1) + movhpd %xmm1, 0 * SIZE(CO2) + movhpd %xmm5, 1 * SIZE(CO2) + + movsd %xmm3, 0 * SIZE(CO1, LDC, 2) + movsd %xmm7, 1 * SIZE(CO1, LDC, 2) + movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm7, 1 * SIZE(CO2, LDC, 2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) + + movsd %xmm12, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm12, 1 * SIZE(CO1, LDC, 2) + movsd %xmm14, 0 * 
SIZE(CO2, LDC, 2) + movhpd %xmm14, 1 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(BO) + movapd %xmm3, 2 * SIZE(BO) + movapd %xmm5, 4 * SIZE(BO) + movapd %xmm7, 6 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm10, 2 * SIZE(AO) + movapd %xmm12, 4 * SIZE(AO) + movapd %xmm14, 6 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + testq $1, M + je .L39 + ALIGN_4 + +.L31: +#ifdef LN + movq K, %rax + salq $0 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + movddup 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movddup 4 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movapd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L35 + ALIGN_4 + +.L32: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm9, %xmm0 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 1 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movapd 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movapd 16 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movapd 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movddup 3 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movapd 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movapd 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movddup 8 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movapd 24 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movapd 18 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movddup 5 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movapd 20 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movapd 22 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movddup 6 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movapd 32 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movapd 26 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movddup 7 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movapd 28 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movapd 30 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movddup 12 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movapd 40 * SIZE(BO), %xmm11 + + addq $ 8 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 1 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movapd 4 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L36 + ALIGN_4 + +.L38: + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, 
%rax, 1), AO + leaq (B, %rax, 4), BO +#endif + + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BO), %xmm2 + movapd 2 * SIZE(BO), %xmm3 + + subpd %xmm0, %xmm2 + subpd %xmm1, %xmm3 +#else + movapd 0 * SIZE(AO), %xmm2 + movapd 2 * SIZE(AO), %xmm3 + + subpd %xmm0, %xmm2 + subpd %xmm1, %xmm3 +#endif + +#ifdef LN + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 +#endif + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 +#endif + +#ifdef RN + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movapd %xmm3, %xmm1 + unpckhpd %xmm1, %xmm1 + + movsd 0 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm2 + + movsd 1 * SIZE(BO), %xmm5 + mulsd %xmm2, %xmm5 + subsd %xmm5, %xmm0 + movsd 2 * SIZE(BO), %xmm6 + mulsd %xmm2, %xmm6 + subsd %xmm6, %xmm3 + movsd 3 * SIZE(BO), %xmm7 + mulsd %xmm2, %xmm7 + subsd %xmm7, %xmm1 + + movsd 5 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 6 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm3 + movsd 7 * SIZE(BO), %xmm6 + mulsd %xmm0, %xmm6 + subsd %xmm6, %xmm1 + + movsd 10 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm3 + + movsd 11 * SIZE(BO), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm1 + + movsd 15 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm1 + + unpcklpd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm3 +#endif + +#ifdef RT + movapd %xmm2, %xmm0 + unpckhpd %xmm0, %xmm0 + + movapd %xmm3, %xmm1 + unpckhpd %xmm1, %xmm1 + + movsd 15 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm1 + + movsd 14 * SIZE(BO), %xmm5 + mulsd %xmm1, %xmm5 + subsd %xmm5, %xmm3 + movsd 13 * SIZE(BO), %xmm6 + mulsd %xmm1, %xmm6 + subsd %xmm6, %xmm0 + movsd 12 * SIZE(BO), %xmm7 + mulsd %xmm1, %xmm7 + subsd %xmm7, %xmm2 + + movsd 10 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm3 + + movsd 9 * SIZE(BO), %xmm5 + mulsd %xmm3, %xmm5 + subsd %xmm5, %xmm0 + movsd 8 * SIZE(BO), %xmm6 + mulsd %xmm3, %xmm6 + subsd %xmm6, %xmm2 + + movsd 5 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm0 + + movsd 4 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + subsd %xmm5, %xmm2 + + movsd 0 * SIZE(BO), %xmm4 + mulsd %xmm4, %xmm2 + + unpcklpd %xmm0, %xmm2 + unpcklpd %xmm1, %xmm3 + +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO2) + movsd %xmm3, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) +#else + movsd %xmm2, 0 * SIZE(CO1) + movhpd %xmm2, 0 * SIZE(CO2) + movsd %xmm3, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm2, 0 * SIZE(BO) + movapd %xmm3, 2 * SIZE(BO) +#else + movapd %xmm2, 0 * SIZE(AO) + movapd %xmm3, 2 * SIZE(AO) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L39: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + + decq J # j -- + jg .L10 + ALIGN_4 + + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), 
%xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/trsm_kernel_RT_4x8_nehalem.S b/kernel/x86_64/trsm_kernel_RT_4x8_nehalem.S new file mode 100644 index 0000000000..ffac798e33 --- /dev/null +++ b/kernel/x86_64/trsm_kernel_RT_4x8_nehalem.S @@ -0,0 +1,4847 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define KK %rdx +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define J 56(%rsp) +#define KKK 64(%rsp) +#define AORIG 72(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define J 232(%rsp) +#define KKK 240(%rsp) +#define AORIG 248(%rsp) + +#endif + +#define PREFETCHSIZE (16 * 1 + 4) +#define PREFETCH prefetcht0 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C +#endif + + subq $-32 * SIZE, A + subq $-32 * SIZE, B + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + leaq (, LDC, SIZE), LDC + + movq KK, OFFSET + negq KK + +#ifdef LN + leaq (, M, SIZE), %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + leaq (, N, SIZE), %rax + imulq K, %rax + addq %rax, B + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, N + jle .L40 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + + movq C, CO1 +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I + NOBRANCH + jle .L110 + ALIGN_4 + +.L101: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + movsd -32 * SIZE(BO), %xmm3 + xorps %xmm8, %xmm8 + prefetcht2 4 * SIZE(CO1) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L105 + ALIGN_3 + +.L102: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + movss -31 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm1 + movaps -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + movss -30 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm1 + movaps -24 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + movss -29 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm1 + movaps 
-20 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + movss -28 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, AO + subq $ -4 * SIZE, BO + subq $1, %rax + BRANCH + jg .L102 + ALIGN_3 + +.L105: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L108 + ALIGN_3 + +.L106: + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + movss -31 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm1 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L106 + ALIGN_3 + +.L108: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#endif + + addps %xmm1, %xmm8 + +#if defined(LN) || defined(LT) + movsd -32 * SIZE(BO), %xmm0 + movhps -30 * SIZE(BO), %xmm0 + + subps %xmm8, %xmm0 + + pshufd $0xff, %xmm0, %xmm3 + pshufd $0xaa, %xmm0, %xmm2 + pshufd $0x55, %xmm0, %xmm1 +#else + movaps -32 * SIZE(AO), %xmm0 + + subps %xmm8, %xmm0 +#endif + +#ifdef LN + movaps -20 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm0 + + movaps -24 * SIZE(AO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm0 + + movaps -28 * SIZE(AO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm0 + + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm3 + + movaps -28 * SIZE(AO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm3 + + movaps -24 * SIZE(AO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm3 + + movaps -20 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm3 +#endif + +#if defined(RN) || defined(RT) + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm1, %xmm0 + unpcklps %xmm3, %xmm2 + + movlps %xmm0, -32 * SIZE(BO) + movlps %xmm2, -30 * SIZE(BO) + + movlps %xmm0, 0 * SIZE(CO1) + movlps %xmm2, 2 * SIZE(CO1) +#else + movaps %xmm0, -32 * SIZE(AO) + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + 
movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L101 + ALIGN_4 + +.L110: + testq $2, M + BRANCH + jle .L120 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L115 + ALIGN_3 + +.L112: + addps %xmm1, %xmm8 + movss -32 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movss -31 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movsd -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movss -30 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movsd -26 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movss -29 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movsd -24 * SIZE(AO), %xmm0 + + subq $-4 * SIZE, BO + subq $-8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L112 + ALIGN_3 + +.L115: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_3 + +.L116: + addps %xmm1, %xmm8 + movss -32 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L116 + ALIGN_3 + +.L118: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + + addps %xmm1, %xmm8 + +#if defined(LN) || defined(LT) + movsd -32 * SIZE(BO), %xmm0 + + subps %xmm8, %xmm0 + + pshufd $0x55, %xmm0, %xmm1 +#else + movsd -32 * SIZE(AO), %xmm0 + + subps %xmm8, %xmm0 +#endif + +#ifdef LN + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm0 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm1 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm1 +#endif + +#if defined(RN) || defined(RT) + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm1, %xmm0 + + movlps %xmm0, -32 * SIZE(BO) + + movlps %xmm0, 0 * SIZE(CO1) +#else + movlps %xmm0, -32 * SIZE(AO) + + movlps %xmm0, 0 * SIZE(CO1) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L120: + testq $1, M + BRANCH + jle .L129 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#else + movq B, BO 
+#endif + + xorps %xmm2, %xmm2 + movss -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L125 + ALIGN_3 + +.L122: + addss %xmm2, %xmm8 + movss -32 * SIZE(BO), %xmm2 + mulss %xmm0, %xmm2 + movss -31 * SIZE(AO), %xmm0 + + addss %xmm2, %xmm8 + movss -31 * SIZE(BO), %xmm2 + mulss %xmm0, %xmm2 + movss -30 * SIZE(AO), %xmm0 + + addss %xmm2, %xmm8 + movss -30 * SIZE(BO), %xmm2 + mulss %xmm0, %xmm2 + movss -29 * SIZE(AO), %xmm0 + + addss %xmm2, %xmm8 + movss -29 * SIZE(BO), %xmm2 + mulss %xmm0, %xmm2 + movss -28 * SIZE(AO), %xmm0 + + subq $-4 * SIZE, AO + subq $-4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L122 + ALIGN_3 + +.L125: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L128 + ALIGN_3 + +.L126: + addss %xmm2, %xmm8 + movss -32 * SIZE(BO), %xmm2 + mulss %xmm0, %xmm2 + movss -31 * SIZE(AO), %xmm0 + + addq $1 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L126 + ALIGN_3 + +.L128: +#if defined(LN) || defined(RT) + movq KK, %rax + subq $1, %rax + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + + addss %xmm2, %xmm8 + +#if defined(LN) || defined(LT) + movss -32 * SIZE(BO), %xmm0 + + subss %xmm8, %xmm0 +#else + movss -32 * SIZE(AO), %xmm0 + + subss %xmm8, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movss -32 * SIZE(AO), %xmm8 +#endif + +#if defined(RN) || defined(RT) + movaps -32 * SIZE(BO), %xmm8 +#endif + + mulss %xmm8, %xmm0 + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm0, -32 * SIZE(BO) +#else + movss %xmm0, -32 * SIZE(AO) +#endif + + movss %xmm0, (CO1) + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L129: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 1), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L40: + testq $2, N + jle .L70 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I + NOBRANCH + jle .L80 + ALIGN_4 + +.L71: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd -32 * SIZE(BO), %xmm3 + + xorps %xmm8, %xmm8 + prefetcht2 4 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht2 4 * SIZE(CO2) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L75 + ALIGN_3 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps 
%xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0x55, %xmm3, %xmm2 + movsd -30 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0x55, %xmm3, %xmm2 + movsd -28 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm2 + movaps -24 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0x55, %xmm3, %xmm2 + movsd -26 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0x55, %xmm3, %xmm2 + movsd -24 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm2 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, AO + subq $ -8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L72 + ALIGN_3 + +.L75: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_3 + +.L76: + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0x55, %xmm3, %xmm2 + movsd -30 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L76 + ALIGN_3 + +.L78: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + + addps %xmm1, %xmm8 + addps %xmm2, %xmm9 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm4 + unpcklps %xmm9, %xmm8 + unpckhps %xmm9, %xmm4 + + movaps -32 * SIZE(BO), %xmm0 + movaps -28 * SIZE(BO), %xmm2 + + subps %xmm8, %xmm0 + subps %xmm4, %xmm2 + + movhlps %xmm0, %xmm1 + movhlps %xmm2, %xmm3 +#else + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm2 + + subps %xmm8, %xmm0 + subps %xmm9, %xmm2 +#endif + + +#ifdef LN + movaps -20 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm0 + + movaps -24 * SIZE(AO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm0 + + movaps -28 * SIZE(AO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm3 + + movaps -28 * SIZE(AO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm3 + + movaps -24 * SIZE(AO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm3 + + movaps 
-20 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm2 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm2 +#endif + +#ifdef RT + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm0 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm0, -32 * SIZE(BO) + movlps %xmm1, -30 * SIZE(BO) + movlps %xmm2, -28 * SIZE(BO) + movlps %xmm3, -26 * SIZE(BO) + + unpcklps %xmm1, %xmm0 + unpcklps %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(CO1) + movlps %xmm2, 2 * SIZE(CO1) + movhps %xmm0, 0 * SIZE(CO2) + movhps %xmm2, 2 * SIZE(CO2) + +#else + movaps %xmm0, -32 * SIZE(AO) + movaps %xmm2, -28 * SIZE(AO) + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm2, 0 * SIZE(CO2) + movhps %xmm2, 2 * SIZE(CO2) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L71 + ALIGN_4 + +.L80: + testq $2, M + BRANCH + jle .L90 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd -32 * SIZE(BO), %xmm5 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L85 + ALIGN_3 + +.L82: + addps %xmm1, %xmm8 + movsd -32 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movddup -30 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movsd -30 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movddup -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movsd -28 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movddup -26 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movsd -26 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movddup -24 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, BO + subq $-8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L82 + ALIGN_3 + +.L85: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L88 + ALIGN_3 + +.L86: + addps %xmm1, %xmm8 + movsd -32 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movddup -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L86 + ALIGN_3 + +.L88: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + + addps %xmm1, %xmm8 + +#if defined(LN) || defined(LT) + pshufd $0xd8, %xmm8, %xmm8 + + movaps -32 * SIZE(BO), %xmm0 +#else + movaps -32 * SIZE(AO), %xmm0 +#endif + + subps 
%xmm8, %xmm0 + + movhlps %xmm0, %xmm1 + +#ifdef LN + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm1 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm1 +#endif + +#ifdef RT + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm0, -32 * SIZE(BO) + movlps %xmm1, -30 * SIZE(BO) + + unpcklps %xmm1, %xmm0 + + movlps %xmm0, (CO1) + movhps %xmm0, (CO2) +#else + movlps %xmm0, -32 * SIZE(AO) + movlps %xmm1, -30 * SIZE(AO) + + movsd %xmm0, (CO1) + movsd %xmm1, (CO2) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L90: + testq $1, M + BRANCH + jle .L99 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + xorps %xmm2, %xmm2 + movsd -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L95 + ALIGN_3 + +.L92: + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movsd -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x55, %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm2, %xmm9 + movsd -30 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movsd -28 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x55, %xmm0, %xmm1 + movsd -28 * SIZE(AO), %xmm0 + addps %xmm2, %xmm9 + movsd -26 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + subq $-4 * SIZE, AO + subq $-8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L92 + addps %xmm9, %xmm8 + ALIGN_3 + +.L95: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L98 + ALIGN_3 + +.L96: + pshufd $0x00, %xmm0, %xmm1 + movss -31 * SIZE(AO), %xmm0 + addps %xmm2, %xmm8 + movsd -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + addq $1 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L96 + ALIGN_3 + +.L98: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + + addps %xmm2, %xmm8 + +#if defined(LN) || defined(LT) + movsd -32 * SIZE(BO), %xmm0 + + subps %xmm8, %xmm0 
+#else + movsd -32 * SIZE(AO), %xmm0 + + subps %xmm8, %xmm0 +#endif + + pshufd $0x55, %xmm0, %xmm1 + pshufd $0x00, %xmm0, %xmm0 + +#if defined(LN) || defined(LT) + movss -32 * SIZE(AO), %xmm8 + + mulss %xmm8, %xmm0 + mulss %xmm8, %xmm1 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm1 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm1 +#endif + +#ifdef RT + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm0 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movss %xmm0, -32 * SIZE(BO) + movss %xmm1, -31 * SIZE(BO) +#else + movss %xmm0, -32 * SIZE(AO) + movss %xmm1, -31 * SIZE(AO) +#endif + + movss %xmm0, (CO1) + movss %xmm1, (CO2) + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L99: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L70: + testq $4, N + jle .L100 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 2), CO2 +#ifndef RT + leaq (C, LDC, 4), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $2, I + NOBRANCH + jle .L50 + ALIGN_4 + +.L41: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht2 4 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht2 4 * SIZE(CO1, LDC, 1) + xorps %xmm10, %xmm10 + prefetcht2 4 * SIZE(CO2) + xorps %xmm11, %xmm11 + prefetcht2 4 * SIZE(CO2, LDC, 1) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L45 + ALIGN_3 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm1, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm10 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm4, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm10 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm4, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movaps -24 * 
SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm10 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm4, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -20 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movaps -20 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm10 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm4, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + BRANCH + jg .L42 + ALIGN_3 + +.L45: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + addps %xmm1, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm10 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm4, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L46 + ALIGN_3 + +.L48: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#endif + + addps %xmm1, %xmm8 + addps %xmm2, %xmm9 + addps %xmm3, %xmm10 + addps %xmm4, %xmm11 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm4 + shufps $0x88, %xmm9, %xmm8 + movaps %xmm10, %xmm5 + shufps $0x88, %xmm11, %xmm10 + shufps $0xdd, %xmm11, %xmm4 + shufps $0xdd, %xmm9, %xmm5 + + movaps %xmm8, %xmm6 + shufps $0x88, %xmm10, %xmm8 + shufps $0xdd, %xmm6, %xmm10 + + movaps %xmm4, %xmm9 + movaps %xmm5, %xmm11 + shufps $0x22, %xmm5, %xmm9 + shufps $0x77, %xmm4, %xmm11 + + movaps -32 * SIZE(BO), %xmm0 + movaps -28 * SIZE(BO), %xmm1 + movaps -24 * SIZE(BO), %xmm2 + movaps -20 * SIZE(BO), %xmm3 +#else + movaps %xmm9, %xmm4 + shufps $0xd8, %xmm8, %xmm9 + shufps $0xd8, %xmm11, %xmm8 + shufps $0xd8, %xmm10, %xmm11 + shufps $0xd8, %xmm4, %xmm10 + + movaps %xmm8, %xmm4 + shufps $0xd8, %xmm10, %xmm8 + shufps $0xd8, %xmm4, %xmm10 + movaps %xmm9, %xmm5 + shufps $0xd8, %xmm11, %xmm9 + shufps $0xd8, %xmm5, %xmm11 + + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm1 + movaps -24 * SIZE(AO), %xmm2 + movaps -20 * SIZE(AO), %xmm3 +#endif + + subps %xmm8, %xmm0 + subps %xmm9, %xmm1 + subps %xmm10, %xmm2 + subps %xmm11, %xmm3 + +#ifdef LN + movaps -20 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm0 + + movaps -24 * SIZE(AO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm0 + + movaps -28 * SIZE(AO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + 
pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm3 + + movaps -28 * SIZE(AO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm3 + + movaps -24 * SIZE(AO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm3 + + movaps -20 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm3 + + movaps -28 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm3 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm3 + + movaps -20 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 +#endif + +#ifdef RT + movaps -20 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm0 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm0 + + movaps -28 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, -32 * SIZE(BO) + movaps %xmm1, -28 * SIZE(BO) + movaps %xmm2, -24 * SIZE(BO) + movaps %xmm3, -20 * SIZE(BO) + + movaps %xmm0, %xmm8 + shufps $0x88, %xmm1, %xmm0 + shufps $0xdd, %xmm8, %xmm1 + + movaps %xmm2, %xmm9 + shufps $0x88, %xmm3, %xmm2 + shufps $0xdd, %xmm9, %xmm3 + + movaps %xmm0, %xmm8 + shufps $0x88, %xmm2, %xmm0 + movaps %xmm1, %xmm9 + shufps $0x22, %xmm3, %xmm1 + shufps $0xdd, %xmm2, %xmm8 + movaps %xmm8, %xmm2 + shufps $0x77, %xmm3, %xmm9 + movaps %xmm9, %xmm3 +#else + movaps %xmm0, -32 * SIZE(AO) + movaps %xmm1, -28 * SIZE(AO) + movaps %xmm2, -24 * SIZE(AO) + movaps %xmm3, -20 * SIZE(AO) +#endif + + leaq (LDC, LDC, 2), %rax + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 2 * SIZE(CO1, LDC, 1) + + movsd %xmm2, 0 * SIZE(CO2) + movhps %xmm2, 2 * SIZE(CO2) + movsd %xmm3, 0 * SIZE(CO2, LDC, 1) + movhps %xmm3, 2 * SIZE(CO2, LDC, 1) + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + 
leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L41 + ALIGN_4 + +.L50: + testq $2, M + BRANCH + jle .L60 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movaps -32 * SIZE(BO), %xmm5 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L55 + ALIGN_3 + +.L52: + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -30 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + movaps -24 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + movaps -20 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -26 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + movaps -16 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -24 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, BO + subq $ -8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L52 + ALIGN_3 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_3 + +.L56: + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L56 + ALIGN_3 + +.L58: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + + addps %xmm1, %xmm8 + addps %xmm2, %xmm9 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm4 + shufps $0x88, %xmm9, %xmm8 + shufps $0xdd, %xmm9, %xmm4 + + movaps -32 * SIZE(BO), %xmm0 + movaps -28 * SIZE(BO), %xmm1 + + subps %xmm8, %xmm0 + subps %xmm4, %xmm1 +#else + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm2 + + subps %xmm8, %xmm0 + subps %xmm9, %xmm2 + + movhlps %xmm0, %xmm1 + movhlps %xmm2, %xmm3 +#endif + +#ifdef LN + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm1 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm0, 
%xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm3 + + movaps -28 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm3 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm3 + + movaps -20 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 +#endif + +#ifdef RT + movaps -20 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm0 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm0 + + movaps -28 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + + leaq (LDC, LDC, 2), %rax + +#if defined(LN) || defined(LT) + movaps %xmm0, -32 * SIZE(BO) + movaps %xmm1, -28 * SIZE(BO) + + movaps %xmm0, %xmm4 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm4 + + movsd %xmm0, (CO1) + movhps %xmm0, (CO1, LDC, 1) + movsd %xmm4, (CO2) + movhps %xmm4, (CO2, LDC, 1) +#else + movlhps %xmm1, %xmm0 + movlhps %xmm3, %xmm2 + + movaps %xmm0, -32 * SIZE(AO) + movaps %xmm2, -28 * SIZE(AO) + + movsd %xmm0, (CO1) + movsd %xmm1, (CO1, LDC, 1) + movsd %xmm2, (CO2) + movsd %xmm3, (CO2, LDC, 1) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L60: + testq $1, M + BRANCH + jle .L69 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + xorps %xmm2, %xmm2 + movsd -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L65 + ALIGN_3 + +.L62: + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movaps -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x55, %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm2, %xmm9 + movaps -28 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movaps -24 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x55, %xmm0, %xmm1 + movsd -28 * SIZE(AO), %xmm0 + addps %xmm2, %xmm9 + movaps -20 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + subq $-16 * SIZE, BO + subq $ -4 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L62 + addps %xmm9, %xmm8 + ALIGN_3 + +.L65: +#if defined(LT) || defined(RN) + 
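+	/* k-loop trip count: the offset bookkeeping selects KK iterations
+	   for the LT/RN cases and K - KK for the LN/RT cases; the same
+	   selection precedes every inner loop in this kernel. */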
movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_3 + +.L66: + pshufd $0x00, %xmm0, %xmm1 + movss -31 * SIZE(AO), %xmm0 + addps %xmm2, %xmm8 + movaps -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L66 + ALIGN_3 + +.L68: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#endif + + addps %xmm2, %xmm8 + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(BO), %xmm0 + + subps %xmm8, %xmm0 +#else + movsd -32 * SIZE(AO), %xmm0 + movhps -30 * SIZE(AO), %xmm0 + + subps %xmm8, %xmm0 + + pshufd $0xff, %xmm0, %xmm3 + pshufd $0xaa, %xmm0, %xmm2 + pshufd $0x55, %xmm0, %xmm1 + pshufd $0x00, %xmm0, %xmm0 +#endif + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm3 + + movaps -28 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm3 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm3 + + movaps -20 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm3 +#endif + +#ifdef RT + movaps -20 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm0 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm0 + + movaps -28 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm0 + + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, -32 * SIZE(BO) + + pshufd $0xff, %xmm0, %xmm3 + pshufd $0xaa, %xmm0, %xmm2 + pshufd $0x55, %xmm0, %xmm1 + pshufd $0x00, %xmm0, %xmm0 +#else + unpcklps %xmm1, %xmm0 + unpcklps %xmm3, %xmm2 + + movlps %xmm0, -32 * SIZE(AO) + movlps %xmm2, -30 * SIZE(AO) +#endif + + movss %xmm0, (CO1) + movss %xmm1, (CO1, LDC, 1) + movss %xmm2, (CO2) + movss %xmm3, (CO2, LDC, 1) + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L69: +#ifdef LN + leaq (, K, SIZE), %rax + leaq 
(B, %rax, 4), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + ALIGN_4 + +.L100: + movq N, J + sarq $3, J + NOBRANCH + jle .L999 + ALIGN_4 + +.L10: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $3 + BASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 8), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 4), CO2 +#ifndef RT + leaq (C, LDC, 8), C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + + movq K, %rax + salq $BASE_SHIFT + 3, %rax + leaq (B, %rax), BB + + movq M, I + sarq $2, I + NOBRANCH + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 8), BO +#else + movq B, BO +#endif + + prefetchnta -32 * SIZE(BB) + subq $-16 * SIZE, BB + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + leaq (LDC, LDC, 2), %rax + + xorps %xmm8, %xmm8 + prefetcht2 4 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht2 4 * SIZE(CO1, LDC, 1) + xorps %xmm10, %xmm10 + prefetcht2 4 * SIZE(CO1, LDC, 2) + xorps %xmm11, %xmm11 + prefetcht2 4 * SIZE(CO1, %rax, 1) + + xorps %xmm12, %xmm12 + prefetcht2 4 * SIZE(CO2) + xorps %xmm13, %xmm13 + prefetcht2 4 * SIZE(CO2, LDC, 1) + xorps %xmm14, %xmm14 + prefetcht2 4 * SIZE(CO2, LDC, 2) + xorps %xmm15, %xmm15 + prefetcht2 4 * SIZE(CO2, %rax, 1) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm1, %xmm12 + movaps -32 * SIZE(BO), %xmm1 + addps %xmm2, %xmm13 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + pshufd $0x39, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm12 + movaps -24 * SIZE(BO), %xmm1 + addps %xmm2, %xmm13 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + pshufd $0x39, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + addps %xmm1, %xmm8 + movaps -20 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -24 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm12 + movaps -16 * SIZE(BO), %xmm1 + addps %xmm2, %xmm13 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + pshufd $0x39, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + addps %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, 
%xmm3 + mulps %xmm0, %xmm2 + + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -20 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm12 + movaps -8 * SIZE(BO), %xmm1 + addps %xmm2, %xmm13 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + pshufd $0x39, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + addps %xmm1, %xmm8 + movaps -4 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + subq $-32 * SIZE, BO + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, AO + subq $1, %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + addps %xmm1, %xmm12 + movaps -32 * SIZE(BO), %xmm1 + addps %xmm2, %xmm13 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + pshufd $0x39, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_3 + +.L18: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $8, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 8), BO +#endif + + addps %xmm1, %xmm12 + addps %xmm2, %xmm13 + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm4 + shufps $0x88, %xmm9, %xmm8 + movaps %xmm10, %xmm5 + shufps $0x88, %xmm11, %xmm10 + shufps $0xdd, %xmm11, %xmm4 + shufps $0xdd, %xmm9, %xmm5 + + movaps %xmm8, %xmm6 + shufps $0x88, %xmm10, %xmm8 + shufps $0xdd, %xmm6, %xmm10 + + movaps %xmm4, %xmm9 + movaps %xmm5, %xmm11 + shufps $0x22, %xmm5, %xmm9 + shufps $0x77, %xmm4, %xmm11 + + movaps %xmm12, %xmm4 + shufps $0x88, %xmm13, %xmm12 + movaps %xmm14, %xmm5 + shufps $0x88, %xmm15, %xmm14 + shufps $0xdd, %xmm15, %xmm4 + shufps $0xdd, %xmm13, %xmm5 + + movaps %xmm12, %xmm6 + shufps $0x88, %xmm14, %xmm12 + shufps $0xdd, %xmm6, %xmm14 + + movaps %xmm4, %xmm13 + movaps %xmm5, %xmm15 + shufps $0x22, %xmm5, %xmm13 + shufps $0x77, %xmm4, %xmm15 + + movaps -32 * SIZE(BO), %xmm0 + movaps -28 * SIZE(BO), %xmm4 + movaps -24 * SIZE(BO), %xmm1 + movaps -20 * SIZE(BO), %xmm5 + movaps -16 * SIZE(BO), %xmm2 + movaps -12 * SIZE(BO), %xmm6 + movaps -8 * SIZE(BO), %xmm3 + movaps -4 * SIZE(BO), %xmm7 + +#else + movaps %xmm9, %xmm4 + shufps $0xd8, %xmm8, %xmm9 + shufps $0xd8, %xmm11, %xmm8 + shufps $0xd8, %xmm10, %xmm11 + shufps $0xd8, %xmm4, %xmm10 + + movaps %xmm8, %xmm4 + shufps $0xd8, %xmm10, %xmm8 + shufps $0xd8, %xmm4, %xmm10 + movaps %xmm9, %xmm5 + shufps $0xd8, %xmm11, %xmm9 + shufps $0xd8, %xmm5, %xmm11 + + movaps %xmm13, %xmm4 + shufps $0xd8, %xmm12, %xmm13 + shufps $0xd8, %xmm15, %xmm12 + shufps $0xd8, %xmm14, %xmm15 + shufps $0xd8, %xmm4, %xmm14 + + movaps 
%xmm12, %xmm4 + shufps $0xd8, %xmm14, %xmm12 + shufps $0xd8, %xmm4, %xmm14 + movaps %xmm13, %xmm5 + shufps $0xd8, %xmm15, %xmm13 + shufps $0xd8, %xmm5, %xmm15 + + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm1 + movaps -24 * SIZE(AO), %xmm2 + movaps -20 * SIZE(AO), %xmm3 + movaps -16 * SIZE(AO), %xmm4 + movaps -12 * SIZE(AO), %xmm5 + movaps -8 * SIZE(AO), %xmm6 + movaps -4 * SIZE(AO), %xmm7 +#endif + + subps %xmm8, %xmm0 + subps %xmm9, %xmm1 + subps %xmm10, %xmm2 + subps %xmm11, %xmm3 + subps %xmm12, %xmm4 + subps %xmm13, %xmm5 + subps %xmm14, %xmm6 + subps %xmm15, %xmm7 + +#ifdef LN + movaps -20 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + mulps %xmm15, %xmm7 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm6 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm0 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm4 + + movaps -24 * SIZE(AO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + mulps %xmm15, %xmm6 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm0 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm4 + + movaps -28 * SIZE(AO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + mulps %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm4 + + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm4 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm7 + + movaps -28 * SIZE(AO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + mulps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm7 + + movaps -24 * SIZE(AO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + mulps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm7 + + movaps -20 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + mulps %xmm15, %xmm7 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + 
mulps %xmm0, %xmm15 + subps %xmm15, %xmm3 + + movaps -28 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm7 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm3 + + movaps -20 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm7 + + movaps -16 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm3 + + movaps -12 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm7 + + movaps -8 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + + movaps -4 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm7 + + movaps 4 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm7 + + movaps 12 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm7 + + movaps 20 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm7 + + movaps 28 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm7 +#endif + +#ifdef RT + movaps 28 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm7 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm6 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm4 + + movaps 24 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm0 + + movaps 20 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm6 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm4 + + movaps 16 * SIZE(BO), 
%xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm0 + + movaps 12 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm4 + + movaps 8 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm0 + + movaps 4 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm4 + + movaps 0 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm0 + + movaps -8 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm0 + + movaps -16 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm0 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, -32 * SIZE(BO) + movaps %xmm4, -28 * SIZE(BO) + movaps %xmm1, -24 * SIZE(BO) + movaps %xmm5, -20 * SIZE(BO) + movaps %xmm2, -16 * SIZE(BO) + movaps %xmm6, -12 * SIZE(BO) + movaps %xmm3, -8 * SIZE(BO) + movaps %xmm7, -4 * SIZE(BO) + + movaps %xmm0, %xmm8 + shufps $0x88, %xmm1, %xmm0 + shufps $0xdd, %xmm8, %xmm1 + + movaps %xmm2, %xmm9 + shufps $0x88, %xmm3, %xmm2 + shufps $0xdd, %xmm9, %xmm3 + + movaps %xmm0, %xmm8 + shufps $0x88, %xmm2, %xmm0 + movaps %xmm1, %xmm9 + shufps $0x22, %xmm3, %xmm1 + shufps $0xdd, %xmm2, %xmm8 + movaps %xmm8, %xmm2 + shufps $0x77, %xmm3, %xmm9 + movaps %xmm9, %xmm3 + + movaps %xmm4, %xmm8 + shufps $0x88, %xmm5, %xmm4 + shufps $0xdd, %xmm8, %xmm5 + + movaps %xmm6, %xmm9 + shufps $0x88, %xmm7, %xmm6 + shufps $0xdd, %xmm9, %xmm7 + + movaps %xmm4, %xmm8 + shufps $0x88, %xmm6, %xmm4 + movaps %xmm5, %xmm9 + shufps $0x22, %xmm7, %xmm5 + shufps $0xdd, %xmm6, %xmm8 + movaps %xmm8, %xmm6 + shufps $0x77, %xmm7, %xmm9 + movaps %xmm9, %xmm7 + +#else + movaps %xmm0, -32 * SIZE(AO) + movaps %xmm1, -28 * SIZE(AO) + movaps %xmm2, -24 * SIZE(AO) + movaps %xmm3, -20 * SIZE(AO) + movaps %xmm4, -16 * SIZE(AO) + movaps %xmm5, -12 * SIZE(AO) + movaps %xmm6, -8 * SIZE(AO) + movaps %xmm7, -4 * SIZE(AO) +#endif + + leaq (LDC, LDC, 2), %rax + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movsd %xmm1, 0 * SIZE(CO1, LDC, 1) + movhps %xmm1, 2 * SIZE(CO1, LDC, 1) + + movsd %xmm2, 0 * SIZE(CO1, LDC, 
2) + movhps %xmm2, 2 * SIZE(CO1, LDC, 2) + movsd %xmm3, 0 * SIZE(CO1, %rax, 1) + movhps %xmm3, 2 * SIZE(CO1, %rax, 1) + + movsd %xmm4, 0 * SIZE(CO2) + movhps %xmm4, 2 * SIZE(CO2) + movsd %xmm5, 0 * SIZE(CO2, LDC, 1) + movhps %xmm5, 2 * SIZE(CO2, LDC, 1) + + movsd %xmm6, 0 * SIZE(CO2, LDC, 2) + movhps %xmm6, 2 * SIZE(CO2, LDC, 2) + movsd %xmm7, 0 * SIZE(CO2, %rax, 1) + movhps %xmm7, 2 * SIZE(CO2, %rax, 1) + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + +#ifdef LN + subq $4, KK +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L11 + ALIGN_4 + +.L20: + testq $2, M + BRANCH + jle .L30 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 8), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movaps -32 * SIZE(BO), %xmm5 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_3 + +.L22: + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + + addps %xmm3, %xmm10 + pshufd $0x50, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm5, %xmm4 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(BO), %xmm5 + + movddup -30 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(BO), %xmm5 + + addps %xmm3, %xmm10 + pshufd $0x50, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm5, %xmm4 + mulps %xmm0, %xmm4 + movaps -16 * SIZE(BO), %xmm5 + + movddup -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + mulps %xmm0, %xmm2 + movaps -12 * SIZE(BO), %xmm5 + + addps %xmm3, %xmm10 + pshufd $0x50, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm5, %xmm4 + mulps %xmm0, %xmm4 + movaps -8 * SIZE(BO), %xmm5 + + movddup -26 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + mulps %xmm0, %xmm2 + movaps -4 * SIZE(BO), %xmm5 + + addps %xmm3, %xmm10 + pshufd $0x50, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm5, %xmm4 + mulps %xmm0, %xmm4 + movaps 0 * SIZE(BO), %xmm5 + + movddup -24 * SIZE(AO), %xmm0 + + subq $-32 * SIZE, BO + subq $ -8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L22 + ALIGN_3 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_3 + +.L26: + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + + addps %xmm3, %xmm10 + pshufd $0x50, %xmm5, %xmm3 + mulps %xmm0, 
%xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm5, %xmm4 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(BO), %xmm5 + + movddup -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_3 + +.L28: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $8, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 8), BO +#endif + + addps %xmm1, %xmm8 + addps %xmm2, %xmm9 + addps %xmm3, %xmm10 + addps %xmm4, %xmm11 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm4 + shufps $0x88, %xmm9, %xmm8 + shufps $0xdd, %xmm9, %xmm4 + + movaps %xmm10, %xmm5 + shufps $0x88, %xmm11, %xmm10 + shufps $0xdd, %xmm11, %xmm5 + + movaps -32 * SIZE(BO), %xmm0 + movaps -28 * SIZE(BO), %xmm2 + movaps -24 * SIZE(BO), %xmm1 + movaps -20 * SIZE(BO), %xmm3 + + subps %xmm8, %xmm0 + subps %xmm4, %xmm1 + subps %xmm10, %xmm2 + subps %xmm5, %xmm3 +#else + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm2 + movaps -24 * SIZE(AO), %xmm4 + movaps -20 * SIZE(AO), %xmm6 + + subps %xmm8, %xmm0 + subps %xmm9, %xmm2 + subps %xmm10, %xmm4 + subps %xmm11, %xmm6 + + movhlps %xmm0, %xmm1 + movhlps %xmm2, %xmm3 + movhlps %xmm4, %xmm5 + movhlps %xmm6, %xmm7 +#endif + +#ifdef LN + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + mulps %xmm15, %xmm3 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm2 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm2 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm2 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm3 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + mulps %xmm15, %xmm3 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm3 + + movaps -28 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm0, %xmm15 + subps %xmm15, %xmm7 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm3 + + movaps -20 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm7 + + movaps -16 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm3 + + movaps -12 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm5 + pshufd 
$0xaa, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm7 + + movaps -8 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + + movaps -4 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm7 + + movaps 4 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm7 + + movaps 12 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm7 + + movaps 20 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm7 + + movaps 28 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm7 +#endif + +#ifdef RT + movaps 28 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm7 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm6 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm4 + + movaps 24 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm7, %xmm15 + subps %xmm15, %xmm0 + + movaps 20 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm6 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm4 + + movaps 16 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm6, %xmm15 + subps %xmm15, %xmm0 + + movaps 12 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm4 + + movaps 8 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm5, %xmm15 + subps %xmm15, %xmm0 + + movaps 4 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm4 + + movaps 0 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm4, %xmm15 + subps %xmm15, %xmm0 + + movaps -8 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulps %xmm15, %xmm3 + 
pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm3, %xmm15 + subps %xmm15, %xmm0 + + movaps -16 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulps %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm2, %xmm15 + subps %xmm15, %xmm0 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulps %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm1, %xmm15 + subps %xmm15, %xmm0 + + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + + leaq (LDC, LDC, 2), %rax + +#if defined(LN) || defined(LT) + movaps %xmm0, -32 * SIZE(BO) + movaps %xmm2, -28 * SIZE(BO) + movaps %xmm1, -24 * SIZE(BO) + movaps %xmm3, -20 * SIZE(BO) + + movaps %xmm0, %xmm4 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm4 + + movaps %xmm2, %xmm5 + unpcklps %xmm3, %xmm2 + unpckhps %xmm3, %xmm5 + + movsd %xmm0, (CO1) + movhps %xmm0, (CO1, LDC, 1) + movsd %xmm4, (CO1, LDC, 2) + movhps %xmm4, (CO1, %rax, 1) + + movsd %xmm2, (CO2) + movhps %xmm2, (CO2, LDC, 1) + movsd %xmm5, (CO2, LDC, 2) + movhps %xmm5, (CO2, %rax, 1) +#else + movlhps %xmm1, %xmm0 + movlhps %xmm3, %xmm2 + movlhps %xmm5, %xmm4 + movlhps %xmm7, %xmm6 + + movaps %xmm0, -32 * SIZE(AO) + movaps %xmm2, -28 * SIZE(AO) + movaps %xmm4, -24 * SIZE(AO) + movaps %xmm6, -20 * SIZE(AO) + + movsd %xmm0, (CO1) + movsd %xmm1, (CO1, LDC, 1) + movsd %xmm2, (CO1, LDC, 2) + movsd %xmm3, (CO1, %rax, 1) + + movsd %xmm4, (CO2) + movsd %xmm5, (CO2, LDC, 1) + movsd %xmm6, (CO2, LDC, 2) + movsd %xmm7, (CO2, %rax, 1) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + testq $1, M + BRANCH + jle .L39 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 8), BO +#else + movq B, BO +#endif + + xorps %xmm2, %xmm2 + movsd -32 * SIZE(AO), %xmm0 + xorps %xmm3, %xmm3 + xorps %xmm8, %xmm8 + xorps %xmm12, %xmm12 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L35 + ALIGN_3 + +.L32: + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movaps -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm3, %xmm12 + movaps -28 * SIZE(BO), %xmm3 + mulps %xmm1, %xmm3 + + pshufd $0x55, %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm2, %xmm8 + movaps -24 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm3, %xmm12 + movaps -20 * SIZE(BO), %xmm3 + mulps %xmm1, %xmm3 + + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movaps -16 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm3, %xmm12 + movaps -12 * SIZE(BO), %xmm3 + mulps %xmm1, %xmm3 + + pshufd $0x55, %xmm0, %xmm1 + movsd -28 * SIZE(AO), %xmm0 + addps %xmm2, %xmm8 + movaps -8 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm3, %xmm12 + movaps -4 * SIZE(BO), %xmm3 + mulps %xmm1, %xmm3 + + subq $-32 * SIZE, BO + subq $ -4 * SIZE, AO + + 
subq $1, %rax + BRANCH + jg .L32 + ALIGN_3 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_3 + +.L36: + pshufd $0x00, %xmm0, %xmm1 + movss -31 * SIZE(AO), %xmm0 + addps %xmm2, %xmm8 + movaps -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm3, %xmm12 + movaps -28 * SIZE(BO), %xmm3 + mulps %xmm1, %xmm3 + + addq $1 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L36 + ALIGN_3 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $8, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 8), BO +#endif + + addps %xmm2, %xmm8 + addps %xmm3, %xmm12 + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(BO), %xmm0 + movaps -28 * SIZE(BO), %xmm4 + + subps %xmm8, %xmm0 + subps %xmm12, %xmm4 +#else + movsd -32 * SIZE(AO), %xmm0 + movhps -30 * SIZE(AO), %xmm0 + movsd -28 * SIZE(AO), %xmm4 + movhps -26 * SIZE(AO), %xmm4 + + subps %xmm8, %xmm0 + subps %xmm12, %xmm4 + + pshufd $0xff, %xmm0, %xmm3 + pshufd $0xaa, %xmm0, %xmm2 + pshufd $0x55, %xmm0, %xmm1 + pshufd $0x00, %xmm0, %xmm0 + + pshufd $0xff, %xmm4, %xmm7 + pshufd $0xaa, %xmm4, %xmm6 + pshufd $0x55, %xmm4, %xmm5 + pshufd $0x00, %xmm4, %xmm4 +#endif + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(AO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm4 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm3 + + movaps -28 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm0, %xmm15 + subss %xmm15, %xmm7 + + movaps -24 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm15, %xmm1 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm3 + + movaps -20 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm7 + + movaps -16 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm15, %xmm2 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm3 + + movaps -12 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm7 + + movaps -8 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm3 + + movaps -4 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm6 + pshufd $0xff, 
%xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm7 + + movaps 4 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm4 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm4, %xmm15 + subss %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm4, %xmm15 + subss %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm4, %xmm15 + subss %xmm15, %xmm7 + + movaps 12 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm15, %xmm5 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm5, %xmm15 + subss %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm5, %xmm15 + subss %xmm15, %xmm7 + + movaps 20 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm15, %xmm6 + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm6, %xmm15 + subss %xmm15, %xmm7 + + movaps 28 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm7 +#endif + +#ifdef RT + movaps 28 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm7 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm7, %xmm15 + subss %xmm15, %xmm6 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm7, %xmm15 + subss %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm7, %xmm15 + subss %xmm15, %xmm4 + + movaps 24 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm7, %xmm15 + subss %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm7, %xmm15 + subss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm7, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm7, %xmm15 + subss %xmm15, %xmm0 + + movaps 20 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm15, %xmm6 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm6, %xmm15 + subss %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm6, %xmm15 + subss %xmm15, %xmm4 + + movaps 16 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm6, %xmm15 + subss %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm6, %xmm15 + subss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm6, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm6, %xmm15 + subss %xmm15, %xmm0 + + movaps 12 * SIZE(BO), %xmm8 + + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm15, %xmm5 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm5, %xmm15 + subss %xmm15, %xmm4 + + movaps 8 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm5, %xmm15 + subss %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm5, %xmm15 + subss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm5, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm5, %xmm15 + subss %xmm15, %xmm0 + + movaps 4 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm4 + + movaps 0 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm4, %xmm15 + subss %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm4, %xmm15 + subss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm4, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm4, %xmm15 + subss %xmm15, %xmm0 + + movaps -8 * SIZE(BO), %xmm8 + + pshufd $0xff, %xmm8, %xmm15 + mulss %xmm15, %xmm3 + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm3, %xmm15 + subss %xmm15, %xmm0 + + movaps -16 * SIZE(BO), %xmm8 + + pshufd $0xaa, %xmm8, %xmm15 + mulss %xmm15, %xmm2 + pshufd $0x55, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm2, %xmm15 + subss %xmm15, %xmm0 + + movaps -24 * SIZE(BO), %xmm8 + + 
pshufd $0x55, %xmm8, %xmm15 + mulss %xmm15, %xmm1 + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm1, %xmm15 + subss %xmm15, %xmm0 + + movaps -32 * SIZE(BO), %xmm8 + + pshufd $0x00, %xmm8, %xmm15 + mulss %xmm15, %xmm0 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + + leaq (LDC, LDC, 2), %rax + +#if defined(LN) || defined(LT) + movaps %xmm0, -32 * SIZE(BO) + movaps %xmm4, -28 * SIZE(BO) + + pshufd $0xff, %xmm0, %xmm3 + pshufd $0xaa, %xmm0, %xmm2 + pshufd $0x55, %xmm0, %xmm1 + pshufd $0x00, %xmm0, %xmm0 + + pshufd $0xff, %xmm4, %xmm7 + pshufd $0xaa, %xmm4, %xmm6 + pshufd $0x55, %xmm4, %xmm5 + pshufd $0x00, %xmm4, %xmm4 +#else + unpcklps %xmm1, %xmm0 + unpcklps %xmm3, %xmm2 + unpcklps %xmm5, %xmm4 + unpcklps %xmm7, %xmm6 + + movlps %xmm0, -32 * SIZE(AO) + movlps %xmm2, -30 * SIZE(AO) + movlps %xmm4, -28 * SIZE(AO) + movlps %xmm6, -26 * SIZE(AO) +#endif + + movss %xmm0, (CO1) + movss %xmm1, (CO1, LDC, 1) + movss %xmm2, (CO1, LDC, 2) + movss %xmm3, (CO1, %rax, 1) + + movss %xmm4, (CO2) + movss %xmm5, (CO2, LDC, 1) + movss %xmm6, (CO2, LDC, 2) + movss %xmm7, (CO2, %rax, 1) + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L39: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 8), B +#endif +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $8, KK +#endif + +#ifdef RT + subq $8, KK +#endif + + subq $1, J + BRANCH + jg .L10 + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/trsm_kernel_RT_8x4_sse.S b/kernel/x86_64/trsm_kernel_RT_8x4_sse.S new file mode 100644 index 0000000000..e96496fd69 --- /dev/null +++ b/kernel/x86_64/trsm_kernel_RT_8x4_sse.S @@ -0,0 +1,5975 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %rdi +#define N %rsi +#define K %rdx +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define J %r12 +#define AO %r13 +#define BO %r14 +#define CO1 %r15 +#define CO2 %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#endif + +#define ALPHA 0(%rsp) +#define OFFSET 16(%rsp) +#define KK 24(%rsp) +#define KKK 32(%rsp) +#define AORIG 40(%rsp) +#define BORIG 48(%rsp) +#define BUFFER 128(%rsp) + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#if defined(OPTERON) || defined(BARCELONA) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define movsd movlps +#endif + +#ifdef GENERIC +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#ifndef PREFETCH +#define PREFETCH prefetcht0 +#endif + +#ifndef PREFETCHW +#define PREFETCHW prefetcht0 +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, M + movq ARG2, N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + movsd OLD_OFFSET, %xmm4 + + movaps %xmm3, %xmm0 + +#else + movq OLD_LDC, LDC + movsd OLD_OFFSET, %xmm4 + +#endif + + movq %rsp, %rbx # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movsd %xmm4, OFFSET + movsd %xmm4, KK + + leaq (, LDC, SIZE), LDC + +#ifdef LN + leaq (, M, SIZE), %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + leaq (, N, SIZE), %rax + imulq K, %rax + addq %rax, B + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, N + je .L50 + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq 
$BASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + salq $BASE_SHIFT, %rax + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + jle .L103 + ALIGN_4 + +.L102: + movsd 0 * SIZE(B), %xmm3 + movhps 2 * SIZE(B), %xmm3 + movsd 4 * SIZE(B), %xmm7 + movhps 6 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO + + decq %rax + jne .L102 + ALIGN_4 + +.L103: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax + BRANCH + jle .L110 + ALIGN_4 + +.L104: + movss 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + + movaps %xmm0, 0 * SIZE(BO) + + addq $ 1 * SIZE, B + addq $ 4 * SIZE, BO + decq %rax + jne .L104 + ALIGN_4 + +.L110: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + subq LDC, C +#endif + + movq C, CO1 # coffset1 = c +#ifndef RT + addq LDC, C +#endif + + movq M, I + sarq $3, I # i = (m >> 3) + jle .L120 + ALIGN_4 + +.L111: +#ifdef LN + movq K, %rax + salq $3 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 8), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + movaps 32 * SIZE(AO), %xmm12 + movaps 48 * SIZE(AO), %xmm14 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + + PREFETCHW 4 * SIZE(CO1) + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L115 + ALIGN_4 + +.L112: + mulps %xmm9, %xmm8 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + + mulps 4 * SIZE(AO), %xmm9 + addps %xmm8, %xmm0 + movaps 8 * SIZE(AO), %xmm8 + addps %xmm9, %xmm4 + movaps 4 * SIZE(BO), %xmm9 + + mulps %xmm9, %xmm8 + mulps 12 * SIZE(AO), %xmm9 + addps %xmm8, %xmm0 + movaps 64 * SIZE(AO), %xmm8 + addps %xmm9, %xmm4 + movaps 8 * SIZE(BO), %xmm9 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm9, %xmm10 + mulps 20 * SIZE(AO), %xmm9 + addps %xmm10, %xmm0 + movaps 24 * SIZE(AO), %xmm10 + addps %xmm9, %xmm4 + movaps 12 * SIZE(BO), %xmm9 + + mulps %xmm9, %xmm10 + mulps 28 * SIZE(AO), %xmm9 + addps %xmm10, %xmm0 + movaps 80 * SIZE(AO), %xmm10 + addps %xmm9, %xmm4 + movaps 32 * SIZE(BO), %xmm9 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) +#endif + mulps %xmm11, %xmm12 + mulps 36 * SIZE(AO), %xmm11 + addps %xmm12, %xmm0 + movaps 40 * SIZE(AO), %xmm12 + addps %xmm11, %xmm4 + movaps 20 * SIZE(BO), %xmm11 
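+	/* 8x1 inner loop, unrolled over k: xmm9 and xmm11 hold b values that
+	   were broadcast element-wise into the BUFFER panel above, while
+	   xmm8/xmm10/xmm12/xmm14 hold 4-float slices of the packed A panel;
+	   partial products accumulate into xmm0 and xmm4, the running sums
+	   for the eight rows of this block. */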
+ + mulps %xmm11, %xmm12 + mulps 44 * SIZE(AO), %xmm11 + addps %xmm12, %xmm0 + movaps 96 * SIZE(AO), %xmm12 + addps %xmm11, %xmm4 + movaps 24 * SIZE(BO), %xmm11 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) +#endif + mulps %xmm11, %xmm14 + mulps 52 * SIZE(AO), %xmm11 + addps %xmm14, %xmm0 + movaps 56 * SIZE(AO), %xmm14 + addps %xmm11, %xmm4 + movaps 28 * SIZE(BO), %xmm11 + + mulps %xmm11, %xmm14 + mulps 60 * SIZE(AO), %xmm11 + addps %xmm14, %xmm0 + movaps 112 * SIZE(AO), %xmm14 + addps %xmm11, %xmm4 + movaps 48 * SIZE(BO), %xmm11 + + addq $64 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L112 + ALIGN_4 + +.L115: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulps %xmm9, %xmm8 + mulps 4 * SIZE(AO), %xmm9 + addps %xmm8, %xmm0 + movaps 8 * SIZE(AO), %xmm8 + addps %xmm9, %xmm4 + movaps 4 * SIZE(BO), %xmm9 + + addq $8 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L116 + ALIGN_4 + +.L118: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $8, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 8), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm8 + unpcklps %xmm2, %xmm0 + unpckhps %xmm2, %xmm8 + + movaps %xmm1, %xmm14 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm14 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movaps %xmm8, %xmm3 + unpcklps %xmm14, %xmm8 + unpckhps %xmm14, %xmm3 + + movaps %xmm4, %xmm9 + unpcklps %xmm6, %xmm4 + unpckhps %xmm6, %xmm9 + + movaps %xmm5, %xmm14 + unpcklps %xmm7, %xmm5 + unpckhps %xmm7, %xmm14 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps %xmm9, %xmm7 + unpcklps %xmm14, %xmm9 + unpckhps %xmm14, %xmm7 + + movss 0 * SIZE(B), %xmm1 + movss 1 * SIZE(B), %xmm5 + movss 2 * SIZE(B), %xmm10 + movss 3 * SIZE(B), %xmm11 + movss 4 * SIZE(B), %xmm12 + movss 5 * SIZE(B), %xmm13 + movss 6 * SIZE(B), %xmm14 + movss 7 * SIZE(B), %xmm15 + + subss %xmm0, %xmm1 + subss %xmm2, %xmm5 + subss %xmm8, %xmm10 + subss %xmm3, %xmm11 + subss %xmm4, %xmm12 + subss %xmm6, %xmm13 + subss %xmm9, %xmm14 + subss %xmm7, %xmm15 +#else + movaps 0 * SIZE(AO), %xmm8 + movaps 4 * SIZE(AO), %xmm9 + + subps %xmm0, %xmm8 + subps %xmm4, %xmm9 +#endif + +#ifdef LN + movaps 60 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm8, %xmm15 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm15, %xmm8 + subss %xmm8, %xmm14 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm15, %xmm8 + subss %xmm8, %xmm13 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm15, %xmm8 + subss %xmm8, %xmm12 + + movaps 56 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm15, %xmm8 + subss %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm15, %xmm8 + subss %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm15, %xmm8 + subss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm15, %xmm8 + subss %xmm8, %xmm1 + + movaps 52 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm8, %xmm14 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm14, %xmm8 + subss %xmm8, %xmm13 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm14, %xmm8 + subss %xmm8, %xmm12 + + movaps 48 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm14, %xmm8 + subss %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm14, %xmm8 + subss %xmm8, %xmm10 + pshufd 
$0x55, %xmm6, %xmm8 + mulss %xmm14, %xmm8 + subss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm14, %xmm8 + subss %xmm8, %xmm1 + + movaps 44 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm8, %xmm13 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm13, %xmm8 + subss %xmm8, %xmm12 + + movaps 40 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm13, %xmm8 + subss %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm13, %xmm8 + subss %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm13, %xmm8 + subss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm13, %xmm8 + subss %xmm8, %xmm1 + + movaps 36 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm8, %xmm12 + + movaps 32 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm12, %xmm8 + subss %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm12, %xmm8 + subss %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm12, %xmm8 + subss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm12, %xmm8 + subss %xmm8, %xmm1 + + movaps 24 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm1 + + movaps 16 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm1 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm1 + + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm8, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm6 + + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm8, %xmm1 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm11 + + movaps 4 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm15 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm11 + + movaps 12 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm15 + + movaps 16 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm11 + + movaps 20 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm15 + + movaps 24 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm8, %xmm11 + + movaps 28 * SIZE(AO), %xmm7 
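+
+/* Sketch (hypothetical C, invented names) of the element-wise substitution
+   carried out by the surrounding pshufd/mulss/subss chains.  It assumes the
+   diagonal of the packed 8x8 triangular block is stored pre-inverted by the
+   trsm copy routines, so the kernel multiplies instead of dividing:
+
+       for (i = 0; i < 8; i++) {           // LT branch: forward substitution
+           x[i] *= inv_diag[i];
+           for (j = i + 1; j < 8; j++)
+               x[j] -= t[i][j] * x[i];     // t = packed triangular block
+       }
+
+   The LN branch runs the same recurrence from i = 7 down to 0, reading the
+   packed block from the highest offsets first.
+*/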
+ pshufd $0x00, %xmm7, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm15 + + movaps 36 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm8, %xmm12 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm12, %xmm8 + subss %xmm8, %xmm13 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm12, %xmm8 + subss %xmm8, %xmm14 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm12, %xmm8 + subss %xmm8, %xmm15 + + movaps 44 * SIZE(AO), %xmm7 + pshufd $0x55, %xmm7, %xmm8 + mulss %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulss %xmm13, %xmm8 + subss %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulss %xmm13, %xmm8 + subss %xmm8, %xmm15 + + movaps 52 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm8, %xmm14 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm14, %xmm8 + subss %xmm8, %xmm15 + + movaps 60 * SIZE(AO), %xmm7 + pshufd $0xff, %xmm7, %xmm8 + mulss %xmm8, %xmm15 +#endif + +#if defined(RN) || defined(RT) + movss 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 + mulps %xmm2, %xmm9 +#endif + +#ifdef LN + subq $8 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(B) + movss %xmm5, 1 * SIZE(B) + movss %xmm10, 2 * SIZE(B) + movss %xmm11, 3 * SIZE(B) + movss %xmm12, 4 * SIZE(B) + movss %xmm13, 5 * SIZE(B) + movss %xmm14, 6 * SIZE(B) + movss %xmm15, 7 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + movaps %xmm2, 0 * SIZE(BO) + pshufd $0x00, %xmm5, %xmm2 + movaps %xmm2, 4 * SIZE(BO) + pshufd $0x00, %xmm10, %xmm2 + movaps %xmm2, 8 * SIZE(BO) + pshufd $0x00, %xmm11, %xmm2 + movaps %xmm2, 12 * SIZE(BO) + + pshufd $0x00, %xmm12, %xmm2 + movaps %xmm2, 16 * SIZE(BO) + pshufd $0x00, %xmm13, %xmm2 + movaps %xmm2, 20 * SIZE(BO) + pshufd $0x00, %xmm14, %xmm2 + movaps %xmm2, 24 * SIZE(BO) + pshufd $0x00, %xmm15, %xmm2 + movaps %xmm2, 28 * SIZE(BO) +#else + movaps %xmm8, 0 * SIZE(AO) + movaps %xmm9, 4 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm10, %xmm1 + unpcklps %xmm11, %xmm5 + unpcklps %xmm5, %xmm1 + + unpcklps %xmm14, %xmm12 + unpcklps %xmm15, %xmm13 + unpcklps %xmm13, %xmm12 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm12, 4 * SIZE(CO1) + movhps %xmm12, 6 * SIZE(CO1) +#else + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movlps %xmm9, 4 * SIZE(CO1) + movhps %xmm9, 6 * SIZE(CO1) +#endif + +#ifndef LN + addq $8 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 8), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $8, KK + movq BORIG, B +#endif + +#ifdef LT + addq $8, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $3 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L111 + ALIGN_4 + +.L120: + testq $4, M + je .L130 + +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || 
defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L125 + ALIGN_4 + +.L122: + mulps %xmm8, %xmm9 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps 4 * SIZE(AO), %xmm8 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 32 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 8 * SIZE(AO), %xmm8 + mulps 8 * SIZE(BO), %xmm8 + addps %xmm8, %xmm2 + movaps 12 * SIZE(AO), %xmm8 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm8, %xmm3 + movaps 32 * SIZE(AO), %xmm8 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm10, %xmm11 + movaps 20 * SIZE(AO), %xmm10 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm11, %xmm0 + movaps 48 * SIZE(BO), %xmm11 + addps %xmm10, %xmm1 + movaps 24 * SIZE(AO), %xmm10 + mulps 24 * SIZE(BO), %xmm10 + addps %xmm10, %xmm2 + movaps 28 * SIZE(AO), %xmm10 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm10, %xmm3 + movaps 48 * SIZE(AO), %xmm10 + + addq $32 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L122 + ALIGN_4 + +.L125: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L128 + ALIGN_4 + +.L126: + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + + addq $4 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L126 + ALIGN_4 + +.L128: + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + addps %xmm2, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm8 + unpcklps %xmm2, %xmm0 + unpckhps %xmm2, %xmm8 + + movaps %xmm1, %xmm14 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm14 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movaps %xmm8, %xmm3 + unpcklps %xmm14, %xmm8 + unpckhps %xmm14, %xmm3 + + movss 0 * SIZE(B), %xmm1 + movss 1 * SIZE(B), %xmm5 + movss 2 * SIZE(B), %xmm10 + movss 3 * SIZE(B), %xmm11 + + subss %xmm0, %xmm1 + subss %xmm2, %xmm5 + subss %xmm8, %xmm10 + subss %xmm3, %xmm11 +#else + movaps 0 * SIZE(AO), %xmm8 + + subps %xmm0, %xmm8 +#endif + +#ifdef LN + movaps 12 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm11, %xmm8 + subss %xmm8, %xmm1 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm1 + + movaps 4 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm1 + + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm8, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm6 + + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm8, %xmm1 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm11 + + movaps 4 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + 
mulss %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm11 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm10, %xmm8 + subss %xmm8, %xmm11 + + movaps 12 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm8, %xmm11 +#endif + +#if defined(RN) || defined(RT) + movss 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(B) + movss %xmm5, 1 * SIZE(B) + movss %xmm10, 2 * SIZE(B) + movss %xmm11, 3 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + movaps %xmm2, 0 * SIZE(BO) + pshufd $0x00, %xmm5, %xmm2 + movaps %xmm2, 4 * SIZE(BO) + pshufd $0x00, %xmm10, %xmm2 + movaps %xmm2, 8 * SIZE(BO) + pshufd $0x00, %xmm11, %xmm2 + movaps %xmm2, 12 * SIZE(BO) +#else + movaps %xmm8, 0 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm10, %xmm1 + unpcklps %xmm11, %xmm5 + unpcklps %xmm5, %xmm1 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) +#else + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L130: + testq $2, M + je .L140 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movsd 0 * SIZE(AO), %xmm8 + movhps 2 * SIZE(AO), %xmm8 + movsd 8 * SIZE(AO), %xmm10 + movhps 10 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L135 + ALIGN_4 + +.L132: + mulps %xmm8, %xmm9 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + movsd 6 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movaps 12 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + movsd 16 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movaps 32 * SIZE(BO), %xmm9 + + mulps %xmm10, %xmm11 + movsd 10 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm11 + movsd 12 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm11 + movsd 14 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movaps 28 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm11 + movsd 24 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movaps 48 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L132 + ALIGN_4 + +.L135: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, 
%rax # if (k & 1) + BRANCH + je .L138 + ALIGN_4 + +.L136: + mulps %xmm8, %xmm9 + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L136 + ALIGN_4 + +.L138: + addps %xmm1, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm2, %xmm0 + unpcklps %xmm3, %xmm1 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movss 0 * SIZE(B), %xmm1 + movss 1 * SIZE(B), %xmm5 + + subss %xmm0, %xmm1 + subss %xmm2, %xmm5 +#else +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd 0 * SIZE(AO), %xmm8 + + subps %xmm0, %xmm8 +#endif + +#ifdef LN + movaps 0 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulss %xmm5, %xmm8 + subss %xmm8, %xmm1 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm8, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulss %xmm8, %xmm1 + pshufd $0x55, %xmm6, %xmm8 + mulss %xmm1, %xmm8 + subss %xmm8, %xmm5 + + pshufd $0xff, %xmm6, %xmm8 + mulss %xmm8, %xmm5 +#endif + +#if defined(RN) || defined(RT) + movss 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(B) + movss %xmm5, 1 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + movaps %xmm2, 0 * SIZE(BO) + pshufd $0x00, %xmm5, %xmm2 + movaps %xmm2, 4 * SIZE(BO) +#else + movlps %xmm8, 0 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm10, %xmm1 + unpcklps %xmm11, %xmm5 + unpcklps %xmm5, %xmm1 + + movlps %xmm1, 0 * SIZE(CO1) +#else + movlps %xmm8, 0 * SIZE(CO1) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $2 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L140: + testq $1, M + je .L149 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (AO, %rax, SIZE), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movss 0 * SIZE(AO), %xmm8 + movss 4 * SIZE(AO), %xmm10 + + movss 0 * SIZE(BO), %xmm9 + movss 16 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L145 + ALIGN_4 + +.L142: + mulss %xmm8, %xmm9 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movss 1 * SIZE(AO), %xmm8 + mulss 4 * SIZE(BO), %xmm8 + addss %xmm9, %xmm0 + movss 32 * SIZE(BO), %xmm9 + addss %xmm8, %xmm1 + movss 2 * SIZE(AO), %xmm8 + mulss 8 * SIZE(BO), %xmm8 + addss %xmm8, %xmm2 + movss 3 * SIZE(AO), %xmm8 + mulss 12 * SIZE(BO), %xmm8 + addss %xmm8, %xmm3 + movss 8 * SIZE(AO), %xmm8 + mulss %xmm10, %xmm11 + movss 5 * SIZE(AO), %xmm10 + mulss 20 
* SIZE(BO), %xmm10 + addss %xmm11, %xmm0 + movss 48 * SIZE(BO), %xmm11 + addss %xmm10, %xmm1 + movss 6 * SIZE(AO), %xmm10 + mulss 24 * SIZE(BO), %xmm10 + addss %xmm10, %xmm2 + movss 7 * SIZE(AO), %xmm10 + mulss 28 * SIZE(BO), %xmm10 + addss %xmm10, %xmm3 + movss 12 * SIZE(AO), %xmm10 + + addq $ 8 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L142 + ALIGN_4 + +.L145: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L148 + ALIGN_4 + +.L146: + mulss %xmm8, %xmm9 + movss 1 * SIZE(AO), %xmm8 + addss %xmm9, %xmm0 + movss 4 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + decq %rax + jg .L146 + ALIGN_4 + +.L148: + addss %xmm1, %xmm0 + addss %xmm3, %xmm2 + addss %xmm2, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax + subq $1, %rax + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movss 0 * SIZE(B), %xmm1 + subss %xmm0, %xmm1 +#else + movss 0 * SIZE(AO), %xmm8 + subps %xmm0, %xmm8 +#endif + +#if defined(LN) || defined(LT) + mulss 0 * SIZE(AO), %xmm1 +#endif + +#if defined(RN) || defined(RT) + mulss 0 * SIZE(B), %xmm8 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + movaps %xmm2, 0 * SIZE(BO) +#else + movss %xmm8, 0 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + movss %xmm1, 0 * SIZE(CO1) +#else + movss %xmm8, 0 * SIZE(CO1) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $1 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L149: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 1), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 1), B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L50: + testq $2, N + je .L100 + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + salq $1 + BASE_SHIFT, %rax + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L53 + ALIGN_4 + +.L52: + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO + + decq %rax + jne .L52 + ALIGN_4 + +.L53: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax 
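+
+/* The #if blocks at .L53 (and before every inner loop in this file) choose
+   the depth of the current panel; as a sketch:
+
+       kc = (LT || RN) ? KK : K - KK;
+
+   .L52 above then copies four k steps per pass (kc / 4 iterations) and .L54
+   below handles the kc & 3 remainder one k step at a time.
+*/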
+#endif + andq $3, %rax + BRANCH + jle .L60 + ALIGN_4 + +.L54: + movsd 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + + addq $2 * SIZE, B + addq $8 * SIZE, BO + decq %rax + jne .L54 + ALIGN_4 + +.L60: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c +#ifndef RT + leaq (C, LDC, 2), C +#endif + + movq M, I + sarq $3, I # i = (m >> 3) + jle .L70 + ALIGN_4 + +.L61: +#ifdef LN + movq K, %rax + salq $3 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 8), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + movaps 32 * SIZE(AO), %xmm12 + movaps 48 * SIZE(AO), %xmm14 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + + PREFETCHW 4 * SIZE(CO1) + pxor %xmm4, %xmm4 + PREFETCHW 4 * SIZE(CO2) + pxor %xmm5, %xmm5 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L65 + ALIGN_4 + +.L62: + mulps %xmm8, %xmm9 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 0 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 4 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps 8 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 12 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps 64 * SIZE(AO), %xmm8 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm10, %xmm11 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm11, %xmm0 + movaps 16 * SIZE(BO), %xmm11 + addps %xmm10, %xmm1 + movaps 20 * SIZE(AO), %xmm10 + mulps %xmm10, %xmm11 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm11, %xmm4 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm10, %xmm5 + movaps 24 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm11 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm11, %xmm0 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm10, %xmm1 + movaps 28 * SIZE(AO), %xmm10 + mulps %xmm10, %xmm11 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm11, %xmm4 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm10, %xmm5 + movaps 80 * SIZE(AO), %xmm10 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) +#endif + mulps %xmm12, %xmm13 + mulps 36 * SIZE(BO), %xmm12 + addps %xmm13, %xmm0 + movaps 32 * SIZE(BO), %xmm13 + addps %xmm12, %xmm1 + movaps 36 * SIZE(AO), %xmm12 + mulps %xmm12, %xmm13 + mulps 36 * SIZE(BO), %xmm12 + addps %xmm13, %xmm4 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm12, %xmm5 + movaps 40 * SIZE(AO), %xmm12 + + mulps %xmm12, %xmm13 + mulps 44 * SIZE(BO), %xmm12 + addps %xmm13, %xmm0 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm12, %xmm1 + movaps 44 * SIZE(AO), %xmm12 + mulps %xmm12, %xmm13 + mulps 44 * SIZE(BO), %xmm12 + addps %xmm13, %xmm4 + addps %xmm12, 
%xmm5 + movaps 96 * SIZE(BO), %xmm13 + movaps 96 * SIZE(AO), %xmm12 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) +#endif + mulps %xmm14, %xmm15 + mulps 52 * SIZE(BO), %xmm14 + addps %xmm15, %xmm0 + movaps 48 * SIZE(BO), %xmm15 + addps %xmm14, %xmm1 + movaps 52 * SIZE(AO), %xmm14 + mulps %xmm14, %xmm15 + mulps 52 * SIZE(BO), %xmm14 + addps %xmm15, %xmm4 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm14, %xmm5 + movaps 56 * SIZE(AO), %xmm14 + + mulps %xmm14, %xmm15 + mulps 60 * SIZE(BO), %xmm14 + addps %xmm15, %xmm0 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm14, %xmm1 + movaps 60 * SIZE(AO), %xmm14 + mulps %xmm14, %xmm15 + mulps 60 * SIZE(BO), %xmm14 + addps %xmm15, %xmm4 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm14, %xmm5 + movaps 112 * SIZE(AO), %xmm14 + + addq $64 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L62 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 0 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 4 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps 8 * SIZE(AO), %xmm8 + + addq $8 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L66 + ALIGN_4 + +.L68: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $8, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $1 + BASE_SHIFT, %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm8 + unpcklps %xmm2, %xmm0 + unpckhps %xmm2, %xmm8 + + movaps %xmm1, %xmm14 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm14 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movaps %xmm8, %xmm3 + unpcklps %xmm14, %xmm8 + unpckhps %xmm14, %xmm3 + + movaps %xmm4, %xmm9 + unpcklps %xmm6, %xmm4 + unpckhps %xmm6, %xmm9 + + movaps %xmm5, %xmm14 + unpcklps %xmm7, %xmm5 + unpckhps %xmm7, %xmm14 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps %xmm9, %xmm7 + unpcklps %xmm14, %xmm9 + unpckhps %xmm14, %xmm7 + +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(B), %xmm1 +#ifdef movsd + xorps %xmm5, %xmm5 +#endif + movsd 2 * SIZE(B), %xmm5 +#ifdef movsd + xorps %xmm10, %xmm10 +#endif + movsd 4 * SIZE(B), %xmm10 +#ifdef movsd + xorps %xmm11, %xmm11 +#endif + movsd 6 * SIZE(B), %xmm11 +#ifdef movsd + xorps %xmm12, %xmm12 +#endif + movsd 8 * SIZE(B), %xmm12 +#ifdef movsd + xorps %xmm13, %xmm13 +#endif + movsd 10 * SIZE(B), %xmm13 +#ifdef movsd + xorps %xmm14, %xmm14 +#endif + movsd 12 * SIZE(B), %xmm14 +#ifdef movsd + xorps %xmm15, %xmm15 +#endif + movsd 14 * SIZE(B), %xmm15 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm5 + subps %xmm8, %xmm10 + subps %xmm3, %xmm11 + subps %xmm4, %xmm12 + subps %xmm6, %xmm13 + subps %xmm9, %xmm14 + subps %xmm7, %xmm15 +#else + movaps 0 * SIZE(AO), %xmm8 + movaps 4 * SIZE(AO), %xmm9 + movaps 8 * SIZE(AO), %xmm10 + movaps 12 * SIZE(AO), %xmm11 + + subps %xmm0, %xmm8 + subps %xmm4, %xmm9 + subps %xmm1, %xmm10 + subps %xmm5, %xmm11 +#endif + +#ifdef LN + movaps 60 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm15 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm14 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm15, 
%xmm8 + subps %xmm8, %xmm13 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm12 + + movaps 56 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm1 + + movaps 52 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm14 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm13 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm12 + + movaps 48 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm1 + + movaps 44 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm13 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm12 + + movaps 40 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm1 + + movaps 36 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm12 + + movaps 32 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm1 + + movaps 24 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm1 + + movaps 16 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm1 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm1 + + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm6 + + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm11 + + movaps 4 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm15 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm11 + + movaps 12 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulps %xmm5, 
%xmm8 + subps %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm15 + + movaps 16 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm11 + + movaps 20 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm15 + + movaps 24 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm11 + + movaps 28 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm15 + + movaps 36 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm12 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm15 + + movaps 44 * SIZE(AO), %xmm7 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm15 + + movaps 52 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm14 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm15 + + movaps 60 * SIZE(AO), %xmm7 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm8, %xmm15 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 + mulps %xmm2, %xmm9 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm10 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm9, %xmm2 + subps %xmm2, %xmm11 + + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + mulps %xmm2, %xmm11 +#endif + +#ifdef RT + movaps 0 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + mulps %xmm2, %xmm11 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm8 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm11, %xmm2 + subps %xmm2, %xmm9 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 + mulps %xmm2, %xmm9 +#endif + +#ifdef LN + subq $8 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, 0 * SIZE(B) + movlps %xmm5, 2 * SIZE(B) + movlps %xmm10, 4 * SIZE(B) + movlps %xmm11, 6 * SIZE(B) + movlps %xmm12, 8 * SIZE(B) + movlps %xmm13, 10 * SIZE(B) + movlps %xmm14, 12 * SIZE(B) + movlps %xmm15, 14 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + pshufd $0x55, %xmm1, %xmm3 + movaps %xmm2, 0 * SIZE(BO) + movaps %xmm3, 4 * SIZE(BO) + + pshufd $0x00, %xmm5, %xmm2 + pshufd $0x55, %xmm5, %xmm3 + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + + pshufd $0x00, %xmm10, %xmm2 + pshufd $0x55, %xmm10, %xmm3 + movaps %xmm2, 16 * SIZE(BO) + movaps %xmm3, 20 * SIZE(BO) + + pshufd $0x00, %xmm11, %xmm2 + pshufd $0x55, %xmm11, %xmm3 + movaps %xmm2, 24 * SIZE(BO) + movaps %xmm3, 28 * SIZE(BO) + + pshufd $0x00, %xmm12, %xmm2 + pshufd $0x55, %xmm12, %xmm3 + movaps %xmm2, 32 * SIZE(BO) + movaps %xmm3, 36 * SIZE(BO) + + pshufd $0x00, %xmm13, %xmm2 + pshufd $0x55, %xmm13, 
%xmm3 + movaps %xmm2, 40 * SIZE(BO) + movaps %xmm3, 44 * SIZE(BO) + + pshufd $0x00, %xmm14, %xmm2 + pshufd $0x55, %xmm14, %xmm3 + movaps %xmm2, 48 * SIZE(BO) + movaps %xmm3, 52 * SIZE(BO) + + pshufd $0x00, %xmm15, %xmm2 + pshufd $0x55, %xmm15, %xmm3 + movaps %xmm2, 56 * SIZE(BO) + movaps %xmm3, 60 * SIZE(BO) +#else + movaps %xmm8, 0 * SIZE(AO) + movaps %xmm9, 4 * SIZE(AO) + movaps %xmm10, 8 * SIZE(AO) + movaps %xmm11, 12 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm10, %xmm1 + unpcklps %xmm11, %xmm5 + + movaps %xmm1, %xmm10 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm10 + + unpcklps %xmm14, %xmm12 + unpcklps %xmm15, %xmm13 + + movaps %xmm12, %xmm14 + unpcklps %xmm13, %xmm12 + unpckhps %xmm13, %xmm14 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm12, 4 * SIZE(CO1) + movhps %xmm12, 6 * SIZE(CO1) + + movlps %xmm10, 0 * SIZE(CO1, LDC, 1) + movhps %xmm10, 2 * SIZE(CO1, LDC, 1) + movlps %xmm14, 4 * SIZE(CO1, LDC, 1) + movhps %xmm14, 6 * SIZE(CO1, LDC, 1) +#else + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movlps %xmm9, 4 * SIZE(CO1) + movhps %xmm9, 6 * SIZE(CO1) + + movlps %xmm10, 0 * SIZE(CO1, LDC, 1) + movhps %xmm10, 2 * SIZE(CO1, LDC, 1) + movlps %xmm11, 4 * SIZE(CO1, LDC, 1) + movhps %xmm11, 6 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addq $8 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 8), AO +#ifdef LT + addq $16 * SIZE, B +#endif +#endif + +#ifdef LN + subq $8, KK + movq BORIG, B +#endif + +#ifdef LT + addq $8, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $3 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L61 + ALIGN_4 + +.L70: + testq $4, M + je .L80 + +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L75 + ALIGN_4 + +.L72: + mulps %xmm8, %xmm9 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 4 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps 8 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm11 + mulps 20 * SIZE(BO), %xmm8 + addps %xmm11, %xmm0 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm8, %xmm1 + movaps 12 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm11 + mulps 28 * SIZE(BO), %xmm8 + addps %xmm11, %xmm2 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm8, %xmm3 + movaps 32 * SIZE(AO), %xmm8 + + mulps %xmm10, %xmm13 + mulps 36 * SIZE(BO), %xmm10 + addps %xmm13, %xmm0 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm10, %xmm1 + movaps 20 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm13 + mulps 44 * SIZE(BO), %xmm10 + addps %xmm13, %xmm2 + movaps 96 * SIZE(BO), %xmm13 + addps %xmm10, %xmm3 + movaps 24 * 
SIZE(AO), %xmm10 + + mulps %xmm10, %xmm15 + mulps 52 * SIZE(BO), %xmm10 + addps %xmm15, %xmm0 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm10, %xmm1 + movaps 28 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm15 + mulps 60 * SIZE(BO), %xmm10 + addps %xmm15, %xmm2 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm10, %xmm3 + movaps 48 * SIZE(AO), %xmm10 + + addq $32 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 4 * SIZE(AO), %xmm8 + + addq $4 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L76 + ALIGN_4 + +.L78: + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $1 + BASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm8 + unpcklps %xmm2, %xmm0 + unpckhps %xmm2, %xmm8 + + movaps %xmm1, %xmm14 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm14 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movaps %xmm8, %xmm3 + unpcklps %xmm14, %xmm8 + unpckhps %xmm14, %xmm3 + +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(B), %xmm1 +#ifdef movsd + xorps %xmm5, %xmm5 +#endif + movsd 2 * SIZE(B), %xmm5 +#ifdef movsd + xorps %xmm10, %xmm10 +#endif + movsd 4 * SIZE(B), %xmm10 +#ifdef movsd + xorps %xmm11, %xmm11 +#endif + movsd 6 * SIZE(B), %xmm11 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm5 + subps %xmm8, %xmm10 + subps %xmm3, %xmm11 +#else + movaps 0 * SIZE(AO), %xmm8 + movaps 4 * SIZE(AO), %xmm10 + + subps %xmm0, %xmm8 + subps %xmm1, %xmm10 +#endif + +#ifdef LN + movaps 12 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm1 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm1 + + movaps 4 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm1 + + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm6 + + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm11 + + movaps 4 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm11 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm11 + + movaps 12 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm11 +#endif + 
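+
+/* Sketch (hypothetical C, invented names) of the RN/RT update that follows
+   for this N = 2 strip.  xmm8 and xmm10 hold the two result columns, four
+   rows at a time, and the 2x2 triangular block of B is assumed to carry a
+   pre-inverted diagonal, as the packed format suggests:
+
+       for (i = 0; i < 4; i++) c0[i] *= inv_b00;      // scale column 0
+       for (i = 0; i < 4; i++) c1[i] -= b01 * c0[i];  // remove the coupling
+       for (i = 0; i < 4; i++) c1[i] *= inv_b11;      // scale column 1
+
+   The RT branch performs the same three steps in the opposite order,
+   starting from column 1.
+*/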
+#ifdef RN + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm10 + + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm10 +#endif + +#ifdef RT + movaps 0 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm8 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, 0 * SIZE(B) + movlps %xmm5, 2 * SIZE(B) + movlps %xmm10, 4 * SIZE(B) + movlps %xmm11, 6 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + pshufd $0x55, %xmm1, %xmm3 + movaps %xmm2, 0 * SIZE(BO) + movaps %xmm3, 4 * SIZE(BO) + + pshufd $0x00, %xmm5, %xmm2 + pshufd $0x55, %xmm5, %xmm3 + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + + pshufd $0x00, %xmm10, %xmm2 + pshufd $0x55, %xmm10, %xmm3 + movaps %xmm2, 16 * SIZE(BO) + movaps %xmm3, 20 * SIZE(BO) + + pshufd $0x00, %xmm11, %xmm2 + pshufd $0x55, %xmm11, %xmm3 + movaps %xmm2, 24 * SIZE(BO) + movaps %xmm3, 28 * SIZE(BO) +#else + movaps %xmm8, 0 * SIZE(AO) + movaps %xmm10, 4 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm10, %xmm1 + unpcklps %xmm11, %xmm5 + + movaps %xmm1, %xmm10 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm10 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm10, 0 * SIZE(CO1, LDC, 1) + movhps %xmm10, 2 * SIZE(CO1, LDC, 1) +#else + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movlps %xmm10, 0 * SIZE(CO1, LDC, 1) + movhps %xmm10, 2 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $ 8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L80: + testq $2, M + je .L90 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd 0 * SIZE(AO), %xmm8 +#ifdef movsd + xorps %xmm10, %xmm10 +#endif + movsd 8 * SIZE(AO), %xmm10 + +#ifdef movsd + xorps %xmm9, %xmm9 +#endif + movsd 0 * SIZE(BO), %xmm9 +#ifdef movsd + xorps %xmm11, %xmm11 +#endif + movsd 16 * SIZE(BO), %xmm11 +#ifdef movsd + xorps %xmm13, %xmm13 +#endif + movsd 32 * SIZE(BO), %xmm13 +#ifdef movsd + xorps %xmm15, %xmm15 +#endif + movsd 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L85 + ALIGN_4 + +.L82: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movaps 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movaps 64 * SIZE(BO), %xmm9 + + mulps %xmm8, 
%xmm11 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd 6 * SIZE(AO), %xmm8 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movaps 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd 16 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movaps 80 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movaps 36 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd 10 * SIZE(AO), %xmm10 + addps %xmm13, %xmm1 + movaps 40 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movaps 44 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd 12 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movaps 96 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movaps 52 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd 14 * SIZE(AO), %xmm10 + addps %xmm15, %xmm1 + movaps 56 * SIZE(BO), %xmm15 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movaps 60 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd 24 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movaps 112 * SIZE(BO), %xmm15 + + addq $16 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L82 + ALIGN_4 + +.L85: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L88 + ALIGN_4 + +.L86: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L86 + ALIGN_4 + +.L88: + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $1 + BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm2, %xmm0 + unpcklps %xmm3, %xmm1 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(B), %xmm1 +#ifdef movsd + xorps %xmm5, %xmm5 +#endif + movsd 2 * SIZE(B), %xmm5 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm5 +#else +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd 0 * SIZE(AO), %xmm8 +#ifdef movsd + xorps %xmm10, %xmm10 +#endif + movsd 2 * SIZE(AO), %xmm10 + + subps %xmm0, %xmm8 + subps %xmm1, %xmm10 +#endif + +#ifdef LN + movaps 0 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm1 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm6 + + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm5 + + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm5 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm10 + + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm10 +#endif + +#ifdef RT + movaps 0 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm8 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, 0 * SIZE(B) + movlps %xmm5, 2 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + pshufd $0x55, %xmm1, 
%xmm3 + movaps %xmm2, 0 * SIZE(BO) + movaps %xmm3, 4 * SIZE(BO) + + pshufd $0x00, %xmm5, %xmm2 + pshufd $0x55, %xmm5, %xmm3 + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) +#else + movlps %xmm8, 0 * SIZE(AO) + movlps %xmm10, 2 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm10, %xmm1 + unpcklps %xmm11, %xmm5 + + movaps %xmm1, %xmm10 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm10 + + movlps %xmm1, 0 * SIZE(CO1) + movlps %xmm10, 0 * SIZE(CO1, LDC, 1) +#else + movlps %xmm8, 0 * SIZE(CO1) + movlps %xmm10, 0 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $ 4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L90: + testq $1, M + je .L99 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (AO, %rax, SIZE), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movss 0 * SIZE(AO), %xmm8 + movss 4 * SIZE(AO), %xmm10 + + movss 0 * SIZE(BO), %xmm9 + movss 16 * SIZE(BO), %xmm11 + movss 32 * SIZE(BO), %xmm13 + movss 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L95 + ALIGN_4 + +.L92: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movss 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss 1 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movss 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movss 64 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movss 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movss 3 * SIZE(AO), %xmm8 + addps %xmm11, %xmm1 + movss 24 * SIZE(BO), %xmm11 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movss 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movss 8 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movss 80 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movss 36 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movss 5 * SIZE(AO), %xmm10 + addps %xmm13, %xmm1 + movss 40 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movss 44 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movss 6 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movss 96 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movss 52 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movss 7 * SIZE(AO), %xmm10 + addps %xmm15, %xmm1 + movss 56 * SIZE(BO), %xmm15 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movss 60 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movss 12 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movss 112 * SIZE(BO), %xmm15 + + addq $ 8 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L92 + ALIGN_4 + +.L95: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L98 + ALIGN_4 + +.L96: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movss 4 * 
SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss 1 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L96 + ALIGN_4 + +.L98: + addss %xmm2, %xmm0 + addss %xmm3, %xmm1 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm1, %xmm0 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(B), %xmm1 + subps %xmm0, %xmm1 +#else + movss 0 * SIZE(AO), %xmm8 + movss 1 * SIZE(AO), %xmm10 + subss %xmm0, %xmm8 + subss %xmm1, %xmm10 +#endif + +#if defined(LN) || defined(LT) + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulss %xmm2, %xmm8 + pshufd $0x55, %xmm0, %xmm2 + mulss %xmm8, %xmm2 + subss %xmm2, %xmm10 + + pshufd $0xff, %xmm0, %xmm2 + mulss %xmm2, %xmm10 +#endif + +#ifdef RT + movaps 0 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulss %xmm2, %xmm10 + pshufd $0xaa, %xmm0, %xmm2 + mulss %xmm10, %xmm2 + subss %xmm2, %xmm8 + pshufd $0x00, %xmm0, %xmm2 + mulss %xmm2, %xmm8 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm1, 0 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + pshufd $0x55, %xmm1, %xmm3 + movaps %xmm2, 0 * SIZE(BO) + movaps %xmm3, 4 * SIZE(BO) +#else + movss %xmm8, 0 * SIZE(AO) + movss %xmm10, 1 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm10, %xmm1 + unpcklps %xmm11, %xmm5 + + movaps %xmm1, %xmm10 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm10 + + movss %xmm1, 0 * SIZE(CO1) + movss %xmm10, 0 * SIZE(CO1, LDC, 1) +#else + movss %xmm8, 0 * SIZE(CO1) + movss %xmm10, 0 * SIZE(CO1, LDC, 1) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (AO, %rax, SIZE), AO +#ifdef LT + addq $ 2 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L99: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L100: + movq N, J + sarq $2, J # j = (n >> 2) + jle .L999 + +.L01: +/* Copying to Sub Buffer */ + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + salq $2 + BASE_SHIFT, %rax + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L03 + ALIGN_4 + +.L02: + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + movaps 8 * SIZE(B), %xmm11 + movaps 12 * SIZE(B), %xmm15 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + 
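+
+/* Sketch (hypothetical C, invented names) of the .L02 expansion in progress
+   here: every scalar of the 4-column B panel is replicated into a 4-float
+   group in BUFFER, so the micro-kernels can fetch it with a single aligned
+   movaps instead of broadcasting inside the inner loop:
+
+       for (k = 0; k < kc; k++)
+           for (j = 0; j < 4; j++)
+               for (l = 0; l < 4; l++)
+                   bo[(4 * k + j) * 4 + l] = b[4 * k + j];
+*/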
pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + pshufd $0x00, %xmm11, %xmm8 + pshufd $0x55, %xmm11, %xmm9 + pshufd $0xaa, %xmm11, %xmm10 + pshufd $0xff, %xmm11, %xmm11 + + pshufd $0x00, %xmm15, %xmm12 + pshufd $0x55, %xmm15, %xmm13 + pshufd $0xaa, %xmm15, %xmm14 + pshufd $0xff, %xmm15, %xmm15 + + movaps %xmm8, 32 * SIZE(BO) + movaps %xmm9, 36 * SIZE(BO) + movaps %xmm10, 40 * SIZE(BO) + movaps %xmm11, 44 * SIZE(BO) + movaps %xmm12, 48 * SIZE(BO) + movaps %xmm13, 52 * SIZE(BO) + movaps %xmm14, 56 * SIZE(BO) + movaps %xmm15, 60 * SIZE(BO) + + addq $16 * SIZE, B + addq $64 * SIZE, BO + + decq %rax + jne .L02 + ALIGN_4 + +.L03: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L10 + ALIGN_4 + +.L04: + movaps 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + + addq $ 4 * SIZE, B + addq $16 * SIZE, BO + decq %rax + jne .L04 + ALIGN_4 + +.L10: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc +#ifndef RT + leaq (C, LDC, 4), C +#endif + + movq M, I + sarq $3, I # i = (m >> 3) + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $3 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 8), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(BO), %xmm9 + movaps 4 * SIZE(BO), %xmm11 + movaps 8 * SIZE(BO), %xmm13 + movaps 16 * SIZE(BO), %xmm15 + + movaps 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movaps 4 * SIZE(AO), %xmm10 + pxor %xmm1, %xmm1 + movaps 8 * SIZE(AO), %xmm12 + pxor %xmm2, %xmm2 + movaps 12 * SIZE(AO), %xmm14 + pxor %xmm3, %xmm3 + + PREFETCHW 7 * SIZE(CO1) + pxor %xmm4, %xmm4 + PREFETCHW 7 * SIZE(CO2) + pxor %xmm5, %xmm5 + PREFETCHW 7 * SIZE(CO1, LDC, 2) + pxor %xmm6, %xmm6 + PREFETCHW 7 * SIZE(CO2, LDC, 2) + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L15 + ALIGN_4 + +.L12: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movaps 4 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm13 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm13, %xmm2 + movaps 8 * SIZE(BO), %xmm13 + addps %xmm8, %xmm3 + movaps 16 * SIZE(AO), %xmm8 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm4 + movaps 32 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm5 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm13 + mulps 12 * SIZE(BO), %xmm10 + addps %xmm13, %xmm6 + movaps 24 * SIZE(BO), %xmm13 + addps %xmm10, %xmm7 + movaps 20 * SIZE(AO), %xmm10 + mulps %xmm12, %xmm15 + addps %xmm15, %xmm0 + movaps 16 * SIZE(BO), %xmm15 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm1 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm13 + mulps 28 * SIZE(BO), %xmm12 + addps %xmm13, %xmm2 + 
movaps 24 * SIZE(BO), %xmm13 + addps %xmm12, %xmm3 + movaps 24 * SIZE(AO), %xmm12 + mulps %xmm14, %xmm15 + addps %xmm15, %xmm4 + movaps 48 * SIZE(BO), %xmm15 + mulps %xmm14, %xmm11 + addps %xmm11, %xmm5 + movaps 36 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm13 + mulps 28 * SIZE(BO), %xmm14 + addps %xmm13, %xmm6 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm14, %xmm7 + movaps 28 * SIZE(AO), %xmm14 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 32 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movaps 36 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm13 + mulps 44 * SIZE(BO), %xmm8 + addps %xmm13, %xmm2 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm8, %xmm3 + movaps 32 * SIZE(AO), %xmm8 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm4 + movaps 64 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm5 + movaps 52 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm13 + mulps 44 * SIZE(BO), %xmm10 + addps %xmm13, %xmm6 + movaps 56 * SIZE(BO), %xmm13 + addps %xmm10, %xmm7 + movaps 36 * SIZE(AO), %xmm10 + mulps %xmm12, %xmm15 + addps %xmm15, %xmm0 + movaps 48 * SIZE(BO), %xmm15 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm1 + movaps 52 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm13 + mulps 60 * SIZE(BO), %xmm12 + addps %xmm13, %xmm2 + movaps 56 * SIZE(BO), %xmm13 + addps %xmm12, %xmm3 + movaps 40 * SIZE(AO), %xmm12 + mulps %xmm14, %xmm15 + addps %xmm15, %xmm4 + movaps 80 * SIZE(BO), %xmm15 + mulps %xmm14, %xmm11 + addps %xmm11, %xmm5 + movaps 68 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm13 + mulps 60 * SIZE(BO), %xmm14 + addps %xmm13, %xmm6 + movaps 72 * SIZE(BO), %xmm13 + addps %xmm14, %xmm7 + movaps 44 * SIZE(AO), %xmm14 + + addq $32 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jg .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_4 +.L16: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 0 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps 8 * SIZE(AO), %xmm8 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm4 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm5 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + mulps 12 * SIZE(BO), %xmm10 + addps %xmm9, %xmm6 + movaps 16 * SIZE(BO), %xmm9 + addps %xmm10, %xmm7 + movaps 12 * SIZE(AO), %xmm10 + + addq $8 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jg .L16 + ALIGN_4 + +.L18: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $8, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $2 + BASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm8 + unpcklps %xmm2, %xmm0 + unpckhps %xmm2, %xmm8 + + movaps %xmm1, %xmm14 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm14 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movaps %xmm8, %xmm3 + unpcklps %xmm14, %xmm8 + unpckhps %xmm14, %xmm3 + + movaps %xmm4, %xmm9 + unpcklps %xmm6, %xmm4 + unpckhps %xmm6, %xmm9 + + movaps %xmm5, %xmm14 + unpcklps %xmm7, %xmm5 + unpckhps %xmm7, %xmm14 + + movaps %xmm4, %xmm6 + unpcklps %xmm5, %xmm4 + unpckhps %xmm5, %xmm6 + + movaps %xmm9, %xmm7 + unpcklps %xmm14, %xmm9 + unpckhps %xmm14, %xmm7 + + movaps 0 * SIZE(B), %xmm1 + movaps 4 * SIZE(B), %xmm5 + movaps 8 * SIZE(B), %xmm10 + movaps 12 * SIZE(B), %xmm11 + 
movaps 16 * SIZE(B), %xmm12 + movaps 20 * SIZE(B), %xmm13 + movaps 24 * SIZE(B), %xmm14 + movaps 28 * SIZE(B), %xmm15 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm5 + subps %xmm8, %xmm10 + subps %xmm3, %xmm11 + subps %xmm4, %xmm12 + subps %xmm6, %xmm13 + subps %xmm9, %xmm14 + subps %xmm7, %xmm15 +#else + movaps 0 * SIZE(AO), %xmm8 + movaps 4 * SIZE(AO), %xmm9 + movaps 8 * SIZE(AO), %xmm10 + movaps 12 * SIZE(AO), %xmm11 + + movaps 16 * SIZE(AO), %xmm12 + movaps 20 * SIZE(AO), %xmm13 + movaps 24 * SIZE(AO), %xmm14 + movaps 28 * SIZE(AO), %xmm15 + + subps %xmm0, %xmm8 + subps %xmm4, %xmm9 + subps %xmm1, %xmm10 + subps %xmm5, %xmm11 + subps %xmm2, %xmm12 + subps %xmm6, %xmm13 + subps %xmm3, %xmm14 + subps %xmm7, %xmm15 +#endif + +#ifdef LN + movaps 60 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm15 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm14 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm13 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm12 + + movaps 56 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm15, %xmm8 + subps %xmm8, %xmm1 + + movaps 52 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm14 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm13 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm12 + + movaps 48 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm1 + + movaps 44 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm13 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm12 + + movaps 40 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm1 + + movaps 36 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm12 + + movaps 32 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm1 + + movaps 24 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm1 + + movaps 16 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm1 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm1 + + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, 
%xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm6 + + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm11 + + movaps 4 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm15 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm11 + + movaps 12 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm15 + + movaps 16 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm11 + + movaps 20 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm15 + + movaps 24 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm11 + + movaps 28 * SIZE(AO), %xmm7 + pshufd $0x00, %xmm7, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm12 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm15 + + movaps 36 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm12 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm13 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm12, %xmm8 + subps %xmm8, %xmm15 + + movaps 44 * SIZE(AO), %xmm7 + pshufd $0x55, %xmm7, %xmm8 + mulps %xmm8, %xmm13 + pshufd $0xaa, %xmm7, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm14 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm13, %xmm8 + subps %xmm8, %xmm15 + + movaps 52 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm14 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm14, %xmm8 + subps %xmm8, %xmm15 + + movaps 60 * SIZE(AO), %xmm7 + pshufd $0xff, %xmm7, %xmm8 + mulps %xmm8, %xmm15 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 + mulps %xmm2, %xmm9 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm10 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm9, %xmm2 + subps %xmm2, %xmm11 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm12 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm9, %xmm2 + subps %xmm2, %xmm13 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm14 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm9, %xmm2 + subps %xmm2, %xmm15 + + movaps 4 * SIZE(B), %xmm0 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + mulps %xmm2, %xmm11 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + 
subps %xmm2, %xmm12 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm11, %xmm2 + subps %xmm2, %xmm13 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm14 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm11, %xmm2 + subps %xmm2, %xmm15 + + movaps 8 * SIZE(B), %xmm0 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm2, %xmm12 + mulps %xmm2, %xmm13 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm12, %xmm2 + subps %xmm2, %xmm14 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm13, %xmm2 + subps %xmm2, %xmm15 + + movaps 12 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm14 + mulps %xmm2, %xmm15 +#endif + +#ifdef RT + movaps 12 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm14 + mulps %xmm2, %xmm15 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm14, %xmm2 + subps %xmm2, %xmm12 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm15, %xmm2 + subps %xmm2, %xmm13 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm14, %xmm2 + subps %xmm2, %xmm10 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm15, %xmm2 + subps %xmm2, %xmm11 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm14, %xmm2 + subps %xmm2, %xmm8 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm15, %xmm2 + subps %xmm2, %xmm9 + + movaps 8 * SIZE(B), %xmm0 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm2, %xmm12 + mulps %xmm2, %xmm13 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm12, %xmm2 + subps %xmm2, %xmm10 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm13, %xmm2 + subps %xmm2, %xmm11 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm12, %xmm2 + subps %xmm2, %xmm8 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm13, %xmm2 + subps %xmm2, %xmm9 + + movaps 4 * SIZE(B), %xmm0 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + mulps %xmm2, %xmm11 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm8 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm11, %xmm2 + subps %xmm2, %xmm9 + + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 + mulps %xmm2, %xmm9 +#endif + +#ifdef LN + subq $8 * SIZE, CO1 + subq $8 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, 0 * SIZE(B) + movaps %xmm5, 4 * SIZE(B) + movaps %xmm10, 8 * SIZE(B) + movaps %xmm11, 12 * SIZE(B) + movaps %xmm12, 16 * SIZE(B) + movaps %xmm13, 20 * SIZE(B) + movaps %xmm14, 24 * SIZE(B) + movaps %xmm15, 28 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + pshufd $0x55, %xmm1, %xmm3 + pshufd $0xaa, %xmm1, %xmm4 + pshufd $0xff, %xmm1, %xmm6 + movaps %xmm2, 0 * SIZE(BO) + movaps %xmm3, 4 * SIZE(BO) + movaps %xmm4, 8 * SIZE(BO) + movaps %xmm6, 12 * SIZE(BO) + + pshufd $0x00, %xmm5, %xmm2 + pshufd $0x55, %xmm5, %xmm3 + pshufd $0xaa, %xmm5, %xmm4 + pshufd $0xff, %xmm5, %xmm6 + movaps %xmm2, 16 * SIZE(BO) + movaps %xmm3, 20 * SIZE(BO) + movaps %xmm4, 24 * SIZE(BO) + movaps %xmm6, 28 * SIZE(BO) + + pshufd $0x00, %xmm10, %xmm2 + pshufd $0x55, %xmm10, %xmm3 + pshufd $0xaa, %xmm10, %xmm4 + pshufd $0xff, %xmm10, %xmm6 + movaps %xmm2, 32 * SIZE(BO) + movaps %xmm3, 36 * SIZE(BO) + movaps %xmm4, 40 * SIZE(BO) + movaps %xmm6, 44 * SIZE(BO) + + pshufd $0x00, %xmm11, %xmm2 + pshufd $0x55, %xmm11, %xmm3 + pshufd $0xaa, %xmm11, %xmm4 + pshufd $0xff, %xmm11, %xmm6 + movaps %xmm2, 48 * SIZE(BO) + movaps %xmm3, 52 * SIZE(BO) + movaps %xmm4, 56 * SIZE(BO) + movaps %xmm6, 60 * SIZE(BO) + + pshufd $0x00, %xmm12, %xmm2 + pshufd $0x55, %xmm12, %xmm3 + pshufd $0xaa, %xmm12, %xmm4 + pshufd $0xff, %xmm12, %xmm6 + movaps %xmm2, 64 * SIZE(BO) + movaps %xmm3, 68 * SIZE(BO) + movaps %xmm4, 72 * SIZE(BO) + movaps %xmm6, 76 * SIZE(BO) + + pshufd $0x00, %xmm13, %xmm2 + pshufd $0x55, %xmm13, %xmm3 + pshufd $0xaa, %xmm13, %xmm4 + pshufd $0xff, %xmm13, %xmm6 
+ movaps %xmm2, 80 * SIZE(BO) + movaps %xmm3, 84 * SIZE(BO) + movaps %xmm4, 88 * SIZE(BO) + movaps %xmm6, 92 * SIZE(BO) + + pshufd $0x00, %xmm14, %xmm2 + pshufd $0x55, %xmm14, %xmm3 + pshufd $0xaa, %xmm14, %xmm4 + pshufd $0xff, %xmm14, %xmm6 + movaps %xmm2, 96 * SIZE(BO) + movaps %xmm3, 100 * SIZE(BO) + movaps %xmm4, 104 * SIZE(BO) + movaps %xmm6, 108 * SIZE(BO) + + pshufd $0x00, %xmm15, %xmm2 + pshufd $0x55, %xmm15, %xmm3 + pshufd $0xaa, %xmm15, %xmm4 + pshufd $0xff, %xmm15, %xmm6 + movaps %xmm2, 112 * SIZE(BO) + movaps %xmm3, 116 * SIZE(BO) + movaps %xmm4, 120 * SIZE(BO) + movaps %xmm6, 124 * SIZE(BO) + +#else + movaps %xmm8, 0 * SIZE(AO) + movaps %xmm9, 4 * SIZE(AO) + movaps %xmm10, 8 * SIZE(AO) + movaps %xmm11, 12 * SIZE(AO) + movaps %xmm12, 16 * SIZE(AO) + movaps %xmm13, 20 * SIZE(AO) + movaps %xmm14, 24 * SIZE(AO) + movaps %xmm15, 28 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm10, %xmm1 + unpckhps %xmm10, %xmm0 + + movaps %xmm5, %xmm7 + unpcklps %xmm11, %xmm5 + unpckhps %xmm11, %xmm7 + + movaps %xmm1, %xmm10 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm10 + + movaps %xmm0, %xmm11 + unpcklps %xmm7, %xmm0 + unpckhps %xmm7, %xmm11 + + movaps %xmm12, %xmm2 + unpcklps %xmm14, %xmm12 + unpckhps %xmm14, %xmm2 + + movaps %xmm13, %xmm7 + unpcklps %xmm15, %xmm13 + unpckhps %xmm15, %xmm7 + + movaps %xmm12, %xmm14 + unpcklps %xmm13, %xmm12 + unpckhps %xmm13, %xmm14 + + movaps %xmm2, %xmm15 + unpcklps %xmm7, %xmm2 + unpckhps %xmm7, %xmm15 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm12, 4 * SIZE(CO1) + movhps %xmm12, 6 * SIZE(CO1) + + movlps %xmm10, 0 * SIZE(CO2) + movhps %xmm10, 2 * SIZE(CO2) + movlps %xmm14, 4 * SIZE(CO2) + movhps %xmm14, 6 * SIZE(CO2) + + movlps %xmm0, 0 * SIZE(CO1, LDC, 2) + movhps %xmm0, 2 * SIZE(CO1, LDC, 2) + movlps %xmm2, 4 * SIZE(CO1, LDC, 2) + movhps %xmm2, 6 * SIZE(CO1, LDC, 2) + + movlps %xmm11, 0 * SIZE(CO2, LDC, 2) + movhps %xmm11, 2 * SIZE(CO2, LDC, 2) + movlps %xmm15, 4 * SIZE(CO2, LDC, 2) + movhps %xmm15, 6 * SIZE(CO2, LDC, 2) +#else + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movlps %xmm9, 4 * SIZE(CO1) + movhps %xmm9, 6 * SIZE(CO1) + + movlps %xmm10, 0 * SIZE(CO2) + movhps %xmm10, 2 * SIZE(CO2) + movlps %xmm11, 4 * SIZE(CO2) + movhps %xmm11, 6 * SIZE(CO2) + + movlps %xmm12, 0 * SIZE(CO1, LDC, 2) + movhps %xmm12, 2 * SIZE(CO1, LDC, 2) + movlps %xmm13, 4 * SIZE(CO1, LDC, 2) + movhps %xmm13, 6 * SIZE(CO1, LDC, 2) + + movlps %xmm14, 0 * SIZE(CO2, LDC, 2) + movhps %xmm14, 2 * SIZE(CO2, LDC, 2) + movlps %xmm15, 4 * SIZE(CO2, LDC, 2) + movhps %xmm15, 6 * SIZE(CO2, LDC, 2) +#endif + +#ifndef LN + addq $8 * SIZE, CO1 + addq $8 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 8), AO +#ifdef LT + addq $32 * SIZE, B +#endif +#endif + +#ifdef LN + subq $8, KK + movq BORIG, B +#endif + +#ifdef LT + addq $8, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $3 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L11 + ALIGN_4 + +.L20: + testq $4, M + je .L30 + +#ifdef LN + movq K, %rax + salq $2 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + + movaps 0 * 
SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L25 + ALIGN_4 + +.L22: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps 4 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + mulps 28 * SIZE(BO), %xmm8 + addps %xmm11, %xmm2 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm8, %xmm3 + movaps 8 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm13 + addps %xmm13, %xmm0 + movaps 36 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm1 + movaps 40 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + mulps 44 * SIZE(BO), %xmm8 + addps %xmm13, %xmm2 + movaps 96 * SIZE(BO), %xmm13 + addps %xmm8, %xmm3 + movaps 12 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm15 + addps %xmm15, %xmm0 + movaps 52 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm1 + movaps 56 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + mulps 60 * SIZE(BO), %xmm8 + addps %xmm15, %xmm2 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm8, %xmm3 + movaps 32 * SIZE(AO), %xmm8 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm10, %xmm9 + addps %xmm9, %xmm0 + movaps 68 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm1 + movaps 72 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + mulps 76 * SIZE(BO), %xmm10 + addps %xmm9, %xmm2 + movaps 128 * SIZE(BO), %xmm9 + addps %xmm10, %xmm3 + movaps 20 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movaps 84 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm1 + movaps 88 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + mulps 92 * SIZE(BO), %xmm10 + addps %xmm11, %xmm2 + movaps 144 * SIZE(BO), %xmm11 + addps %xmm10, %xmm3 + movaps 24 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movaps 100 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm1 + movaps 104 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + mulps 108 * SIZE(BO), %xmm10 + addps %xmm13, %xmm2 + movaps 160 * SIZE(BO), %xmm13 + addps %xmm10, %xmm3 + movaps 28 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movaps 116 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm1 + movaps 120 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + mulps 124 * SIZE(BO), %xmm10 + addps %xmm15, %xmm2 + movaps 176 * SIZE(BO), %xmm15 + addps %xmm10, %xmm3 + movaps 48 * SIZE(AO), %xmm10 + + addq $ 32 * SIZE, AO + addq $128 * SIZE, BO + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 16 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps 4 * SIZE(AO), %xmm8 + + addq $ 4 * 
SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L26 + ALIGN_4 + +.L28: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $2 + BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm8 + unpcklps %xmm2, %xmm0 + unpckhps %xmm2, %xmm8 + + movaps %xmm1, %xmm14 + unpcklps %xmm3, %xmm1 + unpckhps %xmm3, %xmm14 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movaps %xmm8, %xmm3 + unpcklps %xmm14, %xmm8 + unpckhps %xmm14, %xmm3 + + movaps 0 * SIZE(B), %xmm1 + movaps 4 * SIZE(B), %xmm5 + movaps 8 * SIZE(B), %xmm10 + movaps 12 * SIZE(B), %xmm11 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm5 + subps %xmm8, %xmm10 + subps %xmm3, %xmm11 +#else + movaps 0 * SIZE(AO), %xmm8 + movaps 4 * SIZE(AO), %xmm10 + movaps 8 * SIZE(AO), %xmm12 + movaps 12 * SIZE(AO), %xmm14 + + subps %xmm0, %xmm8 + subps %xmm1, %xmm10 + subps %xmm2, %xmm12 + subps %xmm3, %xmm14 +#endif + +#ifdef LN + movaps 12 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm11 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm11, %xmm8 + subps %xmm8, %xmm1 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm10 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm1 + + movaps 4 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm1 + + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm11 + + movaps 4 * SIZE(AO), %xmm6 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm11 + + movaps 8 * SIZE(AO), %xmm6 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm8, %xmm10 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm10, %xmm8 + subps %xmm8, %xmm11 + + movaps 12 * SIZE(AO), %xmm6 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm11 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm10 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm12 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm14 + + movaps 4 * SIZE(B), %xmm0 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm12 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm14 + + movaps 8 * SIZE(B), %xmm0 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm2, %xmm12 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm12, %xmm2 + subps %xmm2, %xmm14 + + movaps 12 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm14 +#endif + +#ifdef RT + movaps 12 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm14 
+ pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm14, %xmm2 + subps %xmm2, %xmm12 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm14, %xmm2 + subps %xmm2, %xmm10 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm14, %xmm2 + subps %xmm2, %xmm8 + + movaps 8 * SIZE(B), %xmm0 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm2, %xmm12 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm12, %xmm2 + subps %xmm2, %xmm10 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm12, %xmm2 + subps %xmm2, %xmm8 + + movaps 4 * SIZE(B), %xmm0 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm8 + + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, 0 * SIZE(B) + movaps %xmm5, 4 * SIZE(B) + movaps %xmm10, 8 * SIZE(B) + movaps %xmm11, 12 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + pshufd $0x55, %xmm1, %xmm3 + pshufd $0xaa, %xmm1, %xmm4 + pshufd $0xff, %xmm1, %xmm6 + movaps %xmm2, 0 * SIZE(BO) + movaps %xmm3, 4 * SIZE(BO) + movaps %xmm4, 8 * SIZE(BO) + movaps %xmm6, 12 * SIZE(BO) + + pshufd $0x00, %xmm5, %xmm2 + pshufd $0x55, %xmm5, %xmm3 + pshufd $0xaa, %xmm5, %xmm4 + pshufd $0xff, %xmm5, %xmm6 + movaps %xmm2, 16 * SIZE(BO) + movaps %xmm3, 20 * SIZE(BO) + movaps %xmm4, 24 * SIZE(BO) + movaps %xmm6, 28 * SIZE(BO) + + pshufd $0x00, %xmm10, %xmm2 + pshufd $0x55, %xmm10, %xmm3 + pshufd $0xaa, %xmm10, %xmm4 + pshufd $0xff, %xmm10, %xmm6 + movaps %xmm2, 32 * SIZE(BO) + movaps %xmm3, 36 * SIZE(BO) + movaps %xmm4, 40 * SIZE(BO) + movaps %xmm6, 44 * SIZE(BO) + + pshufd $0x00, %xmm11, %xmm2 + pshufd $0x55, %xmm11, %xmm3 + pshufd $0xaa, %xmm11, %xmm4 + pshufd $0xff, %xmm11, %xmm6 + movaps %xmm2, 48 * SIZE(BO) + movaps %xmm3, 52 * SIZE(BO) + movaps %xmm4, 56 * SIZE(BO) + movaps %xmm6, 60 * SIZE(BO) +#else + movaps %xmm8, 0 * SIZE(AO) + movaps %xmm10, 4 * SIZE(AO) + movaps %xmm12, 8 * SIZE(AO) + movaps %xmm14, 12 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm10, %xmm1 + unpckhps %xmm10, %xmm0 + + movaps %xmm5, %xmm7 + unpcklps %xmm11, %xmm5 + unpckhps %xmm11, %xmm7 + + movaps %xmm1, %xmm10 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm10 + + movaps %xmm0, %xmm11 + unpcklps %xmm7, %xmm0 + unpckhps %xmm7, %xmm11 + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm10, 0 * SIZE(CO2) + movhps %xmm10, 2 * SIZE(CO2) + + movlps %xmm0, 0 * SIZE(CO1, LDC, 2) + movhps %xmm0, 2 * SIZE(CO1, LDC, 2) + movlps %xmm11, 0 * SIZE(CO2, LDC, 2) + movhps %xmm11, 2 * SIZE(CO2, LDC, 2) +#else + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movlps %xmm10, 0 * SIZE(CO2) + movhps %xmm10, 2 * SIZE(CO2) + + movlps %xmm12, 0 * SIZE(CO1, LDC, 2) + movhps %xmm12, 2 * SIZE(CO1, LDC, 2) + movlps %xmm14, 0 * SIZE(CO2, LDC, 2) + movhps %xmm14, 2 * SIZE(CO2, LDC, 2) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $16 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + testq $2, M + je .L40 + +#ifdef LN + movq K, %rax + salq $1 + BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), 
AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movaps 0 * SIZE(AO), %xmm8 +#ifdef movsd + xorps %xmm10, %xmm10 +#endif + movaps 8 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L35 + ALIGN_4 + +.L32: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movaps 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movaps 64 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movaps 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd 4 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movaps 80 * SIZE(BO), %xmm11 + + mulps %xmm8, %xmm13 + addps %xmm13, %xmm0 + movaps 36 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm1 + movaps 40 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm2 + movaps 44 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + movsd 6 * SIZE(AO), %xmm8 + addps %xmm13, %xmm3 + movaps 96 * SIZE(BO), %xmm13 + + mulps %xmm8, %xmm15 + addps %xmm15, %xmm0 + movaps 52 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm1 + movaps 56 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm2 + movaps 60 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + movsd 16 * SIZE(AO), %xmm8 + addps %xmm15, %xmm3 + movaps 112 * SIZE(BO), %xmm15 + + mulps %xmm10, %xmm9 + addps %xmm9, %xmm0 + movaps 68 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm1 + movaps 72 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm2 + movaps 76 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movsd 10 * SIZE(AO), %xmm10 + addps %xmm9, %xmm3 + movaps 128 * SIZE(BO), %xmm9 + + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movaps 84 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm1 + movaps 88 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm2 + movaps 92 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movsd 12 * SIZE(AO), %xmm10 + addps %xmm11, %xmm3 + movaps 144 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movaps 100 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm1 + movaps 104 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movaps 108 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd 14 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movaps 160 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movaps 116 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm1 + movaps 120 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movaps 124 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd 24 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movaps 176 * SIZE(BO), %xmm15 + + addq $ 16 * SIZE, AO + addq $128 * SIZE, BO + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, 
%rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movaps 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movaps 16 * SIZE(BO), %xmm9 + + addq $ 2 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L36 + ALIGN_4 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $1 + BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm2, %xmm0 + unpcklps %xmm3, %xmm1 + + movaps %xmm0, %xmm2 + unpcklps %xmm1, %xmm0 + unpckhps %xmm1, %xmm2 + + movapd 0 * SIZE(B), %xmm1 + movapd 4 * SIZE(B), %xmm5 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm5 +#else +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd 0 * SIZE(AO), %xmm8 +#ifdef movsd + xorps %xmm10, %xmm10 +#endif + movsd 2 * SIZE(AO), %xmm10 +#ifdef movsd + xorps %xmm12, %xmm12 +#endif + movsd 4 * SIZE(AO), %xmm12 +#ifdef movsd + xorps %xmm14, %xmm14 +#endif + movsd 6 * SIZE(AO), %xmm14 + + subps %xmm0, %xmm8 + subps %xmm1, %xmm10 + subps %xmm2, %xmm12 + subps %xmm3, %xmm14 +#endif + +#ifdef LN + movaps 0 * SIZE(AO), %xmm6 + + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm5 + pshufd $0xaa, %xmm6, %xmm8 + mulps %xmm5, %xmm8 + subps %xmm8, %xmm1 + + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 + pshufd $0x55, %xmm6, %xmm8 + mulps %xmm1, %xmm8 + subps %xmm8, %xmm5 + pshufd $0xff, %xmm6, %xmm8 + mulps %xmm8, %xmm5 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm10 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm12 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm8, %xmm2 + subps %xmm2, %xmm14 + + movaps 4 * SIZE(B), %xmm0 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm12 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm14 + + movaps 8 * SIZE(B), %xmm0 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm2, %xmm12 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm12, %xmm2 + subps %xmm2, %xmm14 + + movaps 12 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm14 +#endif + +#ifdef RT + movaps 12 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulps %xmm2, %xmm14 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm14, %xmm2 + subps %xmm2, %xmm12 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm14, %xmm2 + subps %xmm2, %xmm10 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm14, %xmm2 + subps %xmm2, %xmm8 + + movaps 8 * SIZE(B), %xmm0 + pshufd $0xaa, %xmm0, %xmm2 + mulps %xmm2, %xmm12 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm12, %xmm2 + subps %xmm2, %xmm10 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm12, %xmm2 + subps %xmm2, %xmm8 + + movaps 4 * SIZE(B), %xmm0 + pshufd $0x55, %xmm0, %xmm2 + mulps %xmm2, %xmm10 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm10, %xmm2 + subps %xmm2, %xmm8 + + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulps %xmm2, %xmm8 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, 0 
* SIZE(B) + movaps %xmm5, 4 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + pshufd $0x55, %xmm1, %xmm3 + pshufd $0xaa, %xmm1, %xmm4 + pshufd $0xff, %xmm1, %xmm6 + movaps %xmm2, 0 * SIZE(BO) + movaps %xmm3, 4 * SIZE(BO) + movaps %xmm4, 8 * SIZE(BO) + movaps %xmm6, 12 * SIZE(BO) + + pshufd $0x00, %xmm5, %xmm2 + pshufd $0x55, %xmm5, %xmm3 + pshufd $0xaa, %xmm5, %xmm4 + pshufd $0xff, %xmm5, %xmm6 + movaps %xmm2, 16 * SIZE(BO) + movaps %xmm3, 20 * SIZE(BO) + movaps %xmm4, 24 * SIZE(BO) + movaps %xmm6, 28 * SIZE(BO) +#else + movlps %xmm8, 0 * SIZE(AO) + movlps %xmm10, 2 * SIZE(AO) + movlps %xmm12, 4 * SIZE(AO) + movlps %xmm14, 6 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm10, %xmm1 + unpckhps %xmm10, %xmm0 + + movaps %xmm5, %xmm7 + unpcklps %xmm11, %xmm5 + unpckhps %xmm11, %xmm7 + + movaps %xmm1, %xmm10 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm10 + + movaps %xmm0, %xmm11 + unpcklps %xmm7, %xmm0 + unpckhps %xmm7, %xmm11 + + movlps %xmm1, 0 * SIZE(CO1) + movlps %xmm10, 0 * SIZE(CO2) + movlps %xmm0, 0 * SIZE(CO1, LDC, 2) + movlps %xmm11, 0 * SIZE(CO2, LDC, 2) +#else + movlps %xmm8, 0 * SIZE(CO1) + movlps %xmm10, 0 * SIZE(CO2) + movlps %xmm12, 0 * SIZE(CO1, LDC, 2) + movlps %xmm14, 0 * SIZE(CO2, LDC, 2) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L40: + testq $1, M + je .L49 + +#ifdef LN + movq K, %rax + salq $BASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + leaq (AO, %rax, SIZE), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $2 + BASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movss 0 * SIZE(AO), %xmm8 + movss 4 * SIZE(AO), %xmm10 + + movss 0 * SIZE(BO), %xmm9 + movss 16 * SIZE(BO), %xmm11 + movss 32 * SIZE(BO), %xmm13 + movss 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L45 + ALIGN_4 + +.L42: + mulss %xmm8, %xmm9 + addss %xmm9, %xmm0 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movss 4 * SIZE(BO), %xmm9 + mulss %xmm8, %xmm9 + addss %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + mulss %xmm8, %xmm9 + addss %xmm9, %xmm2 + movss 12 * SIZE(BO), %xmm9 + mulss %xmm8, %xmm9 + movss 1 * SIZE(AO), %xmm8 + addss %xmm9, %xmm3 + movss 64 * SIZE(BO), %xmm9 + + mulss %xmm8, %xmm11 + addss %xmm11, %xmm0 + movss 20 * SIZE(BO), %xmm11 + mulss %xmm8, %xmm11 + addss %xmm11, %xmm1 + movss 24 * SIZE(BO), %xmm11 + mulss %xmm8, %xmm11 + addss %xmm11, %xmm2 + movss 28 * SIZE(BO), %xmm11 + mulss %xmm8, %xmm11 + movss 2 * SIZE(AO), %xmm8 + addss %xmm11, %xmm3 + movss 80 * SIZE(BO), %xmm11 + + mulss %xmm8, %xmm13 + addss %xmm13, %xmm0 + movss 36 * SIZE(BO), %xmm13 + mulss %xmm8, %xmm13 + addss %xmm13, %xmm1 + movss 40 * SIZE(BO), %xmm13 + mulss %xmm8, %xmm13 + addss %xmm13, %xmm2 + movss 44 * SIZE(BO), %xmm13 + mulss %xmm8, %xmm13 + movss 3 * SIZE(AO), %xmm8 + addss %xmm13, %xmm3 + movss 96 * SIZE(BO), %xmm13 + + mulss %xmm8, %xmm15 + addss %xmm15, %xmm0 + 
movss 52 * SIZE(BO), %xmm15 + mulss %xmm8, %xmm15 + addss %xmm15, %xmm1 + movss 56 * SIZE(BO), %xmm15 + mulss %xmm8, %xmm15 + addss %xmm15, %xmm2 + movss 60 * SIZE(BO), %xmm15 + mulss %xmm8, %xmm15 + movss 8 * SIZE(AO), %xmm8 + addss %xmm15, %xmm3 + movss 112 * SIZE(BO), %xmm15 + + mulss %xmm10, %xmm9 + addss %xmm9, %xmm0 + movss 68 * SIZE(BO), %xmm9 + mulss %xmm10, %xmm9 + addss %xmm9, %xmm1 + movss 72 * SIZE(BO), %xmm9 + mulss %xmm10, %xmm9 + addss %xmm9, %xmm2 + movss 76 * SIZE(BO), %xmm9 + mulss %xmm10, %xmm9 + movss 5 * SIZE(AO), %xmm10 + addss %xmm9, %xmm3 + movss 128 * SIZE(BO), %xmm9 + + mulss %xmm10, %xmm11 + addss %xmm11, %xmm0 + movss 84 * SIZE(BO), %xmm11 + mulss %xmm10, %xmm11 + addss %xmm11, %xmm1 + movss 88 * SIZE(BO), %xmm11 + mulss %xmm10, %xmm11 + addss %xmm11, %xmm2 + movss 92 * SIZE(BO), %xmm11 + mulss %xmm10, %xmm11 + movss 6 * SIZE(AO), %xmm10 + addss %xmm11, %xmm3 + movss 144 * SIZE(BO), %xmm11 + + mulss %xmm10, %xmm13 + addss %xmm13, %xmm0 + movss 100 * SIZE(BO), %xmm13 + mulss %xmm10, %xmm13 + addss %xmm13, %xmm1 + movss 104 * SIZE(BO), %xmm13 + mulss %xmm10, %xmm13 + addss %xmm13, %xmm2 + movss 108 * SIZE(BO), %xmm13 + mulss %xmm10, %xmm13 + movss 7 * SIZE(AO), %xmm10 + addss %xmm13, %xmm3 + movss 160 * SIZE(BO), %xmm13 + + mulss %xmm10, %xmm15 + addss %xmm15, %xmm0 + movss 116 * SIZE(BO), %xmm15 + mulss %xmm10, %xmm15 + addss %xmm15, %xmm1 + movss 120 * SIZE(BO), %xmm15 + mulss %xmm10, %xmm15 + addss %xmm15, %xmm2 + movss 124 * SIZE(BO), %xmm15 + mulss %xmm10, %xmm15 + movss 12 * SIZE(AO), %xmm10 + addss %xmm15, %xmm3 + movss 176 * SIZE(BO), %xmm15 + + addq $ 8 * SIZE, AO + addq $128 * SIZE, BO + decq %rax + jne .L42 + ALIGN_4 + +.L45: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L48 + ALIGN_4 + +.L46: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movss 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movss 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss 1 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movss 16 * SIZE(BO), %xmm9 + + addq $ 1 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L46 + ALIGN_4 + +.L48: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), B + leaq (BO, %rax, 8), BO + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + unpcklps %xmm2, %xmm0 + unpcklps %xmm3, %xmm1 + + unpcklps %xmm1, %xmm0 + + movapd 0 * SIZE(B), %xmm1 + subps %xmm0, %xmm1 +#else + movss 0 * SIZE(AO), %xmm8 + movss 1 * SIZE(AO), %xmm10 + movss 2 * SIZE(AO), %xmm12 + movss 3 * SIZE(AO), %xmm14 + + subss %xmm0, %xmm8 + subss %xmm1, %xmm10 + subss %xmm2, %xmm12 + subss %xmm3, %xmm14 +#endif + +#if defined(LN) || defined(LT) + movss 0 * SIZE(AO), %xmm6 + pshufd $0x00, %xmm6, %xmm8 + mulps %xmm8, %xmm1 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulss %xmm2, %xmm8 + pshufd $0x55, %xmm0, %xmm2 + mulss %xmm8, %xmm2 + subss %xmm2, %xmm10 + pshufd $0xaa, %xmm0, %xmm2 + mulss %xmm8, %xmm2 + subss %xmm2, %xmm12 + pshufd $0xff, %xmm0, %xmm2 + mulss %xmm8, %xmm2 + subss %xmm2, %xmm14 + + movaps 4 * SIZE(B), %xmm0 + pshufd $0x55, %xmm0, %xmm2 + mulss %xmm2, %xmm10 + pshufd $0xaa, %xmm0, %xmm2 + mulss %xmm10, %xmm2 + subss %xmm2, %xmm12 + pshufd $0xff, %xmm0, 
%xmm2 + mulss %xmm10, %xmm2 + subss %xmm2, %xmm14 + + movaps 8 * SIZE(B), %xmm0 + pshufd $0xaa, %xmm0, %xmm2 + mulss %xmm2, %xmm12 + pshufd $0xff, %xmm0, %xmm2 + mulss %xmm12, %xmm2 + subss %xmm2, %xmm14 + + movaps 12 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulss %xmm2, %xmm14 +#endif + +#ifdef RT + movaps 12 * SIZE(B), %xmm0 + pshufd $0xff, %xmm0, %xmm2 + mulss %xmm2, %xmm14 + pshufd $0xaa, %xmm0, %xmm2 + mulss %xmm14, %xmm2 + subss %xmm2, %xmm12 + pshufd $0x55, %xmm0, %xmm2 + mulss %xmm14, %xmm2 + subss %xmm2, %xmm10 + pshufd $0x00, %xmm0, %xmm2 + mulss %xmm14, %xmm2 + subss %xmm2, %xmm8 + + movaps 8 * SIZE(B), %xmm0 + pshufd $0xaa, %xmm0, %xmm2 + mulss %xmm2, %xmm12 + pshufd $0x55, %xmm0, %xmm2 + mulss %xmm12, %xmm2 + subss %xmm2, %xmm10 + pshufd $0x00, %xmm0, %xmm2 + mulss %xmm12, %xmm2 + subss %xmm2, %xmm8 + + movaps 4 * SIZE(B), %xmm0 + pshufd $0x55, %xmm0, %xmm2 + mulss %xmm2, %xmm10 + pshufd $0x00, %xmm0, %xmm2 + mulss %xmm10, %xmm2 + subss %xmm2, %xmm8 + + movaps 0 * SIZE(B), %xmm0 + pshufd $0x00, %xmm0, %xmm2 + mulss %xmm2, %xmm8 +#endif + +#ifdef LN + subq $1 * SIZE, CO1 + subq $1 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, 0 * SIZE(B) + + pshufd $0x00, %xmm1, %xmm2 + pshufd $0x55, %xmm1, %xmm3 + pshufd $0xaa, %xmm1, %xmm4 + pshufd $0xff, %xmm1, %xmm6 + movaps %xmm2, 0 * SIZE(BO) + movaps %xmm3, 4 * SIZE(BO) + movaps %xmm4, 8 * SIZE(BO) + movaps %xmm6, 12 * SIZE(BO) +#else + movss %xmm8, 0 * SIZE(AO) + movss %xmm10, 1 * SIZE(AO) + movss %xmm12, 2 * SIZE(AO) + movss %xmm14, 3 * SIZE(AO) +#endif + +#if defined(LN) || defined(LT) + movaps %xmm1, %xmm0 + unpcklps %xmm10, %xmm1 + unpckhps %xmm10, %xmm0 + + movaps %xmm5, %xmm7 + unpcklps %xmm11, %xmm5 + unpckhps %xmm11, %xmm7 + + movaps %xmm1, %xmm10 + unpcklps %xmm5, %xmm1 + unpckhps %xmm5, %xmm10 + + movaps %xmm0, %xmm11 + unpcklps %xmm7, %xmm0 + unpckhps %xmm7, %xmm11 + + movss %xmm1, 0 * SIZE(CO1) + movss %xmm10, 0 * SIZE(CO2) + movss %xmm0, 0 * SIZE(CO1, LDC, 2) + movss %xmm11, 0 * SIZE(CO2, LDC, 2) +#else + movss %xmm8, 0 * SIZE(CO1) + movss %xmm10, 0 * SIZE(CO2) + movss %xmm12, 0 * SIZE(CO1, LDC, 2) + movss %xmm14, 0 * SIZE(CO2, LDC, 2) +#endif + +#ifndef LN + addq $1 * SIZE, CO1 + addq $1 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $BASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L49: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + + decq J # j -- + jg .L01 + + + +.L999: + movq %rbx, %rsp + EMMS + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/xdot.S b/kernel/x86_64/xdot.S new file mode 100644 index 0000000000..966b499603 
--- /dev/null +++ b/kernel/x86_64/xdot.S @@ -0,0 +1,290 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define STACK 12 +#define ARGS 0 + +#define RESULT 4 + STACK + ARGS(%esp) +#define STACK_N 8 + STACK + ARGS(%esp) +#define STACK_X 12 + STACK + ARGS(%esp) +#define STACK_INCX 16 + STACK + ARGS(%esp) +#define STACK_Y 20 + STACK + ARGS(%esp) +#define STACK_INCY 24 + STACK + ARGS(%esp) + +#include "l1param.h" + + PROLOGUE + + pushl %edi + pushl %esi + pushl %ebx + + PROFCODE + +#define N %ebx +#define X %esi +#define INCX %ecx +#define Y %edi +#define INCY %edx + + movl STACK_N, N + movl STACK_X, X + movl STACK_INCX, INCX + movl STACK_Y, Y + movl STACK_INCY, INCY + + testl N, N + jle .L88 + + sall $ZBASE_SHIFT, INCX + sall $ZBASE_SHIFT, INCY + + fldz + fldz + fldz + fldz + + cmpl $2 * SIZE, INCX + jne .L14 + cmpl $2 * SIZE, INCY + jne .L14 + + movl N, %eax + sarl $1, %eax + jle .L15 + ALIGN_3 + +.L16: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(2) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(2) + FLD 1 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(4) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(4) + FLD 2 * SIZE(X) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + FLD 2 * SIZE(Y) + fmul %st(1) + faddp %st, %st(2) + + FLD 3 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(2) + FLD 3 * SIZE(X) + + FLD 2 * SIZE(Y) + fmul %st(1) + faddp %st, %st(4) + + FLD 3 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(4) + + addl $4 * SIZE, X + addl $4 * SIZE, Y + decl %eax + jg .L16 + ALIGN_3 + +.L15: + movl N, %eax + andl $1, %eax + jle .L27 + ALIGN_3 + +.L22: + FLD 0 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(2) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(2) + FLD 1 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(4) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(4) + jmp .L27 + ALIGN_3 + +.L14: + movl N, %eax + sarl $1, %eax + jle .L30 + ALIGN_3 + + +.L31: + FLD 0 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(2) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(2) + FLD 1 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(4) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(4) + addl INCX, X + + FLD 0 * SIZE(X) + addl INCY, Y + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(2) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(2) + FLD 1 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(4) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(4) + addl INCX, X + addl INCY, Y + + decl %eax + jg .L31 + ALIGN_3 + +.L30: + movl N, %eax + andl $1, %eax + jle .L27 + ALIGN_3 + +.L37: + FLD 0 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(2) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(2) + FLD 1 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(4) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(4) + ALIGN_3 + +.L27: + movl RESULT, %eax + +#ifndef CONJ + fsubp %st, %st(3) + faddp %st, %st(1) +#else + faddp %st, %st(3) + fsubp %st, %st(1) +#endif + + FST 1 * SIZE(%eax) + FST 0 * SIZE(%eax) + + popl %ebx + popl %esi + popl %edi + ret + ALIGN_3 + +.L88: + movl RESULT, %eax + + fldz + fldz + + FST 1 * SIZE(%eax) + FST 0 * SIZE(%eax) + + popl %ebx + popl %esi + popl %edi + ret + + EPILOGUE diff --git a/kernel/x86_64/xgemm3m_kernel_2x2.S b/kernel/x86_64/xgemm3m_kernel_2x2.S new file mode 
100644 index 0000000000..6d116a1d79 --- /dev/null +++ b/kernel/x86_64/xgemm3m_kernel_2x2.S @@ -0,0 +1,877 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 +#define N ARG2 +#define K ARG3 +#define A ARG4 +#define B ARG5 +#define C ARG6 +#define LDC %r10 + +#define I %r12 +#define J %r13 +#define AO %r14 +#define BO %r15 +#define CO %rbp + +#define KK %r11 +#define KKK 48(%rsp) + +#define STACKSIZE 64 + +#define ALPHA_R 8 + STACKSIZE(%rsp) +#define ALPHA_I 24 + STACKSIZE(%rsp) +#define OFFSET 48 + STACKSIZE(%rsp) + +#ifdef OPTERON +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#else +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#define PREFETCHSIZE (5 + 4 * 10) + +#if defined(OS_LINUX) && defined(CORE_BARCELONA) + .align 32768 +#endif + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + movq 40 + STACKSIZE(%rsp), LDC + +#if defined(TRMMKERNEL) && !defined(LEFT) + movq OFFSET, %rax + negq %rax + movq %rax, KK +#endif + + addq $8 * SIZE, A + addq $8 * SIZE, B + + salq $ZBASE_SHIFT, LDC + + movq N, %rax + sarq $1, %rax + movq %rax, J + je .L30 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO + + movq C, CO + leaq (, LDC, 2), %rax + addq %rax, C + + movq M, I + sarq $1, I + je .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + + fldz + fldz + fldz + fldz + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(CO) + prefetchw 2 * SIZE(CO, LDC, 1) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(CO) + prefetchnta 2 * SIZE(CO, LDC, 1) +#endif + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(3) + faddp %st, %st(3) + + FLD -6 * SIZE(AO) + + FLD -6 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -5 * SIZE(BO) + fmul %st, %st(2) + + FLD -5 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(3) + faddp %st, %st(3) + + PREFETCH (PREFETCHSIZE + 4) * SIZE(AO) + + FLD -4 * SIZE(AO) + + FLD -4 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -3 * SIZE(BO) + fmul %st, %st(2) + + FLD -3 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(3) + faddp %st, %st(3) + + FLD -2 * SIZE(AO) + + FLD -2 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -1 * SIZE(BO) + fmul %st, %st(2) + + FLD -1 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(3) + faddp %st, %st(3) + + addq $8 * SIZE,AO + addq $8 * SIZE,BO + + decq %rax + jne .L12 + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + and $3, %rax + je .L18 + ALIGN_4 + 
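For orientation (illustrative sketch, not part of the imported sources): the .L12/.L16 loops directly above implement the purely real 2x2 micro-kernel used by the 3M complex GEMM path. Each k step loads two packed A values and two packed B values and accumulates their outer product into four x87 stack registers; the complex arithmetic only appears later, when the tile is scaled by the complex alpha. A minimal C sketch of that accumulation, with double standing in for the kernel's extended-precision element type and AO/BO assumed to be the packed panels named by the registers above:

/* Sketch only: 2x2 outer-product accumulation over k, as in .L12/.L16. */
void xgemm3m_2x2_accumulate(long k, const double *AO, const double *BO,
                            double c[2][2]) {
    for (long l = 0; l < k; l++) {
        double a0 = AO[2 * l + 0], a1 = AO[2 * l + 1];   /* two packed A values */
        double b0 = BO[2 * l + 0], b1 = BO[2 * l + 1];   /* two packed B values */
        c[0][0] += a0 * b0;
        c[0][1] += a0 * b1;
        c[1][0] += a1 * b0;
        c[1][1] += a1 * b1;
    }
}
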
+.L16: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + faddp %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + faddp %st, %st(6) + faddp %st, %st(3) + faddp %st, %st(3) + + addq $2 * SIZE,AO + addq $2 * SIZE,BO + + decq %rax + jne .L16 + ALIGN_4 + +.L18: +#ifndef TRMMKERNEL + FLD ALPHA_I + FLD ALPHA_R + + fld %st(2) + fmul %st(1), %st + + FLD 0 * SIZE(CO) + faddp %st, %st(1) + FST 0 * SIZE(CO) + + fld %st(3) + fmul %st(1), %st + + FLD 2 * SIZE(CO) + faddp %st, %st(1) + FST 2 * SIZE(CO) + + fld %st(4) + fmul %st(1), %st + + FLD 0 * SIZE(CO, LDC) + faddp %st, %st(1) + FST 0 * SIZE(CO, LDC) + + fmul %st(5), %st + + FLD 2 * SIZE(CO, LDC) + faddp %st, %st(1) + FST 2 * SIZE(CO, LDC) + + fmul %st, %st(1) + fmul %st, %st(2) + fmul %st, %st(3) + fmulp %st, %st(4) + + FLD 1 * SIZE(CO) + faddp %st, %st(1) + FST 1 * SIZE(CO) + + FLD 3 * SIZE(CO) + faddp %st, %st(1) + FST 3 * SIZE(CO) + + FLD 1 * SIZE(CO, LDC) + faddp %st, %st(1) + FST 1 * SIZE(CO, LDC) + + FLD 3 * SIZE(CO, LDC) + faddp %st, %st(1) + FST 3 * SIZE(CO, LDC) +#else + FST 0 * SIZE(CO) + FST 1 * SIZE(CO) + FST 0 * SIZE(CO, LDC) + FST 1 * SIZE(CO, LDC) +#endif + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO + decq I + jne .L11 + ALIGN_4 + +.L20: + movq M, %rax + andq $1, %rax + je .L29 + ALIGN_4 + +.L21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq ( B, %rax, 2), BO +#endif + + fldz + fldz + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + + FLD -6 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -5 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -6 * SIZE(AO) + + FLD -4 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -3 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -5 * SIZE(AO) + + FLD -2 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -1 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addq $4 * SIZE,AO + addq $8 * SIZE,BO + + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + and $3, %rax + je .L28 + ALIGN_4 + +.L26: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addq $1 * SIZE,AO + addq $2 * SIZE,BO + + decq %rax + jne .L26 + ALIGN_4 + +.L28: +#ifndef TRMMKERNEL + FLD ALPHA_I + FLD ALPHA_R + + fld %st(2) + fmul %st(1), %st + + FLD 0 * SIZE(CO) + faddp %st, %st(1) + FST 0 * SIZE(CO) + + fmul %st(3), %st + + FLD 0 * SIZE(CO, LDC) + faddp %st, %st(1) + FST 0 * SIZE(CO, LDC) + + 
fmul %st, %st(1) + fmulp %st, %st(2) + + FLD 1 * SIZE(CO) + faddp %st, %st(1) + FST 1 * SIZE(CO) + + FLD 1 * SIZE(CO, LDC) + faddp %st, %st(1) + FST 1 * SIZE(CO, LDC) +#else + FST 0 * SIZE(CO) + FST 0 * SIZE(CO, LDC) +#endif + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO + ALIGN_4 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + movq BO, B + decq J + jne .L01 + ALIGN_4 + +.L30: + movq N, %rax + testq $1, %rax + je .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO + + movq C, CO + addq LDC, C + + movq M, I + sarq $1, I + je .L40 + ALIGN_4 + +.L31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq ( B, %rax, 1), BO +#endif + + fldz + fldz + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(CO) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(CO) +#endif + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + je .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(BO) + FLD -8 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -7 * SIZE(BO) + FLD -6 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -5 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -6 * SIZE(BO) + FLD -4 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -3 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + FLD -5 * SIZE(BO) + FLD -2 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -1 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addq $8 * SIZE,AO + addq $4 * SIZE,BO + + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + and $3, %rax + je .L38 + ALIGN_4 + +.L36: + FLD -8 * SIZE(BO) + + FLD -8 * SIZE(AO) + fmul %st(1), %st + faddp %st, %st(2) + + FLD -7 * SIZE(AO) + fmulp %st, %st(1) + faddp %st, %st(2) + + addq $2 * SIZE,AO + addq $1 * SIZE,BO + + decq %rax + jne .L36 + ALIGN_4 + +.L38: +#ifndef TRMMKERNEL + FLD ALPHA_I + FLD ALPHA_R + + fld %st(2) + fmul %st(1), %st + + FLD 0 * SIZE(CO) + faddp %st, %st(1) + FST 0 * SIZE(CO) + + fmul %st(3), %st + + FLD 2 * SIZE(CO) + faddp %st, %st(1) + FST 2 * SIZE(CO) + + fmul %st, %st(1) + fmulp %st, %st(2) + + FLD 1 * SIZE(CO) + faddp %st, %st(1) + FST 1 * SIZE(CO) + + FLD 3 * SIZE(CO) + faddp %st, %st(1) + FST 3 * SIZE(CO) +#else + FST 0 * SIZE(CO) + FST 1 * SIZE(CO) +#endif + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO + decq I + jne .L31 + ALIGN_4 + +.L40: + movq M, %rax + andq $1, %rax + je .L49 + 
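For orientation (illustrative sketch, not part of the imported sources): once a real tile entry g has been accumulated, the .L18/.L28/.L38/.L48 write-back blocks of this 3M kernel fold it into the interleaved complex C array by scaling with the complex alpha, adding ALPHA_R*g to the real part and ALPHA_I*g to the imaginary part. In C, with double standing in for the kernel's extended-precision element type and c pointing at one {re, im} pair:

/* Sketch only: the per-element write-back performed by the non-TRMM paths. */
static void gemm3m_writeback(double *c, double g,
                             double alpha_r, double alpha_i) {
    c[0] += alpha_r * g;   /* real part of C      */
    c[1] += alpha_i * g;   /* imaginary part of C */
}
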
ALIGN_4 + +.L41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq ( B, %rax, 1), BO +#endif + + fldz + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + je .L45 + ALIGN_4 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + FLD -8 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -7 * SIZE(AO) + FLD -7 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -6 * SIZE(AO) + FLD -6 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + FLD -5 * SIZE(AO) + FLD -5 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + addq $4 * SIZE,AO + addq $4 * SIZE,BO + + decq %rax + jne .L42 + ALIGN_4 + +.L45: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + and $3, %rax + je .L48 + ALIGN_4 + +.L46: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fmulp %st, %st(1) + faddp %st, %st(1) + + addq $1 * SIZE,AO + addq $1 * SIZE,BO + + decq %rax + jne .L46 + ALIGN_4 + +.L48: +#ifndef TRMMKERNEL + FLD ALPHA_I + FLD ALPHA_R + + fmul %st(2), %st + + FLD 0 * SIZE(CO) + faddp %st, %st(1) + FST 0 * SIZE(CO) + + fmulp %st, %st(1) + + FLD 1 * SIZE(CO) + faddp %st, %st(1) + FST 1 * SIZE(CO) +#else + FST 0 * SIZE(CO) +#endif + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $1 * SIZE, CO + ALIGN_4 + +.L49: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $1, KK +#endif + + movq BO, B + ALIGN_4 + +.L999: + EMMS + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/xgemm_kernel_1x1.S b/kernel/x86_64/xgemm_kernel_1x1.S new file mode 100644 index 0000000000..164e618a43 --- /dev/null +++ b/kernel/x86_64/xgemm_kernel_1x1.S @@ -0,0 +1,374 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 +#define N ARG2 +#define K ARG3 +#define A ARG4 +#define B ARG5 +#define C ARG6 +#define LDC %r10 + +#define I %r12 +#define J %r13 +#define AO %r14 +#define BO %r15 +#define CO %rbp + +#define STACKSIZE 64 + +#define ALPHA_R 8 + STACKSIZE(%rsp) +#define ALPHA_I 24 + STACKSIZE(%rsp) +#define OFFSET 48 + STACKSIZE(%rsp) + +#define KK %r11 +#define KKK 48(%rsp) + +#ifdef OPTERON +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#else +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#define PREFETCHSIZE (5 + 4 * 10) + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define ADD1 faddp +#define ADD2 fsubrp +#define ADD3 faddp +#define ADD4 faddp +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define ADD1 faddp +#define ADD2 faddp +#define ADD3 fsubrp +#define ADD4 faddp +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define ADD1 faddp +#define ADD2 faddp +#define ADD3 faddp +#define ADD4 fsubrp +#else +#define ADD1 faddp +#define ADD2 fsubrp +#define ADD3 fsubrp +#define ADD4 fsubrp +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + movq 40 + STACKSIZE(%rsp), LDC + +#if defined(TRMMKERNEL) && !defined(LEFT) + movq OFFSET, %rax + negq %rax + movq %rax, KK +#endif + + addq $8 * SIZE, A + addq $8 * SIZE, B + + salq $ZBASE_SHIFT, LDC + + cmpq $0, M + jle .L999 + + movq N, %rax + movq %rax, J + testq %rax, %rax + jle .L999 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO + + movq C, CO + addq LDC, C + + movq M, I + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + + fldz + fldz + fldz + fldz + +#if defined(HAVE_3DNOW) + prefetchw 2 * SIZE(CO) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(CO) +#endif + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + 
fld %st(1) + fmul %st(1), %st + ADD1 %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + ADD2 %st, %st(6) + ADD3 %st, %st(3) + ADD4 %st, %st(3) + + FLD -6 * SIZE(AO) + + FLD -6 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + ADD1 %st, %st(3) + + FLD -5 * SIZE(BO) + fmul %st, %st(2) + + FLD -5 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + ADD2 %st, %st(6) + ADD3 %st, %st(3) + ADD4 %st, %st(3) + + PREFETCH (PREFETCHSIZE + 4) * SIZE(AO) + + FLD -4 * SIZE(AO) + + FLD -4 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + ADD1 %st, %st(3) + + FLD -3 * SIZE(BO) + fmul %st, %st(2) + + FLD -3 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + ADD2 %st, %st(6) + ADD3 %st, %st(3) + ADD4 %st, %st(3) + + FLD -2 * SIZE(AO) + + FLD -2 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + ADD1 %st, %st(3) + + FLD -1 * SIZE(BO) + fmul %st, %st(2) + + FLD -1 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + ADD2 %st, %st(6) + ADD3 %st, %st(3) + ADD4 %st, %st(3) + + addq $8 * SIZE,AO + addq $8 * SIZE,BO + + decq %rax + jne .L12 + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + and $3, %rax + je .L18 + ALIGN_4 + +.L16: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + ADD1 %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + ADD2 %st, %st(6) + ADD3 %st, %st(3) + ADD4 %st, %st(3) + + addq $2 * SIZE,AO + addq $2 * SIZE,BO + + decq %rax + jne .L16 + ALIGN_4 + +.L18: + faddp %st, %st(3) + faddp %st, %st(1) + +#ifndef TRMMKERNEL + FLD ALPHA_R + fld %st + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + FLD ALPHA_I + fmul %st, %st(3) + fmulp %st, %st(4) + + fsubp %st, %st(2) + faddp %st, %st(2) + + FLD 0 * SIZE(CO) + faddp %st, %st(1) + FST 0 * SIZE(CO) + + FLD 1 * SIZE(CO) + faddp %st, %st(1) + FST 1 * SIZE(CO) +#else + FST 1 * SIZE(CO) + FST 0 * SIZE(CO) +#endif + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO + decq I + jne .L11 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $1, KK +#endif + + movq BO, B + decq J + jne .L01 + ALIGN_4 + +.L999: + EMMS + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/xgemv_n.S b/kernel/x86_64/xgemv_n.S new file mode 100644 index 0000000000..db6d80a98f --- /dev/null +++ b/kernel/x86_64/xgemv_n.S @@ -0,0 +1,334 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "l2param.h" + +#define P 32 + +#define STACKSIZE 80 + +#define ALPHA_R 8 + STACKSIZE(%rsp) +#define ALPHA_I 24 + STACKSIZE(%rsp) +#define OLD_INCX 40 + STACKSIZE(%rsp) +#define OLD_Y 48 + STACKSIZE(%rsp) +#define OLD_INCY 56 + STACKSIZE(%rsp) +#define BUFFER 64 + STACKSIZE(%rsp) + +#define PLDA_M 56 (%rsp) +#define IS 64 (%rsp) + +#define M %rdi +#define N %rsi +#define A %rcx +#define LDA %r8 +#define X %r9 +#define INCX %rdx +#define Y %rbp +#define INCY %r10 + +#define TEMP %rax +#define I %rax +#define J %r11 +#define A1 %r12 +#define X1 %r13 +#define Y1 %r14 +#define XP %r15 +#define MIN_N %rbx + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + movq OLD_INCX, INCX + movq OLD_Y, Y + movq OLD_INCY, INCY + + FLD ALPHA_I + FLD ALPHA_R + + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + + movq $0, IS + + test M, M + jle .L79 + test N, N + jle .L79 + + movq LDA, %rax + imulq $P, %rax # P * lda + subq M ,%rax # P * lda - m + salq $ZBASE_SHIFT, %rax + movq %rax, PLDA_M + + salq $ZBASE_SHIFT, LDA + ALIGN_2 + +.L32: + movq $P, %rax + movq N, MIN_N + subq IS, MIN_N + cmpq %rax, MIN_N + cmovg %rax, MIN_N + + movq IS, XP + salq $ZBASE_SHIFT, XP + leaq (X,XP, 1), XP + + cmpq $2 * SIZE, INCX + je .L34 + + movq BUFFER, XP + movq XP, X1 + + movq MIN_N, I + sarq $1, I + jle .L35 + ALIGN_2 + +.L36: + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + addq INCX,X # x += incx + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + addq INCX,X # x += incx + + FST 3 * SIZE(X1) + FST 2 * SIZE(X1) + FST 1 * SIZE(X1) + FST 0 * SIZE(X1) + + addq $4 * SIZE, X1 # xp += 4 + decq I + jg .L36 + ALIGN_3 + +.L35: + movq MIN_N, I + andq $1, I + jle .L34 + + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + addq INCX,X # x += incx + FST 1 * SIZE(X1) + FST 0 * SIZE(X1) + ALIGN_3 + +/* Main Routine */ +.L34: + movq Y, Y1 # c_offset + movq M, J # j = m + ALIGN_3 + +.L61: + movq A, A1 # a_offset = a + addq $2 * SIZE, A # a++ + + fldz + fldz + fldz + fldz + + movq XP, X1 + FLD (X1) # bt1 = *(b_offset + 0) + + movq MIN_N, I + sarq $1, I + jle .L64 + ALIGN_3 + +.L65: + FLD 0 * SIZE(A1) # at1 = *(a_offset + 0) + fmul %st(1) # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + + FLD 1 * SIZE(A1) # bt1 *= 
*(a_offset + 1) + fmulp %st, %st(1) +#ifndef CONJ + faddp %st, %st(2) # ct2 += bt1 +#else + fsubrp %st, %st(2) # ct2 -= bt1 +#endif + FLD 1 * SIZE(X1) # bt1 = *(b_offset + 1) + + FLD 0 * SIZE(A1) # at1 = *(a_offset + 0) + fmul %st(1) # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + + FLD 1 * SIZE(A1) # bt1 *= *(a_offset + 1) + fmulp %st, %st(1) + faddp %st, %st(4) # ct4 += bt1 + FLD 2 * SIZE(X1) # bt1 = *(b_offset + 2) + + addq $2 * SIZE, X1 # b_offset += 2 + addq LDA, A1 # a_offset += lda + + FLD 0 * SIZE(A1) # at1 = *(a_offset + 0) + fmul %st(1) # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + + FLD 1 * SIZE(A1) # bt1 *= *(a_offset + 1) + fmulp %st, %st(1) +#ifndef CONJ + faddp %st, %st(2) # ct2 += bt1 +#else + fsubrp %st, %st(2) # ct2 -= bt1 +#endif + FLD 1 * SIZE(X1) # bt1 = *(b_offset + 1) + + FLD 0 * SIZE(A1) # at1 = *(a_offset + 0) + fmul %st(1) # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + + FLD 1 * SIZE(A1) # bt1 *= *(a_offset + 1) + fmulp %st, %st(1) + faddp %st, %st(4) # ct4 += bt1 + FLD 2 * SIZE(X1) # bt1 = *(b_offset + 2) + + addq $2 * SIZE, X1 # b_offset += 2 + addq LDA, A1 # a_offset += lda + + decq I + jg .L65 + +.L64: + movq MIN_N, I + andq $1, I + jle .L70 + ALIGN_2 + +.L71: + FLD 0 * SIZE(A1) # at1 = *(a_offset + 0) + fmul %st(1) # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + + FLD 1 * SIZE(A1) # bt1 *= *(a_offset + 1) + fmulp %st, %st(1) +#ifndef CONJ + faddp %st, %st(2) # ct2 += bt1 +#else + fsubrp %st, %st(2) # ct2 -= bt1 +#endif + FLD 1 * SIZE(X1) # bt1 = *(b_offset + 1) + + FLD 0 * SIZE(A1) # at1 = *(a_offset + 0) + fmul %st(1) # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + + FLD 1 * SIZE(A1) # bt1 *= *(a_offset + 1) + fmulp %st, %st(1) + faddp %st, %st(4) # ct4 += bt1 + fldz + ALIGN_2 + +.L70: + ffreep %st(0) + +#ifndef XCONJ +#ifndef CONJ + fsubp %st, %st(3) + faddp %st, %st(1) +#else + faddp %st, %st(3) + faddp %st, %st(1) +#endif +#else +#ifndef CONJ + faddp %st, %st(3) + fsubp %st, %st(1) +#else + fsubp %st, %st(3) + fsubp %st, %st(1) +#endif +#endif + + fld %st(0) # ct4 = ct2 + fmul %st(4) + fld %st(2) + fmul %st(4) + fsubp %st, %st(1) + + FLD 0 * SIZE(Y1) + faddp %st, %st(1) + FST 0 * SIZE(Y1) + + fmul %st(2) + fxch %st(1) + fmul %st(3) + faddp %st, %st(1) + + FLD 1 * SIZE(Y1) + faddp %st, %st(1) + FST 1 * SIZE(Y1) + + addq INCY, Y1 + decq J + jg .L61 + +.L60: + addq PLDA_M, A + addq $P, IS + cmpq N, IS + jl .L32 + +.L79: + ffreep %st + ffreep %st + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + addq $STACKSIZE, %rsp + ret + EPILOGUE diff --git a/kernel/x86_64/xgemv_t.S b/kernel/x86_64/xgemv_t.S new file mode 100644 index 0000000000..c09dcf0648 --- /dev/null +++ b/kernel/x86_64/xgemv_t.S @@ -0,0 +1,338 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "l2param.h" + +#define STACKSIZE 80 +#define P 4096 + +#define ALPHA_R 8 + STACKSIZE(%rsp) +#define ALPHA_I 24 + STACKSIZE(%rsp) +#define OLD_INCX 40 + STACKSIZE(%rsp) +#define OLD_Y 48 + STACKSIZE(%rsp) +#define OLD_INCY 56 + STACKSIZE(%rsp) +#define BUFFER 64 + STACKSIZE(%rsp) + +#define NLDA 56 (%rsp) +#define IS 64 (%rsp) + +#define M %rdi +#define N %rsi +#define A %rcx +#define LDA %r8 +#define X %r9 +#define INCX %rdx +#define Y %rbp +#define INCY %r10 + +#define TEMP %rax +#define I %rax +#define J %r11 +#define A1 %r12 +#define XP %r15 +#define X1 %r13 +#define Y1 %r14 +#define MIN_M %rbx + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + movq OLD_INCX, INCX + movq OLD_Y, Y + movq OLD_INCY, INCY + + FLD ALPHA_I + FLD ALPHA_R + + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + + movq $0, IS + + test M, M + jle .L79 # goto END + test N, N + jle .L79 # goto END + + movq N, %rax + imulq LDA, %rax + movq $P, NLDA + subq %rax, NLDA + salq $ZBASE_SHIFT, NLDA + + salq $ZBASE_SHIFT, LDA + ALIGN_2 + +.L32: + movq $P, %rax + movq M, MIN_M + subq IS , MIN_M + cmpq %rax, MIN_M + cmovg %rax, MIN_M + + movq IS, X1 + salq $ZBASE_SHIFT, X1 + leaq (X,X1, 1), X1 + + movq X1, XP + + cmpq $2 * SIZE, INCX + je .L34 + + movq BUFFER, X1 + movq X1, XP + + movq MIN_M, I + sarq $1, I + jle .L35 + ALIGN_3 + +.L36: + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + addq INCX,X # x += incx + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + addq INCX,X # x += incx + + FST 3 * SIZE(X1) + FST 2 * SIZE(X1) + FST 1 * SIZE(X1) + FST 0 * SIZE(X1) + + addq $4 * SIZE, X1 # xp += 4 + decq I + jg .L36 + ALIGN_3 + +.L35: + movq MIN_M, I + andq $1,I + jle .L34 + + FLD 0 * SIZE(X) + FLD 1 * SIZE(X) + addq INCX,X # x += incx + FST 1 * SIZE(X1) + FST 0 * SIZE(X1) + ALIGN_3 + +/* Main Routine */ + +.L34: + movq Y, Y1 # coffset = y + + movq N, J + ALIGN_2 + +.L61: + movq A, A1 # a_offset = a + fldz # ct1 = ZERO + fldz # ct1 = ZERO + + addq LDA, A + fldz # ct1 = ZERO + fldz # ct1 = ZERO + + movq XP, X1 + + FLD (X1) # bt1 = *(b_offset + 0) + + movq MIN_M, I + sarq $1, I + jle .L64 + ALIGN_3 + +.L65: + FLD 0 * SIZE(A1) # at1 = *(a_offset + 0) + fmul %st(1) # at1 *= bt1 + faddp %st, 
%st(2) # ct1 += at1 + + FLD 1 * SIZE(A1) # bt1 *= *(a_offset + 1) + fmulp %st, %st(1) +#ifndef CONJ + faddp %st, %st(2) # ct2 += bt1 +#else + fsubrp %st, %st(2) # ct2 -= bt1 +#endif + FLD 1 * SIZE(X1) # bt1 = *(b_offset + 1) + + FLD 0 * SIZE(A1) # at1 = *(a_offset + 0) + fmul %st(1) # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + + FLD 1 * SIZE(A1) # bt1 *= *(a_offset + 1) + fmulp %st, %st(1) + faddp %st, %st(4) # ct4 += bt1 + FLD 2 * SIZE(X1) # bt1 = *(b_offset + 1) + + FLD 2 * SIZE(A1) # at1 = *(a_offset + 0) + fmul %st(1) # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + + FLD 3 * SIZE(A1) # bt1 *= *(a_offset + 1) + fmulp %st, %st(1) +#ifndef CONJ + faddp %st, %st(2) # ct2 += bt1 +#else + fsubrp %st, %st(2) # ct2 -= bt1 +#endif + FLD 3 * SIZE(X1) # bt1 = *(b_offset + 1) + + FLD 2 * SIZE(A1) # at1 = *(a_offset + 0) + fmul %st(1) # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + + FLD 3 * SIZE(A1) # bt1 *= *(a_offset + 1) + fmulp %st, %st(1) + faddp %st, %st(4) # ct4 += bt1 + FLD 4 * SIZE(X1) # bt1 = *(b_offset + 1) + + addq $4 * SIZE, X1 + addq $4 * SIZE, A1 + decq I + jg .L65 + ALIGN_3 + +.L64: + movq MIN_M, I + andq $1, I + jle .L70 + ALIGN_3 + +.L71: + FLD 0 * SIZE(A1) # at1 = *(a_offset + 0) + fmul %st(1) # at1 *= bt1 + faddp %st, %st(2) # ct1 += at1 + + FLD 1 * SIZE(A1) # bt1 *= *(a_offset + 1) + fmulp %st, %st(1) +#ifndef CONJ + faddp %st, %st(2) # ct2 += bt1 +#else + fsubrp %st, %st(2) # ct2 -= bt1 +#endif + FLD 1 * SIZE(X1) # bt1 = *(b_offset + 1) + + FLD 0 * SIZE(A1) # at1 = *(a_offset + 0) + fmul %st(1) # at1 *= bt1 + faddp %st, %st(4) # ct3 += at1 + + FLD 1 * SIZE(A1) # bt1 *= *(a_offset + 1) + fmulp %st, %st(1) + faddp %st, %st(4) # ct4 += bt1 + fldz + ALIGN_3 + +.L70: + ffreep %st(0) + +#ifndef XCONJ +#ifndef CONJ + fsubp %st, %st(3) + faddp %st, %st(1) +#else + faddp %st, %st(3) + faddp %st, %st(1) +#endif +#else +#ifndef CONJ + faddp %st, %st(3) + fsubp %st, %st(1) +#else + fsubp %st, %st(3) + fsubp %st, %st(1) +#endif +#endif + + fld %st(0) # ct4 = ct2 + fmul %st(4) + fld %st(2) + fmul %st(4) + fsubp %st, %st(1) + + FLD 0 * SIZE(Y1) + faddp %st, %st(1) + FST 0 * SIZE(Y1) + + fmul %st(2) + fxch %st(1) + fmul %st(3) + faddp %st, %st(1) + + FLD 1 * SIZE(Y1) + faddp %st, %st(1) + FST 1 * SIZE(Y1) + addq INCY, Y1 + + decq J + jg .L61 + ALIGN_3 + +.L60: + addq NLDA, A + + addq $P, IS + cmpq M, IS + jl .L32 + ALIGN_3 + +.L79: + ffreep %st + ffreep %st + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + addq $STACKSIZE, %rsp + ret + EPILOGUE diff --git a/kernel/x86_64/xtrsm_kernel_LT_1x1.S b/kernel/x86_64/xtrsm_kernel_LT_1x1.S new file mode 100644 index 0000000000..86d4a748b0 --- /dev/null +++ b/kernel/x86_64/xtrsm_kernel_LT_1x1.S @@ -0,0 +1,486 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 +#define N ARG2 +#define K ARG3 +#define A ARG4 +#define B ARG5 +#define C ARG6 +#define LDC %r10 + +#define I %r12 +#define J %r13 +#define AO %r14 +#define BO %r15 +#define CO %rbp + +#define OFFSET 48 + STACKSIZE(%rsp) + +#define STACKSIZE 64 + +#define KK %r11 +#define AORIG 48(%rsp) + +#ifdef OPTERON +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#else +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + +#define PREFETCHSIZE (5 + 4 * 10) + +#ifndef CONJ +#define ADD1 faddp +#define ADD2 fsubrp +#define ADD3 faddp +#define ADD4 faddp +#elif defined(LN) || defined(LT) +#define ADD1 faddp +#define ADD2 faddp +#define ADD3 fsubrp +#define ADD4 faddp +#else +#define ADD1 faddp +#define ADD2 faddp +#define ADD3 faddp +#define ADD4 fsubrp +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + movq 40 + STACKSIZE(%rsp), LDC + + salq $ZBASE_SHIFT, LDC + + addq $8 * SIZE, A + addq $8 * SIZE, B + +#ifdef LN + movq M, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + movq N, %rax + salq $ZBASE_SHIFT, %rax + imulq K, %rax + addq %rax, B + + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + movq OFFSET, %rax + negq %rax + movq %rax, KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + cmpq $0, M + jle .L999 + + movq N, %rax + movq %rax, J + testq %rax, %rax + jle .L999 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, B +#endif + +#ifdef RT + subq LDC, C +#endif + movq C, CO +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +#ifdef LT + movq OFFSET, %rax + movq %rax, KK +#endif + movq M, I + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $ZBASE_SHIFT, %rax + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + fldz + fldz + fldz + fldz + +#if defined(HAVE_3DNOW) + prefetchw 2 * 
SIZE(CO) +#elif defined(HAVE_SSE) + prefetchnta 2 * SIZE(CO) +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + ADD1 %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + ADD2 %st, %st(6) + ADD3 %st, %st(3) + ADD4 %st, %st(3) + + FLD -6 * SIZE(AO) + + FLD -6 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + ADD1 %st, %st(3) + + FLD -5 * SIZE(BO) + fmul %st, %st(2) + + FLD -5 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + ADD2 %st, %st(6) + ADD3 %st, %st(3) + ADD4 %st, %st(3) + + PREFETCH (PREFETCHSIZE + 4) * SIZE(AO) + + FLD -4 * SIZE(AO) + + FLD -4 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + ADD1 %st, %st(3) + + FLD -3 * SIZE(BO) + fmul %st, %st(2) + + FLD -3 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + ADD2 %st, %st(6) + ADD3 %st, %st(3) + ADD4 %st, %st(3) + + FLD -2 * SIZE(AO) + + FLD -2 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + ADD1 %st, %st(3) + + FLD -1 * SIZE(BO) + fmul %st, %st(2) + + FLD -1 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + ADD2 %st, %st(6) + ADD3 %st, %st(3) + ADD4 %st, %st(3) + + addq $8 * SIZE,AO + addq $8 * SIZE,BO + + decq %rax + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + and $3, %rax + je .L18 + ALIGN_4 + +.L16: + FLD -8 * SIZE(AO) + + FLD -8 * SIZE(BO) + fld %st(1) + fmul %st(1), %st + ADD1 %st, %st(3) + + FLD -7 * SIZE(BO) + fmul %st, %st(2) + + FLD -7 * SIZE(AO) + fmul %st, %st(2) + fmulp %st, %st(1) + + ADD2 %st, %st(6) + ADD3 %st, %st(3) + ADD4 %st, %st(3) + + addq $2 * SIZE,AO + addq $2 * SIZE,BO + + decq %rax + jne .L16 + ALIGN_4 + +.L18: + faddp %st, %st(3) + faddp %st, %st(1) + + fxch %st(1) + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + + salq $ZBASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(BO) + fsubp %st, %st(1) + FLD -7 * SIZE(BO) + fsubp %st, %st(2) +#else + FLD -8 * SIZE(AO) + fsubp %st, %st(1) + FLD -7 * SIZE(AO) + fsubp %st, %st(2) +#endif + +#if defined(LN) || defined(LT) + FLD -8 * SIZE(AO) + fmul %st(1), %st + FLD -8 * SIZE(AO) + fmul %st(3), %st + FLD -7 * SIZE(AO) + fmulp %st, %st(3) + FLD -7 * SIZE(AO) + fmulp %st, %st(4) +#endif + +#if defined(RN) || defined(RT) + FLD -8 * SIZE(BO) + fmul %st(1), %st + FLD -8 * SIZE(BO) + fmul %st(3), %st + FLD -7 * SIZE(BO) + fmulp %st, %st(3) + FLD -7 * SIZE(BO) + fmulp %st, %st(4) +#endif + +#ifndef CONJ + faddp %st, %st(2) + fsubp %st, %st(2) +#else + fsubp %st, %st(2) + faddp %st, %st(2) +#endif + +#if defined(LN) || defined(LT) + fld %st + FST -7 * SIZE(BO) + fxch %st(1) + fld %st + FST -8 * SIZE(BO) +#else + fld %st + FST -7 * SIZE(AO) + fxch %st(1) + fld %st + FST -8 * SIZE(AO) +#endif + +#ifdef LN + subq $2 * SIZE, CO +#endif + + FST 0 * SIZE(CO) + FST 1 * SIZE(CO) + +#ifndef LN + addq $2 * SIZE, CO +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + jne .L11 + +#ifdef LN + movq K, %rax + salq 
$ZBASE_SHIFT, %rax + leaq (B, %rax, 1), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + + decq J + jne .L01 + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zamax.S b/kernel/x86_64/zamax.S new file mode 100644 index 0000000000..21d96b640c --- /dev/null +++ b/kernel/x86_64/zamax.S @@ -0,0 +1,241 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 +#define X ARG2 +#define INCX ARG3 +#define I %rax + +#ifndef USE_MIN +#define FMOV fcmovbe +#else +#define FMOV fcmovnbe +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + + salq $ZBASE_SHIFT, INCX + + fldz + + testq M, M + jle .L999 + testq INCX, INCX + jle .L999 + + ffreep %st + + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + faddp %st, %st(1) + addq INCX, X + decq M + jle .L999 + + cmpq $2 * SIZE, INCX + jne .L40 + + movq M, I + sarq $2, I + jle .L20 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + faddp %st, %st(1) + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + FLD 2 * SIZE(X) + fabs + FLD 3 * SIZE(X) + fabs + faddp %st, %st(1) + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + FLD 4 * SIZE(X) + fabs + FLD 5 * SIZE(X) + fabs + faddp %st, %st(1) + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + FLD 6 * SIZE(X) + fabs + FLD 7 * SIZE(X) + fabs + faddp %st, %st(1) + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + addq $8 * SIZE, X + + decq I + jg .L10 + ALIGN_4 + +.L20: + movq M, I + andq $3, I + jle .L999 + ALIGN_4 + + +.L21: + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + faddp %st, %st(1) + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + addq $2 * SIZE, X + decq I + jg .L21 + jmp .L999 + ALIGN_4 + +.L40: + movq M, I + sarq $2, I + jle .L60 + ALIGN_4 + +.L50: + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + addq INCX, X + faddp %st, %st(1) + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + addq INCX, X + faddp %st, %st(1) + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + addq INCX, X + faddp %st, %st(1) + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + addq INCX, X + faddp %st, %st(1) + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + decq I + jg .L50 + ALIGN_4 + +.L60: + movq M, I + andq $3, I + jle .L999 + ALIGN_4 + +.L61: + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + faddp %st, %st(1) + fcomi + FMOV %st(1), %st(0) + fxch %st(1) + ffreep %st + + addq INCX, X + decq I + jg .L61 + ALIGN_4 + +.L999: + ret + + EPILOGUE diff --git a/kernel/x86_64/zamax_atom.S b/kernel/x86_64/zamax_atom.S new file mode 100644 index 0000000000..3f67574103 --- /dev/null +++ b/kernel/x86_64/zamax_atom.S @@ -0,0 +1,336 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ + +#define I %rax + +#ifdef USE_MIN +#define maxsd minsd +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + + SAVEREGISTERS + + xorps %xmm0, %xmm0 + salq $ZBASE_SHIFT, INCX + + testq M, M + jle .L999 + + testq INCX, INCX + jle .L999 + + pcmpeqb %xmm15, %xmm15 + psrlq $1, %xmm15 + + movsd 0 * SIZE(X), %xmm0 + movsd 1 * SIZE(X), %xmm4 + addq INCX, X + + andps %xmm15, %xmm0 + andps %xmm15, %xmm4 + + addsd %xmm4, %xmm0 + decq M + jle .L999 + + movaps %xmm0, %xmm1 + + cmpq $2 * SIZE, INCX + jne .L20 + + movq M, I + sarq $2, I + jle .L15 + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + movsd 2 * SIZE(X), %xmm6 + movsd 3 * SIZE(X), %xmm7 + + movsd 4 * SIZE(X), %xmm8 + andps %xmm15, %xmm4 + movsd 5 * SIZE(X), %xmm9 + andps %xmm15, %xmm5 + movsd 6 * SIZE(X), %xmm10 + addsd %xmm4, %xmm5 + movsd 7 * SIZE(X), %xmm11 + decq I + jle .L13 + ALIGN_4 + +.L12: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + andps %xmm15, %xmm6 + movsd 8 * SIZE(X), %xmm4 + andps %xmm15, %xmm7 + addsd %xmm6, %xmm7 + movsd 10 * SIZE(X), %xmm6 + + maxsd %xmm5, %xmm0 + movsd 9 * SIZE(X), %xmm5 + andps %xmm15, %xmm8 + maxsd %xmm7, %xmm1 + movsd 11 * SIZE(X), %xmm7 + andps %xmm15, %xmm9 + addsd %xmm8, %xmm9 + movsd 12 * SIZE(X), %xmm8 + + andps %xmm15, %xmm10 + andps %xmm15, %xmm11 + addsd %xmm10, %xmm11 + movsd 14 * SIZE(X), %xmm10 + + maxsd %xmm9, %xmm0 + movsd 13 * SIZE(X), %xmm9 + andps %xmm15, %xmm4 + maxsd %xmm11, %xmm1 + movsd 15 * SIZE(X), %xmm11 + andps %xmm15, %xmm5 + addsd %xmm4, %xmm5 + + addq $8 * SIZE, X + decq I + jg .L12 + ALIGN_4 + +.L13: + andps %xmm15, %xmm6 + andps %xmm15, %xmm7 + addsd %xmm6, %xmm7 + + maxsd %xmm5, %xmm0 + andps %xmm15, %xmm8 + maxsd %xmm7, %xmm1 + andps %xmm15, %xmm9 + addsd %xmm8, %xmm9 + + andps %xmm15, %xmm10 + andps %xmm15, %xmm11 + addsd %xmm10, %xmm11 + + maxsd %xmm9, %xmm0 + maxsd %xmm11, %xmm1 + + addq $8 * SIZE, X + ALIGN_4 + +.L15: + testq $2, M + jle .L17 + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + movsd 2 * SIZE(X), %xmm6 + movsd 3 * SIZE(X), %xmm7 + addq $4 * SIZE, X + + andps %xmm15, %xmm4 + andps %xmm15, %xmm5 + addsd %xmm4, %xmm5 + + andps %xmm15, %xmm6 + andps %xmm15, %xmm7 + addsd %xmm6, %xmm7 + + maxsd %xmm5, 
%xmm0 + maxsd %xmm7, %xmm1 + ALIGN_3 + +.L17: + testq $1, M + jle .L998 + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + + andps %xmm15, %xmm4 + andps %xmm15, %xmm5 + + addsd %xmm4, %xmm5 + maxsd %xmm5, %xmm0 + jmp .L998 + ALIGN_3 + +.L20: + movq M, I + sarq $2, I + jle .L25 + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + addq INCX, X + movsd 0 * SIZE(X), %xmm6 + movsd 1 * SIZE(X), %xmm7 + addq INCX, X + + movsd 0 * SIZE(X), %xmm8 + andps %xmm15, %xmm4 + movsd 1 * SIZE(X), %xmm9 + addq INCX, X + andps %xmm15, %xmm5 + movsd 0 * SIZE(X), %xmm10 + addsd %xmm4, %xmm5 + movsd 1 * SIZE(X), %xmm11 + addq INCX, X + + decq I + jle .L23 + ALIGN_4 + +.L22: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + andps %xmm15, %xmm6 + movsd 0 * SIZE(X), %xmm4 + andps %xmm15, %xmm7 + addsd %xmm6, %xmm7 + + maxsd %xmm5, %xmm0 + movsd 1 * SIZE(X), %xmm5 + andps %xmm15, %xmm8 + addq INCX, X + maxsd %xmm7, %xmm1 + movsd 0 * SIZE(X), %xmm6 + andps %xmm15, %xmm9 + movsd 1 * SIZE(X), %xmm7 + addsd %xmm8, %xmm9 + addq INCX, X + + andps %xmm15, %xmm10 + movsd 0 * SIZE(X), %xmm8 + andps %xmm15, %xmm11 + addsd %xmm10, %xmm11 + + maxsd %xmm9, %xmm0 + movsd 1 * SIZE(X), %xmm9 + addq INCX, X + andps %xmm15, %xmm4 + movsd 0 * SIZE(X), %xmm10 + maxsd %xmm11, %xmm1 + movsd 1 * SIZE(X), %xmm11 + andps %xmm15, %xmm5 + addq INCX, X + addsd %xmm4, %xmm5 + + decq I + jg .L22 + ALIGN_4 + +.L23: + andps %xmm15, %xmm6 + andps %xmm15, %xmm7 + addsd %xmm6, %xmm7 + + maxsd %xmm5, %xmm0 + andps %xmm15, %xmm8 + maxsd %xmm7, %xmm1 + andps %xmm15, %xmm9 + addsd %xmm8, %xmm9 + + andps %xmm15, %xmm10 + andps %xmm15, %xmm11 + addsd %xmm10, %xmm11 + + maxsd %xmm9, %xmm0 + maxsd %xmm11, %xmm1 + ALIGN_4 + +.L25: + testq $2, M + jle .L27 + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + addq INCX, X + movsd 0 * SIZE(X), %xmm6 + movsd 1 * SIZE(X), %xmm7 + addq INCX, X + + andps %xmm15, %xmm4 + andps %xmm15, %xmm5 + addsd %xmm4, %xmm5 + + andps %xmm15, %xmm6 + andps %xmm15, %xmm7 + addsd %xmm6, %xmm7 + + maxsd %xmm5, %xmm0 + maxsd %xmm7, %xmm1 + ALIGN_3 + +.L27: + testq $1, M + jle .L998 + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + + andps %xmm15, %xmm4 + andps %xmm15, %xmm5 + + addsd %xmm4, %xmm5 + maxsd %xmm5, %xmm0 + ALIGN_3 + +.L998: + maxsd %xmm1, %xmm0 + ALIGN_4 + +.L999: + + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/zamax_sse.S b/kernel/x86_64/zamax_sse.S new file mode 100644 index 0000000000..5566a35a3f --- /dev/null +++ b/kernel/x86_64/zamax_sse.S @@ -0,0 +1,309 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ + +#define I %rax + +#ifdef USE_MIN +#define maxps minps +#define maxss minss +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + + SAVEREGISTERS + + pxor %xmm0, %xmm0 + salq $ZBASE_SHIFT, INCX + + testq M, M + jle .L999 + + pcmpeqb %xmm15, %xmm15 + psrld $1, %xmm15 + + movss 0 * SIZE(X), %xmm0 + movss 1 * SIZE(X), %xmm1 + addq INCX, X + decq M + andps %xmm15, %xmm0 + andps %xmm15, %xmm1 + addps %xmm1, %xmm0 + shufps $0, %xmm0, %xmm0 + movaps %xmm0, %xmm1 + cmpq $2 * SIZE, INCX + jne .L40 + +.L30: + movq M, I + sarq $3, I + jle .L35 + ALIGN_4 + +.L31: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd 0 * SIZE(X), %xmm4 + movhps 2 * SIZE(X), %xmm4 + movsd 4 * SIZE(X), %xmm5 + movhps 6 * SIZE(X), %xmm5 + + movaps %xmm4, %xmm6 + + shufps $0x88, %xmm5, %xmm4 + shufps $0xdd, %xmm5, %xmm6 + + andps %xmm15, %xmm4 + andps %xmm15, %xmm6 + addps %xmm6, %xmm4 + maxps %xmm4, %xmm0 + + movsd 8 * SIZE(X), %xmm7 + movhps 10 * SIZE(X), %xmm7 + movsd 12 * SIZE(X), %xmm8 + movhps 14 * SIZE(X), %xmm8 + movaps %xmm7, %xmm9 + + shufps $0x88, %xmm8, %xmm7 + shufps $0xdd, %xmm8, %xmm9 + + andps %xmm15, %xmm7 + andps %xmm15, %xmm9 + addps %xmm9, %xmm7 + maxps %xmm7, %xmm0 + + addq $16 * SIZE, X + decq I + jg .L31 + ALIGN_4 + +.L35: + andq $7, M + jle .L998 + + testq $4, M + je .L36 + + movsd 0 * SIZE(X), %xmm4 + movhps 2 * SIZE(X), %xmm4 + movsd 4 * SIZE(X), %xmm5 + movhps 6 * SIZE(X), %xmm5 + movaps %xmm4, %xmm6 + + shufps $0x88, %xmm5, %xmm4 + shufps $0xdd, %xmm5, %xmm6 + + andps %xmm15, %xmm4 + andps %xmm15, %xmm6 + addps %xmm6, %xmm4 + maxps %xmm4, %xmm0 + + addq $8 * SIZE, X + ALIGN_3 + +.L36: + testq $2, M + je .L37 + + movss 0 * SIZE(X), %xmm4 + movss 1 * SIZE(X), %xmm5 + movss 2 * SIZE(X), %xmm6 + movss 3 * SIZE(X), %xmm7 + andps %xmm15, %xmm4 + andps %xmm15, %xmm5 + andps %xmm15, %xmm6 + andps %xmm15, %xmm7 + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + maxss %xmm4, %xmm0 + maxss %xmm6, %xmm1 + addq $4 * SIZE, X + ALIGN_3 + +.L37: + testq $1, M + je .L998 + + movss 0 * SIZE(X), %xmm4 + movss 1 * SIZE(X), %xmm5 + andps %xmm15, %xmm4 + andps %xmm15, %xmm5 + addps %xmm5, %xmm4 + maxss %xmm4, %xmm0 + jmp .L998 + ALIGN_4 + + +.L40: + movq M, I + sarq $3, I + jle .L45 + ALIGN_4 + +.L41: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd 0 * SIZE(X), %xmm4 + addq INCX, X + movhps 0 * SIZE(X), %xmm4 + addq INCX, X + movsd 0 * SIZE(X), %xmm5 + addq INCX, X + movhps 0 * SIZE(X), %xmm5 + addq INCX, X + + movaps %xmm4, 
%xmm6 + + shufps $0x88, %xmm5, %xmm4 + shufps $0xdd, %xmm5, %xmm6 + + andps %xmm15, %xmm4 + andps %xmm15, %xmm6 + addps %xmm6, %xmm4 + maxps %xmm4, %xmm0 + + movsd 0 * SIZE(X), %xmm7 + addq INCX, X + movhps 0 * SIZE(X), %xmm7 + addq INCX, X + movsd 0 * SIZE(X), %xmm8 + addq INCX, X + movhps 0 * SIZE(X), %xmm8 + addq INCX, X + movaps %xmm7, %xmm9 + + shufps $0x88, %xmm8, %xmm7 + shufps $0xdd, %xmm8, %xmm9 + + andps %xmm15, %xmm7 + andps %xmm15, %xmm9 + addps %xmm9, %xmm7 + maxps %xmm7, %xmm0 + + decq I + jg .L41 + ALIGN_4 + +.L45: + andq $7, M + jle .L998 + + testq $4, M + je .L46 + + movsd 0 * SIZE(X), %xmm4 + addq INCX, X + movhps 0 * SIZE(X), %xmm4 + addq INCX, X + movsd 0 * SIZE(X), %xmm5 + addq INCX, X + movhps 0 * SIZE(X), %xmm5 + addq INCX, X + movaps %xmm4, %xmm6 + + shufps $0x88, %xmm5, %xmm4 + shufps $0xdd, %xmm5, %xmm6 + + andps %xmm15, %xmm4 + andps %xmm15, %xmm6 + addps %xmm6, %xmm4 + maxps %xmm4, %xmm0 + ALIGN_3 + +.L46: + testq $2, M + je .L47 + + movss 0 * SIZE(X), %xmm4 + movss 1 * SIZE(X), %xmm5 + addq INCX, X + movss 0 * SIZE(X), %xmm6 + movss 1 * SIZE(X), %xmm7 + addq INCX, X + andps %xmm15, %xmm4 + andps %xmm15, %xmm5 + andps %xmm15, %xmm6 + andps %xmm15, %xmm7 + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + maxss %xmm4, %xmm0 + maxss %xmm6, %xmm1 + ALIGN_3 + +.L47: + testq $1, M + je .L998 + + movss 0 * SIZE(X), %xmm4 + movss 1 * SIZE(X), %xmm5 + andps %xmm15, %xmm4 + andps %xmm15, %xmm5 + addps %xmm5, %xmm4 + maxss %xmm4, %xmm0 + jmp .L998 + ALIGN_4 + +.L998: + maxps %xmm1, %xmm0 + movaps %xmm0, %xmm1 + movhlps %xmm0, %xmm0 + maxps %xmm1, %xmm0 + movaps %xmm0, %xmm1 + shufps $1, %xmm0, %xmm0 + maxss %xmm1, %xmm0 + ALIGN_4 + +.L999: + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/zamax_sse2.S b/kernel/x86_64/zamax_sse2.S new file mode 100644 index 0000000000..eb8fd43795 --- /dev/null +++ b/kernel/x86_64/zamax_sse2.S @@ -0,0 +1,341 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ + +#define I %rax + +#ifdef USE_MIN +#define maxpd minpd +#define maxsd minsd +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + + SAVEREGISTERS + + pxor %xmm0, %xmm0 + testq M, M + jle .L999 + testq INCX, INCX + jle .L999 + + salq $ZBASE_SHIFT, INCX + + pcmpeqb %xmm15, %xmm15 + psrlq $1, %xmm15 + + movsd 0 * SIZE(X), %xmm0 + movsd 1 * SIZE(X), %xmm1 + addq INCX, X + decq M + andpd %xmm15, %xmm0 + andpd %xmm15, %xmm1 + addpd %xmm1, %xmm0 + unpcklpd %xmm0, %xmm0 + movapd %xmm0, %xmm1 + movapd %xmm0, %xmm2 + movapd %xmm0, %xmm3 + + cmpq $2 * SIZE, INCX + jne .L40 + +.L30: + movq M, I + sarq $3, I + jle .L35 + ALIGN_4 + +.L31: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + movhpd 2 * SIZE(X), %xmm4 + movhpd 3 * SIZE(X), %xmm5 + movsd 4 * SIZE(X), %xmm6 + movsd 5 * SIZE(X), %xmm7 + movhpd 6 * SIZE(X), %xmm6 + movhpd 7 * SIZE(X), %xmm7 + + andpd %xmm15, %xmm4 + andpd %xmm15, %xmm5 + addpd %xmm5, %xmm4 + maxpd %xmm4, %xmm0 + + andpd %xmm15, %xmm6 + andpd %xmm15, %xmm7 + addpd %xmm7, %xmm6 + maxpd %xmm6, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movsd 8 * SIZE(X), %xmm4 + movsd 9 * SIZE(X), %xmm5 + movhpd 10 * SIZE(X), %xmm4 + movhpd 11 * SIZE(X), %xmm5 + movsd 12 * SIZE(X), %xmm6 + movsd 13 * SIZE(X), %xmm7 + movhpd 14 * SIZE(X), %xmm6 + movhpd 15 * SIZE(X), %xmm7 + + andpd %xmm15, %xmm4 + andpd %xmm15, %xmm5 + addpd %xmm5, %xmm4 + maxpd %xmm4, %xmm2 + + andpd %xmm15, %xmm6 + andpd %xmm15, %xmm7 + addpd %xmm7, %xmm6 + maxpd %xmm6, %xmm3 + + addq $16 * SIZE, X + decq I + jg .L31 + ALIGN_4 + +.L35: + andq $7, M + jle .L998 + + testq $4, M + je .L36 + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + movhpd 2 * SIZE(X), %xmm4 + movhpd 3 * SIZE(X), %xmm5 + movsd 4 * SIZE(X), %xmm6 + movsd 5 * SIZE(X), %xmm7 + movhpd 6 * SIZE(X), %xmm6 + movhpd 7 * SIZE(X), %xmm7 + + andpd %xmm15, %xmm4 + andpd %xmm15, %xmm5 + andpd %xmm15, %xmm6 + andpd %xmm15, %xmm7 + + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + maxpd %xmm4, %xmm0 + maxpd %xmm6, %xmm1 + + addq $8 * SIZE, X + ALIGN_3 + +.L36: + testq $2, M + je .L37 + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + movhpd 2 * SIZE(X), %xmm4 + movhpd 3 * SIZE(X), %xmm5 + addq $4 * SIZE, X + + andpd %xmm15, %xmm4 + andpd %xmm15, %xmm5 + addpd %xmm5, %xmm4 + maxpd %xmm4, %xmm0 + ALIGN_3 + +.L37: + testq $1, M + je .L998 + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + andpd %xmm15, %xmm4 + andpd %xmm15, %xmm5 + addpd %xmm5, %xmm4 + maxsd %xmm4, %xmm2 + jmp .L998 + ALIGN_4 + + +.L40: + movq M, I + sarq $3, I + jle .L45 + ALIGN_4 + +.L41: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + addq INCX, X + movhpd 0 * SIZE(X), %xmm4 + movhpd 1 * SIZE(X), %xmm5 + addq INCX, X + movsd 0 * SIZE(X), %xmm6 + movsd 1 * SIZE(X), %xmm7 + addq INCX, X + movhpd 0 * SIZE(X), %xmm6 + movhpd 1 * SIZE(X), %xmm7 + addq INCX, X + + andpd %xmm15, %xmm4 + andpd %xmm15, %xmm5 + addpd 
%xmm5, %xmm4 + maxpd %xmm4, %xmm0 + + andpd %xmm15, %xmm6 + andpd %xmm15, %xmm7 + addpd %xmm7, %xmm6 + maxpd %xmm6, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + addq INCX, X + movhpd 0 * SIZE(X), %xmm4 + movhpd 1 * SIZE(X), %xmm5 + addq INCX, X + movsd 0 * SIZE(X), %xmm6 + movsd 1 * SIZE(X), %xmm7 + addq INCX, X + movhpd 0 * SIZE(X), %xmm6 + movhpd 1 * SIZE(X), %xmm7 + addq INCX, X + + andpd %xmm15, %xmm4 + andpd %xmm15, %xmm5 + addpd %xmm5, %xmm4 + maxpd %xmm4, %xmm2 + + andpd %xmm15, %xmm6 + andpd %xmm15, %xmm7 + addpd %xmm7, %xmm6 + maxpd %xmm6, %xmm3 + + decq I + jg .L41 + ALIGN_4 + +.L45: + andq $7, M + jle .L998 + + testq $4, M + je .L46 + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + addq INCX, X + movhpd 0 * SIZE(X), %xmm4 + movhpd 1 * SIZE(X), %xmm5 + addq INCX, X + movsd 0 * SIZE(X), %xmm6 + movsd 1 * SIZE(X), %xmm7 + addq INCX, X + movhpd 0 * SIZE(X), %xmm6 + movhpd 1 * SIZE(X), %xmm7 + addq INCX, X + + andpd %xmm15, %xmm4 + andpd %xmm15, %xmm5 + andpd %xmm15, %xmm6 + andpd %xmm15, %xmm7 + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + maxpd %xmm4, %xmm0 + maxpd %xmm6, %xmm1 + ALIGN_3 + +.L46: + testq $2, M + je .L47 + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + addq INCX, X + movhpd 0 * SIZE(X), %xmm4 + movhpd 1 * SIZE(X), %xmm5 + addq INCX, X + andpd %xmm15, %xmm4 + andpd %xmm15, %xmm5 + addpd %xmm5, %xmm4 + maxpd %xmm4, %xmm2 + ALIGN_3 + +.L47: + testq $1, M + je .L998 + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + andpd %xmm15, %xmm4 + andpd %xmm15, %xmm5 + addpd %xmm5, %xmm4 + maxsd %xmm4, %xmm3 + jmp .L998 + ALIGN_4 + +.L998: + maxpd %xmm1, %xmm0 + maxpd %xmm3, %xmm2 + maxpd %xmm2, %xmm0 + movapd %xmm0, %xmm1 + unpckhpd %xmm0, %xmm0 + maxsd %xmm1, %xmm0 + ALIGN_4 + +.L999: + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/zasum.S b/kernel/x86_64/zasum.S new file mode 100644 index 0000000000..b94e49bf0a --- /dev/null +++ b/kernel/x86_64/zasum.S @@ -0,0 +1,200 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 +#define X ARG2 +#define INCX ARG3 + +#define I %rax + +#include "l1param.h" + + PROLOGUE + PROFCODE + + fldz + testq M, M + jle .L999 + testq INCX, INCX + jle .L999 + + salq $ZBASE_SHIFT, INCX + + fldz + fldz + fldz + cmpq $SIZE * 2, INCX + jne .L40 + + movq M, I + sarq $2, I + jle .L20 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + FLD 2 * SIZE(X) + fabs + FLD 3 * SIZE(X) + fabs + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD 4 * SIZE(X) + fabs + FLD 5 * SIZE(X) + fabs + FLD 6 * SIZE(X) + fabs + FLD 7 * SIZE(X) + fabs + + addq $8 * SIZE, X + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decq I + jg .L10 + ALIGN_4 + +.L20: + andq $3, M + jle .L998 + ALIGN_4 + + +.L21: + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + fabs + faddp %st,%st(3) + faddp %st,%st(1) + addq $2 * SIZE, X + decq M + jg .L21 + jmp .L998 + ALIGN_4 + +.L40: + movq M, I + sarq $2, I + jle .L60 + ALIGN_4 + +.L50: + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + addq INCX, X + fabs + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + addq INCX, X + fabs + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + addq INCX, X + fabs + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + addq INCX, X + fabs + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decq I + jg .L50 + ALIGN_4 + +.L60: + andq $3, M + jle .L998 + ALIGN_4 + + +.L61: + FLD 0 * SIZE(X) + fabs + FLD 1 * SIZE(X) + addq INCX, X + fabs + faddp %st,%st(3) + faddp %st,%st(1) + decq M + jg .L61 + ALIGN_4 + +.L998: + faddp %st,%st(2) + faddp %st,%st(1) + faddp %st,%st(1) + ALIGN_4 + +.L999: + ret + + EPILOGUE diff --git a/kernel/x86_64/zasum_atom.S b/kernel/x86_64/zasum_atom.S new file mode 100644 index 0000000000..ab83809d22 --- /dev/null +++ b/kernel/x86_64/zasum_atom.S @@ -0,0 +1,411 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ + +#define I %rax + +#include "l1param.h" + + PROLOGUE + PROFCODE + + SAVEREGISTERS + + xorps %xmm0, %xmm0 + + testq M, M + jle .L999 + testq INCX, INCX + jle .L999 + + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + pcmpeqb %xmm15, %xmm15 + psrlq $1, %xmm15 + + salq $ZBASE_SHIFT, INCX + xorps %xmm13, %xmm13 + + cmpq $2 * SIZE, INCX + jne .L20 + + addq M, M + + testq $SIZE, X + je .L05 + + movsd (X), %xmm0 + addq $SIZE, X + andps %xmm15, %xmm0 + decq M + ALIGN_3 + +.L05: + subq $-16 * SIZE, X + + movq M, I + sarq $4, I + jle .L12 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + movaps -12 * SIZE(X), %xmm6 + movaps -10 * SIZE(X), %xmm7 + + movaps -8 * SIZE(X), %xmm8 + movaps -6 * SIZE(X), %xmm9 + movaps -4 * SIZE(X), %xmm10 + movaps -2 * SIZE(X), %xmm11 + + decq I + jle .L11 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + andps %xmm15, %xmm4 + addsd %xmm13, %xmm3 + pshufd $0x4e, %xmm4, %xmm12 + addsd %xmm4, %xmm0 + movaps 0 * SIZE(X), %xmm4 + + andps %xmm15, %xmm5 + addsd %xmm12, %xmm1 + pshufd $0x4e, %xmm5, %xmm13 + addsd %xmm5, %xmm2 + movaps 2 * SIZE(X), %xmm5 + + andps %xmm15, %xmm6 + addsd %xmm13, %xmm3 + pshufd $0x4e, %xmm6, %xmm12 + addsd %xmm6, %xmm0 + movaps 4 * SIZE(X), %xmm6 + + andps %xmm15, %xmm7 + addsd %xmm12, %xmm1 + pshufd $0x4e, %xmm7, %xmm13 + addsd %xmm7, %xmm2 + movaps 6 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + andps %xmm15, %xmm8 + addsd %xmm13, %xmm3 + pshufd $0x4e, %xmm8, %xmm12 + addsd %xmm8, %xmm0 + movaps 8 * SIZE(X), %xmm8 + + andps %xmm15, %xmm9 + addsd %xmm12, %xmm1 + pshufd $0x4e, %xmm9, %xmm13 + addsd %xmm9, %xmm2 + movaps 10 * SIZE(X), %xmm9 + + andps %xmm15, %xmm10 + addsd %xmm13, %xmm3 + pshufd $0x4e, %xmm10, %xmm12 + addsd %xmm10, %xmm0 + movaps 12 * SIZE(X), %xmm10 + + andps %xmm15, %xmm11 + addsd %xmm12, %xmm1 + pshufd $0x4e, %xmm11, %xmm13 + addsd %xmm11, %xmm2 + movaps 
14 * SIZE(X), %xmm11 + + subq $-16 * SIZE, X + decq I + jg .L10 + ALIGN_4 + +.L11: + andps %xmm15, %xmm4 + addsd %xmm13, %xmm3 + pshufd $0x4e, %xmm4, %xmm12 + addsd %xmm4, %xmm0 + + andps %xmm15, %xmm5 + addsd %xmm12, %xmm1 + pshufd $0x4e, %xmm5, %xmm13 + addsd %xmm5, %xmm2 + + andps %xmm15, %xmm6 + addsd %xmm13, %xmm3 + pshufd $0x4e, %xmm6, %xmm12 + addsd %xmm6, %xmm0 + + andps %xmm15, %xmm7 + addsd %xmm12, %xmm1 + pshufd $0x4e, %xmm7, %xmm13 + addsd %xmm7, %xmm2 + + andps %xmm15, %xmm8 + addsd %xmm13, %xmm3 + pshufd $0x4e, %xmm8, %xmm12 + addsd %xmm8, %xmm0 + + andps %xmm15, %xmm9 + addsd %xmm12, %xmm1 + pshufd $0x4e, %xmm9, %xmm13 + addsd %xmm9, %xmm2 + + andps %xmm15, %xmm10 + addsd %xmm13, %xmm3 + pshufd $0x4e, %xmm10, %xmm12 + addsd %xmm10, %xmm0 + + andps %xmm15, %xmm11 + addsd %xmm12, %xmm1 + pshufd $0x4e, %xmm11, %xmm13 + addsd %xmm11, %xmm2 + + addsd %xmm13, %xmm3 + subq $-16 * SIZE, X + ALIGN_3 + +.L12: + andq $15, M + jle .L998 + + testq $8, M + je .L13 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + movaps -12 * SIZE(X), %xmm6 + movaps -10 * SIZE(X), %xmm7 + addq $8 * SIZE, X + + andps %xmm15, %xmm4 + pshufd $0x4e, %xmm4, %xmm12 + addsd %xmm4, %xmm0 + andps %xmm15, %xmm5 + addsd %xmm12, %xmm1 + pshufd $0x4e, %xmm5, %xmm13 + addsd %xmm5, %xmm2 + addsd %xmm13, %xmm3 + andps %xmm15, %xmm6 + pshufd $0x4e, %xmm6, %xmm12 + addsd %xmm6, %xmm0 + andps %xmm15, %xmm7 + addsd %xmm12, %xmm1 + pshufd $0x4e, %xmm7, %xmm13 + addsd %xmm7, %xmm2 + addsd %xmm13, %xmm3 + ALIGN_3 + +.L13: + testq $4, M + je .L14 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + addq $4 * SIZE, X + + andps %xmm15, %xmm4 + pshufd $0x4e, %xmm4, %xmm12 + addsd %xmm4, %xmm0 + andps %xmm15, %xmm5 + addsd %xmm12, %xmm1 + pshufd $0x4e, %xmm5, %xmm13 + addsd %xmm5, %xmm2 + addsd %xmm13, %xmm3 + ALIGN_3 + +.L14: + testq $2, M + je .L15 + + movaps -16 * SIZE(X), %xmm4 + addq $2 * SIZE, X + andps %xmm15, %xmm4 + + pshufd $0x4e, %xmm4, %xmm5 + addsd %xmm4, %xmm2 + addsd %xmm5, %xmm3 + ALIGN_3 + +.L15: + testq $1, M + je .L998 + + movsd -16 * SIZE(X), %xmm4 + andps %xmm15, %xmm4 + addsd %xmm4, %xmm0 + jmp .L998 + ALIGN_3 + +.L20: + movq M, I + sarq $2, I + jle .L25 + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + addq INCX, X + movsd 0 * SIZE(X), %xmm6 + movsd 1 * SIZE(X), %xmm7 + addq INCX, X + + movsd 0 * SIZE(X), %xmm8 + movsd 1 * SIZE(X), %xmm9 + addq INCX, X + movsd 0 * SIZE(X), %xmm10 + movsd 1 * SIZE(X), %xmm11 + + decq I + jle .L23 + ALIGN_4 + +.L22: + andps %xmm15, %xmm4 + addq INCX, X + addsd %xmm4, %xmm0 + movsd 0 * SIZE(X), %xmm4 + andps %xmm15, %xmm5 + addsd %xmm5, %xmm1 + movsd 1 * SIZE(X), %xmm5 + andps %xmm15, %xmm6 + addq INCX, X + addsd %xmm6, %xmm2 + movsd 0 * SIZE(X), %xmm6 + andps %xmm15, %xmm7 + addsd %xmm7, %xmm3 + movsd 1 * SIZE(X), %xmm7 + + andps %xmm15, %xmm8 + addq INCX, X + addsd %xmm8, %xmm0 + movsd 0 * SIZE(X), %xmm8 + andps %xmm15, %xmm9 + addsd %xmm9, %xmm1 + movsd 1 * SIZE(X), %xmm9 + andps %xmm15, %xmm10 + addq INCX, X + addsd %xmm10, %xmm2 + movsd 0 * SIZE(X), %xmm10 + andps %xmm15, %xmm11 + addsd %xmm11, %xmm3 + movsd 1 * SIZE(X), %xmm11 + + decq I + jg .L22 + ALIGN_4 + +.L23: + andps %xmm15, %xmm4 + addq INCX, X + addsd %xmm4, %xmm0 + andps %xmm15, %xmm5 + addsd %xmm5, %xmm1 + andps %xmm15, %xmm6 + addsd %xmm6, %xmm2 + andps %xmm15, %xmm7 + addsd %xmm7, %xmm3 + + andps %xmm15, %xmm8 + addsd %xmm8, %xmm0 + andps %xmm15, %xmm9 + addsd %xmm9, %xmm1 + andps %xmm15, %xmm10 + addsd %xmm10, %xmm2 + andps %xmm15, %xmm11 + addsd %xmm11, %xmm3 + ALIGN_3 + 
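+/* Note: the strided path above keeps four independent partial sums        */
+/* (xmm0..xmm3) of the absolute real and imaginary parts; the blocks below */
+/* consume the remaining 1-3 complex elements before .L998 folds the       */
+/* partials into xmm0. A rough C sketch of what the kernel as a whole      */
+/* computes, assuming the usual BLAS complex-asum convention (x advances   */
+/* by incx complex elements, i.e. 2*incx scalars), is:                     */
+/*                                                                         */
+/*   FLOAT sum = 0.0;                                                      */
+/*   for (long i = 0; i < m; i++, x += incx * 2)                           */
+/*     sum += fabs(x[0]) + fabs(x[1]);                                     */
+/*   return sum;                                                           */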
+.L25: + testq $2, M + je .L26 + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + addq INCX, X + movsd 0 * SIZE(X), %xmm6 + andps %xmm15, %xmm4 + addsd %xmm4, %xmm0 + movsd 1 * SIZE(X), %xmm7 + andps %xmm15, %xmm5 + addsd %xmm5, %xmm1 + addq INCX, X + + andps %xmm15, %xmm6 + addsd %xmm6, %xmm2 + andps %xmm15, %xmm7 + addsd %xmm7, %xmm3 + ALIGN_3 + +.L26: + testq $1, M + je .L998 + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + addq INCX, X + + andps %xmm15, %xmm4 + andps %xmm15, %xmm5 + + addsd %xmm4, %xmm0 + addsd %xmm5, %xmm1 + ALIGN_3 + +.L998: + addsd %xmm1, %xmm0 + addsd %xmm3, %xmm2 + addsd %xmm2, %xmm0 + ALIGN_4 + +.L999: + RESTOREREGISTERS + + ret + + EPILOGUE + diff --git a/kernel/x86_64/zasum_sse.S b/kernel/x86_64/zasum_sse.S new file mode 100644 index 0000000000..7f3d3d12d2 --- /dev/null +++ b/kernel/x86_64/zasum_sse.S @@ -0,0 +1,332 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ + +#define I %rax + +#include "l1param.h" + + PROLOGUE + PROFCODE + + SAVEREGISTERS + + pxor %xmm0, %xmm0 + testq M, M + jle .L999 + testq INCX, INCX + jle .L999 + + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + + pcmpeqb %xmm15, %xmm15 + psrld $1, %xmm15 + + salq $ZBASE_SHIFT, INCX + + cmpq $2 * SIZE, INCX + jne .L100 + + subq $-32 * SIZE, X + addq M, M + + cmpq $3, M + jle .L18 + + testq $4, X + je .L05 + movss -32 * SIZE(X), %xmm0 + andps %xmm15, %xmm0 + addq $SIZE, X + decq M + jle .L998 + ALIGN_3 + +.L05: + testq $8, X + je .L10 + +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(X), %xmm1 + andps %xmm15, %xmm1 + addq $2 * SIZE, X + subq $2, M + jle .L998 + ALIGN_3 + +.L10: + movq M, I + sarq $5, I + jle .L14 + + movaps -32 * SIZE(X), %xmm4 + movaps -28 * SIZE(X), %xmm5 + movaps -24 * SIZE(X), %xmm6 + movaps -20 * SIZE(X), %xmm7 + + movaps -16 * SIZE(X), %xmm8 + movaps -12 * SIZE(X), %xmm9 + movaps -8 * SIZE(X), %xmm10 + movaps -4 * SIZE(X), %xmm11 + decq I + jle .L12 + ALIGN_3 + +.L11: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + andps %xmm15, %xmm4 + addps %xmm4, %xmm0 + movaps 0 * SIZE(X), %xmm4 + + andps %xmm15, %xmm5 + addps %xmm5, %xmm1 + movaps 4 * SIZE(X), %xmm5 + + andps %xmm15, %xmm6 + addps %xmm6, %xmm2 + movaps 8 * SIZE(X), %xmm6 + + andps %xmm15, %xmm7 + addps %xmm7, %xmm3 + movaps 12 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + andps %xmm15, %xmm8 + addps %xmm8, %xmm0 + movaps 16 * SIZE(X), %xmm8 + + andps %xmm15, %xmm9 + addps %xmm9, %xmm1 + movaps 20 * SIZE(X), %xmm9 + + andps %xmm15, %xmm10 + addps %xmm10, %xmm2 + movaps 24 * SIZE(X), %xmm10 + + andps %xmm15, %xmm11 + addps %xmm11, %xmm3 + movaps 28 * SIZE(X), %xmm11 + + subq $-32 * SIZE, X + decq I + jg .L11 + ALIGN_3 + +.L12: + andps %xmm15, %xmm4 + addps %xmm4, %xmm0 + andps %xmm15, %xmm5 + addps %xmm5, %xmm1 + + andps %xmm15, %xmm6 + addps %xmm6, %xmm2 + andps %xmm15, %xmm7 + addps %xmm7, %xmm3 + + andps %xmm15, %xmm8 + addps %xmm8, %xmm0 + andps %xmm15, %xmm9 + addps %xmm9, %xmm1 + + andps %xmm15, %xmm10 + addps %xmm10, %xmm2 + andps %xmm15, %xmm11 + addps %xmm11, %xmm3 + + addq $32 * SIZE, X + ALIGN_3 + +.L14: + testq $31, M + jle .L998 + +.L15: + testq $16, M + je .L16 + + movaps -32 * SIZE(X), %xmm4 + andps %xmm15, %xmm4 + addps %xmm4, %xmm0 + + movaps -28 * SIZE(X), %xmm5 + andps %xmm15, %xmm5 + addps %xmm5, %xmm1 + + movaps -24 * SIZE(X), %xmm4 + andps %xmm15, %xmm4 + addps %xmm4, %xmm0 + + movaps -20 * SIZE(X), %xmm5 + andps %xmm15, %xmm5 + addps %xmm5, %xmm1 + + addq $16 * SIZE, X + ALIGN_3 + +.L16: + testq $8, M + je .L17 + + movaps -32 * SIZE(X), %xmm4 + andps %xmm15, %xmm4 + addps %xmm4, %xmm0 + + movaps -28 * SIZE(X), %xmm5 + andps %xmm15, %xmm5 + addps %xmm5, %xmm1 + + addq $8 * SIZE, X + ALIGN_3 + +.L17: + testq $4, M + je .L18 + + movaps -32 * SIZE(X), %xmm6 + andps %xmm15, %xmm6 + addps %xmm6, %xmm2 + addq $4 * SIZE, X + ALIGN_3 + +.L18: + testq $2, M + je .L19 + +#ifdef movsd + xorps %xmm7, %xmm7 +#endif + movsd -32 * SIZE(X), %xmm7 + andps %xmm15, %xmm7 + addps %xmm7, %xmm3 + addq $2 * SIZE, X + ALIGN_3 + +.L19: + testq $1, M + je .L998 + + movss -32 * SIZE(X), %xmm6 + andps %xmm15, %xmm6 + addps %xmm6, %xmm2 + jmp .L998 + ALIGN_4 + +.L100: + movq M, I + sarq $2, I + 
jle .L105 + ALIGN_4 + +.L101: + movsd (X), %xmm4 + addq INCX, X + movhps (X), %xmm4 + addq INCX, X + + andps %xmm15, %xmm4 + addps %xmm4, %xmm0 + + movsd (X), %xmm5 + addq INCX, X + movhps (X), %xmm5 + addq INCX, X + + andps %xmm15, %xmm5 + addps %xmm5, %xmm1 + + decq I + jg .L101 + ALIGN_4 + +.L105: +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + andq $3, M + jle .L998 + ALIGN_4 + +.L106: + movsd (X), %xmm4 + andps %xmm15, %xmm4 + addps %xmm4, %xmm0 + addq INCX, X + decq M + jg .L106 + ALIGN_4 + +.L998: + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + addps %xmm2, %xmm0 + +#ifndef HAVE_SSE3 + movhlps %xmm0, %xmm1 + addps %xmm1, %xmm0 + + movaps %xmm0, %xmm1 + shufps $1, %xmm0, %xmm0 + addss %xmm1, %xmm0 +#else + haddps %xmm0, %xmm0 + haddps %xmm0, %xmm0 +#endif + ALIGN_4 + +.L999: + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/zasum_sse2.S b/kernel/x86_64/zasum_sse2.S new file mode 100644 index 0000000000..9d0ec2e48d --- /dev/null +++ b/kernel/x86_64/zasum_sse2.S @@ -0,0 +1,318 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ + +#define I %rax + +#include "l1param.h" + + PROLOGUE + PROFCODE + + SAVEREGISTERS + + xorps %xmm0, %xmm0 + testq M, M + jle .L999 + testq INCX, INCX + jle .L999 + + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + + pcmpeqb %xmm15, %xmm15 + psrlq $1, %xmm15 + + salq $ZBASE_SHIFT, INCX + + cmpq $2 * SIZE, INCX + jne .L40 + + subq $-16 * SIZE, X + addq M, M + + testq $SIZE, X + je .L05 + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd -16 * SIZE(X), %xmm0 + addq $SIZE, X + + andps %xmm15, %xmm0 + subq $1, M + jle .L999 + ALIGN_3 + +.L05: + movq M, I + sarq $4, I + jle .L20 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + movaps -12 * SIZE(X), %xmm6 + movaps -10 * SIZE(X), %xmm7 + + movaps -8 * SIZE(X), %xmm8 + movaps -6 * SIZE(X), %xmm9 + movaps -4 * SIZE(X), %xmm10 + movaps -2 * SIZE(X), %xmm11 + + decq I + jle .L11 + ALIGN_4 + +.L10: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + andps %xmm15, %xmm4 + addpd %xmm4, %xmm0 + movaps 0 * SIZE(X), %xmm4 + + andps %xmm15, %xmm5 + addpd %xmm5, %xmm1 + movaps 2 * SIZE(X), %xmm5 + + andps %xmm15, %xmm6 + addpd %xmm6, %xmm2 + movaps 4 * SIZE(X), %xmm6 + + andps %xmm15, %xmm7 + addpd %xmm7, %xmm3 + movaps 6 * SIZE(X), %xmm7 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + andps %xmm15, %xmm8 + addpd %xmm8, %xmm0 + movaps 8 * SIZE(X), %xmm8 + + andps %xmm15, %xmm9 + addpd %xmm9, %xmm1 + movaps 10 * SIZE(X), %xmm9 + + andps %xmm15, %xmm10 + addpd %xmm10, %xmm2 + movaps 12 * SIZE(X), %xmm10 + + andps %xmm15, %xmm11 + addpd %xmm11, %xmm3 + movaps 14 * SIZE(X), %xmm11 + + subq $-16 * SIZE, X + decq I + jg .L10 + ALIGN_4 + +.L11: + andps %xmm15, %xmm4 + andps %xmm15, %xmm5 + andps %xmm15, %xmm6 + andps %xmm15, %xmm7 + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm1 + addpd %xmm6, %xmm2 + addpd %xmm7, %xmm3 + + andps %xmm15, %xmm8 + andps %xmm15, %xmm9 + andps %xmm15, %xmm10 + andps %xmm15, %xmm11 + + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm1 + addpd %xmm10, %xmm2 + addpd %xmm11, %xmm3 + + subq $-16 * SIZE, X + ALIGN_3 + +.L20: + andq $15, M + jle .L998 + + testq $8, M + je .L21 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + movaps -12 * SIZE(X), %xmm6 + movaps -10 * SIZE(X), %xmm7 + + andps %xmm15, %xmm4 + andps %xmm15, %xmm5 + andps %xmm15, %xmm6 + andps %xmm15, %xmm7 + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm1 + addpd %xmm6, %xmm2 + addpd %xmm7, %xmm3 + addq $8 * SIZE, X + ALIGN_3 + +.L21: + testq $4, M + je .L22 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + + andps %xmm15, %xmm4 + andps %xmm15, %xmm5 + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm1 + + addq $4 * SIZE, X + ALIGN_3 + +.L22: + testq $2, M + je .L23 + + movaps -16 * SIZE(X), %xmm6 + andps %xmm15, %xmm6 + addpd %xmm6, %xmm3 + addq $2 * SIZE, X + +.L23: + testq $1, M + je .L998 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -16 * SIZE(X), %xmm4 + andps %xmm15, %xmm4 + addsd %xmm4, %xmm0 + jmp .L998 + ALIGN_3 + + +.L40: + movq M, I + sarq $2, I + jle .L60 + ALIGN_4 + +.L50: +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + prefetcht0 PREFETCHSIZE * SIZE(X) +#endif + +#ifdef PENTIUM4 + prefetchnta PREFETCHSIZE * SIZE(X) +#endif + + movsd 0 * SIZE(X), %xmm4 + movhpd 1 * SIZE(X), %xmm4 + addq INCX, X + andpd %xmm15, %xmm4 + addpd %xmm4, %xmm0 
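+/* (each of the four strided loads in this unrolled iteration clears the   */
+/* sign bits with the xmm15 mask and adds the element's |Re|/|Im| pair     */
+/* into its own accumulator; .L998 sums the accumulators and finishes      */
+/* with a horizontal add)                                                  */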
+ + movsd 0 * SIZE(X), %xmm5 + movhpd 1 * SIZE(X), %xmm5 + addq INCX, X + andpd %xmm15, %xmm5 + addpd %xmm5, %xmm1 + + movsd 0 * SIZE(X), %xmm6 + movhpd 1 * SIZE(X), %xmm6 + addq INCX, X + andpd %xmm15, %xmm6 + addpd %xmm6, %xmm2 + + movsd 0 * SIZE(X), %xmm7 + movhpd 1 * SIZE(X), %xmm7 + addq INCX, X + andpd %xmm15, %xmm7 + addpd %xmm7, %xmm3 + + decq I + jg .L50 + ALIGN_4 + +.L60: + andq $3, M + jle .L998 + ALIGN_4 + + +.L61: + movsd 0 * SIZE(X), %xmm4 + movhpd 1 * SIZE(X), %xmm4 + andpd %xmm15, %xmm4 + addpd %xmm4, %xmm0 + addq INCX, X + decq M + jg .L61 + ALIGN_4 + +.L998: + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + +#ifndef HAVE_SSE3 + movhlps %xmm0, %xmm1 + addsd %xmm1, %xmm0 +#else + haddpd %xmm0, %xmm0 +#endif + ALIGN_4 + +.L999: + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/zaxpy.S b/kernel/x86_64/zaxpy.S new file mode 100644 index 0000000000..266c1477d1 --- /dev/null +++ b/kernel/x86_64/zaxpy.S @@ -0,0 +1,336 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG4 /* rsi */ +#define INCX ARG5 /* rdx */ +#define Y ARG6 /* rcx */ +#define INCY ARG2 /* r8 */ + +#ifndef CONJ +#define ADD1 fsubrp +#define ADD2 faddp +#else +#define ADD1 faddp +#define ADD2 fsubrp +#endif + +#define ALPHA_R 8(%rsp) +#define ALPHA_I 24(%rsp) + +#include "l1param.h" + + PROLOGUE + PROFCODE + + FLD ALPHA_I + FLD ALPHA_R + + movq 40(%rsp), INCY + + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + + testq M, M + jle .L40 + + cmpq $2 * SIZE, INCX + jne .L14 + cmpq $2 * SIZE, INCY + jne .L14 + + movq M, %rax + sarq $2, %rax + jle .L15 + ALIGN_3 + +.L16: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) + fmul %st(1), %st + FLD 1 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FLD 0 * SIZE(Y) + faddp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 0 * SIZE(X) + fmul %st(2), %st + FLD 1 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FLD 1 * SIZE(Y) + faddp %st, %st(1) + FST 1 * SIZE(Y) + + FLD 2 * SIZE(X) + fmul %st(1), %st + FLD 3 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FLD 2 * SIZE(Y) + faddp %st, %st(1) + FST 2 * SIZE(Y) + + FLD 2 * SIZE(X) + fmul %st(2), %st + FLD 3 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FLD 3 * SIZE(Y) + faddp %st, %st(1) + FST 3 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + FLD 4 * SIZE(X) + fmul %st(1), %st + FLD 5 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FLD 4 * SIZE(Y) + faddp %st, %st(1) + FST 4 * SIZE(Y) + + FLD 4 * SIZE(X) + fmul %st(2), %st + FLD 5 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FLD 5 * SIZE(Y) + faddp %st, %st(1) + FST 5 * SIZE(Y) + + FLD 6 * SIZE(X) + fmul %st(1), %st + FLD 7 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FLD 6 * SIZE(Y) + faddp %st, %st(1) + FST 6 * SIZE(Y) + + FLD 6 * SIZE(X) + fmul %st(2), %st + FLD 7 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FLD 7 * SIZE(Y) + faddp %st, %st(1) + FST 7 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + decq %rax + jg .L16 + ALIGN_3 + +.L15: + movq M, %rax + andq $3, %rax + jle .L40 + ALIGN_3 + +.L22: + FLD 0 * SIZE(X) + fmul %st(1), %st + FLD 1 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FLD 0 * SIZE(Y) + faddp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 0 * SIZE(X) + fmul %st(2), %st + FLD 1 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FLD 1 * SIZE(Y) + faddp %st, %st(1) + FST 1 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + decq %rax + jg .L22 + jmp .L40 + ALIGN_3 + +.L14: + movq M, %rax + sarq $2, %rax + jle .L28 + ALIGN_3 + +.L29: + FLD 0 * SIZE(X) + fmul %st(1), %st + FLD 1 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FLD 0 * SIZE(Y) + faddp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 0 * SIZE(X) + fmul %st(2), %st + FLD 1 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FLD 1 * SIZE(Y) + faddp %st, %st(1) + FST 1 * SIZE(Y) + + addq INCX, X + addq INCY, Y + + FLD 0 * SIZE(X) + fmul %st(1), %st + FLD 1 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FLD 0 * SIZE(Y) + faddp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 0 * SIZE(X) + fmul %st(2), %st + FLD 1 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FLD 1 * SIZE(Y) + faddp %st, %st(1) + FST 1 * SIZE(Y) + + addq INCX, X + addq INCY, Y + + FLD 0 * SIZE(X) + fmul %st(1), %st + FLD 1 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FLD 0 * SIZE(Y) + faddp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 0 * SIZE(X) + fmul %st(2), %st + FLD 1 * 
SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FLD 1 * SIZE(Y) + faddp %st, %st(1) + FST 1 * SIZE(Y) + + addq INCX, X + addq INCY, Y + + FLD 0 * SIZE(X) + fmul %st(1), %st + FLD 1 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FLD 0 * SIZE(Y) + faddp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 0 * SIZE(X) + fmul %st(2), %st + FLD 1 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FLD 1 * SIZE(Y) + faddp %st, %st(1) + FST 1 * SIZE(Y) + + addq INCX, X + addq INCY, Y + + decq %rax + jg .L29 + ALIGN_3 + +.L28: + movq M, %rax + andq $3, %rax + jle .L40 + ALIGN_3 + +.L35: + FLD 0 * SIZE(X) + fmul %st(1), %st + FLD 1 * SIZE(X) + fmul %st(3), %st + ADD1 %st, %st(1) + FLD 0 * SIZE(Y) + faddp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 0 * SIZE(X) + fmul %st(2), %st + FLD 1 * SIZE(X) + fmul %st(2), %st + ADD2 %st, %st(1) + FLD 1 * SIZE(Y) + faddp %st, %st(1) + FST 1 * SIZE(Y) + + addq INCX, X + addq INCY, Y + + decq %rax + jg .L35 + ALIGN_3 + +.L40: + ffreep %st(0) + ffreep %st(0) + ret + + EPILOGUE diff --git a/kernel/x86_64/zaxpy_atom.S b/kernel/x86_64/zaxpy_atom.S new file mode 100644 index 0000000000..e623326f57 --- /dev/null +++ b/kernel/x86_64/zaxpy_atom.S @@ -0,0 +1,675 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef WINDOWS_ABI +#define M ARG1 +#define X ARG4 +#define INCX ARG5 +#define Y ARG6 +#define INCY ARG2 +#else +#define M ARG1 +#define X ARG2 +#define INCX ARG3 +#define Y ARG4 +#define INCY %r10 +#endif + +#define YY %r11 +#define ALPHA_R %xmm14 +#define ALPHA_I %xmm15 + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifndef WINDOWS_ABI +#ifndef XDOUBLE + movq 8(%rsp), INCY +#else + movq 40(%rsp), INCY +#endif +#else + movaps %xmm3, %xmm0 + movsd 40(%rsp), %xmm1 + + movq 48(%rsp), X + movq 56(%rsp), INCX + movq 64(%rsp), Y + movq 72(%rsp), INCY +#endif + + SAVEREGISTERS + +#ifndef CONJ +#define ADD1 subsd +#define ADD2 addsd +#else +#define ADD1 addsd +#define ADD2 subsd +#endif + + salq $ZBASE_SHIFT, INCX + movaps %xmm0, ALPHA_R + salq $ZBASE_SHIFT, INCY + movaps %xmm1, ALPHA_I + + testq M, M + jle .L999 + + cmpq $2 * SIZE, INCX + jne .L20 + cmpq $2 * SIZE, INCY + jne .L20 + + movq M, %rax + sarq $2, %rax + jle .L15 + + movsd 0 * SIZE(X), %xmm0 + movsd 1 * SIZE(X), %xmm1 + movsd 0 * SIZE(Y), %xmm8 + movsd 1 * SIZE(Y), %xmm9 + + movsd 2 * SIZE(X), %xmm4 + movsd 3 * SIZE(X), %xmm5 + movsd 2 * SIZE(Y), %xmm10 + movsd 3 * SIZE(Y), %xmm11 + + movaps %xmm0, %xmm2 + mulsd ALPHA_R, %xmm0 + movaps %xmm1, %xmm3 + mulsd ALPHA_R, %xmm1 + mulsd ALPHA_I, %xmm3 + mulsd ALPHA_I, %xmm2 + + movaps %xmm4, %xmm6 + mulsd ALPHA_R, %xmm4 + addsd %xmm0, %xmm8 + movsd 4 * SIZE(X), %xmm0 + + movaps %xmm5, %xmm7 + mulsd ALPHA_R, %xmm5 + ADD2 %xmm1, %xmm9 + movsd 5 * SIZE(X), %xmm1 + + decq %rax + jle .L12 + ALIGN_3 + +.L11: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + mulsd ALPHA_I, %xmm7 + movsd 4 * SIZE(Y), %xmm12 + ADD1 %xmm3, %xmm8 + + mulsd ALPHA_I, %xmm6 + movsd 5 * SIZE(Y), %xmm13 + addsd %xmm2, %xmm9 + + addsd %xmm4, %xmm10 + movsd 6 * SIZE(X), %xmm4 + movaps %xmm0, %xmm2 + mulsd ALPHA_R, %xmm0 + + ADD2 %xmm5, %xmm11 + movsd 7 * SIZE(X), %xmm5 + movaps %xmm1, %xmm3 + mulsd ALPHA_R, %xmm1 + + ADD1 %xmm7, %xmm10 + movsd %xmm8, 0 * SIZE(Y) + mulsd ALPHA_I, %xmm3 + + addsd %xmm6, %xmm11 + movsd %xmm9, 1 * SIZE(Y) + mulsd ALPHA_I, %xmm2 + + movaps %xmm4, %xmm6 + movsd %xmm10, 2 * SIZE(Y) + mulsd ALPHA_R, %xmm4 + movsd 6 * SIZE(Y), %xmm10 + addsd %xmm0, %xmm12 + movsd 8 * SIZE(X), %xmm0 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps %xmm5, %xmm7 + movsd %xmm11, 3 * SIZE(Y) + mulsd ALPHA_R, %xmm5 + movsd 7 * SIZE(Y), %xmm11 + ADD2 %xmm1, %xmm13 + movsd 9 * SIZE(X), %xmm1 + + mulsd ALPHA_I, %xmm7 + movsd 8 * SIZE(Y), %xmm8 + ADD1 %xmm3, %xmm12 + + mulsd ALPHA_I, %xmm6 + movsd 9 * SIZE(Y), %xmm9 + addsd %xmm2, %xmm13 + + movaps %xmm0, %xmm2 + mulsd ALPHA_R, %xmm0 + addsd %xmm4, %xmm10 + movsd 10 * SIZE(X), %xmm4 + + movaps %xmm1, %xmm3 + mulsd ALPHA_R, %xmm1 + ADD2 %xmm5, %xmm11 + movsd 11 * SIZE(X), %xmm5 + + mulsd ALPHA_I, %xmm3 + movsd %xmm12, 4 * SIZE(Y) + ADD1 %xmm7, %xmm10 + + mulsd ALPHA_I, %xmm2 + movsd %xmm13, 5 * SIZE(Y) + addsd %xmm6, %xmm11 + + movaps %xmm4, %xmm6 + movsd %xmm10, 6 * SIZE(Y) + mulsd ALPHA_R, %xmm4 + addsd %xmm0, %xmm8 + movsd 10 * SIZE(Y), %xmm10 + movsd 12 * SIZE(X), %xmm0 + + movaps %xmm5, %xmm7 + movsd %xmm11, 7 * SIZE(Y) + mulsd ALPHA_R, %xmm5 + movsd 11 * SIZE(Y), %xmm11 + ADD2 %xmm1, %xmm9 + movsd 13 * SIZE(X), %xmm1 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + decq %rax + jg .L11 + ALIGN_3 + +.L12: + mulsd ALPHA_I, %xmm7 + movsd 4 * SIZE(Y), %xmm12 + ADD1 %xmm3, %xmm8 + + mulsd ALPHA_I, 
%xmm6 + movsd 5 * SIZE(Y), %xmm13 + addsd %xmm2, %xmm9 + + addsd %xmm4, %xmm10 + movsd 6 * SIZE(X), %xmm4 + movaps %xmm0, %xmm2 + mulsd ALPHA_R, %xmm0 + + ADD2 %xmm5, %xmm11 + movsd 7 * SIZE(X), %xmm5 + movaps %xmm1, %xmm3 + mulsd ALPHA_R, %xmm1 + + ADD1 %xmm7, %xmm10 + movsd %xmm8, 0 * SIZE(Y) + mulsd ALPHA_I, %xmm3 + + addsd %xmm6, %xmm11 + movsd %xmm9, 1 * SIZE(Y) + mulsd ALPHA_I, %xmm2 + + movaps %xmm4, %xmm6 + movsd %xmm10, 2 * SIZE(Y) + mulsd ALPHA_R, %xmm4 + movsd 6 * SIZE(Y), %xmm10 + addsd %xmm0, %xmm12 + + movaps %xmm5, %xmm7 + movsd %xmm11, 3 * SIZE(Y) + mulsd ALPHA_R, %xmm5 + ADD2 %xmm1, %xmm13 + movsd 7 * SIZE(Y), %xmm11 + + mulsd ALPHA_I, %xmm7 + ADD1 %xmm3, %xmm12 + + mulsd ALPHA_I, %xmm6 + addsd %xmm2, %xmm13 + + movaps %xmm0, %xmm2 + mulsd ALPHA_R, %xmm0 + addsd %xmm4, %xmm10 + + movaps %xmm1, %xmm3 + mulsd ALPHA_R, %xmm1 + ADD2 %xmm5, %xmm11 + + mulsd ALPHA_I, %xmm3 + ADD1 %xmm7, %xmm10 + + addsd %xmm6, %xmm11 + mulsd ALPHA_I, %xmm2 + + movsd %xmm12, 4 * SIZE(Y) + movsd %xmm13, 5 * SIZE(Y) + movsd %xmm10, 6 * SIZE(Y) + movsd %xmm11, 7 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L15: + movq M, %rax + andq $2, %rax + jle .L17 + + movsd 0 * SIZE(X), %xmm0 + movsd 1 * SIZE(X), %xmm1 + movsd 2 * SIZE(X), %xmm4 + movsd 3 * SIZE(X), %xmm5 + + movaps %xmm0, %xmm2 + movsd 0 * SIZE(Y), %xmm8 + mulsd ALPHA_R, %xmm0 + movaps %xmm1, %xmm3 + movsd 1 * SIZE(Y), %xmm9 + mulsd ALPHA_R, %xmm1 + movsd 2 * SIZE(Y), %xmm10 + mulsd ALPHA_I, %xmm3 + movsd 3 * SIZE(Y), %xmm11 + mulsd ALPHA_I, %xmm2 + + movaps %xmm4, %xmm6 + mulsd ALPHA_R, %xmm4 + addsd %xmm0, %xmm8 + + movaps %xmm5, %xmm7 + mulsd ALPHA_R, %xmm5 + ADD2 %xmm1, %xmm9 + + mulsd ALPHA_I, %xmm7 + ADD1 %xmm3, %xmm8 + + mulsd ALPHA_I, %xmm6 + addsd %xmm2, %xmm9 + + addsd %xmm4, %xmm10 + movsd %xmm8, 0 * SIZE(Y) + ADD2 %xmm5, %xmm11 + movsd %xmm9, 1 * SIZE(Y) + ADD1 %xmm7, %xmm10 + addsd %xmm6, %xmm11 + + movsd %xmm10, 2 * SIZE(Y) + movsd %xmm11, 3 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L17: + movq M, %rax + andq $1, %rax + jle .L999 + + movsd 0 * SIZE(X), %xmm0 + movsd 1 * SIZE(X), %xmm1 + movsd 0 * SIZE(Y), %xmm8 + movsd 1 * SIZE(Y), %xmm9 + + movaps %xmm0, %xmm2 + mulsd ALPHA_R, %xmm0 + movaps %xmm1, %xmm3 + mulsd ALPHA_R, %xmm1 + mulsd ALPHA_I, %xmm3 + mulsd ALPHA_I, %xmm2 + + addsd %xmm0, %xmm8 + ADD2 %xmm1, %xmm9 + ADD1 %xmm3, %xmm8 + addsd %xmm2, %xmm9 + + movsd %xmm8, 0 * SIZE(Y) + movsd %xmm9, 1 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L20: + movq Y, YY + + movq M, %rax + sarq $2, %rax + jle .L25 + + movsd 0 * SIZE(X), %xmm0 + movsd 1 * SIZE(X), %xmm1 + addq INCX, X + movsd 0 * SIZE(Y), %xmm8 + movsd 1 * SIZE(Y), %xmm9 + addq INCY, Y + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + addq INCX, X + movsd 0 * SIZE(Y), %xmm10 + movsd 1 * SIZE(Y), %xmm11 + addq INCY, Y + + movaps %xmm0, %xmm2 + mulsd ALPHA_R, %xmm0 + movaps %xmm1, %xmm3 + mulsd ALPHA_R, %xmm1 + mulsd ALPHA_I, %xmm3 + mulsd ALPHA_I, %xmm2 + + movaps %xmm4, %xmm6 + mulsd ALPHA_R, %xmm4 + addsd %xmm0, %xmm8 + movsd 0 * SIZE(X), %xmm0 + + movaps %xmm5, %xmm7 + mulsd ALPHA_R, %xmm5 + ADD2 %xmm1, %xmm9 + movsd 1 * SIZE(X), %xmm1 + addq INCX, X + + decq %rax + jle .L22 + ALIGN_3 + +.L21: + mulsd ALPHA_I, %xmm7 + movsd 0 * SIZE(Y), %xmm12 + ADD1 %xmm3, %xmm8 + + mulsd ALPHA_I, %xmm6 + movsd 1 * SIZE(Y), %xmm13 + addsd %xmm2, %xmm9 + addq INCY, Y + + addsd %xmm4, %xmm10 + movsd 0 * SIZE(X), %xmm4 + movaps %xmm0, %xmm2 + mulsd ALPHA_R, %xmm0 + + ADD2 %xmm5, %xmm11 + movsd 1 * SIZE(X), %xmm5 + movaps %xmm1, %xmm3 + addq INCX, 
X + mulsd ALPHA_R, %xmm1 + + ADD1 %xmm7, %xmm10 + movsd %xmm8, 0 * SIZE(YY) + mulsd ALPHA_I, %xmm3 + + addsd %xmm6, %xmm11 + movsd %xmm9, 1 * SIZE(YY) + mulsd ALPHA_I, %xmm2 + addq INCY, YY + + movaps %xmm4, %xmm6 + movsd %xmm10, 0 * SIZE(YY) + mulsd ALPHA_R, %xmm4 + movsd 0 * SIZE(Y), %xmm10 + addsd %xmm0, %xmm12 + movsd 0 * SIZE(X), %xmm0 + + movaps %xmm5, %xmm7 + movsd %xmm11, 1 * SIZE(YY) + addq INCY, YY + mulsd ALPHA_R, %xmm5 + movsd 1 * SIZE(Y), %xmm11 + addq INCY, Y + ADD2 %xmm1, %xmm13 + movsd 1 * SIZE(X), %xmm1 + addq INCX, X + + mulsd ALPHA_I, %xmm7 + movsd 0 * SIZE(Y), %xmm8 + ADD1 %xmm3, %xmm12 + + mulsd ALPHA_I, %xmm6 + movsd 1 * SIZE(Y), %xmm9 + addsd %xmm2, %xmm13 + addq INCY, Y + + movaps %xmm0, %xmm2 + mulsd ALPHA_R, %xmm0 + addsd %xmm4, %xmm10 + movsd 0 * SIZE(X), %xmm4 + + movaps %xmm1, %xmm3 + mulsd ALPHA_R, %xmm1 + ADD2 %xmm5, %xmm11 + movsd 1 * SIZE(X), %xmm5 + addq INCX, X + + mulsd ALPHA_I, %xmm3 + movsd %xmm12, 0 * SIZE(YY) + ADD1 %xmm7, %xmm10 + + mulsd ALPHA_I, %xmm2 + movsd %xmm13, 1 * SIZE(YY) + addsd %xmm6, %xmm11 + addq INCY, YY + + movaps %xmm4, %xmm6 + movsd %xmm10, 0 * SIZE(YY) + mulsd ALPHA_R, %xmm4 + addsd %xmm0, %xmm8 + movsd 0 * SIZE(Y), %xmm10 + movsd 0 * SIZE(X), %xmm0 + + movaps %xmm5, %xmm7 + movsd %xmm11, 1 * SIZE(YY) + addq INCY, YY + mulsd ALPHA_R, %xmm5 + movsd 1 * SIZE(Y), %xmm11 + addq INCY, Y + ADD2 %xmm1, %xmm9 + movsd 1 * SIZE(X), %xmm1 + addq INCX, X + + decq %rax + jg .L21 + ALIGN_3 + +.L22: + mulsd ALPHA_I, %xmm7 + movsd 0 * SIZE(Y), %xmm12 + ADD1 %xmm3, %xmm8 + + mulsd ALPHA_I, %xmm6 + movsd 1 * SIZE(Y), %xmm13 + addsd %xmm2, %xmm9 + addq INCY, Y + + addsd %xmm4, %xmm10 + movsd 0 * SIZE(X), %xmm4 + movaps %xmm0, %xmm2 + mulsd ALPHA_R, %xmm0 + + ADD2 %xmm5, %xmm11 + movsd 1 * SIZE(X), %xmm5 + movaps %xmm1, %xmm3 + addq INCX, X + mulsd ALPHA_R, %xmm1 + + ADD1 %xmm7, %xmm10 + movsd %xmm8, 0 * SIZE(YY) + mulsd ALPHA_I, %xmm3 + + addsd %xmm6, %xmm11 + movsd %xmm9, 1 * SIZE(YY) + mulsd ALPHA_I, %xmm2 + addq INCY, YY + + movaps %xmm4, %xmm6 + movsd %xmm10, 0 * SIZE(YY) + mulsd ALPHA_R, %xmm4 + movsd 0 * SIZE(Y), %xmm10 + addsd %xmm0, %xmm12 + + movaps %xmm5, %xmm7 + movsd %xmm11, 1 * SIZE(YY) + mulsd ALPHA_R, %xmm5 + addq INCY, YY + ADD2 %xmm1, %xmm13 + movsd 1 * SIZE(Y), %xmm11 + + mulsd ALPHA_I, %xmm7 + addq INCY, Y + ADD1 %xmm3, %xmm12 + + mulsd ALPHA_I, %xmm6 + addsd %xmm2, %xmm13 + + movaps %xmm0, %xmm2 + mulsd ALPHA_R, %xmm0 + addsd %xmm4, %xmm10 + + movaps %xmm1, %xmm3 + mulsd ALPHA_R, %xmm1 + ADD2 %xmm5, %xmm11 + + mulsd ALPHA_I, %xmm3 + ADD1 %xmm7, %xmm10 + + addsd %xmm6, %xmm11 + mulsd ALPHA_I, %xmm2 + + movsd %xmm12, 0 * SIZE(YY) + movsd %xmm13, 1 * SIZE(YY) + addq INCY, YY + movsd %xmm10, 0 * SIZE(YY) + movsd %xmm11, 1 * SIZE(YY) + addq INCY, YY + ALIGN_3 + +.L25: + movq M, %rax + andq $2, %rax + jle .L27 + + movsd 0 * SIZE(X), %xmm0 + movsd 1 * SIZE(X), %xmm1 + addq INCX, X + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + addq INCX, X + + movaps %xmm0, %xmm2 + movsd 0 * SIZE(Y), %xmm8 + mulsd ALPHA_R, %xmm0 + movaps %xmm1, %xmm3 + movsd 1 * SIZE(Y), %xmm9 + addq INCY, Y + mulsd ALPHA_R, %xmm1 + movsd 0 * SIZE(Y), %xmm10 + mulsd ALPHA_I, %xmm3 + movsd 1 * SIZE(Y), %xmm11 + mulsd ALPHA_I, %xmm2 + addq INCY, Y + + movaps %xmm4, %xmm6 + mulsd ALPHA_R, %xmm4 + addsd %xmm0, %xmm8 + + movaps %xmm5, %xmm7 + mulsd ALPHA_R, %xmm5 + ADD2 %xmm1, %xmm9 + + mulsd ALPHA_I, %xmm7 + ADD1 %xmm3, %xmm8 + + mulsd ALPHA_I, %xmm6 + addsd %xmm2, %xmm9 + + addsd %xmm4, %xmm10 + movsd %xmm8, 0 * SIZE(YY) + ADD2 %xmm5, %xmm11 + movsd %xmm9, 1 * 
SIZE(YY) + ADD1 %xmm7, %xmm10 + addq INCY, YY + addsd %xmm6, %xmm11 + + movsd %xmm10, 0 * SIZE(YY) + movsd %xmm11, 1 * SIZE(YY) + addq INCY, YY + ALIGN_3 + +.L27: + movq M, %rax + andq $1, %rax + jle .L999 + + movsd 0 * SIZE(X), %xmm0 + movsd 1 * SIZE(X), %xmm1 + movsd 0 * SIZE(Y), %xmm8 + movsd 1 * SIZE(Y), %xmm9 + + movaps %xmm0, %xmm2 + mulsd ALPHA_R, %xmm0 + movaps %xmm1, %xmm3 + mulsd ALPHA_R, %xmm1 + mulsd ALPHA_I, %xmm3 + mulsd ALPHA_I, %xmm2 + + addsd %xmm0, %xmm8 + ADD2 %xmm1, %xmm9 + ADD1 %xmm3, %xmm8 + addsd %xmm2, %xmm9 + + movsd %xmm8, 0 * SIZE(YY) + movsd %xmm9, 1 * SIZE(YY) + ALIGN_3 + +.L999: + xorq %rax, %rax + + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/zaxpy_sse.S b/kernel/x86_64/zaxpy_sse.S new file mode 100644 index 0000000000..69cdedaaa6 --- /dev/null +++ b/kernel/x86_64/zaxpy_sse.S @@ -0,0 +1,3118 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef WINDOWS_ABI +#define M ARG1 +#define X ARG4 +#define INCX ARG5 +#define Y ARG6 +#define INCY ARG2 +#else +#define M ARG1 +#define X ARG2 +#define INCX ARG3 +#define Y ARG4 +#define INCY %r10 +#endif + +#define YY %r11 + +#define ALPHA_R %xmm14 +#define ALPHA_I %xmm15 + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifndef WINDOWS_ABI + movq 8(%rsp), INCY +#else + movaps %xmm3, %xmm0 + movss 40(%rsp), %xmm1 + + movq 48(%rsp), X + movq 56(%rsp), INCX + movq 64(%rsp), Y + movq 72(%rsp), INCY +#endif + + SAVEREGISTERS + + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + + testq M, M + jle .L999 + + cmpq $2 * SIZE, INCX + jne .L100 + cmpq $2 * SIZE, INCY + jne .L100 + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + + pshufd $0, %xmm0, ALPHA_R + pshufd $0, %xmm1, ALPHA_I + +#ifndef CONJ + shufps $0xb1, %xmm7, %xmm7 + xorpd %xmm7, ALPHA_I +#else + xorpd %xmm7, ALPHA_R +#endif + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + + testq $2 * SIZE, Y + je .L10 + + movsd -32 * SIZE(X), %xmm0 + movsd -32 * SIZE(Y), %xmm1 + + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + + addps %xmm8, %xmm0 + addps %xmm1, %xmm0 + + movlps %xmm0, -32 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + decq M + jle .L999 + ALIGN_2 + +.L10: + testq $SIZE, Y + jne .L50 + + testq $3 * SIZE, X + jne .L20 + + movq M, %rax + sarq $4, %rax + jle .L15 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + movaps -20 * SIZE(X), %xmm3 + + decq %rax + jle .L12 + ALIGN_3 + +.L11: + movaps -16 * SIZE(X), %xmm4 + movaps -12 * SIZE(X), %xmm5 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps -32 * SIZE(Y), %xmm0 + addps %xmm8, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps -28 * SIZE(Y), %xmm1 + addps %xmm8, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -8 * SIZE(X), %xmm6 + movaps -4 * SIZE(X), %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps -24 * SIZE(Y), %xmm2 + addps %xmm8, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + pshufd $0xb1, %xmm3, %xmm8 + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm8 + addps -20 * SIZE(Y), %xmm3 + addps %xmm8, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movaps 0 * SIZE(X), %xmm0 + movaps 4 * SIZE(X), %xmm1 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + pshufd $0xb1, %xmm4, %xmm8 + mulps ALPHA_R, %xmm4 + mulps ALPHA_I, %xmm8 + addps -16 * SIZE(Y), %xmm4 + addps %xmm8, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + pshufd $0xb1, %xmm5, %xmm8 + mulps ALPHA_R, %xmm5 + mulps ALPHA_I, %xmm8 + addps -12 * SIZE(Y), %xmm5 + addps %xmm8, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + movaps 8 * SIZE(X), %xmm2 + movaps 12 * SIZE(X), %xmm3 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + pshufd $0xb1, %xmm6, %xmm8 + mulps ALPHA_R, %xmm6 + mulps ALPHA_I, %xmm8 + addps -8 * SIZE(Y), %xmm6 + addps %xmm8, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + pshufd $0xb1, %xmm7, %xmm8 + mulps ALPHA_R, %xmm7 + mulps ALPHA_I, %xmm8 + addps -4 * SIZE(Y), %xmm7 + addps %xmm8, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + decq 
%rax + jg .L11 + ALIGN_3 + +.L12: + movaps -16 * SIZE(X), %xmm4 + movaps -12 * SIZE(X), %xmm5 + + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps -32 * SIZE(Y), %xmm0 + addps %xmm8, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps -28 * SIZE(Y), %xmm1 + addps %xmm8, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -8 * SIZE(X), %xmm6 + movaps -4 * SIZE(X), %xmm7 + + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps -24 * SIZE(Y), %xmm2 + addps %xmm8, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + pshufd $0xb1, %xmm3, %xmm8 + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm8 + addps -20 * SIZE(Y), %xmm3 + addps %xmm8, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + pshufd $0xb1, %xmm4, %xmm8 + mulps ALPHA_R, %xmm4 + mulps ALPHA_I, %xmm8 + addps -16 * SIZE(Y), %xmm4 + addps %xmm8, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + pshufd $0xb1, %xmm5, %xmm8 + mulps ALPHA_R, %xmm5 + mulps ALPHA_I, %xmm8 + addps -12 * SIZE(Y), %xmm5 + addps %xmm8, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + pshufd $0xb1, %xmm6, %xmm8 + mulps ALPHA_R, %xmm6 + mulps ALPHA_I, %xmm8 + addps -8 * SIZE(Y), %xmm6 + addps %xmm8, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + pshufd $0xb1, %xmm7, %xmm8 + mulps ALPHA_R, %xmm7 + mulps ALPHA_I, %xmm8 + addps -4 * SIZE(Y), %xmm7 + addps %xmm8, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L15: + testq $8, M + jle .L16 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps -32 * SIZE(Y), %xmm0 + addps %xmm8, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps -28 * SIZE(Y), %xmm1 + addps %xmm8, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -24 * SIZE(X), %xmm2 + movaps -20 * SIZE(X), %xmm3 + + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps -24 * SIZE(Y), %xmm2 + addps %xmm8, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + pshufd $0xb1, %xmm3, %xmm8 + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm8 + addps -20 * SIZE(Y), %xmm3 + addps %xmm8, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_2 + +.L16: + testq $4, M + jle .L17 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps -32 * SIZE(Y), %xmm0 + addps %xmm8, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps -28 * SIZE(Y), %xmm1 + addps %xmm8, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_2 + +.L17: + testq $2, M + jle .L18 + + movaps -32 * SIZE(X), %xmm0 + + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + + addps -32 * SIZE(Y), %xmm0 + addps %xmm8, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_2 + +.L18: + testq $1, M + jle .L999 + + movsd -32 * SIZE(X), %xmm0 + + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + + movsd -32 * SIZE(Y), %xmm1 + addps %xmm1, %xmm0 + addps %xmm8, %xmm0 + movlps %xmm0, -32 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L20: +#ifdef ALIGNED_ACCESS + + testq $2 * SIZE, X + jne .L30 + + subq $1 * SIZE, X + + movaps -32 * SIZE(X), %xmm0 + + movq M, %rax + sarq $4, %rax + jle .L25 + + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + movaps -20 * 
SIZE(X), %xmm3 + movaps -16 * SIZE(X), %xmm4 + + decq %rax + jle .L22 + ALIGN_3 + +.L21: + movaps -12 * SIZE(X), %xmm5 + movaps -8 * SIZE(X), %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps -32 * SIZE(Y), %xmm0 + addps %xmm8, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps -28 * SIZE(Y), %xmm1 + addps %xmm8, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -4 * SIZE(X), %xmm7 + movaps 0 * SIZE(X), %xmm0 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm3, %xmm2 + SHUFPS_39 %xmm2, %xmm2 + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps -24 * SIZE(Y), %xmm2 + addps %xmm8, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm4, %xmm3 + SHUFPS_39 %xmm3, %xmm3 + pshufd $0xb1, %xmm3, %xmm8 + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm8 + addps -20 * SIZE(Y), %xmm3 + addps %xmm8, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movaps 4 * SIZE(X), %xmm1 + movaps 8 * SIZE(X), %xmm2 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm5, %xmm4 + SHUFPS_39 %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm8 + mulps ALPHA_R, %xmm4 + mulps ALPHA_I, %xmm8 + addps -16 * SIZE(Y), %xmm4 + addps %xmm8, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + movss %xmm6, %xmm5 + SHUFPS_39 %xmm5, %xmm5 + pshufd $0xb1, %xmm5, %xmm8 + mulps ALPHA_R, %xmm5 + mulps ALPHA_I, %xmm8 + addps -12 * SIZE(Y), %xmm5 + addps %xmm8, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + movaps 12 * SIZE(X), %xmm3 + movaps 16 * SIZE(X), %xmm4 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm7, %xmm6 + SHUFPS_39 %xmm6, %xmm6 + pshufd $0xb1, %xmm6, %xmm8 + mulps ALPHA_R, %xmm6 + mulps ALPHA_I, %xmm8 + addps -8 * SIZE(Y), %xmm6 + addps %xmm8, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + movss %xmm0, %xmm7 + SHUFPS_39 %xmm7, %xmm7 + pshufd $0xb1, %xmm7, %xmm8 + mulps ALPHA_R, %xmm7 + mulps ALPHA_I, %xmm8 + addps -4 * SIZE(Y), %xmm7 + addps %xmm8, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + decq %rax + jg .L21 + ALIGN_3 + +.L22: + movaps -12 * SIZE(X), %xmm5 + movaps -8 * SIZE(X), %xmm6 + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps -32 * SIZE(Y), %xmm0 + addps %xmm8, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps -28 * SIZE(Y), %xmm1 + addps %xmm8, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -4 * SIZE(X), %xmm7 + movaps 0 * SIZE(X), %xmm0 + + movss %xmm3, %xmm2 + SHUFPS_39 %xmm2, %xmm2 + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps -24 * SIZE(Y), %xmm2 + addps %xmm8, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm4, %xmm3 + SHUFPS_39 %xmm3, %xmm3 + pshufd $0xb1, %xmm3, %xmm8 + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm8 + addps -20 * SIZE(Y), %xmm3 + addps %xmm8, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movss %xmm5, %xmm4 + SHUFPS_39 %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm8 + mulps ALPHA_R, %xmm4 + mulps ALPHA_I, %xmm8 + addps -16 * SIZE(Y), %xmm4 + addps %xmm8, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + movss %xmm6, %xmm5 + SHUFPS_39 
%xmm5, %xmm5 + pshufd $0xb1, %xmm5, %xmm8 + mulps ALPHA_R, %xmm5 + mulps ALPHA_I, %xmm8 + addps -12 * SIZE(Y), %xmm5 + addps %xmm8, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + movss %xmm7, %xmm6 + SHUFPS_39 %xmm6, %xmm6 + pshufd $0xb1, %xmm6, %xmm8 + mulps ALPHA_R, %xmm6 + mulps ALPHA_I, %xmm8 + addps -8 * SIZE(Y), %xmm6 + addps %xmm8, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + movss %xmm0, %xmm7 + SHUFPS_39 %xmm7, %xmm7 + pshufd $0xb1, %xmm7, %xmm8 + mulps ALPHA_R, %xmm7 + mulps ALPHA_I, %xmm8 + addps -4 * SIZE(Y), %xmm7 + addps %xmm8, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L25: + testq $8, M + jle .L26 + + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps -32 * SIZE(Y), %xmm0 + addps %xmm8, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps -28 * SIZE(Y), %xmm1 + addps %xmm8, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -20 * SIZE(X), %xmm3 + movaps -16 * SIZE(X), %xmm0 + + movss %xmm3, %xmm2 + SHUFPS_39 %xmm2, %xmm2 + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps -24 * SIZE(Y), %xmm2 + addps %xmm8, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm0, %xmm3 + SHUFPS_39 %xmm3, %xmm3 + pshufd $0xb1, %xmm3, %xmm8 + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm8 + addps -20 * SIZE(Y), %xmm3 + addps %xmm8, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_2 + +.L26: + testq $4, M + jle .L27 + + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps -32 * SIZE(Y), %xmm0 + addps %xmm8, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps -28 * SIZE(Y), %xmm1 + addps %xmm8, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps %xmm2, %xmm0 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_2 + +.L27: + testq $2, M + jle .L28 + + movaps -28 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + + addps -32 * SIZE(Y), %xmm0 + addps %xmm8, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movaps %xmm1, %xmm0 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_2 + +.L28: + testq $1, M + jle .L999 + + pshufd $0x06, %xmm0, %xmm8 + pshufd $0x09, %xmm0, %xmm0 + + mulps ALPHA_I, %xmm8 + mulps ALPHA_R, %xmm0 + + addps -32 * SIZE(Y), %xmm8 + addps %xmm8, %xmm0 + + movlps %xmm0, -32 * SIZE(Y) + + jmp .L999 + ALIGN_3 + +.L30: + testq $1 * SIZE, X + jne .L40 +#endif + + movq M, %rax + sarq $4, %rax + jle .L35 + + movsd -32 * SIZE(X), %xmm0 + movhps -30 * SIZE(X), %xmm0 + movsd -28 * SIZE(X), %xmm1 + movhps -26 * SIZE(X), %xmm1 + movsd -24 * SIZE(X), %xmm2 + movhps -22 * SIZE(X), %xmm2 + movsd -20 * SIZE(X), %xmm3 + movhps -18 * SIZE(X), %xmm3 + + decq %rax + jle .L32 + ALIGN_3 + +.L31: + movsd -16 * SIZE(X), %xmm4 + movhps -14 * SIZE(X), %xmm4 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps -32 * SIZE(Y), %xmm0 + addps %xmm8, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movsd -12 * SIZE(X), %xmm5 + movhps -10 * SIZE(X), %xmm5 + 
+ pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps -28 * SIZE(Y), %xmm1 + addps %xmm8, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movsd -8 * SIZE(X), %xmm6 + movhps -6 * SIZE(X), %xmm6 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps -24 * SIZE(Y), %xmm2 + addps %xmm8, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movsd -4 * SIZE(X), %xmm7 + movhps -2 * SIZE(X), %xmm7 + + pshufd $0xb1, %xmm3, %xmm8 + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm8 + addps -20 * SIZE(Y), %xmm3 + addps %xmm8, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movsd 0 * SIZE(X), %xmm0 + movhps 2 * SIZE(X), %xmm0 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + pshufd $0xb1, %xmm4, %xmm8 + mulps ALPHA_R, %xmm4 + mulps ALPHA_I, %xmm8 + addps -16 * SIZE(Y), %xmm4 + addps %xmm8, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + movsd 4 * SIZE(X), %xmm1 + movhps 6 * SIZE(X), %xmm1 + + pshufd $0xb1, %xmm5, %xmm8 + mulps ALPHA_R, %xmm5 + mulps ALPHA_I, %xmm8 + addps -12 * SIZE(Y), %xmm5 + addps %xmm8, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + movsd 8 * SIZE(X), %xmm2 + movhps 10 * SIZE(X), %xmm2 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + pshufd $0xb1, %xmm6, %xmm8 + mulps ALPHA_R, %xmm6 + mulps ALPHA_I, %xmm8 + addps -8 * SIZE(Y), %xmm6 + addps %xmm8, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + movsd 12 * SIZE(X), %xmm3 + movhps 14 * SIZE(X), %xmm3 + + pshufd $0xb1, %xmm7, %xmm8 + mulps ALPHA_R, %xmm7 + mulps ALPHA_I, %xmm8 + addps -4 * SIZE(Y), %xmm7 + addps %xmm8, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + decq %rax + jg .L31 + ALIGN_3 + +.L32: + movsd -16 * SIZE(X), %xmm4 + movhps -14 * SIZE(X), %xmm4 + + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps -32 * SIZE(Y), %xmm0 + addps %xmm8, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movsd -12 * SIZE(X), %xmm5 + movhps -10 * SIZE(X), %xmm5 + + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps -28 * SIZE(Y), %xmm1 + addps %xmm8, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movsd -8 * SIZE(X), %xmm6 + movhps -6 * SIZE(X), %xmm6 + + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps -24 * SIZE(Y), %xmm2 + addps %xmm8, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movsd -4 * SIZE(X), %xmm7 + movhps -2 * SIZE(X), %xmm7 + + pshufd $0xb1, %xmm3, %xmm8 + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm8 + addps -20 * SIZE(Y), %xmm3 + addps %xmm8, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + pshufd $0xb1, %xmm4, %xmm8 + mulps ALPHA_R, %xmm4 + mulps ALPHA_I, %xmm8 + addps -16 * SIZE(Y), %xmm4 + addps %xmm8, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + pshufd $0xb1, %xmm5, %xmm8 + mulps ALPHA_R, %xmm5 + mulps ALPHA_I, %xmm8 + addps -12 * SIZE(Y), %xmm5 + addps %xmm8, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + pshufd $0xb1, %xmm6, %xmm8 + mulps ALPHA_R, %xmm6 + mulps ALPHA_I, %xmm8 + addps -8 * SIZE(Y), %xmm6 + addps %xmm8, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + pshufd $0xb1, %xmm7, %xmm8 + mulps ALPHA_R, %xmm7 + mulps ALPHA_I, %xmm8 + addps -4 * SIZE(Y), %xmm7 + addps %xmm8, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L35: + testq $8, M + jle .L36 + + movsd -32 * SIZE(X), %xmm0 + movhps -30 * SIZE(X), %xmm0 + movsd -28 * SIZE(X), %xmm1 + movhps -26 * SIZE(X), %xmm1 + + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, 
%xmm0 + mulps ALPHA_I, %xmm8 + addps -32 * SIZE(Y), %xmm0 + addps %xmm8, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps -28 * SIZE(Y), %xmm1 + addps %xmm8, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movsd -24 * SIZE(X), %xmm2 + movhps -22 * SIZE(X), %xmm2 + movsd -20 * SIZE(X), %xmm3 + movhps -18 * SIZE(X), %xmm3 + + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps -24 * SIZE(Y), %xmm2 + addps %xmm8, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + pshufd $0xb1, %xmm3, %xmm8 + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm8 + addps -20 * SIZE(Y), %xmm3 + addps %xmm8, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_2 + +.L36: + testq $4, M + jle .L37 + + movsd -32 * SIZE(X), %xmm0 + movhps -30 * SIZE(X), %xmm0 + movsd -28 * SIZE(X), %xmm1 + movhps -26 * SIZE(X), %xmm1 + + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps -32 * SIZE(Y), %xmm0 + addps %xmm8, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps -28 * SIZE(Y), %xmm1 + addps %xmm8, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_2 + +.L37: + testq $2, M + jle .L38 + + movsd -32 * SIZE(X), %xmm0 + movhps -30 * SIZE(X), %xmm0 + + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + + addps -32 * SIZE(Y), %xmm0 + addps %xmm8, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_2 + +.L38: + testq $1, M + jle .L999 + + movsd -32 * SIZE(X), %xmm0 + + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + + movsd -32 * SIZE(Y), %xmm1 + + addps %xmm1, %xmm0 + addps %xmm8, %xmm0 + movlps %xmm0, -32 * SIZE(Y) + jmp .L999 + ALIGN_3 + +#ifdef ALIGNED_ACCESS + +.L40: + subq $3 * SIZE, X + + movaps -32 * SIZE(X), %xmm0 + + movq M, %rax + sarq $4, %rax + jle .L45 + + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + movaps -20 * SIZE(X), %xmm3 + movaps -16 * SIZE(X), %xmm4 + + decq %rax + jle .L42 + ALIGN_3 + +.L41: + movaps -12 * SIZE(X), %xmm5 + movaps -8 * SIZE(X), %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps -32 * SIZE(Y), %xmm0 + addps %xmm8, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps -28 * SIZE(Y), %xmm1 + addps %xmm8, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -4 * SIZE(X), %xmm7 + movaps 0 * SIZE(X), %xmm0 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps -24 * SIZE(Y), %xmm2 + addps %xmm8, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + pshufd $0xb1, %xmm3, %xmm8 + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm8 + addps -20 * SIZE(Y), %xmm3 + addps %xmm8, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movaps 4 * SIZE(X), %xmm1 + movaps 8 * SIZE(X), %xmm2 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + pshufd $0xb1, %xmm4, %xmm8 + mulps ALPHA_R, %xmm4 + mulps ALPHA_I, %xmm8 + addps -16 * SIZE(Y), %xmm4 
+ addps %xmm8, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + pshufd $0xb1, %xmm5, %xmm8 + mulps ALPHA_R, %xmm5 + mulps ALPHA_I, %xmm8 + addps -12 * SIZE(Y), %xmm5 + addps %xmm8, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + movaps 12 * SIZE(X), %xmm3 + movaps 16 * SIZE(X), %xmm4 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + pshufd $0xb1, %xmm6, %xmm8 + mulps ALPHA_R, %xmm6 + mulps ALPHA_I, %xmm8 + addps -8 * SIZE(Y), %xmm6 + addps %xmm8, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + movss %xmm0, %xmm7 + shufps $0x93, %xmm0, %xmm7 + pshufd $0xb1, %xmm7, %xmm8 + mulps ALPHA_R, %xmm7 + mulps ALPHA_I, %xmm8 + addps -4 * SIZE(Y), %xmm7 + addps %xmm8, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + decq %rax + jg .L41 + ALIGN_3 + +.L42: + movaps -12 * SIZE(X), %xmm5 + movaps -8 * SIZE(X), %xmm6 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps -32 * SIZE(Y), %xmm0 + addps %xmm8, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps -28 * SIZE(Y), %xmm1 + addps %xmm8, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -4 * SIZE(X), %xmm7 + movaps 0 * SIZE(X), %xmm0 + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps -24 * SIZE(Y), %xmm2 + addps %xmm8, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + pshufd $0xb1, %xmm3, %xmm8 + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm8 + addps -20 * SIZE(Y), %xmm3 + addps %xmm8, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + pshufd $0xb1, %xmm4, %xmm8 + mulps ALPHA_R, %xmm4 + mulps ALPHA_I, %xmm8 + addps -16 * SIZE(Y), %xmm4 + addps %xmm8, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + pshufd $0xb1, %xmm5, %xmm8 + mulps ALPHA_R, %xmm5 + mulps ALPHA_I, %xmm8 + addps -12 * SIZE(Y), %xmm5 + addps %xmm8, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + pshufd $0xb1, %xmm6, %xmm8 + mulps ALPHA_R, %xmm6 + mulps ALPHA_I, %xmm8 + addps -8 * SIZE(Y), %xmm6 + addps %xmm8, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + movss %xmm0, %xmm7 + shufps $0x93, %xmm0, %xmm7 + pshufd $0xb1, %xmm7, %xmm8 + mulps ALPHA_R, %xmm7 + mulps ALPHA_I, %xmm8 + addps -4 * SIZE(Y), %xmm7 + addps %xmm8, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L45: + testq $8, M + jle .L46 + + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps -32 * SIZE(Y), %xmm0 + addps %xmm8, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps -28 * SIZE(Y), %xmm1 + addps %xmm8, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -20 * SIZE(X), %xmm3 + movaps -16 * SIZE(X), %xmm0 + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps -24 * SIZE(Y), %xmm2 + addps %xmm8, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm0, %xmm3 + shufps 
$0x93, %xmm0, %xmm3 + pshufd $0xb1, %xmm3, %xmm8 + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm8 + addps -20 * SIZE(Y), %xmm3 + addps %xmm8, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_2 + +.L46: + testq $4, M + jle .L47 + + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps -32 * SIZE(Y), %xmm0 + addps %xmm8, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps -28 * SIZE(Y), %xmm1 + addps %xmm8, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps %xmm2, %xmm0 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_2 + +.L47: + testq $2, M + jle .L48 + + movaps -28 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + + addps -32 * SIZE(Y), %xmm0 + addps %xmm8, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movaps %xmm1, %xmm0 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_2 + +.L48: + testq $1, M + jle .L999 + + movaps -28 * SIZE(X), %xmm1 + movsd -32 * SIZE(Y), %xmm2 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + + addps %xmm8, %xmm0 + addps %xmm2, %xmm0 + movlps %xmm0, -32 * SIZE(Y) + + jmp .L999 + ALIGN_3 +#endif + +.L50: + xorps %xmm0, %xmm0 + + subq $1 * SIZE, Y + + testq $3 * SIZE, X + jne .L60 + + movq M, %rax + sarq $4, %rax + jle .L55 + + movaps -32 * SIZE(X), %xmm1 + movaps -28 * SIZE(X), %xmm2 + movaps -24 * SIZE(X), %xmm3 + movaps -20 * SIZE(X), %xmm4 + + decq %rax + jle .L52 + ALIGN_3 + +.L51: + movaps -16 * SIZE(X), %xmm5 + movaps -12 * SIZE(X), %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -8 * SIZE(X), %xmm7 + movaps -4 * SIZE(X), %xmm0 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + pshufd $0xb1, %xmm3, %xmm8 + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + pshufd $0xb1, %xmm4, %xmm8 + mulps ALPHA_R, %xmm4 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm4 + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movaps 0 * SIZE(X), %xmm1 + movaps 4 * SIZE(X), %xmm2 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + pshufd $0xb1, %xmm5, %xmm8 + mulps ALPHA_R, %xmm5 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm5 + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + addps -16 * SIZE(Y), %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + pshufd $0xb1, %xmm6, %xmm8 + mulps ALPHA_R, %xmm6 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm6 + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + addps -12 * SIZE(Y), %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + movaps 8 * SIZE(X), %xmm3 + movaps 12 * SIZE(X), %xmm4 + +#if defined(PREFETCH) && 
!defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + pshufd $0xb1, %xmm7, %xmm8 + mulps ALPHA_R, %xmm7 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm7 + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + addps -8 * SIZE(Y), %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm0 + movss %xmm0, %xmm7 + shufps $0x93, %xmm0, %xmm7 + addps -4 * SIZE(Y), %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + decq %rax + jg .L51 + ALIGN_3 + +.L52: + movaps -16 * SIZE(X), %xmm5 + movaps -12 * SIZE(X), %xmm6 + + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -8 * SIZE(X), %xmm7 + movaps -4 * SIZE(X), %xmm0 + + pshufd $0xb1, %xmm3, %xmm8 + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + pshufd $0xb1, %xmm4, %xmm8 + mulps ALPHA_R, %xmm4 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm4 + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + pshufd $0xb1, %xmm5, %xmm8 + mulps ALPHA_R, %xmm5 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm5 + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + addps -16 * SIZE(Y), %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + pshufd $0xb1, %xmm6, %xmm8 + mulps ALPHA_R, %xmm6 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm6 + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + addps -12 * SIZE(Y), %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + pshufd $0xb1, %xmm7, %xmm8 + mulps ALPHA_R, %xmm7 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm7 + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + addps -8 * SIZE(Y), %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm0 + movss %xmm0, %xmm7 + shufps $0x93, %xmm0, %xmm7 + addps -4 * SIZE(Y), %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L55: + testq $8, M + jle .L56 + + movaps -32 * SIZE(X), %xmm1 + movaps -28 * SIZE(X), %xmm2 + + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -24 * SIZE(X), %xmm3 + movaps -20 * SIZE(X), %xmm0 + + pshufd $0xb1, %xmm3, %xmm8 + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm0 + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_2 + +.L56: + testq $4, M + jle .L57 + + movaps -32 * SIZE(X), %xmm1 + movaps -28 * SIZE(X), %xmm2 + + pshufd $0xb1, 
%xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps %xmm2, %xmm0 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_2 + +.L57: + testq $2, M + jle .L58 + + movaps -32 * SIZE(X), %xmm1 + + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, %xmm0 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_2 + +.L58: + testq $1, M + jle .L59 + +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(X), %xmm1 + + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L59: + shufps $0x93, %xmm0, %xmm0 + + addss -32 * SIZE(Y), %xmm0 + movss %xmm0, -32 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L60: +#ifdef ALIGNED_ACCESS + + testq $2 * SIZE, X + jne .L70 + + subq $1 * SIZE, X + + movaps -32 * SIZE(X), %xmm1 + + movq M, %rax + sarq $4, %rax + jle .L65 + + movaps -28 * SIZE(X), %xmm2 + movaps -24 * SIZE(X), %xmm3 + + decq %rax + jle .L62 + ALIGN_3 + +.L61: + movaps -20 * SIZE(X), %xmm4 + movaps -16 * SIZE(X), %xmm5 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm3, %xmm2 + SHUFPS_39 %xmm2, %xmm2 + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -12 * SIZE(X), %xmm6 + movaps -8 * SIZE(X), %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm4, %xmm3 + SHUFPS_39 %xmm3, %xmm3 + pshufd $0xb1, %xmm3, %xmm8 + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm5, %xmm4 + SHUFPS_39 %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm8 + mulps ALPHA_R, %xmm4 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm4 + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -4 * SIZE(X), %xmm0 + movaps 0 * SIZE(X), %xmm1 + + movss %xmm6, %xmm5 + SHUFPS_39 %xmm5, %xmm5 + pshufd $0xb1, %xmm5, %xmm8 + mulps ALPHA_R, %xmm5 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm5 + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + addps -16 * SIZE(Y), %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + movss %xmm7, %xmm6 + SHUFPS_39 %xmm6, %xmm6 + pshufd $0xb1, %xmm6, %xmm8 + mulps ALPHA_R, %xmm6 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm6 + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + addps -12 * SIZE(Y), %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + movaps 4 * SIZE(X), %xmm2 + movaps 8 * SIZE(X), 
%xmm3 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm0, %xmm7 + SHUFPS_39 %xmm7, %xmm7 + pshufd $0xb1, %xmm7, %xmm8 + mulps ALPHA_R, %xmm7 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm7 + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + addps -8 * SIZE(Y), %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm0 + movss %xmm0, %xmm7 + shufps $0x93, %xmm0, %xmm7 + addps -4 * SIZE(Y), %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + decq %rax + jg .L61 + ALIGN_3 + +.L62: + movaps -20 * SIZE(X), %xmm4 + movaps -16 * SIZE(X), %xmm5 + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm3, %xmm2 + SHUFPS_39 %xmm2, %xmm2 + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -12 * SIZE(X), %xmm6 + movaps -8 * SIZE(X), %xmm7 + + movss %xmm4, %xmm3 + SHUFPS_39 %xmm3, %xmm3 + pshufd $0xb1, %xmm3, %xmm8 + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm5, %xmm4 + SHUFPS_39 %xmm4, %xmm4 + pshufd $0xb1, %xmm4, %xmm8 + mulps ALPHA_R, %xmm4 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm4 + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movss %xmm6, %xmm5 + SHUFPS_39 %xmm5, %xmm5 + pshufd $0xb1, %xmm5, %xmm8 + mulps ALPHA_R, %xmm5 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm5 + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + addps -16 * SIZE(Y), %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + movaps -4 * SIZE(X), %xmm0 + movaps 0 * SIZE(X), %xmm1 + + movss %xmm7, %xmm6 + SHUFPS_39 %xmm6, %xmm6 + pshufd $0xb1, %xmm6, %xmm8 + mulps ALPHA_R, %xmm6 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm6 + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + addps -12 * SIZE(Y), %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + movss %xmm0, %xmm7 + SHUFPS_39 %xmm7, %xmm7 + pshufd $0xb1, %xmm7, %xmm8 + mulps ALPHA_R, %xmm7 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm7 + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + addps -8 * SIZE(Y), %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm0 + movss %xmm0, %xmm7 + shufps $0x93, %xmm0, %xmm7 + addps -4 * SIZE(Y), %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L65: + testq $8, M + jle .L66 + + movaps -28 * SIZE(X), %xmm2 + movaps -24 * SIZE(X), %xmm3 + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm3, %xmm2 + SHUFPS_39 %xmm2, %xmm2 + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) 
+ + movaps -20 * SIZE(X), %xmm0 + movaps -16 * SIZE(X), %xmm1 + + movss %xmm0, %xmm3 + SHUFPS_39 %xmm3, %xmm3 + pshufd $0xb1, %xmm3, %xmm8 + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm1, %xmm0 + SHUFPS_39 %xmm0, %xmm0 + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm0 + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_2 + +.L66: + testq $4, M + jle .L67 + + movaps -28 * SIZE(X), %xmm2 + movaps -24 * SIZE(X), %xmm3 + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm3, %xmm2 + SHUFPS_39 %xmm2, %xmm2 + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_2 + +.L67: + testq $2, M + jle .L68 + + movaps -28 * SIZE(X), %xmm2 + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movaps %xmm1, %xmm0 + movaps %xmm2, %xmm1 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_2 + +.L68: + testq $1, M + jle .L69 + + movaps -28 * SIZE(X), %xmm2 + + movss %xmm2, %xmm1 + SHUFPS_39 %xmm1, %xmm1 + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + + addps -32 * SIZE(Y), %xmm0 + movlps %xmm0, -32 * SIZE(Y) + movhlps %xmm0, %xmm0 + movss %xmm0, -30 * SIZE(Y) + jmp .L999 + +.L69: + shufps $0x93, %xmm0, %xmm0 + + addss -32 * SIZE(Y), %xmm0 + movss %xmm0, -32 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L70: + testq $1 * SIZE, X + jne .L80 +#endif + + movq M, %rax + sarq $4, %rax + jle .L75 + + movsd -32 * SIZE(X), %xmm1 + movhps -30 * SIZE(X), %xmm1 + movsd -28 * SIZE(X), %xmm2 + movhps -26 * SIZE(X), %xmm2 + movsd -24 * SIZE(X), %xmm3 + movhps -22 * SIZE(X), %xmm3 + movsd -20 * SIZE(X), %xmm4 + movhps -18 * SIZE(X), %xmm4 + + decq %rax + jle .L72 + ALIGN_3 + +.L71: + movsd -16 * SIZE(X), %xmm5 + movhps -14 * SIZE(X), %xmm5 + movsd -12 * SIZE(X), %xmm6 + movhps -10 * SIZE(X), %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movsd -8 * SIZE(X), %xmm7 + movhps -6 * SIZE(X), %xmm7 + movsd -4 * SIZE(X), %xmm0 + movhps -2 * SIZE(X), %xmm0 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + pshufd $0xb1, %xmm3, %xmm8 + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, 
%xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + pshufd $0xb1, %xmm4, %xmm8 + mulps ALPHA_R, %xmm4 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm4 + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movsd 0 * SIZE(X), %xmm1 + movhps 2 * SIZE(X), %xmm1 + movsd 4 * SIZE(X), %xmm2 + movhps 6 * SIZE(X), %xmm2 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + pshufd $0xb1, %xmm5, %xmm8 + mulps ALPHA_R, %xmm5 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm5 + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + addps -16 * SIZE(Y), %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + pshufd $0xb1, %xmm6, %xmm8 + mulps ALPHA_R, %xmm6 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm6 + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + addps -12 * SIZE(Y), %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + movsd 8 * SIZE(X), %xmm3 + movhps 10 * SIZE(X), %xmm3 + movsd 12 * SIZE(X), %xmm4 + movhps 14 * SIZE(X), %xmm4 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + pshufd $0xb1, %xmm7, %xmm8 + mulps ALPHA_R, %xmm7 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm7 + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + addps -8 * SIZE(Y), %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm0 + movss %xmm0, %xmm7 + shufps $0x93, %xmm0, %xmm7 + addps -4 * SIZE(Y), %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + decq %rax + jg .L71 + ALIGN_3 + +.L72: + movsd -16 * SIZE(X), %xmm5 + movhps -14 * SIZE(X), %xmm5 + movsd -12 * SIZE(X), %xmm6 + movhps -10 * SIZE(X), %xmm6 + + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movsd -8 * SIZE(X), %xmm7 + movhps -6 * SIZE(X), %xmm7 + movsd -4 * SIZE(X), %xmm0 + movhps -2 * SIZE(X), %xmm0 + + pshufd $0xb1, %xmm3, %xmm8 + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + pshufd $0xb1, %xmm4, %xmm8 + mulps ALPHA_R, %xmm4 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm4 + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + pshufd $0xb1, %xmm5, %xmm8 + mulps ALPHA_R, %xmm5 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm5 + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + addps -16 * SIZE(Y), %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + pshufd $0xb1, %xmm6, %xmm8 + mulps ALPHA_R, %xmm6 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm6 + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + addps -12 * SIZE(Y), %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + pshufd $0xb1, %xmm7, %xmm8 + mulps ALPHA_R, %xmm7 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm7 + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + addps -8 * SIZE(Y), %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm0 + movss %xmm0, %xmm7 + shufps $0x93, %xmm0, %xmm7 + addps -4 * SIZE(Y), %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + 
ALIGN_3 + +.L75: + testq $8, M + jle .L76 + + movsd -32 * SIZE(X), %xmm1 + movhps -30 * SIZE(X), %xmm1 + movsd -28 * SIZE(X), %xmm2 + movhps -26 * SIZE(X), %xmm2 + + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movsd -24 * SIZE(X), %xmm3 + movhps -22 * SIZE(X), %xmm3 + movsd -20 * SIZE(X), %xmm0 + movhps -18 * SIZE(X), %xmm0 + + pshufd $0xb1, %xmm3, %xmm8 + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm0 + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_2 + +.L76: + testq $4, M + jle .L77 + + movsd -32 * SIZE(X), %xmm1 + movhps -30 * SIZE(X), %xmm1 + + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movsd -28 * SIZE(X), %xmm2 + movhps -26 * SIZE(X), %xmm2 + + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps %xmm2, %xmm0 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_2 + +.L77: + testq $2, M + jle .L78 + + movsd -32 * SIZE(X), %xmm1 + movhps -30 * SIZE(X), %xmm1 + + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, %xmm0 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_2 + +.L78: + testq $1, M + jle .L79 + +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(X), %xmm1 + + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L79: + shufps $0x93, %xmm0, %xmm0 + + addss -32 * SIZE(Y), %xmm0 + movss %xmm0, -32 * SIZE(Y) + jmp .L999 + ALIGN_3 + +#ifdef ALIGNED_ACCESS + +.L80: + subq $3 * SIZE, X + + movaps -32 * SIZE(X), %xmm1 + + movq M, %rax + sarq $4, %rax + jle .L85 + + movaps -28 * SIZE(X), %xmm2 + movaps -24 * SIZE(X), %xmm3 + + decq %rax + jle .L82 + ALIGN_3 + +.L81: + movaps -20 * SIZE(X), %xmm4 + movaps -16 * SIZE(X), %xmm5 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -12 * 
SIZE(X), %xmm6 + movaps -8 * SIZE(X), %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + pshufd $0xb1, %xmm3, %xmm8 + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + pshufd $0xb1, %xmm4, %xmm8 + mulps ALPHA_R, %xmm4 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm4 + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -4 * SIZE(X), %xmm0 + movaps 0 * SIZE(X), %xmm1 + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + pshufd $0xb1, %xmm5, %xmm8 + mulps ALPHA_R, %xmm5 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm5 + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + addps -16 * SIZE(Y), %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + pshufd $0xb1, %xmm6, %xmm8 + mulps ALPHA_R, %xmm6 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm6 + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + addps -12 * SIZE(Y), %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + movaps 4 * SIZE(X), %xmm2 + movaps 8 * SIZE(X), %xmm3 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm0, %xmm7 + shufps $0x93, %xmm0, %xmm7 + pshufd $0xb1, %xmm7, %xmm8 + mulps ALPHA_R, %xmm7 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm7 + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + addps -8 * SIZE(Y), %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm0 + movss %xmm0, %xmm7 + shufps $0x93, %xmm0, %xmm7 + addps -4 * SIZE(Y), %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + decq %rax + jg .L81 + ALIGN_3 + +.L82: + movaps -20 * SIZE(X), %xmm4 + movaps -16 * SIZE(X), %xmm5 + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -12 * SIZE(X), %xmm6 + movaps -8 * SIZE(X), %xmm7 + + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + pshufd $0xb1, %xmm3, %xmm8 + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + pshufd $0xb1, %xmm4, %xmm8 + mulps ALPHA_R, %xmm4 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm4 + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + pshufd $0xb1, %xmm5, %xmm8 + mulps ALPHA_R, %xmm5 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm5 + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + addps -16 * SIZE(Y), %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + movaps -4 * SIZE(X), %xmm0 + movaps 0 * SIZE(X), %xmm1 + + movss %xmm7, %xmm6 + 
shufps $0x93, %xmm7, %xmm6 + pshufd $0xb1, %xmm6, %xmm8 + mulps ALPHA_R, %xmm6 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm6 + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + addps -12 * SIZE(Y), %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + movss %xmm0, %xmm7 + shufps $0x93, %xmm0, %xmm7 + pshufd $0xb1, %xmm7, %xmm8 + mulps ALPHA_R, %xmm7 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm7 + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + addps -8 * SIZE(Y), %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm0 + movss %xmm0, %xmm7 + shufps $0x93, %xmm0, %xmm7 + addps -4 * SIZE(Y), %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L85: + testq $8, M + jle .L86 + + movaps -28 * SIZE(X), %xmm2 + movaps -24 * SIZE(X), %xmm3 + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps -20 * SIZE(X), %xmm0 + movaps -16 * SIZE(X), %xmm1 + + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + pshufd $0xb1, %xmm3, %xmm8 + mulps ALPHA_R, %xmm3 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm3 + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + addps -24 * SIZE(Y), %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + pshufd $0xb1, %xmm0, %xmm8 + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm0 + movss %xmm0, %xmm3 + shufps $0x93, %xmm0, %xmm3 + addps -20 * SIZE(Y), %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_2 + +.L86: + testq $4, M + jle .L87 + + movaps -28 * SIZE(X), %xmm2 + movaps -24 * SIZE(X), %xmm3 + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + pshufd $0xb1, %xmm2, %xmm8 + mulps ALPHA_R, %xmm2 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm2 + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + addps -28 * SIZE(Y), %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_2 + +.L87: + testq $2, M + jle .L88 + + movaps -28 * SIZE(X), %xmm2 + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + addps -32 * SIZE(Y), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movaps %xmm1, %xmm0 + movaps %xmm2, %xmm1 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_2 + +.L88: + testq $1, M + jle .L89 + + movaps -28 * SIZE(X), %xmm2 + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + pshufd $0xb1, %xmm1, %xmm8 + mulps ALPHA_R, %xmm1 + mulps ALPHA_I, %xmm8 + addps %xmm8, %xmm1 + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + + addps -32 * SIZE(Y), %xmm0 + movlps %xmm0, -32 * SIZE(Y) + movhlps %xmm0, %xmm0 + movss %xmm0, -30 * SIZE(Y) + jmp 
.L999 + +.L89: + shufps $0x93, %xmm0, %xmm0 + + addss -32 * SIZE(Y), %xmm0 + movss %xmm0, -32 * SIZE(Y) + jmp .L999 + ALIGN_3 +#endif + +.L100: +#ifndef CONJ + pshufd $0, %xmm0, %xmm14 + pshufd $0, %xmm1, %xmm15 + + pxor %xmm13, %xmm13 + subps %xmm15, %xmm13 + + unpcklps %xmm14, %xmm13 + unpcklps %xmm15, %xmm14 + movaps %xmm13, %xmm15 +#else + pshufd $0, %xmm0, %xmm14 + pshufd $0, %xmm1, %xmm15 + + pxor %xmm13, %xmm13 + subps %xmm14, %xmm13 + + unpcklps %xmm15, %xmm14 + unpcklps %xmm13, %xmm15 +#endif + + movq Y, YY + + movq M, %rax + sarq $3, %rax + jle .L105 + ALIGN_3 + +.L102: + movsd (X), %xmm0 + addq INCX, X + movhps (X), %xmm0 + addq INCX, X + movsd (X), %xmm2 + addq INCX, X + movhps (X), %xmm2 + addq INCX, X + movsd (X), %xmm4 + addq INCX, X + movhps (X), %xmm4 + addq INCX, X + movsd (X), %xmm6 + addq INCX, X + movhps (X), %xmm6 + addq INCX, X + +#ifdef HAVE_SSE3 + movshdup %xmm0, %xmm1 + movsldup %xmm0, %xmm0 + movshdup %xmm2, %xmm3 + movsldup %xmm2, %xmm2 + movshdup %xmm4, %xmm5 + movsldup %xmm4, %xmm4 + movshdup %xmm6, %xmm7 + movsldup %xmm6, %xmm6 +#else + pshufd $0xf5, %xmm0, %xmm1 + shufps $0xa0, %xmm0, %xmm0 + pshufd $0xf5, %xmm2, %xmm3 + shufps $0xa0, %xmm2, %xmm2 + pshufd $0xf5, %xmm4, %xmm5 + shufps $0xa0, %xmm4, %xmm4 + pshufd $0xf5, %xmm6, %xmm7 + shufps $0xa0, %xmm6, %xmm6 +#endif + + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm1 + mulps %xmm14, %xmm2 + mulps %xmm15, %xmm3 + mulps %xmm14, %xmm4 + mulps %xmm15, %xmm5 + mulps %xmm14, %xmm6 + mulps %xmm15, %xmm7 + + movsd (Y), %xmm8 + addq INCY, Y + movhps (Y), %xmm8 + addq INCY, Y + movsd (Y), %xmm9 + addq INCY, Y + movhps (Y), %xmm9 + addq INCY, Y + movsd (Y), %xmm10 + addq INCY, Y + movhps (Y), %xmm10 + addq INCY, Y + movsd (Y), %xmm11 + addq INCY, Y + movhps (Y), %xmm11 + addq INCY, Y + + addps %xmm0, %xmm8 + addps %xmm1, %xmm8 + addps %xmm2, %xmm9 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + addps %xmm7, %xmm11 + + movsd %xmm8, (YY) + addq INCY, YY + movhps %xmm8, (YY) + addq INCY, YY + movsd %xmm9, (YY) + addq INCY, YY + movhps %xmm9, (YY) + addq INCY, YY + movsd %xmm10, (YY) + addq INCY, YY + movhps %xmm10, (YY) + addq INCY, YY + movsd %xmm11, (YY) + addq INCY, YY + movhps %xmm11, (YY) + addq INCY, YY + + decq %rax + jg .L102 + ALIGN_3 + +.L105: + testq $4, M + jle .L106 + + movsd (X), %xmm0 + addq INCX, X + movhps (X), %xmm0 + addq INCX, X + movsd (X), %xmm2 + addq INCX, X + movhps (X), %xmm2 + addq INCX, X + +#ifdef HAVE_SSE3 + movshdup %xmm0, %xmm1 + movsldup %xmm0, %xmm0 + movshdup %xmm2, %xmm3 + movsldup %xmm2, %xmm2 +#else + pshufd $0xf5, %xmm0, %xmm1 + shufps $0xa0, %xmm0, %xmm0 + pshufd $0xf5, %xmm2, %xmm3 + shufps $0xa0, %xmm2, %xmm2 +#endif + + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm1 + mulps %xmm14, %xmm2 + mulps %xmm15, %xmm3 + + movsd (Y), %xmm8 + addq INCY, Y + movhps (Y), %xmm8 + addq INCY, Y + movsd (Y), %xmm9 + addq INCY, Y + movhps (Y), %xmm9 + addq INCY, Y + + addps %xmm0, %xmm8 + addps %xmm1, %xmm8 + addps %xmm2, %xmm9 + addps %xmm3, %xmm9 + + movsd %xmm8, (YY) + addq INCY, YY + movhps %xmm8, (YY) + addq INCY, YY + movsd %xmm9, (YY) + addq INCY, YY + movhps %xmm9, (YY) + addq INCY, YY + ALIGN_3 + +.L106: + testq $2, M + jle .L107 + + movsd (X), %xmm0 + addq INCX, X + movhps (X), %xmm0 + addq INCX, X + +#ifdef HAVE_SSE3 + movshdup %xmm0, %xmm1 + movsldup %xmm0, %xmm0 +#else + pshufd $0xf5, %xmm0, %xmm1 + shufps $0xa0, %xmm0, %xmm0 +#endif + + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm1 + + movsd (Y), %xmm8 + addq INCY, Y + movhps (Y), %xmm8 + addq INCY, Y + + 
addps %xmm0, %xmm8 + addps %xmm1, %xmm8 + + movsd %xmm8, (YY) + addq INCY, YY + movhps %xmm8, (YY) + addq INCY, YY + ALIGN_3 + +.L107: + testq $1, M + jle .L999 + + movsd (X), %xmm0 + +#ifdef HAVE_SSE3 + movshdup %xmm0, %xmm1 + movsldup %xmm0, %xmm0 +#else + pshufd $0xf5, %xmm0, %xmm1 + shufps $0xa0, %xmm0, %xmm0 +#endif + + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm1 + + movsd (Y), %xmm8 + + addps %xmm0, %xmm8 + addps %xmm1, %xmm8 + + movsd %xmm8, (Y) + ALIGN_3 + +.L999: + xorq %rax, %rax + + RESTOREREGISTERS + + ret + + EPILOGUE + diff --git a/kernel/x86_64/zaxpy_sse2.S b/kernel/x86_64/zaxpy_sse2.S new file mode 100644 index 0000000000..f1616e3622 --- /dev/null +++ b/kernel/x86_64/zaxpy_sse2.S @@ -0,0 +1,1793 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef WINDOWS_ABI +#define M ARG1 +#define X ARG4 +#define INCX ARG5 +#define Y ARG6 +#define INCY ARG2 +#else +#define M ARG1 +#define X ARG2 +#define INCX ARG3 +#define Y ARG4 +#define INCY %r10 +#endif + +#define YY %r11 +#define ALPHA_R %xmm14 +#define ALPHA_I %xmm15 + +#define USE_PSHUFD + +#if defined(HAVE_SSE3) && !defined(CORE_OPTERON) +#define MOVDDUP(a, b, c) movddup a(b), c +#define MOVDDUP2(a, b, c) movddup a##b, c +#else +#define MOVDDUP(a, b, c) movlpd a(b), c;movhpd a(b), c +#define MOVDDUP2(a, b, c) movlpd a##b, c;movhpd a##b, c +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifndef WINDOWS_ABI + movq 8(%rsp), INCY +#else + movaps %xmm3, %xmm0 + movsd 40(%rsp), %xmm1 + + movq 48(%rsp), X + movq 56(%rsp), INCX + movq 64(%rsp), Y + movq 72(%rsp), INCY +#endif + + SAVEREGISTERS + + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + + testq M, M + jle .L999 + + cmpq $2 * SIZE, INCX + jne .L50 + cmpq $2 * SIZE, INCY + jne .L50 + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifdef HAVE_SSE3 + movddup %xmm0, ALPHA_R + movddup %xmm1, ALPHA_I +#else + pshufd $0x44, %xmm0, ALPHA_R + pshufd $0x44, %xmm1, ALPHA_I +#endif + +#ifndef CONJ + shufps $0x0c, %xmm7, %xmm7 + xorpd %xmm7, ALPHA_I +#else + shufps $0xc0, %xmm7, %xmm7 + xorpd %xmm7, ALPHA_R +#endif + + testq $SIZE, Y + jne .L30 + + testq $SIZE, X + jne .L20 + + movq M, %rax + sarq $3, %rax + jle .L15 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + movaps -12 * SIZE(X), %xmm2 + movaps -10 * SIZE(X), %xmm3 + + decq %rax + jle .L12 + ALIGN_3 + +.L11: + movaps -8 * SIZE(X), %xmm4 + movaps -6 * SIZE(X), %xmm5 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + +#if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) + pshufd $0x4e, %xmm0, %xmm8 +#else + movsd -15 * SIZE(X), %xmm8 + movhps -16 * SIZE(X), %xmm8 +#endif + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm8 + addpd -16 * SIZE(Y), %xmm0 + addpd %xmm8, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + +#ifdef USE_PSHUFD + pshufd $0x4e, %xmm1, %xmm8 +#else + movsd -13 * SIZE(X), %xmm8 + movhps -14 * SIZE(X), %xmm8 +#endif + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm8 + addpd -14 * SIZE(Y), %xmm1 + addpd %xmm8, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + movaps -4 * SIZE(X), %xmm6 + movaps -2 * SIZE(X), %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + +#if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) + pshufd $0x4e, %xmm2, %xmm8 +#else + movsd -11 * SIZE(X), %xmm8 + movhps -12 * SIZE(X), %xmm8 +#endif + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm8 + addpd -12 * SIZE(Y), %xmm2 + addpd %xmm8, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + +#ifdef USE_PSHUFD + pshufd $0x4e, %xmm3, %xmm8 +#else + movsd -9 * SIZE(X), %xmm8 + movhps -10 * SIZE(X), %xmm8 +#endif + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm8 + addpd -10 * SIZE(Y), %xmm3 + addpd %xmm8, %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + movaps 0 * SIZE(X), %xmm0 + movaps 2 * SIZE(X), %xmm1 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + +#if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) + pshufd $0x4e, %xmm4, %xmm8 +#else + movsd -7 * SIZE(X), %xmm8 + movhps -8 * SIZE(X), %xmm8 +#endif + mulpd ALPHA_R, %xmm4 + mulpd ALPHA_I, %xmm8 + addpd -8 * SIZE(Y), %xmm4 + addpd %xmm8, %xmm4 + movaps %xmm4, -8 * SIZE(Y) + +#ifdef USE_PSHUFD + pshufd $0x4e, %xmm5, %xmm8 
+#else + movsd -5 * SIZE(X), %xmm8 + movhps -6 * SIZE(X), %xmm8 +#endif + mulpd ALPHA_R, %xmm5 + mulpd ALPHA_I, %xmm8 + addpd -6 * SIZE(Y), %xmm5 + addpd %xmm8, %xmm5 + movaps %xmm5, -6 * SIZE(Y) + + movaps 4 * SIZE(X), %xmm2 + movaps 6 * SIZE(X), %xmm3 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + +#if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) + pshufd $0x4e, %xmm6, %xmm8 +#else + movsd -3 * SIZE(X), %xmm8 + movhps -4 * SIZE(X), %xmm8 +#endif + mulpd ALPHA_R, %xmm6 + mulpd ALPHA_I, %xmm8 + addpd -4 * SIZE(Y), %xmm6 + addpd %xmm8, %xmm6 + movaps %xmm6, -4 * SIZE(Y) + +#ifdef USE_PSHUFD + pshufd $0x4e, %xmm7, %xmm8 +#else + movsd -1 * SIZE(X), %xmm8 + movhps -2 * SIZE(X), %xmm8 +#endif + mulpd ALPHA_R, %xmm7 + mulpd ALPHA_I, %xmm8 + addpd -2 * SIZE(Y), %xmm7 + addpd %xmm8, %xmm7 + movaps %xmm7, -2 * SIZE(Y) + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + decq %rax + jg .L11 + ALIGN_3 + +.L12: + movaps -8 * SIZE(X), %xmm4 + movaps -6 * SIZE(X), %xmm5 + + pshufd $0x4e, %xmm0, %xmm8 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm8 + addpd -16 * SIZE(Y), %xmm0 + addpd %xmm8, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + pshufd $0x4e, %xmm1, %xmm8 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm8 + addpd -14 * SIZE(Y), %xmm1 + addpd %xmm8, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + movaps -4 * SIZE(X), %xmm6 + movaps -2 * SIZE(X), %xmm7 + + pshufd $0x4e, %xmm2, %xmm8 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm8 + addpd -12 * SIZE(Y), %xmm2 + addpd %xmm8, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + + pshufd $0x4e, %xmm3, %xmm8 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm8 + addpd -10 * SIZE(Y), %xmm3 + addpd %xmm8, %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + pshufd $0x4e, %xmm4, %xmm8 + mulpd ALPHA_R, %xmm4 + mulpd ALPHA_I, %xmm8 + addpd -8 * SIZE(Y), %xmm4 + addpd %xmm8, %xmm4 + movaps %xmm4, -8 * SIZE(Y) + + pshufd $0x4e, %xmm5, %xmm8 + mulpd ALPHA_R, %xmm5 + mulpd ALPHA_I, %xmm8 + addpd -6 * SIZE(Y), %xmm5 + addpd %xmm8, %xmm5 + movaps %xmm5, -6 * SIZE(Y) + + pshufd $0x4e, %xmm6, %xmm8 + mulpd ALPHA_R, %xmm6 + mulpd ALPHA_I, %xmm8 + addpd -4 * SIZE(Y), %xmm6 + addpd %xmm8, %xmm6 + movaps %xmm6, -4 * SIZE(Y) + + pshufd $0x4e, %xmm7, %xmm8 + mulpd ALPHA_R, %xmm7 + mulpd ALPHA_I, %xmm8 + addpd -2 * SIZE(Y), %xmm7 + addpd %xmm8, %xmm7 + movaps %xmm7, -2 * SIZE(Y) + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + ALIGN_3 + +.L15: + movq M, %rax + andq $4, %rax + jle .L16 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + movaps -12 * SIZE(X), %xmm2 + movaps -10 * SIZE(X), %xmm3 + + pshufd $0x4e, %xmm0, %xmm8 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm8 + addpd -16 * SIZE(Y), %xmm0 + addpd %xmm8, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + pshufd $0x4e, %xmm1, %xmm8 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm8 + addpd -14 * SIZE(Y), %xmm1 + addpd %xmm8, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + pshufd $0x4e, %xmm2, %xmm8 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm8 + addpd -12 * SIZE(Y), %xmm2 + addpd %xmm8, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + + pshufd $0x4e, %xmm3, %xmm8 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm8 + addpd -10 * SIZE(Y), %xmm3 + addpd %xmm8, %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L16: + movq M, %rax + andq $2, %rax + jle .L17 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm0, %xmm8 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm8 + addpd -16 * SIZE(Y), %xmm0 + addpd %xmm8, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + pshufd $0x4e, %xmm1, %xmm8 + mulpd 
ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm8 + addpd -14 * SIZE(Y), %xmm1 + addpd %xmm8, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L17: + movq M, %rax + andq $1, %rax + jle .L999 + + movaps -16 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm0, %xmm8 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm8 + addpd -16 * SIZE(Y), %xmm0 + addpd %xmm8, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L20: + movq M, %rax + sarq $3, %rax + jle .L25 + + movsd -16 * SIZE(X), %xmm0 + movhps -15 * SIZE(X), %xmm0 + movsd -14 * SIZE(X), %xmm1 + movhps -13 * SIZE(X), %xmm1 + movsd -12 * SIZE(X), %xmm2 + movhps -11 * SIZE(X), %xmm2 + movsd -10 * SIZE(X), %xmm3 + movhps -9 * SIZE(X), %xmm3 + + decq %rax + jle .L22 + ALIGN_3 + +.L21: + movsd -8 * SIZE(X), %xmm4 + movhps -7 * SIZE(X), %xmm4 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + pshufd $0x4e, %xmm0, %xmm8 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm8 + addpd -16 * SIZE(Y), %xmm0 + addpd %xmm8, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + movsd -6 * SIZE(X), %xmm5 + movhps -5 * SIZE(X), %xmm5 + + pshufd $0x4e, %xmm1, %xmm8 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm8 + addpd -14 * SIZE(Y), %xmm1 + addpd %xmm8, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + movsd -4 * SIZE(X), %xmm6 + movhps -3 * SIZE(X), %xmm6 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm2, %xmm8 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm8 + addpd -12 * SIZE(Y), %xmm2 + addpd %xmm8, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + + movsd -2 * SIZE(X), %xmm7 + movhps -1 * SIZE(X), %xmm7 + + pshufd $0x4e, %xmm3, %xmm8 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm8 + addpd -10 * SIZE(Y), %xmm3 + addpd %xmm8, %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + pshufd $0x4e, %xmm4, %xmm8 + mulpd ALPHA_R, %xmm4 + mulpd ALPHA_I, %xmm8 + addpd -8 * SIZE(Y), %xmm4 + addpd %xmm8, %xmm4 + movaps %xmm4, -8 * SIZE(Y) + + movsd 2 * SIZE(X), %xmm1 + movhps 3 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm5, %xmm8 + mulpd ALPHA_R, %xmm5 + mulpd ALPHA_I, %xmm8 + addpd -6 * SIZE(Y), %xmm5 + addpd %xmm8, %xmm5 + movaps %xmm5, -6 * SIZE(Y) + + movsd 4 * SIZE(X), %xmm2 + movhps 5 * SIZE(X), %xmm2 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm6, %xmm8 + mulpd ALPHA_R, %xmm6 + mulpd ALPHA_I, %xmm8 + addpd -4 * SIZE(Y), %xmm6 + addpd %xmm8, %xmm6 + movaps %xmm6, -4 * SIZE(Y) + + movsd 6 * SIZE(X), %xmm3 + movhps 7 * SIZE(X), %xmm3 + + pshufd $0x4e, %xmm7, %xmm8 + mulpd ALPHA_R, %xmm7 + mulpd ALPHA_I, %xmm8 + addpd -2 * SIZE(Y), %xmm7 + addpd %xmm8, %xmm7 + movaps %xmm7, -2 * SIZE(Y) + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + decq %rax + jg .L21 + ALIGN_3 + +.L22: + movsd -8 * SIZE(X), %xmm4 + movhps -7 * SIZE(X), %xmm4 + + pshufd $0x4e, %xmm0, %xmm8 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm8 + addpd -16 * SIZE(Y), %xmm0 + addpd %xmm8, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + movsd -6 * SIZE(X), %xmm5 + movhps -5 * SIZE(X), %xmm5 + + pshufd $0x4e, %xmm1, %xmm8 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm8 + addpd -14 * SIZE(Y), %xmm1 + addpd %xmm8, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + movsd -4 * SIZE(X), %xmm6 + movhps -3 * SIZE(X), %xmm6 + + pshufd $0x4e, %xmm2, %xmm8 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm8 + addpd -12 * SIZE(Y), %xmm2 + addpd %xmm8, %xmm2 + movaps %xmm2, -12 * 
SIZE(Y) + + movsd -2 * SIZE(X), %xmm7 + movhps -1 * SIZE(X), %xmm7 + + pshufd $0x4e, %xmm3, %xmm8 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm8 + addpd -10 * SIZE(Y), %xmm3 + addpd %xmm8, %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + pshufd $0x4e, %xmm4, %xmm8 + mulpd ALPHA_R, %xmm4 + mulpd ALPHA_I, %xmm8 + addpd -8 * SIZE(Y), %xmm4 + addpd %xmm8, %xmm4 + movaps %xmm4, -8 * SIZE(Y) + + pshufd $0x4e, %xmm5, %xmm8 + mulpd ALPHA_R, %xmm5 + mulpd ALPHA_I, %xmm8 + addpd -6 * SIZE(Y), %xmm5 + addpd %xmm8, %xmm5 + movaps %xmm5, -6 * SIZE(Y) + + pshufd $0x4e, %xmm6, %xmm8 + mulpd ALPHA_R, %xmm6 + mulpd ALPHA_I, %xmm8 + addpd -4 * SIZE(Y), %xmm6 + addpd %xmm8, %xmm6 + movaps %xmm6, -4 * SIZE(Y) + + pshufd $0x4e, %xmm7, %xmm8 + mulpd ALPHA_R, %xmm7 + mulpd ALPHA_I, %xmm8 + addpd -2 * SIZE(Y), %xmm7 + addpd %xmm8, %xmm7 + movaps %xmm7, -2 * SIZE(Y) + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + ALIGN_3 + +.L25: + movq M, %rax + andq $4, %rax + jle .L26 + + movsd -16 * SIZE(X), %xmm0 + movhps -15 * SIZE(X), %xmm0 + movsd -14 * SIZE(X), %xmm1 + movhps -13 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm0, %xmm8 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm8 + addpd -16 * SIZE(Y), %xmm0 + addpd %xmm8, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + pshufd $0x4e, %xmm1, %xmm8 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm8 + addpd -14 * SIZE(Y), %xmm1 + addpd %xmm8, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + movsd -12 * SIZE(X), %xmm2 + movhps -11 * SIZE(X), %xmm2 + movsd -10 * SIZE(X), %xmm3 + movhps -9 * SIZE(X), %xmm3 + + pshufd $0x4e, %xmm2, %xmm8 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm8 + addpd -12 * SIZE(Y), %xmm2 + addpd %xmm8, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + + pshufd $0x4e, %xmm3, %xmm8 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm8 + addpd -10 * SIZE(Y), %xmm3 + addpd %xmm8, %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L26: + movq M, %rax + andq $2, %rax + jle .L27 + + movsd -16 * SIZE(X), %xmm0 + movhps -15 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm0, %xmm8 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm8 + addpd -16 * SIZE(Y), %xmm0 + addpd %xmm8, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + movsd -14 * SIZE(X), %xmm1 + movhps -13 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm1, %xmm8 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm8 + addpd -14 * SIZE(Y), %xmm1 + addpd %xmm8, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L27: + movq M, %rax + andq $1, %rax + jle .L999 + + movsd -16 * SIZE(X), %xmm0 + movhps -15 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm0, %xmm8 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm8 + addpd -16 * SIZE(Y), %xmm0 + addpd %xmm8, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L30: + testq $SIZE, X + jne .L40 + + movaps -16 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm1, %xmm8 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm1 + xorps %xmm0, %xmm0 + SHUFPD_1 %xmm1, %xmm0 + + xorps %xmm4, %xmm4 + movhps -16 * SIZE(Y), %xmm4 + + addpd %xmm0, %xmm4 + movhps %xmm4, -16 * SIZE(Y) + movaps %xmm1, %xmm0 + + addq $2 * SIZE, X + addq $1 * SIZE, Y + decq M + jle .L39 + + movq M, %rax + sarq $3, %rax + jle .L35 + + movaps -16 * SIZE(X), %xmm1 + movaps -14 * SIZE(X), %xmm2 + movaps -12 * SIZE(X), %xmm3 + + decq %rax + jle .L32 + ALIGN_3 + +.L31: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + pshufd $0x4e, %xmm1, %xmm8 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps -10 * 
SIZE(X), %xmm0 + + pshufd $0x4e, %xmm2, %xmm8 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm2 + SHUFPD_1 %xmm2, %xmm1 + + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + movaps -8 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm3, %xmm8 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm3 + SHUFPD_1 %xmm3, %xmm2 + + addpd -12 * SIZE(Y), %xmm2 + movaps %xmm2, -12 * SIZE(Y) + movaps -6 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm0, %xmm8 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm0 + SHUFPD_1 %xmm0, %xmm3 + + addpd -10 * SIZE(Y), %xmm3 + movaps %xmm3, -10 * SIZE(Y) + movaps -4 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + pshufd $0x4e, %xmm1, %xmm8 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + + addpd -8 * SIZE(Y), %xmm0 + movaps %xmm0, -8 * SIZE(Y) + movaps -2 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm2, %xmm8 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm2 + SHUFPD_1 %xmm2, %xmm1 + + addpd -6 * SIZE(Y), %xmm1 + movaps %xmm1, -6 * SIZE(Y) + movaps 0 * SIZE(X), %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm3, %xmm8 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm3 + SHUFPD_1 %xmm3, %xmm2 + + addpd -4 * SIZE(Y), %xmm2 + movaps %xmm2, -4 * SIZE(Y) + movaps 2 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm0, %xmm8 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm0 + SHUFPD_1 %xmm0, %xmm3 + + addpd -2 * SIZE(Y), %xmm3 + movaps %xmm3, -2 * SIZE(Y) + movaps 4 * SIZE(X), %xmm3 + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + decq %rax + jg .L31 + ALIGN_3 + +.L32: + pshufd $0x4e, %xmm1, %xmm8 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps -10 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm2, %xmm8 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm2 + SHUFPD_1 %xmm2, %xmm1 + + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + movaps -8 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm3, %xmm8 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm3 + SHUFPD_1 %xmm3, %xmm2 + + addpd -12 * SIZE(Y), %xmm2 + movaps %xmm2, -12 * SIZE(Y) + movaps -6 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm0, %xmm8 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm0 + SHUFPD_1 %xmm0, %xmm3 + + addpd -10 * SIZE(Y), %xmm3 + movaps %xmm3, -10 * SIZE(Y) + movaps -4 * SIZE(X), %xmm3 + + pshufd $0x4e, %xmm1, %xmm8 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + + addpd -8 * SIZE(Y), %xmm0 + movaps %xmm0, -8 * SIZE(Y) + movaps -2 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm2, %xmm8 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm2 + SHUFPD_1 %xmm2, %xmm1 + + addpd -6 * SIZE(Y), %xmm1 + movaps %xmm1, -6 * SIZE(Y) + + pshufd $0x4e, %xmm3, %xmm8 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm3 + SHUFPD_1 %xmm3, %xmm2 + + addpd -4 * SIZE(Y), %xmm2 + movaps %xmm2, -4 * SIZE(Y) + + pshufd $0x4e, %xmm0, %xmm8 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm0 + SHUFPD_1 %xmm0, %xmm3 + + addpd -2 * SIZE(Y), %xmm3 + movaps %xmm3, -2 * SIZE(Y) + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + ALIGN_3 + +.L35: + movq M, %rax + andq $4, %rax + jle 
.L36 + + movaps -16 * SIZE(X), %xmm1 + movaps -14 * SIZE(X), %xmm2 + movaps -12 * SIZE(X), %xmm3 + movaps -10 * SIZE(X), %xmm4 + + pshufd $0x4e, %xmm1, %xmm8 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + pshufd $0x4e, %xmm2, %xmm8 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm2 + SHUFPD_1 %xmm2, %xmm1 + + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + pshufd $0x4e, %xmm3, %xmm8 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm3 + SHUFPD_1 %xmm3, %xmm2 + + addpd -12 * SIZE(Y), %xmm2 + movaps %xmm2, -12 * SIZE(Y) + + pshufd $0x4e, %xmm4, %xmm8 + mulpd ALPHA_R, %xmm4 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm4 + SHUFPD_1 %xmm4, %xmm3 + + addpd -10 * SIZE(Y), %xmm3 + movaps %xmm3, -10 * SIZE(Y) + movaps %xmm4, %xmm0 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L36: + movq M, %rax + andq $2, %rax + jle .L37 + + movaps -16 * SIZE(X), %xmm1 + movaps -14 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm1, %xmm8 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + pshufd $0x4e, %xmm2, %xmm8 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm2 + SHUFPD_1 %xmm2, %xmm1 + + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + movaps %xmm2, %xmm0 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L37: + movq M, %rax + andq $1, %rax + jle .L39 + + movaps -16 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm1, %xmm8 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, %xmm0 + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L39: + SHUFPD_1 %xmm0, %xmm0 + + addsd -16 * SIZE(Y), %xmm0 + movlps %xmm0, -16 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L40: + movsd -16 * SIZE(X), %xmm1 + movhps -15 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm1, %xmm8 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm1 + xorps %xmm0, %xmm0 + SHUFPD_1 %xmm1, %xmm0 + + xorps %xmm4, %xmm4 + movhps -16 * SIZE(Y), %xmm4 + + addpd %xmm0, %xmm4 + movhps %xmm4, -16 * SIZE(Y) + movaps %xmm1, %xmm0 + + addq $2 * SIZE, X + addq $1 * SIZE, Y + decq M + jle .L49 + + movq M, %rax + sarq $3, %rax + jle .L45 + + movsd -16 * SIZE(X), %xmm1 + movhps -15 * SIZE(X), %xmm1 + movsd -14 * SIZE(X), %xmm2 + movhps -13 * SIZE(X), %xmm2 + movsd -12 * SIZE(X), %xmm3 + movhps -11 * SIZE(X), %xmm3 + + decq %rax + jle .L42 + ALIGN_3 + +.L41: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + pshufd $0x4e, %xmm1, %xmm8 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movsd -10 * SIZE(X), %xmm0 + movhps -9 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm2, %xmm8 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm2 + SHUFPD_1 %xmm2, %xmm1 + + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + movsd -8 * SIZE(X), %xmm1 + movhps -7 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm3, %xmm8 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm3 + SHUFPD_1 %xmm3, %xmm2 + + addpd -12 * SIZE(Y), %xmm2 + movaps %xmm2, -12 * SIZE(Y) + movsd -6 * SIZE(X), %xmm2 + movhps -5 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm0, %xmm8 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm8 + 
addpd %xmm8, %xmm0 + SHUFPD_1 %xmm0, %xmm3 + + addpd -10 * SIZE(Y), %xmm3 + movaps %xmm3, -10 * SIZE(Y) + movsd -4 * SIZE(X), %xmm3 + movhps -3 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + pshufd $0x4e, %xmm1, %xmm8 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + + addpd -8 * SIZE(Y), %xmm0 + movaps %xmm0, -8 * SIZE(Y) + movsd -2 * SIZE(X), %xmm0 + movhps -1 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm2, %xmm8 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm2 + SHUFPD_1 %xmm2, %xmm1 + + addpd -6 * SIZE(Y), %xmm1 + movaps %xmm1, -6 * SIZE(Y) + movsd 0 * SIZE(X), %xmm1 + movhps 1 * SIZE(X), %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm3, %xmm8 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm3 + SHUFPD_1 %xmm3, %xmm2 + + addpd -4 * SIZE(Y), %xmm2 + movaps %xmm2, -4 * SIZE(Y) + movsd 2 * SIZE(X), %xmm2 + movhps 3 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm0, %xmm8 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm0 + SHUFPD_1 %xmm0, %xmm3 + + addpd -2 * SIZE(Y), %xmm3 + movaps %xmm3, -2 * SIZE(Y) + movsd 4 * SIZE(X), %xmm3 + movhps 5 * SIZE(X), %xmm3 + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + decq %rax + jg .L41 + ALIGN_3 + +.L42: + pshufd $0x4e, %xmm1, %xmm8 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movsd -10 * SIZE(X), %xmm0 + movhps -9 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm2, %xmm8 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm2 + SHUFPD_1 %xmm2, %xmm1 + + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + movsd -8 * SIZE(X), %xmm1 + movhps -7 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm3, %xmm8 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm3 + SHUFPD_1 %xmm3, %xmm2 + + addpd -12 * SIZE(Y), %xmm2 + movaps %xmm2, -12 * SIZE(Y) + movsd -6 * SIZE(X), %xmm2 + movhps -5 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm0, %xmm8 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm0 + SHUFPD_1 %xmm0, %xmm3 + + addpd -10 * SIZE(Y), %xmm3 + movaps %xmm3, -10 * SIZE(Y) + movsd -4 * SIZE(X), %xmm3 + movhps -3 * SIZE(X), %xmm3 + + pshufd $0x4e, %xmm1, %xmm8 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + + addpd -8 * SIZE(Y), %xmm0 + movaps %xmm0, -8 * SIZE(Y) + movsd -2 * SIZE(X), %xmm0 + movhps -1 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm2, %xmm8 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm2 + SHUFPD_1 %xmm2, %xmm1 + + addpd -6 * SIZE(Y), %xmm1 + movaps %xmm1, -6 * SIZE(Y) + + pshufd $0x4e, %xmm3, %xmm8 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm3 + SHUFPD_1 %xmm3, %xmm2 + + addpd -4 * SIZE(Y), %xmm2 + movaps %xmm2, -4 * SIZE(Y) + + pshufd $0x4e, %xmm0, %xmm8 + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm0 + SHUFPD_1 %xmm0, %xmm3 + + addpd -2 * SIZE(Y), %xmm3 + movaps %xmm3, -2 * SIZE(Y) + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + ALIGN_3 + +.L45: + movq M, %rax + andq $4, %rax + jle .L46 + + movsd -16 * SIZE(X), %xmm1 + movhps -15 * SIZE(X), %xmm1 + movsd -14 * SIZE(X), %xmm2 + movhps -13 * SIZE(X), %xmm2 + movsd -12 * SIZE(X), %xmm3 + movhps -11 * SIZE(X), %xmm3 + movsd -10 * SIZE(X), %xmm4 + movhps -9 * SIZE(X), %xmm4 + + pshufd $0x4e, %xmm1, %xmm8 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, 
%xmm8 + addpd %xmm8, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + pshufd $0x4e, %xmm2, %xmm8 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm2 + SHUFPD_1 %xmm2, %xmm1 + + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + pshufd $0x4e, %xmm3, %xmm8 + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm3 + SHUFPD_1 %xmm3, %xmm2 + + addpd -12 * SIZE(Y), %xmm2 + movaps %xmm2, -12 * SIZE(Y) + + pshufd $0x4e, %xmm4, %xmm8 + mulpd ALPHA_R, %xmm4 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm4 + SHUFPD_1 %xmm4, %xmm3 + + addpd -10 * SIZE(Y), %xmm3 + movaps %xmm3, -10 * SIZE(Y) + movaps %xmm4, %xmm0 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L46: + movq M, %rax + andq $2, %rax + jle .L47 + + movsd -16 * SIZE(X), %xmm1 + movhps -15 * SIZE(X), %xmm1 + movsd -14 * SIZE(X), %xmm2 + movhps -13 * SIZE(X), %xmm2 + + pshufd $0x4e, %xmm1, %xmm8 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + pshufd $0x4e, %xmm2, %xmm8 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm2 + SHUFPD_1 %xmm2, %xmm1 + + addpd -14 * SIZE(Y), %xmm1 + movaps %xmm1, -14 * SIZE(Y) + movaps %xmm2, %xmm0 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L47: + movq M, %rax + andq $1, %rax + jle .L49 + + movsd -16 * SIZE(X), %xmm1 + movhps -15 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm1, %xmm8 + mulpd ALPHA_R, %xmm1 + mulpd ALPHA_I, %xmm8 + addpd %xmm8, %xmm1 + SHUFPD_1 %xmm1, %xmm0 + + addpd -16 * SIZE(Y), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, %xmm0 + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L49: + SHUFPD_1 %xmm0, %xmm0 + + addsd -16 * SIZE(Y), %xmm0 + movlps %xmm0, -16 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L50: +#ifndef CONJ + movaps %xmm0, %xmm14 # a 0 + + pxor %xmm15, %xmm15 # 0 0 + subsd %xmm1, %xmm15 # -b 0 + + unpcklpd %xmm14, %xmm15 # -b a + unpcklpd %xmm1, %xmm14 # a b +#else + movaps %xmm0, %xmm14 # a 0 + movaps %xmm1, %xmm15 # b 0 + + pxor %xmm13, %xmm13 # 0 0 + subsd %xmm0, %xmm13 # -a 0 + + unpcklpd %xmm13, %xmm15 # b -a + unpcklpd %xmm1, %xmm14 # a b +#endif + + movq Y, YY + movq M, %rax + sarq $3, %rax + jle .L55 + + MOVDDUP( 0 * SIZE, X, %xmm0) + MOVDDUP( 1 * SIZE, X, %xmm1) + addq INCX, X + MOVDDUP( 0 * SIZE, X, %xmm2) + MOVDDUP( 1 * SIZE, X, %xmm3) + addq INCX, X + MOVDDUP( 0 * SIZE, X, %xmm4) + MOVDDUP( 1 * SIZE, X, %xmm5) + addq INCX, X + MOVDDUP( 0 * SIZE, X, %xmm6) + MOVDDUP( 1 * SIZE, X, %xmm7) + addq INCX, X + + movsd 0 * SIZE(Y), %xmm8 + movhpd 1 * SIZE(Y), %xmm8 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm9 + movhpd 1 * SIZE(Y), %xmm9 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm10 + movhpd 1 * SIZE(Y), %xmm10 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm11 + movhpd 1 * SIZE(Y), %xmm11 + addq INCY, Y + + mulpd %xmm14, %xmm0 + mulpd %xmm14, %xmm2 + mulpd %xmm14, %xmm4 + mulpd %xmm14, %xmm6 + + decq %rax + jle .L52 + ALIGN_3 + +.L51: + addpd %xmm0, %xmm8 + mulpd %xmm15, %xmm1 + addpd %xmm2, %xmm9 + mulpd %xmm15, %xmm3 + addpd %xmm4, %xmm10 + mulpd %xmm15, %xmm5 + addpd %xmm6, %xmm11 + mulpd %xmm15, %xmm7 + + addpd %xmm1, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm5, %xmm10 + addpd %xmm7, %xmm11 + + MOVDDUP( 0 * SIZE, X, %xmm0) + MOVDDUP( 1 * SIZE, X, %xmm1) + addq INCX, X + MOVDDUP( 0 * SIZE, X, %xmm2) + MOVDDUP( 1 * SIZE, X, %xmm3) + addq INCX, X + MOVDDUP( 0 * SIZE, X, %xmm4) + MOVDDUP( 1 * SIZE, X, %xmm5) + addq INCX, X + MOVDDUP( 0 * SIZE, X, %xmm6) + MOVDDUP( 1 * SIZE, X, %xmm7) + 
addq INCX, X + + mulpd %xmm14, %xmm0 + mulpd %xmm14, %xmm2 + mulpd %xmm14, %xmm4 + mulpd %xmm14, %xmm6 + + movlpd %xmm8, 0 * SIZE(YY) + movhpd %xmm8, 1 * SIZE(YY) + addq INCY, YY + movlpd %xmm9, 0 * SIZE(YY) + movhpd %xmm9, 1 * SIZE(YY) + addq INCY, YY + movlpd %xmm10, 0 * SIZE(YY) + movhpd %xmm10, 1 * SIZE(YY) + addq INCY, YY + movlpd %xmm11, 0 * SIZE(YY) + movhpd %xmm11, 1 * SIZE(YY) + addq INCY, YY + + movsd 0 * SIZE(Y), %xmm8 + movhpd 1 * SIZE(Y), %xmm8 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm9 + movhpd 1 * SIZE(Y), %xmm9 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm10 + movhpd 1 * SIZE(Y), %xmm10 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm11 + movhpd 1 * SIZE(Y), %xmm11 + addq INCY, Y + + addpd %xmm0, %xmm8 + mulpd %xmm15, %xmm1 + addpd %xmm2, %xmm9 + mulpd %xmm15, %xmm3 + addpd %xmm4, %xmm10 + mulpd %xmm15, %xmm5 + addpd %xmm6, %xmm11 + mulpd %xmm15, %xmm7 + + addpd %xmm1, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm5, %xmm10 + addpd %xmm7, %xmm11 + + MOVDDUP( 0 * SIZE, X, %xmm0) + MOVDDUP( 1 * SIZE, X, %xmm1) + addq INCX, X + MOVDDUP( 0 * SIZE, X, %xmm2) + MOVDDUP( 1 * SIZE, X, %xmm3) + addq INCX, X + MOVDDUP( 0 * SIZE, X, %xmm4) + MOVDDUP( 1 * SIZE, X, %xmm5) + addq INCX, X + MOVDDUP( 0 * SIZE, X, %xmm6) + MOVDDUP( 1 * SIZE, X, %xmm7) + addq INCX, X + + mulpd %xmm14, %xmm0 + mulpd %xmm14, %xmm2 + mulpd %xmm14, %xmm4 + mulpd %xmm14, %xmm6 + + movlpd %xmm8, 0 * SIZE(YY) + movhpd %xmm8, 1 * SIZE(YY) + addq INCY, YY + movlpd %xmm9, 0 * SIZE(YY) + movhpd %xmm9, 1 * SIZE(YY) + addq INCY, YY + movlpd %xmm10, 0 * SIZE(YY) + movhpd %xmm10, 1 * SIZE(YY) + addq INCY, YY + movlpd %xmm11, 0 * SIZE(YY) + movhpd %xmm11, 1 * SIZE(YY) + addq INCY, YY + + movsd 0 * SIZE(Y), %xmm8 + movhpd 1 * SIZE(Y), %xmm8 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm9 + movhpd 1 * SIZE(Y), %xmm9 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm10 + movhpd 1 * SIZE(Y), %xmm10 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm11 + movhpd 1 * SIZE(Y), %xmm11 + addq INCY, Y + + decq %rax + jg .L51 + ALIGN_3 + +.L52: + addpd %xmm0, %xmm8 + mulpd %xmm15, %xmm1 + addpd %xmm2, %xmm9 + mulpd %xmm15, %xmm3 + addpd %xmm4, %xmm10 + mulpd %xmm15, %xmm5 + addpd %xmm6, %xmm11 + mulpd %xmm15, %xmm7 + + addpd %xmm1, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm5, %xmm10 + addpd %xmm7, %xmm11 + + MOVDDUP( 0 * SIZE, X, %xmm0) + MOVDDUP( 1 * SIZE, X, %xmm1) + addq INCX, X + MOVDDUP( 0 * SIZE, X, %xmm2) + MOVDDUP( 1 * SIZE, X, %xmm3) + addq INCX, X + MOVDDUP( 0 * SIZE, X, %xmm4) + MOVDDUP( 1 * SIZE, X, %xmm5) + addq INCX, X + MOVDDUP( 0 * SIZE, X, %xmm6) + MOVDDUP( 1 * SIZE, X, %xmm7) + addq INCX, X + + mulpd %xmm14, %xmm0 + mulpd %xmm14, %xmm2 + mulpd %xmm14, %xmm4 + mulpd %xmm14, %xmm6 + + movlpd %xmm8, 0 * SIZE(YY) + movhpd %xmm8, 1 * SIZE(YY) + addq INCY, YY + movlpd %xmm9, 0 * SIZE(YY) + movhpd %xmm9, 1 * SIZE(YY) + addq INCY, YY + movlpd %xmm10, 0 * SIZE(YY) + movhpd %xmm10, 1 * SIZE(YY) + addq INCY, YY + movlpd %xmm11, 0 * SIZE(YY) + movhpd %xmm11, 1 * SIZE(YY) + addq INCY, YY + + movsd 0 * SIZE(Y), %xmm8 + movhpd 1 * SIZE(Y), %xmm8 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm9 + movhpd 1 * SIZE(Y), %xmm9 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm10 + movhpd 1 * SIZE(Y), %xmm10 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm11 + movhpd 1 * SIZE(Y), %xmm11 + addq INCY, Y + + addpd %xmm0, %xmm8 + mulpd %xmm15, %xmm1 + addpd %xmm2, %xmm9 + mulpd %xmm15, %xmm3 + addpd %xmm4, %xmm10 + mulpd %xmm15, %xmm5 + addpd %xmm6, %xmm11 + mulpd %xmm15, %xmm7 + + addpd %xmm1, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm5, %xmm10 + addpd %xmm7, %xmm11 + + movlpd %xmm8, 0 * SIZE(YY) + movhpd %xmm8, 1 * 
SIZE(YY) + addq INCY, YY + movlpd %xmm9, 0 * SIZE(YY) + movhpd %xmm9, 1 * SIZE(YY) + addq INCY, YY + movlpd %xmm10, 0 * SIZE(YY) + movhpd %xmm10, 1 * SIZE(YY) + addq INCY, YY + movlpd %xmm11, 0 * SIZE(YY) + movhpd %xmm11, 1 * SIZE(YY) + addq INCY, YY + ALIGN_3 + +.L55: + movq M, %rax + andq $4, %rax + jle .L56 + + MOVDDUP( 0 * SIZE, X, %xmm0) + MOVDDUP( 1 * SIZE, X, %xmm1) + addq INCX, X + MOVDDUP( 0 * SIZE, X, %xmm2) + MOVDDUP( 1 * SIZE, X, %xmm3) + addq INCX, X + + MOVDDUP( 0 * SIZE, X, %xmm4) + MOVDDUP( 1 * SIZE, X, %xmm5) + addq INCX, X + MOVDDUP( 0 * SIZE, X, %xmm6) + MOVDDUP( 1 * SIZE, X, %xmm7) + addq INCX, X + + movsd 0 * SIZE(Y), %xmm8 + movhpd 1 * SIZE(Y), %xmm8 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm9 + movhpd 1 * SIZE(Y), %xmm9 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm10 + movhpd 1 * SIZE(Y), %xmm10 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm11 + movhpd 1 * SIZE(Y), %xmm11 + addq INCY, Y + + mulpd %xmm14, %xmm0 + mulpd %xmm14, %xmm2 + mulpd %xmm14, %xmm4 + mulpd %xmm14, %xmm6 + + addpd %xmm0, %xmm8 + mulpd %xmm15, %xmm1 + addpd %xmm2, %xmm9 + mulpd %xmm15, %xmm3 + addpd %xmm4, %xmm10 + mulpd %xmm15, %xmm5 + addpd %xmm6, %xmm11 + mulpd %xmm15, %xmm7 + + addpd %xmm1, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm5, %xmm10 + addpd %xmm7, %xmm11 + + movlpd %xmm8, 0 * SIZE(YY) + movhpd %xmm8, 1 * SIZE(YY) + addq INCY, YY + movlpd %xmm9, 0 * SIZE(YY) + movhpd %xmm9, 1 * SIZE(YY) + addq INCY, YY + movlpd %xmm10, 0 * SIZE(YY) + movhpd %xmm10, 1 * SIZE(YY) + addq INCY, YY + movlpd %xmm11, 0 * SIZE(YY) + movhpd %xmm11, 1 * SIZE(YY) + addq INCY, YY + ALIGN_3 + +.L56: + movq M, %rax + andq $2, %rax + jle .L57 + + MOVDDUP( 0 * SIZE, X, %xmm0) + MOVDDUP( 1 * SIZE, X, %xmm1) + addq INCX, X + MOVDDUP( 0 * SIZE, X, %xmm2) + MOVDDUP( 1 * SIZE, X, %xmm3) + addq INCX, X + + movsd 0 * SIZE(Y), %xmm8 + movhpd 1 * SIZE(Y), %xmm8 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm9 + movhpd 1 * SIZE(Y), %xmm9 + addq INCY, Y + + mulpd %xmm14, %xmm0 + mulpd %xmm14, %xmm2 + mulpd %xmm15, %xmm1 + mulpd %xmm15, %xmm3 + + addpd %xmm0, %xmm8 + addpd %xmm2, %xmm9 + addpd %xmm1, %xmm8 + addpd %xmm3, %xmm9 + + movlpd %xmm8, 0 * SIZE(YY) + movhpd %xmm8, 1 * SIZE(YY) + addq INCY, YY + movlpd %xmm9, 0 * SIZE(YY) + movhpd %xmm9, 1 * SIZE(YY) + addq INCY, YY + ALIGN_3 + +.L57: + movq M, %rax + andq $1, %rax + jle .L999 + + MOVDDUP( 0 * SIZE, X, %xmm0) + MOVDDUP( 1 * SIZE, X, %xmm1) + + movsd 0 * SIZE(Y), %xmm8 + movhpd 1 * SIZE(Y), %xmm8 + mulpd %xmm14, %xmm0 + mulpd %xmm15, %xmm1 + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm8 + + movlpd %xmm8, 0 * SIZE(YY) + movhpd %xmm8, 1 * SIZE(YY) + ALIGN_3 + +.L999: + xorq %rax, %rax + + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/zcopy.S b/kernel/x86_64/zcopy.S new file mode 100644 index 0000000000..d76426b665 --- /dev/null +++ b/kernel/x86_64/zcopy.S @@ -0,0 +1,389 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ +#define Y ARG4 /* rcx */ +#ifndef WINDOWS_ABI +#define INCY ARG5 /* r8 */ +#define FLAG ARG6 +#else +#define INCY %r10 +#define FLAG %r11 +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + movq 40(%rsp), INCY +#endif + + EMMS + + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + + testq N, N # if m == 0 goto End + jle .L999 + + cmpq $2 * SIZE, INCX # if incx != 1 + jne .L100 + cmpq $2 * SIZE, INCY # if incy != 1 + jne .L100 + + movq N, %rax # i = m + sarq $2, %rax + jle .L20 + ALIGN_2 + +.L11: +#ifdef XDOUBLE + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movq 0(X), %mm0 + movq %mm0, 0(Y) + + movq 8(X), %mm1 + movq %mm1, 8(Y) + + movq 16(X), %mm2 + movq %mm2, 16(Y) + + movq 24(X), %mm3 + movq %mm3, 24(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movq 32(X), %mm4 + movq %mm4, 32(Y) + + movq 40(X), %mm5 + movq %mm5, 40(Y) + + movq 48(X), %mm6 + movq %mm6, 48(Y) + + movq 56(X), %mm7 + movq %mm7, 56(Y) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movq 64(X), %mm0 + movq %mm0, 64(Y) + + movq 72(X), %mm1 + movq %mm1, 72(Y) + + movq 80(X), %mm2 + movq %mm2, 80(Y) + + movq 88(X), %mm3 + movq %mm3, 88(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movq 96(X), %mm4 + movq %mm4, 96(Y) + + movq 104(X), %mm5 + movq %mm5, 104(Y) + + movq 112(X), %mm6 + movq %mm6, 112(Y) + + movq 120(X), %mm7 + movq %mm7, 120(Y) +#elif defined(DOUBLE) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movq 0 * SIZE(X), %mm0 + movq 1 * SIZE(X), %mm1 + + movq %mm0, 0 * SIZE(Y) + movq %mm1, 1 * SIZE(Y) + + movq 2 * SIZE(X), %mm2 + movq 3 * SIZE(X), %mm3 + + movq %mm2, 2 * SIZE(Y) + movq %mm3, 3 * SIZE(Y) + + movq 4 * SIZE(X), %mm4 + movq 5 * SIZE(X), %mm5 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movq %mm4, 4 * SIZE(Y) + movq %mm5, 5 * SIZE(Y) + + movq 6 * SIZE(X), %mm6 + 
movq 7 * SIZE(X), %mm7 + + movq %mm6, 6 * SIZE(Y) + movq %mm7, 7 * SIZE(Y) +#else + movq 0 * SIZE(X), %mm0 + movq 2 * SIZE(X), %mm2 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movq %mm0, 0 * SIZE(Y) + movq %mm2, 2 * SIZE(Y) + + movq 4 * SIZE(X), %mm4 + movq 6 * SIZE(X), %mm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movq %mm4, 4 * SIZE(Y) + movq %mm6, 6 * SIZE(Y) +#endif + addq $8 * SIZE, X + addq $8 * SIZE, Y + decq %rax + jg .L11 + ALIGN_2 + +.L20: + movq N, %rax # i = m + andq $3, %rax + jle .L99 + ALIGN_2 + +.L21: +#ifdef XDOUBLE + movq 0(X), %mm0 + movq %mm0, 0(Y) + movq 8(X), %mm1 + movq %mm1, 8(Y) + movq 16(X), %mm2 + movq %mm2, 16(Y) + movq 24(X), %mm3 + movq %mm3, 24(Y) +#elif defined(DOUBLE) + movq 0 * SIZE(X), %mm0 + movq %mm0, 0 * SIZE(Y) + movq 1 * SIZE(X), %mm1 + movq %mm1, 1 * SIZE(Y) +#else + movq 0 * SIZE(X), %mm0 + movq %mm0, 0 * SIZE(Y) +#endif + + addq $2 * SIZE, X + addq $2 * SIZE, Y + decq %rax + jg .L21 + +.L99: + xorq %rax,%rax + EMMS + ret + ALIGN_3 + +.L100: + movq N, %rax + sarq $2, %rax + jle .L120 + ALIGN_2 + +.L111: +#ifdef XDOUBLE + movq 0(X), %mm0 + movq %mm0, 0(Y) + movq 8(X), %mm1 + movq %mm1, 8(Y) + movq 16(X), %mm2 + movq %mm2, 16(Y) + movq 24(X), %mm3 + movq %mm3, 24(Y) + addq INCX, X + addq INCY, Y + + movq 0(X), %mm0 + movq %mm0, 0(Y) + movq 8(X), %mm1 + movq %mm1, 8(Y) + movq 16(X), %mm2 + movq %mm2, 16(Y) + movq 24(X), %mm3 + movq %mm3, 24(Y) + addq INCX, X + addq INCY, Y + + movq 0(X), %mm0 + movq %mm0, 0(Y) + movq 8(X), %mm1 + movq %mm1, 8(Y) + movq 16(X), %mm2 + movq %mm2, 16(Y) + movq 24(X), %mm3 + movq %mm3, 24(Y) + addq INCX, X + addq INCY, Y + + movq 0(X), %mm0 + movq %mm0, 0(Y) + movq 8(X), %mm1 + movq %mm1, 8(Y) + movq 16(X), %mm2 + movq %mm2, 16(Y) + movq 24(X), %mm3 + movq %mm3, 24(Y) + addq INCX, X + addq INCY, Y +#elif defined(DOUBLE) + movq 0 * SIZE(X), %mm0 + movq %mm0, 0 * SIZE(Y) + movq 1 * SIZE(X), %mm1 + movq %mm1, 1 * SIZE(Y) + addq INCX, X + addq INCY, Y + + movq 0 * SIZE(X), %mm2 + movq %mm2, 0 * SIZE(Y) + movq 1 * SIZE(X), %mm3 + movq %mm3, 1 * SIZE(Y) + addq INCX, X + addq INCY, Y + + movq 0 * SIZE(X), %mm4 + movq %mm4, 0 * SIZE(Y) + movq 1 * SIZE(X), %mm5 + movq %mm5, 1 * SIZE(Y) + addq INCX, X + addq INCY, Y + + movq 0 * SIZE(X), %mm6 + movq %mm6, 0 * SIZE(Y) + movq 1 * SIZE(X), %mm7 + movq %mm7, 1 * SIZE(Y) + addq INCX, X + addq INCY, Y +#else + movq 0 * SIZE(X), %mm0 + movq %mm0, 0 * SIZE(Y) + addq INCX, X + addq INCY, Y + + movq 0 * SIZE(X), %mm2 + movq %mm2, 0 * SIZE(Y) + addq INCX, X + addq INCY, Y + + movq 0 * SIZE(X), %mm4 + movq %mm4, 0 * SIZE(Y) + addq INCX, X + addq INCY, Y + + movq 0 * SIZE(X), %mm6 + movq %mm6, 0 * SIZE(Y) + addq INCX, X + addq INCY, Y +#endif + + decq %rax + jg .L111 + +.L120: + movq N, %rax + andq $3, %rax + jle .L999 + ALIGN_2 + +.L121: +#ifdef XDOUBLE + movq 0(X), %mm0 + movq %mm0, 0(Y) + movq 8(X), %mm1 + movq %mm1, 8(Y) + movq 16(X), %mm2 + movq %mm2, 16(Y) + movq 24(X), %mm3 + movq %mm3, 24(Y) + addq INCX, X + addq INCY, Y +#elif defined(DOUBLE) + movq 0 * SIZE(X), %mm0 + movq %mm0, 0 * SIZE(Y) + movq 1 * SIZE(X), %mm1 + movq %mm1, 1 * SIZE(Y) + addq INCX, X + addq INCY, Y +#else + movq 0 * SIZE(X), %mm0 + movq %mm0, 0 * SIZE(Y) + addq INCX, X + addq INCY, Y +#endif + + decq %rax + jg .L121 + +.L999: + xorq %rax,%rax + EMMS + ret + + EPILOGUE + diff --git a/kernel/x86_64/zcopy_sse.S b/kernel/x86_64/zcopy_sse.S new file mode 100644 index 0000000000..91f283aaf8 --- /dev/null +++ b/kernel/x86_64/zcopy_sse.S @@ -0,0 +1,992 @@ 
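The complex copy kernels in this part of the patch (the MMX version just above and the SSE/SSE2 versions that follow) are fast paths for one simple operation, adding unrolling, prefetching and, in the SSE variants, alignment fix-up. A minimal C sketch of that operation for the double-precision case, illustrative only (the name zcopy_ref and positive element-count strides are assumed):

    /* y[i] = x[i] for n complex (double) elements; incx/incy are strides
       in complex elements. The assembly kernels compute exactly this,
       just wider and with software prefetch. */
    static void zcopy_ref(long n, const double *x, long incx,
                          double *y, long incy)
    {
        for (long i = 0; i < n; i++) {
            y[2 * i * incy]     = x[2 * i * incx];      /* real part */
            y[2 * i * incy + 1] = x[2 * i * incx + 1];  /* imaginary part */
        }
    }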
+/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ +#define Y ARG4 /* rcx */ +#ifndef WINDOWS_ABI +#define INCY ARG5 /* r8 */ +#else +#define INCY %r10 +#endif + +#include "l1param.h" + +#ifdef OPTERON +#define LOAD(OFFSET, ADDR, REG) xorps REG, REG; addps OFFSET(ADDR), REG +#else +#define LOAD(OFFSET, ADDR, REG) movaps OFFSET(ADDR), REG +#endif + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + movq 40(%rsp), INCY +#endif + + SAVEREGISTERS + + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + + cmpq $2 * SIZE, INCX + jne .L100 + cmpq $2 * SIZE, INCY + jne .L100 + + cmpq $3, M + jle .L106 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + addq M, M + + testq $SIZE, Y + je .L05 + + movss -32 * SIZE(X), %xmm0 + movss %xmm0, -32 * SIZE(Y) + addq $1 * SIZE, X + addq $1 * SIZE, Y + decq M + ALIGN_4 + +.L05: + testq $2 * SIZE, Y + je .L10 + + movsd -32 * SIZE(X), %xmm0 + movlps %xmm0, -32 * SIZE(Y) + addq $2 * SIZE, X + addq $2 * SIZE, Y + subq $2, M + jle .L19 + ALIGN_4 + +.L10: + testq $3 * SIZE, X + jne .L20 + + movq M, %rax + sarq $5, %rax + jle .L13 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + movaps -20 * SIZE(X), %xmm3 + movaps -16 * SIZE(X), %xmm4 + movaps -12 * SIZE(X), %xmm5 + movaps -8 * SIZE(X), %xmm6 + movaps -4 * SIZE(X), %xmm7 + + decq %rax + jle .L12 + ALIGN_3 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps %xmm0, -32 * SIZE(Y) + LOAD( 0 * SIZE, X, %xmm0) + movaps %xmm1, -28 * SIZE(Y) + LOAD( 4 * SIZE, X, %xmm1) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm2, -24 * SIZE(Y) + LOAD( 8 * SIZE, X, %xmm2) + movaps %xmm3, -20 * SIZE(Y) + LOAD(12 * SIZE, X, %xmm3) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps %xmm4,-16 * SIZE(Y) + LOAD(16 * SIZE, X, %xmm4) + movaps %xmm5,-12 * SIZE(Y) + LOAD(20 * SIZE, X, %xmm5) + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps %xmm6, -8 * SIZE(Y) + LOAD(24 * SIZE, X, %xmm6) + movaps %xmm7, -4 * SIZE(Y) + LOAD(28 * SIZE, X, %xmm7) + + subq $-32 * SIZE, Y + subq $-32 * SIZE, X + decq %rax + jg .L11 + ALIGN_3 + +.L12: + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, -24 * SIZE(Y) + movaps %xmm3, -20 * SIZE(Y) + movaps %xmm4, -16 * SIZE(Y) + movaps %xmm5, -12 * SIZE(Y) + movaps %xmm6, -8 * SIZE(Y) + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, Y + subq $-32 * SIZE, X + ALIGN_3 + +.L13: + testq $16, M + jle .L14 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + movaps -20 * SIZE(X), %xmm3 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, -24 * SIZE(Y) + movaps %xmm3, -20 * SIZE(Y) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L14: + testq $8, M + jle .L15 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L15: + testq $4, M + jle .L16 + + movaps -32 * SIZE(X), %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L16: + testq $2, M + jle .L17 + + movsd -32 * SIZE(X), %xmm0 + movlps %xmm0, -32 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L17: + testq $1, M + 
jle .L19 + + movss -32 * SIZE(X), %xmm0 + movss %xmm0, -32 * SIZE(Y) + ALIGN_3 + +.L19: + xorq %rax,%rax + + RESTOREREGISTERS + + ret + ALIGN_3 + + +.L20: + testq $SIZE, X + jne .L30 + + movhps -32 * SIZE(X), %xmm0 + + movq M, %rax + sarq $5, %rax + jle .L23 + + movaps -30 * SIZE(X), %xmm1 + movaps -26 * SIZE(X), %xmm2 + movaps -22 * SIZE(X), %xmm3 + movaps -18 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + movaps -10 * SIZE(X), %xmm6 + movaps -6 * SIZE(X), %xmm7 + + decq %rax + jle .L22 + ALIGN_4 + +.L21: + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + shufps $0x4e, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -2 * SIZE(X), %xmm0 + + shufps $0x4e, %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps 2 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + shufps $0x4e, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps 6 * SIZE(X), %xmm2 + + shufps $0x4e, %xmm4, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps 10 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + shufps $0x4e, %xmm5, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + movaps 14 * SIZE(X), %xmm4 + + shufps $0x4e, %xmm6, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + movaps 18 * SIZE(X), %xmm5 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + shufps $0x4e, %xmm7, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + movaps 22 * SIZE(X), %xmm6 + + shufps $0x4e, %xmm0, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + movaps 26 * SIZE(X), %xmm7 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + decq %rax + jg .L21 + ALIGN_3 + +.L22: + shufps $0x4e, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -2 * SIZE(X), %xmm0 + + shufps $0x4e, %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + shufps $0x4e, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + shufps $0x4e, %xmm4, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + shufps $0x4e, %xmm5, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + shufps $0x4e, %xmm6, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + shufps $0x4e, %xmm7, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + shufps $0x4e, %xmm0, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L23: + testq $16, M + jle .L24 + ALIGN_3 + + movaps -30 * SIZE(X), %xmm1 + movaps -26 * SIZE(X), %xmm2 + movaps -22 * SIZE(X), %xmm3 + movaps -18 * SIZE(X), %xmm4 + + shufps $0x4e, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + shufps $0x4e, %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + shufps $0x4e, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + shufps $0x4e, %xmm4, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movaps %xmm4, %xmm0 + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L24: + testq $8, M + jle .L25 + ALIGN_3 + + movaps -30 * SIZE(X), %xmm1 + movaps -26 * SIZE(X), %xmm2 + + shufps $0x4e, %xmm1, %xmm0 + shufps $0x4e, %xmm2, %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, %xmm0 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L25: + testq $4, M + jle .L26 + ALIGN_3 + + movaps -30 * SIZE(X), %xmm1 + shufps $0x4e, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L26: + testq $2, M + jle .L27 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + + movsd %xmm0, -32 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L27: + testq $1, M + jle .L29 + ALIGN_3 + + movss -32 * SIZE(X), %xmm0 + movss %xmm0, -32 * SIZE(Y) + addq $SIZE, Y + ALIGN_3 + +.L29: + xorq %rax,%rax + + RESTOREREGISTERS + + ret + 
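In the .L20 block that ends above, the source is offset from the 16-byte-aligned destination by two floats (one single-precision complex element), so each aligned store is stitched from the high half of the previous source vector and the low half of the next one with shufps $0x4e. A sketch of that technique with SSE intrinsics, under stated assumptions (illustrative function name and group count; x[-2] and x[-1] are taken to be valid elements already handled by the prologue):

    #include <xmmintrin.h>

    /* Copy 4*n4 floats from x (two floats past a 16-byte boundary) to the
       16-byte-aligned y, using only aligned loads and stores. */
    static void copy_realign_half(const float *x, float *y, long n4)
    {
        __m128 cur = _mm_load_ps(x - 2);               /* {x[-2], x[-1], x[0], x[1]} */
        for (long i = 0; i < n4; i++) {
            __m128 nxt = _mm_load_ps(x + 2 + 4 * i);   /* next aligned block */
            /* high half of cur, low half of nxt -> {x[4i], ..., x[4i+3]} */
            _mm_store_ps(y + 4 * i, _mm_shuffle_ps(cur, nxt, 0x4e));
            cur = nxt;
        }
    }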
ALIGN_3 + +.L30: + testq $2 * SIZE, X + jne .L40 + + movaps -33 * SIZE(X), %xmm0 + + movq M, %rax + sarq $5, %rax + jle .L33 + + movaps -29 * SIZE(X), %xmm1 + movaps -25 * SIZE(X), %xmm2 + movaps -21 * SIZE(X), %xmm3 + movaps -17 * SIZE(X), %xmm4 + movaps -13 * SIZE(X), %xmm5 + movaps -9 * SIZE(X), %xmm6 + movaps -5 * SIZE(X), %xmm7 + + decq %rax + jle .L32 + ALIGN_4 + +.L31: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm1, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -1 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps 3 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm3, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps 7 * SIZE(X), %xmm2 + + movss %xmm4, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps 11 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm5, %xmm4 + shufps $0x39, %xmm4, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + movaps 15 * SIZE(X), %xmm4 + + movss %xmm6, %xmm5 + shufps $0x39, %xmm5, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + movaps 19 * SIZE(X), %xmm5 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm7, %xmm6 + shufps $0x39, %xmm6, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + movaps 23 * SIZE(X), %xmm6 + + movss %xmm0, %xmm7 + shufps $0x39, %xmm7, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + movaps 27 * SIZE(X), %xmm7 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + decq %rax + jg .L31 + ALIGN_3 + +.L32: + movss %xmm1, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -1 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movss %xmm3, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm4, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movss %xmm5, %xmm4 + shufps $0x39, %xmm4, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + movss %xmm6, %xmm5 + shufps $0x39, %xmm5, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + movss %xmm7, %xmm6 + shufps $0x39, %xmm6, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + movss %xmm0, %xmm7 + shufps $0x39, %xmm7, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L33: + testq $16, M + jle .L34 + ALIGN_3 + + movaps -29 * SIZE(X), %xmm1 + movaps -25 * SIZE(X), %xmm2 + movaps -21 * SIZE(X), %xmm3 + movaps -17 * SIZE(X), %xmm4 + + movss %xmm1, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movss %xmm3, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm4, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movaps %xmm4, %xmm0 + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L34: + testq $8, M + jle .L35 + ALIGN_3 + + movaps -29 * SIZE(X), %xmm1 + movaps -25 * SIZE(X), %xmm2 + + movss %xmm1, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps %xmm2, %xmm0 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L35: + testq $4, M + jle .L36 + ALIGN_3 + + movaps -29 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + shufps $0x39, %xmm0, %xmm0 + + movaps %xmm0, -32 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y 
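The .L30 block above plays the same game for a source offset by a single float: movss pulls the next block's first element into lane 0 and shufps $0x39 rotates the lanes so the store stays aligned. An intrinsics sketch under the same caveats (illustrative name; x[-1] assumed valid and already copied by the prologue):

    #include <xmmintrin.h>

    /* Copy 4*n4 floats from x (one float past a 16-byte boundary) to the
       16-byte-aligned y, using only aligned loads and stores. */
    static void copy_realign_one(const float *x, float *y, long n4)
    {
        __m128 cur = _mm_load_ps(x - 1);               /* {x[-1], x[0], x[1], x[2]} */
        for (long i = 0; i < n4; i++) {
            __m128 nxt = _mm_load_ps(x + 3 + 4 * i);   /* next aligned block      */
            __m128 t   = _mm_move_ss(cur, nxt);        /* lane 0 <- x[4i+3]       */
            t = _mm_shuffle_ps(t, t, 0x39);            /* rotate -> {x[4i..4i+3]} */
            _mm_store_ps(y + 4 * i, t);
            cur = nxt;
        }
    }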
+ ALIGN_3 + +.L36: + testq $2, M + jle .L37 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + movsd %xmm0, -32 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L37: + testq $1, M + jle .L39 + ALIGN_3 + + movss -32 * SIZE(X), %xmm0 + movss %xmm0, -32 * SIZE(Y) + addq $SIZE, Y + ALIGN_3 + +.L39: + xorq %rax,%rax + + RESTOREREGISTERS + + ret + ALIGN_3 + +.L40: + movaps -35 * SIZE(X), %xmm0 + + movq M, %rax + sarq $5, %rax + jle .L43 + + movaps -31 * SIZE(X), %xmm1 + movaps -27 * SIZE(X), %xmm2 + movaps -23 * SIZE(X), %xmm3 + movaps -19 * SIZE(X), %xmm4 + movaps -15 * SIZE(X), %xmm5 + movaps -11 * SIZE(X), %xmm6 + movaps -7 * SIZE(X), %xmm7 + + decq %rax + jle .L42 + ALIGN_4 + +.L41: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -3 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + movaps 1 * SIZE(X), %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + movaps 5 * SIZE(X), %xmm2 + + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + movaps 9 * SIZE(X), %xmm3 + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + movaps 13 * SIZE(X), %xmm4 + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + movaps 17 * SIZE(X), %xmm5 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + movaps 21 * SIZE(X), %xmm6 + + movss %xmm0, %xmm7 + shufps $0x93, %xmm0, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + movaps 25 * SIZE(X), %xmm7 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + decq %rax + jg .L41 + ALIGN_3 + +.L42: + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movaps -3 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movss %xmm5, %xmm4 + shufps $0x93, %xmm5, %xmm4 + movaps %xmm4, -16 * SIZE(Y) + + movss %xmm6, %xmm5 + shufps $0x93, %xmm6, %xmm5 + movaps %xmm5, -12 * SIZE(Y) + + movss %xmm7, %xmm6 + shufps $0x93, %xmm7, %xmm6 + movaps %xmm6, -8 * SIZE(Y) + + movss %xmm0, %xmm7 + shufps $0x93, %xmm0, %xmm7 + movaps %xmm7, -4 * SIZE(Y) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L43: + testq $16, M + jle .L44 + ALIGN_3 + + movaps -31 * SIZE(X), %xmm1 + movaps -27 * SIZE(X), %xmm2 + movaps -23 * SIZE(X), %xmm3 + movaps -19 * SIZE(X), %xmm4 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + shufps $0x93, %xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movss %xmm3, %xmm2 + shufps $0x93, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(Y) + + movss %xmm4, %xmm3 + shufps $0x93, %xmm4, %xmm3 + movaps %xmm3, -20 * SIZE(Y) + + movaps %xmm4, %xmm0 + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L44: + testq $8, M + jle .L45 + ALIGN_3 + + movaps -31 * SIZE(X), %xmm1 + movaps -27 * SIZE(X), %xmm2 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + + movss %xmm2, %xmm1 + shufps $0x93, 
%xmm2, %xmm1 + movaps %xmm1, -28 * SIZE(Y) + + movaps %xmm2, %xmm0 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L45: + testq $4, M + jle .L46 + ALIGN_3 + + movaps -31 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + shufps $0x93, %xmm1, %xmm0 + + movaps %xmm0, -32 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L46: + testq $2, M + jle .L47 + ALIGN_3 + + movsd -32 * SIZE(X), %xmm0 + movsd %xmm0, -32 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L47: + testq $1, M + jle .L49 + ALIGN_3 + + movss -32 * SIZE(X), %xmm0 + movss %xmm0, -32 * SIZE(Y) + addq $SIZE, Y + ALIGN_3 + +.L49: + xorq %rax,%rax + + RESTOREREGISTERS + + ret + ALIGN_4 + +.L100: + movq M, %rax + sarq $3, %rax + jle .L105 + ALIGN_3 + +.L102: + movsd (X), %xmm0 + addq INCX, X + movhps (X), %xmm0 + addq INCX, X + movsd (X), %xmm1 + addq INCX, X + movhps (X), %xmm1 + addq INCX, X + movsd (X), %xmm2 + addq INCX, X + movhps (X), %xmm2 + addq INCX, X + movsd (X), %xmm3 + addq INCX, X + movhps (X), %xmm3 + addq INCX, X + + movsd %xmm0, (Y) + addq INCY, Y + movhps %xmm0, (Y) + addq INCY, Y + movsd %xmm1, (Y) + addq INCY, Y + movhps %xmm1, (Y) + addq INCY, Y + movsd %xmm2, (Y) + addq INCY, Y + movhps %xmm2, (Y) + addq INCY, Y + movsd %xmm3, (Y) + addq INCY, Y + movhps %xmm3, (Y) + addq INCY, Y + + decq %rax + jg .L102 + ALIGN_3 + +.L105: + testq $4, M + jle .L106 + + movsd (X), %xmm0 + addq INCX, X + movhps (X), %xmm0 + addq INCX, X + movsd (X), %xmm1 + addq INCX, X + movhps (X), %xmm1 + addq INCX, X + + movsd %xmm0, (Y) + addq INCY, Y + movhps %xmm0, (Y) + addq INCY, Y + movsd %xmm1, (Y) + addq INCY, Y + movhps %xmm1, (Y) + addq INCY, Y + ALIGN_3 + +.L106: + testq $2, M + jle .L107 + + movsd (X), %xmm0 + addq INCX, X + movhps (X), %xmm0 + addq INCX, X + + movsd %xmm0, (Y) + addq INCY, Y + movhps %xmm0, (Y) + addq INCY, Y + ALIGN_3 + +.L107: + testq $1, M + jle .L999 + + movsd (X), %xmm0 + movsd %xmm0, (Y) + ALIGN_3 + +.L999: + xorq %rax, %rax + + RESTOREREGISTERS + + ret + + EPILOGUE + diff --git a/kernel/x86_64/zcopy_sse2.S b/kernel/x86_64/zcopy_sse2.S new file mode 100644 index 0000000000..c3a99a57b3 --- /dev/null +++ b/kernel/x86_64/zcopy_sse2.S @@ -0,0 +1,655 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ +#define Y ARG4 /* rcx */ +#ifndef WINDOWS_ABI +#define INCY ARG5 /* r8 */ +#else +#define INCY %r10 +#endif + +#include "l1param.h" + +#ifdef OPTERON +#define LOAD(OFFSET, ADDR, REG) xorps REG, REG; addpd OFFSET(ADDR), REG +#else +#define LOAD(OFFSET, ADDR, REG) movaps OFFSET(ADDR), REG +#endif + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + movq 40(%rsp), INCY +#endif + + SAVEREGISTERS + + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + + cmpq $2 * SIZE, INCX + jne .L50 + cmpq $2 * SIZE, INCY + jne .L50 + + addq M, M + +#ifdef ALIGNED_ACCESS + testq $SIZE, Y +#else + testq $SIZE, X +#endif + je .L10 + + movsd (X), %xmm0 + movsd %xmm0, (Y) + addq $1 * SIZE, X + addq $1 * SIZE, Y + decq M + jle .L19 + ALIGN_4 + +.L10: + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + +#ifdef ALIGNED_ACCESS + testq $SIZE, X +#else + testq $SIZE, Y +#endif + jne .L20 + + movq M, %rax + sarq $4, %rax + jle .L13 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + movaps -12 * SIZE(X), %xmm2 + movaps -10 * SIZE(X), %xmm3 + movaps -8 * SIZE(X), %xmm4 + movaps -6 * SIZE(X), %xmm5 + movaps -4 * SIZE(X), %xmm6 + movaps -2 * SIZE(X), %xmm7 + + decq %rax + jle .L12 + ALIGN_3 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps %xmm0, -16 * SIZE(Y) + LOAD( 0 * SIZE, X, %xmm0) + movaps %xmm1, -14 * SIZE(Y) + LOAD( 2 * SIZE, X, %xmm1) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm2, -12 * SIZE(Y) + LOAD( 4 * SIZE, X, %xmm2) + movaps %xmm3, -10 * SIZE(Y) + LOAD( 6 * SIZE, X, %xmm3) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps %xmm4, -8 * SIZE(Y) + LOAD( 8 * SIZE, X, %xmm4) + movaps %xmm5, -6 * SIZE(Y) + LOAD(10 * SIZE, X, %xmm5) + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps %xmm6, -4 * SIZE(Y) + LOAD(12 * SIZE, X, %xmm6) + movaps %xmm7, -2 * SIZE(Y) + LOAD(14 * SIZE, X, %xmm7) + + subq $-16 * SIZE, Y + subq $-16 * SIZE, X + decq %rax + jg .L11 + ALIGN_3 + +.L12: + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -14 * SIZE(Y) + movaps %xmm2, -12 * SIZE(Y) + movaps %xmm3, -10 * SIZE(Y) + movaps %xmm4, -8 * SIZE(Y) + movaps %xmm5, -6 * SIZE(Y) + movaps %xmm6, -4 * SIZE(Y) + movaps %xmm7, -2 * SIZE(Y) + + subq $-16 * SIZE, Y + subq $-16 * SIZE, X + ALIGN_3 + +.L13: + testq $8, M + jle .L14 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + movaps -12 
* SIZE(X), %xmm2 + movaps -10 * SIZE(X), %xmm3 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -14 * SIZE(Y) + movaps %xmm2, -12 * SIZE(Y) + movaps %xmm3, -10 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L14: + testq $4, M + jle .L15 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -14 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L15: + testq $2, M + jle .L16 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L16: + testq $1, M + jle .L19 + ALIGN_3 + + movsd -16 * SIZE(X), %xmm0 + movsd %xmm0, -16 * SIZE(Y) + ALIGN_3 + +.L19: + xorq %rax,%rax + + RESTOREREGISTERS + + ret + ALIGN_3 + +.L20: +#ifdef ALIGNED_ACCESS + + movhps -16 * SIZE(X), %xmm0 + + movq M, %rax + sarq $4, %rax + jle .L23 + + movaps -15 * SIZE(X), %xmm1 + movaps -13 * SIZE(X), %xmm2 + movaps -11 * SIZE(X), %xmm3 + movaps -9 * SIZE(X), %xmm4 + movaps -7 * SIZE(X), %xmm5 + movaps -5 * SIZE(X), %xmm6 + movaps -3 * SIZE(X), %xmm7 + + decq %rax + jle .L22 + ALIGN_4 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + SHUFPD_1 %xmm1, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + LOAD(-1 * SIZE, X, %xmm0) + + SHUFPD_1 %xmm2, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + LOAD( 1 * SIZE, X, %xmm1) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + SHUFPD_1 %xmm3, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + LOAD( 3 * SIZE, X, %xmm2) + + SHUFPD_1 %xmm4, %xmm3 + movaps %xmm3, -10 * SIZE(Y) + LOAD( 5 * SIZE, X, %xmm3) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + SHUFPD_1 %xmm5, %xmm4 + movaps %xmm4, -8 * SIZE(Y) + LOAD( 7 * SIZE, X, %xmm4) + + SHUFPD_1 %xmm6, %xmm5 + movaps %xmm5, -6 * SIZE(Y) + LOAD( 9 * SIZE, X, %xmm5) + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + SHUFPD_1 %xmm7, %xmm6 + movaps %xmm6, -4 * SIZE(Y) + LOAD(11 * SIZE, X, %xmm6) + + SHUFPD_1 %xmm0, %xmm7 + movaps %xmm7, -2 * SIZE(Y) + LOAD(13 * SIZE, X, %xmm7) + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + decq %rax + jg .L21 + ALIGN_3 + +.L22: + SHUFPD_1 %xmm1, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + LOAD(-1 * SIZE, X, %xmm0) + + SHUFPD_1 %xmm2, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + SHUFPD_1 %xmm3, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + SHUFPD_1 %xmm4, %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + SHUFPD_1 %xmm5, %xmm4 + movaps %xmm4, -8 * SIZE(Y) + SHUFPD_1 %xmm6, %xmm5 + movaps %xmm5, -6 * SIZE(Y) + + SHUFPD_1 %xmm7, %xmm6 + movaps %xmm6, -4 * SIZE(Y) + SHUFPD_1 %xmm0, %xmm7 + movaps %xmm7, -2 * SIZE(Y) + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + ALIGN_3 + +.L23: + testq $8, M + jle .L24 + ALIGN_3 + + movaps -15 * SIZE(X), %xmm1 + movaps -13 * SIZE(X), %xmm2 + movaps -11 * SIZE(X), %xmm3 + movaps -9 * SIZE(X), %xmm8 + + SHUFPD_1 %xmm1, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + SHUFPD_1 %xmm2, %xmm1 + movaps %xmm1, -14 * SIZE(Y) + + SHUFPD_1 %xmm3, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + + SHUFPD_1 %xmm8, %xmm3 + movaps %xmm3, -10 * SIZE(Y) + + movaps %xmm8, %xmm0 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L24: + testq $4, M + jle .L25 + ALIGN_3 + + movaps -15 * SIZE(X), %xmm1 + movaps -13 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm1, %xmm0 + SHUFPD_1 %xmm2, %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -14 * SIZE(Y) + movaps %xmm2, %xmm0 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L25: + testq $2, M + jle .L26 + 
ALIGN_3 + + movaps -15 * SIZE(X), %xmm1 + SHUFPD_1 %xmm1, %xmm0 + + movaps %xmm0, -16 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L26: + testq $1, M + jle .L29 + ALIGN_3 + + movsd -16 * SIZE(X), %xmm0 + movsd %xmm0, -16 * SIZE(Y) + ALIGN_3 + +.L29: + xorq %rax,%rax + + RESTOREREGISTERS + + ret + ALIGN_3 + +#else + + movq M, %rax + sarq $4, %rax + jle .L23 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + movaps -12 * SIZE(X), %xmm2 + movaps -10 * SIZE(X), %xmm3 + movaps -8 * SIZE(X), %xmm4 + movaps -6 * SIZE(X), %xmm5 + movaps -4 * SIZE(X), %xmm6 + movaps -2 * SIZE(X), %xmm7 + + decq %rax + jle .L22 + ALIGN_3 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movlps %xmm0, -16 * SIZE(Y) + movhps %xmm0, -15 * SIZE(Y) + LOAD( 0 * SIZE, X, %xmm0) + movlps %xmm1, -14 * SIZE(Y) + movhps %xmm1, -13 * SIZE(Y) + LOAD( 2 * SIZE, X, %xmm1) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movlps %xmm2, -12 * SIZE(Y) + movhps %xmm2, -11 * SIZE(Y) + LOAD( 4 * SIZE, X, %xmm2) + movlps %xmm3, -10 * SIZE(Y) + movhps %xmm3, -9 * SIZE(Y) + LOAD( 6 * SIZE, X, %xmm3) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movlps %xmm4, -8 * SIZE(Y) + movhps %xmm4, -7 * SIZE(Y) + LOAD( 8 * SIZE, X, %xmm4) + movlps %xmm5, -6 * SIZE(Y) + movhps %xmm5, -5 * SIZE(Y) + LOAD(10 * SIZE, X, %xmm5) + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movlps %xmm6, -4 * SIZE(Y) + movhps %xmm6, -3 * SIZE(Y) + LOAD(12 * SIZE, X, %xmm6) + movlps %xmm7, -2 * SIZE(Y) + movhps %xmm7, -1 * SIZE(Y) + LOAD(14 * SIZE, X, %xmm7) + + subq $-16 * SIZE, Y + subq $-16 * SIZE, X + decq %rax + jg .L21 + ALIGN_3 + +.L22: + movlps %xmm0, -16 * SIZE(Y) + movhps %xmm0, -15 * SIZE(Y) + movlps %xmm1, -14 * SIZE(Y) + movhps %xmm1, -13 * SIZE(Y) + movlps %xmm2, -12 * SIZE(Y) + movhps %xmm2, -11 * SIZE(Y) + movlps %xmm3, -10 * SIZE(Y) + movhps %xmm3, -9 * SIZE(Y) + movlps %xmm4, -8 * SIZE(Y) + movhps %xmm4, -7 * SIZE(Y) + movlps %xmm5, -6 * SIZE(Y) + movhps %xmm5, -5 * SIZE(Y) + movlps %xmm6, -4 * SIZE(Y) + movhps %xmm6, -3 * SIZE(Y) + movlps %xmm7, -2 * SIZE(Y) + movhps %xmm7, -1 * SIZE(Y) + + subq $-16 * SIZE, Y + subq $-16 * SIZE, X + ALIGN_3 + +.L23: + testq $8, M + jle .L24 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movlps %xmm0, -16 * SIZE(Y) + movhps %xmm0, -15 * SIZE(Y) + movaps -14 * SIZE(X), %xmm1 + movlps %xmm1, -14 * SIZE(Y) + movhps %xmm1, -13 * SIZE(Y) + movaps -12 * SIZE(X), %xmm2 + movlps %xmm2, -12 * SIZE(Y) + movhps %xmm2, -11 * SIZE(Y) + movaps -10 * SIZE(X), %xmm3 + movlps %xmm3, -10 * SIZE(Y) + movhps %xmm3, -9 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L24: + testq $4, M + jle .L25 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movlps %xmm0, -16 * SIZE(Y) + movhps %xmm0, -15 * SIZE(Y) + movaps -14 * SIZE(X), %xmm1 + movlps %xmm1, -14 * SIZE(Y) + movhps %xmm1, -13 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L25: + testq $2, M + jle .L26 + ALIGN_3 + + movaps -16 * SIZE(X), %xmm0 + movlps %xmm0, -16 * SIZE(Y) + movhps %xmm0, -15 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L26: + testq $1, M + jle .L29 + ALIGN_3 + + movsd -16 * SIZE(X), %xmm0 + movsd %xmm0, -16 * SIZE(Y) + ALIGN_3 + +.L29: + xorq %rax,%rax + + RESTOREREGISTERS + + ret + ALIGN_3 + +#endif + +.L50: + movq M, %rax + sarq $2, %rax + jle .L55 + ALIGN_3 + +.L51: + movsd 0 * SIZE(X), %xmm0 + movhps 1 * 
SIZE(X), %xmm0 + addq INCX, X + + movsd 0 * SIZE(X), %xmm1 + movhps 1 * SIZE(X), %xmm1 + addq INCX, X + + movsd 0 * SIZE(X), %xmm2 + movhps 1 * SIZE(X), %xmm2 + addq INCX, X + + movsd 0 * SIZE(X), %xmm3 + movhps 1 * SIZE(X), %xmm3 + addq INCX, X + + + movlps %xmm0, 0 * SIZE(Y) + movhps %xmm0, 1 * SIZE(Y) + addq INCY, Y + + movlps %xmm1, 0 * SIZE(Y) + movhps %xmm1, 1 * SIZE(Y) + addq INCY, Y + + movlps %xmm2, 0 * SIZE(Y) + movhps %xmm2, 1 * SIZE(Y) + addq INCY, Y + + movlps %xmm3, 0 * SIZE(Y) + movhps %xmm3, 1 * SIZE(Y) + addq INCY, Y + + decq %rax + jg .L51 + ALIGN_3 + +.L55: + movq M, %rax + andq $3, %rax + jle .L57 + ALIGN_3 + +.L56: + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + addq INCX, X + + movlps %xmm0, 0 * SIZE(Y) + movhps %xmm0, 1 * SIZE(Y) + addq INCY, Y + + decq %rax + jg .L56 + ALIGN_3 + +.L57: + xorq %rax, %rax + + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/zdot.S b/kernel/x86_64/zdot.S new file mode 100644 index 0000000000..f968347088 --- /dev/null +++ b/kernel/x86_64/zdot.S @@ -0,0 +1,259 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ +#define Y ARG4 /* rcx */ +#ifndef WINDOWS_ABI +#define INCY ARG5 /* r8 */ +#else +#define INCY %r10 +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + movq 40(%rsp), INCY +#endif + + testq N, N + jle .L88 + + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + + fldz + fldz + fldz + fldz + + cmpq $2 * SIZE, INCX + jne .L14 + cmpq $2 * SIZE, INCY + jne .L14 + + movq N, %rax + sarq $1, %rax + jle .L15 + ALIGN_3 + +.L16: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(2) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(2) + FLD 1 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(4) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(4) + FLD 2 * SIZE(X) + + FLD 2 * SIZE(Y) + fmul %st(1) + faddp %st, %st(2) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + FLD 3 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(2) + FLD 3 * SIZE(X) + + FLD 2 * SIZE(Y) + fmul %st(1) + faddp %st, %st(4) + + FLD 3 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(4) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + decq %rax + jg .L16 + ALIGN_3 + +.L15: + movq N, %rax + andq $1, %rax + jle .L27 + ALIGN_3 + +.L22: + FLD 0 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(2) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(2) + FLD 1 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(4) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(4) + jmp .L27 + ALIGN_3 + +.L14: + movq N, %rax + sarq $1, %rax + jle .L30 + ALIGN_3 + + +.L31: + FLD 0 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(2) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(2) + FLD 1 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(4) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(4) + addq INCX, X + + FLD 0 * SIZE(X) + addq INCY, Y + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(2) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(2) + FLD 1 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(4) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(4) + addq INCX, X + addq INCY, Y + + decq %rax + jg .L31 + ALIGN_3 + +.L30: + movq N, %rax + andq $1, %rax + jle .L27 + ALIGN_3 + +.L37: + FLD 0 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(2) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(2) + FLD 1 * SIZE(X) + + FLD 0 * SIZE(Y) + fmul %st(1) + faddp %st, %st(4) + + FLD 1 * SIZE(Y) + fmulp %st, %st(1) + faddp %st, %st(4) + ALIGN_3 + +.L27: +#ifndef CONJ + fsubp %st, %st(3) + faddp %st, %st(1) +#else + faddp %st, %st(3) + fsubp %st, %st(1) +#endif + ret + ALIGN_3 + +.L88: + fldz + fldz + + ret + EPILOGUE diff --git a/kernel/x86_64/zdot_atom.S b/kernel/x86_64/zdot_atom.S new file mode 100644 index 0000000000..9a8239c8d4 --- /dev/null +++ b/kernel/x86_64/zdot_atom.S @@ -0,0 +1,461 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ +#define Y ARG4 /* rcx */ +#ifndef WINDOWS_ABI +#define INCY ARG5 /* r8 */ +#else +#define INCY %r10 +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + movq 40(%rsp), INCY +#endif + + SAVEREGISTERS + + salq $ZBASE_SHIFT, INCX + pxor %xmm0, %xmm0 + salq $ZBASE_SHIFT, INCY + pxor %xmm1, %xmm1 + + pxor %xmm2, %xmm2 + cmpq $0, N + pxor %xmm3, %xmm3 + jle .L999 + + cmpq $2 * SIZE, INCX + jne .L20 + cmpq $2 * SIZE, INCY + jne .L20 + + movq N, %rax + sarq $2, %rax + jle .L15 + + movsd 0 * SIZE(X), %xmm4 + movsd 0 * SIZE(Y), %xmm6 + movsd 1 * SIZE(X), %xmm5 + movsd 1 * SIZE(Y), %xmm7 + + movaps %xmm4, %xmm8 + mulsd %xmm6, %xmm4 + movsd 2 * SIZE(X), %xmm10 + mulsd %xmm7, %xmm8 + movsd 2 * SIZE(Y), %xmm11 + movaps %xmm5, %xmm9 + mulsd %xmm7, %xmm5 + movsd 3 * SIZE(X), %xmm12 + mulsd %xmm6, %xmm9 + movsd 3 * SIZE(Y), %xmm13 + + decq %rax + jle .L12 + ALIGN_3 + +.L11: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + addsd %xmm4, %xmm0 + movaps %xmm10, %xmm14 + mulsd %xmm11, %xmm10 + movsd 4 * SIZE(X), %xmm4 + addsd %xmm8, %xmm1 + mulsd %xmm13, %xmm14 + movsd 4 * SIZE(Y), %xmm6 + addsd %xmm5, %xmm2 + movaps %xmm12, %xmm15 + mulsd %xmm13, %xmm12 + movsd 5 * SIZE(X), %xmm5 + addsd %xmm9, %xmm3 + mulsd %xmm11, %xmm15 + movsd 5 * SIZE(Y), %xmm7 + + addsd %xmm10, %xmm0 + movaps %xmm4, %xmm8 + mulsd %xmm6, %xmm4 + movsd 6 * SIZE(X), %xmm10 + addsd %xmm14, %xmm1 + mulsd %xmm7, %xmm8 + movsd 6 * SIZE(Y), %xmm11 + addsd %xmm12, %xmm2 + movaps %xmm5, %xmm9 + mulsd %xmm7, %xmm5 + movsd 7 * SIZE(X), %xmm12 + addsd %xmm15, %xmm3 + mulsd %xmm6, %xmm9 + movsd 7 * SIZE(Y), %xmm13 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + addsd %xmm4, %xmm0 + movaps %xmm10, %xmm14 + mulsd %xmm11, %xmm10 + movsd 8 * SIZE(X), %xmm4 + addsd %xmm8, %xmm1 + mulsd %xmm13, 
%xmm14 + movsd 8 * SIZE(Y), %xmm6 + addsd %xmm5, %xmm2 + movaps %xmm12, %xmm15 + mulsd %xmm13, %xmm12 + movsd 9 * SIZE(X), %xmm5 + addsd %xmm9, %xmm3 + mulsd %xmm11, %xmm15 + movsd 9 * SIZE(Y), %xmm7 + + addsd %xmm10, %xmm0 + movaps %xmm4, %xmm8 + mulsd %xmm6, %xmm4 + movsd 10 * SIZE(X), %xmm10 + addsd %xmm14, %xmm1 + mulsd %xmm7, %xmm8 + movsd 10 * SIZE(Y), %xmm11 + addsd %xmm12, %xmm2 + movaps %xmm5, %xmm9 + mulsd %xmm7, %xmm5 + movsd 11 * SIZE(X), %xmm12 + addsd %xmm15, %xmm3 + mulsd %xmm6, %xmm9 + movsd 11 * SIZE(Y), %xmm13 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + + decq %rax + jg .L11 + ALIGN_3 + +.L12: + addsd %xmm4, %xmm0 + movaps %xmm10, %xmm14 + mulsd %xmm11, %xmm10 + movsd 4 * SIZE(X), %xmm4 + addsd %xmm8, %xmm1 + mulsd %xmm13, %xmm14 + movsd 4 * SIZE(Y), %xmm6 + addsd %xmm5, %xmm2 + movaps %xmm12, %xmm15 + mulsd %xmm13, %xmm12 + movsd 5 * SIZE(X), %xmm5 + addsd %xmm9, %xmm3 + mulsd %xmm11, %xmm15 + movsd 5 * SIZE(Y), %xmm7 + + addsd %xmm10, %xmm0 + movaps %xmm4, %xmm8 + mulsd %xmm6, %xmm4 + movsd 6 * SIZE(X), %xmm10 + addsd %xmm14, %xmm1 + mulsd %xmm7, %xmm8 + movsd 6 * SIZE(Y), %xmm11 + addsd %xmm12, %xmm2 + movaps %xmm5, %xmm9 + mulsd %xmm7, %xmm5 + movsd 7 * SIZE(X), %xmm12 + addsd %xmm15, %xmm3 + mulsd %xmm6, %xmm9 + movsd 7 * SIZE(Y), %xmm13 + + addsd %xmm4, %xmm0 + movaps %xmm10, %xmm14 + mulsd %xmm11, %xmm10 + addsd %xmm8, %xmm1 + mulsd %xmm13, %xmm14 + addsd %xmm5, %xmm2 + movaps %xmm12, %xmm15 + mulsd %xmm13, %xmm12 + addsd %xmm9, %xmm3 + mulsd %xmm11, %xmm15 + + addsd %xmm10, %xmm0 + addsd %xmm14, %xmm1 + addsd %xmm12, %xmm2 + addsd %xmm15, %xmm3 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L15: + movq N, %rax + andq $2, %rax + jle .L17 + + movsd 0 * SIZE(X), %xmm4 + movsd 0 * SIZE(Y), %xmm6 + movsd 1 * SIZE(X), %xmm5 + movsd 1 * SIZE(Y), %xmm7 + + movaps %xmm4, %xmm8 + mulsd %xmm6, %xmm4 + movsd 2 * SIZE(X), %xmm10 + mulsd %xmm7, %xmm8 + movsd 2 * SIZE(Y), %xmm11 + movaps %xmm5, %xmm9 + mulsd %xmm7, %xmm5 + movsd 3 * SIZE(X), %xmm12 + mulsd %xmm6, %xmm9 + movsd 3 * SIZE(Y), %xmm13 + + addsd %xmm4, %xmm0 + movaps %xmm10, %xmm14 + mulsd %xmm11, %xmm10 + addsd %xmm8, %xmm1 + mulsd %xmm13, %xmm14 + addsd %xmm5, %xmm2 + movaps %xmm12, %xmm15 + mulsd %xmm13, %xmm12 + addsd %xmm9, %xmm3 + mulsd %xmm11, %xmm15 + + addsd %xmm10, %xmm0 + addsd %xmm14, %xmm1 + addsd %xmm12, %xmm2 + addsd %xmm15, %xmm3 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L17: + movq N, %rax + andq $1, %rax + jle .L999 + + movsd 0 * SIZE(X), %xmm4 + movsd 0 * SIZE(Y), %xmm6 + movsd 1 * SIZE(X), %xmm5 + movsd 1 * SIZE(Y), %xmm7 + + movaps %xmm4, %xmm8 + mulsd %xmm6, %xmm4 + mulsd %xmm7, %xmm8 + movaps %xmm5, %xmm9 + mulsd %xmm7, %xmm5 + mulsd %xmm6, %xmm9 + + addsd %xmm4, %xmm0 + addsd %xmm8, %xmm1 + addsd %xmm5, %xmm2 + addsd %xmm9, %xmm3 + jmp .L999 + ALIGN_3 + +.L20: + movq N, %rax + sarq $2, %rax + jle .L25 + ALIGN_3 + +.L23: + movsd 0 * SIZE(X), %xmm4 + movsd 0 * SIZE(Y), %xmm6 + movsd 1 * SIZE(X), %xmm5 + movsd 1 * SIZE(Y), %xmm7 + + movaps %xmm4, %xmm8 + mulsd %xmm6, %xmm4 + mulsd %xmm7, %xmm8 + movaps %xmm5, %xmm9 + mulsd %xmm7, %xmm5 + addq INCX, X + mulsd %xmm6, %xmm9 + addq INCY, Y + + addsd %xmm4, %xmm0 + movsd 0 * SIZE(X), %xmm4 + addsd %xmm8, %xmm1 + movsd 0 * SIZE(Y), %xmm6 + addsd %xmm5, %xmm2 + movsd 1 * SIZE(X), %xmm5 + addsd %xmm9, %xmm3 + movsd 1 * SIZE(Y), %xmm7 + + movaps %xmm4, %xmm8 + mulsd %xmm6, %xmm4 + mulsd %xmm7, %xmm8 + movaps %xmm5, %xmm9 + mulsd %xmm7, %xmm5 + addq INCX, X + mulsd %xmm6, %xmm9 + addq INCY, Y + + addsd %xmm4, %xmm0 + movsd 0 * 
SIZE(X), %xmm4 + addsd %xmm8, %xmm1 + movsd 0 * SIZE(Y), %xmm6 + addsd %xmm5, %xmm2 + movsd 1 * SIZE(X), %xmm5 + addsd %xmm9, %xmm3 + movsd 1 * SIZE(Y), %xmm7 + + movaps %xmm4, %xmm8 + mulsd %xmm6, %xmm4 + mulsd %xmm7, %xmm8 + movaps %xmm5, %xmm9 + mulsd %xmm7, %xmm5 + addq INCX, X + mulsd %xmm6, %xmm9 + addq INCY, Y + + addsd %xmm4, %xmm0 + movsd 0 * SIZE(X), %xmm4 + addsd %xmm8, %xmm1 + movsd 0 * SIZE(Y), %xmm6 + addsd %xmm5, %xmm2 + movsd 1 * SIZE(X), %xmm5 + addsd %xmm9, %xmm3 + movsd 1 * SIZE(Y), %xmm7 + + movaps %xmm4, %xmm8 + mulsd %xmm6, %xmm4 + mulsd %xmm7, %xmm8 + movaps %xmm5, %xmm9 + mulsd %xmm7, %xmm5 + addq INCX, X + mulsd %xmm6, %xmm9 + addq INCY, Y + + addsd %xmm4, %xmm0 + addsd %xmm8, %xmm1 + addsd %xmm5, %xmm2 + addsd %xmm9, %xmm3 + + decq %rax + jg .L23 + ALIGN_3 + +.L25: + testq $3, N + je .L999 + + movq N, %rax + andq $2, %rax + jle .L27 + + movsd 0 * SIZE(X), %xmm4 + movsd 0 * SIZE(Y), %xmm6 + movsd 1 * SIZE(X), %xmm5 + movsd 1 * SIZE(Y), %xmm7 + + movaps %xmm4, %xmm8 + mulsd %xmm6, %xmm4 + mulsd %xmm7, %xmm8 + movaps %xmm5, %xmm9 + mulsd %xmm7, %xmm5 + addq INCX, X + mulsd %xmm6, %xmm9 + addq INCY, Y + + addsd %xmm4, %xmm0 + movsd 0 * SIZE(X), %xmm4 + addsd %xmm8, %xmm1 + movsd 0 * SIZE(Y), %xmm6 + addsd %xmm5, %xmm2 + movsd 1 * SIZE(X), %xmm5 + addsd %xmm9, %xmm3 + movsd 1 * SIZE(Y), %xmm7 + + movaps %xmm4, %xmm8 + mulsd %xmm6, %xmm4 + mulsd %xmm7, %xmm8 + movaps %xmm5, %xmm9 + mulsd %xmm7, %xmm5 + addq INCX, X + mulsd %xmm6, %xmm9 + addq INCY, Y + + addsd %xmm4, %xmm0 + addsd %xmm8, %xmm1 + addsd %xmm5, %xmm2 + addsd %xmm9, %xmm3 + + ALIGN_3 + +.L27: + movq N, %rax + andq $1, %rax + jle .L999 + + movsd 0 * SIZE(X), %xmm4 + movsd 0 * SIZE(Y), %xmm6 + movsd 1 * SIZE(X), %xmm5 + movsd 1 * SIZE(Y), %xmm7 + + movaps %xmm4, %xmm8 + mulsd %xmm6, %xmm4 + mulsd %xmm7, %xmm8 + movaps %xmm5, %xmm9 + mulsd %xmm7, %xmm5 + mulsd %xmm6, %xmm9 + + addsd %xmm4, %xmm0 + addsd %xmm8, %xmm1 + addsd %xmm5, %xmm2 + addsd %xmm9, %xmm3 + ALIGN_3 + +.L999: +#ifndef CONJ + subsd %xmm2, %xmm0 + addsd %xmm3, %xmm1 +#else + addsd %xmm2, %xmm0 + subsd %xmm3, %xmm1 +#endif + + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/zdot_sse.S b/kernel/x86_64/zdot_sse.S new file mode 100644 index 0000000000..3302b90880 --- /dev/null +++ b/kernel/x86_64/zdot_sse.S @@ -0,0 +1,3492 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ +#define Y ARG4 /* rcx */ +#ifndef WINDOWS_ABI +#define INCY ARG5 /* r8 */ +#else +#define INCY %r10 +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + movq 40(%rsp), INCY +#endif + + SAVEREGISTERS + + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + + testq N, N + jle .L999 + + cmpq $2 * SIZE, INCX + jne .L200 + cmpq $2 * SIZE, INCY + jne .L200 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + + testq $SIZE, X + jne .L50 + +.L0x: + testq $2 * SIZE, X + je .L10 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X), %xmm4 + movsd -32 * SIZE(Y), %xmm0 + + pshufd $0xb1, %xmm0, %xmm1 + mulps %xmm4, %xmm0 + mulps %xmm4, %xmm1 + addq $2 * SIZE, X + addq $2 * SIZE, Y + decq N + ALIGN_3 + +.L10: + testq $3 * SIZE, Y + jne .L20 + + movq N, %rax + sarq $4, %rax + jle .L15 + + movaps -32 * SIZE(X), %xmm4 + movaps -28 * SIZE(X), %xmm5 + movaps -32 * SIZE(Y), %xmm8 + movaps -28 * SIZE(Y), %xmm9 + movaps -24 * SIZE(X), %xmm6 + movaps -20 * SIZE(X), %xmm7 + movaps -24 * SIZE(Y), %xmm10 + movaps -20 * SIZE(Y), %xmm11 + + decq %rax + jle .L12 + ALIGN_3 + +.L11: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + pshufd $0xb1, %xmm8, %xmm12 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + movaps -16 * SIZE(Y), %xmm8 + mulps %xmm4, %xmm12 + movaps -16 * SIZE(X), %xmm4 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm9, %xmm12 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + movaps -12 * SIZE(Y), %xmm9 + mulps %xmm5, %xmm12 + movaps -12 * SIZE(X), %xmm5 + addps %xmm12, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + pshufd $0xb1, %xmm10, %xmm12 + mulps %xmm6, %xmm10 + addps %xmm10, %xmm0 + movaps -8 * SIZE(Y), %xmm10 + mulps %xmm6, %xmm12 + movaps -8 * SIZE(X), %xmm6 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm11, %xmm12 + mulps %xmm7, %xmm11 + addps %xmm11, %xmm2 + movaps -4 * SIZE(Y), %xmm11 + mulps %xmm7, %xmm12 + movaps -4 * SIZE(X), %xmm7 + addps %xmm12, %xmm3 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + pshufd $0xb1, %xmm8, %xmm12 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + movaps 0 * SIZE(Y), %xmm8 + mulps %xmm4, %xmm12 + movaps 0 * SIZE(X), %xmm4 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm9, %xmm12 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + movaps 4 * SIZE(Y), %xmm9 + mulps %xmm5, %xmm12 + movaps 4 * SIZE(X), %xmm5 + addps %xmm12, %xmm3 + 
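+/* Both X and Y are 16-byte aligned on this path: xmm4-xmm7 each hold two
+   single-precision complex elements of X and xmm8-xmm11 the matching
+   elements of Y. pshufd $0xb1 swaps the real/imaginary halves of each Y
+   element, so xmm0/xmm2 accumulate the x_re*y_re and x_im*y_im lane
+   products while xmm1/xmm3 accumulate the x_re*y_im and x_im*y_re cross
+   products; the real and imaginary parts of the dot product are formed
+   from these partial sums once the loops finish. */
+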
+#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + pshufd $0xb1, %xmm10, %xmm12 + mulps %xmm6, %xmm10 + addps %xmm10, %xmm0 + movaps 8 * SIZE(Y), %xmm10 + mulps %xmm6, %xmm12 + movaps 8 * SIZE(X), %xmm6 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm11, %xmm12 + mulps %xmm7, %xmm11 + addps %xmm11, %xmm2 + movaps 12 * SIZE(Y), %xmm11 + mulps %xmm7, %xmm12 + movaps 12 * SIZE(X), %xmm7 + addps %xmm12, %xmm3 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + + decq %rax + jg .L11 + ALIGN_3 + +.L12: + pshufd $0xb1, %xmm8, %xmm12 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + movaps -16 * SIZE(Y), %xmm8 + mulps %xmm4, %xmm12 + movaps -16 * SIZE(X), %xmm4 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm9, %xmm12 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + movaps -12 * SIZE(Y), %xmm9 + mulps %xmm5, %xmm12 + movaps -12 * SIZE(X), %xmm5 + addps %xmm12, %xmm3 + + pshufd $0xb1, %xmm10, %xmm12 + mulps %xmm6, %xmm10 + addps %xmm10, %xmm0 + movaps -8 * SIZE(Y), %xmm10 + mulps %xmm6, %xmm12 + movaps -8 * SIZE(X), %xmm6 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm11, %xmm12 + mulps %xmm7, %xmm11 + addps %xmm11, %xmm2 + movaps -4 * SIZE(Y), %xmm11 + mulps %xmm7, %xmm12 + movaps -4 * SIZE(X), %xmm7 + addps %xmm12, %xmm3 + + pshufd $0xb1, %xmm8, %xmm12 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm9, %xmm12 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm5, %xmm12 + addps %xmm12, %xmm3 + + pshufd $0xb1, %xmm10, %xmm12 + mulps %xmm6, %xmm10 + addps %xmm10, %xmm0 + mulps %xmm6, %xmm12 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm11, %xmm12 + mulps %xmm7, %xmm11 + addps %xmm11, %xmm2 + mulps %xmm7, %xmm12 + addps %xmm12, %xmm3 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L15: + testq $8, N + jle .L16 + + movaps -32 * SIZE(X), %xmm4 + movaps -32 * SIZE(Y), %xmm8 + + pshufd $0xb1, %xmm8, %xmm12 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm1 + + movaps -28 * SIZE(X), %xmm5 + movaps -28 * SIZE(Y), %xmm9 + + pshufd $0xb1, %xmm9, %xmm12 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm5, %xmm12 + addps %xmm12, %xmm3 + + movaps -24 * SIZE(X), %xmm6 + movaps -24 * SIZE(Y), %xmm10 + + pshufd $0xb1, %xmm10, %xmm12 + mulps %xmm6, %xmm10 + addps %xmm10, %xmm0 + mulps %xmm6, %xmm12 + addps %xmm12, %xmm1 + + movaps -20 * SIZE(X), %xmm7 + movaps -20 * SIZE(Y), %xmm11 + + pshufd $0xb1, %xmm11, %xmm12 + mulps %xmm7, %xmm11 + addps %xmm11, %xmm2 + mulps %xmm7, %xmm12 + addps %xmm12, %xmm3 + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L16: + testq $4, N + jle .L17 + + movaps -32 * SIZE(X), %xmm4 + movaps -32 * SIZE(Y), %xmm8 + movaps -28 * SIZE(X), %xmm5 + movaps -28 * SIZE(Y), %xmm9 + + pshufd $0xb1, %xmm8, %xmm12 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm9, %xmm12 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm5, %xmm12 + addps %xmm12, %xmm3 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L17: + testq $2, N + jle .L18 + + movaps -32 * SIZE(X), %xmm4 + movaps -32 * SIZE(Y), %xmm8 + + pshufd $0xb1, %xmm8, %xmm12 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm1 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L18: + testq $1, N + jle .L98 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X), %xmm4 +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd -32 * SIZE(Y), %xmm8 + + pshufd $0xb1, %xmm8, %xmm12 + mulps %xmm4, %xmm8 + 
addps %xmm8, %xmm0 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm1 + jmp .L98 + ALIGN_3 + +.L20: +#ifdef ALIGNED_ACCESS + + testq $2 * SIZE, Y + jne .L30 + + movaps -33 * SIZE(Y), %xmm8 + addq $3 * SIZE, Y + + shufps $0xb1, %xmm1, %xmm1 + + movq N, %rax + sarq $4, %rax + jle .L25 + + movaps -32 * SIZE(X), %xmm4 + movaps -32 * SIZE(Y), %xmm9 + movaps -28 * SIZE(X), %xmm5 + movaps -28 * SIZE(Y), %xmm10 + movaps -24 * SIZE(X), %xmm6 + movaps -24 * SIZE(Y), %xmm11 + movaps -20 * SIZE(X), %xmm7 + + decq %rax + jle .L22 + ALIGN_3 + +.L21: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + movaps -16 * SIZE(X), %xmm4 + mulps %xmm8, %xmm12 + movaps -20 * SIZE(Y), %xmm8 + addps %xmm12, %xmm1 + + movss %xmm10, %xmm9 + pshufd $0xb1, %xmm5, %xmm12 + shufps $0x39, %xmm9, %xmm9 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + movaps -12 * SIZE(X), %xmm5 + mulps %xmm9, %xmm12 + movaps -16 * SIZE(Y), %xmm9 + addps %xmm12, %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm11, %xmm10 + pshufd $0xb1, %xmm6, %xmm12 + shufps $0x39, %xmm10, %xmm10 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + movaps -8 * SIZE(X), %xmm6 + mulps %xmm10, %xmm12 + movaps -12 * SIZE(Y), %xmm10 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0xb1, %xmm7, %xmm12 + shufps $0x39, %xmm11, %xmm11 + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + movaps -4 * SIZE(X), %xmm7 + mulps %xmm11, %xmm12 + movaps -8 * SIZE(Y), %xmm11 + addps %xmm12, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + movaps 0 * SIZE(X), %xmm4 + mulps %xmm8, %xmm12 + movaps -4 * SIZE(Y), %xmm8 + addps %xmm12, %xmm1 + + movss %xmm10, %xmm9 + pshufd $0xb1, %xmm5, %xmm12 + shufps $0x39, %xmm9, %xmm9 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + movaps 4 * SIZE(X), %xmm5 + mulps %xmm9, %xmm12 + movaps 0 * SIZE(Y), %xmm9 + addps %xmm12, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm11, %xmm10 + pshufd $0xb1, %xmm6, %xmm12 + shufps $0x39, %xmm10, %xmm10 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + movaps 8 * SIZE(X), %xmm6 + mulps %xmm10, %xmm12 + movaps 4 * SIZE(Y), %xmm10 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0xb1, %xmm7, %xmm12 + shufps $0x39, %xmm11, %xmm11 + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + movaps 12 * SIZE(X), %xmm7 + mulps %xmm11, %xmm12 + movaps 8 * SIZE(Y), %xmm11 + addps %xmm12, %xmm1 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + + decq %rax + jg .L21 + ALIGN_3 + +.L22: + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + movaps -16 * SIZE(X), %xmm4 + mulps %xmm8, %xmm12 + movaps -20 * SIZE(Y), %xmm8 + addps %xmm12, %xmm1 + + movss %xmm10, %xmm9 + pshufd $0xb1, %xmm5, %xmm12 + shufps $0x39, %xmm9, %xmm9 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + movaps -12 * SIZE(X), %xmm5 + mulps %xmm9, %xmm12 + movaps -16 * SIZE(Y), %xmm9 + addps %xmm12, %xmm1 + + movss %xmm11, %xmm10 + pshufd $0xb1, %xmm6, %xmm12 + shufps $0x39, %xmm10, %xmm10 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + movaps -8 * SIZE(X), %xmm6 + mulps %xmm10, %xmm12 + movaps -12 * SIZE(Y), %xmm10 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0xb1, %xmm7, %xmm12 + shufps 
$0x39, %xmm11, %xmm11 + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + movaps -4 * SIZE(X), %xmm7 + mulps %xmm11, %xmm12 + movaps -8 * SIZE(Y), %xmm11 + addps %xmm12, %xmm1 + + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm8, %xmm12 + movaps -4 * SIZE(Y), %xmm8 + addps %xmm12, %xmm1 + + movss %xmm10, %xmm9 + pshufd $0xb1, %xmm5, %xmm12 + shufps $0x39, %xmm9, %xmm9 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm9, %xmm12 + addps %xmm12, %xmm1 + + movss %xmm11, %xmm10 + pshufd $0xb1, %xmm6, %xmm12 + shufps $0x39, %xmm10, %xmm10 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm10, %xmm12 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0xb1, %xmm7, %xmm12 + shufps $0x39, %xmm11, %xmm11 + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + mulps %xmm11, %xmm12 + addps %xmm12, %xmm1 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L25: + testq $8, N + jle .L26 + + movaps -32 * SIZE(X), %xmm4 + movaps -32 * SIZE(Y), %xmm9 + movaps -28 * SIZE(X), %xmm5 + movaps -28 * SIZE(Y), %xmm10 + + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm8, %xmm12 + addps %xmm12, %xmm1 + + movaps -24 * SIZE(X), %xmm6 + movaps -24 * SIZE(Y), %xmm11 + + movss %xmm10, %xmm9 + pshufd $0xb1, %xmm5, %xmm12 + shufps $0x39, %xmm9, %xmm9 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm9, %xmm12 + addps %xmm12, %xmm1 + + movaps -20 * SIZE(X), %xmm7 + movaps -20 * SIZE(Y), %xmm8 + + movss %xmm11, %xmm10 + pshufd $0xb1, %xmm6, %xmm12 + shufps $0x39, %xmm10, %xmm10 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm10, %xmm12 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0xb1, %xmm7, %xmm12 + shufps $0x39, %xmm11, %xmm11 + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + mulps %xmm11, %xmm12 + addps %xmm12, %xmm1 + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L26: + testq $4, N + jle .L27 + + movaps -32 * SIZE(X), %xmm4 + movaps -32 * SIZE(Y), %xmm9 + + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm8, %xmm12 + addps %xmm12, %xmm1 + + movaps -28 * SIZE(X), %xmm5 + movaps -28 * SIZE(Y), %xmm10 + + movss %xmm10, %xmm9 + pshufd $0xb1, %xmm5, %xmm12 + shufps $0x39, %xmm9, %xmm9 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm9, %xmm12 + addps %xmm12, %xmm1 + + movaps %xmm10, %xmm8 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L27: + testq $2, N + jle .L28 + + movaps -32 * SIZE(X), %xmm4 + movaps -32 * SIZE(Y), %xmm9 + + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm8, %xmm12 + addps %xmm12, %xmm1 + + movaps %xmm9, %xmm8 + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L28: + testq $1, N + jle .L29 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X), %xmm4 + + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm8, %xmm12 + addps %xmm12, %xmm1 + ALIGN_3 + +.L29: + shufps $0xb1, %xmm1, %xmm1 + shufps $0xb1, %xmm3, %xmm3 + jmp .L98 + ALIGN_3 + +.L30: + + testq $SIZE, Y + jne .L40 +#endif + + movq N, %rax + sarq $4, %rax + jle .L35 + + movaps -32 * SIZE(X), %xmm4 + movsd -32 * SIZE(Y), %xmm8 + movhps -30 * SIZE(Y), %xmm8 + movaps -28 * SIZE(X), %xmm5 + movsd -28 * SIZE(Y), %xmm9 + movhps -26 * SIZE(Y), %xmm9 + + movaps -24 * SIZE(X), %xmm6 + movsd -24 * SIZE(Y), %xmm10 
+ movhps -22 * SIZE(Y), %xmm10 + movaps -20 * SIZE(X), %xmm7 + movsd -20 * SIZE(Y), %xmm11 + movhps -18 * SIZE(Y), %xmm11 + + decq %rax + jle .L32 + ALIGN_3 + +.L31: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + pshufd $0xb1, %xmm8, %xmm12 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + movsd -16 * SIZE(Y), %xmm8 + movhps -14 * SIZE(Y), %xmm8 + mulps %xmm4, %xmm12 + movaps -16 * SIZE(X), %xmm4 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm9, %xmm12 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + movsd -12 * SIZE(Y), %xmm9 + movhps -10 * SIZE(Y), %xmm9 + mulps %xmm5, %xmm12 + movaps -12 * SIZE(X), %xmm5 + addps %xmm12, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + pshufd $0xb1, %xmm10, %xmm12 + mulps %xmm6, %xmm10 + addps %xmm10, %xmm0 + movsd -8 * SIZE(Y), %xmm10 + movhps -6 * SIZE(Y), %xmm10 + mulps %xmm6, %xmm12 + movaps -8 * SIZE(X), %xmm6 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm11, %xmm12 + mulps %xmm7, %xmm11 + addps %xmm11, %xmm2 + movsd -4 * SIZE(Y), %xmm11 + movhps -2 * SIZE(Y), %xmm11 + mulps %xmm7, %xmm12 + movaps -4 * SIZE(X), %xmm7 + addps %xmm12, %xmm3 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + pshufd $0xb1, %xmm8, %xmm12 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + movsd 0 * SIZE(Y), %xmm8 + movhps 2 * SIZE(Y), %xmm8 + mulps %xmm4, %xmm12 + movaps 0 * SIZE(X), %xmm4 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm9, %xmm12 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + movsd 4 * SIZE(Y), %xmm9 + movhps 6 * SIZE(Y), %xmm9 + mulps %xmm5, %xmm12 + movaps 4 * SIZE(X), %xmm5 + addps %xmm12, %xmm3 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + pshufd $0xb1, %xmm10, %xmm12 + mulps %xmm6, %xmm10 + addps %xmm10, %xmm0 + movsd 8 * SIZE(Y), %xmm10 + movhps 10 * SIZE(Y), %xmm10 + mulps %xmm6, %xmm12 + movaps 8 * SIZE(X), %xmm6 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm11, %xmm12 + mulps %xmm7, %xmm11 + addps %xmm11, %xmm2 + movsd 12 * SIZE(Y), %xmm11 + movhps 14 * SIZE(Y), %xmm11 + mulps %xmm7, %xmm12 + movaps 12 * SIZE(X), %xmm7 + addps %xmm12, %xmm3 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + + decq %rax + jg .L31 + ALIGN_3 + +.L32: + pshufd $0xb1, %xmm8, %xmm12 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + movsd -16 * SIZE(Y), %xmm8 + movhps -14 * SIZE(Y), %xmm8 + mulps %xmm4, %xmm12 + movaps -16 * SIZE(X), %xmm4 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm9, %xmm12 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + movsd -12 * SIZE(Y), %xmm9 + movhps -10 * SIZE(Y), %xmm9 + mulps %xmm5, %xmm12 + movaps -12 * SIZE(X), %xmm5 + addps %xmm12, %xmm3 + + pshufd $0xb1, %xmm10, %xmm12 + mulps %xmm6, %xmm10 + addps %xmm10, %xmm0 + movsd -8 * SIZE(Y), %xmm10 + movhps -6 * SIZE(Y), %xmm10 + mulps %xmm6, %xmm12 + movaps -8 * SIZE(X), %xmm6 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm11, %xmm12 + mulps %xmm7, %xmm11 + addps %xmm11, %xmm2 + movsd -4 * SIZE(Y), %xmm11 + movhps -2 * SIZE(Y), %xmm11 + mulps %xmm7, %xmm12 + movaps -4 * SIZE(X), %xmm7 + addps %xmm12, %xmm3 + + pshufd $0xb1, %xmm8, %xmm12 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm9, %xmm12 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm5, %xmm12 + addps %xmm12, %xmm3 + + pshufd $0xb1, %xmm10, %xmm12 + mulps %xmm6, %xmm10 + addps %xmm10, %xmm0 + mulps %xmm6, %xmm12 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm11, %xmm12 + mulps %xmm7, %xmm11 + addps %xmm11, %xmm2 + mulps %xmm7, %xmm12 + addps %xmm12, 
%xmm3 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L35: + testq $8, N + jle .L36 + + movaps -32 * SIZE(X), %xmm4 + movsd -32 * SIZE(Y), %xmm8 + movhps -30 * SIZE(Y), %xmm8 + + pshufd $0xb1, %xmm8, %xmm12 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm1 + + movaps -28 * SIZE(X), %xmm5 + movsd -28 * SIZE(Y), %xmm9 + movhps -26 * SIZE(Y), %xmm9 + + pshufd $0xb1, %xmm9, %xmm12 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm5, %xmm12 + addps %xmm12, %xmm3 + + movaps -24 * SIZE(X), %xmm6 + movsd -24 * SIZE(Y), %xmm10 + movhps -22 * SIZE(Y), %xmm10 + + pshufd $0xb1, %xmm10, %xmm12 + mulps %xmm6, %xmm10 + addps %xmm10, %xmm0 + mulps %xmm6, %xmm12 + addps %xmm12, %xmm1 + + movaps -20 * SIZE(X), %xmm7 + movsd -20 * SIZE(Y), %xmm11 + movhps -18 * SIZE(Y), %xmm11 + + pshufd $0xb1, %xmm11, %xmm12 + mulps %xmm7, %xmm11 + addps %xmm11, %xmm2 + mulps %xmm7, %xmm12 + addps %xmm12, %xmm3 + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L36: + testq $4, N + jle .L37 + + movaps -32 * SIZE(X), %xmm4 + movsd -32 * SIZE(Y), %xmm8 + movhps -30 * SIZE(Y), %xmm8 + + pshufd $0xb1, %xmm8, %xmm12 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm1 + + movaps -28 * SIZE(X), %xmm5 + movsd -28 * SIZE(Y), %xmm9 + movhps -26 * SIZE(Y), %xmm9 + + pshufd $0xb1, %xmm9, %xmm12 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm5, %xmm12 + addps %xmm12, %xmm3 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L37: + testq $2, N + jle .L38 + + movaps -32 * SIZE(X), %xmm4 + movsd -32 * SIZE(Y), %xmm8 + movhps -30 * SIZE(Y), %xmm8 + + pshufd $0xb1, %xmm8, %xmm12 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm1 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L38: + testq $1, N + jle .L98 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X), %xmm4 +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd -32 * SIZE(Y), %xmm8 + + pshufd $0xb1, %xmm8, %xmm12 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm1 + jmp .L98 + ALIGN_3 + +#ifdef ALIGNED_ACCESS +.L40: + movaps -35 * SIZE(Y), %xmm8 + addq $1 * SIZE, Y + + shufps $0xb1, %xmm1, %xmm1 + + movq N, %rax + sarq $4, %rax + jle .L45 + + movaps -32 * SIZE(X), %xmm4 + movaps -32 * SIZE(Y), %xmm9 + movaps -28 * SIZE(X), %xmm5 + movaps -28 * SIZE(Y), %xmm10 + movaps -24 * SIZE(X), %xmm6 + movaps -24 * SIZE(Y), %xmm11 + movaps -20 * SIZE(X), %xmm7 + + decq %rax + jle .L42 + ALIGN_3 + +.L41: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x93, %xmm9, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + movaps -16 * SIZE(X), %xmm4 + mulps %xmm8, %xmm12 + movaps -20 * SIZE(Y), %xmm8 + addps %xmm12, %xmm1 + + movss %xmm10, %xmm9 + pshufd $0xb1, %xmm5, %xmm12 + shufps $0x93, %xmm10, %xmm9 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + movaps -12 * SIZE(X), %xmm5 + mulps %xmm9, %xmm12 + movaps -16 * SIZE(Y), %xmm9 + addps %xmm12, %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm11, %xmm10 + pshufd $0xb1, %xmm6, %xmm12 + shufps $0x93, %xmm11, %xmm10 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + movaps -8 * SIZE(X), %xmm6 + mulps %xmm10, %xmm12 + movaps -12 * SIZE(Y), %xmm10 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0xb1, %xmm7, %xmm12 + shufps $0x93, %xmm8, %xmm11 + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + movaps -4 * SIZE(X), %xmm7 + mulps %xmm11, %xmm12 + movaps -8 
* SIZE(Y), %xmm11 + addps %xmm12, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x93, %xmm9, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + movaps 0 * SIZE(X), %xmm4 + mulps %xmm8, %xmm12 + movaps -4 * SIZE(Y), %xmm8 + addps %xmm12, %xmm1 + + movss %xmm10, %xmm9 + pshufd $0xb1, %xmm5, %xmm12 + shufps $0x93, %xmm10, %xmm9 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + movaps 4 * SIZE(X), %xmm5 + mulps %xmm9, %xmm12 + movaps 0 * SIZE(Y), %xmm9 + addps %xmm12, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm11, %xmm10 + pshufd $0xb1, %xmm6, %xmm12 + shufps $0x93, %xmm11, %xmm10 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + movaps 8 * SIZE(X), %xmm6 + mulps %xmm10, %xmm12 + movaps 4 * SIZE(Y), %xmm10 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0xb1, %xmm7, %xmm12 + shufps $0x93, %xmm8, %xmm11 + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + movaps 12 * SIZE(X), %xmm7 + mulps %xmm11, %xmm12 + movaps 8 * SIZE(Y), %xmm11 + addps %xmm12, %xmm1 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + + decq %rax + jg .L41 + ALIGN_3 + +.L42: + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x93, %xmm9, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + movaps -16 * SIZE(X), %xmm4 + mulps %xmm8, %xmm12 + movaps -20 * SIZE(Y), %xmm8 + addps %xmm12, %xmm1 + + movss %xmm10, %xmm9 + pshufd $0xb1, %xmm5, %xmm12 + shufps $0x93, %xmm10, %xmm9 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + movaps -12 * SIZE(X), %xmm5 + mulps %xmm9, %xmm12 + movaps -16 * SIZE(Y), %xmm9 + addps %xmm12, %xmm1 + + movss %xmm11, %xmm10 + pshufd $0xb1, %xmm6, %xmm12 + shufps $0x93, %xmm11, %xmm10 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + movaps -8 * SIZE(X), %xmm6 + mulps %xmm10, %xmm12 + movaps -12 * SIZE(Y), %xmm10 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0xb1, %xmm7, %xmm12 + shufps $0x93, %xmm8, %xmm11 + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + movaps -4 * SIZE(X), %xmm7 + mulps %xmm11, %xmm12 + movaps -8 * SIZE(Y), %xmm11 + addps %xmm12, %xmm1 + + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x93, %xmm9, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm8, %xmm12 + movaps -4 * SIZE(Y), %xmm8 + addps %xmm12, %xmm1 + + movss %xmm10, %xmm9 + pshufd $0xb1, %xmm5, %xmm12 + shufps $0x93, %xmm10, %xmm9 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm9, %xmm12 + addps %xmm12, %xmm1 + + movss %xmm11, %xmm10 + pshufd $0xb1, %xmm6, %xmm12 + shufps $0x93, %xmm11, %xmm10 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm10, %xmm12 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0xb1, %xmm7, %xmm12 + shufps $0x93, %xmm8, %xmm11 + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + mulps %xmm11, %xmm12 + addps %xmm12, %xmm1 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L45: + testq $8, N + jle .L46 + + movaps -32 * SIZE(X), %xmm4 + movaps -32 * SIZE(Y), %xmm9 + movaps -28 * SIZE(X), %xmm5 + movaps -28 * SIZE(Y), %xmm10 + + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x93, %xmm9, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm8, %xmm12 + addps %xmm12, %xmm1 + + movaps -24 * SIZE(X), %xmm6 + movaps -24 * SIZE(Y), %xmm11 + + movss %xmm10, %xmm9 + pshufd $0xb1, %xmm5, %xmm12 + shufps $0x93, %xmm10, %xmm9 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm9, %xmm12 + addps %xmm12, %xmm1 + + movaps -20 * SIZE(X), %xmm7 + movaps -20 * SIZE(Y), %xmm8 + 
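+/* Y is not 16-byte aligned in this branch, so every 4-float window of Y is
+   rebuilt from two consecutive aligned loads: movss plus shufps $0x93
+   splice the last float of the previous aligned quad with the first three
+   floats of the next one before the multiply-accumulate against the
+   aligned X data. */
+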
+ movss %xmm11, %xmm10 + pshufd $0xb1, %xmm6, %xmm12 + shufps $0x93, %xmm11, %xmm10 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm10, %xmm12 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0xb1, %xmm7, %xmm12 + shufps $0x93, %xmm8, %xmm11 + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + mulps %xmm11, %xmm12 + addps %xmm12, %xmm1 + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L46: + testq $4, N + jle .L47 + + movaps -32 * SIZE(X), %xmm4 + movaps -32 * SIZE(Y), %xmm9 + + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x93, %xmm9, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm8, %xmm12 + addps %xmm12, %xmm1 + + movaps -28 * SIZE(X), %xmm5 + movaps -28 * SIZE(Y), %xmm10 + + movss %xmm10, %xmm9 + pshufd $0xb1, %xmm5, %xmm12 + shufps $0x93, %xmm10, %xmm9 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm9, %xmm12 + addps %xmm12, %xmm1 + + movaps %xmm10, %xmm8 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L47: + testq $2, N + jle .L48 + + movaps -32 * SIZE(X), %xmm4 + movaps -32 * SIZE(Y), %xmm9 + + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x93, %xmm9, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm8, %xmm12 + addps %xmm12, %xmm1 + + movaps %xmm9, %xmm8 + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L48: + testq $1, N + jle .L49 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X), %xmm4 + movss -32 * SIZE(Y), %xmm9 + + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x93, %xmm8, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm8, %xmm12 + addps %xmm12, %xmm1 + ALIGN_3 + +.L49: + shufps $0xb1, %xmm1, %xmm1 + shufps $0xb1, %xmm3, %xmm3 + jmp .L98 + ALIGN_3 +#endif + +.L50: + testq $SIZE, Y + jne .L70 + +#ifdef ALIGNED_ACCESS + + testq $2 * SIZE, Y + je .L50x + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd -32 * SIZE(X), %xmm0 +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(Y), %xmm4 + + pshufd $0xb1, %xmm0, %xmm1 + mulps %xmm4, %xmm0 + mulps %xmm4, %xmm1 + addq $2 * SIZE, X + addq $2 * SIZE, Y + + decq N + ALIGN_3 + +.L50x: + testq $2 * SIZE, X + jne .L60 + + movaps -33 * SIZE(X), %xmm8 + addq $3 * SIZE, X + + shufps $0xb1, %xmm1, %xmm1 + + movq N, %rax + sarq $4, %rax + jle .L55 + + movaps -32 * SIZE(Y), %xmm4 + movaps -32 * SIZE(X), %xmm9 + movaps -28 * SIZE(Y), %xmm5 + movaps -28 * SIZE(X), %xmm10 + movaps -24 * SIZE(Y), %xmm6 + movaps -24 * SIZE(X), %xmm11 + movaps -20 * SIZE(Y), %xmm7 + + decq %rax + jle .L52 + ALIGN_3 + +.L51: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + movaps -16 * SIZE(Y), %xmm4 + mulps %xmm8, %xmm12 + movaps -20 * SIZE(X), %xmm8 + addps %xmm12, %xmm1 + + movss %xmm10, %xmm9 + pshufd $0xb1, %xmm5, %xmm12 + shufps $0x39, %xmm9, %xmm9 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + movaps -12 * SIZE(Y), %xmm5 + mulps %xmm9, %xmm12 + movaps -16 * SIZE(X), %xmm9 + addps %xmm12, %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm11, %xmm10 + pshufd $0xb1, %xmm6, %xmm12 + shufps $0x39, %xmm10, %xmm10 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + movaps -8 * SIZE(Y), %xmm6 + mulps %xmm10, %xmm12 + movaps -12 * SIZE(X), %xmm10 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0xb1, %xmm7, %xmm12 + shufps $0x39, %xmm11, %xmm11 + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + movaps -4 * SIZE(Y), %xmm7 + mulps %xmm11, %xmm12 + movaps -8 * 
SIZE(X), %xmm11 + addps %xmm12, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + movaps 0 * SIZE(Y), %xmm4 + mulps %xmm8, %xmm12 + movaps -4 * SIZE(X), %xmm8 + addps %xmm12, %xmm1 + + movss %xmm10, %xmm9 + pshufd $0xb1, %xmm5, %xmm12 + shufps $0x39, %xmm9, %xmm9 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + movaps 4 * SIZE(Y), %xmm5 + mulps %xmm9, %xmm12 + movaps 0 * SIZE(X), %xmm9 + addps %xmm12, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm11, %xmm10 + pshufd $0xb1, %xmm6, %xmm12 + shufps $0x39, %xmm10, %xmm10 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + movaps 8 * SIZE(Y), %xmm6 + mulps %xmm10, %xmm12 + movaps 4 * SIZE(X), %xmm10 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0xb1, %xmm7, %xmm12 + shufps $0x39, %xmm11, %xmm11 + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + movaps 12 * SIZE(Y), %xmm7 + mulps %xmm11, %xmm12 + movaps 8 * SIZE(X), %xmm11 + addps %xmm12, %xmm1 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + + decq %rax + jg .L51 + ALIGN_3 + +.L52: + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + movaps -16 * SIZE(Y), %xmm4 + mulps %xmm8, %xmm12 + movaps -20 * SIZE(X), %xmm8 + addps %xmm12, %xmm1 + + movss %xmm10, %xmm9 + pshufd $0xb1, %xmm5, %xmm12 + shufps $0x39, %xmm9, %xmm9 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + movaps -12 * SIZE(Y), %xmm5 + mulps %xmm9, %xmm12 + movaps -16 * SIZE(X), %xmm9 + addps %xmm12, %xmm1 + + movss %xmm11, %xmm10 + pshufd $0xb1, %xmm6, %xmm12 + shufps $0x39, %xmm10, %xmm10 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + movaps -8 * SIZE(Y), %xmm6 + mulps %xmm10, %xmm12 + movaps -12 * SIZE(X), %xmm10 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0xb1, %xmm7, %xmm12 + shufps $0x39, %xmm11, %xmm11 + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + movaps -4 * SIZE(Y), %xmm7 + mulps %xmm11, %xmm12 + movaps -8 * SIZE(X), %xmm11 + addps %xmm12, %xmm1 + + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm8, %xmm12 + movaps -4 * SIZE(X), %xmm8 + addps %xmm12, %xmm1 + + movss %xmm10, %xmm9 + pshufd $0xb1, %xmm5, %xmm12 + shufps $0x39, %xmm9, %xmm9 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm9, %xmm12 + addps %xmm12, %xmm1 + + movss %xmm11, %xmm10 + pshufd $0xb1, %xmm6, %xmm12 + shufps $0x39, %xmm10, %xmm10 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm10, %xmm12 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0xb1, %xmm7, %xmm12 + shufps $0x39, %xmm11, %xmm11 + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + mulps %xmm11, %xmm12 + addps %xmm12, %xmm1 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L55: + testq $8, N + jle .L56 + + movaps -32 * SIZE(Y), %xmm4 + movaps -32 * SIZE(X), %xmm9 + movaps -28 * SIZE(Y), %xmm5 + movaps -28 * SIZE(X), %xmm10 + + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm8, %xmm12 + addps %xmm12, %xmm1 + + movaps -24 * SIZE(Y), %xmm6 + movaps -24 * SIZE(X), %xmm11 + + movss %xmm10, %xmm9 + pshufd $0xb1, %xmm5, %xmm12 + shufps $0x39, %xmm9, %xmm9 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm9, %xmm12 + addps %xmm12, %xmm1 + + movaps -20 * SIZE(Y), %xmm7 + movaps -20 * SIZE(X), %xmm8 + + 
movss %xmm11, %xmm10 + pshufd $0xb1, %xmm6, %xmm12 + shufps $0x39, %xmm10, %xmm10 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm10, %xmm12 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0xb1, %xmm7, %xmm12 + shufps $0x39, %xmm11, %xmm11 + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + mulps %xmm11, %xmm12 + addps %xmm12, %xmm1 + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L56: + testq $4, N + jle .L57 + + movaps -32 * SIZE(Y), %xmm4 + movaps -32 * SIZE(X), %xmm9 + + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm8, %xmm12 + addps %xmm12, %xmm1 + + movaps -28 * SIZE(Y), %xmm5 + movaps -28 * SIZE(X), %xmm10 + + movss %xmm10, %xmm9 + pshufd $0xb1, %xmm5, %xmm12 + shufps $0x39, %xmm9, %xmm9 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm9, %xmm12 + addps %xmm12, %xmm1 + + movaps %xmm10, %xmm8 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L57: + testq $2, N + jle .L58 + + movaps -32 * SIZE(Y), %xmm4 + movaps -32 * SIZE(X), %xmm9 + + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm8, %xmm12 + addps %xmm12, %xmm1 + + movaps %xmm9, %xmm8 + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L58: + testq $1, N + jle .L98 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(Y), %xmm4 + + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x39, %xmm8, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm8, %xmm12 + addps %xmm12, %xmm1 + jmp .L98 + ALIGN_3 + +.L60: + movaps -35 * SIZE(X), %xmm8 + addq $1 * SIZE, X + + shufps $0xb1, %xmm1, %xmm1 + + movq N, %rax + sarq $4, %rax + jle .L65 + + movaps -32 * SIZE(Y), %xmm4 + movaps -32 * SIZE(X), %xmm9 + movaps -28 * SIZE(Y), %xmm5 + movaps -28 * SIZE(X), %xmm10 + movaps -24 * SIZE(Y), %xmm6 + movaps -24 * SIZE(X), %xmm11 + movaps -20 * SIZE(Y), %xmm7 + + decq %rax + jle .L62 + ALIGN_3 + +.L61: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x93, %xmm9, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + movaps -16 * SIZE(Y), %xmm4 + mulps %xmm8, %xmm12 + movaps -20 * SIZE(X), %xmm8 + addps %xmm12, %xmm1 + + movss %xmm10, %xmm9 + pshufd $0xb1, %xmm5, %xmm12 + shufps $0x93, %xmm10, %xmm9 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + movaps -12 * SIZE(Y), %xmm5 + mulps %xmm9, %xmm12 + movaps -16 * SIZE(X), %xmm9 + addps %xmm12, %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm11, %xmm10 + pshufd $0xb1, %xmm6, %xmm12 + shufps $0x93, %xmm11, %xmm10 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + movaps -8 * SIZE(Y), %xmm6 + mulps %xmm10, %xmm12 + movaps -12 * SIZE(X), %xmm10 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0xb1, %xmm7, %xmm12 + shufps $0x93, %xmm8, %xmm11 + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + movaps -4 * SIZE(Y), %xmm7 + mulps %xmm11, %xmm12 + movaps -8 * SIZE(X), %xmm11 + addps %xmm12, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x93, %xmm9, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + movaps 0 * SIZE(Y), %xmm4 + mulps %xmm8, %xmm12 + movaps -4 * SIZE(X), %xmm8 + addps %xmm12, %xmm1 + + movss %xmm10, %xmm9 + pshufd $0xb1, %xmm5, %xmm12 + shufps $0x93, %xmm10, %xmm9 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + movaps 4 * SIZE(Y), %xmm5 + mulps %xmm9, %xmm12 + movaps 0 
* SIZE(X), %xmm9 + addps %xmm12, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm11, %xmm10 + pshufd $0xb1, %xmm6, %xmm12 + shufps $0x93, %xmm11, %xmm10 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + movaps 8 * SIZE(Y), %xmm6 + mulps %xmm10, %xmm12 + movaps 4 * SIZE(X), %xmm10 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0xb1, %xmm7, %xmm12 + shufps $0x93, %xmm8, %xmm11 + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + movaps 12 * SIZE(Y), %xmm7 + mulps %xmm11, %xmm12 + movaps 8 * SIZE(X), %xmm11 + addps %xmm12, %xmm1 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + + decq %rax + jg .L61 + ALIGN_3 + +.L62: + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x93, %xmm9, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + movaps -16 * SIZE(Y), %xmm4 + mulps %xmm8, %xmm12 + movaps -20 * SIZE(X), %xmm8 + addps %xmm12, %xmm1 + + movss %xmm10, %xmm9 + pshufd $0xb1, %xmm5, %xmm12 + shufps $0x93, %xmm10, %xmm9 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + movaps -12 * SIZE(Y), %xmm5 + mulps %xmm9, %xmm12 + movaps -16 * SIZE(X), %xmm9 + addps %xmm12, %xmm1 + + movss %xmm11, %xmm10 + pshufd $0xb1, %xmm6, %xmm12 + shufps $0x93, %xmm11, %xmm10 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + movaps -8 * SIZE(Y), %xmm6 + mulps %xmm10, %xmm12 + movaps -12 * SIZE(X), %xmm10 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0xb1, %xmm7, %xmm12 + shufps $0x93, %xmm8, %xmm11 + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + movaps -4 * SIZE(Y), %xmm7 + mulps %xmm11, %xmm12 + movaps -8 * SIZE(X), %xmm11 + addps %xmm12, %xmm1 + + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x93, %xmm9, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm8, %xmm12 + movaps -4 * SIZE(X), %xmm8 + addps %xmm12, %xmm1 + + movss %xmm10, %xmm9 + pshufd $0xb1, %xmm5, %xmm12 + shufps $0x93, %xmm10, %xmm9 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm9, %xmm12 + addps %xmm12, %xmm1 + + movss %xmm11, %xmm10 + pshufd $0xb1, %xmm6, %xmm12 + shufps $0x93, %xmm11, %xmm10 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm10, %xmm12 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0xb1, %xmm7, %xmm12 + shufps $0x93, %xmm8, %xmm11 + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + mulps %xmm11, %xmm12 + addps %xmm12, %xmm1 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L65: + testq $8, N + jle .L66 + + movaps -32 * SIZE(Y), %xmm4 + movaps -32 * SIZE(X), %xmm9 + movaps -28 * SIZE(Y), %xmm5 + movaps -28 * SIZE(X), %xmm10 + + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x93, %xmm9, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm8, %xmm12 + addps %xmm12, %xmm1 + + movaps -24 * SIZE(Y), %xmm6 + movaps -24 * SIZE(X), %xmm11 + + movss %xmm10, %xmm9 + pshufd $0xb1, %xmm5, %xmm12 + shufps $0x93, %xmm10, %xmm9 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm9, %xmm12 + addps %xmm12, %xmm1 + + movaps -20 * SIZE(Y), %xmm7 + movaps -20 * SIZE(X), %xmm8 + + movss %xmm11, %xmm10 + pshufd $0xb1, %xmm6, %xmm12 + shufps $0x93, %xmm11, %xmm10 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm10, %xmm12 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0xb1, %xmm7, %xmm12 + shufps $0x93, %xmm8, %xmm11 + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + mulps %xmm11, %xmm12 + addps %xmm12, %xmm1 + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L66: + testq $4, N + jle .L67 + + movaps -32 * SIZE(Y), %xmm4 + movaps -32 * SIZE(X), %xmm9 + + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + 
shufps $0x93, %xmm9, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm8, %xmm12 + addps %xmm12, %xmm1 + + movaps -28 * SIZE(Y), %xmm5 + movaps -28 * SIZE(X), %xmm10 + + movss %xmm10, %xmm9 + pshufd $0xb1, %xmm5, %xmm12 + shufps $0x93, %xmm10, %xmm9 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm9, %xmm12 + addps %xmm12, %xmm1 + + movaps %xmm10, %xmm8 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L67: + testq $2, N + jle .L68 + + movaps -32 * SIZE(Y), %xmm4 + movaps -32 * SIZE(X), %xmm9 + + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x93, %xmm9, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm8, %xmm12 + addps %xmm12, %xmm1 + + movaps %xmm9, %xmm8 + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L68: + testq $1, N + jle .L98 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(Y), %xmm4 + movss -32 * SIZE(X), %xmm9 + + movss %xmm9, %xmm8 + pshufd $0xb1, %xmm4, %xmm12 + shufps $0x93, %xmm8, %xmm8 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm8, %xmm12 + addps %xmm12, %xmm1 + jmp .L98 + ALIGN_3 + +#else + + testq $2 * SIZE, Y + je .L50x + +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd -32 * SIZE(Y), %xmm0 +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X), %xmm4 + + pshufd $0xb1, %xmm0, %xmm1 + mulps %xmm4, %xmm0 + mulps %xmm4, %xmm1 + addq $2 * SIZE, X + addq $2 * SIZE, Y + + decq N + ALIGN_3 + +.L50x: + movq N, %rax + sarq $4, %rax + jle .L55 + + movaps -32 * SIZE(Y), %xmm4 + movlps -32 * SIZE(X), %xmm8 + movhps -30 * SIZE(X), %xmm8 + movaps -28 * SIZE(Y), %xmm5 + movlps -28 * SIZE(X), %xmm9 + movhps -26 * SIZE(X), %xmm9 + + movaps -24 * SIZE(Y), %xmm6 + movlps -24 * SIZE(X), %xmm10 + movhps -22 * SIZE(X), %xmm10 + movaps -20 * SIZE(Y), %xmm7 + movlps -20 * SIZE(X), %xmm11 + movhps -18 * SIZE(X), %xmm11 + + decq %rax + jle .L52 + ALIGN_3 + +.L51: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + pshufd $0xb1, %xmm4, %xmm12 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + movaps -16 * SIZE(Y), %xmm4 + mulps %xmm8, %xmm12 + movlps -16 * SIZE(X), %xmm8 + movhps -14 * SIZE(X), %xmm8 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm5, %xmm12 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + movaps -12 * SIZE(Y), %xmm5 + mulps %xmm9, %xmm12 + movlps -12 * SIZE(X), %xmm9 + movhps -10 * SIZE(X), %xmm9 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm6, %xmm12 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + movaps -8 * SIZE(Y), %xmm6 + mulps %xmm10, %xmm12 + movlps -8 * SIZE(X), %xmm10 + movhps -6 * SIZE(X), %xmm10 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm7, %xmm12 + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + movaps -4 * SIZE(Y), %xmm7 + mulps %xmm11, %xmm12 + movlps -4 * SIZE(X), %xmm11 + movhps -2 * SIZE(X), %xmm11 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm4, %xmm12 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + movaps 0 * SIZE(Y), %xmm4 + mulps %xmm8, %xmm12 + movlps 0 * SIZE(X), %xmm8 + movhps 2 * SIZE(X), %xmm8 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm5, %xmm12 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + movaps 4 * SIZE(Y), %xmm5 + mulps %xmm9, %xmm12 + movlps 4 * SIZE(X), %xmm9 + movhps 6 * SIZE(X), %xmm9 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm6, %xmm12 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + movaps 8 * SIZE(Y), 
%xmm6 + mulps %xmm10, %xmm12 + movlps 8 * SIZE(X), %xmm10 + movhps 10 * SIZE(X), %xmm10 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm7, %xmm12 + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + movaps 12 * SIZE(Y), %xmm7 + mulps %xmm11, %xmm12 + movlps 12 * SIZE(X), %xmm11 + movhps 14 * SIZE(X), %xmm11 + addps %xmm12, %xmm1 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + + decq %rax + jg .L51 + ALIGN_3 + +.L52: + pshufd $0xb1, %xmm4, %xmm12 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + movaps -16 * SIZE(Y), %xmm4 + mulps %xmm8, %xmm12 + movlps -16 * SIZE(X), %xmm8 + movhps -14 * SIZE(X), %xmm8 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm5, %xmm12 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + movaps -12 * SIZE(Y), %xmm5 + mulps %xmm9, %xmm12 + movlps -12 * SIZE(X), %xmm9 + movhps -10 * SIZE(X), %xmm9 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm6, %xmm12 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + movaps -8 * SIZE(Y), %xmm6 + mulps %xmm10, %xmm12 + movlps -8 * SIZE(X), %xmm10 + movhps -6 * SIZE(X), %xmm10 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm7, %xmm12 + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + movaps -4 * SIZE(Y), %xmm7 + mulps %xmm11, %xmm12 + movlps -4 * SIZE(X), %xmm11 + movhps -2 * SIZE(X), %xmm11 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm4, %xmm12 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm8, %xmm12 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm5, %xmm12 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm9, %xmm12 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm6, %xmm12 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm10, %xmm12 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm7, %xmm12 + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + mulps %xmm11, %xmm12 + addps %xmm12, %xmm1 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L55: + testq $8, N + jle .L56 + + movaps -32 * SIZE(Y), %xmm4 + movlps -32 * SIZE(X), %xmm8 + movhps -30 * SIZE(X), %xmm8 + + movaps -28 * SIZE(Y), %xmm5 + movlps -28 * SIZE(X), %xmm9 + movhps -26 * SIZE(X), %xmm9 + + pshufd $0xb1, %xmm4, %xmm12 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm8, %xmm12 + addps %xmm12, %xmm1 + + movaps -24 * SIZE(Y), %xmm6 + movlps -24 * SIZE(X), %xmm10 + movhps -22 * SIZE(X), %xmm10 + + pshufd $0xb1, %xmm5, %xmm12 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm9, %xmm12 + addps %xmm12, %xmm1 + + movaps -20 * SIZE(Y), %xmm7 + movlps -20 * SIZE(X), %xmm11 + movhps -18 * SIZE(X), %xmm11 + + pshufd $0xb1, %xmm6, %xmm12 + mulps %xmm10, %xmm6 + addps %xmm6, %xmm0 + mulps %xmm10, %xmm12 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm7, %xmm12 + mulps %xmm11, %xmm7 + addps %xmm7, %xmm0 + mulps %xmm11, %xmm12 + addps %xmm12, %xmm1 + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L56: + testq $4, N + jle .L57 + + movaps -32 * SIZE(Y), %xmm4 + movlps -32 * SIZE(X), %xmm8 + movhps -30 * SIZE(X), %xmm8 + + pshufd $0xb1, %xmm4, %xmm12 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm8, %xmm12 + addps %xmm12, %xmm1 + + movaps -28 * SIZE(Y), %xmm5 + movlps -28 * SIZE(X), %xmm9 + movhps -26 * SIZE(X), %xmm9 + + pshufd $0xb1, %xmm5, %xmm12 + mulps %xmm9, %xmm5 + addps %xmm5, %xmm0 + mulps %xmm9, %xmm12 + addps %xmm12, %xmm1 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L57: + testq $2, N + jle .L58 + + movaps -32 * SIZE(Y), %xmm4 + movlps -32 * SIZE(X), %xmm8 + movhps -30 * SIZE(X), %xmm8 + + pshufd $0xb1, %xmm4, %xmm12 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm8, %xmm12 + addps %xmm12, %xmm1 + + movaps %xmm9, %xmm8 + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L58: + testq $1, N 
+ jle .L98 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(Y), %xmm4 +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd -32 * SIZE(X), %xmm8 + + pshufd $0xb1, %xmm4, %xmm12 + mulps %xmm8, %xmm4 + addps %xmm4, %xmm0 + mulps %xmm8, %xmm12 + addps %xmm12, %xmm1 + jmp .L98 + ALIGN_3 +#endif + +.L70: + testq $2 * SIZE, Y + je .L70x + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd -32 * SIZE(X), %xmm4 + addq $2 * SIZE, X +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd -32 * SIZE(Y), %xmm1 + addq $2 * SIZE, Y + + pshufd $0xb1, %xmm1, %xmm0 + shufps $0xb1, %xmm4, %xmm4 + + mulps %xmm4, %xmm0 + mulps %xmm4, %xmm1 + decq N + ALIGN_3 + +.L70x: + testq $2 * SIZE, X + jne .L80 + + movaps -33 * SIZE(X), %xmm4 + addq $3 * SIZE, X + movaps -33 * SIZE(Y), %xmm8 + addq $3 * SIZE, Y + + movq N, %rax + sarq $4, %rax + jle .L75 + + movaps -32 * SIZE(X), %xmm5 + movaps -32 * SIZE(Y), %xmm9 + movaps -28 * SIZE(X), %xmm6 + movaps -28 * SIZE(Y), %xmm10 + movaps -24 * SIZE(X), %xmm7 + movaps -24 * SIZE(Y), %xmm11 + + decq %rax + jle .L72 + ALIGN_3 + +.L71: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm9, %xmm8 + pshufd $0x1b, %xmm8, %xmm12 + movss %xmm5, %xmm4 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + movaps -20 * SIZE(Y), %xmm8 + mulps %xmm4, %xmm12 + movaps -20 * SIZE(X), %xmm4 + addps %xmm12, %xmm1 + + movss %xmm10, %xmm9 + pshufd $0x1b, %xmm9, %xmm12 + movss %xmm6, %xmm5 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + movaps -16 * SIZE(Y), %xmm9 + mulps %xmm5, %xmm12 + movaps -16 * SIZE(X), %xmm5 + addps %xmm12, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm11, %xmm10 + pshufd $0x1b, %xmm10, %xmm12 + movss %xmm7, %xmm6 + mulps %xmm6, %xmm10 + addps %xmm10, %xmm0 + movaps -12 * SIZE(Y), %xmm10 + mulps %xmm6, %xmm12 + movaps -12 * SIZE(X), %xmm6 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0x1b, %xmm11, %xmm12 + movss %xmm4, %xmm7 + mulps %xmm7, %xmm11 + addps %xmm11, %xmm2 + movaps -8 * SIZE(Y), %xmm11 + mulps %xmm7, %xmm12 + movaps -8 * SIZE(X), %xmm7 + addps %xmm12, %xmm3 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm9, %xmm8 + pshufd $0x1b, %xmm8, %xmm12 + movss %xmm5, %xmm4 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + movaps -4 * SIZE(Y), %xmm8 + mulps %xmm4, %xmm12 + movaps -4 * SIZE(X), %xmm4 + addps %xmm12, %xmm1 + + movss %xmm10, %xmm9 + pshufd $0x1b, %xmm9, %xmm12 + movss %xmm6, %xmm5 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + movaps 0 * SIZE(Y), %xmm9 + mulps %xmm5, %xmm12 + movaps 0 * SIZE(X), %xmm5 + addps %xmm12, %xmm3 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm11, %xmm10 + pshufd $0x1b, %xmm10, %xmm12 + movss %xmm7, %xmm6 + mulps %xmm6, %xmm10 + addps %xmm10, %xmm0 + movaps 4 * SIZE(Y), %xmm10 + mulps %xmm6, %xmm12 + movaps 4 * SIZE(X), %xmm6 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0x1b, %xmm11, %xmm12 + movss %xmm4, %xmm7 + mulps %xmm7, %xmm11 + addps %xmm11, %xmm2 + movaps 8 * SIZE(Y), %xmm11 + mulps %xmm7, %xmm12 + movaps 8 * SIZE(X), %xmm7 + addps %xmm12, %xmm3 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + + decq %rax + jg .L71 + ALIGN_3 + +.L72: + movss %xmm9, %xmm8 + pshufd $0x1b, %xmm8, %xmm12 + movss %xmm5, %xmm4 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + movaps -20 * SIZE(Y), %xmm8 + mulps %xmm4, %xmm12 + movaps -20 * SIZE(X), %xmm4 + addps %xmm12, %xmm1 + + movss %xmm10, %xmm9 + pshufd $0x1b, %xmm9, %xmm12 + movss 
%xmm6, %xmm5 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + movaps -16 * SIZE(Y), %xmm9 + mulps %xmm5, %xmm12 + movaps -16 * SIZE(X), %xmm5 + addps %xmm12, %xmm3 + + movss %xmm11, %xmm10 + pshufd $0x1b, %xmm10, %xmm12 + movss %xmm7, %xmm6 + mulps %xmm6, %xmm10 + addps %xmm10, %xmm0 + movaps -12 * SIZE(Y), %xmm10 + mulps %xmm6, %xmm12 + movaps -12 * SIZE(X), %xmm6 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0x1b, %xmm11, %xmm12 + movss %xmm4, %xmm7 + mulps %xmm7, %xmm11 + addps %xmm11, %xmm2 + movaps -8 * SIZE(Y), %xmm11 + mulps %xmm7, %xmm12 + movaps -8 * SIZE(X), %xmm7 + addps %xmm12, %xmm3 + + movss %xmm9, %xmm8 + pshufd $0x1b, %xmm8, %xmm12 + movss %xmm5, %xmm4 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + movaps -4 * SIZE(Y), %xmm8 + mulps %xmm4, %xmm12 + movaps -4 * SIZE(X), %xmm4 + addps %xmm12, %xmm1 + + movss %xmm10, %xmm9 + pshufd $0x1b, %xmm9, %xmm12 + movss %xmm6, %xmm5 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm5, %xmm12 + addps %xmm12, %xmm3 + + movss %xmm11, %xmm10 + pshufd $0x1b, %xmm10, %xmm12 + movss %xmm7, %xmm6 + mulps %xmm6, %xmm10 + addps %xmm10, %xmm0 + mulps %xmm6, %xmm12 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0x1b, %xmm11, %xmm12 + movss %xmm4, %xmm7 + mulps %xmm7, %xmm11 + addps %xmm11, %xmm2 + mulps %xmm7, %xmm12 + addps %xmm12, %xmm3 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L75: + testq $8, N + jle .L76 + + movaps -32 * SIZE(X), %xmm5 + movaps -32 * SIZE(Y), %xmm9 + + movss %xmm9, %xmm8 + pshufd $0x1b, %xmm8, %xmm12 + movss %xmm5, %xmm4 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm1 + + movaps -28 * SIZE(X), %xmm6 + movaps -28 * SIZE(Y), %xmm10 + + movss %xmm10, %xmm9 + pshufd $0x1b, %xmm9, %xmm12 + movss %xmm6, %xmm5 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm5, %xmm12 + addps %xmm12, %xmm3 + + movaps -24 * SIZE(X), %xmm7 + movaps -24 * SIZE(Y), %xmm11 + + movss %xmm11, %xmm10 + pshufd $0x1b, %xmm10, %xmm12 + movss %xmm7, %xmm6 + mulps %xmm6, %xmm10 + addps %xmm10, %xmm0 + mulps %xmm6, %xmm12 + addps %xmm12, %xmm1 + + movaps -20 * SIZE(X), %xmm4 + movaps -20 * SIZE(Y), %xmm8 + + movss %xmm8, %xmm11 + pshufd $0x1b, %xmm11, %xmm12 + movss %xmm4, %xmm7 + mulps %xmm7, %xmm11 + addps %xmm11, %xmm2 + mulps %xmm7, %xmm12 + addps %xmm12, %xmm3 + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L76: + testq $4, N + jle .L77 + + movaps -32 * SIZE(X), %xmm5 + movaps -32 * SIZE(Y), %xmm9 + movaps -28 * SIZE(X), %xmm6 + movaps -28 * SIZE(Y), %xmm10 + + movss %xmm9, %xmm8 + pshufd $0x1b, %xmm8, %xmm12 + movss %xmm5, %xmm4 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm1 + + movss %xmm10, %xmm9 + pshufd $0x1b, %xmm9, %xmm12 + movss %xmm6, %xmm5 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm5, %xmm12 + addps %xmm12, %xmm3 + + movaps %xmm6, %xmm4 + movaps %xmm10, %xmm8 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L77: + testq $2, N + jle .L78 + + movaps -32 * SIZE(X), %xmm5 + movaps -32 * SIZE(Y), %xmm9 + + movss %xmm9, %xmm8 + pshufd $0x1b, %xmm8, %xmm12 + movss %xmm5, %xmm4 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm1 + + movaps %xmm5, %xmm4 + movaps %xmm9, %xmm8 + ALIGN_3 + +.L78: + testq $1, N + jle .L79 + + xorps %xmm5, %xmm5 + movss %xmm5, %xmm4 + movss %xmm5, %xmm8 + + shufps $0x24, %xmm4, %xmm4 + pshufd $0x18, %xmm8, %xmm12 + shufps $0x24, %xmm8, %xmm8 + + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm1 + ALIGN_3 + +.L79: + shufps 
$0x39, %xmm0, %xmm0 + shufps $0x39, %xmm1, %xmm1 + shufps $0x39, %xmm2, %xmm2 + shufps $0x39, %xmm3, %xmm3 + jmp .L98 + ALIGN_3 + +.L80: + movsd -33 * SIZE(X), %xmm4 + movhps -31 * SIZE(X), %xmm4 + addq $3 * SIZE, X + movaps -33 * SIZE(Y), %xmm8 + addq $3 * SIZE, Y + + movq N, %rax + sarq $4, %rax + jle .L85 + + movsd -32 * SIZE(X), %xmm5 + movhps -30 * SIZE(X), %xmm5 + movaps -32 * SIZE(Y), %xmm9 + + movsd -28 * SIZE(X), %xmm6 + movhps -26 * SIZE(X), %xmm6 + movaps -28 * SIZE(Y), %xmm10 + + movsd -24 * SIZE(X), %xmm7 + movhps -22 * SIZE(X), %xmm7 + movaps -24 * SIZE(Y), %xmm11 + + decq %rax + jle .L82 + ALIGN_3 + +.L81: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm9, %xmm8 + pshufd $0x1b, %xmm8, %xmm12 + movss %xmm5, %xmm4 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + movaps -20 * SIZE(Y), %xmm8 + mulps %xmm4, %xmm12 + movsd -20 * SIZE(X), %xmm4 + movhps -18 * SIZE(X), %xmm4 + addps %xmm12, %xmm1 + + movss %xmm10, %xmm9 + pshufd $0x1b, %xmm9, %xmm12 + movss %xmm6, %xmm5 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + movaps -16 * SIZE(Y), %xmm9 + mulps %xmm5, %xmm12 + movsd -16 * SIZE(X), %xmm5 + movhps -14 * SIZE(X), %xmm5 + addps %xmm12, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movss %xmm11, %xmm10 + pshufd $0x1b, %xmm10, %xmm12 + movss %xmm7, %xmm6 + mulps %xmm6, %xmm10 + addps %xmm10, %xmm0 + movaps -12 * SIZE(Y), %xmm10 + mulps %xmm6, %xmm12 + movsd -12 * SIZE(X), %xmm6 + movhps -10 * SIZE(X), %xmm6 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0x1b, %xmm11, %xmm12 + movss %xmm4, %xmm7 + mulps %xmm7, %xmm11 + addps %xmm11, %xmm2 + movaps -8 * SIZE(Y), %xmm11 + mulps %xmm7, %xmm12 + movsd -8 * SIZE(X), %xmm7 + movhps -6 * SIZE(X), %xmm7 + addps %xmm12, %xmm3 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm9, %xmm8 + pshufd $0x1b, %xmm8, %xmm12 + movss %xmm5, %xmm4 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + movaps -4 * SIZE(Y), %xmm8 + mulps %xmm4, %xmm12 + movsd -4 * SIZE(X), %xmm4 + movhps -2 * SIZE(X), %xmm4 + addps %xmm12, %xmm1 + + movss %xmm10, %xmm9 + pshufd $0x1b, %xmm9, %xmm12 + movss %xmm6, %xmm5 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + movaps 0 * SIZE(Y), %xmm9 + mulps %xmm5, %xmm12 + movsd 0 * SIZE(X), %xmm5 + movhps 2 * SIZE(X), %xmm5 + addps %xmm12, %xmm3 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movss %xmm11, %xmm10 + pshufd $0x1b, %xmm10, %xmm12 + movss %xmm7, %xmm6 + mulps %xmm6, %xmm10 + addps %xmm10, %xmm0 + movaps 4 * SIZE(Y), %xmm10 + mulps %xmm6, %xmm12 + movsd 4 * SIZE(X), %xmm6 + movhps 6 * SIZE(X), %xmm6 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0x1b, %xmm11, %xmm12 + movss %xmm4, %xmm7 + mulps %xmm7, %xmm11 + addps %xmm11, %xmm2 + movaps 8 * SIZE(Y), %xmm11 + mulps %xmm7, %xmm12 + movsd 8 * SIZE(X), %xmm7 + movhps 10 * SIZE(X), %xmm7 + addps %xmm12, %xmm3 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + + decq %rax + jg .L81 + ALIGN_3 + +.L82: + movss %xmm9, %xmm8 + pshufd $0x1b, %xmm8, %xmm12 + movss %xmm5, %xmm4 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + movaps -20 * SIZE(Y), %xmm8 + mulps %xmm4, %xmm12 + movsd -20 * SIZE(X), %xmm4 + movhps -18 * SIZE(X), %xmm4 + addps %xmm12, %xmm1 + + movss %xmm10, %xmm9 + pshufd $0x1b, %xmm9, %xmm12 + movss %xmm6, %xmm5 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + movaps -16 * SIZE(Y), %xmm9 + mulps %xmm5, %xmm12 + movsd -16 * SIZE(X), %xmm5 + movhps -14 * SIZE(X), %xmm5 + addps %xmm12, %xmm3 + 
+ movss %xmm11, %xmm10 + pshufd $0x1b, %xmm10, %xmm12 + movss %xmm7, %xmm6 + mulps %xmm6, %xmm10 + addps %xmm10, %xmm0 + movaps -12 * SIZE(Y), %xmm10 + mulps %xmm6, %xmm12 + movsd -12 * SIZE(X), %xmm6 + movhps -10 * SIZE(X), %xmm6 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0x1b, %xmm11, %xmm12 + movss %xmm4, %xmm7 + mulps %xmm7, %xmm11 + addps %xmm11, %xmm2 + movaps -8 * SIZE(Y), %xmm11 + mulps %xmm7, %xmm12 + movsd -8 * SIZE(X), %xmm7 + movhps -6 * SIZE(X), %xmm7 + addps %xmm12, %xmm3 + + movss %xmm9, %xmm8 + pshufd $0x1b, %xmm8, %xmm12 + movss %xmm5, %xmm4 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + movaps -4 * SIZE(Y), %xmm8 + mulps %xmm4, %xmm12 + movsd -4 * SIZE(X), %xmm4 + movhps -2 * SIZE(X), %xmm4 + addps %xmm12, %xmm1 + + movss %xmm10, %xmm9 + pshufd $0x1b, %xmm9, %xmm12 + movss %xmm6, %xmm5 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm5, %xmm12 + addps %xmm12, %xmm3 + + movss %xmm11, %xmm10 + pshufd $0x1b, %xmm10, %xmm12 + movss %xmm7, %xmm6 + mulps %xmm6, %xmm10 + addps %xmm10, %xmm0 + mulps %xmm6, %xmm12 + addps %xmm12, %xmm1 + + movss %xmm8, %xmm11 + pshufd $0x1b, %xmm11, %xmm12 + movss %xmm4, %xmm7 + mulps %xmm7, %xmm11 + addps %xmm11, %xmm2 + mulps %xmm7, %xmm12 + addps %xmm12, %xmm3 + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + ALIGN_3 + +.L85: + testq $8, N + jle .L86 + + movsd -32 * SIZE(X), %xmm5 + movhps -30 * SIZE(X), %xmm5 + movaps -32 * SIZE(Y), %xmm9 + + movss %xmm9, %xmm8 + pshufd $0x1b, %xmm8, %xmm12 + movss %xmm5, %xmm4 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm1 + + movsd -28 * SIZE(X), %xmm6 + movhps -26 * SIZE(X), %xmm6 + movaps -28 * SIZE(Y), %xmm10 + + movss %xmm10, %xmm9 + pshufd $0x1b, %xmm9, %xmm12 + movss %xmm6, %xmm5 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm5, %xmm12 + addps %xmm12, %xmm3 + + movsd -24 * SIZE(X), %xmm7 + movhps -22 * SIZE(X), %xmm7 + movaps -24 * SIZE(Y), %xmm11 + + movss %xmm11, %xmm10 + pshufd $0x1b, %xmm10, %xmm12 + movss %xmm7, %xmm6 + mulps %xmm6, %xmm10 + addps %xmm10, %xmm0 + mulps %xmm6, %xmm12 + addps %xmm12, %xmm1 + + movsd -20 * SIZE(X), %xmm4 + movhps -18 * SIZE(X), %xmm4 + movaps -20 * SIZE(Y), %xmm8 + + movss %xmm8, %xmm11 + pshufd $0x1b, %xmm11, %xmm12 + movss %xmm4, %xmm7 + mulps %xmm7, %xmm11 + addps %xmm11, %xmm2 + mulps %xmm7, %xmm12 + addps %xmm12, %xmm3 + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L86: + testq $4, N + jle .L87 + + movsd -32 * SIZE(X), %xmm5 + movhps -30 * SIZE(X), %xmm5 + movaps -32 * SIZE(Y), %xmm9 + + movss %xmm9, %xmm8 + pshufd $0x1b, %xmm8, %xmm12 + movss %xmm5, %xmm4 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm1 + + movsd -28 * SIZE(X), %xmm6 + movhps -26 * SIZE(X), %xmm6 + movaps -28 * SIZE(Y), %xmm10 + + movss %xmm10, %xmm9 + pshufd $0x1b, %xmm9, %xmm12 + movss %xmm6, %xmm5 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm5, %xmm12 + addps %xmm12, %xmm3 + + movaps %xmm6, %xmm4 + movaps %xmm10, %xmm8 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L87: + testq $2, N + jle .L88 + + movsd -32 * SIZE(X), %xmm5 + movhps -30 * SIZE(X), %xmm5 + movaps -32 * SIZE(Y), %xmm9 + + movss %xmm9, %xmm8 + pshufd $0x1b, %xmm8, %xmm12 + movss %xmm5, %xmm4 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm1 + + movaps %xmm5, %xmm4 + movaps %xmm9, %xmm8 + ALIGN_3 + +.L88: + testq $1, N + jle .L89 + + xorps %xmm5, %xmm5 + movss %xmm5, %xmm4 + movss %xmm5, %xmm8 + + shufps $0x24, %xmm4, %xmm4 + pshufd $0x18, %xmm8, %xmm12 + shufps $0x24, 
%xmm8, %xmm8 + + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm1 + ALIGN_3 + +.L89: + shufps $0x39, %xmm0, %xmm0 + shufps $0x39, %xmm1, %xmm1 + shufps $0x39, %xmm2, %xmm2 + shufps $0x39, %xmm3, %xmm3 + jmp .L98 + ALIGN_3 + +.L200: + movq N, %rax + sarq $4, %rax + jle .L205 + + movsd (X), %xmm4 + addq INCX, X + movhps (X), %xmm4 + addq INCX, X + movsd (Y), %xmm8 + addq INCY, Y + movhps (Y), %xmm8 + addq INCY, Y + + movsd (X), %xmm5 + addq INCX, X + movhps (X), %xmm5 + addq INCX, X + movsd (Y), %xmm9 + addq INCY, Y + movhps (Y), %xmm9 + addq INCY, Y + + movsd (X), %xmm6 + addq INCX, X + movhps (X), %xmm6 + addq INCX, X + movsd (Y), %xmm10 + addq INCY, Y + movhps (Y), %xmm10 + addq INCY, Y + + movsd (X), %xmm7 + addq INCX, X + movhps (X), %xmm7 + addq INCX, X + movsd (Y), %xmm11 + addq INCY, Y + movhps (Y), %xmm11 + addq INCY, Y + + decq %rax + jle .L204 + ALIGN_3 + +.L203: + pshufd $0xb1, %xmm8, %xmm12 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + movsd (Y), %xmm8 + addq INCY, Y + movhps (Y), %xmm8 + addq INCY, Y + mulps %xmm4, %xmm12 + movsd (X), %xmm4 + addq INCX, X + movhps (X), %xmm4 + addq INCX, X + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm9, %xmm12 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + movsd (Y), %xmm9 + addq INCY, Y + movhps (Y), %xmm9 + addq INCY, Y + mulps %xmm5, %xmm12 + movsd (X), %xmm5 + addq INCX, X + movhps (X), %xmm5 + addq INCX, X + addps %xmm12, %xmm3 + + pshufd $0xb1, %xmm10, %xmm12 + mulps %xmm6, %xmm10 + addps %xmm10, %xmm0 + movsd (Y), %xmm10 + addq INCY, Y + movhps (Y), %xmm10 + addq INCY, Y + mulps %xmm6, %xmm12 + movsd (X), %xmm6 + addq INCX, X + movhps (X), %xmm6 + addq INCX, X + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm11, %xmm12 + mulps %xmm7, %xmm11 + addps %xmm11, %xmm2 + movsd (Y), %xmm11 + addq INCY, Y + movhps (Y), %xmm11 + addq INCY, Y + mulps %xmm7, %xmm12 + movsd (X), %xmm7 + addq INCX, X + movhps (X), %xmm7 + addq INCX, X + addps %xmm12, %xmm3 + + pshufd $0xb1, %xmm8, %xmm12 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + movsd (Y), %xmm8 + addq INCY, Y + movhps (Y), %xmm8 + addq INCY, Y + mulps %xmm4, %xmm12 + movsd (X), %xmm4 + addq INCX, X + movhps (X), %xmm4 + addq INCX, X + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm9, %xmm12 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + movsd (Y), %xmm9 + addq INCY, Y + movhps (Y), %xmm9 + addq INCY, Y + mulps %xmm5, %xmm12 + movsd (X), %xmm5 + addq INCX, X + movhps (X), %xmm5 + addq INCX, X + addps %xmm12, %xmm3 + + pshufd $0xb1, %xmm10, %xmm12 + mulps %xmm6, %xmm10 + addps %xmm10, %xmm0 + movsd (Y), %xmm10 + addq INCY, Y + movhps (Y), %xmm10 + addq INCY, Y + mulps %xmm6, %xmm12 + movsd (X), %xmm6 + addq INCX, X + movhps (X), %xmm6 + addq INCX, X + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm11, %xmm12 + mulps %xmm7, %xmm11 + addps %xmm11, %xmm2 + movsd (Y), %xmm11 + addq INCY, Y + movhps (Y), %xmm11 + addq INCY, Y + + mulps %xmm7, %xmm12 + movsd (X), %xmm7 + addq INCX, X + movhps (X), %xmm7 + addq INCX, X + addps %xmm12, %xmm3 + + decq %rax + jg .L203 + ALIGN_3 + +.L204: + pshufd $0xb1, %xmm8, %xmm12 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + movsd (Y), %xmm8 + addq INCY, Y + movhps (Y), %xmm8 + addq INCY, Y + mulps %xmm4, %xmm12 + movsd (X), %xmm4 + addq INCX, X + movhps (X), %xmm4 + addq INCX, X + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm9, %xmm12 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + movsd (Y), %xmm9 + addq INCY, Y + movhps (Y), %xmm9 + addq INCY, Y + mulps %xmm5, %xmm12 + movsd (X), %xmm5 + addq INCX, X + movhps (X), %xmm5 + addq INCX, X + addps %xmm12, %xmm3 + + pshufd 
$0xb1, %xmm10, %xmm12 + mulps %xmm6, %xmm10 + addps %xmm10, %xmm0 + movsd (Y), %xmm10 + addq INCY, Y + movhps (Y), %xmm10 + addq INCY, Y + mulps %xmm6, %xmm12 + movsd (X), %xmm6 + addq INCX, X + movhps (X), %xmm6 + addq INCX, X + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm11, %xmm12 + mulps %xmm7, %xmm11 + addps %xmm11, %xmm2 + movsd (Y), %xmm11 + addq INCY, Y + movhps (Y), %xmm11 + addq INCY, Y + mulps %xmm7, %xmm12 + movsd (X), %xmm7 + addq INCX, X + movhps (X), %xmm7 + addq INCX, X + addps %xmm12, %xmm3 + + pshufd $0xb1, %xmm8, %xmm12 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm9, %xmm12 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm5, %xmm12 + addps %xmm12, %xmm3 + + pshufd $0xb1, %xmm10, %xmm12 + mulps %xmm6, %xmm10 + addps %xmm10, %xmm0 + mulps %xmm6, %xmm12 + addps %xmm12, %xmm1 + + pshufd $0xb1, %xmm11, %xmm12 + mulps %xmm7, %xmm11 + addps %xmm11, %xmm2 + mulps %xmm7, %xmm12 + addps %xmm12, %xmm3 + ALIGN_3 + +.L205: + testq $8, N + jle .L206 + + movsd (X), %xmm4 + addq INCX, X + movhps (X), %xmm4 + addq INCX, X + movsd (Y), %xmm8 + addq INCY, Y + movhps (Y), %xmm8 + addq INCY, Y + + pshufd $0xb1, %xmm8, %xmm12 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm1 + + movsd (X), %xmm5 + addq INCX, X + movhps (X), %xmm5 + addq INCX, X + movsd (Y), %xmm9 + addq INCY, Y + movhps (Y), %xmm9 + addq INCY, Y + + pshufd $0xb1, %xmm9, %xmm12 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm5, %xmm12 + addps %xmm12, %xmm3 + + movsd (X), %xmm6 + addq INCX, X + movhps (X), %xmm6 + addq INCX, X + movsd (Y), %xmm10 + addq INCY, Y + movhps (Y), %xmm10 + addq INCY, Y + + pshufd $0xb1, %xmm10, %xmm12 + mulps %xmm6, %xmm10 + addps %xmm10, %xmm0 + mulps %xmm6, %xmm12 + addps %xmm12, %xmm1 + + movsd (X), %xmm7 + addq INCX, X + movhps (X), %xmm7 + addq INCX, X + movsd (Y), %xmm11 + addq INCY, Y + movhps (Y), %xmm11 + addq INCY, Y + + pshufd $0xb1, %xmm11, %xmm12 + mulps %xmm7, %xmm11 + addps %xmm11, %xmm2 + mulps %xmm7, %xmm12 + addps %xmm12, %xmm3 + ALIGN_3 + +.L206: + testq $4, N + jle .L207 + + movsd (X), %xmm4 + addq INCX, X + movhps (X), %xmm4 + addq INCX, X + movsd (Y), %xmm8 + addq INCY, Y + movhps (Y), %xmm8 + addq INCY, Y + + pshufd $0xb1, %xmm8, %xmm12 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm1 + + movsd (X), %xmm5 + addq INCX, X + movhps (X), %xmm5 + addq INCX, X + movsd (Y), %xmm9 + addq INCY, Y + movhps (Y), %xmm9 + addq INCY, Y + + pshufd $0xb1, %xmm9, %xmm12 + mulps %xmm5, %xmm9 + addps %xmm9, %xmm2 + mulps %xmm5, %xmm12 + addps %xmm12, %xmm3 + ALIGN_3 + +.L207: + testq $2, N + jle .L208 + + movsd (X), %xmm4 + addq INCX, X + movhps (X), %xmm4 + addq INCX, X + movsd (Y), %xmm8 + addq INCY, Y + movhps (Y), %xmm8 + addq INCY, Y + + pshufd $0xb1, %xmm8, %xmm12 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm1 + ALIGN_3 + +.L208: + testq $1, N + jle .L98 + +#ifdef movsd + xorps %xmm4, %xmm4 +#endif + movsd (X), %xmm4 +#ifdef movsd + xorps %xmm8, %xmm8 +#endif + movsd (Y), %xmm8 + + pshufd $0xb1, %xmm8, %xmm12 + mulps %xmm4, %xmm8 + addps %xmm8, %xmm0 + mulps %xmm4, %xmm12 + addps %xmm12, %xmm1 + ALIGN_3 + +.L98: + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + + movhlps %xmm0, %xmm2 + movhlps %xmm1, %xmm3 + + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + + pshufd $1, %xmm0, %xmm2 + pshufd $1, %xmm1, %xmm3 + ALIGN_3 + +.L999: +#ifndef CONJ + subss %xmm2, %xmm0 + addss %xmm3, %xmm1 +#else + addss %xmm2, %xmm0 + subss 
%xmm3, %xmm1 +#endif + unpcklps %xmm1, %xmm0 + + RESTOREREGISTERS + + ret + ALIGN_3 + + EPILOGUE diff --git a/kernel/x86_64/zdot_sse2.S b/kernel/x86_64/zdot_sse2.S new file mode 100644 index 0000000000..77fa8e3784 --- /dev/null +++ b/kernel/x86_64/zdot_sse2.S @@ -0,0 +1,1550 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ +#define Y ARG4 /* rcx */ +#ifndef WINDOWS_ABI +#define INCY ARG5 /* r8 */ +#else +#define INCY %r10 +#endif + +#include "l1param.h" + +#undef movsd + +#ifndef OPTERON +#define MOVLPS movsd +#else +#define MOVLPS movlps +#endif + + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + movq 40(%rsp), INCY +#endif + + SAVEREGISTERS + + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + + cmpq $0, N + jle .L999 + + cmpq $2 * SIZE, INCX + jne .L50 + cmpq $2 * SIZE, INCY + jne .L50 + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + + testq $SIZE, Y + jne .L30 + + testq $SIZE, X + jne .L20 + + movq N, %rax + sarq $3, %rax + jle .L15 + + movaps -16 * SIZE(X), %xmm4 + movaps -14 * SIZE(X), %xmm5 + movaps -16 * SIZE(Y), %xmm8 + movaps -14 * SIZE(Y), %xmm9 + movaps -12 * SIZE(X), %xmm6 + movaps -10 * SIZE(X), %xmm7 + movaps -12 * SIZE(Y), %xmm10 + movaps -10 * SIZE(Y), %xmm11 + + decq %rax + jle .L12 + ALIGN_3 + +.L11: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + movaps -8 * SIZE(Y), %xmm8 + mulpd %xmm4, %xmm12 + movaps -8 * SIZE(X), %xmm4 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm9, %xmm12 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm2 + movaps -6 * SIZE(Y), %xmm9 + mulpd %xmm5, %xmm12 + movaps -6 * SIZE(X), %xmm5 + addpd %xmm12, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + pshufd $0x4e, %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + addpd %xmm10, %xmm0 + movaps -4 * SIZE(Y), %xmm10 + mulpd %xmm6, %xmm12 + movaps -4 * SIZE(X), %xmm6 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm11, %xmm12 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm2 + movaps -2 * SIZE(Y), %xmm11 + mulpd %xmm7, %xmm12 + movaps -2 * SIZE(X), %xmm7 + addpd %xmm12, %xmm3 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + movaps 0 * SIZE(Y), %xmm8 + mulpd %xmm4, %xmm12 + movaps 0 * SIZE(X), %xmm4 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm9, %xmm12 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm2 + movaps 2 * SIZE(Y), %xmm9 + mulpd %xmm5, %xmm12 + movaps 2 * SIZE(X), %xmm5 + addpd %xmm12, %xmm3 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + pshufd $0x4e, %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + addpd %xmm10, %xmm0 + movaps 4 * SIZE(Y), %xmm10 + mulpd %xmm6, %xmm12 + movaps 4 * SIZE(X), %xmm6 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm11, %xmm12 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm2 + movaps 6 * SIZE(Y), %xmm11 + mulpd %xmm7, %xmm12 + movaps 6 * SIZE(X), %xmm7 + addpd %xmm12, %xmm3 + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + + decq %rax + jg .L11 + ALIGN_3 + +.L12: + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + movaps -8 * SIZE(Y), %xmm8 + mulpd %xmm4, %xmm12 + movaps -8 * SIZE(X), %xmm4 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm9, %xmm12 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm2 + movaps -6 * SIZE(Y), %xmm9 + mulpd %xmm5, %xmm12 + movaps -6 * SIZE(X), %xmm5 + addpd %xmm12, %xmm3 + + pshufd $0x4e, %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + addpd %xmm10, %xmm0 + movaps -4 * SIZE(Y), %xmm10 + mulpd %xmm6, %xmm12 + movaps -4 * SIZE(X), 
%xmm6 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm11, %xmm12 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm2 + movaps -2 * SIZE(Y), %xmm11 + mulpd %xmm7, %xmm12 + movaps -2 * SIZE(X), %xmm7 + addpd %xmm12, %xmm3 + + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm12 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm9, %xmm12 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm2 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm3 + + pshufd $0x4e, %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + addpd %xmm10, %xmm0 + mulpd %xmm6, %xmm12 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm11, %xmm12 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm2 + mulpd %xmm7, %xmm12 + addpd %xmm12, %xmm3 + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + ALIGN_3 + +.L15: + testq $4, N + jle .L16 + + movaps -16 * SIZE(X), %xmm4 + movaps -16 * SIZE(Y), %xmm8 + movaps -14 * SIZE(X), %xmm5 + movaps -14 * SIZE(Y), %xmm9 + + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm12 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm9, %xmm12 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm2 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm3 + + movaps -12 * SIZE(X), %xmm6 + movaps -12 * SIZE(Y), %xmm10 + movaps -10 * SIZE(X), %xmm7 + movaps -10 * SIZE(Y), %xmm11 + + pshufd $0x4e, %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + addpd %xmm10, %xmm0 + mulpd %xmm6, %xmm12 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm11, %xmm12 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm2 + mulpd %xmm7, %xmm12 + addpd %xmm12, %xmm3 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L16: + testq $2, N + jle .L17 + + movaps -16 * SIZE(X), %xmm4 + movaps -16 * SIZE(Y), %xmm8 + movaps -14 * SIZE(X), %xmm5 + movaps -14 * SIZE(Y), %xmm9 + + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm12 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm9, %xmm12 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm2 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm3 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L17: + testq $1, N + jle .L98 + + movaps -16 * SIZE(X), %xmm4 + movaps -16 * SIZE(Y), %xmm8 + + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm12 + addpd %xmm12, %xmm1 + jmp .L98 + ALIGN_3 + +.L20: + movq N, %rax + sarq $3, %rax + jle .L25 + + MOVLPS -16 * SIZE(X), %xmm4 + movhps -15 * SIZE(X), %xmm4 + MOVLPS -14 * SIZE(X), %xmm5 + movhps -13 * SIZE(X), %xmm5 + movaps -16 * SIZE(Y), %xmm8 + movaps -14 * SIZE(Y), %xmm9 + MOVLPS -12 * SIZE(X), %xmm6 + movhps -11 * SIZE(X), %xmm6 + MOVLPS -10 * SIZE(X), %xmm7 + movhps -9 * SIZE(X), %xmm7 + movaps -12 * SIZE(Y), %xmm10 + movaps -10 * SIZE(Y), %xmm11 + + decq %rax + jle .L22 + ALIGN_3 + +.L21: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + movaps -8 * SIZE(Y), %xmm8 + mulpd %xmm4, %xmm12 + MOVLPS -8 * SIZE(X), %xmm4 + movhps -7 * SIZE(X), %xmm4 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm9, %xmm12 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm2 + movaps -6 * SIZE(Y), %xmm9 + mulpd %xmm5, %xmm12 + MOVLPS -6 * SIZE(X), %xmm5 + movhps -5 * SIZE(X), %xmm5 + addpd %xmm12, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + pshufd $0x4e, %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + addpd %xmm10, %xmm0 + movaps -4 * SIZE(Y), %xmm10 + mulpd %xmm6, %xmm12 + MOVLPS -4 * SIZE(X), %xmm6 + movhps -3 * SIZE(X), %xmm6 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm11, %xmm12 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm2 + movaps -2 * SIZE(Y), 
%xmm11 + mulpd %xmm7, %xmm12 + MOVLPS -2 * SIZE(X), %xmm7 + movhps -1 * SIZE(X), %xmm7 + addpd %xmm12, %xmm3 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + movaps 0 * SIZE(Y), %xmm8 + mulpd %xmm4, %xmm12 + MOVLPS 0 * SIZE(X), %xmm4 + movhps 1 * SIZE(X), %xmm4 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm9, %xmm12 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm2 + movaps 2 * SIZE(Y), %xmm9 + mulpd %xmm5, %xmm12 + MOVLPS 2 * SIZE(X), %xmm5 + movhps 3 * SIZE(X), %xmm5 + addpd %xmm12, %xmm3 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + pshufd $0x4e, %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + addpd %xmm10, %xmm0 + movaps 4 * SIZE(Y), %xmm10 + mulpd %xmm6, %xmm12 + MOVLPS 4 * SIZE(X), %xmm6 + movhps 5 * SIZE(X), %xmm6 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm11, %xmm12 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm2 + movaps 6 * SIZE(Y), %xmm11 + mulpd %xmm7, %xmm12 + MOVLPS 6 * SIZE(X), %xmm7 + movhps 7 * SIZE(X), %xmm7 + addpd %xmm12, %xmm3 + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + + decq %rax + jg .L21 + ALIGN_3 + +.L22: + + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + movaps -8 * SIZE(Y), %xmm8 + mulpd %xmm4, %xmm12 + MOVLPS -8 * SIZE(X), %xmm4 + movhps -7 * SIZE(X), %xmm4 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm9, %xmm12 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm2 + movaps -6 * SIZE(Y), %xmm9 + mulpd %xmm5, %xmm12 + MOVLPS -6 * SIZE(X), %xmm5 + movhps -5 * SIZE(X), %xmm5 + addpd %xmm12, %xmm3 + + pshufd $0x4e, %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + addpd %xmm10, %xmm0 + movaps -4 * SIZE(Y), %xmm10 + mulpd %xmm6, %xmm12 + MOVLPS -4 * SIZE(X), %xmm6 + movhps -3 * SIZE(X), %xmm6 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm11, %xmm12 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm2 + movaps -2 * SIZE(Y), %xmm11 + mulpd %xmm7, %xmm12 + MOVLPS -2 * SIZE(X), %xmm7 + movhps -1 * SIZE(X), %xmm7 + addpd %xmm12, %xmm3 + + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm12 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm9, %xmm12 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm2 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm3 + + pshufd $0x4e, %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + addpd %xmm10, %xmm0 + mulpd %xmm6, %xmm12 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm11, %xmm12 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm2 + mulpd %xmm7, %xmm12 + addpd %xmm12, %xmm3 + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + ALIGN_3 + +.L25: + testq $4, N + jle .L26 + + MOVLPS -16 * SIZE(X), %xmm4 + movhps -15 * SIZE(X), %xmm4 + movaps -16 * SIZE(Y), %xmm8 + + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm12 + addpd %xmm12, %xmm1 + + MOVLPS -14 * SIZE(X), %xmm5 + movhps -13 * SIZE(X), %xmm5 + movaps -14 * SIZE(Y), %xmm9 + + pshufd $0x4e, %xmm9, %xmm12 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm2 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm3 + + MOVLPS -12 * SIZE(X), %xmm6 + movhps -11 * SIZE(X), %xmm6 + movaps -12 * SIZE(Y), %xmm10 + + pshufd $0x4e, %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + addpd %xmm10, %xmm0 + mulpd %xmm6, %xmm12 + addpd %xmm12, %xmm1 + + MOVLPS -10 * SIZE(X), %xmm7 + movhps -9 * SIZE(X), %xmm7 + movaps -10 * SIZE(Y), %xmm11 + + pshufd $0x4e, %xmm11, %xmm12 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm2 + mulpd %xmm7, %xmm12 + addpd %xmm12, %xmm3 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L26: + testq $2, N + jle .L27 + + 
MOVLPS -16 * SIZE(X), %xmm4 + movhps -15 * SIZE(X), %xmm4 + movaps -16 * SIZE(Y), %xmm8 + + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm12 + addpd %xmm12, %xmm1 + + MOVLPS -14 * SIZE(X), %xmm5 + movhps -13 * SIZE(X), %xmm5 + movaps -14 * SIZE(Y), %xmm9 + + pshufd $0x4e, %xmm9, %xmm12 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm2 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm3 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L27: + testq $1, N + jle .L98 + + MOVLPS -16 * SIZE(X), %xmm4 + movhps -15 * SIZE(X), %xmm4 + movaps -16 * SIZE(Y), %xmm8 + + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm12 + addpd %xmm12, %xmm1 + jmp .L98 + ALIGN_3 + +.L30: + testq $SIZE, X + jne .L40 + + movq N, %rax + sarq $3, %rax + jle .L35 + + MOVLPS -16 * SIZE(Y), %xmm4 + movhps -15 * SIZE(Y), %xmm4 + MOVLPS -14 * SIZE(Y), %xmm5 + movhps -13 * SIZE(Y), %xmm5 + movaps -16 * SIZE(X), %xmm8 + movaps -14 * SIZE(X), %xmm9 + MOVLPS -12 * SIZE(Y), %xmm6 + movhps -11 * SIZE(Y), %xmm6 + MOVLPS -10 * SIZE(Y), %xmm7 + movhps -9 * SIZE(Y), %xmm7 + movaps -12 * SIZE(X), %xmm10 + movaps -10 * SIZE(X), %xmm11 + + decq %rax + jle .L32 + ALIGN_3 + +.L31: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + movaps -8 * SIZE(X), %xmm8 + mulpd %xmm4, %xmm12 + MOVLPS -8 * SIZE(Y), %xmm4 + movhps -7 * SIZE(Y), %xmm4 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm9, %xmm12 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm2 + movaps -6 * SIZE(X), %xmm9 + mulpd %xmm5, %xmm12 + MOVLPS -6 * SIZE(Y), %xmm5 + movhps -5 * SIZE(Y), %xmm5 + addpd %xmm12, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + addpd %xmm10, %xmm0 + movaps -4 * SIZE(X), %xmm10 + mulpd %xmm6, %xmm12 + MOVLPS -4 * SIZE(Y), %xmm6 + movhps -3 * SIZE(Y), %xmm6 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm11, %xmm12 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm2 + movaps -2 * SIZE(X), %xmm11 + mulpd %xmm7, %xmm12 + MOVLPS -2 * SIZE(Y), %xmm7 + movhps -1 * SIZE(Y), %xmm7 + addpd %xmm12, %xmm3 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + movaps 0 * SIZE(X), %xmm8 + mulpd %xmm4, %xmm12 + MOVLPS 0 * SIZE(Y), %xmm4 + movhps 1 * SIZE(Y), %xmm4 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm9, %xmm12 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm2 + movaps 2 * SIZE(X), %xmm9 + mulpd %xmm5, %xmm12 + MOVLPS 2 * SIZE(Y), %xmm5 + movhps 3 * SIZE(Y), %xmm5 + addpd %xmm12, %xmm3 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + addpd %xmm10, %xmm0 + movaps 4 * SIZE(X), %xmm10 + mulpd %xmm6, %xmm12 + MOVLPS 4 * SIZE(Y), %xmm6 + movhps 5 * SIZE(Y), %xmm6 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm11, %xmm12 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm2 + movaps 6 * SIZE(X), %xmm11 + mulpd %xmm7, %xmm12 + MOVLPS 6 * SIZE(Y), %xmm7 + movhps 7 * SIZE(Y), %xmm7 + addpd %xmm12, %xmm3 + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + + decq %rax + jg .L31 + ALIGN_3 + +.L32: + + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + movaps -8 * SIZE(X), %xmm8 + mulpd %xmm4, %xmm12 + MOVLPS -8 * SIZE(Y), %xmm4 + movhps -7 * SIZE(Y), %xmm4 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm9, %xmm12 + mulpd %xmm5, 
%xmm9 + addpd %xmm9, %xmm2 + movaps -6 * SIZE(X), %xmm9 + mulpd %xmm5, %xmm12 + MOVLPS -6 * SIZE(Y), %xmm5 + movhps -5 * SIZE(Y), %xmm5 + addpd %xmm12, %xmm3 + + pshufd $0x4e, %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + addpd %xmm10, %xmm0 + movaps -4 * SIZE(X), %xmm10 + mulpd %xmm6, %xmm12 + MOVLPS -4 * SIZE(Y), %xmm6 + movhps -3 * SIZE(Y), %xmm6 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm11, %xmm12 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm2 + movaps -2 * SIZE(X), %xmm11 + mulpd %xmm7, %xmm12 + MOVLPS -2 * SIZE(Y), %xmm7 + movhps -1 * SIZE(Y), %xmm7 + addpd %xmm12, %xmm3 + + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm12 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm9, %xmm12 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm2 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm3 + + pshufd $0x4e, %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + addpd %xmm10, %xmm0 + mulpd %xmm6, %xmm12 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm11, %xmm12 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm2 + mulpd %xmm7, %xmm12 + addpd %xmm12, %xmm3 + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + ALIGN_3 + +.L35: + testq $4, N + jle .L36 + + MOVLPS -16 * SIZE(Y), %xmm4 + movhps -15 * SIZE(Y), %xmm4 + movaps -16 * SIZE(X), %xmm8 + + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm12 + addpd %xmm12, %xmm1 + + MOVLPS -14 * SIZE(Y), %xmm5 + movhps -13 * SIZE(Y), %xmm5 + movaps -14 * SIZE(X), %xmm9 + + pshufd $0x4e, %xmm9, %xmm12 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm2 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm3 + + MOVLPS -12 * SIZE(Y), %xmm6 + movhps -11 * SIZE(Y), %xmm6 + movaps -12 * SIZE(X), %xmm10 + + pshufd $0x4e, %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + addpd %xmm10, %xmm0 + mulpd %xmm6, %xmm12 + addpd %xmm12, %xmm1 + + MOVLPS -10 * SIZE(Y), %xmm7 + movhps -9 * SIZE(Y), %xmm7 + movaps -10 * SIZE(X), %xmm11 + + pshufd $0x4e, %xmm11, %xmm12 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm2 + mulpd %xmm7, %xmm12 + addpd %xmm12, %xmm3 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L36: + testq $2, N + jle .L37 + + MOVLPS -16 * SIZE(Y), %xmm4 + movhps -15 * SIZE(Y), %xmm4 + movaps -16 * SIZE(X), %xmm8 + + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm12 + addpd %xmm12, %xmm1 + + MOVLPS -14 * SIZE(Y), %xmm5 + movhps -13 * SIZE(Y), %xmm5 + movaps -14 * SIZE(X), %xmm9 + + pshufd $0x4e, %xmm9, %xmm12 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm2 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm3 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L37: + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm3, %xmm3 + + testq $1, N + jle .L98 + + MOVLPS -16 * SIZE(Y), %xmm4 + movhps -15 * SIZE(Y), %xmm4 + movaps -16 * SIZE(X), %xmm8 + + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + addpd %xmm12, %xmm1 + jmp .L98 + ALIGN_3 + +.L40: + movhps -16 * SIZE(X), %xmm4 + addq $SIZE, X + movhps -16 * SIZE(Y), %xmm8 + addq $SIZE, Y + + movq N, %rax + sarq $3, %rax + jle .L45 + + movaps -16 * SIZE(X), %xmm5 + movaps -16 * SIZE(Y), %xmm9 + movaps -14 * SIZE(X), %xmm6 + movaps -14 * SIZE(Y), %xmm10 + movaps -12 * SIZE(X), %xmm7 + movaps -12 * SIZE(Y), %xmm11 + decq %rax + jle .L42 + ALIGN_3 + +.L41: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movsd %xmm9, %xmm8 + pshufd $0x4e, %xmm8, %xmm12 + movsd %xmm5, %xmm4 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + movaps -10 * SIZE(Y), %xmm8 + mulpd %xmm4, %xmm12 + movaps -10 * SIZE(X), %xmm4 + addpd %xmm12, %xmm1 + + 
movsd %xmm10, %xmm9 + pshufd $0x4e, %xmm9, %xmm12 + movsd %xmm6, %xmm5 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm0 + movaps -8 * SIZE(Y), %xmm9 + mulpd %xmm5, %xmm12 + movaps -8 * SIZE(X), %xmm5 + addpd %xmm12, %xmm1 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd %xmm11, %xmm10 + pshufd $0x4e, %xmm10, %xmm12 + movsd %xmm7, %xmm6 + mulpd %xmm6, %xmm10 + addpd %xmm10, %xmm0 + movaps -6 * SIZE(Y), %xmm10 + mulpd %xmm6, %xmm12 + movaps -6 * SIZE(X), %xmm6 + addpd %xmm12, %xmm1 + + movsd %xmm8, %xmm11 + pshufd $0x4e, %xmm11, %xmm12 + movsd %xmm4, %xmm7 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm0 + movaps -4 * SIZE(Y), %xmm11 + mulpd %xmm7, %xmm12 + movaps -4 * SIZE(X), %xmm7 + addpd %xmm12, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movsd %xmm9, %xmm8 + pshufd $0x4e, %xmm8, %xmm12 + movsd %xmm5, %xmm4 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + movaps -2 * SIZE(Y), %xmm8 + mulpd %xmm4, %xmm12 + movaps -2 * SIZE(X), %xmm4 + addpd %xmm12, %xmm1 + + movsd %xmm10, %xmm9 + pshufd $0x4e, %xmm9, %xmm12 + movsd %xmm6, %xmm5 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm0 + movaps 0 * SIZE(Y), %xmm9 + mulpd %xmm5, %xmm12 + movaps 0 * SIZE(X), %xmm5 + addpd %xmm12, %xmm1 + +#if defined(PREFETCH) && !defined(FETCH128) + PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movsd %xmm11, %xmm10 + pshufd $0x4e, %xmm10, %xmm12 + movsd %xmm7, %xmm6 + mulpd %xmm6, %xmm10 + addpd %xmm10, %xmm0 + movaps 2 * SIZE(Y), %xmm10 + mulpd %xmm6, %xmm12 + movaps 2 * SIZE(X), %xmm6 + addpd %xmm12, %xmm1 + + movsd %xmm8, %xmm11 + pshufd $0x4e, %xmm11, %xmm12 + movsd %xmm4, %xmm7 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm0 + movaps 4 * SIZE(Y), %xmm11 + mulpd %xmm7, %xmm12 + movaps 4 * SIZE(X), %xmm7 + addpd %xmm12, %xmm1 + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + + decq %rax + jg .L41 + ALIGN_3 + +.L42: + movsd %xmm9, %xmm8 + pshufd $0x4e, %xmm8, %xmm12 + movsd %xmm5, %xmm4 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + movaps -10 * SIZE(Y), %xmm8 + mulpd %xmm4, %xmm12 + movaps -10 * SIZE(X), %xmm4 + addpd %xmm12, %xmm1 + + movsd %xmm10, %xmm9 + pshufd $0x4e, %xmm9, %xmm12 + movsd %xmm6, %xmm5 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm0 + movaps -8 * SIZE(Y), %xmm9 + mulpd %xmm5, %xmm12 + movaps -8 * SIZE(X), %xmm5 + addpd %xmm12, %xmm1 + + movsd %xmm11, %xmm10 + pshufd $0x4e, %xmm10, %xmm12 + movsd %xmm7, %xmm6 + mulpd %xmm6, %xmm10 + addpd %xmm10, %xmm0 + movaps -6 * SIZE(Y), %xmm10 + mulpd %xmm6, %xmm12 + movaps -6 * SIZE(X), %xmm6 + addpd %xmm12, %xmm1 + + movsd %xmm8, %xmm11 + pshufd $0x4e, %xmm11, %xmm12 + movsd %xmm4, %xmm7 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm0 + movaps -4 * SIZE(Y), %xmm11 + mulpd %xmm7, %xmm12 + movaps -4 * SIZE(X), %xmm7 + addpd %xmm12, %xmm1 + + movsd %xmm9, %xmm8 + pshufd $0x4e, %xmm8, %xmm12 + movsd %xmm5, %xmm4 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + movaps -2 * SIZE(Y), %xmm8 + mulpd %xmm4, %xmm12 + movaps -2 * SIZE(X), %xmm4 + addpd %xmm12, %xmm1 + + movsd %xmm10, %xmm9 + pshufd $0x4e, %xmm9, %xmm12 + movsd %xmm6, %xmm5 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm0 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm1 + + movsd %xmm11, %xmm10 + pshufd $0x4e, %xmm10, %xmm12 + movsd %xmm7, %xmm6 + mulpd %xmm6, %xmm10 + addpd %xmm10, %xmm0 + mulpd %xmm6, %xmm12 + addpd %xmm12, %xmm1 + + movsd %xmm8, %xmm11 + pshufd $0x4e, %xmm11, %xmm12 + movsd %xmm4, %xmm7 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm0 + mulpd %xmm7, %xmm12 + addpd %xmm12, %xmm1 + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + ALIGN_3 + 
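/* Editor's annotation (not part of the imported GotoBLAS2 source): the unrolled
   loops above and the tail cases that follow (.L45/.L46/.L47, and .L55-.L57 for
   the strided path) appear to implement a double-precision complex dot product.
   Each pshufd $0x4e produces the swapped (imag, real) copy of one operand, so
   %xmm0/%xmm2 collect the like-term products and %xmm1/%xmm3 the cross terms;
   the pieces are combined at .L98/.L999, where CONJ selects the conjugated
   variant. A minimal scalar C sketch of the same arithmetic, with hypothetical
   names (zdot_ref, conj_flag) used purely for illustration:

       #include <stddef.h>

       typedef struct { double r, i; } dcomplex;   // interleaved (re, im)

       static dcomplex zdot_ref(size_t n,
                                const double *x, long incx,
                                const double *y, long incy,
                                int conj_flag)      // nonzero = CONJ build
       {
           dcomplex d = {0.0, 0.0};
           for (size_t k = 0; k < n; k++) {
               double xr = x[0], xi = x[1];
               double yr = y[0], yi = y[1];
               if (!conj_flag) {                    // plain product x[k]*y[k]
                   d.r += xr * yr - xi * yi;
                   d.i += xr * yi + xi * yr;
               } else {                             // CONJ path: the cross term
                   d.r += xr * yr + xi * yi;        // changes sign; which operand
                   d.i += xi * yr - xr * yi;        // is conjugated is decided by
               }                                    // the calling interface
               x += 2 * incx;                       // incx/incy in complex elements
               y += 2 * incy;
           }
           return d;
       }

   The assembly keeps four partial sums spread over register pairs to hide the
   multiply latency; the scalar loop above only documents the arithmetic. */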
+.L45: + testq $4, N + jle .L46 + + movaps -16 * SIZE(X), %xmm5 + movaps -16 * SIZE(Y), %xmm9 + movaps -14 * SIZE(X), %xmm6 + movaps -14 * SIZE(Y), %xmm10 + + movsd %xmm9, %xmm8 + pshufd $0x4e, %xmm8, %xmm12 + movsd %xmm5, %xmm4 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm12 + addpd %xmm12, %xmm1 + + movaps -12 * SIZE(X), %xmm7 + movaps -12 * SIZE(Y), %xmm11 + + movsd %xmm10, %xmm9 + pshufd $0x4e, %xmm9, %xmm12 + movsd %xmm6, %xmm5 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm0 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm1 + + movaps -10 * SIZE(X), %xmm4 + movaps -10 * SIZE(Y), %xmm8 + + movsd %xmm11, %xmm10 + pshufd $0x4e, %xmm10, %xmm12 + movsd %xmm7, %xmm6 + mulpd %xmm6, %xmm10 + addpd %xmm10, %xmm0 + mulpd %xmm6, %xmm12 + addpd %xmm12, %xmm1 + + movsd %xmm8, %xmm11 + pshufd $0x4e, %xmm11, %xmm12 + movsd %xmm4, %xmm7 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm0 + mulpd %xmm7, %xmm12 + addpd %xmm12, %xmm1 + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L46: + testq $2, N + jle .L47 + + movaps -16 * SIZE(X), %xmm5 + movaps -16 * SIZE(Y), %xmm9 + + movsd %xmm9, %xmm8 + pshufd $0x4e, %xmm8, %xmm12 + movsd %xmm5, %xmm4 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm12 + addpd %xmm12, %xmm1 + + movaps -14 * SIZE(X), %xmm6 + movaps -14 * SIZE(Y), %xmm10 + + movsd %xmm10, %xmm9 + pshufd $0x4e, %xmm9, %xmm12 + movsd %xmm6, %xmm5 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm0 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm1 + + movaps %xmm6, %xmm4 + movaps %xmm10, %xmm8 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L47: + testq $1, N + jle .L48 + + movlps -16 * SIZE(X), %xmm4 + movlps -16 * SIZE(Y), %xmm8 + + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm12 + addpd %xmm12, %xmm1 + ALIGN_3 + +.L48: + SHUFPD_1 %xmm0, %xmm0 + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm2, %xmm2 + SHUFPD_1 %xmm3, %xmm3 + jmp .L98 + ALIGN_3 + +.L50: + movq N, %rax + sarq $3, %rax + jle .L55 + + MOVLPS 0 * SIZE(X), %xmm4 + movhps 1 * SIZE(X), %xmm4 + addq INCX, X + MOVLPS 0 * SIZE(Y), %xmm8 + movhps 1 * SIZE(Y), %xmm8 + addq INCY, Y + + MOVLPS 0 * SIZE(X), %xmm5 + movhps 1 * SIZE(X), %xmm5 + addq INCX, X + MOVLPS 0 * SIZE(Y), %xmm9 + movhps 1 * SIZE(Y), %xmm9 + addq INCY, Y + + MOVLPS 0 * SIZE(X), %xmm6 + movhps 1 * SIZE(X), %xmm6 + addq INCX, X + MOVLPS 0 * SIZE(Y), %xmm10 + movhps 1 * SIZE(Y), %xmm10 + addq INCY, Y + + MOVLPS 0 * SIZE(X), %xmm7 + movhps 1 * SIZE(X), %xmm7 + addq INCX, X + MOVLPS 0 * SIZE(Y), %xmm11 + movhps 1 * SIZE(Y), %xmm11 + addq INCY, Y + + decq %rax + jle .L54 + ALIGN_3 + +.L53: + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + MOVLPS 0 * SIZE(Y), %xmm8 + movhps 1 * SIZE(Y), %xmm8 + addq INCY, Y + mulpd %xmm4, %xmm12 + MOVLPS 0 * SIZE(X), %xmm4 + movhps 1 * SIZE(X), %xmm4 + addq INCX, X + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm9, %xmm12 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm2 + MOVLPS 0 * SIZE(Y), %xmm9 + movhps 1 * SIZE(Y), %xmm9 + addq INCY, Y + mulpd %xmm5, %xmm12 + MOVLPS 0 * SIZE(X), %xmm5 + movhps 1 * SIZE(X), %xmm5 + addq INCX, X + addpd %xmm12, %xmm3 + + pshufd $0x4e, %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + addpd %xmm10, %xmm0 + MOVLPS 0 * SIZE(Y), %xmm10 + movhps 1 * SIZE(Y), %xmm10 + addq INCY, Y + mulpd %xmm6, %xmm12 + MOVLPS 0 * SIZE(X), %xmm6 + movhps 1 * SIZE(X), %xmm6 + addq INCX, X + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm11, %xmm12 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm2 + MOVLPS 0 * SIZE(Y), %xmm11 + movhps 1 * SIZE(Y), %xmm11 + addq INCY, Y + mulpd %xmm7, %xmm12 + MOVLPS 0 * 
SIZE(X), %xmm7 + movhps 1 * SIZE(X), %xmm7 + addq INCX, X + addpd %xmm12, %xmm3 + + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + MOVLPS 0 * SIZE(Y), %xmm8 + movhps 1 * SIZE(Y), %xmm8 + addq INCY, Y + + mulpd %xmm4, %xmm12 + MOVLPS 0 * SIZE(X), %xmm4 + movhps 1 * SIZE(X), %xmm4 + addq INCX, X + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm9, %xmm12 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm2 + MOVLPS 0 * SIZE(Y), %xmm9 + movhps 1 * SIZE(Y), %xmm9 + addq INCY, Y + + mulpd %xmm5, %xmm12 + MOVLPS 0 * SIZE(X), %xmm5 + movhps 1 * SIZE(X), %xmm5 + addq INCX, X + addpd %xmm12, %xmm3 + + pshufd $0x4e, %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + addpd %xmm10, %xmm0 + MOVLPS 0 * SIZE(Y), %xmm10 + movhps 1 * SIZE(Y), %xmm10 + addq INCY, Y + mulpd %xmm6, %xmm12 + MOVLPS 0 * SIZE(X), %xmm6 + movhps 1 * SIZE(X), %xmm6 + addq INCX, X + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm11, %xmm12 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm2 + MOVLPS 0 * SIZE(Y), %xmm11 + movhps 1 * SIZE(Y), %xmm11 + addq INCY, Y + mulpd %xmm7, %xmm12 + MOVLPS 0 * SIZE(X), %xmm7 + movhps 1 * SIZE(X), %xmm7 + addq INCX, X + addpd %xmm12, %xmm3 + + decq %rax + jg .L53 + ALIGN_3 + +.L54: + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + MOVLPS 0 * SIZE(Y), %xmm8 + movhps 1 * SIZE(Y), %xmm8 + addq INCY, Y + mulpd %xmm4, %xmm12 + MOVLPS 0 * SIZE(X), %xmm4 + movhps 1 * SIZE(X), %xmm4 + addq INCX, X + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm9, %xmm12 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm2 + MOVLPS 0 * SIZE(Y), %xmm9 + movhps 1 * SIZE(Y), %xmm9 + addq INCY, Y + mulpd %xmm5, %xmm12 + MOVLPS 0 * SIZE(X), %xmm5 + movhps 1 * SIZE(X), %xmm5 + addq INCX, X + addpd %xmm12, %xmm3 + + pshufd $0x4e, %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + addpd %xmm10, %xmm0 + MOVLPS 0 * SIZE(Y), %xmm10 + movhps 1 * SIZE(Y), %xmm10 + addq INCY, Y + mulpd %xmm6, %xmm12 + MOVLPS 0 * SIZE(X), %xmm6 + movhps 1 * SIZE(X), %xmm6 + addq INCX, X + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm11, %xmm12 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm2 + MOVLPS 0 * SIZE(Y), %xmm11 + movhps 1 * SIZE(Y), %xmm11 + addq INCY, Y + mulpd %xmm7, %xmm12 + MOVLPS 0 * SIZE(X), %xmm7 + movhps 1 * SIZE(X), %xmm7 + addq INCX, X + addpd %xmm12, %xmm3 + + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm12 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm9, %xmm12 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm2 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm3 + + pshufd $0x4e, %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + addpd %xmm10, %xmm0 + mulpd %xmm6, %xmm12 + addpd %xmm12, %xmm1 + + pshufd $0x4e, %xmm11, %xmm12 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm2 + mulpd %xmm7, %xmm12 + addpd %xmm12, %xmm3 + ALIGN_3 + +.L55: + testq $4, N + jle .L56 + + MOVLPS 0 * SIZE(X), %xmm4 + movhps 1 * SIZE(X), %xmm4 + addq INCX, X + MOVLPS 0 * SIZE(Y), %xmm8 + movhps 1 * SIZE(Y), %xmm8 + addq INCY, Y + + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm12 + addpd %xmm12, %xmm1 + + MOVLPS 0 * SIZE(X), %xmm5 + movhps 1 * SIZE(X), %xmm5 + addq INCX, X + MOVLPS 0 * SIZE(Y), %xmm9 + movhps 1 * SIZE(Y), %xmm9 + addq INCY, Y + + pshufd $0x4e, %xmm9, %xmm12 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm2 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm3 + + MOVLPS 0 * SIZE(X), %xmm6 + movhps 1 * SIZE(X), %xmm6 + addq INCX, X + MOVLPS 0 * SIZE(Y), %xmm10 + movhps 1 * SIZE(Y), %xmm10 + addq INCY, Y + + pshufd $0x4e, %xmm10, %xmm12 + mulpd %xmm6, %xmm10 + addpd %xmm10, %xmm0 + mulpd %xmm6, %xmm12 + addpd %xmm12, %xmm1 + + MOVLPS 0 * 
SIZE(X), %xmm7 + movhps 1 * SIZE(X), %xmm7 + addq INCX, X + MOVLPS 0 * SIZE(Y), %xmm11 + movhps 1 * SIZE(Y), %xmm11 + addq INCY, Y + + pshufd $0x4e, %xmm11, %xmm12 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm2 + mulpd %xmm7, %xmm12 + addpd %xmm12, %xmm3 + ALIGN_3 + +.L56: + testq $2, N + jle .L57 + + MOVLPS 0 * SIZE(X), %xmm4 + movhps 1 * SIZE(X), %xmm4 + addq INCX, X + MOVLPS 0 * SIZE(Y), %xmm8 + movhps 1 * SIZE(Y), %xmm8 + addq INCY, Y + + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm12 + addpd %xmm12, %xmm1 + + MOVLPS 0 * SIZE(X), %xmm5 + movhps 1 * SIZE(X), %xmm5 + addq INCX, X + MOVLPS 0 * SIZE(Y), %xmm9 + movhps 1 * SIZE(Y), %xmm9 + addq INCY, Y + + pshufd $0x4e, %xmm9, %xmm12 + mulpd %xmm5, %xmm9 + addpd %xmm9, %xmm2 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm3 + ALIGN_3 + +.L57: + testq $1, N + jle .L98 + + MOVLPS 0 * SIZE(X), %xmm4 + movhps 1 * SIZE(X), %xmm4 + MOVLPS 0 * SIZE(Y), %xmm8 + movhps 1 * SIZE(Y), %xmm8 + + pshufd $0x4e, %xmm8, %xmm12 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm12 + addpd %xmm12, %xmm1 + ALIGN_3 + +.L98: + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + + pshufd $0x4e, %xmm0, %xmm2 + pshufd $0x4e, %xmm1, %xmm3 + +.L999: +#ifndef CONJ + subsd %xmm2, %xmm0 + addsd %xmm3, %xmm1 +#else + addsd %xmm2, %xmm0 + subsd %xmm3, %xmm1 +#endif + + RESTOREREGISTERS + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm3m_kernel_2x8_nehalem.S b/kernel/x86_64/zgemm3m_kernel_2x8_nehalem.S new file mode 100644 index 0000000000..97eb1ec7a0 --- /dev/null +++ b/kernel/x86_64/zgemm3m_kernel_2x8_nehalem.S @@ -0,0 +1,1933 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define BB %r12 + +#define BX %rdx + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define J 64(%rsp) +#define OFFSET 72(%rsp) +#define KK 80(%rsp) +#define KKK 88(%rsp) + +#else + +#define STACKSIZE 512 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#define ALPHA_R 224(%rsp) +#define ALPHA_I 232(%rsp) +#define J 240(%rsp) +#define OFFSET 248(%rsp) +#define KK 256(%rsp) +#define KKK 264(%rsp) + +#endif + +#define PREFETCHSIZE (8 * 1 - 4) +#define PREFETCH prefetcht0 + +#if defined(OS_LINUX) && defined(CORE_BARCELONA) + .align 32768 +#endif + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movq OLD_OFFSET, %r11 +#endif + movaps %xmm3, %xmm0 + movsd OLD_ALPHA_I, %xmm1 +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movq OLD_OFFSET, %r11 +#endif +#endif + + movlps %xmm0, ALPHA_R + movlps %xmm1, ALPHA_I + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + salq $ZBASE_SHIFT, LDC + +#ifdef TRMMKERNEL + movq %r11, OFFSET +#ifndef LEFT + negq %r11 +#endif + movq %r11, KK +#endif + + movq N, J + sarq $3, J + NOBRANCH + jle .L30 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + leaq (C, LDC, 4), CO2 + movq A, AO + + movq K, %rax + salq $BASE_SHIFT + 3, %rax + leaq (B, %rax), BB + + movq M, I + sarq $1, I + NOBRANCH + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + + prefetcht0 -16 * SIZE(BB) + subq $-8 * SIZE, BB + + xorps %xmm1, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + leaq (LDC, LDC, 2), %rax + + xorps %xmm8, %xmm8 + prefetcht0 3 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht0 7 * SIZE(CO1, LDC, 1) + xorps %xmm10, %xmm10 + prefetcht0 3 * SIZE(CO1, LDC, 2) + xorps %xmm11, %xmm11 + prefetcht0 7 * SIZE(CO1, %rax, 1) + + xorps %xmm12, %xmm12 + prefetcht0 3 * SIZE(CO2) + xorps %xmm13, %xmm13 + prefetcht0 7 * SIZE(CO2, LDC, 1) + xorps %xmm14, %xmm14 + prefetcht0 3 
* SIZE(CO2, LDC, 2) + xorps %xmm15, %xmm15 + prefetcht0 7 * SIZE(CO2, %rax, 1) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $8, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm1, %xmm12 + movaps -16 * SIZE(BO), %xmm6 + addpd %xmm2, %xmm13 + pshufd $0x4e, %xmm6, %xmm2 + mulpd %xmm0, %xmm6 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm14 + movaps -14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + addpd %xmm6, %xmm8 + movaps -12 * SIZE(BO), %xmm6 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm6, %xmm2 + mulpd %xmm0, %xmm6 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -10 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + addpd %xmm6, %xmm12 + movaps -8 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm13 + movaps -14 * SIZE(AO), %xmm5 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm5, %xmm1 + mulpd %xmm5, %xmm2 + + addpd %xmm3, %xmm14 + movaps -6 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm5, %xmm3 + mulpd %xmm5, %xmm4 + + addpd %xmm1, %xmm8 + movaps -4 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm5, %xmm1 + mulpd %xmm5, %xmm2 + + addpd %xmm3, %xmm10 + movaps -2 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm5, %xmm3 + mulpd %xmm5, %xmm4 + + addpd %xmm1, %xmm12 + movaps 0 * SIZE(BO), %xmm6 + addpd %xmm2, %xmm13 + pshufd $0x4e, %xmm6, %xmm2 + mulpd %xmm0, %xmm6 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm14 + movaps 2 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + addpd %xmm6, %xmm8 + movaps 4 * SIZE(BO), %xmm6 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm6, %xmm2 + mulpd %xmm0, %xmm6 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps 6 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + addpd %xmm6, %xmm12 + movaps 8 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm13 + movaps -10 * SIZE(AO), %xmm5 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm5, %xmm1 + mulpd %xmm5, %xmm2 + + addpd %xmm3, %xmm14 + movaps 10 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm5, %xmm3 + mulpd %xmm5, %xmm4 + + addpd %xmm1, %xmm8 + movaps 12 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm5, %xmm1 + mulpd %xmm5, %xmm2 + + addpd %xmm3, %xmm10 + movaps 14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm5, %xmm3 + mulpd %xmm5, %xmm4 + + addq $32 * SIZE, BO + subq $-8 * SIZE, AO + decq %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: + movups ALPHA_R, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + addpd %xmm1, %xmm12 + movaps -16 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm13 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm14 + movaps -14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + addpd %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd 
$0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -10 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_4 + +.L18: + addpd %xmm1, %xmm12 + addpd %xmm2, %xmm13 + addpd %xmm3, %xmm14 + addpd %xmm4, %xmm15 + + movaps %xmm8, %xmm0 + shufpd $2, %xmm9, %xmm8 + shufpd $2, %xmm0, %xmm9 + movaps %xmm10, %xmm0 + shufpd $2, %xmm11, %xmm10 + shufpd $2, %xmm0, %xmm11 + + movaps %xmm12, %xmm0 + shufpd $2, %xmm13, %xmm12 + shufpd $2, %xmm0, %xmm13 + movaps %xmm14, %xmm0 + shufpd $2, %xmm15, %xmm14 + shufpd $2, %xmm0, %xmm15 + + leaq (LDC, LDC, 2), %rax + + movsd 0 * SIZE(CO1), %xmm0 + movhps 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhps 3 * SIZE(CO1), %xmm1 + + movsd 0 * SIZE(CO1, LDC), %xmm2 + movhps 1 * SIZE(CO1, LDC), %xmm2 + movsd 2 * SIZE(CO1, LDC), %xmm3 + movhps 3 * SIZE(CO1, LDC), %xmm3 + + movddup %xmm8, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm8, %xmm8 + mulpd %xmm7, %xmm8 + addpd %xmm8, %xmm1 + + movddup %xmm9, %xmm5 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + + unpckhpd %xmm9, %xmm9 + mulpd %xmm7, %xmm9 + addpd %xmm9, %xmm3 + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + movlps %xmm1, 2 * SIZE(CO1) + movhps %xmm1, 3 * SIZE(CO1) + + movlps %xmm2, 0 * SIZE(CO1, LDC) + movhps %xmm2, 1 * SIZE(CO1, LDC) + movlps %xmm3, 2 * SIZE(CO1, LDC) + movhps %xmm3, 3 * SIZE(CO1, LDC) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm0 + movhps 1 * SIZE(CO1, LDC, 2), %xmm0 + movsd 2 * SIZE(CO1, LDC, 2), %xmm1 + movhps 3 * SIZE(CO1, LDC, 2), %xmm1 + + movsd 0 * SIZE(CO1, %rax), %xmm2 + movhps 1 * SIZE(CO1, %rax), %xmm2 + movsd 2 * SIZE(CO1, %rax), %xmm3 + movhps 3 * SIZE(CO1, %rax), %xmm3 + + movddup %xmm10, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm10, %xmm10 + mulpd %xmm7, %xmm10 + addpd %xmm10, %xmm1 + + movddup %xmm11, %xmm5 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + + unpckhpd %xmm11, %xmm11 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm3 + + movlps %xmm0, 0 * SIZE(CO1, LDC, 2) + movhps %xmm0, 1 * SIZE(CO1, LDC, 2) + movlps %xmm1, 2 * SIZE(CO1, LDC, 2) + movhps %xmm1, 3 * SIZE(CO1, LDC, 2) + + movlps %xmm2, 0 * SIZE(CO1, %rax) + movhps %xmm2, 1 * SIZE(CO1, %rax) + movlps %xmm3, 2 * SIZE(CO1, %rax) + movhps %xmm3, 3 * SIZE(CO1, %rax) + + movsd 0 * SIZE(CO2), %xmm0 + movhps 1 * SIZE(CO2), %xmm0 + movsd 2 * SIZE(CO2), %xmm1 + movhps 3 * SIZE(CO2), %xmm1 + + movsd 0 * SIZE(CO2, LDC), %xmm2 + movhps 1 * SIZE(CO2, LDC), %xmm2 + movsd 2 * SIZE(CO2, LDC), %xmm3 + movhps 3 * SIZE(CO2, LDC), %xmm3 + + movddup %xmm12, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm12, %xmm12 + mulpd %xmm7, %xmm12 + addpd %xmm12, %xmm1 + + movddup %xmm13, %xmm5 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + + unpckhpd %xmm13, %xmm13 + mulpd %xmm7, %xmm13 + addpd %xmm13, %xmm3 + + movlps %xmm0, 0 * SIZE(CO2) + movhps %xmm0, 1 * SIZE(CO2) + movlps %xmm1, 2 * SIZE(CO2) + movhps %xmm1, 3 * SIZE(CO2) + + movlps %xmm2, 0 * SIZE(CO2, LDC) + movhps %xmm2, 1 * SIZE(CO2, LDC) + movlps %xmm3, 2 * SIZE(CO2, LDC) + movhps %xmm3, 3 * SIZE(CO2, LDC) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm0 + movhps 1 * SIZE(CO2, LDC, 2), %xmm0 + movsd 2 * SIZE(CO2, LDC, 2), %xmm1 + movhps 3 * SIZE(CO2, LDC, 2), %xmm1 + + movsd 0 * SIZE(CO2, %rax), %xmm2 + movhps 1 * SIZE(CO2, %rax), %xmm2 + movsd 2 * SIZE(CO2, %rax), %xmm3 + movhps 3 * SIZE(CO2, %rax), %xmm3 + + movddup 
%xmm14, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm14, %xmm14 + mulpd %xmm7, %xmm14 + addpd %xmm14, %xmm1 + + movddup %xmm15, %xmm5 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + + unpckhpd %xmm15, %xmm15 + mulpd %xmm7, %xmm15 + addpd %xmm15, %xmm3 + + movlps %xmm0, 0 * SIZE(CO2, LDC, 2) + movhps %xmm0, 1 * SIZE(CO2, LDC, 2) + movlps %xmm1, 2 * SIZE(CO2, LDC, 2) + movhps %xmm1, 3 * SIZE(CO2, LDC, 2) + + movlps %xmm2, 0 * SIZE(CO2, %rax) + movhps %xmm2, 1 * SIZE(CO2, %rax) + movlps %xmm3, 2 * SIZE(CO2, %rax) + movhps %xmm3, 3 * SIZE(CO2, %rax) + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + + decq I + BRANCH + jg .L11 + ALIGN_4 + +.L20: + testq $1, M + BRANCH + jle .L29 + ALIGN_4 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $8, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_3 + +.L22: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movaps -12 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps -10 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps -8 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -6 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movaps -4 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps -2 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -14 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps 0 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps 2 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movaps 4 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps 6 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -13 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps 8 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps 10 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movaps 12 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps 14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -12 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps 16 * SIZE(BO), %xmm1 + + subq $ -4 * SIZE, AO + subq $-32 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L22 + ALIGN_3 + +.L25: + movups ALPHA_R, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_3 + +.L26: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movaps -12 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps -10 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps -8 * SIZE(BO), %xmm1 + + addq $1 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_4 + +.L28: + leaq (LDC, LDC, 2), %rax + + movsd 0 * SIZE(CO1), %xmm0 + movhps 1 * SIZE(CO1), 
%xmm0 + movsd 0 * SIZE(CO1, LDC), %xmm1 + movhps 1 * SIZE(CO1, LDC), %xmm1 + + movsd 0 * SIZE(CO1, LDC, 2), %xmm2 + movhps 1 * SIZE(CO1, LDC, 2), %xmm2 + movsd 0 * SIZE(CO1, %rax), %xmm3 + movhps 1 * SIZE(CO1, %rax), %xmm3 + + movddup %xmm8, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm8, %xmm8 + mulpd %xmm7, %xmm8 + addpd %xmm8, %xmm1 + + movddup %xmm9, %xmm5 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + + unpckhpd %xmm9, %xmm9 + mulpd %xmm7, %xmm9 + addpd %xmm9, %xmm3 + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC) + movhps %xmm1, 1 * SIZE(CO1, LDC) + + movlps %xmm2, 0 * SIZE(CO1, LDC, 2) + movhps %xmm2, 1 * SIZE(CO1, LDC, 2) + movlps %xmm3, 0 * SIZE(CO1, %rax) + movhps %xmm3, 1 * SIZE(CO1, %rax) + + movsd 0 * SIZE(CO2), %xmm0 + movhps 1 * SIZE(CO2), %xmm0 + movsd 0 * SIZE(CO2, LDC), %xmm1 + movhps 1 * SIZE(CO2, LDC), %xmm1 + + movsd 0 * SIZE(CO2, LDC, 2), %xmm2 + movhps 1 * SIZE(CO2, LDC, 2), %xmm2 + movsd 0 * SIZE(CO2, %rax), %xmm3 + movhps 1 * SIZE(CO2, %rax), %xmm3 + + movddup %xmm10, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm10, %xmm10 + mulpd %xmm7, %xmm10 + addpd %xmm10, %xmm1 + + movddup %xmm11, %xmm5 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + + unpckhpd %xmm11, %xmm11 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm3 + + movlps %xmm0, 0 * SIZE(CO2) + movhps %xmm0, 1 * SIZE(CO2) + movlps %xmm1, 0 * SIZE(CO2, LDC) + movhps %xmm1, 1 * SIZE(CO2, LDC) + + movlps %xmm2, 0 * SIZE(CO2, LDC, 2) + movhps %xmm2, 1 * SIZE(CO2, LDC, 2) + movlps %xmm3, 0 * SIZE(CO2, %rax) + movhps %xmm3, 1 * SIZE(CO2, %rax) + ALIGN_4 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $8, KK +#endif + + movq BO, B + + leaq (C, LDC, 8), C + + subq $1, J + BRANCH + jg .L01 + ALIGN_4 + +.L30: + testq $4, N + jle .L50 + ALIGN_4 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + leaq (C, LDC, 2), CO2 + movq A, AO + + movq M, I + sarq $1, I + NOBRANCH + jle .L40 + ALIGN_4 + +.L31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + xorps %xmm1, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht0 3 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht0 7 * SIZE(CO1, LDC, 1) + xorps %xmm10, %xmm10 + prefetcht0 3 * SIZE(CO2) + xorps %xmm11, %xmm11 + prefetcht0 7 * SIZE(CO2, LDC, 1) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L35 + ALIGN_3 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -10 * 
SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -12 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm8 + movaps -8 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -6 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -10 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm8 + movaps -4 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -2 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -8 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, AO + subq $-16 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L32 + ALIGN_3 + +.L35: + movups ALPHA_R, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_3 + +.L36: + addpd %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + addpd %xmm3, %xmm10 + movaps -14 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L36 + ALIGN_4 + +.L38: + addpd %xmm1, %xmm8 + addpd %xmm2, %xmm9 + addpd %xmm3, %xmm10 + addpd %xmm4, %xmm11 + + movaps %xmm8, %xmm0 + shufpd $2, %xmm9, %xmm8 + shufpd $2, %xmm0, %xmm9 + + movaps %xmm10, %xmm0 + shufpd $2, %xmm11, %xmm10 + shufpd $2, %xmm0, %xmm11 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhps 3 * SIZE(CO1), %xmm1 + + movsd 0 * SIZE(CO1, LDC), %xmm2 + movhps 1 * SIZE(CO1, LDC), %xmm2 + movsd 2 * SIZE(CO1, LDC), %xmm3 + movhps 3 * SIZE(CO1, LDC), %xmm3 + + movddup %xmm8, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm8, %xmm8 + mulpd %xmm7, %xmm8 + addpd %xmm8, %xmm1 + + movddup %xmm9, %xmm5 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + + unpckhpd %xmm9, %xmm9 + mulpd %xmm7, %xmm9 + addpd %xmm9, %xmm3 + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + movlps %xmm1, 2 * SIZE(CO1) + movhps %xmm1, 3 * SIZE(CO1) + + movlps %xmm2, 0 * SIZE(CO1, LDC) + movhps %xmm2, 1 * SIZE(CO1, LDC) + movlps %xmm3, 2 * SIZE(CO1, LDC) + movhps %xmm3, 3 * SIZE(CO1, LDC) + + movsd 0 * SIZE(CO2), %xmm0 + movhps 1 * SIZE(CO2), %xmm0 + movsd 2 * SIZE(CO2), %xmm1 + movhps 3 * SIZE(CO2), %xmm1 + + movsd 0 * SIZE(CO2, LDC), %xmm2 + movhps 1 * SIZE(CO2, LDC), %xmm2 + movsd 2 * SIZE(CO2, LDC), %xmm3 + movhps 3 * SIZE(CO2, LDC), %xmm3 + + movddup %xmm10, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm10, %xmm10 + mulpd %xmm7, %xmm10 + addpd %xmm10, %xmm1 + + movddup %xmm11, %xmm5 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + + unpckhpd %xmm11, %xmm11 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm3 + + movlps %xmm0, 0 * SIZE(CO2) + movhps %xmm0, 1 * SIZE(CO2) + movlps %xmm1, 2 * SIZE(CO2) + movhps %xmm1, 3 * SIZE(CO2) + + movlps %xmm2, 0 * SIZE(CO2, LDC) + movhps %xmm2, 1 * SIZE(CO2, LDC) + movlps %xmm3, 2 * SIZE(CO2, LDC) + movhps %xmm3, 3 * SIZE(CO2, LDC) + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + decq I + BRANCH + jg .L31 + ALIGN_4 + +.L40: + testq $1, M + BRANCH + jle .L49 + ALIGN_4 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 
+ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L45 + ALIGN_3 + +.L42: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movaps -12 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps -10 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -14 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps -8 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -6 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -13 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movaps -4 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movaps -2 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -12 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm11 + movaps 0 * SIZE(BO), %xmm1 + + subq $ -4 * SIZE, AO + subq $-16 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L42 + ALIGN_3 + +.L45: + movups ALPHA_R, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movaps -12 * SIZE(BO), %xmm1 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L46 + ALIGN_4 + +.L48: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 1 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO1, LDC), %xmm1 + movhps 1 * SIZE(CO1, LDC), %xmm1 + + movsd 0 * SIZE(CO2), %xmm2 + movhps 1 * SIZE(CO2), %xmm2 + movsd 0 * SIZE(CO2, LDC), %xmm3 + movhps 1 * SIZE(CO2, LDC), %xmm3 + + movddup %xmm8, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm8, %xmm8 + mulpd %xmm7, %xmm8 + addpd %xmm8, %xmm1 + + movddup %xmm9, %xmm5 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + + unpckhpd %xmm9, %xmm9 + mulpd %xmm7, %xmm9 + addpd %xmm9, %xmm3 + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC) + movhps %xmm1, 1 * SIZE(CO1, LDC) + + movlps %xmm2, 0 * SIZE(CO2) + movhps %xmm2, 1 * SIZE(CO2) + movlps %xmm3, 0 * SIZE(CO2, LDC) + movhps %xmm3, 1 * SIZE(CO2, LDC) + ALIGN_4 + +.L49: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK +#endif + + movq BO, B + + leaq (C, LDC, 4), C + ALIGN_4 + +.L50: + testq $2, N + jle .L70 + ALIGN_4 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + leaq (C, LDC), CO2 + movq A, AO + + movq M, I + sarq $1, I + NOBRANCH + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + + xorps %xmm1, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + + xorps %xmm8, 
%xmm8 + prefetcht0 3 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht0 7 * SIZE(CO2) + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L55 + ALIGN_3 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm10 + movaps -14 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm11 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm10 + movaps -10 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm11 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, AO + subq $-8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L52 + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + ALIGN_3 + +.L55: + movups ALPHA_R, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_3 + +.L56: + addpd %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L56 + ALIGN_4 + +.L58: + addpd %xmm1, %xmm8 + addpd %xmm2, %xmm9 + + movaps %xmm8, %xmm0 + shufpd $2, %xmm9, %xmm8 + shufpd $2, %xmm0, %xmm9 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhps 3 * SIZE(CO1), %xmm1 + + movsd 0 * SIZE(CO2), %xmm2 + movhps 1 * SIZE(CO2), %xmm2 + movsd 2 * SIZE(CO2), %xmm3 + movhps 3 * SIZE(CO2), %xmm3 + + movddup %xmm8, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm8, %xmm8 + mulpd %xmm7, %xmm8 + addpd %xmm8, %xmm1 + + movddup %xmm9, %xmm5 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + + unpckhpd %xmm9, %xmm9 + mulpd %xmm7, %xmm9 + addpd %xmm9, %xmm3 + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + movlps %xmm1, 2 * SIZE(CO1) + movhps %xmm1, 3 * SIZE(CO1) + + movlps %xmm2, 0 * SIZE(CO2) + movhps %xmm2, 1 * SIZE(CO2) + movlps %xmm3, 2 * SIZE(CO2) + movhps %xmm3, 3 * SIZE(CO2) + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + decq I + BRANCH + jg .L51 + ALIGN_4 + +.L60: + testq $1, M + BRANCH + jle .L69 + ALIGN_4 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L65 + ALIGN_3 + +.L62: + mulpd 
%xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + movddup -14 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movaps -12 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + movddup -13 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm8 + movaps -10 * SIZE(BO), %xmm1 + + mulpd %xmm0, %xmm1 + movddup -12 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movaps -8 * SIZE(BO), %xmm1 + + subq $-4 * SIZE, AO + subq $-8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L62 + ALIGN_3 + +.L65: + movups ALPHA_R, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_3 + +.L66: + mulpd %xmm0, %xmm1 + movddup -15 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm8 + movaps -14 * SIZE(BO), %xmm1 + + addq $1 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L66 + ALIGN_4 + +.L68: + addpd %xmm9, %xmm8 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 1 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm1 + movhps 1 * SIZE(CO2), %xmm1 + + movddup %xmm8, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm8, %xmm8 + mulpd %xmm7, %xmm8 + addpd %xmm8, %xmm1 + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO2) + movhps %xmm1, 1 * SIZE(CO2) + ALIGN_4 + +.L69: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + movq BO, B + + leaq (C, LDC, 2), C + ALIGN_4 + +.L70: + testq $1, N + jle .L999 + ALIGN_4 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + movq A, AO + + movq M, I + sarq $1, I + NOBRANCH + jle .L80 + ALIGN_4 + +.L71: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + + xorps %xmm1, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + + xorps %xmm8, %xmm8 + prefetcht0 3 * SIZE(CO1) + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L75 + ALIGN_3 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm1, %xmm8 + movddup -16 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm9 + movddup -15 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movaps -12 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm8 + movddup -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movaps -10 * SIZE(AO), %xmm0 + + addpd %xmm1, %xmm9 + movddup -13 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movaps -8 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, AO + subq $-4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L72 + + addpd %xmm9, %xmm8 + ALIGN_3 + +.L75: + movups ALPHA_R, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_3 + +.L76: + addpd %xmm1, %xmm8 + movddup -16 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L76 + ALIGN_4 + +.L78: + addpd %xmm1, %xmm8 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhps 3 * 
SIZE(CO1), %xmm1 + + movddup %xmm8, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm8, %xmm8 + mulpd %xmm7, %xmm8 + addpd %xmm8, %xmm1 + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + movlps %xmm1, 2 * SIZE(CO1) + movhps %xmm1, 3 * SIZE(CO1) + + addq $4 * SIZE, CO1 + decq I + BRANCH + jg .L71 + ALIGN_4 + +.L80: + testq $1, M + BRANCH + jle .L999 + ALIGN_4 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#ifndef TRMMKERNEL + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + xorps %xmm9, %xmm9 +#else + movsd -16 * SIZE(AO), %xmm0 + movhpd -15 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movsd -16 * SIZE(BO), %xmm1 + movhpd -15 * SIZE(BO), %xmm1 + xorps %xmm9, %xmm9 +#endif + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L85 + ALIGN_3 + +.L82: + mulpd %xmm0, %xmm1 +#ifndef TRMMKERNEL + movapd -14 * SIZE(AO), %xmm0 +#else + movsd -14 * SIZE(AO), %xmm0 + movhpd -13 * SIZE(AO), %xmm0 +#endif + addpd %xmm1, %xmm8 +#ifndef TRMMKERNEL + movapd -14 * SIZE(BO), %xmm1 +#else + movsd -14 * SIZE(BO), %xmm1 + movhpd -13 * SIZE(BO), %xmm1 +#endif + + mulpd %xmm0, %xmm1 +#ifndef TRMMKERNEL + movapd -12 * SIZE(AO), %xmm0 +#else + movsd -12 * SIZE(AO), %xmm0 + movhpd -11 * SIZE(AO), %xmm0 +#endif + addpd %xmm1, %xmm9 +#ifndef TRMMKERNEL + movapd -12 * SIZE(BO), %xmm1 +#else + movsd -12 * SIZE(BO), %xmm1 + movhpd -11 * SIZE(BO), %xmm1 +#endif + + subq $-4 * SIZE, AO + subq $-4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L82 + + addpd %xmm9, %xmm8 + ALIGN_3 + +.L85: + movups ALPHA_R, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L88 + ALIGN_3 + +.L86: + mulsd %xmm0, %xmm1 + movsd -15 * SIZE(AO), %xmm0 + addsd %xmm1, %xmm8 + movsd -15 * SIZE(BO), %xmm1 + + addq $1 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L86 + ALIGN_4 + +.L88: + haddpd %xmm8, %xmm8 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 1 * SIZE(CO1), %xmm0 + + movddup %xmm8, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm3m_kernel_4x2_atom.S b/kernel/x86_64/zgemm3m_kernel_4x2_atom.S new file mode 100644 index 0000000000..189505dd37 --- /dev/null +++ b/kernel/x86_64/zgemm3m_kernel_4x2_atom.S @@ -0,0 +1,1215 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %rdi +#define N %rsi +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define J %r12 +#define AO %r13 +#define BO %r14 +#define CO1 %r15 +#define CO2 %rbx +#define BB %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define OFFSET 64(%rsp) +#define KKK 72(%rsp) +#define KK 80(%rsp) + +#else + +#define STACKSIZE 512 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#define ALPHA_R 224(%rsp) +#define ALPHA_I 232(%rsp) +#define OFFSET 240(%rsp) +#define KK 248(%rsp) +#define KKK 256(%rsp) + +#endif + +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 8 + 3) + +#if defined(OS_LINUX) && defined(CORE_BARCELONA) + .align 32768 +#endif + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, M + movq ARG2, N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + + movaps %xmm3, %xmm0 + movsd OLD_ALPHA_I, %xmm1 + +#else + movq OLD_LDC, LDC 
+#endif + + movsd %xmm0, ALPHA_R + movsd %xmm1, ALPHA_I + + salq $ZBASE_SHIFT, LDC + + movq N, J + sarq $1, J + jle .L40 + ALIGN_4 + +.L10: + movq C, CO1 + leaq (C, LDC, 1), CO2 + leaq (C, LDC, 2), C + + movq A, AO + + movq K, %rax + salq $BASE_SHIFT + 1, %rax + leaq (B, %rax), BB + + movq M, I + sarq $2, I + jle .L20 + ALIGN_4 + +.L11: + movq B, BO + + prefetcht0 0 * SIZE(BB) + subq $-8 * SIZE, BB + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd 1 * SIZE(AO), %xmm4 + xorps %xmm5, %xmm5 + movsd 2 * SIZE(AO), %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movsd 0 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + movsd 1 * SIZE(BO), %xmm3 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + + prefetcht0 3 * SIZE(CO1) + xorps %xmm12, %xmm12 + xorps %xmm13, %xmm13 + prefetcht0 3 * SIZE(CO2) + xorps %xmm14, %xmm14 + xorps %xmm15, %xmm15 + + movq K, %rax + sarq $2, %rax + je .L15 + ALIGN_4 + +.L12: + addsd %xmm2, %xmm13 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm7, %xmm14 + movsd 3 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm15 + PREFETCH ((PREFETCHSIZE) >> 1 + 0) * SIZE(BO) + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + addsd %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm4, %xmm10 + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 2 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm12 + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm13 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm7, %xmm14 + movsd 7 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + addsd %xmm0, %xmm8 + movsd 8 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm4, %xmm10 + movsd 9 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 4 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm12 + movsd 10 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 5 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm13 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm7, %xmm14 + movsd 11 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + addsd %xmm0, %xmm8 + movsd 12 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm4, %xmm10 + movsd 13 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 6 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm12 + movsd 14 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 7 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm13 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm7, %xmm14 + movsd 15 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + subq $-16 * SIZE, AO + + addsd %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + addsd %xmm0, %xmm8 + movsd 0 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + addq $ 8 * SIZE, BO + + addsd %xmm4, %xmm10 + movsd 1 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + decq %rax + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 0 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm12 + movsd 2 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 
1 * SIZE(BO), %xmm3 + + jne .L12 + ALIGN_4 + +.L15: + movq K, %rax + andq $3, %rax + BRANCH + BRANCH + je .L19 + ALIGN_4 + +.L16: + addsd %xmm2, %xmm13 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm7, %xmm14 + movsd 3 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + addsd %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm4, %xmm10 + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 2 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm12 + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + decq %rax + BRANCH + jg .L16 + ALIGN_4 + +.L19: + movsd ALPHA_R, %xmm4 + addsd %xmm2, %xmm13 + movsd ALPHA_I, %xmm5 + addsd %xmm7, %xmm14 + addsd %xmm6, %xmm15 + + movaps %xmm8, %xmm0 + movaps %xmm10, %xmm1 + movaps %xmm12, %xmm2 + movaps %xmm14, %xmm3 + + mulsd %xmm4, %xmm8 + mulsd %xmm5, %xmm0 + mulsd %xmm4, %xmm10 + mulsd %xmm5, %xmm1 + mulsd %xmm4, %xmm12 + mulsd %xmm5, %xmm2 + mulsd %xmm4, %xmm14 + mulsd %xmm5, %xmm3 + + addsd 0 * SIZE(CO1), %xmm8 + addsd 1 * SIZE(CO1), %xmm0 + addsd 2 * SIZE(CO1), %xmm10 + addsd 3 * SIZE(CO1), %xmm1 + addsd 4 * SIZE(CO1), %xmm12 + addsd 5 * SIZE(CO1), %xmm2 + addsd 6 * SIZE(CO1), %xmm14 + addsd 7 * SIZE(CO1), %xmm3 + + movsd %xmm8, 0 * SIZE(CO1) + movsd %xmm0, 1 * SIZE(CO1) + movsd %xmm10, 2 * SIZE(CO1) + movsd %xmm1, 3 * SIZE(CO1) + movsd %xmm12, 4 * SIZE(CO1) + movsd %xmm2, 5 * SIZE(CO1) + movsd %xmm14, 6 * SIZE(CO1) + movsd %xmm3, 7 * SIZE(CO1) + + movaps %xmm9, %xmm0 + movaps %xmm11, %xmm1 + movaps %xmm13, %xmm2 + movaps %xmm15, %xmm3 + + mulsd %xmm4, %xmm9 + mulsd %xmm5, %xmm0 + mulsd %xmm4, %xmm11 + mulsd %xmm5, %xmm1 + mulsd %xmm4, %xmm13 + mulsd %xmm5, %xmm2 + mulsd %xmm4, %xmm15 + mulsd %xmm5, %xmm3 + + addsd 0 * SIZE(CO2), %xmm9 + addsd 1 * SIZE(CO2), %xmm0 + addsd 2 * SIZE(CO2), %xmm11 + addsd 3 * SIZE(CO2), %xmm1 + addsd 4 * SIZE(CO2), %xmm13 + addsd 5 * SIZE(CO2), %xmm2 + addsd 6 * SIZE(CO2), %xmm15 + addsd 7 * SIZE(CO2), %xmm3 + + movsd %xmm9, 0 * SIZE(CO2) + movsd %xmm0, 1 * SIZE(CO2) + movsd %xmm11, 2 * SIZE(CO2) + movsd %xmm1, 3 * SIZE(CO2) + movsd %xmm13, 4 * SIZE(CO2) + movsd %xmm2, 5 * SIZE(CO2) + movsd %xmm15, 6 * SIZE(CO2) + movsd %xmm3, 7 * SIZE(CO2) + + addq $8 * SIZE, CO1 + addq $8 * SIZE, CO2 + + decq I # i -- + jg .L11 + ALIGN_4 + +.L20: + testq $2, M + jle .L30 + + movq B, BO + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd 1 * SIZE(AO), %xmm4 + xorps %xmm5, %xmm5 + movsd 2 * SIZE(AO), %xmm5 + xorps %xmm6, %xmm6 + movsd 3 * SIZE(AO), %xmm7 + + movsd 0 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + movsd 1 * SIZE(BO), %xmm3 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + + movq K, %rax + sarq $2, %rax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addsd %xmm2, %xmm9 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm6, %xmm11 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + movsd 2 * SIZE(BO), %xmm1 + + addsd %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm2 + + addsd %xmm4, %xmm10 + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 4 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm8 + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm2 + 
+ addsd %xmm7, %xmm10 + movsd 7 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm6 + movsd 5 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm9 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm6, %xmm11 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + movsd 6 * SIZE(BO), %xmm1 + + addsd %xmm0, %xmm8 + movsd 8 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm2 + + addsd %xmm4, %xmm10 + movsd 9 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm6 + movsd 7 * SIZE(BO), %xmm3 + + addsd %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + addsd %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 8 * SIZE(BO), %xmm1 + + addsd %xmm5, %xmm8 + movsd 10 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm2 + + addsd %xmm7, %xmm10 + movsd 11 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm6 + movsd 9 * SIZE(BO), %xmm3 + + addq $8 * SIZE, AO + addq $8 * SIZE, BO + + decq %rax + jne .L22 + ALIGN_4 + +.L25: + movq K, %rax + movsd ALPHA_R, %xmm5 + movsd ALPHA_I, %xmm7 + + andq $3, %rax + BRANCH + BRANCH + je .L29 + ALIGN_4 + +.L26: + addsd %xmm2, %xmm9 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + addsd %xmm6, %xmm11 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + movsd 2 * SIZE(BO), %xmm1 + + mulsd %xmm3, %xmm2 + addsd %xmm0, %xmm8 + movsd 2 * SIZE(AO), %xmm0 + + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + addsd %xmm4, %xmm10 + movsd 3 * SIZE(AO), %xmm4 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + decq %rax + BRANCH + jg .L26 + ALIGN_4 + +.L29: + addsd %xmm2, %xmm9 + addsd %xmm6, %xmm11 + + movaps %xmm8, %xmm12 + movaps %xmm10, %xmm13 + movaps %xmm9, %xmm14 + movaps %xmm11, %xmm15 + + mulsd %xmm5, %xmm8 + mulsd %xmm7, %xmm12 + mulsd %xmm5, %xmm10 + mulsd %xmm7, %xmm13 + mulsd %xmm5, %xmm9 + mulsd %xmm7, %xmm14 + mulsd %xmm5, %xmm11 + mulsd %xmm7, %xmm15 + + addsd 0 * SIZE(CO1), %xmm8 + addsd 1 * SIZE(CO1), %xmm12 + addsd 2 * SIZE(CO1), %xmm10 + addsd 3 * SIZE(CO1), %xmm13 + + addsd 0 * SIZE(CO2), %xmm9 + addsd 1 * SIZE(CO2), %xmm14 + addsd 2 * SIZE(CO2), %xmm11 + addsd 3 * SIZE(CO2), %xmm15 + + movsd %xmm8, 0 * SIZE(CO1) + movsd %xmm12, 1 * SIZE(CO1) + movsd %xmm10, 2 * SIZE(CO1) + movsd %xmm13, 3 * SIZE(CO1) + + movsd %xmm9, 0 * SIZE(CO2) + movsd %xmm14, 1 * SIZE(CO2) + movsd %xmm11, 2 * SIZE(CO2) + movsd %xmm15, 3 * SIZE(CO2) + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + ALIGN_4 + +.L30: + testq $1, M + je .L39 + ALIGN_4 + + movq B, BO + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm7, %xmm7 + movsd 1 * SIZE(AO), %xmm2 + xorps %xmm5, %xmm5 + + movsd 0 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + movsd 1 * SIZE(BO), %xmm3 + + movq K, %rax + sarq $2, %rax + je .L35 + ALIGN_4 + +.L32: + addsd %xmm5, %xmm8 + movsd 2 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addsd %xmm7, %xmm9 + movsd 3 * SIZE(BO), %xmm7 + mulsd %xmm0, %xmm3 + movsd 2 * SIZE(AO), %xmm0 + + addsd %xmm1, %xmm8 + movsd 4 * SIZE(BO), %xmm1 + mulsd %xmm2, %xmm5 + + addsd %xmm3, %xmm9 + movsd 5 * SIZE(BO), %xmm3 + mulsd %xmm2, %xmm7 + movsd 3 * SIZE(AO), %xmm2 + + addsd %xmm5, %xmm8 + movsd 6 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm1 + + addsd %xmm7, %xmm9 + movsd 7 * SIZE(BO), %xmm7 + mulsd %xmm0, %xmm3 + movsd 4 * SIZE(AO), %xmm0 + + addsd %xmm1, %xmm8 + movsd 8 * SIZE(BO), %xmm1 + mulsd %xmm2, %xmm5 + + addsd %xmm3, %xmm9 + movsd 9 * SIZE(BO), %xmm3 + mulsd %xmm2, %xmm7 + movsd 5 * SIZE(AO), %xmm2 + + addq $4 * SIZE, AO + addq $8 * SIZE, BO + + decq %rax + jne .L32 + ALIGN_4 + +.L35: + movq K, %rax + + addsd %xmm5, %xmm8 + addsd %xmm7, %xmm9 + + movsd ALPHA_R, %xmm6 + movsd ALPHA_I, %xmm7 + + andq $3, %rax + BRANCH + BRANCH + je .L38 + 
ALIGN_4 + +.L36: + mulsd %xmm0, %xmm1 + addq $2 * SIZE, BO + mulsd %xmm0, %xmm3 + movsd 1 * SIZE(AO), %xmm0 + + addsd %xmm1, %xmm8 + movsd 0 * SIZE(BO), %xmm1 + addsd %xmm3, %xmm9 + movsd 1 * SIZE(BO), %xmm3 + + addq $1 * SIZE, AO + decq %rax + BRANCH + jg .L36 + ALIGN_4 + +.L38: + movaps %xmm8, %xmm10 + movaps %xmm9, %xmm11 + + mulsd %xmm6, %xmm8 + mulsd %xmm7, %xmm10 + mulsd %xmm6, %xmm9 + mulsd %xmm7, %xmm11 + + addsd 0 * SIZE(CO1), %xmm8 + addsd 1 * SIZE(CO1), %xmm10 + addsd 0 * SIZE(CO2), %xmm9 + addsd 1 * SIZE(CO2), %xmm11 + + movsd %xmm8, 0 * SIZE(CO1) + movsd %xmm10, 1 * SIZE(CO1) + movsd %xmm9, 0 * SIZE(CO2) + movsd %xmm11, 1 * SIZE(CO2) + ALIGN_4 + +.L39: + movq BO, B + decq J # j -- + jg .L10 + ALIGN_4 + +.L40: + testq $1, N + je .L999 + + movq C, CO1 + addq LDC, C + + movq A, AO + + movq M, I + sarq $2, I + jle .L50 + ALIGN_4 + +.L41: + movq B, BO + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm9, %xmm9 + movsd 1 * SIZE(AO), %xmm1 + xorps %xmm11, %xmm11 + movsd 2 * SIZE(AO), %xmm2 + xorps %xmm13, %xmm13 + movsd 3 * SIZE(AO), %xmm3 + xorps %xmm15, %xmm15 + + movsd 0 * SIZE(BO), %xmm4 + xorps %xmm8, %xmm8 + movsd 1 * SIZE(BO), %xmm5 + xorps %xmm10, %xmm10 + prefetcht0 7 * SIZE(CO1) + xorps %xmm12, %xmm12 + xorps %xmm14, %xmm14 + + movq K, %rax + sarq $2, %rax + je .L45 + ALIGN_4 + +.L42: + addsd %xmm9, %xmm8 + movsd 4 * SIZE(AO), %xmm9 + mulsd %xmm4, %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addsd %xmm11, %xmm10 + movsd 5 * SIZE(AO), %xmm11 + mulsd %xmm4, %xmm1 + + addsd %xmm13, %xmm12 + movsd 6 * SIZE(AO), %xmm13 + mulsd %xmm4, %xmm2 + + addsd %xmm15, %xmm14 + movsd 7 * SIZE(AO), %xmm15 + mulsd %xmm4, %xmm3 + movsd 2 * SIZE(BO), %xmm4 + + addsd %xmm0, %xmm8 + movsd 8 * SIZE(AO), %xmm0 + mulsd %xmm5, %xmm9 + + addsd %xmm1, %xmm10 + movsd 9 * SIZE(AO), %xmm1 + mulsd %xmm5, %xmm11 + + addsd %xmm2, %xmm12 + movsd 10 * SIZE(AO), %xmm2 + mulsd %xmm5, %xmm13 + + addsd %xmm3, %xmm14 + movsd 11 * SIZE(AO), %xmm3 + mulsd %xmm5, %xmm15 + movsd 3 * SIZE(BO), %xmm5 + + addsd %xmm9, %xmm8 + movsd 12 * SIZE(AO), %xmm9 + mulsd %xmm4, %xmm0 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + + addsd %xmm11, %xmm10 + movsd 13 * SIZE(AO), %xmm11 + mulsd %xmm4, %xmm1 + + addsd %xmm13, %xmm12 + movsd 14 * SIZE(AO), %xmm13 + mulsd %xmm4, %xmm2 + + addsd %xmm15, %xmm14 + movsd 15 * SIZE(AO), %xmm15 + mulsd %xmm4, %xmm3 + movsd 4 * SIZE(BO), %xmm4 + subq $-16 * SIZE, AO + + addsd %xmm0, %xmm8 + movsd 0 * SIZE(AO), %xmm0 + mulsd %xmm5, %xmm9 + + addsd %xmm1, %xmm10 + movsd 1 * SIZE(AO), %xmm1 + mulsd %xmm5, %xmm11 + addq $ 4 * SIZE, BO + + addsd %xmm2, %xmm12 + movsd 2 * SIZE(AO), %xmm2 + mulsd %xmm5, %xmm13 + decq %rax + + addsd %xmm3, %xmm14 + movsd 3 * SIZE(AO), %xmm3 + mulsd %xmm5, %xmm15 + movsd 1 * SIZE(BO), %xmm5 + + jne .L42 + ALIGN_4 + +.L45: + movq K, %rax + + movsd ALPHA_R, %xmm6 + movsd ALPHA_I, %xmm7 + + addsd %xmm9, %xmm8 + addsd %xmm11, %xmm10 + addsd %xmm13, %xmm12 + addsd %xmm15, %xmm14 + + andq $3, %rax + BRANCH + BRANCH + je .L49 + ALIGN_4 + +.L46: + mulsd %xmm4, %xmm0 + mulsd %xmm4, %xmm1 + mulsd %xmm4, %xmm2 + mulsd %xmm4, %xmm3 + movsd 1 * SIZE(BO), %xmm4 + + addsd %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + addsd %xmm1, %xmm10 + movsd 5 * SIZE(AO), %xmm1 + addsd %xmm2, %xmm12 + movsd 6 * SIZE(AO), %xmm2 + addsd %xmm3, %xmm14 + movsd 7 * SIZE(AO), %xmm3 + + addq $4 * SIZE, AO + addq $1 * SIZE, BO + decq %rax + BRANCH + jg .L46 + ALIGN_4 + +.L49: + movaps %xmm8, %xmm9 + movaps %xmm10, %xmm11 + movaps %xmm12, %xmm13 + movaps %xmm14, %xmm15 + + mulsd %xmm6, %xmm8 + mulsd %xmm7, %xmm9 + mulsd 
%xmm6, %xmm10 + mulsd %xmm7, %xmm11 + mulsd %xmm6, %xmm12 + mulsd %xmm7, %xmm13 + mulsd %xmm6, %xmm14 + mulsd %xmm7, %xmm15 + + addsd 0 * SIZE(CO1), %xmm8 + addsd 1 * SIZE(CO1), %xmm9 + addsd 2 * SIZE(CO1), %xmm10 + addsd 3 * SIZE(CO1), %xmm11 + addsd 4 * SIZE(CO1), %xmm12 + addsd 5 * SIZE(CO1), %xmm13 + addsd 6 * SIZE(CO1), %xmm14 + addsd 7 * SIZE(CO1), %xmm15 + + movsd %xmm8, 0 * SIZE(CO1) + movsd %xmm9, 1 * SIZE(CO1) + movsd %xmm10, 2 * SIZE(CO1) + movsd %xmm11, 3 * SIZE(CO1) + movsd %xmm12, 4 * SIZE(CO1) + movsd %xmm13, 5 * SIZE(CO1) + movsd %xmm14, 6 * SIZE(CO1) + movsd %xmm15, 7 * SIZE(CO1) + + addq $8 * SIZE, CO1 + + decq I # i -- + jg .L41 + ALIGN_4 + +.L50: + testq $2, M + jle .L60 + + movq B, BO + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd 1 * SIZE(AO), %xmm1 + xorps %xmm3, %xmm3 + + movsd 0 * SIZE(BO), %xmm4 + xorps %xmm8, %xmm8 + movsd 1 * SIZE(BO), %xmm5 + xorps %xmm10, %xmm10 + + movq K, %rax + sarq $2, %rax + je .L55 + ALIGN_4 + +.L52: + addsd %xmm2, %xmm8 + movsd 2 * SIZE(AO), %xmm2 + mulsd %xmm4, %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addsd %xmm3, %xmm10 + movsd 3 * SIZE(AO), %xmm3 + mulsd %xmm4, %xmm1 + movsd 2 * SIZE(BO), %xmm4 + + addsd %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm5, %xmm2 + addq $8 * SIZE, AO + + addsd %xmm1, %xmm10 + movsd -3 * SIZE(AO), %xmm1 + mulsd %xmm5, %xmm3 + movsd 3 * SIZE(BO), %xmm5 + + addsd %xmm2, %xmm8 + movsd -2 * SIZE(AO), %xmm2 + mulsd %xmm4, %xmm0 + addq $4 * SIZE, BO + + addsd %xmm3, %xmm10 + movsd -1 * SIZE(AO), %xmm3 + mulsd %xmm4, %xmm1 + movsd 0 * SIZE(BO), %xmm4 + + addsd %xmm0, %xmm8 + movsd 0 * SIZE(AO), %xmm0 + mulsd %xmm5, %xmm2 + decq %rax + + addsd %xmm1, %xmm10 + movsd 1 * SIZE(AO), %xmm1 + mulsd %xmm5, %xmm3 + movsd 1 * SIZE(BO), %xmm5 + + jne .L52 + ALIGN_4 + +.L55: + movq K, %rax + movsd ALPHA_R, %xmm6 + movsd ALPHA_I, %xmm7 + + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm10 + + andq $3, %rax + BRANCH + BRANCH + je .L59 + ALIGN_4 + +.L56: + mulsd %xmm4, %xmm0 + mulsd %xmm4, %xmm1 + movsd 1 * SIZE(BO), %xmm4 + + addsd %xmm0, %xmm8 + movsd 2 * SIZE(AO), %xmm0 + addsd %xmm1, %xmm10 + movsd 3 * SIZE(AO), %xmm1 + + addq $2 * SIZE, AO + addq $1 * SIZE, BO + decq %rax + BRANCH + jg .L56 + ALIGN_4 + +.L59: + movaps %xmm8, %xmm9 + movaps %xmm10, %xmm11 + + mulsd %xmm6, %xmm8 + mulsd %xmm7, %xmm9 + mulsd %xmm6, %xmm10 + mulsd %xmm7, %xmm11 + + addsd 0 * SIZE(CO1), %xmm8 + addsd 1 * SIZE(CO1), %xmm9 + addsd 2 * SIZE(CO1), %xmm10 + addsd 3 * SIZE(CO1), %xmm11 + + movsd %xmm8, 0 * SIZE(CO1) + movsd %xmm9, 1 * SIZE(CO1) + movsd %xmm10, 2 * SIZE(CO1) + movsd %xmm11, 3 * SIZE(CO1) + + addq $4 * SIZE, CO1 + ALIGN_4 + +.L60: + testq $1, M + je .L999 + ALIGN_4 + + movq B, BO + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm5, %xmm5 + movsd 1 * SIZE(AO), %xmm2 + xorps %xmm7, %xmm7 + + movsd 0 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + movsd 1 * SIZE(BO), %xmm3 + xorps %xmm9, %xmm9 + movsd 2 * SIZE(AO), %xmm4 + movsd 3 * SIZE(AO), %xmm6 + + movq K, %rax + sarq $2, %rax + je .L65 + ALIGN_4 + +.L62: + addsd %xmm5, %xmm8 + movsd 2 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm1 + movsd 4 * SIZE(AO), %xmm0 + + addsd %xmm7, %xmm9 + movsd 3 * SIZE(BO), %xmm7 + mulsd %xmm2, %xmm3 + movsd 5 * SIZE(AO), %xmm2 + + addsd %xmm1, %xmm8 + movsd 4 * SIZE(BO), %xmm1 + mulsd %xmm4, %xmm5 + movsd 6 * SIZE(AO), %xmm4 + + addsd %xmm3, %xmm9 + movsd 5 * SIZE(BO), %xmm3 + mulsd %xmm6, %xmm7 + movsd 7 * SIZE(AO), %xmm6 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + + decq %rax + jne .L62 + + addsd %xmm5, %xmm8 + addsd %xmm7, %xmm9 + ALIGN_4 + +.L65: + 
movq K, %rax + movsd ALPHA_R, %xmm6 + movsd ALPHA_I, %xmm7 + + andq $3, %rax + BRANCH + BRANCH + je .L68 + ALIGN_4 + +.L66: + movsd 0 * SIZE(AO), %xmm0 + movsd 0 * SIZE(BO), %xmm1 + + mulsd %xmm0, %xmm1 + addsd %xmm1, %xmm8 + + addq $1 * SIZE, AO + addq $1 * SIZE, BO + + decq %rax + BRANCH + jg .L66 + ALIGN_4 + +.L68: + addsd %xmm9, %xmm8 + + movaps %xmm8, %xmm9 + mulsd %xmm6, %xmm8 + mulsd %xmm7, %xmm9 + + addsd 0 * SIZE(CO1), %xmm8 + addsd 1 * SIZE(CO1), %xmm9 + + movsd %xmm8, 0 * SIZE(CO1) + movsd %xmm9, 1 * SIZE(CO1) + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm3m_kernel_4x4_barcelona.S b/kernel/x86_64/zgemm3m_kernel_4x4_barcelona.S new file mode 100644 index 0000000000..4199bd91e9 --- /dev/null +++ b/kernel/x86_64/zgemm3m_kernel_4x4_barcelona.S @@ -0,0 +1,2467 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define BUFFERED + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define CO2 %r12 +#define BB %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define ALPHA 0(%rsp) +#define J 16(%rsp) +#define OFFSET 24(%rsp) +#define KK 32(%rsp) +#define KKK 40(%rsp) +#define BUFFER 512(%rsp) + +#define PREFETCH prefetch +#define PREFETCHSIZE (8 * 21 + 0) + +#define RPREFETCHSIZE (8 * 14 + 0) +#define WPREFETCHSIZE (8 * 6 + 0) + +#define movlpd movsd +#define movapd movups +#define movupd movups + +#define KERNEL1(xx) \ + mulpd %xmm1, %xmm0 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm8 ;\ + movapd %xmm2, %xmm0 ;\ + addpd %xmm1, %xmm12 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO, %rax, 4) ;\ + movddup -14 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm0, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -13 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm0 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm10 ;\ + movapd -12 * SIZE(AO, %rax, 4), %xmm0 ;\ + addpd %xmm1, %xmm14 ;\ + movddup -12 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -11 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm0, %xmm2 + +#define KERNEL2(xx) \ + mulpd %xmm1, %xmm0 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm8 ;\ + movapd %xmm2, %xmm0 ;\ + addpd %xmm1, %xmm12 ;\ + movddup -10 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm0, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -9 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm0 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm10 ;\ + addpd %xmm1, %xmm14 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -7 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm4, %xmm2 + +#define KERNEL3(xx) \ + mulpd %xmm5, %xmm4 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm8 ;\ + movddup (BO, %rax, 4), %xmm1 ;\ + movapd %xmm2, %xmm4 ;\ + addpd %xmm5, %xmm12 ;\ + movddup -6 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm4, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -5 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm4 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm10 ;\ + movapd -4 * SIZE(AO, %rax, 4), %xmm4 ;\ + addpd %xmm5, %xmm14 ;\ + movddup -4 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -3 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm4, %xmm2 + +#define KERNEL4(xx) \ + mulpd %xmm5, %xmm4 ;\ + mulpd -2 * SIZE(AO, %rax, 4), 
%xmm5 ;\ + movapd (AO, %rax, 4), %xmm6 ;\ + addpd %xmm4, %xmm8 ;\ + movapd %xmm2, %xmm4 ;\ + addpd %xmm5, %xmm12 ;\ + movddup -2 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm4, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -1 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm4 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm10 ;\ + addpd %xmm5, %xmm14 ;\ + movddup 8 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 1 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm6, %xmm2 + +#define KERNEL5(xx) \ + mulpd %xmm1, %xmm6 ;\ + mulpd 2 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm6, %xmm8 ;\ + movapd %xmm2, %xmm6 ;\ + addpd %xmm1, %xmm12 ;\ + movddup 2 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd 8 * SIZE(AO, %rax, 4), %xmm7 ;\ + movapd %xmm6, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup 3 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm6 ;\ + mulpd 2 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm6, %xmm10 ;\ + movapd 4 * SIZE(AO, %rax, 4), %xmm6 ;\ + addpd %xmm1, %xmm14 ;\ + movddup 4 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 5 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm6, %xmm2 + +#define KERNEL6(xx) \ + mulpd %xmm1, %xmm6 ;\ + mulpd 6 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm6, %xmm8 ;\ + movapd %xmm2, %xmm6 ;\ + addpd %xmm1, %xmm12 ;\ + movddup 6 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm6, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup 7 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm6 ;\ + mulpd 6 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm6, %xmm10 ;\ + movapd 16 * SIZE(AO, %rax, 4), %xmm0 ;\ + addpd %xmm1, %xmm14 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 9 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm7, %xmm2 + +#define KERNEL7(xx) \ + mulpd %xmm5, %xmm7 ;\ + mulpd 10 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm7, %xmm8 ;\ + movddup 16 * SIZE(BO, %rax, 4), %xmm1 ;\ + movapd %xmm2, %xmm7 ;\ + addpd %xmm5, %xmm12 ;\ + movddup 10 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm7, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup 11 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm7 ;\ + mulpd 10 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm7, %xmm10 ;\ + movapd 12 * SIZE(AO, %rax, 4), %xmm7 ;\ + addpd %xmm5, %xmm14 ;\ + movddup 12 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 13 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm7, %xmm2 + +#define KERNEL8(xx) \ + mulpd %xmm5, %xmm7 ;\ + mulpd 14 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm7, %xmm8 ;\ + movapd %xmm2, %xmm7 ;\ + addpd %xmm5, %xmm12 ;\ + movddup 14 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm7, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup 15 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm7 ;\ + mulpd 14 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm7, %xmm10 ;\ + movapd 24 * SIZE(AO, %rax, 4), %xmm4 ;\ + addpd %xmm5, %xmm14 ;\ 
+ movddup 24 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 17 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm0, %xmm2 ;\ + addq $8 * SIZE, %rax + +#define KERNEL_SUB1(xx) \ + mulpd %xmm1, %xmm0 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm8 ;\ + movapd %xmm2, %xmm0 ;\ + addpd %xmm1, %xmm12 ;\ + movddup -14 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm0, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -13 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm0 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm10 ;\ + movapd -12 * SIZE(AO, %rax, 4), %xmm0 ;\ + addpd %xmm1, %xmm14 ;\ + movddup -12 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -11 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm0, %xmm2 + +#define KERNEL_SUB2(xx) \ + mulpd %xmm1, %xmm0 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm8 ;\ + movapd %xmm2, %xmm0 ;\ + addpd %xmm1, %xmm12 ;\ + movddup -10 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm0, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -9 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm0 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ + addpd %xmm0, %xmm10 ;\ + movapd (AO, %rax, 4), %xmm0 ;\ + addpd %xmm1, %xmm14 ;\ + movddup (BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -7 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm4, %xmm2 + +#define KERNEL_SUB3(xx) \ + mulpd %xmm5, %xmm4 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm8 ;\ + movapd %xmm2, %xmm4 ;\ + addpd %xmm5, %xmm12 ;\ + movddup -6 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm4, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -5 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm4 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm10 ;\ + movapd -4 * SIZE(AO, %rax, 4), %xmm4 ;\ + addpd %xmm5, %xmm14 ;\ + movddup -4 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup -3 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm4, %xmm2 + +#define KERNEL_SUB4(xx) \ + mulpd %xmm5, %xmm4 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm8 ;\ + movapd %xmm2, %xmm4 ;\ + addpd %xmm5, %xmm12 ;\ + movddup -2 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm9 ;\ + movapd %xmm4, %xmm2 ;\ + addpd %xmm3, %xmm13 ;\ + movddup -1 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm4 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ + addpd %xmm4, %xmm10 ;\ + addpd %xmm5, %xmm14 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ + addpd %xmm2, %xmm11 ;\ + addpd %xmm3, %xmm15 ;\ + movddup 1 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm0, %xmm2 + +#ifndef __APPLE__ + .align 512 +#endif +#if defined(OS_LINUX) && defined(CORE_BARCELONA) + .align 32768 +#endif + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + +#ifdef 
WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + movaps %xmm3, %xmm0 + movsd OLD_ALPHA_I, %xmm1 +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif +#endif + + movq %rsp, %rbx # save old stack + subq $1024 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movq OLD_M, M + movq OLD_N, N + + subq $-16 * SIZE, A +#ifndef BUFFERED + subq $-16 * SIZE, B +#endif + + movsd %xmm0, 0 + ALPHA + movsd %xmm1, 8 + ALPHA + + salq $ZBASE_SHIFT, LDC + +#ifdef TRMMKERNEL + movsd %xmm12, OFFSET + movsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + movq N, J + sarq $2, J # j = (n >> 2) + jle .L40 + ALIGN_4 + +.L01: +#ifdef BUFFERED + leaq 16 * SIZE + BUFFER, BO +#endif + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +#ifdef BUFFERED + movq K, %rax + sarq $2, %rax + jle .L03 + ALIGN_3 + +.L02: + prefetch (RPREFETCHSIZE + 0) * SIZE(B) + + movaps (B), %xmm0 + movaps 2 * SIZE(B), %xmm1 + + movaps %xmm0, -16 * SIZE(BO) + movaps %xmm1, -14 * SIZE(BO) + + prefetch (RPREFETCHSIZE + 8) * SIZE(B) + + movaps 4 * SIZE(B), %xmm2 + movaps 6 * SIZE(B), %xmm3 + + movaps %xmm2, -12 * SIZE(BO) + movaps %xmm3, -10 * SIZE(BO) + + prefetchw (WPREFETCHSIZE + 0) * SIZE(BO) + + movaps 8 * SIZE(B), %xmm4 + movaps 10 * SIZE(B), %xmm5 + + movaps %xmm4, -8 * SIZE(BO) + movaps %xmm5, -6 * SIZE(BO) + + prefetchw (WPREFETCHSIZE + 8) * SIZE(BO) + + movaps 12 * SIZE(B), %xmm6 + movaps 14 * SIZE(B), %xmm7 + + movaps %xmm6, -4 * SIZE(BO) + movaps %xmm7, -2 * SIZE(BO) + + subq $-16 * SIZE, BO + subq $-16 * SIZE, B + + subq $1, %rax + jne .L02 + ALIGN_3 + +.L03: + movq K, %rax + andq $3, %rax + BRANCH + jle .L10 + ALIGN_3 + +.L04: + movaps (B), %xmm0 + movaps %xmm0, -16 * SIZE(BO) + + movaps 2 * SIZE(B), %xmm1 + movaps %xmm1, -14 * SIZE(BO) + + addq $4 * SIZE, B + addq $4 * SIZE, BO + subq $1, %rax + jne .L04 + ALIGN_4 + +.L10: +#endif + movq A, AO # aoffset = a + movq B, BB + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef BUFFERED + leaq 16 * SIZE + BUFFER, BO +#else + movq B, BO +#endif +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + + prefetch (RPREFETCHSIZE + 0) * SIZE(BB) + prefetch (RPREFETCHSIZE + 8) * SIZE(BB) + prefetch (RPREFETCHSIZE + 16) * SIZE(BB) + subq $-16 * SIZE, BB + + movapd -16 * SIZE(AO), %xmm0 + movddup -16 * SIZE(BO), %xmm1 + pxor %xmm8, %xmm8 + movddup -15 * SIZE(BO), %xmm3 + pxor %xmm9, %xmm9 + movapd -8 * SIZE(AO), %xmm4 + pxor %xmm10, %xmm10 + movddup -8 * SIZE(BO), %xmm5 + pxor %xmm11, %xmm11 + + prefetchw 7 * SIZE(CO1) + pxor %xmm12, %xmm12 + prefetchw 7 * SIZE(CO2) + pxor %xmm13, %xmm13 + prefetchw 7 * SIZE(CO1, LDC, 2) + pxor %xmm14, %xmm14 + prefetchw 7 * SIZE(CO2, LDC, 2) + pxor %xmm15, %xmm15 + movapd %xmm0, 
%xmm2 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO + negq %rax + NOBRANCH + je .L15 + ALIGN_4 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + BRANCH + jl .L12 + ALIGN_4 + +.L15: + movapd ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + testq $4, %rax + je .L16 + xorq %rax, %rax + ALIGN_4 + + KERNEL_SUB1(16 * 0) + KERNEL_SUB2(16 * 0) + KERNEL_SUB3(16 * 0) + KERNEL_SUB4(16 * 0) + + subq $-16 * SIZE, BO + subq $-16 * SIZE, AO + ALIGN_4 + +.L16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L19 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO + negq %rax + ALIGN_4 + +.L17: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd %xmm2, %xmm0 + addpd %xmm1, %xmm12 + movddup -14 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm3, %xmm2 + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 + addpd %xmm2, %xmm9 + movapd %xmm0, %xmm2 + addpd %xmm3, %xmm13 + movddup -13 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm10 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm14 + movddup -12 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm3, %xmm2 + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 + addpd %xmm2, %xmm11 + addpd %xmm3, %xmm15 + movddup -11 * SIZE(BO, %rax, 4), %xmm3 + movapd %xmm0, %xmm2 + + addq $SIZE, %rax + jl .L17 + ALIGN_4 + +.L19: + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 + + movsd 4 * SIZE(CO1), %xmm2 + movhpd 5 * SIZE(CO1), %xmm2 + movsd 6 * SIZE(CO1), %xmm3 + movhpd 7 * SIZE(CO1), %xmm3 + + movddup %xmm8, %xmm4 + unpckhpd %xmm8, %xmm8 + movddup %xmm12, %xmm5 + unpckhpd %xmm12, %xmm12 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm8 + mulpd %xmm7, %xmm5 + mulpd %xmm7, %xmm12 + + addpd %xmm4, %xmm0 + addpd %xmm8, %xmm1 + addpd %xmm5, %xmm2 + addpd %xmm12, %xmm3 + + movsd %xmm0, 0 * 
SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd %xmm2, 4 * SIZE(CO1) + movhpd %xmm2, 5 * SIZE(CO1) + movsd %xmm3, 6 * SIZE(CO1) + movhpd %xmm3, 7 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhpd 1 * SIZE(CO2), %xmm0 + movsd 2 * SIZE(CO2), %xmm1 + movhpd 3 * SIZE(CO2), %xmm1 + + movsd 4 * SIZE(CO2), %xmm2 + movhpd 5 * SIZE(CO2), %xmm2 + movsd 6 * SIZE(CO2), %xmm3 + movhpd 7 * SIZE(CO2), %xmm3 + + movddup %xmm9, %xmm4 + unpckhpd %xmm9, %xmm9 + movddup %xmm13, %xmm5 + unpckhpd %xmm13, %xmm13 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm9 + mulpd %xmm7, %xmm5 + mulpd %xmm7, %xmm13 + + addpd %xmm4, %xmm0 + addpd %xmm9, %xmm1 + addpd %xmm5, %xmm2 + addpd %xmm13, %xmm3 + + movsd %xmm0, 0 * SIZE(CO2) + movhpd %xmm0, 1 * SIZE(CO2) + movsd %xmm1, 2 * SIZE(CO2) + movhpd %xmm1, 3 * SIZE(CO2) + + movsd %xmm2, 4 * SIZE(CO2) + movhpd %xmm2, 5 * SIZE(CO2) + movsd %xmm3, 6 * SIZE(CO2) + movhpd %xmm3, 7 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm0 + movhpd 1 * SIZE(CO1, LDC, 2), %xmm0 + movsd 2 * SIZE(CO1, LDC, 2), %xmm1 + movhpd 3 * SIZE(CO1, LDC, 2), %xmm1 + + movsd 4 * SIZE(CO1, LDC, 2), %xmm2 + movhpd 5 * SIZE(CO1, LDC, 2), %xmm2 + movsd 6 * SIZE(CO1, LDC, 2), %xmm3 + movhpd 7 * SIZE(CO1, LDC, 2), %xmm3 + + movddup %xmm10, %xmm4 + unpckhpd %xmm10, %xmm10 + movddup %xmm14, %xmm5 + unpckhpd %xmm14, %xmm14 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm10 + mulpd %xmm7, %xmm5 + mulpd %xmm7, %xmm14 + + addpd %xmm4, %xmm0 + addpd %xmm10, %xmm1 + addpd %xmm5, %xmm2 + addpd %xmm14, %xmm3 + + movsd %xmm0, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm0, 1 * SIZE(CO1, LDC, 2) + movsd %xmm1, 2 * SIZE(CO1, LDC, 2) + movhpd %xmm1, 3 * SIZE(CO1, LDC, 2) + + movsd %xmm2, 4 * SIZE(CO1, LDC, 2) + movhpd %xmm2, 5 * SIZE(CO1, LDC, 2) + movsd %xmm3, 6 * SIZE(CO1, LDC, 2) + movhpd %xmm3, 7 * SIZE(CO1, LDC, 2) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm0 + movhpd 1 * SIZE(CO2, LDC, 2), %xmm0 + movsd 2 * SIZE(CO2, LDC, 2), %xmm1 + movhpd 3 * SIZE(CO2, LDC, 2), %xmm1 + + movsd 4 * SIZE(CO2, LDC, 2), %xmm2 + movhpd 5 * SIZE(CO2, LDC, 2), %xmm2 + movsd 6 * SIZE(CO2, LDC, 2), %xmm3 + movhpd 7 * SIZE(CO2, LDC, 2), %xmm3 + + movddup %xmm11, %xmm4 + unpckhpd %xmm11, %xmm11 + movddup %xmm15, %xmm5 + unpckhpd %xmm15, %xmm15 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm11 + mulpd %xmm7, %xmm5 + mulpd %xmm7, %xmm15 + + addpd %xmm4, %xmm0 + addpd %xmm11, %xmm1 + addpd %xmm5, %xmm2 + addpd %xmm15, %xmm3 + + movsd %xmm0, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm0, 1 * SIZE(CO2, LDC, 2) + movsd %xmm1, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm1, 3 * SIZE(CO2, LDC, 2) + + movsd %xmm2, 4 * SIZE(CO2, LDC, 2) + movhpd %xmm2, 5 * SIZE(CO2, LDC, 2) + movsd %xmm3, 6 * SIZE(CO2, LDC, 2) + movhpd %xmm3, 7 * SIZE(CO2, LDC, 2) + + addq $8 * SIZE, CO1 # coffset += 4 + addq $8 * SIZE, CO2 # coffset += 4 + + decq I # i -- + BRANCH + jg .L11 + ALIGN_4 + +.L20: + testq $3, M + je .L39 + + testq $2, M + je .L30 + ALIGN_4 + +.L21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef BUFFERED + leaq 16 * SIZE + BUFFER, BO +#else + movq B, BO +#endif +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd -12 * SIZE(AO), %xmm2 + pxor %xmm9, %xmm9 + movddup -16 * SIZE(BO), %xmm1 + pxor %xmm10, %xmm10 + movddup -15 * SIZE(BO), %xmm5 + pxor %xmm11, %xmm11 + movddup -8 * 
SIZE(BO), %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO + negq %rax + NOBRANCH + je .L26 + ALIGN_4 + +.L22: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movddup -14 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + addpd %xmm5, %xmm9 + movddup -13 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movddup -12 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + movapd -14 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm5, %xmm11 + movddup -11 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movddup -10 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + addpd %xmm5, %xmm9 + movddup -9 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movddup (BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + movapd -8 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm5, %xmm11 + movddup -7 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm8 + movddup -6 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm2, %xmm5 + addpd %xmm5, %xmm9 + movddup -5 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm10 + movddup -4 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm2, %xmm5 + movapd -10 * SIZE(AO, %rax, 2), %xmm2 + addpd %xmm5, %xmm11 + movddup -3 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm8 + movddup -2 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm2, %xmm5 + addpd %xmm5, %xmm9 + movddup -1 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm10 + movddup 8 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm2, %xmm5 + movapd -4 * SIZE(AO, %rax, 2), %xmm2 + addpd %xmm5, %xmm11 + movddup 1 * SIZE(BO, %rax, 4), %xmm5 + + addq $4 * SIZE, %rax + BRANCH + jl .L22 + ALIGN_4 + +.L26: + movapd ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L29 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO + negq %rax + ALIGN_4 + +.L27: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movddup -14 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + addpd %xmm5, %xmm9 + movddup -13 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movddup -12 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + movapd -14 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm5, %xmm11 + movddup -11 * SIZE(BO, %rax, 4), %xmm5 + + addq $SIZE, %rax + jl .L27 + ALIGN_4 + +.L29: + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 + + movddup %xmm8, %xmm4 + unpckhpd %xmm8, %xmm8 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm8 + addpd %xmm4, %xmm0 + addpd %xmm8, %xmm1 + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhpd 1 * SIZE(CO2), %xmm0 + movsd 2 * SIZE(CO2), %xmm1 + movhpd 3 * SIZE(CO2), %xmm1 + + movddup %xmm9, %xmm4 + unpckhpd %xmm9, %xmm9 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm9 + addpd %xmm4, %xmm0 + addpd %xmm9, %xmm1 + + movsd %xmm0, 0 * SIZE(CO2) + movhpd %xmm0, 1 * SIZE(CO2) + movsd %xmm1, 2 * SIZE(CO2) + movhpd %xmm1, 3 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm0 + movhpd 1 * SIZE(CO1, LDC, 2), %xmm0 + movsd 2 * SIZE(CO1, LDC, 2), %xmm1 + movhpd 3 * SIZE(CO1, LDC, 2), %xmm1 + + movddup 
%xmm10, %xmm4 + unpckhpd %xmm10, %xmm10 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm10 + addpd %xmm4, %xmm0 + addpd %xmm10, %xmm1 + + movsd %xmm0, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm0, 1 * SIZE(CO1, LDC, 2) + movsd %xmm1, 2 * SIZE(CO1, LDC, 2) + movhpd %xmm1, 3 * SIZE(CO1, LDC, 2) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm0 + movhpd 1 * SIZE(CO2, LDC, 2), %xmm0 + movsd 2 * SIZE(CO2, LDC, 2), %xmm1 + movhpd 3 * SIZE(CO2, LDC, 2), %xmm1 + + movddup %xmm11, %xmm4 + unpckhpd %xmm11, %xmm11 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm11 + addpd %xmm4, %xmm0 + addpd %xmm11, %xmm1 + + movsd %xmm0, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm0, 1 * SIZE(CO2, LDC, 2) + movsd %xmm1, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm1, 3 * SIZE(CO2, LDC, 2) + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + ALIGN_4 + +.L30: + testq $1, M + je .L39 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef BUFFERED + leaq 16 * SIZE + BUFFER, BO +#else + movq B, BO +#endif +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movddup -14 * SIZE(AO), %xmm2 + pxor %xmm9, %xmm9 + movddup -15 * SIZE(AO), %xmm4 + pxor %xmm10, %xmm10 + movapd -16 * SIZE(BO), %xmm1 + pxor %xmm11, %xmm11 + movapd -8 * SIZE(BO), %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO + negq %rax + NOBRANCH + je .L36 + ALIGN_4 + +.L32: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BO, %rax, 4), %xmm0 + addpd %xmm1, %xmm8 + movapd -12 * SIZE(BO, %rax, 4), %xmm1 + addpd %xmm0, %xmm9 + movddup -12 * SIZE(AO, %rax, 1), %xmm0 + mulpd %xmm4, %xmm1 + mulpd -10 * SIZE(BO, %rax, 4), %xmm4 + addpd %xmm1, %xmm10 + movapd (BO, %rax, 4), %xmm1 + addpd %xmm4, %xmm11 + movddup -11 * SIZE(AO, %rax, 1), %xmm4 + mulpd %xmm2, %xmm3 + mulpd -6 * SIZE(BO, %rax, 4), %xmm2 + addpd %xmm3, %xmm8 + movapd -4 * SIZE(BO, %rax, 4), %xmm3 + addpd %xmm2, %xmm9 + movddup -13 * SIZE(AO, %rax, 1), %xmm2 + mulpd %xmm2, %xmm3 + mulpd -2 * SIZE(BO, %rax, 4), %xmm2 + addpd %xmm3, %xmm10 + movapd 8 * SIZE(BO, %rax, 4), %xmm3 + addpd %xmm2, %xmm11 + movddup -10 * SIZE(AO, %rax, 1), %xmm2 + + addq $4 * SIZE, %rax + BRANCH + jl .L32 + ALIGN_4 + +.L36: + movapd ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L38 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO + negq %rax + ALIGN_4 + +.L37: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BO, %rax, 4), %xmm0 + addpd %xmm1, %xmm8 + movapd -12 * SIZE(BO, %rax, 4), %xmm1 + addpd %xmm0, %xmm9 + movddup -15 * SIZE(AO, %rax, 1), %xmm0 + + addq $SIZE, %rax + jl .L37 + ALIGN_4 + +.L38: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + + + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + + movddup %xmm8, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhpd 1 * SIZE(CO2), %xmm0 + + unpckhpd %xmm8, %xmm8 + mulpd %xmm7, %xmm8 + addpd %xmm8, %xmm0 + + movsd %xmm0, 0 * SIZE(CO2) + movhpd %xmm0, 1 * SIZE(CO2) + + movsd 0 * 
SIZE(CO1, LDC, 2), %xmm0 + movhpd 1 * SIZE(CO1, LDC, 2), %xmm0 + + movddup %xmm9, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + movsd %xmm0, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm0, 1 * SIZE(CO1, LDC, 2) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm0 + movhpd 1 * SIZE(CO2, LDC, 2), %xmm0 + + unpckhpd %xmm9, %xmm9 + mulpd %xmm7, %xmm9 + addpd %xmm9, %xmm0 + + movsd %xmm0, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm0, 1 * SIZE(CO2, LDC, 2) + + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + +#ifndef BUFFERED + movq BO, B +#endif + + leaq (C, LDC, 4), C # c += 4 * ldc + decq J # j -- + jg .L01 + ALIGN_4 + +.L40: + testq $3, N + je .L999 + + testq $2, N + je .L80 + ALIGN_4 + +.L41: +#ifdef BUFFERED + leaq 16 * SIZE + BUFFER, BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +#ifdef BUFFERED + movq K, %rax + sarq $2, %rax + jle .L43 + ALIGN_4 + +.L42: + prefetchnta (RPREFETCHSIZE + 0) * SIZE(B) + + movaps (B), %xmm0 + movaps %xmm0, -16 * SIZE(BO) + + movaps 2 * SIZE(B), %xmm1 + movaps %xmm1, -14 * SIZE(BO) + + prefetchw (WPREFETCHSIZE + 0) * SIZE(BO) + + movaps 4 * SIZE(B), %xmm2 + movaps %xmm2, -12 * SIZE(BO) + + movaps 6 * SIZE(B), %xmm3 + movaps %xmm3, -10 * SIZE(BO) + + subq $-8 * SIZE, BO + subq $-8 * SIZE, B + + subq $1, %rax + jne .L42 + ALIGN_4 + +.L43: + movq K, %rax + andq $3, %rax + BRANCH + jle .L50 + ALIGN_4 + +.L44: + movaps (B), %xmm0 + movaps %xmm0, -16 * SIZE(BO) + + addq $2 * SIZE, B + addq $2 * SIZE, BO + subq $1, %rax + jne .L44 + ALIGN_4 + +.L50: +#endif + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef BUFFERED + leaq 16 * SIZE + BUFFER, BO +#else + movq B, BO +#endif +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + + movddup -16 * SIZE(BO), %xmm1 + movddup -15 * SIZE(BO), %xmm5 + pxor %xmm8, %xmm8 + movddup -12 * SIZE(BO), %xmm3 + pxor %xmm9, %xmm9 + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm12, %xmm12 + movapd -8 * SIZE(AO), %xmm4 + pxor %xmm13, %xmm13 + prefetchw 7 * SIZE(CO1) + movapd %xmm0, %xmm2 + prefetchw 7 * SIZE(CO2) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO + negq %rax + NOBRANCH + je .L56 + ALIGN_4 + +.L52: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm12 + movddup -14 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm5, %xmm2 + mulpd -14 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm9 + addpd %xmm5, %xmm13 + movddup -13 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm0, %xmm2 + mulpd %xmm1, %xmm0 + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd (AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm12 + movddup -8 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm5, %xmm2 + mulpd -10 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm9 + addpd %xmm5, %xmm13 + movddup -11 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm4, %xmm2 + mulpd 
%xmm3, %xmm4 + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 + addpd %xmm4, %xmm8 + movapd -4 * SIZE(AO, %rax, 4), %xmm4 + addpd %xmm3, %xmm12 + movddup -10 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm5, %xmm2 + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm9 + addpd %xmm5, %xmm13 + movddup -9 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm4, %xmm2 + mulpd %xmm3, %xmm4 + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 + addpd %xmm4, %xmm8 + movapd 8 * SIZE(AO, %rax, 4), %xmm4 + addpd %xmm3, %xmm12 + movddup -4 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm5, %xmm2 + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm9 + addpd %xmm5, %xmm13 + movddup -7 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm0, %xmm2 + + addq $4 * SIZE, %rax + BRANCH + jl .L52 + ALIGN_4 + +.L56: + movapd ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L59 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO + negq %rax + ALIGN_4 + +.L57: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm12 + movddup -14 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm5, %xmm2 + mulpd -14 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm9 + addpd %xmm5, %xmm13 + movddup -13 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm0, %xmm2 + + addq $SIZE, %rax + jl .L57 + ALIGN_4 + +.L59: + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 + + movsd 4 * SIZE(CO1), %xmm2 + movhpd 5 * SIZE(CO1), %xmm2 + movsd 6 * SIZE(CO1), %xmm3 + movhpd 7 * SIZE(CO1), %xmm3 + + movddup %xmm8, %xmm4 + unpckhpd %xmm8, %xmm8 + movddup %xmm12, %xmm5 + unpckhpd %xmm12, %xmm12 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm8 + mulpd %xmm7, %xmm5 + mulpd %xmm7, %xmm12 + + addpd %xmm4, %xmm0 + addpd %xmm8, %xmm1 + addpd %xmm5, %xmm2 + addpd %xmm12, %xmm3 + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd %xmm2, 4 * SIZE(CO1) + movhpd %xmm2, 5 * SIZE(CO1) + movsd %xmm3, 6 * SIZE(CO1) + movhpd %xmm3, 7 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhpd 1 * SIZE(CO2), %xmm0 + movsd 2 * SIZE(CO2), %xmm1 + movhpd 3 * SIZE(CO2), %xmm1 + + movsd 4 * SIZE(CO2), %xmm2 + movhpd 5 * SIZE(CO2), %xmm2 + movsd 6 * SIZE(CO2), %xmm3 + movhpd 7 * SIZE(CO2), %xmm3 + + movddup %xmm9, %xmm4 + unpckhpd %xmm9, %xmm9 + movddup %xmm13, %xmm5 + unpckhpd %xmm13, %xmm13 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm9 + mulpd %xmm7, %xmm5 + mulpd %xmm7, %xmm13 + + addpd %xmm4, %xmm0 + addpd %xmm9, %xmm1 + addpd %xmm5, %xmm2 + addpd %xmm13, %xmm3 + + movsd %xmm0, 0 * SIZE(CO2) + movhpd %xmm0, 1 * SIZE(CO2) + movsd %xmm1, 2 * SIZE(CO2) + movhpd %xmm1, 3 * SIZE(CO2) + + movsd %xmm2, 4 * SIZE(CO2) + movhpd %xmm2, 5 * SIZE(CO2) + movsd %xmm3, 6 * SIZE(CO2) + movhpd %xmm3, 7 * SIZE(CO2) + + addq $8 * SIZE, CO1 # coffset += 4 + addq $8 * SIZE, CO2 # coffset += 4 + + decq I # i -- + jg .L51 + ALIGN_4 + +.L60: + testq $2, M + je .L70 + ALIGN_4 + +.L61: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef BUFFERED + leaq 16 * SIZE + BUFFER, BO +#else + movq B, BO +#endif +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd -12 * SIZE(AO), %xmm2 + pxor %xmm9, %xmm9 + movddup -16 * 
SIZE(BO), %xmm1 + pxor %xmm10, %xmm10 + movddup -15 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO + negq %rax + NOBRANCH + je .L66 + ALIGN_4 + +.L62: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movddup -14 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm0, %xmm3 + movapd -14 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm3, %xmm9 + movddup -13 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm10 + movddup -12 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm0, %xmm3 + movapd -8 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm3, %xmm11 + movddup -11 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm8 + movddup -10 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm2, %xmm3 + movapd -10 * SIZE(AO, %rax, 2), %xmm2 + addpd %xmm3, %xmm9 + movddup -9 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm10 + movddup -8 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm2, %xmm3 + movapd -4 * SIZE(AO, %rax, 2), %xmm2 + addpd %xmm3, %xmm11 + movddup -7 * SIZE(BO, %rax, 2), %xmm3 + + addq $4 * SIZE, %rax + BRANCH + jl .L62 + ALIGN_4 + +.L66: + movapd ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L69 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO + negq %rax + ALIGN_4 + +.L67: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movddup -14 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm0, %xmm3 + movapd -14 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm3, %xmm9 + movddup -13 * SIZE(BO, %rax, 2), %xmm3 + + addq $SIZE, %rax + jl .L67 + ALIGN_4 + +.L69: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 + + movddup %xmm8, %xmm4 + unpckhpd %xmm8, %xmm8 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm8 + addpd %xmm4, %xmm0 + addpd %xmm8, %xmm1 + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhpd 1 * SIZE(CO2), %xmm0 + movsd 2 * SIZE(CO2), %xmm1 + movhpd 3 * SIZE(CO2), %xmm1 + + movddup %xmm9, %xmm4 + unpckhpd %xmm9, %xmm9 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm9 + addpd %xmm4, %xmm0 + addpd %xmm9, %xmm1 + + movsd %xmm0, 0 * SIZE(CO2) + movhpd %xmm0, 1 * SIZE(CO2) + movsd %xmm1, 2 * SIZE(CO2) + movhpd %xmm1, 3 * SIZE(CO2) + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L70: + testq $1, M + je .L79 + ALIGN_4 + +.L71: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef BUFFERED + leaq 16 * SIZE + BUFFER, BO +#else + movq B, BO +#endif +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + + movddup -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movddup -15 * SIZE(AO), %xmm1 + pxor %xmm9, %xmm9 + movddup -14 * SIZE(AO), %xmm2 + pxor %xmm10, %xmm10 + movddup -13 * SIZE(AO), %xmm3 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + 
subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO + negq %rax + NOBRANCH + je .L76 + ALIGN_4 + +.L72: + mulpd -16 * SIZE(BO, %rax, 2), %xmm0 + addpd %xmm0, %xmm8 + movddup -12 * SIZE(AO, %rax, 1), %xmm0 + + mulpd -14 * SIZE(BO, %rax, 2), %xmm1 + addpd %xmm1, %xmm9 + movddup -11 * SIZE(AO, %rax, 1), %xmm1 + + mulpd -12 * SIZE(BO, %rax, 2), %xmm2 + addpd %xmm2, %xmm10 + movddup -10 * SIZE(AO, %rax, 1), %xmm2 + + mulpd -10 * SIZE(BO, %rax, 2), %xmm3 + addpd %xmm3, %xmm11 + movddup -9 * SIZE(AO, %rax, 1), %xmm3 + + addq $4 * SIZE, %rax + BRANCH + jl .L72 + ALIGN_4 + +.L76: + movapd ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L78 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO + negq %rax + ALIGN_4 + +.L77: + mulpd -16 * SIZE(BO, %rax, 2), %xmm0 + addpd %xmm0, %xmm8 + movddup -15 * SIZE(AO, %rax, 1), %xmm0 + + addq $SIZE, %rax + jl .L77 + ALIGN_4 + +.L78: + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 + addpd %xmm10, %xmm8 + + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + + movddup %xmm8, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhpd 1 * SIZE(CO2), %xmm0 + + unpckhpd %xmm8, %xmm8 + mulpd %xmm7, %xmm8 + addpd %xmm8, %xmm0 + + movsd %xmm0, 0 * SIZE(CO2) + movhpd %xmm0, 1 * SIZE(CO2) + ALIGN_4 + +.L79: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + +#ifndef BUFFERED + movq BO, B +#endif + + leaq (C, LDC, 2), C + ALIGN_4 + +.L80: + testq $1, N + je .L999 + ALIGN_4 + +.L81: +#ifdef BUFFERED + leaq 16 * SIZE + BUFFER, BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +#ifdef BUFFERED + movq K, %rax + sarq $3, %rax + jle .L83 + ALIGN_4 + +.L82: + prefetchnta (RPREFETCHSIZE + 0) * SIZE(B) + + movaps (B), %xmm0 + movaps %xmm0, -16 * SIZE(BO) + + movaps 2 * SIZE(B), %xmm1 + movaps %xmm1, -14 * SIZE(BO) + + prefetchw (WPREFETCHSIZE + 0) * SIZE(BO) + + movaps 4 * SIZE(B), %xmm2 + movaps %xmm2, -12 * SIZE(BO) + + movaps 6 * SIZE(B), %xmm3 + movaps %xmm3, -10 * SIZE(BO) + + subq $-8 * SIZE, BO + subq $-8 * SIZE, B + + subq $1, %rax + jne .L82 + ALIGN_4 + +.L83: + movq K, %rax + andq $7, %rax + BRANCH + jle .L90 + ALIGN_4 + +.L84: + movsd (B), %xmm0 + movlpd %xmm0, -16 * SIZE(BO) + + addq $1 * SIZE, B + addq $1 * SIZE, BO + decq %rax + jne .L84 + ALIGN_4 + +.L90: +#endif + movq C, CO1 # coffset1 = c + movq A, AO # aoffset = a + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L100 + ALIGN_4 + +.L91: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef BUFFERED + leaq 16 * SIZE + BUFFER, BO +#else + movq B, BO +#endif +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + + movapd -8 * SIZE(AO), %xmm2 + pxor %xmm8, %xmm8 + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm9, %xmm9 + movddup -16 * SIZE(BO), %xmm1 + pxor %xmm12, %xmm12 + movddup -14 * SIZE(BO), %xmm3 + pxor %xmm13, %xmm13 + movddup -15 * SIZE(BO), %xmm5 + + prefetchw 3 * SIZE(CO1) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && 
defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO + negq %rax + NOBRANCH + je .L96 + ALIGN_4 + +.L92: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm12 + movddup -12 * SIZE(BO, %rax, 1), %xmm1 + mulpd %xmm5, %xmm0 + mulpd -10 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm0, %xmm9 + movapd (AO, %rax, 4), %xmm0 + addpd %xmm5, %xmm13 + movddup -13 * SIZE(BO, %rax, 1), %xmm5 + mulpd %xmm3, %xmm2 + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 + addpd %xmm2, %xmm8 + movapd -4 * SIZE(AO, %rax, 4), %xmm2 + addpd %xmm3, %xmm12 + movddup -10 * SIZE(BO, %rax, 1), %xmm3 + mulpd %xmm5, %xmm2 + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 + addpd %xmm2, %xmm9 + movapd 8 * SIZE(AO, %rax, 4), %xmm2 + addpd %xmm5, %xmm13 + movddup -11 * SIZE(BO, %rax, 1), %xmm5 + + addq $4 * SIZE, %rax + BRANCH + jl .L92 + ALIGN_4 + +.L96: + movapd ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L99 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO + negq %rax + ALIGN_4 + +.L97: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + addpd %xmm1, %xmm12 + movddup -15 * SIZE(BO, %rax, 1), %xmm1 + + addq $SIZE, %rax + jl .L97 + ALIGN_4 + +.L99: + addpd %xmm9, %xmm8 + addpd %xmm13, %xmm12 + + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 + + movsd 4 * SIZE(CO1), %xmm2 + movhpd 5 * SIZE(CO1), %xmm2 + movsd 6 * SIZE(CO1), %xmm3 + movhpd 7 * SIZE(CO1), %xmm3 + + movddup %xmm8, %xmm4 + unpckhpd %xmm8, %xmm8 + movddup %xmm12, %xmm5 + unpckhpd %xmm12, %xmm12 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm8 + mulpd %xmm7, %xmm5 + mulpd %xmm7, %xmm12 + + addpd %xmm4, %xmm0 + addpd %xmm8, %xmm1 + addpd %xmm5, %xmm2 + addpd %xmm12, %xmm3 + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd %xmm2, 4 * SIZE(CO1) + movhpd %xmm2, 5 * SIZE(CO1) + movsd %xmm3, 6 * SIZE(CO1) + movhpd %xmm3, 7 * SIZE(CO1) + + addq $8 * SIZE, CO1 # coffset += 4 + + decq I # i -- + jg .L91 + ALIGN_4 + +.L100: + testq $2, M + je .L110 + ALIGN_4 + +.L101: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef BUFFERED + leaq 16 * SIZE + BUFFER, BO +#else + movq B, BO +#endif +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + + movddup -16 * SIZE(BO), %xmm0 + pxor %xmm8, %xmm8 + movddup -15 * SIZE(BO), %xmm1 + pxor %xmm9, %xmm9 + movddup -14 * SIZE(BO), %xmm2 + pxor %xmm10, %xmm10 + movddup -13 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO + negq %rax + NOBRANCH + je .L106 + ALIGN_4 + +.L102: + mulpd -16 * SIZE(AO, %rax, 
2), %xmm0 + addpd %xmm0, %xmm8 + movddup -12 * SIZE(BO, %rax, 1), %xmm0 + + mulpd -14 * SIZE(AO, %rax, 2), %xmm1 + addpd %xmm1, %xmm9 + movddup -11 * SIZE(BO, %rax, 1), %xmm1 + + mulpd -12 * SIZE(AO, %rax, 2), %xmm2 + addpd %xmm2, %xmm10 + movddup -10 * SIZE(BO, %rax, 1), %xmm2 + + mulpd -10 * SIZE(AO, %rax, 2), %xmm3 + addpd %xmm3, %xmm11 + movddup -9 * SIZE(BO, %rax, 1), %xmm3 + + addq $4 * SIZE, %rax + BRANCH + jl .L102 + ALIGN_4 + +.L106: + movapd ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L109 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO + negq %rax + ALIGN_4 + +.L107: + movddup -16 * SIZE(BO, %rax, 1), %xmm0 + mulpd -16 * SIZE(AO, %rax, 2), %xmm0 + addpd %xmm0, %xmm8 + + addq $SIZE, %rax + jl .L107 + ALIGN_4 + +.L109: + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 + addpd %xmm10, %xmm8 + + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 + + movddup %xmm8, %xmm4 + unpckhpd %xmm8, %xmm8 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm8 + addpd %xmm4, %xmm0 + addpd %xmm8, %xmm1 + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + addq $4 * SIZE, CO1 + ALIGN_4 + +.L110: + testq $1, M + je .L999 + ALIGN_4 + +.L111: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + +#ifdef BUFFERED + leaq 16 * SIZE + BUFFER, BO +#else + movq B, BO +#endif +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd -14 * SIZE(AO), %xmm1 + pxor %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO + negq %rax + NOBRANCH + je .L116 + ALIGN_4 + +.L112: + mulpd -16 * SIZE(BO, %rax, 1), %xmm0 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO, %rax, 1), %xmm0 + + mulpd -14 * SIZE(BO, %rax, 1), %xmm1 + addpd %xmm1, %xmm9 + movapd -10 * SIZE(AO, %rax, 1), %xmm1 + + addq $4 * SIZE, %rax + BRANCH + jl .L112 + ALIGN_4 + +.L116: + movapd ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L118 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO + negq %rax + ALIGN_4 + +.L117: + mulsd -16 * SIZE(BO, %rax, 1), %xmm0 + addsd %xmm0, %xmm8 + movsd -15 * SIZE(AO, %rax, 1), %xmm0 + + addq $SIZE, %rax + jl .L117 + ALIGN_4 + +.L118: + addpd %xmm9, %xmm8 + haddpd %xmm8, %xmm8 + + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + + movddup %xmm8, %xmm4 + + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + ALIGN_3 + +.L999: + movq %rbx, %rsp + + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), 
%xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm3m_kernel_4x4_core2.S b/kernel/x86_64/zgemm3m_kernel_4x4_core2.S new file mode 100644 index 0000000000..1b466fb19a --- /dev/null +++ b/kernel/x86_64/zgemm3m_kernel_4x4_core2.S @@ -0,0 +1,2282 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define ALPHA 0(%rsp) +#define J 16(%rsp) +#define OFFSET 24(%rsp) +#define KK 32(%rsp) +#define KKK 40(%rsp) +#define BUFFER 128(%rsp) + +#define PREFETCH_R (8 * 4 + 0) +#define PREFETCH_W (PREFETCH_R * 2) + +#define PREFETCHSIZE (8 * 13 + 5) +#define PREFETCH prefetcht0 + +#if defined(OS_LINUX) && defined(CORE_BARCELONA) + .align 32768 +#endif + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + movaps %xmm3, %xmm0 + movsd OLD_ALPHA_I, %xmm1 +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif +#endif + + movq %rsp, %r15 # save old stack + subq $256 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movsd %xmm0, 0 + ALPHA + movsd %xmm1, 8 + ALPHA + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + movq OLD_M, M + movq OLD_N, N + + salq $ZBASE_SHIFT, LDC + +#ifdef TRMMKERNEL + movsd %xmm12, OFFSET + movsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq N, J + sarq $2, J + NOBRANCH + jle .L40 + ALIGN_4 + +.L01: +/* Copying to Sub Buffer */ + leaq 16 * SIZE + BUFFER, BO + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq K, %rax + sarq $2, %rax + NOBRANCH + jle .L05 + ALIGN_4 + +.L02: + movapd -16 * SIZE(B), %xmm0 + prefetchnta (PREFETCH_R + 0) * SIZE(B) + movapd -14 * SIZE(B), %xmm1 + movapd -12 * SIZE(B), %xmm2 + movapd -10 * SIZE(B), %xmm3 + movapd -8 * SIZE(B), %xmm4 + movapd -6 * SIZE(B), %xmm5 + movapd -4 * SIZE(B), %xmm6 + movapd -2 * SIZE(B), %xmm7 + + movddup %xmm0, %xmm8 + unpckhpd %xmm0, %xmm0 + prefetchnta (PREFETCH_R + 8) * SIZE(B) + movddup %xmm1, %xmm9 + unpckhpd %xmm1, %xmm1 + movddup %xmm2, %xmm10 + unpckhpd %xmm2, %xmm2 + movddup %xmm3, %xmm11 + unpckhpd %xmm3, %xmm3 + + prefetcht0 (PREFETCH_W + 0) * SIZE(BO) + movddup %xmm4, %xmm12 + unpckhpd %xmm4, %xmm4 + movddup %xmm5, %xmm13 + unpckhpd %xmm5, %xmm5 + movddup %xmm6, %xmm14 + unpckhpd %xmm6, %xmm6 + movddup %xmm7, %xmm15 + unpckhpd %xmm7, %xmm7 + + prefetcht0 (PREFETCH_W + 8) * SIZE(BO) + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm0, -14 * SIZE(BO) + movapd %xmm9, -12 * SIZE(BO) + movapd %xmm1, -10 * SIZE(BO) + movapd 
%xmm10, -8 * SIZE(BO) + movapd %xmm2, -6 * SIZE(BO) + movapd %xmm11, -4 * SIZE(BO) + movapd %xmm3, -2 * SIZE(BO) + + prefetcht0 (PREFETCH_W + 16) * SIZE(BO) + movapd %xmm12, 0 * SIZE(BO) + movapd %xmm4, 2 * SIZE(BO) + movapd %xmm13, 4 * SIZE(BO) + movapd %xmm5, 6 * SIZE(BO) + + prefetcht0 (PREFETCH_W + 24) * SIZE(BO) + movapd %xmm14, 8 * SIZE(BO) + movapd %xmm6, 10 * SIZE(BO) + movapd %xmm15, 12 * SIZE(BO) + movapd %xmm7, 14 * SIZE(BO) + + subq $-16 * SIZE, B + subq $-32 * SIZE, BO + decq %rax + BRANCH + jne .L02 + ALIGN_4 + +.L05: + movq K, %rax + andq $3, %rax + BRANCH + BRANCH + jle .L10 + ALIGN_4 + +.L06: + movapd -16 * SIZE(B), %xmm0 + movapd -14 * SIZE(B), %xmm1 + + movddup %xmm0, %xmm8 + unpckhpd %xmm0, %xmm0 + movddup %xmm1, %xmm9 + unpckhpd %xmm1, %xmm1 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm0, -14 * SIZE(BO) + movapd %xmm9, -12 * SIZE(BO) + movapd %xmm1, -10 * SIZE(BO) + + addq $4 * SIZE, B + addq $8 * SIZE, BO + decq %rax + BRANCH + jne .L06 + ALIGN_4 + +.L10: + leaq (PREFETCH_R + 0) * SIZE(B), BB + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + movq M, I + sarq $2, I # i = (m >> 2) + NOBRANCH + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 20 * SIZE + BUFFER, BO +#else + leaq 20 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + movaps -14 * SIZE(AO), %xmm1 + movaps -20 * SIZE(BO), %xmm6 + movaps -18 * SIZE(BO), %xmm7 + + prefetcht2 0 * SIZE(BB) + + pxor %xmm2, %xmm2 + prefetcht0 7 * SIZE(CO1) + pxor %xmm3, %xmm3 + pxor %xmm4, %xmm4 + prefetcht0 7 * SIZE(CO2) + pxor %xmm5, %xmm5 + + movapd %xmm2, %xmm8 + movapd %xmm2, %xmm9 + movapd %xmm2, %xmm10 + prefetcht0 7 * SIZE(CO1, LDC, 2) + movapd %xmm2, %xmm11 + + movapd %xmm2, %xmm12 + movapd %xmm2, %xmm13 + prefetcht0 7 * SIZE(CO2, LDC, 2) + movapd %xmm2, %xmm14 + movapd %xmm2, %xmm15 + + subq $-16 * SIZE, BB + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L15 + ALIGN_4 + +.L12: + PADDING; + addpd %xmm2, %xmm10 + movaps -16 * SIZE(BO), %xmm2 + PADDING; + addpd %xmm3, %xmm14 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movaps %xmm6, %xmm3 + mulpd %xmm0, %xmm6 + mulpd %xmm1, %xmm3 + + addpd %xmm4, %xmm11 + movaps -14 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm15 + movaps %xmm7, %xmm5 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm5 + + addpd %xmm6, %xmm8 + movaps -12 * SIZE(BO), %xmm6 + addpd %xmm3, %xmm12 + movaps %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm7, %xmm9 + movaps -10 * SIZE(BO), %xmm7 + addpd %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm10 + movaps -8 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm14 + movaps -10 * SIZE(AO), %xmm1 + movaps %xmm6, %xmm3 + mulpd %xmm0, %xmm6 + mulpd %xmm1, %xmm3 + + addpd %xmm4, %xmm11 + movaps -6 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm15 + movaps %xmm7, %xmm5 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm5 + + addpd %xmm6, %xmm8 + movaps -4 * SIZE(BO), %xmm6 + addpd %xmm3, %xmm12 + movaps %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + 
+ addpd %xmm7, %xmm9 + movaps -2 * SIZE(BO), %xmm7 + addpd %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm10 + movaps 0 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm14 + movaps -6 * SIZE(AO), %xmm1 + movaps %xmm6, %xmm3 + mulpd %xmm0, %xmm6 + mulpd %xmm1, %xmm3 + + addpd %xmm4, %xmm11 + movaps 2 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm15 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + movaps %xmm7, %xmm5 + mulpd %xmm1, %xmm5 + mulpd %xmm0, %xmm7 + + addpd %xmm6, %xmm8 + movaps 4 * SIZE(BO), %xmm6 + addpd %xmm3, %xmm12 + movaps %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm7, %xmm9 + movaps 6 * SIZE(BO), %xmm7 + addpd %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movaps -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + + addpd %xmm2, %xmm10 + movaps 8 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm14 + movaps -2 * SIZE(AO), %xmm1 + movaps %xmm6, %xmm3 + mulpd %xmm0, %xmm6 + mulpd %xmm1, %xmm3 + + addpd %xmm4, %xmm11 + movaps 10 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm15 + movaps %xmm7, %xmm5 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm5 + + addpd %xmm6, %xmm8 + movaps 12 * SIZE(BO), %xmm6 + addpd %xmm3, %xmm12 + movaps %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + subq $-16 * SIZE, AO + + addpd %xmm7, %xmm9 + movaps 14 * SIZE(BO), %xmm7 + addpd %xmm5, %xmm13 + subq $-32 * SIZE, BO + movaps %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movaps -16 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + movaps -14 * SIZE(AO), %xmm1 + + subq $1, %rax + BRANCH + jg .L12 + ALIGN_4 + +.L15: + prefetcht2 -8 * SIZE(BB) + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_4 + +.L16: + addpd %xmm2, %xmm10 + movaps -16 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm14 + movaps %xmm6, %xmm3 + mulpd %xmm0, %xmm6 + mulpd %xmm1, %xmm3 + + addpd %xmm4, %xmm11 + movaps -14 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm15 + movaps %xmm7, %xmm5 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm5 + + addpd %xmm6, %xmm8 + movaps -12 * SIZE(BO), %xmm6 + addpd %xmm3, %xmm12 + addq $4 * SIZE, AO + movaps %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm7, %xmm9 + movaps -10 * SIZE(BO), %xmm7 + addpd %xmm5, %xmm13 + addq $8 * SIZE, BO + movaps %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movaps -16 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + movaps -14 * SIZE(AO), %xmm1 + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_4 + +.L18: + movapd ALPHA, %xmm7 + + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm14 + addpd %xmm4, %xmm11 + addpd %xmm5, %xmm15 + + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 + + movsd 4 * SIZE(CO1), %xmm2 + movhpd 5 * SIZE(CO1), %xmm2 + movsd 6 * SIZE(CO1), %xmm3 + movhpd 7 * SIZE(CO1), %xmm3 + + movddup %xmm8, %xmm4 + unpckhpd %xmm8, %xmm8 + movddup %xmm12, %xmm5 + unpckhpd %xmm12, %xmm12 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm8 + mulpd %xmm7, %xmm5 + mulpd %xmm7, %xmm12 + + addpd %xmm4, %xmm0 + addpd %xmm8, %xmm1 + addpd %xmm5, %xmm2 + addpd %xmm12, %xmm3 + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd %xmm2, 4 * SIZE(CO1) + movhpd %xmm2, 5 * SIZE(CO1) + movsd %xmm3, 6 * SIZE(CO1) + movhpd %xmm3, 7 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhpd 1 * SIZE(CO2), %xmm0 + movsd 2 * SIZE(CO2), %xmm1 + movhpd 3 * SIZE(CO2), %xmm1 + + movsd 4 * SIZE(CO2), %xmm2 + movhpd 5 * SIZE(CO2), %xmm2 + movsd 6 * SIZE(CO2), %xmm3 + movhpd 7 * SIZE(CO2), 
%xmm3 + + movddup %xmm9, %xmm4 + unpckhpd %xmm9, %xmm9 + movddup %xmm13, %xmm5 + unpckhpd %xmm13, %xmm13 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm9 + mulpd %xmm7, %xmm5 + mulpd %xmm7, %xmm13 + + addpd %xmm4, %xmm0 + addpd %xmm9, %xmm1 + addpd %xmm5, %xmm2 + addpd %xmm13, %xmm3 + + movsd %xmm0, 0 * SIZE(CO2) + movhpd %xmm0, 1 * SIZE(CO2) + movsd %xmm1, 2 * SIZE(CO2) + movhpd %xmm1, 3 * SIZE(CO2) + + movsd %xmm2, 4 * SIZE(CO2) + movhpd %xmm2, 5 * SIZE(CO2) + movsd %xmm3, 6 * SIZE(CO2) + movhpd %xmm3, 7 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm0 + movhpd 1 * SIZE(CO1, LDC, 2), %xmm0 + movsd 2 * SIZE(CO1, LDC, 2), %xmm1 + movhpd 3 * SIZE(CO1, LDC, 2), %xmm1 + + movsd 4 * SIZE(CO1, LDC, 2), %xmm2 + movhpd 5 * SIZE(CO1, LDC, 2), %xmm2 + movsd 6 * SIZE(CO1, LDC, 2), %xmm3 + movhpd 7 * SIZE(CO1, LDC, 2), %xmm3 + + movddup %xmm10, %xmm4 + unpckhpd %xmm10, %xmm10 + movddup %xmm14, %xmm5 + unpckhpd %xmm14, %xmm14 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm10 + mulpd %xmm7, %xmm5 + mulpd %xmm7, %xmm14 + + addpd %xmm4, %xmm0 + addpd %xmm10, %xmm1 + addpd %xmm5, %xmm2 + addpd %xmm14, %xmm3 + + movsd %xmm0, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm0, 1 * SIZE(CO1, LDC, 2) + movsd %xmm1, 2 * SIZE(CO1, LDC, 2) + movhpd %xmm1, 3 * SIZE(CO1, LDC, 2) + + movsd %xmm2, 4 * SIZE(CO1, LDC, 2) + movhpd %xmm2, 5 * SIZE(CO1, LDC, 2) + movsd %xmm3, 6 * SIZE(CO1, LDC, 2) + movhpd %xmm3, 7 * SIZE(CO1, LDC, 2) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm0 + movhpd 1 * SIZE(CO2, LDC, 2), %xmm0 + movsd 2 * SIZE(CO2, LDC, 2), %xmm1 + movhpd 3 * SIZE(CO2, LDC, 2), %xmm1 + + movsd 4 * SIZE(CO2, LDC, 2), %xmm2 + movhpd 5 * SIZE(CO2, LDC, 2), %xmm2 + movsd 6 * SIZE(CO2, LDC, 2), %xmm3 + movhpd 7 * SIZE(CO2, LDC, 2), %xmm3 + + movddup %xmm11, %xmm4 + unpckhpd %xmm11, %xmm11 + movddup %xmm15, %xmm5 + unpckhpd %xmm15, %xmm15 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm11 + mulpd %xmm7, %xmm5 + mulpd %xmm7, %xmm15 + + addpd %xmm4, %xmm0 + addpd %xmm11, %xmm1 + addpd %xmm5, %xmm2 + addpd %xmm15, %xmm3 + + movsd %xmm0, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm0, 1 * SIZE(CO2, LDC, 2) + movsd %xmm1, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm1, 3 * SIZE(CO2, LDC, 2) + + movsd %xmm2, 4 * SIZE(CO2, LDC, 2) + movhpd %xmm2, 5 * SIZE(CO2, LDC, 2) + movsd %xmm3, 6 * SIZE(CO2, LDC, 2) + movhpd %xmm3, 7 * SIZE(CO2, LDC, 2) + + addq $8 * SIZE, CO1 # coffset += 4 + addq $8 * SIZE, CO2 # coffset += 4 + decq I # i -- + BRANCH + jg .L11 + ALIGN_4 + +.L20: + testq $2, M + BRANCH + jle .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 16 * SIZE + BUFFER, BO +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + + pxor %xmm8, %xmm8 + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + + movapd %xmm8, %xmm2 + movapd %xmm9, %xmm3 + movapd %xmm10, %xmm4 + movapd %xmm11, %xmm5 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_4 + +.L21: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm2, %xmm8 + movapd -16 * SIZE(BO), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm3, %xmm9 + movapd -14 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm4, %xmm10 + movapd -12 * 
SIZE(BO), %xmm4 + mulpd %xmm0, %xmm4 + addpd %xmm5, %xmm11 + movapd -10 * SIZE(BO), %xmm5 + mulpd %xmm0, %xmm5 + movapd -14 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm8 + movapd -8 * SIZE(BO), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm3, %xmm9 + movapd -6 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm4, %xmm10 + movapd -4 * SIZE(BO), %xmm4 + mulpd %xmm0, %xmm4 + addpd %xmm5, %xmm11 + movapd -2 * SIZE(BO), %xmm5 + mulpd %xmm0, %xmm5 + movapd -12 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm8 + movapd 0 * SIZE(BO), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm3, %xmm9 + movapd 2 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm4, %xmm10 + movapd 4 * SIZE(BO), %xmm4 + mulpd %xmm0, %xmm4 + addpd %xmm5, %xmm11 + movapd 6 * SIZE(BO), %xmm5 + mulpd %xmm0, %xmm5 + movapd -10 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm8 + movapd 8 * SIZE(BO), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm3, %xmm9 + movapd 10 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm4, %xmm10 + movapd 12 * SIZE(BO), %xmm4 + mulpd %xmm0, %xmm4 + addpd %xmm5, %xmm11 + movapd 14 * SIZE(BO), %xmm5 + mulpd %xmm0, %xmm5 + movapd -8 * SIZE(AO), %xmm0 + + subq $ -8 * SIZE, AO + subq $-32 * SIZE, BO + subq $1, %rax + BRANCH + jg .L21 + ALIGN_4 + +.L25: + movapd ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + addpd %xmm2, %xmm8 + movapd -16 * SIZE(BO), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm3, %xmm9 + movapd -14 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm4, %xmm10 + movapd -12 * SIZE(BO), %xmm4 + mulpd %xmm0, %xmm4 + addpd %xmm5, %xmm11 + movapd -10 * SIZE(BO), %xmm5 + mulpd %xmm0, %xmm5 + movapd -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L26 + ALIGN_4 + +.L28: + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm11 + + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 + + movddup %xmm8, %xmm4 + unpckhpd %xmm8, %xmm8 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm8 + addpd %xmm4, %xmm0 + addpd %xmm8, %xmm1 + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhpd 1 * SIZE(CO2), %xmm0 + movsd 2 * SIZE(CO2), %xmm1 + movhpd 3 * SIZE(CO2), %xmm1 + + movddup %xmm9, %xmm4 + unpckhpd %xmm9, %xmm9 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm9 + addpd %xmm4, %xmm0 + addpd %xmm9, %xmm1 + + movsd %xmm0, 0 * SIZE(CO2) + movhpd %xmm0, 1 * SIZE(CO2) + movsd %xmm1, 2 * SIZE(CO2) + movhpd %xmm1, 3 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm0 + movhpd 1 * SIZE(CO1, LDC, 2), %xmm0 + movsd 2 * SIZE(CO1, LDC, 2), %xmm1 + movhpd 3 * SIZE(CO1, LDC, 2), %xmm1 + + movddup %xmm10, %xmm4 + unpckhpd %xmm10, %xmm10 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm10 + addpd %xmm4, %xmm0 + addpd %xmm10, %xmm1 + + movsd %xmm0, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm0, 1 * SIZE(CO1, LDC, 2) + movsd %xmm1, 2 * SIZE(CO1, LDC, 2) + movhpd %xmm1, 3 * SIZE(CO1, LDC, 2) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm0 + movhpd 1 * SIZE(CO2, LDC, 2), %xmm0 + movsd 2 * SIZE(CO2, LDC, 2), %xmm1 + movhpd 3 * SIZE(CO2, LDC, 2), %xmm1 + + movddup %xmm11, %xmm4 + unpckhpd %xmm11, %xmm11 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm11 + addpd %xmm4, %xmm0 + addpd %xmm11, %xmm1 + + movsd %xmm0, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm0, 1 * SIZE(CO2, LDC, 2) + movsd %xmm1, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm1, 3 * SIZE(CO2, LDC, 2) + + addq $4 * SIZE, CO1 # coffset += 4 + 
addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L30: + testq $1, M + BRANCH + jle .L39 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 16 * SIZE + BUFFER, BO +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + + pxor %xmm8, %xmm8 + movsd -16 * SIZE(AO), %xmm0 + + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + + movapd %xmm8, %xmm2 + movapd %xmm9, %xmm3 + movapd %xmm10, %xmm4 + movapd %xmm11, %xmm5 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L35 + ALIGN_4 + +.L31: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addsd %xmm2, %xmm8 + movsd -16 * SIZE(BO), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm3, %xmm9 + movsd -14 * SIZE(BO), %xmm3 + mulsd %xmm0, %xmm3 + addsd %xmm4, %xmm10 + movsd -12 * SIZE(BO), %xmm4 + mulsd %xmm0, %xmm4 + addsd %xmm5, %xmm11 + movsd -10 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + movsd -15 * SIZE(AO), %xmm0 + + addsd %xmm2, %xmm8 + movsd -8 * SIZE(BO), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm3, %xmm9 + movsd -6 * SIZE(BO), %xmm3 + mulsd %xmm0, %xmm3 + addsd %xmm4, %xmm10 + movsd -4 * SIZE(BO), %xmm4 + mulsd %xmm0, %xmm4 + addsd %xmm5, %xmm11 + movsd -2 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + movsd -14 * SIZE(AO), %xmm0 + + addsd %xmm2, %xmm8 + movsd 0 * SIZE(BO), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm3, %xmm9 + movsd 2 * SIZE(BO), %xmm3 + mulsd %xmm0, %xmm3 + addsd %xmm4, %xmm10 + movsd 4 * SIZE(BO), %xmm4 + mulsd %xmm0, %xmm4 + addsd %xmm5, %xmm11 + movsd 6 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + movsd -13 * SIZE(AO), %xmm0 + + addsd %xmm2, %xmm8 + movsd 8 * SIZE(BO), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm3, %xmm9 + movsd 10 * SIZE(BO), %xmm3 + mulsd %xmm0, %xmm3 + addsd %xmm4, %xmm10 + movsd 12 * SIZE(BO), %xmm4 + mulsd %xmm0, %xmm4 + addsd %xmm5, %xmm11 + movsd 14 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + movsd -12 * SIZE(AO), %xmm0 + + subq $ -4 * SIZE, AO + subq $-32 * SIZE, BO + subq $1, %rax + BRANCH + jg .L31 + ALIGN_4 + +.L35: + movapd ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + addsd %xmm2, %xmm8 + movsd -16 * SIZE(BO), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm3, %xmm9 + movsd -14 * SIZE(BO), %xmm3 + mulsd %xmm0, %xmm3 + addsd %xmm4, %xmm10 + movsd -12 * SIZE(BO), %xmm4 + mulsd %xmm0, %xmm4 + addsd %xmm5, %xmm11 + movsd -10 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + movsd -15 * SIZE(AO), %xmm0 + + addq $1 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L36 + ALIGN_4 + +.L38: + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + addsd %xmm4, %xmm10 + addsd %xmm5, %xmm11 + + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + + movddup %xmm8, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhpd 1 * SIZE(CO2), %xmm0 + + movddup %xmm9, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + movsd %xmm0, 0 * SIZE(CO2) + movhpd %xmm0, 1 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm0 + movhpd 1 * SIZE(CO1, LDC, 2), %xmm0 + + movddup %xmm10, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, 
%xmm0 + + movsd %xmm0, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm0, 1 * SIZE(CO1, LDC, 2) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm0 + movhpd 1 * SIZE(CO2, LDC, 2), %xmm0 + + movddup %xmm11, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + movsd %xmm0, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm0, 1 * SIZE(CO2, LDC, 2) + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + + leaq (C, LDC, 4), C + subq $1, J + BRANCH + jg .L01 + ALIGN_4 + +.L40: + testq $2, N + BRANCH + jle .L80 + ALIGN_4 + +.L41: +/* Copying to Sub Buffer */ + leaq BUFFER, BO + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq K, %rax + sarq $3, %rax + jle .L43 + + addq %rax, %rax + ALIGN_4 + +.L42: + movddup -16 * SIZE(B), %xmm8 + movddup -15 * SIZE(B), %xmm9 + movddup -14 * SIZE(B), %xmm10 + movddup -13 * SIZE(B), %xmm11 + movddup -12 * SIZE(B), %xmm12 + movddup -11 * SIZE(B), %xmm13 + movddup -10 * SIZE(B), %xmm14 + movddup -9 * SIZE(B), %xmm15 + + movapd %xmm8, 0 * SIZE(BO) + movapd %xmm9, 2 * SIZE(BO) + movapd %xmm10, 4 * SIZE(BO) + movapd %xmm11, 6 * SIZE(BO) + movapd %xmm12, 8 * SIZE(BO) + movapd %xmm13, 10 * SIZE(BO) + movapd %xmm14, 12 * SIZE(BO) + movapd %xmm15, 14 * SIZE(BO) + + addq $8 * SIZE, B + addq $16 * SIZE, BO + + subq $1, %rax + jne .L42 + ALIGN_4 + +.L43: + movq K, %rax + andq $7, %rax + BRANCH + jle .L45 + ALIGN_4 + +.L44: + movddup -16 * SIZE(B), %xmm8 + movddup -15 * SIZE(B), %xmm9 + + movapd %xmm8, 0 * SIZE(BO) + movapd %xmm9, 2 * SIZE(BO) + + addq $2 * SIZE, B + addq $4 * SIZE, BO + subq $1, %rax + jne .L44 + ALIGN_4 + +.L45: + movq C, CO1 + leaq (C, LDC, 1), CO2 + movq A, AO # aoffset = a + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L50: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 16 * SIZE + BUFFER, BO +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + + prefetcht0 3 * SIZE(CO1) + pxor %xmm12, %xmm12 + prefetcht0 3 * SIZE(CO2) + pxor %xmm13, %xmm13 + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + + movapd %xmm8, %xmm2 + movapd %xmm8, %xmm3 + movapd %xmm8, %xmm4 + movapd %xmm8, %xmm5 + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L55 + ALIGN_4 + +.L51: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm2, %xmm8 + movapd -16 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm12 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm9 + movapd -14 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm13 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movapd -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + movapd -10 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm8 + movapd -12 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm12 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm9 + movapd -10 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm13 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movapd -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + movapd -6 * SIZE(AO), %xmm1 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + + addpd %xmm2, %xmm8 + movapd -8 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm12 + movapd %xmm2, %xmm3 + 
mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm9 + movapd -6 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm13 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movapd -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + movapd -2 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm8 + movapd -4 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm12 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + addpd %xmm4, %xmm9 + movapd -2 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm13 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movapd 0 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + movapd 2 * SIZE(AO), %xmm1 + + subq $-16 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jg .L51 + ALIGN_4 + +.L55: + movapd ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L58 + ALIGN_4 + +.L56: + addpd %xmm2, %xmm8 + movapd -16 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm12 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + + addpd %xmm4, %xmm9 + movapd -14 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm13 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movapd -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + movapd -10 * SIZE(AO), %xmm1 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L56 + ALIGN_4 + +.L58: + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm12 + addpd %xmm4, %xmm9 + addpd %xmm5, %xmm13 + + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 + + movsd 4 * SIZE(CO1), %xmm2 + movhpd 5 * SIZE(CO1), %xmm2 + movsd 6 * SIZE(CO1), %xmm3 + movhpd 7 * SIZE(CO1), %xmm3 + + movddup %xmm8, %xmm4 + unpckhpd %xmm8, %xmm8 + movddup %xmm12, %xmm5 + unpckhpd %xmm12, %xmm12 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm8 + mulpd %xmm7, %xmm5 + mulpd %xmm7, %xmm12 + + addpd %xmm4, %xmm0 + addpd %xmm8, %xmm1 + addpd %xmm5, %xmm2 + addpd %xmm12, %xmm3 + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd %xmm2, 4 * SIZE(CO1) + movhpd %xmm2, 5 * SIZE(CO1) + movsd %xmm3, 6 * SIZE(CO1) + movhpd %xmm3, 7 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhpd 1 * SIZE(CO2), %xmm0 + movsd 2 * SIZE(CO2), %xmm1 + movhpd 3 * SIZE(CO2), %xmm1 + + movsd 4 * SIZE(CO2), %xmm2 + movhpd 5 * SIZE(CO2), %xmm2 + movsd 6 * SIZE(CO2), %xmm3 + movhpd 7 * SIZE(CO2), %xmm3 + + movddup %xmm9, %xmm4 + unpckhpd %xmm9, %xmm9 + movddup %xmm13, %xmm5 + unpckhpd %xmm13, %xmm13 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm9 + mulpd %xmm7, %xmm5 + mulpd %xmm7, %xmm13 + + addpd %xmm4, %xmm0 + addpd %xmm9, %xmm1 + addpd %xmm5, %xmm2 + addpd %xmm13, %xmm3 + + movsd %xmm0, 0 * SIZE(CO2) + movhpd %xmm0, 1 * SIZE(CO2) + movsd %xmm1, 2 * SIZE(CO2) + movhpd %xmm1, 3 * SIZE(CO2) + + movsd %xmm2, 4 * SIZE(CO2) + movhpd %xmm2, 5 * SIZE(CO2) + movsd %xmm3, 6 * SIZE(CO2) + movhpd %xmm3, 7 * SIZE(CO2) + + addq $8 * SIZE, CO1 + addq $8 * SIZE, CO2 + subq $1, I + jg .L50 + ALIGN_4 + +.L60: + testq $2, M + jle .L70 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 16 * SIZE + BUFFER, BO +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + pxor %xmm8, %xmm8 + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm9, %xmm9 + movapd -14 * SIZE(AO), %xmm1 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + + movapd %xmm8, %xmm2 + movapd %xmm8, %xmm3 + movapd %xmm8, %xmm4 + movapd %xmm8, %xmm5 + + +#ifndef TRMMKERNEL + movq K, %rax 
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L65 + ALIGN_4 + +.L61: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm2, %xmm8 + movapd -16 * SIZE(BO), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm3, %xmm9 + movapd -14 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + movapd -12 * SIZE(AO), %xmm0 + addpd %xmm4, %xmm10 + movapd -12 * SIZE(BO), %xmm4 + mulpd %xmm1, %xmm4 + addpd %xmm5, %xmm11 + movapd -10 * SIZE(BO), %xmm5 + mulpd %xmm1, %xmm5 + movapd -10 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm8 + movapd -8 * SIZE(BO), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm3, %xmm9 + movapd -6 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + movapd -8 * SIZE(AO), %xmm0 + addpd %xmm4, %xmm10 + movapd -4 * SIZE(BO), %xmm4 + mulpd %xmm1, %xmm4 + addpd %xmm5, %xmm11 + movapd -2 * SIZE(BO), %xmm5 + mulpd %xmm1, %xmm5 + movapd -6 * SIZE(AO), %xmm1 + + subq $ -8 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jg .L61 + ALIGN_4 + +.L65: + movapd ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L68 + ALIGN_4 + +.L66: + addpd %xmm2, %xmm8 + movapd -16 * SIZE(BO), %xmm2 + mulpd %xmm0, %xmm2 + addpd %xmm3, %xmm9 + movapd -14 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + movapd -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L66 + ALIGN_4 + +.L68: + addpd %xmm2, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm4, %xmm10 + addpd %xmm5, %xmm11 + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + + + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 + + movddup %xmm8, %xmm4 + unpckhpd %xmm8, %xmm8 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm8 + addpd %xmm4, %xmm0 + addpd %xmm8, %xmm1 + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhpd 1 * SIZE(CO2), %xmm0 + movsd 2 * SIZE(CO2), %xmm1 + movhpd 3 * SIZE(CO2), %xmm1 + + movddup %xmm9, %xmm4 + unpckhpd %xmm9, %xmm9 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm9 + addpd %xmm4, %xmm0 + addpd %xmm9, %xmm1 + + movsd %xmm0, 0 * SIZE(CO2) + movhpd %xmm0, 1 * SIZE(CO2) + movsd %xmm1, 2 * SIZE(CO2) + movhpd %xmm1, 3 * SIZE(CO2) + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L70: + testq $1, M + jle .L79 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 16 * SIZE + BUFFER, BO +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + movsd -16 * SIZE(AO), %xmm0 + movsd -15 * SIZE(AO), %xmm1 + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + + movapd %xmm8, %xmm2 + movapd %xmm8, %xmm3 + movapd %xmm8, %xmm4 + movapd %xmm8, %xmm5 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L75 + ALIGN_4 + +.L71: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addsd %xmm2, %xmm8 + movsd -16 * SIZE(BO), %xmm2 + mulsd %xmm0, %xmm2 + 
addsd %xmm3, %xmm9 + movsd -14 * SIZE(BO), %xmm3 + mulsd %xmm0, %xmm3 + movsd -14 * SIZE(AO), %xmm0 + addsd %xmm4, %xmm10 + movsd -12 * SIZE(BO), %xmm4 + mulsd %xmm1, %xmm4 + addsd %xmm5, %xmm11 + movsd -10 * SIZE(BO), %xmm5 + mulsd %xmm1, %xmm5 + movsd -13 * SIZE(AO), %xmm1 + + addsd %xmm2, %xmm8 + movsd -8 * SIZE(BO), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm3, %xmm9 + movsd -6 * SIZE(BO), %xmm3 + mulsd %xmm0, %xmm3 + movsd -12 * SIZE(AO), %xmm0 + addsd %xmm4, %xmm10 + movsd -4 * SIZE(BO), %xmm4 + mulsd %xmm1, %xmm4 + addsd %xmm5, %xmm11 + movsd -2 * SIZE(BO), %xmm5 + mulsd %xmm1, %xmm5 + movsd -11 * SIZE(AO), %xmm1 + + subq $ -4 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jg .L71 + ALIGN_4 + +.L75: + movapd ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L78 + ALIGN_4 + +.L76: + addsd %xmm2, %xmm8 + movsd -16 * SIZE(BO), %xmm2 + mulsd %xmm0, %xmm2 + addsd %xmm3, %xmm9 + movsd -14 * SIZE(BO), %xmm3 + mulsd %xmm0, %xmm3 + movsd -15 * SIZE(AO), %xmm0 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L76 + ALIGN_4 + +.L78: + addsd %xmm2, %xmm8 + addsd %xmm3, %xmm9 + addsd %xmm4, %xmm10 + addsd %xmm5, %xmm11 + + addsd %xmm10, %xmm8 + addsd %xmm11, %xmm9 + + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + + movddup %xmm8, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhpd 1 * SIZE(CO2), %xmm0 + + movddup %xmm9, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + movsd %xmm0, 0 * SIZE(CO2) + movhpd %xmm0, 1 * SIZE(CO2) + ALIGN_4 + +.L79: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leaq (C, LDC, 2), C + ALIGN_4 + +.L80: + testq $1, N + BRANCH + jle .L999 + ALIGN_4 + +.L81: +/* Copying to Sub Buffer */ + leaq BUFFER, BO + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq K, %rax + sarq $4, %rax + jle .L83 + + addq %rax, %rax + ALIGN_4 + +.L82: + movddup -16 * SIZE(B), %xmm8 + movddup -15 * SIZE(B), %xmm9 + movddup -14 * SIZE(B), %xmm10 + movddup -13 * SIZE(B), %xmm11 + movddup -12 * SIZE(B), %xmm12 + movddup -11 * SIZE(B), %xmm13 + movddup -10 * SIZE(B), %xmm14 + movddup -9 * SIZE(B), %xmm15 + + movapd %xmm8, 0 * SIZE(BO) + movapd %xmm9, 2 * SIZE(BO) + movapd %xmm10, 4 * SIZE(BO) + movapd %xmm11, 6 * SIZE(BO) + movapd %xmm12, 8 * SIZE(BO) + movapd %xmm13, 10 * SIZE(BO) + movapd %xmm14, 12 * SIZE(BO) + movapd %xmm15, 14 * SIZE(BO) + + addq $ 8 * SIZE, B + subq $-16 * SIZE, BO + subq $1, %rax + jne .L82 + ALIGN_4 + +.L83: + movq K, %rax + andq $15, %rax + BRANCH + jle .L85 + ALIGN_4 + +.L84: + movddup -16 * SIZE(B), %xmm8 + + movapd %xmm8, 0 * SIZE(BO) + + addq $1 * SIZE, B + addq $2 * SIZE, BO + subq $1, %rax + jne .L84 + ALIGN_4 + +.L85: + movq C, CO1 + movq A, AO + + movq M, I + sarq $2, I + jle .L100 + ALIGN_4 + +.L90: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 16 * SIZE + BUFFER, BO +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + movapd -16 * SIZE(BO), %xmm4 + pxor %xmm9, %xmm9 + movapd -14 * SIZE(BO), %xmm5 + pxor %xmm12, %xmm12 + movapd -12 * SIZE(BO), %xmm6 + pxor %xmm13, %xmm13 + movapd -10 * SIZE(BO), %xmm7 + + movapd %xmm8, %xmm0 + prefetcht0 3 * SIZE(CO1) + movapd %xmm8, %xmm1 + movapd 
%xmm8, %xmm2 + movapd %xmm8, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L95 + ALIGN_4 + +.L91: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm0, %xmm8 + movapd -16 * SIZE(AO), %xmm0 + mulpd %xmm4, %xmm0 + addpd %xmm1, %xmm12 + movapd -14 * SIZE(AO), %xmm1 + mulpd %xmm4, %xmm1 + movapd -8 * SIZE(BO), %xmm4 + addpd %xmm2, %xmm9 + movapd -12 * SIZE(AO), %xmm2 + mulpd %xmm5, %xmm2 + addpd %xmm3, %xmm13 + movapd -10 * SIZE(AO), %xmm3 + mulpd %xmm5, %xmm3 + movapd -6 * SIZE(BO), %xmm5 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + addpd %xmm0, %xmm8 + movapd -8 * SIZE(AO), %xmm0 + mulpd %xmm6, %xmm0 + addpd %xmm1, %xmm12 + movapd -6 * SIZE(AO), %xmm1 + mulpd %xmm6, %xmm1 + movapd -4 * SIZE(BO), %xmm6 + addpd %xmm2, %xmm9 + movapd -4 * SIZE(AO), %xmm2 + mulpd %xmm7, %xmm2 + addpd %xmm3, %xmm13 + movapd -2 * SIZE(AO), %xmm3 + mulpd %xmm7, %xmm3 + movapd -2 * SIZE(BO), %xmm7 + + subq $-16 * SIZE, AO + subq $ -8 * SIZE, BO + subq $1, %rax + jg .L91 + ALIGN_4 + +.L95: + movapd ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L98 + ALIGN_4 + +.L96: + addpd %xmm0, %xmm8 + movapd -16 * SIZE(AO), %xmm0 + mulpd %xmm4, %xmm0 + addpd %xmm1, %xmm12 + movapd -14 * SIZE(AO), %xmm1 + mulpd %xmm4, %xmm1 + movapd -14 * SIZE(BO), %xmm4 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + subq $1, %rax + jg .L96 + ALIGN_4 + +.L98: + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm12 + addpd %xmm2, %xmm9 + addpd %xmm3, %xmm13 + + addpd %xmm9, %xmm8 + addpd %xmm13, %xmm12 + + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 + + movsd 4 * SIZE(CO1), %xmm2 + movhpd 5 * SIZE(CO1), %xmm2 + movsd 6 * SIZE(CO1), %xmm3 + movhpd 7 * SIZE(CO1), %xmm3 + + movddup %xmm8, %xmm4 + unpckhpd %xmm8, %xmm8 + movddup %xmm12, %xmm5 + unpckhpd %xmm12, %xmm12 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm8 + mulpd %xmm7, %xmm5 + mulpd %xmm7, %xmm12 + + addpd %xmm4, %xmm0 + addpd %xmm8, %xmm1 + addpd %xmm5, %xmm2 + addpd %xmm12, %xmm3 + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd %xmm2, 4 * SIZE(CO1) + movhpd %xmm2, 5 * SIZE(CO1) + movsd %xmm3, 6 * SIZE(CO1) + movhpd %xmm3, 7 * SIZE(CO1) + + addq $8 * SIZE, CO1 # coffset += 4 + + subq $1, I + jg .L90 + ALIGN_4 + +.L100: + testq $2, M + jle .L110 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 16 * SIZE + BUFFER, BO +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + movapd -16 * SIZE(BO), %xmm4 + pxor %xmm9, %xmm9 + movapd -14 * SIZE(BO), %xmm5 + pxor %xmm10, %xmm10 + movapd -12 * SIZE(BO), %xmm6 + pxor %xmm11, %xmm11 + movapd -10 * SIZE(BO), %xmm7 + + movapd %xmm8, %xmm0 + movapd %xmm8, %xmm1 + movapd %xmm8, %xmm2 + movapd %xmm8, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK 
+#endif + sarq $2, %rax + jle .L105 + ALIGN_4 + +.L101: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm0, %xmm8 + movapd -16 * SIZE(AO), %xmm0 + mulpd %xmm4, %xmm0 + movapd -8 * SIZE(BO), %xmm4 + addpd %xmm1, %xmm9 + movapd -14 * SIZE(AO), %xmm1 + mulpd %xmm5, %xmm1 + movapd -6 * SIZE(BO), %xmm5 + addpd %xmm2, %xmm10 + movapd -12 * SIZE(AO), %xmm2 + mulpd %xmm6, %xmm2 + movapd -4 * SIZE(BO), %xmm6 + addpd %xmm3, %xmm11 + movapd -10 * SIZE(AO), %xmm3 + mulpd %xmm7, %xmm3 + movapd -2 * SIZE(BO), %xmm7 + + subq $-8 * SIZE, AO + subq $-8 * SIZE, BO + subq $1, %rax + jg .L101 + ALIGN_4 + +.L105: + movapd ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L108 + ALIGN_4 + +.L106: + addpd %xmm0, %xmm8 + movapd -16 * SIZE(AO), %xmm0 + mulpd %xmm4, %xmm0 + movapd -14 * SIZE(BO), %xmm4 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + subq $1, %rax + jg .L106 + ALIGN_4 + +.L108: + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm9 + addpd %xmm2, %xmm10 + addpd %xmm3, %xmm11 + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + + addpd %xmm9, %xmm8 + + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 + + movddup %xmm8, %xmm4 + unpckhpd %xmm8, %xmm8 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm8 + + addpd %xmm4, %xmm0 + addpd %xmm8, %xmm1 + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + addq $4 * SIZE, CO1 + ALIGN_4 + +.L110: + testq $1, M + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 16 * SIZE + BUFFER, BO +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + movsd -16 * SIZE(BO), %xmm4 + pxor %xmm9, %xmm9 + movsd -14 * SIZE(BO), %xmm5 + pxor %xmm10, %xmm10 + movsd -12 * SIZE(BO), %xmm6 + pxor %xmm11, %xmm11 + movsd -10 * SIZE(BO), %xmm7 + + movapd %xmm8, %xmm0 + movapd %xmm8, %xmm1 + movapd %xmm8, %xmm2 + movapd %xmm8, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L115 + ALIGN_4 + +.L111: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addpd %xmm0, %xmm8 + movsd -16 * SIZE(AO), %xmm0 + mulpd %xmm4, %xmm0 + movsd -8 * SIZE(BO), %xmm4 + addpd %xmm1, %xmm9 + movsd -15 * SIZE(AO), %xmm1 + mulpd %xmm5, %xmm1 + movsd -6 * SIZE(BO), %xmm5 + addpd %xmm2, %xmm10 + movsd -14 * SIZE(AO), %xmm2 + mulpd %xmm6, %xmm2 + movsd -4 * SIZE(BO), %xmm6 + addpd %xmm3, %xmm11 + movsd -13 * SIZE(AO), %xmm3 + mulpd %xmm7, %xmm3 + movsd -2 * SIZE(BO), %xmm7 + + subq $-4 * SIZE, AO + subq $-8 * SIZE, BO + subq $1, %rax + jg .L111 + ALIGN_4 + +.L115: + movapd ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L118 + ALIGN_4 + +.L116: + addsd %xmm0, %xmm8 + movsd -16 * SIZE(AO), %xmm0 + mulsd %xmm4, %xmm0 + movsd -14 * SIZE(BO), %xmm4 + + addq $1 * SIZE, AO + addq $2 * SIZE, BO + subq $1, %rax + jg .L116 + ALIGN_4 + +.L118: + addsd %xmm0, %xmm8 + addsd %xmm1, %xmm9 + addsd %xmm2, %xmm10 + addsd %xmm3, %xmm11 + + addsd %xmm10, %xmm8 + addsd %xmm11, %xmm9 + + addsd %xmm9, %xmm8 + + movsd 
0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + + movddup %xmm8, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + ALIGN_4 + +.L999: + movq %r15, %rsp + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm3m_kernel_4x4_penryn.S b/kernel/x86_64/zgemm3m_kernel_4x4_penryn.S new file mode 100644 index 0000000000..7dd2c9155c --- /dev/null +++ b/kernel/x86_64/zgemm3m_kernel_4x4_penryn.S @@ -0,0 +1,2131 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define BB %r12 + +#define PREA %rdx + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define J 64(%rsp) +#define OFFSET 72(%rsp) +#define KK 80(%rsp) +#define KKK 88(%rsp) + +#else + +#define STACKSIZE 512 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#define ALPHA_R 224(%rsp) +#define ALPHA_I 232(%rsp) +#define J 240(%rsp) +#define OFFSET 248(%rsp) +#define KK 256(%rsp) +#define KKK 264(%rsp) + +#endif + +#ifdef NANO +#define PREFETCHSIZE (8 * 2 + 4) +#define PREFETCHW prefetcht0 +#define PREFETCHB prefetcht0 +#endif + +#ifndef PREFETCH +#define PREFETCH prefetcht0 +#endif + +#ifndef PREFETCHW +#define PREFETCHW prefetcht2 +#endif + +#ifndef PREFETCHB +#define PREFETCHB prefetcht0 +#endif + +#ifndef PREFETCHSIZE +#define PREFETCHSIZE (8 * 17 + 4) +#endif + +#if defined(OS_LINUX) && defined(CORE_BARCELONA) + .align 32768 +#endif + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movq OLD_OFFSET, %r11 +#endif + movaps %xmm3, %xmm0 + movsd OLD_ALPHA_I, %xmm1 +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movq OLD_OFFSET, %r11 +#endif + +#endif + + movlps %xmm0, ALPHA_R + movlps %xmm1, ALPHA_I + + subq $-16 * SIZE, A + subq $-17 * SIZE, B + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + salq $ZBASE_SHIFT, LDC + + movq N, J + sarq $2, J + NOBRANCH + jle .L40 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 + movq A, AO + + movq K, %rax + salq $BASE_SHIFT + 2, %rax + leaq (B, %rax), BB + + movq M, I + sarq $2, I # i = (m >> 2) + NOBRANCH + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + xorpd %xmm3, %xmm3 + movaps -14 * SIZE(AO), %xmm1 + xorpd %xmm4, %xmm4 + movaps -17 * SIZE(BO), %xmm2 + + PREFETCHB -16 * SIZE(BB) + + xorpd %xmm5, %xmm5 + xorpd %xmm6, %xmm6 + + PREFETCHW 3 * SIZE(CO1) + movaps %xmm4, %xmm8 + movaps %xmm4, %xmm9 + PREFETCHW 7 * SIZE(CO2) + movaps %xmm4, %xmm10 + movaps %xmm4, %xmm11 
+ + PREFETCHW 3 * SIZE(CO1, LDC, 2) + movaps %xmm4, %xmm12 + movaps %xmm4, %xmm13 + PREFETCHW 7 * SIZE(CO2, LDC, 2) + movapd %xmm4, %xmm14 + movapd %xmm4, %xmm15 + + subq $-12 * SIZE, BB + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm3, %xmm11 + movaps -15 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps -13 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm11 + movaps -11 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps -9 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -6 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm11 + movaps -7 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movapd %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movapd %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + PADDING + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + + addpd %xmm2, %xmm9 + movaps -5 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -2 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm11 + subq $-16 * SIZE, AO + movaps -3 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps -1 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + subq $-16 * SIZE, BO + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -16 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -14 * SIZE(AO), %xmm1 + + subq $1, %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: + PREFETCHB -8 * SIZE(BB) + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + addpd %xmm3, %xmm11 + movaps -15 * SIZE(BO), %xmm3 + addpd %xmm4, %xmm15 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + addpd %xmm5, %xmm10 + addpd %xmm6, 
%xmm14 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + addpd %xmm2, %xmm9 + movaps -13 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_4 + +.L18: + movups ALPHA_R, %xmm7 + + addpd %xmm3, %xmm11 + addpd %xmm4, %xmm15 + addpd %xmm5, %xmm10 + addpd %xmm6, %xmm14 + + movaps %xmm8, %xmm0 + movsd %xmm9, %xmm8 + movsd %xmm0, %xmm9 + + movaps %xmm10, %xmm0 + movsd %xmm11, %xmm10 + movsd %xmm0, %xmm11 + + movaps %xmm12, %xmm0 + movsd %xmm13, %xmm12 + movsd %xmm0, %xmm13 + + movaps %xmm14, %xmm0 + movsd %xmm15, %xmm14 + movsd %xmm0, %xmm15 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhps 3 * SIZE(CO1), %xmm1 + + movsd 4 * SIZE(CO1), %xmm2 + movhps 5 * SIZE(CO1), %xmm2 + movsd 6 * SIZE(CO1), %xmm3 + movhps 7 * SIZE(CO1), %xmm3 + + movddup %xmm8, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm8, %xmm8 + mulpd %xmm7, %xmm8 + addpd %xmm8, %xmm1 + + movddup %xmm12, %xmm5 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + + unpckhpd %xmm12, %xmm12 + mulpd %xmm7, %xmm12 + addpd %xmm12, %xmm3 + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + movlps %xmm1, 2 * SIZE(CO1) + movhps %xmm1, 3 * SIZE(CO1) + + movlps %xmm2, 4 * SIZE(CO1) + movhps %xmm2, 5 * SIZE(CO1) + movlps %xmm3, 6 * SIZE(CO1) + movhps %xmm3, 7 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhps 1 * SIZE(CO2), %xmm0 + movsd 2 * SIZE(CO2), %xmm1 + movhps 3 * SIZE(CO2), %xmm1 + + movsd 4 * SIZE(CO2), %xmm2 + movhps 5 * SIZE(CO2), %xmm2 + movsd 6 * SIZE(CO2), %xmm3 + movhps 7 * SIZE(CO2), %xmm3 + + movddup %xmm9, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm9, %xmm9 + mulpd %xmm7, %xmm9 + addpd %xmm9, %xmm1 + + movddup %xmm13, %xmm5 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + + unpckhpd %xmm13, %xmm13 + mulpd %xmm7, %xmm13 + addpd %xmm13, %xmm3 + + movlps %xmm0, 0 * SIZE(CO2) + movhps %xmm0, 1 * SIZE(CO2) + movlps %xmm1, 2 * SIZE(CO2) + movhps %xmm1, 3 * SIZE(CO2) + + movlps %xmm2, 4 * SIZE(CO2) + movhps %xmm2, 5 * SIZE(CO2) + movlps %xmm3, 6 * SIZE(CO2) + movhps %xmm3, 7 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm0 + movhps 1 * SIZE(CO1, LDC, 2), %xmm0 + movsd 2 * SIZE(CO1, LDC, 2), %xmm1 + movhps 3 * SIZE(CO1, LDC, 2), %xmm1 + + movsd 4 * SIZE(CO1, LDC, 2), %xmm2 + movhps 5 * SIZE(CO1, LDC, 2), %xmm2 + movsd 6 * SIZE(CO1, LDC, 2), %xmm3 + movhps 7 * SIZE(CO1, LDC, 2), %xmm3 + + movddup %xmm10, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm10, %xmm10 + mulpd %xmm7, %xmm10 + addpd %xmm10, %xmm1 + + movddup %xmm14, %xmm5 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + + unpckhpd %xmm14, %xmm14 + mulpd %xmm7, %xmm14 + addpd %xmm14, %xmm3 + + movlps %xmm0, 0 * SIZE(CO1, LDC, 2) + movhps %xmm0, 1 * SIZE(CO1, LDC, 2) + movlps %xmm1, 2 * SIZE(CO1, LDC, 2) + movhps %xmm1, 3 * SIZE(CO1, LDC, 2) + + movlps %xmm2, 4 * SIZE(CO1, LDC, 2) + movhps %xmm2, 5 * SIZE(CO1, LDC, 2) + movlps %xmm3, 6 * SIZE(CO1, LDC, 2) + movhps %xmm3, 7 * SIZE(CO1, LDC, 2) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm0 + movhps 1 * SIZE(CO2, LDC, 2), %xmm0 + movsd 2 * SIZE(CO2, LDC, 2), %xmm1 + movhps 3 * SIZE(CO2, LDC, 2), %xmm1 + + movsd 4 * SIZE(CO2, LDC, 2), %xmm2 + movhps 5 * SIZE(CO2, LDC, 2), %xmm2 + movsd 6 * 
SIZE(CO2, LDC, 2), %xmm3 + movhps 7 * SIZE(CO2, LDC, 2), %xmm3 + + movddup %xmm11, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm11, %xmm11 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm1 + + movddup %xmm15, %xmm5 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + + unpckhpd %xmm15, %xmm15 + mulpd %xmm7, %xmm15 + addpd %xmm15, %xmm3 + + movlps %xmm0, 0 * SIZE(CO2, LDC, 2) + movhps %xmm0, 1 * SIZE(CO2, LDC, 2) + movlps %xmm1, 2 * SIZE(CO2, LDC, 2) + movhps %xmm1, 3 * SIZE(CO2, LDC, 2) + + movlps %xmm2, 4 * SIZE(CO2, LDC, 2) + movhps %xmm2, 5 * SIZE(CO2, LDC, 2) + movlps %xmm3, 6 * SIZE(CO2, LDC, 2) + movhps %xmm3, 7 * SIZE(CO2, LDC, 2) + + addq $8 * SIZE, CO1 # coffset += 4 + addq $8 * SIZE, CO2 # coffset += 4 + + decq I # i -- + BRANCH + jg .L11 + ALIGN_4 + +.L20: + testq $2, M + BRANCH + jle .L30 + ALIGN_4 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + movaps -17 * SIZE(BO), %xmm2 + movaps -15 * SIZE(BO), %xmm3 + + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + + movaps %xmm3, %xmm8 + movaps %xmm3, %xmm9 + movaps %xmm3, %xmm10 + movaps %xmm3, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_4 + +.L22: + addpd %xmm3, %xmm11 + movaps -15 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd %xmm0, %xmm2 + addpd %xmm5, %xmm10 + mulpd %xmm0, %xmm7 + + addpd %xmm2, %xmm9 + movaps -13 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + addpd %xmm7, %xmm8 + mulpd %xmm0, %xmm5 + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm3, %xmm11 + movaps -11 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + addpd %xmm5, %xmm10 + mulpd %xmm0, %xmm7 + + addpd %xmm2, %xmm9 + movaps -9 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + addpd %xmm7, %xmm8 + mulpd %xmm0, %xmm5 + movaps -12 * SIZE(AO), %xmm0 + + addpd %xmm3, %xmm11 + movaps -7 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + addpd %xmm5, %xmm10 + mulpd %xmm0, %xmm7 + + addpd %xmm2, %xmm9 + movaps -5 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + addpd %xmm7, %xmm8 + mulpd %xmm0, %xmm5 + movaps -10 * SIZE(AO), %xmm0 + + addpd %xmm3, %xmm11 + movaps -3 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + addpd %xmm5, %xmm10 + mulpd %xmm0, %xmm7 + subq $ -8 * SIZE, AO + + addpd %xmm2, %xmm9 + movaps -1 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + addpd %xmm7, %xmm8 + mulpd %xmm0, %xmm5 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, BO + subq $1, %rax + BRANCH + jg .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + addpd %xmm3, %xmm11 + movaps -15 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + addpd %xmm5, %xmm10 + mulpd %xmm0, %xmm7 + + addpd %xmm2, %xmm9 + movaps -13 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + 
addpd %xmm7, %xmm8 + mulpd %xmm0, %xmm5 + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_4 + +.L28: + movups ALPHA_R, %xmm7 + + addpd %xmm3, %xmm11 + addpd %xmm5, %xmm10 + + movaps %xmm8, %xmm0 + movsd %xmm9, %xmm8 + movsd %xmm0, %xmm9 + + movaps %xmm10, %xmm0 + movsd %xmm11, %xmm10 + movsd %xmm0, %xmm11 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhps 3 * SIZE(CO1), %xmm1 + + movddup %xmm8, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm8, %xmm8 + mulpd %xmm7, %xmm8 + addpd %xmm8, %xmm1 + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + movlps %xmm1, 2 * SIZE(CO1) + movhps %xmm1, 3 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhps 1 * SIZE(CO2), %xmm0 + movsd 2 * SIZE(CO2), %xmm1 + movhps 3 * SIZE(CO2), %xmm1 + + movddup %xmm9, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm9, %xmm9 + mulpd %xmm7, %xmm9 + addpd %xmm9, %xmm1 + + movlps %xmm0, 0 * SIZE(CO2) + movhps %xmm0, 1 * SIZE(CO2) + movlps %xmm1, 2 * SIZE(CO2) + movhps %xmm1, 3 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm0 + movhps 1 * SIZE(CO1, LDC, 2), %xmm0 + movsd 2 * SIZE(CO1, LDC, 2), %xmm1 + movhps 3 * SIZE(CO1, LDC, 2), %xmm1 + + movddup %xmm10, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm10, %xmm10 + mulpd %xmm7, %xmm10 + addpd %xmm10, %xmm1 + + movlps %xmm0, 0 * SIZE(CO1, LDC, 2) + movhps %xmm0, 1 * SIZE(CO1, LDC, 2) + movlps %xmm1, 2 * SIZE(CO1, LDC, 2) + movhps %xmm1, 3 * SIZE(CO1, LDC, 2) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm0 + movhps 1 * SIZE(CO2, LDC, 2), %xmm0 + movsd 2 * SIZE(CO2, LDC, 2), %xmm1 + movhps 3 * SIZE(CO2, LDC, 2), %xmm1 + + movddup %xmm11, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm11, %xmm11 + mulpd %xmm7, %xmm11 + addpd %xmm11, %xmm1 + + movlps %xmm0, 0 * SIZE(CO2, LDC, 2) + movhps %xmm0, 1 * SIZE(CO2, LDC, 2) + movlps %xmm1, 2 * SIZE(CO2, LDC, 2) + movhps %xmm1, 3 * SIZE(CO2, LDC, 2) + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L30: + testq $1, M + BRANCH + jle .L39 + ALIGN_4 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + addq %rax, AO + leaq (BO, %rax, 4), BO +#endif + + movsd -16 * SIZE(AO), %xmm0 + movaps -17 * SIZE(BO), %xmm2 + movaps -15 * SIZE(BO), %xmm3 + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movsd -15 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm8 + movaps -13 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm9 + movaps -11 * SIZE(BO), %xmm3 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movsd -14 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm10 + movaps -9 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm11 + movaps -7 * SIZE(BO), %xmm3 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movsd -13 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm8 + 
movaps -5 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm9 + movaps -3 * SIZE(BO), %xmm3 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movsd -12 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm10 + movaps -1 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm11 + movaps 1 * SIZE(BO), %xmm3 + + subq $ -4 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + BRANCH + jg .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + movsd -15 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm8 + movaps -13 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm9 + movaps -11 * SIZE(BO), %xmm3 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L36 + ALIGN_4 + +.L38: + movups ALPHA_R, %xmm7 + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 1 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm1 + movhps 1 * SIZE(CO2), %xmm1 + + movddup %xmm8, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm8, %xmm8 + mulpd %xmm7, %xmm8 + addpd %xmm8, %xmm1 + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO2) + movhps %xmm1, 1 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm0 + movhps 1 * SIZE(CO1, LDC, 2), %xmm0 + movsd 0 * SIZE(CO2, LDC, 2), %xmm1 + movhps 1 * SIZE(CO2, LDC, 2), %xmm1 + + movddup %xmm9, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm9, %xmm9 + mulpd %xmm7, %xmm9 + addpd %xmm9, %xmm1 + + movlps %xmm0, 0 * SIZE(CO1, LDC, 2) + movhps %xmm0, 1 * SIZE(CO1, LDC, 2) + movlps %xmm1, 0 * SIZE(CO2, LDC, 2) + movhps %xmm1, 1 * SIZE(CO2, LDC, 2) + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK +#endif + + movq BO, B + + leaq (C, LDC, 4), C + + subq $1, J + BRANCH + jg .L01 + ALIGN_4 + +.L40: + testq $2, N + BRANCH + jle .L80 + + movq C, CO1 + leaq (C, LDC, 1), CO2 + movq A, AO + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq K, %rax + salq $BASE_SHIFT + 1, %rax + leaq (B, %rax), BB + + movq M, I + sarq $2, I # i = (m >> 2) + NOBRANCH + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + + PREFETCHB -16 * SIZE(BB) + subq $-4 * SIZE, BB + + movaps -16 * SIZE(AO), %xmm0 + movaps -14 * SIZE(AO), %xmm1 + movaps -17 * SIZE(BO), %xmm2 + + PREFETCHW 3 * SIZE(CO1) + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + PREFETCHW 3 * SIZE(CO2) + xorps %xmm12, %xmm12 + xorps %xmm13, %xmm13 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L55 + ALIGN_4 + +.L52: + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm9 + movaps -15 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + + movaps %xmm2, %xmm4 + pshufd 
$0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -6 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm9 + movaps -13 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -2 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm9 + movaps -11 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps 0 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 2 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm9 + movaps -9 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + + subq $-16 * SIZE, AO + subq $ -8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L52 + ALIGN_4 + +.L55: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + addpd %xmm2, %xmm9 + movaps -15 * SIZE(BO), %xmm2 + addpd %xmm4, %xmm13 + addpd %xmm7, %xmm8 + addpd %xmm6, %xmm12 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L56 + ALIGN_4 + +.L58: + movups ALPHA_R, %xmm7 + + movaps %xmm8, %xmm0 + movsd %xmm9, %xmm8 + movsd %xmm0, %xmm9 + + movaps %xmm12, %xmm0 + movsd %xmm13, %xmm12 + movsd %xmm0, %xmm13 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhps 3 * SIZE(CO1), %xmm1 + + movsd 4 * SIZE(CO1), %xmm2 + movhps 5 * SIZE(CO1), %xmm2 + movsd 6 * SIZE(CO1), %xmm3 + movhps 7 * SIZE(CO1), %xmm3 + + movddup %xmm8, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm8, %xmm8 + mulpd %xmm7, %xmm8 + addpd %xmm8, %xmm1 + + movddup %xmm12, %xmm5 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + + unpckhpd %xmm12, %xmm12 + mulpd %xmm7, %xmm12 + addpd %xmm12, %xmm3 + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + movlps %xmm1, 2 * SIZE(CO1) + movhps %xmm1, 3 * SIZE(CO1) + + movlps %xmm2, 4 * SIZE(CO1) + movhps %xmm2, 5 * SIZE(CO1) + movlps %xmm3, 6 * SIZE(CO1) + movhps %xmm3, 7 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhps 1 * SIZE(CO2), %xmm0 + movsd 2 * SIZE(CO2), %xmm1 + movhps 3 * SIZE(CO2), %xmm1 + + movsd 4 * SIZE(CO2), %xmm2 + movhps 5 * SIZE(CO2), %xmm2 + movsd 6 * SIZE(CO2), %xmm3 + movhps 7 * SIZE(CO2), %xmm3 + + movddup %xmm9, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm9, %xmm9 + mulpd %xmm7, %xmm9 + addpd %xmm9, %xmm1 + + movddup %xmm13, %xmm5 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + + unpckhpd %xmm13, %xmm13 + mulpd %xmm7, %xmm13 + addpd %xmm13, %xmm3 + + movlps %xmm0, 0 * SIZE(CO2) + movhps %xmm0, 1 * SIZE(CO2) + movlps %xmm1, 2 * SIZE(CO2) + movhps %xmm1, 3 * SIZE(CO2) + + movlps %xmm2, 4 * SIZE(CO2) + movhps %xmm2, 5 * SIZE(CO2) + movlps %xmm3, 6 * SIZE(CO2) + movhps %xmm3, 7 * SIZE(CO2) + + addq $8 * SIZE, CO1 + addq $8 * SIZE, CO2 + decq I + BRANCH + jg .L51 + ALIGN_4 + +.L60: + testq $2, M + BRANCH + jle .L70 + ALIGN_4 + +#if !defined(TRMMKERNEL) || \ + 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + movaps -17 * SIZE(BO), %xmm2 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L65 + ALIGN_4 + +.L62: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm9 + addpd %xmm7, %xmm8 + movaps -15 * SIZE(BO), %xmm2 + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -12 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm11 + addpd %xmm7, %xmm10 + movaps -13 * SIZE(BO), %xmm2 + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -10 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm9 + addpd %xmm7, %xmm8 + movaps -11 * SIZE(BO), %xmm2 + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -8 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm11 + addpd %xmm7, %xmm10 + movaps -9 * SIZE(BO), %xmm2 + + subq $-8 * SIZE, AO + subq $-8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L62 + ALIGN_4 + +.L65: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -14 * SIZE(AO), %xmm0 + + addpd %xmm2, %xmm9 + addpd %xmm7, %xmm8 + movaps -15 * SIZE(BO), %xmm2 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L66 + ALIGN_4 + +.L68: + movups ALPHA_R, %xmm7 + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + + movaps %xmm8, %xmm0 + movsd %xmm9, %xmm8 + movsd %xmm0, %xmm9 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhps 3 * SIZE(CO1), %xmm1 + + movddup %xmm8, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm8, %xmm8 + mulpd %xmm7, %xmm8 + addpd %xmm8, %xmm1 + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + movlps %xmm1, 2 * SIZE(CO1) + movhps %xmm1, 3 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhps 1 * SIZE(CO2), %xmm0 + movsd 2 * SIZE(CO2), %xmm1 + movhps 3 * SIZE(CO2), %xmm1 + + movddup %xmm9, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm9, %xmm9 + mulpd %xmm7, %xmm9 + addpd %xmm9, %xmm1 + + movlps %xmm0, 0 * SIZE(CO2) + movhps %xmm0, 1 * SIZE(CO2) + movlps %xmm1, 2 * SIZE(CO2) + movhps %xmm1, 3 * SIZE(CO2) + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + ALIGN_4 + +.L70: + testq $1, M + BRANCH + jle .L79 + ALIGN_4 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + addq %rax, AO + leaq (BO, %rax, 2), BO +#endif + + movsd -16 * SIZE(AO), %xmm0 + movaps -17 * SIZE(BO), %xmm2 + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq 
KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L75 + ALIGN_4 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + movsd -15 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm8 + movaps -15 * SIZE(BO), %xmm2 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + movsd -14 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm9 + movaps -13 * SIZE(BO), %xmm2 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + movsd -13 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm8 + movaps -11 * SIZE(BO), %xmm2 + + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + movsd -12 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm9 + movaps -9 * SIZE(BO), %xmm2 + + subq $-4 * SIZE, AO + subq $-8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L72 + ALIGN_4 + +.L75: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + shufps $0x44, %xmm0, %xmm0 + mulpd %xmm0, %xmm2 + movsd -15 * SIZE(AO), %xmm0 + addpd %xmm2, %xmm8 + movaps -15 * SIZE(BO), %xmm2 + + addq $1 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L76 + ALIGN_4 + +.L78: + movups ALPHA_R, %xmm7 + + addpd %xmm9, %xmm8 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 1 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm1 + movhps 1 * SIZE(CO2), %xmm1 + + movddup %xmm8, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm8, %xmm8 + mulpd %xmm7, %xmm8 + addpd %xmm8, %xmm1 + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO2) + movhps %xmm1, 1 * SIZE(CO2) + ALIGN_4 + +.L79: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + leaq (C, LDC, 2), C + movq BO, B + ALIGN_4 + +.L80: + testq $1, N + BRANCH + jle .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + movq A, AO + + movq M, I + sarq $2, I # i = (m >> 2) + NOBRANCH + jle .L100 + ALIGN_4 + +.L91: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + addq %rax, BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + movaps -14 * SIZE(AO), %xmm1 + movsd -17 * SIZE(BO), %xmm2 + + PREFETCHW 3 * SIZE(CO1) + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + xorps %xmm12, %xmm12 + xorps %xmm13, %xmm13 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L95 + ALIGN_4 + +.L92: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x44, %xmm2, %xmm3 + pshufd $0x44, %xmm2, %xmm4 + movsd -16 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm4 + movaps -10 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm8 + addpd %xmm4, %xmm12 + + pshufd $0x44, %xmm2, %xmm3 + pshufd $0x44, %xmm2, %xmm4 + movsd -15 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm4 + movaps -6 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm8 + addpd %xmm4, %xmm12 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + + pshufd $0x44, %xmm2, %xmm3 + pshufd $0x44, %xmm2, %xmm4 + movsd -14 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -4 * 
SIZE(AO), %xmm0 + mulpd %xmm1, %xmm4 + movaps -2 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm8 + addpd %xmm4, %xmm12 + + pshufd $0x44, %xmm2, %xmm3 + pshufd $0x44, %xmm2, %xmm4 + movsd -13 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps 0 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm4 + movaps 2 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm8 + addpd %xmm4, %xmm12 + + subq $-16 * SIZE, AO + subq $ -4 * SIZE, BO + subq $1, %rax + BRANCH + jg .L92 + ALIGN_4 + +.L95: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L98 + ALIGN_4 + +.L96: + pshufd $0x44, %xmm2, %xmm3 + pshufd $0x44, %xmm2, %xmm4 + movsd -16 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm4 + movaps -10 * SIZE(AO), %xmm1 + + addpd %xmm3, %xmm8 + addpd %xmm4, %xmm12 + + addq $4 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L96 + ALIGN_4 + +.L98: + movups ALPHA_R, %xmm7 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhps 3 * SIZE(CO1), %xmm1 + + movsd 4 * SIZE(CO1), %xmm2 + movhps 5 * SIZE(CO1), %xmm2 + movsd 6 * SIZE(CO1), %xmm3 + movhps 7 * SIZE(CO1), %xmm3 + + movddup %xmm8, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm8, %xmm8 + mulpd %xmm7, %xmm8 + addpd %xmm8, %xmm1 + + movddup %xmm12, %xmm5 + mulpd %xmm7, %xmm5 + addpd %xmm5, %xmm2 + + unpckhpd %xmm12, %xmm12 + mulpd %xmm7, %xmm12 + addpd %xmm12, %xmm3 + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + movlps %xmm1, 2 * SIZE(CO1) + movhps %xmm1, 3 * SIZE(CO1) + + movlps %xmm2, 4 * SIZE(CO1) + movhps %xmm2, 5 * SIZE(CO1) + movlps %xmm3, 6 * SIZE(CO1) + movhps %xmm3, 7 * SIZE(CO1) + + addq $8 * SIZE, CO1 + decq I + BRANCH + jg .L91 + ALIGN_4 + +.L100: + testq $2, M + BRANCH + jle .L110 + ALIGN_4 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + addq %rax, BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movaps -17 * SIZE(BO), %xmm2 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L105 + ALIGN_4 + +.L102: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x44, %xmm2, %xmm3 + movsd -16 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -14 * SIZE(AO), %xmm0 + addpd %xmm3, %xmm8 + + pshufd $0x44, %xmm2, %xmm3 + movsd -15 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -12 * SIZE(AO), %xmm0 + addpd %xmm3, %xmm9 + + pshufd $0x44, %xmm2, %xmm3 + movsd -14 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -10 * SIZE(AO), %xmm0 + addpd %xmm3, %xmm8 + + pshufd $0x44, %xmm2, %xmm3 + movsd -13 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -8 * SIZE(AO), %xmm0 + addpd %xmm3, %xmm9 + + subq $-8 * SIZE, AO + subq $-4 * SIZE, BO + subq $1, %rax + BRANCH + jg .L102 + ALIGN_4 + +.L105: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L108 + ALIGN_4 + +.L106: + pshufd $0x44, %xmm2, %xmm3 + movsd -16 * SIZE(BO), %xmm2 + + mulpd %xmm0, %xmm3 + movaps -14 * SIZE(AO), %xmm0 + addpd %xmm3, %xmm8 + + addq $2 * SIZE, AO + addq $1 * SIZE, BO 
+ + subq $1, %rax + BRANCH + jg .L106 + ALIGN_4 + +.L108: + movups ALPHA_R, %xmm7 + + addpd %xmm9, %xmm8 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhps 3 * SIZE(CO1), %xmm1 + + movddup %xmm8, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + unpckhpd %xmm8, %xmm8 + mulpd %xmm7, %xmm8 + addpd %xmm8, %xmm1 + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + movlps %xmm1, 2 * SIZE(CO1) + movhps %xmm1, 3 * SIZE(CO1) + + addq $4 * SIZE, CO1 + ALIGN_4 + +.L110: + testq $1, M + BRANCH + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + addq %rax, AO + addq %rax, BO +#endif + + movsd -16 * SIZE(AO), %xmm0 + movsd -17 * SIZE(BO), %xmm2 + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L115 + ALIGN_4 + +.L112: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + movsd -15 * SIZE(AO), %xmm0 + movsd -16 * SIZE(BO), %xmm2 + + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + movsd -14 * SIZE(AO), %xmm0 + movsd -15 * SIZE(BO), %xmm2 + + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + movsd -13 * SIZE(AO), %xmm0 + movsd -14 * SIZE(BO), %xmm2 + + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + movsd -12 * SIZE(AO), %xmm0 + movsd -13 * SIZE(BO), %xmm2 + + subq $-4 * SIZE, AO + subq $-4 * SIZE, BO + subq $1, %rax + BRANCH + jg .L112 + ALIGN_4 + +.L115: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulsd %xmm0, %xmm2 + addsd %xmm2, %xmm8 + movsd -15 * SIZE(AO), %xmm0 + movsd -16 * SIZE(BO), %xmm2 + + addq $1 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L116 + ALIGN_4 + +.L118: + movups ALPHA_R, %xmm7 + + addpd %xmm9, %xmm8 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 1 * SIZE(CO1), %xmm0 + + movddup %xmm8, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 1 * SIZE(CO1) + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm3m_kernel_4x4_sse2.S b/kernel/x86_64/zgemm3m_kernel_4x4_sse2.S new file mode 100644 index 0000000000..3b313b3816 --- /dev/null +++ b/kernel/x86_64/zgemm3m_kernel_4x4_sse2.S @@ -0,0 +1,2820 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define CO2 %rbp +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define ALPHA 0(%rsp) +#define J 16(%rsp) +#define OFFSET 24(%rsp) +#define KK 32(%rsp) +#define KKK 40(%rsp) +#define BUFFER 128(%rsp) + +#ifdef OPTERON +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (8 * 5 + 4) +#define movsd movlps +#define movapd movaps +#endif + +#ifdef GENERIC +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (8 * 5 + 4) +#define movapd movaps +#endif + +#ifndef GENERIC +#define KERNEL1(xx) \ + mulpd %xmm0, %xmm1 ;\ + addpd %xmm1, %xmm8 ;\ + movapd -16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ + mulpd %xmm0, %xmm3 ;\ + addpd %xmm3, %xmm9 ;\ + movapd -14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulpd %xmm0, %xmm5 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\ + mulpd -10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\ + addpd %xmm5, %xmm10 ;\ + movapd -12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addpd %xmm0, %xmm11 ;\ + movapd -8 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0 + +#define KERNEL2(xx) \ + mulpd %xmm2, %xmm1 ;\ + addpd %xmm1, %xmm12 ;\ + movapd 0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ + mulpd %xmm2, %xmm3 ;\ + addpd %xmm3, %xmm13 ;\ + movapd -6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), 
%xmm3 ;\ + mulpd %xmm2, %xmm5 ;\ + mulpd -10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\ + addpd %xmm5, %xmm14 ;\ + movapd -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addpd %xmm2, %xmm15 ;\ + movapd -6 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2 + +#define KERNEL3(xx) \ + mulpd %xmm4, %xmm7 ;\ + addpd %xmm7, %xmm8 ;\ + movapd -8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ + mulpd %xmm4, %xmm3 ;\ + addpd %xmm3, %xmm9 ;\ + movapd -6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulpd %xmm4, %xmm5 ;\ + mulpd -2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\ + addpd %xmm5, %xmm10 ;\ + movapd -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addpd %xmm4, %xmm11 ;\ + movapd -4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4 + +#define KERNEL4(xx) \ + mulpd %xmm6, %xmm7 ;\ + addpd %xmm7, %xmm12 ;\ + movapd 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ + mulpd %xmm6, %xmm3 ;\ + addpd %xmm3, %xmm13 ;\ + movapd 2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulpd %xmm6, %xmm5 ;\ + mulpd -2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\ + addpd %xmm5, %xmm14 ;\ + movapd 4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\ + addpd %xmm6, %xmm15 ;\ + movapd -2 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6 + +#define KERNEL5(xx) \ + mulpd %xmm0, %xmm1 ;\ + addpd %xmm1, %xmm8 ;\ + movapd 0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ + mulpd %xmm0, %xmm3 ;\ + addpd %xmm3, %xmm9 ;\ + movapd 2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulpd %xmm0, %xmm5 ;\ + mulpd 6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\ + addpd %xmm5, %xmm10 ;\ + movapd 4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addpd %xmm0, %xmm11 ;\ + movapd 0 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0 + +#define KERNEL6(xx) \ + mulpd %xmm2, %xmm1 ;\ + addpd %xmm1, %xmm12 ;\ + movapd 16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ + mulpd %xmm2, %xmm3 ;\ + addpd %xmm3, %xmm13 ;\ + movapd 10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulpd %xmm2, %xmm5 ;\ + mulpd 6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\ + addpd %xmm5, %xmm14 ;\ + movapd 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addpd %xmm2, %xmm15 ;\ + movapd 2 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2 + +#define KERNEL7(xx) \ + mulpd %xmm4, %xmm7 ;\ + addpd %xmm7, %xmm8 ;\ + movapd 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ + mulpd %xmm4, %xmm3 ;\ + addpd %xmm3, %xmm9 ;\ + movapd 10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulpd %xmm4, %xmm5 ;\ + mulpd 14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\ + addpd %xmm5, %xmm10 ;\ + movapd 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addpd %xmm4, %xmm11 ;\ + movapd 4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4 + +#define KERNEL8(xx) \ + mulpd %xmm6, %xmm7 ;\ + addpd %xmm7, %xmm12 ;\ + movapd 24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ + mulpd %xmm6, %xmm3 ;\ + addpd %xmm3, %xmm13 ;\ + movapd 18 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulpd %xmm6, %xmm5 ;\ + mulpd 14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\ + addpd %xmm5, %xmm14 ;\ + movapd 20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addpd %xmm6, %xmm15 ;\ + movapd 6 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6 + +#else + +#define KERNEL1(xx) \ + mulpd %xmm0, %xmm1 ;\ + addpd %xmm1, %xmm8 ;\ + movapd -16 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ + mulpd %xmm0, %xmm3 ;\ + addpd %xmm3, %xmm9 ;\ + movapd -14 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulpd %xmm0, %xmm5 ;\ + PREFETCH (PREFETCHSIZE + 
0) * SIZE + 1 * (xx) * SIZE(AO) ;\ + mulpd -10 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\ + addpd %xmm5, %xmm10 ;\ + movapd -12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addpd %xmm0, %xmm11 ;\ + movapd -8 * SIZE + 1 * (xx) * SIZE(AO), %xmm0 + +#define KERNEL2(xx) \ + mulpd %xmm2, %xmm1 ;\ + addpd %xmm1, %xmm12 ;\ + movapd 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ + mulpd %xmm2, %xmm3 ;\ + addpd %xmm3, %xmm13 ;\ + movapd -6 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulpd %xmm2, %xmm5 ;\ + mulpd -10 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\ + addpd %xmm5, %xmm14 ;\ + movapd -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addpd %xmm2, %xmm15 ;\ + movapd -6 * SIZE + 1 * (xx) * SIZE(AO), %xmm2 + +#define KERNEL3(xx) \ + mulpd %xmm4, %xmm7 ;\ + addpd %xmm7, %xmm8 ;\ + movapd -8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ + mulpd %xmm4, %xmm3 ;\ + addpd %xmm3, %xmm9 ;\ + movapd -6 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulpd %xmm4, %xmm5 ;\ + mulpd -2 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\ + addpd %xmm5, %xmm10 ;\ + movapd -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addpd %xmm4, %xmm11 ;\ + movapd -4 * SIZE + 1 * (xx) * SIZE(AO), %xmm4 + +#define KERNEL4(xx) \ + mulpd %xmm6, %xmm7 ;\ + addpd %xmm7, %xmm12 ;\ + movapd 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ + mulpd %xmm6, %xmm3 ;\ + addpd %xmm3, %xmm13 ;\ + movapd 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulpd %xmm6, %xmm5 ;\ + mulpd -2 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\ + addpd %xmm5, %xmm14 ;\ + movapd 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ;\ + addpd %xmm6, %xmm15 ;\ + movapd -2 * SIZE + 1 * (xx) * SIZE(AO), %xmm6 + +#define KERNEL5(xx) \ + mulpd %xmm0, %xmm1 ;\ + addpd %xmm1, %xmm8 ;\ + movapd 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ + mulpd %xmm0, %xmm3 ;\ + addpd %xmm3, %xmm9 ;\ + movapd 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulpd %xmm0, %xmm5 ;\ + mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\ + addpd %xmm5, %xmm10 ;\ + movapd 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addpd %xmm0, %xmm11 ;\ + movapd 0 * SIZE + 1 * (xx) * SIZE(AO), %xmm0 + +#define KERNEL6(xx) \ + mulpd %xmm2, %xmm1 ;\ + addpd %xmm1, %xmm12 ;\ + movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ + mulpd %xmm2, %xmm3 ;\ + addpd %xmm3, %xmm13 ;\ + movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulpd %xmm2, %xmm5 ;\ + mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\ + addpd %xmm5, %xmm14 ;\ + movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addpd %xmm2, %xmm15 ;\ + movapd 2 * SIZE + 1 * (xx) * SIZE(AO), %xmm2 + +#define KERNEL7(xx) \ + mulpd %xmm4, %xmm7 ;\ + addpd %xmm7, %xmm8 ;\ + movapd 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ + mulpd %xmm4, %xmm3 ;\ + addpd %xmm3, %xmm9 ;\ + movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulpd %xmm4, %xmm5 ;\ + mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\ + addpd %xmm5, %xmm10 ;\ + movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addpd %xmm4, %xmm11 ;\ + movapd 4 * SIZE + 1 * (xx) * SIZE(AO), %xmm4 + +#define KERNEL8(xx) \ + mulpd %xmm6, %xmm7 ;\ + addpd %xmm7, %xmm12 ;\ + movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ + mulpd %xmm6, %xmm3 ;\ + addpd %xmm3, %xmm13 ;\ + movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulpd %xmm6, %xmm5 ;\ + mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\ + addpd %xmm5, %xmm14 ;\ + movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addpd %xmm6, %xmm15 ;\ + movapd 6 * SIZE + 1 * (xx) * SIZE(AO), %xmm6 +#endif + +#if defined(OS_LINUX) && defined(CORE_BARCELONA) + .align 32768 +#endif + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq 
%rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + movaps %xmm3, %xmm0 + movsd OLD_ALPHA_I, %xmm1 +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + +#endif + + EMMS + + movq %rsp, %rbx # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movq OLD_M, M + movq OLD_N, N + + subq $-16 * SIZE, A + + movsd %xmm0, 0 + ALPHA + movsd %xmm1, 8 + ALPHA + + salq $ZBASE_SHIFT, LDC + +#ifdef TRMMKERNEL + movsd %xmm12, OFFSET + movsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + movq N, J + sarq $2, J # j = (n >> 2) + jle .L40 + ALIGN_3 + +.L01: +/* Copying to Sub Buffer */ + leaq 16 * SIZE + BUFFER, BO + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq K, %rax + sarq $2, %rax + jle .L03 + ALIGN_3 + + +#define RPREFETCHSIZE (8 * 7 + 4) +#define WPREFETCHSIZE (8 * 8 + 4) + +.L02: + PREFETCH (RPREFETCHSIZE + 0) * SIZE(B) + + movq 0 * SIZE(B), %mm0 + movq 1 * SIZE(B), %mm1 + movq %mm0, -16 * SIZE(BO) + movq %mm0, -15 * SIZE(BO) + movq %mm1, -14 * SIZE(BO) + movq %mm1, -13 * SIZE(BO) + + movq 2 * SIZE(B), %mm2 + movq 3 * SIZE(B), %mm3 + movq %mm2, -12 * SIZE(BO) + movq %mm2, -11 * SIZE(BO) + movq %mm3, -10 * SIZE(BO) + movq %mm3, -9 * SIZE(BO) + + PREFETCHW (WPREFETCHSIZE + 0) * SIZE(BO) + + movq 4 * SIZE(B), %mm4 + movq 5 * SIZE(B), %mm5 + movq %mm4, -8 * SIZE(BO) + movq %mm4, -7 * SIZE(BO) + movq %mm5, -6 * SIZE(BO) + movq %mm5, -5 * SIZE(BO) + + PREFETCHW (WPREFETCHSIZE + 8) * SIZE(BO) + + movq 6 * SIZE(B), %mm6 + movq 7 * SIZE(B), %mm7 + movq %mm6, -4 * SIZE(BO) + movq %mm6, -3 * SIZE(BO) + movq %mm7, -2 * SIZE(BO) + movq %mm7, -1 * SIZE(BO) + + PREFETCH (RPREFETCHSIZE + 8) * SIZE(B) + + movq 8 * SIZE(B), %mm0 + movq 9 * SIZE(B), %mm1 + + movq %mm0, 0 * SIZE(BO) + movq %mm0, 1 * SIZE(BO) + movq %mm1, 2 * SIZE(BO) + movq %mm1, 3 * SIZE(BO) + + movq 10 * SIZE(B), %mm2 + movq 11 * SIZE(B), %mm3 + movq %mm2, 4 * SIZE(BO) + movq %mm2, 5 * SIZE(BO) + movq %mm3, 6 * SIZE(BO) + movq %mm3, 7 * SIZE(BO) + + PREFETCHW (WPREFETCHSIZE + 16) * SIZE(BO) + + movq 12 * SIZE(B), %mm4 + movq 13 * SIZE(B), %mm5 + movq %mm4, 8 * SIZE(BO) + movq %mm4, 9 * SIZE(BO) + movq %mm5, 10 * SIZE(BO) + movq %mm5, 11 * SIZE(BO) + + PREFETCHW (WPREFETCHSIZE + 24) * SIZE(BO) + + movq 14 * SIZE(B), %mm6 + movq 15 * SIZE(B), %mm7 + movq %mm6, 12 * SIZE(BO) + movq %mm6, 13 * SIZE(BO) + movq %mm7, 14 * SIZE(BO) + movq %mm7, 15 * SIZE(BO) + + addq $ 32 * SIZE, BO + subq $-16 * SIZE, B + + subq $1, %rax + jne .L02 + ALIGN_3 + +.L03: + movq K, %rax + andq $3, %rax + BRANCH + jle .L10 + ALIGN_3 + +.L04: + movq 0 * SIZE(B), %mm0 + movq 1 * SIZE(B), %mm1 + movq 2 * SIZE(B), %mm2 + movq 3 * SIZE(B), %mm3 + + movq %mm0, -16 * SIZE(BO) + movq %mm0, -15 * SIZE(BO) + movq %mm1, -14 * SIZE(BO) + movq %mm1, -13 * SIZE(BO) + movq %mm2, -12 * SIZE(BO) + movq %mm2, -11 * 
SIZE(BO) + movq %mm3, -10 * SIZE(BO) + movq %mm3, -9 * SIZE(BO) + + addq $4 * SIZE, B + addq $8 * SIZE, BO + subq $1, %rax + jne .L04 + ALIGN_3 + +.L10: + movq A, AO # aoffset = a + leaq (RPREFETCHSIZE + 0) * SIZE(B), BB + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L20 + ALIGN_3 + +.L11: + PREFETCH 0 * SIZE(BB) + PREFETCH 8 * SIZE(BB) + subq $-16 * SIZE, BB + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 16 * SIZE + BUFFER, BO +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm1 + pxor %xmm8, %xmm8 + movapd -14 * SIZE(AO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + pxor %xmm9, %xmm9 + movapd -12 * SIZE(AO), %xmm4 + movapd -12 * SIZE(BO), %xmm5 + pxor %xmm10, %xmm10 + movapd -10 * SIZE(AO), %xmm6 + movapd -8 * SIZE(BO), %xmm7 + pxor %xmm11, %xmm11 + + PREFETCHW 7 * SIZE(CO1) + pxor %xmm12, %xmm12 + PREFETCHW 7 * SIZE(CO2) + pxor %xmm13, %xmm13 + PREFETCHW 7 * SIZE(CO1, LDC, 2) + pxor %xmm14, %xmm14 + PREFETCHW 7 * SIZE(CO2, LDC, 2) + pxor %xmm15, %xmm15 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif +#ifndef GENERIC + andq $-8, %rax + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO + negq %rax + NOBRANCH + je .L15 + ALIGN_3 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + + addq $8 * SIZE, %rax + NOBRANCH + je .L15 + + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + + addq $8 * SIZE, %rax + NOBRANCH + je .L15 + + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + + addq $8 * SIZE, %rax + NOBRANCH + je .L15 + + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + + addq $8 * SIZE, %rax + NOBRANCH + je .L15 + + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + + addq $8 * SIZE, %rax + NOBRANCH + je .L15 + + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 1) + KERNEL2(16 * 1) + 
KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + + addq $8 * SIZE, %rax + NOBRANCH + je .L15 + + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + + addq $8 * SIZE, %rax + NOBRANCH + je .L15 + + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + + addq $8 * SIZE, %rax + BRANCH + jl .L12 + ALIGN_3 + +.L15: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + testq $4, %rax + je .L16 + xorq %rax, %rax + ALIGN_3 + + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + addq $32 * SIZE, BO + addq $16 * SIZE, AO + ALIGN_3 +#else + sarq $2, %rax + NOBRANCH + jle .L16 + ALIGN_3 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + addq $ 32 * SIZE, BO + subq $-16 * SIZE, AO + decq %rax + BRANCH + jg .L12 +#endif + +.L16: + movapd ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L19 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO + negq %rax + ALIGN_3 + +.L17: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movapd -14 * SIZE(BO, %rax, 8), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movapd -12 * SIZE(BO, %rax, 8), %xmm1 + mulpd %xmm0, %xmm1 + mulpd -10 * SIZE(BO, %rax, 8), %xmm0 + addpd %xmm1, %xmm10 + movapd -16 * SIZE(BO, %rax, 8), %xmm1 + addpd %xmm0, %xmm11 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm12 + movapd -14 * SIZE(BO, %rax, 8), %xmm1 + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm13 + movapd -12 * SIZE(BO, %rax, 8), %xmm1 + mulpd %xmm2, %xmm1 + mulpd -10 * SIZE(BO, %rax, 8), %xmm2 + addpd %xmm1, %xmm14 + movapd -8 * SIZE(BO, %rax, 8), %xmm1 + addpd %xmm2, %xmm15 + movapd -10 * SIZE(AO, %rax, 4), %xmm2 + + addq $SIZE, %rax + jl .L17 + ALIGN_3 + +.L19: + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 + + movsd 4 * SIZE(CO1), %xmm2 + movhpd 5 * SIZE(CO1), %xmm2 + movsd 6 * SIZE(CO1), %xmm3 + movhpd 7 * SIZE(CO1), %xmm3 + + pshufd $0x44, %xmm8, %xmm4 + unpckhpd %xmm8, %xmm8 + pshufd $0x44, %xmm12, %xmm5 + unpckhpd %xmm12, %xmm12 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm8 + mulpd %xmm7, %xmm5 + mulpd %xmm7, %xmm12 + + addpd %xmm4, %xmm0 + addpd %xmm8, %xmm1 + addpd %xmm5, %xmm2 + addpd %xmm12, %xmm3 + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd %xmm2, 4 * SIZE(CO1) + movhpd %xmm2, 5 * SIZE(CO1) + movsd %xmm3, 6 * SIZE(CO1) + movhpd %xmm3, 7 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhpd 1 * SIZE(CO2), %xmm0 + movsd 2 * SIZE(CO2), %xmm1 + movhpd 3 * SIZE(CO2), %xmm1 + + movsd 4 * SIZE(CO2), %xmm2 + movhpd 5 * SIZE(CO2), %xmm2 + movsd 6 * SIZE(CO2), %xmm3 + movhpd 7 * SIZE(CO2), %xmm3 + + pshufd $0x44, %xmm9, %xmm4 + unpckhpd %xmm9, %xmm9 + pshufd $0x44, %xmm13, %xmm5 + 
unpckhpd %xmm13, %xmm13 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm9 + mulpd %xmm7, %xmm5 + mulpd %xmm7, %xmm13 + + addpd %xmm4, %xmm0 + addpd %xmm9, %xmm1 + addpd %xmm5, %xmm2 + addpd %xmm13, %xmm3 + + movsd %xmm0, 0 * SIZE(CO2) + movhpd %xmm0, 1 * SIZE(CO2) + movsd %xmm1, 2 * SIZE(CO2) + movhpd %xmm1, 3 * SIZE(CO2) + + movsd %xmm2, 4 * SIZE(CO2) + movhpd %xmm2, 5 * SIZE(CO2) + movsd %xmm3, 6 * SIZE(CO2) + movhpd %xmm3, 7 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm0 + movhpd 1 * SIZE(CO1, LDC, 2), %xmm0 + movsd 2 * SIZE(CO1, LDC, 2), %xmm1 + movhpd 3 * SIZE(CO1, LDC, 2), %xmm1 + + movsd 4 * SIZE(CO1, LDC, 2), %xmm2 + movhpd 5 * SIZE(CO1, LDC, 2), %xmm2 + movsd 6 * SIZE(CO1, LDC, 2), %xmm3 + movhpd 7 * SIZE(CO1, LDC, 2), %xmm3 + + pshufd $0x44, %xmm10, %xmm4 + unpckhpd %xmm10, %xmm10 + pshufd $0x44, %xmm14, %xmm5 + unpckhpd %xmm14, %xmm14 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm10 + mulpd %xmm7, %xmm5 + mulpd %xmm7, %xmm14 + + addpd %xmm4, %xmm0 + addpd %xmm10, %xmm1 + addpd %xmm5, %xmm2 + addpd %xmm14, %xmm3 + + movsd %xmm0, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm0, 1 * SIZE(CO1, LDC, 2) + movsd %xmm1, 2 * SIZE(CO1, LDC, 2) + movhpd %xmm1, 3 * SIZE(CO1, LDC, 2) + + movsd %xmm2, 4 * SIZE(CO1, LDC, 2) + movhpd %xmm2, 5 * SIZE(CO1, LDC, 2) + movsd %xmm3, 6 * SIZE(CO1, LDC, 2) + movhpd %xmm3, 7 * SIZE(CO1, LDC, 2) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm0 + movhpd 1 * SIZE(CO2, LDC, 2), %xmm0 + movsd 2 * SIZE(CO2, LDC, 2), %xmm1 + movhpd 3 * SIZE(CO2, LDC, 2), %xmm1 + + movsd 4 * SIZE(CO2, LDC, 2), %xmm2 + movhpd 5 * SIZE(CO2, LDC, 2), %xmm2 + movsd 6 * SIZE(CO2, LDC, 2), %xmm3 + movhpd 7 * SIZE(CO2, LDC, 2), %xmm3 + + pshufd $0x44, %xmm11, %xmm4 + unpckhpd %xmm11, %xmm11 + pshufd $0x44, %xmm15, %xmm5 + unpckhpd %xmm15, %xmm15 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm11 + mulpd %xmm7, %xmm5 + mulpd %xmm7, %xmm15 + + addpd %xmm4, %xmm0 + addpd %xmm11, %xmm1 + addpd %xmm5, %xmm2 + addpd %xmm15, %xmm3 + + movsd %xmm0, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm0, 1 * SIZE(CO2, LDC, 2) + movsd %xmm1, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm1, 3 * SIZE(CO2, LDC, 2) + + movsd %xmm2, 4 * SIZE(CO2, LDC, 2) + movhpd %xmm2, 5 * SIZE(CO2, LDC, 2) + movsd %xmm3, 6 * SIZE(CO2, LDC, 2) + movhpd %xmm3, 7 * SIZE(CO2, LDC, 2) + + addq $8 * SIZE, CO1 # coffset += 4 + addq $8 * SIZE, CO2 # coffset += 4 + decq I # i -- + BRANCH + jg .L11 + ALIGN_3 + +.L20: + testq $3, M + je .L39 + + testq $2, M + je .L30 + ALIGN_3 + +.L21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd 0 * SIZE(BO), %xmm1 + pxor %xmm9, %xmm9 + movapd -8 * SIZE(AO), %xmm2 + pxor %xmm10, %xmm10 + movapd 8 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + + movapd 16 * SIZE(BO), %xmm5 + movapd 24 * SIZE(BO), %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L25 + ALIGN_3 + +.L22: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movapd 2 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movapd 4 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + mulpd 6 * SIZE(BO), 
%xmm0 + addpd %xmm1, %xmm10 + movapd 32 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm11 + movapd -14 * SIZE(AO), %xmm0 + + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm8 + movapd 10 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm9 + movapd 12 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + mulpd 14 * SIZE(BO), %xmm0 + addpd %xmm3, %xmm10 + movapd 40 * SIZE(BO), %xmm3 + addpd %xmm0, %xmm11 + movapd -12 * SIZE(AO), %xmm0 + + mulpd %xmm0, %xmm5 + addpd %xmm5, %xmm8 + movapd 18 * SIZE(BO), %xmm5 + mulpd %xmm0, %xmm5 + addpd %xmm5, %xmm9 + movapd 20 * SIZE(BO), %xmm5 + mulpd %xmm0, %xmm5 + mulpd 22 * SIZE(BO), %xmm0 + addpd %xmm5, %xmm10 + movapd 48 * SIZE(BO), %xmm5 + addpd %xmm0, %xmm11 + movapd -10 * SIZE(AO), %xmm0 + + mulpd %xmm0, %xmm7 + addpd %xmm7, %xmm8 + movapd 26 * SIZE(BO), %xmm7 + mulpd %xmm0, %xmm7 + addpd %xmm7, %xmm9 + movapd 28 * SIZE(BO), %xmm7 + mulpd %xmm0, %xmm7 + mulpd 30 * SIZE(BO), %xmm0 + addpd %xmm7, %xmm10 + movapd 56 * SIZE(BO), %xmm7 + addpd %xmm0, %xmm11 + movapd 0 * SIZE(AO), %xmm0 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm8 + movapd 34 * SIZE(BO), %xmm1 + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm9 + movapd 36 * SIZE(BO), %xmm1 + mulpd %xmm2, %xmm1 + mulpd 38 * SIZE(BO), %xmm2 + addpd %xmm1, %xmm10 + movapd 64 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm11 + movapd -6 * SIZE(AO), %xmm2 + + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm8 + movapd 42 * SIZE(BO), %xmm3 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm9 + movapd 44 * SIZE(BO), %xmm3 + mulpd %xmm2, %xmm3 + mulpd 46 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm10 + movapd 72 * SIZE(BO), %xmm3 + addpd %xmm2, %xmm11 + movapd -4 * SIZE(AO), %xmm2 + + mulpd %xmm2, %xmm5 + addpd %xmm5, %xmm8 + movapd 50 * SIZE(BO), %xmm5 + mulpd %xmm2, %xmm5 + addpd %xmm5, %xmm9 + movapd 52 * SIZE(BO), %xmm5 + mulpd %xmm2, %xmm5 + mulpd 54 * SIZE(BO), %xmm2 + addpd %xmm5, %xmm10 + movapd 80 * SIZE(BO), %xmm5 + addpd %xmm2, %xmm11 + movapd -2 * SIZE(AO), %xmm2 + + mulpd %xmm2, %xmm7 + addpd %xmm7, %xmm8 + movapd 58 * SIZE(BO), %xmm7 + mulpd %xmm2, %xmm7 + addpd %xmm7, %xmm9 + movapd 60 * SIZE(BO), %xmm7 + mulpd %xmm2, %xmm7 + mulpd 62 * SIZE(BO), %xmm2 + addpd %xmm7, %xmm10 + movapd 88 * SIZE(BO), %xmm7 + addpd %xmm2, %xmm11 + movapd 8 * SIZE(AO), %xmm2 + + addq $16 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L22 + ALIGN_3 + +.L25: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movapd ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L29 + ALIGN_3 + +.L26: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movapd 2 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movapd 4 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + mulpd 6 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm10 + movapd 8 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm11 + movapd -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L26 + ALIGN_3 + +.L29: + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 + + pshufd $0x44, %xmm8, %xmm4 + unpckhpd %xmm8, %xmm8 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm8 + addpd %xmm4, %xmm0 + addpd %xmm8, %xmm1 + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhpd 1 * SIZE(CO2), %xmm0 + movsd 2 * SIZE(CO2), %xmm1 + movhpd 3 * SIZE(CO2), %xmm1 + + pshufd $0x44, %xmm9, %xmm4 + unpckhpd %xmm9, %xmm9 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm9 + addpd %xmm4, %xmm0 + addpd %xmm9, %xmm1 + + 
movsd %xmm0, 0 * SIZE(CO2) + movhpd %xmm0, 1 * SIZE(CO2) + movsd %xmm1, 2 * SIZE(CO2) + movhpd %xmm1, 3 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm0 + movhpd 1 * SIZE(CO1, LDC, 2), %xmm0 + movsd 2 * SIZE(CO1, LDC, 2), %xmm1 + movhpd 3 * SIZE(CO1, LDC, 2), %xmm1 + + pshufd $0x44, %xmm10, %xmm4 + unpckhpd %xmm10, %xmm10 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm10 + addpd %xmm4, %xmm0 + addpd %xmm10, %xmm1 + + movsd %xmm0, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm0, 1 * SIZE(CO1, LDC, 2) + movsd %xmm1, 2 * SIZE(CO1, LDC, 2) + movhpd %xmm1, 3 * SIZE(CO1, LDC, 2) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm0 + movhpd 1 * SIZE(CO2, LDC, 2), %xmm0 + movsd 2 * SIZE(CO2, LDC, 2), %xmm1 + movhpd 3 * SIZE(CO2, LDC, 2), %xmm1 + + pshufd $0x44, %xmm11, %xmm4 + unpckhpd %xmm11, %xmm11 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm11 + addpd %xmm4, %xmm0 + addpd %xmm11, %xmm1 + + movsd %xmm0, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm0, 1 * SIZE(CO2, LDC, 2) + movsd %xmm1, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm1, 3 * SIZE(CO2, LDC, 2) + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + ALIGN_3 + +.L30: + testq $1, M + je .L39 + ALIGN_3 + +.L31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + + movsd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movsd 0 * SIZE(BO), %xmm1 + pxor %xmm9, %xmm9 + movsd -8 * SIZE(AO), %xmm2 + pxor %xmm10, %xmm10 + movsd 8 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + + movsd 16 * SIZE(BO), %xmm5 + movsd 24 * SIZE(BO), %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L35 + ALIGN_3 + +.L32: + mulsd %xmm0, %xmm1 + addsd %xmm1, %xmm8 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movsd 2 * SIZE(BO), %xmm1 + mulsd %xmm0, %xmm1 + addsd %xmm1, %xmm9 + movsd 4 * SIZE(BO), %xmm1 + mulsd %xmm0, %xmm1 + mulsd 6 * SIZE(BO), %xmm0 + addsd %xmm1, %xmm10 + movsd 32 * SIZE(BO), %xmm1 + addsd %xmm0, %xmm11 + movsd -15 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm8 + movsd 10 * SIZE(BO), %xmm3 + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm9 + movsd 12 * SIZE(BO), %xmm3 + mulsd %xmm0, %xmm3 + mulsd 14 * SIZE(BO), %xmm0 + addsd %xmm3, %xmm10 + movsd 40 * SIZE(BO), %xmm3 + addsd %xmm0, %xmm11 + movsd -14 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm5 + addsd %xmm5, %xmm8 + movsd 18 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + addsd %xmm5, %xmm9 + movsd 20 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + mulsd 22 * SIZE(BO), %xmm0 + addsd %xmm5, %xmm10 + movsd 48 * SIZE(BO), %xmm5 + addsd %xmm0, %xmm11 + movsd -13 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm7 + addsd %xmm7, %xmm8 + movsd 26 * SIZE(BO), %xmm7 + mulsd %xmm0, %xmm7 + addsd %xmm7, %xmm9 + movsd 28 * SIZE(BO), %xmm7 + mulsd %xmm0, %xmm7 + mulsd 30 * SIZE(BO), %xmm0 + addsd %xmm7, %xmm10 + movsd 56 * SIZE(BO), %xmm7 + addsd %xmm0, %xmm11 + movsd -12 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm1 + addsd %xmm1, %xmm8 + movsd 34 * SIZE(BO), %xmm1 + mulsd %xmm0, %xmm1 + addsd %xmm1, %xmm9 + movsd 36 * SIZE(BO), %xmm1 + mulsd %xmm0, %xmm1 + mulsd 38 * SIZE(BO), %xmm0 + addsd %xmm1, %xmm10 + movsd 64 * SIZE(BO), %xmm1 + addsd %xmm0, %xmm11 + movsd -11 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm3 
+ addsd %xmm3, %xmm8 + movsd 42 * SIZE(BO), %xmm3 + mulsd %xmm0, %xmm3 + addsd %xmm3, %xmm9 + movsd 44 * SIZE(BO), %xmm3 + mulsd %xmm0, %xmm3 + mulsd 46 * SIZE(BO), %xmm0 + addsd %xmm3, %xmm10 + movsd 72 * SIZE(BO), %xmm3 + addsd %xmm0, %xmm11 + movsd -10 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm5 + addsd %xmm5, %xmm8 + movsd 50 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + addsd %xmm5, %xmm9 + movsd 52 * SIZE(BO), %xmm5 + mulsd %xmm0, %xmm5 + mulsd 54 * SIZE(BO), %xmm0 + addsd %xmm5, %xmm10 + movsd 80 * SIZE(BO), %xmm5 + addsd %xmm0, %xmm11 + movsd -9 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm7 + addsd %xmm7, %xmm8 + movsd 58 * SIZE(BO), %xmm7 + mulsd %xmm0, %xmm7 + addsd %xmm7, %xmm9 + movsd 60 * SIZE(BO), %xmm7 + mulsd %xmm0, %xmm7 + mulsd 62 * SIZE(BO), %xmm0 + addsd %xmm7, %xmm10 + movsd 88 * SIZE(BO), %xmm7 + addsd %xmm0, %xmm11 + movsd -8 * SIZE(AO), %xmm0 + + addq $ 8 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L32 + ALIGN_3 + +.L35: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movapd ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_3 + +.L36: + mulsd %xmm0, %xmm1 + addsd %xmm1, %xmm8 + movsd 2 * SIZE(BO), %xmm1 + mulsd %xmm0, %xmm1 + addsd %xmm1, %xmm9 + movsd 4 * SIZE(BO), %xmm1 + mulsd %xmm0, %xmm1 + mulsd 6 * SIZE(BO), %xmm0 + addsd %xmm1, %xmm10 + movsd 8 * SIZE(BO), %xmm1 + addsd %xmm0, %xmm11 + movsd -15 * SIZE(AO), %xmm0 + + addq $1 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L36 + ALIGN_3 + +.L38: + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + + pshufd $0x44, %xmm8, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhpd 1 * SIZE(CO2), %xmm0 + + pshufd $0x44, %xmm9, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + movsd %xmm0, 0 * SIZE(CO2) + movhpd %xmm0, 1 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm0 + movhpd 1 * SIZE(CO1, LDC, 2), %xmm0 + + pshufd $0x44, %xmm10, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + movsd %xmm0, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm0, 1 * SIZE(CO1, LDC, 2) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm0 + movhpd 1 * SIZE(CO2, LDC, 2), %xmm0 + + pshufd $0x44, %xmm11, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + movsd %xmm0, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm0, 1 * SIZE(CO2, LDC, 2) + ALIGN_3 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + + leaq (C, LDC, 4), C # c += 4 * ldc + decq J # j -- + jg .L01 + ALIGN_3 + +.L40: + testq $3, N + je .L999 + + testq $2, N + je .L80 + ALIGN_4 + +.L41: +/* Copying to Sub Buffer */ + leaq BUFFER, BO + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq K, %rax + sarq $2, %rax + jle .L43 + ALIGN_3 + +.L42: + PREFETCH 56 * SIZE(B) + + movq 0 * SIZE(B), %mm0 + movq 1 * SIZE(B), %mm1 + movq 2 * SIZE(B), %mm2 + movq 3 * SIZE(B), %mm3 + movq 4 * SIZE(B), %mm4 + movq 5 * SIZE(B), %mm5 + movq 6 * SIZE(B), %mm6 + movq 7 * SIZE(B), %mm7 + + addq $ 8 * SIZE, B + addq $16 * SIZE, BO + + movq %mm0, -16 * SIZE(BO) + movq %mm0, -15 * SIZE(BO) + movq %mm1, -14 * SIZE(BO) + movq %mm1, -13 * SIZE(BO) + movq %mm2, -12 * SIZE(BO) + movq %mm2, -11 * SIZE(BO) + movq %mm3, -10 * SIZE(BO) + movq %mm3, -9 * SIZE(BO) + movq %mm4, -8 * SIZE(BO) + movq %mm4, -7 * SIZE(BO) + movq %mm5, -6 * SIZE(BO) + movq %mm5, -5 * SIZE(BO) + movq %mm6, -4 * SIZE(BO) + movq %mm6, -3 * SIZE(BO) + movq %mm7, -2 * SIZE(BO) + movq %mm7, -1 * SIZE(BO) + + decq %rax + jne .L42 + ALIGN_3 + +.L43: + 
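+ /* K & 3 leftover: copy the remaining rows of the two-wide B panel, storing each element twice into the buffer */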
movq K, %rax + andq $3, %rax + BRANCH + jle .L50 + ALIGN_3 + +.L44: + movq 0 * SIZE(B), %mm0 + movq 1 * SIZE(B), %mm1 + + movq %mm0, 0 * SIZE(BO) + movq %mm0, 1 * SIZE(BO) + movq %mm1, 2 * SIZE(BO) + movq %mm1, 3 * SIZE(BO) + + addq $2 * SIZE, B + addq $4 * SIZE, BO + decq %rax + jne .L44 + ALIGN_3 + +.L50: + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L60 + ALIGN_3 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd 0 * SIZE(BO), %xmm1 + pxor %xmm9, %xmm9 + movapd -8 * SIZE(AO), %xmm2 + pxor %xmm12, %xmm12 + movapd 8 * SIZE(BO), %xmm3 + pxor %xmm13, %xmm13 + + movapd 0 * SIZE(AO), %xmm4 + movapd 16 * SIZE(BO), %xmm5 + movapd 8 * SIZE(AO), %xmm6 + movapd 24 * SIZE(BO), %xmm7 + + PREFETCHW 7 * SIZE(CO1) + PREFETCHW 7 * SIZE(CO2) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L55 + ALIGN_3 + +.L52: + mulpd %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd 2 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm8 + movapd 0 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm9 + movapd -14 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd 2 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm12 + movapd 4 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm13 + movapd -12 * SIZE(AO), %xmm0 + + mulpd %xmm0, %xmm1 + mulpd 6 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm8 + movapd 4 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm9 + movapd -10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd 6 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm12 + movapd 32 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm13 + movapd 16 * SIZE(AO), %xmm0 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm2, %xmm3 + mulpd 10 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm8 + movapd 8 * SIZE(BO), %xmm3 + addpd %xmm2, %xmm9 + movapd -6 * SIZE(AO), %xmm2 + mulpd %xmm2, %xmm3 + mulpd 10 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm12 + movapd 12 * SIZE(BO), %xmm3 + addpd %xmm2, %xmm13 + movapd -4 * SIZE(AO), %xmm2 + + mulpd %xmm2, %xmm3 + mulpd 14 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm8 + movapd 12 * SIZE(BO), %xmm3 + addpd %xmm2, %xmm9 + movapd -2 * SIZE(AO), %xmm2 + mulpd %xmm2, %xmm3 + mulpd 14 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm12 + movapd 40 * SIZE(BO), %xmm3 + addpd %xmm2, %xmm13 + movapd 24 * SIZE(AO), %xmm2 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + mulpd %xmm4, %xmm5 + mulpd 18 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm8 + movapd 16 * SIZE(BO), %xmm5 + addpd %xmm4, %xmm9 + movapd 2 * SIZE(AO), %xmm4 + mulpd %xmm4, %xmm5 + mulpd 18 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm12 + movapd 20 * SIZE(BO), %xmm5 + addpd %xmm4, %xmm13 + movapd 4 * SIZE(AO), %xmm4 + + mulpd %xmm4, %xmm5 + mulpd 22 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm8 + movapd 20 * SIZE(BO), %xmm5 + addpd %xmm4, %xmm9 + movapd 6 * SIZE(AO), %xmm4 + mulpd %xmm4, %xmm5 + mulpd 22 * SIZE(BO), %xmm4 + addpd %xmm5, %xmm12 + movapd 48 * SIZE(BO), %xmm5 + addpd %xmm4, %xmm13 + movapd 32 * SIZE(AO), %xmm4 + + PREFETCH (PREFETCHSIZE + 24) * SIZE(AO) + mulpd %xmm6, %xmm7 + mulpd 26 * SIZE(BO), %xmm6 + addpd 
%xmm7, %xmm8 + movapd 24 * SIZE(BO), %xmm7 + addpd %xmm6, %xmm9 + movapd 10 * SIZE(AO), %xmm6 + mulpd %xmm6, %xmm7 + mulpd 26 * SIZE(BO), %xmm6 + addpd %xmm7, %xmm12 + movapd 28 * SIZE(BO), %xmm7 + addpd %xmm6, %xmm13 + movapd 12 * SIZE(AO), %xmm6 + + mulpd %xmm6, %xmm7 + mulpd 30 * SIZE(BO), %xmm6 + addpd %xmm7, %xmm8 + movapd 28 * SIZE(BO), %xmm7 + addpd %xmm6, %xmm9 + movapd 14 * SIZE(AO), %xmm6 + mulpd %xmm6, %xmm7 + mulpd 30 * SIZE(BO), %xmm6 + addpd %xmm7, %xmm12 + movapd 56 * SIZE(BO), %xmm7 + addpd %xmm6, %xmm13 + movapd 40 * SIZE(AO), %xmm6 + + addq $32 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L52 + ALIGN_3 + +.L55: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movapd ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L59 + ALIGN_3 + +.L56: + movapd 0 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + mulpd 2 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm9 + movapd -14 * SIZE(AO), %xmm0 + movapd 0 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm12 + mulpd 2 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm13 + movapd -12 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L56 + ALIGN_3 + +.L59: + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 + + movsd 4 * SIZE(CO1), %xmm2 + movhpd 5 * SIZE(CO1), %xmm2 + movsd 6 * SIZE(CO1), %xmm3 + movhpd 7 * SIZE(CO1), %xmm3 + + pshufd $0x44, %xmm8, %xmm4 + unpckhpd %xmm8, %xmm8 + pshufd $0x44, %xmm12, %xmm5 + unpckhpd %xmm12, %xmm12 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm8 + mulpd %xmm7, %xmm5 + mulpd %xmm7, %xmm12 + + addpd %xmm4, %xmm0 + addpd %xmm8, %xmm1 + addpd %xmm5, %xmm2 + addpd %xmm12, %xmm3 + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd %xmm2, 4 * SIZE(CO1) + movhpd %xmm2, 5 * SIZE(CO1) + movsd %xmm3, 6 * SIZE(CO1) + movhpd %xmm3, 7 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhpd 1 * SIZE(CO2), %xmm0 + movsd 2 * SIZE(CO2), %xmm1 + movhpd 3 * SIZE(CO2), %xmm1 + + movsd 4 * SIZE(CO2), %xmm2 + movhpd 5 * SIZE(CO2), %xmm2 + movsd 6 * SIZE(CO2), %xmm3 + movhpd 7 * SIZE(CO2), %xmm3 + + pshufd $0x44, %xmm9, %xmm4 + unpckhpd %xmm9, %xmm9 + pshufd $0x44, %xmm13, %xmm5 + unpckhpd %xmm13, %xmm13 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm9 + mulpd %xmm7, %xmm5 + mulpd %xmm7, %xmm13 + + addpd %xmm4, %xmm0 + addpd %xmm9, %xmm1 + addpd %xmm5, %xmm2 + addpd %xmm13, %xmm3 + + movsd %xmm0, 0 * SIZE(CO2) + movhpd %xmm0, 1 * SIZE(CO2) + movsd %xmm1, 2 * SIZE(CO2) + movhpd %xmm1, 3 * SIZE(CO2) + + movsd %xmm2, 4 * SIZE(CO2) + movhpd %xmm2, 5 * SIZE(CO2) + movsd %xmm3, 6 * SIZE(CO2) + movhpd %xmm3, 7 * SIZE(CO2) + + addq $8 * SIZE, CO1 # coffset += 4 + addq $8 * SIZE, CO2 # coffset += 4 + decq I # i -- + jg .L51 + ALIGN_3 + +.L60: + testq $2, M + je .L70 + ALIGN_3 + +.L61: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd 0 * SIZE(BO), %xmm1 + pxor %xmm9, %xmm9 + movapd -8 * SIZE(AO), %xmm2 + pxor %xmm10, %xmm10 + movapd 8 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + + movapd 16 * SIZE(BO), %xmm5 + movapd 24 * SIZE(BO), %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && 
!defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L65 + ALIGN_3 + +.L62: + mulpd %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd 2 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm8 + movapd 4 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm9 + movapd -14 * SIZE(AO), %xmm0 + + mulpd %xmm0, %xmm1 + mulpd 6 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm10 + movapd 32 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm11 + movapd -12 * SIZE(AO), %xmm0 + + mulpd %xmm0, %xmm3 + mulpd 10 * SIZE(BO), %xmm0 + addpd %xmm3, %xmm8 + movapd 12 * SIZE(BO), %xmm3 + addpd %xmm0, %xmm9 + movapd -10 * SIZE(AO), %xmm0 + + mulpd %xmm0, %xmm3 + mulpd 14 * SIZE(BO), %xmm0 + addpd %xmm3, %xmm10 + movapd 40 * SIZE(BO), %xmm3 + addpd %xmm0, %xmm11 + movapd 0 * SIZE(AO), %xmm0 + + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm2, %xmm5 + mulpd 18 * SIZE(BO), %xmm2 + addpd %xmm5, %xmm8 + movapd 20 * SIZE(BO), %xmm5 + addpd %xmm2, %xmm9 + movapd -6 * SIZE(AO), %xmm2 + + mulpd %xmm2, %xmm5 + mulpd 22 * SIZE(BO), %xmm2 + addpd %xmm5, %xmm10 + movapd 48 * SIZE(BO), %xmm5 + addpd %xmm2, %xmm11 + movapd -4 * SIZE(AO), %xmm2 + + mulpd %xmm2, %xmm7 + mulpd 26 * SIZE(BO), %xmm2 + addpd %xmm7, %xmm8 + movapd 28 * SIZE(BO), %xmm7 + addpd %xmm2, %xmm9 + movapd -2 * SIZE(AO), %xmm2 + + mulpd %xmm2, %xmm7 + mulpd 30 * SIZE(BO), %xmm2 + addpd %xmm7, %xmm10 + movapd 56 * SIZE(BO), %xmm7 + addpd %xmm2, %xmm11 + movapd 8 * SIZE(AO), %xmm2 + + addq $16 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L62 + ALIGN_3 + +.L65: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movapd ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L69 + ALIGN_3 + +.L66: + mulpd %xmm0, %xmm1 + mulpd 2 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm8 + movapd 4 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm9 + movapd -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L66 + ALIGN_3 + +.L69: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 + + pshufd $0x44, %xmm8, %xmm4 + unpckhpd %xmm8, %xmm8 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm8 + addpd %xmm4, %xmm0 + addpd %xmm8, %xmm1 + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhpd 1 * SIZE(CO2), %xmm0 + movsd 2 * SIZE(CO2), %xmm1 + movhpd 3 * SIZE(CO2), %xmm1 + + pshufd $0x44, %xmm9, %xmm4 + unpckhpd %xmm9, %xmm9 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm9 + addpd %xmm4, %xmm0 + addpd %xmm9, %xmm1 + + movsd %xmm0, 0 * SIZE(CO2) + movhpd %xmm0, 1 * SIZE(CO2) + movsd %xmm1, 2 * SIZE(CO2) + movhpd %xmm1, 3 * SIZE(CO2) + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_3 + +.L70: + testq $1, M + je .L79 + ALIGN_3 + +.L71: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + movsd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movsd 0 * SIZE(BO), %xmm1 + pxor %xmm9, %xmm9 + movsd -12 * SIZE(AO), %xmm2 + pxor %xmm10, %xmm10 + movsd 8 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + + movsd 16 * 
SIZE(BO), %xmm5 + movsd 24 * SIZE(BO), %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L75 + ALIGN_3 + +.L72: + mulsd %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulsd 2 * SIZE(BO), %xmm0 + addsd %xmm1, %xmm8 + movsd 4 * SIZE(BO), %xmm1 + addsd %xmm0, %xmm9 + movsd -15 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm1 + mulsd 6 * SIZE(BO), %xmm0 + addsd %xmm1, %xmm10 + movsd 32 * SIZE(BO), %xmm1 + addsd %xmm0, %xmm11 + movsd -14 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm3 + mulsd 10 * SIZE(BO), %xmm0 + addsd %xmm3, %xmm8 + movsd 12 * SIZE(BO), %xmm3 + addsd %xmm0, %xmm9 + movsd -13 * SIZE(AO), %xmm0 + + mulsd %xmm0, %xmm3 + mulsd 14 * SIZE(BO), %xmm0 + addsd %xmm3, %xmm10 + movsd 40 * SIZE(BO), %xmm3 + addsd %xmm0, %xmm11 + movsd -8 * SIZE(AO), %xmm0 + + mulsd %xmm2, %xmm5 + mulsd 18 * SIZE(BO), %xmm2 + addsd %xmm5, %xmm8 + movsd 20 * SIZE(BO), %xmm5 + addsd %xmm2, %xmm9 + movsd -11 * SIZE(AO), %xmm2 + + mulsd %xmm2, %xmm5 + mulsd 22 * SIZE(BO), %xmm2 + addsd %xmm5, %xmm10 + movsd 48 * SIZE(BO), %xmm5 + addsd %xmm2, %xmm11 + movsd -10 * SIZE(AO), %xmm2 + + mulsd %xmm2, %xmm7 + mulsd 26 * SIZE(BO), %xmm2 + addsd %xmm7, %xmm8 + movsd 28 * SIZE(BO), %xmm7 + addsd %xmm2, %xmm9 + movsd -9 * SIZE(AO), %xmm2 + + mulsd %xmm2, %xmm7 + mulsd 30 * SIZE(BO), %xmm2 + addsd %xmm7, %xmm10 + movsd 56 * SIZE(BO), %xmm7 + addsd %xmm2, %xmm11 + movsd -4 * SIZE(AO), %xmm2 + + addq $ 8 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L72 + ALIGN_3 + +.L75: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movapd ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_3 + +.L76: + mulsd %xmm0, %xmm1 + mulsd 2 * SIZE(BO), %xmm0 + addsd %xmm1, %xmm8 + addsd %xmm0, %xmm9 + movsd -15 * SIZE(AO), %xmm0 + movsd 4 * SIZE(BO), %xmm1 + + addq $1 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L76 + ALIGN_3 + +.L78: + addsd %xmm10, %xmm8 + addsd %xmm11, %xmm9 + + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + + pshufd $0x44, %xmm8, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhpd 1 * SIZE(CO2), %xmm0 + + pshufd $0x44, %xmm9, %xmm4 + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + movsd %xmm0, 0 * SIZE(CO2) + movhpd %xmm0, 1 * SIZE(CO2) + ALIGN_3 + +.L79: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + leaq (C, LDC, 2), C + ALIGN_3 + +.L80: + testq $1, N + je .L999 + ALIGN_4 + +.L81: +/* Copying to Sub Buffer */ + leaq BUFFER, BO + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq K, %rax + sarq $3, %rax + jle .L83 + ALIGN_3 + +.L82: + PREFETCH 56 * SIZE(B) + + movq 0 * SIZE(B), %mm0 + movq 1 * SIZE(B), %mm1 + movq 2 * SIZE(B), %mm2 + movq 3 * SIZE(B), %mm3 + movq 4 * SIZE(B), %mm4 + movq 5 * SIZE(B), %mm5 + movq 6 * SIZE(B), %mm6 + movq 7 * SIZE(B), %mm7 + + addq $ 8 * SIZE, B + addq $16 * SIZE, BO + + movq %mm0, -16 * SIZE(BO) + movq %mm0, -15 * SIZE(BO) + movq %mm1, -14 * SIZE(BO) + movq %mm1, -13 * SIZE(BO) + movq %mm2, -12 * SIZE(BO) + movq %mm2, -11 * SIZE(BO) + movq %mm3, -10 * SIZE(BO) + movq %mm3, -9 * SIZE(BO) + movq %mm4, -8 * SIZE(BO) + movq %mm4, -7 * SIZE(BO) + movq %mm5, -6 * SIZE(BO) + movq %mm5, -5 * SIZE(BO) + movq 
%mm6, -4 * SIZE(BO) + movq %mm6, -3 * SIZE(BO) + movq %mm7, -2 * SIZE(BO) + movq %mm7, -1 * SIZE(BO) + + decq %rax + jne .L82 + ALIGN_3 + +.L83: + movq K, %rax + andq $7, %rax + BRANCH + jle .L90 + ALIGN_3 + +.L84: + movq 0 * SIZE(B), %mm0 + + movq %mm0, 0 * SIZE(BO) + movq %mm0, 1 * SIZE(BO) + + addq $1 * SIZE, B + addq $2 * SIZE, BO + decq %rax + jne .L84 + ALIGN_3 + +.L90: + movq C, CO1 # coffset1 = c + movq A, AO # aoffset = a + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L100 + ALIGN_3 + +.L91: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd 0 * SIZE(BO), %xmm1 + pxor %xmm9, %xmm9 + movapd -8 * SIZE(AO), %xmm2 + pxor %xmm10, %xmm10 + movapd 8 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + + movapd 0 * SIZE(AO), %xmm4 + movapd 8 * SIZE(AO), %xmm6 + + PREFETCHW 7 * SIZE(CO1) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L95 + ALIGN_3 + +.L92: + mulpd %xmm1, %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd -14 * SIZE(AO), %xmm1 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movapd 2 * SIZE(BO), %xmm1 + mulpd %xmm1, %xmm0 + mulpd -10 * SIZE(AO), %xmm1 + addpd %xmm0, %xmm10 + movapd 16 * SIZE(AO), %xmm0 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + addpd %xmm1, %xmm11 + movapd 4 * SIZE(BO), %xmm1 + mulpd %xmm1, %xmm2 + mulpd -6 * SIZE(AO), %xmm1 + addpd %xmm2, %xmm8 + movapd -4 * SIZE(AO), %xmm2 + addpd %xmm1, %xmm9 + movapd 6 * SIZE(BO), %xmm1 + mulpd %xmm1, %xmm2 + mulpd -2 * SIZE(AO), %xmm1 + addpd %xmm2, %xmm10 + movapd 24 * SIZE(AO), %xmm2 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + addpd %xmm1, %xmm11 + movapd 16 * SIZE(BO), %xmm1 + mulpd %xmm3, %xmm4 + mulpd 2 * SIZE(AO), %xmm3 + addpd %xmm4, %xmm8 + movapd 4 * SIZE(AO), %xmm4 + addpd %xmm3, %xmm9 + movapd 10 * SIZE(BO), %xmm3 + mulpd %xmm3, %xmm4 + mulpd 6 * SIZE(AO), %xmm3 + addpd %xmm4, %xmm10 + movapd 32 * SIZE(AO), %xmm4 + PREFETCH (PREFETCHSIZE + 24) * SIZE(AO) + addpd %xmm3, %xmm11 + movapd 12 * SIZE(BO), %xmm3 + mulpd %xmm3, %xmm6 + mulpd 10 * SIZE(AO), %xmm3 + addpd %xmm6, %xmm8 + movapd 12 * SIZE(AO), %xmm6 + addpd %xmm3, %xmm9 + movapd 14 * SIZE(BO), %xmm3 + mulpd %xmm3, %xmm6 + mulpd 14 * SIZE(AO), %xmm3 + addpd %xmm6, %xmm10 + movapd 40 * SIZE(AO), %xmm6 + addpd %xmm3, %xmm11 + movapd 24 * SIZE(BO), %xmm3 + + addq $32 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L92 + ALIGN_3 + +.L95: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movapd ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L99 + ALIGN_3 + +.L96: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO), %xmm1 + addpd %xmm0, %xmm8 + movapd -12 * SIZE(AO), %xmm0 + addpd %xmm1, %xmm9 + movapd 2 * SIZE(BO), %xmm1 + + addq $4 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L96 + ALIGN_3 + +.L99: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 + + movsd 4 * SIZE(CO1), %xmm2 + movhpd 5 * SIZE(CO1), %xmm2 + 
movsd 6 * SIZE(CO1), %xmm3 + movhpd 7 * SIZE(CO1), %xmm3 + + pshufd $0x44, %xmm8, %xmm4 + unpckhpd %xmm8, %xmm8 + pshufd $0x44, %xmm9, %xmm5 + unpckhpd %xmm9, %xmm9 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm8 + mulpd %xmm7, %xmm5 + mulpd %xmm7, %xmm9 + + addpd %xmm4, %xmm0 + addpd %xmm8, %xmm1 + addpd %xmm5, %xmm2 + addpd %xmm9, %xmm3 + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + movsd %xmm2, 4 * SIZE(CO1) + movhpd %xmm2, 5 * SIZE(CO1) + movsd %xmm3, 6 * SIZE(CO1) + movhpd %xmm3, 7 * SIZE(CO1) + + addq $8 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L91 + ALIGN_3 + +.L100: + testq $2, M + je .L110 + ALIGN_3 + +.L101: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd 0 * SIZE(BO), %xmm1 + pxor %xmm9, %xmm9 + movapd -8 * SIZE(AO), %xmm2 + pxor %xmm10, %xmm10 + movapd 8 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L105 + ALIGN_3 + +.L102: + mulpd %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movapd -14 * SIZE(AO), %xmm0 + mulpd 2 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm8 + movapd 16 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm9 + movapd -12 * SIZE(AO), %xmm0 + mulpd 4 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm10 + movapd -10 * SIZE(AO), %xmm0 + mulpd 6 * SIZE(BO), %xmm0 + addpd %xmm0, %xmm11 + movapd 0 * SIZE(AO), %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd %xmm2, %xmm3 + movapd -6 * SIZE(AO), %xmm2 + mulpd 10 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm8 + movapd 24 * SIZE(BO), %xmm3 + addpd %xmm2, %xmm9 + movapd -4 * SIZE(AO), %xmm2 + mulpd 12 * SIZE(BO), %xmm2 + addpd %xmm2, %xmm10 + movapd -2 * SIZE(AO), %xmm2 + mulpd 14 * SIZE(BO), %xmm2 + addpd %xmm2, %xmm11 + movapd 8 * SIZE(AO), %xmm2 + + addq $16 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L102 + ALIGN_3 + +.L105: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movapd ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L109 + ALIGN_3 + +.L106: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movapd -14 * SIZE(AO), %xmm0 + movapd 2 * SIZE(BO), %xmm1 + + addq $2 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L106 + ALIGN_3 + +.L109: + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 + addpd %xmm10, %xmm8 + + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 + + pshufd $0x44, %xmm8, %xmm4 + unpckhpd %xmm8, %xmm8 + + mulpd %xmm7, %xmm4 + mulpd %xmm7, %xmm8 + addpd %xmm4, %xmm0 + addpd %xmm8, %xmm1 + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm1, 2 * SIZE(CO1) + movhpd %xmm1, 3 * SIZE(CO1) + + addq $4 * SIZE, CO1 + ALIGN_3 + +.L110: + testq $1, M + je .L999 + ALIGN_3 + +.L111: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), 
%rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + + movsd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movsd 0 * SIZE(BO), %xmm1 + pxor %xmm9, %xmm9 + movsd -12 * SIZE(AO), %xmm2 + pxor %xmm10, %xmm10 + movsd 8 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L115 + ALIGN_3 + +.L112: + mulsd %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movsd -15 * SIZE(AO), %xmm0 + addsd %xmm1, %xmm8 + movsd 16 * SIZE(BO), %xmm1 + mulsd 2 * SIZE(BO), %xmm0 + addsd %xmm0, %xmm9 + movsd -14 * SIZE(AO), %xmm0 + mulsd 4 * SIZE(BO), %xmm0 + addsd %xmm0, %xmm10 + movsd -13 * SIZE(AO), %xmm0 + mulsd 6 * SIZE(BO), %xmm0 + addsd %xmm0, %xmm11 + movsd -8 * SIZE(AO), %xmm0 + mulsd %xmm2, %xmm3 + movsd -11 * SIZE(AO), %xmm2 + addsd %xmm3, %xmm8 + movsd 24 * SIZE(BO), %xmm3 + mulsd 10 * SIZE(BO), %xmm2 + addsd %xmm2, %xmm9 + movsd -10 * SIZE(AO), %xmm2 + mulsd 12 * SIZE(BO), %xmm2 + addsd %xmm2, %xmm10 + movsd -9 * SIZE(AO), %xmm2 + mulsd 14 * SIZE(BO), %xmm2 + addsd %xmm2, %xmm11 + movsd -4 * SIZE(AO), %xmm2 + + addq $ 8 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L112 + ALIGN_3 + +.L115: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movapd ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_3 + +.L116: + mulsd %xmm0, %xmm1 + movsd -15 * SIZE(AO), %xmm0 + addsd %xmm1, %xmm8 + movsd 2 * SIZE(BO), %xmm1 + + addq $1 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L116 + ALIGN_3 + +.L118: + addsd %xmm10, %xmm8 + addsd %xmm11, %xmm9 + addsd %xmm9, %xmm8 + + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + + pshufd $0x44, %xmm8, %xmm4 + + mulpd %xmm7, %xmm4 + addpd %xmm4, %xmm0 + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + ALIGN_3 + +.L999: + movq %rbx, %rsp + + EMMS + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm3m_kernel_4x4_sse3.S b/kernel/x86_64/zgemm3m_kernel_4x4_sse3.S new file mode 100644 index 0000000000..73f5fcef5e --- /dev/null +++ b/kernel/x86_64/zgemm3m_kernel_4x4_sse3.S @@ -0,0 +1,2622 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %rdi +#define N %rsi +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define J %r12 +#define AO %r13 +#define BO %r14 +#define CO1 %r15 +#define CO2 %rbx +#define BB %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define OFFSET 64(%rsp) +#define KKK 72(%rsp) +#define KK 80(%rsp) + +#else + +#define STACKSIZE 512 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#define ALPHA_R 224(%rsp) +#define ALPHA_I 232(%rsp) +#define OFFSET 240(%rsp) +#define KK 248(%rsp) +#define KKK 256(%rsp) + +#endif + +#define PREFETCH prefetcht2 +#define PREFETCHSIZE (16 * 12 + 3) + +#define KERNEL1(address) \ + mulpd %xmm8, %xmm9 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 2 * SIZE(AO);\ + addpd %xmm9, %xmm0;\ + movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm1;\ + movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm2;\ + movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 2 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + addpd %xmm9, %xmm3;\ + movddup 0 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL2(address) \ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm4;\ + movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm5;\ + movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm6;\ + movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 4 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + addpd %xmm9, %xmm7;\ + movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL3(address) \ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm0;\ + movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm1;\ + movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm2;\ + movddup 7 * SIZE + (address) 
* 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 6 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + addpd %xmm9, %xmm3;\ + movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL4(address) \ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm4;\ + movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm5;\ + movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + addpd %xmm9, %xmm6;\ + movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 32 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + addpd %xmm9, %xmm7;\ + movddup 32 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL5(address) \ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm0;\ + movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm1;\ + movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm2;\ + movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 10 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + addpd %xmm11, %xmm3;\ + movddup 8 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL6(address) \ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm4;\ + movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm5;\ + movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm6;\ + movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 12 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + addpd %xmm11, %xmm7;\ + movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL7(address) \ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm0;\ + movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm1;\ + movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm2;\ + movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 14 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + addpd %xmm11, %xmm3;\ + movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL8(address) \ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm4;\ + movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm5;\ + movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + addpd %xmm11, %xmm6;\ + movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 40 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + addpd %xmm11, %xmm7;\ + movddup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL9(address) \ + mulpd %xmm12, %xmm13;\ + PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 2 * SIZE(AO);\ + addpd %xmm13, %xmm0;\ + movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm1;\ + movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm2;\ + movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 18 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + addpd %xmm13, %xmm3;\ + movddup 16 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL10(address) \ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm4;\ + movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm5;\ + movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, 
%xmm6;\ + movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 20 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + addpd %xmm13, %xmm7;\ + movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL11(address) \ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm0;\ + movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm1;\ + movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm2;\ + movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 22 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + addpd %xmm13, %xmm3;\ + movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL12(address) \ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm4;\ + movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm5;\ + movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + addpd %xmm13, %xmm6;\ + movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 48 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + addpd %xmm13, %xmm7;\ + movddup 48 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL13(address) \ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm0;\ + movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm1;\ + movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm2;\ + movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 26 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + addpd %xmm15, %xmm3;\ + movddup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL14(address) \ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm4;\ + movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm5;\ + movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm6;\ + movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 28 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + addpd %xmm15, %xmm7;\ + movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL15(address) \ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm0;\ + movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm1;\ + movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm2;\ + movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 30 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + addpd %xmm15, %xmm3;\ + movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL16(address) \ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm4;\ + movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm5;\ + movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + addpd %xmm15, %xmm6;\ + movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 56 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + addpd %xmm15, %xmm7;\ + movddup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#if defined(OS_LINUX) && defined(CORE_BARCELONA) + .align 32768 +#endif + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI 
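+ /* Windows x64 ABI: %rdi, %rsi and %xmm6-%xmm15 are callee-saved, so they are spilled here as well */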
+ movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, M + movq ARG2, N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + + movaps %xmm3, %xmm0 + movsd OLD_ALPHA_I, %xmm1 +#else + movq OLD_LDC, LDC +#endif + + movsd %xmm0, ALPHA_R + movsd %xmm1, ALPHA_I + + salq $ZBASE_SHIFT, LDC + + movq N, J + sarq $2, J # j = (n >> 2) + jle .L40 + ALIGN_4 + +.L10: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + leaq (, K, 4), BB + leaq (B, BB, SIZE), BB + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: + prefetcht0 0 * SIZE(BB) + prefetcht0 8 * SIZE(BB) + subq $-8 * SIZE, BB + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + + movapd 16 * SIZE(AO), %xmm12 + pxor %xmm4, %xmm4 + movddup 16 * SIZE(BO), %xmm13 + pxor %xmm5, %xmm5 + movapd 24 * SIZE(AO), %xmm14 + pxor %xmm6, %xmm6 + movddup 24 * SIZE(BO), %xmm15 + pxor %xmm7, %xmm7 + + prefetchnta 7 * SIZE(CO1) + prefetchnta 7 * SIZE(CO2) + prefetchnta 7 * SIZE(CO1, LDC, 2) + prefetchnta 7 * SIZE(CO2, LDC, 2) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + +#if 1 + andq $-8, %rax + salq $4, %rax + NOBRANCH + je .L15 + +.L1X: + KERNEL1 (16 * 0) + KERNEL2 (16 * 0) + KERNEL3 (16 * 0) + KERNEL4 (16 * 0) + KERNEL5 (16 * 0) + KERNEL6 (16 * 0) + KERNEL7 (16 * 0) + KERNEL8 (16 * 0) + KERNEL9 (16 * 0) + KERNEL10(16 * 0) + KERNEL11(16 * 0) + KERNEL12(16 * 0) + KERNEL13(16 * 0) + KERNEL14(16 * 0) + KERNEL15(16 * 0) + KERNEL16(16 * 0) + cmpq $128 * 1, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 1) + KERNEL2 (16 * 1) + KERNEL3 (16 * 1) + KERNEL4 (16 * 1) + KERNEL5 (16 * 1) + KERNEL6 (16 * 1) + KERNEL7 (16 * 1) + KERNEL8 (16 * 1) + KERNEL9 (16 * 1) + KERNEL10(16 * 1) + KERNEL11(16 * 1) + KERNEL12(16 * 1) + KERNEL13(16 * 1) + KERNEL14(16 * 1) + KERNEL15(16 * 1) + KERNEL16(16 * 1) + cmpq $128 * 2, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 2) + KERNEL2 (16 * 2) + KERNEL3 (16 * 2) + KERNEL4 (16 * 2) + KERNEL5 (16 * 2) + KERNEL6 (16 * 2) + KERNEL7 (16 * 2) + KERNEL8 (16 * 2) + KERNEL9 (16 * 2) + KERNEL10(16 * 2) + KERNEL11(16 * 2) + KERNEL12(16 * 2) + KERNEL13(16 * 2) + KERNEL14(16 * 2) + KERNEL15(16 * 2) + KERNEL16(16 * 2) + cmpq $128 * 3, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 3) + KERNEL2 (16 * 3) + KERNEL3 (16 * 3) + KERNEL4 (16 * 3) + KERNEL5 (16 * 3) + KERNEL6 (16 * 3) + KERNEL7 (16 * 3) + KERNEL8 (16 * 3) + KERNEL9 (16 * 3) + KERNEL10(16 * 3) + KERNEL11(16 * 3) + KERNEL12(16 * 3) + KERNEL13(16 * 3) + KERNEL14(16 * 3) + KERNEL15(16 * 3) + KERNEL16(16 * 3) 
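+ /* %rax = 16 * (K & ~7); each KERNEL1..KERNEL16 group consumes 128 of it (eight k iterations), so these compares decide how many more unrolled groups to run */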
+ cmpq $128 * 4, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 4) + KERNEL2 (16 * 4) + KERNEL3 (16 * 4) + KERNEL4 (16 * 4) + KERNEL5 (16 * 4) + KERNEL6 (16 * 4) + KERNEL7 (16 * 4) + KERNEL8 (16 * 4) + KERNEL9 (16 * 4) + KERNEL10(16 * 4) + KERNEL11(16 * 4) + KERNEL12(16 * 4) + KERNEL13(16 * 4) + KERNEL14(16 * 4) + KERNEL15(16 * 4) + KERNEL16(16 * 4) + cmpq $128 * 5, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 5) + KERNEL2 (16 * 5) + KERNEL3 (16 * 5) + KERNEL4 (16 * 5) + KERNEL5 (16 * 5) + KERNEL6 (16 * 5) + KERNEL7 (16 * 5) + KERNEL8 (16 * 5) + KERNEL9 (16 * 5) + KERNEL10(16 * 5) + KERNEL11(16 * 5) + KERNEL12(16 * 5) + KERNEL13(16 * 5) + KERNEL14(16 * 5) + KERNEL15(16 * 5) + KERNEL16(16 * 5) + cmpq $128 * 6, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 6) + KERNEL2 (16 * 6) + KERNEL3 (16 * 6) + KERNEL4 (16 * 6) + KERNEL5 (16 * 6) + KERNEL6 (16 * 6) + KERNEL7 (16 * 6) + KERNEL8 (16 * 6) + KERNEL9 (16 * 6) + KERNEL10(16 * 6) + KERNEL11(16 * 6) + KERNEL12(16 * 6) + KERNEL13(16 * 6) + KERNEL14(16 * 6) + KERNEL15(16 * 6) + KERNEL16(16 * 6) + cmpq $128 * 7, %rax + NOBRANCH + jle .L12 + KERNEL1 (16 * 7) + KERNEL2 (16 * 7) + KERNEL3 (16 * 7) + KERNEL4 (16 * 7) + KERNEL5 (16 * 7) + KERNEL6 (16 * 7) + KERNEL7 (16 * 7) + KERNEL8 (16 * 7) + KERNEL9 (16 * 7) + KERNEL10(16 * 7) + KERNEL11(16 * 7) + KERNEL12(16 * 7) + KERNEL13(16 * 7) + KERNEL14(16 * 7) + KERNEL15(16 * 7) + KERNEL16(16 * 7) + + addq $32 * 8 * SIZE, AO + addq $32 * 8 * SIZE, BO + subq $128 * 8, %rax + BRANCH + jg .L1X + +.L12: + leaq (AO, %rax, 2), AO # * 16 + leaq (BO, %rax, 2), BO # * 64 + +#else + sarq $3, %rax + je .L15 + ALIGN_4 + +.L12: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm5 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm6 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm7 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 6 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm5 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm6 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 32 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm7 + + movddup 32 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 10 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + + movddup 8 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm4 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm5 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm6 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 12 * SIZE(AO), %xmm10 + 
addpd %xmm11, %xmm7 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 14 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm4 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm5 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm6 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 40 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm7 + movddup 40 * SIZE(BO), %xmm11 + + mulpd %xmm12, %xmm13 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + addpd %xmm13, %xmm0 + movddup 17 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm1 + movddup 18 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm2 + movddup 19 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + movapd 18 * SIZE(AO), %xmm12 + addpd %xmm13, %xmm3 + + movddup 16 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm4 + movddup 17 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm5 + movddup 18 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm6 + movddup 19 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + movapd 20 * SIZE(AO), %xmm12 + addpd %xmm13, %xmm7 + + movddup 20 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm0 + movddup 21 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm1 + movddup 22 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm2 + movddup 23 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + movapd 22 * SIZE(AO), %xmm12 + addpd %xmm13, %xmm3 + + movddup 20 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm4 + movddup 21 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm5 + movddup 22 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + addpd %xmm13, %xmm6 + movddup 23 * SIZE(BO), %xmm13 + mulpd %xmm12, %xmm13 + movapd 48 * SIZE(AO), %xmm12 + addpd %xmm13, %xmm7 + movddup 48 * SIZE(BO), %xmm13 + + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm0 + movddup 25 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm1 + movddup 26 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm2 + movddup 27 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + movapd 26 * SIZE(AO), %xmm14 + addpd %xmm15, %xmm3 + + movddup 24 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm4 + movddup 25 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm5 + movddup 26 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm6 + movddup 27 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + movapd 28 * SIZE(AO), %xmm14 + addpd %xmm15, %xmm7 + + movddup 28 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm0 + movddup 29 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm1 + movddup 30 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm2 + movddup 31 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + movapd 30 * SIZE(AO), %xmm14 + addpd %xmm15, %xmm3 + + movddup 28 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm4 + movddup 29 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm5 + movddup 30 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + addpd %xmm15, %xmm6 + movddup 31 * SIZE(BO), %xmm15 + mulpd %xmm14, %xmm15 + movapd 56 * SIZE(AO), %xmm14 + addpd %xmm15, %xmm7 + movddup 56 * SIZE(BO), %xmm15 + + addq $32 * SIZE, BO + addq $32 * SIZE, AO + decq 
%rax + BRANCH + jne .L12 +#endif + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movsd ALPHA_R, %xmm15 + movhpd ALPHA_I, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + BRANCH + je .L19 + ALIGN_4 + +.L16: + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 0 * SIZE(BO), %xmm11 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm4 + movddup 1 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm5 + movddup 2 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm6 + movddup 3 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm7 + + addq $4 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + BRANCH + jg .L16 + ALIGN_4 + +.L19: + movsd 0 * SIZE(CO1), %xmm8 + movhpd 1 * SIZE(CO1), %xmm8 + movsd 2 * SIZE(CO1), %xmm9 + movhpd 3 * SIZE(CO1), %xmm9 + + movsd 4 * SIZE(CO1), %xmm10 + movhpd 5 * SIZE(CO1), %xmm10 + movsd 6 * SIZE(CO1), %xmm11 + movhpd 7 * SIZE(CO1), %xmm11 + + movddup %xmm0, %xmm12 + unpckhpd %xmm0, %xmm0 + movddup %xmm4, %xmm13 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm15, %xmm12 + mulpd %xmm15, %xmm0 + mulpd %xmm15, %xmm13 + mulpd %xmm15, %xmm4 + + addpd %xmm12, %xmm8 + addpd %xmm0, %xmm9 + addpd %xmm13, %xmm10 + addpd %xmm4, %xmm11 + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) + + movsd %xmm10, 4 * SIZE(CO1) + movhpd %xmm10, 5 * SIZE(CO1) + movsd %xmm11, 6 * SIZE(CO1) + movhpd %xmm11, 7 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm8 + movhpd 1 * SIZE(CO2), %xmm8 + movsd 2 * SIZE(CO2), %xmm9 + movhpd 3 * SIZE(CO2), %xmm9 + + movsd 4 * SIZE(CO2), %xmm10 + movhpd 5 * SIZE(CO2), %xmm10 + movsd 6 * SIZE(CO2), %xmm11 + movhpd 7 * SIZE(CO2), %xmm11 + + movddup %xmm1, %xmm12 + unpckhpd %xmm1, %xmm1 + movddup %xmm5, %xmm13 + unpckhpd %xmm5, %xmm5 + + mulpd %xmm15, %xmm12 + mulpd %xmm15, %xmm1 + mulpd %xmm15, %xmm13 + mulpd %xmm15, %xmm5 + + addpd %xmm12, %xmm8 + addpd %xmm1, %xmm9 + addpd %xmm13, %xmm10 + addpd %xmm5, %xmm11 + + movsd %xmm8, 0 * SIZE(CO2) + movhpd %xmm8, 1 * SIZE(CO2) + movsd %xmm9, 2 * SIZE(CO2) + movhpd %xmm9, 3 * SIZE(CO2) + + movsd %xmm10, 4 * SIZE(CO2) + movhpd %xmm10, 5 * SIZE(CO2) + movsd %xmm11, 6 * SIZE(CO2) + movhpd %xmm11, 7 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm8 + movhpd 1 * SIZE(CO1, LDC, 2), %xmm8 + movsd 2 * SIZE(CO1, LDC, 2), %xmm9 + movhpd 3 * SIZE(CO1, LDC, 2), %xmm9 + + movsd 4 * SIZE(CO1, LDC, 2), %xmm10 + movhpd 5 * SIZE(CO1, LDC, 2), %xmm10 + movsd 6 * SIZE(CO1, LDC, 2), %xmm11 + movhpd 7 * SIZE(CO1, LDC, 2), %xmm11 + + movddup %xmm2, %xmm12 + unpckhpd %xmm2, %xmm2 + movddup %xmm6, %xmm13 + unpckhpd %xmm6, %xmm6 + + mulpd %xmm15, %xmm12 + mulpd %xmm15, %xmm2 + mulpd %xmm15, %xmm13 + mulpd %xmm15, %xmm6 + + addpd %xmm12, %xmm8 + addpd %xmm2, %xmm9 + addpd %xmm13, %xmm10 + addpd %xmm6, %xmm11 + + movsd %xmm8, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm8, 1 * SIZE(CO1, LDC, 2) + movsd %xmm9, 2 * SIZE(CO1, LDC, 2) + movhpd %xmm9, 3 * SIZE(CO1, LDC, 2) + + movsd %xmm10, 4 * SIZE(CO1, LDC, 2) + movhpd %xmm10, 5 * SIZE(CO1, LDC, 2) + movsd %xmm11, 6 * SIZE(CO1, LDC, 2) + movhpd %xmm11, 7 * SIZE(CO1, LDC, 2) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm8 + movhpd 1 * SIZE(CO2, LDC, 2), %xmm8 + movsd 2 * SIZE(CO2, LDC, 
2), %xmm9 + movhpd 3 * SIZE(CO2, LDC, 2), %xmm9 + + movsd 4 * SIZE(CO2, LDC, 2), %xmm10 + movhpd 5 * SIZE(CO2, LDC, 2), %xmm10 + movsd 6 * SIZE(CO2, LDC, 2), %xmm11 + movhpd 7 * SIZE(CO2, LDC, 2), %xmm11 + + movddup %xmm3, %xmm12 + unpckhpd %xmm3, %xmm3 + movddup %xmm7, %xmm13 + unpckhpd %xmm7, %xmm7 + + mulpd %xmm15, %xmm12 + mulpd %xmm15, %xmm3 + mulpd %xmm15, %xmm13 + mulpd %xmm15, %xmm7 + + addpd %xmm12, %xmm8 + addpd %xmm3, %xmm9 + addpd %xmm13, %xmm10 + addpd %xmm7, %xmm11 + + movsd %xmm8, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm8, 1 * SIZE(CO2, LDC, 2) + movsd %xmm9, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm9, 3 * SIZE(CO2, LDC, 2) + + movsd %xmm10, 4 * SIZE(CO2, LDC, 2) + movhpd %xmm10, 5 * SIZE(CO2, LDC, 2) + movsd %xmm11, 6 * SIZE(CO2, LDC, 2) + movhpd %xmm11, 7 * SIZE(CO2, LDC, 2) + + addq $8 * SIZE, CO1 # coffset += 4 + addq $8 * SIZE, CO2 # coffset += 4 + + decq I # i -- + jg .L11 + jmp .L20 + ALIGN_4 + +.L20: + testq $2, M + BRANCH + je .L30 + ALIGN_4 + +.L21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L25 + ALIGN_4 + +.L22: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 16 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm2 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 6 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm2 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 16 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm3 + movddup 24 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movddup 17 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm1 + movddup 18 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm2 + movddup 19 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 10 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm3 + movddup 20 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movddup 21 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd 
%xmm9, %xmm1 + movddup 22 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm2 + movddup 23 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 12 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm3 + movddup 32 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 25 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm1 + movddup 26 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 27 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 14 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + movddup 28 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 29 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm1 + movddup 30 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 31 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 24 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + movddup 40 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movsd ALPHA_R, %xmm15 + movhpd ALPHA_I, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L29 + ALIGN_4 + +.L26: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L26 + ALIGN_4 + +.L29: + movsd 0 * SIZE(CO1), %xmm8 + movhpd 1 * SIZE(CO1), %xmm8 + movsd 2 * SIZE(CO1), %xmm9 + movhpd 3 * SIZE(CO1), %xmm9 + + movddup %xmm0, %xmm12 + unpckhpd %xmm0, %xmm0 + + mulpd %xmm15, %xmm12 + mulpd %xmm15, %xmm0 + addpd %xmm12, %xmm8 + addpd %xmm0, %xmm9 + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm8 + movhpd 1 * SIZE(CO2), %xmm8 + movsd 2 * SIZE(CO2), %xmm9 + movhpd 3 * SIZE(CO2), %xmm9 + + movddup %xmm1, %xmm12 + unpckhpd %xmm1, %xmm1 + + mulpd %xmm15, %xmm12 + mulpd %xmm15, %xmm1 + addpd %xmm12, %xmm8 + addpd %xmm1, %xmm9 + + movsd %xmm8, 0 * SIZE(CO2) + movhpd %xmm8, 1 * SIZE(CO2) + movsd %xmm9, 2 * SIZE(CO2) + movhpd %xmm9, 3 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm8 + movhpd 1 * SIZE(CO1, LDC, 2), %xmm8 + movsd 2 * SIZE(CO1, LDC, 2), %xmm9 + movhpd 3 * SIZE(CO1, LDC, 2), %xmm9 + + movddup %xmm2, %xmm12 + unpckhpd %xmm2, %xmm2 + + mulpd %xmm15, %xmm12 + mulpd %xmm15, %xmm2 + addpd %xmm12, %xmm8 + addpd %xmm2, %xmm9 + + movsd %xmm8, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm8, 1 * SIZE(CO1, LDC, 2) + movsd %xmm9, 2 * SIZE(CO1, LDC, 2) + movhpd %xmm9, 3 * SIZE(CO1, LDC, 2) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm8 + movhpd 1 * SIZE(CO2, LDC, 2), %xmm8 + movsd 2 * SIZE(CO2, LDC, 2), %xmm9 + movhpd 3 * SIZE(CO2, LDC, 2), %xmm9 + + movddup %xmm3, %xmm12 + unpckhpd %xmm3, %xmm3 + + mulpd %xmm15, %xmm12 + mulpd %xmm15, %xmm3 + addpd %xmm12, %xmm8 + addpd %xmm3, %xmm9 + + movsd %xmm8, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm8, 1 * SIZE(CO2, LDC, 2) + movsd %xmm9, 2 * SIZE(CO2, LDC, 2) + movhpd %xmm9, 3 * SIZE(CO2, LDC, 2) + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L30: + testq $1, M + je .L39 + ALIGN_4 + +.L31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + 
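/* Descriptive note for the 1x4 tile that follows: in the plain GEMM case and
   in the TRMM cases matched by the condition above, BO simply starts at B;
   in the remaining TRMM cases KK is scaled by SIZE and the pointers are
   advanced so that AO skips KK * 1 elements and BO starts at B + KK * 4
   elements, i.e. the tile begins KK iterations into the K loop. */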
movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#endif + + movddup 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movddup 4 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movapd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L35 + ALIGN_4 + +.L32: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm9, %xmm0 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 1 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movapd 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movapd 16 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movapd 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movddup 3 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movapd 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movapd 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movddup 8 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movapd 24 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movapd 18 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movddup 5 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movapd 20 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movapd 22 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movddup 6 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movapd 32 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movapd 26 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movddup 7 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movapd 28 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movapd 30 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movddup 12 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movapd 40 * SIZE(BO), %xmm11 + + addq $ 8 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movsd ALPHA_R, %xmm15 + movhpd ALPHA_I, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 1 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movapd 4 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L36 + ALIGN_4 + +.L38: + movsd 0 * SIZE(CO1), %xmm8 + movhpd 1 * SIZE(CO1), %xmm8 + + movddup %xmm0, %xmm12 + + mulpd %xmm15, %xmm12 + addpd %xmm12, %xmm8 + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm8 + movhpd 1 * SIZE(CO2), %xmm8 + + unpckhpd %xmm0, %xmm0 + + mulpd %xmm15, %xmm0 + addpd %xmm0, %xmm8 + + movsd %xmm8, 0 * SIZE(CO2) + movhpd %xmm8, 1 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm8 + movhpd 1 * SIZE(CO1, LDC, 2), %xmm8 + + movddup %xmm1, %xmm12 + + mulpd %xmm15, %xmm12 + addpd %xmm12, %xmm8 + + movsd %xmm8, 0 * SIZE(CO1, LDC, 2) + movhpd %xmm8, 1 * SIZE(CO1, LDC, 2) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm8 + movhpd 1 * SIZE(CO2, LDC, 2), %xmm8 + + unpckhpd %xmm1, %xmm1 + + mulpd %xmm15, %xmm1 + addpd %xmm1, %xmm8 + + movsd %xmm8, 0 * SIZE(CO2, LDC, 2) + movhpd %xmm8, 1 * SIZE(CO2, LDC, 2) + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && 
!defined(LEFT) + addl $4, KK +#endif + + leaq (C, LDC, 4), C # c += 4 * ldc + movq BO, B + decq J # j -- + jg .L10 + ALIGN_4 + +.L40: + testq $2, N + je .L80 + ALIGN_4 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm4, %xmm4 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm5, %xmm5 + +#ifdef HAVE_3DNOW + prefetchw 4 * SIZE(CO1) + prefetchw 4 * SIZE(CO2) +#else + prefetchnta 4 * SIZE(CO1) + prefetchnta 4 * SIZE(CO2) +#endif + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L55 + ALIGN_4 + +.L52: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm5 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 6 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 16 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm5 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 10 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm4 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 12 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm5 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm0 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 14 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm4 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 40 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm5 + movddup 16 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm11 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + addpd %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 18 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movddup 8 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm4 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 20 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm5 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 22 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm4 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 24 * SIZE(AO), %xmm8 + addpd %xmm11, 
%xmm5 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 26 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm4 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 28 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm5 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm0 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 30 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + addpd %xmm11, %xmm4 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 32 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm5 + movddup 24 * SIZE(BO), %xmm11 + + addq $32 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L52 + ALIGN_4 + +.L55: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movsd ALPHA_R, %xmm15 + movhpd ALPHA_I, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L59 + ALIGN_4 + +.L56: + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 0 * SIZE(BO), %xmm11 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm4 + movddup 1 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm5 + + addq $4 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L56 + ALIGN_4 + +.L59: + movsd 0 * SIZE(CO1), %xmm8 + movhpd 1 * SIZE(CO1), %xmm8 + movsd 2 * SIZE(CO1), %xmm9 + movhpd 3 * SIZE(CO1), %xmm9 + + movsd 4 * SIZE(CO1), %xmm10 + movhpd 5 * SIZE(CO1), %xmm10 + movsd 6 * SIZE(CO1), %xmm11 + movhpd 7 * SIZE(CO1), %xmm11 + + movddup %xmm0, %xmm12 + unpckhpd %xmm0, %xmm0 + movddup %xmm4, %xmm13 + unpckhpd %xmm4, %xmm4 + + mulpd %xmm15, %xmm12 + mulpd %xmm15, %xmm0 + mulpd %xmm15, %xmm13 + mulpd %xmm15, %xmm4 + + addpd %xmm12, %xmm8 + addpd %xmm0, %xmm9 + addpd %xmm13, %xmm10 + addpd %xmm4, %xmm11 + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) + + movsd %xmm10, 4 * SIZE(CO1) + movhpd %xmm10, 5 * SIZE(CO1) + movsd %xmm11, 6 * SIZE(CO1) + movhpd %xmm11, 7 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm8 + movhpd 1 * SIZE(CO2), %xmm8 + movsd 2 * SIZE(CO2), %xmm9 + movhpd 3 * SIZE(CO2), %xmm9 + + movsd 4 * SIZE(CO2), %xmm10 + movhpd 5 * SIZE(CO2), %xmm10 + movsd 6 * SIZE(CO2), %xmm11 + movhpd 7 * SIZE(CO2), %xmm11 + + movddup %xmm1, %xmm12 + unpckhpd %xmm1, %xmm1 + movddup %xmm5, %xmm13 + unpckhpd %xmm5, %xmm5 + + mulpd %xmm15, %xmm12 + mulpd %xmm15, %xmm1 + mulpd %xmm15, %xmm13 + mulpd %xmm15, %xmm5 + + addpd %xmm12, %xmm8 + addpd %xmm1, %xmm9 + addpd %xmm13, %xmm10 + addpd %xmm5, %xmm11 + + movsd %xmm8, 0 * SIZE(CO2) + movhpd %xmm8, 1 * SIZE(CO2) + movsd %xmm9, 2 * SIZE(CO2) + movhpd %xmm9, 3 * SIZE(CO2) + + movsd %xmm10, 4 * SIZE(CO2) + movhpd %xmm10, 5 * SIZE(CO2) + movsd %xmm11, 6 * SIZE(CO2) + movhpd %xmm11, 7 * SIZE(CO2) + + addq $8 * SIZE, CO1 # coffset += 4 + addq $8 * SIZE, CO2 # coffset += 4 + + decq I # i -- + jg .L51 + ALIGN_4 + +.L60: + testq $2, M + je .L70 + ALIGN_4 + +.L61: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * 
SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L65 + ALIGN_4 + +.L62: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 6 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 16 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 16 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 10 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 12 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 14 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + addpd %xmm11, %xmm2 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 24 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + movddup 24 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L62 + ALIGN_4 + +.L65: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movsd ALPHA_R, %xmm15 + movhpd ALPHA_I, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L69 + ALIGN_4 + +.L66: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L66 + ALIGN_4 + +.L69: + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + + movsd 0 * SIZE(CO1), %xmm8 + movhpd 1 * SIZE(CO1), %xmm8 + movsd 2 * SIZE(CO1), %xmm9 + movhpd 3 * SIZE(CO1), %xmm9 + + movddup %xmm0, %xmm12 + unpckhpd %xmm0, %xmm0 + + mulpd %xmm15, %xmm12 + mulpd %xmm15, %xmm0 + addpd %xmm12, %xmm8 + addpd %xmm0, %xmm9 + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm8 + movhpd 1 * SIZE(CO2), %xmm8 + movsd 2 * SIZE(CO2), %xmm9 + movhpd 3 * SIZE(CO2), %xmm9 + + movddup %xmm1, %xmm12 + unpckhpd %xmm1, %xmm1 + + mulpd %xmm15, %xmm12 + mulpd %xmm15, %xmm1 + addpd %xmm12, %xmm8 + addpd %xmm1, %xmm9 + + movsd %xmm8, 0 * SIZE(CO2) + movhpd %xmm8, 1 * SIZE(CO2) + movsd %xmm9, 2 * SIZE(CO2) + movhpd %xmm9, 3 * SIZE(CO2) + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + ALIGN_4 + +.L70: + testq $1, M + je .L79 + ALIGN_4 + +.L71: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && 
!defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + + movddup 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movddup 4 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movapd 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L75 + ALIGN_4 + +.L72: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movddup 1 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm0 + mulpd 2 * SIZE(BO), %xmm8 + movapd 16 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm1 + movddup 2 * SIZE(AO), %xmm8 + mulpd 4 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm2 + movddup 3 * SIZE(AO), %xmm8 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm3 + movddup 8 * SIZE(AO), %xmm8 + mulpd %xmm10, %xmm11 + movddup 5 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm0 + mulpd 10 * SIZE(BO), %xmm10 + movapd 24 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm1 + movddup 6 * SIZE(AO), %xmm10 + mulpd 12 * SIZE(BO), %xmm10 + addpd %xmm10, %xmm2 + movddup 7 * SIZE(AO), %xmm10 + mulpd 14 * SIZE(BO), %xmm10 + addpd %xmm10, %xmm3 + movddup 12 * SIZE(AO), %xmm10 + + addq $ 8 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L72 + ALIGN_4 + +.L75: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movsd ALPHA_R, %xmm15 + movhpd ALPHA_I, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulpd %xmm8, %xmm9 + movddup 1 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm0 + movapd 2 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L76 + ALIGN_4 + +.L78: + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + + movsd 0 * SIZE(CO1), %xmm8 + movhpd 1 * SIZE(CO1), %xmm8 + + movddup %xmm0, %xmm12 + mulpd %xmm15, %xmm12 + addpd %xmm12, %xmm8 + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm8 + movhpd 1 * SIZE(CO2), %xmm8 + + unpckhpd %xmm0, %xmm0 + + mulpd %xmm15, %xmm0 + addpd %xmm0, %xmm8 + + movsd %xmm8, 0 * SIZE(CO2) + movhpd %xmm8, 1 * SIZE(CO2) + ALIGN_4 + +.L79: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + leaq (C, LDC, 2), C + movq BO, B + ALIGN_4 + +.L80: + testq $1, N + je .L999 + ALIGN_4 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + movq A, AO + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L100 + ALIGN_4 + +.L91: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 4 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#ifdef HAVE_3DNOW + prefetchw 4 * SIZE(CO1) +#else + prefetchnta 4 * SIZE(CO1) +#endif + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq 
$3, %rax + je .L95 + ALIGN_4 + +.L92: + mulpd %xmm9, %xmm8 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd 2 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm0 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm8 + mulpd 6 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm2 + movapd 16 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm3 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm10 + mulpd 10 * SIZE(AO), %xmm9 + addpd %xmm10, %xmm0 + movapd 12 * SIZE(AO), %xmm10 + addpd %xmm9, %xmm1 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm9, %xmm10 + mulpd 14 * SIZE(AO), %xmm9 + addpd %xmm10, %xmm2 + movapd 24 * SIZE(AO), %xmm10 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + addpd %xmm9, %xmm3 + movddup 8 * SIZE(BO), %xmm9 + mulpd %xmm11, %xmm8 + mulpd 18 * SIZE(AO), %xmm11 + addpd %xmm8, %xmm0 + movapd 20 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm1 + movddup 5 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm8 + mulpd 22 * SIZE(AO), %xmm11 + addpd %xmm8, %xmm2 + movapd 32 * SIZE(AO), %xmm8 + addpd %xmm11, %xmm3 + movddup 6 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm10 + mulpd 26 * SIZE(AO), %xmm11 + addpd %xmm10, %xmm0 + movapd 28 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movddup 7 * SIZE(BO), %xmm11 + mulpd %xmm11, %xmm10 + mulpd 30 * SIZE(AO), %xmm11 + addpd %xmm10, %xmm2 + movapd 40 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + + addq $32 * SIZE, AO + addq $8 * SIZE, BO + decq %rax + jne .L92 + ALIGN_4 + +.L95: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movsd ALPHA_R, %xmm15 + movhpd ALPHA_I, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L99 + ALIGN_4 + +.L96: + mulpd %xmm9, %xmm8 + mulpd 2 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm0 + movapd 4 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 1 * SIZE(BO), %xmm9 + + addq $4 * SIZE, AO # aoffset += 4 + addq $1 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L96 + ALIGN_4 + +.L99: + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + + movsd 0 * SIZE(CO1), %xmm8 + movhpd 1 * SIZE(CO1), %xmm8 + movsd 2 * SIZE(CO1), %xmm9 + movhpd 3 * SIZE(CO1), %xmm9 + + movsd 4 * SIZE(CO1), %xmm10 + movhpd 5 * SIZE(CO1), %xmm10 + movsd 6 * SIZE(CO1), %xmm11 + movhpd 7 * SIZE(CO1), %xmm11 + + movddup %xmm0, %xmm12 + unpckhpd %xmm0, %xmm0 + movddup %xmm1, %xmm13 + unpckhpd %xmm1, %xmm1 + + mulpd %xmm15, %xmm12 + mulpd %xmm15, %xmm0 + mulpd %xmm15, %xmm13 + mulpd %xmm15, %xmm1 + + addpd %xmm12, %xmm8 + addpd %xmm0, %xmm9 + addpd %xmm13, %xmm10 + addpd %xmm1, %xmm11 + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) + + movsd %xmm10, 4 * SIZE(CO1) + movhpd %xmm10, 5 * SIZE(CO1) + movsd %xmm11, 6 * SIZE(CO1) + movhpd %xmm11, 7 * SIZE(CO1) + + addq $8 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L91 + ALIGN_4 + +.L100: + testq $2, M + je .L110 + ALIGN_4 + +.L101: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 4 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq 
$1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L105 + ALIGN_4 + +.L102: + mulpd %xmm9, %xmm8 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movddup 1 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm0 + mulpd 2 * SIZE(AO), %xmm9 + movapd 16 * SIZE(AO), %xmm8 + addpd %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd 4 * SIZE(AO), %xmm9 + addpd %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd 6 * SIZE(AO), %xmm9 + addpd %xmm9, %xmm3 + movddup 8 * SIZE(BO), %xmm9 + mulpd %xmm11, %xmm10 + movddup 5 * SIZE(BO), %xmm11 + addpd %xmm10, %xmm0 + mulpd 10 * SIZE(AO), %xmm11 + movapd 24 * SIZE(AO), %xmm10 + addpd %xmm11, %xmm1 + movddup 6 * SIZE(BO), %xmm11 + mulpd 12 * SIZE(AO), %xmm11 + addpd %xmm11, %xmm2 + movddup 7 * SIZE(BO), %xmm11 + mulpd 14 * SIZE(AO), %xmm11 + addpd %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $ 8 * SIZE, BO + decq %rax + jne .L102 + ALIGN_4 + +.L105: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movsd ALPHA_R, %xmm15 + movhpd ALPHA_I, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L109 + ALIGN_4 + +.L106: + mulpd %xmm9, %xmm8 + movddup 1 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm0 + movapd 2 * SIZE(AO), %xmm8 + + addq $2 * SIZE, AO # aoffset += 4 + addq $1 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L106 + ALIGN_4 + +.L109: + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + + movsd 0 * SIZE(CO1), %xmm8 + movhpd 1 * SIZE(CO1), %xmm8 + movsd 2 * SIZE(CO1), %xmm9 + movhpd 3 * SIZE(CO1), %xmm9 + + movddup %xmm0, %xmm12 + unpckhpd %xmm0, %xmm0 + + mulpd %xmm15, %xmm12 + mulpd %xmm15, %xmm0 + addpd %xmm12, %xmm8 + addpd %xmm0, %xmm9 + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) + + addq $4 * SIZE, CO1 + ALIGN_4 + +.L110: + testq $1, M + je .L999 + ALIGN_4 + +.L111: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + + movsd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movsd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movsd 4 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movsd 4 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + + movapd 0 * SIZE(AO), %xmm9 + movapd 0 * SIZE(BO), %xmm8 + movapd 4 * SIZE(AO), %xmm11 + movapd 4 * SIZE(BO), %xmm10 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L115 + ALIGN_4 + +.L112: + mulpd %xmm9, %xmm8 + movapd 2 * SIZE(AO), %xmm9 + addpd %xmm8, %xmm0 + mulpd 2 * SIZE(BO), %xmm9 + movapd 8 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm1 + movapd 8 * SIZE(AO), %xmm9 + mulpd %xmm11, %xmm10 + movapd 6 * SIZE(AO), %xmm11 + addpd %xmm10, %xmm0 + mulpd 6 * SIZE(BO), %xmm11 + movapd 12 * SIZE(BO), %xmm10 + addpd %xmm11, %xmm1 + movapd 12 * SIZE(AO), %xmm11 + + addq $8 * SIZE, AO + addq $8 * SIZE, BO + decq %rax + jne .L112 + ALIGN_4 + +.L115: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movsd ALPHA_R, %xmm15 + movhpd ALPHA_I, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulsd 0 * SIZE(BO), %xmm9 + addsd %xmm9, %xmm0 + movsd 1 * SIZE(AO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $1 * SIZE, BO # 
boffset1 += 8 + decq %rax + jg .L116 + ALIGN_4 + +.L118: + addpd %xmm1, %xmm0 + haddpd %xmm0, %xmm0 + + movsd 0 * SIZE(CO1), %xmm8 + movhpd 1 * SIZE(CO1), %xmm8 + + movddup %xmm0, %xmm12 + + mulpd %xmm15, %xmm12 + addpd %xmm12, %xmm8 + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm3m_kernel_4x8_nehalem.S b/kernel/x86_64/zgemm3m_kernel_4x8_nehalem.S new file mode 100644 index 0000000000..92be8fc25c --- /dev/null +++ b/kernel/x86_64/zgemm3m_kernel_4x8_nehalem.S @@ -0,0 +1,2472 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define BB %r12 + +#define PREA %rdx + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define J 64(%rsp) +#define OFFSET 72(%rsp) +#define KK 80(%rsp) +#define KKK 88(%rsp) + +#else + +#define STACKSIZE 512 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#define ALPHA_R 224(%rsp) +#define ALPHA_I 232(%rsp) +#define J 240(%rsp) +#define OFFSET 248(%rsp) +#define KK 256(%rsp) +#define KKK 264(%rsp) + +#endif + +#define PREFETCHSIZE (16 * 1 - 8) +#define PREFETCH prefetcht0 + +#if defined(OS_LINUX) && defined(CORE_BARCELONA) + .align 32768 +#endif + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movq OLD_OFFSET, %r11 +#endif + movaps %xmm3, %xmm0 + movss OLD_ALPHA_I, %xmm1 +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movq OLD_OFFSET, %r11 +#endif + +#endif + + unpcklps %xmm1, %xmm0 + + movlps %xmm0, ALPHA_R + movlps %xmm0, ALPHA_I + + subq $-32 * SIZE, A + subq $-32 * SIZE, B + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + salq $ZBASE_SHIFT, LDC + + movq N, J + sarq $3, J + NOBRANCH + jle .L40 + ALIGN_4 + +.L10: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + leaq (C, LDC, 4), CO2 + movq A, AO + + movq K, %rax + salq $BASE_SHIFT + 3, %rax + leaq (B, %rax), BB + + movq M, I + sarq $2, I + NOBRANCH + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + + prefetcht0 -32 * SIZE(BB) + subq $-16 * SIZE, BB + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + leaq (LDC, LDC, 2), %rax + + xorps %xmm8, %xmm8 + prefetcht0 3 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht0 7 * SIZE(CO1, LDC, 1) + xorps %xmm10, %xmm10 + prefetcht0 3 * SIZE(CO1, LDC, 2) + xorps %xmm11, %xmm11 + prefetcht0 7 * SIZE(CO1, %rax, 1) + + xorps %xmm12, %xmm12 + prefetcht0 3 * SIZE(CO2) + xorps %xmm13, %xmm13 + prefetcht0 7 * SIZE(CO2, LDC, 1) + xorps %xmm14, %xmm14 + prefetcht0 3 * SIZE(CO2, LDC, 2) + xorps %xmm15, %xmm15 + prefetcht0 7 * 
SIZE(CO2, %rax, 1) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $8, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm1, %xmm12 + movaps -32 * SIZE(BO), %xmm1 + addps %xmm2, %xmm13 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + pshufd $0x39, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + movaps -28 * SIZE(AO), %xmm7 + mulps %xmm0, %xmm4 + + addps %xmm1, %xmm12 + movaps -24 * SIZE(BO), %xmm1 + addps %xmm2, %xmm13 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm7, %xmm1 + pshufd $0x39, %xmm2, %xmm5 + mulps %xmm7, %xmm2 + + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + pshufd $0x39, %xmm5, %xmm6 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm6 + + addps %xmm1, %xmm8 + movaps -20 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm7, %xmm1 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm7, %xmm2 + + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + mulps %xmm7, %xmm3 + mulps %xmm7, %xmm4 + + addps %xmm1, %xmm12 + movaps -16 * SIZE(BO), %xmm1 + addps %xmm2, %xmm13 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + pshufd $0x39, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + addps %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -20 * SIZE(AO), %xmm7 + + addps %xmm1, %xmm12 + movaps -8 * SIZE(BO), %xmm1 + addps %xmm2, %xmm13 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm7, %xmm1 + pshufd $0x39, %xmm2, %xmm5 + mulps %xmm7, %xmm2 + + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + pshufd $0x39, %xmm5, %xmm6 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm6 + + addps %xmm1, %xmm8 + movaps -4 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + subq $-32 * SIZE, BO + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm7, %xmm1 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm7, %xmm2 + + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm7, %xmm3 + movaps -16 * SIZE(AO), %xmm0 + mulps %xmm7, %xmm4 + + subq $-16 * SIZE, AO + decq %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: + movups ALPHA_R, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + addps %xmm1, %xmm12 + movaps -32 * SIZE(BO), %xmm1 + addps %xmm2, %xmm13 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + pshufd $0x39, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x39, 
%xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_3 + +.L18: + addps %xmm1, %xmm12 + addps %xmm2, %xmm13 + addps %xmm3, %xmm14 + addps %xmm4, %xmm15 + + movaps %xmm9, %xmm4 + shufps $0xd8, %xmm8, %xmm9 + shufps $0xd8, %xmm11, %xmm8 + shufps $0xd8, %xmm10, %xmm11 + shufps $0xd8, %xmm4, %xmm10 + + movaps %xmm8, %xmm4 + shufps $0xd8, %xmm10, %xmm8 + shufps $0xd8, %xmm4, %xmm10 + movaps %xmm9, %xmm5 + shufps $0xd8, %xmm11, %xmm9 + shufps $0xd8, %xmm5, %xmm11 + + movaps %xmm13, %xmm4 + shufps $0xd8, %xmm12, %xmm13 + shufps $0xd8, %xmm15, %xmm12 + shufps $0xd8, %xmm14, %xmm15 + shufps $0xd8, %xmm4, %xmm14 + + movaps %xmm12, %xmm4 + shufps $0xd8, %xmm14, %xmm12 + shufps $0xd8, %xmm4, %xmm14 + movaps %xmm13, %xmm5 + shufps $0xd8, %xmm15, %xmm13 + shufps $0xd8, %xmm5, %xmm15 + + leaq (LDC, LDC, 2), %rax + + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm1 + movhps 6 * SIZE(CO1), %xmm1 + + movsd 0 * SIZE(CO1, LDC), %xmm2 + movhps 2 * SIZE(CO1, LDC), %xmm2 + movsd 4 * SIZE(CO1, LDC), %xmm3 + movhps 6 * SIZE(CO1, LDC), %xmm3 + + pshufd $0x50, %xmm8, %xmm4 + pshufd $0xfa, %xmm8, %xmm8 + pshufd $0x50, %xmm9, %xmm5 + pshufd $0xfa, %xmm9, %xmm9 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm9 + + addps %xmm4, %xmm0 + addps %xmm8, %xmm1 + addps %xmm5, %xmm2 + addps %xmm9, %xmm3 + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm1, 4 * SIZE(CO1) + movhps %xmm1, 6 * SIZE(CO1) + + movlps %xmm2, 0 * SIZE(CO1, LDC) + movhps %xmm2, 2 * SIZE(CO1, LDC) + movlps %xmm3, 4 * SIZE(CO1, LDC) + movhps %xmm3, 6 * SIZE(CO1, LDC) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm0 + movhps 2 * SIZE(CO1, LDC, 2), %xmm0 + movsd 4 * SIZE(CO1, LDC, 2), %xmm1 + movhps 6 * SIZE(CO1, LDC, 2), %xmm1 + + movsd 0 * SIZE(CO1, %rax), %xmm2 + movhps 2 * SIZE(CO1, %rax), %xmm2 + movsd 4 * SIZE(CO1, %rax), %xmm3 + movhps 6 * SIZE(CO1, %rax), %xmm3 + + pshufd $0x50, %xmm10, %xmm4 + pshufd $0xfa, %xmm10, %xmm10 + pshufd $0x50, %xmm11, %xmm5 + pshufd $0xfa, %xmm11, %xmm11 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm10 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm11 + + addps %xmm4, %xmm0 + addps %xmm10, %xmm1 + addps %xmm5, %xmm2 + addps %xmm11, %xmm3 + + movlps %xmm0, 0 * SIZE(CO1, LDC, 2) + movhps %xmm0, 2 * SIZE(CO1, LDC, 2) + movlps %xmm1, 4 * SIZE(CO1, LDC, 2) + movhps %xmm1, 6 * SIZE(CO1, LDC, 2) + + movlps %xmm2, 0 * SIZE(CO1, %rax) + movhps %xmm2, 2 * SIZE(CO1, %rax) + movlps %xmm3, 4 * SIZE(CO1, %rax) + movhps %xmm3, 6 * SIZE(CO1, %rax) + + movsd 0 * SIZE(CO2), %xmm0 + movhps 2 * SIZE(CO2), %xmm0 + movsd 4 * SIZE(CO2), %xmm1 + movhps 6 * SIZE(CO2), %xmm1 + + movsd 0 * SIZE(CO2, LDC), %xmm2 + movhps 2 * SIZE(CO2, LDC), %xmm2 + movsd 4 * SIZE(CO2, LDC), %xmm3 + movhps 6 * SIZE(CO2, LDC), %xmm3 + + pshufd $0x50, %xmm12, %xmm4 + pshufd $0xfa, %xmm12, %xmm12 + pshufd $0x50, %xmm13, %xmm5 + pshufd $0xfa, %xmm13, %xmm13 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm12 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm13 + + addps %xmm4, %xmm0 + addps %xmm12, %xmm1 + addps %xmm5, %xmm2 + addps %xmm13, %xmm3 + + movlps %xmm0, 0 * SIZE(CO2) + movhps %xmm0, 2 * SIZE(CO2) + movlps %xmm1, 4 * SIZE(CO2) + movhps %xmm1, 6 * SIZE(CO2) + + movlps %xmm2, 0 * SIZE(CO2, LDC) + movhps %xmm2, 2 * SIZE(CO2, LDC) + movlps %xmm3, 4 * SIZE(CO2, LDC) + movhps %xmm3, 6 * 
SIZE(CO2, LDC) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm0 + movhps 2 * SIZE(CO2, LDC, 2), %xmm0 + movsd 4 * SIZE(CO2, LDC, 2), %xmm1 + movhps 6 * SIZE(CO2, LDC, 2), %xmm1 + + movsd 0 * SIZE(CO2, %rax), %xmm2 + movhps 2 * SIZE(CO2, %rax), %xmm2 + movsd 4 * SIZE(CO2, %rax), %xmm3 + movhps 6 * SIZE(CO2, %rax), %xmm3 + + pshufd $0x50, %xmm14, %xmm4 + pshufd $0xfa, %xmm14, %xmm14 + pshufd $0x50, %xmm15, %xmm5 + pshufd $0xfa, %xmm15, %xmm15 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm14 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm15 + + addps %xmm4, %xmm0 + addps %xmm14, %xmm1 + addps %xmm5, %xmm2 + addps %xmm15, %xmm3 + + movlps %xmm0, 0 * SIZE(CO2, LDC, 2) + movhps %xmm0, 2 * SIZE(CO2, LDC, 2) + movlps %xmm1, 4 * SIZE(CO2, LDC, 2) + movhps %xmm1, 6 * SIZE(CO2, LDC, 2) + + movlps %xmm2, 0 * SIZE(CO2, %rax) + movhps %xmm2, 2 * SIZE(CO2, %rax) + movlps %xmm3, 4 * SIZE(CO2, %rax) + movhps %xmm3, 6 * SIZE(CO2, %rax) + + addq $8 * SIZE, CO1 + addq $8 * SIZE, CO2 + decq I + BRANCH + jg .L11 + ALIGN_4 + +.L20: + testq $2, M + BRANCH + jle .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movaps -32 * SIZE(BO), %xmm5 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $8, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_3 + +.L22: + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + + addps %xmm3, %xmm10 + pshufd $0x50, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm5, %xmm4 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(BO), %xmm5 + + movddup -30 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(BO), %xmm5 + + addps %xmm3, %xmm10 + pshufd $0x50, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm5, %xmm4 + mulps %xmm0, %xmm4 + movaps -16 * SIZE(BO), %xmm5 + + movddup -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + mulps %xmm0, %xmm2 + movaps -12 * SIZE(BO), %xmm5 + + addps %xmm3, %xmm10 + pshufd $0x50, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm5, %xmm4 + mulps %xmm0, %xmm4 + movaps -8 * SIZE(BO), %xmm5 + + movddup -26 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + mulps %xmm0, %xmm2 + movaps -4 * SIZE(BO), %xmm5 + + addps %xmm3, %xmm10 + pshufd $0x50, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm5, %xmm4 + mulps %xmm0, %xmm4 + movaps 0 * SIZE(BO), %xmm5 + + movddup -24 * SIZE(AO), %xmm0 + + subq $-32 * SIZE, BO + subq $ -8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L22 + ALIGN_3 + +.L25: + 
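The loop at .L22 above and the remainder loop at .L26 just below follow the K-loop split used throughout these kernels: a body unrolled four times (sarq $2) plus a scalar tail for the leftover k & 3 iterations, run before the accumulators are scaled by alpha. A rough C equivalent of that split, reduced to a single accumulator; the function name and the 1x1 micro-tile are illustrative only and not part of the patch:

/* Illustrative only: unrolled main K loop plus scalar remainder,
   mirroring the sarq $2 / andq $3 structure of the kernels above. */
static float kloop_split(const float *a, const float *b, long k) {
  float c0 = 0.0f, c1 = 0.0f, c2 = 0.0f, c3 = 0.0f;
  long i;
  for (i = 0; i < (k >> 2); i++) {      /* main body: four updates per pass */
    c0 += a[4 * i + 0] * b[4 * i + 0];
    c1 += a[4 * i + 1] * b[4 * i + 1];
    c2 += a[4 * i + 2] * b[4 * i + 2];
    c3 += a[4 * i + 3] * b[4 * i + 3];
  }
  for (i = k & ~3L; i < k; i++)         /* remainder: the k & 3 leftover steps */
    c0 += a[i] * b[i];
  return c0 + c1 + c2 + c3;
}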
movups ALPHA_R, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_3 + +.L26: + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + + addps %xmm3, %xmm10 + pshufd $0x50, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm5, %xmm4 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(BO), %xmm5 + + movddup -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_3 + +.L28: + addps %xmm1, %xmm8 + addps %xmm2, %xmm9 + addps %xmm3, %xmm10 + addps %xmm4, %xmm11 + + leaq (LDC, LDC, 2), %rax + + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO1, LDC), %xmm1 + movhps 2 * SIZE(CO1, LDC), %xmm1 + + movsd 0 * SIZE(CO1, LDC, 2), %xmm2 + movhps 2 * SIZE(CO1, LDC, 2), %xmm2 + movsd 0 * SIZE(CO1, %rax), %xmm3 + movhps 2 * SIZE(CO1, %rax), %xmm3 + + pshufd $0x50, %xmm8, %xmm4 + pshufd $0xfa, %xmm8, %xmm8 + pshufd $0x50, %xmm9, %xmm5 + pshufd $0xfa, %xmm9, %xmm9 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm9 + + addps %xmm4, %xmm0 + addps %xmm8, %xmm1 + addps %xmm5, %xmm2 + addps %xmm9, %xmm3 + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC) + movhps %xmm1, 2 * SIZE(CO1, LDC) + + movlps %xmm2, 0 * SIZE(CO1, LDC, 2) + movhps %xmm2, 2 * SIZE(CO1, LDC, 2) + movlps %xmm3, 0 * SIZE(CO1, %rax) + movhps %xmm3, 2 * SIZE(CO1, %rax) + + movsd 0 * SIZE(CO2), %xmm0 + movhps 2 * SIZE(CO2), %xmm0 + movsd 0 * SIZE(CO2, LDC), %xmm1 + movhps 2 * SIZE(CO2, LDC), %xmm1 + + movsd 0 * SIZE(CO2, LDC, 2), %xmm2 + movhps 2 * SIZE(CO2, LDC, 2), %xmm2 + movsd 0 * SIZE(CO2, %rax), %xmm3 + movhps 2 * SIZE(CO2, %rax), %xmm3 + + pshufd $0x50, %xmm10, %xmm4 + pshufd $0xfa, %xmm10, %xmm10 + pshufd $0x50, %xmm11, %xmm5 + pshufd $0xfa, %xmm11, %xmm11 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm10 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm11 + + addps %xmm4, %xmm0 + addps %xmm10, %xmm1 + addps %xmm5, %xmm2 + addps %xmm11, %xmm3 + + movlps %xmm0, 0 * SIZE(CO2) + movhps %xmm0, 2 * SIZE(CO2) + movlps %xmm1, 0 * SIZE(CO2, LDC) + movhps %xmm1, 2 * SIZE(CO2, LDC) + + movlps %xmm2, 0 * SIZE(CO2, LDC, 2) + movhps %xmm2, 2 * SIZE(CO2, LDC, 2) + movlps %xmm3, 0 * SIZE(CO2, %rax) + movhps %xmm3, 2 * SIZE(CO2, %rax) + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + ALIGN_4 + +.L30: + testq $1, M + BRANCH + jle .L39 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + + xorps %xmm2, %xmm2 + movsd -32 * SIZE(AO), %xmm0 + xorps %xmm3, %xmm3 + xorps %xmm8, %xmm8 + xorps %xmm12, %xmm12 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $8, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L35 + ALIGN_3 + +.L32: + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movaps -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm3, %xmm12 + movaps -28 * SIZE(BO), %xmm3 + mulps %xmm1, %xmm3 + + pshufd $0x55, %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + 
addps %xmm2, %xmm8 + movaps -24 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm3, %xmm12 + movaps -20 * SIZE(BO), %xmm3 + mulps %xmm1, %xmm3 + + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movaps -16 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm3, %xmm12 + movaps -12 * SIZE(BO), %xmm3 + mulps %xmm1, %xmm3 + + pshufd $0x55, %xmm0, %xmm1 + movsd -28 * SIZE(AO), %xmm0 + addps %xmm2, %xmm8 + movaps -8 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm3, %xmm12 + movaps -4 * SIZE(BO), %xmm3 + mulps %xmm1, %xmm3 + + subq $-32 * SIZE, BO + subq $ -4 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L32 + ALIGN_3 + +.L35: + movups ALPHA_R, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_3 + +.L36: + pshufd $0x00, %xmm0, %xmm1 + movss -31 * SIZE(AO), %xmm0 + addps %xmm2, %xmm8 + movaps -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + addps %xmm3, %xmm12 + movaps -28 * SIZE(BO), %xmm3 + mulps %xmm1, %xmm3 + + addq $1 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L36 + ALIGN_3 + +.L38: + addps %xmm2, %xmm8 + addps %xmm3, %xmm12 + + leaq (LDC, LDC, 2), %rax + + movsd (CO1), %xmm0 + movhps (CO1, LDC), %xmm0 + movsd (CO1, LDC, 2), %xmm1 + movhps (CO1, %rax), %xmm1 + + movsd (CO2), %xmm2 + movhps (CO2, LDC), %xmm2 + movsd (CO2, LDC, 2), %xmm3 + movhps (CO2, %rax), %xmm3 + + pshufd $0x50, %xmm8, %xmm4 + pshufd $0xfa, %xmm8, %xmm8 + pshufd $0x50, %xmm12, %xmm5 + pshufd $0xfa, %xmm12, %xmm12 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm12 + + addps %xmm4, %xmm0 + addps %xmm8, %xmm1 + addps %xmm5, %xmm2 + addps %xmm12, %xmm3 + + movlps %xmm0, (CO1) + movhps %xmm0, (CO1, LDC) + movlps %xmm1, (CO1, LDC, 2) + movhps %xmm1, (CO1, %rax) + + movlps %xmm2, (CO2) + movhps %xmm2, (CO2, LDC) + movlps %xmm3, (CO2, LDC, 2) + movhps %xmm3, (CO2, %rax) + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $8, KK +#endif + + movq BO, B + + leaq (C, LDC, 8), C + + subq $1, J + BRANCH + jg .L10 + ALIGN_4 + +.L40: + testq $4, N + jle .L70 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + leaq (C, LDC, 2), CO2 + movq A, AO + + movq M, I + sarq $2, I + NOBRANCH + jle .L50 + ALIGN_4 + +.L41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht2 4 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht2 4 * SIZE(CO1, LDC, 1) + xorps %xmm10, %xmm10 + prefetcht2 4 * SIZE(CO2) + xorps %xmm11, %xmm11 + prefetcht2 4 * SIZE(CO2, LDC, 1) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L45 + ALIGN_3 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm1, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm10 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm4, %xmm11 + pshufd $0x39, %xmm3, 
%xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm10 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm4, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movaps -24 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm10 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm4, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -20 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movaps -20 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm10 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm4, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + BRANCH + jg .L42 + ALIGN_3 + +.L45: + movups ALPHA_R, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + addps %xmm1, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + addps %xmm2, %xmm9 + pshufd $0x39, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + addps %xmm3, %xmm10 + pshufd $0x39, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + addps %xmm4, %xmm11 + pshufd $0x39, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L46 + ALIGN_3 + +.L48: + addps %xmm1, %xmm8 + addps %xmm2, %xmm9 + addps %xmm3, %xmm10 + addps %xmm4, %xmm11 + + movaps %xmm9, %xmm4 + shufps $0xd8, %xmm8, %xmm9 + shufps $0xd8, %xmm11, %xmm8 + shufps $0xd8, %xmm10, %xmm11 + shufps $0xd8, %xmm4, %xmm10 + + movaps %xmm8, %xmm4 + shufps $0xd8, %xmm10, %xmm8 + shufps $0xd8, %xmm4, %xmm10 + movaps %xmm9, %xmm5 + shufps $0xd8, %xmm11, %xmm9 + shufps $0xd8, %xmm5, %xmm11 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm1 + movhps 6 * SIZE(CO1), %xmm1 + + movsd 0 * SIZE(CO1, LDC), %xmm2 + movhps 2 * SIZE(CO1, LDC), %xmm2 + movsd 4 * SIZE(CO1, LDC), %xmm3 + movhps 6 * SIZE(CO1, LDC), %xmm3 + + pshufd $0x50, %xmm8, %xmm4 + pshufd $0xfa, %xmm8, %xmm8 + pshufd $0x50, %xmm9, %xmm5 + pshufd $0xfa, %xmm9, %xmm9 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm9 + + addps %xmm4, %xmm0 + addps %xmm8, %xmm1 + addps %xmm5, %xmm2 + addps %xmm9, %xmm3 + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm1, 4 * SIZE(CO1) + movhps %xmm1, 6 * SIZE(CO1) + + movlps %xmm2, 0 * SIZE(CO1, LDC) + movhps %xmm2, 2 * SIZE(CO1, LDC) + movlps %xmm3, 4 * SIZE(CO1, LDC) + movhps %xmm3, 6 * SIZE(CO1, LDC) + + movsd 0 * SIZE(CO2), %xmm0 + movhps 2 * SIZE(CO2), %xmm0 + movsd 4 * SIZE(CO2), %xmm1 + movhps 6 * SIZE(CO2), %xmm1 + + movsd 0 * SIZE(CO2, LDC), %xmm2 + movhps 2 * SIZE(CO2, LDC), %xmm2 + movsd 4 * SIZE(CO2, LDC), %xmm3 + movhps 6 * SIZE(CO2, LDC), %xmm3 + + pshufd $0x50, %xmm10, %xmm4 + pshufd $0xfa, %xmm10, %xmm10 + pshufd $0x50, %xmm11, %xmm5 + pshufd $0xfa, %xmm11, %xmm11 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm10 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm11 + + addps %xmm4, %xmm0 + addps %xmm10, %xmm1 + addps %xmm5, %xmm2 + addps %xmm11, %xmm3 + + movlps %xmm0, 0 * SIZE(CO2) + movhps %xmm0, 2 * 
SIZE(CO2) + movlps %xmm1, 4 * SIZE(CO2) + movhps %xmm1, 6 * SIZE(CO2) + + movlps %xmm2, 0 * SIZE(CO2, LDC) + movhps %xmm2, 2 * SIZE(CO2, LDC) + movlps %xmm3, 4 * SIZE(CO2, LDC) + movhps %xmm3, 6 * SIZE(CO2, LDC) + + addq $8 * SIZE, CO1 + addq $8 * SIZE, CO2 + decq I + BRANCH + jg .L41 + ALIGN_4 + +.L50: + testq $2, M + BRANCH + jle .L60 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movaps -32 * SIZE(BO), %xmm5 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L55 + ALIGN_3 + +.L52: + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -30 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + movaps -24 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + movaps -20 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -26 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + movaps -16 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -24 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, BO + subq $ -8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L52 + ALIGN_3 + +.L55: + movups ALPHA_R, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_3 + +.L56: + addps %xmm1, %xmm8 + pshufd $0x50, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0xfa, %xmm5, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L56 + ALIGN_3 + +.L58: + addps %xmm1, %xmm8 + addps %xmm2, %xmm9 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO1, LDC), %xmm1 + movhps 2 * SIZE(CO1, LDC), %xmm1 + + movsd 0 * SIZE(CO2), %xmm2 + movhps 2 * SIZE(CO2), %xmm2 + movsd 0 * SIZE(CO2, LDC), %xmm3 + movhps 2 * SIZE(CO2, LDC), %xmm3 + + pshufd $0x50, %xmm8, %xmm4 + pshufd $0xfa, %xmm8, %xmm8 + pshufd $0x50, %xmm9, %xmm5 + pshufd $0xfa, %xmm9, %xmm9 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm9 + + addps %xmm4, %xmm0 + addps %xmm8, %xmm1 + addps %xmm5, %xmm2 + addps %xmm9, %xmm3 + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO1, LDC) + movhps %xmm1, 2 * SIZE(CO1, LDC) + + movlps %xmm2, 0 * SIZE(CO2) + movhps %xmm2, 2 * SIZE(CO2) + movlps %xmm3, 0 * SIZE(CO2, LDC) + movhps %xmm3, 2 * SIZE(CO2, LDC) + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + ALIGN_4 + +.L60: + testq $1, M + BRANCH + jle .L69 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && 
defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + xorps %xmm2, %xmm2 + movsd -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L65 + ALIGN_3 + +.L62: + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movaps -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x55, %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm2, %xmm9 + movaps -28 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movaps -24 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x55, %xmm0, %xmm1 + movsd -28 * SIZE(AO), %xmm0 + addps %xmm2, %xmm9 + movaps -20 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + subq $-16 * SIZE, BO + subq $ -4 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L62 + addps %xmm9, %xmm8 + ALIGN_3 + +.L65: + movups ALPHA_R, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_3 + +.L66: + pshufd $0x00, %xmm0, %xmm1 + movss -31 * SIZE(AO), %xmm0 + addps %xmm2, %xmm8 + movaps -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L66 + ALIGN_3 + +.L68: + addps %xmm2, %xmm8 + + movsd (CO1), %xmm0 + movhps (CO1, LDC), %xmm0 + movsd (CO2), %xmm1 + movhps (CO2, LDC), %xmm1 + + pshufd $0x50, %xmm8, %xmm4 + pshufd $0xfa, %xmm8, %xmm8 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm8 + + addps %xmm4, %xmm0 + addps %xmm8, %xmm1 + + movlps %xmm0, (CO1) + movhps %xmm0, (CO1, LDC) + movlps %xmm1, (CO2) + movhps %xmm1, (CO2, LDC) + ALIGN_4 + +.L69: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK +#endif + + movq BO, B + + leaq (C, LDC, 4), C + ALIGN_4 + +.L70: + testq $2, N + jle .L100 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + leaq (C, LDC), CO2 + movq A, AO + + movq M, I + sarq $2, I + NOBRANCH + jle .L80 + ALIGN_4 + +.L71: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd -32 * SIZE(BO), %xmm3 + + xorps %xmm8, %xmm8 + prefetcht2 4 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht2 4 * SIZE(CO2) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L75 + ALIGN_3 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0x55, %xmm3, %xmm2 + movsd -30 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 
+ pshufd $0x55, %xmm3, %xmm2 + movsd -28 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm2 + movaps -24 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0x55, %xmm3, %xmm2 + movsd -26 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0x55, %xmm3, %xmm2 + movsd -24 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm2 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, AO + subq $ -8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L72 + ALIGN_3 + +.L75: + movups ALPHA_R, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_3 + +.L76: + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + mulps %xmm0, %xmm1 + addps %xmm2, %xmm9 + pshufd $0x55, %xmm3, %xmm2 + movsd -30 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L76 + ALIGN_3 + +.L78: + addps %xmm1, %xmm8 + addps %xmm2, %xmm9 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm1 + movhps 6 * SIZE(CO1), %xmm1 + + movsd 0 * SIZE(CO2), %xmm2 + movhps 2 * SIZE(CO2), %xmm2 + movsd 4 * SIZE(CO2), %xmm3 + movhps 6 * SIZE(CO2), %xmm3 + + pshufd $0x50, %xmm8, %xmm4 + pshufd $0xfa, %xmm8, %xmm8 + pshufd $0x50, %xmm9, %xmm5 + pshufd $0xfa, %xmm9, %xmm9 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm9 + + addps %xmm4, %xmm0 + addps %xmm8, %xmm1 + addps %xmm5, %xmm2 + addps %xmm9, %xmm3 + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm1, 4 * SIZE(CO1) + movhps %xmm1, 6 * SIZE(CO1) + + movlps %xmm2, 0 * SIZE(CO2) + movhps %xmm2, 2 * SIZE(CO2) + movlps %xmm3, 4 * SIZE(CO2) + movhps %xmm3, 6 * SIZE(CO2) + + addq $8 * SIZE, CO1 + addq $8 * SIZE, CO2 + decq I + BRANCH + jg .L71 + ALIGN_4 + +.L80: + testq $2, M + BRANCH + jle .L90 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd -32 * SIZE(BO), %xmm5 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L85 + ALIGN_3 + +.L82: + addps %xmm1, %xmm8 + movsd -32 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movddup -30 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movsd -30 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movddup -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movsd -28 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movddup -26 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movsd -26 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movddup -24 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, BO + subq $-8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L82 + ALIGN_3 + +.L85: + movups ALPHA_R, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + 
BRANCH + je .L88 + ALIGN_3 + +.L86: + addps %xmm1, %xmm8 + movsd -32 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movddup -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L86 + ALIGN_3 + +.L88: + addps %xmm1, %xmm8 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm1 + movhps 2 * SIZE(CO2), %xmm1 + + pshufd $0x50, %xmm8, %xmm4 + pshufd $0xfa, %xmm8, %xmm8 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm8 + + addps %xmm4, %xmm0 + addps %xmm8, %xmm1 + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm1, 0 * SIZE(CO2) + movhps %xmm1, 2 * SIZE(CO2) + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + ALIGN_4 + +.L90: + testq $1, M + BRANCH + jle .L99 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + + xorps %xmm2, %xmm2 + movsd -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L95 + ALIGN_3 + +.L92: + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movsd -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x55, %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm2, %xmm9 + movsd -30 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x00, %xmm0, %xmm1 + addps %xmm2, %xmm8 + movsd -28 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + pshufd $0x55, %xmm0, %xmm1 + movsd -28 * SIZE(AO), %xmm0 + addps %xmm2, %xmm9 + movsd -26 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + subq $-4 * SIZE, AO + subq $-8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L92 + addps %xmm9, %xmm8 + ALIGN_3 + +.L95: + movups ALPHA_R, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L98 + ALIGN_3 + +.L96: + pshufd $0x00, %xmm0, %xmm1 + movss -31 * SIZE(AO), %xmm0 + addps %xmm2, %xmm8 + movsd -32 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm2 + + addq $1 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L96 + ALIGN_3 + +.L98: + addps %xmm2, %xmm8 + + movsd (CO1), %xmm0 + movhps (CO2), %xmm0 + + pshufd $0x50, %xmm8, %xmm4 + + mulps %xmm7, %xmm4 + addps %xmm4, %xmm0 + + movlps %xmm0, (CO1) + movhps %xmm0, (CO2) + ALIGN_4 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + movq BO, B + + leaq (C, LDC, 2), C + ALIGN_4 + +.L100: + testq $1, N + jle .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + movq A, AO + + movq M, I + sarq $2, I + NOBRANCH + jle .L110 + ALIGN_4 + +.L101: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + movsd -32 * SIZE(BO), %xmm3 + xorps %xmm8, %xmm8 + prefetcht2 4 * SIZE(CO1) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) 
&& defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L105 + ALIGN_3 + +.L102: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + movss -31 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm1 + movaps -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + movss -30 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm1 + movaps -24 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + movss -29 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm1 + movaps -20 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + movss -28 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, AO + subq $ -4 * SIZE, BO + subq $1, %rax + BRANCH + jg .L102 + ALIGN_3 + +.L105: + movups ALPHA_R, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L108 + ALIGN_3 + +.L106: + addps %xmm1, %xmm8 + pshufd $0x00, %xmm3, %xmm1 + movss -31 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm1 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L106 + ALIGN_3 + +.L108: + addps %xmm1, %xmm8 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm1 + movhps 6 * SIZE(CO1), %xmm1 + + pshufd $0x50, %xmm8, %xmm4 + pshufd $0xfa, %xmm8, %xmm8 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm8 + addps %xmm4, %xmm0 + addps %xmm8, %xmm1 + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movlps %xmm1, 4 * SIZE(CO1) + movhps %xmm1, 6 * SIZE(CO1) + + addq $8 * SIZE, CO1 + decq I + BRANCH + jg .L101 + ALIGN_4 + +.L110: + testq $2, M + BRANCH + jle .L120 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L115 + ALIGN_3 + +.L112: + addps %xmm1, %xmm8 + movss -32 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movss -31 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movsd -28 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movss -30 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movsd -26 * SIZE(AO), %xmm0 + + addps %xmm1, %xmm8 + movss -29 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movsd -24 * SIZE(AO), %xmm0 + + subq $-4 * SIZE, BO + subq $-8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L112 + ALIGN_3 + +.L115: + movups ALPHA_R, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_3 + +.L116: + addps %xmm1, %xmm8 + movss -32 * SIZE(BO), %xmm1 + unpcklps %xmm1, %xmm1 + mulps %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L116 + ALIGN_3 + +.L118: + addps 
%xmm1, %xmm8 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + + pshufd $0x50, %xmm8, %xmm4 + + mulps %xmm7, %xmm4 + addps %xmm4, %xmm0 + + movlps %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + + addq $4 * SIZE, CO1 + ALIGN_4 + +.L120: + testq $1, M + BRANCH + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + + xorps %xmm2, %xmm2 + movss -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L125 + ALIGN_3 + +.L122: + addss %xmm2, %xmm8 + movss -32 * SIZE(BO), %xmm2 + mulss %xmm0, %xmm2 + movss -31 * SIZE(AO), %xmm0 + + addss %xmm2, %xmm8 + movss -31 * SIZE(BO), %xmm2 + mulss %xmm0, %xmm2 + movss -30 * SIZE(AO), %xmm0 + + addss %xmm2, %xmm8 + movss -30 * SIZE(BO), %xmm2 + mulss %xmm0, %xmm2 + movss -29 * SIZE(AO), %xmm0 + + addss %xmm2, %xmm8 + movss -29 * SIZE(BO), %xmm2 + mulss %xmm0, %xmm2 + movss -28 * SIZE(AO), %xmm0 + + subq $-4 * SIZE, AO + subq $-4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L122 + ALIGN_3 + +.L125: + movups ALPHA_R, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L128 + ALIGN_3 + +.L126: + addss %xmm2, %xmm8 + movss -32 * SIZE(BO), %xmm2 + mulss %xmm0, %xmm2 + movss -31 * SIZE(AO), %xmm0 + + addq $1 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L126 + ALIGN_3 + +.L128: + addps %xmm2, %xmm8 + + movsd (CO1), %xmm0 + + pshufd $0x50, %xmm8, %xmm4 + + mulps %xmm7, %xmm4 + addps %xmm4, %xmm0 + + movlps %xmm0, (CO1) + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm3m_kernel_8x4_barcelona.S b/kernel/x86_64/zgemm3m_kernel_8x4_barcelona.S new file mode 100644 index 0000000000..80c85244a2 --- /dev/null +++ b/kernel/x86_64/zgemm3m_kernel_8x4_barcelona.S @@ -0,0 +1,3253 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define CO2 %r12 +#define BB %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define ALPHA 0(%rsp) +#define J 16(%rsp) +#define OFFSET 24(%rsp) +#define KK 32(%rsp) +#define KKK 40(%rsp) +#define BUFFER 128(%rsp) + +#define PREFETCH prefetch +#define PREFETCHSIZE (16 * 17 + 0) + +#define RPREFETCHSIZE (16 * 9 + 0) +#define WPREFETCHSIZE (16 * 9 + 0) + +#define KERNEL1(xx) \ + mulps %xmm1, %xmm0 ;\ + mulps -28 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm0, %xmm8 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO, %rax, 4) ;\ + movaps %xmm2, %xmm0 ;\ + addps %xmm1, %xmm12 ;\ + movaps -24 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -28 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm0, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps -20 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm1, %xmm0 ;\ + mulps -28 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm0, %xmm10 ;\ + movaps -24 * SIZE(AO, %rax, 4), %xmm0 ;\ + addps %xmm1, %xmm14 ;\ + movaps -16 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -28 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps -12 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm0, %xmm2 + +#define KERNEL2(xx) \ + mulps %xmm1, %xmm0 ;\ + mulps -20 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm0, %xmm8 ;\ + movaps %xmm2, %xmm0 ;\ + addps %xmm1, %xmm12 ;\ + movaps -8 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -20 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm0, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps -4 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm1, %xmm0 ;\ + mulps -20 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm0, %xmm10 ;\ + addps %xmm1, %xmm14 ;\ + mulps %xmm3, %xmm2 ;\ + 
mulps -20 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 4 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm4, %xmm2 + +#define KERNEL3(xx) \ + mulps %xmm5, %xmm4 ;\ + mulps -12 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm4, %xmm8 ;\ + movaps 32 * SIZE(BO, %rax, 8), %xmm1 ;\ + movaps %xmm2, %xmm4 ;\ + addps %xmm5, %xmm12 ;\ + movaps 8 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -12 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm4, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps 12 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm5, %xmm4 ;\ + mulps -12 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm4, %xmm10 ;\ + movaps -8 * SIZE(AO, %rax, 4), %xmm4 ;\ + addps %xmm5, %xmm14 ;\ + movaps 16 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -12 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 20 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm4, %xmm2 + +#define KERNEL4(xx) \ + mulps %xmm5, %xmm4 ;\ + mulps -4 * SIZE(AO, %rax, 4), %xmm5 ;\ + movaps (AO, %rax, 4), %xmm6 ;\ + addps %xmm4, %xmm8 ;\ + movaps %xmm2, %xmm4 ;\ + addps %xmm5, %xmm12 ;\ + movaps 24 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -4 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm4, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps 28 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm5, %xmm4 ;\ + mulps -4 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm4, %xmm10 ;\ + addps %xmm5, %xmm14 ;\ + movaps 64 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -4 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 36 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm6, %xmm2 + +#define KERNEL5(xx) \ + mulps %xmm1, %xmm6 ;\ + mulps 4 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm6, %xmm8 ;\ + movaps %xmm2, %xmm6 ;\ + addps %xmm1, %xmm12 ;\ + movaps 40 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps 4 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps 16 * SIZE(AO, %rax, 4), %xmm7 ;\ + movaps %xmm6, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps 44 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm1, %xmm6 ;\ + mulps 4 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm6, %xmm10 ;\ + movaps 8 * SIZE(AO, %rax, 4), %xmm6 ;\ + addps %xmm1, %xmm14 ;\ + movaps 48 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps 4 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 52 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm6, %xmm2 + +#define KERNEL6(xx) \ + mulps %xmm1, %xmm6 ;\ + mulps 12 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm6, %xmm8 ;\ + movaps %xmm2, %xmm6 ;\ + addps %xmm1, %xmm12 ;\ + movaps 56 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps 12 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm6, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps 60 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm1, %xmm6 ;\ + mulps 12 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm6, %xmm10 ;\ + movaps 32 * SIZE(AO, %rax, 4), %xmm0 ;\ + addps %xmm1, %xmm14 ;\ + mulps %xmm3, %xmm2 ;\ + mulps 12 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 68 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm7, %xmm2 + +#define KERNEL7(xx) \ + mulps %xmm5, %xmm7 ;\ + mulps 20 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm7, %xmm8 ;\ + movaps 96 * SIZE(BO, %rax, 8), %xmm1 ;\ + movaps %xmm2, %xmm7 ;\ + addps %xmm5, %xmm12 ;\ + movaps 72 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps 20 * SIZE(AO, %rax, 4), %xmm3 
;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm7, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps 76 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm5, %xmm7 ;\ + mulps 20 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm7, %xmm10 ;\ + movaps 24 * SIZE(AO, %rax, 4), %xmm7 ;\ + addps %xmm5, %xmm14 ;\ + movaps 80 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps 20 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 84 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm7, %xmm2 + +#define KERNEL8(xx) \ + mulps %xmm5, %xmm7 ;\ + mulps 28 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm7, %xmm8 ;\ + movaps %xmm2, %xmm7 ;\ + addps %xmm5, %xmm12 ;\ + movaps 88 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps 28 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm7, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps 92 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm5, %xmm7 ;\ + mulps 28 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm7, %xmm10 ;\ + movaps 48 * SIZE(AO, %rax, 4), %xmm4 ;\ + addps %xmm5, %xmm14 ;\ + movaps 128 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps 28 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 100 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm0, %xmm2 ;\ + addq $16 * SIZE, %rax + +#define KERNEL_SUB1(xx) \ + mulps %xmm1, %xmm0 ;\ + mulps -28 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm0, %xmm8 ;\ + movaps %xmm2, %xmm0 ;\ + addps %xmm1, %xmm12 ;\ + movaps -24 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -28 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm0, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps -20 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm1, %xmm0 ;\ + mulps -28 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm0, %xmm10 ;\ + movaps -24 * SIZE(AO, %rax, 4), %xmm0 ;\ + addps %xmm1, %xmm14 ;\ + movaps -16 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -28 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps -12 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm0, %xmm2 + +#define KERNEL_SUB2(xx) \ + mulps %xmm1, %xmm0 ;\ + mulps -20 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm0, %xmm8 ;\ + movaps %xmm2, %xmm0 ;\ + addps %xmm1, %xmm12 ;\ + movaps -8 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -20 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm0, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps -4 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm1, %xmm0 ;\ + mulps -20 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm0, %xmm10 ;\ + movaps (AO, %rax, 4), %xmm0 ;\ + addps %xmm1, %xmm14 ;\ + movaps 32 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -20 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 4 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm4, %xmm2 + +#define KERNEL_SUB3(xx) \ + mulps %xmm5, %xmm4 ;\ + mulps -12 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm4, %xmm8 ;\ + movaps %xmm2, %xmm4 ;\ + addps %xmm5, %xmm12 ;\ + movaps 8 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -12 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm4, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps 12 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm5, %xmm4 ;\ + mulps -12 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm4, %xmm10 ;\ + movaps -8 * SIZE(AO, %rax, 4), %xmm4 ;\ + addps %xmm5, %xmm14 ;\ + movaps 16 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -12 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 20 * 
SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm4, %xmm2 + +#define KERNEL_SUB4(xx) \ + mulps %xmm5, %xmm4 ;\ + mulps -4 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm4, %xmm8 ;\ + movaps %xmm2, %xmm4 ;\ + addps %xmm5, %xmm12 ;\ + movaps 24 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -4 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm4, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps 28 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm5, %xmm4 ;\ + mulps -4 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm4, %xmm10 ;\ + addps %xmm5, %xmm14 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -4 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 36 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm0, %xmm2 + +#if defined(OS_LINUX) && defined(CORE_BARCELONA) + .align 32768 +#endif + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + movaps %xmm3, %xmm0 + movss OLD_ALPHA_I, %xmm1 +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif +#endif + + movq %rsp, %rbx # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-1024, %rsp # align stack + + STACK_TOUCHING + + movq OLD_M, M + movq OLD_N, N + + movss %xmm0, 0 + ALPHA + movss %xmm1, 4 + ALPHA + movss %xmm0, 8 + ALPHA + movss %xmm1, 12 + ALPHA + +#ifdef TRMMKERNEL + movsd %xmm4, OFFSET + movsd %xmm4, KK +#ifndef LEFT + negq KK +#endif +#endif + + subq $-32 * SIZE, A + + salq $ZBASE_SHIFT, LDC + + movq N, J + sarq $2, J # j = (n >> 2) + jle .L50 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + + movq K, %rax + sarq $2, %rax + jle .L03 + ALIGN_4 + +.L02: + prefetch (RPREFETCHSIZE + 0) * SIZE(B) + + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + movaps 8 * SIZE(B), %xmm11 + movaps 12 * SIZE(B), %xmm15 + + prefetchw (WPREFETCHSIZE + 0) * SIZE(BO) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + prefetchw (WPREFETCHSIZE + 16) * SIZE(BO) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + prefetchw (WPREFETCHSIZE + 32) * SIZE(BO) + + pshufd $0x00, %xmm11, %xmm0 + pshufd $0x55, %xmm11, %xmm1 + pshufd $0xaa, %xmm11, %xmm2 + pshufd $0xff, %xmm11, %xmm3 + + prefetchw (WPREFETCHSIZE + 48) * SIZE(BO) + + pshufd $0x00, %xmm15, %xmm4 + pshufd $0x55, %xmm15, %xmm5 + pshufd $0xaa, %xmm15, %xmm6 + pshufd $0xff, %xmm15, %xmm7 + + movaps %xmm0, 32 * SIZE(BO) + movaps %xmm1, 36 * SIZE(BO) + movaps %xmm2, 40 * SIZE(BO) + movaps %xmm3, 44 * SIZE(BO) + movaps %xmm4, 48 * SIZE(BO) + movaps %xmm5, 52 * SIZE(BO) 
+ movaps %xmm6, 56 * SIZE(BO) + movaps %xmm7, 60 * SIZE(BO) + + addq $16 * SIZE, B + addq $64 * SIZE, BO + + decq %rax + jne .L02 + ALIGN_4 + +.L03: + movq K, %rax + andq $3, %rax + BRANCH + jle .L10 + ALIGN_4 + +.L04: + movaps 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + + addq $ 4 * SIZE, B + addq $16 * SIZE, BO + decq %rax + jne .L04 + ALIGN_4 + +.L10: + movq C, CO1 + leaq (C, LDC, 1), CO2 + movq A, AO + + leaq (RPREFETCHSIZE + 0) * SIZE(B), BB + + movq M, I + sarq $3, I # i = (m >> 3) + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + + prefetch 0 * SIZE(BB) + prefetch 16 * SIZE(BB) + subq $-32 * SIZE, BB + + movaps -32 * SIZE(AO), %xmm0 + movaps -32 * SIZE(BO), %xmm1 + pxor %xmm8, %xmm8 + movaps -28 * SIZE(BO), %xmm3 + pxor %xmm9, %xmm9 + movaps -16 * SIZE(AO), %xmm4 + pxor %xmm10, %xmm10 + movaps 0 * SIZE(BO), %xmm5 + pxor %xmm11, %xmm11 + + prefetchw 7 * SIZE(CO1) + pxor %xmm12, %xmm12 + prefetchw 7 * SIZE(CO2) + pxor %xmm13, %xmm13 + prefetchw 7 * SIZE(CO1, LDC, 2) + pxor %xmm14, %xmm14 + prefetchw 7 * SIZE(CO2, LDC, 2) + pxor %xmm15, %xmm15 + + movaps %xmm0, %xmm2 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + andq $-8, %rax + + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO + negq %rax + NOBRANCH + je .L15 + ALIGN_3 + +.L12: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + NOBRANCH + je .L15 + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + NOBRANCH + je .L15 + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + NOBRANCH + je .L15 + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + NOBRANCH + je .L15 + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + NOBRANCH + je .L15 + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + NOBRANCH + je .L15 + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + NOBRANCH + je .L15 + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + BRANCH + jl .L12 + ALIGN_4 + +.L15: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + testq $4, %rax + je .L16 + xorq %rax, %rax + ALIGN_3 + + KERNEL_SUB1(32 * 0) + KERNEL_SUB2(32 
* 0) + KERNEL_SUB3(32 * 0) + KERNEL_SUB4(32 * 0) + + addq $32 * SIZE, AO + addq $64 * SIZE, BO + ALIGN_3 + +.L16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L18 + + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO + negq %rax + ALIGN_4 + +.L17: + mulps %xmm1, %xmm0 + mulps -28 * SIZE(AO, %rax, 4), %xmm1 + addps %xmm0, %xmm8 + movaps %xmm2, %xmm0 + addps %xmm1, %xmm12 + movaps -24 * SIZE(BO, %rax, 8), %xmm1 + mulps %xmm3, %xmm2 + mulps -28 * SIZE(AO, %rax, 4), %xmm3 + addps %xmm2, %xmm9 + movaps %xmm0, %xmm2 + addps %xmm3, %xmm13 + movaps -20 * SIZE(BO, %rax, 8), %xmm3 + mulps %xmm1, %xmm0 + mulps -28 * SIZE(AO, %rax, 4), %xmm1 + addps %xmm0, %xmm10 + movaps -24 * SIZE(AO, %rax, 4), %xmm0 + addps %xmm1, %xmm14 + movaps -16 * SIZE(BO, %rax, 8), %xmm1 + mulps %xmm3, %xmm2 + mulps -28 * SIZE(AO, %rax, 4), %xmm3 + addps %xmm2, %xmm11 + addps %xmm3, %xmm15 + movaps -12 * SIZE(BO, %rax, 8), %xmm3 + movaps %xmm0, %xmm2 + + addq $SIZE * 2, %rax + jl .L17 + ALIGN_4 + +.L18: + movups 0 * SIZE(CO1), %xmm0 + movups 4 * SIZE(CO1), %xmm1 + movups 8 * SIZE(CO1), %xmm2 + movups 12 * SIZE(CO1), %xmm3 + + pshufd $0x50, %xmm8, %xmm4 + pshufd $0xfa, %xmm8, %xmm8 + pshufd $0x50, %xmm12, %xmm5 + pshufd $0xfa, %xmm12, %xmm12 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm12 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm8 + addps %xmm2, %xmm5 + addps %xmm3, %xmm12 + + movlps %xmm4, 0 * SIZE(CO1) + movhps %xmm4, 2 * SIZE(CO1) + movlps %xmm8, 4 * SIZE(CO1) + movhps %xmm8, 6 * SIZE(CO1) + movlps %xmm5, 8 * SIZE(CO1) + movhps %xmm5, 10 * SIZE(CO1) + movlps %xmm12, 12 * SIZE(CO1) + movhps %xmm12, 14 * SIZE(CO1) + + movups 0 * SIZE(CO2), %xmm0 + movups 4 * SIZE(CO2), %xmm1 + movups 8 * SIZE(CO2), %xmm2 + movups 12 * SIZE(CO2), %xmm3 + + pshufd $0x50, %xmm9, %xmm4 + pshufd $0xfa, %xmm9, %xmm9 + pshufd $0x50, %xmm13, %xmm5 + pshufd $0xfa, %xmm13, %xmm13 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm9 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm13 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm9 + addps %xmm2, %xmm5 + addps %xmm3, %xmm13 + + movlps %xmm4, 0 * SIZE(CO2) + movhps %xmm4, 2 * SIZE(CO2) + movlps %xmm9, 4 * SIZE(CO2) + movhps %xmm9, 6 * SIZE(CO2) + movlps %xmm5, 8 * SIZE(CO2) + movhps %xmm5, 10 * SIZE(CO2) + movlps %xmm13, 12 * SIZE(CO2) + movhps %xmm13, 14 * SIZE(CO2) + + movups 0 * SIZE(CO1, LDC, 2), %xmm0 + movups 4 * SIZE(CO1, LDC, 2), %xmm1 + movups 8 * SIZE(CO1, LDC, 2), %xmm2 + movups 12 * SIZE(CO1, LDC, 2), %xmm3 + + pshufd $0x50, %xmm10, %xmm4 + pshufd $0xfa, %xmm10, %xmm10 + pshufd $0x50, %xmm14, %xmm5 + pshufd $0xfa, %xmm14, %xmm14 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm10 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm14 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm10 + addps %xmm2, %xmm5 + addps %xmm3, %xmm14 + + movlps %xmm4, 0 * SIZE(CO1, LDC, 2) + movhps %xmm4, 2 * SIZE(CO1, LDC, 2) + movlps %xmm10, 4 * SIZE(CO1, LDC, 2) + movhps %xmm10, 6 * SIZE(CO1, LDC, 2) + movlps %xmm5, 8 * SIZE(CO1, LDC, 2) + movhps %xmm5, 10 * SIZE(CO1, LDC, 2) + movlps %xmm14, 12 * SIZE(CO1, LDC, 2) + movhps %xmm14, 14 * SIZE(CO1, LDC, 2) + + movups 0 * SIZE(CO2, LDC, 2), %xmm0 + movups 4 * SIZE(CO2, LDC, 2), %xmm1 + movups 8 * SIZE(CO2, LDC, 2), %xmm2 + movups 12 * SIZE(CO2, LDC, 2), %xmm3 + + pshufd $0x50, %xmm11, %xmm4 + pshufd $0xfa, %xmm11, %xmm11 + pshufd $0x50, %xmm15, %xmm5 + pshufd $0xfa, %xmm15, %xmm15 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm11 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm15 + + addps %xmm0, %xmm4 + 
addps %xmm1, %xmm11 + addps %xmm2, %xmm5 + addps %xmm3, %xmm15 + + movlps %xmm4, 0 * SIZE(CO2, LDC, 2) + movhps %xmm4, 2 * SIZE(CO2, LDC, 2) + movlps %xmm11, 4 * SIZE(CO2, LDC, 2) + movhps %xmm11, 6 * SIZE(CO2, LDC, 2) + movlps %xmm5, 8 * SIZE(CO2, LDC, 2) + movhps %xmm5, 10 * SIZE(CO2, LDC, 2) + movlps %xmm15, 12 * SIZE(CO2, LDC, 2) + movhps %xmm15, 14 * SIZE(CO2, LDC, 2) + + addq $16 * SIZE, CO1 # coffset += 4 + addq $16 * SIZE, CO2 # coffset += 4 + decq I # i -- + jg .L11 + ALIGN_4 + +.L20: + testq $4, M + je .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -16 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L25 + ALIGN_4 + +.L22: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps -28 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + mulps 28 * SIZE(BO), %xmm8 + addps %xmm11, %xmm2 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm8, %xmm3 + movaps -24 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm13 + addps %xmm13, %xmm0 + movaps 36 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm1 + movaps 40 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + mulps 44 * SIZE(BO), %xmm8 + addps %xmm13, %xmm2 + movaps 96 * SIZE(BO), %xmm13 + addps %xmm8, %xmm3 + movaps -20 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm15 + addps %xmm15, %xmm0 + movaps 52 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm1 + movaps 56 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + mulps 60 * SIZE(BO), %xmm8 + addps %xmm15, %xmm2 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm8, %xmm3 + movaps 0 * SIZE(AO), %xmm8 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + mulps %xmm10, %xmm9 + addps %xmm9, %xmm0 + movaps 68 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm1 + movaps 72 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + mulps 76 * SIZE(BO), %xmm10 + addps %xmm9, %xmm2 + movaps 128 * SIZE(BO), %xmm9 + addps %xmm10, %xmm3 + movaps -12 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movaps 84 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm1 + movaps 88 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + mulps 92 * SIZE(BO), %xmm10 + addps %xmm11, %xmm2 + movaps 144 * SIZE(BO), %xmm11 + addps %xmm10, %xmm3 + movaps -8 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movaps 100 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm1 + movaps 104 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + mulps 108 * SIZE(BO), %xmm10 + 
addps %xmm13, %xmm2 + movaps 160 * SIZE(BO), %xmm13 + addps %xmm10, %xmm3 + movaps -4 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movaps 116 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm1 + movaps 120 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + mulps 124 * SIZE(BO), %xmm10 + addps %xmm15, %xmm2 + movaps 176 * SIZE(BO), %xmm15 + addps %xmm10, %xmm3 + movaps 16 * SIZE(AO), %xmm10 + + addq $ 32 * SIZE, AO + addq $128 * SIZE, BO + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 16 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps -28 * SIZE(AO), %xmm8 + + addq $ 4 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L26 + ALIGN_4 + +.L28: + movups 0 * SIZE(CO1), %xmm8 + movups 4 * SIZE(CO1), %xmm9 + + pshufd $0x50, %xmm0, %xmm4 + pshufd $0xfa, %xmm0, %xmm0 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm0 + + addps %xmm8, %xmm4 + addps %xmm9, %xmm0 + + movlps %xmm4, 0 * SIZE(CO1) + movhps %xmm4, 2 * SIZE(CO1) + movlps %xmm0, 4 * SIZE(CO1) + movhps %xmm0, 6 * SIZE(CO1) + + movups 0 * SIZE(CO2), %xmm8 + movups 4 * SIZE(CO2), %xmm9 + + pshufd $0x50, %xmm1, %xmm4 + pshufd $0xfa, %xmm1, %xmm1 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm1 + + addps %xmm8, %xmm4 + addps %xmm9, %xmm1 + + movlps %xmm4, 0 * SIZE(CO2) + movhps %xmm4, 2 * SIZE(CO2) + movlps %xmm1, 4 * SIZE(CO2) + movhps %xmm1, 6 * SIZE(CO2) + + movups 0 * SIZE(CO1, LDC, 2), %xmm8 + movups 4 * SIZE(CO1, LDC, 2), %xmm9 + + pshufd $0x50, %xmm2, %xmm4 + pshufd $0xfa, %xmm2, %xmm2 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm2 + + addps %xmm8, %xmm4 + addps %xmm9, %xmm2 + + movlps %xmm4, 0 * SIZE(CO1, LDC, 2) + movhps %xmm4, 2 * SIZE(CO1, LDC, 2) + movlps %xmm2, 4 * SIZE(CO1, LDC, 2) + movhps %xmm2, 6 * SIZE(CO1, LDC, 2) + + movups 0 * SIZE(CO2, LDC, 2), %xmm8 + movups 4 * SIZE(CO2, LDC, 2), %xmm9 + + pshufd $0x50, %xmm3, %xmm4 + pshufd $0xfa, %xmm3, %xmm3 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm3 + + addps %xmm8, %xmm4 + addps %xmm9, %xmm3 + + movlps %xmm4, 0 * SIZE(CO2, LDC, 2) + movhps %xmm4, 2 * SIZE(CO2, LDC, 2) + movlps %xmm3, 4 * SIZE(CO2, LDC, 2) + movhps %xmm3, 6 * SIZE(CO2, LDC, 2) + + addq $8 * SIZE, CO1 # coffset += 4 + addq $8 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L30: + testq $2, M + je .L40 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -24 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L35 + ALIGN_4 + +.L32: + mulps %xmm8, %xmm9 + addps %xmm9, 
%xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movsd 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movsd 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movsd 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movsd 64 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movsd 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movsd 24 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movsd 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd -28 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movsd 80 * SIZE(BO), %xmm11 + + mulps %xmm8, %xmm13 + addps %xmm13, %xmm0 + movsd 36 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm1 + movsd 40 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm2 + movsd 44 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + movsd -26 * SIZE(AO), %xmm8 + addps %xmm13, %xmm3 + movsd 96 * SIZE(BO), %xmm13 + + mulps %xmm8, %xmm15 + addps %xmm15, %xmm0 + movsd 52 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm1 + movsd 56 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm2 + movsd 60 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + movsd -16 * SIZE(AO), %xmm8 + addps %xmm15, %xmm3 + movsd 112 * SIZE(BO), %xmm15 + + mulps %xmm10, %xmm9 + addps %xmm9, %xmm0 + movsd 68 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm1 + movsd 72 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm2 + movsd 76 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movsd -22 * SIZE(AO), %xmm10 + addps %xmm9, %xmm3 + movsd 128 * SIZE(BO), %xmm9 + + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movsd 84 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm1 + movsd 88 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm2 + movsd 92 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movsd -20 * SIZE(AO), %xmm10 + addps %xmm11, %xmm3 + movsd 144 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movsd 100 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm1 + movsd 104 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movsd 108 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd -18 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movsd 160 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movsd 116 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm1 + movsd 120 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movsd 124 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd -8 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movsd 176 * SIZE(BO), %xmm15 + + addq $ 16 * SIZE, AO + addq $128 * SIZE, BO + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movsd 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movsd 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movsd 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movsd 16 * SIZE(BO), %xmm9 + + addq $ 2 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L36 + ALIGN_4 + +.L38: + movups 0 * SIZE(CO1), %xmm8 + + pshufd $0x50, %xmm0, %xmm4 + mulps %xmm7, %xmm4 + addps %xmm8, %xmm4 + + movlps %xmm4, 0 * SIZE(CO1) + movhps %xmm4, 2 * SIZE(CO1) + + movups 0 * SIZE(CO2), %xmm8 + + pshufd $0x50, %xmm1, %xmm4 + mulps %xmm7, 
%xmm4 + addps %xmm8, %xmm4 + + movlps %xmm4, 0 * SIZE(CO2) + movhps %xmm4, 2 * SIZE(CO2) + + movups 0 * SIZE(CO1, LDC, 2), %xmm8 + + pshufd $0x50, %xmm2, %xmm4 + mulps %xmm7, %xmm4 + addps %xmm8, %xmm4 + + movlps %xmm4, 0 * SIZE(CO1, LDC, 2) + movhps %xmm4, 2 * SIZE(CO1, LDC, 2) + + movups 0 * SIZE(CO2, LDC, 2), %xmm8 + + pshufd $0x50, %xmm3, %xmm4 + mulps %xmm7, %xmm4 + addps %xmm8, %xmm4 + + movlps %xmm4, 0 * SIZE(CO2, LDC, 2) + movhps %xmm4, 2 * SIZE(CO2, LDC, 2) + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L40: + testq $1, M + je .L49 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO + leaq (BO, %rax, 8), BO +#endif + + movss -32 * SIZE(AO), %xmm8 + movss -28 * SIZE(AO), %xmm10 + + movss 0 * SIZE(BO), %xmm9 + movss 16 * SIZE(BO), %xmm11 + movss 32 * SIZE(BO), %xmm13 + movss 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L45 + ALIGN_4 + +.L42: + mulss %xmm8, %xmm9 + addss %xmm9, %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movss 4 * SIZE(BO), %xmm9 + mulss %xmm8, %xmm9 + addss %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + mulss %xmm8, %xmm9 + addss %xmm9, %xmm2 + movss 12 * SIZE(BO), %xmm9 + mulss %xmm8, %xmm9 + movss -31 * SIZE(AO), %xmm8 + addss %xmm9, %xmm3 + movss 64 * SIZE(BO), %xmm9 + + mulss %xmm8, %xmm11 + addss %xmm11, %xmm0 + movss 20 * SIZE(BO), %xmm11 + mulss %xmm8, %xmm11 + addss %xmm11, %xmm1 + movss 24 * SIZE(BO), %xmm11 + mulss %xmm8, %xmm11 + addss %xmm11, %xmm2 + movss 28 * SIZE(BO), %xmm11 + mulss %xmm8, %xmm11 + movss -30 * SIZE(AO), %xmm8 + addss %xmm11, %xmm3 + movss 80 * SIZE(BO), %xmm11 + + mulss %xmm8, %xmm13 + addss %xmm13, %xmm0 + movss 36 * SIZE(BO), %xmm13 + mulss %xmm8, %xmm13 + addss %xmm13, %xmm1 + movss 40 * SIZE(BO), %xmm13 + mulss %xmm8, %xmm13 + addss %xmm13, %xmm2 + movss 44 * SIZE(BO), %xmm13 + mulss %xmm8, %xmm13 + movss -29 * SIZE(AO), %xmm8 + addss %xmm13, %xmm3 + movss 96 * SIZE(BO), %xmm13 + + mulss %xmm8, %xmm15 + addss %xmm15, %xmm0 + movss 52 * SIZE(BO), %xmm15 + mulss %xmm8, %xmm15 + addss %xmm15, %xmm1 + movss 56 * SIZE(BO), %xmm15 + mulss %xmm8, %xmm15 + addss %xmm15, %xmm2 + movss 60 * SIZE(BO), %xmm15 + mulss %xmm8, %xmm15 + movss -24 * SIZE(AO), %xmm8 + addss %xmm15, %xmm3 + movss 112 * SIZE(BO), %xmm15 + + mulss %xmm10, %xmm9 + addss %xmm9, %xmm0 + movss 68 * SIZE(BO), %xmm9 + mulss %xmm10, %xmm9 + addss %xmm9, %xmm1 + movss 72 * SIZE(BO), %xmm9 + mulss %xmm10, %xmm9 + addss %xmm9, %xmm2 + movss 76 * SIZE(BO), %xmm9 + mulss %xmm10, %xmm9 + movss -27 * SIZE(AO), %xmm10 + addss %xmm9, %xmm3 + movss 128 * SIZE(BO), %xmm9 + + mulss %xmm10, %xmm11 + addss %xmm11, %xmm0 + movss 84 * SIZE(BO), %xmm11 + mulss %xmm10, %xmm11 + addss %xmm11, %xmm1 + movss 88 * SIZE(BO), %xmm11 + mulss %xmm10, %xmm11 + addss %xmm11, %xmm2 + movss 92 * SIZE(BO), %xmm11 + mulss %xmm10, %xmm11 + movss -26 * SIZE(AO), %xmm10 + addss %xmm11, %xmm3 + movss 144 * SIZE(BO), %xmm11 + + mulss %xmm10, %xmm13 + addss %xmm13, %xmm0 + movss 
100 * SIZE(BO), %xmm13 + mulss %xmm10, %xmm13 + addss %xmm13, %xmm1 + movss 104 * SIZE(BO), %xmm13 + mulss %xmm10, %xmm13 + addss %xmm13, %xmm2 + movss 108 * SIZE(BO), %xmm13 + mulss %xmm10, %xmm13 + movss -25 * SIZE(AO), %xmm10 + addss %xmm13, %xmm3 + movss 160 * SIZE(BO), %xmm13 + + mulss %xmm10, %xmm15 + addss %xmm15, %xmm0 + movss 116 * SIZE(BO), %xmm15 + mulss %xmm10, %xmm15 + addss %xmm15, %xmm1 + movss 120 * SIZE(BO), %xmm15 + mulss %xmm10, %xmm15 + addss %xmm15, %xmm2 + movss 124 * SIZE(BO), %xmm15 + mulss %xmm10, %xmm15 + movss -20 * SIZE(AO), %xmm10 + addss %xmm15, %xmm3 + movss 176 * SIZE(BO), %xmm15 + + addq $ 8 * SIZE, AO + addq $128 * SIZE, BO + decq %rax + jne .L42 + ALIGN_4 + +.L45: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L48 + ALIGN_4 + +.L46: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movss 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movss 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss -31 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movss 16 * SIZE(BO), %xmm9 + + addq $ 1 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L46 + ALIGN_4 + +.L48: + movsd 0 * SIZE(CO1), %xmm8 + + pshufd $0x50, %xmm0, %xmm4 + mulps %xmm7, %xmm4 + addps %xmm8, %xmm4 + + movlps %xmm4, 0 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm8 + + pshufd $0x50, %xmm1, %xmm4 + mulps %xmm7, %xmm4 + addps %xmm8, %xmm4 + + movlps %xmm4, 0 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm8 + + pshufd $0x50, %xmm2, %xmm4 + mulps %xmm7, %xmm4 + addps %xmm8, %xmm4 + + movlps %xmm4, 0 * SIZE(CO1, LDC, 2) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm8 + + pshufd $0x50, %xmm3, %xmm4 + mulps %xmm7, %xmm4 + addps %xmm8, %xmm4 + + movlps %xmm4, 0 * SIZE(CO2, LDC, 2) + ALIGN_4 + +.L49: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + leaq (C, LDC, 4), C # c += 4 * ldc + decq J # j -- + jg .L01 + +.L50: + testq $2, N + je .L100 + +.L51: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + + movq K, %rax + sarq $2, %rax + jle .L53 + ALIGN_4 + +.L52: + prefetch (RPREFETCHSIZE + 0) * SIZE(B) + + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + + prefetchw (WPREFETCHSIZE + 0) * SIZE(BO) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + prefetchw (WPREFETCHSIZE + 16) * SIZE(BO) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO + + decq %rax + jne .L52 + ALIGN_4 + +.L53: + movq K, %rax + andq $3, %rax + BRANCH + jle .L60 + ALIGN_4 + +.L54: + movsd 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + + addq $ 2 * SIZE, B + addq $ 8 * SIZE, BO + decq %rax + jne .L54 + ALIGN_4 + +.L60: + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + movq M, I + sarq 
$3, I # i = (m >> 3) + jle .L70 + ALIGN_4 + +.L61: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -16 * SIZE(AO), %xmm10 + movaps 0 * SIZE(AO), %xmm12 + movaps 16 * SIZE(AO), %xmm14 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + + prefetchw 15 * SIZE(CO1) + pxor %xmm4, %xmm4 + prefetchw 15 * SIZE(CO2) + pxor %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L65 + ALIGN_4 + +.L62: + mulps %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 0 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps -28 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps -24 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps -20 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps 32 * SIZE(AO), %xmm8 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + mulps %xmm10, %xmm11 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm11, %xmm0 + movaps 16 * SIZE(BO), %xmm11 + addps %xmm10, %xmm1 + movaps -12 * SIZE(AO), %xmm10 + mulps %xmm10, %xmm11 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm11, %xmm4 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm10, %xmm5 + movaps -8 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm11 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm11, %xmm0 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm10, %xmm1 + movaps -4 * SIZE(AO), %xmm10 + mulps %xmm10, %xmm11 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm11, %xmm4 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm10, %xmm5 + movaps 48 * SIZE(AO), %xmm10 + + PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) + mulps %xmm12, %xmm13 + mulps 36 * SIZE(BO), %xmm12 + addps %xmm13, %xmm0 + movaps 32 * SIZE(BO), %xmm13 + addps %xmm12, %xmm1 + movaps 4 * SIZE(AO), %xmm12 + mulps %xmm12, %xmm13 + mulps 36 * SIZE(BO), %xmm12 + addps %xmm13, %xmm4 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm12, %xmm5 + movaps 8 * SIZE(AO), %xmm12 + + mulps %xmm12, %xmm13 + mulps 44 * SIZE(BO), %xmm12 + addps %xmm13, %xmm0 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm12, %xmm1 + movaps 12 * SIZE(AO), %xmm12 + mulps %xmm12, %xmm13 + mulps 44 * SIZE(BO), %xmm12 + addps %xmm13, %xmm4 + movaps 96 * SIZE(BO), %xmm13 + addps %xmm12, %xmm5 + movaps 64 * SIZE(AO), %xmm12 + + PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) + + mulps %xmm14, %xmm15 + mulps 52 * SIZE(BO), %xmm14 + addps %xmm15, %xmm0 + movaps 48 * SIZE(BO), %xmm15 + addps %xmm14, %xmm1 + movaps 20 * SIZE(AO), %xmm14 + mulps %xmm14, %xmm15 + mulps 52 * SIZE(BO), %xmm14 + addps %xmm15, %xmm4 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm14, %xmm5 + movaps 24 * SIZE(AO), %xmm14 + + mulps %xmm14, %xmm15 + mulps 60 * SIZE(BO), %xmm14 + addps %xmm15, %xmm0 + movaps 56 * SIZE(BO), %xmm15 + addps 
%xmm14, %xmm1 + movaps 28 * SIZE(AO), %xmm14 + mulps %xmm14, %xmm15 + mulps 60 * SIZE(BO), %xmm14 + addps %xmm15, %xmm4 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm14, %xmm5 + movaps 80 * SIZE(AO), %xmm14 + + addq $64 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L62 + ALIGN_4 + +.L65: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 0 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps -28 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps -24 * SIZE(AO), %xmm8 + + addq $8 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L66 + ALIGN_4 + +.L68: + movups 0 * SIZE(CO1), %xmm8 + movups 4 * SIZE(CO1), %xmm9 + movups 8 * SIZE(CO1), %xmm10 + movups 12 * SIZE(CO1), %xmm11 + + pshufd $0x50, %xmm0, %xmm2 + pshufd $0xfa, %xmm0, %xmm0 + pshufd $0x50, %xmm4, %xmm3 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm7, %xmm2 + mulps %xmm7, %xmm0 + mulps %xmm7, %xmm3 + mulps %xmm7, %xmm4 + + addps %xmm8, %xmm2 + addps %xmm9, %xmm0 + addps %xmm10, %xmm3 + addps %xmm11, %xmm4 + + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 2 * SIZE(CO1) + movlps %xmm0, 4 * SIZE(CO1) + movhps %xmm0, 6 * SIZE(CO1) + movlps %xmm3, 8 * SIZE(CO1) + movhps %xmm3, 10 * SIZE(CO1) + movlps %xmm4, 12 * SIZE(CO1) + movhps %xmm4, 14 * SIZE(CO1) + + movups 0 * SIZE(CO2), %xmm8 + movups 4 * SIZE(CO2), %xmm9 + movups 8 * SIZE(CO2), %xmm10 + movups 12 * SIZE(CO2), %xmm11 + + pshufd $0x50, %xmm1, %xmm2 + pshufd $0xfa, %xmm1, %xmm1 + pshufd $0x50, %xmm5, %xmm3 + pshufd $0xfa, %xmm5, %xmm5 + + mulps %xmm7, %xmm2 + mulps %xmm7, %xmm1 + mulps %xmm7, %xmm3 + mulps %xmm7, %xmm5 + + addps %xmm8, %xmm2 + addps %xmm9, %xmm1 + addps %xmm10, %xmm3 + addps %xmm11, %xmm5 + + movlps %xmm2, 0 * SIZE(CO2) + movhps %xmm2, 2 * SIZE(CO2) + movlps %xmm1, 4 * SIZE(CO2) + movhps %xmm1, 6 * SIZE(CO2) + movlps %xmm3, 8 * SIZE(CO2) + movhps %xmm3, 10 * SIZE(CO2) + movlps %xmm5, 12 * SIZE(CO2) + movhps %xmm5, 14 * SIZE(CO2) + + addq $16 * SIZE, CO1 # coffset += 4 + addq $16 * SIZE, CO2 # coffset += 4 + decq I # i -- + jg .L61 + ALIGN_4 + +.L70: + testq $4, M + je .L80 + + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -16 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L75 + ALIGN_4 + +.L72: + mulps %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps -28 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + 
movaps -24 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm11 + mulps 20 * SIZE(BO), %xmm8 + addps %xmm11, %xmm0 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm8, %xmm1 + movaps -20 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm11 + mulps 28 * SIZE(BO), %xmm8 + addps %xmm11, %xmm2 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm8, %xmm3 + movaps 0 * SIZE(AO), %xmm8 + + mulps %xmm10, %xmm13 + mulps 36 * SIZE(BO), %xmm10 + addps %xmm13, %xmm0 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm10, %xmm1 + movaps -12 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm13 + mulps 44 * SIZE(BO), %xmm10 + addps %xmm13, %xmm2 + movaps 96 * SIZE(BO), %xmm13 + addps %xmm10, %xmm3 + movaps -8 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm15 + mulps 52 * SIZE(BO), %xmm10 + addps %xmm15, %xmm0 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm10, %xmm1 + movaps -4 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm15 + mulps 60 * SIZE(BO), %xmm10 + addps %xmm15, %xmm2 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm10, %xmm3 + movaps 16 * SIZE(AO), %xmm10 + + addq $32 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L72 + ALIGN_4 + +.L75: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps -28 * SIZE(AO), %xmm8 + + addq $4 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L76 + ALIGN_4 + +.L78: + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + + movups 0 * SIZE(CO1), %xmm8 + movups 4 * SIZE(CO1), %xmm9 + + pshufd $0x50, %xmm0, %xmm2 + pshufd $0xfa, %xmm0, %xmm0 + + mulps %xmm7, %xmm2 + mulps %xmm7, %xmm0 + addps %xmm8, %xmm2 + addps %xmm9, %xmm0 + + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 2 * SIZE(CO1) + movlps %xmm0, 4 * SIZE(CO1) + movhps %xmm0, 6 * SIZE(CO1) + + movups 0 * SIZE(CO2), %xmm8 + movups 4 * SIZE(CO2), %xmm9 + + pshufd $0x50, %xmm1, %xmm2 + pshufd $0xfa, %xmm1, %xmm1 + + mulps %xmm7, %xmm2 + mulps %xmm7, %xmm1 + addps %xmm8, %xmm2 + addps %xmm9, %xmm1 + + movlps %xmm2, 0 * SIZE(CO2) + movhps %xmm2, 2 * SIZE(CO2) + movlps %xmm1, 4 * SIZE(CO2) + movhps %xmm1, 6 * SIZE(CO2) + + addq $8 * SIZE, CO1 # coffset += 4 + addq $8 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L80: + testq $2, M + je .L90 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -24 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L85 + ALIGN_4 + +.L82: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movsd 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 8 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movsd 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd -28 * SIZE(AO), %xmm8 + 
addps %xmm9, %xmm3 + movsd 64 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movsd 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd -26 * SIZE(AO), %xmm8 + addps %xmm11, %xmm1 + movsd 24 * SIZE(BO), %xmm11 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movsd 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd -16 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movsd 80 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movsd 36 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd -22 * SIZE(AO), %xmm10 + addps %xmm13, %xmm1 + movsd 40 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movsd 44 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd -20 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movsd 96 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movsd 52 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd -18 * SIZE(AO), %xmm10 + addps %xmm15, %xmm1 + movsd 56 * SIZE(BO), %xmm15 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movsd 60 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd -8 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movsd 112 * SIZE(BO), %xmm15 + + addq $16 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L82 + ALIGN_4 + +.L85: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L88 + ALIGN_4 + +.L86: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movsd 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 8 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L86 + ALIGN_4 + +.L88: + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + + movups 0 * SIZE(CO1), %xmm8 + + pshufd $0x50, %xmm0, %xmm2 + + mulps %xmm7, %xmm2 + addps %xmm8, %xmm2 + + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 2 * SIZE(CO1) + + movups 0 * SIZE(CO2), %xmm8 + + pshufd $0x50, %xmm1, %xmm2 + + mulps %xmm7, %xmm2 + addps %xmm8, %xmm2 + + movlps %xmm2, 0 * SIZE(CO2) + movhps %xmm2, 2 * SIZE(CO2) + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L90: + testq $1, M + je .L99 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + + movss -32 * SIZE(AO), %xmm8 + movss -28 * SIZE(AO), %xmm10 + + movss 0 * SIZE(BO), %xmm9 + movss 16 * SIZE(BO), %xmm11 + movss 32 * SIZE(BO), %xmm13 + movss 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L95 + ALIGN_4 + +.L92: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movss 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss -31 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movss 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movss 64 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movss 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movss -29 * 
SIZE(AO), %xmm8 + addps %xmm11, %xmm1 + movss 24 * SIZE(BO), %xmm11 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movss 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movss -24 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movss 80 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movss 36 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movss -27 * SIZE(AO), %xmm10 + addps %xmm13, %xmm1 + movss 40 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movss 44 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movss -26 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movss 96 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movss 52 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movss -25 * SIZE(AO), %xmm10 + addps %xmm15, %xmm1 + movss 56 * SIZE(BO), %xmm15 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movss 60 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movss -20 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movss 112 * SIZE(BO), %xmm15 + + addq $ 8 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L92 + ALIGN_4 + +.L95: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L98 + ALIGN_4 + +.L96: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movss 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss -31 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L96 + ALIGN_4 + +.L98: + addss %xmm2, %xmm0 + addss %xmm3, %xmm1 + + movsd 0 * SIZE(CO1), %xmm8 + + pshufd $0x50, %xmm0, %xmm2 + + mulps %xmm7, %xmm2 + addps %xmm8, %xmm2 + + movlps %xmm2, 0 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm8 + + pshufd $0x50, %xmm1, %xmm2 + + mulps %xmm7, %xmm2 + addps %xmm8, %xmm2 + + movlps %xmm2, 0 * SIZE(CO2) + ALIGN_4 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + leaq (C, LDC, 2), C # c += 4 * ldc + ALIGN_4 + + +.L100: + testq $1, N + je .L999 + +.L101: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + + movq K, %rax + sarq $3, %rax + jle .L103 + ALIGN_4 + + +.L102: + prefetch (RPREFETCHSIZE + 0) * SIZE(B) + + movups 0 * SIZE(B), %xmm3 + movups 4 * SIZE(B), %xmm7 + + prefetchw (WPREFETCHSIZE + 0) * SIZE(BO) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + prefetchw (WPREFETCHSIZE + 16) * SIZE(BO) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO + + decq %rax + jne .L102 + ALIGN_4 + +.L103: + movq K, %rax + andq $7, %rax + BRANCH + jle .L110 + ALIGN_4 + +.L104: + movss 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + + movaps %xmm0, 0 * SIZE(BO) + + addq $ 1 * SIZE, B + addq $ 4 * SIZE, BO + decq %rax + jne .L104 + ALIGN_4 + +.L110: + movq C, CO1 # coffset1 = c + movq A, AO # aoffset = a + + movq M, I + sarq $3, I # i = (m >> 3) + jle .L120 + ALIGN_4 + +.L111: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq 
KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -16 * SIZE(AO), %xmm10 + movaps 0 * SIZE(AO), %xmm12 + movaps 16 * SIZE(AO), %xmm14 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + + prefetchw 15 * SIZE(CO1) + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L115 + ALIGN_4 + +.L112: + mulps %xmm9, %xmm8 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulps -28 * SIZE(AO), %xmm9 + addps %xmm8, %xmm0 + movaps -24 * SIZE(AO), %xmm8 + addps %xmm9, %xmm4 + movaps 4 * SIZE(BO), %xmm9 + + mulps %xmm9, %xmm8 + mulps -20 * SIZE(AO), %xmm9 + addps %xmm8, %xmm0 + movaps 32 * SIZE(AO), %xmm8 + addps %xmm9, %xmm4 + movaps 8 * SIZE(BO), %xmm9 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + mulps %xmm9, %xmm10 + mulps -12 * SIZE(AO), %xmm9 + addps %xmm10, %xmm0 + movaps -8 * SIZE(AO), %xmm10 + addps %xmm9, %xmm4 + movaps 12 * SIZE(BO), %xmm9 + + mulps %xmm9, %xmm10 + mulps -4 * SIZE(AO), %xmm9 + addps %xmm10, %xmm0 + movaps 48 * SIZE(AO), %xmm10 + addps %xmm9, %xmm4 + movaps 32 * SIZE(BO), %xmm9 + + PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) + mulps %xmm11, %xmm12 + mulps 4 * SIZE(AO), %xmm11 + addps %xmm12, %xmm0 + movaps 8 * SIZE(AO), %xmm12 + addps %xmm11, %xmm4 + movaps 20 * SIZE(BO), %xmm11 + + mulps %xmm11, %xmm12 + mulps 12 * SIZE(AO), %xmm11 + addps %xmm12, %xmm0 + movaps 64 * SIZE(AO), %xmm12 + addps %xmm11, %xmm4 + movaps 24 * SIZE(BO), %xmm11 + + PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) + mulps %xmm11, %xmm14 + mulps 20 * SIZE(AO), %xmm11 + addps %xmm14, %xmm0 + movaps 24 * SIZE(AO), %xmm14 + addps %xmm11, %xmm4 + movaps 28 * SIZE(BO), %xmm11 + + mulps %xmm11, %xmm14 + mulps 28 * SIZE(AO), %xmm11 + addps %xmm14, %xmm0 + movaps 80 * SIZE(AO), %xmm14 + addps %xmm11, %xmm4 + movaps 48 * SIZE(BO), %xmm11 + + addq $64 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L112 + ALIGN_4 + +.L115: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulps %xmm9, %xmm8 + mulps -28 * SIZE(AO), %xmm9 + addps %xmm8, %xmm0 + movaps -24 * SIZE(AO), %xmm8 + addps %xmm9, %xmm4 + movaps 4 * SIZE(BO), %xmm9 + + addq $8 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L116 + ALIGN_4 + +.L118: + movups 0 * SIZE(CO1), %xmm8 + movups 4 * SIZE(CO1), %xmm9 + movups 8 * SIZE(CO1), %xmm10 + movups 12 * SIZE(CO1), %xmm11 + + pshufd $0x50, %xmm0, %xmm2 + pshufd $0xfa, %xmm0, %xmm0 + pshufd $0x50, %xmm4, %xmm3 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm7, %xmm2 + mulps %xmm7, %xmm0 + mulps %xmm7, %xmm3 + mulps %xmm7, %xmm4 + + addps %xmm8, %xmm2 + addps %xmm9, %xmm0 + addps %xmm10, %xmm3 + addps %xmm11, %xmm4 + + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 2 * SIZE(CO1) + movlps %xmm0, 4 * SIZE(CO1) + movhps %xmm0, 6 * SIZE(CO1) + movlps %xmm3, 8 * SIZE(CO1) + movhps %xmm3, 10 * SIZE(CO1) + movlps %xmm4, 12 * SIZE(CO1) + movhps %xmm4, 14 * SIZE(CO1) + + addq $16 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L111 + ALIGN_4 + +.L120: + testq $4, M + je .L130 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) 
&& defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -16 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L125 + ALIGN_4 + +.L122: + mulps %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movaps -28 * SIZE(AO), %xmm8 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 32 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps -24 * SIZE(AO), %xmm8 + mulps 8 * SIZE(BO), %xmm8 + addps %xmm8, %xmm2 + movaps -20 * SIZE(AO), %xmm8 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm8, %xmm3 + movaps 0 * SIZE(AO), %xmm8 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + mulps %xmm10, %xmm11 + movaps -12 * SIZE(AO), %xmm10 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm11, %xmm0 + movaps 48 * SIZE(BO), %xmm11 + addps %xmm10, %xmm1 + movaps -8 * SIZE(AO), %xmm10 + mulps 24 * SIZE(BO), %xmm10 + addps %xmm10, %xmm2 + movaps -4 * SIZE(AO), %xmm10 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm10, %xmm3 + movaps 16 * SIZE(AO), %xmm10 + + addq $32 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L122 + ALIGN_4 + +.L125: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L128 + ALIGN_4 + +.L126: + mulps %xmm8, %xmm9 + movaps -28 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + + addq $4 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L126 + ALIGN_4 + +.L128: + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + addps %xmm2, %xmm0 + + movups 0 * SIZE(CO1), %xmm8 + movups 4 * SIZE(CO1), %xmm9 + + pshufd $0x50, %xmm0, %xmm2 + pshufd $0xfa, %xmm0, %xmm0 + + mulps %xmm7, %xmm2 + mulps %xmm7, %xmm0 + addps %xmm8, %xmm2 + addps %xmm9, %xmm0 + + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 2 * SIZE(CO1) + movlps %xmm0, 4 * SIZE(CO1) + movhps %xmm0, 6 * SIZE(CO1) + + addq $8 * SIZE, CO1 # coffset += 4 + ALIGN_4 + +.L130: + testq $2, M + je .L140 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -24 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L135 + ALIGN_4 + +.L132: + mulps %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movsd -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movsd 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd -28 * SIZE(AO), %xmm8 + addps %xmm9, 
%xmm1 + movsd 8 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + movsd -26 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movsd 12 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + movsd -16 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 32 * SIZE(BO), %xmm9 + + mulps %xmm10, %xmm11 + movsd -22 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movsd 20 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm11 + movsd -20 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movsd 24 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm11 + movsd -18 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movsd 28 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm11 + movsd -8 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movsd 48 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L132 + ALIGN_4 + +.L135: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L138 + ALIGN_4 + +.L136: + mulps %xmm8, %xmm9 + movsd -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movsd 4 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L136 + ALIGN_4 + +.L138: + addps %xmm1, %xmm0 + + movups 0 * SIZE(CO1), %xmm8 + + pshufd $0x50, %xmm0, %xmm2 + mulps %xmm7, %xmm2 + addps %xmm8, %xmm2 + + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 2 * SIZE(CO1) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + +.L140: + testq $1, M + je .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + movss -32 * SIZE(AO), %xmm8 + movss -28 * SIZE(AO), %xmm10 + + movss 0 * SIZE(BO), %xmm9 + movss 16 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L145 + ALIGN_4 + +.L142: + mulss %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movss -31 * SIZE(AO), %xmm8 + mulss 4 * SIZE(BO), %xmm8 + addss %xmm9, %xmm0 + movss 32 * SIZE(BO), %xmm9 + addss %xmm8, %xmm1 + movss -30 * SIZE(AO), %xmm8 + mulss 8 * SIZE(BO), %xmm8 + addss %xmm8, %xmm2 + movss -29 * SIZE(AO), %xmm8 + mulss 12 * SIZE(BO), %xmm8 + addss %xmm8, %xmm3 + movss -24 * SIZE(AO), %xmm8 + mulss %xmm10, %xmm11 + movss -27 * SIZE(AO), %xmm10 + mulss 20 * SIZE(BO), %xmm10 + addss %xmm11, %xmm0 + movss 48 * SIZE(BO), %xmm11 + addss %xmm10, %xmm1 + movss -26 * SIZE(AO), %xmm10 + mulss 24 * SIZE(BO), %xmm10 + addss %xmm10, %xmm2 + movss -25 * SIZE(AO), %xmm10 + mulss 28 * SIZE(BO), %xmm10 + addss %xmm10, %xmm3 + movss -20 * SIZE(AO), %xmm10 + + addq $ 8 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L142 + ALIGN_4 + +.L145: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L148 + ALIGN_4 + +.L146: + mulss %xmm8, %xmm9 + movss -31 * SIZE(AO), %xmm8 + addss %xmm9, %xmm0 + movss 4 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + decq %rax + jg .L146 + ALIGN_4 + +.L148: + addss %xmm1, %xmm0 + addss %xmm3, %xmm2 + addss %xmm2, %xmm0 + + movsd 0 * SIZE(CO1), %xmm8 + + pshufd $0x50, %xmm0, %xmm2 + mulps %xmm7, 
%xmm2 + addps %xmm8, %xmm2 + + movlps %xmm2, 0 * SIZE(CO1) + ALIGN_4 + +.L999: + movq %rbx, %rsp + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm3m_kernel_8x4_core2.S b/kernel/x86_64/zgemm3m_kernel_8x4_core2.S new file mode 100644 index 0000000000..2ddbb5cfbb --- /dev/null +++ b/kernel/x86_64/zgemm3m_kernel_8x4_core2.S @@ -0,0 +1,2675 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define ALPHA 0(%rsp) +#define J 16(%rsp) +#define OFFSET 24(%rsp) +#define KK 32(%rsp) +#define KKK 40(%rsp) +#define BUFFER 128(%rsp) + +#define PREFETCH_R (16 * 16 + 0) +#define PREFETCH_W (PREFETCH_R * 2) + +#define PREFETCHSIZE (16 * 21 + 8) +#define PREFETCH prefetcht0 + +#if defined(OS_LINUX) && defined(CORE_BARCELONA) + .align 32768 +#endif + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + movaps %xmm3, %xmm0 + movss OLD_ALPHA_I, %xmm1 +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif +#endif + + movq %rsp, %r15 # save old stack + subq $256 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movss %xmm0, 0 + ALPHA + movss %xmm1, 4 + ALPHA + movss %xmm0, 8 + ALPHA + movss %xmm1, 12 + ALPHA + + subq $-32 * SIZE, A + subq $-32 * SIZE, B + +#ifdef TRMMKERNEL + movsd %xmm12, OFFSET + movsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq OLD_M, M + movq OLD_N, N + + salq $ZBASE_SHIFT, LDC + + movq N, J + sarq $2, J + jle .L50 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq 32 * SIZE + BUFFER, BO + + movq K, %rax + sarq $2, %rax + jle .L05 + ALIGN_4 + +.L02: + prefetcht0 (PREFETCH_R + 0) * SIZE(B) + movaps -32 * SIZE(B), %xmm3 + movaps -28 * SIZE(B), %xmm7 + movaps -24 * SIZE(B), %xmm11 + movaps -20 * SIZE(B), %xmm15 + + prefetcht0 (PREFETCH_W + 0) * SIZE(BO) + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + prefetcht0 (PREFETCH_W + 16) * SIZE(BO) + pshufd $0x00, %xmm11, %xmm8 + pshufd $0x55, %xmm11, %xmm9 + pshufd $0xaa, %xmm11, %xmm10 + pshufd $0xff, %xmm11, %xmm11 + pshufd $0x00, %xmm15, %xmm12 + pshufd $0x55, %xmm15, %xmm13 + pshufd $0xaa, %xmm15, %xmm14 + pshufd $0xff, %xmm15, %xmm15 + + prefetcht0 (PREFETCH_W + 32) * SIZE(BO) + movaps %xmm0, -32 * SIZE(BO) + movaps %xmm1, -28 * SIZE(BO) + movaps %xmm2, -24 * SIZE(BO) + movaps %xmm3, -20 * 
SIZE(BO) + movaps %xmm4, -16 * SIZE(BO) + movaps %xmm5, -12 * SIZE(BO) + movaps %xmm6, -8 * SIZE(BO) + movaps %xmm7, -4 * SIZE(BO) + + prefetcht0 (PREFETCH_W + 48) * SIZE(BO) + movaps %xmm8, 0 * SIZE(BO) + movaps %xmm9, 4 * SIZE(BO) + movaps %xmm10, 8 * SIZE(BO) + movaps %xmm11, 12 * SIZE(BO) + movaps %xmm12, 16 * SIZE(BO) + movaps %xmm13, 20 * SIZE(BO) + movaps %xmm14, 24 * SIZE(BO) + movaps %xmm15, 28 * SIZE(BO) + + subq $-16 * SIZE, B + subq $-64 * SIZE, BO + subq $1, %rax + jne .L02 + ALIGN_4 + +.L05: + movq K, %rax + andq $3, %rax + BRANCH + jle .L10 + ALIGN_4 + +.L06: + movaps -32 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, -32 * SIZE(BO) + movaps %xmm1, -28 * SIZE(BO) + movaps %xmm2, -24 * SIZE(BO) + movaps %xmm3, -20 * SIZE(BO) + + addq $ 4 * SIZE, B + addq $16 * SIZE, BO + subq $1, %rax + jne .L06 + ALIGN_4 + +.L10: + movq B, BB + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + movq M, I + sarq $3, I + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 40 * SIZE + BUFFER, BO +#else + leaq 40 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + + pxor %xmm8, %xmm8 + movaps -32 * SIZE(AO), %xmm0 + pxor %xmm9, %xmm9 + movaps -28 * SIZE(AO), %xmm1 + pxor %xmm10, %xmm10 + movaps -40 * SIZE(BO), %xmm6 + pxor %xmm11, %xmm11 + movaps -36 * SIZE(BO), %xmm7 + + prefetcht0 (PREFETCH_R + 0) * SIZE(BB) + + prefetcht0 15 * SIZE(CO1) + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 + prefetcht0 15 * SIZE(CO2) + pxor %xmm14, %xmm14 + pxor %xmm15, %xmm15 + + prefetcht0 15 * SIZE(CO1, LDC, 2) + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + prefetcht0 15 * SIZE(CO2, LDC, 2) + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + + subq $-8 * SIZE, BB + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L15 + ALIGN_4 + +.L12: + addps %xmm2, %xmm10 + movaps -32 * SIZE(BO), %xmm2 + addps %xmm3, %xmm14 + PADDING; + movaps %xmm6, %xmm3 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + movaps -28 * SIZE(BO), %xmm4 + addps %xmm5, %xmm15 + movaps %xmm7, %xmm5 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm6, %xmm8 + movaps -24 * SIZE(BO), %xmm6 + addps %xmm3, %xmm12 + movaps %xmm2, %xmm3 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm7, %xmm9 + movaps -20 * SIZE(BO), %xmm7 + addps %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -20 * SIZE(AO), %xmm1 + + addps %xmm2, %xmm10 + movaps -16 * SIZE(BO), %xmm2 + addps %xmm3, %xmm14 + movaps %xmm6, %xmm3 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + movaps -12 * SIZE(BO), %xmm4 + addps %xmm5, %xmm15 + movaps %xmm7, %xmm5 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm6, %xmm8 + movaps -8 * SIZE(BO), %xmm6 + addps %xmm3, %xmm12 + movaps %xmm2, %xmm3 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm7, %xmm9 + movaps -4 * SIZE(BO), %xmm7 + addps %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, 
%xmm4 + movaps -16 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -12 * SIZE(AO), %xmm1 + + addps %xmm2, %xmm10 + movaps 0 * SIZE(BO), %xmm2 + addps %xmm3, %xmm14 + PADDING; + movaps %xmm6, %xmm3 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + movaps 4 * SIZE(BO), %xmm4 + addps %xmm5, %xmm15 + movaps %xmm7, %xmm5 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm6, %xmm8 + movaps 8 * SIZE(BO), %xmm6 + addps %xmm3, %xmm12 + movaps %xmm2, %xmm3 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm7, %xmm9 + movaps 12 * SIZE(BO), %xmm7 + addps %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -8 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -4 * SIZE(AO), %xmm1 + + addps %xmm2, %xmm10 + movaps 16 * SIZE(BO), %xmm2 + addps %xmm3, %xmm14 + movaps %xmm6, %xmm3 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + movaps 20 * SIZE(BO), %xmm4 + addps %xmm5, %xmm15 + movaps %xmm7, %xmm5 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm6, %xmm8 + movaps 24 * SIZE(BO), %xmm6 + addps %xmm3, %xmm12 + movaps %xmm2, %xmm3 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + subq $-32 * SIZE, AO + + addps %xmm7, %xmm9 + movaps 28 * SIZE(BO), %xmm7 + addps %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -32 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -28 * SIZE(AO), %xmm1 + + subq $-64 * SIZE, BO + subq $1, %rax + BRANCH + jg .L12 + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_4 + +.L16: + addps %xmm2, %xmm10 + movaps -32 * SIZE(BO), %xmm2 + addps %xmm3, %xmm14 + movaps %xmm6, %xmm3 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + movaps -28 * SIZE(BO), %xmm4 + addps %xmm5, %xmm15 + movaps %xmm7, %xmm5 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm6, %xmm8 + movaps -24 * SIZE(BO), %xmm6 + addps %xmm3, %xmm12 + addq $8 * SIZE, AO + movaps %xmm2, %xmm3 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm7, %xmm9 + movaps -20 * SIZE(BO), %xmm7 + addps %xmm5, %xmm13 + addq $16 * SIZE, BO + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -32 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -28 * SIZE(AO), %xmm1 + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_4 + +.L18: + movaps ALPHA, %xmm7 + + addps %xmm2, %xmm10 + addps %xmm3, %xmm14 + addps %xmm4, %xmm11 + addps %xmm5, %xmm15 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm1 + movhps 6 * SIZE(CO1), %xmm1 + movsd 8 * SIZE(CO1), %xmm2 + movhps 10 * SIZE(CO1), %xmm2 + movsd 12 * SIZE(CO1), %xmm3 + movhps 14 * SIZE(CO1), %xmm3 + + pshufd $0x50, %xmm8, %xmm4 + pshufd $0xfa, %xmm8, %xmm8 + pshufd $0x50, %xmm12, %xmm5 + pshufd $0xfa, %xmm12, %xmm12 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm12 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm8 + addps %xmm2, %xmm5 + addps %xmm3, %xmm12 + + movlps %xmm4, 0 * SIZE(CO1) + movhps %xmm4, 2 * SIZE(CO1) + movlps %xmm8, 4 * SIZE(CO1) + movhps %xmm8, 6 * SIZE(CO1) + movlps %xmm5, 8 * SIZE(CO1) + movhps %xmm5, 10 * SIZE(CO1) + movlps %xmm12, 12 * SIZE(CO1) + movhps %xmm12, 14 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhps 2 * SIZE(CO2), %xmm0 + movsd 4 * SIZE(CO2), %xmm1 + movhps 6 * SIZE(CO2), %xmm1 + movsd 8 * SIZE(CO2), %xmm2 + movhps 10 * SIZE(CO2), %xmm2 + movsd 12 * SIZE(CO2), %xmm3 + movhps 14 * SIZE(CO2), %xmm3 + + pshufd $0x50, %xmm9, %xmm4 + pshufd $0xfa, %xmm9, %xmm9 + pshufd $0x50, %xmm13, 
%xmm5 + pshufd $0xfa, %xmm13, %xmm13 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm9 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm13 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm9 + addps %xmm2, %xmm5 + addps %xmm3, %xmm13 + + movlps %xmm4, 0 * SIZE(CO2) + movhps %xmm4, 2 * SIZE(CO2) + movlps %xmm9, 4 * SIZE(CO2) + movhps %xmm9, 6 * SIZE(CO2) + movlps %xmm5, 8 * SIZE(CO2) + movhps %xmm5, 10 * SIZE(CO2) + movlps %xmm13, 12 * SIZE(CO2) + movhps %xmm13, 14 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm0 + movhps 2 * SIZE(CO1, LDC, 2), %xmm0 + movsd 4 * SIZE(CO1, LDC, 2), %xmm1 + movhps 6 * SIZE(CO1, LDC, 2), %xmm1 + movsd 8 * SIZE(CO1, LDC, 2), %xmm2 + movhps 10 * SIZE(CO1, LDC, 2), %xmm2 + movsd 12 * SIZE(CO1, LDC, 2), %xmm3 + movhps 14 * SIZE(CO1, LDC, 2), %xmm3 + + pshufd $0x50, %xmm10, %xmm4 + pshufd $0xfa, %xmm10, %xmm10 + pshufd $0x50, %xmm14, %xmm5 + pshufd $0xfa, %xmm14, %xmm14 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm10 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm14 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm10 + addps %xmm2, %xmm5 + addps %xmm3, %xmm14 + + movlps %xmm4, 0 * SIZE(CO1, LDC, 2) + movhps %xmm4, 2 * SIZE(CO1, LDC, 2) + movlps %xmm10, 4 * SIZE(CO1, LDC, 2) + movhps %xmm10, 6 * SIZE(CO1, LDC, 2) + movlps %xmm5, 8 * SIZE(CO1, LDC, 2) + movhps %xmm5, 10 * SIZE(CO1, LDC, 2) + movlps %xmm14, 12 * SIZE(CO1, LDC, 2) + movhps %xmm14, 14 * SIZE(CO1, LDC, 2) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm0 + movhps 2 * SIZE(CO2, LDC, 2), %xmm0 + movsd 4 * SIZE(CO2, LDC, 2), %xmm1 + movhps 6 * SIZE(CO2, LDC, 2), %xmm1 + movsd 8 * SIZE(CO2, LDC, 2), %xmm2 + movhps 10 * SIZE(CO2, LDC, 2), %xmm2 + movsd 12 * SIZE(CO2, LDC, 2), %xmm3 + movhps 14 * SIZE(CO2, LDC, 2), %xmm3 + + pshufd $0x50, %xmm11, %xmm4 + pshufd $0xfa, %xmm11, %xmm11 + pshufd $0x50, %xmm15, %xmm5 + pshufd $0xfa, %xmm15, %xmm15 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm11 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm15 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm11 + addps %xmm2, %xmm5 + addps %xmm3, %xmm15 + + movlps %xmm4, 0 * SIZE(CO2, LDC, 2) + movhps %xmm4, 2 * SIZE(CO2, LDC, 2) + movlps %xmm11, 4 * SIZE(CO2, LDC, 2) + movhps %xmm11, 6 * SIZE(CO2, LDC, 2) + movlps %xmm5, 8 * SIZE(CO2, LDC, 2) + movhps %xmm5, 10 * SIZE(CO2, LDC, 2) + movlps %xmm15, 12 * SIZE(CO2, LDC, 2) + movhps %xmm15, 14 * SIZE(CO2, LDC, 2) + + addq $16 * SIZE, CO1 + addq $16 * SIZE, CO2 + subq $1, I + jg .L11 + ALIGN_4 + +.L20: + testq $4, M + jle .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L25 + ALIGN_4 + +.L21: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movaps -32 * SIZE(AO), %xmm0 + movaps -32 * SIZE(BO), %xmm2 + movaps -28 * SIZE(BO), %xmm3 + movaps -24 * SIZE(BO), %xmm4 + movaps -20 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + movaps -28 * SIZE(AO), %xmm0 + 
movaps -16 * SIZE(BO), %xmm2 + movaps -12 * SIZE(BO), %xmm3 + movaps -8 * SIZE(BO), %xmm4 + movaps -4 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + movaps -24 * SIZE(AO), %xmm0 + movaps 0 * SIZE(BO), %xmm2 + movaps 4 * SIZE(BO), %xmm3 + movaps 8 * SIZE(BO), %xmm4 + movaps 12 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + movaps -20 * SIZE(AO), %xmm0 + movaps 16 * SIZE(BO), %xmm2 + movaps 20 * SIZE(BO), %xmm3 + movaps 24 * SIZE(BO), %xmm4 + movaps 28 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + subq $-16 * SIZE, AO + subq $-64 * SIZE, BO + subq $1, %rax + jg .L21 + ALIGN_4 + +.L25: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L28 + ALIGN_4 + +.L26: + movaps -32 * SIZE(AO), %xmm0 + movaps -32 * SIZE(BO), %xmm2 + movaps -28 * SIZE(BO), %xmm3 + movaps -24 * SIZE(BO), %xmm4 + movaps -20 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + addq $ 4 * SIZE, AO + addq $16 * SIZE, BO + subq $1, %rax + jg .L26 + ALIGN_4 + +.L28: + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm1 + movhps 6 * SIZE(CO1), %xmm1 + + pshufd $0x50, %xmm8, %xmm4 + pshufd $0xfa, %xmm8, %xmm8 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm8 + addps %xmm0, %xmm4 + addps %xmm1, %xmm8 + + movlps %xmm4, 0 * SIZE(CO1) + movhps %xmm4, 2 * SIZE(CO1) + movlps %xmm8, 4 * SIZE(CO1) + movhps %xmm8, 6 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhps 2 * SIZE(CO2), %xmm0 + movsd 4 * SIZE(CO2), %xmm1 + movhps 6 * SIZE(CO2), %xmm1 + + pshufd $0x50, %xmm9, %xmm4 + pshufd $0xfa, %xmm9, %xmm9 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm9 + addps %xmm0, %xmm4 + addps %xmm1, %xmm9 + + movlps %xmm4, 0 * SIZE(CO2) + movhps %xmm4, 2 * SIZE(CO2) + movlps %xmm9, 4 * SIZE(CO2) + movhps %xmm9, 6 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm0 + movhps 2 * SIZE(CO1, LDC, 2), %xmm0 + movsd 4 * SIZE(CO1, LDC, 2), %xmm1 + movhps 6 * SIZE(CO1, LDC, 2), %xmm1 + + pshufd $0x50, %xmm10, %xmm4 + pshufd $0xfa, %xmm10, %xmm10 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm10 + addps %xmm0, %xmm4 + addps %xmm1, %xmm10 + + movlps %xmm4, 0 * SIZE(CO1, LDC, 2) + movhps %xmm4, 2 * SIZE(CO1, LDC, 2) + movlps %xmm10, 4 * SIZE(CO1, LDC, 2) + movhps %xmm10, 6 * SIZE(CO1, LDC, 2) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm0 + movhps 2 * SIZE(CO2, LDC, 2), %xmm0 + movsd 4 * SIZE(CO2, LDC, 2), %xmm1 + movhps 6 * SIZE(CO2, LDC, 2), %xmm1 + + pshufd $0x50, %xmm11, %xmm4 + pshufd $0xfa, %xmm11, %xmm11 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm11 + addps %xmm0, %xmm4 + addps %xmm1, %xmm11 + + movlps %xmm4, 0 * SIZE(CO2, LDC, 2) + movhps %xmm4, 2 * SIZE(CO2, LDC, 2) + movlps %xmm11, 4 * SIZE(CO2, LDC, 2) + movhps %xmm11, 6 * SIZE(CO2, LDC, 2) + + addq $8 * SIZE, CO1 + addq $8 * SIZE, CO2 + ALIGN_4 + +.L30: + testq $2, M + jle .L40 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + 
leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 + pxor %xmm14, %xmm14 + pxor %xmm15, %xmm15 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L35 + ALIGN_4 + +.L31: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movsd -32 * SIZE(AO), %xmm0 + movsd -32 * SIZE(BO), %xmm2 + movsd -28 * SIZE(BO), %xmm3 + movsd -24 * SIZE(BO), %xmm4 + movsd -20 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + movsd -30 * SIZE(AO), %xmm0 + movsd -16 * SIZE(BO), %xmm2 + movsd -12 * SIZE(BO), %xmm3 + movsd -8 * SIZE(BO), %xmm4 + movsd -4 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + movsd -28 * SIZE(AO), %xmm0 + movsd 0 * SIZE(BO), %xmm2 + movsd 4 * SIZE(BO), %xmm3 + movsd 8 * SIZE(BO), %xmm4 + movsd 12 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + movsd -26 * SIZE(AO), %xmm0 + movsd 16 * SIZE(BO), %xmm2 + movsd 20 * SIZE(BO), %xmm3 + movsd 24 * SIZE(BO), %xmm4 + movsd 28 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + subq $ -8 * SIZE, AO + subq $-64 * SIZE, BO + subq $1, %rax + jg .L31 + ALIGN_4 + +.L35: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L38 + ALIGN_4 + +.L36: + movsd -32 * SIZE(AO), %xmm0 + movsd -32 * SIZE(BO), %xmm2 + movsd -28 * SIZE(BO), %xmm3 + movsd -24 * SIZE(BO), %xmm4 + movsd -20 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + addq $ 2 * SIZE, AO + addq $16 * SIZE, BO + subq $1, %rax + jg .L36 + ALIGN_4 + +.L38: + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + + pshufd $0x50, %xmm8, %xmm4 + mulps %xmm7, %xmm4 + addps %xmm0, %xmm4 + + movlps %xmm4, 0 * SIZE(CO1) + movhps %xmm4, 2 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhps 2 * SIZE(CO2), %xmm0 + + pshufd $0x50, %xmm9, %xmm4 + mulps %xmm7, %xmm4 + addps %xmm0, %xmm4 + + movlps %xmm4, 0 * SIZE(CO2) + movhps %xmm4, 2 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm0 + movhps 2 * SIZE(CO1, LDC, 2), %xmm0 + + pshufd $0x50, %xmm10, %xmm4 + mulps %xmm7, %xmm4 + addps %xmm0, %xmm4 + + movlps %xmm4, 0 * SIZE(CO1, LDC, 2) + movhps %xmm4, 2 * SIZE(CO1, LDC, 2) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm0 + movhps 2 * SIZE(CO2, LDC, 2), %xmm0 + + pshufd $0x50, %xmm11, %xmm4 + mulps %xmm7, %xmm4 + addps %xmm0, %xmm4 + + movlps %xmm4, 0 * SIZE(CO2, LDC, 2) + movhps %xmm4, 2 * SIZE(CO2, LDC, 2) + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + ALIGN_4 
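Note on the write-back sequences above (the pshufd $0x50 / pshufd $0xfa, mulps ALPHA, addps blocks in .L18, .L28 and .L38): the 3m kernel accumulates a real-valued partial product, while ALPHA is stored in memory as {alpha_r, alpha_i, alpha_r, alpha_i}; duplicating each accumulator element into adjacent lanes therefore turns one mulps/addps pair into "complex alpha times real scalar" updates of the interleaved C array. The short C sketch below is only illustrative and is not part of the imported sources; the complex_float type and the function name are hypothetical.

/* Minimal sketch of the arithmetic done by pshufd $0x50 / mulps ALPHA /
   addps in the write-back blocks: C[i] += alpha * r[i], alpha complex,
   r[i] real.  Illustrative only, not part of the patch. */
#include <stddef.h>

typedef struct { float re, im; } complex_float;   /* hypothetical helper type */

static void gemm3m_real_writeback(complex_float *c, const float *r,
                                  float alpha_r, float alpha_i, size_t n)
{
    /* ALPHA is kept as {alpha_r, alpha_i, alpha_r, alpha_i} in the kernel,
       and pshufd $0x50 expands {r0,r1,r2,r3} to {r0,r0,r1,r1}, so each
       mulps/addps pair updates two interleaved complex C elements at once. */
    for (size_t i = 0; i < n; i++) {
        c[i].re += alpha_r * r[i];
        c[i].im += alpha_i * r[i];
    }
}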
+ +.L40: + testq $1, M + jle .L49 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO + leaq (BO, %rax, 8), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 + pxor %xmm14, %xmm14 + pxor %xmm15, %xmm15 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L45 + ALIGN_4 + +.L41: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movss -32 * SIZE(AO), %xmm0 + movss -32 * SIZE(BO), %xmm2 + movss -28 * SIZE(BO), %xmm3 + movss -24 * SIZE(BO), %xmm4 + movss -20 * SIZE(BO), %xmm5 + + mulss %xmm0, %xmm2 + mulss %xmm0, %xmm3 + mulss %xmm0, %xmm4 + mulss %xmm0, %xmm5 + + addss %xmm2, %xmm8 + addss %xmm3, %xmm9 + addss %xmm4, %xmm10 + addss %xmm5, %xmm11 + + movss -31 * SIZE(AO), %xmm0 + movss -16 * SIZE(BO), %xmm2 + movss -12 * SIZE(BO), %xmm3 + movss -8 * SIZE(BO), %xmm4 + movss -4 * SIZE(BO), %xmm5 + + mulss %xmm0, %xmm2 + mulss %xmm0, %xmm3 + mulss %xmm0, %xmm4 + mulss %xmm0, %xmm5 + + addss %xmm2, %xmm8 + addss %xmm3, %xmm9 + addss %xmm4, %xmm10 + addss %xmm5, %xmm11 + + movss -30 * SIZE(AO), %xmm0 + movss 0 * SIZE(BO), %xmm2 + movss 4 * SIZE(BO), %xmm3 + movss 8 * SIZE(BO), %xmm4 + movss 12 * SIZE(BO), %xmm5 + + mulss %xmm0, %xmm2 + mulss %xmm0, %xmm3 + mulss %xmm0, %xmm4 + mulss %xmm0, %xmm5 + + addss %xmm2, %xmm8 + addss %xmm3, %xmm9 + addss %xmm4, %xmm10 + addss %xmm5, %xmm11 + + movss -29 * SIZE(AO), %xmm0 + movss 16 * SIZE(BO), %xmm2 + movss 20 * SIZE(BO), %xmm3 + movss 24 * SIZE(BO), %xmm4 + movss 28 * SIZE(BO), %xmm5 + + mulss %xmm0, %xmm2 + mulss %xmm0, %xmm3 + mulss %xmm0, %xmm4 + mulss %xmm0, %xmm5 + + addss %xmm2, %xmm8 + addss %xmm3, %xmm9 + addss %xmm4, %xmm10 + addss %xmm5, %xmm11 + + subq $ -4 * SIZE, AO + subq $-64 * SIZE, BO + subq $1, %rax + jg .L41 + ALIGN_4 + +.L45: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L48 + ALIGN_4 + +.L46: + movss -32 * SIZE(AO), %xmm0 + movss -32 * SIZE(BO), %xmm2 + movss -28 * SIZE(BO), %xmm3 + movss -24 * SIZE(BO), %xmm4 + movss -20 * SIZE(BO), %xmm5 + + mulss %xmm0, %xmm2 + mulss %xmm0, %xmm3 + mulss %xmm0, %xmm4 + mulss %xmm0, %xmm5 + + addss %xmm2, %xmm8 + addss %xmm3, %xmm9 + addss %xmm4, %xmm10 + addss %xmm5, %xmm11 + + addq $ 1 * SIZE, AO + addq $16 * SIZE, BO + subq $1, %rax + jg .L46 + ALIGN_4 + +.L48: + movsd 0 * SIZE(CO1), %xmm0 + + pshufd $0x50, %xmm8, %xmm4 + mulps %xmm7, %xmm4 + addps %xmm0, %xmm4 + + movlps %xmm4, 0 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + + pshufd $0x50, %xmm9, %xmm4 + mulps %xmm7, %xmm4 + addps %xmm0, %xmm4 + + movlps %xmm4, 0 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm0 + + pshufd $0x50, %xmm10, %xmm4 + mulps %xmm7, %xmm4 + addps %xmm0, %xmm4 + + movlps %xmm4, 0 * SIZE(CO1, LDC, 2) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm0 + + pshufd $0x50, %xmm11, %xmm4 + mulps %xmm7, %xmm4 + addps %xmm0, %xmm4 + + movlps %xmm4, 0 * SIZE(CO2, LDC, 2) + ALIGN_4 + +.L49: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + + leaq (C, LDC, 4), 
C + subq $1, J + jg .L01 + ALIGN_4 + +.L50: + testq $2, N + jle .L100 + ALIGN_4 + +.L51: +/* Copying to Sub Buffer */ + leaq BUFFER, BO + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq K, %rax + sarq $3, %rax + jle .L53 + + addq %rax, %rax + ALIGN_4 + +.L52: + movaps -32 * SIZE(B), %xmm3 + movaps -28 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + prefetcht0 (PREFETCH_W + 0) * SIZE(BO) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + prefetcht0 (PREFETCH_W + 16) * SIZE(BO) + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO + + subq $1, %rax + jne .L52 + ALIGN_4 + +.L53: + movq K, %rax + andq $7, %rax + BRANCH + jle .L55 + ALIGN_4 + +.L54: + movss -32 * SIZE(B), %xmm8 + movss -31 * SIZE(B), %xmm9 + + shufps $0, %xmm8, %xmm8 + shufps $0, %xmm9, %xmm9 + + movaps %xmm8, 0 * SIZE(BO) + movaps %xmm9, 4 * SIZE(BO) + + addq $2 * SIZE, B + addq $8 * SIZE, BO + subq $1, %rax + jne .L54 + ALIGN_4 + +.L55: + movq C, CO1 + leaq (C, LDC, 1), CO2 + movq A, AO # aoffset = a + + movq M, I + sarq $3, I + jle .L70 + ALIGN_4 + +.L60: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + + prefetcht0 15 * SIZE(CO1) + pxor %xmm12, %xmm12 + prefetcht0 15 * SIZE(CO2) + pxor %xmm13, %xmm13 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L65 + ALIGN_4 + +.L61: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm1 + + movaps -32 * SIZE(BO), %xmm2 + movaps %xmm2, %xmm3 + movaps -28 * SIZE(BO), %xmm4 + movaps %xmm4, %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm12 + addps %xmm4, %xmm9 + addps %xmm5, %xmm13 + + movaps -24 * SIZE(AO), %xmm0 + movaps -20 * SIZE(AO), %xmm1 + + movaps -24 * SIZE(BO), %xmm2 + movaps %xmm2, %xmm3 + movaps -20 * SIZE(BO), %xmm4 + movaps %xmm4, %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm12 + addps %xmm4, %xmm9 + addps %xmm5, %xmm13 + + movaps -16 * SIZE(AO), %xmm0 + movaps -12 * SIZE(AO), %xmm1 + + movaps -16 * SIZE(BO), %xmm2 + movaps %xmm2, %xmm3 + movaps -12 * SIZE(BO), %xmm4 + movaps %xmm4, %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm12 + addps %xmm4, %xmm9 + addps %xmm5, %xmm13 + + movaps -8 * SIZE(AO), %xmm0 + movaps -4 * SIZE(AO), %xmm1 + + movaps -8 * SIZE(BO), %xmm2 + movaps %xmm2, %xmm3 + movaps -4 * SIZE(BO), %xmm4 + movaps %xmm4, %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 
+ mulps %xmm0, %xmm4 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm12 + addps %xmm4, %xmm9 + addps %xmm5, %xmm13 + + subq $-32 * SIZE, AO + subq $-32 * SIZE, BO + subq $1, %rax + jg .L61 + ALIGN_4 + +.L65: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L68 + ALIGN_4 + +.L66: + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm1 + + movaps -32 * SIZE(BO), %xmm2 + movaps %xmm2, %xmm3 + movaps -28 * SIZE(BO), %xmm4 + movaps %xmm4, %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm12 + addps %xmm4, %xmm9 + addps %xmm5, %xmm13 + + addq $8 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + jg .L66 + ALIGN_4 + +.L68: + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm1 + movhps 6 * SIZE(CO1), %xmm1 + movsd 8 * SIZE(CO1), %xmm2 + movhps 10 * SIZE(CO1), %xmm2 + movsd 12 * SIZE(CO1), %xmm3 + movhps 14 * SIZE(CO1), %xmm3 + + pshufd $0x50, %xmm8, %xmm4 + pshufd $0xfa, %xmm8, %xmm8 + pshufd $0x50, %xmm12, %xmm5 + pshufd $0xfa, %xmm12, %xmm12 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm12 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm8 + addps %xmm2, %xmm5 + addps %xmm3, %xmm12 + + movlps %xmm4, 0 * SIZE(CO1) + movhps %xmm4, 2 * SIZE(CO1) + movlps %xmm8, 4 * SIZE(CO1) + movhps %xmm8, 6 * SIZE(CO1) + movlps %xmm5, 8 * SIZE(CO1) + movhps %xmm5, 10 * SIZE(CO1) + movlps %xmm12, 12 * SIZE(CO1) + movhps %xmm12, 14 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhps 2 * SIZE(CO2), %xmm0 + movsd 4 * SIZE(CO2), %xmm1 + movhps 6 * SIZE(CO2), %xmm1 + movsd 8 * SIZE(CO2), %xmm2 + movhps 10 * SIZE(CO2), %xmm2 + movsd 12 * SIZE(CO2), %xmm3 + movhps 14 * SIZE(CO2), %xmm3 + + pshufd $0x50, %xmm9, %xmm4 + pshufd $0xfa, %xmm9, %xmm9 + pshufd $0x50, %xmm13, %xmm5 + pshufd $0xfa, %xmm13, %xmm13 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm9 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm13 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm9 + addps %xmm2, %xmm5 + addps %xmm3, %xmm13 + + movlps %xmm4, 0 * SIZE(CO2) + movhps %xmm4, 2 * SIZE(CO2) + movlps %xmm9, 4 * SIZE(CO2) + movhps %xmm9, 6 * SIZE(CO2) + movlps %xmm5, 8 * SIZE(CO2) + movhps %xmm5, 10 * SIZE(CO2) + movlps %xmm13, 12 * SIZE(CO2) + movhps %xmm13, 14 * SIZE(CO2) + + addq $16 * SIZE, CO1 + addq $16 * SIZE, CO2 + subq $1, I + jg .L60 + ALIGN_4 + +.L70: + testq $4, M + jle .L80 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L75 + ALIGN_4 + +.L71: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm1 + movaps -32 * SIZE(BO), %xmm2 + movaps -28 * SIZE(BO), %xmm3 + movaps -24 * SIZE(BO), %xmm4 + movaps -20 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm4 + mulps %xmm1, %xmm5 + + addps %xmm2, 
%xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + movaps -24 * SIZE(AO), %xmm0 + movaps -20 * SIZE(AO), %xmm1 + movaps -16 * SIZE(BO), %xmm2 + movaps -12 * SIZE(BO), %xmm3 + movaps -8 * SIZE(BO), %xmm4 + movaps -4 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm4 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + subq $-16 * SIZE, AO + subq $-32 * SIZE, BO + subq $1, %rax + jg .L71 + ALIGN_4 + +.L75: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L78 + ALIGN_4 + +.L76: + movaps -32 * SIZE(AO), %xmm0 + movaps -32 * SIZE(BO), %xmm2 + movaps -28 * SIZE(BO), %xmm3 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + + addq $4 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + jg .L76 + ALIGN_4 + +.L78: + addps %xmm10, %xmm8 + addps %xmm11, %xmm9 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm1 + movhps 6 * SIZE(CO1), %xmm1 + + pshufd $0x50, %xmm8, %xmm4 + pshufd $0xfa, %xmm8, %xmm8 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm8 + addps %xmm0, %xmm4 + addps %xmm1, %xmm8 + + movlps %xmm4, 0 * SIZE(CO1) + movhps %xmm4, 2 * SIZE(CO1) + movlps %xmm8, 4 * SIZE(CO1) + movhps %xmm8, 6 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhps 2 * SIZE(CO2), %xmm0 + movsd 4 * SIZE(CO2), %xmm1 + movhps 6 * SIZE(CO2), %xmm1 + + pshufd $0x50, %xmm9, %xmm4 + pshufd $0xfa, %xmm9, %xmm9 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm9 + addps %xmm0, %xmm4 + addps %xmm1, %xmm9 + + movlps %xmm4, 0 * SIZE(CO2) + movhps %xmm4, 2 * SIZE(CO2) + movlps %xmm9, 4 * SIZE(CO2) + movhps %xmm9, 6 * SIZE(CO2) + + addq $8 * SIZE, CO1 + addq $8 * SIZE, CO2 + ALIGN_4 + +.L80: + testq $2, M + jle .L90 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L85 + ALIGN_4 + +.L81: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movsd -32 * SIZE(AO), %xmm0 + movsd -30 * SIZE(AO), %xmm1 + movsd -32 * SIZE(BO), %xmm2 + movsd -28 * SIZE(BO), %xmm3 + movsd -24 * SIZE(BO), %xmm4 + movsd -20 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm4 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + movsd -28 * SIZE(AO), %xmm0 + movsd -26 * SIZE(AO), %xmm1 + movsd -16 * SIZE(BO), %xmm2 + movsd -12 * SIZE(BO), %xmm3 + movsd -8 * SIZE(BO), %xmm4 + movsd -4 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm4 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + addps %xmm4, %xmm10 + addps %xmm5, %xmm11 + + subq $ -8 * SIZE, AO + subq $-32 * SIZE, BO + subq $1, %rax + jg .L81 + ALIGN_4 + +.L85: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + 
je .L88 + ALIGN_4 + +.L86: + movsd -32 * SIZE(AO), %xmm0 + movsd -32 * SIZE(BO), %xmm2 + movsd -28 * SIZE(BO), %xmm3 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + jg .L86 + ALIGN_4 + +.L88: + addps %xmm10, %xmm8 + addps %xmm11, %xmm9 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + + pshufd $0x50, %xmm8, %xmm4 + + mulps %xmm7, %xmm4 + addps %xmm0, %xmm4 + + movlps %xmm4, 0 * SIZE(CO1) + movhps %xmm4, 2 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhps 2 * SIZE(CO2), %xmm0 + + pshufd $0x50, %xmm9, %xmm4 + + mulps %xmm7, %xmm4 + addps %xmm0, %xmm4 + + movlps %xmm4, 0 * SIZE(CO2) + movhps %xmm4, 2 * SIZE(CO2) + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + ALIGN_4 + +.L90: + testq $1, M + jle .L99 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L95 + ALIGN_4 + +.L91: + + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movss -32 * SIZE(AO), %xmm0 + movss -31 * SIZE(AO), %xmm1 + movss -32 * SIZE(BO), %xmm2 + movss -28 * SIZE(BO), %xmm3 + movss -24 * SIZE(BO), %xmm4 + movss -20 * SIZE(BO), %xmm5 + + mulss %xmm0, %xmm2 + mulss %xmm0, %xmm3 + mulss %xmm1, %xmm4 + mulss %xmm1, %xmm5 + + addss %xmm2, %xmm8 + addss %xmm3, %xmm9 + addss %xmm4, %xmm10 + addss %xmm5, %xmm11 + + movss -30 * SIZE(AO), %xmm0 + movss -29 * SIZE(AO), %xmm1 + movss -16 * SIZE(BO), %xmm2 + movss -12 * SIZE(BO), %xmm3 + movss -8 * SIZE(BO), %xmm4 + movss -4 * SIZE(BO), %xmm5 + + mulss %xmm0, %xmm2 + mulss %xmm0, %xmm3 + mulss %xmm1, %xmm4 + mulss %xmm1, %xmm5 + + addss %xmm2, %xmm8 + addss %xmm3, %xmm9 + addss %xmm4, %xmm10 + addss %xmm5, %xmm11 + + subq $ -4 * SIZE, AO + subq $-32 * SIZE, BO + subq $1, %rax + jg .L91 + ALIGN_4 + +.L95: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L98 + ALIGN_4 + +.L96: + movss -32 * SIZE(AO), %xmm0 + movss -32 * SIZE(BO), %xmm2 + movss -28 * SIZE(BO), %xmm3 + + mulss %xmm0, %xmm2 + mulss %xmm0, %xmm3 + + addss %xmm2, %xmm8 + addss %xmm3, %xmm9 + + addq $1 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + jg .L96 + ALIGN_4 + +.L98: + addss %xmm10, %xmm8 + addss %xmm11, %xmm9 + + movsd 0 * SIZE(CO1), %xmm0 + + pshufd $0x50, %xmm8, %xmm4 + + mulps %xmm7, %xmm4 + addps %xmm0, %xmm4 + + movlps %xmm4, 0 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + + pshufd $0x50, %xmm9, %xmm4 + + mulps %xmm7, %xmm4 + addps %xmm0, %xmm4 + + movlps %xmm4, 0 * SIZE(CO2) + ALIGN_4 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leaq (C, LDC, 2), C + ALIGN_4 + + + +.L100: + testq $1, N + jle .L999 + ALIGN_4 + +.L101: +/* Copying to Sub Buffer */ + leaq BUFFER, BO + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq K, %rax + sarq $4, %rax + jle .L103 + + addq %rax, %rax + ALIGN_4 + +.L102: + movss -32 * 
SIZE(B), %xmm0 + movss -31 * SIZE(B), %xmm1 + movss -30 * SIZE(B), %xmm2 + movss -29 * SIZE(B), %xmm3 + movss -28 * SIZE(B), %xmm4 + movss -27 * SIZE(B), %xmm5 + movss -26 * SIZE(B), %xmm6 + movss -25 * SIZE(B), %xmm7 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + shufps $0, %xmm4, %xmm4 + shufps $0, %xmm5, %xmm5 + shufps $0, %xmm6, %xmm6 + shufps $0, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + addq $ 8 * SIZE, B + subq $-32 * SIZE, BO + subq $1, %rax + jne .L102 + ALIGN_4 + +.L103: + movq K, %rax + andq $15, %rax + BRANCH + jle .L105 + ALIGN_4 + +.L104: + movss -32 * SIZE(B), %xmm8 + + shufps $0, %xmm8, %xmm8 + + movaps %xmm8, 0 * SIZE(BO) + + addq $1 * SIZE, B + addq $4 * SIZE, BO + subq $1, %rax + jne .L104 + ALIGN_4 + +.L105: + movq C, CO1 + movq A, AO + + movq M, I + sarq $3, I + jle .L120 + ALIGN_4 + +.L110: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + + prefetcht0 15 * SIZE(CO1) + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L115 + ALIGN_4 + +.L111: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm1 + movaps -32 * SIZE(BO), %xmm2 + movaps %xmm2, %xmm3 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + addps %xmm2, %xmm8 + addps %xmm3, %xmm12 + + movaps -24 * SIZE(AO), %xmm0 + movaps -20 * SIZE(AO), %xmm1 + movaps -28 * SIZE(BO), %xmm2 + movaps %xmm2, %xmm3 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + addps %xmm2, %xmm9 + addps %xmm3, %xmm13 + + movaps -16 * SIZE(AO), %xmm0 + movaps -12 * SIZE(AO), %xmm1 + movaps -24 * SIZE(BO), %xmm2 + movaps %xmm2, %xmm3 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + addps %xmm2, %xmm8 + addps %xmm3, %xmm12 + + movaps -8 * SIZE(AO), %xmm0 + movaps -4 * SIZE(AO), %xmm1 + movaps -20 * SIZE(BO), %xmm2 + movaps %xmm2, %xmm3 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + addps %xmm2, %xmm9 + addps %xmm3, %xmm13 + + subq $-32 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jg .L111 + ALIGN_4 + +.L115: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L118 + ALIGN_4 + +.L116: + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm1 + + movaps -32 * SIZE(BO), %xmm2 + movaps %xmm2, %xmm3 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + addps %xmm2, %xmm8 + addps %xmm3, %xmm12 + + addq $8 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L116 + ALIGN_4 + +.L118: + addps %xmm9, %xmm8 + addps %xmm13, %xmm12 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm1 + movhps 6 * SIZE(CO1), %xmm1 + movsd 8 * SIZE(CO1), %xmm2 + movhps 10 * SIZE(CO1), %xmm2 + movsd 12 * SIZE(CO1), %xmm3 + movhps 14 * SIZE(CO1), %xmm3 + + 
pshufd $0x50, %xmm8, %xmm4 + pshufd $0xfa, %xmm8, %xmm8 + pshufd $0x50, %xmm12, %xmm5 + pshufd $0xfa, %xmm12, %xmm12 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm12 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm8 + addps %xmm2, %xmm5 + addps %xmm3, %xmm12 + + movlps %xmm4, 0 * SIZE(CO1) + movhps %xmm4, 2 * SIZE(CO1) + movlps %xmm8, 4 * SIZE(CO1) + movhps %xmm8, 6 * SIZE(CO1) + movlps %xmm5, 8 * SIZE(CO1) + movhps %xmm5, 10 * SIZE(CO1) + movlps %xmm12, 12 * SIZE(CO1) + movhps %xmm12, 14 * SIZE(CO1) + + addq $16 * SIZE, CO1 + subq $1, I + jg .L110 + ALIGN_4 + +.L120: + testq $4, M + jle .L130 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L125 + ALIGN_4 + +.L121: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm1 + movaps -32 * SIZE(BO), %xmm2 + movaps -28 * SIZE(BO), %xmm3 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + + movaps -24 * SIZE(AO), %xmm0 + movaps -20 * SIZE(AO), %xmm1 + movaps -24 * SIZE(BO), %xmm2 + movaps -20 * SIZE(BO), %xmm3 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm2, %xmm10 + addps %xmm3, %xmm11 + + subq $-16 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jg .L121 + ALIGN_4 + +.L125: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L128 + ALIGN_4 + +.L126: + movaps -32 * SIZE(AO), %xmm0 + movaps -32 * SIZE(BO), %xmm2 + + mulps %xmm0, %xmm2 + addps %xmm2, %xmm8 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L126 + ALIGN_4 + +.L128: + addps %xmm10, %xmm8 + addps %xmm11, %xmm9 + + addps %xmm9, %xmm8 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm1 + movhps 6 * SIZE(CO1), %xmm1 + + pshufd $0x50, %xmm8, %xmm4 + pshufd $0xfa, %xmm8, %xmm8 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm8 + addps %xmm0, %xmm4 + addps %xmm1, %xmm8 + + movlps %xmm4, 0 * SIZE(CO1) + movhps %xmm4, 2 * SIZE(CO1) + movlps %xmm8, 4 * SIZE(CO1) + movhps %xmm8, 6 * SIZE(CO1) + + addq $8 * SIZE, CO1 + ALIGN_4 + +.L130: + testq $2, M + jle .L140 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L135 + ALIGN_4 + +.L131: + 
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movsd -32 * SIZE(AO), %xmm0 + movsd -30 * SIZE(AO), %xmm1 + movsd -32 * SIZE(BO), %xmm2 + movsd -28 * SIZE(BO), %xmm3 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + addps %xmm2, %xmm8 + addps %xmm3, %xmm9 + + movsd -28 * SIZE(AO), %xmm0 + movsd -26 * SIZE(AO), %xmm1 + movsd -24 * SIZE(BO), %xmm2 + movsd -20 * SIZE(BO), %xmm3 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + addps %xmm2, %xmm10 + addps %xmm3, %xmm11 + + subq $ -8 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jg .L131 + ALIGN_4 + +.L135: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L138 + ALIGN_4 + +.L136: + movsd -32 * SIZE(AO), %xmm0 + movsd -32 * SIZE(BO), %xmm2 + + mulps %xmm0, %xmm2 + addps %xmm2, %xmm8 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L136 + ALIGN_4 + +.L138: + addps %xmm10, %xmm8 + addps %xmm11, %xmm9 + addps %xmm9, %xmm8 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + + pshufd $0x50, %xmm8, %xmm4 + + mulps %xmm7, %xmm4 + addps %xmm0, %xmm4 + + movlps %xmm4, 0 * SIZE(CO1) + movhps %xmm4, 2 * SIZE(CO1) + + addq $4 * SIZE, CO1 + ALIGN_4 + +.L140: + testq $1, M + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L145 + ALIGN_4 + +.L141: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movss -32 * SIZE(AO), %xmm0 + movss -31 * SIZE(AO), %xmm1 + movss -32 * SIZE(BO), %xmm2 + movss -28 * SIZE(BO), %xmm3 + + mulss %xmm0, %xmm2 + mulss %xmm1, %xmm3 + addss %xmm2, %xmm8 + addss %xmm3, %xmm9 + + movss -30 * SIZE(AO), %xmm0 + movss -29 * SIZE(AO), %xmm1 + movss -24 * SIZE(BO), %xmm2 + movss -20 * SIZE(BO), %xmm3 + + mulss %xmm0, %xmm2 + mulss %xmm1, %xmm3 + addss %xmm2, %xmm10 + addss %xmm3, %xmm11 + + subq $ -4 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jg .L141 + ALIGN_4 + +.L145: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L148 + ALIGN_4 + +.L146: + movss -32 * SIZE(AO), %xmm0 + movss -32 * SIZE(BO), %xmm2 + + mulss %xmm0, %xmm2 + addss %xmm2, %xmm8 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L146 + ALIGN_4 + +.L148: + addss %xmm10, %xmm8 + addss %xmm11, %xmm9 + addss %xmm9, %xmm8 + + movsd 0 * SIZE(CO1), %xmm0 + + pshufd $0x50, %xmm8, %xmm4 + + mulps %xmm7, %xmm4 + addps %xmm0, %xmm4 + + movlps %xmm4, 0 * SIZE(CO1) + ALIGN_4 + +.L999: + movq %r15, %rsp + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 
208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm3m_kernel_8x4_penryn.S b/kernel/x86_64/zgemm3m_kernel_8x4_penryn.S new file mode 100644 index 0000000000..bf2d96e52e --- /dev/null +++ b/kernel/x86_64/zgemm3m_kernel_8x4_penryn.S @@ -0,0 +1,2593 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define BB %r12 + +#define PREA %rdx + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define J 64(%rsp) +#define OFFSET 72(%rsp) +#define KK 80(%rsp) +#define KKK 88(%rsp) + +#else + +#define STACKSIZE 512 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#define ALPHA_R 224(%rsp) +#define ALPHA_I 232(%rsp) +#define J 240(%rsp) +#define OFFSET 248(%rsp) +#define KK 256(%rsp) +#define KKK 264(%rsp) + +#endif + +#define PREFETCHSIZE (8 * 17 + 4) +#define PREFETCH prefetcht0 + +#if defined(OS_LINUX) && defined(CORE_BARCELONA) + .align 32768 +#endif + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movq OLD_OFFSET, %r11 +#endif + movaps %xmm3, %xmm0 + movss OLD_ALPHA_I, %xmm1 +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movq OLD_OFFSET, %r11 +#endif +#endif + + unpcklps %xmm1, %xmm0 + + movlps %xmm0, ALPHA_R + movlps %xmm0, ALPHA_I + + subq $-32 * SIZE, A + subq $-32 * SIZE, B + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + salq $ZBASE_SHIFT, LDC + + movq N, J + sarq $2, J + NOBRANCH + jle .L50 + ALIGN_4 + +.L10: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 + movq A, AO + + movq K, %rax + salq $BASE_SHIFT + 2, %rax + leaq (B, %rax), BB + + movq M, I + sarq $3, I + NOBRANCH + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 8), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + xorpd %xmm3, %xmm3 + movaps -28 * SIZE(AO), %xmm1 + xorpd %xmm4, %xmm4 + movaps -32 * SIZE(BO), %xmm2 + + xorpd %xmm5, %xmm5 + prefetcht0 -32 * SIZE(BB) + xorpd %xmm6, %xmm6 + + prefetcht2 7 * SIZE(CO1) + movapd %xmm4, %xmm8 + movapd %xmm4, %xmm9 + prefetcht2 7 * SIZE(CO2) + movapd %xmm4, %xmm10 + movapd %xmm4, %xmm11 + + prefetcht2 7 * SIZE(CO1, LDC, 2) + movapd %xmm4, %xmm12 + movaps %xmm4, %xmm13 + prefetcht2 7 * SIZE(CO2, LDC, 2) + movaps %xmm4, %xmm14 + movaps %xmm4, %xmm15 + + subq $-24 * SIZE, BB + + leaq (PREFETCHSIZE + 0) * SIZE(AO), PREA + +#ifndef TRMMKERNEL + movq K, %rax +#elif 
(defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + PREFETCH -32 * SIZE(PREA) + addps %xmm6, %xmm10 + addps %xmm3, %xmm14 + movaps %xmm2, %xmm3 + pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + addps %xmm5, %xmm15 + movaps %xmm7, %xmm5 + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + movaps -28 * SIZE(BO), %xmm2 + addps %xmm3, %xmm12 + movaps %xmm6, %xmm3 + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm7, %xmm9 + addps %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -20 * SIZE(AO), %xmm1 + + addps %xmm6, %xmm10 + addps %xmm3, %xmm14 + movaps %xmm2, %xmm3 + pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + addps %xmm5, %xmm15 + movaps %xmm7, %xmm5 + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + movaps -24 * SIZE(BO), %xmm2 + addps %xmm3, %xmm12 + movaps %xmm6, %xmm3 + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm7, %xmm9 + addps %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -16 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -12 * SIZE(AO), %xmm1 + + addps %xmm6, %xmm10 + addps %xmm3, %xmm14 + movaps %xmm2, %xmm3 + pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + addps %xmm5, %xmm15 + PREFETCH -16 * SIZE(PREA) + movaps %xmm7, %xmm5 + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + movaps -20 * SIZE(BO), %xmm2 + addps %xmm3, %xmm12 + movaps %xmm6, %xmm3 + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm7, %xmm9 + addps %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -8 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -4 * SIZE(AO), %xmm1 + + addps %xmm6, %xmm10 + addps %xmm3, %xmm14 + movaps %xmm2, %xmm3 + pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + addps %xmm5, %xmm15 + movaps %xmm7, %xmm5 + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + movaps -16 * SIZE(BO), %xmm2 + addps %xmm3, %xmm12 + movaps %xmm6, %xmm3 + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm7, %xmm9 + addps %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps 0 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps 4 * SIZE(AO), %xmm1 + + addps %xmm6, %xmm10 + addps %xmm3, %xmm14 + PREFETCH 0 * SIZE(PREA) + movaps %xmm2, %xmm3 + pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + addps %xmm5, %xmm15 + movaps %xmm7, %xmm5 + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + movaps -12 * SIZE(BO), %xmm2 + addps %xmm3, %xmm12 + movaps %xmm6, %xmm3 + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm7, %xmm9 + addps %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps 8 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps 12 * SIZE(AO), %xmm1 + + addps %xmm6, %xmm10 + addps %xmm3, %xmm14 + movaps %xmm2, %xmm3 + pshufd $0x39, %xmm2, %xmm7 + mulps 
%xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + addps %xmm5, %xmm15 + movaps %xmm7, %xmm5 + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + movaps -8 * SIZE(BO), %xmm2 + addps %xmm3, %xmm12 + movaps %xmm6, %xmm3 + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm7, %xmm9 + addps %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps 16 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps 20 * SIZE(AO), %xmm1 + + addps %xmm6, %xmm10 + addps %xmm3, %xmm14 + PREFETCH 16 * SIZE(PREA) + movaps %xmm2, %xmm3 + pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + addps %xmm5, %xmm15 + movaps %xmm7, %xmm5 + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + movaps -4 * SIZE(BO), %xmm2 + addps %xmm3, %xmm12 + movaps %xmm6, %xmm3 + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm7, %xmm9 + addps %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps 24 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps 28 * SIZE(AO), %xmm1 + + addps %xmm6, %xmm10 + addps %xmm3, %xmm14 + movaps %xmm2, %xmm3 + pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + addps %xmm5, %xmm15 + movaps %xmm7, %xmm5 + subq $-64 * SIZE, AO + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + movaps 0 * SIZE(BO), %xmm2 + addps %xmm3, %xmm12 + movaps %xmm6, %xmm3 + subq $-32 * SIZE, BO + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm7, %xmm9 + addps %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -32 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -28 * SIZE(AO), %xmm1 + + subq $-64 * SIZE, PREA + + subq $1, %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: + prefetcht0 -16 * SIZE(BB) + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + addps %xmm6, %xmm10 + addps %xmm3, %xmm14 + movaps %xmm2, %xmm3 + pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm4, %xmm11 + addps %xmm5, %xmm15 + movaps %xmm7, %xmm5 + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + movaps -28 * SIZE(BO), %xmm2 + addps %xmm3, %xmm12 + movaps %xmm6, %xmm3 + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + addps %xmm7, %xmm9 + addps %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -20 * SIZE(AO), %xmm1 + + addq $8 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_3 + +.L18: + movups ALPHA_R, %xmm7 + + addps %xmm6, %xmm10 + addps %xmm3, %xmm14 + addps %xmm4, %xmm11 + addps %xmm5, %xmm15 + + movaps %xmm9, %xmm4 + shufps $0xd8, %xmm8, %xmm9 + shufps $0xd8, %xmm11, %xmm8 + shufps $0xd8, %xmm10, %xmm11 + shufps $0xd8, %xmm4, %xmm10 + + movaps %xmm8, %xmm4 + shufps $0xd8, %xmm10, %xmm8 + shufps $0xd8, %xmm4, %xmm10 + movaps %xmm9, %xmm5 + shufps $0xd8, %xmm11, %xmm9 + shufps $0xd8, %xmm5, %xmm11 + + movaps %xmm13, %xmm4 + shufps $0xd8, %xmm12, %xmm13 + shufps $0xd8, %xmm15, %xmm12 + shufps $0xd8, %xmm14, %xmm15 + shufps $0xd8, %xmm4, %xmm14 + + movaps %xmm12, %xmm4 + shufps $0xd8, %xmm14, %xmm12 + shufps $0xd8, %xmm4, %xmm14 + movaps %xmm13, %xmm5 + shufps $0xd8, %xmm15, %xmm13 + shufps $0xd8, %xmm5, %xmm15 + + movsd 0 * SIZE(CO1), %xmm0 + 
movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm1 + movhps 6 * SIZE(CO1), %xmm1 + movsd 8 * SIZE(CO1), %xmm2 + movhps 10 * SIZE(CO1), %xmm2 + movsd 12 * SIZE(CO1), %xmm3 + movhps 14 * SIZE(CO1), %xmm3 + + pshufd $0x50, %xmm8, %xmm4 + pshufd $0xfa, %xmm8, %xmm8 + pshufd $0x50, %xmm12, %xmm5 + pshufd $0xfa, %xmm12, %xmm12 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm12 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm8 + addps %xmm2, %xmm5 + addps %xmm3, %xmm12 + + movlps %xmm4, 0 * SIZE(CO1) + movhps %xmm4, 2 * SIZE(CO1) + movlps %xmm8, 4 * SIZE(CO1) + movhps %xmm8, 6 * SIZE(CO1) + movlps %xmm5, 8 * SIZE(CO1) + movhps %xmm5, 10 * SIZE(CO1) + movlps %xmm12, 12 * SIZE(CO1) + movhps %xmm12, 14 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhps 2 * SIZE(CO2), %xmm0 + movsd 4 * SIZE(CO2), %xmm1 + movhps 6 * SIZE(CO2), %xmm1 + movsd 8 * SIZE(CO2), %xmm2 + movhps 10 * SIZE(CO2), %xmm2 + movsd 12 * SIZE(CO2), %xmm3 + movhps 14 * SIZE(CO2), %xmm3 + + pshufd $0x50, %xmm9, %xmm4 + pshufd $0xfa, %xmm9, %xmm9 + pshufd $0x50, %xmm13, %xmm5 + pshufd $0xfa, %xmm13, %xmm13 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm9 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm13 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm9 + addps %xmm2, %xmm5 + addps %xmm3, %xmm13 + + movlps %xmm4, 0 * SIZE(CO2) + movhps %xmm4, 2 * SIZE(CO2) + movlps %xmm9, 4 * SIZE(CO2) + movhps %xmm9, 6 * SIZE(CO2) + movlps %xmm5, 8 * SIZE(CO2) + movhps %xmm5, 10 * SIZE(CO2) + movlps %xmm13, 12 * SIZE(CO2) + movhps %xmm13, 14 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm0 + movhps 2 * SIZE(CO1, LDC, 2), %xmm0 + movsd 4 * SIZE(CO1, LDC, 2), %xmm1 + movhps 6 * SIZE(CO1, LDC, 2), %xmm1 + movsd 8 * SIZE(CO1, LDC, 2), %xmm2 + movhps 10 * SIZE(CO1, LDC, 2), %xmm2 + movsd 12 * SIZE(CO1, LDC, 2), %xmm3 + movhps 14 * SIZE(CO1, LDC, 2), %xmm3 + + pshufd $0x50, %xmm10, %xmm4 + pshufd $0xfa, %xmm10, %xmm10 + pshufd $0x50, %xmm14, %xmm5 + pshufd $0xfa, %xmm14, %xmm14 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm10 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm14 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm10 + addps %xmm2, %xmm5 + addps %xmm3, %xmm14 + + movlps %xmm4, 0 * SIZE(CO1, LDC, 2) + movhps %xmm4, 2 * SIZE(CO1, LDC, 2) + movlps %xmm10, 4 * SIZE(CO1, LDC, 2) + movhps %xmm10, 6 * SIZE(CO1, LDC, 2) + movlps %xmm5, 8 * SIZE(CO1, LDC, 2) + movhps %xmm5, 10 * SIZE(CO1, LDC, 2) + movlps %xmm14, 12 * SIZE(CO1, LDC, 2) + movhps %xmm14, 14 * SIZE(CO1, LDC, 2) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm0 + movhps 2 * SIZE(CO2, LDC, 2), %xmm0 + movsd 4 * SIZE(CO2, LDC, 2), %xmm1 + movhps 6 * SIZE(CO2, LDC, 2), %xmm1 + movsd 8 * SIZE(CO2, LDC, 2), %xmm2 + movhps 10 * SIZE(CO2, LDC, 2), %xmm2 + movsd 12 * SIZE(CO2, LDC, 2), %xmm3 + movhps 14 * SIZE(CO2, LDC, 2), %xmm3 + + pshufd $0x50, %xmm11, %xmm4 + pshufd $0xfa, %xmm11, %xmm11 + pshufd $0x50, %xmm15, %xmm5 + pshufd $0xfa, %xmm15, %xmm15 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm11 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm15 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm11 + addps %xmm2, %xmm5 + addps %xmm3, %xmm15 + + movlps %xmm4, 0 * SIZE(CO2, LDC, 2) + movhps %xmm4, 2 * SIZE(CO2, LDC, 2) + movlps %xmm11, 4 * SIZE(CO2, LDC, 2) + movhps %xmm11, 6 * SIZE(CO2, LDC, 2) + movlps %xmm5, 8 * SIZE(CO2, LDC, 2) + movhps %xmm5, 10 * SIZE(CO2, LDC, 2) + movlps %xmm15, 12 * SIZE(CO2, LDC, 2) + movhps %xmm15, 14 * SIZE(CO2, LDC, 2) + + addq $16 * SIZE, CO1 + addq $16 * SIZE, CO2 + decq I + BRANCH + jg .L11 + ALIGN_4 + +.L20: + testq $4, M + BRANCH + jle .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) 
&& defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + movaps -32 * SIZE(BO), %xmm2 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + + movaps %xmm4, %xmm8 + movaps %xmm4, %xmm9 + movaps %xmm4, %xmm10 + movaps %xmm4, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_3 + +.L22: + addps %xmm6, %xmm10 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + addps %xmm4, %xmm11 + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + + addps %xmm2, %xmm8 + movaps -28 * SIZE(BO), %xmm2 + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + addps %xmm7, %xmm9 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + addps %xmm6, %xmm10 + pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + addps %xmm4, %xmm11 + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + + addps %xmm2, %xmm8 + movaps -24 * SIZE(BO), %xmm2 + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + addps %xmm7, %xmm9 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + + addps %xmm6, %xmm10 + pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + addps %xmm4, %xmm11 + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + + addps %xmm2, %xmm8 + movaps -20 * SIZE(BO), %xmm2 + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + addps %xmm7, %xmm9 + mulps %xmm0, %xmm4 + movaps -20 * SIZE(AO), %xmm0 + + addps %xmm6, %xmm10 + pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + addps %xmm4, %xmm11 + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + + addps %xmm2, %xmm8 + movaps -16 * SIZE(BO), %xmm2 + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + subq $-16 * SIZE, AO + addps %xmm7, %xmm9 + mulps %xmm0, %xmm4 + movaps -32 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L22 + ALIGN_3 + +.L25: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + BRANCH + je .L28 + ALIGN_3 + +.L26: + addps %xmm6, %xmm10 + pshufd $0x39, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + addps %xmm4, %xmm11 + pshufd $0x39, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + + addps %xmm2, %xmm8 + movaps -28 * SIZE(BO), %xmm2 + pshufd $0x39, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + addps %xmm7, %xmm9 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_3 + +.L28: + movups ALPHA_R, %xmm7 + + addps %xmm6, %xmm10 + addps %xmm4, %xmm11 + + movaps %xmm9, %xmm4 + shufps $0xd8, %xmm8, %xmm9 + shufps $0xd8, %xmm11, %xmm8 + shufps $0xd8, %xmm10, %xmm11 + shufps $0xd8, %xmm4, %xmm10 + + movaps %xmm8, %xmm4 + shufps $0xd8, %xmm10, %xmm8 + shufps $0xd8, %xmm4, %xmm10 + movaps %xmm9, %xmm5 + shufps $0xd8, %xmm11, %xmm9 + shufps $0xd8, %xmm5, %xmm11 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm1 + movhps 6 * SIZE(CO1), %xmm1 + + pshufd $0x50, %xmm8, %xmm4 + pshufd $0xfa, %xmm8, %xmm8 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm8 + addps %xmm0, %xmm4 + addps %xmm1, %xmm8 + + movlps %xmm4, 0 * SIZE(CO1) + movhps %xmm4, 2 * SIZE(CO1) + movlps %xmm8, 4 * SIZE(CO1) + movhps %xmm8, 6 * 
SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhps 2 * SIZE(CO2), %xmm0 + movsd 4 * SIZE(CO2), %xmm1 + movhps 6 * SIZE(CO2), %xmm1 + + pshufd $0x50, %xmm9, %xmm4 + pshufd $0xfa, %xmm9, %xmm9 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm9 + addps %xmm0, %xmm4 + addps %xmm1, %xmm9 + + movlps %xmm4, 0 * SIZE(CO2) + movhps %xmm4, 2 * SIZE(CO2) + movlps %xmm9, 4 * SIZE(CO2) + movhps %xmm9, 6 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm0 + movhps 2 * SIZE(CO1, LDC, 2), %xmm0 + movsd 4 * SIZE(CO1, LDC, 2), %xmm1 + movhps 6 * SIZE(CO1, LDC, 2), %xmm1 + + pshufd $0x50, %xmm10, %xmm4 + pshufd $0xfa, %xmm10, %xmm10 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm10 + addps %xmm0, %xmm4 + addps %xmm1, %xmm10 + + movlps %xmm4, 0 * SIZE(CO1, LDC, 2) + movhps %xmm4, 2 * SIZE(CO1, LDC, 2) + movlps %xmm10, 4 * SIZE(CO1, LDC, 2) + movhps %xmm10, 6 * SIZE(CO1, LDC, 2) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm0 + movhps 2 * SIZE(CO2, LDC, 2), %xmm0 + movsd 4 * SIZE(CO2, LDC, 2), %xmm1 + movhps 6 * SIZE(CO2, LDC, 2), %xmm1 + + pshufd $0x50, %xmm11, %xmm4 + pshufd $0xfa, %xmm11, %xmm11 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm11 + addps %xmm0, %xmm4 + addps %xmm1, %xmm11 + + movlps %xmm4, 0 * SIZE(CO2, LDC, 2) + movhps %xmm4, 2 * SIZE(CO2, LDC, 2) + movlps %xmm11, 4 * SIZE(CO2, LDC, 2) + movhps %xmm11, 6 * SIZE(CO2, LDC, 2) + + addq $8 * SIZE, CO1 + addq $8 * SIZE, CO2 + ALIGN_4 + +.L30: + testq $2, M + BRANCH + jle .L40 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + movaps -32 * SIZE(BO), %xmm2 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + + movaps %xmm4, %xmm8 + movaps %xmm4, %xmm9 + movaps %xmm4, %xmm10 + movaps %xmm4, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L35 + ALIGN_3 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x44, %xmm0, %xmm1 + addps %xmm3, %xmm8 + pshufd $0x50, %xmm2, %xmm3 + mulps %xmm1, %xmm3 + addps %xmm4, %xmm9 + pshufd $0xfa, %xmm2, %xmm4 + movaps -28 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm4 + + pshufd $0xee, %xmm0, %xmm1 + movaps -28 * SIZE(AO), %xmm0 + addps %xmm3, %xmm10 + pshufd $0x50, %xmm2, %xmm3 + mulps %xmm1, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm2, %xmm4 + movaps -24 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm4 + + pshufd $0x44, %xmm0, %xmm1 + addps %xmm3, %xmm8 + pshufd $0x50, %xmm2, %xmm3 + mulps %xmm1, %xmm3 + addps %xmm4, %xmm9 + pshufd $0xfa, %xmm2, %xmm4 + movaps -20 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm4 + + pshufd $0xee, %xmm0, %xmm1 + movaps -24 * SIZE(AO), %xmm0 + addps %xmm3, %xmm10 + pshufd $0x50, %xmm2, %xmm3 + mulps %xmm1, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xfa, %xmm2, %xmm4 + movaps -16 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm4 + + subq $-8 * SIZE, AO + subq $-16 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L32 + ALIGN_3 + +.L35: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + BRANCH + je .L38 + ALIGN_3 + +.L36: + pshufd $0x44, %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm3, %xmm8 + pshufd $0x50, %xmm2, %xmm3 + mulps 
%xmm1, %xmm3 + addps %xmm4, %xmm9 + pshufd $0xfa, %xmm2, %xmm4 + movaps -28 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm4 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L36 + ALIGN_3 + +.L38: + movups ALPHA_R, %xmm7 + + addps %xmm10, %xmm8 + addps %xmm11, %xmm9 + + addps %xmm3, %xmm8 + addps %xmm4, %xmm9 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm1 + movhps 2 * SIZE(CO2), %xmm1 + + pshufd $0x50, %xmm8, %xmm4 + pshufd $0xfa, %xmm8, %xmm8 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm8 + addps %xmm0, %xmm4 + addps %xmm1, %xmm8 + + movlps %xmm4, 0 * SIZE(CO1) + movhps %xmm4, 2 * SIZE(CO1) + movlps %xmm8, 0 * SIZE(CO2) + movhps %xmm8, 2 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm0 + movhps 2 * SIZE(CO1, LDC, 2), %xmm0 + movsd 0 * SIZE(CO2, LDC, 2), %xmm1 + movhps 2 * SIZE(CO2, LDC, 2), %xmm1 + + pshufd $0x50, %xmm9, %xmm4 + pshufd $0xfa, %xmm9, %xmm9 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm9 + addps %xmm0, %xmm4 + addps %xmm1, %xmm9 + + movlps %xmm4, 0 * SIZE(CO1, LDC, 2) + movhps %xmm4, 2 * SIZE(CO1, LDC, 2) + movlps %xmm9, 0 * SIZE(CO2, LDC, 2) + movhps %xmm9, 2 * SIZE(CO2, LDC, 2) + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + ALIGN_4 + +.L40: + testq $1, M + BRANCH + jle .L49 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + movsd -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movaps -32 * SIZE(BO), %xmm2 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L45 + ALIGN_3 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x00, %xmm0, %xmm1 + movss -31 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm8 + movaps -28 * SIZE(BO), %xmm2 + + pshufd $0x00, %xmm0, %xmm1 + movss -30 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm9 + movaps -24 * SIZE(BO), %xmm2 + + pshufd $0x00, %xmm0, %xmm1 + movss -29 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm8 + movaps -20 * SIZE(BO), %xmm2 + + pshufd $0x00, %xmm0, %xmm1 + movss -28 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm9 + movaps -16 * SIZE(BO), %xmm2 + + subq $ -4 * SIZE, AO + subq $-16 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L42 + ALIGN_3 + +.L45: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + BRANCH + je .L48 + ALIGN_3 + +.L46: + pshufd $0x00, %xmm0, %xmm1 + movss -31 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm8 + movaps -28 * SIZE(BO), %xmm2 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L46 + ALIGN_3 + +.L48: + movups ALPHA_R, %xmm7 + + addps %xmm9, %xmm8 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 0 * SIZE(CO2), %xmm0 + movsd 0 * SIZE(CO1, LDC, 2), %xmm1 + movhps 0 * SIZE(CO2, LDC, 2), %xmm1 + + pshufd $0x50, %xmm8, %xmm4 + pshufd $0xfa, %xmm8, %xmm8 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm8 + addps %xmm0, %xmm4 + addps %xmm1, %xmm8 + + movlps %xmm4, 0 * SIZE(CO1) + movhps %xmm4, 0 * SIZE(CO2) + movlps %xmm8, 0 * SIZE(CO1, LDC, 2) + movhps %xmm8, 0 * SIZE(CO2, LDC, 2) + ALIGN_4 + +.L49: +#if defined(TRMMKERNEL) && 
!defined(LEFT) + addq $4, KK +#endif + + movq BO, B + + leaq (C, LDC, 4), C + + subq $1, J + BRANCH + jg .L10 + ALIGN_4 + +.L50: + testq $2, N + jle .L90 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 + movq A, AO + + movq K, %rax + salq $BASE_SHIFT + 1, %rax + leaq (B, %rax), BB + + movq M, I + sarq $3, I + NOBRANCH + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 8), AO + leaq (BO, %rax, 2), BO +#endif + + prefetcht2 -32 * SIZE(BB) + subq $-8 * SIZE, BB + + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm3, %xmm3 + movaps -28 * SIZE(AO), %xmm1 + xorps %xmm4, %xmm4 + movaps -32 * SIZE(BO), %xmm2 + + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + + prefetcht0 7 * SIZE(CO1) + movaps %xmm4, %xmm8 + movaps %xmm4, %xmm9 + prefetcht0 7 * SIZE(CO2) + movaps %xmm4, %xmm10 + movaps %xmm4, %xmm11 + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L55 + ALIGN_3 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm3, %xmm8 + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm9 + pshufd $0x55, %xmm2, %xmm4 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + + addps %xmm5, %xmm10 + pshufd $0x00, %xmm2, %xmm5 + mulps %xmm1, %xmm5 + addps %xmm6, %xmm11 + pshufd $0x55, %xmm2, %xmm6 + mulps %xmm1, %xmm6 + movaps -20 * SIZE(AO), %xmm1 + + addps %xmm3, %xmm8 + pshufd $0xaa, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm9 + pshufd $0xff, %xmm2, %xmm4 + mulps %xmm0, %xmm4 + movaps -16 * SIZE(AO), %xmm0 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + + addps %xmm5, %xmm10 + pshufd $0xaa, %xmm2, %xmm5 + mulps %xmm1, %xmm5 + addps %xmm6, %xmm11 + pshufd $0xff, %xmm2, %xmm6 + movaps -28 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm6 + movaps -12 * SIZE(AO), %xmm1 + + addps %xmm3, %xmm8 + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm9 + pshufd $0x55, %xmm2, %xmm4 + mulps %xmm0, %xmm4 + movaps -8 * SIZE(AO), %xmm0 + + addps %xmm5, %xmm10 + pshufd $0x00, %xmm2, %xmm5 + mulps %xmm1, %xmm5 + addps %xmm6, %xmm11 + pshufd $0x55, %xmm2, %xmm6 + mulps %xmm1, %xmm6 + movaps -4 * SIZE(AO), %xmm1 + + addps %xmm3, %xmm8 + pshufd $0xaa, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm9 + pshufd $0xff, %xmm2, %xmm4 + mulps %xmm0, %xmm4 + movaps 0 * SIZE(AO), %xmm0 + + addps %xmm5, %xmm10 + pshufd $0xaa, %xmm2, %xmm5 + mulps %xmm1, %xmm5 + addps %xmm6, %xmm11 + pshufd $0xff, %xmm2, %xmm6 + movaps -24 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm6 + movaps 4 * SIZE(AO), %xmm1 + + subq $-32 * SIZE, AO + subq $ -8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L52 + ALIGN_3 + +.L55: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_3 + +.L56: + addps %xmm3, %xmm8 + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm9 + pshufd $0x55, %xmm2, %xmm4 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + + addps %xmm5, %xmm10 + pshufd $0x00, %xmm2, %xmm5 + mulps %xmm1, %xmm5 + addps %xmm6, %xmm11 + pshufd $0x55, %xmm2, %xmm6 + movsd -30 * SIZE(BO), %xmm2 
+ mulps %xmm1, %xmm6 + movaps -20 * SIZE(AO), %xmm1 + + addq $8 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L56 + ALIGN_3 + +.L58: + movups ALPHA_R, %xmm7 + + addps %xmm3, %xmm8 + addps %xmm4, %xmm9 + addps %xmm5, %xmm10 + addps %xmm6, %xmm11 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm1 + movhps 6 * SIZE(CO1), %xmm1 + movsd 8 * SIZE(CO1), %xmm2 + movhps 10 * SIZE(CO1), %xmm2 + movsd 12 * SIZE(CO1), %xmm3 + movhps 14 * SIZE(CO1), %xmm3 + + pshufd $0x50, %xmm8, %xmm4 + pshufd $0xfa, %xmm8, %xmm8 + pshufd $0x50, %xmm10, %xmm5 + pshufd $0xfa, %xmm10, %xmm10 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm10 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm8 + addps %xmm2, %xmm5 + addps %xmm3, %xmm10 + + movlps %xmm4, 0 * SIZE(CO1) + movhps %xmm4, 2 * SIZE(CO1) + movlps %xmm8, 4 * SIZE(CO1) + movhps %xmm8, 6 * SIZE(CO1) + movlps %xmm5, 8 * SIZE(CO1) + movhps %xmm5, 10 * SIZE(CO1) + movlps %xmm10, 12 * SIZE(CO1) + movhps %xmm10, 14 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhps 2 * SIZE(CO2), %xmm0 + movsd 4 * SIZE(CO2), %xmm1 + movhps 6 * SIZE(CO2), %xmm1 + movsd 8 * SIZE(CO2), %xmm2 + movhps 10 * SIZE(CO2), %xmm2 + movsd 12 * SIZE(CO2), %xmm3 + movhps 14 * SIZE(CO2), %xmm3 + + pshufd $0x50, %xmm9, %xmm4 + pshufd $0xfa, %xmm9, %xmm9 + pshufd $0x50, %xmm11, %xmm5 + pshufd $0xfa, %xmm11, %xmm11 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm9 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm11 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm9 + addps %xmm2, %xmm5 + addps %xmm3, %xmm11 + + movlps %xmm4, 0 * SIZE(CO2) + movhps %xmm4, 2 * SIZE(CO2) + movlps %xmm9, 4 * SIZE(CO2) + movhps %xmm9, 6 * SIZE(CO2) + movlps %xmm5, 8 * SIZE(CO2) + movhps %xmm5, 10 * SIZE(CO2) + movlps %xmm11, 12 * SIZE(CO2) + movhps %xmm11, 14 * SIZE(CO2) + + addq $16 * SIZE, CO1 + addq $16 * SIZE, CO2 + decq I + BRANCH + jg .L51 + ALIGN_4 + +.L60: + testq $4, M + BRANCH + jle .L70 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + movaps -32 * SIZE(BO), %xmm2 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L65 + ALIGN_3 + +.L62: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm3, %xmm8 + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm9 + pshufd $0x55, %xmm2, %xmm4 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + addps %xmm3, %xmm10 + pshufd $0xaa, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xff, %xmm2, %xmm4 + movaps -28 * SIZE(BO), %xmm2 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + + addps %xmm3, %xmm8 + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm9 + pshufd $0x55, %xmm2, %xmm4 + mulps %xmm0, %xmm4 + movaps -20 * SIZE(AO), %xmm0 + + addps %xmm3, %xmm10 + pshufd $0xaa, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm11 + pshufd $0xff, %xmm2, %xmm4 + movaps -24 * SIZE(BO), %xmm2 + mulps 
%xmm0, %xmm4 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, AO + subq $ -8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L62 + ALIGN_3 + +.L65: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + BRANCH + je .L68 + ALIGN_3 + +.L66: + addps %xmm3, %xmm8 + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm9 + pshufd $0x55, %xmm2, %xmm4 + movsd -30 * SIZE(BO), %xmm2 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L66 + ALIGN_3 + +.L68: + movups ALPHA_R, %xmm7 + + addps %xmm10, %xmm8 + addps %xmm11, %xmm9 + + addps %xmm3, %xmm8 + addps %xmm4, %xmm9 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm1 + movhps 6 * SIZE(CO1), %xmm1 + + pshufd $0x50, %xmm8, %xmm4 + pshufd $0xfa, %xmm8, %xmm8 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm8 + addps %xmm0, %xmm4 + addps %xmm1, %xmm8 + + movlps %xmm4, 0 * SIZE(CO1) + movhps %xmm4, 2 * SIZE(CO1) + movlps %xmm8, 4 * SIZE(CO1) + movhps %xmm8, 6 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhps 2 * SIZE(CO2), %xmm0 + movsd 4 * SIZE(CO2), %xmm1 + movhps 6 * SIZE(CO2), %xmm1 + + pshufd $0x50, %xmm9, %xmm4 + pshufd $0xfa, %xmm9, %xmm9 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm9 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm9 + + movlps %xmm4, 0 * SIZE(CO2) + movhps %xmm4, 2 * SIZE(CO2) + movlps %xmm9, 4 * SIZE(CO2) + movhps %xmm9, 6 * SIZE(CO2) + + addq $8 * SIZE, CO1 + addq $8 * SIZE, CO2 + ALIGN_4 + +.L70: + testq $2, M + BRANCH + jle .L80 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm3, %xmm3 + movaps -32 * SIZE(BO), %xmm2 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L75 + ALIGN_3 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm3, %xmm8 + pshufd $0x44, %xmm0, %xmm1 + pshufd $0x50, %xmm2, %xmm3 + mulps %xmm1, %xmm3 + + addps %xmm3, %xmm9 + pshufd $0xee, %xmm0, %xmm1 + movaps -28 * SIZE(AO), %xmm0 + pshufd $0xfa, %xmm2, %xmm3 + movaps -28 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm3 + + addps %xmm3, %xmm8 + pshufd $0x44, %xmm0, %xmm1 + pshufd $0x50, %xmm2, %xmm3 + mulps %xmm1, %xmm3 + + addps %xmm3, %xmm9 + pshufd $0xee, %xmm0, %xmm1 + movaps -24 * SIZE(AO), %xmm0 + pshufd $0xfa, %xmm2, %xmm3 + movaps -24 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm3 + + subq $-8 * SIZE, AO + subq $-8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L72 + ALIGN_3 + +.L75: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + BRANCH + je .L78 + ALIGN_3 + +.L76: + addps %xmm3, %xmm8 + pshufd $0x44, %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + pshufd $0x50, %xmm2, %xmm3 + movsd -30 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm3 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L76 + ALIGN_3 + +.L78: + movups ALPHA_R, %xmm7 + + addps %xmm9, %xmm8 + addps %xmm3, %xmm8 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm1 + movhps 2 * 
SIZE(CO2), %xmm1 + + pshufd $0x50, %xmm8, %xmm4 + pshufd $0xfa, %xmm8, %xmm8 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm8 + addps %xmm0, %xmm4 + addps %xmm1, %xmm8 + + movlps %xmm4, 0 * SIZE(CO1) + movhps %xmm4, 2 * SIZE(CO1) + movlps %xmm8, 0 * SIZE(CO2) + movhps %xmm8, 2 * SIZE(CO2) + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + ALIGN_4 + +.L80: + testq $1, M + BRANCH + jle .L89 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + + movsd -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movsd -32 * SIZE(BO), %xmm2 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L85 + ALIGN_3 + +.L82: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x00, %xmm0, %xmm1 + movss -31 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm8 + movsd -30 * SIZE(BO), %xmm2 + + pshufd $0x00, %xmm0, %xmm1 + movss -30 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm9 + movsd -28 * SIZE(BO), %xmm2 + + pshufd $0x00, %xmm0, %xmm1 + movss -29 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm8 + movsd -26 * SIZE(BO), %xmm2 + + pshufd $0x00, %xmm0, %xmm1 + movss -28 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm9 + movsd -24 * SIZE(BO), %xmm2 + + subq $-4 * SIZE, AO + subq $-8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L82 + ALIGN_3 + +.L85: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + BRANCH + je .L88 + ALIGN_3 + +.L86: + pshufd $0x00, %xmm0, %xmm1 + movss -31 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm2 + addps %xmm2, %xmm8 + movsd -30 * SIZE(BO), %xmm2 + + addq $1 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L86 + ALIGN_3 + +.L88: + movups ALPHA_R, %xmm7 + + addps %xmm9, %xmm8 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 0 * SIZE(CO2), %xmm0 + + pshufd $0x50, %xmm8, %xmm4 + + mulps %xmm7, %xmm4 + addps %xmm0, %xmm4 + + movlps %xmm4, 0 * SIZE(CO1) + movhps %xmm4, 0 * SIZE(CO2) + ALIGN_4 + +.L89: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + movq BO, B + + leaq (C, LDC, 2), C + ALIGN_4 + +.L90: + testq $1, N + jle .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + movq A, AO + + movq M, I + sarq $3, I + NOBRANCH + jle .L100 + ALIGN_4 + +.L91: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 8), AO + leaq (BO, %rax, 1), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movaps -28 * SIZE(AO), %xmm1 + xorps %xmm9, %xmm9 + movsd -32 * SIZE(BO), %xmm2 + xorps %xmm10, %xmm10 + prefetcht0 7 * SIZE(CO1) + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + 
jle .L95 + ALIGN_3 + +.L92: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm3, %xmm0 + addps %xmm0, %xmm8 + movaps -24 * SIZE(AO), %xmm0 + mulps %xmm3, %xmm1 + addps %xmm1, %xmm9 + movaps -20 * SIZE(AO), %xmm1 + + pshufd $0x55, %xmm2, %xmm3 + movsd -30 * SIZE(BO), %xmm2 + mulps %xmm3, %xmm0 + addps %xmm0, %xmm10 + movaps -16 * SIZE(AO), %xmm0 + mulps %xmm3, %xmm1 + addps %xmm1, %xmm11 + movaps -12 * SIZE(AO), %xmm1 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm3, %xmm0 + addps %xmm0, %xmm8 + movaps -8 * SIZE(AO), %xmm0 + mulps %xmm3, %xmm1 + addps %xmm1, %xmm9 + movaps -4 * SIZE(AO), %xmm1 + + pshufd $0x55, %xmm2, %xmm3 + movsd -28 * SIZE(BO), %xmm2 + mulps %xmm3, %xmm0 + addps %xmm0, %xmm10 + movaps 0 * SIZE(AO), %xmm0 + mulps %xmm3, %xmm1 + addps %xmm1, %xmm11 + movaps 4 * SIZE(AO), %xmm1 + + subq $-32 * SIZE, AO + subq $ -4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L92 + ALIGN_3 + +.L95: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L98 + ALIGN_3 + +.L96: + pshufd $0x00, %xmm2, %xmm3 + movss -31 * SIZE(BO), %xmm2 + mulps %xmm3, %xmm0 + addps %xmm0, %xmm8 + movaps -24 * SIZE(AO), %xmm0 + mulps %xmm3, %xmm1 + addps %xmm1, %xmm9 + movaps -20 * SIZE(AO), %xmm1 + + addq $8 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L96 + ALIGN_3 + +.L98: + movups ALPHA_R, %xmm7 + + addps %xmm10, %xmm8 + addps %xmm11, %xmm9 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm1 + movhps 6 * SIZE(CO1), %xmm1 + movsd 8 * SIZE(CO1), %xmm2 + movhps 10 * SIZE(CO1), %xmm2 + movsd 12 * SIZE(CO1), %xmm3 + movhps 14 * SIZE(CO1), %xmm3 + + pshufd $0x50, %xmm8, %xmm4 + pshufd $0xfa, %xmm8, %xmm8 + pshufd $0x50, %xmm9, %xmm5 + pshufd $0xfa, %xmm9, %xmm9 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm9 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm8 + addps %xmm2, %xmm5 + addps %xmm3, %xmm9 + + movlps %xmm4, 0 * SIZE(CO1) + movhps %xmm4, 2 * SIZE(CO1) + movlps %xmm8, 4 * SIZE(CO1) + movhps %xmm8, 6 * SIZE(CO1) + movlps %xmm5, 8 * SIZE(CO1) + movhps %xmm5, 10 * SIZE(CO1) + movlps %xmm9, 12 * SIZE(CO1) + movhps %xmm9, 14 * SIZE(CO1) + + addq $16 * SIZE, CO1 + decq I + BRANCH + jg .L91 + ALIGN_4 + +.L100: + testq $4, M + BRANCH + jle .L110 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movsd -32 * SIZE(BO), %xmm2 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L105 + ALIGN_3 + +.L102: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + movaps -28 * SIZE(AO), %xmm0 + addps %xmm3, %xmm8 + + pshufd $0x55, %xmm2, %xmm3 + movsd -30 * SIZE(BO), %xmm2 + mulps %xmm0, %xmm3 + movaps -24 * SIZE(AO), %xmm0 + addps %xmm3, %xmm9 + + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + movaps -20 * SIZE(AO), %xmm0 + addps %xmm3, %xmm8 + + pshufd $0x55, %xmm2, %xmm3 + movsd -28 * SIZE(BO), 
%xmm2 + mulps %xmm0, %xmm3 + movaps -16 * SIZE(AO), %xmm0 + addps %xmm3, %xmm9 + + subq $-16 * SIZE, AO + subq $ -4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L102 + ALIGN_3 + +.L105: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + BRANCH + je .L108 + ALIGN_3 + +.L106: + pshufd $0x00, %xmm2, %xmm3 + movss -31 * SIZE(BO), %xmm2 + mulps %xmm0, %xmm3 + movaps -28 * SIZE(AO), %xmm0 + addps %xmm3, %xmm8 + + addq $4 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L106 + ALIGN_3 + +.L108: + movups ALPHA_R, %xmm7 + + addps %xmm9, %xmm8 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm1 + movhps 6 * SIZE(CO1), %xmm1 + + pshufd $0x50, %xmm8, %xmm4 + pshufd $0xfa, %xmm8, %xmm8 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm8 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm8 + + movlps %xmm4, 0 * SIZE(CO1) + movhps %xmm4, 2 * SIZE(CO1) + movlps %xmm8, 4 * SIZE(CO1) + movhps %xmm8, 6 * SIZE(CO1) + + addq $8 * SIZE, CO1 + ALIGN_4 + +.L110: + testq $2, M + BRANCH + jle .L120 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm3, %xmm3 + movsd -32 * SIZE(BO), %xmm2 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L115 + ALIGN_3 + +.L112: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm3, %xmm8 + + pshufd $0x55, %xmm2, %xmm3 + movsd -30 * SIZE(BO), %xmm2 + mulps %xmm0, %xmm3 + movsd -28 * SIZE(AO), %xmm0 + addps %xmm3, %xmm8 + + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + movsd -26 * SIZE(AO), %xmm0 + addps %xmm3, %xmm8 + + pshufd $0x55, %xmm2, %xmm3 + movsd -28 * SIZE(BO), %xmm2 + mulps %xmm0, %xmm3 + movsd -24 * SIZE(AO), %xmm0 + addps %xmm3, %xmm8 + + subq $-8 * SIZE, AO + subq $-4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L112 + ALIGN_3 + +.L115: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + BRANCH + je .L118 + ALIGN_3 + +.L116: + pshufd $0x00, %xmm2, %xmm3 + movss -31 * SIZE(BO), %xmm2 + mulps %xmm0, %xmm3 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm3, %xmm8 + + addq $2 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L116 + ALIGN_3 + +.L118: + movups ALPHA_R, %xmm7 + + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + + pshufd $0x50, %xmm8, %xmm4 + + mulps %xmm7, %xmm4 + addps %xmm0, %xmm4 + + movlps %xmm4, 0 * SIZE(CO1) + movhps %xmm4, 2 * SIZE(CO1) + + addq $4 * SIZE, CO1 + ALIGN_4 + +.L120: + testq $1, M + BRANCH + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + + movss -32 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + movss -32 * SIZE(BO), %xmm2 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) 
&& !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L125 + ALIGN_3 + +.L122: + mulss %xmm0, %xmm2 + movss -31 * SIZE(AO), %xmm0 + addss %xmm2, %xmm8 + movss -31 * SIZE(BO), %xmm2 + + mulss %xmm0, %xmm2 + movss -30 * SIZE(AO), %xmm0 + addss %xmm2, %xmm9 + movss -30 * SIZE(BO), %xmm2 + + mulss %xmm0, %xmm2 + movss -29 * SIZE(AO), %xmm0 + addss %xmm2, %xmm8 + movss -29 * SIZE(BO), %xmm2 + + mulss %xmm0, %xmm2 + movss -28 * SIZE(AO), %xmm0 + addss %xmm2, %xmm9 + movss -28 * SIZE(BO), %xmm2 + + subq $-4 * SIZE, AO + subq $-4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L122 + ALIGN_3 + +.L125: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + BRANCH + je .L128 + ALIGN_3 + +.L126: + mulss %xmm0, %xmm2 + movss -31 * SIZE(AO), %xmm0 + addss %xmm2, %xmm8 + movss -31 * SIZE(BO), %xmm2 + + addq $1 * SIZE, AO + addq $1 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L126 + ALIGN_3 + +.L128: + movups ALPHA_R, %xmm7 + + addss %xmm9, %xmm8 + + movsd 0 * SIZE(CO1), %xmm0 + + pshufd $0x50, %xmm8, %xmm4 + + mulps %xmm7, %xmm4 + addps %xmm0, %xmm4 + + movlps %xmm4, 0 * SIZE(CO1) + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm3m_kernel_8x4_sse.S b/kernel/x86_64/zgemm3m_kernel_8x4_sse.S new file mode 100644 index 0000000000..6bd9148025 --- /dev/null +++ b/kernel/x86_64/zgemm3m_kernel_8x4_sse.S @@ -0,0 +1,3498 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi + +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define CO2 %rbp +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define ALPHA 0(%rsp) +#define J 16(%rsp) +#define OFFSET 24(%rsp) +#define KK 32(%rsp) +#define KKK 40(%rsp) +#define BUFFER 128(%rsp) + +#ifdef OPTERON +#define movsd movlps +#endif + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHNTA prefetchnta +#define PREFETCHSIZE (16 * 5 + 8) +#endif + +#ifdef GENERIC +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHNTA prefetchnta +#define PREFETCHSIZE (16 * 5 + 8) +#endif + +#define RPREFETCHSIZE (8 * 7 + 4) +#define WPREFETCHSIZE (8 * 8 + 4) + +#ifndef GENERIC +#define KERNEL1(xx) \ + mulps %xmm0, %xmm1 ;\ + addps %xmm1, %xmm8 ;\ + movaps -32 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm0, %xmm3 ;\ + addps %xmm3, %xmm9 ;\ + movaps -28 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm0, %xmm5 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\ + mulps -20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\ + addps %xmm5, %xmm10 ;\ + movaps -24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addps %xmm0, %xmm11 ;\ + movaps -16 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0 + +#define KERNEL2(xx) \ + mulps %xmm2, %xmm1 ;\ + addps %xmm1, %xmm12 ;\ + movaps 0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm2, %xmm3 ;\ + addps %xmm3, %xmm13 ;\ + movaps -12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm2, %xmm5 ;\ + mulps -20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\ + addps %xmm5, %xmm14 ;\ + movaps -8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addps %xmm2, %xmm15 ;\ + movaps -12 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2 + +#define KERNEL3(xx) \ + mulps %xmm4, %xmm7 ;\ + addps %xmm7, %xmm8 ;\ + movaps -16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ + mulps %xmm4, %xmm3 ;\ + addps %xmm3, %xmm9 ;\ + movaps -12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 
8), %xmm3 ;\ + mulps %xmm4, %xmm5 ;\ + mulps -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\ + addps %xmm5, %xmm10 ;\ + movaps -8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addps %xmm4, %xmm11 ;\ + movaps -8 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4 + +#define KERNEL4(xx) \ + mulps %xmm6, %xmm7 ;\ + addps %xmm7, %xmm12 ;\ + movaps 16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ + mulps %xmm6, %xmm3 ;\ + addps %xmm3, %xmm13 ;\ + movaps 4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm6, %xmm5 ;\ + mulps -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\ + addps %xmm5, %xmm14 ;\ + movaps 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + PREFETCH (PREFETCHSIZE + 16) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\ + addps %xmm6, %xmm15 ;\ + movaps -4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6 + +#define KERNEL5(xx) \ + mulps %xmm0, %xmm1 ;\ + addps %xmm1, %xmm8 ;\ + movaps 0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm0, %xmm3 ;\ + addps %xmm3, %xmm9 ;\ + movaps 4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm0, %xmm5 ;\ + mulps 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\ + addps %xmm5, %xmm10 ;\ + movaps 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addps %xmm0, %xmm11 ;\ + movaps 0 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0 + +#define KERNEL6(xx) \ + mulps %xmm2, %xmm1 ;\ + addps %xmm1, %xmm12 ;\ + movaps 32 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm2, %xmm3 ;\ + addps %xmm3, %xmm13 ;\ + movaps 20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm2, %xmm5 ;\ + mulps 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\ + addps %xmm5, %xmm14 ;\ + movaps 24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addps %xmm2, %xmm15 ;\ + movaps 4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2 + +#define KERNEL7(xx) \ + mulps %xmm4, %xmm7 ;\ + addps %xmm7, %xmm8 ;\ + movaps 16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ + mulps %xmm4, %xmm3 ;\ + addps %xmm3, %xmm9 ;\ + movaps 20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm4, %xmm5 ;\ + mulps 28 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\ + addps %xmm5, %xmm10 ;\ + movaps 24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addps %xmm4, %xmm11 ;\ + movaps 8 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4 + +#define KERNEL8(xx) \ + mulps %xmm6, %xmm7 ;\ + addps %xmm7, %xmm12 ;\ + movaps 48 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ + mulps %xmm6, %xmm3 ;\ + addps %xmm3, %xmm13 ;\ + movaps 36 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm6, %xmm5 ;\ + mulps 28 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\ + addps %xmm5, %xmm14 ;\ + movaps 40 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addps %xmm6, %xmm15 ;\ + movaps 12 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6 + +#else +#define KERNEL1(xx) \ + mulps %xmm0, %xmm1 ;\ + addps %xmm1, %xmm8 ;\ + movaps -32 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ + mulps %xmm0, %xmm3 ;\ + addps %xmm3, %xmm9 ;\ + movaps -28 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulps %xmm0, %xmm5 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\ + mulps -20 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\ + addps %xmm5, %xmm10 ;\ + movaps -24 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addps %xmm0, %xmm11 ;\ + movaps -16 * SIZE + 1 * (xx) * SIZE(AO), %xmm0 + +#define KERNEL2(xx) \ + mulps %xmm2, %xmm1 ;\ + addps %xmm1, %xmm12 ;\ + movaps 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ + mulps %xmm2, %xmm3 ;\ + addps %xmm3, %xmm13 ;\ + movaps -12 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulps %xmm2, %xmm5 
;\ + mulps -20 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\ + addps %xmm5, %xmm14 ;\ + movaps -8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addps %xmm2, %xmm15 ;\ + movaps -12 * SIZE + 1 * (xx) * SIZE(AO), %xmm2 ;\ + +#define KERNEL3(xx) \ + mulps %xmm4, %xmm7 ;\ + addps %xmm7, %xmm8 ;\ + movaps -16 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ + mulps %xmm4, %xmm3 ;\ + addps %xmm3, %xmm9 ;\ + movaps -12 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulps %xmm4, %xmm5 ;\ + mulps -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\ + addps %xmm5, %xmm10 ;\ + movaps -8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addps %xmm4, %xmm11 ;\ + movaps -8 * SIZE + 1 * (xx) * SIZE(AO), %xmm4 + +#define KERNEL4(xx) \ + mulps %xmm6, %xmm7 ;\ + addps %xmm7, %xmm12 ;\ + movaps 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ + mulps %xmm6, %xmm3 ;\ + addps %xmm3, %xmm13 ;\ + movaps 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulps %xmm6, %xmm5 ;\ + mulps -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\ + addps %xmm5, %xmm14 ;\ + movaps 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addps %xmm6, %xmm15 ;\ + movaps -4 * SIZE + 1 * (xx) * SIZE(AO), %xmm6 + +#define KERNEL5(xx) \ + mulps %xmm0, %xmm1 ;\ + PREFETCH (PREFETCHSIZE + 16) * SIZE + 1 * (xx) * SIZE(AO) ;\ + addps %xmm1, %xmm8 ;\ + movaps 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ + mulps %xmm0, %xmm3 ;\ + addps %xmm3, %xmm9 ;\ + movaps 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulps %xmm0, %xmm5 ;\ + mulps 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\ + addps %xmm5, %xmm10 ;\ + movaps 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addps %xmm0, %xmm11 ;\ + movaps 0 * SIZE + 1 * (xx) * SIZE(AO), %xmm0 + +#define KERNEL6(xx) \ + mulps %xmm2, %xmm1 ;\ + addps %xmm1, %xmm12 ;\ + movaps 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ + mulps %xmm2, %xmm3 ;\ + addps %xmm3, %xmm13 ;\ + movaps 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulps %xmm2, %xmm5 ;\ + mulps 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\ + addps %xmm5, %xmm14 ;\ + movaps 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addps %xmm2, %xmm15 ;\ + movaps 4 * SIZE + 1 * (xx) * SIZE(AO), %xmm2 + +#define KERNEL7(xx) \ + mulps %xmm4, %xmm7 ;\ + addps %xmm7, %xmm8 ;\ + movaps 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ + mulps %xmm4, %xmm3 ;\ + addps %xmm3, %xmm9 ;\ + movaps 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulps %xmm4, %xmm5 ;\ + mulps 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\ + addps %xmm5, %xmm10 ;\ + movaps 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addps %xmm4, %xmm11 ;\ + movaps 8 * SIZE + 1 * (xx) * SIZE(AO), %xmm4 + +#define KERNEL8(xx) \ + mulps %xmm6, %xmm7 ;\ + addps %xmm7, %xmm12 ;\ + movaps 48 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ + mulps %xmm6, %xmm3 ;\ + addps %xmm3, %xmm13 ;\ + movaps 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulps %xmm6, %xmm5 ;\ + mulps 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\ + addps %xmm5, %xmm14 ;\ + movaps 40 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addps %xmm6, %xmm15 ;\ + movaps 12 * SIZE + 1 * (xx) * SIZE(AO), %xmm6 + +#endif + +#if defined(OS_LINUX) && defined(CORE_BARCELONA) + .align 32768 +#endif + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq 
ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm4 +#endif + movaps %xmm3, %xmm0 + movss OLD_ALPHA_I, %xmm1 +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm4 +#endif + +#endif + + EMMS + + movq %rsp, %rbx # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-1024, %rsp # align stack + + STACK_TOUCHING + + movq OLD_M, M + movq OLD_N, N + + movss %xmm0, 0 + ALPHA + movss %xmm1, 4 + ALPHA + movss %xmm0, 8 + ALPHA + movss %xmm1, 12 + ALPHA + +#ifdef TRMMKERNEL + movsd %xmm4, OFFSET + movsd %xmm4, KK +#ifndef LEFT + negq KK +#endif +#endif + + subq $-32 * SIZE, A + + salq $ZBASE_SHIFT, LDC + + movq N, J + sarq $2, J # j = (n >> 2) + jle .L50 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + + movq K, %rax + sarq $2, %rax + jle .L03 + + addq %rax, %rax + ALIGN_4 + +.L02: + PREFETCH (RPREFETCHSIZE + 0) * SIZE(B) + + movd 0 * SIZE(B), %mm0 + movd 1 * SIZE(B), %mm1 + movd 2 * SIZE(B), %mm2 + movd 3 * SIZE(B), %mm3 + movd 4 * SIZE(B), %mm4 + movd 5 * SIZE(B), %mm5 + movd 6 * SIZE(B), %mm6 + movd 7 * SIZE(B), %mm7 + + PREFETCHW (WPREFETCHSIZE + 0) * SIZE(BO) + + punpckldq %mm0, %mm0 + punpckldq %mm1, %mm1 + punpckldq %mm2, %mm2 + punpckldq %mm3, %mm3 + punpckldq %mm4, %mm4 + punpckldq %mm5, %mm5 + punpckldq %mm6, %mm6 + punpckldq %mm7, %mm7 + + movq %mm0, 0 * SIZE(BO) + movq %mm0, 2 * SIZE(BO) + movq %mm1, 4 * SIZE(BO) + movq %mm1, 6 * SIZE(BO) + movq %mm2, 8 * SIZE(BO) + movq %mm2, 10 * SIZE(BO) + movq %mm3, 12 * SIZE(BO) + movq %mm3, 14 * SIZE(BO) + + PREFETCHW (WPREFETCHSIZE + 16) * SIZE(BO) + + movq %mm4, 16 * SIZE(BO) + movq %mm4, 18 * SIZE(BO) + movq %mm5, 20 * SIZE(BO) + movq %mm5, 22 * SIZE(BO) + movq %mm6, 24 * SIZE(BO) + movq %mm6, 26 * SIZE(BO) + movq %mm7, 28 * SIZE(BO) + movq %mm7, 30 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO + + decq %rax + jne .L02 + ALIGN_4 + +.L03: + movq K, %rax + andq $3, %rax + BRANCH + jle .L10 + ALIGN_4 + +.L04: + movd 0 * SIZE(B), %mm0 + movd 1 * SIZE(B), %mm1 + movd 2 * SIZE(B), %mm2 + movd 3 * SIZE(B), %mm3 + + punpckldq %mm0, %mm0 + punpckldq %mm1, %mm1 + punpckldq %mm2, %mm2 + punpckldq %mm3, %mm3 + + movq %mm0, 0 * SIZE(BO) + movq %mm0, 2 * SIZE(BO) + movq %mm1, 4 * SIZE(BO) + movq %mm1, 6 * SIZE(BO) + movq %mm2, 8 * SIZE(BO) + movq %mm2, 10 * SIZE(BO) + movq %mm3, 12 * SIZE(BO) + movq %mm3, 14 * SIZE(BO) + + addq $ 4 * SIZE, B + addq $16 * SIZE, BO + decq %rax + jne .L04 + ALIGN_4 + +.L10: + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + leaq (RPREFETCHSIZE + 0) * SIZE(B), BB + + movq M, I + sarq $3, I # i = (m >> 3) + jle .L20 + ALIGN_4 + +.L11: + PREFETCH 0 * SIZE(BB) + subq $-16 * SIZE, BB + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + movaps -32 * SIZE(BO), %xmm1 + pxor %xmm8, %xmm8 + movaps -28 * SIZE(AO), %xmm2 + movaps -28 * SIZE(BO), %xmm3 + pxor %xmm9, %xmm9 + movaps -24 * SIZE(AO), %xmm4 + movaps -24 * SIZE(BO), %xmm5 + pxor %xmm10, %xmm10 + movaps -20 * SIZE(AO), %xmm6 + movaps -16 * SIZE(BO), %xmm7 + pxor %xmm11, %xmm11 + + PREFETCHW 
15 * SIZE(CO1) + pxor %xmm12, %xmm12 + PREFETCHW 15 * SIZE(CO2) + pxor %xmm13, %xmm13 + PREFETCHW 15 * SIZE(CO1, LDC, 2) + pxor %xmm14, %xmm14 + PREFETCHW 15 * SIZE(CO2, LDC, 2) + pxor %xmm15, %xmm15 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif +#ifndef GENERIC + andq $-8, %rax + + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO + negq %rax + NOBRANCH + je .L15 + ALIGN_3 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + + addq $16 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + + addq $16 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + + addq $16 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + + addq $16 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + + addq $16 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + + addq $16 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + + addq $16 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + + addq $16 * SIZE, %rax + BRANCH + jl .L12 + ALIGN_3 + +.L15: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + testq $4, %rax + je .L16 + xorq %rax, %rax + ALIGN_3 + + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + 
KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + addq $64 * SIZE, BO + addq $32 * SIZE, AO + ALIGN_3 +#else + sarq $2, %rax + NOBRANCH + jle .L16 + ALIGN_3 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + addq $ 64 * SIZE, BO + subq $-32 * SIZE, AO + decq %rax + BRANCH + jg .L12 +#endif + +.L16: + movaps ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L18 + + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO + negq %rax + ALIGN_4 + +.L17: + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO, %rax, 8), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm9 + movaps -24 * SIZE(BO, %rax, 8), %xmm1 + mulps %xmm0, %xmm1 + mulps -20 * SIZE(BO, %rax, 8), %xmm0 + addps %xmm1, %xmm10 + movaps -32 * SIZE(BO, %rax, 8), %xmm1 + addps %xmm0, %xmm11 + movaps -24 * SIZE(AO, %rax, 4), %xmm0 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm12 + movaps -28 * SIZE(BO, %rax, 8), %xmm1 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm13 + movaps -24 * SIZE(BO, %rax, 8), %xmm1 + mulps %xmm2, %xmm1 + mulps -20 * SIZE(BO, %rax, 8), %xmm2 + addps %xmm1, %xmm14 + movaps -16 * SIZE(BO, %rax, 8), %xmm1 + addps %xmm2, %xmm15 + movaps -20 * SIZE(AO, %rax, 4), %xmm2 + + addq $SIZE * 2, %rax + jl .L17 + ALIGN_4 + +.L18: + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm1 + movhps 6 * SIZE(CO1), %xmm1 + movsd 8 * SIZE(CO1), %xmm2 + movhps 10 * SIZE(CO1), %xmm2 + movsd 12 * SIZE(CO1), %xmm3 + movhps 14 * SIZE(CO1), %xmm3 + + pshufd $0x50, %xmm8, %xmm4 + pshufd $0xfa, %xmm8, %xmm8 + pshufd $0x50, %xmm12, %xmm5 + pshufd $0xfa, %xmm12, %xmm12 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm8 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm12 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm8 + addps %xmm2, %xmm5 + addps %xmm3, %xmm12 + + movlps %xmm4, 0 * SIZE(CO1) + movhps %xmm4, 2 * SIZE(CO1) + movlps %xmm8, 4 * SIZE(CO1) + movhps %xmm8, 6 * SIZE(CO1) + movlps %xmm5, 8 * SIZE(CO1) + movhps %xmm5, 10 * SIZE(CO1) + movlps %xmm12, 12 * SIZE(CO1) + movhps %xmm12, 14 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm0 + movhps 2 * SIZE(CO2), %xmm0 + movsd 4 * SIZE(CO2), %xmm1 + movhps 6 * SIZE(CO2), %xmm1 + movsd 8 * SIZE(CO2), %xmm2 + movhps 10 * SIZE(CO2), %xmm2 + movsd 12 * SIZE(CO2), %xmm3 + movhps 14 * SIZE(CO2), %xmm3 + + pshufd $0x50, %xmm9, %xmm4 + pshufd $0xfa, %xmm9, %xmm9 + pshufd $0x50, %xmm13, %xmm5 + pshufd $0xfa, %xmm13, %xmm13 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm9 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm13 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm9 + addps %xmm2, %xmm5 + addps %xmm3, %xmm13 + + movlps %xmm4, 0 * SIZE(CO2) + movhps %xmm4, 2 * SIZE(CO2) + movlps %xmm9, 4 * SIZE(CO2) + movhps %xmm9, 6 * SIZE(CO2) + movlps %xmm5, 8 * SIZE(CO2) + movhps %xmm5, 10 * SIZE(CO2) + movlps %xmm13, 12 * SIZE(CO2) + movhps %xmm13, 14 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm0 + movhps 2 * SIZE(CO1, LDC, 2), %xmm0 + movsd 4 * SIZE(CO1, LDC, 2), %xmm1 + movhps 6 * SIZE(CO1, LDC, 2), %xmm1 + movsd 8 * SIZE(CO1, LDC, 2), %xmm2 + movhps 10 * SIZE(CO1, LDC, 2), %xmm2 + movsd 12 * SIZE(CO1, LDC, 2), %xmm3 + movhps 14 * SIZE(CO1, LDC, 2), %xmm3 + + pshufd $0x50, %xmm10, %xmm4 + pshufd $0xfa, %xmm10, %xmm10 + pshufd $0x50, %xmm14, %xmm5 + pshufd $0xfa, %xmm14, %xmm14 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm10 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm14 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm10 + addps 
%xmm2, %xmm5 + addps %xmm3, %xmm14 + + movlps %xmm4, 0 * SIZE(CO1, LDC, 2) + movhps %xmm4, 2 * SIZE(CO1, LDC, 2) + movlps %xmm10, 4 * SIZE(CO1, LDC, 2) + movhps %xmm10, 6 * SIZE(CO1, LDC, 2) + movlps %xmm5, 8 * SIZE(CO1, LDC, 2) + movhps %xmm5, 10 * SIZE(CO1, LDC, 2) + movlps %xmm14, 12 * SIZE(CO1, LDC, 2) + movhps %xmm14, 14 * SIZE(CO1, LDC, 2) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm0 + movhps 2 * SIZE(CO2, LDC, 2), %xmm0 + movsd 4 * SIZE(CO2, LDC, 2), %xmm1 + movhps 6 * SIZE(CO2, LDC, 2), %xmm1 + movsd 8 * SIZE(CO2, LDC, 2), %xmm2 + movhps 10 * SIZE(CO2, LDC, 2), %xmm2 + movsd 12 * SIZE(CO2, LDC, 2), %xmm3 + movhps 14 * SIZE(CO2, LDC, 2), %xmm3 + + pshufd $0x50, %xmm11, %xmm4 + pshufd $0xfa, %xmm11, %xmm11 + pshufd $0x50, %xmm15, %xmm5 + pshufd $0xfa, %xmm15, %xmm15 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm11 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm15 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm11 + addps %xmm2, %xmm5 + addps %xmm3, %xmm15 + + movlps %xmm4, 0 * SIZE(CO2, LDC, 2) + movhps %xmm4, 2 * SIZE(CO2, LDC, 2) + movlps %xmm11, 4 * SIZE(CO2, LDC, 2) + movhps %xmm11, 6 * SIZE(CO2, LDC, 2) + movlps %xmm5, 8 * SIZE(CO2, LDC, 2) + movhps %xmm5, 10 * SIZE(CO2, LDC, 2) + movlps %xmm15, 12 * SIZE(CO2, LDC, 2) + movhps %xmm15, 14 * SIZE(CO2, LDC, 2) + + addq $16 * SIZE, CO1 # coffset += 4 + addq $16 * SIZE, CO2 # coffset += 4 + decq I # i -- + jg .L11 + ALIGN_4 + +.L20: + testq $4, M + je .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -16 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L25 + ALIGN_4 + +.L22: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps -28 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + mulps 28 * SIZE(BO), %xmm8 + addps %xmm11, %xmm2 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm8, %xmm3 + movaps -24 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm13 + addps %xmm13, %xmm0 + movaps 36 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm1 + movaps 40 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + mulps 44 * SIZE(BO), %xmm8 + addps %xmm13, %xmm2 + movaps 96 * SIZE(BO), %xmm13 + addps %xmm8, %xmm3 + movaps -20 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm15 + addps %xmm15, %xmm0 + movaps 52 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm1 + movaps 56 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + mulps 60 * SIZE(BO), %xmm8 + addps %xmm15, 
%xmm2 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm8, %xmm3 + movaps 0 * SIZE(AO), %xmm8 + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm10, %xmm9 + addps %xmm9, %xmm0 + movaps 68 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm1 + movaps 72 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + mulps 76 * SIZE(BO), %xmm10 + addps %xmm9, %xmm2 + movaps 128 * SIZE(BO), %xmm9 + addps %xmm10, %xmm3 + movaps -12 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movaps 84 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm1 + movaps 88 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + mulps 92 * SIZE(BO), %xmm10 + addps %xmm11, %xmm2 + movaps 144 * SIZE(BO), %xmm11 + addps %xmm10, %xmm3 + movaps -8 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movaps 100 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm1 + movaps 104 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + mulps 108 * SIZE(BO), %xmm10 + addps %xmm13, %xmm2 + movaps 160 * SIZE(BO), %xmm13 + addps %xmm10, %xmm3 + movaps -4 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movaps 116 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm1 + movaps 120 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + mulps 124 * SIZE(BO), %xmm10 + addps %xmm15, %xmm2 + movaps 176 * SIZE(BO), %xmm15 + addps %xmm10, %xmm3 + movaps 16 * SIZE(AO), %xmm10 + + addq $ 32 * SIZE, AO + addq $128 * SIZE, BO + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 16 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps -28 * SIZE(AO), %xmm8 + + addq $ 4 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L26 + ALIGN_4 + +.L28: + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 4 * SIZE(CO1), %xmm9 + movhps 6 * SIZE(CO1), %xmm9 + + pshufd $0x50, %xmm0, %xmm4 + pshufd $0xfa, %xmm0, %xmm0 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm0 + + addps %xmm8, %xmm4 + addps %xmm9, %xmm0 + + movlps %xmm4, 0 * SIZE(CO1) + movhps %xmm4, 2 * SIZE(CO1) + movlps %xmm0, 4 * SIZE(CO1) + movhps %xmm0, 6 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm8 + movhps 2 * SIZE(CO2), %xmm8 + movsd 4 * SIZE(CO2), %xmm9 + movhps 6 * SIZE(CO2), %xmm9 + + pshufd $0x50, %xmm1, %xmm4 + pshufd $0xfa, %xmm1, %xmm1 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm1 + + addps %xmm8, %xmm4 + addps %xmm9, %xmm1 + + movlps %xmm4, 0 * SIZE(CO2) + movhps %xmm4, 2 * SIZE(CO2) + movlps %xmm1, 4 * SIZE(CO2) + movhps %xmm1, 6 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm8 + movhps 2 * SIZE(CO1, LDC, 2), %xmm8 + movsd 4 * SIZE(CO1, LDC, 2), %xmm9 + movhps 6 * SIZE(CO1, LDC, 2), %xmm9 + + pshufd $0x50, %xmm2, %xmm4 + pshufd $0xfa, %xmm2, %xmm2 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm2 + + addps %xmm8, %xmm4 + addps %xmm9, %xmm2 + + movlps %xmm4, 0 * SIZE(CO1, LDC, 2) + movhps %xmm4, 2 * SIZE(CO1, LDC, 2) + movlps %xmm2, 4 * SIZE(CO1, LDC, 2) + movhps %xmm2, 6 * SIZE(CO1, LDC, 2) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm8 + movhps 2 * SIZE(CO2, LDC, 2), %xmm8 + movsd 4 * SIZE(CO2, LDC, 2), %xmm9 + movhps 6 * SIZE(CO2, LDC, 2), %xmm9 + + pshufd $0x50, %xmm3, %xmm4 + pshufd $0xfa, %xmm3, 
%xmm3 + + mulps %xmm7, %xmm4 + mulps %xmm7, %xmm3 + + addps %xmm8, %xmm4 + addps %xmm9, %xmm3 + + movlps %xmm4, 0 * SIZE(CO2, LDC, 2) + movhps %xmm4, 2 * SIZE(CO2, LDC, 2) + movlps %xmm3, 4 * SIZE(CO2, LDC, 2) + movhps %xmm3, 6 * SIZE(CO2, LDC, 2) + + addq $8 * SIZE, CO1 # coffset += 4 + addq $8 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L30: + testq $2, M + je .L40 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -24 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L35 + ALIGN_4 + +.L32: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movaps 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movaps 64 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movaps 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd -28 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movaps 80 * SIZE(BO), %xmm11 + + mulps %xmm8, %xmm13 + addps %xmm13, %xmm0 + movaps 36 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm1 + movaps 40 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm2 + movaps 44 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + movsd -26 * SIZE(AO), %xmm8 + addps %xmm13, %xmm3 + movaps 96 * SIZE(BO), %xmm13 + + mulps %xmm8, %xmm15 + addps %xmm15, %xmm0 + movaps 52 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm1 + movaps 56 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm2 + movaps 60 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + movsd -16 * SIZE(AO), %xmm8 + addps %xmm15, %xmm3 + movaps 112 * SIZE(BO), %xmm15 + + mulps %xmm10, %xmm9 + addps %xmm9, %xmm0 + movaps 68 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm1 + movaps 72 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm2 + movaps 76 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movsd -22 * SIZE(AO), %xmm10 + addps %xmm9, %xmm3 + movaps 128 * SIZE(BO), %xmm9 + + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movaps 84 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm1 + movaps 88 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm2 + movaps 92 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movsd -20 * SIZE(AO), %xmm10 + addps %xmm11, %xmm3 + movaps 144 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movaps 100 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm1 + movaps 104 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 
+ movaps 108 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd -18 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movaps 160 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movaps 116 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm1 + movaps 120 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movaps 124 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd -8 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movaps 176 * SIZE(BO), %xmm15 + + addq $ 16 * SIZE, AO + addq $128 * SIZE, BO + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movaps 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movaps 16 * SIZE(BO), %xmm9 + + addq $ 2 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L36 + ALIGN_4 + +.L38: + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + + pshufd $0x50, %xmm0, %xmm4 + mulps %xmm7, %xmm4 + addps %xmm8, %xmm4 + + movlps %xmm4, 0 * SIZE(CO1) + movhps %xmm4, 2 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm8 + movhps 2 * SIZE(CO2), %xmm8 + + pshufd $0x50, %xmm1, %xmm4 + mulps %xmm7, %xmm4 + addps %xmm8, %xmm4 + + movlps %xmm4, 0 * SIZE(CO2) + movhps %xmm4, 2 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm8 + movhps 2 * SIZE(CO1, LDC, 2), %xmm8 + + pshufd $0x50, %xmm2, %xmm4 + mulps %xmm7, %xmm4 + addps %xmm8, %xmm4 + + movlps %xmm4, 0 * SIZE(CO1, LDC, 2) + movhps %xmm4, 2 * SIZE(CO1, LDC, 2) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm8 + movhps 2 * SIZE(CO2, LDC, 2), %xmm8 + + pshufd $0x50, %xmm3, %xmm4 + mulps %xmm7, %xmm4 + addps %xmm8, %xmm4 + + movlps %xmm4, 0 * SIZE(CO2, LDC, 2) + movhps %xmm4, 2 * SIZE(CO2, LDC, 2) + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L40: + testq $1, M + je .L49 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO + leaq (BO, %rax, 8), BO +#endif + + movss -32 * SIZE(AO), %xmm8 + movss -28 * SIZE(AO), %xmm10 + + movss 0 * SIZE(BO), %xmm9 + movss 16 * SIZE(BO), %xmm11 + movss 32 * SIZE(BO), %xmm13 + movss 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L45 + ALIGN_4 + +.L42: + mulss %xmm8, %xmm9 + addss %xmm9, %xmm0 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movss 4 * SIZE(BO), %xmm9 + mulss %xmm8, %xmm9 + addss %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + mulss %xmm8, %xmm9 + addss %xmm9, %xmm2 + movss 12 * SIZE(BO), %xmm9 + mulss %xmm8, %xmm9 + movss -31 * SIZE(AO), %xmm8 + addss %xmm9, %xmm3 + movss 64 * SIZE(BO), %xmm9 + + mulss %xmm8, %xmm11 + addss %xmm11, %xmm0 + movss 20 * SIZE(BO), %xmm11 + mulss 
%xmm8, %xmm11 + addss %xmm11, %xmm1 + movss 24 * SIZE(BO), %xmm11 + mulss %xmm8, %xmm11 + addss %xmm11, %xmm2 + movss 28 * SIZE(BO), %xmm11 + mulss %xmm8, %xmm11 + movss -30 * SIZE(AO), %xmm8 + addss %xmm11, %xmm3 + movss 80 * SIZE(BO), %xmm11 + + mulss %xmm8, %xmm13 + addss %xmm13, %xmm0 + movss 36 * SIZE(BO), %xmm13 + mulss %xmm8, %xmm13 + addss %xmm13, %xmm1 + movss 40 * SIZE(BO), %xmm13 + mulss %xmm8, %xmm13 + addss %xmm13, %xmm2 + movss 44 * SIZE(BO), %xmm13 + mulss %xmm8, %xmm13 + movss -29 * SIZE(AO), %xmm8 + addss %xmm13, %xmm3 + movss 96 * SIZE(BO), %xmm13 + + mulss %xmm8, %xmm15 + addss %xmm15, %xmm0 + movss 52 * SIZE(BO), %xmm15 + mulss %xmm8, %xmm15 + addss %xmm15, %xmm1 + movss 56 * SIZE(BO), %xmm15 + mulss %xmm8, %xmm15 + addss %xmm15, %xmm2 + movss 60 * SIZE(BO), %xmm15 + mulss %xmm8, %xmm15 + movss -24 * SIZE(AO), %xmm8 + addss %xmm15, %xmm3 + movss 112 * SIZE(BO), %xmm15 + + mulss %xmm10, %xmm9 + addss %xmm9, %xmm0 + movss 68 * SIZE(BO), %xmm9 + mulss %xmm10, %xmm9 + addss %xmm9, %xmm1 + movss 72 * SIZE(BO), %xmm9 + mulss %xmm10, %xmm9 + addss %xmm9, %xmm2 + movss 76 * SIZE(BO), %xmm9 + mulss %xmm10, %xmm9 + movss -27 * SIZE(AO), %xmm10 + addss %xmm9, %xmm3 + movss 128 * SIZE(BO), %xmm9 + + mulss %xmm10, %xmm11 + addss %xmm11, %xmm0 + movss 84 * SIZE(BO), %xmm11 + mulss %xmm10, %xmm11 + addss %xmm11, %xmm1 + movss 88 * SIZE(BO), %xmm11 + mulss %xmm10, %xmm11 + addss %xmm11, %xmm2 + movss 92 * SIZE(BO), %xmm11 + mulss %xmm10, %xmm11 + movss -26 * SIZE(AO), %xmm10 + addss %xmm11, %xmm3 + movss 144 * SIZE(BO), %xmm11 + + mulss %xmm10, %xmm13 + addss %xmm13, %xmm0 + movss 100 * SIZE(BO), %xmm13 + mulss %xmm10, %xmm13 + addss %xmm13, %xmm1 + movss 104 * SIZE(BO), %xmm13 + mulss %xmm10, %xmm13 + addss %xmm13, %xmm2 + movss 108 * SIZE(BO), %xmm13 + mulss %xmm10, %xmm13 + movss -25 * SIZE(AO), %xmm10 + addss %xmm13, %xmm3 + movss 160 * SIZE(BO), %xmm13 + + mulss %xmm10, %xmm15 + addss %xmm15, %xmm0 + movss 116 * SIZE(BO), %xmm15 + mulss %xmm10, %xmm15 + addss %xmm15, %xmm1 + movss 120 * SIZE(BO), %xmm15 + mulss %xmm10, %xmm15 + addss %xmm15, %xmm2 + movss 124 * SIZE(BO), %xmm15 + mulss %xmm10, %xmm15 + movss -20 * SIZE(AO), %xmm10 + addss %xmm15, %xmm3 + movss 176 * SIZE(BO), %xmm15 + + addq $ 8 * SIZE, AO + addq $128 * SIZE, BO + decq %rax + jne .L42 + ALIGN_4 + +.L45: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L48 + ALIGN_4 + +.L46: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movss 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movss 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss -31 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movss 16 * SIZE(BO), %xmm9 + + addq $ 1 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L46 + ALIGN_4 + +.L48: + movsd 0 * SIZE(CO1), %xmm8 + + pshufd $0x50, %xmm0, %xmm4 + mulps %xmm7, %xmm4 + addps %xmm8, %xmm4 + + movlps %xmm4, 0 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm8 + + pshufd $0x50, %xmm1, %xmm4 + mulps %xmm7, %xmm4 + addps %xmm8, %xmm4 + + movlps %xmm4, 0 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm8 + + pshufd $0x50, %xmm2, %xmm4 + mulps %xmm7, %xmm4 + addps %xmm8, %xmm4 + + movlps %xmm4, 0 * SIZE(CO1, LDC, 2) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm8 + + pshufd $0x50, %xmm3, %xmm4 + mulps %xmm7, %xmm4 + addps %xmm8, %xmm4 + + movlps %xmm4, 0 * SIZE(CO2, LDC, 2) + ALIGN_4 + +.L49: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, 
KK +#endif + leaq (C, LDC, 4), C # c += 4 * ldc + decq J # j -- + jg .L01 + +.L50: + testq $2, N + je .L100 + +.L51: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + + movq K, %rax + sarq $2, %rax + jle .L53 + ALIGN_4 + +.L52: +#if defined(PENTIUM4) || defined(GENERIC) + movss 0 * SIZE(B), %xmm0 + movss 1 * SIZE(B), %xmm1 + movss 2 * SIZE(B), %xmm2 + movss 3 * SIZE(B), %xmm3 + movss 4 * SIZE(B), %xmm4 + movss 5 * SIZE(B), %xmm5 + movss 6 * SIZE(B), %xmm6 + movss 7 * SIZE(B), %xmm7 + + PREFETCHNTA 32 * SIZE(B) + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + shufps $0, %xmm4, %xmm4 + shufps $0, %xmm5, %xmm5 + shufps $0, %xmm6, %xmm6 + shufps $0, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO +#endif + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCHNTA 32 * SIZE(B) + + movd 0 * SIZE(B), %mm0 + movd 1 * SIZE(B), %mm1 + movd 2 * SIZE(B), %mm2 + movd 3 * SIZE(B), %mm3 + movd 4 * SIZE(B), %mm4 + movd 5 * SIZE(B), %mm5 + movd 6 * SIZE(B), %mm6 + movd 7 * SIZE(B), %mm7 + + punpckldq %mm0, %mm0 + punpckldq %mm1, %mm1 + punpckldq %mm2, %mm2 + punpckldq %mm3, %mm3 + punpckldq %mm4, %mm4 + punpckldq %mm5, %mm5 + punpckldq %mm6, %mm6 + punpckldq %mm7, %mm7 + + movq %mm0, 0 * SIZE(BO) + movq %mm0, 2 * SIZE(BO) + movq %mm1, 4 * SIZE(BO) + movq %mm1, 6 * SIZE(BO) + movq %mm2, 8 * SIZE(BO) + movq %mm2, 10 * SIZE(BO) + movq %mm3, 12 * SIZE(BO) + movq %mm3, 14 * SIZE(BO) + movq %mm4, 16 * SIZE(BO) + movq %mm4, 18 * SIZE(BO) + movq %mm5, 20 * SIZE(BO) + movq %mm5, 22 * SIZE(BO) + movq %mm6, 24 * SIZE(BO) + movq %mm6, 26 * SIZE(BO) + movq %mm7, 28 * SIZE(BO) + movq %mm7, 30 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO +#endif + + decq %rax + jne .L52 + ALIGN_4 + +.L53: + movq K, %rax + andq $3, %rax + BRANCH + jle .L60 + ALIGN_4 + +.L54: +#if defined(PENTIUM4) || defined(GENERIC) + movss 0 * SIZE(B), %xmm0 + movss 1 * SIZE(B), %xmm1 + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) +#endif + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + movd 0 * SIZE(B), %mm0 + movd 1 * SIZE(B), %mm1 + + punpckldq %mm0, %mm0 + punpckldq %mm1, %mm1 + + movq %mm0, 0 * SIZE(BO) + movq %mm0, 2 * SIZE(BO) + movq %mm1, 4 * SIZE(BO) + movq %mm1, 6 * SIZE(BO) +#endif + + addq $ 2 * SIZE, B + addq $ 8 * SIZE, BO + decq %rax + jne .L54 + ALIGN_4 + +.L60: + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + movq M, I + sarq $3, I # i = (m >> 3) + jle .L70 + ALIGN_4 + +.L61: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -16 * SIZE(AO), %xmm10 + movaps 0 * SIZE(AO), %xmm12 + movaps 16 * SIZE(AO), %xmm14 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + + PREFETCHW 15 * 
SIZE(CO1) + pxor %xmm4, %xmm4 + PREFETCHW 15 * SIZE(CO2) + pxor %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L65 + ALIGN_4 + +.L62: + mulps %xmm8, %xmm9 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 0 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps -28 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps -24 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps -20 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps 32 * SIZE(AO), %xmm8 + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm10, %xmm11 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm11, %xmm0 + movaps 16 * SIZE(BO), %xmm11 + addps %xmm10, %xmm1 + movaps -12 * SIZE(AO), %xmm10 + mulps %xmm10, %xmm11 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm11, %xmm4 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm10, %xmm5 + movaps -8 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm11 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm11, %xmm0 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm10, %xmm1 + movaps -4 * SIZE(AO), %xmm10 + mulps %xmm10, %xmm11 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm11, %xmm4 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm10, %xmm5 + movaps 48 * SIZE(AO), %xmm10 + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) +#endif + mulps %xmm12, %xmm13 + mulps 36 * SIZE(BO), %xmm12 + addps %xmm13, %xmm0 + movaps 32 * SIZE(BO), %xmm13 + addps %xmm12, %xmm1 + movaps 4 * SIZE(AO), %xmm12 + mulps %xmm12, %xmm13 + mulps 36 * SIZE(BO), %xmm12 + addps %xmm13, %xmm4 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm12, %xmm5 + movaps 8 * SIZE(AO), %xmm12 + + mulps %xmm12, %xmm13 + mulps 44 * SIZE(BO), %xmm12 + addps %xmm13, %xmm0 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm12, %xmm1 + movaps 12 * SIZE(AO), %xmm12 + mulps %xmm12, %xmm13 + mulps 44 * SIZE(BO), %xmm12 + addps %xmm13, %xmm4 + movaps 96 * SIZE(BO), %xmm13 + addps %xmm12, %xmm5 + movaps 64 * SIZE(AO), %xmm12 + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) +#endif + mulps %xmm14, %xmm15 + mulps 52 * SIZE(BO), %xmm14 + addps %xmm15, %xmm0 + movaps 48 * SIZE(BO), %xmm15 + addps %xmm14, %xmm1 + movaps 20 * SIZE(AO), %xmm14 + mulps %xmm14, %xmm15 + mulps 52 * SIZE(BO), %xmm14 + addps %xmm15, %xmm4 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm14, %xmm5 + movaps 24 * SIZE(AO), %xmm14 + + mulps %xmm14, %xmm15 + mulps 60 * SIZE(BO), %xmm14 + addps %xmm15, %xmm0 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm14, %xmm1 + movaps 28 * SIZE(AO), %xmm14 + mulps %xmm14, %xmm15 + mulps 60 * SIZE(BO), %xmm14 + addps %xmm15, %xmm4 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm14, %xmm5 + movaps 80 * SIZE(AO), %xmm14 + + addq $64 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L62 + ALIGN_4 + +.L65: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, 
%xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 0 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps -28 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps -24 * SIZE(AO), %xmm8 + + addq $8 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L66 + ALIGN_4 + +.L68: + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 4 * SIZE(CO1), %xmm9 + movhps 6 * SIZE(CO1), %xmm9 + movsd 8 * SIZE(CO1), %xmm10 + movhps 10 * SIZE(CO1), %xmm10 + movsd 12 * SIZE(CO1), %xmm11 + movhps 14 * SIZE(CO1), %xmm11 + + pshufd $0x50, %xmm0, %xmm2 + pshufd $0xfa, %xmm0, %xmm0 + pshufd $0x50, %xmm4, %xmm3 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm7, %xmm2 + mulps %xmm7, %xmm0 + mulps %xmm7, %xmm3 + mulps %xmm7, %xmm4 + + addps %xmm8, %xmm2 + addps %xmm9, %xmm0 + addps %xmm10, %xmm3 + addps %xmm11, %xmm4 + + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 2 * SIZE(CO1) + movlps %xmm0, 4 * SIZE(CO1) + movhps %xmm0, 6 * SIZE(CO1) + movlps %xmm3, 8 * SIZE(CO1) + movhps %xmm3, 10 * SIZE(CO1) + movlps %xmm4, 12 * SIZE(CO1) + movhps %xmm4, 14 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm8 + movhps 2 * SIZE(CO2), %xmm8 + movsd 4 * SIZE(CO2), %xmm9 + movhps 6 * SIZE(CO2), %xmm9 + movsd 8 * SIZE(CO2), %xmm10 + movhps 10 * SIZE(CO2), %xmm10 + movsd 12 * SIZE(CO2), %xmm11 + movhps 14 * SIZE(CO2), %xmm11 + + pshufd $0x50, %xmm1, %xmm2 + pshufd $0xfa, %xmm1, %xmm1 + pshufd $0x50, %xmm5, %xmm3 + pshufd $0xfa, %xmm5, %xmm5 + + mulps %xmm7, %xmm2 + mulps %xmm7, %xmm1 + mulps %xmm7, %xmm3 + mulps %xmm7, %xmm5 + + addps %xmm8, %xmm2 + addps %xmm9, %xmm1 + addps %xmm10, %xmm3 + addps %xmm11, %xmm5 + + movlps %xmm2, 0 * SIZE(CO2) + movhps %xmm2, 2 * SIZE(CO2) + movlps %xmm1, 4 * SIZE(CO2) + movhps %xmm1, 6 * SIZE(CO2) + movlps %xmm3, 8 * SIZE(CO2) + movhps %xmm3, 10 * SIZE(CO2) + movlps %xmm5, 12 * SIZE(CO2) + movhps %xmm5, 14 * SIZE(CO2) + + addq $16 * SIZE, CO1 # coffset += 4 + addq $16 * SIZE, CO2 # coffset += 4 + decq I # i -- + jg .L61 + ALIGN_4 + +.L70: + testq $4, M + je .L80 + + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -16 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L75 + ALIGN_4 + +.L72: + mulps %xmm8, %xmm9 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps -28 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps -24 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm11 + mulps 20 
* SIZE(BO), %xmm8 + addps %xmm11, %xmm0 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm8, %xmm1 + movaps -20 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm11 + mulps 28 * SIZE(BO), %xmm8 + addps %xmm11, %xmm2 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm8, %xmm3 + movaps 0 * SIZE(AO), %xmm8 + + mulps %xmm10, %xmm13 + mulps 36 * SIZE(BO), %xmm10 + addps %xmm13, %xmm0 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm10, %xmm1 + movaps -12 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm13 + mulps 44 * SIZE(BO), %xmm10 + addps %xmm13, %xmm2 + movaps 96 * SIZE(BO), %xmm13 + addps %xmm10, %xmm3 + movaps -8 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm15 + mulps 52 * SIZE(BO), %xmm10 + addps %xmm15, %xmm0 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm10, %xmm1 + movaps -4 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm15 + mulps 60 * SIZE(BO), %xmm10 + addps %xmm15, %xmm2 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm10, %xmm3 + movaps 16 * SIZE(AO), %xmm10 + + addq $32 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L72 + ALIGN_4 + +.L75: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps -28 * SIZE(AO), %xmm8 + + addq $4 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L76 + ALIGN_4 + +.L78: + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 4 * SIZE(CO1), %xmm9 + movhps 6 * SIZE(CO1), %xmm9 + + pshufd $0x50, %xmm0, %xmm2 + pshufd $0xfa, %xmm0, %xmm0 + + mulps %xmm7, %xmm2 + mulps %xmm7, %xmm0 + addps %xmm8, %xmm2 + addps %xmm9, %xmm0 + + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 2 * SIZE(CO1) + movlps %xmm0, 4 * SIZE(CO1) + movhps %xmm0, 6 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm8 + movhps 2 * SIZE(CO2), %xmm8 + movsd 4 * SIZE(CO2), %xmm9 + movhps 6 * SIZE(CO2), %xmm9 + + pshufd $0x50, %xmm1, %xmm2 + pshufd $0xfa, %xmm1, %xmm1 + + mulps %xmm7, %xmm2 + mulps %xmm7, %xmm1 + addps %xmm8, %xmm2 + addps %xmm9, %xmm1 + + movlps %xmm2, 0 * SIZE(CO2) + movhps %xmm2, 2 * SIZE(CO2) + movlps %xmm1, 4 * SIZE(CO2) + movhps %xmm1, 6 * SIZE(CO2) + + addq $8 * SIZE, CO1 # coffset += 4 + addq $8 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L80: + testq $2, M + je .L90 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -24 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L85 + ALIGN_4 + +.L82: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 
+ + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movaps 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd -28 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movaps 64 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd -26 * SIZE(AO), %xmm8 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movaps 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd -16 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movaps 80 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movaps 36 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd -22 * SIZE(AO), %xmm10 + addps %xmm13, %xmm1 + movaps 40 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movaps 44 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd -20 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movaps 96 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movaps 52 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd -18 * SIZE(AO), %xmm10 + addps %xmm15, %xmm1 + movaps 56 * SIZE(BO), %xmm15 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movaps 60 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd -8 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movaps 112 * SIZE(BO), %xmm15 + + addq $16 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L82 + ALIGN_4 + +.L85: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L88 + ALIGN_4 + +.L86: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L86 + ALIGN_4 + +.L88: + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + + pshufd $0x50, %xmm0, %xmm2 + mulps %xmm7, %xmm2 + addps %xmm8, %xmm2 + + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 2 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm8 + movhps 2 * SIZE(CO2), %xmm8 + + pshufd $0x50, %xmm1, %xmm2 + mulps %xmm7, %xmm2 + addps %xmm8, %xmm2 + + movlps %xmm2, 0 * SIZE(CO2) + movhps %xmm2, 2 * SIZE(CO2) + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L90: + testq $1, M + je .L99 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + + movss -32 * SIZE(AO), %xmm8 + movss -28 * SIZE(AO), %xmm10 + + movss 0 * SIZE(BO), %xmm9 + movss 16 * SIZE(BO), %xmm11 + movss 32 * SIZE(BO), %xmm13 + movss 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L95 + ALIGN_4 + +.L92: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movss 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss -31 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + + mulps %xmm8, 
%xmm9 + addps %xmm9, %xmm2 + movss 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movss 64 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movss 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movss -29 * SIZE(AO), %xmm8 + addps %xmm11, %xmm1 + movss 24 * SIZE(BO), %xmm11 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movss 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movss -24 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movss 80 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movss 36 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movss -27 * SIZE(AO), %xmm10 + addps %xmm13, %xmm1 + movss 40 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movss 44 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movss -26 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movss 96 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movss 52 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movss -25 * SIZE(AO), %xmm10 + addps %xmm15, %xmm1 + movss 56 * SIZE(BO), %xmm15 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movss 60 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movss -20 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movss 112 * SIZE(BO), %xmm15 + + addq $ 8 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L92 + ALIGN_4 + +.L95: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L98 + ALIGN_4 + +.L96: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movss 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss -31 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movss 8 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L96 + ALIGN_4 + +.L98: + addss %xmm2, %xmm0 + addss %xmm3, %xmm1 + + movsd 0 * SIZE(CO1), %xmm8 + + pshufd $0x50, %xmm0, %xmm2 + mulps %xmm7, %xmm2 + addps %xmm8, %xmm2 + + movlps %xmm2, 0 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm8 + + pshufd $0x50, %xmm1, %xmm2 + mulps %xmm7, %xmm2 + addps %xmm8, %xmm2 + + movlps %xmm2, 0 * SIZE(CO2) + ALIGN_4 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + leaq (C, LDC, 2), C # c += 4 * ldc + ALIGN_4 + + +.L100: + testq $1, N + je .L999 + +.L101: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + + movq K, %rax + sarq $3, %rax + jle .L103 + ALIGN_4 + + +.L102: +#if defined(PENTIUM4) || defined(GENERIC) + movss 0 * SIZE(B), %xmm0 + movss 1 * SIZE(B), %xmm1 + movss 2 * SIZE(B), %xmm2 + movss 3 * SIZE(B), %xmm3 + movss 4 * SIZE(B), %xmm4 + movss 5 * SIZE(B), %xmm5 + movss 6 * SIZE(B), %xmm6 + movss 7 * SIZE(B), %xmm7 + + PREFETCHNTA 32 * SIZE(B) + + shufps $0, %xmm0, %xmm0 + shufps $0, %xmm1, %xmm1 + shufps $0, %xmm2, %xmm2 + shufps $0, %xmm3, %xmm3 + shufps $0, %xmm4, %xmm4 + shufps $0, %xmm5, %xmm5 + shufps $0, %xmm6, %xmm6 + shufps $0, %xmm7, %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO +#endif + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCHNTA 32 * SIZE(B) + + movd 0 * SIZE(B), %mm0 + movd 1 * SIZE(B), %mm1 + movd 2 * SIZE(B), %mm2 + movd 3 * SIZE(B), %mm3 + movd 4 * SIZE(B), %mm4 + movd 5 * SIZE(B), %mm5 + movd 6 * SIZE(B), %mm6 + movd 7 * SIZE(B), 
%mm7 + + punpckldq %mm0, %mm0 + punpckldq %mm1, %mm1 + punpckldq %mm2, %mm2 + punpckldq %mm3, %mm3 + punpckldq %mm4, %mm4 + punpckldq %mm5, %mm5 + punpckldq %mm6, %mm6 + punpckldq %mm7, %mm7 + + movq %mm0, 0 * SIZE(BO) + movq %mm0, 2 * SIZE(BO) + movq %mm1, 4 * SIZE(BO) + movq %mm1, 6 * SIZE(BO) + movq %mm2, 8 * SIZE(BO) + movq %mm2, 10 * SIZE(BO) + movq %mm3, 12 * SIZE(BO) + movq %mm3, 14 * SIZE(BO) + movq %mm4, 16 * SIZE(BO) + movq %mm4, 18 * SIZE(BO) + movq %mm5, 20 * SIZE(BO) + movq %mm5, 22 * SIZE(BO) + movq %mm6, 24 * SIZE(BO) + movq %mm6, 26 * SIZE(BO) + movq %mm7, 28 * SIZE(BO) + movq %mm7, 30 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO +#endif + + decq %rax + jne .L102 + ALIGN_4 + +.L103: + movq K, %rax + andq $7, %rax + BRANCH + jle .L110 + ALIGN_4 + +.L104: +#if defined(PENTIUM4) || defined(GENERIC) + movss 0 * SIZE(B), %xmm0 + shufps $0, %xmm0, %xmm0 + movaps %xmm0, 0 * SIZE(BO) +#endif + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + movd 0 * SIZE(B), %mm0 + punpckldq %mm0, %mm0 + movq %mm0, 0 * SIZE(BO) + movq %mm0, 2 * SIZE(BO) +#endif + + addq $ 1 * SIZE, B + addq $ 4 * SIZE, BO + decq %rax + jne .L104 + ALIGN_4 + +.L110: + movq C, CO1 # coffset1 = c + movq A, AO # aoffset = a + + movq M, I + sarq $3, I # i = (m >> 3) + jle .L120 + ALIGN_4 + +.L111: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -16 * SIZE(AO), %xmm10 + movaps 0 * SIZE(AO), %xmm12 + movaps 16 * SIZE(AO), %xmm14 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + + PREFETCHW 15 * SIZE(CO1) + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L115 + ALIGN_4 + +.L112: + mulps %xmm9, %xmm8 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + + mulps -28 * SIZE(AO), %xmm9 + addps %xmm8, %xmm0 + movaps -24 * SIZE(AO), %xmm8 + addps %xmm9, %xmm4 + movaps 4 * SIZE(BO), %xmm9 + + mulps %xmm9, %xmm8 + mulps -20 * SIZE(AO), %xmm9 + addps %xmm8, %xmm0 + movaps 32 * SIZE(AO), %xmm8 + addps %xmm9, %xmm4 + movaps 8 * SIZE(BO), %xmm9 + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm9, %xmm10 + mulps -12 * SIZE(AO), %xmm9 + addps %xmm10, %xmm0 + movaps -8 * SIZE(AO), %xmm10 + addps %xmm9, %xmm4 + movaps 12 * SIZE(BO), %xmm9 + + mulps %xmm9, %xmm10 + mulps -4 * SIZE(AO), %xmm9 + addps %xmm10, %xmm0 + movaps 48 * SIZE(AO), %xmm10 + addps %xmm9, %xmm4 + movaps 32 * SIZE(BO), %xmm9 + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) +#endif + mulps %xmm11, %xmm12 + mulps 4 * SIZE(AO), %xmm11 + addps %xmm12, %xmm0 + movaps 8 * SIZE(AO), %xmm12 + addps %xmm11, %xmm4 + movaps 20 * SIZE(BO), %xmm11 + + mulps %xmm11, %xmm12 + mulps 12 * SIZE(AO), %xmm11 + addps %xmm12, %xmm0 + movaps 64 * SIZE(AO), %xmm12 + addps %xmm11, 
%xmm4 + movaps 24 * SIZE(BO), %xmm11 + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) +#endif + mulps %xmm11, %xmm14 + mulps 20 * SIZE(AO), %xmm11 + addps %xmm14, %xmm0 + movaps 24 * SIZE(AO), %xmm14 + addps %xmm11, %xmm4 + movaps 28 * SIZE(BO), %xmm11 + + mulps %xmm11, %xmm14 + mulps 28 * SIZE(AO), %xmm11 + addps %xmm14, %xmm0 + movaps 80 * SIZE(AO), %xmm14 + addps %xmm11, %xmm4 + movaps 48 * SIZE(BO), %xmm11 + + addq $64 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L112 + ALIGN_4 + +.L115: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulps %xmm9, %xmm8 + mulps -28 * SIZE(AO), %xmm9 + addps %xmm8, %xmm0 + movaps -24 * SIZE(AO), %xmm8 + addps %xmm9, %xmm4 + movaps 4 * SIZE(BO), %xmm9 + + addq $8 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L116 + ALIGN_4 + +.L118: + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 4 * SIZE(CO1), %xmm9 + movhps 6 * SIZE(CO1), %xmm9 + movsd 8 * SIZE(CO1), %xmm10 + movhps 10 * SIZE(CO1), %xmm10 + movsd 12 * SIZE(CO1), %xmm11 + movhps 14 * SIZE(CO1), %xmm11 + + pshufd $0x50, %xmm0, %xmm2 + pshufd $0xfa, %xmm0, %xmm0 + pshufd $0x50, %xmm4, %xmm3 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm7, %xmm2 + mulps %xmm7, %xmm0 + mulps %xmm7, %xmm3 + mulps %xmm7, %xmm4 + + addps %xmm8, %xmm2 + addps %xmm9, %xmm0 + addps %xmm10, %xmm3 + addps %xmm11, %xmm4 + + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 2 * SIZE(CO1) + movlps %xmm0, 4 * SIZE(CO1) + movhps %xmm0, 6 * SIZE(CO1) + movlps %xmm3, 8 * SIZE(CO1) + movhps %xmm3, 10 * SIZE(CO1) + movlps %xmm4, 12 * SIZE(CO1) + movhps %xmm4, 14 * SIZE(CO1) + + addq $16 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L111 + ALIGN_4 + +.L120: + testq $4, M + je .L130 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -16 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L125 + ALIGN_4 + +.L122: + mulps %xmm8, %xmm9 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps -28 * SIZE(AO), %xmm8 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 32 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps -24 * SIZE(AO), %xmm8 + mulps 8 * SIZE(BO), %xmm8 + addps %xmm8, %xmm2 + movaps -20 * SIZE(AO), %xmm8 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm8, %xmm3 + movaps 0 * SIZE(AO), %xmm8 + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm10, %xmm11 + movaps -12 * SIZE(AO), %xmm10 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm11, %xmm0 + movaps 48 * SIZE(BO), %xmm11 + addps %xmm10, %xmm1 + movaps -8 * SIZE(AO), %xmm10 + mulps 24 * SIZE(BO), %xmm10 + addps %xmm10, %xmm2 + movaps -4 * 
SIZE(AO), %xmm10 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm10, %xmm3 + movaps 16 * SIZE(AO), %xmm10 + + addq $32 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L122 + ALIGN_4 + +.L125: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L128 + ALIGN_4 + +.L126: + mulps %xmm8, %xmm9 + movaps -28 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + + addq $4 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L126 + ALIGN_4 + +.L128: + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + addps %xmm2, %xmm0 + + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 4 * SIZE(CO1), %xmm9 + movhps 6 * SIZE(CO1), %xmm9 + + pshufd $0x50, %xmm0, %xmm2 + pshufd $0xfa, %xmm0, %xmm0 + + mulps %xmm7, %xmm2 + mulps %xmm7, %xmm0 + + addps %xmm8, %xmm2 + addps %xmm9, %xmm0 + + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 2 * SIZE(CO1) + movlps %xmm0, 4 * SIZE(CO1) + movhps %xmm0, 6 * SIZE(CO1) + + addq $8 * SIZE, CO1 # coffset += 4 + ALIGN_4 + +.L130: + testq $2, M + je .L140 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + + movaps -32 * SIZE(AO), %xmm8 + movaps -24 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L135 + ALIGN_4 + +.L132: + mulps %xmm8, %xmm9 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movsd -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd -28 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + movsd -26 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movaps 12 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + movsd -16 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movaps 32 * SIZE(BO), %xmm9 + + mulps %xmm10, %xmm11 + movsd -22 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm11 + movsd -20 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm11 + movsd -18 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movaps 28 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm11 + movsd -8 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movaps 48 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L132 + ALIGN_4 + +.L135: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L138 + ALIGN_4 + +.L136: + mulps %xmm8, %xmm9 + movsd -30 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L136 + ALIGN_4 + +.L138: + addps %xmm1, %xmm0 + + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + + pshufd $0x50, %xmm0, %xmm2 + mulps %xmm7, %xmm2 + addps %xmm8, %xmm2 + + movlps %xmm2, 0 * SIZE(CO1) + 
movhps %xmm2, 2 * SIZE(CO1) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + +.L140: + testq $1, M + je .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + movss -32 * SIZE(AO), %xmm8 + movss -28 * SIZE(AO), %xmm10 + + movss 0 * SIZE(BO), %xmm9 + movss 16 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L145 + ALIGN_4 + +.L142: + mulss %xmm8, %xmm9 +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movss -31 * SIZE(AO), %xmm8 + mulss 4 * SIZE(BO), %xmm8 + addss %xmm9, %xmm0 + movss 32 * SIZE(BO), %xmm9 + addss %xmm8, %xmm1 + movss -30 * SIZE(AO), %xmm8 + mulss 8 * SIZE(BO), %xmm8 + addss %xmm8, %xmm2 + movss -29 * SIZE(AO), %xmm8 + mulss 12 * SIZE(BO), %xmm8 + addss %xmm8, %xmm3 + movss -24 * SIZE(AO), %xmm8 + mulss %xmm10, %xmm11 + movss -27 * SIZE(AO), %xmm10 + mulss 20 * SIZE(BO), %xmm10 + addss %xmm11, %xmm0 + movss 48 * SIZE(BO), %xmm11 + addss %xmm10, %xmm1 + movss -26 * SIZE(AO), %xmm10 + mulss 24 * SIZE(BO), %xmm10 + addss %xmm10, %xmm2 + movss -25 * SIZE(AO), %xmm10 + mulss 28 * SIZE(BO), %xmm10 + addss %xmm10, %xmm3 + movss -20 * SIZE(AO), %xmm10 + + addq $ 8 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L142 + ALIGN_4 + +.L145: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L148 + ALIGN_4 + +.L146: + mulss %xmm8, %xmm9 + movss -31 * SIZE(AO), %xmm8 + addss %xmm9, %xmm0 + movss 4 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + decq %rax + jg .L146 + ALIGN_4 + +.L148: + addss %xmm1, %xmm0 + addss %xmm3, %xmm2 + addss %xmm2, %xmm0 + + movsd 0 * SIZE(CO1), %xmm8 + + pshufd $0x50, %xmm0, %xmm2 + mulps %xmm7, %xmm2 + addps %xmm8, %xmm2 + + movlps %xmm2, 0 * SIZE(CO1) + ALIGN_4 + +.L999: + movq %rbx, %rsp + EMMS + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm3m_kernel_8x4_sse3.S b/kernel/x86_64/zgemm3m_kernel_8x4_sse3.S new file mode 100644 index 0000000000..67537a7025 --- /dev/null +++ b/kernel/x86_64/zgemm3m_kernel_8x4_sse3.S @@ -0,0 +1,3075 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %rdi +#define N %rsi +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %r12 +#define BO %r13 +#define CO1 %r14 +#define CO2 %r15 +#define BB %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define ALPHA 0(%rsp) +#define J 16(%rsp) +#define OFFSET 24(%rsp) +#define KK 32(%rsp) +#define KKK 40(%rsp) +#define BUFFER 128(%rsp) + +#define PREFETCH prefetcht0 +#define PREFETCHSIZE 320 + +#define KERNEL1(address) \ + mulps %xmm8, %xmm9; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * SIZE(AO); \ + addps %xmm9, %xmm0; \ + movshdup 0 * SIZE + (address) * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm1; \ + movsldup 4 * SIZE + (address) * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm2; \ + movshdup 4 * SIZE + (address) * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + movaps 4 * SIZE + (address) * SIZE(AO), %xmm8; \ + addps %xmm9, %xmm3; \ + movsldup 0 * SIZE + (address) * SIZE(BO), %xmm9 + +#define KERNEL2(address) \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm4; \ + movshdup 0 * SIZE + (address) * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm5; \ + movsldup 4 * SIZE + (address) * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm6; \ + movshdup 4 * SIZE + (address) * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + movaps 8 * SIZE + (address) * SIZE(AO), %xmm8; \ + addps %xmm9, %xmm7; \ + movsldup 8 * SIZE + (address) * SIZE(BO), %xmm9 + +#define KERNEL3(address) \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm0; \ 
+ movshdup 8 * SIZE + (address) * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm1; \ + movsldup 12 * SIZE + (address) * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm2; \ + movshdup 12 * SIZE + (address) * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + movaps 12 * SIZE + (address) * SIZE(AO), %xmm8; \ + addps %xmm9, %xmm3; \ + movsldup 8 * SIZE + (address) * SIZE(BO), %xmm9 + +#define KERNEL4(address) \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm4; \ + movshdup 8 * SIZE + (address) * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm5; \ + movsldup 12 * SIZE + (address) * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm6; \ + movshdup 12 * SIZE + (address) * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + movaps 64 * SIZE + (address) * SIZE(AO), %xmm8; \ + addps %xmm9, %xmm7; \ + movsldup 64 * SIZE + (address) * SIZE(BO), %xmm9 + +#define KERNEL5(address) \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm0; \ + movshdup 16 * SIZE + (address) * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm1; \ + movsldup 20 * SIZE + (address) * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm2; \ + movshdup 20 * SIZE + (address) * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + movaps 20 * SIZE + (address) * SIZE(AO), %xmm10; \ + addps %xmm11, %xmm3; \ + movsldup 16 * SIZE + (address) * SIZE(BO), %xmm11 + +#define KERNEL6(address) \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm4; \ + movshdup 16 * SIZE + (address) * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm5; \ + movsldup 20 * SIZE + (address) * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm6; \ + movshdup 20 * SIZE + (address) * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + movaps 24 * SIZE + (address) * SIZE(AO), %xmm10; \ + addps %xmm11, %xmm7; \ + movsldup 24 * SIZE + (address) * SIZE(BO), %xmm11 + +#define KERNEL7(address) \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm0; \ + movshdup 24 * SIZE + (address) * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm1; \ + movsldup 28 * SIZE + (address) * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm2; \ + movshdup 28 * SIZE + (address) * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + movaps 28 * SIZE + (address) * SIZE(AO), %xmm10; \ + addps %xmm11, %xmm3; \ + movsldup 24 * SIZE + (address) * SIZE(BO), %xmm11 + +#define KERNEL8(address) \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm4; \ + movshdup 24 * SIZE + (address) * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm5; \ + movsldup 28 * SIZE + (address) * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm6; \ + movshdup 28 * SIZE + (address) * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + movaps 80 * SIZE + (address) * SIZE(AO), %xmm10; \ + addps %xmm11, %xmm7; \ + movsldup 80 * SIZE + (address) * SIZE(BO), %xmm11 + +#define KERNEL9(address) \ + mulps %xmm12, %xmm13; \ + PREFETCH (PREFETCHSIZE + 32) * SIZE + (address) * SIZE(AO); \ + addps %xmm13, %xmm0; \ + movshdup 32 * SIZE + (address) * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm1; \ + movsldup 36 * SIZE + (address) * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm2; \ + movshdup 36 * SIZE + (address) * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + movaps 36 * SIZE + (address) * SIZE(AO), %xmm12; \ + addps %xmm13, %xmm3; \ + movsldup 32 * SIZE + (address) * SIZE(BO), %xmm13 + +#define KERNEL10(address) \ + mulps %xmm12, %xmm13; \ + addps %xmm13, 
%xmm4; \ + movshdup 32 * SIZE + (address) * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm5; \ + movsldup 36 * SIZE + (address) * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm6; \ + movshdup 36 * SIZE + (address) * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + movaps 40 * SIZE + (address) * SIZE(AO), %xmm12; \ + addps %xmm13, %xmm7; \ + movsldup 40 * SIZE + (address) * SIZE(BO), %xmm13 + +#define KERNEL11(address) \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm0; \ + movshdup 40 * SIZE + (address) * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm1; \ + movsldup 44 * SIZE + (address) * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm2; \ + movshdup 44 * SIZE + (address) * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + movaps 44 * SIZE + (address) * SIZE(AO), %xmm12; \ + addps %xmm13, %xmm3; \ + movsldup 40 * SIZE + (address) * SIZE(BO), %xmm13 + +#define KERNEL12(address) \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm4; \ + movshdup 40 * SIZE + (address) * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm5; \ + movsldup 44 * SIZE + (address) * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm6; \ + movshdup 44 * SIZE + (address) * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + movaps 96 * SIZE + (address) * SIZE(AO), %xmm12; \ + addps %xmm13, %xmm7; \ + movsldup 96 * SIZE + (address) * SIZE(BO), %xmm13 + +#define KERNEL13(address) \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm0; \ + movshdup 48 * SIZE + (address) * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm1; \ + movsldup 52 * SIZE + (address) * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm2; \ + movshdup 52 * SIZE + (address) * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + movaps 52 * SIZE + (address) * SIZE(AO), %xmm14; \ + addps %xmm15, %xmm3; \ + movsldup 48 * SIZE + (address) * SIZE(BO), %xmm15 + +#define KERNEL14(address) \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm4; \ + movshdup 48 * SIZE + (address) * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm5; \ + movsldup 52 * SIZE + (address) * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm6; \ + movshdup 52 * SIZE + (address) * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + movaps 56 * SIZE + (address) * SIZE(AO), %xmm14; \ + addps %xmm15, %xmm7; \ + movsldup 56 * SIZE + (address) * SIZE(BO), %xmm15 + +#define KERNEL15(address) \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm0; \ + movshdup 56 * SIZE + (address) * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm1; \ + movsldup 60 * SIZE + (address) * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm2; \ + movshdup 60 * SIZE + (address) * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + movaps 60 * SIZE + (address) * SIZE(AO), %xmm14; \ + addps %xmm15, %xmm3; \ + movsldup 56 * SIZE + (address) * SIZE(BO), %xmm15 + +#define KERNEL16(address) \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm4; \ + movshdup 56 * SIZE + (address) * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm5; \ + movsldup 60 * SIZE + (address) * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm6; \ + movshdup 60 * SIZE + (address) * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + movaps 112 * SIZE + (address) * SIZE(AO), %xmm14; \ + addps %xmm15, %xmm7; \ + movsldup 112 * SIZE + (address) * SIZE(BO), %xmm15 + +#if defined(OS_LINUX) && defined(CORE_BARCELONA) + .align 32768 +#endif + PROLOGUE + 
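
A minimal C sketch of the arithmetic behind the KERNEL1..KERNEL16 macros above, assuming BO already holds the broadcast-packed copy of B built by the .L02 loop (movddup at pack time plus movsldup/movshdup at use time give a full four-lane broadcast of each B scalar); the function and variable names below are illustrative only, not identifiers from this file:

/*
 * Sketch only: one k step of the 8x4 micro-kernel.  The assembly keeps
 * the accumulators in xmm0..xmm7 (xmm0..xmm3 = rows 0-3 of the four C
 * columns, xmm4..xmm7 = rows 4-7) and the broadcast B values in
 * xmm9/xmm11/xmm13/xmm15; two consecutive KERNEL macros cover rows 0-3
 * and rows 4-7 of the same k step.
 */
static void microkernel_step_8x4(float acc[8][4], const float a[8],
                                 const float b[4])
{
    for (int i = 0; i < 8; i++)        /* eight rows of the A panel   */
        for (int j = 0; j < 4; j++)    /* four columns of the B panel */
            acc[i][j] += a[i] * b[j];  /* rank-1 update per k step    */
}

Iterating this over the whole k loop yields the real 8x4 partial product that the store code after .L15 folds into the complex C tile.
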
PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, M + movq ARG2, N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm4 +#endif + movaps %xmm3, %xmm0 + movss OLD_ALPHA_I, %xmm1 +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm4 +#endif + +#endif + + movq %rsp, %rbx # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-1024, %rsp # align stack + + STACK_TOUCHING + + movss %xmm0, 0 + ALPHA + movss %xmm1, 4 + ALPHA + movss %xmm0, 8 + ALPHA + movss %xmm1, 12 + ALPHA + +#ifdef TRMMKERNEL + movsd %xmm4, OFFSET + movsd %xmm4, KK +#ifndef LEFT + negq KK +#endif +#endif + + salq $ZBASE_SHIFT, LDC + + movq N, J + sarq $2, J # j = (n >> 2) + jle .L50 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + + movq K, %rax + sarq $2, %rax + jle .L03 + ALIGN_4 + +.L02: + movddup 0 * SIZE(B), %xmm0 + movddup 2 * SIZE(B), %xmm1 + movddup 4 * SIZE(B), %xmm2 + movddup 6 * SIZE(B), %xmm3 + movddup 8 * SIZE(B), %xmm4 + movddup 10 * SIZE(B), %xmm5 + movddup 12 * SIZE(B), %xmm6 + movddup 14 * SIZE(B), %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + prefetcht1 128 * SIZE(BO) + prefetcht0 112 * SIZE(B) + + addq $16 * SIZE, B + addq $32 * SIZE, BO + + decq %rax + jne .L02 + ALIGN_4 + +.L03: + movq K, %rax + andq $3, %rax + BRANCH + jle .L10 + ALIGN_4 + +.L04: + movddup 0 * SIZE(B), %xmm0 + movddup 2 * SIZE(B), %xmm1 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + + addq $4 * SIZE, B + addq $8 * SIZE, BO + decq %rax + jne .L04 + ALIGN_4 + +.L10: + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + leaq 112 * SIZE(B), BB + + movq M, I + sarq $3, I # i = (m >> 3) + jle .L20 + ALIGN_4 + +.L11: + prefetcht0 0 * SIZE(BB) + prefetcht0 8 * SIZE(BB) + subq $-16 * SIZE, BB + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + movaps 32 * SIZE(AO), %xmm12 + movaps 48 * SIZE(AO), %xmm14 + + movsldup 0 * SIZE(BO), %xmm9 + movsldup 16 * SIZE(BO), %xmm11 + movsldup 32 * SIZE(BO), %xmm13 + movsldup 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + + prefetchnta 15 * SIZE(CO1) + pxor %xmm4, %xmm4 + prefetchnta 15 * SIZE(CO2) + pxor %xmm5, %xmm5 + prefetchnta 15 * SIZE(CO1, LDC, 2) + pxor %xmm6, %xmm6 + prefetchnta 15 * SIZE(CO2, LDC, 2) + pxor %xmm7, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && 
defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + +#if 1 + andq $-8, %rax + salq $4, %rax + je .L15 + +.L1X: + KERNEL1 (64 * 0) + KERNEL2 (64 * 0) + KERNEL3 (64 * 0) + KERNEL4 (64 * 0) + KERNEL5 (64 * 0) + KERNEL6 (64 * 0) + KERNEL7 (64 * 0) + KERNEL8 (64 * 0) + KERNEL9 (64 * 0) + KERNEL10(64 * 0) + KERNEL11(64 * 0) + KERNEL12(64 * 0) + KERNEL13(64 * 0) + KERNEL14(64 * 0) + KERNEL15(64 * 0) + KERNEL16(64 * 0) + cmpq $128 * 1, %rax + NOBRANCH + jle .L12 + KERNEL1 (64 * 1) + KERNEL2 (64 * 1) + KERNEL3 (64 * 1) + KERNEL4 (64 * 1) + KERNEL5 (64 * 1) + KERNEL6 (64 * 1) + KERNEL7 (64 * 1) + KERNEL8 (64 * 1) + KERNEL9 (64 * 1) + KERNEL10(64 * 1) + KERNEL11(64 * 1) + KERNEL12(64 * 1) + KERNEL13(64 * 1) + KERNEL14(64 * 1) + KERNEL15(64 * 1) + KERNEL16(64 * 1) + cmpq $128 * 2, %rax + NOBRANCH + jle .L12 + KERNEL1 (64 * 2) + KERNEL2 (64 * 2) + KERNEL3 (64 * 2) + KERNEL4 (64 * 2) + KERNEL5 (64 * 2) + KERNEL6 (64 * 2) + KERNEL7 (64 * 2) + KERNEL8 (64 * 2) + KERNEL9 (64 * 2) + KERNEL10(64 * 2) + KERNEL11(64 * 2) + KERNEL12(64 * 2) + KERNEL13(64 * 2) + KERNEL14(64 * 2) + KERNEL15(64 * 2) + KERNEL16(64 * 2) + cmpq $128 * 3, %rax + NOBRANCH + jle .L12 + KERNEL1 (64 * 3) + KERNEL2 (64 * 3) + KERNEL3 (64 * 3) + KERNEL4 (64 * 3) + KERNEL5 (64 * 3) + KERNEL6 (64 * 3) + KERNEL7 (64 * 3) + KERNEL8 (64 * 3) + KERNEL9 (64 * 3) + KERNEL10(64 * 3) + KERNEL11(64 * 3) + KERNEL12(64 * 3) + KERNEL13(64 * 3) + KERNEL14(64 * 3) + KERNEL15(64 * 3) + KERNEL16(64 * 3) + cmpq $128 * 4, %rax + NOBRANCH + jle .L12 + KERNEL1 (64 * 4) + KERNEL2 (64 * 4) + KERNEL3 (64 * 4) + KERNEL4 (64 * 4) + KERNEL5 (64 * 4) + KERNEL6 (64 * 4) + KERNEL7 (64 * 4) + KERNEL8 (64 * 4) + KERNEL9 (64 * 4) + KERNEL10(64 * 4) + KERNEL11(64 * 4) + KERNEL12(64 * 4) + KERNEL13(64 * 4) + KERNEL14(64 * 4) + KERNEL15(64 * 4) + KERNEL16(64 * 4) + cmpq $128 * 5, %rax + NOBRANCH + jle .L12 + KERNEL1 (64 * 5) + KERNEL2 (64 * 5) + KERNEL3 (64 * 5) + KERNEL4 (64 * 5) + KERNEL5 (64 * 5) + KERNEL6 (64 * 5) + KERNEL7 (64 * 5) + KERNEL8 (64 * 5) + KERNEL9 (64 * 5) + KERNEL10(64 * 5) + KERNEL11(64 * 5) + KERNEL12(64 * 5) + KERNEL13(64 * 5) + KERNEL14(64 * 5) + KERNEL15(64 * 5) + KERNEL16(64 * 5) + cmpq $128 * 6, %rax + NOBRANCH + jle .L12 + KERNEL1 (64 * 6) + KERNEL2 (64 * 6) + KERNEL3 (64 * 6) + KERNEL4 (64 * 6) + KERNEL5 (64 * 6) + KERNEL6 (64 * 6) + KERNEL7 (64 * 6) + KERNEL8 (64 * 6) + KERNEL9 (64 * 6) + KERNEL10(64 * 6) + KERNEL11(64 * 6) + KERNEL12(64 * 6) + KERNEL13(64 * 6) + KERNEL14(64 * 6) + KERNEL15(64 * 6) + KERNEL16(64 * 6) + cmpq $128 * 7, %rax + NOBRANCH + jle .L12 + KERNEL1 (64 * 7) + KERNEL2 (64 * 7) + KERNEL3 (64 * 7) + KERNEL4 (64 * 7) + KERNEL5 (64 * 7) + KERNEL6 (64 * 7) + KERNEL7 (64 * 7) + KERNEL8 (64 * 7) + KERNEL9 (64 * 7) + KERNEL10(64 * 7) + KERNEL11(64 * 7) + KERNEL12(64 * 7) + KERNEL13(64 * 7) + KERNEL14(64 * 7) + KERNEL15(64 * 7) + KERNEL16(64 * 7) + + addq $64 * 8 * SIZE, AO + addq $64 * 8 * SIZE, BO + subq $128 * 8, %rax + jg .L1X + +.L12: + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#else + sarq $3, %rax + je .L15 + ALIGN_4 + +.L12: + KERNEL1 (64 * 0) + KERNEL2 (64 * 0) + KERNEL3 (64 * 0) + KERNEL4 (64 * 0) + KERNEL5 (64 * 0) + KERNEL6 (64 * 0) + KERNEL7 (64 * 0) + KERNEL8 (64 * 0) + KERNEL9 (64 * 0) + KERNEL10(64 * 0) + KERNEL11(64 * 0) + KERNEL12(64 * 0) + KERNEL13(64 * 0) + KERNEL14(64 * 0) + KERNEL15(64 * 0) + KERNEL16(64 * 0) + + addq $64 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L12 +#endif + 
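
After the unrolled loop above, .L15/.L16 below finish the k & 7 remainder with the same update, and the .L18-style store blocks fold the real accumulators into the interleaved complex C tile: pshufd $0x50 and $0xfa spread each real value over a (re, im) pair, and ALPHA, stored as {alpha_r, alpha_i, alpha_r, alpha_i} in the prologue, applies the pass-specific 3M scaling before the addps into C. A hedged C sketch of that store for one accumulator register, i.e. four consecutive complex entries of one C column; store_acc_3m, c, acc, alpha_r and alpha_i are illustrative names only:

/*
 * Sketch only: what one pshufd/mulps/addps group in the .L18 store does.
 * c points at four consecutive complex entries of a C column, stored
 * interleaved (re, im, re, im, ...); acc holds four real partial sums.
 * pshufd $0x50 covers j = 0,1 and pshufd $0xfa covers j = 2,3.
 */
static void store_acc_3m(float *c, const float acc[4],
                         float alpha_r, float alpha_i)
{
    for (int j = 0; j < 4; j++) {
        c[2 * j]     += alpha_r * acc[j];  /* even ALPHA lane (alpha_r) */
        c[2 * j + 1] += alpha_i * acc[j];  /* odd ALPHA lane (alpha_i)  */
    }
}

The same pattern, with fewer accumulators, recurs in the M-remainder stores at .L28, .L38 and .L48 further down.
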
ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_4 + +.L16: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movsldup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movshdup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movsldup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm4 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm5 + movsldup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm6 + movshdup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 8 * SIZE(AO), %xmm8 + addps %xmm9, %xmm7 + movsldup 8 * SIZE(BO), %xmm9 + + addq $8 * SIZE, AO + addq $8 * SIZE, BO + decq %rax + jg .L16 + ALIGN_4 + +.L18: + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 4 * SIZE(CO1), %xmm9 + movhps 6 * SIZE(CO1), %xmm9 + movsd 8 * SIZE(CO1), %xmm10 + movhps 10 * SIZE(CO1), %xmm10 + movsd 12 * SIZE(CO1), %xmm11 + movhps 14 * SIZE(CO1), %xmm11 + + pshufd $0x50, %xmm0, %xmm12 + pshufd $0xfa, %xmm0, %xmm0 + pshufd $0x50, %xmm4, %xmm13 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm15, %xmm12 + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm13 + mulps %xmm15, %xmm4 + + addps %xmm8, %xmm12 + addps %xmm9, %xmm0 + addps %xmm10, %xmm13 + addps %xmm11, %xmm4 + + movlps %xmm12, 0 * SIZE(CO1) + movhps %xmm12, 2 * SIZE(CO1) + movlps %xmm0, 4 * SIZE(CO1) + movhps %xmm0, 6 * SIZE(CO1) + movlps %xmm13, 8 * SIZE(CO1) + movhps %xmm13, 10 * SIZE(CO1) + movlps %xmm4, 12 * SIZE(CO1) + movhps %xmm4, 14 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm8 + movhps 2 * SIZE(CO2), %xmm8 + movsd 4 * SIZE(CO2), %xmm9 + movhps 6 * SIZE(CO2), %xmm9 + movsd 8 * SIZE(CO2), %xmm10 + movhps 10 * SIZE(CO2), %xmm10 + movsd 12 * SIZE(CO2), %xmm11 + movhps 14 * SIZE(CO2), %xmm11 + + pshufd $0x50, %xmm1, %xmm12 + pshufd $0xfa, %xmm1, %xmm1 + pshufd $0x50, %xmm5, %xmm13 + pshufd $0xfa, %xmm5, %xmm5 + + mulps %xmm15, %xmm12 + mulps %xmm15, %xmm1 + mulps %xmm15, %xmm13 + mulps %xmm15, %xmm5 + + addps %xmm8, %xmm12 + addps %xmm9, %xmm1 + addps %xmm10, %xmm13 + addps %xmm11, %xmm5 + + movlps %xmm12, 0 * SIZE(CO2) + movhps %xmm12, 2 * SIZE(CO2) + movlps %xmm1, 4 * SIZE(CO2) + movhps %xmm1, 6 * SIZE(CO2) + movlps %xmm13, 8 * SIZE(CO2) + movhps %xmm13, 10 * SIZE(CO2) + movlps %xmm5, 12 * SIZE(CO2) + movhps %xmm5, 14 * SIZE(CO2) + + + movsd 0 * SIZE(CO1, LDC, 2), %xmm8 + movhps 2 * SIZE(CO1, LDC, 2), %xmm8 + movsd 4 * SIZE(CO1, LDC, 2), %xmm9 + movhps 6 * SIZE(CO1, LDC, 2), %xmm9 + movsd 8 * SIZE(CO1, LDC, 2), %xmm10 + movhps 10 * SIZE(CO1, LDC, 2), %xmm10 + movsd 12 * SIZE(CO1, LDC, 2), %xmm11 + movhps 14 * SIZE(CO1, LDC, 2), %xmm11 + + pshufd $0x50, %xmm2, %xmm12 + pshufd $0xfa, %xmm2, %xmm2 + pshufd $0x50, %xmm6, %xmm13 + pshufd $0xfa, %xmm6, %xmm6 + + mulps %xmm15, %xmm12 + mulps %xmm15, %xmm2 + mulps %xmm15, %xmm13 + mulps %xmm15, %xmm6 + + addps %xmm8, %xmm12 + addps %xmm9, %xmm2 + addps %xmm10, %xmm13 + addps %xmm11, %xmm6 + + movlps %xmm12, 0 * SIZE(CO1, LDC, 2) + movhps %xmm12, 2 * SIZE(CO1, LDC, 2) + movlps %xmm2, 4 * SIZE(CO1, LDC, 2) + movhps %xmm2, 6 * SIZE(CO1, LDC, 2) + movlps %xmm13, 8 * SIZE(CO1, LDC, 2) + movhps %xmm13, 10 * SIZE(CO1, LDC, 2) + movlps %xmm6, 12 * SIZE(CO1, LDC, 2) + movhps %xmm6, 14 * SIZE(CO1, LDC, 2) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm8 + movhps 2 * SIZE(CO2, LDC, 2), %xmm8 + movsd 4 * SIZE(CO2, LDC, 2), %xmm9 + movhps 6 * SIZE(CO2, 
LDC, 2), %xmm9 + movsd 8 * SIZE(CO2, LDC, 2), %xmm10 + movhps 10 * SIZE(CO2, LDC, 2), %xmm10 + movsd 12 * SIZE(CO2, LDC, 2), %xmm11 + movhps 14 * SIZE(CO2, LDC, 2), %xmm11 + + pshufd $0x50, %xmm3, %xmm12 + pshufd $0xfa, %xmm3, %xmm3 + pshufd $0x50, %xmm7, %xmm13 + pshufd $0xfa, %xmm7, %xmm7 + + mulps %xmm15, %xmm12 + mulps %xmm15, %xmm3 + mulps %xmm15, %xmm13 + mulps %xmm15, %xmm7 + + addps %xmm8, %xmm12 + addps %xmm9, %xmm3 + addps %xmm10, %xmm13 + addps %xmm11, %xmm7 + + movlps %xmm12, 0 * SIZE(CO2, LDC, 2) + movhps %xmm12, 2 * SIZE(CO2, LDC, 2) + movlps %xmm3, 4 * SIZE(CO2, LDC, 2) + movhps %xmm3, 6 * SIZE(CO2, LDC, 2) + movlps %xmm13, 8 * SIZE(CO2, LDC, 2) + movhps %xmm13, 10 * SIZE(CO2, LDC, 2) + movlps %xmm7, 12 * SIZE(CO2, LDC, 2) + movhps %xmm7, 14 * SIZE(CO2, LDC, 2) + + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $8, KK +#endif + + addq $16 * SIZE, CO1 # coffset += 4 + addq $16 * SIZE, CO2 # coffset += 4 + decq I # i -- + jg .L11 + ALIGN_4 + +.L20: + testq $4, M + je .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + + movsldup 0 * SIZE(BO), %xmm9 + movsldup 16 * SIZE(BO), %xmm11 + movsldup 32 * SIZE(BO), %xmm13 + movsldup 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L25 + ALIGN_4 + +.L22: + mulps %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addps %xmm9, %xmm0 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movsldup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movshdup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movsldup 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movshdup 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movsldup 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movshdup 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 8 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movsldup 64 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movshdup 16 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movsldup 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movshdup 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movaps 12 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movsldup 24 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movshdup 24 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movsldup 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movshdup 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movaps 32 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movsldup 80 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 
+ movshdup 32 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm1 + movsldup 36 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movshdup 36 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movaps 20 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movsldup 40 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movshdup 40 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm1 + movsldup 44 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movshdup 44 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movaps 24 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movsldup 96 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movshdup 48 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm1 + movsldup 52 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movshdup 52 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movaps 28 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movsldup 56 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movshdup 56 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm1 + movsldup 60 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movshdup 60 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movaps 48 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movsldup 112 * SIZE(BO), %xmm15 + + addq $32 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movsldup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movshdup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movsldup 8 * SIZE(BO), %xmm9 + + addq $4 * SIZE, AO + addq $8 * SIZE, BO + decq %rax + jg .L26 + ALIGN_4 + +.L28: + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 4 * SIZE(CO1), %xmm9 + movhps 6 * SIZE(CO1), %xmm9 + + pshufd $0x50, %xmm0, %xmm12 + pshufd $0xfa, %xmm0, %xmm0 + + mulps %xmm15, %xmm12 + mulps %xmm15, %xmm0 + + addps %xmm8, %xmm12 + addps %xmm9, %xmm0 + + movlps %xmm12, 0 * SIZE(CO1) + movhps %xmm12, 2 * SIZE(CO1) + movlps %xmm0, 4 * SIZE(CO1) + movhps %xmm0, 6 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm8 + movhps 2 * SIZE(CO2), %xmm8 + movsd 4 * SIZE(CO2), %xmm9 + movhps 6 * SIZE(CO2), %xmm9 + + pshufd $0x50, %xmm1, %xmm12 + pshufd $0xfa, %xmm1, %xmm1 + + mulps %xmm15, %xmm12 + mulps %xmm15, %xmm1 + + addps %xmm8, %xmm12 + addps %xmm9, %xmm1 + + movlps %xmm12, 0 * SIZE(CO2) + movhps %xmm12, 2 * SIZE(CO2) + movlps %xmm1, 4 * SIZE(CO2) + movhps %xmm1, 6 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm8 + movhps 2 * SIZE(CO1, LDC, 2), %xmm8 + movsd 4 * SIZE(CO1, LDC, 2), %xmm9 + movhps 6 * SIZE(CO1, LDC, 2), %xmm9 + + pshufd $0x50, %xmm2, %xmm12 + pshufd $0xfa, %xmm2, %xmm2 + + mulps %xmm15, %xmm12 + mulps %xmm15, %xmm2 + + addps %xmm8, %xmm12 + addps %xmm9, %xmm2 + + movlps %xmm12, 0 * SIZE(CO1, LDC, 2) + movhps %xmm12, 2 * SIZE(CO1, LDC, 2) + movlps %xmm2, 4 * SIZE(CO1, LDC, 2) + movhps %xmm2, 6 * SIZE(CO1, LDC, 2) + + movsd 0 * SIZE(CO2, LDC, 2), %xmm8 + movhps 2 * SIZE(CO2, LDC, 2), %xmm8 + movsd 4 * SIZE(CO2, LDC, 2), %xmm9 + movhps 6 * SIZE(CO2, LDC, 2), %xmm9 + + pshufd $0x50, %xmm3, %xmm12 + pshufd $0xfa, %xmm3, %xmm3 + + mulps %xmm15, %xmm12 + mulps %xmm15, %xmm3 + + addps %xmm8, %xmm12 + addps %xmm9, %xmm3 + + movlps 
%xmm12, 0 * SIZE(CO2, LDC, 2) + movhps %xmm12, 2 * SIZE(CO2, LDC, 2) + movlps %xmm3, 4 * SIZE(CO2, LDC, 2) + movhps %xmm3, 6 * SIZE(CO2, LDC, 2) + + addq $8 * SIZE, CO1 # coffset += 4 + addq $8 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L30: + testq $2, M + je .L40 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + movddup 0 * SIZE(AO), %xmm8 + movddup 8 * SIZE(AO), %xmm10 + movsd 0 * SIZE(BO), %xmm9 + movsd 32 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L35 + ALIGN_4 + +.L32: + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addps %xmm9, %xmm0 + movsd 4 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + movddup 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 8 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movsd 12 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + movddup 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movsd 16 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movsd 20 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + movddup 6 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 24 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movsd 28 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + movddup 16 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + movsd 64 * SIZE(BO), %xmm9 + addps %xmm11, %xmm0 + movsd 36 * SIZE(BO), %xmm11 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + movddup 10 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movsd 40 * SIZE(BO), %xmm11 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm2 + movsd 44 * SIZE(BO), %xmm11 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + movddup 12 * SIZE(AO), %xmm10 + addps %xmm11, %xmm3 + movsd 48 * SIZE(BO), %xmm11 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movsd 52 * SIZE(BO), %xmm11 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + movddup 14 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movsd 56 * SIZE(BO), %xmm11 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm2 + movsd 60 * SIZE(BO), %xmm11 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + movddup 24 * SIZE(AO), %xmm10 + addps %xmm11, %xmm3 + movsd 96 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movsd 4 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + movddup 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 8 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + decq 
%rax + jg .L36 + ALIGN_4 + +.L38: + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 0 * SIZE(CO2), %xmm9 + movhps 2 * SIZE(CO2), %xmm9 + + pshufd $0x50, %xmm0, %xmm12 + pshufd $0xfa, %xmm0, %xmm0 + + mulps %xmm15, %xmm12 + mulps %xmm15, %xmm0 + + addps %xmm8, %xmm12 + addps %xmm9, %xmm0 + + movlps %xmm12, 0 * SIZE(CO1) + movhps %xmm12, 2 * SIZE(CO1) + movlps %xmm0, 0 * SIZE(CO2) + movhps %xmm0, 2 * SIZE(CO2) + + movsd 0 * SIZE(CO1, LDC, 2), %xmm8 + movhps 2 * SIZE(CO1, LDC, 2), %xmm8 + movsd 0 * SIZE(CO2, LDC, 2), %xmm9 + movhps 2 * SIZE(CO2, LDC, 2), %xmm9 + + pshufd $0x50, %xmm1, %xmm12 + pshufd $0xfa, %xmm1, %xmm1 + + mulps %xmm15, %xmm12 + mulps %xmm15, %xmm1 + + addps %xmm8, %xmm12 + addps %xmm9, %xmm1 + + movlps %xmm12, 0 * SIZE(CO1, LDC, 2) + movhps %xmm12, 2 * SIZE(CO1, LDC, 2) + movlps %xmm1, 0 * SIZE(CO2, LDC, 2) + movhps %xmm1, 2 * SIZE(CO2, LDC, 2) + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L40: + testq $1, M + je .L49 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + + movss 0 * SIZE(AO), %xmm8 + movss 4 * SIZE(AO), %xmm10 + movsd 0 * SIZE(BO), %xmm9 + movsd 32 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L45 + ALIGN_4 + +.L42: + shufps $0, %xmm8, %xmm8 + movhps 4 * SIZE(BO), %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulps %xmm8, %xmm9 + movss 1 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movsd 8 * SIZE(BO), %xmm9 + shufps $0, %xmm8, %xmm8 + movhps 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 16 * SIZE(BO), %xmm9 + shufps $0, %xmm8, %xmm8 + movhps 20 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss 3 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movsd 24 * SIZE(BO), %xmm9 + shufps $0, %xmm8, %xmm8 + movhps 28 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss 8 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 64 * SIZE(BO), %xmm9 + shufps $0, %xmm10, %xmm10 + movhps 36 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movss 5 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movsd 40 * SIZE(BO), %xmm11 + shufps $0, %xmm10, %xmm10 + movhps 44 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movss 6 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movsd 48 * SIZE(BO), %xmm11 + shufps $0, %xmm10, %xmm10 + movhps 52 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movss 7 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movsd 56 * SIZE(BO), %xmm11 + shufps $0, %xmm10, %xmm10 + movhps 60 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movss 12 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movsd 96 * SIZE(BO), %xmm11 + + addq $ 8 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L42 + ALIGN_4 + +.L45: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L48 + ALIGN_4 + +.L46: + shufps $0, %xmm8, %xmm8 + movhps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movss 1 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movsd 8 * SIZE(BO), %xmm9 + + addq 
$1 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L46 + ALIGN_4 + +.L48: + addps %xmm1, %xmm0 + + movsd 0 * SIZE(CO1), %xmm8 + movhps 0 * SIZE(CO2), %xmm8 + movsd 0 * SIZE(CO1, LDC, 2), %xmm9 + movhps 0 * SIZE(CO2, LDC, 2), %xmm9 + + pshufd $0x50, %xmm0, %xmm12 + pshufd $0xfa, %xmm0, %xmm0 + + mulps %xmm15, %xmm12 + mulps %xmm15, %xmm0 + + addps %xmm8, %xmm12 + addps %xmm9, %xmm0 + + movlps %xmm12, 0 * SIZE(CO1) + movhps %xmm12, 0 * SIZE(CO2) + movlps %xmm0, 0 * SIZE(CO1, LDC, 2) + movhps %xmm0, 0 * SIZE(CO2, LDC, 2) + ALIGN_4 + +.L49: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $4, KK +#endif + leaq (C, LDC, 4), C # c += 4 * ldc + decq J # j -- + jg .L01 + +.L50: + testq $2, N + je .L100 + +.L51: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + + movq K, %rax + sarq $3, %rax + jle .L53 + ALIGN_4 + +.L52: + movddup 0 * SIZE(B), %xmm0 + movddup 2 * SIZE(B), %xmm1 + movddup 4 * SIZE(B), %xmm2 + movddup 6 * SIZE(B), %xmm3 + movddup 8 * SIZE(B), %xmm4 + movddup 10 * SIZE(B), %xmm5 + movddup 12 * SIZE(B), %xmm6 + movddup 14 * SIZE(B), %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + prefetcht1 128 * SIZE(BO) + prefetcht0 112 * SIZE(B) + + addq $16 * SIZE, B + addq $32 * SIZE, BO + + decq %rax + jne .L52 + ALIGN_4 + +.L53: + movq K, %rax + andq $7, %rax + BRANCH + jle .L60 + ALIGN_4 + +.L54: + movddup 0 * SIZE(B), %xmm0 + movaps %xmm0, 0 * SIZE(BO) + + addq $ 2 * SIZE, B + addq $ 4 * SIZE, BO + decq %rax + jne .L54 + ALIGN_4 + +.L60: + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + movq M, I + sarq $3, I # i = (m >> 3) + jle .L70 + ALIGN_4 + +.L61: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + movaps 32 * SIZE(AO), %xmm12 + movaps 48 * SIZE(AO), %xmm14 + + movsldup 0 * SIZE(BO), %xmm9 + movsldup 16 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + prefetcht2 4 * SIZE(CO1) + pxor %xmm4, %xmm4 + prefetcht2 4 * SIZE(CO2) + pxor %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L65 + ALIGN_4 + +.L62: + mulps %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addps %xmm9, %xmm0 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsldup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm4 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 8 * SIZE(AO), %xmm8 + addps %xmm9, %xmm5 + movsldup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movshdup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 12 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsldup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm4 + movshdup 4 * SIZE(BO), %xmm9 + mulps 
%xmm8, %xmm9 + movaps 64 * SIZE(AO), %xmm8 + addps %xmm9, %xmm5 + movsldup 8 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm0 + movshdup 8 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movaps 20 * SIZE(AO), %xmm10 + addps %xmm9, %xmm1 + movsldup 8 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm4 + movshdup 8 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movaps 24 * SIZE(AO), %xmm10 + addps %xmm9, %xmm5 + movsldup 12 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm0 + movshdup 12 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movaps 28 * SIZE(AO), %xmm10 + addps %xmm9, %xmm1 + movsldup 12 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm4 + movshdup 12 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movaps 80 * SIZE(AO), %xmm10 + addps %xmm9, %xmm5 + movsldup 32 * SIZE(BO), %xmm9 + mulps %xmm12, %xmm11 + PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) + addps %xmm11, %xmm0 + movshdup 16 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm11 + movaps 36 * SIZE(AO), %xmm12 + addps %xmm11, %xmm1 + movsldup 16 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm4 + movshdup 16 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm11 + movaps 40 * SIZE(AO), %xmm12 + addps %xmm11, %xmm5 + movsldup 20 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm0 + movshdup 20 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm11 + movaps 44 * SIZE(AO), %xmm12 + addps %xmm11, %xmm1 + movsldup 20 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm4 + movshdup 20 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm11 + movaps 96 * SIZE(AO), %xmm12 + addps %xmm11, %xmm5 + movsldup 24 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + addps %xmm11, %xmm0 + movshdup 24 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + movaps 52 * SIZE(AO), %xmm14 + addps %xmm11, %xmm1 + movsldup 24 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + addps %xmm11, %xmm4 + movshdup 24 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + movaps 56 * SIZE(AO), %xmm14 + addps %xmm11, %xmm5 + movsldup 28 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + addps %xmm11, %xmm0 + movshdup 28 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + movaps 60 * SIZE(AO), %xmm14 + addps %xmm11, %xmm1 + movsldup 28 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + addps %xmm11, %xmm4 + movshdup 28 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + movaps 112 * SIZE(AO), %xmm14 + addps %xmm11, %xmm5 + movsldup 48 * SIZE(BO), %xmm11 + + addq $64 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L62 + ALIGN_4 + +.L65: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsldup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm4 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 8 * SIZE(AO), %xmm8 + addps %xmm9, %xmm5 + movsldup 4 * SIZE(BO), %xmm9 + + addq $8 * SIZE, AO + addq $4 * SIZE, BO + decq %rax + jg .L66 + ALIGN_4 + +.L68: + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 4 * SIZE(CO1), %xmm9 + movhps 6 * SIZE(CO1), %xmm9 + movsd 8 * SIZE(CO1), %xmm10 + movhps 10 * SIZE(CO1), %xmm10 + movsd 12 * SIZE(CO1), %xmm11 + movhps 14 * SIZE(CO1), %xmm11 + + pshufd $0x50, %xmm0, %xmm12 + pshufd $0xfa, %xmm0, %xmm0 + pshufd $0x50, %xmm4, %xmm13 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm15, %xmm12 + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm13 + mulps %xmm15, %xmm4 + + addps %xmm8, %xmm12 + addps %xmm9, %xmm0 + addps %xmm10, 
%xmm13 + addps %xmm11, %xmm4 + + movlps %xmm12, 0 * SIZE(CO1) + movhps %xmm12, 2 * SIZE(CO1) + movlps %xmm0, 4 * SIZE(CO1) + movhps %xmm0, 6 * SIZE(CO1) + movlps %xmm13, 8 * SIZE(CO1) + movhps %xmm13, 10 * SIZE(CO1) + movlps %xmm4, 12 * SIZE(CO1) + movhps %xmm4, 14 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm8 + movhps 2 * SIZE(CO2), %xmm8 + movsd 4 * SIZE(CO2), %xmm9 + movhps 6 * SIZE(CO2), %xmm9 + movsd 8 * SIZE(CO2), %xmm10 + movhps 10 * SIZE(CO2), %xmm10 + movsd 12 * SIZE(CO2), %xmm11 + movhps 14 * SIZE(CO2), %xmm11 + + pshufd $0x50, %xmm1, %xmm12 + pshufd $0xfa, %xmm1, %xmm1 + pshufd $0x50, %xmm5, %xmm13 + pshufd $0xfa, %xmm5, %xmm5 + + mulps %xmm15, %xmm12 + mulps %xmm15, %xmm1 + mulps %xmm15, %xmm13 + mulps %xmm15, %xmm5 + + addps %xmm8, %xmm12 + addps %xmm9, %xmm1 + addps %xmm10, %xmm13 + addps %xmm11, %xmm5 + + movlps %xmm12, 0 * SIZE(CO2) + movhps %xmm12, 2 * SIZE(CO2) + movlps %xmm1, 4 * SIZE(CO2) + movhps %xmm1, 6 * SIZE(CO2) + movlps %xmm13, 8 * SIZE(CO2) + movhps %xmm13, 10 * SIZE(CO2) + movlps %xmm5, 12 * SIZE(CO2) + movhps %xmm5, 14 * SIZE(CO2) + + addq $16 * SIZE, CO1 # coffset += 4 + addq $16 * SIZE, CO2 # coffset += 4 + decq I # i -- + jg .L61 + ALIGN_4 + +.L70: + testq $4, M + je .L80 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movsldup 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(AO), %xmm10 + movsldup 16 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L75 + ALIGN_4 + +.L72: + mulps %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addps %xmm9, %xmm0 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsldup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movshdup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 8 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movsldup 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movshdup 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 12 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsldup 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movshdup 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 32 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movsldup 32 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movshdup 16 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movaps 20 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movsldup 20 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm2 + movshdup 20 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movaps 24 * SIZE(AO), %xmm10 + addps %xmm11, %xmm3 + movsldup 24 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movshdup 24 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movaps 28 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movsldup 28 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm2 + movshdup 28 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movaps 48 * SIZE(AO), %xmm10 + addps %xmm11, %xmm3 + movsldup 48 * SIZE(BO), %xmm11 + + 
addq $32 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L72 + ALIGN_4 + +.L75: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsldup 4 * SIZE(BO), %xmm9 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + decq %rax + jg .L76 + ALIGN_4 + +.L78: + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 4 * SIZE(CO1), %xmm9 + movhps 6 * SIZE(CO1), %xmm9 + + pshufd $0x50, %xmm0, %xmm12 + pshufd $0xfa, %xmm0, %xmm0 + + mulps %xmm15, %xmm12 + mulps %xmm15, %xmm0 + + addps %xmm8, %xmm12 + addps %xmm9, %xmm0 + + movlps %xmm12, 0 * SIZE(CO1) + movhps %xmm12, 2 * SIZE(CO1) + movlps %xmm0, 4 * SIZE(CO1) + movhps %xmm0, 6 * SIZE(CO1) + + movsd 0 * SIZE(CO2), %xmm8 + movhps 2 * SIZE(CO2), %xmm8 + movsd 4 * SIZE(CO2), %xmm9 + movhps 6 * SIZE(CO2), %xmm9 + + pshufd $0x50, %xmm1, %xmm12 + pshufd $0xfa, %xmm1, %xmm1 + + mulps %xmm15, %xmm12 + mulps %xmm15, %xmm1 + + addps %xmm8, %xmm12 + addps %xmm9, %xmm1 + + movlps %xmm12, 0 * SIZE(CO2) + movhps %xmm12, 2 * SIZE(CO2) + movlps %xmm1, 4 * SIZE(CO2) + movhps %xmm1, 6 * SIZE(CO2) + + addq $8 * SIZE, CO1 # coffset += 4 + addq $8 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L80: + testq $2, M + je .L90 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + + movddup 0 * SIZE(AO), %xmm8 + movddup 8 * SIZE(AO), %xmm10 + movsd 0 * SIZE(BO), %xmm9 + movsd 16 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L85 + ALIGN_4 + +.L82: + shufps $0x50, %xmm9, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulps %xmm8, %xmm9 + movddup 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movsd 4 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + movddup 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 8 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + movddup 6 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movsd 12 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + movddup 16 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 32 * SIZE(BO), %xmm9 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + movddup 10 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movsd 20 * SIZE(BO), %xmm11 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + movddup 12 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movsd 24 * SIZE(BO), %xmm11 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + movddup 14 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movsd 28 * SIZE(BO), %xmm11 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + movddup 24 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movsd 48 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L82 + ALIGN_4 + +.L85: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if 
(k & 1) + BRANCH + je .L88 + ALIGN_4 + +.L86: + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + movddup 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movsd 4 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + decq %rax + jg .L86 + ALIGN_4 + +.L88: + addps %xmm1, %xmm0 + + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 0 * SIZE(CO2), %xmm9 + movhps 2 * SIZE(CO2), %xmm9 + + pshufd $0x50, %xmm0, %xmm12 + pshufd $0xfa, %xmm0, %xmm0 + + mulps %xmm15, %xmm12 + mulps %xmm15, %xmm0 + + addps %xmm8, %xmm12 + addps %xmm9, %xmm0 + + movlps %xmm12, 0 * SIZE(CO1) + movhps %xmm12, 2 * SIZE(CO1) + movlps %xmm0, 0 * SIZE(CO2) + movhps %xmm0, 2 * SIZE(CO2) + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L90: + testq $1, M + je .L99 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + movss 0 * SIZE(AO), %xmm8 + movss 4 * SIZE(AO), %xmm10 + movsd 0 * SIZE(BO), %xmm9 + movsd 16 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L95 + ALIGN_4 + +.L92: + shufps $0, %xmm8, %xmm8 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulps %xmm8, %xmm9 + movss 1 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movsd 4 * SIZE(BO), %xmm9 + shufps $0, %xmm8, %xmm8 + mulps %xmm8, %xmm9 + movss 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 8 * SIZE(BO), %xmm9 + shufps $0, %xmm8, %xmm8 + mulps %xmm8, %xmm9 + movss 3 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movsd 12 * SIZE(BO), %xmm9 + shufps $0, %xmm8, %xmm8 + mulps %xmm8, %xmm9 + movss 8 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 32 * SIZE(BO), %xmm9 + shufps $0, %xmm10, %xmm10 + mulps %xmm10, %xmm11 + movss 5 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movsd 20 * SIZE(BO), %xmm11 + shufps $0, %xmm10, %xmm10 + mulps %xmm10, %xmm11 + movss 6 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movsd 24 * SIZE(BO), %xmm11 + shufps $0, %xmm10, %xmm10 + mulps %xmm10, %xmm11 + movss 7 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movsd 28 * SIZE(BO), %xmm11 + shufps $0, %xmm10, %xmm10 + mulps %xmm10, %xmm11 + movss 12 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movsd 48 * SIZE(BO), %xmm11 + + addq $ 8 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L92 + ALIGN_4 + +.L95: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L98 + ALIGN_4 + +.L96: + shufps $0, %xmm8, %xmm8 + mulps %xmm8, %xmm9 + movss 1 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movsd 4 * SIZE(BO), %xmm9 + + addq $1 * SIZE, AO + addq $4 * SIZE, BO + decq %rax + jg .L96 + ALIGN_4 + +.L98: + addps %xmm1, %xmm0 + + movsd 0 * SIZE(CO1), %xmm8 + movhps 0 * SIZE(CO2), %xmm8 + + pshufd $0x50, %xmm0, %xmm12 + + mulps %xmm15, %xmm12 + + addps %xmm8, %xmm12 + + movlps %xmm12, 0 * SIZE(CO1) + movhps %xmm12, 0 * SIZE(CO2) + ALIGN_4 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + leaq (C, LDC, 2), C # c += 4 * ldc + ALIGN_4 + +.L100: + testq $1, N + je .L999 + +.L101: +#if defined(TRMMKERNEL) && defined(LEFT) + movq 
OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + + movq K, %rax + sarq $3, %rax + jle .L103 + ALIGN_4 + + +.L102: + movss 0 * SIZE(B), %xmm0 + movss 1 * SIZE(B), %xmm1 + movss 2 * SIZE(B), %xmm2 + movss 3 * SIZE(B), %xmm3 + movss 4 * SIZE(B), %xmm4 + movss 5 * SIZE(B), %xmm5 + movss 6 * SIZE(B), %xmm6 + movss 7 * SIZE(B), %xmm7 + + movss %xmm0, 0 * SIZE(BO) + movss %xmm0, 1 * SIZE(BO) + movss %xmm1, 2 * SIZE(BO) + movss %xmm1, 3 * SIZE(BO) + movss %xmm2, 4 * SIZE(BO) + movss %xmm2, 5 * SIZE(BO) + movss %xmm3, 6 * SIZE(BO) + movss %xmm3, 7 * SIZE(BO) + movss %xmm4, 8 * SIZE(BO) + movss %xmm4, 9 * SIZE(BO) + movss %xmm5, 10 * SIZE(BO) + movss %xmm5, 11 * SIZE(BO) + movss %xmm6, 12 * SIZE(BO) + movss %xmm6, 13 * SIZE(BO) + movss %xmm7, 14 * SIZE(BO) + movss %xmm7, 15 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $16 * SIZE, BO + + decq %rax + jne .L102 + ALIGN_4 + +.L103: + movq K, %rax + andq $7, %rax + BRANCH + jle .L110 + ALIGN_4 + +.L104: + movss 0 * SIZE(B), %xmm0 + movss %xmm0, 0 * SIZE(BO) + movss %xmm0, 1 * SIZE(BO) + + addq $ 1 * SIZE, B + addq $ 2 * SIZE, BO + decq %rax + jne .L104 + ALIGN_4 + +.L110: + movq C, CO1 # coffset1 = c + movq A, AO # aoffset = a + + movq M, I + sarq $3, I # i = (m >> 3) + jle .L120 + ALIGN_4 + +.L111: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + movaps 32 * SIZE(AO), %xmm12 + movaps 48 * SIZE(AO), %xmm14 + + movddup 0 * SIZE(BO), %xmm9 + movddup 8 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + prefetchnta 8 * SIZE(CO1) + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $8, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L115 + ALIGN_4 + +.L112: + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addps %xmm9, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 8 * SIZE(AO), %xmm8 + addps %xmm9, %xmm4 + movddup 2 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 12 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 64 * SIZE(AO), %xmm8 + addps %xmm9, %xmm5 + movddup 4 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movaps 20 * SIZE(AO), %xmm10 + addps %xmm9, %xmm0 + movddup 4 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movaps 24 * SIZE(AO), %xmm10 + addps %xmm9, %xmm4 + movddup 6 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movaps 28 * SIZE(AO), %xmm10 + addps %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movaps 80 * SIZE(AO), %xmm10 + addps %xmm9, %xmm5 + PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) + movddup 8 * SIZE(BO), %xmm9 + mulps %xmm12, %xmm9 + movaps 36 * SIZE(AO), %xmm12 + addps %xmm9, %xmm0 + movddup 16 * SIZE(BO), %xmm9 + mulps %xmm12, %xmm11 + movaps 40 * SIZE(AO), %xmm12 + addps %xmm11, %xmm4 + movddup 10 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm11 + movaps 44 * SIZE(AO), %xmm12 + addps %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm11 + movaps 96 * SIZE(AO), %xmm12 + addps %xmm11, %xmm5 + movddup 12 * SIZE(BO), %xmm11 + mulps 
%xmm14, %xmm11 + movaps 52 * SIZE(AO), %xmm14 + addps %xmm11, %xmm0 + movddup 12 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + movaps 56 * SIZE(AO), %xmm14 + addps %xmm11, %xmm4 + movddup 14 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + movaps 60 * SIZE(AO), %xmm14 + addps %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + movaps 112 * SIZE(AO), %xmm14 + addps %xmm11, %xmm5 + movddup 24 * SIZE(BO), %xmm11 + + addq $64 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L112 + ALIGN_4 + +.L115: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L118 + ALIGN_4 + +.L116: + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 8 * SIZE(AO), %xmm8 + addps %xmm9, %xmm4 + movddup 2 * SIZE(BO), %xmm9 + + addq $8 * SIZE, AO + addq $2 * SIZE, BO + decq %rax + jg .L116 + ALIGN_4 + +.L118: + addps %xmm1, %xmm0 + addps %xmm5, %xmm4 + + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 4 * SIZE(CO1), %xmm9 + movhps 6 * SIZE(CO1), %xmm9 + movsd 8 * SIZE(CO1), %xmm10 + movhps 10 * SIZE(CO1), %xmm10 + movsd 12 * SIZE(CO1), %xmm11 + movhps 14 * SIZE(CO1), %xmm11 + + pshufd $0x50, %xmm0, %xmm12 + pshufd $0xfa, %xmm0, %xmm0 + pshufd $0x50, %xmm4, %xmm13 + pshufd $0xfa, %xmm4, %xmm4 + + mulps %xmm15, %xmm12 + mulps %xmm15, %xmm0 + mulps %xmm15, %xmm13 + mulps %xmm15, %xmm4 + + addps %xmm8, %xmm12 + addps %xmm9, %xmm0 + addps %xmm10, %xmm13 + addps %xmm11, %xmm4 + + movlps %xmm12, 0 * SIZE(CO1) + movhps %xmm12, 2 * SIZE(CO1) + movlps %xmm0, 4 * SIZE(CO1) + movhps %xmm0, 6 * SIZE(CO1) + movlps %xmm13, 8 * SIZE(CO1) + movhps %xmm13, 10 * SIZE(CO1) + movlps %xmm4, 12 * SIZE(CO1) + movhps %xmm4, 14 * SIZE(CO1) + + addq $16 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L111 + ALIGN_4 + +.L120: + testq $4, M + je .L130 + + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + + movaps 0 * SIZE(AO), %xmm8 + movddup 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(AO), %xmm10 + movddup 8 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L125 + ALIGN_4 + +.L122: + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addps %xmm9, %xmm0 + movddup 2 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 8 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movddup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 12 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movddup 6 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 32 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movddup 16 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm11 + movaps 20 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movddup 10 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movaps 24 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movddup 12 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movaps 28 * SIZE(AO), %xmm10 + 
addps %xmm11, %xmm0 + movddup 14 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movaps 48 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movddup 24 * SIZE(BO), %xmm11 + + addq $32 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L122 + ALIGN_4 + +.L125: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L128 + ALIGN_4 + +.L126: + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movddup 2 * SIZE(BO), %xmm9 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + decq %rax + jg .L126 + ALIGN_4 + +.L128: + addps %xmm1, %xmm0 + + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 4 * SIZE(CO1), %xmm9 + movhps 6 * SIZE(CO1), %xmm9 + + pshufd $0x50, %xmm0, %xmm12 + pshufd $0xfa, %xmm0, %xmm0 + + mulps %xmm15, %xmm12 + mulps %xmm15, %xmm0 + + addps %xmm8, %xmm12 + addps %xmm9, %xmm0 + + movlps %xmm12, 0 * SIZE(CO1) + movhps %xmm12, 2 * SIZE(CO1) + movlps %xmm0, 4 * SIZE(CO1) + movhps %xmm0, 6 * SIZE(CO1) + + addq $8 * SIZE, CO1 # coffset += 4 + ALIGN_4 + +.L130: + testq $2, M + je .L140 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(AO), %xmm10 + movaps 16 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $4, %rax + je .L135 + ALIGN_4 + +.L132: + mulps %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movaps 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm8, %xmm1 + movaps 8 * SIZE(AO), %xmm8 + mulps 8 * SIZE(BO), %xmm8 + addps %xmm8, %xmm2 + movaps 12 * SIZE(AO), %xmm8 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm8, %xmm3 + movaps 32 * SIZE(AO), %xmm8 + movaps 32 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm11 + movaps 20 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movaps 48 * SIZE(BO), %xmm11 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm10, %xmm1 + movaps 24 * SIZE(AO), %xmm10 + mulps 24 * SIZE(BO), %xmm10 + addps %xmm10, %xmm2 + movaps 28 * SIZE(AO), %xmm10 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm10, %xmm3 + movaps 48 * SIZE(AO), %xmm10 + + addq $32 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L132 + ALIGN_4 + +.L135: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $15, %rax # if (k & 1) + BRANCH + je .L138 + ALIGN_4 + +.L136: + movsd 0 * SIZE(AO), %xmm8 + movsd 0 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + decq %rax + jg .L136 + ALIGN_4 + +.L138: + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + addps %xmm2, %xmm0 + + movhlps %xmm0, %xmm1 + addps %xmm1, %xmm0 + + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + + pshufd $0x50, %xmm0, %xmm12 + + mulps %xmm15, %xmm12 + addps %xmm8, %xmm12 + + movlps %xmm12, 0 * SIZE(CO1) + movhps %xmm12, 2 * SIZE(CO1) + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + +.L140: + testq $1, M + je .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) 
&& defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 4), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + + movss 0 * SIZE(AO), %xmm8 + movss 4 * SIZE(AO), %xmm10 + movss 0 * SIZE(BO), %xmm9 + movss 8 * SIZE(BO), %xmm11 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L145 + ALIGN_4 + +.L142: + mulss %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movss 1 * SIZE(AO), %xmm8 + mulss 2 * SIZE(BO), %xmm8 + addss %xmm9, %xmm0 + movss 16 * SIZE(BO), %xmm9 + addss %xmm8, %xmm1 + movss 2 * SIZE(AO), %xmm8 + mulss 4 * SIZE(BO), %xmm8 + addss %xmm8, %xmm2 + movss 3 * SIZE(AO), %xmm8 + mulss 6 * SIZE(BO), %xmm8 + addss %xmm8, %xmm3 + movss 8 * SIZE(AO), %xmm8 + mulss %xmm10, %xmm11 + movss 5 * SIZE(AO), %xmm10 + mulss 10 * SIZE(BO), %xmm10 + addss %xmm11, %xmm0 + movss 24 * SIZE(BO), %xmm11 + addss %xmm10, %xmm1 + movss 6 * SIZE(AO), %xmm10 + mulss 12 * SIZE(BO), %xmm10 + addss %xmm10, %xmm2 + movss 7 * SIZE(AO), %xmm10 + mulss 14 * SIZE(BO), %xmm10 + addss %xmm10, %xmm3 + movss 12 * SIZE(AO), %xmm10 + + addq $ 8 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L142 + ALIGN_4 + +.L145: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L148 + ALIGN_4 + +.L146: + movss 0 * SIZE(AO), %xmm8 + movss 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + + addq $1 * SIZE, AO + addq $2 * SIZE, BO + decq %rax + jg .L146 + ALIGN_4 + +.L148: + addss %xmm1, %xmm0 + addss %xmm3, %xmm2 + addss %xmm2, %xmm0 + + movsd 0 * SIZE(CO1), %xmm8 + + pshufd $0x50, %xmm0, %xmm12 + + mulps %xmm15, %xmm12 + addps %xmm8, %xmm12 + + movlps %xmm12, 0 * SIZE(CO1) + ALIGN_4 + +.L999: + movq %rbx, %rsp + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm_beta.S b/kernel/x86_64/zgemm_beta.S new file mode 100644 index 0000000000..ffc775b033 --- /dev/null +++ b/kernel/x86_64/zgemm_beta.S @@ -0,0 +1,260 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef WINDOWS_ABI + +#define M ARG1 +#define N ARG2 +#define C ARG3 +#define LDC ARG4 +#define C1 ARG5 + +#define STACK_C 16(%rsp) +#define STACK_LDC 24(%rsp) + +#else + +#define STACKSIZE 256 + +#define M ARG1 +#define N ARG2 +#define C ARG3 +#define LDC ARG4 +#define C1 %r10 + +#define STACK_ALPHA_I 40 + STACKSIZE(%rsp) +#define STACK_C 80 + STACKSIZE(%rsp) +#define STACK_LDC 88 + STACKSIZE(%rsp) + +#endif + +#define I %rax + + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + subq $STACKSIZE, %rsp + + movups %xmm6, 0(%rsp) + movups %xmm7, 16(%rsp) + movups %xmm8, 32(%rsp) + movups %xmm9, 48(%rsp) + movups %xmm10, 64(%rsp) + movups %xmm11, 80(%rsp) + movups %xmm12, 96(%rsp) + movups %xmm13, 112(%rsp) + movups %xmm14, 128(%rsp) + movups %xmm15, 144(%rsp) + + movaps %xmm3, %xmm0 + movsd STACK_ALPHA_I, %xmm1 +#endif + + pxor %xmm15, %xmm15 + + movq STACK_C, C + movq STACK_LDC, LDC + + testq M, M + jle .L999 + testq N, N + jle .L999 + + salq $ZBASE_SHIFT, LDC + +#ifdef DOUBLE + ucomisd %xmm15, %xmm0 + jne .L71 + ucomisd %xmm15, %xmm1 + jne .L71 +#else + ucomiss %xmm15, %xmm0 + jne .L71 + ucomiss %xmm15, %xmm1 + jne .L71 +#endif + ALIGN_2 + +.L53: + movq C, C1 # c_offset1 = c_offset + addq LDC, C # c_offset += ldc + + movq M, I + sarq $2, I + jle .L56 + ALIGN_2 + +.L57: +#ifdef OPTERON + prefetchw 64 * SIZE(C1) +#endif + + MOVSD %xmm0, 0 * SIZE(C1) # c_offset1 + MOVSD %xmm0, 1 * SIZE(C1) + MOVSD %xmm0, 2 * SIZE(C1) + MOVSD %xmm0, 3 * SIZE(C1) + MOVSD %xmm0, 4 * SIZE(C1) + MOVSD %xmm0, 5 * SIZE(C1) + MOVSD %xmm0, 6 * SIZE(C1) + MOVSD %xmm0, 7 * SIZE(C1) + addq $8 * SIZE, C1 # c_offset1 += 8 + decq I # i-- + jg .L57 + ALIGN_2 + +.L56: + movq M, I + andq $3, I + jle .L62 + ALIGN_2 + +.L63: + MOVSD %xmm0, 0 * SIZE(C1) + MOVSD %xmm0, 1 * SIZE(C1) + addq $2 * SIZE,C1 + decq I + jg .L63 + ALIGN_2 + +.L62: + decq N # j -- + jg .L53 + jmp .L999 + ALIGN_3 + +.L71: + movq C, C1 + addq LDC, C # c_offset += ldc + + movq M, I + sarq $1, I + jle .L84 + ALIGN_3 + +.L85: +#ifdef OPTERON + prefetchw 16 * SIZE(C1) +#endif + + MOVSD 0 * SIZE(C1), %xmm2 + MOVSD 1 * 
SIZE(C1), %xmm3 + MOVSD 0 * SIZE(C1), %xmm4 + MOVSD 1 * SIZE(C1), %xmm5 + + MOVSD 2 * SIZE(C1), %xmm6 + MOVSD 3 * SIZE(C1), %xmm7 + MOVSD 2 * SIZE(C1), %xmm8 + MOVSD 3 * SIZE(C1), %xmm9 + + MULSD %xmm0, %xmm2 + MULSD %xmm1, %xmm3 + MULSD %xmm1, %xmm4 + MULSD %xmm0, %xmm5 + + MULSD %xmm0, %xmm6 + MULSD %xmm1, %xmm7 + MULSD %xmm1, %xmm8 + MULSD %xmm0, %xmm9 + + SUBSD %xmm3, %xmm2 + ADDPD %xmm5, %xmm4 + SUBSD %xmm7, %xmm6 + ADDPD %xmm9, %xmm8 + + MOVSD %xmm2, 0 * SIZE(C1) + MOVSD %xmm4, 1 * SIZE(C1) + MOVSD %xmm6, 2 * SIZE(C1) + MOVSD %xmm8, 3 * SIZE(C1) + addq $4 * SIZE, C1 + decq I + jg .L85 + ALIGN_3 + +.L84: + testq $1, M + jle .L74 + ALIGN_3 + +.L75: + prefetchnta 80 * SIZE(C1) + + MOVSD 0 * SIZE(C1), %xmm2 + MULSD %xmm0, %xmm2 + MOVSD 1 * SIZE(C1), %xmm3 + MULSD %xmm1, %xmm3 + MOVSD 0 * SIZE(C1), %xmm4 + MULSD %xmm1, %xmm4 + MOVSD 1 * SIZE(C1), %xmm5 + MULSD %xmm0, %xmm5 + + SUBSD %xmm3, %xmm2 + ADDPD %xmm5, %xmm4 + + MOVSD %xmm2, 0 * SIZE(C1) + MOVSD %xmm4, 1 * SIZE(C1) + ALIGN_2 + +.L74: + decq N + jg .L71 + ALIGN_2 + +.L999: +#ifdef WINDOWS_ABI + movups 0(%rsp), %xmm6 + movups 16(%rsp), %xmm7 + movups 32(%rsp), %xmm8 + movups 48(%rsp), %xmm9 + movups 64(%rsp), %xmm10 + movups 80(%rsp), %xmm11 + movups 96(%rsp), %xmm12 + movups 112(%rsp), %xmm13 + movups 128(%rsp), %xmm14 + movups 144(%rsp), %xmm15 + + addq $STACKSIZE, %rsp +#endif + + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm_kernel_1x4_nehalem.S b/kernel/x86_64/zgemm_kernel_1x4_nehalem.S new file mode 100644 index 0000000000..e72a19c96a --- /dev/null +++ b/kernel/x86_64/zgemm_kernel_1x4_nehalem.S @@ -0,0 +1,1093 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %rbp + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rdx +#define BB %r12 + +#define PREA %r10 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define J 64(%rsp) +#define OFFSET 72(%rsp) +#define KK 80(%rsp) +#define KKK 88(%rsp) + +#else + +#define STACKSIZE 512 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#define ALPHA_R 224(%rsp) +#define ALPHA_I 232(%rsp) +#define J 240(%rsp) +#define OFFSET 248(%rsp) +#define KK 256(%rsp) +#define KKK 264(%rsp) + +#endif + +#define PREFETCHSIZE 4 +#define PREFETCH prefetcht0 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define ADD1 addpd +#define ADD2 addpd +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define ADD1 addpd +#define ADD2 addpd +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define ADD1 addpd +#define ADD2 addpd +#else +#define ADD1 addpd +#define ADD2 subpd +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movq OLD_OFFSET, %r11 +#endif + movaps %xmm3, %xmm0 + movsd OLD_ALPHA_I, %xmm1 +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movq OLD_OFFSET, %r11 +#endif + +#endif + + movlps %xmm0, ALPHA_R + movlps %xmm1, ALPHA_I + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + salq $ZBASE_SHIFT, LDC + +#ifdef TRMMKERNEL + movq %r11, OFFSET +#ifndef LEFT + negq %r11 +#endif + movq %r11, KK +#endif + testq M, M + jle .L999 + + movq N, J + sarq $2, J + NOBRANCH + jle .L20 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + leaq (C, LDC, 2), CO2 + movq A, AO + + movq K, %rax + salq $ZBASE_SHIFT + 2, %rax + leaq (B, %rax), BB + + movq M, I + ALIGN_4 + +.L11: + prefetcht2 -16 * SIZE(BB) + subq $-8 * SIZE, BB + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + PADDING + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht0 1 * SIZE(CO1) + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + prefetcht0 3 * SIZE(CO1, LDC) + xorps %xmm11, 
%xmm11 + + movaps -16 * SIZE(AO), %xmm0 + + xorps %xmm12, %xmm12 + xorps %xmm13, %xmm13 + prefetcht0 1 * SIZE(CO2) + xorps %xmm14, %xmm14 + xorps %xmm15, %xmm15 + prefetcht0 3 * SIZE(CO2, LDC) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + ADD1 %xmm1, %xmm12 + movaps -16 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + movaps -14 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + ADD1 %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps -10 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + movaps -14 * SIZE(AO), %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + ADD1 %xmm1, %xmm12 + movaps -8 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm5, %xmm1 + mulpd %xmm5, %xmm2 + + ADD1 %xmm3, %xmm14 + movaps -6 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm5, %xmm3 + mulpd %xmm5, %xmm4 + + ADD1 %xmm1, %xmm8 + movaps -4 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm5, %xmm1 + mulpd %xmm5, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps -2 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm5, %xmm3 + mulpd %xmm5, %xmm4 + + ADD1 %xmm1, %xmm12 + movaps 0 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + movaps 2 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + ADD1 %xmm1, %xmm8 + movaps 4 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps 6 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + movaps -10 * SIZE(AO), %xmm5 + mulpd %xmm0, %xmm4 + + ADD1 %xmm1, %xmm12 + movaps 8 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm5, %xmm1 + mulpd %xmm5, %xmm2 + + ADD1 %xmm3, %xmm14 + movaps 10 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm5, %xmm3 + PADDING; + mulpd %xmm5, %xmm4 + + ADD1 %xmm1, %xmm8 + movaps 12 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm5, %xmm1 + PADDING; + mulpd %xmm5, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps 14 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm5, %xmm3 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm5, %xmm4 + + subq $-32 * SIZE, BO + subq $-8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: + movddup ALPHA_R, %xmm6 + movddup ALPHA_I, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + ADD1 %xmm1, %xmm12 + movaps -16 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + movaps -14 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm15 + pshufd $0x4e, %xmm3, 
%xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + ADD1 %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps -10 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_3 + +.L18: + ADD1 %xmm1, %xmm12 + ADD2 %xmm2, %xmm13 + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm15 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + shufps $0x40, %xmm0, %xmm0 + + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm10 + xorps %xmm0, %xmm12 + xorps %xmm0, %xmm14 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + shufps $0x04, %xmm0, %xmm0 + + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 + xorps %xmm0, %xmm13 + xorps %xmm0, %xmm15 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + shufps $0x40, %xmm0, %xmm0 + + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 + xorps %xmm0, %xmm13 + xorps %xmm0, %xmm15 +#endif + + haddpd %xmm9, %xmm8 + haddpd %xmm11, %xmm10 + haddpd %xmm13, %xmm12 + haddpd %xmm15, %xmm14 + + pshufd $0x4e, %xmm8, %xmm9 + pshufd $0x4e, %xmm10, %xmm11 + pshufd $0x4e, %xmm12, %xmm13 + pshufd $0x4e, %xmm14, %xmm15 + + mulpd %xmm6, %xmm8 + mulpd %xmm7, %xmm9 + mulpd %xmm6, %xmm10 + mulpd %xmm7, %xmm11 + + mulpd %xmm6, %xmm12 + mulpd %xmm7, %xmm13 + mulpd %xmm6, %xmm14 + mulpd %xmm7, %xmm15 + + addsubpd %xmm9, %xmm8 + addsubpd %xmm11, %xmm10 + addsubpd %xmm13, %xmm12 + addsubpd %xmm15, %xmm14 + + testq $15, CO1 + NOBRANCH + jne .L18x + +#ifndef TRMMKERNEL + movaps (CO1), %xmm0 + movaps (CO1, LDC), %xmm1 + movaps (CO2), %xmm2 + movaps (CO2, LDC), %xmm3 + + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm10 + addpd %xmm2, %xmm12 + addpd %xmm3, %xmm14 +#endif + + movaps %xmm8, (CO1) + movaps %xmm10, (CO1, LDC) + movaps %xmm12, (CO2) + movaps %xmm14, (CO2, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 + decq I + BRANCH + jg .L11 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $1, KK +#endif + + leaq (C, LDC, 4), C + movq BO, B + + subq $1, J + BRANCH + jg .L01 + jmp .L20 + ALIGN_4 + +.L18x: +#ifndef TRMMKERNEL + movups (CO1), %xmm0 + movups (CO1, LDC), %xmm1 + movups (CO2), %xmm2 + movups (CO2, LDC), %xmm3 + + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm10 + addpd %xmm2, %xmm12 + addpd %xmm3, %xmm14 +#endif + + movups %xmm8, (CO1) + movups %xmm10, (CO1, LDC) + movups %xmm12, (CO2) + movups %xmm14, (CO2, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 + decq I + BRANCH + jg .L11 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $1, KK +#endif + + leaq (C, LDC, 4), C + movq BO, B + + subq $1, J + BRANCH + jg .L01 + ALIGN_4 + +.L20: + testq $2, N + BRANCH + jle .L30 + +#if 
defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 + movq A, AO + + movq M, I + ALIGN_4 + +.L21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + + xorps %xmm1, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht0 1 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht0 2 * SIZE(CO2) + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_3 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + ADD1 %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps -14 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -14 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps -10 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -12 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movaps -8 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps -6 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -10 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movaps -4 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps -2 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -8 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, AO + subq $-16 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L22 + ALIGN_3 + +.L25: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_3 + +.L26: + ADD1 %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps -14 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_3 + +.L28: + ADD1 %xmm1, %xmm8 + ADD2 %xmm2, %xmm9 + ADD1 %xmm3, %xmm10 + ADD2 %xmm4, %xmm11 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + + movddup ALPHA_R, %xmm2 + movddup ALPHA_I, %xmm3 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + shufps $0x40, %xmm0, %xmm0 + + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm10 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + shufps $0x04, %xmm0, %xmm0 + + xorps %xmm0, %xmm9 + xorps %xmm0, 
%xmm11 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + shufps $0x40, %xmm0, %xmm0 + + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 +#endif + + haddpd %xmm9, %xmm8 + haddpd %xmm11, %xmm10 + + pshufd $0x4e, %xmm8, %xmm9 + pshufd $0x4e, %xmm10, %xmm11 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm9 + mulpd %xmm2, %xmm10 + mulpd %xmm3, %xmm11 + + addsubpd %xmm9, %xmm8 + addsubpd %xmm11, %xmm10 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm1 + movhpd 1 * SIZE(CO2), %xmm1 + + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm10 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 + decq I + BRANCH + jg .L21 + +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + leaq (C, LDC, 2), C + movq BO, B + ALIGN_4 + +.L30: + testq $1, N + BRANCH + jle .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + movq A, AO + + movq M, I + ALIGN_4 + +.L31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + + xorps %xmm1, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + + xorps %xmm8, %xmm8 + prefetcht0 2 * SIZE(CO1) + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L35 + ALIGN_3 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + ADD1 %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm10 + movaps -14 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm11 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm10 + movaps -10 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm11 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, AO + subq $-8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L32 + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + ALIGN_3 + +.L35: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + BRANCH + je .L38 + ALIGN_3 + +.L36: + ADD1 %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L36 + ALIGN_3 + +.L38: + ADD1 %xmm1, %xmm8 + ADD2 %xmm2, 
%xmm9 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + + movddup ALPHA_R, %xmm2 + movddup ALPHA_I, %xmm3 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + shufps $0x40, %xmm0, %xmm0 + + xorps %xmm0, %xmm8 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + shufps $0x04, %xmm0, %xmm0 + + xorps %xmm0, %xmm9 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + shufps $0x40, %xmm0, %xmm0 + + xorps %xmm0, %xmm9 +#endif + + haddpd %xmm9, %xmm8 + pshufd $0x4e, %xmm8, %xmm9 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm9 + + addsubpd %xmm9, %xmm8 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + + addpd %xmm0, %xmm8 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 + decq I + BRANCH + jg .L31 + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm_kernel_2x1_atom.S b/kernel/x86_64/zgemm_kernel_2x1_atom.S new file mode 100644 index 0000000000..be42e036d3 --- /dev/null +++ b/kernel/x86_64/zgemm_kernel_2x1_atom.S @@ -0,0 +1,769 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %rdi +#define N %rsi +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define J %r12 +#define AO %r13 +#define BO %r14 +#define CO1 %r15 +#define BB %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define OFFSET 64(%rsp) +#define KKK 72(%rsp) +#define KK 80(%rsp) + +#else + +#define STACKSIZE 512 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#define ALPHA_R 224(%rsp) +#define ALPHA_I 232(%rsp) +#define OFFSET 240(%rsp) +#define KKK 248(%rsp) +#define KK 256(%rsp) +#endif + +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 8 + 3) + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define ADDSD1 addsd +#define ADDSD2 addsd +#define ADDSD3 addsd +#define ADDSD4 subsd +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define ADDSD1 addsd +#define ADDSD2 subsd +#define ADDSD3 addsd +#define ADDSD4 addsd +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define ADDSD1 addsd +#define ADDSD2 addsd +#define ADDSD3 subsd +#define ADDSD4 addsd +#else +#define ADDSD1 addsd +#define ADDSD2 subsd +#define ADDSD3 subsd +#define ADDSD4 subsd +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, M + movq ARG2, N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm4 +#endif + movaps %xmm3, %xmm0 + movsd OLD_ALPHA_I, %xmm1 +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm4 +#endif + +#endif + + movsd %xmm0, ALPHA_R + movsd %xmm1, ALPHA_I + +#ifdef TRMMKERNEL + movsd %xmm4, OFFSET + movsd %xmm4, KK +#ifndef LEFT + negq KK +#endif +#endif + + salq $ZBASE_SHIFT, LDC + + movq N, J + testq N, N + jle .L999 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + addq LDC, C + + movq A, AO + + movq K, %rax + salq $ZBASE_SHIFT, %rax + leaq (B, %rax), BB + + movq M, I + sarq $1, I + jle .L20 + ALIGN_4 + +.L10: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + + prefetcht0 0 * SIZE(BB) + subq $-8 * SIZE, BB + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm2, 
%xmm2 + movsd 1 * SIZE(AO), %xmm4 + xorps %xmm5, %xmm5 + movsd 2 * SIZE(AO), %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movsd 0 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + movsd 1 * SIZE(BO), %xmm3 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + + prefetcht0 3 * SIZE(CO1) + xorps %xmm12, %xmm12 + xorps %xmm13, %xmm13 + xorps %xmm14, %xmm14 + xorps %xmm15, %xmm15 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + + sarq $2, %rax + je .L15 + ALIGN_4 + +.L12: + ADDSD2 %xmm2, %xmm13 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + ADDSD3 %xmm7, %xmm14 + movsd 3 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + ADDSD4 %xmm6, %xmm15 + PREFETCH ((PREFETCHSIZE) >> 1 + 0) * SIZE(BO) + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + ADDSD1 %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + ADDSD2 %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + ADDSD3 %xmm4, %xmm10 + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + ADDSD4 %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 2 * SIZE(BO), %xmm1 + + ADDSD1 %xmm5, %xmm12 + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + + ADDSD2 %xmm2, %xmm13 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + ADDSD3 %xmm7, %xmm14 + movsd 7 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + ADDSD4 %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + ADDSD1 %xmm0, %xmm8 + movsd 8 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + ADDSD2 %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + ADDSD3 %xmm4, %xmm10 + movsd 9 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + ADDSD4 %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 4 * SIZE(BO), %xmm1 + + ADDSD1 %xmm5, %xmm12 + movsd 10 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 5 * SIZE(BO), %xmm3 + + ADDSD2 %xmm2, %xmm13 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + ADDSD3 %xmm7, %xmm14 + movsd 11 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + ADDSD4 %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + ADDSD1 %xmm0, %xmm8 + movsd 12 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + ADDSD2 %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + ADDSD3 %xmm4, %xmm10 + movsd 13 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + ADDSD4 %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 6 * SIZE(BO), %xmm1 + + ADDSD1 %xmm5, %xmm12 + movsd 14 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 7 * SIZE(BO), %xmm3 + + ADDSD2 %xmm2, %xmm13 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + ADDSD3 %xmm7, %xmm14 + movsd 15 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + subq $-16 * SIZE, AO + + ADDSD4 %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + ADDSD1 %xmm0, %xmm8 + movsd 0 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + ADDSD2 %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + addq $ 8 * SIZE, BO + + ADDSD3 %xmm4, %xmm10 + movsd 1 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + decq %rax + + ADDSD4 %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 0 * SIZE(BO), %xmm1 + + ADDSD1 %xmm5, %xmm12 + movsd 2 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 1 * SIZE(BO), %xmm3 + + jne .L12 + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + BRANCH + BRANCH + je .L18 + ALIGN_4 + +.L16: 
+ ADDSD2 %xmm2, %xmm13 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + ADDSD3 %xmm7, %xmm14 + movsd 3 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + ADDSD4 %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + ADDSD1 %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + ADDSD2 %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + ADDSD3 %xmm4, %xmm10 + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + ADDSD4 %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 2 * SIZE(BO), %xmm1 + + ADDSD1 %xmm5, %xmm12 + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + decq %rax + BRANCH + jg .L16 + ALIGN_4 + +.L18: + movsd ALPHA_R, %xmm0 + movsd ALPHA_I, %xmm1 + + ADDSD2 %xmm2, %xmm13 + ADDSD3 %xmm7, %xmm14 + ADDSD4 %xmm6, %xmm15 + + addsd %xmm11, %xmm8 + addsd %xmm9, %xmm10 + addsd %xmm15, %xmm12 + addsd %xmm13, %xmm14 + + movaps %xmm8, %xmm9 + movaps %xmm10, %xmm11 + movaps %xmm12, %xmm13 + movaps %xmm14, %xmm15 + + mulsd %xmm0, %xmm8 + mulsd %xmm1, %xmm9 + mulsd %xmm1, %xmm10 + mulsd %xmm0, %xmm11 + + subsd %xmm10, %xmm8 + addsd %xmm11, %xmm9 + + mulsd %xmm0, %xmm12 + mulsd %xmm1, %xmm13 + mulsd %xmm1, %xmm14 + mulsd %xmm0, %xmm15 + + subsd %xmm14, %xmm12 + addsd %xmm15, %xmm13 + +#if !defined(TRMMKERNEL) && !defined(BETAZERO) + addsd 0 * SIZE(CO1), %xmm8 + addsd 1 * SIZE(CO1), %xmm9 + addsd 2 * SIZE(CO1), %xmm12 + addsd 3 * SIZE(CO1), %xmm13 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movsd %xmm9, 1 * SIZE(CO1) + movsd %xmm12, 2 * SIZE(CO1) + movsd %xmm13, 3 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 + decq I + jg .L10 + ALIGN_4 + +.L20: + testq $1, M + jle .L99 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd 1 * SIZE(AO), %xmm4 + xorps %xmm5, %xmm5 + movsd 2 * SIZE(AO), %xmm5 + xorps %xmm6, %xmm6 + movsd 3 * SIZE(AO), %xmm7 + + movsd 0 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + movsd 1 * SIZE(BO), %xmm3 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + + sarq $2, %rax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + ADDSD2 %xmm2, %xmm9 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + ADDSD4 %xmm6, %xmm11 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + movsd 2 * SIZE(BO), %xmm1 + + ADDSD1 %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm2 + + ADDSD3 %xmm4, %xmm10 + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + + ADDSD2 %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + ADDSD4 %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 4 * SIZE(BO), %xmm1 + + ADDSD1 %xmm5, %xmm8 + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm2 + + ADDSD3 %xmm7, 
%xmm10 + movsd 7 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm6 + movsd 5 * SIZE(BO), %xmm3 + + ADDSD2 %xmm2, %xmm9 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + ADDSD4 %xmm6, %xmm11 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + movsd 6 * SIZE(BO), %xmm1 + + ADDSD1 %xmm0, %xmm8 + movsd 8 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm2 + + ADDSD3 %xmm4, %xmm10 + movsd 9 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm6 + movsd 7 * SIZE(BO), %xmm3 + + ADDSD2 %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + ADDSD4 %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 8 * SIZE(BO), %xmm1 + + ADDSD1 %xmm5, %xmm8 + movsd 10 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm2 + + ADDSD3 %xmm7, %xmm10 + movsd 11 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm6 + movsd 9 * SIZE(BO), %xmm3 + + addq $8 * SIZE, AO + addq $8 * SIZE, BO + + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + BRANCH + BRANCH + je .L29 + ALIGN_4 + +.L26: + ADDSD2 %xmm2, %xmm9 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + ADDSD4 %xmm6, %xmm11 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + movsd 2 * SIZE(BO), %xmm1 + + mulsd %xmm3, %xmm2 + ADDSD1 %xmm0, %xmm8 + movsd 2 * SIZE(AO), %xmm0 + + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + ADDSD3 %xmm4, %xmm10 + movsd 3 * SIZE(AO), %xmm4 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + decq %rax + BRANCH + jg .L26 + ALIGN_4 + +.L29: + movsd ALPHA_R, %xmm0 + movsd ALPHA_I, %xmm1 + + ADDSD2 %xmm2, %xmm9 + ADDSD4 %xmm6, %xmm11 + + addsd %xmm11, %xmm8 + addsd %xmm9, %xmm10 + + movaps %xmm8, %xmm9 + movaps %xmm10, %xmm11 + + mulsd %xmm0, %xmm8 + mulsd %xmm1, %xmm9 + mulsd %xmm1, %xmm10 + mulsd %xmm0, %xmm11 + + subsd %xmm10, %xmm8 + addsd %xmm11, %xmm9 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addsd 0 * SIZE(CO1), %xmm8 + addsd 1 * SIZE(CO1), %xmm9 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movsd %xmm9, 1 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $1, KK +#endif + + movq BO, B + decq J # j -- + jg .L01 + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm_kernel_2x2_barcelona.S b/kernel/x86_64/zgemm_kernel_2x2_barcelona.S new file mode 100644 index 0000000000..31fad2b8ca --- /dev/null +++ b/kernel/x86_64/zgemm_kernel_2x2_barcelona.S @@ -0,0 +1,1423 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbp +#define CO2 %rbx +#define BB %r12 +#define J %r15 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define OFFSET 64(%rsp) +#define KK 72(%rsp) +#define KKK 80(%rsp) + +#else + +#define STACKSIZE 320 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#define ALPHA_R 224(%rsp) +#define ALPHA_I 232(%rsp) +#define OFFSET 240(%rsp) +#define KK 248(%rsp) +#define KKK 256(%rsp) + +#endif + +#define movlpd movsd +#define movapd movups +#define movupd movups + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define ADD1 addpd +#define ADD2 addpd +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define ADD1 subpd +#define ADD2 addpd +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define ADD1 addpd +#define ADD2 subpd +#else +#define ADD1 subpd +#define ADD2 subpd +#endif + +#define KERNEL1(xx) \ + mulpd %xmm1, %xmm0 ;\ + ADD1 %xmm0, %xmm8 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ + movapd %xmm2, %xmm0 ;\ + ADD1 %xmm1, %xmm12 ;\ + movddup -14 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ + ADD2 %xmm2, %xmm9 ;\ + movapd %xmm0, %xmm2 ;\ + ADD2 %xmm3, %xmm13 ;\ + movddup -13 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm0 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ + ADD1 %xmm0, %xmm10 ;\ + movapd -12 * SIZE(AO, %rax, 4), %xmm0 ;\ + ADD1 %xmm1, %xmm14 ;\ + movddup -12 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -14 * SIZE(AO, 
%rax, 4), %xmm3 ;\ + ADD2 %xmm2, %xmm11 ;\ + ADD2 %xmm3, %xmm15 ;\ + movddup -11 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm0, %xmm2 + +#define KERNEL2(xx) \ + mulpd %xmm1, %xmm0 ;\ + ADD1 %xmm0, %xmm8 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ + movapd %xmm2, %xmm0 ;\ + ADD1 %xmm1, %xmm12 ;\ +/*A*/ movapd (AO, %rax, 4), %xmm6 ;\ + movddup -10 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ + ADD2 %xmm2, %xmm9 ;\ + movapd %xmm0, %xmm2 ;\ + ADD2 %xmm3, %xmm13 ;\ + movddup -9 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm0 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ + ADD1 %xmm0, %xmm10 ;\ + ADD1 %xmm1, %xmm14 ;\ +/**/ movddup (BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ + ADD2 %xmm2, %xmm11 ;\ + ADD2 %xmm3, %xmm15 ;\ + movddup -7 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm4, %xmm2 + +#define KERNEL3(xx) \ + mulpd %xmm5, %xmm4 ;\ + ADD1 %xmm4, %xmm8 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ + movapd %xmm2, %xmm4 ;\ + ADD1 %xmm5, %xmm12 ;\ + movddup -6 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ + ADD2 %xmm2, %xmm9 ;\ + movapd %xmm4, %xmm2 ;\ + ADD2 %xmm3, %xmm13 ;\ + movddup -5 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm4 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ + ADD1 %xmm4, %xmm10 ;\ + movapd -4 * SIZE(AO, %rax, 4), %xmm4 ;\ + ADD1 %xmm5, %xmm14 ;\ + movddup -4 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ + ADD2 %xmm2, %xmm11 ;\ + ADD2 %xmm3, %xmm15 ;\ + movddup -3 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm4, %xmm2 + +#define KERNEL4(xx) \ + mulpd %xmm5, %xmm4 ;\ + ADD1 %xmm4, %xmm8 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ + movapd %xmm2, %xmm4 ;\ + ADD1 %xmm5, %xmm12 ;\ +/*A*/ movapd 8 * SIZE(AO, %rax, 4), %xmm7 ;\ + movddup -2 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ + ADD2 %xmm2, %xmm9 ;\ + movapd %xmm4, %xmm2 ;\ + ADD2 %xmm3, %xmm13 ;\ + movddup -1 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm4 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ + ADD1 %xmm4, %xmm10 ;\ + ADD1 %xmm5, %xmm14 ;\ +/**/ movddup 8 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ + ADD2 %xmm2, %xmm11 ;\ + ADD2 %xmm3, %xmm15 ;\ + movddup 1 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm6, %xmm2 + +#define KERNEL5(xx) \ + mulpd %xmm1, %xmm6 ;\ + ADD1 %xmm6, %xmm8 ;\ + mulpd 2 * SIZE(AO, %rax, 4), %xmm1 ;\ + movapd %xmm2, %xmm6 ;\ + ADD1 %xmm1, %xmm12 ;\ + movddup 2 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 2 * SIZE(AO, %rax, 4), %xmm3 ;\ + ADD2 %xmm2, %xmm9 ;\ + movapd %xmm6, %xmm2 ;\ + ADD2 %xmm3, %xmm13 ;\ + movddup 3 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm6 ;\ + mulpd 2 * SIZE(AO, %rax, 4), %xmm1 ;\ + ADD1 %xmm6, %xmm10 ;\ + movapd 4 * SIZE(AO, %rax, 4), %xmm6 ;\ + ADD1 %xmm1, %xmm14 ;\ + movddup 4 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 2 * SIZE(AO, %rax, 4), %xmm3 ;\ + ADD2 %xmm2, %xmm11 ;\ + ADD2 %xmm3, %xmm15 ;\ + movddup 5 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm6, %xmm2 + +#define KERNEL6(xx) \ + mulpd %xmm1, %xmm6 ;\ + ADD1 %xmm6, %xmm8 ;\ + mulpd 6 * SIZE(AO, %rax, 4), %xmm1 ;\ + movapd %xmm2, %xmm6 ;\ + ADD1 %xmm1, %xmm12 ;\ +/*A*/ movapd 16 * SIZE(AO, %rax, 4), %xmm0 ;\ + movddup 6 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 6 * SIZE(AO, %rax, 4), %xmm3 ;\ + ADD2 %xmm2, %xmm9 ;\ + movapd %xmm6, %xmm2 ;\ + ADD2 %xmm3, %xmm13 
;\ + movddup 7 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm6 ;\ + mulpd 6 * SIZE(AO, %rax, 4), %xmm1 ;\ + ADD1 %xmm6, %xmm10 ;\ + ADD1 %xmm1, %xmm14 ;\ +/**/ movddup 16 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 6 * SIZE(AO, %rax, 4), %xmm3 ;\ + ADD2 %xmm2, %xmm11 ;\ + ADD2 %xmm3, %xmm15 ;\ + movddup 9 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm7, %xmm2 + +#define KERNEL7(xx) \ + mulpd %xmm5, %xmm7 ;\ + ADD1 %xmm7, %xmm8 ;\ + mulpd 10 * SIZE(AO, %rax, 4), %xmm5 ;\ + movapd %xmm2, %xmm7 ;\ + ADD1 %xmm5, %xmm12 ;\ + movddup 10 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 10 * SIZE(AO, %rax, 4), %xmm3 ;\ + ADD2 %xmm2, %xmm9 ;\ + movapd %xmm7, %xmm2 ;\ + ADD2 %xmm3, %xmm13 ;\ + movddup 11 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm7 ;\ + mulpd 10 * SIZE(AO, %rax, 4), %xmm5 ;\ + ADD1 %xmm7, %xmm10 ;\ + movapd 12 * SIZE(AO, %rax, 4), %xmm7 ;\ + ADD1 %xmm5, %xmm14 ;\ + movddup 12 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 10 * SIZE(AO, %rax, 4), %xmm3 ;\ + ADD2 %xmm2, %xmm11 ;\ + ADD2 %xmm3, %xmm15 ;\ + movddup 13 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm7, %xmm2 + +#define KERNEL8(xx) \ + mulpd %xmm5, %xmm7 ;\ + ADD1 %xmm7, %xmm8 ;\ + mulpd 14 * SIZE(AO, %rax, 4), %xmm5 ;\ + movapd %xmm2, %xmm7 ;\ + ADD1 %xmm5, %xmm12 ;\ +/*A*/ movapd 24 * SIZE(AO, %rax, 4), %xmm4 ;\ + movddup 14 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 14 * SIZE(AO, %rax, 4), %xmm3 ;\ + ADD2 %xmm2, %xmm9 ;\ + movapd %xmm7, %xmm2 ;\ + ADD2 %xmm3, %xmm13 ;\ + movddup 15 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm7 ;\ + mulpd 14 * SIZE(AO, %rax, 4), %xmm5 ;\ + ADD1 %xmm7, %xmm10 ;\ + ADD1 %xmm5, %xmm14 ;\ +/**/ movddup 24 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd 14 * SIZE(AO, %rax, 4), %xmm3 ;\ + ADD2 %xmm2, %xmm11 ;\ + ADD2 %xmm3, %xmm15 ;\ + movddup 17 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm0, %xmm2 ;\ + addq $8 * SIZE, %rax ;\ + +#define KERNEL_SUB1(xx) \ + mulpd %xmm1, %xmm0 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ + ADD1 %xmm0, %xmm8 ;\ + movapd %xmm2, %xmm0 ;\ + ADD1 %xmm1, %xmm12 ;\ + movddup -14 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ + ADD2 %xmm2, %xmm9 ;\ + movapd %xmm0, %xmm2 ;\ + ADD2 %xmm3, %xmm13 ;\ + movddup -13 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm0 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ + ADD1 %xmm0, %xmm10 ;\ + movapd -12 * SIZE(AO, %rax, 4), %xmm0 ;\ + ADD1 %xmm1, %xmm14 ;\ + movddup -12 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ + ADD2 %xmm2, %xmm11 ;\ + ADD2 %xmm3, %xmm15 ;\ + movddup -11 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm0, %xmm2 + +#define KERNEL_SUB2(xx) \ + mulpd %xmm1, %xmm0 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ + ADD1 %xmm0, %xmm8 ;\ + movapd %xmm2, %xmm0 ;\ + ADD1 %xmm1, %xmm12 ;\ + movddup -10 * SIZE(BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ + ADD2 %xmm2, %xmm9 ;\ + movapd %xmm0, %xmm2 ;\ + ADD2 %xmm3, %xmm13 ;\ + movddup -9 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm1, %xmm0 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ + ADD1 %xmm0, %xmm10 ;\ + movapd (AO, %rax, 4), %xmm0 ;\ + ADD1 %xmm1, %xmm14 ;\ + movddup (BO, %rax, 4), %xmm1 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ + ADD2 %xmm2, %xmm11 ;\ + ADD2 %xmm3, %xmm15 ;\ + movddup -7 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm4, %xmm2 + +#define KERNEL_SUB3(xx) \ + mulpd %xmm5, %xmm4 ;\ + mulpd -6 * SIZE(AO, %rax, 4), 
%xmm5 ;\ + ADD1 %xmm4, %xmm8 ;\ + movapd %xmm2, %xmm4 ;\ + ADD1 %xmm5, %xmm12 ;\ + movddup -6 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ + ADD2 %xmm2, %xmm9 ;\ + movapd %xmm4, %xmm2 ;\ + ADD2 %xmm3, %xmm13 ;\ + movddup -5 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm4 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ + ADD1 %xmm4, %xmm10 ;\ + movapd -4 * SIZE(AO, %rax, 4), %xmm4 ;\ + ADD1 %xmm5, %xmm14 ;\ + movddup -4 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ + ADD2 %xmm2, %xmm11 ;\ + ADD2 %xmm3, %xmm15 ;\ + movddup -3 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm4, %xmm2 + +#define KERNEL_SUB4(xx) \ + mulpd %xmm5, %xmm4 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ + ADD1 %xmm4, %xmm8 ;\ + movapd %xmm2, %xmm4 ;\ + ADD1 %xmm5, %xmm12 ;\ + movddup -2 * SIZE(BO, %rax, 4), %xmm5 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ + ADD2 %xmm2, %xmm9 ;\ + movapd %xmm4, %xmm2 ;\ + ADD2 %xmm3, %xmm13 ;\ + movddup -1 * SIZE(BO, %rax, 4), %xmm3 ;\ + mulpd %xmm5, %xmm4 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ + ADD1 %xmm4, %xmm10 ;\ + ADD1 %xmm5, %xmm14 ;\ + mulpd %xmm3, %xmm2 ;\ + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ + ADD2 %xmm2, %xmm11 ;\ + ADD2 %xmm3, %xmm15 ;\ + movddup 1 * SIZE(BO, %rax, 4), %xmm3 ;\ + movapd %xmm0, %xmm2 + +#if defined(OS_LINUX) && defined(CORE_BARCELONA) && !defined(TRMMKERNEL) + .align 32768 +#endif + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + movaps %xmm3, %xmm0 + movsd OLD_ALPHA_I, %xmm1 +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq OLD_M, M + movq OLD_N, N + + movlpd %xmm0, ALPHA_R + movlpd %xmm1, ALPHA_I + +#ifdef TRMMKERNEL + movlpd %xmm12, OFFSET + movlpd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + salq $ZBASE_SHIFT, LDC + + movq N, J + sarq $1, J + jle .L100 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + + movq A, AO # aoffset = a + + movq K, %rax + salq $ZBASE_SHIFT + 1, %rax + leaq (B, %rax), BB + + movq M, I + sarq $1, I # i = (m >> 2) + jle .L30 + ALIGN_4 + +.L10: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq B, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + movddup -16 * SIZE(BO), %xmm1 + pxor %xmm8, %xmm8 + movddup -15 * SIZE(BO), %xmm3 + pxor %xmm9, %xmm9 + movapd -8 * SIZE(AO), %xmm4 + pxor %xmm10, %xmm10 + movddup -8 * SIZE(BO), %xmm5 + pxor %xmm11, %xmm11 + + prefetchw 3 * SIZE(CO1) + pxor %xmm12, %xmm12 + pxor 
%xmm13, %xmm13 + prefetchw 7 * SIZE(CO2) + pxor %xmm14, %xmm14 + pxor %xmm15, %xmm15 + movapd %xmm0, %xmm2 + + prefetch -16 * SIZE(BB) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + andq $-8, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO + negq %rax + NOBRANCH + je .L15 + ALIGN_4 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + jl .L12 + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + testq $4, %rax + je .L16 + xorq %rax, %rax + ALIGN_4 + + KERNEL_SUB1(16 * 0) + KERNEL_SUB2(16 * 0) + KERNEL_SUB3(16 * 0) + KERNEL_SUB4(16 * 0) + + subq $-16 * SIZE, BO + subq $-16 * SIZE, AO + ALIGN_4 + +.L16: + movddup ALPHA_R, %xmm6 + movddup ALPHA_I, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + je .L19 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO + negq %rax + ALIGN_4 + +.L17: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + ADD1 %xmm0, %xmm8 + movapd %xmm2, %xmm0 + ADD1 %xmm1, %xmm12 + movddup -14 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm3, %xmm2 + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 + ADD2 %xmm2, %xmm9 + movapd %xmm0, %xmm2 + ADD2 %xmm3, %xmm13 + movddup -13 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + ADD1 %xmm0, %xmm10 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + ADD1 %xmm1, %xmm14 + movddup -12 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm3, %xmm2 + mulpd -14 * SIZE(AO, %rax, 4), %xmm3 + ADD2 %xmm2, %xmm11 + ADD2 %xmm3, %xmm15 + movddup -11 * SIZE(BO, %rax, 4), %xmm3 + movapd %xmm0, %xmm2 + + addq $SIZE, %rax + jl .L17 + ALIGN_4 + +.L19: + prefetch -8 * SIZE(BB) + subq $-16 * SIZE, BB + +#ifndef TRMMKERNEL + movupd (CO1), %xmm0 + movupd 2 * SIZE(CO1), %xmm2 + movupd (CO2), %xmm1 + movupd 2 * SIZE(CO2), %xmm3 +#endif + + SHUFPD_1 %xmm9, %xmm9 + SHUFPD_1 %xmm11, %xmm11 + SHUFPD_1 %xmm13, %xmm13 + SHUFPD_1 %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || 
defined(TC) + addsubpd %xmm9, %xmm8 + addsubpd %xmm11, %xmm10 + addsubpd %xmm13, %xmm12 + addsubpd %xmm15, %xmm14 + + pshufd $0x4e, %xmm8, %xmm9 + pshufd $0x4e, %xmm10, %xmm11 + pshufd $0x4e, %xmm12, %xmm13 + pshufd $0x4e, %xmm14, %xmm15 + +#else + addsubpd %xmm8, %xmm9 + addsubpd %xmm10, %xmm11 + addsubpd %xmm12, %xmm13 + addsubpd %xmm14, %xmm15 + + movapd %xmm9, %xmm8 + pshufd $0x4e, %xmm9, %xmm9 + movapd %xmm11, %xmm10 + pshufd $0x4e, %xmm11, %xmm11 + movapd %xmm13, %xmm12 + pshufd $0x4e, %xmm13, %xmm13 + movapd %xmm15, %xmm14 + pshufd $0x4e, %xmm15, %xmm15 +#endif + + mulpd %xmm6, %xmm8 + mulpd %xmm7, %xmm9 + mulpd %xmm6, %xmm10 + mulpd %xmm7, %xmm11 + + mulpd %xmm6, %xmm12 + mulpd %xmm7, %xmm13 + mulpd %xmm6, %xmm14 + mulpd %xmm7, %xmm15 + + addsubpd %xmm9, %xmm8 + addsubpd %xmm11, %xmm10 + addsubpd %xmm13, %xmm12 + addsubpd %xmm15, %xmm14 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 + addpd %xmm2, %xmm12 + addpd %xmm1, %xmm10 + addpd %xmm3, %xmm14 +#endif + + movlpd %xmm8, (CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movlpd %xmm12, 2 * SIZE(CO1) + movhpd %xmm12, 3 * SIZE(CO1) + + movlpd %xmm10, (CO2) + movhpd %xmm10, 1 * SIZE(CO2) + movlpd %xmm14, 2 * SIZE(CO2) + movhpd %xmm14, 3 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + decq I # i -- + jg .L10 + ALIGN_4 + +.L30: + testq $1, M + jle .L99 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq B, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd -12 * SIZE(AO), %xmm2 + pxor %xmm9, %xmm9 + movddup -16 * SIZE(BO), %xmm1 + pxor %xmm10, %xmm10 + movddup -15 * SIZE(BO), %xmm5 + pxor %xmm11, %xmm11 + movddup -8 * SIZE(BO), %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO + negq %rax + NOBRANCH + je .L46 + ALIGN_4 + +.L42: + mulpd %xmm0, %xmm1 + ADD1 %xmm1, %xmm8 + movddup -14 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + ADD2 %xmm5, %xmm9 + movddup -13 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm0, %xmm1 + ADD1 %xmm1, %xmm10 + movddup -12 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + movapd -14 * SIZE(AO, %rax, 2), %xmm0 + ADD2 %xmm5, %xmm11 + movddup -11 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm0, %xmm1 + ADD1 %xmm1, %xmm8 + movddup -10 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + ADD2 %xmm5, %xmm9 + movddup -9 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm0, %xmm1 + ADD1 %xmm1, %xmm10 + movddup (BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + movapd -8 * SIZE(AO, %rax, 2), %xmm0 + ADD2 %xmm5, %xmm11 + movddup -7 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm2, %xmm3 + ADD1 %xmm3, %xmm8 + movddup -6 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm2, %xmm5 + ADD2 %xmm5, %xmm9 + movddup -5 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm2, %xmm3 + ADD1 %xmm3, 
%xmm10 + movddup -4 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm2, %xmm5 + movapd -10 * SIZE(AO, %rax, 2), %xmm2 + ADD2 %xmm5, %xmm11 + movddup -3 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm2, %xmm3 + ADD1 %xmm3, %xmm8 + movddup -2 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm2, %xmm5 + ADD2 %xmm5, %xmm9 + movddup -1 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm2, %xmm3 + ADD1 %xmm3, %xmm10 + movddup 8 * SIZE(BO, %rax, 4), %xmm3 + mulpd %xmm2, %xmm5 + movapd -4 * SIZE(AO, %rax, 2), %xmm2 + ADD2 %xmm5, %xmm11 + movddup 1 * SIZE(BO, %rax, 4), %xmm5 + + addq $4 * SIZE, %rax + BRANCH + jl .L42 + ALIGN_4 + +.L46: + movddup ALPHA_R, %xmm6 + movddup ALPHA_I, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L49 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO + negq %rax + ALIGN_4 + +.L47: + mulpd %xmm0, %xmm1 + ADD1 %xmm1, %xmm8 + movddup -14 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + ADD2 %xmm5, %xmm9 + movddup -13 * SIZE(BO, %rax, 4), %xmm5 + mulpd %xmm0, %xmm1 + ADD1 %xmm1, %xmm10 + movddup -12 * SIZE(BO, %rax, 4), %xmm1 + mulpd %xmm0, %xmm5 + movapd -14 * SIZE(AO, %rax, 2), %xmm0 + ADD2 %xmm5, %xmm11 + movddup -11 * SIZE(BO, %rax, 4), %xmm5 + + addq $SIZE, %rax + jl .L47 + ALIGN_4 + +.L49: +#ifndef TRMMKERNEL + movupd (CO1), %xmm0 + movupd (CO2), %xmm1 +#endif + + SHUFPD_1 %xmm9, %xmm9 + SHUFPD_1 %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + addsubpd %xmm9, %xmm8 + addsubpd %xmm11, %xmm10 + + pshufd $0x4e, %xmm8, %xmm9 + pshufd $0x4e, %xmm10, %xmm11 +#else + addsubpd %xmm8, %xmm9 + addsubpd %xmm10, %xmm11 + + movapd %xmm9, %xmm8 + pshufd $0x4e, %xmm9, %xmm9 + movapd %xmm11, %xmm10 + pshufd $0x4e, %xmm11, %xmm11 +#endif + + mulpd %xmm6, %xmm8 + mulpd %xmm7, %xmm9 + mulpd %xmm6, %xmm10 + mulpd %xmm7, %xmm11 + + addsubpd %xmm9, %xmm8 + addsubpd %xmm11, %xmm10 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm10 +#endif + + movlpd %xmm8, (CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movlpd %xmm10, (CO2) + movhpd %xmm10, 1 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + movq BO, B + + leaq (C, LDC, 2), C # c += 2 * ldc + decq J # j -- + jg .L01 + +.L100: + testq $1, N + jle .L999 + +.L101: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 # coffset1 = c + movq A, AO # aoffset = a + + movq M, I + sarq $1, I # i = (m >> 2) + jle .L130 + ALIGN_4 + +.L110: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq B, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + + movddup -16 * SIZE(BO), %xmm1 + movddup -15 * SIZE(BO), %xmm5 + pxor %xmm8, %xmm8 + movddup -12 * SIZE(BO), %xmm3 + pxor %xmm9, %xmm9 + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm12, %xmm12 + movapd -8 * SIZE(AO), %xmm4 + pxor %xmm13, %xmm13 + prefetchw 3 * SIZE(CO1) + movapd %xmm0, %xmm2 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || 
(!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO + negq %rax + NOBRANCH + je .L116 + ALIGN_4 + +.L112: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + ADD1 %xmm0, %xmm8 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + ADD1 %xmm1, %xmm12 + movddup -14 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm5, %xmm2 + mulpd -14 * SIZE(AO, %rax, 4), %xmm5 + ADD2 %xmm2, %xmm9 + ADD2 %xmm5, %xmm13 + movddup -13 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm0, %xmm2 + mulpd %xmm1, %xmm0 + mulpd -10 * SIZE(AO, %rax, 4), %xmm1 + ADD1 %xmm0, %xmm8 + movapd (AO, %rax, 4), %xmm0 + ADD1 %xmm1, %xmm12 + movddup -8 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm5, %xmm2 + mulpd -10 * SIZE(AO, %rax, 4), %xmm5 + ADD2 %xmm2, %xmm9 + ADD2 %xmm5, %xmm13 + movddup -11 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm4, %xmm2 + mulpd %xmm3, %xmm4 + mulpd -6 * SIZE(AO, %rax, 4), %xmm3 + ADD1 %xmm4, %xmm8 + movapd -4 * SIZE(AO, %rax, 4), %xmm4 + ADD1 %xmm3, %xmm12 + movddup -10 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm5, %xmm2 + mulpd -6 * SIZE(AO, %rax, 4), %xmm5 + ADD2 %xmm2, %xmm9 + ADD2 %xmm5, %xmm13 + movddup -9 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm4, %xmm2 + mulpd %xmm3, %xmm4 + mulpd -2 * SIZE(AO, %rax, 4), %xmm3 + ADD1 %xmm4, %xmm8 + movapd 8 * SIZE(AO, %rax, 4), %xmm4 + ADD1 %xmm3, %xmm12 + movddup -4 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm5, %xmm2 + mulpd -2 * SIZE(AO, %rax, 4), %xmm5 + ADD2 %xmm2, %xmm9 + ADD2 %xmm5, %xmm13 + movddup -7 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm0, %xmm2 + + addq $4 * SIZE, %rax + BRANCH + jl .L112 + ALIGN_4 + +.L116: + movddup ALPHA_R, %xmm6 + movddup ALPHA_I, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L119 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO + negq %rax + ALIGN_4 + +.L117: + mulpd %xmm1, %xmm0 + mulpd -14 * SIZE(AO, %rax, 4), %xmm1 + ADD1 %xmm0, %xmm8 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + ADD1 %xmm1, %xmm12 + movddup -14 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm5, %xmm2 + mulpd -14 * SIZE(AO, %rax, 4), %xmm5 + ADD2 %xmm2, %xmm9 + ADD2 %xmm5, %xmm13 + movddup -13 * SIZE(BO, %rax, 2), %xmm5 + movapd %xmm0, %xmm2 + + addq $SIZE, %rax + jl .L117 + ALIGN_4 + +.L119: +#ifndef TRMMKERNEL + movupd (CO1), %xmm0 + movupd 2 * SIZE(CO1), %xmm2 +#endif + + SHUFPD_1 %xmm9, %xmm9 + SHUFPD_1 %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + addsubpd %xmm9, %xmm8 + addsubpd %xmm13, %xmm12 + + pshufd $0x4e, %xmm8, %xmm9 + pshufd $0x4e, %xmm12, %xmm13 + +#else + addsubpd %xmm8, %xmm9 + addsubpd %xmm12, %xmm13 + + movapd %xmm9, %xmm8 + pshufd $0x4e, %xmm9, %xmm9 + movapd %xmm13, %xmm12 + pshufd $0x4e, %xmm13, %xmm13 +#endif + + mulpd %xmm6, %xmm8 + mulpd %xmm7, %xmm9 + mulpd %xmm6, %xmm12 + mulpd %xmm7, %xmm13 + + addsubpd %xmm9, %xmm8 + addsubpd %xmm13, %xmm12 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 + addpd %xmm2, %xmm12 +#endif + + movlpd %xmm8, (CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movlpd %xmm12, 2 * SIZE(CO1) + movhpd %xmm12, 3 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO 
+#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L110 + ALIGN_4 + +.L130: + testq $1, M + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq B, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd -12 * SIZE(AO), %xmm2 + pxor %xmm9, %xmm9 + movddup -16 * SIZE(BO), %xmm1 + pxor %xmm10, %xmm10 + movddup -15 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO + negq %rax + NOBRANCH + je .L146 + ALIGN_4 + +.L142: + mulpd %xmm0, %xmm1 + ADD1 %xmm1, %xmm8 + movddup -14 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm0, %xmm3 + movapd -14 * SIZE(AO, %rax, 2), %xmm0 + ADD2 %xmm3, %xmm9 + movddup -13 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm0, %xmm1 + ADD1 %xmm1, %xmm10 + movddup -12 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm0, %xmm3 + movapd -8 * SIZE(AO, %rax, 2), %xmm0 + ADD2 %xmm3, %xmm11 + movddup -11 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm2, %xmm1 + ADD1 %xmm1, %xmm8 + movddup -10 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm2, %xmm3 + movapd -10 * SIZE(AO, %rax, 2), %xmm2 + ADD2 %xmm3, %xmm9 + movddup -9 * SIZE(BO, %rax, 2), %xmm3 + mulpd %xmm2, %xmm1 + ADD1 %xmm1, %xmm10 + movddup -8 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm2, %xmm3 + movapd -4 * SIZE(AO, %rax, 2), %xmm2 + ADD2 %xmm3, %xmm11 + movddup -7 * SIZE(BO, %rax, 2), %xmm3 + + addq $4 * SIZE, %rax + BRANCH + jl .L142 + ALIGN_4 + +.L146: + movddup ALPHA_R, %xmm6 + movddup ALPHA_I, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L148 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO + negq %rax + ALIGN_4 + +.L147: + mulpd %xmm0, %xmm1 + ADD1 %xmm1, %xmm8 + movddup -14 * SIZE(BO, %rax, 2), %xmm1 + mulpd %xmm0, %xmm3 + movapd -14 * SIZE(AO, %rax, 2), %xmm0 + ADD2 %xmm3, %xmm9 + movddup -13 * SIZE(BO, %rax, 2), %xmm3 + + addq $SIZE, %rax + jl .L147 + ALIGN_4 + +.L148: +#ifndef TRMMKERNEL + movupd (CO1), %xmm0 +#endif + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + + SHUFPD_1 %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + addsubpd %xmm9, %xmm8 + pshufd $0x4e, %xmm8, %xmm9 +#else + addsubpd %xmm8, %xmm9 + movapd %xmm9, %xmm8 + pshufd $0x4e, %xmm9, %xmm9 +#endif + + mulpd %xmm6, %xmm8 + mulpd %xmm7, %xmm9 + + addsubpd %xmm9, %xmm8 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 +#endif + + movlpd %xmm8, (CO1) + movhpd %xmm8, 1 * SIZE(CO1) + ALIGN_4 + +.L999: + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 
208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm_kernel_2x2_core2.S b/kernel/x86_64/zgemm_kernel_2x2_core2.S new file mode 100644 index 0000000000..799c151034 --- /dev/null +++ b/kernel/x86_64/zgemm_kernel_2x2_core2.S @@ -0,0 +1,1353 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define ALPHA_R 0(%rsp) +#define ALPHA_I 16(%rsp) +#define J 32(%rsp) +#define OFFSET 40(%rsp) +#define KK 48(%rsp) +#define KKK 56(%rsp) +#define BUFFER 128(%rsp) + +#define PREFETCH_R (8 * 4 + 0) +#define PREFETCH_W (PREFETCH_R * 2) + +#define PREFETCHSIZE (8 * 13 + 5) +#define PREFETCH prefetcht0 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define ADD1 addpd +#define ADD2 addpd +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define ADD1 addpd +#define ADD2 subpd +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define ADD1 subpd +#define ADD2 addpd +#else +#define ADD1 subpd +#define ADD2 subpd +#endif + +#define ADDSUB subpd + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + movaps %xmm3, %xmm0 + movsd OLD_ALPHA_I, %xmm1 +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + +#endif + + movq %rsp, %r15 # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movddup %xmm0, %xmm0 + movddup %xmm1, %xmm1 + + movapd %xmm0, ALPHA_R + movapd %xmm1, ALPHA_I + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + movq OLD_M, M + movq OLD_N, N + +#ifdef TRMMKERNEL + movsd %xmm12, OFFSET + movsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + salq $ZBASE_SHIFT, LDC + + movq N, J + sarq $1, J # j = (n >> 2) + NOBRANCH + jle .L100 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq 16 * SIZE + BUFFER, BO + + movapd -16 * SIZE(B), %xmm0 + movapd -8 * SIZE(B), %xmm4 + + movq K, %rax + sarq $2, %rax + jle .L03 + ALIGN_3 + +.L02: + prefetcht0 (PREFETCH_R + 0) * SIZE(B) + prefetcht0 (PREFETCH_R + 8) * SIZE(B) + + movapd -14 * SIZE(B), %xmm1 + movapd -12 * SIZE(B), %xmm2 + movapd -10 * SIZE(B), %xmm3 + movapd -6 * SIZE(B), %xmm5 + movapd -4 * SIZE(B), %xmm6 + movapd -2 * SIZE(B), %xmm7 + + movddup %xmm0, %xmm8 + movapd %xmm8, -16 * SIZE(BO) + unpckhpd %xmm0, %xmm0 + movapd %xmm0, -14 * SIZE(BO) + movapd 0 * SIZE(B), %xmm0 + + prefetcht0 (PREFETCH_W + 0) * SIZE(BO) + movddup %xmm1, 
%xmm9 + movapd %xmm9, -12 * SIZE(BO) + unpckhpd %xmm1, %xmm1 + movapd %xmm1, -10 * SIZE(BO) + movddup %xmm2, %xmm10 + movapd %xmm10, -8 * SIZE(BO) + + prefetcht0 (PREFETCH_W + 8) * SIZE(BO) + unpckhpd %xmm2, %xmm2 + movapd %xmm2, -6 * SIZE(BO) + movddup %xmm3, %xmm11 + movapd %xmm11, -4 * SIZE(BO) + unpckhpd %xmm3, %xmm3 + movapd %xmm3, -2 * SIZE(BO) + + prefetcht0 (PREFETCH_W + 16) * SIZE(BO) + + movddup %xmm4, %xmm12 + movapd %xmm12, 0 * SIZE(BO) + unpckhpd %xmm4, %xmm4 + movapd %xmm4, 2 * SIZE(BO) + movapd 8 * SIZE(B), %xmm4 + movddup %xmm5, %xmm13 + movapd %xmm13, 4 * SIZE(BO) + unpckhpd %xmm5, %xmm5 + movapd %xmm5, 6 * SIZE(BO) + + prefetcht0 (PREFETCH_W + 24) * SIZE(BO) + movddup %xmm6, %xmm14 + movapd %xmm14, 8 * SIZE(BO) + unpckhpd %xmm6, %xmm6 + movapd %xmm6, 10 * SIZE(BO) + movddup %xmm7, %xmm15 + movapd %xmm15, 12 * SIZE(BO) + unpckhpd %xmm7, %xmm7 + movapd %xmm7, 14 * SIZE(BO) + + subq $-32 * SIZE, BO + subq $-16 * SIZE, B + decq %rax + jne .L02 + ALIGN_3 + +.L03: + movq K, %rax + andq $3, %rax + BRANCH + jle .L05 + ALIGN_3 + +.L04: + movapd -14 * SIZE(B), %xmm1 + + movddup %xmm0, %xmm8 + unpckhpd %xmm0, %xmm0 + movddup %xmm1, %xmm9 + unpckhpd %xmm1, %xmm1 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm0, -14 * SIZE(BO) + movapd -12 * SIZE(B), %xmm0 + + movapd %xmm9, -12 * SIZE(BO) + movapd %xmm1, -10 * SIZE(BO) + + addq $ 4 * SIZE, B + addq $ 8 * SIZE, BO + + decq %rax + jne .L04 + ALIGN_3 + +.L05: + leaq (PREFETCH_R + 0) * SIZE(B), BB + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + movq M, I + sarq $1, I # i = (m >> 2) + jle .L30 + ALIGN_4 + +.L10: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 19 * SIZE + BUFFER, BO +#else + leaq 19 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + movaps -14 * SIZE(AO), %xmm1 + movaps -19 * SIZE(BO), %xmm6 + movaps -17 * SIZE(BO), %xmm7 + + prefetcht2 0 * SIZE(BB) + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + prefetcht2 8 * SIZE(BB) + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + + pxor %xmm12, %xmm12 + prefetcht0 3 * SIZE(CO1) + pxor %xmm13, %xmm13 + pxor %xmm14, %xmm14 + pxor %xmm15, %xmm15 + + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + prefetcht0 3 * SIZE(CO2) + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + + subq $-16 * SIZE, BB + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L15 + ALIGN_4 + +.L12: + PADDING; + ADD1 %xmm2, %xmm10 + movaps -15 * SIZE(BO), %xmm2 + PADDING; + ADD1 %xmm3, %xmm14 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movaps %xmm6, %xmm3 + mulpd %xmm0, %xmm6 + mulpd %xmm1, %xmm3 + + ADD2 %xmm4, %xmm11 + movaps -13 * SIZE(BO), %xmm4 + ADD2 %xmm5, %xmm15 + movaps %xmm7, %xmm5 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm5 + + ADD1 %xmm6, %xmm8 + movaps -11 * SIZE(BO), %xmm6 + ADD1 %xmm3, %xmm12 + movaps %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + + ADD2 %xmm7, %xmm9 + movaps -9 * SIZE(BO), %xmm7 + ADD2 %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + movaps -10 * SIZE(AO), %xmm1 + + ADD1 %xmm2, %xmm10 + movaps -7 * SIZE(BO), %xmm2 
+ ADD1 %xmm3, %xmm14 + movaps %xmm6, %xmm3 + mulpd %xmm0, %xmm6 + mulpd %xmm1, %xmm3 + + ADD2 %xmm4, %xmm11 + movaps -5 * SIZE(BO), %xmm4 + ADD2 %xmm5, %xmm15 + movaps %xmm7, %xmm5 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm5 + + ADD1 %xmm6, %xmm8 + movaps -3 * SIZE(BO), %xmm6 + ADD1 %xmm3, %xmm12 + movaps %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + + ADD2 %xmm7, %xmm9 + movaps -1 * SIZE(BO), %xmm7 + ADD2 %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + movaps -6 * SIZE(AO), %xmm1 + + ADD1 %xmm2, %xmm10 + movaps 1 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm14 + movaps %xmm6, %xmm3 + mulpd %xmm0, %xmm6 + mulpd %xmm1, %xmm3 + + ADD2 %xmm4, %xmm11 + movaps 3 * SIZE(BO), %xmm4 + ADD2 %xmm5, %xmm15 + PADDING + movaps %xmm7, %xmm5 + mulpd %xmm1, %xmm5 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm0, %xmm7 + + ADD1 %xmm6, %xmm8 + movaps 5 * SIZE(BO), %xmm6 + ADD1 %xmm3, %xmm12 + movaps %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + + ADD2 %xmm7, %xmm9 + movaps 7 * SIZE(BO), %xmm7 + ADD2 %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movaps -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + movaps -2 * SIZE(AO), %xmm1 + + ADD1 %xmm2, %xmm10 + movaps 9 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm14 + movaps %xmm6, %xmm3 + mulpd %xmm0, %xmm6 + mulpd %xmm1, %xmm3 + + ADD2 %xmm4, %xmm11 + subq $-16 * SIZE, AO + movaps 11 * SIZE(BO), %xmm4 + ADD2 %xmm5, %xmm15 + movaps %xmm7, %xmm5 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm5 + + ADD1 %xmm6, %xmm8 + movaps 13 * SIZE(BO), %xmm6 + ADD1 %xmm3, %xmm12 + movaps %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + + ADD2 %xmm7, %xmm9 + movaps 15 * SIZE(BO), %xmm7 + ADD2 %xmm5, %xmm13 + subq $-32 * SIZE, BO + movaps %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movaps -16 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + movaps -14 * SIZE(AO), %xmm1 + + subq $1, %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: + prefetcht2 -8 * SIZE(BB) + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + BRANCH + BRANCH + je .L19 + ALIGN_4 + +.L16: + ADD1 %xmm2, %xmm10 + movaps -15 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm14 + movaps %xmm6, %xmm3 + mulpd %xmm0, %xmm6 + mulpd %xmm1, %xmm3 + + ADD2 %xmm4, %xmm11 + movaps -13 * SIZE(BO), %xmm4 + ADD2 %xmm5, %xmm15 + movaps %xmm7, %xmm5 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm5 + + ADD1 %xmm6, %xmm8 + movaps -11 * SIZE(BO), %xmm6 + ADD1 %xmm3, %xmm12 + addq $4 * SIZE, AO + movaps %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + + ADD2 %xmm7, %xmm9 + movaps -9 * SIZE(BO), %xmm7 + ADD2 %xmm5, %xmm13 + addq $8 * SIZE, BO + movaps %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + movaps -16 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm5 + movaps -14 * SIZE(AO), %xmm1 + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_3 + +.L19: + movapd ALPHA_R, %xmm6 + ADD1 %xmm2, %xmm10 + ADD1 %xmm3, %xmm14 + movapd ALPHA_I, %xmm7 + ADD2 %xmm4, %xmm11 + ADD2 %xmm5, %xmm15 + + SHUFPD_1 %xmm9, %xmm9 + SHUFPD_1 %xmm11, %xmm11 + SHUFPD_1 %xmm13, %xmm13 + SHUFPD_1 %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + addsubpd %xmm9, %xmm8 + addsubpd %xmm11, %xmm10 + addsubpd %xmm13, %xmm12 + addsubpd %xmm15, %xmm14 + + movapd %xmm8, %xmm9 + movapd %xmm10, %xmm11 + movapd %xmm12, %xmm13 + movapd %xmm14, %xmm15 +#else + addsubpd %xmm8, %xmm9 + addsubpd %xmm10, %xmm11 + addsubpd %xmm12, %xmm13 + addsubpd %xmm14, %xmm15 + + movapd %xmm9, %xmm8 + movapd %xmm11, %xmm10 + movapd %xmm13, %xmm12 + movapd 
%xmm15, %xmm14 +#endif + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm2 + movhpd 3 * SIZE(CO1), %xmm2 + + movsd 0 * SIZE(CO2), %xmm1 + movhpd 1 * SIZE(CO2), %xmm1 + movsd 2 * SIZE(CO2), %xmm3 + movhpd 3 * SIZE(CO2), %xmm3 +#endif + + SHUFPD_1 %xmm9, %xmm9 + SHUFPD_1 %xmm11, %xmm11 + SHUFPD_1 %xmm13, %xmm13 + SHUFPD_1 %xmm15, %xmm15 + + mulpd %xmm6, %xmm8 + mulpd %xmm6, %xmm10 + mulpd %xmm6, %xmm12 + mulpd %xmm6, %xmm14 + + mulpd %xmm7, %xmm9 + mulpd %xmm7, %xmm11 + mulpd %xmm7, %xmm13 + mulpd %xmm7, %xmm15 + + addsubpd %xmm9, %xmm8 + addsubpd %xmm11, %xmm10 + addsubpd %xmm13, %xmm12 + addsubpd %xmm15, %xmm14 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm10 + addpd %xmm2, %xmm12 + addpd %xmm3, %xmm14 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm12, 2 * SIZE(CO1) + movhpd %xmm12, 3 * SIZE(CO1) + + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) + movsd %xmm14, 2 * SIZE(CO2) + movhpd %xmm14, 3 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + decq I # i -- + jg .L10 + ALIGN_4 + +.L30: + testq $1, M + jle .L99 + +.L40: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 16 * SIZE + BUFFER, BO +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + je .L42 + +.L41: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + movapd -12 * SIZE(BO), %xmm4 + movapd -10 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + movapd -14 * SIZE(AO), %xmm0 + movapd -8 * SIZE(BO), %xmm2 + movapd -6 * SIZE(BO), %xmm3 + movapd -4 * SIZE(BO), %xmm4 + movapd -2 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + movapd -12 * SIZE(AO), %xmm0 + movapd 0 * SIZE(BO), %xmm2 + movapd 2 * SIZE(BO), %xmm3 + movapd 4 * SIZE(BO), %xmm4 + movapd 6 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + movapd -10 * SIZE(AO), %xmm0 + movapd 8 * SIZE(BO), %xmm2 + movapd 10 * SIZE(BO), %xmm3 + movapd 12 * SIZE(BO), %xmm4 + movapd 14 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + ADD1 %xmm2, 
%xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + subq $ -8 * SIZE, AO + subq $-32 * SIZE, BO + subq $1, %rax + jne .L41 + +.L42: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + jle .L44 + +.L43: + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + movapd -12 * SIZE(BO), %xmm4 + movapd -10 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + jg .L43 + ALIGN_4 + +.L44: + movapd ALPHA_R, %xmm6 + movapd ALPHA_I, %xmm7 + + SHUFPD_1 %xmm9, %xmm9 + SHUFPD_1 %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + addsubpd %xmm9, %xmm8 + addsubpd %xmm11, %xmm10 + + movapd %xmm8, %xmm9 + movapd %xmm10, %xmm11 +#else + addsubpd %xmm8, %xmm9 + addsubpd %xmm10, %xmm11 + + movapd %xmm9, %xmm8 + movapd %xmm11, %xmm10 +#endif + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + + movsd 0 * SIZE(CO2), %xmm1 + movhpd 1 * SIZE(CO2), %xmm1 +#endif + + SHUFPD_1 %xmm9, %xmm9 + SHUFPD_1 %xmm11, %xmm11 + + mulpd %xmm6, %xmm8 + mulpd %xmm6, %xmm10 + + mulpd %xmm7, %xmm9 + mulpd %xmm7, %xmm11 + + addsubpd %xmm9, %xmm8 + addsubpd %xmm11, %xmm10 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm10 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leaq (C, LDC, 2), C # c += 2 * ldc + decq J # j -- + jg .L01 + +.L100: + testq $1, N + jle .L999 + +.L101: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + + movq K, %rax + sarq $2, %rax + jle .L103 + ALIGN_4 + +.L102: + movddup -16 * SIZE(B), %xmm8 + movddup -15 * SIZE(B), %xmm9 + movddup -14 * SIZE(B), %xmm10 + movddup -13 * SIZE(B), %xmm11 + movddup -12 * SIZE(B), %xmm12 + movddup -11 * SIZE(B), %xmm13 + movddup -10 * SIZE(B), %xmm14 + movddup -9 * SIZE(B), %xmm15 + + movapd %xmm8, 0 * SIZE(BO) + movapd %xmm9, 2 * SIZE(BO) + movapd %xmm10, 4 * SIZE(BO) + movapd %xmm11, 6 * SIZE(BO) + movapd %xmm12, 8 * SIZE(BO) + movapd %xmm13, 10 * SIZE(BO) + movapd %xmm14, 12 * SIZE(BO) + movapd %xmm15, 14 * SIZE(BO) + + addq $ 8 * SIZE, B + subq $-16 * SIZE, BO + decq %rax + jne .L102 + ALIGN_4 + +.L103: + movq K, %rax + andq $3, %rax + BRANCH + jle .L105 + ALIGN_4 + +.L104: + movddup -16 * SIZE(B), %xmm8 + movddup -15 * SIZE(B), %xmm9 + + movapd %xmm8, 0 * SIZE(BO) + movapd %xmm9, 2 * SIZE(BO) + + addq $4 * SIZE, BO + addq $2 * SIZE, B + decq %rax + jne .L104 + ALIGN_4 + +.L105: + movq C, CO1 # coffset1 = c + movq A, AO # aoffset = a + + movq M, I + sarq $1, I # i = (m >> 2) + jle .L130 + ALIGN_4 + +.L110: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) 
&& !defined(LEFT) && !defined(TRANSA)) + + leaq 16 * SIZE + BUFFER, BO +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 + prefetcht0 3 * SIZE(CO1) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + je .L112 + +.L111: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + + movapd -16 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -14 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + + movapd -12 * SIZE(AO), %xmm0 + movapd -10 * SIZE(AO), %xmm1 + + movapd -12 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -10 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + + movapd -8 * SIZE(AO), %xmm0 + movapd -6 * SIZE(AO), %xmm1 + + movapd -8 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -6 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + + movapd -4 * SIZE(AO), %xmm0 + movapd -2 * SIZE(AO), %xmm1 + + movapd -4 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -2 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + + subq $-16 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jne .L111 + ALIGN_4 + +.L112: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + jle .L114 + +.L113: + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + + movapd -16 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -14 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L113 + ALIGN_4 + +.L114: + movapd ALPHA_R, %xmm6 + movapd ALPHA_I, %xmm7 + + SHUFPD_1 %xmm9, %xmm9 + SHUFPD_1 %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + addsubpd %xmm9, %xmm8 + addsubpd %xmm13, %xmm12 + + movapd %xmm8, %xmm9 + movapd %xmm12, %xmm13 +#else + addsubpd %xmm8, %xmm9 + addsubpd %xmm12, %xmm13 + + movapd %xmm9, %xmm8 + movapd %xmm13, %xmm12 +#endif + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm2 + movhpd 3 * SIZE(CO1), %xmm2 +#endif + + SHUFPD_1 %xmm9, %xmm9 + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm6, %xmm8 + mulpd %xmm6, %xmm12 + + mulpd %xmm7, %xmm9 + mulpd %xmm7, %xmm13 + + addsubpd %xmm9, %xmm8 + addsubpd %xmm13, %xmm12 + +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + addpd %xmm0, %xmm8 + addpd %xmm2, %xmm12 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm12, 2 * SIZE(CO1) + movhpd %xmm12, 3 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L110 + ALIGN_4 + +.L130: + testq $1, M + jle .L999 + ALIGN_4 + +.L140: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 16 * SIZE + BUFFER, BO +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + je .L142 + +.L141: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + movapd -12 * SIZE(BO), %xmm4 + movapd -10 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + movapd -12 * SIZE(AO), %xmm0 + movapd -10 * SIZE(AO), %xmm1 + movapd -8 * SIZE(BO), %xmm2 + movapd -6 * SIZE(BO), %xmm3 + movapd -4 * SIZE(BO), %xmm4 + movapd -2 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + subq $ -8 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jne .L141 + +.L142: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + jle .L144 + +.L143: + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L143 + ALIGN_4 + +.L144: + movapd ALPHA_R, %xmm6 + movapd ALPHA_I, %xmm7 + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + + SHUFPD_1 %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + addsubpd %xmm9, %xmm8 + movapd %xmm8, %xmm9 +#else + addsubpd %xmm8, %xmm9 + movapd %xmm9, %xmm8 +#endif + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 +#endif + + SHUFPD_1 %xmm9, %xmm9 + mulpd %xmm6, %xmm8 + mulpd %xmm7, %xmm9 + addsubpd %xmm9, %xmm8 + +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + addpd %xmm0, %xmm8 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + ALIGN_4 + +.L999: + movq %r15, %rsp + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm_kernel_2x2_penryn.S b/kernel/x86_64/zgemm_kernel_2x2_penryn.S new file mode 100644 index 0000000000..751110fd1c --- /dev/null +++ b/kernel/x86_64/zgemm_kernel_2x2_penryn.S @@ -0,0 +1,1297 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define BB %r12 + +#define PREA %rdx + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define J 64(%rsp) +#define OFFSET 72(%rsp) +#define KK 80(%rsp) +#define KKK 88(%rsp) + +#else + +#define STACKSIZE 512 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#define ALPHA_R 224(%rsp) +#define ALPHA_I 232(%rsp) +#define J 240(%rsp) +#define OFFSET 248(%rsp) +#define KK 256(%rsp) +#define KKK 264(%rsp) + +#endif + +#ifdef NANO +#define PREFETCHSIZE (8 * 2 + 4) +#define PREFETCHW prefetcht0 +#define PREFETCHB prefetcht0 +#endif + +#ifdef DUNNINGTON +#define PREFETCHSIZE (8 * 81 + 4) +#endif + +#ifndef PREFETCH +#define PREFETCH prefetcht0 +#endif + +#ifndef PREFETCHW +#define PREFETCHW prefetcht2 +#endif + +#ifndef PREFETCHB +#define PREFETCHB prefetcht0 +#endif + +#ifndef PREFETCHSIZE +#define PREFETCHSIZE (8 * 17 + 4) +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define ADD1 addpd +#define ADD2 addpd +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define ADD1 addpd +#define ADD2 addpd +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define ADD1 addpd +#define ADD2 addpd +#else +#define ADD1 addpd +#define ADD2 subpd +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movq OLD_OFFSET, %r11 +#endif + movaps %xmm3, %xmm0 + movsd OLD_ALPHA_I, %xmm1 +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movq OLD_OFFSET, %r11 +#endif + +#endif + + movlps %xmm0, ALPHA_R + movlps %xmm1, ALPHA_I + + subq $-16 * SIZE, A + subq $-17 * SIZE, B + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + salq $ZBASE_SHIFT, LDC + +#ifdef TRMMKERNEL + movq %r11, OFFSET +#ifndef LEFT + negq %r11 +#endif + movq %r11, KK +#endif + + movq N, J + sarq $1, J + NOBRANCH + jle .L40 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 + movq A, AO + + movq K, %rax + salq $ZBASE_SHIFT + 1, %rax + leaq (B, %rax), BB + + movq M, I + sarq $1, I + NOBRANCH + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, 
%rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + xorpd %xmm3, %xmm3 + movaps -14 * SIZE(AO), %xmm1 + xorpd %xmm4, %xmm4 + movaps -17 * SIZE(BO), %xmm2 + + PREFETCHB -16 * SIZE(BB) + + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + + movaps %xmm4, %xmm8 + movaps %xmm4, %xmm9 + PREFETCHW 3 * SIZE(CO1) + movaps %xmm4, %xmm10 + movaps %xmm4, %xmm11 + + movaps %xmm4, %xmm12 + movaps %xmm4, %xmm13 + PREFETCHW 3 * SIZE(CO2) + movaps %xmm4, %xmm14 + movaps %xmm4, %xmm15 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + ADD1 %xmm3, %xmm12 + movaps -15 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps -13 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + ADD1 %xmm3, %xmm12 + movaps -11 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps -9 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -6 * SIZE(AO), %xmm1 + + ADD1 %xmm3, %xmm12 + movaps -7 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + PADDING + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + + ADD1 %xmm2, %xmm8 + movaps -5 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -2 * SIZE(AO), %xmm1 + + ADD1 %xmm3, %xmm12 + subq $-16 * SIZE, AO + movaps -3 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps -1 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + subq $-16 * SIZE, BO + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -16 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -14 * SIZE(AO), %xmm1 + + subq $1, %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: + PREFETCHB -8 * SIZE(BB) +#ifdef DUNNINGTON + 
PREFETCHB 0 * SIZE(BB) + PREFETCHB 8 * SIZE(BB) +#endif + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + ADD1 %xmm3, %xmm12 + movaps -15 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps -13 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_3 + +.L18: +#ifndef DUNNINGTON + subq $-16 * SIZE, BB +#else + subq $-32 * SIZE, BB +#endif + + ADD1 %xmm3, %xmm12 + pcmpeqb %xmm0, %xmm0 + ADD1 %xmm4, %xmm14 + psllq $63, %xmm0 + ADD2 %xmm5, %xmm13 + movddup ALPHA_R, %xmm2 + ADD2 %xmm6, %xmm15 + movddup ALPHA_I, %xmm3 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + shufps $0x40, %xmm0, %xmm0 + + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm10 + xorps %xmm0, %xmm12 + xorps %xmm0, %xmm14 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + shufps $0x04, %xmm0, %xmm0 + + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 + xorps %xmm0, %xmm13 + xorps %xmm0, %xmm15 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + shufps $0x40, %xmm0, %xmm0 + + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 + xorps %xmm0, %xmm13 + xorps %xmm0, %xmm15 +#endif + + haddpd %xmm9, %xmm8 + haddpd %xmm11, %xmm10 + haddpd %xmm13, %xmm12 + haddpd %xmm15, %xmm14 + + pshufd $0x4e, %xmm8, %xmm9 + pshufd $0x4e, %xmm10, %xmm11 + pshufd $0x4e, %xmm12, %xmm13 + pshufd $0x4e, %xmm14, %xmm15 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm9 + mulpd %xmm2, %xmm10 + mulpd %xmm3, %xmm11 + + mulpd %xmm2, %xmm12 + mulpd %xmm3, %xmm13 + mulpd %xmm2, %xmm14 + mulpd %xmm3, %xmm15 + + addsubpd %xmm9, %xmm8 + addsubpd %xmm11, %xmm10 + addsubpd %xmm13, %xmm12 + addsubpd %xmm15, %xmm14 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 + movsd 0 * SIZE(CO2), %xmm2 + movhpd 1 * SIZE(CO2), %xmm2 + movsd 2 * SIZE(CO2), %xmm3 + movhpd 3 * SIZE(CO2), %xmm3 + + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm10 + addpd %xmm2, %xmm12 + addpd %xmm3, %xmm14 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm10, 2 * SIZE(CO1) + movhpd %xmm10, 3 * SIZE(CO1) + movsd %xmm12, 0 * SIZE(CO2) + movhpd %xmm12, 1 * SIZE(CO2) + movsd %xmm14, 2 * SIZE(CO2) + movhpd %xmm14, 3 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + decq I # i -- + BRANCH + jg .L11 + ALIGN_4 + +.L20: + testq $1, M + BRANCH + jle .L39 + ALIGN_4 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + 
movq B, BO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + movaps -17 * SIZE(BO), %xmm2 + movaps -15 * SIZE(BO), %xmm3 + + xorps %xmm3, %xmm3 + xorps %xmm5, %xmm5 + + movaps %xmm3, %xmm8 + movaps %xmm3, %xmm9 + movaps %xmm3, %xmm12 + movaps %xmm3, %xmm13 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_4 + +.L22: + ADD1 %xmm3, %xmm12 + movaps -15 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd %xmm0, %xmm2 + ADD2 %xmm5, %xmm13 + mulpd %xmm0, %xmm7 + + ADD1 %xmm2, %xmm8 + movaps -13 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + ADD2 %xmm7, %xmm9 + mulpd %xmm0, %xmm5 + movaps -14 * SIZE(AO), %xmm0 + + ADD1 %xmm3, %xmm12 + movaps -11 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + ADD2 %xmm5, %xmm13 + mulpd %xmm0, %xmm7 + + ADD1 %xmm2, %xmm8 + movaps -9 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + ADD2 %xmm7, %xmm9 + mulpd %xmm0, %xmm5 + movaps -12 * SIZE(AO), %xmm0 + + ADD1 %xmm3, %xmm12 + movaps -7 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + ADD2 %xmm5, %xmm13 + mulpd %xmm0, %xmm7 + + ADD1 %xmm2, %xmm8 + movaps -5 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + ADD2 %xmm7, %xmm9 + mulpd %xmm0, %xmm5 + movaps -10 * SIZE(AO), %xmm0 + + ADD1 %xmm3, %xmm12 + movaps -3 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + ADD2 %xmm5, %xmm13 + mulpd %xmm0, %xmm7 + subq $ -8 * SIZE, AO + + ADD1 %xmm2, %xmm8 + movaps -1 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + ADD2 %xmm7, %xmm9 + mulpd %xmm0, %xmm5 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, BO + subq $1, %rax + BRANCH + jg .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + ADD1 %xmm3, %xmm12 + movaps -15 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + ADD2 %xmm5, %xmm13 + mulpd %xmm0, %xmm7 + + ADD1 %xmm2, %xmm8 + movaps -13 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + ADD2 %xmm7, %xmm9 + mulpd %xmm0, %xmm5 + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_4 + +.L28: + ADD1 %xmm3, %xmm12 + pcmpeqb %xmm0, %xmm0 + ADD2 %xmm5, %xmm13 + psllq $63, %xmm0 + + movddup ALPHA_R, %xmm2 + movddup ALPHA_I, %xmm3 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + shufps $0x40, %xmm0, %xmm0 + + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm12 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + shufps $0x04, %xmm0, %xmm0 + + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm13 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + shufps $0x40, %xmm0, %xmm0 + + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm13 +#endif + + haddpd %xmm9, %xmm8 + haddpd %xmm13, %xmm12 + + pshufd $0x4e, %xmm8, %xmm9 + pshufd $0x4e, %xmm12, %xmm13 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm9 + mulpd %xmm2, %xmm12 + mulpd %xmm3, %xmm13 + + addsubpd %xmm9, %xmm8 + addsubpd %xmm13, %xmm12 + +#ifndef TRMMKERNEL + movsd 0 * 
SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm2 + movhpd 1 * SIZE(CO2), %xmm2 + + addpd %xmm0, %xmm8 + addpd %xmm2, %xmm12 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm12, 0 * SIZE(CO2) + movhpd %xmm12, 1 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 4 + addq $2 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + leaq (C, LDC, 2), C + movq BO, B + + subq $1, J + BRANCH + jg .L01 + ALIGN_4 + +.L40: + testq $1, N + BRANCH + jle .L999 + + movq C, CO1 + leaq (C, LDC, 1), CO2 + movq A, AO + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq M, I + sarq $1, I # i = (m >> 2) + NOBRANCH + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + movaps -14 * SIZE(AO), %xmm1 + movaps -17 * SIZE(BO), %xmm2 + + PREFETCHW 3 * SIZE(CO1) + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + xorps %xmm12, %xmm12 + xorps %xmm13, %xmm13 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L55 + ALIGN_4 + +.L52: + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + ADD1 %xmm2, %xmm8 + movaps -15 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm12 + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm13 + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -6 * SIZE(AO), %xmm1 + + ADD1 %xmm2, %xmm8 + movaps -13 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm12 + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm13 + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -2 * SIZE(AO), %xmm1 + + ADD1 %xmm2, %xmm8 + movaps -11 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm12 + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm13 + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps 0 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 2 * SIZE(AO), %xmm1 + + ADD1 %xmm2, %xmm8 + movaps -9 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm12 + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm13 + + subq $-16 * SIZE, AO + subq $ -8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L52 + ALIGN_4 + +.L55: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if 
(k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + ADD1 %xmm2, %xmm8 + movaps -15 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm12 + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm13 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L56 + ALIGN_4 + +.L58: + pcmpeqb %xmm0, %xmm0 + movddup ALPHA_R, %xmm2 + psllq $63, %xmm0 + movddup ALPHA_I, %xmm3 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + shufps $0x40, %xmm0, %xmm0 + + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm12 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + shufps $0x04, %xmm0, %xmm0 + + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm13 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + shufps $0x40, %xmm0, %xmm0 + + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm13 +#endif + + haddpd %xmm9, %xmm8 + haddpd %xmm13, %xmm12 + + pshufd $0x4e, %xmm8, %xmm9 + pshufd $0x4e, %xmm12, %xmm13 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm9 + mulpd %xmm2, %xmm12 + mulpd %xmm3, %xmm13 + + addsubpd %xmm9, %xmm8 + addsubpd %xmm13, %xmm12 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movsd 2 * SIZE(CO1), %xmm1 + movhpd 3 * SIZE(CO1), %xmm1 + + addpd %xmm0, %xmm8 + addpd %xmm1, %xmm12 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm12, 2 * SIZE(CO1) + movhpd %xmm12, 3 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + decq I + BRANCH + jg .L51 + ALIGN_4 + +.L60: + testq $1, M + BRANCH + jle .L79 + ALIGN_4 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + movaps -17 * SIZE(BO), %xmm2 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L65 + ALIGN_4 + +.L62: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -14 * SIZE(AO), %xmm0 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm7, %xmm9 + movaps -15 * SIZE(BO), %xmm2 + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -12 * SIZE(AO), %xmm0 + + ADD1 %xmm2, %xmm10 + ADD2 %xmm7, %xmm11 + movaps -13 * SIZE(BO), %xmm2 + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -10 * SIZE(AO), %xmm0 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm7, %xmm9 + movaps -11 * SIZE(BO), %xmm2 + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -8 * SIZE(AO), %xmm0 + + ADD1 %xmm2, 
%xmm10 + ADD2 %xmm7, %xmm11 + movaps -9 * SIZE(BO), %xmm2 + + subq $-8 * SIZE, AO + subq $-8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L62 + ALIGN_4 + +.L65: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -14 * SIZE(AO), %xmm0 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm7, %xmm9 + movaps -15 * SIZE(BO), %xmm2 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L66 + ALIGN_4 + +.L68: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + + pcmpeqb %xmm0, %xmm0 + movddup ALPHA_R, %xmm2 + psllq $63, %xmm0 + movddup ALPHA_I, %xmm3 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + shufps $0x40, %xmm0, %xmm0 + + xorps %xmm0, %xmm8 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + shufps $0x04, %xmm0, %xmm0 + + xorps %xmm0, %xmm9 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + shufps $0x40, %xmm0, %xmm0 + + xorps %xmm0, %xmm9 +#endif + + haddpd %xmm9, %xmm8 + + pshufd $0x4e, %xmm8, %xmm9 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm9 + + addsubpd %xmm9, %xmm8 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + + addpd %xmm0, %xmm8 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 + ALIGN_4 + +.L79: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $1, KK +#endif + + addq LDC, C + movq BO, B + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm_kernel_2x2_sse2.S b/kernel/x86_64/zgemm_kernel_2x2_sse2.S new file mode 100644 index 0000000000..4b83eeebdf --- /dev/null +++ b/kernel/x86_64/zgemm_kernel_2x2_sse2.S @@ -0,0 +1,1829 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define CO2 %rbp +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define POSINV 0(%rsp) +#define ALPHA_R 16(%rsp) +#define ALPHA_I 32(%rsp) +#define J 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER 256(%rsp) + +#ifdef OPTERON +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (8 * 9 + 4) + +#define RPREFETCHSIZE (8 * 7 + 4) +#define WPREFETCHSIZE (8 * 8 + 4) +#endif + +#ifdef GENERIC +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (8 * 5 + 4) + +#define RPREFETCHSIZE (8 * 7 + 4) +#define WPREFETCHSIZE (8 * 8 + 4) +#endif + +#ifndef GENERIC +#define KERNEL1(xx) \ + mulpd %xmm0, %xmm1 ;\ + addpd %xmm1, %xmm8 ;\ + movapd -16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ + mulpd %xmm0, %xmm3 ;\ + addpd %xmm3, %xmm9 ;\ + movapd -14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulpd %xmm0, %xmm5 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\ + mulpd -10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\ + addpd %xmm5, %xmm10 ;\ + movapd -12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addpd %xmm0, %xmm11 ;\ + movapd -8 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0 + +#define KERNEL2(xx) \ + mulpd %xmm2, %xmm1 ;\ + addpd %xmm1, %xmm12 ;\ + movapd 0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ + mulpd %xmm2, %xmm3 ;\ + addpd %xmm3, %xmm13 ;\ + movapd -6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulpd %xmm2, %xmm5 ;\ + mulpd -10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\ + addpd %xmm5, %xmm14 ;\ + movapd -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addpd %xmm2, %xmm15 ;\ + movapd -6 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2 + +#define KERNEL3(xx) \ + mulpd %xmm4, %xmm7 ;\ + addpd %xmm7, %xmm8 ;\ + 
movapd -8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ + mulpd %xmm4, %xmm3 ;\ + addpd %xmm3, %xmm9 ;\ + movapd -6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulpd %xmm4, %xmm5 ;\ + mulpd -2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\ + addpd %xmm5, %xmm10 ;\ + movapd -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addpd %xmm4, %xmm11 ;\ + movapd -4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4 + +#define KERNEL4(xx) \ + mulpd %xmm6, %xmm7 ;\ + addpd %xmm7, %xmm12 ;\ + movapd 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ + mulpd %xmm6, %xmm3 ;\ + addpd %xmm3, %xmm13 ;\ + movapd 2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulpd %xmm6, %xmm5 ;\ + mulpd -2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\ + addpd %xmm5, %xmm14 ;\ + movapd 4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\ + addpd %xmm6, %xmm15 ;\ + movapd -2 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6 + +#define KERNEL5(xx) \ + mulpd %xmm0, %xmm1 ;\ + addpd %xmm1, %xmm8 ;\ + movapd 0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ + mulpd %xmm0, %xmm3 ;\ + addpd %xmm3, %xmm9 ;\ + movapd 2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulpd %xmm0, %xmm5 ;\ + mulpd 6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\ + addpd %xmm5, %xmm10 ;\ + movapd 4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addpd %xmm0, %xmm11 ;\ + movapd 0 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0 + +#define KERNEL6(xx) \ + mulpd %xmm2, %xmm1 ;\ + addpd %xmm1, %xmm12 ;\ + movapd 16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ + mulpd %xmm2, %xmm3 ;\ + addpd %xmm3, %xmm13 ;\ + movapd 10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulpd %xmm2, %xmm5 ;\ + mulpd 6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\ + addpd %xmm5, %xmm14 ;\ + movapd 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addpd %xmm2, %xmm15 ;\ + movapd 2 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2 + +#define KERNEL7(xx) \ + mulpd %xmm4, %xmm7 ;\ + addpd %xmm7, %xmm8 ;\ + movapd 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ + mulpd %xmm4, %xmm3 ;\ + addpd %xmm3, %xmm9 ;\ + movapd 10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulpd %xmm4, %xmm5 ;\ + mulpd 14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\ + addpd %xmm5, %xmm10 ;\ + movapd 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addpd %xmm4, %xmm11 ;\ + movapd 4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4 + +#define KERNEL8(xx) \ + mulpd %xmm6, %xmm7 ;\ + addpd %xmm7, %xmm12 ;\ + movapd 24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ + mulpd %xmm6, %xmm3 ;\ + addpd %xmm3, %xmm13 ;\ + movapd 18 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulpd %xmm6, %xmm5 ;\ + mulpd 14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\ + addpd %xmm5, %xmm14 ;\ + movapd 20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addpd %xmm6, %xmm15 ;\ + movapd 6 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6 + +#else +#define KERNEL1(xx) \ + mulpd %xmm0, %xmm1 ;\ + addpd %xmm1, %xmm8 ;\ + movapd -16 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ + mulpd %xmm0, %xmm3 ;\ + addpd %xmm3, %xmm9 ;\ + movapd -14 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulpd %xmm0, %xmm5 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\ + mulpd -10 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\ + addpd %xmm5, %xmm10 ;\ + movapd -12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addpd %xmm0, %xmm11 ;\ + movapd -8 * SIZE + 1 * (xx) * SIZE(AO), %xmm0 + +#define KERNEL2(xx) \ + mulpd %xmm2, %xmm1 ;\ + addpd %xmm1, %xmm12 ;\ + movapd 0 * SIZE + 2 * 
(xx) * SIZE(BO), %xmm1 ;\ + mulpd %xmm2, %xmm3 ;\ + addpd %xmm3, %xmm13 ;\ + movapd -6 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulpd %xmm2, %xmm5 ;\ + mulpd -10 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\ + addpd %xmm5, %xmm14 ;\ + movapd -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addpd %xmm2, %xmm15 ;\ + movapd -6 * SIZE + 1 * (xx) * SIZE(AO), %xmm2 + +#define KERNEL3(xx) \ + mulpd %xmm4, %xmm7 ;\ + addpd %xmm7, %xmm8 ;\ + movapd -8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ + mulpd %xmm4, %xmm3 ;\ + addpd %xmm3, %xmm9 ;\ + movapd -6 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulpd %xmm4, %xmm5 ;\ + mulpd -2 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\ + addpd %xmm5, %xmm10 ;\ + movapd -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addpd %xmm4, %xmm11 ;\ + movapd -4 * SIZE + 1 * (xx) * SIZE(AO), %xmm4 + +#define KERNEL4(xx) \ + mulpd %xmm6, %xmm7 ;\ + addpd %xmm7, %xmm12 ;\ + movapd 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ + mulpd %xmm6, %xmm3 ;\ + addpd %xmm3, %xmm13 ;\ + movapd 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulpd %xmm6, %xmm5 ;\ + mulpd -2 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\ + addpd %xmm5, %xmm14 ;\ + movapd 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ;\ + addpd %xmm6, %xmm15 ;\ + movapd -2 * SIZE + 1 * (xx) * SIZE(AO), %xmm6 + +#define KERNEL5(xx) \ + mulpd %xmm0, %xmm1 ;\ + addpd %xmm1, %xmm8 ;\ + movapd 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ + mulpd %xmm0, %xmm3 ;\ + addpd %xmm3, %xmm9 ;\ + movapd 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulpd %xmm0, %xmm5 ;\ + mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\ + addpd %xmm5, %xmm10 ;\ + movapd 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addpd %xmm0, %xmm11 ;\ + movapd 0 * SIZE + 1 * (xx) * SIZE(AO), %xmm0 + +#define KERNEL6(xx) \ + mulpd %xmm2, %xmm1 ;\ + addpd %xmm1, %xmm12 ;\ + movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ + mulpd %xmm2, %xmm3 ;\ + addpd %xmm3, %xmm13 ;\ + movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulpd %xmm2, %xmm5 ;\ + mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\ + addpd %xmm5, %xmm14 ;\ + movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addpd %xmm2, %xmm15 ;\ + movapd 2 * SIZE + 1 * (xx) * SIZE(AO), %xmm2 + +#define KERNEL7(xx) \ + mulpd %xmm4, %xmm7 ;\ + addpd %xmm7, %xmm8 ;\ + movapd 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ + mulpd %xmm4, %xmm3 ;\ + addpd %xmm3, %xmm9 ;\ + movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulpd %xmm4, %xmm5 ;\ + mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\ + addpd %xmm5, %xmm10 ;\ + movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addpd %xmm4, %xmm11 ;\ + movapd 4 * SIZE + 1 * (xx) * SIZE(AO), %xmm4 + +#define KERNEL8(xx) \ + mulpd %xmm6, %xmm7 ;\ + addpd %xmm7, %xmm12 ;\ + movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ + mulpd %xmm6, %xmm3 ;\ + addpd %xmm3, %xmm13 ;\ + movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulpd %xmm6, %xmm5 ;\ + mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\ + addpd %xmm5, %xmm14 ;\ + movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addpd %xmm6, %xmm15 ;\ + movapd 6 * SIZE + 1 * (xx) * SIZE(AO), %xmm6 + +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + 
movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + movaps %xmm3, %xmm0 + movsd OLD_ALPHA_I, %xmm1 +#else + movq 72(%rsp), LDC +#ifdef TRMMKERNEL + movsd 80(%rsp), %xmm12 +#endif + +#endif + + EMMS + + movq %rsp, %rbx # save old stack + subq $256 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movq OLD_M, M + movq OLD_N, N + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 # Generate mask + pxor %xmm10, %xmm10 + + movlpd %xmm0, 0 + ALPHA_R + movlpd %xmm0, 8 + ALPHA_R + + movlpd %xmm1, 8 + ALPHA_I + xorpd %xmm7, %xmm1 + movlpd %xmm1, 0 + ALPHA_I + + movlpd %xmm10, 0 + POSINV + movlpd %xmm7, 8 + POSINV + +#ifdef TRMMKERNEL + movlpd %xmm12, OFFSET + movlpd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + subq $-16 * SIZE, A + + salq $ZBASE_SHIFT, LDC + + movq N, J + sarq $1, J # j = (n >> 2) + jle .L100 + ALIGN_4 + +.L01: + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + leaq 16 * SIZE + BUFFER, BO + + movq K, %rax + sarq $2, %rax + jle .L03 + ALIGN_4 + +.L02: + PREFETCH (RPREFETCHSIZE + 0) * SIZE(B) + + movq 0 * SIZE(B), %mm0 + movq %mm0, -16 * SIZE(BO) + movq %mm0, -15 * SIZE(BO) + movq 1 * SIZE(B), %mm1 + movq %mm1, -14 * SIZE(BO) + movq %mm1, -13 * SIZE(BO) + + movq 2 * SIZE(B), %mm2 + movq %mm2, -12 * SIZE(BO) + movq %mm2, -11 * SIZE(BO) + movq 3 * SIZE(B), %mm3 + movq %mm3, -10 * SIZE(BO) + movq %mm3, -9 * SIZE(BO) + + PREFETCHW (WPREFETCHSIZE + 0) * SIZE(BO) + + movq 4 * SIZE(B), %mm4 + movq %mm4, -8 * SIZE(BO) + movq %mm4, -7 * SIZE(BO) + movq 5 * SIZE(B), %mm5 + movq %mm5, -6 * SIZE(BO) + movq %mm5, -5 * SIZE(BO) + + PREFETCHW (WPREFETCHSIZE + 8) * SIZE(BO) + + movq 6 * SIZE(B), %mm6 + movq %mm6, -4 * SIZE(BO) + movq %mm6, -3 * SIZE(BO) + movq 7 * SIZE(B), %mm7 + movq %mm7, -2 * SIZE(BO) + movq %mm7, -1 * SIZE(BO) + + PREFETCH (RPREFETCHSIZE + 8) * SIZE(B) + + movq 8 * SIZE(B), %mm0 + movq %mm0, 0 * SIZE(BO) + movq %mm0, 1 * SIZE(BO) + movq 9 * SIZE(B), %mm1 + movq %mm1, 2 * SIZE(BO) + movq %mm1, 3 * SIZE(BO) + + movq 10 * SIZE(B), %mm2 + movq %mm2, 4 * SIZE(BO) + movq %mm2, 5 * SIZE(BO) + movq 11 * SIZE(B), %mm3 + movq %mm3, 6 * SIZE(BO) + movq %mm3, 7 * SIZE(BO) + + PREFETCHW (WPREFETCHSIZE + 16) * SIZE(BO) + + movq 12 * SIZE(B), %mm4 + movq %mm4, 8 * SIZE(BO) + movq %mm4, 9 * SIZE(BO) + movq 13 * SIZE(B), %mm5 + movq %mm5, 10 * SIZE(BO) + movq %mm5, 11 * SIZE(BO) + + PREFETCHW (WPREFETCHSIZE + 24) * SIZE(BO) + + movq 14 * SIZE(B), %mm6 + movq %mm6, 12 * SIZE(BO) + movq %mm6, 13 * SIZE(BO) + movq 15 * SIZE(B), %mm7 + movq %mm7, 14 * SIZE(BO) + movq %mm7, 15 * SIZE(BO) + + addq $ 32 * SIZE, BO + subq $-16 * SIZE, B + decq %rax + jne .L02 + ALIGN_4 + +.L03: + movq K, %rax + andq $3, %rax + BRANCH + jle .L05 + ALIGN_4 + +.L04: + movq 0 * SIZE(B), %mm0 + movq %mm0, -16 * SIZE(BO) + movq %mm0, -15 * SIZE(BO) + movq 1 * SIZE(B), %mm1 + movq %mm1, -14 * SIZE(BO) + movq %mm1, -13 * SIZE(BO) + + movq 2 * SIZE(B), %mm2 + movq %mm2, -12 * SIZE(BO) + movq %mm2, -11 * SIZE(BO) + movq 3 * SIZE(B), %mm3 + movq %mm3, -10 * SIZE(BO) + movq %mm3, -9 * SIZE(BO) + + addq $ 4 * SIZE, B + addq $ 8 * SIZE, BO + + decq %rax + jne .L04 + ALIGN_4 + +.L05: + movq A, AO # aoffset = a + + leaq (RPREFETCHSIZE + 0) * SIZE(B), BB + + movq M, I + sarq $1, I # i = (m >> 2) + jle .L30 + 
ALIGN_4 + +.L10: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 16 * SIZE + BUFFER, BO +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm1 + pxor %xmm8, %xmm8 + PREFETCH 0 * SIZE(BB) + movapd -14 * SIZE(AO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + pxor %xmm9, %xmm9 + movapd -12 * SIZE(AO), %xmm4 + movapd -12 * SIZE(BO), %xmm5 + pxor %xmm10, %xmm10 + movapd -10 * SIZE(AO), %xmm6 + movapd -8 * SIZE(BO), %xmm7 + pxor %xmm11, %xmm11 + + pxor %xmm12, %xmm12 + PREFETCHW 3 * SIZE(CO1) + pxor %xmm13, %xmm13 + PREFETCHW 3 * SIZE(CO2) + pxor %xmm14, %xmm14 + pxor %xmm15, %xmm15 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif +#ifndef GENERIC + andq $-8, %rax + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO + negq %rax + NOBRANCH + je .L15 + ALIGN_3 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + + addq $8 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + + addq $8 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + + addq $8 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + + addq $8 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + + addq $8 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + + addq $8 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 
1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + + addq $8 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + + addq $8 * SIZE, %rax + BRANCH + jl .L12 + ALIGN_3 + +.L15: + PREFETCH 8 * SIZE(BB) + subq $-16 * SIZE, BB + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + testq $4, %rax + je .L16 + xorq %rax, %rax + ALIGN_3 + + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + addq $32 * SIZE, BO + addq $16 * SIZE, AO + ALIGN_3 +#else + sarq $2, %rax + NOBRANCH + jle .L16 + ALIGN_3 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + addq $ 32 * SIZE, BO + subq $-16 * SIZE, AO + decq %rax + BRANCH + jg .L12 +#endif + +.L16: + movapd POSINV, %xmm5 + movapd ALPHA_R, %xmm6 + movapd ALPHA_I, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L19 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO + negq %rax + ALIGN_3 + +.L17: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movapd -14 * SIZE(BO, %rax, 8), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movapd -12 * SIZE(BO, %rax, 8), %xmm1 + mulpd %xmm0, %xmm1 + mulpd -10 * SIZE(BO, %rax, 8), %xmm0 + addpd %xmm1, %xmm10 + movapd -16 * SIZE(BO, %rax, 8), %xmm1 + addpd %xmm0, %xmm11 + movapd -12 * SIZE(AO, %rax, 4), %xmm0 + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm12 + movapd -14 * SIZE(BO, %rax, 8), %xmm1 + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm13 + movapd -12 * SIZE(BO, %rax, 8), %xmm1 + mulpd %xmm2, %xmm1 + mulpd -10 * SIZE(BO, %rax, 8), %xmm2 + addpd %xmm1, %xmm14 + movapd -8 * SIZE(BO, %rax, 8), %xmm1 + addpd %xmm2, %xmm15 + movapd -10 * SIZE(AO, %rax, 4), %xmm2 + + addq $SIZE, %rax + jl .L17 + ALIGN_3 + +.L19: +#ifndef TRMMKERNEL + movlpd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movlpd 2 * SIZE(CO1), %xmm2 + movhpd 3 * SIZE(CO1), %xmm2 + + movlpd 0 * SIZE(CO2), %xmm1 + movhpd 1 * SIZE(CO2), %xmm1 + movlpd 2 * SIZE(CO2), %xmm3 + movhpd 3 * SIZE(CO2), %xmm3 +#endif + + SHUFPD_1 %xmm9, %xmm9 + SHUFPD_1 %xmm11, %xmm11 + SHUFPD_1 %xmm13, %xmm13 + SHUFPD_1 %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm5, %xmm9 + xorpd %xmm5, %xmm11 + xorpd %xmm5, %xmm13 + xorpd %xmm5, %xmm15 +#else + xorpd %xmm5, %xmm8 + xorpd %xmm5, %xmm10 + xorpd %xmm5, %xmm12 + xorpd %xmm5, %xmm14 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm9, %xmm8 + subpd %xmm11, %xmm10 + subpd %xmm13, %xmm12 + subpd %xmm15, %xmm14 +#else + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 + addpd %xmm13, %xmm12 + addpd %xmm15, %xmm14 +#endif + + pshufd $0x4e, %xmm8, %xmm9 + pshufd $0x4e, %xmm10, %xmm11 + pshufd $0x4e, %xmm12, %xmm13 + pshufd $0x4e, %xmm14, %xmm15 + + mulpd %xmm6, %xmm8 + mulpd %xmm7, %xmm9 + mulpd %xmm6, %xmm10 + mulpd %xmm7, %xmm11 + + mulpd %xmm6, %xmm12 + mulpd %xmm7, %xmm13 + mulpd %xmm6, %xmm14 + mulpd %xmm7, %xmm15 + + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 + addpd %xmm13, %xmm12 + addpd %xmm15, 
%xmm14 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 + addpd %xmm2, %xmm12 + addpd %xmm1, %xmm10 + addpd %xmm3, %xmm14 +#endif + + movlpd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movlpd %xmm12, 2 * SIZE(CO1) + movhpd %xmm12, 3 * SIZE(CO1) + + movlpd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) + movlpd %xmm14, 2 * SIZE(CO2) + movhpd %xmm14, 3 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + decq I # i -- + jg .L10 + ALIGN_4 + +.L30: + testq $1, M + jle .L99 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 16 * SIZE + BUFFER, BO +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd -8 * SIZE(AO), %xmm2 + pxor %xmm9, %xmm9 + movapd -16 * SIZE(BO), %xmm1 + pxor %xmm10, %xmm10 + movapd -8 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L44 + ALIGN_4 + +.L41: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movapd -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addpd %xmm1, %xmm9 + movapd -12 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + mulpd -10 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm10 + movapd 0 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm11 + movapd -14 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm8 + movapd -6 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm9 + movapd -4 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + mulpd -2 * SIZE(BO), %xmm0 + addpd %xmm3, %xmm10 + movapd 8 * SIZE(BO), %xmm3 + addpd %xmm0, %xmm11 + movapd -12 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movapd 2 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + addpd %xmm1, %xmm9 + movapd 4 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + mulpd 6 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm10 + movapd 16 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm11 + movapd -10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm8 + movapd 10 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm9 + movapd 12 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + mulpd 14 * SIZE(BO), %xmm0 + addpd %xmm3, %xmm10 + movapd 24 * SIZE(BO), %xmm3 + addpd %xmm0, %xmm11 + movapd 0 * SIZE(AO), %xmm0 + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm8 + movapd 18 * SIZE(BO), %xmm1 + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm9 + movapd 20 * SIZE(BO), %xmm1 + mulpd %xmm2, %xmm1 + mulpd 22 * SIZE(BO), %xmm2 + addpd %xmm1, %xmm10 + movapd 32 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm11 + movapd -6 * SIZE(AO), %xmm2 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm8 + movapd 26 * SIZE(BO), %xmm3 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm9 + movapd 28 * SIZE(BO), %xmm3 + mulpd %xmm2, %xmm3 + mulpd 30 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm10 + movapd 40 * SIZE(BO), %xmm3 + addpd %xmm2, %xmm11 + movapd -4 * 
SIZE(AO), %xmm2 + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm8 + movapd 34 * SIZE(BO), %xmm1 + mulpd %xmm2, %xmm1 + addpd %xmm1, %xmm9 + movapd 36 * SIZE(BO), %xmm1 + mulpd %xmm2, %xmm1 + mulpd 38 * SIZE(BO), %xmm2 + addpd %xmm1, %xmm10 + movapd 48 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm11 + movapd -2 * SIZE(AO), %xmm2 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm8 + movapd 42 * SIZE(BO), %xmm3 + mulpd %xmm2, %xmm3 + addpd %xmm3, %xmm9 + movapd 44 * SIZE(BO), %xmm3 + mulpd %xmm2, %xmm3 + mulpd 46 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm10 + movapd 56 * SIZE(BO), %xmm3 + addpd %xmm2, %xmm11 + movapd 8 * SIZE(AO), %xmm2 + + subq $-16 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L41 + ALIGN_4 + +.L44: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $4, %rax + BRANCH + jle .L45 + + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movapd -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movapd -12 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + mulpd -10 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm10 + movapd 0 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm11 + movapd -14 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm8 + movapd -6 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm9 + movapd -4 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + mulpd -2 * SIZE(BO), %xmm0 + addpd %xmm3, %xmm10 + movapd 8 * SIZE(BO), %xmm3 + addpd %xmm0, %xmm11 + movapd -12 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movapd 2 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movapd 4 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + mulpd 6 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm10 + movapd 16 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm11 + movapd -10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm8 + movapd 10 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + addpd %xmm3, %xmm9 + movapd 12 * SIZE(BO), %xmm3 + mulpd %xmm0, %xmm3 + mulpd 14 * SIZE(BO), %xmm0 + addpd %xmm3, %xmm10 + movapd 24 * SIZE(BO), %xmm3 + addpd %xmm0, %xmm11 + movapd -8 * SIZE(AO), %xmm0 + + addq $ 8 * SIZE, AO + addq $32 * SIZE, BO + ALIGN_4 + +.L45: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movapd POSINV, %xmm5 + movapd ALPHA_R, %xmm6 + movapd ALPHA_I, %xmm7 + andq $3, %rax # if (k & 1) + BRANCH + jle .L47 + ALIGN_4 + +.L46: + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm8 + movapd -14 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + addpd %xmm1, %xmm9 + movapd -12 * SIZE(BO), %xmm1 + mulpd %xmm0, %xmm1 + mulpd -10 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm10 + movapd -8 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm11 + movapd -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + + decq %rax + jg .L46 + ALIGN_4 + +.L47: +#ifndef TRMMKERNEL + movlpd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movlpd 0 * SIZE(CO2), %xmm1 + movhpd 1 * SIZE(CO2), %xmm1 +#endif + + SHUFPD_1 %xmm9, %xmm9 + SHUFPD_1 %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm5, %xmm9 + xorpd %xmm5, %xmm11 +#else + xorpd %xmm5, %xmm8 + xorpd %xmm5, %xmm10 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm9, %xmm8 + subpd %xmm11, %xmm10 +#else + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 +#endif + + pshufd $0x4e, %xmm8, %xmm9 + pshufd $0x4e, %xmm10, %xmm11 + + mulpd %xmm6, %xmm8 + mulpd %xmm7, %xmm9 + mulpd %xmm6, %xmm10 + mulpd %xmm7, %xmm11 + + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 + +#ifndef TRMMKERNEL + addpd %xmm0, 
%xmm8 + addpd %xmm1, %xmm10 +#endif + + movlpd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movlpd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leaq (C, LDC, 2), C # c += 2 * ldc + decq J # j -- + jg .L01 + +.L100: + testq $1, N + jle .L999 + +.L101: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + + movq K, %rax + sarq $2, %rax + jle .L103 + ALIGN_4 + +.L102: + movlpd 0 * SIZE(B), %xmm8 + movlpd 1 * SIZE(B), %xmm9 + movlpd 2 * SIZE(B), %xmm10 + movlpd 3 * SIZE(B), %xmm11 + movlpd 4 * SIZE(B), %xmm12 + movlpd 5 * SIZE(B), %xmm13 + movlpd 6 * SIZE(B), %xmm14 + movlpd 7 * SIZE(B), %xmm15 + + movlpd %xmm8, 0 * SIZE(BO) + movlpd %xmm8, 1 * SIZE(BO) + movlpd %xmm9, 2 * SIZE(BO) + movlpd %xmm9, 3 * SIZE(BO) + movlpd %xmm10, 4 * SIZE(BO) + movlpd %xmm10, 5 * SIZE(BO) + movlpd %xmm11, 6 * SIZE(BO) + movlpd %xmm11, 7 * SIZE(BO) + movlpd %xmm12, 8 * SIZE(BO) + movlpd %xmm12, 9 * SIZE(BO) + movlpd %xmm13, 10 * SIZE(BO) + movlpd %xmm13, 11 * SIZE(BO) + movlpd %xmm14, 12 * SIZE(BO) + movlpd %xmm14, 13 * SIZE(BO) + movlpd %xmm15, 14 * SIZE(BO) + movlpd %xmm15, 15 * SIZE(BO) + + subq $-16 * SIZE, BO + addq $ 8 * SIZE, B + decq %rax + jne .L102 + ALIGN_4 + +.L103: + movq K, %rax + andq $3, %rax + BRANCH + jle .L105 + ALIGN_4 + +.L104: + movlpd 0 * SIZE(B), %xmm8 + movlpd 1 * SIZE(B), %xmm9 + + movlpd %xmm8, 0 * SIZE(BO) + movlpd %xmm8, 1 * SIZE(BO) + movlpd %xmm9, 2 * SIZE(BO) + movlpd %xmm9, 3 * SIZE(BO) + + addq $4 * SIZE, BO + addq $2 * SIZE, B + decq %rax + jne .L104 + ALIGN_4 + +.L105: + movq C, CO1 # coffset1 = c + movq A, AO # aoffset = a + + movq M, I + sarq $1, I # i = (m >> 2) + jle .L130 + ALIGN_4 + +.L110: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 16 * SIZE + BUFFER, BO +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movapd -16 * SIZE(BO), %xmm1 + pxor %xmm9, %xmm9 + movapd -8 * SIZE(AO), %xmm2 + pxor %xmm12, %xmm12 + movapd -8 * SIZE(BO), %xmm3 + pxor %xmm13, %xmm13 + PREFETCHW 3 * SIZE(CO1) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + je .L112 + +.L111: + mulpd %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd -14 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm8 + movapd -16 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm9 + movapd -14 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm12 + movapd -12 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm13 + movapd -12 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd -10 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm8 + movapd -12 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm9 + movapd -10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd -10 * 
SIZE(BO), %xmm0 + addpd %xmm1, %xmm12 + movapd 0 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm13 + movapd 0 * SIZE(AO), %xmm0 + mulpd %xmm2, %xmm3 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd -6 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm8 + movapd -8 * SIZE(BO), %xmm3 + addpd %xmm2, %xmm9 + movapd -6 * SIZE(AO), %xmm2 + mulpd %xmm2, %xmm3 + mulpd -6 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm12 + movapd -4 * SIZE(BO), %xmm3 + addpd %xmm2, %xmm13 + movapd -4 * SIZE(AO), %xmm2 + mulpd %xmm2, %xmm3 + mulpd -2 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm8 + movapd -4 * SIZE(BO), %xmm3 + addpd %xmm2, %xmm9 + movapd -2 * SIZE(AO), %xmm2 + mulpd %xmm2, %xmm3 + mulpd -2 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm12 + movapd 8 * SIZE(BO), %xmm3 + addpd %xmm2, %xmm13 + movapd 8 * SIZE(AO), %xmm2 + + subq $-16 * SIZE, AO + subq $-16 * SIZE, BO + decq %rax + jne .L111 + ALIGN_4 + +.L112: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movapd POSINV, %xmm5 + movapd ALPHA_R, %xmm6 + movapd ALPHA_I, %xmm7 + andq $3, %rax # if (k & 1) + BRANCH + jle .L114 + +.L113: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm8 + movapd -16 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm9 + movapd -14 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm12 + movapd -12 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm13 + movapd -12 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L113 + ALIGN_4 + +.L114: +#ifndef TRMMKERNEL + movlpd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 + movlpd 2 * SIZE(CO1), %xmm2 + movhpd 3 * SIZE(CO1), %xmm2 +#endif + + SHUFPD_1 %xmm9, %xmm9 + SHUFPD_1 %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm5, %xmm9 + xorpd %xmm5, %xmm13 +#else + xorpd %xmm5, %xmm8 + xorpd %xmm5, %xmm12 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm9, %xmm8 + subpd %xmm13, %xmm12 +#else + addpd %xmm9, %xmm8 + addpd %xmm13, %xmm12 +#endif + + pshufd $0x4e, %xmm8, %xmm9 + pshufd $0x4e, %xmm12, %xmm13 + + mulpd %xmm6, %xmm8 + mulpd %xmm7, %xmm9 + mulpd %xmm6, %xmm12 + mulpd %xmm7, %xmm13 + + addpd %xmm9, %xmm8 + addpd %xmm13, %xmm12 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 + addpd %xmm2, %xmm12 +#endif + + movlpd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movlpd %xmm12, 2 * SIZE(CO1) + movhpd %xmm12, 3 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L110 + ALIGN_4 + +.L130: + testq $1, M + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 16 * SIZE + BUFFER, BO +#else + leaq 16 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm1 + movapd -8 * SIZE(AO), %xmm2 + movapd -8 * SIZE(BO), %xmm3 + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif 
(defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L144 + ALIGN_4 + +.L141: + mulpd %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd -14 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm8 + movapd -12 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm9 + movapd -14 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd -10 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm10 + movapd 0 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm11 + movapd -12 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm3 + mulpd -6 * SIZE(BO), %xmm0 + addpd %xmm3, %xmm8 + movapd -4 * SIZE(BO), %xmm3 + addpd %xmm0, %xmm9 + movapd -10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm3 + mulpd -2 * SIZE(BO), %xmm0 + addpd %xmm3, %xmm10 + movapd 8 * SIZE(BO), %xmm3 + addpd %xmm0, %xmm11 + movapd 0 * SIZE(AO), %xmm0 + mulpd %xmm2, %xmm1 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd 2 * SIZE(BO), %xmm2 + addpd %xmm1, %xmm8 + movapd 4 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm9 + movapd -6 * SIZE(AO), %xmm2 + mulpd %xmm2, %xmm1 + mulpd 6 * SIZE(BO), %xmm2 + addpd %xmm1, %xmm10 + movapd 16 * SIZE(BO), %xmm1 + addpd %xmm2, %xmm11 + movapd -4 * SIZE(AO), %xmm2 + mulpd %xmm2, %xmm3 + mulpd 10 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm8 + movapd 12 * SIZE(BO), %xmm3 + addpd %xmm2, %xmm9 + movapd -2 * SIZE(AO), %xmm2 + mulpd %xmm2, %xmm3 + mulpd 14 * SIZE(BO), %xmm2 + addpd %xmm3, %xmm10 + movapd 24 * SIZE(BO), %xmm3 + addpd %xmm2, %xmm11 + movapd 8 * SIZE(AO), %xmm2 + + subq $-16 * SIZE, AO + subq $-32 * SIZE, BO + decq %rax + jne .L141 + ALIGN_4 + + +.L144: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $4, %rax # if (k & 1) + BRANCH + jle .L145 + + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm8 + movapd -12 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm9 + movapd -14 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm1 + mulpd -10 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm10 + movapd 0 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm11 + movapd -12 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm3 + mulpd -6 * SIZE(BO), %xmm0 + addpd %xmm3, %xmm8 + movapd -4 * SIZE(BO), %xmm3 + addpd %xmm0, %xmm9 + movapd -10 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm3 + mulpd -2 * SIZE(BO), %xmm0 + addpd %xmm3, %xmm10 + addpd %xmm0, %xmm11 + movapd -8 * SIZE(AO), %xmm0 + + addq $8 * SIZE, AO + subq $-16 * SIZE, BO + ALIGN_4 + +.L145: + movapd POSINV, %xmm5 + movapd ALPHA_R, %xmm6 + movapd ALPHA_I, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + jle .L148 + ALIGN_4 + +.L146: + mulpd %xmm0, %xmm1 + mulpd -14 * SIZE(BO), %xmm0 + addpd %xmm1, %xmm8 + movapd -12 * SIZE(BO), %xmm1 + addpd %xmm0, %xmm9 + movapd -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L146 + ALIGN_4 + +.L148: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#ifndef TRMMKERNEL + movlpd 0 * SIZE(CO1), %xmm0 + movhpd 1 * SIZE(CO1), %xmm0 +#endif + + SHUFPD_1 %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm5, %xmm9 +#else + xorpd %xmm5, %xmm8 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm9, %xmm8 +#else + addpd %xmm9, %xmm8 +#endif + + pshufd $0x4e, %xmm8, %xmm9 + + mulpd %xmm6, %xmm8 + mulpd %xmm7, %xmm9 + + addpd %xmm9, 
%xmm8 + +#ifndef TRMMKERNEL + addpd %xmm0, %xmm8 +#endif + + movlpd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + ALIGN_4 + +.L999: + movq %rbx, %rsp + EMMS + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm_kernel_2x2_sse3.S b/kernel/x86_64/zgemm_kernel_2x2_sse3.S new file mode 100644 index 0000000000..afb092439e --- /dev/null +++ b/kernel/x86_64/zgemm_kernel_2x2_sse3.S @@ -0,0 +1,1539 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %rdi +#define N %rsi +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define J %r12 +#define AO %r13 +#define BO %r14 +#define CO1 %r15 +#define CO2 %rbx +#define BB %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define OFFSET 64(%rsp) +#define KKK 72(%rsp) +#define KK 80(%rsp) + +#else + +#define STACKSIZE 512 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#define ALPHA_R 224(%rsp) +#define ALPHA_I 232(%rsp) +#define OFFSET 240(%rsp) +#define KKK 248(%rsp) +#define KK 256(%rsp) +#endif + +#define PREFETCH prefetcht1 +#define PREFETCHSIZE (16 * 12 + 3) +#define PREFETCH_R (4 * 4 + 0) + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define ADD1 addpd +#define ADD2 addpd +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define ADD1 addpd +#define ADD2 subpd +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define ADD1 subpd +#define ADD2 addpd +#else +#define ADD1 subpd +#define ADD2 subpd +#endif + +#define ADDSUB subpd + +#define KERNEL1(address) \ + mulpd %xmm8, %xmm9;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 2 * SIZE(AO);\ + ADD1 %xmm9, %xmm0;\ + movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD2 %xmm9, %xmm1;\ + movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm2;\ + movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 2 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + ADD2 %xmm9, %xmm3;\ + movddup 0 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL2(address) \ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm4;\ + movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD2 %xmm9, %xmm5;\ + movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm6;\ + movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 4 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + ADD2 %xmm9, %xmm7;\ + movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL3(address) \ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm0;\ + movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD2 %xmm9, %xmm1;\ + movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm2;\ + movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 6 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + ADD2 %xmm9, %xmm3;\ + movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL4(address) \ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm4;\ + movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD2 %xmm9, %xmm5;\ + movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm6;\ + movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 32 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + ADD2 %xmm9, %xmm7;\ + movddup 32 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL5(address) \ + mulpd %xmm10, %xmm11;\ + ADD1 
%xmm11, %xmm0;\ + movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD2 %xmm11, %xmm1;\ + movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm2;\ + movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 10 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + ADD2 %xmm11, %xmm3;\ + movddup 8 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL6(address) \ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm4;\ + movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD2 %xmm11, %xmm5;\ + movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm6;\ + movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 12 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + ADD2 %xmm11, %xmm7;\ + movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL7(address) \ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm0;\ + movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD2 %xmm11, %xmm1;\ + movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm2;\ + movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 14 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + ADD2 %xmm11, %xmm3;\ + movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL8(address) \ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm4;\ + movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD2 %xmm11, %xmm5;\ + movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm6;\ + movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 40 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + ADD2 %xmm11, %xmm7;\ + movddup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL9(address) \ + mulpd %xmm12, %xmm13;\ + PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 2 * SIZE(AO);\ + ADD1 %xmm13, %xmm0;\ + movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD2 %xmm13, %xmm1;\ + movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm2;\ + movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 18 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + ADD2 %xmm13, %xmm3;\ + movddup 16 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL10(address) \ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm4;\ + movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD2 %xmm13, %xmm5;\ + movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm6;\ + movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 20 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + ADD2 %xmm13, %xmm7;\ + movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL11(address) \ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm0;\ + movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD2 %xmm13, %xmm1;\ + movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm2;\ + movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 22 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + ADD2 %xmm13, %xmm3;\ + movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define 
KERNEL12(address) \ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm4;\ + movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD2 %xmm13, %xmm5;\ + movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm6;\ + movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 48 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + ADD2 %xmm13, %xmm7;\ + movddup 48 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL13(address) \ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm0;\ + movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD2 %xmm15, %xmm1;\ + movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm2;\ + movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 26 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + ADD2 %xmm15, %xmm3;\ + movddup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL14(address) \ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm4;\ + movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD2 %xmm15, %xmm5;\ + movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm6;\ + movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 28 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + ADD2 %xmm15, %xmm7;\ + movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL15(address) \ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm0;\ + movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD2 %xmm15, %xmm1;\ + movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm2;\ + movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 30 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + ADD2 %xmm15, %xmm3;\ + movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL16(address) \ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm4;\ + movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD2 %xmm15, %xmm5;\ + movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm6;\ + movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 56 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + ADD2 %xmm15, %xmm7;\ + movddup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, M + movq ARG2, N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm4 +#endif + movaps %xmm3, %xmm0 + movsd OLD_ALPHA_I, %xmm1 +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm4 +#endif + +#endif + + movsd %xmm0, ALPHA_R + movsd %xmm1, ALPHA_I + +#ifdef TRMMKERNEL + movsd %xmm4, OFFSET + movsd %xmm4, KK +#ifndef LEFT + negq KK +#endif +#endif + + salq $ZBASE_SHIFT, LDC + movq N, J + sarq $1, J # j = (n >> 2) + jle 
.L100 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + movq K, %rax + salq $ZBASE_SHIFT + 1, %rax + leaq (B, %rax), BB + + movq M, I + sarq $1, I # i = (m >> 2) + jle .L30 + ALIGN_4 + +.L10: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#endif + + prefetcht0 0 * SIZE(BB) + subq $-8 * SIZE, BB + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + + movapd 16 * SIZE(AO), %xmm12 + pxor %xmm4, %xmm4 + movddup 16 * SIZE(BO), %xmm13 + pxor %xmm5, %xmm5 + movapd 24 * SIZE(AO), %xmm14 + pxor %xmm6, %xmm6 + movddup 24 * SIZE(BO), %xmm15 + pxor %xmm7, %xmm7 + + prefetchnta 3 * SIZE(CO1) + prefetchnta 3 * SIZE(CO2) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + salq $4, %rax + je .L12 + +.L1X: + KERNEL1 (16 * 0) + KERNEL2 (16 * 0) + KERNEL3 (16 * 0) + KERNEL4 (16 * 0) + KERNEL5 (16 * 0) + KERNEL6 (16 * 0) + KERNEL7 (16 * 0) + KERNEL8 (16 * 0) + KERNEL9 (16 * 0) + KERNEL10(16 * 0) + KERNEL11(16 * 0) + KERNEL12(16 * 0) + KERNEL13(16 * 0) + KERNEL14(16 * 0) + KERNEL15(16 * 0) + KERNEL16(16 * 0) + cmpq $128 * 1, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 1) + KERNEL2 (16 * 1) + KERNEL3 (16 * 1) + KERNEL4 (16 * 1) + KERNEL5 (16 * 1) + KERNEL6 (16 * 1) + KERNEL7 (16 * 1) + KERNEL8 (16 * 1) + KERNEL9 (16 * 1) + KERNEL10(16 * 1) + KERNEL11(16 * 1) + KERNEL12(16 * 1) + KERNEL13(16 * 1) + KERNEL14(16 * 1) + KERNEL15(16 * 1) + KERNEL16(16 * 1) + cmpq $128 * 2, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 2) + KERNEL2 (16 * 2) + KERNEL3 (16 * 2) + KERNEL4 (16 * 2) + KERNEL5 (16 * 2) + KERNEL6 (16 * 2) + KERNEL7 (16 * 2) + KERNEL8 (16 * 2) + KERNEL9 (16 * 2) + KERNEL10(16 * 2) + KERNEL11(16 * 2) + KERNEL12(16 * 2) + KERNEL13(16 * 2) + KERNEL14(16 * 2) + KERNEL15(16 * 2) + KERNEL16(16 * 2) + cmpq $128 * 3, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 3) + KERNEL2 (16 * 3) + KERNEL3 (16 * 3) + KERNEL4 (16 * 3) + KERNEL5 (16 * 3) + KERNEL6 (16 * 3) + KERNEL7 (16 * 3) + KERNEL8 (16 * 3) + KERNEL9 (16 * 3) + KERNEL10(16 * 3) + KERNEL11(16 * 3) + KERNEL12(16 * 3) + KERNEL13(16 * 3) + KERNEL14(16 * 3) + KERNEL15(16 * 3) + KERNEL16(16 * 3) + cmpq $128 * 4, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 4) + KERNEL2 (16 * 4) + KERNEL3 (16 * 4) + KERNEL4 (16 * 4) + KERNEL5 (16 * 4) + KERNEL6 (16 * 4) + KERNEL7 (16 * 4) + KERNEL8 (16 * 4) + KERNEL9 (16 * 4) + KERNEL10(16 * 4) + KERNEL11(16 * 4) + KERNEL12(16 * 4) + KERNEL13(16 * 4) + KERNEL14(16 * 4) + KERNEL15(16 * 4) + KERNEL16(16 * 4) + cmpq $128 * 5, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 5) + KERNEL2 (16 * 5) + KERNEL3 (16 * 5) + KERNEL4 (16 * 5) + KERNEL5 (16 * 5) + KERNEL6 (16 * 5) + KERNEL7 (16 * 5) + KERNEL8 (16 * 5) + KERNEL9 (16 * 5) + KERNEL10(16 * 5) + KERNEL11(16 * 5) + KERNEL12(16 * 5) + KERNEL13(16 * 5) + KERNEL14(16 * 5) + KERNEL15(16 * 5) + KERNEL16(16 * 5) + cmpq $128 * 6, %rax + 
NOBRANCH + jle .L11 + KERNEL1 (16 * 6) + KERNEL2 (16 * 6) + KERNEL3 (16 * 6) + KERNEL4 (16 * 6) + KERNEL5 (16 * 6) + KERNEL6 (16 * 6) + KERNEL7 (16 * 6) + KERNEL8 (16 * 6) + KERNEL9 (16 * 6) + KERNEL10(16 * 6) + KERNEL11(16 * 6) + KERNEL12(16 * 6) + KERNEL13(16 * 6) + KERNEL14(16 * 6) + KERNEL15(16 * 6) + KERNEL16(16 * 6) + cmpq $128 * 7, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 7) + KERNEL2 (16 * 7) + KERNEL3 (16 * 7) + KERNEL4 (16 * 7) + KERNEL5 (16 * 7) + KERNEL6 (16 * 7) + KERNEL7 (16 * 7) + KERNEL8 (16 * 7) + KERNEL9 (16 * 7) + KERNEL10(16 * 7) + KERNEL11(16 * 7) + KERNEL12(16 * 7) + KERNEL13(16 * 7) + KERNEL14(16 * 7) + KERNEL15(16 * 7) + KERNEL16(16 * 7) + + addq $32 * 8 * SIZE, AO + addq $32 * 8 * SIZE, BO + subq $128 * 8, %rax + jg .L1X + +.L11: + leaq (AO, %rax, 2), AO # * 16 + leaq (BO, %rax, 2), BO # * 64 + ALIGN_4 + +.L12: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movddup ALPHA_R, %xmm14 + movddup ALPHA_I, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L14 + ALIGN_4 + +.L13: + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm10 + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 0 * SIZE(BO), %xmm11 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm4 + movddup 1 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD2 %xmm11, %xmm5 + movddup 2 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm6 + movddup 3 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD2 %xmm11, %xmm7 + + addq $4 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L13 + ALIGN_4 + +.L14: + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm3, %xmm3 + SHUFPD_1 %xmm5, %xmm5 + SHUFPD_1 %xmm7, %xmm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + addsubpd %xmm1, %xmm0 + addsubpd %xmm3, %xmm2 + addsubpd %xmm5, %xmm4 + addsubpd %xmm7, %xmm6 + + movapd %xmm0, %xmm1 + movapd %xmm2, %xmm3 + movapd %xmm4, %xmm5 + movapd %xmm6, %xmm7 +#else + addsubpd %xmm0, %xmm1 + addsubpd %xmm2, %xmm3 + addsubpd %xmm4, %xmm5 + addsubpd %xmm6, %xmm7 + + movapd %xmm1, %xmm0 + movapd %xmm3, %xmm2 + movapd %xmm5, %xmm4 + movapd %xmm7, %xmm6 +#endif + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm8 + movhpd 1 * SIZE(CO1), %xmm8 + movsd 2 * SIZE(CO1), %xmm10 + movhpd 3 * SIZE(CO1), %xmm10 + + movsd 0 * SIZE(CO2), %xmm9 + movhpd 1 * SIZE(CO2), %xmm9 + movsd 2 * SIZE(CO2), %xmm11 + movhpd 3 * SIZE(CO2), %xmm11 +#endif + + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm3, %xmm3 + SHUFPD_1 %xmm5, %xmm5 + SHUFPD_1 %xmm7, %xmm7 + + mulpd %xmm14, %xmm0 + mulpd %xmm14, %xmm2 + mulpd %xmm14, %xmm4 + mulpd %xmm14, %xmm6 + + mulpd %xmm15, %xmm1 + mulpd %xmm15, %xmm3 + mulpd %xmm15, %xmm5 + mulpd %xmm15, %xmm7 + + addsubpd %xmm1, %xmm0 + addsubpd %xmm3, %xmm2 + addsubpd %xmm5, %xmm4 + addsubpd %xmm7, %xmm6 + +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm2 + addpd %xmm10, %xmm4 + addpd %xmm11, %xmm6 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm4, 2 * SIZE(CO1) + movhpd %xmm4, 3 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + movsd %xmm6, 2 * SIZE(CO2) + movhpd %xmm6, 3 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + decq I # i -- + jg .L10 + ALIGN_4 + +.L30: + testq $1, M + jle .L99 + +.L40: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L42 + +.L41: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD2 %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm3 + movddup 16 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD2 %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm2 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 6 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD2 %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm2 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 16 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm3 + movddup 24 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 17 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD2 %xmm9, %xmm1 + movddup 18 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 19 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 10 * SIZE(AO), %xmm10 + ADD2 %xmm9, %xmm3 + movddup 20 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 21 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD2 %xmm9, %xmm1 + movddup 22 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 23 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 12 * SIZE(AO), 
%xmm10 + ADD2 %xmm9, %xmm3 + movddup 32 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 25 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD2 %xmm11, %xmm1 + movddup 26 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm2 + movddup 27 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 14 * SIZE(AO), %xmm10 + ADD2 %xmm11, %xmm3 + movddup 28 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 29 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD2 %xmm11, %xmm1 + movddup 30 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm2 + movddup 31 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 24 * SIZE(AO), %xmm10 + ADD2 %xmm11, %xmm3 + movddup 40 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L41 + +.L42: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movddup ALPHA_R, %xmm14 + movddup ALPHA_I, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + jle .L44 + +.L43: + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L43 + ALIGN_4 + +.L44: + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm3, %xmm3 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + addsubpd %xmm1, %xmm0 + addsubpd %xmm3, %xmm2 + + movapd %xmm0, %xmm1 + movapd %xmm2, %xmm3 +#else + addsubpd %xmm0, %xmm1 + addsubpd %xmm2, %xmm3 + + movapd %xmm1, %xmm0 + movapd %xmm3, %xmm2 +#endif + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm8 + movhpd 1 * SIZE(CO1), %xmm8 + + movsd 0 * SIZE(CO2), %xmm9 + movhpd 1 * SIZE(CO2), %xmm9 +#endif + + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm3, %xmm3 + + mulpd %xmm14, %xmm0 + mulpd %xmm14, %xmm2 + + mulpd %xmm15, %xmm1 + mulpd %xmm15, %xmm3 + + addsubpd %xmm1, %xmm0 + addsubpd %xmm3, %xmm2 + +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm2 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm2, 0 * SIZE(CO2) + movhpd %xmm2, 1 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L99: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + leaq (C, LDC, 2), C # c += 2 * ldc + movq BO, B + decq J # j -- + jg .L01 + +.L100: + testq $1, N + jle .L999 + +.L101: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 # coffset1 = c + movq A, AO # aoffset = a + + movq M, I + sarq $1, I # i = (m >> 2) + jle .L130 + ALIGN_4 + +.L110: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm4, %xmm4 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm5, %xmm5 + + prefetchnta 4 * SIZE(CO1) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L112 + +.L111: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm1 + movddup 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm4 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm5 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 6 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm4 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 16 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm5 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 10 * SIZE(AO), %xmm10 + ADD2 %xmm9, %xmm1 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm4 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 12 * SIZE(AO), %xmm10 + ADD2 %xmm9, %xmm5 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 14 * SIZE(AO), %xmm10 + ADD2 %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm4 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 40 * SIZE(AO), %xmm10 + ADD2 %xmm9, %xmm5 + movddup 16 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm11 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + ADD1 %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 18 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm1 + movddup 8 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm4 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 20 * SIZE(AO), %xmm8 + 
ADD2 %xmm11, %xmm5 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 22 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm4 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 24 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm5 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 26 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm1 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm4 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 28 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm5 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 30 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm4 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 32 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm5 + movddup 24 * SIZE(BO), %xmm11 + + addq $32 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L111 + ALIGN_4 + +.L112: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movddup ALPHA_R, %xmm14 + movddup ALPHA_I, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + jle .L114 + +.L113: + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm10 + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 0 * SIZE(BO), %xmm11 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + movapd 4 * SIZE(AO), %xmm8 + ADD1 %xmm11, %xmm4 + movddup 1 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD2 %xmm11, %xmm5 + + addq $4 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L113 + ALIGN_4 + +.L114: + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm5, %xmm5 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + addsubpd %xmm1, %xmm0 + addsubpd %xmm5, %xmm4 + + movapd %xmm0, %xmm1 + movapd %xmm4, %xmm5 +#else + addsubpd %xmm0, %xmm1 + addsubpd %xmm4, %xmm5 + + movapd %xmm1, %xmm0 + movapd %xmm5, %xmm4 +#endif + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm8 + movhpd 1 * SIZE(CO1), %xmm8 + movsd 2 * SIZE(CO1), %xmm10 + movhpd 3 * SIZE(CO1), %xmm10 +#endif + + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm5, %xmm5 + + mulpd %xmm14, %xmm0 + mulpd %xmm14, %xmm4 + + mulpd %xmm15, %xmm1 + mulpd %xmm15, %xmm5 + + addsubpd %xmm1, %xmm0 + addsubpd %xmm5, %xmm4 + +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + addpd %xmm8, %xmm0 + addpd %xmm10, %xmm4 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + movsd %xmm4, 2 * SIZE(CO1) + movhpd %xmm4, 3 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L110 + ALIGN_4 + +.L130: + testq $1, M + jle .L999 + ALIGN_4 + +.L140: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L142 + +.L141: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 6 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 16 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm3 + movddup 16 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 10 * SIZE(AO), %xmm10 + ADD2 %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm2 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 12 * SIZE(AO), %xmm10 + ADD2 %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 14 * SIZE(AO), %xmm10 + ADD2 %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm2 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 24 * SIZE(AO), %xmm10 + ADD2 %xmm11, %xmm3 + movddup 24 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L141 + +.L142: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movddup ALPHA_R, %xmm14 + movddup ALPHA_I, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + jle .L144 + +.L143: + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L143 + ALIGN_4 + +.L144: + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + + SHUFPD_1 %xmm1, %xmm1 + +#if defined(NN) || 
defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + addsubpd %xmm1, %xmm0 + movapd %xmm0, %xmm1 +#else + addsubpd %xmm0, %xmm1 + movapd %xmm1, %xmm0 +#endif + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm8 + movhpd 1 * SIZE(CO1), %xmm8 +#endif + + SHUFPD_1 %xmm1, %xmm1 + mulpd %xmm14, %xmm0 + mulpd %xmm15, %xmm1 + addsubpd %xmm1, %xmm0 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + addpd %xmm8, %xmm0 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhpd %xmm0, 1 * SIZE(CO1) + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm_kernel_2x4_nehalem.S b/kernel/x86_64/zgemm_kernel_2x4_nehalem.S new file mode 100644 index 0000000000..6a16b7e133 --- /dev/null +++ b/kernel/x86_64/zgemm_kernel_2x4_nehalem.S @@ -0,0 +1,1628 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %rbp + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rdx +#define BB %r12 + +#define PREA %r10 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define J 64(%rsp) +#define OFFSET 72(%rsp) +#define KK 80(%rsp) +#define KKK 88(%rsp) + +#else + +#define STACKSIZE 512 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#define ALPHA_R 224(%rsp) +#define ALPHA_I 232(%rsp) +#define J 240(%rsp) +#define OFFSET 248(%rsp) +#define KK 256(%rsp) +#define KKK 264(%rsp) + +#endif + +#define PREFETCHSIZE 8 +#define PREFETCH prefetcht0 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define ADD1 addps +#define ADD2 addps +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define ADD1 addps +#define ADD2 addps +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define ADD1 addps +#define ADD2 addps +#else +#define ADD1 addps +#define ADD2 subps +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movq OLD_OFFSET, %r11 +#endif + movaps %xmm3, %xmm0 + movss OLD_ALPHA_I, %xmm1 +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movq OLD_OFFSET, %r11 +#endif + +#endif + + unpcklps %xmm0, %xmm0 + unpcklps %xmm1, %xmm1 + + movlps %xmm0, ALPHA_R + movlps %xmm1, ALPHA_I + + subq $-32 * SIZE, A + subq $-32 * SIZE, B + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + salq $ZBASE_SHIFT, LDC + +#ifdef TRMMKERNEL + movq %r11, OFFSET +#ifndef LEFT + negq %r11 +#endif + movq %r11, KK +#endif + + movq N, J + sarq $2, J + NOBRANCH + jle .L30 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + leaq (C, LDC, 2), CO2 + movq A, AO + + movq K, %rax + salq $ZBASE_SHIFT + 2, %rax + leaq (B, %rax), BB + + movq M, I + sarq $1, I + NOBRANCH + jle .L20 + ALIGN_4 + +.L11: + prefetcht2 -32 * SIZE(BB) + subq $-16 * SIZE, BB + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht0 1 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht0 3 * 
SIZE(CO1, LDC, 1) + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + + movaps -32 * SIZE(AO), %xmm0 + + xorps %xmm12, %xmm12 + prefetcht0 1 * SIZE(CO2) + xorps %xmm13, %xmm13 + prefetcht0 3 * SIZE(CO2, LDC, 1) + xorps %xmm14, %xmm14 + xorps %xmm15, %xmm15 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + ADD1 %xmm1, %xmm12 + movaps -32 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm15 + pshufd $0xb1, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + ADD1 %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD1 %xmm5, %xmm10 + ADD2 %xmm6, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + movaps -28 * SIZE(AO), %xmm7 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + ADD1 %xmm1, %xmm12 + movaps -24 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm7, %xmm1 + pshufd $0x1b, %xmm2, %xmm5 + mulps %xmm7, %xmm2 + + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm15 + pshufd $0xb1, %xmm5, %xmm6 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm6 + + ADD1 %xmm1, %xmm8 + movaps -20 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm7, %xmm1 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm7, %xmm2 + + ADD1 %xmm5, %xmm10 + ADD2 %xmm6, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + mulps %xmm7, %xmm3 + mulps %xmm7, %xmm4 + + ADD1 %xmm1, %xmm12 + movaps -16 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm15 + pshufd $0xb1, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + ADD1 %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD1 %xmm5, %xmm10 + ADD2 %xmm6, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + movaps -20 * SIZE(AO), %xmm7 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + ADD1 %xmm1, %xmm12 + movaps -8 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm7, %xmm1 + pshufd $0x1b, %xmm2, %xmm5 + mulps %xmm7, %xmm2 + + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm15 + pshufd $0xb1, %xmm5, %xmm6 + mulps %xmm7, %xmm5 + mulps %xmm7, %xmm6 + + ADD1 %xmm1, %xmm8 + movaps -4 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + subq $-32 * SIZE, BO + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm7, %xmm1 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm7, %xmm2 + + ADD1 %xmm5, %xmm10 + ADD2 %xmm6, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm7, %xmm3 + movaps -16 * SIZE(AO), %xmm0 + mulps %xmm7, %xmm4 + + subq $-16 * SIZE, AO + subq $1, %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + ADD1 %xmm1, %xmm12 + movaps -32 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm15 + pshufd $0xb1, %xmm5, %xmm6 + mulps 
%xmm0, %xmm5 + mulps %xmm0, %xmm6 + + ADD1 %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD1 %xmm5, %xmm10 + ADD2 %xmm6, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_3 + +.L18: + ADD1 %xmm1, %xmm12 + ADD2 %xmm2, %xmm13 + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm15 + + movddup ALPHA_R, %xmm2 + movddup ALPHA_I, %xmm3 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + pxor %xmm0, %xmm8 + pxor %xmm0, %xmm10 + pxor %xmm0, %xmm12 + pxor %xmm0, %xmm14 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + pshufd $0xb1, %xmm0, %xmm0 + + pxor %xmm0, %xmm9 + pxor %xmm0, %xmm11 + pxor %xmm0, %xmm13 + pxor %xmm0, %xmm15 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + pxor %xmm0, %xmm9 + pxor %xmm0, %xmm11 + pxor %xmm0, %xmm13 + pxor %xmm0, %xmm15 +#endif + + haddps %xmm9, %xmm8 + haddps %xmm11, %xmm10 + haddps %xmm13, %xmm12 + haddps %xmm15, %xmm14 + + shufps $0xd8, %xmm8, %xmm8 + shufps $0xd8, %xmm10, %xmm10 + shufps $0xd8, %xmm12, %xmm12 + shufps $0xd8, %xmm14, %xmm14 + + movaps %xmm8, %xmm9 + shufps $0xe4, %xmm10, %xmm8 + shufps $0xe4, %xmm9, %xmm10 + + movaps %xmm12, %xmm13 + shufps $0xe4, %xmm14, %xmm12 + shufps $0xe4, %xmm13, %xmm14 + + pshufd $0xb1, %xmm8, %xmm9 + pshufd $0xb1, %xmm10, %xmm11 + pshufd $0xb1, %xmm12, %xmm13 + pshufd $0xb1, %xmm14, %xmm15 + + mulps %xmm2, %xmm8 + mulps %xmm3, %xmm9 + mulps %xmm2, %xmm12 + mulps %xmm3, %xmm13 + + mulps %xmm2, %xmm10 + mulps %xmm3, %xmm11 + mulps %xmm2, %xmm14 + mulps %xmm3, %xmm15 + + addsubps %xmm9, %xmm8 + addsubps %xmm11, %xmm10 + addsubps %xmm13, %xmm12 + addsubps %xmm15, %xmm14 + +#ifndef TRMMKERNEL + movups 0 * SIZE(CO1), %xmm0 + movups 0 * SIZE(CO1, LDC), %xmm1 + movups 0 * SIZE(CO2), %xmm2 + movups 0 * SIZE(CO2, LDC), %xmm3 + + addps %xmm0, %xmm8 + addps %xmm1, %xmm10 + addps %xmm2, %xmm12 + addps %xmm3, %xmm14 +#endif + + movups %xmm8, 0 * SIZE(CO1) + movups %xmm10, 0 * SIZE(CO1, LDC) + movups %xmm12, 0 * SIZE(CO2) + movups %xmm14, 0 * SIZE(CO2, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + decq I # i -- + BRANCH + jg .L11 + ALIGN_4 + +.L20: + testq $1, M + BRANCH + jle .L29 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movaps -32 * SIZE(BO), %xmm5 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq 
$1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_3 + +.L22: + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + ADD1 %xmm3, %xmm10 + pshufd $0xa0, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0xf5, %xmm5, %xmm4 + movaps -24 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm4 + movddup -30 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -20 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + ADD1 %xmm3, %xmm10 + pshufd $0xa0, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0xf5, %xmm5, %xmm4 + movaps -16 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm4 + movddup -28 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -12 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + ADD1 %xmm3, %xmm10 + pshufd $0xa0, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0xf5, %xmm5, %xmm4 + movaps -8 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm4 + movddup -26 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -4 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + ADD1 %xmm3, %xmm10 + pshufd $0xa0, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0xf5, %xmm5, %xmm4 + movaps 0 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm4 + movddup -24 * SIZE(AO), %xmm0 + + subq $-32 * SIZE, BO + subq $ -8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L22 + ALIGN_3 + +.L25: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_3 + +.L26: + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + ADD1 %xmm3, %xmm10 + pshufd $0xa0, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0xf5, %xmm5, %xmm4 + movaps -24 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm4 + movddup -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_3 + +.L28: + ADD1 %xmm1, %xmm8 + ADD2 %xmm2, %xmm9 + ADD1 %xmm3, %xmm10 + ADD2 %xmm4, %xmm11 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + + movddup ALPHA_R, %xmm2 + movddup ALPHA_I, %xmm3 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + pxor %xmm0, %xmm9 + pxor %xmm0, %xmm11 + + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm11, %xmm11 + +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm11, %xmm11 + + pxor %xmm0, %xmm9 + pxor %xmm0, %xmm11 +#else + pxor %xmm0, %xmm8 + pxor %xmm0, %xmm10 + + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm11, %xmm11 +#endif + + addps %xmm9, %xmm8 + addps %xmm11, %xmm10 + + pshufd $0xb1, %xmm8, %xmm9 + pshufd $0xb1, %xmm10, %xmm11 + + mulps %xmm2, %xmm8 + mulps %xmm3, %xmm9 + mulps %xmm2, %xmm10 + mulps %xmm3, %xmm11 + + addsubps %xmm9, %xmm8 + addsubps %xmm11, %xmm10 + +#ifndef TRMMKERNEL + movsd (CO1), %xmm0 + movhps (CO1, LDC), %xmm0 + movsd (CO2), %xmm1 + movhps (CO2, LDC), %xmm1 + + addps %xmm0, %xmm8 + addps %xmm1, %xmm10 +#endif + + movsd %xmm8, (CO1) + movhps %xmm8, (CO1, LDC) + movsd %xmm10, (CO2) + movhps %xmm10, (CO2, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && 
defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L29: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK +#endif + + leaq (C, LDC, 4), C + movq BO, B + + subq $1, J + BRANCH + jg .L01 + ALIGN_4 + +.L30: + testq $2, N + BRANCH + jle .L50 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + leaq (C, LDC), CO2 + movq A, AO + + movq M, I + sarq $1, I + NOBRANCH + jle .L40 + ALIGN_4 + +.L31: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht2 4 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht2 4 * SIZE(CO2) + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L35 + ALIGN_3 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + ADD1 %xmm1, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm10 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm10 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movaps -24 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm10 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -20 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movaps -20 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm10 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, BO + subq $-16 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L32 + ALIGN_3 + +.L35: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_3 + +.L36: + ADD1 %xmm1, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm10 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L36 + ALIGN_3 + +.L38: + ADD1 %xmm1, %xmm8 + 
ADD2 %xmm2, %xmm9 + ADD1 %xmm3, %xmm10 + ADD2 %xmm4, %xmm11 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + movddup ALPHA_R, %xmm2 + movddup ALPHA_I, %xmm3 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + pxor %xmm0, %xmm8 + pxor %xmm0, %xmm10 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + pshufd $0xb1, %xmm0, %xmm0 + + pxor %xmm0, %xmm9 + pxor %xmm0, %xmm11 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + pxor %xmm0, %xmm9 + pxor %xmm0, %xmm11 +#endif + + haddps %xmm9, %xmm8 + haddps %xmm11, %xmm10 + + shufps $0xd8, %xmm8, %xmm8 + shufps $0xd8, %xmm10, %xmm10 + + movaps %xmm8, %xmm9 + shufps $0xe4, %xmm10, %xmm8 + shufps $0xe4, %xmm9, %xmm10 + + pshufd $0xb1, %xmm8, %xmm9 + pshufd $0xb1, %xmm10, %xmm11 + + mulps %xmm2, %xmm8 + mulps %xmm3, %xmm9 + + mulps %xmm2, %xmm10 + mulps %xmm3, %xmm11 + + addsubps %xmm9, %xmm8 + addsubps %xmm11, %xmm10 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm1 + movhps 2 * SIZE(CO2), %xmm1 + + addps %xmm0, %xmm8 + addps %xmm1, %xmm10 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movsd %xmm10, 0 * SIZE(CO2) + movhps %xmm10, 2 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + decq I # i -- + BRANCH + jg .L31 + ALIGN_4 + +.L40: + testq $1, M + BRANCH + jle .L49 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movaps -32 * SIZE(BO), %xmm5 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L45 + ALIGN_3 + +.L42: + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -30 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -24 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -28 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -20 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -26 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -16 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -24 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, BO + subq $ -8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L42 + ALIGN_3 + +.L45: +#ifndef TRMMKERNEL + movq K, 
%rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L46 + ALIGN_3 + +.L48: + ADD1 %xmm1, %xmm8 + ADD2 %xmm2, %xmm9 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + + movddup ALPHA_R, %xmm2 + movddup ALPHA_I, %xmm3 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + pxor %xmm0, %xmm9 + + shufps $0xb1, %xmm9, %xmm9 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + shufps $0xb1, %xmm9, %xmm9 + + pxor %xmm0, %xmm9 +#else + pxor %xmm0, %xmm8 + + shufps $0xb1, %xmm9, %xmm9 +#endif + + addps %xmm9, %xmm8 + + pshufd $0xb1, %xmm8, %xmm9 + + mulps %xmm2, %xmm8 + mulps %xmm3, %xmm9 + + addsubps %xmm9, %xmm8 + +#ifndef TRMMKERNEL + movsd (CO1), %xmm0 + movhps (CO2), %xmm0 + + addps %xmm0, %xmm8 +#endif + + movsd %xmm8, (CO1) + movhps %xmm8, (CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L49: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + leaq (C, LDC, 2), C + movq BO, B + ALIGN_4 + +.L50: + testq $1, N + BRANCH + jle .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + movq A, AO + + movq M, I + sarq $1, I + NOBRANCH + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht2 4 * SIZE(CO1) + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L55 + ALIGN_3 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + ADD1 %xmm1, %xmm8 + movddup -32 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movddup -30 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + mulps %xmm0, %xmm2 + movaps -24 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movddup -28 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movddup -26 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + mulps %xmm0, %xmm2 + movaps -16 * SIZE(AO), %xmm0 + + subq $ -8 * SIZE, BO + subq $-16 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L52 + ALIGN_3 + +.L55: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax 
+#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_3 + +.L56: + ADD1 %xmm1, %xmm8 + movddup -32 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L56 + ALIGN_3 + +.L58: + ADD1 %xmm1, %xmm8 + ADD2 %xmm2, %xmm9 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + movddup ALPHA_R, %xmm2 + movddup ALPHA_I, %xmm3 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + pxor %xmm0, %xmm8 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + pshufd $0xb1, %xmm0, %xmm0 + + pxor %xmm0, %xmm9 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + pxor %xmm0, %xmm9 +#endif + + haddps %xmm9, %xmm8 + + shufps $0xd8, %xmm8, %xmm8 + pshufd $0xb1, %xmm8, %xmm9 + + mulps %xmm2, %xmm8 + mulps %xmm3, %xmm9 + + addsubps %xmm9, %xmm8 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + + addps %xmm0, %xmm8 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 + decq I # i -- + BRANCH + jg .L51 + ALIGN_4 + +.L60: + testq $1, M + BRANCH + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd -32 * SIZE(BO), %xmm5 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L65 + ALIGN_3 + +.L62: + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movsd -30 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -30 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movsd -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -28 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movsd -26 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -26 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movsd -24 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -24 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, BO + subq $-8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L62 + ALIGN_3 + +.L65: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_3 + +.L66: + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movsd -30 * 
SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L66 + ALIGN_3 + +.L68: + ADD1 %xmm1, %xmm8 + ADD2 %xmm2, %xmm9 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + + movddup ALPHA_R, %xmm2 + movddup ALPHA_I, %xmm3 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + pxor %xmm0, %xmm9 + + shufps $0xb1, %xmm9, %xmm9 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + shufps $0xb1, %xmm9, %xmm9 + + pxor %xmm0, %xmm9 +#else + pxor %xmm0, %xmm8 + + shufps $0xb1, %xmm9, %xmm9 +#endif + + addps %xmm9, %xmm8 + + pshufd $0xb1, %xmm8, %xmm9 + + mulps %xmm2, %xmm8 + mulps %xmm3, %xmm9 + + addsubps %xmm9, %xmm8 + +#ifndef TRMMKERNEL + movsd (CO1), %xmm0 + addps %xmm0, %xmm8 +#endif + + movsd %xmm8, (CO1) + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm_kernel_4x2_barcelona.S b/kernel/x86_64/zgemm_kernel_4x2_barcelona.S new file mode 100644 index 0000000000..c59a50d055 --- /dev/null +++ b/kernel/x86_64/zgemm_kernel_4x2_barcelona.S @@ -0,0 +1,2226 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define CO2 %rbp +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define POSINV 0(%rsp) +#define ALPHA_R 16(%rsp) +#define ALPHA_I 32(%rsp) +#define J 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER 128(%rsp) + +#ifdef OPTERON +#define movsd movlps +#endif + +#define PREFETCH prefetch +#define PREFETCHSIZE (16 * 17 + 0) + +#define RPREFETCHSIZE (16 * 9 + 0) +#define WPREFETCHSIZE (16 * 9 + 0) + +#define KERNEL1(xx) \ + mulps %xmm1, %xmm0 ;\ + mulps -28 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm0, %xmm8 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO, %rax, 4) ;\ + movaps %xmm2, %xmm0 ;\ + addps %xmm1, %xmm12 ;\ + movaps -24 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -28 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm0, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps -20 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm1, %xmm0 ;\ + mulps -28 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm0, %xmm10 ;\ + movaps -24 * SIZE(AO, %rax, 4), %xmm0 ;\ + addps %xmm1, %xmm14 ;\ + movaps -16 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -28 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps -12 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm0, %xmm2 + +#define KERNEL2(xx) \ + mulps %xmm1, %xmm0 ;\ + mulps -20 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm0, %xmm8 ;\ + movaps %xmm2, %xmm0 ;\ + addps %xmm1, %xmm12 ;\ + movaps -8 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -20 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm0, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps -4 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm1, %xmm0 ;\ + mulps -20 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm0, %xmm10 ;\ + addps %xmm1, %xmm14 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -20 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 4 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm4, %xmm2 + +#define KERNEL3(xx) \ + mulps %xmm5, %xmm4 ;\ + mulps -12 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm4, %xmm8 ;\ + movaps 32 * SIZE(BO, %rax, 8), %xmm1 ;\ + movaps %xmm2, %xmm4 ;\ + addps %xmm5, %xmm12 ;\ + movaps 8 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -12 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm4, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps 12 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm5, %xmm4 ;\ + mulps -12 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm4, %xmm10 ;\ + movaps -8 * SIZE(AO, %rax, 4), %xmm4 ;\ + addps %xmm5, %xmm14 ;\ + movaps 16 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -12 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 20 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm4, %xmm2 + +#define KERNEL4(xx) \ + mulps %xmm5, %xmm4 ;\ + mulps -4 * SIZE(AO, %rax, 4), %xmm5 ;\ + movaps (AO, %rax, 4), %xmm6 ;\ + addps %xmm4, %xmm8 ;\ + movaps 
%xmm2, %xmm4 ;\ + addps %xmm5, %xmm12 ;\ + movaps 24 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -4 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm4, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps 28 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm5, %xmm4 ;\ + mulps -4 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm4, %xmm10 ;\ + addps %xmm5, %xmm14 ;\ + movaps 64 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -4 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 36 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm6, %xmm2 + +#define KERNEL5(xx) \ + mulps %xmm1, %xmm6 ;\ + mulps 4 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm6, %xmm8 ;\ + movaps %xmm2, %xmm6 ;\ + addps %xmm1, %xmm12 ;\ + movaps 40 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps 4 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps 16 * SIZE(AO, %rax, 4), %xmm7 ;\ + movaps %xmm6, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps 44 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm1, %xmm6 ;\ + mulps 4 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm6, %xmm10 ;\ + movaps 8 * SIZE(AO, %rax, 4), %xmm6 ;\ + addps %xmm1, %xmm14 ;\ + movaps 48 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps 4 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 52 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm6, %xmm2 + +#define KERNEL6(xx) \ + mulps %xmm1, %xmm6 ;\ + mulps 12 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm6, %xmm8 ;\ + movaps %xmm2, %xmm6 ;\ + addps %xmm1, %xmm12 ;\ + movaps 56 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps 12 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm6, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps 60 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm1, %xmm6 ;\ + mulps 12 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm6, %xmm10 ;\ + movaps 32 * SIZE(AO, %rax, 4), %xmm0 ;\ + addps %xmm1, %xmm14 ;\ + mulps %xmm3, %xmm2 ;\ + mulps 12 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 68 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm7, %xmm2 + +#define KERNEL7(xx) \ + mulps %xmm5, %xmm7 ;\ + mulps 20 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm7, %xmm8 ;\ + movaps 96 * SIZE(BO, %rax, 8), %xmm1 ;\ + movaps %xmm2, %xmm7 ;\ + addps %xmm5, %xmm12 ;\ + movaps 72 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps 20 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm7, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps 76 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm5, %xmm7 ;\ + mulps 20 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm7, %xmm10 ;\ + movaps 24 * SIZE(AO, %rax, 4), %xmm7 ;\ + addps %xmm5, %xmm14 ;\ + movaps 80 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps 20 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 84 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm7, %xmm2 + +#define KERNEL8(xx) \ + mulps %xmm5, %xmm7 ;\ + mulps 28 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm7, %xmm8 ;\ + movaps %xmm2, %xmm7 ;\ + addps %xmm5, %xmm12 ;\ + movaps 88 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps 28 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm7, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps 92 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm5, %xmm7 ;\ + mulps 28 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm7, %xmm10 ;\ + movaps 48 * SIZE(AO, %rax, 4), %xmm4 ;\ + addps %xmm5, %xmm14 ;\ + movaps 128 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps 28 
* SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 100 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm0, %xmm2 ;\ + addq $16 * SIZE, %rax + +#define KERNEL_SUB1(xx) \ + mulps %xmm1, %xmm0 ;\ + mulps -28 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm0, %xmm8 ;\ + movaps %xmm2, %xmm0 ;\ + addps %xmm1, %xmm12 ;\ + movaps -24 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -28 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm0, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps -20 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm1, %xmm0 ;\ + mulps -28 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm0, %xmm10 ;\ + movaps -24 * SIZE(AO, %rax, 4), %xmm0 ;\ + addps %xmm1, %xmm14 ;\ + movaps -16 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -28 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps -12 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm0, %xmm2 + +#define KERNEL_SUB2(xx) \ + mulps %xmm1, %xmm0 ;\ + mulps -20 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm0, %xmm8 ;\ + movaps %xmm2, %xmm0 ;\ + addps %xmm1, %xmm12 ;\ + movaps -8 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -20 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm0, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps -4 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm1, %xmm0 ;\ + mulps -20 * SIZE(AO, %rax, 4), %xmm1 ;\ + addps %xmm0, %xmm10 ;\ + movaps (AO, %rax, 4), %xmm0 ;\ + addps %xmm1, %xmm14 ;\ + movaps 32 * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -20 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 4 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm4, %xmm2 + +#define KERNEL_SUB3(xx) \ + mulps %xmm5, %xmm4 ;\ + mulps -12 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm4, %xmm8 ;\ + movaps %xmm2, %xmm4 ;\ + addps %xmm5, %xmm12 ;\ + movaps 8 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -12 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm4, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps 12 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm5, %xmm4 ;\ + mulps -12 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm4, %xmm10 ;\ + movaps -8 * SIZE(AO, %rax, 4), %xmm4 ;\ + addps %xmm5, %xmm14 ;\ + movaps 16 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -12 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 20 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm4, %xmm2 + +#define KERNEL_SUB4(xx) \ + mulps %xmm5, %xmm4 ;\ + mulps -4 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm4, %xmm8 ;\ + movaps %xmm2, %xmm4 ;\ + addps %xmm5, %xmm12 ;\ + movaps 24 * SIZE(BO, %rax, 8), %xmm5 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -4 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm9 ;\ + movaps %xmm4, %xmm2 ;\ + addps %xmm3, %xmm13 ;\ + movaps 28 * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm5, %xmm4 ;\ + mulps -4 * SIZE(AO, %rax, 4), %xmm5 ;\ + addps %xmm4, %xmm10 ;\ + addps %xmm5, %xmm14 ;\ + mulps %xmm3, %xmm2 ;\ + mulps -4 * SIZE(AO, %rax, 4), %xmm3 ;\ + addps %xmm2, %xmm11 ;\ + addps %xmm3, %xmm15 ;\ + movaps 36 * SIZE(BO, %rax, 8), %xmm3 ;\ + movaps %xmm0, %xmm2 + +#if defined(OS_LINUX) && defined(CORE_BARCELONA) && !defined(TRMMKERNEL) + .align 32768 +#endif + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups 
%xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + movaps %xmm3, %xmm0 + movsd OLD_ALPHA_I, %xmm1 +#else + movq 72(%rsp), LDC +#ifdef TRMMKERNEL + movsd 80(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, %rbx # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movq OLD_M, M + movq OLD_N, N + + pxor %xmm7, %xmm7 + cmpeqps %xmm7, %xmm7 + pslld $31, %xmm7 # Generate mask + pxor %xmm10, %xmm10 + + shufps $0, %xmm0, %xmm0 + movaps %xmm0, 0 + ALPHA_R + + movss %xmm1, 4 + ALPHA_I + movss %xmm1, 12 + ALPHA_I + xorps %xmm7, %xmm1 + movss %xmm1, 0 + ALPHA_I + movss %xmm1, 8 + ALPHA_I + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + movss %xmm7, 0 + POSINV + movss %xmm10, 4 + POSINV + movss %xmm7, 8 + POSINV + movss %xmm10,12 + POSINV +#else + movss %xmm10, 0 + POSINV + movss %xmm7, 4 + POSINV + movss %xmm10, 8 + POSINV + movss %xmm7, 12 + POSINV +#endif + + addq $32 * SIZE, A + +#ifdef TRMMKERNEL + movsd %xmm12, OFFSET + movsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + salq $ZBASE_SHIFT, LDC + movq N, J + sarq $1, J # j = (n >> 2) + jle .L40 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + movaps POSINV, %xmm15 + + movq K, %rax + sarq $2, %rax + jle .L03 + + addq %rax, %rax + ALIGN_4 + +.L02: + prefetch (RPREFETCHSIZE + 0) * SIZE(B) + + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + + prefetchw (WPREFETCHSIZE + 0) * SIZE(BO) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + prefetchw (WPREFETCHSIZE + 16) * SIZE(BO) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 + xorps %xmm15, %xmm5 + xorps %xmm15, %xmm7 +#else + xorps %xmm15, %xmm0 + xorps %xmm15, %xmm2 + xorps %xmm15, %xmm4 + xorps %xmm15, %xmm6 +#endif + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO + + decq %rax + jne .L02 + ALIGN_4 + +.L03: + movq K, %rax + andq $3, %rax + BRANCH + jle .L10 + ALIGN_4 + +.L04: + movaps 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 + xorps %xmm15, %xmm2 +#endif + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + + addq $ 4 * SIZE, B + addq $16 * SIZE, BO + decq %rax + jne .L04 + ALIGN_4 + +.L10: + movq C, 
CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + leaq (RPREFETCHSIZE + 0) * SIZE(B), BB + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + + prefetch -20 * SIZE(BB) + prefetch 28 * SIZE(BB) + subq $-32 * SIZE, BB + + movaps -32 * SIZE(AO), %xmm0 + movaps -32 * SIZE(BO), %xmm1 + pxor %xmm8, %xmm8 + movaps -28 * SIZE(BO), %xmm3 + pxor %xmm9, %xmm9 + movaps -16 * SIZE(AO), %xmm4 + pxor %xmm10, %xmm10 + movaps 0 * SIZE(BO), %xmm5 + pxor %xmm11, %xmm11 + + prefetchw 7 * SIZE(CO1) + pxor %xmm12, %xmm12 + prefetchw 7 * SIZE(CO2) + pxor %xmm13, %xmm13 + pxor %xmm14, %xmm14 + pxor %xmm15, %xmm15 + + movaps %xmm0, %xmm2 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + andq $-8, %rax + + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO + negq %rax + NOBRANCH + je .L15 + ALIGN_3 + +.L12: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + NOBRANCH + je .L15 + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + NOBRANCH + je .L15 + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + NOBRANCH + je .L15 + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + NOBRANCH + je .L15 + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + NOBRANCH + je .L15 + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + NOBRANCH + je .L15 + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + NOBRANCH + je .L15 + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + BRANCH + jl .L12 + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + testq $4, %rax + je .L16 + xorq %rax, %rax + ALIGN_3 + + KERNEL_SUB1(32 * 0) + KERNEL_SUB2(32 * 0) + KERNEL_SUB3(32 * 0) + KERNEL_SUB4(32 * 0) + + addq $64 * SIZE, BO + addq $32 * SIZE, AO + ALIGN_3 + +.L16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA_R, %xmm6 + movaps ALPHA_I, %xmm7 + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO + negq %rax + ALIGN_4 + +.L17: + mulps %xmm1, %xmm0 + mulps -28 * SIZE(AO, %rax, 4), %xmm1 + addps %xmm0, %xmm8 + movaps %xmm2, %xmm0 + addps %xmm1, %xmm12 + movaps -24 * SIZE(BO, %rax, 8), %xmm1 + mulps %xmm3, %xmm2 + mulps -28 * SIZE(AO, %rax, 4), %xmm3 + addps 
%xmm2, %xmm9 + movaps %xmm0, %xmm2 + addps %xmm3, %xmm13 + movaps -20 * SIZE(BO, %rax, 8), %xmm3 + mulps %xmm1, %xmm0 + mulps -28 * SIZE(AO, %rax, 4), %xmm1 + addps %xmm0, %xmm10 + movaps -24 * SIZE(AO, %rax, 4), %xmm0 + addps %xmm1, %xmm14 + movaps -16 * SIZE(BO, %rax, 8), %xmm1 + mulps %xmm3, %xmm2 + mulps -28 * SIZE(AO, %rax, 4), %xmm3 + addps %xmm2, %xmm11 + addps %xmm3, %xmm15 + movaps -12 * SIZE(BO, %rax, 8), %xmm3 + movaps %xmm0, %xmm2 + + addq $SIZE * 2, %rax + jl .L17 + ALIGN_4 + +.L18: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm2 + movhps 6 * SIZE(CO1), %xmm2 + + movsd 0 * SIZE(CO2), %xmm1 + movhps 2 * SIZE(CO2), %xmm1 + movsd 4 * SIZE(CO2), %xmm3 + movhps 6 * SIZE(CO2), %xmm3 +#endif + + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm11, %xmm11 + shufps $0xb1, %xmm13, %xmm13 + shufps $0xb1, %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subps %xmm9, %xmm8 + subps %xmm11, %xmm10 + subps %xmm13, %xmm12 + subps %xmm15, %xmm14 +#else + addps %xmm9, %xmm8 + addps %xmm11, %xmm10 + addps %xmm13, %xmm12 + addps %xmm15, %xmm14 +#endif + + movaps %xmm8, %xmm9 + movaps %xmm10, %xmm11 + movaps %xmm12, %xmm13 + movaps %xmm14, %xmm15 + + shufps $0xb1, %xmm8, %xmm8 + shufps $0xb1, %xmm10, %xmm10 + shufps $0xb1, %xmm12, %xmm12 + shufps $0xb1, %xmm14, %xmm14 + + mulps %xmm6, %xmm9 + mulps %xmm7, %xmm8 + mulps %xmm6, %xmm11 + mulps %xmm7, %xmm10 + + mulps %xmm6, %xmm13 + mulps %xmm7, %xmm12 + mulps %xmm6, %xmm15 + mulps %xmm7, %xmm14 + + addps %xmm9, %xmm8 + addps %xmm11, %xmm10 + addps %xmm13, %xmm12 + addps %xmm15, %xmm14 + +#ifndef TRMMKERNEL + addps %xmm0, %xmm8 + addps %xmm1, %xmm10 + addps %xmm2, %xmm12 + addps %xmm3, %xmm14 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movsd %xmm12, 4 * SIZE(CO1) + movhps %xmm12, 6 * SIZE(CO1) + + movsd %xmm10, 0 * SIZE(CO2) + movhps %xmm10, 2 * SIZE(CO2) + movsd %xmm14, 4 * SIZE(CO2) + movhps %xmm14, 6 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 4 + addq $8 * SIZE, CO2 # coffset += 4 + decq I # i -- + jg .L11 + ALIGN_4 + +.L20: + testq $2, M + je .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + movaps -16 * SIZE(AO), %xmm2 + movaps 0 * SIZE(AO), %xmm4 + movaps 16 * SIZE(AO), %xmm6 + + movaps -32 * SIZE(BO), %xmm1 + movaps -16 * SIZE(BO), %xmm3 + movaps 0 * SIZE(BO), %xmm5 + movaps 16 * SIZE(BO), %xmm7 + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L25 + ALIGN_4 + +.L22: + mulps %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) 
* SIZE(AO) + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm9 + movaps -24 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + mulps -20 * SIZE(BO), %xmm0 + addps %xmm1, %xmm10 + movaps 32 * SIZE(BO), %xmm1 + addps %xmm0, %xmm11 + movaps -28 * SIZE(AO), %xmm0 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm8 + movaps -12 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm9 + movaps -8 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm3 + mulps -4 * SIZE(BO), %xmm0 + addps %xmm3, %xmm10 + movaps 48 * SIZE(BO), %xmm3 + addps %xmm0, %xmm11 + movaps -24 * SIZE(AO), %xmm0 + + mulps %xmm0, %xmm5 + addps %xmm5, %xmm8 + movaps 4 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm5 + addps %xmm5, %xmm9 + movaps 8 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm5 + mulps 12 * SIZE(BO), %xmm0 + addps %xmm5, %xmm10 + movaps 64 * SIZE(BO), %xmm5 + addps %xmm0, %xmm11 + movaps -20 * SIZE(AO), %xmm0 + + mulps %xmm0, %xmm7 + addps %xmm7, %xmm8 + movaps 20 * SIZE(BO), %xmm7 + mulps %xmm0, %xmm7 + addps %xmm7, %xmm9 + movaps 24 * SIZE(BO), %xmm7 + mulps %xmm0, %xmm7 + mulps 28 * SIZE(BO), %xmm0 + addps %xmm7, %xmm10 + movaps 80 * SIZE(BO), %xmm7 + addps %xmm0, %xmm11 + movaps 0 * SIZE(AO), %xmm0 + + mulps %xmm2, %xmm1 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + addps %xmm1, %xmm8 + movaps 36 * SIZE(BO), %xmm1 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm9 + movaps 40 * SIZE(BO), %xmm1 + mulps %xmm2, %xmm1 + mulps 44 * SIZE(BO), %xmm2 + addps %xmm1, %xmm10 + movaps 96 * SIZE(BO), %xmm1 + addps %xmm2, %xmm11 + movaps -12 * SIZE(AO), %xmm2 + + mulps %xmm2, %xmm3 + addps %xmm3, %xmm8 + movaps 52 * SIZE(BO), %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm9 + movaps 56 * SIZE(BO), %xmm3 + mulps %xmm2, %xmm3 + mulps 60 * SIZE(BO), %xmm2 + addps %xmm3, %xmm10 + movaps 112 * SIZE(BO), %xmm3 + addps %xmm2, %xmm11 + movaps -8 * SIZE(AO), %xmm2 + + mulps %xmm2, %xmm5 + addps %xmm5, %xmm8 + movaps 68 * SIZE(BO), %xmm5 + mulps %xmm2, %xmm5 + addps %xmm5, %xmm9 + movaps 72 * SIZE(BO), %xmm5 + mulps %xmm2, %xmm5 + mulps 76 * SIZE(BO), %xmm2 + addps %xmm5, %xmm10 + movaps 128 * SIZE(BO), %xmm5 + addps %xmm2, %xmm11 + movaps -4 * SIZE(AO), %xmm2 + + mulps %xmm2, %xmm7 + addps %xmm7, %xmm8 + movaps 84 * SIZE(BO), %xmm7 + mulps %xmm2, %xmm7 + addps %xmm7, %xmm9 + movaps 88 * SIZE(BO), %xmm7 + mulps %xmm2, %xmm7 + mulps 92 * SIZE(BO), %xmm2 + addps %xmm7, %xmm10 + movaps 144 * SIZE(BO), %xmm7 + addps %xmm2, %xmm11 + movaps 16 * SIZE(AO), %xmm2 + + subq $ -32 * SIZE, AO + subq $-128 * SIZE, BO + + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA_R, %xmm6 + movaps ALPHA_I, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm9 + movaps -24 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + mulps -20 * SIZE(BO), %xmm0 + addps %xmm1, %xmm10 + movaps -16 * SIZE(BO), %xmm1 + addps %xmm0, %xmm11 + movaps -28 * SIZE(AO), %xmm0 + + subq $- 4 * SIZE, AO + subq $-16 * SIZE, BO + decq %rax + jg .L26 + ALIGN_4 + +.L28: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + + movsd 0 * SIZE(CO2), %xmm1 + movhps 2 * SIZE(CO2), %xmm1 +#endif + + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subps %xmm9, %xmm8 + subps %xmm11, %xmm10 +#else + addps %xmm9, %xmm8 + addps %xmm11, %xmm10 +#endif + + 
movaps %xmm8, %xmm9 + movaps %xmm10, %xmm11 + + shufps $0xb1, %xmm8, %xmm8 + shufps $0xb1, %xmm10, %xmm10 + + mulps %xmm6, %xmm9 + mulps %xmm7, %xmm8 + mulps %xmm6, %xmm11 + mulps %xmm7, %xmm10 + + addps %xmm9, %xmm8 + addps %xmm11, %xmm10 + +#ifndef TRMMKERNEL + addps %xmm0, %xmm8 + addps %xmm1, %xmm10 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + + movsd %xmm10, 0 * SIZE(CO2) + movhps %xmm10, 2 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L30: + testq $1, M + je .L39 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + movaps -24 * SIZE(AO), %xmm2 + + movaps -32 * SIZE(BO), %xmm1 + movaps -16 * SIZE(BO), %xmm3 + movaps 0 * SIZE(BO), %xmm5 + movaps 16 * SIZE(BO), %xmm7 + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L35 + ALIGN_4 + +.L32: + mulps %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addps %xmm1, %xmm8 + movsd -28 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm9 + movsd -24 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm10 + movsd -20 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm1, %xmm11 + movsd 32 * SIZE(BO), %xmm1 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm8 + movsd -12 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm9 + movsd -8 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm10 + movsd -4 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm3 + movsd -28 * SIZE(AO), %xmm0 + addps %xmm3, %xmm11 + movsd 48 * SIZE(BO), %xmm3 + + mulps %xmm0, %xmm5 + addps %xmm5, %xmm8 + movsd 4 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm5 + addps %xmm5, %xmm9 + movsd 8 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm5 + addps %xmm5, %xmm10 + movsd 12 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm5 + movsd -26 * SIZE(AO), %xmm0 + addps %xmm5, %xmm11 + movsd 64 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm7 + addps %xmm7, %xmm8 + movsd 20 * SIZE(BO), %xmm7 + mulps %xmm0, %xmm7 + addps %xmm7, %xmm9 + movsd 24 * SIZE(BO), %xmm7 + mulps %xmm0, %xmm7 + addps %xmm7, %xmm10 + movsd 28 * SIZE(BO), %xmm7 + mulps %xmm0, %xmm7 + movsd -16 * SIZE(AO), %xmm0 + addps %xmm7, %xmm11 + movsd 80 * SIZE(BO), %xmm7 + + mulps %xmm2, %xmm1 + addps %xmm1, %xmm8 + movsd 36 * SIZE(BO), %xmm1 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm9 + movsd 40 * SIZE(BO), %xmm1 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm10 + movsd 44 * SIZE(BO), %xmm1 + mulps %xmm2, %xmm1 + movsd -22 * SIZE(AO), %xmm2 + addps %xmm1, %xmm11 + movsd 96 * SIZE(BO), %xmm1 + + mulps %xmm2, %xmm3 + addps %xmm3, %xmm8 + movsd 52 * SIZE(BO), %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm9 + movsd 56 * 
SIZE(BO), %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm10 + movsd 60 * SIZE(BO), %xmm3 + mulps %xmm2, %xmm3 + movsd -20 * SIZE(AO), %xmm2 + addps %xmm3, %xmm11 + movsd 112 * SIZE(BO), %xmm3 + + mulps %xmm2, %xmm5 + addps %xmm5, %xmm8 + movsd 68 * SIZE(BO), %xmm5 + mulps %xmm2, %xmm5 + addps %xmm5, %xmm9 + movsd 72 * SIZE(BO), %xmm5 + mulps %xmm2, %xmm5 + addps %xmm5, %xmm10 + movsd 76 * SIZE(BO), %xmm5 + mulps %xmm2, %xmm5 + movsd -18 * SIZE(AO), %xmm2 + addps %xmm5, %xmm11 + movsd 128 * SIZE(BO), %xmm5 + + mulps %xmm2, %xmm7 + addps %xmm7, %xmm8 + movsd 84 * SIZE(BO), %xmm7 + mulps %xmm2, %xmm7 + addps %xmm7, %xmm9 + movsd 88 * SIZE(BO), %xmm7 + mulps %xmm2, %xmm7 + addps %xmm7, %xmm10 + movsd 92 * SIZE(BO), %xmm7 + mulps %xmm2, %xmm7 + movsd -8 * SIZE(AO), %xmm2 + addps %xmm7, %xmm11 + movsd 144 * SIZE(BO), %xmm7 + + subq $ -16 * SIZE, AO + subq $-128 * SIZE, BO + + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA_R, %xmm6 + movaps ALPHA_I, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movsd -28 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm9 + movsd -24 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm10 + movsd -20 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm1, %xmm11 + movsd -16 * SIZE(BO), %xmm1 + + subq $ -2 * SIZE, AO + subq $-16 * SIZE, BO + decq %rax + jg .L36 + ALIGN_4 + +.L38: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm1 +#endif + + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subps %xmm9, %xmm8 + subps %xmm11, %xmm10 +#else + addps %xmm9, %xmm8 + addps %xmm11, %xmm10 +#endif + + movaps %xmm8, %xmm9 + movaps %xmm10, %xmm11 + + shufps $0xb1, %xmm8, %xmm8 + shufps $0xb1, %xmm10, %xmm10 + + mulps %xmm6, %xmm9 + mulps %xmm7, %xmm8 + mulps %xmm6, %xmm11 + mulps %xmm7, %xmm10 + + addps %xmm9, %xmm8 + addps %xmm11, %xmm10 + +#ifndef TRMMKERNEL + addps %xmm0, %xmm8 + addps %xmm1, %xmm10 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movsd %xmm10, 0 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leaq (C, LDC, 2), C # c += 2 * ldc + decq J # j -- + jg .L01 + ALIGN_4 + +.L40: + testq $1, N + je .L999 + ALIGN_4 + +.L41: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + movaps POSINV, %xmm15 + + movq K, %rax + sarq $2, %rax + jle .L43 + ALIGN_4 + +.L42: + prefetch (RPREFETCHSIZE + 0) * SIZE(B) + + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + + prefetchw (WPREFETCHSIZE + 0) * SIZE(BO) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + prefetchw (WPREFETCHSIZE + 16) * SIZE(BO) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) 
|| defined(TC) + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 + xorps %xmm15, %xmm5 + xorps %xmm15, %xmm7 +#else + xorps %xmm15, %xmm0 + xorps %xmm15, %xmm2 + xorps %xmm15, %xmm4 + xorps %xmm15, %xmm6 +#endif + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO + + decq %rax + jne .L42 + ALIGN_4 + +.L43: + movq K, %rax + andq $3, %rax + BRANCH + jle .L50 + ALIGN_4 + +.L44: + movsd 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + + addq $2 * SIZE, B + addq $8 * SIZE, BO + decq %rax + jne .L44 + ALIGN_4 + +.L50: + movq C, CO1 # coffset1 = c + movq A, AO # aoffset = a + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movaps -16 * SIZE(AO), %xmm2 + pxor %xmm9, %xmm9 + movaps 0 * SIZE(AO), %xmm4 + pxor %xmm10, %xmm10 + movaps 16 * SIZE(AO), %xmm6 + pxor %xmm11, %xmm11 + + movaps -32 * SIZE(BO), %xmm1 + pxor %xmm12, %xmm12 + movaps -16 * SIZE(BO), %xmm3 + pxor %xmm13, %xmm13 + movaps 0 * SIZE(BO), %xmm5 + pxor %xmm14, %xmm14 + movaps 16 * SIZE(BO), %xmm7 + pxor %xmm15, %xmm15 + + prefetchw 7 * SIZE(CO1) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L55 + ALIGN_4 + +.L52: + mulps %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulps -28 * SIZE(BO), %xmm0 + addps %xmm1, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + addps %xmm0, %xmm9 + movaps -28 * SIZE(AO), %xmm0 + mulps %xmm0, %xmm1 + mulps -28 * SIZE(BO), %xmm0 + addps %xmm1, %xmm12 + movaps -24 * SIZE(BO), %xmm1 + addps %xmm0, %xmm13 + movaps -24 * SIZE(AO), %xmm0 + + mulps %xmm0, %xmm1 + mulps -20 * SIZE(BO), %xmm0 + addps %xmm1, %xmm8 + movaps -24 * SIZE(BO), %xmm1 + addps %xmm0, %xmm9 + movaps -20 * SIZE(AO), %xmm0 + mulps %xmm0, %xmm1 + mulps -20 * SIZE(BO), %xmm0 + addps %xmm1, %xmm12 + movaps 32 * SIZE(BO), %xmm1 + addps %xmm0, %xmm13 + movaps 32 * SIZE(AO), %xmm0 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + + mulps %xmm2, %xmm3 + mulps -12 * SIZE(BO), %xmm2 + addps %xmm3, %xmm8 + movaps -16 * SIZE(BO), %xmm3 + addps %xmm2, %xmm9 + movaps -12 * SIZE(AO), %xmm2 + mulps %xmm2, %xmm3 + mulps -12 * SIZE(BO), %xmm2 + addps %xmm3, %xmm12 + movaps -8 * SIZE(BO), %xmm3 + addps %xmm2, %xmm13 + movaps -8 * SIZE(AO), %xmm2 + + mulps %xmm2, %xmm3 + mulps -4 * SIZE(BO), %xmm2 + addps %xmm3, %xmm8 + movaps -8 * SIZE(BO), %xmm3 + addps %xmm2, %xmm9 + movaps -4 * SIZE(AO), %xmm2 + mulps %xmm2, %xmm3 + mulps -4 * SIZE(BO), %xmm2 + addps %xmm3, %xmm12 + movaps 48 * SIZE(BO), %xmm3 + addps %xmm2, %xmm13 + movaps 
48 * SIZE(AO), %xmm2 + + PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) + + mulps %xmm4, %xmm5 + mulps 4 * SIZE(BO), %xmm4 + addps %xmm5, %xmm8 + movaps 0 * SIZE(BO), %xmm5 + addps %xmm4, %xmm9 + movaps 4 * SIZE(AO), %xmm4 + mulps %xmm4, %xmm5 + mulps 4 * SIZE(BO), %xmm4 + addps %xmm5, %xmm12 + movaps 8 * SIZE(BO), %xmm5 + addps %xmm4, %xmm13 + movaps 8 * SIZE(AO), %xmm4 + + mulps %xmm4, %xmm5 + mulps 12 * SIZE(BO), %xmm4 + addps %xmm5, %xmm8 + movaps 8 * SIZE(BO), %xmm5 + addps %xmm4, %xmm9 + movaps 12 * SIZE(AO), %xmm4 + mulps %xmm4, %xmm5 + mulps 12 * SIZE(BO), %xmm4 + addps %xmm5, %xmm12 + movaps 64 * SIZE(BO), %xmm5 + addps %xmm4, %xmm13 + movaps 64 * SIZE(AO), %xmm4 + + PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) + + mulps %xmm6, %xmm7 + mulps 20 * SIZE(BO), %xmm6 + addps %xmm7, %xmm8 + movaps 16 * SIZE(BO), %xmm7 + addps %xmm6, %xmm9 + movaps 20 * SIZE(AO), %xmm6 + mulps %xmm6, %xmm7 + mulps 20 * SIZE(BO), %xmm6 + addps %xmm7, %xmm12 + movaps 24 * SIZE(BO), %xmm7 + addps %xmm6, %xmm13 + movaps 24 * SIZE(AO), %xmm6 + + mulps %xmm6, %xmm7 + mulps 28 * SIZE(BO), %xmm6 + addps %xmm7, %xmm8 + movaps 24 * SIZE(BO), %xmm7 + addps %xmm6, %xmm9 + movaps 28 * SIZE(AO), %xmm6 + mulps %xmm6, %xmm7 + mulps 28 * SIZE(BO), %xmm6 + addps %xmm7, %xmm12 + movaps 80 * SIZE(BO), %xmm7 + addps %xmm6, %xmm13 + movaps 80 * SIZE(AO), %xmm6 + + subq $-64 * SIZE, AO + subq $-64 * SIZE, BO + + decq %rax + jne .L52 + ALIGN_4 + +.L55: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA_R, %xmm6 + movaps ALPHA_I, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + mulps %xmm0, %xmm1 + mulps -28 * SIZE(BO), %xmm0 + addps %xmm1, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + addps %xmm0, %xmm9 + movaps -28 * SIZE(AO), %xmm0 + mulps %xmm0, %xmm1 + mulps -28 * SIZE(BO), %xmm0 + addps %xmm1, %xmm12 + movaps -24 * SIZE(BO), %xmm1 + addps %xmm0, %xmm13 + movaps -24 * SIZE(AO), %xmm0 + + addq $ 8 * SIZE, AO + addq $ 8 * SIZE, BO + decq %rax + jg .L56 + ALIGN_4 + +.L58: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm2 + movhps 6 * SIZE(CO1), %xmm2 +#endif + + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subps %xmm9, %xmm8 + subps %xmm13, %xmm12 +#else + addps %xmm9, %xmm8 + addps %xmm13, %xmm12 +#endif + + movaps %xmm8, %xmm9 + movaps %xmm12, %xmm13 + + shufps $0xb1, %xmm8, %xmm8 + shufps $0xb1, %xmm12, %xmm12 + + mulps %xmm6, %xmm9 + mulps %xmm7, %xmm8 + mulps %xmm6, %xmm13 + mulps %xmm7, %xmm12 + + addps %xmm9, %xmm8 + addps %xmm13, %xmm12 + +#ifndef TRMMKERNEL + addps %xmm0, %xmm8 + addps %xmm2, %xmm12 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movsd %xmm12, 4 * SIZE(CO1) + movhps %xmm12, 6 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L51 + ALIGN_4 + +.L60: + testq $2, M + je .L70 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + 
leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movaps -16 * SIZE(AO), %xmm2 + pxor %xmm9, %xmm9 + + movaps -32 * SIZE(BO), %xmm1 + pxor %xmm10, %xmm10 + movaps -16 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + movaps 0 * SIZE(BO), %xmm5 + movaps 16 * SIZE(BO), %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + + sarq $3, %rax + je .L65 + ALIGN_4 + +.L62: + mulps %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulps -28 * SIZE(BO), %xmm0 + addps %xmm1, %xmm8 + movaps -24 * SIZE(BO), %xmm1 + addps %xmm0, %xmm9 + movaps -28 * SIZE(AO), %xmm0 + mulps %xmm0, %xmm1 + mulps -20 * SIZE(BO), %xmm0 + addps %xmm1, %xmm10 + movaps 32 * SIZE(BO), %xmm1 + addps %xmm0, %xmm11 + movaps -24 * SIZE(AO), %xmm0 + + mulps %xmm0, %xmm3 + mulps -12 * SIZE(BO), %xmm0 + addps %xmm3, %xmm8 + movaps -8 * SIZE(BO), %xmm3 + addps %xmm0, %xmm9 + movaps -20 * SIZE(AO), %xmm0 + mulps %xmm0, %xmm3 + mulps -4 * SIZE(BO), %xmm0 + addps %xmm3, %xmm10 + movaps 48 * SIZE(BO), %xmm3 + addps %xmm0, %xmm11 + movaps 0 * SIZE(AO), %xmm0 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + + mulps %xmm2, %xmm5 + mulps 4 * SIZE(BO), %xmm2 + addps %xmm5, %xmm8 + movaps 8 * SIZE(BO), %xmm5 + addps %xmm2, %xmm9 + movaps -12 * SIZE(AO), %xmm2 + mulps %xmm2, %xmm5 + mulps 12 * SIZE(BO), %xmm2 + addps %xmm5, %xmm10 + movaps 64 * SIZE(BO), %xmm5 + addps %xmm2, %xmm11 + movaps -8 * SIZE(AO), %xmm2 + + mulps %xmm2, %xmm7 + mulps 20 * SIZE(BO), %xmm2 + addps %xmm7, %xmm8 + movaps 24 * SIZE(BO), %xmm7 + addps %xmm2, %xmm9 + movaps -4 * SIZE(AO), %xmm2 + mulps %xmm2, %xmm7 + mulps 28 * SIZE(BO), %xmm2 + addps %xmm7, %xmm10 + movaps 80 * SIZE(BO), %xmm7 + addps %xmm2, %xmm11 + movaps 16 * SIZE(AO), %xmm2 + + subq $-32 * SIZE, AO + subq $-64 * SIZE, BO + + decq %rax + jne .L62 + ALIGN_4 + +.L65: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA_R, %xmm6 + movaps ALPHA_I, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulps %xmm0, %xmm1 + mulps -28 * SIZE(BO), %xmm0 + addps %xmm1, %xmm8 + movaps -24 * SIZE(BO), %xmm1 + addps %xmm0, %xmm9 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L66 + ALIGN_4 + +.L68: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 +#endif + + addps %xmm10, %xmm8 + addps %xmm11, %xmm9 + + shufps $0xb1, %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subps %xmm9, %xmm8 +#else + addps %xmm9, %xmm8 +#endif + + movaps %xmm8, %xmm9 + + shufps $0xb1, %xmm8, %xmm8 + + mulps %xmm6, %xmm9 + mulps %xmm7, %xmm8 + + addps %xmm9, %xmm8 + +#ifndef TRMMKERNEL + addps %xmm0, %xmm8 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + +.L70: + testq $1, M + je .L999 + +#if !defined(TRMMKERNEL) || \ + 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movaps -24 * SIZE(AO), %xmm2 + pxor %xmm9, %xmm9 + + movaps -32 * SIZE(BO), %xmm1 + pxor %xmm10, %xmm10 + movaps -16 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + movaps 0 * SIZE(BO), %xmm5 + movaps 16 * SIZE(BO), %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L75 + ALIGN_4 + +.L72: + mulps %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addps %xmm1, %xmm8 + movsd -28 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm1, %xmm9 + movsd -24 * SIZE(BO), %xmm1 + + mulps %xmm0, %xmm1 + addps %xmm1, %xmm10 + movsd -20 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + movsd -28 * SIZE(AO), %xmm0 + addps %xmm1, %xmm11 + movsd 32 * SIZE(BO), %xmm1 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm8 + movsd -12 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm3 + movsd -26 * SIZE(AO), %xmm0 + addps %xmm3, %xmm9 + movsd -8 * SIZE(BO), %xmm3 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm10 + movsd -4 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm3 + movsd -16 * SIZE(AO), %xmm0 + addps %xmm3, %xmm11 + movsd 48 * SIZE(BO), %xmm3 + + mulps %xmm2, %xmm5 + addps %xmm5, %xmm8 + movsd 4 * SIZE(BO), %xmm5 + mulps %xmm2, %xmm5 + movsd -22 * SIZE(AO), %xmm2 + addps %xmm5, %xmm9 + movsd 8 * SIZE(BO), %xmm5 + + mulps %xmm2, %xmm5 + addps %xmm5, %xmm10 + movsd 12 * SIZE(BO), %xmm5 + mulps %xmm2, %xmm5 + movsd -20 * SIZE(AO), %xmm2 + addps %xmm5, %xmm11 + movsd 64 * SIZE(BO), %xmm5 + + mulps %xmm2, %xmm7 + addps %xmm7, %xmm8 + movsd 20 * SIZE(BO), %xmm7 + mulps %xmm2, %xmm7 + movsd -18 * SIZE(AO), %xmm2 + addps %xmm7, %xmm9 + movsd 24 * SIZE(BO), %xmm7 + + mulps %xmm2, %xmm7 + addps %xmm7, %xmm10 + movsd 28 * SIZE(BO), %xmm7 + mulps %xmm2, %xmm7 + movsd -8 * SIZE(AO), %xmm2 + addps %xmm7, %xmm11 + movsd 80 * SIZE(BO), %xmm7 + + subq $-16 * SIZE, AO + subq $-64 * SIZE, BO + + decq %rax + jne .L72 + ALIGN_4 + +.L75: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA_R, %xmm6 + movaps ALPHA_I, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movsd -28 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm1, %xmm9 + movsd -24 * SIZE(BO), %xmm1 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + decq %rax + jg .L76 + ALIGN_4 + +.L78: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 +#endif + + addps %xmm10, %xmm8 + addps %xmm11, %xmm9 + + shufps $0xb1, %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subps %xmm9, %xmm8 +#else + addps %xmm9, %xmm8 +#endif + + movaps %xmm8, %xmm9 + + shufps $0xb1, %xmm8, %xmm8 + + mulps %xmm6, %xmm9 + mulps %xmm7, %xmm8 + + addps %xmm9, %xmm8 +#ifndef TRMMKERNEL + addps %xmm0, %xmm8 +#endif + movsd %xmm8, 0 * SIZE(CO1) + ALIGN_4 + +.L999: + movq %rbx, %rsp + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef 
WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm_kernel_4x2_core2.S b/kernel/x86_64/zgemm_kernel_4x2_core2.S new file mode 100644 index 0000000000..1b5d9a03f7 --- /dev/null +++ b/kernel/x86_64/zgemm_kernel_4x2_core2.S @@ -0,0 +1,1744 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define ALPHA_R 0(%rsp) +#define ALPHA_I 16(%rsp) +#define J 32(%rsp) +#define OFFSET 40(%rsp) +#define KK 48(%rsp) +#define KKK 56(%rsp) +#define BUFFER 128(%rsp) + +#define PREFETCH_R (16 * 4 + 0) +#define PREFETCH_W (PREFETCH_R * 4) + +#define PREFETCHSIZE (16 * 13 + 10) +#define PREFETCH prefetcht0 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define ADDSUB addps +#else +#define ADDSUB subps +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + movaps %xmm3, %xmm0 + movsd OLD_ALPHA_I, %xmm1 +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + +#endif + + movq %rsp, %r15 # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + pxor %xmm7, %xmm7 + cmpeqps %xmm7, %xmm7 + pslld $31, %xmm7 # Generate mask + + shufps $0, %xmm0, %xmm0 + movaps %xmm0, 0 + ALPHA_R + + movss %xmm1, 4 + ALPHA_I + movss %xmm1, 12 + ALPHA_I + xorps %xmm7, %xmm1 + movss %xmm1, 0 + ALPHA_I + movss %xmm1, 8 + ALPHA_I + + subq $-32 * SIZE, A + subq $-32 * SIZE, B + +#ifdef TRMMKERNEL + movsd %xmm12, OFFSET + movsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + movq OLD_M, M + movq OLD_N, N + + salq $ZBASE_SHIFT, LDC + movq N, J + sarq $1, J # j = (n >> 2) + jle .L40 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq 32 * SIZE + BUFFER, BO + + movaps -32 * SIZE(B), %xmm3 + + movq K, %rax + sarq $2, %rax + jle .L03 + ALIGN_4 + +.L02: + prefetcht0 (PREFETCH_R + 0) * SIZE(B) + movaps -28 * SIZE(B), %xmm7 + movaps -24 * SIZE(B), %xmm11 + movaps -20 * SIZE(B), %xmm15 + + prefetcht0 (PREFETCH_W + 0) * SIZE(BO) + pshufd $0x00, %xmm3, %xmm0 + movaps %xmm0, -32 * SIZE(BO) + pshufd $0x55, %xmm3, %xmm1 + movaps %xmm1, -28 * SIZE(BO) + pshufd $0xaa, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(BO) + pshufd $0xff, %xmm3, %xmm3 + movaps %xmm3, -20 * SIZE(BO) + + movaps -16 * SIZE(B), %xmm3 + + prefetcht0 (PREFETCH_W + 16) * SIZE(BO) + pshufd $0x00, %xmm7, %xmm4 + movaps 
%xmm4, -16 * SIZE(BO) + pshufd $0x55, %xmm7, %xmm5 + movaps %xmm5, -12 * SIZE(BO) + pshufd $0xaa, %xmm7, %xmm6 + movaps %xmm6, -8 * SIZE(BO) + pshufd $0xff, %xmm7, %xmm7 + movaps %xmm7, -4 * SIZE(BO) + + prefetcht0 (PREFETCH_W + 32) * SIZE(BO) + + pshufd $0x00, %xmm11, %xmm8 + movaps %xmm8, 0 * SIZE(BO) + pshufd $0x55, %xmm11, %xmm9 + movaps %xmm9, 4 * SIZE(BO) + pshufd $0xaa, %xmm11, %xmm10 + movaps %xmm10, 8 * SIZE(BO) + pshufd $0xff, %xmm11, %xmm11 + movaps %xmm11, 12 * SIZE(BO) + + prefetcht0 (PREFETCH_W + 48) * SIZE(BO) + + pshufd $0x00, %xmm15, %xmm12 + movaps %xmm12, 16 * SIZE(BO) + pshufd $0x55, %xmm15, %xmm13 + movaps %xmm13, 20 * SIZE(BO) + pshufd $0xaa, %xmm15, %xmm14 + movaps %xmm14, 24 * SIZE(BO) + pshufd $0xff, %xmm15, %xmm15 + movaps %xmm15, 28 * SIZE(BO) + + subq $-16 * SIZE, B + subq $-64 * SIZE, BO + subq $1, %rax + jne .L02 + ALIGN_4 + +.L03: + movq K, %rax + andq $3, %rax + BRANCH + jle .L10 + ALIGN_4 + +.L04: + pshufd $0x00, %xmm3, %xmm0 + movaps %xmm0, -32 * SIZE(BO) + pshufd $0x55, %xmm3, %xmm1 + movaps %xmm1, -28 * SIZE(BO) + pshufd $0xaa, %xmm3, %xmm2 + movaps %xmm2, -24 * SIZE(BO) + pshufd $0xff, %xmm3, %xmm3 + movaps %xmm3, -20 * SIZE(BO) + + movaps -28 * SIZE(B), %xmm3 + + addq $ 4 * SIZE, B + addq $16 * SIZE, BO + subq $1, %rax + jne .L04 + ALIGN_4 + +.L10: + leaq (PREFETCH_R + 0) * SIZE(B), BB + + movq C, CO1 + leaq (C, LDC, 1), CO2 + movq A, AO + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 40 * SIZE + BUFFER, BO +#else + leaq 40 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm1 + movaps -40 * SIZE(BO), %xmm6 + movaps -36 * SIZE(BO), %xmm7 + + prefetcht2 -32 * SIZE(BB) + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + + pxor %xmm12, %xmm12 + prefetcht0 7 * SIZE(CO1) + pxor %xmm13, %xmm13 + pxor %xmm14, %xmm14 + pxor %xmm15, %xmm15 + + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + prefetcht0 7 * SIZE(CO2) + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + + subq $-32 * SIZE, BB + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + jle .L15 + ALIGN_4 + +.L12: + addps %xmm2, %xmm10 + movaps -32 * SIZE(BO), %xmm2 + addps %xmm3, %xmm14 + PADDING; + movaps %xmm6, %xmm3 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + ADDSUB %xmm4, %xmm11 + movaps -28 * SIZE(BO), %xmm4 + ADDSUB %xmm5, %xmm15 + movaps %xmm7, %xmm5 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm6, %xmm8 + movaps -24 * SIZE(BO), %xmm6 + addps %xmm3, %xmm12 + movaps %xmm2, %xmm3 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + ADDSUB %xmm7, %xmm9 + movaps -20 * SIZE(BO), %xmm7 + ADDSUB %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -20 * SIZE(AO), %xmm1 + + addps %xmm2, %xmm10 + movaps -16 * SIZE(BO), %xmm2 + addps %xmm3, %xmm14 + movaps %xmm6, %xmm3 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + ADDSUB %xmm4, %xmm11 + movaps -12 * SIZE(BO), %xmm4 + ADDSUB %xmm5, %xmm15 + movaps %xmm7, %xmm5 + mulps %xmm0, %xmm7 + 
mulps %xmm1, %xmm5 + + addps %xmm6, %xmm8 + movaps -8 * SIZE(BO), %xmm6 + addps %xmm3, %xmm12 + movaps %xmm2, %xmm3 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + ADDSUB %xmm7, %xmm9 + movaps -4 * SIZE(BO), %xmm7 + ADDSUB %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -16 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -12 * SIZE(AO), %xmm1 + + addps %xmm2, %xmm10 + movaps 0 * SIZE(BO), %xmm2 + addps %xmm3, %xmm14 + PADDING; + movaps %xmm6, %xmm3 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + ADDSUB %xmm4, %xmm11 + movaps 4 * SIZE(BO), %xmm4 + ADDSUB %xmm5, %xmm15 + movaps %xmm7, %xmm5 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm6, %xmm8 + movaps 8 * SIZE(BO), %xmm6 + addps %xmm3, %xmm12 + movaps %xmm2, %xmm3 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + ADDSUB %xmm7, %xmm9 + movaps 12 * SIZE(BO), %xmm7 + ADDSUB %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -8 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -4 * SIZE(AO), %xmm1 + + addps %xmm2, %xmm10 + movaps 16 * SIZE(BO), %xmm2 + addps %xmm3, %xmm14 + movaps %xmm6, %xmm3 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + ADDSUB %xmm4, %xmm11 + movaps 20 * SIZE(BO), %xmm4 + ADDSUB %xmm5, %xmm15 + movaps %xmm7, %xmm5 + subq $-32 * SIZE, AO + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm6, %xmm8 + movaps 24 * SIZE(BO), %xmm6 + addps %xmm3, %xmm12 + movaps %xmm2, %xmm3 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + ADDSUB %xmm7, %xmm9 + movaps 28 * SIZE(BO), %xmm7 + ADDSUB %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -32 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -28 * SIZE(AO), %xmm1 + + subq $-64 * SIZE, BO + subq $1, %rax + BRANCH + jg .L12 + ALIGN_4 + +.L15: + prefetcht2 -16 * SIZE(BB) + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_4 + +.L16: + addps %xmm2, %xmm10 + movaps -32 * SIZE(BO), %xmm2 + addps %xmm3, %xmm14 + movaps %xmm6, %xmm3 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + ADDSUB %xmm4, %xmm11 + movaps -28 * SIZE(BO), %xmm4 + ADDSUB %xmm5, %xmm15 + movaps %xmm7, %xmm5 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + addps %xmm6, %xmm8 + movaps -24 * SIZE(BO), %xmm6 + addps %xmm3, %xmm12 + addq $8 * SIZE, AO + movaps %xmm2, %xmm3 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + ADDSUB %xmm7, %xmm9 + movaps -20 * SIZE(BO), %xmm7 + ADDSUB %xmm5, %xmm13 + addq $16 * SIZE, BO + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -32 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -28 * SIZE(AO), %xmm1 + + subq $1, %rax + jg .L16 + ALIGN_4 + +.L18: + movaps ALPHA_R, %xmm6 + movaps ALPHA_I, %xmm7 + + addps %xmm2, %xmm10 + addps %xmm3, %xmm14 + ADDSUB %xmm4, %xmm11 + ADDSUB %xmm5, %xmm15 + +#if !defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm2 + movhps 6 * SIZE(CO1), %xmm2 + + movsd 0 * SIZE(CO2), %xmm1 + movhps 2 * SIZE(CO2), %xmm1 + movsd 4 * SIZE(CO2), %xmm3 + movhps 6 * SIZE(CO2), %xmm3 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm11, %xmm11 + shufps $0xb1, %xmm13, %xmm13 + shufps $0xb1, %xmm15, %xmm15 + + addsubps %xmm9, %xmm8 + addsubps %xmm11, %xmm10 + addsubps %xmm13, %xmm12 + addsubps %xmm15, %xmm14 + + movaps %xmm8, %xmm9 + movaps %xmm10, %xmm11 + movaps %xmm12, %xmm13 + movaps %xmm14, %xmm15 + + shufps $0xb1, %xmm8, 
%xmm8 + shufps $0xb1, %xmm10, %xmm10 + shufps $0xb1, %xmm12, %xmm12 + shufps $0xb1, %xmm14, %xmm14 +#else + shufps $0xb1, %xmm8, %xmm8 + shufps $0xb1, %xmm10, %xmm10 + shufps $0xb1, %xmm12, %xmm12 + shufps $0xb1, %xmm14, %xmm14 + + addsubps %xmm8, %xmm9 + addsubps %xmm10, %xmm11 + addsubps %xmm12, %xmm13 + addsubps %xmm14, %xmm15 + + movaps %xmm9, %xmm8 + movaps %xmm11, %xmm10 + movaps %xmm13, %xmm12 + movaps %xmm15, %xmm14 + + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm11, %xmm11 + shufps $0xb1, %xmm13, %xmm13 + shufps $0xb1, %xmm15, %xmm15 +#endif + + mulps %xmm6, %xmm9 + mulps %xmm7, %xmm8 + mulps %xmm6, %xmm11 + mulps %xmm7, %xmm10 + + mulps %xmm6, %xmm13 + mulps %xmm7, %xmm12 + mulps %xmm6, %xmm15 + mulps %xmm7, %xmm14 + + addps %xmm9, %xmm8 + addps %xmm11, %xmm10 + addps %xmm13, %xmm12 + addps %xmm15, %xmm14 + +#if !defined(TRMMKERNEL) && !defined(BETAZERO) + addps %xmm0, %xmm8 + addps %xmm1, %xmm10 + addps %xmm2, %xmm12 + addps %xmm3, %xmm14 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movsd %xmm12, 4 * SIZE(CO1) + movhps %xmm12, 6 * SIZE(CO1) + + movsd %xmm10, 0 * SIZE(CO2) + movhps %xmm10, 2 * SIZE(CO2) + movsd %xmm14, 4 * SIZE(CO2) + movhps %xmm14, 6 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 4 + addq $8 * SIZE, CO2 # coffset += 4 + decq I # i -- + jg .L11 + ALIGN_4 + +.L20: + testq $2, M + je .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movaps -32 * SIZE(AO), %xmm0 + movaps -32 * SIZE(BO), %xmm2 + movaps -28 * SIZE(BO), %xmm3 + movaps -24 * SIZE(BO), %xmm4 + movaps -20 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + ADDSUB %xmm3, %xmm9 + addps %xmm4, %xmm10 + ADDSUB %xmm5, %xmm11 + + movaps -28 * SIZE(AO), %xmm0 + movaps -16 * SIZE(BO), %xmm2 + movaps -12 * SIZE(BO), %xmm3 + movaps -8 * SIZE(BO), %xmm4 + movaps -4 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + ADDSUB %xmm3, %xmm9 + addps %xmm4, %xmm10 + ADDSUB %xmm5, %xmm11 + + movaps -24 * SIZE(AO), %xmm0 + movaps 0 * SIZE(BO), %xmm2 + movaps 4 * SIZE(BO), %xmm3 + movaps 8 * SIZE(BO), %xmm4 + movaps 12 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + ADDSUB %xmm3, %xmm9 + addps %xmm4, %xmm10 + ADDSUB %xmm5, %xmm11 + + movaps -20 * SIZE(AO), %xmm0 + movaps 16 * SIZE(BO), %xmm2 + movaps 20 * SIZE(BO), %xmm3 + movaps 24 * 
SIZE(BO), %xmm4 + movaps 28 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + ADDSUB %xmm3, %xmm9 + addps %xmm4, %xmm10 + ADDSUB %xmm5, %xmm11 + + subq $-16 * SIZE, AO + subq $-64 * SIZE, BO + subq $1, %rax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + movaps -32 * SIZE(AO), %xmm0 + movaps -32 * SIZE(BO), %xmm2 + movaps -28 * SIZE(BO), %xmm3 + movaps -24 * SIZE(BO), %xmm4 + movaps -20 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + ADDSUB %xmm3, %xmm9 + addps %xmm4, %xmm10 + ADDSUB %xmm5, %xmm11 + + addq $ 4 * SIZE, AO + addq $16 * SIZE, BO + subq $1, %rax + jg .L26 + ALIGN_4 + +.L28: + movaps ALPHA_R, %xmm6 + movaps ALPHA_I, %xmm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm11, %xmm11 + + addsubps %xmm9, %xmm8 + addsubps %xmm11, %xmm10 + + movaps %xmm8, %xmm9 + movaps %xmm10, %xmm11 + + shufps $0xb1, %xmm8, %xmm8 + shufps $0xb1, %xmm10, %xmm10 +#else + shufps $0xb1, %xmm8, %xmm8 + shufps $0xb1, %xmm10, %xmm10 + + addsubps %xmm8, %xmm9 + addsubps %xmm10, %xmm11 + + movaps %xmm9, %xmm8 + movaps %xmm11, %xmm10 + + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm11, %xmm11 +#endif + + mulps %xmm6, %xmm9 + mulps %xmm7, %xmm8 + mulps %xmm6, %xmm11 + mulps %xmm7, %xmm10 + + addps %xmm9, %xmm8 + addps %xmm11, %xmm10 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm2 + movhps 2 * SIZE(CO2), %xmm2 + + addps %xmm0, %xmm8 + addps %xmm2, %xmm10 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movsd %xmm10, 0 * SIZE(CO2) + movhps %xmm10, 2 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L30: + testq $1, M + je .L39 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + je .L35 + ALIGN_4 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movsd -32 * SIZE(AO), %xmm0 + movsd -32 * SIZE(BO), %xmm2 + movsd -28 * SIZE(BO), %xmm3 + movsd -24 * SIZE(BO), %xmm4 + movsd -20 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + ADDSUB %xmm3, %xmm9 + addps %xmm4, %xmm10 + ADDSUB %xmm5, %xmm11 + + movsd -30 * SIZE(AO), %xmm0 + 
movsd -16 * SIZE(BO), %xmm2 + movsd -12 * SIZE(BO), %xmm3 + movsd -8 * SIZE(BO), %xmm4 + movsd -4 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + ADDSUB %xmm3, %xmm9 + addps %xmm4, %xmm10 + ADDSUB %xmm5, %xmm11 + + movsd -28 * SIZE(AO), %xmm0 + movsd 0 * SIZE(BO), %xmm2 + movsd 4 * SIZE(BO), %xmm3 + movsd 8 * SIZE(BO), %xmm4 + movsd 12 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + ADDSUB %xmm3, %xmm9 + addps %xmm4, %xmm10 + ADDSUB %xmm5, %xmm11 + + movsd -26 * SIZE(AO), %xmm0 + movsd 16 * SIZE(BO), %xmm2 + movsd 20 * SIZE(BO), %xmm3 + movsd 24 * SIZE(BO), %xmm4 + movsd 28 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + ADDSUB %xmm3, %xmm9 + addps %xmm4, %xmm10 + ADDSUB %xmm5, %xmm11 + + subq $ -8 * SIZE, AO + subq $-64 * SIZE, BO + subq $1, %rax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + movsd -32 * SIZE(AO), %xmm0 + movsd -32 * SIZE(BO), %xmm2 + movsd -28 * SIZE(BO), %xmm3 + movsd -24 * SIZE(BO), %xmm4 + movsd -20 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm0, %xmm5 + + addps %xmm2, %xmm8 + ADDSUB %xmm3, %xmm9 + addps %xmm4, %xmm10 + ADDSUB %xmm5, %xmm11 + + addq $ 2 * SIZE, AO + addq $16 * SIZE, BO + subq $1, %rax + jg .L36 + ALIGN_4 + +.L38: + movaps ALPHA_R, %xmm6 + movaps ALPHA_I, %xmm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm11, %xmm11 + + addsubps %xmm9, %xmm8 + addsubps %xmm11, %xmm10 + + movaps %xmm8, %xmm9 + movaps %xmm10, %xmm11 + + shufps $0xb1, %xmm8, %xmm8 + shufps $0xb1, %xmm10, %xmm10 +#else + shufps $0xb1, %xmm8, %xmm8 + shufps $0xb1, %xmm10, %xmm10 + + addsubps %xmm8, %xmm9 + addsubps %xmm10, %xmm11 + + movaps %xmm9, %xmm8 + movaps %xmm11, %xmm10 + + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm11, %xmm11 +#endif + + mulps %xmm6, %xmm9 + mulps %xmm7, %xmm8 + mulps %xmm6, %xmm11 + mulps %xmm7, %xmm10 + + addps %xmm9, %xmm8 + addps %xmm11, %xmm10 + +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm2 + + addps %xmm0, %xmm8 + addps %xmm2, %xmm10 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movsd %xmm10, 0 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leaq (C, LDC, 2), C # c += 2 * ldc + decq J # j -- + jg .L01 + ALIGN_4 + +.L40: + testq $1, N + je .L999 + ALIGN_4 + +.L41: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + + movq K, %rax + sarq $2, %rax + jle .L43 + ALIGN_4 + +.L42: + movss -32 * SIZE(B), %xmm8 + movss -31 * SIZE(B), %xmm9 + movss -30 * SIZE(B), %xmm10 + movss -29 * SIZE(B), %xmm11 + movss -28 * SIZE(B), %xmm12 + movss -27 * SIZE(B), %xmm13 + movss -26 * SIZE(B), %xmm14 + movss -25 * SIZE(B), %xmm15 + + shufps $0, %xmm8, %xmm8 + shufps $0, %xmm9, %xmm9 + shufps $0, %xmm10, %xmm10 + shufps $0, %xmm11, %xmm11 + shufps $0, %xmm12, %xmm12 + shufps $0, %xmm13, %xmm13 + shufps $0, %xmm14, %xmm14 + shufps $0, %xmm15, %xmm15 + + movaps %xmm8, 0 * SIZE(BO) + movaps %xmm9, 4 * SIZE(BO) + movaps %xmm10, 8 * SIZE(BO) + movaps %xmm11, 12 * SIZE(BO) + movaps %xmm12, 16 * SIZE(BO) + movaps %xmm13, 20 * SIZE(BO) + movaps %xmm14, 24 * SIZE(BO) + movaps %xmm15, 28 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO + + subq $1, %rax + jne .L42 + ALIGN_4 + +.L43: + movq K, %rax + andq $3, %rax + BRANCH + jle .L50 + ALIGN_4 + +.L44: + movss -32 * SIZE(B), %xmm8 + movss -31 * SIZE(B), %xmm9 + + shufps $0, %xmm8, %xmm8 + shufps $0, %xmm9, %xmm9 + + movaps %xmm8, 0 * SIZE(BO) + movaps %xmm9, 4 * SIZE(BO) + + addq $2 * SIZE, B + addq $8 * SIZE, BO + subq $1, %rax + jne .L44 + ALIGN_4 + +.L50: + movq C, CO1 # coffset1 = c + movq A, AO # aoffset = a + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + + pxor %xmm8, %xmm8 + prefetcht0 3 * SIZE(CO1) + pxor %xmm9, %xmm9 + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + je .L55 + ALIGN_4 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm1 + + movaps -32 * SIZE(BO), %xmm2 + movaps %xmm2, %xmm3 + movaps -28 * SIZE(BO), %xmm4 + movaps %xmm4, %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm12 + ADDSUB %xmm4, %xmm9 + ADDSUB %xmm5, %xmm13 + + movaps -24 * SIZE(AO), %xmm0 + movaps -20 * SIZE(AO), %xmm1 + + movaps -24 * SIZE(BO), %xmm2 + movaps %xmm2, %xmm3 + movaps -20 * SIZE(BO), %xmm4 + movaps %xmm4, %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm1, 
%xmm3 + mulps %xmm0, %xmm4 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm12 + ADDSUB %xmm4, %xmm9 + ADDSUB %xmm5, %xmm13 + + movaps -16 * SIZE(AO), %xmm0 + movaps -12 * SIZE(AO), %xmm1 + + movaps -16 * SIZE(BO), %xmm2 + movaps %xmm2, %xmm3 + movaps -12 * SIZE(BO), %xmm4 + movaps %xmm4, %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm12 + ADDSUB %xmm4, %xmm9 + ADDSUB %xmm5, %xmm13 + + movaps -8 * SIZE(AO), %xmm0 + movaps -4 * SIZE(AO), %xmm1 + + movaps -8 * SIZE(BO), %xmm2 + movaps %xmm2, %xmm3 + movaps -4 * SIZE(BO), %xmm4 + movaps %xmm4, %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm12 + ADDSUB %xmm4, %xmm9 + ADDSUB %xmm5, %xmm13 + + subq $-32 * SIZE, AO + subq $-32 * SIZE, BO + subq $1, %rax + jne .L52 + ALIGN_4 + +.L55: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm1 + + movaps -32 * SIZE(BO), %xmm2 + movaps %xmm2, %xmm3 + movaps -28 * SIZE(BO), %xmm4 + movaps %xmm4, %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + mulps %xmm0, %xmm4 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + addps %xmm3, %xmm12 + ADDSUB %xmm4, %xmm9 + ADDSUB %xmm5, %xmm13 + + addq $8 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + jg .L56 + ALIGN_4 + +.L58: + movaps ALPHA_R, %xmm6 + movaps ALPHA_I, %xmm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm13, %xmm13 + + addsubps %xmm9, %xmm8 + addsubps %xmm13, %xmm12 + + movaps %xmm8, %xmm9 + movaps %xmm12, %xmm13 + + shufps $0xb1, %xmm8, %xmm8 + shufps $0xb1, %xmm12, %xmm12 +#else + shufps $0xb1, %xmm8, %xmm8 + shufps $0xb1, %xmm12, %xmm12 + + addsubps %xmm8, %xmm9 + addsubps %xmm12, %xmm13 + + movaps %xmm9, %xmm8 + movaps %xmm13, %xmm12 + + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm13, %xmm13 +#endif + + mulps %xmm6, %xmm9 + mulps %xmm7, %xmm8 + mulps %xmm6, %xmm13 + mulps %xmm7, %xmm12 + + addps %xmm9, %xmm8 + addps %xmm13, %xmm12 + +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm1 + movhps 6 * SIZE(CO1), %xmm1 + + addps %xmm0, %xmm8 + addps %xmm1, %xmm12 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movsd %xmm12, 4 * SIZE(CO1) + movhps %xmm12, 6 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L51 + ALIGN_4 + +.L60: + testq $2, M + je .L70 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + je .L65 + ALIGN_4 + +.L62: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movaps -32 * SIZE(AO), %xmm0 + movaps -28 * SIZE(AO), %xmm1 + movaps -32 * SIZE(BO), %xmm2 + movaps -28 * SIZE(BO), %xmm3 + movaps -24 * SIZE(BO), %xmm4 + movaps -20 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm4 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + ADDSUB %xmm3, %xmm9 + addps %xmm4, %xmm10 + ADDSUB %xmm5, %xmm11 + + movaps -24 * SIZE(AO), %xmm0 + movaps -20 * SIZE(AO), %xmm1 + movaps -16 * SIZE(BO), %xmm2 + movaps -12 * SIZE(BO), %xmm3 + movaps -8 * SIZE(BO), %xmm4 + movaps -4 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm4 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + ADDSUB %xmm3, %xmm9 + addps %xmm4, %xmm10 + ADDSUB %xmm5, %xmm11 + + subq $-16 * SIZE, AO + subq $-32 * SIZE, BO + subq $1, %rax + jne .L62 + ALIGN_4 + +.L65: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + movaps -32 * SIZE(AO), %xmm0 + movaps -32 * SIZE(BO), %xmm2 + movaps -28 * SIZE(BO), %xmm3 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm8 + ADDSUB %xmm3, %xmm9 + + addq $4 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + jg .L66 + ALIGN_4 + +.L68: + movaps ALPHA_R, %xmm6 + movaps ALPHA_I, %xmm7 + + addps %xmm10, %xmm8 + addps %xmm11, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + shufps $0xb1, %xmm9, %xmm9 + addsubps %xmm9, %xmm8 + movaps %xmm8, %xmm9 + shufps $0xb1, %xmm8, %xmm8 +#else + shufps $0xb1, %xmm8, %xmm8 + addsubps %xmm8, %xmm9 + movaps %xmm9, %xmm8 + shufps $0xb1, %xmm9, %xmm9 +#endif + + mulps %xmm6, %xmm9 + mulps %xmm7, %xmm8 + addps %xmm9, %xmm8 + +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + + addps %xmm0, %xmm8 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + +.L70: + testq $1, M + je .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + je .L75 + ALIGN_4 + +.L72: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movsd -32 * SIZE(AO), %xmm0 + movsd -30 * SIZE(AO), %xmm1 + movsd -32 * SIZE(BO), %xmm2 + movsd -28 * SIZE(BO), %xmm3 + movsd -24 * SIZE(BO), %xmm4 + movsd -20 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm4 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + ADDSUB %xmm3, %xmm9 + addps %xmm4, %xmm10 + ADDSUB %xmm5, %xmm11 + + movsd -28 * SIZE(AO), %xmm0 + movsd -26 * SIZE(AO), %xmm1 + movsd -16 * SIZE(BO), %xmm2 + movsd -12 * SIZE(BO), %xmm3 + movsd -8 * SIZE(BO), %xmm4 + movsd -4 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm4 + mulps %xmm1, %xmm5 + + addps %xmm2, %xmm8 + ADDSUB %xmm3, %xmm9 + addps %xmm4, %xmm10 + ADDSUB %xmm5, %xmm11 + + subq $ -8 * SIZE, AO + subq $-32 * SIZE, BO + subq $1, %rax + jne .L72 + ALIGN_4 + +.L75: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax + BRANCH + je .L78 + ALIGN_4 + +.L76: + movsd -32 * SIZE(AO), %xmm0 + movsd -32 * SIZE(BO), %xmm2 + movsd -28 * SIZE(BO), %xmm3 + + mulps %xmm0, %xmm2 + mulps %xmm0, %xmm3 + + addps %xmm2, %xmm8 + ADDSUB %xmm3, %xmm9 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + jg .L76 + ALIGN_4 + +.L78: + movaps ALPHA_R, %xmm6 + movaps ALPHA_I, %xmm7 + + addps %xmm10, %xmm8 + addps %xmm11, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + shufps $0xb1, %xmm9, %xmm9 + addsubps %xmm9, %xmm8 + movaps %xmm8, %xmm9 + shufps $0xb1, %xmm8, %xmm8 +#else + shufps $0xb1, %xmm8, %xmm8 + addsubps %xmm8, %xmm9 + movaps %xmm9, %xmm8 + shufps $0xb1, %xmm9, %xmm9 +#endif + + mulps %xmm6, %xmm9 + mulps %xmm7, %xmm8 + addps %xmm9, %xmm8 + +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm0 + + addps %xmm0, %xmm8 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + ALIGN_4 + +.L999: + movq %r15, %rsp + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm_kernel_4x2_penryn.S b/kernel/x86_64/zgemm_kernel_4x2_penryn.S new file mode 100644 index 0000000000..241148db8b --- /dev/null +++ b/kernel/x86_64/zgemm_kernel_4x2_penryn.S @@ -0,0 +1,1794 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define BB %r12 + +#define PREA %rdx + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define ALPHA_R 48(%rsp) +#define ALPHA_I 56(%rsp) +#define J 64(%rsp) +#define OFFSET 72(%rsp) +#define KK 80(%rsp) +#define KKK 88(%rsp) + +#else + +#define STACKSIZE 512 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#define ALPHA_R 224(%rsp) +#define ALPHA_I 232(%rsp) +#define J 240(%rsp) +#define OFFSET 248(%rsp) +#define KK 256(%rsp) +#define KKK 264(%rsp) + +#endif + +#define PREFETCHSIZE (8 * 17 + 4) +#define PREFETCH prefetcht0 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define ADD1 addps +#define ADD2 addps +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define ADD1 addps +#define ADD2 addps +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define ADD1 addps +#define ADD2 addps +#else +#define ADD1 addps +#define ADD2 subps +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movq OLD_OFFSET, %r11 +#endif + movaps %xmm3, %xmm0 + movss OLD_ALPHA_I, %xmm1 +#else + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movq OLD_OFFSET, %r11 +#endif + +#endif + + unpcklps %xmm0, %xmm0 + unpcklps %xmm1, %xmm1 + + movlps %xmm0, ALPHA_R + movlps %xmm1, ALPHA_I + + subq $-32 * SIZE, A + subq $-32 * SIZE, B + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + + salq $ZBASE_SHIFT, LDC + +#ifdef TRMMKERNEL + movq %r11, OFFSET +#ifndef LEFT + negq %r11 +#endif + movq %r11, KK +#endif + + movq N, J + sarq $1, J + NOBRANCH + jle .L40 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 + movq A, AO + + movq K, %rax + salq $ZBASE_SHIFT + 1, %rax + leaq (B, %rax), BB + + movq M, I + sarq $2, I + NOBRANCH + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + pxor %xmm3, %xmm3 + movaps -28 * SIZE(AO), %xmm1 + pxor %xmm4, %xmm4 + movaps -32 * SIZE(BO), %xmm2 + + pxor %xmm5, %xmm5 + prefetcht0 -32 * SIZE(BB) + pxor %xmm6, %xmm6 + + 
prefetcht2 7 * SIZE(CO1) + movapd %xmm4, %xmm8 + movapd %xmm4, %xmm9 + movapd %xmm4, %xmm10 + movapd %xmm4, %xmm11 + + prefetcht2 7 * SIZE(CO2) + movapd %xmm4, %xmm12 + movapd %xmm4, %xmm13 + movapd %xmm4, %xmm14 + movapd %xmm4, %xmm15 + + subq $-24 * SIZE, BB + + leaq (PREFETCHSIZE + 0) * SIZE(AO), PREA + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + PREFETCH -32 * SIZE(PREA) + ADD1 %xmm6, %xmm10 + ADD1 %xmm3, %xmm14 + movaps %xmm2, %xmm3 + pshufd $0xb1, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + ADD2 %xmm4, %xmm11 + ADD2 %xmm5, %xmm15 + movaps %xmm7, %xmm5 + pshufd $0x1b, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + movaps -28 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm12 + movaps %xmm6, %xmm3 + pshufd $0xb1, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -20 * SIZE(AO), %xmm1 + + ADD1 %xmm6, %xmm10 + ADD1 %xmm3, %xmm14 + movaps %xmm2, %xmm3 + pshufd $0xb1, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + ADD2 %xmm4, %xmm11 + ADD2 %xmm5, %xmm15 + movaps %xmm7, %xmm5 + pshufd $0x1b, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + movaps -24 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm12 + movaps %xmm6, %xmm3 + pshufd $0xb1, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -16 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -12 * SIZE(AO), %xmm1 + + ADD1 %xmm6, %xmm10 + ADD1 %xmm3, %xmm14 + PREFETCH -16 * SIZE(PREA) + movaps %xmm2, %xmm3 + pshufd $0xb1, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + ADD2 %xmm4, %xmm11 + ADD2 %xmm5, %xmm15 + movaps %xmm7, %xmm5 + pshufd $0x1b, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + movaps -20 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm12 + movaps %xmm6, %xmm3 + pshufd $0xb1, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -8 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -4 * SIZE(AO), %xmm1 + + ADD1 %xmm6, %xmm10 + ADD1 %xmm3, %xmm14 + movaps %xmm2, %xmm3 + pshufd $0xb1, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + ADD2 %xmm4, %xmm11 + ADD2 %xmm5, %xmm15 + movaps %xmm7, %xmm5 + pshufd $0x1b, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + movaps -16 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm12 + movaps %xmm6, %xmm3 + pshufd $0xb1, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps 0 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps 4 * SIZE(AO), %xmm1 + + ADD1 %xmm6, %xmm10 + ADD1 %xmm3, %xmm14 + PREFETCH 0 * SIZE(PREA) + movaps %xmm2, %xmm3 + pshufd $0xb1, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + ADD2 %xmm4, %xmm11 + ADD2 %xmm5, %xmm15 + movaps %xmm7, %xmm5 + pshufd $0x1b, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + movaps -12 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm12 + movaps %xmm6, %xmm3 + pshufd $0xb1, %xmm6, %xmm4 + 
mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps 8 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps 12 * SIZE(AO), %xmm1 + + ADD1 %xmm6, %xmm10 + ADD1 %xmm3, %xmm14 + movaps %xmm2, %xmm3 + pshufd $0xb1, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + ADD2 %xmm4, %xmm11 + ADD2 %xmm5, %xmm15 + movaps %xmm7, %xmm5 + pshufd $0x1b, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + movaps -8 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm12 + movaps %xmm6, %xmm3 + pshufd $0xb1, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps 16 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps 20 * SIZE(AO), %xmm1 + + ADD1 %xmm6, %xmm10 + ADD1 %xmm3, %xmm14 + PREFETCH 16 * SIZE(PREA) + movaps %xmm2, %xmm3 + pshufd $0xb1, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + ADD2 %xmm4, %xmm11 + ADD2 %xmm5, %xmm15 + movaps %xmm7, %xmm5 + pshufd $0x1b, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + movaps -4 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm12 + movaps %xmm6, %xmm3 + pshufd $0xb1, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps 24 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps 28 * SIZE(AO), %xmm1 + + ADD1 %xmm6, %xmm10 + ADD1 %xmm3, %xmm14 + movaps %xmm2, %xmm3 + pshufd $0xb1, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + ADD2 %xmm4, %xmm11 + ADD2 %xmm5, %xmm15 + movaps %xmm7, %xmm5 + pshufd $0x1b, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + subq $-64 * SIZE, AO + movaps 0 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm12 + movaps %xmm6, %xmm3 + subq $-32 * SIZE, BO + pshufd $0xb1, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -32 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -28 * SIZE(AO), %xmm1 + + subq $-64 * SIZE, PREA + + subq $1, %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: + prefetcht0 -16 * SIZE(BB) + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + ADD1 %xmm6, %xmm10 + ADD1 %xmm3, %xmm14 + movaps %xmm2, %xmm3 + pshufd $0xb1, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + mulps %xmm1, %xmm3 + + ADD2 %xmm4, %xmm11 + ADD2 %xmm5, %xmm15 + movaps %xmm7, %xmm5 + pshufd $0x1b, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + mulps %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + movaps -28 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm12 + movaps %xmm6, %xmm3 + pshufd $0xb1, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + mulps %xmm1, %xmm3 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm5, %xmm13 + movaps %xmm4, %xmm5 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + mulps %xmm1, %xmm5 + movaps -20 * SIZE(AO), %xmm1 + + addq $8 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_3 + +.L18: + ADD1 %xmm6, %xmm10 + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm11 + ADD2 %xmm5, %xmm15 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + movddup ALPHA_R, %xmm2 + movddup ALPHA_I, %xmm3 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + pxor %xmm0, %xmm8 + pxor %xmm0, %xmm10 + pxor %xmm0, %xmm12 + pxor %xmm0, %xmm14 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + pshufd $0xb1, %xmm0, %xmm0 + + pxor %xmm0, 
%xmm9 + pxor %xmm0, %xmm11 + pxor %xmm0, %xmm13 + pxor %xmm0, %xmm15 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + pxor %xmm0, %xmm9 + pxor %xmm0, %xmm11 + pxor %xmm0, %xmm13 + pxor %xmm0, %xmm15 +#endif + + haddps %xmm9, %xmm8 + haddps %xmm11, %xmm10 + haddps %xmm13, %xmm12 + haddps %xmm15, %xmm14 + + shufps $0xd8, %xmm8, %xmm8 + shufps $0xd8, %xmm10, %xmm10 + shufps $0xd8, %xmm12, %xmm12 + shufps $0xd8, %xmm14, %xmm14 + + movaps %xmm8, %xmm9 + shufps $0xe4, %xmm10, %xmm8 + shufps $0xe4, %xmm9, %xmm10 + + movaps %xmm12, %xmm13 + shufps $0xe4, %xmm14, %xmm12 + shufps $0xe4, %xmm13, %xmm14 + + pshufd $0xb1, %xmm8, %xmm9 + pshufd $0xb1, %xmm10, %xmm11 + pshufd $0xb1, %xmm12, %xmm13 + pshufd $0xb1, %xmm14, %xmm15 + + mulps %xmm2, %xmm8 + mulps %xmm3, %xmm9 + mulps %xmm2, %xmm12 + mulps %xmm3, %xmm13 + + mulps %xmm2, %xmm10 + mulps %xmm3, %xmm11 + mulps %xmm2, %xmm14 + mulps %xmm3, %xmm15 + + addsubps %xmm9, %xmm8 + addsubps %xmm11, %xmm10 + addsubps %xmm13, %xmm12 + addsubps %xmm15, %xmm14 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm1 + movhps 6 * SIZE(CO1), %xmm1 + movsd 0 * SIZE(CO2), %xmm2 + movhps 2 * SIZE(CO2), %xmm2 + movsd 4 * SIZE(CO2), %xmm3 + movhps 6 * SIZE(CO2), %xmm3 + + addps %xmm0, %xmm8 + addps %xmm1, %xmm12 + addps %xmm2, %xmm10 + addps %xmm3, %xmm14 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movsd %xmm12, 4 * SIZE(CO1) + movhps %xmm12, 6 * SIZE(CO1) + movsd %xmm10, 0 * SIZE(CO2) + movhps %xmm10, 2 * SIZE(CO2) + movsd %xmm14, 4 * SIZE(CO2) + movhps %xmm14, 6 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 4 + addq $8 * SIZE, CO2 # coffset += 4 + decq I # i -- + BRANCH + jg .L11 + ALIGN_4 + +.L20: + testq $2, M + BRANCH + jle .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + pxor %xmm4, %xmm4 + pxor %xmm6, %xmm6 + movaps -32 * SIZE(BO), %xmm2 + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_3 + +.L22: + ADD1 %xmm6, %xmm10 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + pshufd $0xb1, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0x1b, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + + ADD1 %xmm2, %xmm8 + movaps -28 * SIZE(BO), %xmm2 + pshufd $0xb1, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + ADD2 %xmm7, %xmm9 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + ADD1 %xmm6, %xmm10 + pshufd $0xb1, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0x1b, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + + ADD1 %xmm2, %xmm8 + movaps -24 * SIZE(BO), %xmm2 + pshufd $0xb1, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + ADD2 %xmm7, %xmm9 + mulps 
%xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + + ADD1 %xmm6, %xmm10 + pshufd $0xb1, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0x1b, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + + ADD1 %xmm2, %xmm8 + movaps -20 * SIZE(BO), %xmm2 + pshufd $0xb1, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + ADD2 %xmm7, %xmm9 + mulps %xmm0, %xmm4 + movaps -20 * SIZE(AO), %xmm0 + + ADD1 %xmm6, %xmm10 + pshufd $0xb1, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0x1b, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + + ADD1 %xmm2, %xmm8 + movaps -16 * SIZE(BO), %xmm2 + pshufd $0xb1, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + subq $-16 * SIZE, AO + ADD2 %xmm7, %xmm9 + mulps %xmm0, %xmm4 + movaps -32 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L22 + ALIGN_3 + +.L25: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_3 + +.L26: + ADD1 %xmm6, %xmm10 + pshufd $0xb1, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0x1b, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + + ADD1 %xmm2, %xmm8 + movaps -28 * SIZE(BO), %xmm2 + pshufd $0xb1, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + ADD2 %xmm7, %xmm9 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_3 + +.L28: + ADD1 %xmm6, %xmm10 + ADD2 %xmm4, %xmm11 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + movddup ALPHA_R, %xmm2 + movddup ALPHA_I, %xmm3 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + pxor %xmm0, %xmm8 + pxor %xmm0, %xmm10 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + pshufd $0xb1, %xmm0, %xmm0 + + pxor %xmm0, %xmm9 + pxor %xmm0, %xmm11 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + pxor %xmm0, %xmm9 + pxor %xmm0, %xmm11 +#endif + + haddps %xmm9, %xmm8 + haddps %xmm11, %xmm10 + + shufps $0xd8, %xmm8, %xmm8 + shufps $0xd8, %xmm10, %xmm10 + + movaps %xmm8, %xmm9 + shufps $0xe4, %xmm10, %xmm8 + shufps $0xe4, %xmm9, %xmm10 + + pshufd $0xb1, %xmm8, %xmm9 + pshufd $0xb1, %xmm10, %xmm11 + + mulps %xmm2, %xmm8 + mulps %xmm3, %xmm9 + + mulps %xmm2, %xmm10 + mulps %xmm3, %xmm11 + + addsubps %xmm9, %xmm8 + addsubps %xmm11, %xmm10 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm2 + movhps 2 * SIZE(CO2), %xmm2 + + addps %xmm0, %xmm8 + addps %xmm2, %xmm10 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movsd %xmm10, 0 * SIZE(CO2) + movhps %xmm10, 2 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 + ALIGN_4 + +.L30: + testq $1, M + BRANCH + jle .L39 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + + movsd -32 * SIZE(AO), %xmm0 + pxor %xmm4, %xmm4 + pxor %xmm6, %xmm6 + movaps -32 * SIZE(BO), %xmm2 + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif 
(defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L35 + ALIGN_3 + +.L32: + ADD1 %xmm6, %xmm10 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + pshufd $0xb1, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0x1b, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + + ADD1 %xmm2, %xmm8 + movaps -28 * SIZE(BO), %xmm2 + pshufd $0xb1, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + ADD2 %xmm7, %xmm9 + mulps %xmm0, %xmm4 + movsd -30 * SIZE(AO), %xmm0 + + ADD1 %xmm6, %xmm10 + pshufd $0xb1, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0x1b, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + + ADD1 %xmm2, %xmm8 + movaps -24 * SIZE(BO), %xmm2 + pshufd $0xb1, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + ADD2 %xmm7, %xmm9 + mulps %xmm0, %xmm4 + movsd -28 * SIZE(AO), %xmm0 + + ADD1 %xmm6, %xmm10 + pshufd $0xb1, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0x1b, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + + ADD1 %xmm2, %xmm8 + movaps -20 * SIZE(BO), %xmm2 + pshufd $0xb1, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + ADD2 %xmm7, %xmm9 + mulps %xmm0, %xmm4 + movsd -26 * SIZE(AO), %xmm0 + + ADD1 %xmm6, %xmm10 + pshufd $0xb1, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0x1b, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + + ADD1 %xmm2, %xmm8 + movaps -16 * SIZE(BO), %xmm2 + pshufd $0xb1, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + subq $-8 * SIZE, AO + ADD2 %xmm7, %xmm9 + mulps %xmm0, %xmm4 + movsd -32 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L32 + ALIGN_3 + +.L35: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_3 + +.L36: + ADD1 %xmm6, %xmm10 + pshufd $0xb1, %xmm2, %xmm7 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0x1b, %xmm7, %xmm6 + mulps %xmm0, %xmm7 + + ADD1 %xmm2, %xmm8 + movaps -28 * SIZE(BO), %xmm2 + pshufd $0xb1, %xmm6, %xmm4 + mulps %xmm0, %xmm6 + ADD2 %xmm7, %xmm9 + mulps %xmm0, %xmm4 + movsd -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L36 + ALIGN_3 + +.L38: + ADD1 %xmm6, %xmm10 + ADD2 %xmm4, %xmm11 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + movddup ALPHA_R, %xmm2 + movddup ALPHA_I, %xmm3 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + pxor %xmm0, %xmm8 + pxor %xmm0, %xmm10 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + pshufd $0xb1, %xmm0, %xmm0 + + pxor %xmm0, %xmm9 + pxor %xmm0, %xmm11 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + pxor %xmm0, %xmm9 + pxor %xmm0, %xmm11 +#endif + + haddps %xmm9, %xmm8 + haddps %xmm11, %xmm10 + + shufps $0xd8, %xmm8, %xmm8 + shufps $0xd8, %xmm10, %xmm10 + + movaps %xmm8, %xmm9 + shufps $0xe4, %xmm10, %xmm8 + shufps $0xe4, %xmm9, %xmm10 + + pshufd $0xb1, %xmm8, %xmm9 + pshufd $0xb1, %xmm10, %xmm11 + + mulps %xmm2, %xmm8 + mulps %xmm3, %xmm9 + + mulps %xmm2, %xmm10 + mulps %xmm3, %xmm11 + + addsubps %xmm9, %xmm8 + addsubps %xmm11, %xmm10 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movsd 0 * SIZE(CO2), %xmm2 + + addps %xmm0, %xmm8 + addps %xmm2, %xmm10 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movsd %xmm10, 0 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq 
KKK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + leaq (C, LDC, 2), C + movq BO, B + + subq $1, J + BRANCH + jg .L01 + ALIGN_4 + +.L40: + testq $1, N + BRANCH + jle .L999 + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 + movq A, AO + + movq M, I + sarq $2, I + NOBRANCH + jle .L50 + ALIGN_4 + +.L41: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + pxor %xmm3, %xmm3 + movaps -28 * SIZE(AO), %xmm1 + pxor %xmm4, %xmm4 + movaps -32 * SIZE(BO), %xmm2 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + + prefetcht0 7 * SIZE(CO1) + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L45 + ALIGN_3 + +.L42: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm3, %xmm8 + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm12 + pshufd $0x00, %xmm2, %xmm4 + mulps %xmm1, %xmm4 + + addps %xmm5, %xmm9 + pshufd $0x55, %xmm2, %xmm5 + mulps %xmm0, %xmm5 + movaps -24 * SIZE(AO), %xmm0 + addps %xmm6, %xmm13 + pshufd $0x55, %xmm2, %xmm6 + mulps %xmm1, %xmm6 + movaps -20 * SIZE(AO), %xmm1 + + addps %xmm3, %xmm8 + pshufd $0xaa, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm12 + pshufd $0xaa, %xmm2, %xmm4 + mulps %xmm1, %xmm4 + + addps %xmm5, %xmm9 + pshufd $0xff, %xmm2, %xmm5 + mulps %xmm0, %xmm5 + movaps -16 * SIZE(AO), %xmm0 + addps %xmm6, %xmm13 + pshufd $0xff, %xmm2, %xmm6 + movaps -28 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm6 + movaps -12 * SIZE(AO), %xmm1 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + + addps %xmm3, %xmm8 + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm12 + pshufd $0x00, %xmm2, %xmm4 + mulps %xmm1, %xmm4 + + addps %xmm5, %xmm9 + pshufd $0x55, %xmm2, %xmm5 + mulps %xmm0, %xmm5 + movaps -8 * SIZE(AO), %xmm0 + addps %xmm6, %xmm13 + pshufd $0x55, %xmm2, %xmm6 + mulps %xmm1, %xmm6 + movaps -4 * SIZE(AO), %xmm1 + + addps %xmm3, %xmm8 + pshufd $0xaa, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm12 + pshufd $0xaa, %xmm2, %xmm4 + mulps %xmm1, %xmm4 + + addps %xmm5, %xmm9 + pshufd $0xff, %xmm2, %xmm5 + mulps %xmm0, %xmm5 + movaps 0 * SIZE(AO), %xmm0 + addps %xmm6, %xmm13 + pshufd $0xff, %xmm2, %xmm6 + movaps -24 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm6 + movaps 4 * SIZE(AO), %xmm1 + + subq $-32 * SIZE, AO + subq $ -8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L42 + ALIGN_3 + +.L45: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + addps %xmm3, %xmm8 + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm12 + pshufd $0x00, %xmm2, %xmm4 + mulps %xmm1, %xmm4 + + addps %xmm5, %xmm9 + pshufd $0x55, %xmm2, %xmm5 + mulps %xmm0, %xmm5 + movaps -24 * SIZE(AO), %xmm0 + addps %xmm6, %xmm13 + pshufd $0x55, %xmm2, %xmm6 
+ movsd -30 * SIZE(BO), %xmm2 + mulps %xmm1, %xmm6 + movaps -20 * SIZE(AO), %xmm1 + + addq $8 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L46 + ALIGN_3 + +.L48: + addps %xmm3, %xmm8 + addps %xmm4, %xmm12 + addps %xmm5, %xmm9 + addps %xmm6, %xmm13 + + pshufd $0xb1, %xmm9, %xmm9 + movddup ALPHA_R, %xmm2 + pshufd $0xb1, %xmm13, %xmm13 + movddup ALPHA_I, %xmm3 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + pxor %xmm0, %xmm9 + pxor %xmm0, %xmm13 + + subps %xmm9, %xmm8 + subps %xmm13, %xmm12 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + pxor %xmm0, %xmm9 + pxor %xmm0, %xmm13 + + addps %xmm9, %xmm8 + addps %xmm13, %xmm12 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + pxor %xmm0, %xmm8 + pxor %xmm0, %xmm12 + + addps %xmm9, %xmm8 + addps %xmm13, %xmm12 +#else + pxor %xmm0, %xmm8 + pxor %xmm0, %xmm12 + + subps %xmm9, %xmm8 + subps %xmm13, %xmm12 +#endif + + pshufd $0xb1, %xmm8, %xmm9 + pshufd $0xb1, %xmm12, %xmm13 + + mulps %xmm2, %xmm8 + mulps %xmm3, %xmm9 + mulps %xmm2, %xmm12 + mulps %xmm3, %xmm13 + + addsubps %xmm9, %xmm8 + addsubps %xmm13, %xmm12 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm1 + movhps 6 * SIZE(CO1), %xmm1 + + addps %xmm0, %xmm8 + addps %xmm1, %xmm12 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movsd %xmm12, 4 * SIZE(CO1) + movhps %xmm12, 6 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 4 + decq I # i -- + BRANCH + jg .L41 + ALIGN_4 + +.L50: + testq $2, M + BRANCH + jle .L60 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + pxor %xmm3, %xmm3 + pxor %xmm4, %xmm4 + movaps -32 * SIZE(BO), %xmm2 + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L55 + ALIGN_3 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm3, %xmm8 + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm9 + pshufd $0x55, %xmm2, %xmm4 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + addps %xmm3, %xmm8 + pshufd $0xaa, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm9 + pshufd $0xff, %xmm2, %xmm4 + movaps -28 * SIZE(BO), %xmm2 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + + addps %xmm3, %xmm8 + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm9 + pshufd $0x55, %xmm2, %xmm4 + mulps %xmm0, %xmm4 + movaps -20 * SIZE(AO), %xmm0 + + addps %xmm3, %xmm8 + pshufd $0xaa, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm9 + pshufd $0xff, %xmm2, %xmm4 + movaps -24 * SIZE(BO), %xmm2 + mulps %xmm0, %xmm4 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, AO + subq $ -8 * 
SIZE, BO + + subq $1, %rax + BRANCH + jg .L52 + ALIGN_3 + +.L55: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_3 + +.L56: + addps %xmm3, %xmm8 + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm9 + pshufd $0x55, %xmm2, %xmm4 + movsd -30 * SIZE(BO), %xmm2 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L56 + ALIGN_3 + +.L58: + addps %xmm3, %xmm8 + movddup ALPHA_R, %xmm2 + addps %xmm4, %xmm9 + movddup ALPHA_I, %xmm3 + + pshufd $0xb1, %xmm9, %xmm9 + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + pxor %xmm0, %xmm9 + + subps %xmm9, %xmm8 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + pxor %xmm0, %xmm9 + + addps %xmm9, %xmm8 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + pxor %xmm0, %xmm8 + + addps %xmm9, %xmm8 +#else + pxor %xmm0, %xmm8 + + subps %xmm9, %xmm8 +#endif + + pshufd $0xb1, %xmm8, %xmm9 + + mulps %xmm2, %xmm8 + mulps %xmm3, %xmm9 + + addsubps %xmm9, %xmm8 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + + addps %xmm0, %xmm8 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 + ALIGN_4 + +.L60: + testq $1, M + BRANCH + jle .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + movq B, BO +#else + movq B, BO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + + movsd -32 * SIZE(AO), %xmm0 + pxor %xmm3, %xmm3 + pxor %xmm4, %xmm4 + movaps -32 * SIZE(BO), %xmm2 + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $2, %rax + NOBRANCH + jle .L65 + ALIGN_3 + +.L62: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + addps %xmm3, %xmm8 + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm9 + pshufd $0x55, %xmm2, %xmm4 + mulps %xmm0, %xmm4 + movsd -30 * SIZE(AO), %xmm0 + + addps %xmm3, %xmm8 + pshufd $0xaa, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm9 + pshufd $0xff, %xmm2, %xmm4 + movaps -28 * SIZE(BO), %xmm2 + mulps %xmm0, %xmm4 + movsd -28 * SIZE(AO), %xmm0 + + addps %xmm3, %xmm8 + pshufd $0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm9 + pshufd $0x55, %xmm2, %xmm4 + mulps %xmm0, %xmm4 + movsd -26 * SIZE(AO), %xmm0 + + addps %xmm3, %xmm8 + pshufd $0xaa, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm9 + pshufd $0xff, %xmm2, %xmm4 + movaps -24 * SIZE(BO), %xmm2 + mulps %xmm0, %xmm4 + movsd -24 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, AO + subq $-8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L62 + ALIGN_3 + +.L65: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_3 + +.L66: + addps %xmm3, %xmm8 + pshufd 
$0x00, %xmm2, %xmm3 + mulps %xmm0, %xmm3 + addps %xmm4, %xmm9 + pshufd $0x55, %xmm2, %xmm4 + movsd -30 * SIZE(BO), %xmm2 + mulps %xmm0, %xmm4 + movsd -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L66 + ALIGN_3 + +.L68: + addps %xmm3, %xmm8 + movddup ALPHA_R, %xmm2 + addps %xmm4, %xmm9 + movddup ALPHA_I, %xmm3 + + pshufd $0xb1, %xmm9, %xmm9 + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + pxor %xmm0, %xmm9 + + subps %xmm9, %xmm8 +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) + pxor %xmm0, %xmm9 + + addps %xmm9, %xmm8 +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) + pxor %xmm0, %xmm8 + + addps %xmm9, %xmm8 +#else + pxor %xmm0, %xmm8 + + subps %xmm9, %xmm8 +#endif + + pshufd $0xb1, %xmm8, %xmm9 + mulps %xmm2, %xmm8 + mulps %xmm3, %xmm9 + addsubps %xmm9, %xmm8 + +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + addps %xmm0, %xmm8 +#endif + movsd %xmm8, 0 * SIZE(CO1) + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm_kernel_4x2_sse.S b/kernel/x86_64/zgemm_kernel_4x2_sse.S new file mode 100644 index 0000000000..04dbf1ad1c --- /dev/null +++ b/kernel/x86_64/zgemm_kernel_4x2_sse.S @@ -0,0 +1,2293 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi + +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define CO2 %rbp +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define POSINV 0(%rsp) +#define ALPHA_R 16(%rsp) +#define ALPHA_I 32(%rsp) +#define J 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) +#define BUFFER 256(%rsp) + +#ifdef OPTERON +#define movsd movlps +#endif + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 5 + 8) +#endif + +#if defined(PENTIUM4) || defined(GENERIC) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE 160 +#endif + +#define RPREFETCHSIZE (8 * 7 + 4) +#define WPREFETCHSIZE (8 * 8 + 4) + +#ifndef GENERIC +#define KERNEL1(xx) \ + mulps %xmm0, %xmm1 ;\ + addps %xmm1, %xmm8 ;\ + movaps -32 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm0, %xmm3 ;\ + addps %xmm3, %xmm9 ;\ + movaps -28 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm0, %xmm5 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\ + mulps -20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\ + addps %xmm5, %xmm10 ;\ + movaps -24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addps %xmm0, %xmm11 ;\ + movaps -16 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0 + +#define KERNEL2(xx) \ + mulps %xmm2, %xmm1 ;\ + addps %xmm1, %xmm12 ;\ + movaps 0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm2, %xmm3 ;\ + addps %xmm3, %xmm13 ;\ + movaps -12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm2, %xmm5 ;\ + mulps -20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\ + addps %xmm5, %xmm14 ;\ + movaps -8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addps %xmm2, %xmm15 ;\ + movaps -12 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2 + +#define KERNEL3(xx) \ + mulps %xmm4, %xmm7 ;\ + addps %xmm7, %xmm8 ;\ + movaps -16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ + mulps %xmm4, %xmm3 ;\ + addps %xmm3, %xmm9 ;\ + movaps -12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm4, %xmm5 ;\ + mulps -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\ + addps %xmm5, %xmm10 ;\ + movaps -8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addps %xmm4, %xmm11 ;\ + movaps -8 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4 + +#define KERNEL4(xx) \ + mulps %xmm6, %xmm7 ;\ + addps %xmm7, %xmm12 ;\ + movaps 16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ + mulps %xmm6, %xmm3 ;\ + addps %xmm3, %xmm13 ;\ + movaps 4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm6, %xmm5 ;\ + mulps -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\ + addps %xmm5, %xmm14 ;\ + movaps 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), 
%xmm5 ;\ + PREFETCH (PREFETCHSIZE + 16) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\ + addps %xmm6, %xmm15 ;\ + movaps -4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6 + +#define KERNEL5(xx) \ + mulps %xmm0, %xmm1 ;\ + addps %xmm1, %xmm8 ;\ + movaps 0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm0, %xmm3 ;\ + addps %xmm3, %xmm9 ;\ + movaps 4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm0, %xmm5 ;\ + mulps 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\ + addps %xmm5, %xmm10 ;\ + movaps 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addps %xmm0, %xmm11 ;\ + movaps 0 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0 + +#define KERNEL6(xx) \ + mulps %xmm2, %xmm1 ;\ + addps %xmm1, %xmm12 ;\ + movaps 32 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ + mulps %xmm2, %xmm3 ;\ + addps %xmm3, %xmm13 ;\ + movaps 20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm2, %xmm5 ;\ + mulps 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\ + addps %xmm5, %xmm14 ;\ + movaps 24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addps %xmm2, %xmm15 ;\ + movaps 4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2 + +#define KERNEL7(xx) \ + mulps %xmm4, %xmm7 ;\ + addps %xmm7, %xmm8 ;\ + movaps 16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ + mulps %xmm4, %xmm3 ;\ + addps %xmm3, %xmm9 ;\ + movaps 20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm4, %xmm5 ;\ + mulps 28 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\ + addps %xmm5, %xmm10 ;\ + movaps 24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addps %xmm4, %xmm11 ;\ + movaps 8 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4 + +#define KERNEL8(xx) \ + mulps %xmm6, %xmm7 ;\ + addps %xmm7, %xmm12 ;\ + movaps 48 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ + mulps %xmm6, %xmm3 ;\ + addps %xmm3, %xmm13 ;\ + movaps 36 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ + mulps %xmm6, %xmm5 ;\ + mulps 28 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\ + addps %xmm5, %xmm14 ;\ + movaps 40 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ + addps %xmm6, %xmm15 ;\ + movaps 12 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6 + +#else + +#define KERNEL1(xx) \ + mulps %xmm0, %xmm1 ;\ + addps %xmm1, %xmm8 ;\ + movaps -32 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ + mulps %xmm0, %xmm3 ;\ + addps %xmm3, %xmm9 ;\ + movaps -28 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulps %xmm0, %xmm5 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\ + mulps -20 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\ + addps %xmm5, %xmm10 ;\ + movaps -24 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addps %xmm0, %xmm11 ;\ + movaps -16 * SIZE + 1 * (xx) * SIZE(AO), %xmm0 + +#define KERNEL2(xx) \ + mulps %xmm2, %xmm1 ;\ + addps %xmm1, %xmm12 ;\ + movaps 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ + mulps %xmm2, %xmm3 ;\ + addps %xmm3, %xmm13 ;\ + movaps -12 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulps %xmm2, %xmm5 ;\ + mulps -20 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\ + addps %xmm5, %xmm14 ;\ + movaps -8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addps %xmm2, %xmm15 ;\ + movaps -12 * SIZE + 1 * (xx) * SIZE(AO), %xmm2 ;\ + +#define KERNEL3(xx) \ + mulps %xmm4, %xmm7 ;\ + addps %xmm7, %xmm8 ;\ + movaps -16 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ + mulps %xmm4, %xmm3 ;\ + addps %xmm3, %xmm9 ;\ + movaps -12 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulps %xmm4, %xmm5 ;\ + mulps -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\ + addps %xmm5, %xmm10 ;\ + movaps -8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addps %xmm4, %xmm11 ;\ + movaps -8 * SIZE + 1 * (xx) * SIZE(AO), %xmm4 + 
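+/* Each odd-numbered KERNEL macro updates accumulators xmm8-xmm11 and
+   each even-numbered one xmm12-xmm15; an odd/even pair completes one
+   k iteration of the 4x2 complex block, reading B from the BUFFER
+   copy built at .L02/.L04 with the sign pattern (POSINV) pre-applied. */
+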
+#define KERNEL4(xx) \ + mulps %xmm6, %xmm7 ;\ + addps %xmm7, %xmm12 ;\ + movaps 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ + mulps %xmm6, %xmm3 ;\ + addps %xmm3, %xmm13 ;\ + movaps 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulps %xmm6, %xmm5 ;\ + mulps -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\ + addps %xmm5, %xmm14 ;\ + movaps 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addps %xmm6, %xmm15 ;\ + movaps -4 * SIZE + 1 * (xx) * SIZE(AO), %xmm6 + +#define KERNEL5(xx) \ + mulps %xmm0, %xmm1 ;\ + PREFETCH (PREFETCHSIZE + 16) * SIZE + 1 * (xx) * SIZE(AO) ;\ + addps %xmm1, %xmm8 ;\ + movaps 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ + mulps %xmm0, %xmm3 ;\ + addps %xmm3, %xmm9 ;\ + movaps 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulps %xmm0, %xmm5 ;\ + mulps 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\ + addps %xmm5, %xmm10 ;\ + movaps 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addps %xmm0, %xmm11 ;\ + movaps 0 * SIZE + 1 * (xx) * SIZE(AO), %xmm0 + +#define KERNEL6(xx) \ + mulps %xmm2, %xmm1 ;\ + addps %xmm1, %xmm12 ;\ + movaps 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ + mulps %xmm2, %xmm3 ;\ + addps %xmm3, %xmm13 ;\ + movaps 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulps %xmm2, %xmm5 ;\ + mulps 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\ + addps %xmm5, %xmm14 ;\ + movaps 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addps %xmm2, %xmm15 ;\ + movaps 4 * SIZE + 1 * (xx) * SIZE(AO), %xmm2 + +#define KERNEL7(xx) \ + mulps %xmm4, %xmm7 ;\ + addps %xmm7, %xmm8 ;\ + movaps 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ + mulps %xmm4, %xmm3 ;\ + addps %xmm3, %xmm9 ;\ + movaps 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulps %xmm4, %xmm5 ;\ + mulps 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\ + addps %xmm5, %xmm10 ;\ + movaps 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addps %xmm4, %xmm11 ;\ + movaps 8 * SIZE + 1 * (xx) * SIZE(AO), %xmm4 + +#define KERNEL8(xx) \ + mulps %xmm6, %xmm7 ;\ + addps %xmm7, %xmm12 ;\ + movaps 48 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ + mulps %xmm6, %xmm3 ;\ + addps %xmm3, %xmm13 ;\ + movaps 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ + mulps %xmm6, %xmm5 ;\ + mulps 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\ + addps %xmm5, %xmm14 ;\ + movaps 40 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ + addps %xmm6, %xmm15 ;\ + movaps 12 * SIZE + 1 * (xx) * SIZE(AO), %xmm6 +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + movaps %xmm3, %xmm0 + movsd OLD_ALPHA_I, %xmm1 +#else + movq 72(%rsp), LDC +#ifdef TRMMKERNEL + movsd 80(%rsp), %xmm12 +#endif + +#endif + + movq %rsp, %rbx # save old stack + subq $256 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movq OLD_M, M + movq OLD_N, N + + pxor %xmm7, %xmm7 + cmpeqps %xmm7, %xmm7 + pslld $31, %xmm7 # Generate mask + pxor %xmm10, %xmm10 + + shufps $0, %xmm0, %xmm0 + movaps %xmm0, 0 + ALPHA_R + + movss %xmm1, 4 + ALPHA_I + movss %xmm1, 12 + ALPHA_I + xorps %xmm7, %xmm1 + movss %xmm1, 0 + ALPHA_I + 
movss %xmm1, 8 + ALPHA_I + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + movss %xmm7, 0 + POSINV + movss %xmm10, 4 + POSINV + movss %xmm7, 8 + POSINV + movss %xmm10,12 + POSINV +#else + movss %xmm10, 0 + POSINV + movss %xmm7, 4 + POSINV + movss %xmm10, 8 + POSINV + movss %xmm7, 12 + POSINV +#endif + + addq $32 * SIZE, A + +#ifdef TRMMKERNEL + movsd %xmm12, OFFSET + movsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + + salq $ZBASE_SHIFT, LDC + movq N, J + sarq $1, J # j = (n >> 2) + jle .L40 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + movaps POSINV, %xmm7 + + movq K, %rax + sarq $2, %rax + jle .L03 + + addq %rax, %rax + ALIGN_4 + +.L02: + PREFETCH (RPREFETCHSIZE + 0) * SIZE(B) + + movss 0 * SIZE(B), %xmm8 + movss 1 * SIZE(B), %xmm9 + movss 2 * SIZE(B), %xmm10 + movss 3 * SIZE(B), %xmm11 + movss 4 * SIZE(B), %xmm12 + movss 5 * SIZE(B), %xmm13 + movss 6 * SIZE(B), %xmm14 + movss 7 * SIZE(B), %xmm15 + + PREFETCHW (WPREFETCHSIZE + 0) * SIZE(BO) + + shufps $0, %xmm8, %xmm8 + shufps $0, %xmm9, %xmm9 + shufps $0, %xmm10, %xmm10 + shufps $0, %xmm11, %xmm11 + shufps $0, %xmm12, %xmm12 + shufps $0, %xmm13, %xmm13 + shufps $0, %xmm14, %xmm14 + shufps $0, %xmm15, %xmm15 + + PREFETCHW (WPREFETCHSIZE + 16) * SIZE(BO) + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + xorps %xmm7, %xmm9 + xorps %xmm7, %xmm11 + xorps %xmm7, %xmm13 + xorps %xmm7, %xmm15 +#else + xorps %xmm7, %xmm8 + xorps %xmm7, %xmm10 + xorps %xmm7, %xmm12 + xorps %xmm7, %xmm14 +#endif + + movaps %xmm8, 0 * SIZE(BO) + movaps %xmm9, 4 * SIZE(BO) + movaps %xmm10, 8 * SIZE(BO) + movaps %xmm11, 12 * SIZE(BO) + movaps %xmm12, 16 * SIZE(BO) + movaps %xmm13, 20 * SIZE(BO) + movaps %xmm14, 24 * SIZE(BO) + movaps %xmm15, 28 * SIZE(BO) + + addq $32 * SIZE, BO + addq $ 8 * SIZE, B + decq %rax + jne .L02 + ALIGN_4 + +.L03: + movq K, %rax + andq $3, %rax + BRANCH + jle .L10 + ALIGN_4 + +.L04: + movss 0 * SIZE(B), %xmm8 + movss 1 * SIZE(B), %xmm9 + movss 2 * SIZE(B), %xmm10 + movss 3 * SIZE(B), %xmm11 + + shufps $0, %xmm8, %xmm8 + shufps $0, %xmm9, %xmm9 + shufps $0, %xmm10, %xmm10 + shufps $0, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + xorps %xmm7, %xmm9 + xorps %xmm7, %xmm11 +#else + xorps %xmm7, %xmm8 + xorps %xmm7, %xmm10 +#endif + + movaps %xmm8, 0 * SIZE(BO) + movaps %xmm9, 4 * SIZE(BO) + movaps %xmm10, 8 * SIZE(BO) + movaps %xmm11, 12 * SIZE(BO) + + addq $ 4 * SIZE, B + addq $16 * SIZE, BO + decq %rax + jne .L04 + ALIGN_4 + +.L10: + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + leaq (RPREFETCHSIZE + 0) * SIZE(B), BB + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + movaps -32 * SIZE(BO), %xmm1 + pxor %xmm8, %xmm8 + movaps -28 * SIZE(AO), %xmm2 + movaps -28 * SIZE(BO), %xmm3 + pxor %xmm9, %xmm9 + movaps -24 * SIZE(AO), %xmm4 + movaps -24 * 
SIZE(BO), %xmm5 + pxor %xmm10, %xmm10 + movaps -20 * SIZE(AO), %xmm6 + movaps -16 * SIZE(BO), %xmm7 + pxor %xmm11, %xmm11 + + PREFETCHW 7 * SIZE(CO1) + pxor %xmm12, %xmm12 + PREFETCHW 7 * SIZE(CO2) + pxor %xmm13, %xmm13 + PREFETCH -32 * SIZE(BB) + pxor %xmm14, %xmm14 + PREFETCH -16 * SIZE(BB) + pxor %xmm15, %xmm15 + subq $-16 * SIZE, BB + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif +#ifndef GENERIC + andq $-8, %rax + + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO + negq %rax + NOBRANCH + je .L15 + ALIGN_3 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + + addq $16 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + + addq $16 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + + addq $16 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + + addq $16 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + + addq $16 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + + addq $16 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + + addq $16 * SIZE, %rax + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + + addq $16 * SIZE, %rax + BRANCH + jl .L12 + ALIGN_3 + +.L15: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax 
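+	/* KKK, computed above for the TRMM case, is the effective
+	   inner-loop trip count for this tile */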
+#endif + testq $4, %rax + je .L16 + xorq %rax, %rax + ALIGN_3 + + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + addq $64 * SIZE, BO + addq $32 * SIZE, AO + ALIGN_3 +#else + sarq $2, %rax + NOBRANCH + jle .L16 + ALIGN_3 + +.L12: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + + addq $ 64 * SIZE, BO + subq $-32 * SIZE, AO + decq %rax + BRANCH + jg .L12 +#endif + + +.L16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA_R, %xmm6 + movaps ALPHA_I, %xmm7 + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO + negq %rax + ALIGN_4 + +.L17: + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO, %rax, 8), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm9 + movaps -24 * SIZE(BO, %rax, 8), %xmm1 + mulps %xmm0, %xmm1 + mulps -20 * SIZE(BO, %rax, 8), %xmm0 + addps %xmm1, %xmm10 + movaps -32 * SIZE(BO, %rax, 8), %xmm1 + addps %xmm0, %xmm11 + movaps -24 * SIZE(AO, %rax, 4), %xmm0 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm12 + movaps -28 * SIZE(BO, %rax, 8), %xmm1 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm13 + movaps -24 * SIZE(BO, %rax, 8), %xmm1 + mulps %xmm2, %xmm1 + mulps -20 * SIZE(BO, %rax, 8), %xmm2 + addps %xmm1, %xmm14 + movaps -16 * SIZE(BO, %rax, 8), %xmm1 + addps %xmm2, %xmm15 + movaps -20 * SIZE(AO, %rax, 4), %xmm2 + + addq $SIZE * 2, %rax + jl .L17 + ALIGN_4 + +.L18: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 4 * SIZE(CO1), %xmm2 + movhps 6 * SIZE(CO1), %xmm2 + + movsd 0 * SIZE(CO2), %xmm1 + movhps 2 * SIZE(CO2), %xmm1 + movsd 4 * SIZE(CO2), %xmm3 + movhps 6 * SIZE(CO2), %xmm3 +#endif + + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm11, %xmm11 + shufps $0xb1, %xmm13, %xmm13 + shufps $0xb1, %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subps %xmm9, %xmm8 + subps %xmm11, %xmm10 + subps %xmm13, %xmm12 + subps %xmm15, %xmm14 +#else + addps %xmm9, %xmm8 + addps %xmm11, %xmm10 + addps %xmm13, %xmm12 + addps %xmm15, %xmm14 +#endif + + movaps %xmm8, %xmm9 + movaps %xmm10, %xmm11 + movaps %xmm12, %xmm13 + movaps %xmm14, %xmm15 + + shufps $0xb1, %xmm8, %xmm8 + shufps $0xb1, %xmm10, %xmm10 + shufps $0xb1, %xmm12, %xmm12 + shufps $0xb1, %xmm14, %xmm14 + + mulps %xmm6, %xmm9 + mulps %xmm7, %xmm8 + mulps %xmm6, %xmm11 + mulps %xmm7, %xmm10 + + mulps %xmm6, %xmm13 + mulps %xmm7, %xmm12 + mulps %xmm6, %xmm15 + mulps %xmm7, %xmm14 + + addps %xmm9, %xmm8 + addps %xmm11, %xmm10 + addps %xmm13, %xmm12 + addps %xmm15, %xmm14 + +#ifndef TRMMKERNEL + addps %xmm0, %xmm8 + addps %xmm1, %xmm10 + addps %xmm2, %xmm12 + addps %xmm3, %xmm14 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movsd %xmm12, 4 * SIZE(CO1) + movhps %xmm12, 6 * SIZE(CO1) + + movsd %xmm10, 0 * SIZE(CO2) + movhps %xmm10, 2 * SIZE(CO2) + movsd %xmm14, 4 * SIZE(CO2) + movhps %xmm14, 6 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 4 + addq $8 * SIZE, CO2 # coffset += 4 + decq 
I # i -- + jg .L11 + ALIGN_4 + +.L20: + testq $2, M + je .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + movaps -16 * SIZE(AO), %xmm2 + movaps 0 * SIZE(AO), %xmm4 + movaps 16 * SIZE(AO), %xmm6 + + movaps -32 * SIZE(BO), %xmm1 + movaps -16 * SIZE(BO), %xmm3 + movaps 0 * SIZE(BO), %xmm5 + movaps 16 * SIZE(BO), %xmm7 + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L25 + ALIGN_4 + +.L22: + mulps %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm9 + movaps -24 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + mulps -20 * SIZE(BO), %xmm0 + addps %xmm1, %xmm10 + movaps 32 * SIZE(BO), %xmm1 + addps %xmm0, %xmm11 + movaps -28 * SIZE(AO), %xmm0 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm8 + movaps -12 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm9 + movaps -8 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm3 + mulps -4 * SIZE(BO), %xmm0 + addps %xmm3, %xmm10 + movaps 48 * SIZE(BO), %xmm3 + addps %xmm0, %xmm11 + movaps -24 * SIZE(AO), %xmm0 + + mulps %xmm0, %xmm5 + addps %xmm5, %xmm8 + movaps 4 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm5 + addps %xmm5, %xmm9 + movaps 8 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm5 + mulps 12 * SIZE(BO), %xmm0 + addps %xmm5, %xmm10 + movaps 64 * SIZE(BO), %xmm5 + addps %xmm0, %xmm11 + movaps -20 * SIZE(AO), %xmm0 + + mulps %xmm0, %xmm7 + addps %xmm7, %xmm8 + movaps 20 * SIZE(BO), %xmm7 + mulps %xmm0, %xmm7 + addps %xmm7, %xmm9 + movaps 24 * SIZE(BO), %xmm7 + mulps %xmm0, %xmm7 + mulps 28 * SIZE(BO), %xmm0 + addps %xmm7, %xmm10 + movaps 80 * SIZE(BO), %xmm7 + addps %xmm0, %xmm11 + movaps 0 * SIZE(AO), %xmm0 + + mulps %xmm2, %xmm1 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + addps %xmm1, %xmm8 + movaps 36 * SIZE(BO), %xmm1 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm9 + movaps 40 * SIZE(BO), %xmm1 + mulps %xmm2, %xmm1 + mulps 44 * SIZE(BO), %xmm2 + addps %xmm1, %xmm10 + movaps 96 * SIZE(BO), %xmm1 + addps %xmm2, %xmm11 + movaps -12 * SIZE(AO), %xmm2 + + mulps %xmm2, %xmm3 + addps %xmm3, %xmm8 + movaps 52 * SIZE(BO), %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm9 + movaps 56 * SIZE(BO), %xmm3 + mulps %xmm2, %xmm3 + mulps 60 * SIZE(BO), %xmm2 + addps %xmm3, %xmm10 + movaps 112 * SIZE(BO), %xmm3 + addps %xmm2, %xmm11 + movaps -8 * SIZE(AO), %xmm2 + + mulps %xmm2, %xmm5 + addps %xmm5, %xmm8 + movaps 68 * SIZE(BO), %xmm5 + mulps %xmm2, %xmm5 + addps %xmm5, %xmm9 + movaps 72 * SIZE(BO), %xmm5 + mulps %xmm2, %xmm5 + mulps 76 * SIZE(BO), %xmm2 + addps %xmm5, %xmm10 + movaps 128 * SIZE(BO), %xmm5 + addps %xmm2, %xmm11 + movaps -4 * SIZE(AO), %xmm2 + + mulps %xmm2, %xmm7 + addps %xmm7, %xmm8 + movaps 84 * SIZE(BO), %xmm7 + mulps %xmm2, %xmm7 + addps %xmm7, %xmm9 + movaps 88 * SIZE(BO), %xmm7 + mulps %xmm2, %xmm7 + mulps 92 * SIZE(BO), %xmm2 + addps %xmm7, %xmm10 + movaps 144 * SIZE(BO), %xmm7 + addps %xmm2, %xmm11 + movaps 16 * SIZE(AO), %xmm2 + + subq $ -32 * SIZE, 
AO + subq $-128 * SIZE, BO + + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA_R, %xmm6 + movaps ALPHA_I, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm9 + movaps -24 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + mulps -20 * SIZE(BO), %xmm0 + addps %xmm1, %xmm10 + movaps -16 * SIZE(BO), %xmm1 + addps %xmm0, %xmm11 + movaps -28 * SIZE(AO), %xmm0 + + subq $- 4 * SIZE, AO + subq $-16 * SIZE, BO + decq %rax + jg .L26 + ALIGN_4 + +.L28: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + + movsd 0 * SIZE(CO2), %xmm1 + movhps 2 * SIZE(CO2), %xmm1 +#endif + + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subps %xmm9, %xmm8 + subps %xmm11, %xmm10 +#else + addps %xmm9, %xmm8 + addps %xmm11, %xmm10 +#endif + + movaps %xmm8, %xmm9 + movaps %xmm10, %xmm11 + + shufps $0xb1, %xmm8, %xmm8 + shufps $0xb1, %xmm10, %xmm10 + + mulps %xmm6, %xmm9 + mulps %xmm7, %xmm8 + mulps %xmm6, %xmm11 + mulps %xmm7, %xmm10 + + addps %xmm9, %xmm8 + addps %xmm11, %xmm10 + +#ifndef TRMMKERNEL + addps %xmm0, %xmm8 + addps %xmm1, %xmm10 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + + movsd %xmm10, 0 * SIZE(CO2) + movhps %xmm10, 2 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L30: + testq $1, M + je .L39 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + movaps -24 * SIZE(AO), %xmm2 + + movaps -32 * SIZE(BO), %xmm1 + movaps -16 * SIZE(BO), %xmm3 + movaps 0 * SIZE(BO), %xmm5 + movaps 16 * SIZE(BO), %xmm7 + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L35 + ALIGN_4 + +.L32: + mulps %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm9 + movaps -24 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm10 + movaps -20 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm1, %xmm11 + movaps 32 * SIZE(BO), %xmm1 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm8 + movaps -12 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm9 + movaps -8 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm3 + addps %xmm3, %xmm10 + movaps -4 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm3 + movsd -28 * SIZE(AO), %xmm0 + addps %xmm3, %xmm11 + movaps 48 * 
SIZE(BO), %xmm3 + + mulps %xmm0, %xmm5 + addps %xmm5, %xmm8 + movaps 4 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm5 + addps %xmm5, %xmm9 + movaps 8 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm5 + addps %xmm5, %xmm10 + movaps 12 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm5 + movsd -26 * SIZE(AO), %xmm0 + addps %xmm5, %xmm11 + movaps 64 * SIZE(BO), %xmm5 + + mulps %xmm0, %xmm7 + addps %xmm7, %xmm8 + movaps 20 * SIZE(BO), %xmm7 + mulps %xmm0, %xmm7 + addps %xmm7, %xmm9 + movaps 24 * SIZE(BO), %xmm7 + mulps %xmm0, %xmm7 + addps %xmm7, %xmm10 + movaps 28 * SIZE(BO), %xmm7 + mulps %xmm0, %xmm7 + movsd -16 * SIZE(AO), %xmm0 + addps %xmm7, %xmm11 + movaps 80 * SIZE(BO), %xmm7 + + mulps %xmm2, %xmm1 + addps %xmm1, %xmm8 + movaps 36 * SIZE(BO), %xmm1 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm9 + movaps 40 * SIZE(BO), %xmm1 + mulps %xmm2, %xmm1 + addps %xmm1, %xmm10 + movaps 44 * SIZE(BO), %xmm1 + mulps %xmm2, %xmm1 + movsd -22 * SIZE(AO), %xmm2 + addps %xmm1, %xmm11 + movaps 96 * SIZE(BO), %xmm1 + + mulps %xmm2, %xmm3 + addps %xmm3, %xmm8 + movaps 52 * SIZE(BO), %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm9 + movaps 56 * SIZE(BO), %xmm3 + mulps %xmm2, %xmm3 + addps %xmm3, %xmm10 + movaps 60 * SIZE(BO), %xmm3 + mulps %xmm2, %xmm3 + movsd -20 * SIZE(AO), %xmm2 + addps %xmm3, %xmm11 + movaps 112 * SIZE(BO), %xmm3 + + mulps %xmm2, %xmm5 + addps %xmm5, %xmm8 + movaps 68 * SIZE(BO), %xmm5 + mulps %xmm2, %xmm5 + addps %xmm5, %xmm9 + movaps 72 * SIZE(BO), %xmm5 + mulps %xmm2, %xmm5 + addps %xmm5, %xmm10 + movaps 76 * SIZE(BO), %xmm5 + mulps %xmm2, %xmm5 + movsd -18 * SIZE(AO), %xmm2 + addps %xmm5, %xmm11 + movaps 128 * SIZE(BO), %xmm5 + + mulps %xmm2, %xmm7 + addps %xmm7, %xmm8 + movaps 84 * SIZE(BO), %xmm7 + mulps %xmm2, %xmm7 + addps %xmm7, %xmm9 + movaps 88 * SIZE(BO), %xmm7 + mulps %xmm2, %xmm7 + addps %xmm7, %xmm10 + movaps 92 * SIZE(BO), %xmm7 + mulps %xmm2, %xmm7 + movsd -8 * SIZE(AO), %xmm2 + addps %xmm7, %xmm11 + movaps 144 * SIZE(BO), %xmm7 + + subq $ -16 * SIZE, AO + subq $-128 * SIZE, BO + + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA_R, %xmm6 + movaps ALPHA_I, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm9 + movaps -24 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + addps %xmm1, %xmm10 + movaps -20 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm1, %xmm11 + movaps -16 * SIZE(BO), %xmm1 + + subq $ -2 * SIZE, AO + subq $-16 * SIZE, BO + decq %rax + jg .L36 + ALIGN_4 + +.L38: +#ifndef TRMMKERNEL +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(CO1), %xmm0 +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(CO2), %xmm1 +#endif + + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subps %xmm9, %xmm8 + subps %xmm11, %xmm10 +#else + addps %xmm9, %xmm8 + addps %xmm11, %xmm10 +#endif + + movaps %xmm8, %xmm9 + movaps %xmm10, %xmm11 + + shufps $0xb1, %xmm8, %xmm8 + shufps $0xb1, %xmm10, %xmm10 + + mulps %xmm6, %xmm9 + mulps %xmm7, %xmm8 + mulps %xmm6, %xmm11 + mulps %xmm7, %xmm10 + + addps %xmm9, %xmm8 + addps %xmm11, %xmm10 + +#ifndef TRMMKERNEL + addps %xmm0, %xmm8 + addps %xmm1, %xmm10 +#endif + + movlps %xmm8, 0 * SIZE(CO1) + movlps %xmm10, 0 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + 
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 8), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leaq (C, LDC, 2), C # c += 2 * ldc + decq J # j -- + jg .L01 + ALIGN_4 + +.L40: + testq $1, N + je .L999 + ALIGN_4 + +.L41: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + movaps POSINV, %xmm7 + + movq K, %rax + sarq $2, %rax + jle .L43 + ALIGN_4 + +.L42: + movss 0 * SIZE(B), %xmm8 + movss 1 * SIZE(B), %xmm9 + movss 2 * SIZE(B), %xmm10 + movss 3 * SIZE(B), %xmm11 + movss 4 * SIZE(B), %xmm12 + movss 5 * SIZE(B), %xmm13 + movss 6 * SIZE(B), %xmm14 + movss 7 * SIZE(B), %xmm15 + + shufps $0, %xmm8, %xmm8 + shufps $0, %xmm9, %xmm9 + shufps $0, %xmm10, %xmm10 + shufps $0, %xmm11, %xmm11 + shufps $0, %xmm12, %xmm12 + shufps $0, %xmm13, %xmm13 + shufps $0, %xmm14, %xmm14 + shufps $0, %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + xorps %xmm7, %xmm9 + xorps %xmm7, %xmm11 + xorps %xmm7, %xmm13 + xorps %xmm7, %xmm15 +#else + xorps %xmm7, %xmm8 + xorps %xmm7, %xmm10 + xorps %xmm7, %xmm12 + xorps %xmm7, %xmm14 +#endif + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) + prefetchnta 56 * SIZE(B) +#endif + + movaps %xmm8, 0 * SIZE(BO) + movaps %xmm9, 4 * SIZE(BO) + movaps %xmm10, 8 * SIZE(BO) + movaps %xmm11, 12 * SIZE(BO) + movaps %xmm12, 16 * SIZE(BO) + movaps %xmm13, 20 * SIZE(BO) + movaps %xmm14, 24 * SIZE(BO) + movaps %xmm15, 28 * SIZE(BO) + +#if defined(PENTIUM4) || defined(GENERIC) + PREFETCHW 128 * SIZE(BO) + PREFETCH 112 * SIZE(B) +#endif + + addq $32 * SIZE, BO + addq $ 8 * SIZE, B + decq %rax + jne .L42 + ALIGN_4 + +.L43: + movq K, %rax + andq $3, %rax + BRANCH + jle .L50 + ALIGN_4 + +.L44: + movss 0 * SIZE(B), %xmm8 + movss 1 * SIZE(B), %xmm9 + + shufps $0, %xmm8, %xmm8 + shufps $0, %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(NR) || defined(NC) || \ + defined(TN) || defined(TT) || defined(TR) || defined(TC) + xorps %xmm7, %xmm9 +#else + xorps %xmm7, %xmm8 +#endif + + movaps %xmm8, 0 * SIZE(BO) + movaps %xmm9, 4 * SIZE(BO) + + addq $2 * SIZE, B + addq $8 * SIZE, BO + decq %rax + jne .L44 + ALIGN_4 + +.L50: + movq C, CO1 # coffset1 = c + movq A, AO # aoffset = a + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movaps -16 * SIZE(AO), %xmm2 + pxor %xmm9, %xmm9 + movaps 0 * SIZE(AO), %xmm4 + pxor %xmm10, %xmm10 + movaps 16 * SIZE(AO), %xmm6 + pxor %xmm11, %xmm11 + + movaps -32 * SIZE(BO), %xmm1 + pxor %xmm12, %xmm12 + movaps -16 * SIZE(BO), %xmm3 + pxor %xmm13, %xmm13 + movaps 0 * SIZE(BO), %xmm5 + pxor %xmm14, %xmm14 + movaps 16 * SIZE(BO), %xmm7 + pxor %xmm15, %xmm15 + + PREFETCHW 7 * SIZE(CO1) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK 
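+/* Note on the TRMM bookkeeping: KK is the running diagonal offset and */
+/* KKK the effective inner-product length of the current tile.  This */
+/* branch uses K - KK; the #else branch below instead uses KK plus the */
+/* register tile width (4 rows when LEFT, 1 column otherwise), and the */
+/* plain GEMM path simply takes K.  Ignoring the conjugation variants */
+/* selected by the NN/NR/CN/... macros, the loop below accumulates */
+/* roughly the scalar recurrence */
+/* */
+/*     for (k = 0; k < kkk; k++) */
+/*         for (i = 0; i < 4; i++) */
+/*             c[i] += a[4 * k + i] * b[k];   (complex multiply-add) */
+/* */
+/* with a[] the packed A panel, b[] the expanded copy of B in BUFFER, */
+/* and the accumulated sum scaled by ALPHA_R/ALPHA_I before it is */
+/* added back into C. */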
+#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L55 + ALIGN_4 + +.L52: + mulps %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulps -28 * SIZE(BO), %xmm0 + addps %xmm1, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + addps %xmm0, %xmm9 + movaps -28 * SIZE(AO), %xmm0 + mulps %xmm0, %xmm1 + mulps -28 * SIZE(BO), %xmm0 + addps %xmm1, %xmm12 + movaps -24 * SIZE(BO), %xmm1 + addps %xmm0, %xmm13 + movaps -24 * SIZE(AO), %xmm0 + + mulps %xmm0, %xmm1 + mulps -20 * SIZE(BO), %xmm0 + addps %xmm1, %xmm8 + movaps -24 * SIZE(BO), %xmm1 + addps %xmm0, %xmm9 + movaps -20 * SIZE(AO), %xmm0 + mulps %xmm0, %xmm1 + mulps -20 * SIZE(BO), %xmm0 + addps %xmm1, %xmm12 + movaps 32 * SIZE(BO), %xmm1 + addps %xmm0, %xmm13 + movaps 32 * SIZE(AO), %xmm0 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + + mulps %xmm2, %xmm3 + mulps -12 * SIZE(BO), %xmm2 + addps %xmm3, %xmm8 + movaps -16 * SIZE(BO), %xmm3 + addps %xmm2, %xmm9 + movaps -12 * SIZE(AO), %xmm2 + mulps %xmm2, %xmm3 + mulps -12 * SIZE(BO), %xmm2 + addps %xmm3, %xmm12 + movaps -8 * SIZE(BO), %xmm3 + addps %xmm2, %xmm13 + movaps -8 * SIZE(AO), %xmm2 + + mulps %xmm2, %xmm3 + mulps -4 * SIZE(BO), %xmm2 + addps %xmm3, %xmm8 + movaps -8 * SIZE(BO), %xmm3 + addps %xmm2, %xmm9 + movaps -4 * SIZE(AO), %xmm2 + mulps %xmm2, %xmm3 + mulps -4 * SIZE(BO), %xmm2 + addps %xmm3, %xmm12 + movaps 48 * SIZE(BO), %xmm3 + addps %xmm2, %xmm13 + movaps 48 * SIZE(AO), %xmm2 + + PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) + + mulps %xmm4, %xmm5 + mulps 4 * SIZE(BO), %xmm4 + addps %xmm5, %xmm8 + movaps 0 * SIZE(BO), %xmm5 + addps %xmm4, %xmm9 + movaps 4 * SIZE(AO), %xmm4 + mulps %xmm4, %xmm5 + mulps 4 * SIZE(BO), %xmm4 + addps %xmm5, %xmm12 + movaps 8 * SIZE(BO), %xmm5 + addps %xmm4, %xmm13 + movaps 8 * SIZE(AO), %xmm4 + + mulps %xmm4, %xmm5 + mulps 12 * SIZE(BO), %xmm4 + addps %xmm5, %xmm8 + movaps 8 * SIZE(BO), %xmm5 + addps %xmm4, %xmm9 + movaps 12 * SIZE(AO), %xmm4 + mulps %xmm4, %xmm5 + mulps 12 * SIZE(BO), %xmm4 + addps %xmm5, %xmm12 + movaps 64 * SIZE(BO), %xmm5 + addps %xmm4, %xmm13 + movaps 64 * SIZE(AO), %xmm4 + + PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) + + mulps %xmm6, %xmm7 + mulps 20 * SIZE(BO), %xmm6 + addps %xmm7, %xmm8 + movaps 16 * SIZE(BO), %xmm7 + addps %xmm6, %xmm9 + movaps 20 * SIZE(AO), %xmm6 + mulps %xmm6, %xmm7 + mulps 20 * SIZE(BO), %xmm6 + addps %xmm7, %xmm12 + movaps 24 * SIZE(BO), %xmm7 + addps %xmm6, %xmm13 + movaps 24 * SIZE(AO), %xmm6 + + mulps %xmm6, %xmm7 + mulps 28 * SIZE(BO), %xmm6 + addps %xmm7, %xmm8 + movaps 24 * SIZE(BO), %xmm7 + addps %xmm6, %xmm9 + movaps 28 * SIZE(AO), %xmm6 + mulps %xmm6, %xmm7 + mulps 28 * SIZE(BO), %xmm6 + addps %xmm7, %xmm12 + movaps 80 * SIZE(BO), %xmm7 + addps %xmm6, %xmm13 + movaps 80 * SIZE(AO), %xmm6 + + subq $-64 * SIZE, AO + subq $-64 * SIZE, BO + + decq %rax + jne .L52 + ALIGN_4 + +.L55: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA_R, %xmm6 + movaps ALPHA_I, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + mulps %xmm0, %xmm1 + mulps -28 * SIZE(BO), %xmm0 + addps %xmm1, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + addps %xmm0, %xmm9 + movaps -28 * SIZE(AO), %xmm0 + mulps %xmm0, %xmm1 + mulps -28 * SIZE(BO), %xmm0 + addps %xmm1, %xmm12 + movaps -24 * SIZE(BO), %xmm1 + addps %xmm0, %xmm13 + movaps -24 * SIZE(AO), %xmm0 + + addq $ 8 * SIZE, AO + addq $ 8 * SIZE, BO + decq %rax + jg .L56 + ALIGN_4 + +.L58: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 + movsd 
4 * SIZE(CO1), %xmm2 + movhps 6 * SIZE(CO1), %xmm2 +#endif + + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subps %xmm9, %xmm8 + subps %xmm13, %xmm12 +#else + addps %xmm9, %xmm8 + addps %xmm13, %xmm12 +#endif + + movaps %xmm8, %xmm9 + movaps %xmm12, %xmm13 + + shufps $0xb1, %xmm8, %xmm8 + shufps $0xb1, %xmm12, %xmm12 + + mulps %xmm6, %xmm9 + mulps %xmm7, %xmm8 + mulps %xmm6, %xmm13 + mulps %xmm7, %xmm12 + + addps %xmm9, %xmm8 + addps %xmm13, %xmm12 + +#ifndef TRMMKERNEL + addps %xmm0, %xmm8 + addps %xmm2, %xmm12 +#endif + + movlps %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + movlps %xmm12, 4 * SIZE(CO1) + movhps %xmm12, 6 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L51 + ALIGN_4 + +.L60: + testq $2, M + je .L70 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movaps -16 * SIZE(AO), %xmm2 + pxor %xmm9, %xmm9 + + movaps -32 * SIZE(BO), %xmm1 + pxor %xmm10, %xmm10 + movaps -16 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + movaps 0 * SIZE(BO), %xmm5 + movaps 16 * SIZE(BO), %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + + sarq $3, %rax + je .L65 + ALIGN_4 + +.L62: + mulps %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulps -28 * SIZE(BO), %xmm0 + addps %xmm1, %xmm8 + movaps -24 * SIZE(BO), %xmm1 + addps %xmm0, %xmm9 + movaps -28 * SIZE(AO), %xmm0 + mulps %xmm0, %xmm1 + mulps -20 * SIZE(BO), %xmm0 + addps %xmm1, %xmm10 + movaps 32 * SIZE(BO), %xmm1 + addps %xmm0, %xmm11 + movaps -24 * SIZE(AO), %xmm0 + + mulps %xmm0, %xmm3 + mulps -12 * SIZE(BO), %xmm0 + addps %xmm3, %xmm8 + movaps -8 * SIZE(BO), %xmm3 + addps %xmm0, %xmm9 + movaps -20 * SIZE(AO), %xmm0 + mulps %xmm0, %xmm3 + mulps -4 * SIZE(BO), %xmm0 + addps %xmm3, %xmm10 + movaps 48 * SIZE(BO), %xmm3 + addps %xmm0, %xmm11 + movaps 0 * SIZE(AO), %xmm0 + + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + + mulps %xmm2, %xmm5 + mulps 4 * SIZE(BO), %xmm2 + addps %xmm5, %xmm8 + movaps 8 * SIZE(BO), %xmm5 + addps %xmm2, %xmm9 + movaps -12 * SIZE(AO), %xmm2 + mulps %xmm2, %xmm5 + mulps 12 * SIZE(BO), %xmm2 + addps %xmm5, %xmm10 + movaps 64 * SIZE(BO), %xmm5 + addps %xmm2, %xmm11 + movaps -8 * SIZE(AO), %xmm2 + + mulps %xmm2, %xmm7 + mulps 20 * SIZE(BO), %xmm2 + addps %xmm7, %xmm8 + movaps 24 * SIZE(BO), %xmm7 + addps %xmm2, %xmm9 + movaps -4 * SIZE(AO), %xmm2 + mulps %xmm2, %xmm7 + mulps 28 * SIZE(BO), %xmm2 + addps %xmm7, %xmm10 + movaps 80 * SIZE(BO), %xmm7 + addps %xmm2, %xmm11 + movaps 16 * SIZE(AO), %xmm2 + + subq $-32 * SIZE, AO + subq $-64 * SIZE, BO + + decq %rax + jne .L62 + ALIGN_4 + +.L65: +#ifndef 
TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA_R, %xmm6 + movaps ALPHA_I, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulps %xmm0, %xmm1 + mulps -28 * SIZE(BO), %xmm0 + addps %xmm1, %xmm8 + movaps -24 * SIZE(BO), %xmm1 + addps %xmm0, %xmm9 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L66 + ALIGN_4 + +.L68: +#ifndef TRMMKERNEL + movsd 0 * SIZE(CO1), %xmm0 + movhps 2 * SIZE(CO1), %xmm0 +#endif + + addps %xmm10, %xmm8 + addps %xmm11, %xmm9 + + shufps $0xb1, %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subps %xmm9, %xmm8 +#else + addps %xmm9, %xmm8 +#endif + + movaps %xmm8, %xmm9 + + shufps $0xb1, %xmm8, %xmm8 + + mulps %xmm6, %xmm9 + mulps %xmm7, %xmm8 + + addps %xmm9, %xmm8 + +#ifndef TRMMKERNEL + addps %xmm0, %xmm8 +#endif + + movsd %xmm8, 0 * SIZE(CO1) + movhps %xmm8, 2 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + +.L70: + testq $1, M + je .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq 32 * SIZE + BUFFER, BO +#else + leaq 32 * SIZE + BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + movaps -32 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + movaps -24 * SIZE(AO), %xmm2 + pxor %xmm9, %xmm9 + + movaps -32 * SIZE(BO), %xmm1 + pxor %xmm10, %xmm10 + movaps -16 * SIZE(BO), %xmm3 + pxor %xmm11, %xmm11 + movaps 0 * SIZE(BO), %xmm5 + movaps 16 * SIZE(BO), %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L75 + ALIGN_4 + +.L72: + mulps %xmm0, %xmm1 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm1, %xmm9 + movaps -24 * SIZE(BO), %xmm1 + + mulps %xmm0, %xmm1 + addps %xmm1, %xmm10 + movaps -20 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + movsd -28 * SIZE(AO), %xmm0 + addps %xmm1, %xmm11 + movaps 32 * SIZE(BO), %xmm1 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm8 + movaps -12 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm3 + movsd -26 * SIZE(AO), %xmm0 + addps %xmm3, %xmm9 + movaps -8 * SIZE(BO), %xmm3 + + mulps %xmm0, %xmm3 + addps %xmm3, %xmm10 + movaps -4 * SIZE(BO), %xmm3 + mulps %xmm0, %xmm3 + movsd -16 * SIZE(AO), %xmm0 + addps %xmm3, %xmm11 + movaps 48 * SIZE(BO), %xmm3 + + mulps %xmm2, %xmm5 + addps %xmm5, %xmm8 + movaps 4 * SIZE(BO), %xmm5 + mulps %xmm2, %xmm5 + movsd -22 * SIZE(AO), %xmm2 + addps %xmm5, %xmm9 + movaps 8 * SIZE(BO), %xmm5 + + mulps %xmm2, %xmm5 + addps %xmm5, %xmm10 + movaps 12 * SIZE(BO), %xmm5 + mulps %xmm2, %xmm5 + movsd -20 * SIZE(AO), %xmm2 + addps %xmm5, %xmm11 + movaps 64 * SIZE(BO), %xmm5 + + mulps %xmm2, %xmm7 + addps %xmm7, %xmm8 + movaps 20 * SIZE(BO), %xmm7 + mulps %xmm2, %xmm7 + movsd -18 * 
SIZE(AO), %xmm2 + addps %xmm7, %xmm9 + movaps 24 * SIZE(BO), %xmm7 + + mulps %xmm2, %xmm7 + addps %xmm7, %xmm10 + movaps 28 * SIZE(BO), %xmm7 + mulps %xmm2, %xmm7 + movsd -8 * SIZE(AO), %xmm2 + addps %xmm7, %xmm11 + movaps 80 * SIZE(BO), %xmm7 + + subq $-16 * SIZE, AO + subq $-64 * SIZE, BO + + decq %rax + jne .L72 + ALIGN_4 + +.L75: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA_R, %xmm6 + movaps ALPHA_I, %xmm7 + andq $7, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulps %xmm0, %xmm1 + addps %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + mulps %xmm0, %xmm1 + movsd -30 * SIZE(AO), %xmm0 + addps %xmm1, %xmm9 + movaps -24 * SIZE(BO), %xmm1 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + decq %rax + jg .L76 + ALIGN_4 + +.L78: +#ifndef TRMMKERNEL +#ifdef movsd + xorps %xmm0, %xmm0 +#endif + movsd 0 * SIZE(CO1), %xmm0 +#endif + + addps %xmm10, %xmm8 + addps %xmm11, %xmm9 + + shufps $0xb1, %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subps %xmm9, %xmm8 +#else + addps %xmm9, %xmm8 +#endif + + movaps %xmm8, %xmm9 + + shufps $0xb1, %xmm8, %xmm8 + + mulps %xmm6, %xmm9 + mulps %xmm7, %xmm8 + + addps %xmm9, %xmm8 +#ifndef TRMMKERNEL + addps %xmm0, %xmm8 +#endif + movlps %xmm8, 0 * SIZE(CO1) + ALIGN_4 + +.L999: + movq %rbx, %rsp + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm_kernel_4x2_sse3.S b/kernel/x86_64/zgemm_kernel_4x2_sse3.S new file mode 100644 index 0000000000..ecc3a6f05b --- /dev/null +++ b/kernel/x86_64/zgemm_kernel_4x2_sse3.S @@ -0,0 +1,2101 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %rdi +#define N %rsi +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %r12 +#define BO %r13 +#define CO1 %r14 +#define CO2 %r15 +#define BB %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define ALPHA_R 0(%rsp) +#define ALPHA_I 16(%rsp) +#define J 32(%rsp) +#define OFFSET 40(%rsp) +#define KK 48(%rsp) +#define KKK 56(%rsp) +#define BUFFER 128(%rsp) + +#define PREFETCH prefetcht0 +#define PREFETCHSIZE 320 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define ADDSUB addps +#else +#define ADDSUB subps +#endif + +#define KERNEL1(address) \ + mulps %xmm8, %xmm9; \ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 2 * SIZE(AO); \ + addps %xmm9, %xmm0; \ + movshdup 0 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + ADDSUB %xmm9, %xmm1; \ + movsldup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm2; \ + movshdup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + movaps 4 * SIZE + (address) * 2 * SIZE(AO), %xmm8; \ + ADDSUB %xmm9, %xmm3; \ + movsldup 0 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL2(address) \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm4; \ + movshdup 0 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + ADDSUB %xmm9, %xmm5; \ + movsldup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm6; \ + movshdup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + movaps 8 * SIZE + (address) * 2 * SIZE(AO), %xmm8; \ + ADDSUB %xmm9, %xmm7; \ + movsldup 8 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL3(address) \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm0; \ + movshdup 8 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + ADDSUB %xmm9, %xmm1; \ + movsldup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm2; \ + movshdup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + movaps 12 * SIZE + (address) * 2 * SIZE(AO), %xmm8; \ + ADDSUB %xmm9, %xmm3; \ + movsldup 8 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL4(address) \ + mulps %xmm8, 
%xmm9; \ + addps %xmm9, %xmm4; \ + movshdup 8 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + ADDSUB %xmm9, %xmm5; \ + movsldup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + addps %xmm9, %xmm6; \ + movshdup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \ + mulps %xmm8, %xmm9; \ + movaps 64 * SIZE + (address) * 2 * SIZE(AO), %xmm8; \ + ADDSUB %xmm9, %xmm7; \ + movsldup 64 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL5(address) \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm0; \ + movshdup 16 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + ADDSUB %xmm11, %xmm1; \ + movsldup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm2; \ + movshdup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + movaps 20 * SIZE + (address) * 2 * SIZE(AO), %xmm10; \ + ADDSUB %xmm11, %xmm3; \ + movsldup 16 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL6(address) \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm4; \ + movshdup 16 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + ADDSUB %xmm11, %xmm5; \ + movsldup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm6; \ + movshdup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + movaps 24 * SIZE + (address) * 2 * SIZE(AO), %xmm10; \ + ADDSUB %xmm11, %xmm7; \ + movsldup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL7(address) \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm0; \ + movshdup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + ADDSUB %xmm11, %xmm1; \ + movsldup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm2; \ + movshdup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + movaps 28 * SIZE + (address) * 2 * SIZE(AO), %xmm10; \ + ADDSUB %xmm11, %xmm3; \ + movsldup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL8(address) \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm4; \ + movshdup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + ADDSUB %xmm11, %xmm5; \ + movsldup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + addps %xmm11, %xmm6; \ + movshdup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \ + mulps %xmm10, %xmm11; \ + movaps 80 * SIZE + (address) * 2 * SIZE(AO), %xmm10; \ + ADDSUB %xmm11, %xmm7; \ + movsldup 80 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL9(address) \ + mulps %xmm12, %xmm13; \ + PREFETCH (PREFETCHSIZE + 32) * SIZE + (address) * 2 * SIZE(AO); \ + addps %xmm13, %xmm0; \ + movshdup 32 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + ADDSUB %xmm13, %xmm1; \ + movsldup 36 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm2; \ + movshdup 36 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + movaps 36 * SIZE + (address) * 2 * SIZE(AO), %xmm12; \ + ADDSUB %xmm13, %xmm3; \ + movsldup 32 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL10(address) \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm4; \ + movshdup 32 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + ADDSUB %xmm13, %xmm5; \ + movsldup 36 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm6; \ + movshdup 36 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + 
movaps 40 * SIZE + (address) * 2 * SIZE(AO), %xmm12; \ + ADDSUB %xmm13, %xmm7; \ + movsldup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL11(address) \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm0; \ + movshdup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + ADDSUB %xmm13, %xmm1; \ + movsldup 44 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm2; \ + movshdup 44 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + movaps 44 * SIZE + (address) * 2 * SIZE(AO), %xmm12; \ + ADDSUB %xmm13, %xmm3; \ + movsldup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL12(address) \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm4; \ + movshdup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + ADDSUB %xmm13, %xmm5; \ + movsldup 44 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + addps %xmm13, %xmm6; \ + movshdup 44 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \ + mulps %xmm12, %xmm13; \ + movaps 96 * SIZE + (address) * 2 * SIZE(AO), %xmm12; \ + ADDSUB %xmm13, %xmm7; \ + movsldup 96 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL13(address) \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm0; \ + movshdup 48 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + ADDSUB %xmm15, %xmm1; \ + movsldup 52 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm2; \ + movshdup 52 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + movaps 52 * SIZE + (address) * 2 * SIZE(AO), %xmm14; \ + ADDSUB %xmm15, %xmm3; \ + movsldup 48 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL14(address) \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm4; \ + movshdup 48 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + ADDSUB %xmm15, %xmm5; \ + movsldup 52 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm6; \ + movshdup 52 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + movaps 56 * SIZE + (address) * 2 * SIZE(AO), %xmm14; \ + ADDSUB %xmm15, %xmm7; \ + movsldup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL15(address) \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm0; \ + movshdup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + ADDSUB %xmm15, %xmm1; \ + movsldup 60 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm2; \ + movshdup 60 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + movaps 60 * SIZE + (address) * 2 * SIZE(AO), %xmm14; \ + ADDSUB %xmm15, %xmm3; \ + movsldup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL16(address) \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm4; \ + movshdup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + ADDSUB %xmm15, %xmm5; \ + movsldup 60 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + addps %xmm15, %xmm6; \ + movshdup 60 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \ + mulps %xmm14, %xmm15; \ + movaps 112 * SIZE + (address) * 2 * SIZE(AO), %xmm14; \ + ADDSUB %xmm15, %xmm7; \ + movsldup 112 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + 
movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, M + movq ARG2, N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm4 +#endif + movaps %xmm3, %xmm0 + movsd OLD_ALPHA_I, %xmm1 +#else + movq 72(%rsp), LDC +#ifdef TRMMKERNEL + movsd 80(%rsp), %xmm4 +#endif + +#endif + + movq %rsp, %rbx # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + pxor %xmm15, %xmm15 + cmpeqps %xmm15, %xmm15 + pslld $31, %xmm15 # Generate mask + pxor %xmm2, %xmm2 + + shufps $0, %xmm0, %xmm0 + movaps %xmm0, 0 + ALPHA_R + + movss %xmm1, 4 + ALPHA_I + movss %xmm1, 12 + ALPHA_I + xorps %xmm15, %xmm1 + movss %xmm1, 0 + ALPHA_I + movss %xmm1, 8 + ALPHA_I + +#ifdef TRMMKERNEL + movsd %xmm4, OFFSET + movsd %xmm4, KK +#ifndef LEFT + negq KK +#endif +#endif + + salq $ZBASE_SHIFT, LDC + movq N, J + sarq $1, J # j = (n >> 2) + jle .L40 + ALIGN_4 + +.L01: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + + movq K, %rax + sarq $2, %rax + jle .L03 + ALIGN_4 + +.L02: + movddup 0 * SIZE(B), %xmm0 + movddup 2 * SIZE(B), %xmm1 + movddup 4 * SIZE(B), %xmm2 + movddup 6 * SIZE(B), %xmm3 + movddup 8 * SIZE(B), %xmm4 + movddup 10 * SIZE(B), %xmm5 + movddup 12 * SIZE(B), %xmm6 + movddup 14 * SIZE(B), %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + prefetcht1 128 * SIZE(BO) + prefetcht0 112 * SIZE(B) + + addq $16 * SIZE, B + addq $32 * SIZE, BO + decq %rax + jne .L02 + ALIGN_4 + +.L03: + movq K, %rax + andq $3, %rax + BRANCH + jle .L10 + ALIGN_4 + +.L04: + movddup 0 * SIZE(B), %xmm0 + movddup 2 * SIZE(B), %xmm1 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + + addq $4 * SIZE, B + addq $8 * SIZE, BO + decq %rax + jne .L04 + ALIGN_4 + +.L10: + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + leaq 112 * SIZE(B), BB + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: + prefetcht0 0 * SIZE(BB) + subq $-8 * SIZE, BB + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movaps 16 * SIZE(AO), %xmm10 + pxor %xmm1, %xmm1 + movaps 32 * SIZE(AO), %xmm12 + pxor %xmm2, %xmm2 + movaps 48 * SIZE(AO), %xmm14 + pxor %xmm3, %xmm3 + + movsldup 0 * SIZE(BO), %xmm9 + pxor %xmm4, %xmm4 + movsldup 16 * SIZE(BO), %xmm11 + pxor %xmm5, %xmm5 + movsldup 32 * SIZE(BO), %xmm13 + pxor %xmm6, %xmm6 + movsldup 48 * SIZE(BO), %xmm15 + pxor %xmm7, %xmm7 + + prefetchnta 8 * SIZE(CO1) + prefetchnta 8 * SIZE(CO2) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + 
movq %rax, KKK +#endif +#if 1 + andq $-8, %rax + salq $4, %rax + je .L15 + +.L1X: + KERNEL1 (32 * 0) + KERNEL2 (32 * 0) + KERNEL3 (32 * 0) + KERNEL4 (32 * 0) + KERNEL5 (32 * 0) + KERNEL6 (32 * 0) + KERNEL7 (32 * 0) + KERNEL8 (32 * 0) + KERNEL9 (32 * 0) + KERNEL10(32 * 0) + KERNEL11(32 * 0) + KERNEL12(32 * 0) + KERNEL13(32 * 0) + KERNEL14(32 * 0) + KERNEL15(32 * 0) + KERNEL16(32 * 0) + cmpq $128 * 1, %rax + jle .L12 + KERNEL1 (32 * 1) + KERNEL2 (32 * 1) + KERNEL3 (32 * 1) + KERNEL4 (32 * 1) + KERNEL5 (32 * 1) + KERNEL6 (32 * 1) + KERNEL7 (32 * 1) + KERNEL8 (32 * 1) + KERNEL9 (32 * 1) + KERNEL10(32 * 1) + KERNEL11(32 * 1) + KERNEL12(32 * 1) + KERNEL13(32 * 1) + KERNEL14(32 * 1) + KERNEL15(32 * 1) + KERNEL16(32 * 1) + cmpq $128 * 2, %rax + jle .L12 + KERNEL1 (32 * 2) + KERNEL2 (32 * 2) + KERNEL3 (32 * 2) + KERNEL4 (32 * 2) + KERNEL5 (32 * 2) + KERNEL6 (32 * 2) + KERNEL7 (32 * 2) + KERNEL8 (32 * 2) + KERNEL9 (32 * 2) + KERNEL10(32 * 2) + KERNEL11(32 * 2) + KERNEL12(32 * 2) + KERNEL13(32 * 2) + KERNEL14(32 * 2) + KERNEL15(32 * 2) + KERNEL16(32 * 2) + cmpq $128 * 3, %rax + jle .L12 + KERNEL1 (32 * 3) + KERNEL2 (32 * 3) + KERNEL3 (32 * 3) + KERNEL4 (32 * 3) + KERNEL5 (32 * 3) + KERNEL6 (32 * 3) + KERNEL7 (32 * 3) + KERNEL8 (32 * 3) + KERNEL9 (32 * 3) + KERNEL10(32 * 3) + KERNEL11(32 * 3) + KERNEL12(32 * 3) + KERNEL13(32 * 3) + KERNEL14(32 * 3) + KERNEL15(32 * 3) + KERNEL16(32 * 3) + cmpq $128 * 4, %rax + jle .L12 + KERNEL1 (32 * 4) + KERNEL2 (32 * 4) + KERNEL3 (32 * 4) + KERNEL4 (32 * 4) + KERNEL5 (32 * 4) + KERNEL6 (32 * 4) + KERNEL7 (32 * 4) + KERNEL8 (32 * 4) + KERNEL9 (32 * 4) + KERNEL10(32 * 4) + KERNEL11(32 * 4) + KERNEL12(32 * 4) + KERNEL13(32 * 4) + KERNEL14(32 * 4) + KERNEL15(32 * 4) + KERNEL16(32 * 4) + cmpq $128 * 5, %rax + jle .L12 + KERNEL1 (32 * 5) + KERNEL2 (32 * 5) + KERNEL3 (32 * 5) + KERNEL4 (32 * 5) + KERNEL5 (32 * 5) + KERNEL6 (32 * 5) + KERNEL7 (32 * 5) + KERNEL8 (32 * 5) + KERNEL9 (32 * 5) + KERNEL10(32 * 5) + KERNEL11(32 * 5) + KERNEL12(32 * 5) + KERNEL13(32 * 5) + KERNEL14(32 * 5) + KERNEL15(32 * 5) + KERNEL16(32 * 5) + cmpq $128 * 6, %rax + jle .L12 + KERNEL1 (32 * 6) + KERNEL2 (32 * 6) + KERNEL3 (32 * 6) + KERNEL4 (32 * 6) + KERNEL5 (32 * 6) + KERNEL6 (32 * 6) + KERNEL7 (32 * 6) + KERNEL8 (32 * 6) + KERNEL9 (32 * 6) + KERNEL10(32 * 6) + KERNEL11(32 * 6) + KERNEL12(32 * 6) + KERNEL13(32 * 6) + KERNEL14(32 * 6) + KERNEL15(32 * 6) + KERNEL16(32 * 6) + cmpq $128 * 7, %rax + jle .L12 + KERNEL1 (32 * 7) + KERNEL2 (32 * 7) + KERNEL3 (32 * 7) + KERNEL4 (32 * 7) + KERNEL5 (32 * 7) + KERNEL6 (32 * 7) + KERNEL7 (32 * 7) + KERNEL8 (32 * 7) + KERNEL9 (32 * 7) + KERNEL10(32 * 7) + KERNEL11(32 * 7) + KERNEL12(32 * 7) + KERNEL13(32 * 7) + KERNEL14(32 * 7) + KERNEL15(32 * 7) + KERNEL16(32 * 7) + + addq $64 * 8 * SIZE, AO + addq $64 * 8 * SIZE, BO + subq $128 * 8, %rax + jg .L1X + +.L12: + leaq (AO, %rax, 2), AO # * 16 + leaq (BO, %rax, 2), BO # * 64 +#else + sarq $3, %rax + je .L15 + ALIGN_4 + +.L12: + KERNEL1 (32 * 0) + KERNEL2 (32 * 0) + KERNEL3 (32 * 0) + KERNEL4 (32 * 0) + KERNEL5 (32 * 0) + KERNEL6 (32 * 0) + KERNEL7 (32 * 0) + KERNEL8 (32 * 0) + KERNEL9 (32 * 0) + KERNEL10(32 * 0) + KERNEL11(32 * 0) + KERNEL12(32 * 0) + KERNEL13(32 * 0) + KERNEL14(32 * 0) + KERNEL15(32 * 0) + KERNEL16(32 * 0) + + addq $64 * SIZE, AO + addq $64 * SIZE, BO + decq %rax + jne .L12 +#endif + ALIGN_4 + +.L15: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA_R, %xmm14 + movaps ALPHA_I, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_4 + +.L16: + mulps %xmm8, 
%xmm9 + addps %xmm9, %xmm0 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + ADDSUB %xmm9, %xmm1 + movsldup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movshdup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + ADDSUB %xmm9, %xmm3 + movsldup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm4 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + ADDSUB %xmm9, %xmm5 + movsldup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm6 + movshdup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 8 * SIZE(AO), %xmm8 + ADDSUB %xmm9, %xmm7 + movsldup 8 * SIZE(BO), %xmm9 + + addq $8 * SIZE, AO + addq $8 * SIZE, BO + decq %rax + jg .L16 + ALIGN_4 + +.L18: +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + shufps $0xb1, %xmm1, %xmm1 + shufps $0xb1, %xmm3, %xmm3 + shufps $0xb1, %xmm5, %xmm5 + shufps $0xb1, %xmm7, %xmm7 + + addsubps %xmm1, %xmm0 + addsubps %xmm3, %xmm2 + addsubps %xmm5, %xmm4 + addsubps %xmm7, %xmm6 + + movaps %xmm0, %xmm1 + movaps %xmm2, %xmm3 + movaps %xmm4, %xmm5 + movaps %xmm6, %xmm7 + + shufps $0xb1, %xmm0, %xmm0 + shufps $0xb1, %xmm2, %xmm2 + shufps $0xb1, %xmm4, %xmm4 + shufps $0xb1, %xmm6, %xmm6 +#else + shufps $0xb1, %xmm0, %xmm0 + shufps $0xb1, %xmm2, %xmm2 + shufps $0xb1, %xmm4, %xmm4 + shufps $0xb1, %xmm6, %xmm6 + + addsubps %xmm0, %xmm1 + addsubps %xmm2, %xmm3 + addsubps %xmm4, %xmm5 + addsubps %xmm6, %xmm7 + + movaps %xmm1, %xmm0 + movaps %xmm3, %xmm2 + movaps %xmm5, %xmm4 + movaps %xmm7, %xmm6 + + shufps $0xb1, %xmm1, %xmm1 + shufps $0xb1, %xmm3, %xmm3 + shufps $0xb1, %xmm5, %xmm5 + shufps $0xb1, %xmm7, %xmm7 +#endif + + mulps %xmm14, %xmm1 + mulps %xmm15, %xmm0 + mulps %xmm14, %xmm3 + mulps %xmm15, %xmm2 + + mulps %xmm14, %xmm5 + mulps %xmm15, %xmm4 + mulps %xmm14, %xmm7 + mulps %xmm15, %xmm6 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + shufps $0xe4, %xmm8, %xmm8 + shufps $0xe4, %xmm9, %xmm9 + shufps $0xe4, %xmm10, %xmm10 + shufps $0xe4, %xmm11, %xmm11 + + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 4 * SIZE(CO1), %xmm10 + movhps 6 * SIZE(CO1), %xmm10 + + movsd 0 * SIZE(CO2), %xmm9 + movhps 2 * SIZE(CO2), %xmm9 + movsd 4 * SIZE(CO2), %xmm11 + movhps 6 * SIZE(CO2), %xmm11 + + addps %xmm8, %xmm0 + addps %xmm9, %xmm2 + addps %xmm10, %xmm4 + addps %xmm11, %xmm6 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movsd %xmm4, 4 * SIZE(CO1) + movhps %xmm4, 6 * SIZE(CO1) + + movsd %xmm2, 0 * SIZE(CO2) + movhps %xmm2, 2 * SIZE(CO2) + movsd %xmm6, 4 * SIZE(CO2) + movhps %xmm6, 6 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 4 + addq $8 * SIZE, CO2 # coffset += 4 + decq I # i -- + jg .L11 + ALIGN_4 + +.L20: + testq $2, M + je .L30 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movaps 16 * SIZE(AO), %xmm10 + pxor %xmm1, %xmm1 + + movsldup 0 * SIZE(BO), %xmm9 + pxor %xmm2, %xmm2 + movsldup 16 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + movsldup 32 * SIZE(BO), %xmm13 + movsldup 48 * SIZE(BO), %xmm15 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L25 + ALIGN_4 + +.L22: + mulps %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addps %xmm9, %xmm0 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + ADDSUB %xmm9, %xmm1 + movsldup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movshdup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + ADDSUB %xmm9, %xmm3 + movsldup 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movshdup 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + ADDSUB %xmm9, %xmm1 + movsldup 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movshdup 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 8 * SIZE(AO), %xmm8 + ADDSUB %xmm9, %xmm3 + movsldup 64 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movshdup 16 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + ADDSUB %xmm11, %xmm1 + movsldup 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movshdup 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movaps 12 * SIZE(AO), %xmm8 + ADDSUB %xmm11, %xmm3 + movsldup 24 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movshdup 24 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + ADDSUB %xmm11, %xmm1 + movsldup 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movshdup 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movaps 32 * SIZE(AO), %xmm8 + ADDSUB %xmm11, %xmm3 + movsldup 80 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movshdup 32 * SIZE(BO), %xmm13 + mulps 
%xmm10, %xmm13 + ADDSUB %xmm13, %xmm1 + movsldup 36 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movshdup 36 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movaps 20 * SIZE(AO), %xmm10 + ADDSUB %xmm13, %xmm3 + movsldup 40 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movshdup 40 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + ADDSUB %xmm13, %xmm1 + movsldup 44 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movshdup 44 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movaps 24 * SIZE(AO), %xmm10 + ADDSUB %xmm13, %xmm3 + movsldup 96 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movshdup 48 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + ADDSUB %xmm15, %xmm1 + movsldup 52 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movshdup 52 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movaps 28 * SIZE(AO), %xmm10 + ADDSUB %xmm15, %xmm3 + movsldup 56 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movshdup 56 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + ADDSUB %xmm15, %xmm1 + movsldup 60 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movshdup 60 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movaps 48 * SIZE(AO), %xmm10 + ADDSUB %xmm15, %xmm3 + movsldup 112 * SIZE(BO), %xmm15 + + addq $32 * SIZE, AO + addq $64 * SIZE, BO + + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA_R, %xmm14 + movaps ALPHA_I, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + ADDSUB %xmm9, %xmm1 + movsldup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movshdup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + ADDSUB %xmm9, %xmm3 + movsldup 8 * SIZE(BO), %xmm9 + + addq $ 4 * SIZE, AO + addq $ 8 * SIZE, BO + decq %rax + jg .L26 + ALIGN_4 + +.L28: +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + shufps $0xb1, %xmm1, %xmm1 + shufps $0xb1, %xmm3, %xmm3 + + addsubps %xmm1, %xmm0 + addsubps %xmm3, %xmm2 + + movaps %xmm0, %xmm1 + movaps %xmm2, %xmm3 + + shufps $0xb1, %xmm0, %xmm0 + shufps $0xb1, %xmm2, %xmm2 +#else + shufps $0xb1, %xmm0, %xmm0 + shufps $0xb1, %xmm2, %xmm2 + + addsubps %xmm0, %xmm1 + addsubps %xmm2, %xmm3 + + movaps %xmm1, %xmm0 + movaps %xmm3, %xmm2 + + shufps $0xb1, %xmm1, %xmm1 + shufps $0xb1, %xmm3, %xmm3 +#endif + + mulps %xmm14, %xmm1 + mulps %xmm15, %xmm0 + mulps %xmm14, %xmm3 + mulps %xmm15, %xmm2 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + shufps $0xe4, %xmm8, %xmm8 + shufps $0xe4, %xmm10, %xmm10 + + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 0 * SIZE(CO2), %xmm10 + movhps 2 * SIZE(CO2), %xmm10 + + addps %xmm8, %xmm0 + addps %xmm10, %xmm2 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movsd %xmm2, 0 * SIZE(CO2) + movhps %xmm2, 2 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L30: + testq $1, M + je .L39 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + + movddup 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 8 * SIZE(AO), %xmm10 + pxor %xmm1, %xmm1 + movsd 0 * SIZE(BO), %xmm9 + pxor %xmm2, %xmm2 + movsd 16 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + movsd 32 * SIZE(BO), %xmm13 + movsd 48 * SIZE(BO), %xmm15 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L35 + ALIGN_4 + +.L32: + shufps $0x50, %xmm9, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movsd 4 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + movddup 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 8 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movsd 12 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + movddup 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 64 * SIZE(BO), %xmm9 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movsd 20 * SIZE(BO), %xmm11 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm8, %xmm11 + movddup 6 * SIZE(AO), %xmm8 + addps %xmm11, %xmm1 + movsd 24 * SIZE(BO), %xmm11 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movsd 28 * SIZE(BO), %xmm11 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm8, %xmm11 + movddup 16 * SIZE(AO), %xmm8 + addps %xmm11, %xmm1 + movsd 80 * SIZE(BO), %xmm11 + shufps $0x50, %xmm13, %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movsd 36 * SIZE(BO), %xmm13 + shufps $0x50, %xmm13, %xmm13 + mulps %xmm10, %xmm13 + movddup 10 * SIZE(AO), %xmm10 + addps %xmm13, %xmm1 + movsd 40 * SIZE(BO), %xmm13 + shufps $0x50, %xmm13, %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movsd 44 * SIZE(BO), %xmm13 + shufps $0x50, %xmm13, %xmm13 + mulps %xmm10, %xmm13 + movddup 12 * SIZE(AO), %xmm10 + addps %xmm13, %xmm1 + movsd 96 * SIZE(BO), %xmm13 + shufps $0x50, %xmm15, %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movsd 52 * SIZE(BO), %xmm15 + shufps $0x50, %xmm15, %xmm15 + mulps %xmm10, %xmm15 + movddup 14 * SIZE(AO), %xmm10 + addps %xmm15, %xmm1 + movsd 56 * SIZE(BO), %xmm15 + shufps $0x50, %xmm15, %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movsd 60 * SIZE(BO), %xmm15 + shufps $0x50, %xmm15, 
%xmm15 + mulps %xmm10, %xmm15 + movddup 24 * SIZE(AO), %xmm10 + addps %xmm15, %xmm1 + movsd 112 * SIZE(BO), %xmm15 + + addq $16 * SIZE, AO + addq $64 * SIZE, BO + + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA_R, %xmm14 + movaps ALPHA_I, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movsd 4 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + movddup 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 8 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + decq %rax + jg .L36 + ALIGN_4 + +.L38: + movaps %xmm0, %xmm6 + movlhps %xmm1, %xmm0 + movhlps %xmm6, %xmm1 + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + cmpeqps %xmm7, %xmm7 + pslld $31, %xmm7 + xorps %xmm7, %xmm1 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + shufps $0xb1, %xmm1, %xmm1 + + addsubps %xmm1, %xmm0 + + movaps %xmm0, %xmm1 + + shufps $0xb1, %xmm0, %xmm0 +#else + shufps $0xb1, %xmm0, %xmm0 + + addsubps %xmm0, %xmm1 + + movaps %xmm1, %xmm0 + + shufps $0xb1, %xmm1, %xmm1 +#endif + + mulps %xmm14, %xmm1 + mulps %xmm15, %xmm0 + + addps %xmm1, %xmm0 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm8 + movhps 0 * SIZE(CO2), %xmm8 + + addps %xmm8, %xmm0 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 0 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addl $2, KK +#endif + + leaq (C, LDC, 2), C # c += 2 * ldc + decq J # j -- + jg .L01 + ALIGN_4 + +.L40: + testq $1, N + je .L999 + ALIGN_4 + +.L41: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + + movq K, %rax + sarq $3, %rax + jle .L43 + ALIGN_4 + +.L42: + movddup 0 * SIZE(B), %xmm0 + movddup 2 * SIZE(B), %xmm1 + movddup 4 * SIZE(B), %xmm2 + movddup 6 * SIZE(B), %xmm3 + movddup 8 * SIZE(B), %xmm4 + movddup 10 * SIZE(B), %xmm5 + movddup 12 * SIZE(B), %xmm6 + movddup 14 * SIZE(B), %xmm7 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + prefetcht1 128 * SIZE(BO) + prefetcht0 112 * SIZE(B) + + addq $16 * SIZE, B + addq $32 * SIZE, BO + decq %rax + jne .L42 + ALIGN_4 + +.L43: + movq K, %rax + andq $7, %rax + BRANCH + jle .L50 + ALIGN_4 + +.L44: + movddup 0 * SIZE(B), %xmm0 + + movaps %xmm0, 0 * SIZE(BO) + + addq $2 * SIZE, B + addq $4 * SIZE, BO + decq %rax + jne .L44 + ALIGN_4 + +.L50: + movq C, CO1 # coffset1 = c + movq A, AO # aoffset = a + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 4), AO 
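+/* (TRMM path) %rax now holds KK * 8: the leaq above advances AO past */
+/* the first KK steps of the packed A panel (four complex values per */
+/* step in this 4-wide tile) and the leaq below advances BO past the */
+/* matching entries of the expanded B buffer, so the tile starts where */
+/* it actually touches the triangular operand.  The other branch of */
+/* this #if just resets BO to BUFFER and leaves AO untouched. */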
+ leaq (BO, %rax, 2), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movaps 16 * SIZE(AO), %xmm10 + pxor %xmm1, %xmm1 + movaps 32 * SIZE(AO), %xmm12 + pxor %xmm4, %xmm4 + movaps 48 * SIZE(AO), %xmm14 + pxor %xmm5, %xmm5 + + movsldup 0 * SIZE(BO), %xmm9 + movsldup 16 * SIZE(BO), %xmm11 + + prefetchnta 4 * SIZE(CO1) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L55 + ALIGN_4 + +.L52: + mulps %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addps %xmm9, %xmm0 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + ADDSUB %xmm9, %xmm1 + movsldup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm4 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 8 * SIZE(AO), %xmm8 + ADDSUB %xmm9, %xmm5 + movsldup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movshdup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 12 * SIZE(AO), %xmm8 + ADDSUB %xmm9, %xmm1 + movsldup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm4 + movshdup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 64 * SIZE(AO), %xmm8 + ADDSUB %xmm9, %xmm5 + movsldup 8 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm0 + movshdup 8 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movaps 20 * SIZE(AO), %xmm10 + ADDSUB %xmm9, %xmm1 + movsldup 8 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm4 + movshdup 8 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movaps 24 * SIZE(AO), %xmm10 + ADDSUB %xmm9, %xmm5 + movsldup 12 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm0 + movshdup 12 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movaps 28 * SIZE(AO), %xmm10 + ADDSUB %xmm9, %xmm1 + movsldup 12 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm4 + movshdup 12 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movaps 80 * SIZE(AO), %xmm10 + ADDSUB %xmm9, %xmm5 + movsldup 32 * SIZE(BO), %xmm9 + mulps %xmm12, %xmm11 + PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) + addps %xmm11, %xmm0 + movshdup 16 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm11 + movaps 36 * SIZE(AO), %xmm12 + ADDSUB %xmm11, %xmm1 + movsldup 16 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm4 + movshdup 16 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm11 + movaps 40 * SIZE(AO), %xmm12 + ADDSUB %xmm11, %xmm5 + movsldup 20 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm0 + movshdup 20 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm11 + movaps 44 * SIZE(AO), %xmm12 + ADDSUB %xmm11, %xmm1 + movsldup 20 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm11 + addps %xmm11, %xmm4 + movshdup 20 * SIZE(BO), %xmm11 + mulps %xmm12, %xmm11 + movaps 96 * SIZE(AO), %xmm12 + ADDSUB %xmm11, %xmm5 + movsldup 24 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + addps %xmm11, %xmm0 + movshdup 24 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + movaps 52 * SIZE(AO), %xmm14 + ADDSUB %xmm11, %xmm1 + movsldup 24 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + addps %xmm11, %xmm4 + movshdup 24 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + movaps 56 * SIZE(AO), %xmm14 + ADDSUB %xmm11, %xmm5 + movsldup 28 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + addps %xmm11, %xmm0 + movshdup 28 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + movaps 60 * SIZE(AO), %xmm14 + ADDSUB %xmm11, %xmm1 + movsldup 28 * SIZE(BO), %xmm11 + mulps %xmm14, %xmm11 + addps %xmm11, %xmm4 + movshdup 28 * SIZE(BO), 
%xmm11 + mulps %xmm14, %xmm11 + movaps 112 * SIZE(AO), %xmm14 + ADDSUB %xmm11, %xmm5 + movsldup 48 * SIZE(BO), %xmm11 + + addq $64 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L52 + ALIGN_4 + +.L55: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA_R, %xmm14 + movaps ALPHA_I, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + ADDSUB %xmm9, %xmm1 + movsldup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm4 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 8 * SIZE(AO), %xmm8 + ADDSUB %xmm9, %xmm5 + movsldup 4 * SIZE(BO), %xmm9 + + addq $ 8 * SIZE, AO + addq $ 4 * SIZE, BO + decq %rax + jg .L56 + ALIGN_4 + +.L58: +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + + shufps $0xb1, %xmm1, %xmm1 + shufps $0xb1, %xmm5, %xmm5 + + addsubps %xmm1, %xmm0 + addsubps %xmm5, %xmm4 + + movaps %xmm0, %xmm1 + movaps %xmm4, %xmm5 + + shufps $0xb1, %xmm0, %xmm0 + shufps $0xb1, %xmm4, %xmm4 +#else + shufps $0xb1, %xmm0, %xmm0 + shufps $0xb1, %xmm4, %xmm4 + + addsubps %xmm0, %xmm1 + addsubps %xmm4, %xmm5 + + movaps %xmm1, %xmm0 + movaps %xmm5, %xmm4 + + shufps $0xb1, %xmm1, %xmm1 + shufps $0xb1, %xmm5, %xmm5 +#endif + + mulps %xmm14, %xmm1 + mulps %xmm15, %xmm0 + mulps %xmm14, %xmm5 + mulps %xmm15, %xmm4 + + addps %xmm1, %xmm0 + addps %xmm5, %xmm4 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + movsd 4 * SIZE(CO1), %xmm9 + movhps 6 * SIZE(CO1), %xmm9 + + addps %xmm8, %xmm0 + addps %xmm9, %xmm4 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + movsd %xmm4, 4 * SIZE(CO1) + movhps %xmm4, 6 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $8 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L51 + ALIGN_4 + +.L60: + testq $2, M + je .L70 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movsldup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movaps 16 * SIZE(AO), %xmm10 + movsldup 16 * SIZE(BO), %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L65 + ALIGN_4 + +.L62: + mulps %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + addps %xmm9, %xmm0 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + ADDSUB %xmm9, %xmm1 + movsldup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movshdup 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 8 * SIZE(AO), %xmm8 + ADDSUB %xmm9, %xmm1 + movsldup 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movshdup 8 * SIZE(BO), %xmm9 + 
mulps %xmm8, %xmm9 + movaps 12 * SIZE(AO), %xmm8 + ADDSUB %xmm9, %xmm1 + movsldup 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movshdup 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 32 * SIZE(AO), %xmm8 + ADDSUB %xmm9, %xmm1 + movsldup 32 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movshdup 16 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movaps 20 * SIZE(AO), %xmm10 + ADDSUB %xmm11, %xmm1 + movsldup 20 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movshdup 20 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movaps 24 * SIZE(AO), %xmm10 + ADDSUB %xmm11, %xmm1 + movsldup 24 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movshdup 24 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movaps 28 * SIZE(AO), %xmm10 + ADDSUB %xmm11, %xmm1 + movsldup 28 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movshdup 28 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movaps 48 * SIZE(AO), %xmm10 + ADDSUB %xmm11, %xmm1 + movsldup 48 * SIZE(BO), %xmm11 + + addq $32 * SIZE, AO + addq $32 * SIZE, BO + + decq %rax + jne .L62 + ALIGN_4 + +.L65: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA_R, %xmm14 + movaps ALPHA_I, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movshdup 0 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movaps 4 * SIZE(AO), %xmm8 + ADDSUB %xmm9, %xmm1 + movsldup 4 * SIZE(BO), %xmm9 + + addq $4 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L66 + ALIGN_4 + +.L68: +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + shufps $0xb1, %xmm1, %xmm1 + addsubps %xmm1, %xmm0 + movaps %xmm0, %xmm1 + shufps $0xb1, %xmm0, %xmm0 +#else + shufps $0xb1, %xmm0, %xmm0 + addsubps %xmm0, %xmm1 + movaps %xmm1, %xmm0 + shufps $0xb1, %xmm1, %xmm1 +#endif + + mulps %xmm14, %xmm1 + mulps %xmm15, %xmm0 + addps %xmm1, %xmm0 + +#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm8 + movhps 2 * SIZE(CO1), %xmm8 + + addps %xmm8, %xmm0 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movhps %xmm0, 2 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, 8), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + addq $4 * SIZE, CO1 # coffset += 4 + ALIGN_4 + +.L70: + testq $1, M + je .L999 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + + leaq BUFFER, BO +#else + leaq BUFFER, BO + movq KK, %rax + leaq (, %rax, 8), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + + movddup 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movsd 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movddup 8 * SIZE(AO), %xmm10 + movsd 16 * SIZE(BO), %xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + sarq $3, %rax + je .L75 + ALIGN_4 + +.L72: + shufps $0x50, %xmm9, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulps %xmm8, %xmm9 + movddup 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movsd 4 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + movddup 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 8 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + movddup 6 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movsd 12 * SIZE(BO), %xmm9 + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + movddup 16 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movsd 32 * SIZE(BO), %xmm9 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + movddup 10 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movsd 20 * SIZE(BO), %xmm11 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + movddup 12 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movsd 24 * SIZE(BO), %xmm11 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + movddup 14 * SIZE(AO), %xmm10 + addps %xmm11, %xmm0 + movsd 28 * SIZE(BO), %xmm11 + shufps $0x50, %xmm11, %xmm11 + mulps %xmm10, %xmm11 + movddup 24 * SIZE(AO), %xmm10 + addps %xmm11, %xmm1 + movsd 48 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L72 + ALIGN_4 + +.L75: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + movaps ALPHA_R, %xmm14 + movaps ALPHA_I, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + shufps $0x50, %xmm9, %xmm9 + mulps %xmm8, %xmm9 + movddup 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm0 + movsd 4 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + decq %rax + jg .L76 + ALIGN_4 + +.L78: + addps %xmm1, %xmm0 + + movhlps %xmm0, %xmm1 + +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + cmpeqps %xmm7, %xmm7 + pslld $31, %xmm7 + xorps %xmm7, %xmm1 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + shufps $0xb1, %xmm1, %xmm1 + + addsubps %xmm1, %xmm0 + + movaps %xmm0, %xmm1 + + shufps $0xb1, %xmm0, %xmm0 +#else + shufps $0xb1, %xmm0, %xmm0 + + addsubps %xmm0, %xmm1 + + movaps %xmm1, %xmm0 + + shufps 
$0xb1, %xmm1, %xmm1 +#endif + + mulps %xmm14, %xmm1 + mulps %xmm15, %xmm0 + + addps %xmm1, %xmm0 + +#if! defined(TRMMKERNEL) && !defined(BETAZERO) + movsd 0 * SIZE(CO1), %xmm8 + addps %xmm8, %xmm0 +#endif + movsd %xmm0, 0 * SIZE(CO1) + ALIGN_4 + +.L999: + movq %rbx, %rsp + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm_ncopy_1.S b/kernel/x86_64/zgemm_ncopy_1.S new file mode 100644 index 0000000000..9f9ae73693 --- /dev/null +++ b/kernel/x86_64/zgemm_ncopy_1.S @@ -0,0 +1,203 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef WINDOWS_ABI + +#define M ARG1 /* rdi */ +#define N ARG2 /* rsi */ +#define A ARG3 /* rdx */ +#define LDA ARG4 /* rcx */ +#define B ARG5 /* r8 */ + +#define I %r9 +#define J %r10 +#define AO1 %r11 +#define AO2 %r12 +#else + +#define STACKSIZE 256 + +#define M ARG1 /* rcx */ +#define N ARG2 /* rdx */ +#define A ARG3 /* r8 */ +#define LDA ARG4 /* r9 */ +#define OLD_B 40 + 24 + STACKSIZE(%rsp) + +#define B %r10 +#define I %r11 +#define J %r12 +#define AO1 %r13 +#define AO2 %r14 + +#endif + +#define RPREFETCHSIZE 32 +#define WPREFETCHSIZE 48 + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + pushq %r14 + pushq %r13 +#endif + pushq %r12 + +#ifdef WINDOWS_ABI + subq $STACKSIZE, %rsp + + movups %xmm6, 0(%rsp) + movups %xmm7, 16(%rsp) + movups %xmm8, 32(%rsp) + movups %xmm9, 48(%rsp) + movups %xmm10, 64(%rsp) + movups %xmm11, 80(%rsp) + movups %xmm12, 96(%rsp) + movups %xmm13, 112(%rsp) + movups %xmm14, 128(%rsp) + movups %xmm15, 144(%rsp) + + movq OLD_B, B +#endif + + salq $ZBASE_SHIFT, LDA + + testq N, N + movq N, J + jle .L999 + ALIGN_4 + +.L12: + movq A, AO1 + addq LDA, A + + movq M, I + sarq $2, I + jle .L14 + ALIGN_4 + +.L13: +#ifndef DOUBLE + movsd 0 * SIZE(AO1), %xmm0 + movhps 2 * SIZE(AO1), %xmm0 + movsd 4 * SIZE(AO1), %xmm1 + movhps 6 * SIZE(AO1), %xmm1 + + movaps %xmm0, 0 * SIZE(B) + movaps %xmm1, 4 * SIZE(B) +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 1 * SIZE(AO1), %xmm0 + movsd 2 * SIZE(AO1), %xmm1 + movhpd 3 * SIZE(AO1), %xmm1 + movsd 4 * SIZE(AO1), %xmm2 + movhpd 5 * SIZE(AO1), %xmm2 + movsd 6 * SIZE(AO1), %xmm3 + movhpd 7 * SIZE(AO1), %xmm3 + + prefetcht2 RPREFETCHSIZE * SIZE(AO1) + + movapd %xmm0, 0 * SIZE(B) + movapd %xmm1, 2 * SIZE(B) + movapd %xmm2, 4 * SIZE(B) + movapd %xmm3, 6 * SIZE(B) + + prefetcht2 WPREFETCHSIZE * SIZE(B) +#endif + + addq $8 * SIZE, AO1 + addq $8 * SIZE, B + decq I + jg .L13 + ALIGN_4 + +.L14: + movq M, I + andq $3, I + jle .L16 + ALIGN_4 + +.L15: +#ifndef DOUBLE + movsd 0 * SIZE(AO1), %xmm0 + movsd %xmm0, 0 * SIZE(B) +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 1 * SIZE(AO1), %xmm0 + + movapd %xmm0, 0 * SIZE(B) +#endif + + addq $2 * SIZE, AO1 + addq $2 * SIZE, B + decq I + jg .L15 + ALIGN_4 + +.L16: + decq J + jg .L12 + ALIGN_4 + +.L999: +#ifdef WINDOWS_ABI + movups 0(%rsp), %xmm6 + movups 16(%rsp), %xmm7 + movups 32(%rsp), %xmm8 + movups 48(%rsp), %xmm9 + movups 64(%rsp), %xmm10 + movups 80(%rsp), %xmm11 + movups 96(%rsp), %xmm12 + movups 112(%rsp), %xmm13 + movups 128(%rsp), %xmm14 + movups 144(%rsp), %xmm15 + + addq $STACKSIZE, %rsp +#endif + + popq %r12 +#ifdef WINDOWS_ABI + popq %r13 + popq %r14 +#endif + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm_ncopy_2.S b/kernel/x86_64/zgemm_ncopy_2.S new file mode 100644 index 0000000000..bf318b7ffb --- /dev/null +++ b/kernel/x86_64/zgemm_ncopy_2.S @@ -0,0 +1,359 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef WINDOWS_ABI + +#define M ARG1 /* rdi */ +#define N ARG2 /* rsi */ +#define A ARG3 /* rdx */ +#define LDA ARG4 /* rcx */ +#define B ARG5 /* r8 */ + +#define I %r9 +#define J %r10 +#define AO1 %r11 +#define AO2 %r12 +#else + +#define STACKSIZE 256 + +#define M ARG1 /* rcx */ +#define N ARG2 /* rdx */ +#define A ARG3 /* r8 */ +#define LDA ARG4 /* r9 */ +#define OLD_B 40 + 24 + STACKSIZE(%rsp) + +#define B %r10 +#define I %r11 +#define J %r12 +#define AO1 %r13 +#define AO2 %r14 + +#endif + +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) +#define RPREFETCHSIZE 16 +#define WPREFETCHSIZE 48 +#endif + +#if defined(PENTIUM4) || defined(GENERIC) || defined(NANO) +#define RPREFETCHSIZE 32 +#define WPREFETCHSIZE 80 +#endif + +#ifdef OPTERON +#define RPREFETCHSIZE 32 +#define WPREFETCHSIZE 48 +#define movsd movlpd +#endif + +#if defined(BARCELONA) || defined(SHANGHAI) +#define RPREFETCHSIZE 32 +#define WPREFETCHSIZE 48 +#endif + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + pushq %r14 + pushq %r13 +#endif + pushq %r12 + +#ifdef WINDOWS_ABI + subq $STACKSIZE, %rsp + + movups %xmm6, 0(%rsp) + movups %xmm7, 16(%rsp) + movups %xmm8, 32(%rsp) + movups %xmm9, 48(%rsp) + movups %xmm10, 64(%rsp) + movups %xmm11, 80(%rsp) + movups %xmm12, 96(%rsp) + movups %xmm13, 112(%rsp) + movups %xmm14, 128(%rsp) + movups %xmm15, 144(%rsp) + + movq OLD_B, B +#endif + + salq $ZBASE_SHIFT, LDA + + movq N, J + sarq $1, J + jle .L20 + ALIGN_4 + +.L12: + movq A, AO1 + leaq (A, LDA), AO2 + leaq (A, LDA, 2), A + + movq M, I + sarq $2, I + jle .L14 + ALIGN_4 + +.L13: +#ifdef HAVE_3DNOW + prefetchw (WPREFETCHSIZE + 0) * SIZE(B) + prefetchw (WPREFETCHSIZE + 8) * SIZE(B) +#endif + +#ifndef DOUBLE + movlps 0 * SIZE(AO1), %xmm0 + movhps 0 * SIZE(AO2), %xmm0 + movlps 2 * SIZE(AO1), %xmm1 + movhps 2 * SIZE(AO2), %xmm1 + + movlps 4 * SIZE(AO1), %xmm2 + movhps 4 * SIZE(AO2), %xmm2 + movlps 6 * SIZE(AO1), %xmm3 + movhps 6 * SIZE(AO2), %xmm3 + +#if defined(PENTIUM4) || defined(GENERIC) || defined(NANO) + prefetcht0 RPREFETCHSIZE * SIZE(AO1) + prefetcht0 RPREFETCHSIZE * 
SIZE(AO2) + + prefetcht0 WPREFETCHSIZE * SIZE(B) +#endif + + movaps %xmm0, 0 * SIZE(B) + movaps %xmm1, 4 * SIZE(B) + movaps %xmm2, 8 * SIZE(B) + movaps %xmm3, 12 * SIZE(B) + +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 1 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO2), %xmm1 + movhpd 1 * SIZE(AO2), %xmm1 + +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) + prefetcht2 RPREFETCHSIZE * SIZE(AO1) +#endif + + movsd 2 * SIZE(AO1), %xmm2 + movhpd 3 * SIZE(AO1), %xmm2 + movsd 2 * SIZE(AO2), %xmm3 + movhpd 3 * SIZE(AO2), %xmm3 + + movsd 4 * SIZE(AO1), %xmm4 + movhpd 5 * SIZE(AO1), %xmm4 + movsd 4 * SIZE(AO2), %xmm5 + movhpd 5 * SIZE(AO2), %xmm5 + +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) + prefetcht2 RPREFETCHSIZE * SIZE(AO2) +#endif + + movsd 6 * SIZE(AO1), %xmm6 + movhpd 7 * SIZE(AO1), %xmm6 + movsd 6 * SIZE(AO2), %xmm7 + movhpd 7 * SIZE(AO2), %xmm7 + +#if defined(PENTIUM4) || defined(GENERIC) || defined(NANO) + prefetcht0 RPREFETCHSIZE * SIZE(AO1) + prefetcht0 RPREFETCHSIZE * SIZE(AO2) + + prefetcht0 WPREFETCHSIZE * SIZE(B) +#endif + + movapd %xmm0, 0 * SIZE(B) + movapd %xmm1, 2 * SIZE(B) + movapd %xmm2, 4 * SIZE(B) + movapd %xmm3, 6 * SIZE(B) + +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) + prefetcht2 WPREFETCHSIZE * SIZE(B) +#endif + + movapd %xmm4, 8 * SIZE(B) + movapd %xmm5, 10 * SIZE(B) + movapd %xmm6, 12 * SIZE(B) + movapd %xmm7, 14 * SIZE(B) +#endif + + addq $8 * SIZE, AO1 + addq $8 * SIZE, AO2 + subq $-16 * SIZE, B + decq I + jg .L13 + ALIGN_4 + +.L14: + movq M, I + andq $3, I + jle .L16 + ALIGN_4 + +.L15: +#ifndef DOUBLE + movlps 0 * SIZE(AO1), %xmm0 + movhps 0 * SIZE(AO2), %xmm0 + + movaps %xmm0, 0 * SIZE(B) +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 1 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO2), %xmm1 + movhpd 1 * SIZE(AO2), %xmm1 + + movapd %xmm0, 0 * SIZE(B) + movapd %xmm1, 2 * SIZE(B) +#endif + + addq $2 * SIZE, AO1 + addq $2 * SIZE, AO2 + addq $4 * SIZE, B + decq I + jg .L15 + ALIGN_4 + +.L16: + decq J + jg .L12 + ALIGN_4 + +.L20: + testq $1, N + jle .L999 + + movq A, AO1 + + movq M, I + sarq $2, I + jle .L24 + ALIGN_4 + +.L23: +#ifdef HAVE_3DNOW + prefetchw (WPREFETCHSIZE + 0) * SIZE(B) + prefetchw (WPREFETCHSIZE + 8) * SIZE(B) +#endif + +#ifndef DOUBLE + movlps 0 * SIZE(AO1), %xmm0 + movhps 2 * SIZE(AO1), %xmm0 + movlps 4 * SIZE(AO1), %xmm1 + movhps 6 * SIZE(AO1), %xmm1 + + movaps %xmm0, 0 * SIZE(B) + movaps %xmm1, 4 * SIZE(B) +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 1 * SIZE(AO1), %xmm0 + movsd 2 * SIZE(AO1), %xmm1 + movhpd 3 * SIZE(AO1), %xmm1 + movsd 4 * SIZE(AO1), %xmm2 + movhpd 5 * SIZE(AO1), %xmm2 + movsd 6 * SIZE(AO1), %xmm3 + movhpd 7 * SIZE(AO1), %xmm3 + + movapd %xmm0, 0 * SIZE(B) + movapd %xmm1, 2 * SIZE(B) + movapd %xmm2, 4 * SIZE(B) + movapd %xmm3, 6 * SIZE(B) +#endif + +#if defined(PENTIUM4) || defined(GENERIC) || defined(NANO) + prefetcht0 RPREFETCHSIZE * SIZE(AO1) + prefetcht0 RPREFETCHSIZE * SIZE(AO2) + + prefetcht0 WPREFETCHSIZE * SIZE(B) +#endif + + addq $8 * SIZE, AO1 + addq $8 * SIZE, B + decq I + jg .L23 + ALIGN_4 + +.L24: + movq M, I + andq $3, I + jle .L999 + ALIGN_4 + +.L25: +#ifndef DOUBLE + movlps 0 * SIZE(AO1), %xmm0 + + movlps %xmm0, 0 * SIZE(B) +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 1 * SIZE(AO1), %xmm0 + + movapd %xmm0, 0 * SIZE(B) +#endif + + addq $2 * SIZE, AO1 + addq $2 * SIZE, B + decq I + jg .L25 + ALIGN_4 + + +.L999: +#ifdef WINDOWS_ABI + movups 0(%rsp), %xmm6 + movups 16(%rsp), %xmm7 + movups 32(%rsp), %xmm8 + movups 48(%rsp), %xmm9 + movups 64(%rsp), %xmm10 + movups 80(%rsp), %xmm11 + 
movups 96(%rsp), %xmm12 + movups 112(%rsp), %xmm13 + movups 128(%rsp), %xmm14 + movups 144(%rsp), %xmm15 + + addq $STACKSIZE, %rsp +#endif + + popq %r12 +#ifdef WINDOWS_ABI + popq %r13 + popq %r14 +#endif + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm_tcopy_1.S b/kernel/x86_64/zgemm_tcopy_1.S new file mode 100644 index 0000000000..b4348e60e0 --- /dev/null +++ b/kernel/x86_64/zgemm_tcopy_1.S @@ -0,0 +1,190 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef WINDOWS_ABI + +#define M ARG1 /* rdi */ +#define N ARG2 /* rsi */ +#define A ARG3 /* rdx */ +#define LDA ARG4 /* rcx */ +#define B ARG5 /* r8 */ + +#define I %r9 +#define J %r10 +#define AO1 %r11 +#define AO2 %r12 +#else + +#define STACKSIZE 256 + +#define M ARG1 /* rcx */ +#define N ARG2 /* rdx */ +#define A ARG3 /* r8 */ +#define LDA ARG4 /* r9 */ +#define OLD_B 40 + 24 + STACKSIZE(%rsp) + +#define B %r10 +#define I %r11 +#define J %r12 +#define AO1 %r13 +#define AO2 %r14 + +#endif + +#define RPREFETCHSIZE 4 +#define WPREFETCHSIZE 4 + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + pushq %r14 + pushq %r13 +#endif + pushq %r12 + +#ifdef WINDOWS_ABI + subq $STACKSIZE, %rsp + + movups %xmm6, 0(%rsp) + movups %xmm7, 16(%rsp) + movups %xmm8, 32(%rsp) + movups %xmm9, 48(%rsp) + movups %xmm10, 64(%rsp) + movups %xmm11, 80(%rsp) + movups %xmm12, 96(%rsp) + movups %xmm13, 112(%rsp) + movups %xmm14, 128(%rsp) + movups %xmm15, 144(%rsp) + + movq OLD_B, B +#endif + + salq $ZBASE_SHIFT, LDA + + testq N, N + movq N, J + jle .L999 + ALIGN_4 + +.L12: + movq A, AO1 + addq $2 * SIZE, A + + movq M, I + sarq $1, I + jle .L14 + ALIGN_4 + +.L13: +#ifndef DOUBLE + movsd 0 * SIZE(AO1), %xmm0 + movhps 0 * SIZE(AO1, LDA, 1), %xmm0 + + movaps %xmm0, 0 * SIZE(B) +#else + prefetcht0 RPREFETCHSIZE * SIZE(AO1) + + movsd 0 * SIZE(AO1), %xmm0 + movhpd 1 * SIZE(AO1), %xmm0 + + prefetcht0 RPREFETCHSIZE * SIZE(AO1, LDA) + + movsd 0 * SIZE(AO1, LDA), %xmm1 + movhpd 1 * SIZE(AO1, LDA), %xmm1 + + movapd %xmm0, 0 * SIZE(B) + movapd %xmm1, 2 * SIZE(B) + + prefetcht0 WPREFETCHSIZE * SIZE(B) +#endif + + leaq (AO1, LDA, 2), AO1 + addq $4 * SIZE, B + decq I + jg .L13 + ALIGN_4 + +.L14: + testq $1, M + jle .L16 + +#ifndef DOUBLE + movsd 0 * SIZE(AO1), %xmm0 + movsd %xmm0, 0 * SIZE(B) +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 1 * SIZE(AO1), %xmm0 + + movapd %xmm0, 0 * SIZE(B) +#endif + addq $2 * SIZE, B + ALIGN_4 + +.L16: + decq J + jg .L12 + ALIGN_4 + +.L999: +#ifdef WINDOWS_ABI + movups 0(%rsp), %xmm6 + movups 16(%rsp), %xmm7 + movups 32(%rsp), %xmm8 + movups 48(%rsp), %xmm9 + movups 64(%rsp), %xmm10 + movups 80(%rsp), %xmm11 + movups 96(%rsp), %xmm12 + movups 112(%rsp), %xmm13 + movups 128(%rsp), %xmm14 + movups 144(%rsp), %xmm15 + + addq $STACKSIZE, %rsp +#endif + + popq %r12 +#ifdef WINDOWS_ABI + popq %r13 + popq %r14 +#endif + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemm_tcopy_2.S b/kernel/x86_64/zgemm_tcopy_2.S new file mode 100644 index 0000000000..f83022d266 --- /dev/null +++ b/kernel/x86_64/zgemm_tcopy_2.S @@ -0,0 +1,432 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define RPREFETCHSIZE 32 +#define WPREFETCHSIZE 16 + +#ifndef WINDOWS_ABI + +#define M ARG1 /* rdi */ +#define N ARG2 /* rsi */ +#define A ARG3 /* rdx */ +#define LDA ARG4 /* rcx */ +#define B ARG5 /* r8 */ + +#define I %r9 +#define J %r10 + +#define AO1 %r11 +#define AO2 %r12 +#define BO1 %r13 +#define M8 %r14 +#define BO %rax + +#else + +#define STACKSIZE 256 + +#define M ARG1 /* rcx */ +#define N ARG2 /* rdx */ +#define A ARG3 /* r8 */ +#define LDA ARG4 /* r9 */ +#define OLD_B 40 + 48 + STACKSIZE(%rsp) + +#define B %r10 + +#define I %r11 +#define J %r12 + +#define AO1 %r13 +#define AO2 %r14 + +#define BO1 %rdi +#define M8 %rsi +#define BO %rax + +#endif + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + pushq %rdi + pushq %rsi +#endif + pushq %r14 + pushq %r13 + pushq %r12 + pushq %r11 + +#ifdef WINDOWS_ABI + subq $STACKSIZE, %rsp + + movups %xmm6, 0(%rsp) + movups %xmm7, 16(%rsp) + movups %xmm8, 32(%rsp) + movups %xmm9, 48(%rsp) + movups %xmm10, 64(%rsp) + movups %xmm11, 80(%rsp) + movups %xmm12, 96(%rsp) + movups %xmm13, 112(%rsp) + movups %xmm14, 128(%rsp) + movups %xmm15, 144(%rsp) + + movq OLD_B, B +#endif + + movq N, %rax + andq $-2, %rax + imulq M, %rax + + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 1), BO1 + + salq $ZBASE_SHIFT, LDA + + leaq (, M, SIZE), M8 + + movq M, J + sarq $1, J + jle .L20 + ALIGN_4 + +.L11: + movq A, AO1 + leaq (A, LDA ), AO2 + leaq (A, LDA, 2), A + + movq B, BO + addq $8 * SIZE, B + + movq N, I + sarq $2, I + jle .L13 + ALIGN_4 + +.L12: +#ifndef DOUBLE + movlps 0 * SIZE(AO1), %xmm0 + movhps 2 * SIZE(AO1), %xmm0 + movlps 4 * SIZE(AO1), %xmm1 + movhps 6 * SIZE(AO1), %xmm1 + + movlps 0 * SIZE(AO2), %xmm2 + movhps 2 * SIZE(AO2), %xmm2 + movlps 4 * SIZE(AO2), %xmm3 + movhps 6 * SIZE(AO2), %xmm3 + +#if defined(PENTIUM4) || defined(GENERIC) || defined(NANO) + prefetcht0 RPREFETCHSIZE * SIZE(AO1) + prefetcht0 RPREFETCHSIZE * SIZE(AO2) + prefetcht0 WPREFETCHSIZE * SIZE(BO) +#endif + +#ifdef HAVE_3DNOW + prefetchw (WPREFETCHSIZE + 0) * SIZE(BO) +#endif + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm2, 4 * SIZE(BO) + leaq (BO, M8, 4), BO + movaps %xmm1, 0 * SIZE(BO) + movaps %xmm3, 4 * SIZE(BO) +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 1 * SIZE(AO1), %xmm0 + movsd 2 * SIZE(AO1), %xmm1 + movhpd 3 * SIZE(AO1), %xmm1 + +#if defined(CORE2) || 
defined(PENRYN) || defined(DUNNINGTON) + prefetcht2 RPREFETCHSIZE * SIZE(AO1) +#endif + + movsd 4 * SIZE(AO1), %xmm2 + movhpd 5 * SIZE(AO1), %xmm2 + movsd 6 * SIZE(AO1), %xmm3 + movhpd 7 * SIZE(AO1), %xmm3 + + movsd 0 * SIZE(AO2), %xmm4 + movhpd 1 * SIZE(AO2), %xmm4 + movsd 2 * SIZE(AO2), %xmm5 + movhpd 3 * SIZE(AO2), %xmm5 + +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) + prefetcht2 RPREFETCHSIZE * SIZE(AO2) +#endif + + movsd 4 * SIZE(AO2), %xmm6 + movhpd 5 * SIZE(AO2), %xmm6 + movsd 6 * SIZE(AO2), %xmm7 + movhpd 7 * SIZE(AO2), %xmm7 + +#if defined(PENTIUM4) || defined(GENERIC) || defined(NANO) + prefetcht0 RPREFETCHSIZE * SIZE(AO1) + prefetcht0 RPREFETCHSIZE * SIZE(AO2) + prefetcht0 WPREFETCHSIZE * SIZE(BO) +#endif + +#ifdef HAVE_3DNOW + prefetchw (WPREFETCHSIZE + 0) * SIZE(BO) + prefetchw (WPREFETCHSIZE + 8) * SIZE(BO) +#endif + + movapd %xmm0, 0 * SIZE(BO) + movapd %xmm1, 2 * SIZE(BO) + movapd %xmm4, 4 * SIZE(BO) + movapd %xmm5, 6 * SIZE(BO) + +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) + prefetcht2 WPREFETCHSIZE * SIZE(BO) +#endif + leaq (BO, M8, 4), BO + + movapd %xmm2, 0 * SIZE(BO) + movapd %xmm3, 2 * SIZE(BO) + movapd %xmm6, 4 * SIZE(BO) + movapd %xmm7, 6 * SIZE(BO) +#endif + + addq $8 * SIZE, AO1 + addq $8 * SIZE, AO2 + leaq (BO, M8, 4), BO + decq I + jg .L12 + ALIGN_4 + +.L13: + testq $2, N + jle .L14 + +#ifndef DOUBLE + movlps 0 * SIZE(AO1), %xmm0 + movhps 2 * SIZE(AO1), %xmm0 + + movlps 0 * SIZE(AO2), %xmm1 + movhps 2 * SIZE(AO2), %xmm1 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 1 * SIZE(AO1), %xmm0 + movsd 2 * SIZE(AO1), %xmm1 + movhpd 3 * SIZE(AO1), %xmm1 + + movsd 0 * SIZE(AO2), %xmm2 + movhpd 1 * SIZE(AO2), %xmm2 + movsd 2 * SIZE(AO2), %xmm3 + movhpd 3 * SIZE(AO2), %xmm3 + + movapd %xmm0, 0 * SIZE(BO) + movapd %xmm1, 2 * SIZE(BO) + movapd %xmm2, 4 * SIZE(BO) + movapd %xmm3, 6 * SIZE(BO) +#endif + + addq $4 * SIZE, AO1 + addq $4 * SIZE, AO2 + leaq (BO, M8, 4), BO + ALIGN_4 + +.L14: + testq $1, N + jle .L19 + +#ifndef DOUBLE + movlps 0 * SIZE(AO1), %xmm0 + movhps 0 * SIZE(AO2), %xmm0 + + movaps %xmm0, 0 * SIZE(BO1) +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 1 * SIZE(AO1), %xmm0 + movsd 0 * SIZE(AO2), %xmm1 + movhpd 1 * SIZE(AO2), %xmm1 + + movapd %xmm0, 0 * SIZE(BO1) + movapd %xmm1, 2 * SIZE(BO1) +#endif + + addq $4 * SIZE, BO1 + ALIGN_4 + +.L19: + decq J + jg .L11 + ALIGN_4 + +.L20: + testq $1, M + jle .L999 + ALIGN_4 + +.L21: + movq A, AO1 + + movq B, BO + + movq N, I + sarq $2, I + jle .L23 + ALIGN_4 + +.L22: +#ifndef DOUBLE + movlps 0 * SIZE(AO1), %xmm0 + movhps 2 * SIZE(AO1), %xmm0 + + movlps 4 * SIZE(AO1), %xmm1 + movhps 6 * SIZE(AO1), %xmm1 + +#if defined(PENTIUM4) || defined(GENERIC) || defined(NANO) + prefetcht0 RPREFETCHSIZE * SIZE(AO1) + prefetcht0 WPREFETCHSIZE * SIZE(BO) +#endif + +#ifdef HAVE_3DNOW + prefetchw (WPREFETCHSIZE + 0) * SIZE(BO) +#endif + + movaps %xmm0, 0 * SIZE(BO) + leaq (BO, M8, 4), BO + movaps %xmm1, 0 * SIZE(BO) +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 1 * SIZE(AO1), %xmm0 + movsd 2 * SIZE(AO1), %xmm1 + movhpd 3 * SIZE(AO1), %xmm1 + + movsd 4 * SIZE(AO1), %xmm2 + movhpd 5 * SIZE(AO1), %xmm2 + movsd 6 * SIZE(AO1), %xmm3 + movhpd 7 * SIZE(AO1), %xmm3 + +#if defined(PENTIUM4) || defined(GENERIC) || defined(NANO) + prefetcht0 RPREFETCHSIZE * SIZE(AO1) + prefetcht0 WPREFETCHSIZE * SIZE(BO) +#endif + +#ifdef HAVE_3DNOW + prefetchw (WPREFETCHSIZE + 0) * SIZE(BO) +#endif + + movapd %xmm0, 0 * SIZE(BO) + movapd %xmm1, 2 * SIZE(BO) + + leaq (BO, M8, 4), 
BO + + movapd %xmm2, 0 * SIZE(BO) + movapd %xmm3, 2 * SIZE(BO) +#endif + + addq $8 * SIZE, AO1 + leaq (BO, M8, 4), BO + decq I + jg .L22 + ALIGN_4 + +.L23: + testq $2, N + jle .L24 + +#ifndef DOUBLE + movlps 0 * SIZE(AO1), %xmm0 + movhps 2 * SIZE(AO1), %xmm0 + + movaps %xmm0, 0 * SIZE(BO) +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 1 * SIZE(AO1), %xmm0 + movsd 2 * SIZE(AO1), %xmm1 + movhpd 3 * SIZE(AO1), %xmm1 + + movapd %xmm0, 0 * SIZE(BO) + movapd %xmm1, 2 * SIZE(BO) +#endif + + addq $4 * SIZE, AO1 + leaq (BO, M8, 4), BO + ALIGN_4 + +.L24: + testq $1, N + jle .L999 + +#ifndef DOUBLE + movlps 0 * SIZE(AO1), %xmm0 + + movlps %xmm0, 0 * SIZE(BO1) +#else + movsd 0 * SIZE(AO1), %xmm0 + movhpd 1 * SIZE(AO1), %xmm0 + + movapd %xmm0, 0 * SIZE(BO1) +#endif + ALIGN_4 + + +.L999: +#ifdef WINDOWS_ABI + movups 0(%rsp), %xmm6 + movups 16(%rsp), %xmm7 + movups 32(%rsp), %xmm8 + movups 48(%rsp), %xmm9 + movups 64(%rsp), %xmm10 + movups 80(%rsp), %xmm11 + movups 96(%rsp), %xmm12 + movups 112(%rsp), %xmm13 + movups 128(%rsp), %xmm14 + movups 144(%rsp), %xmm15 + + addq $STACKSIZE, %rsp +#endif + + popq %r11 + popq %r12 + popq %r13 + popq %r14 +#ifdef WINDOWS_ABI + popq %rsi + popq %rdi +#endif + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemv_n.S b/kernel/x86_64/zgemv_n.S new file mode 100644 index 0000000000..b584a53708 --- /dev/null +++ b/kernel/x86_64/zgemv_n.S @@ -0,0 +1,2701 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "l2param.h" + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_INCX 8 + STACKSIZE(%rsp) +#define OLD_Y 16 + STACKSIZE(%rsp) +#define OLD_INCY 24 + STACKSIZE(%rsp) +#define OLD_BUFFER 32 + STACKSIZE(%rsp) +#define ALPHA_R 48 (%rsp) +#define ALPHA_I 56 (%rsp) + +#define M %rdi +#define N %rsi +#define A %rcx +#define LDA %r8 +#define X %r9 +#define INCX %rdx +#define Y %rbp +#define INCY %r10 + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_LDA 56 + STACKSIZE(%rsp) +#define OLD_X 64 + STACKSIZE(%rsp) +#define OLD_INCX 72 + STACKSIZE(%rsp) +#define OLD_Y 80 + STACKSIZE(%rsp) +#define OLD_INCY 88 + STACKSIZE(%rsp) +#define OLD_BUFFER 96 + STACKSIZE(%rsp) +#define ALPHA_R 224 (%rsp) +#define ALPHA_I 232 (%rsp) + +#define M %rcx +#define N %rdx +#define A %r8 +#define LDA %r9 +#define X %rdi +#define INCX %rsi +#define Y %rbp +#define INCY %r10 + +#endif + +#define I %rax +#define A1 %r12 +#define A2 %r13 + +#define Y1 %r14 +#define BUFFER %r15 + +#define J %r11 + +#undef SUBPD + +#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) +#define SUBPD subpd +#else +#define SUBPD addpd +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq OLD_A, A + movq OLD_LDA, LDA + movq OLD_X, X + + movapd %xmm3, %xmm0 + movsd OLD_ALPHA_I, %xmm1 +#endif + + movq OLD_INCX, INCX + movq OLD_Y, Y + movq OLD_INCY, INCY + movq OLD_BUFFER, BUFFER + + salq $ZBASE_SHIFT, LDA + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + + movlpd %xmm0, ALPHA_R + movlpd %xmm1, ALPHA_I + + subq $-16 * SIZE, A + + testq M, M + jle .L999 + testq N, N + jle .L999 + ALIGN_3 + + movq BUFFER, Y1 + + pxor %xmm4, %xmm4 + + movq M, %rax + addq $8, %rax + sarq $3, %rax + ALIGN_3 + +.L01: + movapd %xmm4, 0 * SIZE(Y1) + movapd %xmm4, 2 * SIZE(Y1) + movapd %xmm4, 4 * SIZE(Y1) + movapd %xmm4, 6 * SIZE(Y1) + movapd %xmm4, 8 * SIZE(Y1) + movapd %xmm4, 10 * SIZE(Y1) + movapd %xmm4, 12 * SIZE(Y1) + movapd %xmm4, 14 * SIZE(Y1) + + subq $-16 * SIZE, Y1 + decq %rax + jg .L01 + ALIGN_3 + +.L10: +#ifdef ALIGNED_ACCESS + testq $SIZE, A + jne .L100 +#endif + +#if GEMV_UNROLL >= 4 + + cmpq $4, N + jl .L20 + ALIGN_3 + +.L11: + subq $4, N + + leaq 16 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 2), A2 + leaq (A, LDA, 4), A + + movsd 0 * SIZE(X), %xmm8 + movhpd 1 * SIZE(X), %xmm8 + addq INCX, X + movsd 0 * SIZE(X), %xmm10 + movhpd 1 * SIZE(X), %xmm10 + addq INCX, X + movsd 0 * SIZE(X), %xmm12 + movhpd 1 * SIZE(X), %xmm12 + addq INCX, X + movsd 0 * SIZE(X), %xmm14 + movhpd 1 * SIZE(X), %xmm14 + addq INCX, X + + pcmpeqb %xmm5, %xmm5 + psllq $63, %xmm5 + shufps $0xc0, %xmm5, %xmm5 + + pshufd $0x4e, %xmm8, %xmm9 + pshufd $0x4e, %xmm10, %xmm11 + pshufd $0x4e, %xmm12, %xmm13 + pshufd $0x4e, %xmm14, %xmm15 + +#ifdef HAVE_SSE3 + movddup ALPHA_R, %xmm6 + movddup ALPHA_I, %xmm7 +#else + movsd ALPHA_R, %xmm6 + unpcklpd %xmm6, %xmm6 + movsd ALPHA_I, %xmm7 + unpcklpd 
%xmm7, %xmm7 +#endif + + xorpd %xmm5, %xmm9 + xorpd %xmm5, %xmm11 + xorpd %xmm5, %xmm13 + xorpd %xmm5, %xmm15 + + mulpd %xmm6, %xmm8 + mulpd %xmm7, %xmm9 + mulpd %xmm6, %xmm10 + mulpd %xmm7, %xmm11 + + mulpd %xmm6, %xmm12 + mulpd %xmm7, %xmm13 + mulpd %xmm6, %xmm14 + mulpd %xmm7, %xmm15 + +#ifndef XCONJ + subpd %xmm9, %xmm8 + subpd %xmm11, %xmm10 + subpd %xmm13, %xmm12 + subpd %xmm15, %xmm14 +#else + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 + addpd %xmm13, %xmm12 + addpd %xmm15, %xmm14 +#endif + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0x44, %xmm8, %xmm8 + + pshufd $0xee, %xmm10, %xmm11 + pshufd $0x44, %xmm10, %xmm10 + + pshufd $0xee, %xmm12, %xmm13 + pshufd $0x44, %xmm12, %xmm12 + + pshufd $0xee, %xmm14, %xmm15 + pshufd $0x44, %xmm14, %xmm14 + +#ifndef CONJ + xorpd %xmm5, %xmm9 + xorpd %xmm5, %xmm11 + xorpd %xmm5, %xmm13 + xorpd %xmm5, %xmm15 +#else + xorpd %xmm5, %xmm8 + xorpd %xmm5, %xmm10 + xorpd %xmm5, %xmm12 + xorpd %xmm5, %xmm14 +#endif + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-10 * SIZE, Y1, %xmm3) + ALIGN_3 + + movq M, I + sarq $2, I + jle .L15 + + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + MOVUPS_A1(-14 * SIZE, A1, %xmm6) + + decq I + jle .L14 + ALIGN_3 + +.L13: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1) +#endif + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-12 * SIZE, A1, %xmm4) + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm8, %xmm6 + addpd %xmm6, %xmm1 + MOVUPS_A1(-10 * SIZE, A1, %xmm6) + + mulpd %xmm9, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm9, %xmm7 + SUBPD %xmm7, %xmm1 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm2 + MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4) + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm8, %xmm6 + addpd %xmm6, %xmm3 + MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm6) + + mulpd %xmm9, %xmm5 + SUBPD %xmm5, %xmm2 + mulpd %xmm9, %xmm7 + SUBPD %xmm7, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA) +#endif + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm10, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm4) + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm10, %xmm6 + addpd %xmm6, %xmm1 + MOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm6) + + mulpd %xmm11, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm11, %xmm7 + SUBPD %xmm7, %xmm1 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm10, %xmm4 + addpd %xmm4, %xmm2 + MOVUPS_A1(-16 * SIZE, A2, %xmm4) + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm10, %xmm6 + addpd %xmm6, %xmm3 + MOVUPS_A1(-14 * SIZE, A2, %xmm6) + + mulpd %xmm11, %xmm5 + SUBPD %xmm5, %xmm2 + mulpd %xmm11, %xmm7 + SUBPD %xmm7, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2) +#endif + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-12 * SIZE, A2, %xmm4) + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm1 + MOVUPS_A1(-10 * SIZE, A2, %xmm6) + + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm13, %xmm7 + SUBPD %xmm7, %xmm1 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm2 + MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4) + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm3 + MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm6) + + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm2 + mulpd %xmm13, %xmm7 + SUBPD %xmm7, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA) +#endif + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm14, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm4) + 
pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm1 + MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm6) + + mulpd %xmm15, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm15, %xmm7 + SUBPD %xmm7, %xmm1 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm14, %xmm4 + addpd %xmm4, %xmm2 + MOVUPS_A1( -8 * SIZE, A1, %xmm4) + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm3 + MOVUPS_A1( -6 * SIZE, A1, %xmm6) + + mulpd %xmm15, %xmm5 + SUBPD %xmm5, %xmm2 + mulpd %xmm15, %xmm7 + SUBPD %xmm7, %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) + MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L13 + ALIGN_3 + +.L14: + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-12 * SIZE, A1, %xmm4) + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm8, %xmm6 + addpd %xmm6, %xmm1 + MOVUPS_A1(-10 * SIZE, A1, %xmm6) + + mulpd %xmm9, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm9, %xmm7 + SUBPD %xmm7, %xmm1 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm2 + MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4) + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm8, %xmm6 + addpd %xmm6, %xmm3 + MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm6) + + mulpd %xmm9, %xmm5 + SUBPD %xmm5, %xmm2 + mulpd %xmm9, %xmm7 + SUBPD %xmm7, %xmm3 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm10, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm4) + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm10, %xmm6 + addpd %xmm6, %xmm1 + MOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm6) + + mulpd %xmm11, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm11, %xmm7 + SUBPD %xmm7, %xmm1 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm10, %xmm4 + addpd %xmm4, %xmm2 + MOVUPS_A1(-16 * SIZE, A2, %xmm4) + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm10, %xmm6 + addpd %xmm6, %xmm3 + MOVUPS_A1(-14 * SIZE, A2, %xmm6) + + mulpd %xmm11, %xmm5 + SUBPD %xmm5, %xmm2 + mulpd %xmm11, %xmm7 + SUBPD %xmm7, %xmm3 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-12 * SIZE, A2, %xmm4) + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm1 + MOVUPS_A1(-10 * SIZE, A2, %xmm6) + + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm13, %xmm7 + SUBPD %xmm7, %xmm1 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm2 + MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4) + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm3 + MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm6) + + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm2 + mulpd %xmm13, %xmm7 + SUBPD %xmm7, %xmm3 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm14, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm4) + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm1 + MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm6) + + mulpd %xmm15, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm15, %xmm7 + SUBPD %xmm7, %xmm1 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm14, %xmm4 + addpd %xmm4, %xmm2 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm3 + + mulpd %xmm15, %xmm5 + SUBPD %xmm5, %xmm2 + mulpd %xmm15, %xmm7 + SUBPD %xmm7, %xmm3 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-10 * SIZE, Y1, 
%xmm3) + + MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) + MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + ALIGN_3 + +.L15: + testq $2, M + je .L17 + + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + MOVUPS_A1(-14 * SIZE, A1, %xmm6) + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm4) + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm8, %xmm6 + addpd %xmm6, %xmm1 + MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm6) + + mulpd %xmm9, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm9, %xmm7 + SUBPD %xmm7, %xmm1 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm10, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-16 * SIZE, A2, %xmm4) + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm10, %xmm6 + addpd %xmm6, %xmm1 + MOVUPS_A1(-14 * SIZE, A2, %xmm6) + + mulpd %xmm11, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm11, %xmm7 + SUBPD %xmm7, %xmm1 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm4) + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm1 + MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm6) + + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm13, %xmm7 + SUBPD %xmm7, %xmm1 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm14, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm15, %xmm5 + SUBPD %xmm5, %xmm0 + + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm1 + mulpd %xmm15, %xmm7 + SUBPD %xmm7, %xmm1 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + movapd %xmm2, %xmm0 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L17: + testq $1, M + je .L19 + + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm6) + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-16 * SIZE, A2, %xmm4) + mulpd %xmm9, %xmm5 + SUBPD %xmm5, %xmm0 + + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm10, %xmm6 + addpd %xmm6, %xmm0 + MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm6) + mulpd %xmm11, %xmm7 + SUBPD %xmm7, %xmm0 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm0 + + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm15, %xmm7 + SUBPD %xmm7, %xmm0 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + ALIGN_3 + +.L19: + cmpq $4, N + jge .L11 + ALIGN_3 + +.L20: +#endif + +#if GEMV_UNROLL >= 2 + + cmpq $2, N + jl .L30 + +#if GEMV_UNROLL == 2 + ALIGN_3 + +.L21: +#endif + + subq $2, N + + leaq 16 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 1), A2 + leaq (A, LDA, 2), A + + movsd 0 * SIZE(X), %xmm12 + movhpd 1 * SIZE(X), %xmm12 + addq INCX, X + movsd 0 * SIZE(X), %xmm14 + movhpd 1 * SIZE(X), %xmm14 + addq INCX, X + + pcmpeqb %xmm11, %xmm11 + psllq $63, %xmm11 + shufps $0xc0, %xmm11, %xmm11 + + pshufd $0x4e, %xmm12, %xmm13 + pshufd $0x4e, %xmm14, %xmm15 + +#ifdef HAVE_SSE3 + movddup ALPHA_R, %xmm8 + movddup ALPHA_I, %xmm9 +#else + movsd ALPHA_R, %xmm8 + unpcklpd %xmm8, %xmm8 + movsd ALPHA_I, %xmm9 + unpcklpd %xmm9, %xmm9 +#endif + + xorpd %xmm11, %xmm13 + xorpd %xmm11, %xmm15 + + mulpd %xmm8, %xmm12 + mulpd %xmm9, %xmm13 + mulpd %xmm8, %xmm14 + mulpd %xmm9, %xmm15 + +#ifndef XCONJ + subpd %xmm13, %xmm12 + subpd %xmm15, %xmm14 +#else + addpd %xmm13, %xmm12 + addpd %xmm15, %xmm14 +#endif + + pshufd $0xee, %xmm12, %xmm13 + pshufd $0x44, %xmm12, %xmm12 + + pshufd $0xee, %xmm14, %xmm15 + pshufd $0x44, %xmm14, %xmm14 + +#ifndef CONJ + xorpd %xmm11, %xmm13 + xorpd 
%xmm11, %xmm15 +#else + xorpd %xmm11, %xmm12 + xorpd %xmm11, %xmm14 +#endif + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-10 * SIZE, Y1, %xmm3) + ALIGN_3 + + movq M, I + sarq $2, I + jle .L25 + + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + MOVUPS_A1(-14 * SIZE, A1, %xmm6) + MOVUPS_A1(-12 * SIZE, A1, %xmm8) + MOVUPS_A1(-10 * SIZE, A1, %xmm10) + + decq I + jle .L24 + ALIGN_3 + +.L23: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) +#endif + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-16 * SIZE, A2, %xmm4) + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm1 + MOVUPS_A1(-14 * SIZE, A2, %xmm6) + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm2 + MOVUPS_A1(-12 * SIZE, A2, %xmm8) + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm3 + MOVUPS_A1(-10 * SIZE, A2, %xmm10) + + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm13, %xmm7 + SUBPD %xmm7, %xmm1 + + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm2 + mulpd %xmm13, %xmm11 + SUBPD %xmm11, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) +#endif + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm14, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1( -8 * SIZE, A1, %xmm4) + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm1 + MOVUPS_A1( -6 * SIZE, A1, %xmm6) + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm14, %xmm8 + addpd %xmm8, %xmm2 + MOVUPS_A1( -4 * SIZE, A1, %xmm8) + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm14, %xmm10 + addpd %xmm10, %xmm3 + MOVUPS_A1( -2 * SIZE, A1, %xmm10) + + mulpd %xmm15, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm15, %xmm7 + SUBPD %xmm7, %xmm1 + + mulpd %xmm15, %xmm9 + SUBPD %xmm9, %xmm2 + mulpd %xmm15, %xmm11 + SUBPD %xmm11, %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 2 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) + MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L23 + ALIGN_3 + +.L24: + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1(-16 * SIZE, A2, %xmm4) + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm1 + MOVUPS_A1(-14 * SIZE, A2, %xmm6) + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm2 + MOVUPS_A1(-12 * SIZE, A2, %xmm8) + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm3 + MOVUPS_A1(-10 * SIZE, A2, %xmm10) + + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm13, %xmm7 + SUBPD %xmm7, %xmm1 + + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm2 + mulpd %xmm13, %xmm11 + SUBPD %xmm11, %xmm3 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm14, %xmm4 + addpd %xmm4, %xmm0 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm1 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm14, %xmm8 + addpd %xmm8, %xmm2 + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm14, %xmm10 + addpd %xmm10, %xmm3 + + mulpd %xmm15, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm15, %xmm7 + SUBPD %xmm7, %xmm1 + + mulpd %xmm15, %xmm9 + SUBPD %xmm9, %xmm2 + mulpd %xmm15, %xmm11 + SUBPD %xmm11, %xmm3 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-10 * 
SIZE, Y1, %xmm3) + + MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) + MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + ALIGN_3 + +.L25: + testq $2, M + je .L27 + + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + MOVUPS_A1(-14 * SIZE, A1, %xmm6) + MOVUPS_A1(-16 * SIZE, A2, %xmm8) + MOVUPS_A1(-14 * SIZE, A2, %xmm10) + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm1 + + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm13, %xmm7 + SUBPD %xmm7, %xmm1 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm14, %xmm8 + addpd %xmm8, %xmm0 + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm14, %xmm10 + addpd %xmm10, %xmm1 + + mulpd %xmm15, %xmm9 + SUBPD %xmm9, %xmm0 + mulpd %xmm15, %xmm11 + SUBPD %xmm11, %xmm1 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + movapd %xmm2, %xmm0 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L27: + testq $1, M +#if GEMV_UNROLL == 2 + je .L29 +#else + je .L30 +#endif + + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + MOVUPS_A1(-16 * SIZE, A2, %xmm6) + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm0 + + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm15, %xmm7 + SUBPD %xmm7, %xmm0 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + +#if GEMV_UNROLL == 2 + ALIGN_3 + +.L29: + cmpq $2, N + jge .L21 +#endif + ALIGN_3 + +.L30: +#endif + + cmpq $1, N + jl .L980 + +#if GEMV_UNROLL == 1 +.L31: + decq N +#endif + + leaq 16 * SIZE(BUFFER), Y1 + movq A, A1 +#if GEMV_UNROLL == 1 + addq LDA, A +#endif + + movsd 0 * SIZE(X), %xmm12 + movhpd 1 * SIZE(X), %xmm12 + addq INCX, X + + pcmpeqb %xmm11, %xmm11 + psllq $63, %xmm11 + shufps $0xc0, %xmm11, %xmm11 + + pshufd $0x4e, %xmm12, %xmm13 + +#ifdef HAVE_SSE3 + movddup ALPHA_R, %xmm8 + movddup ALPHA_I, %xmm9 +#else + movsd ALPHA_R, %xmm8 + unpcklpd %xmm8, %xmm8 + movsd ALPHA_I, %xmm9 + unpcklpd %xmm9, %xmm9 +#endif + + xorpd %xmm11, %xmm13 + + mulpd %xmm8, %xmm12 + mulpd %xmm9, %xmm13 + +#ifndef XCONJ + subpd %xmm13, %xmm12 +#else + addpd %xmm13, %xmm12 +#endif + + pshufd $0xee, %xmm12, %xmm13 + pshufd $0x44, %xmm12, %xmm12 + +#ifndef CONJ + xorpd %xmm11, %xmm13 +#else + xorpd %xmm11, %xmm12 +#endif + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-10 * SIZE, Y1, %xmm3) + + movq M, I + sarq $2, I + jle .L35 + + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + MOVUPS_A1(-14 * SIZE, A1, %xmm6) + MOVUPS_A1(-12 * SIZE, A1, %xmm8) + MOVUPS_A1(-10 * SIZE, A1, %xmm10) + + decq I + jle .L34 + ALIGN_3 + +.L33: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) +#endif + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + MOVUPS_A1( -8 * SIZE, A1, %xmm4) + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm1 + MOVUPS_A1( -6 * SIZE, A1, %xmm6) + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm2 + MOVUPS_A1( -4 * SIZE, A1, %xmm8) + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm3 + MOVUPS_A1( -2 * SIZE, A1, %xmm10) + + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm13, %xmm7 + SUBPD %xmm7, %xmm1 + + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm2 + mulpd %xmm13, %xmm11 + SUBPD %xmm11, %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 4 - 128 + PREOFFSET(Y1) +#endif + + 
MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) + MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L33 + ALIGN_3 + +.L34: + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm1 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm2 + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm3 + + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm13, %xmm7 + SUBPD %xmm7, %xmm1 + + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm2 + mulpd %xmm13, %xmm11 + SUBPD %xmm11, %xmm3 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) + MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, Y1 + ALIGN_3 + +.L35: + testq $2, M + je .L37 + + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + MOVUPS_A1(-14 * SIZE, A1, %xmm6) + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm1 + + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm13, %xmm7 + SUBPD %xmm7, %xmm1 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + movapd %xmm2, %xmm0 + + addq $4 * SIZE, A1 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L37: + testq $1, M +#if GEMV_UNROLL == 1 + je .L39 +#else + je .L980 +#endif + + MOVUPS_A1(-16 * SIZE, A1, %xmm4) + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm0 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + +#if GEMV_UNROLL == 1 + ALIGN_3 +.L39: + cmpq $1, N + jge .L31 +#endif + +#ifdef ALIGNED_ACCESS + + jmp .L980 + ALIGN_3 + +.L100: +#if GEMV_UNROLL >= 4 + + cmpq $4, N + jl .L110 + ALIGN_3 + +.L101: + subq $4, N + + leaq 16 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 2), A2 + leaq (A, LDA, 4), A + + movsd 0 * SIZE(X), %xmm8 + movhpd 1 * SIZE(X), %xmm8 + addq INCX, X + movsd 0 * SIZE(X), %xmm10 + movhpd 1 * SIZE(X), %xmm10 + addq INCX, X + movsd 0 * SIZE(X), %xmm12 + movhpd 1 * SIZE(X), %xmm12 + addq INCX, X + movsd 0 * SIZE(X), %xmm14 + movhpd 1 * SIZE(X), %xmm14 + addq INCX, X + + pcmpeqb %xmm5, %xmm5 + psllq $63, %xmm5 + shufps $0xc0, %xmm5, %xmm5 + + pshufd $0x4e, %xmm8, %xmm9 + pshufd $0x4e, %xmm10, %xmm11 + pshufd $0x4e, %xmm12, %xmm13 + pshufd $0x4e, %xmm14, %xmm15 + +#ifdef HAVE_SSE3 + movddup ALPHA_R, %xmm6 + movddup ALPHA_I, %xmm7 +#else + movsd ALPHA_R, %xmm6 + unpcklpd %xmm6, %xmm6 + movsd ALPHA_I, %xmm7 + unpcklpd %xmm7, %xmm7 +#endif + + xorpd %xmm5, %xmm9 + xorpd %xmm5, %xmm11 + xorpd %xmm5, %xmm13 + xorpd %xmm5, %xmm15 + + mulpd %xmm6, %xmm8 + mulpd %xmm7, %xmm9 + mulpd %xmm6, %xmm10 + mulpd %xmm7, %xmm11 + + mulpd %xmm6, %xmm12 + mulpd %xmm7, %xmm13 + mulpd %xmm6, %xmm14 + mulpd %xmm7, %xmm15 + +#ifndef XCONJ + subpd %xmm9, %xmm8 + subpd %xmm11, %xmm10 + subpd %xmm13, %xmm12 + subpd %xmm15, %xmm14 +#else + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 + addpd %xmm13, %xmm12 + addpd %xmm15, %xmm14 +#endif + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0x44, %xmm8, %xmm8 + + pshufd $0xee, %xmm10, %xmm11 + pshufd $0x44, %xmm10, %xmm10 + + pshufd 
$0xee, %xmm12, %xmm13 + pshufd $0x44, %xmm12, %xmm12 + + pshufd $0xee, %xmm14, %xmm15 + pshufd $0x44, %xmm14, %xmm14 + +#ifndef CONJ + xorpd %xmm5, %xmm9 + xorpd %xmm5, %xmm11 + xorpd %xmm5, %xmm13 + xorpd %xmm5, %xmm15 +#else + xorpd %xmm5, %xmm8 + xorpd %xmm5, %xmm10 + xorpd %xmm5, %xmm12 + xorpd %xmm5, %xmm14 +#endif + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-10 * SIZE, Y1, %xmm3) + ALIGN_3 + + movq M, I + sarq $2, I + jle .L105 + + movsd -16 * SIZE(A1), %xmm4 + movhpd -15 * SIZE(A1), %xmm4 + movsd -14 * SIZE(A1), %xmm6 + movhpd -13 * SIZE(A1), %xmm6 + + decq I + jle .L104 + ALIGN_3 + +.L103: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1) +#endif + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm0 + movsd -12 * SIZE(A1), %xmm4 + movhpd -11 * SIZE(A1), %xmm4 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm8, %xmm6 + addpd %xmm6, %xmm1 + movsd -10 * SIZE(A1), %xmm6 + movhpd -9 * SIZE(A1), %xmm6 + + mulpd %xmm9, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm9, %xmm7 + SUBPD %xmm7, %xmm1 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm2 + movsd -16 * SIZE(A1, LDA), %xmm4 + movhpd -15 * SIZE(A1, LDA), %xmm4 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm8, %xmm6 + addpd %xmm6, %xmm3 + movsd -14 * SIZE(A1, LDA), %xmm6 + movhpd -13 * SIZE(A1, LDA), %xmm6 + + mulpd %xmm9, %xmm5 + SUBPD %xmm5, %xmm2 + mulpd %xmm9, %xmm7 + SUBPD %xmm7, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA) +#endif + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm10, %xmm4 + addpd %xmm4, %xmm0 + movsd -12 * SIZE(A1, LDA), %xmm4 + movhpd -11 * SIZE(A1, LDA), %xmm4 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm10, %xmm6 + addpd %xmm6, %xmm1 + movsd -10 * SIZE(A1, LDA), %xmm6 + movhpd -9 * SIZE(A1, LDA), %xmm6 + + mulpd %xmm11, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm11, %xmm7 + SUBPD %xmm7, %xmm1 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm10, %xmm4 + addpd %xmm4, %xmm2 + movsd -16 * SIZE(A2), %xmm4 + movhpd -15 * SIZE(A2), %xmm4 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm10, %xmm6 + addpd %xmm6, %xmm3 + movsd -14 * SIZE(A2), %xmm6 + movhpd -13 * SIZE(A2), %xmm6 + + mulpd %xmm11, %xmm5 + SUBPD %xmm5, %xmm2 + mulpd %xmm11, %xmm7 + SUBPD %xmm7, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2) +#endif + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + movsd -12 * SIZE(A2), %xmm4 + movhpd -11 * SIZE(A2), %xmm4 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm1 + movsd -10 * SIZE(A2), %xmm6 + movhpd -9 * SIZE(A2), %xmm6 + + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm13, %xmm7 + SUBPD %xmm7, %xmm1 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm2 + movsd -16 * SIZE(A2, LDA), %xmm4 + movhpd -15 * SIZE(A2, LDA), %xmm4 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm3 + movsd -14 * SIZE(A2, LDA), %xmm6 + movhpd -13 * SIZE(A2, LDA), %xmm6 + + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm2 + mulpd %xmm13, %xmm7 + SUBPD %xmm7, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA) +#endif + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm14, %xmm4 + addpd %xmm4, %xmm0 + movsd -12 * SIZE(A2, LDA), %xmm4 + movhpd -11 * SIZE(A2, LDA), %xmm4 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm1 + movsd -10 * SIZE(A2, LDA), %xmm6 + movhpd -9 * SIZE(A2, LDA), %xmm6 + + mulpd %xmm15, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm15, %xmm7 + SUBPD %xmm7, %xmm1 + 
+ pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm14, %xmm4 + addpd %xmm4, %xmm2 + movsd -8 * SIZE(A1), %xmm4 + movhpd -7 * SIZE(A1), %xmm4 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm3 + movsd -6 * SIZE(A1), %xmm6 + movhpd -5 * SIZE(A1), %xmm6 + + mulpd %xmm15, %xmm5 + SUBPD %xmm5, %xmm2 + mulpd %xmm15, %xmm7 + SUBPD %xmm7, %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) + MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L103 + ALIGN_3 + +.L104: + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm0 + movsd -12 * SIZE(A1), %xmm4 + movhpd -11 * SIZE(A1), %xmm4 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm8, %xmm6 + addpd %xmm6, %xmm1 + movsd -10 * SIZE(A1), %xmm6 + movhpd -9 * SIZE(A1), %xmm6 + + mulpd %xmm9, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm9, %xmm7 + SUBPD %xmm7, %xmm1 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm2 + movsd -16 * SIZE(A1, LDA), %xmm4 + movhpd -15 * SIZE(A1, LDA), %xmm4 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm8, %xmm6 + addpd %xmm6, %xmm3 + movsd -14 * SIZE(A1, LDA), %xmm6 + movhpd -13 * SIZE(A1, LDA), %xmm6 + + mulpd %xmm9, %xmm5 + SUBPD %xmm5, %xmm2 + mulpd %xmm9, %xmm7 + SUBPD %xmm7, %xmm3 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm10, %xmm4 + addpd %xmm4, %xmm0 + movsd -12 * SIZE(A1, LDA), %xmm4 + movhpd -11 * SIZE(A1, LDA), %xmm4 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm10, %xmm6 + addpd %xmm6, %xmm1 + movsd -10 * SIZE(A1, LDA), %xmm6 + movhpd -9 * SIZE(A1, LDA), %xmm6 + + mulpd %xmm11, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm11, %xmm7 + SUBPD %xmm7, %xmm1 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm10, %xmm4 + addpd %xmm4, %xmm2 + movsd -16 * SIZE(A2), %xmm4 + movhpd -15 * SIZE(A2), %xmm4 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm10, %xmm6 + addpd %xmm6, %xmm3 + movsd -14 * SIZE(A2), %xmm6 + movhpd -13 * SIZE(A2), %xmm6 + + mulpd %xmm11, %xmm5 + SUBPD %xmm5, %xmm2 + mulpd %xmm11, %xmm7 + SUBPD %xmm7, %xmm3 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + movsd -12 * SIZE(A2), %xmm4 + movhpd -11 * SIZE(A2), %xmm4 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm1 + movsd -10 * SIZE(A2), %xmm6 + movhpd -9 * SIZE(A2), %xmm6 + + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm13, %xmm7 + SUBPD %xmm7, %xmm1 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm2 + movsd -16 * SIZE(A2, LDA), %xmm4 + movhpd -15 * SIZE(A2, LDA), %xmm4 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm3 + movsd -14 * SIZE(A2, LDA), %xmm6 + movhpd -13 * SIZE(A2, LDA), %xmm6 + + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm2 + mulpd %xmm13, %xmm7 + SUBPD %xmm7, %xmm3 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm14, %xmm4 + addpd %xmm4, %xmm0 + movsd -12 * SIZE(A2, LDA), %xmm4 + movhpd -11 * SIZE(A2, LDA), %xmm4 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm1 + movsd -10 * SIZE(A2, LDA), %xmm6 + movhpd -9 * SIZE(A2, LDA), %xmm6 + + mulpd %xmm15, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm15, %xmm7 + SUBPD %xmm7, %xmm1 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm14, %xmm4 + addpd %xmm4, %xmm2 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm3 + + 
mulpd %xmm15, %xmm5 + SUBPD %xmm5, %xmm2 + mulpd %xmm15, %xmm7 + SUBPD %xmm7, %xmm3 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) + MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + ALIGN_3 + +.L105: + testq $2, M + je .L107 + + movsd -16 * SIZE(A1), %xmm4 + movhpd -15 * SIZE(A1), %xmm4 + movsd -14 * SIZE(A1), %xmm6 + movhpd -13 * SIZE(A1), %xmm6 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm0 + movsd -16 * SIZE(A1, LDA), %xmm4 + movhpd -15 * SIZE(A1, LDA), %xmm4 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm8, %xmm6 + addpd %xmm6, %xmm1 + movsd -14 * SIZE(A1, LDA), %xmm6 + movhpd -13 * SIZE(A1, LDA), %xmm6 + + mulpd %xmm9, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm9, %xmm7 + SUBPD %xmm7, %xmm1 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm10, %xmm4 + addpd %xmm4, %xmm0 + movsd -16 * SIZE(A2), %xmm4 + movhpd -15 * SIZE(A2), %xmm4 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm10, %xmm6 + addpd %xmm6, %xmm1 + movsd -14 * SIZE(A2), %xmm6 + movhpd -13 * SIZE(A2), %xmm6 + + mulpd %xmm11, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm11, %xmm7 + SUBPD %xmm7, %xmm1 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + movsd -16 * SIZE(A2, LDA), %xmm4 + movhpd -15 * SIZE(A2, LDA), %xmm4 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm1 + movsd -14 * SIZE(A2, LDA), %xmm6 + movhpd -13 * SIZE(A2, LDA), %xmm6 + + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm13, %xmm7 + SUBPD %xmm7, %xmm1 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm14, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm15, %xmm5 + SUBPD %xmm5, %xmm0 + + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm1 + mulpd %xmm15, %xmm7 + SUBPD %xmm7, %xmm1 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + movapd %xmm2, %xmm0 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L107: + testq $1, M + je .L109 + + movsd -16 * SIZE(A1), %xmm4 + movhpd -15 * SIZE(A1), %xmm4 + movsd -16 * SIZE(A1, LDA), %xmm6 + movhpd -15 * SIZE(A1, LDA), %xmm6 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm0 + movsd -16 * SIZE(A2), %xmm4 + movhpd -15 * SIZE(A2), %xmm4 + mulpd %xmm9, %xmm5 + SUBPD %xmm5, %xmm0 + + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm10, %xmm6 + addpd %xmm6, %xmm0 + movsd -16 * SIZE(A2, LDA), %xmm6 + movhpd -15 * SIZE(A2, LDA), %xmm6 + mulpd %xmm11, %xmm7 + SUBPD %xmm7, %xmm0 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm0 + + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm15, %xmm7 + SUBPD %xmm7, %xmm0 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + ALIGN_3 + +.L109: + cmpq $4, N + jge .L101 + ALIGN_3 + +.L110: +#endif + +#if GEMV_UNROLL >= 2 + + cmpq $2, N + jl .L120 + +#if GEMV_UNROLL == 2 + ALIGN_3 + +.L111: +#endif + + subq $2, N + + leaq 16 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 1), A2 + leaq (A, LDA, 2), A + + movsd 0 * SIZE(X), %xmm12 + movhpd 1 * SIZE(X), %xmm12 + addq INCX, X + movsd 0 * SIZE(X), %xmm14 + movhpd 1 * SIZE(X), %xmm14 + addq INCX, X + + pcmpeqb %xmm11, %xmm11 + psllq $63, %xmm11 + shufps $0xc0, %xmm11, %xmm11 + + pshufd $0x4e, %xmm12, %xmm13 + pshufd $0x4e, %xmm14, %xmm15 + +#ifdef HAVE_SSE3 + movddup ALPHA_R, %xmm8 + movddup 
ALPHA_I, %xmm9 +#else + movsd ALPHA_R, %xmm8 + unpcklpd %xmm8, %xmm8 + movsd ALPHA_I, %xmm9 + unpcklpd %xmm9, %xmm9 +#endif + + xorpd %xmm11, %xmm13 + xorpd %xmm11, %xmm15 + + mulpd %xmm8, %xmm12 + mulpd %xmm9, %xmm13 + mulpd %xmm8, %xmm14 + mulpd %xmm9, %xmm15 + +#ifndef XCONJ + subpd %xmm13, %xmm12 + subpd %xmm15, %xmm14 +#else + addpd %xmm13, %xmm12 + addpd %xmm15, %xmm14 +#endif + + pshufd $0xee, %xmm12, %xmm13 + pshufd $0x44, %xmm12, %xmm12 + + pshufd $0xee, %xmm14, %xmm15 + pshufd $0x44, %xmm14, %xmm14 + +#ifndef CONJ + xorpd %xmm11, %xmm13 + xorpd %xmm11, %xmm15 +#else + xorpd %xmm11, %xmm12 + xorpd %xmm11, %xmm14 +#endif + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-10 * SIZE, Y1, %xmm3) + ALIGN_3 + + movq M, I + sarq $2, I + jle .L115 + + movsd -16 * SIZE(A1), %xmm4 + movhpd -15 * SIZE(A1), %xmm4 + movsd -14 * SIZE(A1), %xmm6 + movhpd -13 * SIZE(A1), %xmm6 + movsd -12 * SIZE(A1), %xmm8 + movhpd -11 * SIZE(A1), %xmm8 + movsd -10 * SIZE(A1), %xmm10 + movhpd -9 * SIZE(A1), %xmm10 + + decq I + jle .L114 + ALIGN_3 + +.L113: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) +#endif + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + movsd -16 * SIZE(A2), %xmm4 + movhpd -15 * SIZE(A2), %xmm4 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm1 + movsd -14 * SIZE(A2), %xmm6 + movhpd -13 * SIZE(A2), %xmm6 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm2 + movsd -12 * SIZE(A2), %xmm8 + movhpd -11 * SIZE(A2), %xmm8 + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm3 + movsd -10 * SIZE(A2), %xmm10 + movhpd -9 * SIZE(A2), %xmm10 + + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm13, %xmm7 + SUBPD %xmm7, %xmm1 + + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm2 + mulpd %xmm13, %xmm11 + SUBPD %xmm11, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) +#endif + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm14, %xmm4 + addpd %xmm4, %xmm0 + movsd -8 * SIZE(A1), %xmm4 + movhpd -7 * SIZE(A1), %xmm4 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm1 + movsd -6 * SIZE(A1), %xmm6 + movhpd -5 * SIZE(A1), %xmm6 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm14, %xmm8 + addpd %xmm8, %xmm2 + movsd -4 * SIZE(A1), %xmm8 + movhpd -3 * SIZE(A1), %xmm8 + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm14, %xmm10 + addpd %xmm10, %xmm3 + movsd -2 * SIZE(A1), %xmm10 + movhpd -1 * SIZE(A1), %xmm10 + + mulpd %xmm15, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm15, %xmm7 + SUBPD %xmm7, %xmm1 + + mulpd %xmm15, %xmm9 + SUBPD %xmm9, %xmm2 + mulpd %xmm15, %xmm11 + SUBPD %xmm11, %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 2 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) + MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L113 + ALIGN_3 + +.L114: + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + movsd -16 * SIZE(A2), %xmm4 + movhpd -15 * SIZE(A2), %xmm4 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm1 + movsd -14 * SIZE(A2), %xmm6 + movhpd -13 * SIZE(A2), %xmm6 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm2 + movsd 
-12 * SIZE(A2), %xmm8 + movhpd -11 * SIZE(A2), %xmm8 + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm3 + movsd -10 * SIZE(A2), %xmm10 + movhpd -9 * SIZE(A2), %xmm10 + + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm13, %xmm7 + SUBPD %xmm7, %xmm1 + + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm2 + mulpd %xmm13, %xmm11 + SUBPD %xmm11, %xmm3 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm14, %xmm4 + addpd %xmm4, %xmm0 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm1 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm14, %xmm8 + addpd %xmm8, %xmm2 + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm14, %xmm10 + addpd %xmm10, %xmm3 + + mulpd %xmm15, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm15, %xmm7 + SUBPD %xmm7, %xmm1 + + mulpd %xmm15, %xmm9 + SUBPD %xmm9, %xmm2 + mulpd %xmm15, %xmm11 + SUBPD %xmm11, %xmm3 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) + MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + ALIGN_3 + +.L115: + testq $2, M + je .L117 + + movsd -16 * SIZE(A1), %xmm4 + movhpd -15 * SIZE(A1), %xmm4 + movsd -14 * SIZE(A1), %xmm6 + movhpd -13 * SIZE(A1), %xmm6 + + movsd -16 * SIZE(A2), %xmm8 + movhpd -15 * SIZE(A2), %xmm8 + movsd -14 * SIZE(A2), %xmm10 + movhpd -13 * SIZE(A2), %xmm10 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm1 + + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm13, %xmm7 + SUBPD %xmm7, %xmm1 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm14, %xmm8 + addpd %xmm8, %xmm0 + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm14, %xmm10 + addpd %xmm10, %xmm1 + + mulpd %xmm15, %xmm9 + SUBPD %xmm9, %xmm0 + mulpd %xmm15, %xmm11 + SUBPD %xmm11, %xmm1 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + movapd %xmm2, %xmm0 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L117: + testq $1, M +#if GEMV_UNROLL == 2 + je .L119 +#else + je .L120 +#endif + + movsd -16 * SIZE(A1), %xmm4 + movhpd -15 * SIZE(A1), %xmm4 + movsd -16 * SIZE(A2), %xmm6 + movhpd -15 * SIZE(A2), %xmm6 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm0 + + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm15, %xmm7 + SUBPD %xmm7, %xmm0 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + +#if GEMV_UNROLL == 2 + ALIGN_3 + +.L119: + cmpq $2, N + jge .L111 +#endif + ALIGN_3 + +.L120: +#endif + + cmpq $1, N + jl .L980 + +#if GEMV_UNROLL == 1 +.L121: + decq N +#endif + + leaq 16 * SIZE(BUFFER), Y1 + movq A, A1 +#if GEMV_UNROLL == 1 + addq LDA, A +#endif + + movsd 0 * SIZE(X), %xmm12 + movhpd 1 * SIZE(X), %xmm12 + addq INCX, X + + pcmpeqb %xmm11, %xmm11 + psllq $63, %xmm11 + shufps $0xc0, %xmm11, %xmm11 + + pshufd $0x4e, %xmm12, %xmm13 + +#ifdef HAVE_SSE3 + movddup ALPHA_R, %xmm8 + movddup ALPHA_I, %xmm9 +#else + movsd ALPHA_R, %xmm8 + unpcklpd %xmm8, %xmm8 + movsd ALPHA_I, %xmm9 + unpcklpd %xmm9, %xmm9 +#endif + + xorpd %xmm11, %xmm13 + + mulpd %xmm8, %xmm12 + mulpd %xmm9, %xmm13 + +#ifndef XCONJ + subpd %xmm13, %xmm12 +#else + addpd %xmm13, %xmm12 +#endif + + pshufd $0xee, %xmm12, %xmm13 + pshufd $0x44, %xmm12, %xmm12 + +#ifndef CONJ + xorpd %xmm11, %xmm13 +#else + xorpd %xmm11, %xmm12 +#endif + + 
MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-10 * SIZE, Y1, %xmm3) + + movq M, I + sarq $2, I + jle .L125 + + movsd -16 * SIZE(A1), %xmm4 + movhpd -15 * SIZE(A1), %xmm4 + movsd -14 * SIZE(A1), %xmm6 + movhpd -13 * SIZE(A1), %xmm6 + movsd -12 * SIZE(A1), %xmm8 + movhpd -11 * SIZE(A1), %xmm8 + movsd -10 * SIZE(A1), %xmm10 + movhpd -9 * SIZE(A1), %xmm10 + + decq I + jle .L124 + ALIGN_3 + +.L123: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) +#endif + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + movsd -8 * SIZE(A1), %xmm4 + movhpd -7 * SIZE(A1), %xmm4 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm1 + movsd -6 * SIZE(A1), %xmm6 + movhpd -5 * SIZE(A1), %xmm6 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm2 + movsd -4 * SIZE(A1), %xmm8 + movhpd -3 * SIZE(A1), %xmm8 + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm3 + movsd -2 * SIZE(A1), %xmm10 + movhpd -1 * SIZE(A1), %xmm10 + + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm13, %xmm7 + SUBPD %xmm7, %xmm1 + + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm2 + mulpd %xmm13, %xmm11 + SUBPD %xmm11, %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 4 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) + MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L123 + ALIGN_3 + +.L124: + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm1 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm2 + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm3 + + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm13, %xmm7 + SUBPD %xmm7, %xmm1 + + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm2 + mulpd %xmm13, %xmm11 + SUBPD %xmm11, %xmm3 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) + MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, Y1 + ALIGN_3 + +.L125: + testq $2, M + je .L127 + + + movsd -16 * SIZE(A1), %xmm4 + movhpd -15 * SIZE(A1), %xmm4 + movsd -14 * SIZE(A1), %xmm6 + movhpd -13 * SIZE(A1), %xmm6 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm1 + + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm13, %xmm7 + SUBPD %xmm7, %xmm1 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + movapd %xmm2, %xmm0 + + addq $4 * SIZE, A1 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L127: + testq $1, M +#if GEMV_UNROLL == 1 + je .L129 +#else + je .L980 +#endif + + movsd -16 * SIZE(A1), %xmm4 + movhpd -15 * SIZE(A1), %xmm4 + + pshufd $0x4e, %xmm4, %xmm5 + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm0 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + +#if GEMV_UNROLL == 1 + ALIGN_3 +.L129: + cmpq $1, N + jge .L121 +#endif + + +#endif + ALIGN_3 + +.L980: + testq $SIZE, Y + jne .L990 + + movq Y, Y1 + + 
movq M, %rax + sarq $3, %rax + jle .L184 + ALIGN_3 + +.L182: + movapd (Y), %xmm0 + addq INCY, Y + movapd (Y), %xmm1 + addq INCY, Y + movapd (Y), %xmm2 + addq INCY, Y + movapd (Y), %xmm3 + addq INCY, Y + movapd (Y), %xmm4 + addq INCY, Y + movapd (Y), %xmm5 + addq INCY, Y + movapd (Y), %xmm6 + addq INCY, Y + movapd (Y), %xmm7 + addq INCY, Y + + addpd 0 * SIZE(BUFFER), %xmm0 + addpd 2 * SIZE(BUFFER), %xmm1 + addpd 4 * SIZE(BUFFER), %xmm2 + addpd 6 * SIZE(BUFFER), %xmm3 + addpd 8 * SIZE(BUFFER), %xmm4 + addpd 10 * SIZE(BUFFER), %xmm5 + addpd 12 * SIZE(BUFFER), %xmm6 + addpd 14 * SIZE(BUFFER), %xmm7 + + movapd %xmm0, (Y1) + addq INCY, Y1 + movapd %xmm1, (Y1) + addq INCY, Y1 + movapd %xmm2, (Y1) + addq INCY, Y1 + movapd %xmm3, (Y1) + addq INCY, Y1 + movapd %xmm4, (Y1) + addq INCY, Y1 + movapd %xmm5, (Y1) + addq INCY, Y1 + movapd %xmm6, (Y1) + addq INCY, Y1 + movapd %xmm7, (Y1) + addq INCY, Y1 + + subq $-16 * SIZE, BUFFER + decq %rax + jg .L182 + ALIGN_3 + +.L184: + testq $7, M + jle .L999 + + testq $4, M + jle .L185 + + movapd (Y), %xmm0 + addq INCY, Y + movapd (Y), %xmm1 + addq INCY, Y + movapd (Y), %xmm2 + addq INCY, Y + movapd (Y), %xmm3 + addq INCY, Y + + addpd 0 * SIZE(BUFFER), %xmm0 + addpd 2 * SIZE(BUFFER), %xmm1 + addpd 4 * SIZE(BUFFER), %xmm2 + addpd 6 * SIZE(BUFFER), %xmm3 + + movapd %xmm0, (Y1) + addq INCY, Y1 + movapd %xmm1, (Y1) + addq INCY, Y1 + movapd %xmm2, (Y1) + addq INCY, Y1 + movapd %xmm3, (Y1) + addq INCY, Y1 + + addq $8 * SIZE, BUFFER + ALIGN_3 + +.L185: + testq $2, M + jle .L186 + + movapd (Y), %xmm0 + addq INCY, Y + movapd (Y), %xmm1 + addq INCY, Y + addpd 0 * SIZE(BUFFER), %xmm0 + addpd 2 * SIZE(BUFFER), %xmm1 + + movapd %xmm0, (Y1) + addq INCY, Y1 + movapd %xmm1, (Y1) + addq INCY, Y1 + + addq $4 * SIZE, BUFFER + ALIGN_3 + +.L186: + testq $1, M + jle .L999 + + movapd (Y), %xmm0 + + addpd (BUFFER), %xmm0 + + movapd %xmm0, (Y1) + jmp .L999 + ALIGN_3 + +.L990: + movq Y, Y1 + + movq M, %rax + sarq $3, %rax + jle .L994 + ALIGN_3 + +.L992: + movsd 0 * SIZE(Y), %xmm0 + movhpd 1 * SIZE(Y), %xmm0 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm1 + movhpd 1 * SIZE(Y), %xmm1 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm2 + movhpd 1 * SIZE(Y), %xmm2 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm3 + movhpd 1 * SIZE(Y), %xmm3 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm4 + movhpd 1 * SIZE(Y), %xmm4 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm5 + movhpd 1 * SIZE(Y), %xmm5 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm6 + movhpd 1 * SIZE(Y), %xmm6 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm7 + movhpd 1 * SIZE(Y), %xmm7 + addq INCY, Y + + addpd 0 * SIZE(BUFFER), %xmm0 + addpd 2 * SIZE(BUFFER), %xmm1 + addpd 4 * SIZE(BUFFER), %xmm2 + addpd 6 * SIZE(BUFFER), %xmm3 + addpd 8 * SIZE(BUFFER), %xmm4 + addpd 10 * SIZE(BUFFER), %xmm5 + addpd 12 * SIZE(BUFFER), %xmm6 + addpd 14 * SIZE(BUFFER), %xmm7 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm1, 0 * SIZE(Y1) + movhpd %xmm1, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm2, 0 * SIZE(Y1) + movhpd %xmm2, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm3, 0 * SIZE(Y1) + movhpd %xmm3, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm4, 0 * SIZE(Y1) + movhpd %xmm4, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm5, 0 * SIZE(Y1) + movhpd %xmm5, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm6, 0 * SIZE(Y1) + movhpd %xmm6, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm7, 0 * SIZE(Y1) + movhpd %xmm7, 1 * SIZE(Y1) + addq INCY, Y1 + + subq $-16 * SIZE, BUFFER + decq %rax + jg .L992 + ALIGN_3 + +.L994: + testq $7, M + jle .L999 + + testq $4, M + jle 
.L995 + + movsd 0 * SIZE(Y), %xmm0 + movhpd 1 * SIZE(Y), %xmm0 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm1 + movhpd 1 * SIZE(Y), %xmm1 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm2 + movhpd 1 * SIZE(Y), %xmm2 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm3 + movhpd 1 * SIZE(Y), %xmm3 + addq INCY, Y + + addpd 0 * SIZE(BUFFER), %xmm0 + addpd 2 * SIZE(BUFFER), %xmm1 + addpd 4 * SIZE(BUFFER), %xmm2 + addpd 6 * SIZE(BUFFER), %xmm3 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm1, 0 * SIZE(Y1) + movhpd %xmm1, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm2, 0 * SIZE(Y1) + movhpd %xmm2, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm3, 0 * SIZE(Y1) + movhpd %xmm3, 1 * SIZE(Y1) + addq INCY, Y1 + + addq $8 * SIZE, BUFFER + ALIGN_3 + +.L995: + testq $2, M + jle .L996 + + movsd 0 * SIZE(Y), %xmm0 + movhpd 1 * SIZE(Y), %xmm0 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm1 + movhpd 1 * SIZE(Y), %xmm1 + addq INCY, Y + + addpd 0 * SIZE(BUFFER), %xmm0 + addpd 2 * SIZE(BUFFER), %xmm1 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm1, 0 * SIZE(Y1) + movhpd %xmm1, 1 * SIZE(Y1) + addq INCY, Y1 + + addq $4 * SIZE, BUFFER + ALIGN_3 + +.L996: + testq $1, M + jle .L999 + + movsd 0 * SIZE(Y), %xmm0 + movhpd 1 * SIZE(Y), %xmm0 + + addpd 0 * SIZE(BUFFER), %xmm0 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + ALIGN_3 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemv_n_atom.S b/kernel/x86_64/zgemv_n_atom.S new file mode 100644 index 0000000000..289c07670b --- /dev/null +++ b/kernel/x86_64/zgemv_n_atom.S @@ -0,0 +1,1142 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "l2param.h" + +#ifdef ATOM +#define PREFETCH prefetchnta +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (8 * 6) +#endif + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_INCX 8 + STACKSIZE(%rsp) +#define OLD_Y 16 + STACKSIZE(%rsp) +#define OLD_INCY 24 + STACKSIZE(%rsp) +#define OLD_BUFFER 32 + STACKSIZE(%rsp) + +#define M %rdi +#define N %rsi +#define A %rcx +#define LDA %r8 +#define X %r9 +#define INCX %rdx +#define Y %rbp +#define INCY %r10 + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_LDA 56 + STACKSIZE(%rsp) +#define OLD_X 64 + STACKSIZE(%rsp) +#define OLD_INCX 72 + STACKSIZE(%rsp) +#define OLD_Y 80 + STACKSIZE(%rsp) +#define OLD_INCY 88 + STACKSIZE(%rsp) +#define OLD_BUFFER 96 + STACKSIZE(%rsp) + +#define M %rcx +#define N %rdx +#define A %r8 +#define LDA %r9 +#define X %rdi +#define INCX %rsi +#define Y %rbp +#define INCY %r10 + +#endif + +#define I %rax +#define J %r11 +#define A1 %r12 +#define A2 %r13 + +#define Y1 %r14 +#define BUFFER %r15 + +#define ALPHA_R %xmm14 +#define ALPHA_I %xmm15 + +#if !defined(CONJ) && !defined(XCONJ) +#define ADD1 addsd +#define ADD2 addsd +#define ADD3 subsd +#define ADD4 addsd +#endif + +#if defined(CONJ) && !defined(XCONJ) +#define ADD1 addsd +#define ADD2 addsd +#define ADD3 addsd +#define ADD4 subsd +#endif + +#if !defined(CONJ) && defined(XCONJ) +#define ADD1 addsd +#define ADD2 subsd +#define ADD3 addsd +#define ADD4 addsd +#endif + +#if defined(CONJ) && defined(XCONJ) +#define ADD1 addsd +#define ADD2 subsd +#define ADD3 subsd +#define ADD4 subsd +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq OLD_A, A + movq OLD_LDA, LDA + movq OLD_X, X + + movapd %xmm3, %xmm0 + movsd OLD_ALPHA_I, %xmm1 +#endif + + movq OLD_INCX, INCX + movq OLD_Y, Y + movq OLD_INCY, INCY + movq OLD_BUFFER, BUFFER + + salq $ZBASE_SHIFT, LDA + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + + movaps %xmm0, ALPHA_R + movaps %xmm1, ALPHA_I + + subq $-16 * SIZE, A + + testq M, M + jle .L999 + testq N, N + jle .L999 + ALIGN_3 + + movq BUFFER, Y1 + + 
pxor %xmm4, %xmm4 + + movq M, %rax + addq $8, %rax + sarq $3, %rax + ALIGN_3 + +.L01: + movapd %xmm4, 0 * SIZE(Y1) + movapd %xmm4, 2 * SIZE(Y1) + movapd %xmm4, 4 * SIZE(Y1) + movapd %xmm4, 6 * SIZE(Y1) + movapd %xmm4, 8 * SIZE(Y1) + movapd %xmm4, 10 * SIZE(Y1) + movapd %xmm4, 12 * SIZE(Y1) + movapd %xmm4, 14 * SIZE(Y1) + + subq $-16 * SIZE, Y1 + decq %rax + jg .L01 + ALIGN_3 + +.L10: + movq N, J + sarq $1, J + jle .L20 + ALIGN_3 + +.L11: + leaq 16 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 1), A2 + leaq (A, LDA, 2), A + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + addq INCX, X + movsd 0 * SIZE(X), %xmm6 + movsd 1 * SIZE(X), %xmm7 + addq INCX, X + + movapd %xmm4, %xmm8 + mulsd ALPHA_R, %xmm4 + mulsd ALPHA_I, %xmm8 + movapd %xmm6, %xmm10 + mulsd ALPHA_R, %xmm6 + mulsd ALPHA_I, %xmm10 + + movapd %xmm5, %xmm9 + mulsd ALPHA_I, %xmm9 + mulsd ALPHA_R, %xmm5 + movapd %xmm7, %xmm11 + mulsd ALPHA_I, %xmm11 + mulsd ALPHA_R, %xmm7 + +#ifndef XCONJ + subsd %xmm9, %xmm4 + addsd %xmm8, %xmm5 + subsd %xmm11, %xmm6 + addsd %xmm10, %xmm7 +#else + addsd %xmm9, %xmm4 + subsd %xmm8, %xmm5 + addsd %xmm11, %xmm6 + subsd %xmm10, %xmm7 +#endif + + movsd -16 * SIZE(Y1), %xmm0 + movsd -15 * SIZE(Y1), %xmm1 + movsd -14 * SIZE(Y1), %xmm2 + movsd -13 * SIZE(Y1), %xmm3 + ALIGN_3 + + movq M, I + sarq $2, I + jle .L15 + + movsd -16 * SIZE(A1), %xmm8 + movsd -15 * SIZE(A1), %xmm9 + movsd -14 * SIZE(A1), %xmm10 + movsd -13 * SIZE(A1), %xmm11 + + movapd %xmm8, %xmm12 + mulsd %xmm4, %xmm8 + mulsd %xmm5, %xmm12 + + movapd %xmm10, %xmm13 + mulsd %xmm4, %xmm10 + ADD1 %xmm8, %xmm0 + movsd -16 * SIZE(A2), %xmm8 + mulsd %xmm5, %xmm13 + ADD2 %xmm12, %xmm1 + + decq I + jle .L14 + ALIGN_3 + +.L13: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) * SIZE(A2) +#endif + + movapd %xmm9, %xmm12 + mulsd %xmm5, %xmm9 + ADD1 %xmm10, %xmm2 + movsd -14 * SIZE(A2), %xmm10 + mulsd %xmm4, %xmm12 + ADD2 %xmm13, %xmm3 + + movapd %xmm11, %xmm13 + mulsd %xmm5, %xmm11 + ADD3 %xmm9, %xmm0 + movsd -15 * SIZE(A2), %xmm9 + mulsd %xmm4, %xmm13 + ADD4 %xmm12, %xmm1 + + movapd %xmm8, %xmm12 + mulsd %xmm6, %xmm8 + ADD3 %xmm11, %xmm2 + movsd -13 * SIZE(A2), %xmm11 + mulsd %xmm7, %xmm12 + ADD4 %xmm13, %xmm3 + + movapd %xmm10, %xmm13 + mulsd %xmm6, %xmm10 + ADD1 %xmm8, %xmm0 + movsd -12 * SIZE(A1), %xmm8 + mulsd %xmm7, %xmm13 + ADD2 %xmm12, %xmm1 + + movapd %xmm9, %xmm12 + mulsd %xmm7, %xmm9 + ADD1 %xmm10, %xmm2 + movsd -10 * SIZE(A1), %xmm10 + mulsd %xmm6, %xmm12 + ADD2 %xmm13, %xmm3 + + movapd %xmm11, %xmm13 + mulsd %xmm7, %xmm11 + ADD3 %xmm9, %xmm0 + movsd -11 * SIZE(A1), %xmm9 + mulsd %xmm6, %xmm13 + ADD4 %xmm12, %xmm1 + + movapd %xmm8, %xmm12 + movlpd %xmm0, -16 * SIZE(Y1) + mulsd %xmm4, %xmm8 + movsd -12 * SIZE(Y1), %xmm0 + ADD3 %xmm11, %xmm2 + movsd -9 * SIZE(A1), %xmm11 + mulsd %xmm5, %xmm12 + movlpd %xmm1, -15 * SIZE(Y1) + ADD4 %xmm13, %xmm3 + movsd -11 * SIZE(Y1), %xmm1 + + movapd %xmm10, %xmm13 + movlpd %xmm2, -14 * SIZE(Y1) + mulsd %xmm4, %xmm10 + movlpd %xmm3, -13 * SIZE(Y1) + ADD1 %xmm8, %xmm0 + movsd -12 * SIZE(A2), %xmm8 + mulsd %xmm5, %xmm13 + movsd -10 * SIZE(Y1), %xmm2 + ADD2 %xmm12, %xmm1 + movsd -9 * SIZE(Y1), %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) * SIZE(A1) +#endif + + movapd %xmm9, %xmm12 + mulsd %xmm5, %xmm9 + ADD1 %xmm10, %xmm2 + movsd -10 * SIZE(A2), %xmm10 + mulsd %xmm4, %xmm12 + ADD2 %xmm13, %xmm3 + + movapd %xmm11, %xmm13 + mulsd %xmm5, %xmm11 + ADD3 %xmm9, %xmm0 + movsd -11 * SIZE(A2), %xmm9 + mulsd %xmm4, %xmm13 + ADD4 %xmm12, %xmm1 + + movapd %xmm8, %xmm12 + mulsd %xmm6, %xmm8 + ADD3 %xmm11, %xmm2 + 
movsd -9 * SIZE(A2), %xmm11 + mulsd %xmm7, %xmm12 + ADD4 %xmm13, %xmm3 + + movapd %xmm10, %xmm13 + mulsd %xmm6, %xmm10 + ADD1 %xmm8, %xmm0 + movsd -8 * SIZE(A1), %xmm8 + mulsd %xmm7, %xmm13 + ADD2 %xmm12, %xmm1 + + movapd %xmm9, %xmm12 + mulsd %xmm7, %xmm9 + ADD1 %xmm10, %xmm2 + movsd -6 * SIZE(A1), %xmm10 + mulsd %xmm6, %xmm12 + ADD2 %xmm13, %xmm3 + + movapd %xmm11, %xmm13 + mulsd %xmm7, %xmm11 + ADD3 %xmm9, %xmm0 + movsd -7 * SIZE(A1), %xmm9 + mulsd %xmm6, %xmm13 + ADD4 %xmm12, %xmm1 + + movapd %xmm8, %xmm12 + movlpd %xmm0, -12 * SIZE(Y1) + mulsd %xmm4, %xmm8 + movsd -8 * SIZE(Y1), %xmm0 + ADD3 %xmm11, %xmm2 + movsd -5 * SIZE(A1), %xmm11 + mulsd %xmm5, %xmm12 + movlpd %xmm1, -11 * SIZE(Y1) + ADD4 %xmm13, %xmm3 + movsd -7 * SIZE(Y1), %xmm1 + + movapd %xmm10, %xmm13 + movlpd %xmm2, -10 * SIZE(Y1) + mulsd %xmm4, %xmm10 + movsd -6 * SIZE(Y1), %xmm2 + ADD1 %xmm8, %xmm0 + movsd -8 * SIZE(A2), %xmm8 + mulsd %xmm5, %xmm13 + movlpd %xmm3, -9 * SIZE(Y1) + ADD2 %xmm12, %xmm1 + movsd -5 * SIZE(Y1), %xmm3 + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L13 + ALIGN_3 + +.L14: + movapd %xmm9, %xmm12 + mulsd %xmm5, %xmm9 + ADD1 %xmm10, %xmm2 + movsd -14 * SIZE(A2), %xmm10 + mulsd %xmm4, %xmm12 + ADD2 %xmm13, %xmm3 + + movapd %xmm11, %xmm13 + mulsd %xmm5, %xmm11 + ADD3 %xmm9, %xmm0 + movsd -15 * SIZE(A2), %xmm9 + mulsd %xmm4, %xmm13 + ADD4 %xmm12, %xmm1 + + movapd %xmm8, %xmm12 + mulsd %xmm6, %xmm8 + ADD3 %xmm11, %xmm2 + movsd -13 * SIZE(A2), %xmm11 + mulsd %xmm7, %xmm12 + ADD4 %xmm13, %xmm3 + + movapd %xmm10, %xmm13 + mulsd %xmm6, %xmm10 + ADD1 %xmm8, %xmm0 + movsd -12 * SIZE(A1), %xmm8 + mulsd %xmm7, %xmm13 + ADD2 %xmm12, %xmm1 + + movapd %xmm9, %xmm12 + mulsd %xmm7, %xmm9 + ADD1 %xmm10, %xmm2 + movsd -10 * SIZE(A1), %xmm10 + mulsd %xmm6, %xmm12 + ADD2 %xmm13, %xmm3 + + movapd %xmm11, %xmm13 + mulsd %xmm7, %xmm11 + ADD3 %xmm9, %xmm0 + movsd -11 * SIZE(A1), %xmm9 + mulsd %xmm6, %xmm13 + ADD4 %xmm12, %xmm1 + + movapd %xmm8, %xmm12 + movlpd %xmm0, -16 * SIZE(Y1) + mulsd %xmm4, %xmm8 + movsd -12 * SIZE(Y1), %xmm0 + ADD3 %xmm11, %xmm2 + movsd -9 * SIZE(A1), %xmm11 + mulsd %xmm5, %xmm12 + movlpd %xmm1, -15 * SIZE(Y1) + ADD4 %xmm13, %xmm3 + movsd -11 * SIZE(Y1), %xmm1 + + movapd %xmm10, %xmm13 + movlpd %xmm2, -14 * SIZE(Y1) + mulsd %xmm4, %xmm10 + movlpd %xmm3, -13 * SIZE(Y1) + ADD1 %xmm8, %xmm0 + movsd -12 * SIZE(A2), %xmm8 + mulsd %xmm5, %xmm13 + movsd -10 * SIZE(Y1), %xmm2 + ADD2 %xmm12, %xmm1 + movsd -9 * SIZE(Y1), %xmm3 + + movapd %xmm9, %xmm12 + mulsd %xmm5, %xmm9 + ADD1 %xmm10, %xmm2 + movsd -10 * SIZE(A2), %xmm10 + mulsd %xmm4, %xmm12 + ADD2 %xmm13, %xmm3 + + movapd %xmm11, %xmm13 + mulsd %xmm5, %xmm11 + ADD3 %xmm9, %xmm0 + movsd -11 * SIZE(A2), %xmm9 + mulsd %xmm4, %xmm13 + ADD4 %xmm12, %xmm1 + + movapd %xmm8, %xmm12 + mulsd %xmm6, %xmm8 + ADD3 %xmm11, %xmm2 + movsd -9 * SIZE(A2), %xmm11 + mulsd %xmm7, %xmm12 + ADD4 %xmm13, %xmm3 + + movapd %xmm10, %xmm13 + mulsd %xmm6, %xmm10 + ADD1 %xmm8, %xmm0 + mulsd %xmm7, %xmm13 + ADD2 %xmm12, %xmm1 + + movapd %xmm9, %xmm12 + mulsd %xmm7, %xmm9 + ADD1 %xmm10, %xmm2 + mulsd %xmm6, %xmm12 + ADD2 %xmm13, %xmm3 + + movapd %xmm11, %xmm13 + mulsd %xmm7, %xmm11 + ADD3 %xmm9, %xmm0 + mulsd %xmm6, %xmm13 + ADD4 %xmm12, %xmm1 + + ADD3 %xmm11, %xmm2 + movlpd %xmm0, -12 * SIZE(Y1) + movsd -8 * SIZE(Y1), %xmm0 + ADD4 %xmm13, %xmm3 + movlpd %xmm1, -11 * SIZE(Y1) + movsd -7 * SIZE(Y1), %xmm1 + + movlpd %xmm2, -10 * SIZE(Y1) + movsd -6 * SIZE(Y1), %xmm2 + movlpd %xmm3, -9 * SIZE(Y1) + movsd -5 * SIZE(Y1), %xmm3 + + subq $-8 * 
SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + ALIGN_3 + +.L15: + testq $2, M + je .L17 + + movsd -16 * SIZE(A1), %xmm8 + movsd -15 * SIZE(A1), %xmm9 + movsd -14 * SIZE(A1), %xmm10 + movsd -13 * SIZE(A1), %xmm11 + + movapd %xmm8, %xmm12 + mulsd %xmm4, %xmm8 + mulsd %xmm5, %xmm12 + + movapd %xmm10, %xmm13 + mulsd %xmm4, %xmm10 + ADD1 %xmm8, %xmm0 + movsd -16 * SIZE(A2), %xmm8 + mulsd %xmm5, %xmm13 + ADD2 %xmm12, %xmm1 + + movapd %xmm9, %xmm12 + mulsd %xmm5, %xmm9 + ADD1 %xmm10, %xmm2 + movsd -14 * SIZE(A2), %xmm10 + mulsd %xmm4, %xmm12 + ADD2 %xmm13, %xmm3 + + movapd %xmm11, %xmm13 + mulsd %xmm5, %xmm11 + ADD3 %xmm9, %xmm0 + movsd -15 * SIZE(A2), %xmm9 + mulsd %xmm4, %xmm13 + ADD4 %xmm12, %xmm1 + + movapd %xmm8, %xmm12 + mulsd %xmm6, %xmm8 + ADD3 %xmm11, %xmm2 + movsd -13 * SIZE(A2), %xmm11 + mulsd %xmm7, %xmm12 + ADD4 %xmm13, %xmm3 + + movapd %xmm10, %xmm13 + mulsd %xmm6, %xmm10 + ADD1 %xmm8, %xmm0 + mulsd %xmm7, %xmm13 + ADD2 %xmm12, %xmm1 + + movapd %xmm9, %xmm12 + mulsd %xmm7, %xmm9 + ADD1 %xmm10, %xmm2 + mulsd %xmm6, %xmm12 + ADD2 %xmm13, %xmm3 + + movapd %xmm11, %xmm13 + mulsd %xmm7, %xmm11 + ADD3 %xmm9, %xmm0 + mulsd %xmm6, %xmm13 + ADD4 %xmm12, %xmm1 + + ADD3 %xmm11, %xmm2 + ADD4 %xmm13, %xmm3 + + movlpd %xmm0, -16 * SIZE(Y1) + movlpd %xmm1, -15 * SIZE(Y1) + movsd -12 * SIZE(Y1), %xmm0 + movsd -11 * SIZE(Y1), %xmm1 + + movlpd %xmm2, -14 * SIZE(Y1) + movlpd %xmm3, -13 * SIZE(Y1) + + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L17: + testq $1, M + je .L19 + + movsd -16 * SIZE(A1), %xmm8 + movsd -15 * SIZE(A1), %xmm9 + movsd -16 * SIZE(A2), %xmm10 + movsd -15 * SIZE(A2), %xmm11 + + movapd %xmm8, %xmm12 + mulsd %xmm4, %xmm8 + mulsd %xmm5, %xmm12 + + movapd %xmm9, %xmm13 + mulsd %xmm5, %xmm9 + ADD1 %xmm8, %xmm0 + mulsd %xmm4, %xmm13 + ADD2 %xmm12, %xmm1 + + movapd %xmm10, %xmm12 + mulsd %xmm6, %xmm10 + ADD3 %xmm9, %xmm0 + mulsd %xmm7, %xmm12 + ADD4 %xmm13, %xmm1 + + movapd %xmm11, %xmm13 + mulsd %xmm7, %xmm11 + ADD1 %xmm10, %xmm0 + mulsd %xmm6, %xmm13 + ADD2 %xmm12, %xmm1 + + ADD3 %xmm11, %xmm0 + ADD4 %xmm13, %xmm1 + + movlpd %xmm0, -16 * SIZE(Y1) + movlpd %xmm1, -15 * SIZE(Y1) + ALIGN_3 + +.L19: + decq J + jg .L11 + ALIGN_3 + +.L20: + testq $1, N + jle .L90 + + leaq 16 * SIZE(BUFFER), Y1 + movq A, A1 + + movsd 0 * SIZE(X), %xmm4 + movsd 1 * SIZE(X), %xmm5 + + movapd %xmm4, %xmm8 + mulsd ALPHA_R, %xmm4 + mulsd ALPHA_I, %xmm8 + movapd %xmm5, %xmm9 + mulsd ALPHA_I, %xmm9 + mulsd ALPHA_R, %xmm5 + +#ifndef XCONJ + subsd %xmm9, %xmm4 + addsd %xmm8, %xmm5 +#else + addsd %xmm9, %xmm4 + subsd %xmm8, %xmm5 +#endif + + movsd -16 * SIZE(Y1), %xmm0 + movsd -15 * SIZE(Y1), %xmm1 + movsd -14 * SIZE(Y1), %xmm2 + movsd -13 * SIZE(Y1), %xmm3 + ALIGN_3 + + movq M, I + sarq $2, I + jle .L25 + + movsd -16 * SIZE(A1), %xmm8 + movsd -15 * SIZE(A1), %xmm9 + movsd -14 * SIZE(A1), %xmm10 + movsd -13 * SIZE(A1), %xmm11 + + movapd %xmm8, %xmm12 + mulsd %xmm4, %xmm8 + mulsd %xmm5, %xmm12 + + movapd %xmm10, %xmm13 + mulsd %xmm4, %xmm10 + ADD1 %xmm8, %xmm0 + movsd -12 * SIZE(A1), %xmm8 + mulsd %xmm5, %xmm13 + ADD2 %xmm12, %xmm1 + + decq I + jle .L24 + ALIGN_3 + +.L23: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) * SIZE(A2) +#endif + + movapd %xmm9, %xmm12 + mulsd %xmm5, %xmm9 + ADD1 %xmm10, %xmm2 + movsd -10 * SIZE(A1), %xmm10 + mulsd %xmm4, %xmm12 + ADD2 %xmm13, %xmm3 + + movapd %xmm11, %xmm13 + mulsd %xmm5, %xmm11 + ADD3 %xmm9, %xmm0 + movsd -11 * SIZE(A1), %xmm9 + mulsd %xmm4, %xmm13 + ADD4 %xmm12, %xmm1 + + movapd %xmm8, %xmm12 + movlpd %xmm0, -16 * SIZE(Y1) + mulsd 
%xmm4, %xmm8 + movsd -12 * SIZE(Y1), %xmm0 + ADD3 %xmm11, %xmm2 + movsd -9 * SIZE(A1), %xmm11 + + mulsd %xmm5, %xmm12 + movlpd %xmm1, -15 * SIZE(Y1) + ADD4 %xmm13, %xmm3 + movsd -11 * SIZE(Y1), %xmm1 + + movapd %xmm10, %xmm13 + movlpd %xmm2, -14 * SIZE(Y1) + mulsd %xmm4, %xmm10 + movsd -10 * SIZE(Y1), %xmm2 + ADD1 %xmm8, %xmm0 + movsd -8 * SIZE(A1), %xmm8 + mulsd %xmm5, %xmm13 + movlpd %xmm3, -13 * SIZE(Y1) + ADD2 %xmm12, %xmm1 + movsd -9 * SIZE(Y1), %xmm3 + + movapd %xmm9, %xmm12 + mulsd %xmm5, %xmm9 + ADD1 %xmm10, %xmm2 + movsd -6 * SIZE(A1), %xmm10 + mulsd %xmm4, %xmm12 + ADD2 %xmm13, %xmm3 + + movapd %xmm11, %xmm13 + mulsd %xmm5, %xmm11 + ADD3 %xmm9, %xmm0 + movsd -7 * SIZE(A1), %xmm9 + mulsd %xmm4, %xmm13 + subq $-8 * SIZE, A1 + ADD4 %xmm12, %xmm1 + + movapd %xmm8, %xmm12 + movlpd %xmm0, -12 * SIZE(Y1) + mulsd %xmm4, %xmm8 + movsd -8 * SIZE(Y1), %xmm0 + ADD3 %xmm11, %xmm2 + movsd -13 * SIZE(A1), %xmm11 + mulsd %xmm5, %xmm12 + movlpd %xmm1, -11 * SIZE(Y1) + ADD4 %xmm13, %xmm3 + movsd -7 * SIZE(Y1), %xmm1 + + movapd %xmm10, %xmm13 + movlpd %xmm2, -10 * SIZE(Y1) + mulsd %xmm4, %xmm10 + movsd -6 * SIZE(Y1), %xmm2 + ADD1 %xmm8, %xmm0 + movsd -12 * SIZE(A1), %xmm8 + mulsd %xmm5, %xmm13 + movlpd %xmm3, -9 * SIZE(Y1) + ADD2 %xmm12, %xmm1 + movsd -5 * SIZE(Y1), %xmm3 + + subq $-8 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L23 + ALIGN_3 + +.L24: + movapd %xmm9, %xmm12 + mulsd %xmm5, %xmm9 + ADD1 %xmm10, %xmm2 + movsd -10 * SIZE(A1), %xmm10 + mulsd %xmm4, %xmm12 + ADD2 %xmm13, %xmm3 + + movapd %xmm11, %xmm13 + mulsd %xmm5, %xmm11 + ADD3 %xmm9, %xmm0 + movsd -11 * SIZE(A1), %xmm9 + mulsd %xmm4, %xmm13 + ADD4 %xmm12, %xmm1 + + movapd %xmm8, %xmm12 + movlpd %xmm0, -16 * SIZE(Y1) + mulsd %xmm4, %xmm8 + movsd -12 * SIZE(Y1), %xmm0 + ADD3 %xmm11, %xmm2 + movsd -9 * SIZE(A1), %xmm11 + + mulsd %xmm5, %xmm12 + movlpd %xmm1, -15 * SIZE(Y1) + ADD4 %xmm13, %xmm3 + movsd -11 * SIZE(Y1), %xmm1 + + movapd %xmm10, %xmm13 + mulsd %xmm4, %xmm10 + movlpd %xmm2, -14 * SIZE(Y1) + ADD1 %xmm8, %xmm0 + movsd -10 * SIZE(Y1), %xmm2 + mulsd %xmm5, %xmm13 + movlpd %xmm3, -13 * SIZE(Y1) + ADD2 %xmm12, %xmm1 + movsd -9 * SIZE(Y1), %xmm3 + + movapd %xmm9, %xmm12 + mulsd %xmm5, %xmm9 + ADD1 %xmm10, %xmm2 + mulsd %xmm4, %xmm12 + ADD2 %xmm13, %xmm3 + + movapd %xmm11, %xmm13 + mulsd %xmm5, %xmm11 + ADD3 %xmm9, %xmm0 + mulsd %xmm4, %xmm13 + ADD4 %xmm12, %xmm1 + + ADD3 %xmm11, %xmm2 + movlpd %xmm0, -12 * SIZE(Y1) + movsd -8 * SIZE(Y1), %xmm0 + ADD4 %xmm13, %xmm3 + movlpd %xmm1, -11 * SIZE(Y1) + movsd -7 * SIZE(Y1), %xmm1 + + movlpd %xmm2, -10 * SIZE(Y1) + movlpd %xmm3, -9 * SIZE(Y1) + movsd -6 * SIZE(Y1), %xmm2 + movsd -5 * SIZE(Y1), %xmm3 + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, Y1 + ALIGN_3 + +.L25: + testq $2, M + je .L27 + + movsd -16 * SIZE(A1), %xmm8 + movsd -15 * SIZE(A1), %xmm9 + movsd -14 * SIZE(A1), %xmm10 + movsd -13 * SIZE(A1), %xmm11 + + movapd %xmm8, %xmm12 + mulsd %xmm4, %xmm8 + mulsd %xmm5, %xmm12 + + movapd %xmm10, %xmm13 + mulsd %xmm4, %xmm10 + ADD1 %xmm8, %xmm0 + mulsd %xmm5, %xmm13 + ADD2 %xmm12, %xmm1 + + movapd %xmm9, %xmm12 + mulsd %xmm5, %xmm9 + ADD1 %xmm10, %xmm2 + mulsd %xmm4, %xmm12 + ADD2 %xmm13, %xmm3 + + movapd %xmm11, %xmm13 + mulsd %xmm5, %xmm11 + ADD3 %xmm9, %xmm0 + mulsd %xmm4, %xmm13 + ADD4 %xmm12, %xmm1 + + ADD3 %xmm11, %xmm2 + movlpd %xmm0, -16 * SIZE(Y1) + movsd -12 * SIZE(Y1), %xmm0 + ADD4 %xmm13, %xmm3 + movlpd %xmm1, -15 * SIZE(Y1) + movsd -11 * SIZE(Y1), %xmm1 + + movlpd %xmm2, -14 * SIZE(Y1) + movlpd %xmm3, -13 * SIZE(Y1) + + addq $4 * SIZE, A1 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L27: + 
testq $1, M + je .L90 + + movsd -16 * SIZE(A1), %xmm8 + movsd -15 * SIZE(A1), %xmm9 + + movapd %xmm8, %xmm12 + mulsd %xmm4, %xmm8 + mulsd %xmm5, %xmm12 + + movapd %xmm9, %xmm13 + mulsd %xmm5, %xmm9 + ADD1 %xmm8, %xmm0 + mulsd %xmm4, %xmm13 + ADD2 %xmm12, %xmm1 + + ADD3 %xmm9, %xmm0 + ADD4 %xmm13, %xmm1 + + movlpd %xmm0, -16 * SIZE(Y1) + movlpd %xmm1, -15 * SIZE(Y1) + ALIGN_3 + +.L90: + movq Y, Y1 + + movq M, %rax + sarq $3, %rax + jle .L94 + ALIGN_3 + +.L92: + movsd 0 * SIZE(Y), %xmm0 + movhpd 1 * SIZE(Y), %xmm0 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm1 + movhpd 1 * SIZE(Y), %xmm1 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm2 + movhpd 1 * SIZE(Y), %xmm2 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm3 + movhpd 1 * SIZE(Y), %xmm3 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm4 + movhpd 1 * SIZE(Y), %xmm4 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm5 + movhpd 1 * SIZE(Y), %xmm5 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm6 + movhpd 1 * SIZE(Y), %xmm6 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm7 + movhpd 1 * SIZE(Y), %xmm7 + addq INCY, Y + + addpd 0 * SIZE(BUFFER), %xmm0 + addpd 2 * SIZE(BUFFER), %xmm1 + addpd 4 * SIZE(BUFFER), %xmm2 + addpd 6 * SIZE(BUFFER), %xmm3 + addpd 8 * SIZE(BUFFER), %xmm4 + addpd 10 * SIZE(BUFFER), %xmm5 + addpd 12 * SIZE(BUFFER), %xmm6 + addpd 14 * SIZE(BUFFER), %xmm7 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm1, 0 * SIZE(Y1) + movhpd %xmm1, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm2, 0 * SIZE(Y1) + movhpd %xmm2, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm3, 0 * SIZE(Y1) + movhpd %xmm3, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm4, 0 * SIZE(Y1) + movhpd %xmm4, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm5, 0 * SIZE(Y1) + movhpd %xmm5, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm6, 0 * SIZE(Y1) + movhpd %xmm6, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm7, 0 * SIZE(Y1) + movhpd %xmm7, 1 * SIZE(Y1) + addq INCY, Y1 + + subq $-16 * SIZE, BUFFER + decq %rax + jg .L92 + ALIGN_3 + +.L94: + testq $7, M + jle .L999 + + testq $4, M + jle .L95 + + movsd 0 * SIZE(Y), %xmm0 + movhpd 1 * SIZE(Y), %xmm0 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm1 + movhpd 1 * SIZE(Y), %xmm1 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm2 + movhpd 1 * SIZE(Y), %xmm2 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm3 + movhpd 1 * SIZE(Y), %xmm3 + addq INCY, Y + + addpd 0 * SIZE(BUFFER), %xmm0 + addpd 2 * SIZE(BUFFER), %xmm1 + addpd 4 * SIZE(BUFFER), %xmm2 + addpd 6 * SIZE(BUFFER), %xmm3 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm1, 0 * SIZE(Y1) + movhpd %xmm1, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm2, 0 * SIZE(Y1) + movhpd %xmm2, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm3, 0 * SIZE(Y1) + movhpd %xmm3, 1 * SIZE(Y1) + addq INCY, Y1 + + addq $8 * SIZE, BUFFER + ALIGN_3 + +.L95: + testq $2, M + jle .L96 + + movsd 0 * SIZE(Y), %xmm0 + movhpd 1 * SIZE(Y), %xmm0 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm1 + movhpd 1 * SIZE(Y), %xmm1 + addq INCY, Y + + addpd 0 * SIZE(BUFFER), %xmm0 + addpd 2 * SIZE(BUFFER), %xmm1 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm1, 0 * SIZE(Y1) + movhpd %xmm1, 1 * SIZE(Y1) + addq INCY, Y1 + + addq $4 * SIZE, BUFFER + ALIGN_3 + +.L96: + testq $1, M + jle .L999 + + movsd 0 * SIZE(Y), %xmm0 + movhpd 1 * SIZE(Y), %xmm0 + + addpd 0 * SIZE(BUFFER), %xmm0 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + ALIGN_3 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), 
%r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemv_n_dup.S b/kernel/x86_64/zgemv_n_dup.S new file mode 100644 index 0000000000..8a49fc9704 --- /dev/null +++ b/kernel/x86_64/zgemv_n_dup.S @@ -0,0 +1,1500 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "l2param.h" + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_INCX 8 + STACKSIZE(%rsp) +#define OLD_Y 16 + STACKSIZE(%rsp) +#define OLD_INCY 24 + STACKSIZE(%rsp) +#define OLD_BUFFER 32 + STACKSIZE(%rsp) +#define ALPHA_R 48 (%rsp) +#define ALPHA_I 56 (%rsp) + +#define M %rdi +#define N %rsi +#define A %rcx +#define LDA %r8 +#define X %r9 +#define INCX %rdx +#define Y %rbp +#define INCY %r10 + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_LDA 56 + STACKSIZE(%rsp) +#define OLD_X 64 + STACKSIZE(%rsp) +#define OLD_INCX 72 + STACKSIZE(%rsp) +#define OLD_Y 80 + STACKSIZE(%rsp) +#define OLD_INCY 88 + STACKSIZE(%rsp) +#define OLD_BUFFER 96 + STACKSIZE(%rsp) +#define ALPHA_R 224 (%rsp) +#define ALPHA_I 232 (%rsp) + +#define M %rcx +#define N %rdx +#define A %r8 +#define LDA %r9 +#define X %rdi +#define INCX %rsi +#define Y %rbp +#define INCY %r10 + +#endif + +#define I %rax +#define A1 %r12 +#define A2 %r13 + +#define Y1 %r14 +#define BUFFER %r15 + +#define J %r11 + +#undef SUBPD + +#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) +#define SUBPD subpd +#else +#define SUBPD addpd +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq OLD_A, A + movq OLD_LDA, LDA + movq OLD_X, X + + movapd %xmm3, %xmm0 + movsd OLD_ALPHA_I, %xmm1 +#endif + + movq OLD_INCX, INCX + movq OLD_Y, Y + movq OLD_INCY, INCY + movq OLD_BUFFER, BUFFER + + salq $ZBASE_SHIFT, LDA + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + + movlps %xmm0, ALPHA_R + movlps %xmm1, ALPHA_I + + subq $-16 * SIZE, A + + testq M, M + jle .L999 + testq N, N + jle .L999 + ALIGN_3 + + movq BUFFER, Y1 + + xorps %xmm4, %xmm4 + + movq M, %rax + addq $8, %rax + sarq $3, %rax + ALIGN_3 + +.L01: + movaps %xmm4, 0 * SIZE(Y1) + movaps %xmm4, 2 * SIZE(Y1) + movaps %xmm4, 4 * SIZE(Y1) + movaps %xmm4, 6 * SIZE(Y1) + movaps %xmm4, 8 * SIZE(Y1) + movaps %xmm4, 10 * SIZE(Y1) + movaps %xmm4, 12 * SIZE(Y1) + movaps %xmm4, 14 * SIZE(Y1) + + subq $-16 * SIZE, Y1 + decq %rax + jg .L01 + ALIGN_3 + +.L10: +#if GEMV_UNROLL >= 4 + + cmpq $4, N + jl .L20 + ALIGN_3 + +.L11: + subq $4, N + + leaq 16 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 2), A2 + leaq (A, LDA, 4), A + + movddup 0 * SIZE(X), %xmm8 + movddup 1 * SIZE(X), %xmm9 + addq INCX, X + movddup 0 * SIZE(X), %xmm10 + movddup 1 * SIZE(X), %xmm11 + addq INCX, X + movddup 0 * SIZE(X), %xmm12 + movddup 1 * SIZE(X), %xmm13 + addq INCX, X + movddup 0 * SIZE(X), %xmm14 + movddup 1 * SIZE(X), %xmm15 + addq INCX, X + + pcmpeqb %xmm5, %xmm5 + psllq $63, %xmm5 + shufps $0x40, %xmm5, %xmm5 + + movsd ALPHA_R, %xmm6 + movhps ALPHA_I, %xmm6 + + pshufd $0x4e, %xmm6, %xmm7 + +#ifndef XCONJ + xorps %xmm5, %xmm7 +#else + xorps %xmm5, %xmm6 +#endif + + mulpd %xmm6, %xmm8 + mulpd %xmm7, %xmm9 + mulpd %xmm6, %xmm10 + mulpd %xmm7, %xmm11 + + mulpd %xmm6, %xmm12 + mulpd %xmm7, %xmm13 + mulpd %xmm6, %xmm14 + mulpd %xmm7, %xmm15 + 
+#ifndef XCONJ + subpd %xmm9, %xmm8 + subpd %xmm11, %xmm10 + subpd %xmm13, %xmm12 + subpd %xmm15, %xmm14 +#else + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 + addpd %xmm13, %xmm12 + addpd %xmm15, %xmm14 +#endif + + pshufd $0x4e, %xmm8, %xmm9 + pshufd $0x4e, %xmm10, %xmm11 + pshufd $0x4e, %xmm12, %xmm13 + pshufd $0x4e, %xmm14, %xmm15 + +#ifndef XCONJ + xorps %xmm5, %xmm9 + xorps %xmm5, %xmm11 + xorps %xmm5, %xmm13 + xorps %xmm5, %xmm15 +#else + xorps %xmm5, %xmm8 + xorps %xmm5, %xmm10 + xorps %xmm5, %xmm12 + xorps %xmm5, %xmm14 +#endif + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-10 * SIZE, Y1, %xmm3) + ALIGN_3 + + movq M, I + sarq $2, I + jle .L15 + + movddup -16 * SIZE(A1), %xmm4 + movddup -14 * SIZE(A1), %xmm5 + movddup -12 * SIZE(A1), %xmm6 + movddup -10 * SIZE(A1), %xmm7 + + decq I + jle .L14 + ALIGN_3 + +.L13: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1) +#endif + + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm0 + movddup -15 * SIZE(A1), %xmm4 + mulpd %xmm8, %xmm5 + addpd %xmm5, %xmm1 + movddup -13 * SIZE(A1), %xmm5 + mulpd %xmm8, %xmm6 + addpd %xmm6, %xmm2 + movddup -11 * SIZE(A1), %xmm6 + mulpd %xmm8, %xmm7 + addpd %xmm7, %xmm3 + movddup -9 * SIZE(A1), %xmm7 + + mulpd %xmm9, %xmm4 + SUBPD %xmm4, %xmm0 + movddup -16 * SIZE(A1, LDA), %xmm4 + mulpd %xmm9, %xmm5 + SUBPD %xmm5, %xmm1 + movddup -14 * SIZE(A1, LDA), %xmm5 + mulpd %xmm9, %xmm6 + SUBPD %xmm6, %xmm2 + movddup -12 * SIZE(A1, LDA), %xmm6 + mulpd %xmm9, %xmm7 + SUBPD %xmm7, %xmm3 + movddup -10 * SIZE(A1, LDA), %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA) +#endif + + mulpd %xmm10, %xmm4 + addpd %xmm4, %xmm0 + movddup -15 * SIZE(A1, LDA), %xmm4 + mulpd %xmm10, %xmm5 + addpd %xmm5, %xmm1 + movddup -13 * SIZE(A1, LDA), %xmm5 + mulpd %xmm10, %xmm6 + addpd %xmm6, %xmm2 + movddup -11 * SIZE(A1, LDA), %xmm6 + mulpd %xmm10, %xmm7 + addpd %xmm7, %xmm3 + movddup -9 * SIZE(A1, LDA), %xmm7 + + mulpd %xmm11, %xmm4 + SUBPD %xmm4, %xmm0 + movddup -16 * SIZE(A2), %xmm4 + mulpd %xmm11, %xmm5 + SUBPD %xmm5, %xmm1 + movddup -14 * SIZE(A2), %xmm5 + mulpd %xmm11, %xmm6 + SUBPD %xmm6, %xmm2 + movddup -12 * SIZE(A2), %xmm6 + mulpd %xmm11, %xmm7 + SUBPD %xmm7, %xmm3 + movddup -10 * SIZE(A2), %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2) +#endif + + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + movddup -15 * SIZE(A2), %xmm4 + mulpd %xmm12, %xmm5 + addpd %xmm5, %xmm1 + movddup -13 * SIZE(A2), %xmm5 + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm2 + movddup -11 * SIZE(A2), %xmm6 + mulpd %xmm12, %xmm7 + addpd %xmm7, %xmm3 + movddup -9 * SIZE(A2), %xmm7 + + mulpd %xmm13, %xmm4 + SUBPD %xmm4, %xmm0 + movddup -16 * SIZE(A2, LDA), %xmm4 + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm1 + movddup -14 * SIZE(A2, LDA), %xmm5 + mulpd %xmm13, %xmm6 + SUBPD %xmm6, %xmm2 + movddup -12 * SIZE(A2, LDA), %xmm6 + mulpd %xmm13, %xmm7 + SUBPD %xmm7, %xmm3 + movddup -10 * SIZE(A2, LDA), %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA) +#endif + + mulpd %xmm14, %xmm4 + addpd %xmm4, %xmm0 + movddup -15 * SIZE(A2, LDA), %xmm4 + mulpd %xmm14, %xmm5 + addpd %xmm5, %xmm1 + movddup -13 * SIZE(A2, LDA), %xmm5 + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm2 + movddup -11 * SIZE(A2, LDA), %xmm6 + mulpd %xmm14, %xmm7 + addpd %xmm7, %xmm3 + movddup -9 * SIZE(A2, LDA), %xmm7 + + mulpd %xmm15, %xmm4 + SUBPD %xmm4, %xmm0 + movddup -8 * SIZE(A1), %xmm4 + mulpd %xmm15, %xmm5 + SUBPD %xmm5, %xmm1 + movddup -6 * SIZE(A1), %xmm5 + mulpd 
%xmm15, %xmm6 + SUBPD %xmm6, %xmm2 + movddup -4 * SIZE(A1), %xmm6 + mulpd %xmm15, %xmm7 + SUBPD %xmm7, %xmm3 + movddup -2 * SIZE(A1), %xmm7 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) + MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L13 + ALIGN_3 + +.L14: + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm0 + movddup -15 * SIZE(A1), %xmm4 + mulpd %xmm8, %xmm5 + addpd %xmm5, %xmm1 + movddup -13 * SIZE(A1), %xmm5 + mulpd %xmm8, %xmm6 + addpd %xmm6, %xmm2 + movddup -11 * SIZE(A1), %xmm6 + mulpd %xmm8, %xmm7 + addpd %xmm7, %xmm3 + movddup -9 * SIZE(A1), %xmm7 + + mulpd %xmm9, %xmm4 + SUBPD %xmm4, %xmm0 + movddup -16 * SIZE(A1, LDA), %xmm4 + mulpd %xmm9, %xmm5 + SUBPD %xmm5, %xmm1 + movddup -14 * SIZE(A1, LDA), %xmm5 + mulpd %xmm9, %xmm6 + SUBPD %xmm6, %xmm2 + movddup -12 * SIZE(A1, LDA), %xmm6 + mulpd %xmm9, %xmm7 + SUBPD %xmm7, %xmm3 + movddup -10 * SIZE(A1, LDA), %xmm7 + + mulpd %xmm10, %xmm4 + addpd %xmm4, %xmm0 + movddup -15 * SIZE(A1, LDA), %xmm4 + mulpd %xmm10, %xmm5 + addpd %xmm5, %xmm1 + movddup -13 * SIZE(A1, LDA), %xmm5 + mulpd %xmm10, %xmm6 + addpd %xmm6, %xmm2 + movddup -11 * SIZE(A1, LDA), %xmm6 + mulpd %xmm10, %xmm7 + addpd %xmm7, %xmm3 + movddup -9 * SIZE(A1, LDA), %xmm7 + + mulpd %xmm11, %xmm4 + SUBPD %xmm4, %xmm0 + movddup -16 * SIZE(A2), %xmm4 + mulpd %xmm11, %xmm5 + SUBPD %xmm5, %xmm1 + movddup -14 * SIZE(A2), %xmm5 + mulpd %xmm11, %xmm6 + SUBPD %xmm6, %xmm2 + movddup -12 * SIZE(A2), %xmm6 + mulpd %xmm11, %xmm7 + SUBPD %xmm7, %xmm3 + movddup -10 * SIZE(A2), %xmm7 + + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + movddup -15 * SIZE(A2), %xmm4 + mulpd %xmm12, %xmm5 + addpd %xmm5, %xmm1 + movddup -13 * SIZE(A2), %xmm5 + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm2 + movddup -11 * SIZE(A2), %xmm6 + mulpd %xmm12, %xmm7 + addpd %xmm7, %xmm3 + movddup -9 * SIZE(A2), %xmm7 + + mulpd %xmm13, %xmm4 + SUBPD %xmm4, %xmm0 + movddup -16 * SIZE(A2, LDA), %xmm4 + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm1 + movddup -14 * SIZE(A2, LDA), %xmm5 + mulpd %xmm13, %xmm6 + SUBPD %xmm6, %xmm2 + movddup -12 * SIZE(A2, LDA), %xmm6 + mulpd %xmm13, %xmm7 + SUBPD %xmm7, %xmm3 + movddup -10 * SIZE(A2, LDA), %xmm7 + + mulpd %xmm14, %xmm4 + addpd %xmm4, %xmm0 + movddup -15 * SIZE(A2, LDA), %xmm4 + mulpd %xmm14, %xmm5 + addpd %xmm5, %xmm1 + movddup -13 * SIZE(A2, LDA), %xmm5 + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm2 + movddup -11 * SIZE(A2, LDA), %xmm6 + mulpd %xmm14, %xmm7 + addpd %xmm7, %xmm3 + movddup -9 * SIZE(A2, LDA), %xmm7 + + mulpd %xmm15, %xmm4 + SUBPD %xmm4, %xmm0 + mulpd %xmm15, %xmm5 + SUBPD %xmm5, %xmm1 + mulpd %xmm15, %xmm6 + SUBPD %xmm6, %xmm2 + mulpd %xmm15, %xmm7 + SUBPD %xmm7, %xmm3 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) + MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + ALIGN_3 + +.L15: + testq $2, M + je .L17 + + movddup -16 * SIZE(A1), %xmm4 + movddup -15 * SIZE(A1), %xmm5 + movddup -14 * SIZE(A1), %xmm6 + movddup -13 * SIZE(A1), %xmm7 + + mulpd %xmm8, %xmm4 + addpd %xmm4, 
%xmm0 + movddup -16 * SIZE(A1, LDA, 1), %xmm4 + mulpd %xmm8, %xmm6 + addpd %xmm6, %xmm1 + movddup -14 * SIZE(A1, LDA, 1), %xmm6 + + mulpd %xmm9, %xmm5 + SUBPD %xmm5, %xmm0 + movddup -15 * SIZE(A1, LDA, 1), %xmm5 + mulpd %xmm9, %xmm7 + SUBPD %xmm7, %xmm1 + movddup -13 * SIZE(A1, LDA, 1), %xmm7 + + mulpd %xmm10, %xmm4 + addpd %xmm4, %xmm0 + movddup -16 * SIZE(A2), %xmm4 + mulpd %xmm10, %xmm6 + addpd %xmm6, %xmm1 + movddup -14 * SIZE(A2), %xmm6 + + mulpd %xmm11, %xmm5 + SUBPD %xmm5, %xmm0 + movddup -15 * SIZE(A2), %xmm5 + mulpd %xmm11, %xmm7 + SUBPD %xmm7, %xmm1 + movddup -13 * SIZE(A2), %xmm7 + + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + movddup -16 * SIZE(A2, LDA, 1), %xmm4 + mulpd %xmm12, %xmm6 + addpd %xmm6, %xmm1 + movddup -14 * SIZE(A2, LDA, 1), %xmm6 + + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm0 + movddup -15 * SIZE(A2, LDA, 1), %xmm5 + mulpd %xmm13, %xmm7 + SUBPD %xmm7, %xmm1 + movddup -13 * SIZE(A2, LDA, 1), %xmm7 + + mulpd %xmm14, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm1 + + mulpd %xmm15, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm15, %xmm7 + SUBPD %xmm7, %xmm1 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + movaps %xmm2, %xmm0 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L17: + testq $1, M + je .L19 + + movddup -16 * SIZE(A1), %xmm4 + movddup -15 * SIZE(A1), %xmm5 + movddup -16 * SIZE(A1, LDA, 1), %xmm6 + movddup -15 * SIZE(A1, LDA, 1), %xmm7 + + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm0 + movddup -16 * SIZE(A2), %xmm4 + mulpd %xmm9, %xmm5 + SUBPD %xmm5, %xmm0 + movddup -15 * SIZE(A2), %xmm5 + + mulpd %xmm10, %xmm6 + addpd %xmm6, %xmm0 + movddup -16 * SIZE(A2, LDA, 1), %xmm6 + mulpd %xmm11, %xmm7 + SUBPD %xmm7, %xmm0 + movddup -15 * SIZE(A2, LDA, 1), %xmm7 + + mulpd %xmm12, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm13, %xmm5 + SUBPD %xmm5, %xmm0 + + mulpd %xmm14, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm15, %xmm7 + SUBPD %xmm7, %xmm0 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + ALIGN_3 + +.L19: + cmpq $4, N + jge .L11 + ALIGN_3 + +.L20: +#endif + +#if GEMV_UNROLL >= 2 + + cmpq $2, N + jl .L30 + +#if GEMV_UNROLL == 2 + ALIGN_3 + +.L21: +#endif + + subq $2, N + + leaq 16 * SIZE(BUFFER), Y1 + movq A, A1 + leaq (A, LDA, 1), A2 + leaq (A, LDA, 2), A + + movddup 0 * SIZE(X), %xmm8 + movddup 1 * SIZE(X), %xmm9 + addq INCX, X + movddup 0 * SIZE(X), %xmm10 + movddup 1 * SIZE(X), %xmm11 + addq INCX, X + + pcmpeqb %xmm5, %xmm5 + psllq $63, %xmm5 + shufps $0x40, %xmm5, %xmm5 + + movsd ALPHA_R, %xmm6 + movhps ALPHA_I, %xmm6 + + pshufd $0x4e, %xmm6, %xmm7 + +#ifndef XCONJ + xorps %xmm5, %xmm7 +#else + xorps %xmm5, %xmm6 +#endif + + mulpd %xmm6, %xmm8 + mulpd %xmm7, %xmm9 + mulpd %xmm6, %xmm10 + mulpd %xmm7, %xmm11 + +#ifndef XCONJ + subpd %xmm9, %xmm8 + subpd %xmm11, %xmm10 +#else + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 +#endif + + pshufd $0x4e, %xmm8, %xmm9 + pshufd $0x4e, %xmm10, %xmm11 + +#ifndef XCONJ + xorps %xmm5, %xmm9 + xorps %xmm5, %xmm11 +#else + xorps %xmm5, %xmm8 + xorps %xmm5, %xmm10 +#endif + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-10 * SIZE, Y1, %xmm3) + + movq M, I + sarq $2, I + jle .L25 + + movddup -16 * SIZE(A1), %xmm4 + movddup -14 * SIZE(A1), %xmm5 + movddup -12 * SIZE(A1), %xmm6 + movddup -10 * SIZE(A1), %xmm7 + + decq I + jle .L24 + ALIGN_3 + +.L23: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) +#endif + + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm0 + movddup -15 * SIZE(A1), %xmm4 + mulpd %xmm8, 
%xmm5 + addpd %xmm5, %xmm1 + movddup -13 * SIZE(A1), %xmm5 + mulpd %xmm8, %xmm6 + addpd %xmm6, %xmm2 + movddup -11 * SIZE(A1), %xmm6 + mulpd %xmm8, %xmm7 + addpd %xmm7, %xmm3 + movddup -9 * SIZE(A1), %xmm7 + + mulpd %xmm9, %xmm4 + SUBPD %xmm4, %xmm0 + movddup -16 * SIZE(A2), %xmm4 + mulpd %xmm9, %xmm5 + SUBPD %xmm5, %xmm1 + movddup -14 * SIZE(A2), %xmm5 + mulpd %xmm9, %xmm6 + SUBPD %xmm6, %xmm2 + movddup -12 * SIZE(A2), %xmm6 + mulpd %xmm9, %xmm7 + SUBPD %xmm7, %xmm3 + movddup -10 * SIZE(A2), %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) +#endif + + mulpd %xmm10, %xmm4 + addpd %xmm4, %xmm0 + movddup -15 * SIZE(A2), %xmm4 + mulpd %xmm10, %xmm5 + addpd %xmm5, %xmm1 + movddup -13 * SIZE(A2), %xmm5 + mulpd %xmm10, %xmm6 + addpd %xmm6, %xmm2 + movddup -11 * SIZE(A2), %xmm6 + mulpd %xmm10, %xmm7 + addpd %xmm7, %xmm3 + movddup -9 * SIZE(A2), %xmm7 + + mulpd %xmm11, %xmm4 + SUBPD %xmm4, %xmm0 + movddup -8 * SIZE(A1), %xmm4 + mulpd %xmm11, %xmm5 + SUBPD %xmm5, %xmm1 + movddup -6 * SIZE(A1), %xmm5 + mulpd %xmm11, %xmm6 + SUBPD %xmm6, %xmm2 + movddup -4 * SIZE(A1), %xmm6 + mulpd %xmm11, %xmm7 + SUBPD %xmm7, %xmm3 + movddup -2 * SIZE(A1), %xmm7 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 2 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) + MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L23 + ALIGN_3 + +.L24: + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm0 + movddup -15 * SIZE(A1), %xmm4 + mulpd %xmm8, %xmm5 + addpd %xmm5, %xmm1 + movddup -13 * SIZE(A1), %xmm5 + mulpd %xmm8, %xmm6 + addpd %xmm6, %xmm2 + movddup -11 * SIZE(A1), %xmm6 + mulpd %xmm8, %xmm7 + addpd %xmm7, %xmm3 + movddup -9 * SIZE(A1), %xmm7 + + mulpd %xmm9, %xmm4 + SUBPD %xmm4, %xmm0 + movddup -16 * SIZE(A2), %xmm4 + mulpd %xmm9, %xmm5 + SUBPD %xmm5, %xmm1 + movddup -14 * SIZE(A2), %xmm5 + mulpd %xmm9, %xmm6 + SUBPD %xmm6, %xmm2 + movddup -12 * SIZE(A2), %xmm6 + mulpd %xmm9, %xmm7 + SUBPD %xmm7, %xmm3 + movddup -10 * SIZE(A2), %xmm7 + + mulpd %xmm10, %xmm4 + addpd %xmm4, %xmm0 + movddup -15 * SIZE(A2), %xmm4 + mulpd %xmm10, %xmm5 + addpd %xmm5, %xmm1 + movddup -13 * SIZE(A2), %xmm5 + mulpd %xmm10, %xmm6 + addpd %xmm6, %xmm2 + movddup -11 * SIZE(A2), %xmm6 + mulpd %xmm10, %xmm7 + addpd %xmm7, %xmm3 + movddup -9 * SIZE(A2), %xmm7 + + mulpd %xmm11, %xmm4 + SUBPD %xmm4, %xmm0 + mulpd %xmm11, %xmm5 + SUBPD %xmm5, %xmm1 + mulpd %xmm11, %xmm6 + SUBPD %xmm6, %xmm2 + mulpd %xmm11, %xmm7 + SUBPD %xmm7, %xmm3 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) + MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, Y1 + ALIGN_3 + +.L25: + testq $2, M + je .L27 + + movddup -16 * SIZE(A1), %xmm4 + movddup -15 * SIZE(A1), %xmm5 + movddup -14 * SIZE(A1), %xmm6 + movddup -13 * SIZE(A1), %xmm7 + + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm0 + movddup -16 * SIZE(A2), %xmm4 + mulpd %xmm8, %xmm6 + addpd %xmm6, %xmm1 + movddup -14 * SIZE(A2), %xmm6 + + mulpd %xmm9, %xmm5 + SUBPD %xmm5, %xmm0 + movddup -15 * SIZE(A2), %xmm5 + mulpd %xmm9, %xmm7 + SUBPD %xmm7, %xmm1 + movddup 
-13 * SIZE(A2), %xmm7 + + mulpd %xmm10, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm10, %xmm6 + addpd %xmm6, %xmm1 + + mulpd %xmm11, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm11, %xmm7 + SUBPD %xmm7, %xmm1 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + movaps %xmm2, %xmm0 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L27: + testq $1, M +#if GEMV_UNROLL == 2 + je .L29 +#else + je .L30 +#endif + + movddup -16 * SIZE(A1), %xmm4 + movddup -15 * SIZE(A1), %xmm5 + movddup -16 * SIZE(A2), %xmm6 + movddup -15 * SIZE(A2), %xmm7 + + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm9, %xmm5 + SUBPD %xmm5, %xmm0 + + mulpd %xmm10, %xmm6 + addpd %xmm6, %xmm0 + mulpd %xmm11, %xmm7 + SUBPD %xmm7, %xmm0 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + +#if GEMV_UNROLL == 2 + ALIGN_3 + +.L29: + cmpq $2, N + jge .L21 +#endif + ALIGN_3 + +.L30: +#endif + + cmpq $1, N + jl .L980 + +#if GEMV_UNROLL == 1 +.L31: + decq N +#endif + + leaq 16 * SIZE(BUFFER), Y1 + movq A, A1 +#if GEMV_UNROLL == 1 + addq LDA, A +#endif + + movddup 0 * SIZE(X), %xmm8 + movddup 1 * SIZE(X), %xmm9 + addq INCX, X + + pcmpeqb %xmm5, %xmm5 + psllq $63, %xmm5 + shufps $0x40, %xmm5, %xmm5 + + movsd ALPHA_R, %xmm6 + movhps ALPHA_I, %xmm6 + + pshufd $0x4e, %xmm6, %xmm7 + +#ifndef XCONJ + xorps %xmm5, %xmm7 +#else + xorps %xmm5, %xmm6 +#endif + + mulpd %xmm6, %xmm8 + mulpd %xmm7, %xmm9 + +#ifndef XCONJ + subpd %xmm9, %xmm8 +#else + addpd %xmm9, %xmm8 +#endif + + pshufd $0x4e, %xmm8, %xmm9 + +#ifndef XCONJ + xorps %xmm5, %xmm9 +#else + xorps %xmm5, %xmm8 +#endif + + MOVUPS_YL1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YL1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YL1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YL1(-10 * SIZE, Y1, %xmm3) + + movq M, I + sarq $2, I + jle .L35 + + movddup -16 * SIZE(A1), %xmm4 + movddup -14 * SIZE(A1), %xmm5 + movddup -12 * SIZE(A1), %xmm6 + movddup -10 * SIZE(A1), %xmm7 + + decq I + jle .L34 + ALIGN_3 + +.L33: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) +#endif + + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm0 + movddup -15 * SIZE(A1), %xmm4 + mulpd %xmm8, %xmm5 + addpd %xmm5, %xmm1 + movddup -13 * SIZE(A1), %xmm5 + mulpd %xmm8, %xmm6 + addpd %xmm6, %xmm2 + movddup -11 * SIZE(A1), %xmm6 + mulpd %xmm8, %xmm7 + addpd %xmm7, %xmm3 + movddup -9 * SIZE(A1), %xmm7 + + mulpd %xmm9, %xmm4 + SUBPD %xmm4, %xmm0 + movddup -8 * SIZE(A1), %xmm4 + mulpd %xmm9, %xmm5 + SUBPD %xmm5, %xmm1 + movddup -6 * SIZE(A1), %xmm5 + mulpd %xmm9, %xmm6 + SUBPD %xmm6, %xmm2 + movddup -4 * SIZE(A1), %xmm6 + mulpd %xmm9, %xmm7 + SUBPD %xmm7, %xmm3 + movddup -2 * SIZE(A1), %xmm7 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE) * 4 - 128 + PREOFFSET(Y1) +#endif + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) + MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, Y1 + + subq $1, I + BRANCH + jg .L33 + ALIGN_3 + +.L34: + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm0 + movddup -15 * SIZE(A1), %xmm4 + mulpd %xmm8, %xmm5 + addpd %xmm5, %xmm1 + movddup -13 * SIZE(A1), %xmm5 + mulpd %xmm8, %xmm6 + addpd %xmm6, %xmm2 + movddup -11 * SIZE(A1), %xmm6 + mulpd %xmm8, %xmm7 + addpd %xmm7, %xmm3 + movddup -9 * SIZE(A1), %xmm7 + + mulpd %xmm9, %xmm4 + SUBPD %xmm4, %xmm0 + mulpd %xmm9, %xmm5 + SUBPD %xmm5, %xmm1 + mulpd %xmm9, %xmm6 + SUBPD %xmm6, %xmm2 + mulpd %xmm9, %xmm7 + SUBPD %xmm7, %xmm3 + + 
MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + MOVUPS_YS1(-12 * SIZE, Y1, %xmm2) + MOVUPS_YS1(-10 * SIZE, Y1, %xmm3) + + MOVUPS_YL1( -8 * SIZE, Y1, %xmm0) + MOVUPS_YL1( -6 * SIZE, Y1, %xmm1) + MOVUPS_YL1( -4 * SIZE, Y1, %xmm2) + MOVUPS_YL1( -2 * SIZE, Y1, %xmm3) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, Y1 + ALIGN_3 + +.L35: + testq $2, M + je .L37 + + movddup -16 * SIZE(A1), %xmm4 + movddup -15 * SIZE(A1), %xmm5 + movddup -14 * SIZE(A1), %xmm6 + movddup -13 * SIZE(A1), %xmm7 + + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm8, %xmm6 + addpd %xmm6, %xmm1 + + mulpd %xmm9, %xmm5 + SUBPD %xmm5, %xmm0 + mulpd %xmm9, %xmm7 + SUBPD %xmm7, %xmm1 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + MOVUPS_YS1(-14 * SIZE, Y1, %xmm1) + movaps %xmm2, %xmm0 + + addq $4 * SIZE, A1 + addq $4 * SIZE, Y1 + ALIGN_3 + +.L37: + testq $1, M +#if GEMV_UNROLL == 1 + je .L39 +#else + je .L980 +#endif + + movddup -16 * SIZE(A1), %xmm4 + movddup -15 * SIZE(A1), %xmm5 + + mulpd %xmm8, %xmm4 + addpd %xmm4, %xmm0 + mulpd %xmm9, %xmm5 + SUBPD %xmm5, %xmm0 + + MOVUPS_YS1(-16 * SIZE, Y1, %xmm0) + +#if GEMV_UNROLL == 1 + ALIGN_3 +.L39: + cmpq $1, N + jge .L31 +#endif + +.L980: + testq $SIZE, Y + jne .L990 + + movq Y, Y1 + + movq M, %rax + sarq $3, %rax + jle .L184 + ALIGN_3 + +.L182: + movaps (Y), %xmm0 + addq INCY, Y + movaps (Y), %xmm1 + addq INCY, Y + movaps (Y), %xmm2 + addq INCY, Y + movaps (Y), %xmm3 + addq INCY, Y + movaps (Y), %xmm4 + addq INCY, Y + movaps (Y), %xmm5 + addq INCY, Y + movaps (Y), %xmm6 + addq INCY, Y + movaps (Y), %xmm7 + addq INCY, Y + + addpd 0 * SIZE(BUFFER), %xmm0 + addpd 2 * SIZE(BUFFER), %xmm1 + addpd 4 * SIZE(BUFFER), %xmm2 + addpd 6 * SIZE(BUFFER), %xmm3 + addpd 8 * SIZE(BUFFER), %xmm4 + addpd 10 * SIZE(BUFFER), %xmm5 + addpd 12 * SIZE(BUFFER), %xmm6 + addpd 14 * SIZE(BUFFER), %xmm7 + + movaps %xmm0, (Y1) + addq INCY, Y1 + movaps %xmm1, (Y1) + addq INCY, Y1 + movaps %xmm2, (Y1) + addq INCY, Y1 + movaps %xmm3, (Y1) + addq INCY, Y1 + movaps %xmm4, (Y1) + addq INCY, Y1 + movaps %xmm5, (Y1) + addq INCY, Y1 + movaps %xmm6, (Y1) + addq INCY, Y1 + movaps %xmm7, (Y1) + addq INCY, Y1 + + subq $-16 * SIZE, BUFFER + decq %rax + jg .L182 + ALIGN_3 + +.L184: + testq $7, M + jle .L999 + + testq $4, M + jle .L185 + + movaps (Y), %xmm0 + addq INCY, Y + movaps (Y), %xmm1 + addq INCY, Y + movaps (Y), %xmm2 + addq INCY, Y + movaps (Y), %xmm3 + addq INCY, Y + + addpd 0 * SIZE(BUFFER), %xmm0 + addpd 2 * SIZE(BUFFER), %xmm1 + addpd 4 * SIZE(BUFFER), %xmm2 + addpd 6 * SIZE(BUFFER), %xmm3 + + movaps %xmm0, (Y1) + addq INCY, Y1 + movaps %xmm1, (Y1) + addq INCY, Y1 + movaps %xmm2, (Y1) + addq INCY, Y1 + movaps %xmm3, (Y1) + addq INCY, Y1 + + addq $8 * SIZE, BUFFER + ALIGN_3 + +.L185: + testq $2, M + jle .L186 + + movaps (Y), %xmm0 + addq INCY, Y + movaps (Y), %xmm1 + addq INCY, Y + addpd 0 * SIZE(BUFFER), %xmm0 + addpd 2 * SIZE(BUFFER), %xmm1 + + movaps %xmm0, (Y1) + addq INCY, Y1 + movaps %xmm1, (Y1) + addq INCY, Y1 + + addq $4 * SIZE, BUFFER + ALIGN_3 + +.L186: + testq $1, M + jle .L999 + + movaps (Y), %xmm0 + + addpd (BUFFER), %xmm0 + + movaps %xmm0, (Y1) + jmp .L999 + ALIGN_3 + +.L990: + movq Y, Y1 + + movq M, %rax + sarq $3, %rax + jle .L994 + ALIGN_3 + +.L992: + movsd 0 * SIZE(Y), %xmm0 + movhpd 1 * SIZE(Y), %xmm0 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm1 + movhpd 1 * SIZE(Y), %xmm1 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm2 + movhpd 1 * SIZE(Y), %xmm2 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm3 + movhpd 1 * SIZE(Y), %xmm3 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm4 + movhpd 1 * SIZE(Y), 
%xmm4 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm5 + movhpd 1 * SIZE(Y), %xmm5 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm6 + movhpd 1 * SIZE(Y), %xmm6 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm7 + movhpd 1 * SIZE(Y), %xmm7 + addq INCY, Y + + addpd 0 * SIZE(BUFFER), %xmm0 + addpd 2 * SIZE(BUFFER), %xmm1 + addpd 4 * SIZE(BUFFER), %xmm2 + addpd 6 * SIZE(BUFFER), %xmm3 + addpd 8 * SIZE(BUFFER), %xmm4 + addpd 10 * SIZE(BUFFER), %xmm5 + addpd 12 * SIZE(BUFFER), %xmm6 + addpd 14 * SIZE(BUFFER), %xmm7 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm1, 0 * SIZE(Y1) + movhpd %xmm1, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm2, 0 * SIZE(Y1) + movhpd %xmm2, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm3, 0 * SIZE(Y1) + movhpd %xmm3, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm4, 0 * SIZE(Y1) + movhpd %xmm4, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm5, 0 * SIZE(Y1) + movhpd %xmm5, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm6, 0 * SIZE(Y1) + movhpd %xmm6, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm7, 0 * SIZE(Y1) + movhpd %xmm7, 1 * SIZE(Y1) + addq INCY, Y1 + + subq $-16 * SIZE, BUFFER + decq %rax + jg .L992 + ALIGN_3 + +.L994: + testq $7, M + jle .L999 + + testq $4, M + jle .L995 + + movsd 0 * SIZE(Y), %xmm0 + movhpd 1 * SIZE(Y), %xmm0 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm1 + movhpd 1 * SIZE(Y), %xmm1 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm2 + movhpd 1 * SIZE(Y), %xmm2 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm3 + movhpd 1 * SIZE(Y), %xmm3 + addq INCY, Y + + addpd 0 * SIZE(BUFFER), %xmm0 + addpd 2 * SIZE(BUFFER), %xmm1 + addpd 4 * SIZE(BUFFER), %xmm2 + addpd 6 * SIZE(BUFFER), %xmm3 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm1, 0 * SIZE(Y1) + movhpd %xmm1, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm2, 0 * SIZE(Y1) + movhpd %xmm2, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm3, 0 * SIZE(Y1) + movhpd %xmm3, 1 * SIZE(Y1) + addq INCY, Y1 + + addq $8 * SIZE, BUFFER + ALIGN_3 + +.L995: + testq $2, M + jle .L996 + + movsd 0 * SIZE(Y), %xmm0 + movhpd 1 * SIZE(Y), %xmm0 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm1 + movhpd 1 * SIZE(Y), %xmm1 + addq INCY, Y + + addpd 0 * SIZE(BUFFER), %xmm0 + addpd 2 * SIZE(BUFFER), %xmm1 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + addq INCY, Y1 + + movlpd %xmm1, 0 * SIZE(Y1) + movhpd %xmm1, 1 * SIZE(Y1) + addq INCY, Y1 + + addq $4 * SIZE, BUFFER + ALIGN_3 + +.L996: + testq $1, M + jle .L999 + + movsd 0 * SIZE(Y), %xmm0 + movhpd 1 * SIZE(Y), %xmm0 + + addpd 0 * SIZE(BUFFER), %xmm0 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + ALIGN_3 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemv_t.S b/kernel/x86_64/zgemv_t.S new file mode 100644 index 0000000000..d7f9d49fe9 --- /dev/null +++ b/kernel/x86_64/zgemv_t.S @@ -0,0 +1,2433 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "l2param.h" + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_INCX 8 + STACKSIZE(%rsp) +#define OLD_Y 16 + STACKSIZE(%rsp) +#define OLD_INCY 24 + STACKSIZE(%rsp) +#define OLD_BUFFER 32 + STACKSIZE(%rsp) + +#define M %rdi +#define N %rsi +#define A %rcx +#define LDA %r8 +#define X %r9 +#define INCX %rdx +#define Y %rbp +#define INCY %r10 + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_LDA 56 + STACKSIZE(%rsp) +#define OLD_X 64 + STACKSIZE(%rsp) +#define OLD_INCX 72 + STACKSIZE(%rsp) +#define OLD_Y 80 + STACKSIZE(%rsp) +#define OLD_INCY 88 + STACKSIZE(%rsp) +#define OLD_BUFFER 96 + STACKSIZE(%rsp) + +#define M %rcx +#define N %rdx +#define A %r8 +#define LDA %r9 +#define X %rdi +#define INCX %rsi +#define Y %rbp +#define INCY %r10 + +#endif + +#define I %rax +#define J %rbx +#define A1 %r11 +#define A2 %r12 + +#define X1 %r13 +#define Y1 %r14 +#define BUFFER %r15 + +#define ALPHA_R %xmm14 +#define ALPHA_I %xmm15 + +#undef SUBPD + +#ifndef CONJ +#define SUBPD addpd +#else +#define SUBPD subpd +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq OLD_A, A + movq OLD_LDA, LDA + movq OLD_X, X + + movaps %xmm3, %xmm0 + movss OLD_ALPHA_I, %xmm1 +#endif 
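+/* Pick up the remaining stack-passed arguments, turn lda/incx/incy from   */
+/* complex-element counts into byte strides via ZBASE_SHIFT, and broadcast */
+/* alpha_r and alpha_i across both lanes of ALPHA_R/ALPHA_I for the packed */
+/* complex multiplies; x is then gathered into BUFFER with unit stride.    */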
+ + movq OLD_INCX, INCX + movq OLD_Y, Y + movq OLD_INCY, INCY + movq OLD_BUFFER, BUFFER + + salq $ZBASE_SHIFT, LDA + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + +#ifdef HAVE_SSE3 + movddup %xmm0, ALPHA_R + movddup %xmm1, ALPHA_I +#else + pshufd $0x44, %xmm0, ALPHA_R + pshufd $0x44, %xmm1, ALPHA_I +#endif + + subq $-16 * SIZE, A + + testq M, M + jle .L999 + testq N, N + jle .L999 + ALIGN_3 + + movq BUFFER, X1 + + movq Y, Y1 + + movq M, I + sarq $2, I + jle .L05 + ALIGN_4 + +.L02: + movsd 0 * SIZE(X), %xmm0 + movhpd 1 * SIZE(X), %xmm0 + addq INCX, X + + movsd 0 * SIZE(X), %xmm1 + movhpd 1 * SIZE(X), %xmm1 + addq INCX, X + + movsd 0 * SIZE(X), %xmm2 + movhpd 1 * SIZE(X), %xmm2 + addq INCX, X + + movsd 0 * SIZE(X), %xmm3 + movhpd 1 * SIZE(X), %xmm3 + addq INCX, X + + movapd %xmm0, 0 * SIZE(X1) + movapd %xmm1, 2 * SIZE(X1) + movapd %xmm2, 4 * SIZE(X1) + movapd %xmm3, 6 * SIZE(X1) + + addq $8 * SIZE, X1 + decq I + jg .L02 + ALIGN_4 + +.L05: + movq M, I + andq $3, I + jle .L10 + ALIGN_2 + +.L06: + movsd 0 * SIZE(X), %xmm0 + movhpd 1 * SIZE(X), %xmm0 + addq INCX, X + movapd %xmm0, 0 * SIZE(X1) + addq $2 * SIZE, X1 + decq I + jg .L06 + ALIGN_4 + +.L10: +#ifdef ALIGNED_ACCESS + testq $SIZE, A + jne .L100 +#endif + +#if GEMV_UNROLL >= 4 + + cmpq $4, N + jl .L20 + ALIGN_3 + +.L11: + subq $4, N + + leaq 16 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA, 2), A2 + leaq (A1, LDA, 4), A + + MOVUPS_XL1(-16 * SIZE, X1, %xmm12) + xorpd %xmm0, %xmm0 + xorpd %xmm1, %xmm1 + xorpd %xmm2, %xmm2 + xorpd %xmm3, %xmm3 + MOVUPS_XL1(-14 * SIZE, X1, %xmm13) + xorpd %xmm4, %xmm4 + xorpd %xmm5, %xmm5 + xorpd %xmm6, %xmm6 + xorpd %xmm7, %xmm7 + +#ifdef PREFETCHW + PREFETCHW 3 * SIZE(Y1) +#endif + + movq M, I + sarq $2, I + jle .L15 + + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm10) + + decq I + jle .L14 + ALIGN_3 + +.L13: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1) +#endif + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-16 * SIZE, A2, %xmm8) + mulpd %xmm12, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm10) + mulpd %xmm12, %xmm11 + SUBPD %xmm11, %xmm3 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm4 + MOVUPS_A1(-14 * SIZE, A1, %xmm8) + mulpd %xmm12, %xmm9 + SUBPD %xmm9, %xmm5 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm6 + MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm10) + mulpd %xmm12, %xmm11 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + SUBPD %xmm11, %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA) +#endif + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-14 * SIZE, A2, %xmm8) + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm10) + mulpd %xmm13, %xmm11 + SUBPD %xmm11, %xmm3 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm4 + MOVUPS_A1(-12 * SIZE, A1, %xmm8) + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm5 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm6 + MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm10) + mulpd %xmm13, %xmm11 + MOVUPS_XL1(-10 * SIZE, X1, %xmm13) + SUBPD %xmm11, %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2) +#endif + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-12 * SIZE, A2, %xmm8) + 
mulpd %xmm12, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm10) + mulpd %xmm12, %xmm11 + SUBPD %xmm11, %xmm3 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm4 + MOVUPS_A1(-10 * SIZE, A1, %xmm8) + mulpd %xmm12, %xmm9 + SUBPD %xmm9, %xmm5 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm6 + MOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm10) + mulpd %xmm12, %xmm11 + MOVUPS_XL1( -8 * SIZE, X1, %xmm12) + SUBPD %xmm11, %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA) +#endif + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-10 * SIZE, A2, %xmm8) + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm10) + mulpd %xmm13, %xmm11 + SUBPD %xmm11, %xmm3 + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(X1) +#endif + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm4 + MOVUPS_A1( -8 * SIZE, A1, %xmm8) + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm5 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm6 + MOVUPS_A2( -8 * SIZE, A1, LDA, 1, %xmm10) + mulpd %xmm13, %xmm11 + MOVUPS_XL1( -6 * SIZE, X1, %xmm13) + SUBPD %xmm11, %xmm7 + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, X1 + + subq $1, I + BRANCH + jg .L13 + ALIGN_3 + +.L14: + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-16 * SIZE, A2, %xmm8) + mulpd %xmm12, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm10) + mulpd %xmm12, %xmm11 + SUBPD %xmm11, %xmm3 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm4 + MOVUPS_A1(-14 * SIZE, A1, %xmm8) + mulpd %xmm12, %xmm9 + SUBPD %xmm9, %xmm5 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm6 + MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm10) + mulpd %xmm12, %xmm11 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + SUBPD %xmm11, %xmm7 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-14 * SIZE, A2, %xmm8) + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm10) + mulpd %xmm13, %xmm11 + SUBPD %xmm11, %xmm3 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm4 + MOVUPS_A1(-12 * SIZE, A1, %xmm8) + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm5 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm6 + MOVUPS_A2(-12 * SIZE, A1, LDA, 1, %xmm10) + mulpd %xmm13, %xmm11 + MOVUPS_XL1(-10 * SIZE, X1, %xmm13) + SUBPD %xmm11, %xmm7 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-12 * SIZE, A2, %xmm8) + mulpd %xmm12, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A2(-12 * SIZE, A2, LDA, 1, %xmm10) + mulpd %xmm12, %xmm11 + SUBPD %xmm11, %xmm3 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm4 + MOVUPS_A1(-10 * SIZE, A1, %xmm8) + mulpd %xmm12, %xmm9 + SUBPD %xmm9, %xmm5 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm6 + MOVUPS_A2(-10 * SIZE, A1, LDA, 1, %xmm10) + mulpd %xmm12, %xmm11 + MOVUPS_XL1( -8 * SIZE, X1, %xmm12) + SUBPD %xmm11, %xmm7 + + pshufd 
$0x4e, %xmm8, %xmm9 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-10 * SIZE, A2, %xmm8) + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A2(-10 * SIZE, A2, LDA, 1, %xmm10) + mulpd %xmm13, %xmm11 + SUBPD %xmm11, %xmm3 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm4 + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm5 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm6 + mulpd %xmm13, %xmm11 + MOVUPS_XL1( -6 * SIZE, X1, %xmm13) + SUBPD %xmm11, %xmm7 + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, X1 + ALIGN_3 + +.L15: + testq $2, M + je .L17 + + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm10) + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-16 * SIZE, A2, %xmm8) + mulpd %xmm12, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm10) + mulpd %xmm12, %xmm11 + SUBPD %xmm11, %xmm3 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm4 + MOVUPS_A1(-14 * SIZE, A1, %xmm8) + mulpd %xmm12, %xmm9 + SUBPD %xmm9, %xmm5 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm6 + MOVUPS_A2(-14 * SIZE, A1, LDA, 1, %xmm10) + mulpd %xmm12, %xmm11 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + SUBPD %xmm11, %xmm7 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-14 * SIZE, A2, %xmm8) + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A2(-14 * SIZE, A2, LDA, 1, %xmm10) + mulpd %xmm13, %xmm11 + SUBPD %xmm11, %xmm3 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm4 + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm5 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm6 + mulpd %xmm13, %xmm11 + SUBPD %xmm11, %xmm7 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + ALIGN_3 + +.L17: + testq $1, M + je .L19 + + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + MOVUPS_A2(-16 * SIZE, A1, LDA, 1, %xmm10) + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-16 * SIZE, A2, %xmm8) + mulpd %xmm12, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A2(-16 * SIZE, A2, LDA, 1, %xmm10) + mulpd %xmm12, %xmm11 + SUBPD %xmm11, %xmm3 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm4 + mulpd %xmm12, %xmm9 + SUBPD %xmm9, %xmm5 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm6 + mulpd %xmm12, %xmm11 + SUBPD %xmm11, %xmm7 + ALIGN_3 + +.L19: + pcmpeqb %xmm13, %xmm13 + psllq $63, %xmm13 + shufps $0xc0, %xmm13, %xmm13 + +#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) + xorpd %xmm13, %xmm0 + xorpd %xmm13, %xmm2 + xorpd %xmm13, %xmm4 + xorpd %xmm13, %xmm6 +#else + xorpd %xmm13, %xmm1 + xorpd %xmm13, %xmm3 + xorpd %xmm13, %xmm5 + xorpd %xmm13, %xmm7 +#endif + +#ifdef HAVE_SSE3 + haddpd %xmm1, %xmm0 + haddpd %xmm3, %xmm2 + + haddpd %xmm5, %xmm4 + haddpd %xmm7, %xmm6 +#else + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd %xmm2, %xmm9 + unpcklpd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm9 + + movapd %xmm4, %xmm10 + unpcklpd %xmm5, %xmm4 + unpckhpd %xmm5, %xmm10 + + movapd %xmm6, %xmm11 + unpcklpd %xmm7, %xmm6 + unpckhpd %xmm7, %xmm11 + + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm2 
+ addpd %xmm10, %xmm4 + addpd %xmm11, %xmm6 +#endif + + pshufd $0x4e, %xmm0, %xmm1 + pshufd $0x4e, %xmm2, %xmm3 + pshufd $0x4e, %xmm4, %xmm5 + pshufd $0x4e, %xmm6, %xmm7 + + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm1 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm3 + + mulpd ALPHA_R, %xmm4 + mulpd ALPHA_I, %xmm5 + mulpd ALPHA_R, %xmm6 + mulpd ALPHA_I, %xmm7 + + xorpd %xmm13, %xmm1 + xorpd %xmm13, %xmm3 + xorpd %xmm13, %xmm5 + xorpd %xmm13, %xmm7 + + subpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movsd 0 * SIZE(Y), %xmm1 + movhpd 1 * SIZE(Y), %xmm1 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm3 + movhpd 1 * SIZE(Y), %xmm3 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm5 + movhpd 1 * SIZE(Y), %xmm5 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm7 + movhpd 1 * SIZE(Y), %xmm7 + addq INCY, Y + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + addq INCY, Y1 + movlpd %xmm2, 0 * SIZE(Y1) + movhpd %xmm2, 1 * SIZE(Y1) + addq INCY, Y1 + movlpd %xmm4, 0 * SIZE(Y1) + movhpd %xmm4, 1 * SIZE(Y1) + addq INCY, Y1 + movlpd %xmm6, 0 * SIZE(Y1) + movhpd %xmm6, 1 * SIZE(Y1) + addq INCY, Y1 + + cmpq $4, N + jge .L11 + ALIGN_3 + +.L20: +#endif + +#if GEMV_UNROLL >= 2 + + cmpq $2, N + jl .L30 + +#if GEMV_UNROLL == 2 + ALIGN_3 + +.L21: +#endif + + subq $2, N + + leaq 16 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA), A2 + leaq (A1, LDA, 2), A + + xorpd %xmm0, %xmm0 + xorpd %xmm1, %xmm1 + xorpd %xmm2, %xmm2 + xorpd %xmm3, %xmm3 + + MOVUPS_XL1(-16 * SIZE, X1, %xmm4) + MOVUPS_XL1(-14 * SIZE, X1, %xmm5) + +#ifdef PREFETCHW + PREFETCHW 3 * SIZE(Y1) +#endif + + movq M, I + sarq $2, I + jle .L25 + + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + MOVUPS_A1(-16 * SIZE, A2, %xmm10) + MOVUPS_A1(-14 * SIZE, A1, %xmm12) + MOVUPS_A1(-14 * SIZE, A2, %xmm6) + + decq I + jle .L24 + ALIGN_3 + +.L23: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) +#endif + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-12 * SIZE, A1, %xmm8) + mulpd %xmm4, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm4, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A1(-12 * SIZE, A2, %xmm10) + mulpd %xmm4, %xmm11 + SUBPD %xmm11, %xmm3 + + MOVUPS_XL1(-12 * SIZE, X1, %xmm4) + + pshufd $0x4e, %xmm12, %xmm13 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm0 + MOVUPS_A1(-10 * SIZE, A1, %xmm12) + mulpd %xmm5, %xmm13 + SUBPD %xmm13, %xmm1 + + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm5, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A1(-10 * SIZE, A2, %xmm6) + mulpd %xmm5, %xmm7 + SUBPD %xmm7, %xmm3 + + MOVUPS_XL1(-10 * SIZE, X1, %xmm5) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) +#endif + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1( -8 * SIZE, A1, %xmm8) + mulpd %xmm4, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm4, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A1( -8 * SIZE, A2, %xmm10) + mulpd %xmm4, %xmm11 + SUBPD %xmm11, %xmm3 + + MOVUPS_XL1( -8 * SIZE, X1, %xmm4) + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1) +#endif + + pshufd $0x4e, %xmm12, %xmm13 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm0 + MOVUPS_A1( -6 * SIZE, A1, %xmm12) + mulpd %xmm5, %xmm13 + SUBPD %xmm13, %xmm1 + + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm5, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A1( -6 * SIZE, A2, %xmm6) + mulpd %xmm5, %xmm7 + SUBPD %xmm7, %xmm3 + + MOVUPS_XL1( -6 * SIZE, X1, %xmm5) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, 
A2 + subq $-8 * SIZE, X1 + + subq $1, I + BRANCH + jg .L23 + ALIGN_3 + +.L24: + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-12 * SIZE, A1, %xmm8) + mulpd %xmm4, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm4, %xmm10 + addpd %xmm10, %xmm2 + MOVUPS_A1(-12 * SIZE, A2, %xmm10) + mulpd %xmm4, %xmm11 + SUBPD %xmm11, %xmm3 + + MOVUPS_XL1(-12 * SIZE, X1, %xmm4) + + pshufd $0x4e, %xmm12, %xmm13 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm0 + MOVUPS_A1(-10 * SIZE, A1, %xmm12) + mulpd %xmm5, %xmm13 + SUBPD %xmm13, %xmm1 + + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm5, %xmm6 + addpd %xmm6, %xmm2 + MOVUPS_A1(-10 * SIZE, A2, %xmm6) + mulpd %xmm5, %xmm7 + SUBPD %xmm7, %xmm3 + + MOVUPS_XL1(-10 * SIZE, X1, %xmm5) + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm4, %xmm10 + addpd %xmm10, %xmm2 + mulpd %xmm4, %xmm11 + SUBPD %xmm11, %xmm3 + + MOVUPS_XL1( -8 * SIZE, X1, %xmm4) + + pshufd $0x4e, %xmm12, %xmm13 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm0 + mulpd %xmm5, %xmm13 + SUBPD %xmm13, %xmm1 + + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm5, %xmm6 + addpd %xmm6, %xmm2 + mulpd %xmm5, %xmm7 + SUBPD %xmm7, %xmm3 + + MOVUPS_XL1( -6 * SIZE, X1, %xmm5) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, X1 + ALIGN_3 + +.L25: + testq $2, M + je .L27 + + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + MOVUPS_A1(-16 * SIZE, A2, %xmm10) + + MOVUPS_A1(-14 * SIZE, A1, %xmm12) + MOVUPS_A1(-14 * SIZE, A2, %xmm6) + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm4, %xmm10 + addpd %xmm10, %xmm2 + mulpd %xmm4, %xmm11 + SUBPD %xmm11, %xmm3 + + MOVUPS_XL1(-12 * SIZE, X1, %xmm4) + + pshufd $0x4e, %xmm12, %xmm13 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm0 + mulpd %xmm5, %xmm13 + SUBPD %xmm13, %xmm1 + + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm5, %xmm6 + addpd %xmm6, %xmm2 + mulpd %xmm5, %xmm7 + SUBPD %xmm7, %xmm3 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + ALIGN_3 + +.L27: + testq $1, M + je .L29 + + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + MOVUPS_A1(-16 * SIZE, A2, %xmm10) + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm4, %xmm10 + addpd %xmm10, %xmm2 + mulpd %xmm4, %xmm11 + SUBPD %xmm11, %xmm3 + ALIGN_3 + +.L29: + pcmpeqb %xmm11, %xmm11 + psllq $63, %xmm11 + shufps $0xc0, %xmm11, %xmm11 + +#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) + xorpd %xmm11, %xmm0 + xorpd %xmm11, %xmm2 +#else + xorpd %xmm11, %xmm1 + xorpd %xmm11, %xmm3 +#endif + +#ifdef HAVE_SSE3 + haddpd %xmm1, %xmm0 + haddpd %xmm3, %xmm2 +#else + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd %xmm2, %xmm9 + unpcklpd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm9 + + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm2 +#endif + + pshufd $0x4e, %xmm0, %xmm1 + pshufd $0x4e, %xmm2, %xmm3 + + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm1 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm3 + + xorpd %xmm11, %xmm1 + xorpd %xmm11, %xmm3 + + subpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movsd 0 * SIZE(Y), %xmm4 + movhpd 1 * SIZE(Y), %xmm4 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm5 + movhpd 1 * SIZE(Y), %xmm5 + addq INCY, Y + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm2 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + addq INCY, Y1 + movlpd %xmm2, 0 * SIZE(Y1) 
+ movhpd %xmm2, 1 * SIZE(Y1) + addq INCY, Y1 + +#if GEMV_UNROLL == 2 + cmpq $2, N + jge .L21 +#endif + ALIGN_3 + +.L30: +#endif + + cmpq $1, N + jl .L999 + +#if GEMV_UNROLL == 1 +.L31: + decq N +#endif + + leaq 16 * SIZE(BUFFER), X1 + + movq A, A1 +#if GEMV_UNROLL == 1 + addq LDA, A +#endif + + xorpd %xmm0, %xmm0 + xorpd %xmm1, %xmm1 + + MOVUPS_XL1(-16 * SIZE, X1, %xmm4) + MOVUPS_XL1(-14 * SIZE, X1, %xmm5) + + movq M, I + sarq $2, I + jle .L35 + + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + MOVUPS_A1(-14 * SIZE, A1, %xmm12) + + decq I + jle .L34 + ALIGN_3 + +.L33: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) +#endif + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-12 * SIZE, A1, %xmm8) + mulpd %xmm4, %xmm9 + SUBPD %xmm9, %xmm1 + + MOVUPS_XL1(-12 * SIZE, X1, %xmm4) + + pshufd $0x4e, %xmm12, %xmm13 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm0 + MOVUPS_A1(-10 * SIZE, A1, %xmm12) + mulpd %xmm5, %xmm13 + SUBPD %xmm13, %xmm1 + + MOVUPS_XL1(-10 * SIZE, X1, %xmm5) + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1) +#endif + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1( -8 * SIZE, A1, %xmm8) + mulpd %xmm4, %xmm9 + SUBPD %xmm9, %xmm1 + + MOVUPS_XL1( -8 * SIZE, X1, %xmm4) + + pshufd $0x4e, %xmm12, %xmm13 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm0 + MOVUPS_A1( -6 * SIZE, A1, %xmm12) + mulpd %xmm5, %xmm13 + SUBPD %xmm13, %xmm1 + + MOVUPS_XL1(-6 * SIZE, X1, %xmm5) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, X1 + + subq $1, I + BRANCH + jg .L33 + ALIGN_3 + +.L34: + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + MOVUPS_A1(-12 * SIZE, A1, %xmm8) + mulpd %xmm4, %xmm9 + SUBPD %xmm9, %xmm1 + + MOVUPS_XL1(-12 * SIZE, X1, %xmm4) + + pshufd $0x4e, %xmm12, %xmm13 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm0 + MOVUPS_A1(-10 * SIZE, A1, %xmm12) + mulpd %xmm5, %xmm13 + SUBPD %xmm13, %xmm1 + + MOVUPS_XL1(-10 * SIZE, X1, %xmm5) + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm9 + SUBPD %xmm9, %xmm1 + + MOVUPS_XL1( -8 * SIZE, X1, %xmm4) + + pshufd $0x4e, %xmm12, %xmm13 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm0 + mulpd %xmm5, %xmm13 + SUBPD %xmm13, %xmm1 + + MOVUPS_XL1(-6 * SIZE, X1, %xmm5) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, X1 + ALIGN_3 + +.L35: + testq $2, M + je .L37 + + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + MOVUPS_A1(-14 * SIZE, A1, %xmm12) + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm9 + SUBPD %xmm9, %xmm1 + + MOVUPS_XL1(-12 * SIZE, X1, %xmm4) + + pshufd $0x4e, %xmm12, %xmm13 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm0 + mulpd %xmm5, %xmm13 + SUBPD %xmm13, %xmm1 + + addq $4 * SIZE, A1 + ALIGN_3 + +.L37: + testq $1, M + je .L39 + + MOVUPS_A1(-16 * SIZE, A1, %xmm8) + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm9 + SUBPD %xmm9, %xmm1 + ALIGN_3 + +.L39: + pcmpeqb %xmm11, %xmm11 + psllq $63, %xmm11 + shufps $0xc0, %xmm11, %xmm11 + +#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) + xorpd %xmm11, %xmm0 +#else + xorpd %xmm11, %xmm1 +#endif + +#ifdef HAVE_SSE3 + haddpd %xmm1, %xmm0 +#else + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + addpd %xmm8, %xmm0 +#endif + + pshufd $0x4e, %xmm0, %xmm1 + + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm1 + + xorpd %xmm11, %xmm1 + + subpd %xmm1, %xmm0 + + movsd 0 * SIZE(Y), %xmm4 + movhpd 1 * SIZE(Y), %xmm4 + + addpd %xmm4, %xmm0 + + movlpd %xmm0, 0 * SIZE(Y1) + 
movhpd %xmm0, 1 * SIZE(Y1) + +#if GEMV_UNROLL == 1 + addq INCY, Y + addq INCY, Y1 + + cmpq $1, N + jge .L31 +#endif + +#ifdef ALIGNED_ACCESS + jmp .L999 + ALIGN_3 + +.L100: +#if GEMV_UNROLL >= 4 + + cmpq $4, N + jl .L110 + ALIGN_3 + +.L101: + subq $4, N + + leaq 16 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA, 2), A2 + leaq (A1, LDA, 4), A + + MOVUPS_XL1(-16 * SIZE, X1, %xmm12) + xorpd %xmm0, %xmm0 + xorpd %xmm1, %xmm1 + xorpd %xmm2, %xmm2 + xorpd %xmm3, %xmm3 + MOVUPS_XL1(-14 * SIZE, X1, %xmm13) + xorpd %xmm4, %xmm4 + xorpd %xmm5, %xmm5 + xorpd %xmm6, %xmm6 + xorpd %xmm7, %xmm7 + +#ifdef PREFETCHW + PREFETCHW 3 * SIZE(Y1) +#endif + + movq M, I + sarq $2, I + jle .L105 + + movsd -16 * SIZE(A1), %xmm8 + movhpd -15 * SIZE(A1), %xmm8 + + movsd -16 * SIZE(A1, LDA), %xmm10 + movhpd -15 * SIZE(A1, LDA), %xmm10 + + decq I + jle .L104 + ALIGN_3 + +.L103: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1) +#endif + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + movsd -16 * SIZE(A2), %xmm8 + movhpd -15 * SIZE(A2), %xmm8 + mulpd %xmm12, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + movsd -16 * SIZE(A2, LDA), %xmm10 + movhpd -15 * SIZE(A2, LDA), %xmm10 + mulpd %xmm12, %xmm11 + SUBPD %xmm11, %xmm3 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm4 + movsd -14 * SIZE(A1), %xmm8 + movhpd -13 * SIZE(A1), %xmm8 + mulpd %xmm12, %xmm9 + SUBPD %xmm9, %xmm5 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm6 + movsd -14 * SIZE(A1, LDA), %xmm10 + movhpd -13 * SIZE(A1, LDA), %xmm10 + mulpd %xmm12, %xmm11 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + SUBPD %xmm11, %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA) +#endif + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + movsd -14 * SIZE(A2), %xmm8 + movhpd -13 * SIZE(A2), %xmm8 + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + movsd -14 * SIZE(A2, LDA), %xmm10 + movhpd -13 * SIZE(A2, LDA), %xmm10 + mulpd %xmm13, %xmm11 + SUBPD %xmm11, %xmm3 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm4 + movsd -12 * SIZE(A1), %xmm8 + movhpd -11 * SIZE(A1), %xmm8 + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm5 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm6 + movsd -12 * SIZE(A1, LDA), %xmm10 + movhpd -11 * SIZE(A1, LDA), %xmm10 + mulpd %xmm13, %xmm11 + MOVUPS_XL1(-10 * SIZE, X1, %xmm13) + SUBPD %xmm11, %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2) +#endif + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + movsd -12 * SIZE(A2), %xmm8 + movhpd -11 * SIZE(A2), %xmm8 + mulpd %xmm12, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + movsd -12 * SIZE(A2, LDA), %xmm10 + movhpd -11 * SIZE(A2, LDA), %xmm10 + mulpd %xmm12, %xmm11 + SUBPD %xmm11, %xmm3 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm4 + movsd -10 * SIZE(A1), %xmm8 + movhpd -9 * SIZE(A1), %xmm8 + mulpd %xmm12, %xmm9 + SUBPD %xmm9, %xmm5 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm6 + movsd -10 * SIZE(A1, LDA), %xmm10 + movhpd -9 * SIZE(A1, LDA), %xmm10 + mulpd %xmm12, %xmm11 + MOVUPS_XL1( -8 * SIZE, X1, %xmm12) + SUBPD %xmm11, %xmm7 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA) +#endif + + pshufd $0x4e, %xmm8, 
%xmm9 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + movsd -10 * SIZE(A2), %xmm8 + movhpd -9 * SIZE(A2), %xmm8 + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + movsd -10 * SIZE(A2, LDA), %xmm10 + movhpd -9 * SIZE(A2, LDA), %xmm10 + mulpd %xmm13, %xmm11 + SUBPD %xmm11, %xmm3 + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(X1) +#endif + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm4 + movsd -8 * SIZE(A1), %xmm8 + movhpd -7 * SIZE(A1), %xmm8 + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm5 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm6 + movsd -8 * SIZE(A1, LDA), %xmm10 + movhpd -7 * SIZE(A1, LDA), %xmm10 + mulpd %xmm13, %xmm11 + MOVUPS_XL1( -6 * SIZE, X1, %xmm13) + SUBPD %xmm11, %xmm7 + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, X1 + + subq $1, I + BRANCH + jg .L103 + ALIGN_3 + +.L104: + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + movsd -16 * SIZE(A2), %xmm8 + movhpd -15 * SIZE(A2), %xmm8 + mulpd %xmm12, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + movsd -16 * SIZE(A2, LDA), %xmm10 + movhpd -15 * SIZE(A2, LDA), %xmm10 + mulpd %xmm12, %xmm11 + SUBPD %xmm11, %xmm3 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm4 + movsd -14 * SIZE(A1), %xmm8 + movhpd -13 * SIZE(A1), %xmm8 + mulpd %xmm12, %xmm9 + SUBPD %xmm9, %xmm5 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm6 + movsd -14 * SIZE(A1, LDA), %xmm10 + movhpd -13 * SIZE(A1, LDA), %xmm10 + mulpd %xmm12, %xmm11 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + SUBPD %xmm11, %xmm7 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + movsd -14 * SIZE(A2), %xmm8 + movhpd -13 * SIZE(A2), %xmm8 + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + movsd -14 * SIZE(A2, LDA), %xmm10 + movhpd -13 * SIZE(A2, LDA), %xmm10 + mulpd %xmm13, %xmm11 + SUBPD %xmm11, %xmm3 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm4 + movsd -12 * SIZE(A1), %xmm8 + movhpd -11 * SIZE(A1), %xmm8 + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm5 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm6 + movsd -12 * SIZE(A1, LDA), %xmm10 + movhpd -11 * SIZE(A1, LDA), %xmm10 + mulpd %xmm13, %xmm11 + MOVUPS_XL1(-10 * SIZE, X1, %xmm13) + SUBPD %xmm11, %xmm7 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + movsd -12 * SIZE(A2), %xmm8 + movhpd -11 * SIZE(A2), %xmm8 + mulpd %xmm12, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + movsd -12 * SIZE(A2, LDA), %xmm10 + movhpd -11 * SIZE(A2, LDA), %xmm10 + mulpd %xmm12, %xmm11 + SUBPD %xmm11, %xmm3 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm4 + movsd -10 * SIZE(A1), %xmm8 + movhpd -9 * SIZE(A1), %xmm8 + mulpd %xmm12, %xmm9 + SUBPD %xmm9, %xmm5 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm6 + movsd -10 * SIZE(A1, LDA), %xmm10 + movhpd -9 * SIZE(A1, LDA), %xmm10 + mulpd %xmm12, %xmm11 + MOVUPS_XL1( -8 * SIZE, X1, %xmm12) + SUBPD %xmm11, %xmm7 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + movsd -10 * SIZE(A2), %xmm8 + movhpd -9 * SIZE(A2), %xmm8 + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm13, %xmm10 + addpd %xmm10, 
%xmm2 + movsd -10 * SIZE(A2, LDA), %xmm10 + movhpd -9 * SIZE(A2, LDA), %xmm10 + mulpd %xmm13, %xmm11 + SUBPD %xmm11, %xmm3 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm4 + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm5 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm6 + mulpd %xmm13, %xmm11 + MOVUPS_XL1( -6 * SIZE, X1, %xmm13) + SUBPD %xmm11, %xmm7 + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, X1 + ALIGN_3 + +.L105: + testq $2, M + je .L107 + + movsd -16 * SIZE(A1), %xmm8 + movhpd -15 * SIZE(A1), %xmm8 + + movsd -16 * SIZE(A1, LDA), %xmm10 + movhpd -15 * SIZE(A1, LDA), %xmm10 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + movsd -16 * SIZE(A2), %xmm8 + movhpd -15 * SIZE(A2), %xmm8 + mulpd %xmm12, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + movsd -16 * SIZE(A2, LDA), %xmm10 + movhpd -15 * SIZE(A2, LDA), %xmm10 + mulpd %xmm12, %xmm11 + SUBPD %xmm11, %xmm3 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm4 + movsd -14 * SIZE(A1), %xmm8 + movhpd -13 * SIZE(A1), %xmm8 + mulpd %xmm12, %xmm9 + SUBPD %xmm9, %xmm5 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm6 + movsd -14 * SIZE(A1, LDA), %xmm10 + movhpd -13 * SIZE(A1, LDA), %xmm10 + mulpd %xmm12, %xmm11 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + SUBPD %xmm11, %xmm7 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + movsd -14 * SIZE(A2), %xmm8 + movhpd -13 * SIZE(A2), %xmm8 + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + movsd -14 * SIZE(A2, LDA), %xmm10 + movhpd -13 * SIZE(A2, LDA), %xmm10 + mulpd %xmm13, %xmm11 + SUBPD %xmm11, %xmm3 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm4 + mulpd %xmm13, %xmm9 + SUBPD %xmm9, %xmm5 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm6 + mulpd %xmm13, %xmm11 + SUBPD %xmm11, %xmm7 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + ALIGN_3 + +.L107: + testq $1, M + je .L109 + + movsd -16 * SIZE(A1), %xmm8 + movhpd -15 * SIZE(A1), %xmm8 + + movsd -16 * SIZE(A1, LDA), %xmm10 + movhpd -15 * SIZE(A1, LDA), %xmm10 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + movsd -16 * SIZE(A2), %xmm8 + movhpd -15 * SIZE(A2), %xmm8 + mulpd %xmm12, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + movsd -16 * SIZE(A2, LDA), %xmm10 + movhpd -15 * SIZE(A2, LDA), %xmm10 + mulpd %xmm12, %xmm11 + SUBPD %xmm11, %xmm3 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm4 + mulpd %xmm12, %xmm9 + SUBPD %xmm9, %xmm5 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm6 + mulpd %xmm12, %xmm11 + SUBPD %xmm11, %xmm7 + ALIGN_3 + +.L109: + pcmpeqb %xmm13, %xmm13 + psllq $63, %xmm13 + shufps $0xc0, %xmm13, %xmm13 + +#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) + xorpd %xmm13, %xmm0 + xorpd %xmm13, %xmm2 + xorpd %xmm13, %xmm4 + xorpd %xmm13, %xmm6 +#else + xorpd %xmm13, %xmm1 + xorpd %xmm13, %xmm3 + xorpd %xmm13, %xmm5 + xorpd %xmm13, %xmm7 +#endif + +#ifdef HAVE_SSE3 + haddpd %xmm1, %xmm0 + haddpd %xmm3, %xmm2 + + haddpd %xmm5, %xmm4 + haddpd %xmm7, %xmm6 +#else + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd %xmm2, %xmm9 + unpcklpd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm9 + + movapd %xmm4, %xmm10 + unpcklpd 
%xmm5, %xmm4 + unpckhpd %xmm5, %xmm10 + + movapd %xmm6, %xmm11 + unpcklpd %xmm7, %xmm6 + unpckhpd %xmm7, %xmm11 + + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm2 + addpd %xmm10, %xmm4 + addpd %xmm11, %xmm6 +#endif + + pshufd $0x4e, %xmm0, %xmm1 + pshufd $0x4e, %xmm2, %xmm3 + pshufd $0x4e, %xmm4, %xmm5 + pshufd $0x4e, %xmm6, %xmm7 + + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm1 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm3 + + mulpd ALPHA_R, %xmm4 + mulpd ALPHA_I, %xmm5 + mulpd ALPHA_R, %xmm6 + mulpd ALPHA_I, %xmm7 + + xorpd %xmm13, %xmm1 + xorpd %xmm13, %xmm3 + xorpd %xmm13, %xmm5 + xorpd %xmm13, %xmm7 + + subpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movsd 0 * SIZE(Y), %xmm1 + movhpd 1 * SIZE(Y), %xmm1 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm3 + movhpd 1 * SIZE(Y), %xmm3 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm5 + movhpd 1 * SIZE(Y), %xmm5 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm7 + movhpd 1 * SIZE(Y), %xmm7 + addq INCY, Y + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + addq INCY, Y1 + movlpd %xmm2, 0 * SIZE(Y1) + movhpd %xmm2, 1 * SIZE(Y1) + addq INCY, Y1 + movlpd %xmm4, 0 * SIZE(Y1) + movhpd %xmm4, 1 * SIZE(Y1) + addq INCY, Y1 + movlpd %xmm6, 0 * SIZE(Y1) + movhpd %xmm6, 1 * SIZE(Y1) + addq INCY, Y1 + + cmpq $4, N + jge .L101 + ALIGN_3 + +.L110: +#endif + +#if GEMV_UNROLL >= 2 + + cmpq $2, N + jl .L120 + +#if GEMV_UNROLL == 2 + ALIGN_3 + +.L111: +#endif + + subq $2, N + + leaq 16 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA), A2 + leaq (A1, LDA, 2), A + + xorpd %xmm0, %xmm0 + xorpd %xmm1, %xmm1 + xorpd %xmm2, %xmm2 + xorpd %xmm3, %xmm3 + + MOVUPS_XL1(-16 * SIZE, X1, %xmm4) + MOVUPS_XL1(-14 * SIZE, X1, %xmm5) + +#ifdef PREFETCHW + PREFETCHW 3 * SIZE(Y1) +#endif + + movq M, I + sarq $2, I + jle .L115 + + movsd -16 * SIZE(A1), %xmm8 + movhpd -15 * SIZE(A1), %xmm8 + movsd -16 * SIZE(A2), %xmm10 + movhpd -15 * SIZE(A2), %xmm10 + + movsd -14 * SIZE(A1), %xmm12 + movhpd -13 * SIZE(A1), %xmm12 + movsd -14 * SIZE(A2), %xmm6 + movhpd -13 * SIZE(A2), %xmm6 + + decq I + jle .L114 + ALIGN_3 + +.L113: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) +#endif + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + movsd -12 * SIZE(A1), %xmm8 + movhpd -11 * SIZE(A1), %xmm8 + mulpd %xmm4, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm4, %xmm10 + addpd %xmm10, %xmm2 + movsd -12 * SIZE(A2), %xmm10 + movhpd -11 * SIZE(A2), %xmm10 + mulpd %xmm4, %xmm11 + SUBPD %xmm11, %xmm3 + + MOVUPS_XL1(-12 * SIZE, X1, %xmm4) + + pshufd $0x4e, %xmm12, %xmm13 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm0 + movsd -10 * SIZE(A1), %xmm12 + movhpd -9 * SIZE(A1), %xmm12 + mulpd %xmm5, %xmm13 + SUBPD %xmm13, %xmm1 + + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm5, %xmm6 + addpd %xmm6, %xmm2 + movsd -10 * SIZE(A2), %xmm6 + movhpd -9 * SIZE(A2), %xmm6 + mulpd %xmm5, %xmm7 + SUBPD %xmm7, %xmm3 + + MOVUPS_XL1(-10 * SIZE, X1, %xmm5) + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) +#endif + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + movsd -8 * SIZE(A1), %xmm8 + movhpd -7 * SIZE(A1), %xmm8 + mulpd %xmm4, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm4, %xmm10 + addpd %xmm10, %xmm2 + movsd -8 * SIZE(A2), %xmm10 + movhpd -7 * SIZE(A2), %xmm10 + mulpd %xmm4, %xmm11 + SUBPD %xmm11, %xmm3 + + MOVUPS_XL1( -8 * SIZE, X1, %xmm4) + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) 
* 2 - 128 + PREOFFSET(X1) +#endif + + pshufd $0x4e, %xmm12, %xmm13 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm0 + movsd -6 * SIZE(A1), %xmm12 + movhpd -5 * SIZE(A1), %xmm12 + mulpd %xmm5, %xmm13 + SUBPD %xmm13, %xmm1 + + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm5, %xmm6 + addpd %xmm6, %xmm2 + movsd -6 * SIZE(A2), %xmm6 + movhpd -5 * SIZE(A2), %xmm6 + mulpd %xmm5, %xmm7 + SUBPD %xmm7, %xmm3 + + MOVUPS_XL1( -6 * SIZE, X1, %xmm5) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, X1 + + subq $1, I + BRANCH + jg .L113 + ALIGN_3 + +.L114: + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + movsd -12 * SIZE(A1), %xmm8 + movhpd -11 * SIZE(A1), %xmm8 + mulpd %xmm4, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm4, %xmm10 + addpd %xmm10, %xmm2 + movsd -12 * SIZE(A2), %xmm10 + movhpd -11 * SIZE(A2), %xmm10 + mulpd %xmm4, %xmm11 + SUBPD %xmm11, %xmm3 + + MOVUPS_XL1(-12 * SIZE, X1, %xmm4) + + pshufd $0x4e, %xmm12, %xmm13 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm0 + movsd -10 * SIZE(A1), %xmm12 + movhpd -9 * SIZE(A1), %xmm12 + mulpd %xmm5, %xmm13 + SUBPD %xmm13, %xmm1 + + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm5, %xmm6 + addpd %xmm6, %xmm2 + movsd -10 * SIZE(A2), %xmm6 + movhpd -9 * SIZE(A2), %xmm6 + mulpd %xmm5, %xmm7 + SUBPD %xmm7, %xmm3 + + MOVUPS_XL1(-10 * SIZE, X1, %xmm5) + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm4, %xmm10 + addpd %xmm10, %xmm2 + mulpd %xmm4, %xmm11 + SUBPD %xmm11, %xmm3 + + MOVUPS_XL1( -8 * SIZE, X1, %xmm4) + + pshufd $0x4e, %xmm12, %xmm13 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm0 + mulpd %xmm5, %xmm13 + SUBPD %xmm13, %xmm1 + + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm5, %xmm6 + addpd %xmm6, %xmm2 + mulpd %xmm5, %xmm7 + SUBPD %xmm7, %xmm3 + + MOVUPS_XL1( -6 * SIZE, X1, %xmm5) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, X1 + ALIGN_3 + +.L115: + testq $2, M + je .L117 + + movsd -16 * SIZE(A1), %xmm8 + movhpd -15 * SIZE(A1), %xmm8 + movsd -16 * SIZE(A2), %xmm10 + movhpd -15 * SIZE(A2), %xmm10 + + movsd -14 * SIZE(A1), %xmm12 + movhpd -13 * SIZE(A1), %xmm12 + movsd -14 * SIZE(A2), %xmm6 + movhpd -13 * SIZE(A2), %xmm6 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm4, %xmm10 + addpd %xmm10, %xmm2 + mulpd %xmm4, %xmm11 + SUBPD %xmm11, %xmm3 + + MOVUPS_XL1(-12 * SIZE, X1, %xmm4) + + pshufd $0x4e, %xmm12, %xmm13 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm0 + mulpd %xmm5, %xmm13 + SUBPD %xmm13, %xmm1 + + pshufd $0x4e, %xmm6, %xmm7 + mulpd %xmm5, %xmm6 + addpd %xmm6, %xmm2 + mulpd %xmm5, %xmm7 + SUBPD %xmm7, %xmm3 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + ALIGN_3 + +.L117: + testq $1, M + je .L119 + + movsd -16 * SIZE(A1), %xmm8 + movhpd -15 * SIZE(A1), %xmm8 + movsd -16 * SIZE(A2), %xmm10 + movhpd -15 * SIZE(A2), %xmm10 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm9 + SUBPD %xmm9, %xmm1 + + pshufd $0x4e, %xmm10, %xmm11 + mulpd %xmm4, %xmm10 + addpd %xmm10, %xmm2 + mulpd %xmm4, %xmm11 + SUBPD %xmm11, %xmm3 + ALIGN_3 + +.L119: + pcmpeqb %xmm11, %xmm11 + psllq $63, %xmm11 + shufps $0xc0, %xmm11, %xmm11 + +#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) + xorpd %xmm11, %xmm0 + xorpd %xmm11, %xmm2 +#else + xorpd %xmm11, %xmm1 + xorpd %xmm11, %xmm3 +#endif + +#ifdef HAVE_SSE3 + haddpd %xmm1, %xmm0 + haddpd %xmm3, 
%xmm2 +#else + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + movapd %xmm2, %xmm9 + unpcklpd %xmm3, %xmm2 + unpckhpd %xmm3, %xmm9 + + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm2 +#endif + + pshufd $0x4e, %xmm0, %xmm1 + pshufd $0x4e, %xmm2, %xmm3 + + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm1 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm3 + + xorpd %xmm11, %xmm1 + xorpd %xmm11, %xmm3 + + subpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movsd 0 * SIZE(Y), %xmm4 + movhpd 1 * SIZE(Y), %xmm4 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm5 + movhpd 1 * SIZE(Y), %xmm5 + addq INCY, Y + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm2 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + addq INCY, Y1 + movlpd %xmm2, 0 * SIZE(Y1) + movhpd %xmm2, 1 * SIZE(Y1) + addq INCY, Y1 + +#if GEMV_UNROLL == 2 + cmpq $2, N + jge .L111 +#endif + ALIGN_3 + +.L120: +#endif + + cmpq $1, N + jl .L999 + +#if GEMV_UNROLL == 1 +.L121: + decq N +#endif + + leaq 16 * SIZE(BUFFER), X1 + + movq A, A1 +#if GEMV_UNROLL == 1 + addq LDA, A +#endif + + xorpd %xmm0, %xmm0 + xorpd %xmm1, %xmm1 + + MOVUPS_XL1(-16 * SIZE, X1, %xmm4) + MOVUPS_XL1(-14 * SIZE, X1, %xmm5) + + movq M, I + sarq $2, I + jle .L125 + + movsd -16 * SIZE(A1), %xmm8 + movhpd -15 * SIZE(A1), %xmm8 + movsd -14 * SIZE(A1), %xmm12 + movhpd -13 * SIZE(A1), %xmm12 + + decq I + jle .L124 + ALIGN_3 + +.L123: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) +#endif + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + movsd -12 * SIZE(A1), %xmm8 + movhpd -11 * SIZE(A1), %xmm8 + mulpd %xmm4, %xmm9 + SUBPD %xmm9, %xmm1 + + MOVUPS_XL1(-12 * SIZE, X1, %xmm4) + + pshufd $0x4e, %xmm12, %xmm13 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm0 + movsd -10 * SIZE(A1), %xmm12 + movhpd -9 * SIZE(A1), %xmm12 + mulpd %xmm5, %xmm13 + SUBPD %xmm13, %xmm1 + + MOVUPS_XL1(-10 * SIZE, X1, %xmm5) + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1) +#endif + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + movsd -8 * SIZE(A1), %xmm8 + movhpd -7 * SIZE(A1), %xmm8 + mulpd %xmm4, %xmm9 + SUBPD %xmm9, %xmm1 + + MOVUPS_XL1( -8 * SIZE, X1, %xmm4) + + pshufd $0x4e, %xmm12, %xmm13 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm0 + movsd -6 * SIZE(A1), %xmm12 + movhpd -5 * SIZE(A1), %xmm12 + mulpd %xmm5, %xmm13 + SUBPD %xmm13, %xmm1 + + MOVUPS_XL1(-6 * SIZE, X1, %xmm5) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, X1 + + subq $1, I + BRANCH + jg .L123 + ALIGN_3 + +.L124: + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + movsd -12 * SIZE(A1), %xmm8 + movhpd -11 * SIZE(A1), %xmm8 + mulpd %xmm4, %xmm9 + SUBPD %xmm9, %xmm1 + + MOVUPS_XL1(-12 * SIZE, X1, %xmm4) + + pshufd $0x4e, %xmm12, %xmm13 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm0 + movsd -10 * SIZE(A1), %xmm12 + movhpd -9 * SIZE(A1), %xmm12 + mulpd %xmm5, %xmm13 + SUBPD %xmm13, %xmm1 + + MOVUPS_XL1(-10 * SIZE, X1, %xmm5) + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm9 + SUBPD %xmm9, %xmm1 + + MOVUPS_XL1( -8 * SIZE, X1, %xmm4) + + pshufd $0x4e, %xmm12, %xmm13 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm0 + mulpd %xmm5, %xmm13 + SUBPD %xmm13, %xmm1 + + MOVUPS_XL1(-6 * SIZE, X1, %xmm5) + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, X1 + ALIGN_3 + +.L125: + testq $2, M + je .L127 + + movsd -16 * SIZE(A1), %xmm8 + movhpd -15 * SIZE(A1), %xmm8 + movsd -14 * SIZE(A1), %xmm12 + movhpd -13 * SIZE(A1), %xmm12 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm9 + SUBPD 
%xmm9, %xmm1 + + MOVUPS_XL1(-12 * SIZE, X1, %xmm4) + + pshufd $0x4e, %xmm12, %xmm13 + mulpd %xmm5, %xmm12 + addpd %xmm12, %xmm0 + mulpd %xmm5, %xmm13 + SUBPD %xmm13, %xmm1 + + addq $4 * SIZE, A1 + ALIGN_3 + +.L127: + testq $1, M + je .L129 + + movsd -16 * SIZE(A1), %xmm8 + movhpd -15 * SIZE(A1), %xmm8 + + pshufd $0x4e, %xmm8, %xmm9 + mulpd %xmm4, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm4, %xmm9 + SUBPD %xmm9, %xmm1 + ALIGN_3 + +.L129: + pcmpeqb %xmm11, %xmm11 + psllq $63, %xmm11 + shufps $0xc0, %xmm11, %xmm11 + +#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) + xorpd %xmm11, %xmm0 +#else + xorpd %xmm11, %xmm1 +#endif + +#ifdef HAVE_SSE3 + haddpd %xmm1, %xmm0 +#else + movapd %xmm0, %xmm8 + unpcklpd %xmm1, %xmm0 + unpckhpd %xmm1, %xmm8 + + addpd %xmm8, %xmm0 +#endif + + pshufd $0x4e, %xmm0, %xmm1 + + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm1 + + xorpd %xmm11, %xmm1 + + subpd %xmm1, %xmm0 + + movsd 0 * SIZE(Y), %xmm4 + movhpd 1 * SIZE(Y), %xmm4 + + addpd %xmm4, %xmm0 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + +#if GEMV_UNROLL == 1 + addq INCY, Y + addq INCY, Y1 + + cmpq $1, N + jge .L121 +#endif + + +#endif + ALIGN_3 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemv_t_atom.S b/kernel/x86_64/zgemv_t_atom.S new file mode 100644 index 0000000000..5d3ecdd69c --- /dev/null +++ b/kernel/x86_64/zgemv_t_atom.S @@ -0,0 +1,968 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "l2param.h" + +#ifdef ATOM +#define PREFETCH prefetchnta +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (8 * 6) +#endif + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_INCX 8 + STACKSIZE(%rsp) +#define OLD_Y 16 + STACKSIZE(%rsp) +#define OLD_INCY 24 + STACKSIZE(%rsp) +#define OLD_BUFFER 32 + STACKSIZE(%rsp) + +#define M %rdi +#define N %rsi +#define A %rcx +#define LDA %r8 +#define X %r9 +#define INCX %rdx +#define Y %rbp +#define INCY %r10 + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_LDA 56 + STACKSIZE(%rsp) +#define OLD_X 64 + STACKSIZE(%rsp) +#define OLD_INCX 72 + STACKSIZE(%rsp) +#define OLD_Y 80 + STACKSIZE(%rsp) +#define OLD_INCY 88 + STACKSIZE(%rsp) +#define OLD_BUFFER 96 + STACKSIZE(%rsp) + +#define M %rcx +#define N %rdx +#define A %r8 +#define LDA %r9 +#define X %rdi +#define INCX %rsi +#define Y %rbp +#define INCY %r10 + +#endif + +#define I %rax +#define J %rbx +#define A1 %r11 +#define A2 %r12 + +#define X1 %r13 +#define Y1 %r14 +#define BUFFER %r15 + +#define ALPHA_R %xmm14 +#define ALPHA_I %xmm15 + +#if !defined(CONJ) && !defined(XCONJ) +#define ADD1 addsd +#define ADD2 addsd +#define ADD3 subsd +#define ADD4 addsd +#endif + +#if defined(CONJ) && !defined(XCONJ) +#define ADD1 addsd +#define ADD2 addsd +#define ADD3 addsd +#define ADD4 subsd +#endif + +#if !defined(CONJ) && defined(XCONJ) +#define ADD1 addsd +#define ADD2 subsd +#define ADD3 addsd +#define ADD4 addsd +#endif + +#if defined(CONJ) && defined(XCONJ) +#define ADD1 addsd +#define ADD2 subsd +#define ADD3 subsd +#define ADD4 subsd +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq OLD_A, A + movq OLD_LDA, LDA + movq OLD_X, X + + movaps %xmm3, %xmm0 + movss OLD_ALPHA_I, %xmm1 +#endif + + movq OLD_INCX, INCX + movq OLD_Y, Y + movq OLD_INCY, INCY + movq OLD_BUFFER, BUFFER + + salq $ZBASE_SHIFT, LDA + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + + movaps %xmm0, ALPHA_R + movaps %xmm1, ALPHA_I + + subq $-16 * SIZE, A + + testq M, M + jle .L999 + testq N, N + jle .L999 + ALIGN_3 + + movq BUFFER, X1 + + movq Y, Y1 + + movq M, I + sarq $2, I + jle .L05 + ALIGN_4 + +.L02: + movsd 0 * SIZE(X), %xmm0 + movhpd 1 * SIZE(X), %xmm0 + addq INCX, X + + movsd 0 * SIZE(X), %xmm1 + movhpd 1 * SIZE(X), %xmm1 + addq INCX, X + + movsd 0 * SIZE(X), %xmm2 + movhpd 1 * SIZE(X), %xmm2 + addq INCX, X + + movsd 0 * SIZE(X), %xmm3 + movhpd 1 * SIZE(X), %xmm3 + addq INCX, X + + movapd %xmm0, 0 * SIZE(X1) + movapd %xmm1, 2 * SIZE(X1) + movapd %xmm2, 4 * SIZE(X1) + movapd %xmm3, 6 * SIZE(X1) + + addq $8 * SIZE, X1 + decq I + jg .L02 + ALIGN_4 + +.L05: + movq M, I + andq $3, I + jle .L10 + 
ALIGN_2 + +.L06: + movsd 0 * SIZE(X), %xmm0 + movhpd 1 * SIZE(X), %xmm0 + addq INCX, X + movapd %xmm0, 0 * SIZE(X1) + addq $2 * SIZE, X1 + decq I + jg .L06 + ALIGN_4 + +.L10: + movq N, J + sarq $1, J + jle .L20 + ALIGN_3 + +.L11: + leaq 16 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA), A2 + leaq (A1, LDA, 2), A + + xorpd %xmm0, %xmm0 + xorpd %xmm1, %xmm1 + xorpd %xmm2, %xmm2 + xorpd %xmm3, %xmm3 + + movsd -16 * SIZE(X1), %xmm4 + movsd -15 * SIZE(X1), %xmm5 + movsd -14 * SIZE(X1), %xmm6 + movsd -13 * SIZE(X1), %xmm7 + +#ifdef PREFETCHW + PREFETCHW 3 * SIZE(Y1) +#endif + + movq M, I + sarq $2, I + jle .L15 + + movsd -16 * SIZE(A1), %xmm8 + movsd -15 * SIZE(A1), %xmm9 + movsd -16 * SIZE(A2), %xmm10 + movsd -15 * SIZE(A2), %xmm11 + + movapd %xmm8, %xmm12 + mulsd %xmm4, %xmm8 + mulsd %xmm5, %xmm12 + + decq I + jle .L14 + ALIGN_3 + +.L13: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) * SIZE(A1) +#endif + + movapd %xmm9, %xmm13 + mulsd %xmm5, %xmm9 + ADD1 %xmm8, %xmm0 + movsd -14 * SIZE(A1), %xmm8 + mulsd %xmm4, %xmm13 + ADD2 %xmm12, %xmm1 + + movapd %xmm10, %xmm12 + mulsd %xmm4, %xmm10 + ADD3 %xmm9, %xmm0 + movsd -13 * SIZE(A1), %xmm9 + mulsd %xmm5, %xmm12 + ADD4 %xmm13, %xmm1 + + movapd %xmm11, %xmm13 + mulsd %xmm5, %xmm11 + movsd -11 * SIZE(X1), %xmm5 + ADD1 %xmm10, %xmm2 + movsd -14 * SIZE(A2), %xmm10 + mulsd %xmm4, %xmm13 + movsd -12 * SIZE(X1), %xmm4 + ADD2 %xmm12, %xmm3 + + movapd %xmm8, %xmm12 + mulsd %xmm6, %xmm8 + ADD3 %xmm11, %xmm2 + movsd -13 * SIZE(A2), %xmm11 + + mulsd %xmm7, %xmm12 + ADD4 %xmm13, %xmm3 + + movapd %xmm9, %xmm13 + mulsd %xmm7, %xmm9 + ADD1 %xmm8, %xmm0 + movsd -12 * SIZE(A1), %xmm8 + mulsd %xmm6, %xmm13 + ADD2 %xmm12, %xmm1 + + movapd %xmm10, %xmm12 + mulsd %xmm6, %xmm10 + ADD3 %xmm9, %xmm0 + movsd -11 * SIZE(A1), %xmm9 + mulsd %xmm7, %xmm12 + ADD4 %xmm13, %xmm1 + + movapd %xmm11, %xmm13 + mulsd %xmm7, %xmm11 + movsd -9 * SIZE(X1), %xmm7 + ADD1 %xmm10, %xmm2 + movsd -12 * SIZE(A2), %xmm10 + mulsd %xmm6, %xmm13 + movsd -10 * SIZE(X1), %xmm6 + ADD2 %xmm12, %xmm3 + + movapd %xmm8, %xmm12 + mulsd %xmm4, %xmm8 + ADD3 %xmm11, %xmm2 + movsd -11 * SIZE(A2), %xmm11 + mulsd %xmm5, %xmm12 + ADD4 %xmm13, %xmm3 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) * SIZE(A2) +#endif + + movapd %xmm9, %xmm13 + mulsd %xmm5, %xmm9 + ADD1 %xmm8, %xmm0 + movsd -10 * SIZE(A1), %xmm8 + mulsd %xmm4, %xmm13 + ADD2 %xmm12, %xmm1 + + movapd %xmm10, %xmm12 + mulsd %xmm4, %xmm10 + ADD3 %xmm9, %xmm0 + movsd -9 * SIZE(A1), %xmm9 + mulsd %xmm5, %xmm12 + ADD4 %xmm13, %xmm1 + + movapd %xmm11, %xmm13 + mulsd %xmm5, %xmm11 + movsd -7 * SIZE(X1), %xmm5 + ADD1 %xmm10, %xmm2 + movsd -10 * SIZE(A2), %xmm10 + mulsd %xmm4, %xmm13 + movsd -8 * SIZE(X1), %xmm4 + ADD2 %xmm12, %xmm3 + + movapd %xmm8, %xmm12 + mulsd %xmm6, %xmm8 + ADD3 %xmm11, %xmm2 + movsd -9 * SIZE(A2), %xmm11 + + mulsd %xmm7, %xmm12 + ADD4 %xmm13, %xmm3 + + movapd %xmm9, %xmm13 + mulsd %xmm7, %xmm9 + ADD1 %xmm8, %xmm0 + movsd -8 * SIZE(A1), %xmm8 + mulsd %xmm6, %xmm13 + ADD2 %xmm12, %xmm1 + + movapd %xmm10, %xmm12 + mulsd %xmm6, %xmm10 + ADD3 %xmm9, %xmm0 + movsd -7 * SIZE(A1), %xmm9 + mulsd %xmm7, %xmm12 + ADD4 %xmm13, %xmm1 + + movapd %xmm11, %xmm13 + mulsd %xmm7, %xmm11 + movsd -5 * SIZE(X1), %xmm7 + ADD1 %xmm10, %xmm2 + movsd -8 * SIZE(A2), %xmm10 + mulsd %xmm6, %xmm13 + movsd -6 * SIZE(X1), %xmm6 + ADD2 %xmm12, %xmm3 + + movapd %xmm8, %xmm12 + subq $-8 * SIZE, A1 + mulsd %xmm4, %xmm8 + subq $-8 * SIZE, X1 + ADD3 %xmm11, %xmm2 + movsd -7 * SIZE(A2), %xmm11 + mulsd %xmm5, %xmm12 + subq $-8 * SIZE, A2 + ADD4 %xmm13, %xmm3 + + subq $1, I 
+ BRANCH + jg .L13 + ALIGN_3 + +.L14: + movapd %xmm9, %xmm13 + mulsd %xmm5, %xmm9 + ADD1 %xmm8, %xmm0 + movsd -14 * SIZE(A1), %xmm8 + mulsd %xmm4, %xmm13 + ADD2 %xmm12, %xmm1 + + movapd %xmm10, %xmm12 + mulsd %xmm4, %xmm10 + ADD3 %xmm9, %xmm0 + movsd -13 * SIZE(A1), %xmm9 + mulsd %xmm5, %xmm12 + ADD4 %xmm13, %xmm1 + + movapd %xmm11, %xmm13 + mulsd %xmm5, %xmm11 + movsd -11 * SIZE(X1), %xmm5 + ADD1 %xmm10, %xmm2 + movsd -14 * SIZE(A2), %xmm10 + mulsd %xmm4, %xmm13 + movsd -12 * SIZE(X1), %xmm4 + ADD2 %xmm12, %xmm3 + + movapd %xmm8, %xmm12 + mulsd %xmm6, %xmm8 + ADD3 %xmm11, %xmm2 + movsd -13 * SIZE(A2), %xmm11 + + mulsd %xmm7, %xmm12 + ADD4 %xmm13, %xmm3 + + movapd %xmm9, %xmm13 + mulsd %xmm7, %xmm9 + ADD1 %xmm8, %xmm0 + movsd -12 * SIZE(A1), %xmm8 + mulsd %xmm6, %xmm13 + ADD2 %xmm12, %xmm1 + + movapd %xmm10, %xmm12 + mulsd %xmm6, %xmm10 + ADD3 %xmm9, %xmm0 + movsd -11 * SIZE(A1), %xmm9 + mulsd %xmm7, %xmm12 + ADD4 %xmm13, %xmm1 + + movapd %xmm11, %xmm13 + mulsd %xmm7, %xmm11 + movsd -9 * SIZE(X1), %xmm7 + ADD1 %xmm10, %xmm2 + movsd -12 * SIZE(A2), %xmm10 + mulsd %xmm6, %xmm13 + movsd -10 * SIZE(X1), %xmm6 + ADD2 %xmm12, %xmm3 + + movapd %xmm8, %xmm12 + mulsd %xmm4, %xmm8 + ADD3 %xmm11, %xmm2 + movsd -11 * SIZE(A2), %xmm11 + mulsd %xmm5, %xmm12 + ADD4 %xmm13, %xmm3 + + movapd %xmm9, %xmm13 + mulsd %xmm5, %xmm9 + ADD1 %xmm8, %xmm0 + movsd -10 * SIZE(A1), %xmm8 + mulsd %xmm4, %xmm13 + ADD2 %xmm12, %xmm1 + + movapd %xmm10, %xmm12 + mulsd %xmm4, %xmm10 + ADD3 %xmm9, %xmm0 + movsd -9 * SIZE(A1), %xmm9 + mulsd %xmm5, %xmm12 + ADD4 %xmm13, %xmm1 + + movapd %xmm11, %xmm13 + mulsd %xmm5, %xmm11 + movsd -7 * SIZE(X1), %xmm5 + ADD1 %xmm10, %xmm2 + movsd -10 * SIZE(A2), %xmm10 + mulsd %xmm4, %xmm13 + movsd -8 * SIZE(X1), %xmm4 + ADD2 %xmm12, %xmm3 + + movapd %xmm8, %xmm12 + mulsd %xmm6, %xmm8 + ADD3 %xmm11, %xmm2 + movsd -9 * SIZE(A2), %xmm11 + + mulsd %xmm7, %xmm12 + ADD4 %xmm13, %xmm3 + + movapd %xmm9, %xmm13 + mulsd %xmm7, %xmm9 + ADD1 %xmm8, %xmm0 + mulsd %xmm6, %xmm13 + ADD2 %xmm12, %xmm1 + + movapd %xmm10, %xmm12 + mulsd %xmm6, %xmm10 + ADD3 %xmm9, %xmm0 + mulsd %xmm7, %xmm12 + ADD4 %xmm13, %xmm1 + + movapd %xmm11, %xmm13 + mulsd %xmm7, %xmm11 + movsd -5 * SIZE(X1), %xmm7 + ADD1 %xmm10, %xmm2 + mulsd %xmm6, %xmm13 + movsd -6 * SIZE(X1), %xmm6 + ADD2 %xmm12, %xmm3 + + ADD3 %xmm11, %xmm2 + ADD4 %xmm13, %xmm3 + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, X1 + ALIGN_3 + +.L15: + testq $2, M + je .L17 + + movsd -16 * SIZE(A1), %xmm8 + movsd -15 * SIZE(A1), %xmm9 + movsd -16 * SIZE(A2), %xmm10 + movsd -15 * SIZE(A2), %xmm11 + + movapd %xmm8, %xmm12 + mulsd %xmm4, %xmm8 + mulsd %xmm5, %xmm12 + + movapd %xmm9, %xmm13 + mulsd %xmm5, %xmm9 + ADD1 %xmm8, %xmm0 + movsd -14 * SIZE(A1), %xmm8 + mulsd %xmm4, %xmm13 + ADD2 %xmm12, %xmm1 + + movapd %xmm10, %xmm12 + mulsd %xmm4, %xmm10 + ADD3 %xmm9, %xmm0 + movsd -13 * SIZE(A1), %xmm9 + mulsd %xmm5, %xmm12 + ADD4 %xmm13, %xmm1 + + movapd %xmm11, %xmm13 + mulsd %xmm5, %xmm11 + movsd -11 * SIZE(X1), %xmm5 + ADD1 %xmm10, %xmm2 + movsd -14 * SIZE(A2), %xmm10 + mulsd %xmm4, %xmm13 + movsd -12 * SIZE(X1), %xmm4 + ADD2 %xmm12, %xmm3 + + movapd %xmm8, %xmm12 + mulsd %xmm6, %xmm8 + ADD3 %xmm11, %xmm2 + movsd -13 * SIZE(A2), %xmm11 + + mulsd %xmm7, %xmm12 + ADD4 %xmm13, %xmm3 + + movapd %xmm9, %xmm13 + mulsd %xmm7, %xmm9 + ADD1 %xmm8, %xmm0 + mulsd %xmm6, %xmm13 + ADD2 %xmm12, %xmm1 + + movapd %xmm10, %xmm12 + mulsd %xmm6, %xmm10 + ADD3 %xmm9, %xmm0 + mulsd %xmm7, %xmm12 + ADD4 %xmm13, %xmm1 + + movapd %xmm11, %xmm13 + mulsd %xmm7, %xmm11 + ADD1 %xmm10, 
%xmm2 + mulsd %xmm6, %xmm13 + ADD2 %xmm12, %xmm3 + + ADD3 %xmm11, %xmm2 + ADD4 %xmm13, %xmm3 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + ALIGN_3 + +.L17: + testq $1, M + je .L19 + + movsd -16 * SIZE(A1), %xmm8 + movsd -15 * SIZE(A1), %xmm9 + movsd -16 * SIZE(A2), %xmm10 + movsd -15 * SIZE(A2), %xmm11 + + movapd %xmm8, %xmm12 + mulsd %xmm4, %xmm8 + mulsd %xmm5, %xmm12 + + movapd %xmm9, %xmm13 + mulsd %xmm5, %xmm9 + ADD1 %xmm8, %xmm0 + mulsd %xmm4, %xmm13 + ADD2 %xmm12, %xmm1 + + movapd %xmm10, %xmm12 + mulsd %xmm4, %xmm10 + ADD3 %xmm9, %xmm0 + mulsd %xmm5, %xmm12 + ADD4 %xmm13, %xmm1 + + movapd %xmm11, %xmm13 + mulsd %xmm5, %xmm11 + ADD1 %xmm10, %xmm2 + mulsd %xmm4, %xmm13 + ADD2 %xmm12, %xmm3 + + ADD3 %xmm11, %xmm2 + ADD4 %xmm13, %xmm3 + ALIGN_3 + +.L19: + movsd 0 * SIZE(Y), %xmm4 + movapd %xmm0, %xmm10 + mulsd ALPHA_R, %xmm0 + movsd 1 * SIZE(Y), %xmm5 + movapd %xmm1, %xmm11 + mulsd ALPHA_R, %xmm1 + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm6 + movapd %xmm2, %xmm12 + mulsd ALPHA_R, %xmm2 + movsd 1 * SIZE(Y), %xmm7 + movapd %xmm3, %xmm13 + mulsd ALPHA_R, %xmm3 + addq INCY, Y + + mulsd ALPHA_I, %xmm10 + mulsd ALPHA_I, %xmm11 + mulsd ALPHA_I, %xmm12 + mulsd ALPHA_I, %xmm13 + + addsd %xmm10, %xmm1 + subsd %xmm11, %xmm0 + addsd %xmm12, %xmm3 + subsd %xmm13, %xmm2 + + addsd %xmm4, %xmm0 + addsd %xmm5, %xmm1 + addsd %xmm6, %xmm2 + addsd %xmm7, %xmm3 + + movlpd %xmm0, 0 * SIZE(Y1) + movlpd %xmm1, 1 * SIZE(Y1) + addq INCY, Y1 + movlpd %xmm2, 0 * SIZE(Y1) + movlpd %xmm3, 1 * SIZE(Y1) + addq INCY, Y1 + + decq J + jg .L11 + ALIGN_3 + +.L20: + testq $1, N + jle .L999 + + leaq 16 * SIZE(BUFFER), X1 + + movq A, A1 + + xorpd %xmm0, %xmm0 + xorpd %xmm1, %xmm1 + + movsd -16 * SIZE(X1), %xmm4 + movsd -15 * SIZE(X1), %xmm5 + movsd -14 * SIZE(X1), %xmm6 + movsd -13 * SIZE(X1), %xmm7 + + movq M, I + sarq $2, I + jle .L25 + + movsd -16 * SIZE(A1), %xmm8 + movsd -15 * SIZE(A1), %xmm9 + + movapd %xmm8, %xmm12 + mulsd %xmm4, %xmm8 + mulsd %xmm5, %xmm12 + + decq I + jle .L24 + ALIGN_3 + +.L23: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE + 0) * SIZE(A1) +#endif + + movapd %xmm9, %xmm13 + mulsd %xmm5, %xmm9 + movsd -11 * SIZE(X1), %xmm5 + ADD1 %xmm8, %xmm0 + movsd -14 * SIZE(A1), %xmm8 + mulsd %xmm4, %xmm13 + movsd -12 * SIZE(X1), %xmm4 + ADD2 %xmm12, %xmm1 + + movapd %xmm8, %xmm12 + mulsd %xmm6, %xmm8 + ADD3 %xmm9, %xmm0 + movsd -13 * SIZE(A1), %xmm9 + mulsd %xmm7, %xmm12 + ADD4 %xmm13, %xmm1 + + movapd %xmm9, %xmm13 + mulsd %xmm7, %xmm9 + movsd -9 * SIZE(X1), %xmm7 + ADD1 %xmm8, %xmm0 + movsd -12 * SIZE(A1), %xmm8 + mulsd %xmm6, %xmm13 + movsd -10 * SIZE(X1), %xmm6 + ADD2 %xmm12, %xmm1 + + movapd %xmm8, %xmm12 + mulsd %xmm4, %xmm8 + ADD3 %xmm9, %xmm0 + movsd -11 * SIZE(A1), %xmm9 + mulsd %xmm5, %xmm12 + ADD4 %xmm13, %xmm1 + + movapd %xmm9, %xmm13 + mulsd %xmm5, %xmm9 + movsd -7 * SIZE(X1), %xmm5 + ADD1 %xmm8, %xmm0 + movsd -10 * SIZE(A1), %xmm8 + mulsd %xmm4, %xmm13 + movsd -8 * SIZE(X1), %xmm4 + ADD2 %xmm12, %xmm1 + + movapd %xmm8, %xmm12 + mulsd %xmm6, %xmm8 + ADD3 %xmm9, %xmm0 + movsd -9 * SIZE(A1), %xmm9 + mulsd %xmm7, %xmm12 + ADD4 %xmm13, %xmm1 + + movapd %xmm9, %xmm13 + mulsd %xmm7, %xmm9 + movsd -5 * SIZE(X1), %xmm7 + ADD1 %xmm8, %xmm0 + movsd -8 * SIZE(A1), %xmm8 + mulsd %xmm6, %xmm13 + movsd -6 * SIZE(X1), %xmm6 + ADD2 %xmm12, %xmm1 + + movapd %xmm8, %xmm12 + mulsd %xmm4, %xmm8 + ADD3 %xmm9, %xmm0 + mulsd %xmm5, %xmm12 + movsd -7 * SIZE(A1), %xmm9 + ADD4 %xmm13, %xmm1 + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, X1 + subq $-8 * SIZE, A2 + + subq $1, I + BRANCH + jg .L23 + ALIGN_3 + +.L24: + movapd %xmm9, 
%xmm13 + mulsd %xmm5, %xmm9 + movsd -11 * SIZE(X1), %xmm5 + ADD1 %xmm8, %xmm0 + movsd -14 * SIZE(A1), %xmm8 + mulsd %xmm4, %xmm13 + movsd -12 * SIZE(X1), %xmm4 + ADD2 %xmm12, %xmm1 + + movapd %xmm8, %xmm12 + mulsd %xmm6, %xmm8 + ADD3 %xmm9, %xmm0 + movsd -13 * SIZE(A1), %xmm9 + mulsd %xmm7, %xmm12 + ADD4 %xmm13, %xmm1 + + movapd %xmm9, %xmm13 + mulsd %xmm7, %xmm9 + movsd -9 * SIZE(X1), %xmm7 + ADD1 %xmm8, %xmm0 + movsd -12 * SIZE(A1), %xmm8 + mulsd %xmm6, %xmm13 + movsd -10 * SIZE(X1), %xmm6 + ADD2 %xmm12, %xmm1 + + movapd %xmm8, %xmm12 + mulsd %xmm4, %xmm8 + ADD3 %xmm9, %xmm0 + movsd -11 * SIZE(A1), %xmm9 + mulsd %xmm5, %xmm12 + ADD4 %xmm13, %xmm1 + + movapd %xmm9, %xmm13 + mulsd %xmm5, %xmm9 + movsd -7 * SIZE(X1), %xmm5 + ADD1 %xmm8, %xmm0 + movsd -10 * SIZE(A1), %xmm8 + mulsd %xmm4, %xmm13 + movsd -8 * SIZE(X1), %xmm4 + ADD2 %xmm12, %xmm1 + + movapd %xmm8, %xmm12 + mulsd %xmm6, %xmm8 + ADD3 %xmm9, %xmm0 + movsd -9 * SIZE(A1), %xmm9 + mulsd %xmm7, %xmm12 + ADD4 %xmm13, %xmm1 + + movapd %xmm9, %xmm13 + mulsd %xmm7, %xmm9 + movsd -5 * SIZE(X1), %xmm7 + ADD1 %xmm8, %xmm0 + mulsd %xmm6, %xmm13 + movsd -6 * SIZE(X1), %xmm6 + ADD2 %xmm12, %xmm1 + + ADD3 %xmm9, %xmm0 + ADD4 %xmm13, %xmm1 + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, X1 + ALIGN_3 + +.L25: + testq $2, M + je .L27 + + movsd -16 * SIZE(A1), %xmm8 + movsd -15 * SIZE(A1), %xmm9 + + movapd %xmm8, %xmm12 + mulsd %xmm4, %xmm8 + mulsd %xmm5, %xmm12 + + movapd %xmm9, %xmm13 + mulsd %xmm5, %xmm9 + movsd -11 * SIZE(X1), %xmm5 + ADD1 %xmm8, %xmm0 + movsd -14 * SIZE(A1), %xmm8 + mulsd %xmm4, %xmm13 + movsd -12 * SIZE(X1), %xmm4 + ADD2 %xmm12, %xmm1 + + movapd %xmm8, %xmm12 + mulsd %xmm6, %xmm8 + ADD3 %xmm9, %xmm0 + movsd -13 * SIZE(A1), %xmm9 + mulsd %xmm7, %xmm12 + ADD4 %xmm13, %xmm1 + + movapd %xmm9, %xmm13 + mulsd %xmm7, %xmm9 + ADD1 %xmm8, %xmm0 + mulsd %xmm6, %xmm13 + ADD2 %xmm12, %xmm1 + + ADD3 %xmm9, %xmm0 + ADD4 %xmm13, %xmm1 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + ALIGN_3 + +.L27: + testq $1, M + je .L29 + + movsd -16 * SIZE(A1), %xmm8 + movsd -15 * SIZE(A1), %xmm9 + + movapd %xmm8, %xmm12 + mulsd %xmm4, %xmm8 + mulsd %xmm5, %xmm12 + + movapd %xmm9, %xmm13 + mulsd %xmm5, %xmm9 + ADD1 %xmm8, %xmm0 + mulsd %xmm4, %xmm13 + ADD2 %xmm12, %xmm1 + + ADD3 %xmm9, %xmm0 + ADD4 %xmm13, %xmm1 + ALIGN_3 + +.L29: + movsd 0 * SIZE(Y), %xmm4 + movapd %xmm0, %xmm10 + mulsd ALPHA_R, %xmm0 + movsd 1 * SIZE(Y), %xmm5 + movapd %xmm1, %xmm11 + mulsd ALPHA_R, %xmm1 + + mulsd ALPHA_I, %xmm10 + mulsd ALPHA_I, %xmm11 + + addsd %xmm10, %xmm1 + subsd %xmm11, %xmm0 + addsd %xmm4, %xmm0 + addsd %xmm5, %xmm1 + + movlpd %xmm0, 0 * SIZE(Y1) + movlpd %xmm1, 1 * SIZE(Y1) + ALIGN_3 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/zgemv_t_dup.S b/kernel/x86_64/zgemv_t_dup.S new file mode 100644 index 0000000000..2db99b6dd2 --- /dev/null +++ b/kernel/x86_64/zgemv_t_dup.S @@ -0,0 +1,1223 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "l2param.h" + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_INCX 8 + STACKSIZE(%rsp) +#define OLD_Y 16 + STACKSIZE(%rsp) +#define OLD_INCY 24 + STACKSIZE(%rsp) +#define OLD_BUFFER 32 + STACKSIZE(%rsp) + +#define M %rdi +#define N %rsi +#define A %rcx +#define LDA %r8 +#define X %r9 +#define INCX %rdx +#define Y %rbp +#define INCY %r10 + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_LDA 56 + STACKSIZE(%rsp) +#define OLD_X 64 + STACKSIZE(%rsp) +#define OLD_INCX 72 + STACKSIZE(%rsp) +#define OLD_Y 80 + STACKSIZE(%rsp) +#define OLD_INCY 88 + STACKSIZE(%rsp) +#define OLD_BUFFER 96 + STACKSIZE(%rsp) + +#define M %rcx +#define N %rdx +#define A %r8 +#define LDA %r9 +#define X %rdi +#define INCX %rsi +#define Y %rbp +#define INCY %r10 + +#endif + +#define I %rax +#define J %rbx +#define A1 %r11 +#define A2 %r12 + +#define X1 %r13 +#define Y1 %r14 +#define BUFFER %r15 + +#define ALPHA_R %xmm14 +#define ALPHA_I %xmm15 + +#undef SUBPD + +#ifndef CONJ +#define SUBPD addpd +#else +#define SUBPD subpd +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq OLD_A, A + movq OLD_LDA, LDA + movq OLD_X, X + + movaps %xmm3, %xmm0 + movss OLD_ALPHA_I, %xmm1 +#endif 
+ + movq OLD_INCX, INCX + movq OLD_Y, Y + movq OLD_INCY, INCY + movq OLD_BUFFER, BUFFER + + salq $ZBASE_SHIFT, LDA + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + + pcmpeqb %xmm5, %xmm5 + psllq $63, %xmm5 + shufps $0x04, %xmm5, %xmm5 + + unpcklpd %xmm1, %xmm0 + + movaps %xmm0, ALPHA_R + pshufd $0x4e, %xmm0, ALPHA_I + + xorps %xmm5, ALPHA_I + + subq $-16 * SIZE, A + + testq M, M + jle .L999 + testq N, N + jle .L999 + ALIGN_3 + + movq BUFFER, X1 + + movq Y, Y1 + + movq M, I + sarq $2, I + jle .L05 + ALIGN_4 + +.L02: + movsd 0 * SIZE(X), %xmm0 + movhpd 1 * SIZE(X), %xmm0 + addq INCX, X + + movsd 0 * SIZE(X), %xmm1 + movhpd 1 * SIZE(X), %xmm1 + addq INCX, X + + movsd 0 * SIZE(X), %xmm2 + movhpd 1 * SIZE(X), %xmm2 + addq INCX, X + + movsd 0 * SIZE(X), %xmm3 + movhpd 1 * SIZE(X), %xmm3 + addq INCX, X + + movapd %xmm0, 0 * SIZE(X1) + movapd %xmm1, 2 * SIZE(X1) + movapd %xmm2, 4 * SIZE(X1) + movapd %xmm3, 6 * SIZE(X1) + + addq $8 * SIZE, X1 + decq I + jg .L02 + ALIGN_4 + +.L05: + movq M, I + andq $3, I + jle .L10 + ALIGN_2 + +.L06: + movsd 0 * SIZE(X), %xmm0 + movhpd 1 * SIZE(X), %xmm0 + addq INCX, X + movapd %xmm0, 0 * SIZE(X1) + addq $2 * SIZE, X1 + decq I + jg .L06 + ALIGN_4 + +.L10: +#if GEMV_UNROLL >= 4 + + cmpq $4, N + jl .L20 + ALIGN_3 + +.L11: + subq $4, N + + leaq 16 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA, 2), A2 + leaq (A1, LDA, 4), A + + MOVUPS_XL1(-16 * SIZE, X1, %xmm12) + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + MOVUPS_XL1(-14 * SIZE, X1, %xmm13) + xorps %xmm4, %xmm4 + xorps %xmm5, %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + +#ifdef PREFETCHW + PREFETCHW 3 * SIZE(Y1) +#endif + + movq M, I + sarq $2, I + jle .L15 + + movddup -16 * SIZE(A1), %xmm8 + movddup -15 * SIZE(A1), %xmm9 + movddup -16 * SIZE(A1, LDA), %xmm10 + movddup -15 * SIZE(A1, LDA), %xmm11 + + decq I + jle .L14 + ALIGN_3 + +.L13: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1) +#endif + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + movddup -16 * SIZE(A2), %xmm8 + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + movddup -15 * SIZE(A2), %xmm9 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + movddup -16 * SIZE(A2, LDA), %xmm10 + mulpd %xmm12, %xmm11 + addpd %xmm11, %xmm3 + movddup -15 * SIZE(A2, LDA), %xmm11 + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm4 + movddup -14 * SIZE(A1), %xmm8 + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm5 + movddup -13 * SIZE(A1), %xmm9 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm6 + movddup -14 * SIZE(A1, LDA), %xmm10 + mulpd %xmm12, %xmm11 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + addpd %xmm11, %xmm7 + movddup -13 * SIZE(A1, LDA), %xmm11 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A1, LDA) +#endif + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + movddup -14 * SIZE(A2), %xmm8 + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm1 + movddup -13 * SIZE(A2), %xmm9 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + movddup -14 * SIZE(A2, LDA), %xmm10 + mulpd %xmm13, %xmm11 + addpd %xmm11, %xmm3 + movddup -13 * SIZE(A2, LDA), %xmm11 + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm4 + movddup -12 * SIZE(A1), %xmm8 + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm5 + movddup -11 * SIZE(A1), %xmm9 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm6 + movddup -12 * SIZE(A1, LDA), %xmm10 + mulpd %xmm13, %xmm11 + MOVUPS_XL1(-10 * SIZE, X1, %xmm13) + addpd %xmm11, %xmm7 + movddup -11 * SIZE(A1, LDA), %xmm11 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2) +#endif + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + movddup -12 * SIZE(A2), %xmm8 + mulpd 
%xmm12, %xmm9 + addpd %xmm9, %xmm1 + movddup -11 * SIZE(A2), %xmm9 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + movddup -12 * SIZE(A2, LDA), %xmm10 + mulpd %xmm12, %xmm11 + addpd %xmm11, %xmm3 + movddup -11 * SIZE(A2, LDA), %xmm11 + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm4 + movddup -10 * SIZE(A1), %xmm8 + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm5 + movddup -9 * SIZE(A1), %xmm9 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm6 + movddup -10 * SIZE(A1, LDA), %xmm10 + mulpd %xmm12, %xmm11 + MOVUPS_XL1( -8 * SIZE, X1, %xmm12) + addpd %xmm11, %xmm7 + movddup -9 * SIZE(A1, LDA), %xmm11 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(A2, LDA) +#endif + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + movddup -10 * SIZE(A2), %xmm8 + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm1 + movddup -9 * SIZE(A2), %xmm9 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + movddup -10 * SIZE(A2, LDA), %xmm10 + mulpd %xmm13, %xmm11 + addpd %xmm11, %xmm3 + movddup -9 * SIZE(A2, LDA), %xmm11 + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) - 128 + PREOFFSET(X1) +#endif + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm4 + movddup -8 * SIZE(A1), %xmm8 + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm5 + movddup -7 * SIZE(A1), %xmm9 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm6 + movddup -8 * SIZE(A1, LDA), %xmm10 + mulpd %xmm13, %xmm11 + MOVUPS_XL1( -6 * SIZE, X1, %xmm13) + addpd %xmm11, %xmm7 + movddup -7 * SIZE(A1, LDA), %xmm11 + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, X1 + + subq $1, I + BRANCH + jg .L13 + ALIGN_3 + +.L14: + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + movddup -16 * SIZE(A2), %xmm8 + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + movddup -15 * SIZE(A2), %xmm9 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + movddup -16 * SIZE(A2, LDA), %xmm10 + mulpd %xmm12, %xmm11 + addpd %xmm11, %xmm3 + movddup -15 * SIZE(A2, LDA), %xmm11 + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm4 + movddup -14 * SIZE(A1), %xmm8 + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm5 + movddup -13 * SIZE(A1), %xmm9 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm6 + movddup -14 * SIZE(A1, LDA), %xmm10 + mulpd %xmm12, %xmm11 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + addpd %xmm11, %xmm7 + movddup -13 * SIZE(A1, LDA), %xmm11 + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + movddup -14 * SIZE(A2), %xmm8 + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm1 + movddup -13 * SIZE(A2), %xmm9 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + movddup -14 * SIZE(A2, LDA), %xmm10 + mulpd %xmm13, %xmm11 + addpd %xmm11, %xmm3 + movddup -13 * SIZE(A2, LDA), %xmm11 + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm4 + movddup -12 * SIZE(A1), %xmm8 + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm5 + movddup -11 * SIZE(A1), %xmm9 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm6 + movddup -12 * SIZE(A1, LDA), %xmm10 + mulpd %xmm13, %xmm11 + MOVUPS_XL1(-10 * SIZE, X1, %xmm13) + addpd %xmm11, %xmm7 + movddup -11 * SIZE(A1, LDA), %xmm11 + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + movddup -12 * SIZE(A2), %xmm8 + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + movddup -11 * SIZE(A2), %xmm9 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + movddup -12 * SIZE(A2, LDA), %xmm10 + mulpd %xmm12, %xmm11 + addpd %xmm11, %xmm3 + movddup -11 * SIZE(A2, LDA), %xmm11 + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm4 + movddup -10 * SIZE(A1), %xmm8 + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm5 + movddup -9 * SIZE(A1), %xmm9 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm6 + movddup -10 * SIZE(A1, LDA), %xmm10 + mulpd %xmm12, %xmm11 + MOVUPS_XL1( -8 * SIZE, X1, %xmm12) + addpd %xmm11, %xmm7 + movddup -9 * SIZE(A1, LDA), %xmm11 + + mulpd 
%xmm13, %xmm8 + addpd %xmm8, %xmm0 + movddup -10 * SIZE(A2), %xmm8 + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm1 + movddup -9 * SIZE(A2), %xmm9 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + movddup -10 * SIZE(A2, LDA), %xmm10 + mulpd %xmm13, %xmm11 + addpd %xmm11, %xmm3 + movddup -9 * SIZE(A2, LDA), %xmm11 + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm4 + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm5 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm6 + mulpd %xmm13, %xmm11 + MOVUPS_XL1( -6 * SIZE, X1, %xmm13) + addpd %xmm11, %xmm7 + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, X1 + ALIGN_3 + +.L15: + testq $2, M + je .L17 + + movddup -16 * SIZE(A1), %xmm8 + movddup -15 * SIZE(A1), %xmm9 + movddup -16 * SIZE(A1, LDA), %xmm10 + movddup -15 * SIZE(A1, LDA), %xmm11 + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + movddup -16 * SIZE(A2), %xmm8 + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + movddup -15 * SIZE(A2), %xmm9 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + movddup -16 * SIZE(A2, LDA), %xmm10 + mulpd %xmm12, %xmm11 + addpd %xmm11, %xmm3 + movddup -15 * SIZE(A2, LDA), %xmm11 + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm4 + movddup -14 * SIZE(A1), %xmm8 + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm5 + movddup -13 * SIZE(A1), %xmm9 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm6 + movddup -14 * SIZE(A1, LDA), %xmm10 + mulpd %xmm12, %xmm11 + addpd %xmm11, %xmm7 + movddup -13 * SIZE(A1, LDA), %xmm11 + + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + movddup -14 * SIZE(A2), %xmm8 + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm1 + movddup -13 * SIZE(A2), %xmm9 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + movddup -14 * SIZE(A2, LDA), %xmm10 + mulpd %xmm13, %xmm11 + addpd %xmm11, %xmm3 + movddup -13 * SIZE(A2, LDA), %xmm11 + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm4 + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm5 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm6 + mulpd %xmm13, %xmm11 + addpd %xmm11, %xmm7 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + ALIGN_3 + +.L17: + testq $1, M + je .L19 + + movddup -16 * SIZE(A1), %xmm8 + movddup -15 * SIZE(A1), %xmm9 + movddup -16 * SIZE(A1, LDA), %xmm10 + movddup -15 * SIZE(A1, LDA), %xmm11 + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + movddup -16 * SIZE(A2), %xmm8 + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + movddup -15 * SIZE(A2), %xmm9 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + movddup -16 * SIZE(A2, LDA), %xmm10 + mulpd %xmm12, %xmm11 + addpd %xmm11, %xmm3 + movddup -15 * SIZE(A2, LDA), %xmm11 + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm4 + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm5 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm6 + mulpd %xmm12, %xmm11 + addpd %xmm11, %xmm7 + ALIGN_3 + +.L19: + pcmpeqb %xmm13, %xmm13 + psllq $63, %xmm13 + shufps $0x40, %xmm13, %xmm13 + +#ifndef XCONJ + xorps %xmm13, %xmm1 + xorps %xmm13, %xmm3 + xorps %xmm13, %xmm5 + xorps %xmm13, %xmm7 +#else + xorps %xmm13, %xmm0 + xorps %xmm13, %xmm2 + xorps %xmm13, %xmm4 + xorps %xmm13, %xmm6 +#endif + + pshufd $0x4e, %xmm1, %xmm1 + pshufd $0x4e, %xmm3, %xmm3 + pshufd $0x4e, %xmm5, %xmm5 + pshufd $0x4e, %xmm7, %xmm7 + +#ifndef CONJ + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 +#else + subpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 +#endif + + pshufd $0xee, %xmm0, %xmm1 + movddup %xmm0, %xmm0 + pshufd $0xee, %xmm2, %xmm3 + movddup %xmm2, %xmm2 + pshufd $0xee, %xmm4, %xmm5 + movddup %xmm4, %xmm4 + pshufd $0xee, %xmm6, %xmm7 + movddup %xmm6, %xmm6 + + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm1 + 
mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm3 + mulpd ALPHA_R, %xmm4 + mulpd ALPHA_I, %xmm5 + mulpd ALPHA_R, %xmm6 + mulpd ALPHA_I, %xmm7 + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + + movsd 0 * SIZE(Y), %xmm1 + movhpd 1 * SIZE(Y), %xmm1 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm3 + movhpd 1 * SIZE(Y), %xmm3 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm5 + movhpd 1 * SIZE(Y), %xmm5 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm7 + movhpd 1 * SIZE(Y), %xmm7 + addq INCY, Y + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + addq INCY, Y1 + movlpd %xmm2, 0 * SIZE(Y1) + movhpd %xmm2, 1 * SIZE(Y1) + addq INCY, Y1 + movlpd %xmm4, 0 * SIZE(Y1) + movhpd %xmm4, 1 * SIZE(Y1) + addq INCY, Y1 + movlpd %xmm6, 0 * SIZE(Y1) + movhpd %xmm6, 1 * SIZE(Y1) + addq INCY, Y1 + + cmpq $4, N + jge .L11 + ALIGN_3 + +.L20: +#endif + +#if GEMV_UNROLL >= 2 + + cmpq $2, N + jl .L30 + +#if GEMV_UNROLL == 2 + ALIGN_3 + +.L21: +#endif + + subq $2, N + + leaq 16 * SIZE(BUFFER), X1 + + movq A, A1 + leaq (A1, LDA), A2 + leaq (A1, LDA, 2), A + + MOVUPS_XL1(-16 * SIZE, X1, %xmm12) + xorps %xmm0, %xmm0 + xorps %xmm1, %xmm1 + MOVUPS_XL1(-14 * SIZE, X1, %xmm13) + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + +#ifdef PREFETCHW + PREFETCHW 3 * SIZE(Y1) +#endif + + movq M, I + sarq $2, I + jle .L25 + + movddup -16 * SIZE(A1), %xmm8 + movddup -15 * SIZE(A1), %xmm9 + movddup -16 * SIZE(A1, LDA), %xmm10 + movddup -15 * SIZE(A1, LDA), %xmm11 + + decq I + jle .L24 + ALIGN_3 + +.L23: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A1) +#endif + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + movddup -14 * SIZE(A1), %xmm8 + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + movddup -13 * SIZE(A1), %xmm9 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + movddup -14 * SIZE(A1, LDA), %xmm10 + mulpd %xmm12, %xmm11 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + addpd %xmm11, %xmm3 + movddup -13 * SIZE(A1, LDA), %xmm11 + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + movddup -12 * SIZE(A1), %xmm8 + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm1 + movddup -11 * SIZE(A1), %xmm9 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + movddup -12 * SIZE(A1, LDA), %xmm10 + mulpd %xmm13, %xmm11 + MOVUPS_XL1(-10 * SIZE, X1, %xmm13) + addpd %xmm11, %xmm3 + movddup -11 * SIZE(A1, LDA), %xmm11 + +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(A2) +#endif + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + movddup -10 * SIZE(A1), %xmm8 + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + movddup -9 * SIZE(A1), %xmm9 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + movddup -10 * SIZE(A1, LDA), %xmm10 + mulpd %xmm12, %xmm11 + MOVUPS_XL1( -8 * SIZE, X1, %xmm12) + addpd %xmm11, %xmm3 + movddup -9 * SIZE(A1, LDA), %xmm11 + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 2 - 128 + PREOFFSET(X1) +#endif + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + movddup -8 * SIZE(A1), %xmm8 + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm1 + movddup -7 * SIZE(A1), %xmm9 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + movddup -8 * SIZE(A1, LDA), %xmm10 + mulpd %xmm13, %xmm11 + MOVUPS_XL1( -6 * SIZE, X1, %xmm13) + addpd %xmm11, %xmm3 + movddup -7 * SIZE(A1, LDA), %xmm11 + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, X1 + + subq $1, I + BRANCH + jg .L23 + ALIGN_3 + +.L24: + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + movddup -14 * SIZE(A1), %xmm8 + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + movddup -13 * SIZE(A1), %xmm9 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + 
movddup -14 * SIZE(A1, LDA), %xmm10 + mulpd %xmm12, %xmm11 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + addpd %xmm11, %xmm3 + movddup -13 * SIZE(A1, LDA), %xmm11 + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + movddup -12 * SIZE(A1), %xmm8 + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm1 + movddup -11 * SIZE(A1), %xmm9 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + movddup -12 * SIZE(A1, LDA), %xmm10 + mulpd %xmm13, %xmm11 + MOVUPS_XL1(-10 * SIZE, X1, %xmm13) + addpd %xmm11, %xmm3 + movddup -11 * SIZE(A1, LDA), %xmm11 + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + movddup -10 * SIZE(A1), %xmm8 + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + movddup -9 * SIZE(A1), %xmm9 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + movddup -10 * SIZE(A1, LDA), %xmm10 + mulpd %xmm12, %xmm11 + MOVUPS_XL1( -8 * SIZE, X1, %xmm12) + addpd %xmm11, %xmm3 + movddup -9 * SIZE(A1, LDA), %xmm11 + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm1 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + mulpd %xmm13, %xmm11 + MOVUPS_XL1( -6 * SIZE, X1, %xmm13) + addpd %xmm11, %xmm3 + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, A2 + subq $-8 * SIZE, X1 + ALIGN_3 + +.L25: + testq $2, M + je .L27 + + movddup -16 * SIZE(A1), %xmm8 + movddup -15 * SIZE(A1), %xmm9 + movddup -16 * SIZE(A1, LDA), %xmm10 + movddup -15 * SIZE(A1, LDA), %xmm11 + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + movddup -14 * SIZE(A1), %xmm8 + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + movddup -13 * SIZE(A1), %xmm9 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + movddup -14 * SIZE(A1, LDA), %xmm10 + mulpd %xmm12, %xmm11 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + addpd %xmm11, %xmm3 + movddup -13 * SIZE(A1, LDA), %xmm11 + + mulpd %xmm13, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm13, %xmm9 + addpd %xmm9, %xmm1 + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + mulpd %xmm13, %xmm11 + addpd %xmm11, %xmm3 + + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + ALIGN_3 + +.L27: + testq $1, M + je .L29 + + movddup -16 * SIZE(A1), %xmm8 + movddup -15 * SIZE(A1), %xmm9 + movddup -16 * SIZE(A1, LDA), %xmm10 + movddup -15 * SIZE(A1, LDA), %xmm11 + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + mulpd %xmm12, %xmm10 + addpd %xmm10, %xmm2 + mulpd %xmm12, %xmm11 + addpd %xmm11, %xmm3 + ALIGN_3 + +.L29: + pcmpeqb %xmm13, %xmm13 + psllq $63, %xmm13 + shufps $0x40, %xmm13, %xmm13 + +#ifndef XCONJ + xorps %xmm13, %xmm1 + xorps %xmm13, %xmm3 +#else + xorps %xmm13, %xmm0 + xorps %xmm13, %xmm2 +#endif + + pshufd $0x4e, %xmm1, %xmm1 + pshufd $0x4e, %xmm3, %xmm3 + +#ifndef CONJ + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 +#else + subpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 +#endif + + pshufd $0xee, %xmm0, %xmm1 + movddup %xmm0, %xmm0 + pshufd $0xee, %xmm2, %xmm3 + movddup %xmm2, %xmm2 + + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm1 + mulpd ALPHA_R, %xmm2 + mulpd ALPHA_I, %xmm3 + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + + movsd 0 * SIZE(Y), %xmm1 + movhpd 1 * SIZE(Y), %xmm1 + addq INCY, Y + movsd 0 * SIZE(Y), %xmm3 + movhpd 1 * SIZE(Y), %xmm3 + addq INCY, Y + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + addq INCY, Y1 + movlpd %xmm2, 0 * SIZE(Y1) + movhpd %xmm2, 1 * SIZE(Y1) + addq INCY, Y1 + +#if GEMV_UNROLL == 2 + cmpq $2, N + jge .L21 +#endif + ALIGN_3 + +.L30: +#endif + + cmpq $1, N + jl .L999 + +#if GEMV_UNROLL == 1 +.L31: + decq N +#endif + + leaq 16 * SIZE(BUFFER), X1 + + movq A, A1 +#if GEMV_UNROLL == 1 + addq LDA, A +#endif + + MOVUPS_XL1(-16 * SIZE, X1, %xmm12) + xorps %xmm0, %xmm0 + 
xorps %xmm1, %xmm1 + MOVUPS_XL1(-14 * SIZE, X1, %xmm13) + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + + movq M, I + sarq $2, I + jle .L35 + + movddup -16 * SIZE(A1), %xmm8 + movddup -15 * SIZE(A1), %xmm9 + movddup -14 * SIZE(A1), %xmm10 + movddup -13 * SIZE(A1), %xmm11 + + decq I + jle .L34 + ALIGN_3 + +.L33: +#ifdef PREFETCH + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(A1) +#endif + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + movddup -12 * SIZE(A1), %xmm8 + mulpd %xmm12, %xmm9 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + addpd %xmm9, %xmm1 + movddup -11 * SIZE(A1), %xmm9 + + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + movddup -10 * SIZE(A1), %xmm10 + mulpd %xmm13, %xmm11 + MOVUPS_XL1(-10 * SIZE, X1, %xmm13) + addpd %xmm11, %xmm3 + movddup -9 * SIZE(A1), %xmm11 + +#ifdef PREFETCHW + PREFETCH (PREFETCHSIZE) * 4 - 128 + PREOFFSET(X1) +#endif + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + movddup -8 * SIZE(A1), %xmm8 + mulpd %xmm12, %xmm9 + MOVUPS_XL1( -8 * SIZE, X1, %xmm12) + addpd %xmm9, %xmm1 + movddup -7 * SIZE(A1), %xmm9 + + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + movddup -6 * SIZE(A1), %xmm10 + mulpd %xmm13, %xmm11 + MOVUPS_XL1( -6 * SIZE, X1, %xmm13) + addpd %xmm11, %xmm3 + movddup -5 * SIZE(A1), %xmm11 + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, X1 + + subq $1, I + BRANCH + jg .L33 + ALIGN_3 + +.L34: + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + movddup -12 * SIZE(A1), %xmm8 + mulpd %xmm12, %xmm9 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + addpd %xmm9, %xmm1 + movddup -11 * SIZE(A1), %xmm9 + + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + movddup -10 * SIZE(A1), %xmm10 + mulpd %xmm13, %xmm11 + MOVUPS_XL1(-10 * SIZE, X1, %xmm13) + addpd %xmm11, %xmm3 + movddup -9 * SIZE(A1), %xmm11 + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm12, %xmm9 + MOVUPS_XL1( -8 * SIZE, X1, %xmm12) + addpd %xmm9, %xmm1 + + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + mulpd %xmm13, %xmm11 + MOVUPS_XL1( -6 * SIZE, X1, %xmm13) + addpd %xmm11, %xmm3 + + subq $-8 * SIZE, A1 + subq $-8 * SIZE, X1 + ALIGN_3 + +.L35: + testq $2, M + je .L37 + + movddup -16 * SIZE(A1), %xmm8 + movddup -15 * SIZE(A1), %xmm9 + movddup -14 * SIZE(A1), %xmm10 + movddup -13 * SIZE(A1), %xmm11 + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + MOVUPS_XL1(-12 * SIZE, X1, %xmm12) + + mulpd %xmm13, %xmm10 + addpd %xmm10, %xmm2 + mulpd %xmm13, %xmm11 + addpd %xmm11, %xmm3 + + addq $4 * SIZE, A1 + ALIGN_3 + +.L37: + testq $1, M + je .L39 + + movddup -16 * SIZE(A1), %xmm8 + movddup -15 * SIZE(A1), %xmm9 + + mulpd %xmm12, %xmm8 + addpd %xmm8, %xmm0 + mulpd %xmm12, %xmm9 + addpd %xmm9, %xmm1 + ALIGN_3 + +.L39: + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + + pcmpeqb %xmm13, %xmm13 + psllq $63, %xmm13 + shufps $0x40, %xmm13, %xmm13 + +#ifndef XCONJ + xorps %xmm13, %xmm1 +#else + xorps %xmm13, %xmm0 +#endif + + pshufd $0x4e, %xmm1, %xmm1 + +#ifndef CONJ + addpd %xmm1, %xmm0 +#else + subpd %xmm1, %xmm0 +#endif + + pshufd $0xee, %xmm0, %xmm1 + movddup %xmm0, %xmm0 + + mulpd ALPHA_R, %xmm0 + mulpd ALPHA_I, %xmm1 + + addpd %xmm1, %xmm0 + + movsd 0 * SIZE(Y), %xmm1 + movhpd 1 * SIZE(Y), %xmm1 + + addpd %xmm1, %xmm0 + + movlpd %xmm0, 0 * SIZE(Y1) + movhpd %xmm0, 1 * SIZE(Y1) + +#if GEMV_UNROLL == 1 + addq INCY, Y + addq INCY, Y1 + + cmpq $1, N + jge .L31 +#endif + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), 
%xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/znrm2.S b/kernel/x86_64/znrm2.S new file mode 100644 index 0000000000..950262611e --- /dev/null +++ b/kernel/x86_64/znrm2.S @@ -0,0 +1,208 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ + +#define I %rax + +#include "l1param.h" + + PROLOGUE + PROFCODE + + fldz + testq M, M + jle .L999 + testq INCX, INCX + jle .L999 + + salq $ZBASE_SHIFT, INCX + + fldz + fldz + fldz + cmpq $SIZE * 2, INCX + jne .L40 + + movq M, I + sarq $2, I + jle .L20 + ALIGN_4 + +.L10: +#if defined(PREFETCH) + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) + fmul %st(0), %st + FLD 1 * SIZE(X) + fmul %st(0), %st + FLD 2 * SIZE(X) + fmul %st(0), %st + FLD 3 * SIZE(X) + fmul %st(0), %st + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD 4 * SIZE(X) + fmul %st(0), %st + FLD 5 * SIZE(X) + fmul %st(0), %st + FLD 6 * SIZE(X) + fmul %st(0), %st + FLD 7 * SIZE(X) + fmul %st(0), %st + + addq $8 * SIZE, X + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decq I + jg .L10 + ALIGN_4 + +.L20: + andq $3, M + jle .L998 + ALIGN_4 + + +.L21: + FLD 0 * SIZE(X) + fmul %st(0), %st + FLD 1 * SIZE(X) + fmul %st(0), %st + faddp %st,%st(3) + faddp %st,%st(1) + addq $2 * SIZE, X + decq M + jg .L21 + jmp .L998 + ALIGN_4 + +.L40: + movq M, I + sarq $2, I + jle .L60 + ALIGN_4 + +.L50: + FLD 0 * SIZE(X) + fmul %st(0), %st + FLD 1 * SIZE(X) + addq INCX, X + fmul %st(0), %st + FLD 0 * SIZE(X) + fmul %st(0), %st + FLD 1 * SIZE(X) + addq INCX, X + fmul %st(0), %st + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + FLD 0 * SIZE(X) + fmul %st(0), %st + FLD 1 * SIZE(X) + addq INCX, X + fmul %st(0), %st + FLD 0 * SIZE(X) + fmul %st(0), %st + FLD 1 * SIZE(X) + addq INCX, X + fmul %st(0), %st + + faddp %st, %st(7) + faddp %st, %st(5) + faddp %st, %st(3) + faddp %st, %st(1) + + decq I + jg .L50 + ALIGN_4 + +.L60: + andq $3, M + jle .L998 + ALIGN_4 + + +.L61: + FLD 0 * SIZE(X) + fmul %st(0), %st + FLD 1 * SIZE(X) + addq INCX, X + fmul %st(0), %st + faddp %st,%st(3) + faddp %st,%st(1) + decq M + jg .L61 + ALIGN_4 + +.L998: + faddp %st,%st(2) + faddp %st,%st(1) + faddp %st,%st(1) + ALIGN_4 + +.L999: + fsqrt +#ifndef XDOUBLE + subq $2 * SIZE, %rsp + FST (%rsp) + MOVSD (%rsp), %xmm0 + add $2 * SIZE, %rsp +#endif + ret + + EPILOGUE + diff --git a/kernel/x86_64/znrm2_sse.S b/kernel/x86_64/znrm2_sse.S new file mode 100644 index 0000000000..005536a04c --- /dev/null +++ b/kernel/x86_64/znrm2_sse.S @@ -0,0 +1,387 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
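The znrm2 kernel added above accumulates the squares of the real and imaginary parts in several x87 registers and takes a single fsqrt at the end; there is no intermediate scaling, so extreme inputs can overflow or underflow. A minimal C sketch of the same computation (function name illustrative, incx counted in complex elements as in the kernel):

#include <math.h>

/* sqrt of the plain sum of squares over 2*n doubles, mirroring the
 * kernel's strategy of keeping separate partial sums; no scaling. */
static double znrm2_ref(long n, const double *x, long incx)
{
    double s0 = 0.0, s1 = 0.0;
    for (long i = 0; i < n; i++) {
        const double *p = x + 2 * i * incx;
        s0 += p[0] * p[0];   /* real part */
        s1 += p[1] * p[1];   /* imaginary part */
    }
    return sqrt(s0 + s1);
}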
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ + +#define I %rax +#define FLAG %r10 + +#include "l1param.h" + + PROLOGUE + PROFCODE + + SAVEREGISTERS + + pxor %xmm0, %xmm0 + testq M, M + jle .L999 + pxor %xmm1, %xmm1 + testq INCX, INCX + jle .L999 + + xorq FLAG, FLAG + + pxor %xmm2, %xmm2 + leaq (, INCX, 2 * SIZE), INCX + pxor %xmm3, %xmm3 + cmpq $2 * SIZE, INCX + jne .L40 + + testq $SIZE, X + je .L05 + + movss (X), %xmm4 + cvtss2sd %xmm4, %xmm6 + mulsd %xmm6, %xmm6 + addsd %xmm6, %xmm3 + addq $SIZE, X + movq $1, FLAG + decq M + jle .L19 + ALIGN_3 + +.L05: + movq M, I + sarq $3, I + jle .L14 + + movsd 0 * SIZE(X), %xmm4 + movsd 2 * SIZE(X), %xmm5 + movsd 4 * SIZE(X), %xmm6 + movsd 6 * SIZE(X), %xmm7 + movsd 8 * SIZE(X), %xmm8 + movsd 10 * SIZE(X), %xmm9 + movsd 12 * SIZE(X), %xmm10 + movsd 14 * SIZE(X), %xmm11 + + addq $16 * SIZE, X + decq I + jle .L12 + ALIGN_3 + +.L10: +#if defined(PREFETCH) + PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + cvtps2pd %xmm4, %xmm12 + cvtps2pd %xmm5, %xmm13 + cvtps2pd %xmm6, %xmm14 + cvtps2pd %xmm7, %xmm15 + + movsd 0 * SIZE(X), %xmm4 + movsd 2 * SIZE(X), %xmm5 + movsd 4 * SIZE(X), %xmm6 + movsd 6 * SIZE(X), %xmm7 + + mulpd %xmm12, %xmm12 + mulpd %xmm13, %xmm13 + mulpd %xmm14, %xmm14 + mulpd %xmm15, %xmm15 + + addpd %xmm12, %xmm0 + addpd %xmm13, %xmm1 + addpd %xmm14, %xmm2 + addpd %xmm15, %xmm3 + + cvtps2pd %xmm8, %xmm12 + cvtps2pd %xmm9, %xmm13 + cvtps2pd %xmm10, %xmm14 + cvtps2pd %xmm11, %xmm15 + + movsd 8 * SIZE(X), %xmm8 + movsd 10 * SIZE(X), %xmm9 + movsd 12 * SIZE(X), %xmm10 + movsd 14 * SIZE(X), %xmm11 + + mulpd %xmm12, %xmm12 + mulpd %xmm13, %xmm13 + mulpd %xmm14, %xmm14 + mulpd %xmm15, %xmm15 + + addpd %xmm12, %xmm0 + addpd %xmm13, %xmm1 + addpd %xmm14, %xmm2 + addpd %xmm15, %xmm3 + + subq $-16 * SIZE, X + decq I + jg .L10 + ALIGN_3 + +.L12: + cvtps2pd %xmm4, %xmm12 + cvtps2pd %xmm5, %xmm13 + cvtps2pd %xmm6, %xmm14 + cvtps2pd %xmm7, %xmm15 + + mulpd %xmm12, %xmm12 + mulpd %xmm13, %xmm13 + mulpd %xmm14, %xmm14 + mulpd %xmm15, %xmm15 + + addpd %xmm12, %xmm0 + addpd %xmm13, %xmm1 + addpd %xmm14, %xmm2 + addpd %xmm15, %xmm3 + + cvtps2pd %xmm8, %xmm12 + cvtps2pd %xmm9, %xmm13 + cvtps2pd %xmm10, %xmm14 + cvtps2pd %xmm11, %xmm15 + + mulpd %xmm12, %xmm12 + mulpd %xmm13, %xmm13 + mulpd %xmm14, %xmm14 + mulpd %xmm15, %xmm15 + + addpd %xmm12, %xmm0 + addpd %xmm13, %xmm1 + addpd %xmm14, %xmm2 + addpd %xmm15, %xmm3 + ALIGN_3 + + +.L14: + testq $4, M + je .L15 + + movsd 0 * SIZE(X), %xmm4 + movsd 2 * SIZE(X), %xmm5 + movsd 4 * SIZE(X), %xmm6 + movsd 6 * SIZE(X), %xmm7 
+ + cvtps2pd %xmm4, %xmm8 + cvtps2pd %xmm5, %xmm9 + cvtps2pd %xmm6, %xmm10 + cvtps2pd %xmm7, %xmm11 + + mulpd %xmm8, %xmm8 + mulpd %xmm9, %xmm9 + mulpd %xmm10, %xmm10 + mulpd %xmm11, %xmm11 + + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm1 + addpd %xmm10, %xmm2 + addpd %xmm11, %xmm3 + + addq $8 * SIZE, X + ALIGN_3 + +.L15: + testq $2, M + je .L16 + + movsd 0 * SIZE(X), %xmm4 + movsd 2 * SIZE(X), %xmm5 + cvtps2pd %xmm4, %xmm8 + cvtps2pd %xmm5, %xmm9 + mulpd %xmm8, %xmm8 + mulpd %xmm9, %xmm9 + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm1 + addq $4 * SIZE, X + ALIGN_3 + +.L16: + testq $1, M + je .L19 + + movsd (X), %xmm4 + cvtps2pd %xmm4, %xmm6 + mulpd %xmm6, %xmm6 + addpd %xmm6, %xmm2 + addq $2 * SIZE, X + ALIGN_3 + +.L19: + testq FLAG, FLAG + je .L998 + + movss (X), %xmm4 + cvtss2sd %xmm4, %xmm6 + mulsd %xmm6, %xmm6 + addsd %xmm6, %xmm3 + jmp .L998 + ALIGN_4 + +.L40: + movq M, I + sarq $3, I + jle .L44 + ALIGN_4 + +.L41: + movsd (X), %xmm4 + addq INCX, X + movsd (X), %xmm5 + addq INCX, X + movsd (X), %xmm6 + addq INCX, X + movsd (X), %xmm7 + addq INCX, X + + movsd (X), %xmm8 + addq INCX, X + movsd (X), %xmm9 + addq INCX, X + movsd (X), %xmm10 + addq INCX, X + movsd (X), %xmm11 + addq INCX, X + + cvtps2pd %xmm4, %xmm4 + cvtps2pd %xmm5, %xmm5 + cvtps2pd %xmm6, %xmm6 + cvtps2pd %xmm7, %xmm7 + cvtps2pd %xmm8, %xmm8 + cvtps2pd %xmm9, %xmm9 + cvtps2pd %xmm10, %xmm10 + cvtps2pd %xmm11, %xmm11 + + mulpd %xmm4, %xmm4 + mulpd %xmm5, %xmm5 + mulpd %xmm6, %xmm6 + mulpd %xmm7, %xmm7 + + addpd %xmm4, %xmm0 + addpd %xmm5, %xmm1 + addpd %xmm6, %xmm2 + addpd %xmm7, %xmm3 + + mulpd %xmm8, %xmm8 + mulpd %xmm9, %xmm9 + mulpd %xmm10, %xmm10 + mulpd %xmm11, %xmm11 + + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm1 + addpd %xmm10, %xmm2 + addpd %xmm11, %xmm3 + + decq I + jg .L41 + ALIGN_3 + +.L44: + testq $4, M + je .L45 + + movsd (X), %xmm4 + addq INCX, X + movsd (X), %xmm5 + addq INCX, X + movsd (X), %xmm6 + addq INCX, X + movsd (X), %xmm7 + addq INCX, X + + cvtps2pd %xmm4, %xmm8 + cvtps2pd %xmm5, %xmm9 + cvtps2pd %xmm6, %xmm10 + cvtps2pd %xmm7, %xmm11 + + mulpd %xmm8, %xmm8 + mulpd %xmm9, %xmm9 + mulpd %xmm10, %xmm10 + mulpd %xmm11, %xmm11 + + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm1 + addpd %xmm10, %xmm2 + addpd %xmm11, %xmm3 + ALIGN_3 + +.L45: + testq $2, M + je .L46 + + movsd (X), %xmm4 + addq INCX, X + movsd (X), %xmm5 + addq INCX, X + + cvtps2pd %xmm4, %xmm6 + cvtps2pd %xmm5, %xmm7 + mulpd %xmm6, %xmm6 + mulpd %xmm7, %xmm7 + addpd %xmm6, %xmm0 + addpd %xmm7, %xmm1 + ALIGN_3 + +.L46: + testq $1, M + je .L998 + + movsd (X), %xmm4 + cvtps2pd %xmm4, %xmm6 + mulpd %xmm6, %xmm6 + addpd %xmm6, %xmm3 + ALIGN_4 + +.L998: + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm2, %xmm0 + +#ifndef HAVE_SSE3 + movapd %xmm0, %xmm1 + unpckhpd %xmm0, %xmm0 + addsd %xmm1, %xmm0 +#else + haddpd %xmm0, %xmm0 +#endif + ALIGN_4 + +.L999: + sqrtsd %xmm0, %xmm0 + cvtsd2ss %xmm0, %xmm0 + + RESTOREREGISTERS + + ret + + EPILOGUE + diff --git a/kernel/x86_64/zrot.S b/kernel/x86_64/zrot.S new file mode 100644 index 0000000000..d645d6f2e4 --- /dev/null +++ b/kernel/x86_64/zrot.S @@ -0,0 +1,367 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
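The SSE nrm2 kernel above operates on single-precision complex data: each float is widened to double with cvtps2pd, the squares are accumulated in four packed double registers, and only the final square root is converted back to single precision (sqrtsd followed by cvtsd2ss). A minimal C sketch of that strategy (function name illustrative):

#include <math.h>

/* single-precision complex input, double-precision accumulation */
static float scnrm2_ref(long n, const float *x, long incx)
{
    double acc = 0.0;
    for (long i = 0; i < n; i++) {
        const float *p = x + 2 * i * incx;
        acc += (double)p[0] * p[0] + (double)p[1] * p[1];
    }
    return (float)sqrt(acc);
}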
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N ARG1 +#define X ARG2 +#define INCX ARG3 +#define Y ARG4 +#ifndef WINDOWS_ABI +#define INCY ARG5 /* r8 */ +#else +#define INCY %r10 +#endif + +#define I %rax + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + movq 48(%rsp), INCY + FLD 72(%rsp) + FLD 56(%rsp) +#else + FLD 24(%rsp) + FLD 8(%rsp) +#endif + + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + + testq N, N + jle .L999 + + cmpq $2 * SIZE, INCX + jne .L50 + cmpq $2 * SIZE, INCY + jne .L50 + + movq N, I + sarq $1, I + jle .L15 + ALIGN_4 + +.L10: +#if defined(PREFETCHW) + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + FLD 0 * SIZE(X) + FLD 0 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 0 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 1 * SIZE(X) + FLD 1 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 1 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 1 * SIZE(Y) + +#if defined(PREFETCHW) + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + FLD 2 * SIZE(X) + FLD 2 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 2 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 2 * SIZE(Y) + + FLD 3 * SIZE(X) + FLD 3 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 3 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 3 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + + decq I + jg .L10 + ALIGN_4 + +.L15: + movq N, I + andq $1, I + jle .L999 + ALIGN_4 + +.L16: + FLD 0 * SIZE(X) + FLD 0 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, 
%st(1) + FST 0 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 1 * SIZE(X) + FLD 1 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 1 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 1 * SIZE(Y) + jmp .L999 + ALIGN_4 + +.L50: + movq N, I + sarq $1, I + jle .L55 + ALIGN_4 + +.L51: + FLD 0 * SIZE(X) + FLD 0 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 0 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 1 * SIZE(X) + FLD 1 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 1 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 1 * SIZE(Y) + + addq INCX, X + addq INCY, Y + + FLD 0 * SIZE(X) + FLD 0 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 0 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 1 * SIZE(X) + FLD 1 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 1 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 1 * SIZE(Y) + + addq INCX, X + addq INCY, Y + + decq I + jg .L51 + ALIGN_4 + +.L55: + movq N, I + andq $1, I + jle .L999 + ALIGN_4 + +.L56: + FLD 0 * SIZE(X) + FLD 0 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 0 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 0 * SIZE(Y) + + FLD 1 * SIZE(X) + FLD 1 * SIZE(Y) + + fld %st(1) + fmul %st(3), %st + + fld %st(1) + fmul %st(5), %st + + faddp %st, %st(1) + FST 1 * SIZE(X) + + fmul %st(2), %st + fxch %st(1) + fmul %st(3), %st + + fsubrp %st, %st(1) + FST 1 * SIZE(Y) + ALIGN_4 + +.L999: + ffreep %st + ffreep %st + ret + + EPILOGUE diff --git a/kernel/x86_64/zrot_sse.S b/kernel/x86_64/zrot_sse.S new file mode 100644 index 0000000000..4aa0e72117 --- /dev/null +++ b/kernel/x86_64/zrot_sse.S @@ -0,0 +1,1622 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
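The x87 zrot kernel above applies a Givens plane rotation with real c and s, so the real and imaginary parts of each element are rotated independently: x' = c*x + s*y and y' = c*y - s*x. A minimal C sketch (name illustrative), with incx/incy counted in complex elements as in the kernel:

/* rotate each of the 2*n real scalars that make up the complex vectors */
static void zrot_ref(long n, double *x, long incx, double *y, long incy,
                     double c, double s)
{
    for (long i = 0; i < n; i++) {
        double *px = x + 2 * i * incx;
        double *py = y + 2 * i * incy;
        for (int k = 0; k < 2; k++) {          /* real, then imaginary */
            double t = c * px[k] + s * py[k];
            py[k]    = c * py[k] - s * px[k];
            px[k]    = t;
        }
    }
}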
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ +#define Y ARG4 /* rcx */ +#ifndef WINDOWS_ABI +#define INCY ARG5 /* r8 */ +#else +#define INCY %r10 +#endif + +#define C %xmm14 +#define S %xmm15 + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + movq 40(%rsp), INCY + movss 48(%rsp), %xmm0 + movss 56(%rsp), %xmm1 +#endif + + SAVEREGISTERS + + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + + pshufd $0x0, %xmm0, C + pshufd $0x0, %xmm1, S + + cmpq $0, N + jle .L999 + + cmpq $2 * SIZE, INCX + jne .L50 + cmpq $2 * SIZE, INCY + jne .L50 + + testq $2 * SIZE, X + je .L10 + + movsd 0 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + addq $2 * SIZE, X + addq $2 * SIZE, Y + decq N + jle .L999 + +.L10: + testq $1 * SIZE, X + jne .L30 + + testq $3 * SIZE, Y + jne .L20 + + movq N, %rax + sarq $4, %rax + jle .L14 + + movaps 0 * SIZE(Y), %xmm1 + movaps 4 * SIZE(Y), %xmm3 + movaps 8 * SIZE(Y), %xmm9 + movaps 12 * SIZE(Y), %xmm11 + + movaps 0 * SIZE(X), %xmm0 + movaps 4 * SIZE(X), %xmm2 + movaps 8 * SIZE(X), %xmm8 + movaps 12 * SIZE(X), %xmm10 + + decq %rax + jle .L12 + ALIGN_3 + +.L11: +#if defined(PREFETCHW) + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm1, %xmm4 + mulps S, %xmm1 + movaps %xmm3, %xmm6 + mulps S, %xmm3 + movaps %xmm0, %xmm5 + mulps C, %xmm0 + movaps %xmm2, %xmm7 + mulps C, %xmm2 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + movaps 16 * SIZE(Y), %xmm1 + addps %xmm3, %xmm2 + movaps 20 * SIZE(Y), %xmm3 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + +#if defined(PREFETCHW) + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps %xmm0, 0 * SIZE(X) + movaps 16 * SIZE(X), %xmm0 + movaps %xmm2, 4 * SIZE(X) + movaps 20 * SIZE(X), %xmm2 + movaps %xmm4, 0 * SIZE(Y) + movaps %xmm6, 4 * SIZE(Y) + + movaps %xmm9, %xmm4 + mulps S, %xmm9 + movaps %xmm8, %xmm5 + mulps C, %xmm8 + movaps %xmm11, %xmm6 + mulps S, %xmm11 + movaps %xmm10, %xmm7 + mulps C, %xmm10 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm9, %xmm8 + movaps 24 * SIZE(Y), %xmm9 + addps %xmm11, %xmm10 + movaps 28 * SIZE(Y), %xmm11 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm8, 8 * SIZE(X) + movaps 24 * SIZE(X), %xmm8 + movaps %xmm10,12 * SIZE(X) + movaps 28 * SIZE(X), %xmm10 + movaps %xmm4, 8 * 
SIZE(Y) + movaps %xmm6, 12 * SIZE(Y) + +#if defined(PREFETCHW) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps %xmm1, %xmm4 + mulps S, %xmm1 + movaps %xmm3, %xmm6 + mulps S, %xmm3 + movaps %xmm0, %xmm5 + mulps C, %xmm0 + movaps %xmm2, %xmm7 + mulps C, %xmm2 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + movaps 32 * SIZE(Y), %xmm1 + addps %xmm3, %xmm2 + movaps 36 * SIZE(Y), %xmm3 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm0, 16 * SIZE(X) + movaps 32 * SIZE(X), %xmm0 + movaps %xmm2, 20 * SIZE(X) + movaps 36 * SIZE(X), %xmm2 + movaps %xmm4, 16 * SIZE(Y) + movaps %xmm6, 20 * SIZE(Y) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps %xmm9, %xmm4 + mulps S, %xmm9 + movaps %xmm8, %xmm5 + mulps C, %xmm8 + movaps %xmm11, %xmm6 + mulps S, %xmm11 + movaps %xmm10, %xmm7 + mulps C, %xmm10 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm9, %xmm8 + movaps 40 * SIZE(Y), %xmm9 + addps %xmm11, %xmm10 + movaps 44 * SIZE(Y), %xmm11 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm8, 24 * SIZE(X) + movaps 40 * SIZE(X), %xmm8 + movaps %xmm10, 28 * SIZE(X) + movaps 44 * SIZE(X), %xmm10 + movaps %xmm4, 24 * SIZE(Y) + movaps %xmm6, 28 * SIZE(Y) + + addq $32 * SIZE, X + addq $32 * SIZE, Y + + decq %rax + jg .L11 + ALIGN_3 + +.L12: + movaps %xmm1, %xmm4 + mulps S, %xmm1 + movaps %xmm3, %xmm6 + mulps S, %xmm3 + movaps %xmm0, %xmm5 + mulps C, %xmm0 + movaps %xmm2, %xmm7 + mulps C, %xmm2 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + movaps 16 * SIZE(Y), %xmm1 + addps %xmm3, %xmm2 + movaps 20 * SIZE(Y), %xmm3 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm0, 0 * SIZE(X) + movaps 16 * SIZE(X), %xmm0 + movaps %xmm2, 4 * SIZE(X) + movaps 20 * SIZE(X), %xmm2 + + movaps %xmm4, 0 * SIZE(Y) + movaps %xmm6, 4 * SIZE(Y) + + movaps %xmm9, %xmm4 + mulps S, %xmm9 + movaps %xmm8, %xmm5 + mulps C, %xmm8 + movaps %xmm11, %xmm6 + mulps S, %xmm11 + movaps %xmm10, %xmm7 + mulps C, %xmm10 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm9, %xmm8 + movaps 24 * SIZE(Y), %xmm9 + addps %xmm11, %xmm10 + movaps 28 * SIZE(Y), %xmm11 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm8, 8 * SIZE(X) + movaps 24 * SIZE(X), %xmm8 + movaps %xmm10,12 * SIZE(X) + movaps 28 * SIZE(X), %xmm10 + movaps %xmm4, 8 * SIZE(Y) + movaps %xmm6, 12 * SIZE(Y) + + movaps %xmm1, %xmm4 + mulps S, %xmm1 + movaps %xmm3, %xmm6 + mulps S, %xmm3 + movaps %xmm0, %xmm5 + mulps C, %xmm0 + movaps %xmm2, %xmm7 + mulps C, %xmm2 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm0, 16 * SIZE(X) + movaps %xmm2, 20 * SIZE(X) + movaps %xmm4, 16 * SIZE(Y) + movaps %xmm6, 20 * SIZE(Y) + + movaps %xmm9, %xmm4 + mulps S, %xmm9 + movaps %xmm8, %xmm5 + mulps C, %xmm8 + movaps %xmm11, %xmm6 + mulps S, %xmm11 + movaps %xmm10, %xmm7 + mulps C, %xmm10 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm9, %xmm8 + addps %xmm11, %xmm10 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm8, 24 * SIZE(X) + movaps %xmm10, 28 * SIZE(X) + movaps %xmm4, 24 * SIZE(Y) + movaps %xmm6, 28 * SIZE(Y) + + addq $32 * SIZE, X + addq $32 * SIZE, Y + ALIGN_3 + +.L14: + testq $15, N + jle .L999 + + testq $8, N + jle .L15 + + movaps 0 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), 
%xmm0 + movaps 4 * SIZE(Y), %xmm3 + movaps 4 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm3, %xmm6 + movaps %xmm2, %xmm7 + + mulps C, %xmm0 + mulps S, %xmm1 + mulps C, %xmm2 + mulps S, %xmm3 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm0, 0 * SIZE(X) + movaps %xmm2, 4 * SIZE(X) + movaps %xmm4, 0 * SIZE(Y) + movaps %xmm6, 4 * SIZE(Y) + + movaps 8 * SIZE(Y), %xmm1 + movaps 8 * SIZE(X), %xmm0 + movaps 12 * SIZE(Y), %xmm3 + movaps 12 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm3, %xmm6 + movaps %xmm2, %xmm7 + + mulps C, %xmm0 + mulps S, %xmm1 + mulps C, %xmm2 + mulps S, %xmm3 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm0, 8 * SIZE(X) + movaps %xmm2, 12 * SIZE(X) + movaps %xmm4, 8 * SIZE(Y) + movaps %xmm6, 12 * SIZE(Y) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L15: + testq $4, N + jle .L16 + + movaps 0 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + movaps 4 * SIZE(Y), %xmm3 + movaps 4 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm3, %xmm6 + movaps %xmm2, %xmm7 + + mulps C, %xmm0 + mulps S, %xmm1 + mulps C, %xmm2 + mulps S, %xmm3 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm0, 0 * SIZE(X) + movaps %xmm2, 4 * SIZE(X) + movaps %xmm4, 0 * SIZE(Y) + movaps %xmm6, 4 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L16: + testq $2, N + jle .L17 + + movaps 0 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 0 * SIZE(X) + movaps %xmm2, 0 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L17: + testq $1, N + jle .L999 + + movsd 0 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L20: + movq N, %rax + sarq $4, %rax + jle .L24 + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movsd 4 * SIZE(Y), %xmm3 + movhps 6 * SIZE(Y), %xmm3 + movsd 8 * SIZE(Y), %xmm9 + movhps 10 * SIZE(Y), %xmm9 + movsd 12 * SIZE(Y), %xmm11 + movhps 14 * SIZE(Y), %xmm11 + + movaps 0 * SIZE(X), %xmm0 + movaps 4 * SIZE(X), %xmm2 + movaps 8 * SIZE(X), %xmm8 + movaps 12 * SIZE(X), %xmm10 + + decq %rax + jle .L22 + ALIGN_3 + +.L21: +#if defined(PREFETCHW) + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm1, %xmm4 + mulps S, %xmm1 + movaps %xmm3, %xmm6 + mulps S, %xmm3 + movaps %xmm0, %xmm5 + mulps C, %xmm0 + movaps %xmm2, %xmm7 + mulps C, %xmm2 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + movsd 16 * SIZE(Y), %xmm1 + movhps 18 * SIZE(Y), %xmm1 + addps %xmm3, %xmm2 + movsd 20 * SIZE(Y), %xmm3 + movhps 22 * SIZE(Y), %xmm3 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + +#if defined(PREFETCHW) + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps %xmm0, 0 * SIZE(X) + movaps 16 * SIZE(X), %xmm0 + movaps %xmm2, 4 * SIZE(X) + movaps 20 * SIZE(X), 
%xmm2 + movlps %xmm4, 0 * SIZE(Y) + movhps %xmm4, 2 * SIZE(Y) + movlps %xmm6, 4 * SIZE(Y) + movhps %xmm6, 6 * SIZE(Y) + + movaps %xmm9, %xmm4 + mulps S, %xmm9 + movaps %xmm8, %xmm5 + mulps C, %xmm8 + movaps %xmm11, %xmm6 + mulps S, %xmm11 + movaps %xmm10, %xmm7 + mulps C, %xmm10 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm9, %xmm8 + movsd 24 * SIZE(Y), %xmm9 + movhps 26 * SIZE(Y), %xmm9 + addps %xmm11, %xmm10 + movsd 28 * SIZE(Y), %xmm11 + movhps 30 * SIZE(Y), %xmm11 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm8, 8 * SIZE(X) + movaps 24 * SIZE(X), %xmm8 + movaps %xmm10,12 * SIZE(X) + movaps 28 * SIZE(X), %xmm10 + movlps %xmm4, 8 * SIZE(Y) + movhps %xmm4, 10 * SIZE(Y) + movlps %xmm6, 12 * SIZE(Y) + movhps %xmm6, 14 * SIZE(Y) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps %xmm1, %xmm4 + mulps S, %xmm1 + movaps %xmm3, %xmm6 + mulps S, %xmm3 + movaps %xmm0, %xmm5 + mulps C, %xmm0 + movaps %xmm2, %xmm7 + mulps C, %xmm2 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + movsd 32 * SIZE(Y), %xmm1 + movhps 34 * SIZE(Y), %xmm1 + addps %xmm3, %xmm2 + movsd 36 * SIZE(Y), %xmm3 + movhps 38 * SIZE(Y), %xmm3 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm0, 16 * SIZE(X) + movaps 32 * SIZE(X), %xmm0 + movaps %xmm2, 20 * SIZE(X) + movaps 36 * SIZE(X), %xmm2 + movlps %xmm4, 16 * SIZE(Y) + movhps %xmm4, 18 * SIZE(Y) + movlps %xmm6, 20 * SIZE(Y) + movhps %xmm6, 22 * SIZE(Y) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps %xmm9, %xmm4 + mulps S, %xmm9 + movaps %xmm8, %xmm5 + mulps C, %xmm8 + movaps %xmm11, %xmm6 + mulps S, %xmm11 + movaps %xmm10, %xmm7 + mulps C, %xmm10 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm9, %xmm8 + movsd 40 * SIZE(Y), %xmm9 + movhps 42 * SIZE(Y), %xmm9 + addps %xmm11, %xmm10 + movsd 44 * SIZE(Y), %xmm11 + movhps 46 * SIZE(Y), %xmm11 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm8, 24 * SIZE(X) + movaps 40 * SIZE(X), %xmm8 + movaps %xmm10, 28 * SIZE(X) + movaps 44 * SIZE(X), %xmm10 + movlps %xmm4, 24 * SIZE(Y) + movhps %xmm4, 26 * SIZE(Y) + movlps %xmm6, 28 * SIZE(Y) + movhps %xmm6, 30 * SIZE(Y) + + addq $32 * SIZE, X + addq $32 * SIZE, Y + + decq %rax + jg .L21 + ALIGN_3 + +.L22: + movaps %xmm1, %xmm4 + mulps S, %xmm1 + movaps %xmm3, %xmm6 + mulps S, %xmm3 + movaps %xmm0, %xmm5 + mulps C, %xmm0 + movaps %xmm2, %xmm7 + mulps C, %xmm2 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + movsd 16 * SIZE(Y), %xmm1 + movhps 18 * SIZE(Y), %xmm1 + addps %xmm3, %xmm2 + movsd 20 * SIZE(Y), %xmm3 + movhps 22 * SIZE(Y), %xmm3 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm0, 0 * SIZE(X) + movaps 16 * SIZE(X), %xmm0 + movaps %xmm2, 4 * SIZE(X) + movaps 20 * SIZE(X), %xmm2 + + movsd %xmm4, 0 * SIZE(Y) + movhps %xmm4, 2 * SIZE(Y) + movsd %xmm6, 4 * SIZE(Y) + movhps %xmm6, 6 * SIZE(Y) + + movaps %xmm9, %xmm4 + mulps S, %xmm9 + movaps %xmm8, %xmm5 + mulps C, %xmm8 + movaps %xmm11, %xmm6 + mulps S, %xmm11 + movaps %xmm10, %xmm7 + mulps C, %xmm10 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm9, %xmm8 + movsd 24 * SIZE(Y), %xmm9 + movhps 26 * SIZE(Y), %xmm9 + addps %xmm11, %xmm10 + movsd 28 * SIZE(Y), %xmm11 + movhps 30 * SIZE(Y), %xmm11 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm8, 8 * SIZE(X) + movaps 24 * 
SIZE(X), %xmm8 + movaps %xmm10,12 * SIZE(X) + movaps 28 * SIZE(X), %xmm10 + movlps %xmm4, 8 * SIZE(Y) + movhps %xmm4, 10 * SIZE(Y) + movlps %xmm6, 12 * SIZE(Y) + movhps %xmm6, 14 * SIZE(Y) + + movaps %xmm1, %xmm4 + mulps S, %xmm1 + movaps %xmm3, %xmm6 + mulps S, %xmm3 + movaps %xmm0, %xmm5 + mulps C, %xmm0 + movaps %xmm2, %xmm7 + mulps C, %xmm2 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm0, 16 * SIZE(X) + movaps %xmm2, 20 * SIZE(X) + movlps %xmm4, 16 * SIZE(Y) + movhps %xmm4, 18 * SIZE(Y) + movlps %xmm6, 20 * SIZE(Y) + movhps %xmm6, 22 * SIZE(Y) + + movaps %xmm9, %xmm4 + mulps S, %xmm9 + movaps %xmm8, %xmm5 + mulps C, %xmm8 + movaps %xmm11, %xmm6 + mulps S, %xmm11 + movaps %xmm10, %xmm7 + mulps C, %xmm10 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm9, %xmm8 + addps %xmm11, %xmm10 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm8, 24 * SIZE(X) + movaps %xmm10, 28 * SIZE(X) + movlps %xmm4, 24 * SIZE(Y) + movhps %xmm4, 26 * SIZE(Y) + movlps %xmm6, 28 * SIZE(Y) + movhps %xmm6, 30 * SIZE(Y) + + addq $32 * SIZE, X + addq $32 * SIZE, Y + ALIGN_3 + +.L24: + testq $15, N + jle .L999 + + testq $8, N + jle .L25 + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + movsd 4 * SIZE(Y), %xmm3 + movhps 6 * SIZE(Y), %xmm3 + movaps 4 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm3, %xmm6 + movaps %xmm2, %xmm7 + + mulps C, %xmm0 + mulps S, %xmm1 + mulps C, %xmm2 + mulps S, %xmm3 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm0, 0 * SIZE(X) + movaps %xmm2, 4 * SIZE(X) + movlps %xmm4, 0 * SIZE(Y) + movhps %xmm4, 2 * SIZE(Y) + movlps %xmm6, 4 * SIZE(Y) + movhps %xmm6, 6 * SIZE(Y) + + movsd 8 * SIZE(Y), %xmm1 + movhps 10 * SIZE(Y), %xmm1 + movaps 8 * SIZE(X), %xmm0 + movsd 12 * SIZE(Y), %xmm3 + movhps 14 * SIZE(Y), %xmm3 + movaps 12 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm3, %xmm6 + movaps %xmm2, %xmm7 + + mulps C, %xmm0 + mulps S, %xmm1 + mulps C, %xmm2 + mulps S, %xmm3 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm0, 8 * SIZE(X) + movaps %xmm2, 12 * SIZE(X) + movlps %xmm4, 8 * SIZE(Y) + movhps %xmm4, 10 * SIZE(Y) + movlps %xmm6, 12 * SIZE(Y) + movhps %xmm6, 14 * SIZE(Y) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L25: + testq $4, N + jle .L26 + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + movsd 4 * SIZE(Y), %xmm3 + movhps 6 * SIZE(Y), %xmm3 + movaps 4 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm3, %xmm6 + movaps %xmm2, %xmm7 + + mulps C, %xmm0 + mulps S, %xmm1 + mulps C, %xmm2 + mulps S, %xmm3 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movaps %xmm0, 0 * SIZE(X) + movaps %xmm2, 4 * SIZE(X) + movlps %xmm4, 0 * SIZE(Y) + movhps %xmm4, 2 * SIZE(Y) + movlps %xmm6, 4 * SIZE(Y) + movhps %xmm6, 6 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L26: + testq $2, N + jle .L27 + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movaps 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps 
%xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movaps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + movhps %xmm2, 2 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L27: + testq $1, N + jle .L999 + + movsd 0 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L30: + movq N, %rax + sarq $4, %rax + jle .L34 + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movsd 4 * SIZE(Y), %xmm3 + movhps 6 * SIZE(Y), %xmm3 + movsd 8 * SIZE(Y), %xmm9 + movhps 10 * SIZE(Y), %xmm9 + movsd 12 * SIZE(Y), %xmm11 + movhps 14 * SIZE(Y), %xmm11 + + movsd 0 * SIZE(X), %xmm0 + movhps 2 * SIZE(X), %xmm0 + movsd 4 * SIZE(X), %xmm2 + movhps 6 * SIZE(X), %xmm2 + movsd 8 * SIZE(X), %xmm8 + movhps 10 * SIZE(X), %xmm8 + movsd 12 * SIZE(X), %xmm10 + movhps 14 * SIZE(X), %xmm10 + + decq %rax + jle .L32 + ALIGN_3 + +.L31: +#if defined(PREFETCHW) + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm1, %xmm4 + mulps S, %xmm1 + movaps %xmm3, %xmm6 + mulps S, %xmm3 + movaps %xmm0, %xmm5 + mulps C, %xmm0 + movaps %xmm2, %xmm7 + mulps C, %xmm2 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + movsd 16 * SIZE(Y), %xmm1 + movhps 18 * SIZE(Y), %xmm1 + addps %xmm3, %xmm2 + movsd 20 * SIZE(Y), %xmm3 + movhps 22 * SIZE(Y), %xmm3 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + +#if defined(PREFETCHW) + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movlps %xmm0, 0 * SIZE(X) + movhps %xmm0, 2 * SIZE(X) + movsd 16 * SIZE(X), %xmm0 + movhps 18 * SIZE(X), %xmm0 + movlps %xmm2, 4 * SIZE(X) + movhps %xmm2, 6 * SIZE(X) + movsd 20 * SIZE(X), %xmm2 + movhps 22 * SIZE(X), %xmm2 + movlps %xmm4, 0 * SIZE(Y) + movhps %xmm4, 2 * SIZE(Y) + movlps %xmm6, 4 * SIZE(Y) + movhps %xmm6, 6 * SIZE(Y) + + movaps %xmm9, %xmm4 + mulps S, %xmm9 + movaps %xmm8, %xmm5 + mulps C, %xmm8 + movaps %xmm11, %xmm6 + mulps S, %xmm11 + movaps %xmm10, %xmm7 + mulps C, %xmm10 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm9, %xmm8 + movsd 24 * SIZE(Y), %xmm9 + movhps 26 * SIZE(Y), %xmm9 + addps %xmm11, %xmm10 + movsd 28 * SIZE(Y), %xmm11 + movhps 30 * SIZE(Y), %xmm11 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movlps %xmm8, 8 * SIZE(X) + movhps %xmm8, 10 * SIZE(X) + movsd 24 * SIZE(X), %xmm8 + movhps 26 * SIZE(X), %xmm8 + movlps %xmm10, 12 * SIZE(X) + movhps %xmm10, 14 * SIZE(X) + movsd 28 * SIZE(X), %xmm10 + movhps 30 * SIZE(X), %xmm10 + movlps %xmm4, 8 * SIZE(Y) + movhps %xmm4, 10 * SIZE(Y) + movlps %xmm6, 12 * SIZE(Y) + movhps %xmm6, 14 * SIZE(Y) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps %xmm1, %xmm4 + mulps S, %xmm1 + movaps %xmm3, %xmm6 + mulps S, %xmm3 + movaps %xmm0, %xmm5 + mulps C, %xmm0 + movaps %xmm2, %xmm7 + mulps C, %xmm2 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + movsd 32 * SIZE(Y), %xmm1 + movhps 34 * SIZE(Y), %xmm1 + addps %xmm3, %xmm2 + movsd 36 * SIZE(Y), %xmm3 + movhps 38 * SIZE(Y), %xmm3 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movlps %xmm0, 16 * SIZE(X) + movhps %xmm0, 18 * SIZE(X) + movsd 32 * SIZE(X), %xmm0 + movhps 34 * SIZE(X), %xmm0 + movlps %xmm2, 20 * SIZE(X) + movhps %xmm2, 
22 * SIZE(X) + movsd 36 * SIZE(X), %xmm2 + movhps 38 * SIZE(X), %xmm2 + movlps %xmm4, 16 * SIZE(Y) + movhps %xmm4, 18 * SIZE(Y) + movlps %xmm6, 20 * SIZE(Y) + movhps %xmm6, 22 * SIZE(Y) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps %xmm9, %xmm4 + mulps S, %xmm9 + movaps %xmm8, %xmm5 + mulps C, %xmm8 + movaps %xmm11, %xmm6 + mulps S, %xmm11 + movaps %xmm10, %xmm7 + mulps C, %xmm10 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm9, %xmm8 + movsd 40 * SIZE(Y), %xmm9 + movhps 42 * SIZE(Y), %xmm9 + addps %xmm11, %xmm10 + movsd 44 * SIZE(Y), %xmm11 + movhps 46 * SIZE(Y), %xmm11 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movlps %xmm8, 24 * SIZE(X) + movhps %xmm8, 26 * SIZE(X) + movsd 40 * SIZE(X), %xmm8 + movhps 42 * SIZE(X), %xmm8 + movlps %xmm10, 28 * SIZE(X) + movhps %xmm10, 30 * SIZE(X) + movsd 44 * SIZE(X), %xmm10 + movhps 46 * SIZE(X), %xmm10 + movlps %xmm4, 24 * SIZE(Y) + movhps %xmm4, 26 * SIZE(Y) + movlps %xmm6, 28 * SIZE(Y) + movhps %xmm6, 30 * SIZE(Y) + + addq $32 * SIZE, X + addq $32 * SIZE, Y + + decq %rax + jg .L31 + ALIGN_3 + +.L32: + movaps %xmm1, %xmm4 + mulps S, %xmm1 + movaps %xmm3, %xmm6 + mulps S, %xmm3 + movaps %xmm0, %xmm5 + mulps C, %xmm0 + movaps %xmm2, %xmm7 + mulps C, %xmm2 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + movsd 16 * SIZE(Y), %xmm1 + movhps 18 * SIZE(Y), %xmm1 + addps %xmm3, %xmm2 + movsd 20 * SIZE(Y), %xmm3 + movhps 22 * SIZE(Y), %xmm3 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movlps %xmm0, 0 * SIZE(X) + movhps %xmm0, 2 * SIZE(X) + movsd 16 * SIZE(X), %xmm0 + movhps 18 * SIZE(X), %xmm0 + movlps %xmm2, 4 * SIZE(X) + movhps %xmm2, 6 * SIZE(X) + movsd 20 * SIZE(X), %xmm2 + movhps 22 * SIZE(X), %xmm2 + + movsd %xmm4, 0 * SIZE(Y) + movhps %xmm4, 2 * SIZE(Y) + movsd %xmm6, 4 * SIZE(Y) + movhps %xmm6, 6 * SIZE(Y) + + movaps %xmm9, %xmm4 + mulps S, %xmm9 + movaps %xmm8, %xmm5 + mulps C, %xmm8 + movaps %xmm11, %xmm6 + mulps S, %xmm11 + movaps %xmm10, %xmm7 + mulps C, %xmm10 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm9, %xmm8 + movsd 24 * SIZE(Y), %xmm9 + movhps 26 * SIZE(Y), %xmm9 + addps %xmm11, %xmm10 + movsd 28 * SIZE(Y), %xmm11 + movhps 30 * SIZE(Y), %xmm11 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movlps %xmm8, 8 * SIZE(X) + movhps %xmm8, 10 * SIZE(X) + movsd 24 * SIZE(X), %xmm8 + movhps 26 * SIZE(X), %xmm8 + movlps %xmm10, 12 * SIZE(X) + movhps %xmm10, 14 * SIZE(X) + movsd 28 * SIZE(X), %xmm10 + movhps 30 * SIZE(X), %xmm10 + movlps %xmm4, 8 * SIZE(Y) + movhps %xmm4, 10 * SIZE(Y) + movlps %xmm6, 12 * SIZE(Y) + movhps %xmm6, 14 * SIZE(Y) + + movaps %xmm1, %xmm4 + mulps S, %xmm1 + movaps %xmm3, %xmm6 + mulps S, %xmm3 + movaps %xmm0, %xmm5 + mulps C, %xmm0 + movaps %xmm2, %xmm7 + mulps C, %xmm2 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movlps %xmm0, 16 * SIZE(X) + movhps %xmm0, 18 * SIZE(X) + movlps %xmm2, 20 * SIZE(X) + movhps %xmm2, 22 * SIZE(X) + movlps %xmm4, 16 * SIZE(Y) + movhps %xmm4, 18 * SIZE(Y) + movlps %xmm6, 20 * SIZE(Y) + movhps %xmm6, 22 * SIZE(Y) + + movaps %xmm9, %xmm4 + mulps S, %xmm9 + movaps %xmm8, %xmm5 + mulps C, %xmm8 + movaps %xmm11, %xmm6 + mulps S, %xmm11 + movaps %xmm10, %xmm7 + mulps C, %xmm10 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm9, %xmm8 + addps %xmm11, %xmm10 + subps 
%xmm5, %xmm4 + subps %xmm7, %xmm6 + + movlps %xmm8, 24 * SIZE(X) + movhps %xmm8, 26 * SIZE(X) + movlps %xmm10, 28 * SIZE(X) + movhps %xmm10, 30 * SIZE(X) + movlps %xmm4, 24 * SIZE(Y) + movhps %xmm4, 26 * SIZE(Y) + movlps %xmm6, 28 * SIZE(Y) + movhps %xmm6, 30 * SIZE(Y) + + addq $32 * SIZE, X + addq $32 * SIZE, Y + ALIGN_3 + +.L34: + testq $15, N + jle .L999 + + testq $8, N + jle .L35 + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + movhps 2 * SIZE(X), %xmm0 + movsd 4 * SIZE(Y), %xmm3 + movhps 6 * SIZE(Y), %xmm3 + movsd 4 * SIZE(X), %xmm2 + movhps 6 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm3, %xmm6 + movaps %xmm2, %xmm7 + + mulps C, %xmm0 + mulps S, %xmm1 + mulps C, %xmm2 + mulps S, %xmm3 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movlps %xmm0, 0 * SIZE(X) + movhps %xmm0, 2 * SIZE(X) + movlps %xmm2, 4 * SIZE(X) + movhps %xmm2, 6 * SIZE(X) + movlps %xmm4, 0 * SIZE(Y) + movhps %xmm4, 2 * SIZE(Y) + movlps %xmm6, 4 * SIZE(Y) + movhps %xmm6, 6 * SIZE(Y) + + movsd 8 * SIZE(Y), %xmm1 + movhps 10 * SIZE(Y), %xmm1 + movsd 8 * SIZE(X), %xmm0 + movhps 10 * SIZE(X), %xmm0 + movsd 12 * SIZE(Y), %xmm3 + movhps 14 * SIZE(Y), %xmm3 + movsd 12 * SIZE(X), %xmm2 + movhps 14 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm3, %xmm6 + movaps %xmm2, %xmm7 + + mulps C, %xmm0 + mulps S, %xmm1 + mulps C, %xmm2 + mulps S, %xmm3 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movlps %xmm0, 8 * SIZE(X) + movhps %xmm0, 10 * SIZE(X) + movlps %xmm2, 12 * SIZE(X) + movhps %xmm2, 14 * SIZE(X) + movlps %xmm4, 8 * SIZE(Y) + movhps %xmm4, 10 * SIZE(Y) + movlps %xmm6, 12 * SIZE(Y) + movhps %xmm6, 14 * SIZE(Y) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L35: + testq $4, N + jle .L36 + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + movhps 2 * SIZE(X), %xmm0 + movsd 4 * SIZE(Y), %xmm3 + movhps 6 * SIZE(Y), %xmm3 + movsd 4 * SIZE(X), %xmm2 + movhps 6 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm4 + movaps %xmm0, %xmm5 + movaps %xmm3, %xmm6 + movaps %xmm2, %xmm7 + + mulps C, %xmm0 + mulps S, %xmm1 + mulps C, %xmm2 + mulps S, %xmm3 + + mulps C, %xmm4 + mulps S, %xmm5 + mulps C, %xmm6 + mulps S, %xmm7 + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + subps %xmm5, %xmm4 + subps %xmm7, %xmm6 + + movlps %xmm0, 0 * SIZE(X) + movhps %xmm0, 2 * SIZE(X) + movlps %xmm2, 4 * SIZE(X) + movhps %xmm2, 6 * SIZE(X) + movlps %xmm4, 0 * SIZE(Y) + movhps %xmm4, 2 * SIZE(Y) + movlps %xmm6, 4 * SIZE(Y) + movhps %xmm6, 6 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L36: + testq $2, N + jle .L37 + + movsd 0 * SIZE(Y), %xmm1 + movhps 2 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + movhps 2 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(X) + movhps %xmm0, 2 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + movhps %xmm2, 2 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L37: + testq $1, N + jle .L999 + + movsd 0 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, 
%xmm2 + + movlps %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + jmp .L999 + ALIGN_3 + ALIGN_3 + +.L50: + movq N, %rax + sarq $2, %rax + jle .L55 + ALIGN_3 + +.L53: + movsd (Y), %xmm1 + movhps (Y, INCY), %xmm1 + movsd (X), %xmm0 + movhps (X, INCX), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, (X) + movhps %xmm0, (X, INCX) + movlps %xmm2, (Y) + movhps %xmm2, (Y, INCY) + + leaq (X, INCX, 2), X + leaq (Y, INCY, 2), Y + + movsd (Y), %xmm1 + movhps (Y, INCY), %xmm1 + movsd (X), %xmm0 + movhps (X, INCX), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, (X) + movhps %xmm0, (X, INCX) + movlps %xmm2, (Y) + movhps %xmm2, (Y, INCY) + + leaq (X, INCX, 2), X + leaq (Y, INCY, 2), Y + + decq %rax + jg .L53 + ALIGN_3 + +.L55: + movq N, %rax + andq $3, %rax + jle .L999 + ALIGN_3 + +.L56: + movsd (Y), %xmm1 + movsd (X), %xmm0 + + movaps %xmm1, %xmm2 + movaps %xmm0, %xmm3 + + mulps C, %xmm0 + mulps S, %xmm1 + + mulps C, %xmm2 + mulps S, %xmm3 + + addps %xmm1, %xmm0 + subps %xmm3, %xmm2 + + movlps %xmm0, (X) + movlps %xmm2, (Y) + + addq INCX, X + addq INCY, Y + + decq %rax + jg .L56 + ALIGN_3 + +.L999: + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/zrot_sse2.S b/kernel/x86_64/zrot_sse2.S new file mode 100644 index 0000000000..368101816d --- /dev/null +++ b/kernel/x86_64/zrot_sse2.S @@ -0,0 +1,1727 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
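The SSE variant above performs the same rotation on packed single-precision data: c and s are broadcast across all four float lanes (pshufd $0x0), and separate code paths are chosen depending on whether X and Y are 16-byte aligned, falling back to movsd/movhps pairs when they are not. An intrinsics sketch of one aligned vector step, offered only as an illustration and not as how the kernel is built:

#include <xmmintrin.h>   /* SSE */

/* one 4-float (two complex elements) step of the aligned loop */
static void csrot_step(float *x, float *y, float c, float s)
{
    __m128 vc = _mm_set1_ps(c);            /* like pshufd $0x0 on C */
    __m128 vs = _mm_set1_ps(s);
    __m128 vx = _mm_load_ps(x);            /* movaps 0 * SIZE(X) */
    __m128 vy = _mm_load_ps(y);            /* movaps 0 * SIZE(Y) */
    __m128 nx = _mm_add_ps(_mm_mul_ps(vc, vx), _mm_mul_ps(vs, vy));
    __m128 ny = _mm_sub_ps(_mm_mul_ps(vc, vy), _mm_mul_ps(vs, vx));
    _mm_store_ps(x, nx);
    _mm_store_ps(y, ny);
}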
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N ARG1 /* rdi */ +#define X ARG2 /* rsi */ +#define INCX ARG3 /* rdx */ +#define Y ARG4 /* rcx */ +#ifndef WINDOWS_ABI +#define INCY ARG5 /* r8 */ +#else +#define INCY %r10 +#endif + +#define C %xmm14 +#define S %xmm15 + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + movq 40(%rsp), INCY + movsd 48(%rsp), %xmm0 + movsd 56(%rsp), %xmm1 +#endif + + SAVEREGISTERS + + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + + pshufd $0x44, %xmm0, C + pshufd $0x44, %xmm1, S + + cmpq $0, N + jle .L999 + + cmpq $2 * SIZE, INCX + jne .L50 + cmpq $2 * SIZE, INCY + jne .L50 + +.L10: + testq $SIZE, X + jne .L30 + + testq $SIZE, Y + jne .L20 + + movq N, %rax + sarq $3, %rax + jle .L14 + + movapd 0 * SIZE(Y), %xmm1 + movapd 2 * SIZE(Y), %xmm3 + movapd 4 * SIZE(Y), %xmm9 + movapd 6 * SIZE(Y), %xmm11 + + movapd 0 * SIZE(X), %xmm0 + movapd 2 * SIZE(X), %xmm2 + movapd 4 * SIZE(X), %xmm8 + movapd 6 * SIZE(X), %xmm10 + + decq %rax + jle .L12 + ALIGN_3 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movapd %xmm1, %xmm4 + mulpd S, %xmm1 + movapd %xmm3, %xmm6 + mulpd S, %xmm3 + movapd %xmm0, %xmm5 + mulpd C, %xmm0 + movapd %xmm2, %xmm7 + mulpd C, %xmm2 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + movapd 8 * SIZE(Y), %xmm1 + addpd %xmm3, %xmm2 + movapd 10 * SIZE(Y), %xmm3 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movapd %xmm0, 0 * SIZE(X) + movapd 8 * SIZE(X), %xmm0 + movapd %xmm2, 2 * SIZE(X) + movapd 10 * SIZE(X), %xmm2 + movapd %xmm4, 0 * SIZE(Y) + movapd %xmm6, 2 * SIZE(Y) + + movapd %xmm9, %xmm4 + mulpd S, %xmm9 + movapd %xmm8, %xmm5 + mulpd C, %xmm8 + movapd %xmm11, %xmm6 + mulpd S, %xmm11 + movapd %xmm10, %xmm7 + mulpd C, %xmm10 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm9, %xmm8 + movapd 12 * SIZE(Y), %xmm9 + addpd %xmm11, %xmm10 + movapd 14 * SIZE(Y), %xmm11 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movapd %xmm8, 4 * SIZE(X) + movapd 12 * SIZE(X), %xmm8 + movapd %xmm10,6 * SIZE(X) + movapd 14 * SIZE(X), %xmm10 + movapd %xmm4, 4 * SIZE(Y) + movapd %xmm6, 6 * SIZE(Y) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movapd %xmm1, %xmm4 + mulpd S, %xmm1 + movapd %xmm3, %xmm6 + mulpd S, %xmm3 + movapd %xmm0, %xmm5 + mulpd C, %xmm0 + movapd %xmm2, %xmm7 + mulpd C, %xmm2 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + movapd 16 * SIZE(Y), %xmm1 + addpd %xmm3, %xmm2 + movapd 18 * SIZE(Y), %xmm3 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movapd %xmm0, 8 * SIZE(X) + movapd 16 * SIZE(X), %xmm0 + movapd %xmm2, 10 * SIZE(X) + movapd 18 * SIZE(X), %xmm2 + movapd %xmm4, 8 * SIZE(Y) + movapd %xmm6, 10 * SIZE(Y) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movapd %xmm9, %xmm4 + mulpd S, %xmm9 + movapd %xmm8, %xmm5 + mulpd C, %xmm8 + movapd %xmm11, %xmm6 + mulpd S, %xmm11 + movapd %xmm10, %xmm7 + mulpd C, %xmm10 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm9, %xmm8 + movapd 20 * SIZE(Y), %xmm9 + addpd %xmm11, %xmm10 + movapd 22 * SIZE(Y), %xmm11 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movapd %xmm8, 12 * SIZE(X) + movapd 20 * SIZE(X), %xmm8 + movapd %xmm10, 14 * 
SIZE(X) + movapd 22 * SIZE(X), %xmm10 + movapd %xmm4, 12 * SIZE(Y) + movapd %xmm6, 14 * SIZE(Y) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + + decq %rax + jg .L11 + ALIGN_3 + +.L12: + movapd %xmm1, %xmm4 + mulpd S, %xmm1 + movapd %xmm3, %xmm6 + mulpd S, %xmm3 + movapd %xmm0, %xmm5 + mulpd C, %xmm0 + movapd %xmm2, %xmm7 + mulpd C, %xmm2 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + movapd 8 * SIZE(Y), %xmm1 + addpd %xmm3, %xmm2 + movapd 10 * SIZE(Y), %xmm3 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movapd %xmm0, 0 * SIZE(X) + movapd 8 * SIZE(X), %xmm0 + movapd %xmm2, 2 * SIZE(X) + movapd 10 * SIZE(X), %xmm2 + + movapd %xmm4, 0 * SIZE(Y) + movapd %xmm6, 2 * SIZE(Y) + + movapd %xmm9, %xmm4 + mulpd S, %xmm9 + movapd %xmm8, %xmm5 + mulpd C, %xmm8 + movapd %xmm11, %xmm6 + mulpd S, %xmm11 + movapd %xmm10, %xmm7 + mulpd C, %xmm10 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm9, %xmm8 + movapd 12 * SIZE(Y), %xmm9 + addpd %xmm11, %xmm10 + movapd 14 * SIZE(Y), %xmm11 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movapd %xmm8, 4 * SIZE(X) + movapd 12 * SIZE(X), %xmm8 + movapd %xmm10,6 * SIZE(X) + movapd 14 * SIZE(X), %xmm10 + movapd %xmm4, 4 * SIZE(Y) + movapd %xmm6, 6 * SIZE(Y) + + movapd %xmm1, %xmm4 + mulpd S, %xmm1 + movapd %xmm3, %xmm6 + mulpd S, %xmm3 + movapd %xmm0, %xmm5 + mulpd C, %xmm0 + movapd %xmm2, %xmm7 + mulpd C, %xmm2 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movapd %xmm0, 8 * SIZE(X) + movapd %xmm2, 10 * SIZE(X) + movapd %xmm4, 8 * SIZE(Y) + movapd %xmm6, 10 * SIZE(Y) + + movapd %xmm9, %xmm4 + mulpd S, %xmm9 + movapd %xmm8, %xmm5 + mulpd C, %xmm8 + movapd %xmm11, %xmm6 + mulpd S, %xmm11 + movapd %xmm10, %xmm7 + mulpd C, %xmm10 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movapd %xmm8, 12 * SIZE(X) + movapd %xmm10, 14 * SIZE(X) + movapd %xmm4, 12 * SIZE(Y) + movapd %xmm6, 14 * SIZE(Y) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L14: + testq $7, N + jle .L999 + + testq $4, N + jle .L15 + + movapd 0 * SIZE(Y), %xmm1 + movapd 0 * SIZE(X), %xmm0 + movapd 2 * SIZE(Y), %xmm3 + movapd 2 * SIZE(X), %xmm2 + + movapd %xmm1, %xmm4 + movapd %xmm0, %xmm5 + movapd %xmm3, %xmm6 + movapd %xmm2, %xmm7 + + mulpd C, %xmm0 + mulpd S, %xmm1 + mulpd C, %xmm2 + mulpd S, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movapd %xmm0, 0 * SIZE(X) + movapd %xmm2, 2 * SIZE(X) + movapd %xmm4, 0 * SIZE(Y) + movapd %xmm6, 2 * SIZE(Y) + + movapd 4 * SIZE(Y), %xmm1 + movapd 4 * SIZE(X), %xmm0 + movapd 6 * SIZE(Y), %xmm3 + movapd 6 * SIZE(X), %xmm2 + + movapd %xmm1, %xmm4 + movapd %xmm0, %xmm5 + movapd %xmm3, %xmm6 + movapd %xmm2, %xmm7 + + mulpd C, %xmm0 + mulpd S, %xmm1 + mulpd C, %xmm2 + mulpd S, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movapd %xmm0, 4 * SIZE(X) + movapd %xmm2, 6 * SIZE(X) + movapd %xmm4, 4 * SIZE(Y) + movapd %xmm6, 6 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L15: + testq $2, N + jle .L16 + + movapd 0 * SIZE(Y), %xmm1 + movapd 0 * SIZE(X), %xmm0 + movapd 2 * SIZE(Y), %xmm3 + movapd 2 * SIZE(X), %xmm2 + + 
movapd %xmm1, %xmm4 + movapd %xmm0, %xmm5 + movapd %xmm3, %xmm6 + movapd %xmm2, %xmm7 + + mulpd C, %xmm0 + mulpd S, %xmm1 + mulpd C, %xmm2 + mulpd S, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movapd %xmm0, 0 * SIZE(X) + movapd %xmm2, 2 * SIZE(X) + movapd %xmm4, 0 * SIZE(Y) + movapd %xmm6, 2 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L16: + testq $1, N + jle .L999 + + movapd 0 * SIZE(Y), %xmm1 + movapd 0 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 0 * SIZE(X) + movapd %xmm2, 0 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L20: + movapd -1 * SIZE(Y), %xmm1 + + movq N, %rax + sarq $3, %rax + jle .L24 + ALIGN_3 + +.L21: +#if defined(PREFETCHW) + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movapd 1 * SIZE(Y), %xmm3 + movapd 3 * SIZE(Y), %xmm8 + movapd 0 * SIZE(X), %xmm0 + movapd 2 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm3, %xmm1 + SHUFPD_1 %xmm8, %xmm3 + + movapd %xmm1, %xmm4 + movapd %xmm0, %xmm5 + movapd %xmm3, %xmm6 + movapd %xmm2, %xmm7 + + mulpd C, %xmm0 + mulpd S, %xmm1 + mulpd C, %xmm2 + mulpd S, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movapd %xmm0, 0 * SIZE(X) + movapd %xmm2, 2 * SIZE(X) + + movlps %xmm4, 0 * SIZE(Y) + movhps %xmm4, 1 * SIZE(Y) + movlps %xmm6, 2 * SIZE(Y) + movhps %xmm6, 3 * SIZE(Y) + +#if defined(PREFETCHW) + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movapd 5 * SIZE(Y), %xmm9 + movapd 7 * SIZE(Y), %xmm1 + movapd 4 * SIZE(X), %xmm0 + movapd 6 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm9, %xmm8 + SHUFPD_1 %xmm1, %xmm9 + + movapd %xmm8, %xmm4 + movapd %xmm0, %xmm5 + movapd %xmm9, %xmm6 + movapd %xmm2, %xmm7 + + mulpd C, %xmm0 + mulpd S, %xmm8 + mulpd C, %xmm2 + mulpd S, %xmm9 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movapd %xmm0, 4 * SIZE(X) + movapd %xmm2, 6 * SIZE(X) + movlps %xmm4, 4 * SIZE(Y) + movhps %xmm4, 5 * SIZE(Y) + movlps %xmm6, 6 * SIZE(Y) + movhps %xmm6, 7 * SIZE(Y) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movapd 9 * SIZE(Y), %xmm3 + movapd 11 * SIZE(Y), %xmm8 + movapd 8 * SIZE(X), %xmm0 + movapd 10 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm3, %xmm1 + SHUFPD_1 %xmm8, %xmm3 + + movapd %xmm1, %xmm4 + movapd %xmm0, %xmm5 + movapd %xmm3, %xmm6 + movapd %xmm2, %xmm7 + + mulpd C, %xmm0 + mulpd S, %xmm1 + mulpd C, %xmm2 + mulpd S, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movapd %xmm0, 8 * SIZE(X) + movapd %xmm2, 10 * SIZE(X) + movlps %xmm4, 8 * SIZE(Y) + movhps %xmm4, 9 * SIZE(Y) + movlps %xmm6, 10 * SIZE(Y) + movhps %xmm6, 11 * SIZE(Y) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movapd 13 * SIZE(Y), %xmm9 + movapd 15 * SIZE(Y), %xmm1 + movapd 12 * SIZE(X), %xmm0 + movapd 14 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm9, %xmm8 + SHUFPD_1 %xmm1, %xmm9 + + movapd %xmm8, %xmm4 + movapd %xmm0, %xmm5 + movapd %xmm9, %xmm6 + movapd %xmm2, %xmm7 + + mulpd C, %xmm0 + mulpd S, %xmm8 + mulpd C, %xmm2 + mulpd 
S, %xmm9 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movapd %xmm0, 12 * SIZE(X) + movapd %xmm2, 14 * SIZE(X) + movlps %xmm4, 12 * SIZE(Y) + movhps %xmm4, 13 * SIZE(Y) + movlps %xmm6, 14 * SIZE(Y) + movhps %xmm6, 15 * SIZE(Y) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + decq %rax + jg .L21 + ALIGN_3 + +.L24: + testq $7, N + jle .L999 + + testq $4, N + jle .L25 + + movapd 1 * SIZE(Y), %xmm3 + movapd 3 * SIZE(Y), %xmm8 + movapd 0 * SIZE(X), %xmm0 + movapd 2 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm3, %xmm1 + SHUFPD_1 %xmm8, %xmm3 + + movapd %xmm1, %xmm4 + movapd %xmm0, %xmm5 + movapd %xmm3, %xmm6 + movapd %xmm2, %xmm7 + + mulpd C, %xmm0 + mulpd S, %xmm1 + mulpd C, %xmm2 + mulpd S, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movapd %xmm0, 0 * SIZE(X) + movapd %xmm2, 2 * SIZE(X) + movlps %xmm4, 0 * SIZE(Y) + movhps %xmm4, 1 * SIZE(Y) + movlps %xmm6, 2 * SIZE(Y) + movhps %xmm6, 3 * SIZE(Y) + + movapd 5 * SIZE(Y), %xmm9 + movapd 7 * SIZE(Y), %xmm1 + movapd 4 * SIZE(X), %xmm0 + movapd 6 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm9, %xmm8 + SHUFPD_1 %xmm1, %xmm9 + + movapd %xmm8, %xmm4 + movapd %xmm0, %xmm5 + movapd %xmm9, %xmm6 + movapd %xmm2, %xmm7 + + mulpd C, %xmm0 + mulpd S, %xmm8 + mulpd C, %xmm2 + mulpd S, %xmm9 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm8, %xmm0 + addpd %xmm9, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movapd %xmm0, 4 * SIZE(X) + movapd %xmm2, 6 * SIZE(X) + movlps %xmm4, 4 * SIZE(Y) + movhps %xmm4, 5 * SIZE(Y) + movlps %xmm6, 6 * SIZE(Y) + movhps %xmm6, 7 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L25: + testq $2, N + jle .L26 + + movapd 1 * SIZE(Y), %xmm3 + movapd 3 * SIZE(Y), %xmm8 + movapd 0 * SIZE(X), %xmm0 + movapd 2 * SIZE(X), %xmm2 + + SHUFPD_1 %xmm3, %xmm1 + SHUFPD_1 %xmm8, %xmm3 + + movapd %xmm1, %xmm4 + movapd %xmm0, %xmm5 + movapd %xmm3, %xmm6 + movapd %xmm2, %xmm7 + + mulpd C, %xmm0 + mulpd S, %xmm1 + mulpd C, %xmm2 + mulpd S, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movapd %xmm0, 0 * SIZE(X) + movapd %xmm2, 2 * SIZE(X) + movlps %xmm4, 0 * SIZE(Y) + movhps %xmm4, 1 * SIZE(Y) + movlps %xmm6, 2 * SIZE(Y) + movhps %xmm6, 3 * SIZE(Y) + movapd %xmm8, %xmm1 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L26: + testq $1, N + jle .L999 + + movapd 1 * SIZE(Y), %xmm4 + movapd 0 * SIZE(X), %xmm0 + + SHUFPD_1 %xmm4, %xmm1 + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 0 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + movhps %xmm2, 1 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L30: + testq $SIZE, Y + jne .L40 + + movapd -1 * SIZE(X), %xmm0 + + movq N, %rax + sarq $3, %rax + jle .L34 + ALIGN_3 + +.L31: +#if defined(PREFETCHW) + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movapd 1 * SIZE(X), %xmm2 + movapd 3 * SIZE(X), %xmm8 + movapd 0 * SIZE(Y), %xmm1 + movapd 2 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + SHUFPD_1 %xmm8, %xmm2 + + movapd %xmm1, %xmm4 + movapd %xmm0, %xmm5 + movapd %xmm3, %xmm6 + movapd %xmm2, %xmm7 + + mulpd C, %xmm0 + mulpd S, %xmm1 + mulpd C, %xmm2 + mulpd S, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd 
C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movlps %xmm0, 0 * SIZE(X) + movhps %xmm0, 1 * SIZE(X) + movlps %xmm2, 2 * SIZE(X) + movhps %xmm2, 3 * SIZE(X) + movapd %xmm4, 0 * SIZE(Y) + movapd %xmm6, 2 * SIZE(Y) + +#if defined(PREFETCHW) + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movapd 5 * SIZE(X), %xmm2 + movapd 7 * SIZE(X), %xmm0 + movapd 4 * SIZE(Y), %xmm1 + movapd 6 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm8 + SHUFPD_1 %xmm0, %xmm2 + + movapd %xmm1, %xmm4 + movapd %xmm8, %xmm5 + movapd %xmm3, %xmm6 + movapd %xmm2, %xmm7 + + mulpd C, %xmm8 + mulpd S, %xmm1 + mulpd C, %xmm2 + mulpd S, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm8 + addpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movlps %xmm8, 4 * SIZE(X) + movhps %xmm8, 5 * SIZE(X) + movlps %xmm2, 6 * SIZE(X) + movhps %xmm2, 7 * SIZE(X) + movapd %xmm4, 4 * SIZE(Y) + movapd %xmm6, 6 * SIZE(Y) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movapd 9 * SIZE(X), %xmm2 + movapd 11 * SIZE(X), %xmm8 + movapd 8 * SIZE(Y), %xmm1 + movapd 10 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + SHUFPD_1 %xmm8, %xmm2 + + movapd %xmm1, %xmm4 + movapd %xmm0, %xmm5 + movapd %xmm3, %xmm6 + movapd %xmm2, %xmm7 + + mulpd C, %xmm0 + mulpd S, %xmm1 + mulpd C, %xmm2 + mulpd S, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movlps %xmm0, 8 * SIZE(X) + movhps %xmm0, 9 * SIZE(X) + movlps %xmm2, 10 * SIZE(X) + movhps %xmm2, 11 * SIZE(X) + movapd %xmm4, 8 * SIZE(Y) + movapd %xmm6, 10 * SIZE(Y) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movapd 13 * SIZE(X), %xmm2 + movapd 15 * SIZE(X), %xmm0 + movapd 12 * SIZE(Y), %xmm1 + movapd 14 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm8 + SHUFPD_1 %xmm0, %xmm2 + + movapd %xmm1, %xmm4 + movapd %xmm8, %xmm5 + movapd %xmm3, %xmm6 + movapd %xmm2, %xmm7 + + mulpd C, %xmm8 + mulpd S, %xmm1 + mulpd C, %xmm2 + mulpd S, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm8 + addpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movlps %xmm8, 12 * SIZE(X) + movhps %xmm8, 13 * SIZE(X) + movlps %xmm2, 14 * SIZE(X) + movhps %xmm2, 15 * SIZE(X) + movapd %xmm4, 12 * SIZE(Y) + movapd %xmm6, 14 * SIZE(Y) + + addq $16 * SIZE, Y + addq $16 * SIZE, X + decq %rax + jg .L31 + ALIGN_3 + +.L34: + testq $7, N + jle .L999 + + testq $4, N + jle .L35 + + movapd 1 * SIZE(X), %xmm2 + movapd 3 * SIZE(X), %xmm8 + movapd 0 * SIZE(Y), %xmm1 + movapd 2 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + SHUFPD_1 %xmm8, %xmm2 + + movapd %xmm1, %xmm4 + movapd %xmm0, %xmm5 + movapd %xmm3, %xmm6 + movapd %xmm2, %xmm7 + + mulpd C, %xmm0 + mulpd S, %xmm1 + mulpd C, %xmm2 + mulpd S, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movlps %xmm0, 0 * SIZE(X) + movhps %xmm0, 1 * SIZE(X) + movlps %xmm2, 2 * SIZE(X) + movhps %xmm2, 3 * SIZE(X) + movapd %xmm4, 0 * SIZE(Y) + movapd %xmm6, 2 * SIZE(Y) + + movapd 5 * SIZE(X), %xmm2 + movapd 7 * SIZE(X), %xmm0 + movapd 4 * SIZE(Y), %xmm1 + movapd 6 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm8 + SHUFPD_1 %xmm0, %xmm2 + + movapd %xmm1, %xmm4 + movapd %xmm8, %xmm5 + movapd %xmm3, %xmm6 + 
movapd %xmm2, %xmm7 + + mulpd C, %xmm8 + mulpd S, %xmm1 + mulpd C, %xmm2 + mulpd S, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm8 + addpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movlps %xmm8, 4 * SIZE(X) + movhps %xmm8, 5 * SIZE(X) + movlps %xmm2, 6 * SIZE(X) + movhps %xmm2, 7 * SIZE(X) + movapd %xmm4, 4 * SIZE(Y) + movapd %xmm6, 6 * SIZE(Y) + + addq $8 * SIZE, Y + addq $8 * SIZE, X + ALIGN_3 + +.L35: + testq $2, N + jle .L36 + + movapd 1 * SIZE(X), %xmm2 + movapd 3 * SIZE(X), %xmm8 + + movapd 0 * SIZE(Y), %xmm1 + movapd 2 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + SHUFPD_1 %xmm8, %xmm2 + + movapd %xmm1, %xmm4 + movapd %xmm0, %xmm5 + movapd %xmm3, %xmm6 + movapd %xmm2, %xmm7 + + mulpd C, %xmm0 + mulpd S, %xmm1 + mulpd C, %xmm2 + mulpd S, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movlps %xmm0, 0 * SIZE(X) + movhps %xmm0, 1 * SIZE(X) + movlps %xmm2, 2 * SIZE(X) + movhps %xmm2, 3 * SIZE(X) + movapd %xmm4, 0 * SIZE(Y) + movapd %xmm6, 2 * SIZE(Y) + movapd %xmm8, %xmm0 + + addq $4 * SIZE, Y + addq $4 * SIZE, X + ALIGN_3 + +.L36: + testq $1, N + jle .L999 + + movapd 1 * SIZE(X), %xmm4 + movapd 0 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm4, %xmm0 + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(X) + movhps %xmm0, 1 * SIZE(X) + movapd %xmm2, 0 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L40: + movsd 0 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulsd C, %xmm0 + mulsd S, %xmm1 + + mulsd C, %xmm2 + mulsd S, %xmm3 + + addsd %xmm1, %xmm0 + subsd %xmm3, %xmm2 + + movsd %xmm0, 0 * SIZE(X) + movsd %xmm2, 0 * SIZE(Y) + addq $1 * SIZE, Y + addq $1 * SIZE, X + + decq N + jle .L47 + + movq N, %rax + sarq $3, %rax + jle .L44 + + movapd 0 * SIZE(Y), %xmm1 + movapd 2 * SIZE(Y), %xmm3 + movapd 4 * SIZE(Y), %xmm9 + movapd 6 * SIZE(Y), %xmm11 + + movapd 0 * SIZE(X), %xmm0 + movapd 2 * SIZE(X), %xmm2 + movapd 4 * SIZE(X), %xmm8 + movapd 6 * SIZE(X), %xmm10 + + decq %rax + jle .L42 + ALIGN_3 + +.L41: +#if defined(PREFETCHW) + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movapd %xmm1, %xmm4 + mulpd S, %xmm1 + movapd %xmm3, %xmm6 + mulpd S, %xmm3 + movapd %xmm0, %xmm5 + mulpd C, %xmm0 + movapd %xmm2, %xmm7 + mulpd C, %xmm2 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + movapd 8 * SIZE(Y), %xmm1 + addpd %xmm3, %xmm2 + movapd 10 * SIZE(Y), %xmm3 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + +#if defined(PREFETCHW) + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movapd %xmm0, 0 * SIZE(X) + movapd 8 * SIZE(X), %xmm0 + movapd %xmm2, 2 * SIZE(X) + movapd 10 * SIZE(X), %xmm2 + movapd %xmm4, 0 * SIZE(Y) + movapd %xmm6, 2 * SIZE(Y) + + movapd %xmm9, %xmm4 + mulpd S, %xmm9 + movapd %xmm8, %xmm5 + mulpd C, %xmm8 + movapd %xmm11, %xmm6 + mulpd S, %xmm11 + movapd %xmm10, %xmm7 + mulpd C, %xmm10 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm9, %xmm8 + movapd 12 * SIZE(Y), %xmm9 + addpd %xmm11, %xmm10 + movapd 14 * SIZE(Y), %xmm11 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movapd %xmm8, 4 * SIZE(X) + movapd 12 * SIZE(X), %xmm8 + movapd %xmm10,6 * SIZE(X) + movapd 14 * SIZE(X), %xmm10 + movapd %xmm4, 4 * SIZE(Y) + movapd %xmm6, 6 * SIZE(Y) + +#if defined(PREFETCHW) 
&& !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movapd %xmm1, %xmm4 + mulpd S, %xmm1 + movapd %xmm3, %xmm6 + mulpd S, %xmm3 + movapd %xmm0, %xmm5 + mulpd C, %xmm0 + movapd %xmm2, %xmm7 + mulpd C, %xmm2 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + movapd 16 * SIZE(Y), %xmm1 + addpd %xmm3, %xmm2 + movapd 18 * SIZE(Y), %xmm3 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movapd %xmm0, 8 * SIZE(X) + movapd 16 * SIZE(X), %xmm0 + movapd %xmm2, 10 * SIZE(X) + movapd 18 * SIZE(X), %xmm2 + movapd %xmm4, 8 * SIZE(Y) + movapd %xmm6, 10 * SIZE(Y) + +#if defined(PREFETCHW) && !defined(FETCH128) + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movapd %xmm9, %xmm4 + mulpd S, %xmm9 + movapd %xmm8, %xmm5 + mulpd C, %xmm8 + movapd %xmm11, %xmm6 + mulpd S, %xmm11 + movapd %xmm10, %xmm7 + mulpd C, %xmm10 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm9, %xmm8 + movapd 20 * SIZE(Y), %xmm9 + addpd %xmm11, %xmm10 + movapd 22 * SIZE(Y), %xmm11 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movapd %xmm8, 12 * SIZE(X) + movapd 20 * SIZE(X), %xmm8 + movapd %xmm10, 14 * SIZE(X) + movapd 22 * SIZE(X), %xmm10 + movapd %xmm4, 12 * SIZE(Y) + movapd %xmm6, 14 * SIZE(Y) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + + decq %rax + jg .L41 + ALIGN_3 + +.L42: + movapd %xmm1, %xmm4 + mulpd S, %xmm1 + movapd %xmm3, %xmm6 + mulpd S, %xmm3 + movapd %xmm0, %xmm5 + mulpd C, %xmm0 + movapd %xmm2, %xmm7 + mulpd C, %xmm2 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + movapd 8 * SIZE(Y), %xmm1 + addpd %xmm3, %xmm2 + movapd 10 * SIZE(Y), %xmm3 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movapd %xmm0, 0 * SIZE(X) + movapd 8 * SIZE(X), %xmm0 + movapd %xmm2, 2 * SIZE(X) + movapd 10 * SIZE(X), %xmm2 + + movapd %xmm4, 0 * SIZE(Y) + movapd %xmm6, 2 * SIZE(Y) + + movapd %xmm9, %xmm4 + mulpd S, %xmm9 + movapd %xmm8, %xmm5 + mulpd C, %xmm8 + movapd %xmm11, %xmm6 + mulpd S, %xmm11 + movapd %xmm10, %xmm7 + mulpd C, %xmm10 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm9, %xmm8 + movapd 12 * SIZE(Y), %xmm9 + addpd %xmm11, %xmm10 + movapd 14 * SIZE(Y), %xmm11 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movapd %xmm8, 4 * SIZE(X) + movapd 12 * SIZE(X), %xmm8 + movapd %xmm10,6 * SIZE(X) + movapd 14 * SIZE(X), %xmm10 + movapd %xmm4, 4 * SIZE(Y) + movapd %xmm6, 6 * SIZE(Y) + + movapd %xmm1, %xmm4 + mulpd S, %xmm1 + movapd %xmm3, %xmm6 + mulpd S, %xmm3 + movapd %xmm0, %xmm5 + mulpd C, %xmm0 + movapd %xmm2, %xmm7 + mulpd C, %xmm2 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movapd %xmm0, 8 * SIZE(X) + movapd %xmm2, 10 * SIZE(X) + movapd %xmm4, 8 * SIZE(Y) + movapd %xmm6, 10 * SIZE(Y) + + movapd %xmm9, %xmm4 + mulpd S, %xmm9 + movapd %xmm8, %xmm5 + mulpd C, %xmm8 + movapd %xmm11, %xmm6 + mulpd S, %xmm11 + movapd %xmm10, %xmm7 + mulpd C, %xmm10 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movapd %xmm8, 12 * SIZE(X) + movapd %xmm10, 14 * SIZE(X) + movapd %xmm4, 12 * SIZE(Y) + movapd %xmm6, 14 * SIZE(Y) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L44: + testq $4, N + jle .L45 + + movapd 0 * SIZE(Y), %xmm1 + movapd 0 * SIZE(X), %xmm0 + movapd 2 * SIZE(Y), %xmm3 + movapd 2 * SIZE(X), %xmm2 + + movapd %xmm1, 
%xmm4 + movapd %xmm0, %xmm5 + movapd %xmm3, %xmm6 + movapd %xmm2, %xmm7 + + mulpd C, %xmm0 + mulpd S, %xmm1 + mulpd C, %xmm2 + mulpd S, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movapd %xmm0, 0 * SIZE(X) + movapd %xmm2, 2 * SIZE(X) + movapd %xmm4, 0 * SIZE(Y) + movapd %xmm6, 2 * SIZE(Y) + + movapd 4 * SIZE(Y), %xmm1 + movapd 4 * SIZE(X), %xmm0 + movapd 6 * SIZE(Y), %xmm3 + movapd 6 * SIZE(X), %xmm2 + + movapd %xmm1, %xmm4 + movapd %xmm0, %xmm5 + movapd %xmm3, %xmm6 + movapd %xmm2, %xmm7 + + mulpd C, %xmm0 + mulpd S, %xmm1 + mulpd C, %xmm2 + mulpd S, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movapd %xmm0, 4 * SIZE(X) + movapd %xmm2, 6 * SIZE(X) + movapd %xmm4, 4 * SIZE(Y) + movapd %xmm6, 6 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L45: + testq $2, N + jle .L46 + + movapd 0 * SIZE(Y), %xmm1 + movapd 0 * SIZE(X), %xmm0 + movapd 2 * SIZE(Y), %xmm3 + movapd 2 * SIZE(X), %xmm2 + + movapd %xmm1, %xmm4 + movapd %xmm0, %xmm5 + movapd %xmm3, %xmm6 + movapd %xmm2, %xmm7 + + mulpd C, %xmm0 + mulpd S, %xmm1 + mulpd C, %xmm2 + mulpd S, %xmm3 + + mulpd C, %xmm4 + mulpd S, %xmm5 + mulpd C, %xmm6 + mulpd S, %xmm7 + + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 + + movapd %xmm0, 0 * SIZE(X) + movapd %xmm2, 2 * SIZE(X) + movapd %xmm4, 0 * SIZE(Y) + movapd %xmm6, 2 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L46: + testq $1, N + jle .L47 + + movapd 0 * SIZE(Y), %xmm1 + movapd 0 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movapd %xmm0, 0 * SIZE(X) + movapd %xmm2, 0 * SIZE(Y) + addq $2 * SIZE, Y + addq $2 * SIZE, X + ALIGN_3 + +.L47: + movsd 0 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulsd C, %xmm0 + mulsd S, %xmm1 + + mulsd C, %xmm2 + mulsd S, %xmm3 + + addsd %xmm1, %xmm0 + subsd %xmm3, %xmm2 + + movsd %xmm0, 0 * SIZE(X) + movsd %xmm2, 0 * SIZE(Y) + jmp .L999 + ALIGN_3 + +.L50: + movq N, %rax + sarq $2, %rax + jle .L55 + ALIGN_3 + +.L53: + movsd 0 * SIZE(Y), %xmm1 + movhps 1 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(X) + movhps %xmm0, 1 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + movhps %xmm2, 1 * SIZE(Y) + + addq INCX, X + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm1 + movhps 1 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(X) + movhps %xmm0, 1 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + movhps %xmm2, 1 * SIZE(Y) + + addq INCX, X + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm1 + movhps 1 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(X) + movhps %xmm0, 1 * SIZE(X) + movlps 
%xmm2, 0 * SIZE(Y) + movhps %xmm2, 1 * SIZE(Y) + + addq INCX, X + addq INCY, Y + + movsd 0 * SIZE(Y), %xmm1 + movhps 1 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(X) + movhps %xmm0, 1 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + movhps %xmm2, 1 * SIZE(Y) + + addq INCX, X + addq INCY, Y + + decq %rax + jg .L53 + ALIGN_3 + +.L55: + movq N, %rax + andq $3, %rax + jle .L999 + ALIGN_3 + +.L56: + movsd 0 * SIZE(Y), %xmm1 + movhps 1 * SIZE(Y), %xmm1 + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + + movapd %xmm1, %xmm2 + movapd %xmm0, %xmm3 + + mulpd C, %xmm0 + mulpd S, %xmm1 + + mulpd C, %xmm2 + mulpd S, %xmm3 + + addpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + + movlps %xmm0, 0 * SIZE(X) + movhps %xmm0, 1 * SIZE(X) + movlps %xmm2, 0 * SIZE(Y) + movhps %xmm2, 1 * SIZE(Y) + + addq INCX, X + addq INCY, Y + + decq %rax + jg .L56 + ALIGN_3 + +.L999: + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/zscal.S b/kernel/x86_64/zscal.S new file mode 100644 index 0000000000..5282e0f725 --- /dev/null +++ b/kernel/x86_64/zscal.S @@ -0,0 +1,223 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
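
For reference, the rotation kernel that ends just above applies a plane rotation with a real cosine C and sine S to two double-precision vectors, with separate unrolled paths for aligned, misaligned, and strided (INCX/INCY != 1) data. Because both factors are real, the same update is applied uniformly to every stored double, whether the vectors are viewed as real values or as interleaved complex pairs, which is why the code can broadcast C and S with mulpd across whole 16-byte blocks. A minimal C sketch of the unit-stride loop follows; the function and variable names are illustrative only and are not taken from the imported sources.

    /* x[i] <- c*x[i] + s*y[i];  y[i] <- c*y[i] - s*x[i] */
    static void rot_ref(long n, double *x, double *y, double c, double s) {
        for (long i = 0; i < n; i++) {
            double xi = x[i], yi = y[i];
            x[i] = c * xi + s * yi;   /* addpd of (C*x) and (S*y) in the kernel */
            y[i] = c * yi - s * xi;   /* subpd of (S*x) from (C*y)              */
        }
    }
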
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N ARG1 +#define X ARG4 +#define INCX ARG5 + +#define I %rax + +#include "l1param.h" + + PROLOGUE + PROFCODE + + salq $ZBASE_SHIFT, INCX + + FLD 8(%rsp) + FLD 24(%rsp) + + testq N, N + jle .L999 + + fld %st(1) + fabs + fld %st(1) + fabs + faddp %st, %st(1) + + fldz + fcomip %st(1), %st + ffreep %st + jne .L30 + + EMMS + + pxor %mm0, %mm0 + + cmpq $2 * SIZE, INCX + jne .L20 + + movq N, I + sarq $2, I + jle .L15 + ALIGN_4 + +.L12: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movq %mm0, 0(X) + movq %mm0, 8(X) + movq %mm0, 16(X) + movq %mm0, 24(X) + movq %mm0, 32(X) + movq %mm0, 40(X) + movq %mm0, 48(X) + movq %mm0, 56(X) + movq %mm0, 64(X) + movq %mm0, 72(X) + movq %mm0, 80(X) + movq %mm0, 88(X) + movq %mm0, 96(X) + movq %mm0, 104(X) + movq %mm0, 112(X) + movq %mm0, 120(X) + addq $8 * SIZE, X + decq I + jg .L12 + ALIGN_3 + +.L15: + movq N, I + andq $3, I + jle .L18 + ALIGN_2 + +.L16: + movq %mm0, 0(X) + movq %mm0, 8(X) + movq %mm0, 16(X) + movq %mm0, 24(X) + + addq $2 * SIZE, X + decq I + jg .L16 + +.L18: + EMMS + + ret + ALIGN_2 + +.L20: + movq N, I + sarq $2, I + jle .L25 + ALIGN_3 + +.L22: + movq %mm0, 0(X) + movq %mm0, 8(X) + movq %mm0, 16(X) + movq %mm0, 24(X) + addq INCX, X + + movq %mm0, 0(X) + movq %mm0, 8(X) + movq %mm0, 16(X) + movq %mm0, 24(X) + addq INCX, X + + movq %mm0, 0(X) + movq %mm0, 8(X) + movq %mm0, 16(X) + movq %mm0, 24(X) + addq INCX, X + + movq %mm0, 0(X) + movq %mm0, 8(X) + movq %mm0, 16(X) + movq %mm0, 24(X) + addq INCX, X + + decq I + jg .L22 + ALIGN_3 + +.L25: + movq N, I + andq $3, I + jle .L28 + ALIGN_3 + +.L26: + movq %mm0, 0(X) + movq %mm0, 8(X) + movq %mm0, 16(X) + movq %mm0, 24(X) + addq INCX, X + + decq I + jg .L26 + +.L28: + EMMS + + ret + ALIGN_3 + +.L30: + movq N, I + ALIGN_2 + +.L32: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + FLD 0 * SIZE(X) + fmul %st(1),%st + FLD 1 * SIZE(X) + fmul %st(3),%st + faddp %st,%st(1) + + FLD 0 * SIZE(X) + fmul %st(3),%st + FLD 1 * SIZE(X) + fmul %st(3),%st + fsubrp %st,%st(1) + + FST 0 * SIZE(X) + FST 1 * SIZE(X) + addq INCX, X + decq I + jg .L32 + ALIGN_2 + +.L999: + ffreep %st + ffreep %st + + ret + + EPILOGUE diff --git a/kernel/x86_64/zscal_atom.S b/kernel/x86_64/zscal_atom.S new file mode 100644 index 0000000000..c01d5c14fb --- /dev/null +++ b/kernel/x86_64/zscal_atom.S @@ -0,0 +1,394 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
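
The zscal kernel above (the x87/MMX variant) first checks whether alpha is exactly zero by summing |alpha_r| + |alpha_i| on the x87 stack; if so it drops into a pure store loop that clears the vector with MMX movq writes, otherwise it rescales each complex element with an x87 multiply chain. The scalar C sketch below shows that dispatch and the arithmetic it implements, not the exact x87 scheduling; the names and the incx2 parameter (the element stride already expressed in doubles, mirroring the ZBASE_SHIFT pre-scaling of INCX) are assumptions made for the sketch.

    /* x[i] <- alpha * x[i] for complex double x and alpha */
    static void zscal_ref(long n, double ar, double ai, double *x, long incx2) {
        if (ar == 0.0 && ai == 0.0) {                 /* fast path: just clear   */
            for (long i = 0; i < n; i++, x += incx2)
                x[0] = x[1] = 0.0;
            return;
        }
        for (long i = 0; i < n; i++, x += incx2) {    /* general path            */
            double re = x[0], im = x[1];
            x[0] = ar * re - ai * im;                 /* real part of alpha*x    */
            x[1] = ar * im + ai * re;                 /* imaginary part          */
        }
    }
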
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef WINDOWS_ABI +#define M ARG1 +#define X ARG4 +#define INCX ARG5 +#else +#define M ARG1 +#define X ARG2 +#define INCX ARG3 +#endif + +#define XX %r10 +#define I %rax + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + movaps %xmm3, %xmm0 + movsd 40(%rsp), %xmm1 + movq 48(%rsp), X + movq 56(%rsp), INCX +#endif + + SAVEREGISTERS + + salq $ZBASE_SHIFT, INCX + + testq M, M + jle .L999 + + pxor %xmm15, %xmm15 + comisd %xmm0, %xmm15 + jne .L30 # Alpha_r != ZERO + + comisd %xmm1, %xmm15 + jne .L30 # Alpha_i != ZERO + + +/* Alpha == ZERO */ + cmpq $2 * SIZE, INCX + jne .L20 + + movq M, I + sarq $2, I + jle .L12 + ALIGN_4 + +.L11: + movsd %xmm1, 0 * SIZE(X) + movsd %xmm1, 1 * SIZE(X) + movsd %xmm1, 2 * SIZE(X) + movsd %xmm1, 3 * SIZE(X) + + movsd %xmm1, 4 * SIZE(X) + movsd %xmm1, 5 * SIZE(X) + movsd %xmm1, 6 * SIZE(X) + movsd %xmm1, 7 * SIZE(X) + + addq $8 * SIZE, X + decq I + jg .L11 + ALIGN_4 + +.L12: + testq $2, M + je .L14 + + movsd %xmm1, 0 * SIZE(X) + movsd %xmm1, 1 * SIZE(X) + movsd %xmm1, 2 * SIZE(X) + movsd %xmm1, 3 * SIZE(X) + + addq $4 * SIZE, X + ALIGN_3 + +.L14: + testq $1, M + je .L999 + + movsd %xmm1, 0 * SIZE(X) + movsd %xmm1, 1 * SIZE(X) + addq $2 * SIZE, X + jmp .L999 + ALIGN_4 + +.L20: + movq M, I # rcx = n + sarq $2, I + jle .L22 + ALIGN_4 + +.L21: + movsd %xmm1, 0 * SIZE(X) + movsd %xmm1, 1 * SIZE(X) + addq INCX, X + movsd %xmm1, 0 * SIZE(X) + movsd %xmm1, 1 * SIZE(X) + addq INCX, X + movsd %xmm1, 0 * SIZE(X) + movsd %xmm1, 1 * SIZE(X) + addq INCX, X + movsd %xmm1, 0 * SIZE(X) + movsd %xmm1, 1 * SIZE(X) + addq INCX, X + decq I + jg .L21 + ALIGN_4 + +.L22: + testq $2, M + je .L23 + + movsd %xmm1, 0 * SIZE(X) + movsd %xmm1, 1 * SIZE(X) + addq INCX, X + movsd %xmm1, 0 * SIZE(X) + movsd %xmm1, 1 * SIZE(X) + addq INCX, X + ALIGN_3 + +.L23: + testq $1, M + je .L999 + + movsd %xmm1, 0 * SIZE(X) + movsd %xmm1, 1 * SIZE(X) + jmp .L999 + ALIGN_4 + +/* Alpha != ZERO */ +.L30: + movq X, XX + + movq M, I + sarq $2, I + jle .L35 + + movsd 0 * SIZE(X), %xmm2 + movsd 1 * SIZE(X), %xmm3 + addq INCX, X + movsd 0 * SIZE(X), %xmm6 + movsd 1 * SIZE(X), %xmm7 + addq INCX, X + + movaps %xmm2, %xmm4 + movsd 0 * SIZE(X), %xmm8 + mulsd %xmm0, %xmm2 + movaps %xmm3, %xmm5 + movsd 1 * SIZE(X), %xmm9 + mulsd %xmm1, %xmm5 + addq INCX, X + mulsd %xmm0, %xmm3 + mulsd %xmm1, %xmm4 + + subsd %xmm5, %xmm2 + movsd 0 * SIZE(X), %xmm10 + addsd %xmm4, %xmm3 + movsd 1 * SIZE(X), %xmm11 + + movaps %xmm6, %xmm4 + mulsd %xmm0, %xmm6 + addq INCX, X + movaps %xmm7, %xmm5 + mulsd %xmm1, %xmm5 + mulsd %xmm0, %xmm7 + mulsd %xmm1, %xmm4 + + decq 
I + jle .L32 + ALIGN_4 + +.L31: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + subsd %xmm5, %xmm6 + movsd %xmm2, 0 * SIZE(XX) + addsd %xmm4, %xmm7 + movsd %xmm3, 1 * SIZE(XX) + + movaps %xmm8, %xmm4 + movsd 0 * SIZE(X), %xmm2 + mulsd %xmm0, %xmm8 + addq INCX, XX + movaps %xmm9, %xmm5 + movsd 1 * SIZE(X), %xmm3 + mulsd %xmm1, %xmm5 + addq INCX, X + mulsd %xmm0, %xmm9 + mulsd %xmm1, %xmm4 + + subsd %xmm5, %xmm8 + movsd %xmm6, 0 * SIZE(XX) + addsd %xmm4, %xmm9 + movsd %xmm7, 1 * SIZE(XX) + + movaps %xmm10, %xmm4 + movsd 0 * SIZE(X), %xmm6 + mulsd %xmm0, %xmm10 + addq INCX, XX + movaps %xmm11, %xmm5 + movsd 1 * SIZE(X), %xmm7 + mulsd %xmm1, %xmm5 + addq INCX, X + mulsd %xmm0, %xmm11 + mulsd %xmm1, %xmm4 + + subsd %xmm5, %xmm10 + movsd %xmm8, 0 * SIZE(XX) + addsd %xmm4, %xmm11 + movsd %xmm9, 1 * SIZE(XX) + + movaps %xmm2, %xmm4 + movsd 0 * SIZE(X), %xmm8 + mulsd %xmm0, %xmm2 + addq INCX, XX + movaps %xmm3, %xmm5 + movsd 1 * SIZE(X), %xmm9 + mulsd %xmm1, %xmm5 + addq INCX, X + mulsd %xmm0, %xmm3 + mulsd %xmm1, %xmm4 + + subsd %xmm5, %xmm2 + movsd %xmm10, 0 * SIZE(XX) + addsd %xmm4, %xmm3 + movsd %xmm11, 1 * SIZE(XX) + + movaps %xmm6, %xmm4 + movsd 0 * SIZE(X), %xmm10 + mulsd %xmm0, %xmm6 + addq INCX, XX + movaps %xmm7, %xmm5 + movsd 1 * SIZE(X), %xmm11 + mulsd %xmm1, %xmm5 + addq INCX, X + mulsd %xmm0, %xmm7 + mulsd %xmm1, %xmm4 + + decq I + jg .L31 + ALIGN_4 + +.L32: + subsd %xmm5, %xmm6 + movsd %xmm2, 0 * SIZE(XX) + addsd %xmm4, %xmm7 + movsd %xmm3, 1 * SIZE(XX) + + movaps %xmm8, %xmm4 + mulsd %xmm0, %xmm8 + addq INCX, XX + movaps %xmm9, %xmm5 + mulsd %xmm1, %xmm5 + mulsd %xmm0, %xmm9 + mulsd %xmm1, %xmm4 + + subsd %xmm5, %xmm8 + movsd %xmm6, 0 * SIZE(XX) + addsd %xmm4, %xmm9 + movsd %xmm7, 1 * SIZE(XX) + + movaps %xmm10, %xmm4 + mulsd %xmm0, %xmm10 + addq INCX, XX + movaps %xmm11, %xmm5 + mulsd %xmm1, %xmm5 + mulsd %xmm0, %xmm11 + mulsd %xmm1, %xmm4 + + subsd %xmm5, %xmm10 + movsd %xmm8, 0 * SIZE(XX) + addsd %xmm4, %xmm11 + movsd %xmm9, 1 * SIZE(XX) + addq INCX, XX + + movsd %xmm10, 0 * SIZE(XX) + movsd %xmm11, 1 * SIZE(XX) + addq INCX, XX + ALIGN_3 + +.L35: + testq $2, M + je .L37 + + movsd 0 * SIZE(X), %xmm2 + movsd 1 * SIZE(X), %xmm3 + addq INCX, X + + movaps %xmm2, %xmm4 + movsd 0 * SIZE(X), %xmm6 + mulsd %xmm0, %xmm2 + movaps %xmm3, %xmm5 + movsd 1 * SIZE(X), %xmm7 + mulsd %xmm1, %xmm5 + addq INCX, X + mulsd %xmm0, %xmm3 + mulsd %xmm1, %xmm4 + + subsd %xmm5, %xmm2 + addsd %xmm4, %xmm3 + + movaps %xmm6, %xmm4 + mulsd %xmm0, %xmm6 + movaps %xmm7, %xmm5 + mulsd %xmm1, %xmm5 + mulsd %xmm0, %xmm7 + mulsd %xmm1, %xmm4 + + subsd %xmm5, %xmm6 + movsd %xmm2, 0 * SIZE(XX) + addsd %xmm4, %xmm7 + movsd %xmm3, 1 * SIZE(XX) + addq INCX, XX + + movsd %xmm6, 0 * SIZE(XX) + movsd %xmm7, 1 * SIZE(XX) + addq INCX, XX + ALIGN_3 + +.L37: + testq $1, M + je .L999 + + movsd 0 * SIZE(X), %xmm2 + movsd 1 * SIZE(X), %xmm3 + + movaps %xmm2, %xmm4 + mulsd %xmm0, %xmm2 + movaps %xmm3, %xmm5 + mulsd %xmm1, %xmm5 + mulsd %xmm0, %xmm3 + mulsd %xmm1, %xmm4 + + subsd %xmm5, %xmm2 + addsd %xmm4, %xmm3 + + movsd %xmm2, 0 * SIZE(XX) + movsd %xmm3, 1 * SIZE(XX) + ALIGN_3 + +.L999: + xorq %rax, %rax + + RESTOREREGISTERS + + ret + + EPILOGUE + diff --git a/kernel/x86_64/zscal_sse.S b/kernel/x86_64/zscal_sse.S new file mode 100644 index 0000000000..eb2092dc7c --- /dev/null +++ b/kernel/x86_64/zscal_sse.S @@ -0,0 +1,1359 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
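
The SSE kernel that begins here operates on single-precision complex data (packed ps instructions) and vectorizes the complex multiply by keeping two constant registers: one with alpha_r broadcast to all four lanes and one holding (-alpha_i, alpha_i, -alpha_i, alpha_i), built with subps and unpcklps. Each loaded pair is also shuffled with pshufd $0xb1 to swap real and imaginary lanes, so a mulps/mulps/addps sequence produces both components at once. The C sketch below mimics one 128-bit lane under those assumptions (lane layout [re0, im0, re1, im1]); the names are illustrative.

    /* one 128-bit lane worth of complex scaling: two complex floats at a time */
    static void cscal_lane(float ar, float ai, float x[4]) {
        const float c14[4] = { ar, ar, ar, ar };      /* pshufd $0, alpha_r     */
        const float c15[4] = { -ai, ai, -ai, ai };    /* subps + unpcklps       */
        float sw[4] = { x[1], x[0], x[3], x[2] };     /* pshufd $0xb1 swap      */
        for (int k = 0; k < 4; k++)
            x[k] = c14[k] * x[k] + c15[k] * sw[k];    /* mulps, mulps, addps    */
    }

For lane 0 this yields ar*re - ai*im and for lane 1 ar*im + ai*re, i.e. the ordinary complex product.
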
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef WINDOWS_ABI +#define M ARG1 +#define X ARG4 +#define INCX ARG5 +#else +#define M ARG1 +#define X ARG2 +#define INCX ARG3 +#endif + +#define XX %r10 +#define FLAG %r11 +#define I %rax + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + movaps %xmm3, %xmm0 + movsd 40(%rsp), %xmm1 + movq 48(%rsp), X + movq 56(%rsp), INCX +#endif + + SAVEREGISTERS + + salq $ZBASE_SHIFT, INCX + xor FLAG, FLAG + + testq M, M + jle .L999 + + pxor %xmm15, %xmm15 + comiss %xmm0, %xmm15 + jne .L100 # Alpha_r != ZERO + + comiss %xmm1, %xmm15 + jne .L100 # Alpha_i != ZERO + +/* Alpha == ZERO */ + cmpq $2 * SIZE, INCX + jne .L50 + +/* INCX == 1 */ + cmpq $3, M + jle .L13 + + testq $4, X + je .L05 + movss %xmm15, 0 * SIZE(X) + addq $SIZE, X + movq $1, FLAG + decq M + ALIGN_3 + +.L05: + testq $8, X + je .L06 + + movlps %xmm15, 0 * SIZE(X) + addq $2 * SIZE, X + subq $1, M + ALIGN_3 +.L06: + + movq M, I # rcx = n + sarq $3, I + jle .L12 + ALIGN_4 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm15, 0 * SIZE(X) + movaps %xmm15, 4 * SIZE(X) + movaps %xmm15, 8 * SIZE(X) + movaps %xmm15, 12 * SIZE(X) + addq $16 * SIZE, X + decq I + jg .L11 + ALIGN_4 + +.L12: + testq $7, M + je .L19 + testq $4, M + je .L13 + + movaps %xmm15, 0 * SIZE(X) + movaps %xmm15, 4 * SIZE(X) + addq $8 * SIZE, X + ALIGN_3 + +.L13: + testq $2, M + je .L14 + + movlps %xmm15, 0 * SIZE(X) + movhps %xmm15, 2 * SIZE(X) + addq $4 * SIZE, X + ALIGN_3 + +.L14: + testq $1, M + je .L19 + + movlps %xmm15, 0 * SIZE(X) + addq $2 * SIZE, X + ALIGN_3 + +.L19: + testq $1, FLAG + je .L999 + + movss %xmm15, 0 * SIZE(X) + jmp .L999 + ALIGN_4 + +/* incx != 1 */ +.L50: + movq M, I # rcx = n + sarq $2, I + jle .L52 + 
ALIGN_4 + +.L51: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movsd %xmm15, 0 * SIZE(X) + addq INCX, X + movsd %xmm15, 0 * SIZE(X) + addq INCX, X + movsd %xmm15, 0 * SIZE(X) + addq INCX, X + movsd %xmm15, 0 * SIZE(X) + addq INCX, X + decq I + jg .L51 + ALIGN_4 + +.L52: + testq $2, M + je .L53 + + movsd %xmm15, 0 * SIZE(X) + addq INCX, X + movsd %xmm15, 0 * SIZE(X) + addq INCX, X + ALIGN_3 + +.L53: + testq $1, M + je .L999 + + movsd %xmm15, 0 * SIZE(X) + jmp .L999 + ALIGN_4 + +/* Alpha != ZERO */ + +.L100: + testq $SIZE, X + jne .L130 + + cmpq $2 * SIZE, INCX + jne .L120 + + pshufd $0, %xmm0, %xmm14 + pshufd $0, %xmm1, %xmm1 + subps %xmm1, %xmm15 + unpcklps %xmm1, %xmm15 + + subq $-32 * SIZE, X + + testq $2 * SIZE, X + je .L105 + + movsd -32 * SIZE(X), %xmm0 + + pshufd $0xb1, %xmm0, %xmm8 + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm0 + + movlps %xmm0, -32 * SIZE(X) + addq $2 * SIZE, X + decq M + jle .L999 + ALIGN_3 + +.L105: + movq M, I + sarq $4, I + jle .L115 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + movaps -20 * SIZE(X), %xmm3 + movaps -16 * SIZE(X), %xmm4 + movaps -12 * SIZE(X), %xmm5 + movaps -8 * SIZE(X), %xmm6 + movaps -4 * SIZE(X), %xmm7 + + decq I + jle .L112 + ALIGN_4 + +.L111: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + pshufd $0xb1, %xmm0, %xmm8 + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm0 + movaps %xmm0, -32 * SIZE(X) + movaps 0 * SIZE(X), %xmm0 + + pshufd $0xb1, %xmm1, %xmm8 + mulps %xmm14, %xmm1 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm1 + movaps %xmm1, -28 * SIZE(X) + movaps 4 * SIZE(X), %xmm1 + + pshufd $0xb1, %xmm2, %xmm8 + mulps %xmm14, %xmm2 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm2 + movaps %xmm2, -24 * SIZE(X) + movaps 8 * SIZE(X), %xmm2 + + pshufd $0xb1, %xmm3, %xmm8 + mulps %xmm14, %xmm3 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm3 + movaps %xmm3, -20 * SIZE(X) + movaps 12 * SIZE(X), %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + pshufd $0xb1, %xmm4, %xmm8 + mulps %xmm14, %xmm4 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm4 + movaps %xmm4, -16 * SIZE(X) + movaps 16 * SIZE(X), %xmm4 + + pshufd $0xb1, %xmm5, %xmm8 + mulps %xmm14, %xmm5 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm5 + movaps %xmm5, -12 * SIZE(X) + movaps 20 * SIZE(X), %xmm5 + + pshufd $0xb1, %xmm6, %xmm8 + mulps %xmm14, %xmm6 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm6 + movaps %xmm6, -8 * SIZE(X) + movaps 24 * SIZE(X), %xmm6 + + pshufd $0xb1, %xmm7, %xmm8 + mulps %xmm14, %xmm7 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm7 + movaps %xmm7, -4 * SIZE(X) + movaps 28 * SIZE(X), %xmm7 + + subq $-32 * SIZE, X + decq I + jg .L111 + ALIGN_4 + +.L112: + pshufd $0xb1, %xmm0, %xmm8 + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm0 + movaps %xmm0, -32 * SIZE(X) + + pshufd $0xb1, %xmm1, %xmm8 + mulps %xmm14, %xmm1 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm1 + movaps %xmm1, -28 * SIZE(X) + + pshufd $0xb1, %xmm2, %xmm8 + mulps %xmm14, %xmm2 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm2 + movaps %xmm2, -24 * SIZE(X) + + pshufd $0xb1, %xmm3, %xmm8 + mulps %xmm14, %xmm3 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm3 + movaps %xmm3, -20 * SIZE(X) + + pshufd $0xb1, %xmm4, %xmm8 + mulps %xmm14, %xmm4 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm4 + movaps %xmm4, -16 * SIZE(X) + + pshufd $0xb1, %xmm5, %xmm8 + mulps %xmm14, %xmm5 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm5 + movaps %xmm5, -12 * SIZE(X) + + pshufd $0xb1, %xmm6, %xmm8 + mulps %xmm14, 
%xmm6 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm6 + movaps %xmm6, -8 * SIZE(X) + + pshufd $0xb1, %xmm7, %xmm8 + mulps %xmm14, %xmm7 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm7 + movaps %xmm7, -4 * SIZE(X) + + subq $-32 * SIZE, X + ALIGN_4 + +.L115: + testq $8, M + je .L116 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + + pshufd $0xb1, %xmm0, %xmm8 + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm0 + movaps %xmm0, -32 * SIZE(X) + + pshufd $0xb1, %xmm1, %xmm8 + mulps %xmm14, %xmm1 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm1 + movaps %xmm1, -28 * SIZE(X) + + movaps -24 * SIZE(X), %xmm2 + movaps -20 * SIZE(X), %xmm3 + + pshufd $0xb1, %xmm2, %xmm8 + mulps %xmm14, %xmm2 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm2 + movaps %xmm2, -24 * SIZE(X) + + pshufd $0xb1, %xmm3, %xmm8 + mulps %xmm14, %xmm3 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm3 + movaps %xmm3, -20 * SIZE(X) + + addq $16 * SIZE, X + ALIGN_3 + +.L116: + testq $4, M + je .L117 + + movaps -32 * SIZE(X), %xmm0 + movaps -28 * SIZE(X), %xmm1 + + pshufd $0xb1, %xmm0, %xmm8 + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm0 + movaps %xmm0, -32 * SIZE(X) + + pshufd $0xb1, %xmm1, %xmm8 + mulps %xmm14, %xmm1 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm1 + movaps %xmm1, -28 * SIZE(X) + + addq $8 * SIZE, X + ALIGN_3 + +.L117: + testq $2, M + je .L118 + + movaps -32 * SIZE(X), %xmm0 + + pshufd $0xb1, %xmm0, %xmm8 + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm0 + movaps %xmm0, -32 * SIZE(X) + + addq $4 * SIZE, X + ALIGN_3 + +.L118: + testq $1, M + je .L999 + + movsd -32 * SIZE(X), %xmm0 + + pshufd $0xb1, %xmm0, %xmm8 + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm0 + + movlps %xmm0, -32 * SIZE(X) + jmp .L999 + ALIGN_3 + +.L120: + pshufd $0, %xmm0, %xmm14 + pshufd $0, %xmm1, %xmm1 + subps %xmm1, %xmm15 + unpcklps %xmm1, %xmm15 + + movq X, XX + + movq M, I + sarq $3, I + jle .L125 + + movsd (X), %xmm0 + addq INCX, X + movhps (X), %xmm0 + addq INCX, X + + movsd (X), %xmm1 + addq INCX, X + movhps (X), %xmm1 + addq INCX, X + + movsd (X), %xmm2 + addq INCX, X + movhps (X), %xmm2 + addq INCX, X + + movsd (X), %xmm3 + addq INCX, X + movhps (X), %xmm3 + addq INCX, X + + decq I + jle .L122 + ALIGN_4 + +.L121: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + pshufd $0xb1, %xmm0, %xmm8 + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm0 + + movlps %xmm0, (XX) + addq INCX, XX + movhps %xmm0, (XX) + addq INCX, XX + + movsd (X), %xmm0 + addq INCX, X + movhps (X), %xmm0 + addq INCX, X + + pshufd $0xb1, %xmm1, %xmm8 + mulps %xmm14, %xmm1 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm1 + + movlps %xmm1, (XX) + addq INCX, XX + movhps %xmm1, (XX) + addq INCX, XX + + movsd (X), %xmm1 + addq INCX, X + movhps (X), %xmm1 + addq INCX, X + + pshufd $0xb1, %xmm2, %xmm8 + mulps %xmm14, %xmm2 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm2 + + movlps %xmm2, (XX) + addq INCX, XX + movhps %xmm2, (XX) + addq INCX, XX + + movsd (X), %xmm2 + addq INCX, X + movhps (X), %xmm2 + addq INCX, X + + pshufd $0xb1, %xmm3, %xmm8 + mulps %xmm14, %xmm3 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm3 + + movlps %xmm3, (XX) + addq INCX, XX + movhps %xmm3, (XX) + addq INCX, XX + + movsd (X), %xmm3 + addq INCX, X + movhps (X), %xmm3 + addq INCX, X + + decq I + jg .L121 + ALIGN_4 + +.L122: + pshufd $0xb1, %xmm0, %xmm8 + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm0 + + movlps %xmm0, (XX) + addq INCX, XX + movhps %xmm0, (XX) + addq INCX, XX + + pshufd $0xb1, %xmm1, %xmm8 + mulps %xmm14, %xmm1 
+ mulps %xmm15, %xmm8 + addps %xmm8, %xmm1 + + movlps %xmm1, (XX) + addq INCX, XX + movhps %xmm1, (XX) + addq INCX, XX + + pshufd $0xb1, %xmm2, %xmm8 + mulps %xmm14, %xmm2 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm2 + + movlps %xmm2, (XX) + addq INCX, XX + movhps %xmm2, (XX) + addq INCX, XX + + pshufd $0xb1, %xmm3, %xmm8 + mulps %xmm14, %xmm3 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm3 + + movlps %xmm3, (XX) + addq INCX, XX + movhps %xmm3, (XX) + addq INCX, XX + ALIGN_4 + +.L125: + testq $4, M + je .L127 + + movsd (X), %xmm0 + addq INCX, X + movhps (X), %xmm0 + addq INCX, X + + pshufd $0xb1, %xmm0, %xmm8 + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm0 + + movlps %xmm0, (XX) + addq INCX, XX + movhps %xmm0, (XX) + addq INCX, XX + + movsd (X), %xmm1 + addq INCX, X + movhps (X), %xmm1 + addq INCX, X + + pshufd $0xb1, %xmm1, %xmm8 + mulps %xmm14, %xmm1 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm1 + + movlps %xmm1, (XX) + addq INCX, XX + movhps %xmm1, (XX) + addq INCX, XX + ALIGN_3 + +.L127: + testq $2, M + je .L128 + + movsd (X), %xmm0 + addq INCX, X + movhps (X), %xmm0 + addq INCX, X + + pshufd $0xb1, %xmm0, %xmm8 + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm0 + + movlps %xmm0, (XX) + addq INCX, XX + movhps %xmm0, (XX) + addq INCX, XX + ALIGN_3 + +.L128: + testq $1, M + je .L999 + + movsd (X), %xmm0 + + pshufd $0xb1, %xmm0, %xmm8 + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm0 + + movlps %xmm0, (XX) + jmp .L999 + ALIGN_3 + +.L130: + cmpq $2 * SIZE, INCX + jne .L120 + +#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) + + pshufd $0, %xmm0, %xmm14 + pshufd $0, %xmm1, %xmm1 + subps %xmm1, %xmm15 + unpcklps %xmm1, %xmm15 + + subq $-31 * SIZE, X + + testq $2 * SIZE, X + je .L130x + + movsd -31 * SIZE(X), %xmm0 + + pshufd $0xb1, %xmm0, %xmm8 + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm0 + + movlps %xmm0, -31 * SIZE(X) + addq $2 * SIZE, X + decq M + jle .L999 + ALIGN_3 + +.L130x: + shufps $0xb1, %xmm15, %xmm15 + + movaps -32 * SIZE(X), %xmm0 + movaps %xmm0, %xmm9 + + movq M, I + sarq $4, I + jle .L135 + + movaps -28 * SIZE(X), %xmm1 + movaps -24 * SIZE(X), %xmm2 + movaps -20 * SIZE(X), %xmm3 + movaps -16 * SIZE(X), %xmm4 + movaps -12 * SIZE(X), %xmm5 + movaps -8 * SIZE(X), %xmm6 + movaps -4 * SIZE(X), %xmm7 + + decq I + jle .L132 + ALIGN_4 + +.L131: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movss %xmm1, %xmm0 + pshufd $0x1b, %xmm0, %xmm8 + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm0 + movaps %xmm0, %xmm10 + movss %xmm9, %xmm0 + movaps %xmm0, -32 * SIZE(X) + + movaps 0 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + pshufd $0x1b, %xmm1, %xmm8 + mulps %xmm14, %xmm1 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm1 + movaps %xmm1, %xmm9 + movss %xmm10, %xmm1 + movaps %xmm1, -28 * SIZE(X) + + movaps 4 * SIZE(X), %xmm1 + + movss %xmm3, %xmm2 + pshufd $0x1b, %xmm2, %xmm8 + mulps %xmm14, %xmm2 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm2 + movaps %xmm2, %xmm10 + movss %xmm9, %xmm2 + movaps %xmm2, -24 * SIZE(X) + + movaps 8 * SIZE(X), %xmm2 + + movss %xmm4, %xmm3 + pshufd $0x1b, %xmm3, %xmm8 + mulps %xmm14, %xmm3 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm3 + movaps %xmm3, %xmm9 + movss %xmm10, %xmm3 + movaps %xmm3, -20 * SIZE(X) + + movaps 12 * SIZE(X), %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movss %xmm5, %xmm4 + pshufd $0x1b, %xmm4, %xmm8 + mulps %xmm14, %xmm4 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm4 + movaps %xmm4, %xmm10 + movss %xmm9, %xmm4 + movaps %xmm4, 
-16 * SIZE(X) + + movaps 16 * SIZE(X), %xmm4 + + movss %xmm6, %xmm5 + pshufd $0x1b, %xmm5, %xmm8 + mulps %xmm14, %xmm5 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm5 + movaps %xmm5, %xmm9 + movss %xmm10, %xmm5 + movaps %xmm5, -12 * SIZE(X) + + movaps 20 * SIZE(X), %xmm5 + + movss %xmm7, %xmm6 + pshufd $0x1b, %xmm6, %xmm8 + mulps %xmm14, %xmm6 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm6 + movaps %xmm6, %xmm10 + movss %xmm9, %xmm6 + movaps %xmm6, -8 * SIZE(X) + + movaps 24 * SIZE(X), %xmm6 + + movss %xmm0, %xmm7 + pshufd $0x1b, %xmm7, %xmm8 + mulps %xmm14, %xmm7 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm7 + movaps %xmm7, %xmm9 + movss %xmm10, %xmm7 + movaps %xmm7, -4 * SIZE(X) + + movaps 28 * SIZE(X), %xmm7 + + subq $-32 * SIZE, X + decq I + jg .L131 + ALIGN_4 + +.L132: + movss %xmm1, %xmm0 + pshufd $0x1b, %xmm0, %xmm8 + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm0 + movaps %xmm0, %xmm10 + movss %xmm9, %xmm0 + movaps %xmm0, -32 * SIZE(X) + + movaps 0 * SIZE(X), %xmm0 + + movss %xmm2, %xmm1 + pshufd $0x1b, %xmm1, %xmm8 + mulps %xmm14, %xmm1 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm1 + movaps %xmm1, %xmm9 + movss %xmm10, %xmm1 + movaps %xmm1, -28 * SIZE(X) + + movss %xmm3, %xmm2 + pshufd $0x1b, %xmm2, %xmm8 + mulps %xmm14, %xmm2 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm2 + movaps %xmm2, %xmm10 + movss %xmm9, %xmm2 + movaps %xmm2, -24 * SIZE(X) + + movss %xmm4, %xmm3 + pshufd $0x1b, %xmm3, %xmm8 + mulps %xmm14, %xmm3 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm3 + movaps %xmm3, %xmm9 + movss %xmm10, %xmm3 + movaps %xmm3, -20 * SIZE(X) + + movss %xmm5, %xmm4 + pshufd $0x1b, %xmm4, %xmm8 + mulps %xmm14, %xmm4 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm4 + movaps %xmm4, %xmm10 + movss %xmm9, %xmm4 + movaps %xmm4, -16 * SIZE(X) + + movss %xmm6, %xmm5 + pshufd $0x1b, %xmm5, %xmm8 + mulps %xmm14, %xmm5 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm5 + movaps %xmm5, %xmm9 + movss %xmm10, %xmm5 + movaps %xmm5, -12 * SIZE(X) + + movss %xmm7, %xmm6 + pshufd $0x1b, %xmm6, %xmm8 + mulps %xmm14, %xmm6 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm6 + movaps %xmm6, %xmm10 + movss %xmm9, %xmm6 + movaps %xmm6, -8 * SIZE(X) + + movss %xmm0, %xmm7 + pshufd $0x1b, %xmm7, %xmm8 + mulps %xmm14, %xmm7 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm7 + movaps %xmm7, %xmm9 + movss %xmm10, %xmm7 + movaps %xmm7, -4 * SIZE(X) + + subq $-32 * SIZE, X + ALIGN_4 + +.L135: + testq $8, M + je .L136 + + movaps -28 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + pshufd $0x1b, %xmm0, %xmm8 + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm0 + + movaps %xmm0, %xmm10 + movss %xmm9, %xmm0 + movaps %xmm0, -32 * SIZE(X) + + movaps -24 * SIZE(X), %xmm2 + + movss %xmm2, %xmm1 + pshufd $0x1b, %xmm1, %xmm8 + mulps %xmm14, %xmm1 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm1 + + movaps %xmm1, %xmm9 + movss %xmm10, %xmm1 + movaps %xmm1, -28 * SIZE(X) + + movaps -20 * SIZE(X), %xmm3 + + movss %xmm3, %xmm2 + pshufd $0x1b, %xmm2, %xmm8 + mulps %xmm14, %xmm2 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm2 + + movaps %xmm2, %xmm10 + movss %xmm9, %xmm2 + movaps %xmm2, -24 * SIZE(X) + + movaps -16 * SIZE(X), %xmm0 + + movss %xmm0, %xmm3 + pshufd $0x1b, %xmm3, %xmm8 + mulps %xmm14, %xmm3 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm3 + + movaps %xmm3, %xmm9 + movss %xmm10, %xmm3 + movaps %xmm3, -20 * SIZE(X) + + addq $16 * SIZE, X + ALIGN_3 + +.L136: + testq $4, M + je .L137 + + movaps -28 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + pshufd $0x1b, %xmm0, %xmm8 + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm0 + + movaps %xmm0, %xmm10 + movss %xmm9, 
%xmm0 + movaps %xmm0, -32 * SIZE(X) + + movaps -24 * SIZE(X), %xmm2 + + movss %xmm2, %xmm1 + pshufd $0x1b, %xmm1, %xmm8 + mulps %xmm14, %xmm1 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm1 + + movaps %xmm1, %xmm9 + movss %xmm10, %xmm1 + movaps %xmm1, -28 * SIZE(X) + + movaps %xmm2, %xmm0 + + addq $8 * SIZE, X + ALIGN_3 + +.L137: + testq $2, M + je .L138 + + movaps -28 * SIZE(X), %xmm1 + + movss %xmm1, %xmm0 + pshufd $0x1b, %xmm0, %xmm8 + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm0 + + movaps %xmm0, %xmm10 + movss %xmm9, %xmm0 + movaps %xmm0, -32 * SIZE(X) + movaps %xmm10, %xmm9 + movaps %xmm1, %xmm0 + + addq $4 * SIZE, X + ALIGN_3 + +.L138: + movss %xmm9, -32 * SIZE(X) + + testq $1, M + je .L999 + + pshufd $0x1b, %xmm0, %xmm8 + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm0 + + pshufd $0x39, %xmm0, %xmm0 + + movlps %xmm0, -31 * SIZE(X) + jmp .L999 + ALIGN_3 + + +#else + + pshufd $0, %xmm0, %xmm14 + pshufd $0, %xmm1, %xmm1 + subps %xmm1, %xmm15 + unpcklps %xmm1, %xmm15 + + subq $-32 * SIZE, X + + testq $2 * SIZE, X + je .L130x + + movsd -32 * SIZE(X), %xmm0 + + pshufd $0xb1, %xmm0, %xmm8 + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm0 + + movlps %xmm0, -32 * SIZE(X) + addq $2 * SIZE, X + decq M + jle .L999 + ALIGN_3 + +.L130x: + movq M, I + sarq $4, I + jle .L135 + + movsd -32 * SIZE(X), %xmm0 + movhps -30 * SIZE(X), %xmm0 + movsd -28 * SIZE(X), %xmm1 + movhps -26 * SIZE(X), %xmm1 + movsd -24 * SIZE(X), %xmm2 + movhps -22 * SIZE(X), %xmm2 + movsd -20 * SIZE(X), %xmm3 + movhps -18 * SIZE(X), %xmm3 + movsd -16 * SIZE(X), %xmm4 + movhps -14 * SIZE(X), %xmm4 + movsd -12 * SIZE(X), %xmm5 + movhps -10 * SIZE(X), %xmm5 + movsd -8 * SIZE(X), %xmm6 + movhps -6 * SIZE(X), %xmm6 + movsd -4 * SIZE(X), %xmm7 + movhps -2 * SIZE(X), %xmm7 + + decq I + jle .L132 + ALIGN_4 + +.L131: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + pshufd $0xb1, %xmm0, %xmm8 + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm0 + movlps %xmm0, -32 * SIZE(X) + movhps %xmm0, -30 * SIZE(X) + + movsd 0 * SIZE(X), %xmm0 + movhps 2 * SIZE(X), %xmm0 + + pshufd $0xb1, %xmm1, %xmm8 + mulps %xmm14, %xmm1 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm1 + movlps %xmm1, -28 * SIZE(X) + movhps %xmm1, -26 * SIZE(X) + + movsd 4 * SIZE(X), %xmm1 + movhps 6 * SIZE(X), %xmm1 + + pshufd $0xb1, %xmm2, %xmm8 + mulps %xmm14, %xmm2 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm2 + movlps %xmm2, -24 * SIZE(X) + movhps %xmm2, -22 * SIZE(X) + + movsd 8 * SIZE(X), %xmm2 + movhps 10 * SIZE(X), %xmm2 + + pshufd $0xb1, %xmm3, %xmm8 + mulps %xmm14, %xmm3 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm3 + movlps %xmm3, -20 * SIZE(X) + movhps %xmm3, -18 * SIZE(X) + + movsd 12 * SIZE(X), %xmm3 + movhps 14 * SIZE(X), %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + pshufd $0xb1, %xmm4, %xmm8 + mulps %xmm14, %xmm4 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm4 + movlps %xmm4, -16 * SIZE(X) + movhps %xmm4, -14 * SIZE(X) + + movsd 16 * SIZE(X), %xmm4 + movhps 18 * SIZE(X), %xmm4 + + pshufd $0xb1, %xmm5, %xmm8 + mulps %xmm14, %xmm5 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm5 + movlps %xmm5, -12 * SIZE(X) + movhps %xmm5, -10 * SIZE(X) + + movsd 20 * SIZE(X), %xmm5 + movhps 22 * SIZE(X), %xmm5 + + pshufd $0xb1, %xmm6, %xmm8 + mulps %xmm14, %xmm6 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm6 + movlps %xmm6, -8 * SIZE(X) + movhps %xmm6, -6 * SIZE(X) + + movsd 24 * SIZE(X), %xmm6 + movhps 26 * SIZE(X), %xmm6 + + pshufd $0xb1, %xmm7, %xmm8 + mulps %xmm14, %xmm7 + mulps 
%xmm15, %xmm8 + addps %xmm8, %xmm7 + movlps %xmm7, -4 * SIZE(X) + movhps %xmm7, -2 * SIZE(X) + + movsd 28 * SIZE(X), %xmm7 + movhps 30 * SIZE(X), %xmm7 + + subq $-32 * SIZE, X + decq I + jg .L131 + ALIGN_4 + +.L132: + pshufd $0xb1, %xmm0, %xmm8 + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm0 + movlps %xmm0, -32 * SIZE(X) + movhps %xmm0, -30 * SIZE(X) + + pshufd $0xb1, %xmm1, %xmm8 + mulps %xmm14, %xmm1 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm1 + movlps %xmm1, -28 * SIZE(X) + movhps %xmm1, -26 * SIZE(X) + + pshufd $0xb1, %xmm2, %xmm8 + mulps %xmm14, %xmm2 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm2 + movlps %xmm2, -24 * SIZE(X) + movhps %xmm2, -22 * SIZE(X) + + pshufd $0xb1, %xmm3, %xmm8 + mulps %xmm14, %xmm3 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm3 + movlps %xmm3, -20 * SIZE(X) + movhps %xmm3, -18 * SIZE(X) + + pshufd $0xb1, %xmm4, %xmm8 + mulps %xmm14, %xmm4 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm4 + movlps %xmm4, -16 * SIZE(X) + movhps %xmm4, -14 * SIZE(X) + + pshufd $0xb1, %xmm5, %xmm8 + mulps %xmm14, %xmm5 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm5 + movlps %xmm5, -12 * SIZE(X) + movhps %xmm5, -10 * SIZE(X) + + pshufd $0xb1, %xmm6, %xmm8 + mulps %xmm14, %xmm6 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm6 + movlps %xmm6, -8 * SIZE(X) + movhps %xmm6, -6 * SIZE(X) + + pshufd $0xb1, %xmm7, %xmm8 + mulps %xmm14, %xmm7 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm7 + movlps %xmm7, -4 * SIZE(X) + movhps %xmm7, -2 * SIZE(X) + + subq $-32 * SIZE, X + ALIGN_4 + +.L135: + testq $8, M + je .L136 + + movsd -32 * SIZE(X), %xmm0 + movhps -30 * SIZE(X), %xmm0 + + pshufd $0xb1, %xmm0, %xmm8 + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm0 + movlps %xmm0, -32 * SIZE(X) + movhps %xmm0, -30 * SIZE(X) + + movsd -28 * SIZE(X), %xmm1 + movhps -26 * SIZE(X), %xmm1 + + pshufd $0xb1, %xmm1, %xmm8 + mulps %xmm14, %xmm1 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm1 + movlps %xmm1, -28 * SIZE(X) + movhps %xmm1, -26 * SIZE(X) + + movsd -24 * SIZE(X), %xmm2 + movhps -22 * SIZE(X), %xmm2 + + pshufd $0xb1, %xmm2, %xmm8 + mulps %xmm14, %xmm2 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm2 + movlps %xmm2, -24 * SIZE(X) + movhps %xmm2, -22 * SIZE(X) + + movsd -20 * SIZE(X), %xmm3 + movhps -18 * SIZE(X), %xmm3 + + pshufd $0xb1, %xmm3, %xmm8 + mulps %xmm14, %xmm3 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm3 + movlps %xmm3, -20 * SIZE(X) + movhps %xmm3, -18 * SIZE(X) + + addq $16 * SIZE, X + ALIGN_3 + +.L136: + testq $4, M + je .L137 + + movsd -32 * SIZE(X), %xmm0 + movhps -30 * SIZE(X), %xmm0 + movsd -28 * SIZE(X), %xmm1 + movhps -26 * SIZE(X), %xmm1 + + pshufd $0xb1, %xmm0, %xmm8 + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm0 + movlps %xmm0, -32 * SIZE(X) + movhps %xmm0, -30 * SIZE(X) + + pshufd $0xb1, %xmm1, %xmm8 + mulps %xmm14, %xmm1 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm1 + movlps %xmm1, -28 * SIZE(X) + movhps %xmm1, -26 * SIZE(X) + + addq $8 * SIZE, X + ALIGN_3 + +.L137: + testq $2, M + je .L138 + + movsd -32 * SIZE(X), %xmm0 + movhps -30 * SIZE(X), %xmm0 + + pshufd $0xb1, %xmm0, %xmm8 + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm0 + movlps %xmm0, -32 * SIZE(X) + movhps %xmm0, -30 * SIZE(X) + + addq $4 * SIZE, X + ALIGN_3 + +.L138: + testq $1, M + je .L999 + + movsd -32 * SIZE(X), %xmm0 + + pshufd $0xb1, %xmm0, %xmm8 + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm8 + addps %xmm8, %xmm0 + + movlps %xmm0, -32 * SIZE(X) + ALIGN_3 +#endif + +.L999: + xorq %rax, %rax + + RESTOREREGISTERS + + ret + + EPILOGUE diff --git a/kernel/x86_64/zscal_sse2.S 
b/kernel/x86_64/zscal_sse2.S new file mode 100644 index 0000000000..23d2da73de --- /dev/null +++ b/kernel/x86_64/zscal_sse2.S @@ -0,0 +1,1724 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
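
The Atom and SSE kernels above software-pipeline their strided paths by letting the loads run ahead through X while the stores trail behind through the scratch pointer XX, so each iteration's multiplies overlap the next iteration's loads; the SSE2 kernel that follows defines the same XX pointer, presumably for the same purpose. The C sketch below shows only the pattern (assuming n >= 1, with the deeper unrolling and register rotation of the assembly omitted); the px/pxx names are illustrative, not from the sources.

    /* strided complex scaling, read-ahead via px, store-behind via pxx */
    static void zscal_strided(long n, double ar, double ai, double *x, long inc2) {
        double *px = x, *pxx = x;                       /* X and XX in the assembly */
        double re = px[0], im = px[1]; px += inc2;      /* prime the pipeline       */
        for (long i = 0; i < n - 1; i++) {
            double nre = px[0], nim = px[1]; px += inc2;  /* load next element      */
            pxx[0] = ar * re - ai * im;                   /* store current result   */
            pxx[1] = ar * im + ai * re; pxx += inc2;
            re = nre; im = nim;
        }
        pxx[0] = ar * re - ai * im;                     /* drain the last element   */
        pxx[1] = ar * im + ai * re;
    }
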
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef WINDOWS_ABI +#define M ARG1 +#define X ARG4 +#define INCX ARG5 +#else +#define M ARG1 +#define X ARG2 +#define INCX ARG3 +#endif + +#define XX %r10 +#define FLAG %r11 +#define I %rax + +#include "l1param.h" + +#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(BARCELONA) || defined(NANO) +#define USE_PSHUFD +#else +#define USE_PSHUFD_HALF +#endif + + PROLOGUE + PROFCODE + +#ifdef WINDOWS_ABI + movaps %xmm3, %xmm0 + movsd 40(%rsp), %xmm1 + movq 48(%rsp), X + movq 56(%rsp), INCX +#endif + + SAVEREGISTERS + + salq $ZBASE_SHIFT, INCX + xor FLAG, FLAG + + testq M, M + jle .L999 + + pxor %xmm15, %xmm15 + comisd %xmm0, %xmm15 + jne .L100 + + comisd %xmm1, %xmm15 + jne .L100 + +/* Alpha == ZERO */ + cmpq $2 * SIZE, INCX + jne .L20 + +/* INCX == 1 */ + testq $SIZE, X + je .L05 + + movsd %xmm15, 0 * SIZE(X) + addq $SIZE, X + movq $1, FLAG + decq M + jle .L19 + ALIGN_3 +.L05: + + movq M, I # rcx = n + sarq $3, I + jle .L12 + ALIGN_4 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm15, 0 * SIZE(X) + movaps %xmm15, 2 * SIZE(X) + movaps %xmm15, 4 * SIZE(X) + movaps %xmm15, 6 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps %xmm15, 8 * SIZE(X) + movaps %xmm15, 10 * SIZE(X) + movaps %xmm15, 12 * SIZE(X) + movaps %xmm15, 14 * SIZE(X) + + addq $16 * SIZE, X + decq I + jg .L11 + ALIGN_4 + +.L12: + testq $4, M + je .L13 + + movaps %xmm15, 0 * SIZE(X) + movaps %xmm15, 2 * SIZE(X) + movaps %xmm15, 4 * SIZE(X) + movaps %xmm15, 6 * SIZE(X) + addq $8 * SIZE, X + ALIGN_3 + +.L13: + testq $2, M + je .L14 + + movaps %xmm15, 0 * SIZE(X) + movaps %xmm15, 2 * SIZE(X) + addq $4 * SIZE, X + ALIGN_3 + +.L14: + testq $1, M + je .L19 + movaps %xmm15, 0 * SIZE(X) + addq $2 * SIZE, X + ALIGN_3 + +.L19: + testq $1, FLAG + je .L999 + + movsd %xmm15, 0 * SIZE(X) + jmp .L999 + ALIGN_4 + +/* incx != 1 */ +.L20: + testq $SIZE, X + jne .L30 + +/* Aligned Mode */ + movq M, I # rcx = n + sarq $2, I + jle .L22 + ALIGN_4 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm15, (X) + addq INCX, X + movaps %xmm15, (X) + addq INCX, X + movaps %xmm15, (X) + addq INCX, X + movaps %xmm15, (X) + addq INCX, X + decq I + jg .L21 + ALIGN_4 + +.L22: + testq $3, M + je .L999 + + testq $2, M + je .L23 + + movaps %xmm15, (X) + addq INCX, X + movaps %xmm15, (X) + addq INCX, X + ALIGN_3 + +.L23: + testq $1, M + je .L999 + + movaps %xmm15, (X) + jmp .L999 + ALIGN_4 + + +/* Unaligned Mode */ +.L30: + movq M, I # rcx = n + sarq $2, I + jle .L32 + ALIGN_4 + +.L31: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movlps %xmm15, 0 * SIZE(X) + movlps %xmm15, 1 * SIZE(X) + addq INCX, X + movlps %xmm15, 0 * SIZE(X) + movlps %xmm15, 1 * SIZE(X) + addq INCX, X + movlps %xmm15, 0 * SIZE(X) + movlps %xmm15, 1 * SIZE(X) + addq INCX, X + movlps %xmm15, 0 * SIZE(X) + movlps %xmm15, 1 * SIZE(X) + addq INCX, X + decq I + jg .L31 + ALIGN_4 + +.L32: + testq $3, M + je .L999 + + testq $2, M + je .L33 + + movlps %xmm15, 0 * SIZE(X) + movlps %xmm15, 1 * SIZE(X) + addq INCX, X + movlps %xmm15, 0 * SIZE(X) + movlps %xmm15, 1 * SIZE(X) + addq INCX, X + ALIGN_3 + +.L33: + testq $1, M + je .L999 + + movlps %xmm15, 0 * SIZE(X) + movlps %xmm15, 1 * SIZE(X) + jmp .L999 + ALIGN_4 + +/* Alpha != ZERO */ +.L100: + testq $SIZE, X + jne .L200 + +#ifdef HAVE_SSE3 + movddup %xmm0, 
%xmm14 +#else + pshufd $0x44, %xmm0, %xmm14 +#endif + pxor %xmm15, %xmm15 + subsd %xmm1, %xmm15 + movlhps %xmm1, %xmm15 + + cmpq $2 * SIZE, INCX + jne .L120 + + subq $-16 * SIZE, X + + movq M, I + sarq $3, I + jle .L115 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + movaps -12 * SIZE(X), %xmm2 + movaps -10 * SIZE(X), %xmm3 + movaps -8 * SIZE(X), %xmm4 + movaps -6 * SIZE(X), %xmm5 + movaps -4 * SIZE(X), %xmm6 + movaps -2 * SIZE(X), %xmm7 + + decq I + jle .L112 + ALIGN_4 + +.L111: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + +#if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) + pshufd $0x4e, %xmm0, %xmm8 +#else + movsd -15 * SIZE(X), %xmm8 + movhps -16 * SIZE(X), %xmm8 +#endif + mulpd %xmm14, %xmm0 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm0 + movaps %xmm0, -16 * SIZE(X) + movaps 0 * SIZE(X), %xmm0 + +#ifdef USE_PSHUFD + pshufd $0x4e, %xmm1, %xmm8 +#else + movsd -13 * SIZE(X), %xmm8 + movhps -14 * SIZE(X), %xmm8 +#endif + mulpd %xmm14, %xmm1 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm1 + movaps %xmm1, -14 * SIZE(X) + movaps 2 * SIZE(X), %xmm1 + +#if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) + pshufd $0x4e, %xmm2, %xmm8 +#else + movsd -11 * SIZE(X), %xmm8 + movhps -12 * SIZE(X), %xmm8 +#endif + mulpd %xmm14, %xmm2 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm2 + movaps %xmm2, -12 * SIZE(X) + movaps 4 * SIZE(X), %xmm2 + +#ifdef USE_PSHUFD + pshufd $0x4e, %xmm3, %xmm8 +#else + movsd -9 * SIZE(X), %xmm8 + movhps -10 * SIZE(X), %xmm8 +#endif + mulpd %xmm14, %xmm3 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm3 + movaps %xmm3, -10 * SIZE(X) + movaps 6 * SIZE(X), %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + +#if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) + pshufd $0x4e, %xmm4, %xmm8 +#else + movsd -7 * SIZE(X), %xmm8 + movhps -8 * SIZE(X), %xmm8 +#endif + mulpd %xmm14, %xmm4 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm4 + movaps %xmm4, -8 * SIZE(X) + movaps 8 * SIZE(X), %xmm4 + +#ifdef USE_PSHUFD + pshufd $0x4e, %xmm5, %xmm8 +#else + movsd -5 * SIZE(X), %xmm8 + movhps -6 * SIZE(X), %xmm8 +#endif + mulpd %xmm14, %xmm5 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm5 + movaps %xmm5, -6 * SIZE(X) + movaps 10 * SIZE(X), %xmm5 + +#if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) + pshufd $0x4e, %xmm6, %xmm8 +#else + movsd -3 * SIZE(X), %xmm8 + movhps -4 * SIZE(X), %xmm8 +#endif + mulpd %xmm14, %xmm6 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm6 + movaps %xmm6, -4 * SIZE(X) + movaps 12 * SIZE(X), %xmm6 + +#ifdef USE_PSHUFD + pshufd $0x4e, %xmm7, %xmm8 +#else + movsd -1 * SIZE(X), %xmm8 + movhps -2 * SIZE(X), %xmm8 +#endif + mulpd %xmm14, %xmm7 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm7 + movaps %xmm7, -2 * SIZE(X) + movaps 14 * SIZE(X), %xmm7 + + subq $-16 * SIZE, X + decq I + jg .L111 + ALIGN_4 + +.L112: + pshufd $0x4e, %xmm0, %xmm8 + mulpd %xmm14, %xmm0 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm0 + movaps %xmm0, -16 * SIZE(X) + + pshufd $0x4e, %xmm1, %xmm8 + mulpd %xmm14, %xmm1 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm1 + movaps %xmm1, -14 * SIZE(X) + + pshufd $0x4e, %xmm2, %xmm8 + mulpd %xmm14, %xmm2 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm2 + movaps %xmm2, -12 * SIZE(X) + + pshufd $0x4e, %xmm3, %xmm8 + mulpd %xmm14, %xmm3 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm3 + movaps %xmm3, -10 * SIZE(X) + + pshufd $0x4e, %xmm4, %xmm8 + mulpd %xmm14, %xmm4 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm4 + movaps %xmm4, -8 * SIZE(X) + + pshufd $0x4e, %xmm5, %xmm8 + mulpd %xmm14, %xmm5 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm5 + movaps %xmm5, 
-6 * SIZE(X) + + pshufd $0x4e, %xmm6, %xmm8 + mulpd %xmm14, %xmm6 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm6 + movaps %xmm6, -4 * SIZE(X) + + pshufd $0x4e, %xmm7, %xmm8 + mulpd %xmm14, %xmm7 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm7 + movaps %xmm7, -2 * SIZE(X) + + subq $-16 * SIZE, X + ALIGN_3 + +.L115: + testq $7, M + je .L999 + + testq $4, M + je .L116 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm0, %xmm8 + mulpd %xmm14, %xmm0 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm0 + movaps %xmm0, -16 * SIZE(X) + + pshufd $0x4e, %xmm1, %xmm8 + mulpd %xmm14, %xmm1 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm1 + movaps %xmm1, -14 * SIZE(X) + + movaps -12 * SIZE(X), %xmm2 + movaps -10 * SIZE(X), %xmm3 + + pshufd $0x4e, %xmm2, %xmm8 + mulpd %xmm14, %xmm2 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm2 + movaps %xmm2, -12 * SIZE(X) + + pshufd $0x4e, %xmm3, %xmm8 + mulpd %xmm14, %xmm3 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm3 + movaps %xmm3, -10 * SIZE(X) + + addq $8 * SIZE, X + ALIGN_3 + +.L116: + testq $2, M + je .L117 + + movaps -16 * SIZE(X), %xmm0 + movaps -14 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm0, %xmm8 + mulpd %xmm14, %xmm0 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm0 + movaps %xmm0, -16 * SIZE(X) + + pshufd $0x4e, %xmm1, %xmm8 + mulpd %xmm14, %xmm1 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm1 + movaps %xmm1, -14 * SIZE(X) + + addq $4 * SIZE, X + ALIGN_3 + +.L117: + testq $1, M + je .L999 + + movaps -16 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm0, %xmm8 + mulpd %xmm14, %xmm0 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm0 + + movaps %xmm0, -16 * SIZE(X) + jmp .L999 + ALIGN_3 + +.L120: + movq X, XX + + movq M, I + sarq $3, I + jle .L125 + + movaps (X), %xmm0 + addq INCX, X + movaps (X), %xmm1 + addq INCX, X + movaps (X), %xmm2 + addq INCX, X + movaps (X), %xmm3 + addq INCX, X + movaps (X), %xmm4 + addq INCX, X + movaps (X), %xmm5 + addq INCX, X + movaps (X), %xmm6 + addq INCX, X + movaps (X), %xmm7 + addq INCX, X + + decq I + jle .L122 + ALIGN_4 + +.L121: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm0, %xmm8 + mulpd %xmm14, %xmm0 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm0 + movaps %xmm0, (XX) + addq INCX, XX + movaps (X), %xmm0 + addq INCX, X + + pshufd $0x4e, %xmm1, %xmm8 + mulpd %xmm14, %xmm1 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm1 + movaps %xmm1, (XX) + addq INCX, XX + movaps (X), %xmm1 + addq INCX, X + + pshufd $0x4e, %xmm2, %xmm8 + mulpd %xmm14, %xmm2 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm2 + movaps %xmm2, (XX) + addq INCX, XX + movaps (X), %xmm2 + addq INCX, X + + pshufd $0x4e, %xmm3, %xmm8 + mulpd %xmm14, %xmm3 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm3 + movaps %xmm3, (XX) + addq INCX, XX + movaps (X), %xmm3 + addq INCX, X + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm4, %xmm8 + mulpd %xmm14, %xmm4 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm4 + movaps %xmm4, (XX) + addq INCX, XX + movaps (X), %xmm4 + addq INCX, X + + pshufd $0x4e, %xmm5, %xmm8 + mulpd %xmm14, %xmm5 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm5 + movaps %xmm5, (XX) + addq INCX, XX + movaps (X), %xmm5 + addq INCX, X + + pshufd $0x4e, %xmm6, %xmm8 + mulpd %xmm14, %xmm6 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm6 + movaps %xmm6, (XX) + addq INCX, XX + movaps (X), %xmm6 + addq INCX, X + + pshufd $0x4e, %xmm7, %xmm8 + mulpd %xmm14, %xmm7 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm7 + movaps %xmm7, (XX) + addq INCX, XX + movaps (X), %xmm7 + addq INCX, X + + decq I + jg .L121 + ALIGN_4 + +.L122: + pshufd $0x4e, 
%xmm0, %xmm8 + mulpd %xmm14, %xmm0 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm0 + movaps %xmm0, (XX) + addq INCX, XX + + pshufd $0x4e, %xmm1, %xmm8 + mulpd %xmm14, %xmm1 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm1 + movaps %xmm1, (XX) + addq INCX, XX + + pshufd $0x4e, %xmm2, %xmm8 + mulpd %xmm14, %xmm2 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm2 + movaps %xmm2, (XX) + addq INCX, XX + + pshufd $0x4e, %xmm3, %xmm8 + mulpd %xmm14, %xmm3 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm3 + movaps %xmm3, (XX) + addq INCX, XX + + pshufd $0x4e, %xmm4, %xmm8 + mulpd %xmm14, %xmm4 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm4 + movaps %xmm4, (XX) + addq INCX, XX + + pshufd $0x4e, %xmm5, %xmm8 + mulpd %xmm14, %xmm5 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm5 + movaps %xmm5, (XX) + addq INCX, XX + + pshufd $0x4e, %xmm6, %xmm8 + mulpd %xmm14, %xmm6 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm6 + movaps %xmm6, (XX) + addq INCX, XX + + pshufd $0x4e, %xmm7, %xmm8 + mulpd %xmm14, %xmm7 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm7 + movaps %xmm7, (XX) + addq INCX, XX + ALIGN_3 + +.L125: + testq $7, M + je .L999 + + testq $4, M + je .L126 + + movaps (X), %xmm0 + addq INCX, X + movaps (X), %xmm1 + addq INCX, X + + pshufd $0x4e, %xmm0, %xmm8 + mulpd %xmm14, %xmm0 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm0 + movaps %xmm0, (XX) + addq INCX, XX + + pshufd $0x4e, %xmm1, %xmm8 + mulpd %xmm14, %xmm1 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm1 + movaps %xmm1, (XX) + addq INCX, XX + + movaps (X), %xmm2 + addq INCX, X + movaps (X), %xmm3 + addq INCX, X + + pshufd $0x4e, %xmm2, %xmm8 + mulpd %xmm14, %xmm2 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm2 + movaps %xmm2, (XX) + addq INCX, XX + + pshufd $0x4e, %xmm3, %xmm8 + mulpd %xmm14, %xmm3 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm3 + movaps %xmm3, (XX) + addq INCX, XX + ALIGN_3 + +.L126: + testq $2, M + je .L127 + + movaps (X), %xmm0 + addq INCX, X + movaps (X), %xmm1 + addq INCX, X + + pshufd $0x4e, %xmm0, %xmm8 + mulpd %xmm14, %xmm0 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm0 + movaps %xmm0, (XX) + addq INCX, XX + + pshufd $0x4e, %xmm1, %xmm8 + mulpd %xmm14, %xmm1 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm1 + movaps %xmm1, (XX) + addq INCX, XX + ALIGN_3 + +.L127: + testq $1, M + je .L999 + + movaps (X), %xmm0 + + pshufd $0x4e, %xmm0, %xmm8 + mulpd %xmm14, %xmm0 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm0 + + movaps %xmm0, (XX) + jmp .L999 + ALIGN_3 + +.L200: + cmpq $2 * SIZE, INCX + jne .L220 + +#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) + + movddup %xmm0, %xmm14 + pxor %xmm15, %xmm15 + subsd %xmm1, %xmm15 + movlhps %xmm1, %xmm15 + shufpd $1, %xmm15, %xmm15 + + movhps 0 * SIZE(X), %xmm0 + movaps 1 * SIZE(X), %xmm1 + subq $-16 * SIZE, X + + unpckhpd %xmm0, %xmm0 + mulsd %xmm14, %xmm0 + movaps %xmm1, %xmm8 + mulsd %xmm15, %xmm8 + subsd %xmm8, %xmm0 + movlps %xmm0, -16 * SIZE(X) + + decq M + + movq M, I + sarq $3, I + jle .L205 + + movaps -13 * SIZE(X), %xmm2 + movaps -11 * SIZE(X), %xmm3 + movaps -9 * SIZE(X), %xmm4 + + decq I + jle .L202 + ALIGN_4 + +.L201: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps %xmm1, %xmm8 + SHUFPD_1 %xmm2, %xmm0 + mulpd %xmm14, %xmm8 + mulpd %xmm15, %xmm0 + addpd %xmm8, %xmm0 + movaps %xmm0, -15 * SIZE(X) + movaps -7 * SIZE(X), %xmm5 + + movaps %xmm2, %xmm8 + SHUFPD_1 %xmm3, %xmm1 + mulpd %xmm14, %xmm8 + mulpd %xmm15, %xmm1 + addpd %xmm8, %xmm1 + movaps %xmm1, -13 * SIZE(X) + movaps -5 * SIZE(X), %xmm6 + + movaps %xmm3, %xmm8 + SHUFPD_1 %xmm4, %xmm2 + mulpd %xmm14, %xmm8 + mulpd %xmm15, %xmm2 + addpd %xmm8, %xmm2 + movaps 
%xmm2, -11 * SIZE(X) + movaps -3 * SIZE(X), %xmm7 + + movaps %xmm4, %xmm8 + SHUFPD_1 %xmm5, %xmm3 + mulpd %xmm14, %xmm8 + mulpd %xmm15, %xmm3 + addpd %xmm8, %xmm3 + movaps %xmm3, -9 * SIZE(X) + movaps -1 * SIZE(X), %xmm0 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps %xmm5, %xmm8 + SHUFPD_1 %xmm6, %xmm4 + mulpd %xmm14, %xmm8 + mulpd %xmm15, %xmm4 + addpd %xmm8, %xmm4 + movaps %xmm4, -7 * SIZE(X) + movaps 1 * SIZE(X), %xmm1 + + movaps %xmm6, %xmm8 + SHUFPD_1 %xmm7, %xmm5 + mulpd %xmm14, %xmm8 + mulpd %xmm15, %xmm5 + addpd %xmm8, %xmm5 + movaps %xmm5, -5 * SIZE(X) + movaps 3 * SIZE(X), %xmm2 + + movaps %xmm7, %xmm8 + SHUFPD_1 %xmm0, %xmm6 + mulpd %xmm14, %xmm8 + mulpd %xmm15, %xmm6 + addpd %xmm8, %xmm6 + movaps %xmm6, -3 * SIZE(X) + movaps 5 * SIZE(X), %xmm3 + + movaps %xmm0, %xmm8 + SHUFPD_1 %xmm1, %xmm7 + mulpd %xmm14, %xmm8 + mulpd %xmm15, %xmm7 + addpd %xmm8, %xmm7 + movaps %xmm7, -1 * SIZE(X) + movaps 7 * SIZE(X), %xmm4 + + subq $-16 * SIZE, X + decq I + jg .L201 + ALIGN_4 + +.L202: + movaps %xmm1, %xmm8 + SHUFPD_1 %xmm2, %xmm0 + mulpd %xmm14, %xmm8 + mulpd %xmm15, %xmm0 + addpd %xmm8, %xmm0 + movaps %xmm0, -15 * SIZE(X) + movaps -7 * SIZE(X), %xmm5 + + movaps %xmm2, %xmm8 + SHUFPD_1 %xmm3, %xmm1 + mulpd %xmm14, %xmm8 + mulpd %xmm15, %xmm1 + addpd %xmm8, %xmm1 + movaps %xmm1, -13 * SIZE(X) + movaps -5 * SIZE(X), %xmm6 + + movaps %xmm3, %xmm8 + SHUFPD_1 %xmm4, %xmm2 + mulpd %xmm14, %xmm8 + mulpd %xmm15, %xmm2 + addpd %xmm8, %xmm2 + movaps %xmm2, -11 * SIZE(X) + movaps -3 * SIZE(X), %xmm7 + + movaps %xmm4, %xmm8 + SHUFPD_1 %xmm5, %xmm3 + mulpd %xmm14, %xmm8 + mulpd %xmm15, %xmm3 + addpd %xmm8, %xmm3 + movaps %xmm3, -9 * SIZE(X) + movaps -1 * SIZE(X), %xmm0 + + movaps %xmm5, %xmm8 + SHUFPD_1 %xmm6, %xmm4 + mulpd %xmm14, %xmm8 + mulpd %xmm15, %xmm4 + addpd %xmm8, %xmm4 + movaps %xmm4, -7 * SIZE(X) + movaps 1 * SIZE(X), %xmm1 + + movaps %xmm6, %xmm8 + SHUFPD_1 %xmm7, %xmm5 + mulpd %xmm14, %xmm8 + mulpd %xmm15, %xmm5 + addpd %xmm8, %xmm5 + movaps %xmm5, -5 * SIZE(X) + + movaps %xmm7, %xmm8 + SHUFPD_1 %xmm0, %xmm6 + mulpd %xmm14, %xmm8 + mulpd %xmm15, %xmm6 + addpd %xmm8, %xmm6 + movaps %xmm6, -3 * SIZE(X) + + movaps %xmm0, %xmm8 + SHUFPD_1 %xmm1, %xmm7 + mulpd %xmm14, %xmm8 + mulpd %xmm15, %xmm7 + addpd %xmm8, %xmm7 + movaps %xmm7, -1 * SIZE(X) + + subq $-16 * SIZE, X + ALIGN_3 + +.L205: + testq $4, M + je .L206 + + movaps -13 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm8 + SHUFPD_1 %xmm2, %xmm0 + mulpd %xmm14, %xmm8 + mulpd %xmm15, %xmm0 + addpd %xmm8, %xmm0 + movaps %xmm0, -15 * SIZE(X) + + movaps -11 * SIZE(X), %xmm3 + + movaps %xmm2, %xmm8 + SHUFPD_1 %xmm3, %xmm1 + mulpd %xmm14, %xmm8 + mulpd %xmm15, %xmm1 + addpd %xmm8, %xmm1 + movaps %xmm1, -13 * SIZE(X) + + movaps -9 * SIZE(X), %xmm0 + + movaps %xmm3, %xmm8 + SHUFPD_1 %xmm0, %xmm2 + mulpd %xmm14, %xmm8 + mulpd %xmm15, %xmm2 + addpd %xmm8, %xmm2 + movaps %xmm2, -11 * SIZE(X) + + movaps -7 * SIZE(X), %xmm1 + + movaps %xmm0, %xmm8 + SHUFPD_1 %xmm1, %xmm3 + mulpd %xmm14, %xmm8 + mulpd %xmm15, %xmm3 + addpd %xmm8, %xmm3 + movaps %xmm3, -9 * SIZE(X) + + addq $8 * SIZE, X + ALIGN_3 + +.L206: + testq $2, M + je .L207 + + movaps -13 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm8 + SHUFPD_1 %xmm2, %xmm0 + + mulpd %xmm14, %xmm8 + mulpd %xmm15, %xmm0 + addpd %xmm8, %xmm0 + movaps %xmm0, -15 * SIZE(X) + + movaps -11 * SIZE(X), %xmm3 + + movaps %xmm2, %xmm8 + SHUFPD_1 %xmm3, %xmm1 + + mulpd %xmm14, %xmm8 + mulpd %xmm15, %xmm1 + addpd %xmm8, %xmm1 + movaps %xmm1, -13 * SIZE(X) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 
+ + addq $4 * SIZE, X + ALIGN_3 + +.L207: + testq $1, M + je .L208 + + movaps -13 * SIZE(X), %xmm2 + + movaps %xmm1, %xmm8 + SHUFPD_1 %xmm2, %xmm0 + + mulpd %xmm14, %xmm8 + mulpd %xmm15, %xmm0 + addpd %xmm8, %xmm0 + movaps %xmm0, -15 * SIZE(X) + + movaps %xmm1, %xmm0 + movaps %xmm2, %xmm1 + addq $2 * SIZE, X + ALIGN_3 + +.L208: + unpckhpd %xmm0, %xmm0 + mulsd %xmm14, %xmm1 + mulsd %xmm15, %xmm0 + addsd %xmm1, %xmm0 + movlps %xmm0, -15 * SIZE(X) + jmp .L999 + ALIGN_3 + +#else + + movddup %xmm0, %xmm14 + pxor %xmm15, %xmm15 + subsd %xmm1, %xmm15 + movlhps %xmm1, %xmm15 + + subq $-16 * SIZE, X + + movq M, I + sarq $3, I + jle .L205 + + movsd -16 * SIZE(X), %xmm0 + movhps -15 * SIZE(X), %xmm0 + movsd -14 * SIZE(X), %xmm1 + movhps -13 * SIZE(X), %xmm1 + movsd -12 * SIZE(X), %xmm2 + movhps -11 * SIZE(X), %xmm2 + movsd -10 * SIZE(X), %xmm3 + movhps -9 * SIZE(X), %xmm3 + + movsd -8 * SIZE(X), %xmm4 + movhps -7 * SIZE(X), %xmm4 + movsd -6 * SIZE(X), %xmm5 + movhps -5 * SIZE(X), %xmm5 + movsd -4 * SIZE(X), %xmm6 + movhps -3 * SIZE(X), %xmm6 + movsd -2 * SIZE(X), %xmm7 + movhps -1 * SIZE(X), %xmm7 + + decq I + jle .L202 + ALIGN_4 + +.L201: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + +#if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) + pshufd $0x4e, %xmm0, %xmm8 +#else + movsd -15 * SIZE(X), %xmm8 + movhps -16 * SIZE(X), %xmm8 +#endif + mulpd %xmm14, %xmm0 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm0 + movlps %xmm0, -16 * SIZE(X) + movhps %xmm0, -15 * SIZE(X) + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + +#ifdef USE_PSHUFD + pshufd $0x4e, %xmm1, %xmm8 +#else + movsd -13 * SIZE(X), %xmm8 + movhps -14 * SIZE(X), %xmm8 +#endif + mulpd %xmm14, %xmm1 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm1 + movlps %xmm1, -14 * SIZE(X) + movhps %xmm1, -13 * SIZE(X) + movsd 2 * SIZE(X), %xmm1 + movhps 3 * SIZE(X), %xmm1 + +#if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) + pshufd $0x4e, %xmm2, %xmm8 +#else + movsd -11 * SIZE(X), %xmm8 + movhps -12 * SIZE(X), %xmm8 +#endif + mulpd %xmm14, %xmm2 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm2 + movlps %xmm2, -12 * SIZE(X) + movhps %xmm2, -11 * SIZE(X) + movsd 4 * SIZE(X), %xmm2 + movhps 5 * SIZE(X), %xmm2 + +#ifdef USE_PSHUFD + pshufd $0x4e, %xmm3, %xmm8 +#else + movsd -9 * SIZE(X), %xmm8 + movhps -10 * SIZE(X), %xmm8 +#endif + mulpd %xmm14, %xmm3 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm3 + movlps %xmm3, -10 * SIZE(X) + movhps %xmm3, -9 * SIZE(X) + movsd 6 * SIZE(X), %xmm3 + movhps 7 * SIZE(X), %xmm3 + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + +#if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) + pshufd $0x4e, %xmm4, %xmm8 +#else + movsd -7 * SIZE(X), %xmm8 + movhps -8 * SIZE(X), %xmm8 +#endif + mulpd %xmm14, %xmm4 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm4 + movlps %xmm4, -8 * SIZE(X) + movhps %xmm4, -7 * SIZE(X) + movsd 8 * SIZE(X), %xmm4 + movhps 9 * SIZE(X), %xmm4 + +#ifdef USE_PSHUFD + pshufd $0x4e, %xmm5, %xmm8 +#else + movsd -5 * SIZE(X), %xmm8 + movhps -6 * SIZE(X), %xmm8 +#endif + mulpd %xmm14, %xmm5 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm5 + movlps %xmm5, -6 * SIZE(X) + movhps %xmm5, -5 * SIZE(X) + movsd 10 * SIZE(X), %xmm5 + movhps 11 * SIZE(X), %xmm5 + +#if defined(USE_PSHUFD) || defined(USE_PSHUFD_HALF) + pshufd $0x4e, %xmm6, %xmm8 +#else + movsd -3 * SIZE(X), %xmm8 + movhps -4 * SIZE(X), %xmm8 +#endif + mulpd %xmm14, %xmm6 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm6 + movlps %xmm6, -4 * SIZE(X) + movhps %xmm6, -3 * SIZE(X) + movsd 12 * SIZE(X), %xmm6 + movhps 13 * SIZE(X), %xmm6 + 
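[Note: the pshufd/mulpd/addpd sequences in this kernel all implement the same per-element complex multiply by alpha: one register holds (alpha_r, alpha_r), another holds (-alpha_i, alpha_i), and a swapped copy of the data supplies the cross terms. A minimal C sketch of that decomposition follows; zscal_ref and its argument layout are illustrative only and not part of this patch.]

    /* x holds n complex doubles as interleaved (re, im) pairs. */
    static void zscal_ref(long n, double alpha_r, double alpha_i, double *x)
    {
        for (long k = 0; k < n; k++) {
            double re = x[2 * k], im = x[2 * k + 1];
            /* (ar, ar) * (re, im)  +  (-ai, ai) * (im, re) */
            x[2 * k]     = alpha_r * re - alpha_i * im;
            x[2 * k + 1] = alpha_r * im + alpha_i * re;
        }
    }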
+#ifdef USE_PSHUFD + pshufd $0x4e, %xmm7, %xmm8 +#else + movsd -1 * SIZE(X), %xmm8 + movhps -2 * SIZE(X), %xmm8 +#endif + mulpd %xmm14, %xmm7 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm7 + movlps %xmm7, -2 * SIZE(X) + movhps %xmm7, -1 * SIZE(X) + movsd 14 * SIZE(X), %xmm7 + movhps 15 * SIZE(X), %xmm7 + + subq $-16 * SIZE, X + decq I + jg .L201 + ALIGN_4 + +.L202: + pshufd $0x4e, %xmm0, %xmm8 + mulpd %xmm14, %xmm0 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm0 + movlps %xmm0, -16 * SIZE(X) + movhps %xmm0, -15 * SIZE(X) + + pshufd $0x4e, %xmm1, %xmm8 + mulpd %xmm14, %xmm1 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm1 + movlps %xmm1, -14 * SIZE(X) + movhps %xmm1, -13 * SIZE(X) + + pshufd $0x4e, %xmm2, %xmm8 + mulpd %xmm14, %xmm2 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm2 + movlps %xmm2, -12 * SIZE(X) + movhps %xmm2, -11 * SIZE(X) + + pshufd $0x4e, %xmm3, %xmm8 + mulpd %xmm14, %xmm3 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm3 + movlps %xmm3, -10 * SIZE(X) + movhps %xmm3, -9 * SIZE(X) + + pshufd $0x4e, %xmm4, %xmm8 + mulpd %xmm14, %xmm4 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm4 + movlps %xmm4, -8 * SIZE(X) + movhps %xmm4, -7 * SIZE(X) + + pshufd $0x4e, %xmm5, %xmm8 + mulpd %xmm14, %xmm5 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm5 + movlps %xmm5, -6 * SIZE(X) + movhps %xmm5, -5 * SIZE(X) + + pshufd $0x4e, %xmm6, %xmm8 + mulpd %xmm14, %xmm6 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm6 + movlps %xmm6, -4 * SIZE(X) + movhps %xmm6, -3 * SIZE(X) + + pshufd $0x4e, %xmm7, %xmm8 + mulpd %xmm14, %xmm7 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm7 + movlps %xmm7, -2 * SIZE(X) + movhps %xmm7, -1 * SIZE(X) + + subq $-16 * SIZE, X + ALIGN_3 + +.L205: + testq $7, M + je .L999 + + testq $4, M + je .L206 + + movsd -16 * SIZE(X), %xmm0 + movhps -15 * SIZE(X), %xmm0 + movsd -14 * SIZE(X), %xmm1 + movhps -13 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm0, %xmm8 + mulpd %xmm14, %xmm0 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm0 + movlps %xmm0, -16 * SIZE(X) + movhps %xmm0, -15 * SIZE(X) + + pshufd $0x4e, %xmm1, %xmm8 + mulpd %xmm14, %xmm1 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm1 + movlps %xmm1, -14 * SIZE(X) + movhps %xmm1, -13 * SIZE(X) + + movsd -12 * SIZE(X), %xmm2 + movhps -11 * SIZE(X), %xmm2 + movsd -10 * SIZE(X), %xmm3 + movhps -9 * SIZE(X), %xmm3 + + pshufd $0x4e, %xmm2, %xmm8 + mulpd %xmm14, %xmm2 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm2 + movlps %xmm2, -12 * SIZE(X) + movhps %xmm2, -11 * SIZE(X) + + pshufd $0x4e, %xmm3, %xmm8 + mulpd %xmm14, %xmm3 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm3 + movlps %xmm3, -10 * SIZE(X) + movhps %xmm3, -9 * SIZE(X) + + addq $8 * SIZE, X + ALIGN_3 + +.L206: + testq $2, M + je .L207 + + movsd -16 * SIZE(X), %xmm0 + movhps -15 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm0, %xmm8 + mulpd %xmm14, %xmm0 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm0 + movlps %xmm0, -16 * SIZE(X) + movhps %xmm0, -15 * SIZE(X) + + movsd -14 * SIZE(X), %xmm1 + movhps -13 * SIZE(X), %xmm1 + + pshufd $0x4e, %xmm1, %xmm8 + mulpd %xmm14, %xmm1 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm1 + movlps %xmm1, -14 * SIZE(X) + movhps %xmm1, -13 * SIZE(X) + + addq $4 * SIZE, X + ALIGN_3 + +.L207: + testq $1, M + je .L999 + + movsd -16 * SIZE(X), %xmm0 + movhps -15 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm0, %xmm8 + mulpd %xmm14, %xmm0 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm0 + + movlps %xmm0, -16 * SIZE(X) + movhps %xmm0, -15 * SIZE(X) + jmp .L999 + ALIGN_3 + +#endif + +.L220: + movddup %xmm0, %xmm14 + pxor %xmm15, %xmm15 + subsd %xmm1, %xmm15 + movlhps %xmm1, %xmm15 + + movq X, XX + + movq M, I + sarq $3, I + jle .L225 + + movsd 0 * 
SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + addq INCX, X + movsd 0 * SIZE(X), %xmm1 + movhps 1 * SIZE(X), %xmm1 + addq INCX, X + movsd 0 * SIZE(X), %xmm2 + movhps 1 * SIZE(X), %xmm2 + addq INCX, X + movsd 0 * SIZE(X), %xmm3 + movhps 1 * SIZE(X), %xmm3 + addq INCX, X + movsd 0 * SIZE(X), %xmm4 + movhps 1 * SIZE(X), %xmm4 + addq INCX, X + movsd 0 * SIZE(X), %xmm5 + movhps 1 * SIZE(X), %xmm5 + addq INCX, X + movsd 0 * SIZE(X), %xmm6 + movhps 1 * SIZE(X), %xmm6 + addq INCX, X + movsd 0 * SIZE(X), %xmm7 + movhps 1 * SIZE(X), %xmm7 + addq INCX, X + + decq I + jle .L222 + ALIGN_4 + +.L221: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm0, %xmm8 + mulpd %xmm14, %xmm0 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm0 + movlps %xmm0, 0 * SIZE(XX) + movhps %xmm0, 1 * SIZE(XX) + addq INCX, XX + + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + addq INCX, X + + pshufd $0x4e, %xmm1, %xmm8 + mulpd %xmm14, %xmm1 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm1 + movlps %xmm1, 0 * SIZE(XX) + movhps %xmm1, 1 * SIZE(XX) + addq INCX, XX + + movsd 0 * SIZE(X), %xmm1 + movhps 1 * SIZE(X), %xmm1 + addq INCX, X + + pshufd $0x4e, %xmm2, %xmm8 + mulpd %xmm14, %xmm2 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm2 + movlps %xmm2, 0 * SIZE(XX) + movhps %xmm2, 1 * SIZE(XX) + addq INCX, XX + + movsd 0 * SIZE(X), %xmm2 + movhps 1 * SIZE(X), %xmm2 + addq INCX, X + + pshufd $0x4e, %xmm3, %xmm8 + mulpd %xmm14, %xmm3 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm3 + movlps %xmm3, 0 * SIZE(XX) + movhps %xmm3, 1 * SIZE(XX) + addq INCX, XX + + movsd 0 * SIZE(X), %xmm3 + movhps 1 * SIZE(X), %xmm3 + addq INCX, X + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + pshufd $0x4e, %xmm4, %xmm8 + mulpd %xmm14, %xmm4 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm4 + movlps %xmm4, 0 * SIZE(XX) + movhps %xmm4, 1 * SIZE(XX) + addq INCX, XX + + movsd 0 * SIZE(X), %xmm4 + movhps 1 * SIZE(X), %xmm4 + addq INCX, X + + pshufd $0x4e, %xmm5, %xmm8 + mulpd %xmm14, %xmm5 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm5 + movlps %xmm5, 0 * SIZE(XX) + movhps %xmm5, 1 * SIZE(XX) + addq INCX, XX + + movsd 0 * SIZE(X), %xmm5 + movhps 1 * SIZE(X), %xmm5 + addq INCX, X + + pshufd $0x4e, %xmm6, %xmm8 + mulpd %xmm14, %xmm6 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm6 + movlps %xmm6, 0 * SIZE(XX) + movhps %xmm6, 1 * SIZE(XX) + addq INCX, XX + + movsd 0 * SIZE(X), %xmm6 + movhps 1 * SIZE(X), %xmm6 + addq INCX, X + + pshufd $0x4e, %xmm7, %xmm8 + mulpd %xmm14, %xmm7 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm7 + movlps %xmm7, 0 * SIZE(XX) + movhps %xmm7, 1 * SIZE(XX) + addq INCX, XX + + movsd 0 * SIZE(X), %xmm7 + movhps 1 * SIZE(X), %xmm7 + addq INCX, X + + + decq I + jg .L221 + ALIGN_4 + +.L222: + pshufd $0x4e, %xmm0, %xmm8 + mulpd %xmm14, %xmm0 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm0 + movlps %xmm0, 0 * SIZE(XX) + movhps %xmm0, 1 * SIZE(XX) + addq INCX, XX + + pshufd $0x4e, %xmm1, %xmm8 + mulpd %xmm14, %xmm1 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm1 + movlps %xmm1, 0 * SIZE(XX) + movhps %xmm1, 1 * SIZE(XX) + addq INCX, XX + + pshufd $0x4e, %xmm2, %xmm8 + mulpd %xmm14, %xmm2 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm2 + movlps %xmm2, 0 * SIZE(XX) + movhps %xmm2, 1 * SIZE(XX) + addq INCX, XX + + pshufd $0x4e, %xmm3, %xmm8 + mulpd %xmm14, %xmm3 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm3 + movlps %xmm3, 0 * SIZE(XX) + movhps %xmm3, 1 * SIZE(XX) + addq INCX, XX + + pshufd $0x4e, %xmm4, %xmm8 + mulpd %xmm14, %xmm4 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm4 + movlps %xmm4, 0 * SIZE(XX) + movhps %xmm4, 1 * SIZE(XX) + 
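[Note: when INCX is not one complex element, the path above pre-loads a batch of eight elements and keeps the read pointer (X) running ahead of a separate write pointer (XX), so loads are not stalled behind the stores. Functionally it is just the strided form of the same scaling; a sketch under the assumption that incx is given in complex elements, whereas the assembly uses a byte stride (incx << ZBASE_SHIFT).]

    static void zscal_ref_strided(long n, double alpha_r, double alpha_i,
                                  double *x, long incx)
    {
        for (long k = 0; k < n; k++, x += 2 * incx) {
            double re = x[0], im = x[1];
            x[0] = alpha_r * re - alpha_i * im;
            x[1] = alpha_r * im + alpha_i * re;
        }
    }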
addq INCX, XX + + pshufd $0x4e, %xmm5, %xmm8 + mulpd %xmm14, %xmm5 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm5 + movlps %xmm5, 0 * SIZE(XX) + movhps %xmm5, 1 * SIZE(XX) + addq INCX, XX + + pshufd $0x4e, %xmm6, %xmm8 + mulpd %xmm14, %xmm6 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm6 + movlps %xmm6, 0 * SIZE(XX) + movhps %xmm6, 1 * SIZE(XX) + addq INCX, XX + + pshufd $0x4e, %xmm7, %xmm8 + mulpd %xmm14, %xmm7 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm7 + movlps %xmm7, 0 * SIZE(XX) + movhps %xmm7, 1 * SIZE(XX) + addq INCX, XX + ALIGN_3 + +.L225: + testq $7, M + je .L999 + + testq $4, M + je .L226 + + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + addq INCX, X + + pshufd $0x4e, %xmm0, %xmm8 + mulpd %xmm14, %xmm0 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm0 + movlps %xmm0, 0 * SIZE(XX) + movhps %xmm0, 1 * SIZE(XX) + addq INCX, XX + + movsd 0 * SIZE(X), %xmm1 + movhps 1 * SIZE(X), %xmm1 + addq INCX, X + + pshufd $0x4e, %xmm1, %xmm8 + mulpd %xmm14, %xmm1 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm1 + movlps %xmm1, 0 * SIZE(XX) + movhps %xmm1, 1 * SIZE(XX) + addq INCX, XX + + movsd 0 * SIZE(X), %xmm2 + movhps 1 * SIZE(X), %xmm2 + addq INCX, X + + pshufd $0x4e, %xmm2, %xmm8 + mulpd %xmm14, %xmm2 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm2 + movlps %xmm2, 0 * SIZE(XX) + movhps %xmm2, 1 * SIZE(XX) + addq INCX, XX + + movsd 0 * SIZE(X), %xmm3 + movhps 1 * SIZE(X), %xmm3 + addq INCX, X + + pshufd $0x4e, %xmm3, %xmm8 + mulpd %xmm14, %xmm3 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm3 + movlps %xmm3, 0 * SIZE(XX) + movhps %xmm3, 1 * SIZE(XX) + addq INCX, XX + ALIGN_3 + +.L226: + testq $2, M + je .L227 + + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + addq INCX, X + + pshufd $0x4e, %xmm0, %xmm8 + mulpd %xmm14, %xmm0 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm0 + movlps %xmm0, 0 * SIZE(XX) + movhps %xmm0, 1 * SIZE(XX) + addq INCX, XX + + movsd 0 * SIZE(X), %xmm1 + movhps 1 * SIZE(X), %xmm1 + addq INCX, X + + pshufd $0x4e, %xmm1, %xmm8 + mulpd %xmm14, %xmm1 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm1 + movlps %xmm1, 0 * SIZE(XX) + movhps %xmm1, 1 * SIZE(XX) + addq INCX, XX + ALIGN_3 + +.L227: + testq $1, M + je .L999 + + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + + pshufd $0x4e, %xmm0, %xmm8 + mulpd %xmm14, %xmm0 + mulpd %xmm15, %xmm8 + addpd %xmm8, %xmm0 + + movlps %xmm0, 0 * SIZE(XX) + movhps %xmm0, 1 * SIZE(XX) + ALIGN_3 + +.L999: + xorq %rax, %rax + + RESTOREREGISTERS + + ret + + EPILOGUE + diff --git a/kernel/x86_64/zswap.S b/kernel/x86_64/zswap.S new file mode 100644 index 0000000000..8f96875e32 --- /dev/null +++ b/kernel/x86_64/zswap.S @@ -0,0 +1,452 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef WINDOWS_ABI +#define N ARG1 /* rdi */ +#define X ARG4 +#define INCX ARG5 +#define Y ARG6 +#define INCY ARG2 +#else +#define N ARG1 +#define X ARG2 +#define INCX ARG3 +#define Y ARG4 +#define INCY %rbx +#endif + +#define XX %r10 +#define YY %r11 + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifndef WINDOWS_ABI +#ifndef XDOUBLE + movq 8(%rsp), INCY +#else + movq 40(%rsp), INCY +#endif +#else + pushq %rbx + + movq 56(%rsp), X + movq 64(%rsp), INCX + movq 72(%rsp), Y + movq 80(%rsp), INCY +#endif + + EMMS + + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + + cmpq $2 * SIZE, INCX + jne .L14 + cmpq $2 * SIZE, INCY + jne .L14 + + movq N, %rax + sarq $2, %rax + jle .L15 + ALIGN_3 + +.L16: +#ifdef XDOUBLE + movq 0(X), %mm0 + movq 8(X), %mm1 + movq 16(X), %mm2 + movq 24(X), %mm3 + movq 0(Y), %mm4 + movq 8(Y), %mm5 + movq 16(Y), %mm6 + movq 24(Y), %mm7 + + movq %mm4, 0(X) + movq %mm5, 8(X) + movq %mm6, 16(X) + movq %mm7, 24(X) + movq %mm0, 0(Y) + movq %mm1, 8(Y) + movq %mm2, 16(Y) + movq %mm3, 24(Y) + + movq 32(X), %mm0 + movq 40(X), %mm1 + movq 48(X), %mm2 + movq 56(X), %mm3 + movq 32(Y), %mm4 + movq 40(Y), %mm5 + movq 48(Y), %mm6 + movq 56(Y), %mm7 + + movq %mm4, 32(X) + movq %mm5, 40(X) + movq %mm6, 48(X) + movq %mm7, 56(X) + movq %mm0, 32(Y) + movq %mm1, 40(Y) + movq %mm2, 48(Y) + movq %mm3, 56(Y) + + movq 64(X), %mm0 + movq 72(X), %mm1 + movq 80(X), %mm2 + movq 88(X), %mm3 + movq 64(Y), %mm4 + movq 72(Y), %mm5 + movq 80(Y), %mm6 + movq 88(Y), %mm7 + + movq %mm4, 64(X) + movq %mm5, 72(X) + movq %mm6, 80(X) + movq %mm7, 88(X) + movq %mm0, 64(Y) + movq %mm1, 72(Y) + movq %mm2, 80(Y) + movq %mm3, 88(Y) + + movq 96(X), %mm0 + movq 104(X), %mm1 + movq 112(X), %mm2 + movq 120(X), %mm3 + movq 96(Y), %mm4 + movq 104(Y), %mm5 + movq 112(Y), %mm6 + movq 120(Y), %mm7 + + movq %mm4, 96(X) + movq %mm5, 104(X) + movq %mm6, 112(X) + movq %mm7, 120(X) + movq %mm0, 96(Y) + movq %mm1, 104(Y) + movq %mm2, 112(Y) + movq %mm3, 120(Y) +#elif defined(DOUBLE) + prefetchw PREFETCHSIZE * SIZE(X) + MOVQ 0 * SIZE(X), %mm0 + MOVQ 1 * SIZE(X), %mm1 + MOVQ 2 * SIZE(X), %mm2 + MOVQ 3 * SIZE(X), %mm3 + prefetchw PREFETCHSIZE * SIZE(Y) + MOVQ 0 * SIZE(Y), %mm4 + MOVQ 1 * SIZE(Y), %mm5 + MOVQ 2 * SIZE(Y), %mm6 + MOVQ 3 * SIZE(Y), %mm7 + + MOVQ %mm4, 0 * SIZE(X) + MOVQ %mm5, 1 * 
SIZE(X) + MOVQ %mm6, 2 * SIZE(X) + MOVQ %mm7, 3 * SIZE(X) + MOVQ %mm0, 0 * SIZE(Y) + MOVQ %mm1, 1 * SIZE(Y) + MOVQ %mm2, 2 * SIZE(Y) + MOVQ %mm3, 3 * SIZE(Y) + + MOVQ 4 * SIZE(X), %mm0 + MOVQ 5 * SIZE(X), %mm1 + MOVQ 6 * SIZE(X), %mm2 + MOVQ 7 * SIZE(X), %mm3 + MOVQ 4 * SIZE(Y), %mm4 + MOVQ 5 * SIZE(Y), %mm5 + MOVQ 6 * SIZE(Y), %mm6 + MOVQ 7 * SIZE(Y), %mm7 + + MOVQ %mm4, 4 * SIZE(X) + MOVQ %mm5, 5 * SIZE(X) + MOVQ %mm6, 6 * SIZE(X) + MOVQ %mm7, 7 * SIZE(X) + MOVQ %mm0, 4 * SIZE(Y) + MOVQ %mm1, 5 * SIZE(Y) + MOVQ %mm2, 6 * SIZE(Y) + MOVQ %mm3, 7 * SIZE(Y) + +#else +#ifdef OPTERON + prefetchw PREFETCHSIZE * SIZE(X) +#endif + movq 0 * SIZE(X), %mm0 + movq 2 * SIZE(X), %mm1 + movq 4 * SIZE(X), %mm2 + movq 6 * SIZE(X), %mm3 + movq 0 * SIZE(Y), %mm4 + movq 2 * SIZE(Y), %mm5 + movq 4 * SIZE(Y), %mm6 + movq 6 * SIZE(Y), %mm7 + +#ifdef OPTERON + prefetchw PREFETCHSIZE * SIZE(Y) +#endif + movq %mm4, 0 * SIZE(X) + movq %mm5, 2 * SIZE(X) + movq %mm6, 4 * SIZE(X) + movq %mm7, 6 * SIZE(X) + + movq %mm0, 0 * SIZE(Y) + movq %mm1, 2 * SIZE(Y) + movq %mm2, 4 * SIZE(Y) + movq %mm3, 6 * SIZE(Y) +#endif + + addq $8 * SIZE, X + addq $8 * SIZE, Y + decq %rax + jg .L16 + ALIGN_3 + +.L15: + movq N, %rax + andq $3, %rax + jle .L27 + ALIGN_3 + +.L22: +#ifdef XDOUBLE + movq 0(X), %mm0 + movq 8(X), %mm1 + movq 16(X), %mm2 + movq 24(X), %mm3 + movq 0(Y), %mm4 + movq 8(Y), %mm5 + movq 16(Y), %mm6 + movq 24(Y), %mm7 + + movq %mm4, 0(X) + movq %mm5, 8(X) + movq %mm6, 16(X) + movq %mm7, 24(X) + movq %mm0, 0(Y) + movq %mm1, 8(Y) + movq %mm2, 16(Y) + movq %mm3, 24(Y) +#elif defined(DOUBLE) + movq 0 * SIZE(X), %mm0 + movq 1 * SIZE(X), %mm1 + movq 0 * SIZE(Y), %mm4 + movq 1 * SIZE(Y), %mm5 + movq %mm4, 0 * SIZE(X) + movq %mm5, 1 * SIZE(X) + movq %mm0, 0 * SIZE(Y) + movq %mm1, 1 * SIZE(Y) +#else + movq 0 * SIZE(X), %mm0 + movq 0 * SIZE(Y), %mm4 + movq %mm4, 0 * SIZE(X) + movq %mm0, 0 * SIZE(Y) +#endif + addq $2 * SIZE, X + addq $2 * SIZE, Y + decq %rax + jg .L22 + jmp .L27 + ALIGN_3 + +/* INCX != 1 or INCY != 1 */ + +.L14: + movq N, %rax + movq X, XX + movq Y, YY + sarq $1, %rax + jle .L28 + ALIGN_2 + +.L29: +#ifdef XDOUBLE + movq 0(X), %mm0 + movq 8(X), %mm1 + movq 16(X), %mm2 + movq 24(X), %mm3 + addq INCX, X + movq 0(Y), %mm4 + movq 8(Y), %mm5 + movq 16(Y), %mm6 + movq 24(Y), %mm7 + addq INCY, Y + + movq %mm4, 0(XX) + movq %mm5, 8(XX) + movq %mm6, 16(XX) + movq %mm7, 24(XX) + addq INCX, XX + movq %mm0, 0(YY) + movq %mm1, 8(YY) + movq %mm2, 16(YY) + movq %mm3, 24(YY) + addq INCY, YY + + movq 0(X), %mm0 + movq 8(X), %mm1 + movq 16(X), %mm2 + movq 24(X), %mm3 + addq INCX, X + movq 0(Y), %mm4 + movq 8(Y), %mm5 + movq 16(Y), %mm6 + movq 24(Y), %mm7 + addq INCY, Y + + movq %mm4, 0(XX) + movq %mm5, 8(XX) + movq %mm6, 16(XX) + movq %mm7, 24(XX) + addq INCX, XX + movq %mm0, 0(YY) + movq %mm1, 8(YY) + movq %mm2, 16(YY) + movq %mm3, 24(YY) + addq INCY, YY +#elif defined(DOUBLE) + movq 0 * SIZE(X), %mm0 + movq 1 * SIZE(X), %mm1 + addq INCX, X + movq 0 * SIZE(X), %mm2 + movq 1 * SIZE(X), %mm3 + addq INCX, X + + movq 0 * SIZE(Y), %mm4 + movq 1 * SIZE(Y), %mm5 + addq INCY, Y + movq 0 * SIZE(Y), %mm6 + movq 1 * SIZE(Y), %mm7 + addq INCY, Y + + movq %mm4, 0 * SIZE(XX) + movq %mm5, 1 * SIZE(XX) + addq INCX, XX + movq %mm6, 0 * SIZE(XX) + movq %mm7, 1 * SIZE(XX) + addq INCX, XX + + movq %mm0, 0 * SIZE(YY) + movq %mm1, 1 * SIZE(YY) + addq INCY, YY + movq %mm2, 0 * SIZE(YY) + movq %mm3, 1 * SIZE(YY) + addq INCY, YY +#else + movq 0 * SIZE(X), %mm0 + addq INCX, X + movq 0 * SIZE(X), %mm2 + addq INCX, X + + movq 0 * SIZE(Y), %mm4 + addq INCY, Y + 
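[Note: the MMX-based paths in zswap.S, including the strided loop above, all reduce to exchanging n complex elements between x and y; the 64-bit movq traffic through %mm0-%mm7 only serves to keep loads and stores paired, and the routine is bracketed by EMMS because it uses the MMX register file. A reference sketch, illustrative and not part of the patch, assuming strides in complex elements rather than the byte strides the assembly derives via ZBASE_SHIFT.]

    static void zswap_ref(long n, double *x, long incx, double *y, long incy)
    {
        for (long k = 0; k < n; k++, x += 2 * incx, y += 2 * incy) {
            double tr = x[0], ti = x[1];
            x[0] = y[0];  x[1] = y[1];
            y[0] = tr;    y[1] = ti;
        }
    }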
movq 0 * SIZE(Y), %mm6 + addq INCY, Y + + movq %mm4, 0 * SIZE(XX) + addq INCX, XX + movq %mm6, 0 * SIZE(XX) + addq INCX, XX + + movq %mm0, 0 * SIZE(YY) + addq INCY, YY + movq %mm2, 0 * SIZE(YY) + addq INCY, YY +#endif + decq %rax + jg .L29 + ALIGN_3 + +.L28: + movq N, %rax + andq $1, %rax + jle .L27 + ALIGN_3 + +.L35: +#ifdef XDOUBLE + movq 0(X), %mm0 + movq 8(X), %mm1 + movq 16(X), %mm2 + movq 24(X), %mm3 + movq 0(Y), %mm4 + movq 8(Y), %mm5 + movq 16(Y), %mm6 + movq 24(Y), %mm7 + + movq %mm4, 0(X) + movq %mm5, 8(X) + movq %mm6, 16(X) + movq %mm7, 24(X) + movq %mm0, 0(Y) + movq %mm1, 8(Y) + movq %mm2, 16(Y) + movq %mm3, 24(Y) +#elif defined(DOUBLE) + movq 0 * SIZE(X), %mm0 + movq 1 * SIZE(X), %mm1 + movq 0 * SIZE(Y), %mm4 + movq 1 * SIZE(Y), %mm5 + + movq %mm4, 0 * SIZE(X) + movq %mm5, 1 * SIZE(X) + movq %mm0, 0 * SIZE(Y) + movq %mm1, 1 * SIZE(Y) +#else + movq 0 * SIZE(X), %mm0 + movq 0 * SIZE(Y), %mm4 + movq %mm4, 0 * SIZE(X) + movq %mm0, 0 * SIZE(Y) +#endif + addq INCX, X + addq INCY, Y + + decq %rax + jg .L35 + ALIGN_3 + +.L27: + EMMS + xorq %rax,%rax + +#ifdef WINDOWS_ABI + popq %rbx +#endif + + ret + + EPILOGUE diff --git a/kernel/x86_64/zswap_sse.S b/kernel/x86_64/zswap_sse.S new file mode 100644 index 0000000000..2f217592f8 --- /dev/null +++ b/kernel/x86_64/zswap_sse.S @@ -0,0 +1,1134 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef WINDOWS_ABI +#define M ARG1 /* rdi */ +#define X ARG4 +#define INCX ARG5 +#define Y ARG6 +#define INCY ARG2 +#else +#define M ARG1 +#define X ARG2 +#define INCX ARG3 +#define Y ARG4 +#define INCY %rbx +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifndef WINDOWS_ABI + movq 8(%rsp), INCY +#else + pushq %rbx + + movq 56(%rsp), X + movq 64(%rsp), INCX + movq 72(%rsp), Y + movq 80(%rsp), INCY +#endif + + SAVEREGISTERS + + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + + testq M, M + jle .L19 + + cmpq $2 * SIZE, INCX + jne .L50 + cmpq $2 * SIZE, INCY + jne .L50 + + addq M, M + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + + cmpq $3, M + jle .L16 + + testq $SIZE, Y + je .L05 + + movss -32 * SIZE(X), %xmm0 + movss -32 * SIZE(Y), %xmm1 + + movss %xmm1, -32 * SIZE(X) + movss %xmm0, -32 * SIZE(Y) + + addq $1 * SIZE, X + addq $1 * SIZE, Y + decq M + ALIGN_3 + +.L05: + testq $2 * SIZE, Y + je .L10 + + movsd -32 * SIZE(X), %xmm0 + movsd -32 * SIZE(Y), %xmm1 + + movlps %xmm1, -32 * SIZE(X) + movlps %xmm0, -32 * SIZE(Y) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + subq $2, M + jle .L19 + ALIGN_3 + +.L10: + cmpq $3, M + jle .L16 + + testq $2 * SIZE, X + jne .L30 + + testq $1 * SIZE, X + jne .L20 + + movq M, %rax + sarq $5, %rax + jle .L13 + ALIGN_3 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -32 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -32 * SIZE(X) + + movaps -28 * SIZE(X), %xmm0 + movaps -28 * SIZE(Y), %xmm1 + + movaps %xmm0, -28 * SIZE(Y) + movaps %xmm1, -28 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -24 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movaps %xmm0, -24 * SIZE(Y) + movaps %xmm1, -24 * SIZE(X) + + movaps -20 * SIZE(X), %xmm0 + movaps -20 * SIZE(Y), %xmm1 + + movaps %xmm0, -20 * SIZE(Y) + movaps %xmm1, -20 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + movaps -12 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + movaps %xmm0, -12 * SIZE(Y) + movaps %xmm1, -12 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -8 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + movaps %xmm0, -8 * SIZE(Y) + movaps %xmm1, -8 * SIZE(X) + + movaps -4 * SIZE(X), %xmm0 + movaps -4 * SIZE(Y), %xmm1 + + movaps %xmm0, -4 * SIZE(Y) + movaps %xmm1, -4 * SIZE(X) + + subq $-32 * SIZE, Y + subq $-32 * SIZE, X + + decq %rax + jg .L11 + ALIGN_3 + +.L13: + testq $16, M + jle .L14 + + movaps -32 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -32 * SIZE(X) + + movaps -28 * SIZE(X), %xmm0 + movaps -28 * SIZE(Y), %xmm1 + + movaps %xmm0, -28 * SIZE(Y) + movaps %xmm1, -28 * SIZE(X) + + movaps -24 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movaps %xmm0, -24 * SIZE(Y) + movaps %xmm1, -24 * SIZE(X) + + movaps -20 * SIZE(X), %xmm0 + movaps -20 * SIZE(Y), %xmm1 + + movaps %xmm0, -20 * SIZE(Y) + movaps %xmm1, -20 * SIZE(X) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L14: + testq $8, M + jle .L15 + + movaps -32 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -32 * SIZE(X) + + movaps -28 * SIZE(X), %xmm0 + movaps 
-28 * SIZE(Y), %xmm1 + + movaps %xmm0, -28 * SIZE(Y) + movaps %xmm1, -28 * SIZE(X) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L15: + testq $4, M + jle .L16 + + movaps -32 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + movaps %xmm0, -32 * SIZE(Y) + movaps %xmm1, -32 * SIZE(X) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L16: + testq $2, M + jle .L17 + + movsd -32 * SIZE(X), %xmm0 + movsd -32 * SIZE(Y), %xmm1 + + movlps %xmm1, -32 * SIZE(X) + addq $2 * SIZE, X + movlps %xmm0, -32 * SIZE(Y) + addq $2 * SIZE, Y + ALIGN_3 + +.L17: + testq $1, M + jle .L19 + + movss -32 * SIZE(X), %xmm0 + movss -32 * SIZE(Y), %xmm1 + + movss %xmm1, -32 * SIZE(X) + movss %xmm0, -32 * SIZE(Y) + ALIGN_3 + +.L19: + xorq %rax,%rax + + RESTOREREGISTERS + +#ifdef WINDOWS_ABI + popq %rbx +#endif + + ret + ALIGN_3 + +.L20: + movaps -33 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + movss %xmm1, -32 * SIZE(X) + pshufd $0x39, %xmm1, %xmm3 + movlps %xmm3, -31 * SIZE(X) + + subq $3, M + + movq M, %rax + sarq $5, %rax + jle .L23 + ALIGN_4 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -29 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -29 * SIZE(X) + + movaps -25 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -25 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -21 * SIZE(X), %xmm2 + movaps -20 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -24 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -21 * SIZE(X) + + movaps -17 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -20 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -17 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -13 * SIZE(X), %xmm2 + movaps -12 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -13 * SIZE(X) + + movaps -9 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -9 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -5 * SIZE(X), %xmm2 + movaps -4 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -8 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -5 * SIZE(X) + + movaps -1 * SIZE(X), %xmm0 + movaps 0 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -4 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -1 * SIZE(X) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + + decq %rax + jg .L21 + ALIGN_3 + +.L23: + testq $16, M + jle .L24 + + movaps -29 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -29 * SIZE(X) + + movaps -25 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movss %xmm0, 
%xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -25 * SIZE(X) + + movaps -21 * SIZE(X), %xmm2 + movaps -20 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -24 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -21 * SIZE(X) + + movaps -17 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -20 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -17 * SIZE(X) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L24: + testq $8, M + jle .L25 + + movaps -29 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -29 * SIZE(X) + + movaps -25 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x39, %xmm2, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x93, %xmm1, %xmm3 + movaps %xmm3, -25 * SIZE(X) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L25: + testq $4, M + jle .L26 + + movaps -29 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x39, %xmm0, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x93, %xmm3, %xmm1 + movaps %xmm1, -29 * SIZE(X) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L26: + pshufd $0x39, %xmm0, %xmm2 + pshufd $0xff, %xmm0, %xmm0 + + movlps %xmm2, -32 * SIZE(Y) + movss %xmm0, -30 * SIZE(Y) + + testq $2, M + jle .L27 + + movsd -29 * SIZE(X), %xmm0 + movsd -29 * SIZE(Y), %xmm1 + + movlps %xmm0, -29 * SIZE(Y) + movlps %xmm1, -29 * SIZE(X) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L27: + testq $1, M + jle .L29 + + movss -29 * SIZE(X), %xmm0 + movss -29 * SIZE(Y), %xmm1 + + movss %xmm0, -29 * SIZE(Y) + movss %xmm1, -29 * SIZE(X) + ALIGN_3 + +.L29: + xorq %rax,%rax + + RESTOREREGISTERS + +#ifdef WINDOWS_ABI + popq %rbx +#endif + + ret + ALIGN_3 + +.L30: + testq $1 * SIZE, X + jne .L40 + + movhps -32 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + movlps %xmm1, -32 * SIZE(X) + subq $2, M + + movq M, %rax + sarq $5, %rax + jle .L33 + ALIGN_4 + +.L31: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -30 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -30 * SIZE(X) + + movaps -26 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -26 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -22 * SIZE(X), %xmm2 + movaps -20 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -24 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -22 * SIZE(X) + + movaps -18 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -20 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -18 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -14 * SIZE(X), %xmm2 + movaps -12 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -14 * SIZE(X) + + movaps -10 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + 
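[Note: the control flow here is the pattern used throughout these kernels: the single-precision element count is shifted right to obtain the number of fully unrolled iterations, and the remainder is peeled off by testing individual bits of M ($16, $8, $4, $2, $1). A compact C model of that structure for the swap case; the helper name and flat layout are illustrative, and the real kernel additionally juggles alignment of X and Y.]

    static void swap_floats_unrolled(long m, float *x, float *y)
    {
        long i = 0;
        for (long blocks = m >> 5; blocks > 0; blocks--)      /* sarq $5, %rax */
            for (int k = 0; k < 32; k++, i++) {
                float t = x[i]; x[i] = y[i]; y[i] = t;
            }
        for (long chunk = 16; chunk >= 1; chunk >>= 1)        /* testq ladder  */
            if (m & chunk)
                for (long k = 0; k < chunk; k++, i++) {
                    float t = x[i]; x[i] = y[i]; y[i] = t;
                }
    }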
SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -10 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -6 * SIZE(X), %xmm2 + movaps -4 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -8 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -6 * SIZE(X) + + movaps -2 * SIZE(X), %xmm0 + movaps 0 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -4 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -2 * SIZE(X) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + + decq %rax + jg .L31 + ALIGN_3 + +.L33: + testq $16, M + jle .L34 + + movaps -30 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -30 * SIZE(X) + + movaps -26 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -26 * SIZE(X) + + movaps -22 * SIZE(X), %xmm2 + movaps -20 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -24 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -22 * SIZE(X) + + movaps -18 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -20 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -18 * SIZE(X) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L34: + testq $8, M + jle .L35 + + movaps -30 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -30 * SIZE(X) + + movaps -26 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -26 * SIZE(X) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L35: + testq $4, M + jle .L36 + + movaps -30 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -30 * SIZE(X) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L36: + movhps %xmm0, -32 * SIZE(Y) + + testq $2, M + jle .L37 + + movsd -30 * SIZE(X), %xmm0 + movsd -30 * SIZE(Y), %xmm1 + + movlps %xmm0, -30 * SIZE(Y) + movlps %xmm1, -30 * SIZE(X) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L37: + testq $1, M + jle .L39 + + movss -30 * SIZE(X), %xmm0 + movss -30 * SIZE(Y), %xmm1 + + movss %xmm0, -30 * SIZE(Y) + movss %xmm1, -30 * SIZE(X) + ALIGN_3 + +.L39: + xorq %rax,%rax + + RESTOREREGISTERS + +#ifdef WINDOWS_ABI + popq %rbx +#endif + + ret + ALIGN_3 + +.L40: + movaps -35 * SIZE(X), %xmm0 + movaps -32 * SIZE(Y), %xmm1 + + movss %xmm1, -32 * SIZE(X) + + subq $3, M + + movq M, %rax + sarq $5, %rax + jle .L43 + ALIGN_4 + +.L41: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -31 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -31 * SIZE(X) + + movaps -27 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -27 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -23 * SIZE(X), %xmm2 + movaps -20 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -24 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -23 * 
SIZE(X) + + movaps -19 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -20 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -19 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -15 * SIZE(X), %xmm2 + movaps -12 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -15 * SIZE(X) + + movaps -11 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -12 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -11 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -7 * SIZE(X), %xmm2 + movaps -4 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -8 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -7 * SIZE(X) + + movaps -3 * SIZE(X), %xmm0 + movaps 0 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -4 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -3 * SIZE(X) + + subq $-32 * SIZE, X + subq $-32 * SIZE, Y + + decq %rax + jg .L41 + ALIGN_3 + +.L43: + testq $16, M + jle .L44 + + movaps -31 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -31 * SIZE(X) + + movaps -27 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -27 * SIZE(X) + + movaps -23 * SIZE(X), %xmm2 + movaps -20 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -24 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -23 * SIZE(X) + + movaps -19 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -20 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -19 * SIZE(X) + + addq $16 * SIZE, X + addq $16 * SIZE, Y + ALIGN_3 + +.L44: + testq $8, M + jle .L45 + + movaps -31 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -31 * SIZE(X) + + movaps -27 * SIZE(X), %xmm0 + movaps -24 * SIZE(Y), %xmm1 + + movss %xmm0, %xmm2 + shufps $0x93, %xmm0, %xmm2 + movaps %xmm2, -28 * SIZE(Y) + movss %xmm1, %xmm3 + shufps $0x39, %xmm3, %xmm3 + movaps %xmm3, -27 * SIZE(X) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L45: + testq $4, M + jle .L46 + + movaps -31 * SIZE(X), %xmm2 + movaps -28 * SIZE(Y), %xmm3 + + movss %xmm2, %xmm0 + shufps $0x93, %xmm2, %xmm0 + movaps %xmm0, -32 * SIZE(Y) + movss %xmm3, %xmm1 + shufps $0x39, %xmm1, %xmm1 + movaps %xmm1, -31 * SIZE(X) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L46: + movsd -31 * SIZE(X), %xmm2 + + pshufd $0x39, %xmm1, %xmm1 + movlps %xmm1, -31 * SIZE(X) + + pshufd $0xff, %xmm0, %xmm0 + + movss %xmm0, -32 * SIZE(Y) + movlps %xmm2, -31 * SIZE(Y) + + addq $3 * SIZE, X + addq $3 * SIZE, Y + + testq $2, M + jle .L47 + + movsd -32 * SIZE(X), %xmm0 + movsd -32 * SIZE(Y), %xmm1 + + movlps 
%xmm0, -32 * SIZE(Y) + movlps %xmm1, -32 * SIZE(X) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L47: + testq $1, M + jle .L49 + + movss -32 * SIZE(X), %xmm0 + movss -32 * SIZE(Y), %xmm1 + + movss %xmm0, -32 * SIZE(Y) + movss %xmm1, -32 * SIZE(X) + ALIGN_3 + +.L49: + xorq %rax,%rax + + RESTOREREGISTERS + +#ifdef WINDOWS_ABI + popq %rbx +#endif + + ret + ALIGN_3 + +.L50: + movq M, %rax + sarq $2, %rax + jle .L55 + ALIGN_3 + +.L51: + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movlps %xmm1, (X) + addq INCX, X + movlps %xmm0, (Y) + addq INCY, Y + + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movlps %xmm1, (X) + addq INCX, X + movlps %xmm0, (Y) + addq INCY, Y + + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movlps %xmm1, (X) + addq INCX, X + movlps %xmm0, (Y) + addq INCY, Y + + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movlps %xmm1, (X) + addq INCX, X + movlps %xmm0, (Y) + addq INCY, Y + + decq %rax + jg .L51 + ALIGN_3 + +.L55: + movq M, %rax + andq $3, %rax + jle .L57 + ALIGN_3 + +.L56: + movsd (X), %xmm0 + movsd (Y), %xmm1 + + movlps %xmm1, (X) + addq INCX, X + movlps %xmm0, (Y) + addq INCY, Y + + decq %rax + jg .L56 + ALIGN_3 + +.L57: + xorq %rax, %rax + + RESTOREREGISTERS + +#ifdef WINDOWS_ABI + popq %rbx +#endif + + ret + ALIGN_3 + + EPILOGUE diff --git a/kernel/x86_64/zswap_sse2.S b/kernel/x86_64/zswap_sse2.S new file mode 100644 index 0000000000..c505014dd6 --- /dev/null +++ b/kernel/x86_64/zswap_sse2.S @@ -0,0 +1,999 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
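
The swap kernel that ends just above and the zswap_sse2.S file added below both implement the BLAS SWAP operation, exchanging the contents of two vectors while honouring arbitrary strides. As a point of reference, a plain C sketch of the double-precision complex case follows; the name and argument list are illustrative only and do not match the kernel's actual GotoBLAS calling convention.

    #include <stddef.h>

    /* Reference only: swap n double-complex elements of x and y.
       incx/incy are element strides, as in the BLAS convention. */
    static void zswap_ref(size_t n, double *x, ptrdiff_t incx,
                          double *y, ptrdiff_t incy)
    {
        for (size_t i = 0; i < n; i++) {
            double xr = x[0], xi = x[1];     /* one complex = two doubles */
            x[0] = y[0]; x[1] = y[1];
            y[0] = xr;   y[1] = xi;
            x += 2 * incx;                   /* advance by whole elements */
            y += 2 * incy;
        }
    }
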
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifndef WINDOWS_ABI +#define M ARG1 /* rdi */ +#define X ARG4 +#define INCX ARG5 +#define Y ARG6 +#define INCY ARG2 +#else +#define M ARG1 +#define X ARG2 +#define INCX ARG3 +#define Y ARG4 +#define INCY %rbx +#endif + +#include "l1param.h" + + PROLOGUE + PROFCODE + +#ifndef WINDOWS_ABI + movq 8(%rsp), INCY +#else + pushq %rbx + + movq 56(%rsp), X + movq 64(%rsp), INCX + movq 72(%rsp), Y + movq 80(%rsp), INCY +#endif + + SAVEREGISTERS + + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + + testq M, M + jle .L19 + + cmpq $2 * SIZE, INCX + jne .L50 + cmpq $2 * SIZE, INCY + jne .L50 + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + + testq $SIZE, Y + jne .L30 + + testq $SIZE, X + jne .L20 + + movq M, %rax + sarq $3, %rax + jle .L13 + ALIGN_3 + +.L11: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + movaps -14 * SIZE(X), %xmm0 + movaps -14 * SIZE(Y), %xmm1 + + movaps %xmm0, -14 * SIZE(Y) + movaps %xmm1, -14 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -12 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + movaps %xmm0, -12 * SIZE(Y) + movaps %xmm1, -12 * SIZE(X) + + movaps -10 * SIZE(X), %xmm0 + movaps -10 * SIZE(Y), %xmm1 + + movaps %xmm0, -10 * SIZE(Y) + movaps %xmm1, -10 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -8 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + movaps %xmm0, -8 * SIZE(Y) + movaps %xmm1, -8 * SIZE(X) + + movaps -6 * SIZE(X), %xmm0 + movaps -6 * SIZE(Y), %xmm1 + + movaps %xmm0, -6 * SIZE(Y) + movaps %xmm1, -6 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -4 * SIZE(X), %xmm0 + movaps -4 * SIZE(Y), %xmm1 + + movaps %xmm0, -4 * SIZE(Y) + movaps %xmm1, -4 * SIZE(X) + + movaps -2 * SIZE(X), %xmm0 + movaps -2 * SIZE(Y), %xmm1 + + movaps %xmm0, -2 * SIZE(Y) + movaps %xmm1, -2 * SIZE(X) + + subq $-16 * SIZE, Y + subq $-16 * SIZE, X + + decq %rax + jg .L11 + ALIGN_3 + +.L13: + testq $4, M + jle .L14 + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + movaps -14 * SIZE(X), %xmm0 + movaps -14 * SIZE(Y), %xmm1 + + movaps %xmm0, -14 * SIZE(Y) + movaps %xmm1, -14 * SIZE(X) + + movaps -12 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + movaps %xmm0, -12 * SIZE(Y) + movaps %xmm1, -12 * SIZE(X) + + movaps -10 * SIZE(X), %xmm0 + movaps -10 * SIZE(Y), %xmm1 + + movaps %xmm0, -10 * SIZE(Y) + movaps %xmm1, -10 * SIZE(X) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L14: + testq $2, M + jle .L15 + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + movaps -14 * SIZE(X), %xmm0 + movaps -14 * SIZE(Y), %xmm1 + + movaps %xmm0, -14 * SIZE(Y) + movaps %xmm1, -14 * SIZE(X) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L15: + testq $1, M + jle .L19 + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L19: + xorq %rax,%rax + + RESTOREREGISTERS + +#ifdef WINDOWS_ABI + popq %rbx +#endif + ret + ALIGN_3 + +.L20: + movhps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movlps %xmm1, -16 * 
SIZE(X) + decq M + jle .L29 + + movq M, %rax + sarq $3, %rax + jle .L23 + ALIGN_4 + +.L21: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -15 * SIZE(X), %xmm2 + movaps -14 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -15 * SIZE(X) + + movaps -13 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -14 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -13 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -11 * SIZE(X), %xmm2 + movaps -10 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -12 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -11 * SIZE(X) + + movaps -9 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -10 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -9 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -7 * SIZE(X), %xmm2 + movaps -6 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -8 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -7 * SIZE(X) + + movaps -5 * SIZE(X), %xmm0 + movaps -4 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -6 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -5 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -3 * SIZE(X), %xmm2 + movaps -2 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -4 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -3 * SIZE(X) + + movaps -1 * SIZE(X), %xmm0 + movaps 0 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -2 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -1 * SIZE(X) + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + + decq %rax + jg .L21 + ALIGN_3 + +.L23: + testq $4, M + jle .L24 + + movaps -15 * SIZE(X), %xmm2 + movaps -14 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -15 * SIZE(X) + + movaps -13 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -14 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -13 * SIZE(X) + + movaps -11 * SIZE(X), %xmm2 + movaps -10 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -12 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -11 * SIZE(X) + + movaps -9 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -10 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -9 * SIZE(X) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L24: + testq $2, M + jle .L25 + + movaps -15 * SIZE(X), %xmm2 + movaps -14 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -15 * SIZE(X) + + movaps -13 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -14 * SIZE(Y) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -13 * SIZE(X) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L25: + testq $1, M + jle .L29 + + movaps -15 * SIZE(X), %xmm2 + movaps -14 * SIZE(Y), %xmm3 + + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -15 * SIZE(X) + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L29: + movaps -15 * SIZE(X), %xmm2 + + movhps %xmm1, -15 * SIZE(X) + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(Y) + + xorq %rax,%rax + + RESTOREREGISTERS + +#ifdef WINDOWS_ABI + popq %rbx +#endif + + ret + ALIGN_3 + 
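
With unit strides, zswap_sse2.S picks one of four inner loops depending on whether X and Y fall on 16-byte boundaries: a fully aligned movaps path, two paths where only one pointer is off by a double (realigned on the fly with SHUFPD_1), and a path where both are off. A small C sketch of that dispatch follows; the enum and helper name are illustrative and not part of the kernel.

    #include <stdint.h>

    enum zswap_path { BOTH_ALIGNED, X_OFF, Y_OFF, BOTH_OFF };

    /* Mirrors the "testq $SIZE, Y" / "testq $SIZE, X" checks: the pointers
       are at least 8-byte aligned here, so testing the 16-byte boundary is
       a single bit test. */
    static enum zswap_path pick_path(const double *x, const double *y)
    {
        int x_off = ((uintptr_t)x & 15) != 0;
        int y_off = ((uintptr_t)y & 15) != 0;
        if (!x_off && !y_off) return BOTH_ALIGNED;  /* .L11 */
        if ( x_off && !y_off) return X_OFF;         /* .L20 */
        if (!x_off &&  y_off) return Y_OFF;         /* .L30 */
        return BOTH_OFF;                            /* .L40 */
    }
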
+.L30: + testq $SIZE, X + jne .L40 + + movhps -16 * SIZE(Y), %xmm0 + movaps -16 * SIZE(X), %xmm1 + + movlps %xmm1, -16 * SIZE(Y) + decq M + jle .L39 + + movq M, %rax + sarq $3, %rax + jle .L33 + ALIGN_4 + +.L31: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -15 * SIZE(Y), %xmm2 + movaps -14 * SIZE(X), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(X) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -15 * SIZE(Y) + + movaps -13 * SIZE(Y), %xmm0 + movaps -12 * SIZE(X), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -14 * SIZE(X) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -13 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -11 * SIZE(Y), %xmm2 + movaps -10 * SIZE(X), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -12 * SIZE(X) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -11 * SIZE(Y) + + movaps -9 * SIZE(Y), %xmm0 + movaps -8 * SIZE(X), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -10 * SIZE(X) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -9 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -7 * SIZE(Y), %xmm2 + movaps -6 * SIZE(X), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -8 * SIZE(X) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -7 * SIZE(Y) + + movaps -5 * SIZE(Y), %xmm0 + movaps -4 * SIZE(X), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -6 * SIZE(X) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -5 * SIZE(Y) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -3 * SIZE(Y), %xmm2 + movaps -2 * SIZE(X), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -4 * SIZE(X) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -3 * SIZE(Y) + + movaps -1 * SIZE(Y), %xmm0 + movaps 0 * SIZE(X), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -2 * SIZE(X) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -1 * SIZE(Y) + + subq $-16 * SIZE, X + subq $-16 * SIZE, Y + + decq %rax + jg .L31 + ALIGN_3 + +.L33: + testq $4, M + jle .L34 + + movaps -15 * SIZE(Y), %xmm2 + movaps -14 * SIZE(X), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(X) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -15 * SIZE(Y) + + movaps -13 * SIZE(Y), %xmm0 + movaps -12 * SIZE(X), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -14 * SIZE(X) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -13 * SIZE(Y) + + movaps -11 * SIZE(Y), %xmm2 + movaps -10 * SIZE(X), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -12 * SIZE(X) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -11 * SIZE(Y) + + movaps -9 * SIZE(Y), %xmm0 + movaps -8 * SIZE(X), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -10 * SIZE(X) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -9 * SIZE(Y) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L34: + testq $2, M + jle .L35 + + movaps -15 * SIZE(Y), %xmm2 + movaps -14 * SIZE(X), %xmm3 + + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(X) + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -15 * SIZE(Y) + + movaps -13 * SIZE(Y), %xmm0 + movaps -12 * SIZE(X), %xmm1 + + SHUFPD_1 %xmm0, %xmm2 + movaps %xmm2, -14 * SIZE(X) + SHUFPD_1 %xmm1, %xmm3 + movaps %xmm3, -13 * SIZE(Y) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L35: + testq $1, M + jle .L39 + + movaps -15 * SIZE(Y), %xmm2 + movaps -14 * SIZE(X), %xmm3 + + SHUFPD_1 %xmm3, %xmm1 + movaps %xmm1, -15 * SIZE(Y) + SHUFPD_1 %xmm2, %xmm0 + movaps %xmm0, -16 * SIZE(X) + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L39: + movaps -15 * SIZE(Y), %xmm2 + + movhps %xmm1, -15 * SIZE(Y) + SHUFPD_1 %xmm2, %xmm0 + movaps 
%xmm0, -16 * SIZE(X) + + xorq %rax,%rax + + RESTOREREGISTERS + +#ifdef WINDOWS_ABI + popq %rbx +#endif + + ret + ALIGN_3 + +.L40: + movsd -16 * SIZE(X), %xmm0 + movsd -16 * SIZE(Y), %xmm1 + + movlps %xmm0, -16 * SIZE(Y) + movlps %xmm1, -16 * SIZE(X) + + addq $SIZE, X + addq $SIZE, Y + decq M + jle .L49 + + movq M, %rax + sarq $3, %rax + jle .L43 + ALIGN_3 + +.L41: +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(X) +#endif + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + movaps -14 * SIZE(X), %xmm0 + movaps -14 * SIZE(Y), %xmm1 + + movaps %xmm0, -14 * SIZE(Y) + movaps %xmm1, -14 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 0) - PREOFFSET(Y) +#endif + + movaps -12 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + movaps %xmm0, -12 * SIZE(Y) + movaps %xmm1, -12 * SIZE(X) + + movaps -10 * SIZE(X), %xmm0 + movaps -10 * SIZE(Y), %xmm1 + + movaps %xmm0, -10 * SIZE(Y) + movaps %xmm1, -10 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(X) +#endif + + movaps -8 * SIZE(X), %xmm0 + movaps -8 * SIZE(Y), %xmm1 + + movaps %xmm0, -8 * SIZE(Y) + movaps %xmm1, -8 * SIZE(X) + + movaps -6 * SIZE(X), %xmm0 + movaps -6 * SIZE(Y), %xmm1 + + movaps %xmm0, -6 * SIZE(Y) + movaps %xmm1, -6 * SIZE(X) + +#ifdef PREFETCHW + PREFETCHW (PREFETCHSIZE + 64) - PREOFFSET(Y) +#endif + + movaps -4 * SIZE(X), %xmm0 + movaps -4 * SIZE(Y), %xmm1 + + movaps %xmm0, -4 * SIZE(Y) + movaps %xmm1, -4 * SIZE(X) + + movaps -2 * SIZE(X), %xmm0 + movaps -2 * SIZE(Y), %xmm1 + + movaps %xmm0, -2 * SIZE(Y) + movaps %xmm1, -2 * SIZE(X) + + subq $-16 * SIZE, Y + subq $-16 * SIZE, X + + decq %rax + jg .L41 + ALIGN_3 + +.L43: + testq $4, M + jle .L44 + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + movaps -14 * SIZE(X), %xmm0 + movaps -14 * SIZE(Y), %xmm1 + + movaps %xmm0, -14 * SIZE(Y) + movaps %xmm1, -14 * SIZE(X) + + movaps -12 * SIZE(X), %xmm0 + movaps -12 * SIZE(Y), %xmm1 + + movaps %xmm0, -12 * SIZE(Y) + movaps %xmm1, -12 * SIZE(X) + + movaps -10 * SIZE(X), %xmm0 + movaps -10 * SIZE(Y), %xmm1 + + movaps %xmm0, -10 * SIZE(Y) + movaps %xmm1, -10 * SIZE(X) + + addq $8 * SIZE, X + addq $8 * SIZE, Y + ALIGN_3 + +.L44: + testq $2, M + jle .L45 + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + movaps -14 * SIZE(X), %xmm0 + movaps -14 * SIZE(Y), %xmm1 + + movaps %xmm0, -14 * SIZE(Y) + movaps %xmm1, -14 * SIZE(X) + + addq $4 * SIZE, X + addq $4 * SIZE, Y + ALIGN_3 + +.L45: + testq $1, M + jle .L49 + + movaps -16 * SIZE(X), %xmm0 + movaps -16 * SIZE(Y), %xmm1 + + movaps %xmm0, -16 * SIZE(Y) + movaps %xmm1, -16 * SIZE(X) + + addq $2 * SIZE, X + addq $2 * SIZE, Y + ALIGN_3 + +.L49: + movsd -16 * SIZE(X), %xmm0 + movsd -16 * SIZE(Y), %xmm1 + + movlps %xmm0, -16 * SIZE(Y) + movlps %xmm1, -16 * SIZE(X) + + xorq %rax,%rax + + RESTOREREGISTERS + +#ifdef WINDOWS_ABI + popq %rbx +#endif + ret + ALIGN_3 + +.L50: + testq $SIZE, X + jne .L60 + testq $SIZE, Y + jne .L60 + + movq M, %rax + sarq $2, %rax + jle .L55 + ALIGN_3 + +.L51: + movaps (X), %xmm0 + movaps (Y), %xmm1 + + movaps %xmm1, (X) + addq INCX, X + movaps %xmm0, (Y) + addq INCY, Y + + movaps (X), %xmm0 + movaps (Y), %xmm1 + + movaps %xmm1, (X) + addq INCX, X + movaps %xmm0, (Y) + addq INCY, Y + + movaps (X), %xmm0 + movaps (Y), %xmm1 + + movaps %xmm1, (X) + addq INCX, X + movaps %xmm0, (Y) + addq INCY, Y + + movaps 
(X), %xmm0 + movaps (Y), %xmm1 + + movaps %xmm1, (X) + addq INCX, X + movaps %xmm0, (Y) + addq INCY, Y + + decq %rax + jg .L51 + ALIGN_3 + +.L55: + movq M, %rax + andq $3, %rax + jle .L57 + ALIGN_3 + +.L56: + movaps (X), %xmm0 + movaps (Y), %xmm1 + + movaps %xmm1, (X) + addq INCX, X + movaps %xmm0, (Y) + addq INCY, Y + + decq %rax + jg .L56 + ALIGN_3 + +.L57: + xorq %rax, %rax + + RESTOREREGISTERS + +#ifdef WINDOWS_ABI + popq %rbx +#endif + + ret + ALIGN_3 + +.L60: + movq M, %rax + sarq $2, %rax + jle .L65 + ALIGN_3 + +.L61: + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + movsd 0 * SIZE(Y), %xmm1 + movhps 1 * SIZE(Y), %xmm1 + + movlps %xmm1, 0 * SIZE(X) + movhps %xmm1, 1 * SIZE(X) + addq INCX, X + movlps %xmm0, 0 * SIZE(Y) + movhps %xmm0, 1 * SIZE(Y) + addq INCY, Y + + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + movsd 0 * SIZE(Y), %xmm1 + movhps 1 * SIZE(Y), %xmm1 + + movlps %xmm1, 0 * SIZE(X) + movhps %xmm1, 1 * SIZE(X) + addq INCX, X + movlps %xmm0, 0 * SIZE(Y) + movhps %xmm0, 1 * SIZE(Y) + addq INCY, Y + + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + movsd 0 * SIZE(Y), %xmm1 + movhps 1 * SIZE(Y), %xmm1 + + movlps %xmm1, 0 * SIZE(X) + movhps %xmm1, 1 * SIZE(X) + addq INCX, X + movlps %xmm0, 0 * SIZE(Y) + movhps %xmm0, 1 * SIZE(Y) + addq INCY, Y + + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + movsd 0 * SIZE(Y), %xmm1 + movhps 1 * SIZE(Y), %xmm1 + + movlps %xmm1, 0 * SIZE(X) + movhps %xmm1, 1 * SIZE(X) + addq INCX, X + movlps %xmm0, 0 * SIZE(Y) + movhps %xmm0, 1 * SIZE(Y) + addq INCY, Y + + decq %rax + jg .L61 + ALIGN_3 + +.L65: + movq M, %rax + andq $3, %rax + jle .L67 + ALIGN_3 + +.L66: + movsd 0 * SIZE(X), %xmm0 + movhps 1 * SIZE(X), %xmm0 + movsd 0 * SIZE(Y), %xmm1 + movhps 1 * SIZE(Y), %xmm1 + + movlps %xmm1, 0 * SIZE(X) + movhps %xmm1, 1 * SIZE(X) + addq INCX, X + movlps %xmm0, 0 * SIZE(Y) + movhps %xmm0, 1 * SIZE(Y) + addq INCY, Y + + decq %rax + jg .L66 + ALIGN_3 + +.L67: + xorq %rax, %rax + + RESTOREREGISTERS + +#ifdef WINDOWS_ABI + popq %rbx +#endif + + ret + + EPILOGUE diff --git a/kernel/x86_64/zsymv_L_sse.S b/kernel/x86_64/zsymv_L_sse.S new file mode 100644 index 0000000000..39f0ff46f4 --- /dev/null +++ b/kernel/x86_64/zsymv_L_sse.S @@ -0,0 +1,814 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef ATOM +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 24) +#endif + +#ifdef CORE2 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 24) +#endif + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 24) +#endif + +#ifdef NEHALEM +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 24) +#endif + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 28) +#endif + +#ifdef OPTERON +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 12) +#define movsd movlpd +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 24) +#endif + +#ifdef GENERIC +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 12) +#endif + +#ifndef WINDOWS_ABI + +#define STACKSIZE 80 + +#define OLD_Y 8 + STACKSIZE(%rsp) +#define OLD_INCY 16 + STACKSIZE(%rsp) +#define OLD_BUFFER 24 + STACKSIZE(%rsp) + +#define M ARG1 +#define N ARG2 +#define A ARG3 +#define LDA ARG4 +#define X ARG5 +#define INCX ARG6 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_LDA 48 + STACKSIZE(%rsp) +#define OLD_X 56 + STACKSIZE(%rsp) +#define OLD_INCX 64 + STACKSIZE(%rsp) +#define OLD_Y 72 + STACKSIZE(%rsp) +#define OLD_INCY 80 + STACKSIZE(%rsp) +#define OLD_BUFFER 88 + STACKSIZE(%rsp) + +#define M ARG1 +#define N ARG2 +#define A ARG4 +#define LDA ARG3 +#define X %rdi +#define INCX %rsi +#endif + +#define Y %r10 +#define INCY %r11 +#define BUFFER %r12 + +#define TEMP %rax +#define I %rax +#define A1 %rbx +#define A2 %rbp +#define XX %r13 +#define YY %r14 +#define IS %r15 +#define NEW_X BUFFER +#define NEW_Y X + +#define ALPHA_R %xmm0 +#define ALPHA_I %xmm1 + +#define xtemp1 %xmm0 +#define xtemp2 %xmm1 +#define xtemp3 %xmm2 +#define xtemp4 %xmm3 + +#define atemp1 %xmm4 +#define atemp2 %xmm5 +#define atemp3 %xmm6 +#define atemp4 %xmm7 + +#define xsum1 %xmm8 +#define xsum2 %xmm9 +#define yy1 %xmm10 +#define yy2 %xmm11 + +#define a1 %xmm12 +#define a2 %xmm13 +#define a3 %xmm14 +#define xt1 %xmm15 + +#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) +#define MOVDDUP(a, b, c) movddup a(b), c +#define MOVDDUP2(a, b, c) movddup a##b, c +#else +#define MOVDDUP(a, b, c) movlpd a(b), c;movhpd a(b), c +#define MOVDDUP2(a, b, c) movlpd a##b, c;movhpd a##b, c +#endif + + 
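
The MOVDDUP(offset, base, reg) macro defined just above broadcasts one double into both lanes of an XMM register: SSE3-capable cores (and the Barcelona/Shanghai family) get a single movddup, everything else a movlpd/movhpd pair from the same address. The intrinsics sketch below shows the same choice in C; it is illustrative only, the kernel itself stays in assembly.

    #include <emmintrin.h>              /* SSE2: _mm_loadl_pd / _mm_loadh_pd */
    #ifdef __SSE3__
    #include <pmmintrin.h>              /* SSE3: _mm_loaddup_pd */
    #endif

    static inline __m128d load_dup(const double *p)
    {
    #ifdef __SSE3__
        return _mm_loaddup_pd(p);                   /* movddup           */
    #else
        __m128d v = _mm_setzero_pd();
        v = _mm_loadl_pd(v, p);                     /* movlpd: low lane  */
        v = _mm_loadh_pd(v, p);                     /* movhpd: high lane */
        return v;
    #endif
    }
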
PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + movq OLD_Y, Y + movq OLD_INCY, INCY + movq OLD_BUFFER, BUFFER + + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + salq $ZBASE_SHIFT, LDA + + testq M, M + jle .L999 + + pcmpeqb %xmm2, %xmm2 + xorpd %xmm3, %xmm3 + psllq $63, %xmm2 + unpcklpd %xmm3, %xmm2 + + unpcklpd ALPHA_I, ALPHA_R + unpcklpd ALPHA_R, ALPHA_I + xorpd %xmm2, ALPHA_I + + movq BUFFER, XX + + movq M, %rax + sarq $2, %rax + jle .L02 + ALIGN_3 + +.L01: + MOVDDUP(0 * SIZE, X, %xmm3) + MOVDDUP(1 * SIZE, X, %xmm4) + addq INCX, X + MOVDDUP(0 * SIZE, X, %xmm5) + MOVDDUP(1 * SIZE, X, %xmm6) + addq INCX, X + + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm4 + mulpd ALPHA_R, %xmm5 + mulpd ALPHA_I, %xmm6 + + addpd %xmm4, %xmm3 + addpd %xmm6, %xmm5 + + movapd %xmm3, 0 * SIZE(XX) + SHUFPD_1 %xmm3, %xmm3 + pxor %xmm2, %xmm3 + movapd %xmm3, 2 * SIZE(XX) + + movapd %xmm5, 4 * SIZE(XX) + SHUFPD_1 %xmm5, %xmm5 + pxor %xmm2, %xmm5 + movapd %xmm5, 6 * SIZE(XX) + + MOVDDUP(0 * SIZE, X, %xmm3) + MOVDDUP(1 * SIZE, X, %xmm4) + addq INCX, X + MOVDDUP(0 * SIZE, X, %xmm5) + MOVDDUP(1 * SIZE, X, %xmm6) + addq INCX, X + + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm4 + mulpd ALPHA_R, %xmm5 + mulpd ALPHA_I, %xmm6 + + addpd %xmm4, %xmm3 + addpd %xmm6, %xmm5 + + movapd %xmm3, 8 * SIZE(XX) + SHUFPD_1 %xmm3, %xmm3 + pxor %xmm2, %xmm3 + movapd %xmm3, 10 * SIZE(XX) + + movapd %xmm5, 12 * SIZE(XX) + SHUFPD_1 %xmm5, %xmm5 + pxor %xmm2, %xmm5 + movapd %xmm5, 14 * SIZE(XX) + + subq $-16 * SIZE, XX + decq %rax + jg .L01 + ALIGN_3 + +.L02: + movq M, %rax + andq $3, %rax + jle .L05 + ALIGN_3 + +.L03: + MOVDDUP(0 * SIZE, X, %xmm3) + MOVDDUP(1 * SIZE, X, %xmm4) + addq INCX, X + + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm4 + + addpd %xmm4, %xmm3 + + movapd %xmm3, 0 * SIZE(XX) + SHUFPD_1 %xmm3, %xmm3 + pxor %xmm2, %xmm3 + movapd %xmm3, 2 * SIZE(XX) + + addq $4 * SIZE, XX + decq %rax + jg .L03 + ALIGN_3 + +.L05: + /* now we don't need original X */ + movq Y, NEW_Y + + addq $512, XX + andq $-512, XX + + cmpq $2 * SIZE, INCY + je .L10 + + movq Y, YY + movq XX, NEW_Y + + movq M, %rax + sarq $2, %rax + jle .L07 + ALIGN_3 + +.L06: + movsd 0 * SIZE(YY), %xmm0 + movhpd 1 * SIZE(YY), %xmm0 + addq INCY, YY + movsd 0 * SIZE(YY), %xmm1 + movhpd 1 * SIZE(YY), %xmm1 + addq INCY, YY + movsd 0 * SIZE(YY), %xmm2 + movhpd 1 * SIZE(YY), %xmm2 + addq INCY, YY + movsd 0 * SIZE(YY), %xmm3 + movhpd 1 * SIZE(YY), %xmm3 + addq INCY, YY + + movapd %xmm0, 0 * SIZE(XX) + movapd %xmm1, 2 * SIZE(XX) + movapd %xmm2, 4 * SIZE(XX) + movapd %xmm3, 6 * SIZE(XX) + + addq $8 * SIZE, XX + decq %rax + jg .L06 + ALIGN_3 + +.L07: + movq M, %rax + andq $3, %rax + jle .L10 + ALIGN_3 + +.L08: + movsd 0 * SIZE(YY), %xmm0 + movhpd 1 * SIZE(YY), %xmm0 + addq INCY, YY + + movapd %xmm0, 0 * SIZE(XX) + + addq $2 * SIZE, XX + decq %rax + jg .L08 + ALIGN_3 + +.L10: + xorq IS, IS # is = 0 + + cmpq $2, N + jl .L20 + ALIGN_3 + +.L11: + movq A, A1 + leaq (A, LDA, 1), A2 + leaq 4 * SIZE(A, LDA, 2), A + + leaq (, IS, SIZE), I + + leaq 0 * SIZE(NEW_X, I, 4), XX + leaq 4 * SIZE(NEW_Y, I, 2), YY + + movapd 0 * SIZE(XX), atemp1 + movapd 2 * SIZE(XX), atemp2 + movapd 4 * SIZE(XX), atemp3 + movapd 6 * SIZE(XX), atemp4 + + MOVDDUP(0 * SIZE, A1, xsum1) + MOVDDUP(2 * SIZE, A1, xsum2) + + mulpd atemp1, xsum1 + mulpd atemp1, xsum2 + + MOVDDUP(1 * SIZE, A1, a1) + MOVDDUP(3 * SIZE, A1, a2) + + mulpd atemp2, a1 + mulpd atemp2, a2 + addpd a1, xsum1 + addpd a2, xsum2 + 
+ MOVDDUP(2 * SIZE, A1, a1) + MOVDDUP(2 * SIZE, A2, a2) + + mulpd atemp3, a1 + mulpd atemp3, a2 + addpd a1, xsum1 + addpd a2, xsum2 + + MOVDDUP(3 * SIZE, A1, a1) + MOVDDUP(3 * SIZE, A2, a2) + + mulpd atemp4, a1 + mulpd atemp4, a2 + addpd a1, xsum1 + addpd a2, xsum2 + + MOVDDUP(4 * SIZE, A1, a1) + MOVDDUP(6 * SIZE, A2, a2) + + movsd 0 * SIZE(YY), yy1 + movhpd 1 * SIZE(YY), yy1 + movsd 2 * SIZE(YY), yy2 + movhpd 3 * SIZE(YY), yy2 + + movapd 8 * SIZE(XX), xtemp1 + movapd 10 * SIZE(XX), xtemp2 + movapd 12 * SIZE(XX), xtemp3 + movapd 14 * SIZE(XX), xtemp4 + + addq $8 * SIZE, XX + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + + movq M, I + subq IS, I + subq $2, I + sarq $2, I + jle .L15 + ALIGN_3 + +.L12: + movapd xtemp1, xt1 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy1 + MOVDDUP(1 * SIZE, A1, a1) + + PREFETCH PREFETCHSIZE(A1) + + movapd xtemp3, xt1 + mulpd a2, xt1 + mulpd atemp3, a2 + addpd xt1, xsum2 + addpd a2, yy2 + MOVDDUP(3 * SIZE, A2, a2) + + movapd xtemp2, xt1 + mulpd a1, xt1 + mulpd atemp2, a1 + addpd xt1, xsum1 + addpd a1, yy1 + MOVDDUP(2 * SIZE, A1, a1) + + movapd xtemp4, xt1 + mulpd a2, xt1 + mulpd atemp4, a2 + addpd xt1, xsum2 + addpd a2, yy2 + MOVDDUP(0 * SIZE, A2, a2) + + PREFETCH PREFETCHSIZE(XX) + + movapd xtemp3, xt1 + movapd 12 * SIZE(XX), xtemp3 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy2 + MOVDDUP(3 * SIZE, A1, a1) + + movapd xtemp1, xt1 + movapd 8 * SIZE(XX), xtemp1 + mulpd a2, xt1 + mulpd atemp3, a2 + addpd xt1, xsum2 + addpd a2, yy1 + MOVDDUP(1 * SIZE, A2, a2) + + movapd xtemp4, xt1 + movapd 14 * SIZE(XX), xtemp4 + mulpd a1, xt1 + mulpd atemp2, a1 + addpd xt1, xsum1 + addpd a1, yy2 + MOVDDUP(4 * SIZE, A1, a1) + + movlpd yy2, 2 * SIZE(YY) + movhpd yy2, 3 * SIZE(YY) + movsd 6 * SIZE(YY), yy2 + movhpd 7 * SIZE(YY), yy2 + + movapd xtemp2, xt1 + movapd 10 * SIZE(XX), xtemp2 + mulpd a2, xt1 + mulpd atemp4, a2 + addpd xt1, xsum2 + addpd a2, yy1 + MOVDDUP(6 * SIZE, A2, a2) + + PREFETCH PREFETCHSIZE(A2) + + movlpd yy1, 0 * SIZE(YY) + movhpd yy1, 1 * SIZE(YY) + movsd 4 * SIZE(YY), yy1 + movhpd 5 * SIZE(YY), yy1 + + movapd xtemp1, xt1 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy1 + MOVDDUP(5 * SIZE, A1, a1) + + movapd xtemp3, xt1 + mulpd a2, xt1 + mulpd atemp3, a2 + addpd xt1, xsum2 + addpd a2, yy2 + MOVDDUP(7 * SIZE, A2, a2) + + movapd xtemp2, xt1 + mulpd a1, xt1 + mulpd atemp2, a1 + addpd xt1, xsum1 + addpd a1, yy1 + MOVDDUP(6 * SIZE, A1, a1) + + PREFETCHW PREFETCHSIZE(YY) + + movapd xtemp4, xt1 + mulpd a2, xt1 + mulpd atemp4, a2 + addpd xt1, xsum2 + addpd a2, yy2 + MOVDDUP(4 * SIZE, A2, a2) + + movapd xtemp3, xt1 + movapd 20 * SIZE(XX), xtemp3 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy2 + MOVDDUP(7 * SIZE, A1, a1) + + movapd xtemp1, xt1 + movapd 16 * SIZE(XX), xtemp1 + mulpd a2, xt1 + mulpd atemp3, a2 + addpd xt1, xsum2 + addpd a2, yy1 + MOVDDUP(5 * SIZE, A2, a2) + + movapd xtemp4, xt1 + movapd 22 * SIZE(XX), xtemp4 + mulpd a1, xt1 + mulpd atemp2, a1 + addpd xt1, xsum1 + addpd a1, yy2 + MOVDDUP( 8 * SIZE, A1, a1) + + movlpd yy2, 6 * SIZE(YY) + movhpd yy2, 7 * SIZE(YY) + movsd 10 * SIZE(YY), yy2 + movhpd 11 * SIZE(YY), yy2 + + movapd xtemp2, xt1 + movapd 18 * SIZE(XX), xtemp2 + mulpd a2, xt1 + mulpd atemp4, a2 + addpd xt1, xsum2 + addpd a2, yy1 + MOVDDUP(10 * SIZE, A2, a2) + + movlpd yy1, 4 * SIZE(YY) + movhpd yy1, 5 * SIZE(YY) + movsd 8 * SIZE(YY), yy1 + movhpd 9 * SIZE(YY), yy1 + + subq $-16 * SIZE, XX + addq $ 8 * SIZE, YY + addq $ 8 * SIZE, A1 + addq $ 8 * SIZE, A2 + + decq I + jg 
.L12 + ALIGN_3 + +.L15: + movq M, I + subq IS, I + subq $2, I + testq $2, I + jle .L16 + + movapd xtemp1, xt1 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy1 + MOVDDUP(1 * SIZE, A1, a1) + + movapd xtemp3, xt1 + mulpd a2, xt1 + mulpd atemp3, a2 + addpd xt1, xsum2 + addpd a2, yy2 + MOVDDUP(3 * SIZE, A2, a2) + + movapd xtemp2, xt1 + mulpd a1, xt1 + mulpd atemp2, a1 + addpd xt1, xsum1 + addpd a1, yy1 + MOVDDUP(2 * SIZE, A1, a1) + + movapd xtemp4, xt1 + mulpd a2, xt1 + mulpd atemp4, a2 + addpd xt1, xsum2 + addpd a2, yy2 + MOVDDUP(0 * SIZE, A2, a2) + + movapd xtemp3, xt1 + movapd 12 * SIZE(XX), xtemp3 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy2 + MOVDDUP(3 * SIZE, A1, a1) + + movapd xtemp1, xt1 + movapd 8 * SIZE(XX), xtemp1 + mulpd a2, xt1 + mulpd atemp3, a2 + addpd xt1, xsum2 + addpd a2, yy1 + MOVDDUP(1 * SIZE, A2, a2) + + movapd xtemp4, xt1 + movapd 14 * SIZE(XX), xtemp4 + mulpd a1, xt1 + mulpd atemp2, a1 + addpd xt1, xsum1 + addpd a1, yy2 + MOVDDUP(4 * SIZE, A1, a1) + + movlpd yy2, 2 * SIZE(YY) + movhpd yy2, 3 * SIZE(YY) + movsd 6 * SIZE(YY), yy2 + movhpd 7 * SIZE(YY), yy2 + + movapd xtemp2, xt1 + movapd 10 * SIZE(XX), xtemp2 + mulpd a2, xt1 + mulpd atemp4, a2 + addpd xt1, xsum2 + addpd a2, yy1 + + movlpd yy1, 0 * SIZE(YY) + movhpd yy1, 1 * SIZE(YY) + movsd 4 * SIZE(YY), yy1 + movhpd 5 * SIZE(YY), yy1 + + addq $4 * SIZE, YY + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + ALIGN_3 + +.L16: + testq $1, M + jle .L18 + + MOVDDUP(1 * SIZE, A1, a2) + + movapd xtemp1, xt1 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy1 + + MOVDDUP(0 * SIZE, A2, a1) + + movapd xtemp2, xt1 + mulpd a2, xt1 + mulpd atemp2, a2 + addpd xt1, xsum1 + addpd a2, yy1 + + MOVDDUP(1 * SIZE, A2, a2) + + movapd xtemp1, xt1 + mulpd a1, xt1 + mulpd atemp3, a1 + addpd xt1, xsum2 + addpd a1, yy1 + + movapd xtemp2, xt1 + mulpd a2, xt1 + mulpd atemp4, a2 + addpd xt1, xsum2 + addpd a2, yy1 + + movlpd yy1, 0 * SIZE(YY) + movhpd yy1, 1 * SIZE(YY) + ALIGN_3 + +.L18: + leaq (, IS, SIZE), I + + movsd 0 * SIZE(NEW_Y, I, 2), yy1 + movhpd 1 * SIZE(NEW_Y, I, 2), yy1 + movsd 2 * SIZE(NEW_Y, I, 2), yy2 + movhpd 3 * SIZE(NEW_Y, I, 2), yy2 + + addpd xsum1, yy1 + addpd xsum2, yy2 + + movlpd yy1, 0 * SIZE(NEW_Y, I, 2) + movhpd yy1, 1 * SIZE(NEW_Y, I, 2) + movlpd yy2, 2 * SIZE(NEW_Y, I, 2) + movhpd yy2, 3 * SIZE(NEW_Y, I, 2) + + addq $2, IS + + movq IS, I + addq $2, I + cmpq M, I + jle .L11 + ALIGN_3 + +.L20: + HALT + testq $1, N + jle .L990 + + leaq (, IS, SIZE), I + + movapd 0 * SIZE(NEW_X, I, 4), atemp1 + movapd 2 * SIZE(NEW_X, I, 4), atemp2 + + movsd 0 * SIZE(NEW_Y, I, 2), yy1 + movhpd 1 * SIZE(NEW_Y, I, 2), yy1 + + MOVDDUP(0 * SIZE, A, a1) + MOVDDUP(1 * SIZE, A, a2) + + mulpd atemp1, a1 + mulpd atemp2, a2 + addpd a1, yy1 + addpd a2, yy1 + + movlpd yy1, 0 * SIZE(NEW_Y, I, 2) + movhpd yy1, 1 * SIZE(NEW_Y, I, 2) + ALIGN_3 + +.L990: + cmpq $2 * SIZE, INCY + je .L999 + + movq M, %rax + sarq $2, %rax + jle .L997 + ALIGN_3 + +.L996: + movapd 0 * SIZE(NEW_Y), %xmm0 + movapd 2 * SIZE(NEW_Y), %xmm1 + movapd 4 * SIZE(NEW_Y), %xmm2 + movapd 6 * SIZE(NEW_Y), %xmm3 + + movsd %xmm0, 0 * SIZE(Y) + movhpd %xmm0, 1 * SIZE(Y) + addq INCY, Y + movsd %xmm1, 0 * SIZE(Y) + movhpd %xmm1, 1 * SIZE(Y) + addq INCY, Y + movsd %xmm2, 0 * SIZE(Y) + movhpd %xmm2, 1 * SIZE(Y) + addq INCY, Y + movsd %xmm3, 0 * SIZE(Y) + movhpd %xmm3, 1 * SIZE(Y) + addq INCY, Y + + addq $8 * SIZE, NEW_Y + decq %rax + jg .L996 + ALIGN_3 + +.L997: + movq M, %rax + andq $3, %rax + jle .L999 + ALIGN_3 + +.L998: + movapd 0 * SIZE(NEW_Y), 
%xmm0 + + movsd %xmm0, 0 * SIZE(Y) + movhpd %xmm0, 1 * SIZE(Y) + addq INCY, Y + + addq $2 * SIZE, NEW_Y + + decq %rax + jg .L998 + ALIGN_3 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + addq $STACKSIZE, %rsp + ret + EPILOGUE diff --git a/kernel/x86_64/zsymv_L_sse2.S b/kernel/x86_64/zsymv_L_sse2.S new file mode 100644 index 0000000000..7119077114 --- /dev/null +++ b/kernel/x86_64/zsymv_L_sse2.S @@ -0,0 +1,886 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
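
The zsymv_L files added here compute the symmetric complex matrix-vector update y += alpha*A*x using only one stored triangle of A (by the kernel naming, the lower one), with x pre-scaled by alpha into a scratch buffer and two columns of A processed per pass. The plain C reference below states the arithmetic only; the name, signature and loop order are illustrative, and the assembly differs in its blocking and buffering.

    #include <complex.h>
    #include <stddef.h>

    /* Reference: y += alpha * A * x, A complex symmetric, lower triangle
       stored column-major with leading dimension lda. */
    static void zsymv_lower_ref(size_t n, double complex alpha,
                                const double complex *a, size_t lda,
                                const double complex *x, double complex *y)
    {
        for (size_t j = 0; j < n; j++) {
            double complex xj  = alpha * x[j];
            double complex sum = 0.0;
            y[j] += a[j + j * lda] * xj;            /* diagonal term        */
            for (size_t i = j + 1; i < n; i++) {    /* strictly lower part  */
                y[i] += a[i + j * lda] * xj;        /* column j hits y[i]   */
                sum  += a[i + j * lda] * x[i];      /* row j, by symmetry   */
            }
            y[j] += alpha * sum;
        }
    }
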
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef ATOM +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 24) +#endif + +#ifdef CORE2 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 24) +#endif + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 24) +#endif + +#ifdef NEHALEM +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 24) +#endif + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 28) +#endif + +#ifdef OPTERON +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 12) +#define movsd movlpd +#endif + +#if defined(BARCELONA) || defined(SHANGHAI) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 16) +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (8 * 24) +#endif + +#ifdef GENERIC +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 12) +#endif + +#ifndef WINDOWS_ABI + +#define STACKSIZE 80 + +#define OLD_Y 8 + STACKSIZE(%rsp) +#define OLD_INCY 16 + STACKSIZE(%rsp) +#define OLD_BUFFER 24 + STACKSIZE(%rsp) + +#define M ARG1 +#define N ARG2 +#define A ARG3 +#define LDA ARG4 +#define X ARG5 +#define INCX ARG6 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_LDA 48 + STACKSIZE(%rsp) +#define OLD_X 56 + STACKSIZE(%rsp) +#define OLD_INCX 64 + STACKSIZE(%rsp) +#define OLD_Y 72 + STACKSIZE(%rsp) +#define OLD_INCY 80 + STACKSIZE(%rsp) +#define OLD_BUFFER 88 + STACKSIZE(%rsp) + +#define M ARG1 +#define N ARG2 +#define A ARG4 +#define LDA ARG3 +#define X %rdi +#define INCX %rsi + +#endif + +#define Y %r10 +#define INCY %r11 +#define BUFFER %r12 + +#define TEMP %rax +#define I %rax +#define A1 %rbx +#define A2 %rbp +#define XX %r13 +#define YY %r14 +#define IS %r15 +#define NEW_X BUFFER +#define NEW_Y X + +#define ALPHA_R %xmm0 +#define ALPHA_I %xmm1 + +#define xtemp1 %xmm0 +#define xtemp2 %xmm1 +#define xtemp3 %xmm2 +#define xtemp4 %xmm3 + +#define atemp1 %xmm4 +#define atemp2 %xmm5 +#define atemp3 %xmm6 +#define atemp4 %xmm7 + +#define xsum1 %xmm8 +#define xsum2 %xmm9 +#define yy1 %xmm10 +#define yy2 %xmm11 + +#define a1 %xmm12 +#define a2 %xmm13 +#define a3 %xmm14 +#define xt1 %xmm15 + +#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) +#define MOVDDUP(a, b, c) movddup a(b), c +#define MOVDDUP2(a, b, c) movddup a##b, c +#else +#define MOVDDUP(a, b, c) movlpd a(b), c;movhpd a(b), c +#define MOVDDUP2(a, b, c) movlpd a##b, c;movhpd a##b, c +#endif + +#ifndef HEMV +#define ADD addpd +#else +#define ADD subpd +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq OLD_A, A + movq OLD_LDA, LDA + movq OLD_X, X + movq OLD_INCX, INCX + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 +#endif + + movq 
OLD_Y, Y + movq OLD_INCY, INCY + movq OLD_BUFFER, BUFFER + + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + salq $ZBASE_SHIFT, LDA + + testq M, M + jle .L999 + + pcmpeqb %xmm2, %xmm2 + xorpd %xmm3, %xmm3 + psllq $63, %xmm2 + unpcklpd %xmm3, %xmm2 + + unpcklpd ALPHA_I, ALPHA_R + unpcklpd ALPHA_R, ALPHA_I + xorpd %xmm2, ALPHA_I + + movq BUFFER, XX + + movq M, %rax + sarq $2, %rax + jle .L02 + ALIGN_3 + +.L01: + MOVDDUP(0 * SIZE, X, %xmm3) + MOVDDUP(1 * SIZE, X, %xmm4) + addq INCX, X + MOVDDUP(0 * SIZE, X, %xmm5) + MOVDDUP(1 * SIZE, X, %xmm6) + addq INCX, X + + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm4 + mulpd ALPHA_R, %xmm5 + mulpd ALPHA_I, %xmm6 + + addpd %xmm4, %xmm3 + addpd %xmm6, %xmm5 + + movapd %xmm3, 0 * SIZE(XX) + SHUFPD_1 %xmm3, %xmm3 + pxor %xmm2, %xmm3 + movapd %xmm3, 2 * SIZE(XX) + + movapd %xmm5, 4 * SIZE(XX) + SHUFPD_1 %xmm5, %xmm5 + pxor %xmm2, %xmm5 + movapd %xmm5, 6 * SIZE(XX) + + MOVDDUP(0 * SIZE, X, %xmm3) + MOVDDUP(1 * SIZE, X, %xmm4) + addq INCX, X + MOVDDUP(0 * SIZE, X, %xmm5) + MOVDDUP(1 * SIZE, X, %xmm6) + addq INCX, X + + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm4 + mulpd ALPHA_R, %xmm5 + mulpd ALPHA_I, %xmm6 + + addpd %xmm4, %xmm3 + addpd %xmm6, %xmm5 + + movapd %xmm3, 8 * SIZE(XX) + SHUFPD_1 %xmm3, %xmm3 + pxor %xmm2, %xmm3 + movapd %xmm3, 10 * SIZE(XX) + + movapd %xmm5, 12 * SIZE(XX) + SHUFPD_1 %xmm5, %xmm5 + pxor %xmm2, %xmm5 + movapd %xmm5, 14 * SIZE(XX) + + subq $-16 * SIZE, XX + decq %rax + jg .L01 + ALIGN_3 + +.L02: + movq M, %rax + andq $3, %rax + jle .L05 + ALIGN_3 + +.L03: + MOVDDUP(0 * SIZE, X, %xmm3) + MOVDDUP(1 * SIZE, X, %xmm4) + addq INCX, X + + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm4 + + addpd %xmm4, %xmm3 + + movapd %xmm3, 0 * SIZE(XX) + SHUFPD_1 %xmm3, %xmm3 + pxor %xmm2, %xmm3 + movapd %xmm3, 2 * SIZE(XX) + + addq $4 * SIZE, XX + decq %rax + jg .L03 + ALIGN_3 + +.L05: + /* now we don't need original X */ + movq Y, NEW_Y + + addq $512, XX + andq $-512, XX + + cmpq $2 * SIZE, INCY + je .L10 + + movq Y, YY + movq XX, NEW_Y + + movq M, %rax + sarq $2, %rax + jle .L07 + ALIGN_3 + +.L06: + movsd 0 * SIZE(YY), %xmm0 + movhpd 1 * SIZE(YY), %xmm0 + addq INCY, YY + movsd 0 * SIZE(YY), %xmm1 + movhpd 1 * SIZE(YY), %xmm1 + addq INCY, YY + movsd 0 * SIZE(YY), %xmm2 + movhpd 1 * SIZE(YY), %xmm2 + addq INCY, YY + movsd 0 * SIZE(YY), %xmm3 + movhpd 1 * SIZE(YY), %xmm3 + addq INCY, YY + + movapd %xmm0, 0 * SIZE(XX) + movapd %xmm1, 2 * SIZE(XX) + movapd %xmm2, 4 * SIZE(XX) + movapd %xmm3, 6 * SIZE(XX) + + addq $8 * SIZE, XX + decq %rax + jg .L06 + ALIGN_3 + +.L07: + movq M, %rax + andq $3, %rax + jle .L10 + ALIGN_3 + +.L08: + movsd 0 * SIZE(YY), %xmm0 + movhpd 1 * SIZE(YY), %xmm0 + addq INCY, YY + + movapd %xmm0, 0 * SIZE(XX) + + addq $2 * SIZE, XX + decq %rax + jg .L08 + ALIGN_3 + +.L10: + xorq IS, IS # is = 0 + + cmpq $2, N + jl .L20 + ALIGN_3 + +.L11: + movq A, A1 + leaq (A, LDA, 1), A2 + leaq 4 * SIZE(A, LDA, 2), A + + leaq (, IS, SIZE), I + + leaq 0 * SIZE(NEW_X, I, 4), XX + leaq 4 * SIZE(NEW_Y, I, 2), YY + + movapd 0 * SIZE(XX), atemp1 + movapd 2 * SIZE(XX), atemp2 + movapd 4 * SIZE(XX), atemp3 + movapd 6 * SIZE(XX), atemp4 + + MOVDDUP(0 * SIZE, A1, xsum1) + MOVDDUP(2 * SIZE, A1, xsum2) + + mulpd atemp1, xsum1 + mulpd atemp1, xsum2 + +#ifndef HEMV + MOVDDUP(1 * SIZE, A1, a1) + MOVDDUP(3 * SIZE, A1, a2) + + mulpd atemp2, a1 + mulpd atemp2, a2 + addpd a1, xsum1 + addpd a2, xsum2 +#else + MOVDDUP(3 * SIZE, A1, a2) + + mulpd atemp2, a2 + addpd a2, xsum2 +#endif + + MOVDDUP(2 * SIZE, A1, a1) + MOVDDUP(2 * SIZE, A2, a2) + + mulpd atemp3, a1 + mulpd 
atemp3, a2 + addpd a1, xsum1 + addpd a2, xsum2 + +#ifndef HEMV + MOVDDUP(3 * SIZE, A1, a1) + MOVDDUP(3 * SIZE, A2, a2) + + mulpd atemp4, a1 + mulpd atemp4, a2 + addpd a1, xsum1 + addpd a2, xsum2 +#else + MOVDDUP(3 * SIZE, A1, a1) + + mulpd atemp4, a1 + subpd a1, xsum1 +#endif + + MOVDDUP(4 * SIZE, A1, a1) + MOVDDUP(6 * SIZE, A2, a2) + + movsd 0 * SIZE(YY), yy1 + movhpd 1 * SIZE(YY), yy1 + movsd 2 * SIZE(YY), yy2 + movhpd 3 * SIZE(YY), yy2 + + movapd 8 * SIZE(XX), xtemp1 + movapd 10 * SIZE(XX), xtemp2 + movapd 12 * SIZE(XX), xtemp3 + movapd 14 * SIZE(XX), xtemp4 + + addq $8 * SIZE, XX + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + + movq M, I + subq IS, I + subq $2, I + sarq $2, I + jle .L15 + ALIGN_3 + +.L12: + movapd xtemp1, xt1 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy1 + MOVDDUP(1 * SIZE, A1, a1) + + PREFETCH PREFETCHSIZE(A1) + + movapd xtemp3, xt1 + mulpd a2, xt1 + mulpd atemp3, a2 + addpd xt1, xsum2 + addpd a2, yy2 + MOVDDUP(3 * SIZE, A2, a2) + + movapd xtemp2, xt1 + mulpd a1, xt1 + mulpd atemp2, a1 + ADD xt1, xsum1 + addpd a1, yy1 + MOVDDUP(2 * SIZE, A1, a1) + + movapd xtemp4, xt1 + mulpd a2, xt1 + mulpd atemp4, a2 + ADD xt1, xsum2 + addpd a2, yy2 + MOVDDUP(0 * SIZE, A2, a2) + + PREFETCH PREFETCHSIZE(XX) + + movapd xtemp3, xt1 + movapd 12 * SIZE(XX), xtemp3 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy2 + MOVDDUP(3 * SIZE, A1, a1) + + movapd xtemp1, xt1 + movapd 8 * SIZE(XX), xtemp1 + mulpd a2, xt1 + mulpd atemp3, a2 + addpd xt1, xsum2 + addpd a2, yy1 + MOVDDUP(1 * SIZE, A2, a2) + + movapd xtemp4, xt1 + movapd 14 * SIZE(XX), xtemp4 + mulpd a1, xt1 + mulpd atemp2, a1 + ADD xt1, xsum1 + addpd a1, yy2 + MOVDDUP(4 * SIZE, A1, a1) + + movlpd yy2, 2 * SIZE(YY) + movhpd yy2, 3 * SIZE(YY) + movsd 6 * SIZE(YY), yy2 + movhpd 7 * SIZE(YY), yy2 + + movapd xtemp2, xt1 + movapd 10 * SIZE(XX), xtemp2 + mulpd a2, xt1 + mulpd atemp4, a2 + ADD xt1, xsum2 + addpd a2, yy1 + MOVDDUP(6 * SIZE, A2, a2) + + PREFETCH PREFETCHSIZE(A2) + + movlpd yy1, 0 * SIZE(YY) + movhpd yy1, 1 * SIZE(YY) + movsd 4 * SIZE(YY), yy1 + movhpd 5 * SIZE(YY), yy1 + + movapd xtemp1, xt1 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy1 + MOVDDUP(5 * SIZE, A1, a1) + + movapd xtemp3, xt1 + mulpd a2, xt1 + mulpd atemp3, a2 + addpd xt1, xsum2 + addpd a2, yy2 + MOVDDUP(7 * SIZE, A2, a2) + + movapd xtemp2, xt1 + mulpd a1, xt1 + mulpd atemp2, a1 + ADD xt1, xsum1 + addpd a1, yy1 + MOVDDUP(6 * SIZE, A1, a1) + + PREFETCHW PREFETCHSIZE(YY) + + movapd xtemp4, xt1 + mulpd a2, xt1 + mulpd atemp4, a2 + ADD xt1, xsum2 + addpd a2, yy2 + MOVDDUP(4 * SIZE, A2, a2) + + movapd xtemp3, xt1 + movapd 20 * SIZE(XX), xtemp3 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy2 + MOVDDUP(7 * SIZE, A1, a1) + + movapd xtemp1, xt1 + movapd 16 * SIZE(XX), xtemp1 + mulpd a2, xt1 + mulpd atemp3, a2 + addpd xt1, xsum2 + addpd a2, yy1 + MOVDDUP(5 * SIZE, A2, a2) + + movapd xtemp4, xt1 + movapd 22 * SIZE(XX), xtemp4 + mulpd a1, xt1 + mulpd atemp2, a1 + ADD xt1, xsum1 + addpd a1, yy2 + MOVDDUP( 8 * SIZE, A1, a1) + + movlpd yy2, 6 * SIZE(YY) + movhpd yy2, 7 * SIZE(YY) + movsd 10 * SIZE(YY), yy2 + movhpd 11 * SIZE(YY), yy2 + + movapd xtemp2, xt1 + movapd 18 * SIZE(XX), xtemp2 + mulpd a2, xt1 + mulpd atemp4, a2 + ADD xt1, xsum2 + addpd a2, yy1 + MOVDDUP(10 * SIZE, A2, a2) + + movlpd yy1, 4 * SIZE(YY) + movhpd yy1, 5 * SIZE(YY) + movsd 8 * SIZE(YY), yy1 + movhpd 9 * SIZE(YY), yy1 + + subq $-16 * SIZE, XX + addq $ 8 * SIZE, YY + addq $ 8 * SIZE, A1 + addq $ 8 * SIZE, A2 + + decq I + jg .L12 
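
The loop that just ended is shared between the complex-symmetric and the Hermitian build of this file: under HEMV the ADD macro becomes subpd and parts of the diagonal block are guarded by #ifndef HEMV, which is how the conjugation of the stored triangle enters. Mathematically the distinction is only the following (scalar sketch, names illustrative):

    #include <complex.h>

    /* Accumulating the mirrored (transposed) contribution of an element a
       stored in the lower triangle: */
    static inline double complex mac_symv(double complex acc,
                                          double complex a, double complex x)
    {
        return acc + a * x;          /* symmetric: A^T element equals a   */
    }

    static inline double complex mac_hemv(double complex acc,
                                          double complex a, double complex x)
    {
        return acc + conj(a) * x;    /* Hermitian: A^H element is conj(a) */
    }
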
+ ALIGN_3 + +.L15: + movq M, I + subq IS, I + subq $2, I + testq $2, I + jle .L16 + + movapd xtemp1, xt1 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy1 + MOVDDUP(1 * SIZE, A1, a1) + + movapd xtemp3, xt1 + mulpd a2, xt1 + mulpd atemp3, a2 + addpd xt1, xsum2 + addpd a2, yy2 + MOVDDUP(3 * SIZE, A2, a2) + + movapd xtemp2, xt1 + mulpd a1, xt1 + mulpd atemp2, a1 + ADD xt1, xsum1 + addpd a1, yy1 + MOVDDUP(2 * SIZE, A1, a1) + + movapd xtemp4, xt1 + mulpd a2, xt1 + mulpd atemp4, a2 + ADD xt1, xsum2 + addpd a2, yy2 + MOVDDUP(0 * SIZE, A2, a2) + + movapd xtemp3, xt1 + movapd 12 * SIZE(XX), xtemp3 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy2 + MOVDDUP(3 * SIZE, A1, a1) + + movapd xtemp1, xt1 + movapd 8 * SIZE(XX), xtemp1 + mulpd a2, xt1 + mulpd atemp3, a2 + addpd xt1, xsum2 + addpd a2, yy1 + MOVDDUP(1 * SIZE, A2, a2) + + movapd xtemp4, xt1 + movapd 14 * SIZE(XX), xtemp4 + mulpd a1, xt1 + mulpd atemp2, a1 + ADD xt1, xsum1 + addpd a1, yy2 + MOVDDUP(4 * SIZE, A1, a1) + + movlpd yy2, 2 * SIZE(YY) + movhpd yy2, 3 * SIZE(YY) + movsd 6 * SIZE(YY), yy2 + movhpd 7 * SIZE(YY), yy2 + + movapd xtemp2, xt1 + movapd 10 * SIZE(XX), xtemp2 + mulpd a2, xt1 + mulpd atemp4, a2 + ADD xt1, xsum2 + addpd a2, yy1 + + movlpd yy1, 0 * SIZE(YY) + movhpd yy1, 1 * SIZE(YY) + movsd 4 * SIZE(YY), yy1 + movhpd 5 * SIZE(YY), yy1 + + addq $4 * SIZE, YY + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + ALIGN_3 + +.L16: + testq $1, M + jle .L18 + + MOVDDUP(1 * SIZE, A1, a2) + + movapd xtemp1, xt1 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy1 + + MOVDDUP(0 * SIZE, A2, a1) + + movapd xtemp2, xt1 + mulpd a2, xt1 + mulpd atemp2, a2 + ADD xt1, xsum1 + addpd a2, yy1 + + MOVDDUP(1 * SIZE, A2, a2) + + movapd xtemp1, xt1 + mulpd a1, xt1 + mulpd atemp3, a1 + addpd xt1, xsum2 + addpd a1, yy1 + + movapd xtemp2, xt1 + mulpd a2, xt1 + mulpd atemp4, a2 + ADD xt1, xsum2 + addpd a2, yy1 + + movlpd yy1, 0 * SIZE(YY) + movhpd yy1, 1 * SIZE(YY) + ALIGN_3 + +.L18: + leaq (, IS, SIZE), I + + movsd 0 * SIZE(NEW_Y, I, 2), yy1 + movhpd 1 * SIZE(NEW_Y, I, 2), yy1 + movsd 2 * SIZE(NEW_Y, I, 2), yy2 + movhpd 3 * SIZE(NEW_Y, I, 2), yy2 + + addpd xsum1, yy1 + addpd xsum2, yy2 + + movlpd yy1, 0 * SIZE(NEW_Y, I, 2) + movhpd yy1, 1 * SIZE(NEW_Y, I, 2) + movlpd yy2, 2 * SIZE(NEW_Y, I, 2) + movhpd yy2, 3 * SIZE(NEW_Y, I, 2) + + addq $2, IS + + movq IS, I + addq $2, I + cmpq N, I + jle .L11 + ALIGN_3 + +.L20: + testq $1, N + jle .L990 + + leaq (, IS, SIZE), I + + movapd 0 * SIZE(NEW_X, I, 4), atemp1 + movapd 2 * SIZE(NEW_X, I, 4), atemp2 + + movsd 0 * SIZE(NEW_Y, I, 2), yy1 + movhpd 1 * SIZE(NEW_Y, I, 2), yy1 + +#ifndef HEMV + MOVDDUP(0 * SIZE, A, a1) + MOVDDUP(1 * SIZE, A, a2) + + mulpd atemp1, a1 + mulpd atemp2, a2 + addpd a1, yy1 + addpd a2, yy1 +#else + MOVDDUP(0 * SIZE, A, a1) + + mulpd atemp1, a1 + addpd a1, yy1 +#endif + + movlpd yy1, 0 * SIZE(NEW_Y, I, 2) + movhpd yy1, 1 * SIZE(NEW_Y, I, 2) + ALIGN_3 + +.L990: + cmpq $2 * SIZE, INCY + je .L999 + + movq M, %rax + sarq $2, %rax + jle .L997 + ALIGN_3 + +.L996: + movapd 0 * SIZE(NEW_Y), %xmm0 + movapd 2 * SIZE(NEW_Y), %xmm1 + movapd 4 * SIZE(NEW_Y), %xmm2 + movapd 6 * SIZE(NEW_Y), %xmm3 + + movsd %xmm0, 0 * SIZE(Y) + movhpd %xmm0, 1 * SIZE(Y) + addq INCY, Y + movsd %xmm1, 0 * SIZE(Y) + movhpd %xmm1, 1 * SIZE(Y) + addq INCY, Y + movsd %xmm2, 0 * SIZE(Y) + movhpd %xmm2, 1 * SIZE(Y) + addq INCY, Y + movsd %xmm3, 0 * SIZE(Y) + movhpd %xmm3, 1 * SIZE(Y) + addq INCY, Y + + addq $8 * SIZE, NEW_Y + decq %rax + jg .L996 + ALIGN_3 + +.L997: + movq M, %rax + andq 
$3, %rax + jle .L999 + ALIGN_3 + +.L998: + movapd 0 * SIZE(NEW_Y), %xmm0 + + movsd %xmm0, 0 * SIZE(Y) + movhpd %xmm0, 1 * SIZE(Y) + addq INCY, Y + + addq $2 * SIZE, NEW_Y + + decq %rax + jg .L998 + ALIGN_3 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + EPILOGUE diff --git a/kernel/x86_64/zsymv_U_sse.S b/kernel/x86_64/zsymv_U_sse.S new file mode 100644 index 0000000000..175912c716 --- /dev/null +++ b/kernel/x86_64/zsymv_U_sse.S @@ -0,0 +1,594 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
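
Like its _L counterparts, the zsymv_U_sse.S kernel that starts here first stages its operands: alpha*x is expanded into the caller-supplied scratch buffer, and when Y is not contiguous it is gathered there as well, the write pointer being bumped to the next 512-byte boundary (addq $512 / andq $-512) so the main loops can use aligned loads; the buffered y is scattered back through INCY at the end. A C sketch of that round-up and copy-back with illustrative names, shown for the double-precision layout:

    #include <stdint.h>
    #include <stddef.h>

    /* Matches "addq $512, XX; andq $-512, XX": next 512-byte boundary. */
    static double *align512(double *p)
    {
        return (double *)(((uintptr_t)p + 512) & ~(uintptr_t)511);
    }

    /* Copy the contiguous buffered result back through the real stride,
       one complex element per step (incy in elements here; the assembly
       works with pre-scaled byte strides). */
    static void scatter_y(size_t m, const double *ybuf,
                          double *y, ptrdiff_t incy)
    {
        for (size_t i = 0; i < m; i++) {
            y[0] = ybuf[2 * i];
            y[1] = ybuf[2 * i + 1];
            y += 2 * incy;
        }
    }
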
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef ATOM +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 24) +#endif + +#ifdef CORE2 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 24) +#endif + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 24) +#endif + +#ifdef NEHALEM +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 24) +#endif + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 28) +#endif + +#ifdef OPTERON +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 12) +#define movsd movlpd +#endif + +#if defined(BARCELONA) || defined(SHANGHAI) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 16) +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 24) +#endif + +#ifdef GENERIC +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 14) +#endif + +#ifndef WINDOWS_ABI + +#define STACKSIZE 80 + +#define OLD_Y 8 + STACKSIZE(%rsp) +#define OLD_INCY 16 + STACKSIZE(%rsp) +#define OLD_BUFFER 24 + STACKSIZE(%rsp) + +#define M ARG1 +#define N ARG2 +#define A ARG3 +#define LDA ARG4 +#define X ARG5 +#define INCX ARG6 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_LDA 48 + STACKSIZE(%rsp) +#define OLD_X 56 + STACKSIZE(%rsp) +#define OLD_INCX 64 + STACKSIZE(%rsp) +#define OLD_Y 72 + STACKSIZE(%rsp) +#define OLD_INCY 80 + STACKSIZE(%rsp) +#define OLD_BUFFER 88 + STACKSIZE(%rsp) + +#define M ARG1 +#define N ARG2 +#define A ARG4 +#define LDA ARG3 +#define X %rdi +#define INCX %rsi +#endif + +#define Y %r10 +#define INCY %r11 +#define BUFFER %r12 + +#define TEMP %rax +#define I %rax +#define A1 %rbx +#define A2 %rbp +#define XX %r13 +#define YY %r14 +#define IS %r15 +#define NEW_X BUFFER +#define NEW_Y X + +#define ALPHA_R %xmm0 +#define ALPHA_I %xmm1 + +#define xsum1 %xmm0 +#define xsum2 %xmm1 +#define xsum3 %xmm2 +#define xsum4 %xmm3 + +#define atemp1 %xmm4 +#define atemp2 %xmm5 +#define atemp3 %xmm6 +#define atemp4 %xmm7 + +#define xtemp1 %xmm8 +#define xtemp2 %xmm9 +#define a1 %xmm10 +#define a2 %xmm11 + +#define a3 %xmm12 +#define yy1 %xmm13 +#define xt1 %xmm14 +#define xt2 %xmm15 + +#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) +#define MOVDDUP(a, b, c) movddup a(b), c +#define MOVDDUP2(a, b, c) movddup a##b, c +#else +#define MOVDDUP(a, b, c) movlpd a(b), c;movhpd a(b), c +#define MOVDDUP2(a, b, c) movlpd a##b, c;movhpd a##b, c +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq OLD_A, A + movq OLD_LDA, LDA + movq OLD_X, X + movq OLD_INCX, INCX + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 +#endif + + movq OLD_Y, Y + movq OLD_INCY, INCY + movq OLD_BUFFER, BUFFER + + salq 
$ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + salq $ZBASE_SHIFT, LDA + + testq M, M + jle .L999 + + negq IS + addq M, IS + + movq IS, TEMP + imulq LDA, TEMP + addq TEMP, A + + pcmpeqb %xmm3, %xmm3 + xorpd %xmm2, %xmm2 + pslld $31, %xmm3 + unpckhps %xmm3, %xmm2 + + shufps $0, ALPHA_R, ALPHA_R + shufps $0, ALPHA_I, ALPHA_I + movaps ALPHA_I, %xmm3 + + unpcklps ALPHA_R, ALPHA_I + unpcklps %xmm3, ALPHA_R + pxor %xmm2, ALPHA_R + + movq BUFFER, XX + + movq M, %rax + sarq $2, %rax + jle .L02 + ALIGN_3 + +.L01: + movsd 0 * SIZE(X), %xmm4 + addq INCX, X + movhps 0 * SIZE(X), %xmm4 + addq INCX, X + movsd 0 * SIZE(X), %xmm6 + addq INCX, X + movhps 0 * SIZE(X), %xmm6 + addq INCX, X + + movsldup %xmm4, %xmm3 + movshdup %xmm4, %xmm4 + movsldup %xmm6, %xmm5 + movshdup %xmm6, %xmm6 + + mulps ALPHA_I, %xmm3 + mulps ALPHA_R, %xmm4 + mulps ALPHA_I, %xmm5 + mulps ALPHA_R, %xmm6 + + addps %xmm4, %xmm3 + addps %xmm6, %xmm5 + + movaps %xmm3, 4 * SIZE(XX) + movaps %xmm5, 12 * SIZE(XX) + + shufps $0xb1, %xmm3, %xmm3 + shufps $0xb1, %xmm5, %xmm5 + + pxor %xmm2, %xmm3 + pxor %xmm2, %xmm5 + + movaps %xmm3, 0 * SIZE(XX) + movaps %xmm5, 8 * SIZE(XX) + + subq $-16 * SIZE, XX + decq %rax + jg .L01 + ALIGN_3 + +.L02: + testq $2, M + jle .L03 + + movsd 0 * SIZE(X), %xmm4 + addq INCX, X + movhps 0 * SIZE(X), %xmm4 + addq INCX, X + + movsldup %xmm4, %xmm3 + movshdup %xmm4, %xmm4 + + mulps ALPHA_I, %xmm3 + mulps ALPHA_R, %xmm4 + + addps %xmm4, %xmm3 + + movaps %xmm3, 4 * SIZE(XX) + + shufps $0xb1, %xmm3, %xmm3 + pxor %xmm2, %xmm3 + movaps %xmm3, 0 * SIZE(XX) + + subq $-8 * SIZE, XX + ALIGN_3 + +.L03: + testq $1, M + jle .L05 + + movsd 0 * SIZE(X), %xmm4 + addq INCX, X + + movsldup %xmm4, %xmm3 + movshdup %xmm4, %xmm4 + + mulps ALPHA_I, %xmm3 + mulps ALPHA_R, %xmm4 + + addps %xmm4, %xmm3 + + movlps %xmm3, 2 * SIZE(XX) + + shufps $0xb1, %xmm3, %xmm3 + pxor %xmm2, %xmm3 + movlps %xmm3, 0 * SIZE(XX) + + subq $-4 * SIZE, XX + ALIGN_3 + +.L05: + /* now we don't need original X */ + movq Y, NEW_Y + + addq $512, XX + andq $-512, XX + + cmpq $2 * SIZE, INCY + je .L10 + + movq Y, YY + movq XX, NEW_Y + + movq M, %rax + sarq $2, %rax + jle .L07 + ALIGN_3 + +.L06: + movsd 0 * SIZE(YY), %xmm0 + addq INCY, YY + movhps 0 * SIZE(YY), %xmm0 + addq INCY, YY + movsd 0 * SIZE(YY), %xmm1 + addq INCY, YY + movhps 0 * SIZE(YY), %xmm1 + addq INCY, YY + + movaps %xmm0, 0 * SIZE(XX) + movaps %xmm1, 8 * SIZE(XX) + + addq $8 * SIZE, XX + decq %rax + jg .L06 + ALIGN_3 + +.L07: + movq M, %rax + andq $3, %rax + jle .L10 + ALIGN_3 + +.L08: + movsd 0 * SIZE(YY), %xmm0 + addq INCY, YY + + movlps %xmm0, 0 * SIZE(XX) + + addq $2 * SIZE, XX + decq %rax + jg .L08 + ALIGN_3 + +.L10: + movq IS, I + addq $2, I + cmpq M, I + jg .L20 + ALIGN_3 + +.L11: + movq A, A1 + leaq (A, LDA, 1), A2 + leaq (A, LDA, 2), A + + leaq (, IS, 4), I + + movsd 0 * SIZE(NEW_X, I, SIZE), atemp2 + movhps 4 * SIZE(NEW_X, I, SIZE), atemp2 + movsd 2 * SIZE(NEW_X, I, SIZE), atemp4 + movhps 6 * SIZE(NEW_X, I, SIZE), atemp4 + + pshufd $0xcc, atemp2, atemp1 + pshufd $0x99, atemp2, atemp2 + pshufd $0xcc, atemp4, atemp3 + pshufd $0x99, atemp4, atemp4 + + pxor xsum1, xsum1 + pxor xsum2, xsum2 + pxor xsum3, xsum3 + pxor xsum4, xsum4 + + movq NEW_X, XX + movq NEW_Y, YY + + movq IS, I + sarq $2, I + jle .L15 + ALIGN_3 + +.L12: + HALT + + subq $-16 * SIZE, XX + addq $ 8 * SIZE, YY + addq $ 8 * SIZE, A1 + addq $ 8 * SIZE, A2 + + decq I + jg .L12 + ALIGN_3 + +.L15: + testq $2, IS + jle .L18 + + movsd 0 * SIZE(YY), yy1 + movhps 2 * SIZE(YY), yy1 + + movaps 0 * SIZE(XX), xtemp1 + movaps 4 * SIZE(XX), xtemp2 + 
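The .L01-.L03 loops earlier in this file scale the strided x vector by the complex alpha and write each element into the aligned buffer twice, once as the scaled value and once swapped with a sign flip, so the packed loops that follow only need mulps/addps. A minimal C sketch of that staging; the exact lane order and which half carries the flipped sign follow the assembly, not this sketch, and the helper name is illustrative:

    #include <complex.h>

    /* Stage alpha * x[i] into the work buffer: 4 floats per complex
       element, a swapped/sign-adjusted companion followed by the plain
       scaled value.  incx is counted in complex elements here. */
    static void pack_alpha_x(float complex alpha, const float complex *x,
                             long n, long incx, float *buffer)
    {
        for (long i = 0; i < n; i++) {
            float complex t = alpha * x[i * incx];   /* alpha * x[i] */
            buffer[4 * i + 0] =  cimagf(t);          /* companion copy: order/signs illustrative */
            buffer[4 * i + 1] = -crealf(t);
            buffer[4 * i + 2] =  crealf(t);          /* plain copy of alpha * x[i] */
            buffer[4 * i + 3] =  cimagf(t);
        }
    }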
+ movsd 0 * SIZE(A1), a1 + movhps 2 * SIZE(A1), a1 + + movaps xtemp1, xt1 + movaps xtemp2, xt2 + mulps a1, xt1 + mulps a1, xt2 + addps xt1, xsum1 + addps xt2, xsum2 + + pshufd $0xb1, a1, xt2 + mulps atemp1, a1 + mulps atemp2, xt2 + addps a1, yy1 + addps xt2, yy1 + + movsd 0 * SIZE(A2), a1 + movhps 2 * SIZE(A2), a1 + + movaps xtemp1, xt1 + movaps xtemp2, xt2 + mulps a1, xt1 + mulps a1, xt2 + addps xt1, xsum3 + addps xt2, xsum4 + + pshufd $0xb1, a1, xt2 + mulps atemp1, a1 + mulps atemp2, xt2 + addps a1, yy1 + addps xt2, yy1 + + movlps yy1, 0 * SIZE(YY) + movhps yy1, 2 * SIZE(YY) + + addq $8 * SIZE, XX + addq $4 * SIZE, YY + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + ALIGN_3 + +.L18: + leaq (, IS, 4), I + + movaps 0 * SIZE(NEW_X, I, SIZE), atemp1 + movaps 4 * SIZE(NEW_X, I, SIZE), atemp2 + + movlps 0 * SIZE(YY), yy1 + movhps 2 * SIZE(YY), yy1 + + movsd 0 * SIZE(A1), a1 + movhps 0 * SIZE(A2), a1 + + movaps a1, a2 + mulps atemp1, a1 + mulps atemp2, a2 + addps a1, xsum1 + addps a2, xsum2 + + movsd 0 * SIZE(A2), a1 + movhps 2 * SIZE(A2), a1 + + movaps a1, a2 + mulps atemp1, a1 + mulps atemp2, a2 + addps a1, xsum3 + addps a2, xsum4 + + haddps xsum2, xsum1 + haddps xsum4, xsum3 + + haddps xsum3, xsum1 + addps xsum1, yy1 + + movlps yy1, 0 * SIZE(YY) + movhps yy1, 2 * SIZE(YY) + + addq $2, IS + + movq IS, I + addq $2, I + cmpq M, I + jle .L11 + ALIGN_3 + +.L20: + testq $1, M + jle .L990 + + +.L990: + cmpq $2 * SIZE, INCY + je .L999 + + movq M, %rax + sarq $2, %rax + jle .L997 + ALIGN_3 + +.L996: + movaps 0 * SIZE(NEW_Y), %xmm0 + movaps 4 * SIZE(NEW_Y), %xmm1 + + movlps %xmm0, 0 * SIZE(Y) + addq INCY, Y + movhps %xmm0, 0 * SIZE(Y) + addq INCY, Y + movlps %xmm1, 0 * SIZE(Y) + addq INCY, Y + movhps %xmm1, 0 * SIZE(Y) + addq INCY, Y + + addq $8 * SIZE, NEW_Y + decq %rax + jg .L996 + ALIGN_3 + +.L997: + movq M, %rax + andq $3, %rax + jle .L999 + ALIGN_3 + +.L998: + movlps 0 * SIZE(NEW_Y), %xmm0 + addq $2 * SIZE, NEW_Y + + movlps %xmm0, 0 * SIZE(Y) + addq INCY, Y + + decq %rax + jg .L998 + ALIGN_3 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + addq $STACKSIZE, %rsp + ret + EPILOGUE diff --git a/kernel/x86_64/zsymv_U_sse2.S b/kernel/x86_64/zsymv_U_sse2.S new file mode 100644 index 0000000000..3e4b170301 --- /dev/null +++ b/kernel/x86_64/zsymv_U_sse2.S @@ -0,0 +1,916 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#ifdef ATOM +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 24) +#endif + +#ifdef CORE2 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 24) +#endif + +#if defined(PENRYN) || defined(DUNNINGTON) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 24) +#endif + +#ifdef NEHALEM +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 24) +#endif + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 28) +#endif + +#ifdef OPTERON +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 12) +#define movsd movlpd +#endif + +#if defined(BARCELONA) || defined(SHANGHAI) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (16 * 16) +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (8 * 24) +#endif + +#ifdef GENERIC +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (16 * 28) +#endif + +#ifndef WINDOWS_ABI + +#define STACKSIZE 80 + +#define OLD_Y 8 + STACKSIZE(%rsp) +#define OLD_INCY 16 + STACKSIZE(%rsp) +#define OLD_BUFFER 24 + STACKSIZE(%rsp) + +#define M ARG1 +#define IS ARG2 +#define A ARG3 +#define LDA ARG4 +#define X ARG5 +#define INCX ARG6 + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_LDA 48 + STACKSIZE(%rsp) +#define OLD_X 56 + STACKSIZE(%rsp) +#define OLD_INCX 64 + STACKSIZE(%rsp) +#define OLD_Y 72 + STACKSIZE(%rsp) +#define OLD_INCY 80 + STACKSIZE(%rsp) +#define OLD_BUFFER 88 + STACKSIZE(%rsp) + +#define M ARG1 +#define IS ARG2 +#define A ARG4 +#define LDA ARG3 +#define X %rdi +#define INCX %rsi + +#endif + +#define Y %r10 +#define INCY %r11 +#define BUFFER %r12 + +#define TEMP %rax +#define I %rax +#define A1 %rbx +#define A2 %rbp +#define XX %r13 +#define YY %r14 +#define NEW_X BUFFER +#define NEW_Y X + +#define ALPHA_R %xmm0 +#define ALPHA_I %xmm1 + +#define xtemp1 %xmm0 +#define xtemp2 %xmm1 +#define xtemp3 %xmm2 +#define xtemp4 %xmm3 + +#define atemp1 %xmm4 +#define atemp2 %xmm5 +#define atemp3 %xmm6 +#define atemp4 %xmm7 + +#define xsum1 %xmm8 +#define xsum2 %xmm9 +#define yy1 %xmm10 +#define yy2 %xmm11 + +#define a1 %xmm12 +#define a2 %xmm13 +#define a3 %xmm14 +#define xt1 %xmm15 + +#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) +#define MOVDDUP(a, b, c) movddup a(b), c +#define MOVDDUP2(a, b, c) movddup a##b, c +#else 
+#define MOVDDUP(a, b, c) movlpd a(b), c;movhpd a(b), c +#define MOVDDUP2(a, b, c) movlpd a##b, c;movhpd a##b, c +#endif + +#ifndef HEMV +#define ADD addpd +#else +#define ADD subpd +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq OLD_A, A + movq OLD_LDA, LDA + movq OLD_X, X + movq OLD_INCX, INCX + + movaps %xmm2, %xmm0 + movaps %xmm3, %xmm1 +#endif + + movq OLD_Y, Y + movq OLD_INCY, INCY + movq OLD_BUFFER, BUFFER + + salq $ZBASE_SHIFT, INCX + salq $ZBASE_SHIFT, INCY + salq $ZBASE_SHIFT, LDA + + testq M, M + jle .L999 + + negq IS + addq M, IS + + movq IS, TEMP + imulq LDA, TEMP + addq TEMP, A + + pcmpeqb %xmm2, %xmm2 + xorpd %xmm3, %xmm3 + psllq $63, %xmm2 + unpcklpd %xmm3, %xmm2 + + unpcklpd ALPHA_I, ALPHA_R + unpcklpd ALPHA_R, ALPHA_I + xorpd %xmm2, ALPHA_I + + movq BUFFER, XX + + movq M, %rax + sarq $2, %rax + jle .L02 + ALIGN_3 + +.L01: + MOVDDUP(0 * SIZE, X, %xmm3) + MOVDDUP(1 * SIZE, X, %xmm4) + addq INCX, X + MOVDDUP(0 * SIZE, X, %xmm5) + MOVDDUP(1 * SIZE, X, %xmm6) + addq INCX, X + + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm4 + mulpd ALPHA_R, %xmm5 + mulpd ALPHA_I, %xmm6 + + addpd %xmm4, %xmm3 + addpd %xmm6, %xmm5 + + movapd %xmm3, 0 * SIZE(XX) + SHUFPD_1 %xmm3, %xmm3 + pxor %xmm2, %xmm3 + movapd %xmm3, 2 * SIZE(XX) + + movapd %xmm5, 4 * SIZE(XX) + SHUFPD_1 %xmm5, %xmm5 + pxor %xmm2, %xmm5 + movapd %xmm5, 6 * SIZE(XX) + + MOVDDUP(0 * SIZE, X, %xmm3) + MOVDDUP(1 * SIZE, X, %xmm4) + addq INCX, X + MOVDDUP(0 * SIZE, X, %xmm5) + MOVDDUP(1 * SIZE, X, %xmm6) + addq INCX, X + + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm4 + mulpd ALPHA_R, %xmm5 + mulpd ALPHA_I, %xmm6 + + addpd %xmm4, %xmm3 + addpd %xmm6, %xmm5 + + movapd %xmm3, 8 * SIZE(XX) + SHUFPD_1 %xmm3, %xmm3 + pxor %xmm2, %xmm3 + movapd %xmm3, 10 * SIZE(XX) + + movapd %xmm5, 12 * SIZE(XX) + SHUFPD_1 %xmm5, %xmm5 + pxor %xmm2, %xmm5 + movapd %xmm5, 14 * SIZE(XX) + + subq $-16 * SIZE, XX + decq %rax + jg .L01 + ALIGN_3 + +.L02: + movq M, %rax + andq $3, %rax + jle .L05 + ALIGN_3 + +.L03: + MOVDDUP(0 * SIZE, X, %xmm3) + MOVDDUP(1 * SIZE, X, %xmm4) + addq INCX, X + + mulpd ALPHA_R, %xmm3 + mulpd ALPHA_I, %xmm4 + + addpd %xmm4, %xmm3 + + movapd %xmm3, 0 * SIZE(XX) + SHUFPD_1 %xmm3, %xmm3 + pxor %xmm2, %xmm3 + movapd %xmm3, 2 * SIZE(XX) + + addq $4 * SIZE, XX + decq %rax + jg .L03 + ALIGN_3 + +.L05: + /* now we don't need original X */ + movq Y, NEW_Y + + addq $512, XX + andq $-512, XX + + cmpq $2 * SIZE, INCY + je .L10 + + movq Y, YY + movq XX, NEW_Y + + movq M, %rax + sarq $2, %rax + jle .L07 + ALIGN_3 + +.L06: + movsd 0 * SIZE(YY), %xmm0 + movhpd 1 * SIZE(YY), %xmm0 + addq INCY, YY + movsd 0 * SIZE(YY), %xmm1 + movhpd 1 * SIZE(YY), %xmm1 + addq INCY, YY + movsd 0 * SIZE(YY), %xmm2 + movhpd 1 * SIZE(YY), %xmm2 + addq INCY, YY + movsd 0 * SIZE(YY), %xmm3 + movhpd 1 * SIZE(YY), %xmm3 + addq INCY, YY + + movapd %xmm0, 0 * SIZE(XX) + movapd %xmm1, 2 * SIZE(XX) + movapd %xmm2, 4 * SIZE(XX) + movapd %xmm3, 6 * SIZE(XX) + + addq $8 * SIZE, XX + decq %rax + jg .L06 + ALIGN_3 + +.L07: + movq M, %rax + andq $3, %rax + jle .L10 + ALIGN_3 + +.L08: + movsd 0 * SIZE(YY), %xmm0 + 
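When INCY is not one contiguous complex element (2 * SIZE bytes), the loop here stages y into the same aligned buffer, and the .L996/.L998 loops at the end of the file copy the results back out with the original stride. In C terms the staging is just a gather and a scatter; helper names are illustrative and incy is counted in complex elements:

    /* Gather a strided complex-double vector into a contiguous buffer. */
    static void gather_y(const double *y, long incy, double *buf, long m)
    {
        for (long i = 0; i < m; i++) {
            buf[2 * i + 0] = y[2 * i * incy + 0];    /* real part */
            buf[2 * i + 1] = y[2 * i * incy + 1];    /* imaginary part */
        }
    }

    /* Scatter the results back out with the caller's original stride. */
    static void scatter_y(const double *buf, double *y, long incy, long m)
    {
        for (long i = 0; i < m; i++) {
            y[2 * i * incy + 0] = buf[2 * i + 0];
            y[2 * i * incy + 1] = buf[2 * i + 1];
        }
    }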
movhpd 1 * SIZE(YY), %xmm0 + addq INCY, YY + + movapd %xmm0, 0 * SIZE(XX) + + addq $2 * SIZE, XX + decq %rax + jg .L08 + ALIGN_3 + +.L10: + movq IS, I + addq $2, I + cmpq M, I + jg .L20 + ALIGN_3 + +.L11: + movq A, A1 + leaq (A, LDA, 1), A2 + leaq (A, LDA, 2), A + + leaq (, IS, 4), I + + movapd 0 * SIZE(NEW_X, I, SIZE), atemp1 + movapd 2 * SIZE(NEW_X, I, SIZE), atemp2 + movapd 4 * SIZE(NEW_X, I, SIZE), atemp3 + movapd 6 * SIZE(NEW_X, I, SIZE), atemp4 + + pxor xsum1, xsum1 + pxor xsum2, xsum2 + + movsd 0 * SIZE(NEW_Y), yy1 + movhpd 1 * SIZE(NEW_Y), yy1 + movsd 2 * SIZE(NEW_Y), yy2 + movhpd 3 * SIZE(NEW_Y), yy2 + + movapd 0 * SIZE(NEW_X), xtemp1 + movapd 2 * SIZE(NEW_X), xtemp2 + movapd 4 * SIZE(NEW_X), xtemp3 + movapd 6 * SIZE(NEW_X), xtemp4 + + MOVDDUP(0 * SIZE, A1, a1) + MOVDDUP(2 * SIZE, A2, a2) + MOVDDUP(1 * SIZE, A1, a3) + + movq NEW_X, XX + movq NEW_Y, YY + + movq IS, I + sarq $2, I + jle .L15 + ALIGN_3 + +.L12: + movapd xtemp1, xt1 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy1 + MOVDDUP(3 * SIZE, A2, a1) + + PREFETCH PREFETCHSIZE(A1) + + movapd xtemp3, xt1 + mulpd a2, xt1 + mulpd atemp3, a2 + addpd xt1, xsum2 + addpd a2, yy2 + MOVDDUP(2 * SIZE, A1, a2) + + movapd xtemp2, xt1 + mulpd a3, xt1 + mulpd atemp2, a3 + ADD xt1, xsum1 + addpd a3, yy1 + MOVDDUP(0 * SIZE, A2, a3) + + movapd xtemp4, xt1 + mulpd a1, xt1 + mulpd atemp4, a1 + ADD xt1, xsum2 + addpd a1, yy2 + MOVDDUP(3 * SIZE, A1, a1) + + PREFETCH PREFETCHSIZE(XX) + + movapd xtemp3, xt1 + movapd 12 * SIZE(XX), xtemp3 + mulpd a2, xt1 + mulpd atemp1, a2 + addpd xt1, xsum1 + addpd a2, yy2 + MOVDDUP(1 * SIZE, A2, a2) + + movapd xtemp1, xt1 + movapd 8 * SIZE(XX), xtemp1 + mulpd a3, xt1 + mulpd atemp3, a3 + addpd xt1, xsum2 + addpd a3, yy1 + MOVDDUP(4 * SIZE, A1, a3) + + movapd xtemp4, xt1 + movapd 14 * SIZE(XX), xtemp4 + mulpd a1, xt1 + mulpd atemp2, a1 + ADD xt1, xsum1 + addpd a1, yy2 + MOVDDUP(6 * SIZE, A2, a1) + + movlpd yy2, 2 * SIZE(YY) + movhpd yy2, 3 * SIZE(YY) + movsd 6 * SIZE(YY), yy2 + movhpd 7 * SIZE(YY), yy2 + + movapd xtemp2, xt1 + movapd 10 * SIZE(XX), xtemp2 + mulpd a2, xt1 + mulpd atemp4, a2 + ADD xt1, xsum2 + addpd a2, yy1 + MOVDDUP(5 * SIZE, A1, a2) + + PREFETCH PREFETCHSIZE(A2) + + movlpd yy1, 0 * SIZE(YY) + movhpd yy1, 1 * SIZE(YY) + movsd 4 * SIZE(YY), yy1 + movhpd 5 * SIZE(YY), yy1 + + movapd xtemp1, xt1 + mulpd a3, xt1 + mulpd atemp1, a3 + addpd xt1, xsum1 + addpd a3, yy1 + MOVDDUP(7 * SIZE, A2, a3) + + movapd xtemp3, xt1 + mulpd a1, xt1 + mulpd atemp3, a1 + addpd xt1, xsum2 + addpd a1, yy2 + MOVDDUP(6 * SIZE, A1, a1) + + movapd xtemp2, xt1 + mulpd a2, xt1 + mulpd atemp2, a2 + ADD xt1, xsum1 + addpd a2, yy1 + MOVDDUP(4 * SIZE, A2, a2) + + PREFETCHW PREFETCHSIZE(YY) + + movapd xtemp4, xt1 + mulpd a3, xt1 + mulpd atemp4, a3 + ADD xt1, xsum2 + addpd a3, yy2 + MOVDDUP(7 * SIZE, A1, a3) + + movapd xtemp3, xt1 + movapd 20 * SIZE(XX), xtemp3 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy2 + MOVDDUP(5 * SIZE, A2, a1) + + movapd xtemp1, xt1 + movapd 16 * SIZE(XX), xtemp1 + mulpd a2, xt1 + mulpd atemp3, a2 + addpd xt1, xsum2 + addpd a2, yy1 + MOVDDUP(10 * SIZE, A2, a2) + + movapd xtemp4, xt1 + movapd 22 * SIZE(XX), xtemp4 + mulpd a3, xt1 + mulpd atemp2, a3 + ADD xt1, xsum1 + addpd a3, yy2 + MOVDDUP( 9 * SIZE, A1, a3) + + movlpd yy2, 6 * SIZE(YY) + movhpd yy2, 7 * SIZE(YY) + movsd 10 * SIZE(YY), yy2 + movhpd 11 * SIZE(YY), yy2 + + movapd xtemp2, xt1 + movapd 18 * SIZE(XX), xtemp2 + mulpd a1, xt1 + mulpd atemp4, a1 + ADD xt1, xsum2 + addpd a1, yy1 + MOVDDUP( 8 * SIZE, A1, a1) + + 
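MOVDDUP, used heavily in this loop, broadcasts one double into both halves of an XMM register: a single movddup where SSE3 is available, or the movlpd/movhpd pair selected by the macro's #else branch. The same two forms written with intrinsics, as a sketch:

    #include <emmintrin.h>   /* SSE2: _mm_load_sd, _mm_loadh_pd */
    #include <pmmintrin.h>   /* SSE3: _mm_loaddup_pd            */

    /* SSE3 path: one movddup fills both 64-bit lanes from *p. */
    static inline __m128d broadcast_sse3(const double *p)
    {
        return _mm_loaddup_pd(p);
    }

    /* SSE2 fallback path: load the low lane, then reload into the
       high lane, matching the movlpd/movhpd pair in the macro. */
    static inline __m128d broadcast_sse2(const double *p)
    {
        __m128d v = _mm_load_sd(p);      /* low lane  */
        return _mm_loadh_pd(v, p);       /* high lane */
    }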
movlpd yy1, 4 * SIZE(YY) + movhpd yy1, 5 * SIZE(YY) + movsd 8 * SIZE(YY), yy1 + movhpd 9 * SIZE(YY), yy1 + + subq $-16 * SIZE, XX + addq $ 8 * SIZE, YY + addq $ 8 * SIZE, A1 + addq $ 8 * SIZE, A2 + + decq I + jg .L12 + ALIGN_3 + +.L15: + testq $2, IS + jle .L18 + + movapd xtemp1, xt1 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy1 + MOVDDUP(1 * SIZE, A1, a1) + + movapd xtemp3, xt1 + mulpd a2, xt1 + mulpd atemp3, a2 + addpd xt1, xsum2 + addpd a2, yy2 + MOVDDUP(3 * SIZE, A2, a2) + + movapd xtemp2, xt1 + mulpd a1, xt1 + mulpd atemp2, a1 + ADD xt1, xsum1 + addpd a1, yy1 + MOVDDUP(2 * SIZE, A1, a1) + + movapd xtemp4, xt1 + mulpd a2, xt1 + mulpd atemp4, a2 + ADD xt1, xsum2 + addpd a2, yy2 + MOVDDUP(0 * SIZE, A2, a2) + + movapd xtemp3, xt1 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy2 + MOVDDUP(3 * SIZE, A1, a1) + + movapd xtemp1, xt1 + mulpd a2, xt1 + mulpd atemp3, a2 + addpd xt1, xsum2 + addpd a2, yy1 + MOVDDUP(1 * SIZE, A2, a2) + + movapd xtemp4, xt1 + mulpd a1, xt1 + mulpd atemp2, a1 + ADD xt1, xsum1 + addpd a1, yy2 + + movlpd yy2, 2 * SIZE(YY) + movhpd yy2, 3 * SIZE(YY) + movsd 6 * SIZE(YY), yy2 + movhpd 7 * SIZE(YY), yy2 + + movapd xtemp2, xt1 + mulpd a2, xt1 + mulpd atemp4, a2 + ADD xt1, xsum2 + addpd a2, yy1 + + movlpd yy1, 0 * SIZE(YY) + movhpd yy1, 1 * SIZE(YY) + movsd 4 * SIZE(YY), yy1 + movhpd 5 * SIZE(YY), yy1 + + addq $4 * SIZE, YY + addq $4 * SIZE, A1 + addq $4 * SIZE, A2 + ALIGN_3 + +.L18: + MOVDDUP(0 * SIZE, A1, a1) + MOVDDUP(0 * SIZE, A2, a2) + + mulpd atemp1, a1 + mulpd atemp1, a2 + addpd a1, xsum1 + addpd a2, xsum2 + +#ifndef HEMV + MOVDDUP(1 * SIZE, A1, a1) + MOVDDUP(1 * SIZE, A2, a2) + + mulpd atemp2, a1 + mulpd atemp2, a2 + addpd a1, xsum1 + addpd a2, xsum2 +#else + MOVDDUP(1 * SIZE, A2, a2) + + mulpd atemp2, a2 + subpd a2, xsum2 +#endif + + MOVDDUP(0 * SIZE, A2, a1) + MOVDDUP(2 * SIZE, A2, a2) + + mulpd atemp3, a1 + mulpd atemp3, a2 + addpd a1, xsum1 + addpd a2, xsum2 + +#ifndef HEMV + MOVDDUP(1 * SIZE, A2, a1) + MOVDDUP(3 * SIZE, A2, a2) + + mulpd atemp4, a1 + mulpd atemp4, a2 + addpd a1, xsum1 + addpd a2, xsum2 +#else + MOVDDUP(1 * SIZE, A2, a1) + + mulpd atemp4, a1 + addpd a1, xsum1 +#endif + + addpd xsum1, yy1 + addpd xsum2, yy2 + + movlpd yy1, 0 * SIZE(YY) + movhpd yy1, 1 * SIZE(YY) + movlpd yy2, 2 * SIZE(YY) + movhpd yy2, 3 * SIZE(YY) + + addq $2, IS + + movq IS, I + addq $2, I + cmpq M, I + jle .L11 + ALIGN_3 + +.L20: + testq $1, M + jle .L990 + + movq A, A1 + leaq (, IS, 4), I + + movapd 0 * SIZE(NEW_X, I, SIZE), atemp1 + movapd 2 * SIZE(NEW_X, I, SIZE), atemp2 + + pxor xsum1, xsum1 + pxor xsum2, xsum2 + + MOVDDUP(0 * SIZE, A1, a1) + MOVDDUP(1 * SIZE, A1, a2) + + movapd 0 * SIZE(NEW_X), xtemp1 + movapd 2 * SIZE(NEW_X), xtemp2 + movapd 4 * SIZE(NEW_X), xtemp3 + movapd 6 * SIZE(NEW_X), xtemp4 + + movsd 0 * SIZE(NEW_Y), yy1 + movhpd 1 * SIZE(NEW_Y), yy1 + movsd 2 * SIZE(NEW_Y), yy2 + movhpd 3 * SIZE(NEW_Y), yy2 + + movq NEW_X, XX + movq NEW_Y, YY + + movq IS, I + sarq $1, I + jle .L28 + ALIGN_3 + +.L22: + movapd xtemp1, xt1 + movapd 8 * SIZE(XX), xtemp1 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, yy1 + MOVDDUP(2 * SIZE, A1, a1) + + movapd xtemp2, xt1 + movapd 10 * SIZE(XX), xtemp2 + mulpd a2, xt1 + mulpd atemp2, a2 + ADD xt1, xsum2 + addpd a2, yy1 + MOVDDUP(3 * SIZE, A1, a2) + + movlpd yy1, 0 * SIZE(YY) + movhpd yy1, 1 * SIZE(YY) + movsd 4 * SIZE(YY), yy1 + movhpd 5 * SIZE(YY), yy1 + + movapd xtemp3, xt1 + movapd 12 * SIZE(XX), xtemp3 + mulpd a1, xt1 + mulpd atemp1, a1 + addpd xt1, xsum1 + addpd a1, 
yy2 + MOVDDUP(4 * SIZE, A1, a1) + + movapd xtemp4, xt1 + movapd 14 * SIZE(XX), xtemp4 + mulpd a2, xt1 + mulpd atemp2, a2 + ADD xt1, xsum2 + addpd a2, yy2 + MOVDDUP(5 * SIZE, A1, a2) + + movlpd yy2, 2 * SIZE(YY) + movhpd yy2, 3 * SIZE(YY) + movsd 6 * SIZE(YY), yy2 + movhpd 7 * SIZE(YY), yy2 + + addq $8 * SIZE, XX + addq $4 * SIZE, YY + addq $4 * SIZE, A1 + + decq I + jg .L22 + ALIGN_3 + +.L28: + MOVDDUP(0 * SIZE, A1, a1) + +#ifndef HEMV + MOVDDUP(1 * SIZE, A1, a2) + + mulpd atemp1, a1 + mulpd atemp2, a2 + addpd a1, xsum1 + addpd a2, xsum2 + +#else + mulpd atemp1, a1 + addpd a1, xsum1 +#endif + + addpd xsum2, xsum1 + addpd xsum1, yy1 + + movlpd yy1, 0 * SIZE(YY) + movhpd yy1, 1 * SIZE(YY) + ALIGN_3 + +.L990: + cmpq $2 * SIZE, INCY + je .L999 + + movq M, %rax + sarq $2, %rax + jle .L997 + ALIGN_3 + +.L996: + movapd 0 * SIZE(NEW_Y), %xmm0 + movapd 2 * SIZE(NEW_Y), %xmm1 + movapd 4 * SIZE(NEW_Y), %xmm2 + movapd 6 * SIZE(NEW_Y), %xmm3 + + movsd %xmm0, 0 * SIZE(Y) + movhpd %xmm0, 1 * SIZE(Y) + addq INCY, Y + movsd %xmm1, 0 * SIZE(Y) + movhpd %xmm1, 1 * SIZE(Y) + addq INCY, Y + movsd %xmm2, 0 * SIZE(Y) + movhpd %xmm2, 1 * SIZE(Y) + addq INCY, Y + movsd %xmm3, 0 * SIZE(Y) + movhpd %xmm3, 1 * SIZE(Y) + addq INCY, Y + + addq $8 * SIZE, NEW_Y + decq %rax + jg .L996 + ALIGN_3 + +.L997: + movq M, %rax + andq $3, %rax + jle .L999 + ALIGN_3 + +.L998: + movapd 0 * SIZE(NEW_Y), %xmm0 + + movsd %xmm0, 0 * SIZE(Y) + movhpd %xmm0, 1 * SIZE(Y) + addq INCY, Y + + addq $2 * SIZE, NEW_Y + + decq %rax + jg .L998 + ALIGN_3 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + EPILOGUE diff --git a/kernel/x86_64/ztrsm_kernel_LN_2x1_atom.S b/kernel/x86_64/ztrsm_kernel_LN_2x1_atom.S new file mode 100644 index 0000000000..31bd57b432 --- /dev/null +++ b/kernel/x86_64/ztrsm_kernel_LN_2x1_atom.S @@ -0,0 +1,995 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %rdi +#define N %rsi +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define J %r12 +#define AO %r13 +#define BO %r14 +#define CO1 %r15 +#define BB %rbx +#define KK %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define KKK 56(%rsp) +#define AORIG 64(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define KKK 232(%rsp) +#define AORIG 240(%rsp) +#endif + + +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 8 + 3) + +#ifndef CONJ +#define ADDSD1 addsd +#define ADDSD2 addsd +#define ADDSD3 addsd +#define ADDSD4 subsd + +#elif defined(LN) || defined(LT) +#define ADDSD1 addsd +#define ADDSD2 addsd +#define ADDSD3 subsd +#define ADDSD4 addsd +#else +#define ADDSD1 addsd +#define ADDSD2 subsd +#define ADDSD3 addsd +#define ADDSD4 addsd +#endif + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, M + movq ARG2, N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#endif + + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + movq KK, OFFSET + + salq $ZBASE_SHIFT, LDC + +#ifdef LN + movq M, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + movq N, %rax + salq $ZBASE_SHIFT, %rax + imulq K, %rax + addq %rax, B + + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, KK + subq OFFSET, KK +#endif + + movq N, J + testq N, N + jle .L999 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, B + subq LDC, C +#endif + + movq C, CO1 + +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif +#ifdef LT + movq OFFSET, KK +#endif + + movq K, 
%rax + salq $ZBASE_SHIFT, %rax + leaq (B, %rax), BB + + testq $1, M + jle .L20 + +#ifdef LN + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd 1 * SIZE(AO), %xmm4 + xorps %xmm5, %xmm5 + movsd 2 * SIZE(AO), %xmm5 + xorps %xmm6, %xmm6 + movsd 3 * SIZE(AO), %xmm7 + + movsd 0 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + movsd 1 * SIZE(BO), %xmm3 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + ADDSD2 %xmm2, %xmm9 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + ADDSD4 %xmm6, %xmm11 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + movsd 2 * SIZE(BO), %xmm1 + + ADDSD1 %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm2 + + ADDSD3 %xmm4, %xmm10 + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + + ADDSD2 %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + ADDSD4 %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 4 * SIZE(BO), %xmm1 + + ADDSD1 %xmm5, %xmm8 + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm2 + + ADDSD3 %xmm7, %xmm10 + movsd 7 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm6 + movsd 5 * SIZE(BO), %xmm3 + + ADDSD2 %xmm2, %xmm9 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + ADDSD4 %xmm6, %xmm11 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + movsd 6 * SIZE(BO), %xmm1 + + ADDSD1 %xmm0, %xmm8 + movsd 8 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm2 + + ADDSD3 %xmm4, %xmm10 + movsd 9 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm6 + movsd 7 * SIZE(BO), %xmm3 + + ADDSD2 %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + ADDSD4 %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 8 * SIZE(BO), %xmm1 + + ADDSD1 %xmm5, %xmm8 + movsd 10 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm2 + + ADDSD3 %xmm7, %xmm10 + movsd 11 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm6 + movsd 9 * SIZE(BO), %xmm3 + + addq $8 * SIZE, AO + addq $8 * SIZE, BO + + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + BRANCH + je .L29 + ALIGN_4 + +.L26: + ADDSD2 %xmm2, %xmm9 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + ADDSD4 %xmm6, %xmm11 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + movsd 2 * SIZE(BO), %xmm1 + + mulsd %xmm3, %xmm2 + ADDSD1 %xmm0, %xmm8 + movsd 2 * SIZE(AO), %xmm0 + + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + ADDSD3 %xmm4, %xmm10 + movsd 3 * SIZE(AO), %xmm4 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + decq %rax + BRANCH + jg .L26 + ALIGN_4 + +.L29: + ADDSD2 %xmm2, %xmm9 + ADDSD4 %xmm6, %xmm11 + + addsd %xmm11, %xmm8 + addsd %xmm9, %xmm10 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BO), %xmm0 + movsd 1 * SIZE(BO), %xmm1 +#else + movsd 0 * SIZE(AO), %xmm0 + movsd 1 * SIZE(AO), %xmm1 +#endif + + subsd %xmm8, %xmm0 + subsd %xmm10, %xmm1 + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(AO), %xmm6 + movaps %xmm0, %xmm5 + movsd 1 * SIZE(AO), %xmm7 + movaps %xmm1, %xmm4 + + mulsd %xmm6, %xmm0 + mulsd %xmm6, 
%xmm1 + mulsd %xmm7, %xmm5 + mulsd %xmm7, %xmm4 + + ADDSD4 %xmm4, %xmm0 + ADDSD3 %xmm5, %xmm1 +#endif + +#if defined(RN) || defined(RT) + movsd 0 * SIZE(BO), %xmm8 + movaps %xmm0, %xmm5 + movsd 1 * SIZE(BO), %xmm9 + movaps %xmm1, %xmm4 + + mulsd %xmm8, %xmm0 + mulsd %xmm8, %xmm1 + mulsd %xmm9, %xmm5 + mulsd %xmm9, %xmm4 + + ADDSD4 %xmm4, %xmm0 + ADDSD2 %xmm5, %xmm1 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm1, 1 * SIZE(BO) +#else + movsd %xmm0, 0 * SIZE(AO) + movsd %xmm1, 1 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L20: + movq M, I + sarq $1, I + jle .L99 + ALIGN_4 + +.L10: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + prefetcht0 0 * SIZE(BB) + subq $-8 * SIZE, BB + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd 1 * SIZE(AO), %xmm4 + xorps %xmm5, %xmm5 + movsd 2 * SIZE(AO), %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movsd 0 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + movsd 1 * SIZE(BO), %xmm3 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + + prefetcht0 3 * SIZE(CO1) + xorps %xmm12, %xmm12 + xorps %xmm13, %xmm13 + xorps %xmm14, %xmm14 + xorps %xmm15, %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L15 + ALIGN_4 + +.L12: + ADDSD2 %xmm2, %xmm13 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + ADDSD3 %xmm7, %xmm14 + movsd 3 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + ADDSD4 %xmm6, %xmm15 + PREFETCH ((PREFETCHSIZE) >> 1 + 0) * SIZE(BO) + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + ADDSD1 %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + ADDSD2 %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + ADDSD3 %xmm4, %xmm10 + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + ADDSD4 %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 2 * SIZE(BO), %xmm1 + + ADDSD1 %xmm5, %xmm12 + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + + ADDSD2 %xmm2, %xmm13 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + ADDSD3 %xmm7, %xmm14 + movsd 7 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + ADDSD4 %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + ADDSD1 %xmm0, %xmm8 + movsd 8 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + ADDSD2 %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + ADDSD3 %xmm4, %xmm10 + movsd 9 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + ADDSD4 %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 4 * SIZE(BO), %xmm1 + + ADDSD1 %xmm5, %xmm12 + movsd 10 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 5 * SIZE(BO), %xmm3 + + ADDSD2 %xmm2, %xmm13 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + ADDSD3 %xmm7, %xmm14 + movsd 11 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + ADDSD4 %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + ADDSD1 %xmm0, %xmm8 
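The ADDSD1..ADDSD4 macros chosen near the top of this file turn the four real products of a complex multiply into adds or subtracts, so the same loop body serves the plain and conjugated cases; the partial sums are kept in separate registers and folded together at the .L29/.L18 blocks. A scalar C sketch of that arithmetic, with a hypothetical helper that is not the kernel's interface:

    typedef struct { double r, i; } zdouble;

    /* acc += a * b, or acc += conj(a) * b: the four real products are
       the same, only their signs differ between the two variants. */
    static void zmuladd(zdouble *acc, zdouble a, zdouble b, int conj_a)
    {
        double rr = a.r * b.r, ri = a.r * b.i;
        double ir = a.i * b.r, ii = a.i * b.i;

        if (!conj_a) {                 /* acc += a * b       */
            acc->r += rr - ii;
            acc->i += ri + ir;
        } else {                       /* acc += conj(a) * b */
            acc->r += rr + ii;
            acc->i += ri - ir;
        }
    }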
+ movsd 12 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + ADDSD2 %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + ADDSD3 %xmm4, %xmm10 + movsd 13 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + ADDSD4 %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 6 * SIZE(BO), %xmm1 + + ADDSD1 %xmm5, %xmm12 + movsd 14 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 7 * SIZE(BO), %xmm3 + + ADDSD2 %xmm2, %xmm13 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + ADDSD3 %xmm7, %xmm14 + movsd 15 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + subq $-16 * SIZE, AO + + ADDSD4 %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + ADDSD1 %xmm0, %xmm8 + movsd 0 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + ADDSD2 %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + addq $ 8 * SIZE, BO + + ADDSD3 %xmm4, %xmm10 + movsd 1 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + decq %rax + + ADDSD4 %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 0 * SIZE(BO), %xmm1 + + ADDSD1 %xmm5, %xmm12 + movsd 2 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 1 * SIZE(BO), %xmm3 + + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + BRANCH + je .L18 + ALIGN_4 + +.L16: + ADDSD2 %xmm2, %xmm13 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + ADDSD3 %xmm7, %xmm14 + movsd 3 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + ADDSD4 %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + ADDSD1 %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + ADDSD2 %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + ADDSD3 %xmm4, %xmm10 + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + ADDSD4 %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 2 * SIZE(BO), %xmm1 + + ADDSD1 %xmm5, %xmm12 + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + decq %rax + BRANCH + jg .L16 + ALIGN_4 + +.L18: + ADDSD2 %xmm2, %xmm13 + ADDSD3 %xmm7, %xmm14 + ADDSD4 %xmm6, %xmm15 + + addsd %xmm11, %xmm8 + addsd %xmm9, %xmm10 + addsd %xmm15, %xmm12 + addsd %xmm13, %xmm14 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BO), %xmm0 + movsd 1 * SIZE(BO), %xmm1 + movsd 2 * SIZE(BO), %xmm2 + movsd 3 * SIZE(BO), %xmm3 +#else + movsd 0 * SIZE(AO), %xmm0 + movsd 1 * SIZE(AO), %xmm1 + movsd 2 * SIZE(AO), %xmm2 + movsd 3 * SIZE(AO), %xmm3 +#endif + + subsd %xmm8, %xmm0 + subsd %xmm10, %xmm1 + subsd %xmm12, %xmm2 + subsd %xmm14, %xmm3 + +#ifdef LN + movsd 6 * SIZE(AO), %xmm6 + movsd 7 * SIZE(AO), %xmm7 + + movaps %xmm2, %xmm5 + movaps %xmm3, %xmm4 + + mulsd %xmm6, %xmm2 + mulsd %xmm6, %xmm3 + movsd 4 * SIZE(AO), %xmm6 + mulsd %xmm7, %xmm5 + mulsd %xmm7, %xmm4 + movsd 5 * SIZE(AO), %xmm7 + + ADDSD4 %xmm4, %xmm2 + ADDSD3 %xmm5, %xmm3 + + movaps %xmm2, %xmm4 + movaps %xmm3, %xmm5 + + mulsd %xmm6, %xmm4 + mulsd %xmm7, %xmm5 + mulsd %xmm3, %xmm6 + mulsd %xmm2, %xmm7 + + subsd %xmm4, %xmm0 + subsd %xmm6, %xmm1 + movsd 0 * SIZE(AO), %xmm6 + + ADDSD3 %xmm5, %xmm0 + ADDSD4 %xmm7, %xmm1 + movsd 1 * SIZE(AO), %xmm7 + + movaps %xmm0, %xmm5 + movaps %xmm1, %xmm4 + + mulsd %xmm6, %xmm0 + mulsd %xmm6, %xmm1 + mulsd %xmm7, %xmm5 + mulsd %xmm7, %xmm4 + + ADDSD4 %xmm4, %xmm0 + ADDSD3 %xmm5, %xmm1 +#endif + +#ifdef LT + movsd 0 * SIZE(AO), %xmm6 + movsd 1 * SIZE(AO), %xmm7 + + 
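The LN/LT blocks around this point perform the per-block triangular solve: the accumulated products are subtracted from the loaded right-hand side and each row is then multiplied against its diagonal entry. No divide instructions appear in the solve, so the diagonal entries of the packed triangular block are assumed here to hold reciprocals already (an assumption about the packing stage elsewhere in this import). Under that assumption, and ignoring the conjugated sign variants, the 2x1 forward substitution amounts to:

    typedef struct { double r, i; } zdouble;

    /* Ordinary complex multiplication. */
    static zdouble zmul(zdouble a, zdouble b)
    {
        zdouble c = { a.r * b.r - a.i * b.i, a.r * b.i + a.i * b.r };
        return c;
    }

    /* Forward substitution over one 2x2 complex triangular block.
       inv_d holds the (assumed pre-inverted) diagonal entries and
       l10 the off-diagonal entry; b is overwritten with the solution. */
    static void ztrsm_2x1_lt(const zdouble inv_d[2], zdouble l10, zdouble b[2])
    {
        b[0] = zmul(inv_d[0], b[0]);                  /* x0 = b0 / L00 */
        b[1].r -= l10.r * b[0].r - l10.i * b[0].i;    /* b1 -= L10 * x0 */
        b[1].i -= l10.r * b[0].i + l10.i * b[0].r;
        b[1] = zmul(inv_d[1], b[1]);                  /* x1 = b1 / L11 */
    }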
movaps %xmm0, %xmm5 + movaps %xmm1, %xmm4 + + mulsd %xmm6, %xmm0 + mulsd %xmm6, %xmm1 + movsd 2 * SIZE(AO), %xmm6 + mulsd %xmm7, %xmm5 + mulsd %xmm7, %xmm4 + movsd 3 * SIZE(AO), %xmm7 + + ADDSD4 %xmm4, %xmm0 + ADDSD3 %xmm5, %xmm1 + + movaps %xmm0, %xmm4 + movaps %xmm1, %xmm5 + + mulsd %xmm6, %xmm4 + mulsd %xmm7, %xmm5 + mulsd %xmm1, %xmm6 + mulsd %xmm0, %xmm7 + + subsd %xmm4, %xmm2 + subsd %xmm6, %xmm3 + movsd 6 * SIZE(AO), %xmm6 + + ADDSD3 %xmm5, %xmm2 + ADDSD4 %xmm7, %xmm3 + movsd 7 * SIZE(AO), %xmm7 + + movaps %xmm2, %xmm5 + movaps %xmm3, %xmm4 + + mulsd %xmm6, %xmm2 + mulsd %xmm6, %xmm3 + mulsd %xmm7, %xmm5 + mulsd %xmm7, %xmm4 + + ADDSD4 %xmm4, %xmm2 + ADDSD3 %xmm5, %xmm3 +#endif + +#if defined(RN) || defined(RT) + movsd 0 * SIZE(BO), %xmm8 + movaps %xmm0, %xmm5 + movsd 1 * SIZE(BO), %xmm9 + movaps %xmm1, %xmm4 + movaps %xmm2, %xmm7 + movaps %xmm3, %xmm6 + + mulsd %xmm8, %xmm0 + mulsd %xmm8, %xmm1 + mulsd %xmm9, %xmm5 + mulsd %xmm9, %xmm4 + + ADDSD4 %xmm4, %xmm0 + mulsd %xmm8, %xmm2 + ADDSD2 %xmm5, %xmm1 + mulsd %xmm8, %xmm3 + mulsd %xmm9, %xmm7 + mulsd %xmm9, %xmm6 + + ADDSD4 %xmm6, %xmm2 + ADDSD2 %xmm7, %xmm3 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 1 * SIZE(CO1) + movsd %xmm2, 2 * SIZE(CO1) + movsd %xmm3, 3 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm1, 1 * SIZE(BO) + movsd %xmm2, 2 * SIZE(BO) + movsd %xmm3, 3 * SIZE(BO) +#else + movsd %xmm0, 0 * SIZE(AO) + movsd %xmm1, 1 * SIZE(AO) + movsd %xmm2, 2 * SIZE(AO) + movsd %xmm3, 3 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + decq I # i -- + jg .L10 + ALIGN_4 + +.L99: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + decq J # j -- + jg .L01 + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/ztrsm_kernel_LN_2x2_core2.S b/kernel/x86_64/ztrsm_kernel_LN_2x2_core2.S new file mode 100644 index 0000000000..065abe0ce5 --- /dev/null +++ b/kernel/x86_64/ztrsm_kernel_LN_2x2_core2.S @@ -0,0 +1,2162 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define POSINV 0(%rsp) +#define J 16(%rsp) +#define OFFSET 24(%rsp) +#define KK 32(%rsp) +#define KKK 40(%rsp) +#define AORIG 48(%rsp) +#define BORIG 56(%rsp) +#define BUFFER 128(%rsp) + +#define PREFETCH_R (8 * 4 + 0) +#define PREFETCH_W (PREFETCH_R) + +#define PREFETCHSIZE (8 * 17 + 2) +#define PREFETCH prefetcht0 + +#ifndef CONJ +#define NN +#else +#if defined(LN) || defined(LT) +#define CN +#else +#define NC +#endif +#endif + +#define ADD1 addpd +#define ADD2 addpd + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C +#endif + + movq OLD_LDC, LDC + movq OLD_OFFSET, %rax + + movq %rsp, %r15 # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movq %rax, KK + movq %rax, OFFSET + + movq OLD_M, M + movq OLD_N, N + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + pcmpeqb %xmm15, %xmm15 + psllq $63, %xmm15 # Generate mask + 
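The pcmpeqb/psllq pair above leaves 0x8000000000000000 in every 64-bit lane of %xmm15, and the stores into POSINV just below keep that pattern in only one lane, so a later xorpd against POSINV negates exactly one half of a packed pair of doubles. The scalar form of the same bit trick, as a sketch:

    #include <stdint.h>
    #include <string.h>

    /* Toggle only the IEEE-754 sign bit: equivalent to negating x. */
    static double flip_sign(double x)
    {
        uint64_t bits;
        memcpy(&bits, &x, sizeof bits);    /* type-pun without UB   */
        bits ^= UINT64_C(1) << 63;         /* the psllq $63 pattern */
        memcpy(&x, &bits, sizeof bits);
        return x;
    }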
pxor %xmm2, %xmm2 + + movlpd %xmm2, 0 + POSINV + movlpd %xmm15, 8 + POSINV + + salq $ZBASE_SHIFT, LDC + +#ifdef LN + movq M, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + movq N, %rax + salq $ZBASE_SHIFT, %rax + imulq K, %rax + addq %rax, B + + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + movq N, J + sarq $1, J # j = (n >> 2) + jle .L100 + ALIGN_4 + +.L01: +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq 16 * SIZE + BUFFER, BO + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L03 + + addq %rax, %rax + ALIGN_4 + +.L02: + prefetcht0 (PREFETCH_R + 0) * SIZE(B) + + movddup -16 * SIZE(B), %xmm8 + movddup -15 * SIZE(B), %xmm9 + movddup -14 * SIZE(B), %xmm10 + movddup -13 * SIZE(B), %xmm11 + movddup -12 * SIZE(B), %xmm12 + movddup -11 * SIZE(B), %xmm13 + movddup -10 * SIZE(B), %xmm14 + movddup -9 * SIZE(B), %xmm15 + + prefetcht0 (PREFETCH_W + 0) * SIZE(BO) + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) + movapd %xmm10, -12 * SIZE(BO) + movapd %xmm11, -10 * SIZE(BO) + + prefetcht0 (PREFETCH_W + 8) * SIZE(BO) + + movapd %xmm12, -8 * SIZE(BO) + movapd %xmm13, -6 * SIZE(BO) + movapd %xmm14, -4 * SIZE(BO) + movapd %xmm15, -2 * SIZE(BO) + + addq $ 8 * SIZE, B + subq $-16 * SIZE, BO + decq %rax + jne .L02 + ALIGN_4 + +.L03: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L05 + ALIGN_4 + +.L04: + movddup -16 * SIZE(B), %xmm8 + movddup -15 * SIZE(B), %xmm9 + movddup -14 * SIZE(B), %xmm10 + movddup -13 * SIZE(B), %xmm11 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) + movapd %xmm10, -12 * SIZE(BO) + movapd %xmm11, -10 * SIZE(BO) + + addq $ 4 * SIZE, B + addq $ 8 * SIZE, BO + + decq %rax + jne .L04 + ALIGN_4 + +.L05: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 + +#ifndef RT + leaq (C, LDC, 2), C +#endif + + testq $1, M + jle .L30 + +#ifdef LN + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + addq %rax, AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L42 + +.L41: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + movapd -12 * SIZE(BO), %xmm4 + movapd -10 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + movapd -14 * SIZE(AO), %xmm0 + movapd -8 * SIZE(BO), %xmm2 + movapd -6 
* SIZE(BO), %xmm3 + movapd -4 * SIZE(BO), %xmm4 + movapd -2 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + movapd -12 * SIZE(AO), %xmm0 + movapd 0 * SIZE(BO), %xmm2 + movapd 2 * SIZE(BO), %xmm3 + movapd 4 * SIZE(BO), %xmm4 + movapd 6 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + movapd -10 * SIZE(AO), %xmm0 + movapd 8 * SIZE(BO), %xmm2 + movapd 10 * SIZE(BO), %xmm3 + movapd 12 * SIZE(BO), %xmm4 + movapd 14 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + subq $ -8 * SIZE, AO + subq $-32 * SIZE, BO + subq $1, %rax + jne .L41 + +.L42: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movapd POSINV, %xmm7 + + andq $3, %rax # if (k & 1) + BRANCH + jle .L44 + +.L43: + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + movapd -12 * SIZE(BO), %xmm4 + movapd -10 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + jg .L43 + ALIGN_4 + +.L44: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + + SHUFPD_1 %xmm9, %xmm9 + SHUFPD_1 %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm7, %xmm9 + xorpd %xmm7, %xmm11 +#else + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm9, %xmm8 + subpd %xmm11, %xmm10 +#else + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm9 + movapd -14 * SIZE(B), %xmm11 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 +#else + movapd -16 * SIZE(AO), %xmm9 + movapd -14 * SIZE(AO), %xmm11 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm7, %xmm7 +#endif + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + + addpd %xmm8, %xmm9 + addpd %xmm10, %xmm11 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + movddup -14 * SIZE(B), %xmm2 + movddup -13 * SIZE(B), %xmm3 + movddup -10 * SIZE(B), %xmm4 + movddup -9 * SIZE(B), %xmm5 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 + + movapd %xmm9, %xmm8 + pshufd $0x4e, %xmm9, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm12 + + subpd %xmm8, %xmm11 + subpd %xmm12, %xmm11 + + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, 
%xmm10 + + mulpd %xmm4, %xmm11 + mulpd %xmm5, %xmm10 + + addpd %xmm10, %xmm11 +#endif + +#ifdef RT + movddup -10 * SIZE(B), %xmm0 + movddup -9 * SIZE(B), %xmm1 + movddup -12 * SIZE(B), %xmm2 + movddup -11 * SIZE(B), %xmm3 + movddup -16 * SIZE(B), %xmm4 + movddup -15 * SIZE(B), %xmm5 + + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm10 + + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + + addpd %xmm10, %xmm11 + + movapd %xmm11, %xmm8 + pshufd $0x4e, %xmm11, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm12 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm9 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + + movsd %xmm11, 0 * SIZE(CO2) + movhpd %xmm11, 1 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(B) + movapd %xmm11, -14 * SIZE(B) + + movddup %xmm9, %xmm8 + unpckhpd %xmm9, %xmm9 + movddup %xmm11, %xmm10 + unpckhpd %xmm11, %xmm11 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) + movapd %xmm10, -12 * SIZE(BO) + movapd %xmm11, -10 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) + movapd %xmm11, -14 * SIZE(AO) + +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + movq M, I + sarq $1, I # i = (m >> 2) + jle .L99 + ALIGN_4 + +.L10: + leaq (PREFETCH_R + 0) * SIZE(B), BB + +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + prefetcht2 0 * SIZE(BB) + +#ifdef LN + pxor %xmm8, %xmm8 + prefetcht1 -3 * SIZE(CO1) + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + prefetcht1 -3 * SIZE(CO2) + pxor %xmm11, %xmm11 +#else + pxor %xmm8, %xmm8 + prefetcht1 3 * SIZE(CO1) + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + prefetcht1 3 * SIZE(CO2) + pxor %xmm11, %xmm11 +#endif + + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 + pxor %xmm14, %xmm14 + pxor %xmm15, %xmm15 + + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + + subq $-8 * SIZE, BB + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + ADD1 %xmm2, %xmm10 + movapd -16 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm14 + movapd %xmm2, %xmm3 + movapd -14 * SIZE(AO), %xmm1 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + ADD2 %xmm4, %xmm11 + movapd -14 * SIZE(BO), %xmm4 + ADD2 %xmm5, %xmm15 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + movapd -12 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm12 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + ADD2 %xmm4, %xmm9 + movapd -10 * SIZE(BO), %xmm4 + ADD2 %xmm5, %xmm13 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + movapd -12 * 
SIZE(AO), %xmm0 + ADD1 %xmm2, %xmm10 + movapd -8 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm14 + movapd %xmm2, %xmm3 + movapd -10 * SIZE(AO), %xmm1 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + ADD2 %xmm4, %xmm11 + ADD2 %xmm5, %xmm15 + movapd -6 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + movapd -4 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + movapd -2 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + movapd -8 * SIZE(AO), %xmm0 + ADD1 %xmm2, %xmm10 + movapd 0 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm14 + movapd %xmm2, %xmm3 + movapd -6 * SIZE(AO), %xmm1 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + ADD2 %xmm4, %xmm11 + movapd 2 * SIZE(BO), %xmm4 + ADD2 %xmm5, %xmm15 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + movapd 4 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm12 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + ADD2 %xmm4, %xmm9 + movapd 6 * SIZE(BO), %xmm4 + ADD2 %xmm5, %xmm13 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + movapd -4 * SIZE(AO), %xmm0 + ADD1 %xmm2, %xmm10 + ADD1 %xmm3, %xmm14 + movapd 8 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + movapd -2 * SIZE(AO), %xmm1 + mulpd %xmm1, %xmm3 + ADD2 %xmm4, %xmm11 + movapd 10 * SIZE(BO), %xmm4 + ADD2 %xmm5, %xmm15 + subq $-32 * SIZE, BO + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + movapd -20 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + subq $-16 * SIZE, AO + mulpd %xmm1, %xmm3 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + movapd -18 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + subq $1, %rax + BRANCH + BRANCH + jg .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movapd POSINV, %xmm7 + + andq $3, %rax + BRANCH + BRANCH + je .L19 + ALIGN_4 + +.L16: + ADD1 %xmm2, %xmm10 + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm11 + ADD2 %xmm5, %xmm15 + + movapd -16 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -14 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + movapd -16 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm2 + movapd -14 * SIZE(AO), %xmm1 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + + movapd -12 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -10 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addq $4 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L16 + ALIGN_4 + +.L19: + ADD1 %xmm2, %xmm10 + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm11 + ADD2 %xmm5, %xmm15 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + + SHUFPD_1 %xmm9, %xmm9 + SHUFPD_1 %xmm11, %xmm11 + SHUFPD_1 %xmm13, %xmm13 + SHUFPD_1 %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm7, %xmm9 + xorpd %xmm7, %xmm11 + xorpd %xmm7, %xmm13 + xorpd %xmm7, %xmm15 +#else + xorpd 
%xmm7, %xmm8 + xorpd %xmm7, %xmm10 + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm9, %xmm8 + subpd %xmm11, %xmm10 + subpd %xmm13, %xmm12 + subpd %xmm15, %xmm14 +#else + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 + addpd %xmm13, %xmm12 + addpd %xmm15, %xmm14 +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm9 + movapd -14 * SIZE(B), %xmm11 + movapd -12 * SIZE(B), %xmm13 + movapd -10 * SIZE(B), %xmm15 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm12, %xmm13 + subpd %xmm14, %xmm15 +#else + movapd -16 * SIZE(AO), %xmm9 + movapd -14 * SIZE(AO), %xmm13 + movapd -12 * SIZE(AO), %xmm11 + movapd -10 * SIZE(AO), %xmm15 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm12, %xmm13 + subpd %xmm14, %xmm15 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm7, %xmm7 +#endif + +#ifdef LN + movddup -10 * SIZE(AO), %xmm0 + movddup -9 * SIZE(AO), %xmm1 + movddup -12 * SIZE(AO), %xmm2 + movddup -11 * SIZE(AO), %xmm3 + movddup -16 * SIZE(AO), %xmm4 + movddup -15 * SIZE(AO), %xmm5 + + pshufd $0x4e, %xmm13, %xmm12 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + mulpd %xmm0, %xmm15 + mulpd %xmm1, %xmm14 + + addpd %xmm12, %xmm13 + addpd %xmm14, %xmm15 + + movapd %xmm13, %xmm8 + movapd %xmm15, %xmm10 + pshufd $0x4e, %xmm13, %xmm12 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm2, %xmm8 + mulpd %xmm2, %xmm10 + mulpd %xmm3, %xmm12 + mulpd %xmm3, %xmm14 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm12, %xmm9 + subpd %xmm14, %xmm11 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 + + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm8 + mulpd %xmm4, %xmm11 + mulpd %xmm5, %xmm10 + + addpd %xmm8, %xmm9 + addpd %xmm10, %xmm11 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + movddup -14 * SIZE(AO), %xmm2 + movddup -13 * SIZE(AO), %xmm3 + movddup -10 * SIZE(AO), %xmm4 + movddup -9 * SIZE(AO), %xmm5 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + + addpd %xmm8, %xmm9 + addpd %xmm10, %xmm11 + + movapd %xmm9, %xmm8 + movapd %xmm11, %xmm10 + pshufd $0x4e, %xmm9, %xmm12 + pshufd $0x4e, %xmm11, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm2, %xmm8 + mulpd %xmm2, %xmm10 + mulpd %xmm3, %xmm12 + mulpd %xmm3, %xmm14 + + subpd %xmm8, %xmm13 + subpd %xmm10, %xmm15 + subpd %xmm12, %xmm13 + subpd %xmm14, %xmm15 + + pshufd $0x4e, %xmm13, %xmm12 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm4, %xmm13 + mulpd %xmm5, %xmm12 + mulpd %xmm4, %xmm15 + mulpd %xmm5, %xmm14 + + addpd %xmm12, %xmm13 + addpd %xmm14, %xmm15 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + movddup -14 * SIZE(B), %xmm2 + movddup -13 * SIZE(B), %xmm3 + movddup -10 * SIZE(B), %xmm4 + movddup -9 * SIZE(B), %xmm5 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm12 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + + addpd %xmm8, %xmm9 + addpd %xmm12, %xmm13 + + movapd %xmm9, %xmm8 + movapd %xmm13, %xmm10 + pshufd $0x4e, %xmm9, %xmm12 + pshufd $0x4e, 
%xmm13, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm2, %xmm8 + mulpd %xmm2, %xmm10 + mulpd %xmm3, %xmm12 + mulpd %xmm3, %xmm14 + + subpd %xmm8, %xmm11 + subpd %xmm10, %xmm15 + subpd %xmm12, %xmm11 + subpd %xmm14, %xmm15 + + pshufd $0x4e, %xmm11, %xmm10 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm10 + xorpd %xmm7, %xmm14 + + mulpd %xmm4, %xmm11 + mulpd %xmm5, %xmm10 + mulpd %xmm4, %xmm15 + mulpd %xmm5, %xmm14 + + addpd %xmm10, %xmm11 + addpd %xmm14, %xmm15 +#endif + +#ifdef RT + movddup -10 * SIZE(B), %xmm0 + movddup -9 * SIZE(B), %xmm1 + movddup -12 * SIZE(B), %xmm2 + movddup -11 * SIZE(B), %xmm3 + movddup -16 * SIZE(B), %xmm4 + movddup -15 * SIZE(B), %xmm5 + + pshufd $0x4e, %xmm11, %xmm10 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm10 + xorpd %xmm7, %xmm14 + + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + mulpd %xmm0, %xmm15 + mulpd %xmm1, %xmm14 + + addpd %xmm10, %xmm11 + addpd %xmm14, %xmm15 + + movapd %xmm11, %xmm8 + movapd %xmm15, %xmm10 + pshufd $0x4e, %xmm11, %xmm12 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm2, %xmm8 + mulpd %xmm2, %xmm10 + mulpd %xmm3, %xmm12 + mulpd %xmm3, %xmm14 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm13 + subpd %xmm12, %xmm9 + subpd %xmm14, %xmm13 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm12 + + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm8 + mulpd %xmm4, %xmm13 + mulpd %xmm5, %xmm12 + + addpd %xmm8, %xmm9 + addpd %xmm12, %xmm13 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + movsd %xmm13, 2 * SIZE(CO1) + movhpd %xmm13, 3 * SIZE(CO1) + + movsd %xmm11, 0 * SIZE(CO2) + movhpd %xmm11, 1 * SIZE(CO2) + movsd %xmm15, 2 * SIZE(CO2) + movhpd %xmm15, 3 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(B) + movapd %xmm11, -14 * SIZE(B) + movapd %xmm13, -12 * SIZE(B) + movapd %xmm15, -10 * SIZE(B) + + movddup %xmm9, %xmm8 + unpckhpd %xmm9, %xmm9 + movddup %xmm11, %xmm10 + unpckhpd %xmm11, %xmm11 + movddup %xmm13, %xmm12 + unpckhpd %xmm13, %xmm13 + movddup %xmm15, %xmm14 + unpckhpd %xmm15, %xmm15 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) + movapd %xmm10, -12 * SIZE(BO) + movapd %xmm11, -10 * SIZE(BO) + movapd %xmm12, -8 * SIZE(BO) + movapd %xmm13, -6 * SIZE(BO) + movapd %xmm14, -4 * SIZE(BO) + movapd %xmm15, -2 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) + movapd %xmm13, -14 * SIZE(AO) + movapd %xmm11, -12 * SIZE(AO) + movapd %xmm15, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L10 + ALIGN_4 + +.L99: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 2 * COMPSIZE), B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + + decq J # j -- + jg .L01 + +.L100: + testq $1, N + jle .L999 + +.L101: +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + 
subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L103 + ALIGN_4 + +.L102: + movddup -16 * SIZE(B), %xmm8 + movddup -15 * SIZE(B), %xmm9 + movddup -14 * SIZE(B), %xmm10 + movddup -13 * SIZE(B), %xmm11 + movddup -12 * SIZE(B), %xmm12 + movddup -11 * SIZE(B), %xmm13 + movddup -10 * SIZE(B), %xmm14 + movddup -9 * SIZE(B), %xmm15 + + movapd %xmm8, 0 * SIZE(BO) + movapd %xmm9, 2 * SIZE(BO) + movapd %xmm10, 4 * SIZE(BO) + movapd %xmm11, 6 * SIZE(BO) + movapd %xmm12, 8 * SIZE(BO) + movapd %xmm13, 10 * SIZE(BO) + movapd %xmm14, 12 * SIZE(BO) + movapd %xmm15, 14 * SIZE(BO) + + addq $ 8 * SIZE, B + subq $-16 * SIZE, BO + decq %rax + jne .L102 + ALIGN_4 + +.L103: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L105 + ALIGN_4 + +.L104: + movddup -16 * SIZE(B), %xmm8 + movddup -15 * SIZE(B), %xmm9 + + movapd %xmm8, 0 * SIZE(BO) + movapd %xmm9, 2 * SIZE(BO) + + addq $4 * SIZE, BO + addq $2 * SIZE, B + decq %rax + jne .L104 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + subq LDC, C +#endif + + movq C, CO1 +#ifndef RT + addq LDC, C +#endif + + testq $1, M + jle .L130 + ALIGN_4 + +.L140: +#ifdef LN + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L142 + +.L141: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + movapd -12 * SIZE(BO), %xmm4 + movapd -10 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + movapd -12 * SIZE(AO), %xmm0 + movapd -10 * SIZE(AO), %xmm1 + movapd -8 * SIZE(BO), %xmm2 + movapd -6 * SIZE(BO), %xmm3 + movapd -4 * SIZE(BO), %xmm4 + movapd -2 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + subq $ -8 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jne .L141 + +.L142: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movapd POSINV, %xmm7 + + andq $3, %rax # if (k & 1) + BRANCH + jle .L144 + +.L143: + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L143 + ALIGN_4 + +.L144: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + + movq 
AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + + SHUFPD_1 %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm7, %xmm9 +#else + xorpd %xmm7, %xmm8 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm9, %xmm8 +#else + addpd %xmm9, %xmm8 +#endif + + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm9 + + subpd %xmm8, %xmm9 +#else + movapd -16 * SIZE(AO), %xmm9 + + subpd %xmm8, %xmm9 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm7, %xmm7 +#endif + +#ifdef LN + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef RT + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(B) + + movddup %xmm9, %xmm8 + unpckhpd %xmm9, %xmm9 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $2 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L130: + movq M, I + sarq $1, I # i = (m >> 2) + jle .L199 + ALIGN_4 + +.L110: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 + prefetcht0 -3 * SIZE(CO1) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L112 + +.L111: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + + movapd -16 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -14 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + + movapd -12 * SIZE(AO), %xmm0 + movapd -10 * SIZE(AO), %xmm1 + + movapd -12 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -10 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 
+ + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + + movapd -8 * SIZE(AO), %xmm0 + movapd -6 * SIZE(AO), %xmm1 + + movapd -8 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -6 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + + movapd -4 * SIZE(AO), %xmm0 + movapd -2 * SIZE(AO), %xmm1 + + movapd -4 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -2 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + + subq $-16 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jne .L111 + ALIGN_4 + +.L112: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movapd POSINV, %xmm7 + andq $3, %rax # if (k & 1) + BRANCH + jle .L114 + +.L113: + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + + movapd -16 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -14 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L113 + ALIGN_4 + +.L114: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + + SHUFPD_1 %xmm9, %xmm9 + SHUFPD_1 %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm7, %xmm9 + xorpd %xmm7, %xmm13 +#else + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm12 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm9, %xmm8 + subpd %xmm13, %xmm12 +#else + addpd %xmm9, %xmm8 + addpd %xmm13, %xmm12 +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm9 + movapd -14 * SIZE(B), %xmm13 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm13 +#else + movapd -16 * SIZE(AO), %xmm9 + movapd -14 * SIZE(AO), %xmm13 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm13 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm7, %xmm7 +#endif + +#ifdef LN + movddup -10 * SIZE(AO), %xmm0 + movddup -9 * SIZE(AO), %xmm1 + movddup -12 * SIZE(AO), %xmm2 + movddup -11 * SIZE(AO), %xmm3 + movddup -16 * SIZE(AO), %xmm4 + movddup -15 * SIZE(AO), %xmm5 + + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + + addpd %xmm12, %xmm13 + + movapd %xmm13, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm12 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm9 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + movddup -14 * SIZE(AO), %xmm2 + movddup -13 * SIZE(AO), %xmm3 + movddup -10 * SIZE(AO), %xmm4 + movddup -9 * SIZE(AO), %xmm5 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd 
%xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 + + movapd %xmm9, %xmm8 + pshufd $0x4e, %xmm9, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm12 + + subpd %xmm8, %xmm13 + subpd %xmm12, %xmm13 + + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm4, %xmm13 + mulpd %xmm5, %xmm12 + + addpd %xmm12, %xmm13 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm12 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + + addpd %xmm8, %xmm9 + addpd %xmm12, %xmm13 +#endif + +#ifdef RT + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm12 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + + addpd %xmm8, %xmm9 + addpd %xmm12, %xmm13 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + movsd %xmm13, 2 * SIZE(CO1) + movhpd %xmm13, 3 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(B) + movapd %xmm13, -14 * SIZE(B) + + movddup %xmm9, %xmm8 + unpckhpd %xmm9, %xmm9 + movddup %xmm13, %xmm12 + unpckhpd %xmm13, %xmm13 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) + movapd %xmm12, -12 * SIZE(BO) + movapd %xmm13, -10 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) + movapd %xmm13, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L110 + ALIGN_4 + +.L199: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 1 * COMPSIZE), B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + + +.L999: + movq %r15, %rsp + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/ztrsm_kernel_LN_2x2_penryn.S b/kernel/x86_64/ztrsm_kernel_LN_2x2_penryn.S new file mode 100644 index 0000000000..093a580ba0 --- /dev/null +++ b/kernel/x86_64/ztrsm_kernel_LN_2x2_penryn.S @@ -0,0 +1,2016 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define KK %rdx +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define J 56(%rsp) +#define KKK 64(%rsp) +#define AORIG 72(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define J 232(%rsp) +#define KKK 240(%rsp) +#define AORIG 248(%rsp) + +#endif + +#define PREFETCH_R (8 * 4 + 0) +#define PREFETCHSIZE (8 * 21 + 6) +#define PREFETCH prefetcht0 + +#define ADD1 addpd +#define ADD2 addpd + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C +#endif + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + salq $ZBASE_SHIFT, LDC + + movq KK, OFFSET + negq KK + +#ifdef LN + movq M, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + 
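+/* LN: the block above advances C by M complex elements and A by M * K complex */ +/* elements, i.e. past the end of the panel, so that the M loop can walk the rows */ +/* from the bottom up (AORIG and CO1/CO2 are decremented in the LN paths below). */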
+#ifdef RT + movq N, %rax + salq $ZBASE_SHIFT, %rax + imulq K, %rax + addq %rax, B + + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RT + movq N, KK + subq OFFSET, KK +#endif + + movq N, J + sarq $1, J + NOBRANCH + jle .L40 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif + + movq K, %rax + salq $ZBASE_SHIFT + 1, %rax + leaq (B, %rax), BB + +#ifdef LT + movq OFFSET, KK +#endif + + testq $1, M + BRANCH + jle .L20 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + movaps -16 * SIZE(BO), %xmm2 + movaps -14 * SIZE(BO), %xmm3 + + pxor %xmm3, %xmm3 + pxor %xmm5, %xmm5 + + movapd %xmm3, %xmm8 + movapd %xmm3, %xmm9 + movapd %xmm3, %xmm12 + movapd %xmm3, %xmm13 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_4 + +.L22: + ADD1 %xmm3, %xmm12 + movaps -14 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd %xmm0, %xmm2 + ADD2 %xmm5, %xmm13 + mulpd %xmm0, %xmm7 + + ADD1 %xmm2, %xmm8 + movaps -12 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + ADD2 %xmm7, %xmm9 + mulpd %xmm0, %xmm5 + movaps -14 * SIZE(AO), %xmm0 + + ADD1 %xmm3, %xmm12 + movaps -10 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + ADD2 %xmm5, %xmm13 + mulpd %xmm0, %xmm7 + + ADD1 %xmm2, %xmm8 + movaps -8 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + ADD2 %xmm7, %xmm9 + mulpd %xmm0, %xmm5 + movaps -12 * SIZE(AO), %xmm0 + + ADD1 %xmm3, %xmm12 + movaps -6 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + ADD2 %xmm5, %xmm13 + mulpd %xmm0, %xmm7 + + ADD1 %xmm2, %xmm8 + movaps -4 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + ADD2 %xmm7, %xmm9 + mulpd %xmm0, %xmm5 + movaps -10 * SIZE(AO), %xmm0 + + ADD1 %xmm3, %xmm12 + movaps -2 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + ADD2 %xmm5, %xmm13 + mulpd %xmm0, %xmm7 + subq $ -8 * SIZE, AO + + ADD1 %xmm2, %xmm8 + movaps 0 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + ADD2 %xmm7, %xmm9 + mulpd %xmm0, %xmm5 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, BO + subq $1, %rax + BRANCH + jg .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + ADD1 %xmm3, %xmm12 + movaps -14 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + ADD2 %xmm5, %xmm13 + mulpd %xmm0, %xmm7 + + ADD1 %xmm2, %xmm8 + movaps -12 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + ADD2 %xmm7, %xmm9 + mulpd %xmm0, %xmm5 + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_4 + +.L28: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 
2), AO + leaq (B, %rax, 4), BO +#endif + + ADD1 %xmm3, %xmm12 + pcmpeqb %xmm7, %xmm7 + ADD2 %xmm5, %xmm13 + psllq $63, %xmm7 + +#ifndef CONJ + pshufd $0x40, %xmm7, %xmm0 + shufps $0x04, %xmm7, %xmm7 + + pxor %xmm0, %xmm8 + pxor %xmm0, %xmm12 +#else +#if defined(LN) || defined(LT) + pshufd $0x40, %xmm7, %xmm0 +#else + pshufd $0x04, %xmm7, %xmm0 +#endif + shufps $0x40, %xmm7, %xmm7 + + pxor %xmm0, %xmm9 + pxor %xmm0, %xmm13 +#endif + + haddpd %xmm9, %xmm8 + haddpd %xmm13, %xmm12 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm11 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm11 +#else + movapd -16 * SIZE(AO), %xmm9 + movapd -14 * SIZE(AO), %xmm11 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm11 +#endif + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + + addpd %xmm8, %xmm9 + addpd %xmm10, %xmm11 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + movddup -14 * SIZE(BO), %xmm2 + movddup -13 * SIZE(BO), %xmm3 + movddup -10 * SIZE(BO), %xmm4 + movddup -9 * SIZE(BO), %xmm5 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 + + movapd %xmm9, %xmm8 + pshufd $0x4e, %xmm9, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm12 + + subpd %xmm8, %xmm11 + subpd %xmm12, %xmm11 + + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm10 + + mulpd %xmm4, %xmm11 + mulpd %xmm5, %xmm10 + + addpd %xmm10, %xmm11 +#endif + +#ifdef RT + movddup -10 * SIZE(BO), %xmm0 + movddup -9 * SIZE(BO), %xmm1 + movddup -12 * SIZE(BO), %xmm2 + movddup -11 * SIZE(BO), %xmm3 + movddup -16 * SIZE(BO), %xmm4 + movddup -15 * SIZE(BO), %xmm5 + + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm10 + + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + + addpd %xmm10, %xmm11 + + movapd %xmm11, %xmm8 + pshufd $0x4e, %xmm11, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm12 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm9 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + + movsd %xmm11, 0 * SIZE(CO2) + movhpd %xmm11, 1 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) + movapd %xmm11, -14 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) + movapd %xmm11, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L20: + movq M, I + sarq $1, I + NOBRANCH + jle .L39 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + prefetcht2 -16 * SIZE(BB) + subq $-8 * SIZE, BB + + movaps -16 * SIZE(AO), %xmm0 + pxor %xmm3, %xmm3 + movaps -14 * 
SIZE(AO), %xmm1 + pxor %xmm4, %xmm4 + movaps -16 * SIZE(BO), %xmm2 + + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + +#ifdef LN + prefetcht0 -4 * SIZE(CO1) + movapd %xmm4, %xmm8 + movapd %xmm4, %xmm9 + prefetcht0 -4 * SIZE(CO2) +#else + prefetcht0 3 * SIZE(CO1) + movapd %xmm4, %xmm8 + movapd %xmm4, %xmm9 + prefetcht0 3 * SIZE(CO2) +#endif + movapd %xmm4, %xmm10 + movapd %xmm4, %xmm11 + + movapd %xmm4, %xmm12 + movapd %xmm4, %xmm13 + movapd %xmm4, %xmm14 + movapd %xmm4, %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + ADD1 %xmm3, %xmm12 + movaps -14 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps -12 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + ADD1 %xmm3, %xmm12 + movaps -10 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps -8 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -6 * SIZE(AO), %xmm1 + + ADD1 %xmm3, %xmm12 + movaps -6 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps -4 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -2 * SIZE(AO), %xmm1 + + ADD1 %xmm3, %xmm12 + movaps -2 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps 0 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps 0 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 2 * SIZE(AO), %xmm1 + + ADD1 %xmm3, %xmm12 + movaps 2 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps 4 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps 
%xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps 4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 6 * SIZE(AO), %xmm1 + + ADD1 %xmm3, %xmm12 + movaps 6 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps 8 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps 8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 10 * SIZE(AO), %xmm1 + + ADD1 %xmm3, %xmm12 + movaps 10 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps 12 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps 12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 14 * SIZE(AO), %xmm1 + + ADD1 %xmm3, %xmm12 + movaps 14 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps 16 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + subq $-32 * SIZE, AO + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -16 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -14 * SIZE(AO), %xmm1 + + subq $-32 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + ADD1 %xmm3, %xmm12 + movaps -14 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps -12 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_3 + +.L18: +#if defined(LN) || defined(RT) + movq KK, %rax + subq $2, %rax + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#endif + + ADD1 %xmm3, %xmm12 + pcmpeqb %xmm7, %xmm7 + ADD1 %xmm4, %xmm14 + psllq $63, %xmm7 + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + +#ifndef CONJ + pshufd $0x40, %xmm7, %xmm0 + shufps $0x04, %xmm7, %xmm7 + + pxor %xmm0, %xmm8 + pxor %xmm0, %xmm10 + pxor %xmm0, %xmm12 + pxor %xmm0, %xmm14 +#else +#if defined(LN) || defined(LT) + pshufd $0x40, %xmm7, %xmm0 +#else + pshufd $0x04, %xmm7, %xmm0 +#endif + shufps $0x40, %xmm7, %xmm7 + + pxor %xmm0, %xmm9 + pxor %xmm0, %xmm11 + 
pxor %xmm0, %xmm13 + pxor %xmm0, %xmm15 +#endif + + haddpd %xmm9, %xmm8 + haddpd %xmm11, %xmm10 + haddpd %xmm13, %xmm12 + haddpd %xmm15, %xmm14 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm11 + movapd -12 * SIZE(BO), %xmm13 + movapd -10 * SIZE(BO), %xmm15 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm11 + subpd %xmm10, %xmm13 + subpd %xmm14, %xmm15 +#else + movapd -16 * SIZE(AO), %xmm9 + movapd -14 * SIZE(AO), %xmm13 + movapd -12 * SIZE(AO), %xmm11 + movapd -10 * SIZE(AO), %xmm15 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm11 + subpd %xmm10, %xmm13 + subpd %xmm14, %xmm15 +#endif + +#ifdef LN + movddup -10 * SIZE(AO), %xmm0 + movddup -9 * SIZE(AO), %xmm1 + movddup -12 * SIZE(AO), %xmm2 + movddup -11 * SIZE(AO), %xmm3 + movddup -16 * SIZE(AO), %xmm4 + movddup -15 * SIZE(AO), %xmm5 + + pshufd $0x4e, %xmm13, %xmm12 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + mulpd %xmm0, %xmm15 + mulpd %xmm1, %xmm14 + + addpd %xmm12, %xmm13 + addpd %xmm14, %xmm15 + + movapd %xmm13, %xmm8 + movapd %xmm15, %xmm10 + pshufd $0x4e, %xmm13, %xmm12 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm2, %xmm8 + mulpd %xmm2, %xmm10 + mulpd %xmm3, %xmm12 + mulpd %xmm3, %xmm14 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm12, %xmm9 + subpd %xmm14, %xmm11 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 + + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm8 + mulpd %xmm4, %xmm11 + mulpd %xmm5, %xmm10 + + addpd %xmm8, %xmm9 + addpd %xmm10, %xmm11 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + movddup -14 * SIZE(AO), %xmm2 + movddup -13 * SIZE(AO), %xmm3 + movddup -10 * SIZE(AO), %xmm4 + movddup -9 * SIZE(AO), %xmm5 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + + addpd %xmm8, %xmm9 + addpd %xmm10, %xmm11 + + movapd %xmm9, %xmm8 + movapd %xmm11, %xmm10 + pshufd $0x4e, %xmm9, %xmm12 + pshufd $0x4e, %xmm11, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm2, %xmm8 + mulpd %xmm2, %xmm10 + mulpd %xmm3, %xmm12 + mulpd %xmm3, %xmm14 + + subpd %xmm8, %xmm13 + subpd %xmm10, %xmm15 + subpd %xmm12, %xmm13 + subpd %xmm14, %xmm15 + + pshufd $0x4e, %xmm13, %xmm12 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm4, %xmm13 + mulpd %xmm5, %xmm12 + mulpd %xmm4, %xmm15 + mulpd %xmm5, %xmm14 + + addpd %xmm12, %xmm13 + addpd %xmm14, %xmm15 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + movddup -14 * SIZE(BO), %xmm2 + movddup -13 * SIZE(BO), %xmm3 + movddup -10 * SIZE(BO), %xmm4 + movddup -9 * SIZE(BO), %xmm5 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm12 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + + addpd %xmm8, %xmm9 + addpd %xmm12, %xmm13 + + movapd %xmm9, %xmm8 + movapd %xmm13, %xmm10 + pshufd $0x4e, %xmm9, %xmm12 + pshufd $0x4e, %xmm13, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm2, %xmm8 + mulpd %xmm2, %xmm10 + mulpd %xmm3, %xmm12 + mulpd %xmm3, %xmm14 + + subpd %xmm8, %xmm11 + subpd %xmm10, %xmm15 + subpd %xmm12, %xmm11 + subpd %xmm14, %xmm15 + + pshufd $0x4e, %xmm11, %xmm10 + pshufd $0x4e, %xmm15, %xmm14 + 
+ xorpd %xmm7, %xmm10 + xorpd %xmm7, %xmm14 + + mulpd %xmm4, %xmm11 + mulpd %xmm5, %xmm10 + mulpd %xmm4, %xmm15 + mulpd %xmm5, %xmm14 + + addpd %xmm10, %xmm11 + addpd %xmm14, %xmm15 +#endif + +#ifdef RT + movddup -10 * SIZE(BO), %xmm0 + movddup -9 * SIZE(BO), %xmm1 + movddup -12 * SIZE(BO), %xmm2 + movddup -11 * SIZE(BO), %xmm3 + movddup -16 * SIZE(BO), %xmm4 + movddup -15 * SIZE(BO), %xmm5 + + pshufd $0x4e, %xmm11, %xmm10 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm10 + xorpd %xmm7, %xmm14 + + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + mulpd %xmm0, %xmm15 + mulpd %xmm1, %xmm14 + + addpd %xmm10, %xmm11 + addpd %xmm14, %xmm15 + + movapd %xmm11, %xmm8 + movapd %xmm15, %xmm10 + pshufd $0x4e, %xmm11, %xmm12 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm2, %xmm8 + mulpd %xmm2, %xmm10 + mulpd %xmm3, %xmm12 + mulpd %xmm3, %xmm14 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm13 + subpd %xmm12, %xmm9 + subpd %xmm14, %xmm13 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm12 + + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm8 + mulpd %xmm4, %xmm13 + mulpd %xmm5, %xmm12 + + addpd %xmm8, %xmm9 + addpd %xmm12, %xmm13 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + movsd %xmm13, 2 * SIZE(CO1) + movhpd %xmm13, 3 * SIZE(CO1) + + movsd %xmm11, 0 * SIZE(CO2) + movhpd %xmm11, 1 * SIZE(CO2) + movsd %xmm15, 2 * SIZE(CO2) + movhpd %xmm15, 3 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) + movapd %xmm11, -14 * SIZE(BO) + movapd %xmm13, -12 * SIZE(BO) + movapd %xmm15, -10 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) + movapd %xmm13, -14 * SIZE(AO) + movapd %xmm11, -12 * SIZE(AO) + movapd %xmm15, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + BRANCH + jg .L11 + ALIGN_4 + +.L39: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + + subq $1, J + BRANCH + jg .L01 + ALIGN_4 + +.L40: + testq $1, N + BRANCH + jle .L999 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif + + movq K, %rax + salq $ZBASE_SHIFT, %rax + leaq (B, %rax), BB + +#ifdef LT + movq OFFSET, KK +#endif + + testq $1, M + BRANCH + jle .L60 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + movaps -16 * SIZE(BO), %xmm2 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L65 + ALIGN_4 + +.L62: + 
PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -14 * SIZE(AO), %xmm0 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm7, %xmm9 + movaps -14 * SIZE(BO), %xmm2 + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -12 * SIZE(AO), %xmm0 + + ADD1 %xmm2, %xmm10 + ADD2 %xmm7, %xmm11 + movaps -12 * SIZE(BO), %xmm2 + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -10 * SIZE(AO), %xmm0 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm7, %xmm9 + movaps -10 * SIZE(BO), %xmm2 + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -8 * SIZE(AO), %xmm0 + + ADD1 %xmm2, %xmm10 + ADD2 %xmm7, %xmm11 + movaps -8 * SIZE(BO), %xmm2 + + subq $-8 * SIZE, AO + subq $-8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L62 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -14 * SIZE(AO), %xmm0 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm7, %xmm9 + movaps -14 * SIZE(BO), %xmm2 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L66 + ALIGN_4 + +.L68: +#if defined(LN) || defined(RT) + movq KK, %rax + subq $1, %rax + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + pshufd $0x40, %xmm7, %xmm0 + shufps $0x04, %xmm7, %xmm7 + + pxor %xmm0, %xmm8 +#else +#if defined(LN) || defined(LT) + pshufd $0x40, %xmm7, %xmm0 +#else + pshufd $0x04, %xmm7, %xmm0 +#endif + shufps $0x40, %xmm7, %xmm7 + + pxor %xmm0, %xmm9 +#endif + + haddpd %xmm9, %xmm8 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm9 + + subpd %xmm8, %xmm9 +#else + movapd -16 * SIZE(AO), %xmm9 + + subpd %xmm8, %xmm9 +#endif + +#ifdef LN + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef RT + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) +#endif + + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L60: + movq M, I + sarq $1, I # i = (m >> 2) + NOBRANCH + jle .L79 + ALIGN_4 + +.L51: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax 
+ subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + prefetcht2 -16 * SIZE(BB) + subq $-4 * SIZE, BB + + movaps -16 * SIZE(AO), %xmm0 + movaps -14 * SIZE(AO), %xmm1 + movaps -16 * SIZE(BO), %xmm2 + + prefetcht0 3 * SIZE(CO1) + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L55 + ALIGN_4 + +.L52: + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + ADD1 %xmm2, %xmm8 + movaps -14 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm12 + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm13 + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -6 * SIZE(AO), %xmm1 + + ADD1 %xmm2, %xmm8 + movaps -12 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm12 + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm13 + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -2 * SIZE(AO), %xmm1 + + ADD1 %xmm2, %xmm8 + movaps -10 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm12 + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm13 + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps 0 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 2 * SIZE(AO), %xmm1 + + ADD1 %xmm2, %xmm8 + movaps -8 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm12 + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm13 + + subq $-16 * SIZE, AO + subq $ -8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + ADD1 %xmm2, %xmm8 + movaps -14 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm12 + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm13 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L56 + ALIGN_4 + +.L58: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + pshufd $0x40, %xmm7, %xmm0 + shufps $0x04, %xmm7, %xmm7 + + pxor %xmm0, %xmm8 + pxor %xmm0, %xmm12 +#else +#if defined(LN) || defined(LT) + pshufd $0x40, %xmm7, %xmm0 +#else + pshufd $0x04, %xmm7, %xmm0 +#endif + shufps $0x40, %xmm7, %xmm7 + + pxor %xmm0, %xmm9 + pxor %xmm0, %xmm13 +#endif + + haddpd %xmm9, %xmm8 + haddpd %xmm13, %xmm12 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm13 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm13 +#else + movapd -16 * SIZE(AO), %xmm9 + movapd -14 * SIZE(AO), 
%xmm13 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm13 +#endif + +#ifdef LN + movddup -10 * SIZE(AO), %xmm0 + movddup -9 * SIZE(AO), %xmm1 + movddup -12 * SIZE(AO), %xmm2 + movddup -11 * SIZE(AO), %xmm3 + movddup -16 * SIZE(AO), %xmm4 + movddup -15 * SIZE(AO), %xmm5 + + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + + addpd %xmm12, %xmm13 + + movapd %xmm13, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm12 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm9 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + movddup -14 * SIZE(AO), %xmm2 + movddup -13 * SIZE(AO), %xmm3 + movddup -10 * SIZE(AO), %xmm4 + movddup -9 * SIZE(AO), %xmm5 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 + + movapd %xmm9, %xmm8 + pshufd $0x4e, %xmm9, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm12 + + subpd %xmm8, %xmm13 + subpd %xmm12, %xmm13 + + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm4, %xmm13 + mulpd %xmm5, %xmm12 + + addpd %xmm12, %xmm13 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm12 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + + addpd %xmm8, %xmm9 + addpd %xmm12, %xmm13 +#endif + +#ifdef RT + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm12 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + + addpd %xmm8, %xmm9 + addpd %xmm12, %xmm13 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + movsd %xmm13, 2 * SIZE(CO1) + movhpd %xmm13, 3 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) + movapd %xmm13, -14 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) + movapd %xmm13, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + decq I + BRANCH + jg .L51 + ALIGN_4 + +.L79: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/ztrsm_kernel_LN_2x2_sse2.S b/kernel/x86_64/ztrsm_kernel_LN_2x2_sse2.S 
new file mode 100644 index 0000000000..fb428cbf53 --- /dev/null +++ b/kernel/x86_64/ztrsm_kernel_LN_2x2_sse2.S @@ -0,0 +1,2278 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define J %r12 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define CO2 %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define POSINV 0(%rsp) +#define ALPHA_R 16(%rsp) +#define ALPHA_I 32(%rsp) +#define OFFSET 40(%rsp) +#define KK 48(%rsp) +#define KKK 56(%rsp) +#define AORIG 64(%rsp) +#define BORIG 72(%rsp) +#define BUFFER 128(%rsp) + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHNTA prefetchnta +#define PREFETCHSIZE (8 * 6 + 4) +#endif + +#ifdef GENERIC +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHNTA prefetchnta +#define PREFETCHSIZE (8 * 6 + 4) +#endif + +#define KERNEL1(xx) \ + mulpd %xmm8, %xmm9 ;\ + addpd %xmm9, %xmm0 ;\ + movapd 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulpd %xmm8, %xmm11 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\ + addpd %xmm11, %xmm1 ;\ + movapd 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm8, %xmm13 ;\ + mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\ + addpd %xmm13, %xmm2 ;\ + movapd 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm8, %xmm3 ;\ + movapd 8 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 + +#define KERNEL2(xx) \ + mulpd %xmm10, %xmm9 ;\ + addpd %xmm9, %xmm4 ;\ + movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulpd %xmm10, %xmm11 ;\ + addpd %xmm11, %xmm5 ;\ + movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm10, %xmm13 ;\ + mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ + addpd %xmm13, %xmm6 ;\ + movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm10, %xmm7 ;\ + movapd 10 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 + +#define KERNEL3(xx) \ + mulpd %xmm12, %xmm15 ;\ + addpd %xmm15, %xmm0 ;\ + movapd 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulpd %xmm12, %xmm11 ;\ + addpd %xmm11, %xmm1 ;\ + movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm12, %xmm13 ;\ + mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\ + addpd %xmm13, %xmm2 ;\ + movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm12, %xmm3 ;\ + movapd 12 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 + +#define KERNEL4(xx) \ + mulpd %xmm14, %xmm15 ;\ + addpd %xmm15, %xmm4 ;\ + movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulpd %xmm14, %xmm11 ;\ + addpd %xmm11, %xmm5 ;\ + movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm14, %xmm13 ;\ + mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ + addpd %xmm13, %xmm6 ;\ + movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm14, %xmm7 ;\ + movapd 14 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 + +#define KERNEL5(xx) \ + mulpd %xmm8, %xmm9 ;\ + addpd %xmm9, %xmm0 ;\ + movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulpd %xmm8, %xmm11 ;\ + PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ;\ + addpd %xmm11, %xmm1 ;\ + movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm8, %xmm13 ;\ + mulpd 22 * 
SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\ + addpd %xmm13, %xmm2 ;\ + movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm8, %xmm3 ;\ + movapd 16 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 + +#define KERNEL6(xx) \ + mulpd %xmm10, %xmm9 ;\ + addpd %xmm9, %xmm4 ;\ + movapd 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulpd %xmm10, %xmm11 ;\ + addpd %xmm11, %xmm5 ;\ + movapd 26 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm10, %xmm13 ;\ + mulpd 22 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ + addpd %xmm13, %xmm6 ;\ + movapd 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm10, %xmm7 ;\ + movapd 18 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 + +#define KERNEL7(xx) \ + mulpd %xmm12, %xmm15 ;\ + addpd %xmm15, %xmm0 ;\ + movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulpd %xmm12, %xmm11 ;\ + addpd %xmm11, %xmm1 ;\ + movapd 26 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm12, %xmm13 ;\ + mulpd 30 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\ + addpd %xmm13, %xmm2 ;\ + movapd 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm12, %xmm3 ;\ + movapd 20 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 + +#define KERNEL8(xx) \ + mulpd %xmm14, %xmm15 ;\ + addpd %xmm15, %xmm4 ;\ + movapd 40 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulpd %xmm14, %xmm11 ;\ + addpd %xmm11, %xmm5 ;\ + movapd 34 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm14, %xmm13 ;\ + mulpd 30 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ + addpd %xmm13, %xmm6 ;\ + movapd 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm14, %xmm7 ;\ + movapd 22 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 + +#ifndef CONJ +#define NN +#else +#if defined(LN) || defined(LT) +#define CN +#else +#define NC +#endif +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + movsd OLD_OFFSET, %xmm4 + + movaps %xmm3, %xmm0 + +#else + movq OLD_LDC, LDC + movsd OLD_OFFSET, %xmm4 + +#endif + + movq %rsp, %rbx # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movq OLD_M, M + movq OLD_N, N + + pcmpeqb %xmm15, %xmm15 + psllq $63, %xmm15 # Generate mask + pxor %xmm2, %xmm2 + + movlpd %xmm2, 0 + POSINV + movlpd %xmm15, 8 + POSINV + + movlpd %xmm4, OFFSET + movlpd %xmm4, KK + + salq $ZBASE_SHIFT, LDC + +#ifdef LN + movq M, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + movq N, %rax + salq $ZBASE_SHIFT, %rax + imulq K, %rax + addq %rax, B + + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + movq N, J + sarq $1, J # j = (n >> 2) + jle .L100 + ALIGN_4 + +.L01: +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 2), B + leaq (BO, 
%rax, 4), BO +#endif + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L03 + + addq %rax, %rax + ALIGN_4 + +.L02: + PREFETCHNTA 56 * SIZE(B) + + movlpd 0 * SIZE(B), %xmm0 + movlpd 1 * SIZE(B), %xmm1 + movlpd 2 * SIZE(B), %xmm2 + movlpd 3 * SIZE(B), %xmm3 + movlpd 4 * SIZE(B), %xmm4 + movlpd 5 * SIZE(B), %xmm5 + movlpd 6 * SIZE(B), %xmm6 + movlpd 7 * SIZE(B), %xmm7 + + movlpd %xmm0, 0 * SIZE(BO) + movlpd %xmm0, 1 * SIZE(BO) + movlpd %xmm1, 2 * SIZE(BO) + movlpd %xmm1, 3 * SIZE(BO) + movlpd %xmm2, 4 * SIZE(BO) + movlpd %xmm2, 5 * SIZE(BO) + movlpd %xmm3, 6 * SIZE(BO) + movlpd %xmm3, 7 * SIZE(BO) + movlpd %xmm4, 8 * SIZE(BO) + movlpd %xmm4, 9 * SIZE(BO) + movlpd %xmm5, 10 * SIZE(BO) + movlpd %xmm5, 11 * SIZE(BO) + movlpd %xmm6, 12 * SIZE(BO) + movlpd %xmm6, 13 * SIZE(BO) + movlpd %xmm7, 14 * SIZE(BO) + movlpd %xmm7, 15 * SIZE(BO) + + subq $-16 * SIZE, BO + addq $ 8 * SIZE, B + decq %rax + jne .L02 + ALIGN_4 + +.L03: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L05 + ALIGN_4 + +.L04: + movlpd 0 * SIZE(B), %xmm0 + movlpd 1 * SIZE(B), %xmm1 + movlpd 2 * SIZE(B), %xmm2 + movlpd 3 * SIZE(B), %xmm3 + + movlpd %xmm0, 0 * SIZE(BO) + movlpd %xmm0, 1 * SIZE(BO) + movlpd %xmm1, 2 * SIZE(BO) + movlpd %xmm1, 3 * SIZE(BO) + movlpd %xmm2, 4 * SIZE(BO) + movlpd %xmm2, 5 * SIZE(BO) + movlpd %xmm3, 6 * SIZE(BO) + movlpd %xmm3, 7 * SIZE(BO) + + addq $ 4 * SIZE, B + addq $ 8 * SIZE, BO + + decq %rax + jne .L04 + ALIGN_4 + +.L05: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 + +#ifndef RT + leaq (C, LDC, 2), C +#endif + + testq $1, M + jle .L30 + +#ifdef LN + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + addq %rax, AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L42 + +.L41: + movapd 0 * SIZE(AO), %xmm8 + + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + + movapd 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm3 + + movapd 2 * SIZE(AO), %xmm8 + + movapd 8 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + + movapd 10 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + + movapd 12 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + + movapd 14 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm3 + + movapd 4 * SIZE(AO), %xmm8 + + movapd 16 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + + movapd 18 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + + movapd 20 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + + movapd 22 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm3 + + movapd 6 * SIZE(AO), %xmm8 + + movapd 24 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + + movapd 26 * SIZE(BO), %xmm9 + 
mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + + movapd 28 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + + movapd 30 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm3 + + addq $ 8 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L41 + +.L42: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movapd POSINV, %xmm15 + andq $3, %rax # if (k & 1) + BRANCH + jle .L44 + +.L43: + movapd 0 * SIZE(AO), %xmm8 + + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + + movapd 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm3 + + addq $2 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + + decq %rax + jg .L43 + ALIGN_4 + +.L44: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm3, %xmm3 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm15, %xmm1 + xorpd %xmm15, %xmm3 +#else + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm2 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 +#else + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm1 + movapd 2 * SIZE(B), %xmm3 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm3 +#else + movapd 0 * SIZE(AO), %xmm1 + movapd 2 * SIZE(AO), %xmm3 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm3 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm15, %xmm15 +#endif + +#if defined(LN) || defined(LT) + movlpd 0 * SIZE(AO), %xmm8 + movhpd 0 * SIZE(AO), %xmm8 + movlpd 1 * SIZE(AO), %xmm9 + movhpd 1 * SIZE(AO), %xmm9 + + pshufd $0x4e, %xmm1, %xmm0 + pshufd $0x4e, %xmm3, %xmm2 + + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm2 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + mulpd %xmm8, %xmm3 + mulpd %xmm9, %xmm2 + + addpd %xmm0, %xmm1 + addpd %xmm2, %xmm3 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm8 + movhpd 0 * SIZE(B), %xmm8 + movlpd 1 * SIZE(B), %xmm9 + movhpd 1 * SIZE(B), %xmm9 + movlpd 2 * SIZE(B), %xmm10 + movhpd 2 * SIZE(B), %xmm10 + movlpd 3 * SIZE(B), %xmm11 + movhpd 3 * SIZE(B), %xmm11 + movlpd 6 * SIZE(B), %xmm12 + movhpd 6 * SIZE(B), %xmm12 + movlpd 7 * SIZE(B), %xmm13 + movhpd 7 * SIZE(B), %xmm13 + + pshufd $0x4e, %xmm1, %xmm0 + + xorpd %xmm15, %xmm0 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + + addpd %xmm0, %xmm1 + + movapd %xmm1, %xmm0 + pshufd $0x4e, %xmm1, %xmm4 + + xorpd %xmm15, %xmm4 + + mulpd %xmm10, %xmm0 + mulpd %xmm11, %xmm4 + + subpd %xmm0, %xmm3 + subpd %xmm4, %xmm3 + + pshufd $0x4e, %xmm3, %xmm2 + + xorpd %xmm15, %xmm2 + + mulpd %xmm12, %xmm3 + mulpd %xmm13, %xmm2 + + addpd %xmm2, %xmm3 +#endif + +#ifdef RT + movlpd 6 * SIZE(B), %xmm8 + movhpd 6 * SIZE(B), %xmm8 + movlpd 7 * SIZE(B), %xmm9 + movhpd 7 * SIZE(B), %xmm9 + movlpd 4 * SIZE(B), %xmm10 + movhpd 4 * SIZE(B), %xmm10 + movlpd 5 * SIZE(B), %xmm11 + movhpd 5 * SIZE(B), %xmm11 + movlpd 0 * SIZE(B), %xmm12 + movhpd 0 * SIZE(B), %xmm12 + movlpd 1 * SIZE(B), %xmm13 + movhpd 1 * SIZE(B), %xmm13 + + pshufd $0x4e, %xmm3, %xmm2 + + xorpd %xmm15, %xmm2 + + mulpd 
%xmm8, %xmm3 + mulpd %xmm9, %xmm2 + + addpd %xmm2, %xmm3 + + movapd %xmm3, %xmm0 + pshufd $0x4e, %xmm3, %xmm4 + + xorpd %xmm15, %xmm4 + + mulpd %xmm10, %xmm0 + mulpd %xmm11, %xmm4 + + subpd %xmm0, %xmm1 + subpd %xmm4, %xmm1 + + pshufd $0x4e, %xmm1, %xmm0 + + xorpd %xmm15, %xmm0 + + mulpd %xmm12, %xmm1 + mulpd %xmm13, %xmm0 + + addpd %xmm0, %xmm1 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + + movsd %xmm1, 0 * SIZE(CO1) + movhpd %xmm1, 1 * SIZE(CO1) + + movsd %xmm3, 0 * SIZE(CO2) + movhpd %xmm3, 1 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(B) + movapd %xmm3, 2 * SIZE(B) + + movlpd %xmm1, 0 * SIZE(BO) + movlpd %xmm1, 1 * SIZE(BO) + movhpd %xmm1, 2 * SIZE(BO) + movhpd %xmm1, 3 * SIZE(BO) + movlpd %xmm3, 4 * SIZE(BO) + movlpd %xmm3, 5 * SIZE(BO) + movhpd %xmm3, 6 * SIZE(BO) + movhpd %xmm3, 7 * SIZE(BO) +#else + movapd %xmm1, 0 * SIZE(AO) + movapd %xmm3, 2 * SIZE(AO) + +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + movq M, I + sarq $1, I # i = (m >> 2) + jle .L99 + ALIGN_4 + +.L10: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 2 * SIZE(AO), %xmm10 + pxor %xmm1, %xmm1 + movapd 4 * SIZE(AO), %xmm12 + pxor %xmm2, %xmm2 + movapd 6 * SIZE(AO), %xmm14 + pxor %xmm3, %xmm3 + + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm4, %xmm4 + movapd 2 * SIZE(BO), %xmm11 + pxor %xmm5, %xmm5 + movapd 4 * SIZE(BO), %xmm13 + movapd 8 * SIZE(BO), %xmm15 + +#ifdef LN + PREFETCHW -4 * SIZE(CO1) + pxor %xmm6, %xmm6 + PREFETCHW -4 * SIZE(CO2) + pxor %xmm7, %xmm7 +#else + PREFETCHW 4 * SIZE(CO1) + pxor %xmm6, %xmm6 + PREFETCHW 4 * SIZE(CO2) + pxor %xmm7, %xmm7 +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-8, %rax + salq $4, %rax + je .L15 +.L1X: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + cmpq $64 * 2, %rax + jle .L12 + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + KERNEL1(16 * 3) + KERNEL2(16 * 3) + KERNEL3(16 * 3) + KERNEL4(16 * 3) + KERNEL5(16 * 3) + KERNEL6(16 * 3) + KERNEL7(16 * 3) + KERNEL8(16 * 3) + cmpq $64 * 4, %rax + jle .L12 + KERNEL1(16 * 4) + KERNEL2(16 * 4) + KERNEL3(16 * 4) + KERNEL4(16 * 4) + KERNEL5(16 * 4) + KERNEL6(16 * 4) + KERNEL7(16 * 4) + KERNEL8(16 * 4) + KERNEL1(16 * 5) + KERNEL2(16 * 5) + KERNEL3(16 * 5) + KERNEL4(16 * 5) + KERNEL5(16 * 5) + KERNEL6(16 * 5) + KERNEL7(16 * 5) + KERNEL8(16 * 5) + cmpq $64 * 6, %rax + jle .L12 + KERNEL1(16 * 6) + KERNEL2(16 * 6) + KERNEL3(16 * 6) + KERNEL4(16 * 6) + 
KERNEL5(16 * 6) + KERNEL6(16 * 6) + KERNEL7(16 * 6) + KERNEL8(16 * 6) + KERNEL1(16 * 7) + KERNEL2(16 * 7) + KERNEL3(16 * 7) + KERNEL4(16 * 7) + KERNEL5(16 * 7) + KERNEL6(16 * 7) + KERNEL7(16 * 7) + KERNEL8(16 * 7) + + addq $16 * 8 * SIZE, AO + addq $32 * 8 * SIZE, BO + subq $64 * 8, %rax + jg .L1X + +.L12: + leaq (AO, %rax, 2), AO # * 16 + leaq (BO, %rax, 4), BO # * 64 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movapd POSINV, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L19 + ALIGN_4 + +.L16: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm2 + movapd 0 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm3 + movapd 4 * SIZE(AO), %xmm8 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm4 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm5 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + mulpd 6 * SIZE(BO), %xmm10 + addpd %xmm9, %xmm6 + movapd 8 * SIZE(BO), %xmm9 + addpd %xmm10, %xmm7 + movapd 6 * SIZE(AO), %xmm10 + + addq $4 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L16 + ALIGN_4 + +.L19: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm3, %xmm3 + SHUFPD_1 %xmm5, %xmm5 + SHUFPD_1 %xmm7, %xmm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm15, %xmm1 + xorpd %xmm15, %xmm3 + xorpd %xmm15, %xmm5 + xorpd %xmm15, %xmm7 +#else + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm2 + xorpd %xmm15, %xmm4 + xorpd %xmm15, %xmm6 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 +#else + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm1 + movapd 2 * SIZE(B), %xmm3 + movapd 4 * SIZE(B), %xmm5 + movapd 6 * SIZE(B), %xmm7 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm3 + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#else + movapd 0 * SIZE(AO), %xmm1 + movapd 2 * SIZE(AO), %xmm5 + movapd 4 * SIZE(AO), %xmm3 + movapd 6 * SIZE(AO), %xmm7 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm3 + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm15, %xmm15 +#endif + +#ifdef LN + movlpd 6 * SIZE(AO), %xmm8 + movhpd 6 * SIZE(AO), %xmm8 + movlpd 7 * SIZE(AO), %xmm9 + movhpd 7 * SIZE(AO), %xmm9 + movlpd 4 * SIZE(AO), %xmm10 + movhpd 4 * SIZE(AO), %xmm10 + movlpd 5 * SIZE(AO), %xmm11 + movhpd 5 * SIZE(AO), %xmm11 + movlpd 0 * SIZE(AO), %xmm12 + movhpd 0 * SIZE(AO), %xmm12 + movlpd 1 * SIZE(AO), %xmm13 + movhpd 1 * SIZE(AO), %xmm13 + + pshufd $0x4e, %xmm5, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm15, %xmm4 + xorpd %xmm15, %xmm6 + + mulpd %xmm8, %xmm5 + mulpd %xmm9, %xmm4 + mulpd %xmm8, %xmm7 + mulpd %xmm9, %xmm6 + + addpd %xmm4, %xmm5 + addpd %xmm6, %xmm7 + + movapd %xmm5, %xmm0 + movapd %xmm7, %xmm2 + pshufd $0x4e, %xmm5, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm15, %xmm4 + xorpd %xmm15, %xmm6 + + mulpd %xmm10, %xmm0 
+ mulpd %xmm10, %xmm2 + mulpd %xmm11, %xmm4 + mulpd %xmm11, %xmm6 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm3 + subpd %xmm4, %xmm1 + subpd %xmm6, %xmm3 + + pshufd $0x4e, %xmm1, %xmm0 + pshufd $0x4e, %xmm3, %xmm2 + + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm2 + + mulpd %xmm12, %xmm1 + mulpd %xmm13, %xmm0 + mulpd %xmm12, %xmm3 + mulpd %xmm13, %xmm2 + + addpd %xmm0, %xmm1 + addpd %xmm2, %xmm3 +#endif + +#ifdef LT + movlpd 0 * SIZE(AO), %xmm8 + movhpd 0 * SIZE(AO), %xmm8 + movlpd 1 * SIZE(AO), %xmm9 + movhpd 1 * SIZE(AO), %xmm9 + movlpd 2 * SIZE(AO), %xmm10 + movhpd 2 * SIZE(AO), %xmm10 + movlpd 3 * SIZE(AO), %xmm11 + movhpd 3 * SIZE(AO), %xmm11 + movlpd 6 * SIZE(AO), %xmm12 + movhpd 6 * SIZE(AO), %xmm12 + movlpd 7 * SIZE(AO), %xmm13 + movhpd 7 * SIZE(AO), %xmm13 + + pshufd $0x4e, %xmm1, %xmm0 + pshufd $0x4e, %xmm3, %xmm2 + + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm2 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + mulpd %xmm8, %xmm3 + mulpd %xmm9, %xmm2 + + addpd %xmm0, %xmm1 + addpd %xmm2, %xmm3 + + movapd %xmm1, %xmm0 + movapd %xmm3, %xmm2 + pshufd $0x4e, %xmm1, %xmm4 + pshufd $0x4e, %xmm3, %xmm6 + + xorpd %xmm15, %xmm4 + xorpd %xmm15, %xmm6 + + mulpd %xmm10, %xmm0 + mulpd %xmm10, %xmm2 + mulpd %xmm11, %xmm4 + mulpd %xmm11, %xmm6 + + subpd %xmm0, %xmm5 + subpd %xmm2, %xmm7 + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 + + pshufd $0x4e, %xmm5, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm15, %xmm4 + xorpd %xmm15, %xmm6 + + mulpd %xmm12, %xmm5 + mulpd %xmm13, %xmm4 + mulpd %xmm12, %xmm7 + mulpd %xmm13, %xmm6 + + addpd %xmm4, %xmm5 + addpd %xmm6, %xmm7 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm8 + movhpd 0 * SIZE(B), %xmm8 + movlpd 1 * SIZE(B), %xmm9 + movhpd 1 * SIZE(B), %xmm9 + movlpd 2 * SIZE(B), %xmm10 + movhpd 2 * SIZE(B), %xmm10 + movlpd 3 * SIZE(B), %xmm11 + movhpd 3 * SIZE(B), %xmm11 + movlpd 6 * SIZE(B), %xmm12 + movhpd 6 * SIZE(B), %xmm12 + movlpd 7 * SIZE(B), %xmm13 + movhpd 7 * SIZE(B), %xmm13 + + pshufd $0x4e, %xmm1, %xmm0 + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm4 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + mulpd %xmm8, %xmm5 + mulpd %xmm9, %xmm4 + + addpd %xmm0, %xmm1 + addpd %xmm4, %xmm5 + + movapd %xmm1, %xmm0 + movapd %xmm5, %xmm2 + pshufd $0x4e, %xmm1, %xmm4 + pshufd $0x4e, %xmm5, %xmm6 + + xorpd %xmm15, %xmm4 + xorpd %xmm15, %xmm6 + + mulpd %xmm10, %xmm0 + mulpd %xmm10, %xmm2 + mulpd %xmm11, %xmm4 + mulpd %xmm11, %xmm6 + + subpd %xmm0, %xmm3 + subpd %xmm2, %xmm7 + subpd %xmm4, %xmm3 + subpd %xmm6, %xmm7 + + pshufd $0x4e, %xmm3, %xmm2 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm15, %xmm2 + xorpd %xmm15, %xmm6 + + mulpd %xmm12, %xmm3 + mulpd %xmm13, %xmm2 + mulpd %xmm12, %xmm7 + mulpd %xmm13, %xmm6 + + addpd %xmm2, %xmm3 + addpd %xmm6, %xmm7 +#endif + +#ifdef RT + movlpd 6 * SIZE(B), %xmm8 + movhpd 6 * SIZE(B), %xmm8 + movlpd 7 * SIZE(B), %xmm9 + movhpd 7 * SIZE(B), %xmm9 + movlpd 4 * SIZE(B), %xmm10 + movhpd 4 * SIZE(B), %xmm10 + movlpd 5 * SIZE(B), %xmm11 + movhpd 5 * SIZE(B), %xmm11 + movlpd 0 * SIZE(B), %xmm12 + movhpd 0 * SIZE(B), %xmm12 + movlpd 1 * SIZE(B), %xmm13 + movhpd 1 * SIZE(B), %xmm13 + + pshufd $0x4e, %xmm3, %xmm2 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm15, %xmm2 + xorpd %xmm15, %xmm6 + + mulpd %xmm8, %xmm3 + mulpd %xmm9, %xmm2 + mulpd %xmm8, %xmm7 + mulpd %xmm9, %xmm6 + + addpd %xmm2, %xmm3 + addpd %xmm6, %xmm7 + + movapd %xmm3, %xmm0 + movapd %xmm7, %xmm2 + pshufd $0x4e, %xmm3, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm15, %xmm4 + xorpd %xmm15, %xmm6 + + mulpd %xmm10, %xmm0 + mulpd %xmm10, %xmm2 + mulpd %xmm11, 
%xmm4 + mulpd %xmm11, %xmm6 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm5 + subpd %xmm4, %xmm1 + subpd %xmm6, %xmm5 + + pshufd $0x4e, %xmm1, %xmm0 + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm4 + + mulpd %xmm12, %xmm1 + mulpd %xmm13, %xmm0 + mulpd %xmm12, %xmm5 + mulpd %xmm13, %xmm4 + + addpd %xmm0, %xmm1 + addpd %xmm4, %xmm5 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + + movsd %xmm1, 0 * SIZE(CO1) + movhpd %xmm1, 1 * SIZE(CO1) + movsd %xmm5, 2 * SIZE(CO1) + movhpd %xmm5, 3 * SIZE(CO1) + + movsd %xmm3, 0 * SIZE(CO2) + movhpd %xmm3, 1 * SIZE(CO2) + movsd %xmm7, 2 * SIZE(CO2) + movhpd %xmm7, 3 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(B) + movapd %xmm3, 2 * SIZE(B) + movapd %xmm5, 4 * SIZE(B) + movapd %xmm7, 6 * SIZE(B) + + movlpd %xmm1, 0 * SIZE(BO) + movlpd %xmm1, 1 * SIZE(BO) + movhpd %xmm1, 2 * SIZE(BO) + movhpd %xmm1, 3 * SIZE(BO) + movlpd %xmm3, 4 * SIZE(BO) + movlpd %xmm3, 5 * SIZE(BO) + movhpd %xmm3, 6 * SIZE(BO) + movhpd %xmm3, 7 * SIZE(BO) + movlpd %xmm5, 8 * SIZE(BO) + movlpd %xmm5, 9 * SIZE(BO) + movhpd %xmm5, 10 * SIZE(BO) + movhpd %xmm5, 11 * SIZE(BO) + movlpd %xmm7, 12 * SIZE(BO) + movlpd %xmm7, 13 * SIZE(BO) + movhpd %xmm7, 14 * SIZE(BO) + movhpd %xmm7, 15 * SIZE(BO) +#else + movapd %xmm1, 0 * SIZE(AO) + movapd %xmm5, 2 * SIZE(AO) + movapd %xmm3, 4 * SIZE(AO) + movapd %xmm7, 6 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L10 + ALIGN_4 + + +.L99: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 2 * COMPSIZE), B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + + decq J # j -- + jg .L01 + +.L100: + testq $1, N + jle .L999 + +.L101: +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L103 + ALIGN_4 + +.L102: + movlpd 0 * SIZE(B), %xmm0 + movlpd 1 * SIZE(B), %xmm1 + movlpd 2 * SIZE(B), %xmm2 + movlpd 3 * SIZE(B), %xmm3 + movlpd 4 * SIZE(B), %xmm4 + movlpd 5 * SIZE(B), %xmm5 + movlpd 6 * SIZE(B), %xmm6 + movlpd 7 * SIZE(B), %xmm7 + + movlpd %xmm0, 0 * SIZE(BO) + movlpd %xmm0, 1 * SIZE(BO) + movlpd %xmm1, 2 * SIZE(BO) + movlpd %xmm1, 3 * SIZE(BO) + movlpd %xmm2, 4 * SIZE(BO) + movlpd %xmm2, 5 * SIZE(BO) + movlpd %xmm3, 6 * SIZE(BO) + movlpd %xmm3, 7 * SIZE(BO) + movlpd %xmm4, 8 * SIZE(BO) + movlpd %xmm4, 9 * SIZE(BO) + movlpd %xmm5, 10 * SIZE(BO) + movlpd %xmm5, 11 * SIZE(BO) + movlpd %xmm6, 12 * SIZE(BO) + movlpd %xmm6, 13 * SIZE(BO) + movlpd %xmm7, 14 * SIZE(BO) + movlpd %xmm7, 15 * SIZE(BO) + + subq $-16 * SIZE, BO + addq $ 8 * SIZE, B + decq %rax + jne .L102 + 
ALIGN_4 + +.L103: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L105 + ALIGN_4 + +.L104: + movlpd 0 * SIZE(B), %xmm0 + movlpd 1 * SIZE(B), %xmm1 + + movlpd %xmm0, 0 * SIZE(BO) + movlpd %xmm0, 1 * SIZE(BO) + movlpd %xmm1, 2 * SIZE(BO) + movlpd %xmm1, 3 * SIZE(BO) + + addq $4 * SIZE, BO + addq $2 * SIZE, B + decq %rax + jne .L104 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + subq LDC, C +#endif + + movq C, CO1 +#ifndef RT + addq LDC, C +#endif + + testq $1, M + jle .L130 + ALIGN_4 + +.L140: +#ifdef LN + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L142 + +.L141: + movapd 0 * SIZE(AO), %xmm8 + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm1 + + movapd 2 * SIZE(AO), %xmm8 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm3 + + movapd 4 * SIZE(AO), %xmm8 + movapd 8 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + mulpd 10 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm1 + + movapd 6 * SIZE(AO), %xmm8 + movapd 12 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + mulpd 14 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm3 + + addq $8 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L141 + +.L142: + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + + movapd POSINV, %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + jle .L144 + +.L143: + movapd 0 * SIZE(AO), %xmm8 + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm1 + + addq $2 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L143 + ALIGN_4 + +.L144: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + + SHUFPD_1 %xmm1, %xmm1 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm15, %xmm1 +#else + xorpd %xmm15, %xmm0 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm1, %xmm0 +#else + addpd %xmm1, %xmm0 +#endif + + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm1 + + subpd %xmm0, %xmm1 +#else + movapd 0 * SIZE(AO), %xmm1 + + subpd %xmm0, %xmm1 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm15, %xmm15 +#endif + +#ifdef LN + movlpd 0 * SIZE(AO), %xmm8 + movhpd 0 * SIZE(AO), %xmm8 + movlpd 1 * SIZE(AO), %xmm9 + movhpd 1 * SIZE(AO), %xmm9 + + pshufd $0x4e, %xmm1, %xmm0 + xorpd %xmm15, %xmm0 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + + addpd %xmm0, %xmm1 +#endif + +#ifdef LT + movlpd 0 * SIZE(AO), %xmm8 + movhpd 0 * 
SIZE(AO), %xmm8 + movlpd 1 * SIZE(AO), %xmm9 + movhpd 1 * SIZE(AO), %xmm9 + + pshufd $0x4e, %xmm1, %xmm0 + + xorpd %xmm15, %xmm0 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + + addpd %xmm0, %xmm1 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm8 + movhpd 0 * SIZE(B), %xmm8 + movlpd 1 * SIZE(B), %xmm9 + movhpd 1 * SIZE(B), %xmm9 + + pshufd $0x4e, %xmm1, %xmm0 + + xorpd %xmm15, %xmm0 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + + addpd %xmm0, %xmm1 +#endif + +#ifdef RT + movlpd 0 * SIZE(B), %xmm8 + movhpd 0 * SIZE(B), %xmm8 + movlpd 1 * SIZE(B), %xmm9 + movhpd 1 * SIZE(B), %xmm9 + + pshufd $0x4e, %xmm1, %xmm0 + + xorpd %xmm15, %xmm0 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + + addpd %xmm0, %xmm1 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + + movsd %xmm1, 0 * SIZE(CO1) + movhpd %xmm1, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(B) + + movlpd %xmm1, 0 * SIZE(BO) + movlpd %xmm1, 1 * SIZE(BO) + movhpd %xmm1, 2 * SIZE(BO) + movhpd %xmm1, 3 * SIZE(BO) +#else + movapd %xmm1, 0 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $2 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L130: + movq M, I + sarq $1, I # i = (m >> 2) + jle .L199 + ALIGN_4 + +.L110: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 +#ifdef LN + PREFETCHW -4 * SIZE(CO1) +#else + PREFETCHW 4 * SIZE(CO1) +#endif + + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L112 + +.L111: + movapd 0 * SIZE(AO), %xmm8 + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm1 + + movapd 2 * SIZE(AO), %xmm8 + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm5 + + movapd 4 * SIZE(AO), %xmm8 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm1 + + movapd 6 * SIZE(AO), %xmm8 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm5 + + movapd 8 * SIZE(AO), %xmm8 + movapd 8 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + mulpd 10 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm1 + + movapd 10 * SIZE(AO), %xmm8 + movapd 8 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + mulpd 10 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm5 + + movapd 12 * SIZE(AO), %xmm8 + movapd 12 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + mulpd 14 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm1 + + movapd 14 * SIZE(AO), %xmm8 + movapd 12 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + mulpd 14 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm5 + + addq $16 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L111 + ALIGN_4 + +.L112: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, 
%rax +#endif + movapd POSINV, %xmm15 + andq $3, %rax # if (k & 1) + BRANCH + jle .L114 + +.L113: + movapd 0 * SIZE(AO), %xmm8 + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm1 + + movapd 2 * SIZE(AO), %xmm8 + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm5 + + addq $4 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L113 + ALIGN_4 + +.L114: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm5, %xmm5 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm15, %xmm1 + xorpd %xmm15, %xmm5 +#else + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm4 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm1, %xmm0 + subpd %xmm5, %xmm4 +#else + addpd %xmm1, %xmm0 + addpd %xmm5, %xmm4 +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm1 + movapd 2 * SIZE(B), %xmm5 + + subpd %xmm0, %xmm1 + subpd %xmm4, %xmm5 +#else + movapd 0 * SIZE(AO), %xmm1 + movapd 2 * SIZE(AO), %xmm5 + + subpd %xmm0, %xmm1 + subpd %xmm4, %xmm5 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm15, %xmm15 +#endif + +#ifdef LN + movlpd 6 * SIZE(AO), %xmm8 + movhpd 6 * SIZE(AO), %xmm8 + movlpd 7 * SIZE(AO), %xmm9 + movhpd 7 * SIZE(AO), %xmm9 + movlpd 4 * SIZE(AO), %xmm10 + movhpd 4 * SIZE(AO), %xmm10 + movlpd 5 * SIZE(AO), %xmm11 + movhpd 5 * SIZE(AO), %xmm11 + movlpd 0 * SIZE(AO), %xmm12 + movhpd 0 * SIZE(AO), %xmm12 + movlpd 1 * SIZE(AO), %xmm13 + movhpd 1 * SIZE(AO), %xmm13 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm15, %xmm4 + + mulpd %xmm8, %xmm5 + mulpd %xmm9, %xmm4 + + addpd %xmm4, %xmm5 + + movapd %xmm5, %xmm0 + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm15, %xmm4 + + mulpd %xmm10, %xmm0 + mulpd %xmm11, %xmm4 + + subpd %xmm0, %xmm1 + subpd %xmm4, %xmm1 + + pshufd $0x4e, %xmm1, %xmm0 + + xorpd %xmm15, %xmm0 + + mulpd %xmm12, %xmm1 + mulpd %xmm13, %xmm0 + + addpd %xmm0, %xmm1 +#endif + +#ifdef LT + movlpd 0 * SIZE(AO), %xmm8 + movhpd 0 * SIZE(AO), %xmm8 + movlpd 1 * SIZE(AO), %xmm9 + movhpd 1 * SIZE(AO), %xmm9 + movlpd 2 * SIZE(AO), %xmm10 + movhpd 2 * SIZE(AO), %xmm10 + movlpd 3 * SIZE(AO), %xmm11 + movhpd 3 * SIZE(AO), %xmm11 + movlpd 6 * SIZE(AO), %xmm12 + movhpd 6 * SIZE(AO), %xmm12 + movlpd 7 * SIZE(AO), %xmm13 + movhpd 7 * SIZE(AO), %xmm13 + + pshufd $0x4e, %xmm1, %xmm0 + + xorpd %xmm15, %xmm0 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + + addpd %xmm0, %xmm1 + + movapd %xmm1, %xmm0 + pshufd $0x4e, %xmm1, %xmm4 + + xorpd %xmm15, %xmm4 + + mulpd %xmm10, %xmm0 + mulpd %xmm11, %xmm4 + + subpd %xmm0, %xmm5 + subpd %xmm4, %xmm5 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm15, %xmm4 + + mulpd %xmm12, %xmm5 + mulpd %xmm13, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm8 + movhpd 0 * SIZE(B), %xmm8 + movlpd 1 * SIZE(B), %xmm9 + movhpd 1 * SIZE(B), %xmm9 + + pshufd $0x4e, %xmm1, %xmm0 + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm4 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + mulpd %xmm8, %xmm5 + mulpd %xmm9, %xmm4 + + addpd %xmm0, %xmm1 + addpd %xmm4, %xmm5 +#endif + +#ifdef RT 
+ movlpd 0 * SIZE(B), %xmm8 + movhpd 0 * SIZE(B), %xmm8 + movlpd 1 * SIZE(B), %xmm9 + movhpd 1 * SIZE(B), %xmm9 + + pshufd $0x4e, %xmm1, %xmm0 + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm4 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + mulpd %xmm8, %xmm5 + mulpd %xmm9, %xmm4 + + addpd %xmm0, %xmm1 + addpd %xmm4, %xmm5 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + + movsd %xmm1, 0 * SIZE(CO1) + movhpd %xmm1, 1 * SIZE(CO1) + movsd %xmm5, 2 * SIZE(CO1) + movhpd %xmm5, 3 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(B) + movapd %xmm5, 2 * SIZE(B) + + movlpd %xmm1, 0 * SIZE(BO) + movlpd %xmm1, 1 * SIZE(BO) + movhpd %xmm1, 2 * SIZE(BO) + movhpd %xmm1, 3 * SIZE(BO) + movlpd %xmm5, 4 * SIZE(BO) + movlpd %xmm5, 5 * SIZE(BO) + movhpd %xmm5, 6 * SIZE(BO) + movhpd %xmm5, 7 * SIZE(BO) +#else + movapd %xmm1, 0 * SIZE(AO) + movapd %xmm5, 2 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L110 + ALIGN_4 + +.L199: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 1 * COMPSIZE), B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L999: + movq %rbx, %rsp + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/ztrsm_kernel_LN_2x2_sse3.S b/kernel/x86_64/ztrsm_kernel_LN_2x2_sse3.S new file mode 100644 index 0000000000..74a799af4e --- /dev/null +++ b/kernel/x86_64/ztrsm_kernel_LN_2x2_sse3.S @@ -0,0 +1,2203 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %rdi +#define N %rsi +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define J %r12 +#define AO %r13 +#define BO %r14 +#define CO1 %r15 +#define CO2 %rbx +#define KK %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define KKK 56(%rsp) +#define AORIG 64(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define KKK 232(%rsp) +#define AORIG 240(%rsp) +#endif + +#define PREFETCH prefetcht1 +#define PREFETCHSIZE (16 * 12 + 3) +#define PREFETCH_R (4 * 4 + 0) + +#ifndef CONJ +#define ADD1 addpd +#define ADD2 addpd +#else +#define ADD1 subpd +#define ADD2 addpd +#endif + +#define KERNEL1(address) \ + mulpd %xmm8, %xmm9;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 2 * SIZE(AO);\ + ADD1 %xmm9, %xmm0;\ + movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD2 %xmm9, %xmm1;\ + movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm2;\ + movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 2 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + ADD2 %xmm9, %xmm3;\ + movddup 0 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL2(address) \ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm4;\ + movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD2 %xmm9, %xmm5;\ + movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm6;\ + movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 4 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + ADD2 %xmm9, %xmm7;\ + movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL3(address) \ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm0;\ + movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD2 %xmm9, %xmm1;\ + movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm2;\ + movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 6 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + ADD2 %xmm9, %xmm3;\ + movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL4(address) \ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm4;\ + movddup 5 * SIZE + (address) * 2 * 
SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD2 %xmm9, %xmm5;\ + movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm6;\ + movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 32 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + ADD2 %xmm9, %xmm7;\ + movddup 32 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL5(address) \ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm0;\ + movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD2 %xmm11, %xmm1;\ + movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm2;\ + movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 10 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + ADD2 %xmm11, %xmm3;\ + movddup 8 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL6(address) \ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm4;\ + movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD2 %xmm11, %xmm5;\ + movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm6;\ + movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 12 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + ADD2 %xmm11, %xmm7;\ + movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL7(address) \ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm0;\ + movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD2 %xmm11, %xmm1;\ + movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm2;\ + movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 14 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + ADD2 %xmm11, %xmm3;\ + movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL8(address) \ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm4;\ + movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD2 %xmm11, %xmm5;\ + movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm6;\ + movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 40 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + ADD2 %xmm11, %xmm7;\ + movddup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL9(address) \ + mulpd %xmm12, %xmm13;\ + PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 2 * SIZE(AO);\ + ADD1 %xmm13, %xmm0;\ + movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD2 %xmm13, %xmm1;\ + movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm2;\ + movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 18 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + ADD2 %xmm13, %xmm3;\ + movddup 16 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL10(address) \ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm4;\ + movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD2 %xmm13, %xmm5;\ + movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm6;\ + movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 20 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + ADD2 %xmm13, %xmm7;\ + movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL11(address) \ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm0;\ + movddup 21 * 
SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD2 %xmm13, %xmm1;\ + movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm2;\ + movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 22 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + ADD2 %xmm13, %xmm3;\ + movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL12(address) \ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm4;\ + movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD2 %xmm13, %xmm5;\ + movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm6;\ + movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 48 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + ADD2 %xmm13, %xmm7;\ + movddup 48 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL13(address) \ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm0;\ + movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD2 %xmm15, %xmm1;\ + movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm2;\ + movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 26 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + ADD2 %xmm15, %xmm3;\ + movddup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL14(address) \ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm4;\ + movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD2 %xmm15, %xmm5;\ + movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm6;\ + movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 28 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + ADD2 %xmm15, %xmm7;\ + movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL15(address) \ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm0;\ + movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD2 %xmm15, %xmm1;\ + movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm2;\ + movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 30 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + ADD2 %xmm15, %xmm3;\ + movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL16(address) \ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm4;\ + movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD2 %xmm15, %xmm5;\ + movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm6;\ + movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 56 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + ADD2 %xmm15, %xmm7;\ + movddup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, M + movq ARG2, N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq 
OLD_LDC, LDC +#endif + + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + movq KK, OFFSET + + salq $ZBASE_SHIFT, LDC + +#ifdef LN + movq M, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + movq N, %rax + salq $ZBASE_SHIFT, %rax + imulq K, %rax + addq %rax, B + + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, KK + subq OFFSET, KK +#endif + + movq N, J + sarq $1, J # j = (n >> 2) + jle .L100 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif +#ifdef LT + movq OFFSET, KK +#endif + + testq $1, M + jle .L30 + +#ifdef LN + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L42 + +.L41: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD2 %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm3 + movddup 16 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD2 %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm2 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 6 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD2 %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm2 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 16 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm3 + movddup 24 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 17 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD2 %xmm9, %xmm1 + movddup 18 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 19 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 10 * SIZE(AO), %xmm10 + ADD2 %xmm9, %xmm3 + movddup 20 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 21 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD2 %xmm9, %xmm1 + movddup 22 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 23 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 12 * SIZE(AO), %xmm10 + ADD2 %xmm9, %xmm3 + movddup 32 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, 
%xmm0 + movddup 25 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD2 %xmm11, %xmm1 + movddup 26 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm2 + movddup 27 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 14 * SIZE(AO), %xmm10 + ADD2 %xmm11, %xmm3 + movddup 28 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 29 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD2 %xmm11, %xmm1 + movddup 30 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm2 + movddup 31 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 24 * SIZE(AO), %xmm10 + ADD2 %xmm11, %xmm3 + movddup 40 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L41 + +.L42: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + jle .L44 + +.L43: + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L43 + ALIGN_4 + +.L44: + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm3, %xmm3 + +#ifndef CONJ + addsubpd %xmm1, %xmm0 + addsubpd %xmm3, %xmm2 +#else + addsubpd %xmm0, %xmm1 + addsubpd %xmm2, %xmm3 +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BO), %xmm8 + movapd 2 * SIZE(BO), %xmm9 +#else + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm9 +#endif + +#if (defined(LN) || defined(LT)) && !defined(CONJ) + subpd %xmm0, %xmm8 + subpd %xmm2, %xmm9 +#elif (defined(LN) || defined(LT)) && defined(CONJ) + subpd %xmm1, %xmm8 + subpd %xmm3, %xmm9 +#elif (defined(RN) || defined(RT)) && !defined(CONJ) + subpd %xmm0, %xmm8 + subpd %xmm2, %xmm9 +#else + addsubpd %xmm1, %xmm8 + addsubpd %xmm3, %xmm9 +#endif + +#ifdef CONJ + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 +#endif + +#ifdef LN + movddup 0 * SIZE(AO), %xmm4 + movddup 1 * SIZE(AO), %xmm5 + +#ifdef CONJ + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm8, %xmm12 + movapd %xmm9, %xmm13 + SHUFPD_1 %xmm12, %xmm12 + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm4, %xmm8 + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm12 + mulpd %xmm5, %xmm13 + + addsubpd %xmm12, %xmm8 + addsubpd %xmm13, %xmm9 +#endif + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + movddup 1 * SIZE(AO), %xmm1 + +#ifdef CONJ + xorpd %xmm7, %xmm1 +#endif + + movapd %xmm8, %xmm12 + movapd %xmm9, %xmm13 + SHUFPD_1 %xmm12, %xmm12 + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm12 + mulpd %xmm1, %xmm13 + + addsubpd %xmm12, %xmm8 + addsubpd %xmm13, %xmm9 +#endif + +#ifdef RN + movddup 0 * SIZE(BO), %xmm0 + movddup 1 * SIZE(BO), %xmm1 + movddup 2 * SIZE(BO), %xmm2 + movddup 3 * SIZE(BO), %xmm3 + movddup 6 * SIZE(BO), %xmm4 + movddup 7 * SIZE(BO), %xmm5 + +#ifdef CONJ + xorpd %xmm7, %xmm1 + xorpd %xmm7, %xmm3 + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm8, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + + mulpd %xmm0, %xmm8 + mulpd %xmm1, %xmm12 + + addsubpd %xmm12, %xmm8 + + movapd %xmm8, %xmm12 + movapd %xmm8, %xmm13 + + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm2, %xmm12 + mulpd %xmm3, %xmm13 + + addsubpd %xmm13, %xmm12 + + subpd 
%xmm12, %xmm9 + + movapd %xmm9, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm12 + + addsubpd %xmm12, %xmm9 +#endif + + +#ifdef RT + movddup 6 * SIZE(BO), %xmm0 + movddup 7 * SIZE(BO), %xmm1 + movddup 4 * SIZE(BO), %xmm2 + movddup 5 * SIZE(BO), %xmm3 + movddup 0 * SIZE(BO), %xmm4 + movddup 1 * SIZE(BO), %xmm5 + +#ifdef CONJ + xorpd %xmm7, %xmm1 + xorpd %xmm7, %xmm3 + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm9, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm12 + + addsubpd %xmm12, %xmm9 + + movapd %xmm9, %xmm12 + movapd %xmm9, %xmm13 + + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm2, %xmm12 + mulpd %xmm3, %xmm13 + + addsubpd %xmm13, %xmm12 + + subpd %xmm12, %xmm8 + + movapd %xmm8, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + + mulpd %xmm4, %xmm8 + mulpd %xmm5, %xmm12 + + addsubpd %xmm12, %xmm8 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + + movsd %xmm9, 0 * SIZE(CO2) + movhpd %xmm9, 1 * SIZE(CO2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + + movsd %xmm9, 0 * SIZE(CO2) + movhpd %xmm9, 1 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, 0 * SIZE(BO) + movapd %xmm9, 2 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm9, 2 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + movq M, I + sarq $1, I # i = (m >> 2) + jle .L99 + ALIGN_4 + +.L10: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + + movapd 16 * SIZE(AO), %xmm12 + movddup 16 * SIZE(BO), %xmm13 + movapd 24 * SIZE(AO), %xmm14 + movddup 24 * SIZE(BO), %xmm15 + +#ifdef LN + prefetchnta -4 * SIZE(CO1) + prefetchnta -4 * SIZE(CO2) +#else + prefetchnta 4 * SIZE(CO1) + prefetchnta 4 * SIZE(CO2) +#endif + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-8, %rax + salq $4, %rax + je .L12 + +.L1X: + KERNEL1 (16 * 0) + KERNEL2 (16 * 0) + KERNEL3 (16 * 0) + KERNEL4 (16 * 0) + KERNEL5 (16 * 0) + KERNEL6 (16 * 0) + KERNEL7 (16 * 0) + KERNEL8 (16 * 0) + KERNEL9 (16 * 0) + KERNEL10(16 * 0) + KERNEL11(16 * 0) + KERNEL12(16 * 0) + KERNEL13(16 * 0) + KERNEL14(16 * 0) + KERNEL15(16 * 0) + KERNEL16(16 * 0) + cmpq $128 * 1, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 1) + KERNEL2 (16 * 1) + KERNEL3 (16 * 1) + KERNEL4 (16 * 1) + KERNEL5 (16 * 1) + KERNEL6 (16 * 1) + KERNEL7 (16 * 1) + KERNEL8 (16 * 1) + KERNEL9 (16 * 1) + KERNEL10(16 * 1) + KERNEL11(16 * 1) + KERNEL12(16 * 1) + KERNEL13(16 * 1) + KERNEL14(16 * 1) + KERNEL15(16 * 1) + KERNEL16(16 * 1) + cmpq $128 * 2, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 2) + KERNEL2 (16 * 2) + KERNEL3 (16 * 2) + KERNEL4 (16 
* 2) + KERNEL5 (16 * 2) + KERNEL6 (16 * 2) + KERNEL7 (16 * 2) + KERNEL8 (16 * 2) + KERNEL9 (16 * 2) + KERNEL10(16 * 2) + KERNEL11(16 * 2) + KERNEL12(16 * 2) + KERNEL13(16 * 2) + KERNEL14(16 * 2) + KERNEL15(16 * 2) + KERNEL16(16 * 2) + cmpq $128 * 3, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 3) + KERNEL2 (16 * 3) + KERNEL3 (16 * 3) + KERNEL4 (16 * 3) + KERNEL5 (16 * 3) + KERNEL6 (16 * 3) + KERNEL7 (16 * 3) + KERNEL8 (16 * 3) + KERNEL9 (16 * 3) + KERNEL10(16 * 3) + KERNEL11(16 * 3) + KERNEL12(16 * 3) + KERNEL13(16 * 3) + KERNEL14(16 * 3) + KERNEL15(16 * 3) + KERNEL16(16 * 3) + cmpq $128 * 4, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 4) + KERNEL2 (16 * 4) + KERNEL3 (16 * 4) + KERNEL4 (16 * 4) + KERNEL5 (16 * 4) + KERNEL6 (16 * 4) + KERNEL7 (16 * 4) + KERNEL8 (16 * 4) + KERNEL9 (16 * 4) + KERNEL10(16 * 4) + KERNEL11(16 * 4) + KERNEL12(16 * 4) + KERNEL13(16 * 4) + KERNEL14(16 * 4) + KERNEL15(16 * 4) + KERNEL16(16 * 4) + cmpq $128 * 5, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 5) + KERNEL2 (16 * 5) + KERNEL3 (16 * 5) + KERNEL4 (16 * 5) + KERNEL5 (16 * 5) + KERNEL6 (16 * 5) + KERNEL7 (16 * 5) + KERNEL8 (16 * 5) + KERNEL9 (16 * 5) + KERNEL10(16 * 5) + KERNEL11(16 * 5) + KERNEL12(16 * 5) + KERNEL13(16 * 5) + KERNEL14(16 * 5) + KERNEL15(16 * 5) + KERNEL16(16 * 5) + cmpq $128 * 6, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 6) + KERNEL2 (16 * 6) + KERNEL3 (16 * 6) + KERNEL4 (16 * 6) + KERNEL5 (16 * 6) + KERNEL6 (16 * 6) + KERNEL7 (16 * 6) + KERNEL8 (16 * 6) + KERNEL9 (16 * 6) + KERNEL10(16 * 6) + KERNEL11(16 * 6) + KERNEL12(16 * 6) + KERNEL13(16 * 6) + KERNEL14(16 * 6) + KERNEL15(16 * 6) + KERNEL16(16 * 6) + cmpq $128 * 7, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 7) + KERNEL2 (16 * 7) + KERNEL3 (16 * 7) + KERNEL4 (16 * 7) + KERNEL5 (16 * 7) + KERNEL6 (16 * 7) + KERNEL7 (16 * 7) + KERNEL8 (16 * 7) + KERNEL9 (16 * 7) + KERNEL10(16 * 7) + KERNEL11(16 * 7) + KERNEL12(16 * 7) + KERNEL13(16 * 7) + KERNEL14(16 * 7) + KERNEL15(16 * 7) + KERNEL16(16 * 7) + + addq $32 * 8 * SIZE, AO + addq $32 * 8 * SIZE, BO + subq $128 * 8, %rax + jg .L1X + +.L11: + leaq (AO, %rax, 2), AO # * 16 + leaq (BO, %rax, 2), BO # * 64 + ALIGN_4 + +.L12: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L14 + ALIGN_4 + +.L13: + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm10 + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 0 * SIZE(BO), %xmm11 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm4 + movddup 1 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD2 %xmm11, %xmm5 + movddup 2 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm6 + movddup 3 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD2 %xmm11, %xmm7 + + addq $4 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L13 + ALIGN_4 + +.L14: + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm3, %xmm3 + SHUFPD_1 %xmm5, %xmm5 + SHUFPD_1 %xmm7, %xmm7 + +#ifndef CONJ + addsubpd %xmm1, %xmm0 + addsubpd %xmm3, %xmm2 + addsubpd %xmm5, %xmm4 + addsubpd %xmm7, %xmm6 +#else + addsubpd %xmm0, %xmm1 + addsubpd %xmm2, %xmm3 + addsubpd %xmm4, %xmm5 + addsubpd %xmm6, %xmm7 +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + subq $2, %rax + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO 
+#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BO), %xmm8 + movapd 2 * SIZE(BO), %xmm9 + movapd 4 * SIZE(BO), %xmm10 + movapd 6 * SIZE(BO), %xmm11 +#else + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm9 + movapd 4 * SIZE(AO), %xmm10 + movapd 6 * SIZE(AO), %xmm11 +#endif + +#if (defined(LN) || defined(LT)) && !defined(CONJ) + subpd %xmm0, %xmm8 + subpd %xmm2, %xmm9 + subpd %xmm4, %xmm10 + subpd %xmm6, %xmm11 +#elif (defined(LN) || defined(LT)) && defined(CONJ) + subpd %xmm1, %xmm8 + subpd %xmm3, %xmm9 + subpd %xmm5, %xmm10 + subpd %xmm7, %xmm11 +#elif (defined(RN) || defined(RT)) && !defined(CONJ) + subpd %xmm0, %xmm8 + subpd %xmm4, %xmm9 + subpd %xmm2, %xmm10 + subpd %xmm6, %xmm11 +#else + addsubpd %xmm1, %xmm8 + addsubpd %xmm5, %xmm9 + addsubpd %xmm3, %xmm10 + addsubpd %xmm7, %xmm11 +#endif + +#ifdef CONJ + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 +#endif + +#if defined(LN) || defined(RT) +#ifdef LN + movddup 6 * SIZE(AO), %xmm0 + movddup 7 * SIZE(AO), %xmm1 + movddup 4 * SIZE(AO), %xmm2 + movddup 5 * SIZE(AO), %xmm3 + movddup 0 * SIZE(AO), %xmm4 + movddup 1 * SIZE(AO), %xmm5 +#else + movddup 6 * SIZE(BO), %xmm0 + movddup 7 * SIZE(BO), %xmm1 + movddup 4 * SIZE(BO), %xmm2 + movddup 5 * SIZE(BO), %xmm3 + movddup 0 * SIZE(BO), %xmm4 + movddup 1 * SIZE(BO), %xmm5 +#endif + +#ifdef CONJ + xorpd %xmm7, %xmm1 + xorpd %xmm7, %xmm3 + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm10, %xmm12 + movapd %xmm11, %xmm13 + SHUFPD_1 %xmm12, %xmm12 + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm12 + mulpd %xmm1, %xmm13 + + addsubpd %xmm12, %xmm10 + addsubpd %xmm13, %xmm11 + + movapd %xmm10, %xmm12 + movapd %xmm10, %xmm13 + movapd %xmm11, %xmm14 + movapd %xmm11, %xmm15 + + SHUFPD_1 %xmm13, %xmm13 + SHUFPD_1 %xmm15, %xmm15 + + mulpd %xmm2, %xmm12 + mulpd %xmm2, %xmm14 + mulpd %xmm3, %xmm13 + mulpd %xmm3, %xmm15 + + addsubpd %xmm13, %xmm12 + addsubpd %xmm15, %xmm14 + + subpd %xmm12, %xmm8 + subpd %xmm14, %xmm9 + + movapd %xmm8, %xmm12 + movapd %xmm9, %xmm13 + SHUFPD_1 %xmm12, %xmm12 + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm4, %xmm8 + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm12 + mulpd %xmm5, %xmm13 + + addsubpd %xmm12, %xmm8 + addsubpd %xmm13, %xmm9 +#endif + +#if defined(LT) || defined(RN) + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + movddup 1 * SIZE(AO), %xmm1 + movddup 2 * SIZE(AO), %xmm2 + movddup 3 * SIZE(AO), %xmm3 + movddup 6 * SIZE(AO), %xmm4 + movddup 7 * SIZE(AO), %xmm5 +#else + movddup 0 * SIZE(BO), %xmm0 + movddup 1 * SIZE(BO), %xmm1 + movddup 2 * SIZE(BO), %xmm2 + movddup 3 * SIZE(BO), %xmm3 + movddup 6 * SIZE(BO), %xmm4 + movddup 7 * SIZE(BO), %xmm5 +#endif + +#ifdef CONJ + xorpd %xmm7, %xmm1 + xorpd %xmm7, %xmm3 + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm8, %xmm12 + movapd %xmm9, %xmm13 + SHUFPD_1 %xmm12, %xmm12 + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm12 + mulpd %xmm1, %xmm13 + + addsubpd %xmm12, %xmm8 + addsubpd %xmm13, %xmm9 + + movapd %xmm8, %xmm12 + movapd %xmm8, %xmm13 + movapd %xmm9, %xmm14 + movapd %xmm9, %xmm15 + + SHUFPD_1 %xmm13, %xmm13 + SHUFPD_1 %xmm15, %xmm15 + + mulpd %xmm2, %xmm12 + mulpd %xmm2, %xmm14 + mulpd %xmm3, %xmm13 + mulpd %xmm3, %xmm15 + + addsubpd %xmm13, %xmm12 + addsubpd %xmm15, %xmm14 + + subpd %xmm12, %xmm10 + subpd %xmm14, %xmm11 + + movapd %xmm10, %xmm12 + movapd %xmm11, %xmm13 + SHUFPD_1 %xmm12, %xmm12 + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm4, %xmm10 + mulpd %xmm4, %xmm11 + mulpd %xmm5, %xmm12 + mulpd %xmm5, %xmm13 + + addsubpd %xmm12, %xmm10 + addsubpd 
%xmm13, %xmm11 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm10, 2 * SIZE(CO1) + movhpd %xmm10, 3 * SIZE(CO1) + + movsd %xmm9, 0 * SIZE(CO2) + movhpd %xmm9, 1 * SIZE(CO2) + movsd %xmm11, 2 * SIZE(CO2) + movhpd %xmm11, 3 * SIZE(CO2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) + + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) + movsd %xmm11, 2 * SIZE(CO2) + movhpd %xmm11, 3 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, 0 * SIZE(BO) + movapd %xmm9, 2 * SIZE(BO) + movapd %xmm10, 4 * SIZE(BO) + movapd %xmm11, 6 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm9, 2 * SIZE(AO) + movapd %xmm10, 4 * SIZE(AO) + movapd %xmm11, 6 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + decq I # i -- + jg .L10 + ALIGN_4 + +.L99: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + decq J # j -- + jg .L01 + +.L100: + testq $1, N + jle .L999 + +.L101: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + movq C, CO1 # coffset1 = c +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif +#ifdef LT + movq OFFSET, KK +#endif + + testq $1, M + jle .L130 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L142 + +.L141: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 6 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 16 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm3 + movddup 16 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 10 * SIZE(AO), %xmm10 + ADD2 %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm2 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + 
movapd 12 * SIZE(AO), %xmm10 + ADD2 %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 14 * SIZE(AO), %xmm10 + ADD2 %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm2 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 24 * SIZE(AO), %xmm10 + ADD2 %xmm11, %xmm3 + movddup 24 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L141 + +.L142: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + jle .L144 + +.L143: + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L143 + ALIGN_4 + +.L144: + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + + SHUFPD_1 %xmm1, %xmm1 + +#ifndef CONJ + addsubpd %xmm1, %xmm0 +#else + addsubpd %xmm0, %xmm1 +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + subq $1, %rax + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BO), %xmm8 +#else + movapd 0 * SIZE(AO), %xmm8 +#endif + +#if (defined(LN) || defined(LT)) && !defined(CONJ) + subpd %xmm0, %xmm8 +#elif (defined(LN) || defined(LT)) && defined(CONJ) + subpd %xmm1, %xmm8 +#elif (defined(RN) || defined(RT)) && !defined(CONJ) + subpd %xmm0, %xmm8 +#else + addsubpd %xmm1, %xmm8 +#endif + +#ifdef CONJ + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 +#endif + +#ifdef LN + movddup 0 * SIZE(AO), %xmm4 + movddup 1 * SIZE(AO), %xmm5 + +#ifdef CONJ + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm8, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + mulpd %xmm4, %xmm8 + mulpd %xmm5, %xmm12 + addsubpd %xmm12, %xmm8 +#endif + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + movddup 1 * SIZE(AO), %xmm1 + +#ifdef CONJ + xorpd %xmm7, %xmm1 +#endif + + movapd %xmm8, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + mulpd %xmm0, %xmm8 + mulpd %xmm1, %xmm12 + addsubpd %xmm12, %xmm8 +#endif + +#ifdef RN + movddup 0 * SIZE(BO), %xmm0 + movddup 1 * SIZE(BO), %xmm1 + +#ifdef CONJ + xorpd %xmm7, %xmm1 +#endif + + movapd %xmm8, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + mulpd %xmm0, %xmm8 + mulpd %xmm1, %xmm12 + + addsubpd %xmm12, %xmm8 +#endif + +#ifdef RT + movddup 0 * SIZE(BO), %xmm4 + movddup 1 * SIZE(BO), %xmm5 + +#ifdef CONJ + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm8, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + mulpd %xmm4, %xmm8 + mulpd %xmm5, %xmm12 + + addsubpd %xmm12, %xmm8 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, 0 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L130: + movq M, I + sarq $1, I # i = (m >> 2) + jle .L149 + ALIGN_4 + +.L110: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if 
defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm4, %xmm4 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm5, %xmm5 + +#ifdef LN + prefetchnta -4 * SIZE(CO1) +#else + prefetchnta 4 * SIZE(CO1) +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L112 + +.L111: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm1 + movddup 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm4 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm5 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 6 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm4 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 16 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm5 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 10 * SIZE(AO), %xmm10 + ADD2 %xmm9, %xmm1 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm4 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 12 * SIZE(AO), %xmm10 + ADD2 %xmm9, %xmm5 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 14 * SIZE(AO), %xmm10 + ADD2 %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm4 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 40 * SIZE(AO), %xmm10 + ADD2 %xmm9, %xmm5 + movddup 16 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm11 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + ADD1 %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 18 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm1 + movddup 8 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm4 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 20 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm5 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 22 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm4 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 24 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm5 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 26 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm1 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm4 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 28 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm5 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 30 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm4 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 32 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm5 + movddup 24 * SIZE(BO), %xmm11 + + addq $32 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne 
.L111 + ALIGN_4 + +.L112: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + jle .L114 + +.L113: + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm10 + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 0 * SIZE(BO), %xmm11 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + movapd 4 * SIZE(AO), %xmm8 + ADD1 %xmm11, %xmm4 + movddup 1 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD2 %xmm11, %xmm5 + + addq $4 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L113 + ALIGN_4 + +.L114: + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm5, %xmm5 + +#ifndef CONJ + addsubpd %xmm1, %xmm0 + addsubpd %xmm5, %xmm4 +#else + addsubpd %xmm0, %xmm1 + addsubpd %xmm4, %xmm5 +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BO), %xmm8 + movapd 2 * SIZE(BO), %xmm9 +#else + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm9 +#endif + +#if (defined(LN) || defined(LT)) && !defined(CONJ) + subpd %xmm0, %xmm8 + subpd %xmm4, %xmm9 +#elif (defined(LN) || defined(LT)) && defined(CONJ) + subpd %xmm1, %xmm8 + subpd %xmm5, %xmm9 +#elif (defined(RN) || defined(RT)) && !defined(CONJ) + subpd %xmm0, %xmm8 + subpd %xmm4, %xmm9 +#else + addsubpd %xmm1, %xmm8 + addsubpd %xmm5, %xmm9 +#endif + +#ifdef CONJ + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 +#endif + +#ifdef LN + movddup 6 * SIZE(AO), %xmm0 + movddup 7 * SIZE(AO), %xmm1 + movddup 4 * SIZE(AO), %xmm2 + movddup 5 * SIZE(AO), %xmm3 + movddup 0 * SIZE(AO), %xmm4 + movddup 1 * SIZE(AO), %xmm5 + +#ifdef CONJ + xorpd %xmm7, %xmm1 + xorpd %xmm7, %xmm3 + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm9, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm12 + addsubpd %xmm12, %xmm9 + movapd %xmm9, %xmm12 + movapd %xmm9, %xmm13 + SHUFPD_1 %xmm13, %xmm13 + mulpd %xmm2, %xmm12 + mulpd %xmm3, %xmm13 + addsubpd %xmm13, %xmm12 + subpd %xmm12, %xmm8 + movapd %xmm8, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + mulpd %xmm4, %xmm8 + mulpd %xmm5, %xmm12 + addsubpd %xmm12, %xmm8 +#endif + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + movddup 1 * SIZE(AO), %xmm1 + movddup 2 * SIZE(AO), %xmm2 + movddup 3 * SIZE(AO), %xmm3 + movddup 6 * SIZE(AO), %xmm4 + movddup 7 * SIZE(AO), %xmm5 + +#ifdef CONJ + xorpd %xmm7, %xmm1 + xorpd %xmm7, %xmm3 + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm8, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + mulpd %xmm0, %xmm8 + mulpd %xmm1, %xmm12 + addsubpd %xmm12, %xmm8 + movapd %xmm8, %xmm12 + movapd %xmm8, %xmm13 + SHUFPD_1 %xmm13, %xmm13 + mulpd %xmm2, %xmm12 + mulpd %xmm3, %xmm13 + addsubpd %xmm13, %xmm12 + subpd %xmm12, %xmm9 + movapd %xmm9, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm12 + addsubpd %xmm12, %xmm9 +#endif + +#ifdef RN + movddup 0 * SIZE(BO), %xmm0 + movddup 1 * SIZE(BO), %xmm1 + +#ifdef CONJ + xorpd %xmm7, %xmm1 +#endif + + movapd %xmm8, %xmm12 + movapd %xmm9, %xmm13 + SHUFPD_1 %xmm12, %xmm12 + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm12 + mulpd %xmm1, %xmm13 + + addsubpd %xmm12, %xmm8 + addsubpd %xmm13, %xmm9 +#endif + +#ifdef RT + movddup 0 * SIZE(BO), %xmm4 + movddup 1 * SIZE(BO), %xmm5 + +#ifdef CONJ + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm8, %xmm12 + movapd %xmm9, %xmm13 + SHUFPD_1 %xmm12, %xmm12 + 
SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm4, %xmm8 + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm12 + mulpd %xmm5, %xmm13 + + addsubpd %xmm12, %xmm8 + addsubpd %xmm13, %xmm9 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, 0 * SIZE(BO) + movapd %xmm9, 2 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm9, 2 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L110 + ALIGN_4 + +.L149: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_3 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/ztrsm_kernel_LN_2x4_nehalem.S b/kernel/x86_64/ztrsm_kernel_LN_2x4_nehalem.S new file mode 100644 index 0000000000..fc5a4a317a --- /dev/null +++ b/kernel/x86_64/ztrsm_kernel_LN_2x4_nehalem.S @@ -0,0 +1,3116 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define KK %rdx +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define J 56(%rsp) +#define KKK 64(%rsp) +#define AORIG 72(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define J 232(%rsp) +#define KKK 240(%rsp) +#define AORIG 248(%rsp) + +#endif + +#define PREFETCHSIZE (16 * 1 + 4) +#define PREFETCH prefetcht0 + +#define ADD1 addps +#define ADD2 addps + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C +#endif + + subq $-32 * SIZE, A + subq $-32 * SIZE, B + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + salq $ZBASE_SHIFT, LDC + + movq KK, OFFSET + negq KK + +#ifdef LN + movq M, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + movq N, %rax + salq $ZBASE_SHIFT, %rax + imulq K, %rax + addq %rax, B + + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RT + movq N, KK + subq OFFSET, KK +#endif + + movq N, J + sarq $2, J + NOBRANCH + jle .L30 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $2 + ZBASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 2), CO2 +#ifndef RT + leaq (C, LDC, 4), C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif + + movq K, %rax + salq $ZBASE_SHIFT + 2, %rax + leaq (B, 
%rax), BB + +#ifdef LT + movq OFFSET, KK +#endif + + testq $1, M + BRANCH + jle .L20 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq AORIG, AO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movaps -32 * SIZE(BO), %xmm5 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_3 + +.L22: + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + ADD1 %xmm3, %xmm10 + pshufd $0xa0, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0xf5, %xmm5, %xmm4 + movaps -24 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm4 + movddup -30 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -20 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + ADD1 %xmm3, %xmm10 + pshufd $0xa0, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0xf5, %xmm5, %xmm4 + movaps -16 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm4 + movddup -28 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -12 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + ADD1 %xmm3, %xmm10 + pshufd $0xa0, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0xf5, %xmm5, %xmm4 + movaps -8 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm4 + movddup -26 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -4 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + ADD1 %xmm3, %xmm10 + pshufd $0xa0, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0xf5, %xmm5, %xmm4 + movaps 0 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm4 + movddup -24 * SIZE(AO), %xmm0 + + subq $-32 * SIZE, BO + subq $ -8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L22 + ALIGN_3 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_3 + +.L26: + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + ADD1 %xmm3, %xmm10 + pshufd $0xa0, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0xf5, %xmm5, %xmm4 + movaps -24 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm4 + movddup -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_3 + +.L28: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + salq $ZBASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#endif + + ADD1 %xmm1, %xmm8 + ADD2 %xmm2, %xmm9 + ADD1 %xmm3, %xmm10 + ADD2 %xmm4, %xmm11 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(LN) || defined(LT) + +#ifndef CONJ + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm11, %xmm11 +#else + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm10 + shufps $0xb1, %xmm9, %xmm9 + 
shufps $0xb1, %xmm11, %xmm11 +#endif + +#else + +#ifndef CONJ + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm11, %xmm11 +#else + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm11, %xmm11 + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 +#endif + +#endif + + addps %xmm9, %xmm8 + addps %xmm11, %xmm10 + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(BO), %xmm9 + movaps -28 * SIZE(BO), %xmm11 + + subps %xmm8, %xmm9 + subps %xmm10, %xmm11 +#else + movaps -32 * SIZE(AO), %xmm9 + movaps -28 * SIZE(AO), %xmm13 + + subps %xmm8, %xmm9 + subps %xmm10, %xmm13 + + movhlps %xmm9, %xmm11 + movhlps %xmm13, %xmm15 +#endif + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + shufps $0xb1, %xmm7, %xmm7 +#endif + +#if defined(LN) || defined(LT) + movsd -32 * SIZE(AO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm10 + pshufd $0xb1, %xmm11, %xmm12 + + xorps %xmm7, %xmm10 + xorps %xmm7, %xmm12 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm10 + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm12 + + addps %xmm10, %xmm9 + addps %xmm12, %xmm11 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm8 + + xorps %xmm7, %xmm8 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm8 + + addps %xmm8, %xmm9 + + movaps %xmm9, %xmm3 + pshufd $0xb1, %xmm9, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm11 + subps %xmm1, %xmm11 + + movaps -28 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm13 + subps %xmm1, %xmm13 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm15 + subps %xmm1, %xmm15 + + movaps -24 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 + + movaps %xmm11, %xmm3 + pshufd $0xb1, %xmm11, %xmm2 + + xorps %xmm7, %xmm2 + + movaps -20 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm13 + subps %xmm1, %xmm13 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm15 + subps %xmm1, %xmm15 + + movaps -12 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm13, %xmm12 + + xorps %xmm7, %xmm12 + + mulps %xmm0, %xmm13 + mulps %xmm1, %xmm12 + + addps %xmm12, %xmm13 + + movaps %xmm13, %xmm3 + pshufd $0xb1, %xmm13, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm15 + subps %xmm1, %xmm15 + + movaps -4 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm15, %xmm14 + + xorps %xmm7, %xmm14 + + mulps %xmm0, %xmm15 + mulps %xmm1, %xmm14 + + addps %xmm14, %xmm15 +#endif + +#ifdef RT + movaps -4 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm15, %xmm14 + + xorps %xmm7, %xmm14 + + mulps %xmm0, %xmm15 + mulps %xmm1, %xmm14 + + addps %xmm14, %xmm15 + + movaps %xmm15, %xmm3 + pshufd $0xb1, %xmm15, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0x00, 
%xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm13 + subps %xmm1, %xmm13 + + movaps -8 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm11 + subps %xmm1, %xmm11 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm9 + subps %xmm1, %xmm9 + + movaps -12 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm13, %xmm12 + + xorps %xmm7, %xmm12 + + mulps %xmm0, %xmm13 + mulps %xmm1, %xmm12 + + addps %xmm12, %xmm13 + + movaps %xmm13, %xmm3 + pshufd $0xb1, %xmm13, %xmm2 + + xorps %xmm7, %xmm2 + + movaps -16 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm11 + subps %xmm1, %xmm11 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm9 + subps %xmm1, %xmm9 + + movaps -24 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 + + movaps %xmm11, %xmm3 + pshufd $0xb1, %xmm11, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm9 + subps %xmm1, %xmm9 + + movaps -32 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm8 + + xorps %xmm7, %xmm8 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm8 + + addps %xmm8, %xmm9 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm9, -32 * SIZE(BO) + movaps %xmm11, -28 * SIZE(BO) + + movsd %xmm9, (CO1) + movhps %xmm9, (CO1, LDC) + movsd %xmm11, (CO2) + movhps %xmm11, (CO2, LDC) +#else + movlhps %xmm11, %xmm9 + movlhps %xmm15, %xmm13 + + movaps %xmm9, -32 * SIZE(AO) + movaps %xmm13, -28 * SIZE(AO) + + movlps %xmm9, (CO1) + movlps %xmm11, (CO1, LDC) + movlps %xmm13, (CO2) + movlps %xmm15, (CO2, LDC) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L20: + movq M, I + sarq $1, I + NOBRANCH + jle .L29 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq AORIG, AO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + prefetchnta -32 * SIZE(BB) + subq $-16 * SIZE, BB + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht2 4 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht2 4 * SIZE(CO1, LDC, 1) + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + + xorps %xmm12, %xmm12 + prefetcht2 4 * SIZE(CO2) + xorps %xmm13, %xmm13 + prefetcht2 4 * SIZE(CO2, LDC, 1) + xorps %xmm14, %xmm14 + xorps %xmm15, %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax 
+ NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + ADD1 %xmm1, %xmm12 + movaps -32 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm15 + pshufd $0xb1, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + ADD1 %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD1 %xmm5, %xmm10 + ADD2 %xmm6, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -28 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm12 + movaps -24 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm15 + pshufd $0xb1, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + ADD1 %xmm1, %xmm8 + movaps -20 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD1 %xmm5, %xmm10 + ADD2 %xmm6, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -24 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm12 + movaps -16 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm15 + pshufd $0xb1, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + ADD1 %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD1 %xmm5, %xmm10 + ADD2 %xmm6, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -20 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm12 + movaps -8 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm15 + pshufd $0xb1, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + ADD1 %xmm1, %xmm8 + movaps -4 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + subq $-32 * SIZE, BO + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD1 %xmm5, %xmm10 + ADD2 %xmm6, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, AO + subq $1, %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + ADD1 %xmm1, %xmm12 + movaps -32 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm15 + pshufd $0xb1, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + ADD1 %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD1 %xmm5, %xmm10 + ADD2 %xmm6, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_3 + +.L18: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN 
+ subq $2, %rax +#else + subq $4, %rax +#endif + + salq $ZBASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + + ADD1 %xmm1, %xmm12 + ADD2 %xmm2, %xmm13 + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm15 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(LN) || defined(LT) + +#ifndef CONJ + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm10 + xorps %xmm0, %xmm12 + xorps %xmm0, %xmm14 +#else + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 + xorps %xmm0, %xmm13 + xorps %xmm0, %xmm15 +#endif + +#else + +#ifndef CONJ + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm10 + xorps %xmm0, %xmm12 + xorps %xmm0, %xmm14 +#else + shufps $0xb1, %xmm0, %xmm0 + + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 + xorps %xmm0, %xmm13 + xorps %xmm0, %xmm15 +#endif + +#endif + + haddps %xmm9, %xmm8 + haddps %xmm11, %xmm10 + haddps %xmm13, %xmm12 + haddps %xmm15, %xmm14 + + shufps $0xd8, %xmm8, %xmm8 + shufps $0xd8, %xmm10, %xmm10 + shufps $0xd8, %xmm12, %xmm12 + shufps $0xd8, %xmm14, %xmm14 + + movaps %xmm8, %xmm9 + shufps $0xe4, %xmm10, %xmm8 + shufps $0xe4, %xmm9, %xmm10 + + movaps %xmm12, %xmm13 + shufps $0xe4, %xmm14, %xmm12 + shufps $0xe4, %xmm13, %xmm14 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm9 + movlhps %xmm10, %xmm8 + movhlps %xmm9, %xmm10 + + movaps %xmm12, %xmm11 + movlhps %xmm14, %xmm12 + movhlps %xmm11, %xmm14 + + movaps -32 * SIZE(BO), %xmm9 + movaps -28 * SIZE(BO), %xmm13 + movaps -24 * SIZE(BO), %xmm11 + movaps -20 * SIZE(BO), %xmm15 + + subps %xmm8, %xmm9 + subps %xmm10, %xmm11 + subps %xmm12, %xmm13 + subps %xmm14, %xmm15 +#else + movaps -32 * SIZE(AO), %xmm9 + movaps -28 * SIZE(AO), %xmm11 + movaps -24 * SIZE(AO), %xmm13 + movaps -20 * SIZE(AO), %xmm15 + + subps %xmm8, %xmm9 + subps %xmm10, %xmm11 + subps %xmm12, %xmm13 + subps %xmm14, %xmm15 +#endif + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + shufps $0xb1, %xmm7, %xmm7 +#endif + +#ifdef LN + movaps -28 * SIZE(AO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + pshufd $0xb1, %xmm15, %xmm14 + + xorps %xmm7, %xmm10 + xorps %xmm7, %xmm14 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + mulps %xmm0, %xmm15 + mulps %xmm1, %xmm14 + + addps %xmm10, %xmm11 + addps %xmm14, %xmm15 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + movaps %xmm11, %xmm3 + pshufd $0xb1, %xmm11, %xmm2 + movaps %xmm15, %xmm5 + pshufd $0xb1, %xmm15, %xmm4 + + xorps %xmm7, %xmm2 + xorps %xmm7, %xmm4 + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm2 + mulps %xmm0, %xmm5 + mulps %xmm1, %xmm4 + + subps %xmm3, %xmm9 + subps %xmm2, %xmm9 + subps %xmm5, %xmm13 + subps %xmm4, %xmm13 + + movaps -32 * SIZE(AO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm10 + pshufd $0xb1, %xmm13, %xmm14 + + xorps %xmm7, %xmm10 + xorps %xmm7, %xmm14 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm10 + mulps %xmm0, %xmm13 + mulps %xmm1, %xmm14 + + addps %xmm10, %xmm9 + addps %xmm14, %xmm13 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm10 + pshufd $0xb1, %xmm13, %xmm14 + + xorps %xmm7, %xmm10 + xorps %xmm7, %xmm14 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm10 + mulps %xmm0, %xmm13 + mulps %xmm1, %xmm14 + + addps %xmm10, %xmm9 + addps %xmm14, %xmm13 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + movaps %xmm9, %xmm3 + pshufd $0xb1, %xmm9, %xmm2 + movaps %xmm13, %xmm5 + pshufd $0xb1, %xmm13, %xmm4 + + xorps %xmm7, %xmm2 + xorps %xmm7, %xmm4 + + mulps 
%xmm0, %xmm3 + mulps %xmm1, %xmm2 + mulps %xmm0, %xmm5 + mulps %xmm1, %xmm4 + + subps %xmm3, %xmm11 + subps %xmm2, %xmm11 + subps %xmm5, %xmm15 + subps %xmm4, %xmm15 + + movaps -28 * SIZE(AO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + pshufd $0xb1, %xmm15, %xmm14 + + xorps %xmm7, %xmm10 + xorps %xmm7, %xmm14 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + mulps %xmm0, %xmm15 + mulps %xmm1, %xmm14 + + addps %xmm10, %xmm11 + addps %xmm14, %xmm15 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm8 + + xorps %xmm7, %xmm8 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm8 + + addps %xmm8, %xmm9 + + movaps %xmm9, %xmm3 + pshufd $0xb1, %xmm9, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm11 + subps %xmm1, %xmm11 + + movaps -28 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm13 + subps %xmm1, %xmm13 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm15 + subps %xmm1, %xmm15 + + movaps -24 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 + + movaps %xmm11, %xmm3 + pshufd $0xb1, %xmm11, %xmm2 + + xorps %xmm7, %xmm2 + + movaps -20 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm13 + subps %xmm1, %xmm13 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm15 + subps %xmm1, %xmm15 + + movaps -12 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm13, %xmm12 + + xorps %xmm7, %xmm12 + + mulps %xmm0, %xmm13 + mulps %xmm1, %xmm12 + + addps %xmm12, %xmm13 + + movaps %xmm13, %xmm3 + pshufd $0xb1, %xmm13, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm15 + subps %xmm1, %xmm15 + + movaps -4 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm15, %xmm14 + + xorps %xmm7, %xmm14 + + mulps %xmm0, %xmm15 + mulps %xmm1, %xmm14 + + addps %xmm14, %xmm15 +#endif + +#ifdef RT + movaps -4 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm15, %xmm14 + + xorps %xmm7, %xmm14 + + mulps %xmm0, %xmm15 + mulps %xmm1, %xmm14 + + addps %xmm14, %xmm15 + + movaps %xmm15, %xmm3 + pshufd $0xb1, %xmm15, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm13 + subps %xmm1, %xmm13 + + movaps -8 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm11 + subps %xmm1, %xmm11 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm9 + subps %xmm1, %xmm9 + + movaps -12 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm13, %xmm12 + + xorps %xmm7, %xmm12 + + 
mulps %xmm0, %xmm13 + mulps %xmm1, %xmm12 + + addps %xmm12, %xmm13 + + movaps %xmm13, %xmm3 + pshufd $0xb1, %xmm13, %xmm2 + + xorps %xmm7, %xmm2 + + movaps -16 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm11 + subps %xmm1, %xmm11 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm9 + subps %xmm1, %xmm9 + + + movaps -24 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 + + movaps %xmm11, %xmm3 + pshufd $0xb1, %xmm11, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm9 + subps %xmm1, %xmm9 + + movaps -32 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm8 + + xorps %xmm7, %xmm8 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm8 + + addps %xmm8, %xmm9 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm9, -32 * SIZE(BO) + movaps %xmm13, -28 * SIZE(BO) + movaps %xmm11, -24 * SIZE(BO) + movaps %xmm15, -20 * SIZE(BO) + + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm11, 2 * SIZE(CO1) + movhps %xmm9, 0 * SIZE(CO1, LDC) + movhps %xmm11, 2 * SIZE(CO1, LDC) + + movsd %xmm13, 0 * SIZE(CO2) + movsd %xmm15, 2 * SIZE(CO2) + movhps %xmm13, 0 * SIZE(CO2, LDC) + movhps %xmm15, 2 * SIZE(CO2, LDC) +#else + movaps %xmm9, -32 * SIZE(AO) + movaps %xmm11, -28 * SIZE(AO) + movaps %xmm13, -24 * SIZE(AO) + movaps %xmm15, -20 * SIZE(AO) + + movsd %xmm9, 0 * SIZE(CO1) + movhps %xmm9, 2 * SIZE(CO1) + movsd %xmm11, 0 * SIZE(CO1, LDC) + movhps %xmm11, 2 * SIZE(CO1, LDC) + movsd %xmm13, 0 * SIZE(CO2) + movhps %xmm13, 2 * SIZE(CO2) + movsd %xmm15, 0 * SIZE(CO2, LDC) + movhps %xmm15, 2 * SIZE(CO2, LDC) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + BRANCH + jg .L11 + ALIGN_4 + +.L29: +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + + subq $1, J + BRANCH + jg .L01 + ALIGN_4 + +.L30: + testq $2, N + BRANCH + jle .L50 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif + +#ifdef LT + movq OFFSET, KK +#endif + + testq $1, M + BRANCH + jle .L40 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq AORIG, AO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movaps -32 * SIZE(BO), %xmm5 + xorps %xmm3, 
%xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L45 + ALIGN_3 + +.L42: + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -30 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -24 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -28 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -20 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -26 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -16 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -24 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, BO + subq $ -8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L42 + ALIGN_3 + +.L45: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L46 + ALIGN_3 + +.L48: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + salq $ZBASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + + ADD1 %xmm1, %xmm8 + ADD2 %xmm2, %xmm9 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(LN) || defined(LT) + +#ifndef CONJ + xorps %xmm0, %xmm9 + shufps $0xb1, %xmm9, %xmm9 +#else + xorps %xmm0, %xmm8 + shufps $0xb1, %xmm9, %xmm9 +#endif + +#else + +#ifndef CONJ + xorps %xmm0, %xmm9 + shufps $0xb1, %xmm9, %xmm9 +#else + shufps $0xb1, %xmm9, %xmm9 + xorps %xmm0, %xmm9 +#endif + +#endif + + addps %xmm9, %xmm8 + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(BO), %xmm9 + + subps %xmm8, %xmm9 +#else + movaps -32 * SIZE(AO), %xmm9 + + subps %xmm8, %xmm9 + movhlps %xmm9, %xmm11 +#endif + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + shufps $0xb1, %xmm7, %xmm7 +#endif + +#if defined(LN) || defined(LT) + movsd -32 * SIZE(AO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm9 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm8 + + xorps %xmm7, %xmm8 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm8 + + addps %xmm8, %xmm9 + + movaps %xmm9, %xmm3 + pshufd $0xb1, %xmm9, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm11 + subps %xmm1, %xmm11 + + movaps -28 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 +#endif + +#ifdef RT + movaps -28 * SIZE(BO), %xmm5 + + 
pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 + + movaps %xmm11, %xmm3 + pshufd $0xb1, %xmm11, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm9 + subps %xmm1, %xmm9 + + movaps -32 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm8 + + xorps %xmm7, %xmm8 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm8 + + addps %xmm8, %xmm9 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm9, -32 * SIZE(BO) + + movlps %xmm9, (CO1) + movhps %xmm9, (CO2) +#else + movlps %xmm9, -32 * SIZE(AO) + movlps %xmm11, -30 * SIZE(AO) + + movlps %xmm9, (CO1) + movlps %xmm11, (CO2) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L40: + movq M, I + sarq $1, I + NOBRANCH + jle .L49 + ALIGN_4 + +.L31: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq AORIG, AO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht2 4 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht2 4 * SIZE(CO2) + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L35 + ALIGN_3 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + ADD1 %xmm1, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm10 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm10 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movaps -24 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm10 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -20 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movaps -20 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm10 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, BO + subq $-16 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L32 + ALIGN_3 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax 
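+/* Remainder count for this 2x2 tile: under LT/RN only the first KK steps of
+   K belong to the update, under LN/RT the last K - KK do; the low two bits
+   tested below are the k steps left over after the 4-way unrolled loop. */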
+#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_3 + +.L36: + ADD1 %xmm1, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm10 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L36 + ALIGN_3 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + salq $ZBASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + + ADD1 %xmm1, %xmm8 + ADD2 %xmm2, %xmm9 + ADD1 %xmm3, %xmm10 + ADD2 %xmm4, %xmm11 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(LN) || defined(LT) + +#ifndef CONJ + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm10 +#else + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 +#endif + +#else + +#ifndef CONJ + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm10 +#else + shufps $0xb1, %xmm0, %xmm0 + + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 +#endif + +#endif + + haddps %xmm9, %xmm8 + haddps %xmm11, %xmm10 + + shufps $0xd8, %xmm8, %xmm8 + shufps $0xd8, %xmm10, %xmm10 + + movaps %xmm8, %xmm9 + shufps $0xe4, %xmm10, %xmm8 + shufps $0xe4, %xmm9, %xmm10 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm9 + movlhps %xmm10, %xmm8 + movhlps %xmm9, %xmm10 + + movaps -32 * SIZE(BO), %xmm9 + movaps -28 * SIZE(BO), %xmm11 + + subps %xmm8, %xmm9 + subps %xmm10, %xmm11 +#else + movaps -32 * SIZE(AO), %xmm9 + movaps -28 * SIZE(AO), %xmm11 + + subps %xmm8, %xmm9 + subps %xmm10, %xmm11 +#endif + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + shufps $0xb1, %xmm7, %xmm7 +#endif + +#ifdef LN + movaps -28 * SIZE(AO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + movaps %xmm11, %xmm3 + pshufd $0xb1, %xmm11, %xmm2 + + xorps %xmm7, %xmm2 + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm2 + + subps %xmm3, %xmm9 + subps %xmm2, %xmm9 + + movaps -32 * SIZE(AO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm9 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm9 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + movaps %xmm9, %xmm3 + pshufd $0xb1, %xmm9, %xmm2 + + xorps %xmm7, %xmm2 + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm2 + + subps %xmm3, %xmm11 + subps %xmm2, %xmm11 + + movaps -28 * SIZE(AO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm8 + + xorps %xmm7, %xmm8 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm8 + + addps %xmm8, %xmm9 + + movaps %xmm9, %xmm3 + pshufd $0xb1, %xmm9, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + 
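+/* RN forward substitution over the two columns: the first column, already
+   scaled above by the stored diagonal entry of B, has just been multiplied
+   by the off-diagonal entry; the two subtractions below remove that
+   contribution from the second column before it is scaled by its own
+   diagonal. */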
subps %xmm0, %xmm11 + subps %xmm1, %xmm11 + + movaps -28 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 +#endif + +#ifdef RT + movaps -28 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 + + movaps %xmm11, %xmm3 + pshufd $0xb1, %xmm11, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm9 + subps %xmm1, %xmm9 + + movaps -32 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm8 + + xorps %xmm7, %xmm8 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm8 + + addps %xmm8, %xmm9 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm9, -32 * SIZE(BO) + movaps %xmm11, -28 * SIZE(BO) + + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm11, 2 * SIZE(CO1) + movhps %xmm9, 0 * SIZE(CO2) + movhps %xmm11, 2 * SIZE(CO2) +#else + movaps %xmm9, -32 * SIZE(AO) + movaps %xmm11, -28 * SIZE(AO) + + movsd %xmm9, 0 * SIZE(CO1) + movhps %xmm9, 2 * SIZE(CO1) + movsd %xmm11, 0 * SIZE(CO2) + movhps %xmm11, 2 * SIZE(CO2) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + BRANCH + jg .L31 + ALIGN_4 + +.L49: +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L50: + testq $1, N + BRANCH + jle .L999 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + + movq C, CO1 +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif + +#ifdef LT + movq OFFSET, KK +#endif + + testq $1, M + BRANCH + jle .L60 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq AORIG, AO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd -32 * SIZE(BO), %xmm5 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L65 + ALIGN_3 + +.L62: + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movsd -30 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -30 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movsd -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -28 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 
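+	/* pshufd $0xa0 duplicates the real part of the current B element across
+	   the (re,im) pair and $0xf5 the imaginary part; the ADD1/ADD2 macros
+	   (defined earlier in this file) keep the two partial products in
+	   separate accumulators, which are merged with a sign mask and a $0xb1
+	   swap in the tail at .L68. */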
+ movsd -26 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -26 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movsd -24 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -24 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, BO + subq $-8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L62 + ALIGN_3 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_3 + +.L66: + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movsd -30 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L66 + ALIGN_3 + +.L68: +#if defined(LN) || defined(RT) + movq KK, %rax + subq $1, %rax + + salq $ZBASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + + ADD1 %xmm1, %xmm8 + ADD2 %xmm2, %xmm9 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(LN) || defined(LT) + +#ifndef CONJ + xorps %xmm0, %xmm9 + shufps $0xb1, %xmm9, %xmm9 +#else + xorps %xmm0, %xmm8 + shufps $0xb1, %xmm9, %xmm9 +#endif + +#else + +#ifndef CONJ + xorps %xmm0, %xmm9 + shufps $0xb1, %xmm9, %xmm9 +#else + shufps $0xb1, %xmm9, %xmm9 + xorps %xmm0, %xmm9 +#endif + +#endif + + addps %xmm9, %xmm8 + +#if defined(LN) || defined(LT) + movsd -32 * SIZE(BO), %xmm9 +#else + movsd -32 * SIZE(AO), %xmm9 +#endif + + subps %xmm8, %xmm9 + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + shufps $0xb1, %xmm7, %xmm7 +#endif + +#if defined(LN) || defined(LT) + movsd -32 * SIZE(AO), %xmm5 +#endif + +#if defined(RN) || defined(RT) + movsd -32 * SIZE(BO), %xmm5 +#endif + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm9 + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm9, -32 * SIZE(BO) +#else + movlps %xmm9, -32 * SIZE(AO) +#endif + + movlps %xmm9, (CO1) + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L60: + movq M, I + sarq $1, I + NOBRANCH + jle .L69 + ALIGN_4 + +.L51: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq AORIG, AO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht2 4 * SIZE(CO1) + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L55 + ALIGN_3 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + ADD1 %xmm1, %xmm8 + movddup -32 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movddup -30 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd 
$0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + mulps %xmm0, %xmm2 + movaps -24 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movddup -28 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movddup -26 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + mulps %xmm0, %xmm2 + movaps -16 * SIZE(AO), %xmm0 + + subq $ -8 * SIZE, BO + subq $-16 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L52 + ALIGN_3 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_3 + +.L56: + ADD1 %xmm1, %xmm8 + movddup -32 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L56 + ALIGN_3 + +.L58: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + salq $ZBASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + + ADD1 %xmm1, %xmm8 + ADD2 %xmm2, %xmm9 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(LN) || defined(LT) + +#ifndef CONJ + xorps %xmm0, %xmm8 +#else + xorps %xmm0, %xmm9 +#endif + +#else + +#ifndef CONJ + xorps %xmm0, %xmm8 +#else + shufps $0xb1, %xmm0, %xmm0 + + xorps %xmm0, %xmm9 +#endif + +#endif + + haddps %xmm9, %xmm8 + + shufps $0xd8, %xmm8, %xmm8 + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(BO), %xmm9 + + subps %xmm8, %xmm9 + movhlps %xmm9, %xmm11 +#else + movaps -32 * SIZE(AO), %xmm9 + + subps %xmm8, %xmm9 +#endif + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + shufps $0xb1, %xmm7, %xmm7 +#endif + +#ifdef LN + movaps -28 * SIZE(AO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + movaps %xmm11, %xmm3 + pshufd $0xb1, %xmm11, %xmm2 + + xorps %xmm7, %xmm2 + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm2 + + subps %xmm3, %xmm9 + subps %xmm2, %xmm9 + + movaps -32 * SIZE(AO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm9 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm9 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + movaps %xmm9, %xmm3 + pshufd $0xb1, %xmm9, %xmm2 + + xorps %xmm7, %xmm2 + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm2 + + subps %xmm3, %xmm11 + subps %xmm2, %xmm11 + + movaps -28 * SIZE(AO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 +#endif + +#if defined(RN) || defined(RT) + movaps -32 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm8 + + xorps %xmm7, %xmm8 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm8 + + addps %xmm8, %xmm9 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps 
%xmm9, -32 * SIZE(BO)
+	movlps	%xmm11, -30 * SIZE(BO)
+
+	movlps	%xmm9,  0 * SIZE(CO1)
+	movlps	%xmm11, 2 * SIZE(CO1)
+#else
+	movaps	%xmm9, -32 * SIZE(AO)
+
+	movlps	%xmm9, 0 * SIZE(CO1)
+	movhps	%xmm9, 2 * SIZE(CO1)
+#endif
+
+#ifndef LN
+	addq	$4 * SIZE, CO1
+#endif
+
+#if defined(LT) || defined(RN)
+	movq	K, %rax
+	subq	KK, %rax
+	salq	$ZBASE_SHIFT, %rax
+	leaq	(AO, %rax, 2), AO
+	leaq	(BO, %rax, 1), BO
+#endif
+
+#ifdef LN
+	subq	$2, KK
+#endif
+
+#ifdef LT
+	addq	$2, KK
+#endif
+
+#ifdef RT
+	movq	K, %rax
+	salq	$1 + ZBASE_SHIFT, %rax
+	addq	%rax, AORIG
+#endif
+
+	decq	I			# i --
+	BRANCH
+	jg	.L51
+	ALIGN_4
+
+.L69:
+#ifdef LN
+	movq	K, %rax
+	salq	$ZBASE_SHIFT, %rax
+	leaq	(B, %rax, 1), B
+#endif
+
+#if defined(LT) || defined(RN)
+	movq	BO, B
+#endif
+
+#ifdef RN
+	addq	$1, KK
+#endif
+
+#ifdef RT
+	subq	$1, KK
+#endif
+	ALIGN_4
+
+.L999:
+	movq	0(%rsp), %rbx
+	movq	8(%rsp), %rbp
+	movq	16(%rsp), %r12
+	movq	24(%rsp), %r13
+	movq	32(%rsp), %r14
+	movq	40(%rsp), %r15
+
+#ifdef WINDOWS_ABI
+	movq	48(%rsp), %rdi
+	movq	56(%rsp), %rsi
+	movups	64(%rsp), %xmm6
+	movups	80(%rsp), %xmm7
+	movups	96(%rsp), %xmm8
+	movups	112(%rsp), %xmm9
+	movups	128(%rsp), %xmm10
+	movups	144(%rsp), %xmm11
+	movups	160(%rsp), %xmm12
+	movups	176(%rsp), %xmm13
+	movups	192(%rsp), %xmm14
+	movups	208(%rsp), %xmm15
+#endif
+
+	addq	$STACKSIZE, %rsp
+	ret
+
+	EPILOGUE
diff --git a/kernel/x86_64/ztrsm_kernel_LN_4x2_sse.S b/kernel/x86_64/ztrsm_kernel_LN_4x2_sse.S
new file mode 100644
index 0000000000..e9edc29ac5
--- /dev/null
+++ b/kernel/x86_64/ztrsm_kernel_LN_4x2_sse.S
@@ -0,0 +1,4004 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin. */
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin.
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define J %r12 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define CO2 %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define POSINV 0(%rsp) +#define OFFSET 16(%rsp) +#define KK 24(%rsp) +#define KKK 32(%rsp) +#define AORIG 40(%rsp) +#define BORIG 48(%rsp) +#define BUFFER 128(%rsp) + +#ifdef OPTERON +#define movsd movlps +#endif + +#if defined(PENTIUM4) || defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(ATOM) || defined(NANO) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHNTA prefetchnta +#define PREFETCHSIZE (8 * 6 + 4) +#endif + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHNTA prefetchnta +#define PREFETCHSIZE (8 * 6 + 4) +#endif + +#ifdef GENERIC +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHNTA prefetchnta +#define PREFETCHSIZE (8 * 6 + 4) +#endif + +#define KERNEL1(xx) \ + mulps %xmm8, %xmm9 ;\ + addps %xmm9, %xmm0 ;\ + movaps 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulps %xmm8, %xmm11 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\ + addps %xmm11, %xmm1 ;\ + movaps 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulps %xmm8, %xmm13 ;\ + mulps 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\ + addps %xmm13, %xmm2 ;\ + movaps 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addps %xmm8, %xmm3 ;\ + movaps 16 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 + +#define KERNEL2(xx) \ + mulps %xmm10, %xmm9 ;\ + addps %xmm9, %xmm4 ;\ + movaps 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulps %xmm10, %xmm11 ;\ + addps %xmm11, %xmm5 ;\ + movaps 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulps %xmm10, %xmm13 ;\ + mulps 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ + addps %xmm13, %xmm6 ;\ + movaps 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addps %xmm10, %xmm7 ;\ + movaps 20 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 + +#define KERNEL3(xx) \ + mulps %xmm12, %xmm15 ;\ + addps %xmm15, %xmm0 ;\ + movaps 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulps %xmm12, %xmm11 ;\ + addps %xmm11, %xmm1 ;\ + movaps 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulps %xmm12, %xmm13 ;\ + mulps 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\ + addps %xmm13, %xmm2 ;\ + movaps 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addps %xmm12, %xmm3 ;\ + movaps 24 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 + +#define KERNEL4(xx) \ + mulps %xmm14, %xmm15 ;\ + addps %xmm15, %xmm4 ;\ + movaps 48 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulps %xmm14, %xmm11 ;\ + addps %xmm11, %xmm5 ;\ + movaps 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulps %xmm14, %xmm13 ;\ + mulps 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ + addps %xmm13, %xmm6 ;\ + movaps 40 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addps %xmm14, %xmm7 ;\ + movaps 28 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 + +#define KERNEL5(xx) \ + mulps %xmm8, %xmm9 ;\ + addps %xmm9, %xmm0 ;\ + 
movaps 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulps %xmm8, %xmm11 ;\ + PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ;\ + addps %xmm11, %xmm1 ;\ + movaps 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulps %xmm8, %xmm13 ;\ + mulps 44 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\ + addps %xmm13, %xmm2 ;\ + movaps 40 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addps %xmm8, %xmm3 ;\ + movaps 32 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 + +#define KERNEL6(xx) \ + mulps %xmm10, %xmm9 ;\ + addps %xmm9, %xmm4 ;\ + movaps 64 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulps %xmm10, %xmm11 ;\ + addps %xmm11, %xmm5 ;\ + movaps 52 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulps %xmm10, %xmm13 ;\ + mulps 44 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ + addps %xmm13, %xmm6 ;\ + movaps 56 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addps %xmm10, %xmm7 ;\ + movaps 36 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 + +#define KERNEL7(xx) \ + mulps %xmm12, %xmm15 ;\ + addps %xmm15, %xmm0 ;\ + movaps 48 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulps %xmm12, %xmm11 ;\ + addps %xmm11, %xmm1 ;\ + movaps 52 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulps %xmm12, %xmm13 ;\ + mulps 60 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\ + addps %xmm13, %xmm2 ;\ + movaps 56 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addps %xmm12, %xmm3 ;\ + movaps 40 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 + +#define KERNEL8(xx) \ + mulps %xmm14, %xmm15 ;\ + addps %xmm15, %xmm4 ;\ + movaps 80 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulps %xmm14, %xmm11 ;\ + addps %xmm11, %xmm5 ;\ + movaps 68 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulps %xmm14, %xmm13 ;\ + mulps 60 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ + addps %xmm13, %xmm6 ;\ + movaps 72 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addps %xmm14, %xmm7 ;\ + movaps 44 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + movsd OLD_OFFSET, %xmm4 +#else + movq OLD_LDC, LDC + movsd OLD_OFFSET, %xmm4 +#endif + + movq %rsp, %rbx # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movq OLD_M, M + movq OLD_N, N + + pxor %xmm15, %xmm15 + cmpeqps %xmm15, %xmm15 + pslld $31, %xmm15 # Generate mask + pxor %xmm2, %xmm2 + +#ifndef CONJ + movss %xmm15, 0 + POSINV + movss %xmm2, 4 + POSINV + movss %xmm15, 8 + POSINV + movss %xmm2, 12 + POSINV +#else + movss %xmm2, 0 + POSINV + movss %xmm15, 4 + POSINV + movss %xmm2, 8 + POSINV + movss %xmm15, 12 + POSINV +#endif + + movlpd %xmm4, OFFSET + movlpd %xmm4, KK + + salq $ZBASE_SHIFT, LDC + +#ifdef LN + movq M, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + movq N, %rax + salq $ZBASE_SHIFT, %rax + imulq K, %rax + addq %rax, B + + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + movq N, J + sarq $1, J # j = (n >> 2) + jle .L40 + ALIGN_4 + +.L01: +#ifdef LN + 
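+/* Main loop over pairs of columns of C (CO1/CO2).  KK tracks the progress of
+   the triangular solve; for LN the M dimension is swept from the bottom up,
+   so KK starts at OFFSET + M and is decremented as each block of rows is
+   finished. */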
movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 2), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L03 + ALIGN_4 + +.L02: + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + movaps 8 * SIZE(B), %xmm3 + movaps 12 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 32 * SIZE(BO) + movaps %xmm1, 36 * SIZE(BO) + movaps %xmm2, 40 * SIZE(BO) + movaps %xmm3, 44 * SIZE(BO) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm4, 48 * SIZE(BO) + movaps %xmm5, 52 * SIZE(BO) + movaps %xmm6, 56 * SIZE(BO) + movaps %xmm7, 60 * SIZE(BO) + + addq $16 * SIZE, B + addq $64 * SIZE, BO + decq %rax + jne .L02 + ALIGN_4 + +.L03: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L10 + ALIGN_4 + +.L04: + movaps 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + + addq $ 4 * SIZE, B + addq $16 * SIZE, BO + decq %rax + jne .L04 + ALIGN_4 + +.L10: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 + +#ifndef RT + leaq (C, LDC, 2), C +#endif + + testq $1, M + je .L20 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movlps 0 * SIZE(AO), %xmm8 + movhps 2 * SIZE(AO), %xmm8 + movlps 8 * SIZE(AO), %xmm10 + movhps 10 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L35 + ALIGN_4 + +.L32: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movaps 12 * SIZE(BO), %xmm9 + 
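+	/* BO points into BUFFER, where the .L02/.L04 loops above stored every
+	   real and imaginary component of B broadcast across a full vector, so
+	   one k step reads 16 floats from BO but only one complex element of A
+	   in this M == 1 tail. */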
mulps %xmm8, %xmm9 + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movaps 64 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movaps 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd 4 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movaps 80 * SIZE(BO), %xmm11 + + mulps %xmm8, %xmm13 + addps %xmm13, %xmm0 + movaps 36 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm1 + movaps 40 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm2 + movaps 44 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + movsd 6 * SIZE(AO), %xmm8 + addps %xmm13, %xmm3 + movaps 96 * SIZE(BO), %xmm13 + + mulps %xmm8, %xmm15 + addps %xmm15, %xmm0 + movaps 52 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm1 + movaps 56 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm2 + movaps 60 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + movsd 16 * SIZE(AO), %xmm8 + addps %xmm15, %xmm3 + movaps 112 * SIZE(BO), %xmm15 + + mulps %xmm10, %xmm9 + addps %xmm9, %xmm0 + movaps 68 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm1 + movaps 72 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm2 + movaps 76 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movsd 10 * SIZE(AO), %xmm10 + addps %xmm9, %xmm3 + movaps 128 * SIZE(BO), %xmm9 + + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movaps 84 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm1 + movaps 88 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm2 + movaps 92 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movsd 12 * SIZE(AO), %xmm10 + addps %xmm11, %xmm3 + movaps 144 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movaps 100 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm1 + movaps 104 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movaps 108 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd 14 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movaps 160 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movaps 116 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm1 + movaps 120 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movaps 124 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd 24 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movaps 176 * SIZE(BO), %xmm15 + + addq $16 * SIZE, AO + addq $128 * SIZE, BO + + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movaps POSINV, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movaps 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movaps 16 * SIZE(BO), %xmm9 + + + addq $ 2 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L36 + ALIGN_4 + +.L38: + shufps $0xb1, %xmm1, %xmm1 + shufps $0xb1, %xmm3, %xmm3 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 + xorps %xmm15, %xmm2 +#endif +#else + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 +#endif + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax 
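+/* For LN/RT the AO/B/BO pointers are rebuilt from AORIG/BORIG below so the
+   solve reads the diagonal block that belongs to this 1x2 tile; the 1 (LN)
+   or 2 (RT) subtracted from the copy of KK accounts for the tile's own row
+   or columns. */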
+#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + unpcklpd %xmm2, %xmm0 + + movaps 0 * SIZE(B), %xmm2 + + subps %xmm0, %xmm2 +#else +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(AO), %xmm1 +#ifdef movsd + xorps %xmm5, %xmm5 +#endif + movsd 2 * SIZE(AO), %xmm5 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm5 +#endif + + +#ifdef LN + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm0 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + + addps %xmm0, %xmm1 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm0 + pshufd $0xf5, %xmm1, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + + subps %xmm0, %xmm5 + subps %xmm2, %xmm5 + + movaps 4 * SIZE(B), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm5, %xmm4 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm15, %xmm5 +#else + xorps %xmm15, %xmm4 +#endif + + mulps %xmm9, %xmm4 + mulps %xmm10, %xmm5 + + addps %xmm4, %xmm5 +#endif + +#ifdef RT + movaps 4 * SIZE(B), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm5, %xmm0 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm15, %xmm5 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm5 + + addps %xmm0, %xmm5 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm5, %xmm0 + pshufd $0xf5, %xmm5, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm1 + + movaps 0 * SIZE(B), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm4 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm4 +#endif + + mulps %xmm9, %xmm4 + mulps %xmm10, %xmm1 + + addps %xmm4, %xmm1 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, 0 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm5 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm4, 8 * SIZE(BO) + movaps %xmm5, 12 * SIZE(BO) + + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 0 * SIZE(CO2) +#else + movlps %xmm1, 0 * SIZE(AO) + movlps %xmm5, 2 * SIZE(AO) + + movlps %xmm1, 0 * SIZE(CO1) + movlps %xmm5, 0 * SIZE(CO2) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + 
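+/* The solved 1x2 tile has just been written back both to the packed buffers
+   (B and BO for LN/LT, AO for RN/RT), so later steps of the substitution see
+   the updated values, and out to C.  The pointer adjustment below moves AO
+   past the K - KK steps of its packed panel that this tile did not touch
+   (and, for LT, advances B). */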
movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L20: + testq $2, M + je .L30 + +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + movaps 32 * SIZE(AO), %xmm12 + movaps 48 * SIZE(AO), %xmm14 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L25 + ALIGN_4 + +.L22: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps 4 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + mulps 28 * SIZE(BO), %xmm8 + addps %xmm11, %xmm2 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm8, %xmm3 + movaps 8 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm13 + addps %xmm13, %xmm0 + movaps 36 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm1 + movaps 40 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + mulps 44 * SIZE(BO), %xmm8 + addps %xmm13, %xmm2 + movaps 96 * SIZE(BO), %xmm13 + addps %xmm8, %xmm3 + movaps 12 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm15 + addps %xmm15, %xmm0 + movaps 52 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm1 + movaps 56 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + mulps 60 * SIZE(BO), %xmm8 + addps %xmm15, %xmm2 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm8, %xmm3 + movaps 32 * SIZE(AO), %xmm8 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm10, %xmm9 + addps %xmm9, %xmm0 + movaps 68 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm1 + movaps 72 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + mulps 76 * SIZE(BO), %xmm10 + addps %xmm9, %xmm2 + movaps 128 * SIZE(BO), %xmm9 + addps %xmm10, %xmm3 + movaps 20 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movaps 84 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm1 + movaps 88 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + mulps 92 * SIZE(BO), %xmm10 + addps %xmm11, %xmm2 + movaps 144 * SIZE(BO), %xmm11 + addps %xmm10, %xmm3 + movaps 24 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movaps 100 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm1 + movaps 104 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + mulps 108 * SIZE(BO), %xmm10 + addps %xmm13, %xmm2 + movaps 160 * SIZE(BO), %xmm13 + addps %xmm10, %xmm3 + movaps 28 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm15 + addps %xmm15, 
%xmm0 + movaps 116 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm1 + movaps 120 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + mulps 124 * SIZE(BO), %xmm10 + addps %xmm15, %xmm2 + movaps 176 * SIZE(BO), %xmm15 + addps %xmm10, %xmm3 + movaps 48 * SIZE(AO), %xmm10 + + addq $32 * SIZE, AO + addq $128 * SIZE, BO + + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movaps POSINV, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 16 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps 4 * SIZE(AO), %xmm8 + + addq $ 4 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L26 + ALIGN_4 + +.L28: + shufps $0xb1, %xmm1, %xmm1 + shufps $0xb1, %xmm3, %xmm3 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 + xorps %xmm15, %xmm2 +#endif +#else + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 +#endif + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm1 + unpcklpd %xmm2, %xmm0 + unpckhpd %xmm2, %xmm1 + + movaps 0 * SIZE(B), %xmm2 + movaps 4 * SIZE(B), %xmm3 + + subps %xmm0, %xmm2 + subps %xmm1, %xmm3 +#else + movaps 0 * SIZE(AO), %xmm1 + movaps 4 * SIZE(AO), %xmm5 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm5 +#endif + + +#ifdef LN + movaps 4 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm3 + addps %xmm0, %xmm3 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm2 + subps %xmm1, %xmm2 + + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm3 + subps %xmm1, %xmm3 + + movaps 4 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm3 +#else + xorps 
%xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm3 + addps %xmm0, %xmm3 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm0 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + + addps %xmm0, %xmm1 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm0 + pshufd $0xf5, %xmm1, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + + subps %xmm0, %xmm5 + subps %xmm2, %xmm5 + + movaps 4 * SIZE(B), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm5, %xmm4 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm15, %xmm5 +#else + xorps %xmm15, %xmm4 +#endif + + mulps %xmm9, %xmm4 + mulps %xmm10, %xmm5 + + addps %xmm4, %xmm5 +#endif + +#ifdef RT + movaps 4 * SIZE(B), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm5, %xmm0 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm15, %xmm5 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm5 + + addps %xmm0, %xmm5 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm5, %xmm0 + pshufd $0xf5, %xmm5, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm1 + + movaps 0 * SIZE(B), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm4 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm4 +#endif + + mulps %xmm9, %xmm4 + mulps %xmm10, %xmm1 + + addps %xmm4, %xmm1 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, 0 * SIZE(B) + movaps %xmm3, 4 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm5 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm4, 8 * SIZE(BO) + movaps %xmm5, 12 * SIZE(BO) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm4 + pshufd $0xff, %xmm3, %xmm5 + + movaps %xmm0, 16 * SIZE(BO) + movaps %xmm1, 20 * SIZE(BO) + movaps %xmm4, 24 * SIZE(BO) + movaps %xmm5, 28 * SIZE(BO) + + movlps %xmm2, 0 * SIZE(CO1) + movlps %xmm3, 2 * SIZE(CO1) + movhps %xmm2, 0 * SIZE(CO2) + movhps %xmm3, 2 * SIZE(CO2) +#else + movaps %xmm1, 0 * SIZE(AO) + movaps %xmm5, 4 * SIZE(AO) + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + + movlps %xmm5, 0 * SIZE(CO2) + movhps %xmm5, 2 * SIZE(CO2) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + movq M, I + sarq $2, I # i = (m >> 2) + jle .L39 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $2 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 4), AO +#endif + + leaq 
BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(BO), %xmm9 + movaps 4 * SIZE(BO), %xmm11 + movaps 8 * SIZE(BO), %xmm13 + movaps 16 * SIZE(BO), %xmm15 + + movaps 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movaps 4 * SIZE(AO), %xmm10 + pxor %xmm1, %xmm1 + movaps 8 * SIZE(AO), %xmm12 + pxor %xmm2, %xmm2 + movaps 12 * SIZE(AO), %xmm14 + pxor %xmm3, %xmm3 + + PREFETCHW -8 * SIZE(CO1) + pxor %xmm4, %xmm4 + PREFETCHW -8 * SIZE(CO2) + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-8, %rax + salq $4, %rax + je .L15 +.L1X: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + KERNEL1(32 * 1) + KERNEL2(32 * 1) + KERNEL3(32 * 1) + KERNEL4(32 * 1) + KERNEL5(32 * 1) + KERNEL6(32 * 1) + KERNEL7(32 * 1) + KERNEL8(32 * 1) + + addq $32 * 2 * SIZE, AO + addq $64 * 2 * SIZE, BO + subq $64 * 2, %rax + jg .L1X + +.L12: + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movaps POSINV, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_4 + +.L16: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 0 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps 8 * SIZE(AO), %xmm8 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm4 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm5 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + mulps 12 * SIZE(BO), %xmm10 + addps %xmm9, %xmm6 + movaps 16 * SIZE(BO), %xmm9 + addps %xmm10, %xmm7 + movaps 12 * SIZE(AO), %xmm10 + + addq $ 8 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L16 + ALIGN_4 + +.L18: + shufps $0xb1, %xmm1, %xmm1 + shufps $0xb1, %xmm3, %xmm3 + shufps $0xb1, %xmm5, %xmm5 + shufps $0xb1, %xmm7, %xmm7 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 + xorps %xmm15, %xmm5 + xorps %xmm15, %xmm7 +#else + xorps %xmm15, %xmm0 + xorps %xmm15, %xmm2 + xorps %xmm15, %xmm4 + xorps %xmm15, %xmm6 +#endif +#else + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 + xorps %xmm15, %xmm5 + xorps %xmm15, %xmm7 +#endif + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm1 + unpcklpd %xmm2, %xmm0 + unpckhpd %xmm2, %xmm1 + + movaps %xmm4, %xmm5 + unpcklpd %xmm6, %xmm4 + unpckhpd %xmm6, %xmm5 + + movaps 0 * SIZE(B), %xmm2 + movaps 4 * SIZE(B), %xmm3 + movaps 8 * SIZE(B), %xmm6 + movaps 12 * SIZE(B), %xmm7 + + subps %xmm0, %xmm2 + subps %xmm1, %xmm3 + subps %xmm4, %xmm6 + subps %xmm5, %xmm7 +#else + movaps 0 * SIZE(AO), %xmm1 + movaps 4 * SIZE(AO), %xmm3 + movaps 8 * SIZE(AO), %xmm5 + movaps 12 * SIZE(AO), %xmm7 + + subps %xmm0, %xmm1 + subps %xmm4, %xmm3 + subps %xmm2, %xmm5 + subps %xmm6, %xmm7 +#endif + +#ifdef LN + movaps 28 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, 
%xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm7 + +#ifndef CONJ + xorps %xmm15, %xmm7 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm7 + addps %xmm0, %xmm7 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm6 + subps %xmm1, %xmm6 + + movaps 24 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm3 + subps %xmm1, %xmm3 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm2 + subps %xmm1, %xmm2 + + movaps 20 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm6 + +#ifndef CONJ + xorps %xmm15, %xmm6 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm6 + addps %xmm0, %xmm6 + + movaps 16 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm3 + subps %xmm1, %xmm3 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm2 + subps %xmm1, %xmm2 + + movaps 8 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm3 + addps %xmm0, %xmm3 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm2 + subps %xmm1, %xmm2 + + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm3 + subps %xmm1, %xmm3 + + movaps 4 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, 
%xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm6 + subps %xmm1, %xmm6 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm7 + subps %xmm1, %xmm7 + + movaps 8 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm3 + addps %xmm0, %xmm3 + + movaps 12 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm6 + subps %xmm1, %xmm6 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm7 + subps %xmm1, %xmm7 + + movaps 20 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm6 + +#ifndef CONJ + xorps %xmm15, %xmm6 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm6 + addps %xmm0, %xmm6 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm7 + subps %xmm1, %xmm7 + + movaps 28 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm7 + +#ifndef CONJ + xorps %xmm15, %xmm7 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm7 + addps %xmm0, %xmm7 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm0 + pshufd $0xf5, %xmm1, %xmm1 + + pshufd $0xa0, %xmm3, %xmm2 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 + xorps %xmm15, %xmm2 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm9, %xmm2 + mulps %xmm10, %xmm1 + mulps %xmm10, %xmm3 + + addps %xmm0, %xmm1 + addps %xmm2, %xmm3 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm0 + pshufd $0xf5, %xmm1, %xmm2 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm6 + +#ifndef CONJ + xorps %xmm15, %xmm2 + xorps %xmm15, %xmm6 +#else + xorps %xmm15, %xmm0 + xorps %xmm15, %xmm4 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm9, %xmm4 + + mulps %xmm10, %xmm2 + mulps %xmm10, %xmm6 + + subps %xmm0, %xmm5 + subps %xmm4, %xmm7 + + subps %xmm2, %xmm5 + subps %xmm6, %xmm7 + + movaps 4 * SIZE(B), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm5, %xmm4 + pshufd $0xf5, %xmm5, %xmm5 + + pshufd $0xa0, %xmm7, %xmm6 + pshufd $0xf5, %xmm7, %xmm7 + +#ifndef CONJ + xorps %xmm15, %xmm5 + xorps %xmm15, %xmm7 +#else + xorps %xmm15, %xmm4 + xorps %xmm15, 
%xmm6 +#endif + + mulps %xmm9, %xmm4 + mulps %xmm9, %xmm6 + mulps %xmm10, %xmm5 + mulps %xmm10, %xmm7 + + addps %xmm4, %xmm5 + addps %xmm6, %xmm7 +#endif + +#ifdef RT + movaps 4 * SIZE(B), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm5, %xmm0 + pshufd $0xf5, %xmm5, %xmm5 + + pshufd $0xa0, %xmm7, %xmm2 + pshufd $0xf5, %xmm7, %xmm7 + +#ifndef CONJ + xorps %xmm15, %xmm5 + xorps %xmm15, %xmm7 +#else + xorps %xmm15, %xmm0 + xorps %xmm15, %xmm2 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm9, %xmm2 + mulps %xmm10, %xmm5 + mulps %xmm10, %xmm7 + + addps %xmm0, %xmm5 + addps %xmm2, %xmm7 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm5, %xmm0 + pshufd $0xf5, %xmm5, %xmm2 + + pshufd $0xa0, %xmm7, %xmm4 + pshufd $0xf5, %xmm7, %xmm6 + +#ifndef CONJ + xorps %xmm15, %xmm2 + xorps %xmm15, %xmm6 +#else + xorps %xmm15, %xmm0 + xorps %xmm15, %xmm4 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm9, %xmm4 + + mulps %xmm10, %xmm2 + mulps %xmm10, %xmm6 + + subps %xmm0, %xmm1 + subps %xmm4, %xmm3 + + subps %xmm2, %xmm1 + subps %xmm6, %xmm3 + + movaps 0 * SIZE(B), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm4 + pshufd $0xf5, %xmm1, %xmm1 + + pshufd $0xa0, %xmm3, %xmm6 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm4 + xorps %xmm15, %xmm6 +#endif + + mulps %xmm9, %xmm4 + mulps %xmm9, %xmm6 + mulps %xmm10, %xmm1 + mulps %xmm10, %xmm3 + + addps %xmm4, %xmm1 + addps %xmm6, %xmm3 + +#endif + +#ifdef LN + subq $8 * SIZE, CO1 + subq $8 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, 0 * SIZE(B) + movaps %xmm3, 4 * SIZE(B) + movaps %xmm6, 8 * SIZE(B) + movaps %xmm7, 12 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm5 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm4, 8 * SIZE(BO) + movaps %xmm5, 12 * SIZE(BO) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm4 + pshufd $0xff, %xmm3, %xmm5 + + movaps %xmm0, 16 * SIZE(BO) + movaps %xmm1, 20 * SIZE(BO) + movaps %xmm4, 24 * SIZE(BO) + movaps %xmm5, 28 * SIZE(BO) + + pshufd $0x00, %xmm6, %xmm0 + pshufd $0x55, %xmm6, %xmm1 + pshufd $0xaa, %xmm6, %xmm4 + pshufd $0xff, %xmm6, %xmm5 + + movaps %xmm0, 32 * SIZE(BO) + movaps %xmm1, 36 * SIZE(BO) + movaps %xmm4, 40 * SIZE(BO) + movaps %xmm5, 44 * SIZE(BO) + + pshufd $0x00, %xmm7, %xmm0 + pshufd $0x55, %xmm7, %xmm1 + pshufd $0xaa, %xmm7, %xmm4 + pshufd $0xff, %xmm7, %xmm5 + + movaps %xmm0, 48 * SIZE(BO) + movaps %xmm1, 52 * SIZE(BO) + movaps %xmm4, 56 * SIZE(BO) + movaps %xmm5, 60 * SIZE(BO) + + movlps %xmm2, 0 * SIZE(CO1) + movlps %xmm3, 2 * SIZE(CO1) + movlps %xmm6, 4 * SIZE(CO1) + movlps %xmm7, 6 * SIZE(CO1) + + movhps %xmm2, 0 * SIZE(CO2) + movhps %xmm3, 2 * SIZE(CO2) + movhps %xmm6, 4 * SIZE(CO2) + movhps %xmm7, 6 * SIZE(CO2) +#else + movaps %xmm1, 0 * SIZE(AO) + movaps %xmm3, 4 * SIZE(AO) + movaps %xmm5, 8 * SIZE(AO) + movaps %xmm7, 12 * SIZE(AO) + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm3, 4 * SIZE(CO1) + movhps %xmm3, 6 * SIZE(CO1) + + movlps %xmm5, 0 * SIZE(CO2) + movhps %xmm5, 2 * SIZE(CO2) + movlps %xmm7, 4 * SIZE(CO2) + movhps %xmm7, 6 * SIZE(CO2) +#endif + + +#ifndef LN + addq $8 * SIZE, CO1 + addq $8 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 4), AO 
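+# LT/RN: advance AO past the (K - KK) k-steps of this 4-wide micro-tile; under LT, B is also stepped past the values written back above.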
+#ifdef LT + addq $16 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L11 + ALIGN_4 + +.L39: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2 * COMPSIZE), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 2 * COMPSIZE), B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + + decq J # j -- + jg .L01 + ALIGN_4 + +.L40: + testq $1, N + je .L999 + ALIGN_4 + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L43 + ALIGN_4 + +.L42: + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO + decq %rax + jne .L42 + ALIGN_4 + +.L43: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L50 + ALIGN_4 + +.L44: + movsd 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + + addq $2 * SIZE, B + addq $8 * SIZE, BO + decq %rax + jne .L44 + ALIGN_4 + +.L50: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + subq LDC, C +#endif + + movq C, CO1 # coffset1 = c + +#ifndef RT + addq LDC, C +#endif + + testq $1, M + je .L60 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movsd 0 * SIZE(AO), %xmm8 + movhps 2 * SIZE(AO), %xmm8 + movsd 8 * SIZE(AO), %xmm10 + movhps 10 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L75 + ALIGN_4 + +.L72: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movaps 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 4 * 
SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movaps 64 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd 6 * SIZE(AO), %xmm8 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movaps 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd 16 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movaps 80 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movaps 36 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd 10 * SIZE(AO), %xmm10 + addps %xmm13, %xmm1 + movaps 40 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movaps 44 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd 12 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movaps 96 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movaps 52 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd 14 * SIZE(AO), %xmm10 + addps %xmm15, %xmm1 + movaps 56 * SIZE(BO), %xmm15 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movaps 60 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd 24 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movaps 112 * SIZE(BO), %xmm15 + + addq $16 * SIZE, AO + addq $64 * SIZE, BO + + decq %rax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movaps POSINV, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L76 + ALIGN_4 + +.L78: + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + + shufps $0xb1, %xmm1, %xmm1 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif +#else + xorps %xmm15, %xmm1 +#endif + + addps %xmm1, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd 0 * SIZE(B), %xmm2 + + subps %xmm0, %xmm2 +#else +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(AO), %xmm1 + + subps %xmm0, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 +#endif + +#if defined(RN) || defined(RT) + movaps 0 * SIZE(B), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm4 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm4 +#endif + + mulps %xmm9, %xmm4 + mulps %xmm10, %xmm1 + + addps %xmm4, %xmm1 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + + movlps %xmm2, 0 * SIZE(CO1) +#else + movlps %xmm1, 0 * SIZE(AO) + + movlps %xmm1, 0 * SIZE(CO1) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + 
movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $2 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L60: + testq $2, M + je .L70 + +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L65 + ALIGN_4 + +.L62: + mulps %xmm8, %xmm9 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 4 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps 8 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm11 + mulps 20 * SIZE(BO), %xmm8 + addps %xmm11, %xmm0 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm8, %xmm1 + movaps 12 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm11 + mulps 28 * SIZE(BO), %xmm8 + addps %xmm11, %xmm2 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm8, %xmm3 + movaps 32 * SIZE(AO), %xmm8 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm10, %xmm13 + mulps 36 * SIZE(BO), %xmm10 + addps %xmm13, %xmm0 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm10, %xmm1 + movaps 20 * SIZE(AO), %xmm10 + mulps %xmm10, %xmm13 + mulps 44 * SIZE(BO), %xmm10 + addps %xmm13, %xmm2 + movaps 96 * SIZE(BO), %xmm13 + addps %xmm10, %xmm3 + movaps 24 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm15 + mulps 52 * SIZE(BO), %xmm10 + addps %xmm15, %xmm0 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm10, %xmm1 + movaps 28 * SIZE(AO), %xmm10 + mulps %xmm10, %xmm15 + mulps 60 * SIZE(BO), %xmm10 + addps %xmm15, %xmm2 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm10, %xmm3 + movaps 48 * SIZE(AO), %xmm10 + + addq $32 * SIZE, AO + addq $64 * SIZE, BO + + decq %rax + jne .L62 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movaps POSINV, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 4 * SIZE(AO), %xmm8 + + addq $4 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L66 + ALIGN_4 + +.L68: + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + + shufps $0xb1, %xmm1, %xmm1 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif +#else + xorps %xmm15, %xmm1 +#endif + + addps %xmm1, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq 
(B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm1 + unpcklpd %xmm2, %xmm0 + unpckhpd %xmm2, %xmm1 + +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd 0 * SIZE(B), %xmm2 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd 2 * SIZE(B), %xmm3 + + subps %xmm0, %xmm2 + subps %xmm1, %xmm3 +#else + movaps 0 * SIZE(AO), %xmm1 + subps %xmm0, %xmm1 +#endif + +#ifdef LN + movaps 4 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm3 + addps %xmm0, %xmm3 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm2 + subps %xmm1, %xmm2 + + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm3 + subps %xmm1, %xmm3 + + movaps 4 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm3 + addps %xmm0, %xmm3 +#endif + +#if defined(RN) || defined(RT) + movaps 0 * SIZE(B), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm4 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm4 +#endif + + mulps %xmm9, %xmm4 + mulps %xmm10, %xmm1 + + addps %xmm4, %xmm1 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(B) + movlps %xmm3, 2 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + + movaps %xmm0, 8 * SIZE(BO) + movaps %xmm1, 12 * SIZE(BO) + + movlps %xmm2, 0 * SIZE(CO1) + movlps %xmm3, 2 * SIZE(CO1) +#else + movaps %xmm1, 0 * SIZE(AO) + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L70: + movq M, I + sarq $2, I # i = (m >> 2) + jle .L79 + ALIGN_4 + +.L51: +#ifdef LN + 
movq K, %rax + salq $2 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 4), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + movaps 32 * SIZE(AO), %xmm12 + movaps 48 * SIZE(AO), %xmm14 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + PREFETCHW -8 * SIZE(CO1) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L55 + ALIGN_4 + +.L52: + mulps %xmm8, %xmm9 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 0 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 4 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps 8 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 12 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps 64 * SIZE(AO), %xmm8 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm10, %xmm11 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm11, %xmm0 + movaps 16 * SIZE(BO), %xmm11 + addps %xmm10, %xmm1 + movaps 20 * SIZE(AO), %xmm10 + mulps %xmm10, %xmm11 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm11, %xmm4 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm10, %xmm5 + movaps 24 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm11 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm11, %xmm0 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm10, %xmm1 + movaps 28 * SIZE(AO), %xmm10 + mulps %xmm10, %xmm11 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm11, %xmm4 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm10, %xmm5 + movaps 80 * SIZE(AO), %xmm10 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) +#endif + mulps %xmm12, %xmm13 + mulps 36 * SIZE(BO), %xmm12 + addps %xmm13, %xmm0 + movaps 32 * SIZE(BO), %xmm13 + addps %xmm12, %xmm1 + movaps 36 * SIZE(AO), %xmm12 + mulps %xmm12, %xmm13 + mulps 36 * SIZE(BO), %xmm12 + addps %xmm13, %xmm4 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm12, %xmm5 + movaps 40 * SIZE(AO), %xmm12 + + mulps %xmm12, %xmm13 + mulps 44 * SIZE(BO), %xmm12 + addps %xmm13, %xmm0 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm12, %xmm1 + movaps 44 * SIZE(AO), %xmm12 + mulps %xmm12, %xmm13 + mulps 44 * SIZE(BO), %xmm12 + addps %xmm13, %xmm4 + movaps 96 * SIZE(BO), %xmm13 + addps %xmm12, %xmm5 + movaps 96 * SIZE(AO), %xmm12 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) +#endif + mulps %xmm14, %xmm15 + mulps 52 * SIZE(BO), %xmm14 + addps %xmm15, %xmm0 + movaps 48 * SIZE(BO), %xmm15 + addps %xmm14, %xmm1 + movaps 52 * SIZE(AO), %xmm14 + mulps %xmm14, %xmm15 + mulps 52 * SIZE(BO), %xmm14 + addps %xmm15, %xmm4 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm14, %xmm5 + movaps 56 * SIZE(AO), %xmm14 + + mulps %xmm14, %xmm15 + mulps 60 * SIZE(BO), 
%xmm14 + addps %xmm15, %xmm0 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm14, %xmm1 + movaps 60 * SIZE(AO), %xmm14 + mulps %xmm14, %xmm15 + mulps 60 * SIZE(BO), %xmm14 + addps %xmm15, %xmm4 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm14, %xmm5 + movaps 112 * SIZE(AO), %xmm14 + + addq $64 * SIZE, AO + addq $64 * SIZE, BO + + + decq %rax + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movaps POSINV, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 0 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 4 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps 8 * SIZE(AO), %xmm8 + + addq $ 8 * SIZE, AO # aoffset += 4 + addq $ 8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L56 + ALIGN_4 + +.L58: + shufps $0xb1, %xmm1, %xmm1 + shufps $0xb1, %xmm5, %xmm5 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm5 +#else + xorps %xmm15, %xmm0 + xorps %xmm15, %xmm4 +#endif +#else + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm5 +#endif + + addps %xmm1, %xmm0 + addps %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm1 + unpcklpd %xmm2, %xmm0 + unpckhpd %xmm2, %xmm1 + + movaps %xmm4, %xmm5 + unpcklpd %xmm6, %xmm4 + unpckhpd %xmm6, %xmm5 + +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd 0 * SIZE(B), %xmm2 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd 2 * SIZE(B), %xmm3 +#ifdef movsd + xorps %xmm6, %xmm6 +#endif + movsd 4 * SIZE(B), %xmm6 +#ifdef movsd + xorps %xmm7, %xmm7 +#endif + movsd 6 * SIZE(B), %xmm7 + + subps %xmm0, %xmm2 + subps %xmm1, %xmm3 + subps %xmm4, %xmm6 + subps %xmm5, %xmm7 +#else + movaps 0 * SIZE(AO), %xmm1 + movaps 4 * SIZE(AO), %xmm3 + + subps %xmm0, %xmm1 + subps %xmm4, %xmm3 +#endif + +#ifdef LN + movaps 28 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm7 + +#ifndef CONJ + xorps %xmm15, %xmm7 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm7 + addps %xmm0, %xmm7 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm6 + subps %xmm1, %xmm6 + + movaps 24 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm3 + subps %xmm1, %xmm3 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm2 + subps %xmm1, %xmm2 + + movaps 20 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm6 + 
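+# POSINV (%xmm15) is a sign mask: the xorps below negates the imaginary copies (or the real copies when CONJ is defined), so the same multiply/add sequence yields either the plain or the conjugated complex product.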
+#ifndef CONJ + xorps %xmm15, %xmm6 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm6 + addps %xmm0, %xmm6 + + movaps 16 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm3 + subps %xmm1, %xmm3 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm2 + subps %xmm1, %xmm2 + + movaps 8 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm3 + addps %xmm0, %xmm3 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm2 + subps %xmm1, %xmm2 + + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm3 + subps %xmm1, %xmm3 + + movaps 4 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm6 + subps %xmm1, %xmm6 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm7 + subps %xmm1, %xmm7 + + movaps 8 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm3 + addps %xmm0, %xmm3 + + movaps 12 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm6 + subps %xmm1, %xmm6 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps 
%xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm7 + subps %xmm1, %xmm7 + + movaps 20 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm6 + +#ifndef CONJ + xorps %xmm15, %xmm6 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm6 + addps %xmm0, %xmm6 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm7 + subps %xmm1, %xmm7 + + movaps 28 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm7 + +#ifndef CONJ + xorps %xmm15, %xmm7 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm7 + addps %xmm0, %xmm7 +#endif + +#if defined(RN) || defined(RT) + movaps 0 * SIZE(B), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm4 + pshufd $0xf5, %xmm1, %xmm1 + + pshufd $0xa0, %xmm3, %xmm6 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm4 + xorps %xmm15, %xmm6 +#endif + + mulps %xmm9, %xmm4 + mulps %xmm9, %xmm6 + mulps %xmm10, %xmm1 + mulps %xmm10, %xmm3 + + addps %xmm4, %xmm1 + addps %xmm6, %xmm3 +#endif + +#ifdef LN + subq $8 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(B) + movlps %xmm3, 2 * SIZE(B) + movlps %xmm6, 4 * SIZE(B) + movlps %xmm7, 6 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + + movaps %xmm0, 8 * SIZE(BO) + movaps %xmm1, 12 * SIZE(BO) + + pshufd $0x00, %xmm6, %xmm0 + pshufd $0x55, %xmm6, %xmm1 + + movaps %xmm0, 16 * SIZE(BO) + movaps %xmm1, 20 * SIZE(BO) + + pshufd $0x00, %xmm7, %xmm0 + pshufd $0x55, %xmm7, %xmm1 + + movaps %xmm0, 24 * SIZE(BO) + movaps %xmm1, 28 * SIZE(BO) + + movlps %xmm2, 0 * SIZE(CO1) + movlps %xmm3, 2 * SIZE(CO1) + movlps %xmm6, 4 * SIZE(CO1) + movlps %xmm7, 6 * SIZE(CO1) +#else + movaps %xmm1, 0 * SIZE(AO) + movaps %xmm3, 4 * SIZE(AO) + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm3, 4 * SIZE(CO1) + movhps %xmm3, 6 * SIZE(CO1) +#endif + +#ifndef LN + addq $8 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L51 + ALIGN_4 + +.L79: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, COMPSIZE), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, COMPSIZE), B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L999: + movq %rbx, %rsp + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), 
%xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/ztrsm_kernel_LT_1x4_nehalem.S b/kernel/x86_64/ztrsm_kernel_LT_1x4_nehalem.S new file mode 100644 index 0000000000..e53e29759d --- /dev/null +++ b/kernel/x86_64/ztrsm_kernel_LT_1x4_nehalem.S @@ -0,0 +1,1586 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define KK %rdx +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define J 56(%rsp) +#define KKK 64(%rsp) +#define AORIG 72(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define J 232(%rsp) +#define KKK 240(%rsp) +#define AORIG 248(%rsp) + +#endif + +#define PREFETCHSIZE (8 * 1 + 2) +#define PREFETCH prefetcht0 + +#define ADD1 addpd +#define ADD2 addpd + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C +#endif + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + salq $ZBASE_SHIFT, LDC + + movq KK, OFFSET + negq KK + +#ifdef LN + movq M, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + movq N, %rax + salq $ZBASE_SHIFT, %rax + imulq K, %rax + addq %rax, B + + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RT + movq N, KK + subq OFFSET, KK +#endif + + testq M, M + jle .L999 + + movq N, J + sarq $2, J + NOBRANCH + jle .L20 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $2 + ZBASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 2), CO2 +#ifndef RT + leaq (C, LDC, 4), C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif + + movq K, %rax + salq $ZBASE_SHIFT + 2, %rax + leaq (B, %rax), BB + +#ifdef LT + movq OFFSET, KK +#endif + + movq M, I + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq AORIG, AO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + prefetchnta -16 * SIZE(BB) + subq $-8 * SIZE, BB + + xorps %xmm1, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht0 2 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht0 2 * SIZE(CO1, LDC) + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + + xorps %xmm12, %xmm12 + prefetcht0 2 * SIZE(CO2) + xorps %xmm13, %xmm13 + prefetcht0 2 * SIZE(CO2, LDC) + xorps %xmm14, %xmm14 + xorps %xmm15, %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + 
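+# LN/RT: the k-loop count is K - KK; sarq/andq then split it into the 4x-unrolled body at .L12 and the remainder loop at .L16.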
movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + ADD1 %xmm1, %xmm12 + movaps -16 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + movaps -14 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + ADD1 %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps -10 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + ADD1 %xmm1, %xmm12 + movaps -8 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + movaps -14 * SIZE(AO), %xmm0 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + movaps -6 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + ADD1 %xmm1, %xmm8 + movaps -4 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps -2 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + ADD1 %xmm1, %xmm12 + movaps 0 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + movaps -12 * SIZE(AO), %xmm0 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + movaps 2 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + ADD1 %xmm1, %xmm8 + movaps 4 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps 6 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -10 * SIZE(AO), %xmm0 + ADD1 %xmm1, %xmm12 + movaps 8 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + movaps 10 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + ADD1 %xmm1, %xmm8 + movaps 12 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps 14 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + movaps -8 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, AO + subq $-32 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + ADD1 %xmm1, %xmm12 + movaps -16 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + movaps -14 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + ADD1 %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps -10 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_3 + +.L18: +#if 
defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + salq $ZBASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#endif + + ADD1 %xmm1, %xmm12 + ADD2 %xmm2, %xmm13 + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm15 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(LN) || defined(LT) + +#ifndef CONJ + shufps $0x40, %xmm0, %xmm0 + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm10 + xorps %xmm0, %xmm12 + xorps %xmm0, %xmm14 +#else + shufps $0x40, %xmm0, %xmm0 + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 + xorps %xmm0, %xmm13 + xorps %xmm0, %xmm15 +#endif + +#else + +#ifndef CONJ + shufps $0x40, %xmm0, %xmm0 + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm10 + xorps %xmm0, %xmm12 + xorps %xmm0, %xmm14 +#else + shufps $0x04, %xmm0, %xmm0 + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 + xorps %xmm0, %xmm13 + xorps %xmm0, %xmm15 +#endif + +#endif + + haddpd %xmm9, %xmm8 + haddpd %xmm11, %xmm10 + haddpd %xmm13, %xmm12 + haddpd %xmm15, %xmm14 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm11 + movapd -12 * SIZE(BO), %xmm13 + movapd -10 * SIZE(BO), %xmm15 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm12, %xmm13 + subpd %xmm14, %xmm15 +#else + movapd -16 * SIZE(AO), %xmm9 + movapd -14 * SIZE(AO), %xmm11 + movapd -12 * SIZE(AO), %xmm13 + movapd -10 * SIZE(AO), %xmm15 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm12, %xmm13 + subpd %xmm14, %xmm15 +#endif + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + shufps $0x04, %xmm7, %xmm7 +#else + shufps $0x40, %xmm7, %xmm7 +#endif + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm11, %xmm10 + pshufd $0x4e, %xmm13, %xmm12 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + mulpd %xmm0, %xmm15 + mulpd %xmm1, %xmm14 + + addpd %xmm8, %xmm9 + addpd %xmm10, %xmm11 + addpd %xmm12, %xmm13 + addpd %xmm14, %xmm15 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + xorpd %xmm7, %xmm8 + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + addpd %xmm8, %xmm9 + + movddup -14 * SIZE(BO), %xmm2 + movddup -13 * SIZE(BO), %xmm3 + + pshufd $0x4e, %xmm9, %xmm8 + xorpd %xmm7, %xmm8 + mulpd %xmm9, %xmm2 + mulpd %xmm8, %xmm3 + subpd %xmm2, %xmm11 + subpd %xmm3, %xmm11 + + movddup -12 * SIZE(BO), %xmm0 + movddup -11 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + xorpd %xmm7, %xmm8 + mulpd %xmm9, %xmm0 + mulpd %xmm8, %xmm1 + subpd %xmm0, %xmm13 + subpd %xmm1, %xmm13 + + movddup -10 * SIZE(BO), %xmm2 + movddup -9 * SIZE(BO), %xmm3 + + pshufd $0x4e, %xmm9, %xmm8 + xorpd %xmm7, %xmm8 + mulpd %xmm9, %xmm2 + mulpd %xmm8, %xmm3 + subpd %xmm2, %xmm15 + subpd %xmm3, %xmm15 + + movddup -6 * SIZE(BO), %xmm0 + movddup -5 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm11, %xmm10 + xorpd %xmm7, %xmm10 + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + addpd %xmm10, %xmm11 + + movddup -4 * SIZE(BO), %xmm0 + movddup -3 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm11, %xmm10 + xorpd %xmm7, %xmm10 + mulpd %xmm11, %xmm0 + mulpd %xmm10, %xmm1 + subpd %xmm0, %xmm13 + subpd %xmm1, %xmm13 + + movddup -2 * SIZE(BO), %xmm2 + movddup -1 * SIZE(BO), %xmm3 + + pshufd $0x4e, %xmm11, %xmm10 + xorpd %xmm7, %xmm10 + mulpd %xmm11, %xmm2 
+ mulpd %xmm10, %xmm3 + subpd %xmm2, %xmm15 + subpd %xmm3, %xmm15 + + movddup 4 * SIZE(BO), %xmm0 + movddup 5 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm13, %xmm12 + xorpd %xmm7, %xmm12 + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + addpd %xmm12, %xmm13 + + movddup 6 * SIZE(BO), %xmm2 + movddup 7 * SIZE(BO), %xmm3 + + pshufd $0x4e, %xmm13, %xmm12 + xorpd %xmm7, %xmm12 + mulpd %xmm13, %xmm2 + mulpd %xmm12, %xmm3 + subpd %xmm2, %xmm15 + subpd %xmm3, %xmm15 + + movddup 14 * SIZE(BO), %xmm0 + movddup 15 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm15, %xmm14 + xorpd %xmm7, %xmm14 + mulpd %xmm0, %xmm15 + mulpd %xmm1, %xmm14 + addpd %xmm14, %xmm15 +#endif + +#ifdef RT + movddup 14 * SIZE(BO), %xmm0 + movddup 15 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm15, %xmm14 + xorpd %xmm7, %xmm14 + mulpd %xmm0, %xmm15 + mulpd %xmm1, %xmm14 + addpd %xmm14, %xmm15 + + movddup 12 * SIZE(BO), %xmm2 + movddup 13 * SIZE(BO), %xmm3 + + pshufd $0x4e, %xmm15, %xmm14 + xorpd %xmm7, %xmm14 + mulpd %xmm15, %xmm2 + mulpd %xmm14, %xmm3 + subpd %xmm2, %xmm13 + subpd %xmm3, %xmm13 + + movddup 10 * SIZE(BO), %xmm0 + movddup 11 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm15, %xmm14 + xorpd %xmm7, %xmm14 + mulpd %xmm15, %xmm0 + mulpd %xmm14, %xmm1 + subpd %xmm0, %xmm11 + subpd %xmm1, %xmm11 + + movddup 8 * SIZE(BO), %xmm2 + movddup 9 * SIZE(BO), %xmm3 + + pshufd $0x4e, %xmm15, %xmm14 + xorpd %xmm7, %xmm14 + mulpd %xmm15, %xmm2 + mulpd %xmm14, %xmm3 + subpd %xmm2, %xmm9 + subpd %xmm3, %xmm9 + + movddup 4 * SIZE(BO), %xmm0 + movddup 5 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm13, %xmm12 + xorpd %xmm7, %xmm12 + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + addpd %xmm12, %xmm13 + + movddup 2 * SIZE(BO), %xmm0 + movddup 3 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm13, %xmm12 + xorpd %xmm7, %xmm12 + mulpd %xmm13, %xmm0 + mulpd %xmm12, %xmm1 + subpd %xmm0, %xmm11 + subpd %xmm1, %xmm11 + + movddup 0 * SIZE(BO), %xmm2 + movddup 1 * SIZE(BO), %xmm3 + + pshufd $0x4e, %xmm13, %xmm12 + xorpd %xmm7, %xmm12 + mulpd %xmm13, %xmm2 + mulpd %xmm12, %xmm3 + subpd %xmm2, %xmm9 + subpd %xmm3, %xmm9 + + movddup -6 * SIZE(BO), %xmm0 + movddup -5 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm11, %xmm10 + xorpd %xmm7, %xmm10 + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + addpd %xmm10, %xmm11 + + movddup -8 * SIZE(BO), %xmm2 + movddup -7 * SIZE(BO), %xmm3 + + pshufd $0x4e, %xmm11, %xmm10 + xorpd %xmm7, %xmm10 + mulpd %xmm11, %xmm2 + mulpd %xmm10, %xmm3 + subpd %xmm2, %xmm9 + subpd %xmm3, %xmm9 + + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + xorpd %xmm7, %xmm8 + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + addpd %xmm8, %xmm9 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + movsd %xmm11, 0 * SIZE(CO1, LDC) + movhpd %xmm11, 1 * SIZE(CO1, LDC) + movsd %xmm13, 0 * SIZE(CO2) + movhpd %xmm13, 1 * SIZE(CO2) + movsd %xmm15, 0 * SIZE(CO2, LDC) + movhpd %xmm15, 1 * SIZE(CO2, LDC) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) + movapd %xmm11, -14 * SIZE(BO) + movapd %xmm13, -12 * SIZE(BO) + movapd %xmm15, -10 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) + movapd %xmm11, -14 * SIZE(AO) + movapd %xmm13, -12 * SIZE(AO) + movapd %xmm15, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + 
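+# Per-iteration bookkeeping: LN/LT step KK by one (this kernel handles a single row of A per pass through the M loop), while RT instead advances AORIG by K complex elements.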
+#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L11 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + + subq $1, J + BRANCH + jg .L01 + ALIGN_4 + +.L20: + testq $2, N + BRANCH + jle .L30 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif + +#ifdef LT + movq OFFSET, KK +#endif + + movq M, I + ALIGN_4 + +.L21: +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq AORIG, AO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht0 2 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht0 2 * SIZE(CO2) + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_3 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + ADD1 %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps -14 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -14 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps -10 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -12 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movaps -8 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps -6 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -10 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movaps -4 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps -2 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -8 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, AO + subq $-16 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L22 + ALIGN_3 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_3 + +.L26: + ADD1 %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps -14 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_3 + +.L28: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + 
subq $1, %rax +#else + subq $2, %rax +#endif + + salq $ZBASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + + ADD1 %xmm1, %xmm8 + ADD2 %xmm2, %xmm9 + ADD1 %xmm3, %xmm10 + ADD2 %xmm4, %xmm11 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(LN) || defined(LT) + +#ifndef CONJ + shufps $0x40, %xmm0, %xmm0 + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm10 +#else + shufps $0x40, %xmm0, %xmm0 + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 +#endif + +#else + +#ifndef CONJ + shufps $0x40, %xmm0, %xmm0 + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm10 +#else + shufps $0x04, %xmm0, %xmm0 + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 +#endif + +#endif + + haddpd %xmm9, %xmm8 + haddpd %xmm11, %xmm10 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm11 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 +#else + movapd -16 * SIZE(AO), %xmm9 + movapd -14 * SIZE(AO), %xmm11 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 +#endif + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + shufps $0x04, %xmm7, %xmm7 +#else + shufps $0x40, %xmm7, %xmm7 +#endif + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + + addpd %xmm8, %xmm9 + addpd %xmm10, %xmm11 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + xorpd %xmm7, %xmm8 + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + addpd %xmm8, %xmm9 + + movddup -14 * SIZE(BO), %xmm2 + movddup -13 * SIZE(BO), %xmm3 + + pshufd $0x4e, %xmm9, %xmm8 + xorpd %xmm7, %xmm8 + mulpd %xmm9, %xmm2 + mulpd %xmm8, %xmm3 + subpd %xmm2, %xmm11 + subpd %xmm3, %xmm11 + + movddup -10 * SIZE(BO), %xmm0 + movddup -9 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm11, %xmm10 + xorpd %xmm7, %xmm10 + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + addpd %xmm10, %xmm11 +#endif + +#ifdef RT + movddup -10 * SIZE(BO), %xmm0 + movddup -9 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm11, %xmm10 + xorpd %xmm7, %xmm10 + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + addpd %xmm10, %xmm11 + + movddup -12 * SIZE(BO), %xmm2 + movddup -11 * SIZE(BO), %xmm3 + + pshufd $0x4e, %xmm11, %xmm10 + xorpd %xmm7, %xmm10 + mulpd %xmm11, %xmm2 + mulpd %xmm10, %xmm3 + subpd %xmm2, %xmm9 + subpd %xmm3, %xmm9 + + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + xorpd %xmm7, %xmm8 + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + addpd %xmm8, %xmm9 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + movsd %xmm11, 0 * SIZE(CO2) + movhpd %xmm11, 1 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) + movapd %xmm11, -14 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) + movapd %xmm11, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L21 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B 
+#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L30: + testq $1, N + BRANCH + jle .L999 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + movq C, CO1 +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif + +#ifdef LT + movq OFFSET, KK +#endif + + movq M, I + ALIGN_4 + +.L31: +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq AORIG, AO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + + xorps %xmm8, %xmm8 + prefetcht0 2 * SIZE(CO1) + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L35 + ALIGN_3 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + ADD1 %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm10 + movaps -14 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm11 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -12 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm10 + movaps -10 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm11 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, AO + subq $-8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L32 + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + ALIGN_3 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + je .L38 + ALIGN_3 + +.L36: + ADD1 %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L36 + ALIGN_3 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax + subq $1, %rax + + salq $ZBASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + + ADD1 %xmm1, %xmm8 + ADD2 %xmm2, %xmm9 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(LN) || defined(LT) + +#ifndef CONJ + shufps $0x40, %xmm0, %xmm0 + xorps %xmm0, %xmm8 +#else + shufps $0x40, %xmm0, %xmm0 + xorps %xmm0, %xmm9 +#endif + +#else + +#ifndef CONJ + shufps $0x40, %xmm0, %xmm0 + xorps %xmm0, %xmm8 +#else + shufps $0x04, %xmm0, %xmm0 + xorps %xmm0, %xmm9 +#endif + +#endif + + haddpd %xmm9, %xmm8 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm11 + + subpd %xmm8, %xmm9 +#else + movapd -16 * SIZE(AO), %xmm9 + movapd -14 * SIZE(AO), %xmm11 + + subpd %xmm8, %xmm9 +#endif + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + shufps $0x04, %xmm7, %xmm7 +#else + shufps $0x40, %xmm7, %xmm7 +#endif + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + addpd %xmm8, %xmm9 
+#endif + +#if defined(RN) || defined(RT) + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + xorpd %xmm7, %xmm8 + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + addpd %xmm8, %xmm9 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + decq I + BRANCH + jg .L31 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 1), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/ztrsm_kernel_LT_2x1_atom.S b/kernel/x86_64/ztrsm_kernel_LT_2x1_atom.S new file mode 100644 index 0000000000..a1760adf11 --- /dev/null +++ b/kernel/x86_64/ztrsm_kernel_LT_2x1_atom.S @@ -0,0 +1,995 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %rdi +#define N %rsi +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define J %r12 +#define AO %r13 +#define BO %r14 +#define CO1 %r15 +#define BB %rbx +#define KK %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define KKK 56(%rsp) +#define AORIG 64(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define KKK 232(%rsp) +#define AORIG 240(%rsp) +#endif + + +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (8 * 8 + 3) + +#ifndef CONJ +#define ADDSD1 addsd +#define ADDSD2 addsd +#define ADDSD3 addsd +#define ADDSD4 subsd + +#elif defined(LN) || defined(LT) +#define ADDSD1 addsd +#define ADDSD2 addsd +#define ADDSD3 subsd +#define ADDSD4 addsd +#else +#define ADDSD1 addsd +#define ADDSD2 subsd +#define ADDSD3 addsd +#define ADDSD4 addsd +#endif + + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, M + movq ARG2, N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#endif + + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + movq KK, OFFSET + + salq $ZBASE_SHIFT, LDC + +#ifdef LN + movq M, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + movq N, %rax + salq $ZBASE_SHIFT, %rax + imulq K, %rax + addq %rax, B + + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, KK + subq OFFSET, KK +#endif + + movq N, J + testq N, N + jle .L999 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, B + subq LDC, C +#endif + + movq C, CO1 + +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif +#ifdef LT + movq OFFSET, KK +#endif + + movq K, %rax + salq $ZBASE_SHIFT, %rax + leaq (B, %rax), BB + + movq M, I + sarq $1, I + jle .L20 + ALIGN_4 + +.L10: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + prefetcht0 0 * SIZE(BB) + subq $-8 * SIZE, BB + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd 1 * SIZE(AO), %xmm4 + xorps %xmm5, %xmm5 + movsd 2 * SIZE(AO), %xmm5 + xorps %xmm6, %xmm6 + xorps %xmm7, %xmm7 + + movsd 
0 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + movsd 1 * SIZE(BO), %xmm3 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + + prefetcht0 3 * SIZE(CO1) + xorps %xmm12, %xmm12 + xorps %xmm13, %xmm13 + xorps %xmm14, %xmm14 + xorps %xmm15, %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L15 + ALIGN_4 + +.L12: + ADDSD2 %xmm2, %xmm13 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + ADDSD3 %xmm7, %xmm14 + movsd 3 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + ADDSD4 %xmm6, %xmm15 + PREFETCH ((PREFETCHSIZE) >> 1 + 0) * SIZE(BO) + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + ADDSD1 %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + ADDSD2 %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + ADDSD3 %xmm4, %xmm10 + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + ADDSD4 %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 2 * SIZE(BO), %xmm1 + + ADDSD1 %xmm5, %xmm12 + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + + ADDSD2 %xmm2, %xmm13 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + ADDSD3 %xmm7, %xmm14 + movsd 7 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + ADDSD4 %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + ADDSD1 %xmm0, %xmm8 + movsd 8 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + ADDSD2 %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + ADDSD3 %xmm4, %xmm10 + movsd 9 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + ADDSD4 %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 4 * SIZE(BO), %xmm1 + + ADDSD1 %xmm5, %xmm12 + movsd 10 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 5 * SIZE(BO), %xmm3 + + ADDSD2 %xmm2, %xmm13 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + ADDSD3 %xmm7, %xmm14 + movsd 11 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + ADDSD4 %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + ADDSD1 %xmm0, %xmm8 + movsd 12 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + ADDSD2 %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + ADDSD3 %xmm4, %xmm10 + movsd 13 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + ADDSD4 %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 6 * SIZE(BO), %xmm1 + + ADDSD1 %xmm5, %xmm12 + movsd 14 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 7 * SIZE(BO), %xmm3 + + ADDSD2 %xmm2, %xmm13 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + ADDSD3 %xmm7, %xmm14 + movsd 15 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + subq $-16 * SIZE, AO + + ADDSD4 %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + ADDSD1 %xmm0, %xmm8 + movsd 0 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + ADDSD2 %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + addq $ 8 * SIZE, BO + + ADDSD3 %xmm4, %xmm10 + movsd 1 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + decq %rax + + ADDSD4 %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 0 * SIZE(BO), %xmm1 + + ADDSD1 %xmm5, %xmm12 + movsd 2 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 1 * SIZE(BO), %xmm3 + + jne .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + BRANCH + je .L18 + ALIGN_4 + +.L16: + ADDSD2 %xmm2, %xmm13 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + ADDSD3 %xmm7, %xmm14 + movsd 3 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm2 + + ADDSD4 %xmm6, %xmm15 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + + ADDSD1 %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm6 + + 
ADDSD2 %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + ADDSD3 %xmm4, %xmm10 + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm2 + + ADDSD4 %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 2 * SIZE(BO), %xmm1 + + ADDSD1 %xmm5, %xmm12 + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + decq %rax + BRANCH + jg .L16 + ALIGN_4 + +.L18: + ADDSD2 %xmm2, %xmm13 + ADDSD3 %xmm7, %xmm14 + ADDSD4 %xmm6, %xmm15 + + addsd %xmm11, %xmm8 + addsd %xmm9, %xmm10 + addsd %xmm15, %xmm12 + addsd %xmm13, %xmm14 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(BO), %xmm0 + movsd 1 * SIZE(BO), %xmm1 + movsd 2 * SIZE(BO), %xmm2 + movsd 3 * SIZE(BO), %xmm3 +#else + movsd 0 * SIZE(AO), %xmm0 + movsd 1 * SIZE(AO), %xmm1 + movsd 2 * SIZE(AO), %xmm2 + movsd 3 * SIZE(AO), %xmm3 +#endif + + subsd %xmm8, %xmm0 + subsd %xmm10, %xmm1 + subsd %xmm12, %xmm2 + subsd %xmm14, %xmm3 + +#ifdef LN + movsd 6 * SIZE(AO), %xmm6 + movsd 7 * SIZE(AO), %xmm7 + + movaps %xmm2, %xmm5 + movaps %xmm3, %xmm4 + + mulsd %xmm6, %xmm2 + mulsd %xmm6, %xmm3 + movsd 4 * SIZE(AO), %xmm6 + mulsd %xmm7, %xmm5 + mulsd %xmm7, %xmm4 + movsd 5 * SIZE(AO), %xmm7 + + ADDSD4 %xmm4, %xmm2 + ADDSD3 %xmm5, %xmm3 + + movaps %xmm2, %xmm4 + movaps %xmm3, %xmm5 + + mulsd %xmm6, %xmm4 + mulsd %xmm7, %xmm5 + mulsd %xmm3, %xmm6 + mulsd %xmm2, %xmm7 + + subsd %xmm4, %xmm0 + subsd %xmm6, %xmm1 + movsd 0 * SIZE(AO), %xmm6 + + ADDSD3 %xmm5, %xmm0 + ADDSD4 %xmm7, %xmm1 + movsd 1 * SIZE(AO), %xmm7 + + movaps %xmm0, %xmm5 + movaps %xmm1, %xmm4 + + mulsd %xmm6, %xmm0 + mulsd %xmm6, %xmm1 + mulsd %xmm7, %xmm5 + mulsd %xmm7, %xmm4 + + ADDSD4 %xmm4, %xmm0 + ADDSD3 %xmm5, %xmm1 +#endif + +#ifdef LT + movsd 0 * SIZE(AO), %xmm6 + movsd 1 * SIZE(AO), %xmm7 + + movaps %xmm0, %xmm5 + movaps %xmm1, %xmm4 + + mulsd %xmm6, %xmm0 + mulsd %xmm6, %xmm1 + movsd 2 * SIZE(AO), %xmm6 + mulsd %xmm7, %xmm5 + mulsd %xmm7, %xmm4 + movsd 3 * SIZE(AO), %xmm7 + + ADDSD4 %xmm4, %xmm0 + ADDSD3 %xmm5, %xmm1 + + movaps %xmm0, %xmm4 + movaps %xmm1, %xmm5 + + mulsd %xmm6, %xmm4 + mulsd %xmm7, %xmm5 + mulsd %xmm1, %xmm6 + mulsd %xmm0, %xmm7 + + subsd %xmm4, %xmm2 + subsd %xmm6, %xmm3 + movsd 6 * SIZE(AO), %xmm6 + + ADDSD3 %xmm5, %xmm2 + ADDSD4 %xmm7, %xmm3 + movsd 7 * SIZE(AO), %xmm7 + + movaps %xmm2, %xmm5 + movaps %xmm3, %xmm4 + + mulsd %xmm6, %xmm2 + mulsd %xmm6, %xmm3 + mulsd %xmm7, %xmm5 + mulsd %xmm7, %xmm4 + + ADDSD4 %xmm4, %xmm2 + ADDSD3 %xmm5, %xmm3 +#endif + +#if defined(RN) || defined(RT) + movsd 0 * SIZE(BO), %xmm8 + movaps %xmm0, %xmm5 + movsd 1 * SIZE(BO), %xmm9 + movaps %xmm1, %xmm4 + movaps %xmm2, %xmm7 + movaps %xmm3, %xmm6 + + mulsd %xmm8, %xmm0 + mulsd %xmm8, %xmm1 + mulsd %xmm9, %xmm5 + mulsd %xmm9, %xmm4 + + ADDSD4 %xmm4, %xmm0 + mulsd %xmm8, %xmm2 + ADDSD2 %xmm5, %xmm1 + mulsd %xmm8, %xmm3 + mulsd %xmm9, %xmm7 + mulsd %xmm9, %xmm6 + + ADDSD4 %xmm6, %xmm2 + ADDSD2 %xmm7, %xmm3 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 1 * SIZE(CO1) + movsd %xmm2, 2 * SIZE(CO1) + movsd %xmm3, 3 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm1, 1 * SIZE(BO) + movsd %xmm2, 2 * SIZE(BO) + movsd %xmm3, 3 * SIZE(BO) +#else + movsd %xmm0, 0 * SIZE(AO) + movsd %xmm1, 1 * SIZE(AO) + movsd %xmm2, 2 * SIZE(AO) + movsd 
%xmm3, 3 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + decq I # i -- + jg .L10 + ALIGN_4 + +.L20: + testq $1, M + jle .L99 + +#ifdef LN + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movsd 0 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd 1 * SIZE(AO), %xmm4 + xorps %xmm5, %xmm5 + movsd 2 * SIZE(AO), %xmm5 + xorps %xmm6, %xmm6 + movsd 3 * SIZE(AO), %xmm7 + + movsd 0 * SIZE(BO), %xmm1 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + movsd 1 * SIZE(BO), %xmm3 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L25 + ALIGN_4 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + ADDSD2 %xmm2, %xmm9 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + ADDSD4 %xmm6, %xmm11 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + movsd 2 * SIZE(BO), %xmm1 + + ADDSD1 %xmm0, %xmm8 + movsd 4 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm2 + + ADDSD3 %xmm4, %xmm10 + movsd 5 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + + ADDSD2 %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + ADDSD4 %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 4 * SIZE(BO), %xmm1 + + ADDSD1 %xmm5, %xmm8 + movsd 6 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm2 + + ADDSD3 %xmm7, %xmm10 + movsd 7 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm6 + movsd 5 * SIZE(BO), %xmm3 + + ADDSD2 %xmm2, %xmm9 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + ADDSD4 %xmm6, %xmm11 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + movsd 6 * SIZE(BO), %xmm1 + + ADDSD1 %xmm0, %xmm8 + movsd 8 * SIZE(AO), %xmm0 + mulsd %xmm3, %xmm2 + + ADDSD3 %xmm4, %xmm10 + movsd 9 * SIZE(AO), %xmm4 + mulsd %xmm3, %xmm6 + movsd 7 * SIZE(BO), %xmm3 + + ADDSD2 %xmm2, %xmm9 + movaps %xmm5, %xmm2 + mulsd %xmm1, %xmm5 + + ADDSD4 %xmm6, %xmm11 + movaps %xmm7, %xmm6 + mulsd %xmm1, %xmm7 + movsd 8 * SIZE(BO), %xmm1 + + ADDSD1 %xmm5, %xmm8 + movsd 10 * SIZE(AO), %xmm5 + mulsd %xmm3, %xmm2 + + ADDSD3 %xmm7, %xmm10 + movsd 11 * SIZE(AO), %xmm7 + mulsd %xmm3, %xmm6 + movsd 9 * SIZE(BO), %xmm3 + + addq $8 * SIZE, AO + addq $8 * SIZE, BO + + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + BRANCH + je .L29 + ALIGN_4 + +.L26: + ADDSD2 %xmm2, %xmm9 + movaps %xmm0, %xmm2 + mulsd %xmm1, %xmm0 + + ADDSD4 %xmm6, %xmm11 + movaps %xmm4, %xmm6 + mulsd %xmm1, %xmm4 + movsd 2 * SIZE(BO), %xmm1 + + mulsd %xmm3, %xmm2 + ADDSD1 %xmm0, %xmm8 + movsd 2 * SIZE(AO), %xmm0 + + mulsd %xmm3, %xmm6 + movsd 3 * SIZE(BO), %xmm3 + ADDSD3 %xmm4, %xmm10 + movsd 3 * SIZE(AO), %xmm4 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + decq %rax + BRANCH + jg .L26 + ALIGN_4 + +.L29: + ADDSD2 %xmm2, %xmm9 + ADDSD4 %xmm6, %xmm11 + + addsd %xmm11, %xmm8 + addsd %xmm9, %xmm10 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + +#if 
defined(LN) || defined(LT) + movsd 0 * SIZE(BO), %xmm0 + movsd 1 * SIZE(BO), %xmm1 +#else + movsd 0 * SIZE(AO), %xmm0 + movsd 1 * SIZE(AO), %xmm1 +#endif + + subsd %xmm8, %xmm0 + subsd %xmm10, %xmm1 + +#if defined(LN) || defined(LT) + movsd 0 * SIZE(AO), %xmm6 + movaps %xmm0, %xmm5 + movsd 1 * SIZE(AO), %xmm7 + movaps %xmm1, %xmm4 + + mulsd %xmm6, %xmm0 + mulsd %xmm6, %xmm1 + mulsd %xmm7, %xmm5 + mulsd %xmm7, %xmm4 + + ADDSD4 %xmm4, %xmm0 + ADDSD3 %xmm5, %xmm1 +#endif + +#if defined(RN) || defined(RT) + movsd 0 * SIZE(BO), %xmm8 + movaps %xmm0, %xmm5 + movsd 1 * SIZE(BO), %xmm9 + movaps %xmm1, %xmm4 + + mulsd %xmm8, %xmm0 + mulsd %xmm8, %xmm1 + mulsd %xmm9, %xmm5 + mulsd %xmm9, %xmm4 + + ADDSD4 %xmm4, %xmm0 + ADDSD2 %xmm5, %xmm1 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + + movsd %xmm0, 0 * SIZE(CO1) + movsd %xmm1, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movsd %xmm0, 0 * SIZE(BO) + movsd %xmm1, 1 * SIZE(BO) +#else + movsd %xmm0, 0 * SIZE(AO) + movsd %xmm1, 1 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L99: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + decq J # j -- + jg .L01 + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/ztrsm_kernel_LT_2x2_core2.S b/kernel/x86_64/ztrsm_kernel_LT_2x2_core2.S new file mode 100644 index 0000000000..93cbcad2d3 --- /dev/null +++ b/kernel/x86_64/ztrsm_kernel_LT_2x2_core2.S @@ -0,0 +1,2162 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define POSINV 0(%rsp) +#define J 16(%rsp) +#define OFFSET 24(%rsp) +#define KK 32(%rsp) +#define KKK 40(%rsp) +#define AORIG 48(%rsp) +#define BORIG 56(%rsp) +#define BUFFER 128(%rsp) + +#define PREFETCH_R (8 * 4 + 0) +#define PREFETCH_W (PREFETCH_R) + +#define PREFETCHSIZE (8 * 17 + 2) +#define PREFETCH prefetcht0 + +#ifndef CONJ +#define NN +#else +#if defined(LN) || defined(LT) +#define CN +#else +#define NC +#endif +#endif + +#define ADD1 addpd +#define ADD2 addpd + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C +#endif + + movq OLD_LDC, LDC + movq OLD_OFFSET, %rax + + movq %rsp, %r15 # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movq %rax, KK + movq %rax, OFFSET + + movq OLD_M, M + movq OLD_N, N + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + pcmpeqb %xmm15, %xmm15 + psllq $63, %xmm15 # Generate mask + pxor %xmm2, %xmm2 + + movlpd %xmm2, 0 + POSINV + movlpd %xmm15, 8 + POSINV + + salq $ZBASE_SHIFT, LDC + +#ifdef LN + movq M, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + movq N, %rax + salq $ZBASE_SHIFT, %rax + imulq K, %rax + addq %rax, B + + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + movq N, J + sarq $1, J # j = (n >> 2) + jle 
.L100 + ALIGN_4 + +.L01: +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq 16 * SIZE + BUFFER, BO + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L03 + + addq %rax, %rax + ALIGN_4 + +.L02: + prefetcht0 (PREFETCH_R + 0) * SIZE(B) + + movddup -16 * SIZE(B), %xmm8 + movddup -15 * SIZE(B), %xmm9 + movddup -14 * SIZE(B), %xmm10 + movddup -13 * SIZE(B), %xmm11 + movddup -12 * SIZE(B), %xmm12 + movddup -11 * SIZE(B), %xmm13 + movddup -10 * SIZE(B), %xmm14 + movddup -9 * SIZE(B), %xmm15 + + prefetcht0 (PREFETCH_W + 0) * SIZE(BO) + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) + movapd %xmm10, -12 * SIZE(BO) + movapd %xmm11, -10 * SIZE(BO) + + prefetcht0 (PREFETCH_W + 8) * SIZE(BO) + + movapd %xmm12, -8 * SIZE(BO) + movapd %xmm13, -6 * SIZE(BO) + movapd %xmm14, -4 * SIZE(BO) + movapd %xmm15, -2 * SIZE(BO) + + addq $ 8 * SIZE, B + subq $-16 * SIZE, BO + decq %rax + jne .L02 + ALIGN_4 + +.L03: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L05 + ALIGN_4 + +.L04: + movddup -16 * SIZE(B), %xmm8 + movddup -15 * SIZE(B), %xmm9 + movddup -14 * SIZE(B), %xmm10 + movddup -13 * SIZE(B), %xmm11 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) + movapd %xmm10, -12 * SIZE(BO) + movapd %xmm11, -10 * SIZE(BO) + + addq $ 4 * SIZE, B + addq $ 8 * SIZE, BO + + decq %rax + jne .L04 + ALIGN_4 + +.L05: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 + +#ifndef RT + leaq (C, LDC, 2), C +#endif + + movq M, I + sarq $1, I # i = (m >> 2) + jle .L30 + ALIGN_4 + +.L10: + leaq (PREFETCH_R + 0) * SIZE(B), BB + +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + prefetcht2 0 * SIZE(BB) + +#ifdef LN + pxor %xmm8, %xmm8 + prefetcht1 -3 * SIZE(CO1) + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + prefetcht1 -3 * SIZE(CO2) + pxor %xmm11, %xmm11 +#else + pxor %xmm8, %xmm8 + prefetcht1 3 * SIZE(CO1) + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + prefetcht1 3 * SIZE(CO2) + pxor %xmm11, %xmm11 +#endif + + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 + pxor %xmm14, %xmm14 + pxor %xmm15, %xmm15 + + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + + subq $-8 * SIZE, BB + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + ADD1 %xmm2, %xmm10 + movapd -16 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm14 + movapd %xmm2, %xmm3 + movapd -14 * SIZE(AO), %xmm1 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + ADD2 %xmm4, %xmm11 + movapd -14 * SIZE(BO), %xmm4 + ADD2 %xmm5, %xmm15 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 
%xmm2, %xmm8 + movapd -12 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm12 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + ADD2 %xmm4, %xmm9 + movapd -10 * SIZE(BO), %xmm4 + ADD2 %xmm5, %xmm13 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + movapd -12 * SIZE(AO), %xmm0 + ADD1 %xmm2, %xmm10 + movapd -8 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm14 + movapd %xmm2, %xmm3 + movapd -10 * SIZE(AO), %xmm1 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + ADD2 %xmm4, %xmm11 + ADD2 %xmm5, %xmm15 + movapd -6 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + movapd -4 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + movapd -2 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + movapd -8 * SIZE(AO), %xmm0 + ADD1 %xmm2, %xmm10 + movapd 0 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm14 + movapd %xmm2, %xmm3 + movapd -6 * SIZE(AO), %xmm1 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + ADD2 %xmm4, %xmm11 + movapd 2 * SIZE(BO), %xmm4 + ADD2 %xmm5, %xmm15 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + movapd 4 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm12 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + ADD2 %xmm4, %xmm9 + movapd 6 * SIZE(BO), %xmm4 + ADD2 %xmm5, %xmm13 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + movapd -4 * SIZE(AO), %xmm0 + ADD1 %xmm2, %xmm10 + ADD1 %xmm3, %xmm14 + movapd 8 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + movapd -2 * SIZE(AO), %xmm1 + mulpd %xmm1, %xmm3 + ADD2 %xmm4, %xmm11 + movapd 10 * SIZE(BO), %xmm4 + ADD2 %xmm5, %xmm15 + subq $-32 * SIZE, BO + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + movapd -20 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + subq $-16 * SIZE, AO + mulpd %xmm1, %xmm3 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + movapd -18 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + subq $1, %rax + BRANCH + BRANCH + jg .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movapd POSINV, %xmm7 + + andq $3, %rax + BRANCH + BRANCH + je .L19 + ALIGN_4 + +.L16: + ADD1 %xmm2, %xmm10 + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm11 + ADD2 %xmm5, %xmm15 + + movapd -16 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -14 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + movapd -16 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm2 + movapd -14 * SIZE(AO), %xmm1 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + + movapd -12 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -10 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addq $4 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L16 + ALIGN_4 + +.L19: + ADD1 %xmm2, %xmm10 + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm11 + ADD2 %xmm5, %xmm15 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + + SHUFPD_1 %xmm9, %xmm9 + SHUFPD_1 %xmm11, %xmm11 + 
SHUFPD_1 %xmm13, %xmm13 + SHUFPD_1 %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm7, %xmm9 + xorpd %xmm7, %xmm11 + xorpd %xmm7, %xmm13 + xorpd %xmm7, %xmm15 +#else + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm9, %xmm8 + subpd %xmm11, %xmm10 + subpd %xmm13, %xmm12 + subpd %xmm15, %xmm14 +#else + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 + addpd %xmm13, %xmm12 + addpd %xmm15, %xmm14 +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm9 + movapd -14 * SIZE(B), %xmm11 + movapd -12 * SIZE(B), %xmm13 + movapd -10 * SIZE(B), %xmm15 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm12, %xmm13 + subpd %xmm14, %xmm15 +#else + movapd -16 * SIZE(AO), %xmm9 + movapd -14 * SIZE(AO), %xmm13 + movapd -12 * SIZE(AO), %xmm11 + movapd -10 * SIZE(AO), %xmm15 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm12, %xmm13 + subpd %xmm14, %xmm15 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm7, %xmm7 +#endif + +#ifdef LN + movddup -10 * SIZE(AO), %xmm0 + movddup -9 * SIZE(AO), %xmm1 + movddup -12 * SIZE(AO), %xmm2 + movddup -11 * SIZE(AO), %xmm3 + movddup -16 * SIZE(AO), %xmm4 + movddup -15 * SIZE(AO), %xmm5 + + pshufd $0x4e, %xmm13, %xmm12 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + mulpd %xmm0, %xmm15 + mulpd %xmm1, %xmm14 + + addpd %xmm12, %xmm13 + addpd %xmm14, %xmm15 + + movapd %xmm13, %xmm8 + movapd %xmm15, %xmm10 + pshufd $0x4e, %xmm13, %xmm12 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm2, %xmm8 + mulpd %xmm2, %xmm10 + mulpd %xmm3, %xmm12 + mulpd %xmm3, %xmm14 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm12, %xmm9 + subpd %xmm14, %xmm11 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 + + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm8 + mulpd %xmm4, %xmm11 + mulpd %xmm5, %xmm10 + + addpd %xmm8, %xmm9 + addpd %xmm10, %xmm11 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + movddup -14 * SIZE(AO), %xmm2 + movddup -13 * SIZE(AO), %xmm3 + movddup -10 * SIZE(AO), %xmm4 + movddup -9 * SIZE(AO), %xmm5 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + + addpd %xmm8, %xmm9 + addpd %xmm10, %xmm11 + + movapd %xmm9, %xmm8 + movapd %xmm11, %xmm10 + pshufd $0x4e, %xmm9, %xmm12 + pshufd $0x4e, %xmm11, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm2, %xmm8 + mulpd %xmm2, %xmm10 + mulpd %xmm3, %xmm12 + mulpd %xmm3, %xmm14 + + subpd %xmm8, %xmm13 + subpd %xmm10, %xmm15 + subpd %xmm12, %xmm13 + subpd %xmm14, %xmm15 + + pshufd $0x4e, %xmm13, %xmm12 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm4, %xmm13 + mulpd %xmm5, %xmm12 + mulpd %xmm4, %xmm15 + mulpd %xmm5, %xmm14 + + addpd %xmm12, %xmm13 + addpd %xmm14, %xmm15 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + movddup -14 * SIZE(B), %xmm2 + movddup -13 * SIZE(B), %xmm3 + movddup -10 * SIZE(B), %xmm4 + movddup -9 * SIZE(B), %xmm5 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm13, 
%xmm12 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm12 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + + addpd %xmm8, %xmm9 + addpd %xmm12, %xmm13 + + movapd %xmm9, %xmm8 + movapd %xmm13, %xmm10 + pshufd $0x4e, %xmm9, %xmm12 + pshufd $0x4e, %xmm13, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm2, %xmm8 + mulpd %xmm2, %xmm10 + mulpd %xmm3, %xmm12 + mulpd %xmm3, %xmm14 + + subpd %xmm8, %xmm11 + subpd %xmm10, %xmm15 + subpd %xmm12, %xmm11 + subpd %xmm14, %xmm15 + + pshufd $0x4e, %xmm11, %xmm10 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm10 + xorpd %xmm7, %xmm14 + + mulpd %xmm4, %xmm11 + mulpd %xmm5, %xmm10 + mulpd %xmm4, %xmm15 + mulpd %xmm5, %xmm14 + + addpd %xmm10, %xmm11 + addpd %xmm14, %xmm15 +#endif + +#ifdef RT + movddup -10 * SIZE(B), %xmm0 + movddup -9 * SIZE(B), %xmm1 + movddup -12 * SIZE(B), %xmm2 + movddup -11 * SIZE(B), %xmm3 + movddup -16 * SIZE(B), %xmm4 + movddup -15 * SIZE(B), %xmm5 + + pshufd $0x4e, %xmm11, %xmm10 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm10 + xorpd %xmm7, %xmm14 + + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + mulpd %xmm0, %xmm15 + mulpd %xmm1, %xmm14 + + addpd %xmm10, %xmm11 + addpd %xmm14, %xmm15 + + movapd %xmm11, %xmm8 + movapd %xmm15, %xmm10 + pshufd $0x4e, %xmm11, %xmm12 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm2, %xmm8 + mulpd %xmm2, %xmm10 + mulpd %xmm3, %xmm12 + mulpd %xmm3, %xmm14 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm13 + subpd %xmm12, %xmm9 + subpd %xmm14, %xmm13 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm12 + + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm8 + mulpd %xmm4, %xmm13 + mulpd %xmm5, %xmm12 + + addpd %xmm8, %xmm9 + addpd %xmm12, %xmm13 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + movsd %xmm13, 2 * SIZE(CO1) + movhpd %xmm13, 3 * SIZE(CO1) + + movsd %xmm11, 0 * SIZE(CO2) + movhpd %xmm11, 1 * SIZE(CO2) + movsd %xmm15, 2 * SIZE(CO2) + movhpd %xmm15, 3 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(B) + movapd %xmm11, -14 * SIZE(B) + movapd %xmm13, -12 * SIZE(B) + movapd %xmm15, -10 * SIZE(B) + + movddup %xmm9, %xmm8 + unpckhpd %xmm9, %xmm9 + movddup %xmm11, %xmm10 + unpckhpd %xmm11, %xmm11 + movddup %xmm13, %xmm12 + unpckhpd %xmm13, %xmm13 + movddup %xmm15, %xmm14 + unpckhpd %xmm15, %xmm15 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) + movapd %xmm10, -12 * SIZE(BO) + movapd %xmm11, -10 * SIZE(BO) + movapd %xmm12, -8 * SIZE(BO) + movapd %xmm13, -6 * SIZE(BO) + movapd %xmm14, -4 * SIZE(BO) + movapd %xmm15, -2 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) + movapd %xmm13, -14 * SIZE(AO) + movapd %xmm11, -12 * SIZE(AO) + movapd %xmm15, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L10 + ALIGN_4 + +.L30: + testq $1, M + jle .L99 + +#ifdef LN + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + addq %rax, 
AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L42 + +.L41: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + movapd -12 * SIZE(BO), %xmm4 + movapd -10 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + movapd -14 * SIZE(AO), %xmm0 + movapd -8 * SIZE(BO), %xmm2 + movapd -6 * SIZE(BO), %xmm3 + movapd -4 * SIZE(BO), %xmm4 + movapd -2 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + movapd -12 * SIZE(AO), %xmm0 + movapd 0 * SIZE(BO), %xmm2 + movapd 2 * SIZE(BO), %xmm3 + movapd 4 * SIZE(BO), %xmm4 + movapd 6 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + movapd -10 * SIZE(AO), %xmm0 + movapd 8 * SIZE(BO), %xmm2 + movapd 10 * SIZE(BO), %xmm3 + movapd 12 * SIZE(BO), %xmm4 + movapd 14 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + subq $ -8 * SIZE, AO + subq $-32 * SIZE, BO + subq $1, %rax + jne .L41 + +.L42: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movapd POSINV, %xmm7 + + andq $3, %rax # if (k & 1) + BRANCH + jle .L44 + +.L43: + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + movapd -12 * SIZE(BO), %xmm4 + movapd -10 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + jg .L43 + ALIGN_4 + +.L44: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + + SHUFPD_1 %xmm9, %xmm9 + SHUFPD_1 %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm7, %xmm9 + xorpd %xmm7, %xmm11 +#else + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm9, %xmm8 + subpd %xmm11, %xmm10 +#else + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm9 + movapd -14 * SIZE(B), %xmm11 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 +#else + movapd -16 * SIZE(AO), %xmm9 + movapd -14 * SIZE(AO), %xmm11 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm7, %xmm7 +#endif + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm0 + 
movddup -15 * SIZE(AO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + + addpd %xmm8, %xmm9 + addpd %xmm10, %xmm11 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + movddup -14 * SIZE(B), %xmm2 + movddup -13 * SIZE(B), %xmm3 + movddup -10 * SIZE(B), %xmm4 + movddup -9 * SIZE(B), %xmm5 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 + + movapd %xmm9, %xmm8 + pshufd $0x4e, %xmm9, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm12 + + subpd %xmm8, %xmm11 + subpd %xmm12, %xmm11 + + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm10 + + mulpd %xmm4, %xmm11 + mulpd %xmm5, %xmm10 + + addpd %xmm10, %xmm11 +#endif + +#ifdef RT + movddup -10 * SIZE(B), %xmm0 + movddup -9 * SIZE(B), %xmm1 + movddup -12 * SIZE(B), %xmm2 + movddup -11 * SIZE(B), %xmm3 + movddup -16 * SIZE(B), %xmm4 + movddup -15 * SIZE(B), %xmm5 + + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm10 + + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + + addpd %xmm10, %xmm11 + + movapd %xmm11, %xmm8 + pshufd $0x4e, %xmm11, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm12 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm9 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + + movsd %xmm11, 0 * SIZE(CO2) + movhpd %xmm11, 1 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(B) + movapd %xmm11, -14 * SIZE(B) + + movddup %xmm9, %xmm8 + unpckhpd %xmm9, %xmm9 + movddup %xmm11, %xmm10 + unpckhpd %xmm11, %xmm11 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) + movapd %xmm10, -12 * SIZE(BO) + movapd %xmm11, -10 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) + movapd %xmm11, -14 * SIZE(AO) + +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L99: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 2 * COMPSIZE), B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + + decq J # j -- + jg .L01 + +.L100: + testq $1, N + jle .L999 + +.L101: +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L103 + ALIGN_4 + +.L102: + movddup -16 * SIZE(B), %xmm8 + movddup -15 * SIZE(B), %xmm9 + movddup -14 * SIZE(B), %xmm10 + movddup -13 * SIZE(B), %xmm11 + 
movddup -12 * SIZE(B), %xmm12 + movddup -11 * SIZE(B), %xmm13 + movddup -10 * SIZE(B), %xmm14 + movddup -9 * SIZE(B), %xmm15 + + movapd %xmm8, 0 * SIZE(BO) + movapd %xmm9, 2 * SIZE(BO) + movapd %xmm10, 4 * SIZE(BO) + movapd %xmm11, 6 * SIZE(BO) + movapd %xmm12, 8 * SIZE(BO) + movapd %xmm13, 10 * SIZE(BO) + movapd %xmm14, 12 * SIZE(BO) + movapd %xmm15, 14 * SIZE(BO) + + addq $ 8 * SIZE, B + subq $-16 * SIZE, BO + decq %rax + jne .L102 + ALIGN_4 + +.L103: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L105 + ALIGN_4 + +.L104: + movddup -16 * SIZE(B), %xmm8 + movddup -15 * SIZE(B), %xmm9 + + movapd %xmm8, 0 * SIZE(BO) + movapd %xmm9, 2 * SIZE(BO) + + addq $4 * SIZE, BO + addq $2 * SIZE, B + decq %rax + jne .L104 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + subq LDC, C +#endif + + movq C, CO1 +#ifndef RT + addq LDC, C +#endif + + movq M, I + sarq $1, I # i = (m >> 2) + jle .L130 + ALIGN_4 + +.L110: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 + prefetcht0 3 * SIZE(CO1) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L112 + +.L111: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + + movapd -16 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -14 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + + movapd -12 * SIZE(AO), %xmm0 + movapd -10 * SIZE(AO), %xmm1 + + movapd -12 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -10 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + + movapd -8 * SIZE(AO), %xmm0 + movapd -6 * SIZE(AO), %xmm1 + + movapd -8 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -6 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + + movapd -4 * SIZE(AO), %xmm0 + movapd -2 * SIZE(AO), %xmm1 + + movapd -4 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -2 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + + subq $-16 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jne .L111 + ALIGN_4 + +.L112: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movapd POSINV, %xmm7 + andq $3, %rax # if (k & 1) + BRANCH + jle .L114 + +.L113: + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + + movapd -16 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -14 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd 
%xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L113 + ALIGN_4 + +.L114: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + + SHUFPD_1 %xmm9, %xmm9 + SHUFPD_1 %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm7, %xmm9 + xorpd %xmm7, %xmm13 +#else + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm12 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm9, %xmm8 + subpd %xmm13, %xmm12 +#else + addpd %xmm9, %xmm8 + addpd %xmm13, %xmm12 +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm9 + movapd -14 * SIZE(B), %xmm13 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm13 +#else + movapd -16 * SIZE(AO), %xmm9 + movapd -14 * SIZE(AO), %xmm13 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm13 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm7, %xmm7 +#endif + +#ifdef LN + movddup -10 * SIZE(AO), %xmm0 + movddup -9 * SIZE(AO), %xmm1 + movddup -12 * SIZE(AO), %xmm2 + movddup -11 * SIZE(AO), %xmm3 + movddup -16 * SIZE(AO), %xmm4 + movddup -15 * SIZE(AO), %xmm5 + + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + + addpd %xmm12, %xmm13 + + movapd %xmm13, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm12 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm9 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + movddup -14 * SIZE(AO), %xmm2 + movddup -13 * SIZE(AO), %xmm3 + movddup -10 * SIZE(AO), %xmm4 + movddup -9 * SIZE(AO), %xmm5 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 + + movapd %xmm9, %xmm8 + pshufd $0x4e, %xmm9, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm12 + + subpd %xmm8, %xmm13 + subpd %xmm12, %xmm13 + + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm4, %xmm13 + mulpd %xmm5, %xmm12 + + addpd %xmm12, %xmm13 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm12 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + + addpd %xmm8, %xmm9 + addpd %xmm12, %xmm13 +#endif + +#ifdef RT + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm12 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + + addpd %xmm8, %xmm9 + addpd %xmm12, %xmm13 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + movsd %xmm13, 2 * SIZE(CO1) + movhpd %xmm13, 3 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(B) + movapd %xmm13, -14 * SIZE(B) + + movddup %xmm9, %xmm8 + unpckhpd %xmm9, %xmm9 + movddup %xmm13, 
%xmm12 + unpckhpd %xmm13, %xmm13 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) + movapd %xmm12, -12 * SIZE(BO) + movapd %xmm13, -10 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) + movapd %xmm13, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L110 + ALIGN_4 + +.L130: + testq $1, M + jle .L199 + ALIGN_4 + +.L140: +#ifdef LN + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L142 + +.L141: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + movapd -12 * SIZE(BO), %xmm4 + movapd -10 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + movapd -12 * SIZE(AO), %xmm0 + movapd -10 * SIZE(AO), %xmm1 + movapd -8 * SIZE(BO), %xmm2 + movapd -6 * SIZE(BO), %xmm3 + movapd -4 * SIZE(BO), %xmm4 + movapd -2 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + subq $ -8 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jne .L141 + +.L142: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movapd POSINV, %xmm7 + + andq $3, %rax # if (k & 1) + BRANCH + jle .L144 + +.L143: + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L143 + ALIGN_4 + +.L144: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + + SHUFPD_1 %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm7, %xmm9 +#else + xorpd %xmm7, %xmm8 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm9, %xmm8 +#else + addpd %xmm9, %xmm8 +#endif + + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm9 + + subpd %xmm8, %xmm9 +#else + movapd -16 * SIZE(AO), %xmm9 + + subpd %xmm8, %xmm9 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm7, %xmm7 +#endif + +#ifdef LN + movddup -16 * 
SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef RT + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(B) + + movddup %xmm9, %xmm8 + unpckhpd %xmm9, %xmm9 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $2 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L199: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 1 * COMPSIZE), B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + + +.L999: + movq %r15, %rsp + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/ztrsm_kernel_LT_2x2_penryn.S b/kernel/x86_64/ztrsm_kernel_LT_2x2_penryn.S new file mode 100644 index 0000000000..e38e87ec98 --- /dev/null +++ b/kernel/x86_64/ztrsm_kernel_LT_2x2_penryn.S @@ -0,0 +1,2016 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define KK %rdx +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define J 56(%rsp) +#define KKK 64(%rsp) +#define AORIG 72(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define J 232(%rsp) +#define KKK 240(%rsp) +#define AORIG 248(%rsp) + +#endif + +#define PREFETCH_R (8 * 4 + 0) +#define PREFETCHSIZE (8 * 21 + 6) +#define PREFETCH prefetcht0 + +#define ADD1 addpd +#define ADD2 addpd + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C +#endif + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + salq $ZBASE_SHIFT, LDC + + movq KK, OFFSET + negq KK + +#ifdef LN + movq M, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + movq N, %rax + salq $ZBASE_SHIFT, %rax + imulq K, %rax + addq %rax, B + + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RT + movq N, KK + subq OFFSET, KK +#endif + + movq N, J + sarq $1, J + NOBRANCH + jle .L40 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + 
ZBASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif + + movq K, %rax + salq $ZBASE_SHIFT + 1, %rax + leaq (B, %rax), BB + +#ifdef LT + movq OFFSET, KK +#endif + + movq M, I + sarq $1, I + NOBRANCH + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + prefetcht2 -16 * SIZE(BB) + subq $-8 * SIZE, BB + + movaps -16 * SIZE(AO), %xmm0 + pxor %xmm3, %xmm3 + movaps -14 * SIZE(AO), %xmm1 + pxor %xmm4, %xmm4 + movaps -16 * SIZE(BO), %xmm2 + + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + +#ifdef LN + prefetcht0 -4 * SIZE(CO1) + movapd %xmm4, %xmm8 + movapd %xmm4, %xmm9 + prefetcht0 -4 * SIZE(CO2) +#else + prefetcht0 3 * SIZE(CO1) + movapd %xmm4, %xmm8 + movapd %xmm4, %xmm9 + prefetcht0 3 * SIZE(CO2) +#endif + movapd %xmm4, %xmm10 + movapd %xmm4, %xmm11 + + movapd %xmm4, %xmm12 + movapd %xmm4, %xmm13 + movapd %xmm4, %xmm14 + movapd %xmm4, %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + ADD1 %xmm3, %xmm12 + movaps -14 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps -12 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + ADD1 %xmm3, %xmm12 + movaps -10 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps -8 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -6 * SIZE(AO), %xmm1 + + ADD1 %xmm3, %xmm12 + movaps -6 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps -4 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -2 * SIZE(AO), %xmm1 + + ADD1 %xmm3, %xmm12 + movaps -2 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps 0 * SIZE(BO), %xmm2 + ADD1 
%xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps 0 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 2 * SIZE(AO), %xmm1 + + ADD1 %xmm3, %xmm12 + movaps 2 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps 4 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps 4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 6 * SIZE(AO), %xmm1 + + ADD1 %xmm3, %xmm12 + movaps 6 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps 8 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps 8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 10 * SIZE(AO), %xmm1 + + ADD1 %xmm3, %xmm12 + movaps 10 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps 12 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps 12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 14 * SIZE(AO), %xmm1 + + ADD1 %xmm3, %xmm12 + movaps 14 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps 16 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + subq $-32 * SIZE, AO + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -16 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -14 * SIZE(AO), %xmm1 + + subq $-32 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + ADD1 %xmm3, %xmm12 + movaps -14 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps -12 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 
-10 * SIZE(AO), %xmm1 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_3 + +.L18: +#if defined(LN) || defined(RT) + movq KK, %rax + subq $2, %rax + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#endif + + ADD1 %xmm3, %xmm12 + pcmpeqb %xmm7, %xmm7 + ADD1 %xmm4, %xmm14 + psllq $63, %xmm7 + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + +#ifndef CONJ + pshufd $0x40, %xmm7, %xmm0 + shufps $0x04, %xmm7, %xmm7 + + pxor %xmm0, %xmm8 + pxor %xmm0, %xmm10 + pxor %xmm0, %xmm12 + pxor %xmm0, %xmm14 +#else +#if defined(LN) || defined(LT) + pshufd $0x40, %xmm7, %xmm0 +#else + pshufd $0x04, %xmm7, %xmm0 +#endif + shufps $0x40, %xmm7, %xmm7 + + pxor %xmm0, %xmm9 + pxor %xmm0, %xmm11 + pxor %xmm0, %xmm13 + pxor %xmm0, %xmm15 +#endif + + haddpd %xmm9, %xmm8 + haddpd %xmm11, %xmm10 + haddpd %xmm13, %xmm12 + haddpd %xmm15, %xmm14 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm11 + movapd -12 * SIZE(BO), %xmm13 + movapd -10 * SIZE(BO), %xmm15 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm11 + subpd %xmm10, %xmm13 + subpd %xmm14, %xmm15 +#else + movapd -16 * SIZE(AO), %xmm9 + movapd -14 * SIZE(AO), %xmm13 + movapd -12 * SIZE(AO), %xmm11 + movapd -10 * SIZE(AO), %xmm15 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm11 + subpd %xmm10, %xmm13 + subpd %xmm14, %xmm15 +#endif + +#ifdef LN + movddup -10 * SIZE(AO), %xmm0 + movddup -9 * SIZE(AO), %xmm1 + movddup -12 * SIZE(AO), %xmm2 + movddup -11 * SIZE(AO), %xmm3 + movddup -16 * SIZE(AO), %xmm4 + movddup -15 * SIZE(AO), %xmm5 + + pshufd $0x4e, %xmm13, %xmm12 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + mulpd %xmm0, %xmm15 + mulpd %xmm1, %xmm14 + + addpd %xmm12, %xmm13 + addpd %xmm14, %xmm15 + + movapd %xmm13, %xmm8 + movapd %xmm15, %xmm10 + pshufd $0x4e, %xmm13, %xmm12 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm2, %xmm8 + mulpd %xmm2, %xmm10 + mulpd %xmm3, %xmm12 + mulpd %xmm3, %xmm14 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm12, %xmm9 + subpd %xmm14, %xmm11 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 + + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm8 + mulpd %xmm4, %xmm11 + mulpd %xmm5, %xmm10 + + addpd %xmm8, %xmm9 + addpd %xmm10, %xmm11 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + movddup -14 * SIZE(AO), %xmm2 + movddup -13 * SIZE(AO), %xmm3 + movddup -10 * SIZE(AO), %xmm4 + movddup -9 * SIZE(AO), %xmm5 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + + addpd %xmm8, %xmm9 + addpd %xmm10, %xmm11 + + movapd %xmm9, %xmm8 + movapd %xmm11, %xmm10 + pshufd $0x4e, %xmm9, %xmm12 + pshufd $0x4e, %xmm11, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm2, %xmm8 + mulpd %xmm2, %xmm10 + mulpd %xmm3, %xmm12 + mulpd %xmm3, %xmm14 + + subpd %xmm8, %xmm13 + subpd %xmm10, %xmm15 + subpd %xmm12, %xmm13 + subpd %xmm14, %xmm15 + + pshufd $0x4e, %xmm13, %xmm12 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm4, %xmm13 + mulpd %xmm5, %xmm12 + mulpd %xmm4, %xmm15 + mulpd %xmm5, %xmm14 + + addpd %xmm12, %xmm13 + addpd %xmm14, %xmm15 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + 
movddup -14 * SIZE(BO), %xmm2 + movddup -13 * SIZE(BO), %xmm3 + movddup -10 * SIZE(BO), %xmm4 + movddup -9 * SIZE(BO), %xmm5 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm12 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + + addpd %xmm8, %xmm9 + addpd %xmm12, %xmm13 + + movapd %xmm9, %xmm8 + movapd %xmm13, %xmm10 + pshufd $0x4e, %xmm9, %xmm12 + pshufd $0x4e, %xmm13, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm2, %xmm8 + mulpd %xmm2, %xmm10 + mulpd %xmm3, %xmm12 + mulpd %xmm3, %xmm14 + + subpd %xmm8, %xmm11 + subpd %xmm10, %xmm15 + subpd %xmm12, %xmm11 + subpd %xmm14, %xmm15 + + pshufd $0x4e, %xmm11, %xmm10 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm10 + xorpd %xmm7, %xmm14 + + mulpd %xmm4, %xmm11 + mulpd %xmm5, %xmm10 + mulpd %xmm4, %xmm15 + mulpd %xmm5, %xmm14 + + addpd %xmm10, %xmm11 + addpd %xmm14, %xmm15 +#endif + +#ifdef RT + movddup -10 * SIZE(BO), %xmm0 + movddup -9 * SIZE(BO), %xmm1 + movddup -12 * SIZE(BO), %xmm2 + movddup -11 * SIZE(BO), %xmm3 + movddup -16 * SIZE(BO), %xmm4 + movddup -15 * SIZE(BO), %xmm5 + + pshufd $0x4e, %xmm11, %xmm10 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm10 + xorpd %xmm7, %xmm14 + + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + mulpd %xmm0, %xmm15 + mulpd %xmm1, %xmm14 + + addpd %xmm10, %xmm11 + addpd %xmm14, %xmm15 + + movapd %xmm11, %xmm8 + movapd %xmm15, %xmm10 + pshufd $0x4e, %xmm11, %xmm12 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm2, %xmm8 + mulpd %xmm2, %xmm10 + mulpd %xmm3, %xmm12 + mulpd %xmm3, %xmm14 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm13 + subpd %xmm12, %xmm9 + subpd %xmm14, %xmm13 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm12 + + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm8 + mulpd %xmm4, %xmm13 + mulpd %xmm5, %xmm12 + + addpd %xmm8, %xmm9 + addpd %xmm12, %xmm13 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + movsd %xmm13, 2 * SIZE(CO1) + movhpd %xmm13, 3 * SIZE(CO1) + + movsd %xmm11, 0 * SIZE(CO2) + movhpd %xmm11, 1 * SIZE(CO2) + movsd %xmm15, 2 * SIZE(CO2) + movhpd %xmm15, 3 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) + movapd %xmm11, -14 * SIZE(BO) + movapd %xmm13, -12 * SIZE(BO) + movapd %xmm15, -10 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) + movapd %xmm13, -14 * SIZE(AO) + movapd %xmm11, -12 * SIZE(AO) + movapd %xmm15, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + BRANCH + jg .L11 + ALIGN_4 + +.L20: + testq $1, M + BRANCH + jle .L39 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + movaps -16 * SIZE(BO), %xmm2 + movaps -14 * SIZE(BO), %xmm3 + + pxor %xmm3, %xmm3 + pxor %xmm5, %xmm5 + + movapd %xmm3, %xmm8 + movapd %xmm3, %xmm9 + movapd %xmm3, %xmm12 + movapd 
%xmm3, %xmm13 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_4 + +.L22: + ADD1 %xmm3, %xmm12 + movaps -14 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd %xmm0, %xmm2 + ADD2 %xmm5, %xmm13 + mulpd %xmm0, %xmm7 + + ADD1 %xmm2, %xmm8 + movaps -12 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + ADD2 %xmm7, %xmm9 + mulpd %xmm0, %xmm5 + movaps -14 * SIZE(AO), %xmm0 + + ADD1 %xmm3, %xmm12 + movaps -10 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + ADD2 %xmm5, %xmm13 + mulpd %xmm0, %xmm7 + + ADD1 %xmm2, %xmm8 + movaps -8 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + ADD2 %xmm7, %xmm9 + mulpd %xmm0, %xmm5 + movaps -12 * SIZE(AO), %xmm0 + + ADD1 %xmm3, %xmm12 + movaps -6 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + ADD2 %xmm5, %xmm13 + mulpd %xmm0, %xmm7 + + ADD1 %xmm2, %xmm8 + movaps -4 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + ADD2 %xmm7, %xmm9 + mulpd %xmm0, %xmm5 + movaps -10 * SIZE(AO), %xmm0 + + ADD1 %xmm3, %xmm12 + movaps -2 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + ADD2 %xmm5, %xmm13 + mulpd %xmm0, %xmm7 + subq $ -8 * SIZE, AO + + ADD1 %xmm2, %xmm8 + movaps 0 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + ADD2 %xmm7, %xmm9 + mulpd %xmm0, %xmm5 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, BO + subq $1, %rax + BRANCH + jg .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + ADD1 %xmm3, %xmm12 + movaps -14 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + ADD2 %xmm5, %xmm13 + mulpd %xmm0, %xmm7 + + ADD1 %xmm2, %xmm8 + movaps -12 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + ADD2 %xmm7, %xmm9 + mulpd %xmm0, %xmm5 + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_4 + +.L28: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + + ADD1 %xmm3, %xmm12 + pcmpeqb %xmm7, %xmm7 + ADD2 %xmm5, %xmm13 + psllq $63, %xmm7 + +#ifndef CONJ + pshufd $0x40, %xmm7, %xmm0 + shufps $0x04, %xmm7, %xmm7 + + pxor %xmm0, %xmm8 + pxor %xmm0, %xmm12 +#else +#if defined(LN) || defined(LT) + pshufd $0x40, %xmm7, %xmm0 +#else + pshufd $0x04, %xmm7, %xmm0 +#endif + shufps $0x40, %xmm7, %xmm7 + + pxor %xmm0, %xmm9 + pxor %xmm0, %xmm13 +#endif + + haddpd %xmm9, %xmm8 + haddpd %xmm13, %xmm12 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm11 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm11 +#else + movapd -16 * SIZE(AO), %xmm9 + movapd -14 * SIZE(AO), %xmm11 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm11 +#endif + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + + addpd %xmm8, %xmm9 + addpd %xmm10, %xmm11 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + movddup -14 * SIZE(BO), %xmm2 + movddup 
-13 * SIZE(BO), %xmm3 + movddup -10 * SIZE(BO), %xmm4 + movddup -9 * SIZE(BO), %xmm5 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 + + movapd %xmm9, %xmm8 + pshufd $0x4e, %xmm9, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm12 + + subpd %xmm8, %xmm11 + subpd %xmm12, %xmm11 + + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm10 + + mulpd %xmm4, %xmm11 + mulpd %xmm5, %xmm10 + + addpd %xmm10, %xmm11 +#endif + +#ifdef RT + movddup -10 * SIZE(BO), %xmm0 + movddup -9 * SIZE(BO), %xmm1 + movddup -12 * SIZE(BO), %xmm2 + movddup -11 * SIZE(BO), %xmm3 + movddup -16 * SIZE(BO), %xmm4 + movddup -15 * SIZE(BO), %xmm5 + + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm10 + + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + + addpd %xmm10, %xmm11 + + movapd %xmm11, %xmm8 + pshufd $0x4e, %xmm11, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm12 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm9 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + + movsd %xmm11, 0 * SIZE(CO2) + movhpd %xmm11, 1 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) + movapd %xmm11, -14 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) + movapd %xmm11, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L39: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + + subq $1, J + BRANCH + jg .L01 + ALIGN_4 + +.L40: + testq $1, N + BRANCH + jle .L999 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif + + movq K, %rax + salq $ZBASE_SHIFT + 1, %rax + leaq (B, %rax), BB + +#ifdef LT + movq OFFSET, KK +#endif + + movq M, I + sarq $1, I # i = (m >> 2) + NOBRANCH + jle .L60 + ALIGN_4 + +.L51: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + prefetcht2 -16 * SIZE(BB) + subq $-4 * SIZE, BB + + movaps -16 * SIZE(AO), %xmm0 + movaps -14 * SIZE(AO), %xmm1 + movaps -16 * SIZE(BO), %xmm2 + + prefetcht0 3 * SIZE(CO1) + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L55 + ALIGN_4 + +.L52: + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, 
%xmm6 + movaps -10 * SIZE(AO), %xmm1 + + ADD1 %xmm2, %xmm8 + movaps -14 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm12 + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm13 + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -6 * SIZE(AO), %xmm1 + + ADD1 %xmm2, %xmm8 + movaps -12 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm12 + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm13 + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -2 * SIZE(AO), %xmm1 + + ADD1 %xmm2, %xmm8 + movaps -10 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm12 + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm13 + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps 0 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 2 * SIZE(AO), %xmm1 + + ADD1 %xmm2, %xmm8 + movaps -8 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm12 + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm13 + + subq $-16 * SIZE, AO + subq $ -8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + ADD1 %xmm2, %xmm8 + movaps -14 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm12 + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm13 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L56 + ALIGN_4 + +.L58: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + pshufd $0x40, %xmm7, %xmm0 + shufps $0x04, %xmm7, %xmm7 + + pxor %xmm0, %xmm8 + pxor %xmm0, %xmm12 +#else +#if defined(LN) || defined(LT) + pshufd $0x40, %xmm7, %xmm0 +#else + pshufd $0x04, %xmm7, %xmm0 +#endif + shufps $0x40, %xmm7, %xmm7 + + pxor %xmm0, %xmm9 + pxor %xmm0, %xmm13 +#endif + + haddpd %xmm9, %xmm8 + haddpd %xmm13, %xmm12 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm13 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm13 +#else + movapd -16 * SIZE(AO), %xmm9 + movapd -14 * SIZE(AO), %xmm13 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm13 +#endif + +#ifdef LN + movddup -10 * SIZE(AO), %xmm0 + movddup -9 * SIZE(AO), %xmm1 + movddup -12 * SIZE(AO), %xmm2 + movddup -11 * SIZE(AO), %xmm3 + movddup -16 * SIZE(AO), %xmm4 + movddup -15 * SIZE(AO), %xmm5 + + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + + addpd %xmm12, %xmm13 + + movapd %xmm13, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm12 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm9 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + movddup -14 * SIZE(AO), %xmm2 + movddup -13 * SIZE(AO), %xmm3 + 
movddup -10 * SIZE(AO), %xmm4 + movddup -9 * SIZE(AO), %xmm5 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 + + movapd %xmm9, %xmm8 + pshufd $0x4e, %xmm9, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm12 + + subpd %xmm8, %xmm13 + subpd %xmm12, %xmm13 + + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm4, %xmm13 + mulpd %xmm5, %xmm12 + + addpd %xmm12, %xmm13 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm12 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + + addpd %xmm8, %xmm9 + addpd %xmm12, %xmm13 +#endif + +#ifdef RT + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm12 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + + addpd %xmm8, %xmm9 + addpd %xmm12, %xmm13 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + movsd %xmm13, 2 * SIZE(CO1) + movhpd %xmm13, 3 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) + movapd %xmm13, -14 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) + movapd %xmm13, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + decq I + BRANCH + jg .L51 + ALIGN_4 + +.L60: + testq $1, M + BRANCH + jle .L79 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + movaps -16 * SIZE(BO), %xmm2 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L65 + ALIGN_4 + +.L62: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -14 * SIZE(AO), %xmm0 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm7, %xmm9 + movaps -14 * SIZE(BO), %xmm2 + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -12 * SIZE(AO), %xmm0 + + ADD1 %xmm2, %xmm10 + ADD2 %xmm7, %xmm11 + movaps -12 * SIZE(BO), %xmm2 + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -10 * SIZE(AO), %xmm0 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm7, %xmm9 + movaps -10 * SIZE(BO), %xmm2 + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -8 * SIZE(AO), %xmm0 + + ADD1 %xmm2, %xmm10 + ADD2 %xmm7, %xmm11 + movaps -8 * SIZE(BO), %xmm2 + + subq $-8 * SIZE, AO + subq $-8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L62 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, 
%xmm2 + mulpd %xmm0, %xmm7 + movaps -14 * SIZE(AO), %xmm0 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm7, %xmm9 + movaps -14 * SIZE(BO), %xmm2 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L66 + ALIGN_4 + +.L68: +#if defined(LN) || defined(RT) + movq KK, %rax + subq $1, %rax + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + pshufd $0x40, %xmm7, %xmm0 + shufps $0x04, %xmm7, %xmm7 + + pxor %xmm0, %xmm8 +#else +#if defined(LN) || defined(LT) + pshufd $0x40, %xmm7, %xmm0 +#else + pshufd $0x04, %xmm7, %xmm0 +#endif + shufps $0x40, %xmm7, %xmm7 + + pxor %xmm0, %xmm9 +#endif + + haddpd %xmm9, %xmm8 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm9 + + subpd %xmm8, %xmm9 +#else + movapd -16 * SIZE(AO), %xmm9 + + subpd %xmm8, %xmm9 +#endif + +#ifdef LN + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef RT + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) +#endif + + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L79: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/ztrsm_kernel_LT_2x2_sse2.S b/kernel/x86_64/ztrsm_kernel_LT_2x2_sse2.S new file mode 100644 index 0000000000..dabc97c3e2 --- /dev/null +++ b/kernel/x86_64/ztrsm_kernel_LT_2x2_sse2.S @@ -0,0 +1,2266 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define J %r12 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define CO2 %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define POSINV 0(%rsp) +#define ALPHA_R 16(%rsp) +#define ALPHA_I 32(%rsp) +#define OFFSET 40(%rsp) +#define KK 48(%rsp) +#define KKK 56(%rsp) +#define AORIG 64(%rsp) +#define BORIG 72(%rsp) +#define BUFFER 128(%rsp) + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHNTA prefetchnta +#define PREFETCHSIZE (8 * 6 + 4) +#endif + +#ifdef GENERIC +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHNTA prefetchnta +#define PREFETCHSIZE (8 * 6 + 4) +#endif + +#define KERNEL1(xx) \ + mulpd %xmm8, %xmm9 ;\ + addpd %xmm9, %xmm0 ;\ + movapd 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulpd %xmm8, %xmm11 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\ + addpd %xmm11, %xmm1 ;\ + movapd 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm8, %xmm13 ;\ + mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\ + addpd %xmm13, %xmm2 ;\ + movapd 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm8, %xmm3 ;\ + movapd 8 * SIZE + 1 * (xx) * SIZE(AO), 
%xmm8 + +#define KERNEL2(xx) \ + mulpd %xmm10, %xmm9 ;\ + addpd %xmm9, %xmm4 ;\ + movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulpd %xmm10, %xmm11 ;\ + addpd %xmm11, %xmm5 ;\ + movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm10, %xmm13 ;\ + mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ + addpd %xmm13, %xmm6 ;\ + movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm10, %xmm7 ;\ + movapd 10 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 + +#define KERNEL3(xx) \ + mulpd %xmm12, %xmm15 ;\ + addpd %xmm15, %xmm0 ;\ + movapd 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulpd %xmm12, %xmm11 ;\ + addpd %xmm11, %xmm1 ;\ + movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm12, %xmm13 ;\ + mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\ + addpd %xmm13, %xmm2 ;\ + movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm12, %xmm3 ;\ + movapd 12 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 + +#define KERNEL4(xx) \ + mulpd %xmm14, %xmm15 ;\ + addpd %xmm15, %xmm4 ;\ + movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulpd %xmm14, %xmm11 ;\ + addpd %xmm11, %xmm5 ;\ + movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm14, %xmm13 ;\ + mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ + addpd %xmm13, %xmm6 ;\ + movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm14, %xmm7 ;\ + movapd 14 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 + +#define KERNEL5(xx) \ + mulpd %xmm8, %xmm9 ;\ + addpd %xmm9, %xmm0 ;\ + movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulpd %xmm8, %xmm11 ;\ + PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ;\ + addpd %xmm11, %xmm1 ;\ + movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm8, %xmm13 ;\ + mulpd 22 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\ + addpd %xmm13, %xmm2 ;\ + movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm8, %xmm3 ;\ + movapd 16 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 + +#define KERNEL6(xx) \ + mulpd %xmm10, %xmm9 ;\ + addpd %xmm9, %xmm4 ;\ + movapd 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulpd %xmm10, %xmm11 ;\ + addpd %xmm11, %xmm5 ;\ + movapd 26 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm10, %xmm13 ;\ + mulpd 22 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ + addpd %xmm13, %xmm6 ;\ + movapd 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm10, %xmm7 ;\ + movapd 18 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 + +#define KERNEL7(xx) \ + mulpd %xmm12, %xmm15 ;\ + addpd %xmm15, %xmm0 ;\ + movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulpd %xmm12, %xmm11 ;\ + addpd %xmm11, %xmm1 ;\ + movapd 26 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm12, %xmm13 ;\ + mulpd 30 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\ + addpd %xmm13, %xmm2 ;\ + movapd 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm12, %xmm3 ;\ + movapd 20 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 + +#define KERNEL8(xx) \ + mulpd %xmm14, %xmm15 ;\ + addpd %xmm15, %xmm4 ;\ + movapd 40 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulpd %xmm14, %xmm11 ;\ + addpd %xmm11, %xmm5 ;\ + movapd 34 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm14, %xmm13 ;\ + mulpd 30 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ + addpd %xmm13, %xmm6 ;\ + movapd 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm14, %xmm7 ;\ + movapd 22 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 + + +#ifndef CONJ +#define NN +#else +#if defined(LN) || defined(LT) +#define CN +#else +#define NC +#endif +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 
40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + movsd OLD_OFFSET, %xmm4 + + movaps %xmm3, %xmm0 + +#else + movq OLD_LDC, LDC + movsd OLD_OFFSET, %xmm4 + +#endif + + movq %rsp, %rbx # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movq OLD_M, M + movq OLD_N, N + + pcmpeqb %xmm15, %xmm15 + psllq $63, %xmm15 # Generate mask + pxor %xmm2, %xmm2 + + movlpd %xmm2, 0 + POSINV + movlpd %xmm15, 8 + POSINV + + movlpd %xmm4, OFFSET + movlpd %xmm4, KK + + salq $ZBASE_SHIFT, LDC + +#ifdef LN + movq M, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + movq N, %rax + salq $ZBASE_SHIFT, %rax + imulq K, %rax + addq %rax, B + + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + movq N, J + sarq $1, J # j = (n >> 2) + jle .L100 + ALIGN_4 + +.L01: +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L03 + + addq %rax, %rax + ALIGN_4 + +.L02: + PREFETCHNTA 56 * SIZE(B) + + movlpd 0 * SIZE(B), %xmm0 + movlpd 1 * SIZE(B), %xmm1 + movlpd 2 * SIZE(B), %xmm2 + movlpd 3 * SIZE(B), %xmm3 + movlpd 4 * SIZE(B), %xmm4 + movlpd 5 * SIZE(B), %xmm5 + movlpd 6 * SIZE(B), %xmm6 + movlpd 7 * SIZE(B), %xmm7 + + movlpd %xmm0, 0 * SIZE(BO) + movlpd %xmm0, 1 * SIZE(BO) + movlpd %xmm1, 2 * SIZE(BO) + movlpd %xmm1, 3 * SIZE(BO) + movlpd %xmm2, 4 * SIZE(BO) + movlpd %xmm2, 5 * SIZE(BO) + movlpd %xmm3, 6 * SIZE(BO) + movlpd %xmm3, 7 * SIZE(BO) + movlpd %xmm4, 8 * SIZE(BO) + movlpd %xmm4, 9 * SIZE(BO) + movlpd %xmm5, 10 * SIZE(BO) + movlpd %xmm5, 11 * SIZE(BO) + movlpd %xmm6, 12 * SIZE(BO) + movlpd %xmm6, 13 * SIZE(BO) + movlpd %xmm7, 14 * SIZE(BO) + movlpd %xmm7, 15 * SIZE(BO) + + subq $-16 * SIZE, BO + addq $ 8 * SIZE, B + decq %rax + jne .L02 + ALIGN_4 + +.L03: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L05 + ALIGN_4 + +.L04: + movlpd 0 * SIZE(B), %xmm0 + movlpd 1 * SIZE(B), %xmm1 + movlpd 2 * SIZE(B), %xmm2 + movlpd 3 * SIZE(B), %xmm3 + + movlpd %xmm0, 0 * SIZE(BO) + movlpd %xmm0, 1 * SIZE(BO) + movlpd %xmm1, 2 * SIZE(BO) + movlpd %xmm1, 3 * SIZE(BO) + movlpd %xmm2, 4 * SIZE(BO) + movlpd %xmm2, 5 * SIZE(BO) + movlpd %xmm3, 6 * SIZE(BO) + movlpd %xmm3, 7 * SIZE(BO) + + addq $ 4 * SIZE, B + addq $ 8 * SIZE, BO + + decq %rax + jne .L04 + ALIGN_4 + +.L05: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 + +#ifndef RT + leaq (C, LDC, 2), C +#endif + + movq M, I + 
sarq $1, I # i = (m >> 2) + jle .L30 + ALIGN_4 + +.L10: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 2 * SIZE(AO), %xmm10 + pxor %xmm1, %xmm1 + movapd 4 * SIZE(AO), %xmm12 + pxor %xmm2, %xmm2 + movapd 6 * SIZE(AO), %xmm14 + pxor %xmm3, %xmm3 + + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm4, %xmm4 + movapd 2 * SIZE(BO), %xmm11 + pxor %xmm5, %xmm5 + movapd 4 * SIZE(BO), %xmm13 + movapd 8 * SIZE(BO), %xmm15 + + PREFETCHW 4 * SIZE(CO1) + pxor %xmm6, %xmm6 + PREFETCHW 4 * SIZE(CO2) + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-8, %rax + salq $4, %rax + je .L15 +.L1X: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + cmpq $64 * 2, %rax + jle .L12 + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + KERNEL1(16 * 3) + KERNEL2(16 * 3) + KERNEL3(16 * 3) + KERNEL4(16 * 3) + KERNEL5(16 * 3) + KERNEL6(16 * 3) + KERNEL7(16 * 3) + KERNEL8(16 * 3) + cmpq $64 * 4, %rax + jle .L12 + KERNEL1(16 * 4) + KERNEL2(16 * 4) + KERNEL3(16 * 4) + KERNEL4(16 * 4) + KERNEL5(16 * 4) + KERNEL6(16 * 4) + KERNEL7(16 * 4) + KERNEL8(16 * 4) + KERNEL1(16 * 5) + KERNEL2(16 * 5) + KERNEL3(16 * 5) + KERNEL4(16 * 5) + KERNEL5(16 * 5) + KERNEL6(16 * 5) + KERNEL7(16 * 5) + KERNEL8(16 * 5) + cmpq $64 * 6, %rax + jle .L12 + KERNEL1(16 * 6) + KERNEL2(16 * 6) + KERNEL3(16 * 6) + KERNEL4(16 * 6) + KERNEL5(16 * 6) + KERNEL6(16 * 6) + KERNEL7(16 * 6) + KERNEL8(16 * 6) + KERNEL1(16 * 7) + KERNEL2(16 * 7) + KERNEL3(16 * 7) + KERNEL4(16 * 7) + KERNEL5(16 * 7) + KERNEL6(16 * 7) + KERNEL7(16 * 7) + KERNEL8(16 * 7) + + addq $16 * 8 * SIZE, AO + addq $32 * 8 * SIZE, BO + subq $64 * 8, %rax + jg .L1X + +.L12: + leaq (AO, %rax, 2), AO # * 16 + leaq (BO, %rax, 4), BO # * 64 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movapd POSINV, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L19 + ALIGN_4 + +.L16: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm2 + movapd 0 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm3 + movapd 4 * SIZE(AO), %xmm8 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm4 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm5 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + mulpd 6 * SIZE(BO), %xmm10 + addpd %xmm9, %xmm6 + movapd 8 * SIZE(BO), %xmm9 + addpd %xmm10, %xmm7 + movapd 6 * SIZE(AO), %xmm10 + + addq $4 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L16 + ALIGN_4 + +.L19: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), B + 
leaq (BO, %rax, 4), BO +#endif + + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm3, %xmm3 + SHUFPD_1 %xmm5, %xmm5 + SHUFPD_1 %xmm7, %xmm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm15, %xmm1 + xorpd %xmm15, %xmm3 + xorpd %xmm15, %xmm5 + xorpd %xmm15, %xmm7 +#else + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm2 + xorpd %xmm15, %xmm4 + xorpd %xmm15, %xmm6 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 +#else + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm1 + movapd 2 * SIZE(B), %xmm3 + movapd 4 * SIZE(B), %xmm5 + movapd 6 * SIZE(B), %xmm7 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm3 + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#else + movapd 0 * SIZE(AO), %xmm1 + movapd 2 * SIZE(AO), %xmm5 + movapd 4 * SIZE(AO), %xmm3 + movapd 6 * SIZE(AO), %xmm7 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm3 + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm15, %xmm15 +#endif + +#ifdef LN + movlpd 6 * SIZE(AO), %xmm8 + movhpd 6 * SIZE(AO), %xmm8 + movlpd 7 * SIZE(AO), %xmm9 + movhpd 7 * SIZE(AO), %xmm9 + movlpd 4 * SIZE(AO), %xmm10 + movhpd 4 * SIZE(AO), %xmm10 + movlpd 5 * SIZE(AO), %xmm11 + movhpd 5 * SIZE(AO), %xmm11 + movlpd 0 * SIZE(AO), %xmm12 + movhpd 0 * SIZE(AO), %xmm12 + movlpd 1 * SIZE(AO), %xmm13 + movhpd 1 * SIZE(AO), %xmm13 + + pshufd $0x4e, %xmm5, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm15, %xmm4 + xorpd %xmm15, %xmm6 + + mulpd %xmm8, %xmm5 + mulpd %xmm9, %xmm4 + mulpd %xmm8, %xmm7 + mulpd %xmm9, %xmm6 + + addpd %xmm4, %xmm5 + addpd %xmm6, %xmm7 + + movapd %xmm5, %xmm0 + movapd %xmm7, %xmm2 + pshufd $0x4e, %xmm5, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm15, %xmm4 + xorpd %xmm15, %xmm6 + + mulpd %xmm10, %xmm0 + mulpd %xmm10, %xmm2 + mulpd %xmm11, %xmm4 + mulpd %xmm11, %xmm6 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm3 + subpd %xmm4, %xmm1 + subpd %xmm6, %xmm3 + + pshufd $0x4e, %xmm1, %xmm0 + pshufd $0x4e, %xmm3, %xmm2 + + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm2 + + mulpd %xmm12, %xmm1 + mulpd %xmm13, %xmm0 + mulpd %xmm12, %xmm3 + mulpd %xmm13, %xmm2 + + addpd %xmm0, %xmm1 + addpd %xmm2, %xmm3 +#endif + +#ifdef LT + movlpd 0 * SIZE(AO), %xmm8 + movhpd 0 * SIZE(AO), %xmm8 + movlpd 1 * SIZE(AO), %xmm9 + movhpd 1 * SIZE(AO), %xmm9 + movlpd 2 * SIZE(AO), %xmm10 + movhpd 2 * SIZE(AO), %xmm10 + movlpd 3 * SIZE(AO), %xmm11 + movhpd 3 * SIZE(AO), %xmm11 + movlpd 6 * SIZE(AO), %xmm12 + movhpd 6 * SIZE(AO), %xmm12 + movlpd 7 * SIZE(AO), %xmm13 + movhpd 7 * SIZE(AO), %xmm13 + + pshufd $0x4e, %xmm1, %xmm0 + pshufd $0x4e, %xmm3, %xmm2 + + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm2 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + mulpd %xmm8, %xmm3 + mulpd %xmm9, %xmm2 + + addpd %xmm0, %xmm1 + addpd %xmm2, %xmm3 + + movapd %xmm1, %xmm0 + movapd %xmm3, %xmm2 + pshufd $0x4e, %xmm1, %xmm4 + pshufd $0x4e, %xmm3, %xmm6 + + xorpd %xmm15, %xmm4 + xorpd %xmm15, %xmm6 + + mulpd %xmm10, %xmm0 + mulpd %xmm10, %xmm2 + mulpd %xmm11, %xmm4 + mulpd %xmm11, %xmm6 + + subpd %xmm0, %xmm5 + subpd %xmm2, %xmm7 + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 + + pshufd $0x4e, %xmm5, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm15, %xmm4 + xorpd %xmm15, %xmm6 + + mulpd %xmm12, %xmm5 + mulpd %xmm13, %xmm4 + mulpd %xmm12, %xmm7 + mulpd 
%xmm13, %xmm6 + + addpd %xmm4, %xmm5 + addpd %xmm6, %xmm7 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm8 + movhpd 0 * SIZE(B), %xmm8 + movlpd 1 * SIZE(B), %xmm9 + movhpd 1 * SIZE(B), %xmm9 + movlpd 2 * SIZE(B), %xmm10 + movhpd 2 * SIZE(B), %xmm10 + movlpd 3 * SIZE(B), %xmm11 + movhpd 3 * SIZE(B), %xmm11 + movlpd 6 * SIZE(B), %xmm12 + movhpd 6 * SIZE(B), %xmm12 + movlpd 7 * SIZE(B), %xmm13 + movhpd 7 * SIZE(B), %xmm13 + + pshufd $0x4e, %xmm1, %xmm0 + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm4 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + mulpd %xmm8, %xmm5 + mulpd %xmm9, %xmm4 + + addpd %xmm0, %xmm1 + addpd %xmm4, %xmm5 + + movapd %xmm1, %xmm0 + movapd %xmm5, %xmm2 + pshufd $0x4e, %xmm1, %xmm4 + pshufd $0x4e, %xmm5, %xmm6 + + xorpd %xmm15, %xmm4 + xorpd %xmm15, %xmm6 + + mulpd %xmm10, %xmm0 + mulpd %xmm10, %xmm2 + mulpd %xmm11, %xmm4 + mulpd %xmm11, %xmm6 + + subpd %xmm0, %xmm3 + subpd %xmm2, %xmm7 + subpd %xmm4, %xmm3 + subpd %xmm6, %xmm7 + + pshufd $0x4e, %xmm3, %xmm2 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm15, %xmm2 + xorpd %xmm15, %xmm6 + + mulpd %xmm12, %xmm3 + mulpd %xmm13, %xmm2 + mulpd %xmm12, %xmm7 + mulpd %xmm13, %xmm6 + + addpd %xmm2, %xmm3 + addpd %xmm6, %xmm7 +#endif + +#ifdef RT + movlpd 6 * SIZE(B), %xmm8 + movhpd 6 * SIZE(B), %xmm8 + movlpd 7 * SIZE(B), %xmm9 + movhpd 7 * SIZE(B), %xmm9 + movlpd 4 * SIZE(B), %xmm10 + movhpd 4 * SIZE(B), %xmm10 + movlpd 5 * SIZE(B), %xmm11 + movhpd 5 * SIZE(B), %xmm11 + movlpd 0 * SIZE(B), %xmm12 + movhpd 0 * SIZE(B), %xmm12 + movlpd 1 * SIZE(B), %xmm13 + movhpd 1 * SIZE(B), %xmm13 + + pshufd $0x4e, %xmm3, %xmm2 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm15, %xmm2 + xorpd %xmm15, %xmm6 + + mulpd %xmm8, %xmm3 + mulpd %xmm9, %xmm2 + mulpd %xmm8, %xmm7 + mulpd %xmm9, %xmm6 + + addpd %xmm2, %xmm3 + addpd %xmm6, %xmm7 + + movapd %xmm3, %xmm0 + movapd %xmm7, %xmm2 + pshufd $0x4e, %xmm3, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm15, %xmm4 + xorpd %xmm15, %xmm6 + + mulpd %xmm10, %xmm0 + mulpd %xmm10, %xmm2 + mulpd %xmm11, %xmm4 + mulpd %xmm11, %xmm6 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm5 + subpd %xmm4, %xmm1 + subpd %xmm6, %xmm5 + + pshufd $0x4e, %xmm1, %xmm0 + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm4 + + mulpd %xmm12, %xmm1 + mulpd %xmm13, %xmm0 + mulpd %xmm12, %xmm5 + mulpd %xmm13, %xmm4 + + addpd %xmm0, %xmm1 + addpd %xmm4, %xmm5 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + + movsd %xmm1, 0 * SIZE(CO1) + movhpd %xmm1, 1 * SIZE(CO1) + movsd %xmm5, 2 * SIZE(CO1) + movhpd %xmm5, 3 * SIZE(CO1) + + movsd %xmm3, 0 * SIZE(CO2) + movhpd %xmm3, 1 * SIZE(CO2) + movsd %xmm7, 2 * SIZE(CO2) + movhpd %xmm7, 3 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(B) + movapd %xmm3, 2 * SIZE(B) + movapd %xmm5, 4 * SIZE(B) + movapd %xmm7, 6 * SIZE(B) + + movlpd %xmm1, 0 * SIZE(BO) + movlpd %xmm1, 1 * SIZE(BO) + movhpd %xmm1, 2 * SIZE(BO) + movhpd %xmm1, 3 * SIZE(BO) + movlpd %xmm3, 4 * SIZE(BO) + movlpd %xmm3, 5 * SIZE(BO) + movhpd %xmm3, 6 * SIZE(BO) + movhpd %xmm3, 7 * SIZE(BO) + movlpd %xmm5, 8 * SIZE(BO) + movlpd %xmm5, 9 * SIZE(BO) + movhpd %xmm5, 10 * SIZE(BO) + movhpd %xmm5, 11 * SIZE(BO) + movlpd %xmm7, 12 * SIZE(BO) + movlpd %xmm7, 13 * SIZE(BO) + movhpd %xmm7, 14 * SIZE(BO) + movhpd %xmm7, 15 * SIZE(BO) +#else + movapd %xmm1, 0 * SIZE(AO) + movapd %xmm5, 2 * SIZE(AO) + movapd %xmm3, 4 * SIZE(AO) + movapd %xmm7, 6 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) 
+ movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L10 + ALIGN_4 + +.L30: + testq $1, M + jle .L99 + +#ifdef LN + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + addq %rax, AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L42 + +.L41: + movapd 0 * SIZE(AO), %xmm8 + + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + + movapd 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm3 + + movapd 2 * SIZE(AO), %xmm8 + + movapd 8 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + + movapd 10 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + + movapd 12 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + + movapd 14 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm3 + + movapd 4 * SIZE(AO), %xmm8 + + movapd 16 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + + movapd 18 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + + movapd 20 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + + movapd 22 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm3 + + movapd 6 * SIZE(AO), %xmm8 + + movapd 24 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + + movapd 26 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + + movapd 28 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + + movapd 30 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm3 + + addq $ 8 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L41 + +.L42: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movapd POSINV, %xmm15 + andq $3, %rax # if (k & 1) + BRANCH + jle .L44 + +.L43: + movapd 0 * SIZE(AO), %xmm8 + + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + + movapd 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm3 + + addq $2 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + + decq %rax + jg .L43 + ALIGN_4 + +.L44: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm3, %xmm3 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm15, %xmm1 + xorpd %xmm15, %xmm3 +#else + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm2 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + 
defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 +#else + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm1 + movapd 2 * SIZE(B), %xmm3 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm3 +#else + movapd 0 * SIZE(AO), %xmm1 + movapd 2 * SIZE(AO), %xmm3 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm3 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm15, %xmm15 +#endif + +#if defined(LN) || defined(LT) + movlpd 0 * SIZE(AO), %xmm8 + movhpd 0 * SIZE(AO), %xmm8 + movlpd 1 * SIZE(AO), %xmm9 + movhpd 1 * SIZE(AO), %xmm9 + + pshufd $0x4e, %xmm1, %xmm0 + pshufd $0x4e, %xmm3, %xmm2 + + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm2 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + mulpd %xmm8, %xmm3 + mulpd %xmm9, %xmm2 + + addpd %xmm0, %xmm1 + addpd %xmm2, %xmm3 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm8 + movhpd 0 * SIZE(B), %xmm8 + movlpd 1 * SIZE(B), %xmm9 + movhpd 1 * SIZE(B), %xmm9 + movlpd 2 * SIZE(B), %xmm10 + movhpd 2 * SIZE(B), %xmm10 + movlpd 3 * SIZE(B), %xmm11 + movhpd 3 * SIZE(B), %xmm11 + movlpd 6 * SIZE(B), %xmm12 + movhpd 6 * SIZE(B), %xmm12 + movlpd 7 * SIZE(B), %xmm13 + movhpd 7 * SIZE(B), %xmm13 + + pshufd $0x4e, %xmm1, %xmm0 + + xorpd %xmm15, %xmm0 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + + addpd %xmm0, %xmm1 + + movapd %xmm1, %xmm0 + pshufd $0x4e, %xmm1, %xmm4 + + xorpd %xmm15, %xmm4 + + mulpd %xmm10, %xmm0 + mulpd %xmm11, %xmm4 + + subpd %xmm0, %xmm3 + subpd %xmm4, %xmm3 + + pshufd $0x4e, %xmm3, %xmm2 + + xorpd %xmm15, %xmm2 + + mulpd %xmm12, %xmm3 + mulpd %xmm13, %xmm2 + + addpd %xmm2, %xmm3 +#endif + +#ifdef RT + movlpd 6 * SIZE(B), %xmm8 + movhpd 6 * SIZE(B), %xmm8 + movlpd 7 * SIZE(B), %xmm9 + movhpd 7 * SIZE(B), %xmm9 + movlpd 4 * SIZE(B), %xmm10 + movhpd 4 * SIZE(B), %xmm10 + movlpd 5 * SIZE(B), %xmm11 + movhpd 5 * SIZE(B), %xmm11 + movlpd 0 * SIZE(B), %xmm12 + movhpd 0 * SIZE(B), %xmm12 + movlpd 1 * SIZE(B), %xmm13 + movhpd 1 * SIZE(B), %xmm13 + + pshufd $0x4e, %xmm3, %xmm2 + + xorpd %xmm15, %xmm2 + + mulpd %xmm8, %xmm3 + mulpd %xmm9, %xmm2 + + addpd %xmm2, %xmm3 + + movapd %xmm3, %xmm0 + pshufd $0x4e, %xmm3, %xmm4 + + xorpd %xmm15, %xmm4 + + mulpd %xmm10, %xmm0 + mulpd %xmm11, %xmm4 + + subpd %xmm0, %xmm1 + subpd %xmm4, %xmm1 + + pshufd $0x4e, %xmm1, %xmm0 + + xorpd %xmm15, %xmm0 + + mulpd %xmm12, %xmm1 + mulpd %xmm13, %xmm0 + + addpd %xmm0, %xmm1 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + + movsd %xmm1, 0 * SIZE(CO1) + movhpd %xmm1, 1 * SIZE(CO1) + + movsd %xmm3, 0 * SIZE(CO2) + movhpd %xmm3, 1 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(B) + movapd %xmm3, 2 * SIZE(B) + + movlpd %xmm1, 0 * SIZE(BO) + movlpd %xmm1, 1 * SIZE(BO) + movhpd %xmm1, 2 * SIZE(BO) + movhpd %xmm1, 3 * SIZE(BO) + movlpd %xmm3, 4 * SIZE(BO) + movlpd %xmm3, 5 * SIZE(BO) + movhpd %xmm3, 6 * SIZE(BO) + movhpd %xmm3, 7 * SIZE(BO) +#else + movapd %xmm1, 0 * SIZE(AO) + movapd %xmm3, 2 * SIZE(AO) + +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L99: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, 
%rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 2 * COMPSIZE), B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + + decq J # j -- + jg .L01 + +.L100: + testq $1, N + jle .L999 + +.L101: +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L103 + ALIGN_4 + +.L102: + movlpd 0 * SIZE(B), %xmm0 + movlpd 1 * SIZE(B), %xmm1 + movlpd 2 * SIZE(B), %xmm2 + movlpd 3 * SIZE(B), %xmm3 + movlpd 4 * SIZE(B), %xmm4 + movlpd 5 * SIZE(B), %xmm5 + movlpd 6 * SIZE(B), %xmm6 + movlpd 7 * SIZE(B), %xmm7 + + movlpd %xmm0, 0 * SIZE(BO) + movlpd %xmm0, 1 * SIZE(BO) + movlpd %xmm1, 2 * SIZE(BO) + movlpd %xmm1, 3 * SIZE(BO) + movlpd %xmm2, 4 * SIZE(BO) + movlpd %xmm2, 5 * SIZE(BO) + movlpd %xmm3, 6 * SIZE(BO) + movlpd %xmm3, 7 * SIZE(BO) + movlpd %xmm4, 8 * SIZE(BO) + movlpd %xmm4, 9 * SIZE(BO) + movlpd %xmm5, 10 * SIZE(BO) + movlpd %xmm5, 11 * SIZE(BO) + movlpd %xmm6, 12 * SIZE(BO) + movlpd %xmm6, 13 * SIZE(BO) + movlpd %xmm7, 14 * SIZE(BO) + movlpd %xmm7, 15 * SIZE(BO) + + subq $-16 * SIZE, BO + addq $ 8 * SIZE, B + decq %rax + jne .L102 + ALIGN_4 + +.L103: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L105 + ALIGN_4 + +.L104: + movlpd 0 * SIZE(B), %xmm0 + movlpd 1 * SIZE(B), %xmm1 + + movlpd %xmm0, 0 * SIZE(BO) + movlpd %xmm0, 1 * SIZE(BO) + movlpd %xmm1, 2 * SIZE(BO) + movlpd %xmm1, 3 * SIZE(BO) + + addq $4 * SIZE, BO + addq $2 * SIZE, B + decq %rax + jne .L104 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + subq LDC, C +#endif + + movq C, CO1 +#ifndef RT + addq LDC, C +#endif + + movq M, I + sarq $1, I # i = (m >> 2) + jle .L130 + ALIGN_4 + +.L110: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + PREFETCHW 4 * SIZE(CO1) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L112 + +.L111: + movapd 0 * SIZE(AO), %xmm8 + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm1 + + movapd 2 * SIZE(AO), %xmm8 + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm5 + + movapd 4 * SIZE(AO), %xmm8 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm1 + + movapd 6 * SIZE(AO), %xmm8 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm5 + + movapd 8 * SIZE(AO), %xmm8 + movapd 8 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + mulpd 10 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm1 + + movapd 
10 * SIZE(AO), %xmm8 + movapd 8 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + mulpd 10 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm5 + + movapd 12 * SIZE(AO), %xmm8 + movapd 12 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + mulpd 14 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm1 + + movapd 14 * SIZE(AO), %xmm8 + movapd 12 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + mulpd 14 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm5 + + addq $16 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L111 + ALIGN_4 + +.L112: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movapd POSINV, %xmm15 + andq $3, %rax # if (k & 1) + BRANCH + jle .L114 + +.L113: + movapd 0 * SIZE(AO), %xmm8 + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm1 + + movapd 2 * SIZE(AO), %xmm8 + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm5 + + addq $4 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L113 + ALIGN_4 + +.L114: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm5, %xmm5 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm15, %xmm1 + xorpd %xmm15, %xmm5 +#else + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm4 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm1, %xmm0 + subpd %xmm5, %xmm4 +#else + addpd %xmm1, %xmm0 + addpd %xmm5, %xmm4 +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm1 + movapd 2 * SIZE(B), %xmm5 + + subpd %xmm0, %xmm1 + subpd %xmm4, %xmm5 +#else + movapd 0 * SIZE(AO), %xmm1 + movapd 2 * SIZE(AO), %xmm5 + + subpd %xmm0, %xmm1 + subpd %xmm4, %xmm5 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm15, %xmm15 +#endif + +#ifdef LN + movlpd 6 * SIZE(AO), %xmm8 + movhpd 6 * SIZE(AO), %xmm8 + movlpd 7 * SIZE(AO), %xmm9 + movhpd 7 * SIZE(AO), %xmm9 + movlpd 4 * SIZE(AO), %xmm10 + movhpd 4 * SIZE(AO), %xmm10 + movlpd 5 * SIZE(AO), %xmm11 + movhpd 5 * SIZE(AO), %xmm11 + movlpd 0 * SIZE(AO), %xmm12 + movhpd 0 * SIZE(AO), %xmm12 + movlpd 1 * SIZE(AO), %xmm13 + movhpd 1 * SIZE(AO), %xmm13 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm15, %xmm4 + + mulpd %xmm8, %xmm5 + mulpd %xmm9, %xmm4 + + addpd %xmm4, %xmm5 + + movapd %xmm5, %xmm0 + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm15, %xmm4 + + mulpd %xmm10, %xmm0 + mulpd %xmm11, %xmm4 + + subpd %xmm0, %xmm1 + subpd %xmm4, %xmm1 + + pshufd $0x4e, %xmm1, %xmm0 + + xorpd %xmm15, %xmm0 + + mulpd %xmm12, %xmm1 + mulpd %xmm13, %xmm0 + + addpd %xmm0, %xmm1 +#endif + +#ifdef LT + movlpd 0 * SIZE(AO), %xmm8 + movhpd 0 * SIZE(AO), %xmm8 + movlpd 1 * SIZE(AO), %xmm9 + movhpd 1 * SIZE(AO), %xmm9 + movlpd 2 * SIZE(AO), %xmm10 + movhpd 2 * SIZE(AO), %xmm10 + movlpd 3 * SIZE(AO), %xmm11 + movhpd 3 * SIZE(AO), %xmm11 + movlpd 6 * SIZE(AO), %xmm12 + movhpd 6 * SIZE(AO), %xmm12 + movlpd 7 * SIZE(AO), %xmm13 + movhpd 7 * SIZE(AO), %xmm13 + + pshufd $0x4e, %xmm1, %xmm0 + + xorpd %xmm15, %xmm0 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + + addpd %xmm0, %xmm1 + + movapd %xmm1, %xmm0 + pshufd $0x4e, %xmm1, %xmm4 + + 
xorpd %xmm15, %xmm4 + + mulpd %xmm10, %xmm0 + mulpd %xmm11, %xmm4 + + subpd %xmm0, %xmm5 + subpd %xmm4, %xmm5 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm15, %xmm4 + + mulpd %xmm12, %xmm5 + mulpd %xmm13, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm8 + movhpd 0 * SIZE(B), %xmm8 + movlpd 1 * SIZE(B), %xmm9 + movhpd 1 * SIZE(B), %xmm9 + + pshufd $0x4e, %xmm1, %xmm0 + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm4 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + mulpd %xmm8, %xmm5 + mulpd %xmm9, %xmm4 + + addpd %xmm0, %xmm1 + addpd %xmm4, %xmm5 +#endif + +#ifdef RT + movlpd 0 * SIZE(B), %xmm8 + movhpd 0 * SIZE(B), %xmm8 + movlpd 1 * SIZE(B), %xmm9 + movhpd 1 * SIZE(B), %xmm9 + + pshufd $0x4e, %xmm1, %xmm0 + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm4 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + mulpd %xmm8, %xmm5 + mulpd %xmm9, %xmm4 + + addpd %xmm0, %xmm1 + addpd %xmm4, %xmm5 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + + movsd %xmm1, 0 * SIZE(CO1) + movhpd %xmm1, 1 * SIZE(CO1) + movsd %xmm5, 2 * SIZE(CO1) + movhpd %xmm5, 3 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(B) + movapd %xmm5, 2 * SIZE(B) + + movlpd %xmm1, 0 * SIZE(BO) + movlpd %xmm1, 1 * SIZE(BO) + movhpd %xmm1, 2 * SIZE(BO) + movhpd %xmm1, 3 * SIZE(BO) + movlpd %xmm5, 4 * SIZE(BO) + movlpd %xmm5, 5 * SIZE(BO) + movhpd %xmm5, 6 * SIZE(BO) + movhpd %xmm5, 7 * SIZE(BO) +#else + movapd %xmm1, 0 * SIZE(AO) + movapd %xmm5, 2 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L110 + ALIGN_4 + +.L130: + testq $1, M + jle .L199 + ALIGN_4 + +.L140: +#ifdef LN + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L142 + +.L141: + movapd 0 * SIZE(AO), %xmm8 + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm1 + + movapd 2 * SIZE(AO), %xmm8 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm3 + + movapd 4 * SIZE(AO), %xmm8 + movapd 8 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + mulpd 10 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm1 + + movapd 6 * SIZE(AO), %xmm8 + movapd 12 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + mulpd 14 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm3 + + addq $8 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L141 + +.L142: + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + + movapd POSINV, %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + jle .L144 + +.L143: + movapd 0 * SIZE(AO), %xmm8 + movapd 0 * SIZE(BO), %xmm9 + 
mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm1 + + addq $2 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L143 + ALIGN_4 + +.L144: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + + SHUFPD_1 %xmm1, %xmm1 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm15, %xmm1 +#else + xorpd %xmm15, %xmm0 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm1, %xmm0 +#else + addpd %xmm1, %xmm0 +#endif + + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm1 + + subpd %xmm0, %xmm1 +#else + movapd 0 * SIZE(AO), %xmm1 + + subpd %xmm0, %xmm1 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm15, %xmm15 +#endif + +#ifdef LN + movlpd 0 * SIZE(AO), %xmm8 + movhpd 0 * SIZE(AO), %xmm8 + movlpd 1 * SIZE(AO), %xmm9 + movhpd 1 * SIZE(AO), %xmm9 + + pshufd $0x4e, %xmm1, %xmm0 + xorpd %xmm15, %xmm0 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + + addpd %xmm0, %xmm1 +#endif + +#ifdef LT + movlpd 0 * SIZE(AO), %xmm8 + movhpd 0 * SIZE(AO), %xmm8 + movlpd 1 * SIZE(AO), %xmm9 + movhpd 1 * SIZE(AO), %xmm9 + + pshufd $0x4e, %xmm1, %xmm0 + + xorpd %xmm15, %xmm0 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + + addpd %xmm0, %xmm1 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm8 + movhpd 0 * SIZE(B), %xmm8 + movlpd 1 * SIZE(B), %xmm9 + movhpd 1 * SIZE(B), %xmm9 + + pshufd $0x4e, %xmm1, %xmm0 + + xorpd %xmm15, %xmm0 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + + addpd %xmm0, %xmm1 +#endif + +#ifdef RT + movlpd 0 * SIZE(B), %xmm8 + movhpd 0 * SIZE(B), %xmm8 + movlpd 1 * SIZE(B), %xmm9 + movhpd 1 * SIZE(B), %xmm9 + + pshufd $0x4e, %xmm1, %xmm0 + + xorpd %xmm15, %xmm0 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + + addpd %xmm0, %xmm1 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + + movsd %xmm1, 0 * SIZE(CO1) + movhpd %xmm1, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(B) + + movlpd %xmm1, 0 * SIZE(BO) + movlpd %xmm1, 1 * SIZE(BO) + movhpd %xmm1, 2 * SIZE(BO) + movhpd %xmm1, 3 * SIZE(BO) +#else + movapd %xmm1, 0 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $2 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L199: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 1 * COMPSIZE), B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + + +.L999: + movq %rbx, %rsp + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), 
%xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/ztrsm_kernel_LT_2x2_sse3.S b/kernel/x86_64/ztrsm_kernel_LT_2x2_sse3.S new file mode 100644 index 0000000000..708a984da7 --- /dev/null +++ b/kernel/x86_64/ztrsm_kernel_LT_2x2_sse3.S @@ -0,0 +1,2194 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %rdi +#define N %rsi +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define J %r12 +#define AO %r13 +#define BO %r14 +#define CO1 %r15 +#define CO2 %rbx +#define KK %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define KKK 56(%rsp) +#define AORIG 64(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define KKK 232(%rsp) +#define AORIG 240(%rsp) +#endif + +#define PREFETCH prefetcht1 +#define PREFETCHSIZE (16 * 12 + 3) +#define PREFETCH_R (4 * 4 + 0) + +#ifndef CONJ +#define ADD1 addpd +#define ADD2 addpd +#else +#define ADD1 subpd +#define ADD2 addpd +#endif + +#define KERNEL1(address) \ + mulpd %xmm8, %xmm9;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 2 * SIZE(AO);\ + ADD1 %xmm9, %xmm0;\ + movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD2 %xmm9, %xmm1;\ + movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm2;\ + movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 2 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + ADD2 %xmm9, %xmm3;\ + movddup 0 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL2(address) \ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm4;\ + movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD2 %xmm9, %xmm5;\ + movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm6;\ + movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 4 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + ADD2 %xmm9, %xmm7;\ + movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL3(address) \ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm0;\ + movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD2 %xmm9, %xmm1;\ + movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm2;\ + movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 6 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + ADD2 %xmm9, %xmm3;\ + movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL4(address) \ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm4;\ + movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD2 %xmm9, %xmm5;\ + movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm6;\ + movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 32 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + ADD2 %xmm9, %xmm7;\ + movddup 32 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL5(address) \ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm0;\ + movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD2 %xmm11, %xmm1;\ + movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm2;\ + movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 10 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + ADD2 %xmm11, %xmm3;\ + movddup 8 * SIZE + (address) * 2 * 
SIZE(BO), %xmm11 + +#define KERNEL6(address) \ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm4;\ + movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD2 %xmm11, %xmm5;\ + movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm6;\ + movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 12 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + ADD2 %xmm11, %xmm7;\ + movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL7(address) \ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm0;\ + movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD2 %xmm11, %xmm1;\ + movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm2;\ + movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 14 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + ADD2 %xmm11, %xmm3;\ + movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL8(address) \ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm4;\ + movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD2 %xmm11, %xmm5;\ + movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm6;\ + movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 40 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + ADD2 %xmm11, %xmm7;\ + movddup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL9(address) \ + mulpd %xmm12, %xmm13;\ + PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 2 * SIZE(AO);\ + ADD1 %xmm13, %xmm0;\ + movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD2 %xmm13, %xmm1;\ + movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm2;\ + movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 18 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + ADD2 %xmm13, %xmm3;\ + movddup 16 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL10(address) \ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm4;\ + movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD2 %xmm13, %xmm5;\ + movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm6;\ + movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 20 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + ADD2 %xmm13, %xmm7;\ + movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL11(address) \ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm0;\ + movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD2 %xmm13, %xmm1;\ + movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm2;\ + movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 22 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + ADD2 %xmm13, %xmm3;\ + movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL12(address) \ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm4;\ + movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD2 %xmm13, %xmm5;\ + movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm6;\ + movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 48 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + ADD2 %xmm13, 
%xmm7;\ + movddup 48 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL13(address) \ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm0;\ + movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD2 %xmm15, %xmm1;\ + movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm2;\ + movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 26 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + ADD2 %xmm15, %xmm3;\ + movddup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL14(address) \ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm4;\ + movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD2 %xmm15, %xmm5;\ + movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm6;\ + movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 28 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + ADD2 %xmm15, %xmm7;\ + movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL15(address) \ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm0;\ + movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD2 %xmm15, %xmm1;\ + movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm2;\ + movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 30 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + ADD2 %xmm15, %xmm3;\ + movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL16(address) \ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm4;\ + movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD2 %xmm15, %xmm5;\ + movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm6;\ + movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 56 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + ADD2 %xmm15, %xmm7;\ + movddup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, M + movq ARG2, N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#endif + + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + movq KK, OFFSET + + salq $ZBASE_SHIFT, LDC + +#ifdef LN + movq M, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + movq N, %rax + salq $ZBASE_SHIFT, %rax + imulq K, %rax + addq %rax, B + + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, KK + subq OFFSET, KK +#endif + + movq N, J + sarq $1, J # j = (n >> 2) + jle .L100 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq 
OFFSET, KK + addq M, KK +#endif +#ifdef LT + movq OFFSET, KK +#endif + + movq M, I + sarq $1, I # i = (m >> 2) + jle .L30 + ALIGN_4 + +.L10: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + + movapd 16 * SIZE(AO), %xmm12 + movddup 16 * SIZE(BO), %xmm13 + movapd 24 * SIZE(AO), %xmm14 + movddup 24 * SIZE(BO), %xmm15 + + prefetchnta 4 * SIZE(CO1) + pxor %xmm4, %xmm4 + prefetchnta 4 * SIZE(CO2) + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-8, %rax + salq $4, %rax + je .L12 + +.L1X: + KERNEL1 (16 * 0) + KERNEL2 (16 * 0) + KERNEL3 (16 * 0) + KERNEL4 (16 * 0) + KERNEL5 (16 * 0) + KERNEL6 (16 * 0) + KERNEL7 (16 * 0) + KERNEL8 (16 * 0) + KERNEL9 (16 * 0) + KERNEL10(16 * 0) + KERNEL11(16 * 0) + KERNEL12(16 * 0) + KERNEL13(16 * 0) + KERNEL14(16 * 0) + KERNEL15(16 * 0) + KERNEL16(16 * 0) + cmpq $128 * 1, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 1) + KERNEL2 (16 * 1) + KERNEL3 (16 * 1) + KERNEL4 (16 * 1) + KERNEL5 (16 * 1) + KERNEL6 (16 * 1) + KERNEL7 (16 * 1) + KERNEL8 (16 * 1) + KERNEL9 (16 * 1) + KERNEL10(16 * 1) + KERNEL11(16 * 1) + KERNEL12(16 * 1) + KERNEL13(16 * 1) + KERNEL14(16 * 1) + KERNEL15(16 * 1) + KERNEL16(16 * 1) + cmpq $128 * 2, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 2) + KERNEL2 (16 * 2) + KERNEL3 (16 * 2) + KERNEL4 (16 * 2) + KERNEL5 (16 * 2) + KERNEL6 (16 * 2) + KERNEL7 (16 * 2) + KERNEL8 (16 * 2) + KERNEL9 (16 * 2) + KERNEL10(16 * 2) + KERNEL11(16 * 2) + KERNEL12(16 * 2) + KERNEL13(16 * 2) + KERNEL14(16 * 2) + KERNEL15(16 * 2) + KERNEL16(16 * 2) + cmpq $128 * 3, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 3) + KERNEL2 (16 * 3) + KERNEL3 (16 * 3) + KERNEL4 (16 * 3) + KERNEL5 (16 * 3) + KERNEL6 (16 * 3) + KERNEL7 (16 * 3) + KERNEL8 (16 * 3) + KERNEL9 (16 * 3) + KERNEL10(16 * 3) + KERNEL11(16 * 3) + KERNEL12(16 * 3) + KERNEL13(16 * 3) + KERNEL14(16 * 3) + KERNEL15(16 * 3) + KERNEL16(16 * 3) + cmpq $128 * 4, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 4) + KERNEL2 (16 * 4) + KERNEL3 (16 * 4) + KERNEL4 (16 * 4) + KERNEL5 (16 * 4) + KERNEL6 (16 * 4) + KERNEL7 (16 * 4) + KERNEL8 (16 * 4) + KERNEL9 (16 * 4) + KERNEL10(16 * 4) + KERNEL11(16 * 4) + KERNEL12(16 * 4) + KERNEL13(16 * 4) + KERNEL14(16 * 4) + KERNEL15(16 * 4) + KERNEL16(16 * 4) + cmpq $128 * 5, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 5) + KERNEL2 (16 * 5) + KERNEL3 (16 * 5) + KERNEL4 (16 * 5) + KERNEL5 (16 * 5) + KERNEL6 (16 * 5) + KERNEL7 (16 * 5) + KERNEL8 (16 * 5) + KERNEL9 (16 * 5) + KERNEL10(16 * 5) + KERNEL11(16 * 5) + KERNEL12(16 * 5) + KERNEL13(16 * 5) + KERNEL14(16 * 5) + KERNEL15(16 * 5) + KERNEL16(16 * 5) + cmpq $128 * 6, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 6) + KERNEL2 (16 * 6) + KERNEL3 (16 * 6) + KERNEL4 (16 * 6) + KERNEL5 (16 * 6) + KERNEL6 (16 * 6) + KERNEL7 (16 * 6) + KERNEL8 (16 * 6) + KERNEL9 (16 * 6) + KERNEL10(16 * 6) + KERNEL11(16 * 6) + KERNEL12(16 * 6) + KERNEL13(16 * 6) + KERNEL14(16 * 6) + KERNEL15(16 * 6) + KERNEL16(16 * 6) + cmpq $128 * 7, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 7) + KERNEL2 (16 * 7) + KERNEL3 (16 * 7) + KERNEL4 (16 * 7) + KERNEL5 (16 * 7) + KERNEL6 
(16 * 7) + KERNEL7 (16 * 7) + KERNEL8 (16 * 7) + KERNEL9 (16 * 7) + KERNEL10(16 * 7) + KERNEL11(16 * 7) + KERNEL12(16 * 7) + KERNEL13(16 * 7) + KERNEL14(16 * 7) + KERNEL15(16 * 7) + KERNEL16(16 * 7) + + addq $32 * 8 * SIZE, AO + addq $32 * 8 * SIZE, BO + subq $128 * 8, %rax + jg .L1X + +.L11: + leaq (AO, %rax, 2), AO # * 16 + leaq (BO, %rax, 2), BO # * 64 + ALIGN_4 + +.L12: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L14 + ALIGN_4 + +.L13: + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm10 + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 0 * SIZE(BO), %xmm11 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm4 + movddup 1 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD2 %xmm11, %xmm5 + movddup 2 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm6 + movddup 3 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD2 %xmm11, %xmm7 + + addq $4 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L13 + ALIGN_4 + +.L14: + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm3, %xmm3 + SHUFPD_1 %xmm5, %xmm5 + SHUFPD_1 %xmm7, %xmm7 + +#ifndef CONJ + addsubpd %xmm1, %xmm0 + addsubpd %xmm3, %xmm2 + addsubpd %xmm5, %xmm4 + addsubpd %xmm7, %xmm6 +#else + addsubpd %xmm0, %xmm1 + addsubpd %xmm2, %xmm3 + addsubpd %xmm4, %xmm5 + addsubpd %xmm6, %xmm7 +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + subq $2, %rax + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BO), %xmm8 + movapd 2 * SIZE(BO), %xmm9 + movapd 4 * SIZE(BO), %xmm10 + movapd 6 * SIZE(BO), %xmm11 +#else + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm9 + movapd 4 * SIZE(AO), %xmm10 + movapd 6 * SIZE(AO), %xmm11 +#endif + +#if (defined(LN) || defined(LT)) && !defined(CONJ) + subpd %xmm0, %xmm8 + subpd %xmm2, %xmm9 + subpd %xmm4, %xmm10 + subpd %xmm6, %xmm11 +#elif (defined(LN) || defined(LT)) && defined(CONJ) + subpd %xmm1, %xmm8 + subpd %xmm3, %xmm9 + subpd %xmm5, %xmm10 + subpd %xmm7, %xmm11 +#elif (defined(RN) || defined(RT)) && !defined(CONJ) + subpd %xmm0, %xmm8 + subpd %xmm4, %xmm9 + subpd %xmm2, %xmm10 + subpd %xmm6, %xmm11 +#else + addsubpd %xmm1, %xmm8 + addsubpd %xmm5, %xmm9 + addsubpd %xmm3, %xmm10 + addsubpd %xmm7, %xmm11 +#endif + +#ifdef CONJ + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 +#endif + +#if defined(LN) || defined(RT) +#ifdef LN + movddup 6 * SIZE(AO), %xmm0 + movddup 7 * SIZE(AO), %xmm1 + movddup 4 * SIZE(AO), %xmm2 + movddup 5 * SIZE(AO), %xmm3 + movddup 0 * SIZE(AO), %xmm4 + movddup 1 * SIZE(AO), %xmm5 +#else + movddup 6 * SIZE(BO), %xmm0 + movddup 7 * SIZE(BO), %xmm1 + movddup 4 * SIZE(BO), %xmm2 + movddup 5 * SIZE(BO), %xmm3 + movddup 0 * SIZE(BO), %xmm4 + movddup 1 * SIZE(BO), %xmm5 +#endif + +#ifdef CONJ + xorpd %xmm7, %xmm1 + xorpd %xmm7, %xmm3 + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm10, %xmm12 + movapd %xmm11, %xmm13 + SHUFPD_1 %xmm12, %xmm12 + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm12 + mulpd %xmm1, %xmm13 + + addsubpd %xmm12, %xmm10 + addsubpd %xmm13, %xmm11 + + movapd %xmm10, %xmm12 + movapd %xmm10, %xmm13 + movapd %xmm11, %xmm14 + movapd %xmm11, %xmm15 + + SHUFPD_1 %xmm13, %xmm13 + 
SHUFPD_1 %xmm15, %xmm15 + + mulpd %xmm2, %xmm12 + mulpd %xmm2, %xmm14 + mulpd %xmm3, %xmm13 + mulpd %xmm3, %xmm15 + + addsubpd %xmm13, %xmm12 + addsubpd %xmm15, %xmm14 + + subpd %xmm12, %xmm8 + subpd %xmm14, %xmm9 + + movapd %xmm8, %xmm12 + movapd %xmm9, %xmm13 + SHUFPD_1 %xmm12, %xmm12 + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm4, %xmm8 + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm12 + mulpd %xmm5, %xmm13 + + addsubpd %xmm12, %xmm8 + addsubpd %xmm13, %xmm9 +#endif + +#if defined(LT) || defined(RN) + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + movddup 1 * SIZE(AO), %xmm1 + movddup 2 * SIZE(AO), %xmm2 + movddup 3 * SIZE(AO), %xmm3 + movddup 6 * SIZE(AO), %xmm4 + movddup 7 * SIZE(AO), %xmm5 +#else + movddup 0 * SIZE(BO), %xmm0 + movddup 1 * SIZE(BO), %xmm1 + movddup 2 * SIZE(BO), %xmm2 + movddup 3 * SIZE(BO), %xmm3 + movddup 6 * SIZE(BO), %xmm4 + movddup 7 * SIZE(BO), %xmm5 +#endif + +#ifdef CONJ + xorpd %xmm7, %xmm1 + xorpd %xmm7, %xmm3 + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm8, %xmm12 + movapd %xmm9, %xmm13 + SHUFPD_1 %xmm12, %xmm12 + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm12 + mulpd %xmm1, %xmm13 + + addsubpd %xmm12, %xmm8 + addsubpd %xmm13, %xmm9 + + movapd %xmm8, %xmm12 + movapd %xmm8, %xmm13 + movapd %xmm9, %xmm14 + movapd %xmm9, %xmm15 + + SHUFPD_1 %xmm13, %xmm13 + SHUFPD_1 %xmm15, %xmm15 + + mulpd %xmm2, %xmm12 + mulpd %xmm2, %xmm14 + mulpd %xmm3, %xmm13 + mulpd %xmm3, %xmm15 + + addsubpd %xmm13, %xmm12 + addsubpd %xmm15, %xmm14 + + subpd %xmm12, %xmm10 + subpd %xmm14, %xmm11 + + movapd %xmm10, %xmm12 + movapd %xmm11, %xmm13 + SHUFPD_1 %xmm12, %xmm12 + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm4, %xmm10 + mulpd %xmm4, %xmm11 + mulpd %xmm5, %xmm12 + mulpd %xmm5, %xmm13 + + addsubpd %xmm12, %xmm10 + addsubpd %xmm13, %xmm11 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm10, 2 * SIZE(CO1) + movhpd %xmm10, 3 * SIZE(CO1) + + movsd %xmm9, 0 * SIZE(CO2) + movhpd %xmm9, 1 * SIZE(CO2) + movsd %xmm11, 2 * SIZE(CO2) + movhpd %xmm11, 3 * SIZE(CO2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) + + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) + movsd %xmm11, 2 * SIZE(CO2) + movhpd %xmm11, 3 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, 0 * SIZE(BO) + movapd %xmm9, 2 * SIZE(BO) + movapd %xmm10, 4 * SIZE(BO) + movapd %xmm11, 6 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm9, 2 * SIZE(AO) + movapd %xmm10, 4 * SIZE(AO) + movapd %xmm11, 6 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + decq I # i -- + jg .L10 + ALIGN_4 + +.L30: + testq $1, M + jle .L99 + +#ifdef LN + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 8 * 
SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L42 + +.L41: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD2 %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm3 + movddup 16 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD2 %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm2 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 6 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD2 %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm2 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 16 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm3 + movddup 24 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 17 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD2 %xmm9, %xmm1 + movddup 18 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 19 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 10 * SIZE(AO), %xmm10 + ADD2 %xmm9, %xmm3 + movddup 20 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 21 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD2 %xmm9, %xmm1 + movddup 22 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 23 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 12 * SIZE(AO), %xmm10 + ADD2 %xmm9, %xmm3 + movddup 32 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 25 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD2 %xmm11, %xmm1 + movddup 26 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm2 + movddup 27 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 14 * SIZE(AO), %xmm10 + ADD2 %xmm11, %xmm3 + movddup 28 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 29 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD2 %xmm11, %xmm1 + movddup 30 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm2 + movddup 31 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 24 * SIZE(AO), %xmm10 + ADD2 %xmm11, %xmm3 + movddup 40 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L41 + +.L42: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + jle .L44 + +.L43: + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L43 + ALIGN_4 + +.L44: + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm3, %xmm3 + +#ifndef CONJ + addsubpd %xmm1, 
%xmm0 + addsubpd %xmm3, %xmm2 +#else + addsubpd %xmm0, %xmm1 + addsubpd %xmm2, %xmm3 +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BO), %xmm8 + movapd 2 * SIZE(BO), %xmm9 +#else + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm9 +#endif + +#if (defined(LN) || defined(LT)) && !defined(CONJ) + subpd %xmm0, %xmm8 + subpd %xmm2, %xmm9 +#elif (defined(LN) || defined(LT)) && defined(CONJ) + subpd %xmm1, %xmm8 + subpd %xmm3, %xmm9 +#elif (defined(RN) || defined(RT)) && !defined(CONJ) + subpd %xmm0, %xmm8 + subpd %xmm2, %xmm9 +#else + addsubpd %xmm1, %xmm8 + addsubpd %xmm3, %xmm9 +#endif + +#ifdef CONJ + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 +#endif + +#ifdef LN + movddup 0 * SIZE(AO), %xmm4 + movddup 1 * SIZE(AO), %xmm5 + +#ifdef CONJ + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm8, %xmm12 + movapd %xmm9, %xmm13 + SHUFPD_1 %xmm12, %xmm12 + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm4, %xmm8 + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm12 + mulpd %xmm5, %xmm13 + + addsubpd %xmm12, %xmm8 + addsubpd %xmm13, %xmm9 +#endif + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + movddup 1 * SIZE(AO), %xmm1 + +#ifdef CONJ + xorpd %xmm7, %xmm1 +#endif + + movapd %xmm8, %xmm12 + movapd %xmm9, %xmm13 + SHUFPD_1 %xmm12, %xmm12 + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm12 + mulpd %xmm1, %xmm13 + + addsubpd %xmm12, %xmm8 + addsubpd %xmm13, %xmm9 +#endif + +#ifdef RN + movddup 0 * SIZE(BO), %xmm0 + movddup 1 * SIZE(BO), %xmm1 + movddup 2 * SIZE(BO), %xmm2 + movddup 3 * SIZE(BO), %xmm3 + movddup 6 * SIZE(BO), %xmm4 + movddup 7 * SIZE(BO), %xmm5 + +#ifdef CONJ + xorpd %xmm7, %xmm1 + xorpd %xmm7, %xmm3 + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm8, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + + mulpd %xmm0, %xmm8 + mulpd %xmm1, %xmm12 + + addsubpd %xmm12, %xmm8 + + movapd %xmm8, %xmm12 + movapd %xmm8, %xmm13 + + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm2, %xmm12 + mulpd %xmm3, %xmm13 + + addsubpd %xmm13, %xmm12 + + subpd %xmm12, %xmm9 + + movapd %xmm9, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm12 + + addsubpd %xmm12, %xmm9 +#endif + + +#ifdef RT + movddup 6 * SIZE(BO), %xmm0 + movddup 7 * SIZE(BO), %xmm1 + movddup 4 * SIZE(BO), %xmm2 + movddup 5 * SIZE(BO), %xmm3 + movddup 0 * SIZE(BO), %xmm4 + movddup 1 * SIZE(BO), %xmm5 + +#ifdef CONJ + xorpd %xmm7, %xmm1 + xorpd %xmm7, %xmm3 + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm9, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm12 + + addsubpd %xmm12, %xmm9 + + movapd %xmm9, %xmm12 + movapd %xmm9, %xmm13 + + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm2, %xmm12 + mulpd %xmm3, %xmm13 + + addsubpd %xmm13, %xmm12 + + subpd %xmm12, %xmm8 + + movapd %xmm8, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + + mulpd %xmm4, %xmm8 + mulpd %xmm5, %xmm12 + + addsubpd %xmm12, %xmm8 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + + movsd %xmm9, 0 * SIZE(CO2) + movhpd %xmm9, 1 * SIZE(CO2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + + movsd %xmm9, 0 * SIZE(CO2) + movhpd %xmm9, 1 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, 0 * SIZE(BO) + movapd %xmm9, 2 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm9, 2 * SIZE(AO) +#endif + 
+#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L99: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + decq J # j -- + jg .L01 + +.L100: + testq $1, N + jle .L999 + +.L101: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + movq C, CO1 # coffset1 = c +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif +#ifdef LT + movq OFFSET, KK +#endif + + movq M, I + sarq $1, I # i = (m >> 2) + jle .L130 + ALIGN_4 + +.L110: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm4, %xmm4 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm5, %xmm5 + + prefetchnta 4 * SIZE(CO1) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L112 + +.L111: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm1 + movddup 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm4 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm5 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 6 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm4 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 16 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm5 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 10 * SIZE(AO), %xmm10 + ADD2 %xmm9, %xmm1 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm4 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 12 * SIZE(AO), %xmm10 + ADD2 %xmm9, %xmm5 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 14 * SIZE(AO), %xmm10 + ADD2 %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm4 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 40 * SIZE(AO), %xmm10 + ADD2 %xmm9, %xmm5 + movddup 16 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm11 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + ADD1 %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 18 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm1 + movddup 8 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm4 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 20 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm5 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 11 * 
SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 22 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm4 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 24 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm5 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 26 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm1 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm4 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 28 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm5 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 30 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm4 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 32 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm5 + movddup 24 * SIZE(BO), %xmm11 + + addq $32 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L111 + ALIGN_4 + +.L112: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + jle .L114 + +.L113: + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm10 + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 0 * SIZE(BO), %xmm11 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + movapd 4 * SIZE(AO), %xmm8 + ADD1 %xmm11, %xmm4 + movddup 1 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD2 %xmm11, %xmm5 + + addq $4 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L113 + ALIGN_4 + +.L114: + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm5, %xmm5 + +#ifndef CONJ + addsubpd %xmm1, %xmm0 + addsubpd %xmm5, %xmm4 +#else + addsubpd %xmm0, %xmm1 + addsubpd %xmm4, %xmm5 +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BO), %xmm8 + movapd 2 * SIZE(BO), %xmm9 +#else + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm9 +#endif + +#if (defined(LN) || defined(LT)) && !defined(CONJ) + subpd %xmm0, %xmm8 + subpd %xmm4, %xmm9 +#elif (defined(LN) || defined(LT)) && defined(CONJ) + subpd %xmm1, %xmm8 + subpd %xmm5, %xmm9 +#elif (defined(RN) || defined(RT)) && !defined(CONJ) + subpd %xmm0, %xmm8 + subpd %xmm4, %xmm9 +#else + addsubpd %xmm1, %xmm8 + addsubpd %xmm5, %xmm9 +#endif + +#ifdef CONJ + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 +#endif + +#ifdef LN + movddup 6 * SIZE(AO), %xmm0 + movddup 7 * SIZE(AO), %xmm1 + movddup 4 * SIZE(AO), %xmm2 + movddup 5 * SIZE(AO), %xmm3 + movddup 0 * SIZE(AO), %xmm4 + movddup 1 * SIZE(AO), %xmm5 + +#ifdef CONJ + xorpd %xmm7, %xmm1 + xorpd %xmm7, %xmm3 + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm9, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm12 + addsubpd %xmm12, %xmm9 + movapd %xmm9, %xmm12 + movapd %xmm9, %xmm13 + SHUFPD_1 %xmm13, %xmm13 + mulpd %xmm2, %xmm12 + mulpd %xmm3, %xmm13 + addsubpd %xmm13, %xmm12 + subpd %xmm12, %xmm8 + movapd %xmm8, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + mulpd %xmm4, %xmm8 + mulpd %xmm5, %xmm12 + addsubpd %xmm12, %xmm8 +#endif + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + movddup 1 * SIZE(AO), %xmm1 + movddup 2 * SIZE(AO), %xmm2 + movddup 3 * SIZE(AO), %xmm3 + movddup 6 * SIZE(AO), %xmm4 
+ movddup 7 * SIZE(AO), %xmm5 + +#ifdef CONJ + xorpd %xmm7, %xmm1 + xorpd %xmm7, %xmm3 + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm8, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + mulpd %xmm0, %xmm8 + mulpd %xmm1, %xmm12 + addsubpd %xmm12, %xmm8 + movapd %xmm8, %xmm12 + movapd %xmm8, %xmm13 + SHUFPD_1 %xmm13, %xmm13 + mulpd %xmm2, %xmm12 + mulpd %xmm3, %xmm13 + addsubpd %xmm13, %xmm12 + subpd %xmm12, %xmm9 + movapd %xmm9, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm12 + addsubpd %xmm12, %xmm9 +#endif + +#ifdef RN + movddup 0 * SIZE(BO), %xmm0 + movddup 1 * SIZE(BO), %xmm1 + +#ifdef CONJ + xorpd %xmm7, %xmm1 +#endif + + movapd %xmm8, %xmm12 + movapd %xmm9, %xmm13 + SHUFPD_1 %xmm12, %xmm12 + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm12 + mulpd %xmm1, %xmm13 + + addsubpd %xmm12, %xmm8 + addsubpd %xmm13, %xmm9 +#endif + +#ifdef RT + movddup 0 * SIZE(BO), %xmm4 + movddup 1 * SIZE(BO), %xmm5 + +#ifdef CONJ + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm8, %xmm12 + movapd %xmm9, %xmm13 + SHUFPD_1 %xmm12, %xmm12 + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm4, %xmm8 + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm12 + mulpd %xmm5, %xmm13 + + addsubpd %xmm12, %xmm8 + addsubpd %xmm13, %xmm9 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, 0 * SIZE(BO) + movapd %xmm9, 2 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm9, 2 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L110 + ALIGN_4 + +.L130: + testq $1, M + jle .L149 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L142 + +.L141: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 6 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 16 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm3 + movddup 16 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 
9 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 10 * SIZE(AO), %xmm10 + ADD2 %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm2 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 12 * SIZE(AO), %xmm10 + ADD2 %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 14 * SIZE(AO), %xmm10 + ADD2 %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm2 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 24 * SIZE(AO), %xmm10 + ADD2 %xmm11, %xmm3 + movddup 24 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L141 + +.L142: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + jle .L144 + +.L143: + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L143 + ALIGN_4 + +.L144: + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + + SHUFPD_1 %xmm1, %xmm1 + +#ifndef CONJ + addsubpd %xmm1, %xmm0 +#else + addsubpd %xmm0, %xmm1 +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + subq $1, %rax + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BO), %xmm8 +#else + movapd 0 * SIZE(AO), %xmm8 +#endif + +#if (defined(LN) || defined(LT)) && !defined(CONJ) + subpd %xmm0, %xmm8 +#elif (defined(LN) || defined(LT)) && defined(CONJ) + subpd %xmm1, %xmm8 +#elif (defined(RN) || defined(RT)) && !defined(CONJ) + subpd %xmm0, %xmm8 +#else + addsubpd %xmm1, %xmm8 +#endif + +#ifdef CONJ + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 +#endif + +#ifdef LN + movddup 0 * SIZE(AO), %xmm4 + movddup 1 * SIZE(AO), %xmm5 + +#ifdef CONJ + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm8, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + mulpd %xmm4, %xmm8 + mulpd %xmm5, %xmm12 + addsubpd %xmm12, %xmm8 +#endif + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + movddup 1 * SIZE(AO), %xmm1 + +#ifdef CONJ + xorpd %xmm7, %xmm1 +#endif + + movapd %xmm8, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + mulpd %xmm0, %xmm8 + mulpd %xmm1, %xmm12 + addsubpd %xmm12, %xmm8 +#endif + +#ifdef RN + movddup 0 * SIZE(BO), %xmm0 + movddup 1 * SIZE(BO), %xmm1 + +#ifdef CONJ + xorpd %xmm7, %xmm1 +#endif + + movapd %xmm8, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + mulpd %xmm0, %xmm8 + mulpd %xmm1, %xmm12 + + addsubpd %xmm12, %xmm8 +#endif + +#ifdef RT + movddup 0 * SIZE(BO), %xmm4 + movddup 1 * SIZE(BO), %xmm5 + +#ifdef CONJ + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm8, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + mulpd %xmm4, %xmm8 + mulpd %xmm5, %xmm12 + + addsubpd %xmm12, %xmm8 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, 0 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + 
ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L149: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_3 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/ztrsm_kernel_LT_2x4_nehalem.S b/kernel/x86_64/ztrsm_kernel_LT_2x4_nehalem.S new file mode 100644 index 0000000000..d07930dece --- /dev/null +++ b/kernel/x86_64/ztrsm_kernel_LT_2x4_nehalem.S @@ -0,0 +1,3116 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define KK %rdx +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define J 56(%rsp) +#define KKK 64(%rsp) +#define AORIG 72(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define J 232(%rsp) +#define KKK 240(%rsp) +#define AORIG 248(%rsp) + +#endif + +#define PREFETCHSIZE (16 * 1 + 4) +#define PREFETCH prefetcht0 + +#define ADD1 addps +#define ADD2 addps + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C +#endif + + subq $-32 * SIZE, A + subq $-32 * SIZE, B + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + salq $ZBASE_SHIFT, LDC + + movq KK, OFFSET + negq KK + +#ifdef LN + movq M, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + movq N, %rax + salq $ZBASE_SHIFT, %rax + imulq K, %rax + addq %rax, B + + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RT + movq N, KK + subq OFFSET, KK +#endif + + movq N, J + sarq $2, J + NOBRANCH + jle .L30 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $2 + ZBASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 2), CO2 +#ifndef RT + leaq (C, LDC, 4), C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif + + movq K, %rax + salq $ZBASE_SHIFT + 2, %rax + leaq (B, %rax), BB + +#ifdef LT + movq OFFSET, KK +#endif + + movq M, I + sarq $1, I + NOBRANCH + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq AORIG, AO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + prefetchnta -32 * SIZE(BB) + subq $-16 * SIZE, BB + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht2 4 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht2 4 * SIZE(CO1, LDC, 1) + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + + xorps %xmm12, %xmm12 + prefetcht2 4 * SIZE(CO2) + xorps %xmm13, %xmm13 + prefetcht2 4 * SIZE(CO2, LDC, 1) + xorps %xmm14, %xmm14 + xorps %xmm15, %xmm15 
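+/* Main k loop for a 2x4 tile.  The complex cross products are kept */
+/* in separate accumulators: pshufd 0xb1 swaps the real/imaginary   */
+/* halves of the B operand and pshufd 0x1b exchanges its two packed */
+/* elements, so xmm8-xmm15 collect the a_r*b_r / a_i*b_i sums and   */
+/* the a_r*b_i / a_i*b_r sums separately.  After the loop they are  */
+/* sign-adjusted (xorps with a sign mask, depending on CONJ) and    */
+/* folded with haddps into the real and imaginary parts.            */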
+ +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + ADD1 %xmm1, %xmm12 + movaps -32 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm15 + pshufd $0xb1, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + ADD1 %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD1 %xmm5, %xmm10 + ADD2 %xmm6, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -28 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm12 + movaps -24 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm15 + pshufd $0xb1, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + ADD1 %xmm1, %xmm8 + movaps -20 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD1 %xmm5, %xmm10 + ADD2 %xmm6, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -24 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm12 + movaps -16 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm15 + pshufd $0xb1, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + ADD1 %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD1 %xmm5, %xmm10 + ADD2 %xmm6, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -20 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm12 + movaps -8 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm15 + pshufd $0xb1, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + ADD1 %xmm1, %xmm8 + movaps -4 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + subq $-32 * SIZE, BO + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD1 %xmm5, %xmm10 + ADD2 %xmm6, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, AO + subq $1, %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + ADD1 %xmm1, %xmm12 + movaps -32 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm15 + pshufd $0xb1, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + ADD1 %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD1 %xmm5, %xmm10 + ADD2 %xmm6, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $8 * SIZE, BO + 
+ subq $1, %rax + BRANCH + jg .L16 + ALIGN_3 + +.L18: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $4, %rax +#endif + + salq $ZBASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + + ADD1 %xmm1, %xmm12 + ADD2 %xmm2, %xmm13 + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm15 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(LN) || defined(LT) + +#ifndef CONJ + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm10 + xorps %xmm0, %xmm12 + xorps %xmm0, %xmm14 +#else + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 + xorps %xmm0, %xmm13 + xorps %xmm0, %xmm15 +#endif + +#else + +#ifndef CONJ + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm10 + xorps %xmm0, %xmm12 + xorps %xmm0, %xmm14 +#else + shufps $0xb1, %xmm0, %xmm0 + + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 + xorps %xmm0, %xmm13 + xorps %xmm0, %xmm15 +#endif + +#endif + + haddps %xmm9, %xmm8 + haddps %xmm11, %xmm10 + haddps %xmm13, %xmm12 + haddps %xmm15, %xmm14 + + shufps $0xd8, %xmm8, %xmm8 + shufps $0xd8, %xmm10, %xmm10 + shufps $0xd8, %xmm12, %xmm12 + shufps $0xd8, %xmm14, %xmm14 + + movaps %xmm8, %xmm9 + shufps $0xe4, %xmm10, %xmm8 + shufps $0xe4, %xmm9, %xmm10 + + movaps %xmm12, %xmm13 + shufps $0xe4, %xmm14, %xmm12 + shufps $0xe4, %xmm13, %xmm14 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm9 + movlhps %xmm10, %xmm8 + movhlps %xmm9, %xmm10 + + movaps %xmm12, %xmm11 + movlhps %xmm14, %xmm12 + movhlps %xmm11, %xmm14 + + movaps -32 * SIZE(BO), %xmm9 + movaps -28 * SIZE(BO), %xmm13 + movaps -24 * SIZE(BO), %xmm11 + movaps -20 * SIZE(BO), %xmm15 + + subps %xmm8, %xmm9 + subps %xmm10, %xmm11 + subps %xmm12, %xmm13 + subps %xmm14, %xmm15 +#else + movaps -32 * SIZE(AO), %xmm9 + movaps -28 * SIZE(AO), %xmm11 + movaps -24 * SIZE(AO), %xmm13 + movaps -20 * SIZE(AO), %xmm15 + + subps %xmm8, %xmm9 + subps %xmm10, %xmm11 + subps %xmm12, %xmm13 + subps %xmm14, %xmm15 +#endif + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + shufps $0xb1, %xmm7, %xmm7 +#endif + +#ifdef LN + movaps -28 * SIZE(AO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + pshufd $0xb1, %xmm15, %xmm14 + + xorps %xmm7, %xmm10 + xorps %xmm7, %xmm14 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + mulps %xmm0, %xmm15 + mulps %xmm1, %xmm14 + + addps %xmm10, %xmm11 + addps %xmm14, %xmm15 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + movaps %xmm11, %xmm3 + pshufd $0xb1, %xmm11, %xmm2 + movaps %xmm15, %xmm5 + pshufd $0xb1, %xmm15, %xmm4 + + xorps %xmm7, %xmm2 + xorps %xmm7, %xmm4 + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm2 + mulps %xmm0, %xmm5 + mulps %xmm1, %xmm4 + + subps %xmm3, %xmm9 + subps %xmm2, %xmm9 + subps %xmm5, %xmm13 + subps %xmm4, %xmm13 + + movaps -32 * SIZE(AO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm10 + pshufd $0xb1, %xmm13, %xmm14 + + xorps %xmm7, %xmm10 + xorps %xmm7, %xmm14 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm10 + mulps %xmm0, %xmm13 + mulps %xmm1, %xmm14 + + addps %xmm10, %xmm9 + addps %xmm14, %xmm13 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm10 + pshufd $0xb1, %xmm13, %xmm14 + + xorps %xmm7, %xmm10 + xorps %xmm7, %xmm14 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm10 + mulps %xmm0, %xmm13 + mulps %xmm1, %xmm14 + + addps %xmm10, %xmm9 + addps %xmm14, %xmm13 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + movaps %xmm9, %xmm3 + pshufd $0xb1, %xmm9, 
%xmm2 + movaps %xmm13, %xmm5 + pshufd $0xb1, %xmm13, %xmm4 + + xorps %xmm7, %xmm2 + xorps %xmm7, %xmm4 + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm2 + mulps %xmm0, %xmm5 + mulps %xmm1, %xmm4 + + subps %xmm3, %xmm11 + subps %xmm2, %xmm11 + subps %xmm5, %xmm15 + subps %xmm4, %xmm15 + + movaps -28 * SIZE(AO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + pshufd $0xb1, %xmm15, %xmm14 + + xorps %xmm7, %xmm10 + xorps %xmm7, %xmm14 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + mulps %xmm0, %xmm15 + mulps %xmm1, %xmm14 + + addps %xmm10, %xmm11 + addps %xmm14, %xmm15 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm8 + + xorps %xmm7, %xmm8 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm8 + + addps %xmm8, %xmm9 + + movaps %xmm9, %xmm3 + pshufd $0xb1, %xmm9, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm11 + subps %xmm1, %xmm11 + + movaps -28 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm13 + subps %xmm1, %xmm13 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm15 + subps %xmm1, %xmm15 + + movaps -24 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 + + movaps %xmm11, %xmm3 + pshufd $0xb1, %xmm11, %xmm2 + + xorps %xmm7, %xmm2 + + movaps -20 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm13 + subps %xmm1, %xmm13 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm15 + subps %xmm1, %xmm15 + + movaps -12 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm13, %xmm12 + + xorps %xmm7, %xmm12 + + mulps %xmm0, %xmm13 + mulps %xmm1, %xmm12 + + addps %xmm12, %xmm13 + + movaps %xmm13, %xmm3 + pshufd $0xb1, %xmm13, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm15 + subps %xmm1, %xmm15 + + movaps -4 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm15, %xmm14 + + xorps %xmm7, %xmm14 + + mulps %xmm0, %xmm15 + mulps %xmm1, %xmm14 + + addps %xmm14, %xmm15 +#endif + +#ifdef RT + movaps -4 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm15, %xmm14 + + xorps %xmm7, %xmm14 + + mulps %xmm0, %xmm15 + mulps %xmm1, %xmm14 + + addps %xmm14, %xmm15 + + movaps %xmm15, %xmm3 + pshufd $0xb1, %xmm15, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm13 + subps %xmm1, %xmm13 + + movaps -8 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm11 + subps %xmm1, %xmm11 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm9 + subps %xmm1, %xmm9 + + movaps -12 * SIZE(BO), %xmm5 + + pshufd 
$0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm13, %xmm12 + + xorps %xmm7, %xmm12 + + mulps %xmm0, %xmm13 + mulps %xmm1, %xmm12 + + addps %xmm12, %xmm13 + + movaps %xmm13, %xmm3 + pshufd $0xb1, %xmm13, %xmm2 + + xorps %xmm7, %xmm2 + + movaps -16 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm11 + subps %xmm1, %xmm11 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm9 + subps %xmm1, %xmm9 + + + movaps -24 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 + + movaps %xmm11, %xmm3 + pshufd $0xb1, %xmm11, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm9 + subps %xmm1, %xmm9 + + movaps -32 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm8 + + xorps %xmm7, %xmm8 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm8 + + addps %xmm8, %xmm9 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm9, -32 * SIZE(BO) + movaps %xmm13, -28 * SIZE(BO) + movaps %xmm11, -24 * SIZE(BO) + movaps %xmm15, -20 * SIZE(BO) + + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm11, 2 * SIZE(CO1) + movhps %xmm9, 0 * SIZE(CO1, LDC) + movhps %xmm11, 2 * SIZE(CO1, LDC) + + movsd %xmm13, 0 * SIZE(CO2) + movsd %xmm15, 2 * SIZE(CO2) + movhps %xmm13, 0 * SIZE(CO2, LDC) + movhps %xmm15, 2 * SIZE(CO2, LDC) +#else + movaps %xmm9, -32 * SIZE(AO) + movaps %xmm11, -28 * SIZE(AO) + movaps %xmm13, -24 * SIZE(AO) + movaps %xmm15, -20 * SIZE(AO) + + movsd %xmm9, 0 * SIZE(CO1) + movhps %xmm9, 2 * SIZE(CO1) + movsd %xmm11, 0 * SIZE(CO1, LDC) + movhps %xmm11, 2 * SIZE(CO1, LDC) + movsd %xmm13, 0 * SIZE(CO2) + movhps %xmm13, 2 * SIZE(CO2) + movsd %xmm15, 0 * SIZE(CO2, LDC) + movhps %xmm15, 2 * SIZE(CO2, LDC) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + BRANCH + jg .L11 + ALIGN_4 + +.L20: + testq $1, M + BRANCH + jle .L29 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq AORIG, AO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movaps -32 * SIZE(BO), %xmm5 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_3 + +.L22: + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + ADD1 %xmm3, %xmm10 + pshufd $0xa0, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0xf5, %xmm5, %xmm4 
+ movaps -24 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm4 + movddup -30 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -20 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + ADD1 %xmm3, %xmm10 + pshufd $0xa0, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0xf5, %xmm5, %xmm4 + movaps -16 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm4 + movddup -28 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -12 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + ADD1 %xmm3, %xmm10 + pshufd $0xa0, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0xf5, %xmm5, %xmm4 + movaps -8 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm4 + movddup -26 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -4 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + ADD1 %xmm3, %xmm10 + pshufd $0xa0, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0xf5, %xmm5, %xmm4 + movaps 0 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm4 + movddup -24 * SIZE(AO), %xmm0 + + subq $-32 * SIZE, BO + subq $ -8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L22 + ALIGN_3 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_3 + +.L26: + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + ADD1 %xmm3, %xmm10 + pshufd $0xa0, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0xf5, %xmm5, %xmm4 + movaps -24 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm4 + movddup -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_3 + +.L28: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + salq $ZBASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#endif + + ADD1 %xmm1, %xmm8 + ADD2 %xmm2, %xmm9 + ADD1 %xmm3, %xmm10 + ADD2 %xmm4, %xmm11 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(LN) || defined(LT) + +#ifndef CONJ + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm11, %xmm11 +#else + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm10 + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm11, %xmm11 +#endif + +#else + +#ifndef CONJ + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm11, %xmm11 +#else + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm11, %xmm11 + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 +#endif + +#endif + + addps %xmm9, %xmm8 + addps %xmm11, %xmm10 + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(BO), %xmm9 + movaps -28 * SIZE(BO), %xmm11 + + subps %xmm8, %xmm9 + subps %xmm10, %xmm11 +#else + movaps -32 * SIZE(AO), %xmm9 + movaps -28 * SIZE(AO), %xmm13 + + subps %xmm8, %xmm9 + subps %xmm10, %xmm13 + + movhlps %xmm9, %xmm11 + movhlps %xmm13, %xmm15 +#endif + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + shufps $0xb1, %xmm7, %xmm7 +#endif + +#if defined(LN) || defined(LT) + movsd -32 * SIZE(AO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm10 + pshufd $0xb1, %xmm11, %xmm12 + + xorps %xmm7, %xmm10 + xorps %xmm7, %xmm12 + + mulps %xmm0, %xmm9 + mulps %xmm1, 
%xmm10 + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm12 + + addps %xmm10, %xmm9 + addps %xmm12, %xmm11 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm8 + + xorps %xmm7, %xmm8 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm8 + + addps %xmm8, %xmm9 + + movaps %xmm9, %xmm3 + pshufd $0xb1, %xmm9, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm11 + subps %xmm1, %xmm11 + + movaps -28 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm13 + subps %xmm1, %xmm13 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm15 + subps %xmm1, %xmm15 + + movaps -24 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 + + movaps %xmm11, %xmm3 + pshufd $0xb1, %xmm11, %xmm2 + + xorps %xmm7, %xmm2 + + movaps -20 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm13 + subps %xmm1, %xmm13 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm15 + subps %xmm1, %xmm15 + + movaps -12 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm13, %xmm12 + + xorps %xmm7, %xmm12 + + mulps %xmm0, %xmm13 + mulps %xmm1, %xmm12 + + addps %xmm12, %xmm13 + + movaps %xmm13, %xmm3 + pshufd $0xb1, %xmm13, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm15 + subps %xmm1, %xmm15 + + movaps -4 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm15, %xmm14 + + xorps %xmm7, %xmm14 + + mulps %xmm0, %xmm15 + mulps %xmm1, %xmm14 + + addps %xmm14, %xmm15 +#endif + +#ifdef RT + movaps -4 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm15, %xmm14 + + xorps %xmm7, %xmm14 + + mulps %xmm0, %xmm15 + mulps %xmm1, %xmm14 + + addps %xmm14, %xmm15 + + movaps %xmm15, %xmm3 + pshufd $0xb1, %xmm15, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm13 + subps %xmm1, %xmm13 + + movaps -8 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm11 + subps %xmm1, %xmm11 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm9 + subps %xmm1, %xmm9 + + movaps -12 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm13, %xmm12 + + xorps %xmm7, %xmm12 + + mulps %xmm0, %xmm13 + mulps %xmm1, %xmm12 + + addps %xmm12, %xmm13 + + movaps %xmm13, %xmm3 + pshufd $0xb1, %xmm13, %xmm2 + + xorps %xmm7, %xmm2 + + movaps -16 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm11 + subps %xmm1, %xmm11 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + 
mulps %xmm2, %xmm1 + + subps %xmm0, %xmm9 + subps %xmm1, %xmm9 + + movaps -24 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 + + movaps %xmm11, %xmm3 + pshufd $0xb1, %xmm11, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm9 + subps %xmm1, %xmm9 + + movaps -32 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm8 + + xorps %xmm7, %xmm8 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm8 + + addps %xmm8, %xmm9 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm9, -32 * SIZE(BO) + movaps %xmm11, -28 * SIZE(BO) + + movsd %xmm9, (CO1) + movhps %xmm9, (CO1, LDC) + movsd %xmm11, (CO2) + movhps %xmm11, (CO2, LDC) +#else + movlhps %xmm11, %xmm9 + movlhps %xmm15, %xmm13 + + movaps %xmm9, -32 * SIZE(AO) + movaps %xmm13, -28 * SIZE(AO) + + movlps %xmm9, (CO1) + movlps %xmm11, (CO1, LDC) + movlps %xmm13, (CO2) + movlps %xmm15, (CO2, LDC) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L29: +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + + subq $1, J + BRANCH + jg .L01 + ALIGN_4 + +.L30: + testq $2, N + BRANCH + jle .L50 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif + +#ifdef LT + movq OFFSET, KK +#endif + + movq M, I + sarq $1, I + NOBRANCH + jle .L40 + ALIGN_4 + +.L31: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq AORIG, AO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht2 4 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht2 4 * SIZE(CO2) + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L35 + ALIGN_3 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + ADD1 %xmm1, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm10 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm10 + pshufd $0x1b, %xmm2, %xmm3 + 
mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movaps -24 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm10 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -20 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movaps -20 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm10 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, BO + subq $-16 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L32 + ALIGN_3 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_3 + +.L36: + ADD1 %xmm1, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm10 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L36 + ALIGN_3 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + salq $ZBASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + + ADD1 %xmm1, %xmm8 + ADD2 %xmm2, %xmm9 + ADD1 %xmm3, %xmm10 + ADD2 %xmm4, %xmm11 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(LN) || defined(LT) + +#ifndef CONJ + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm10 +#else + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 +#endif + +#else + +#ifndef CONJ + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm10 +#else + shufps $0xb1, %xmm0, %xmm0 + + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 +#endif + +#endif + + haddps %xmm9, %xmm8 + haddps %xmm11, %xmm10 + + shufps $0xd8, %xmm8, %xmm8 + shufps $0xd8, %xmm10, %xmm10 + + movaps %xmm8, %xmm9 + shufps $0xe4, %xmm10, %xmm8 + shufps $0xe4, %xmm9, %xmm10 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm9 + movlhps %xmm10, %xmm8 + movhlps %xmm9, %xmm10 + + movaps -32 * SIZE(BO), %xmm9 + movaps -28 * SIZE(BO), %xmm11 + + subps %xmm8, %xmm9 + subps %xmm10, %xmm11 +#else + movaps -32 * SIZE(AO), %xmm9 + movaps -28 * SIZE(AO), %xmm11 + + subps %xmm8, %xmm9 + subps %xmm10, %xmm11 +#endif + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + shufps $0xb1, %xmm7, %xmm7 +#endif + +#ifdef LN + movaps -28 * SIZE(AO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + movaps %xmm11, %xmm3 + pshufd $0xb1, %xmm11, %xmm2 + + xorps %xmm7, %xmm2 + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm2 + + subps %xmm3, %xmm9 + subps %xmm2, %xmm9 + + movaps -32 * SIZE(AO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm9 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, 
%xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm9 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + movaps %xmm9, %xmm3 + pshufd $0xb1, %xmm9, %xmm2 + + xorps %xmm7, %xmm2 + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm2 + + subps %xmm3, %xmm11 + subps %xmm2, %xmm11 + + movaps -28 * SIZE(AO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm8 + + xorps %xmm7, %xmm8 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm8 + + addps %xmm8, %xmm9 + + movaps %xmm9, %xmm3 + pshufd $0xb1, %xmm9, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm11 + subps %xmm1, %xmm11 + + movaps -28 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 +#endif + +#ifdef RT + movaps -28 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 + + movaps %xmm11, %xmm3 + pshufd $0xb1, %xmm11, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm9 + subps %xmm1, %xmm9 + + movaps -32 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm8 + + xorps %xmm7, %xmm8 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm8 + + addps %xmm8, %xmm9 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm9, -32 * SIZE(BO) + movaps %xmm11, -28 * SIZE(BO) + + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm11, 2 * SIZE(CO1) + movhps %xmm9, 0 * SIZE(CO2) + movhps %xmm11, 2 * SIZE(CO2) +#else + movaps %xmm9, -32 * SIZE(AO) + movaps %xmm11, -28 * SIZE(AO) + + movsd %xmm9, 0 * SIZE(CO1) + movhps %xmm9, 2 * SIZE(CO1) + movsd %xmm11, 0 * SIZE(CO2) + movhps %xmm11, 2 * SIZE(CO2) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + BRANCH + jg .L31 + ALIGN_4 + +.L40: + testq $1, M + BRANCH + jle .L49 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq AORIG, AO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movaps -32 * SIZE(BO), %xmm5 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L45 + ALIGN_3 + +.L42: + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 
+ mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -30 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -24 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -28 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -20 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -26 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -16 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -24 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, BO + subq $ -8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L42 + ALIGN_3 + +.L45: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L46 + ALIGN_3 + +.L48: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + salq $ZBASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + + ADD1 %xmm1, %xmm8 + ADD2 %xmm2, %xmm9 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(LN) || defined(LT) + +#ifndef CONJ + xorps %xmm0, %xmm9 + shufps $0xb1, %xmm9, %xmm9 +#else + xorps %xmm0, %xmm8 + shufps $0xb1, %xmm9, %xmm9 +#endif + +#else + +#ifndef CONJ + xorps %xmm0, %xmm9 + shufps $0xb1, %xmm9, %xmm9 +#else + shufps $0xb1, %xmm9, %xmm9 + xorps %xmm0, %xmm9 +#endif + +#endif + + addps %xmm9, %xmm8 + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(BO), %xmm9 + + subps %xmm8, %xmm9 +#else + movaps -32 * SIZE(AO), %xmm9 + + subps %xmm8, %xmm9 + movhlps %xmm9, %xmm11 +#endif + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + shufps $0xb1, %xmm7, %xmm7 +#endif + +#if defined(LN) || defined(LT) + movsd -32 * SIZE(AO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm9 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm8 + + xorps %xmm7, %xmm8 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm8 + + addps %xmm8, %xmm9 + + movaps %xmm9, %xmm3 + pshufd $0xb1, %xmm9, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm11 + subps %xmm1, %xmm11 + + movaps -28 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 +#endif + +#ifdef RT + movaps -28 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 + + movaps %xmm11, %xmm3 + pshufd $0xb1, %xmm11, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, 
%xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm9 + subps %xmm1, %xmm9 + + movaps -32 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm8 + + xorps %xmm7, %xmm8 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm8 + + addps %xmm8, %xmm9 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm9, -32 * SIZE(BO) + + movlps %xmm9, (CO1) + movhps %xmm9, (CO2) +#else + movlps %xmm9, -32 * SIZE(AO) + movlps %xmm11, -30 * SIZE(AO) + + movlps %xmm9, (CO1) + movlps %xmm11, (CO2) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L49: +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L50: + testq $1, N + BRANCH + jle .L999 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + + movq C, CO1 +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif + +#ifdef LT + movq OFFSET, KK +#endif + + movq M, I + sarq $1, I + NOBRANCH + jle .L60 + ALIGN_4 + +.L51: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq AORIG, AO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht2 4 * SIZE(CO1) + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L55 + ALIGN_3 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + ADD1 %xmm1, %xmm8 + movddup -32 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movddup -30 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + mulps %xmm0, %xmm2 + movaps -24 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movddup -28 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movddup -26 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + mulps %xmm0, %xmm2 + movaps -16 * SIZE(AO), %xmm0 + + subq $ -8 * SIZE, BO + subq $-16 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L52 + ALIGN_3 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_3 + +.L56: + ADD1 %xmm1, %xmm8 + movddup -32 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L56 + 
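+/* .L58: solve for the 2x1 remainder tile of the single-column      */
+/* block.  For LN/RT, AO and BO are first repositioned (AO from     */
+/* AORIG, BO from B); the partial sums are folded, subtracted from  */
+/* the packed right-hand side, and the result is solved against the */
+/* diagonal block: a 2x2 complex solve from the packed A for LN/LT, */
+/* a single diagonal entry from the packed B for RN/RT (pshufd      */
+/* 0x00 / 0x55 broadcast its real and imaginary parts for the       */
+/* complex multiply).                                               */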
ALIGN_3 + +.L58: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + salq $ZBASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + + ADD1 %xmm1, %xmm8 + ADD2 %xmm2, %xmm9 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(LN) || defined(LT) + +#ifndef CONJ + xorps %xmm0, %xmm8 +#else + xorps %xmm0, %xmm9 +#endif + +#else + +#ifndef CONJ + xorps %xmm0, %xmm8 +#else + shufps $0xb1, %xmm0, %xmm0 + + xorps %xmm0, %xmm9 +#endif + +#endif + + haddps %xmm9, %xmm8 + + shufps $0xd8, %xmm8, %xmm8 + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(BO), %xmm9 + + subps %xmm8, %xmm9 + movhlps %xmm9, %xmm11 +#else + movaps -32 * SIZE(AO), %xmm9 + + subps %xmm8, %xmm9 +#endif + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + shufps $0xb1, %xmm7, %xmm7 +#endif + +#ifdef LN + movaps -28 * SIZE(AO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + movaps %xmm11, %xmm3 + pshufd $0xb1, %xmm11, %xmm2 + + xorps %xmm7, %xmm2 + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm2 + + subps %xmm3, %xmm9 + subps %xmm2, %xmm9 + + movaps -32 * SIZE(AO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm9 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm9 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + movaps %xmm9, %xmm3 + pshufd $0xb1, %xmm9, %xmm2 + + xorps %xmm7, %xmm2 + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm2 + + subps %xmm3, %xmm11 + subps %xmm2, %xmm11 + + movaps -28 * SIZE(AO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 +#endif + +#if defined(RN) || defined(RT) + movaps -32 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm8 + + xorps %xmm7, %xmm8 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm8 + + addps %xmm8, %xmm9 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm9, -32 * SIZE(BO) + movlps %xmm11, -30 * SIZE(BO) + + movlps %xmm9, 0 * SIZE(CO1) + movlps %xmm11, 2 * SIZE(CO1) +#else + movaps %xmm9, -32 * SIZE(AO) + + movlps %xmm9, 0 * SIZE(CO1) + movhps %xmm9, 2 * SIZE(CO1) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + BRANCH + jg .L51 + ALIGN_4 + +.L60: + testq $1, M + BRANCH + jle .L69 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq AORIG, AO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), 
%xmm0 + xorps %xmm2, %xmm2 + movsd -32 * SIZE(BO), %xmm5 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L65 + ALIGN_3 + +.L62: + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movsd -30 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -30 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movsd -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -28 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movsd -26 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -26 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movsd -24 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -24 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, BO + subq $-8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L62 + ALIGN_3 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_3 + +.L66: + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movsd -30 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L66 + ALIGN_3 + +.L68: +#if defined(LN) || defined(RT) + movq KK, %rax + subq $1, %rax + + salq $ZBASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + + ADD1 %xmm1, %xmm8 + ADD2 %xmm2, %xmm9 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(LN) || defined(LT) + +#ifndef CONJ + xorps %xmm0, %xmm9 + shufps $0xb1, %xmm9, %xmm9 +#else + xorps %xmm0, %xmm8 + shufps $0xb1, %xmm9, %xmm9 +#endif + +#else + +#ifndef CONJ + xorps %xmm0, %xmm9 + shufps $0xb1, %xmm9, %xmm9 +#else + shufps $0xb1, %xmm9, %xmm9 + xorps %xmm0, %xmm9 +#endif + +#endif + + addps %xmm9, %xmm8 + +#if defined(LN) || defined(LT) + movsd -32 * SIZE(BO), %xmm9 +#else + movsd -32 * SIZE(AO), %xmm9 +#endif + + subps %xmm8, %xmm9 + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + shufps $0xb1, %xmm7, %xmm7 +#endif + +#if defined(LN) || defined(LT) + movsd -32 * SIZE(AO), %xmm5 +#endif + +#if defined(RN) || defined(RT) + movsd -32 * SIZE(BO), %xmm5 +#endif + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm9 + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm9, -32 * SIZE(BO) +#else + movlps %xmm9, -32 * SIZE(AO) +#endif + + movlps %xmm9, (CO1) + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L69: +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 1), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L999: + 
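+/* Common exit: restore the callee-saved registers spilled in the   */
+/* prologue.  Under WINDOWS_ABI, rdi, rsi and xmm6-xmm15 are also   */
+/* callee-saved in the Microsoft x64 calling convention, so they    */
+/* are reloaded here before the stack is released and we return.    */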
movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/ztrsm_kernel_LT_4x2_sse.S b/kernel/x86_64/ztrsm_kernel_LT_4x2_sse.S new file mode 100644 index 0000000000..7375c34871 --- /dev/null +++ b/kernel/x86_64/ztrsm_kernel_LT_4x2_sse.S @@ -0,0 +1,4004 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define J %r12 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define CO2 %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define POSINV 0(%rsp) +#define OFFSET 16(%rsp) +#define KK 24(%rsp) +#define KKK 32(%rsp) +#define AORIG 40(%rsp) +#define BORIG 48(%rsp) +#define BUFFER 128(%rsp) + +#ifdef OPTERON +#define movsd movlps +#endif + +#if defined(PENTIUM4) || defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(ATOM) || defined(NANO) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHNTA prefetchnta +#define PREFETCHSIZE (8 * 6 + 4) +#endif + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHNTA prefetchnta +#define PREFETCHSIZE (8 * 6 + 4) +#endif + +#ifdef GENERIC +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHNTA prefetchnta +#define PREFETCHSIZE (8 * 6 + 4) +#endif + +#define KERNEL1(xx) \ + mulps %xmm8, %xmm9 ;\ + addps %xmm9, %xmm0 ;\ + movaps 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulps %xmm8, %xmm11 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\ + addps %xmm11, %xmm1 ;\ + movaps 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulps %xmm8, %xmm13 ;\ + mulps 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\ + addps %xmm13, %xmm2 ;\ + movaps 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addps %xmm8, %xmm3 ;\ + movaps 16 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 + +#define KERNEL2(xx) \ + mulps %xmm10, %xmm9 ;\ + addps %xmm9, %xmm4 ;\ + movaps 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulps %xmm10, %xmm11 ;\ + addps %xmm11, %xmm5 ;\ + movaps 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulps %xmm10, %xmm13 ;\ + mulps 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ + addps %xmm13, %xmm6 ;\ + movaps 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addps %xmm10, %xmm7 ;\ + movaps 20 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 + +#define KERNEL3(xx) \ + mulps %xmm12, %xmm15 ;\ + addps %xmm15, %xmm0 ;\ + movaps 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulps %xmm12, %xmm11 ;\ + addps %xmm11, %xmm1 ;\ + movaps 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulps %xmm12, %xmm13 ;\ + mulps 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\ + addps %xmm13, %xmm2 ;\ + movaps 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addps %xmm12, %xmm3 ;\ + movaps 24 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 + +#define KERNEL4(xx) \ + mulps %xmm14, %xmm15 ;\ + addps %xmm15, %xmm4 ;\ + movaps 48 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulps %xmm14, %xmm11 ;\ + addps %xmm11, %xmm5 ;\ + movaps 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulps %xmm14, %xmm13 ;\ + mulps 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ + addps %xmm13, %xmm6 ;\ + movaps 40 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addps %xmm14, %xmm7 ;\ + movaps 28 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 + +#define KERNEL5(xx) \ + mulps %xmm8, %xmm9 ;\ + addps %xmm9, %xmm0 ;\ + 
movaps 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulps %xmm8, %xmm11 ;\ + PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ;\ + addps %xmm11, %xmm1 ;\ + movaps 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulps %xmm8, %xmm13 ;\ + mulps 44 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\ + addps %xmm13, %xmm2 ;\ + movaps 40 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addps %xmm8, %xmm3 ;\ + movaps 32 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 + +#define KERNEL6(xx) \ + mulps %xmm10, %xmm9 ;\ + addps %xmm9, %xmm4 ;\ + movaps 64 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulps %xmm10, %xmm11 ;\ + addps %xmm11, %xmm5 ;\ + movaps 52 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulps %xmm10, %xmm13 ;\ + mulps 44 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ + addps %xmm13, %xmm6 ;\ + movaps 56 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addps %xmm10, %xmm7 ;\ + movaps 36 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 + +#define KERNEL7(xx) \ + mulps %xmm12, %xmm15 ;\ + addps %xmm15, %xmm0 ;\ + movaps 48 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulps %xmm12, %xmm11 ;\ + addps %xmm11, %xmm1 ;\ + movaps 52 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulps %xmm12, %xmm13 ;\ + mulps 60 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\ + addps %xmm13, %xmm2 ;\ + movaps 56 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addps %xmm12, %xmm3 ;\ + movaps 40 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 + +#define KERNEL8(xx) \ + mulps %xmm14, %xmm15 ;\ + addps %xmm15, %xmm4 ;\ + movaps 80 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulps %xmm14, %xmm11 ;\ + addps %xmm11, %xmm5 ;\ + movaps 68 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulps %xmm14, %xmm13 ;\ + mulps 60 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ + addps %xmm13, %xmm6 ;\ + movaps 72 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addps %xmm14, %xmm7 ;\ + movaps 44 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + movsd OLD_OFFSET, %xmm4 +#else + movq OLD_LDC, LDC + movsd OLD_OFFSET, %xmm4 +#endif + + movq %rsp, %rbx # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movq OLD_M, M + movq OLD_N, N + + pxor %xmm15, %xmm15 + cmpeqps %xmm15, %xmm15 + pslld $31, %xmm15 # Generate mask + pxor %xmm2, %xmm2 + +#ifndef CONJ + movss %xmm15, 0 + POSINV + movss %xmm2, 4 + POSINV + movss %xmm15, 8 + POSINV + movss %xmm2, 12 + POSINV +#else + movss %xmm2, 0 + POSINV + movss %xmm15, 4 + POSINV + movss %xmm2, 8 + POSINV + movss %xmm15, 12 + POSINV +#endif + + movlpd %xmm4, OFFSET + movlpd %xmm4, KK + + salq $ZBASE_SHIFT, LDC + +#ifdef LN + movq M, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + movq N, %rax + salq $ZBASE_SHIFT, %rax + imulq K, %rax + addq %rax, B + + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + movq N, J + sarq $1, J # j = (n >> 2) + jle .L40 + ALIGN_4 + +.L01: +#ifdef LN + 
movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 2), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L03 + ALIGN_4 + +.L02: + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + movaps 8 * SIZE(B), %xmm3 + movaps 12 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 32 * SIZE(BO) + movaps %xmm1, 36 * SIZE(BO) + movaps %xmm2, 40 * SIZE(BO) + movaps %xmm3, 44 * SIZE(BO) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm4, 48 * SIZE(BO) + movaps %xmm5, 52 * SIZE(BO) + movaps %xmm6, 56 * SIZE(BO) + movaps %xmm7, 60 * SIZE(BO) + + addq $16 * SIZE, B + addq $64 * SIZE, BO + decq %rax + jne .L02 + ALIGN_4 + +.L03: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L10 + ALIGN_4 + +.L04: + movaps 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + + addq $ 4 * SIZE, B + addq $16 * SIZE, BO + decq %rax + jne .L04 + ALIGN_4 + +.L10: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 + +#ifndef RT + leaq (C, LDC, 2), C +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $2 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 4), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(BO), %xmm9 + movaps 4 * SIZE(BO), %xmm11 + movaps 8 * SIZE(BO), %xmm13 + movaps 16 * SIZE(BO), %xmm15 + + movaps 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movaps 4 * SIZE(AO), %xmm10 + pxor %xmm1, %xmm1 + movaps 8 * SIZE(AO), %xmm12 + pxor %xmm2, %xmm2 + movaps 12 * SIZE(AO), %xmm14 + pxor %xmm3, %xmm3 + + PREFETCHW 7 * SIZE(CO1) + pxor %xmm4, %xmm4 + PREFETCHW 7 * SIZE(CO2) + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-8, %rax + salq $4, %rax + je .L15 +.L1X: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + 
KERNEL8(32 * 0) + KERNEL1(32 * 1) + KERNEL2(32 * 1) + KERNEL3(32 * 1) + KERNEL4(32 * 1) + KERNEL5(32 * 1) + KERNEL6(32 * 1) + KERNEL7(32 * 1) + KERNEL8(32 * 1) + + addq $32 * 2 * SIZE, AO + addq $64 * 2 * SIZE, BO + subq $64 * 2, %rax + jg .L1X + +.L12: + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movaps POSINV, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_4 + +.L16: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 0 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps 8 * SIZE(AO), %xmm8 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm4 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm5 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + mulps 12 * SIZE(BO), %xmm10 + addps %xmm9, %xmm6 + movaps 16 * SIZE(BO), %xmm9 + addps %xmm10, %xmm7 + movaps 12 * SIZE(AO), %xmm10 + + addq $ 8 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L16 + ALIGN_4 + +.L18: + shufps $0xb1, %xmm1, %xmm1 + shufps $0xb1, %xmm3, %xmm3 + shufps $0xb1, %xmm5, %xmm5 + shufps $0xb1, %xmm7, %xmm7 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 + xorps %xmm15, %xmm5 + xorps %xmm15, %xmm7 +#else + xorps %xmm15, %xmm0 + xorps %xmm15, %xmm2 + xorps %xmm15, %xmm4 + xorps %xmm15, %xmm6 +#endif +#else + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 + xorps %xmm15, %xmm5 + xorps %xmm15, %xmm7 +#endif + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm1 + unpcklpd %xmm2, %xmm0 + unpckhpd %xmm2, %xmm1 + + movaps %xmm4, %xmm5 + unpcklpd %xmm6, %xmm4 + unpckhpd %xmm6, %xmm5 + + movaps 0 * SIZE(B), %xmm2 + movaps 4 * SIZE(B), %xmm3 + movaps 8 * SIZE(B), %xmm6 + movaps 12 * SIZE(B), %xmm7 + + subps %xmm0, %xmm2 + subps %xmm1, %xmm3 + subps %xmm4, %xmm6 + subps %xmm5, %xmm7 +#else + movaps 0 * SIZE(AO), %xmm1 + movaps 4 * SIZE(AO), %xmm3 + movaps 8 * SIZE(AO), %xmm5 + movaps 12 * SIZE(AO), %xmm7 + + subps %xmm0, %xmm1 + subps %xmm4, %xmm3 + subps %xmm2, %xmm5 + subps %xmm6, %xmm7 +#endif + +#ifdef LN + movaps 28 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm7 + +#ifndef CONJ + xorps %xmm15, %xmm7 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm7 + addps %xmm0, %xmm7 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm6 + subps %xmm1, %xmm6 + + movaps 24 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm3 + subps %xmm1, %xmm3 + + pshufd $0x44, %xmm8, 
%xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm2 + subps %xmm1, %xmm2 + + movaps 20 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm6 + +#ifndef CONJ + xorps %xmm15, %xmm6 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm6 + addps %xmm0, %xmm6 + + movaps 16 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm3 + subps %xmm1, %xmm3 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm2 + subps %xmm1, %xmm2 + + movaps 8 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm3 + addps %xmm0, %xmm3 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm2 + subps %xmm1, %xmm2 + + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm3 + subps %xmm1, %xmm3 + + movaps 4 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm6 + subps %xmm1, %xmm6 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm7 + subps %xmm1, %xmm7 + + movaps 8 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm3 + addps %xmm0, %xmm3 + + movaps 12 * SIZE(AO), %xmm8 + + pshufd $0x44, 
%xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm6 + subps %xmm1, %xmm6 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm7 + subps %xmm1, %xmm7 + + movaps 20 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm6 + +#ifndef CONJ + xorps %xmm15, %xmm6 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm6 + addps %xmm0, %xmm6 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm7 + subps %xmm1, %xmm7 + + movaps 28 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm7 + +#ifndef CONJ + xorps %xmm15, %xmm7 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm7 + addps %xmm0, %xmm7 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm0 + pshufd $0xf5, %xmm1, %xmm1 + + pshufd $0xa0, %xmm3, %xmm2 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 + xorps %xmm15, %xmm2 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm9, %xmm2 + mulps %xmm10, %xmm1 + mulps %xmm10, %xmm3 + + addps %xmm0, %xmm1 + addps %xmm2, %xmm3 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm0 + pshufd $0xf5, %xmm1, %xmm2 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm6 + +#ifndef CONJ + xorps %xmm15, %xmm2 + xorps %xmm15, %xmm6 +#else + xorps %xmm15, %xmm0 + xorps %xmm15, %xmm4 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm9, %xmm4 + + mulps %xmm10, %xmm2 + mulps %xmm10, %xmm6 + + subps %xmm0, %xmm5 + subps %xmm4, %xmm7 + + subps %xmm2, %xmm5 + subps %xmm6, %xmm7 + + movaps 4 * SIZE(B), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm5, %xmm4 + pshufd $0xf5, %xmm5, %xmm5 + + pshufd $0xa0, %xmm7, %xmm6 + pshufd $0xf5, %xmm7, %xmm7 + +#ifndef CONJ + xorps %xmm15, %xmm5 + xorps %xmm15, %xmm7 +#else + xorps %xmm15, %xmm4 + xorps %xmm15, %xmm6 +#endif + + mulps %xmm9, %xmm4 + mulps %xmm9, %xmm6 + mulps %xmm10, %xmm5 + mulps %xmm10, %xmm7 + + addps %xmm4, %xmm5 + addps %xmm6, %xmm7 +#endif + +#ifdef RT + movaps 4 * SIZE(B), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm5, %xmm0 + pshufd $0xf5, %xmm5, %xmm5 + + pshufd $0xa0, %xmm7, %xmm2 + pshufd $0xf5, %xmm7, %xmm7 + +#ifndef CONJ + xorps %xmm15, %xmm5 + xorps %xmm15, %xmm7 +#else + xorps %xmm15, %xmm0 + xorps %xmm15, %xmm2 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm9, %xmm2 + mulps %xmm10, %xmm5 + mulps %xmm10, %xmm7 + + addps %xmm0, %xmm5 + addps %xmm2, %xmm7 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm5, %xmm0 + pshufd $0xf5, %xmm5, %xmm2 + + pshufd $0xa0, %xmm7, %xmm4 + pshufd $0xf5, %xmm7, %xmm6 + +#ifndef CONJ + xorps %xmm15, %xmm2 + xorps 
%xmm15, %xmm6 +#else + xorps %xmm15, %xmm0 + xorps %xmm15, %xmm4 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm9, %xmm4 + + mulps %xmm10, %xmm2 + mulps %xmm10, %xmm6 + + subps %xmm0, %xmm1 + subps %xmm4, %xmm3 + + subps %xmm2, %xmm1 + subps %xmm6, %xmm3 + + movaps 0 * SIZE(B), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm4 + pshufd $0xf5, %xmm1, %xmm1 + + pshufd $0xa0, %xmm3, %xmm6 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm4 + xorps %xmm15, %xmm6 +#endif + + mulps %xmm9, %xmm4 + mulps %xmm9, %xmm6 + mulps %xmm10, %xmm1 + mulps %xmm10, %xmm3 + + addps %xmm4, %xmm1 + addps %xmm6, %xmm3 + +#endif + +#ifdef LN + subq $8 * SIZE, CO1 + subq $8 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, 0 * SIZE(B) + movaps %xmm3, 4 * SIZE(B) + movaps %xmm6, 8 * SIZE(B) + movaps %xmm7, 12 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm5 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm4, 8 * SIZE(BO) + movaps %xmm5, 12 * SIZE(BO) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm4 + pshufd $0xff, %xmm3, %xmm5 + + movaps %xmm0, 16 * SIZE(BO) + movaps %xmm1, 20 * SIZE(BO) + movaps %xmm4, 24 * SIZE(BO) + movaps %xmm5, 28 * SIZE(BO) + + pshufd $0x00, %xmm6, %xmm0 + pshufd $0x55, %xmm6, %xmm1 + pshufd $0xaa, %xmm6, %xmm4 + pshufd $0xff, %xmm6, %xmm5 + + movaps %xmm0, 32 * SIZE(BO) + movaps %xmm1, 36 * SIZE(BO) + movaps %xmm4, 40 * SIZE(BO) + movaps %xmm5, 44 * SIZE(BO) + + pshufd $0x00, %xmm7, %xmm0 + pshufd $0x55, %xmm7, %xmm1 + pshufd $0xaa, %xmm7, %xmm4 + pshufd $0xff, %xmm7, %xmm5 + + movaps %xmm0, 48 * SIZE(BO) + movaps %xmm1, 52 * SIZE(BO) + movaps %xmm4, 56 * SIZE(BO) + movaps %xmm5, 60 * SIZE(BO) + + movlps %xmm2, 0 * SIZE(CO1) + movlps %xmm3, 2 * SIZE(CO1) + movlps %xmm6, 4 * SIZE(CO1) + movlps %xmm7, 6 * SIZE(CO1) + + movhps %xmm2, 0 * SIZE(CO2) + movhps %xmm3, 2 * SIZE(CO2) + movhps %xmm6, 4 * SIZE(CO2) + movhps %xmm7, 6 * SIZE(CO2) +#else + movaps %xmm1, 0 * SIZE(AO) + movaps %xmm3, 4 * SIZE(AO) + movaps %xmm5, 8 * SIZE(AO) + movaps %xmm7, 12 * SIZE(AO) + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm3, 4 * SIZE(CO1) + movhps %xmm3, 6 * SIZE(CO1) + + movlps %xmm5, 0 * SIZE(CO2) + movhps %xmm5, 2 * SIZE(CO2) + movlps %xmm7, 4 * SIZE(CO2) + movhps %xmm7, 6 * SIZE(CO2) +#endif + + +#ifndef LN + addq $8 * SIZE, CO1 + addq $8 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $16 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L11 + ALIGN_4 + +.L20: + testq $2, M + je .L30 + +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + movaps 32 * SIZE(AO), %xmm12 + movaps 48 * SIZE(AO), %xmm14 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * 
SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L25 + ALIGN_4 + +.L22: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps 4 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + mulps 28 * SIZE(BO), %xmm8 + addps %xmm11, %xmm2 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm8, %xmm3 + movaps 8 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm13 + addps %xmm13, %xmm0 + movaps 36 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm1 + movaps 40 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + mulps 44 * SIZE(BO), %xmm8 + addps %xmm13, %xmm2 + movaps 96 * SIZE(BO), %xmm13 + addps %xmm8, %xmm3 + movaps 12 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm15 + addps %xmm15, %xmm0 + movaps 52 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm1 + movaps 56 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + mulps 60 * SIZE(BO), %xmm8 + addps %xmm15, %xmm2 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm8, %xmm3 + movaps 32 * SIZE(AO), %xmm8 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm10, %xmm9 + addps %xmm9, %xmm0 + movaps 68 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm1 + movaps 72 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + mulps 76 * SIZE(BO), %xmm10 + addps %xmm9, %xmm2 + movaps 128 * SIZE(BO), %xmm9 + addps %xmm10, %xmm3 + movaps 20 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movaps 84 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm1 + movaps 88 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + mulps 92 * SIZE(BO), %xmm10 + addps %xmm11, %xmm2 + movaps 144 * SIZE(BO), %xmm11 + addps %xmm10, %xmm3 + movaps 24 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movaps 100 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm1 + movaps 104 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + mulps 108 * SIZE(BO), %xmm10 + addps %xmm13, %xmm2 + movaps 160 * SIZE(BO), %xmm13 + addps %xmm10, %xmm3 + movaps 28 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movaps 116 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm1 + movaps 120 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + mulps 124 * SIZE(BO), %xmm10 + addps %xmm15, %xmm2 + movaps 176 * SIZE(BO), %xmm15 + addps %xmm10, %xmm3 + movaps 48 * SIZE(AO), %xmm10 + + addq $32 * SIZE, AO + addq $128 * SIZE, BO + + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movaps POSINV, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 16 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps 4 * SIZE(AO), %xmm8 + + addq $ 4 * SIZE, AO # aoffset += 4 + addq $16 * 
SIZE, BO # boffset1 += 8 + decq %rax + jg .L26 + ALIGN_4 + +.L28: + shufps $0xb1, %xmm1, %xmm1 + shufps $0xb1, %xmm3, %xmm3 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 + xorps %xmm15, %xmm2 +#endif +#else + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 +#endif + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm1 + unpcklpd %xmm2, %xmm0 + unpckhpd %xmm2, %xmm1 + + movaps 0 * SIZE(B), %xmm2 + movaps 4 * SIZE(B), %xmm3 + + subps %xmm0, %xmm2 + subps %xmm1, %xmm3 +#else + movaps 0 * SIZE(AO), %xmm1 + movaps 4 * SIZE(AO), %xmm5 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm5 +#endif + + +#ifdef LN + movaps 4 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm3 + addps %xmm0, %xmm3 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm2 + subps %xmm1, %xmm2 + + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm3 + subps %xmm1, %xmm3 + + movaps 4 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm3 + addps %xmm0, %xmm3 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm0 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + + addps %xmm0, %xmm1 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm0 + pshufd $0xf5, %xmm1, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + + subps %xmm0, %xmm5 + subps %xmm2, %xmm5 + + movaps 4 * SIZE(B), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm5, %xmm4 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm15, %xmm5 +#else + xorps %xmm15, 
%xmm4 +#endif + + mulps %xmm9, %xmm4 + mulps %xmm10, %xmm5 + + addps %xmm4, %xmm5 +#endif + +#ifdef RT + movaps 4 * SIZE(B), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm5, %xmm0 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm15, %xmm5 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm5 + + addps %xmm0, %xmm5 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm5, %xmm0 + pshufd $0xf5, %xmm5, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm1 + + movaps 0 * SIZE(B), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm4 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm4 +#endif + + mulps %xmm9, %xmm4 + mulps %xmm10, %xmm1 + + addps %xmm4, %xmm1 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, 0 * SIZE(B) + movaps %xmm3, 4 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm5 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm4, 8 * SIZE(BO) + movaps %xmm5, 12 * SIZE(BO) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm4 + pshufd $0xff, %xmm3, %xmm5 + + movaps %xmm0, 16 * SIZE(BO) + movaps %xmm1, 20 * SIZE(BO) + movaps %xmm4, 24 * SIZE(BO) + movaps %xmm5, 28 * SIZE(BO) + + movlps %xmm2, 0 * SIZE(CO1) + movlps %xmm3, 2 * SIZE(CO1) + movhps %xmm2, 0 * SIZE(CO2) + movhps %xmm3, 2 * SIZE(CO2) +#else + movaps %xmm1, 0 * SIZE(AO) + movaps %xmm5, 4 * SIZE(AO) + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + + movlps %xmm5, 0 * SIZE(CO2) + movhps %xmm5, 2 * SIZE(CO2) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + testq $1, M + je .L39 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movlps 0 * SIZE(AO), %xmm8 + movhps 2 * SIZE(AO), %xmm8 + movlps 8 * SIZE(AO), %xmm10 + movhps 10 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L35 + ALIGN_4 + +.L32: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movaps 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 2 * SIZE(AO), 
%xmm8 + addps %xmm9, %xmm3 + movaps 64 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movaps 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd 4 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movaps 80 * SIZE(BO), %xmm11 + + mulps %xmm8, %xmm13 + addps %xmm13, %xmm0 + movaps 36 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm1 + movaps 40 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm2 + movaps 44 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + movsd 6 * SIZE(AO), %xmm8 + addps %xmm13, %xmm3 + movaps 96 * SIZE(BO), %xmm13 + + mulps %xmm8, %xmm15 + addps %xmm15, %xmm0 + movaps 52 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm1 + movaps 56 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm2 + movaps 60 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + movsd 16 * SIZE(AO), %xmm8 + addps %xmm15, %xmm3 + movaps 112 * SIZE(BO), %xmm15 + + mulps %xmm10, %xmm9 + addps %xmm9, %xmm0 + movaps 68 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm1 + movaps 72 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm2 + movaps 76 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movsd 10 * SIZE(AO), %xmm10 + addps %xmm9, %xmm3 + movaps 128 * SIZE(BO), %xmm9 + + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movaps 84 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm1 + movaps 88 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm2 + movaps 92 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movsd 12 * SIZE(AO), %xmm10 + addps %xmm11, %xmm3 + movaps 144 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movaps 100 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm1 + movaps 104 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movaps 108 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd 14 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movaps 160 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movaps 116 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm1 + movaps 120 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movaps 124 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd 24 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movaps 176 * SIZE(BO), %xmm15 + + addq $16 * SIZE, AO + addq $128 * SIZE, BO + + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movaps POSINV, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movaps 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movaps 16 * SIZE(BO), %xmm9 + + + addq $ 2 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L36 + ALIGN_4 + +.L38: + shufps $0xb1, %xmm1, %xmm1 + shufps $0xb1, %xmm3, %xmm3 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 + xorps %xmm15, %xmm2 +#endif +#else + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 +#endif + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + 
leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + unpcklpd %xmm2, %xmm0 + + movaps 0 * SIZE(B), %xmm2 + + subps %xmm0, %xmm2 +#else +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(AO), %xmm1 +#ifdef movsd + xorps %xmm5, %xmm5 +#endif + movsd 2 * SIZE(AO), %xmm5 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm5 +#endif + + +#ifdef LN + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm0 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + + addps %xmm0, %xmm1 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm0 + pshufd $0xf5, %xmm1, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + + subps %xmm0, %xmm5 + subps %xmm2, %xmm5 + + movaps 4 * SIZE(B), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm5, %xmm4 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm15, %xmm5 +#else + xorps %xmm15, %xmm4 +#endif + + mulps %xmm9, %xmm4 + mulps %xmm10, %xmm5 + + addps %xmm4, %xmm5 +#endif + +#ifdef RT + movaps 4 * SIZE(B), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm5, %xmm0 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm15, %xmm5 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm5 + + addps %xmm0, %xmm5 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm5, %xmm0 + pshufd $0xf5, %xmm5, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm1 + + movaps 0 * SIZE(B), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm4 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm4 +#endif + + mulps %xmm9, %xmm4 + mulps %xmm10, %xmm1 + + addps %xmm4, %xmm1 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, 0 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm5 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm4, 8 * SIZE(BO) + movaps %xmm5, 12 * SIZE(BO) + + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 0 * SIZE(CO2) +#else + movlps %xmm1, 0 * SIZE(AO) + movlps %xmm5, 2 * SIZE(AO) + + movlps %xmm1, 0 * SIZE(CO1) + movlps %xmm5, 0 * SIZE(CO2) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq 
$ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L39: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2 * COMPSIZE), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 2 * COMPSIZE), B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + + decq J # j -- + jg .L01 + ALIGN_4 + +.L40: + testq $1, N + je .L999 + ALIGN_4 + +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L43 + ALIGN_4 + +.L42: + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO + decq %rax + jne .L42 + ALIGN_4 + +.L43: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L50 + ALIGN_4 + +.L44: + movsd 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + + addq $2 * SIZE, B + addq $8 * SIZE, BO + decq %rax + jne .L44 + ALIGN_4 + +.L50: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + subq LDC, C +#endif + + movq C, CO1 # coffset1 = c + +#ifndef RT + addq LDC, C +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#ifdef LN + movq K, %rax + salq $2 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 4), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + movaps 32 * SIZE(AO), %xmm12 + movaps 48 * SIZE(AO), %xmm14 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + PREFETCHW 4 * SIZE(CO1) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L55 + ALIGN_4 + +.L52: + mulps %xmm8, %xmm9 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 0 * SIZE(BO), %xmm9 + addps 
%xmm8, %xmm1 + movaps 4 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps 8 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 12 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps 64 * SIZE(AO), %xmm8 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm10, %xmm11 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm11, %xmm0 + movaps 16 * SIZE(BO), %xmm11 + addps %xmm10, %xmm1 + movaps 20 * SIZE(AO), %xmm10 + mulps %xmm10, %xmm11 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm11, %xmm4 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm10, %xmm5 + movaps 24 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm11 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm11, %xmm0 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm10, %xmm1 + movaps 28 * SIZE(AO), %xmm10 + mulps %xmm10, %xmm11 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm11, %xmm4 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm10, %xmm5 + movaps 80 * SIZE(AO), %xmm10 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) +#endif + mulps %xmm12, %xmm13 + mulps 36 * SIZE(BO), %xmm12 + addps %xmm13, %xmm0 + movaps 32 * SIZE(BO), %xmm13 + addps %xmm12, %xmm1 + movaps 36 * SIZE(AO), %xmm12 + mulps %xmm12, %xmm13 + mulps 36 * SIZE(BO), %xmm12 + addps %xmm13, %xmm4 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm12, %xmm5 + movaps 40 * SIZE(AO), %xmm12 + + mulps %xmm12, %xmm13 + mulps 44 * SIZE(BO), %xmm12 + addps %xmm13, %xmm0 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm12, %xmm1 + movaps 44 * SIZE(AO), %xmm12 + mulps %xmm12, %xmm13 + mulps 44 * SIZE(BO), %xmm12 + addps %xmm13, %xmm4 + movaps 96 * SIZE(BO), %xmm13 + addps %xmm12, %xmm5 + movaps 96 * SIZE(AO), %xmm12 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) +#endif + mulps %xmm14, %xmm15 + mulps 52 * SIZE(BO), %xmm14 + addps %xmm15, %xmm0 + movaps 48 * SIZE(BO), %xmm15 + addps %xmm14, %xmm1 + movaps 52 * SIZE(AO), %xmm14 + mulps %xmm14, %xmm15 + mulps 52 * SIZE(BO), %xmm14 + addps %xmm15, %xmm4 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm14, %xmm5 + movaps 56 * SIZE(AO), %xmm14 + + mulps %xmm14, %xmm15 + mulps 60 * SIZE(BO), %xmm14 + addps %xmm15, %xmm0 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm14, %xmm1 + movaps 60 * SIZE(AO), %xmm14 + mulps %xmm14, %xmm15 + mulps 60 * SIZE(BO), %xmm14 + addps %xmm15, %xmm4 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm14, %xmm5 + movaps 112 * SIZE(AO), %xmm14 + + addq $64 * SIZE, AO + addq $64 * SIZE, BO + + + decq %rax + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movaps POSINV, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 0 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 4 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps 8 * SIZE(AO), %xmm8 + + addq $ 8 * SIZE, AO # aoffset += 4 + addq $ 8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L56 + ALIGN_4 + +.L58: + shufps $0xb1, %xmm1, %xmm1 + shufps $0xb1, %xmm5, %xmm5 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm5 
+#else + xorps %xmm15, %xmm0 + xorps %xmm15, %xmm4 +#endif +#else + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm5 +#endif + + addps %xmm1, %xmm0 + addps %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm1 + unpcklpd %xmm2, %xmm0 + unpckhpd %xmm2, %xmm1 + + movaps %xmm4, %xmm5 + unpcklpd %xmm6, %xmm4 + unpckhpd %xmm6, %xmm5 + +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd 0 * SIZE(B), %xmm2 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd 2 * SIZE(B), %xmm3 +#ifdef movsd + xorps %xmm6, %xmm6 +#endif + movsd 4 * SIZE(B), %xmm6 +#ifdef movsd + xorps %xmm7, %xmm7 +#endif + movsd 6 * SIZE(B), %xmm7 + + subps %xmm0, %xmm2 + subps %xmm1, %xmm3 + subps %xmm4, %xmm6 + subps %xmm5, %xmm7 +#else + movaps 0 * SIZE(AO), %xmm1 + movaps 4 * SIZE(AO), %xmm3 + + subps %xmm0, %xmm1 + subps %xmm4, %xmm3 +#endif + +#ifdef LN + movaps 28 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm7 + +#ifndef CONJ + xorps %xmm15, %xmm7 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm7 + addps %xmm0, %xmm7 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm6 + subps %xmm1, %xmm6 + + movaps 24 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm3 + subps %xmm1, %xmm3 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm2 + subps %xmm1, %xmm2 + + movaps 20 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm6 + +#ifndef CONJ + xorps %xmm15, %xmm6 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm6 + addps %xmm0, %xmm6 + + movaps 16 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm3 + subps %xmm1, %xmm3 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm2 + subps %xmm1, %xmm2 + + movaps 8 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm3 + addps %xmm0, %xmm3 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, 
%xmm0 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm2 + subps %xmm1, %xmm2 + + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm3 + subps %xmm1, %xmm3 + + movaps 4 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm6 + subps %xmm1, %xmm6 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm7 + subps %xmm1, %xmm7 + + movaps 8 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm3 + addps %xmm0, %xmm3 + + movaps 12 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm6 + subps %xmm1, %xmm6 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm7 + subps %xmm1, %xmm7 + + movaps 20 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm6 + +#ifndef CONJ + xorps %xmm15, %xmm6 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm6 + addps %xmm0, %xmm6 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm7 + subps %xmm1, %xmm7 + + movaps 28 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm7 + +#ifndef CONJ + xorps %xmm15, %xmm7 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm7 + addps %xmm0, %xmm7 +#endif + +#if defined(RN) || defined(RT) + movaps 0 * SIZE(B), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd 
$0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm4 + pshufd $0xf5, %xmm1, %xmm1 + + pshufd $0xa0, %xmm3, %xmm6 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm4 + xorps %xmm15, %xmm6 +#endif + + mulps %xmm9, %xmm4 + mulps %xmm9, %xmm6 + mulps %xmm10, %xmm1 + mulps %xmm10, %xmm3 + + addps %xmm4, %xmm1 + addps %xmm6, %xmm3 +#endif + +#ifdef LN + subq $8 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(B) + movlps %xmm3, 2 * SIZE(B) + movlps %xmm6, 4 * SIZE(B) + movlps %xmm7, 6 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + + movaps %xmm0, 8 * SIZE(BO) + movaps %xmm1, 12 * SIZE(BO) + + pshufd $0x00, %xmm6, %xmm0 + pshufd $0x55, %xmm6, %xmm1 + + movaps %xmm0, 16 * SIZE(BO) + movaps %xmm1, 20 * SIZE(BO) + + pshufd $0x00, %xmm7, %xmm0 + pshufd $0x55, %xmm7, %xmm1 + + movaps %xmm0, 24 * SIZE(BO) + movaps %xmm1, 28 * SIZE(BO) + + movlps %xmm2, 0 * SIZE(CO1) + movlps %xmm3, 2 * SIZE(CO1) + movlps %xmm6, 4 * SIZE(CO1) + movlps %xmm7, 6 * SIZE(CO1) +#else + movaps %xmm1, 0 * SIZE(AO) + movaps %xmm3, 4 * SIZE(AO) + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm3, 4 * SIZE(CO1) + movhps %xmm3, 6 * SIZE(CO1) +#endif + +#ifndef LN + addq $8 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L51 + ALIGN_4 + +.L60: + testq $2, M + je .L70 + +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L65 + ALIGN_4 + +.L62: + mulps %xmm8, %xmm9 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 4 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps 8 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm11 + mulps 20 * SIZE(BO), %xmm8 + addps %xmm11, %xmm0 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm8, %xmm1 + movaps 12 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm11 + mulps 28 * SIZE(BO), %xmm8 + addps %xmm11, %xmm2 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm8, %xmm3 + movaps 32 * SIZE(AO), %xmm8 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm10, %xmm13 + mulps 36 * SIZE(BO), %xmm10 + addps %xmm13, %xmm0 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm10, %xmm1 + movaps 
20 * SIZE(AO), %xmm10 + mulps %xmm10, %xmm13 + mulps 44 * SIZE(BO), %xmm10 + addps %xmm13, %xmm2 + movaps 96 * SIZE(BO), %xmm13 + addps %xmm10, %xmm3 + movaps 24 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm15 + mulps 52 * SIZE(BO), %xmm10 + addps %xmm15, %xmm0 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm10, %xmm1 + movaps 28 * SIZE(AO), %xmm10 + mulps %xmm10, %xmm15 + mulps 60 * SIZE(BO), %xmm10 + addps %xmm15, %xmm2 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm10, %xmm3 + movaps 48 * SIZE(AO), %xmm10 + + addq $32 * SIZE, AO + addq $64 * SIZE, BO + + decq %rax + jne .L62 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movaps POSINV, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 4 * SIZE(AO), %xmm8 + + addq $4 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L66 + ALIGN_4 + +.L68: + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + + shufps $0xb1, %xmm1, %xmm1 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif +#else + xorps %xmm15, %xmm1 +#endif + + addps %xmm1, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm1 + unpcklpd %xmm2, %xmm0 + unpckhpd %xmm2, %xmm1 + +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd 0 * SIZE(B), %xmm2 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd 2 * SIZE(B), %xmm3 + + subps %xmm0, %xmm2 + subps %xmm1, %xmm3 +#else + movaps 0 * SIZE(AO), %xmm1 + subps %xmm0, %xmm1 +#endif + +#ifdef LN + movaps 4 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm3 + addps %xmm0, %xmm3 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm2 + subps %xmm1, %xmm2 + + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm3 + subps %xmm1, %xmm3 + + movaps 4 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + 
xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm3 + addps %xmm0, %xmm3 +#endif + +#if defined(RN) || defined(RT) + movaps 0 * SIZE(B), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm4 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm4 +#endif + + mulps %xmm9, %xmm4 + mulps %xmm10, %xmm1 + + addps %xmm4, %xmm1 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(B) + movlps %xmm3, 2 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + + movaps %xmm0, 8 * SIZE(BO) + movaps %xmm1, 12 * SIZE(BO) + + movlps %xmm2, 0 * SIZE(CO1) + movlps %xmm3, 2 * SIZE(CO1) +#else + movaps %xmm1, 0 * SIZE(AO) + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L70: + testq $1, M + je .L79 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movsd 0 * SIZE(AO), %xmm8 + movhps 2 * SIZE(AO), %xmm8 + movsd 8 * SIZE(AO), %xmm10 + movhps 10 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L75 + ALIGN_4 + +.L72: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movaps 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movaps 64 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd 6 * SIZE(AO), %xmm8 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movaps 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd 16 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movaps 80 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movaps 36 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd 10 * SIZE(AO), %xmm10 + addps %xmm13, %xmm1 + movaps 40 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movaps 44 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd 12 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movaps 96 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movaps 52 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd 14 * SIZE(AO), %xmm10 + 
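The CONJ-conditional xorps against the POSINV mask in the blocks above implements conjugation and the sign fix-up between partial products by flipping the IEEE sign bit of selected lanes instead of negating with a multiply. A minimal C sketch of the same idea for a single complex multiply-accumulate; the helper names are illustrative only, and the sketch assumes the usual interleaved (re, im) layout:

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>

    /* Flip the sign of a float by XOR-ing its sign bit, as the kernel does
       with xorps against the POSINV mask (0x80000000 in the chosen lanes). */
    static float flip_sign(float x)
    {
        uint32_t bits;
        memcpy(&bits, &x, sizeof bits);
        bits ^= 0x80000000u;
        memcpy(&x, &bits, sizeof x);
        return x;
    }

    /* c += a * b, optionally conjugating a; the negation is done by the
       mask flip, mirroring the #ifndef CONJ / #else branches above. */
    static void cmacc(float ar, float ai, float br, float bi,
                      int conj, float *cr, float *ci)
    {
        float t_ai = conj ? flip_sign(ai) : ai;   /* conj(a) = (ar, -ai) */
        *cr += ar * br - t_ai * bi;
        *ci += ar * bi + t_ai * br;
    }

    int main(void)
    {
        float cr = 0.0f, ci = 0.0f;
        cmacc(1.0f, 2.0f, 3.0f, 4.0f, 1, &cr, &ci);  /* conj(1+2i)*(3+4i) */
        printf("%g %+gi\n", cr, ci);                 /* expected: 11 -2i  */
        return 0;
    }

Doing the negation as a bitwise XOR lets one vectorized code path serve both the conjugated and non-conjugated cases, selecting the behaviour purely by which register the mask is applied to.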
addps %xmm15, %xmm1 + movaps 56 * SIZE(BO), %xmm15 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movaps 60 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd 24 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movaps 112 * SIZE(BO), %xmm15 + + addq $16 * SIZE, AO + addq $64 * SIZE, BO + + decq %rax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movaps POSINV, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L76 + ALIGN_4 + +.L78: + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + + shufps $0xb1, %xmm1, %xmm1 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif +#else + xorps %xmm15, %xmm1 +#endif + + addps %xmm1, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd 0 * SIZE(B), %xmm2 + + subps %xmm0, %xmm2 +#else +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(AO), %xmm1 + + subps %xmm0, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 +#endif + +#if defined(RN) || defined(RT) + movaps 0 * SIZE(B), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm4 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm4 +#endif + + mulps %xmm9, %xmm4 + mulps %xmm10, %xmm1 + + addps %xmm4, %xmm1 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + + movlps %xmm2, 0 * SIZE(CO1) +#else + movlps %xmm1, 0 * SIZE(AO) + + movlps %xmm1, 0 * SIZE(CO1) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $2 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L79: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, COMPSIZE), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, COMPSIZE), B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L999: + movq %rbx, %rsp + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), 
%xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/ztrsm_kernel_RT_1x4_nehalem.S b/kernel/x86_64/ztrsm_kernel_RT_1x4_nehalem.S new file mode 100644 index 0000000000..451aafad7f --- /dev/null +++ b/kernel/x86_64/ztrsm_kernel_RT_1x4_nehalem.S @@ -0,0 +1,1586 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define KK %rdx +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define J 56(%rsp) +#define KKK 64(%rsp) +#define AORIG 72(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define J 232(%rsp) +#define KKK 240(%rsp) +#define AORIG 248(%rsp) + +#endif + +#define PREFETCHSIZE (8 * 1 + 2) +#define PREFETCH prefetcht0 + +#define ADD1 addpd +#define ADD2 addpd + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C +#endif + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + salq $ZBASE_SHIFT, LDC + + movq KK, OFFSET + negq KK + +#ifdef LN + movq M, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + movq N, %rax + salq $ZBASE_SHIFT, %rax + imulq K, %rax + addq %rax, B + + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RT + movq N, KK + subq OFFSET, KK +#endif + + testq M, M + jle .L999 + + testq $1, N + BRANCH + jle .L20 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + movq C, CO1 +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif + +#ifdef LT + movq OFFSET, KK +#endif + + movq M, I + ALIGN_4 + +.L31: +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq AORIG, AO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + + xorps %xmm8, %xmm8 + prefetcht0 2 * SIZE(CO1) + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L35 + ALIGN_3 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + ADD1 %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm10 + movaps -14 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm11 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd 
%xmm0, %xmm2 + movaps -12 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -10 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm10 + movaps -10 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm11 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -8 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, AO + subq $-8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L32 + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + ALIGN_3 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + je .L38 + ALIGN_3 + +.L36: + ADD1 %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L36 + ALIGN_3 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax + subq $1, %rax + + salq $ZBASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + + ADD1 %xmm1, %xmm8 + ADD2 %xmm2, %xmm9 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(LN) || defined(LT) + +#ifndef CONJ + shufps $0x40, %xmm0, %xmm0 + xorps %xmm0, %xmm8 +#else + shufps $0x40, %xmm0, %xmm0 + xorps %xmm0, %xmm9 +#endif + +#else + +#ifndef CONJ + shufps $0x40, %xmm0, %xmm0 + xorps %xmm0, %xmm8 +#else + shufps $0x04, %xmm0, %xmm0 + xorps %xmm0, %xmm9 +#endif + +#endif + + haddpd %xmm9, %xmm8 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm11 + + subpd %xmm8, %xmm9 +#else + movapd -16 * SIZE(AO), %xmm9 + movapd -14 * SIZE(AO), %xmm11 + + subpd %xmm8, %xmm9 +#endif + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + shufps $0x04, %xmm7, %xmm7 +#else + shufps $0x40, %xmm7, %xmm7 +#endif + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + addpd %xmm8, %xmm9 +#endif + +#if defined(RN) || defined(RT) + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + xorpd %xmm7, %xmm8 + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + addpd %xmm8, %xmm9 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + decq I + BRANCH + jg .L31 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 1), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L20: + testq $2, N + BRANCH + jle .L30 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif + +#ifdef LT + 
movq OFFSET, KK +#endif + + movq M, I + ALIGN_4 + +.L21: +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq AORIG, AO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht0 2 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht0 2 * SIZE(CO2) + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_3 + +.L22: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + ADD1 %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps -14 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -14 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps -10 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -12 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movaps -8 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps -6 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -10 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movaps -4 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps -2 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -8 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, AO + subq $-16 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L22 + ALIGN_3 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_3 + +.L26: + ADD1 %xmm1, %xmm8 + movaps -16 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps -14 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_3 + +.L28: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + salq $ZBASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + + ADD1 %xmm1, %xmm8 + ADD2 %xmm2, %xmm9 + ADD1 %xmm3, %xmm10 + ADD2 %xmm4, %xmm11 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(LN) || defined(LT) + +#ifndef CONJ + shufps $0x40, %xmm0, %xmm0 + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm10 +#else + shufps $0x40, %xmm0, %xmm0 + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 +#endif + +#else + +#ifndef CONJ + shufps $0x40, %xmm0, %xmm0 + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm10 +#else + shufps $0x04, %xmm0, %xmm0 + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 +#endif + +#endif + + haddpd %xmm9, %xmm8 + haddpd %xmm11, %xmm10 + +#if defined(LN) || 
defined(LT) + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm11 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 +#else + movapd -16 * SIZE(AO), %xmm9 + movapd -14 * SIZE(AO), %xmm11 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 +#endif + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + shufps $0x04, %xmm7, %xmm7 +#else + shufps $0x40, %xmm7, %xmm7 +#endif + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + + addpd %xmm8, %xmm9 + addpd %xmm10, %xmm11 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + xorpd %xmm7, %xmm8 + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + addpd %xmm8, %xmm9 + + movddup -14 * SIZE(BO), %xmm2 + movddup -13 * SIZE(BO), %xmm3 + + pshufd $0x4e, %xmm9, %xmm8 + xorpd %xmm7, %xmm8 + mulpd %xmm9, %xmm2 + mulpd %xmm8, %xmm3 + subpd %xmm2, %xmm11 + subpd %xmm3, %xmm11 + + movddup -10 * SIZE(BO), %xmm0 + movddup -9 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm11, %xmm10 + xorpd %xmm7, %xmm10 + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + addpd %xmm10, %xmm11 +#endif + +#ifdef RT + movddup -10 * SIZE(BO), %xmm0 + movddup -9 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm11, %xmm10 + xorpd %xmm7, %xmm10 + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + addpd %xmm10, %xmm11 + + movddup -12 * SIZE(BO), %xmm2 + movddup -11 * SIZE(BO), %xmm3 + + pshufd $0x4e, %xmm11, %xmm10 + xorpd %xmm7, %xmm10 + mulpd %xmm11, %xmm2 + mulpd %xmm10, %xmm3 + subpd %xmm2, %xmm9 + subpd %xmm3, %xmm9 + + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + xorpd %xmm7, %xmm8 + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + addpd %xmm8, %xmm9 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + movsd %xmm11, 0 * SIZE(CO2) + movhpd %xmm11, 1 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) + movapd %xmm11, -14 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) + movapd %xmm11, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L21 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L30: + movq N, J + sarq $2, J + NOBRANCH + jle .L999 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $2 + ZBASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 2), CO2 +#ifndef RT + leaq (C, LDC, 4), C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif + + movq K, %rax + salq $ZBASE_SHIFT + 2, %rax + leaq (B, %rax), BB + +#ifdef LT + movq OFFSET, KK +#endif + + movq M, I + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + 
movq AORIG, AO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + prefetchnta -16 * SIZE(BB) + subq $-8 * SIZE, BB + + xorps %xmm1, %xmm1 + movaps -16 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht0 2 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht0 2 * SIZE(CO1, LDC) + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + + xorps %xmm12, %xmm12 + prefetcht0 2 * SIZE(CO2) + xorps %xmm13, %xmm13 + prefetcht0 2 * SIZE(CO2, LDC) + xorps %xmm14, %xmm14 + xorps %xmm15, %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + ADD1 %xmm1, %xmm12 + movaps -16 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + movaps -14 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + ADD1 %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps -10 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + ADD1 %xmm1, %xmm12 + movaps -8 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + movaps -14 * SIZE(AO), %xmm0 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + movaps -6 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + ADD1 %xmm1, %xmm8 + movaps -4 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps -2 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + ADD1 %xmm1, %xmm12 + movaps 0 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + movaps -12 * SIZE(AO), %xmm0 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + movaps 2 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + ADD1 %xmm1, %xmm8 + movaps 4 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps 6 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -10 * SIZE(AO), %xmm0 + ADD1 %xmm1, %xmm12 + movaps 8 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + movaps 10 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + ADD1 %xmm1, %xmm8 + movaps 12 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps 14 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + movaps -8 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, AO + subq $-32 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + ADD1 %xmm1, %xmm12 + movaps -16 * SIZE(BO), %xmm1 
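The inner loops of this 1x4 kernel (.L12/.L16 above) never finish a complex product inside the loop: ADD1 accumulates a*b while ADD2 accumulates a*swap(b) (the pshufd $0x4e), and the sign flip plus haddpd after the loop turns the two accumulators into the real and imaginary parts once per tile. A small C sketch of that accumulation split for the non-conjugated case, with illustrative names:

    #include <stdio.h>

    typedef struct { double v[2]; } pair;   /* stands in for one XMM register */

    /* One loop step: keep (ar*br, ai*bi) and (ar*bi, ai*br) in two
       separate accumulators and defer the combine, as ADD1/ADD2 do. */
    static void kernel_step(const double a[2], const double b[2],
                            pair *acc1, pair *acc2)
    {
        acc1->v[0] += a[0] * b[0];   /* acc1 += a * b        */
        acc1->v[1] += a[1] * b[1];
        acc2->v[0] += a[0] * b[1];   /* acc2 += a * swap(b)  */
        acc2->v[1] += a[1] * b[0];
    }

    /* Final combine for the non-conjugated case:
       re = acc1[0] - acc1[1], im = acc2[0] + acc2[1],
       which is what the sign-mask xor followed by haddpd produces. */
    static void combine(const pair *acc1, const pair *acc2,
                        double *re, double *im)
    {
        *re = acc1->v[0] - acc1->v[1];
        *im = acc2->v[0] + acc2->v[1];
    }

    int main(void)
    {
        const double a[2] = {1.0, 2.0}, b[2] = {3.0, 4.0};  /* (1+2i)(3+4i) */
        pair acc1 = {{0, 0}}, acc2 = {{0, 0}};
        kernel_step(a, b, &acc1, &acc2);
        double re, im;
        combine(&acc1, &acc2, &re, &im);
        printf("%g %+gi\n", re, im);   /* expected: -5 +10i */
        return 0;
    }

Deferring the sign handling keeps the hot loop to multiplies, adds and one swap per operand, with no horizontal operations until the tile is finished.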
+ ADD2 %xmm2, %xmm13 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + movaps -14 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm15 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + ADD1 %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0x4e, %xmm1, %xmm2 + mulpd %xmm0, %xmm1 + mulpd %xmm0, %xmm2 + + ADD1 %xmm3, %xmm10 + movaps -10 * SIZE(BO), %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0x4e, %xmm3, %xmm4 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_3 + +.L18: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + salq $ZBASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#endif + + ADD1 %xmm1, %xmm12 + ADD2 %xmm2, %xmm13 + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm15 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(LN) || defined(LT) + +#ifndef CONJ + shufps $0x40, %xmm0, %xmm0 + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm10 + xorps %xmm0, %xmm12 + xorps %xmm0, %xmm14 +#else + shufps $0x40, %xmm0, %xmm0 + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 + xorps %xmm0, %xmm13 + xorps %xmm0, %xmm15 +#endif + +#else + +#ifndef CONJ + shufps $0x40, %xmm0, %xmm0 + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm10 + xorps %xmm0, %xmm12 + xorps %xmm0, %xmm14 +#else + shufps $0x04, %xmm0, %xmm0 + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 + xorps %xmm0, %xmm13 + xorps %xmm0, %xmm15 +#endif + +#endif + + haddpd %xmm9, %xmm8 + haddpd %xmm11, %xmm10 + haddpd %xmm13, %xmm12 + haddpd %xmm15, %xmm14 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm11 + movapd -12 * SIZE(BO), %xmm13 + movapd -10 * SIZE(BO), %xmm15 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm12, %xmm13 + subpd %xmm14, %xmm15 +#else + movapd -16 * SIZE(AO), %xmm9 + movapd -14 * SIZE(AO), %xmm11 + movapd -12 * SIZE(AO), %xmm13 + movapd -10 * SIZE(AO), %xmm15 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm12, %xmm13 + subpd %xmm14, %xmm15 +#endif + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + shufps $0x04, %xmm7, %xmm7 +#else + shufps $0x40, %xmm7, %xmm7 +#endif + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm11, %xmm10 + pshufd $0x4e, %xmm13, %xmm12 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + mulpd %xmm0, %xmm15 + mulpd %xmm1, %xmm14 + + addpd %xmm8, %xmm9 + addpd %xmm10, %xmm11 + addpd %xmm12, %xmm13 + addpd %xmm14, %xmm15 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + xorpd %xmm7, %xmm8 + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + addpd %xmm8, %xmm9 + + movddup -14 * SIZE(BO), %xmm2 + movddup -13 * SIZE(BO), %xmm3 + + pshufd $0x4e, %xmm9, %xmm8 + xorpd %xmm7, %xmm8 + mulpd %xmm9, %xmm2 + mulpd %xmm8, %xmm3 + subpd %xmm2, %xmm11 + subpd %xmm3, %xmm11 + + movddup -12 * SIZE(BO), %xmm0 + movddup -11 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + xorpd %xmm7, %xmm8 + mulpd %xmm9, %xmm0 + mulpd %xmm8, %xmm1 + subpd %xmm0, %xmm13 + subpd %xmm1, %xmm13 + + movddup -10 * SIZE(BO), %xmm2 + movddup -9 * SIZE(BO), %xmm3 + + 
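After the accumulators are folded, the RN/RT branches above perform the per-tile triangular solve against B: each newly solved value is multiplied by the stored diagonal entry (the packed copy of B is believed to hold the inverted diagonal, which would explain why the kernel only multiplies and never divides) and is then scaled by the off-diagonal entries and subtracted from the still-unsolved values. A hedged C sketch of that substitution; the pre-inverted diagonal is an assumption stated here, not something visible in this hunk:

    #include <stdio.h>
    #include <complex.h>

    /* Solve x * B = c in place for one row x, B upper triangular of order n,
       assuming binv[j][j] already holds 1.0 / B[j][j] as the packed TRSM
       copy is believed to provide (off-diagonal entries are stored as-is). */
    static void ztrsm_rn_tile(int n, double complex binv[][4],
                              double complex x[])
    {
        for (int j = 0; j < n; j++) {
            x[j] *= binv[j][j];              /* multiply by inverted diagonal  */
            for (int k = j + 1; k < n; k++)  /* eliminate from later columns   */
                x[k] -= binv[j][k] * x[j];
        }
    }

    int main(void)
    {
        /* 2x2 real-valued check: B = [[2,1],[0,4]], one row of C = [2, 6]. */
        double complex binv[4][4] = {{0}};
        binv[0][0] = 0.5;  binv[0][1] = 1.0;  binv[1][1] = 0.25;
        double complex x[2] = {2.0, 6.0};
        ztrsm_rn_tile(2, binv, x);
        printf("%g %g\n", creal(x[0]), creal(x[1]));  /* expected: 1 1.25 */
        return 0;
    }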
pshufd $0x4e, %xmm9, %xmm8 + xorpd %xmm7, %xmm8 + mulpd %xmm9, %xmm2 + mulpd %xmm8, %xmm3 + subpd %xmm2, %xmm15 + subpd %xmm3, %xmm15 + + movddup -6 * SIZE(BO), %xmm0 + movddup -5 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm11, %xmm10 + xorpd %xmm7, %xmm10 + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + addpd %xmm10, %xmm11 + + movddup -4 * SIZE(BO), %xmm0 + movddup -3 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm11, %xmm10 + xorpd %xmm7, %xmm10 + mulpd %xmm11, %xmm0 + mulpd %xmm10, %xmm1 + subpd %xmm0, %xmm13 + subpd %xmm1, %xmm13 + + movddup -2 * SIZE(BO), %xmm2 + movddup -1 * SIZE(BO), %xmm3 + + pshufd $0x4e, %xmm11, %xmm10 + xorpd %xmm7, %xmm10 + mulpd %xmm11, %xmm2 + mulpd %xmm10, %xmm3 + subpd %xmm2, %xmm15 + subpd %xmm3, %xmm15 + + movddup 4 * SIZE(BO), %xmm0 + movddup 5 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm13, %xmm12 + xorpd %xmm7, %xmm12 + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + addpd %xmm12, %xmm13 + + movddup 6 * SIZE(BO), %xmm2 + movddup 7 * SIZE(BO), %xmm3 + + pshufd $0x4e, %xmm13, %xmm12 + xorpd %xmm7, %xmm12 + mulpd %xmm13, %xmm2 + mulpd %xmm12, %xmm3 + subpd %xmm2, %xmm15 + subpd %xmm3, %xmm15 + + movddup 14 * SIZE(BO), %xmm0 + movddup 15 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm15, %xmm14 + xorpd %xmm7, %xmm14 + mulpd %xmm0, %xmm15 + mulpd %xmm1, %xmm14 + addpd %xmm14, %xmm15 +#endif + +#ifdef RT + movddup 14 * SIZE(BO), %xmm0 + movddup 15 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm15, %xmm14 + xorpd %xmm7, %xmm14 + mulpd %xmm0, %xmm15 + mulpd %xmm1, %xmm14 + addpd %xmm14, %xmm15 + + movddup 12 * SIZE(BO), %xmm2 + movddup 13 * SIZE(BO), %xmm3 + + pshufd $0x4e, %xmm15, %xmm14 + xorpd %xmm7, %xmm14 + mulpd %xmm15, %xmm2 + mulpd %xmm14, %xmm3 + subpd %xmm2, %xmm13 + subpd %xmm3, %xmm13 + + movddup 10 * SIZE(BO), %xmm0 + movddup 11 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm15, %xmm14 + xorpd %xmm7, %xmm14 + mulpd %xmm15, %xmm0 + mulpd %xmm14, %xmm1 + subpd %xmm0, %xmm11 + subpd %xmm1, %xmm11 + + movddup 8 * SIZE(BO), %xmm2 + movddup 9 * SIZE(BO), %xmm3 + + pshufd $0x4e, %xmm15, %xmm14 + xorpd %xmm7, %xmm14 + mulpd %xmm15, %xmm2 + mulpd %xmm14, %xmm3 + subpd %xmm2, %xmm9 + subpd %xmm3, %xmm9 + + movddup 4 * SIZE(BO), %xmm0 + movddup 5 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm13, %xmm12 + xorpd %xmm7, %xmm12 + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + addpd %xmm12, %xmm13 + + movddup 2 * SIZE(BO), %xmm0 + movddup 3 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm13, %xmm12 + xorpd %xmm7, %xmm12 + mulpd %xmm13, %xmm0 + mulpd %xmm12, %xmm1 + subpd %xmm0, %xmm11 + subpd %xmm1, %xmm11 + + movddup 0 * SIZE(BO), %xmm2 + movddup 1 * SIZE(BO), %xmm3 + + pshufd $0x4e, %xmm13, %xmm12 + xorpd %xmm7, %xmm12 + mulpd %xmm13, %xmm2 + mulpd %xmm12, %xmm3 + subpd %xmm2, %xmm9 + subpd %xmm3, %xmm9 + + movddup -6 * SIZE(BO), %xmm0 + movddup -5 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm11, %xmm10 + xorpd %xmm7, %xmm10 + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + addpd %xmm10, %xmm11 + + movddup -8 * SIZE(BO), %xmm2 + movddup -7 * SIZE(BO), %xmm3 + + pshufd $0x4e, %xmm11, %xmm10 + xorpd %xmm7, %xmm10 + mulpd %xmm11, %xmm2 + mulpd %xmm10, %xmm3 + subpd %xmm2, %xmm9 + subpd %xmm3, %xmm9 + + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + xorpd %xmm7, %xmm8 + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + addpd %xmm8, %xmm9 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + movsd %xmm11, 0 * SIZE(CO1, LDC) + movhpd %xmm11, 1 * SIZE(CO1, LDC) + movsd %xmm13, 0 * SIZE(CO2) + movhpd %xmm13, 1 * 
SIZE(CO2) + movsd %xmm15, 0 * SIZE(CO2, LDC) + movhpd %xmm15, 1 * SIZE(CO2, LDC) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) + movapd %xmm11, -14 * SIZE(BO) + movapd %xmm13, -12 * SIZE(BO) + movapd %xmm15, -10 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) + movapd %xmm11, -14 * SIZE(AO) + movapd %xmm13, -12 * SIZE(AO) + movapd %xmm15, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I + BRANCH + jg .L11 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + + subq $1, J + BRANCH + jg .L01 + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/ztrsm_kernel_RT_2x2_core2.S b/kernel/x86_64/ztrsm_kernel_RT_2x2_core2.S new file mode 100644 index 0000000000..005b65eb7e --- /dev/null +++ b/kernel/x86_64/ztrsm_kernel_RT_2x2_core2.S @@ -0,0 +1,2162 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define POSINV 0(%rsp) +#define J 16(%rsp) +#define OFFSET 24(%rsp) +#define KK 32(%rsp) +#define KKK 40(%rsp) +#define AORIG 48(%rsp) +#define BORIG 56(%rsp) +#define BUFFER 128(%rsp) + +#define PREFETCH_R (8 * 4 + 0) +#define PREFETCH_W (PREFETCH_R) + +#define PREFETCHSIZE (8 * 17 + 2) +#define PREFETCH prefetcht0 + +#ifndef CONJ +#define NN +#else +#if defined(LN) || defined(LT) +#define CN +#else +#define NC +#endif +#endif + +#define ADD1 addpd +#define ADD2 addpd + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C +#endif + + movq OLD_LDC, LDC + movq OLD_OFFSET, %rax + + movq %rsp, %r15 # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movq %rax, KK + movq %rax, OFFSET + + movq OLD_M, M + movq OLD_N, N + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + pcmpeqb %xmm15, %xmm15 + psllq $63, %xmm15 # Generate mask + pxor %xmm2, %xmm2 + + movlpd %xmm2, 0 + POSINV + movlpd %xmm15, 8 + POSINV + + salq $ZBASE_SHIFT, LDC + +#ifdef LN + movq M, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + movq N, %rax + salq $ZBASE_SHIFT, %rax + imulq K, %rax + addq %rax, B + + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, N + jle .L100 + +.L101: +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L103 + ALIGN_4 + +.L102: + movddup -16 * SIZE(B), %xmm8 + movddup -15 * SIZE(B), %xmm9 + movddup -14 * 
SIZE(B), %xmm10 + movddup -13 * SIZE(B), %xmm11 + movddup -12 * SIZE(B), %xmm12 + movddup -11 * SIZE(B), %xmm13 + movddup -10 * SIZE(B), %xmm14 + movddup -9 * SIZE(B), %xmm15 + + movapd %xmm8, 0 * SIZE(BO) + movapd %xmm9, 2 * SIZE(BO) + movapd %xmm10, 4 * SIZE(BO) + movapd %xmm11, 6 * SIZE(BO) + movapd %xmm12, 8 * SIZE(BO) + movapd %xmm13, 10 * SIZE(BO) + movapd %xmm14, 12 * SIZE(BO) + movapd %xmm15, 14 * SIZE(BO) + + addq $ 8 * SIZE, B + subq $-16 * SIZE, BO + decq %rax + jne .L102 + ALIGN_4 + +.L103: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L105 + ALIGN_4 + +.L104: + movddup -16 * SIZE(B), %xmm8 + movddup -15 * SIZE(B), %xmm9 + + movapd %xmm8, 0 * SIZE(BO) + movapd %xmm9, 2 * SIZE(BO) + + addq $4 * SIZE, BO + addq $2 * SIZE, B + decq %rax + jne .L104 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + subq LDC, C +#endif + + movq C, CO1 +#ifndef RT + addq LDC, C +#endif + + movq M, I + sarq $1, I # i = (m >> 2) + jle .L130 + ALIGN_4 + +.L110: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 + prefetcht0 3 * SIZE(CO1) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L112 + +.L111: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + + movapd -16 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -14 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + + movapd -12 * SIZE(AO), %xmm0 + movapd -10 * SIZE(AO), %xmm1 + + movapd -12 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -10 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + + movapd -8 * SIZE(AO), %xmm0 + movapd -6 * SIZE(AO), %xmm1 + + movapd -8 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -6 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + + movapd -4 * SIZE(AO), %xmm0 + movapd -2 * SIZE(AO), %xmm1 + + movapd -4 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -2 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + + subq $-16 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jne .L111 + ALIGN_4 + +.L112: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movapd POSINV, %xmm7 + andq $3, %rax # if (k & 1) + BRANCH + jle .L114 + +.L113: + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + + movapd -16 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -14 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + 
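The .L102/.L104 loops above are pure data movement: the current panel of B is copied into the aligned BUFFER with every scalar duplicated (movddup followed by a full-width store), so the multiply loop can issue aligned movapd loads and use each duplicated value directly against a packed column of A without broadcasting inside the hot loop. A short C sketch of that duplication packing, with names chosen for illustration:

    /* Pack n complex (re, im) scalars of b into a buffer where every scalar
       is duplicated into a two-wide lane, mirroring movddup + movapd stores:
       b  = [r0, i0, r1, i1, ...]
       bo = [r0, r0, i0, i0, r1, r1, i1, i1, ...]                            */
    static void pack_dup(const double *b, double *bo, int n)
    {
        for (int k = 0; k < n; k++) {
            bo[4 * k + 0] = b[2 * k + 0];
            bo[4 * k + 1] = b[2 * k + 0];
            bo[4 * k + 2] = b[2 * k + 1];
            bo[4 * k + 3] = b[2 * k + 1];
        }
    }

The buffer costs an extra pass over B and twice the storage, but it removes all broadcast and shuffle work for B from the innermost loop.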
mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L113 + ALIGN_4 + +.L114: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + + SHUFPD_1 %xmm9, %xmm9 + SHUFPD_1 %xmm13, %xmm13 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm7, %xmm9 + xorpd %xmm7, %xmm13 +#else + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm12 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm9, %xmm8 + subpd %xmm13, %xmm12 +#else + addpd %xmm9, %xmm8 + addpd %xmm13, %xmm12 +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm9 + movapd -14 * SIZE(B), %xmm13 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm13 +#else + movapd -16 * SIZE(AO), %xmm9 + movapd -14 * SIZE(AO), %xmm13 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm13 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm7, %xmm7 +#endif + +#ifdef LN + movddup -10 * SIZE(AO), %xmm0 + movddup -9 * SIZE(AO), %xmm1 + movddup -12 * SIZE(AO), %xmm2 + movddup -11 * SIZE(AO), %xmm3 + movddup -16 * SIZE(AO), %xmm4 + movddup -15 * SIZE(AO), %xmm5 + + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + + addpd %xmm12, %xmm13 + + movapd %xmm13, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm12 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm9 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + movddup -14 * SIZE(AO), %xmm2 + movddup -13 * SIZE(AO), %xmm3 + movddup -10 * SIZE(AO), %xmm4 + movddup -9 * SIZE(AO), %xmm5 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 + + movapd %xmm9, %xmm8 + pshufd $0x4e, %xmm9, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm12 + + subpd %xmm8, %xmm13 + subpd %xmm12, %xmm13 + + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm4, %xmm13 + mulpd %xmm5, %xmm12 + + addpd %xmm12, %xmm13 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm12 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + + addpd %xmm8, %xmm9 + addpd %xmm12, %xmm13 +#endif + +#ifdef RT + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm12 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + + addpd %xmm8, %xmm9 + addpd %xmm12, %xmm13 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + movsd %xmm13, 2 * SIZE(CO1) + movhpd %xmm13, 3 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(B) + movapd %xmm13, -14 * SIZE(B) + + movddup %xmm9, 
%xmm8 + unpckhpd %xmm9, %xmm9 + movddup %xmm13, %xmm12 + unpckhpd %xmm13, %xmm13 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) + movapd %xmm12, -12 * SIZE(BO) + movapd %xmm13, -10 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) + movapd %xmm13, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L110 + ALIGN_4 + +.L130: + testq $1, M + jle .L199 + ALIGN_4 + +.L140: +#ifdef LN + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L142 + +.L141: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + movapd -14 * SIZE(AO), %xmm1 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + movapd -12 * SIZE(BO), %xmm4 + movapd -10 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + movapd -12 * SIZE(AO), %xmm0 + movapd -10 * SIZE(AO), %xmm1 + movapd -8 * SIZE(BO), %xmm2 + movapd -6 * SIZE(BO), %xmm3 + movapd -4 * SIZE(BO), %xmm4 + movapd -2 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + subq $ -8 * SIZE, AO + subq $-16 * SIZE, BO + subq $1, %rax + jne .L141 + +.L142: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movapd POSINV, %xmm7 + + andq $3, %rax # if (k & 1) + BRANCH + jle .L144 + +.L143: + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + subq $1, %rax + jg .L143 + ALIGN_4 + +.L144: + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + + SHUFPD_1 %xmm9, %xmm9 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm7, %xmm9 +#else + xorpd %xmm7, %xmm8 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm9, %xmm8 +#else + addpd %xmm9, %xmm8 +#endif + + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm9 + + subpd %xmm8, %xmm9 +#else + movapd -16 * SIZE(AO), %xmm9 + + subpd %xmm8, %xmm9 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm7, 
%xmm7 +#endif + +#ifdef LN + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef RT + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(B) + + movddup %xmm9, %xmm8 + unpckhpd %xmm9, %xmm9 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $2 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L199: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 1 * COMPSIZE), B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L100: + movq N, J + sarq $1, J # j = (n >> 2) + jle .L999 + ALIGN_4 + +.L01: +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq 16 * SIZE + BUFFER, BO + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L03 + + addq %rax, %rax + ALIGN_4 + +.L02: + prefetcht0 (PREFETCH_R + 0) * SIZE(B) + + movddup -16 * SIZE(B), %xmm8 + movddup -15 * SIZE(B), %xmm9 + movddup -14 * SIZE(B), %xmm10 + movddup -13 * SIZE(B), %xmm11 + movddup -12 * SIZE(B), %xmm12 + movddup -11 * SIZE(B), %xmm13 + movddup -10 * SIZE(B), %xmm14 + movddup -9 * SIZE(B), %xmm15 + + prefetcht0 (PREFETCH_W + 0) * SIZE(BO) + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) + movapd %xmm10, -12 * SIZE(BO) + movapd %xmm11, -10 * SIZE(BO) + + prefetcht0 (PREFETCH_W + 8) * SIZE(BO) + + movapd %xmm12, -8 * SIZE(BO) + movapd %xmm13, -6 * SIZE(BO) + movapd %xmm14, -4 * SIZE(BO) + movapd %xmm15, -2 * SIZE(BO) + + addq $ 8 * SIZE, B + subq $-16 * SIZE, BO + decq %rax + jne .L02 + ALIGN_4 + +.L03: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L05 + ALIGN_4 + +.L04: + movddup -16 * SIZE(B), %xmm8 + movddup -15 * SIZE(B), %xmm9 + movddup -14 * SIZE(B), %xmm10 + movddup -13 * SIZE(B), %xmm11 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) + 
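Across all of these kernels, KK records how far along the k dimension the current diagonal block has already been solved: the LT/RN paths accumulate over k < KK and then grow KK, while the LN/RT paths accumulate over k >= KK and shrink it, which is what the OFFSET-based initialisations and the addq/subq adjustments of KK above maintain. A schematic C outline of that bookkeeping, not a transcription of the kernel:

    /* Schematic k-range selection per TRSM variant, as maintained by KK.
       'kk' plays the role of the KK register/stack slot; K is the full depth. */
    enum variant { LN, LT, RN, RT };

    static void k_range(enum variant v, long K, long kk, long *k0, long *k1)
    {
        if (v == LT || v == RN) { *k0 = 0;  *k1 = kk; }  /* already-solved prefix */
        else                    { *k0 = kk; *k1 = K;  }  /* not-yet-solved suffix */
    }

The same two ranges also drive the pointer advances on AO, B and BO that precede each tile.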
movapd %xmm10, -12 * SIZE(BO) + movapd %xmm11, -10 * SIZE(BO) + + addq $ 4 * SIZE, B + addq $ 8 * SIZE, BO + + decq %rax + jne .L04 + ALIGN_4 + +.L05: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 + +#ifndef RT + leaq (C, LDC, 2), C +#endif + + movq M, I + sarq $1, I # i = (m >> 2) + jle .L30 + ALIGN_4 + +.L10: + leaq (PREFETCH_R + 0) * SIZE(B), BB + +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + prefetcht2 0 * SIZE(BB) + +#ifdef LN + pxor %xmm8, %xmm8 + prefetcht1 -3 * SIZE(CO1) + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + prefetcht1 -3 * SIZE(CO2) + pxor %xmm11, %xmm11 +#else + pxor %xmm8, %xmm8 + prefetcht1 3 * SIZE(CO1) + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + prefetcht1 3 * SIZE(CO2) + pxor %xmm11, %xmm11 +#endif + + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 + pxor %xmm14, %xmm14 + pxor %xmm15, %xmm15 + + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + + subq $-8 * SIZE, BB + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L15 + ALIGN_4 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + ADD1 %xmm2, %xmm10 + movapd -16 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm14 + movapd %xmm2, %xmm3 + movapd -14 * SIZE(AO), %xmm1 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + ADD2 %xmm4, %xmm11 + movapd -14 * SIZE(BO), %xmm4 + ADD2 %xmm5, %xmm15 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + movapd -12 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm12 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + ADD2 %xmm4, %xmm9 + movapd -10 * SIZE(BO), %xmm4 + ADD2 %xmm5, %xmm13 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + movapd -12 * SIZE(AO), %xmm0 + ADD1 %xmm2, %xmm10 + movapd -8 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm14 + movapd %xmm2, %xmm3 + movapd -10 * SIZE(AO), %xmm1 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + ADD2 %xmm4, %xmm11 + ADD2 %xmm5, %xmm15 + movapd -6 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + movapd -4 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + movapd -2 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + movapd -8 * SIZE(AO), %xmm0 + ADD1 %xmm2, %xmm10 + movapd 0 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm14 + movapd %xmm2, %xmm3 + movapd -6 * SIZE(AO), %xmm1 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + ADD2 %xmm4, %xmm11 + movapd 2 * SIZE(BO), %xmm4 + ADD2 %xmm5, %xmm15 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + movapd 4 * SIZE(BO), %xmm2 + ADD1 %xmm3, %xmm12 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + ADD2 %xmm4, %xmm9 + movapd 6 * SIZE(BO), %xmm4 + ADD2 %xmm5, %xmm13 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + movapd -4 * SIZE(AO), %xmm0 + ADD1 %xmm2, %xmm10 + ADD1 %xmm3, %xmm14 + movapd 8 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + 
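The unrolled .L12 body multiplies each packed A pair (re, im) by the duplicated real and imaginary parts of B and keeps the two product streams in separate accumulators (the ADD1 and ADD2 targets); combining them into real and imaginary results is deferred to the SHUFPD/XORPD block after the k loop. A scalar C sketch of that split accumulation for one output element, non-conjugated case, with illustrative names:

    #include <stddef.h>

    typedef struct { double re, im; } zdouble;

    /* Accumulate a*Re(b) and a*Im(b) separately, then combine once at the
       end, as the kernel does after its k loop. */
    static zdouble zdot_split(const zdouble *a, const zdouble *b, size_t k)
    {
        double acc_r_re = 0, acc_r_im = 0;   /* products with Re(b) */
        double acc_i_re = 0, acc_i_im = 0;   /* products with Im(b) */

        for (size_t i = 0; i < k; i++) {
            acc_r_re += a[i].re * b[i].re;
            acc_r_im += a[i].im * b[i].re;
            acc_i_re += a[i].re * b[i].im;
            acc_i_im += a[i].im * b[i].im;
        }
        zdouble c = { acc_r_re - acc_i_im, acc_r_im + acc_i_re };
        return c;
    }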
mulpd %xmm0, %xmm2 + movapd -2 * SIZE(AO), %xmm1 + mulpd %xmm1, %xmm3 + ADD2 %xmm4, %xmm11 + movapd 10 * SIZE(BO), %xmm4 + ADD2 %xmm5, %xmm15 + subq $-32 * SIZE, BO + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + movapd -20 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + mulpd %xmm0, %xmm2 + subq $-16 * SIZE, AO + mulpd %xmm1, %xmm3 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + movapd -18 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + subq $1, %rax + BRANCH + BRANCH + jg .L12 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movapd POSINV, %xmm7 + + andq $3, %rax + BRANCH + BRANCH + je .L19 + ALIGN_4 + +.L16: + ADD1 %xmm2, %xmm10 + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm11 + ADD2 %xmm5, %xmm15 + + movapd -16 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -14 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + movapd -16 * SIZE(AO), %xmm0 + mulpd %xmm0, %xmm2 + movapd -14 * SIZE(AO), %xmm1 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD1 %xmm3, %xmm12 + ADD2 %xmm4, %xmm9 + ADD2 %xmm5, %xmm13 + + movapd -12 * SIZE(BO), %xmm2 + movapd %xmm2, %xmm3 + movapd -10 * SIZE(BO), %xmm4 + movapd %xmm4, %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm1, %xmm5 + + addq $4 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L16 + ALIGN_4 + +.L19: + ADD1 %xmm2, %xmm10 + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm11 + ADD2 %xmm5, %xmm15 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + + SHUFPD_1 %xmm9, %xmm9 + SHUFPD_1 %xmm11, %xmm11 + SHUFPD_1 %xmm13, %xmm13 + SHUFPD_1 %xmm15, %xmm15 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm7, %xmm9 + xorpd %xmm7, %xmm11 + xorpd %xmm7, %xmm13 + xorpd %xmm7, %xmm15 +#else + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm9, %xmm8 + subpd %xmm11, %xmm10 + subpd %xmm13, %xmm12 + subpd %xmm15, %xmm14 +#else + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 + addpd %xmm13, %xmm12 + addpd %xmm15, %xmm14 +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm9 + movapd -14 * SIZE(B), %xmm11 + movapd -12 * SIZE(B), %xmm13 + movapd -10 * SIZE(B), %xmm15 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm12, %xmm13 + subpd %xmm14, %xmm15 +#else + movapd -16 * SIZE(AO), %xmm9 + movapd -14 * SIZE(AO), %xmm13 + movapd -12 * SIZE(AO), %xmm11 + movapd -10 * SIZE(AO), %xmm15 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm12, %xmm13 + subpd %xmm14, %xmm15 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm7, %xmm7 +#endif + +#ifdef LN + movddup -10 * SIZE(AO), %xmm0 + movddup -9 * SIZE(AO), %xmm1 + movddup -12 * SIZE(AO), %xmm2 + movddup -11 * SIZE(AO), %xmm3 + movddup -16 * SIZE(AO), %xmm4 + movddup -15 * SIZE(AO), %xmm5 + + pshufd $0x4e, %xmm13, %xmm12 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + mulpd %xmm0, %xmm15 + mulpd %xmm1, %xmm14 + + addpd %xmm12, 
%xmm13 + addpd %xmm14, %xmm15 + + movapd %xmm13, %xmm8 + movapd %xmm15, %xmm10 + pshufd $0x4e, %xmm13, %xmm12 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm2, %xmm8 + mulpd %xmm2, %xmm10 + mulpd %xmm3, %xmm12 + mulpd %xmm3, %xmm14 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm12, %xmm9 + subpd %xmm14, %xmm11 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 + + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm8 + mulpd %xmm4, %xmm11 + mulpd %xmm5, %xmm10 + + addpd %xmm8, %xmm9 + addpd %xmm10, %xmm11 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + movddup -14 * SIZE(AO), %xmm2 + movddup -13 * SIZE(AO), %xmm3 + movddup -10 * SIZE(AO), %xmm4 + movddup -9 * SIZE(AO), %xmm5 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + + addpd %xmm8, %xmm9 + addpd %xmm10, %xmm11 + + movapd %xmm9, %xmm8 + movapd %xmm11, %xmm10 + pshufd $0x4e, %xmm9, %xmm12 + pshufd $0x4e, %xmm11, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm2, %xmm8 + mulpd %xmm2, %xmm10 + mulpd %xmm3, %xmm12 + mulpd %xmm3, %xmm14 + + subpd %xmm8, %xmm13 + subpd %xmm10, %xmm15 + subpd %xmm12, %xmm13 + subpd %xmm14, %xmm15 + + pshufd $0x4e, %xmm13, %xmm12 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm4, %xmm13 + mulpd %xmm5, %xmm12 + mulpd %xmm4, %xmm15 + mulpd %xmm5, %xmm14 + + addpd %xmm12, %xmm13 + addpd %xmm14, %xmm15 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + movddup -14 * SIZE(B), %xmm2 + movddup -13 * SIZE(B), %xmm3 + movddup -10 * SIZE(B), %xmm4 + movddup -9 * SIZE(B), %xmm5 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm12 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + + addpd %xmm8, %xmm9 + addpd %xmm12, %xmm13 + + movapd %xmm9, %xmm8 + movapd %xmm13, %xmm10 + pshufd $0x4e, %xmm9, %xmm12 + pshufd $0x4e, %xmm13, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm2, %xmm8 + mulpd %xmm2, %xmm10 + mulpd %xmm3, %xmm12 + mulpd %xmm3, %xmm14 + + subpd %xmm8, %xmm11 + subpd %xmm10, %xmm15 + subpd %xmm12, %xmm11 + subpd %xmm14, %xmm15 + + pshufd $0x4e, %xmm11, %xmm10 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm10 + xorpd %xmm7, %xmm14 + + mulpd %xmm4, %xmm11 + mulpd %xmm5, %xmm10 + mulpd %xmm4, %xmm15 + mulpd %xmm5, %xmm14 + + addpd %xmm10, %xmm11 + addpd %xmm14, %xmm15 +#endif + +#ifdef RT + movddup -10 * SIZE(B), %xmm0 + movddup -9 * SIZE(B), %xmm1 + movddup -12 * SIZE(B), %xmm2 + movddup -11 * SIZE(B), %xmm3 + movddup -16 * SIZE(B), %xmm4 + movddup -15 * SIZE(B), %xmm5 + + pshufd $0x4e, %xmm11, %xmm10 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm10 + xorpd %xmm7, %xmm14 + + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + mulpd %xmm0, %xmm15 + mulpd %xmm1, %xmm14 + + addpd %xmm10, %xmm11 + addpd %xmm14, %xmm15 + + movapd %xmm11, %xmm8 + movapd %xmm15, %xmm10 + pshufd $0x4e, %xmm11, %xmm12 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm2, %xmm8 + mulpd %xmm2, %xmm10 + mulpd %xmm3, %xmm12 + mulpd %xmm3, %xmm14 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm13 + subpd %xmm12, %xmm9 + subpd %xmm14, %xmm13 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm8 
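Each of the LN/LT/RN/RT branches in this region performs the same 2x2 complex substitution: scale by the diagonal entry of the packed triangular block, eliminate the off-diagonal term, then scale the second unknown; every PSHUFD $0x4e / XORPD / MULPD / ADDPD group is one complex multiply. A C99 sketch of the arithmetic, assuming (as the multiply-only code suggests) that the packing stage supplies the diagonal entries pre-inverted; names and the orientation shown are illustrative, and the conjugated variants are omitted:

    #include <complex.h>

    /* One 2x2 micro-solve: lower-triangular orientation with inverted
       diagonal entries, so each division becomes a complex multiply. */
    static void solve_2x2(double complex t00_inv, double complex t10,
                          double complex t11_inv, double complex x[2])
    {
        x[0] = x[0] * t00_inv;        /* scale by inverted diagonal */
        x[1] -= t10 * x[0];           /* eliminate the off-diagonal */
        x[1] = x[1] * t11_inv;        /* scale by inverted diagonal */
    }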
+ xorpd %xmm7, %xmm12 + + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm8 + mulpd %xmm4, %xmm13 + mulpd %xmm5, %xmm12 + + addpd %xmm8, %xmm9 + addpd %xmm12, %xmm13 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + movsd %xmm13, 2 * SIZE(CO1) + movhpd %xmm13, 3 * SIZE(CO1) + + movsd %xmm11, 0 * SIZE(CO2) + movhpd %xmm11, 1 * SIZE(CO2) + movsd %xmm15, 2 * SIZE(CO2) + movhpd %xmm15, 3 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(B) + movapd %xmm11, -14 * SIZE(B) + movapd %xmm13, -12 * SIZE(B) + movapd %xmm15, -10 * SIZE(B) + + movddup %xmm9, %xmm8 + unpckhpd %xmm9, %xmm9 + movddup %xmm11, %xmm10 + unpckhpd %xmm11, %xmm11 + movddup %xmm13, %xmm12 + unpckhpd %xmm13, %xmm13 + movddup %xmm15, %xmm14 + unpckhpd %xmm15, %xmm15 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) + movapd %xmm10, -12 * SIZE(BO) + movapd %xmm11, -10 * SIZE(BO) + movapd %xmm12, -8 * SIZE(BO) + movapd %xmm13, -6 * SIZE(BO) + movapd %xmm14, -4 * SIZE(BO) + movapd %xmm15, -2 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) + movapd %xmm13, -14 * SIZE(AO) + movapd %xmm11, -12 * SIZE(AO) + movapd %xmm15, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L10 + ALIGN_4 + +.L30: + testq $1, M + jle .L99 + +#ifdef LN + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + addq %rax, AO +#endif + + leaq 16 * SIZE + BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L42 + +.L41: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + movapd -12 * SIZE(BO), %xmm4 + movapd -10 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + movapd -14 * SIZE(AO), %xmm0 + movapd -8 * SIZE(BO), %xmm2 + movapd -6 * SIZE(BO), %xmm3 + movapd -4 * SIZE(BO), %xmm4 + movapd -2 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + movapd -12 * SIZE(AO), %xmm0 + movapd 0 * SIZE(BO), %xmm2 + movapd 2 * SIZE(BO), %xmm3 + movapd 4 * SIZE(BO), %xmm4 + movapd 6 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + movapd -10 * SIZE(AO), %xmm0 + movapd 8 * SIZE(BO), %xmm2 + movapd 10 * SIZE(BO), %xmm3 + movapd 12 * SIZE(BO), %xmm4 + movapd 14 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + ADD1 %xmm2, %xmm8 + 
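After each micro-solve the results are written twice: to the C tile (MOVSD/MOVHPD) and back into the packed operand, either B together with its duplicated copy in BO (left-side cases, via MOVDDUP/UNPCKHPD) or AO, so later tiles of the substitution reuse the updated values. An illustrative C sketch for one solved complex element, left-side case; the function and pointer names are hypothetical:

    #include <complex.h>

    static void store_back(double complex x, double *c, double *b, double *bo)
    {
        c[0]  = creal(x);  c[1]  = cimag(x);   /* MOVSD / MOVHPD to C      */

        b[0]  = creal(x);  b[1]  = cimag(x);   /* MOVAPD back to packed B  */

        bo[0] = creal(x);  bo[1] = creal(x);   /* MOVDDUP: duplicated re   */
        bo[2] = cimag(x);  bo[3] = cimag(x);   /* UNPCKHPD: duplicated im  */
    }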
ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + subq $ -8 * SIZE, AO + subq $-32 * SIZE, BO + subq $1, %rax + jne .L41 + +.L42: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movapd POSINV, %xmm7 + + andq $3, %rax # if (k & 1) + BRANCH + jle .L44 + +.L43: + movapd -16 * SIZE(AO), %xmm0 + movapd -16 * SIZE(BO), %xmm2 + movapd -14 * SIZE(BO), %xmm3 + movapd -12 * SIZE(BO), %xmm4 + movapd -10 * SIZE(BO), %xmm5 + + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm3 + mulpd %xmm0, %xmm4 + mulpd %xmm0, %xmm5 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm3, %xmm9 + ADD1 %xmm4, %xmm10 + ADD2 %xmm5, %xmm11 + + addq $2 * SIZE, AO + addq $8 * SIZE, BO + subq $1, %rax + jg .L43 + ALIGN_4 + +.L44: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq 16 * SIZE + BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + + SHUFPD_1 %xmm9, %xmm9 + SHUFPD_1 %xmm11, %xmm11 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm7, %xmm9 + xorpd %xmm7, %xmm11 +#else + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm9, %xmm8 + subpd %xmm11, %xmm10 +#else + addpd %xmm9, %xmm8 + addpd %xmm11, %xmm10 +#endif + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(B), %xmm9 + movapd -14 * SIZE(B), %xmm11 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 +#else + movapd -16 * SIZE(AO), %xmm9 + movapd -14 * SIZE(AO), %xmm11 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm7, %xmm7 +#endif + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + + addpd %xmm8, %xmm9 + addpd %xmm10, %xmm11 +#endif + +#ifdef RN + movddup -16 * SIZE(B), %xmm0 + movddup -15 * SIZE(B), %xmm1 + movddup -14 * SIZE(B), %xmm2 + movddup -13 * SIZE(B), %xmm3 + movddup -10 * SIZE(B), %xmm4 + movddup -9 * SIZE(B), %xmm5 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 + + movapd %xmm9, %xmm8 + pshufd $0x4e, %xmm9, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm12 + + subpd %xmm8, %xmm11 + subpd %xmm12, %xmm11 + + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm10 + + mulpd %xmm4, %xmm11 + mulpd %xmm5, %xmm10 + + addpd %xmm10, %xmm11 +#endif + +#ifdef RT + movddup -10 * SIZE(B), %xmm0 + movddup -9 * SIZE(B), %xmm1 + movddup -12 * SIZE(B), %xmm2 + movddup -11 * SIZE(B), %xmm3 + movddup -16 * SIZE(B), %xmm4 + movddup -15 * SIZE(B), %xmm5 + + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm10 + + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + + addpd %xmm10, %xmm11 + + movapd %xmm11, %xmm8 + pshufd $0x4e, %xmm11, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm12 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm9 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * 
SIZE(CO1) + + movsd %xmm11, 0 * SIZE(CO2) + movhpd %xmm11, 1 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(B) + movapd %xmm11, -14 * SIZE(B) + + movddup %xmm9, %xmm8 + unpckhpd %xmm9, %xmm9 + movddup %xmm11, %xmm10 + unpckhpd %xmm11, %xmm11 + + movapd %xmm8, -16 * SIZE(BO) + movapd %xmm9, -14 * SIZE(BO) + movapd %xmm10, -12 * SIZE(BO) + movapd %xmm11, -10 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) + movapd %xmm11, -14 * SIZE(AO) + +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L99: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 2 * COMPSIZE), B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + + decq J # j -- + jg .L01 + ALIGN_4 + +.L999: + movq %r15, %rsp + + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/ztrsm_kernel_RT_2x2_penryn.S b/kernel/x86_64/ztrsm_kernel_RT_2x2_penryn.S new file mode 100644 index 0000000000..4ed789a944 --- /dev/null +++ b/kernel/x86_64/ztrsm_kernel_RT_2x2_penryn.S @@ -0,0 +1,2010 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define KK %rdx +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define J 56(%rsp) +#define KKK 64(%rsp) +#define AORIG 72(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define J 232(%rsp) +#define KKK 240(%rsp) +#define AORIG 248(%rsp) + +#endif + +#define PREFETCH_R (8 * 4 + 0) +#define PREFETCHSIZE (8 * 21 + 6) +#define PREFETCH prefetcht0 + +#define ADD1 addpd +#define ADD2 addpd + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C +#endif + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + salq $ZBASE_SHIFT, LDC + + movq KK, OFFSET + negq KK + +#ifdef LN + movq M, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + movq N, %rax + salq $ZBASE_SHIFT, %rax + imulq K, %rax + addq %rax, B + + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RT + movq N, KK + subq OFFSET, KK +#endif + + testq $1, N + BRANCH + jle .L40 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif + +#ifdef LT + movq OFFSET, KK +#endif + + movq M, I + sarq $1, I # i = (m >> 2) + NOBRANCH + jle .L60 + ALIGN_4 + +.L51: +#ifdef LN + 
movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + movaps -14 * SIZE(AO), %xmm1 + movaps -16 * SIZE(BO), %xmm2 + + prefetcht0 3 * SIZE(CO1) + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + pxor %xmm12, %xmm12 + pxor %xmm13, %xmm13 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L55 + ALIGN_4 + +.L52: + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + ADD1 %xmm2, %xmm8 + movaps -14 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm12 + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm13 + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -6 * SIZE(AO), %xmm1 + + ADD1 %xmm2, %xmm8 + movaps -12 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm12 + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm13 + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -2 * SIZE(AO), %xmm1 + + ADD1 %xmm2, %xmm8 + movaps -10 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm12 + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm13 + + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps 0 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 2 * SIZE(AO), %xmm1 + + ADD1 %xmm2, %xmm8 + movaps -8 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm12 + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm13 + + subq $-16 * SIZE, AO + subq $ -8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + ADD1 %xmm2, %xmm8 + movaps -14 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm12 + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm13 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L56 + ALIGN_4 + +.L58: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + pshufd $0x40, %xmm7, %xmm0 + shufps $0x04, %xmm7, %xmm7 + + pxor %xmm0, %xmm8 + pxor %xmm0, %xmm12 +#else +#if defined(LN) || defined(LT) + pshufd $0x40, %xmm7, %xmm0 +#else + pshufd $0x04, %xmm7, %xmm0 +#endif + shufps $0x40, %xmm7, %xmm7 + + pxor %xmm0, %xmm9 + pxor %xmm0, %xmm13 +#endif + + haddpd %xmm9, %xmm8 + haddpd %xmm13, %xmm12 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm13 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm13 +#else + movapd -16 * SIZE(AO), %xmm9 + movapd -14 * SIZE(AO), %xmm13 + + 
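In this Penryn kernel the conjugation mask is built on the fly: PCMPEQB followed by PSLLQ $63 leaves 0x8000000000000000 in each 64-bit lane, PSHUFD picks which lane keeps the sign bit, and PXOR flips the sign of the matching accumulator component; the HADDPD pair that follows folds the split accumulators into (re, im) results. A small C illustration of the XOR-based sign flip; the helper name is hypothetical:

    #include <stdint.h>
    #include <string.h>

    /* Negate a double by toggling its sign bit, the effect of the
       PSLLQ $63 mask plus PXOR above. */
    static double flip_sign(double x)
    {
        uint64_t bits, mask = UINT64_C(1) << 63;
        memcpy(&bits, &x, sizeof bits);
        bits ^= mask;
        memcpy(&x, &bits, sizeof bits);
        return x;
    }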
subpd %xmm8, %xmm9 + subpd %xmm12, %xmm13 +#endif + +#ifdef LN + movddup -10 * SIZE(AO), %xmm0 + movddup -9 * SIZE(AO), %xmm1 + movddup -12 * SIZE(AO), %xmm2 + movddup -11 * SIZE(AO), %xmm3 + movddup -16 * SIZE(AO), %xmm4 + movddup -15 * SIZE(AO), %xmm5 + + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + + addpd %xmm12, %xmm13 + + movapd %xmm13, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm12 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm9 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + movddup -14 * SIZE(AO), %xmm2 + movddup -13 * SIZE(AO), %xmm3 + movddup -10 * SIZE(AO), %xmm4 + movddup -9 * SIZE(AO), %xmm5 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 + + movapd %xmm9, %xmm8 + pshufd $0x4e, %xmm9, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm12 + + subpd %xmm8, %xmm13 + subpd %xmm12, %xmm13 + + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm4, %xmm13 + mulpd %xmm5, %xmm12 + + addpd %xmm12, %xmm13 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm12 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + + addpd %xmm8, %xmm9 + addpd %xmm12, %xmm13 +#endif + +#ifdef RT + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm12 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + + addpd %xmm8, %xmm9 + addpd %xmm12, %xmm13 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + movsd %xmm13, 2 * SIZE(CO1) + movhpd %xmm13, 3 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) + movapd %xmm13, -14 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) + movapd %xmm13, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + decq I + BRANCH + jg .L51 + ALIGN_4 + +.L60: + testq $1, M + BRANCH + jle .L79 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + pxor %xmm8, %xmm8 + pxor %xmm9, %xmm9 + movaps -16 * SIZE(BO), %xmm2 + pxor %xmm10, %xmm10 + pxor %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L65 + ALIGN_4 + +.L62: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -14 * SIZE(AO), %xmm0 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm7, %xmm9 + movaps -14 * SIZE(BO), %xmm2 + + pshufd 
$0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -12 * SIZE(AO), %xmm0 + + ADD1 %xmm2, %xmm10 + ADD2 %xmm7, %xmm11 + movaps -12 * SIZE(BO), %xmm2 + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -10 * SIZE(AO), %xmm0 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm7, %xmm9 + movaps -10 * SIZE(BO), %xmm2 + + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -8 * SIZE(AO), %xmm0 + + ADD1 %xmm2, %xmm10 + ADD2 %xmm7, %xmm11 + movaps -8 * SIZE(BO), %xmm2 + + subq $-8 * SIZE, AO + subq $-8 * SIZE, BO + subq $1, %rax + BRANCH + jg .L62 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm0, %xmm7 + movaps -14 * SIZE(AO), %xmm0 + + ADD1 %xmm2, %xmm8 + ADD2 %xmm7, %xmm9 + movaps -14 * SIZE(BO), %xmm2 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L66 + ALIGN_4 + +.L68: +#if defined(LN) || defined(RT) + movq KK, %rax + subq $1, %rax + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + + addpd %xmm10, %xmm8 + addpd %xmm11, %xmm9 + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + pshufd $0x40, %xmm7, %xmm0 + shufps $0x04, %xmm7, %xmm7 + + pxor %xmm0, %xmm8 +#else +#if defined(LN) || defined(LT) + pshufd $0x40, %xmm7, %xmm0 +#else + pshufd $0x04, %xmm7, %xmm0 +#endif + shufps $0x40, %xmm7, %xmm7 + + pxor %xmm0, %xmm9 +#endif + + haddpd %xmm9, %xmm8 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm9 + + subpd %xmm8, %xmm9 +#else + movapd -16 * SIZE(AO), %xmm9 + + subpd %xmm8, %xmm9 +#endif + +#ifdef LN + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef RT + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) +#endif + + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L79: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L40: + movq N, J + sarq $1, J + NOBRANCH + jle .L999 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef 
RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif + + movq K, %rax + salq $ZBASE_SHIFT + 1, %rax + movq B, BB + subq %rax, BB + +#ifdef LT + movq OFFSET, KK +#endif + + movq M, I + sarq $1, I + NOBRANCH + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + prefetcht2 -16 * SIZE(BB) + subq $-8 * SIZE, BB + + movaps -16 * SIZE(AO), %xmm0 + pxor %xmm3, %xmm3 + movaps -14 * SIZE(AO), %xmm1 + pxor %xmm4, %xmm4 + movaps -16 * SIZE(BO), %xmm2 + + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + +#ifdef LN + prefetcht0 -4 * SIZE(CO1) + movapd %xmm4, %xmm8 + movapd %xmm4, %xmm9 + prefetcht0 -4 * SIZE(CO2) +#else + prefetcht0 3 * SIZE(CO1) + movapd %xmm4, %xmm8 + movapd %xmm4, %xmm9 + prefetcht0 3 * SIZE(CO2) +#endif + movapd %xmm4, %xmm10 + movapd %xmm4, %xmm11 + + movapd %xmm4, %xmm12 + movapd %xmm4, %xmm13 + movapd %xmm4, %xmm14 + movapd %xmm4, %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + ADD1 %xmm3, %xmm12 + movaps -14 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps -12 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + ADD1 %xmm3, %xmm12 + movaps -10 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps -8 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -6 * SIZE(AO), %xmm1 + + ADD1 %xmm3, %xmm12 + movaps -6 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps -4 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -2 * SIZE(AO), %xmm1 + + ADD1 %xmm3, %xmm12 + movaps -2 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, 
%xmm8 + movaps 0 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps 0 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 2 * SIZE(AO), %xmm1 + + ADD1 %xmm3, %xmm12 + movaps 2 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps 4 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps 4 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 6 * SIZE(AO), %xmm1 + + ADD1 %xmm3, %xmm12 + movaps 6 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps 8 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps 8 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 10 * SIZE(AO), %xmm1 + + ADD1 %xmm3, %xmm12 + movaps 10 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps 12 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps 12 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps 14 * SIZE(AO), %xmm1 + + ADD1 %xmm3, %xmm12 + movaps 14 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps 16 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + subq $-32 * SIZE, AO + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -16 * SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -14 * SIZE(AO), %xmm1 + + subq $-32 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + ADD1 %xmm3, %xmm12 + movaps -14 * SIZE(BO), %xmm3 + ADD1 %xmm4, %xmm14 + movaps %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + mulpd %xmm1, %xmm4 + + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + movaps %xmm7, %xmm6 + mulpd %xmm0, %xmm7 + mulpd %xmm1, %xmm6 + + ADD1 %xmm2, %xmm8 + movaps -12 * SIZE(BO), %xmm2 + ADD1 %xmm4, %xmm10 + movaps %xmm3, %xmm4 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + mulpd %xmm1, %xmm4 + + ADD2 %xmm7, %xmm9 + ADD2 %xmm6, %xmm11 + movaps %xmm5, %xmm6 + mulpd %xmm0, %xmm5 + movaps -12 * 
SIZE(AO), %xmm0 + mulpd %xmm1, %xmm6 + movaps -10 * SIZE(AO), %xmm1 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_3 + +.L18: +#if defined(LN) || defined(RT) + movq KK, %rax + subq $2, %rax + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#endif + + ADD1 %xmm3, %xmm12 + pcmpeqb %xmm7, %xmm7 + ADD1 %xmm4, %xmm14 + psllq $63, %xmm7 + ADD2 %xmm5, %xmm13 + ADD2 %xmm6, %xmm15 + +#ifndef CONJ + pshufd $0x40, %xmm7, %xmm0 + shufps $0x04, %xmm7, %xmm7 + + pxor %xmm0, %xmm8 + pxor %xmm0, %xmm10 + pxor %xmm0, %xmm12 + pxor %xmm0, %xmm14 +#else +#if defined(LN) || defined(LT) + pshufd $0x40, %xmm7, %xmm0 +#else + pshufd $0x04, %xmm7, %xmm0 +#endif + shufps $0x40, %xmm7, %xmm7 + + pxor %xmm0, %xmm9 + pxor %xmm0, %xmm11 + pxor %xmm0, %xmm13 + pxor %xmm0, %xmm15 +#endif + + haddpd %xmm9, %xmm8 + haddpd %xmm11, %xmm10 + haddpd %xmm13, %xmm12 + haddpd %xmm15, %xmm14 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm11 + movapd -12 * SIZE(BO), %xmm13 + movapd -10 * SIZE(BO), %xmm15 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm11 + subpd %xmm10, %xmm13 + subpd %xmm14, %xmm15 +#else + movapd -16 * SIZE(AO), %xmm9 + movapd -14 * SIZE(AO), %xmm13 + movapd -12 * SIZE(AO), %xmm11 + movapd -10 * SIZE(AO), %xmm15 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm11 + subpd %xmm10, %xmm13 + subpd %xmm14, %xmm15 +#endif + +#ifdef LN + movddup -10 * SIZE(AO), %xmm0 + movddup -9 * SIZE(AO), %xmm1 + movddup -12 * SIZE(AO), %xmm2 + movddup -11 * SIZE(AO), %xmm3 + movddup -16 * SIZE(AO), %xmm4 + movddup -15 * SIZE(AO), %xmm5 + + pshufd $0x4e, %xmm13, %xmm12 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + mulpd %xmm0, %xmm15 + mulpd %xmm1, %xmm14 + + addpd %xmm12, %xmm13 + addpd %xmm14, %xmm15 + + movapd %xmm13, %xmm8 + movapd %xmm15, %xmm10 + pshufd $0x4e, %xmm13, %xmm12 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm2, %xmm8 + mulpd %xmm2, %xmm10 + mulpd %xmm3, %xmm12 + mulpd %xmm3, %xmm14 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm11 + subpd %xmm12, %xmm9 + subpd %xmm14, %xmm11 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 + + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm8 + mulpd %xmm4, %xmm11 + mulpd %xmm5, %xmm10 + + addpd %xmm8, %xmm9 + addpd %xmm10, %xmm11 +#endif + +#ifdef LT + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + movddup -14 * SIZE(AO), %xmm2 + movddup -13 * SIZE(AO), %xmm3 + movddup -10 * SIZE(AO), %xmm4 + movddup -9 * SIZE(AO), %xmm5 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + + addpd %xmm8, %xmm9 + addpd %xmm10, %xmm11 + + movapd %xmm9, %xmm8 + movapd %xmm11, %xmm10 + pshufd $0x4e, %xmm9, %xmm12 + pshufd $0x4e, %xmm11, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm2, %xmm8 + mulpd %xmm2, %xmm10 + mulpd %xmm3, %xmm12 + mulpd %xmm3, %xmm14 + + subpd %xmm8, %xmm13 + subpd %xmm10, %xmm15 + subpd %xmm12, %xmm13 + subpd %xmm14, %xmm15 + + pshufd $0x4e, %xmm13, %xmm12 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm4, %xmm13 + mulpd %xmm5, %xmm12 + mulpd %xmm4, %xmm15 + mulpd %xmm5, %xmm14 + + addpd %xmm12, %xmm13 + addpd %xmm14, %xmm15 +#endif + +#ifdef RN + movddup -16 * 
SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), %xmm1 + movddup -14 * SIZE(BO), %xmm2 + movddup -13 * SIZE(BO), %xmm3 + movddup -10 * SIZE(BO), %xmm4 + movddup -9 * SIZE(BO), %xmm5 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm12 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm13 + mulpd %xmm1, %xmm12 + + addpd %xmm8, %xmm9 + addpd %xmm12, %xmm13 + + movapd %xmm9, %xmm8 + movapd %xmm13, %xmm10 + pshufd $0x4e, %xmm9, %xmm12 + pshufd $0x4e, %xmm13, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm2, %xmm8 + mulpd %xmm2, %xmm10 + mulpd %xmm3, %xmm12 + mulpd %xmm3, %xmm14 + + subpd %xmm8, %xmm11 + subpd %xmm10, %xmm15 + subpd %xmm12, %xmm11 + subpd %xmm14, %xmm15 + + pshufd $0x4e, %xmm11, %xmm10 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm10 + xorpd %xmm7, %xmm14 + + mulpd %xmm4, %xmm11 + mulpd %xmm5, %xmm10 + mulpd %xmm4, %xmm15 + mulpd %xmm5, %xmm14 + + addpd %xmm10, %xmm11 + addpd %xmm14, %xmm15 +#endif + +#ifdef RT + movddup -10 * SIZE(BO), %xmm0 + movddup -9 * SIZE(BO), %xmm1 + movddup -12 * SIZE(BO), %xmm2 + movddup -11 * SIZE(BO), %xmm3 + movddup -16 * SIZE(BO), %xmm4 + movddup -15 * SIZE(BO), %xmm5 + + pshufd $0x4e, %xmm11, %xmm10 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm10 + xorpd %xmm7, %xmm14 + + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + mulpd %xmm0, %xmm15 + mulpd %xmm1, %xmm14 + + addpd %xmm10, %xmm11 + addpd %xmm14, %xmm15 + + movapd %xmm11, %xmm8 + movapd %xmm15, %xmm10 + pshufd $0x4e, %xmm11, %xmm12 + pshufd $0x4e, %xmm15, %xmm14 + + xorpd %xmm7, %xmm12 + xorpd %xmm7, %xmm14 + + mulpd %xmm2, %xmm8 + mulpd %xmm2, %xmm10 + mulpd %xmm3, %xmm12 + mulpd %xmm3, %xmm14 + + subpd %xmm8, %xmm9 + subpd %xmm10, %xmm13 + subpd %xmm12, %xmm9 + subpd %xmm14, %xmm13 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm13, %xmm12 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm12 + + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm8 + mulpd %xmm4, %xmm13 + mulpd %xmm5, %xmm12 + + addpd %xmm8, %xmm9 + addpd %xmm12, %xmm13 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + movsd %xmm13, 2 * SIZE(CO1) + movhpd %xmm13, 3 * SIZE(CO1) + + movsd %xmm11, 0 * SIZE(CO2) + movhpd %xmm11, 1 * SIZE(CO2) + movsd %xmm15, 2 * SIZE(CO2) + movhpd %xmm15, 3 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) + movapd %xmm11, -14 * SIZE(BO) + movapd %xmm13, -12 * SIZE(BO) + movapd %xmm15, -10 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) + movapd %xmm13, -14 * SIZE(AO) + movapd %xmm11, -12 * SIZE(AO) + movapd %xmm15, -10 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + BRANCH + jg .L11 + ALIGN_4 + +.L20: + testq $1, M + BRANCH + jle .L39 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + movaps -16 * SIZE(AO), %xmm0 + movaps -16 * SIZE(BO), %xmm2 + movaps -14 * SIZE(BO), %xmm3 + + pxor %xmm3, %xmm3 + pxor %xmm5, %xmm5 + + movapd %xmm3, %xmm8 + movapd 
%xmm3, %xmm9 + movapd %xmm3, %xmm12 + movapd %xmm3, %xmm13 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_4 + +.L22: + ADD1 %xmm3, %xmm12 + movaps -14 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + mulpd %xmm0, %xmm2 + ADD2 %xmm5, %xmm13 + mulpd %xmm0, %xmm7 + + ADD1 %xmm2, %xmm8 + movaps -12 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + ADD2 %xmm7, %xmm9 + mulpd %xmm0, %xmm5 + movaps -14 * SIZE(AO), %xmm0 + + ADD1 %xmm3, %xmm12 + movaps -10 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + ADD2 %xmm5, %xmm13 + mulpd %xmm0, %xmm7 + + ADD1 %xmm2, %xmm8 + movaps -8 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + ADD2 %xmm7, %xmm9 + mulpd %xmm0, %xmm5 + movaps -12 * SIZE(AO), %xmm0 + + ADD1 %xmm3, %xmm12 + movaps -6 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + ADD2 %xmm5, %xmm13 + mulpd %xmm0, %xmm7 + + ADD1 %xmm2, %xmm8 + movaps -4 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + ADD2 %xmm7, %xmm9 + mulpd %xmm0, %xmm5 + movaps -10 * SIZE(AO), %xmm0 + + ADD1 %xmm3, %xmm12 + movaps -2 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + ADD2 %xmm5, %xmm13 + mulpd %xmm0, %xmm7 + subq $ -8 * SIZE, AO + + ADD1 %xmm2, %xmm8 + movaps 0 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + ADD2 %xmm7, %xmm9 + mulpd %xmm0, %xmm5 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, BO + subq $1, %rax + BRANCH + jg .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + ADD1 %xmm3, %xmm12 + movaps -14 * SIZE(BO), %xmm3 + pshufd $0x4e, %xmm2, %xmm7 + mulpd %xmm0, %xmm2 + ADD2 %xmm5, %xmm13 + mulpd %xmm0, %xmm7 + + ADD1 %xmm2, %xmm8 + movaps -12 * SIZE(BO), %xmm2 + pshufd $0x4e, %xmm3, %xmm5 + mulpd %xmm0, %xmm3 + ADD2 %xmm7, %xmm9 + mulpd %xmm0, %xmm5 + movaps -14 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_4 + +.L28: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + + ADD1 %xmm3, %xmm12 + pcmpeqb %xmm7, %xmm7 + ADD2 %xmm5, %xmm13 + psllq $63, %xmm7 + +#ifndef CONJ + pshufd $0x40, %xmm7, %xmm0 + shufps $0x04, %xmm7, %xmm7 + + pxor %xmm0, %xmm8 + pxor %xmm0, %xmm12 +#else +#if defined(LN) || defined(LT) + pshufd $0x40, %xmm7, %xmm0 +#else + pshufd $0x04, %xmm7, %xmm0 +#endif + shufps $0x40, %xmm7, %xmm7 + + pxor %xmm0, %xmm9 + pxor %xmm0, %xmm13 +#endif + + haddpd %xmm9, %xmm8 + haddpd %xmm13, %xmm12 + +#if defined(LN) || defined(LT) + movapd -16 * SIZE(BO), %xmm9 + movapd -14 * SIZE(BO), %xmm11 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm11 +#else + movapd -16 * SIZE(AO), %xmm9 + movapd -14 * SIZE(AO), %xmm11 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm11 +#endif + +#if defined(LN) || defined(LT) + movddup -16 * SIZE(AO), %xmm0 + movddup -15 * SIZE(AO), %xmm1 + + pshufd $0x4e, %xmm9, %xmm8 + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm8 + xorpd %xmm7, %xmm10 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + + addpd %xmm8, %xmm9 + addpd %xmm10, %xmm11 +#endif + +#ifdef RN + movddup -16 * SIZE(BO), %xmm0 + movddup -15 * SIZE(BO), 
%xmm1 + movddup -14 * SIZE(BO), %xmm2 + movddup -13 * SIZE(BO), %xmm3 + movddup -10 * SIZE(BO), %xmm4 + movddup -9 * SIZE(BO), %xmm5 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm8 + + addpd %xmm8, %xmm9 + + movapd %xmm9, %xmm8 + pshufd $0x4e, %xmm9, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm12 + + subpd %xmm8, %xmm11 + subpd %xmm12, %xmm11 + + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm10 + + mulpd %xmm4, %xmm11 + mulpd %xmm5, %xmm10 + + addpd %xmm10, %xmm11 +#endif + +#ifdef RT + movddup -10 * SIZE(BO), %xmm0 + movddup -9 * SIZE(BO), %xmm1 + movddup -12 * SIZE(BO), %xmm2 + movddup -11 * SIZE(BO), %xmm3 + movddup -16 * SIZE(BO), %xmm4 + movddup -15 * SIZE(BO), %xmm5 + + pshufd $0x4e, %xmm11, %xmm10 + + xorpd %xmm7, %xmm10 + + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm10 + + addpd %xmm10, %xmm11 + + movapd %xmm11, %xmm8 + pshufd $0x4e, %xmm11, %xmm12 + + xorpd %xmm7, %xmm12 + + mulpd %xmm2, %xmm8 + mulpd %xmm3, %xmm12 + + subpd %xmm8, %xmm9 + subpd %xmm12, %xmm9 + + pshufd $0x4e, %xmm9, %xmm8 + + xorpd %xmm7, %xmm8 + + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm8 + + addpd %xmm8, %xmm9 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + + movsd %xmm9, 0 * SIZE(CO1) + movhpd %xmm9, 1 * SIZE(CO1) + + movsd %xmm11, 0 * SIZE(CO2) + movhpd %xmm11, 1 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movapd %xmm9, -16 * SIZE(BO) + movapd %xmm11, -14 * SIZE(BO) +#else + movapd %xmm9, -16 * SIZE(AO) + movapd %xmm11, -14 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L39: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + + subq $1, J + BRANCH + jg .L01 + ALIGN_4 + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/ztrsm_kernel_RT_2x2_sse2.S b/kernel/x86_64/ztrsm_kernel_RT_2x2_sse2.S new file mode 100644 index 0000000000..3ab9e5be8b --- /dev/null +++ b/kernel/x86_64/ztrsm_kernel_RT_2x2_sse2.S @@ -0,0 +1,2266 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define J %r12 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define CO2 %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define POSINV 0(%rsp) +#define ALPHA_R 16(%rsp) +#define ALPHA_I 32(%rsp) +#define OFFSET 40(%rsp) +#define KK 48(%rsp) +#define KKK 56(%rsp) +#define AORIG 64(%rsp) +#define BORIG 72(%rsp) +#define BUFFER 128(%rsp) + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHNTA prefetchnta +#define PREFETCHSIZE (8 * 6 + 4) +#endif + +#ifdef GENERIC +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHNTA prefetchnta +#define PREFETCHSIZE (8 * 6 + 4) +#endif + +#define KERNEL1(xx) \ + mulpd %xmm8, %xmm9 ;\ + addpd %xmm9, %xmm0 ;\ + movapd 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulpd %xmm8, %xmm11 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\ + addpd %xmm11, %xmm1 ;\ + movapd 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm8, %xmm13 ;\ + mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\ + addpd %xmm13, %xmm2 ;\ + movapd 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm8, %xmm3 ;\ + movapd 8 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 + +#define KERNEL2(xx) \ + mulpd %xmm10, %xmm9 ;\ + addpd %xmm9, %xmm4 ;\ + movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulpd %xmm10, %xmm11 ;\ + addpd %xmm11, %xmm5 ;\ + movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm10, %xmm13 ;\ + mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ + addpd %xmm13, %xmm6 ;\ + movapd 12 * 
SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm10, %xmm7 ;\ + movapd 10 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 + +#define KERNEL3(xx) \ + mulpd %xmm12, %xmm15 ;\ + addpd %xmm15, %xmm0 ;\ + movapd 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulpd %xmm12, %xmm11 ;\ + addpd %xmm11, %xmm1 ;\ + movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm12, %xmm13 ;\ + mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\ + addpd %xmm13, %xmm2 ;\ + movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm12, %xmm3 ;\ + movapd 12 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 + +#define KERNEL4(xx) \ + mulpd %xmm14, %xmm15 ;\ + addpd %xmm15, %xmm4 ;\ + movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulpd %xmm14, %xmm11 ;\ + addpd %xmm11, %xmm5 ;\ + movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm14, %xmm13 ;\ + mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ + addpd %xmm13, %xmm6 ;\ + movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm14, %xmm7 ;\ + movapd 14 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 + +#define KERNEL5(xx) \ + mulpd %xmm8, %xmm9 ;\ + addpd %xmm9, %xmm0 ;\ + movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulpd %xmm8, %xmm11 ;\ + PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ;\ + addpd %xmm11, %xmm1 ;\ + movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm8, %xmm13 ;\ + mulpd 22 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\ + addpd %xmm13, %xmm2 ;\ + movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm8, %xmm3 ;\ + movapd 16 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 + +#define KERNEL6(xx) \ + mulpd %xmm10, %xmm9 ;\ + addpd %xmm9, %xmm4 ;\ + movapd 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulpd %xmm10, %xmm11 ;\ + addpd %xmm11, %xmm5 ;\ + movapd 26 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm10, %xmm13 ;\ + mulpd 22 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ + addpd %xmm13, %xmm6 ;\ + movapd 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm10, %xmm7 ;\ + movapd 18 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 + +#define KERNEL7(xx) \ + mulpd %xmm12, %xmm15 ;\ + addpd %xmm15, %xmm0 ;\ + movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulpd %xmm12, %xmm11 ;\ + addpd %xmm11, %xmm1 ;\ + movapd 26 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm12, %xmm13 ;\ + mulpd 30 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\ + addpd %xmm13, %xmm2 ;\ + movapd 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm12, %xmm3 ;\ + movapd 20 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 + +#define KERNEL8(xx) \ + mulpd %xmm14, %xmm15 ;\ + addpd %xmm15, %xmm4 ;\ + movapd 40 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulpd %xmm14, %xmm11 ;\ + addpd %xmm11, %xmm5 ;\ + movapd 34 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulpd %xmm14, %xmm13 ;\ + mulpd 30 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ + addpd %xmm13, %xmm6 ;\ + movapd 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addpd %xmm14, %xmm7 ;\ + movapd 22 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 + +#ifndef CONJ +#define NN +#else +#if defined(LN) || defined(LT) +#define CN +#else +#define NC +#endif +#endif + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, 
OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + movsd OLD_OFFSET, %xmm4 + + movaps %xmm3, %xmm0 + +#else + movq OLD_LDC, LDC + movsd OLD_OFFSET, %xmm4 + +#endif + + movq %rsp, %rbx # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movq OLD_M, M + movq OLD_N, N + + pcmpeqb %xmm15, %xmm15 + psllq $63, %xmm15 # Generate mask + pxor %xmm2, %xmm2 + + movlpd %xmm2, 0 + POSINV + movlpd %xmm15, 8 + POSINV + + movlpd %xmm4, OFFSET + movlpd %xmm4, KK + + salq $ZBASE_SHIFT, LDC + +#ifdef LN + movq M, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + movq N, %rax + salq $ZBASE_SHIFT, %rax + imulq K, %rax + addq %rax, B + + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, N + jle .L100 + +.L101: +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L103 + ALIGN_4 + +.L102: + movlpd 0 * SIZE(B), %xmm0 + movlpd 1 * SIZE(B), %xmm1 + movlpd 2 * SIZE(B), %xmm2 + movlpd 3 * SIZE(B), %xmm3 + movlpd 4 * SIZE(B), %xmm4 + movlpd 5 * SIZE(B), %xmm5 + movlpd 6 * SIZE(B), %xmm6 + movlpd 7 * SIZE(B), %xmm7 + + movlpd %xmm0, 0 * SIZE(BO) + movlpd %xmm0, 1 * SIZE(BO) + movlpd %xmm1, 2 * SIZE(BO) + movlpd %xmm1, 3 * SIZE(BO) + movlpd %xmm2, 4 * SIZE(BO) + movlpd %xmm2, 5 * SIZE(BO) + movlpd %xmm3, 6 * SIZE(BO) + movlpd %xmm3, 7 * SIZE(BO) + movlpd %xmm4, 8 * SIZE(BO) + movlpd %xmm4, 9 * SIZE(BO) + movlpd %xmm5, 10 * SIZE(BO) + movlpd %xmm5, 11 * SIZE(BO) + movlpd %xmm6, 12 * SIZE(BO) + movlpd %xmm6, 13 * SIZE(BO) + movlpd %xmm7, 14 * SIZE(BO) + movlpd %xmm7, 15 * SIZE(BO) + + subq $-16 * SIZE, BO + addq $ 8 * SIZE, B + decq %rax + jne .L102 + ALIGN_4 + +.L103: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L105 + ALIGN_4 + +.L104: + movlpd 0 * SIZE(B), %xmm0 + movlpd 1 * SIZE(B), %xmm1 + + movlpd %xmm0, 0 * SIZE(BO) + movlpd %xmm0, 1 * SIZE(BO) + movlpd %xmm1, 2 * SIZE(BO) + movlpd %xmm1, 3 * SIZE(BO) + + addq $4 * SIZE, BO + addq $2 * SIZE, B + decq %rax + jne .L104 + ALIGN_4 + +.L105: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + subq LDC, C +#endif + + movq C, CO1 +#ifndef RT + addq LDC, C +#endif + + movq M, I + sarq $1, I # i = (m >> 2) + jle .L130 + ALIGN_4 + +.L110: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + PREFETCHW 4 * SIZE(CO1) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L112 + 
+.L111: + movapd 0 * SIZE(AO), %xmm8 + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm1 + + movapd 2 * SIZE(AO), %xmm8 + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm5 + + movapd 4 * SIZE(AO), %xmm8 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm1 + + movapd 6 * SIZE(AO), %xmm8 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm5 + + movapd 8 * SIZE(AO), %xmm8 + movapd 8 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + mulpd 10 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm1 + + movapd 10 * SIZE(AO), %xmm8 + movapd 8 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + mulpd 10 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm5 + + movapd 12 * SIZE(AO), %xmm8 + movapd 12 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + mulpd 14 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm1 + + movapd 14 * SIZE(AO), %xmm8 + movapd 12 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + mulpd 14 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm5 + + addq $16 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L111 + ALIGN_4 + +.L112: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movapd POSINV, %xmm15 + andq $3, %rax # if (k & 1) + BRANCH + jle .L114 + +.L113: + movapd 0 * SIZE(AO), %xmm8 + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm1 + + movapd 2 * SIZE(AO), %xmm8 + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm4 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm5 + + addq $4 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L113 + ALIGN_4 + +.L114: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm5, %xmm5 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm15, %xmm1 + xorpd %xmm15, %xmm5 +#else + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm4 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm1, %xmm0 + subpd %xmm5, %xmm4 +#else + addpd %xmm1, %xmm0 + addpd %xmm5, %xmm4 +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm1 + movapd 2 * SIZE(B), %xmm5 + + subpd %xmm0, %xmm1 + subpd %xmm4, %xmm5 +#else + movapd 0 * SIZE(AO), %xmm1 + movapd 2 * SIZE(AO), %xmm5 + + subpd %xmm0, %xmm1 + subpd %xmm4, %xmm5 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm15, %xmm15 +#endif + +#ifdef LN + movlpd 6 * SIZE(AO), %xmm8 + movhpd 6 * SIZE(AO), %xmm8 + movlpd 7 * SIZE(AO), %xmm9 + movhpd 7 * SIZE(AO), %xmm9 + movlpd 4 * SIZE(AO), %xmm10 + movhpd 4 * SIZE(AO), %xmm10 + movlpd 5 * SIZE(AO), %xmm11 + movhpd 5 * SIZE(AO), %xmm11 + movlpd 0 * SIZE(AO), %xmm12 + movhpd 0 * SIZE(AO), %xmm12 + movlpd 1 * SIZE(AO), %xmm13 + movhpd 1 * SIZE(AO), %xmm13 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm15, %xmm4 + + mulpd %xmm8, %xmm5 + mulpd %xmm9, %xmm4 + + addpd %xmm4, %xmm5 + + movapd %xmm5, %xmm0 + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm15, %xmm4 
+ + mulpd %xmm10, %xmm0 + mulpd %xmm11, %xmm4 + + subpd %xmm0, %xmm1 + subpd %xmm4, %xmm1 + + pshufd $0x4e, %xmm1, %xmm0 + + xorpd %xmm15, %xmm0 + + mulpd %xmm12, %xmm1 + mulpd %xmm13, %xmm0 + + addpd %xmm0, %xmm1 +#endif + +#ifdef LT + movlpd 0 * SIZE(AO), %xmm8 + movhpd 0 * SIZE(AO), %xmm8 + movlpd 1 * SIZE(AO), %xmm9 + movhpd 1 * SIZE(AO), %xmm9 + movlpd 2 * SIZE(AO), %xmm10 + movhpd 2 * SIZE(AO), %xmm10 + movlpd 3 * SIZE(AO), %xmm11 + movhpd 3 * SIZE(AO), %xmm11 + movlpd 6 * SIZE(AO), %xmm12 + movhpd 6 * SIZE(AO), %xmm12 + movlpd 7 * SIZE(AO), %xmm13 + movhpd 7 * SIZE(AO), %xmm13 + + pshufd $0x4e, %xmm1, %xmm0 + + xorpd %xmm15, %xmm0 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + + addpd %xmm0, %xmm1 + + movapd %xmm1, %xmm0 + pshufd $0x4e, %xmm1, %xmm4 + + xorpd %xmm15, %xmm4 + + mulpd %xmm10, %xmm0 + mulpd %xmm11, %xmm4 + + subpd %xmm0, %xmm5 + subpd %xmm4, %xmm5 + + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm15, %xmm4 + + mulpd %xmm12, %xmm5 + mulpd %xmm13, %xmm4 + + addpd %xmm4, %xmm5 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm8 + movhpd 0 * SIZE(B), %xmm8 + movlpd 1 * SIZE(B), %xmm9 + movhpd 1 * SIZE(B), %xmm9 + + pshufd $0x4e, %xmm1, %xmm0 + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm4 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + mulpd %xmm8, %xmm5 + mulpd %xmm9, %xmm4 + + addpd %xmm0, %xmm1 + addpd %xmm4, %xmm5 +#endif + +#ifdef RT + movlpd 0 * SIZE(B), %xmm8 + movhpd 0 * SIZE(B), %xmm8 + movlpd 1 * SIZE(B), %xmm9 + movhpd 1 * SIZE(B), %xmm9 + + pshufd $0x4e, %xmm1, %xmm0 + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm4 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + mulpd %xmm8, %xmm5 + mulpd %xmm9, %xmm4 + + addpd %xmm0, %xmm1 + addpd %xmm4, %xmm5 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + + movsd %xmm1, 0 * SIZE(CO1) + movhpd %xmm1, 1 * SIZE(CO1) + movsd %xmm5, 2 * SIZE(CO1) + movhpd %xmm5, 3 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(B) + movapd %xmm5, 2 * SIZE(B) + + movlpd %xmm1, 0 * SIZE(BO) + movlpd %xmm1, 1 * SIZE(BO) + movhpd %xmm1, 2 * SIZE(BO) + movhpd %xmm1, 3 * SIZE(BO) + movlpd %xmm5, 4 * SIZE(BO) + movlpd %xmm5, 5 * SIZE(BO) + movhpd %xmm5, 6 * SIZE(BO) + movhpd %xmm5, 7 * SIZE(BO) +#else + movapd %xmm1, 0 * SIZE(AO) + movapd %xmm5, 2 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L110 + ALIGN_4 + +.L130: + testq $1, M + jle .L199 + ALIGN_4 + +.L140: +#ifdef LN + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $0 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je .L142 + +.L141: + movapd 0 * SIZE(AO), %xmm8 + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm1 + + movapd 2 * SIZE(AO), %xmm8 + movapd 4 * 
SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm3 + + movapd 4 * SIZE(AO), %xmm8 + movapd 8 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + mulpd 10 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm1 + + movapd 6 * SIZE(AO), %xmm8 + movapd 12 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + mulpd 14 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm3 + + addq $8 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L141 + +.L142: + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + + movapd POSINV, %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + jle .L144 + +.L143: + movapd 0 * SIZE(AO), %xmm8 + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + mulpd 2 * SIZE(BO), %xmm8 + addpd %xmm8, %xmm1 + + addq $2 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L143 + ALIGN_4 + +.L144: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 2), BO +#endif + + SHUFPD_1 %xmm1, %xmm1 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm15, %xmm1 +#else + xorpd %xmm15, %xmm0 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm1, %xmm0 +#else + addpd %xmm1, %xmm0 +#endif + + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm1 + + subpd %xmm0, %xmm1 +#else + movapd 0 * SIZE(AO), %xmm1 + + subpd %xmm0, %xmm1 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm15, %xmm15 +#endif + +#ifdef LN + movlpd 0 * SIZE(AO), %xmm8 + movhpd 0 * SIZE(AO), %xmm8 + movlpd 1 * SIZE(AO), %xmm9 + movhpd 1 * SIZE(AO), %xmm9 + + pshufd $0x4e, %xmm1, %xmm0 + xorpd %xmm15, %xmm0 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + + addpd %xmm0, %xmm1 +#endif + +#ifdef LT + movlpd 0 * SIZE(AO), %xmm8 + movhpd 0 * SIZE(AO), %xmm8 + movlpd 1 * SIZE(AO), %xmm9 + movhpd 1 * SIZE(AO), %xmm9 + + pshufd $0x4e, %xmm1, %xmm0 + + xorpd %xmm15, %xmm0 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + + addpd %xmm0, %xmm1 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm8 + movhpd 0 * SIZE(B), %xmm8 + movlpd 1 * SIZE(B), %xmm9 + movhpd 1 * SIZE(B), %xmm9 + + pshufd $0x4e, %xmm1, %xmm0 + + xorpd %xmm15, %xmm0 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + + addpd %xmm0, %xmm1 +#endif + +#ifdef RT + movlpd 0 * SIZE(B), %xmm8 + movhpd 0 * SIZE(B), %xmm8 + movlpd 1 * SIZE(B), %xmm9 + movhpd 1 * SIZE(B), %xmm9 + + pshufd $0x4e, %xmm1, %xmm0 + + xorpd %xmm15, %xmm0 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + + addpd %xmm0, %xmm1 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + + movsd %xmm1, 0 * SIZE(CO1) + movhpd %xmm1, 1 * SIZE(CO1) + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(B) + + movlpd %xmm1, 0 * SIZE(BO) + movlpd %xmm1, 1 * SIZE(BO) + movhpd %xmm1, 2 * SIZE(BO) + movhpd %xmm1, 3 * SIZE(BO) +#else + movapd %xmm1, 0 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $2 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + 
ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L199: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 1 * COMPSIZE), B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L100: + movq N, J + sarq $1, J # j = (n >> 2) + jle .L999 + ALIGN_4 + +.L01: +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + +/* Copying to Sub Buffer */ + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L03 + + addq %rax, %rax + ALIGN_4 + +.L02: + PREFETCHNTA 56 * SIZE(B) + + movlpd 0 * SIZE(B), %xmm0 + movlpd 1 * SIZE(B), %xmm1 + movlpd 2 * SIZE(B), %xmm2 + movlpd 3 * SIZE(B), %xmm3 + movlpd 4 * SIZE(B), %xmm4 + movlpd 5 * SIZE(B), %xmm5 + movlpd 6 * SIZE(B), %xmm6 + movlpd 7 * SIZE(B), %xmm7 + + movlpd %xmm0, 0 * SIZE(BO) + movlpd %xmm0, 1 * SIZE(BO) + movlpd %xmm1, 2 * SIZE(BO) + movlpd %xmm1, 3 * SIZE(BO) + movlpd %xmm2, 4 * SIZE(BO) + movlpd %xmm2, 5 * SIZE(BO) + movlpd %xmm3, 6 * SIZE(BO) + movlpd %xmm3, 7 * SIZE(BO) + movlpd %xmm4, 8 * SIZE(BO) + movlpd %xmm4, 9 * SIZE(BO) + movlpd %xmm5, 10 * SIZE(BO) + movlpd %xmm5, 11 * SIZE(BO) + movlpd %xmm6, 12 * SIZE(BO) + movlpd %xmm6, 13 * SIZE(BO) + movlpd %xmm7, 14 * SIZE(BO) + movlpd %xmm7, 15 * SIZE(BO) + + subq $-16 * SIZE, BO + addq $ 8 * SIZE, B + decq %rax + jne .L02 + ALIGN_4 + +.L03: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L05 + ALIGN_4 + +.L04: + movlpd 0 * SIZE(B), %xmm0 + movlpd 1 * SIZE(B), %xmm1 + movlpd 2 * SIZE(B), %xmm2 + movlpd 3 * SIZE(B), %xmm3 + + movlpd %xmm0, 0 * SIZE(BO) + movlpd %xmm0, 1 * SIZE(BO) + movlpd %xmm1, 2 * SIZE(BO) + movlpd %xmm1, 3 * SIZE(BO) + movlpd %xmm2, 4 * SIZE(BO) + movlpd %xmm2, 5 * SIZE(BO) + movlpd %xmm3, 6 * SIZE(BO) + movlpd %xmm3, 7 * SIZE(BO) + + addq $ 4 * SIZE, B + addq $ 8 * SIZE, BO + + decq %rax + jne .L04 + ALIGN_4 + +.L05: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 + +#ifndef RT + leaq (C, LDC, 2), C +#endif + + movq M, I + sarq $1, I # i = (m >> 2) + jle .L30 + ALIGN_4 + +.L10: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movapd 2 * SIZE(AO), %xmm10 + pxor %xmm1, %xmm1 + movapd 4 * SIZE(AO), %xmm12 + pxor %xmm2, %xmm2 + movapd 6 * SIZE(AO), %xmm14 + pxor %xmm3, %xmm3 + + movapd 0 * SIZE(BO), %xmm9 + pxor %xmm4, %xmm4 + movapd 2 * SIZE(BO), %xmm11 + pxor %xmm5, %xmm5 + movapd 4 * SIZE(BO), %xmm13 + movapd 8 * SIZE(BO), %xmm15 + + PREFETCHW 4 * SIZE(CO1) + pxor %xmm6, %xmm6 + PREFETCHW 4 * SIZE(CO2) + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, 
%rax + subq KK, %rax +#endif + andq $-8, %rax + salq $4, %rax + je .L15 +.L1X: + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + KERNEL1(16 * 1) + KERNEL2(16 * 1) + KERNEL3(16 * 1) + KERNEL4(16 * 1) + KERNEL5(16 * 1) + KERNEL6(16 * 1) + KERNEL7(16 * 1) + KERNEL8(16 * 1) + cmpq $64 * 2, %rax + jle .L12 + KERNEL1(16 * 2) + KERNEL2(16 * 2) + KERNEL3(16 * 2) + KERNEL4(16 * 2) + KERNEL5(16 * 2) + KERNEL6(16 * 2) + KERNEL7(16 * 2) + KERNEL8(16 * 2) + KERNEL1(16 * 3) + KERNEL2(16 * 3) + KERNEL3(16 * 3) + KERNEL4(16 * 3) + KERNEL5(16 * 3) + KERNEL6(16 * 3) + KERNEL7(16 * 3) + KERNEL8(16 * 3) + cmpq $64 * 4, %rax + jle .L12 + KERNEL1(16 * 4) + KERNEL2(16 * 4) + KERNEL3(16 * 4) + KERNEL4(16 * 4) + KERNEL5(16 * 4) + KERNEL6(16 * 4) + KERNEL7(16 * 4) + KERNEL8(16 * 4) + KERNEL1(16 * 5) + KERNEL2(16 * 5) + KERNEL3(16 * 5) + KERNEL4(16 * 5) + KERNEL5(16 * 5) + KERNEL6(16 * 5) + KERNEL7(16 * 5) + KERNEL8(16 * 5) + cmpq $64 * 6, %rax + jle .L12 + KERNEL1(16 * 6) + KERNEL2(16 * 6) + KERNEL3(16 * 6) + KERNEL4(16 * 6) + KERNEL5(16 * 6) + KERNEL6(16 * 6) + KERNEL7(16 * 6) + KERNEL8(16 * 6) + KERNEL1(16 * 7) + KERNEL2(16 * 7) + KERNEL3(16 * 7) + KERNEL4(16 * 7) + KERNEL5(16 * 7) + KERNEL6(16 * 7) + KERNEL7(16 * 7) + KERNEL8(16 * 7) + + addq $16 * 8 * SIZE, AO + addq $32 * 8 * SIZE, BO + subq $64 * 8, %rax + jg .L1X + +.L12: + leaq (AO, %rax, 2), AO # * 16 + leaq (BO, %rax, 4), BO # * 64 + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movapd POSINV, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L19 + ALIGN_4 + +.L16: + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + mulpd 6 * SIZE(BO), %xmm8 + addpd %xmm9, %xmm2 + movapd 0 * SIZE(BO), %xmm9 + addpd %xmm8, %xmm3 + movapd 4 * SIZE(AO), %xmm8 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm4 + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + addpd %xmm9, %xmm5 + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + mulpd 6 * SIZE(BO), %xmm10 + addpd %xmm9, %xmm6 + movapd 8 * SIZE(BO), %xmm9 + addpd %xmm10, %xmm7 + movapd 6 * SIZE(AO), %xmm10 + + addq $4 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L16 + ALIGN_4 + +.L19: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm3, %xmm3 + SHUFPD_1 %xmm5, %xmm5 + SHUFPD_1 %xmm7, %xmm7 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm15, %xmm1 + xorpd %xmm15, %xmm3 + xorpd %xmm15, %xmm5 + xorpd %xmm15, %xmm7 +#else + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm2 + xorpd %xmm15, %xmm4 + xorpd %xmm15, %xmm6 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 + subpd %xmm5, %xmm4 + subpd %xmm7, %xmm6 +#else + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 + addpd %xmm5, %xmm4 + addpd %xmm7, %xmm6 +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm1 + movapd 2 * SIZE(B), %xmm3 + movapd 4 * SIZE(B), %xmm5 + movapd 6 * SIZE(B), %xmm7 + + subpd %xmm0, 
%xmm1 + subpd %xmm2, %xmm3 + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#else + movapd 0 * SIZE(AO), %xmm1 + movapd 2 * SIZE(AO), %xmm5 + movapd 4 * SIZE(AO), %xmm3 + movapd 6 * SIZE(AO), %xmm7 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm3 + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm15, %xmm15 +#endif + +#ifdef LN + movlpd 6 * SIZE(AO), %xmm8 + movhpd 6 * SIZE(AO), %xmm8 + movlpd 7 * SIZE(AO), %xmm9 + movhpd 7 * SIZE(AO), %xmm9 + movlpd 4 * SIZE(AO), %xmm10 + movhpd 4 * SIZE(AO), %xmm10 + movlpd 5 * SIZE(AO), %xmm11 + movhpd 5 * SIZE(AO), %xmm11 + movlpd 0 * SIZE(AO), %xmm12 + movhpd 0 * SIZE(AO), %xmm12 + movlpd 1 * SIZE(AO), %xmm13 + movhpd 1 * SIZE(AO), %xmm13 + + pshufd $0x4e, %xmm5, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm15, %xmm4 + xorpd %xmm15, %xmm6 + + mulpd %xmm8, %xmm5 + mulpd %xmm9, %xmm4 + mulpd %xmm8, %xmm7 + mulpd %xmm9, %xmm6 + + addpd %xmm4, %xmm5 + addpd %xmm6, %xmm7 + + movapd %xmm5, %xmm0 + movapd %xmm7, %xmm2 + pshufd $0x4e, %xmm5, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm15, %xmm4 + xorpd %xmm15, %xmm6 + + mulpd %xmm10, %xmm0 + mulpd %xmm10, %xmm2 + mulpd %xmm11, %xmm4 + mulpd %xmm11, %xmm6 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm3 + subpd %xmm4, %xmm1 + subpd %xmm6, %xmm3 + + pshufd $0x4e, %xmm1, %xmm0 + pshufd $0x4e, %xmm3, %xmm2 + + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm2 + + mulpd %xmm12, %xmm1 + mulpd %xmm13, %xmm0 + mulpd %xmm12, %xmm3 + mulpd %xmm13, %xmm2 + + addpd %xmm0, %xmm1 + addpd %xmm2, %xmm3 +#endif + +#ifdef LT + movlpd 0 * SIZE(AO), %xmm8 + movhpd 0 * SIZE(AO), %xmm8 + movlpd 1 * SIZE(AO), %xmm9 + movhpd 1 * SIZE(AO), %xmm9 + movlpd 2 * SIZE(AO), %xmm10 + movhpd 2 * SIZE(AO), %xmm10 + movlpd 3 * SIZE(AO), %xmm11 + movhpd 3 * SIZE(AO), %xmm11 + movlpd 6 * SIZE(AO), %xmm12 + movhpd 6 * SIZE(AO), %xmm12 + movlpd 7 * SIZE(AO), %xmm13 + movhpd 7 * SIZE(AO), %xmm13 + + pshufd $0x4e, %xmm1, %xmm0 + pshufd $0x4e, %xmm3, %xmm2 + + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm2 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + mulpd %xmm8, %xmm3 + mulpd %xmm9, %xmm2 + + addpd %xmm0, %xmm1 + addpd %xmm2, %xmm3 + + movapd %xmm1, %xmm0 + movapd %xmm3, %xmm2 + pshufd $0x4e, %xmm1, %xmm4 + pshufd $0x4e, %xmm3, %xmm6 + + xorpd %xmm15, %xmm4 + xorpd %xmm15, %xmm6 + + mulpd %xmm10, %xmm0 + mulpd %xmm10, %xmm2 + mulpd %xmm11, %xmm4 + mulpd %xmm11, %xmm6 + + subpd %xmm0, %xmm5 + subpd %xmm2, %xmm7 + subpd %xmm4, %xmm5 + subpd %xmm6, %xmm7 + + pshufd $0x4e, %xmm5, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm15, %xmm4 + xorpd %xmm15, %xmm6 + + mulpd %xmm12, %xmm5 + mulpd %xmm13, %xmm4 + mulpd %xmm12, %xmm7 + mulpd %xmm13, %xmm6 + + addpd %xmm4, %xmm5 + addpd %xmm6, %xmm7 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm8 + movhpd 0 * SIZE(B), %xmm8 + movlpd 1 * SIZE(B), %xmm9 + movhpd 1 * SIZE(B), %xmm9 + movlpd 2 * SIZE(B), %xmm10 + movhpd 2 * SIZE(B), %xmm10 + movlpd 3 * SIZE(B), %xmm11 + movhpd 3 * SIZE(B), %xmm11 + movlpd 6 * SIZE(B), %xmm12 + movhpd 6 * SIZE(B), %xmm12 + movlpd 7 * SIZE(B), %xmm13 + movhpd 7 * SIZE(B), %xmm13 + + pshufd $0x4e, %xmm1, %xmm0 + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm4 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + mulpd %xmm8, %xmm5 + mulpd %xmm9, %xmm4 + + addpd %xmm0, %xmm1 + addpd %xmm4, %xmm5 + + movapd %xmm1, %xmm0 + movapd %xmm5, %xmm2 + pshufd $0x4e, %xmm1, %xmm4 + pshufd $0x4e, %xmm5, %xmm6 + + xorpd %xmm15, %xmm4 + xorpd %xmm15, %xmm6 + + mulpd %xmm10, %xmm0 + mulpd %xmm10, %xmm2 + mulpd %xmm11, %xmm4 + mulpd %xmm11, %xmm6 + + subpd %xmm0, %xmm3 + 
subpd %xmm2, %xmm7 + subpd %xmm4, %xmm3 + subpd %xmm6, %xmm7 + + pshufd $0x4e, %xmm3, %xmm2 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm15, %xmm2 + xorpd %xmm15, %xmm6 + + mulpd %xmm12, %xmm3 + mulpd %xmm13, %xmm2 + mulpd %xmm12, %xmm7 + mulpd %xmm13, %xmm6 + + addpd %xmm2, %xmm3 + addpd %xmm6, %xmm7 +#endif + +#ifdef RT + movlpd 6 * SIZE(B), %xmm8 + movhpd 6 * SIZE(B), %xmm8 + movlpd 7 * SIZE(B), %xmm9 + movhpd 7 * SIZE(B), %xmm9 + movlpd 4 * SIZE(B), %xmm10 + movhpd 4 * SIZE(B), %xmm10 + movlpd 5 * SIZE(B), %xmm11 + movhpd 5 * SIZE(B), %xmm11 + movlpd 0 * SIZE(B), %xmm12 + movhpd 0 * SIZE(B), %xmm12 + movlpd 1 * SIZE(B), %xmm13 + movhpd 1 * SIZE(B), %xmm13 + + pshufd $0x4e, %xmm3, %xmm2 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm15, %xmm2 + xorpd %xmm15, %xmm6 + + mulpd %xmm8, %xmm3 + mulpd %xmm9, %xmm2 + mulpd %xmm8, %xmm7 + mulpd %xmm9, %xmm6 + + addpd %xmm2, %xmm3 + addpd %xmm6, %xmm7 + + movapd %xmm3, %xmm0 + movapd %xmm7, %xmm2 + pshufd $0x4e, %xmm3, %xmm4 + pshufd $0x4e, %xmm7, %xmm6 + + xorpd %xmm15, %xmm4 + xorpd %xmm15, %xmm6 + + mulpd %xmm10, %xmm0 + mulpd %xmm10, %xmm2 + mulpd %xmm11, %xmm4 + mulpd %xmm11, %xmm6 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm5 + subpd %xmm4, %xmm1 + subpd %xmm6, %xmm5 + + pshufd $0x4e, %xmm1, %xmm0 + pshufd $0x4e, %xmm5, %xmm4 + + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm4 + + mulpd %xmm12, %xmm1 + mulpd %xmm13, %xmm0 + mulpd %xmm12, %xmm5 + mulpd %xmm13, %xmm4 + + addpd %xmm0, %xmm1 + addpd %xmm4, %xmm5 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + + movsd %xmm1, 0 * SIZE(CO1) + movhpd %xmm1, 1 * SIZE(CO1) + movsd %xmm5, 2 * SIZE(CO1) + movhpd %xmm5, 3 * SIZE(CO1) + + movsd %xmm3, 0 * SIZE(CO2) + movhpd %xmm3, 1 * SIZE(CO2) + movsd %xmm7, 2 * SIZE(CO2) + movhpd %xmm7, 3 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(B) + movapd %xmm3, 2 * SIZE(B) + movapd %xmm5, 4 * SIZE(B) + movapd %xmm7, 6 * SIZE(B) + + movlpd %xmm1, 0 * SIZE(BO) + movlpd %xmm1, 1 * SIZE(BO) + movhpd %xmm1, 2 * SIZE(BO) + movhpd %xmm1, 3 * SIZE(BO) + movlpd %xmm3, 4 * SIZE(BO) + movlpd %xmm3, 5 * SIZE(BO) + movhpd %xmm3, 6 * SIZE(BO) + movhpd %xmm3, 7 * SIZE(BO) + movlpd %xmm5, 8 * SIZE(BO) + movlpd %xmm5, 9 * SIZE(BO) + movhpd %xmm5, 10 * SIZE(BO) + movhpd %xmm5, 11 * SIZE(BO) + movlpd %xmm7, 12 * SIZE(BO) + movlpd %xmm7, 13 * SIZE(BO) + movhpd %xmm7, 14 * SIZE(BO) + movhpd %xmm7, 15 * SIZE(BO) +#else + movapd %xmm1, 0 * SIZE(AO) + movapd %xmm5, 2 * SIZE(AO) + movapd %xmm3, 4 * SIZE(AO) + movapd %xmm7, 6 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L10 + ALIGN_4 + +.L30: + testq $1, M + jle .L99 + +#ifdef LN + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + addq %rax, AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 2), BO +#endif + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + je 
.L42 + +.L41: + movapd 0 * SIZE(AO), %xmm8 + + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + + movapd 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm3 + + movapd 2 * SIZE(AO), %xmm8 + + movapd 8 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + + movapd 10 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + + movapd 12 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + + movapd 14 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm3 + + movapd 4 * SIZE(AO), %xmm8 + + movapd 16 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + + movapd 18 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + + movapd 20 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + + movapd 22 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm3 + + movapd 6 * SIZE(AO), %xmm8 + + movapd 24 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + + movapd 26 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + + movapd 28 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + + movapd 30 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm3 + + addq $ 8 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L41 + +.L42: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movapd POSINV, %xmm15 + andq $3, %rax # if (k & 1) + BRANCH + jle .L44 + +.L43: + movapd 0 * SIZE(AO), %xmm8 + + movapd 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm0 + + movapd 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm1 + + movapd 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm2 + + movapd 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + addpd %xmm9, %xmm3 + + addq $2 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + + decq %rax + jg .L43 + ALIGN_4 + +.L44: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 4), BO +#endif + + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm3, %xmm3 + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(NR) || defined(NC) || defined(TR) || defined(TC) + xorpd %xmm15, %xmm1 + xorpd %xmm15, %xmm3 +#else + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm2 +#endif + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + subpd %xmm1, %xmm0 + subpd %xmm3, %xmm2 +#else + addpd %xmm1, %xmm0 + addpd %xmm3, %xmm2 +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(B), %xmm1 + movapd 2 * SIZE(B), %xmm3 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm3 +#else + movapd 0 * SIZE(AO), %xmm1 + movapd 2 * SIZE(AO), %xmm3 + + subpd %xmm0, %xmm1 + subpd %xmm2, %xmm3 +#endif + +#ifndef CONJ + SHUFPD_1 %xmm15, %xmm15 +#endif + +#if defined(LN) || defined(LT) + movlpd 0 * SIZE(AO), %xmm8 + movhpd 0 * SIZE(AO), %xmm8 + movlpd 1 * SIZE(AO), %xmm9 + movhpd 1 * SIZE(AO), %xmm9 + + pshufd $0x4e, %xmm1, %xmm0 + pshufd $0x4e, %xmm3, %xmm2 + + xorpd %xmm15, %xmm0 + xorpd %xmm15, %xmm2 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + mulpd %xmm8, %xmm3 + mulpd %xmm9, %xmm2 + + addpd %xmm0, %xmm1 + addpd %xmm2, %xmm3 +#endif + +#ifdef RN + movlpd 0 * SIZE(B), %xmm8 + movhpd 0 * SIZE(B), %xmm8 + 
movlpd 1 * SIZE(B), %xmm9 + movhpd 1 * SIZE(B), %xmm9 + movlpd 2 * SIZE(B), %xmm10 + movhpd 2 * SIZE(B), %xmm10 + movlpd 3 * SIZE(B), %xmm11 + movhpd 3 * SIZE(B), %xmm11 + movlpd 6 * SIZE(B), %xmm12 + movhpd 6 * SIZE(B), %xmm12 + movlpd 7 * SIZE(B), %xmm13 + movhpd 7 * SIZE(B), %xmm13 + + pshufd $0x4e, %xmm1, %xmm0 + + xorpd %xmm15, %xmm0 + + mulpd %xmm8, %xmm1 + mulpd %xmm9, %xmm0 + + addpd %xmm0, %xmm1 + + movapd %xmm1, %xmm0 + pshufd $0x4e, %xmm1, %xmm4 + + xorpd %xmm15, %xmm4 + + mulpd %xmm10, %xmm0 + mulpd %xmm11, %xmm4 + + subpd %xmm0, %xmm3 + subpd %xmm4, %xmm3 + + pshufd $0x4e, %xmm3, %xmm2 + + xorpd %xmm15, %xmm2 + + mulpd %xmm12, %xmm3 + mulpd %xmm13, %xmm2 + + addpd %xmm2, %xmm3 +#endif + +#ifdef RT + movlpd 6 * SIZE(B), %xmm8 + movhpd 6 * SIZE(B), %xmm8 + movlpd 7 * SIZE(B), %xmm9 + movhpd 7 * SIZE(B), %xmm9 + movlpd 4 * SIZE(B), %xmm10 + movhpd 4 * SIZE(B), %xmm10 + movlpd 5 * SIZE(B), %xmm11 + movhpd 5 * SIZE(B), %xmm11 + movlpd 0 * SIZE(B), %xmm12 + movhpd 0 * SIZE(B), %xmm12 + movlpd 1 * SIZE(B), %xmm13 + movhpd 1 * SIZE(B), %xmm13 + + pshufd $0x4e, %xmm3, %xmm2 + + xorpd %xmm15, %xmm2 + + mulpd %xmm8, %xmm3 + mulpd %xmm9, %xmm2 + + addpd %xmm2, %xmm3 + + movapd %xmm3, %xmm0 + pshufd $0x4e, %xmm3, %xmm4 + + xorpd %xmm15, %xmm4 + + mulpd %xmm10, %xmm0 + mulpd %xmm11, %xmm4 + + subpd %xmm0, %xmm1 + subpd %xmm4, %xmm1 + + pshufd $0x4e, %xmm1, %xmm0 + + xorpd %xmm15, %xmm0 + + mulpd %xmm12, %xmm1 + mulpd %xmm13, %xmm0 + + addpd %xmm0, %xmm1 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + + movsd %xmm1, 0 * SIZE(CO1) + movhpd %xmm1, 1 * SIZE(CO1) + + movsd %xmm3, 0 * SIZE(CO2) + movhpd %xmm3, 1 * SIZE(CO2) + +#if defined(LN) || defined(LT) + movapd %xmm1, 0 * SIZE(B) + movapd %xmm3, 2 * SIZE(B) + + movlpd %xmm1, 0 * SIZE(BO) + movlpd %xmm1, 1 * SIZE(BO) + movhpd %xmm1, 2 * SIZE(BO) + movhpd %xmm1, 3 * SIZE(BO) + movlpd %xmm3, 4 * SIZE(BO) + movlpd %xmm3, 5 * SIZE(BO) + movhpd %xmm3, 6 * SIZE(BO) + movhpd %xmm3, 7 * SIZE(BO) +#else + movapd %xmm1, 0 * SIZE(AO) + movapd %xmm3, 2 * SIZE(AO) + +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $0 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L99: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 2 * COMPSIZE), B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + + decq J # j -- + jg .L01 + ALIGN_3 + +.L999: + movq %rbx, %rsp + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/ztrsm_kernel_RT_2x2_sse3.S b/kernel/x86_64/ztrsm_kernel_RT_2x2_sse3.S new file mode 100644 index 0000000000..ca700eb946 --- /dev/null +++ 
b/kernel/x86_64/ztrsm_kernel_RT_2x2_sse3.S @@ -0,0 +1,2196 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M %rdi +#define N %rsi +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define J %r12 +#define AO %r13 +#define BO %r14 +#define CO1 %r15 +#define CO2 %rbx +#define KK %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define KKK 56(%rsp) +#define AORIG 64(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define KKK 232(%rsp) +#define AORIG 240(%rsp) +#endif + +#define PREFETCH prefetcht1 +#define PREFETCHSIZE (16 * 12 + 3) +#define PREFETCH_R (4 * 4 + 0) + +#ifndef CONJ +#define ADD1 addpd +#define ADD2 addpd +#else +#define ADD1 subpd +#define ADD2 addpd +#endif + +#define KERNEL1(address) \ + mulpd %xmm8, %xmm9;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 2 * SIZE(AO);\ + ADD1 %xmm9, %xmm0;\ + movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD2 %xmm9, %xmm1;\ + movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm2;\ + movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 2 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + ADD2 %xmm9, %xmm3;\ + movddup 0 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL2(address) \ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm4;\ + movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD2 %xmm9, %xmm5;\ + movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm6;\ + movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 4 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + ADD2 %xmm9, %xmm7;\ + movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL3(address) \ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm0;\ + movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD2 %xmm9, %xmm1;\ + movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm2;\ + movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 6 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + ADD2 %xmm9, %xmm3;\ + movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL4(address) \ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm4;\ + movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD2 %xmm9, %xmm5;\ + movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + ADD1 %xmm9, %xmm6;\ + movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ + mulpd %xmm8, %xmm9;\ + movapd 32 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ + ADD2 %xmm9, %xmm7;\ + movddup 32 * SIZE + (address) * 2 * SIZE(BO), %xmm9 + +#define KERNEL5(address) \ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm0;\ + movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD2 %xmm11, %xmm1;\ + movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm2;\ + movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 10 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + ADD2 %xmm11, %xmm3;\ + movddup 8 * SIZE + (address) * 2 * 
SIZE(BO), %xmm11 + +#define KERNEL6(address) \ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm4;\ + movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD2 %xmm11, %xmm5;\ + movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm6;\ + movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 12 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + ADD2 %xmm11, %xmm7;\ + movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL7(address) \ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm0;\ + movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD2 %xmm11, %xmm1;\ + movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm2;\ + movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 14 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + ADD2 %xmm11, %xmm3;\ + movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL8(address) \ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm4;\ + movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD2 %xmm11, %xmm5;\ + movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + ADD1 %xmm11, %xmm6;\ + movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ + mulpd %xmm10, %xmm11;\ + movapd 40 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ + ADD2 %xmm11, %xmm7;\ + movddup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm11 + +#define KERNEL9(address) \ + mulpd %xmm12, %xmm13;\ + PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 2 * SIZE(AO);\ + ADD1 %xmm13, %xmm0;\ + movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD2 %xmm13, %xmm1;\ + movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm2;\ + movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 18 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + ADD2 %xmm13, %xmm3;\ + movddup 16 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL10(address) \ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm4;\ + movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD2 %xmm13, %xmm5;\ + movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm6;\ + movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 20 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + ADD2 %xmm13, %xmm7;\ + movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL11(address) \ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm0;\ + movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD2 %xmm13, %xmm1;\ + movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm2;\ + movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 22 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + ADD2 %xmm13, %xmm3;\ + movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL12(address) \ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm4;\ + movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD2 %xmm13, %xmm5;\ + movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + ADD1 %xmm13, %xmm6;\ + movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ + mulpd %xmm12, %xmm13;\ + movapd 48 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ + ADD2 %xmm13, 
%xmm7;\ + movddup 48 * SIZE + (address) * 2 * SIZE(BO), %xmm13 + +#define KERNEL13(address) \ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm0;\ + movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD2 %xmm15, %xmm1;\ + movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm2;\ + movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 26 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + ADD2 %xmm15, %xmm3;\ + movddup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL14(address) \ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm4;\ + movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD2 %xmm15, %xmm5;\ + movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm6;\ + movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 28 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + ADD2 %xmm15, %xmm7;\ + movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL15(address) \ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm0;\ + movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD2 %xmm15, %xmm1;\ + movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm2;\ + movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 30 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + ADD2 %xmm15, %xmm3;\ + movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + +#define KERNEL16(address) \ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm4;\ + movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD2 %xmm15, %xmm5;\ + movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + ADD1 %xmm15, %xmm6;\ + movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ + mulpd %xmm14, %xmm15;\ + movapd 56 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ + ADD2 %xmm15, %xmm7;\ + movddup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, M + movq ARG2, N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#endif + + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + movq KK, OFFSET + + salq $ZBASE_SHIFT, LDC + +#ifdef LN + movq M, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + movq N, %rax + salq $ZBASE_SHIFT, %rax + imulq K, %rax + addq %rax, B + + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, KK + subq OFFSET, KK +#endif + + testq $1, N + jle .L100 + +.L101: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + movq C, CO1 # coffset1 = c +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif +#ifdef LT + movq OFFSET, KK +#endif + + movq M, I + sarq $1, I # i = (m >> 2) + jle 
.L130 + ALIGN_4 + +.L110: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm4, %xmm4 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm5, %xmm5 + + prefetchnta 4 * SIZE(CO1) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L112 + +.L111: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm1 + movddup 0 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm4 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm5 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 6 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm4 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 16 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm5 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 10 * SIZE(AO), %xmm10 + ADD2 %xmm9, %xmm1 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm4 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 12 * SIZE(AO), %xmm10 + ADD2 %xmm9, %xmm5 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 14 * SIZE(AO), %xmm10 + ADD2 %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm4 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 40 * SIZE(AO), %xmm10 + ADD2 %xmm9, %xmm5 + movddup 16 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm11 + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) + ADD1 %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 18 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm1 + movddup 8 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm4 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 20 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm5 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 22 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm4 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 24 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm5 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 26 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm1 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm4 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 28 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm5 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 30 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm4 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 32 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm5 + movddup 24 * SIZE(BO), %xmm11 + 
+ addq $32 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L111 + ALIGN_4 + +.L112: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + jle .L114 + +.L113: + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm10 + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 0 * SIZE(BO), %xmm11 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + movapd 4 * SIZE(AO), %xmm8 + ADD1 %xmm11, %xmm4 + movddup 1 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD2 %xmm11, %xmm5 + + addq $4 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L113 + ALIGN_4 + +.L114: + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm5, %xmm5 + +#ifndef CONJ + addsubpd %xmm1, %xmm0 + addsubpd %xmm5, %xmm4 +#else + addsubpd %xmm0, %xmm1 + addsubpd %xmm4, %xmm5 +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BO), %xmm8 + movapd 2 * SIZE(BO), %xmm9 +#else + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm9 +#endif + +#if (defined(LN) || defined(LT)) && !defined(CONJ) + subpd %xmm0, %xmm8 + subpd %xmm4, %xmm9 +#elif (defined(LN) || defined(LT)) && defined(CONJ) + subpd %xmm1, %xmm8 + subpd %xmm5, %xmm9 +#elif (defined(RN) || defined(RT)) && !defined(CONJ) + subpd %xmm0, %xmm8 + subpd %xmm4, %xmm9 +#else + addsubpd %xmm1, %xmm8 + addsubpd %xmm5, %xmm9 +#endif + +#ifdef CONJ + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 +#endif + +#ifdef LN + movddup 6 * SIZE(AO), %xmm0 + movddup 7 * SIZE(AO), %xmm1 + movddup 4 * SIZE(AO), %xmm2 + movddup 5 * SIZE(AO), %xmm3 + movddup 0 * SIZE(AO), %xmm4 + movddup 1 * SIZE(AO), %xmm5 + +#ifdef CONJ + xorpd %xmm7, %xmm1 + xorpd %xmm7, %xmm3 + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm9, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm12 + addsubpd %xmm12, %xmm9 + movapd %xmm9, %xmm12 + movapd %xmm9, %xmm13 + SHUFPD_1 %xmm13, %xmm13 + mulpd %xmm2, %xmm12 + mulpd %xmm3, %xmm13 + addsubpd %xmm13, %xmm12 + subpd %xmm12, %xmm8 + movapd %xmm8, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + mulpd %xmm4, %xmm8 + mulpd %xmm5, %xmm12 + addsubpd %xmm12, %xmm8 +#endif + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + movddup 1 * SIZE(AO), %xmm1 + movddup 2 * SIZE(AO), %xmm2 + movddup 3 * SIZE(AO), %xmm3 + movddup 6 * SIZE(AO), %xmm4 + movddup 7 * SIZE(AO), %xmm5 + +#ifdef CONJ + xorpd %xmm7, %xmm1 + xorpd %xmm7, %xmm3 + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm8, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + mulpd %xmm0, %xmm8 + mulpd %xmm1, %xmm12 + addsubpd %xmm12, %xmm8 + movapd %xmm8, %xmm12 + movapd %xmm8, %xmm13 + SHUFPD_1 %xmm13, %xmm13 + mulpd %xmm2, %xmm12 + mulpd %xmm3, %xmm13 + addsubpd %xmm13, %xmm12 + subpd %xmm12, %xmm9 + movapd %xmm9, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm12 + addsubpd %xmm12, %xmm9 +#endif + +#ifdef RN + movddup 0 * SIZE(BO), %xmm0 + movddup 1 * SIZE(BO), %xmm1 + +#ifdef CONJ + xorpd %xmm7, %xmm1 +#endif + + movapd %xmm8, %xmm12 + movapd %xmm9, %xmm13 + SHUFPD_1 %xmm12, %xmm12 + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm12 + mulpd %xmm1, %xmm13 + + addsubpd %xmm12, %xmm8 + addsubpd %xmm13, %xmm9 +#endif + +#ifdef RT + movddup 0 * SIZE(BO), %xmm4 + movddup 1 * SIZE(BO), %xmm5 + +#ifdef CONJ + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm8, 
%xmm12 + movapd %xmm9, %xmm13 + SHUFPD_1 %xmm12, %xmm12 + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm4, %xmm8 + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm12 + mulpd %xmm5, %xmm13 + + addsubpd %xmm12, %xmm8 + addsubpd %xmm13, %xmm9 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, 0 * SIZE(BO) + movapd %xmm9, 2 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm9, 2 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L110 + ALIGN_4 + +.L130: + testq $1, M + jle .L149 + ALIGN_4 + +#ifdef LN + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L142 + +.L141: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 6 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 16 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm3 + movddup 16 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 10 * SIZE(AO), %xmm10 + ADD2 %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm2 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 12 * SIZE(AO), %xmm10 + ADD2 %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 14 * SIZE(AO), %xmm10 + ADD2 %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm2 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 24 * SIZE(AO), %xmm10 + ADD2 %xmm11, %xmm3 + movddup 24 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $16 * SIZE, BO + decq %rax + jne .L141 + +.L142: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + jle .L144 + +.L143: + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * 
SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $2 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L143 + ALIGN_4 + +.L144: + addpd %xmm2, %xmm0 + addpd %xmm3, %xmm1 + + SHUFPD_1 %xmm1, %xmm1 + +#ifndef CONJ + addsubpd %xmm1, %xmm0 +#else + addsubpd %xmm0, %xmm1 +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + subq $1, %rax + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BO), %xmm8 +#else + movapd 0 * SIZE(AO), %xmm8 +#endif + +#if (defined(LN) || defined(LT)) && !defined(CONJ) + subpd %xmm0, %xmm8 +#elif (defined(LN) || defined(LT)) && defined(CONJ) + subpd %xmm1, %xmm8 +#elif (defined(RN) || defined(RT)) && !defined(CONJ) + subpd %xmm0, %xmm8 +#else + addsubpd %xmm1, %xmm8 +#endif + +#ifdef CONJ + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 +#endif + +#ifdef LN + movddup 0 * SIZE(AO), %xmm4 + movddup 1 * SIZE(AO), %xmm5 + +#ifdef CONJ + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm8, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + mulpd %xmm4, %xmm8 + mulpd %xmm5, %xmm12 + addsubpd %xmm12, %xmm8 +#endif + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + movddup 1 * SIZE(AO), %xmm1 + +#ifdef CONJ + xorpd %xmm7, %xmm1 +#endif + + movapd %xmm8, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + mulpd %xmm0, %xmm8 + mulpd %xmm1, %xmm12 + addsubpd %xmm12, %xmm8 +#endif + +#ifdef RN + movddup 0 * SIZE(BO), %xmm0 + movddup 1 * SIZE(BO), %xmm1 + +#ifdef CONJ + xorpd %xmm7, %xmm1 +#endif + + movapd %xmm8, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + mulpd %xmm0, %xmm8 + mulpd %xmm1, %xmm12 + + addsubpd %xmm12, %xmm8 +#endif + +#ifdef RT + movddup 0 * SIZE(BO), %xmm4 + movddup 1 * SIZE(BO), %xmm5 + +#ifdef CONJ + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm8, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + mulpd %xmm4, %xmm8 + mulpd %xmm5, %xmm12 + + addsubpd %xmm12, %xmm8 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, 0 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L149: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_3 + + +.L100: + movq N, J + sarq $1, J # j = (n >> 2) + jle .L999 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif +#ifdef LT + movq OFFSET, KK +#endif + + movq M, I + sarq $1, I # i = (m >> 2) + jle .L30 + ALIGN_4 + +.L10: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq 
(AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + + movapd 16 * SIZE(AO), %xmm12 + movddup 16 * SIZE(BO), %xmm13 + movapd 24 * SIZE(AO), %xmm14 + movddup 24 * SIZE(BO), %xmm15 + + prefetchnta 4 * SIZE(CO1) + pxor %xmm4, %xmm4 + prefetchnta 4 * SIZE(CO2) + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-8, %rax + salq $4, %rax + je .L12 + +.L1X: + KERNEL1 (16 * 0) + KERNEL2 (16 * 0) + KERNEL3 (16 * 0) + KERNEL4 (16 * 0) + KERNEL5 (16 * 0) + KERNEL6 (16 * 0) + KERNEL7 (16 * 0) + KERNEL8 (16 * 0) + KERNEL9 (16 * 0) + KERNEL10(16 * 0) + KERNEL11(16 * 0) + KERNEL12(16 * 0) + KERNEL13(16 * 0) + KERNEL14(16 * 0) + KERNEL15(16 * 0) + KERNEL16(16 * 0) + cmpq $128 * 1, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 1) + KERNEL2 (16 * 1) + KERNEL3 (16 * 1) + KERNEL4 (16 * 1) + KERNEL5 (16 * 1) + KERNEL6 (16 * 1) + KERNEL7 (16 * 1) + KERNEL8 (16 * 1) + KERNEL9 (16 * 1) + KERNEL10(16 * 1) + KERNEL11(16 * 1) + KERNEL12(16 * 1) + KERNEL13(16 * 1) + KERNEL14(16 * 1) + KERNEL15(16 * 1) + KERNEL16(16 * 1) + cmpq $128 * 2, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 2) + KERNEL2 (16 * 2) + KERNEL3 (16 * 2) + KERNEL4 (16 * 2) + KERNEL5 (16 * 2) + KERNEL6 (16 * 2) + KERNEL7 (16 * 2) + KERNEL8 (16 * 2) + KERNEL9 (16 * 2) + KERNEL10(16 * 2) + KERNEL11(16 * 2) + KERNEL12(16 * 2) + KERNEL13(16 * 2) + KERNEL14(16 * 2) + KERNEL15(16 * 2) + KERNEL16(16 * 2) + cmpq $128 * 3, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 3) + KERNEL2 (16 * 3) + KERNEL3 (16 * 3) + KERNEL4 (16 * 3) + KERNEL5 (16 * 3) + KERNEL6 (16 * 3) + KERNEL7 (16 * 3) + KERNEL8 (16 * 3) + KERNEL9 (16 * 3) + KERNEL10(16 * 3) + KERNEL11(16 * 3) + KERNEL12(16 * 3) + KERNEL13(16 * 3) + KERNEL14(16 * 3) + KERNEL15(16 * 3) + KERNEL16(16 * 3) + cmpq $128 * 4, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 4) + KERNEL2 (16 * 4) + KERNEL3 (16 * 4) + KERNEL4 (16 * 4) + KERNEL5 (16 * 4) + KERNEL6 (16 * 4) + KERNEL7 (16 * 4) + KERNEL8 (16 * 4) + KERNEL9 (16 * 4) + KERNEL10(16 * 4) + KERNEL11(16 * 4) + KERNEL12(16 * 4) + KERNEL13(16 * 4) + KERNEL14(16 * 4) + KERNEL15(16 * 4) + KERNEL16(16 * 4) + cmpq $128 * 5, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 5) + KERNEL2 (16 * 5) + KERNEL3 (16 * 5) + KERNEL4 (16 * 5) + KERNEL5 (16 * 5) + KERNEL6 (16 * 5) + KERNEL7 (16 * 5) + KERNEL8 (16 * 5) + KERNEL9 (16 * 5) + KERNEL10(16 * 5) + KERNEL11(16 * 5) + KERNEL12(16 * 5) + KERNEL13(16 * 5) + KERNEL14(16 * 5) + KERNEL15(16 * 5) + KERNEL16(16 * 5) + cmpq $128 * 6, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 6) + KERNEL2 (16 * 6) + KERNEL3 (16 * 6) + KERNEL4 (16 * 6) + KERNEL5 (16 * 6) + KERNEL6 (16 * 6) + KERNEL7 (16 * 6) + KERNEL8 (16 * 6) + KERNEL9 (16 * 6) + KERNEL10(16 * 6) + KERNEL11(16 * 6) + KERNEL12(16 * 6) + KERNEL13(16 * 6) + KERNEL14(16 * 6) + KERNEL15(16 * 6) + KERNEL16(16 * 6) + cmpq $128 * 7, %rax + NOBRANCH + jle .L11 + KERNEL1 (16 * 7) + KERNEL2 (16 * 7) + KERNEL3 (16 * 7) + KERNEL4 (16 * 7) + KERNEL5 (16 * 7) + KERNEL6 (16 * 7) + KERNEL7 (16 * 7) + KERNEL8 (16 * 7) + KERNEL9 (16 * 7) + KERNEL10(16 * 7) + KERNEL11(16 * 7) + KERNEL12(16 * 7) + KERNEL13(16 * 7) + KERNEL14(16 * 7) + KERNEL15(16 * 7) + KERNEL16(16 * 7) + + addq $32 * 8 * SIZE, AO + addq $32 * 8 * SIZE, BO + subq $128 * 8, %rax + jg .L1X + +.L11: + leaq (AO, %rax, 2), AO # * 16 + 
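+	# the unrolled .L1X block above addresses AO/BO with fixed displacements only,
+	# so these two leaq's catch both pointers up by the iterations it consumed
+	# before falling into the k & 7 tail loop at .L13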
leaq (BO, %rax, 2), BO # * 64 + ALIGN_4 + +.L12: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + je .L14 + ALIGN_4 + +.L13: + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm10 + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movddup 0 * SIZE(BO), %xmm11 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm4 + movddup 1 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD2 %xmm11, %xmm5 + movddup 2 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm6 + movddup 3 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD2 %xmm11, %xmm7 + + addq $4 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L13 + ALIGN_4 + +.L14: + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm3, %xmm3 + SHUFPD_1 %xmm5, %xmm5 + SHUFPD_1 %xmm7, %xmm7 + +#ifndef CONJ + addsubpd %xmm1, %xmm0 + addsubpd %xmm3, %xmm2 + addsubpd %xmm5, %xmm4 + addsubpd %xmm7, %xmm6 +#else + addsubpd %xmm0, %xmm1 + addsubpd %xmm2, %xmm3 + addsubpd %xmm4, %xmm5 + addsubpd %xmm6, %xmm7 +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + subq $2, %rax + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movapd 0 * SIZE(BO), %xmm8 + movapd 2 * SIZE(BO), %xmm9 + movapd 4 * SIZE(BO), %xmm10 + movapd 6 * SIZE(BO), %xmm11 +#else + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm9 + movapd 4 * SIZE(AO), %xmm10 + movapd 6 * SIZE(AO), %xmm11 +#endif + +#if (defined(LN) || defined(LT)) && !defined(CONJ) + subpd %xmm0, %xmm8 + subpd %xmm2, %xmm9 + subpd %xmm4, %xmm10 + subpd %xmm6, %xmm11 +#elif (defined(LN) || defined(LT)) && defined(CONJ) + subpd %xmm1, %xmm8 + subpd %xmm3, %xmm9 + subpd %xmm5, %xmm10 + subpd %xmm7, %xmm11 +#elif (defined(RN) || defined(RT)) && !defined(CONJ) + subpd %xmm0, %xmm8 + subpd %xmm4, %xmm9 + subpd %xmm2, %xmm10 + subpd %xmm6, %xmm11 +#else + addsubpd %xmm1, %xmm8 + addsubpd %xmm5, %xmm9 + addsubpd %xmm3, %xmm10 + addsubpd %xmm7, %xmm11 +#endif + +#ifdef CONJ + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 +#endif + +#if defined(LN) || defined(RT) +#ifdef LN + movddup 6 * SIZE(AO), %xmm0 + movddup 7 * SIZE(AO), %xmm1 + movddup 4 * SIZE(AO), %xmm2 + movddup 5 * SIZE(AO), %xmm3 + movddup 0 * SIZE(AO), %xmm4 + movddup 1 * SIZE(AO), %xmm5 +#else + movddup 6 * SIZE(BO), %xmm0 + movddup 7 * SIZE(BO), %xmm1 + movddup 4 * SIZE(BO), %xmm2 + movddup 5 * SIZE(BO), %xmm3 + movddup 0 * SIZE(BO), %xmm4 + movddup 1 * SIZE(BO), %xmm5 +#endif + +#ifdef CONJ + xorpd %xmm7, %xmm1 + xorpd %xmm7, %xmm3 + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm10, %xmm12 + movapd %xmm11, %xmm13 + SHUFPD_1 %xmm12, %xmm12 + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm0, %xmm10 + mulpd %xmm0, %xmm11 + mulpd %xmm1, %xmm12 + mulpd %xmm1, %xmm13 + + addsubpd %xmm12, %xmm10 + addsubpd %xmm13, %xmm11 + + movapd %xmm10, %xmm12 + movapd %xmm10, %xmm13 + movapd %xmm11, %xmm14 + movapd %xmm11, %xmm15 + + SHUFPD_1 %xmm13, %xmm13 + SHUFPD_1 %xmm15, %xmm15 + + mulpd %xmm2, %xmm12 + mulpd %xmm2, %xmm14 + mulpd %xmm3, %xmm13 + mulpd %xmm3, %xmm15 + + addsubpd %xmm13, %xmm12 + addsubpd %xmm15, %xmm14 + + subpd %xmm12, %xmm8 + subpd %xmm14, %xmm9 + + movapd %xmm8, %xmm12 + movapd %xmm9, %xmm13 + SHUFPD_1 %xmm12, %xmm12 + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm4, 
%xmm8 + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm12 + mulpd %xmm5, %xmm13 + + addsubpd %xmm12, %xmm8 + addsubpd %xmm13, %xmm9 +#endif + +#if defined(LT) || defined(RN) + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + movddup 1 * SIZE(AO), %xmm1 + movddup 2 * SIZE(AO), %xmm2 + movddup 3 * SIZE(AO), %xmm3 + movddup 6 * SIZE(AO), %xmm4 + movddup 7 * SIZE(AO), %xmm5 +#else + movddup 0 * SIZE(BO), %xmm0 + movddup 1 * SIZE(BO), %xmm1 + movddup 2 * SIZE(BO), %xmm2 + movddup 3 * SIZE(BO), %xmm3 + movddup 6 * SIZE(BO), %xmm4 + movddup 7 * SIZE(BO), %xmm5 +#endif + +#ifdef CONJ + xorpd %xmm7, %xmm1 + xorpd %xmm7, %xmm3 + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm8, %xmm12 + movapd %xmm9, %xmm13 + SHUFPD_1 %xmm12, %xmm12 + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm12 + mulpd %xmm1, %xmm13 + + addsubpd %xmm12, %xmm8 + addsubpd %xmm13, %xmm9 + + movapd %xmm8, %xmm12 + movapd %xmm8, %xmm13 + movapd %xmm9, %xmm14 + movapd %xmm9, %xmm15 + + SHUFPD_1 %xmm13, %xmm13 + SHUFPD_1 %xmm15, %xmm15 + + mulpd %xmm2, %xmm12 + mulpd %xmm2, %xmm14 + mulpd %xmm3, %xmm13 + mulpd %xmm3, %xmm15 + + addsubpd %xmm13, %xmm12 + addsubpd %xmm15, %xmm14 + + subpd %xmm12, %xmm10 + subpd %xmm14, %xmm11 + + movapd %xmm10, %xmm12 + movapd %xmm11, %xmm13 + SHUFPD_1 %xmm12, %xmm12 + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm4, %xmm10 + mulpd %xmm4, %xmm11 + mulpd %xmm5, %xmm12 + mulpd %xmm5, %xmm13 + + addsubpd %xmm12, %xmm10 + addsubpd %xmm13, %xmm11 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm10, 2 * SIZE(CO1) + movhpd %xmm10, 3 * SIZE(CO1) + + movsd %xmm9, 0 * SIZE(CO2) + movhpd %xmm9, 1 * SIZE(CO2) + movsd %xmm11, 2 * SIZE(CO2) + movhpd %xmm11, 3 * SIZE(CO2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + movsd %xmm9, 2 * SIZE(CO1) + movhpd %xmm9, 3 * SIZE(CO1) + + movsd %xmm10, 0 * SIZE(CO2) + movhpd %xmm10, 1 * SIZE(CO2) + movsd %xmm11, 2 * SIZE(CO2) + movhpd %xmm11, 3 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, 0 * SIZE(BO) + movapd %xmm9, 2 * SIZE(BO) + movapd %xmm10, 4 * SIZE(BO) + movapd %xmm11, 6 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm9, 2 * SIZE(AO) + movapd %xmm10, 4 * SIZE(AO) + movapd %xmm11, 6 * SIZE(AO) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + decq I # i -- + jg .L10 + ALIGN_4 + +.L30: + testq $1, M + jle .L99 + +#ifdef LN + movq K, %rax + salq $0 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + leaq (, %rax, SIZE), %rax + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + movapd 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movddup 0 * SIZE(BO), %xmm9 + pxor %xmm1, %xmm1 + movapd 8 * SIZE(AO), %xmm10 + pxor %xmm2, %xmm2 + movddup 8 * SIZE(BO), %xmm11 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L42 + +.L41: + mulpd %xmm8, %xmm9 + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD2 %xmm9, %xmm1 + movddup 2 * 
SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 5 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD2 %xmm9, %xmm1 + movddup 6 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 7 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 4 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm3 + movddup 16 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 9 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD2 %xmm11, %xmm1 + movddup 10 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm2 + movddup 11 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 6 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm3 + movddup 12 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 13 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD2 %xmm11, %xmm1 + movddup 14 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + ADD1 %xmm11, %xmm2 + movddup 15 * SIZE(BO), %xmm11 + mulpd %xmm8, %xmm11 + movapd 16 * SIZE(AO), %xmm8 + ADD2 %xmm11, %xmm3 + movddup 24 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 17 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD2 %xmm9, %xmm1 + movddup 18 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 19 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 10 * SIZE(AO), %xmm10 + ADD2 %xmm9, %xmm3 + movddup 20 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 21 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD2 %xmm9, %xmm1 + movddup 22 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 23 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm9 + movapd 12 * SIZE(AO), %xmm10 + ADD2 %xmm9, %xmm3 + movddup 32 * SIZE(BO), %xmm9 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 25 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD2 %xmm11, %xmm1 + movddup 26 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm2 + movddup 27 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 14 * SIZE(AO), %xmm10 + ADD2 %xmm11, %xmm3 + movddup 28 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm0 + movddup 29 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD2 %xmm11, %xmm1 + movddup 30 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + ADD1 %xmm11, %xmm2 + movddup 31 * SIZE(BO), %xmm11 + mulpd %xmm10, %xmm11 + movapd 24 * SIZE(AO), %xmm10 + ADD2 %xmm11, %xmm3 + movddup 40 * SIZE(BO), %xmm11 + + addq $16 * SIZE, AO + addq $32 * SIZE, BO + decq %rax + jne .L41 + +.L42: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $7, %rax # if (k & 1) + BRANCH + jle .L44 + +.L43: + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm0 + movddup 1 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD2 %xmm9, %xmm1 + movddup 2 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + ADD1 %xmm9, %xmm2 + movddup 3 * SIZE(BO), %xmm9 + mulpd %xmm8, %xmm9 + movapd 2 * SIZE(AO), %xmm8 + ADD2 %xmm9, %xmm3 + movddup 4 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # aoffset += 4 + addq $4 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L43 + ALIGN_4 + +.L44: + SHUFPD_1 %xmm1, %xmm1 + SHUFPD_1 %xmm3, %xmm3 + +#ifndef CONJ + addsubpd %xmm1, %xmm0 + addsubpd %xmm3, %xmm2 +#else + addsubpd %xmm0, %xmm1 + addsubpd %xmm2, %xmm3 +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + leaq (, %rax, SIZE), %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + +#if defined(LN) || 
defined(LT) + movapd 0 * SIZE(BO), %xmm8 + movapd 2 * SIZE(BO), %xmm9 +#else + movapd 0 * SIZE(AO), %xmm8 + movapd 2 * SIZE(AO), %xmm9 +#endif + +#if (defined(LN) || defined(LT)) && !defined(CONJ) + subpd %xmm0, %xmm8 + subpd %xmm2, %xmm9 +#elif (defined(LN) || defined(LT)) && defined(CONJ) + subpd %xmm1, %xmm8 + subpd %xmm3, %xmm9 +#elif (defined(RN) || defined(RT)) && !defined(CONJ) + subpd %xmm0, %xmm8 + subpd %xmm2, %xmm9 +#else + addsubpd %xmm1, %xmm8 + addsubpd %xmm3, %xmm9 +#endif + +#ifdef CONJ + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 +#endif + +#ifdef LN + movddup 0 * SIZE(AO), %xmm4 + movddup 1 * SIZE(AO), %xmm5 + +#ifdef CONJ + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm8, %xmm12 + movapd %xmm9, %xmm13 + SHUFPD_1 %xmm12, %xmm12 + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm4, %xmm8 + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm12 + mulpd %xmm5, %xmm13 + + addsubpd %xmm12, %xmm8 + addsubpd %xmm13, %xmm9 +#endif + +#ifdef LT + movddup 0 * SIZE(AO), %xmm0 + movddup 1 * SIZE(AO), %xmm1 + +#ifdef CONJ + xorpd %xmm7, %xmm1 +#endif + + movapd %xmm8, %xmm12 + movapd %xmm9, %xmm13 + SHUFPD_1 %xmm12, %xmm12 + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm0, %xmm8 + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm12 + mulpd %xmm1, %xmm13 + + addsubpd %xmm12, %xmm8 + addsubpd %xmm13, %xmm9 +#endif + +#ifdef RN + movddup 0 * SIZE(BO), %xmm0 + movddup 1 * SIZE(BO), %xmm1 + movddup 2 * SIZE(BO), %xmm2 + movddup 3 * SIZE(BO), %xmm3 + movddup 6 * SIZE(BO), %xmm4 + movddup 7 * SIZE(BO), %xmm5 + +#ifdef CONJ + xorpd %xmm7, %xmm1 + xorpd %xmm7, %xmm3 + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm8, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + + mulpd %xmm0, %xmm8 + mulpd %xmm1, %xmm12 + + addsubpd %xmm12, %xmm8 + + movapd %xmm8, %xmm12 + movapd %xmm8, %xmm13 + + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm2, %xmm12 + mulpd %xmm3, %xmm13 + + addsubpd %xmm13, %xmm12 + + subpd %xmm12, %xmm9 + + movapd %xmm9, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + + mulpd %xmm4, %xmm9 + mulpd %xmm5, %xmm12 + + addsubpd %xmm12, %xmm9 +#endif + + +#ifdef RT + movddup 6 * SIZE(BO), %xmm0 + movddup 7 * SIZE(BO), %xmm1 + movddup 4 * SIZE(BO), %xmm2 + movddup 5 * SIZE(BO), %xmm3 + movddup 0 * SIZE(BO), %xmm4 + movddup 1 * SIZE(BO), %xmm5 + +#ifdef CONJ + xorpd %xmm7, %xmm1 + xorpd %xmm7, %xmm3 + xorpd %xmm7, %xmm5 +#endif + + movapd %xmm9, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + + mulpd %xmm0, %xmm9 + mulpd %xmm1, %xmm12 + + addsubpd %xmm12, %xmm9 + + movapd %xmm9, %xmm12 + movapd %xmm9, %xmm13 + + SHUFPD_1 %xmm13, %xmm13 + + mulpd %xmm2, %xmm12 + mulpd %xmm3, %xmm13 + + addsubpd %xmm13, %xmm12 + + subpd %xmm12, %xmm8 + + movapd %xmm8, %xmm12 + SHUFPD_1 %xmm12, %xmm12 + + mulpd %xmm4, %xmm8 + mulpd %xmm5, %xmm12 + + addsubpd %xmm12, %xmm8 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + + movsd %xmm9, 0 * SIZE(CO2) + movhpd %xmm9, 1 * SIZE(CO2) +#else + movsd %xmm8, 0 * SIZE(CO1) + movhpd %xmm8, 1 * SIZE(CO1) + + movsd %xmm9, 0 * SIZE(CO2) + movhpd %xmm9, 1 * SIZE(CO2) +#endif + +#if defined(LN) || defined(LT) + movapd %xmm8, 0 * SIZE(BO) + movapd %xmm9, 2 * SIZE(BO) +#else + movapd %xmm8, 0 * SIZE(AO) + movapd %xmm9, 2 * SIZE(AO) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $0 + 
ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L99: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + decq J # j -- + jg .L01 + + +.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/ztrsm_kernel_RT_2x4_nehalem.S b/kernel/x86_64/ztrsm_kernel_RT_2x4_nehalem.S new file mode 100644 index 0000000000..a5f01340bc --- /dev/null +++ b/kernel/x86_64/ztrsm_kernel_RT_2x4_nehalem.S @@ -0,0 +1,3116 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define OLD_K %rdx + +#define M %r13 +#define N %r14 +#define K %r15 + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %rbx +#define CO2 %rbp +#define KK %rdx +#define BB %r12 + +#ifndef WINDOWS_ABI + +#define STACKSIZE 128 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#define OFFSET 48(%rsp) +#define J 56(%rsp) +#define KKK 64(%rsp) +#define AORIG 72(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#define OFFSET 224(%rsp) +#define J 232(%rsp) +#define KKK 240(%rsp) +#define AORIG 248(%rsp) + +#endif + +#define PREFETCHSIZE (16 * 1 + 4) +#define PREFETCH prefetcht0 + +#define ADD1 addps +#define ADD2 addps + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, OLD_K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C +#endif + + subq $-32 * SIZE, A + subq $-32 * SIZE, B + + movq OLD_M, M + movq OLD_N, N + movq OLD_K, K + movq OLD_LDC, LDC + movq OLD_OFFSET, KK + + salq $ZBASE_SHIFT, LDC + + movq KK, OFFSET + negq KK + +#ifdef LN + movq M, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + movq N, %rax + salq $ZBASE_SHIFT, %rax + imulq K, %rax + addq %rax, B + + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RT + movq N, KK + subq OFFSET, KK +#endif + + testq $1, N + BRANCH + jle .L30 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, B + + subq LDC, C +#endif + + movq C, CO1 +#ifndef RT + addq LDC, C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif + +#ifdef LT + movq OFFSET, KK +#endif + + movq M, I + sarq $1, I + NOBRANCH + jle .L60 + ALIGN_4 + +.L51: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq AORIG, AO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht2 4 * SIZE(CO1) + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L55 + ALIGN_3 + +.L52: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + ADD1 %xmm1, %xmm8 + movddup -32 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movddup 
-30 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + mulps %xmm0, %xmm2 + movaps -24 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movddup -28 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + mulps %xmm0, %xmm2 + movaps -20 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movddup -26 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + mulps %xmm0, %xmm2 + movaps -16 * SIZE(AO), %xmm0 + + subq $ -8 * SIZE, BO + subq $-16 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L52 + ALIGN_3 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_3 + +.L56: + ADD1 %xmm1, %xmm8 + movddup -32 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + mulps %xmm0, %xmm2 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L56 + ALIGN_3 + +.L58: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + salq $ZBASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + + ADD1 %xmm1, %xmm8 + ADD2 %xmm2, %xmm9 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(LN) || defined(LT) + +#ifndef CONJ + xorps %xmm0, %xmm8 +#else + xorps %xmm0, %xmm9 +#endif + +#else + +#ifndef CONJ + xorps %xmm0, %xmm8 +#else + shufps $0xb1, %xmm0, %xmm0 + + xorps %xmm0, %xmm9 +#endif + +#endif + + haddps %xmm9, %xmm8 + + shufps $0xd8, %xmm8, %xmm8 + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(BO), %xmm9 + + subps %xmm8, %xmm9 + movhlps %xmm9, %xmm11 +#else + movaps -32 * SIZE(AO), %xmm9 + + subps %xmm8, %xmm9 +#endif + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + shufps $0xb1, %xmm7, %xmm7 +#endif + +#ifdef LN + movaps -28 * SIZE(AO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + movaps %xmm11, %xmm3 + pshufd $0xb1, %xmm11, %xmm2 + + xorps %xmm7, %xmm2 + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm2 + + subps %xmm3, %xmm9 + subps %xmm2, %xmm9 + + movaps -32 * SIZE(AO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm9 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm9 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + movaps %xmm9, %xmm3 + pshufd $0xb1, %xmm9, %xmm2 + + xorps %xmm7, %xmm2 + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm2 + + subps %xmm3, %xmm11 + subps %xmm2, %xmm11 + + movaps -28 * SIZE(AO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 +#endif + +#if defined(RN) || defined(RT) + movaps -32 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm8 + + xorps %xmm7, %xmm8 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm8 + + addps %xmm8, %xmm9 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 
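+	# LN stores this 2-element result at CO1 - 4 * SIZE, so step the pointer back
+	# before the writes; the non-LN paths advance CO1 after the stores instead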
+#endif + +#if defined(LN) || defined(LT) + movlps %xmm9, -32 * SIZE(BO) + movlps %xmm11, -30 * SIZE(BO) + + movlps %xmm9, 0 * SIZE(CO1) + movlps %xmm11, 2 * SIZE(CO1) +#else + movaps %xmm9, -32 * SIZE(AO) + + movlps %xmm9, 0 * SIZE(CO1) + movhps %xmm9, 2 * SIZE(CO1) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + BRANCH + jg .L51 + ALIGN_4 + +.L60: + testq $1, M + BRANCH + jle .L69 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq AORIG, AO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movsd -32 * SIZE(BO), %xmm5 + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L65 + ALIGN_3 + +.L62: + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movsd -30 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -30 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movsd -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -28 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movsd -26 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -26 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movsd -24 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -24 * SIZE(AO), %xmm0 + + subq $-8 * SIZE, BO + subq $-8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L62 + ALIGN_3 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_3 + +.L66: + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movsd -30 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $2 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L66 + ALIGN_3 + +.L68: +#if defined(LN) || defined(RT) + movq KK, %rax + subq $1, %rax + + salq $ZBASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + + ADD1 %xmm1, %xmm8 + ADD2 %xmm2, %xmm9 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(LN) || defined(LT) + +#ifndef CONJ + xorps %xmm0, %xmm9 + shufps $0xb1, %xmm9, %xmm9 +#else + xorps %xmm0, %xmm8 + shufps $0xb1, %xmm9, %xmm9 +#endif + +#else + +#ifndef CONJ + xorps %xmm0, %xmm9 + shufps $0xb1, %xmm9, %xmm9 +#else + shufps $0xb1, %xmm9, %xmm9 + xorps %xmm0, %xmm9 +#endif + +#endif + + addps %xmm9, %xmm8 + +#if defined(LN) || defined(LT) + movsd -32 * SIZE(BO), %xmm9 +#else + movsd -32 * SIZE(AO), %xmm9 +#endif + + subps %xmm8, %xmm9 + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + shufps $0xb1, %xmm7, %xmm7 +#endif + +#if defined(LN) || defined(LT) + movsd -32 * SIZE(AO), %xmm5 +#endif + +#if 
defined(RN) || defined(RT) + movsd -32 * SIZE(BO), %xmm5 +#endif + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm9 + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm9, -32 * SIZE(BO) +#else + movlps %xmm9, -32 * SIZE(AO) +#endif + + movlps %xmm9, (CO1) + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L69: +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 1), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L30: + testq $2, N + BRANCH + jle .L50 + +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 1), CO2 +#ifndef RT + leaq (C, LDC, 2), C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif + +#ifdef LT + movq OFFSET, KK +#endif + + movq M, I + sarq $1, I + NOBRANCH + jle .L40 + ALIGN_4 + +.L31: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq AORIG, AO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht2 4 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht2 4 * SIZE(CO2) + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L35 + ALIGN_3 + +.L32: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + ADD1 %xmm1, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm10 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm10 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -24 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movaps -24 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm10 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -20 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + movaps -20 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm10 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, BO + subq $-16 * SIZE, AO + + subq $1, %rax + BRANCH + jg 
.L32 + ALIGN_3 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_3 + +.L36: + ADD1 %xmm1, %xmm8 + movaps -32 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + ADD1 %xmm3, %xmm10 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + ADD2 %xmm4, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L36 + ALIGN_3 + +.L38: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + salq $ZBASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + + ADD1 %xmm1, %xmm8 + ADD2 %xmm2, %xmm9 + ADD1 %xmm3, %xmm10 + ADD2 %xmm4, %xmm11 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(LN) || defined(LT) + +#ifndef CONJ + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm10 +#else + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 +#endif + +#else + +#ifndef CONJ + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm10 +#else + shufps $0xb1, %xmm0, %xmm0 + + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 +#endif + +#endif + + haddps %xmm9, %xmm8 + haddps %xmm11, %xmm10 + + shufps $0xd8, %xmm8, %xmm8 + shufps $0xd8, %xmm10, %xmm10 + + movaps %xmm8, %xmm9 + shufps $0xe4, %xmm10, %xmm8 + shufps $0xe4, %xmm9, %xmm10 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm9 + movlhps %xmm10, %xmm8 + movhlps %xmm9, %xmm10 + + movaps -32 * SIZE(BO), %xmm9 + movaps -28 * SIZE(BO), %xmm11 + + subps %xmm8, %xmm9 + subps %xmm10, %xmm11 +#else + movaps -32 * SIZE(AO), %xmm9 + movaps -28 * SIZE(AO), %xmm11 + + subps %xmm8, %xmm9 + subps %xmm10, %xmm11 +#endif + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + shufps $0xb1, %xmm7, %xmm7 +#endif + +#ifdef LN + movaps -28 * SIZE(AO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + movaps %xmm11, %xmm3 + pshufd $0xb1, %xmm11, %xmm2 + + xorps %xmm7, %xmm2 + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm2 + + subps %xmm3, %xmm9 + subps %xmm2, %xmm9 + + movaps -32 * SIZE(AO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm9 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm9 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + movaps %xmm9, %xmm3 + pshufd $0xb1, %xmm9, %xmm2 + + xorps %xmm7, %xmm2 + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm2 + + subps %xmm3, %xmm11 + subps %xmm2, %xmm11 + + movaps -28 * SIZE(AO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm8 + + xorps %xmm7, %xmm8 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm8 + + addps %xmm8, %xmm9 + + movaps %xmm9, %xmm3 + pshufd $0xb1, %xmm9, %xmm2 + + xorps %xmm7, %xmm2 
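+	# xmm3/xmm2 now hold x1 and its component-swapped, sign-flipped copy; the
+	# pshufd broadcasts below pick the off-diagonal entry of this B pair so that
+	# the complex product with x1 can be subtracted from the second column (xmm11)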
+ + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm11 + subps %xmm1, %xmm11 + + movaps -28 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 +#endif + +#ifdef RT + movaps -28 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 + + movaps %xmm11, %xmm3 + pshufd $0xb1, %xmm11, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm9 + subps %xmm1, %xmm9 + + movaps -32 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm8 + + xorps %xmm7, %xmm8 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm8 + + addps %xmm8, %xmm9 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm9, -32 * SIZE(BO) + movaps %xmm11, -28 * SIZE(BO) + + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm11, 2 * SIZE(CO1) + movhps %xmm9, 0 * SIZE(CO2) + movhps %xmm11, 2 * SIZE(CO2) +#else + movaps %xmm9, -32 * SIZE(AO) + movaps %xmm11, -28 * SIZE(AO) + + movsd %xmm9, 0 * SIZE(CO1) + movhps %xmm9, 2 * SIZE(CO1) + movsd %xmm11, 0 * SIZE(CO2) + movhps %xmm11, 2 * SIZE(CO2) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + BRANCH + jg .L31 + ALIGN_4 + +.L40: + testq $1, M + BRANCH + jle .L49 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq AORIG, AO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movaps -32 * SIZE(BO), %xmm5 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L45 + ALIGN_3 + +.L42: + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -30 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -24 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -28 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -20 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -26 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -16 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -24 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, BO + subq $ -8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L42 + 
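+	# end of the 4-way unrolled inner loop for this 1x2 tile; .L45/.L46 below
+	# finish the remaining k & 3 iterations one at a time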
ALIGN_3 + +.L45: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L48 + ALIGN_3 + +.L46: + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + movddup -30 * SIZE(AO), %xmm0 + + addq $2 * SIZE, AO + addq $4 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L46 + ALIGN_3 + +.L48: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + salq $ZBASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + + ADD1 %xmm1, %xmm8 + ADD2 %xmm2, %xmm9 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(LN) || defined(LT) + +#ifndef CONJ + xorps %xmm0, %xmm9 + shufps $0xb1, %xmm9, %xmm9 +#else + xorps %xmm0, %xmm8 + shufps $0xb1, %xmm9, %xmm9 +#endif + +#else + +#ifndef CONJ + xorps %xmm0, %xmm9 + shufps $0xb1, %xmm9, %xmm9 +#else + shufps $0xb1, %xmm9, %xmm9 + xorps %xmm0, %xmm9 +#endif + +#endif + + addps %xmm9, %xmm8 + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(BO), %xmm9 + + subps %xmm8, %xmm9 +#else + movaps -32 * SIZE(AO), %xmm9 + + subps %xmm8, %xmm9 + movhlps %xmm9, %xmm11 +#endif + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + shufps $0xb1, %xmm7, %xmm7 +#endif + +#if defined(LN) || defined(LT) + movsd -32 * SIZE(AO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm9 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm8 + + xorps %xmm7, %xmm8 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm8 + + addps %xmm8, %xmm9 + + movaps %xmm9, %xmm3 + pshufd $0xb1, %xmm9, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm11 + subps %xmm1, %xmm11 + + movaps -28 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 +#endif + +#ifdef RT + movaps -28 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 + + movaps %xmm11, %xmm3 + pshufd $0xb1, %xmm11, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm9 + subps %xmm1, %xmm9 + + movaps -32 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm8 + + xorps %xmm7, %xmm8 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm8 + + addps %xmm8, %xmm9 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm9, -32 * SIZE(BO) + + movlps %xmm9, (CO1) + movhps %xmm9, (CO2) +#else + movlps %xmm9, -32 * SIZE(AO) + movlps %xmm11, -30 * SIZE(AO) + + movlps %xmm9, (CO1) + movlps %xmm11, (CO2) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#ifdef LN + subq $1, KK 
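+	# bookkeeping for the next tile: the running offset KK moves back by this
+	# block's single row under LN (and forward by one under LT just below)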
+#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L49: +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 2), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + ALIGN_4 + +.L50: + movq N, J + sarq $2, J + NOBRANCH + jle .L999 + ALIGN_4 + +.L01: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + movq K, %rax + salq $2 + ZBASE_SHIFT, %rax + subq %rax, B + + leaq (, LDC, 4), %rax + subq %rax, C +#endif + + movq C, CO1 + leaq (C, LDC, 2), CO2 +#ifndef RT + leaq (C, LDC, 4), C +#endif + +#ifdef LN + movq OFFSET, KK + addq M, KK +#endif + + movq K, %rax + salq $ZBASE_SHIFT + 2, %rax + leaq (B, %rax), BB + +#ifdef LT + movq OFFSET, KK +#endif + + movq M, I + sarq $1, I + NOBRANCH + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq AORIG, AO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + prefetchnta -32 * SIZE(BB) + subq $-16 * SIZE, BB + + xorps %xmm1, %xmm1 + movaps -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + prefetcht2 4 * SIZE(CO1) + xorps %xmm9, %xmm9 + prefetcht2 4 * SIZE(CO1, LDC, 1) + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + + xorps %xmm12, %xmm12 + prefetcht2 4 * SIZE(CO2) + xorps %xmm13, %xmm13 + prefetcht2 4 * SIZE(CO2, LDC, 1) + xorps %xmm14, %xmm14 + xorps %xmm15, %xmm15 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L15 + ALIGN_3 + +.L12: + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) + + ADD1 %xmm1, %xmm12 + movaps -32 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm15 + pshufd $0xb1, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + ADD1 %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD1 %xmm5, %xmm10 + ADD2 %xmm6, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -28 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm12 + movaps -24 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm15 + pshufd $0xb1, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + ADD1 %xmm1, %xmm8 + movaps -20 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD1 %xmm5, %xmm10 + ADD2 %xmm6, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -24 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm12 + movaps -16 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm15 + pshufd $0xb1, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + ADD1 %xmm1, %xmm8 + movaps -12 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, 
%xmm2 + + ADD1 %xmm5, %xmm10 + ADD2 %xmm6, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -20 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm12 + movaps -8 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm15 + pshufd $0xb1, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + ADD1 %xmm1, %xmm8 + movaps -4 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + subq $-32 * SIZE, BO + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD1 %xmm5, %xmm10 + ADD2 %xmm6, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -16 * SIZE(AO), %xmm0 + + subq $-16 * SIZE, AO + subq $1, %rax + BRANCH + jg .L12 + ALIGN_3 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_3 + +.L16: + ADD1 %xmm1, %xmm12 + movaps -32 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm13 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm5 + mulps %xmm0, %xmm2 + + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm15 + pshufd $0xb1, %xmm5, %xmm6 + mulps %xmm0, %xmm5 + mulps %xmm0, %xmm6 + + ADD1 %xmm1, %xmm8 + movaps -28 * SIZE(BO), %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xb1, %xmm1, %xmm2 + mulps %xmm0, %xmm1 + pshufd $0x1b, %xmm2, %xmm3 + mulps %xmm0, %xmm2 + + ADD1 %xmm5, %xmm10 + ADD2 %xmm6, %xmm11 + pshufd $0xb1, %xmm3, %xmm4 + mulps %xmm0, %xmm3 + mulps %xmm0, %xmm4 + + movaps -28 * SIZE(AO), %xmm0 + + addq $4 * SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L16 + ALIGN_3 + +.L18: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $4, %rax +#endif + + salq $ZBASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + + ADD1 %xmm1, %xmm12 + ADD2 %xmm2, %xmm13 + ADD1 %xmm3, %xmm14 + ADD2 %xmm4, %xmm15 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(LN) || defined(LT) + +#ifndef CONJ + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm10 + xorps %xmm0, %xmm12 + xorps %xmm0, %xmm14 +#else + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 + xorps %xmm0, %xmm13 + xorps %xmm0, %xmm15 +#endif + +#else + +#ifndef CONJ + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm10 + xorps %xmm0, %xmm12 + xorps %xmm0, %xmm14 +#else + shufps $0xb1, %xmm0, %xmm0 + + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 + xorps %xmm0, %xmm13 + xorps %xmm0, %xmm15 +#endif + +#endif + + haddps %xmm9, %xmm8 + haddps %xmm11, %xmm10 + haddps %xmm13, %xmm12 + haddps %xmm15, %xmm14 + + shufps $0xd8, %xmm8, %xmm8 + shufps $0xd8, %xmm10, %xmm10 + shufps $0xd8, %xmm12, %xmm12 + shufps $0xd8, %xmm14, %xmm14 + + movaps %xmm8, %xmm9 + shufps $0xe4, %xmm10, %xmm8 + shufps $0xe4, %xmm9, %xmm10 + + movaps %xmm12, %xmm13 + shufps $0xe4, %xmm14, %xmm12 + shufps $0xe4, %xmm13, %xmm14 + +#if defined(LN) || defined(LT) + movaps %xmm8, %xmm9 + movlhps %xmm10, %xmm8 + movhlps %xmm9, %xmm10 + + movaps %xmm12, %xmm11 + movlhps %xmm14, %xmm12 + movhlps %xmm11, %xmm14 + + movaps -32 * SIZE(BO), %xmm9 + movaps -28 * SIZE(BO), %xmm13 + movaps -24 * SIZE(BO), %xmm11 + movaps -20 * SIZE(BO), %xmm15 + + subps %xmm8, %xmm9 + subps %xmm10, %xmm11 + subps %xmm12, %xmm13 + subps %xmm14, %xmm15 +#else + movaps -32 * SIZE(AO), %xmm9 + movaps -28 * SIZE(AO), %xmm11 + movaps -24 * SIZE(AO), %xmm13 + movaps -20 * SIZE(AO), %xmm15 + + subps %xmm8, %xmm9 + subps %xmm10, %xmm11 + subps 
%xmm12, %xmm13 + subps %xmm14, %xmm15 +#endif + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + shufps $0xb1, %xmm7, %xmm7 +#endif + +#ifdef LN + movaps -28 * SIZE(AO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + pshufd $0xb1, %xmm15, %xmm14 + + xorps %xmm7, %xmm10 + xorps %xmm7, %xmm14 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + mulps %xmm0, %xmm15 + mulps %xmm1, %xmm14 + + addps %xmm10, %xmm11 + addps %xmm14, %xmm15 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + movaps %xmm11, %xmm3 + pshufd $0xb1, %xmm11, %xmm2 + movaps %xmm15, %xmm5 + pshufd $0xb1, %xmm15, %xmm4 + + xorps %xmm7, %xmm2 + xorps %xmm7, %xmm4 + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm2 + mulps %xmm0, %xmm5 + mulps %xmm1, %xmm4 + + subps %xmm3, %xmm9 + subps %xmm2, %xmm9 + subps %xmm5, %xmm13 + subps %xmm4, %xmm13 + + movaps -32 * SIZE(AO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm10 + pshufd $0xb1, %xmm13, %xmm14 + + xorps %xmm7, %xmm10 + xorps %xmm7, %xmm14 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm10 + mulps %xmm0, %xmm13 + mulps %xmm1, %xmm14 + + addps %xmm10, %xmm9 + addps %xmm14, %xmm13 +#endif + +#ifdef LT + movaps -32 * SIZE(AO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm10 + pshufd $0xb1, %xmm13, %xmm14 + + xorps %xmm7, %xmm10 + xorps %xmm7, %xmm14 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm10 + mulps %xmm0, %xmm13 + mulps %xmm1, %xmm14 + + addps %xmm10, %xmm9 + addps %xmm14, %xmm13 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + movaps %xmm9, %xmm3 + pshufd $0xb1, %xmm9, %xmm2 + movaps %xmm13, %xmm5 + pshufd $0xb1, %xmm13, %xmm4 + + xorps %xmm7, %xmm2 + xorps %xmm7, %xmm4 + + mulps %xmm0, %xmm3 + mulps %xmm1, %xmm2 + mulps %xmm0, %xmm5 + mulps %xmm1, %xmm4 + + subps %xmm3, %xmm11 + subps %xmm2, %xmm11 + subps %xmm5, %xmm15 + subps %xmm4, %xmm15 + + movaps -28 * SIZE(AO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + pshufd $0xb1, %xmm15, %xmm14 + + xorps %xmm7, %xmm10 + xorps %xmm7, %xmm14 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + mulps %xmm0, %xmm15 + mulps %xmm1, %xmm14 + + addps %xmm10, %xmm11 + addps %xmm14, %xmm15 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm8 + + xorps %xmm7, %xmm8 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm8 + + addps %xmm8, %xmm9 + + movaps %xmm9, %xmm3 + pshufd $0xb1, %xmm9, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm11 + subps %xmm1, %xmm11 + + movaps -28 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm13 + subps %xmm1, %xmm13 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm15 + subps %xmm1, %xmm15 + + movaps -24 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 + + movaps %xmm11, %xmm3 + pshufd $0xb1, %xmm11, %xmm2 + + xorps %xmm7, %xmm2 + + movaps -20 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm13 
+ subps %xmm1, %xmm13 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm15 + subps %xmm1, %xmm15 + + movaps -12 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm13, %xmm12 + + xorps %xmm7, %xmm12 + + mulps %xmm0, %xmm13 + mulps %xmm1, %xmm12 + + addps %xmm12, %xmm13 + + movaps %xmm13, %xmm3 + pshufd $0xb1, %xmm13, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm15 + subps %xmm1, %xmm15 + + movaps -4 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm15, %xmm14 + + xorps %xmm7, %xmm14 + + mulps %xmm0, %xmm15 + mulps %xmm1, %xmm14 + + addps %xmm14, %xmm15 +#endif + +#ifdef RT + movaps -4 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm15, %xmm14 + + xorps %xmm7, %xmm14 + + mulps %xmm0, %xmm15 + mulps %xmm1, %xmm14 + + addps %xmm14, %xmm15 + + movaps %xmm15, %xmm3 + pshufd $0xb1, %xmm15, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm13 + subps %xmm1, %xmm13 + + movaps -8 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm11 + subps %xmm1, %xmm11 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm9 + subps %xmm1, %xmm9 + + movaps -12 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm13, %xmm12 + + xorps %xmm7, %xmm12 + + mulps %xmm0, %xmm13 + mulps %xmm1, %xmm12 + + addps %xmm12, %xmm13 + + movaps %xmm13, %xmm3 + pshufd $0xb1, %xmm13, %xmm2 + + xorps %xmm7, %xmm2 + + movaps -16 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm11 + subps %xmm1, %xmm11 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm9 + subps %xmm1, %xmm9 + + + movaps -24 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 + + movaps %xmm11, %xmm3 + pshufd $0xb1, %xmm11, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm9 + subps %xmm1, %xmm9 + + movaps -32 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm8 + + xorps %xmm7, %xmm8 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm8 + + addps %xmm8, %xmm9 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm9, -32 * SIZE(BO) + movaps %xmm13, -28 * SIZE(BO) + movaps %xmm11, -24 * SIZE(BO) + movaps %xmm15, -20 * SIZE(BO) + + movsd %xmm9, 0 * SIZE(CO1) + movsd %xmm11, 2 * SIZE(CO1) + movhps %xmm9, 0 * SIZE(CO1, LDC) + movhps %xmm11, 2 * SIZE(CO1, LDC) + + movsd %xmm13, 0 * SIZE(CO2) + movsd %xmm15, 2 * SIZE(CO2) + movhps %xmm13, 0 * SIZE(CO2, LDC) + movhps %xmm15, 2 * SIZE(CO2, LDC) +#else + movaps %xmm9, -32 * SIZE(AO) + movaps %xmm11, -28 * SIZE(AO) + movaps %xmm13, -24 * SIZE(AO) + movaps %xmm15, -20 
* SIZE(AO) + + movsd %xmm9, 0 * SIZE(CO1) + movhps %xmm9, 2 * SIZE(CO1) + movsd %xmm11, 0 * SIZE(CO1, LDC) + movhps %xmm11, 2 * SIZE(CO1, LDC) + movsd %xmm13, 0 * SIZE(CO2) + movhps %xmm13, 2 * SIZE(CO2) + movsd %xmm15, 0 * SIZE(CO2, LDC) + movhps %xmm15, 2 * SIZE(CO2, LDC) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $2, KK +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + BRANCH + jg .L11 + ALIGN_4 + +.L20: + testq $1, M + BRANCH + jle .L29 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq AORIG, AO + + movq KK, %rax + salq $ZBASE_SHIFT, %rax + + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#else + movq B, BO +#endif + + xorps %xmm1, %xmm1 + movddup -32 * SIZE(AO), %xmm0 + xorps %xmm2, %xmm2 + movaps -32 * SIZE(BO), %xmm5 + xorps %xmm3, %xmm3 + xorps %xmm4, %xmm4 + + xorps %xmm8, %xmm8 + xorps %xmm9, %xmm9 + xorps %xmm10, %xmm10 + xorps %xmm11, %xmm11 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + NOBRANCH + jle .L25 + ALIGN_3 + +.L22: + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + ADD1 %xmm3, %xmm10 + pshufd $0xa0, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0xf5, %xmm5, %xmm4 + movaps -24 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm4 + movddup -30 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -20 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + ADD1 %xmm3, %xmm10 + pshufd $0xa0, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0xf5, %xmm5, %xmm4 + movaps -16 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm4 + movddup -28 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -12 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + ADD1 %xmm3, %xmm10 + pshufd $0xa0, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0xf5, %xmm5, %xmm4 + movaps -8 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm4 + movddup -26 * SIZE(AO), %xmm0 + + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -4 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + ADD1 %xmm3, %xmm10 + pshufd $0xa0, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0xf5, %xmm5, %xmm4 + movaps 0 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm4 + movddup -24 * SIZE(AO), %xmm0 + + subq $-32 * SIZE, BO + subq $ -8 * SIZE, AO + + subq $1, %rax + BRANCH + jg .L22 + ALIGN_3 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_3 + +.L26: + ADD1 %xmm1, %xmm8 + pshufd $0xa0, %xmm5, %xmm1 + mulps %xmm0, %xmm1 + ADD2 %xmm2, %xmm9 + pshufd $0xf5, %xmm5, %xmm2 + movaps -28 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm2 + ADD1 %xmm3, %xmm10 + pshufd $0xa0, %xmm5, %xmm3 + mulps %xmm0, %xmm3 + ADD2 %xmm4, %xmm11 + pshufd $0xf5, %xmm5, %xmm4 + movaps -24 * SIZE(BO), %xmm5 + mulps %xmm0, %xmm4 + movddup -30 * SIZE(AO), %xmm0 + + addq $2 * 
SIZE, AO + addq $8 * SIZE, BO + + subq $1, %rax + BRANCH + jg .L26 + ALIGN_3 + +.L28: +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $4, %rax +#endif + + salq $ZBASE_SHIFT, %rax + + movq AORIG, AO + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#endif + + ADD1 %xmm1, %xmm8 + ADD2 %xmm2, %xmm9 + ADD1 %xmm3, %xmm10 + ADD2 %xmm4, %xmm11 + + pcmpeqb %xmm0, %xmm0 + psllq $63, %xmm0 + +#if defined(LN) || defined(LT) + +#ifndef CONJ + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm11, %xmm11 +#else + xorps %xmm0, %xmm8 + xorps %xmm0, %xmm10 + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm11, %xmm11 +#endif + +#else + +#ifndef CONJ + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm11, %xmm11 +#else + shufps $0xb1, %xmm9, %xmm9 + shufps $0xb1, %xmm11, %xmm11 + xorps %xmm0, %xmm9 + xorps %xmm0, %xmm11 +#endif + +#endif + + addps %xmm9, %xmm8 + addps %xmm11, %xmm10 + +#if defined(LN) || defined(LT) + movaps -32 * SIZE(BO), %xmm9 + movaps -28 * SIZE(BO), %xmm11 + + subps %xmm8, %xmm9 + subps %xmm10, %xmm11 +#else + movaps -32 * SIZE(AO), %xmm9 + movaps -28 * SIZE(AO), %xmm13 + + subps %xmm8, %xmm9 + subps %xmm10, %xmm13 + + movhlps %xmm9, %xmm11 + movhlps %xmm13, %xmm15 +#endif + + pcmpeqb %xmm7, %xmm7 + psllq $63, %xmm7 + +#ifndef CONJ + shufps $0xb1, %xmm7, %xmm7 +#endif + +#if defined(LN) || defined(LT) + movsd -32 * SIZE(AO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm10 + pshufd $0xb1, %xmm11, %xmm12 + + xorps %xmm7, %xmm10 + xorps %xmm7, %xmm12 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm10 + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm12 + + addps %xmm10, %xmm9 + addps %xmm12, %xmm11 +#endif + +#ifdef RN + movaps -32 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm8 + + xorps %xmm7, %xmm8 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm8 + + addps %xmm8, %xmm9 + + movaps %xmm9, %xmm3 + pshufd $0xb1, %xmm9, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm11 + subps %xmm1, %xmm11 + + movaps -28 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm13 + subps %xmm1, %xmm13 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm15 + subps %xmm1, %xmm15 + + movaps -24 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 + + movaps %xmm11, %xmm3 + pshufd $0xb1, %xmm11, %xmm2 + + xorps %xmm7, %xmm2 + + movaps -20 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm13 + subps %xmm1, %xmm13 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm15 + subps %xmm1, %xmm15 + + movaps -12 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm13, %xmm12 + + xorps %xmm7, %xmm12 + + mulps %xmm0, %xmm13 + mulps %xmm1, %xmm12 + + addps %xmm12, %xmm13 + + movaps %xmm13, %xmm3 + pshufd $0xb1, %xmm13, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, 
%xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm15 + subps %xmm1, %xmm15 + + movaps -4 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm15, %xmm14 + + xorps %xmm7, %xmm14 + + mulps %xmm0, %xmm15 + mulps %xmm1, %xmm14 + + addps %xmm14, %xmm15 +#endif + +#ifdef RT + movaps -4 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm15, %xmm14 + + xorps %xmm7, %xmm14 + + mulps %xmm0, %xmm15 + mulps %xmm1, %xmm14 + + addps %xmm14, %xmm15 + + movaps %xmm15, %xmm3 + pshufd $0xb1, %xmm15, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm13 + subps %xmm1, %xmm13 + + movaps -8 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm11 + subps %xmm1, %xmm11 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm9 + subps %xmm1, %xmm9 + + movaps -12 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm13, %xmm12 + + xorps %xmm7, %xmm12 + + mulps %xmm0, %xmm13 + mulps %xmm1, %xmm12 + + addps %xmm12, %xmm13 + + movaps %xmm13, %xmm3 + pshufd $0xb1, %xmm13, %xmm2 + + xorps %xmm7, %xmm2 + + movaps -16 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm11 + subps %xmm1, %xmm11 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm9 + subps %xmm1, %xmm9 + + movaps -24 * SIZE(BO), %xmm5 + + pshufd $0xaa, %xmm5, %xmm0 + pshufd $0xff, %xmm5, %xmm1 + + pshufd $0xb1, %xmm11, %xmm10 + + xorps %xmm7, %xmm10 + + mulps %xmm0, %xmm11 + mulps %xmm1, %xmm10 + + addps %xmm10, %xmm11 + + movaps %xmm11, %xmm3 + pshufd $0xb1, %xmm11, %xmm2 + + xorps %xmm7, %xmm2 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + mulps %xmm3, %xmm0 + mulps %xmm2, %xmm1 + + subps %xmm0, %xmm9 + subps %xmm1, %xmm9 + + movaps -32 * SIZE(BO), %xmm5 + + pshufd $0x00, %xmm5, %xmm0 + pshufd $0x55, %xmm5, %xmm1 + + pshufd $0xb1, %xmm9, %xmm8 + + xorps %xmm7, %xmm8 + + mulps %xmm0, %xmm9 + mulps %xmm1, %xmm8 + + addps %xmm8, %xmm9 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm9, -32 * SIZE(BO) + movaps %xmm11, -28 * SIZE(BO) + + movsd %xmm9, (CO1) + movhps %xmm9, (CO1, LDC) + movsd %xmm11, (CO2) + movhps %xmm11, (CO2, LDC) +#else + movlhps %xmm11, %xmm9 + movlhps %xmm15, %xmm13 + + movaps %xmm9, -32 * SIZE(AO) + movaps %xmm13, -28 * SIZE(AO) + + movlps %xmm9, (CO1) + movlps %xmm11, (CO1, LDC) + movlps %xmm13, (CO2) + movlps %xmm15, (CO2, LDC) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#ifdef LN + subq $1, KK +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L29: +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 4), B +#endif + +#if defined(LT) || defined(RN) + movq BO, B +#endif + +#ifdef RN + addq $4, KK +#endif + +#ifdef RT + subq $4, KK +#endif + + subq $1, J + BRANCH + jg .L01 + ALIGN_4 + 
+.L999: + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/ztrsm_kernel_RT_4x2_sse.S b/kernel/x86_64/ztrsm_kernel_RT_4x2_sse.S new file mode 100644 index 0000000000..85c0ac231c --- /dev/null +++ b/kernel/x86_64/ztrsm_kernel_RT_4x2_sse.S @@ -0,0 +1,4005 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define J %r12 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define CO2 %rbp + +#ifndef WINDOWS_ABI + +#define STACKSIZE 64 + +#define OLD_LDC 8 + STACKSIZE(%rsp) +#define OLD_OFFSET 16 + STACKSIZE(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) +#define OLD_A 48 + STACKSIZE(%rsp) +#define OLD_B 56 + STACKSIZE(%rsp) +#define OLD_C 64 + STACKSIZE(%rsp) +#define OLD_LDC 72 + STACKSIZE(%rsp) +#define OLD_OFFSET 80 + STACKSIZE(%rsp) + +#endif + +#define POSINV 0(%rsp) +#define OFFSET 16(%rsp) +#define KK 24(%rsp) +#define KKK 32(%rsp) +#define AORIG 40(%rsp) +#define BORIG 48(%rsp) +#define BUFFER 128(%rsp) + +#ifdef OPTERON +#define movsd movlps +#endif + +#if defined(PENTIUM4) || defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(ATOM) || defined(NANO) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHNTA prefetchnta +#define PREFETCHSIZE (8 * 6 + 4) +#endif + +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHNTA prefetchnta +#define PREFETCHSIZE (8 * 6 + 4) +#endif + +#ifdef GENERIC +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHNTA prefetchnta +#define PREFETCHSIZE (8 * 6 + 4) +#endif + +#define KERNEL1(xx) \ + mulps %xmm8, %xmm9 ;\ + addps %xmm9, %xmm0 ;\ + movaps 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulps %xmm8, %xmm11 ;\ + PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\ + addps %xmm11, %xmm1 ;\ + movaps 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulps %xmm8, %xmm13 ;\ + mulps 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\ + addps %xmm13, %xmm2 ;\ + movaps 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addps %xmm8, %xmm3 ;\ + movaps 16 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 + +#define KERNEL2(xx) \ + mulps %xmm10, %xmm9 ;\ + addps %xmm9, %xmm4 ;\ + movaps 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulps %xmm10, %xmm11 ;\ + addps %xmm11, %xmm5 ;\ + movaps 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulps %xmm10, %xmm13 ;\ + mulps 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ + addps %xmm13, %xmm6 ;\ + movaps 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addps %xmm10, %xmm7 ;\ + movaps 20 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 + +#define KERNEL3(xx) \ + mulps %xmm12, %xmm15 ;\ + addps %xmm15, %xmm0 ;\ + movaps 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulps %xmm12, %xmm11 ;\ + addps %xmm11, %xmm1 ;\ + movaps 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulps %xmm12, %xmm13 ;\ + mulps 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\ + addps %xmm13, %xmm2 ;\ + movaps 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addps %xmm12, %xmm3 ;\ + movaps 24 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 + +#define KERNEL4(xx) \ + mulps %xmm14, %xmm15 ;\ + addps %xmm15, %xmm4 ;\ + movaps 48 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulps %xmm14, %xmm11 ;\ + addps %xmm11, %xmm5 ;\ + movaps 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulps %xmm14, %xmm13 ;\ + mulps 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ + addps %xmm13, %xmm6 ;\ + movaps 40 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addps %xmm14, %xmm7 ;\ + movaps 28 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 + +#define KERNEL5(xx) \ + mulps %xmm8, %xmm9 ;\ + addps %xmm9, %xmm0 ;\ + 
movaps 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulps %xmm8, %xmm11 ;\ + PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ;\ + addps %xmm11, %xmm1 ;\ + movaps 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulps %xmm8, %xmm13 ;\ + mulps 44 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\ + addps %xmm13, %xmm2 ;\ + movaps 40 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addps %xmm8, %xmm3 ;\ + movaps 32 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 + +#define KERNEL6(xx) \ + mulps %xmm10, %xmm9 ;\ + addps %xmm9, %xmm4 ;\ + movaps 64 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ + mulps %xmm10, %xmm11 ;\ + addps %xmm11, %xmm5 ;\ + movaps 52 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulps %xmm10, %xmm13 ;\ + mulps 44 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ + addps %xmm13, %xmm6 ;\ + movaps 56 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addps %xmm10, %xmm7 ;\ + movaps 36 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 + +#define KERNEL7(xx) \ + mulps %xmm12, %xmm15 ;\ + addps %xmm15, %xmm0 ;\ + movaps 48 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulps %xmm12, %xmm11 ;\ + addps %xmm11, %xmm1 ;\ + movaps 52 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulps %xmm12, %xmm13 ;\ + mulps 60 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\ + addps %xmm13, %xmm2 ;\ + movaps 56 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addps %xmm12, %xmm3 ;\ + movaps 40 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 + +#define KERNEL8(xx) \ + mulps %xmm14, %xmm15 ;\ + addps %xmm15, %xmm4 ;\ + movaps 80 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ + mulps %xmm14, %xmm11 ;\ + addps %xmm11, %xmm5 ;\ + movaps 68 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ + mulps %xmm14, %xmm13 ;\ + mulps 60 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ + addps %xmm13, %xmm6 ;\ + movaps 72 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ + addps %xmm14, %xmm7 ;\ + movaps 44 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + + movq %rbx, 0(%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC + movsd OLD_OFFSET, %xmm4 +#else + movq OLD_LDC, LDC + movsd OLD_OFFSET, %xmm4 +#endif + + movq %rsp, %rbx # save old stack + subq $128 + LOCAL_BUFFER_SIZE, %rsp + andq $-4096, %rsp # align stack + + STACK_TOUCHING + + movq OLD_M, M + movq OLD_N, N + + pxor %xmm15, %xmm15 + cmpeqps %xmm15, %xmm15 + pslld $31, %xmm15 # Generate mask + pxor %xmm2, %xmm2 + +#ifndef CONJ + movss %xmm15, 0 + POSINV + movss %xmm2, 4 + POSINV + movss %xmm15, 8 + POSINV + movss %xmm2, 12 + POSINV +#else + movss %xmm2, 0 + POSINV + movss %xmm15, 4 + POSINV + movss %xmm2, 8 + POSINV + movss %xmm15, 12 + POSINV +#endif + + movlpd %xmm4, OFFSET + movlpd %xmm4, KK + + salq $ZBASE_SHIFT, LDC + +#ifdef LN + movq M, %rax + salq $ZBASE_SHIFT, %rax + addq %rax, C + imulq K, %rax + addq %rax, A +#endif + +#ifdef RT + movq N, %rax + salq $ZBASE_SHIFT, %rax + imulq K, %rax + addq %rax, B + + movq N, %rax + imulq LDC, %rax + addq %rax, C +#endif + +#ifdef RN + negq KK +#endif + +#ifdef RT + movq N, %rax + subq OFFSET, %rax + movq %rax, KK +#endif + + testq $1, N + je .L40 + ALIGN_4 + +#ifdef LN + movq OFFSET, %rax + addq M, %rax 
+ movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L43 + ALIGN_4 + +.L42: + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + addq $ 8 * SIZE, B + addq $32 * SIZE, BO + decq %rax + jne .L42 + ALIGN_4 + +.L43: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L50 + ALIGN_4 + +.L44: + movlps 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + + addq $2 * SIZE, B + addq $8 * SIZE, BO + decq %rax + jne .L44 + ALIGN_4 + +.L50: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + subq LDC, C +#endif + + movq C, CO1 # coffset1 = c + +#ifndef RT + addq LDC, C +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#ifdef LN + movq K, %rax + salq $2 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 4), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + movaps 32 * SIZE(AO), %xmm12 + movaps 48 * SIZE(AO), %xmm14 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + + PREFETCHW 4 * SIZE(CO1) + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L55 + ALIGN_4 + +.L52: + mulps %xmm8, %xmm9 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 0 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 4 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps 8 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 12 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps 64 * SIZE(AO), %xmm8 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm10, %xmm11 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm11, %xmm0 + movaps 16 * SIZE(BO), %xmm11 + 
addps %xmm10, %xmm1 + movaps 20 * SIZE(AO), %xmm10 + mulps %xmm10, %xmm11 + mulps 20 * SIZE(BO), %xmm10 + addps %xmm11, %xmm4 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm10, %xmm5 + movaps 24 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm11 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm11, %xmm0 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm10, %xmm1 + movaps 28 * SIZE(AO), %xmm10 + mulps %xmm10, %xmm11 + mulps 28 * SIZE(BO), %xmm10 + addps %xmm11, %xmm4 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm10, %xmm5 + movaps 80 * SIZE(AO), %xmm10 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) +#endif + mulps %xmm12, %xmm13 + mulps 36 * SIZE(BO), %xmm12 + addps %xmm13, %xmm0 + movaps 32 * SIZE(BO), %xmm13 + addps %xmm12, %xmm1 + movaps 36 * SIZE(AO), %xmm12 + mulps %xmm12, %xmm13 + mulps 36 * SIZE(BO), %xmm12 + addps %xmm13, %xmm4 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm12, %xmm5 + movaps 40 * SIZE(AO), %xmm12 + + mulps %xmm12, %xmm13 + mulps 44 * SIZE(BO), %xmm12 + addps %xmm13, %xmm0 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm12, %xmm1 + movaps 44 * SIZE(AO), %xmm12 + mulps %xmm12, %xmm13 + mulps 44 * SIZE(BO), %xmm12 + addps %xmm13, %xmm4 + movaps 96 * SIZE(BO), %xmm13 + addps %xmm12, %xmm5 + movaps 96 * SIZE(AO), %xmm12 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) +#endif + mulps %xmm14, %xmm15 + mulps 52 * SIZE(BO), %xmm14 + addps %xmm15, %xmm0 + movaps 48 * SIZE(BO), %xmm15 + addps %xmm14, %xmm1 + movaps 52 * SIZE(AO), %xmm14 + mulps %xmm14, %xmm15 + mulps 52 * SIZE(BO), %xmm14 + addps %xmm15, %xmm4 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm14, %xmm5 + movaps 56 * SIZE(AO), %xmm14 + + mulps %xmm14, %xmm15 + mulps 60 * SIZE(BO), %xmm14 + addps %xmm15, %xmm0 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm14, %xmm1 + movaps 60 * SIZE(AO), %xmm14 + mulps %xmm14, %xmm15 + mulps 60 * SIZE(BO), %xmm14 + addps %xmm15, %xmm4 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm14, %xmm5 + movaps 112 * SIZE(AO), %xmm14 + + addq $64 * SIZE, AO + addq $64 * SIZE, BO + + + decq %rax + jne .L52 + ALIGN_4 + +.L55: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movaps POSINV, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L58 + ALIGN_4 + +.L56: + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 0 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 4 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm4 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm5 + movaps 8 * SIZE(AO), %xmm8 + + addq $ 8 * SIZE, AO # aoffset += 4 + addq $ 8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L56 + ALIGN_4 + +.L58: + shufps $0xb1, %xmm1, %xmm1 + shufps $0xb1, %xmm5, %xmm5 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm5 +#else + xorps %xmm15, %xmm0 + xorps %xmm15, %xmm4 +#endif +#else + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm5 +#endif + + addps %xmm1, %xmm0 + addps %xmm5, %xmm4 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm1 + unpcklpd %xmm2, %xmm0 + unpckhpd %xmm2, %xmm1 + + movaps %xmm4, %xmm5 + unpcklpd %xmm6, %xmm4 + unpckhpd %xmm6, %xmm5 + +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd 0 * SIZE(B), %xmm2 +#ifdef movsd + xorps 
%xmm3, %xmm3 +#endif + movsd 2 * SIZE(B), %xmm3 +#ifdef movsd + xorps %xmm6, %xmm6 +#endif + movsd 4 * SIZE(B), %xmm6 +#ifdef movsd + xorps %xmm7, %xmm7 +#endif + movsd 6 * SIZE(B), %xmm7 + + subps %xmm0, %xmm2 + subps %xmm1, %xmm3 + subps %xmm4, %xmm6 + subps %xmm5, %xmm7 +#else + movaps 0 * SIZE(AO), %xmm1 + movaps 4 * SIZE(AO), %xmm3 + + subps %xmm0, %xmm1 + subps %xmm4, %xmm3 +#endif + +#ifdef LN + movaps 28 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm7 + +#ifndef CONJ + xorps %xmm15, %xmm7 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm7 + addps %xmm0, %xmm7 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm6 + subps %xmm1, %xmm6 + + movaps 24 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm3 + subps %xmm1, %xmm3 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm2 + subps %xmm1, %xmm2 + + movaps 20 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm6 + +#ifndef CONJ + xorps %xmm15, %xmm6 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm6 + addps %xmm0, %xmm6 + + movaps 16 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm3 + subps %xmm1, %xmm3 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm2 + subps %xmm1, %xmm2 + + movaps 8 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm3 + addps %xmm0, %xmm3 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm2 + subps %xmm1, %xmm2 + + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else 
+ xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm3 + subps %xmm1, %xmm3 + + movaps 4 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm6 + subps %xmm1, %xmm6 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm7 + subps %xmm1, %xmm7 + + movaps 8 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm3 + addps %xmm0, %xmm3 + + movaps 12 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm6 + subps %xmm1, %xmm6 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm7 + subps %xmm1, %xmm7 + + movaps 20 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm6 + +#ifndef CONJ + xorps %xmm15, %xmm6 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm6 + addps %xmm0, %xmm6 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm7 + subps %xmm1, %xmm7 + + movaps 28 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm7 + +#ifndef CONJ + xorps %xmm15, %xmm7 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm7 + addps %xmm0, %xmm7 +#endif + +#if defined(RN) || defined(RT) + movaps 0 * SIZE(B), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm4 + pshufd $0xf5, %xmm1, %xmm1 + + pshufd $0xa0, %xmm3, %xmm6 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm4 + xorps %xmm15, %xmm6 +#endif + + mulps %xmm9, %xmm4 + mulps %xmm9, %xmm6 + mulps %xmm10, %xmm1 + mulps %xmm10, %xmm3 + + addps %xmm4, %xmm1 + addps %xmm6, %xmm3 +#endif + +#ifdef LN + subq $8 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(B) + movlps %xmm3, 2 * SIZE(B) + movlps %xmm6, 4 * SIZE(B) + movlps %xmm7, 6 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) 
+ + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + + movaps %xmm0, 8 * SIZE(BO) + movaps %xmm1, 12 * SIZE(BO) + + pshufd $0x00, %xmm6, %xmm0 + pshufd $0x55, %xmm6, %xmm1 + + movaps %xmm0, 16 * SIZE(BO) + movaps %xmm1, 20 * SIZE(BO) + + pshufd $0x00, %xmm7, %xmm0 + pshufd $0x55, %xmm7, %xmm1 + + movaps %xmm0, 24 * SIZE(BO) + movaps %xmm1, 28 * SIZE(BO) + + movlps %xmm2, 0 * SIZE(CO1) + movlps %xmm3, 2 * SIZE(CO1) + movlps %xmm6, 4 * SIZE(CO1) + movlps %xmm7, 6 * SIZE(CO1) +#else + movaps %xmm1, 0 * SIZE(AO) + movaps %xmm3, 4 * SIZE(AO) + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm3, 4 * SIZE(CO1) + movhps %xmm3, 6 * SIZE(CO1) +#endif + +#ifndef LN + addq $8 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L51 + ALIGN_4 + +.L60: + testq $2, M + je .L70 + +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L65 + ALIGN_4 + +.L62: + mulps %xmm8, %xmm9 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 4 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps 8 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm11 + mulps 20 * SIZE(BO), %xmm8 + addps %xmm11, %xmm0 + movaps 24 * SIZE(BO), %xmm11 + addps %xmm8, %xmm1 + movaps 12 * SIZE(AO), %xmm8 + mulps %xmm8, %xmm11 + mulps 28 * SIZE(BO), %xmm8 + addps %xmm11, %xmm2 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm8, %xmm3 + movaps 32 * SIZE(AO), %xmm8 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm10, %xmm13 + mulps 36 * SIZE(BO), %xmm10 + addps %xmm13, %xmm0 + movaps 40 * SIZE(BO), %xmm13 + addps %xmm10, %xmm1 + movaps 20 * SIZE(AO), %xmm10 + mulps %xmm10, %xmm13 + mulps 44 * SIZE(BO), %xmm10 + addps %xmm13, %xmm2 + movaps 96 * SIZE(BO), %xmm13 + addps %xmm10, %xmm3 + movaps 24 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm15 + mulps 52 * SIZE(BO), %xmm10 + addps %xmm15, %xmm0 + movaps 56 * SIZE(BO), %xmm15 + addps %xmm10, %xmm1 + movaps 28 * SIZE(AO), %xmm10 + mulps %xmm10, %xmm15 + mulps 60 * SIZE(BO), %xmm10 + addps %xmm15, %xmm2 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm10, %xmm3 + movaps 48 * SIZE(AO), %xmm10 + + addq $32 * SIZE, AO + addq $64 * SIZE, BO + + decq %rax + jne .L62 + ALIGN_4 + +.L65: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movaps POSINV, %xmm15 
+ andq $7, %rax # if (k & 1) + BRANCH + je .L68 + ALIGN_4 + +.L66: + mulps %xmm8, %xmm9 + mulps 4 * SIZE(BO), %xmm8 + addps %xmm9, %xmm0 + movaps 8 * SIZE(BO), %xmm9 + addps %xmm8, %xmm1 + movaps 4 * SIZE(AO), %xmm8 + + addq $4 * SIZE, AO # aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L66 + ALIGN_4 + +.L68: + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + + shufps $0xb1, %xmm1, %xmm1 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif +#else + xorps %xmm15, %xmm1 +#endif + + addps %xmm1, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm1 + unpcklpd %xmm2, %xmm0 + unpckhpd %xmm2, %xmm1 + +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd 0 * SIZE(B), %xmm2 +#ifdef movsd + xorps %xmm3, %xmm3 +#endif + movsd 2 * SIZE(B), %xmm3 + + subps %xmm0, %xmm2 + subps %xmm1, %xmm3 +#else + movaps 0 * SIZE(AO), %xmm1 + subps %xmm0, %xmm1 +#endif + +#ifdef LN + movaps 4 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm3 + addps %xmm0, %xmm3 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm2 + subps %xmm1, %xmm2 + + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm3 + subps %xmm1, %xmm3 + + movaps 4 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm3 + addps %xmm0, %xmm3 +#endif + +#if defined(RN) || defined(RT) + movaps 0 * SIZE(B), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm4 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm4 +#endif + + mulps %xmm9, %xmm4 + mulps %xmm10, %xmm1 + + addps %xmm4, %xmm1 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(B) + movlps %xmm3, 2 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * 
SIZE(BO) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + + movaps %xmm0, 8 * SIZE(BO) + movaps %xmm1, 12 * SIZE(BO) + + movlps %xmm2, 0 * SIZE(CO1) + movlps %xmm3, 2 * SIZE(CO1) +#else + movaps %xmm1, 0 * SIZE(AO) + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L70: + testq $1, M + je .L79 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movsd 0 * SIZE(AO), %xmm8 + movhps 2 * SIZE(AO), %xmm8 + movsd 8 * SIZE(AO), %xmm10 + movhps 10 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L75 + ALIGN_4 + +.L72: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movaps 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 4 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movaps 64 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd 6 * SIZE(AO), %xmm8 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movaps 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd 16 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movaps 80 * SIZE(BO), %xmm11 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movaps 36 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd 10 * SIZE(AO), %xmm10 + addps %xmm13, %xmm1 + movaps 40 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movaps 44 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd 12 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movaps 96 * SIZE(BO), %xmm13 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movaps 52 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd 14 * SIZE(AO), %xmm10 + addps %xmm15, %xmm1 + movaps 56 * SIZE(BO), %xmm15 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movaps 60 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd 24 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movaps 112 * SIZE(BO), %xmm15 + + addq $16 * SIZE, AO + addq $64 * SIZE, BO + + decq %rax + jne .L72 + ALIGN_4 + +.L75: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movaps POSINV, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L78 + ALIGN_4 + +.L76: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + + addq $2 * SIZE, AO # 
aoffset += 4 + addq $8 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L76 + ALIGN_4 + +.L78: + addps %xmm2, %xmm0 + addps %xmm3, %xmm1 + + shufps $0xb1, %xmm1, %xmm1 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif +#else + xorps %xmm15, %xmm1 +#endif + + addps %xmm1, %xmm0 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $1, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), B + leaq (BO, %rax, 4), BO +#endif + +#if defined(LN) || defined(LT) +#ifdef movsd + xorps %xmm2, %xmm2 +#endif + movsd 0 * SIZE(B), %xmm2 + + subps %xmm0, %xmm2 +#else +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(AO), %xmm1 + + subps %xmm0, %xmm1 +#endif + +#if defined(LN) || defined(LT) + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 +#endif + +#if defined(RN) || defined(RT) + movaps 0 * SIZE(B), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm4 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm4 +#endif + + mulps %xmm9, %xmm4 + mulps %xmm10, %xmm1 + + addps %xmm4, %xmm1 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 +#endif + +#if defined(LN) || defined(LT) + movlps %xmm2, 0 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + + movlps %xmm2, 0 * SIZE(CO1) +#else + movlps %xmm1, 0 * SIZE(AO) + + movlps %xmm1, 0 * SIZE(CO1) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $2 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L79: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, COMPSIZE), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, COMPSIZE), B +#endif + +#ifdef RN + addq $1, KK +#endif + +#ifdef RT + subq $1, KK +#endif + ALIGN_4 + +.L40: + movq N, J + sarq $1, J # j = (n >> 2) + jle .L999 + ALIGN_4 + +.L01: +#ifdef LN + movq OFFSET, %rax + addq M, %rax + movq %rax, KK +#endif + + leaq BUFFER, BO + +#ifdef RT + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, B +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq B, BORIG + salq $ZBASE_SHIFT, %rax + leaq (B, %rax, 2), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LT) + movq OFFSET, %rax + movq %rax, KK +#endif + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $2, %rax + jle .L03 + ALIGN_4 + +.L02: + movaps 0 * SIZE(B), %xmm3 + movaps 4 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + 
movaps %xmm4, 16 * SIZE(BO) + movaps %xmm5, 20 * SIZE(BO) + movaps %xmm6, 24 * SIZE(BO) + movaps %xmm7, 28 * SIZE(BO) + + movaps 8 * SIZE(B), %xmm3 + movaps 12 * SIZE(B), %xmm7 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 32 * SIZE(BO) + movaps %xmm1, 36 * SIZE(BO) + movaps %xmm2, 40 * SIZE(BO) + movaps %xmm3, 44 * SIZE(BO) + + pshufd $0x00, %xmm7, %xmm4 + pshufd $0x55, %xmm7, %xmm5 + pshufd $0xaa, %xmm7, %xmm6 + pshufd $0xff, %xmm7, %xmm7 + + movaps %xmm4, 48 * SIZE(BO) + movaps %xmm5, 52 * SIZE(BO) + movaps %xmm6, 56 * SIZE(BO) + movaps %xmm7, 60 * SIZE(BO) + + addq $16 * SIZE, B + addq $64 * SIZE, BO + decq %rax + jne .L02 + ALIGN_4 + +.L03: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $3, %rax + BRANCH + jle .L10 + ALIGN_4 + +.L04: + movaps 0 * SIZE(B), %xmm3 + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm2 + pshufd $0xff, %xmm3, %xmm3 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm2, 8 * SIZE(BO) + movaps %xmm3, 12 * SIZE(BO) + + addq $ 4 * SIZE, B + addq $16 * SIZE, BO + decq %rax + jne .L04 + ALIGN_4 + +.L10: +#if defined(LT) || defined(RN) + movq A, AO +#else + movq A, AORIG +#endif + +#ifdef RT + leaq (, LDC, 2), %rax + subq %rax, C +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 + +#ifndef RT + leaq (C, LDC, 2), C +#endif + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L20 + ALIGN_4 + +.L11: +#ifdef LN + movq K, %rax + salq $2 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 4), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(BO), %xmm9 + movaps 4 * SIZE(BO), %xmm11 + movaps 8 * SIZE(BO), %xmm13 + movaps 16 * SIZE(BO), %xmm15 + + movaps 0 * SIZE(AO), %xmm8 + pxor %xmm0, %xmm0 + movaps 4 * SIZE(AO), %xmm10 + pxor %xmm1, %xmm1 + movaps 8 * SIZE(AO), %xmm12 + pxor %xmm2, %xmm2 + movaps 12 * SIZE(AO), %xmm14 + pxor %xmm3, %xmm3 + + PREFETCHW 7 * SIZE(CO1) + pxor %xmm4, %xmm4 + PREFETCHW 7 * SIZE(CO2) + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + andq $-8, %rax + salq $4, %rax + je .L15 +.L1X: + KERNEL1(32 * 0) + KERNEL2(32 * 0) + KERNEL3(32 * 0) + KERNEL4(32 * 0) + KERNEL5(32 * 0) + KERNEL6(32 * 0) + KERNEL7(32 * 0) + KERNEL8(32 * 0) + KERNEL1(32 * 1) + KERNEL2(32 * 1) + KERNEL3(32 * 1) + KERNEL4(32 * 1) + KERNEL5(32 * 1) + KERNEL6(32 * 1) + KERNEL7(32 * 1) + KERNEL8(32 * 1) + + addq $32 * 2 * SIZE, AO + addq $64 * 2 * SIZE, BO + subq $64 * 2, %rax + jg .L1X + +.L12: + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO + ALIGN_4 + +.L15: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movaps POSINV, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L18 + ALIGN_4 + +.L16: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 0 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps 8 * SIZE(AO), %xmm8 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm4 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm5 + movaps 8 * 
SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + mulps 12 * SIZE(BO), %xmm10 + addps %xmm9, %xmm6 + movaps 16 * SIZE(BO), %xmm9 + addps %xmm10, %xmm7 + movaps 12 * SIZE(AO), %xmm10 + + addq $ 8 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L16 + ALIGN_4 + +.L18: + shufps $0xb1, %xmm1, %xmm1 + shufps $0xb1, %xmm3, %xmm3 + shufps $0xb1, %xmm5, %xmm5 + shufps $0xb1, %xmm7, %xmm7 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 + xorps %xmm15, %xmm5 + xorps %xmm15, %xmm7 +#else + xorps %xmm15, %xmm0 + xorps %xmm15, %xmm2 + xorps %xmm15, %xmm4 + xorps %xmm15, %xmm6 +#endif +#else + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 + xorps %xmm15, %xmm5 + xorps %xmm15, %xmm7 +#endif + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + addps %xmm5, %xmm4 + addps %xmm7, %xmm6 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $4, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm1 + unpcklpd %xmm2, %xmm0 + unpckhpd %xmm2, %xmm1 + + movaps %xmm4, %xmm5 + unpcklpd %xmm6, %xmm4 + unpckhpd %xmm6, %xmm5 + + movaps 0 * SIZE(B), %xmm2 + movaps 4 * SIZE(B), %xmm3 + movaps 8 * SIZE(B), %xmm6 + movaps 12 * SIZE(B), %xmm7 + + subps %xmm0, %xmm2 + subps %xmm1, %xmm3 + subps %xmm4, %xmm6 + subps %xmm5, %xmm7 +#else + movaps 0 * SIZE(AO), %xmm1 + movaps 4 * SIZE(AO), %xmm3 + movaps 8 * SIZE(AO), %xmm5 + movaps 12 * SIZE(AO), %xmm7 + + subps %xmm0, %xmm1 + subps %xmm4, %xmm3 + subps %xmm2, %xmm5 + subps %xmm6, %xmm7 +#endif + +#ifdef LN + movaps 28 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm7 + +#ifndef CONJ + xorps %xmm15, %xmm7 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm7 + addps %xmm0, %xmm7 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm6 + subps %xmm1, %xmm6 + + movaps 24 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm3 + subps %xmm1, %xmm3 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm2 + subps %xmm1, %xmm2 + + movaps 20 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm6 + +#ifndef CONJ + xorps %xmm15, %xmm6 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm6 + addps %xmm0, %xmm6 + + movaps 16 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm3 + subps %xmm1, %xmm3 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 
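+/* The recurring SSE idiom in this solve: pshufd $0xa0 and $0xf5 duplicate   */
+/* the real and imaginary lanes of the current right-hand-side values, xorps */
+/* with POSINV (%xmm15) supplies the sign pattern for the (optionally        */
+/* conjugated) complex product, and the mulps/addps or mulps/subps pair that */
+/* follows folds the product with the packed A element into the solution.    */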
+ + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm2 + subps %xmm1, %xmm2 + + movaps 8 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm3 + addps %xmm0, %xmm3 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm2 + subps %xmm1, %xmm2 + + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm3 + subps %xmm1, %xmm3 + + movaps 4 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm6 + subps %xmm1, %xmm6 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm7 + subps %xmm1, %xmm7 + + movaps 8 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm3 + addps %xmm0, %xmm3 + + movaps 12 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm6 + subps %xmm1, %xmm6 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm7 + subps %xmm1, %xmm7 + + movaps 20 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, %xmm0 + pshufd $0xf5, %xmm6, %xmm6 + +#ifndef CONJ + xorps %xmm15, %xmm6 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm6 + addps %xmm0, %xmm6 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm6, 
%xmm0 + pshufd $0xf5, %xmm6, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm7 + subps %xmm1, %xmm7 + + movaps 28 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm7, %xmm0 + pshufd $0xf5, %xmm7, %xmm7 + +#ifndef CONJ + xorps %xmm15, %xmm7 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm7 + addps %xmm0, %xmm7 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm0 + pshufd $0xf5, %xmm1, %xmm1 + + pshufd $0xa0, %xmm3, %xmm2 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 + xorps %xmm15, %xmm2 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm9, %xmm2 + mulps %xmm10, %xmm1 + mulps %xmm10, %xmm3 + + addps %xmm0, %xmm1 + addps %xmm2, %xmm3 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm0 + pshufd $0xf5, %xmm1, %xmm2 + + pshufd $0xa0, %xmm3, %xmm4 + pshufd $0xf5, %xmm3, %xmm6 + +#ifndef CONJ + xorps %xmm15, %xmm2 + xorps %xmm15, %xmm6 +#else + xorps %xmm15, %xmm0 + xorps %xmm15, %xmm4 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm9, %xmm4 + + mulps %xmm10, %xmm2 + mulps %xmm10, %xmm6 + + subps %xmm0, %xmm5 + subps %xmm4, %xmm7 + + subps %xmm2, %xmm5 + subps %xmm6, %xmm7 + + movaps 4 * SIZE(B), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm5, %xmm4 + pshufd $0xf5, %xmm5, %xmm5 + + pshufd $0xa0, %xmm7, %xmm6 + pshufd $0xf5, %xmm7, %xmm7 + +#ifndef CONJ + xorps %xmm15, %xmm5 + xorps %xmm15, %xmm7 +#else + xorps %xmm15, %xmm4 + xorps %xmm15, %xmm6 +#endif + + mulps %xmm9, %xmm4 + mulps %xmm9, %xmm6 + mulps %xmm10, %xmm5 + mulps %xmm10, %xmm7 + + addps %xmm4, %xmm5 + addps %xmm6, %xmm7 +#endif + +#ifdef RT + movaps 4 * SIZE(B), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm5, %xmm0 + pshufd $0xf5, %xmm5, %xmm5 + + pshufd $0xa0, %xmm7, %xmm2 + pshufd $0xf5, %xmm7, %xmm7 + +#ifndef CONJ + xorps %xmm15, %xmm5 + xorps %xmm15, %xmm7 +#else + xorps %xmm15, %xmm0 + xorps %xmm15, %xmm2 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm9, %xmm2 + mulps %xmm10, %xmm5 + mulps %xmm10, %xmm7 + + addps %xmm0, %xmm5 + addps %xmm2, %xmm7 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm5, %xmm0 + pshufd $0xf5, %xmm5, %xmm2 + + pshufd $0xa0, %xmm7, %xmm4 + pshufd $0xf5, %xmm7, %xmm6 + +#ifndef CONJ + xorps %xmm15, %xmm2 + xorps %xmm15, %xmm6 +#else + xorps %xmm15, %xmm0 + xorps %xmm15, %xmm4 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm9, %xmm4 + + mulps %xmm10, %xmm2 + mulps %xmm10, %xmm6 + + subps %xmm0, %xmm1 + subps %xmm4, %xmm3 + + subps %xmm2, %xmm1 + subps %xmm6, %xmm3 + + movaps 0 * SIZE(B), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm4 + pshufd $0xf5, %xmm1, %xmm1 + + pshufd $0xa0, %xmm3, %xmm6 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm4 + xorps %xmm15, %xmm6 +#endif + + mulps %xmm9, %xmm4 + mulps %xmm9, %xmm6 + mulps %xmm10, %xmm1 + mulps %xmm10, %xmm3 + + addps %xmm4, %xmm1 + addps %xmm6, %xmm3 + +#endif + +#ifdef LN + subq $8 * SIZE, CO1 + subq $8 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, 0 * SIZE(B) + movaps %xmm3, 4 * SIZE(B) + movaps %xmm6, 8 * SIZE(B) + movaps %xmm7, 12 * SIZE(B) 
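+/* Write-back of the solved 4x2 tile for the LN/LT cases: the packed copy in */
+/* B was refreshed just above, each complex entry is re-broadcast below into */
+/* the expanded buffer BO, and the low/high halves of the registers are      */
+/* stored to the two C columns CO1 and CO2.                                  */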
+ + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm5 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm4, 8 * SIZE(BO) + movaps %xmm5, 12 * SIZE(BO) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm4 + pshufd $0xff, %xmm3, %xmm5 + + movaps %xmm0, 16 * SIZE(BO) + movaps %xmm1, 20 * SIZE(BO) + movaps %xmm4, 24 * SIZE(BO) + movaps %xmm5, 28 * SIZE(BO) + + pshufd $0x00, %xmm6, %xmm0 + pshufd $0x55, %xmm6, %xmm1 + pshufd $0xaa, %xmm6, %xmm4 + pshufd $0xff, %xmm6, %xmm5 + + movaps %xmm0, 32 * SIZE(BO) + movaps %xmm1, 36 * SIZE(BO) + movaps %xmm4, 40 * SIZE(BO) + movaps %xmm5, 44 * SIZE(BO) + + pshufd $0x00, %xmm7, %xmm0 + pshufd $0x55, %xmm7, %xmm1 + pshufd $0xaa, %xmm7, %xmm4 + pshufd $0xff, %xmm7, %xmm5 + + movaps %xmm0, 48 * SIZE(BO) + movaps %xmm1, 52 * SIZE(BO) + movaps %xmm4, 56 * SIZE(BO) + movaps %xmm5, 60 * SIZE(BO) + + movlps %xmm2, 0 * SIZE(CO1) + movlps %xmm3, 2 * SIZE(CO1) + movlps %xmm6, 4 * SIZE(CO1) + movlps %xmm7, 6 * SIZE(CO1) + + movhps %xmm2, 0 * SIZE(CO2) + movhps %xmm3, 2 * SIZE(CO2) + movhps %xmm6, 4 * SIZE(CO2) + movhps %xmm7, 6 * SIZE(CO2) +#else + movaps %xmm1, 0 * SIZE(AO) + movaps %xmm3, 4 * SIZE(AO) + movaps %xmm5, 8 * SIZE(AO) + movaps %xmm7, 12 * SIZE(AO) + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + movlps %xmm3, 4 * SIZE(CO1) + movhps %xmm3, 6 * SIZE(CO1) + + movlps %xmm5, 0 * SIZE(CO2) + movhps %xmm5, 2 * SIZE(CO2) + movlps %xmm7, 4 * SIZE(CO2) + movhps %xmm7, 6 * SIZE(CO2) +#endif + + +#ifndef LN + addq $8 * SIZE, CO1 + addq $8 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 4), AO +#ifdef LT + addq $16 * SIZE, B +#endif +#endif + +#ifdef LN + subq $4, KK + movq BORIG, B +#endif + +#ifdef LT + addq $4, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $2 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + + decq I # i -- + jg .L11 + ALIGN_4 + +.L20: + testq $2, M + je .L30 + +#ifdef LN + movq K, %rax + salq $1 + ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movaps 0 * SIZE(AO), %xmm8 + movaps 16 * SIZE(AO), %xmm10 + movaps 32 * SIZE(AO), %xmm12 + movaps 48 * SIZE(AO), %xmm14 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L25 + ALIGN_4 + +.L22: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 64 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps 4 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + mulps 28 * SIZE(BO), %xmm8 + addps %xmm11, %xmm2 + movaps 80 * SIZE(BO), %xmm11 + addps %xmm8, %xmm3 + 
movaps 8 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm13 + addps %xmm13, %xmm0 + movaps 36 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm1 + movaps 40 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + mulps 44 * SIZE(BO), %xmm8 + addps %xmm13, %xmm2 + movaps 96 * SIZE(BO), %xmm13 + addps %xmm8, %xmm3 + movaps 12 * SIZE(AO), %xmm8 + + mulps %xmm8, %xmm15 + addps %xmm15, %xmm0 + movaps 52 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm1 + movaps 56 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + mulps 60 * SIZE(BO), %xmm8 + addps %xmm15, %xmm2 + movaps 112 * SIZE(BO), %xmm15 + addps %xmm8, %xmm3 + movaps 32 * SIZE(AO), %xmm8 + +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) +#endif + mulps %xmm10, %xmm9 + addps %xmm9, %xmm0 + movaps 68 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm1 + movaps 72 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + mulps 76 * SIZE(BO), %xmm10 + addps %xmm9, %xmm2 + movaps 128 * SIZE(BO), %xmm9 + addps %xmm10, %xmm3 + movaps 20 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movaps 84 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm1 + movaps 88 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + mulps 92 * SIZE(BO), %xmm10 + addps %xmm11, %xmm2 + movaps 144 * SIZE(BO), %xmm11 + addps %xmm10, %xmm3 + movaps 24 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movaps 100 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm1 + movaps 104 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + mulps 108 * SIZE(BO), %xmm10 + addps %xmm13, %xmm2 + movaps 160 * SIZE(BO), %xmm13 + addps %xmm10, %xmm3 + movaps 28 * SIZE(AO), %xmm10 + + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movaps 116 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm1 + movaps 120 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + mulps 124 * SIZE(BO), %xmm10 + addps %xmm15, %xmm2 + movaps 176 * SIZE(BO), %xmm15 + addps %xmm10, %xmm3 + movaps 48 * SIZE(AO), %xmm10 + + addq $32 * SIZE, AO + addq $128 * SIZE, BO + + decq %rax + jne .L22 + ALIGN_4 + +.L25: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movaps POSINV, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L28 + ALIGN_4 + +.L26: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + mulps 12 * SIZE(BO), %xmm8 + addps %xmm9, %xmm2 + movaps 16 * SIZE(BO), %xmm9 + addps %xmm8, %xmm3 + movaps 4 * SIZE(AO), %xmm8 + + addq $ 4 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L26 + ALIGN_4 + +.L28: + shufps $0xb1, %xmm1, %xmm1 + shufps $0xb1, %xmm3, %xmm3 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 + xorps %xmm15, %xmm2 +#endif +#else + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 +#endif + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $2, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + movaps %xmm0, %xmm1 + unpcklpd %xmm2, %xmm0 + unpckhpd %xmm2, %xmm1 + + movaps 0 * SIZE(B), %xmm2 + movaps 4 * SIZE(B), %xmm3 + + subps %xmm0, %xmm2 + subps %xmm1, %xmm3 +#else + movaps 0 * SIZE(AO), %xmm1 + movaps 4 * SIZE(AO), %xmm5 + + subps %xmm0, 
%xmm1 + subps %xmm2, %xmm5 +#endif + + +#ifdef LN + movaps 4 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm3 + addps %xmm0, %xmm3 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm2 + subps %xmm1, %xmm2 + + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + subps %xmm0, %xmm3 + subps %xmm1, %xmm3 + + movaps 4 * SIZE(AO), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm3, %xmm0 + pshufd $0xf5, %xmm3, %xmm3 + +#ifndef CONJ + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm3 + addps %xmm0, %xmm3 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm0 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + + addps %xmm0, %xmm1 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm0 + pshufd $0xf5, %xmm1, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + + subps %xmm0, %xmm5 + subps %xmm2, %xmm5 + + movaps 4 * SIZE(B), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm5, %xmm4 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm15, %xmm5 +#else + xorps %xmm15, %xmm4 +#endif + + mulps %xmm9, %xmm4 + mulps %xmm10, %xmm5 + + addps %xmm4, %xmm5 +#endif + +#ifdef RT + movaps 4 * SIZE(B), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm5, %xmm0 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm15, %xmm5 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm5 + + addps %xmm0, %xmm5 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm5, %xmm0 + pshufd $0xf5, %xmm5, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm1 + + movaps 0 * SIZE(B), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm4 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm4 +#endif + + mulps %xmm9, %xmm4 + mulps 
%xmm10, %xmm1 + + addps %xmm4, %xmm1 +#endif + +#ifdef LN + subq $4 * SIZE, CO1 + subq $4 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, 0 * SIZE(B) + movaps %xmm3, 4 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm5 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm4, 8 * SIZE(BO) + movaps %xmm5, 12 * SIZE(BO) + + pshufd $0x00, %xmm3, %xmm0 + pshufd $0x55, %xmm3, %xmm1 + pshufd $0xaa, %xmm3, %xmm4 + pshufd $0xff, %xmm3, %xmm5 + + movaps %xmm0, 16 * SIZE(BO) + movaps %xmm1, 20 * SIZE(BO) + movaps %xmm4, 24 * SIZE(BO) + movaps %xmm5, 28 * SIZE(BO) + + movlps %xmm2, 0 * SIZE(CO1) + movlps %xmm3, 2 * SIZE(CO1) + movhps %xmm2, 0 * SIZE(CO2) + movhps %xmm3, 2 * SIZE(CO2) +#else + movaps %xmm1, 0 * SIZE(AO) + movaps %xmm5, 4 * SIZE(AO) + + movlps %xmm1, 0 * SIZE(CO1) + movhps %xmm1, 2 * SIZE(CO1) + + movlps %xmm5, 0 * SIZE(CO2) + movhps %xmm5, 2 * SIZE(CO2) +#endif + +#ifndef LN + addq $4 * SIZE, CO1 + addq $4 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 2), AO +#ifdef LT + addq $8 * SIZE, B +#endif +#endif + +#ifdef LN + subq $2, KK + movq BORIG, B +#endif + +#ifdef LT + addq $2, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $1 + ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L30: + testq $1, M + je .L39 + +#ifdef LN + movq K, %rax + salq $ZBASE_SHIFT, %rax + subq %rax, AORIG +#endif + +#if defined(LN) || defined(RT) + movq KK, %rax + movq AORIG, AO + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#endif + + leaq BUFFER, BO + +#if defined(LN) || defined(RT) + movq KK, %rax + salq $1 + ZBASE_SHIFT, %rax + leaq (BO, %rax, 4), BO +#endif + + movsd 0 * SIZE(AO), %xmm8 + movhps 2 * SIZE(AO), %xmm8 + movsd 8 * SIZE(AO), %xmm10 + movhps 10 * SIZE(AO), %xmm10 + + movaps 0 * SIZE(BO), %xmm9 + movaps 16 * SIZE(BO), %xmm11 + movaps 32 * SIZE(BO), %xmm13 + movaps 48 * SIZE(BO), %xmm15 + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + sarq $3, %rax + je .L35 + ALIGN_4 + +.L32: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 +#if defined(OPTERON) && defined(HAVE_PREFETCH) + PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) +#endif + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movaps 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movaps 64 * SIZE(BO), %xmm9 + + mulps %xmm8, %xmm11 + addps %xmm11, %xmm0 + movaps 20 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm1 + movaps 24 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + addps %xmm11, %xmm2 + movaps 28 * SIZE(BO), %xmm11 + mulps %xmm8, %xmm11 + movsd 4 * SIZE(AO), %xmm8 + addps %xmm11, %xmm3 + movaps 80 * SIZE(BO), %xmm11 + + mulps %xmm8, %xmm13 + addps %xmm13, %xmm0 + movaps 36 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm1 + movaps 40 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + addps %xmm13, %xmm2 + movaps 44 * SIZE(BO), %xmm13 + mulps %xmm8, %xmm13 + movsd 6 * SIZE(AO), %xmm8 + addps %xmm13, %xmm3 + movaps 96 * SIZE(BO), %xmm13 + + mulps %xmm8, %xmm15 + addps %xmm15, %xmm0 + movaps 52 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm1 + movaps 56 * SIZE(BO), %xmm15 + mulps %xmm8, %xmm15 + addps %xmm15, %xmm2 + movaps 60 * SIZE(BO), 
%xmm15 + mulps %xmm8, %xmm15 + movsd 16 * SIZE(AO), %xmm8 + addps %xmm15, %xmm3 + movaps 112 * SIZE(BO), %xmm15 + + mulps %xmm10, %xmm9 + addps %xmm9, %xmm0 + movaps 68 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm1 + movaps 72 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + addps %xmm9, %xmm2 + movaps 76 * SIZE(BO), %xmm9 + mulps %xmm10, %xmm9 + movsd 10 * SIZE(AO), %xmm10 + addps %xmm9, %xmm3 + movaps 128 * SIZE(BO), %xmm9 + + mulps %xmm10, %xmm11 + addps %xmm11, %xmm0 + movaps 84 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm1 + movaps 88 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + addps %xmm11, %xmm2 + movaps 92 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm11 + movsd 12 * SIZE(AO), %xmm10 + addps %xmm11, %xmm3 + movaps 144 * SIZE(BO), %xmm11 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm0 + movaps 100 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm1 + movaps 104 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + addps %xmm13, %xmm2 + movaps 108 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm13 + movsd 14 * SIZE(AO), %xmm10 + addps %xmm13, %xmm3 + movaps 160 * SIZE(BO), %xmm13 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm0 + movaps 116 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm1 + movaps 120 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + addps %xmm15, %xmm2 + movaps 124 * SIZE(BO), %xmm15 + mulps %xmm10, %xmm15 + movsd 24 * SIZE(AO), %xmm10 + addps %xmm15, %xmm3 + movaps 176 * SIZE(BO), %xmm15 + + addq $16 * SIZE, AO + addq $128 * SIZE, BO + + decq %rax + jne .L32 + ALIGN_4 + +.L35: +#if defined(LT) || defined(RN) + movq KK, %rax +#else + movq K, %rax + subq KK, %rax +#endif + movaps POSINV, %xmm15 + andq $7, %rax # if (k & 1) + BRANCH + je .L38 + ALIGN_4 + +.L36: + mulps %xmm8, %xmm9 + addps %xmm9, %xmm0 + movaps 4 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm1 + movaps 8 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + addps %xmm9, %xmm2 + movaps 12 * SIZE(BO), %xmm9 + mulps %xmm8, %xmm9 + movsd 2 * SIZE(AO), %xmm8 + addps %xmm9, %xmm3 + movaps 16 * SIZE(BO), %xmm9 + + + addq $ 2 * SIZE, AO # aoffset += 4 + addq $16 * SIZE, BO # boffset1 += 8 + decq %rax + jg .L36 + ALIGN_4 + +.L38: + shufps $0xb1, %xmm1, %xmm1 + shufps $0xb1, %xmm3, %xmm3 + +#if defined(LN) || defined(LT) +#ifndef CONJ + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 +#else + xorps %xmm15, %xmm0 + xorps %xmm15, %xmm2 +#endif +#else + xorps %xmm15, %xmm1 + xorps %xmm15, %xmm3 +#endif + + addps %xmm1, %xmm0 + addps %xmm3, %xmm2 + +#if defined(LN) || defined(RT) + movq KK, %rax +#ifdef LN + subq $1, %rax +#else + subq $2, %rax +#endif + + movq AORIG, AO + movq BORIG, B + leaq BUFFER, BO + + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), B + leaq (BO, %rax, 8), BO +#endif + +#if defined(LN) || defined(LT) + unpcklpd %xmm2, %xmm0 + + movaps 0 * SIZE(B), %xmm2 + + subps %xmm0, %xmm2 +#else +#ifdef movsd + xorps %xmm1, %xmm1 +#endif + movsd 0 * SIZE(AO), %xmm1 +#ifdef movsd + xorps %xmm5, %xmm5 +#endif + movsd 2 * SIZE(AO), %xmm5 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm5 +#endif + + +#ifdef LN + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 +#endif + +#ifdef LT + movaps 0 * SIZE(AO), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm2, %xmm0 + pshufd $0xf5, %xmm2, %xmm2 + +#ifndef CONJ + xorps 
%xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + addps %xmm0, %xmm2 +#endif + +#ifdef RN + movaps 0 * SIZE(B), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm0 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm1 + + addps %xmm0, %xmm1 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm0 + pshufd $0xf5, %xmm1, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + + subps %xmm0, %xmm5 + subps %xmm2, %xmm5 + + movaps 4 * SIZE(B), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm5, %xmm4 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm15, %xmm5 +#else + xorps %xmm15, %xmm4 +#endif + + mulps %xmm9, %xmm4 + mulps %xmm10, %xmm5 + + addps %xmm4, %xmm5 +#endif + +#ifdef RT + movaps 4 * SIZE(B), %xmm8 + + pshufd $0xee, %xmm8, %xmm9 + pshufd $0xbb, %xmm8, %xmm10 + + pshufd $0xa0, %xmm5, %xmm0 + pshufd $0xf5, %xmm5, %xmm5 + +#ifndef CONJ + xorps %xmm15, %xmm5 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm5 + + addps %xmm0, %xmm5 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm5, %xmm0 + pshufd $0xf5, %xmm5, %xmm2 + +#ifndef CONJ + xorps %xmm15, %xmm2 +#else + xorps %xmm15, %xmm0 +#endif + + mulps %xmm9, %xmm0 + mulps %xmm10, %xmm2 + + subps %xmm0, %xmm1 + subps %xmm2, %xmm1 + + movaps 0 * SIZE(B), %xmm8 + + pshufd $0x44, %xmm8, %xmm9 + pshufd $0x11, %xmm8, %xmm10 + + pshufd $0xa0, %xmm1, %xmm4 + pshufd $0xf5, %xmm1, %xmm1 + +#ifndef CONJ + xorps %xmm15, %xmm1 +#else + xorps %xmm15, %xmm4 +#endif + + mulps %xmm9, %xmm4 + mulps %xmm10, %xmm1 + + addps %xmm4, %xmm1 +#endif + +#ifdef LN + subq $2 * SIZE, CO1 + subq $2 * SIZE, CO2 +#endif + +#if defined(LN) || defined(LT) + movaps %xmm2, 0 * SIZE(B) + + pshufd $0x00, %xmm2, %xmm0 + pshufd $0x55, %xmm2, %xmm1 + pshufd $0xaa, %xmm2, %xmm4 + pshufd $0xff, %xmm2, %xmm5 + + movaps %xmm0, 0 * SIZE(BO) + movaps %xmm1, 4 * SIZE(BO) + movaps %xmm4, 8 * SIZE(BO) + movaps %xmm5, 12 * SIZE(BO) + + movlps %xmm2, 0 * SIZE(CO1) + movhps %xmm2, 0 * SIZE(CO2) +#else + movlps %xmm1, 0 * SIZE(AO) + movlps %xmm5, 2 * SIZE(AO) + + movlps %xmm1, 0 * SIZE(CO1) + movlps %xmm5, 0 * SIZE(CO2) +#endif + +#ifndef LN + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + salq $ZBASE_SHIFT, %rax + leaq (AO, %rax, 1), AO +#ifdef LT + addq $4 * SIZE, B +#endif +#endif + +#ifdef LN + subq $1, KK + movq BORIG, B +#endif + +#ifdef LT + addq $1, KK +#endif + +#ifdef RT + movq K, %rax + movq BORIG, B + salq $ZBASE_SHIFT, %rax + addq %rax, AORIG +#endif + ALIGN_4 + +.L39: +#ifdef LN + leaq (, K, SIZE), %rax + leaq (B, %rax, 2 * COMPSIZE), B +#endif + +#if defined(LT) || defined(RN) + movq K, %rax + subq KK, %rax + leaq (,%rax, SIZE), %rax + leaq (B, %rax, 2 * COMPSIZE), B +#endif + +#ifdef RN + addq $2, KK +#endif + +#ifdef RT + subq $2, KK +#endif + + decq J # j -- + jg .L01 + ALIGN_4 + +.L999: + movq %rbx, %rsp + movq 0(%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 
128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/l1param.h b/l1param.h new file mode 100644 index 0000000000..f1d223ea70 --- /dev/null +++ b/l1param.h @@ -0,0 +1,84 @@ +#if defined(CORE2) || defined(PENRYN) +#define ALIGNED_ACCESS +#endif + +#ifdef NEHALEM +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (128 * 12) +#define ALIGNED_ACCESS +#endif + +#ifdef ATHLON +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (128 * 10) +#define ALIGNED_ACCESS +#define movsd movlps +#endif + +#ifdef PENTIUM3 +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (128 * 10) +#define ALIGNED_ACCESS +#define movsd movlps +#endif + +#ifdef PENTIUM4 +#define PREFETCH prefetcht0 +#define PREFETCHSIZE (128 * 10) +#define FETCH128 +#define ALIGNED_ACCESS +#define xorps pxor +#define xorpd pxor +#endif + +#ifdef ATOM +#define ALIGNED_ACCESS +#define PREFETCH prefetcht0 +#define PREFETCHSIZE ( 64 * 12 + 32) +#endif + +#ifdef OPTERON +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (128 * 3) +#define movsd movlps +#endif + +#ifdef BARCELONA +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (128 * 5) +#define ALIGNED_ACCESS +#endif + +#ifdef SHANGHAI +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (128 * 5) +#define ALIGNED_ACCESS +#endif + +#ifdef NANO +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (128 * 4) +#define ALIGNED_ACCESS +#endif + +#define PREOFFSET 128 + + +#ifdef HAVE_SSE2 +#define PSHUFD1(A, B) pshufd A, B, B +#define PSHUFD2(A, B, C) pshufd A, B, C +#else +#define PSHUFD1(A, B) shufps A, B, B +#define PSHUFD2(A, B, C) movaps B, C; shufps A, C, C +#endif + +#define MOVDDUP1(OFFSET, BASE, REGS) movddup OFFSET(BASE), REGS + +#define MOVAPS(OFFSET, BASE, REGS) movlps REGS, OFFSET(BASE); movhps REGS, OFFSET + SIZE(BASE) + diff --git a/l2param.h b/l2param.h new file mode 100644 index 0000000000..af9d171796 --- /dev/null +++ b/l2param.h @@ -0,0 +1,165 @@ +#ifndef GEMV_PARAM_H +#define GEMV_PARAM_H + +#ifdef movsd +#undef movsd +#endif + +#undef movapd +#define movapd movaps + +#ifdef ATHLON +#define ALIGNED_ACCESS +#define MOVUPS_A movaps +#define MOVUPS_XL movaps +#define MOVUPS_XS movaps +#define MOVUPS_YL movaps +#define MOVUPS_YS movaps +#define PREFETCH prefetcht0 +#define PREFETCHSIZE 64 * 3 +#endif + +#ifdef PENTIUM4 +#define ALIGNED_ACCESS +#define MOVUPS_A movaps +#define MOVUPS_XL movaps +#define MOVUPS_XS movaps +#define MOVUPS_YL movaps +#define MOVUPS_YS movaps +#define PREFETCH prefetcht0 +#define PREFETCHSIZE 64 * 2 +#endif + +#ifdef CORE2 +#define ALIGNED_ACCESS +#define MOVUPS_A movaps +#define MOVUPS_XL movaps +#define MOVUPS_XS movaps +#define MOVUPS_YL movaps +#define MOVUPS_YS movaps +#define PREFETCH prefetcht0 +#define PREFETCHSIZE 64 * 4 +#endif + +#ifdef PENRYN +#define ALIGNED_ACCESS +#define MOVUPS_A movaps +#define MOVUPS_XL movaps +#define MOVUPS_XS movaps +#define MOVUPS_YL movaps +#define MOVUPS_YS movaps +#define PREFETCH prefetcht0 +#define PREFETCHSIZE 64 * 4 +#endif + +#ifdef NEHALEM +#define MOVUPS_A movups +#define MOVUPS_XL movups +#define MOVUPS_XS movups +#define MOVUPS_YL movups +#define MOVUPS_YS movups +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE 64 * 3 +#endif + +#ifdef OPTERON +#define 
PREFETCH prefetch +#define PREFETCHW prefetchw +#ifndef COMPLEX +#define PREFETCHSIZE 64 * 1 +#else +#define PREFETCHSIZE 64 * 1 +#endif +#define movsd movlps +#endif + +#if defined(BARCELONA) || defined(SHANGHAI) +#define ALIGNED_ACCESS +#define MOVUPS_A movaps +#define MOVUPS_XL movaps +#define MOVUPS_XS movaps +#define MOVUPS_YL movaps +#define MOVUPS_YS movaps + +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#ifndef COMPLEX +#define PREFETCHSIZE 64 * 2 +#else +#define PREFETCHSIZE 64 * 4 +#endif +#endif + +#ifdef NANO +#define ALIGNED_ACCESS +#define MOVUPS_A movaps +#define MOVUPS_XL movaps +#define MOVUPS_XS movaps +#define MOVUPS_YL movaps +#define MOVUPS_YS movaps +#define PREFETCH prefetcht0 +#ifndef COMPLEX +#define PREFETCHSIZE 64 * 1 +#else +#define PREFETCHSIZE 64 * 2 +#endif +#endif + +#ifndef PREOFFSET +#ifdef L1_DATA_LINESIZE +#define PREOFFSET (L1_DATA_LINESIZE >> 1) +#else +#define PREOFFSET 32 +#endif +#endif + +#ifndef GEMV_UNROLL +#define GEMV_UNROLL 4 +#endif + +#ifndef ZGEMV_UNROLL +#define ZGEMV_UNROLL 4 +#endif + +/* #define COPY_FORCE */ /* Always copy X or Y to the buffer */ +/* #define NOCOPY_UNALIGNED */ /* Not copy if X or Y is not aligned */ + +#ifdef MOVUPS_A +#define MOVUPS_A1(OFF, ADDR, REGS) MOVUPS_A OFF(ADDR), REGS +#define MOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) MOVUPS_A OFF(ADDR, BASE, SCALE), REGS +#else +#define MOVUPS_A1(OFF, ADDR, REGS) movsd OFF(ADDR), REGS; movhps OFF + 8(ADDR), REGS +#define MOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) movsd OFF(ADDR, BASE, SCALE), REGS; movhps OFF + 8(ADDR, BASE, SCALE), REGS +#endif + +#define MOVRPS_A1(OFF, ADDR, REGS) movsd OFF + 8(ADDR), REGS; movhps OFF(ADDR), REGS +#define MOVRPS_A2(OFF, ADDR, BASE, SCALE, REGS) movsd OFF + 8(ADDR, BASE, SCALE), REGS; movhps OFF(ADDR, BASE, SCALE), REGS + +#ifdef MOVUPS_XL +#define MOVUPS_XL1(OFF, ADDR, REGS) MOVUPS_XL OFF(ADDR), REGS +#else +#define MOVUPS_XL1(OFF, ADDR, REGS) movsd OFF(ADDR), REGS; movhps OFF + 8(ADDR), REGS +#endif + +#ifdef MOVUPS_XS +#define MOVUPS_XS1(OFF, ADDR, REGS) MOVUPS_XS REGS, OFF(ADDR) +#else +#define MOVUPS_XS1(OFF, ADDR, REGS) movsd REGS, OFF(ADDR); movhps REGS, OFF + 8(ADDR) +#endif + +#ifdef MOVUPS_YL +#define MOVUPS_YL1(OFF, ADDR, REGS) MOVUPS_YL OFF(ADDR), REGS +#else +#define MOVUPS_YL1(OFF, ADDR, REGS) movsd OFF(ADDR), REGS; movhps OFF + 8(ADDR), REGS +#endif + +#ifdef MOVUPS_YS +#define MOVUPS_YS1(OFF, ADDR, REGS) MOVUPS_YS REGS, OFF(ADDR) +#else +#define MOVUPS_YS1(OFF, ADDR, REGS) movsd REGS, OFF(ADDR); movhps REGS, OFF + 8(ADDR) +#endif + + + +#endif diff --git a/lapack/Makefile b/lapack/Makefile new file mode 100644 index 0000000000..215badb748 --- /dev/null +++ b/lapack/Makefile @@ -0,0 +1,40 @@ +TOPDIR = .. 
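+# Recursive build driver for the optimized LAPACK-level routines: libs and
+# prof descend into every SUBDIRS entry, flame rebuilds only the FLAMEDIRS
+# subset, clean also sweeps tpp, and the hpl/hpl_p targets are no-ops here.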
+include ../Makefile.system + +SUBDIRS = laswp getf2 getrf potf2 potrf lauu2 lauum trti2 trtri getrs + +FLAMEDIRS = laswp getf2 potf2 lauu2 trti2 + +libs: + @for d in $(SUBDIRS) ; \ + do if test -d $$d; then \ + $(MAKE) -C $$d $(@F) || exit 1 ; \ + fi; \ + done + +prof: + @for d in $(SUBDIRS) ; \ + do if test -d $$d; then \ + $(MAKE) -C $$d $(@F) || exit 1 ; \ + (cd $$d; $(MAKE) prof) ; \ + fi; \ + done + +flame: + @for d in $(FLAMEDIRS) ; \ + do if test -d $$d; then \ + $(MAKE) -C $$d libs || exit 1 ; \ + fi; \ + done + +hpl: + +hpl_p: + +clean :: + @for d in $(SUBDIRS) tpp ; \ + do if test -d $$d; then \ + $(MAKE) -C $$d $(@F) || exit 1 ; \ + fi; \ + done + diff --git a/lapack/getf2/Makefile b/lapack/getf2/Makefile new file mode 100644 index 0000000000..612c6f9ccd --- /dev/null +++ b/lapack/getf2/Makefile @@ -0,0 +1,49 @@ +TOPDIR = ../.. +include ../../Makefile.system + +SBLASOBJS = sgetf2_k.$(SUFFIX) +DBLASOBJS = dgetf2_k.$(SUFFIX) +QBLASOBJS = qgetf2_k.$(SUFFIX) +CBLASOBJS = cgetf2_k.$(SUFFIX) +ZBLASOBJS = zgetf2_k.$(SUFFIX) +XBLASOBJS = xgetf2_k.$(SUFFIX) + +sgetf2_k.$(SUFFIX) : getf2_k.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) + +dgetf2_k.$(SUFFIX) : getf2_k.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) + +qgetf2_k.$(SUFFIX) : getf2_k.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) + +cgetf2_k.$(SUFFIX) : zgetf2_k.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) + +zgetf2_k.$(SUFFIX) : zgetf2_k.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) + +xgetf2_k.$(SUFFIX) : zgetf2_k.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) + +sgetf2_k.$(PSUFFIX) : getf2_k.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) + +dgetf2_k.$(PSUFFIX) : getf2_k.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) + +qgetf2_k.$(PSUFFIX) : getf2_k.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) + +cgetf2_k.$(PSUFFIX) : zgetf2_k.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) + +zgetf2_k.$(PSUFFIX) : zgetf2_k.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) + +xgetf2_k.$(PSUFFIX) : zgetf2_k.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) + +include ../../Makefile.tail + + diff --git a/lapack/getf2/getf2_k.c b/lapack/getf2/getf2_k.c new file mode 100644 index 0000000000..fdc4eaef98 --- /dev/null +++ b/lapack/getf2/getf2_k.c @@ -0,0 +1,117 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +static FLOAT dp1 = 1.; +static FLOAT dm1 = -1.; + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG m, n, lda; + blasint *ipiv, offset; + FLOAT *a; + + FLOAT temp1, temp2; + blasint i, j; + blasint ip, jp; + blasint info; + BLASLONG len; + FLOAT *b; + + m = args -> m; + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + ipiv = (blasint *)args -> c; + offset = 0; + + if (range_n) { + m -= range_n[0]; + n = range_n[1] - range_n[0]; + offset = range_n[0]; + a += range_n[0] * (lda + 1) * COMPSIZE; + } + + info = 0; + b = a; + + for (j = 0; j < n; j++) { + + len = MIN(j, m); + + for (i = 0; i < len; i++) { + ip = ipiv[i + offset] - 1 - offset; + if (ip != i) { + temp1 = *(b + i); + temp2 = *(b + ip); + *(b + i) = temp2; + *(b + ip) = temp1; + } + } + + for (i = 1; i < len; i++) { + b[i] -= DOTU_K(i, a + i, lda, b, 1); + } + + if (j < m) { + GEMV_N(m - j, j, 0, dm1, a + j, lda, b, 1, b + j, 1, sb); + + jp = j + IAMAX_K(m - j, b + j, 1); + ipiv[j + offset] = jp + offset; + jp--; + temp1 = *(b + jp); + + if (temp1 != ZERO) { + temp1 = dp1 / temp1; + + if (jp != j) { + SWAP_K(j + 1, 0, 0, ZERO, a + j, lda, a + jp, lda, NULL, 0); + } + if (j + 1 < m) { + SCAL_K(m - j - 1, 0, 0, temp1, b + j + 1, 1, NULL, 0, NULL, 0); + } + } else { + if (!info) info = j + 1; + } + } + b += lda; + } + return info; +} diff --git a/lapack/getf2/zgetf2_k.c b/lapack/getf2/zgetf2_k.c new file mode 100644 index 0000000000..ae8c6fd608 --- /dev/null +++ b/lapack/getf2/zgetf2_k.c @@ -0,0 +1,139 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +double fabs(double); + +static FLOAT dp1 = 1.; +static FLOAT dm1 = -1.; + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG m, n, lda, offset; + blasint *ipiv; + FLOAT *a; + + FLOAT temp1, temp2, temp3, temp4, ratio, den; + blasint i, j; + blasint ip, jp; + blasint info; + BLASLONG len; + FLOAT *b; + + m = args -> m; + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + ipiv = (blasint *)args -> c; + offset = 0; + + if (range_n) { + m -= range_n[0]; + n = range_n[1] - range_n[0]; + offset = range_n[0]; + a += range_n[0] * (lda + 1) * COMPSIZE; + } + + info = 0; + b = a; + + for (j = 0; j < n; j++) { + + len = MIN(j, m); + + for (i = 0; i < len; i++) { + ip = ipiv[i + offset] - 1 - offset; + if (ip != i) { + temp1 = *(b + i * 2 + 0); + temp2 = *(b + i * 2 + 1); + temp3 = *(b + ip * 2 + 0); + temp4 = *(b + ip * 2 + 1); + *(b + i * 2 + 0) = temp3; + *(b + i * 2 + 1) = temp4; + *(b + ip * 2 + 0) = temp1; + *(b + ip * 2 + 1) = temp2; + } + } + + ZTRSV_NLU(len, a, lda, b, 1, sb); + + if (j < m) { + GEMV_N(m - j, j, 0, dm1, ZERO, a + j * 2, lda, b, 1, b + j * 2, 1, sb); + + jp = j + IAMAX_K(m - j, b + j * 2, 1); + ipiv[j + offset] = jp + offset; + jp--; + + temp1 = *(b + jp * 2 + 0); + temp2 = *(b + jp * 2 + 1); + + if ((temp1 != ZERO) || (temp2 != ZERO)) { + + if (jp != j) { + SWAP_K(j + 1, 0, 0, ZERO, ZERO, a + j * 2, lda, + a + jp * 2, lda, NULL, 0); + } + + if (fabs(temp1) >= fabs(temp2)){ + ratio = temp2 / temp1; + den = dp1 /(temp1 * ( 1 + ratio * ratio)); + temp3 = den; + temp4 = -ratio * den; + } else { + ratio = temp1 / temp2; + den = dp1 /(temp2 * ( 1 + ratio * ratio)); + temp3 = ratio * den; + temp4 = -den; + } + + if (j + 1 < m) { + SCAL_K(m - j - 1, 0, 0, temp3, temp4, + b + (j + 1) * 2, 1, NULL, 0, NULL, 0); + } + } else { + if (!info) info = j + 1; + } + } + b += lda * 2; + } + return info; + +} + diff --git a/lapack/getrf/Makefile b/lapack/getrf/Makefile new file mode 100644 index 0000000000..a559dfb0d6 --- /dev/null +++ b/lapack/getrf/Makefile @@ -0,0 +1,98 @@ +TOPDIR = ../.. 
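+# Every precision gets a getrf_single object; when SMP is set the parallel
+# objects are built as well, with GETRF_SRC selecting getrf_parallel_omp.c
+# under USE_OPENMP=1 and the plain pthreads getrf_parallel.c otherwise.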
+include ../../Makefile.system + +SBLASOBJS = sgetrf_single.$(SUFFIX) +DBLASOBJS = dgetrf_single.$(SUFFIX) +QBLASOBJS = qgetrf_single.$(SUFFIX) +CBLASOBJS = cgetrf_single.$(SUFFIX) +ZBLASOBJS = zgetrf_single.$(SUFFIX) +XBLASOBJS = xgetrf_single.$(SUFFIX) + +ifdef SMP +SBLASOBJS += sgetrf_parallel.$(SUFFIX) +DBLASOBJS += dgetrf_parallel.$(SUFFIX) +QBLASOBJS += qgetrf_parallel.$(SUFFIX) +CBLASOBJS += cgetrf_parallel.$(SUFFIX) +ZBLASOBJS += zgetrf_parallel.$(SUFFIX) +XBLASOBJS += xgetrf_parallel.$(SUFFIX) +endif + +ifeq ($(USE_OPENMP), 1) +GETRF_SRC = getrf_parallel_omp.c +else +GETRF_SRC = getrf_parallel.c +endif + +sgetrf_single.$(SUFFIX) : getrf_single.c + $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -DUNIT $< -o $(@F) + +dgetrf_single.$(SUFFIX) : getrf_single.c + $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -DUNIT $< -o $(@F) + +qgetrf_single.$(SUFFIX) : getrf_single.c + $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -DUNIT $< -o $(@F) + +cgetrf_single.$(SUFFIX) : getrf_single.c + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DUNIT $< -o $(@F) + +zgetrf_single.$(SUFFIX) : getrf_single.c + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DUNIT $< -o $(@F) + +xgetrf_single.$(SUFFIX) : getrf_single.c + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DUNIT $< -o $(@F) + +sgetrf_parallel.$(SUFFIX) : $(GETRF_SRC) ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -DUNIT $< -o $(@F) + +dgetrf_parallel.$(SUFFIX) : $(GETRF_SRC) ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -DUNIT $< -o $(@F) + +qgetrf_parallel.$(SUFFIX) : $(GETRF_SRC) ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -DUNIT $< -o $(@F) + +cgetrf_parallel.$(SUFFIX) : $(GETRF_SRC) ../../param.h + $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DUNIT $< -o $(@F) + +zgetrf_parallel.$(SUFFIX) : $(GETRF_SRC) ../../param.h + $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DUNIT $< -o $(@F) + +xgetrf_parallel.$(SUFFIX) : $(GETRF_SRC) ../../param.h + $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DUNIT $< -o $(@F) + +sgetrf_single.$(PSUFFIX) : getrf_single.c + $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -DUNIT $< -o $(@F) + +dgetrf_single.$(PSUFFIX) : getrf_single.c + $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -DUNIT $< -o $(@F) + +qgetrf_single.$(PSUFFIX) : getrf_single.c + $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -DUNIT $< -o $(@F) + +cgetrf_single.$(PSUFFIX) : getrf_single.c + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DUNIT $< -o $(@F) + +zgetrf_single.$(PSUFFIX) : getrf_single.c + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DUNIT $< -o $(@F) + +xgetrf_single.$(PSUFFIX) : getrf_single.c + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DUNIT $< -o $(@F) + +sgetrf_parallel.$(PSUFFIX) : $(GETRF_SRC) + $(CC) -c $(PFLAGS) -UDOUBLE -UCOMPLEX -DUNIT $< -o $(@F) + +dgetrf_parallel.$(PSUFFIX) : $(GETRF_SRC) + $(CC) -c $(PFLAGS) -DDOUBLE -UCOMPLEX -DUNIT $< -o $(@F) + +qgetrf_parallel.$(PSUFFIX) : $(GETRF_SRC) + $(CC) -c $(PFLAGS) -DXDOUBLE -UCOMPLEX -DUNIT $< -o $(@F) + +cgetrf_parallel.$(PSUFFIX) : $(GETRF_SRC) + $(CC) -c $(PFLAGS) -UDOUBLE -DCOMPLEX -DUNIT $< -o $(@F) + +zgetrf_parallel.$(PSUFFIX) : $(GETRF_SRC) + $(CC) -c $(PFLAGS) -DDOUBLE -DCOMPLEX -DUNIT $< -o $(@F) + +xgetrf_parallel.$(PSUFFIX) : $(GETRF_SRC) + $(CC) -c $(PFLAGS) -DXDOUBLE -DCOMPLEX -DUNIT $< -o $(@F) + +include ../../Makefile.tail diff --git a/lapack/getrf/getrf_parallel.c b/lapack/getrf/getrf_parallel.c new file mode 100644 index 0000000000..0db93da921 --- /dev/null +++ b/lapack/getrf/getrf_parallel.c @@ -0,0 +1,857 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. 
*/
+/* All rights reserved. */
+/* */
+/* Redistribution and use in source and binary forms, with or */
+/* without modification, are permitted provided that the following */
+/* conditions are met: */
+/* */
+/* 1. Redistributions of source code must retain the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer. */
+/* */
+/* 2. Redistributions in binary form must reproduce the above */
+/* copyright notice, this list of conditions and the following */
+/* disclaimer in the documentation and/or other materials */
+/* provided with the distribution. */
+/* */
+/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
+/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
+/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
+/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
+/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
+/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
+/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
+/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
+/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
+/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
+/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
+/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
+/* POSSIBILITY OF SUCH DAMAGE. */
+/* */
+/* The views and conclusions contained in the software and */
+/* documentation are those of the authors and should not be */
+/* interpreted as representing official policies, either expressed */
+/* or implied, of The University of Texas at Austin. */
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+static FLOAT dm1 = -1.;
+
+double sqrt(double);
+
+#ifndef CACHE_LINE_SIZE
+#define CACHE_LINE_SIZE 8
+#endif
+
+#ifndef DIVIDE_RATE
+#define DIVIDE_RATE 2
+#endif
+
+#define GEMM_PQ MAX(GEMM_P, GEMM_Q)
+#define REAL_GEMM_R (GEMM_R - GEMM_PQ)
+
+#ifndef GETRF_FACTOR
+#define GETRF_FACTOR 0.75
+#endif
+
+#undef GETRF_FACTOR
+#define GETRF_FACTOR 1.00
+
+static inline long FORMULA1(long M, long N, long IS, long BK, long T) {
+
+  double m = (double)(M - IS - BK);
+  double n = (double)(N - IS - BK);
+  double b = (double)BK;
+  double a = (double)T;
+
+  return (long)((n + GETRF_FACTOR * m * b * (1. - a) / (b + m)) / a);
+
+}
+
+#define FORMULA2(M, N, IS, BK, T) (BLASLONG)((double)(N - IS + BK) * (1. - sqrt(1. - 1.
/ (double)(T)))) + + +static void inner_basic_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ + + BLASLONG is, min_i; + BLASLONG js, min_j; + BLASLONG jjs, min_jj; + + BLASLONG m = args -> m; + BLASLONG n = args -> n; + BLASLONG k = args -> k; + + BLASLONG lda = args -> lda; + BLASLONG off = args -> ldb; + + FLOAT *b = (FLOAT *)args -> b + (k ) * COMPSIZE; + FLOAT *c = (FLOAT *)args -> b + ( k * lda) * COMPSIZE; + FLOAT *d = (FLOAT *)args -> b + (k + k * lda) * COMPSIZE; + FLOAT *sbb = sb; + + volatile BLASLONG *flag = (volatile BLASLONG *)args -> d; + + blasint *ipiv = (blasint *)args -> c; + + if (range_n) { + n = range_n[1] - range_n[0]; + c += range_n[0] * lda * COMPSIZE; + d += range_n[0] * lda * COMPSIZE; + } + + if (args -> a == NULL) { + TRSM_ILTCOPY(k, k, (FLOAT *)args -> b, lda, 0, sb); + sbb = (FLOAT *)((((long)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); + } else { + sb = (FLOAT *)args -> a; + } + + for (js = 0; js < n; js += REAL_GEMM_R) { + min_j = n - js; + if (min_j > REAL_GEMM_R) min_j = REAL_GEMM_R; + + for (jjs = js; jjs < js + min_j; jjs += GEMM_UNROLL_N){ + min_jj = js + min_j - jjs; + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + + if (GEMM_UNROLL_N <= 8) { + + LASWP_NCOPY(min_jj, off + 1, off + k, + c + (- off + jjs * lda) * COMPSIZE, lda, + ipiv, sbb + k * (jjs - js) * COMPSIZE); + + } else { + + LASWP_PLUS(min_jj, off + 1, off + k, ZERO, +#ifdef COMPLEX + ZERO, +#endif + c + (- off + jjs * lda) * COMPSIZE, lda, NULL, 0, ipiv, 1); + + GEMM_ONCOPY (k, min_jj, c + jjs * lda * COMPSIZE, lda, sbb + (jjs - js) * k * COMPSIZE); + + } + + for (is = 0; is < k; is += GEMM_P) { + min_i = k - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + TRSM_KERNEL_LT(min_i, min_jj, k, dm1, +#ifdef COMPLEX + ZERO, +#endif + sb + k * is * COMPSIZE, + sbb + (jjs - js) * k * COMPSIZE, + c + (is + jjs * lda) * COMPSIZE, lda, is); + } + } + + if ((js + REAL_GEMM_R >= n) && (mypos >= 0)) flag[mypos * CACHE_LINE_SIZE] = 0; + + for (is = 0; is < m; is += GEMM_P){ + min_i = m - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + GEMM_ITCOPY (k, min_i, b + is * COMPSIZE, lda, sa); + + GEMM_KERNEL_N(min_i, min_j, k, dm1, +#ifdef COMPLEX + ZERO, +#endif + sa, sbb, d + (is + js * lda) * COMPSIZE, lda); + } + } +} + + +/* Non blocking implementation */ + +typedef struct { + volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; +} job_t; + +#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); +#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); + +#ifndef COMPLEX +#define KERNEL_OPERATION(M, N, K, SA, SB, C, LDC, X, Y) \ + GEMM_KERNEL_N(M, N, K, dm1, SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC) +#else +#define KERNEL_OPERATION(M, N, K, SA, SB, C, LDC, X, Y) \ + GEMM_KERNEL_N(M, N, K, dm1, ZERO, SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC) +#endif + +static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ + + job_t *job = (job_t *)args -> common; + + BLASLONG xxx, bufferside; + + FLOAT *buffer[DIVIDE_RATE]; + + BLASLONG jjs, min_jj, div_n; + + BLASLONG i, current; + BLASLONG is, min_i; + + BLASLONG m, n_from, n_to; + BLASLONG k = args -> k; + + BLASLONG lda = args -> lda; + BLASLONG off = args -> ldb; + + FLOAT *a = (FLOAT *)args -> b + (k ) * COMPSIZE; + 
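+  /* a is the column panel below the k x k diagonal block at args -> b; the  */
+  /* b and c pointers that follow are the row panel to its right and the     */
+  /* trailing submatrix updated by the GEMM calls, respectively.             */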
FLOAT *b = (FLOAT *)args -> b + ( k * lda) * COMPSIZE; + FLOAT *c = (FLOAT *)args -> b + (k + k * lda) * COMPSIZE; + FLOAT *sbb= sb; + + blasint *ipiv = (blasint *)args -> c; + + volatile BLASLONG *flag = (volatile BLASLONG *)args -> d; + + if (args -> a == NULL) { + TRSM_ILTCOPY(k, k, (FLOAT *)args -> b, lda, 0, sb); + sbb = (FLOAT *)((((long)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); + } else { + sb = (FLOAT *)args -> a; + } + + m = range_m[1] - range_m[0]; + n_from = range_n[mypos + 0]; + n_to = range_n[mypos + 1]; + + a += range_m[0] * COMPSIZE; + c += range_m[0] * COMPSIZE; + + div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE; + + buffer[0] = sbb; + + + for (i = 1; i < DIVIDE_RATE; i++) { + buffer[i] = buffer[i - 1] + GEMM_Q * ((div_n + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1)) * COMPSIZE; + } + + for (xxx = n_from, bufferside = 0; xxx < n_to; xxx += div_n, bufferside ++) { + + for (i = 0; i < args -> nthreads; i++) + while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {}; + + for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){ + min_jj = MIN(n_to, xxx + div_n) - jjs; + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + + if (GEMM_UNROLL_N <= 8) { + + LASWP_NCOPY(min_jj, off + 1, off + k, + b + (- off + jjs * lda) * COMPSIZE, lda, + ipiv, buffer[bufferside] + (jjs - xxx) * k * COMPSIZE); + + } else { + + LASWP_PLUS(min_jj, off + 1, off + k, ZERO, +#ifdef COMPLEX + ZERO, +#endif + b + (- off + jjs * lda) * COMPSIZE, lda, NULL, 0, ipiv, 1); + + GEMM_ONCOPY (k, min_jj, b + jjs * lda * COMPSIZE, lda, + buffer[bufferside] + (jjs - xxx) * k * COMPSIZE); + } + + for (is = 0; is < k; is += GEMM_P) { + min_i = k - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + TRSM_KERNEL_LT(min_i, min_jj, k, dm1, +#ifdef COMPLEX + ZERO, +#endif + sb + k * is * COMPSIZE, + buffer[bufferside] + (jjs - xxx) * k * COMPSIZE, + b + (is + jjs * lda) * COMPSIZE, lda, is); + } + } + + for (i = 0; i < args -> nthreads; i++) + job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; + + } + + flag[mypos * CACHE_LINE_SIZE] = 0; + + if (m == 0) { + for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { + job[mypos].working[mypos][CACHE_LINE_SIZE * xxx] = 0; + } + } + + for(is = 0; is < m; is += min_i){ + min_i = m - is; + if (min_i >= GEMM_P * 2) { + min_i = GEMM_P; + } else + if (min_i > GEMM_P) { + min_i = ((min_i + 1) / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1); + } + + ICOPY_OPERATION(k, min_i, a, lda, 0, is, sa); + + current = mypos; + + do { + + div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE; + + for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { + + if ((current != mypos) && (!is)) { + while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {}; + } + + KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, + sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], + c, lda, is, xxx); + + if (is + min_i >= m) { + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; + } + } + + current ++; + if (current >= args -> nthreads) current = 0; + + } while (current != mypos); + } + + for (i = 0; i < args -> nthreads; i++) { + for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { + while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {}; + } + } + + return 0; +} + +#if 1 + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG m, n, mn, lda, offset; + BLASLONG 
init_bk, next_bk, range_n_mine[2], range_n_new[2]; + blasint *ipiv, iinfo, info; + int mode; + blas_arg_t newarg; + + FLOAT *a, *sbb; + FLOAT dummyalpha[2] = {ZERO, ZERO}; + + blas_queue_t queue[MAX_CPU_NUMBER]; + + BLASLONG range_M[MAX_CPU_NUMBER + 1]; + BLASLONG range_N[MAX_CPU_NUMBER + 1]; + + job_t job[MAX_CPU_NUMBER]; + + BLASLONG width, nn, mm; + BLASLONG i, j, k, is, bk; + + BLASLONG num_cpu; + + volatile BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128))); + +#ifndef COMPLEX +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_REAL; +#else + mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif + + m = args -> m; + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + ipiv = (blasint *)args -> c; + offset = 0; + + if (range_n) { + m -= range_n[0]; + n = range_n[1] - range_n[0]; + offset = range_n[0]; + a += range_n[0] * (lda + 1) * COMPSIZE; + } + + if (m <= 0 || n <= 0) return 0; + + newarg.c = ipiv; + newarg.lda = lda; + newarg.common = (void *)job; + + info = 0; + + mn = MIN(m, n); + + init_bk = (mn / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); + if (init_bk > GEMM_Q) init_bk = GEMM_Q; + + if (init_bk <= GEMM_UNROLL_N) { + info = GETF2(args, NULL, range_n, sa, sb, 0); + return info; + } + + next_bk = init_bk; + + bk = mn; + if (bk > next_bk) bk = next_bk; + + range_n_new[0] = offset; + range_n_new[1] = offset + bk; + + iinfo = CNAME(args, NULL, range_n_new, sa, sb, 0); + + if (iinfo && !info) info = iinfo; + + TRSM_ILTCOPY(bk, bk, a, lda, 0, sb); + + sbb = (FLOAT *)((((long)(sb + bk * bk * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); + + is = 0; + num_cpu = 0; + + while (is < mn) { + + width = (FORMULA1(m, n, is, bk, args -> nthreads) + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); + if (width > mn - is - bk) width = mn - is - bk; + + if (width < bk) { + next_bk = (FORMULA2(m, n, is, bk, args -> nthreads) + GEMM_UNROLL_N) & ~(GEMM_UNROLL_N - 1); + + if (next_bk > bk) next_bk = bk; + + width = next_bk; + if (width > mn - is - bk) width = mn - is - bk; + } + + if (num_cpu > 0) exec_blas_async_wait(num_cpu, &queue[0]); + + mm = m - bk - is; + nn = n - bk - is; + + newarg.a = sb; + newarg.b = a + (is + is * lda) * COMPSIZE; + newarg.d = (void *)flag; + newarg.m = mm; + newarg.n = nn; + newarg.k = bk; + newarg.ldb = is + offset; + + nn -= width; + + range_n_mine[0] = 0; + range_n_mine[1] = width; + + range_N[0] = width; + range_M[0] = 0; + + num_cpu = 0; + + while (nn > 0){ + + if (mm >= nn) { + + width = blas_quickdivide(nn + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1); + if (nn < width) width = nn; + nn -= width; + range_N[num_cpu + 1] = range_N[num_cpu] + width; + + width = blas_quickdivide(mm + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1); + if (mm < width) width = mm; + if (nn <= 0) width = mm; + mm -= width; + range_M[num_cpu + 1] = range_M[num_cpu] + width; + + } else { + + width = blas_quickdivide(mm + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1); + if (mm < width) width = mm; + mm -= width; + range_M[num_cpu + 1] = range_M[num_cpu] + width; + + width = blas_quickdivide(nn + args -> nthreads - num_cpu, args -> nthreads - num_cpu - 1); + if (nn < width) width = nn; + if (mm <= 0) width = nn; + nn -= width; + range_N[num_cpu + 1] = range_N[num_cpu] + width; + + } + + 
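+
+      /*
+       * Hand the next slice of the trailing matrix to worker num_cpu.
+       * range_M/range_N were just extended with blas_quickdivide so
+       * that the remaining mm x nn block is carved up among the
+       * threads still unassigned, trimming columns or rows first
+       * depending on which dimension is larger.  Each worker is given
+       * the whole range_N[] table (needed for the panel hand-off in
+       * inner_advanced_thread) but only its own row range, and
+       * flag[num_cpu * CACHE_LINE_SIZE] serves as its completion flag.
+       */
+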
queue[num_cpu].mode = mode; + queue[num_cpu].routine = inner_advanced_thread; + queue[num_cpu].args = &newarg; + queue[num_cpu].range_m = &range_M[num_cpu]; + queue[num_cpu].range_n = &range_N[0]; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + flag[num_cpu * CACHE_LINE_SIZE] = 1; + + num_cpu ++; + + } + + newarg.nthreads = num_cpu; + + if (num_cpu > 0) { + for (j = 0; j < num_cpu; j++) { + for (i = 0; i < num_cpu; i++) { + for (k = 0; k < DIVIDE_RATE; k++) { + job[j].working[i][CACHE_LINE_SIZE * k] = 0; + } + } + } + } + + is += bk; + + bk = mn - is; + if (bk > next_bk) bk = next_bk; + + range_n_new[0] = offset + is; + range_n_new[1] = offset + is + bk; + + if (num_cpu > 0) { + + queue[num_cpu - 1].next = NULL; + + exec_blas_async(0, &queue[0]); + + inner_basic_thread(&newarg, NULL, range_n_mine, sa, sbb, -1); + + iinfo = GETRF_SINGLE(args, NULL, range_n_new, sa, sbb, 0); + + if (iinfo && !info) info = iinfo + is; + + for (i = 0; i < num_cpu; i ++) while (flag[i * CACHE_LINE_SIZE]) {}; + + TRSM_ILTCOPY(bk, bk, a + (is + is * lda) * COMPSIZE, lda, 0, sb); + + } else { + + inner_basic_thread(&newarg, NULL, range_n_mine, sa, sbb, -1); + + iinfo = GETRF_SINGLE(args, NULL, range_n_new, sa, sbb, 0); + + if (iinfo && !info) info = iinfo + is; + + } + + } + + next_bk = init_bk; + is = 0; + + while (is < mn) { + + bk = mn - is; + if (bk > next_bk) bk = next_bk; + + width = (FORMULA1(m, n, is, bk, args -> nthreads) + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); + if (width > mn - is - bk) width = mn - is - bk; + + if (width < bk) { + next_bk = (FORMULA2(m, n, is, bk, args -> nthreads) + GEMM_UNROLL_N) & ~(GEMM_UNROLL_N - 1); + if (next_bk > bk) next_bk = bk; + } + + blas_level1_thread(mode, bk, is + bk + offset + 1, mn + offset, (void *)dummyalpha, + a + (- offset + is * lda) * COMPSIZE, lda, NULL, 0, + ipiv, 1, (void *)LASWP_PLUS, args -> nthreads); + + is += bk; + } + + return info; +} + +#else + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG m, n, mn, lda, offset; + BLASLONG i, is, bk, init_bk, next_bk, range_n_new[2]; + blasint *ipiv, iinfo, info; + int mode; + blas_arg_t newarg; + FLOAT *a, *sbb; + FLOAT dummyalpha[2] = {ZERO, ZERO}; + + blas_queue_t queue[MAX_CPU_NUMBER]; + BLASLONG range[MAX_CPU_NUMBER + 1]; + + BLASLONG width, nn, num_cpu; + + volatile BLASLONG flag[MAX_CPU_NUMBER * CACHE_LINE_SIZE] __attribute__((aligned(128))); + +#ifndef COMPLEX +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_REAL; +#else + mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif + + m = args -> m; + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + ipiv = (blasint *)args -> c; + offset = 0; + + if (range_n) { + m -= range_n[0]; + n = range_n[1] - range_n[0]; + offset = range_n[0]; + a += range_n[0] * (lda + 1) * COMPSIZE; + } + + if (m <= 0 || n <= 0) return 0; + + newarg.c = ipiv; + newarg.lda = lda; + newarg.common = NULL; + newarg.nthreads = args -> nthreads; + + mn = MIN(m, n); + + init_bk = (mn / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); + if (init_bk > GEMM_Q) init_bk = GEMM_Q; + + if (init_bk <= GEMM_UNROLL_N) { + info = GETF2(args, NULL, range_n, sa, sb, 0); + return info; + } + + width = FORMULA1(m, n, 0, init_bk, args -> nthreads); + 
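+
+  /*
+   * FORMULA1/FORMULA2 are the look-ahead heuristics defined near the
+   * top of this file: FORMULA1 estimates how many trailing columns the
+   * helper threads should take while the master factors the next
+   * panel, and when that estimate drops below the current panel width
+   * bk, FORMULA2 shrinks the next panel instead, roughly
+   *
+   *     next_bk ~ (N - IS + BK) * (1 - sqrt(1 - 1/T))
+   *
+   * for T threads.  Their derivation is not documented in the source,
+   * so treat them as tuning heuristics; both results are rounded to a
+   * multiple of GEMM_UNROLL_N before use.
+   */
+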
width = (width + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); + if (width > n - init_bk) width = n - init_bk; + + if (width < init_bk) { + long temp; + + temp = FORMULA2(m, n, 0, init_bk, args -> nthreads); + temp = (temp + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); + + if (temp < GEMM_UNROLL_N) temp = GEMM_UNROLL_N; + if (temp < init_bk) init_bk = temp; + + } + + next_bk = init_bk; + bk = init_bk; + + range_n_new[0] = offset; + range_n_new[1] = offset + bk; + + info = CNAME(args, NULL, range_n_new, sa, sb, 0); + + TRSM_ILTCOPY(bk, bk, a, lda, 0, sb); + + is = 0; + num_cpu = 0; + + sbb = (FLOAT *)((((long)(sb + GEMM_PQ * GEMM_PQ * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); + + while (is < mn) { + + width = FORMULA1(m, n, is, bk, args -> nthreads); + width = (width + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); + + if (width < bk) { + + next_bk = FORMULA2(m, n, is, bk, args -> nthreads); + next_bk = (next_bk + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); + + if (next_bk > bk) next_bk = bk; +#if 0 + if (next_bk < GEMM_UNROLL_N) next_bk = MIN(GEMM_UNROLL_N, mn - bk - is); +#else + if (next_bk < GEMM_UNROLL_N) next_bk = MAX(GEMM_UNROLL_N, mn - bk - is); +#endif + + width = next_bk; + } + + if (width > mn - is - bk) { + next_bk = mn - is - bk; + width = next_bk; + } + + nn = n - bk - is; + if (width > nn) width = nn; + + if (num_cpu > 1) exec_blas_async_wait(num_cpu - 1, &queue[1]); + + range[0] = 0; + range[1] = width; + + num_cpu = 1; + nn -= width; + + newarg.a = sb; + newarg.b = a + (is + is * lda) * COMPSIZE; + newarg.d = (void *)flag; + newarg.m = m - bk - is; + newarg.n = n - bk - is; + newarg.k = bk; + newarg.ldb = is + offset; + + while (nn > 0){ + + width = blas_quickdivide(nn + args -> nthreads - num_cpu, args -> nthreads - num_cpu); + + nn -= width; + if (nn < 0) width = width + nn; + + range[num_cpu + 1] = range[num_cpu] + width; + + queue[num_cpu].mode = mode; + //queue[num_cpu].routine = inner_advanced_thread; + queue[num_cpu].routine = (void *)inner_basic_thread; + queue[num_cpu].args = &newarg; + queue[num_cpu].range_m = NULL; + queue[num_cpu].range_n = &range[num_cpu]; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + flag[num_cpu * CACHE_LINE_SIZE] = 1; + + num_cpu ++; + } + + queue[num_cpu - 1].next = NULL; + + is += bk; + + bk = n - is; + if (bk > next_bk) bk = next_bk; + + range_n_new[0] = offset + is; + range_n_new[1] = offset + is + bk; + + if (num_cpu > 1) { + + exec_blas_async(1, &queue[1]); + +#if 0 + inner_basic_thread(&newarg, NULL, &range[0], sa, sbb, 0); + + iinfo = GETRF_SINGLE(args, NULL, range_n_new, sa, sbb, 0); +#else + + if (range[1] >= bk * 4) { + + BLASLONG myrange[2]; + + myrange[0] = 0; + myrange[1] = bk; + + inner_basic_thread(&newarg, NULL, &myrange[0], sa, sbb, -1); + + iinfo = GETRF_SINGLE(args, NULL, range_n_new, sa, sbb, 0); + + myrange[0] = bk; + myrange[1] = range[1]; + + inner_basic_thread(&newarg, NULL, &myrange[0], sa, sbb, -1); + + } else { + + inner_basic_thread(&newarg, NULL, &range[0], sa, sbb, -1); + + iinfo = GETRF_SINGLE(args, NULL, range_n_new, sa, sbb, 0); + } + +#endif + + for (i = 1; i < num_cpu; i ++) while (flag[i * CACHE_LINE_SIZE]) {}; + + TRSM_ILTCOPY(bk, bk, a + (is + is * lda) * COMPSIZE, lda, 0, sb); + + } else { + + inner_basic_thread(&newarg, NULL, &range[0], sa, sbb, -1); + + iinfo = GETRF_SINGLE(args, NULL, range_n_new, sa, sbb, 0); + } + + if (iinfo && !info) info = iinfo + is; + + } + + next_bk = init_bk; + bk = init_bk; + + is = 0; + + while (is < mn) { + + 
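+
+    /*
+     * Second pass over the same block schedule: nothing is factored
+     * here.  For each bk-column panel starting at column is, the
+     * blas_level1_thread() call below applies the row interchanges
+     * recorded while factoring the later panels (rows is+bk+1 .. mn)
+     * back to the already-factored columns via LASWP_PLUS, so that on
+     * exit ipiv describes the fully pivoted factorization, as in
+     * LAPACK's GETRF.
+     */
+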
bk = mn - is; + if (bk > next_bk) bk = next_bk; + + width = FORMULA1(m, n, is, bk, args -> nthreads); + width = (width + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); + + if (width < bk) { + next_bk = FORMULA2(m, n, is, bk, args -> nthreads); + next_bk = (next_bk + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); + + if (next_bk > bk) next_bk = bk; +#if 0 + if (next_bk < GEMM_UNROLL_N) next_bk = MIN(GEMM_UNROLL_N, mn - bk - is); +#else + if (next_bk < GEMM_UNROLL_N) next_bk = MAX(GEMM_UNROLL_N, mn - bk - is); +#endif + } + + if (width > mn - is - bk) { + next_bk = mn - is - bk; + width = next_bk; + } + + blas_level1_thread(mode, bk, is + bk + offset + 1, mn + offset, (void *)dummyalpha, + a + (- offset + is * lda) * COMPSIZE, lda, NULL, 0, + ipiv, 1, (void *)LASWP_PLUS, args -> nthreads); + + is += bk; + } + + return info; +} + +#endif + diff --git a/lapack/getrf/getrf_parallel_omp.c b/lapack/getrf/getrf_parallel_omp.c new file mode 100644 index 0000000000..b637e6db5a --- /dev/null +++ b/lapack/getrf/getrf_parallel_omp.c @@ -0,0 +1,222 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#define GEMM_PQ MAX(GEMM_P, GEMM_Q) +#define REAL_GEMM_R (GEMM_R - GEMM_PQ) + +static FLOAT dm1 = -1.; + +static void inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ + + BLASLONG is, min_i; + BLASLONG js, min_j; + BLASLONG jjs, min_jj; + + BLASLONG m = args -> m; + BLASLONG n = args -> n; + BLASLONG k = args -> k; + + BLASLONG lda = args -> lda; + BLASLONG off = args -> ldb; + + FLOAT *b = (FLOAT *)args -> b + (k ) * COMPSIZE; + FLOAT *c = (FLOAT *)args -> b + ( k * lda) * COMPSIZE; + FLOAT *d = (FLOAT *)args -> b + (k + k * lda) * COMPSIZE; + + blasint *ipiv = (blasint *)args -> c; + + if (range_n) { + n = range_n[1] - range_n[0]; + c += range_n[0] * lda * COMPSIZE; + d += range_n[0] * lda * COMPSIZE; + } + + for (js = 0; js < n; js += REAL_GEMM_R) { + min_j = n - js; + if (min_j > REAL_GEMM_R) min_j = REAL_GEMM_R; + + for (jjs = js; jjs < js + min_j; jjs += GEMM_UNROLL_N){ + min_jj = js + min_j - jjs; + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + + LASWP_NCOPY(min_jj, off + 1, off + k, + c + (- off + jjs * lda) * COMPSIZE, lda, + ipiv, sb + k * (jjs - js) * COMPSIZE); + + for (is = 0; is < k; is += GEMM_P) { + min_i = k - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + TRSM_KERNEL_LT(min_i, min_jj, k, dm1, +#ifdef COMPLEX + ZERO, +#endif + (FLOAT *)args -> a + k * is * COMPSIZE, + sb + (jjs - js) * k * COMPSIZE, + c + (is + jjs * lda) * COMPSIZE, lda, is); + } + } + + for (is = 0; is < m; is += GEMM_P){ + min_i = m - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + GEMM_ITCOPY (k, min_i, b + is * COMPSIZE, lda, sa); + + GEMM_KERNEL_N(min_i, min_j, k, dm1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb, d + (is + js * lda) * COMPSIZE, lda); + } + } +} + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG m, n, lda, offset; + blasint *ipiv, iinfo, info; + BLASLONG j, jb, mn, blocking; + FLOAT *a, *offsetA, *offsetB; + BLASLONG range_N[2]; + blas_arg_t newarg; + + int mode; + + FLOAT *sbb; + +#ifndef COMPLEX +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_REAL; +#else + mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif + + m = args -> m; + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + ipiv = (blasint *)args -> c; + offset = 0; + + if (range_n) { + m -= range_n[0]; + n = range_n[1] - range_n[0]; + offset = range_n[0]; + a += range_n[0] * (lda + 1) * COMPSIZE; + } + + if (m <= 0 || n <= 0) return 0; + + mn = MIN(m, n); + + blocking = (mn / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); + if (blocking > GEMM_Q) blocking = GEMM_Q; + + if (blocking <= GEMM_UNROLL_N * 2) { + info = GETF2(args, NULL, range_n, sa, sb, 0); + return info; + } + + sbb = (FLOAT *)((((long)(sb + blocking * blocking * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); + + info = 0; + + for (j = 0; j < mn; j += blocking) { + + jb = mn - j; + if (jb > blocking) jb = blocking; + + offsetA = a + j * lda * COMPSIZE; + offsetB = a + (j + jb) * lda * COMPSIZE; + + range_N[0] = offset + j; + range_N[1] = offset + j + jb; + + iinfo = CNAME(args, NULL, range_N, sa, sb, 0); + + if (iinfo && !info) info = iinfo + j; + + if (j + jb < n) { + + 
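+
+      /*
+       * OpenMP driver: the jb-wide panel has just been factored by the
+       * recursive CNAME call above.  Pack its lower-triangular block
+       * with TRSM_ILTCOPY, then let gemm_thread_n() run inner_thread()
+       * over column slices of the trailing matrix, where each slice
+       * computes
+       *
+       *     U12 = inv(L11) * (P * A12)   and   A22 -= L21 * U12,
+       *
+       * the same update as in getrf_parallel.c but, in effect,
+       * synchronized by the fork/join of gemm_thread_n() instead of a
+       * flag array.
+       */
+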
TRSM_ILTCOPY(jb, jb, offsetA + j * COMPSIZE, lda, 0, sb); + + newarg.m = m - jb - j; + newarg.n = n - jb - j; + newarg.k = jb; + + newarg.a = sb; + newarg.lda = lda; + newarg.b = a + (j + j * lda) * COMPSIZE; + newarg.ldb = j + offset; + newarg.c = ipiv; + + newarg.common = NULL; + newarg.nthreads = args -> nthreads; + + gemm_thread_n(mode, &newarg, NULL, NULL, (void *)inner_thread, sa, sbb, args -> nthreads); + + } + } + + for (j = 0; j < mn; j += jb) { + jb = MIN(mn - j, blocking); + LASWP_PLUS(jb, j + jb + offset + 1, mn + offset, ZERO, +#ifdef COMPLEX + ZERO, +#endif + a - (offset - j * lda) * COMPSIZE, lda, NULL, 0 , ipiv, 1); + + } + + return info; +} diff --git a/lapack/getrf/getrf_single.c b/lapack/getrf/getrf_single.c new file mode 100644 index 0000000000..a761dee4c4 --- /dev/null +++ b/lapack/getrf/getrf_single.c @@ -0,0 +1,173 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#define GEMM_PQ MAX(GEMM_P, GEMM_Q) +#define REAL_GEMM_R (GEMM_R - GEMM_PQ) + +static FLOAT dm1 = -1.; + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG m, n, lda, offset; + BLASLONG j, js, jmin, is, imin, jc, jcmin; + BLASLONG jjs, min_jj; + blasint *ipiv, iinfo, info; + BLASLONG jb, mn, blocking; + FLOAT *a, *offsetA, *offsetB; + BLASLONG range_N[2]; + + FLOAT *sbb; + + m = args -> m; + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + ipiv = (blasint *)args -> c; + offset = 0; + + if (range_n) { + m -= range_n[0]; + n = range_n[1] - range_n[0]; + offset = range_n[0]; + a += range_n[0] * (lda + 1) * COMPSIZE; + } + + if (m <= 0 || n <= 0) return 0; + + mn = MIN(m, n); + + blocking = (mn / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); + if (blocking > GEMM_Q) blocking = GEMM_Q; + + if (blocking <= GEMM_UNROLL_N * 2) { + info = GETF2(args, NULL, range_n, sa, sb, 0); + return info; + } + + sbb = (FLOAT *)((((long)(sb + blocking * blocking * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); + + info = 0; + + for (j = 0; j < mn; j += blocking) { + + jb = mn - j; + if (jb > blocking) jb = blocking; + + offsetA = a + j * lda * COMPSIZE; + offsetB = a + (j + jb) * lda * COMPSIZE; + + range_N[0] = offset + j; + range_N[1] = offset + j + jb; + + iinfo = CNAME(args, NULL, range_N, sa, sb, 0); + + if (iinfo && !info) info = iinfo + j; + + if (j + jb < n) { + + TRSM_ILTCOPY(jb, jb, offsetA + j * COMPSIZE, lda, 0, sb); + + for (js = j + jb; js < n; js += REAL_GEMM_R){ + jmin = n - js; + if (jmin > REAL_GEMM_R) jmin = REAL_GEMM_R; + + for (jjs = js; jjs < js + jmin; jjs += GEMM_UNROLL_N){ + min_jj = js + jmin - jjs; + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + +#if 0 + LASWP_PLUS(min_jj, j + offset + 1, j + jb + offset, ZERO, +#ifdef COMPLEX + ZERO, +#endif + a + (- offset + jjs * lda) * COMPSIZE, lda, NULL, 0 , ipiv, 1); + + GEMM_ONCOPY (jb, min_jj, a + (j + jjs * lda) * COMPSIZE, lda, sbb + jb * (jjs - js) * COMPSIZE); +#else + LASWP_NCOPY(min_jj, j + offset + 1, j + jb + offset, + a + (- offset + jjs * lda) * COMPSIZE, lda, ipiv, sbb + jb * (jjs - js) * COMPSIZE); +#endif + + + for (jc = 0; jc < jb; jc += GEMM_P) { + jcmin = jb - jc; + if (jcmin > GEMM_P) jcmin = GEMM_P; + + TRSM_KERNEL_LT(jcmin, min_jj, jb, dm1, +#ifdef COMPLEX + ZERO, +#endif + sb + jb * jc * COMPSIZE, + sbb + jb * (jjs - js) * COMPSIZE, + a + (j + jc + jjs * lda) * COMPSIZE, lda, jc); + } + } + + + for (is = j + jb; is < m; is += GEMM_P){ + + imin = m - is; + if (imin > GEMM_P) imin = GEMM_P; + + GEMM_ITCOPY (jb, imin, offsetA + is * COMPSIZE, lda, sa); + + GEMM_KERNEL_N(imin, jmin, jb, dm1, +#ifdef COMPLEX + ZERO, +#endif + sa, sbb, a + (is + js * lda) * COMPSIZE, lda); + } + } + } + } + + for (j = 0; j < mn; j += jb) { + jb = MIN(mn - j, blocking); + LASWP_PLUS(jb, j + jb + offset + 1, mn + offset, ZERO, +#ifdef COMPLEX + ZERO, +#endif + a - (offset - j * lda) * COMPSIZE, lda, NULL, 0 , ipiv, 1); + + } + + return info; +} diff --git a/lapack/getri/cgetri.f b/lapack/getri/cgetri.f new file mode 100644 index 0000000000..6840f531c8 --- /dev/null +++ b/lapack/getri/cgetri.f @@ -0,0 +1,194 @@ + SUBROUTINE CGETRI( N, A, LDA, IPIV, WORK, LWORK, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. 
of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* June 30, 1999 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LWORK, N +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + COMPLEX A( LDA, * ), WORK( * ) +* .. +* +* Purpose +* ======= +* +* CGETRI computes the inverse of a matrix using the LU factorization +* computed by CGETRF. +* +* This method inverts U and then computes inv(A) by solving the system +* inv(A)*L = inv(U) for inv(A). +* +* Arguments +* ========= +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* A (input/output) COMPLEX array, dimension (LDA,N) +* On entry, the factors L and U from the factorization +* A = P*L*U as computed by CGETRF. +* On exit, if INFO = 0, the inverse of the original matrix A. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* IPIV (input) INTEGER array, dimension (N) +* The pivot indices from CGETRF; for 1<=i<=N, row i of the +* matrix was interchanged with row IPIV(i). +* +* WORK (workspace/output) COMPLEX array, dimension (LWORK) +* On exit, if INFO=0, then WORK(1) returns the optimal LWORK. +* +* LWORK (input) INTEGER +* The dimension of the array WORK. LWORK >= max(1,N). +* For optimal performance LWORK >= N*NB, where NB is +* the optimal blocksize returned by ILAENV. +* +* If LWORK = -1, then a workspace query is assumed; the routine +* only calculates the optimal size of the WORK array, returns +* this value as the first entry of the WORK array, and no error +* message related to LWORK is issued by XERBLA. +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, U(i,i) is exactly zero; the matrix is +* singular and its inverse could not be computed. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX ZERO, ONE + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ), + $ ONE = ( 1.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY + INTEGER I, IWS, J, JB, JJ, JP, LDWORK, LWKOPT, NB, + $ NBMIN, NN +* .. +* .. External Functions .. + INTEGER ILAENV + EXTERNAL ILAENV +* .. +* .. External Subroutines .. + EXTERNAL CGEMM, CGEMV, CSWAP, CTRSM, CTRTRI, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + NB = ILAENV( 1, 'CGETRI', ' ', N, -1, -1, -1 ) + LWKOPT = N*NB + WORK( 1 ) = LWKOPT + LQUERY = ( LWORK.EQ.-1 ) + IF( N.LT.0 ) THEN + INFO = -1 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -3 + ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN + INFO = -6 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CGETRI', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* +* Form inv(U). If INFO > 0 from CTRTRI, then U is singular, +* and the inverse is not computed. +* + CALL CTRTRI( 'Upper', 'Non-unit', N, A, LDA, INFO ) + IF( INFO.GT.0 ) + $ RETURN +* + NBMIN = 2 + LDWORK = N + IF( NB.GT.1 .AND. NB.LT.N ) THEN + IWS = MAX( LDWORK*NB, 1 ) + IF( LWORK.LT.IWS ) THEN + NB = LWORK / LDWORK + NBMIN = MAX( 2, ILAENV( 2, 'CGETRI', ' ', N, -1, -1, -1 ) ) + END IF + ELSE + IWS = N + END IF +* +* Solve the equation inv(A)*L = inv(U) for inv(A). +* + IF( NB.LT.NBMIN .OR. NB.GE.N ) THEN +* +* Use unblocked code. +* + DO 20 J = N, 1, -1 +* +* Copy current column of L to WORK and replace with zeros. 
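+*
+*        With inv(U) already overwriting the upper triangle, the sweep
+*        below solves X*L = inv(U) one column at a time,
+*           X(:,J) = inv(U)(:,J) - X(:,J+1:N)*L(J+1:N,J),
+*        which is what the CGEMV call computes once column J of L has
+*        been copied to WORK and zeroed in A.
+*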
+* + DO 10 I = J + 1, N + WORK( I ) = A( I, J ) + A( I, J ) = ZERO + 10 CONTINUE +* +* Compute current column of inv(A). +* + IF( J.LT.N ) + $ CALL CGEMV( 'No transpose', N, N-J, -ONE, A( 1, J+1 ), + $ LDA, WORK( J+1 ), 1, ONE, A( 1, J ), 1 ) + 20 CONTINUE + ELSE +* +* Use blocked code. +* + NN = ( ( N-1 ) / NB )*NB + 1 + DO 50 J = NN, 1, -NB + JB = MIN( NB, N-J+1 ) +* +* Copy current block column of L to WORK and replace with +* zeros. +* + DO 40 JJ = J, J + JB - 1 + DO 30 I = JJ + 1, N + WORK( I+( JJ-J )*LDWORK ) = A( I, JJ ) + A( I, JJ ) = ZERO + 30 CONTINUE + 40 CONTINUE +* +* Compute current block column of inv(A). +* + IF( J+JB.LE.N ) + $ CALL CGEMM( 'No transpose', 'No transpose', N, JB, + $ N-J-JB+1, -ONE, A( 1, J+JB ), LDA, + $ WORK( J+JB ), LDWORK, ONE, A( 1, J ), LDA ) + CALL CTRSM( 'Right', 'Lower', 'No transpose', 'Unit', N, JB, + $ ONE, WORK( J ), LDWORK, A( 1, J ), LDA ) + 50 CONTINUE + END IF +* +* Apply column interchanges. +* + DO 60 J = N - 1, 1, -1 + JP = IPIV( J ) + IF( JP.NE.J ) + $ CALL CSWAP( N, A( 1, J ), 1, A( 1, JP ), 1 ) + 60 CONTINUE +* + WORK( 1 ) = IWS + RETURN +* +* End of CGETRI +* + END diff --git a/lapack/getri/dgetri.f b/lapack/getri/dgetri.f new file mode 100644 index 0000000000..c67a348030 --- /dev/null +++ b/lapack/getri/dgetri.f @@ -0,0 +1,193 @@ + SUBROUTINE DGETRI( N, A, LDA, IPIV, WORK, LWORK, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* June 30, 1999 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LWORK, N +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + DOUBLE PRECISION A( LDA, * ), WORK( * ) +* .. +* +* Purpose +* ======= +* +* DGETRI computes the inverse of a matrix using the LU factorization +* computed by DGETRF. +* +* This method inverts U and then computes inv(A) by solving the system +* inv(A)*L = inv(U) for inv(A). +* +* Arguments +* ========= +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* A (input/output) DOUBLE PRECISION array, dimension (LDA,N) +* On entry, the factors L and U from the factorization +* A = P*L*U as computed by DGETRF. +* On exit, if INFO = 0, the inverse of the original matrix A. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* IPIV (input) INTEGER array, dimension (N) +* The pivot indices from DGETRF; for 1<=i<=N, row i of the +* matrix was interchanged with row IPIV(i). +* +* WORK (workspace/output) DOUBLE PRECISION array, dimension (LWORK) +* On exit, if INFO=0, then WORK(1) returns the optimal LWORK. +* +* LWORK (input) INTEGER +* The dimension of the array WORK. LWORK >= max(1,N). +* For optimal performance LWORK >= N*NB, where NB is +* the optimal blocksize returned by ILAENV. +* +* If LWORK = -1, then a workspace query is assumed; the routine +* only calculates the optimal size of the WORK array, returns +* this value as the first entry of the WORK array, and no error +* message related to LWORK is issued by XERBLA. +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, U(i,i) is exactly zero; the matrix is +* singular and its inverse could not be computed. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. 
+ LOGICAL LQUERY + INTEGER I, IWS, J, JB, JJ, JP, LDWORK, LWKOPT, NB, + $ NBMIN, NN +* .. +* .. External Functions .. + INTEGER ILAENV + EXTERNAL ILAENV +* .. +* .. External Subroutines .. + EXTERNAL DGEMM, DGEMV, DSWAP, DTRSM, DTRTRI, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + NB = ILAENV( 1, 'DGETRI', ' ', N, -1, -1, -1 ) + LWKOPT = N*NB + WORK( 1 ) = LWKOPT + LQUERY = ( LWORK.EQ.-1 ) + IF( N.LT.0 ) THEN + INFO = -1 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -3 + ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN + INFO = -6 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DGETRI', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* +* Form inv(U). If INFO > 0 from DTRTRI, then U is singular, +* and the inverse is not computed. +* + CALL DTRTRI( 'Upper', 'Non-unit', N, A, LDA, INFO ) + IF( INFO.GT.0 ) + $ RETURN +* + NBMIN = 2 + LDWORK = N + IF( NB.GT.1 .AND. NB.LT.N ) THEN + IWS = MAX( LDWORK*NB, 1 ) + IF( LWORK.LT.IWS ) THEN + NB = LWORK / LDWORK + NBMIN = MAX( 2, ILAENV( 2, 'DGETRI', ' ', N, -1, -1, -1 ) ) + END IF + ELSE + IWS = N + END IF +* +* Solve the equation inv(A)*L = inv(U) for inv(A). +* + IF( NB.LT.NBMIN .OR. NB.GE.N ) THEN +* +* Use unblocked code. +* + DO 20 J = N, 1, -1 +* +* Copy current column of L to WORK and replace with zeros. +* + DO 10 I = J + 1, N + WORK( I ) = A( I, J ) + A( I, J ) = ZERO + 10 CONTINUE +* +* Compute current column of inv(A). +* + IF( J.LT.N ) + $ CALL DGEMV( 'No transpose', N, N-J, -ONE, A( 1, J+1 ), + $ LDA, WORK( J+1 ), 1, ONE, A( 1, J ), 1 ) + 20 CONTINUE + ELSE +* +* Use blocked code. +* + NN = ( ( N-1 ) / NB )*NB + 1 + DO 50 J = NN, 1, -NB + JB = MIN( NB, N-J+1 ) +* +* Copy current block column of L to WORK and replace with +* zeros. +* + DO 40 JJ = J, J + JB - 1 + DO 30 I = JJ + 1, N + WORK( I+( JJ-J )*LDWORK ) = A( I, JJ ) + A( I, JJ ) = ZERO + 30 CONTINUE + 40 CONTINUE +* +* Compute current block column of inv(A). +* + IF( J+JB.LE.N ) + $ CALL DGEMM( 'No transpose', 'No transpose', N, JB, + $ N-J-JB+1, -ONE, A( 1, J+JB ), LDA, + $ WORK( J+JB ), LDWORK, ONE, A( 1, J ), LDA ) + CALL DTRSM( 'Right', 'Lower', 'No transpose', 'Unit', N, JB, + $ ONE, WORK( J ), LDWORK, A( 1, J ), LDA ) + 50 CONTINUE + END IF +* +* Apply column interchanges. +* + DO 60 J = N - 1, 1, -1 + JP = IPIV( J ) + IF( JP.NE.J ) + $ CALL DSWAP( N, A( 1, J ), 1, A( 1, JP ), 1 ) + 60 CONTINUE +* + WORK( 1 ) = IWS + RETURN +* +* End of DGETRI +* + END diff --git a/lapack/getri/sgetri.f b/lapack/getri/sgetri.f new file mode 100644 index 0000000000..ec5932f166 --- /dev/null +++ b/lapack/getri/sgetri.f @@ -0,0 +1,193 @@ + SUBROUTINE SGETRI( N, A, LDA, IPIV, WORK, LWORK, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* June 30, 1999 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LWORK, N +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + REAL A( LDA, * ), WORK( * ) +* .. +* +* Purpose +* ======= +* +* SGETRI computes the inverse of a matrix using the LU factorization +* computed by SGETRF. +* +* This method inverts U and then computes inv(A) by solving the system +* inv(A)*L = inv(U) for inv(A). +* +* Arguments +* ========= +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. 
+* +* A (input/output) REAL array, dimension (LDA,N) +* On entry, the factors L and U from the factorization +* A = P*L*U as computed by SGETRF. +* On exit, if INFO = 0, the inverse of the original matrix A. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* IPIV (input) INTEGER array, dimension (N) +* The pivot indices from SGETRF; for 1<=i<=N, row i of the +* matrix was interchanged with row IPIV(i). +* +* WORK (workspace/output) REAL array, dimension (LWORK) +* On exit, if INFO=0, then WORK(1) returns the optimal LWORK. +* +* LWORK (input) INTEGER +* The dimension of the array WORK. LWORK >= max(1,N). +* For optimal performance LWORK >= N*NB, where NB is +* the optimal blocksize returned by ILAENV. +* +* If LWORK = -1, then a workspace query is assumed; the routine +* only calculates the optimal size of the WORK array, returns +* this value as the first entry of the WORK array, and no error +* message related to LWORK is issued by XERBLA. +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, U(i,i) is exactly zero; the matrix is +* singular and its inverse could not be computed. +* +* ===================================================================== +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY + INTEGER I, IWS, J, JB, JJ, JP, LDWORK, LWKOPT, NB, + $ NBMIN, NN +* .. +* .. External Functions .. + INTEGER ILAENV + EXTERNAL ILAENV +* .. +* .. External Subroutines .. + EXTERNAL SGEMM, SGEMV, SSWAP, STRSM, STRTRI, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + NB = ILAENV( 1, 'SGETRI', ' ', N, -1, -1, -1 ) + LWKOPT = N*NB + WORK( 1 ) = LWKOPT + LQUERY = ( LWORK.EQ.-1 ) + IF( N.LT.0 ) THEN + INFO = -1 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -3 + ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN + INFO = -6 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SGETRI', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* +* Form inv(U). If INFO > 0 from STRTRI, then U is singular, +* and the inverse is not computed. +* + CALL STRTRI( 'Upper', 'Non-unit', N, A, LDA, INFO ) + IF( INFO.GT.0 ) + $ RETURN +* + NBMIN = 2 + LDWORK = N + IF( NB.GT.1 .AND. NB.LT.N ) THEN + IWS = MAX( LDWORK*NB, 1 ) + IF( LWORK.LT.IWS ) THEN + NB = LWORK / LDWORK + NBMIN = MAX( 2, ILAENV( 2, 'SGETRI', ' ', N, -1, -1, -1 ) ) + END IF + ELSE + IWS = N + END IF +* +* Solve the equation inv(A)*L = inv(U) for inv(A). +* + IF( NB.LT.NBMIN .OR. NB.GE.N ) THEN +* +* Use unblocked code. +* + DO 20 J = N, 1, -1 +* +* Copy current column of L to WORK and replace with zeros. +* + DO 10 I = J + 1, N + WORK( I ) = A( I, J ) + A( I, J ) = ZERO + 10 CONTINUE +* +* Compute current column of inv(A). +* + IF( J.LT.N ) + $ CALL SGEMV( 'No transpose', N, N-J, -ONE, A( 1, J+1 ), + $ LDA, WORK( J+1 ), 1, ONE, A( 1, J ), 1 ) + 20 CONTINUE + ELSE +* +* Use blocked code. +* + NN = ( ( N-1 ) / NB )*NB + 1 + DO 50 J = NN, 1, -NB + JB = MIN( NB, N-J+1 ) +* +* Copy current block column of L to WORK and replace with +* zeros. +* + DO 40 JJ = J, J + JB - 1 + DO 30 I = JJ + 1, N + WORK( I+( JJ-J )*LDWORK ) = A( I, JJ ) + A( I, JJ ) = ZERO + 30 CONTINUE + 40 CONTINUE +* +* Compute current block column of inv(A). 
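+*
+*        Blocked form of the same sweep: with L11 the unit lower
+*        triangular diagonal block held in WORK and L21 the block of L
+*        below it,
+*           X(:,J:J+JB-1) = ( inv(U)(:,J:J+JB-1) - X(:,J+JB:N)*L21 )
+*                           * inv(L11),
+*        computed by the SGEMM / STRSM pair below.
+*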
+* + IF( J+JB.LE.N ) + $ CALL SGEMM( 'No transpose', 'No transpose', N, JB, + $ N-J-JB+1, -ONE, A( 1, J+JB ), LDA, + $ WORK( J+JB ), LDWORK, ONE, A( 1, J ), LDA ) + CALL STRSM( 'Right', 'Lower', 'No transpose', 'Unit', N, JB, + $ ONE, WORK( J ), LDWORK, A( 1, J ), LDA ) + 50 CONTINUE + END IF +* +* Apply column interchanges. +* + DO 60 J = N - 1, 1, -1 + JP = IPIV( J ) + IF( JP.NE.J ) + $ CALL SSWAP( N, A( 1, J ), 1, A( 1, JP ), 1 ) + 60 CONTINUE +* + WORK( 1 ) = IWS + RETURN +* +* End of SGETRI +* + END diff --git a/lapack/getri/zgetri.f b/lapack/getri/zgetri.f new file mode 100644 index 0000000000..1eb4eb7f18 --- /dev/null +++ b/lapack/getri/zgetri.f @@ -0,0 +1,194 @@ + SUBROUTINE ZGETRI( N, A, LDA, IPIV, WORK, LWORK, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* June 30, 1999 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LWORK, N +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + COMPLEX*16 A( LDA, * ), WORK( * ) +* .. +* +* Purpose +* ======= +* +* ZGETRI computes the inverse of a matrix using the LU factorization +* computed by ZGETRF. +* +* This method inverts U and then computes inv(A) by solving the system +* inv(A)*L = inv(U) for inv(A). +* +* Arguments +* ========= +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* A (input/output) COMPLEX*16 array, dimension (LDA,N) +* On entry, the factors L and U from the factorization +* A = P*L*U as computed by ZGETRF. +* On exit, if INFO = 0, the inverse of the original matrix A. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* IPIV (input) INTEGER array, dimension (N) +* The pivot indices from ZGETRF; for 1<=i<=N, row i of the +* matrix was interchanged with row IPIV(i). +* +* WORK (workspace/output) COMPLEX*16 array, dimension (LWORK) +* On exit, if INFO=0, then WORK(1) returns the optimal LWORK. +* +* LWORK (input) INTEGER +* The dimension of the array WORK. LWORK >= max(1,N). +* For optimal performance LWORK >= N*NB, where NB is +* the optimal blocksize returned by ILAENV. +* +* If LWORK = -1, then a workspace query is assumed; the routine +* only calculates the optimal size of the WORK array, returns +* this value as the first entry of the WORK array, and no error +* message related to LWORK is issued by XERBLA. +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, U(i,i) is exactly zero; the matrix is +* singular and its inverse could not be computed. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX*16 ZERO, ONE + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ), + $ ONE = ( 1.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY + INTEGER I, IWS, J, JB, JJ, JP, LDWORK, LWKOPT, NB, + $ NBMIN, NN +* .. +* .. External Functions .. + INTEGER ILAENV + EXTERNAL ILAENV +* .. +* .. External Subroutines .. + EXTERNAL XERBLA, ZGEMM, ZGEMV, ZSWAP, ZTRSM, ZTRTRI +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + NB = ILAENV( 1, 'ZGETRI', ' ', N, -1, -1, -1 ) + LWKOPT = N*NB + WORK( 1 ) = LWKOPT + LQUERY = ( LWORK.EQ.-1 ) + IF( N.LT.0 ) THEN + INFO = -1 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -3 + ELSE IF( LWORK.LT.MAX( 1, N ) .AND. 
.NOT.LQUERY ) THEN + INFO = -6 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZGETRI', -INFO ) + RETURN + ELSE IF( LQUERY ) THEN + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* +* Form inv(U). If INFO > 0 from ZTRTRI, then U is singular, +* and the inverse is not computed. +* + CALL ZTRTRI( 'Upper', 'Non-unit', N, A, LDA, INFO ) + IF( INFO.GT.0 ) + $ RETURN +* + NBMIN = 2 + LDWORK = N + IF( NB.GT.1 .AND. NB.LT.N ) THEN + IWS = MAX( LDWORK*NB, 1 ) + IF( LWORK.LT.IWS ) THEN + NB = LWORK / LDWORK + NBMIN = MAX( 2, ILAENV( 2, 'ZGETRI', ' ', N, -1, -1, -1 ) ) + END IF + ELSE + IWS = N + END IF +* +* Solve the equation inv(A)*L = inv(U) for inv(A). +* + IF( NB.LT.NBMIN .OR. NB.GE.N ) THEN +* +* Use unblocked code. +* + DO 20 J = N, 1, -1 +* +* Copy current column of L to WORK and replace with zeros. +* + DO 10 I = J + 1, N + WORK( I ) = A( I, J ) + A( I, J ) = ZERO + 10 CONTINUE +* +* Compute current column of inv(A). +* + IF( J.LT.N ) + $ CALL ZGEMV( 'No transpose', N, N-J, -ONE, A( 1, J+1 ), + $ LDA, WORK( J+1 ), 1, ONE, A( 1, J ), 1 ) + 20 CONTINUE + ELSE +* +* Use blocked code. +* + NN = ( ( N-1 ) / NB )*NB + 1 + DO 50 J = NN, 1, -NB + JB = MIN( NB, N-J+1 ) +* +* Copy current block column of L to WORK and replace with +* zeros. +* + DO 40 JJ = J, J + JB - 1 + DO 30 I = JJ + 1, N + WORK( I+( JJ-J )*LDWORK ) = A( I, JJ ) + A( I, JJ ) = ZERO + 30 CONTINUE + 40 CONTINUE +* +* Compute current block column of inv(A). +* + IF( J+JB.LE.N ) + $ CALL ZGEMM( 'No transpose', 'No transpose', N, JB, + $ N-J-JB+1, -ONE, A( 1, J+JB ), LDA, + $ WORK( J+JB ), LDWORK, ONE, A( 1, J ), LDA ) + CALL ZTRSM( 'Right', 'Lower', 'No transpose', 'Unit', N, JB, + $ ONE, WORK( J ), LDWORK, A( 1, J ), LDA ) + 50 CONTINUE + END IF +* +* Apply column interchanges. +* + DO 60 J = N - 1, 1, -1 + JP = IPIV( J ) + IF( JP.NE.J ) + $ CALL ZSWAP( N, A( 1, J ), 1, A( 1, JP ), 1 ) + 60 CONTINUE +* + WORK( 1 ) = IWS + RETURN +* +* End of ZGETRI +* + END diff --git a/lapack/getrs/Makefile b/lapack/getrs/Makefile new file mode 100644 index 0000000000..2640ef0975 --- /dev/null +++ b/lapack/getrs/Makefile @@ -0,0 +1,236 @@ +TOPDIR = ../.. 
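+
+# All objects in this directory are specialized from four template
+# sources: getrs_single.c / getrs_parallel.c for the real types and
+# zgetrs_single.c / zgetrs_parallel.c for the complex ones.  Precision
+# is selected with -UDOUBLE, -DDOUBLE or -DXDOUBLE, the transpose mode
+# with -UTRANS / -DTRANS for real and -DTRANS=1..4 (N/T/R/C) for
+# complex.  The $(SUFFIX) rules use $(CFLAGS); the matching $(PSUFFIX)
+# rules further down build the profiled objects with $(PFLAGS).  The
+# *_parallel objects are only added when SMP is set.
+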
+include ../../Makefile.system + +SBLASOBJS = sgetrs_N_single.$(SUFFIX) sgetrs_T_single.$(SUFFIX) +DBLASOBJS = dgetrs_N_single.$(SUFFIX) dgetrs_T_single.$(SUFFIX) +QBLASOBJS = qgetrs_N_single.$(SUFFIX) qgetrs_T_single.$(SUFFIX) +CBLASOBJS = cgetrs_N_single.$(SUFFIX) cgetrs_T_single.$(SUFFIX) cgetrs_R_single.$(SUFFIX) cgetrs_C_single.$(SUFFIX) +ZBLASOBJS = zgetrs_N_single.$(SUFFIX) zgetrs_T_single.$(SUFFIX) zgetrs_R_single.$(SUFFIX) zgetrs_C_single.$(SUFFIX) +XBLASOBJS = xgetrs_N_single.$(SUFFIX) xgetrs_T_single.$(SUFFIX) xgetrs_R_single.$(SUFFIX) xgetrs_C_single.$(SUFFIX) + +ifdef SMP +SBLASOBJS += sgetrs_N_parallel.$(SUFFIX) sgetrs_T_parallel.$(SUFFIX) +DBLASOBJS += dgetrs_N_parallel.$(SUFFIX) dgetrs_T_parallel.$(SUFFIX) +QBLASOBJS += qgetrs_N_parallel.$(SUFFIX) qgetrs_T_parallel.$(SUFFIX) +CBLASOBJS += cgetrs_N_parallel.$(SUFFIX) cgetrs_T_parallel.$(SUFFIX) cgetrs_R_parallel.$(SUFFIX) cgetrs_C_parallel.$(SUFFIX) +ZBLASOBJS += zgetrs_N_parallel.$(SUFFIX) zgetrs_T_parallel.$(SUFFIX) zgetrs_R_parallel.$(SUFFIX) zgetrs_C_parallel.$(SUFFIX) +XBLASOBJS += xgetrs_N_parallel.$(SUFFIX) xgetrs_T_parallel.$(SUFFIX) xgetrs_R_parallel.$(SUFFIX) xgetrs_C_parallel.$(SUFFIX) +endif + +sgetrs_N_single.$(SUFFIX) : getrs_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANS $< -o $(@F) + +sgetrs_T_single.$(SUFFIX) : getrs_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANS $< -o $(@F) + +sgetrs_N_parallel.$(SUFFIX) : getrs_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANS $< -o $(@F) + +sgetrs_T_parallel.$(SUFFIX) : getrs_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANS $< -o $(@F) + +dgetrs_N_single.$(SUFFIX) : getrs_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANS $< -o $(@F) + +dgetrs_T_single.$(SUFFIX) : getrs_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANS $< -o $(@F) + +dgetrs_N_parallel.$(SUFFIX) : getrs_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANS $< -o $(@F) + +dgetrs_T_parallel.$(SUFFIX) : getrs_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANS $< -o $(@F) + +qgetrs_N_single.$(SUFFIX) : getrs_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANS $< -o $(@F) + +qgetrs_T_single.$(SUFFIX) : getrs_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANS $< -o $(@F) + +qgetrs_N_parallel.$(SUFFIX) : getrs_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UTRANS $< -o $(@F) + +qgetrs_T_parallel.$(SUFFIX) : getrs_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANS $< -o $(@F) + +cgetrs_N_single.$(SUFFIX) : zgetrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANS=1 $< -o $(@F) + +cgetrs_T_single.$(SUFFIX) : zgetrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANS=2 $< -o $(@F) + +cgetrs_R_single.$(SUFFIX) : zgetrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANS=3 $< -o $(@F) + +cgetrs_C_single.$(SUFFIX) : zgetrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANS=4 $< -o $(@F) + +cgetrs_N_parallel.$(SUFFIX) : zgetrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANS=1 $< -o $(@F) + +cgetrs_T_parallel.$(SUFFIX) : zgetrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANS=2 $< -o $(@F) + +cgetrs_R_parallel.$(SUFFIX) : zgetrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANS=3 $< -o $(@F) + +cgetrs_C_parallel.$(SUFFIX) : zgetrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANS=4 $< -o $(@F) + +zgetrs_N_single.$(SUFFIX) : zgetrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANS=1 $< -o $(@F) + +zgetrs_T_single.$(SUFFIX) : zgetrs_single.c + $(CC) -c $(CFLAGS) 
-DCOMPLEX -DDOUBLE -DTRANS=2 $< -o $(@F) + +zgetrs_R_single.$(SUFFIX) : zgetrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANS=3 $< -o $(@F) + +zgetrs_C_single.$(SUFFIX) : zgetrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANS=4 $< -o $(@F) + +zgetrs_N_parallel.$(SUFFIX) : zgetrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANS=1 $< -o $(@F) + +zgetrs_T_parallel.$(SUFFIX) : zgetrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANS=2 $< -o $(@F) + +zgetrs_R_parallel.$(SUFFIX) : zgetrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANS=3 $< -o $(@F) + +zgetrs_C_parallel.$(SUFFIX) : zgetrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANS=4 $< -o $(@F) + +xgetrs_N_single.$(SUFFIX) : zgetrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANS=1 $< -o $(@F) + +xgetrs_T_single.$(SUFFIX) : zgetrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANS=2 $< -o $(@F) + +xgetrs_R_single.$(SUFFIX) : zgetrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANS=3 $< -o $(@F) + +xgetrs_C_single.$(SUFFIX) : zgetrs_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANS=4 $< -o $(@F) + +xgetrs_N_parallel.$(SUFFIX) : zgetrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANS=1 $< -o $(@F) + +xgetrs_T_parallel.$(SUFFIX) : zgetrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANS=2 $< -o $(@F) + +xgetrs_R_parallel.$(SUFFIX) : zgetrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANS=3 $< -o $(@F) + +xgetrs_C_parallel.$(SUFFIX) : zgetrs_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANS=4 $< -o $(@F) + +sgetrs_N_single.$(PSUFFIX) : getrs_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UTRANS $< -o $(@F) + +sgetrs_T_single.$(PSUFFIX) : getrs_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DTRANS $< -o $(@F) + +sgetrs_N_parallel.$(PSUFFIX) : getrs_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UTRANS $< -o $(@F) + +sgetrs_T_parallel.$(PSUFFIX) : getrs_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DTRANS $< -o $(@F) + +dgetrs_N_single.$(PSUFFIX) : getrs_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UTRANS $< -o $(@F) + +dgetrs_T_single.$(PSUFFIX) : getrs_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DTRANS $< -o $(@F) + +dgetrs_N_parallel.$(PSUFFIX) : getrs_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UTRANS $< -o $(@F) + +dgetrs_T_parallel.$(PSUFFIX) : getrs_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DTRANS $< -o $(@F) + +qgetrs_N_single.$(PSUFFIX) : getrs_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UTRANS $< -o $(@F) + +qgetrs_T_single.$(PSUFFIX) : getrs_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DTRANS $< -o $(@F) + +qgetrs_N_parallel.$(PSUFFIX) : getrs_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UTRANS $< -o $(@F) + +qgetrs_T_parallel.$(PSUFFIX) : getrs_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DTRANS $< -o $(@F) + +cgetrs_N_single.$(PSUFFIX) : zgetrs_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANS=1 $< -o $(@F) + +cgetrs_T_single.$(PSUFFIX) : zgetrs_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANS=2 $< -o $(@F) + +cgetrs_R_single.$(PSUFFIX) : zgetrs_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANS=3 $< -o $(@F) + +cgetrs_C_single.$(PSUFFIX) : zgetrs_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANS=4 $< -o $(@F) + +cgetrs_N_parallel.$(PSUFFIX) : zgetrs_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANS=1 $< -o $(@F) + +cgetrs_T_parallel.$(PSUFFIX) : zgetrs_parallel.c + $(CC) -c $(PFLAGS) 
-DCOMPLEX -UDOUBLE -DTRANS=2 $< -o $(@F) + +cgetrs_R_parallel.$(PSUFFIX) : zgetrs_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANS=3 $< -o $(@F) + +cgetrs_C_parallel.$(PSUFFIX) : zgetrs_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DTRANS=4 $< -o $(@F) + +zgetrs_N_single.$(PSUFFIX) : zgetrs_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANS=1 $< -o $(@F) + +zgetrs_T_single.$(PSUFFIX) : zgetrs_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANS=2 $< -o $(@F) + +zgetrs_R_single.$(PSUFFIX) : zgetrs_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANS=3 $< -o $(@F) + +zgetrs_C_single.$(PSUFFIX) : zgetrs_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANS=4 $< -o $(@F) + +zgetrs_N_parallel.$(PSUFFIX) : zgetrs_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANS=1 $< -o $(@F) + +zgetrs_T_parallel.$(PSUFFIX) : zgetrs_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANS=2 $< -o $(@F) + +zgetrs_R_parallel.$(PSUFFIX) : zgetrs_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANS=3 $< -o $(@F) + +zgetrs_C_parallel.$(PSUFFIX) : zgetrs_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DTRANS=4 $< -o $(@F) + +xgetrs_N_single.$(PSUFFIX) : zgetrs_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANS=1 $< -o $(@F) + +xgetrs_T_single.$(PSUFFIX) : zgetrs_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANS=2 $< -o $(@F) + +xgetrs_R_single.$(PSUFFIX) : zgetrs_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANS=3 $< -o $(@F) + +xgetrs_C_single.$(PSUFFIX) : zgetrs_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANS=4 $< -o $(@F) + +xgetrs_N_parallel.$(PSUFFIX) : zgetrs_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANS=1 $< -o $(@F) + +xgetrs_T_parallel.$(PSUFFIX) : zgetrs_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANS=2 $< -o $(@F) + +xgetrs_R_parallel.$(PSUFFIX) : zgetrs_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANS=3 $< -o $(@F) + +xgetrs_C_parallel.$(PSUFFIX) : zgetrs_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DTRANS=4 $< -o $(@F) + +include ../../Makefile.tail diff --git a/lapack/getrs/getrs_parallel.c b/lapack/getrs/getrs_parallel.c new file mode 100644 index 0000000000..3a7e4260ad --- /dev/null +++ b/lapack/getrs/getrs_parallel.c @@ -0,0 +1,107 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, + FLOAT *sa, FLOAT *sb, BLASLONG mypos) { + + BLASLONG n = args -> n; + BLASLONG off = 0; + + if (range_n) { + n = range_n[1] - range_n[0]; + off = range_n[0]; + } + +#ifndef TRANS + LASWP_PLUS(n, 1, args -> m, ZERO, + (FLOAT *)args -> b + off * args -> ldb * COMPSIZE, args -> ldb, NULL, 0, args -> c, 1); + TRSM_LNLU (args, range_m, range_n, sa, sb, 0); + TRSM_LNUN (args, range_m, range_n, sa, sb, 0); +#else + TRSM_LTUN (args, range_m, range_n, sa, sb, 0); + TRSM_LTLU (args, range_m, range_n, sa, sb, 0); + LASWP_MINUS(n, 1, args -> m, ZERO, + (FLOAT *)args -> b + off * args -> ldb * COMPSIZE, args -> ldb, NULL, 0, args -> c, -1); +#endif + + return 0; +} + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos) { + + int mode; + +#ifndef TRANS + if (args -> n == 1){ + LASWP_PLUS(1, 1, args -> m, ZERO, args -> b, args -> ldb, NULL, 0, args -> c, 1); + TRSV_NLU (args -> m, args -> a, args -> lda, args -> b, 1, sb); + TRSV_NUN (args -> m, args -> a, args -> lda, args -> b, 1, sb); + } else { +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_REAL; +#else + mode = BLAS_SINGLE | BLAS_REAL; +#endif + + gemm_thread_n(mode, args, NULL, NULL, inner_thread, sa, sb, args -> nthreads); + } +#else + if (args -> n == 1){ + TRSV_TUN (args -> m, args -> a, args -> lda, args -> b, 1, sb); + TRSV_TLU (args -> m, args -> a, args -> lda, args -> b, 1, sb); + LASWP_MINUS(1, 1, args -> m, ZERO, args -> b, args -> ldb, NULL, 0, args -> c, -1); + } else { +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_REAL | (1 << BLAS_TRANSA_SHIFT); +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_REAL | (1 << BLAS_TRANSA_SHIFT); +#else + mode = BLAS_SINGLE | BLAS_REAL | (1 << BLAS_TRANSA_SHIFT); +#endif + + gemm_thread_n(mode, args, NULL, NULL, inner_thread, sa, sb, args -> nthreads); + } +#endif + + return 0; + } diff --git a/lapack/getrs/getrs_single.c b/lapack/getrs/getrs_single.c new file mode 100644 index 0000000000..0dbb03869c --- /dev/null +++ b/lapack/getrs/getrs_single.c @@ -0,0 +1,68 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. 
Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos) { + +#ifndef TRANS + LASWP_PLUS(args -> n, 1, args -> m, ZERO, args -> b, args -> ldb, NULL, 0, args -> c, 1); + + if (args -> n == 1){ + TRSV_NLU (args -> m, args -> a, args -> lda, args -> b, 1, sb); + TRSV_NUN (args -> m, args -> a, args -> lda, args -> b, 1, sb); + } else { + TRSM_LNLU (args, range_m, range_n, sa, sb, 0); + TRSM_LNUN (args, range_m, range_n, sa, sb, 0); + } + +#else + + if (args -> n == 1){ + TRSV_TUN (args -> m, args -> a, args -> lda, args -> b, 1, sb); + TRSV_TLU (args -> m, args -> a, args -> lda, args -> b, 1, sb); + } else { + TRSM_LTUN (args, range_m, range_n, sa, sb, 0); + TRSM_LTLU (args, range_m, range_n, sa, sb, 0); + } + + LASWP_MINUS(args -> n, 1, args -> m, ZERO, args -> b, args -> ldb, NULL, 0, args -> c, -1); +#endif + + return 0; } diff --git a/lapack/getrs/zgetrs_parallel.c b/lapack/getrs/zgetrs_parallel.c new file mode 100644 index 0000000000..b0d3fb0c2f --- /dev/null +++ b/lapack/getrs/zgetrs_parallel.c @@ -0,0 +1,113 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
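/*
 * For orientation, a rough single-threaded reference for what the
 * no-transpose path of getrs_single.c above computes: apply the row
 * interchanges recorded by getrf to the right-hand side, then do a
 * forward substitution with the unit lower factor and a back substitution
 * with the upper factor.  This is only an illustration (hypothetical
 * helper, double precision, one right-hand side, column-major A with
 * leading dimension lda); the real code calls LASWP_PLUS and two blocked
 * TRSM kernels instead.
 */
static void getrs_n_reference(int m, const double *a, int lda,
                              const int *ipiv, double *b) {
  int i, j;

  /* P * b : apply the interchanges from getrf (ipiv is one-based) */
  for (i = 0; i < m; i++) {
    int ip = ipiv[i] - 1;
    if (ip != i) { double t = b[i]; b[i] = b[ip]; b[ip] = t; }
  }

  /* L^{-1} * b : forward substitution, L unit lower triangular */
  for (j = 0; j < m; j++)
    for (i = j + 1; i < m; i++)
      b[i] -= a[i + j * lda] * b[j];

  /* U^{-1} * b : back substitution, U non-unit upper triangular */
  for (j = m - 1; j >= 0; j--) {
    b[j] /= a[j + j * lda];
    for (i = 0; i < j; i++)
      b[i] -= a[i + j * lda] * b[j];
  }
}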
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, + FLOAT *sa, FLOAT *sb, BLASLONG mypos) { + + BLASLONG n = args -> n; + BLASLONG off = 0; + + if (range_n) { + n = range_n[1] - range_n[0]; + off = range_n[0]; + } + +#if TRANS == 1 + LASWP_PLUS(n, 1, args -> m, ZERO, ZERO, + (FLOAT *)args -> b + off * args -> ldb * COMPSIZE, args -> ldb, NULL, 0, args -> c, 1); + TRSM_LNLU (args, range_m, range_n, sa, sb, 0); + TRSM_LNUN (args, range_m, range_n, sa, sb, 0); +#elif TRANS == 2 + TRSM_LTUN (args, range_m, range_n, sa, sb, 0); + TRSM_LTLU (args, range_m, range_n, sa, sb, 0); + LASWP_MINUS(n, 1, args -> m, ZERO, ZERO, + (FLOAT *)args -> b + off * args -> ldb * COMPSIZE, args -> ldb, NULL, 0, args -> c, -1); +#elif TRANS == 3 + LASWP_PLUS(n, 1, args -> m, ZERO, ZERO, + (FLOAT *)args -> b + off * args -> ldb * COMPSIZE, args -> ldb, NULL, 0, args -> c, 1); + TRSM_LRLU (args, range_m, range_n, sa, sb, 0); + TRSM_LRUN (args, range_m, range_n, sa, sb, 0); +#else + TRSM_LCUN (args, range_m, range_n, sa, sb, 0); + TRSM_LCLU (args, range_m, range_n, sa, sb, 0); + LASWP_MINUS(n, 1, args -> m, ZERO, ZERO, + (FLOAT *)args -> b + off * args -> ldb * COMPSIZE, args -> ldb, NULL, 0, args -> c, -1); +#endif + + return 0; +} + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos) { + + int mode; + + if (args -> n == 1){ +#if TRANS == 1 + LASWP_PLUS(1, 1, args -> m, ZERO, ZERO, args -> b, args -> ldb, NULL, 0, args -> c, 1); + ZTRSV_NLU (args -> m, args -> a, args -> lda, args -> b, 1, sb); + ZTRSV_NUN (args -> m, args -> a, args -> lda, args -> b, 1, sb); +#elif TRANS == 2 + ZTRSV_TUN (args -> m, args -> a, args -> lda, args -> b, 1, sb); + ZTRSV_TLU (args -> m, args -> a, args -> lda, args -> b, 1, sb); + LASWP_MINUS(1, 1, args -> m, ZERO, ZERO, args -> b, args -> ldb, NULL, 0, args -> c, -1); +#elif TRANS == 3 + LASWP_PLUS(1, 1, args -> m, ZERO, ZERO, args -> b, args -> ldb, NULL, 0, args -> c, 1); + ZTRSV_RLU (args -> m, args -> a, args -> lda, args -> b, 1, sb); + ZTRSV_RUN (args -> m, args -> a, args -> lda, args -> b, 1, sb); +#else + ZTRSV_CUN (args -> m, args -> a, args -> lda, args -> b, 1, sb); + ZTRSV_CLU (args -> m, args -> a, args -> lda, args -> b, 1, sb); + LASWP_MINUS(1, 1, args -> m, ZERO, 
ZERO, args -> b, args -> ldb, NULL, 0, args -> c, -1); +#endif + } else { +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif + + gemm_thread_n(mode, args, NULL, NULL, inner_thread, sa, sb, args -> nthreads); + } + + return 0; + } diff --git a/lapack/getrs/zgetrs_single.c b/lapack/getrs/zgetrs_single.c new file mode 100644 index 0000000000..3910d0e639 --- /dev/null +++ b/lapack/getrs/zgetrs_single.c @@ -0,0 +1,66 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
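/*
 * In zgetrs_parallel.c above, the multi-RHS case sets a mode word
 * (precision, BLAS_COMPLEX, and the transpose bit) and hands inner_thread
 * to gemm_thread_n, which appears to give each thread a contiguous block
 * of right-hand-side columns through range_n[0]..range_n[1].  A sketch of
 * that kind of column split (illustrative only; the names nrhs, nthreads
 * and ranges are local to this sketch, and the real driver may round the
 * block widths differently):
 */
static void split_columns(int nrhs, int nthreads, int *ranges /* nthreads + 1 */) {
  int t, width = (nrhs + nthreads - 1) / nthreads;

  ranges[0] = 0;
  for (t = 1; t <= nthreads; t++) {
    ranges[t] = ranges[t - 1] + width;
    if (ranges[t] > nrhs) ranges[t] = nrhs;
  }
  /* thread t then solves for columns ranges[t] .. ranges[t + 1] - 1 of B
     via inner_thread()                                                  */
}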
*/ +/*********************************************************************/ + +#include +#include "common.h" + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos) { + +#if TRANS == 1 + LASWP_PLUS (args -> n, 1, args -> m, ZERO, ZERO, args -> b, args -> ldb, NULL, 0, args -> c, 1); + + TRSM_LNLU(args, range_m, range_n, sa, sb, 0); + TRSM_LNUN(args, range_m, range_n, sa, sb, 0); +#elif TRANS == 2 + TRSM_LTUN(args, range_m, range_n, sa, sb, 0); + TRSM_LTLU(args, range_m, range_n, sa, sb, 0); + + LASWP_MINUS(args -> n, 1, args -> m, ZERO, ZERO, args -> b, args -> ldb, NULL, 0, args -> c, -1); +#elif TRANS == 3 + LASWP_PLUS (args -> n, 1, args -> m, ZERO, ZERO, args -> b, args -> ldb, NULL, 0, args -> c, 1); + + TRSM_LRLU(args, range_m, range_n, sa, sb, 0); + TRSM_LRUN(args, range_m, range_n, sa, sb, 0); +#else + TRSM_LCUN(args, range_m, range_n, sa, sb, 0); + TRSM_LCLU(args, range_m, range_n, sa, sb, 0); + + LASWP_MINUS(args -> n, 1, args -> m, ZERO, ZERO, args -> b, args -> ldb, NULL, 0, args -> c, -1); +#endif + return 0; + } diff --git a/lapack/laswp/Makefile b/lapack/laswp/Makefile new file mode 100644 index 0000000000..389800692d --- /dev/null +++ b/lapack/laswp/Makefile @@ -0,0 +1,22 @@ +TOPDIR = ../.. +include ../../Makefile.system + +SBLASOBJS = slaswp_plus.$(SUFFIX) slaswp_minus.$(SUFFIX) +DBLASOBJS = dlaswp_plus.$(SUFFIX) dlaswp_minus.$(SUFFIX) +QBLASOBJS = qlaswp_plus.$(SUFFIX) qlaswp_minus.$(SUFFIX) +CBLASOBJS = claswp_plus.$(SUFFIX) claswp_minus.$(SUFFIX) +ZBLASOBJS = zlaswp_plus.$(SUFFIX) zlaswp_minus.$(SUFFIX) +XBLASOBJS = xlaswp_plus.$(SUFFIX) xlaswp_minus.$(SUFFIX) + +slaswp_plus.$(SUFFIX) slaswp_minus.$(SUFFIX) dlaswp_plus.$(SUFFIX) dlaswp_minus.$(SUFFIX) \ +qlaswp_plus.$(SUFFIX) qlaswp_minus.$(SUFFIX) \ +claswp_plus.$(SUFFIX) claswp_minus.$(SUFFIX) zlaswp_plus.$(SUFFIX) zlaswp_minus.$(SUFFIX) \ +xlaswp_plus.$(SUFFIX) xlaswp_minus.$(SUFFIX) \ +slaswp_plus.$(PSUFFIX) slaswp_minus.$(PSUFFIX) dlaswp_plus.$(PSUFFIX) dlaswp_minus.$(PSUFFIX) \ +qlaswp_plus.$(PSUFFIX) qlaswp_minus.$(PSUFFIX) \ +claswp_plus.$(PSUFFIX) claswp_minus.$(PSUFFIX) zlaswp_plus.$(PSUFFIX) zlaswp_minus.$(PSUFFIX) \ +xlaswp_plus.$(PSUFFIX) xlaswp_minus.$(PSUFFIX) : dummy + cd $(ARCH) && $(MAKE) ../$(@F) + +include ../../Makefile.tail + diff --git a/lapack/laswp/alpha/Makefile b/lapack/laswp/alpha/Makefile new file mode 100644 index 0000000000..af1f0199c0 --- /dev/null +++ b/lapack/laswp/alpha/Makefile @@ -0,0 +1,8 @@ +TOPDIR = ../../.. +include ../../../Makefile.system + +LASWP = ../generic/laswp_k_1.c +ZLASWP = ../generic/zlaswp_k_1.c + +include ../generic/Makefile + diff --git a/lapack/laswp/generic/Makefile b/lapack/laswp/generic/Makefile new file mode 100644 index 0000000000..bc9ab80bd1 --- /dev/null +++ b/lapack/laswp/generic/Makefile @@ -0,0 +1,95 @@ +ifndef INCLUDED +TOPDIR = ../../.. 
+include $(TOPDIR)/Makefile.system +endif + +ifndef LASWP +LASWP = ../generic/laswp_k.c +endif + +ifndef ZLASWP +ZLASWP = ../generic/zlaswp_k.c +endif + +LASWP_DEPS = ../generic/laswp_k_1.c ../generic/laswp_k_2.c \ + ../generic/laswp_k_4.c ../generic/laswp_k_8.c + +ZLASWP_DEPS = ../generic/zlaswp_k_1.c ../generic/zlaswp_k_2.c \ + ../generic/zlaswp_k_4.c + +include ../../../Makefile.tail + +all: + +../slaswp_plus.$(SUFFIX) : $(LASWP) $(LASWP_DEPS) + $(CC) -c $(CFLAGS) -UDOUBLE -UMINUS $< -o ../$(@F) + +../slaswp_minus.$(SUFFIX) : $(LASWP) $(LASWP_DEPS) + $(CC) -c $(CFLAGS) -UDOUBLE -DMINUS $< -o ../$(@F) + +../dlaswp_plus.$(SUFFIX) : $(LASWP) $(LASWP_DEPS) + $(CC) -c $(CFLAGS) -DDOUBLE -UMINUS $< -o ../$(@F) + +../dlaswp_minus.$(SUFFIX) : $(LASWP) $(LASWP_DEPS) + $(CC) -c $(CFLAGS) -DDOUBLE -DMINUS $< -o ../$(@F) + +../qlaswp_plus.$(SUFFIX) : $(LASWP) $(LASWP_DEPS) + $(CC) -c $(CFLAGS) -DXDOUBLE -UMINUS $< -o ../$(@F) + +../qlaswp_minus.$(SUFFIX) : $(LASWP) $(LASWP_DEPS) + $(CC) -c $(CFLAGS) -DXDOUBLE -DMINUS $< -o ../$(@F) + +../claswp_plus.$(SUFFIX) : $(ZLASWP) $(ZLASWP_DEPS) + $(CC) -c $(CFLAGS) -UDOUBLE -UMINUS $< -o ../$(@F) + +../claswp_minus.$(SUFFIX) : $(ZLASWP) $(ZLASWP_DEPS) + $(CC) -c $(CFLAGS) -UDOUBLE -DMINUS $< -o ../$(@F) + +../zlaswp_plus.$(SUFFIX) : $(ZLASWP) $(ZLASWP_DEPS) + $(CC) -c $(CFLAGS) -DDOUBLE -UMINUS $< -o ../$(@F) + +../zlaswp_minus.$(SUFFIX) : $(ZLASWP) $(ZLASWP_DEPS) + $(CC) -c $(CFLAGS) -DDOUBLE -DMINUS $< -o ../$(@F) + +../xlaswp_plus.$(SUFFIX) : $(ZLASWP) $(ZLASWP_DEPS) + $(CC) -c $(CFLAGS) -DXDOUBLE -UMINUS $< -o ../$(@F) + +../xlaswp_minus.$(SUFFIX) : $(ZLASWP) $(ZLASWP_DEPS) + $(CC) -c $(CFLAGS) -DXDOUBLE -DMINUS $< -o ../$(@F) + +../slaswp_plus.$(PSUFFIX) : $(LASWP) + $(CC) -c $(PFLAGS) -UDOUBLE -UMINUS $< -o ../$(@F) + +../slaswp_minus.$(PSUFFIX) : $(LASWP) + $(CC) -c $(PFLAGS) -UDOUBLE -DMINUS $< -o ../$(@F) + +../dlaswp_plus.$(PSUFFIX) : $(LASWP) + $(CC) -c $(PFLAGS) -DDOUBLE -UMINUS $< -o ../$(@F) + +../dlaswp_minus.$(PSUFFIX) : $(LASWP) + $(CC) -c $(PFLAGS) -DDOUBLE -DMINUS $< -o ../$(@F) + +../qlaswp_plus.$(PSUFFIX) : $(LASWP) + $(CC) -c $(PFLAGS) -DXDOUBLE -UMINUS $< -o ../$(@F) + +../qlaswp_minus.$(PSUFFIX) : $(LASWP) + $(CC) -c $(PFLAGS) -DXDOUBLE -DMINUS $< -o ../$(@F) + +../claswp_plus.$(PSUFFIX) : $(ZLASWP) + $(CC) -c $(PFLAGS) -UDOUBLE -UMINUS $< -o ../$(@F) + +../claswp_minus.$(PSUFFIX) : $(ZLASWP) + $(CC) -c $(PFLAGS) -UDOUBLE -DMINUS $< -o ../$(@F) + +../zlaswp_plus.$(PSUFFIX) : $(ZLASWP) + $(CC) -c $(PFLAGS) -DDOUBLE -UMINUS $< -o ../$(@F) + +../zlaswp_minus.$(PSUFFIX) : $(ZLASWP) + $(CC) -c $(PFLAGS) -DDOUBLE -DMINUS $< -o ../$(@F) + +../xlaswp_plus.$(PSUFFIX) : $(ZLASWP) + $(CC) -c $(PFLAGS) -DXDOUBLE -UMINUS $< -o ../$(@F) + +../xlaswp_minus.$(PSUFFIX) : $(ZLASWP) + $(CC) -c $(PFLAGS) -DXDOUBLE -DMINUS $< -o ../$(@F) + diff --git a/lapack/laswp/generic/laswp_k.c b/lapack/laswp/generic/laswp_k.c new file mode 100644 index 0000000000..b4ee0195ea --- /dev/null +++ b/lapack/laswp/generic/laswp_k.c @@ -0,0 +1,49 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +#if GEMM_UNROLL_N >= 8 +#include "laswp_k_8.c" +#elif GEMM_UNROLL_N >= 4 +#include "laswp_k_4.c" +#elif GEMM_UNROLL_N >= 2 +#include "laswp_k_2.c" +#else +#include "laswp_k_1.c" +#endif diff --git a/lapack/laswp/generic/laswp_k_1.c b/lapack/laswp/generic/laswp_k_1.c new file mode 100644 index 0000000000..c190176314 --- /dev/null +++ b/lapack/laswp/generic/laswp_k_1.c @@ -0,0 +1,195 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
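/*
 * These laswp kernels implement the row interchanges used by getrf/getrs:
 * for each column, rows k1..k2 are swapped with the rows named in ipiv.
 * The _plus objects apply the interchanges in forward order (as before a
 * no-transpose solve) and the _minus objects in reverse order (as after a
 * transposed solve), matching how LAPACK's ?laswp is driven with
 * incx = +1 / -1.  The _1/_2/_4/_8 source files differ only in how many
 * columns they move per pass over ipiv; laswp_k.c above simply picks the
 * widest variant allowed by GEMM_UNROLL_N.  A plain one-column reference
 * (hypothetical helper, one-based pivots, double precision):
 */
static void laswp_one_column(double *col, const int *ipiv,
                             int k1, int k2, int forward) {
  int i;

  if (forward) {                        /* "plus": pivots k1 .. k2  */
    for (i = k1; i <= k2; i++) {
      int ip = ipiv[i - 1] - 1;
      double t = col[i - 1]; col[i - 1] = col[ip]; col[ip] = t;
    }
  } else {                              /* "minus": pivots k2 .. k1 */
    for (i = k2; i >= k1; i--) {
      int ip = ipiv[i - 1] - 1;
      double t = col[i - 1]; col[i - 1] = col[ip]; col[ip] = t;
    }
  }
}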
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef MINUS +#define a2 (a1 + 1) +#else +#define a2 (a1 - 1) +#endif + +int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda, + FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ + + BLASLONG i, j, ip1, ip2; + blasint *piv; + FLOAT *a1; + FLOAT *b1, *b2; + FLOAT A1, A2, B1, B2; + + a--; + k1 --; + +#ifndef MINUS + ipiv += k1 +; +#else + ipiv -= (k2 - 1) * incx; +#endif + + if (n <= 0) return 0; + + j = n; + if (j > 0) { + do { + piv = ipiv; + +#ifndef MINUS + a1 = a + k1 + 1; +#else + a1 = a + k2; +#endif + + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; + + b1 = a + ip1; + b2 = a + ip2; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { +#ifdef OPTERON +#ifndef MINUS + asm volatile("prefetchw 2 * 128(%0)\n" : : "r"(a1)); + asm volatile("prefetchw 2 * 128(%0)\n" : : "r"(b1)); +#else + asm volatile("prefetchw -2 * 128(%0)\n" : : "r"(a1)); + asm volatile("prefetchw -2 * 128(%0)\n" : : "r"(b1)); +#endif +#endif + +#ifdef CORE2 +#ifndef MINUS + asm volatile("prefetcht1 2 * 128(%0)\n" : : "r"(a1)); + asm volatile("prefetcht1 2 * 128(%0)\n" : : "r"(b1)); + asm volatile("prefetcht1 2 * 128(%0)\n" : : "r"(b2)); +#else + asm volatile("prefetcht1 -2 * 128(%0)\n" : : "r"(a1)); + asm volatile("prefetcht1 -2 * 128(%0)\n" : : "r"(b1)); + asm volatile("prefetcht1 -2 * 128(%0)\n" : : "r"(b2)); +#endif +#endif + A1 = *a1; + A2 = *a2; + B1 = *b1; + B2 = *b2; + + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; + + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + } + } + + b1 = a + ip1; + b2 = a + ip2; + +#ifndef MINUS + a1 += 2; +#else + a1 -= 2; +#endif + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *a1; + B1 = *b1; + *a1 = B1; + *b1 = A1; + } + + a += lda; + + j --; + } while (j > 0); + } + + return 0; +} + diff --git a/lapack/laswp/generic/laswp_k_2.c b/lapack/laswp/generic/laswp_k_2.c new file mode 100644 index 0000000000..1105aee82d --- /dev/null +++ b/lapack/laswp/generic/laswp_k_2.c @@ -0,0 +1,324 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef MINUS +#define a2 (a1 + 1) +#define a4 (a3 + 1) +#else +#define a2 (a1 - 1) +#define a4 (a3 - 1) +#endif + +int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda, + FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ + + BLASLONG i, j, ip1, ip2; + blasint *piv; + FLOAT *a1, *a3; + FLOAT *b1, *b2, *b3, *b4; + FLOAT A1, A2, B1, B2, A3, A4, B3, B4; + + a--; + k1 --; + +#ifndef MINUS + ipiv += k1 +; +#else + ipiv -= (k2 - 1) * incx; +#endif + + if (n <= 0) return 0; + + j = (n >> 1); + if (j > 0) { + do { + piv = ipiv; + +#ifndef MINUS + a1 = a + k1 + 1; +#else + a1 = a + k2; +#endif + + a3 = a1 + 1 * lda; + + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { +#ifdef CORE2 +#ifndef MINUS + asm volatile("prefetcht0 1 * 64(%0)\n" : : "r"(b1)); + asm volatile("prefetcht0 1 * 64(%0)\n" : : "r"(b3)); + asm volatile("prefetcht0 1 * 64(%0)\n" : : "r"(a1)); + asm volatile("prefetcht0 1 * 64(%0)\n" : : "r"(a3)); +#else + asm volatile("prefetcht0 -1 * 64(%0)\n" : : "r"(b1)); + asm volatile("prefetcht0 -1 * 64(%0)\n" : : "r"(b3)); + asm volatile("prefetcht0 -1 * 64(%0)\n" : : "r"(a1)); + asm volatile("prefetcht0 -1 * 64(%0)\n" : : "r"(a3)); +#endif +#endif + B1 = *b1; + B2 = *b2; + B3 = *b3; + B4 = *b4; + + A1 = *a1; + A2 = *a2; + A3 = *a3; + A4 = *a4; + + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; + + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + *a4 = B4; + *b4 = A4; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + *a3 = A4; + *a4 = B4; + *b4 = A3; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + *a3 = A4; + *a4 = B3; + *b3 = A3; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + } else + if (b2 == b1) { + *a1 = B1; + 
*a2 = A1; + *b1 = A2; + *a3 = B3; + *a4 = A3; + *b3 = A4; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + *a3 = B3; + *a4 = B4; + *b3 = A3; + *b4 = A4; + } + } + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + +#ifndef MINUS + a1 += 2; + a3 += 2; +#else + a1 -= 2; + a3 -= 2; +#endif + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *a1; + B1 = *b1; + A3 = *a3; + B3 = *b3; + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + } + + a += 2 * lda; + j --; + } while (j > 0); + } + + if (n & 1) { + piv = ipiv; + +#ifndef MINUS + a1 = a + k1 + 1; +#else + a1 = a + k2; +#endif + + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; + + b1 = a + ip1; + b2 = a + ip2; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *a1; + A2 = *a2; + B1 = *b1; + B2 = *b2; + + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; + + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + } + } + + b1 = a + ip1; + b2 = a + ip2; + +#ifndef MINUS + a1 += 2; +#else + a1 -= 2; +#endif + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *a1; + B1 = *b1; + *a1 = B1; + *b1 = A1; + } + } + + return 0; +} + diff --git a/lapack/laswp/generic/laswp_k_4.c b/lapack/laswp/generic/laswp_k_4.c new file mode 100644 index 0000000000..e08d49667b --- /dev/null +++ b/lapack/laswp/generic/laswp_k_4.c @@ -0,0 +1,529 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
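/*
 * The long if/else chains in laswp_k_1.c and laswp_k_2.c above are an
 * aliasing-aware version of the plain "two interchanges per iteration"
 * step sketched below: two pivot rows are processed at a time, and when
 * the destinations b1/b2 coincide with the sources a1/a2 (or with each
 * other) the kernel reuses the values already loaded into registers
 * instead of going back to memory.  The net effect is the same as
 * performing the two swaps one after the other (hypothetical helper):
 */
static void swap_pair(double *a1, double *b1, double *a2, double *b2) {
  double t;

  t = *a1; *a1 = *b1; *b1 = t;    /* interchange for the first pivot  */
  t = *a2; *a2 = *b2; *b2 = t;    /* interchange for the second pivot */
}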
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef MINUS +#define a2 (a1 + 1) +#define a4 (a3 + 1) +#define a6 (a5 + 1) +#define a8 (a7 + 1) +#else +#define a2 (a1 - 1) +#define a4 (a3 - 1) +#define a6 (a5 - 1) +#define a8 (a7 - 1) +#endif + +int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda, + FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ + + BLASLONG i, j, ip1, ip2; + blasint *piv; + FLOAT *a1, *a3, *a5, *a7; + FLOAT *b1, *b2, *b3, *b4; + FLOAT *b5, *b6, *b7, *b8; + FLOAT A1, A2, B1, B2, A3, A4, B3, B4; + FLOAT A5, A6, B5, B6, A7, A8, B7, B8; + + a--; + k1 --; + +#ifndef MINUS + ipiv += k1 +; +#else + ipiv -= (k2 - 1) * incx; +#endif + + if (n <= 0) return 0; + + j = (n >> 2); + if (j > 0) { + do { + piv = ipiv; + +#ifndef MINUS + a1 = a + k1 + 1; +#else + a1 = a + k2; +#endif + + a3 = a1 + 1 * lda; + a5 = a1 + 2 * lda; + a7 = a1 + 3 * lda; + + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *a1; + A2 = *a2; + A3 = *a3; + A4 = *a4; + A5 = *a5; + A6 = *a6; + A7 = *a7; + A8 = *a8; + + B1 = *b1; + B2 = *b2; + B3 = *b3; + B4 = *b4; + B5 = *b5; + B6 = *b6; + B7 = *b7; + B8 = *b8; + + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; + + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + *a5 = A6; + *a6 = A5; + *a7 = A8; + *a8 = A7; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + *a4 = B4; + *b4 = A4; + *a6 = B6; + *b6 = A6; + *a8 = B8; + *b8 = A8; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + *a5 = A6; + *a6 = A5; + *a7 = A8; + *a8 = A7; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + *a3 = A4; + *a4 = B4; + *b4 = A3; + *a5 = A6; + *a6 = B6; + *b6 = A5; + *a7 = A8; + *a8 = B8; + *b8 = A7; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + *a3 = A4; + *a4 = B3; + *b3 = A3; + *a5 = A6; + *a6 = B5; + *b5 = A5; + *a7 = A8; + *a8 = B7; + *b7 = A7; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + *a5 = B5; + *b5 = A5; + *a7 = B7; + *b7 = A7; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + *a3 = B3; + *a4 = A3; + *b3 = A4; + *a5 = B5; + *a6 = A5; + *b5 = A6; + *a7 = B7; + *a8 = A7; + *b7 = A8; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + *a3 = B3; + *a4 = B4; + *b3 = A3; + *b4 = A4; + *a5 = B5; + *a6 = B6; + *b5 = A5; + *b6 = A6; + *a7 = B7; + *a8 = B8; + *b7 = A7; + *b8 = A8; + } + } + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; + +#ifndef MINUS + a1 += 2; + a3 += 2; + a5 += 2; + a7 += 2; +#else + a1 -= 2; + a3 -= 2; + a5 -= 2; + a7 -= 2; +#endif + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *a1; + B1 = *b1; + A3 = *a3; + B3 = *b3; + A5 = *a5; + B5 = *b5; + A7 = *a7; + B7 = *b7; + + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + *a5 = B5; + *b5 = A5; + *a7 
= B7; + *b7 = A7; + } + + a += 4 * lda; + + j --; + } while (j > 0); + } + + if (n & 2) { + piv = ipiv; + +#ifndef MINUS + a1 = a + k1 + 1; +#else + a1 = a + k2; +#endif + + a3 = a1 + 1 * lda; + + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *a1; + A2 = *a2; + A3 = *a3; + A4 = *a4; + + B1 = *b1; + B2 = *b2; + B3 = *b3; + B4 = *b4; + + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; + + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + *a4 = B4; + *b4 = A4; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + *a3 = A4; + *a4 = B4; + *b4 = A3; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + *a3 = A4; + *a4 = B3; + *b3 = A3; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + *a3 = B3; + *a4 = A3; + *b3 = A4; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + *a3 = B3; + *a4 = B4; + *b3 = A3; + *b4 = A4; + } + } + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + +#ifndef MINUS + a1 += 2; + a3 += 2; +#else + a1 -= 2; + a3 -= 2; +#endif + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *a1; + B1 = *b1; + A3 = *a3; + B3 = *b3; + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + } + + a += 2 * lda; + } + + if (n & 1) { + piv = ipiv; + +#ifndef MINUS + a1 = a + k1 + 1; +#else + a1 = a + k2; +#endif + + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; + + b1 = a + ip1; + b2 = a + ip2; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *a1; + A2 = *a2; + B1 = *b1; + B2 = *b2; + + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; + + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + } + } + + b1 = a + ip1; + b2 = a + ip2; + +#ifndef MINUS + a1 += 2; +#else + a1 -= 2; +#endif + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *a1; + B1 = *b1; + *a1 = B1; + *b1 = A1; + } + } + + return 0; +} + diff --git a/lapack/laswp/generic/laswp_k_8.c b/lapack/laswp/generic/laswp_k_8.c new file mode 100644 index 0000000000..a4d4bce991 --- /dev/null +++ b/lapack/laswp/generic/laswp_k_8.c @@ -0,0 +1,909 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef MINUS +#define a2 (a1 + 1) +#define a4 (a3 + 1) +#define a6 (a5 + 1) +#define a8 (a7 + 1) +#define a10 (a9 + 1) +#define a12 (a11 + 1) +#define a14 (a13 + 1) +#define a16 (a15 + 1) +#else +#define a2 (a1 - 1) +#define a4 (a3 - 1) +#define a6 (a5 - 1) +#define a8 (a7 - 1) +#define a10 (a9 - 1) +#define a12 (a11 - 1) +#define a14 (a13 - 1) +#define a16 (a15 - 1) +#endif + +int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG lda, + FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ + + BLASLONG i, j, ip1, ip2; + blasint *piv; + FLOAT *a1, *a3, *a5, *a7; + FLOAT *a9, *a11, *a13, *a15; + FLOAT *b1, *b2, *b3, *b4; + FLOAT *b5, *b6, *b7, *b8; + FLOAT *b9, *b10, *b11, *b12; + FLOAT *b13, *b14, *b15, *b16; + FLOAT A1, A2, B1, B2, A3, A4, B3, B4; + FLOAT A5, A6, B5, B6, A7, A8, B7, B8; + FLOAT A9, A10, B9, B10, A11, A12, B11, B12; + FLOAT A13, A14, B13, B14, A15, A16, B15, B16; + + a--; + k1 --; + +#ifndef MINUS + ipiv += k1; +#else + ipiv -= (k2 - 1) * incx; +#endif + + if (n <= 0) return 0; + + j = (n >> 3); + if (j > 0) { + do { + piv = ipiv; + +#ifndef MINUS + a1 = a + k1 + 1; +#else + a1 = a + k2; +#endif + + a3 = a1 + 1 * lda; + a5 = a1 + 2 * lda; + a7 = a1 + 3 * lda; + a9 = a1 + 4 * lda; + a11 = a1 + 5 * lda; + a13 = a1 + 6 * lda; + a15 = a1 + 7 * lda; + + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; + + b9 = b1 + 4 * lda; + b10 = b2 + 4 * lda; + b11 = b1 + 5 * lda; + b12 = b2 + 5 * lda; + b13 = b1 + 6 * lda; + b14 = b2 + 6 * lda; + b15 = b1 + 7 * lda; + b16 = b2 + 7 * lda; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + B1 = *b1; + B2 = *b2; + B3 = *b3; + B4 = *b4; + B5 = *b5; + B6 = *b6; + B7 = *b7; + B8 = *b8; + + B9 = *b9; + B10 = *b10; + B11 = *b11; + B12 = *b12; + B13 = *b13; + B14 = *b14; + B15 = *b15; + B16 = *b16; + + A1 = *a1; + A2 = *a2; + A3 = *a3; + A4 = *a4; + A5 = *a5; + A6 = *a6; + A7 = *a7; + A8 = *a8; + + A9 = 
*a9; + A10 = *a10; + A11 = *a11; + A12 = *a12; + A13 = *a13; + A14 = *a14; + A15 = *a15; + A16 = *a16; + + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; + + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + *a5 = A6; + *a6 = A5; + *a7 = A8; + *a8 = A7; + *a9 = A10; + *a10 = A9; + *a11 = A12; + *a12 = A11; + *a13 = A14; + *a14 = A13; + *a15 = A16; + *a16 = A15; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + *a4 = B4; + *b4 = A4; + *a6 = B6; + *b6 = A6; + *a8 = B8; + *b8 = A8; + + *a10 = B10; + *b10 = A10; + *a12 = B12; + *b12 = A12; + *a14 = B14; + *b14 = A14; + *a16 = B16; + *b16 = A16; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + *a5 = A6; + *a6 = A5; + *a7 = A8; + *a8 = A7; + + *a9 = A10; + *a10 = A9; + *a11 = A12; + *a12 = A11; + *a13 = A14; + *a14 = A13; + *a15 = A16; + *a16 = A15; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + *a3 = A4; + *a4 = B4; + *b4 = A3; + *a5 = A6; + *a6 = B6; + *b6 = A5; + *a7 = A8; + *a8 = B8; + *b8 = A7; + + *a9 = A10; + *a10 = B10; + *b10 = A9; + *a11 = A12; + *a12 = B12; + *b12 = A11; + *a13 = A14; + *a14 = B14; + *b14 = A13; + *a15 = A16; + *a16 = B16; + *b16 = A15; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + *a3 = A4; + *a4 = B3; + *b3 = A3; + *a5 = A6; + *a6 = B5; + *b5 = A5; + *a7 = A8; + *a8 = B7; + *b7 = A7; + + *a9 = A10; + *a10 = B9; + *b9 = A9; + *a11 = A12; + *a12 = B11; + *b11 = A11; + *a13 = A14; + *a14 = B13; + *b13 = A13; + *a15 = A16; + *a16 = B15; + *b15 = A15; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + *a5 = B5; + *b5 = A5; + *a7 = B7; + *b7 = A7; + + *a9 = B9; + *b9 = A9; + *a11 = B11; + *b11 = A11; + *a13 = B13; + *b13 = A13; + *a15 = B15; + *b15 = A15; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + *a3 = B3; + *a4 = A3; + *b3 = A4; + *a5 = B5; + *a6 = A5; + *b5 = A6; + *a7 = B7; + *a8 = A7; + *b7 = A8; + + *a9 = B9; + *a10 = A9; + *b9 = A10; + *a11 = B11; + *a12 = A11; + *b11 = A12; + *a13 = B13; + *a14 = A13; + *b13 = A14; + *a15 = B15; + *a16 = A15; + *b15 = A16; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + *a3 = B3; + *a4 = B4; + *b3 = A3; + *b4 = A4; + *a5 = B5; + *a6 = B6; + *b5 = A5; + *b6 = A6; + *a7 = B7; + *a8 = B8; + *b7 = A7; + *b8 = A8; + + *a9 = B9; + *a10 = B10; + *b9 = A9; + *b10 = A10; + *a11 = B11; + *a12 = B12; + *b11 = A11; + *b12 = A12; + *a13 = B13; + *a14 = B14; + *b13 = A13; + *b14 = A14; + *a15 = B15; + *a16 = B16; + *b15 = A15; + *b16 = A16; + } + } + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; + + b9 = b1 + 4 * lda; + b10 = b2 + 4 * lda; + b11 = b1 + 5 * lda; + b12 = b2 + 5 * lda; + b13 = b1 + 6 * lda; + b14 = b2 + 6 * lda; + b15 = b1 + 7 * lda; + b16 = b2 + 7 * lda; + +#ifndef MINUS + a1 += 2; + a3 += 2; + a5 += 2; + a7 += 2; + a9 += 2; + a11 += 2; + a13 += 2; + a15 += 2; +#else + a1 -= 2; + a3 -= 2; + a5 -= 2; + a7 -= 2; + a9 -= 2; + a11 -= 2; + a13 -= 2; + a15 -= 2; +#endif + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *a1; + B1 = *b1; + A3 = *a3; + B3 = *b3; + A5 = *a5; + B5 = *b5; + A7 = *a7; + B7 = *b7; + + A9 = *a9; + B9 = *b9; + A11 = *a11; + B11 = *b11; + A13 = *a13; + B13 = *b13; + A15 = *a15; + B15 = *b15; + + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + *a5 = B5; + *b5 = A5; + *a7 = B7; + *b7 = A7; + + *a9 = B9; + *b9 = A9; + *a11 
= B11; + *b11 = A11; + *a13 = B13; + *b13 = A13; + *a15 = B15; + *b15 = A15; + } + + a += 8 * lda; + + j --; + } while (j > 0); + } + + if (n & 4) { + piv = ipiv; + +#ifndef MINUS + a1 = a + k1 + 1; +#else + a1 = a + k2; +#endif + + a3 = a1 + 1 * lda; + a5 = a1 + 2 * lda; + a7 = a1 + 3 * lda; + + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *a1; + A2 = *a2; + A3 = *a3; + A4 = *a4; + A5 = *a5; + A6 = *a6; + A7 = *a7; + A8 = *a8; + + B1 = *b1; + B2 = *b2; + B3 = *b3; + B4 = *b4; + B5 = *b5; + B6 = *b6; + B7 = *b7; + B8 = *b8; + + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; + + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + *a5 = A6; + *a6 = A5; + *a7 = A8; + *a8 = A7; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + *a4 = B4; + *b4 = A4; + *a6 = B6; + *b6 = A6; + *a8 = B8; + *b8 = A8; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + *a5 = A6; + *a6 = A5; + *a7 = A8; + *a8 = A7; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + *a3 = A4; + *a4 = B4; + *b4 = A3; + *a5 = A6; + *a6 = B6; + *b6 = A5; + *a7 = A8; + *a8 = B8; + *b8 = A7; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + *a3 = A4; + *a4 = B3; + *b3 = A3; + *a5 = A6; + *a6 = B5; + *b5 = A5; + *a7 = A8; + *a8 = B7; + *b7 = A7; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + *a5 = B5; + *b5 = A5; + *a7 = B7; + *b7 = A7; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + *a3 = B3; + *a4 = A3; + *b3 = A4; + *a5 = B5; + *a6 = A5; + *b5 = A6; + *a7 = B7; + *a8 = A7; + *b7 = A8; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + *a3 = B3; + *a4 = B4; + *b3 = A3; + *b4 = A4; + *a5 = B5; + *a6 = B6; + *b5 = A5; + *b6 = A6; + *a7 = B7; + *a8 = B8; + *b7 = A7; + *b8 = A8; + } + } + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; + +#ifndef MINUS + a1 += 2; + a3 += 2; + a5 += 2; + a7 += 2; +#else + a1 -= 2; + a3 -= 2; + a5 -= 2; + a7 -= 2; +#endif + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *a1; + B1 = *b1; + A3 = *a3; + B3 = *b3; + A5 = *a5; + B5 = *b5; + A7 = *a7; + B7 = *b7; + + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + *a5 = B5; + *b5 = A5; + *a7 = B7; + *b7 = A7; + } + + a += 4 * lda; + } + + if (n & 2) { + piv = ipiv; + +#ifndef MINUS + a1 = a + k1 + 1; +#else + a1 = a + k2; +#endif + + a3 = a1 + 1 * lda; + + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *a1; + A2 = *a2; + A3 = *a3; + A4 = *a4; + + B1 = *b1; + B2 = *b2; + B3 = *b3; + B4 = *b4; + + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; + + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + *a4 = B4; + *b4 = A4; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + *a3 = A4; + *a4 = A3; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + *a3 = A4; + *a4 = B4; + *b4 = A3; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; 
+ *a3 = A4; + *a4 = B3; + *b3 = A3; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + *a3 = B3; + *a4 = A3; + *b3 = A4; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + *a3 = B3; + *a4 = B4; + *b3 = A3; + *b4 = A4; + } + } + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + +#ifndef MINUS + a1 += 2; + a3 += 2; +#else + a1 -= 2; + a3 -= 2; +#endif + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *a1; + B1 = *b1; + A3 = *a3; + B3 = *b3; + *a1 = B1; + *b1 = A1; + *a3 = B3; + *b3 = A3; + } + + a += 2 * lda; + } + + if (n & 1) { + piv = ipiv; + +#ifndef MINUS + a1 = a + k1 + 1; +#else + a1 = a + k2; +#endif + + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; + + b1 = a + ip1; + b2 = a + ip2; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *a1; + A2 = *a2; + B1 = *b1; + B2 = *b2; + + ip1 = *piv; + piv += incx; + ip2 = *piv; + piv += incx; + + if (b1 == a1) { + if (b2 == a1) { + *a1 = A2; + *a2 = A1; + } else + if (b2 != a2) { + *a2 = B2; + *b2 = A2; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *a1 = A2; + *a2 = A1; + } else { + *a1 = A2; + *a2 = B2; + *b2 = A1; + } + } + } else { + if (b2 == a1) { + *a1 = A2; + *a2 = B1; + *b1 = A1; + } else + if (b2 == a2) { + *a1 = B1; + *b1 = A1; + } else + if (b2 == b1) { + *a1 = B1; + *a2 = A1; + *b1 = A2; + } else { + *a1 = B1; + *a2 = B2; + *b1 = A1; + *b2 = A2; + } + } + + b1 = a + ip1; + b2 = a + ip2; + +#ifndef MINUS + a1 += 2; +#else + a1 -= 2; +#endif + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *a1; + B1 = *b1; + *a1 = B1; + *b1 = A1; + } + } + + return 0; +} + diff --git a/lapack/laswp/generic/zlaswp_k.c b/lapack/laswp/generic/zlaswp_k.c new file mode 100644 index 0000000000..c7938375d5 --- /dev/null +++ b/lapack/laswp/generic/zlaswp_k.c @@ -0,0 +1,47 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
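/*
 * laswp_k_8.c above handles eight columns per pass over ipiv and then
 * finishes the leftover columns with progressively narrower passes
 * (n & 4, n & 2, n & 1); the next pair of pivot indices is fetched in the
 * middle of each iteration, presumably so that the dependent ipiv load
 * can overlap the swap stores.  The outer column blocking, in compact
 * form (hypothetical callback swap_block(first_col, cols)):
 */
static void laswp_blocked(int n, void (*swap_block)(int first_col, int cols)) {
  int j = 0;

  while (n - j >= 8) { swap_block(j, 8); j += 8; }  /* full-width passes */
  if (n - j >= 4)    { swap_block(j, 4); j += 4; }  /* remainder tails   */
  if (n - j >= 2)    { swap_block(j, 2); j += 2; }
  if (n - j >= 1)    { swap_block(j, 1); }
}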
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" + +#if GEMM_UNROLL_N >= 4 +#include "zlaswp_k_4.c" +#elif GEMM_UNROLL_N >= 2 +#include "zlaswp_k_2.c" +#else +#include "zlaswp_k_1.c" +#endif diff --git a/lapack/laswp/generic/zlaswp_k_1.c b/lapack/laswp/generic/zlaswp_k_1.c new file mode 100644 index 0000000000..3dd653baf4 --- /dev/null +++ b/lapack/laswp/generic/zlaswp_k_1.c @@ -0,0 +1,225 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
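/*
 * The complex kernels selected by zlaswp_k.c above store each element as
 * an interleaved (real, imaginary) pair, which is why they double lda and
 * the pivot offsets (ip = *piv * 2) and move two FLOATs per interchange.
 * One complex element swap, schematically (hypothetical helper, double
 * precision):
 */
static void zswap_element(double *x, double *y) {
  double re = x[0], im = x[1];

  x[0] = y[0]; x[1] = y[1];       /* copy the (re, im) pair of y into x */
  y[0] = re;   y[1] = im;         /* and the saved pair of x into y     */
}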
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef MINUS +#define a2 (a1 + 2) +#else +#define a2 (a1 - 2) +#endif + +int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, + FLOAT *a, BLASLONG lda, + FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ + + BLASLONG i, j, ip1, ip2; + blasint *piv; + FLOAT *a1; + FLOAT *b1, *b2; + FLOAT A1, A2, B1, B2, A3, A4, B3, B4; + + a -= 2; + lda *= 2; + k1 --; + +#ifndef MINUS + ipiv += k1; +#else + ipiv -= (k2 - 1) * incx; +#endif + + if (n <= 0) return 0; + + j = n; + if (j > 0) { + do { + piv = ipiv; + +#ifndef MINUS + a1 = a + (k1 + 1) * 2; +#else + a1 = a + k2 * 2; +#endif + + ip1 = *piv * 2; + piv += incx; + ip2 = *piv * 2; + piv += incx; + + b1 = a + ip1; + b2 = a + ip2; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { +#ifdef OPTERON +#ifndef MINUS + asm volatile("prefetchw 2 * 128(%0)\n" : : "r"(a1)); + asm volatile("prefetchw 2 * 128(%0)\n" : : "r"(b1)); +#else + asm volatile("prefetchw -2 * 128(%0)\n" : : "r"(a1)); + asm volatile("prefetchw -2 * 128(%0)\n" : : "r"(b1)); +#endif +#endif + +#ifdef CORE2 +#ifndef MINUS + asm volatile("prefetcht1 2 * 128(%0)\n" : : "r"(a1)); + asm volatile("prefetcht1 2 * 128(%0)\n" : : "r"(b1)); + asm volatile("prefetcht1 2 * 128(%0)\n" : : "r"(b2)); +#else + asm volatile("prefetcht1 -2 * 128(%0)\n" : : "r"(a1)); + asm volatile("prefetcht1 -2 * 128(%0)\n" : : "r"(b1)); + asm volatile("prefetcht1 -2 * 128(%0)\n" : : "r"(b2)); +#endif +#endif + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + + ip1 = *piv * 2; + piv += incx; + ip2 = *piv * 2; + piv += incx; + + if (b1 == a1) { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + } else + if (b2 != a2) { + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + } else { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A1; + *(b2 + 1) = A2; + } + } + } else { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B1; + *(a2 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } else + if (b2 == a2) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } else + if (b2 == b1) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(b1 + 0) = A3; + *(b1 + 1) = A4; + } else { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } + + b1 = a + ip1; + b2 = a + ip2; + +#ifndef MINUS + a1 += 4; +#else + a1 -= 4; +#endif + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } + + a += lda; + + j --; + } while (j > 0); + } + + return 0; +} + diff --git a/lapack/laswp/generic/zlaswp_k_2.c b/lapack/laswp/generic/zlaswp_k_2.c new file mode 100644 index 0000000000..a877ef66bd --- /dev/null +++ b/lapack/laswp/generic/zlaswp_k_2.c @@ -0,0 +1,406 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
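/*
 * zlaswp_k_1.c above issues raw x86 prefetch hints (prefetchw on Opteron,
 * prefetcht1 on Core 2) through inline asm, looking a couple of cache
 * lines ahead of (or behind, in the MINUS variant) the pointers being
 * swapped.  On GCC-compatible compilers the same intent can be expressed
 * portably with __builtin_prefetch; this is only an illustration of the
 * idea, not what the library itself uses:
 */
static inline void prefetch_ahead(const void *p, int for_write) {
  if (for_write)
    __builtin_prefetch(p, 1, 2);  /* write intent, moderate locality */
  else
    __builtin_prefetch(p, 0, 2);  /* read intent, moderate locality  */
}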
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef MINUS +#define a2 (a1 + 2) +#else +#define a2 (a1 - 2) +#endif + +int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, + FLOAT *a, BLASLONG lda, + FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ + + BLASLONG i, j, ip1, ip2; + blasint *piv; + FLOAT *a1; + FLOAT *b1, *b2; + FLOAT A1, A2, B1, B2, A3, A4, B3, B4; + FLOAT A5, A6, B5, B6, A7, A8, B7, B8; + + a -= 2; + lda *= 2; + k1 --; + +#ifndef MINUS + ipiv += k1; +#else + ipiv -= (k2 - 1) * incx; +#endif + + if (n <= 0) return 0; + + + j = (n >> 1); + if (j > 0) { + do { + piv = ipiv; + +#ifndef MINUS + a1 = a + (k1 + 1) * 2; +#else + a1 = a + k2 * 2; +#endif + + ip1 = *piv * 2; + piv += incx; + ip2 = *piv * 2; + piv += incx; + + b1 = a + ip1; + b2 = a + ip2; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { +#ifdef CORE2 +#ifndef MINUS + asm volatile("prefetcht0 1 * 64(%0)\n" : : "r"(b1)); + asm volatile("prefetcht0 1 * 64(%0)\n" : : "r"(b1 + lda)); + asm volatile("prefetcht0 1 * 64(%0)\n" : : "r"(a1)); + asm volatile("prefetcht0 1 * 64(%0)\n" : : "r"(a1 + lda)); +#else + asm volatile("prefetcht0 -1 * 64(%0)\n" : : "r"(b1)); + asm volatile("prefetcht0 -1 * 64(%0)\n" : : "r"(b1 + lda)); + asm volatile("prefetcht0 -1 * 64(%0)\n" : : "r"(a1)); + asm volatile("prefetcht0 -1 * 64(%0)\n" : : "r"(a1 + lda)); +#endif +#endif + + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + + A5 = *(a1 + 0 + lda); + A6 = *(a1 + 1 + lda); + A7 = *(a2 + 0 + lda); + A8 = *(a2 + 1 + lda); + + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + + B5 = *(b1 + 0 + lda); + B6 = *(b1 + 1 + lda); + B7 = *(b2 + 0 + lda); + B8 = *(b2 + 1 + lda); + + ip1 = *piv * 2; + piv += incx; + ip2 = *piv * 2; + piv += 
incx; + + if (b1 == a1) { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(a1 + 0 + lda) = A7; + *(a1 + 1 + lda) = A8; + *(a2 + 0 + lda) = A5; + *(a2 + 1 + lda) = A6; + } else + if (b2 != a2) { + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(a2 + 0 + lda) = B7; + *(a2 + 1 + lda) = B8; + *(b2 + 0 + lda) = A7; + *(b2 + 1 + lda) = A8; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(a1 + 0 + lda) = A7; + *(a1 + 1 + lda) = A8; + *(a2 + 0 + lda) = A5; + *(a2 + 1 + lda) = A6; + } else { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A1; + *(b2 + 1) = A2; + *(a1 + 0 + lda) = A7; + *(a1 + 1 + lda) = A8; + *(a2 + 0 + lda) = B7; + *(a2 + 1 + lda) = B8; + *(b2 + 0 + lda) = A5; + *(b2 + 1 + lda) = A6; + } + } + } else { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B1; + *(a2 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(a1 + 0 + lda) = A7; + *(a1 + 1 + lda) = A8; + *(a2 + 0 + lda) = B5; + *(a2 + 1 + lda) = B6; + *(b1 + 0 + lda) = A5; + *(b1 + 1 + lda) = A6; + } else + if (b2 == a2) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(a1 + 0 + lda) = B5; + *(a1 + 1 + lda) = B6; + *(b1 + 0 + lda) = A5; + *(b1 + 1 + lda) = A6; + } else + if (b2 == b1) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(b1 + 0) = A3; + *(b1 + 1) = A4; + *(a1 + 0 + lda) = B5; + *(a1 + 1 + lda) = B6; + *(a2 + 0 + lda) = A5; + *(a2 + 1 + lda) = A6; + *(b1 + 0 + lda) = A7; + *(b1 + 1 + lda) = A8; + } else { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(a1 + 0 + lda) = B5; + *(a1 + 1 + lda) = B6; + *(a2 + 0 + lda) = B7; + *(a2 + 1 + lda) = B8; + *(b1 + 0 + lda) = A5; + *(b1 + 1 + lda) = A6; + *(b2 + 0 + lda) = A7; + *(b2 + 1 + lda) = A8; + } + } + + b1 = a + ip1; + b2 = a + ip2; + +#ifndef MINUS + a1 += 4; +#else + a1 -= 4; +#endif + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a1 + 0 + lda); + A4 = *(a1 + 1 + lda); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b1 + 0 + lda); + B4 = *(b1 + 1 + lda); + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a1 + 0 + lda) = B3; + *(a1 + 1 + lda) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b1 + 0 + lda) = A3; + *(b1 + 1 + lda) = A4; + } + + a += 2 * lda; + + j --; + } while (j > 0); + } + + if (n & 1) { + piv = ipiv; + +#ifndef MINUS + a1 = a + (k1 + 1) * 2; +#else + a1 = a + k2 * 2; +#endif + + ip1 = *piv * 2; + piv += incx; + ip2 = *piv * 2; + piv += incx; + + b1 = a + ip1; + b2 = a + ip2; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + + ip1 = *piv * 2; + piv += incx; + ip2 = *piv * 2; + piv += incx; + + if (b1 == a1) { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + } else + if (b2 != a2) { + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + } else { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A1; + *(b2 + 1) = A2; + } + } + } else { + if (b2 == a1) { + 
*(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B1; + *(a2 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } else + if (b2 == a2) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } else + if (b2 == b1) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(b1 + 0) = A3; + *(b1 + 1) = A4; + } else { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } + + b1 = a + ip1; + b2 = a + ip2; + +#ifndef MINUS + a1 += 4; +#else + a1 -= 4; +#endif + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } + } + + return 0; +} + diff --git a/lapack/laswp/generic/zlaswp_k_4.c b/lapack/laswp/generic/zlaswp_k_4.c new file mode 100644 index 0000000000..4dc5598953 --- /dev/null +++ b/lapack/laswp/generic/zlaswp_k_4.c @@ -0,0 +1,742 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
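zlaswp_k_2.c (above) and zlaswp_k_4.c (below) sweep two and four columns per pass, and every pass also consumes two pivots at once, so the two target rows b1/b2 can coincide with the current rows a1/a2 or with each other. Because all operands are loaded before anything is written, the if/else ladder has to enumerate those overlap cases explicitly; the final branch is the plain double swap. The net effect of one ladder instance is simply the two swaps performed one after the other, as in this per-column sketch (swap2 and laswp_pair are hypothetical helpers, not part of the source):

    // Hypothetical helper: exchange one complex (two-FLOAT) element.
    static inline void swap2(FLOAT *x, FLOAT *y) {
      FLOAT t0 = x[0], t1 = x[1];
      x[0] = y[0]; x[1] = y[1];
      y[0] = t0;   y[1] = t1;
    }

    // Equivalent effect of one unrolled step for a single column, without the
    // load/store scheduling of the real kernel:
    static inline void laswp_pair(FLOAT *a1, FLOAT *a2, FLOAT *b1, FLOAT *b2) {
      swap2(a1, b1);              // first pivot of the pair
      swap2(a2, b2);              // second pivot, applied after the first has landed
    }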
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef MINUS +#define a2 (a1 + 2) +#define a4 (a3 + 2) +#define a6 (a5 + 2) +#define a8 (a7 + 2) +#else +#define a2 (a1 - 2) +#define a4 (a3 - 2) +#define a6 (a5 - 2) +#define a8 (a7 - 2) +#endif + +int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, + FLOAT *a, BLASLONG lda, + FLOAT *dummy2, BLASLONG dumy3, blasint *ipiv, BLASLONG incx){ + + BLASLONG i, j, ip1, ip2; + blasint *piv; + FLOAT *a1, *a3, *a5, *a7; + FLOAT *b1, *b2, *b3, *b4; + FLOAT *b5, *b6, *b7, *b8; + FLOAT A1, A2, B1, B2, A3, A4, B3, B4; + FLOAT A5, A6, B5, B6, A7, A8, B7, B8; + FLOAT A9, A10, B9, B10, A11, A12, B11, B12; + FLOAT A13, A14, B13, B14, A15, A16, B15, B16; + + a -= 2; + lda *= 2; + k1 --; + +#ifndef MINUS + ipiv += k1; +#else + ipiv -= (k2 - 1) * incx; +#endif + + if (n <= 0) return 0; + + j = (n >> 2); + if (j > 0) { + do { + piv = ipiv; + +#ifndef MINUS + a1 = a + (k1 + 1) * 2; +#else + a1 = a + k2 * 2; +#endif + + a3 = a1 + 1 * lda; + a5 = a1 + 2 * lda; + a7 = a1 + 3 * lda; + + ip1 = *piv * 2; + piv += incx; + ip2 = *piv * 2; + piv += incx; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + A5 = *(a3 + 0); + A6 = *(a3 + 1); + A7 = *(a4 + 0); + A8 = *(a4 + 1); + + A9 = *(a5 + 0); + A10 = *(a5 + 1); + A11 = *(a6 + 0); + A12 = *(a6 + 1); + A13 = *(a7 + 0); + A14 = *(a7 + 1); + A15 = *(a8 + 0); + A16 = *(a8 + 1); + + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + B5 = *(b3 + 0); + B6 = *(b3 + 1); + B7 = *(b4 + 0); + B8 = *(b4 + 1); + + B9 = *(b5 + 0); + B10 = *(b5 + 1); + B11 = *(b6 + 0); + B12 = *(b6 + 1); + B13 = *(b7 + 0); + B14 = *(b7 + 1); + B15 = *(b8 + 0); + B16 = *(b8 + 1); + + ip1 = *piv * 2; + piv += incx; + ip2 = *piv * 2; + piv += incx; + + if (b1 == a1) { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = A5; + *(a4 + 1) = A6; + *(a5 + 0) = A11; + *(a5 + 1) = A12; + *(a6 + 0) = A9; + *(a6 + 1) = A10; + *(a7 + 0) = A15; + *(a7 + 1) = A16; + *(a8 + 0) = A13; + *(a8 + 1) = A14; + } else + if (b2 != a2) { + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(a4 + 0) = B7; + *(a4 + 1) = B8; + *(b4 + 0) = A7; + *(b4 + 1) = A8; + *(a6 + 0) = B11; + *(a6 + 1) = B12; + *(b6 + 0) = A11; + *(b6 + 1) = A12; + *(a8 + 0) = B15; + *(a8 + 1) = B16; + *(b8 + 0) = A15; + *(b8 + 1) = A16; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = A5; + *(a4 + 1) = A6; + *(a5 + 0) = A11; + *(a5 + 1) = A12; + *(a6 + 0) = A9; + *(a6 + 1) = A10; + *(a7 + 0) = A15; + *(a7 + 1) = A16; + *(a8 + 0) = A13; + *(a8 + 1) = A14; + } else { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A1; + *(b2 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = B7; + *(a4 + 1) = B8; + *(b4 + 0) = A5; + *(b4 + 1) = A6; + + *(a5 + 0) = A11; + *(a5 + 1) = A12; + *(a6 + 0) = B11; + *(a6 + 1) = B12; + *(b6 + 0) = A9; + *(b6 + 1) = A10; + *(a7 + 0) = A15; + *(a7 + 1) = A16; + *(a8 + 0) = B15; + *(a8 + 1) = B16; + *(b8 + 0) = A13; + *(b8 + 1) = A14; + } + } + } else { + if (b2 
== a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B1; + *(a2 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = B5; + *(a4 + 1) = B6; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + *(a5 + 0) = A11; + *(a5 + 1) = A12; + *(a6 + 0) = B9; + *(a6 + 1) = B10; + *(b5 + 0) = A9; + *(b5 + 1) = A10; + *(a7 + 0) = A15; + *(a7 + 1) = A16; + *(a8 + 0) = B13; + *(a8 + 1) = B14; + *(b7 + 0) = A13; + *(b7 + 1) = A14; + } else + if (b2 == a2) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(a3 + 0) = B5; + *(a3 + 1) = B6; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + *(a5 + 0) = B9; + *(a5 + 1) = B10; + *(b5 + 0) = A9; + *(b5 + 1) = A10; + *(a7 + 0) = B13; + *(a7 + 1) = B14; + *(b7 + 0) = A13; + *(b7 + 1) = A14; + } else + if (b2 == b1) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(b1 + 0) = A3; + *(b1 + 1) = A4; + *(a3 + 0) = B5; + *(a3 + 1) = B6; + *(a4 + 0) = A5; + *(a4 + 1) = A6; + *(b3 + 0) = A7; + *(b3 + 1) = A8; + + *(a5 + 0) = B9; + *(a5 + 1) = B10; + *(a6 + 0) = A9; + *(a6 + 1) = A10; + *(b5 + 0) = A11; + *(b5 + 1) = A12; + *(a7 + 0) = B13; + *(a7 + 1) = B14; + *(a8 + 0) = A13; + *(a8 + 1) = A14; + *(b7 + 0) = A15; + *(b7 + 1) = A16; + } else { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(a3 + 0) = B5; + *(a3 + 1) = B6; + *(a4 + 0) = B7; + *(a4 + 1) = B8; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + *(b4 + 0) = A7; + *(b4 + 1) = A8; + *(a5 + 0) = B9; + *(a5 + 1) = B10; + *(a6 + 0) = B11; + *(a6 + 1) = B12; + *(b5 + 0) = A9; + *(b5 + 1) = A10; + *(b6 + 0) = A11; + *(b6 + 1) = A12; + *(a7 + 0) = B13; + *(a7 + 1) = B14; + *(a8 + 0) = B15; + *(a8 + 1) = B16; + *(b7 + 0) = A13; + *(b7 + 1) = A14; + *(b8 + 0) = A15; + *(b8 + 1) = A16; + } + } + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + 1 * lda; + b4 = b2 + 1 * lda; + b5 = b1 + 2 * lda; + b6 = b2 + 2 * lda; + b7 = b1 + 3 * lda; + b8 = b2 + 3 * lda; + +#ifndef MINUS + a1 += 4; + a3 += 4; + a5 += 4; + a7 += 4; +#else + a1 -= 4; + a3 -= 4; + a5 -= 4; + a7 -= 4; +#endif + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a3 + 0); + A4 = *(a3 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b3 + 0); + B4 = *(b3 + 1); + A5 = *(a5 + 0); + A6 = *(a5 + 1); + A7 = *(a7 + 0); + A8 = *(a7 + 1); + B5 = *(b5 + 0); + B6 = *(b5 + 1); + B7 = *(b7 + 0); + B8 = *(b7 + 1); + + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a3 + 0) = B3; + *(a3 + 1) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b3 + 0) = A3; + *(b3 + 1) = A4; + *(a5 + 0) = B5; + *(a5 + 1) = B6; + *(a7 + 0) = B7; + *(a7 + 1) = B8; + *(b5 + 0) = A5; + *(b5 + 1) = A6; + *(b7 + 0) = A7; + *(b7 + 1) = A8; + } + + a += 4 * lda; + + j --; + } while (j > 0); + } + + if (n & 2) { + piv = ipiv; + +#ifndef MINUS + a1 = a + (k1 + 1) * 2; +#else + a1 = a + k2 * 2; +#endif + + a3 = a1 + lda; + + ip1 = *piv * 2; + piv += incx; + ip2 = *piv * 2; + piv += incx; + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + lda; + b4 = b2 + lda; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + + A5 = *(a3 + 0); + A6 = *(a3 + 1); + A7 = *(a4 + 0); + A8 = *(a4 + 1); + + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + + B5 = *(b3 + 0); + B6 = *(b3 + 1); + B7 = *(b4 + 0); + B8 = *(b4 + 1); + + ip1 = *piv * 2; + piv += incx; + ip2 = *piv * 2; + piv += incx; + + if (b1 == a1) { + if (b2 == a1) { 
+ *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = A5; + *(a4 + 1) = A6; + } else + if (b2 != a2) { + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(a4 + 0) = B7; + *(a4 + 1) = B8; + *(b4 + 0) = A7; + *(b4 + 1) = A8; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = A5; + *(a4 + 1) = A6; + } else { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A1; + *(b2 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = B7; + *(a4 + 1) = B8; + *(b4 + 0) = A5; + *(b4 + 1) = A6; + } + } + } else { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B1; + *(a2 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(a3 + 0) = A7; + *(a3 + 1) = A8; + *(a4 + 0) = B5; + *(a4 + 1) = B6; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + } else + if (b2 == a2) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(a3 + 0) = B5; + *(a3 + 1) = B6; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + } else + if (b2 == b1) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(b1 + 0) = A3; + *(b1 + 1) = A4; + *(a3 + 0) = B5; + *(a3 + 1) = B6; + *(a4 + 0) = A5; + *(a4 + 1) = A6; + *(b3 + 0) = A7; + *(b3 + 1) = A8; + } else { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + *(a3 + 0) = B5; + *(a3 + 1) = B6; + *(a4 + 0) = B7; + *(a4 + 1) = B8; + *(b3 + 0) = A5; + *(b3 + 1) = A6; + *(b4 + 0) = A7; + *(b4 + 1) = A8; + } + } + + b1 = a + ip1; + b2 = a + ip2; + + b3 = b1 + lda; + b4 = b2 + lda; + +#ifndef MINUS + a1 += 4; + a3 += 4; +#else + a1 -= 4; + a3 -= 4; +#endif + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a3 + 0); + A4 = *(a3 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b3 + 0); + B4 = *(b3 + 1); + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a3 + 0) = B3; + *(a3 + 1) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b3 + 0) = A3; + *(b3 + 1) = A4; + } + + a += 2 * lda; + + } + + if (n & 1) { + piv = ipiv; + +#ifndef MINUS + a1 = a + (k1 + 1) * 2; +#else + a1 = a + k2 * 2; +#endif + + ip1 = *piv * 2; + piv += incx; + ip2 = *piv * 2; + piv += incx; + + b1 = a + ip1; + b2 = a + ip2; + + i = ((k2 - k1) >> 1); + + if (i > 0) { + do { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + A3 = *(a2 + 0); + A4 = *(a2 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + B3 = *(b2 + 0); + B4 = *(b2 + 1); + + ip1 = *piv * 2; + piv += incx; + ip2 = *piv * 2; + piv += incx; + + if (b1 == a1) { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + } else + if (b2 != a2) { + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } else + if (b1 == a2) { + if (b2 != a1) { + if (b2 == a2) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + } else { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b2 + 0) = A1; + *(b2 + 1) = A2; + } + } + } else { + if (b2 == a1) { + *(a1 + 0) = A3; + *(a1 + 1) = A4; + *(a2 + 0) = B1; + *(a2 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } else + if (b2 == a2) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } else + if (b2 == b1) { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = A1; + *(a2 + 1) = A2; + *(b1 + 0) 
= A3; + *(b1 + 1) = A4; + } else { + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(a2 + 0) = B3; + *(a2 + 1) = B4; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + *(b2 + 0) = A3; + *(b2 + 1) = A4; + } + } + + b1 = a + ip1; + b2 = a + ip2; + +#ifndef MINUS + a1 += 4; +#else + a1 -= 4; +#endif + i --; + } while (i > 0); + } + + i = ((k2 - k1) & 1); + + if (i > 0) { + A1 = *(a1 + 0); + A2 = *(a1 + 1); + B1 = *(b1 + 0); + B2 = *(b1 + 1); + *(a1 + 0) = B1; + *(a1 + 1) = B2; + *(b1 + 0) = A1; + *(b1 + 1) = A2; + } + } + + return 0; +} + diff --git a/lapack/laswp/ia64/Makefile b/lapack/laswp/ia64/Makefile new file mode 100644 index 0000000000..42245c6244 --- /dev/null +++ b/lapack/laswp/ia64/Makefile @@ -0,0 +1,5 @@ +TOPDIR = ../../.. +include ../../../Makefile.system + +include ../generic/Makefile + diff --git a/lapack/laswp/mips64/Makefile b/lapack/laswp/mips64/Makefile new file mode 100644 index 0000000000..af1f0199c0 --- /dev/null +++ b/lapack/laswp/mips64/Makefile @@ -0,0 +1,8 @@ +TOPDIR = ../../.. +include ../../../Makefile.system + +LASWP = ../generic/laswp_k_1.c +ZLASWP = ../generic/zlaswp_k_1.c + +include ../generic/Makefile + diff --git a/lapack/laswp/power/Makefile b/lapack/laswp/power/Makefile new file mode 100644 index 0000000000..af1f0199c0 --- /dev/null +++ b/lapack/laswp/power/Makefile @@ -0,0 +1,8 @@ +TOPDIR = ../../.. +include ../../../Makefile.system + +LASWP = ../generic/laswp_k_1.c +ZLASWP = ../generic/zlaswp_k_1.c + +include ../generic/Makefile + diff --git a/lapack/laswp/sparc/Makefile b/lapack/laswp/sparc/Makefile new file mode 100644 index 0000000000..af1f0199c0 --- /dev/null +++ b/lapack/laswp/sparc/Makefile @@ -0,0 +1,8 @@ +TOPDIR = ../../.. +include ../../../Makefile.system + +LASWP = ../generic/laswp_k_1.c +ZLASWP = ../generic/zlaswp_k_1.c + +include ../generic/Makefile + diff --git a/lapack/laswp/x86/Makefile b/lapack/laswp/x86/Makefile new file mode 100644 index 0000000000..105ec4027e --- /dev/null +++ b/lapack/laswp/x86/Makefile @@ -0,0 +1,28 @@ +TOPDIR = ../../.. +include ../../../Makefile.system + +ifeq ($(CORE), CORE2) +LASWP = ../generic/laswp_k_2.c +ZLASWP = ../generic/zlaswp_k_2.c +endif + +ifeq ($(CORE), OPTERON) +LASWP = ../generic/laswp_k_1.c +ZLASWP = ../generic/zlaswp_k_1.c +endif + +ifeq ($(CORE), PRESCOTT) +LASWP = ../generic/laswp_k_1.c +ZLASWP = ../generic/zlaswp_k_1.c +endif + +ifndef LASWP +LASWP = ../generic/laswp_k_1.c +endif + +ifndef ZLASWP +ZLASWP = ../generic/zlaswp_k_1.c +endif + +include ../generic/Makefile + diff --git a/lapack/laswp/x86_64/Makefile b/lapack/laswp/x86_64/Makefile new file mode 100644 index 0000000000..ba07dcf4f8 --- /dev/null +++ b/lapack/laswp/x86_64/Makefile @@ -0,0 +1,33 @@ +TOPDIR = ../../.. +include ../../../Makefile.system + +ifeq ($(CORE), PENRYN) +LASWP = ../generic/laswp_k_4.c +ZLASWP = ../generic/zlaswp_k_4.c +endif + +ifeq ($(CORE), CORE2) +LASWP = ../generic/laswp_k_4.c +ZLASWP = ../generic/zlaswp_k_4.c +endif + +ifeq ($(CORE), OPTERON) +LASWP = ../generic/laswp_k_1.c +ZLASWP = ../generic/zlaswp_k_1.c +endif + +ifeq ($(CORE), PRESCOTT) +LASWP = ../generic/laswp_k_1.c +ZLASWP = ../generic/zlaswp_k_1.c +endif + +ifndef LASWP +LASWP = ../generic/laswp_k_1.c +endif + +ifndef ZLASWP +ZLASWP = ../generic/zlaswp_k_1.c +endif + +include ../generic/Makefile + diff --git a/lapack/lauu2/Makefile b/lapack/lauu2/Makefile new file mode 100644 index 0000000000..dc6a640b4d --- /dev/null +++ b/lapack/lauu2/Makefile @@ -0,0 +1,83 @@ +TOPDIR = ../.. 
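Every rule in this lauu2 Makefile compiles one of two generic sources (lauu2_U.c / lauu2_L.c, or the z-prefixed complex versions) and changes only the preprocessor flags; the precision letter in the object name is entirely a product of those flags. Roughly, and treating the exact typedefs as an assumption since they live in common.h and the per-architecture headers:

    #if defined(XDOUBLE)          // q- and x-prefixed objects (-DXDOUBLE)
    typedef long double FLOAT;    // extended precision on most targets
    #elif defined(DOUBLE)         // d- and z-prefixed objects (-DDOUBLE)
    typedef double FLOAT;
    #else                         // s- and c-prefixed objects (default, -UDOUBLE)
    typedef float FLOAT;
    #endif
    // -DCOMPLEX switches to the complex kernels: elements become interleaved
    // real/imaginary pairs and COMPSIZE is 2 instead of 1.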
+include ../../Makefile.system + +SBLASOBJS = slauu2_U.$(SUFFIX) slauu2_L.$(SUFFIX) +DBLASOBJS = dlauu2_U.$(SUFFIX) dlauu2_L.$(SUFFIX) +QBLASOBJS = qlauu2_U.$(SUFFIX) qlauu2_L.$(SUFFIX) +CBLASOBJS = clauu2_U.$(SUFFIX) clauu2_L.$(SUFFIX) +ZBLASOBJS = zlauu2_U.$(SUFFIX) zlauu2_L.$(SUFFIX) +XBLASOBJS = xlauu2_U.$(SUFFIX) xlauu2_L.$(SUFFIX) + +slauu2_U.$(SUFFIX) : lauu2_U.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) + +slauu2_L.$(SUFFIX) : lauu2_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) + +dlauu2_U.$(SUFFIX) : lauu2_U.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) + +dlauu2_L.$(SUFFIX) : lauu2_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) + +qlauu2_U.$(SUFFIX) : lauu2_U.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) + +qlauu2_L.$(SUFFIX) : lauu2_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) + +clauu2_U.$(SUFFIX) : zlauu2_U.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) + +clauu2_L.$(SUFFIX) : zlauu2_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) + +zlauu2_U.$(SUFFIX) : zlauu2_U.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) + +zlauu2_L.$(SUFFIX) : zlauu2_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) + +xlauu2_U.$(SUFFIX) : zlauu2_U.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) + +xlauu2_L.$(SUFFIX) : zlauu2_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) + +slauu2_U.$(PSUFFIX) : lauu2_U.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) + +slauu2_L.$(PSUFFIX) : lauu2_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) + +dlauu2_U.$(PSUFFIX) : lauu2_U.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) + +dlauu2_L.$(PSUFFIX) : lauu2_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) + +qlauu2_U.$(PSUFFIX) : lauu2_U.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) + +qlauu2_L.$(PSUFFIX) : lauu2_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) + +clauu2_U.$(PSUFFIX) : zlauu2_U.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) + +clauu2_L.$(PSUFFIX) : zlauu2_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) + +zlauu2_U.$(PSUFFIX) : zlauu2_U.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) + +zlauu2_L.$(PSUFFIX) : zlauu2_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) + +xlauu2_U.$(PSUFFIX) : zlauu2_U.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) + +xlauu2_L.$(PSUFFIX) : zlauu2_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) + +include ../../Makefile.tail diff --git a/lapack/lauu2/lauu2_L.c b/lapack/lauu2/lauu2_L.c new file mode 100644 index 0000000000..aedb966ff2 --- /dev/null +++ b/lapack/lauu2/lauu2_L.c @@ -0,0 +1,78 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
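lauu2_L.c (below) and lauu2_U.c are the unblocked LAUU2 kernels, used on the way to the triangular-inverse routines: they overwrite the stored triangle of A with L^T * L (lower) or U * U^T (upper), conjugate-transposed in the complex files. The lower-case loop builds row i as L(i,i) * L(i, 0:i) plus the contribution of all rows below i, expressed through the SCAL_K, DOTU_K and GEMV_T macros. In plain scalar form the same update is, for the real lower case only (lauu2_ltl_ref is a made-up name for this sketch):

    // Reference: overwrite the lower triangle of a column-major n x n array
    // (leading dimension lda) holding L with the lower triangle of L^T * L.
    static void lauu2_ltl_ref(BLASLONG n, FLOAT *a, BLASLONG lda) {
      BLASLONG i, j, k;
      for (i = 0; i < n; i++) {
        FLOAT lii = a[i + i * lda];
        for (j = 0; j <= i; j++)              // row i of the result: L(i,i) * L(i,j)
          a[i + j * lda] *= lii;
        for (k = i + 1; k < n; k++)           // rows below i, still untouched L
          for (j = 0; j <= i; j++)
            a[i + j * lda] += a[k + i * lda] * a[k + j * lda];
      }
    }

Rows below i are still the original L entries when they are read, which is what makes the in-place update legal; the upper-triangular variant is the column-wise mirror image.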
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +static FLOAT dp1 = 1.; + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG n, lda; + FLOAT *a; + + FLOAT aii; + BLASLONG i; + + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + + if (range_n) { + n = range_n[1] - range_n[0]; + a += range_n[0] * (lda + 1) * COMPSIZE; + } + + for (i = 0; i < n; i++) { + + SCAL_K(i + 1, 0, 0, *(a + i + i * lda), a + i, lda, NULL, 0, NULL, 0); + + if (i < n - 1) { + aii = DOTU_K(n - i - 1, a + i + 1 + i * lda, 1, a + i + 1 + i * lda, 1); + + *(a + i + i * lda) += aii; + + GEMV_T(n - i - 1, i, 0, dp1, + a + (i + 1) , lda, + a + (i + 1) + i * lda, 1, + a + i , lda, sb); + } + } + + return 0; +} diff --git a/lapack/lauu2/lauu2_U.c b/lapack/lauu2/lauu2_U.c new file mode 100644 index 0000000000..f9a7186411 --- /dev/null +++ b/lapack/lauu2/lauu2_U.c @@ -0,0 +1,78 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +static FLOAT dp1 = 1.; + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG n, lda; + FLOAT *a; + + FLOAT aii; + BLASLONG i; + + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + + if (range_n) { + n = range_n[1] - range_n[0]; + a += range_n[0] * (lda + 1) * COMPSIZE; + } + + for (i = 0; i < n; i++) { + + SCAL_K(i + 1, 0, 0, *(a + i + i * lda), a + i * lda, 1, NULL, 0, NULL, 0); + + if (i < n - 1) { + aii = DOTU_K(n - i - 1, a + i + (i + 1)* lda, lda, a + i + (i + 1) * lda, lda); + + *(a + i + i * lda) += aii; + + GEMV_N(i, n - i - 1, 0, dp1, + a + (i + 1) * lda, lda, + a + i + (i + 1) * lda, lda, + a + i * lda, 1, sb); + } + } + + return 0; +} diff --git a/lapack/lauu2/zlauu2_L.c b/lapack/lauu2/zlauu2_L.c new file mode 100644 index 0000000000..8a892d9749 --- /dev/null +++ b/lapack/lauu2/zlauu2_L.c @@ -0,0 +1,83 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
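The complex kernels zlauu2_L.c (below) and zlauu2_U.c differ from the real ones in three ways: the self dot-product becomes the conjugated DOTC_K (its value is real, so only the real part is accumulated), the imaginary part of every diagonal entry is forced to zero so the result stays Hermitian, and all offsets are multiplied by COMPSIZE to step over interleaved real/imaginary pairs. A sketch of the diagonal update only, with COMPSIZE written as the literal 2 and a made-up helper name:

    // Diagonal update of the complex lower case: add |A(i+1:n, i)|^2 to the
    // real part of a(i,i) and pin its imaginary part to zero.
    static void zlauu2_diag_update(BLASLONG n, BLASLONG i, FLOAT *a, BLASLONG lda) {
      FLOAT sum = 0.0;
      BLASLONG k;
      FLOAT *tail = a + (i + 1 + i * lda) * 2;   // column tail, interleaved re/im
      for (k = 0; k < n - i - 1; k++)
        sum += tail[2 * k] * tail[2 * k] + tail[2 * k + 1] * tail[2 * k + 1];
      a[(i + i * lda) * 2 + 0] += sum;
      a[(i + i * lda) * 2 + 1]  = 0.0;
    }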
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +static FLOAT dp1 = 1.; + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG n, lda; + FLOAT *a; + + FLOAT temp[2]; + BLASLONG i; + + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + + if (range_n) { + n = range_n[1] - range_n[0]; + a += range_n[0] * (lda + 1) * COMPSIZE; + } + + for (i = 0; i < n; i++) { + + SCAL_K(i + 1, 0, 0, *(a + (i + i * lda) * COMPSIZE + 0), ZERO, + a + i * COMPSIZE, lda, NULL, 0, NULL, 0); + + if (i < n - 1) { + temp[0] = DOTC_K(n - i - 1, + a + (i + 1 + i * lda) * COMPSIZE, 1, + a + (i + 1 + i * lda) * COMPSIZE, 1); + GET_IMAGE(temp[1]); + + *(a + (i + i * lda) * COMPSIZE + 0) += temp[0]; + *(a + (i + i * lda) * COMPSIZE + 1) = ZERO; + + GEMV_U(n - i - 1, i, 0, dp1, ZERO, + a + ((i + 1) ) * COMPSIZE, lda, + a + ((i + 1) + i * lda) * COMPSIZE, 1, + a + ( i ) * COMPSIZE , lda, sb); + } + } + + return 0; +} diff --git a/lapack/lauu2/zlauu2_U.c b/lapack/lauu2/zlauu2_U.c new file mode 100644 index 0000000000..b20ea994ac --- /dev/null +++ b/lapack/lauu2/zlauu2_U.c @@ -0,0 +1,81 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +static FLOAT dp1 = 1.; + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG n, lda; + FLOAT *a; + + FLOAT temp[2]; + BLASLONG i; + + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + + if (range_n) { + n = range_n[1] - range_n[0]; + a += range_n[0] * (lda + 1) * COMPSIZE; + } + for (i = 0; i < n; i++) { + + SCAL_K(i + 1, 0, 0, + *(a + (i + i * lda) * COMPSIZE + 0), ZERO, + a + i * lda * COMPSIZE, 1, NULL, 0, NULL, 0); + + if (i < n - 1) { + temp[0] = DOTC_K(n - i - 1, a + (i + (i + 1) * lda) * COMPSIZE, lda, a + (i + (i + 1) * lda) * COMPSIZE, lda); + GET_IMAGE(temp[1]); + + *(a + (i + i * lda) * COMPSIZE + 0) += temp[0]; + *(a + (i + i * lda) * COMPSIZE + 1) = ZERO; + + GEMV_O(i, n - i - 1, 0, dp1, ZERO, + a + ( (i + 1) * lda) * COMPSIZE, lda, + a + (i + (i + 1) * lda) * COMPSIZE, lda, + a + ( i * lda) * COMPSIZE, 1, sb); + } + } + + return 0; +} diff --git a/lapack/lauum/Makefile b/lapack/lauum/Makefile new file mode 100644 index 0000000000..f163479ef9 --- /dev/null +++ b/lapack/lauum/Makefile @@ -0,0 +1,164 @@ +TOPDIR = ../.. +include ../../Makefile.system + +SBLASOBJS = slauum_U_single.$(SUFFIX) slauum_L_single.$(SUFFIX) +DBLASOBJS = dlauum_U_single.$(SUFFIX) dlauum_L_single.$(SUFFIX) +QBLASOBJS = qlauum_U_single.$(SUFFIX) qlauum_L_single.$(SUFFIX) +CBLASOBJS = clauum_U_single.$(SUFFIX) clauum_L_single.$(SUFFIX) +ZBLASOBJS = zlauum_U_single.$(SUFFIX) zlauum_L_single.$(SUFFIX) +XBLASOBJS = xlauum_U_single.$(SUFFIX) xlauum_L_single.$(SUFFIX) + +ifdef SMP +SBLASOBJS += slauum_U_parallel.$(SUFFIX) slauum_L_parallel.$(SUFFIX) +DBLASOBJS += dlauum_U_parallel.$(SUFFIX) dlauum_L_parallel.$(SUFFIX) +QBLASOBJS += qlauum_U_parallel.$(SUFFIX) qlauum_L_parallel.$(SUFFIX) +CBLASOBJS += clauum_U_parallel.$(SUFFIX) clauum_L_parallel.$(SUFFIX) +ZBLASOBJS += zlauum_U_parallel.$(SUFFIX) zlauum_L_parallel.$(SUFFIX) +XBLASOBJS += xlauum_U_parallel.$(SUFFIX) xlauum_L_parallel.$(SUFFIX) +endif + +slauum_U_single.$(SUFFIX) : lauum_U_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) + +slauum_L_single.$(SUFFIX) : lauum_L_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) + +slauum_U_parallel.$(SUFFIX) : lauum_U_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) + +slauum_L_parallel.$(SUFFIX) : lauum_L_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) + +dlauum_U_single.$(SUFFIX) : lauum_U_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) + +dlauum_L_single.$(SUFFIX) : lauum_L_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) + +dlauum_U_parallel.$(SUFFIX) : lauum_U_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) + +dlauum_L_parallel.$(SUFFIX) : lauum_L_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) + +qlauum_U_single.$(SUFFIX) : lauum_U_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) + +qlauum_L_single.$(SUFFIX) : lauum_L_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) + +qlauum_U_parallel.$(SUFFIX) : lauum_U_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) + +qlauum_L_parallel.$(SUFFIX) : lauum_L_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) + +clauum_U_single.$(SUFFIX) : lauum_U_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) + +clauum_L_single.$(SUFFIX) : lauum_L_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) + 
+clauum_U_parallel.$(SUFFIX) : lauum_U_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) + +clauum_L_parallel.$(SUFFIX) : lauum_L_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) + +zlauum_U_single.$(SUFFIX) : lauum_U_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) + +zlauum_L_single.$(SUFFIX) : lauum_L_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) + +zlauum_U_parallel.$(SUFFIX) : lauum_U_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) + +zlauum_L_parallel.$(SUFFIX) : lauum_L_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) + +xlauum_U_single.$(SUFFIX) : lauum_U_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) + +xlauum_L_single.$(SUFFIX) : lauum_L_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) + +xlauum_U_parallel.$(SUFFIX) : lauum_U_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) + +xlauum_L_parallel.$(SUFFIX) : lauum_L_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) + +slauum_U_single.$(PSUFFIX) : lauum_U_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) + +slauum_L_single.$(PSUFFIX) : lauum_L_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) + +slauum_U_parallel.$(PSUFFIX) : lauum_U_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) + +slauum_L_parallel.$(PSUFFIX) : lauum_L_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) + +dlauum_U_single.$(PSUFFIX) : lauum_U_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) + +dlauum_L_single.$(PSUFFIX) : lauum_L_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) + +dlauum_U_parallel.$(PSUFFIX) : lauum_U_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) + +dlauum_L_parallel.$(PSUFFIX) : lauum_L_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) + +qlauum_U_single.$(PSUFFIX) : lauum_U_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) + +qlauum_L_single.$(PSUFFIX) : lauum_L_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) + +qlauum_U_parallel.$(PSUFFIX) : lauum_U_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) + +qlauum_L_parallel.$(PSUFFIX) : lauum_L_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) + +clauum_U_single.$(PSUFFIX) : lauum_U_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) + +clauum_L_single.$(PSUFFIX) : lauum_L_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) + +clauum_U_parallel.$(PSUFFIX) : lauum_U_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) + +clauum_L_parallel.$(PSUFFIX) : lauum_L_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) + +zlauum_U_single.$(PSUFFIX) : lauum_U_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) + +zlauum_L_single.$(PSUFFIX) : lauum_L_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) + +zlauum_U_parallel.$(PSUFFIX) : lauum_U_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) + +zlauum_L_parallel.$(PSUFFIX) : lauum_L_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) + +xlauum_U_single.$(PSUFFIX) : lauum_U_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) + +xlauum_L_single.$(PSUFFIX) : lauum_L_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) + +xlauum_U_parallel.$(PSUFFIX) : lauum_U_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) + +xlauum_L_parallel.$(PSUFFIX) : lauum_L_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) + +include ../../Makefile.tail diff --git 
a/lapack/lauum/lauum_L_parallel.c b/lapack/lauum/lauum_L_parallel.c new file mode 100644 index 0000000000..8d9cde9f75 --- /dev/null +++ b/lapack/lauum/lauum_L_parallel.c @@ -0,0 +1,123 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
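lauum_L_parallel.c (below) is the threaded blocked driver: it falls back to LAUUM_L_SINGLE for a single thread or for matrices no larger than 2 * GEMM_UNROLL_N, and otherwise walks the diagonal in blocks of at most GEMM_Q. For each block starting at i it first folds the block row into the already finished leading triangle with a HERK, then scales that block row by the conjugate transpose of the diagonal block with a TRMM, and finally recurses on the diagonal block itself. Stripped of the threading dispatch and argument packing, the loop body is (herk_lower, trmm_left_ctrans_lower and lauum_lower_block are hypothetical helpers standing in for HERK_LC, TRMM_LCLN and the recursive CNAME call):

    for (i = 0; i < n; i += blocking) {
      BLASLONG bk = (n - i < blocking) ? n - i : blocking;

      // 1) A[0:i, 0:i] (lower part) += A[i:i+bk, 0:i]^H * A[i:i+bk, 0:i]
      herk_lower(i, bk, a + i * COMPSIZE, lda, a, lda);

      // 2) A[i:i+bk, 0:i] = A[i:i+bk, i:i+bk]^H * A[i:i+bk, 0:i]
      trmm_left_ctrans_lower(bk, i, a + (i + i * lda) * COMPSIZE, lda,
                             a + i * COMPSIZE, lda);

      // 3) finish the diagonal block: A[i:i+bk, i:i+bk] = L11^H * L11
      lauum_lower_block(bk, a + (i + i * lda) * COMPSIZE, lda);
    }

Later iterations keep adding their own block-row contributions to the leading triangle, so after the last block every entry of the stored lower triangle holds the full L^H * L sum.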
*/ +/*********************************************************************/ + +#include +#include "common.h" + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG n, bk, i, blocking, lda; + int mode; + blas_arg_t newarg; + FLOAT *a; + FLOAT alpha[2] = { ONE, ZERO}; + +#ifndef COMPLEX +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_REAL; +#else + mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif + + if (args -> nthreads == 1) { + LAUUM_L_SINGLE(args, NULL, NULL, sa, sb, 0); + return 0; + } + + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + + if (range_n) n = range_n[1] - range_n[0]; + + if (n <= GEMM_UNROLL_N * 2) { + LAUUM_L_SINGLE(args, NULL, range_n, sa, sb, 0); + return 0; + } + + newarg.lda = lda; + newarg.ldb = lda; + newarg.ldc = lda; + newarg.alpha = alpha; + newarg.beta = NULL; + newarg.nthreads = args -> nthreads; + + blocking = (n / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); + if (blocking > GEMM_Q) blocking = GEMM_Q; + + for (i = 0; i < n; i += blocking) { + + bk = n - i; + if (bk > blocking) bk = blocking; + + newarg.n = i; + newarg.k = bk; + newarg.a = a + i * COMPSIZE; + newarg.c = a; + + syrk_thread(mode | BLAS_TRANSA_T | BLAS_TRANSB_N | BLAS_UPLO, + &newarg, NULL, NULL, (void *)HERK_LC, sa, sb, args -> nthreads); + + newarg.m = bk; + newarg.n = i; + newarg.a = a + (i + i * lda) * COMPSIZE; + newarg.b = a + (i ) * COMPSIZE; + + gemm_thread_n(mode | BLAS_TRANSA_T, + &newarg, NULL, NULL, (void *)TRMM_LCLN, sa, sb, args -> nthreads); + + newarg.m = bk; + newarg.n = bk; + newarg.a = a + (i + i * lda) * COMPSIZE; + + CNAME(&newarg, NULL, NULL, sa, sb, 0); + } + + return 0; +} diff --git a/lapack/lauum/lauum_L_single.c b/lapack/lauum/lauum_L_single.c new file mode 100644 index 0000000000..65e8f04466 --- /dev/null +++ b/lapack/lauum/lauum_L_single.c @@ -0,0 +1,234 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
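lauum_L_single.c (below) is the serial blocked kernel behind that driver. It packs the bk x bk diagonal block with TRMM_ILNCOPY into sb, streams column panels of width at most REAL_GEMM_R through a second packing buffer sb2, and feeds the packed operands straight to the SYRK and TRMM micro-kernels; once a block is small enough (n no larger than DTB_ENTRIES) it drops to the unblocked LAUU2_L above. The sb2 buffer is carved out of the same workspace as sb with the usual round-up idiom; a sketch of that pointer arithmetic, using uintptr_t from stdint.h instead of the BLASLONG cast in the source and assuming GEMM_ALIGN is an alignment-minus-one mask (which is how the expression treats it):

    // Reserve room for the packed diagonal block, align up, add the packing offset.
    uintptr_t p = (uintptr_t)sb + GEMM_PQ * GEMM_Q * COMPSIZE * SIZE;
    p = (p + GEMM_ALIGN) & ~(uintptr_t)GEMM_ALIGN;     // round up to the boundary
    FLOAT *sb2 = (FLOAT *)(p + GEMM_OFFSET_B);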
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +static FLOAT dp1 = 1.; + +#ifndef COMPLEX +#define TRMM_KERNEL TRMM_KERNEL_LN +#define SYRK_KERNEL SYRK_KERNEL_L +#else +#define TRMM_KERNEL TRMM_KERNEL_LR +#ifdef XDOUBLE +#define SYRK_KERNEL xherk_kernel_LC +#elif defined(DOUBLE) +#define SYRK_KERNEL zherk_kernel_LC +#else +#define SYRK_KERNEL cherk_kernel_LC +#endif +#endif + +#if 0 +#undef GEMM_P +#undef GEMM_Q +#undef GEMM_R + +#define GEMM_P 8 +#define GEMM_Q 20 +#define GEMM_R 64 +#endif + +#define GEMM_PQ MAX(GEMM_P, GEMM_Q) +#define REAL_GEMM_R (GEMM_R - GEMM_PQ) + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG n, lda; + FLOAT *a; + + BLASLONG j, bk, blocking; + BLASLONG jjs, min_jj; + + BLASLONG is, ls, ks; + BLASLONG min_i, min_l, min_k; + BLASLONG range_N[2]; + + FLOAT *sb2 = (FLOAT *)((((BLASLONG)sb + + GEMM_PQ * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN) + + GEMM_OFFSET_B); + +#if 0 + FLOAT *aa; +#endif + + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + + if (range_n) { + n = range_n[1] - range_n[0]; + a += range_n[0] * (lda + 1) * COMPSIZE; + } + + if (n <= DTB_ENTRIES) { + LAUU2_L(args, NULL, range_n, sa, sb, 0); + return 0; + } + + blocking = GEMM_Q; + if (n <= 4 * GEMM_Q) blocking = (n + 3) / 4; + + for (j = 0; j < n; j += blocking) { + bk = MIN(blocking, n - j); + + if (j > 0 ){ + + TRMM_ILNCOPY(bk, bk, a + (j + j * lda) * COMPSIZE, lda, 0, 0, sb); + + for (ls = 0; ls < j; ls += REAL_GEMM_R) { + min_l = j - ls; + if (min_l > REAL_GEMM_R) min_l = REAL_GEMM_R; + +#if 0 + + min_i = j - ls; + if (min_i > GEMM_P) min_i = GEMM_P; + + if (ls + min_i >= ls + min_l) { + GEMM_INCOPY(bk, min_i, a + (j + ls * lda)* COMPSIZE, lda, sa); + aa = sa; + } else { + aa = sb2; + } + + for (jjs = ls; jjs < ls + min_l; jjs += GEMM_P){ + min_jj = ls + min_l - jjs; + if (min_jj > GEMM_P) min_jj = GEMM_P; + + GEMM_ONCOPY(bk, min_jj, a + (j + jjs * lda) * COMPSIZE, lda, sb2 + (jjs - ls) * bk * COMPSIZE); + + SYRK_KERNEL(min_i, min_jj, bk, dp1, + aa, + sb2 + (jjs - ls) * bk * COMPSIZE, + a + (ls + jjs * lda) * COMPSIZE, lda, + ls - jjs); + } + + + for(is = ls + min_i; is < j ; is += GEMM_P){ + min_i = j - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + GEMM_INCOPY(bk, min_i, a + (j + is * lda)* COMPSIZE, lda, sa); + + SYRK_KERNEL(min_i, min_l, bk, dp1, + sa, + sb2, + a + (is + ls * lda) * COMPSIZE, lda, + is - ls); + } + + for (ks = 0; ks < bk; ks += GEMM_P) { + min_k = bk - ks; + if (min_k > GEMM_P) min_k = GEMM_P; + + TRMM_KERNEL(min_k, min_l, bk, dp1, +#ifdef COMPLEX + ZERO, +#endif + sb + ks * bk * 
COMPSIZE, + sb2, + a + (ks + j + ls * lda) * COMPSIZE, lda, ks); + } +#else + + min_i = j - ls; + if (min_i > GEMM_P) min_i = GEMM_P; + + GEMM_INCOPY(bk, min_i, a + (j + ls * lda)* COMPSIZE, lda, sa); + + for (jjs = ls; jjs < ls + min_l; jjs += GEMM_P){ + min_jj = ls + min_l - jjs; + if (min_jj > GEMM_P) min_jj = GEMM_P; + + GEMM_ONCOPY(bk, min_jj, a + (j + jjs * lda) * COMPSIZE, lda, sb2 + (jjs - ls) * bk * COMPSIZE); + + SYRK_KERNEL(min_i, min_jj, bk, dp1, + sa, + sb2 + (jjs - ls) * bk * COMPSIZE, + a + (ls + jjs * lda) * COMPSIZE, lda, + ls - jjs); + } + + for(is = ls + min_i; is < j ; is += GEMM_P){ + min_i = j - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + GEMM_INCOPY(bk, min_i, a + (j + is * lda)* COMPSIZE, lda, sa); + + SYRK_KERNEL(min_i, min_l, bk, dp1, + sa, + sb2, + a + (is + ls * lda) * COMPSIZE, lda, + is - ls); + } + + for (ks = 0; ks < bk; ks += GEMM_P) { + min_k = bk - ks; + if (min_k > GEMM_P) min_k = GEMM_P; + + TRMM_KERNEL(min_k, min_l, bk, dp1, +#ifdef COMPLEX + ZERO, +#endif + sb + ks * bk * COMPSIZE, + sb2, + a + (ks + j + ls * lda) * COMPSIZE, lda, ks); + } + +#endif + + } + } + + if (!range_n) { + range_N[0] = j; + range_N[1] = j + bk; + } else { + range_N[0] = range_n[0] + j; + range_N[1] = range_n[0] + j + bk; + } + + CNAME(args, NULL, range_N, sa, sb, 0); + + } + + return 0; +} diff --git a/lapack/lauum/lauum_U_parallel.c b/lapack/lauum/lauum_U_parallel.c new file mode 100644 index 0000000000..d68d12bd39 --- /dev/null +++ b/lapack/lauum/lauum_U_parallel.c @@ -0,0 +1,123 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG n, bk, i, blocking, lda; + int mode; + blas_arg_t newarg; + FLOAT *a; + FLOAT alpha[2] = { ONE, ZERO}; + +#ifndef COMPLEX +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_REAL; +#else + mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif + + if (args -> nthreads == 1) { + LAUUM_U_SINGLE(args, NULL, NULL, sa, sb, 0); + return 0; + } + + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + + if (range_n) n = range_n[1] - range_n[0]; + + if (n <= GEMM_UNROLL_N * 2) { + LAUUM_U_SINGLE(args, NULL, range_n, sa, sb, 0); + return 0; + } + + newarg.lda = lda; + newarg.ldb = lda; + newarg.ldc = lda; + newarg.alpha = alpha; + newarg.beta = NULL; + newarg.nthreads = args -> nthreads; + + blocking = (n / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); + if (blocking > GEMM_Q) blocking = GEMM_Q; + + for (i = 0; i < n; i += blocking) { + + bk = n - i; + if (bk > blocking) bk = blocking; + + newarg.n = i; + newarg.k = bk; + newarg.a = a + ( i * lda) * COMPSIZE; + newarg.c = a; + + syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T, + &newarg, NULL, NULL, (void *)HERK_UN, sa, sb, args -> nthreads); + + newarg.m = i; + newarg.n = bk; + newarg.a = a + (i + i * lda) * COMPSIZE; + newarg.b = a + ( i * lda) * COMPSIZE; + + gemm_thread_m(mode | BLAS_TRANSA_T | BLAS_RSIDE, + &newarg, NULL, NULL, (void *)TRMM_RCUN, sa, sb, args -> nthreads); + + newarg.m = bk; + newarg.n = bk; + newarg.a = a + (i + i * lda) * COMPSIZE; + + CNAME(&newarg, NULL, NULL, sa, sb, 0); + } + + return 0; +} diff --git a/lapack/lauum/lauum_U_single.c b/lapack/lauum/lauum_U_single.c new file mode 100644 index 0000000000..14cf0ad2bb --- /dev/null +++ b/lapack/lauum/lauum_U_single.c @@ -0,0 +1,268 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +static FLOAT dp1 = 1.; + +#ifndef COMPLEX +#define TRMM_KERNEL TRMM_KERNEL_RT +#define SYRK_KERNEL SYRK_KERNEL_U +#else +#define TRMM_KERNEL TRMM_KERNEL_RC +#ifdef XDOUBLE +#define SYRK_KERNEL xherk_kernel_UN +#elif defined(DOUBLE) +#define SYRK_KERNEL zherk_kernel_UN +#else +#define SYRK_KERNEL cherk_kernel_UN +#endif +#endif + +#if 0 +#undef GEMM_P +#undef GEMM_Q +#undef GEMM_R + +#define GEMM_P 8 +#define GEMM_Q 20 +#define GEMM_R 24 +#endif + +#define GEMM_PQ MAX(GEMM_P, GEMM_Q) +#define REAL_GEMM_R (GEMM_R - GEMM_PQ) + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG n, lda; + FLOAT *a; + + BLASLONG j, bk, blocking; + BLASLONG is, ls, ks; + BLASLONG jjs, min_jj; + + BLASLONG min_i, min_l, min_k; + BLASLONG range_N[2]; + + FLOAT *sb2 = (FLOAT *)((((BLASLONG)sb + + GEMM_PQ * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN) + + GEMM_OFFSET_B); + +#if 0 + FLOAT *aa; +#endif + + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + + if (range_n) { + n = range_n[1] - range_n[0]; + a += range_n[0] * (lda + 1) * COMPSIZE; + } + + if (n <= DTB_ENTRIES) { + LAUU2_U(args, NULL, range_n, sa, sb, 0); + return 0; + } + + blocking = GEMM_Q; + if (n <= 4 * GEMM_Q) blocking = (n + 3) / 4; + + for (j = 0; j < n; j += blocking) { + bk = n - j; + if (bk > blocking) bk = blocking; + + if (j > 0) { + + TRMM_OUTCOPY(bk, bk, a + (j + j * lda) * COMPSIZE, lda, 0, 0, sb); + + for (ls = 0; ls < j; ls += REAL_GEMM_R) { + min_l = j - ls; + +#if 0 + + + if (min_l > REAL_GEMM_R) min_l = REAL_GEMM_R; + min_i = ls + min_l; + if (min_i > GEMM_P) min_i = GEMM_P; + + if (ls > 0) { + GEMM_ITCOPY(bk, min_i, a + (j * lda) * COMPSIZE, lda, sa); + aa = sa; + } else { + aa = sb2; + } + + for (jjs = ls; jjs < ls + min_l; jjs += GEMM_P){ + min_jj = ls + min_l - jjs; + if (min_jj > GEMM_P) min_jj = GEMM_P; + + GEMM_OTCOPY(bk, min_jj, a + (jjs + j * lda) * COMPSIZE, lda, sb2 + (jjs - ls) * bk * COMPSIZE); + + SYRK_KERNEL(min_i, min_jj, bk, dp1, + aa, + sb2 + (jjs - ls) * bk * COMPSIZE, + a + (jjs * lda) * COMPSIZE, lda, - jjs); + } + + if (ls + REAL_GEMM_R >= j ) { + for (ks = 0; ks < bk; ks += GEMM_P) { + min_k = bk - ks; + if (min_k > GEMM_P) min_k = GEMM_P; + + TRMM_KERNEL(min_i, min_k, bk, dp1, +#ifdef COMPLEX + ZERO, +#endif + aa, + sb + ks * bk * COMPSIZE, + a + ((ks + j) * lda) * COMPSIZE, lda, -ks); + } + } + + for(is = min_i; is < ls + min_l ; is += GEMM_P){ + min_i = ls + min_l - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + if (is < ls) { + GEMM_ITCOPY(bk, min_i, a + (is + j * lda) * 
COMPSIZE, lda, sa); + aa = sa; + } else { + aa = sb2 + (is - ls) * bk * COMPSIZE; + } + + SYRK_KERNEL(min_i, min_l, bk, dp1, + aa, + sb2, + a + (is + ls * lda) * COMPSIZE, lda, is - ls); + + if (ls + REAL_GEMM_R >= j ) { + for (ks = 0; ks < bk; ks += GEMM_P) { + min_k = bk - ks; + if (min_k > GEMM_P) min_k = GEMM_P; + + TRMM_KERNEL(min_i, min_k, bk, dp1, +#ifdef COMPLEX + ZERO, +#endif + aa, + sb + ks * bk * COMPSIZE, + a + (is + (ks + j) * lda) * COMPSIZE, lda, -ks); + } + } + } +#else + if (min_l > REAL_GEMM_R) min_l = REAL_GEMM_R; + min_i = ls + min_l; + if (min_i > GEMM_P) min_i = GEMM_P; + + GEMM_ITCOPY(bk, min_i, a + (j * lda) * COMPSIZE, lda, sa); + + for (jjs = ls; jjs < ls + min_l; jjs += GEMM_P){ + min_jj = ls + min_l - jjs; + if (min_jj > GEMM_P) min_jj = GEMM_P; + + GEMM_OTCOPY(bk, min_jj, a + (jjs + j * lda) * COMPSIZE, lda, sb2 + (jjs - ls) * bk * COMPSIZE); + + SYRK_KERNEL(min_i, min_jj, bk, dp1, + sa, + sb2 + (jjs - ls) * bk * COMPSIZE, + a + (jjs * lda) * COMPSIZE, lda, - jjs); + } + + if (ls + REAL_GEMM_R >= j ) { + for (ks = 0; ks < bk; ks += GEMM_P) { + min_k = bk - ks; + if (min_k > GEMM_P) min_k = GEMM_P; + + TRMM_KERNEL(min_i, min_k, bk, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa, + sb + ks * bk * COMPSIZE, + a + ((ks + j) * lda) * COMPSIZE, lda, -ks); + } + } + + for(is = min_i; is < ls + min_l ; is += GEMM_P){ + min_i = ls + min_l - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + GEMM_ITCOPY(bk, min_i, a + (is + j * lda) * COMPSIZE, lda, sa); + + SYRK_KERNEL(min_i, min_l, bk, dp1, + sa, + sb2, + a + (is + ls * lda) * COMPSIZE, lda, is - ls); + + if (ls + REAL_GEMM_R >= j ) { + for (ks = 0; ks < bk; ks += GEMM_P) { + min_k = bk - ks; + if (min_k > GEMM_P) min_k = GEMM_P; + + TRMM_KERNEL(min_i, min_k, bk, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa, + sb + ks * bk * COMPSIZE, + a + (is + (ks + j) * lda) * COMPSIZE, lda, -ks); + } + } + } +#endif + } /* end of ls */ + } + + if (!range_n) { + range_N[0] = j; + range_N[1] = j + bk; + } else { + range_N[0] = range_n[0] + j; + range_N[1] = range_n[0] + j + bk; + } + + CNAME(args, NULL, range_N, sa, sb, 0); + + } + + return 0; +} diff --git a/lapack/potf2/Makefile b/lapack/potf2/Makefile new file mode 100644 index 0000000000..5946ad9c8a --- /dev/null +++ b/lapack/potf2/Makefile @@ -0,0 +1,83 @@ +TOPDIR = ../.. 
+include ../../Makefile.system + +SBLASOBJS = spotf2_U.$(SUFFIX) spotf2_L.$(SUFFIX) +DBLASOBJS = dpotf2_U.$(SUFFIX) dpotf2_L.$(SUFFIX) +QBLASOBJS = qpotf2_U.$(SUFFIX) qpotf2_L.$(SUFFIX) +CBLASOBJS = cpotf2_U.$(SUFFIX) cpotf2_L.$(SUFFIX) +ZBLASOBJS = zpotf2_U.$(SUFFIX) zpotf2_L.$(SUFFIX) +XBLASOBJS = xpotf2_U.$(SUFFIX) xpotf2_L.$(SUFFIX) + +spotf2_U.$(SUFFIX) : potf2_U.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) + +spotf2_L.$(SUFFIX) : potf2_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) + +dpotf2_U.$(SUFFIX) : potf2_U.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) + +dpotf2_L.$(SUFFIX) : potf2_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) + +qpotf2_U.$(SUFFIX) : potf2_U.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) + +qpotf2_L.$(SUFFIX) : potf2_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) + +cpotf2_U.$(SUFFIX) : zpotf2_U.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) + +cpotf2_L.$(SUFFIX) : zpotf2_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) + +zpotf2_U.$(SUFFIX) : zpotf2_U.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) + +zpotf2_L.$(SUFFIX) : zpotf2_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) + +xpotf2_U.$(SUFFIX) : zpotf2_U.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) + +xpotf2_L.$(SUFFIX) : zpotf2_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) + +spotf2_U.$(PSUFFIX) : potf2_U.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) + +spotf2_L.$(PSUFFIX) : potf2_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) + +dpotf2_U.$(PSUFFIX) : potf2_U.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) + +dpotf2_L.$(PSUFFIX) : potf2_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) + +qpotf2_U.$(PSUFFIX) : potf2_U.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) + +qpotf2_L.$(PSUFFIX) : potf2_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) + +cpotf2_U.$(PSUFFIX) : zpotf2_U.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) + +cpotf2_L.$(PSUFFIX) : zpotf2_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) + +zpotf2_U.$(PSUFFIX) : zpotf2_U.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) + +zpotf2_L.$(PSUFFIX) : zpotf2_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) + +xpotf2_U.$(PSUFFIX) : zpotf2_U.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) + +xpotf2_L.$(PSUFFIX) : zpotf2_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) + +include ../../Makefile.tail diff --git a/lapack/potf2/potf2_L.c b/lapack/potf2/potf2_L.c new file mode 100644 index 0000000000..23aa97c51c --- /dev/null +++ b/lapack/potf2/potf2_L.c @@ -0,0 +1,97 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +static FLOAT dm1 = -1.; +static FLOAT dp1 = 1.; + +#ifndef SQRT +#define SQRT(x) sqrt(x) +#endif + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG n, lda; + FLOAT *a; + + FLOAT ajj; + FLOAT *aoffset; + BLASLONG i, j; + + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + + if (range_n) { + n = range_n[1] - range_n[0]; + a += range_n[0] * (lda + 1) * COMPSIZE; + } + + aoffset = a; + + for (j = 0; j < n; j++) { + + ajj = *(aoffset + j) - DOTU_K(j, a + j, lda, a + j, lda); + + if (ajj <= 0){ + *(aoffset + j) = ajj; + return j + 1; + } + ajj = SQRT(ajj); + *(aoffset + j) = ajj; + + i = n - j - 1; + + if (i > 0) { + GEMV_N(i, j, 0, dm1, + a + j + 1, lda, + a + j, lda, + aoffset + j + 1, 1, sb); + + SCAL_K(i, 0, 0, dp1 / ajj, + aoffset + j + 1, 1, NULL, 0, NULL, 0); + } + + aoffset += lda; + } + + return 0; +} diff --git a/lapack/potf2/potf2_U.c b/lapack/potf2/potf2_U.c new file mode 100644 index 0000000000..755bf8d51d --- /dev/null +++ b/lapack/potf2/potf2_U.c @@ -0,0 +1,94 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +static FLOAT dm1 = -1.; +static FLOAT dp1 = 1.; + +#ifndef SQRT +#define SQRT(x) sqrt(x) +#endif + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG n, lda; + FLOAT *a; + + FLOAT ajj; + BLASLONG i, j; + + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + + if (range_n) { + n = range_n[1] - range_n[0]; + a += range_n[0] * (lda + 1) * COMPSIZE; + } + + for (j = 0; j < n; j++) { + + ajj = *(a + j) - DOTU_K(j, a, 1, a, 1); + + if (ajj <= 0){ + *(a + j) = ajj; + return j + 1; + } + ajj = SQRT(ajj); + *(a + j) = ajj; + + i = n - j - 1; + + if (i > 0) { + GEMV_T(j, i, 0, dm1, + a + lda, lda, + a, 1, + a + j + lda, lda, sb); + + SCAL_K(i, 0, 0, dp1 / ajj, + a + j + lda, lda, NULL, 0, NULL, 0); + } + + a += lda; + } + + return 0; +} diff --git a/lapack/potf2/zpotf2_L.c b/lapack/potf2/zpotf2_L.c new file mode 100644 index 0000000000..8ce0d4e07b --- /dev/null +++ b/lapack/potf2/zpotf2_L.c @@ -0,0 +1,101 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +static FLOAT dm1 = -1.; + +#ifndef SQRT +#define SQRT(x) sqrt(x) +#endif + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG n, lda; + FLOAT *a; + + FLOAT ajj[2]; + FLOAT *aoffset; + BLASLONG i, j; + + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + + if (range_n) { + n = range_n[1] - range_n[0]; + a += range_n[0] * (lda + 1) * COMPSIZE; + } + + aoffset = a; + + for (j = 0; j < n; j++) { + + ajj[0] = DOTC_K(j, a + j * 2, lda, a + j * 2, lda); + GET_IMAGE(ajj[1]); + + ajj[0] = *(aoffset + j * 2) - ajj[0]; + + if (ajj[0] <= 0){ + *(aoffset + j * 2 + 0) = ajj[0]; + *(aoffset + j * 2 + 1) = ZERO; + return j + 1; + } + ajj[0] = SQRT(ajj[0]); + *(aoffset + j * 2 + 0) = ajj[0]; + *(aoffset + j * 2 + 1) = ZERO; + + i = n - j - 1; + + if (i > 0) { + GEMV_O(i, j, 0, dm1, ZERO, + a + (j + 1) * 2, lda, + a + j * 2, lda, + aoffset + (j + 1) * 2, 1, sb); + + SCAL_K(i, 0, 0, ONE / ajj[0], ZERO, + aoffset + (j + 1) * 2, 1, NULL, 0, NULL, 0); + } + + aoffset += lda * 2; + } + + return 0; +} diff --git a/lapack/potf2/zpotf2_U.c b/lapack/potf2/zpotf2_U.c new file mode 100644 index 0000000000..c1f5156aa4 --- /dev/null +++ b/lapack/potf2/zpotf2_U.c @@ -0,0 +1,99 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +static FLOAT dm1 = -1.; + +#ifndef SQRT +#define SQRT(x) sqrt(x) +#endif + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG n, lda; + FLOAT *a; + + FLOAT ajj[2]; + BLASLONG i, j; + + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + + if (range_n) { + n = range_n[1] - range_n[0]; + a += range_n[0] * (lda + 1) * COMPSIZE; + } + + for (j = 0; j < n; j++) { + + ajj[0] = DOTC_K(j, a, 1, a, 1); + GET_IMAGE(ajj[1]); + + ajj[0] = *(a + j * 2) - ajj[0]; + + if (ajj[0] <= 0){ + *(a + j * 2 + 0) = ajj[0]; + *(a + j * 2 + 1) = ZERO; + return j + 1; + } + + ajj[0] = SQRT(ajj[0]); + *(a + j * 2 + 0) = ajj[0]; + *(a + j * 2 + 1) = ZERO; + + i = n - j - 1; + + if (i > 0){ + GEMV_U(j, i, 0, dm1, ZERO, + a + lda * 2, lda, + a, 1, + a + (j + lda) * 2, lda, sb); + + SCAL_K(i, 0, 0, ONE / ajj[0], ZERO, + a + (j + lda) * 2, lda, NULL, 0, NULL, 0); + } + + a += 2 * lda; + } + + return 0; +} diff --git a/lapack/potrf/Makefile b/lapack/potrf/Makefile new file mode 100644 index 0000000000..21efa55403 --- /dev/null +++ b/lapack/potrf/Makefile @@ -0,0 +1,164 @@ +TOPDIR = ../.. 
+include ../../Makefile.system + +SBLASOBJS = spotrf_U_single.$(SUFFIX) spotrf_L_single.$(SUFFIX) +DBLASOBJS = dpotrf_U_single.$(SUFFIX) dpotrf_L_single.$(SUFFIX) +QBLASOBJS = qpotrf_U_single.$(SUFFIX) qpotrf_L_single.$(SUFFIX) +CBLASOBJS = cpotrf_U_single.$(SUFFIX) cpotrf_L_single.$(SUFFIX) +ZBLASOBJS = zpotrf_U_single.$(SUFFIX) zpotrf_L_single.$(SUFFIX) +XBLASOBJS = xpotrf_U_single.$(SUFFIX) xpotrf_L_single.$(SUFFIX) + +ifdef SMP +SBLASOBJS += spotrf_U_parallel.$(SUFFIX) spotrf_L_parallel.$(SUFFIX) +DBLASOBJS += dpotrf_U_parallel.$(SUFFIX) dpotrf_L_parallel.$(SUFFIX) +QBLASOBJS += qpotrf_U_parallel.$(SUFFIX) qpotrf_L_parallel.$(SUFFIX) +CBLASOBJS += cpotrf_U_parallel.$(SUFFIX) cpotrf_L_parallel.$(SUFFIX) +ZBLASOBJS += zpotrf_U_parallel.$(SUFFIX) zpotrf_L_parallel.$(SUFFIX) +XBLASOBJS += xpotrf_U_parallel.$(SUFFIX) xpotrf_L_parallel.$(SUFFIX) +endif + +spotrf_U_single.$(SUFFIX) : potrf_U_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) + +spotrf_L_single.$(SUFFIX) : potrf_L_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) + +spotrf_U_parallel.$(SUFFIX) : potrf_U_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) + +spotrf_L_parallel.$(SUFFIX) : potrf_L_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) + +dpotrf_U_single.$(SUFFIX) : potrf_U_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) + +dpotrf_L_single.$(SUFFIX) : potrf_L_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) + +dpotrf_U_parallel.$(SUFFIX) : potrf_U_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) + +dpotrf_L_parallel.$(SUFFIX) : potrf_L_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) + +qpotrf_U_single.$(SUFFIX) : potrf_U_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) + +qpotrf_L_single.$(SUFFIX) : potrf_L_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) + +qpotrf_U_parallel.$(SUFFIX) : potrf_U_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) + +qpotrf_L_parallel.$(SUFFIX) : potrf_L_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) + +cpotrf_U_single.$(SUFFIX) : potrf_U_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) + +cpotrf_L_single.$(SUFFIX) : potrf_L_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) + +cpotrf_U_parallel.$(SUFFIX) : potrf_U_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) + +cpotrf_L_parallel.$(SUFFIX) : potrf_L_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) + +zpotrf_U_single.$(SUFFIX) : potrf_U_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) + +zpotrf_L_single.$(SUFFIX) : potrf_L_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) + +zpotrf_U_parallel.$(SUFFIX) : potrf_U_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) + +zpotrf_L_parallel.$(SUFFIX) : potrf_L_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) + +xpotrf_U_single.$(SUFFIX) : potrf_U_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) + +xpotrf_L_single.$(SUFFIX) : potrf_L_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) + +xpotrf_U_parallel.$(SUFFIX) : potrf_U_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) + +xpotrf_L_parallel.$(SUFFIX) : potrf_L_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) + +spotrf_U_single.$(PSUFFIX) : potrf_U_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) + +spotrf_L_single.$(PSUFFIX) : potrf_L_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) + +spotrf_U_parallel.$(PSUFFIX) : 
potrf_U_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) + +spotrf_L_parallel.$(PSUFFIX) : potrf_L_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) + +dpotrf_U_single.$(PSUFFIX) : potrf_U_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) + +dpotrf_L_single.$(PSUFFIX) : potrf_L_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) + +dpotrf_U_parallel.$(PSUFFIX) : potrf_U_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) + +dpotrf_L_parallel.$(PSUFFIX) : potrf_L_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) + +qpotrf_U_single.$(PSUFFIX) : potrf_U_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) + +qpotrf_L_single.$(PSUFFIX) : potrf_L_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) + +qpotrf_U_parallel.$(PSUFFIX) : potrf_U_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) + +qpotrf_L_parallel.$(PSUFFIX) : potrf_L_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE $< -o $(@F) + +cpotrf_U_single.$(PSUFFIX) : potrf_U_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) + +cpotrf_L_single.$(PSUFFIX) : potrf_L_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) + +cpotrf_U_parallel.$(PSUFFIX) : potrf_U_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) + +cpotrf_L_parallel.$(PSUFFIX) : potrf_L_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE $< -o $(@F) + +zpotrf_U_single.$(PSUFFIX) : potrf_U_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) + +zpotrf_L_single.$(PSUFFIX) : potrf_L_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) + +zpotrf_U_parallel.$(PSUFFIX) : potrf_U_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) + +zpotrf_L_parallel.$(PSUFFIX) : potrf_L_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE $< -o $(@F) + +xpotrf_U_single.$(PSUFFIX) : potrf_U_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) + +xpotrf_L_single.$(PSUFFIX) : potrf_L_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) + +xpotrf_U_parallel.$(PSUFFIX) : potrf_U_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) + +xpotrf_L_parallel.$(PSUFFIX) : potrf_L_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE $< -o $(@F) + +include ../../Makefile.tail diff --git a/lapack/potrf/potrf_L_parallel.c b/lapack/potrf/potrf_L_parallel.c new file mode 100644 index 0000000000..1ebcad82f9 --- /dev/null +++ b/lapack/potrf/potrf_L_parallel.c @@ -0,0 +1,130 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG n, bk, i, blocking, lda; + BLASLONG info; + int mode; + blas_arg_t newarg; + FLOAT *a; + FLOAT alpha[2] = { -ONE, ZERO}; + +#ifndef COMPLEX +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_REAL; +#else + mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif + + if (args -> nthreads == 1) { + info = POTRF_L_SINGLE(args, NULL, NULL, sa, sb, 0); + return info; + } + + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + + if (range_n) n = range_n[1] - range_n[0]; + + if (n <= GEMM_UNROLL_N * 4) { + info = POTRF_L_SINGLE(args, NULL, range_n, sa, sb, 0); + return info; + } + + newarg.lda = lda; + newarg.ldb = lda; + newarg.ldc = lda; + newarg.alpha = alpha; + newarg.beta = NULL; + newarg.nthreads = args -> nthreads; + + blocking = (n / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); + if (blocking > GEMM_Q) blocking = GEMM_Q; + + for (i = 0; i < n; i += blocking) { + bk = n - i; + if (bk > blocking) bk = blocking; + + newarg.m = bk; + newarg.n = bk; + newarg.a = a + (i + i * lda) * COMPSIZE; + + info = CNAME(&newarg, NULL, NULL, sa, sb, 0); + if (info) return info + i; + + if (n - i - bk > 0) { + newarg.m = n - i - bk; + newarg.n = bk; + newarg.a = a + (i + i * lda) * COMPSIZE; + newarg.b = a + (i + bk + i * lda) * COMPSIZE; + + gemm_thread_m(mode | BLAS_RSIDE | BLAS_TRANSA_T | BLAS_UPLO, + &newarg, NULL, NULL, (void *)TRSM_RCLN, sa, sb, args -> nthreads); + + newarg.n = n - i - bk; + newarg.k = bk; + newarg.a = a + (i + bk + i * lda) * COMPSIZE; + newarg.c = a + (i + bk + (i + bk) * lda) * COMPSIZE; + +#ifndef USE_SIMPLE_THREADED_LEVEL3 + HERK_THREAD_LN(&newarg, NULL, NULL, sa, sb, 0); +#else + syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T | BLAS_UPLO, + &newarg, NULL, NULL, (void *)HERK_LN, sa, sb, args -> nthreads); +#endif + } + } + + return 0; +} diff --git a/lapack/potrf/potrf_L_single.c b/lapack/potrf/potrf_L_single.c new file mode 100644 index 0000000000..b88f8fc7a9 --- /dev/null +++ b/lapack/potrf/potrf_L_single.c @@ -0,0 +1,234 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +static FLOAT dm1 = -1.; + +#ifndef COMPLEX +#define TRSM_KERNEL TRSM_KERNEL_RN +#else +#define TRSM_KERNEL TRSM_KERNEL_RR +#undef SYRK_KERNEL_L +#ifdef XDOUBLE +#define SYRK_KERNEL_L xherk_kernel_LN +#elif defined(DOUBLE) +#define SYRK_KERNEL_L zherk_kernel_LN +#else +#define SYRK_KERNEL_L cherk_kernel_LN +#endif +#endif + +#if 0 +#undef GEMM_P +#undef GEMM_Q +#undef GEMM_R + +#define GEMM_P 128 +#define GEMM_Q 128 +#define GEMM_R 4000 +#endif + +#define GEMM_PQ MAX(GEMM_P, GEMM_Q) +#define REAL_GEMM_R (GEMM_R - GEMM_PQ) + +#if 0 +#define SHARED_ARRAY +#define SA aa +#else +#undef SHARED_ARRAY +#define SA sa +#endif + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG n, lda; + FLOAT *a; + + BLASLONG info; + BLASLONG bk, j, blocking; + BLASLONG is, min_i; + BLASLONG js, min_j; + BLASLONG range_N[2]; + + FLOAT *sb2 = (FLOAT *)((((BLASLONG)sb + + GEMM_PQ * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN) + + GEMM_OFFSET_B); + +#ifdef SHARED_ARRAY + FLOAT *aa; +#endif + + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + + if (range_n) { + n = range_n[1] - range_n[0]; + a += range_n[0] * (lda + 1) * COMPSIZE; + } + + if (n <= DTB_ENTRIES / 2) { + info = POTF2_L(args, NULL, range_n, sa, sb, 0); + return info; + } + + blocking = GEMM_Q; + if (n <= 4 * GEMM_Q) blocking = n / 4; + + for (j = 0; j < n; j += blocking) { + bk = n - j; + if (bk > blocking) bk = blocking; + + if (!range_n) { + range_N[0] = j; + range_N[1] = j + bk; + } else { + range_N[0] = range_n[0] + j; + range_N[1] = range_n[0] + j + bk; + } + info = CNAME(args, NULL, range_N, sa, sb, 0); + if (info) return info + j; + + if (n - j - bk > 0) { + + TRSM_OLTCOPY(bk, 
bk, a + (j + j * lda) * COMPSIZE, lda, 0, sb); + + /* First tile */ + min_j = n - j - bk; + if (min_j > REAL_GEMM_R) min_j = REAL_GEMM_R; + + for (is = j + bk; is < n; is += GEMM_P) { + min_i = n - is; + if (min_i > GEMM_P) min_i = GEMM_P; + +#ifdef SHARED_ARRAY + + if (is < j + bk + min_j) { + aa = sb2 + bk * (is - j - bk) * COMPSIZE; + } else { + aa = sa; + } + + GEMM_ITCOPY(bk, min_i, a + (is + j * lda) * COMPSIZE, lda, aa); + + TRSM_KERNEL(min_i, bk, bk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + sb, + a + (is + j * lda) * COMPSIZE, lda, 0); + + SYRK_KERNEL_L(min_i, min_j, bk, dm1, + aa, + sb2, + a + (is + (j + bk) * lda) * COMPSIZE, lda, + is - j - bk); + +#else + + GEMM_ITCOPY(bk, min_i, a + (is + j * lda) * COMPSIZE, lda, sa); + + TRSM_KERNEL(min_i, bk, bk, dm1, +#ifdef COMPLEX + ZERO, +#endif + + sa, + sb, + a + (is + j * lda) * COMPSIZE, lda, 0); + + if (is < j + bk + min_j) { + GEMM_OTCOPY(bk, min_i, a + (is + j * lda) * COMPSIZE, lda, sb2 + bk * (is - j - bk) * COMPSIZE); + } + + SYRK_KERNEL_L(min_i, min_j, bk, dm1, + sa, + sb2, + a + (is + (j + bk) * lda) * COMPSIZE, lda, + is - j - bk); +#endif + } + + for(js = j + bk + min_j; js < n; js += REAL_GEMM_R){ + min_j = n - js; + if (min_j > REAL_GEMM_R) min_j = REAL_GEMM_R; + + GEMM_OTCOPY(bk, min_j, a + (js + j * lda) * COMPSIZE, lda, sb2); + + for (is = js; is < n; is += GEMM_P) { + min_i = n - is; + if (min_i > GEMM_P) min_i = GEMM_P; + +#ifdef SHARED_ARRAY + + if (is + min_i < js + min_j) { + aa = sb2 + bk * (is - js) * COMPSIZE; + } else { + GEMM_ITCOPY(bk, min_i, a + (is + j * lda) * COMPSIZE, lda, sa); + aa = sa; + } + + SYRK_KERNEL_L(min_i, min_j, bk, dm1, + aa, + sb2, + a + (is + js * lda) * COMPSIZE, lda, + is - js); + +#else + + GEMM_ITCOPY(bk, min_i, a + (is + j * lda) * COMPSIZE, lda, sa); + + SYRK_KERNEL_L(min_i, min_j, bk, dm1, + sa, + sb2, + a + (is + js * lda) * COMPSIZE, lda, + - is + js); +#endif + + } + } + + } + + } + + return 0; +} diff --git a/lapack/potrf/potrf_U_parallel.c b/lapack/potrf/potrf_U_parallel.c new file mode 100644 index 0000000000..31da141018 --- /dev/null +++ b/lapack/potrf/potrf_U_parallel.c @@ -0,0 +1,130 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG n, bk, i, blocking, lda; + BLASLONG info; + int mode; + blas_arg_t newarg; + FLOAT *a; + FLOAT alpha[2] = { -ONE, ZERO}; + +#ifndef COMPLEX +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_REAL; +#else + mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif + + if (args -> nthreads == 1) { + info = POTRF_U_SINGLE(args, NULL, NULL, sa, sb, 0); + return info; + } + + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + + if (range_n) n = range_n[1] - range_n[0]; + + if (n <= GEMM_UNROLL_N * 4) { + info = POTRF_U_SINGLE(args, NULL, range_n, sa, sb, 0); + return info; + } + + newarg.lda = lda; + newarg.ldb = lda; + newarg.ldc = lda; + newarg.alpha = alpha; + newarg.beta = NULL; + newarg.nthreads = args -> nthreads; + + blocking = (n / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); + if (blocking > GEMM_Q) blocking = GEMM_Q; + + for (i = 0; i < n; i += blocking) { + bk = n - i; + if (bk > blocking) bk = blocking; + + newarg.m = bk; + newarg.n = bk; + newarg.a = a + (i + i * lda) * COMPSIZE; + + info = CNAME(&newarg, NULL, NULL, sa, sb, 0); + if (info) return info + i; + + if (n - i - bk > 0) { + newarg.m = bk; + newarg.n = n - i - bk; + newarg.a = a + (i + i * lda) * COMPSIZE; + newarg.b = a + (i + (i + bk) * lda) * COMPSIZE; + + gemm_thread_n(mode | BLAS_TRANSA_T, + &newarg, NULL, NULL, (void *)TRSM_LCUN, sa, sb, args -> nthreads); + + newarg.n = n - i - bk; + newarg.k = bk; + newarg.a = a + ( i + (i + bk) * lda) * COMPSIZE; + newarg.c = a + ((i + bk) + (i + bk) * lda) * COMPSIZE; + +#ifndef USE_SIMPLE_THREADED_LEVEL3 + HERK_THREAD_UC(&newarg, NULL, NULL, sa, sb, 0); +#else + syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T, + &newarg, NULL, NULL, (void *)HERK_UC, sa, sb, args -> nthreads); +#endif + } + } + + return 0; +} diff --git a/lapack/potrf/potrf_U_single.c b/lapack/potrf/potrf_U_single.c new file mode 100644 index 0000000000..aa445c5273 --- /dev/null +++ b/lapack/potrf/potrf_U_single.c @@ -0,0 +1,193 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +static FLOAT dm1 = -1.; + +#ifndef COMPLEX +#define TRSM_KERNEL TRSM_KERNEL_LT +#else +#define TRSM_KERNEL TRSM_KERNEL_LC +#undef SYRK_KERNEL_U +#ifdef XDOUBLE +#define SYRK_KERNEL_U xherk_kernel_UC +#elif defined(DOUBLE) +#define SYRK_KERNEL_U zherk_kernel_UC +#else +#define SYRK_KERNEL_U cherk_kernel_UC +#endif +#endif + +#if 0 +#undef GEMM_P +#undef GEMM_Q +#undef GEMM_R + +#define GEMM_P 8 +#define GEMM_Q 20 +#define GEMM_R 64 +#endif + +#define GEMM_PQ MAX(GEMM_P, GEMM_Q) +#define REAL_GEMM_R (GEMM_R - GEMM_PQ) + +#if 0 +#define SHARED_ARRAY +#define SA aa +#else +#undef SHARED_ARRAY +#define SA sa +#endif + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG n, lda; + FLOAT *a; + + BLASLONG info; + BLASLONG bk, blocking; + BLASLONG is, min_i; + BLASLONG jjs, min_jj; + BLASLONG range_N[2]; + BLASLONG j, js, min_j; + +#ifdef SHARED_ARRAY + FLOAT *aa; +#endif + + FLOAT *sb2 = (FLOAT *)((((BLASLONG)sb + + GEMM_PQ * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN) + + GEMM_OFFSET_B); + + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + + if (range_n) { + n = range_n[1] - range_n[0]; + a += range_n[0] * (lda + 1) * COMPSIZE; + } + + if (n <= DTB_ENTRIES / 2) { + info = POTF2_U(args, NULL, range_n, sa, sb, 0); + return info; + } + + blocking = GEMM_Q; + if (n <= 4 * GEMM_Q) blocking = (n + 3) / 4; + + for (j = 0; j < n; j += blocking) { + bk = n - j; + if (bk > blocking) bk = blocking; + + if (!range_n) { + range_N[0] = j; + range_N[1] = j + bk; + } else { + range_N[0] = range_n[0] + j; + range_N[1] = range_n[0] + j + bk; + } + + info = CNAME(args, NULL, range_N, sa, sb, 0); + if (info) return info + j; + + if (n - j - bk > 
0) { + + TRSM_IUNCOPY(bk, bk, a + (j + j * lda) * COMPSIZE, lda, 0, sb); + + for(js = j + bk; js < n; js += REAL_GEMM_R) { + min_j = n - js; + if (min_j > REAL_GEMM_R) min_j = REAL_GEMM_R; + + for(jjs = js; jjs < js + min_j; jjs += GEMM_UNROLL_N){ + min_jj = min_j + js - jjs; + if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; + + GEMM_ONCOPY(bk, min_jj, a + (j + jjs * lda) * COMPSIZE, lda, sb2 + bk * (jjs - js) * COMPSIZE); + + for (is = 0; is < bk; is += GEMM_P) { + min_i = bk - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + TRSM_KERNEL (min_i, min_jj, bk, dm1, +#ifdef COMPLEX + ZERO, +#endif + sb + bk * is * COMPSIZE, + sb2 + bk * (jjs - js) * COMPSIZE, + a + (j + is + jjs * lda) * COMPSIZE, lda, is); + } + } + + for (is = j + bk; is < js + min_j; is += min_i) { + min_i = js + min_j - is; + + if (min_i >= GEMM_P * 2) { + min_i = GEMM_P; + } else + if (min_i > GEMM_P) { + min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); + } + +#ifdef SHARED_ARRAY + if ((is >= js) && (is + min_i <= js + min_j)) { + aa = sb2 + bk * (is - js) * COMPSIZE; + } else { + GEMM_INCOPY(bk, min_i, a + (j + is * lda) * COMPSIZE, lda, sa); + aa = sa; + } +#else + GEMM_INCOPY(bk, min_i, a + (j + is * lda) * COMPSIZE, lda, sa); +#endif + + SYRK_KERNEL_U(min_i, min_j, bk, + dm1, + SA, sb2, + a + (is + js * lda) * COMPSIZE, lda, + is - js); + + } + } + } + + } + + return 0; +} diff --git a/lapack/potrf/potrf_parallel.c b/lapack/potrf/potrf_parallel.c new file mode 100644 index 0000000000..f270c3d9e2 --- /dev/null +++ b/lapack/potrf/potrf_parallel.c @@ -0,0 +1,634 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#ifndef USE_SIMPLE_THREADED_LEVEL3 + +static FLOAT dm1 = -1.; + +#ifndef KERNEL_FUNC +#ifndef LOWER +#define KERNEL_FUNC SYRK_KERNEL_U +#else +#define KERNEL_FUNC SYRK_KERNEL_L +#endif +#endif + +#ifndef LOWER +#ifndef COMPLEX +#define TRSM_KERNEL TRSM_KERNEL_LT +#else +#define TRSM_KERNEL TRSM_KERNEL_LC +#endif +#else +#ifndef COMPLEX +#define TRSM_KERNEL TRSM_KERNEL_RN +#else +#define TRSM_KERNEL TRSM_KERNEL_RR +#endif +#endif + +#ifndef CACHE_LINE_SIZE +#define CACHE_LINE_SIZE 8 +#endif + +#ifndef DIVIDE_RATE +#define DIVIDE_RATE 2 +#endif + +#ifndef SWITCH_RATIO +#define SWITCH_RATIO 2 +#endif + +#ifndef LOWER +#define TRANS +#endif + +#ifndef SYRK_LOCAL +#if !defined(LOWER) && !defined(TRANS) +#define SYRK_LOCAL SYRK_UN +#elif !defined(LOWER) && defined(TRANS) +#define SYRK_LOCAL SYRK_UT +#elif defined(LOWER) && !defined(TRANS) +#define SYRK_LOCAL SYRK_LN +#else +#define SYRK_LOCAL SYRK_LT +#endif +#endif + +typedef struct { + volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; +} job_t; + + +#ifndef KERNEL_OPERATION +#ifndef COMPLEX +#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ + KERNEL_FUNC(M, N, K, ALPHA[0], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y)) +#else +#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ + KERNEL_FUNC(M, N, K, ALPHA[0], ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y)) +#endif +#endif + +#ifndef ICOPY_OPERATION +#ifndef TRANS +#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); +#else +#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); +#endif +#endif + +#ifndef OCOPY_OPERATION +#ifdef TRANS +#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); +#else +#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); +#endif +#endif + +#ifndef S +#define S args -> a +#endif +#ifndef A +#define A args -> b +#endif +#ifndef C +#define C args -> c +#endif +#ifndef LDA +#define LDA args -> lda +#endif +#ifndef N +#define N args -> m +#endif +#ifndef K +#define K args -> k +#endif + +static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ + + FLOAT *buffer[DIVIDE_RATE]; + + BLASLONG k, lda; + BLASLONG m_from, m_to; + + FLOAT *alpha; + FLOAT *a, *c; + job_t *job = (job_t *)args -> common; + BLASLONG xxx, bufferside; + + BLASLONG jjs, min_jj; + BLASLONG is, min_i, div_n; + + BLASLONG i, current; + + k = K; + + a = (FLOAT *)A; + c = (FLOAT *)C; + + lda = LDA; + + alpha = (FLOAT *)args -> alpha; + + m_from = range_n[mypos + 0]; + m_to = range_n[mypos + 1]; + +#if 0 + fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld\n", mypos, m_from, m_to); +#endif + + div_n = ((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); + + buffer[0] = (FLOAT *)((((long)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); + for (i = 1; i < DIVIDE_RATE; i++) { + buffer[i] = buffer[i - 1] + GEMM_Q * div_n * COMPSIZE; + } + +#ifndef LOWER + TRSM_IUNCOPY(k, k, (FLOAT *)S, lda, 0, sb); +#else + TRSM_OLTCOPY(k, k, (FLOAT *)S, lda, 0, sb); +#endif + + for (xxx 
= m_from, bufferside = 0; xxx < m_to; xxx += div_n, bufferside ++) { + + for(jjs = xxx; jjs < MIN(m_to, xxx + div_n); jjs += min_jj){ + + min_jj = MIN(m_to, xxx + div_n) - jjs; + +#ifndef LOWER + if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN; +#else + if (min_jj > GEMM_P) min_jj = GEMM_P; +#endif + +#ifndef LOWER + OCOPY_OPERATION (k, min_jj, a, lda, 0, jjs, buffer[bufferside] + k * (jjs - xxx) * COMPSIZE); + + TRSM_KERNEL (k, min_jj, k, dm1, +#ifdef COMPLEX + ZERO, +#endif + sb, + buffer[bufferside] + k * (jjs - xxx) * COMPSIZE, + a + jjs * lda * COMPSIZE, lda, 0); +#else + ICOPY_OPERATION (k, min_jj, a, lda, 0, jjs, buffer[bufferside] + k * (jjs - xxx) * COMPSIZE); + + TRSM_KERNEL (min_jj, k, k, dm1, +#ifdef COMPLEX + ZERO, +#endif + buffer[bufferside] + k * (jjs - xxx) * COMPSIZE, + sb, + a + jjs * COMPSIZE, lda, 0); +#endif + } + +#ifndef LOWER + for (i = 0; i <= mypos; i++) + job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; +#else + for (i = mypos; i < args -> nthreads; i++) + job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; +#endif + + WMB; + } + + min_i = m_to - m_from; + + if (min_i >= GEMM_P * 2) { + min_i = GEMM_P; + } else + if (min_i > GEMM_P) { + min_i = ((min_i + 1) / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); + } + +#ifndef LOWER + ICOPY_OPERATION(k, min_i, a, lda, 0, m_from, sa); +#else + OCOPY_OPERATION(k, min_i, a, lda, 0, m_from, sa); +#endif + + current = mypos; + +#ifndef LOWER + while (current < args -> nthreads) +#else + while (current >= 0) +#endif + { + div_n = ((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); + + for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { + + /* thread has to wait */ + if (current != mypos) while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; + + KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha, + sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], + c, lda, m_from, xxx); + + if (m_from + min_i >= m_to) { + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; + WMB; + } + } + +#ifndef LOWER + current ++; +#else + current --; +#endif + } + + for(is = m_from + min_i; is < m_to; is += min_i){ + min_i = m_to - is; + + if (min_i >= GEMM_P * 2) { + min_i = GEMM_P; + } else + if (min_i > GEMM_P) { + min_i = ((min_i + 1) / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); + } + +#ifndef LOWER + ICOPY_OPERATION(k, min_i, a, lda, 0, is, sa); +#else + OCOPY_OPERATION(k, min_i, a, lda, 0, is, sa); +#endif + + current = mypos; + +#ifndef LOWER + while (current < args -> nthreads) +#else + while (current >= 0) +#endif + { + div_n = ((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1); + + for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { + + KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha, + sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], + c, lda, is, xxx); + + if (is + min_i >= m_to) { + job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; + WMB; + } + } +#ifndef LOWER + current ++; +#else + current --; +#endif + } + } + + for (i = 0; i < args -> nthreads; i++) { + if (i != mypos) { + for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { + while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) 
{YIELDING;}; + } + } + } + + return 0; + } + +static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){ + + blas_arg_t newarg; + + job_t job[MAX_CPU_NUMBER]; + blas_queue_t queue[MAX_CPU_NUMBER]; + + BLASLONG range[MAX_CPU_NUMBER + 100]; + + BLASLONG num_cpu; + + BLASLONG nthreads = args -> nthreads; + + BLASLONG width, i, j, k; + BLASLONG n, n_from, n_to; + int mode, mask; + double dnum; + +#ifndef COMPLEX +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_REAL; + mask = MAX(QGEMM_UNROLL_M, QGEMM_UNROLL_N) - 1; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_REAL; + mask = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1; +#else + mode = BLAS_SINGLE | BLAS_REAL; + mask = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1; +#endif +#else +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_COMPLEX; + mask = MAX(XGEMM_UNROLL_M, XGEMM_UNROLL_N) - 1; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_COMPLEX; + mask = MAX(ZGEMM_UNROLL_M, ZGEMM_UNROLL_N) - 1; +#else + mode = BLAS_SINGLE | BLAS_COMPLEX; + mask = MAX(CGEMM_UNROLL_M, CGEMM_UNROLL_N) - 1; +#endif +#endif + + newarg.m = args -> m; + newarg.k = args -> k; + newarg.a = args -> a; + newarg.b = args -> b; + newarg.c = args -> c; + newarg.lda = args -> lda; + newarg.alpha = args -> alpha; + newarg.common = (void *)job; + + n_from = 0; + n_to = args -> m; + +#ifndef LOWER + + range[MAX_CPU_NUMBER] = n_to - n_from; + range[0] = 0; + num_cpu = 0; + i = 0; + n = n_to - n_from; + + dnum = (double)n * (double)n /(double)nthreads; + + while (i < n){ + + if (nthreads - num_cpu > 1) { + + double di = (double)i; + + width = (((BLASLONG)(sqrt(di * di + dnum) - di) + mask) & ~mask); + + if (num_cpu == 0) width = n - ((n - width) & ~mask); + + if ((width > n - i) || (width < mask)) width = n - i; + + } else { + width = n - i; + } + + range[MAX_CPU_NUMBER - num_cpu - 1] = range[MAX_CPU_NUMBER - num_cpu] - width; + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = inner_thread; + queue[num_cpu].args = &newarg; + queue[num_cpu].range_m = NULL; + + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + + for (i = 0; i < num_cpu; i ++) queue[i].range_n = &range[MAX_CPU_NUMBER - num_cpu]; + +#else + + range[0] = 0; + num_cpu = 0; + i = 0; + n = n_to - n_from; + + dnum = (double)n * (double)n /(double)nthreads; + + while (i < n){ + + if (nthreads - num_cpu > 1) { + + double di = (double)i; + + width = (((BLASLONG)(sqrt(di * di + dnum) - di) + mask) & ~mask); + + if ((width > n - i) || (width < mask)) width = n - i; + + } else { + width = n - i; + } + + range[num_cpu + 1] = range[num_cpu] + width; + + queue[num_cpu].mode = mode; + queue[num_cpu].routine = inner_thread; + queue[num_cpu].args = &newarg; + queue[num_cpu].range_m = NULL; + queue[num_cpu].range_n = range; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].next = &queue[num_cpu + 1]; + + num_cpu ++; + i += width; + } + +#endif + + newarg.nthreads = num_cpu; + + if (num_cpu) { + + for (j = 0; j < num_cpu; j++) { + for (i = 0; i < num_cpu; i++) { + for (k = 0; k < DIVIDE_RATE; k++) { + job[j].working[i][CACHE_LINE_SIZE * k] = 0; + } + } + } + + queue[0].sa = sa; + queue[0].sb = sb; + queue[num_cpu - 1].next = NULL; + + exec_blas(num_cpu, queue); + } + + return 0; +} + +#endif + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG n, bk, i, blocking, lda; + BLASLONG info; + int mode; + blas_arg_t newarg; + FLOAT *a; + FLOAT alpha[2] = { -ONE, 
ZERO}; + +#ifndef COMPLEX +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_REAL; +#else + mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif + + if (args -> nthreads == 1) { +#ifndef LOWER + info = POTRF_U_SINGLE(args, NULL, NULL, sa, sb, 0); +#else + info = POTRF_L_SINGLE(args, NULL, NULL, sa, sb, 0); +#endif + return info; + } + + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + + if (range_n) n = range_n[1] - range_n[0]; + + if (n <= GEMM_UNROLL_N * 2) { +#ifndef LOWER + info = POTRF_U_SINGLE(args, NULL, range_n, sa, sb, 0); +#else + info = POTRF_L_SINGLE(args, NULL, range_n, sa, sb, 0); +#endif + return info; + } + + newarg.lda = lda; + newarg.ldb = lda; + newarg.ldc = lda; + newarg.alpha = alpha; + newarg.beta = NULL; + newarg.nthreads = args -> nthreads; + + blocking = (n / 2 + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1); + if (blocking > GEMM_Q) blocking = GEMM_Q; + + for (i = 0; i < n; i += blocking) { + bk = n - i; + if (bk > blocking) bk = blocking; + + newarg.m = bk; + newarg.n = bk; + newarg.a = a + (i + i * lda) * COMPSIZE; + + info = CNAME(&newarg, NULL, NULL, sa, sb, 0); + if (info) return info + i; + + if (n - i - bk > 0) { +#ifndef USE_SIMPLE_THREADED_LEVEL3 + newarg.m = n - i - bk; + newarg.k = bk; +#ifndef LOWER + newarg.b = a + ( i + (i + bk) * lda) * COMPSIZE; +#else + newarg.b = a + ((i + bk) + i * lda) * COMPSIZE; +#endif + newarg.c = a + ((i + bk) + (i + bk) * lda) * COMPSIZE; + + thread_driver(&newarg, sa, sb); +#else + +#ifndef LOWER + newarg.m = bk; + newarg.n = n - i - bk; + newarg.a = a + (i + i * lda) * COMPSIZE; + newarg.b = a + (i + (i + bk) * lda) * COMPSIZE; + + gemm_thread_n(mode | BLAS_TRANSA_T, + &newarg, NULL, NULL, (void *)TRSM_LCUN, sa, sb, args -> nthreads); + + newarg.n = n - i - bk; + newarg.k = bk; + newarg.a = a + ( i + (i + bk) * lda) * COMPSIZE; + newarg.c = a + ((i + bk) + (i + bk) * lda) * COMPSIZE; + +#if 0 + HERK_THREAD_UC(&newarg, NULL, NULL, sa, sb, 0); +#else + syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T, + &newarg, NULL, NULL, (void *)HERK_UC, sa, sb, args -> nthreads); +#endif +#else + newarg.m = n - i - bk; + newarg.n = bk; + newarg.a = a + (i + i * lda) * COMPSIZE; + newarg.b = a + (i + bk + i * lda) * COMPSIZE; + + gemm_thread_m(mode | BLAS_RSIDE | BLAS_TRANSA_T | BLAS_UPLO, + &newarg, NULL, NULL, (void *)TRSM_RCLN, sa, sb, args -> nthreads); + + newarg.n = n - i - bk; + newarg.k = bk; + newarg.a = a + (i + bk + i * lda) * COMPSIZE; + newarg.c = a + (i + bk + (i + bk) * lda) * COMPSIZE; + +#if 0 + HERK_THREAD_LN(&newarg, NULL, NULL, sa, sb, 0); +#else + syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T | BLAS_UPLO, + &newarg, NULL, NULL, (void *)HERK_LN, sa, sb, args -> nthreads); +#endif +#endif + +#endif + } + } + return 0; +} diff --git a/lapack/trti2/Makefile b/lapack/trti2/Makefile new file mode 100644 index 0000000000..45251fb1e7 --- /dev/null +++ b/lapack/trti2/Makefile @@ -0,0 +1,155 @@ +TOPDIR = ../.. 
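+# Each <x>trti2_<UL><UN> object below is compiled from trti2_U.c / trti2_L.c
+# (ztrti2_*.c for the complex cases), with -D/-U flags selecting the precision
+# (DOUBLE, XDOUBLE, COMPLEX) and unit vs. non-unit diagonal (UNIT); the
+# $(PSUFFIX) rules are the profiled builds of the same sources, compiled with $(PFLAGS).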
+include ../../Makefile.system + +SBLASOBJS = strti2_UU.$(SUFFIX) strti2_UN.$(SUFFIX) strti2_LU.$(SUFFIX) strti2_LN.$(SUFFIX) +DBLASOBJS = dtrti2_UU.$(SUFFIX) dtrti2_UN.$(SUFFIX) dtrti2_LU.$(SUFFIX) dtrti2_LN.$(SUFFIX) +QBLASOBJS = qtrti2_UU.$(SUFFIX) qtrti2_UN.$(SUFFIX) qtrti2_LU.$(SUFFIX) qtrti2_LN.$(SUFFIX) +CBLASOBJS = ctrti2_UU.$(SUFFIX) ctrti2_UN.$(SUFFIX) ctrti2_LU.$(SUFFIX) ctrti2_LN.$(SUFFIX) +ZBLASOBJS = ztrti2_UU.$(SUFFIX) ztrti2_UN.$(SUFFIX) ztrti2_LU.$(SUFFIX) ztrti2_LN.$(SUFFIX) +XBLASOBJS = xtrti2_UU.$(SUFFIX) xtrti2_UN.$(SUFFIX) xtrti2_LU.$(SUFFIX) xtrti2_LN.$(SUFFIX) + +strti2_UU.$(SUFFIX) : trti2_U.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) + +strti2_UN.$(SUFFIX) : trti2_U.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) + +strti2_LU.$(SUFFIX) : trti2_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) + +strti2_LN.$(SUFFIX) : trti2_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) + +dtrti2_UU.$(SUFFIX) : trti2_U.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) + +dtrti2_UN.$(SUFFIX) : trti2_U.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) + +dtrti2_LU.$(SUFFIX) : trti2_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) + +dtrti2_LN.$(SUFFIX) : trti2_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) + +qtrti2_UU.$(SUFFIX) : trti2_U.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) + +qtrti2_UN.$(SUFFIX) : trti2_U.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) + +qtrti2_LU.$(SUFFIX) : trti2_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) + +qtrti2_LN.$(SUFFIX) : trti2_L.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) + +ctrti2_UU.$(SUFFIX) : ztrti2_U.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) + +ctrti2_UN.$(SUFFIX) : ztrti2_U.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) + +ctrti2_LU.$(SUFFIX) : ztrti2_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) + +ctrti2_LN.$(SUFFIX) : ztrti2_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) + +ztrti2_UU.$(SUFFIX) : ztrti2_U.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) + +ztrti2_UN.$(SUFFIX) : ztrti2_U.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) + +ztrti2_LU.$(SUFFIX) : ztrti2_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) + +ztrti2_LN.$(SUFFIX) : ztrti2_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) + +xtrti2_UU.$(SUFFIX) : ztrti2_U.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) + +xtrti2_UN.$(SUFFIX) : ztrti2_U.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) + +xtrti2_LU.$(SUFFIX) : ztrti2_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) + +xtrti2_LN.$(SUFFIX) : ztrti2_L.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) + +strti2_UU.$(PSUFFIX) : trti2_U.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) + +strti2_UN.$(PSUFFIX) : trti2_U.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) + +strti2_LU.$(PSUFFIX) : trti2_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) + +strti2_LN.$(PSUFFIX) : trti2_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) + +dtrti2_UU.$(PSUFFIX) : trti2_U.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) + +dtrti2_UN.$(PSUFFIX) : trti2_U.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) + +dtrti2_LU.$(PSUFFIX) : trti2_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) + 
+dtrti2_LN.$(PSUFFIX) : trti2_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) + +qtrti2_UU.$(PSUFFIX) : trti2_U.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) + +qtrti2_UN.$(PSUFFIX) : trti2_U.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) + +qtrti2_LU.$(PSUFFIX) : trti2_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) + +qtrti2_LN.$(PSUFFIX) : trti2_L.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) + +ctrti2_UU.$(PSUFFIX) : ztrti2_U.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) + +ctrti2_UN.$(PSUFFIX) : ztrti2_U.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) + +ctrti2_LU.$(PSUFFIX) : ztrti2_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) + +ctrti2_LN.$(PSUFFIX) : ztrti2_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) + +ztrti2_UU.$(PSUFFIX) : ztrti2_U.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) + +ztrti2_UN.$(PSUFFIX) : ztrti2_U.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) + +ztrti2_LU.$(PSUFFIX) : ztrti2_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) + +ztrti2_LN.$(PSUFFIX) : ztrti2_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) + +xtrti2_UU.$(PSUFFIX) : ztrti2_U.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) + +xtrti2_UN.$(PSUFFIX) : ztrti2_U.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) + +xtrti2_LU.$(PSUFFIX) : ztrti2_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) + +xtrti2_LN.$(PSUFFIX) : ztrti2_L.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) + +include ../../Makefile.tail diff --git a/lapack/trti2/trti2_L.c b/lapack/trti2/trti2_L.c new file mode 100644 index 0000000000..47fb53d091 --- /dev/null +++ b/lapack/trti2/trti2_L.c @@ -0,0 +1,86 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifdef UNIT +#define TRMV TRMV_NLU +#else +#define TRMV TRMV_NLN +#endif + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG n, lda; + FLOAT *a; + + FLOAT ajj; + BLASLONG j; + + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + + if (range_n) { + n = range_n[1] - range_n[0]; + a += range_n[0] * (lda + 1) * COMPSIZE; + } + + for (j = n - 1; j >= 0; j--) { + + ajj = ONE; + +#ifndef UNIT + ajj /= *(a + j + j * lda); + *(a + j + j * lda) = ajj; +#endif + + TRMV (n - j - 1, + a + (j + 1) + (j + 1) * lda, lda, + a + (j + 1) + j * lda, + 1, sb); + + SCAL_K(n - j - 1, 0, 0, + -ajj, + a + (j + 1) + j * lda, 1, + NULL, 0, NULL, 0); + } + + return 0; +} diff --git a/lapack/trti2/trti2_U.c b/lapack/trti2/trti2_U.c new file mode 100644 index 0000000000..f43cecdf29 --- /dev/null +++ b/lapack/trti2/trti2_U.c @@ -0,0 +1,87 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#ifdef UNIT +#define TRMV TRMV_NUU +#else +#define TRMV TRMV_NUN +#endif + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG n, lda; + FLOAT *a; + + FLOAT ajj; + BLASLONG j; + + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + + if (range_n) { + n = range_n[1] - range_n[0]; + a += range_n[0] * (lda + 1) * COMPSIZE; + } + + for (j = 0; j < n; j++) { + + ajj = ONE; + +#ifndef UNIT + ajj /= *(a + j + j * lda); + *(a + j + j * lda) = ajj; +#endif + + TRMV (j, + a , lda, + a + j * lda, 1, + sb); + + SCAL_K(j, 0, 0, + -ajj, + a + j * lda, 1, + NULL, 0, NULL, 0); + + } + + return 0; +} diff --git a/lapack/trti2/ztrti2_L.c b/lapack/trti2/ztrti2_L.c new file mode 100644 index 0000000000..fd19be2844 --- /dev/null +++ b/lapack/trti2/ztrti2_L.c @@ -0,0 +1,105 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#ifdef UNIT +#define ZTRMV ZTRMV_NLU +#else +#define ZTRMV ZTRMV_NLN +#endif + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG n, lda; + FLOAT *a; + + FLOAT ajj_r, ajj_i; +#ifndef UNIT + FLOAT ratio, den; +#endif + BLASLONG j; + + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + + if (range_n) { + n = range_n[1] - range_n[0]; + a += range_n[0] * (lda + 1) * COMPSIZE; + } + + for (j = n - 1; j >= 0; j--) { + + ajj_r = ONE; + ajj_i = ZERO; + +#ifndef UNIT + ajj_r = *(a + (j + j * lda) * COMPSIZE + 0); + ajj_i = *(a + (j + j * lda) * COMPSIZE + 1); + + if (fabs(ajj_r) >= fabs(ajj_i)){ + ratio = ajj_i / ajj_r; + den = 1. / (ajj_r * ( 1 + ratio * ratio)); + ajj_r = den; + ajj_i = -ratio * den; + } else { + ratio = ajj_r / ajj_i; + den = 1. /(ajj_i * ( 1 + ratio * ratio)); + ajj_r = ratio * den; + ajj_i = -den; + } + + *(a + (j + j * lda) * COMPSIZE + 0) = ajj_r; + *(a + (j + j * lda) * COMPSIZE + 1) = ajj_i; +#endif + + ZTRMV (n - j - 1, + a + ((j + 1) + (j + 1) * lda) * COMPSIZE, lda, + a + ((j + 1) + j * lda) * COMPSIZE, 1, + sb); + + SCAL_K(n - j - 1, 0, 0, + -ajj_r, -ajj_i, + a + ((j + 1) + j * lda) * COMPSIZE, 1, + NULL, 0, NULL, 0); + } + + return 0; +} diff --git a/lapack/trti2/ztrti2_U.c b/lapack/trti2/ztrti2_U.c new file mode 100644 index 0000000000..d85b327eb4 --- /dev/null +++ b/lapack/trti2/ztrti2_U.c @@ -0,0 +1,107 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#ifdef UNIT +#define ZTRMV ZTRMV_NUU +#else +#define ZTRMV ZTRMV_NUN +#endif + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG n, lda; + FLOAT *a; + + FLOAT ajj_r, ajj_i; +#ifndef UNIT + FLOAT ratio, den; +#endif + BLASLONG j; + + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + + if (range_n) { + n = range_n[1] - range_n[0]; + a += range_n[0] * (lda + 1) * COMPSIZE; + } + + for (j = 0; j < n; j++) { + + ajj_r = ONE; + ajj_i = ZERO; + +#ifndef UNIT + ajj_r = *(a + (j + j * lda) * COMPSIZE + 0); + ajj_i = *(a + (j + j * lda) * COMPSIZE + 1); + + + if (fabs(ajj_r) >= fabs(ajj_i)){ + ratio = ajj_i / ajj_r; + den = 1. / (ajj_r * ( 1 + ratio * ratio)); + ajj_r = den; + ajj_i = -ratio * den; + } else { + ratio = ajj_r / ajj_i; + den = 1. /(ajj_i * ( 1 + ratio * ratio)); + ajj_r = ratio * den; + ajj_i = -den; + } + + *(a + (j + j * lda) * COMPSIZE + 0) = ajj_r; + *(a + (j + j * lda) * COMPSIZE + 1) = ajj_i; +#endif + + ZTRMV (j, + a , lda, + a + j * lda * COMPSIZE, 1, + sb); + + SCAL_K(j, 0, 0, + -ajj_r, -ajj_i, + a + j * lda * COMPSIZE, 1, + NULL, 0, NULL, 0); + + } + + return 0; +} diff --git a/lapack/trtri/Makefile b/lapack/trtri/Makefile new file mode 100644 index 0000000000..722f112b0a --- /dev/null +++ b/lapack/trtri/Makefile @@ -0,0 +1,313 @@ +TOPDIR = ../.. +include ../../Makefile.system + +SBLASOBJS = strtri_UU_single.$(SUFFIX) strtri_UN_single.$(SUFFIX) strtri_LU_single.$(SUFFIX) strtri_LN_single.$(SUFFIX) + +DBLASOBJS = dtrtri_UU_single.$(SUFFIX) dtrtri_UN_single.$(SUFFIX) dtrtri_LU_single.$(SUFFIX) dtrtri_LN_single.$(SUFFIX) + +QBLASOBJS = qtrtri_UU_single.$(SUFFIX) qtrtri_UN_single.$(SUFFIX) qtrtri_LU_single.$(SUFFIX) qtrtri_LN_single.$(SUFFIX) + +CBLASOBJS = ctrtri_UU_single.$(SUFFIX) ctrtri_UN_single.$(SUFFIX) ctrtri_LU_single.$(SUFFIX) ctrtri_LN_single.$(SUFFIX) + +ZBLASOBJS = ztrtri_UU_single.$(SUFFIX) ztrtri_UN_single.$(SUFFIX) ztrtri_LU_single.$(SUFFIX) ztrtri_LN_single.$(SUFFIX) + +XBLASOBJS = xtrtri_UU_single.$(SUFFIX) xtrtri_UN_single.$(SUFFIX) xtrtri_LU_single.$(SUFFIX) xtrtri_LN_single.$(SUFFIX) + +ifdef SMP +SBLASOBJS += strtri_UU_parallel.$(SUFFIX) strtri_UN_parallel.$(SUFFIX) strtri_LU_parallel.$(SUFFIX) strtri_LN_parallel.$(SUFFIX) +DBLASOBJS += dtrtri_UU_parallel.$(SUFFIX) dtrtri_UN_parallel.$(SUFFIX) dtrtri_LU_parallel.$(SUFFIX) dtrtri_LN_parallel.$(SUFFIX) +QBLASOBJS += qtrtri_UU_parallel.$(SUFFIX) qtrtri_UN_parallel.$(SUFFIX) qtrtri_LU_parallel.$(SUFFIX) qtrtri_LN_parallel.$(SUFFIX) +CBLASOBJS += ctrtri_UU_parallel.$(SUFFIX) ctrtri_UN_parallel.$(SUFFIX) ctrtri_LU_parallel.$(SUFFIX) ctrtri_LN_parallel.$(SUFFIX) +ZBLASOBJS += ztrtri_UU_parallel.$(SUFFIX) ztrtri_UN_parallel.$(SUFFIX) ztrtri_LU_parallel.$(SUFFIX) ztrtri_LN_parallel.$(SUFFIX) +XBLASOBJS += xtrtri_UU_parallel.$(SUFFIX) xtrtri_UN_parallel.$(SUFFIX) xtrtri_LU_parallel.$(SUFFIX) xtrtri_LN_parallel.$(SUFFIX) +endif + +strtri_UU_single.$(SUFFIX) : trtri_U_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) + +strtri_UN_single.$(SUFFIX) : trtri_U_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) + +strtri_LU_single.$(SUFFIX) : trtri_L_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) + +strtri_LN_single.$(SUFFIX) : trtri_L_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) + +strtri_UU_parallel.$(SUFFIX) : trtri_U_parallel.c + $(CC) -c 
$(CFLAGS) -UCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) + +strtri_UN_parallel.$(SUFFIX) : trtri_U_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) + +strtri_LU_parallel.$(SUFFIX) : trtri_L_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) + +strtri_LN_parallel.$(SUFFIX) : trtri_L_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) + +dtrtri_UU_single.$(SUFFIX) : trtri_U_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) + +dtrtri_UN_single.$(SUFFIX) : trtri_U_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) + +dtrtri_LU_single.$(SUFFIX) : trtri_L_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) + +dtrtri_LN_single.$(SUFFIX) : trtri_L_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) + +dtrtri_UU_parallel.$(SUFFIX) : trtri_U_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) + +dtrtri_UN_parallel.$(SUFFIX) : trtri_U_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) + +dtrtri_LU_parallel.$(SUFFIX) : trtri_L_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) + +dtrtri_LN_parallel.$(SUFFIX) : trtri_L_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) + +qtrtri_UU_single.$(SUFFIX) : trtri_U_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) + +qtrtri_UN_single.$(SUFFIX) : trtri_U_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) + +qtrtri_LU_single.$(SUFFIX) : trtri_L_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) + +qtrtri_LN_single.$(SUFFIX) : trtri_L_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) + +qtrtri_UU_parallel.$(SUFFIX) : trtri_U_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) + +qtrtri_UN_parallel.$(SUFFIX) : trtri_U_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) + +qtrtri_LU_parallel.$(SUFFIX) : trtri_L_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) + +qtrtri_LN_parallel.$(SUFFIX) : trtri_L_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) + +ctrtri_UU_single.$(SUFFIX) : trtri_U_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) + +ctrtri_UN_single.$(SUFFIX) : trtri_U_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) + +ctrtri_LU_single.$(SUFFIX) : trtri_L_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) + +ctrtri_LN_single.$(SUFFIX) : trtri_L_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) + +ctrtri_UU_parallel.$(SUFFIX) : trtri_U_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) + +ctrtri_UN_parallel.$(SUFFIX) : trtri_U_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) + +ctrtri_LU_parallel.$(SUFFIX) : trtri_L_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) + +ctrtri_LN_parallel.$(SUFFIX) : trtri_L_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) + +ztrtri_UU_single.$(SUFFIX) : trtri_U_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) + +ztrtri_UN_single.$(SUFFIX) : trtri_U_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) + +ztrtri_LU_single.$(SUFFIX) : trtri_L_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) + +ztrtri_LN_single.$(SUFFIX) : trtri_L_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) + +ztrtri_UU_parallel.$(SUFFIX) : trtri_U_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX 
-DDOUBLE -DUNIT $< -o $(@F) + +ztrtri_UN_parallel.$(SUFFIX) : trtri_U_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) + +ztrtri_LU_parallel.$(SUFFIX) : trtri_L_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) + +ztrtri_LN_parallel.$(SUFFIX) : trtri_L_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) + +xtrtri_UU_single.$(SUFFIX) : trtri_U_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) + +xtrtri_UN_single.$(SUFFIX) : trtri_U_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) + +xtrtri_LU_single.$(SUFFIX) : trtri_L_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) + +xtrtri_LN_single.$(SUFFIX) : trtri_L_single.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) + +xtrtri_UU_parallel.$(SUFFIX) : trtri_U_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) + +xtrtri_UN_parallel.$(SUFFIX) : trtri_U_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) + +xtrtri_LU_parallel.$(SUFFIX) : trtri_L_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) + +xtrtri_LN_parallel.$(SUFFIX) : trtri_L_parallel.c + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) + +strtri_UU_single.$(PSUFFIX) : trtri_U_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) + +strtri_UN_single.$(PSUFFIX) : trtri_U_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) + +strtri_LU_single.$(PSUFFIX) : trtri_L_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) + +strtri_LN_single.$(PSUFFIX) : trtri_L_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) + +strtri_UU_parallel.$(PSUFFIX) : trtri_U_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) + +strtri_UN_parallel.$(PSUFFIX) : trtri_U_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) + +strtri_LU_parallel.$(PSUFFIX) : trtri_L_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) + +strtri_LN_parallel.$(PSUFFIX) : trtri_L_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) + +dtrtri_UU_single.$(PSUFFIX) : trtri_U_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) + +dtrtri_UN_single.$(PSUFFIX) : trtri_U_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) + +dtrtri_LU_single.$(PSUFFIX) : trtri_L_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) + +dtrtri_LN_single.$(PSUFFIX) : trtri_L_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) + +dtrtri_UU_parallel.$(PSUFFIX) : trtri_U_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) + +dtrtri_UN_parallel.$(PSUFFIX) : trtri_U_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) + +dtrtri_LU_parallel.$(PSUFFIX) : trtri_L_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) + +dtrtri_LN_parallel.$(PSUFFIX) : trtri_L_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) + +qtrtri_UU_single.$(PSUFFIX) : trtri_U_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) + +qtrtri_UN_single.$(PSUFFIX) : trtri_U_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) + +qtrtri_LU_single.$(PSUFFIX) : trtri_L_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) + +qtrtri_LN_single.$(PSUFFIX) : trtri_L_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) + +qtrtri_UU_parallel.$(PSUFFIX) : trtri_U_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX 
-DXDOUBLE -DUNIT $< -o $(@F) + +qtrtri_UN_parallel.$(PSUFFIX) : trtri_U_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) + +qtrtri_LU_parallel.$(PSUFFIX) : trtri_L_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) + +qtrtri_LN_parallel.$(PSUFFIX) : trtri_L_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) + +ctrtri_UU_single.$(PSUFFIX) : trtri_U_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) + +ctrtri_UN_single.$(PSUFFIX) : trtri_U_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) + +ctrtri_LU_single.$(PSUFFIX) : trtri_L_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) + +ctrtri_LN_single.$(PSUFFIX) : trtri_L_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) + +ctrtri_UU_parallel.$(PSUFFIX) : trtri_U_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) + +ctrtri_UN_parallel.$(PSUFFIX) : trtri_U_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) + +ctrtri_LU_parallel.$(PSUFFIX) : trtri_L_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) + +ctrtri_LN_parallel.$(PSUFFIX) : trtri_L_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -UDOUBLE -UUNIT $< -o $(@F) + +ztrtri_UU_single.$(PSUFFIX) : trtri_U_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) + +ztrtri_UN_single.$(PSUFFIX) : trtri_U_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) + +ztrtri_LU_single.$(PSUFFIX) : trtri_L_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) + +ztrtri_LN_single.$(PSUFFIX) : trtri_L_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) + +ztrtri_UU_parallel.$(PSUFFIX) : trtri_U_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) + +ztrtri_UN_parallel.$(PSUFFIX) : trtri_U_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) + +ztrtri_LU_parallel.$(PSUFFIX) : trtri_L_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -DUNIT $< -o $(@F) + +ztrtri_LN_parallel.$(PSUFFIX) : trtri_L_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DDOUBLE -UUNIT $< -o $(@F) + +xtrtri_UU_single.$(PSUFFIX) : trtri_U_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) + +xtrtri_UN_single.$(PSUFFIX) : trtri_U_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) + +xtrtri_LU_single.$(PSUFFIX) : trtri_L_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) + +xtrtri_LN_single.$(PSUFFIX) : trtri_L_single.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) + +xtrtri_UU_parallel.$(PSUFFIX) : trtri_U_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) + +xtrtri_UN_parallel.$(PSUFFIX) : trtri_U_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) + +xtrtri_LU_parallel.$(PSUFFIX) : trtri_L_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -DUNIT $< -o $(@F) + +xtrtri_LN_parallel.$(PSUFFIX) : trtri_L_parallel.c + $(CC) -c $(PFLAGS) -DCOMPLEX -DXDOUBLE -UUNIT $< -o $(@F) + +include ../../Makefile.tail diff --git a/lapack/trtri/trtri_L_parallel.c b/lapack/trtri/trtri_L_parallel.c new file mode 100644 index 0000000000..5969eb671a --- /dev/null +++ b/lapack/trtri/trtri_L_parallel.c @@ -0,0 +1,151 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +#ifdef UNIT +#define TRTI2 TRTI2_LU +#define TRMM TRMM_LNLU +#define TRSM TRSM_RNLU +#else +#define TRTI2 TRTI2_LN +#define TRMM TRMM_LNLN +#define TRSM TRSM_RNLN +#endif + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos) { + + BLASLONG n, info; + BLASLONG bk, i, blocking, start_i; + int mode; + BLASLONG lda, range_N[2]; + blas_arg_t newarg; + FLOAT *a; + FLOAT alpha[2] = { ONE, ZERO}; + FLOAT beta [2] = {-ONE, ZERO}; + +#ifndef COMPLEX +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_REAL; +#else + mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif + + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + + if (range_n) n = range_n[1] - range_n[0]; + + if (n <= DTB_ENTRIES) { + info = TRTI2(args, NULL, range_n, sa, sb, 0); + return info; + } + + blocking = GEMM_Q; + if (n < 4 * GEMM_Q) blocking = (n + 3) / 4; + + start_i = 0; + while (start_i < n) start_i += blocking; + start_i -= blocking; + + for (i = start_i; i >= 0; i -= blocking) { + bk = n - i; + if (bk > blocking) bk = blocking; + + range_N[0] = i; + range_N[1] = i + bk; + + newarg.lda = lda; + newarg.ldb = lda; + newarg.ldc = lda; + newarg.alpha = alpha; + + newarg.m = n - bk - i; + newarg.n = bk; + newarg.a = a + ( i + i * lda) * COMPSIZE; + newarg.b = a + ((i + bk) + i * lda) * COMPSIZE; + + newarg.beta = beta; + newarg.nthreads = args -> nthreads; + + gemm_thread_m(mode, &newarg, NULL, NULL, TRSM, sa, sb, args -> nthreads); + + newarg.m = bk; + newarg.n = bk; + + 
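/* Recursively invert the bk x bk diagonal block of the current panel. */ +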
newarg.a = a + (i + i * lda) * COMPSIZE; + + CNAME (&newarg, NULL, NULL, sa, sb, 0); + + newarg.m = n - bk - i; + newarg.n = i; + newarg.k = bk; + + newarg.a = a + (i + bk + i * lda) * COMPSIZE; + newarg.b = a + (i ) * COMPSIZE; + newarg.c = a + (i + bk ) * COMPSIZE; + + newarg.beta = NULL; + + gemm_thread_n(mode, &newarg, NULL, NULL, GEMM_NN, sa, sb, args -> nthreads); + + newarg.a = a + (i + i * lda) * COMPSIZE; + newarg.b = a + (i ) * COMPSIZE; + + newarg.m = bk; + newarg.n = i; + + gemm_thread_n(mode, &newarg, NULL, NULL, TRMM, sa, sb, args -> nthreads); + } + + + return 0; +} diff --git a/lapack/trtri/trtri_L_single.c b/lapack/trtri/trtri_L_single.c new file mode 100644 index 0000000000..a940ce2f60 --- /dev/null +++ b/lapack/trtri/trtri_L_single.c @@ -0,0 +1,190 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +static FLOAT dp1 = 1.; +static FLOAT dm1 = -1.; + +#ifdef UNIT +#define TRTI2 TRTI2_LU +#else +#define TRTI2 TRTI2_LN +#endif + +#if 0 +#undef GEMM_P +#undef GEMM_Q +#undef GEMM_R + +#define GEMM_P 8 +#define GEMM_Q 20 +#define GEMM_R 64 +#endif + +#define GEMM_PQ MAX(GEMM_P, GEMM_Q) +#define REAL_GEMM_R (GEMM_R - 2 * GEMM_PQ) + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG n, lda; + FLOAT *a; + + BLASLONG i, is, min_i, start_i; + BLASLONG ls, min_l; + BLASLONG bk; + BLASLONG blocking; + BLASLONG range_N[2]; + + FLOAT *sa_trsm = (FLOAT *)((BLASLONG)sb); + FLOAT *sa_trmm = (FLOAT *)((((BLASLONG)sb + + GEMM_PQ * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN) + + GEMM_OFFSET_A); + FLOAT *sb_gemm = (FLOAT *)((((BLASLONG)sa_trmm + + GEMM_PQ * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN) + + GEMM_OFFSET_B); + + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + + if (range_n) { + n = range_n[1] - range_n[0]; + a += range_n[0] * (lda + 1) * COMPSIZE; + } + + if (n <= DTB_ENTRIES) { + TRTI2(args, NULL, range_n, sa, sb, 0); + return 0; + } + + blocking = GEMM_Q; + if (n <= 4 * GEMM_Q) blocking = (n + 3) / 4; + + start_i = 0; + while (start_i < n) start_i += blocking; + start_i -= blocking; + + for (i = start_i; i >= 0; i -= blocking) { + bk = MIN(blocking, n - i); + + if (n - bk - i > 0) TRSM_OLNCOPY(bk, bk, a + (i + i * lda) * COMPSIZE, lda, 0, sa_trsm); + + if (!range_n) { + range_N[0] = i; + range_N[1] = i + bk; + } else { + range_N[0] = range_n[0] + i; + range_N[1] = range_n[0] + i + bk; + } + + CNAME(args, NULL, range_N, sa, sa_trmm, 0); + + if (i > 0) { + TRMM_ILTCOPY(bk, bk, a + (i + i * lda) * COMPSIZE, lda, 0, 0, sa_trmm); + + for (ls = 0; ls < i; ls += REAL_GEMM_R) { + min_l = i - ls; + if (min_l > REAL_GEMM_R) min_l = REAL_GEMM_R; + + GEMM_ONCOPY (bk, min_l, a + (i + ls * lda) * COMPSIZE, lda, sb_gemm); + + if (n - bk - i > 0) { + for (is = i + bk; is < n; is += GEMM_P) { + min_i = n - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + if (ls == 0) { + NEG_TCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa); + + TRSM_KERNEL_RT(min_i, bk, bk, dm1, +#ifdef COMPLEX + ZERO, +#endif + sa, sa_trsm, + a + (is + i * lda) * COMPSIZE, lda, 0); + } else { + GEMM_ITCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa); + } + + GEMM_KERNEL_N(min_i, min_l, bk, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa, sb_gemm, + a + (is + ls * lda) * COMPSIZE, lda); + } + } + + for (is = 0; is < bk; is += GEMM_P) { + min_i = bk - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + TRMM_KERNEL_LT(min_i, min_l, bk, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa_trmm + is * bk * COMPSIZE, sb_gemm, + a + (i + is + ls * lda) * COMPSIZE, lda, is); + } + } + + } else { + + if (n - bk - i > 0) { + for (is = 0; is < n - bk - i; is += GEMM_P) { + min_i = n - bk - i - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + NEG_TCOPY (bk, min_i, a + (i + bk + is + i * lda) * COMPSIZE, lda, sa); + + TRSM_KERNEL_RT(min_i, bk, bk, dm1, +#ifdef COMPLEX + ZERO, +#endif + sa, sa_trsm, + a + (i + bk + is + i * lda) * COMPSIZE, lda, 0); + } + } + + } + } + + return 0; +} diff --git a/lapack/trtri/trtri_U_parallel.c b/lapack/trtri/trtri_U_parallel.c new file mode 100644 index 0000000000..8761a40c23 --- /dev/null +++ b/lapack/trtri/trtri_U_parallel.c @@ -0,0 +1,147 @@ +/*********************************************************************/ 
+/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" + +#ifdef UNIT +#define TRTI2 TRTI2_UU +#define TRMM TRMM_LNUU +#define TRSM TRSM_RNUU +#else +#define TRTI2 TRTI2_UN +#define TRMM TRMM_LNUN +#define TRSM TRSM_RNUN +#endif + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos) { + + BLASLONG n, info; + BLASLONG bk, i, blocking; + int mode; + BLASLONG lda, range_N[2]; + blas_arg_t newarg; + FLOAT *a; + FLOAT alpha[2] = { ONE, ZERO}; + FLOAT beta [2] = {-ONE, ZERO}; + +#ifndef COMPLEX +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_REAL; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_REAL; +#else + mode = BLAS_SINGLE | BLAS_REAL; +#endif +#else +#ifdef XDOUBLE + mode = BLAS_XDOUBLE | BLAS_COMPLEX; +#elif defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_COMPLEX; +#else + mode = BLAS_SINGLE | BLAS_COMPLEX; +#endif +#endif + + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + + if (range_n) n = range_n[1] - range_n[0]; + + if (n <= DTB_ENTRIES) { + info = TRTI2(args, NULL, range_n, sa, sb, 0); + return info; + } + + blocking = GEMM_Q; + if (n < 4 * GEMM_Q) blocking = (n + 3) / 4; + + for (i = 0; i < n; i += blocking) { + bk = n - i; + if (bk > blocking) bk = blocking; + + range_N[0] = i; + range_N[1] = i + bk; + + newarg.lda = lda; + newarg.ldb = lda; + newarg.ldc = lda; + newarg.alpha = alpha; + + newarg.m = i; + newarg.n = bk; + newarg.a = a + (i + i * lda) * COMPSIZE; + newarg.b = a + ( i * lda) * COMPSIZE; + + newarg.beta = beta; + newarg.nthreads = args -> nthreads; + + gemm_thread_m(mode, &newarg, NULL, NULL, TRSM, sa, sb, args -> nthreads); + + newarg.m = bk; + newarg.n = bk; + + newarg.a = a + (i + i * lda) * COMPSIZE; + + CNAME (&newarg, NULL, NULL, sa, sb, 0); + + newarg.m = i; + newarg.n = n - i - bk; + newarg.k = bk; + + newarg.a = a + ( i * lda) * COMPSIZE; + newarg.b = a + (i + (i + bk) * lda) * COMPSIZE; + newarg.c = a + ( (i + bk) * lda) * COMPSIZE; + + newarg.beta = NULL; + + gemm_thread_n(mode, &newarg, NULL, NULL, GEMM_NN, sa, sb, args -> nthreads); + + newarg.a = a + (i + i * lda) * COMPSIZE; + newarg.b = a + (i + (i + bk) * lda) * COMPSIZE; + + newarg.m = bk; + newarg.n = n - i - bk; + + gemm_thread_n(mode, &newarg, NULL, NULL, TRMM, sa, sb, args -> nthreads); + + } + + return 0; +} diff --git a/lapack/trtri/trtri_U_single.c b/lapack/trtri/trtri_U_single.c new file mode 100644 index 0000000000..72133d896f --- /dev/null +++ b/lapack/trtri/trtri_U_single.c @@ -0,0 +1,188 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" + +static FLOAT dp1 = 1.; +static FLOAT dm1 = -1.; + +#ifdef UNIT +#define TRTI2 TRTI2_UU +#else +#define TRTI2 TRTI2_UN +#endif + +#if 0 +#undef GEMM_P +#undef GEMM_Q +#undef GEMM_R + +#define GEMM_P 8 +#define GEMM_Q 20 +#define GEMM_R 64 +#endif + +#define GEMM_PQ MAX(GEMM_P, GEMM_Q) +#define REAL_GEMM_R (GEMM_R - 2 * GEMM_PQ) + +blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { + + BLASLONG n, lda; + FLOAT *a; + + BLASLONG i, is, min_i, start_is; + BLASLONG ls, min_l; + BLASLONG bk; + BLASLONG blocking; + BLASLONG range_N[2]; + + FLOAT *sa_trsm = (FLOAT *)((BLASLONG)sb); + FLOAT *sa_trmm = (FLOAT *)((((BLASLONG)sb + + GEMM_PQ * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN) + + GEMM_OFFSET_A); + FLOAT *sb_gemm = (FLOAT *)((((BLASLONG)sa_trmm + + GEMM_PQ * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN) + + GEMM_OFFSET_B); + + n = args -> n; + a = (FLOAT *)args -> a; + lda = args -> lda; + + if (range_n) { + n = range_n[1] - range_n[0]; + a += range_n[0] * (lda + 1) * COMPSIZE; + } + + if (n <= DTB_ENTRIES) { + TRTI2(args, NULL, range_n, sa, sb, 0); + return 0; + } + + blocking = GEMM_Q; + if (n <= 4 * GEMM_Q) blocking = (n + 3) / 4; + + for (i = 0; i < n; i += blocking) { + bk = MIN(blocking, n - i); + + if (i > 0) TRSM_OUNCOPY(bk, bk, a + (i + i * lda) * COMPSIZE, lda, 0, sa_trsm); + + if (!range_n) { + range_N[0] = i; + range_N[1] = i + bk; + } else { + range_N[0] = range_n[0] + i; + range_N[1] = range_n[0] + i + bk; + } + + CNAME(args, NULL, range_N, sa, sa_trmm, 0); + + if (n -bk - i > 0) { + TRMM_IUTCOPY(bk, bk, a + (i + i * lda) * COMPSIZE, lda, 0, 0, sa_trmm); + + for (ls = i + bk; ls < n; ls += REAL_GEMM_R) { + min_l = n - ls; + if (min_l > REAL_GEMM_R) min_l = REAL_GEMM_R; + + GEMM_ONCOPY (bk, min_l, a + (i + ls * lda) * COMPSIZE, lda, sb_gemm); + + if (i > 0) { + for (is = 0; is < i; is += GEMM_P) { + min_i = i - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + if (ls == i + bk) { + NEG_TCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa); + + TRSM_KERNEL_RN(min_i, bk, bk, dm1, +#ifdef COMPLEX + ZERO, +#endif + sa, sa_trsm, + a + (is + i * lda) * COMPSIZE, lda, 0); + } else { + GEMM_ITCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa); + } + + GEMM_KERNEL_N(min_i, min_l, bk, dp1, 
+#ifdef COMPLEX + ZERO, +#endif + sa, sb_gemm, + a + (is + ls * lda) * COMPSIZE, lda); + } + } + + start_is = 0; + while (start_is < bk) start_is += GEMM_P; + start_is -= GEMM_P; + + for (is = 0; is < bk; is += GEMM_P) { + min_i = bk - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + TRMM_KERNEL_LN(min_i, min_l, bk, dp1, +#ifdef COMPLEX + ZERO, +#endif + sa_trmm + is * bk * COMPSIZE, sb_gemm, + a + (i + is + ls * lda) * COMPSIZE, lda, is); + } + } + + } else { + if (i > 0) { + for (is = 0; is < i; is += GEMM_P) { + min_i = i - is; + if (min_i > GEMM_P) min_i = GEMM_P; + + NEG_TCOPY (bk, min_i, a + (is + i * lda) * COMPSIZE, lda, sa); + + TRSM_KERNEL_RN(min_i, bk, bk, dm1, +#ifdef COMPLEX + ZERO, +#endif + sa, sa_trsm, + a + (is + i * lda) * COMPSIZE, lda, 0); + } + } + } + } + + return 0; +} diff --git a/make.inc b/make.inc new file mode 100644 index 0000000000..30004233f8 --- /dev/null +++ b/make.inc @@ -0,0 +1,11 @@ +SHELL = /bin/sh +PLAT = _LINUX +DRVOPTS = $(OPTS) +LOADER = $(FORTRAN) +TIMER = NONE +ARCHFLAGS= -ru +RANLIB = ranlib +BLASLIB = +TMGLIB = tmglib.a +EIGSRCLIB = eigsrc.a +LINSRCLIB = linsrc.a diff --git a/param.h b/param.h new file mode 100644 index 0000000000..d8dbcfadf3 --- /dev/null +++ b/param.h @@ -0,0 +1,1543 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#ifndef PARAM_H +#define PARAM_H + +#ifdef OPTERON + +#define SNUMOPT 4 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 64 +#define GEMM_DEFAULT_OFFSET_B 256 +#define GEMM_DEFAULT_ALIGN 0x01ffffUL + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#ifdef ARCH_X86 +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 +#else +#define SGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define XGEMM_DEFAULT_UNROLL_M 1 +#endif + +#define SGEMM_DEFAULT_P sgemm_p +#define DGEMM_DEFAULT_P dgemm_p +#define QGEMM_DEFAULT_P qgemm_p +#define CGEMM_DEFAULT_P cgemm_p +#define ZGEMM_DEFAULT_P zgemm_p +#define XGEMM_DEFAULT_P xgemm_p + +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_R dgemm_r +#define QGEMM_DEFAULT_R qgemm_r +#define CGEMM_DEFAULT_R cgemm_r +#define ZGEMM_DEFAULT_R zgemm_r +#define XGEMM_DEFAULT_R xgemm_r + +#ifdef ALLOC_HUGETLB + +#define SGEMM_DEFAULT_Q 248 +#define DGEMM_DEFAULT_Q 248 +#define QGEMM_DEFAULT_Q 248 +#define CGEMM_DEFAULT_Q 248 +#define ZGEMM_DEFAULT_Q 248 +#define XGEMM_DEFAULT_Q 248 + +#else + +#define SGEMM_DEFAULT_Q 240 +#define DGEMM_DEFAULT_Q 240 +#define QGEMM_DEFAULT_Q 240 +#define CGEMM_DEFAULT_Q 240 +#define ZGEMM_DEFAULT_Q 240 +#define XGEMM_DEFAULT_Q 240 + +#endif + + +#define SYMV_P 16 +#define HAVE_EXCLUSIVE_CACHE + +#endif + +#if defined(BARCELONA) || defined(SHANGHAI) + +#define SNUMOPT 8 +#define DNUMOPT 4 + +#define GEMM_DEFAULT_OFFSET_A 64 +#define GEMM_DEFAULT_OFFSET_B 832 +#define GEMM_DEFAULT_ALIGN 0x0fffUL + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#ifdef ARCH_X86 +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 +#else +#define SGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define XGEMM_DEFAULT_UNROLL_M 1 +#endif + +#if 0 +#define SGEMM_DEFAULT_P 496 +#define DGEMM_DEFAULT_P 248 +#define QGEMM_DEFAULT_P 124 +#define CGEMM_DEFAULT_P 248 +#define ZGEMM_DEFAULT_P 124 +#define XGEMM_DEFAULT_P 62 + +#define SGEMM_DEFAULT_Q 248 +#define DGEMM_DEFAULT_Q 248 +#define QGEMM_DEFAULT_Q 248 +#define CGEMM_DEFAULT_Q 248 +#define ZGEMM_DEFAULT_Q 248 +#define XGEMM_DEFAULT_Q 248 + +#else + +#define SGEMM_DEFAULT_P 448 +#define DGEMM_DEFAULT_P 224 +#define QGEMM_DEFAULT_P 112 +#define CGEMM_DEFAULT_P 224 +#define ZGEMM_DEFAULT_P 112 +#define XGEMM_DEFAULT_P 56 + +#define SGEMM_DEFAULT_Q 224 +#define DGEMM_DEFAULT_Q 224 +#define QGEMM_DEFAULT_Q 224 +#define CGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 224 +#define XGEMM_DEFAULT_Q 224 + +#endif + +#define SGEMM_DEFAULT_R sgemm_r +#define QGEMM_DEFAULT_R qgemm_r +#define DGEMM_DEFAULT_R dgemm_r +#define CGEMM_DEFAULT_R cgemm_r +#define ZGEMM_DEFAULT_R zgemm_r +#define XGEMM_DEFAULT_R 
xgemm_r + +#define SYMV_P 16 +#define HAVE_EXCLUSIVE_CACHE + +#define GEMM_THREAD gemm_thread_mn + +#endif + +#ifdef ATHLON + +#define SNUMOPT 4 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 384 +#define GEMM_DEFAULT_ALIGN 0x0ffffUL + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#define SGEMM_DEFAULT_UNROLL_M 2 +#define DGEMM_DEFAULT_UNROLL_M 1 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 1 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_R dgemm_r +#define QGEMM_DEFAULT_R qgemm_r +#define CGEMM_DEFAULT_R cgemm_r +#define ZGEMM_DEFAULT_R zgemm_r +#define XGEMM_DEFAULT_R xgemm_r + +#define SGEMM_DEFAULT_P 208 +#define DGEMM_DEFAULT_P 104 +#define QGEMM_DEFAULT_P 56 +#define CGEMM_DEFAULT_P 104 +#define ZGEMM_DEFAULT_P 56 +#define XGEMM_DEFAULT_P 28 + +#define SGEMM_DEFAULT_Q 208 +#define DGEMM_DEFAULT_Q 208 +#define QGEMM_DEFAULT_Q 208 +#define CGEMM_DEFAULT_Q 208 +#define ZGEMM_DEFAULT_Q 208 +#define XGEMM_DEFAULT_Q 208 + +#define SYMV_P 16 +#define HAVE_EXCLUSIVE_CACHE +#endif + +#ifdef VIAC3 + +#define SNUMOPT 2 +#define DNUMOPT 1 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 256 +#define GEMM_DEFAULT_ALIGN 0x0ffffUL + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#define SGEMM_DEFAULT_UNROLL_M 2 +#define DGEMM_DEFAULT_UNROLL_M 1 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 1 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_R dgemm_r +#define QGEMM_DEFAULT_R qgemm_r +#define CGEMM_DEFAULT_R cgemm_r +#define ZGEMM_DEFAULT_R zgemm_r +#define XGEMM_DEFAULT_R xgemm_r + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 128 +#define QGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_P 128 +#define XGEMM_DEFAULT_P 128 + +#define SGEMM_DEFAULT_Q 512 +#define DGEMM_DEFAULT_Q 256 +#define QGEMM_DEFAULT_Q 256 +#define CGEMM_DEFAULT_Q 256 +#define ZGEMM_DEFAULT_Q 128 +#define XGEMM_DEFAULT_Q 128 + +#define SYMV_P 16 +#endif + +#ifdef NANO + +#define SNUMOPT 4 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 64 +#define GEMM_DEFAULT_OFFSET_B 256 +#define GEMM_DEFAULT_ALIGN 0x01ffffUL + +#ifdef ARCH_X86 +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 +#else +#define SGEMM_DEFAULT_UNROLL_N 8 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define XGEMM_DEFAULT_UNROLL_M 1 +#endif + +#define SGEMM_DEFAULT_P 288 +#define DGEMM_DEFAULT_P 288 +#define QGEMM_DEFAULT_P 
288 +#define CGEMM_DEFAULT_P 288 +#define ZGEMM_DEFAULT_P 288 +#define XGEMM_DEFAULT_P 288 + +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_R dgemm_r +#define QGEMM_DEFAULT_R qgemm_r +#define CGEMM_DEFAULT_R cgemm_r +#define ZGEMM_DEFAULT_R zgemm_r +#define XGEMM_DEFAULT_R xgemm_r + +#define SGEMM_DEFAULT_Q 256 +#define DGEMM_DEFAULT_Q 128 +#define QGEMM_DEFAULT_Q 64 +#define CGEMM_DEFAULT_Q 128 +#define ZGEMM_DEFAULT_Q 64 +#define XGEMM_DEFAULT_Q 32 + +#define SYMV_P 16 +#define HAVE_EXCLUSIVE_CACHE + +#endif + +#if defined(PENTIUM) || defined(PENTIUM2) || defined(PENTIUM3) + +#ifdef HAVE_SSE +#define SNUMOPT 2 +#else +#define SNUMOPT 1 +#endif +#define DNUMOPT 1 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x0ffffUL + +#ifdef HAVE_SSE +#define SGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_M 4 +#else +#define SGEMM_DEFAULT_UNROLL_M 4 +#define CGEMM_DEFAULT_UNROLL_M 2 +#endif +#define DGEMM_DEFAULT_UNROLL_M 2 +#define SGEMM_DEFAULT_UNROLL_N 2 +#define DGEMM_DEFAULT_UNROLL_N 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 1 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define ZGEMM_DEFAULT_UNROLL_N 1 +#define XGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#define SGEMM_DEFAULT_P sgemm_p +#define SGEMM_DEFAULT_Q 256 +#define SGEMM_DEFAULT_R sgemm_r + +#define DGEMM_DEFAULT_P dgemm_p +#define DGEMM_DEFAULT_Q 256 +#define DGEMM_DEFAULT_R dgemm_r + +#define QGEMM_DEFAULT_P qgemm_p +#define QGEMM_DEFAULT_Q 256 +#define QGEMM_DEFAULT_R qgemm_r + +#define CGEMM_DEFAULT_P cgemm_p +#define CGEMM_DEFAULT_Q 256 +#define CGEMM_DEFAULT_R cgemm_r + +#define ZGEMM_DEFAULT_P zgemm_p +#define ZGEMM_DEFAULT_Q 256 +#define ZGEMM_DEFAULT_R zgemm_r + +#define XGEMM_DEFAULT_P xgemm_p +#define XGEMM_DEFAULT_Q 256 +#define XGEMM_DEFAULT_R xgemm_r + +#define SYMV_P 4 + +#endif + +#ifdef PENTIUMM + +#define SNUMOPT 2 +#define DNUMOPT 1 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x0ffffUL + +#ifdef CORE_YONAH +#define SGEMM_DEFAULT_UNROLL_M 4 +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_N 1 +#else +#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_N 2 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define DGEMM_DEFAULT_UNROLL_N 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_M 4 +#define CGEMM_DEFAULT_UNROLL_N 1 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define ZGEMM_DEFAULT_UNROLL_N 1 +#define XGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#endif + +#define SGEMM_DEFAULT_P sgemm_p +#define SGEMM_DEFAULT_Q 256 +#define SGEMM_DEFAULT_R sgemm_r + +#define DGEMM_DEFAULT_P dgemm_p +#define DGEMM_DEFAULT_Q 256 +#define DGEMM_DEFAULT_R dgemm_r + +#define QGEMM_DEFAULT_P qgemm_p +#define QGEMM_DEFAULT_Q 256 +#define QGEMM_DEFAULT_R qgemm_r + +#define CGEMM_DEFAULT_P cgemm_p +#define CGEMM_DEFAULT_Q 256 +#define CGEMM_DEFAULT_R cgemm_r + +#define ZGEMM_DEFAULT_P zgemm_p +#define ZGEMM_DEFAULT_Q 256 +#define ZGEMM_DEFAULT_R zgemm_r + +#define XGEMM_DEFAULT_P xgemm_p +#define XGEMM_DEFAULT_Q 256 +#define XGEMM_DEFAULT_R xgemm_r + +#define SYMV_P 4 +#endif + +#ifdef 
CORE_NORTHWOOD + +#define SNUMOPT 4 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 32 + +#define GEMM_DEFAULT_ALIGN 0x0ffffUL + +#define SYMV_P 8 + +#define SGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 2 +#define DGEMM_DEFAULT_UNROLL_N 2 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 1 +#define ZGEMM_DEFAULT_UNROLL_N 1 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#define SGEMM_DEFAULT_P sgemm_p +#define SGEMM_DEFAULT_R sgemm_r + +#define DGEMM_DEFAULT_P dgemm_p +#define DGEMM_DEFAULT_R dgemm_r + +#define QGEMM_DEFAULT_P qgemm_p +#define QGEMM_DEFAULT_R qgemm_r + +#define CGEMM_DEFAULT_P cgemm_p +#define CGEMM_DEFAULT_R cgemm_r + +#define ZGEMM_DEFAULT_P zgemm_p +#define ZGEMM_DEFAULT_R zgemm_r + +#define XGEMM_DEFAULT_P xgemm_p +#define XGEMM_DEFAULT_R xgemm_r + +#define SGEMM_DEFAULT_Q 128 +#define DGEMM_DEFAULT_Q 128 +#define QGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 128 +#define ZGEMM_DEFAULT_Q 128 +#define XGEMM_DEFAULT_Q 128 +#endif + +#ifdef CORE_PRESCOTT + +#define SNUMOPT 4 +#define DNUMOPT 2 + +#ifndef __64BIT__ +#define GEMM_DEFAULT_OFFSET_A 128 +#define GEMM_DEFAULT_OFFSET_B 192 +#else +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 256 +#endif + +#define GEMM_DEFAULT_ALIGN 0x0ffffUL + +#define SYMV_P 8 + +#ifdef ARCH_X86 +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 +#else +#define SGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define XGEMM_DEFAULT_UNROLL_M 1 +#endif + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#define SGEMM_DEFAULT_P sgemm_p +#define SGEMM_DEFAULT_R sgemm_r + +#define DGEMM_DEFAULT_P dgemm_p +#define DGEMM_DEFAULT_R dgemm_r + +#define QGEMM_DEFAULT_P qgemm_p +#define QGEMM_DEFAULT_R qgemm_r + +#define CGEMM_DEFAULT_P cgemm_p +#define CGEMM_DEFAULT_R cgemm_r + +#define ZGEMM_DEFAULT_P zgemm_p +#define ZGEMM_DEFAULT_R zgemm_r + +#define XGEMM_DEFAULT_P xgemm_p +#define XGEMM_DEFAULT_R xgemm_r + +#define SGEMM_DEFAULT_Q 128 +#define DGEMM_DEFAULT_Q 128 +#define QGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 128 +#define ZGEMM_DEFAULT_Q 128 +#define XGEMM_DEFAULT_Q 128 +#endif + +#ifdef CORE2 + +#define SNUMOPT 8 +#define DNUMOPT 4 + +#define GEMM_DEFAULT_OFFSET_A 448 +#define GEMM_DEFAULT_OFFSET_B 128 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SYMV_P 8 + +#define SWITCH_RATIO 4 + +#ifdef ARCH_X86 +#define SGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 2 +#define DGEMM_DEFAULT_UNROLL_N 2 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 1 +#define ZGEMM_DEFAULT_UNROLL_N 1 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#define MASK(a, b) ((((a) + (b) - 1) / (b)) * (b)) + +#else +#define SGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define 
CGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 +#endif + +#define SGEMM_DEFAULT_P sgemm_p +#define SGEMM_DEFAULT_R sgemm_r + +#define DGEMM_DEFAULT_P dgemm_p +#define DGEMM_DEFAULT_R dgemm_r + +#define QGEMM_DEFAULT_P qgemm_p +#define QGEMM_DEFAULT_R qgemm_r + +#define CGEMM_DEFAULT_P cgemm_p +#define CGEMM_DEFAULT_R cgemm_r + +#define ZGEMM_DEFAULT_P zgemm_p +#define ZGEMM_DEFAULT_R zgemm_r + +#define XGEMM_DEFAULT_P xgemm_p +#define XGEMM_DEFAULT_R xgemm_r + +#define SGEMM_DEFAULT_Q 256 +#define DGEMM_DEFAULT_Q 256 +#define QGEMM_DEFAULT_Q 256 +#define CGEMM_DEFAULT_Q 256 +#define ZGEMM_DEFAULT_Q 256 +#define XGEMM_DEFAULT_Q 256 + +#endif + +#ifdef PENRYN + +#define SNUMOPT 8 +#define DNUMOPT 4 + +#define GEMM_DEFAULT_OFFSET_A 128 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SYMV_P 8 + +#define SWITCH_RATIO 4 + +#ifdef ARCH_X86 +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 +#else +#define SGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 +#endif + +#define SGEMM_DEFAULT_P sgemm_p +#define SGEMM_DEFAULT_R sgemm_r + +#define DGEMM_DEFAULT_P dgemm_p +#define DGEMM_DEFAULT_R dgemm_r + +#define QGEMM_DEFAULT_P qgemm_p +#define QGEMM_DEFAULT_R qgemm_r + +#define CGEMM_DEFAULT_P cgemm_p +#define CGEMM_DEFAULT_R cgemm_r + +#define ZGEMM_DEFAULT_P zgemm_p +#define ZGEMM_DEFAULT_R zgemm_r + +#define XGEMM_DEFAULT_P xgemm_p +#define XGEMM_DEFAULT_R xgemm_r + +#define SGEMM_DEFAULT_Q 512 +#define DGEMM_DEFAULT_Q 256 +#define QGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 512 +#define ZGEMM_DEFAULT_Q 256 +#define XGEMM_DEFAULT_Q 128 + +#define GETRF_FACTOR 0.75 +#endif + +#ifdef DUNNINGTON + +#define SNUMOPT 8 +#define DNUMOPT 4 + +#define GEMM_DEFAULT_OFFSET_A 128 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SYMV_P 8 + +#define SWITCH_RATIO 4 + +#ifdef ARCH_X86 +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 +#else +#define SGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define 
CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 +#endif + +#define SGEMM_DEFAULT_P sgemm_p +#define SGEMM_DEFAULT_R sgemm_r + +#define DGEMM_DEFAULT_P dgemm_p +#define DGEMM_DEFAULT_R dgemm_r + +#define QGEMM_DEFAULT_P qgemm_p +#define QGEMM_DEFAULT_R qgemm_r + +#define CGEMM_DEFAULT_P cgemm_p +#define CGEMM_DEFAULT_R cgemm_r + +#define ZGEMM_DEFAULT_P zgemm_p +#define ZGEMM_DEFAULT_R zgemm_r + +#define XGEMM_DEFAULT_P xgemm_p +#define XGEMM_DEFAULT_R xgemm_r + +#define SGEMM_DEFAULT_Q 768 +#define DGEMM_DEFAULT_Q 384 +#define QGEMM_DEFAULT_Q 192 +#define CGEMM_DEFAULT_Q 768 +#define ZGEMM_DEFAULT_Q 384 +#define XGEMM_DEFAULT_Q 192 + +#define GETRF_FACTOR 0.75 +#define GEMM_THREAD gemm_thread_mn +#endif + +#ifdef NEHALEM + +#define SNUMOPT 8 +#define DNUMOPT 4 + +#define GEMM_DEFAULT_OFFSET_A 32 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SYMV_P 8 + +#define SWITCH_RATIO 4 + +#ifdef ARCH_X86 +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 +#else +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 8 +#define DGEMM_DEFAULT_UNROLL_N 8 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 +#define XGEMM_DEFAULT_UNROLL_N 1 +#endif + +#define SGEMM_DEFAULT_P 504 +#define SGEMM_DEFAULT_R sgemm_r + +#define DGEMM_DEFAULT_P 504 +#define DGEMM_DEFAULT_R dgemm_r + +#define QGEMM_DEFAULT_P 504 +#define QGEMM_DEFAULT_R qgemm_r + +#define CGEMM_DEFAULT_P 252 +#define CGEMM_DEFAULT_R cgemm_r + +#define ZGEMM_DEFAULT_P 252 +#define ZGEMM_DEFAULT_R zgemm_r + +#define XGEMM_DEFAULT_P 252 +#define XGEMM_DEFAULT_R xgemm_r + +#define SGEMM_DEFAULT_Q 512 +#define DGEMM_DEFAULT_Q 256 +#define QGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 512 +#define ZGEMM_DEFAULT_Q 256 +#define XGEMM_DEFAULT_Q 128 + +#define GETRF_FACTOR 0.72 + +#endif + + +#ifdef ATOM + +#define SNUMOPT 2 +#define DNUMOPT 1 + +#define GEMM_DEFAULT_OFFSET_A 64 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x0ffffUL + +#define SYMV_P 8 + +#ifdef ARCH_X86 +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 +#else +#define SGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define XGEMM_DEFAULT_UNROLL_M 1 +#endif + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 2 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 1 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#define SGEMM_DEFAULT_P sgemm_p +#define SGEMM_DEFAULT_R sgemm_r + +#define DGEMM_DEFAULT_P dgemm_p +#define DGEMM_DEFAULT_R dgemm_r + +#define QGEMM_DEFAULT_P qgemm_p +#define QGEMM_DEFAULT_R qgemm_r + +#define CGEMM_DEFAULT_P cgemm_p +#define CGEMM_DEFAULT_R cgemm_r + +#define ZGEMM_DEFAULT_P 
zgemm_p +#define ZGEMM_DEFAULT_R zgemm_r + +#define XGEMM_DEFAULT_P xgemm_p +#define XGEMM_DEFAULT_R xgemm_r + +#define SGEMM_DEFAULT_Q 256 +#define DGEMM_DEFAULT_Q 256 +#define QGEMM_DEFAULT_Q 256 +#define CGEMM_DEFAULT_Q 256 +#define ZGEMM_DEFAULT_Q 256 +#define XGEMM_DEFAULT_Q 256 + +#endif + + +#ifdef ITANIUM2 + +#define SNUMOPT 4 +#define DNUMOPT 4 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 128 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_N 8 +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 8 +#define QGEMM_DEFAULT_UNROLL_M 8 +#define QGEMM_DEFAULT_UNROLL_N 8 +#define CGEMM_DEFAULT_UNROLL_M 4 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 +#define XGEMM_DEFAULT_UNROLL_M 4 +#define XGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P sgemm_p +#define DGEMM_DEFAULT_P dgemm_p +#define QGEMM_DEFAULT_P qgemm_p +#define CGEMM_DEFAULT_P cgemm_p +#define ZGEMM_DEFAULT_P zgemm_p +#define XGEMM_DEFAULT_P xgemm_p + +#define SGEMM_DEFAULT_Q 1024 +#define DGEMM_DEFAULT_Q 1024 +#define QGEMM_DEFAULT_Q 1024 +#define CGEMM_DEFAULT_Q 1024 +#define ZGEMM_DEFAULT_Q 1024 +#define XGEMM_DEFAULT_Q 1024 + +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_R dgemm_r +#define QGEMM_DEFAULT_R qgemm_r +#define CGEMM_DEFAULT_R cgemm_r +#define ZGEMM_DEFAULT_R zgemm_r +#define XGEMM_DEFAULT_R xgemm_r + +#define SYMV_P 16 + +#define GETRF_FACTOR 0.65 + +#endif + +#if defined(EV4) || defined(EV5) || defined(EV6) + +#ifdef EV4 +#define SNUMOPT 1 +#define DNUMOPT 1 +#else +#define SNUMOPT 2 +#define DNUMOPT 2 +#endif + +#define GEMM_DEFAULT_OFFSET_A 512 +#define GEMM_DEFAULT_OFFSET_B 512 +#define GEMM_DEFAULT_ALIGN 0x0ffffUL + +#define SGEMM_DEFAULT_UNROLL_M 4 +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SYMV_P 8 + +#ifdef EV4 +#define SGEMM_DEFAULT_P 32 +#define SGEMM_DEFAULT_Q 112 +#define SGEMM_DEFAULT_R 256 + +#define DGEMM_DEFAULT_P 32 +#define DGEMM_DEFAULT_Q 56 +#define DGEMM_DEFAULT_R 256 + +#define CGEMM_DEFAULT_P 32 +#define CGEMM_DEFAULT_Q 64 +#define CGEMM_DEFAULT_R 240 + +#define ZGEMM_DEFAULT_P 32 +#define ZGEMM_DEFAULT_Q 32 +#define ZGEMM_DEFAULT_R 240 +#endif + +#ifdef EV5 +#define SGEMM_DEFAULT_P 64 +#define SGEMM_DEFAULT_Q 256 + +#define DGEMM_DEFAULT_P 64 +#define DGEMM_DEFAULT_Q 128 + +#define CGEMM_DEFAULT_P 64 +#define CGEMM_DEFAULT_Q 128 + +#define ZGEMM_DEFAULT_P 64 +#define ZGEMM_DEFAULT_Q 64 +#endif + +#ifdef EV6 +#define SGEMM_DEFAULT_P 256 +#define SGEMM_DEFAULT_Q 512 + +#define DGEMM_DEFAULT_P 256 +#define DGEMM_DEFAULT_Q 256 + +#define CGEMM_DEFAULT_P 256 +#define CGEMM_DEFAULT_Q 256 + +#define ZGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_Q 256 +#endif + +#endif + +#ifdef CELL + +#define SNUMOPT 2 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 8192 +#define GEMM_DEFAULT_ALIGN 0x0ffffUL + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_P 128 + +#define SGEMM_DEFAULT_Q 
512 +#define DGEMM_DEFAULT_Q 256 +#define CGEMM_DEFAULT_Q 256 +#define ZGEMM_DEFAULT_Q 128 + +#define SYMV_P 4 +#endif + +#ifdef PPCG4 +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 1024 +#define GEMM_DEFAULT_ALIGN 0x0ffffUL + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 256 +#define DGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_P 64 + +#define SGEMM_DEFAULT_Q 256 +#define DGEMM_DEFAULT_Q 256 +#define CGEMM_DEFAULT_Q 256 +#define ZGEMM_DEFAULT_Q 256 + +#define SYMV_P 4 +#endif + +#ifdef PPC970 + +#define SNUMOPT 4 +#define DNUMOPT 4 + +#define GEMM_DEFAULT_OFFSET_A 2688 +#define GEMM_DEFAULT_OFFSET_B 3072 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#ifdef OS_LINUX +#if L2_SIZE == 1024976 +#define SGEMM_DEFAULT_P 320 +#define DGEMM_DEFAULT_P 256 +#define CGEMM_DEFAULT_P 256 +#define ZGEMM_DEFAULT_P 256 +#else +#define SGEMM_DEFAULT_P 176 +#define DGEMM_DEFAULT_P 176 +#define CGEMM_DEFAULT_P 176 +#define ZGEMM_DEFAULT_P 176 +#endif +#endif + +#define SGEMM_DEFAULT_Q 512 +#define DGEMM_DEFAULT_Q 256 +#define CGEMM_DEFAULT_Q 256 +#define ZGEMM_DEFAULT_Q 128 + +#define SYMV_P 4 + +#endif + +#ifdef PPC440 + +#define SNUMOPT 2 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A (32 * 0) +#define GEMM_DEFAULT_OFFSET_B (32 * 0) +#define GEMM_DEFAULT_ALIGN 0x0ffffUL + +#define SGEMM_DEFAULT_UNROLL_M 4 +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 512 +#define DGEMM_DEFAULT_P 512 +#define CGEMM_DEFAULT_P 512 +#define ZGEMM_DEFAULT_P 512 + +#define SGEMM_DEFAULT_Q 1024 +#define DGEMM_DEFAULT_Q 512 +#define CGEMM_DEFAULT_Q 512 +#define ZGEMM_DEFAULT_Q 256 + +#define SGEMM_DEFAULT_R SGEMM_DEFAULT_P +#define DGEMM_DEFAULT_R DGEMM_DEFAULT_P +#define CGEMM_DEFAULT_R CGEMM_DEFAULT_P +#define ZGEMM_DEFAULT_R ZGEMM_DEFAULT_P + +#define SYMV_P 4 +#endif + +#ifdef PPC440FP2 + +#define SNUMOPT 4 +#define DNUMOPT 4 + +#define GEMM_DEFAULT_OFFSET_A (32 * 0) +#define GEMM_DEFAULT_OFFSET_B (32 * 0) +#define GEMM_DEFAULT_ALIGN 0x0ffffUL + +#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_M 4 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_P 128 +#if 1 +#define SGEMM_DEFAULT_Q 4096 +#define DGEMM_DEFAULT_Q 3072 +#define CGEMM_DEFAULT_Q 2048 +#define ZGEMM_DEFAULT_Q 1024 +#else +#define SGEMM_DEFAULT_Q 512 +#define DGEMM_DEFAULT_Q 256 +#define CGEMM_DEFAULT_Q 256 +#define ZGEMM_DEFAULT_Q 128 +#endif + +#define SYMV_P 4 +#endif + + + +#if defined(POWER3) || defined(POWER4) || defined(POWER5) +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 2048 +#define 
GEMM_DEFAULT_ALIGN 0x0ffffUL + +#define SGEMM_DEFAULT_UNROLL_M 4 +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#ifdef POWER3 + +#define SNUMOPT 4 +#define DNUMOPT 4 + +#define SGEMM_DEFAULT_P 256 +#define SGEMM_DEFAULT_Q 432 +#define SGEMM_DEFAULT_R 1012 + +#define DGEMM_DEFAULT_P 256 +#define DGEMM_DEFAULT_Q 216 +#define DGEMM_DEFAULT_R 1012 + +#define ZGEMM_DEFAULT_P 256 +#define ZGEMM_DEFAULT_Q 104 +#define ZGEMM_DEFAULT_R 1012 +#endif + +#if defined(POWER4) +#ifdef ALLOC_HUGETLB +#define SGEMM_DEFAULT_P 184 +#define DGEMM_DEFAULT_P 184 +#define CGEMM_DEFAULT_P 184 +#define ZGEMM_DEFAULT_P 184 +#else +#define SGEMM_DEFAULT_P 144 +#define DGEMM_DEFAULT_P 144 +#define CGEMM_DEFAULT_P 144 +#define ZGEMM_DEFAULT_P 144 +#endif +#endif + +#if defined(POWER5) +#ifdef ALLOC_HUGETLB +#define SGEMM_DEFAULT_P 512 +#define DGEMM_DEFAULT_P 256 +#define CGEMM_DEFAULT_P 256 +#define ZGEMM_DEFAULT_P 128 +#else +#define SGEMM_DEFAULT_P 320 +#define DGEMM_DEFAULT_P 160 +#define CGEMM_DEFAULT_P 160 +#define ZGEMM_DEFAULT_P 80 +#endif + +#define SGEMM_DEFAULT_Q 256 +#define CGEMM_DEFAULT_Q 256 +#define DGEMM_DEFAULT_Q 256 +#define ZGEMM_DEFAULT_Q 256 +#endif + +#define SYMV_P 8 + +#endif + +#if defined(POWER6) + +#define SNUMOPT 4 +#define DNUMOPT 4 + +#define GEMM_DEFAULT_OFFSET_A 384 +#define GEMM_DEFAULT_OFFSET_B 1024 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 4 +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P 992 +#define DGEMM_DEFAULT_P 480 +#define CGEMM_DEFAULT_P 488 +#define ZGEMM_DEFAULT_P 248 + +#define SGEMM_DEFAULT_Q 504 +#define DGEMM_DEFAULT_Q 504 +#define CGEMM_DEFAULT_Q 400 +#define ZGEMM_DEFAULT_Q 400 + +#define SYMV_P 8 + +#endif + +#if defined(SPARC) && defined(V7) + +#define SNUMOPT 4 +#define DNUMOPT 4 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 2048 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 2 +#define SGEMM_DEFAULT_UNROLL_N 8 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define DGEMM_DEFAULT_UNROLL_N 8 +#define CGEMM_DEFAULT_UNROLL_M 1 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define ZGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P 256 +#define DGEMM_DEFAULT_P 256 +#define CGEMM_DEFAULT_P 256 +#define ZGEMM_DEFAULT_P 256 + +#define SGEMM_DEFAULT_Q 512 +#define DGEMM_DEFAULT_Q 256 +#define CGEMM_DEFAULT_Q 256 +#define ZGEMM_DEFAULT_Q 128 + +#define SYMV_P 8 +#define GEMM_THREAD gemm_thread_mn +#endif + +#if defined(SPARC) && defined(V9) + +#define SNUMOPT 2 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 2048 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 4 +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 512 +#define DGEMM_DEFAULT_P 512 +#define CGEMM_DEFAULT_P 512 +#define ZGEMM_DEFAULT_P 512 + +#define SGEMM_DEFAULT_Q 1024 +#define DGEMM_DEFAULT_Q 512 +#define CGEMM_DEFAULT_Q 512 +#define ZGEMM_DEFAULT_Q 
256 + +#define SYMV_P 8 +#endif + +#ifdef SICORTEX + +#define SNUMOPT 2 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 2 +#define SGEMM_DEFAULT_UNROLL_N 8 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define DGEMM_DEFAULT_UNROLL_N 8 +#define CGEMM_DEFAULT_UNROLL_M 1 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define ZGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P 108 +#define DGEMM_DEFAULT_P 112 +#define CGEMM_DEFAULT_P 108 +#define ZGEMM_DEFAULT_P 112 + +#define SGEMM_DEFAULT_Q 288 +#define DGEMM_DEFAULT_Q 144 +#define CGEMM_DEFAULT_Q 144 +#define ZGEMM_DEFAULT_Q 72 + +#define SGEMM_DEFAULT_R 2000 +#define DGEMM_DEFAULT_R 2000 +#define CGEMM_DEFAULT_R 2000 +#define ZGEMM_DEFAULT_R 2000 + +#define SYMV_P 16 +#endif + +#ifdef GENERIC + +#define SNUMOPT 2 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x0ffffUL + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#ifdef ARCH_X86 +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 +#else +#define SGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define XGEMM_DEFAULT_UNROLL_M 1 +#endif + +#define SGEMM_P sgemm_p +#define DGEMM_P dgemm_p +#define QGEMM_P qgemm_p +#define CGEMM_P cgemm_p +#define ZGEMM_P zgemm_p +#define XGEMM_P xgemm_p + +#define SGEMM_R sgemm_r +#define DGEMM_R dgemm_r +#define QGEMM_R qgemm_r +#define CGEMM_R cgemm_r +#define ZGEMM_R zgemm_r +#define XGEMM_R xgemm_r + +#define SGEMM_Q 128 +#define DGEMM_Q 128 +#define QGEMM_Q 128 +#define CGEMM_Q 128 +#define ZGEMM_Q 128 +#define XGEMM_Q 128 + +#define SYMV_P 16 + +#endif + +#ifndef QGEMM_DEFAULT_UNROLL_M +#define QGEMM_DEFAULT_UNROLL_M 2 +#endif + +#ifndef QGEMM_DEFAULT_UNROLL_N +#define QGEMM_DEFAULT_UNROLL_N 2 +#endif + +#ifndef XGEMM_DEFAULT_UNROLL_M +#define XGEMM_DEFAULT_UNROLL_M 2 +#endif + +#ifndef XGEMM_DEFAULT_UNROLL_N +#define XGEMM_DEFAULT_UNROLL_N 2 +#endif + +#ifndef HAVE_SSE2 +#define SHUFPD_0 shufps $0x44, +#define SHUFPD_1 shufps $0x4e, +#define SHUFPD_2 shufps $0xe4, +#define SHUFPD_3 shufps $0xee, +#endif + +#ifndef SHUFPD_0 +#define SHUFPD_0 shufpd $0, +#endif + +#ifndef SHUFPD_1 +#define SHUFPD_1 shufpd $1, +#endif + +#ifndef SHUFPD_2 +#define SHUFPD_2 shufpd $2, +#endif + +#ifndef SHUFPD_3 +#define SHUFPD_3 shufpd $3, +#endif + +#ifndef SHUFPS_39 +#define SHUFPS_39 shufps $0x39, +#endif + + +#endif diff --git a/patch.for_lapack-3.1.1 b/patch.for_lapack-3.1.1 new file mode 100644 index 0000000000..9f10f26c7e --- /dev/null +++ b/patch.for_lapack-3.1.1 @@ -0,0 +1,684 @@ +diff -ruN lapack-3.1.1.old/INSTALL/Makefile lapack-3.1.1/INSTALL/Makefile +--- lapack-3.1.1.old/INSTALL/Makefile 2007-02-23 14:07:35.000000000 -0600 ++++ lapack-3.1.1/INSTALL/Makefile 2009-12-16 14:40:35.000000000 -0600 +@@ -27,7 +27,7 @@ + $(LOADER) $(LOADOPTS) -o testversion ilaver.o LAPACK_version.o + + clean: +- rm -f *.o ++ rm -f *.o test* + + slamch.o: slamch.f ; $(FORTRAN) $(NOOPT) -c $< -o $@ + dlamch.o: dlamch.f ; $(FORTRAN) $(NOOPT) -c $< -o $@ +diff -ruN 
lapack-3.1.1.old/Makefile lapack-3.1.1/Makefile +--- lapack-3.1.1.old/Makefile 2007-02-22 15:55:00.000000000 -0600 ++++ lapack-3.1.1/Makefile 2009-12-16 14:40:35.000000000 -0600 +@@ -20,9 +20,12 @@ + blaslib: + ( cd BLAS/SRC; $(MAKE) ) + +-lapacklib: lapack_install ++lapacklib: + ( cd SRC; $(MAKE) ) + ++lapack_prof: ++ ( cd SRC; $(MAKE) lapack_prof) ++ + tmglib: + ( cd TESTING/MATGEN; $(MAKE) ) + +diff -ruN lapack-3.1.1.old/SRC/Makefile lapack-3.1.1/SRC/Makefile +--- lapack-3.1.1.old/SRC/Makefile 2007-02-23 15:33:05.000000000 -0600 ++++ lapack-3.1.1/SRC/Makefile 2009-12-16 14:41:09.000000000 -0600 +@@ -38,265 +38,273 @@ + # + ####################################################################### + +-ALLAUX = ilaenv.o ieeeck.o lsamen.o xerbla.o iparmq.o \ +- ../INSTALL/ilaver.o ../INSTALL/lsame.o ++ALLAUX = ilaenv.$(SUFFIX) ieeeck.$(SUFFIX) lsamen.$(SUFFIX) iparmq.$(SUFFIX) \ ++ ../INSTALL/ilaver.$(SUFFIX) + + SCLAUX = \ +- sbdsdc.o \ +- sbdsqr.o sdisna.o slabad.o slacpy.o sladiv.o slae2.o slaebz.o \ +- slaed0.o slaed1.o slaed2.o slaed3.o slaed4.o slaed5.o slaed6.o \ +- slaed7.o slaed8.o slaed9.o slaeda.o slaev2.o slagtf.o \ +- slagts.o slamrg.o slanst.o \ +- slapy2.o slapy3.o slarnv.o \ +- slarra.o slarrb.o slarrc.o slarrd.o slarre.o slarrf.o slarrj.o \ +- slarrk.o slarrr.o slaneg.o \ +- slartg.o slaruv.o slas2.o slascl.o \ +- slasd0.o slasd1.o slasd2.o slasd3.o slasd4.o slasd5.o slasd6.o \ +- slasd7.o slasd8.o slasda.o slasdq.o slasdt.o \ +- slaset.o slasq1.o slasq2.o slasq3.o slazq3.o slasq4.o slazq4.o slasq5.o slasq6.o \ +- slasr.o slasrt.o slassq.o slasv2.o spttrf.o sstebz.o sstedc.o \ +- ssteqr.o ssterf.o slaisnan.o sisnan.o \ +- ../INSTALL/slamch.o ../INSTALL/second_$(TIMER).o ++ sbdsdc.$(SUFFIX) \ ++ sbdsqr.$(SUFFIX) sdisna.$(SUFFIX) slabad.$(SUFFIX) slacpy.$(SUFFIX) sladiv.$(SUFFIX) slae2.$(SUFFIX) slaebz.$(SUFFIX) \ ++ slaed0.$(SUFFIX) slaed1.$(SUFFIX) slaed2.$(SUFFIX) slaed3.$(SUFFIX) slaed4.$(SUFFIX) slaed5.$(SUFFIX) slaed6.$(SUFFIX) \ ++ slaed7.$(SUFFIX) slaed8.$(SUFFIX) slaed9.$(SUFFIX) slaeda.$(SUFFIX) slaev2.$(SUFFIX) slagtf.$(SUFFIX) \ ++ slagts.$(SUFFIX) slamrg.$(SUFFIX) slanst.$(SUFFIX) \ ++ slapy2.$(SUFFIX) slapy3.$(SUFFIX) slarnv.$(SUFFIX) \ ++ slarra.$(SUFFIX) slarrb.$(SUFFIX) slarrc.$(SUFFIX) slarrd.$(SUFFIX) slarre.$(SUFFIX) slarrf.$(SUFFIX) slarrj.$(SUFFIX) \ ++ slarrk.$(SUFFIX) slarrr.$(SUFFIX) slaneg.$(SUFFIX) \ ++ slartg.$(SUFFIX) slaruv.$(SUFFIX) slas2.$(SUFFIX) slascl.$(SUFFIX) \ ++ slasd0.$(SUFFIX) slasd1.$(SUFFIX) slasd2.$(SUFFIX) slasd3.$(SUFFIX) slasd4.$(SUFFIX) slasd5.$(SUFFIX) slasd6.$(SUFFIX) \ ++ slasd7.$(SUFFIX) slasd8.$(SUFFIX) slasda.$(SUFFIX) slasdq.$(SUFFIX) slasdt.$(SUFFIX) \ ++ slaset.$(SUFFIX) slasq1.$(SUFFIX) slasq2.$(SUFFIX) slasq3.$(SUFFIX) slazq3.$(SUFFIX) slasq4.$(SUFFIX) slazq4.$(SUFFIX) slasq5.$(SUFFIX) slasq6.$(SUFFIX) \ ++ slasr.$(SUFFIX) slasrt.$(SUFFIX) slassq.$(SUFFIX) slasv2.$(SUFFIX) spttrf.$(SUFFIX) sstebz.$(SUFFIX) sstedc.$(SUFFIX) \ ++ ssteqr.$(SUFFIX) ssterf.$(SUFFIX) slaisnan.$(SUFFIX) sisnan.$(SUFFIX) \ ++ ../INSTALL/second_$(TIMER).$(SUFFIX) + + DZLAUX = \ +- dbdsdc.o \ +- dbdsqr.o ddisna.o dlabad.o dlacpy.o dladiv.o dlae2.o dlaebz.o \ +- dlaed0.o dlaed1.o dlaed2.o dlaed3.o dlaed4.o dlaed5.o dlaed6.o \ +- dlaed7.o dlaed8.o dlaed9.o dlaeda.o dlaev2.o dlagtf.o \ +- dlagts.o dlamrg.o dlanst.o \ +- dlapy2.o dlapy3.o dlarnv.o \ +- dlarra.o dlarrb.o dlarrc.o dlarrd.o dlarre.o dlarrf.o dlarrj.o \ +- dlarrk.o dlarrr.o dlaneg.o \ +- dlartg.o dlaruv.o dlas2.o dlascl.o \ +- dlasd0.o dlasd1.o dlasd2.o dlasd3.o dlasd4.o dlasd5.o 
dlasd6.o \ +- dlasd7.o dlasd8.o dlasda.o dlasdq.o dlasdt.o \ +- dlaset.o dlasq1.o dlasq2.o dlasq3.o dlazq3.o dlasq4.o dlazq4.o dlasq5.o dlasq6.o \ +- dlasr.o dlasrt.o dlassq.o dlasv2.o dpttrf.o dstebz.o dstedc.o \ +- dsteqr.o dsterf.o dlaisnan.o disnan.o \ +- ../INSTALL/dlamch.o ../INSTALL/dsecnd_$(TIMER).o ++ dbdsdc.$(SUFFIX) \ ++ dbdsqr.$(SUFFIX) ddisna.$(SUFFIX) dlabad.$(SUFFIX) dlacpy.$(SUFFIX) dladiv.$(SUFFIX) dlae2.$(SUFFIX) dlaebz.$(SUFFIX) \ ++ dlaed0.$(SUFFIX) dlaed1.$(SUFFIX) dlaed2.$(SUFFIX) dlaed3.$(SUFFIX) dlaed4.$(SUFFIX) dlaed5.$(SUFFIX) dlaed6.$(SUFFIX) \ ++ dlaed7.$(SUFFIX) dlaed8.$(SUFFIX) dlaed9.$(SUFFIX) dlaeda.$(SUFFIX) dlaev2.$(SUFFIX) dlagtf.$(SUFFIX) \ ++ dlagts.$(SUFFIX) dlamrg.$(SUFFIX) dlanst.$(SUFFIX) \ ++ dlapy2.$(SUFFIX) dlapy3.$(SUFFIX) dlarnv.$(SUFFIX) \ ++ dlarra.$(SUFFIX) dlarrb.$(SUFFIX) dlarrc.$(SUFFIX) dlarrd.$(SUFFIX) dlarre.$(SUFFIX) dlarrf.$(SUFFIX) dlarrj.$(SUFFIX) \ ++ dlarrk.$(SUFFIX) dlarrr.$(SUFFIX) dlaneg.$(SUFFIX) \ ++ dlartg.$(SUFFIX) dlaruv.$(SUFFIX) dlas2.$(SUFFIX) dlascl.$(SUFFIX) \ ++ dlasd0.$(SUFFIX) dlasd1.$(SUFFIX) dlasd2.$(SUFFIX) dlasd3.$(SUFFIX) dlasd4.$(SUFFIX) dlasd5.$(SUFFIX) dlasd6.$(SUFFIX) \ ++ dlasd7.$(SUFFIX) dlasd8.$(SUFFIX) dlasda.$(SUFFIX) dlasdq.$(SUFFIX) dlasdt.$(SUFFIX) \ ++ dlaset.$(SUFFIX) dlasq1.$(SUFFIX) dlasq2.$(SUFFIX) dlasq3.$(SUFFIX) dlazq3.$(SUFFIX) dlasq4.$(SUFFIX) dlazq4.$(SUFFIX) dlasq5.$(SUFFIX) dlasq6.$(SUFFIX) \ ++ dlasr.$(SUFFIX) dlasrt.$(SUFFIX) dlassq.$(SUFFIX) dlasv2.$(SUFFIX) dpttrf.$(SUFFIX) dstebz.$(SUFFIX) dstedc.$(SUFFIX) \ ++ dsteqr.$(SUFFIX) dsterf.$(SUFFIX) dlaisnan.$(SUFFIX) disnan.$(SUFFIX) \ ++ ../INSTALL/dsecnd_$(TIMER).$(SUFFIX) + + SLASRC = \ +- sgbbrd.o sgbcon.o sgbequ.o sgbrfs.o sgbsv.o \ +- sgbsvx.o sgbtf2.o sgbtrf.o sgbtrs.o sgebak.o sgebal.o sgebd2.o \ +- sgebrd.o sgecon.o sgeequ.o sgees.o sgeesx.o sgeev.o sgeevx.o \ +- sgegs.o sgegv.o sgehd2.o sgehrd.o sgelq2.o sgelqf.o \ +- sgels.o sgelsd.o sgelss.o sgelsx.o sgelsy.o sgeql2.o sgeqlf.o \ +- sgeqp3.o sgeqpf.o sgeqr2.o sgeqrf.o sgerfs.o sgerq2.o sgerqf.o \ +- sgesc2.o sgesdd.o sgesv.o sgesvd.o sgesvx.o sgetc2.o sgetf2.o \ +- sgetrf.o sgetri.o \ +- sgetrs.o sggbak.o sggbal.o sgges.o sggesx.o sggev.o sggevx.o \ +- sggglm.o sgghrd.o sgglse.o sggqrf.o \ +- sggrqf.o sggsvd.o sggsvp.o sgtcon.o sgtrfs.o sgtsv.o \ +- sgtsvx.o sgttrf.o sgttrs.o sgtts2.o shgeqz.o \ +- shsein.o shseqr.o slabrd.o slacon.o slacn2.o \ +- slaein.o slaexc.o slag2.o slags2.o slagtm.o slagv2.o slahqr.o \ +- slahrd.o slahr2.o slaic1.o slaln2.o slals0.o slalsa.o slalsd.o \ +- slangb.o slange.o slangt.o slanhs.o slansb.o slansp.o \ +- slansy.o slantb.o slantp.o slantr.o slanv2.o \ +- slapll.o slapmt.o \ +- slaqgb.o slaqge.o slaqp2.o slaqps.o slaqsb.o slaqsp.o slaqsy.o \ +- slaqr0.o slaqr1.o slaqr2.o slaqr3.o slaqr4.o slaqr5.o \ +- slaqtr.o slar1v.o slar2v.o \ +- slarf.o slarfb.o slarfg.o slarft.o slarfx.o slargv.o \ +- slarrv.o slartv.o \ +- slarz.o slarzb.o slarzt.o slaswp.o slasy2.o slasyf.o \ +- slatbs.o slatdf.o slatps.o slatrd.o slatrs.o slatrz.o slatzm.o \ +- slauu2.o slauum.o sopgtr.o sopmtr.o sorg2l.o sorg2r.o \ +- sorgbr.o sorghr.o sorgl2.o sorglq.o sorgql.o sorgqr.o sorgr2.o \ +- sorgrq.o sorgtr.o sorm2l.o sorm2r.o \ +- sormbr.o sormhr.o sorml2.o sormlq.o sormql.o sormqr.o sormr2.o \ +- sormr3.o sormrq.o sormrz.o sormtr.o spbcon.o spbequ.o spbrfs.o \ +- spbstf.o spbsv.o spbsvx.o \ +- spbtf2.o spbtrf.o spbtrs.o spocon.o spoequ.o sporfs.o sposv.o \ +- sposvx.o spotf2.o spotrf.o spotri.o spotrs.o sppcon.o sppequ.o \ +- spprfs.o sppsv.o sppsvx.o spptrf.o spptri.o 
spptrs.o sptcon.o \ +- spteqr.o sptrfs.o sptsv.o sptsvx.o spttrs.o sptts2.o srscl.o \ +- ssbev.o ssbevd.o ssbevx.o ssbgst.o ssbgv.o ssbgvd.o ssbgvx.o \ +- ssbtrd.o sspcon.o sspev.o sspevd.o sspevx.o sspgst.o \ +- sspgv.o sspgvd.o sspgvx.o ssprfs.o sspsv.o sspsvx.o ssptrd.o \ +- ssptrf.o ssptri.o ssptrs.o sstegr.o sstein.o sstev.o sstevd.o sstevr.o \ +- sstevx.o ssycon.o ssyev.o ssyevd.o ssyevr.o ssyevx.o ssygs2.o \ +- ssygst.o ssygv.o ssygvd.o ssygvx.o ssyrfs.o ssysv.o ssysvx.o \ +- ssytd2.o ssytf2.o ssytrd.o ssytrf.o ssytri.o ssytrs.o stbcon.o \ +- stbrfs.o stbtrs.o stgevc.o stgex2.o stgexc.o stgsen.o \ +- stgsja.o stgsna.o stgsy2.o stgsyl.o stpcon.o stprfs.o stptri.o \ +- stptrs.o \ +- strcon.o strevc.o strexc.o strrfs.o strsen.o strsna.o strsyl.o \ +- strti2.o strtri.o strtrs.o stzrqf.o stzrzf.o sstemr.o ++ sgbbrd.$(SUFFIX) sgbcon.$(SUFFIX) sgbequ.$(SUFFIX) sgbrfs.$(SUFFIX) sgbsv.$(SUFFIX) \ ++ sgbsvx.$(SUFFIX) sgbtf2.$(SUFFIX) sgbtrf.$(SUFFIX) sgbtrs.$(SUFFIX) sgebak.$(SUFFIX) sgebal.$(SUFFIX) sgebd2.$(SUFFIX) \ ++ sgebrd.$(SUFFIX) sgecon.$(SUFFIX) sgeequ.$(SUFFIX) sgees.$(SUFFIX) sgeesx.$(SUFFIX) sgeev.$(SUFFIX) sgeevx.$(SUFFIX) \ ++ sgegs.$(SUFFIX) sgegv.$(SUFFIX) sgehd2.$(SUFFIX) sgehrd.$(SUFFIX) sgelq2.$(SUFFIX) sgelqf.$(SUFFIX) \ ++ sgels.$(SUFFIX) sgelsd.$(SUFFIX) sgelss.$(SUFFIX) sgelsx.$(SUFFIX) sgelsy.$(SUFFIX) sgeql2.$(SUFFIX) sgeqlf.$(SUFFIX) \ ++ sgeqp3.$(SUFFIX) sgeqpf.$(SUFFIX) sgeqr2.$(SUFFIX) sgeqrf.$(SUFFIX) sgerfs.$(SUFFIX) sgerq2.$(SUFFIX) sgerqf.$(SUFFIX) \ ++ sgesc2.$(SUFFIX) sgesdd.$(SUFFIX) sgesvd.$(SUFFIX) sgesvx.$(SUFFIX) sgetc2.$(SUFFIX) \ ++ sgetri.$(SUFFIX) \ ++ sggbak.$(SUFFIX) sggbal.$(SUFFIX) sgges.$(SUFFIX) sggesx.$(SUFFIX) sggev.$(SUFFIX) sggevx.$(SUFFIX) \ ++ sggglm.$(SUFFIX) sgghrd.$(SUFFIX) sgglse.$(SUFFIX) sggqrf.$(SUFFIX) \ ++ sggrqf.$(SUFFIX) sggsvd.$(SUFFIX) sggsvp.$(SUFFIX) sgtcon.$(SUFFIX) sgtrfs.$(SUFFIX) sgtsv.$(SUFFIX) \ ++ sgtsvx.$(SUFFIX) sgttrf.$(SUFFIX) sgttrs.$(SUFFIX) sgtts2.$(SUFFIX) shgeqz.$(SUFFIX) \ ++ shsein.$(SUFFIX) shseqr.$(SUFFIX) slabrd.$(SUFFIX) slacon.$(SUFFIX) slacn2.$(SUFFIX) \ ++ slaein.$(SUFFIX) slaexc.$(SUFFIX) slag2.$(SUFFIX) slags2.$(SUFFIX) slagtm.$(SUFFIX) slagv2.$(SUFFIX) slahqr.$(SUFFIX) \ ++ slahrd.$(SUFFIX) slahr2.$(SUFFIX) slaic1.$(SUFFIX) slaln2.$(SUFFIX) slals0.$(SUFFIX) slalsa.$(SUFFIX) slalsd.$(SUFFIX) \ ++ slangb.$(SUFFIX) slange.$(SUFFIX) slangt.$(SUFFIX) slanhs.$(SUFFIX) slansb.$(SUFFIX) slansp.$(SUFFIX) \ ++ slansy.$(SUFFIX) slantb.$(SUFFIX) slantp.$(SUFFIX) slantr.$(SUFFIX) slanv2.$(SUFFIX) \ ++ slapll.$(SUFFIX) slapmt.$(SUFFIX) \ ++ slaqgb.$(SUFFIX) slaqge.$(SUFFIX) slaqp2.$(SUFFIX) slaqps.$(SUFFIX) slaqsb.$(SUFFIX) slaqsp.$(SUFFIX) slaqsy.$(SUFFIX) \ ++ slaqr0.$(SUFFIX) slaqr1.$(SUFFIX) slaqr2.$(SUFFIX) slaqr3.$(SUFFIX) slaqr4.$(SUFFIX) slaqr5.$(SUFFIX) \ ++ slaqtr.$(SUFFIX) slar1v.$(SUFFIX) slar2v.$(SUFFIX) \ ++ slarf.$(SUFFIX) slarfb.$(SUFFIX) slarfg.$(SUFFIX) slarft.$(SUFFIX) slarfx.$(SUFFIX) slargv.$(SUFFIX) \ ++ slarrv.$(SUFFIX) slartv.$(SUFFIX) \ ++ slarz.$(SUFFIX) slarzb.$(SUFFIX) slarzt.$(SUFFIX) slasy2.$(SUFFIX) slasyf.$(SUFFIX) \ ++ slatbs.$(SUFFIX) slatdf.$(SUFFIX) slatps.$(SUFFIX) slatrd.$(SUFFIX) slatrs.$(SUFFIX) slatrz.$(SUFFIX) slatzm.$(SUFFIX) \ ++ sopgtr.$(SUFFIX) sopmtr.$(SUFFIX) sorg2l.$(SUFFIX) sorg2r.$(SUFFIX) \ ++ sorgbr.$(SUFFIX) sorghr.$(SUFFIX) sorgl2.$(SUFFIX) sorglq.$(SUFFIX) sorgql.$(SUFFIX) sorgqr.$(SUFFIX) sorgr2.$(SUFFIX) \ ++ sorgrq.$(SUFFIX) sorgtr.$(SUFFIX) sorm2l.$(SUFFIX) sorm2r.$(SUFFIX) \ ++ sormbr.$(SUFFIX) sormhr.$(SUFFIX) sorml2.$(SUFFIX) sormlq.$(SUFFIX) 
sormql.$(SUFFIX) sormqr.$(SUFFIX) sormr2.$(SUFFIX) \ ++ sormr3.$(SUFFIX) sormrq.$(SUFFIX) sormrz.$(SUFFIX) sormtr.$(SUFFIX) spbcon.$(SUFFIX) spbequ.$(SUFFIX) spbrfs.$(SUFFIX) \ ++ spbstf.$(SUFFIX) spbsv.$(SUFFIX) spbsvx.$(SUFFIX) \ ++ spbtf2.$(SUFFIX) spbtrf.$(SUFFIX) spbtrs.$(SUFFIX) spocon.$(SUFFIX) spoequ.$(SUFFIX) sporfs.$(SUFFIX) sposv.$(SUFFIX) \ ++ sposvx.$(SUFFIX) spotrs.$(SUFFIX) sppcon.$(SUFFIX) sppequ.$(SUFFIX) \ ++ spprfs.$(SUFFIX) sppsv.$(SUFFIX) sppsvx.$(SUFFIX) spptrf.$(SUFFIX) spptri.$(SUFFIX) spptrs.$(SUFFIX) sptcon.$(SUFFIX) \ ++ spteqr.$(SUFFIX) sptrfs.$(SUFFIX) sptsv.$(SUFFIX) sptsvx.$(SUFFIX) spttrs.$(SUFFIX) sptts2.$(SUFFIX) srscl.$(SUFFIX) \ ++ ssbev.$(SUFFIX) ssbevd.$(SUFFIX) ssbevx.$(SUFFIX) ssbgst.$(SUFFIX) ssbgv.$(SUFFIX) ssbgvd.$(SUFFIX) ssbgvx.$(SUFFIX) \ ++ ssbtrd.$(SUFFIX) sspcon.$(SUFFIX) sspev.$(SUFFIX) sspevd.$(SUFFIX) sspevx.$(SUFFIX) sspgst.$(SUFFIX) \ ++ sspgv.$(SUFFIX) sspgvd.$(SUFFIX) sspgvx.$(SUFFIX) ssprfs.$(SUFFIX) sspsv.$(SUFFIX) sspsvx.$(SUFFIX) ssptrd.$(SUFFIX) \ ++ ssptrf.$(SUFFIX) ssptri.$(SUFFIX) ssptrs.$(SUFFIX) sstegr.$(SUFFIX) sstein.$(SUFFIX) sstev.$(SUFFIX) sstevd.$(SUFFIX) sstevr.$(SUFFIX) \ ++ sstevx.$(SUFFIX) ssycon.$(SUFFIX) ssyev.$(SUFFIX) ssyevd.$(SUFFIX) ssyevr.$(SUFFIX) ssyevx.$(SUFFIX) ssygs2.$(SUFFIX) \ ++ ssygst.$(SUFFIX) ssygv.$(SUFFIX) ssygvd.$(SUFFIX) ssygvx.$(SUFFIX) ssyrfs.$(SUFFIX) ssysv.$(SUFFIX) ssysvx.$(SUFFIX) \ ++ ssytd2.$(SUFFIX) ssytf2.$(SUFFIX) ssytrd.$(SUFFIX) ssytrf.$(SUFFIX) ssytri.$(SUFFIX) ssytrs.$(SUFFIX) stbcon.$(SUFFIX) \ ++ stbrfs.$(SUFFIX) stbtrs.$(SUFFIX) stgevc.$(SUFFIX) stgex2.$(SUFFIX) stgexc.$(SUFFIX) stgsen.$(SUFFIX) \ ++ stgsja.$(SUFFIX) stgsna.$(SUFFIX) stgsy2.$(SUFFIX) stgsyl.$(SUFFIX) stpcon.$(SUFFIX) stprfs.$(SUFFIX) stptri.$(SUFFIX) \ ++ stptrs.$(SUFFIX) \ ++ strcon.$(SUFFIX) strevc.$(SUFFIX) strexc.$(SUFFIX) strrfs.$(SUFFIX) strsen.$(SUFFIX) strsna.$(SUFFIX) strsyl.$(SUFFIX) \ ++ strtrs.$(SUFFIX) stzrqf.$(SUFFIX) stzrzf.$(SUFFIX) sstemr.$(SUFFIX) + + CLASRC = \ +- cbdsqr.o cgbbrd.o cgbcon.o cgbequ.o cgbrfs.o cgbsv.o cgbsvx.o \ +- cgbtf2.o cgbtrf.o cgbtrs.o cgebak.o cgebal.o cgebd2.o cgebrd.o \ +- cgecon.o cgeequ.o cgees.o cgeesx.o cgeev.o cgeevx.o \ +- cgegs.o cgegv.o cgehd2.o cgehrd.o cgelq2.o cgelqf.o \ +- cgels.o cgelsd.o cgelss.o cgelsx.o cgelsy.o cgeql2.o cgeqlf.o cgeqp3.o \ +- cgeqpf.o cgeqr2.o cgeqrf.o cgerfs.o cgerq2.o cgerqf.o \ +- cgesc2.o cgesdd.o cgesv.o cgesvd.o cgesvx.o cgetc2.o cgetf2.o cgetrf.o \ +- cgetri.o cgetrs.o \ +- cggbak.o cggbal.o cgges.o cggesx.o cggev.o cggevx.o cggglm.o \ +- cgghrd.o cgglse.o cggqrf.o cggrqf.o \ +- cggsvd.o cggsvp.o \ +- cgtcon.o cgtrfs.o cgtsv.o cgtsvx.o cgttrf.o cgttrs.o cgtts2.o chbev.o \ +- chbevd.o chbevx.o chbgst.o chbgv.o chbgvd.o chbgvx.o chbtrd.o \ +- checon.o cheev.o cheevd.o cheevr.o cheevx.o chegs2.o chegst.o \ +- chegv.o chegvd.o chegvx.o cherfs.o chesv.o chesvx.o chetd2.o \ +- chetf2.o chetrd.o \ +- chetrf.o chetri.o chetrs.o chgeqz.o chpcon.o chpev.o chpevd.o \ +- chpevx.o chpgst.o chpgv.o chpgvd.o chpgvx.o chprfs.o chpsv.o \ +- chpsvx.o \ +- chptrd.o chptrf.o chptri.o chptrs.o chsein.o chseqr.o clabrd.o \ +- clacgv.o clacon.o clacn2.o clacp2.o clacpy.o clacrm.o clacrt.o cladiv.o \ +- claed0.o claed7.o claed8.o \ +- claein.o claesy.o claev2.o clags2.o clagtm.o \ +- clahef.o clahqr.o \ +- clahrd.o clahr2.o claic1.o clals0.o clalsa.o clalsd.o clangb.o clange.o clangt.o \ +- clanhb.o clanhe.o \ +- clanhp.o clanhs.o clanht.o clansb.o clansp.o clansy.o clantb.o \ +- clantp.o clantr.o clapll.o clapmt.o clarcm.o claqgb.o claqge.o \ +- 
claqhb.o claqhe.o claqhp.o claqp2.o claqps.o claqsb.o \ +- claqr0.o claqr1.o claqr2.o claqr3.o claqr4.o claqr5.o \ +- claqsp.o claqsy.o clar1v.o clar2v.o clarf.o clarfb.o clarfg.o clarft.o \ +- clarfx.o clargv.o clarnv.o clarrv.o clartg.o clartv.o \ +- clarz.o clarzb.o clarzt.o clascl.o claset.o clasr.o classq.o \ +- claswp.o clasyf.o clatbs.o clatdf.o clatps.o clatrd.o clatrs.o clatrz.o \ +- clatzm.o clauu2.o clauum.o cpbcon.o cpbequ.o cpbrfs.o cpbstf.o cpbsv.o \ +- cpbsvx.o cpbtf2.o cpbtrf.o cpbtrs.o cpocon.o cpoequ.o cporfs.o \ +- cposv.o cposvx.o cpotf2.o cpotrf.o cpotri.o cpotrs.o cppcon.o \ +- cppequ.o cpprfs.o cppsv.o cppsvx.o cpptrf.o cpptri.o cpptrs.o \ +- cptcon.o cpteqr.o cptrfs.o cptsv.o cptsvx.o cpttrf.o cpttrs.o cptts2.o \ +- crot.o cspcon.o cspmv.o cspr.o csprfs.o cspsv.o \ +- cspsvx.o csptrf.o csptri.o csptrs.o csrscl.o cstedc.o \ +- cstegr.o cstein.o csteqr.o csycon.o csymv.o \ +- csyr.o csyrfs.o csysv.o csysvx.o csytf2.o csytrf.o csytri.o \ +- csytrs.o ctbcon.o ctbrfs.o ctbtrs.o ctgevc.o ctgex2.o \ +- ctgexc.o ctgsen.o ctgsja.o ctgsna.o ctgsy2.o ctgsyl.o ctpcon.o \ +- ctprfs.o ctptri.o \ +- ctptrs.o ctrcon.o ctrevc.o ctrexc.o ctrrfs.o ctrsen.o ctrsna.o \ +- ctrsyl.o ctrti2.o ctrtri.o ctrtrs.o ctzrqf.o ctzrzf.o cung2l.o cung2r.o \ +- cungbr.o cunghr.o cungl2.o cunglq.o cungql.o cungqr.o cungr2.o \ +- cungrq.o cungtr.o cunm2l.o cunm2r.o cunmbr.o cunmhr.o cunml2.o \ +- cunmlq.o cunmql.o cunmqr.o cunmr2.o cunmr3.o cunmrq.o cunmrz.o \ +- cunmtr.o cupgtr.o cupmtr.o icmax1.o scsum1.o cstemr.o ++ cbdsqr.$(SUFFIX) cgbbrd.$(SUFFIX) cgbcon.$(SUFFIX) cgbequ.$(SUFFIX) cgbrfs.$(SUFFIX) cgbsv.$(SUFFIX) cgbsvx.$(SUFFIX) \ ++ cgbtf2.$(SUFFIX) cgbtrf.$(SUFFIX) cgbtrs.$(SUFFIX) cgebak.$(SUFFIX) cgebal.$(SUFFIX) cgebd2.$(SUFFIX) cgebrd.$(SUFFIX) \ ++ cgecon.$(SUFFIX) cgeequ.$(SUFFIX) cgees.$(SUFFIX) cgeesx.$(SUFFIX) cgeev.$(SUFFIX) cgeevx.$(SUFFIX) \ ++ cgegs.$(SUFFIX) cgegv.$(SUFFIX) cgehd2.$(SUFFIX) cgehrd.$(SUFFIX) cgelq2.$(SUFFIX) cgelqf.$(SUFFIX) \ ++ cgels.$(SUFFIX) cgelsd.$(SUFFIX) cgelss.$(SUFFIX) cgelsx.$(SUFFIX) cgelsy.$(SUFFIX) cgeql2.$(SUFFIX) cgeqlf.$(SUFFIX) cgeqp3.$(SUFFIX) \ ++ cgeqpf.$(SUFFIX) cgeqr2.$(SUFFIX) cgeqrf.$(SUFFIX) cgerfs.$(SUFFIX) cgerq2.$(SUFFIX) cgerqf.$(SUFFIX) \ ++ cgesc2.$(SUFFIX) cgesdd.$(SUFFIX) cgesvd.$(SUFFIX) cgesvx.$(SUFFIX) cgetc2.$(SUFFIX) \ ++ cgetri.$(SUFFIX) \ ++ cggbak.$(SUFFIX) cggbal.$(SUFFIX) cgges.$(SUFFIX) cggesx.$(SUFFIX) cggev.$(SUFFIX) cggevx.$(SUFFIX) cggglm.$(SUFFIX) \ ++ cgghrd.$(SUFFIX) cgglse.$(SUFFIX) cggqrf.$(SUFFIX) cggrqf.$(SUFFIX) \ ++ cggsvd.$(SUFFIX) cggsvp.$(SUFFIX) \ ++ cgtcon.$(SUFFIX) cgtrfs.$(SUFFIX) cgtsv.$(SUFFIX) cgtsvx.$(SUFFIX) cgttrf.$(SUFFIX) cgttrs.$(SUFFIX) cgtts2.$(SUFFIX) chbev.$(SUFFIX) \ ++ chbevd.$(SUFFIX) chbevx.$(SUFFIX) chbgst.$(SUFFIX) chbgv.$(SUFFIX) chbgvd.$(SUFFIX) chbgvx.$(SUFFIX) chbtrd.$(SUFFIX) \ ++ checon.$(SUFFIX) cheev.$(SUFFIX) cheevd.$(SUFFIX) cheevr.$(SUFFIX) cheevx.$(SUFFIX) chegs2.$(SUFFIX) chegst.$(SUFFIX) \ ++ chegv.$(SUFFIX) chegvd.$(SUFFIX) chegvx.$(SUFFIX) cherfs.$(SUFFIX) chesv.$(SUFFIX) chesvx.$(SUFFIX) chetd2.$(SUFFIX) \ ++ chetf2.$(SUFFIX) chetrd.$(SUFFIX) \ ++ chetrf.$(SUFFIX) chetri.$(SUFFIX) chetrs.$(SUFFIX) chgeqz.$(SUFFIX) chpcon.$(SUFFIX) chpev.$(SUFFIX) chpevd.$(SUFFIX) \ ++ chpevx.$(SUFFIX) chpgst.$(SUFFIX) chpgv.$(SUFFIX) chpgvd.$(SUFFIX) chpgvx.$(SUFFIX) chprfs.$(SUFFIX) chpsv.$(SUFFIX) \ ++ chpsvx.$(SUFFIX) \ ++ chptrd.$(SUFFIX) chptrf.$(SUFFIX) chptri.$(SUFFIX) chptrs.$(SUFFIX) chsein.$(SUFFIX) chseqr.$(SUFFIX) clabrd.$(SUFFIX) \ ++ clacgv.$(SUFFIX) clacon.$(SUFFIX) 
clacn2.$(SUFFIX) clacp2.$(SUFFIX) clacpy.$(SUFFIX) clacrm.$(SUFFIX) clacrt.$(SUFFIX) cladiv.$(SUFFIX) \ ++ claed0.$(SUFFIX) claed7.$(SUFFIX) claed8.$(SUFFIX) \ ++ claein.$(SUFFIX) claesy.$(SUFFIX) claev2.$(SUFFIX) clags2.$(SUFFIX) clagtm.$(SUFFIX) \ ++ clahef.$(SUFFIX) clahqr.$(SUFFIX) \ ++ clahrd.$(SUFFIX) clahr2.$(SUFFIX) claic1.$(SUFFIX) clals0.$(SUFFIX) clalsa.$(SUFFIX) clalsd.$(SUFFIX) clangb.$(SUFFIX) clange.$(SUFFIX) clangt.$(SUFFIX) \ ++ clanhb.$(SUFFIX) clanhe.$(SUFFIX) \ ++ clanhp.$(SUFFIX) clanhs.$(SUFFIX) clanht.$(SUFFIX) clansb.$(SUFFIX) clansp.$(SUFFIX) clansy.$(SUFFIX) clantb.$(SUFFIX) \ ++ clantp.$(SUFFIX) clantr.$(SUFFIX) clapll.$(SUFFIX) clapmt.$(SUFFIX) clarcm.$(SUFFIX) claqgb.$(SUFFIX) claqge.$(SUFFIX) \ ++ claqhb.$(SUFFIX) claqhe.$(SUFFIX) claqhp.$(SUFFIX) claqp2.$(SUFFIX) claqps.$(SUFFIX) claqsb.$(SUFFIX) \ ++ claqr0.$(SUFFIX) claqr1.$(SUFFIX) claqr2.$(SUFFIX) claqr3.$(SUFFIX) claqr4.$(SUFFIX) claqr5.$(SUFFIX) \ ++ claqsp.$(SUFFIX) claqsy.$(SUFFIX) clar1v.$(SUFFIX) clar2v.$(SUFFIX) clarf.$(SUFFIX) clarfb.$(SUFFIX) clarfg.$(SUFFIX) clarft.$(SUFFIX) \ ++ clarfx.$(SUFFIX) clargv.$(SUFFIX) clarnv.$(SUFFIX) clarrv.$(SUFFIX) clartg.$(SUFFIX) clartv.$(SUFFIX) \ ++ clarz.$(SUFFIX) clarzb.$(SUFFIX) clarzt.$(SUFFIX) clascl.$(SUFFIX) claset.$(SUFFIX) clasr.$(SUFFIX) classq.$(SUFFIX) \ ++ clasyf.$(SUFFIX) clatbs.$(SUFFIX) clatdf.$(SUFFIX) clatps.$(SUFFIX) clatrd.$(SUFFIX) clatrs.$(SUFFIX) clatrz.$(SUFFIX) \ ++ clatzm.$(SUFFIX) cpbcon.$(SUFFIX) cpbequ.$(SUFFIX) cpbrfs.$(SUFFIX) cpbstf.$(SUFFIX) cpbsv.$(SUFFIX) \ ++ cpbsvx.$(SUFFIX) cpbtf2.$(SUFFIX) cpbtrf.$(SUFFIX) cpbtrs.$(SUFFIX) cpocon.$(SUFFIX) cpoequ.$(SUFFIX) cporfs.$(SUFFIX) \ ++ cposv.$(SUFFIX) cposvx.$(SUFFIX) cpotrs.$(SUFFIX) cppcon.$(SUFFIX) \ ++ cppequ.$(SUFFIX) cpprfs.$(SUFFIX) cppsv.$(SUFFIX) cppsvx.$(SUFFIX) cpptrf.$(SUFFIX) cpptri.$(SUFFIX) cpptrs.$(SUFFIX) \ ++ cptcon.$(SUFFIX) cpteqr.$(SUFFIX) cptrfs.$(SUFFIX) cptsv.$(SUFFIX) cptsvx.$(SUFFIX) cpttrf.$(SUFFIX) cpttrs.$(SUFFIX) cptts2.$(SUFFIX) \ ++ crot.$(SUFFIX) cspcon.$(SUFFIX) csprfs.$(SUFFIX) cspsv.$(SUFFIX) \ ++ cspsvx.$(SUFFIX) csptrf.$(SUFFIX) csptri.$(SUFFIX) csptrs.$(SUFFIX) csrscl.$(SUFFIX) cstedc.$(SUFFIX) \ ++ cstegr.$(SUFFIX) cstein.$(SUFFIX) csteqr.$(SUFFIX) csycon.$(SUFFIX) \ ++ csyrfs.$(SUFFIX) csysv.$(SUFFIX) csysvx.$(SUFFIX) csytf2.$(SUFFIX) csytrf.$(SUFFIX) csytri.$(SUFFIX) \ ++ csytrs.$(SUFFIX) ctbcon.$(SUFFIX) ctbrfs.$(SUFFIX) ctbtrs.$(SUFFIX) ctgevc.$(SUFFIX) ctgex2.$(SUFFIX) \ ++ ctgexc.$(SUFFIX) ctgsen.$(SUFFIX) ctgsja.$(SUFFIX) ctgsna.$(SUFFIX) ctgsy2.$(SUFFIX) ctgsyl.$(SUFFIX) ctpcon.$(SUFFIX) \ ++ ctprfs.$(SUFFIX) ctptri.$(SUFFIX) \ ++ ctptrs.$(SUFFIX) ctrcon.$(SUFFIX) ctrevc.$(SUFFIX) ctrexc.$(SUFFIX) ctrrfs.$(SUFFIX) ctrsen.$(SUFFIX) ctrsna.$(SUFFIX) \ ++ ctrsyl.$(SUFFIX) ctrtrs.$(SUFFIX) ctzrqf.$(SUFFIX) ctzrzf.$(SUFFIX) cung2l.$(SUFFIX) cung2r.$(SUFFIX) \ ++ cungbr.$(SUFFIX) cunghr.$(SUFFIX) cungl2.$(SUFFIX) cunglq.$(SUFFIX) cungql.$(SUFFIX) cungqr.$(SUFFIX) cungr2.$(SUFFIX) \ ++ cungrq.$(SUFFIX) cungtr.$(SUFFIX) cunm2l.$(SUFFIX) cunm2r.$(SUFFIX) cunmbr.$(SUFFIX) cunmhr.$(SUFFIX) cunml2.$(SUFFIX) \ ++ cunmlq.$(SUFFIX) cunmql.$(SUFFIX) cunmqr.$(SUFFIX) cunmr2.$(SUFFIX) cunmr3.$(SUFFIX) cunmrq.$(SUFFIX) cunmrz.$(SUFFIX) \ ++ cunmtr.$(SUFFIX) cupgtr.$(SUFFIX) cupmtr.$(SUFFIX) icmax1.$(SUFFIX) scsum1.$(SUFFIX) cstemr.$(SUFFIX) + + DLASRC = \ +- dgbbrd.o dgbcon.o dgbequ.o dgbrfs.o dgbsv.o \ +- dgbsvx.o dgbtf2.o dgbtrf.o dgbtrs.o dgebak.o dgebal.o dgebd2.o \ +- dgebrd.o dgecon.o dgeequ.o dgees.o dgeesx.o dgeev.o dgeevx.o \ +- dgegs.o 
dgegv.o dgehd2.o dgehrd.o dgelq2.o dgelqf.o \ +- dgels.o dgelsd.o dgelss.o dgelsx.o dgelsy.o dgeql2.o dgeqlf.o \ +- dgeqp3.o dgeqpf.o dgeqr2.o dgeqrf.o dgerfs.o dgerq2.o dgerqf.o \ +- dgesc2.o dgesdd.o dgesv.o dgesvd.o dgesvx.o dgetc2.o dgetf2.o \ +- dgetrf.o dgetri.o \ +- dgetrs.o dggbak.o dggbal.o dgges.o dggesx.o dggev.o dggevx.o \ +- dggglm.o dgghrd.o dgglse.o dggqrf.o \ +- dggrqf.o dggsvd.o dggsvp.o dgtcon.o dgtrfs.o dgtsv.o \ +- dgtsvx.o dgttrf.o dgttrs.o dgtts2.o dhgeqz.o \ +- dhsein.o dhseqr.o dlabrd.o dlacon.o dlacn2.o \ +- dlaein.o dlaexc.o dlag2.o dlags2.o dlagtm.o dlagv2.o dlahqr.o \ +- dlahrd.o dlahr2.o dlaic1.o dlaln2.o dlals0.o dlalsa.o dlalsd.o \ +- dlangb.o dlange.o dlangt.o dlanhs.o dlansb.o dlansp.o \ +- dlansy.o dlantb.o dlantp.o dlantr.o dlanv2.o \ +- dlapll.o dlapmt.o \ +- dlaqgb.o dlaqge.o dlaqp2.o dlaqps.o dlaqsb.o dlaqsp.o dlaqsy.o \ +- dlaqr0.o dlaqr1.o dlaqr2.o dlaqr3.o dlaqr4.o dlaqr5.o \ +- dlaqtr.o dlar1v.o dlar2v.o \ +- dlarf.o dlarfb.o dlarfg.o dlarft.o dlarfx.o dlargv.o \ +- dlarrv.o dlartv.o \ +- dlarz.o dlarzb.o dlarzt.o dlaswp.o dlasy2.o dlasyf.o \ +- dlatbs.o dlatdf.o dlatps.o dlatrd.o dlatrs.o dlatrz.o dlatzm.o dlauu2.o \ +- dlauum.o dopgtr.o dopmtr.o dorg2l.o dorg2r.o \ +- dorgbr.o dorghr.o dorgl2.o dorglq.o dorgql.o dorgqr.o dorgr2.o \ +- dorgrq.o dorgtr.o dorm2l.o dorm2r.o \ +- dormbr.o dormhr.o dorml2.o dormlq.o dormql.o dormqr.o dormr2.o \ +- dormr3.o dormrq.o dormrz.o dormtr.o dpbcon.o dpbequ.o dpbrfs.o \ +- dpbstf.o dpbsv.o dpbsvx.o \ +- dpbtf2.o dpbtrf.o dpbtrs.o dpocon.o dpoequ.o dporfs.o dposv.o \ +- dposvx.o dpotf2.o dpotrf.o dpotri.o dpotrs.o dppcon.o dppequ.o \ +- dpprfs.o dppsv.o dppsvx.o dpptrf.o dpptri.o dpptrs.o dptcon.o \ +- dpteqr.o dptrfs.o dptsv.o dptsvx.o dpttrs.o dptts2.o drscl.o \ +- dsbev.o dsbevd.o dsbevx.o dsbgst.o dsbgv.o dsbgvd.o dsbgvx.o \ +- dsbtrd.o dspcon.o dspev.o dspevd.o dspevx.o dspgst.o \ +- dspgv.o dspgvd.o dspgvx.o dsprfs.o dspsv.o dspsvx.o dsptrd.o \ +- dsptrf.o dsptri.o dsptrs.o dstegr.o dstein.o dstev.o dstevd.o dstevr.o \ +- dstevx.o dsycon.o dsyev.o dsyevd.o dsyevr.o \ +- dsyevx.o dsygs2.o dsygst.o dsygv.o dsygvd.o dsygvx.o dsyrfs.o \ +- dsysv.o dsysvx.o \ +- dsytd2.o dsytf2.o dsytrd.o dsytrf.o dsytri.o dsytrs.o dtbcon.o \ +- dtbrfs.o dtbtrs.o dtgevc.o dtgex2.o dtgexc.o dtgsen.o \ +- dtgsja.o dtgsna.o dtgsy2.o dtgsyl.o dtpcon.o dtprfs.o dtptri.o \ +- dtptrs.o \ +- dtrcon.o dtrevc.o dtrexc.o dtrrfs.o dtrsen.o dtrsna.o dtrsyl.o \ +- dtrti2.o dtrtri.o dtrtrs.o dtzrqf.o dtzrzf.o dstemr.o \ +- dsgesv.o dlag2s.o slag2d.o ++ dgbbrd.$(SUFFIX) dgbcon.$(SUFFIX) dgbequ.$(SUFFIX) dgbrfs.$(SUFFIX) dgbsv.$(SUFFIX) \ ++ dgbsvx.$(SUFFIX) dgbtf2.$(SUFFIX) dgbtrf.$(SUFFIX) dgbtrs.$(SUFFIX) dgebak.$(SUFFIX) dgebal.$(SUFFIX) dgebd2.$(SUFFIX) \ ++ dgebrd.$(SUFFIX) dgecon.$(SUFFIX) dgeequ.$(SUFFIX) dgees.$(SUFFIX) dgeesx.$(SUFFIX) dgeev.$(SUFFIX) dgeevx.$(SUFFIX) \ ++ dgegs.$(SUFFIX) dgegv.$(SUFFIX) dgehd2.$(SUFFIX) dgehrd.$(SUFFIX) dgelq2.$(SUFFIX) dgelqf.$(SUFFIX) \ ++ dgels.$(SUFFIX) dgelsd.$(SUFFIX) dgelss.$(SUFFIX) dgelsx.$(SUFFIX) dgelsy.$(SUFFIX) dgeql2.$(SUFFIX) dgeqlf.$(SUFFIX) \ ++ dgeqp3.$(SUFFIX) dgeqpf.$(SUFFIX) dgeqr2.$(SUFFIX) dgeqrf.$(SUFFIX) dgerfs.$(SUFFIX) dgerq2.$(SUFFIX) dgerqf.$(SUFFIX) \ ++ dgesc2.$(SUFFIX) dgesdd.$(SUFFIX) dgesvd.$(SUFFIX) dgesvx.$(SUFFIX) dgetc2.$(SUFFIX) \ ++ dgetri.$(SUFFIX) \ ++ dggbak.$(SUFFIX) dggbal.$(SUFFIX) dgges.$(SUFFIX) dggesx.$(SUFFIX) dggev.$(SUFFIX) dggevx.$(SUFFIX) \ ++ dggglm.$(SUFFIX) dgghrd.$(SUFFIX) dgglse.$(SUFFIX) dggqrf.$(SUFFIX) \ ++ dggrqf.$(SUFFIX) 
dggsvd.$(SUFFIX) dggsvp.$(SUFFIX) dgtcon.$(SUFFIX) dgtrfs.$(SUFFIX) dgtsv.$(SUFFIX) \ ++ dgtsvx.$(SUFFIX) dgttrf.$(SUFFIX) dgttrs.$(SUFFIX) dgtts2.$(SUFFIX) dhgeqz.$(SUFFIX) \ ++ dhsein.$(SUFFIX) dhseqr.$(SUFFIX) dlabrd.$(SUFFIX) dlacon.$(SUFFIX) dlacn2.$(SUFFIX) \ ++ dlaein.$(SUFFIX) dlaexc.$(SUFFIX) dlag2.$(SUFFIX) dlags2.$(SUFFIX) dlagtm.$(SUFFIX) dlagv2.$(SUFFIX) dlahqr.$(SUFFIX) \ ++ dlahrd.$(SUFFIX) dlahr2.$(SUFFIX) dlaic1.$(SUFFIX) dlaln2.$(SUFFIX) dlals0.$(SUFFIX) dlalsa.$(SUFFIX) dlalsd.$(SUFFIX) \ ++ dlangb.$(SUFFIX) dlange.$(SUFFIX) dlangt.$(SUFFIX) dlanhs.$(SUFFIX) dlansb.$(SUFFIX) dlansp.$(SUFFIX) \ ++ dlansy.$(SUFFIX) dlantb.$(SUFFIX) dlantp.$(SUFFIX) dlantr.$(SUFFIX) dlanv2.$(SUFFIX) \ ++ dlapll.$(SUFFIX) dlapmt.$(SUFFIX) \ ++ dlaqgb.$(SUFFIX) dlaqge.$(SUFFIX) dlaqp2.$(SUFFIX) dlaqps.$(SUFFIX) dlaqsb.$(SUFFIX) dlaqsp.$(SUFFIX) dlaqsy.$(SUFFIX) \ ++ dlaqr0.$(SUFFIX) dlaqr1.$(SUFFIX) dlaqr2.$(SUFFIX) dlaqr3.$(SUFFIX) dlaqr4.$(SUFFIX) dlaqr5.$(SUFFIX) \ ++ dlaqtr.$(SUFFIX) dlar1v.$(SUFFIX) dlar2v.$(SUFFIX) \ ++ dlarf.$(SUFFIX) dlarfb.$(SUFFIX) dlarfg.$(SUFFIX) dlarft.$(SUFFIX) dlarfx.$(SUFFIX) dlargv.$(SUFFIX) \ ++ dlarrv.$(SUFFIX) dlartv.$(SUFFIX) \ ++ dlarz.$(SUFFIX) dlarzb.$(SUFFIX) dlarzt.$(SUFFIX) dlasy2.$(SUFFIX) dlasyf.$(SUFFIX) \ ++ dlatbs.$(SUFFIX) dlatdf.$(SUFFIX) dlatps.$(SUFFIX) dlatrd.$(SUFFIX) dlatrs.$(SUFFIX) dlatrz.$(SUFFIX) dlatzm.$(SUFFIX) \ ++ dopgtr.$(SUFFIX) dopmtr.$(SUFFIX) dorg2l.$(SUFFIX) dorg2r.$(SUFFIX) \ ++ dorgbr.$(SUFFIX) dorghr.$(SUFFIX) dorgl2.$(SUFFIX) dorglq.$(SUFFIX) dorgql.$(SUFFIX) dorgqr.$(SUFFIX) dorgr2.$(SUFFIX) \ ++ dorgrq.$(SUFFIX) dorgtr.$(SUFFIX) dorm2l.$(SUFFIX) dorm2r.$(SUFFIX) \ ++ dormbr.$(SUFFIX) dormhr.$(SUFFIX) dorml2.$(SUFFIX) dormlq.$(SUFFIX) dormql.$(SUFFIX) dormqr.$(SUFFIX) dormr2.$(SUFFIX) \ ++ dormr3.$(SUFFIX) dormrq.$(SUFFIX) dormrz.$(SUFFIX) dormtr.$(SUFFIX) dpbcon.$(SUFFIX) dpbequ.$(SUFFIX) dpbrfs.$(SUFFIX) \ ++ dpbstf.$(SUFFIX) dpbsv.$(SUFFIX) dpbsvx.$(SUFFIX) \ ++ dpbtf2.$(SUFFIX) dpbtrf.$(SUFFIX) dpbtrs.$(SUFFIX) dpocon.$(SUFFIX) dpoequ.$(SUFFIX) dporfs.$(SUFFIX) dposv.$(SUFFIX) \ ++ dposvx.$(SUFFIX) dpotrs.$(SUFFIX) dppcon.$(SUFFIX) dppequ.$(SUFFIX) \ ++ dpprfs.$(SUFFIX) dppsv.$(SUFFIX) dppsvx.$(SUFFIX) dpptrf.$(SUFFIX) dpptri.$(SUFFIX) dpptrs.$(SUFFIX) dptcon.$(SUFFIX) \ ++ dpteqr.$(SUFFIX) dptrfs.$(SUFFIX) dptsv.$(SUFFIX) dptsvx.$(SUFFIX) dpttrs.$(SUFFIX) dptts2.$(SUFFIX) drscl.$(SUFFIX) \ ++ dsbev.$(SUFFIX) dsbevd.$(SUFFIX) dsbevx.$(SUFFIX) dsbgst.$(SUFFIX) dsbgv.$(SUFFIX) dsbgvd.$(SUFFIX) dsbgvx.$(SUFFIX) \ ++ dsbtrd.$(SUFFIX) dspcon.$(SUFFIX) dspev.$(SUFFIX) dspevd.$(SUFFIX) dspevx.$(SUFFIX) dspgst.$(SUFFIX) \ ++ dspgv.$(SUFFIX) dspgvd.$(SUFFIX) dspgvx.$(SUFFIX) dsprfs.$(SUFFIX) dspsv.$(SUFFIX) dspsvx.$(SUFFIX) dsptrd.$(SUFFIX) \ ++ dsptrf.$(SUFFIX) dsptri.$(SUFFIX) dsptrs.$(SUFFIX) dstegr.$(SUFFIX) dstein.$(SUFFIX) dstev.$(SUFFIX) dstevd.$(SUFFIX) dstevr.$(SUFFIX) \ ++ dstevx.$(SUFFIX) dsycon.$(SUFFIX) dsyev.$(SUFFIX) dsyevd.$(SUFFIX) dsyevr.$(SUFFIX) \ ++ dsyevx.$(SUFFIX) dsygs2.$(SUFFIX) dsygst.$(SUFFIX) dsygv.$(SUFFIX) dsygvd.$(SUFFIX) dsygvx.$(SUFFIX) dsyrfs.$(SUFFIX) \ ++ dsysv.$(SUFFIX) dsysvx.$(SUFFIX) \ ++ dsytd2.$(SUFFIX) dsytf2.$(SUFFIX) dsytrd.$(SUFFIX) dsytrf.$(SUFFIX) dsytri.$(SUFFIX) dsytrs.$(SUFFIX) dtbcon.$(SUFFIX) \ ++ dtbrfs.$(SUFFIX) dtbtrs.$(SUFFIX) dtgevc.$(SUFFIX) dtgex2.$(SUFFIX) dtgexc.$(SUFFIX) dtgsen.$(SUFFIX) \ ++ dtgsja.$(SUFFIX) dtgsna.$(SUFFIX) dtgsy2.$(SUFFIX) dtgsyl.$(SUFFIX) dtpcon.$(SUFFIX) dtprfs.$(SUFFIX) dtptri.$(SUFFIX) \ ++ dtptrs.$(SUFFIX) \ ++ dtrcon.$(SUFFIX) 
dtrevc.$(SUFFIX) dtrexc.$(SUFFIX) dtrrfs.$(SUFFIX) dtrsen.$(SUFFIX) dtrsna.$(SUFFIX) dtrsyl.$(SUFFIX) \ ++ dtrtrs.$(SUFFIX) dtzrqf.$(SUFFIX) dtzrzf.$(SUFFIX) dstemr.$(SUFFIX) \ ++ dsgesv.$(SUFFIX) dlag2s.$(SUFFIX) slag2d.$(SUFFIX) + + ZLASRC = \ +- zbdsqr.o zgbbrd.o zgbcon.o zgbequ.o zgbrfs.o zgbsv.o zgbsvx.o \ +- zgbtf2.o zgbtrf.o zgbtrs.o zgebak.o zgebal.o zgebd2.o zgebrd.o \ +- zgecon.o zgeequ.o zgees.o zgeesx.o zgeev.o zgeevx.o \ +- zgegs.o zgegv.o zgehd2.o zgehrd.o zgelq2.o zgelqf.o \ +- zgels.o zgelsd.o zgelss.o zgelsx.o zgelsy.o zgeql2.o zgeqlf.o zgeqp3.o \ +- zgeqpf.o zgeqr2.o zgeqrf.o zgerfs.o zgerq2.o zgerqf.o \ +- zgesc2.o zgesdd.o zgesv.o zgesvd.o zgesvx.o zgetc2.o zgetf2.o zgetrf.o \ +- zgetri.o zgetrs.o \ +- zggbak.o zggbal.o zgges.o zggesx.o zggev.o zggevx.o zggglm.o \ +- zgghrd.o zgglse.o zggqrf.o zggrqf.o \ +- zggsvd.o zggsvp.o \ +- zgtcon.o zgtrfs.o zgtsv.o zgtsvx.o zgttrf.o zgttrs.o zgtts2.o zhbev.o \ +- zhbevd.o zhbevx.o zhbgst.o zhbgv.o zhbgvd.o zhbgvx.o zhbtrd.o \ +- zhecon.o zheev.o zheevd.o zheevr.o zheevx.o zhegs2.o zhegst.o \ +- zhegv.o zhegvd.o zhegvx.o zherfs.o zhesv.o zhesvx.o zhetd2.o \ +- zhetf2.o zhetrd.o \ +- zhetrf.o zhetri.o zhetrs.o zhgeqz.o zhpcon.o zhpev.o zhpevd.o \ +- zhpevx.o zhpgst.o zhpgv.o zhpgvd.o zhpgvx.o zhprfs.o zhpsv.o \ +- zhpsvx.o \ +- zhptrd.o zhptrf.o zhptri.o zhptrs.o zhsein.o zhseqr.o zlabrd.o \ +- zlacgv.o zlacon.o zlacn2.o zlacp2.o zlacpy.o zlacrm.o zlacrt.o zladiv.o \ +- zlaed0.o zlaed7.o zlaed8.o \ +- zlaein.o zlaesy.o zlaev2.o zlags2.o zlagtm.o \ +- zlahef.o zlahqr.o \ +- zlahrd.o zlahr2.o zlaic1.o zlals0.o zlalsa.o zlalsd.o zlangb.o zlange.o \ +- zlangt.o zlanhb.o \ +- zlanhe.o \ +- zlanhp.o zlanhs.o zlanht.o zlansb.o zlansp.o zlansy.o zlantb.o \ +- zlantp.o zlantr.o zlapll.o zlapmt.o zlaqgb.o zlaqge.o \ +- zlaqhb.o zlaqhe.o zlaqhp.o zlaqp2.o zlaqps.o zlaqsb.o \ +- zlaqr0.o zlaqr1.o zlaqr2.o zlaqr3.o zlaqr4.o zlaqr5.o \ +- zlaqsp.o zlaqsy.o zlar1v.o zlar2v.o zlarcm.o zlarf.o zlarfb.o \ +- zlarfg.o zlarft.o \ +- zlarfx.o zlargv.o zlarnv.o zlarrv.o zlartg.o zlartv.o \ +- zlarz.o zlarzb.o zlarzt.o zlascl.o zlaset.o zlasr.o \ +- zlassq.o zlaswp.o zlasyf.o \ +- zlatbs.o zlatdf.o zlatps.o zlatrd.o zlatrs.o zlatrz.o zlatzm.o zlauu2.o \ +- zlauum.o zpbcon.o zpbequ.o zpbrfs.o zpbstf.o zpbsv.o \ +- zpbsvx.o zpbtf2.o zpbtrf.o zpbtrs.o zpocon.o zpoequ.o zporfs.o \ +- zposv.o zposvx.o zpotf2.o zpotrf.o zpotri.o zpotrs.o zppcon.o \ +- zppequ.o zpprfs.o zppsv.o zppsvx.o zpptrf.o zpptri.o zpptrs.o \ +- zptcon.o zpteqr.o zptrfs.o zptsv.o zptsvx.o zpttrf.o zpttrs.o zptts2.o \ +- zrot.o zspcon.o zspmv.o zspr.o zsprfs.o zspsv.o \ +- zspsvx.o zsptrf.o zsptri.o zsptrs.o zdrscl.o zstedc.o \ +- zstegr.o zstein.o zsteqr.o zsycon.o zsymv.o \ +- zsyr.o zsyrfs.o zsysv.o zsysvx.o zsytf2.o zsytrf.o zsytri.o \ +- zsytrs.o ztbcon.o ztbrfs.o ztbtrs.o ztgevc.o ztgex2.o \ +- ztgexc.o ztgsen.o ztgsja.o ztgsna.o ztgsy2.o ztgsyl.o ztpcon.o \ +- ztprfs.o ztptri.o \ +- ztptrs.o ztrcon.o ztrevc.o ztrexc.o ztrrfs.o ztrsen.o ztrsna.o \ +- ztrsyl.o ztrti2.o ztrtri.o ztrtrs.o ztzrqf.o ztzrzf.o zung2l.o \ +- zung2r.o zungbr.o zunghr.o zungl2.o zunglq.o zungql.o zungqr.o zungr2.o \ +- zungrq.o zungtr.o zunm2l.o zunm2r.o zunmbr.o zunmhr.o zunml2.o \ +- zunmlq.o zunmql.o zunmqr.o zunmr2.o zunmr3.o zunmrq.o zunmrz.o \ +- zunmtr.o zupgtr.o \ +- zupmtr.o izmax1.o dzsum1.o zstemr.o \ +- zcgesv.o zlag2c.o clag2z.o ++ zbdsqr.$(SUFFIX) zgbbrd.$(SUFFIX) zgbcon.$(SUFFIX) zgbequ.$(SUFFIX) zgbrfs.$(SUFFIX) zgbsv.$(SUFFIX) zgbsvx.$(SUFFIX) \ ++ zgbtf2.$(SUFFIX) zgbtrf.$(SUFFIX) 
zgbtrs.$(SUFFIX) zgebak.$(SUFFIX) zgebal.$(SUFFIX) zgebd2.$(SUFFIX) zgebrd.$(SUFFIX) \ ++ zgecon.$(SUFFIX) zgeequ.$(SUFFIX) zgees.$(SUFFIX) zgeesx.$(SUFFIX) zgeev.$(SUFFIX) zgeevx.$(SUFFIX) \ ++ zgegs.$(SUFFIX) zgegv.$(SUFFIX) zgehd2.$(SUFFIX) zgehrd.$(SUFFIX) zgelq2.$(SUFFIX) zgelqf.$(SUFFIX) \ ++ zgels.$(SUFFIX) zgelsd.$(SUFFIX) zgelss.$(SUFFIX) zgelsx.$(SUFFIX) zgelsy.$(SUFFIX) zgeql2.$(SUFFIX) zgeqlf.$(SUFFIX) zgeqp3.$(SUFFIX) \ ++ zgeqpf.$(SUFFIX) zgeqr2.$(SUFFIX) zgeqrf.$(SUFFIX) zgerfs.$(SUFFIX) zgerq2.$(SUFFIX) zgerqf.$(SUFFIX) \ ++ zgesc2.$(SUFFIX) zgesdd.$(SUFFIX) zgesvd.$(SUFFIX) zgesvx.$(SUFFIX) zgetc2.$(SUFFIX) \ ++ zgetri.$(SUFFIX) \ ++ zggbak.$(SUFFIX) zggbal.$(SUFFIX) zgges.$(SUFFIX) zggesx.$(SUFFIX) zggev.$(SUFFIX) zggevx.$(SUFFIX) zggglm.$(SUFFIX) \ ++ zgghrd.$(SUFFIX) zgglse.$(SUFFIX) zggqrf.$(SUFFIX) zggrqf.$(SUFFIX) \ ++ zggsvd.$(SUFFIX) zggsvp.$(SUFFIX) \ ++ zgtcon.$(SUFFIX) zgtrfs.$(SUFFIX) zgtsv.$(SUFFIX) zgtsvx.$(SUFFIX) zgttrf.$(SUFFIX) zgttrs.$(SUFFIX) zgtts2.$(SUFFIX) zhbev.$(SUFFIX) \ ++ zhbevd.$(SUFFIX) zhbevx.$(SUFFIX) zhbgst.$(SUFFIX) zhbgv.$(SUFFIX) zhbgvd.$(SUFFIX) zhbgvx.$(SUFFIX) zhbtrd.$(SUFFIX) \ ++ zhecon.$(SUFFIX) zheev.$(SUFFIX) zheevd.$(SUFFIX) zheevr.$(SUFFIX) zheevx.$(SUFFIX) zhegs2.$(SUFFIX) zhegst.$(SUFFIX) \ ++ zhegv.$(SUFFIX) zhegvd.$(SUFFIX) zhegvx.$(SUFFIX) zherfs.$(SUFFIX) zhesv.$(SUFFIX) zhesvx.$(SUFFIX) zhetd2.$(SUFFIX) \ ++ zhetf2.$(SUFFIX) zhetrd.$(SUFFIX) \ ++ zhetrf.$(SUFFIX) zhetri.$(SUFFIX) zhetrs.$(SUFFIX) zhgeqz.$(SUFFIX) zhpcon.$(SUFFIX) zhpev.$(SUFFIX) zhpevd.$(SUFFIX) \ ++ zhpevx.$(SUFFIX) zhpgst.$(SUFFIX) zhpgv.$(SUFFIX) zhpgvd.$(SUFFIX) zhpgvx.$(SUFFIX) zhprfs.$(SUFFIX) zhpsv.$(SUFFIX) \ ++ zhpsvx.$(SUFFIX) \ ++ zhptrd.$(SUFFIX) zhptrf.$(SUFFIX) zhptri.$(SUFFIX) zhptrs.$(SUFFIX) zhsein.$(SUFFIX) zhseqr.$(SUFFIX) zlabrd.$(SUFFIX) \ ++ zlacgv.$(SUFFIX) zlacon.$(SUFFIX) zlacn2.$(SUFFIX) zlacp2.$(SUFFIX) zlacpy.$(SUFFIX) zlacrm.$(SUFFIX) zlacrt.$(SUFFIX) zladiv.$(SUFFIX) \ ++ zlaed0.$(SUFFIX) zlaed7.$(SUFFIX) zlaed8.$(SUFFIX) \ ++ zlaein.$(SUFFIX) zlaesy.$(SUFFIX) zlaev2.$(SUFFIX) zlags2.$(SUFFIX) zlagtm.$(SUFFIX) \ ++ zlahef.$(SUFFIX) zlahqr.$(SUFFIX) \ ++ zlahrd.$(SUFFIX) zlahr2.$(SUFFIX) zlaic1.$(SUFFIX) zlals0.$(SUFFIX) zlalsa.$(SUFFIX) zlalsd.$(SUFFIX) zlangb.$(SUFFIX) zlange.$(SUFFIX) \ ++ zlangt.$(SUFFIX) zlanhb.$(SUFFIX) \ ++ zlanhe.$(SUFFIX) \ ++ zlanhp.$(SUFFIX) zlanhs.$(SUFFIX) zlanht.$(SUFFIX) zlansb.$(SUFFIX) zlansp.$(SUFFIX) zlansy.$(SUFFIX) zlantb.$(SUFFIX) \ ++ zlantp.$(SUFFIX) zlantr.$(SUFFIX) zlapll.$(SUFFIX) zlapmt.$(SUFFIX) zlaqgb.$(SUFFIX) zlaqge.$(SUFFIX) \ ++ zlaqhb.$(SUFFIX) zlaqhe.$(SUFFIX) zlaqhp.$(SUFFIX) zlaqp2.$(SUFFIX) zlaqps.$(SUFFIX) zlaqsb.$(SUFFIX) \ ++ zlaqr0.$(SUFFIX) zlaqr1.$(SUFFIX) zlaqr2.$(SUFFIX) zlaqr3.$(SUFFIX) zlaqr4.$(SUFFIX) zlaqr5.$(SUFFIX) \ ++ zlaqsp.$(SUFFIX) zlaqsy.$(SUFFIX) zlar1v.$(SUFFIX) zlar2v.$(SUFFIX) zlarcm.$(SUFFIX) zlarf.$(SUFFIX) zlarfb.$(SUFFIX) \ ++ zlarfg.$(SUFFIX) zlarft.$(SUFFIX) \ ++ zlarfx.$(SUFFIX) zlargv.$(SUFFIX) zlarnv.$(SUFFIX) zlarrv.$(SUFFIX) zlartg.$(SUFFIX) zlartv.$(SUFFIX) \ ++ zlarz.$(SUFFIX) zlarzb.$(SUFFIX) zlarzt.$(SUFFIX) zlascl.$(SUFFIX) zlaset.$(SUFFIX) zlasr.$(SUFFIX) \ ++ zlassq.$(SUFFIX) zlasyf.$(SUFFIX) \ ++ zlatbs.$(SUFFIX) zlatdf.$(SUFFIX) zlatps.$(SUFFIX) zlatrd.$(SUFFIX) zlatrs.$(SUFFIX) zlatrz.$(SUFFIX) zlatzm.$(SUFFIX) \ ++ zpbcon.$(SUFFIX) zpbequ.$(SUFFIX) zpbrfs.$(SUFFIX) zpbstf.$(SUFFIX) zpbsv.$(SUFFIX) \ ++ zpbsvx.$(SUFFIX) zpbtf2.$(SUFFIX) zpbtrf.$(SUFFIX) zpbtrs.$(SUFFIX) zpocon.$(SUFFIX) zpoequ.$(SUFFIX) zporfs.$(SUFFIX) \ ++ 
zposv.$(SUFFIX) zposvx.$(SUFFIX) zpotrs.$(SUFFIX) zppcon.$(SUFFIX) \ ++ zppequ.$(SUFFIX) zpprfs.$(SUFFIX) zppsv.$(SUFFIX) zppsvx.$(SUFFIX) zpptrf.$(SUFFIX) zpptri.$(SUFFIX) zpptrs.$(SUFFIX) \ ++ zptcon.$(SUFFIX) zpteqr.$(SUFFIX) zptrfs.$(SUFFIX) zptsv.$(SUFFIX) zptsvx.$(SUFFIX) zpttrf.$(SUFFIX) zpttrs.$(SUFFIX) zptts2.$(SUFFIX) \ ++ zrot.$(SUFFIX) zspcon.$(SUFFIX) zsprfs.$(SUFFIX) zspsv.$(SUFFIX) \ ++ zspsvx.$(SUFFIX) zsptrf.$(SUFFIX) zsptri.$(SUFFIX) zsptrs.$(SUFFIX) zdrscl.$(SUFFIX) zstedc.$(SUFFIX) \ ++ zstegr.$(SUFFIX) zstein.$(SUFFIX) zsteqr.$(SUFFIX) zsycon.$(SUFFIX) \ ++ zsyrfs.$(SUFFIX) zsysv.$(SUFFIX) zsysvx.$(SUFFIX) zsytf2.$(SUFFIX) zsytrf.$(SUFFIX) zsytri.$(SUFFIX) \ ++ zsytrs.$(SUFFIX) ztbcon.$(SUFFIX) ztbrfs.$(SUFFIX) ztbtrs.$(SUFFIX) ztgevc.$(SUFFIX) ztgex2.$(SUFFIX) \ ++ ztgexc.$(SUFFIX) ztgsen.$(SUFFIX) ztgsja.$(SUFFIX) ztgsna.$(SUFFIX) ztgsy2.$(SUFFIX) ztgsyl.$(SUFFIX) ztpcon.$(SUFFIX) \ ++ ztprfs.$(SUFFIX) ztptri.$(SUFFIX) \ ++ ztptrs.$(SUFFIX) ztrcon.$(SUFFIX) ztrevc.$(SUFFIX) ztrexc.$(SUFFIX) ztrrfs.$(SUFFIX) ztrsen.$(SUFFIX) ztrsna.$(SUFFIX) \ ++ ztrsyl.$(SUFFIX) ztrtrs.$(SUFFIX) ztzrqf.$(SUFFIX) ztzrzf.$(SUFFIX) zung2l.$(SUFFIX) \ ++ zung2r.$(SUFFIX) zungbr.$(SUFFIX) zunghr.$(SUFFIX) zungl2.$(SUFFIX) zunglq.$(SUFFIX) zungql.$(SUFFIX) zungqr.$(SUFFIX) zungr2.$(SUFFIX) \ ++ zungrq.$(SUFFIX) zungtr.$(SUFFIX) zunm2l.$(SUFFIX) zunm2r.$(SUFFIX) zunmbr.$(SUFFIX) zunmhr.$(SUFFIX) zunml2.$(SUFFIX) \ ++ zunmlq.$(SUFFIX) zunmql.$(SUFFIX) zunmqr.$(SUFFIX) zunmr2.$(SUFFIX) zunmr3.$(SUFFIX) zunmrq.$(SUFFIX) zunmrz.$(SUFFIX) \ ++ zunmtr.$(SUFFIX) zupgtr.$(SUFFIX) \ ++ zupmtr.$(SUFFIX) izmax1.$(SUFFIX) dzsum1.$(SUFFIX) zstemr.$(SUFFIX) \ ++ zcgesv.$(SUFFIX) zlag2c.$(SUFFIX) clag2z.$(SUFFIX) + + all: ../$(LAPACKLIB) + ++lapack_prof: ../$(LAPACKLIB_P) ++ + ALLOBJ=$(SLASRC) $(DLASRC) $(CLASRC) $(ZLASRC) $(SCLAUX) $(DZLAUX) \ + $(ALLAUX) + ++ALLOBJ_P = $(ALLOBJ:.$(SUFFIX)=.$(PSUFFIX)) ++ + ../$(LAPACKLIB): $(ALLOBJ) + $(ARCH) $(ARCHFLAGS) $@ $(ALLOBJ) + $(RANLIB) $@ + ++../$(LAPACKLIB_P): $(ALLOBJ_P) ++ $(ARCH) $(ARCHFLAGS) $@ $(ALLOBJ_P) ++ $(RANLIB) $@ ++ + single: $(SLASRC) $(ALLAUX) $(SCLAUX) + $(ARCH) $(ARCHFLAGS) ../$(LAPACKLIB) $(SLASRC) $(ALLAUX) \ + $(SCLAUX) +@@ -317,6 +325,7 @@ + $(DZLAUX) + $(RANLIB) ../$(LAPACKLIB) + ++ + $(ALLAUX): $(FRC) + $(SCLAUX): $(FRC) + $(DZLAUX): $(FRC) +@@ -329,11 +338,16 @@ + @FRC=$(FRC) + + clean: +- rm -f *.o ++ rm -f *.$(SUFFIX) *.$(PSUFFIX) + +-.f.o: ++%.$(SUFFIX): %.f + $(FORTRAN) $(OPTS) -c $< -o $@ + +-slaruv.o: slaruv.f ; $(FORTRAN) $(NOOPT) -c $< -o $@ +-dlaruv.o: dlaruv.f ; $(FORTRAN) $(NOOPT) -c $< -o $@ ++%.$(PSUFFIX): %.f ++ $(FORTRAN) $(POPTS) -c $< -o $@ ++ ++slaruv.$(SUFFIX): slaruv.f ; $(FORTRAN) $(NOOPT) -O0 -c $< -o $@ ++dlaruv.$(SUFFIX): dlaruv.f ; $(FORTRAN) $(NOOPT) -O0 -c $< -o $@ + ++slaruv.$(PSUFFIX): slaruv.f ; $(FORTRAN) $(PNOOPT) -O0 -c $< -o $@ ++dlaruv.$(PSUFFIX): dlaruv.f ; $(FORTRAN) $(PNOOPT) -O0 -c $< -o $@ +diff -ruN lapack-3.1.1.old/TESTING/EIG/Makefile lapack-3.1.1/TESTING/EIG/Makefile +--- lapack-3.1.1.old/TESTING/EIG/Makefile 2007-02-20 15:33:03.000000000 -0600 ++++ lapack-3.1.1/TESTING/EIG/Makefile 2009-12-16 14:40:35.000000000 -0600 +@@ -78,7 +78,7 @@ + cget35.o cget36.o cget37.o cget38.o cget51.o cget52.o \ + cget54.o cglmts.o cgqrts.o cgrqts.o cgsvts.o \ + chbt21.o chet21.o chet22.o chpt21.o chst01.o \ +- clarfy.o clarhs.o clatm4.o clctes.o clctsx.o clsets.o csbmv.o \ ++ clarfy.o clarhs.o clatm4.o clctes.o clctsx.o clsets.o \ + csgt01.o cslect.o \ + cstt21.o cstt22.o cunt01.o cunt03.o + +@@ -115,7 +115,7 
@@ + zget35.o zget36.o zget37.o zget38.o zget51.o zget52.o \ + zget54.o zglmts.o zgqrts.o zgrqts.o zgsvts.o \ + zhbt21.o zhet21.o zhet22.o zhpt21.o zhst01.o \ +- zlarfy.o zlarhs.o zlatm4.o zlctes.o zlctsx.o zlsets.o zsbmv.o \ ++ zlarfy.o zlarhs.o zlatm4.o zlctes.o zlctsx.o zlsets.o \ + zsgt01.o zslect.o \ + zstt21.o zstt22.o zunt01.o zunt03.o + +@@ -129,22 +129,22 @@ + ../xeigtsts: $(SEIGTST) $(SCIGTST) $(AEIGTST) ; \ + $(LOADER) $(LOADOPTS) -o $@ \ + $(SEIGTST) $(SCIGTST) $(AEIGTST) ../../$(TMGLIB) \ +- ../../$(LAPACKLIB) $(BLASLIB) ++ ../../$(LAPACKLIB) $(BLASLIB) $(CEXTRALIB) + + ../xeigtstc: $(CEIGTST) $(SCIGTST) $(AEIGTST) ; \ + $(LOADER) $(LOADOPTS) -o $@ \ + $(CEIGTST) $(SCIGTST) $(AEIGTST) ../../$(TMGLIB) \ +- ../../$(LAPACKLIB) $(BLASLIB) ++ ../../$(LAPACKLIB) $(BLASLIB) $(CEXTRALIB) + + ../xeigtstd: $(DEIGTST) $(DZIGTST) $(AEIGTST) ; \ + $(LOADER) $(LOADOPTS) -o $@ \ + $(DEIGTST) $(DZIGTST) $(AEIGTST) ../../$(TMGLIB) \ +- ../../$(LAPACKLIB) $(BLASLIB) ++ ../../$(LAPACKLIB) $(BLASLIB) $(CEXTRALIB) + + ../xeigtstz: $(ZEIGTST) $(DZIGTST) $(AEIGTST) ; \ + $(LOADER) $(LOADOPTS) -o $@ \ + $(ZEIGTST) $(DZIGTST) $(AEIGTST) ../../$(TMGLIB) \ +- ../../$(LAPACKLIB) $(BLASLIB) ++ ../../$(LAPACKLIB) $(BLASLIB) $(CEXTRALIB) + + $(AEIGTST): $(FRC) + $(SCIGTST): $(FRC) +diff -ruN lapack-3.1.1.old/TESTING/LIN/Makefile lapack-3.1.1/TESTING/LIN/Makefile +--- lapack-3.1.1.old/TESTING/LIN/Makefile 2007-02-20 15:33:03.000000000 -0600 ++++ lapack-3.1.1/TESTING/LIN/Makefile 2009-12-16 14:40:35.000000000 -0600 +@@ -97,7 +97,7 @@ + cqpt01.o cqrt01.o cqrt02.o cqrt03.o cqrt11.o \ + cqrt12.o cqrt13.o cqrt14.o cqrt15.o cqrt16.o \ + cqrt17.o crqt01.o crqt02.o crqt03.o crzt01.o crzt02.o \ +- csbmv.o cspt01.o \ ++ cspt01.o \ + cspt02.o cspt03.o csyt01.o csyt02.o csyt03.o \ + ctbt02.o ctbt03.o ctbt05.o ctbt06.o ctpt01.o \ + ctpt02.o ctpt03.o ctpt05.o ctpt06.o ctrt01.o \ +@@ -159,7 +159,7 @@ + zqpt01.o zqrt01.o zqrt02.o zqrt03.o zqrt11.o \ + zqrt12.o zqrt13.o zqrt14.o zqrt15.o zqrt16.o \ + zqrt17.o zrqt01.o zrqt02.o zrqt03.o zrzt01.o zrzt02.o \ +- zsbmv.o zspt01.o \ ++ zspt01.o \ + zspt02.o zspt03.o zsyt01.o zsyt02.o zsyt03.o \ + ztbt02.o ztbt03.o ztbt05.o ztbt06.o ztpt01.o \ + ztpt02.o ztpt03.o ztpt05.o ztpt06.o ztrt01.o \ +@@ -176,7 +176,7 @@ + zdrvab.o zerrab.o zget08.o \ + alaerh.o alahd.o aladhd.o alareq.o \ + chkxer.o zget02.o zlarhs.o zlatb4.o \ +- zsbmv.o xerbla.o ++ xerbla.o + + all: single double complex complex16 proto-double proto-complex16 + +@@ -190,27 +190,27 @@ + + ../xlintsts : $(ALINTST) $(SLINTST) $(SCLNTST) + $(LOADER) $(LOADOPTS) $(ALINTST) $(SCLNTST) $(SLINTST) \ +- ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ ++ ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ $(CEXTRALIB) + + ../xlintstc : $(ALINTST) $(CLINTST) $(SCLNTST) + $(LOADER) $(LOADOPTS) $(ALINTST) $(SCLNTST) $(CLINTST) \ +- ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ ++ ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ $(CEXTRALIB) + + ../xlintstd : $(ALINTST) $(DLINTST) $(DZLNTST) + $(LOADER) $(LOADOPTS) $(ALINTST) $(DZLNTST) $(DLINTST) \ +- ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ ++ ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ $(CEXTRALIB) + + ../xlintstz : $(ALINTST) $(ZLINTST) $(DZLNTST) + $(LOADER) $(LOADOPTS) $(ALINTST) $(DZLNTST) $(ZLINTST) \ +- ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ ++ ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ $(CEXTRALIB) + + ../xlintstds : $(DSLINTST) + $(LOADER) $(LOADOPTS) $(DSLINTST) \ +- ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ 
++ ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ $(CEXTRALIB) + + ../xlintstzc : $(ZCLINTST) + $(LOADER) $(LOADOPTS) $(ZCLINTST) \ +- ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ ++ ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB) -o $@ $(CEXTRALIB) + + $(ALINTST): $(FRC) + $(SCLNTST): $(FRC) diff --git a/quickbuild.32bit b/quickbuild.32bit new file mode 100644 index 0000000000..b1b548aa15 --- /dev/null +++ b/quickbuild.32bit @@ -0,0 +1,3 @@ +#!/bin/bash + +make -j 2 BINARY=32 diff --git a/quickbuild.64bit b/quickbuild.64bit new file mode 100644 index 0000000000..fd313df845 --- /dev/null +++ b/quickbuild.64bit @@ -0,0 +1,3 @@ +#!/bin/bash + +make BINARY=64 diff --git a/quickbuild.win32 b/quickbuild.win32 new file mode 100644 index 0000000000..29949c1920 --- /dev/null +++ b/quickbuild.win32 @@ -0,0 +1,3 @@ +#!/bin/bash + +make BINARY=32 CC=gcc FC=gfortran diff --git a/quickbuild.win64 b/quickbuild.win64 new file mode 100644 index 0000000000..88f748a8d2 --- /dev/null +++ b/quickbuild.win64 @@ -0,0 +1,3 @@ +#!/bin/bash + +make BINARY=64 CC=x86_64-w64-mingw32-gcc FC=x86_64-w64-mingw32-gfortran diff --git a/reference/LICENSE b/reference/LICENSE new file mode 100644 index 0000000000..85061f29fe --- /dev/null +++ b/reference/LICENSE @@ -0,0 +1,23 @@ +This directory contains the reference implementation of BLAS +which is obtainable at: http://netlib.org/blas/ + +The license, obtained from http://netlib.org/blas/faq.html#2 on November 3, +2010, is as follows: + +2) Are there legal restrictions on the use of BLAS reference implementation +software? + +The reference BLAS is a freely-available software package. It is available from +netlib via anonymous ftp and the World Wide Web. Thus, it can be included in +commercial software packages (and has been). We only ask that proper credit be +given to the authors. + +Like all software, it is copyrighted. It is not trademarked, but we do ask the +following: + +If you modify the source for these routines we ask that you change the name of +the routine and comment the changes made to the original. + +We will gladly answer any questions regarding the software. If a modification +is done, however, it is the responsibility of the person who modified the +routine to provide support. diff --git a/reference/Makefile b/reference/Makefile new file mode 100644 index 0000000000..6cbde28ef0 --- /dev/null +++ b/reference/Makefile @@ -0,0 +1,176 @@ +TOPDIR = .. 
+include $(TOPDIR)/Makefile.system + +ifeq ($(ARCH), x86) +SUPPORT_GEMM3M = 1 +endif + +ifeq ($(ARCH), x86_64) +SUPPORT_GEMM3M = 1 +endif + +ifeq ($(ARCH), ia64) +SUPPORT_GEMM3M = 1 +endif + +ifeq ($(ARCH), MIPS) +SUPPORT_GEMM3M = 1 +endif + +SBLAS1OBJS = \ + saxpyf.$(SUFFIX) sswapf.$(SUFFIX) \ + scopyf.$(SUFFIX) sscalf.$(SUFFIX) \ + sdotf.$(SUFFIX) sdsdotf.$(SUFFIX) dsdotf.$(SUFFIX) \ + sasumf.$(SUFFIX) snrm2f.$(SUFFIX) \ + smaxf.$(SUFFIX) samaxf.$(SUFFIX) ismaxf.$(SUFFIX) isamaxf.$(SUFFIX) \ + sminf.$(SUFFIX) saminf.$(SUFFIX) isminf.$(SUFFIX) isaminf.$(SUFFIX) \ + srotf.$(SUFFIX) srotgf.$(SUFFIX) srotmf.$(SUFFIX) srotmgf.$(SUFFIX) \ + +SBLAS2OBJS = \ + sgemvf.$(SUFFIX) sgerf.$(SUFFIX) \ + strsvf.$(SUFFIX) strmvf.$(SUFFIX) ssymvf.$(SUFFIX) \ + ssyrf.$(SUFFIX) ssyr2f.$(SUFFIX) sgbmvf.$(SUFFIX) \ + ssbmvf.$(SUFFIX) sspmvf.$(SUFFIX) \ + ssprf.$(SUFFIX) sspr2f.$(SUFFIX) \ + stbsvf.$(SUFFIX) stbmvf.$(SUFFIX) \ + stpsvf.$(SUFFIX) stpmvf.$(SUFFIX) + +SBLAS3OBJS = \ + sgemmf.$(SUFFIX) ssymmf.$(SUFFIX) strmmf.$(SUFFIX) \ + strsmf.$(SUFFIX) ssyrkf.$(SUFFIX) ssyr2kf.$(SUFFIX) + +DBLAS1OBJS = \ + daxpyf.$(SUFFIX) dswapf.$(SUFFIX) \ + dcopyf.$(SUFFIX) dscalf.$(SUFFIX) \ + ddotf.$(SUFFIX) \ + dasumf.$(SUFFIX) dnrm2f.$(SUFFIX) \ + dmaxf.$(SUFFIX) damaxf.$(SUFFIX) idmaxf.$(SUFFIX) idamaxf.$(SUFFIX) \ + dminf.$(SUFFIX) daminf.$(SUFFIX) idminf.$(SUFFIX) idaminf.$(SUFFIX) \ + drotf.$(SUFFIX) drotgf.$(SUFFIX) drotmf.$(SUFFIX) drotmgf.$(SUFFIX) \ + +DBLAS2OBJS = \ + dgemvf.$(SUFFIX) dgerf.$(SUFFIX) \ + dtrsvf.$(SUFFIX) dtrmvf.$(SUFFIX) dsymvf.$(SUFFIX) \ + dsyrf.$(SUFFIX) dsyr2f.$(SUFFIX) dgbmvf.$(SUFFIX) \ + dsbmvf.$(SUFFIX) dspmvf.$(SUFFIX) \ + dsprf.$(SUFFIX) dspr2f.$(SUFFIX) \ + dtbsvf.$(SUFFIX) dtbmvf.$(SUFFIX) \ + dtpsvf.$(SUFFIX) dtpmvf.$(SUFFIX) + +DBLAS3OBJS = \ + dgemmf.$(SUFFIX) dsymmf.$(SUFFIX) dtrmmf.$(SUFFIX) \ + dtrsmf.$(SUFFIX) dsyrkf.$(SUFFIX) dsyr2kf.$(SUFFIX) + +CBLAS1OBJS = \ + caxpyf.$(SUFFIX) caxpycf.$(SUFFIX) cswapf.$(SUFFIX) \ + ccopyf.$(SUFFIX) cscalf.$(SUFFIX) csscalf.$(SUFFIX) \ + cdotcf.$(SUFFIX) cdotuf.$(SUFFIX) \ + scasumf.$(SUFFIX) scnrm2f.$(SUFFIX) \ + scamaxf.$(SUFFIX) icamaxf.$(SUFFIX) \ + scaminf.$(SUFFIX) icaminf.$(SUFFIX) \ + csrotf.$(SUFFIX) crotgf.$(SUFFIX) \ + +CBLAS2OBJS = \ + cgemvf.$(SUFFIX) cgeruf.$(SUFFIX) cgercf.$(SUFFIX) \ + ctrsvf.$(SUFFIX) ctrmvf.$(SUFFIX) csymvf.$(SUFFIX) \ + csyrf.$(SUFFIX) csyr2f.$(SUFFIX) cgbmvf.$(SUFFIX) \ + csbmvf.$(SUFFIX) cspmvf.$(SUFFIX) \ + csprf.$(SUFFIX) cspr2f.$(SUFFIX) \ + ctbsvf.$(SUFFIX) ctbmvf.$(SUFFIX) \ + ctpsvf.$(SUFFIX) ctpmvf.$(SUFFIX) \ + chemvf.$(SUFFIX) chbmvf.$(SUFFIX) \ + cherf.$(SUFFIX) cher2f.$(SUFFIX) \ + chpmvf.$(SUFFIX) chprf.$(SUFFIX) chpr2f.$(SUFFIX) + +CBLAS3OBJS = \ + cgemmf.$(SUFFIX) csymmf.$(SUFFIX) ctrmmf.$(SUFFIX) \ + ctrsmf.$(SUFFIX) csyrkf.$(SUFFIX) csyr2kf.$(SUFFIX) \ + chemmf.$(SUFFIX) cherkf.$(SUFFIX) cher2kf.$(SUFFIX) + +ZBLAS1OBJS = \ + zaxpyf.$(SUFFIX) zaxpycf.$(SUFFIX) zswapf.$(SUFFIX) \ + zcopyf.$(SUFFIX) zscalf.$(SUFFIX) zdscalf.$(SUFFIX) \ + zdotcf.$(SUFFIX) zdotuf.$(SUFFIX) \ + dzasumf.$(SUFFIX) dznrm2f.$(SUFFIX) \ + dzamaxf.$(SUFFIX) izamaxf.$(SUFFIX) \ + dzaminf.$(SUFFIX) izaminf.$(SUFFIX) \ + zdrotf.$(SUFFIX) zrotgf.$(SUFFIX) \ + +ZBLAS2OBJS = \ + zgemvf.$(SUFFIX) zgeruf.$(SUFFIX) zgercf.$(SUFFIX) \ + ztrsvf.$(SUFFIX) ztrmvf.$(SUFFIX) zsymvf.$(SUFFIX) \ + zsyrf.$(SUFFIX) zsyr2f.$(SUFFIX) zgbmvf.$(SUFFIX) \ + zsbmvf.$(SUFFIX) zspmvf.$(SUFFIX) \ + zsprf.$(SUFFIX) zspr2f.$(SUFFIX) \ + ztbsvf.$(SUFFIX) ztbmvf.$(SUFFIX) \ + ztpsvf.$(SUFFIX) ztpmvf.$(SUFFIX) \ + zhemvf.$(SUFFIX) zhbmvf.$(SUFFIX) 
\ + zherf.$(SUFFIX) zher2f.$(SUFFIX) \ + zhpmvf.$(SUFFIX) zhprf.$(SUFFIX) zhpr2f.$(SUFFIX) + +ZBLAS3OBJS = \ + zgemmf.$(SUFFIX) zsymmf.$(SUFFIX) ztrmmf.$(SUFFIX) \ + ztrsmf.$(SUFFIX) zsyrkf.$(SUFFIX) zsyr2kf.$(SUFFIX) \ + zhemmf.$(SUFFIX) zherkf.$(SUFFIX) zher2kf.$(SUFFIX) + +ifdef SUPPORT_GEMM3M + +CBLAS3OBJS += cgemm3mf.$(SUFFIX) csymm3mf.$(SUFFIX) chemm3mf.$(SUFFIX) + +ZBLAS3OBJS += zgemm3mf.$(SUFFIX) zsymm3mf.$(SUFFIX) zhemm3mf.$(SUFFIX) + +endif + +SBLASOBJS = $(SBLAS1OBJS) $(SBLAS2OBJS) $(SBLAS3OBJS) +DBLASOBJS = $(DBLAS1OBJS) $(DBLAS2OBJS) $(DBLAS3OBJS) +QBLASOBJS = $(QBLAS1OBJS) $(QBLAS2OBJS) $(QBLAS3OBJS) +CBLASOBJS = $(CBLAS1OBJS) $(CBLAS2OBJS) $(CBLAS3OBJS) +ZBLASOBJS = $(ZBLAS1OBJS) $(ZBLAS2OBJS) $(ZBLAS3OBJS) +XBLASOBJS = $(XBLAS1OBJS) $(XBLAS2OBJS) $(XBLAS3OBJS) + +SBLASOBJS += \ + sgetf2f.$(SUFFIX) sgetrff.$(SUFFIX) slauu2f.$(SUFFIX) slauumf.$(SUFFIX) \ + spotf2f.$(SUFFIX) spotrff.$(SUFFIX) strti2f.$(SUFFIX) strtrif.$(SUFFIX) \ + slaswpf.$(SUFFIX) sgetrsf.$(SUFFIX) sgesvf.$(SUFFIX) spotrif.$(SUFFIX) \ + +DBLASOBJS += \ + dgetf2f.$(SUFFIX) dgetrff.$(SUFFIX) dlauu2f.$(SUFFIX) dlauumf.$(SUFFIX) \ + dpotf2f.$(SUFFIX) dpotrff.$(SUFFIX) dtrti2f.$(SUFFIX) dtrtrif.$(SUFFIX) \ + dlaswpf.$(SUFFIX) dgetrsf.$(SUFFIX) dgesvf.$(SUFFIX) dpotrif.$(SUFFIX) \ + +QBLASOBJS += \ + qgetf2f.$(SUFFIX) qgetrff.$(SUFFIX) qlauu2f.$(SUFFIX) qlauumf.$(SUFFIX) \ + qpotf2f.$(SUFFIX) qpotrff.$(SUFFIX) qtrti2f.$(SUFFIX) qtrtrif.$(SUFFIX) \ + qlaswpf.$(SUFFIX) qgetrsf.$(SUFFIX) qgesvf.$(SUFFIX) qpotrif.$(SUFFIX) \ + +CBLASOBJS += \ + cgetf2f.$(SUFFIX) cgetrff.$(SUFFIX) clauu2f.$(SUFFIX) clauumf.$(SUFFIX) \ + cpotf2f.$(SUFFIX) cpotrff.$(SUFFIX) ctrti2f.$(SUFFIX) ctrtrif.$(SUFFIX) \ + claswpf.$(SUFFIX) cgetrsf.$(SUFFIX) cgesvf.$(SUFFIX) cpotrif.$(SUFFIX) \ + +ZBLASOBJS += \ + zgetf2f.$(SUFFIX) zgetrff.$(SUFFIX) zlauu2f.$(SUFFIX) zlauumf.$(SUFFIX) \ + zpotf2f.$(SUFFIX) zpotrff.$(SUFFIX) ztrti2f.$(SUFFIX) ztrtrif.$(SUFFIX) \ + zlaswpf.$(SUFFIX) zgetrsf.$(SUFFIX) zgesvf.$(SUFFIX) zpotrif.$(SUFFIX) \ + +XBLASOBJS += \ + xgetf2f.$(SUFFIX) xgetrff.$(SUFFIX) xlauu2f.$(SUFFIX) xlauumf.$(SUFFIX) \ + xpotf2f.$(SUFFIX) xpotrff.$(SUFFIX) xtrti2f.$(SUFFIX) xtrtrif.$(SUFFIX) \ + xlaswpf.$(SUFFIX) xgetrsf.$(SUFFIX) xgesvf.$(SUFFIX) xpotrif.$(SUFFIX) \ + + +include $(TOPDIR)/Makefile.tail + +all :: libs + +clean :: + +level1 : $(SBLAS1OBJS) $(DBLAS1OBJS) $(QBLAS1OBJS) $(CBLAS1OBJS) $(ZBLAS1OBJS) $(XBLAS1OBJS) + $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ + +level2 : $(SBLAS2OBJS) $(DBLAS2OBJS) $(QBLAS2OBJS) $(CBLAS2OBJS) $(ZBLAS2OBJS) $(XBLAS2OBJS) + $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ + +level3 : $(SBLAS3OBJS) $(DBLAS3OBJS) $(QBLAS3OBJS) $(CBLAS3OBJS) $(ZBLAS3OBJS) $(XBLAS3OBJS) + $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ + diff --git a/reference/caxpycf.f b/reference/caxpycf.f new file mode 100644 index 0000000000..092c8c12e7 --- /dev/null +++ b/reference/caxpycf.f @@ -0,0 +1,35 @@ + subroutine caxpycf(n,ca,cx,incx,cy,incy) +c +c constant times a vector plus a vector. +c jack dongarra, linpack, 3/11/78. +c modified 12/3/93, array(1) declarations changed to array(*) +c + complex cx(*),cy(*),ca + integer i,incx,incy,ix,iy,n + INTRINSIC conjg +c + if(n.le.0)return + if (abs(real(ca)) + abs(aimag(ca)) .eq. 
0.0 ) return + if(incx.eq.1.and.incy.eq.1)go to 20 +c +c code for unequal increments or equal increments +c not equal to 1 +c + ix = 1 + iy = 1 + if(incx.lt.0)ix = (-n+1)*incx + 1 + if(incy.lt.0)iy = (-n+1)*incy + 1 + do 10 i = 1,n + cy(iy) = cy(iy) + ca*conjg(cx(ix)) + ix = ix + incx + iy = iy + incy + 10 continue + return +c +c code for both increments equal to 1 +c + 20 do 30 i = 1,n + cy(i) = cy(i) + ca*conjg(cx(i)) + 30 continue + return + end diff --git a/reference/caxpyf.f b/reference/caxpyf.f new file mode 100644 index 0000000000..554f71d1bd --- /dev/null +++ b/reference/caxpyf.f @@ -0,0 +1,34 @@ + subroutine caxpyf(n,ca,cx,incx,cy,incy) +c +c constant times a vector plus a vector. +c jack dongarra, linpack, 3/11/78. +c modified 12/3/93, array(1) declarations changed to array(*) +c + complex cx(*),cy(*),ca + integer i,incx,incy,ix,iy,n +c + if(n.le.0)return + if (abs(real(ca)) + abs(aimag(ca)) .eq. 0.0 ) return + if(incx.eq.1.and.incy.eq.1)go to 20 +c +c code for unequal increments or equal increments +c not equal to 1 +c + ix = 1 + iy = 1 + if(incx.lt.0)ix = (-n+1)*incx + 1 + if(incy.lt.0)iy = (-n+1)*incy + 1 + do 10 i = 1,n + cy(iy) = cy(iy) + ca*cx(ix) + ix = ix + incx + iy = iy + incy + 10 continue + return +c +c code for both increments equal to 1 +c + 20 do 30 i = 1,n + cy(i) = cy(i) + ca*cx(i) + 30 continue + return + end diff --git a/reference/ccopyf.f b/reference/ccopyf.f new file mode 100644 index 0000000000..2a33255535 --- /dev/null +++ b/reference/ccopyf.f @@ -0,0 +1,33 @@ + subroutine ccopyf(n,cx,incx,cy,incy) +c +c copies a vector, x, to a vector, y. +c jack dongarra, linpack, 3/11/78. +c modified 12/3/93, array(1) declarations changed to array(*) +c + complex cx(*),cy(*) + integer i,incx,incy,ix,iy,n +c + if(n.le.0)return + if(incx.eq.1.and.incy.eq.1)go to 20 +c +c code for unequal increments or equal increments +c not equal to 1 +c + ix = 1 + iy = 1 + if(incx.lt.0)ix = (-n+1)*incx + 1 + if(incy.lt.0)iy = (-n+1)*incy + 1 + do 10 i = 1,n + cy(iy) = cx(ix) + ix = ix + incx + iy = iy + incy + 10 continue + return +c +c code for both increments equal to 1 +c + 20 do 30 i = 1,n + cy(i) = cx(i) + 30 continue + return + end diff --git a/reference/cdotcf.f b/reference/cdotcf.f new file mode 100644 index 0000000000..79aa39c8a3 --- /dev/null +++ b/reference/cdotcf.f @@ -0,0 +1,38 @@ + complex function cdotcf(n,cx,incx,cy,incy) +c +c forms the dot product of two vectors, conjugating the first +c vector. +c jack dongarra, linpack, 3/11/78. +c modified 12/3/93, array(1) declarations changed to array(*) +c + complex cx(*),cy(*),ctemp + integer i,incx,incy,ix,iy,n +c + ctemp = (0.0,0.0) + cdotcf = (0.0,0.0) + if(n.le.0)return + if(incx.eq.1.and.incy.eq.1)go to 20 +c +c code for unequal increments or equal increments +c not equal to 1 +c + ix = 1 + iy = 1 + if(incx.lt.0)ix = (-n+1)*incx + 1 + if(incy.lt.0)iy = (-n+1)*incy + 1 + do 10 i = 1,n + ctemp = ctemp + conjg(cx(ix))*cy(iy) + ix = ix + incx + iy = iy + incy + 10 continue + cdotcf = ctemp + return +c +c code for both increments equal to 1 +c + 20 do 30 i = 1,n + ctemp = ctemp + conjg(cx(i))*cy(i) + 30 continue + cdotcf = ctemp + return + end diff --git a/reference/cdotuf.f b/reference/cdotuf.f new file mode 100644 index 0000000000..bf93390a33 --- /dev/null +++ b/reference/cdotuf.f @@ -0,0 +1,37 @@ + complex function cdotuf(n,cx,incx,cy,incy) +c +c forms the dot product of two vectors. +c jack dongarra, linpack, 3/11/78. 
+c modified 12/3/93, array(1) declarations changed to array(*) +c + complex cx(*),cy(*),ctemp + integer i,incx,incy,ix,iy,n +c + ctemp = (0.0,0.0) + cdotuf = (0.0,0.0) + if(n.le.0)return + if(incx.eq.1.and.incy.eq.1)go to 20 +c +c code for unequal increments or equal increments +c not equal to 1 +c + ix = 1 + iy = 1 + if(incx.lt.0)ix = (-n+1)*incx + 1 + if(incy.lt.0)iy = (-n+1)*incy + 1 + do 10 i = 1,n + ctemp = ctemp + cx(ix)*cy(iy) + ix = ix + incx + iy = iy + incy + 10 continue + cdotuf = ctemp + return +c +c code for both increments equal to 1 +c + 20 do 30 i = 1,n + ctemp = ctemp + cx(i)*cy(i) + 30 continue + cdotuf = ctemp + return + end diff --git a/reference/cgbmvf.f b/reference/cgbmvf.f new file mode 100644 index 0000000000..27ce62cb54 --- /dev/null +++ b/reference/cgbmvf.f @@ -0,0 +1,450 @@ + SUBROUTINE CGBMVF( TRANS, M, N, KL, KU, ALPHA, A, LDA, X, INCX, + $ BETA, Y, INCY ) +* .. Scalar Arguments .. + COMPLEX ALPHA, BETA + INTEGER INCX, INCY, KL, KU, LDA, M, N + CHARACTER*1 TRANS +* .. Array Arguments .. + COMPLEX A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* ZGBMV performs one of the matrix-vector operations +* +* y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y, or +* +* y := alpha*conjg( A' )*x + beta*y, +* +* where alpha and beta are scalars, x and y are vectors and A is an +* m by n band matrix, with kl sub-diagonals and ku super-diagonals. +* +* Parameters +* ========== +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' y := alpha*A*x + beta*y. +* +* TRANS = 'T' or 't' y := alpha*A'*x + beta*y. +* +* TRANS = 'C' or 'c' y := alpha*conjg( A' )*x + beta*y. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix A. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* KL - INTEGER. +* On entry, KL specifies the number of sub-diagonals of the +* matrix A. KL must satisfy 0 .le. KL. +* Unchanged on exit. +* +* KU - INTEGER. +* On entry, KU specifies the number of super-diagonals of the +* matrix A. KU must satisfy 0 .le. KU. +* Unchanged on exit. +* +* ALPHA - COMPLEX*16 . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX*16 array of DIMENSION ( LDA, n ). +* Before entry, the leading ( kl + ku + 1 ) by n part of the +* array A must contain the matrix of coefficients, supplied +* column by column, with the leading diagonal of the matrix in +* row ( ku + 1 ) of the array, the first super-diagonal +* starting at position 2 in row ku, the first sub-diagonal +* starting at position 1 in row ( ku + 2 ), and so on. +* Elements in the array A that do not correspond to elements +* in the band matrix (such as the top left ku by ku triangle) +* are not referenced. +* The following program segment will transfer a band matrix +* from conventional full matrix storage to band storage: +* +* DO 20, J = 1, N +* K = KU + 1 - J +* DO 10, I = MAX( 1, J - KU ), MIN( M, J + KL ) +* A( K + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* ( kl + ku + 1 ). +* Unchanged on exit. 
+* +* X - COMPLEX*16 array of DIMENSION at least +* ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' +* and at least +* ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. +* Before entry, the incremented array X must contain the +* vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA - COMPLEX*16 . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then Y need not be set on input. +* Unchanged on exit. +* +* Y - COMPLEX*16 array of DIMENSION at least +* ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' +* and at least +* ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. +* Before entry, the incremented array Y must contain the +* vector y. On exit, Y is overwritten by the updated vector y. +* +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX*16 ONE + PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. Local Scalars .. + COMPLEX*16 TEMP + INTEGER I, INFO, IX, IY, J, JX, JY, K, KUP1, KX, KY, + $ LENX, LENY + LOGICAL NOCONJ, NOTRANS, XCONJ +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC CONJG, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. + $ .NOT.LSAME( TRANS, 'R' ).AND. + $ .NOT.LSAME( TRANS, 'C' ).AND. + $ .NOT.LSAME( TRANS, 'O' ).AND. + $ .NOT.LSAME( TRANS, 'U' ).AND. + $ .NOT.LSAME( TRANS, 'S' ).AND. + $ .NOT.LSAME( TRANS, 'D' ) )THEN + INFO = 1 + ELSE IF( M.LT.0 )THEN + INFO = 2 + ELSE IF( N.LT.0 )THEN + INFO = 3 + ELSE IF( KL.LT.0 )THEN + INFO = 4 + ELSE IF( KU.LT.0 )THEN + INFO = 5 + ELSE IF( LDA.LT.( KL + KU + 1 ) )THEN + INFO = 8 + ELSE IF( INCX.EQ.0 )THEN + INFO = 10 + ELSE IF( INCY.EQ.0 )THEN + INFO = 13 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'ZGBMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. + $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* + NOCONJ = (LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) + $ .OR. LSAME( TRANS, 'O' ) .OR. LSAME( TRANS, 'U' )) + + NOTRANS = (LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'R' ) + $ .OR. LSAME( TRANS, 'O' ) .OR. LSAME( TRANS, 'S' )) + + XCONJ = (LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) + $ .OR. LSAME( TRANS, 'R' ) .OR. LSAME( TRANS, 'C' )) +* +* Set LENX and LENY, the lengths of the vectors x and y, and set +* up the start points in X and Y. +* + IF(NOTRANS)THEN + LENX = N + LENY = M + ELSE + LENX = M + LENY = N + END IF + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( LENX - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( LENY - 1 )*INCY + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through the band part of A. +* +* First form y := beta*y. 
+* + IF( BETA.NE.ONE )THEN + IF( INCY.EQ.1 )THEN + IF( BETA.EQ.ZERO )THEN + DO 10, I = 1, LENY + Y( I ) = ZERO + 10 CONTINUE + ELSE + DO 20, I = 1, LENY + Y( I ) = BETA*Y( I ) + 20 CONTINUE + END IF + ELSE + IY = KY + IF( BETA.EQ.ZERO )THEN + DO 30, I = 1, LENY + Y( IY ) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40, I = 1, LENY + Y( IY ) = BETA*Y( IY ) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF( ALPHA.EQ.ZERO ) + $ RETURN + + KUP1 = KU + 1 + + IF(XCONJ)THEN + + IF(NOTRANS)THEN +* +* Form y := alpha*A*x + y. +* + JX = KX + IF( INCY.EQ.1 )THEN + DO 60, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*X( JX ) + K = KUP1 - J + IF( NOCONJ )THEN + DO 50, I = MAX( 1, J - KU ), MIN( M, J + KL ) + Y( I ) = Y( I ) + TEMP*A( K + I, J ) + 50 CONTINUE + ELSE + DO 55, I = MAX( 1, J - KU ), MIN( M, J + KL ) + Y( I ) = Y( I ) + TEMP*CONJG(A( K + I, J )) + 55 CONTINUE + END IF + + END IF + JX = JX + INCX + 60 CONTINUE + ELSE + DO 80, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*X( JX ) + IY = KY + K = KUP1 - J + IF( NOCONJ )THEN + DO 70, I = MAX( 1, J - KU ), MIN( M, J + KL ) + Y( IY ) = Y( IY ) + TEMP*A( K + I, J ) + IY = IY + INCY + 70 CONTINUE + ELSE + DO 75, I = MAX( 1, J - KU ), MIN( M, J + KL ) + Y( IY ) = Y( IY ) + TEMP*CONJG(A( K + I, J )) + IY = IY + INCY + 75 CONTINUE + END IF + + END IF + JX = JX + INCX + IF( J.GT.KU ) + $ KY = KY + INCY + 80 CONTINUE + END IF + ELSE +* +* Form y := alpha*A'*x + y or y := alpha*conjg( A' )*x + y. +* + JY = KY + IF( INCX.EQ.1 )THEN + DO 110, J = 1, N + TEMP = ZERO + K = KUP1 - J + IF( NOCONJ )THEN + DO 90, I = MAX( 1, J - KU ), MIN( M, J + KL ) + TEMP = TEMP + A( K + I, J )*X( I ) + 90 CONTINUE + ELSE + DO 100, I = MAX( 1, J - KU ), MIN( M, J + KL ) + TEMP = TEMP + CONJG( A( K + I, J ) )*X( I ) + 100 CONTINUE + END IF + Y( JY ) = Y( JY ) + ALPHA*TEMP + JY = JY + INCY + 110 CONTINUE + ELSE + DO 140, J = 1, N + TEMP = ZERO + IX = KX + K = KUP1 - J + IF( NOCONJ )THEN + DO 120, I = MAX( 1, J - KU ), MIN( M, J + KL ) + TEMP = TEMP + A( K + I, J )*X( IX ) + IX = IX + INCX + 120 CONTINUE + ELSE + DO 130, I = MAX( 1, J - KU ), MIN( M, J + KL ) + TEMP = TEMP + CONJG( A( K + I, J ) )*X( IX ) + IX = IX + INCX + 130 CONTINUE + END IF + Y( JY ) = Y( JY ) + ALPHA*TEMP + JY = JY + INCY + IF( J.GT.KU ) + $ KX = KX + INCX + 140 CONTINUE + END IF + END IF + + ELSE + + IF(NOTRANS)THEN +* +* Form y := alpha*A*x + y. +* + JX = KX + IF( INCY.EQ.1 )THEN + DO 160, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*CONJG(X( JX )) + K = KUP1 - J + IF( NOCONJ )THEN + DO 150, I = MAX( 1, J - KU ), MIN( M, J + KL ) + Y( I ) = Y( I ) + TEMP*A( K + I, J ) + 150 CONTINUE + ELSE + DO 155, I = MAX( 1, J - KU ), MIN( M, J + KL ) + Y( I ) = Y( I ) + TEMP*CONJG(A( K + I, J )) + 155 CONTINUE + END IF + + END IF + JX = JX + INCX + 160 CONTINUE + ELSE + DO 180, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*CONJG(X( JX )) + IY = KY + K = KUP1 - J + IF( NOCONJ )THEN + DO 170, I = MAX( 1, J - KU ), MIN( M, J + KL ) + Y( IY ) = Y( IY ) + TEMP*A( K + I, J ) + IY = IY + INCY + 170 CONTINUE + ELSE + DO 175, I = MAX( 1, J - KU ), MIN( M, J + KL ) + Y( IY ) = Y( IY ) + TEMP*CONJG(A( K + I, J )) + IY = IY + INCY + 175 CONTINUE + END IF + + END IF + JX = JX + INCX + IF( J.GT.KU ) + $ KY = KY + INCY + 180 CONTINUE + END IF + ELSE +* +* Form y := alpha*A'*x + y or y := alpha*conjg( A' )*x + y. 
+* + JY = KY + IF( INCX.EQ.1 )THEN + DO 210, J = 1, N + TEMP = ZERO + K = KUP1 - J + IF( NOCONJ )THEN + DO 190, I = MAX( 1, J - KU ), MIN( M, J + KL ) + TEMP = TEMP + A( K + I, J )*CONJG(X( I )) + 190 CONTINUE + ELSE + DO 200, I = MAX( 1, J - KU ), MIN( M, J + KL ) + TEMP = TEMP + CONJG( A( K + I, J ) )*CONJG(X( I )) + 200 CONTINUE + END IF + Y( JY ) = Y( JY ) + ALPHA*TEMP + JY = JY + INCY + 210 CONTINUE + ELSE + DO 240, J = 1, N + TEMP = ZERO + IX = KX + K = KUP1 - J + IF( NOCONJ )THEN + DO 220, I = MAX( 1, J - KU ), MIN( M, J + KL ) + TEMP = TEMP + A( K + I, J )*CONJG(X( IX )) + IX = IX + INCX + 220 CONTINUE + ELSE + DO 230, I = MAX( 1, J - KU ), MIN( M, J + KL ) + TEMP = TEMP + CONJG( A( K + I, J ) )*CONJG(X(IX )) + IX = IX + INCX + 230 CONTINUE + END IF + Y( JY ) = Y( JY ) + ALPHA*TEMP + JY = JY + INCY + IF( J.GT.KU ) + $ KX = KX + INCX + 240 CONTINUE + END IF + END IF + + END IF + +* + RETURN +* +* End of ZGBMV . +* + END diff --git a/reference/cgemm3mf.f b/reference/cgemm3mf.f new file mode 100644 index 0000000000..a144aa2b3b --- /dev/null +++ b/reference/cgemm3mf.f @@ -0,0 +1,414 @@ + SUBROUTINE CGEMM3MF(TRA,TRB,M,N,K,ALPHA,A,LDA,B,LDB,BETA,C,LDC) +* .. Scalar Arguments .. + COMPLEX ALPHA,BETA + INTEGER K,LDA,LDB,LDC,M,N + CHARACTER TRA,TRB +* .. +* .. Array Arguments .. + COMPLEX A(LDA,*),B(LDB,*),C(LDC,*) +* .. +* +* Purpose +* ======= +* +* CGEMM performs one of the matrix-matrix operations +* +* C := alpha*op( A )*op( B ) + beta*C, +* +* where op( X ) is one of +* +* op( X ) = X or op( X ) = X' or op( X ) = conjg( X' ), +* +* alpha and beta are scalars, and A, B and C are matrices, with op( A ) +* an m by k matrix, op( B ) a k by n matrix and C an m by n matrix. +* +* Arguments +* ========== +* +* TRA - CHARACTER*1. +* On entry, TRA specifies the form of op( A ) to be used in +* the matrix multiplication as follows: +* +* TRA = 'N' or 'n', op( A ) = A. +* +* TRA = 'T' or 't', op( A ) = A'. +* +* TRA = 'C' or 'c', op( A ) = conjg( A' ). +* +* Unchanged on exit. +* +* TRB - CHARACTER*1. +* On entry, TRB specifies the form of op( B ) to be used in +* the matrix multiplication as follows: +* +* TRB = 'N' or 'n', op( B ) = B. +* +* TRB = 'T' or 't', op( B ) = B'. +* +* TRB = 'C' or 'c', op( B ) = conjg( B' ). +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix +* op( A ) and of the matrix C. M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix +* op( B ) and the number of columns of the matrix C. N must be +* at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry, K specifies the number of columns of the matrix +* op( A ) and the number of rows of the matrix op( B ). K must +* be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX array of DIMENSION ( LDA, ka ), where ka is +* k when TRA = 'N' or 'n', and is m otherwise. +* Before entry with TRA = 'N' or 'n', the leading m by k +* part of the array A must contain the matrix A, otherwise +* the leading k by m part of the array A must contain the +* matrix A. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When TRA = 'N' or 'n' then +* LDA must be at least max( 1, m ), otherwise LDA must be at +* least max( 1, k ). +* Unchanged on exit. 
+* +* B - COMPLEX array of DIMENSION ( LDB, kb ), where kb is +* n when TRB = 'N' or 'n', and is k otherwise. +* Before entry with TRB = 'N' or 'n', the leading k by n +* part of the array B must contain the matrix B, otherwise +* the leading n by k part of the array B must contain the +* matrix B. +* Unchanged on exit. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. When TRB = 'N' or 'n' then +* LDB must be at least max( 1, k ), otherwise LDB must be at +* least max( 1, n ). +* Unchanged on exit. +* +* BETA - COMPLEX . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then C need not be set on input. +* Unchanged on exit. +* +* C - COMPLEX array of DIMENSION ( LDC, n ). +* Before entry, the leading m by n part of the array C must +* contain the matrix C, except when beta is zero, in which +* case C need not be set on entry. +* On exit, the array C is overwritten by the m by n matrix +* ( alpha*op( A )*op( B ) + beta*C ). +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC CONJG,MAX +* .. +* .. Local Scalars .. + COMPLEX TEMP + INTEGER I,INFO,J,L,NCOLA,NROWA,NROWB + LOGICAL CONJA,CONJB,NOTA,NOTB +* .. +* .. Parameters .. + COMPLEX ONE + PARAMETER (ONE= (1.0E+0,0.0E+0)) + COMPLEX ZERO + PARAMETER (ZERO= (0.0E+0,0.0E+0)) +* .. +* +* Set NOTA and NOTB as true if A and B respectively are not +* conjugated or transposed, set CONJA and CONJB as true if A and +* B respectively are to be transposed but not conjugated and set +* NROWA, NCOLA and NROWB as the number of rows and columns of A +* and the number of rows of B respectively. +* + NOTA = LSAME(TRA,'N') + NOTB = LSAME(TRB,'N') + CONJA = LSAME(TRA,'C') + CONJB = LSAME(TRB,'C') + IF (NOTA) THEN + NROWA = M + NCOLA = K + ELSE + NROWA = K + NCOLA = M + END IF + IF (NOTB) THEN + NROWB = K + ELSE + NROWB = N + END IF +* +* Test the input parameters. +* + INFO = 0 + IF ((.NOT.NOTA) .AND. (.NOT.CONJA) .AND. + + (.NOT.LSAME(TRA,'T'))) THEN + INFO = 1 + ELSE IF ((.NOT.NOTB) .AND. (.NOT.CONJB) .AND. + + (.NOT.LSAME(TRB,'T'))) THEN + INFO = 2 + ELSE IF (M.LT.0) THEN + INFO = 3 + ELSE IF (N.LT.0) THEN + INFO = 4 + ELSE IF (K.LT.0) THEN + INFO = 5 + ELSE IF (LDA.LT.MAX(1,NROWA)) THEN + INFO = 8 + ELSE IF (LDB.LT.MAX(1,NROWB)) THEN + INFO = 10 + ELSE IF (LDC.LT.MAX(1,M)) THEN + INFO = 13 + END IF + IF (INFO.NE.0) THEN + CALL XERBLA('CGEMM ',INFO) + RETURN + END IF +* +* Quick return if possible. +* + IF ((M.EQ.0) .OR. (N.EQ.0) .OR. + + (((ALPHA.EQ.ZERO).OR. (K.EQ.0)).AND. (BETA.EQ.ONE))) RETURN +* +* And when alpha.eq.zero. +* + IF (ALPHA.EQ.ZERO) THEN + IF (BETA.EQ.ZERO) THEN + DO 20 J = 1,N + DO 10 I = 1,M + C(I,J) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40 J = 1,N + DO 30 I = 1,M + C(I,J) = BETA*C(I,J) + 30 CONTINUE + 40 CONTINUE + END IF + RETURN + END IF +* +* Start the operations. +* + IF (NOTB) THEN + IF (NOTA) THEN +* +* Form C := alpha*A*B + beta*C. 
+* + DO 90 J = 1,N + IF (BETA.EQ.ZERO) THEN + DO 50 I = 1,M + C(I,J) = ZERO + 50 CONTINUE + ELSE IF (BETA.NE.ONE) THEN + DO 60 I = 1,M + C(I,J) = BETA*C(I,J) + 60 CONTINUE + END IF + DO 80 L = 1,K + IF (B(L,J).NE.ZERO) THEN + TEMP = ALPHA*B(L,J) + DO 70 I = 1,M + C(I,J) = C(I,J) + TEMP*A(I,L) + 70 CONTINUE + END IF + 80 CONTINUE + 90 CONTINUE + ELSE IF (CONJA) THEN +* +* Form C := alpha*conjg( A' )*B + beta*C. +* + DO 120 J = 1,N + DO 110 I = 1,M + TEMP = ZERO + DO 100 L = 1,K + TEMP = TEMP + CONJG(A(L,I))*B(L,J) + 100 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 110 CONTINUE + 120 CONTINUE + ELSE +* +* Form C := alpha*A'*B + beta*C +* + DO 150 J = 1,N + DO 140 I = 1,M + TEMP = ZERO + DO 130 L = 1,K + TEMP = TEMP + A(L,I)*B(L,J) + 130 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 140 CONTINUE + 150 CONTINUE + END IF + ELSE IF (NOTA) THEN + IF (CONJB) THEN +* +* Form C := alpha*A*conjg( B' ) + beta*C. +* + DO 200 J = 1,N + IF (BETA.EQ.ZERO) THEN + DO 160 I = 1,M + C(I,J) = ZERO + 160 CONTINUE + ELSE IF (BETA.NE.ONE) THEN + DO 170 I = 1,M + C(I,J) = BETA*C(I,J) + 170 CONTINUE + END IF + DO 190 L = 1,K + IF (B(J,L).NE.ZERO) THEN + TEMP = ALPHA*CONJG(B(J,L)) + DO 180 I = 1,M + C(I,J) = C(I,J) + TEMP*A(I,L) + 180 CONTINUE + END IF + 190 CONTINUE + 200 CONTINUE + ELSE +* +* Form C := alpha*A*B' + beta*C +* + DO 250 J = 1,N + IF (BETA.EQ.ZERO) THEN + DO 210 I = 1,M + C(I,J) = ZERO + 210 CONTINUE + ELSE IF (BETA.NE.ONE) THEN + DO 220 I = 1,M + C(I,J) = BETA*C(I,J) + 220 CONTINUE + END IF + DO 240 L = 1,K + IF (B(J,L).NE.ZERO) THEN + TEMP = ALPHA*B(J,L) + DO 230 I = 1,M + C(I,J) = C(I,J) + TEMP*A(I,L) + 230 CONTINUE + END IF + 240 CONTINUE + 250 CONTINUE + END IF + ELSE IF (CONJA) THEN + IF (CONJB) THEN +* +* Form C := alpha*conjg( A' )*conjg( B' ) + beta*C. +* + DO 280 J = 1,N + DO 270 I = 1,M + TEMP = ZERO + DO 260 L = 1,K + TEMP = TEMP + CONJG(A(L,I))*CONJG(B(J,L)) + 260 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 270 CONTINUE + 280 CONTINUE + ELSE +* +* Form C := alpha*conjg( A' )*B' + beta*C +* + DO 310 J = 1,N + DO 300 I = 1,M + TEMP = ZERO + DO 290 L = 1,K + TEMP = TEMP + CONJG(A(L,I))*B(J,L) + 290 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 300 CONTINUE + 310 CONTINUE + END IF + ELSE + IF (CONJB) THEN +* +* Form C := alpha*A'*conjg( B' ) + beta*C +* + DO 340 J = 1,N + DO 330 I = 1,M + TEMP = ZERO + DO 320 L = 1,K + TEMP = TEMP + A(L,I)*CONJG(B(J,L)) + 320 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 330 CONTINUE + 340 CONTINUE + ELSE +* +* Form C := alpha*A'*B' + beta*C +* + DO 370 J = 1,N + DO 360 I = 1,M + TEMP = ZERO + DO 350 L = 1,K + TEMP = TEMP + A(L,I)*B(J,L) + 350 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 360 CONTINUE + 370 CONTINUE + END IF + END IF +* + RETURN +* +* End of CGEMM . +* + END diff --git a/reference/cgemmf.f b/reference/cgemmf.f new file mode 100644 index 0000000000..d554fd3e73 --- /dev/null +++ b/reference/cgemmf.f @@ -0,0 +1,414 @@ + SUBROUTINE CGEMMF(TRANA,TRANB,M,N,K,ALPHA,A,LDA,B,LDB,BETA,C,LDC) +* .. Scalar Arguments .. + COMPLEX ALPHA,BETA + INTEGER K,LDA,LDB,LDC,M,N + CHARACTER TRANA,TRANB +* .. +* .. Array Arguments .. + COMPLEX A(LDA,*),B(LDB,*),C(LDC,*) +* .. 
+* +* Purpose +* ======= +* +* CGEMM performs one of the matrix-matrix operations +* +* C := alpha*op( A )*op( B ) + beta*C, +* +* where op( X ) is one of +* +* op( X ) = X or op( X ) = X' or op( X ) = conjg( X' ), +* +* alpha and beta are scalars, and A, B and C are matrices, with op( A ) +* an m by k matrix, op( B ) a k by n matrix and C an m by n matrix. +* +* Arguments +* ========== +* +* TRANA - CHARACTER*1. +* On entry, TRANA specifies the form of op( A ) to be used in +* the matrix multiplication as follows: +* +* TRANA = 'N' or 'n', op( A ) = A. +* +* TRANA = 'T' or 't', op( A ) = A'. +* +* TRANA = 'C' or 'c', op( A ) = conjg( A' ). +* +* Unchanged on exit. +* +* TRANB - CHARACTER*1. +* On entry, TRANB specifies the form of op( B ) to be used in +* the matrix multiplication as follows: +* +* TRANB = 'N' or 'n', op( B ) = B. +* +* TRANB = 'T' or 't', op( B ) = B'. +* +* TRANB = 'C' or 'c', op( B ) = conjg( B' ). +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix +* op( A ) and of the matrix C. M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix +* op( B ) and the number of columns of the matrix C. N must be +* at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry, K specifies the number of columns of the matrix +* op( A ) and the number of rows of the matrix op( B ). K must +* be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX array of DIMENSION ( LDA, ka ), where ka is +* k when TRANA = 'N' or 'n', and is m otherwise. +* Before entry with TRANA = 'N' or 'n', the leading m by k +* part of the array A must contain the matrix A, otherwise +* the leading k by m part of the array A must contain the +* matrix A. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When TRANA = 'N' or 'n' then +* LDA must be at least max( 1, m ), otherwise LDA must be at +* least max( 1, k ). +* Unchanged on exit. +* +* B - COMPLEX array of DIMENSION ( LDB, kb ), where kb is +* n when TRANB = 'N' or 'n', and is k otherwise. +* Before entry with TRANB = 'N' or 'n', the leading k by n +* part of the array B must contain the matrix B, otherwise +* the leading n by k part of the array B must contain the +* matrix B. +* Unchanged on exit. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. When TRANB = 'N' or 'n' then +* LDB must be at least max( 1, k ), otherwise LDB must be at +* least max( 1, n ). +* Unchanged on exit. +* +* BETA - COMPLEX . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then C need not be set on input. +* Unchanged on exit. +* +* C - COMPLEX array of DIMENSION ( LDC, n ). +* Before entry, the leading m by n part of the array C must +* contain the matrix C, except when beta is zero, in which +* case C need not be set on entry. +* On exit, the array C is overwritten by the m by n matrix +* ( alpha*op( A )*op( B ) + beta*C ). +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. 
+* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC CONJG,MAX +* .. +* .. Local Scalars .. + COMPLEX TEMP + INTEGER I,INFO,J,L,NCOLA,NROWA,NROWB + LOGICAL CONJA,CONJB,NOTA,NOTB +* .. +* .. Parameters .. + COMPLEX ONE + PARAMETER (ONE= (1.0E+0,0.0E+0)) + COMPLEX ZERO + PARAMETER (ZERO= (0.0E+0,0.0E+0)) +* .. +* +* Set NOTA and NOTB as true if A and B respectively are not +* conjugated or transposed, set CONJA and CONJB as true if A and +* B respectively are to be transposed but not conjugated and set +* NROWA, NCOLA and NROWB as the number of rows and columns of A +* and the number of rows of B respectively. +* + NOTA = LSAME(TRANA,'N') + NOTB = LSAME(TRANB,'N') + CONJA = LSAME(TRANA,'C') + CONJB = LSAME(TRANB,'C') + IF (NOTA) THEN + NROWA = M + NCOLA = K + ELSE + NROWA = K + NCOLA = M + END IF + IF (NOTB) THEN + NROWB = K + ELSE + NROWB = N + END IF +* +* Test the input parameters. +* + INFO = 0 + IF ((.NOT.NOTA) .AND. (.NOT.CONJA) .AND. + + (.NOT.LSAME(TRANA,'T'))) THEN + INFO = 1 + ELSE IF ((.NOT.NOTB) .AND. (.NOT.CONJB) .AND. + + (.NOT.LSAME(TRANB,'T'))) THEN + INFO = 2 + ELSE IF (M.LT.0) THEN + INFO = 3 + ELSE IF (N.LT.0) THEN + INFO = 4 + ELSE IF (K.LT.0) THEN + INFO = 5 + ELSE IF (LDA.LT.MAX(1,NROWA)) THEN + INFO = 8 + ELSE IF (LDB.LT.MAX(1,NROWB)) THEN + INFO = 10 + ELSE IF (LDC.LT.MAX(1,M)) THEN + INFO = 13 + END IF + IF (INFO.NE.0) THEN + CALL XERBLA('CGEMM ',INFO) + RETURN + END IF +* +* Quick return if possible. +* + IF ((M.EQ.0) .OR. (N.EQ.0) .OR. + + (((ALPHA.EQ.ZERO).OR. (K.EQ.0)).AND. (BETA.EQ.ONE))) RETURN +* +* And when alpha.eq.zero. +* + IF (ALPHA.EQ.ZERO) THEN + IF (BETA.EQ.ZERO) THEN + DO 20 J = 1,N + DO 10 I = 1,M + C(I,J) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40 J = 1,N + DO 30 I = 1,M + C(I,J) = BETA*C(I,J) + 30 CONTINUE + 40 CONTINUE + END IF + RETURN + END IF +* +* Start the operations. +* + IF (NOTB) THEN + IF (NOTA) THEN +* +* Form C := alpha*A*B + beta*C. +* + DO 90 J = 1,N + IF (BETA.EQ.ZERO) THEN + DO 50 I = 1,M + C(I,J) = ZERO + 50 CONTINUE + ELSE IF (BETA.NE.ONE) THEN + DO 60 I = 1,M + C(I,J) = BETA*C(I,J) + 60 CONTINUE + END IF + DO 80 L = 1,K + IF (B(L,J).NE.ZERO) THEN + TEMP = ALPHA*B(L,J) + DO 70 I = 1,M + C(I,J) = C(I,J) + TEMP*A(I,L) + 70 CONTINUE + END IF + 80 CONTINUE + 90 CONTINUE + ELSE IF (CONJA) THEN +* +* Form C := alpha*conjg( A' )*B + beta*C. +* + DO 120 J = 1,N + DO 110 I = 1,M + TEMP = ZERO + DO 100 L = 1,K + TEMP = TEMP + CONJG(A(L,I))*B(L,J) + 100 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 110 CONTINUE + 120 CONTINUE + ELSE +* +* Form C := alpha*A'*B + beta*C +* + DO 150 J = 1,N + DO 140 I = 1,M + TEMP = ZERO + DO 130 L = 1,K + TEMP = TEMP + A(L,I)*B(L,J) + 130 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 140 CONTINUE + 150 CONTINUE + END IF + ELSE IF (NOTA) THEN + IF (CONJB) THEN +* +* Form C := alpha*A*conjg( B' ) + beta*C. 
+* + DO 200 J = 1,N + IF (BETA.EQ.ZERO) THEN + DO 160 I = 1,M + C(I,J) = ZERO + 160 CONTINUE + ELSE IF (BETA.NE.ONE) THEN + DO 170 I = 1,M + C(I,J) = BETA*C(I,J) + 170 CONTINUE + END IF + DO 190 L = 1,K + IF (B(J,L).NE.ZERO) THEN + TEMP = ALPHA*CONJG(B(J,L)) + DO 180 I = 1,M + C(I,J) = C(I,J) + TEMP*A(I,L) + 180 CONTINUE + END IF + 190 CONTINUE + 200 CONTINUE + ELSE +* +* Form C := alpha*A*B' + beta*C +* + DO 250 J = 1,N + IF (BETA.EQ.ZERO) THEN + DO 210 I = 1,M + C(I,J) = ZERO + 210 CONTINUE + ELSE IF (BETA.NE.ONE) THEN + DO 220 I = 1,M + C(I,J) = BETA*C(I,J) + 220 CONTINUE + END IF + DO 240 L = 1,K + IF (B(J,L).NE.ZERO) THEN + TEMP = ALPHA*B(J,L) + DO 230 I = 1,M + C(I,J) = C(I,J) + TEMP*A(I,L) + 230 CONTINUE + END IF + 240 CONTINUE + 250 CONTINUE + END IF + ELSE IF (CONJA) THEN + IF (CONJB) THEN +* +* Form C := alpha*conjg( A' )*conjg( B' ) + beta*C. +* + DO 280 J = 1,N + DO 270 I = 1,M + TEMP = ZERO + DO 260 L = 1,K + TEMP = TEMP + CONJG(A(L,I))*CONJG(B(J,L)) + 260 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 270 CONTINUE + 280 CONTINUE + ELSE +* +* Form C := alpha*conjg( A' )*B' + beta*C +* + DO 310 J = 1,N + DO 300 I = 1,M + TEMP = ZERO + DO 290 L = 1,K + TEMP = TEMP + CONJG(A(L,I))*B(J,L) + 290 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 300 CONTINUE + 310 CONTINUE + END IF + ELSE + IF (CONJB) THEN +* +* Form C := alpha*A'*conjg( B' ) + beta*C +* + DO 340 J = 1,N + DO 330 I = 1,M + TEMP = ZERO + DO 320 L = 1,K + TEMP = TEMP + A(L,I)*CONJG(B(J,L)) + 320 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 330 CONTINUE + 340 CONTINUE + ELSE +* +* Form C := alpha*A'*B' + beta*C +* + DO 370 J = 1,N + DO 360 I = 1,M + TEMP = ZERO + DO 350 L = 1,K + TEMP = TEMP + A(L,I)*B(J,L) + 350 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 360 CONTINUE + 370 CONTINUE + END IF + END IF +* + RETURN +* +* End of CGEMM . +* + END diff --git a/reference/cgemvf.f b/reference/cgemvf.f new file mode 100644 index 0000000000..d3a1d9e7ce --- /dev/null +++ b/reference/cgemvf.f @@ -0,0 +1,332 @@ + SUBROUTINE CGEMVF ( TRANS, M, N, ALPHA, A, LDA, X, INCX, + $ BETA, Y, INCY ) +* .. Scalar Arguments .. + COMPLEX ALPHA, BETA + INTEGER INCX, INCY, LDA, M, N + CHARACTER*1 TRANS +* .. Array Arguments .. + COMPLEX A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* CGEMV performs one of the matrix-vector operations +* +* y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y, or +* +* y := alpha*conjg( A' )*x + beta*y, +* +* where alpha and beta are scalars, x and y are vectors and A is an +* m by n matrix. +* +* Parameters +* ========== +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' y := alpha*A*x + beta*y. +* +* TRANS = 'T' or 't' y := alpha*A'*x + beta*y. +* +* TRANS = 'C' or 'c' y := alpha*conjg( A' )*x + beta*y. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix A. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX array of DIMENSION ( LDA, n ). 
+* Before entry, the leading m by n part of the array A must +* contain the matrix of coefficients. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, m ). +* Unchanged on exit. +* +* X - COMPLEX array of DIMENSION at least +* ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' +* and at least +* ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. +* Before entry, the incremented array X must contain the +* vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA - COMPLEX . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then Y need not be set on input. +* Unchanged on exit. +* +* Y - COMPLEX array of DIMENSION at least +* ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' +* and at least +* ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. +* Before entry with BETA non-zero, the incremented array Y +* must contain the vector y. On exit, Y is overwritten by the +* updated vector y. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX ONE + PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. Local Scalars .. + COMPLEX TEMP + INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY, LENX, LENY + LOGICAL NOCONJ, NOTRANS, XCONJ +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC CONJG, MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. + $ .NOT.LSAME( TRANS, 'R' ).AND. + $ .NOT.LSAME( TRANS, 'C' ).AND. + $ .NOT.LSAME( TRANS, 'O' ).AND. + $ .NOT.LSAME( TRANS, 'U' ).AND. + $ .NOT.LSAME( TRANS, 'S' ).AND. + $ .NOT.LSAME( TRANS, 'D' ) )THEN + INFO = 1 + ELSE IF( M.LT.0 )THEN + INFO = 2 + ELSE IF( N.LT.0 )THEN + INFO = 3 + ELSE IF( LDA.LT.MAX( 1, M ) )THEN + INFO = 6 + ELSE IF( INCX.EQ.0 )THEN + INFO = 8 + ELSE IF( INCY.EQ.0 )THEN + INFO = 11 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'CGEMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. + $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* + NOCONJ = (LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) + $ .OR. LSAME( TRANS, 'O' ) .OR. LSAME( TRANS, 'U' )) + + NOTRANS = (LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'R' ) + $ .OR. LSAME( TRANS, 'O' ) .OR. LSAME( TRANS, 'S' )) + + XCONJ = (LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) + $ .OR. LSAME( TRANS, 'R' ) .OR. LSAME( TRANS, 'C' )) +* +* Set LENX and LENY, the lengths of the vectors x and y, and set +* up the start points in X and Y. +* + IF(NOTRANS)THEN + LENX = N + LENY = M + ELSE + LENX = M + LENY = N + END IF + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( LENX - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( LENY - 1 )*INCY + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through A. +* +* First form y := beta*y. 
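+*
+*        The scaling is done in place, with separate branches for
+*        unit and non-unit INCY; when BETA is zero the elements of
+*        Y are set to zero explicitly, so Y need not be defined on
+*        entry in that case.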
+* + IF( BETA.NE.ONE )THEN + IF( INCY.EQ.1 )THEN + IF( BETA.EQ.ZERO )THEN + DO 10, I = 1, LENY + Y( I ) = ZERO + 10 CONTINUE + ELSE + DO 20, I = 1, LENY + Y( I ) = BETA*Y( I ) + 20 CONTINUE + END IF + ELSE + IY = KY + IF( BETA.EQ.ZERO )THEN + DO 30, I = 1, LENY + Y( IY ) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40, I = 1, LENY + Y( IY ) = BETA*Y( IY ) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF( ALPHA.EQ.ZERO ) + $ RETURN + + IF(NOTRANS)THEN +* +* Form y := alpha*A*x + y. +* + JX = KX + IF( INCY.EQ.1 )THEN + DO 60, J = 1, N + IF( X( JX ).NE.ZERO )THEN + IF (XCONJ) THEN + TEMP = ALPHA*X( JX ) + ELSE + TEMP = ALPHA*CONJG(X( JX )) + ENDIF + IF (NOCONJ) THEN + DO 50, I = 1, M + Y( I ) = Y( I ) + TEMP*A( I, J ) + 50 CONTINUE + ELSE + DO 55, I = 1, M + Y( I ) = Y( I ) + TEMP*CONJG(A( I, J )) + 55 CONTINUE + ENDIF + END IF + JX = JX + INCX + 60 CONTINUE + ELSE + DO 80, J = 1, N + IF( X( JX ).NE.ZERO )THEN + IF (XCONJ) THEN + TEMP = ALPHA*X( JX ) + ELSE + TEMP = ALPHA*CONJG(X( JX )) + ENDIF + IY = KY + IF (NOCONJ) THEN + DO 70, I = 1, M + Y( IY ) = Y( IY ) + TEMP*A( I, J ) + IY = IY + INCY + 70 CONTINUE + ELSE + DO 75, I = 1, M + Y( IY ) = Y( IY ) + TEMP* CONJG(A( I, J )) + IY = IY + INCY + 75 CONTINUE + ENDIF + END IF + JX = JX + INCX + 80 CONTINUE + END IF + ELSE +* +* Form y := alpha*A'*x + y or y := alpha*conjg( A' )*x + y. +* + JY = KY + IF( INCX.EQ.1 )THEN + DO 110, J = 1, N + TEMP = ZERO + IF( NOCONJ )THEN + DO 90, I = 1, M + IF (XCONJ) THEN + TEMP = TEMP + A( I, J )*X( I ) + ELSE + TEMP = TEMP + A( I, J )*CONJG(X( I )) + ENDIF + 90 CONTINUE + ELSE + DO 100, I = 1, M + IF (XCONJ) THEN + TEMP = TEMP + CONJG( A( I, J ) )*X( I ) + ELSE + TEMP = TEMP + CONJG( A( I, J ) )*CONJG(X( I )) + ENDIF + 100 CONTINUE + END IF + Y( JY ) = Y( JY ) + ALPHA*TEMP + JY = JY + INCY + 110 CONTINUE + ELSE + DO 140, J = 1, N + TEMP = ZERO + IX = KX + IF( NOCONJ )THEN + DO 120, I = 1, M + IF (XCONJ) THEN + TEMP = TEMP + A( I, J )*X( IX ) + ELSE + TEMP = TEMP + A( I, J )*CONJG(X( IX )) + ENDIF + IX = IX + INCX + 120 CONTINUE + ELSE + DO 130, I = 1, M + IF (XCONJ) THEN + TEMP = TEMP + CONJG( A( I, J ) )*X( IX ) + ELSE + TEMP = TEMP + CONJG( A( I, J ) )*CONJG(X( IX )) + ENDIF + IX = IX + INCX + 130 CONTINUE + END IF + Y( JY ) = Y( JY ) + ALPHA*TEMP + JY = JY + INCY + 140 CONTINUE + END IF + END IF +* + RETURN +* +* End of CGEMV . +* + END + diff --git a/reference/cgercf.f b/reference/cgercf.f new file mode 100644 index 0000000000..9b4b41bbda --- /dev/null +++ b/reference/cgercf.f @@ -0,0 +1,157 @@ + SUBROUTINE CGERCF ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA ) +* .. Scalar Arguments .. + COMPLEX ALPHA + INTEGER INCX, INCY, LDA, M, N +* .. Array Arguments .. + COMPLEX A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* CGERC performs the rank 1 operation +* +* A := alpha*x*conjg( y' ) + A, +* +* where alpha is a scalar, x is an m element vector, y is an n element +* vector and A is an m by n matrix. +* +* Parameters +* ========== +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix A. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X - COMPLEX array of dimension at least +* ( 1 + ( m - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the m +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. 
+* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* Y - COMPLEX array of dimension at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. +* Unchanged on exit. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* A - COMPLEX array of DIMENSION ( LDA, n ). +* Before entry, the leading m by n part of the array A must +* contain the matrix of coefficients. On exit, A is +* overwritten by the updated matrix. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. Local Scalars .. + COMPLEX TEMP + INTEGER I, INFO, IX, J, JY, KX +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC CONJG, MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( M.LT.0 )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 5 + ELSE IF( INCY.EQ.0 )THEN + INFO = 7 + ELSE IF( LDA.LT.MAX( 1, M ) )THEN + INFO = 9 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'CGERC ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) + $ RETURN +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through A. +* + IF( INCY.GT.0 )THEN + JY = 1 + ELSE + JY = 1 - ( N - 1 )*INCY + END IF + IF( INCX.EQ.1 )THEN + DO 20, J = 1, N + IF( Y( JY ).NE.ZERO )THEN + TEMP = ALPHA*CONJG( Y( JY ) ) + DO 10, I = 1, M + A( I, J ) = A( I, J ) + X( I )*TEMP + 10 CONTINUE + END IF + JY = JY + INCY + 20 CONTINUE + ELSE + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( M - 1 )*INCX + END IF + DO 40, J = 1, N + IF( Y( JY ).NE.ZERO )THEN + TEMP = ALPHA*CONJG( Y( JY ) ) + IX = KX + DO 30, I = 1, M + A( I, J ) = A( I, J ) + X( IX )*TEMP + IX = IX + INCX + 30 CONTINUE + END IF + JY = JY + INCY + 40 CONTINUE + END IF +* + RETURN +* +* End of CGERC . +* + END diff --git a/reference/cgeruf.f b/reference/cgeruf.f new file mode 100644 index 0000000000..72e6969ed7 --- /dev/null +++ b/reference/cgeruf.f @@ -0,0 +1,157 @@ + SUBROUTINE CGERUF ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA ) +* .. Scalar Arguments .. + COMPLEX ALPHA + INTEGER INCX, INCY, LDA, M, N +* .. Array Arguments .. + COMPLEX A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* CGERU performs the rank 1 operation +* +* A := alpha*x*y' + A, +* +* where alpha is a scalar, x is an m element vector, y is an n element +* vector and A is an m by n matrix. +* +* Parameters +* ========== +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix A. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X - COMPLEX array of dimension at least +* ( 1 + ( m - 1 )*abs( INCX ) ). 
+* Before entry, the incremented array X must contain the m +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* Y - COMPLEX array of dimension at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. +* Unchanged on exit. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* A - COMPLEX array of DIMENSION ( LDA, n ). +* Before entry, the leading m by n part of the array A must +* contain the matrix of coefficients. On exit, A is +* overwritten by the updated matrix. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. Local Scalars .. + COMPLEX TEMP + INTEGER I, INFO, IX, J, JY, KX +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( M.LT.0 )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 5 + ELSE IF( INCY.EQ.0 )THEN + INFO = 7 + ELSE IF( LDA.LT.MAX( 1, M ) )THEN + INFO = 9 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'CGERU ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) + $ RETURN +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through A. +* + IF( INCY.GT.0 )THEN + JY = 1 + ELSE + JY = 1 - ( N - 1 )*INCY + END IF + IF( INCX.EQ.1 )THEN + DO 20, J = 1, N + IF( Y( JY ).NE.ZERO )THEN + TEMP = ALPHA*Y( JY ) + DO 10, I = 1, M + A( I, J ) = A( I, J ) + X( I )*TEMP + 10 CONTINUE + END IF + JY = JY + INCY + 20 CONTINUE + ELSE + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( M - 1 )*INCX + END IF + DO 40, J = 1, N + IF( Y( JY ).NE.ZERO )THEN + TEMP = ALPHA*Y( JY ) + IX = KX + DO 30, I = 1, M + A( I, J ) = A( I, J ) + X( IX )*TEMP + IX = IX + INCX + 30 CONTINUE + END IF + JY = JY + INCY + 40 CONTINUE + END IF +* + RETURN +* +* End of CGERU . +* + END diff --git a/reference/cgesvf.f b/reference/cgesvf.f new file mode 100644 index 0000000000..6544059347 --- /dev/null +++ b/reference/cgesvf.f @@ -0,0 +1,107 @@ + SUBROUTINE CGESVF( N, NRHS, A, LDA, IPIV, B, LDB, INFO ) +* +* -- LAPACK driver routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* November 2006 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LDB, N, NRHS +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + COMPLEX A( LDA, * ), B( LDB, * ) +* .. +* +* Purpose +* ======= +* +* CGESV computes the solution to a complex system of linear equations +* A * X = B, +* where A is an N-by-N matrix and X and B are N-by-NRHS matrices. +* +* The LU decomposition with partial pivoting and row interchanges is +* used to factor A as +* A = P * L * U, +* where P is a permutation matrix, L is unit lower triangular, and U is +* upper triangular. 
The factored form of A is then used to solve the +* system of equations A * X = B. +* +* Arguments +* ========= +* +* N (input) INTEGER +* The number of linear equations, i.e., the order of the +* matrix A. N >= 0. +* +* NRHS (input) INTEGER +* The number of right hand sides, i.e., the number of columns +* of the matrix B. NRHS >= 0. +* +* A (input/output) COMPLEX array, dimension (LDA,N) +* On entry, the N-by-N coefficient matrix A. +* On exit, the factors L and U from the factorization +* A = P*L*U; the unit diagonal elements of L are not stored. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* IPIV (output) INTEGER array, dimension (N) +* The pivot indices that define the permutation matrix P; +* row i of the matrix was interchanged with row IPIV(i). +* +* B (input/output) COMPLEX array, dimension (LDB,NRHS) +* On entry, the N-by-NRHS matrix of right hand side matrix B. +* On exit, if INFO = 0, the N-by-NRHS solution matrix X. +* +* LDB (input) INTEGER +* The leading dimension of the array B. LDB >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, U(i,i) is exactly zero. The factorization +* has been completed, but the factor U is exactly +* singular, so the solution could not be computed. +* +* ===================================================================== +* +* .. External Subroutines .. + EXTERNAL CGETRF, CGETRS, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( N.LT.0 ) THEN + INFO = -1 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + ELSE IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -7 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CGESV ', -INFO ) + RETURN + END IF +* +* Compute the LU factorization of A. +* + CALL CGETRF( N, N, A, LDA, IPIV, INFO ) + IF( INFO.EQ.0 ) THEN +* +* Solve the system A*X = B, overwriting B with X. +* + CALL CGETRS( 'No transpose', N, NRHS, A, LDA, IPIV, B, LDB, + $ INFO ) + END IF + RETURN +* +* End of CGESV +* + END diff --git a/reference/cgetf2f.f b/reference/cgetf2f.f new file mode 100644 index 0000000000..f406750284 --- /dev/null +++ b/reference/cgetf2f.f @@ -0,0 +1,136 @@ + SUBROUTINE CGETF2F( M, N, A, LDA, IPIV, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* September 30, 1994 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, M, N +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + COMPLEX A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* CGETF2 computes an LU factorization of a general m-by-n matrix A +* using partial pivoting with row interchanges. +* +* The factorization has the form +* A = P * L * U +* where P is a permutation matrix, L is lower triangular with unit +* diagonal elements (lower trapezoidal if m > n), and U is upper +* triangular (upper trapezoidal if m < n). +* +* This is the right-looking Level 2 BLAS version of the algorithm. +* +* Arguments +* ========= +* +* M (input) INTEGER +* The number of rows of the matrix A. M >= 0. +* +* N (input) INTEGER +* The number of columns of the matrix A. N >= 0. +* +* A (input/output) COMPLEX array, dimension (LDA,N) +* On entry, the m by n matrix to be factored. +* On exit, the factors L and U from the factorization +* A = P*L*U; the unit diagonal elements of L are not stored. 
+* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,M). +* +* IPIV (output) INTEGER array, dimension (min(M,N)) +* The pivot indices; for 1 <= i <= min(M,N), row i of the +* matrix was interchanged with row IPIV(i). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -k, the k-th argument had an illegal value +* > 0: if INFO = k, U(k,k) is exactly zero. The factorization +* has been completed, but the factor U is exactly +* singular, and division by zero will occur if it is used +* to solve a system of equations. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX ONE, ZERO + PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ), + $ ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + INTEGER J, JP +* .. +* .. External Functions .. + INTEGER ICAMAX + EXTERNAL ICAMAX +* .. +* .. External Subroutines .. + EXTERNAL CGERU, CSCAL, CSWAP, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CGETF2', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* + DO 10 J = 1, MIN( M, N ) +* +* Find pivot and test for singularity. +* + JP = J - 1 + ICAMAX( M-J+1, A( J, J ), 1 ) + IPIV( J ) = JP + IF( A( JP, J ).NE.ZERO ) THEN +* +* Apply the interchange to columns 1:N. +* + IF( JP.NE.J ) + $ CALL CSWAP( N, A( J, 1 ), LDA, A( JP, 1 ), LDA ) +* +* Compute elements J+1:M of J-th column. +* + IF( J.LT.M ) + $ CALL CSCAL( M-J, ONE / A( J, J ), A( J+1, J ), 1 ) +* + ELSE IF( INFO.EQ.0 ) THEN +* + INFO = J + END IF +* + IF( J.LT.MIN( M, N ) ) THEN +* +* Update trailing submatrix. +* + CALL CGERU( M-J, N-J, -ONE, A( J+1, J ), 1, A( J, J+1 ), + $ LDA, A( J+1, J+1 ), LDA ) + END IF + 10 CONTINUE + RETURN +* +* End of CGETF2 +* + END diff --git a/reference/cgetrff.f b/reference/cgetrff.f new file mode 100644 index 0000000000..2935c5d06a --- /dev/null +++ b/reference/cgetrff.f @@ -0,0 +1,156 @@ + SUBROUTINE CGETRFF( M, N, A, LDA, IPIV, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* September 30, 1994 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, M, N +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + COMPLEX A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* CGETRF computes an LU factorization of a general M-by-N matrix A +* using partial pivoting with row interchanges. +* +* The factorization has the form +* A = P * L * U +* where P is a permutation matrix, L is lower triangular with unit +* diagonal elements (lower trapezoidal if m > n), and U is upper +* triangular (upper trapezoidal if m < n). +* +* This is the right-looking Level 3 BLAS version of the algorithm. +* +* Arguments +* ========= +* +* M (input) INTEGER +* The number of rows of the matrix A. M >= 0. +* +* N (input) INTEGER +* The number of columns of the matrix A. N >= 0. +* +* A (input/output) COMPLEX array, dimension (LDA,N) +* On entry, the M-by-N matrix to be factored. +* On exit, the factors L and U from the factorization +* A = P*L*U; the unit diagonal elements of L are not stored. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,M). 
+* +* IPIV (output) INTEGER array, dimension (min(M,N)) +* The pivot indices; for 1 <= i <= min(M,N), row i of the +* matrix was interchanged with row IPIV(i). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, U(i,i) is exactly zero. The factorization +* has been completed, but the factor U is exactly +* singular, and division by zero will occur if it is used +* to solve a system of equations. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX ONE + PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + INTEGER I, IINFO, J, JB, NB +* .. +* .. External Subroutines .. + EXTERNAL CGEMM, CGETF2, CLASWP, CTRSM, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CGETRF', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* +* Determine the block size for this environment. +* + NB = 64 + IF( NB.LE.1 .OR. NB.GE.MIN( M, N ) ) THEN +* +* Use unblocked code. +* + CALL CGETF2( M, N, A, LDA, IPIV, INFO ) + ELSE +* +* Use blocked code. +* + DO 20 J = 1, MIN( M, N ), NB + JB = MIN( MIN( M, N )-J+1, NB ) +* +* Factor diagonal and subdiagonal blocks and test for exact +* singularity. +* + CALL CGETF2( M-J+1, JB, A( J, J ), LDA, IPIV( J ), IINFO ) +* +* Adjust INFO and the pivot indices. +* + IF( INFO.EQ.0 .AND. IINFO.GT.0 ) + $ INFO = IINFO + J - 1 + DO 10 I = J, MIN( M, J+JB-1 ) + IPIV( I ) = J - 1 + IPIV( I ) + 10 CONTINUE +* +* Apply interchanges to columns 1:J-1. +* + CALL CLASWP( J-1, A, LDA, J, J+JB-1, IPIV, 1 ) +* + IF( J+JB.LE.N ) THEN +* +* Apply interchanges to columns J+JB:N. +* + CALL CLASWP( N-J-JB+1, A( 1, J+JB ), LDA, J, J+JB-1, + $ IPIV, 1 ) +* +* Compute block row of U. +* + CALL CTRSM( 'Left', 'Lower', 'No transpose', 'Unit', JB, + $ N-J-JB+1, ONE, A( J, J ), LDA, A( J, J+JB ), + $ LDA ) + IF( J+JB.LE.M ) THEN +* +* Update trailing submatrix. +* + CALL CGEMM( 'No transpose', 'No transpose', M-J-JB+1, + $ N-J-JB+1, JB, -ONE, A( J+JB, J ), LDA, + $ A( J, J+JB ), LDA, ONE, A( J+JB, J+JB ), + $ LDA ) + END IF + END IF + 20 CONTINUE + END IF + RETURN +* +* End of CGETRF +* + END diff --git a/reference/cgetrsf.f b/reference/cgetrsf.f new file mode 100644 index 0000000000..c4f0079c13 --- /dev/null +++ b/reference/cgetrsf.f @@ -0,0 +1,150 @@ + SUBROUTINE CGETRSF( TRANS, N, NRHS, A, LDA, IPIV, B, LDB, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* September 30, 1994 +* +* .. Scalar Arguments .. + CHARACTER TRANS + INTEGER INFO, LDA, LDB, N, NRHS +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + COMPLEX A( LDA, * ), B( LDB, * ) +* .. +* +* Purpose +* ======= +* +* CGETRS solves a system of linear equations +* A * X = B, A**T * X = B, or A**H * X = B +* with a general N-by-N matrix A using the LU factorization computed +* by CGETRF. +* +* Arguments +* ========= +* +* TRANS (input) CHARACTER*1 +* Specifies the form of the system of equations: +* = 'N': A * X = B (No transpose) +* = 'T': A**T * X = B (Transpose) +* = 'C': A**H * X = B (Conjugate transpose) +* +* N (input) INTEGER +* The order of the matrix A. 
N >= 0. +* +* NRHS (input) INTEGER +* The number of right hand sides, i.e., the number of columns +* of the matrix B. NRHS >= 0. +* +* A (input) COMPLEX array, dimension (LDA,N) +* The factors L and U from the factorization A = P*L*U +* as computed by CGETRF. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* IPIV (input) INTEGER array, dimension (N) +* The pivot indices from CGETRF; for 1<=i<=N, row i of the +* matrix was interchanged with row IPIV(i). +* +* B (input/output) COMPLEX array, dimension (LDB,NRHS) +* On entry, the right hand side matrix B. +* On exit, the solution matrix X. +* +* LDB (input) INTEGER +* The leading dimension of the array B. LDB >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX ONE + PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL NOTRAN +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL CLASWP, CTRSM, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + NOTRAN = LSAME( TRANS, 'N' ) .OR. LSAME(TRANS, 'R') + IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. + $ LSAME( TRANS, 'C' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -3 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -5 + ELSE IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -8 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CGETRS', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 .OR. NRHS.EQ.0 ) + $ RETURN +* + IF( NOTRAN ) THEN +* +* Solve A * X = B. +* +* Apply row interchanges to the right hand sides. +* + CALL CLASWP( NRHS, B, LDB, 1, N, IPIV, 1 ) +* +* Solve L*X = B, overwriting B with X. +* + CALL CTRSM( 'Left', 'Lower', TRANS, 'Unit', N, NRHS, + $ ONE, A, LDA, B, LDB ) +* +* Solve U*X = B, overwriting B with X. +* + CALL CTRSM( 'Left', 'Upper', TRANS, 'Non-unit', N, + $ NRHS, ONE, A, LDA, B, LDB ) + ELSE +* +* Solve A**T * X = B or A**H * X = B. +* +* Solve U'*X = B, overwriting B with X. +* + CALL CTRSM( 'Left', 'Upper', TRANS, 'Non-unit', N, NRHS, ONE, + $ A, LDA, B, LDB ) +* +* Solve L'*X = B, overwriting B with X. +* + CALL CTRSM( 'Left', 'Lower', TRANS, 'Unit', N, NRHS, ONE, A, + $ LDA, B, LDB ) +* +* Apply row interchanges to the solution vectors. +* + CALL CLASWP( NRHS, B, LDB, 1, N, IPIV, -1 ) + END IF +* + RETURN +* +* End of CGETRS +* + END diff --git a/reference/chbmvf.f b/reference/chbmvf.f new file mode 100644 index 0000000000..85285c4b83 --- /dev/null +++ b/reference/chbmvf.f @@ -0,0 +1,309 @@ + SUBROUTINE CHBMVF( UPLO, N, K, ALPHA, A, LDA, X, INCX, + $ BETA, Y, INCY ) +* .. Scalar Arguments .. + COMPLEX ALPHA, BETA + INTEGER INCX, INCY, K, LDA, N + CHARACTER*1 UPLO +* .. Array Arguments .. + COMPLEX A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* CHBMV performs the matrix-vector operation +* +* y := alpha*A*x + beta*y, +* +* where alpha and beta are scalars, x and y are n element vectors and +* A is an n by n hermitian band matrix, with k super-diagonals. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. 
+* On entry, UPLO specifies whether the upper or lower +* triangular part of the band matrix A is being supplied as +* follows: +* +* UPLO = 'U' or 'u' The upper triangular part of A is +* being supplied. +* +* UPLO = 'L' or 'l' The lower triangular part of A is +* being supplied. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry, K specifies the number of super-diagonals of the +* matrix A. K must satisfy 0 .le. K. +* Unchanged on exit. +* +* ALPHA - COMPLEX . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) +* by n part of the array A must contain the upper triangular +* band part of the hermitian matrix, supplied column by +* column, with the leading diagonal of the matrix in row +* ( k + 1 ) of the array, the first super-diagonal starting at +* position 2 in row k, and so on. The top left k by k triangle +* of the array A is not referenced. +* The following program segment will transfer the upper +* triangular part of a hermitian band matrix from conventional +* full matrix storage to band storage: +* +* DO 20, J = 1, N +* M = K + 1 - J +* DO 10, I = MAX( 1, J - K ), J +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) +* by n part of the array A must contain the lower triangular +* band part of the hermitian matrix, supplied column by +* column, with the leading diagonal of the matrix in row 1 of +* the array, the first sub-diagonal starting at position 1 in +* row 2, and so on. The bottom right k by k triangle of the +* array A is not referenced. +* The following program segment will transfer the lower +* triangular part of a hermitian band matrix from conventional +* full matrix storage to band storage: +* +* DO 20, J = 1, N +* M = 1 - J +* DO 10, I = J, MIN( N, J + K ) +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Note that the imaginary parts of the diagonal elements need +* not be set and are assumed to be zero. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* ( k + 1 ). +* Unchanged on exit. +* +* X - COMPLEX array of DIMENSION at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the +* vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA - COMPLEX . +* On entry, BETA specifies the scalar beta. +* Unchanged on exit. +* +* Y - COMPLEX array of DIMENSION at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the +* vector y. On exit, Y is overwritten by the updated vector y. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX ONE + PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. Local Scalars .. 
+ COMPLEX TEMP1, TEMP2 + INTEGER I, INFO, IX, IY, J, JX, JY, KPLUS1, KX, KY, L +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC CONJG, MAX, MIN, REAL +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( K.LT.0 )THEN + INFO = 3 + ELSE IF( LDA.LT.( K + 1 ) )THEN + INFO = 6 + ELSE IF( INCX.EQ.0 )THEN + INFO = 8 + ELSE IF( INCY.EQ.0 )THEN + INFO = 11 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'CHBMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* Set up the start points in X and Y. +* + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( N - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( N - 1 )*INCY + END IF +* +* Start the operations. In this version the elements of the array A +* are accessed sequentially with one pass through A. +* +* First form y := beta*y. +* + IF( BETA.NE.ONE )THEN + IF( INCY.EQ.1 )THEN + IF( BETA.EQ.ZERO )THEN + DO 10, I = 1, N + Y( I ) = ZERO + 10 CONTINUE + ELSE + DO 20, I = 1, N + Y( I ) = BETA*Y( I ) + 20 CONTINUE + END IF + ELSE + IY = KY + IF( BETA.EQ.ZERO )THEN + DO 30, I = 1, N + Y( IY ) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40, I = 1, N + Y( IY ) = BETA*Y( IY ) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF( ALPHA.EQ.ZERO ) + $ RETURN + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form y when upper triangle of A is stored. +* + KPLUS1 = K + 1 + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 60, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + L = KPLUS1 - J + DO 50, I = MAX( 1, J - K ), J - 1 + Y( I ) = Y( I ) + TEMP1*A( L + I, J ) + TEMP2 = TEMP2 + CONJG( A( L + I, J ) )*X( I ) + 50 CONTINUE + Y( J ) = Y( J ) + TEMP1*REAL( A( KPLUS1, J ) ) + $ + ALPHA*TEMP2 + 60 CONTINUE + ELSE + JX = KX + JY = KY + DO 80, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + IX = KX + IY = KY + L = KPLUS1 - J + DO 70, I = MAX( 1, J - K ), J - 1 + Y( IY ) = Y( IY ) + TEMP1*A( L + I, J ) + TEMP2 = TEMP2 + CONJG( A( L + I, J ) )*X( IX ) + IX = IX + INCX + IY = IY + INCY + 70 CONTINUE + Y( JY ) = Y( JY ) + TEMP1*REAL( A( KPLUS1, J ) ) + $ + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + IF( J.GT.K )THEN + KX = KX + INCX + KY = KY + INCY + END IF + 80 CONTINUE + END IF + ELSE +* +* Form y when lower triangle of A is stored. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 100, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + Y( J ) = Y( J ) + TEMP1*REAL( A( 1, J ) ) + L = 1 - J + DO 90, I = J + 1, MIN( N, J + K ) + Y( I ) = Y( I ) + TEMP1*A( L + I, J ) + TEMP2 = TEMP2 + CONJG( A( L + I, J ) )*X( I ) + 90 CONTINUE + Y( J ) = Y( J ) + ALPHA*TEMP2 + 100 CONTINUE + ELSE + JX = KX + JY = KY + DO 120, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + Y( JY ) = Y( JY ) + TEMP1*REAL( A( 1, J ) ) + L = 1 - J + IX = JX + IY = JY + DO 110, I = J + 1, MIN( N, J + K ) + IX = IX + INCX + IY = IY + INCY + Y( IY ) = Y( IY ) + TEMP1*A( L + I, J ) + TEMP2 = TEMP2 + CONJG( A( L + I, J ) )*X( IX ) + 110 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + 120 CONTINUE + END IF + END IF +* + RETURN +* +* End of CHBMV . 
+* + END diff --git a/reference/chemm3mf.f b/reference/chemm3mf.f new file mode 100644 index 0000000000..7fd2e6e22e --- /dev/null +++ b/reference/chemm3mf.f @@ -0,0 +1,304 @@ + SUBROUTINE CHEMM3MF ( SIDE, UPLO, M, N, ALPHA, A, LDA, B, LDB, + $ BETA, C, LDC ) +* .. Scalar Arguments .. + CHARACTER*1 SIDE, UPLO + INTEGER M, N, LDA, LDB, LDC + COMPLEX ALPHA, BETA +* .. Array Arguments .. + COMPLEX A( LDA, * ), B( LDB, * ), C( LDC, * ) +* .. +* +* Purpose +* ======= +* +* CHEMM performs one of the matrix-matrix operations +* +* C := alpha*A*B + beta*C, +* +* or +* +* C := alpha*B*A + beta*C, +* +* where alpha and beta are scalars, A is an hermitian matrix and B and +* C are m by n matrices. +* +* Parameters +* ========== +* +* SIDE - CHARACTER*1. +* On entry, SIDE specifies whether the hermitian matrix A +* appears on the left or right in the operation as follows: +* +* SIDE = 'L' or 'l' C := alpha*A*B + beta*C, +* +* SIDE = 'R' or 'r' C := alpha*B*A + beta*C, +* +* Unchanged on exit. +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the hermitian matrix A is to be +* referenced as follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of the +* hermitian matrix is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of the +* hermitian matrix is to be referenced. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix C. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix C. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX array of DIMENSION ( LDA, ka ), where ka is +* m when SIDE = 'L' or 'l' and is n otherwise. +* Before entry with SIDE = 'L' or 'l', the m by m part of +* the array A must contain the hermitian matrix, such that +* when UPLO = 'U' or 'u', the leading m by m upper triangular +* part of the array A must contain the upper triangular part +* of the hermitian matrix and the strictly lower triangular +* part of A is not referenced, and when UPLO = 'L' or 'l', +* the leading m by m lower triangular part of the array A +* must contain the lower triangular part of the hermitian +* matrix and the strictly upper triangular part of A is not +* referenced. +* Before entry with SIDE = 'R' or 'r', the n by n part of +* the array A must contain the hermitian matrix, such that +* when UPLO = 'U' or 'u', the leading n by n upper triangular +* part of the array A must contain the upper triangular part +* of the hermitian matrix and the strictly lower triangular +* part of A is not referenced, and when UPLO = 'L' or 'l', +* the leading n by n lower triangular part of the array A +* must contain the lower triangular part of the hermitian +* matrix and the strictly upper triangular part of A is not +* referenced. +* Note that the imaginary parts of the diagonal elements need +* not be set, they are assumed to be zero. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When SIDE = 'L' or 'l' then +* LDA must be at least max( 1, m ), otherwise LDA must be at +* least max( 1, n ). +* Unchanged on exit. +* +* B - COMPLEX array of DIMENSION ( LDB, n ). +* Before entry, the leading m by n part of the array B must +* contain the matrix B. +* Unchanged on exit. +* +* LDB - INTEGER. 
+* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. LDB must be at least +* max( 1, m ). +* Unchanged on exit. +* +* BETA - COMPLEX . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then C need not be set on input. +* Unchanged on exit. +* +* C - COMPLEX array of DIMENSION ( LDC, n ). +* Before entry, the leading m by n part of the array C must +* contain the matrix C, except when beta is zero, in which +* case C need not be set on entry. +* On exit, the array C is overwritten by the m by n updated +* matrix. +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC CONJG, MAX, REAL +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, INFO, J, K, NROWA + COMPLEX TEMP1, TEMP2 +* .. Parameters .. + COMPLEX ONE + PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Executable Statements .. +* +* Set NROWA as the number of rows of A. +* + IF( LSAME( SIDE, 'L' ) )THEN + NROWA = M + ELSE + NROWA = N + END IF + UPPER = LSAME( UPLO, 'U' ) +* +* Test the input parameters. +* + INFO = 0 + IF( ( .NOT.LSAME( SIDE, 'L' ) ).AND. + $ ( .NOT.LSAME( SIDE, 'R' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO, 'L' ) ) )THEN + INFO = 2 + ELSE IF( M .LT.0 )THEN + INFO = 3 + ELSE IF( N .LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 7 + ELSE IF( LDB.LT.MAX( 1, M ) )THEN + INFO = 9 + ELSE IF( LDC.LT.MAX( 1, M ) )THEN + INFO = 12 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'CHEMM3M', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. + $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* And when alpha.eq.zero. +* + IF( ALPHA.EQ.ZERO )THEN + IF( BETA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, M + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40, J = 1, N + DO 30, I = 1, M + C( I, J ) = BETA*C( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + RETURN + END IF +* +* Start the operations. +* + IF( LSAME( SIDE, 'L' ) )THEN +* +* Form C := alpha*A*B + beta*C. +* + IF( UPPER )THEN + DO 70, J = 1, N + DO 60, I = 1, M + TEMP1 = ALPHA*B( I, J ) + TEMP2 = ZERO + DO 50, K = 1, I - 1 + C( K, J ) = C( K, J ) + TEMP1*A( K, I ) + TEMP2 = TEMP2 + + $ B( K, J )*CONJG( A( K, I ) ) + 50 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = TEMP1*REAL( A( I, I ) ) + + $ ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ TEMP1*REAL( A( I, I ) ) + + $ ALPHA*TEMP2 + END IF + 60 CONTINUE + 70 CONTINUE + ELSE + DO 100, J = 1, N + DO 90, I = M, 1, -1 + TEMP1 = ALPHA*B( I, J ) + TEMP2 = ZERO + DO 80, K = I + 1, M + C( K, J ) = C( K, J ) + TEMP1*A( K, I ) + TEMP2 = TEMP2 + + $ B( K, J )*CONJG( A( K, I ) ) + 80 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = TEMP1*REAL( A( I, I ) ) + + $ ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ TEMP1*REAL( A( I, I ) ) + + $ ALPHA*TEMP2 + END IF + 90 CONTINUE + 100 CONTINUE + END IF + ELSE +* +* Form C := alpha*B*A + beta*C. 
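+*
+*        Column j of C is scaled by beta and updated with
+*        alpha*real( A( j, j ) )*B( :, j ); the remaining terms add
+*        alpha times the ( k, j ) element of the hermitian matrix A,
+*        read from the stored triangle and conjugated when
+*        necessary, times column k of B.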
+* + DO 170, J = 1, N + TEMP1 = ALPHA*REAL( A( J, J ) ) + IF( BETA.EQ.ZERO )THEN + DO 110, I = 1, M + C( I, J ) = TEMP1*B( I, J ) + 110 CONTINUE + ELSE + DO 120, I = 1, M + C( I, J ) = BETA*C( I, J ) + TEMP1*B( I, J ) + 120 CONTINUE + END IF + DO 140, K = 1, J - 1 + IF( UPPER )THEN + TEMP1 = ALPHA*A( K, J ) + ELSE + TEMP1 = ALPHA*CONJG( A( J, K ) ) + END IF + DO 130, I = 1, M + C( I, J ) = C( I, J ) + TEMP1*B( I, K ) + 130 CONTINUE + 140 CONTINUE + DO 160, K = J + 1, N + IF( UPPER )THEN + TEMP1 = ALPHA*CONJG( A( J, K ) ) + ELSE + TEMP1 = ALPHA*A( K, J ) + END IF + DO 150, I = 1, M + C( I, J ) = C( I, J ) + TEMP1*B( I, K ) + 150 CONTINUE + 160 CONTINUE + 170 CONTINUE + END IF +* + RETURN +* +* End of CHEMM . +* + END diff --git a/reference/chemmf.f b/reference/chemmf.f new file mode 100644 index 0000000000..ccb9b0a6a0 --- /dev/null +++ b/reference/chemmf.f @@ -0,0 +1,304 @@ + SUBROUTINE CHEMMF ( SIDE, UPLO, M, N, ALPHA, A, LDA, B, LDB, + $ BETA, C, LDC ) +* .. Scalar Arguments .. + CHARACTER*1 SIDE, UPLO + INTEGER M, N, LDA, LDB, LDC + COMPLEX ALPHA, BETA +* .. Array Arguments .. + COMPLEX A( LDA, * ), B( LDB, * ), C( LDC, * ) +* .. +* +* Purpose +* ======= +* +* CHEMM performs one of the matrix-matrix operations +* +* C := alpha*A*B + beta*C, +* +* or +* +* C := alpha*B*A + beta*C, +* +* where alpha and beta are scalars, A is an hermitian matrix and B and +* C are m by n matrices. +* +* Parameters +* ========== +* +* SIDE - CHARACTER*1. +* On entry, SIDE specifies whether the hermitian matrix A +* appears on the left or right in the operation as follows: +* +* SIDE = 'L' or 'l' C := alpha*A*B + beta*C, +* +* SIDE = 'R' or 'r' C := alpha*B*A + beta*C, +* +* Unchanged on exit. +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the hermitian matrix A is to be +* referenced as follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of the +* hermitian matrix is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of the +* hermitian matrix is to be referenced. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix C. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix C. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX array of DIMENSION ( LDA, ka ), where ka is +* m when SIDE = 'L' or 'l' and is n otherwise. +* Before entry with SIDE = 'L' or 'l', the m by m part of +* the array A must contain the hermitian matrix, such that +* when UPLO = 'U' or 'u', the leading m by m upper triangular +* part of the array A must contain the upper triangular part +* of the hermitian matrix and the strictly lower triangular +* part of A is not referenced, and when UPLO = 'L' or 'l', +* the leading m by m lower triangular part of the array A +* must contain the lower triangular part of the hermitian +* matrix and the strictly upper triangular part of A is not +* referenced. 
+* Before entry with SIDE = 'R' or 'r', the n by n part of +* the array A must contain the hermitian matrix, such that +* when UPLO = 'U' or 'u', the leading n by n upper triangular +* part of the array A must contain the upper triangular part +* of the hermitian matrix and the strictly lower triangular +* part of A is not referenced, and when UPLO = 'L' or 'l', +* the leading n by n lower triangular part of the array A +* must contain the lower triangular part of the hermitian +* matrix and the strictly upper triangular part of A is not +* referenced. +* Note that the imaginary parts of the diagonal elements need +* not be set, they are assumed to be zero. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When SIDE = 'L' or 'l' then +* LDA must be at least max( 1, m ), otherwise LDA must be at +* least max( 1, n ). +* Unchanged on exit. +* +* B - COMPLEX array of DIMENSION ( LDB, n ). +* Before entry, the leading m by n part of the array B must +* contain the matrix B. +* Unchanged on exit. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. LDB must be at least +* max( 1, m ). +* Unchanged on exit. +* +* BETA - COMPLEX . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then C need not be set on input. +* Unchanged on exit. +* +* C - COMPLEX array of DIMENSION ( LDC, n ). +* Before entry, the leading m by n part of the array C must +* contain the matrix C, except when beta is zero, in which +* case C need not be set on entry. +* On exit, the array C is overwritten by the m by n updated +* matrix. +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC CONJG, MAX, REAL +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, INFO, J, K, NROWA + COMPLEX TEMP1, TEMP2 +* .. Parameters .. + COMPLEX ONE + PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Executable Statements .. +* +* Set NROWA as the number of rows of A. +* + IF( LSAME( SIDE, 'L' ) )THEN + NROWA = M + ELSE + NROWA = N + END IF + UPPER = LSAME( UPLO, 'U' ) +* +* Test the input parameters. +* + INFO = 0 + IF( ( .NOT.LSAME( SIDE, 'L' ) ).AND. + $ ( .NOT.LSAME( SIDE, 'R' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO, 'L' ) ) )THEN + INFO = 2 + ELSE IF( M .LT.0 )THEN + INFO = 3 + ELSE IF( N .LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 7 + ELSE IF( LDB.LT.MAX( 1, M ) )THEN + INFO = 9 + ELSE IF( LDC.LT.MAX( 1, M ) )THEN + INFO = 12 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'CHEMM3M', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. + $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* And when alpha.eq.zero. 
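+*
+*        When alpha is zero the product term vanishes, so C is just
+*        scaled by beta (or set to zero when beta is also zero) and
+*        the routine returns.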
+* + IF( ALPHA.EQ.ZERO )THEN + IF( BETA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, M + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40, J = 1, N + DO 30, I = 1, M + C( I, J ) = BETA*C( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + RETURN + END IF +* +* Start the operations. +* + IF( LSAME( SIDE, 'L' ) )THEN +* +* Form C := alpha*A*B + beta*C. +* + IF( UPPER )THEN + DO 70, J = 1, N + DO 60, I = 1, M + TEMP1 = ALPHA*B( I, J ) + TEMP2 = ZERO + DO 50, K = 1, I - 1 + C( K, J ) = C( K, J ) + TEMP1*A( K, I ) + TEMP2 = TEMP2 + + $ B( K, J )*CONJG( A( K, I ) ) + 50 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = TEMP1*REAL( A( I, I ) ) + + $ ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ TEMP1*REAL( A( I, I ) ) + + $ ALPHA*TEMP2 + END IF + 60 CONTINUE + 70 CONTINUE + ELSE + DO 100, J = 1, N + DO 90, I = M, 1, -1 + TEMP1 = ALPHA*B( I, J ) + TEMP2 = ZERO + DO 80, K = I + 1, M + C( K, J ) = C( K, J ) + TEMP1*A( K, I ) + TEMP2 = TEMP2 + + $ B( K, J )*CONJG( A( K, I ) ) + 80 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = TEMP1*REAL( A( I, I ) ) + + $ ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ TEMP1*REAL( A( I, I ) ) + + $ ALPHA*TEMP2 + END IF + 90 CONTINUE + 100 CONTINUE + END IF + ELSE +* +* Form C := alpha*B*A + beta*C. +* + DO 170, J = 1, N + TEMP1 = ALPHA*REAL( A( J, J ) ) + IF( BETA.EQ.ZERO )THEN + DO 110, I = 1, M + C( I, J ) = TEMP1*B( I, J ) + 110 CONTINUE + ELSE + DO 120, I = 1, M + C( I, J ) = BETA*C( I, J ) + TEMP1*B( I, J ) + 120 CONTINUE + END IF + DO 140, K = 1, J - 1 + IF( UPPER )THEN + TEMP1 = ALPHA*A( K, J ) + ELSE + TEMP1 = ALPHA*CONJG( A( J, K ) ) + END IF + DO 130, I = 1, M + C( I, J ) = C( I, J ) + TEMP1*B( I, K ) + 130 CONTINUE + 140 CONTINUE + DO 160, K = J + 1, N + IF( UPPER )THEN + TEMP1 = ALPHA*CONJG( A( J, K ) ) + ELSE + TEMP1 = ALPHA*A( K, J ) + END IF + DO 150, I = 1, M + C( I, J ) = C( I, J ) + TEMP1*B( I, K ) + 150 CONTINUE + 160 CONTINUE + 170 CONTINUE + END IF +* + RETURN +* +* End of CHEMM . +* + END diff --git a/reference/chemvf.f b/reference/chemvf.f new file mode 100644 index 0000000000..6ce567d830 --- /dev/null +++ b/reference/chemvf.f @@ -0,0 +1,349 @@ + SUBROUTINE CHEMVF ( UPLO, N, ALPHA, A, LDA, X, INCX, + $ BETA, Y, INCY ) +* .. Scalar Arguments .. + COMPLEX ALPHA, BETA + INTEGER INCX, INCY, LDA, N + CHARACTER*1 UPLO +* .. Array Arguments .. + COMPLEX A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* CHEMV performs the matrix-vector operation +* +* y := alpha*A*x + beta*y, +* +* where alpha and beta are scalars, x and y are n element vectors and +* A is an n by n hermitian matrix. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array A is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of A +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of A +* is to be referenced. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array A must contain the upper +* triangular part of the hermitian matrix and the strictly +* lower triangular part of A is not referenced. 
+* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array A must contain the lower +* triangular part of the hermitian matrix and the strictly +* upper triangular part of A is not referenced. +* Note that the imaginary parts of the diagonal elements need +* not be set and are assumed to be zero. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, n ). +* Unchanged on exit. +* +* X - COMPLEX array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA - COMPLEX . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then Y need not be set on input. +* Unchanged on exit. +* +* Y - COMPLEX array of dimension at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. On exit, Y is overwritten by the updated +* vector y. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX ONE + PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. Local Scalars .. + COMPLEX TEMP1, TEMP2 + INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC CONJG, MAX, REAL +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ).AND. + $ .NOT.LSAME( UPLO, 'V' ).AND. + $ .NOT.LSAME( UPLO, 'M' ))THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( LDA.LT.MAX( 1, N ) )THEN + INFO = 5 + ELSE IF( INCX.EQ.0 )THEN + INFO = 7 + ELSE IF( INCY.EQ.0 )THEN + INFO = 10 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'CHEMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* Set up the start points in X and Y. +* + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( N - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( N - 1 )*INCY + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through the triangular part +* of A. +* +* First form y := beta*y. +* + IF( BETA.NE.ONE )THEN + IF( INCY.EQ.1 )THEN + IF( BETA.EQ.ZERO )THEN + DO 10, I = 1, N + Y( I ) = ZERO + 10 CONTINUE + ELSE + DO 20, I = 1, N + Y( I ) = BETA*Y( I ) + 20 CONTINUE + END IF + ELSE + IY = KY + IF( BETA.EQ.ZERO )THEN + DO 30, I = 1, N + Y( IY ) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40, I = 1, N + Y( IY ) = BETA*Y( IY ) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF( ALPHA.EQ.ZERO ) + $ RETURN + + + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form y when A is stored in upper triangle. 
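+*
+*        The columns of A are swept once.  For column j, TEMP1 =
+*        alpha*x( j ) is spread into y( i ), i = 1, j-1, using
+*        A( i, j ), while TEMP2 accumulates conjg( A( i, j ) )*x( i ),
+*        which supplies the contribution of the unstored lower
+*        triangle; only the real part of A( j, j ) is used.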
+* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 60, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + DO 50, I = 1, J - 1 + Y( I ) = Y( I ) + TEMP1*A( I, J ) + TEMP2 = TEMP2 + CONJG( A( I, J ) )*X( I ) + 50 CONTINUE + Y( J ) = Y( J ) + TEMP1*REAL( A( J, J ) ) + ALPHA*TEMP2 + 60 CONTINUE + ELSE + JX = KX + JY = KY + DO 80, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + IX = KX + IY = KY + DO 70, I = 1, J - 1 + Y( IY ) = Y( IY ) + TEMP1*A( I, J ) + TEMP2 = TEMP2 + CONJG( A( I, J ) )*X( IX ) + IX = IX + INCX + IY = IY + INCY + 70 CONTINUE + Y( JY ) = Y( JY ) + TEMP1*REAL( A( J, J ) ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + 80 CONTINUE + END IF + RETURN + ENDIF + + IF( LSAME( UPLO, 'L' ) )THEN +* +* Form y when A is stored in lower triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 100, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + Y( J ) = Y( J ) + TEMP1*REAL( A( J, J ) ) + DO 90, I = J + 1, N + Y( I ) = Y( I ) + TEMP1*A( I, J ) + TEMP2 = TEMP2 + CONJG( A( I, J ) )*X( I ) + 90 CONTINUE + Y( J ) = Y( J ) + ALPHA*TEMP2 + 100 CONTINUE + ELSE + JX = KX + JY = KY + DO 120, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + Y( JY ) = Y( JY ) + TEMP1*REAL( A( J, J ) ) + IX = JX + IY = JY + DO 110, I = J + 1, N + IX = IX + INCX + IY = IY + INCY + Y( IY ) = Y( IY ) + TEMP1*A( I, J ) + TEMP2 = TEMP2 + CONJG( A( I, J ) )*X( IX ) + 110 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + 120 CONTINUE + END IF + RETURN + END IF + + IF( LSAME( UPLO, 'V' ) )THEN +* +* Form y when A is stored in upper triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 160, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + DO 150, I = 1, J - 1 + Y( I ) = Y( I ) + TEMP1* CONJG(A( I, J )) + TEMP2 = TEMP2 + A( I, J )*X( I ) + 150 CONTINUE + Y( J ) = Y( J ) + TEMP1*REAL( A( J, J ) ) + ALPHA*TEMP2 + 160 CONTINUE + ELSE + JX = KX + JY = KY + DO 180, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + IX = KX + IY = KY + DO 170, I = 1, J - 1 + Y( IY ) = Y( IY ) + TEMP1* CONJG(A( I, J )) + TEMP2 = TEMP2 + A( I, J )*X( IX ) + IX = IX + INCX + IY = IY + INCY + 170 CONTINUE + Y( JY ) = Y( JY ) + TEMP1*REAL( A( J, J ) ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + 180 CONTINUE + END IF + RETURN + ENDIF + + + IF( LSAME( UPLO, 'M' ) )THEN +* +* Form y when A is stored in lower triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 200, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + Y( J ) = Y( J ) + TEMP1*REAL( A( J, J ) ) + DO 190, I = J + 1, N + Y( I ) = Y( I ) + TEMP1*CONJG(A( I, J )) + TEMP2 = TEMP2 + A( I, J )*X( I ) + 190 CONTINUE + Y( J ) = Y( J ) + ALPHA*TEMP2 + 200 CONTINUE + ELSE + JX = KX + JY = KY + DO 220, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + Y( JY ) = Y( JY ) + TEMP1*REAL( A( J, J ) ) + IX = JX + IY = JY + DO 210, I = J + 1, N + IX = IX + INCX + IY = IY + INCY + Y( IY ) = Y( IY ) + TEMP1*CONJG(A( I, J )) + TEMP2 = TEMP2 + A( I, J )*X( IX ) + 210 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + 220 CONTINUE + END IF + RETURN + END IF + +* +* +* End of CHEMV . +* + END diff --git a/reference/cher2f.f b/reference/cher2f.f new file mode 100644 index 0000000000..096709a540 --- /dev/null +++ b/reference/cher2f.f @@ -0,0 +1,249 @@ + SUBROUTINE CHER2F ( UPLO, N, ALPHA, X, INCX, Y, INCY, A, LDA ) +* .. Scalar Arguments .. + COMPLEX ALPHA + INTEGER INCX, INCY, LDA, N + CHARACTER*1 UPLO +* .. Array Arguments .. + COMPLEX A( LDA, * ), X( * ), Y( * ) +* .. 
+* +* Purpose +* ======= +* +* CHER2 performs the hermitian rank 2 operation +* +* A := alpha*x*conjg( y' ) + conjg( alpha )*y*conjg( x' ) + A, +* +* where alpha is a scalar, x and y are n element vectors and A is an n +* by n hermitian matrix. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array A is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of A +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of A +* is to be referenced. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X - COMPLEX array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* Y - COMPLEX array of dimension at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. +* Unchanged on exit. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* A - COMPLEX array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array A must contain the upper +* triangular part of the hermitian matrix and the strictly +* lower triangular part of A is not referenced. On exit, the +* upper triangular part of the array A is overwritten by the +* upper triangular part of the updated matrix. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array A must contain the lower +* triangular part of the hermitian matrix and the strictly +* upper triangular part of A is not referenced. On exit, the +* lower triangular part of the array A is overwritten by the +* lower triangular part of the updated matrix. +* Note that the imaginary parts of the diagonal elements need +* not be set, they are assumed to be zero, and on exit they +* are set to zero. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, n ). +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. Local Scalars .. + COMPLEX TEMP1, TEMP2 + INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC CONJG, MAX, REAL +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 5 + ELSE IF( INCY.EQ.0 )THEN + INFO = 7 + ELSE IF( LDA.LT.MAX( 1, N ) )THEN + INFO = 9 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'CHER2 ', INFO ) + RETURN + END IF +* +* Quick return if possible. 
+* + IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) + $ RETURN +* +* Set up the start points in X and Y if the increments are not both +* unity. +* + IF( ( INCX.NE.1 ).OR.( INCY.NE.1 ) )THEN + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( N - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( N - 1 )*INCY + END IF + JX = KX + JY = KY + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through the triangular part +* of A. +* + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form A when A is stored in the upper triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 20, J = 1, N + IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN + TEMP1 = ALPHA*CONJG( Y( J ) ) + TEMP2 = CONJG( ALPHA*X( J ) ) + DO 10, I = 1, J - 1 + A( I, J ) = A( I, J ) + X( I )*TEMP1 + Y( I )*TEMP2 + 10 CONTINUE + A( J, J ) = REAL( A( J, J ) ) + + $ REAL( X( J )*TEMP1 + Y( J )*TEMP2 ) + ELSE + A( J, J ) = REAL( A( J, J ) ) + END IF + 20 CONTINUE + ELSE + DO 40, J = 1, N + IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN + TEMP1 = ALPHA*CONJG( Y( JY ) ) + TEMP2 = CONJG( ALPHA*X( JX ) ) + IX = KX + IY = KY + DO 30, I = 1, J - 1 + A( I, J ) = A( I, J ) + X( IX )*TEMP1 + $ + Y( IY )*TEMP2 + IX = IX + INCX + IY = IY + INCY + 30 CONTINUE + A( J, J ) = REAL( A( J, J ) ) + + $ REAL( X( JX )*TEMP1 + Y( JY )*TEMP2 ) + ELSE + A( J, J ) = REAL( A( J, J ) ) + END IF + JX = JX + INCX + JY = JY + INCY + 40 CONTINUE + END IF + ELSE +* +* Form A when A is stored in the lower triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 60, J = 1, N + IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN + TEMP1 = ALPHA*CONJG( Y( J ) ) + TEMP2 = CONJG( ALPHA*X( J ) ) + A( J, J ) = REAL( A( J, J ) ) + + $ REAL( X( J )*TEMP1 + Y( J )*TEMP2 ) + DO 50, I = J + 1, N + A( I, J ) = A( I, J ) + X( I )*TEMP1 + Y( I )*TEMP2 + 50 CONTINUE + ELSE + A( J, J ) = REAL( A( J, J ) ) + END IF + 60 CONTINUE + ELSE + DO 80, J = 1, N + IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN + TEMP1 = ALPHA*CONJG( Y( JY ) ) + TEMP2 = CONJG( ALPHA*X( JX ) ) + A( J, J ) = REAL( A( J, J ) ) + + $ REAL( X( JX )*TEMP1 + Y( JY )*TEMP2 ) + IX = JX + IY = JY + DO 70, I = J + 1, N + IX = IX + INCX + IY = IY + INCY + A( I, J ) = A( I, J ) + X( IX )*TEMP1 + $ + Y( IY )*TEMP2 + 70 CONTINUE + ELSE + A( J, J ) = REAL( A( J, J ) ) + END IF + JX = JX + INCX + JY = JY + INCY + 80 CONTINUE + END IF + END IF +* + RETURN +* +* End of CHER2 . +* + END diff --git a/reference/cher2kf.f b/reference/cher2kf.f new file mode 100644 index 0000000000..935c92d5cb --- /dev/null +++ b/reference/cher2kf.f @@ -0,0 +1,371 @@ + SUBROUTINE CHER2KF( UPLO, TRANS, N, K, ALPHA, A, LDA, B, LDB, + $ BETA, C, LDC ) +* .. Scalar Arguments .. + CHARACTER*1 UPLO, TRANS + INTEGER N, K, LDA, LDB, LDC + REAL BETA + COMPLEX ALPHA +* .. Array Arguments .. + COMPLEX A( LDA, * ), B( LDB, * ), C( LDC, * ) +* .. +* +* Purpose +* ======= +* +* CHER2K performs one of the hermitian rank 2k operations +* +* C := alpha*A*conjg( B' ) + conjg( alpha )*B*conjg( A' ) + beta*C, +* +* or +* +* C := alpha*conjg( A' )*B + conjg( alpha )*conjg( B' )*A + beta*C, +* +* where alpha and beta are scalars with beta real, C is an n by n +* hermitian matrix and A and B are n by k matrices in the first case +* and k by n matrices in the second case. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. 
+* On entry, UPLO specifies whether the upper or lower +* triangular part of the array C is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of C +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of C +* is to be referenced. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' C := alpha*A*conjg( B' ) + +* conjg( alpha )*B*conjg( A' ) + +* beta*C. +* +* TRANS = 'C' or 'c' C := alpha*conjg( A' )*B + +* conjg( alpha )*conjg( B' )*A + +* beta*C. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix C. N must be +* at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry with TRANS = 'N' or 'n', K specifies the number +* of columns of the matrices A and B, and on entry with +* TRANS = 'C' or 'c', K specifies the number of rows of the +* matrices A and B. K must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX array of DIMENSION ( LDA, ka ), where ka is +* k when TRANS = 'N' or 'n', and is n otherwise. +* Before entry with TRANS = 'N' or 'n', the leading n by k +* part of the array A must contain the matrix A, otherwise +* the leading k by n part of the array A must contain the +* matrix A. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When TRANS = 'N' or 'n' +* then LDA must be at least max( 1, n ), otherwise LDA must +* be at least max( 1, k ). +* Unchanged on exit. +* +* B - COMPLEX array of DIMENSION ( LDB, kb ), where kb is +* k when TRANS = 'N' or 'n', and is n otherwise. +* Before entry with TRANS = 'N' or 'n', the leading n by k +* part of the array B must contain the matrix B, otherwise +* the leading k by n part of the array B must contain the +* matrix B. +* Unchanged on exit. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. When TRANS = 'N' or 'n' +* then LDB must be at least max( 1, n ), otherwise LDB must +* be at least max( 1, k ). +* Unchanged on exit. +* +* BETA - REAL . +* On entry, BETA specifies the scalar beta. +* Unchanged on exit. +* +* C - COMPLEX array of DIMENSION ( LDC, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array C must contain the upper +* triangular part of the hermitian matrix and the strictly +* lower triangular part of C is not referenced. On exit, the +* upper triangular part of the array C is overwritten by the +* upper triangular part of the updated matrix. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array C must contain the lower +* triangular part of the hermitian matrix and the strictly +* upper triangular part of C is not referenced. On exit, the +* lower triangular part of the array C is overwritten by the +* lower triangular part of the updated matrix. +* Note that the imaginary parts of the diagonal elements need +* not be set, they are assumed to be zero, and on exit they +* are set to zero. +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, n ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. 
+* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* -- Modified 8-Nov-93 to set C(J,J) to REAL( C(J,J) ) when BETA = 1. +* Ed Anderson, Cray Research Inc. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC CONJG, MAX, REAL +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, INFO, J, L, NROWA + COMPLEX TEMP1, TEMP2 +* .. Parameters .. + REAL ONE + PARAMETER ( ONE = 1.0E+0 ) + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + IF( LSAME( TRANS, 'N' ) )THEN + NROWA = N + ELSE + NROWA = K + END IF + UPPER = LSAME( UPLO, 'U' ) +* + INFO = 0 + IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO , 'L' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.LSAME( TRANS, 'N' ) ).AND. + $ ( .NOT.LSAME( TRANS, 'C' ) ) )THEN + INFO = 2 + ELSE IF( N .LT.0 )THEN + INFO = 3 + ELSE IF( K .LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 7 + ELSE IF( LDB.LT.MAX( 1, NROWA ) )THEN + INFO = 9 + ELSE IF( LDC.LT.MAX( 1, N ) )THEN + INFO = 12 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'CHER2K', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR. + $ ( ( ( ALPHA.EQ.ZERO ).OR.( K.EQ.0 ) ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* And when alpha.eq.zero. +* + IF( ALPHA.EQ.ZERO )THEN + IF( UPPER )THEN + IF( BETA.EQ.REAL( ZERO ) )THEN + DO 20, J = 1, N + DO 10, I = 1, J + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40, J = 1, N + DO 30, I = 1, J - 1 + C( I, J ) = BETA*C( I, J ) + 30 CONTINUE + C( J, J ) = BETA*REAL( C( J, J ) ) + 40 CONTINUE + END IF + ELSE + IF( BETA.EQ.REAL( ZERO ) )THEN + DO 60, J = 1, N + DO 50, I = J, N + C( I, J ) = ZERO + 50 CONTINUE + 60 CONTINUE + ELSE + DO 80, J = 1, N + C( J, J ) = BETA*REAL( C( J, J ) ) + DO 70, I = J + 1, N + C( I, J ) = BETA*C( I, J ) + 70 CONTINUE + 80 CONTINUE + END IF + END IF + RETURN + END IF +* +* Start the operations. +* + IF( LSAME( TRANS, 'N' ) )THEN +* +* Form C := alpha*A*conjg( B' ) + conjg( alpha )*B*conjg( A' ) + +* C. +* + IF( UPPER )THEN + DO 130, J = 1, N + IF( BETA.EQ.REAL( ZERO ) )THEN + DO 90, I = 1, J + C( I, J ) = ZERO + 90 CONTINUE + ELSE IF( BETA.NE.ONE )THEN + DO 100, I = 1, J - 1 + C( I, J ) = BETA*C( I, J ) + 100 CONTINUE + C( J, J ) = BETA*REAL( C( J, J ) ) + ELSE + C( J, J ) = REAL( C( J, J ) ) + END IF + DO 120, L = 1, K + IF( ( A( J, L ).NE.ZERO ).OR. + $ ( B( J, L ).NE.ZERO ) )THEN + TEMP1 = ALPHA*CONJG( B( J, L ) ) + TEMP2 = CONJG( ALPHA*A( J, L ) ) + DO 110, I = 1, J - 1 + C( I, J ) = C( I, J ) + A( I, L )*TEMP1 + + $ B( I, L )*TEMP2 + 110 CONTINUE + C( J, J ) = REAL( C( J, J ) ) + + $ REAL( A( J, L )*TEMP1 + + $ B( J, L )*TEMP2 ) + END IF + 120 CONTINUE + 130 CONTINUE + ELSE + DO 180, J = 1, N + IF( BETA.EQ.REAL( ZERO ) )THEN + DO 140, I = J, N + C( I, J ) = ZERO + 140 CONTINUE + ELSE IF( BETA.NE.ONE )THEN + DO 150, I = J + 1, N + C( I, J ) = BETA*C( I, J ) + 150 CONTINUE + C( J, J ) = BETA*REAL( C( J, J ) ) + ELSE + C( J, J ) = REAL( C( J, J ) ) + END IF + DO 170, L = 1, K + IF( ( A( J, L ).NE.ZERO ).OR. 
+ $ ( B( J, L ).NE.ZERO ) )THEN + TEMP1 = ALPHA*CONJG( B( J, L ) ) + TEMP2 = CONJG( ALPHA*A( J, L ) ) + DO 160, I = J + 1, N + C( I, J ) = C( I, J ) + A( I, L )*TEMP1 + + $ B( I, L )*TEMP2 + 160 CONTINUE + C( J, J ) = REAL( C( J, J ) ) + + $ REAL( A( J, L )*TEMP1 + + $ B( J, L )*TEMP2 ) + END IF + 170 CONTINUE + 180 CONTINUE + END IF + ELSE +* +* Form C := alpha*conjg( A' )*B + conjg( alpha )*conjg( B' )*A + +* C. +* + IF( UPPER )THEN + DO 210, J = 1, N + DO 200, I = 1, J + TEMP1 = ZERO + TEMP2 = ZERO + DO 190, L = 1, K + TEMP1 = TEMP1 + CONJG( A( L, I ) )*B( L, J ) + TEMP2 = TEMP2 + CONJG( B( L, I ) )*A( L, J ) + 190 CONTINUE + IF( I.EQ.J )THEN + IF( BETA.EQ.REAL( ZERO ) )THEN + C( J, J ) = REAL( ALPHA *TEMP1 + + $ CONJG( ALPHA )*TEMP2 ) + ELSE + C( J, J ) = BETA*REAL( C( J, J ) ) + + $ REAL( ALPHA *TEMP1 + + $ CONJG( ALPHA )*TEMP2 ) + END IF + ELSE + IF( BETA.EQ.REAL( ZERO ) )THEN + C( I, J ) = ALPHA*TEMP1 + CONJG( ALPHA )*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ ALPHA*TEMP1 + CONJG( ALPHA )*TEMP2 + END IF + END IF + 200 CONTINUE + 210 CONTINUE + ELSE + DO 240, J = 1, N + DO 230, I = J, N + TEMP1 = ZERO + TEMP2 = ZERO + DO 220, L = 1, K + TEMP1 = TEMP1 + CONJG( A( L, I ) )*B( L, J ) + TEMP2 = TEMP2 + CONJG( B( L, I ) )*A( L, J ) + 220 CONTINUE + IF( I.EQ.J )THEN + IF( BETA.EQ.REAL( ZERO ) )THEN + C( J, J ) = REAL( ALPHA *TEMP1 + + $ CONJG( ALPHA )*TEMP2 ) + ELSE + C( J, J ) = BETA*REAL( C( J, J ) ) + + $ REAL( ALPHA *TEMP1 + + $ CONJG( ALPHA )*TEMP2 ) + END IF + ELSE + IF( BETA.EQ.REAL( ZERO ) )THEN + C( I, J ) = ALPHA*TEMP1 + CONJG( ALPHA )*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ ALPHA*TEMP1 + CONJG( ALPHA )*TEMP2 + END IF + END IF + 230 CONTINUE + 240 CONTINUE + END IF + END IF +* + RETURN +* +* End of CHER2K. +* + END diff --git a/reference/cherf.f b/reference/cherf.f new file mode 100644 index 0000000000..748ae565ce --- /dev/null +++ b/reference/cherf.f @@ -0,0 +1,212 @@ + SUBROUTINE CHERF ( UPLO, N, ALPHA, X, INCX, A, LDA ) +* .. Scalar Arguments .. + REAL ALPHA + INTEGER INCX, LDA, N + CHARACTER*1 UPLO +* .. Array Arguments .. + COMPLEX A( LDA, * ), X( * ) +* .. +* +* Purpose +* ======= +* +* CHER performs the hermitian rank 1 operation +* +* A := alpha*x*conjg( x' ) + A, +* +* where alpha is a real scalar, x is an n element vector and A is an +* n by n hermitian matrix. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array A is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of A +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of A +* is to be referenced. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - REAL . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X - COMPLEX array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* A - COMPLEX array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array A must contain the upper +* triangular part of the hermitian matrix and the strictly +* lower triangular part of A is not referenced. 
On exit, the +* upper triangular part of the array A is overwritten by the +* upper triangular part of the updated matrix. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array A must contain the lower +* triangular part of the hermitian matrix and the strictly +* upper triangular part of A is not referenced. On exit, the +* lower triangular part of the array A is overwritten by the +* lower triangular part of the updated matrix. +* Note that the imaginary parts of the diagonal elements need +* not be set, they are assumed to be zero, and on exit they +* are set to zero. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, n ). +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. Local Scalars .. + COMPLEX TEMP + INTEGER I, INFO, IX, J, JX, KX +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC CONJG, MAX, REAL +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 5 + ELSE IF( LDA.LT.MAX( 1, N ) )THEN + INFO = 7 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'CHER ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ALPHA.EQ.REAL( ZERO ) ) ) + $ RETURN +* +* Set the start point in X if the increment is not unity. +* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through the triangular part +* of A. +* + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form A when A is stored in upper triangle. +* + IF( INCX.EQ.1 )THEN + DO 20, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = ALPHA*CONJG( X( J ) ) + DO 10, I = 1, J - 1 + A( I, J ) = A( I, J ) + X( I )*TEMP + 10 CONTINUE + A( J, J ) = REAL( A( J, J ) ) + REAL( X( J )*TEMP ) + ELSE + A( J, J ) = REAL( A( J, J ) ) + END IF + 20 CONTINUE + ELSE + JX = KX + DO 40, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*CONJG( X( JX ) ) + IX = KX + DO 30, I = 1, J - 1 + A( I, J ) = A( I, J ) + X( IX )*TEMP + IX = IX + INCX + 30 CONTINUE + A( J, J ) = REAL( A( J, J ) ) + REAL( X( JX )*TEMP ) + ELSE + A( J, J ) = REAL( A( J, J ) ) + END IF + JX = JX + INCX + 40 CONTINUE + END IF + ELSE +* +* Form A when A is stored in lower triangle. +* + IF( INCX.EQ.1 )THEN + DO 60, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = ALPHA*CONJG( X( J ) ) + A( J, J ) = REAL( A( J, J ) ) + REAL( TEMP*X( J ) ) + DO 50, I = J + 1, N + A( I, J ) = A( I, J ) + X( I )*TEMP + 50 CONTINUE + ELSE + A( J, J ) = REAL( A( J, J ) ) + END IF + 60 CONTINUE + ELSE + JX = KX + DO 80, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*CONJG( X( JX ) ) + A( J, J ) = REAL( A( J, J ) ) + REAL( TEMP*X( JX ) ) + IX = JX + DO 70, I = J + 1, N + IX = IX + INCX + A( I, J ) = A( I, J ) + X( IX )*TEMP + 70 CONTINUE + ELSE + A( J, J ) = REAL( A( J, J ) ) + END IF + JX = JX + INCX + 80 CONTINUE + END IF + END IF +* + RETURN +* +* End of CHER . 
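+*
+*     A small worked instance of the update above, with illustrative
+*     values: for n = 2, alpha = 1 and x = ( 1, (0,1) ), the term
+*     alpha*x*conjg( x' ) equals
+*
+*         (  1  -i )
+*         (  i   1 ),
+*
+*     so the updated A stays hermitian and, as the code enforces,
+*     its diagonal remains real.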
+* + END diff --git a/reference/cherkf.f b/reference/cherkf.f new file mode 100644 index 0000000000..e3d0157e32 --- /dev/null +++ b/reference/cherkf.f @@ -0,0 +1,328 @@ + SUBROUTINE CHERKF ( UPLO, TRANS, N, K, ALPHA, A, LDA, + $ BETA, C, LDC ) +* .. Scalar Arguments .. + CHARACTER*1 UPLO, TRANS + INTEGER N, K, LDA, LDC + REAL ALPHA, BETA +* .. Array Arguments .. + COMPLEX A( LDA, * ), C( LDC, * ) +* .. +* +* Purpose +* ======= +* +* CHERK performs one of the hermitian rank k operations +* +* C := alpha*A*conjg( A' ) + beta*C, +* +* or +* +* C := alpha*conjg( A' )*A + beta*C, +* +* where alpha and beta are real scalars, C is an n by n hermitian +* matrix and A is an n by k matrix in the first case and a k by n +* matrix in the second case. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array C is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of C +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of C +* is to be referenced. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' C := alpha*A*conjg( A' ) + beta*C. +* +* TRANS = 'C' or 'c' C := alpha*conjg( A' )*A + beta*C. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix C. N must be +* at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry with TRANS = 'N' or 'n', K specifies the number +* of columns of the matrix A, and on entry with +* TRANS = 'C' or 'c', K specifies the number of rows of the +* matrix A. K must be at least zero. +* Unchanged on exit. +* +* ALPHA - REAL . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX array of DIMENSION ( LDA, ka ), where ka is +* k when TRANS = 'N' or 'n', and is n otherwise. +* Before entry with TRANS = 'N' or 'n', the leading n by k +* part of the array A must contain the matrix A, otherwise +* the leading k by n part of the array A must contain the +* matrix A. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When TRANS = 'N' or 'n' +* then LDA must be at least max( 1, n ), otherwise LDA must +* be at least max( 1, k ). +* Unchanged on exit. +* +* BETA - REAL . +* On entry, BETA specifies the scalar beta. +* Unchanged on exit. +* +* C - COMPLEX array of DIMENSION ( LDC, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array C must contain the upper +* triangular part of the hermitian matrix and the strictly +* lower triangular part of C is not referenced. On exit, the +* upper triangular part of the array C is overwritten by the +* upper triangular part of the updated matrix. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array C must contain the lower +* triangular part of the hermitian matrix and the strictly +* upper triangular part of C is not referenced. On exit, the +* lower triangular part of the array C is overwritten by the +* lower triangular part of the updated matrix. +* Note that the imaginary parts of the diagonal elements need +* not be set, they are assumed to be zero, and on exit they +* are set to zero. +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, n ). 
+* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* -- Modified 8-Nov-93 to set C(J,J) to REAL( C(J,J) ) when BETA = 1. +* Ed Anderson, Cray Research Inc. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC CMPLX, CONJG, MAX, REAL +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, INFO, J, L, NROWA + REAL RTEMP + COMPLEX TEMP +* .. Parameters .. + REAL ONE , ZERO + PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + IF( LSAME( TRANS, 'N' ) )THEN + NROWA = N + ELSE + NROWA = K + END IF + UPPER = LSAME( UPLO, 'U' ) +* + INFO = 0 + IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO , 'L' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.LSAME( TRANS, 'N' ) ).AND. + $ ( .NOT.LSAME( TRANS, 'C' ) ) )THEN + INFO = 2 + ELSE IF( N .LT.0 )THEN + INFO = 3 + ELSE IF( K .LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 7 + ELSE IF( LDC.LT.MAX( 1, N ) )THEN + INFO = 10 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'CHERK ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR. + $ ( ( ( ALPHA.EQ.ZERO ).OR.( K.EQ.0 ) ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* And when alpha.eq.zero. +* + IF( ALPHA.EQ.ZERO )THEN + IF( UPPER )THEN + IF( BETA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, J + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40, J = 1, N + DO 30, I = 1, J - 1 + C( I, J ) = BETA*C( I, J ) + 30 CONTINUE + C( J, J ) = BETA*REAL( C( J, J ) ) + 40 CONTINUE + END IF + ELSE + IF( BETA.EQ.ZERO )THEN + DO 60, J = 1, N + DO 50, I = J, N + C( I, J ) = ZERO + 50 CONTINUE + 60 CONTINUE + ELSE + DO 80, J = 1, N + C( J, J ) = BETA*REAL( C( J, J ) ) + DO 70, I = J + 1, N + C( I, J ) = BETA*C( I, J ) + 70 CONTINUE + 80 CONTINUE + END IF + END IF + RETURN + END IF +* +* Start the operations. +* + IF( LSAME( TRANS, 'N' ) )THEN +* +* Form C := alpha*A*conjg( A' ) + beta*C. +* + IF( UPPER )THEN + DO 130, J = 1, N + IF( BETA.EQ.ZERO )THEN + DO 90, I = 1, J + C( I, J ) = ZERO + 90 CONTINUE + ELSE IF( BETA.NE.ONE )THEN + DO 100, I = 1, J - 1 + C( I, J ) = BETA*C( I, J ) + 100 CONTINUE + C( J, J ) = BETA*REAL( C( J, J ) ) + ELSE + C( J, J ) = REAL( C( J, J ) ) + END IF + DO 120, L = 1, K + IF( A( J, L ).NE.CMPLX( ZERO ) )THEN + TEMP = ALPHA*CONJG( A( J, L ) ) + DO 110, I = 1, J - 1 + C( I, J ) = C( I, J ) + TEMP*A( I, L ) + 110 CONTINUE + C( J, J ) = REAL( C( J, J ) ) + + $ REAL( TEMP*A( I, L ) ) + END IF + 120 CONTINUE + 130 CONTINUE + ELSE + DO 180, J = 1, N + IF( BETA.EQ.ZERO )THEN + DO 140, I = J, N + C( I, J ) = ZERO + 140 CONTINUE + ELSE IF( BETA.NE.ONE )THEN + C( J, J ) = BETA*REAL( C( J, J ) ) + DO 150, I = J + 1, N + C( I, J ) = BETA*C( I, J ) + 150 CONTINUE + ELSE + C( J, J ) = REAL( C( J, J ) ) + END IF + DO 170, L = 1, K + IF( A( J, L ).NE.CMPLX( ZERO ) )THEN + TEMP = ALPHA*CONJG( A( J, L ) ) + C( J, J ) = REAL( C( J, J ) ) + + $ REAL( TEMP*A( J, L ) ) + DO 160, I = J + 1, N + C( I, J ) = C( I, J ) + TEMP*A( I, L ) + 160 CONTINUE + END IF + 170 CONTINUE + 180 CONTINUE + END IF + ELSE +* +* Form C := alpha*conjg( A' )*A + beta*C. 
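+*     In outline: each entry of C is here a dot product of columns
+*     of A,
+*         c( i, j ) = alpha*sum( conjg( a( l, i ) )*a( l, j ),
+*                                l = 1, k ) + beta*c( i, j ),
+*     computed in TEMP for i .NE. j.  The diagonal is accumulated in
+*     the real scalar RTEMP, because conjg( a( l, j ) )*a( l, j ) has
+*     no imaginary part, and beta multiplies REAL( c( j, j ) ), so
+*     the diagonal of C is kept exactly real.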
+* + IF( UPPER )THEN + DO 220, J = 1, N + DO 200, I = 1, J - 1 + TEMP = ZERO + DO 190, L = 1, K + TEMP = TEMP + CONJG( A( L, I ) )*A( L, J ) + 190 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = ALPHA*TEMP + ELSE + C( I, J ) = ALPHA*TEMP + BETA*C( I, J ) + END IF + 200 CONTINUE + RTEMP = ZERO + DO 210, L = 1, K + RTEMP = RTEMP + CONJG( A( L, J ) )*A( L, J ) + 210 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( J, J ) = ALPHA*RTEMP + ELSE + C( J, J ) = ALPHA*RTEMP + BETA*REAL( C( J, J ) ) + END IF + 220 CONTINUE + ELSE + DO 260, J = 1, N + RTEMP = ZERO + DO 230, L = 1, K + RTEMP = RTEMP + CONJG( A( L, J ) )*A( L, J ) + 230 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( J, J ) = ALPHA*RTEMP + ELSE + C( J, J ) = ALPHA*RTEMP + BETA*REAL( C( J, J ) ) + END IF + DO 250, I = J + 1, N + TEMP = ZERO + DO 240, L = 1, K + TEMP = TEMP + CONJG( A( L, I ) )*A( L, J ) + 240 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = ALPHA*TEMP + ELSE + C( I, J ) = ALPHA*TEMP + BETA*C( I, J ) + END IF + 250 CONTINUE + 260 CONTINUE + END IF + END IF +* + RETURN +* +* End of CHERK . +* + END diff --git a/reference/chpmvf.f b/reference/chpmvf.f new file mode 100644 index 0000000000..9f65105d62 --- /dev/null +++ b/reference/chpmvf.f @@ -0,0 +1,270 @@ + SUBROUTINE CHPMVF( UPLO, N, ALPHA, AP, X, INCX, BETA, Y, INCY ) +* .. Scalar Arguments .. + COMPLEX ALPHA, BETA + INTEGER INCX, INCY, N + CHARACTER*1 UPLO +* .. Array Arguments .. + COMPLEX AP( * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* CHPMV performs the matrix-vector operation +* +* y := alpha*A*x + beta*y, +* +* where alpha and beta are scalars, x and y are n element vectors and +* A is an n by n hermitian matrix, supplied in packed form. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the matrix A is supplied in the packed +* array AP as follows: +* +* UPLO = 'U' or 'u' The upper triangular part of A is +* supplied in AP. +* +* UPLO = 'L' or 'l' The lower triangular part of A is +* supplied in AP. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* AP - COMPLEX array of DIMENSION at least +* ( ( n*( n + 1 ) )/2 ). +* Before entry with UPLO = 'U' or 'u', the array AP must +* contain the upper triangular part of the hermitian matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) +* and a( 2, 2 ) respectively, and so on. +* Before entry with UPLO = 'L' or 'l', the array AP must +* contain the lower triangular part of the hermitian matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) +* and a( 3, 1 ) respectively, and so on. +* Note that the imaginary parts of the diagonal elements need +* not be set and are assumed to be zero. +* Unchanged on exit. +* +* X - COMPLEX array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA - COMPLEX . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then Y need not be set on input. +* Unchanged on exit. 
+* +* Y - COMPLEX array of dimension at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. On exit, Y is overwritten by the updated +* vector y. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX ONE + PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. Local Scalars .. + COMPLEX TEMP1, TEMP2 + INTEGER I, INFO, IX, IY, J, JX, JY, K, KK, KX, KY +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC CONJG, REAL +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 6 + ELSE IF( INCY.EQ.0 )THEN + INFO = 9 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'CHPMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* Set up the start points in X and Y. +* + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( N - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( N - 1 )*INCY + END IF +* +* Start the operations. In this version the elements of the array AP +* are accessed sequentially with one pass through AP. +* +* First form y := beta*y. +* + IF( BETA.NE.ONE )THEN + IF( INCY.EQ.1 )THEN + IF( BETA.EQ.ZERO )THEN + DO 10, I = 1, N + Y( I ) = ZERO + 10 CONTINUE + ELSE + DO 20, I = 1, N + Y( I ) = BETA*Y( I ) + 20 CONTINUE + END IF + ELSE + IY = KY + IF( BETA.EQ.ZERO )THEN + DO 30, I = 1, N + Y( IY ) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40, I = 1, N + Y( IY ) = BETA*Y( IY ) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF( ALPHA.EQ.ZERO ) + $ RETURN + KK = 1 + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form y when AP contains the upper triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 60, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + K = KK + DO 50, I = 1, J - 1 + Y( I ) = Y( I ) + TEMP1*AP( K ) + TEMP2 = TEMP2 + CONJG( AP( K ) )*X( I ) + K = K + 1 + 50 CONTINUE + Y( J ) = Y( J ) + TEMP1*REAL( AP( KK + J - 1 ) ) + $ + ALPHA*TEMP2 + KK = KK + J + 60 CONTINUE + ELSE + JX = KX + JY = KY + DO 80, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + IX = KX + IY = KY + DO 70, K = KK, KK + J - 2 + Y( IY ) = Y( IY ) + TEMP1*AP( K ) + TEMP2 = TEMP2 + CONJG( AP( K ) )*X( IX ) + IX = IX + INCX + IY = IY + INCY + 70 CONTINUE + Y( JY ) = Y( JY ) + TEMP1*REAL( AP( KK + J - 1 ) ) + $ + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + KK = KK + J + 80 CONTINUE + END IF + ELSE +* +* Form y when AP contains the lower triangle. 
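+*     In packed lower storage column j occupies AP( KK ), ...,
+*     AP( KK + N - J ), with AP( KK ) holding a( j, j ), and KK is
+*     advanced by N - J + 1 after each column.  For example, with
+*     n = 3 the packed order is
+*         AP = ( a(1,1), a(2,1), a(3,1), a(2,2), a(3,2), a(3,3) ).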
+* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 100, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + Y( J ) = Y( J ) + TEMP1*REAL( AP( KK ) ) + K = KK + 1 + DO 90, I = J + 1, N + Y( I ) = Y( I ) + TEMP1*AP( K ) + TEMP2 = TEMP2 + CONJG( AP( K ) )*X( I ) + K = K + 1 + 90 CONTINUE + Y( J ) = Y( J ) + ALPHA*TEMP2 + KK = KK + ( N - J + 1 ) + 100 CONTINUE + ELSE + JX = KX + JY = KY + DO 120, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + Y( JY ) = Y( JY ) + TEMP1*REAL( AP( KK ) ) + IX = JX + IY = JY + DO 110, K = KK + 1, KK + N - J + IX = IX + INCX + IY = IY + INCY + Y( IY ) = Y( IY ) + TEMP1*AP( K ) + TEMP2 = TEMP2 + CONJG( AP( K ) )*X( IX ) + 110 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + KK = KK + ( N - J + 1 ) + 120 CONTINUE + END IF + END IF +* + RETURN +* +* End of CHPMV . +* + END diff --git a/reference/chpr2f.f b/reference/chpr2f.f new file mode 100644 index 0000000000..64f8fe96ca --- /dev/null +++ b/reference/chpr2f.f @@ -0,0 +1,251 @@ + SUBROUTINE CHPR2F( UPLO, N, ALPHA, X, INCX, Y, INCY, AP ) +* .. Scalar Arguments .. + COMPLEX ALPHA + INTEGER INCX, INCY, N + CHARACTER*1 UPLO +* .. Array Arguments .. + COMPLEX AP( * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* CHPR2 performs the hermitian rank 2 operation +* +* A := alpha*x*conjg( y' ) + conjg( alpha )*y*conjg( x' ) + A, +* +* where alpha is a scalar, x and y are n element vectors and A is an +* n by n hermitian matrix, supplied in packed form. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the matrix A is supplied in the packed +* array AP as follows: +* +* UPLO = 'U' or 'u' The upper triangular part of A is +* supplied in AP. +* +* UPLO = 'L' or 'l' The lower triangular part of A is +* supplied in AP. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X - COMPLEX array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* Y - COMPLEX array of dimension at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. +* Unchanged on exit. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* AP - COMPLEX array of DIMENSION at least +* ( ( n*( n + 1 ) )/2 ). +* Before entry with UPLO = 'U' or 'u', the array AP must +* contain the upper triangular part of the hermitian matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) +* and a( 2, 2 ) respectively, and so on. On exit, the array +* AP is overwritten by the upper triangular part of the +* updated matrix. +* Before entry with UPLO = 'L' or 'l', the array AP must +* contain the lower triangular part of the hermitian matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) +* and a( 3, 1 ) respectively, and so on. On exit, the array +* AP is overwritten by the lower triangular part of the +* updated matrix. 
+* Note that the imaginary parts of the diagonal elements need +* not be set, they are assumed to be zero, and on exit they +* are set to zero. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. Local Scalars .. + COMPLEX TEMP1, TEMP2 + INTEGER I, INFO, IX, IY, J, JX, JY, K, KK, KX, KY +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC CONJG, REAL +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 5 + ELSE IF( INCY.EQ.0 )THEN + INFO = 7 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'CHPR2 ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) + $ RETURN +* +* Set up the start points in X and Y if the increments are not both +* unity. +* + IF( ( INCX.NE.1 ).OR.( INCY.NE.1 ) )THEN + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( N - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( N - 1 )*INCY + END IF + JX = KX + JY = KY + END IF +* +* Start the operations. In this version the elements of the array AP +* are accessed sequentially with one pass through AP. +* + KK = 1 + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form A when upper triangle is stored in AP. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 20, J = 1, N + IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN + TEMP1 = ALPHA*CONJG( Y( J ) ) + TEMP2 = CONJG( ALPHA*X( J ) ) + K = KK + DO 10, I = 1, J - 1 + AP( K ) = AP( K ) + X( I )*TEMP1 + Y( I )*TEMP2 + K = K + 1 + 10 CONTINUE + AP( KK + J - 1 ) = REAL( AP( KK + J - 1 ) ) + + $ REAL( X( J )*TEMP1 + Y( J )*TEMP2 ) + ELSE + AP( KK + J - 1 ) = REAL( AP( KK + J - 1 ) ) + END IF + KK = KK + J + 20 CONTINUE + ELSE + DO 40, J = 1, N + IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN + TEMP1 = ALPHA*CONJG( Y( JY ) ) + TEMP2 = CONJG( ALPHA*X( JX ) ) + IX = KX + IY = KY + DO 30, K = KK, KK + J - 2 + AP( K ) = AP( K ) + X( IX )*TEMP1 + Y( IY )*TEMP2 + IX = IX + INCX + IY = IY + INCY + 30 CONTINUE + AP( KK + J - 1 ) = REAL( AP( KK + J - 1 ) ) + + $ REAL( X( JX )*TEMP1 + + $ Y( JY )*TEMP2 ) + ELSE + AP( KK + J - 1 ) = REAL( AP( KK + J - 1 ) ) + END IF + JX = JX + INCX + JY = JY + INCY + KK = KK + J + 40 CONTINUE + END IF + ELSE +* +* Form A when lower triangle is stored in AP. 
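+*     In outline: for each column j the packed segment AP( KK ), ...,
+*     AP( KK + N - J ) receives x( j:n )*TEMP1 + y( j:n )*TEMP2, with
+*     TEMP1 = alpha*conjg( y( j ) ) and TEMP2 = conjg( alpha*x( j ) );
+*     only the real part of the diagonal update is kept, so AP( KK )
+*     is real on exit, and KK then advances by N - J + 1.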
+* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 60, J = 1, N + IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN + TEMP1 = ALPHA*CONJG( Y( J ) ) + TEMP2 = CONJG( ALPHA*X( J ) ) + AP( KK ) = REAL( AP( KK ) ) + + $ REAL( X( J )*TEMP1 + Y( J )*TEMP2 ) + K = KK + 1 + DO 50, I = J + 1, N + AP( K ) = AP( K ) + X( I )*TEMP1 + Y( I )*TEMP2 + K = K + 1 + 50 CONTINUE + ELSE + AP( KK ) = REAL( AP( KK ) ) + END IF + KK = KK + N - J + 1 + 60 CONTINUE + ELSE + DO 80, J = 1, N + IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN + TEMP1 = ALPHA*CONJG( Y( JY ) ) + TEMP2 = CONJG( ALPHA*X( JX ) ) + AP( KK ) = REAL( AP( KK ) ) + + $ REAL( X( JX )*TEMP1 + Y( JY )*TEMP2 ) + IX = JX + IY = JY + DO 70, K = KK + 1, KK + N - J + IX = IX + INCX + IY = IY + INCY + AP( K ) = AP( K ) + X( IX )*TEMP1 + Y( IY )*TEMP2 + 70 CONTINUE + ELSE + AP( KK ) = REAL( AP( KK ) ) + END IF + JX = JX + INCX + JY = JY + INCY + KK = KK + N - J + 1 + 80 CONTINUE + END IF + END IF +* + RETURN +* +* End of CHPR2 . +* + END diff --git a/reference/chprf.f b/reference/chprf.f new file mode 100644 index 0000000000..6d1d380caa --- /dev/null +++ b/reference/chprf.f @@ -0,0 +1,217 @@ + SUBROUTINE CHPRF ( UPLO, N, ALPHA, X, INCX, AP ) +* .. Scalar Arguments .. + REAL ALPHA + INTEGER INCX, N + CHARACTER*1 UPLO +* .. Array Arguments .. + COMPLEX AP( * ), X( * ) +* .. +* +* Purpose +* ======= +* +* CHPR performs the hermitian rank 1 operation +* +* A := alpha*x*conjg( x' ) + A, +* +* where alpha is a real scalar, x is an n element vector and A is an +* n by n hermitian matrix, supplied in packed form. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the matrix A is supplied in the packed +* array AP as follows: +* +* UPLO = 'U' or 'u' The upper triangular part of A is +* supplied in AP. +* +* UPLO = 'L' or 'l' The lower triangular part of A is +* supplied in AP. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - REAL . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X - COMPLEX array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* AP - COMPLEX array of DIMENSION at least +* ( ( n*( n + 1 ) )/2 ). +* Before entry with UPLO = 'U' or 'u', the array AP must +* contain the upper triangular part of the hermitian matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) +* and a( 2, 2 ) respectively, and so on. On exit, the array +* AP is overwritten by the upper triangular part of the +* updated matrix. +* Before entry with UPLO = 'L' or 'l', the array AP must +* contain the lower triangular part of the hermitian matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) +* and a( 3, 1 ) respectively, and so on. On exit, the array +* AP is overwritten by the lower triangular part of the +* updated matrix. +* Note that the imaginary parts of the diagonal elements need +* not be set, they are assumed to be zero, and on exit they +* are set to zero. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. 
+* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. Local Scalars .. + COMPLEX TEMP + INTEGER I, INFO, IX, J, JX, K, KK, KX +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC CONJG, REAL +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 5 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'CHPR ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ALPHA.EQ.REAL( ZERO ) ) ) + $ RETURN +* +* Set the start point in X if the increment is not unity. +* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of the array AP +* are accessed sequentially with one pass through AP. +* + KK = 1 + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form A when upper triangle is stored in AP. +* + IF( INCX.EQ.1 )THEN + DO 20, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = ALPHA*CONJG( X( J ) ) + K = KK + DO 10, I = 1, J - 1 + AP( K ) = AP( K ) + X( I )*TEMP + K = K + 1 + 10 CONTINUE + AP( KK + J - 1 ) = REAL( AP( KK + J - 1 ) ) + $ + REAL( X( J )*TEMP ) + ELSE + AP( KK + J - 1 ) = REAL( AP( KK + J - 1 ) ) + END IF + KK = KK + J + 20 CONTINUE + ELSE + JX = KX + DO 40, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*CONJG( X( JX ) ) + IX = KX + DO 30, K = KK, KK + J - 2 + AP( K ) = AP( K ) + X( IX )*TEMP + IX = IX + INCX + 30 CONTINUE + AP( KK + J - 1 ) = REAL( AP( KK + J - 1 ) ) + $ + REAL( X( JX )*TEMP ) + ELSE + AP( KK + J - 1 ) = REAL( AP( KK + J - 1 ) ) + END IF + JX = JX + INCX + KK = KK + J + 40 CONTINUE + END IF + ELSE +* +* Form A when lower triangle is stored in AP. +* + IF( INCX.EQ.1 )THEN + DO 60, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = ALPHA*CONJG( X( J ) ) + AP( KK ) = REAL( AP( KK ) ) + REAL( TEMP*X( J ) ) + K = KK + 1 + DO 50, I = J + 1, N + AP( K ) = AP( K ) + X( I )*TEMP + K = K + 1 + 50 CONTINUE + ELSE + AP( KK ) = REAL( AP( KK ) ) + END IF + KK = KK + N - J + 1 + 60 CONTINUE + ELSE + JX = KX + DO 80, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*CONJG( X( JX ) ) + AP( KK ) = REAL( AP( KK ) ) + REAL( TEMP*X( JX ) ) + IX = JX + DO 70, K = KK + 1, KK + N - J + IX = IX + INCX + AP( K ) = AP( K ) + X( IX )*TEMP + 70 CONTINUE + ELSE + AP( KK ) = REAL( AP( KK ) ) + END IF + JX = JX + INCX + KK = KK + N - J + 1 + 80 CONTINUE + END IF + END IF +* + RETURN +* +* End of CHPR . +* + END diff --git a/reference/claswpf.f b/reference/claswpf.f new file mode 100644 index 0000000000..4d47e4f752 --- /dev/null +++ b/reference/claswpf.f @@ -0,0 +1,120 @@ + SUBROUTINE CLASWPF( N, A, LDA, K1, K2, IPIV, INCX ) +* +* -- LAPACK auxiliary routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* June 30, 1999 +* +* .. Scalar Arguments .. + INTEGER INCX, K1, K2, LDA, N +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + COMPLEX A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* CLASWP performs a series of row interchanges on the matrix A. +* One row interchange is initiated for each of rows K1 through K2 of A. 
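+*     For instance, with illustrative values K1 = 1, K2 = 3,
+*     INCX = 1 and IPIV = ( 3, 3, 3 ), row 1 is first swapped with
+*     row 3, then row 2 with row 3, and row 3 is left in place, so
+*     the pivots recorded by an LU factorization are applied one at
+*     a time rather than as a single permutation matrix.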
+* +* Arguments +* ========= +* +* N (input) INTEGER +* The number of columns of the matrix A. +* +* A (input/output) COMPLEX array, dimension (LDA,N) +* On entry, the matrix of column dimension N to which the row +* interchanges will be applied. +* On exit, the permuted matrix. +* +* LDA (input) INTEGER +* The leading dimension of the array A. +* +* K1 (input) INTEGER +* The first element of IPIV for which a row interchange will +* be done. +* +* K2 (input) INTEGER +* The last element of IPIV for which a row interchange will +* be done. +* +* IPIV (input) INTEGER array, dimension (M*abs(INCX)) +* The vector of pivot indices. Only the elements in positions +* K1 through K2 of IPIV are accessed. +* IPIV(K) = L implies rows K and L are to be interchanged. +* +* INCX (input) INTEGER +* The increment between successive values of IPIV. If IPIV +* is negative, the pivots are applied in reverse order. +* +* Further Details +* =============== +* +* Modified by +* R. C. Whaley, Computer Science Dept., Univ. of Tenn., Knoxville, USA +* +* ===================================================================== +* +* .. Local Scalars .. + INTEGER I, I1, I2, INC, IP, IX, IX0, J, K, N32 + COMPLEX TEMP +* .. +* .. Executable Statements .. +* +* Interchange row I with row IPIV(I) for each of rows K1 through K2. +* + IF( INCX.GT.0 ) THEN + IX0 = K1 + I1 = K1 + I2 = K2 + INC = 1 + ELSE IF( INCX.LT.0 ) THEN + IX0 = 1 + ( 1-K2 )*INCX + I1 = K2 + I2 = K1 + INC = -1 + ELSE + RETURN + END IF +* + N32 = ( N / 32 )*32 + IF( N32.NE.0 ) THEN + DO 30 J = 1, N32, 32 + IX = IX0 + DO 20 I = I1, I2, INC + IP = IPIV( IX ) + IF( IP.NE.I ) THEN + DO 10 K = J, J + 31 + TEMP = A( I, K ) + A( I, K ) = A( IP, K ) + A( IP, K ) = TEMP + 10 CONTINUE + END IF + IX = IX + INCX + 20 CONTINUE + 30 CONTINUE + END IF + IF( N32.NE.N ) THEN + N32 = N32 + 1 + IX = IX0 + DO 50 I = I1, I2, INC + IP = IPIV( IX ) + IF( IP.NE.I ) THEN + DO 40 K = N32, N + TEMP = A( I, K ) + A( I, K ) = A( IP, K ) + A( IP, K ) = TEMP + 40 CONTINUE + END IF + IX = IX + INCX + 50 CONTINUE + END IF +* + RETURN +* +* End of CLASWP +* + END diff --git a/reference/clauu2f.f b/reference/clauu2f.f new file mode 100644 index 0000000000..4bb87250c6 --- /dev/null +++ b/reference/clauu2f.f @@ -0,0 +1,143 @@ + SUBROUTINE CLAUU2F( UPLO, N, A, LDA, INFO ) +* +* -- LAPACK auxiliary routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* November 2006 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* CLAUU2 computes the product U * U' or L' * L, where the triangular +* factor U or L is stored in the upper or lower triangular part of +* the array A. +* +* If UPLO = 'U' or 'u' then the upper triangle of the result is stored, +* overwriting the factor U in A. +* If UPLO = 'L' or 'l' then the lower triangle of the result is stored, +* overwriting the factor L in A. +* +* This is the unblocked form of the algorithm, calling Level 2 BLAS. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* Specifies whether the triangular factor stored in the array A +* is upper or lower triangular: +* = 'U': Upper triangular +* = 'L': Lower triangular +* +* N (input) INTEGER +* The order of the triangular factor U or L. N >= 0. +* +* A (input/output) COMPLEX array, dimension (LDA,N) +* On entry, the triangular factor U or L. 
+* On exit, if UPLO = 'U', the upper triangle of A is +* overwritten with the upper triangle of the product U * U'; +* if UPLO = 'L', the lower triangle of A is overwritten with +* the lower triangle of the product L' * L. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -k, the k-th argument had an illegal value +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX ONE + PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I + REAL AII +* .. +* .. External Functions .. + LOGICAL LSAME + COMPLEX CDOTC + EXTERNAL LSAME, CDOTC +* .. +* .. External Subroutines .. + EXTERNAL CGEMV, CLACGV, CSSCAL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC CMPLX, MAX, REAL +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CLAUU2', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* + IF( UPPER ) THEN +* +* Compute the product U * U'. +* + DO 10 I = 1, N + AII = A( I, I ) + IF( I.LT.N ) THEN + A( I, I ) = AII*AII + REAL( CDOTC( N-I, A( I, I+1 ), LDA, + $ A( I, I+1 ), LDA ) ) + CALL CLACGV( N-I, A( I, I+1 ), LDA ) + CALL CGEMV( 'No transpose', I-1, N-I, ONE, A( 1, I+1 ), + $ LDA, A( I, I+1 ), LDA, CMPLX( AII ), + $ A( 1, I ), 1 ) + CALL CLACGV( N-I, A( I, I+1 ), LDA ) + ELSE + CALL CSSCAL( I, AII, A( 1, I ), 1 ) + END IF + 10 CONTINUE +* + ELSE +* +* Compute the product L' * L. +* + DO 20 I = 1, N + AII = A( I, I ) + IF( I.LT.N ) THEN + A( I, I ) = AII*AII + REAL( CDOTC( N-I, A( I+1, I ), 1, + $ A( I+1, I ), 1 ) ) + CALL CLACGV( I-1, A( I, 1 ), LDA ) + CALL CGEMV( 'Conjugate transpose', N-I, I-1, ONE, + $ A( I+1, 1 ), LDA, A( I+1, I ), 1, + $ CMPLX( AII ), A( I, 1 ), LDA ) + CALL CLACGV( I-1, A( I, 1 ), LDA ) + ELSE + CALL CSSCAL( I, AII, A( I, 1 ), LDA ) + END IF + 20 CONTINUE + END IF +* + RETURN +* +* End of CLAUU2 +* + END diff --git a/reference/clauumf.f b/reference/clauumf.f new file mode 100644 index 0000000000..9b57fe11d2 --- /dev/null +++ b/reference/clauumf.f @@ -0,0 +1,161 @@ + SUBROUTINE CLAUUMF( UPLO, N, A, LDA, INFO ) +* +* -- LAPACK auxiliary routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* September 30, 1994 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* CLAUUM computes the product U * U' or L' * L, where the triangular +* factor U or L is stored in the upper or lower triangular part of +* the array A. +* +* If UPLO = 'U' or 'u' then the upper triangle of the result is stored, +* overwriting the factor U in A. +* If UPLO = 'L' or 'l' then the lower triangle of the result is stored, +* overwriting the factor L in A. +* +* This is the blocked form of the algorithm, calling Level 3 BLAS. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* Specifies whether the triangular factor stored in the array A +* is upper or lower triangular: +* = 'U': Upper triangular +* = 'L': Lower triangular +* +* N (input) INTEGER +* The order of the triangular factor U or L. N >= 0. 
+* +* A (input/output) COMPLEX array, dimension (LDA,N) +* On entry, the triangular factor U or L. +* On exit, if UPLO = 'U', the upper triangle of A is +* overwritten with the upper triangle of the product U * U'; +* if UPLO = 'L', the lower triangle of A is overwritten with +* the lower triangle of the product L' * L. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -k, the k-th argument had an illegal value +* +* ===================================================================== +* +* .. Parameters .. + REAL ONE + PARAMETER ( ONE = 1.0E+0 ) + COMPLEX CONE + PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, IB, NB +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + EXTERNAL LSAME, ILAENV +* .. +* .. External Subroutines .. + EXTERNAL CGEMM, CHERK, CLAUU2, CTRMM, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CLAUUM', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* +* Determine the block size for this environment. +* + NB = 128 +* + IF( NB.LE.1 .OR. NB.GE.N ) THEN +* +* Use unblocked code +* + CALL CLAUU2( UPLO, N, A, LDA, INFO ) + ELSE +* +* Use blocked code +* + IF( UPPER ) THEN +* +* Compute the product U * U'. +* + DO 10 I = 1, N, NB + IB = MIN( NB, N-I+1 ) + CALL CTRMM( 'Right', 'Upper', 'Conjugate transpose', + $ 'Non-unit', I-1, IB, CONE, A( I, I ), LDA, + $ A( 1, I ), LDA ) + CALL CLAUU2( 'Upper', IB, A( I, I ), LDA, INFO ) + IF( I+IB.LE.N ) THEN + CALL CGEMM( 'No transpose', 'Conjugate transpose', + $ I-1, IB, N-I-IB+1, CONE, A( 1, I+IB ), + $ LDA, A( I, I+IB ), LDA, CONE, A( 1, I ), + $ LDA ) + CALL CHERK( 'Upper', 'No transpose', IB, N-I-IB+1, + $ ONE, A( I, I+IB ), LDA, ONE, A( I, I ), + $ LDA ) + END IF + 10 CONTINUE + ELSE +* +* Compute the product L' * L. +* + DO 20 I = 1, N, NB + IB = MIN( NB, N-I+1 ) + CALL CTRMM( 'Left', 'Lower', 'Conjugate transpose', + $ 'Non-unit', IB, I-1, CONE, A( I, I ), LDA, + $ A( I, 1 ), LDA ) + CALL CLAUU2( 'Lower', IB, A( I, I ), LDA, INFO ) + IF( I+IB.LE.N ) THEN + CALL CGEMM( 'Conjugate transpose', 'No transpose', IB, + $ I-1, N-I-IB+1, CONE, A( I+IB, I ), LDA, + $ A( I+IB, 1 ), LDA, CONE, A( I, 1 ), LDA ) + CALL CHERK( 'Lower', 'Conjugate transpose', IB, + $ N-I-IB+1, ONE, A( I+IB, I ), LDA, ONE, + $ A( I, I ), LDA ) + END IF + 20 CONTINUE + END IF + END IF +* + RETURN +* +* End of CLAUUM +* + END diff --git a/reference/cpotf2f.f b/reference/cpotf2f.f new file mode 100644 index 0000000000..2b451ccd77 --- /dev/null +++ b/reference/cpotf2f.f @@ -0,0 +1,175 @@ + SUBROUTINE CPOTF2F( UPLO, N, A, LDA, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* September 30, 1994 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* CPOTF2 computes the Cholesky factorization of a complex Hermitian +* positive definite matrix A. 
+* +* The factorization has the form +* A = U' * U , if UPLO = 'U', or +* A = L * L', if UPLO = 'L', +* where U is an upper triangular matrix and L is lower triangular. +* +* This is the unblocked version of the algorithm, calling Level 2 BLAS. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* Specifies whether the upper or lower triangular part of the +* Hermitian matrix A is stored. +* = 'U': Upper triangular +* = 'L': Lower triangular +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* A (input/output) COMPLEX array, dimension (LDA,N) +* On entry, the Hermitian matrix A. If UPLO = 'U', the leading +* n by n upper triangular part of A contains the upper +* triangular part of the matrix A, and the strictly lower +* triangular part of A is not referenced. If UPLO = 'L', the +* leading n by n lower triangular part of A contains the lower +* triangular part of the matrix A, and the strictly upper +* triangular part of A is not referenced. +* +* On exit, if INFO = 0, the factor U or L from the Cholesky +* factorization A = U'*U or A = L*L'. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -k, the k-th argument had an illegal value +* > 0: if INFO = k, the leading minor of order k is not +* positive definite, and the factorization could not be +* completed. +* +* ===================================================================== +* +* .. Parameters .. + REAL ONE, ZERO + PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) + COMPLEX CONE + PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL UPPER + INTEGER J + REAL AJJ +* .. +* .. External Functions .. + LOGICAL LSAME + COMPLEX CDOTC + EXTERNAL LSAME, CDOTC +* .. +* .. External Subroutines .. + EXTERNAL CGEMV, CLACGV, CSSCAL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, REAL, SQRT +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CPOTF2', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* + IF( UPPER ) THEN +* +* Compute the Cholesky factorization A = U'*U. +* + DO 10 J = 1, N +* +* Compute U(J,J) and test for non-positive-definiteness. +* + AJJ = REAL( A( J, J ) ) - CDOTC( J-1, A( 1, J ), 1, + $ A( 1, J ), 1 ) + IF( AJJ.LE.ZERO ) THEN + A( J, J ) = AJJ + GO TO 30 + END IF + AJJ = SQRT( AJJ ) + A( J, J ) = AJJ +* +* Compute elements J+1:N of row J. +* + IF( J.LT.N ) THEN + CALL CLACGV( J-1, A( 1, J ), 1 ) + CALL CGEMV( 'Transpose', J-1, N-J, -CONE, A( 1, J+1 ), + $ LDA, A( 1, J ), 1, CONE, A( J, J+1 ), LDA ) + CALL CLACGV( J-1, A( 1, J ), 1 ) + CALL CSSCAL( N-J, ONE / AJJ, A( J, J+1 ), LDA ) + END IF + 10 CONTINUE + ELSE +* +* Compute the Cholesky factorization A = L*L'. +* + DO 20 J = 1, N +* +* Compute L(J,J) and test for non-positive-definiteness. +* + AJJ = REAL( A( J, J ) ) - CDOTC( J-1, A( J, 1 ), LDA, + $ A( J, 1 ), LDA ) + IF( AJJ.LE.ZERO ) THEN + A( J, J ) = AJJ + GO TO 30 + END IF + AJJ = SQRT( AJJ ) + A( J, J ) = AJJ +* +* Compute elements J+1:N of column J. 
+* + IF( J.LT.N ) THEN + CALL CLACGV( J-1, A( J, 1 ), LDA ) + CALL CGEMV( 'No transpose', N-J, J-1, -CONE, A( J+1, 1 ), + $ LDA, A( J, 1 ), LDA, CONE, A( J+1, J ), 1 ) + CALL CLACGV( J-1, A( J, 1 ), LDA ) + CALL CSSCAL( N-J, ONE / AJJ, A( J+1, J ), 1 ) + END IF + 20 CONTINUE + END IF + GO TO 40 +* + 30 CONTINUE + INFO = J +* + 40 CONTINUE + RETURN +* +* End of CPOTF2 +* + END diff --git a/reference/cpotrff.f b/reference/cpotrff.f new file mode 100644 index 0000000000..696de862a1 --- /dev/null +++ b/reference/cpotrff.f @@ -0,0 +1,187 @@ + SUBROUTINE CPOTRFF( UPLO, N, A, LDA, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* September 30, 1994 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* CPOTRF computes the Cholesky factorization of a complex Hermitian +* positive definite matrix A. +* +* The factorization has the form +* A = U**H * U, if UPLO = 'U', or +* A = L * L**H, if UPLO = 'L', +* where U is an upper triangular matrix and L is lower triangular. +* +* This is the block version of the algorithm, calling Level 3 BLAS. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* = 'U': Upper triangle of A is stored; +* = 'L': Lower triangle of A is stored. +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* A (input/output) COMPLEX array, dimension (LDA,N) +* On entry, the Hermitian matrix A. If UPLO = 'U', the leading +* N-by-N upper triangular part of A contains the upper +* triangular part of the matrix A, and the strictly lower +* triangular part of A is not referenced. If UPLO = 'L', the +* leading N-by-N lower triangular part of A contains the lower +* triangular part of the matrix A, and the strictly upper +* triangular part of A is not referenced. +* +* On exit, if INFO = 0, the factor U or L from the Cholesky +* factorization A = U**H*U or A = L*L**H. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, the leading minor of order i is not +* positive definite, and the factorization could not be +* completed. +* +* ===================================================================== +* +* .. Parameters .. + REAL ONE + COMPLEX CONE + PARAMETER ( ONE = 1.0E+0, CONE = ( 1.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL UPPER + INTEGER J, JB, NB +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL CGEMM, CHERK, CPOTF2, CTRSM, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CPOTRF', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* +* Determine the block size for this environment. +* + NB = 56 + + IF( NB.LE.1 .OR. NB.GE.N ) THEN +* +* Use unblocked code. +* + CALL CPOTF2( UPLO, N, A, LDA, INFO ) + ELSE +* +* Use blocked code. +* + IF( UPPER ) THEN +* +* Compute the Cholesky factorization A = U'*U. 
+* + DO 10 J = 1, N, NB +* +* Update and factorize the current diagonal block and test +* for non-positive-definiteness. +* + JB = MIN( NB, N-J+1 ) + CALL CHERK( 'Upper', 'Conjugate transpose', JB, J-1, + $ -ONE, A( 1, J ), LDA, ONE, A( J, J ), LDA ) + CALL CPOTF2( 'Upper', JB, A( J, J ), LDA, INFO ) + IF( INFO.NE.0 ) + $ GO TO 30 + IF( J+JB.LE.N ) THEN +* +* Compute the current block row. +* + CALL CGEMM( 'Conjugate transpose', 'No transpose', JB, + $ N-J-JB+1, J-1, -CONE, A( 1, J ), LDA, + $ A( 1, J+JB ), LDA, CONE, A( J, J+JB ), + $ LDA ) + CALL CTRSM( 'Left', 'Upper', 'Conjugate transpose', + $ 'Non-unit', JB, N-J-JB+1, CONE, A( J, J ), + $ LDA, A( J, J+JB ), LDA ) + END IF + 10 CONTINUE +* + ELSE +* +* Compute the Cholesky factorization A = L*L'. +* + DO 20 J = 1, N, NB +* +* Update and factorize the current diagonal block and test +* for non-positive-definiteness. +* + JB = MIN( NB, N-J+1 ) + CALL CHERK( 'Lower', 'No transpose', JB, J-1, -ONE, + $ A( J, 1 ), LDA, ONE, A( J, J ), LDA ) + CALL CPOTF2( 'Lower', JB, A( J, J ), LDA, INFO ) + IF( INFO.NE.0 ) + $ GO TO 30 + IF( J+JB.LE.N ) THEN +* +* Compute the current block column. +* + CALL CGEMM( 'No transpose', 'Conjugate transpose', + $ N-J-JB+1, JB, J-1, -CONE, A( J+JB, 1 ), + $ LDA, A( J, 1 ), LDA, CONE, A( J+JB, J ), + $ LDA ) + CALL CTRSM( 'Right', 'Lower', 'Conjugate transpose', + $ 'Non-unit', N-J-JB+1, JB, CONE, A( J, J ), + $ LDA, A( J+JB, J ), LDA ) + END IF + 20 CONTINUE + END IF + END IF + GO TO 40 +* + 30 CONTINUE + INFO = INFO + J - 1 +* + 40 CONTINUE + RETURN +* +* End of CPOTRF +* + END diff --git a/reference/cpotrif.f b/reference/cpotrif.f new file mode 100644 index 0000000000..e14b28772e --- /dev/null +++ b/reference/cpotrif.f @@ -0,0 +1,96 @@ + SUBROUTINE CPOTRIF( UPLO, N, A, LDA, INFO ) +* +* -- LAPACK routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* November 2006 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* CPOTRI computes the inverse of a complex Hermitian positive definite +* matrix A using the Cholesky factorization A = U**H*U or A = L*L**H +* computed by CPOTRF. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* = 'U': Upper triangle of A is stored; +* = 'L': Lower triangle of A is stored. +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* A (input/output) COMPLEX array, dimension (LDA,N) +* On entry, the triangular factor U or L from the Cholesky +* factorization A = U**H*U or A = L*L**H, as computed by +* CPOTRF. +* On exit, the upper or lower triangle of the (Hermitian) +* inverse of A, overwriting the input factor U or L. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, the (i,i) element of the factor U or L is +* zero, and the inverse could not be computed. +* +* ===================================================================== +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL CLAUUM, CTRTRI, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( .NOT.LSAME( UPLO, 'U' ) .AND. 
.NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CPOTRI', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* +* Invert the triangular Cholesky factor U or L. +* + CALL CTRTRI( UPLO, 'Non-unit', N, A, LDA, INFO ) + IF( INFO.GT.0 ) + $ RETURN +* +* Form inv(U)*inv(U)' or inv(L)'*inv(L). +* + CALL CLAUUM( UPLO, N, A, LDA, INFO ) +* + RETURN +* +* End of CPOTRI +* + END diff --git a/reference/crotgf.f b/reference/crotgf.f new file mode 100644 index 0000000000..6195133708 --- /dev/null +++ b/reference/crotgf.f @@ -0,0 +1,20 @@ + subroutine crotgf(ca,cb,c,s) + complex ca,cb,s + real c + real norm,scale + complex alpha + if (cabs(ca) .ne. 0.) go to 10 + c = 0. + s = (1.,0.) + ca = cb + go to 20 + 10 continue + scale = cabs(ca) + cabs(cb) + norm = scale * sqrt((cabs(ca/scale))**2 + (cabs(cb/scale))**2) + alpha = ca /cabs(ca) + c = cabs(ca) / norm + s = alpha * conjg(cb) / norm + ca = alpha * norm + 20 continue + return + end diff --git a/reference/csbmvf.f b/reference/csbmvf.f new file mode 100644 index 0000000000..e635af8b28 --- /dev/null +++ b/reference/csbmvf.f @@ -0,0 +1,306 @@ + SUBROUTINE CSBMVF(UPLO, N, K, ALPHA, A, LDA, X, INCX, BETA, Y, + $ INCY ) +* +* -- LAPACK auxiliary routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* November 2006 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INCX, INCY, K, LDA, N + COMPLEX ALPHA, BETA +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* CSBMV performs the matrix-vector operation +* +* y := alpha*A*x + beta*y, +* +* where alpha and beta are scalars, x and y are n element vectors and +* A is an n by n symmetric band matrix, with k super-diagonals. +* +* Arguments +* ========== +* +* UPLO - CHARACTER*1 +* On entry, UPLO specifies whether the upper or lower +* triangular part of the band matrix A is being supplied as +* follows: +* +* UPLO = 'U' or 'u' The upper triangular part of A is +* being supplied. +* +* UPLO = 'L' or 'l' The lower triangular part of A is +* being supplied. +* +* Unchanged on exit. +* +* N - INTEGER +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* K - INTEGER +* On entry, K specifies the number of super-diagonals of the +* matrix A. K must satisfy 0 .le. K. +* Unchanged on exit. +* +* ALPHA - COMPLEX +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX array, dimension( LDA, N ) +* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) +* by n part of the array A must contain the upper triangular +* band part of the symmetric matrix, supplied column by +* column, with the leading diagonal of the matrix in row +* ( k + 1 ) of the array, the first super-diagonal starting at +* position 2 in row k, and so on. The top left k by k triangle +* of the array A is not referenced. 
+* The following program segment will transfer the upper +* triangular part of a symmetric band matrix from conventional +* full matrix storage to band storage: +* +* DO 20, J = 1, N +* M = K + 1 - J +* DO 10, I = MAX( 1, J - K ), J +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) +* by n part of the array A must contain the lower triangular +* band part of the symmetric matrix, supplied column by +* column, with the leading diagonal of the matrix in row 1 of +* the array, the first sub-diagonal starting at position 1 in +* row 2, and so on. The bottom right k by k triangle of the +* array A is not referenced. +* The following program segment will transfer the lower +* triangular part of a symmetric band matrix from conventional +* full matrix storage to band storage: +* +* DO 20, J = 1, N +* M = 1 - J +* DO 10, I = J, MIN( N, J + K ) +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Unchanged on exit. +* +* LDA - INTEGER +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* ( k + 1 ). +* Unchanged on exit. +* +* X - COMPLEX array, dimension at least +* ( 1 + ( N - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the +* vector x. +* Unchanged on exit. +* +* INCX - INTEGER +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA - COMPLEX +* On entry, BETA specifies the scalar beta. +* Unchanged on exit. +* +* Y - COMPLEX array, dimension at least +* ( 1 + ( N - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the +* vector y. On exit, Y is overwritten by the updated vector y. +* +* INCY - INTEGER +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX ONE + PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + INTEGER I, INFO, IX, IY, J, JX, JY, KPLUS1, KX, KY, L + COMPLEX TEMP1, TEMP2 +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = 1 + ELSE IF( N.LT.0 ) THEN + INFO = 2 + ELSE IF( K.LT.0 ) THEN + INFO = 3 + ELSE IF( LDA.LT.( K+1 ) ) THEN + INFO = 6 + ELSE IF( INCX.EQ.0 ) THEN + INFO = 8 + ELSE IF( INCY.EQ.0 ) THEN + INFO = 11 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CSBMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ) .OR. ( ( ALPHA.EQ.ZERO ) .AND. ( BETA.EQ.ONE ) ) ) + $ RETURN +* +* Set up the start points in X and Y. +* + IF( INCX.GT.0 ) THEN + KX = 1 + ELSE + KX = 1 - ( N-1 )*INCX + END IF + IF( INCY.GT.0 ) THEN + KY = 1 + ELSE + KY = 1 - ( N-1 )*INCY + END IF +* +* Start the operations. In this version the elements of the array A +* are accessed sequentially with one pass through A. +* +* First form y := beta*y. 
+* + IF( BETA.NE.ONE ) THEN + IF( INCY.EQ.1 ) THEN + IF( BETA.EQ.ZERO ) THEN + DO 10 I = 1, N + Y( I ) = ZERO + 10 CONTINUE + ELSE + DO 20 I = 1, N + Y( I ) = BETA*Y( I ) + 20 CONTINUE + END IF + ELSE + IY = KY + IF( BETA.EQ.ZERO ) THEN + DO 30 I = 1, N + Y( IY ) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40 I = 1, N + Y( IY ) = BETA*Y( IY ) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF( ALPHA.EQ.ZERO ) + $ RETURN + IF( LSAME( UPLO, 'U' ) ) THEN +* +* Form y when upper triangle of A is stored. +* + KPLUS1 = K + 1 + IF( ( INCX.EQ.1 ) .AND. ( INCY.EQ.1 ) ) THEN + DO 60 J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + L = KPLUS1 - J + DO 50 I = MAX( 1, J-K ), J - 1 + Y( I ) = Y( I ) + TEMP1*A( L+I, J ) + TEMP2 = TEMP2 + A( L+I, J )*X( I ) + 50 CONTINUE + Y( J ) = Y( J ) + TEMP1*A( KPLUS1, J ) + ALPHA*TEMP2 + 60 CONTINUE + ELSE + JX = KX + JY = KY + DO 80 J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + IX = KX + IY = KY + L = KPLUS1 - J + DO 70 I = MAX( 1, J-K ), J - 1 + Y( IY ) = Y( IY ) + TEMP1*A( L+I, J ) + TEMP2 = TEMP2 + A( L+I, J )*X( IX ) + IX = IX + INCX + IY = IY + INCY + 70 CONTINUE + Y( JY ) = Y( JY ) + TEMP1*A( KPLUS1, J ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + IF( J.GT.K ) THEN + KX = KX + INCX + KY = KY + INCY + END IF + 80 CONTINUE + END IF + ELSE +* +* Form y when lower triangle of A is stored. +* + IF( ( INCX.EQ.1 ) .AND. ( INCY.EQ.1 ) ) THEN + DO 100 J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + Y( J ) = Y( J ) + TEMP1*A( 1, J ) + L = 1 - J + DO 90 I = J + 1, MIN( N, J+K ) + Y( I ) = Y( I ) + TEMP1*A( L+I, J ) + TEMP2 = TEMP2 + A( L+I, J )*X( I ) + 90 CONTINUE + Y( J ) = Y( J ) + ALPHA*TEMP2 + 100 CONTINUE + ELSE + JX = KX + JY = KY + DO 120 J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + Y( JY ) = Y( JY ) + TEMP1*A( 1, J ) + L = 1 - J + IX = JX + IY = JY + DO 110 I = J + 1, MIN( N, J+K ) + IX = IX + INCX + IY = IY + INCY + Y( IY ) = Y( IY ) + TEMP1*A( L+I, J ) + TEMP2 = TEMP2 + A( L+I, J )*X( IX ) + 110 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + 120 CONTINUE + END IF + END IF +* + RETURN +* +* End of CSBMV +* + END diff --git a/reference/cscalf.f b/reference/cscalf.f new file mode 100644 index 0000000000..714dc42304 --- /dev/null +++ b/reference/cscalf.f @@ -0,0 +1,28 @@ + subroutine cscalf(n,ca,cx,incx) +c +c scales a vector by a constant. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + complex ca,cx(*) + integer i,incx,n,nincx +c + if( n.le.0 .or. incx.le.0 )return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + nincx = n*incx + do 10 i = 1,nincx,incx + cx(i) = ca*cx(i) + 10 continue + return +c +c code for increment equal to 1 +c + 20 do 30 i = 1,n + cx(i) = ca*cx(i) + 30 continue + return + end diff --git a/reference/cspmvf.f b/reference/cspmvf.f new file mode 100644 index 0000000000..7f357c6853 --- /dev/null +++ b/reference/cspmvf.f @@ -0,0 +1,264 @@ + SUBROUTINE CSPMVF(UPLO, N, ALPHA, AP, X, INCX, BETA, Y, INCY ) +* +* -- LAPACK auxiliary routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* November 2006 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INCX, INCY, N + COMPLEX ALPHA, BETA +* .. +* .. Array Arguments .. + COMPLEX AP( * ), X( * ), Y( * ) +* .. 
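+*     A minimal usage sketch, assumed for illustration and not taken
+*     from the imported sources: pack the upper triangle of a 4 by 4
+*     complex symmetric band matrix with one super-diagonal ( K = 1 )
+*     into band storage and multiply it by a vector with the CSBMVF
+*     routine given earlier in this patch.  FULL, BAND, XV and YV are
+*     hypothetical array names.
+*
+*          COMPLEX FULL( 4, 4 ), BAND( 2, 4 ), XV( 4 ), YV( 4 )
+*          INTEGER I, J, M
+*          DO 20 J = 1, 4
+*             M = 2 - J
+*             DO 10 I = MAX( 1, J - 1 ), J
+*                BAND( M + I, J ) = FULL( I, J )
+*    10      CONTINUE
+*    20   CONTINUE
+*          CALL CSBMVF( 'U', 4, 1, ( 1.0E+0, 0.0E+0 ), BAND, 2, XV, 1,
+*         $             ( 0.0E+0, 0.0E+0 ), YV, 1 )
+*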
+* +* Purpose +* ======= +* +* CSPMV performs the matrix-vector operation +* +* y := alpha*A*x + beta*y, +* +* where alpha and beta are scalars, x and y are n element vectors and +* A is an n by n symmetric matrix, supplied in packed form. +* +* Arguments +* ========== +* +* UPLO (input) CHARACTER*1 +* On entry, UPLO specifies whether the upper or lower +* triangular part of the matrix A is supplied in the packed +* array AP as follows: +* +* UPLO = 'U' or 'u' The upper triangular part of A is +* supplied in AP. +* +* UPLO = 'L' or 'l' The lower triangular part of A is +* supplied in AP. +* +* Unchanged on exit. +* +* N (input) INTEGER +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA (input) COMPLEX +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* AP (input) COMPLEX array, dimension at least +* ( ( N*( N + 1 ) )/2 ). +* Before entry, with UPLO = 'U' or 'u', the array AP must +* contain the upper triangular part of the symmetric matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) +* and a( 2, 2 ) respectively, and so on. +* Before entry, with UPLO = 'L' or 'l', the array AP must +* contain the lower triangular part of the symmetric matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) +* and a( 3, 1 ) respectively, and so on. +* Unchanged on exit. +* +* X (input) COMPLEX array, dimension at least +* ( 1 + ( N - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the N- +* element vector x. +* Unchanged on exit. +* +* INCX (input) INTEGER +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA (input) COMPLEX +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then Y need not be set on input. +* Unchanged on exit. +* +* Y (input/output) COMPLEX array, dimension at least +* ( 1 + ( N - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. On exit, Y is overwritten by the updated +* vector y. +* +* INCY (input) INTEGER +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX ONE + PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + INTEGER I, INFO, IX, IY, J, JX, JY, K, KK, KX, KY + COMPLEX TEMP1, TEMP2 +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = 1 + ELSE IF( N.LT.0 ) THEN + INFO = 2 + ELSE IF( INCX.EQ.0 ) THEN + INFO = 6 + ELSE IF( INCY.EQ.0 ) THEN + INFO = 9 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CSPMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ) .OR. ( ( ALPHA.EQ.ZERO ) .AND. ( BETA.EQ.ONE ) ) ) + $ RETURN +* +* Set up the start points in X and Y. +* + IF( INCX.GT.0 ) THEN + KX = 1 + ELSE + KX = 1 - ( N-1 )*INCX + END IF + IF( INCY.GT.0 ) THEN + KY = 1 + ELSE + KY = 1 - ( N-1 )*INCY + END IF +* +* Start the operations. 
In this version the elements of the array AP +* are accessed sequentially with one pass through AP. +* +* First form y := beta*y. +* + IF( BETA.NE.ONE ) THEN + IF( INCY.EQ.1 ) THEN + IF( BETA.EQ.ZERO ) THEN + DO 10 I = 1, N + Y( I ) = ZERO + 10 CONTINUE + ELSE + DO 20 I = 1, N + Y( I ) = BETA*Y( I ) + 20 CONTINUE + END IF + ELSE + IY = KY + IF( BETA.EQ.ZERO ) THEN + DO 30 I = 1, N + Y( IY ) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40 I = 1, N + Y( IY ) = BETA*Y( IY ) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF( ALPHA.EQ.ZERO ) + $ RETURN + KK = 1 + IF( LSAME( UPLO, 'U' ) ) THEN +* +* Form y when AP contains the upper triangle. +* + IF( ( INCX.EQ.1 ) .AND. ( INCY.EQ.1 ) ) THEN + DO 60 J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + K = KK + DO 50 I = 1, J - 1 + Y( I ) = Y( I ) + TEMP1*AP( K ) + TEMP2 = TEMP2 + AP( K )*X( I ) + K = K + 1 + 50 CONTINUE + Y( J ) = Y( J ) + TEMP1*AP( KK+J-1 ) + ALPHA*TEMP2 + KK = KK + J + 60 CONTINUE + ELSE + JX = KX + JY = KY + DO 80 J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + IX = KX + IY = KY + DO 70 K = KK, KK + J - 2 + Y( IY ) = Y( IY ) + TEMP1*AP( K ) + TEMP2 = TEMP2 + AP( K )*X( IX ) + IX = IX + INCX + IY = IY + INCY + 70 CONTINUE + Y( JY ) = Y( JY ) + TEMP1*AP( KK+J-1 ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + KK = KK + J + 80 CONTINUE + END IF + ELSE +* +* Form y when AP contains the lower triangle. +* + IF( ( INCX.EQ.1 ) .AND. ( INCY.EQ.1 ) ) THEN + DO 100 J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + Y( J ) = Y( J ) + TEMP1*AP( KK ) + K = KK + 1 + DO 90 I = J + 1, N + Y( I ) = Y( I ) + TEMP1*AP( K ) + TEMP2 = TEMP2 + AP( K )*X( I ) + K = K + 1 + 90 CONTINUE + Y( J ) = Y( J ) + ALPHA*TEMP2 + KK = KK + ( N-J+1 ) + 100 CONTINUE + ELSE + JX = KX + JY = KY + DO 120 J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + Y( JY ) = Y( JY ) + TEMP1*AP( KK ) + IX = JX + IY = JY + DO 110 K = KK + 1, KK + N - J + IX = IX + INCX + IY = IY + INCY + Y( IY ) = Y( IY ) + TEMP1*AP( K ) + TEMP2 = TEMP2 + AP( K )*X( IX ) + 110 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + KK = KK + ( N-J+1 ) + 120 CONTINUE + END IF + END IF +* + RETURN +* +* End of CSPMV +* + END diff --git a/reference/cspr2f.f b/reference/cspr2f.f new file mode 100644 index 0000000000..8ba35f5f2a --- /dev/null +++ b/reference/cspr2f.f @@ -0,0 +1,229 @@ + SUBROUTINE CSPR2F( UPLO, N, ALPHA, X, INCX, Y, INCY, AP ) +* .. Scalar Arguments .. + COMPLEX*8 ALPHA + INTEGER INCX, INCY, N + CHARACTER*1 UPLO +* .. Array Arguments .. + COMPLEX*8 AP( * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* SSPR2 performs the symmetric rank 2 operation +* +* A := alpha*x*y' + alpha*y*x' + A, +* +* where alpha is a scalar, x and y are n element vectors and A is an +* n by n symmetric matrix, supplied in packed form. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the matrix A is supplied in the packed +* array AP as follows: +* +* UPLO = 'U' or 'u' The upper triangular part of A is +* supplied in AP. +* +* UPLO = 'L' or 'l' The lower triangular part of A is +* supplied in AP. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX*8 . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X - COMPLEX*8 array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). 
+* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* Y - COMPLEX*8 array of dimension at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. +* Unchanged on exit. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* AP - COMPLEX*8 array of DIMENSION at least +* ( ( n*( n + 1 ) )/2 ). +* Before entry with UPLO = 'U' or 'u', the array AP must +* contain the upper triangular part of the symmetric matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) +* and a( 2, 2 ) respectively, and so on. On exit, the array +* AP is overwritten by the upper triangular part of the +* updated matrix. +* Before entry with UPLO = 'L' or 'l', the array AP must +* contain the lower triangular part of the symmetric matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) +* and a( 3, 1 ) respectively, and so on. On exit, the array +* AP is overwritten by the lower triangular part of the +* updated matrix. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX*8 ZERO + PARAMETER ( ZERO = 0.0E+0 ) +* .. Local Scalars .. + COMPLEX*8 TEMP1, TEMP2 + INTEGER I, INFO, IX, IY, J, JX, JY, K, KK, KX, KY +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 5 + ELSE IF( INCY.EQ.0 )THEN + INFO = 7 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'SSPR2 ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) + $ RETURN +* +* Set up the start points in X and Y if the increments are not both +* unity. +* + IF( ( INCX.NE.1 ).OR.( INCY.NE.1 ) )THEN + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( N - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( N - 1 )*INCY + END IF + JX = KX + JY = KY + END IF +* +* Start the operations. In this version the elements of the array AP +* are accessed sequentially with one pass through AP. +* + KK = 1 + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form A when upper triangle is stored in AP. 
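+*     An illustrative segment, assumed for exposition rather than taken
+*     from the imported sources: packing the upper triangle of a full
+*     symmetric matrix column by column into AP, which is the layout
+*     the packed routines above expect.  FULL is a hypothetical array;
+*     element ( I, J ) with I .LE. J lands in AP( I + ( J - 1 )*J/2 ).
+*
+*          K = 0
+*          DO 20 J = 1, N
+*             DO 10 I = 1, J
+*                K = K + 1
+*                AP( K ) = FULL( I, J )
+*    10      CONTINUE
+*    20   CONTINUE
+*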
+* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 20, J = 1, N + IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( J ) + TEMP2 = ALPHA*X( J ) + K = KK + DO 10, I = 1, J + AP( K ) = AP( K ) + X( I )*TEMP1 + Y( I )*TEMP2 + K = K + 1 + 10 CONTINUE + END IF + KK = KK + J + 20 CONTINUE + ELSE + DO 40, J = 1, N + IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( JY ) + TEMP2 = ALPHA*X( JX ) + IX = KX + IY = KY + DO 30, K = KK, KK + J - 1 + AP( K ) = AP( K ) + X( IX )*TEMP1 + Y( IY )*TEMP2 + IX = IX + INCX + IY = IY + INCY + 30 CONTINUE + END IF + JX = JX + INCX + JY = JY + INCY + KK = KK + J + 40 CONTINUE + END IF + ELSE +* +* Form A when lower triangle is stored in AP. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 60, J = 1, N + IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( J ) + TEMP2 = ALPHA*X( J ) + K = KK + DO 50, I = J, N + AP( K ) = AP( K ) + X( I )*TEMP1 + Y( I )*TEMP2 + K = K + 1 + 50 CONTINUE + END IF + KK = KK + N - J + 1 + 60 CONTINUE + ELSE + DO 80, J = 1, N + IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( JY ) + TEMP2 = ALPHA*X( JX ) + IX = JX + IY = JY + DO 70, K = KK, KK + N - J + AP( K ) = AP( K ) + X( IX )*TEMP1 + Y( IY )*TEMP2 + IX = IX + INCX + IY = IY + INCY + 70 CONTINUE + END IF + JX = JX + INCX + JY = JY + INCY + KK = KK + N - J + 1 + 80 CONTINUE + END IF + END IF +* + RETURN +* +* End of SSPR2 . +* + END diff --git a/reference/csprf.f b/reference/csprf.f new file mode 100644 index 0000000000..9010f0c2eb --- /dev/null +++ b/reference/csprf.f @@ -0,0 +1,213 @@ + SUBROUTINE CSPRF( UPLO, N, ALPHA, X, INCX, AP ) +* +* -- LAPACK auxiliary routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* November 2006 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INCX, N + COMPLEX ALPHA +* .. +* .. Array Arguments .. + COMPLEX AP( * ), X( * ) +* .. +* +* Purpose +* ======= +* +* CSPR performs the symmetric rank 1 operation +* +* A := alpha*x*conjg( x' ) + A, +* +* where alpha is a complex scalar, x is an n element vector and A is an +* n by n symmetric matrix, supplied in packed form. +* +* Arguments +* ========== +* +* UPLO (input) CHARACTER*1 +* On entry, UPLO specifies whether the upper or lower +* triangular part of the matrix A is supplied in the packed +* array AP as follows: +* +* UPLO = 'U' or 'u' The upper triangular part of A is +* supplied in AP. +* +* UPLO = 'L' or 'l' The lower triangular part of A is +* supplied in AP. +* +* Unchanged on exit. +* +* N (input) INTEGER +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA (input) COMPLEX +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X (input) COMPLEX array, dimension at least +* ( 1 + ( N - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the N- +* element vector x. +* Unchanged on exit. +* +* INCX (input) INTEGER +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* AP (input/output) COMPLEX array, dimension at least +* ( ( N*( N + 1 ) )/2 ). +* Before entry, with UPLO = 'U' or 'u', the array AP must +* contain the upper triangular part of the symmetric matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) +* and a( 2, 2 ) respectively, and so on. 
On exit, the array +* AP is overwritten by the upper triangular part of the +* updated matrix. +* Before entry, with UPLO = 'L' or 'l', the array AP must +* contain the lower triangular part of the symmetric matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) +* and a( 3, 1 ) respectively, and so on. On exit, the array +* AP is overwritten by the lower triangular part of the +* updated matrix. +* Note that the imaginary parts of the diagonal elements need +* not be set, they are assumed to be zero, and on exit they +* are set to zero. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + INTEGER I, INFO, IX, J, JX, K, KK, KX + COMPLEX TEMP +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = 1 + ELSE IF( N.LT.0 ) THEN + INFO = 2 + ELSE IF( INCX.EQ.0 ) THEN + INFO = 5 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CSPR ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ) .OR. ( ALPHA.EQ.ZERO ) ) + $ RETURN +* +* Set the start point in X if the increment is not unity. +* + IF( INCX.LE.0 ) THEN + KX = 1 - ( N-1 )*INCX + ELSE IF( INCX.NE.1 ) THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of the array AP +* are accessed sequentially with one pass through AP. +* + KK = 1 + IF( LSAME( UPLO, 'U' ) ) THEN +* +* Form A when upper triangle is stored in AP. +* + IF( INCX.EQ.1 ) THEN + DO 20 J = 1, N + IF( X( J ).NE.ZERO ) THEN + TEMP = ALPHA*X( J ) + K = KK + DO 10 I = 1, J - 1 + AP( K ) = AP( K ) + X( I )*TEMP + K = K + 1 + 10 CONTINUE + AP( KK+J-1 ) = AP( KK+J-1 ) + X( J )*TEMP + ELSE + AP( KK+J-1 ) = AP( KK+J-1 ) + END IF + KK = KK + J + 20 CONTINUE + ELSE + JX = KX + DO 40 J = 1, N + IF( X( JX ).NE.ZERO ) THEN + TEMP = ALPHA*X( JX ) + IX = KX + DO 30 K = KK, KK + J - 2 + AP( K ) = AP( K ) + X( IX )*TEMP + IX = IX + INCX + 30 CONTINUE + AP( KK+J-1 ) = AP( KK+J-1 ) + X( JX )*TEMP + ELSE + AP( KK+J-1 ) = AP( KK+J-1 ) + END IF + JX = JX + INCX + KK = KK + J + 40 CONTINUE + END IF + ELSE +* +* Form A when lower triangle is stored in AP. +* + IF( INCX.EQ.1 ) THEN + DO 60 J = 1, N + IF( X( J ).NE.ZERO ) THEN + TEMP = ALPHA*X( J ) + AP( KK ) = AP( KK ) + TEMP*X( J ) + K = KK + 1 + DO 50 I = J + 1, N + AP( K ) = AP( K ) + X( I )*TEMP + K = K + 1 + 50 CONTINUE + ELSE + AP( KK ) = AP( KK ) + END IF + KK = KK + N - J + 1 + 60 CONTINUE + ELSE + JX = KX + DO 80 J = 1, N + IF( X( JX ).NE.ZERO ) THEN + TEMP = ALPHA*X( JX ) + AP( KK ) = AP( KK ) + TEMP*X( JX ) + IX = JX + DO 70 K = KK + 1, KK + N - J + IX = IX + INCX + AP( K ) = AP( K ) + X( IX )*TEMP + 70 CONTINUE + ELSE + AP( KK ) = AP( KK ) + END IF + JX = JX + INCX + KK = KK + N - J + 1 + 80 CONTINUE + END IF + END IF +* + RETURN +* +* End of CSPR +* + END diff --git a/reference/csrotf.f b/reference/csrotf.f new file mode 100644 index 0000000000..1ecdb0a5f8 --- /dev/null +++ b/reference/csrotf.f @@ -0,0 +1,38 @@ + subroutine csrotf (n,cx,incx,cy,incy,c,s) +c +c applies a plane rotation, where the cos and sin (c and s) are real +c and the vectors cx and cy are complex. +c jack dongarra, linpack, 3/11/78. 
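+c     a minimal usage sketch, assumed for illustration and not part of
+c     the imported sources: apply a real givens rotation of angle
+c     theta to the complex vector pair (cx, cy).  theta, cx and cy are
+c     hypothetical names.
+c
+c         complex cx(8),cy(8)
+c         real theta,c,s
+c         c = cos(theta)
+c         s = sin(theta)
+c         call csrotf(8,cx,1,cy,1,c,s)
+c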
+c + complex cx(1),cy(1),ctemp + real c,s + integer i,incx,incy,ix,iy,n +c + if(n.le.0)return + if(incx.eq.1.and.incy.eq.1)go to 20 +c +c code for unequal increments or equal increments not equal +c to 1 +c + ix = 1 + iy = 1 + if(incx.lt.0)ix = (-n+1)*incx + 1 + if(incy.lt.0)iy = (-n+1)*incy + 1 + do 10 i = 1,n + ctemp = c*cx(ix) + s*cy(iy) + cy(iy) = c*cy(iy) - s*cx(ix) + cx(ix) = ctemp + ix = ix + incx + iy = iy + incy + 10 continue + return +c +c code for both increments equal to 1 +c + 20 do 30 i = 1,n + ctemp = c*cx(i) + s*cy(i) + cy(i) = c*cy(i) - s*cx(i) + cx(i) = ctemp + 30 continue + return + end diff --git a/reference/csscalf.f b/reference/csscalf.f new file mode 100644 index 0000000000..099d519b68 --- /dev/null +++ b/reference/csscalf.f @@ -0,0 +1,29 @@ + subroutine csscalf(n,sa,cx,incx) +c +c scales a complex vector by a real constant. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + complex cx(*) + real sa + integer i,incx,n,nincx +c + if( n.le.0 .or. incx.le.0 )return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + nincx = n*incx + do 10 i = 1,nincx,incx + cx(i) = cmplx(sa*real(cx(i)),sa*aimag(cx(i))) + 10 continue + return +c +c code for increment equal to 1 +c + 20 do 30 i = 1,n + cx(i) = cmplx(sa*real(cx(i)),sa*aimag(cx(i))) + 30 continue + return + end diff --git a/reference/cswapf.f b/reference/cswapf.f new file mode 100644 index 0000000000..39683b68ce --- /dev/null +++ b/reference/cswapf.f @@ -0,0 +1,36 @@ + subroutine cswapf (n,cx,incx,cy,incy) +c +c interchanges two vectors. +c jack dongarra, linpack, 3/11/78. +c modified 12/3/93, array(1) declarations changed to array(*) +c + complex cx(*),cy(*),ctemp + integer i,incx,incy,ix,iy,n +c + if(n.le.0)return + if(incx.eq.1.and.incy.eq.1)go to 20 +c +c code for unequal increments or equal increments not equal +c to 1 +c + ix = 1 + iy = 1 + if(incx.lt.0)ix = (-n+1)*incx + 1 + if(incy.lt.0)iy = (-n+1)*incy + 1 + do 10 i = 1,n + ctemp = cx(ix) + cx(ix) = cy(iy) + cy(iy) = ctemp + ix = ix + incx + iy = iy + incy + 10 continue + return +c +c code for both increments equal to 1 + 20 do 30 i = 1,n + ctemp = cx(i) + cx(i) = cy(i) + cy(i) = ctemp + 30 continue + return + end diff --git a/reference/csymm3mf.f b/reference/csymm3mf.f new file mode 100644 index 0000000000..2640a18b71 --- /dev/null +++ b/reference/csymm3mf.f @@ -0,0 +1,296 @@ + SUBROUTINE CSYMM3MF( SIDE, UPLO, M, N, ALPHA, A, LDA, B, LDB, + $ BETA, C, LDC ) +* .. Scalar Arguments .. + CHARACTER*1 SIDE, UPLO + INTEGER M, N, LDA, LDB, LDC + COMPLEX ALPHA, BETA +* .. Array Arguments .. + COMPLEX A( LDA, * ), B( LDB, * ), C( LDC, * ) +* .. +* +* Purpose +* ======= +* +* CSYMM performs one of the matrix-matrix operations +* +* C := alpha*A*B + beta*C, +* +* or +* +* C := alpha*B*A + beta*C, +* +* where alpha and beta are scalars, A is a symmetric matrix and B and +* C are m by n matrices. +* +* Parameters +* ========== +* +* SIDE - CHARACTER*1. +* On entry, SIDE specifies whether the symmetric matrix A +* appears on the left or right in the operation as follows: +* +* SIDE = 'L' or 'l' C := alpha*A*B + beta*C, +* +* SIDE = 'R' or 'r' C := alpha*B*A + beta*C, +* +* Unchanged on exit. +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the symmetric matrix A is to be +* referenced as follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of the +* symmetric matrix is to be referenced. 
+* +* UPLO = 'L' or 'l' Only the lower triangular part of the +* symmetric matrix is to be referenced. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix C. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix C. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX array of DIMENSION ( LDA, ka ), where ka is +* m when SIDE = 'L' or 'l' and is n otherwise. +* Before entry with SIDE = 'L' or 'l', the m by m part of +* the array A must contain the symmetric matrix, such that +* when UPLO = 'U' or 'u', the leading m by m upper triangular +* part of the array A must contain the upper triangular part +* of the symmetric matrix and the strictly lower triangular +* part of A is not referenced, and when UPLO = 'L' or 'l', +* the leading m by m lower triangular part of the array A +* must contain the lower triangular part of the symmetric +* matrix and the strictly upper triangular part of A is not +* referenced. +* Before entry with SIDE = 'R' or 'r', the n by n part of +* the array A must contain the symmetric matrix, such that +* when UPLO = 'U' or 'u', the leading n by n upper triangular +* part of the array A must contain the upper triangular part +* of the symmetric matrix and the strictly lower triangular +* part of A is not referenced, and when UPLO = 'L' or 'l', +* the leading n by n lower triangular part of the array A +* must contain the lower triangular part of the symmetric +* matrix and the strictly upper triangular part of A is not +* referenced. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When SIDE = 'L' or 'l' then +* LDA must be at least max( 1, m ), otherwise LDA must be at +* least max( 1, n ). +* Unchanged on exit. +* +* B - COMPLEX array of DIMENSION ( LDB, n ). +* Before entry, the leading m by n part of the array B must +* contain the matrix B. +* Unchanged on exit. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. LDB must be at least +* max( 1, m ). +* Unchanged on exit. +* +* BETA - COMPLEX . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then C need not be set on input. +* Unchanged on exit. +* +* C - COMPLEX array of DIMENSION ( LDC, n ). +* Before entry, the leading m by n part of the array C must +* contain the matrix C, except when beta is zero, in which +* case C need not be set on entry. +* On exit, the array C is overwritten by the m by n updated +* matrix. +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, INFO, J, K, NROWA + COMPLEX TEMP1, TEMP2 +* .. Parameters .. 
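+*     A minimal call sketch, assumed for illustration and not part of
+*     the imported sources: with SIDE = 'L' the symmetric matrix A is
+*     m by m, so LDA must be at least M, while B and C are m by n with
+*     LDB and LDC at least M.  AS, BS and CS are hypothetical arrays.
+*
+*          COMPLEX AS( 8, 8 ), BS( 8, 4 ), CS( 8, 4 )
+*          CALL CSYMM3MF( 'L', 'U', 8, 4, ( 1.0E+0, 0.0E+0 ), AS, 8,
+*         $               BS, 8, ( 0.0E+0, 0.0E+0 ), CS, 8 )
+*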
+ COMPLEX ONE + PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Executable Statements .. +* +* Set NROWA as the number of rows of A. +* + IF( LSAME( SIDE, 'L' ) )THEN + NROWA = M + ELSE + NROWA = N + END IF + UPPER = LSAME( UPLO, 'U' ) +* +* Test the input parameters. +* + INFO = 0 + IF( ( .NOT.LSAME( SIDE, 'L' ) ).AND. + $ ( .NOT.LSAME( SIDE, 'R' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO, 'L' ) ) )THEN + INFO = 2 + ELSE IF( M .LT.0 )THEN + INFO = 3 + ELSE IF( N .LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 7 + ELSE IF( LDB.LT.MAX( 1, M ) )THEN + INFO = 9 + ELSE IF( LDC.LT.MAX( 1, M ) )THEN + INFO = 12 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'CSYMM ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. + $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* And when alpha.eq.zero. +* + IF( ALPHA.EQ.ZERO )THEN + IF( BETA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, M + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40, J = 1, N + DO 30, I = 1, M + C( I, J ) = BETA*C( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + RETURN + END IF +* +* Start the operations. +* + IF( LSAME( SIDE, 'L' ) )THEN +* +* Form C := alpha*A*B + beta*C. +* + IF( UPPER )THEN + DO 70, J = 1, N + DO 60, I = 1, M + TEMP1 = ALPHA*B( I, J ) + TEMP2 = ZERO + DO 50, K = 1, I - 1 + C( K, J ) = C( K, J ) + TEMP1 *A( K, I ) + TEMP2 = TEMP2 + B( K, J )*A( K, I ) + 50 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = TEMP1*A( I, I ) + ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ TEMP1*A( I, I ) + ALPHA*TEMP2 + END IF + 60 CONTINUE + 70 CONTINUE + ELSE + DO 100, J = 1, N + DO 90, I = M, 1, -1 + TEMP1 = ALPHA*B( I, J ) + TEMP2 = ZERO + DO 80, K = I + 1, M + C( K, J ) = C( K, J ) + TEMP1 *A( K, I ) + TEMP2 = TEMP2 + B( K, J )*A( K, I ) + 80 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = TEMP1*A( I, I ) + ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ TEMP1*A( I, I ) + ALPHA*TEMP2 + END IF + 90 CONTINUE + 100 CONTINUE + END IF + ELSE +* +* Form C := alpha*B*A + beta*C. +* + DO 170, J = 1, N + TEMP1 = ALPHA*A( J, J ) + IF( BETA.EQ.ZERO )THEN + DO 110, I = 1, M + C( I, J ) = TEMP1*B( I, J ) + 110 CONTINUE + ELSE + DO 120, I = 1, M + C( I, J ) = BETA*C( I, J ) + TEMP1*B( I, J ) + 120 CONTINUE + END IF + DO 140, K = 1, J - 1 + IF( UPPER )THEN + TEMP1 = ALPHA*A( K, J ) + ELSE + TEMP1 = ALPHA*A( J, K ) + END IF + DO 130, I = 1, M + C( I, J ) = C( I, J ) + TEMP1*B( I, K ) + 130 CONTINUE + 140 CONTINUE + DO 160, K = J + 1, N + IF( UPPER )THEN + TEMP1 = ALPHA*A( J, K ) + ELSE + TEMP1 = ALPHA*A( K, J ) + END IF + DO 150, I = 1, M + C( I, J ) = C( I, J ) + TEMP1*B( I, K ) + 150 CONTINUE + 160 CONTINUE + 170 CONTINUE + END IF +* + RETURN +* +* End of CSYMM . +* + END diff --git a/reference/csymmf.f b/reference/csymmf.f new file mode 100644 index 0000000000..d5480e4736 --- /dev/null +++ b/reference/csymmf.f @@ -0,0 +1,296 @@ + SUBROUTINE CSYMMF ( SIDE, UPLO, M, N, ALPHA, A, LDA, B, LDB, + $ BETA, C, LDC ) +* .. Scalar Arguments .. + CHARACTER*1 SIDE, UPLO + INTEGER M, N, LDA, LDB, LDC + COMPLEX ALPHA, BETA +* .. Array Arguments .. + COMPLEX A( LDA, * ), B( LDB, * ), C( LDC, * ) +* .. +* +* Purpose +* ======= +* +* CSYMM performs one of the matrix-matrix operations +* +* C := alpha*A*B + beta*C, +* +* or +* +* C := alpha*B*A + beta*C, +* +* where alpha and beta are scalars, A is a symmetric matrix and B and +* C are m by n matrices. 
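+*     A minimal call sketch, assumed for illustration and not part of
+*     the imported sources: with SIDE = 'R' the symmetric matrix A is
+*     n by n and multiplies B from the right, so LDA must be at least
+*     N.  AS, BS and CS are hypothetical arrays.
+*
+*          COMPLEX AS( 4, 4 ), BS( 8, 4 ), CS( 8, 4 )
+*          CALL CSYMMF( 'R', 'L', 8, 4, ( 1.0E+0, 0.0E+0 ), AS, 4,
+*         $             BS, 8, ( 1.0E+0, 0.0E+0 ), CS, 8 )
+*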
+* +* Parameters +* ========== +* +* SIDE - CHARACTER*1. +* On entry, SIDE specifies whether the symmetric matrix A +* appears on the left or right in the operation as follows: +* +* SIDE = 'L' or 'l' C := alpha*A*B + beta*C, +* +* SIDE = 'R' or 'r' C := alpha*B*A + beta*C, +* +* Unchanged on exit. +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the symmetric matrix A is to be +* referenced as follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of the +* symmetric matrix is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of the +* symmetric matrix is to be referenced. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix C. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix C. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX array of DIMENSION ( LDA, ka ), where ka is +* m when SIDE = 'L' or 'l' and is n otherwise. +* Before entry with SIDE = 'L' or 'l', the m by m part of +* the array A must contain the symmetric matrix, such that +* when UPLO = 'U' or 'u', the leading m by m upper triangular +* part of the array A must contain the upper triangular part +* of the symmetric matrix and the strictly lower triangular +* part of A is not referenced, and when UPLO = 'L' or 'l', +* the leading m by m lower triangular part of the array A +* must contain the lower triangular part of the symmetric +* matrix and the strictly upper triangular part of A is not +* referenced. +* Before entry with SIDE = 'R' or 'r', the n by n part of +* the array A must contain the symmetric matrix, such that +* when UPLO = 'U' or 'u', the leading n by n upper triangular +* part of the array A must contain the upper triangular part +* of the symmetric matrix and the strictly lower triangular +* part of A is not referenced, and when UPLO = 'L' or 'l', +* the leading n by n lower triangular part of the array A +* must contain the lower triangular part of the symmetric +* matrix and the strictly upper triangular part of A is not +* referenced. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When SIDE = 'L' or 'l' then +* LDA must be at least max( 1, m ), otherwise LDA must be at +* least max( 1, n ). +* Unchanged on exit. +* +* B - COMPLEX array of DIMENSION ( LDB, n ). +* Before entry, the leading m by n part of the array B must +* contain the matrix B. +* Unchanged on exit. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. LDB must be at least +* max( 1, m ). +* Unchanged on exit. +* +* BETA - COMPLEX . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then C need not be set on input. +* Unchanged on exit. +* +* C - COMPLEX array of DIMENSION ( LDC, n ). +* Before entry, the leading m by n part of the array C must +* contain the matrix C, except when beta is zero, in which +* case C need not be set on entry. +* On exit, the array C is overwritten by the m by n updated +* matrix. +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. 
+* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, INFO, J, K, NROWA + COMPLEX TEMP1, TEMP2 +* .. Parameters .. + COMPLEX ONE + PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Executable Statements .. +* +* Set NROWA as the number of rows of A. +* + IF( LSAME( SIDE, 'L' ) )THEN + NROWA = M + ELSE + NROWA = N + END IF + UPPER = LSAME( UPLO, 'U' ) +* +* Test the input parameters. +* + INFO = 0 + IF( ( .NOT.LSAME( SIDE, 'L' ) ).AND. + $ ( .NOT.LSAME( SIDE, 'R' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO, 'L' ) ) )THEN + INFO = 2 + ELSE IF( M .LT.0 )THEN + INFO = 3 + ELSE IF( N .LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 7 + ELSE IF( LDB.LT.MAX( 1, M ) )THEN + INFO = 9 + ELSE IF( LDC.LT.MAX( 1, M ) )THEN + INFO = 12 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'CSYMM ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. + $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* And when alpha.eq.zero. +* + IF( ALPHA.EQ.ZERO )THEN + IF( BETA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, M + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40, J = 1, N + DO 30, I = 1, M + C( I, J ) = BETA*C( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + RETURN + END IF +* +* Start the operations. +* + IF( LSAME( SIDE, 'L' ) )THEN +* +* Form C := alpha*A*B + beta*C. +* + IF( UPPER )THEN + DO 70, J = 1, N + DO 60, I = 1, M + TEMP1 = ALPHA*B( I, J ) + TEMP2 = ZERO + DO 50, K = 1, I - 1 + C( K, J ) = C( K, J ) + TEMP1 *A( K, I ) + TEMP2 = TEMP2 + B( K, J )*A( K, I ) + 50 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = TEMP1*A( I, I ) + ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ TEMP1*A( I, I ) + ALPHA*TEMP2 + END IF + 60 CONTINUE + 70 CONTINUE + ELSE + DO 100, J = 1, N + DO 90, I = M, 1, -1 + TEMP1 = ALPHA*B( I, J ) + TEMP2 = ZERO + DO 80, K = I + 1, M + C( K, J ) = C( K, J ) + TEMP1 *A( K, I ) + TEMP2 = TEMP2 + B( K, J )*A( K, I ) + 80 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = TEMP1*A( I, I ) + ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ TEMP1*A( I, I ) + ALPHA*TEMP2 + END IF + 90 CONTINUE + 100 CONTINUE + END IF + ELSE +* +* Form C := alpha*B*A + beta*C. +* + DO 170, J = 1, N + TEMP1 = ALPHA*A( J, J ) + IF( BETA.EQ.ZERO )THEN + DO 110, I = 1, M + C( I, J ) = TEMP1*B( I, J ) + 110 CONTINUE + ELSE + DO 120, I = 1, M + C( I, J ) = BETA*C( I, J ) + TEMP1*B( I, J ) + 120 CONTINUE + END IF + DO 140, K = 1, J - 1 + IF( UPPER )THEN + TEMP1 = ALPHA*A( K, J ) + ELSE + TEMP1 = ALPHA*A( J, K ) + END IF + DO 130, I = 1, M + C( I, J ) = C( I, J ) + TEMP1*B( I, K ) + 130 CONTINUE + 140 CONTINUE + DO 160, K = J + 1, N + IF( UPPER )THEN + TEMP1 = ALPHA*A( J, K ) + ELSE + TEMP1 = ALPHA*A( K, J ) + END IF + DO 150, I = 1, M + C( I, J ) = C( I, J ) + TEMP1*B( I, K ) + 150 CONTINUE + 160 CONTINUE + 170 CONTINUE + END IF +* + RETURN +* +* End of CSYMM . 
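+*     An illustrative segment, assumed for exposition rather than taken
+*     from the imported sources: the loops above reference only one
+*     triangle of A, so expanding a stored upper triangle into a full
+*     symmetric matrix makes the implied operand explicit.  FULL is a
+*     hypothetical work array.
+*
+*          DO 20 J = 1, N
+*             DO 10 I = 1, J
+*                FULL( I, J ) = A( I, J )
+*                FULL( J, I ) = A( I, J )
+*    10      CONTINUE
+*    20   CONTINUE
+*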
+* + END diff --git a/reference/csymvf.f b/reference/csymvf.f new file mode 100644 index 0000000000..09d247a13e --- /dev/null +++ b/reference/csymvf.f @@ -0,0 +1,264 @@ + SUBROUTINE CSYMVF(UPLO, N, ALPHA, A, LDA, X, INCX, BETA, Y, INCY ) +* +* -- LAPACK auxiliary routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* November 2006 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INCX, INCY, LDA, N + COMPLEX ALPHA, BETA +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* CSYMV performs the matrix-vector operation +* +* y := alpha*A*x + beta*y, +* +* where alpha and beta are scalars, x and y are n element vectors and +* A is an n by n symmetric matrix. +* +* Arguments +* ========== +* +* UPLO (input) CHARACTER*1 +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array A is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of A +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of A +* is to be referenced. +* +* Unchanged on exit. +* +* N (input) INTEGER +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA (input) COMPLEX +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A (input) COMPLEX array, dimension ( LDA, N ) +* Before entry, with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array A must contain the upper +* triangular part of the symmetric matrix and the strictly +* lower triangular part of A is not referenced. +* Before entry, with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array A must contain the lower +* triangular part of the symmetric matrix and the strictly +* upper triangular part of A is not referenced. +* Unchanged on exit. +* +* LDA (input) INTEGER +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, N ). +* Unchanged on exit. +* +* X (input) COMPLEX array, dimension at least +* ( 1 + ( N - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the N- +* element vector x. +* Unchanged on exit. +* +* INCX (input) INTEGER +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA (input) COMPLEX +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then Y need not be set on input. +* Unchanged on exit. +* +* Y (input/output) COMPLEX array, dimension at least +* ( 1 + ( N - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. On exit, Y is overwritten by the updated +* vector y. +* +* INCY (input) INTEGER +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX ONE + PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY + COMPLEX TEMP1, TEMP2 +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( .NOT.LSAME( UPLO, 'U' ) .AND. 
.NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = 1 + ELSE IF( N.LT.0 ) THEN + INFO = 2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = 5 + ELSE IF( INCX.EQ.0 ) THEN + INFO = 7 + ELSE IF( INCY.EQ.0 ) THEN + INFO = 10 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CSYMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ) .OR. ( ( ALPHA.EQ.ZERO ) .AND. ( BETA.EQ.ONE ) ) ) + $ RETURN +* +* Set up the start points in X and Y. +* + IF( INCX.GT.0 ) THEN + KX = 1 + ELSE + KX = 1 - ( N-1 )*INCX + END IF + IF( INCY.GT.0 ) THEN + KY = 1 + ELSE + KY = 1 - ( N-1 )*INCY + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through the triangular part +* of A. +* +* First form y := beta*y. +* + IF( BETA.NE.ONE ) THEN + IF( INCY.EQ.1 ) THEN + IF( BETA.EQ.ZERO ) THEN + DO 10 I = 1, N + Y( I ) = ZERO + 10 CONTINUE + ELSE + DO 20 I = 1, N + Y( I ) = BETA*Y( I ) + 20 CONTINUE + END IF + ELSE + IY = KY + IF( BETA.EQ.ZERO ) THEN + DO 30 I = 1, N + Y( IY ) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40 I = 1, N + Y( IY ) = BETA*Y( IY ) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF( ALPHA.EQ.ZERO ) + $ RETURN + IF( LSAME( UPLO, 'U' ) ) THEN +* +* Form y when A is stored in upper triangle. +* + IF( ( INCX.EQ.1 ) .AND. ( INCY.EQ.1 ) ) THEN + DO 60 J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + DO 50 I = 1, J - 1 + Y( I ) = Y( I ) + TEMP1*A( I, J ) + TEMP2 = TEMP2 + A( I, J )*X( I ) + 50 CONTINUE + Y( J ) = Y( J ) + TEMP1*A( J, J ) + ALPHA*TEMP2 + 60 CONTINUE + ELSE + JX = KX + JY = KY + DO 80 J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + IX = KX + IY = KY + DO 70 I = 1, J - 1 + Y( IY ) = Y( IY ) + TEMP1*A( I, J ) + TEMP2 = TEMP2 + A( I, J )*X( IX ) + IX = IX + INCX + IY = IY + INCY + 70 CONTINUE + Y( JY ) = Y( JY ) + TEMP1*A( J, J ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + 80 CONTINUE + END IF + ELSE +* +* Form y when A is stored in lower triangle. +* + IF( ( INCX.EQ.1 ) .AND. ( INCY.EQ.1 ) ) THEN + DO 100 J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + Y( J ) = Y( J ) + TEMP1*A( J, J ) + DO 90 I = J + 1, N + Y( I ) = Y( I ) + TEMP1*A( I, J ) + TEMP2 = TEMP2 + A( I, J )*X( I ) + 90 CONTINUE + Y( J ) = Y( J ) + ALPHA*TEMP2 + 100 CONTINUE + ELSE + JX = KX + JY = KY + DO 120 J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + Y( JY ) = Y( JY ) + TEMP1*A( J, J ) + IX = JX + IY = JY + DO 110 I = J + 1, N + IX = IX + INCX + IY = IY + INCY + Y( IY ) = Y( IY ) + TEMP1*A( I, J ) + TEMP2 = TEMP2 + A( I, J )*X( IX ) + 110 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + 120 CONTINUE + END IF + END IF +* + RETURN +* +* End of CSYMV +* + END diff --git a/reference/csyr2f.f b/reference/csyr2f.f new file mode 100644 index 0000000000..1fde4c0b14 --- /dev/null +++ b/reference/csyr2f.f @@ -0,0 +1,230 @@ + SUBROUTINE CSYR2F ( UPLO, N, ALPHA, X, INCX, Y, INCY, A, LDA ) +* .. Scalar Arguments .. + COMPLEX*8 ALPHA + INTEGER INCX, INCY, LDA, N + CHARACTER*1 UPLO +* .. Array Arguments .. + COMPLEX*8 A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* SSYR2 performs the symmetric rank 2 operation +* +* A := alpha*x*y' + alpha*y*x' + A, +* +* where alpha is a scalar, x and y are n element vectors and A is an n +* by n symmetric matrix. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. 
+* On entry, UPLO specifies whether the upper or lower +* triangular part of the array A is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of A +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of A +* is to be referenced. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - REAL . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X - REAL array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* Y - REAL array of dimension at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. +* Unchanged on exit. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* A - REAL array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array A must contain the upper +* triangular part of the symmetric matrix and the strictly +* lower triangular part of A is not referenced. On exit, the +* upper triangular part of the array A is overwritten by the +* upper triangular part of the updated matrix. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array A must contain the lower +* triangular part of the symmetric matrix and the strictly +* upper triangular part of A is not referenced. On exit, the +* lower triangular part of the array A is overwritten by the +* lower triangular part of the updated matrix. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, n ). +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX*8 ZERO + PARAMETER ( ZERO = 0.0E+0 ) +* .. Local Scalars .. + COMPLEX*8 TEMP1, TEMP2 + INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 5 + ELSE IF( INCY.EQ.0 )THEN + INFO = 7 + ELSE IF( LDA.LT.MAX( 1, N ) )THEN + INFO = 9 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'SSYR2 ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) + $ RETURN +* +* Set up the start points in X and Y if the increments are not both +* unity. +* + IF( ( INCX.NE.1 ).OR.( INCY.NE.1 ) )THEN + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( N - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( N - 1 )*INCY + END IF + JX = KX + JY = KY + END IF +* +* Start the operations. 
In this version the elements of A are +* accessed sequentially with one pass through the triangular part +* of A. +* + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form A when A is stored in the upper triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 20, J = 1, N + IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( J ) + TEMP2 = ALPHA*X( J ) + DO 10, I = 1, J + A( I, J ) = A( I, J ) + X( I )*TEMP1 + Y( I )*TEMP2 + 10 CONTINUE + END IF + 20 CONTINUE + ELSE + DO 40, J = 1, N + IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( JY ) + TEMP2 = ALPHA*X( JX ) + IX = KX + IY = KY + DO 30, I = 1, J + A( I, J ) = A( I, J ) + X( IX )*TEMP1 + $ + Y( IY )*TEMP2 + IX = IX + INCX + IY = IY + INCY + 30 CONTINUE + END IF + JX = JX + INCX + JY = JY + INCY + 40 CONTINUE + END IF + ELSE +* +* Form A when A is stored in the lower triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 60, J = 1, N + IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( J ) + TEMP2 = ALPHA*X( J ) + DO 50, I = J, N + A( I, J ) = A( I, J ) + X( I )*TEMP1 + Y( I )*TEMP2 + 50 CONTINUE + END IF + 60 CONTINUE + ELSE + DO 80, J = 1, N + IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( JY ) + TEMP2 = ALPHA*X( JX ) + IX = JX + IY = JY + DO 70, I = J, N + A( I, J ) = A( I, J ) + X( IX )*TEMP1 + $ + Y( IY )*TEMP2 + IX = IX + INCX + IY = IY + INCY + 70 CONTINUE + END IF + JX = JX + INCX + JY = JY + INCY + 80 CONTINUE + END IF + END IF +* + RETURN +* +* End of SSYR2 . +* + END diff --git a/reference/csyr2kf.f b/reference/csyr2kf.f new file mode 100644 index 0000000000..f9468dd14f --- /dev/null +++ b/reference/csyr2kf.f @@ -0,0 +1,324 @@ + SUBROUTINE CSYR2KF( UPLO, TRANS, N, K, ALPHA, A, LDA, B, LDB, + $ BETA, C, LDC ) +* .. Scalar Arguments .. + CHARACTER*1 UPLO, TRANS + INTEGER N, K, LDA, LDB, LDC + COMPLEX ALPHA, BETA +* .. Array Arguments .. + COMPLEX A( LDA, * ), B( LDB, * ), C( LDC, * ) +* .. +* +* Purpose +* ======= +* +* CSYR2K performs one of the symmetric rank 2k operations +* +* C := alpha*A*B' + alpha*B*A' + beta*C, +* +* or +* +* C := alpha*A'*B + alpha*B'*A + beta*C, +* +* where alpha and beta are scalars, C is an n by n symmetric matrix +* and A and B are n by k matrices in the first case and k by n +* matrices in the second case. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array C is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of C +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of C +* is to be referenced. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' C := alpha*A*B' + alpha*B*A' + +* beta*C. +* +* TRANS = 'T' or 't' C := alpha*A'*B + alpha*B'*A + +* beta*C. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix C. N must be +* at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry with TRANS = 'N' or 'n', K specifies the number +* of columns of the matrices A and B, and on entry with +* TRANS = 'T' or 't', K specifies the number of rows of the +* matrices A and B. K must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX array of DIMENSION ( LDA, ka ), where ka is +* k when TRANS = 'N' or 'n', and is n otherwise. 
+* Before entry with TRANS = 'N' or 'n', the leading n by k +* part of the array A must contain the matrix A, otherwise +* the leading k by n part of the array A must contain the +* matrix A. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When TRANS = 'N' or 'n' +* then LDA must be at least max( 1, n ), otherwise LDA must +* be at least max( 1, k ). +* Unchanged on exit. +* +* B - COMPLEX array of DIMENSION ( LDB, kb ), where kb is +* k when TRANS = 'N' or 'n', and is n otherwise. +* Before entry with TRANS = 'N' or 'n', the leading n by k +* part of the array B must contain the matrix B, otherwise +* the leading k by n part of the array B must contain the +* matrix B. +* Unchanged on exit. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. When TRANS = 'N' or 'n' +* then LDB must be at least max( 1, n ), otherwise LDB must +* be at least max( 1, k ). +* Unchanged on exit. +* +* BETA - COMPLEX . +* On entry, BETA specifies the scalar beta. +* Unchanged on exit. +* +* C - COMPLEX array of DIMENSION ( LDC, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array C must contain the upper +* triangular part of the symmetric matrix and the strictly +* lower triangular part of C is not referenced. On exit, the +* upper triangular part of the array C is overwritten by the +* upper triangular part of the updated matrix. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array C must contain the lower +* triangular part of the symmetric matrix and the strictly +* upper triangular part of C is not referenced. On exit, the +* lower triangular part of the array C is overwritten by the +* lower triangular part of the updated matrix. +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, n ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, INFO, J, L, NROWA + COMPLEX TEMP1, TEMP2 +* .. Parameters .. + COMPLEX ONE + PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + IF( LSAME( TRANS, 'N' ) )THEN + NROWA = N + ELSE + NROWA = K + END IF + UPPER = LSAME( UPLO, 'U' ) +* + INFO = 0 + IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO , 'L' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.LSAME( TRANS, 'N' ) ).AND. + $ ( .NOT.LSAME( TRANS, 'T' ) ) )THEN + INFO = 2 + ELSE IF( N .LT.0 )THEN + INFO = 3 + ELSE IF( K .LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 7 + ELSE IF( LDB.LT.MAX( 1, NROWA ) )THEN + INFO = 9 + ELSE IF( LDC.LT.MAX( 1, N ) )THEN + INFO = 12 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'CSYR2K', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR. + $ ( ( ( ALPHA.EQ.ZERO ).OR.( K.EQ.0 ) ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* And when alpha.eq.zero. 
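+*     (Illustrative sketch: elementwise, the TRANS = 'N' update
+*      computed further below is
+*
+*         C( I, J ) = BETA*C( I, J )
+*        $          + ALPHA*SUM( A( I, L )*B( J, L )
+*        $                     + B( I, L )*A( J, L ), L = 1, K )
+*
+*      restricted to the referenced triangle of C, so with ALPHA = ZERO
+*      only the BETA scaling, or the zeroing when BETA is also ZERO,
+*      remains; that is what the next two loop nests implement.)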
+* + IF( ALPHA.EQ.ZERO )THEN + IF( UPPER )THEN + IF( BETA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, J + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40, J = 1, N + DO 30, I = 1, J + C( I, J ) = BETA*C( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + ELSE + IF( BETA.EQ.ZERO )THEN + DO 60, J = 1, N + DO 50, I = J, N + C( I, J ) = ZERO + 50 CONTINUE + 60 CONTINUE + ELSE + DO 80, J = 1, N + DO 70, I = J, N + C( I, J ) = BETA*C( I, J ) + 70 CONTINUE + 80 CONTINUE + END IF + END IF + RETURN + END IF +* +* Start the operations. +* + IF( LSAME( TRANS, 'N' ) )THEN +* +* Form C := alpha*A*B' + alpha*B*A' + C. +* + IF( UPPER )THEN + DO 130, J = 1, N + IF( BETA.EQ.ZERO )THEN + DO 90, I = 1, J + C( I, J ) = ZERO + 90 CONTINUE + ELSE IF( BETA.NE.ONE )THEN + DO 100, I = 1, J + C( I, J ) = BETA*C( I, J ) + 100 CONTINUE + END IF + DO 120, L = 1, K + IF( ( A( J, L ).NE.ZERO ).OR. + $ ( B( J, L ).NE.ZERO ) )THEN + TEMP1 = ALPHA*B( J, L ) + TEMP2 = ALPHA*A( J, L ) + DO 110, I = 1, J + C( I, J ) = C( I, J ) + A( I, L )*TEMP1 + + $ B( I, L )*TEMP2 + 110 CONTINUE + END IF + 120 CONTINUE + 130 CONTINUE + ELSE + DO 180, J = 1, N + IF( BETA.EQ.ZERO )THEN + DO 140, I = J, N + C( I, J ) = ZERO + 140 CONTINUE + ELSE IF( BETA.NE.ONE )THEN + DO 150, I = J, N + C( I, J ) = BETA*C( I, J ) + 150 CONTINUE + END IF + DO 170, L = 1, K + IF( ( A( J, L ).NE.ZERO ).OR. + $ ( B( J, L ).NE.ZERO ) )THEN + TEMP1 = ALPHA*B( J, L ) + TEMP2 = ALPHA*A( J, L ) + DO 160, I = J, N + C( I, J ) = C( I, J ) + A( I, L )*TEMP1 + + $ B( I, L )*TEMP2 + 160 CONTINUE + END IF + 170 CONTINUE + 180 CONTINUE + END IF + ELSE +* +* Form C := alpha*A'*B + alpha*B'*A + C. +* + IF( UPPER )THEN + DO 210, J = 1, N + DO 200, I = 1, J + TEMP1 = ZERO + TEMP2 = ZERO + DO 190, L = 1, K + TEMP1 = TEMP1 + A( L, I )*B( L, J ) + TEMP2 = TEMP2 + B( L, I )*A( L, J ) + 190 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = ALPHA*TEMP1 + ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ ALPHA*TEMP1 + ALPHA*TEMP2 + END IF + 200 CONTINUE + 210 CONTINUE + ELSE + DO 240, J = 1, N + DO 230, I = J, N + TEMP1 = ZERO + TEMP2 = ZERO + DO 220, L = 1, K + TEMP1 = TEMP1 + A( L, I )*B( L, J ) + TEMP2 = TEMP2 + B( L, I )*A( L, J ) + 220 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = ALPHA*TEMP1 + ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ ALPHA*TEMP1 + ALPHA*TEMP2 + END IF + 230 CONTINUE + 240 CONTINUE + END IF + END IF +* + RETURN +* +* End of CSYR2K. +* + END diff --git a/reference/csyrf.f b/reference/csyrf.f new file mode 100644 index 0000000000..f1a2d597d2 --- /dev/null +++ b/reference/csyrf.f @@ -0,0 +1,198 @@ + SUBROUTINE CSYRF( UPLO, N, ALPHA, X, INCX, A, LDA ) +* +* -- LAPACK auxiliary routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* November 2006 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INCX, LDA, N + COMPLEX ALPHA +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ), X( * ) +* .. +* +* Purpose +* ======= +* +* CSYR performs the symmetric rank 1 operation +* +* A := alpha*x*( x' ) + A, +* +* where alpha is a complex scalar, x is an n element vector and A is an +* n by n symmetric matrix. +* +* Arguments +* ========== +* +* UPLO (input) CHARACTER*1 +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array A is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of A +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of A +* is to be referenced. +* +* Unchanged on exit. 
+* +* N (input) INTEGER +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA (input) COMPLEX +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X (input) COMPLEX array, dimension at least +* ( 1 + ( N - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the N- +* element vector x. +* Unchanged on exit. +* +* INCX (input) INTEGER +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* A (input/output) COMPLEX array, dimension ( LDA, N ) +* Before entry, with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array A must contain the upper +* triangular part of the symmetric matrix and the strictly +* lower triangular part of A is not referenced. On exit, the +* upper triangular part of the array A is overwritten by the +* upper triangular part of the updated matrix. +* Before entry, with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array A must contain the lower +* triangular part of the symmetric matrix and the strictly +* upper triangular part of A is not referenced. On exit, the +* lower triangular part of the array A is overwritten by the +* lower triangular part of the updated matrix. +* +* LDA (input) INTEGER +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, N ). +* Unchanged on exit. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + INTEGER I, INFO, IX, J, JX, KX + COMPLEX TEMP +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = 1 + ELSE IF( N.LT.0 ) THEN + INFO = 2 + ELSE IF( INCX.EQ.0 ) THEN + INFO = 5 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = 7 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CSYR ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ) .OR. ( ALPHA.EQ.ZERO ) ) + $ RETURN +* +* Set the start point in X if the increment is not unity. +* + IF( INCX.LE.0 ) THEN + KX = 1 - ( N-1 )*INCX + ELSE IF( INCX.NE.1 ) THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through the triangular part +* of A. +* + IF( LSAME( UPLO, 'U' ) ) THEN +* +* Form A when A is stored in upper triangle. +* + IF( INCX.EQ.1 ) THEN + DO 20 J = 1, N + IF( X( J ).NE.ZERO ) THEN + TEMP = ALPHA*X( J ) + DO 10 I = 1, J + A( I, J ) = A( I, J ) + X( I )*TEMP + 10 CONTINUE + END IF + 20 CONTINUE + ELSE + JX = KX + DO 40 J = 1, N + IF( X( JX ).NE.ZERO ) THEN + TEMP = ALPHA*X( JX ) + IX = KX + DO 30 I = 1, J + A( I, J ) = A( I, J ) + X( IX )*TEMP + IX = IX + INCX + 30 CONTINUE + END IF + JX = JX + INCX + 40 CONTINUE + END IF + ELSE +* +* Form A when A is stored in lower triangle. 
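+*     (Illustrative sketch: for UPLO = 'L' the loops below apply
+*      A( I, J ) = A( I, J ) + ALPHA*X( I )*X( J ) for J = 1, N and
+*      I = J, N, i.e. the rank 1 update alpha*x*x' restricted to the
+*      stored lower triangle; columns with X( J ) = ZERO are skipped.)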
+* + IF( INCX.EQ.1 ) THEN + DO 60 J = 1, N + IF( X( J ).NE.ZERO ) THEN + TEMP = ALPHA*X( J ) + DO 50 I = J, N + A( I, J ) = A( I, J ) + X( I )*TEMP + 50 CONTINUE + END IF + 60 CONTINUE + ELSE + JX = KX + DO 80 J = 1, N + IF( X( JX ).NE.ZERO ) THEN + TEMP = ALPHA*X( JX ) + IX = JX + DO 70 I = J, N + A( I, J ) = A( I, J ) + X( IX )*TEMP + IX = IX + INCX + 70 CONTINUE + END IF + JX = JX + INCX + 80 CONTINUE + END IF + END IF +* + RETURN +* +* End of CSYR +* + END diff --git a/reference/csyrkf.f b/reference/csyrkf.f new file mode 100644 index 0000000000..7dbaefa498 --- /dev/null +++ b/reference/csyrkf.f @@ -0,0 +1,293 @@ + SUBROUTINE CSYRKF ( UPLO, TRANS, N, K, ALPHA, A, LDA, + $ BETA, C, LDC ) +* .. Scalar Arguments .. + CHARACTER*1 UPLO, TRANS + INTEGER N, K, LDA, LDC + COMPLEX ALPHA, BETA +* .. Array Arguments .. + COMPLEX A( LDA, * ), C( LDC, * ) +* .. +* +* Purpose +* ======= +* +* CSYRK performs one of the symmetric rank k operations +* +* C := alpha*A*A' + beta*C, +* +* or +* +* C := alpha*A'*A + beta*C, +* +* where alpha and beta are scalars, C is an n by n symmetric matrix +* and A is an n by k matrix in the first case and a k by n matrix +* in the second case. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array C is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of C +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of C +* is to be referenced. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' C := alpha*A*A' + beta*C. +* +* TRANS = 'T' or 't' C := alpha*A'*A + beta*C. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix C. N must be +* at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry with TRANS = 'N' or 'n', K specifies the number +* of columns of the matrix A, and on entry with +* TRANS = 'T' or 't', K specifies the number of rows of the +* matrix A. K must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX array of DIMENSION ( LDA, ka ), where ka is +* k when TRANS = 'N' or 'n', and is n otherwise. +* Before entry with TRANS = 'N' or 'n', the leading n by k +* part of the array A must contain the matrix A, otherwise +* the leading k by n part of the array A must contain the +* matrix A. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When TRANS = 'N' or 'n' +* then LDA must be at least max( 1, n ), otherwise LDA must +* be at least max( 1, k ). +* Unchanged on exit. +* +* BETA - COMPLEX . +* On entry, BETA specifies the scalar beta. +* Unchanged on exit. +* +* C - COMPLEX array of DIMENSION ( LDC, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array C must contain the upper +* triangular part of the symmetric matrix and the strictly +* lower triangular part of C is not referenced. On exit, the +* upper triangular part of the array C is overwritten by the +* upper triangular part of the updated matrix. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array C must contain the lower +* triangular part of the symmetric matrix and the strictly +* upper triangular part of C is not referenced. 
On exit, the +* lower triangular part of the array C is overwritten by the +* lower triangular part of the updated matrix. +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, n ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, INFO, J, L, NROWA + COMPLEX TEMP +* .. Parameters .. + COMPLEX ONE + PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + IF( LSAME( TRANS, 'N' ) )THEN + NROWA = N + ELSE + NROWA = K + END IF + UPPER = LSAME( UPLO, 'U' ) +* + INFO = 0 + IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO , 'L' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.LSAME( TRANS, 'N' ) ).AND. + $ ( .NOT.LSAME( TRANS, 'T' ) ) )THEN + INFO = 2 + ELSE IF( N .LT.0 )THEN + INFO = 3 + ELSE IF( K .LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 7 + ELSE IF( LDC.LT.MAX( 1, N ) )THEN + INFO = 10 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'CSYRK ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR. + $ ( ( ( ALPHA.EQ.ZERO ).OR.( K.EQ.0 ) ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* And when alpha.eq.zero. +* + IF( ALPHA.EQ.ZERO )THEN + IF( UPPER )THEN + IF( BETA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, J + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40, J = 1, N + DO 30, I = 1, J + C( I, J ) = BETA*C( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + ELSE + IF( BETA.EQ.ZERO )THEN + DO 60, J = 1, N + DO 50, I = J, N + C( I, J ) = ZERO + 50 CONTINUE + 60 CONTINUE + ELSE + DO 80, J = 1, N + DO 70, I = J, N + C( I, J ) = BETA*C( I, J ) + 70 CONTINUE + 80 CONTINUE + END IF + END IF + RETURN + END IF +* +* Start the operations. +* + IF( LSAME( TRANS, 'N' ) )THEN +* +* Form C := alpha*A*A' + beta*C. +* + IF( UPPER )THEN + DO 130, J = 1, N + IF( BETA.EQ.ZERO )THEN + DO 90, I = 1, J + C( I, J ) = ZERO + 90 CONTINUE + ELSE IF( BETA.NE.ONE )THEN + DO 100, I = 1, J + C( I, J ) = BETA*C( I, J ) + 100 CONTINUE + END IF + DO 120, L = 1, K + IF( A( J, L ).NE.ZERO )THEN + TEMP = ALPHA*A( J, L ) + DO 110, I = 1, J + C( I, J ) = C( I, J ) + TEMP*A( I, L ) + 110 CONTINUE + END IF + 120 CONTINUE + 130 CONTINUE + ELSE + DO 180, J = 1, N + IF( BETA.EQ.ZERO )THEN + DO 140, I = J, N + C( I, J ) = ZERO + 140 CONTINUE + ELSE IF( BETA.NE.ONE )THEN + DO 150, I = J, N + C( I, J ) = BETA*C( I, J ) + 150 CONTINUE + END IF + DO 170, L = 1, K + IF( A( J, L ).NE.ZERO )THEN + TEMP = ALPHA*A( J, L ) + DO 160, I = J, N + C( I, J ) = C( I, J ) + TEMP*A( I, L ) + 160 CONTINUE + END IF + 170 CONTINUE + 180 CONTINUE + END IF + ELSE +* +* Form C := alpha*A'*A + beta*C. 
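+*     (Illustrative sketch: on the referenced triangle this branch
+*      computes
+*
+*         C( I, J ) = BETA*C( I, J )
+*        $          + ALPHA*SUM( A( L, I )*A( L, J ), L = 1, K ),
+*
+*      the dot product form of alpha*A'*A; no conjugation is involved
+*      because CSYRK is a symmetric, not Hermitian, update.)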
+* + IF( UPPER )THEN + DO 210, J = 1, N + DO 200, I = 1, J + TEMP = ZERO + DO 190, L = 1, K + TEMP = TEMP + A( L, I )*A( L, J ) + 190 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = ALPHA*TEMP + ELSE + C( I, J ) = ALPHA*TEMP + BETA*C( I, J ) + END IF + 200 CONTINUE + 210 CONTINUE + ELSE + DO 240, J = 1, N + DO 230, I = J, N + TEMP = ZERO + DO 220, L = 1, K + TEMP = TEMP + A( L, I )*A( L, J ) + 220 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = ALPHA*TEMP + ELSE + C( I, J ) = ALPHA*TEMP + BETA*C( I, J ) + END IF + 230 CONTINUE + 240 CONTINUE + END IF + END IF +* + RETURN +* +* End of CSYRK . +* + END diff --git a/reference/ctbmvf.f b/reference/ctbmvf.f new file mode 100644 index 0000000000..ff3c5268da --- /dev/null +++ b/reference/ctbmvf.f @@ -0,0 +1,377 @@ + SUBROUTINE CTBMVF( UPLO, TRANS, DIAG, N, K, A, LDA, X, INCX ) +* .. Scalar Arguments .. + INTEGER INCX, K, LDA, N + CHARACTER*1 DIAG, TRANS, UPLO +* .. Array Arguments .. + COMPLEX A( LDA, * ), X( * ) +* .. +* +* Purpose +* ======= +* +* CTBMV performs one of the matrix-vector operations +* +* x := A*x, or x := A'*x, or x := conjg( A' )*x, +* +* where x is an n element vector and A is an n by n unit, or non-unit, +* upper or lower triangular band matrix, with ( k + 1 ) diagonals. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' x := A*x. +* +* TRANS = 'T' or 't' x := A'*x. +* +* TRANS = 'C' or 'c' x := conjg( A' )*x. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit +* triangular as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry with UPLO = 'U' or 'u', K specifies the number of +* super-diagonals of the matrix A. +* On entry with UPLO = 'L' or 'l', K specifies the number of +* sub-diagonals of the matrix A. +* K must satisfy 0 .le. K. +* Unchanged on exit. +* +* A - COMPLEX array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) +* by n part of the array A must contain the upper triangular +* band part of the matrix of coefficients, supplied column by +* column, with the leading diagonal of the matrix in row +* ( k + 1 ) of the array, the first super-diagonal starting at +* position 2 in row k, and so on. The top left k by k triangle +* of the array A is not referenced. +* The following program segment will transfer an upper +* triangular band matrix from conventional full matrix storage +* to band storage: +* +* DO 20, J = 1, N +* M = K + 1 - J +* DO 10, I = MAX( 1, J - K ), J +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) +* by n part of the array A must contain the lower triangular +* band part of the matrix of coefficients, supplied column by +* column, with the leading diagonal of the matrix in row 1 of +* the array, the first sub-diagonal starting at position 1 in +* row 2, and so on. 
The bottom right k by k triangle of the +* array A is not referenced. +* The following program segment will transfer a lower +* triangular band matrix from conventional full matrix storage +* to band storage: +* +* DO 20, J = 1, N +* M = 1 - J +* DO 10, I = J, MIN( N, J + K ) +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Note that when DIAG = 'U' or 'u' the elements of the array A +* corresponding to the diagonal elements of the matrix are not +* referenced, but are assumed to be unity. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* ( k + 1 ). +* Unchanged on exit. +* +* X - COMPLEX array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. On exit, X is overwritten with the +* tranformed vector x. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. Local Scalars .. + COMPLEX TEMP + INTEGER I, INFO, IX, J, JX, KPLUS1, KX, L + LOGICAL NOCONJ, NOUNIT +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC CONJG, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO , 'U' ).AND. + $ .NOT.LSAME( UPLO , 'L' ) )THEN + INFO = 1 + ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. + $ .NOT.LSAME( TRANS, 'C' ) )THEN + INFO = 2 + ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. + $ .NOT.LSAME( DIAG , 'N' ) )THEN + INFO = 3 + ELSE IF( N.LT.0 )THEN + INFO = 4 + ELSE IF( K.LT.0 )THEN + INFO = 5 + ELSE IF( LDA.LT.( K + 1 ) )THEN + INFO = 7 + ELSE IF( INCX.EQ.0 )THEN + INFO = 9 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'CTBMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* + NOCONJ = LSAME( TRANS, 'T' ) + NOUNIT = LSAME( DIAG , 'N' ) +* +* Set up the start point in X if the increment is not unity. This +* will be ( N - 1 )*INCX too small for descending loops. +* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through A. +* + IF( LSAME( TRANS, 'N' ) )THEN +* +* Form x := A*x. 
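+*     (Illustrative sketch: with KPLUS1 = K + 1 the band storage used
+*      below places the matrix element a( i, j ) of the upper triangle
+*      in A( KPLUS1 - J + I, J ), so column J is read only for
+*      I = MAX( 1, J - K ), ..., J, i.e. only inside the stored band,
+*      with the diagonal taken from A( KPLUS1, J ).)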
+* + IF( LSAME( UPLO, 'U' ) )THEN + KPLUS1 = K + 1 + IF( INCX.EQ.1 )THEN + DO 20, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = X( J ) + L = KPLUS1 - J + DO 10, I = MAX( 1, J - K ), J - 1 + X( I ) = X( I ) + TEMP*A( L + I, J ) + 10 CONTINUE + IF( NOUNIT ) + $ X( J ) = X( J )*A( KPLUS1, J ) + END IF + 20 CONTINUE + ELSE + JX = KX + DO 40, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = X( JX ) + IX = KX + L = KPLUS1 - J + DO 30, I = MAX( 1, J - K ), J - 1 + X( IX ) = X( IX ) + TEMP*A( L + I, J ) + IX = IX + INCX + 30 CONTINUE + IF( NOUNIT ) + $ X( JX ) = X( JX )*A( KPLUS1, J ) + END IF + JX = JX + INCX + IF( J.GT.K ) + $ KX = KX + INCX + 40 CONTINUE + END IF + ELSE + IF( INCX.EQ.1 )THEN + DO 60, J = N, 1, -1 + IF( X( J ).NE.ZERO )THEN + TEMP = X( J ) + L = 1 - J + DO 50, I = MIN( N, J + K ), J + 1, -1 + X( I ) = X( I ) + TEMP*A( L + I, J ) + 50 CONTINUE + IF( NOUNIT ) + $ X( J ) = X( J )*A( 1, J ) + END IF + 60 CONTINUE + ELSE + KX = KX + ( N - 1 )*INCX + JX = KX + DO 80, J = N, 1, -1 + IF( X( JX ).NE.ZERO )THEN + TEMP = X( JX ) + IX = KX + L = 1 - J + DO 70, I = MIN( N, J + K ), J + 1, -1 + X( IX ) = X( IX ) + TEMP*A( L + I, J ) + IX = IX - INCX + 70 CONTINUE + IF( NOUNIT ) + $ X( JX ) = X( JX )*A( 1, J ) + END IF + JX = JX - INCX + IF( ( N - J ).GE.K ) + $ KX = KX - INCX + 80 CONTINUE + END IF + END IF + ELSE +* +* Form x := A'*x or x := conjg( A' )*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + KPLUS1 = K + 1 + IF( INCX.EQ.1 )THEN + DO 110, J = N, 1, -1 + TEMP = X( J ) + L = KPLUS1 - J + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*A( KPLUS1, J ) + DO 90, I = J - 1, MAX( 1, J - K ), -1 + TEMP = TEMP + A( L + I, J )*X( I ) + 90 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*CONJG( A( KPLUS1, J ) ) + DO 100, I = J - 1, MAX( 1, J - K ), -1 + TEMP = TEMP + CONJG( A( L + I, J ) )*X( I ) + 100 CONTINUE + END IF + X( J ) = TEMP + 110 CONTINUE + ELSE + KX = KX + ( N - 1 )*INCX + JX = KX + DO 140, J = N, 1, -1 + TEMP = X( JX ) + KX = KX - INCX + IX = KX + L = KPLUS1 - J + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*A( KPLUS1, J ) + DO 120, I = J - 1, MAX( 1, J - K ), -1 + TEMP = TEMP + A( L + I, J )*X( IX ) + IX = IX - INCX + 120 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*CONJG( A( KPLUS1, J ) ) + DO 130, I = J - 1, MAX( 1, J - K ), -1 + TEMP = TEMP + CONJG( A( L + I, J ) )*X( IX ) + IX = IX - INCX + 130 CONTINUE + END IF + X( JX ) = TEMP + JX = JX - INCX + 140 CONTINUE + END IF + ELSE + IF( INCX.EQ.1 )THEN + DO 170, J = 1, N + TEMP = X( J ) + L = 1 - J + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*A( 1, J ) + DO 150, I = J + 1, MIN( N, J + K ) + TEMP = TEMP + A( L + I, J )*X( I ) + 150 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*CONJG( A( 1, J ) ) + DO 160, I = J + 1, MIN( N, J + K ) + TEMP = TEMP + CONJG( A( L + I, J ) )*X( I ) + 160 CONTINUE + END IF + X( J ) = TEMP + 170 CONTINUE + ELSE + JX = KX + DO 200, J = 1, N + TEMP = X( JX ) + KX = KX + INCX + IX = KX + L = 1 - J + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*A( 1, J ) + DO 180, I = J + 1, MIN( N, J + K ) + TEMP = TEMP + A( L + I, J )*X( IX ) + IX = IX + INCX + 180 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*CONJG( A( 1, J ) ) + DO 190, I = J + 1, MIN( N, J + K ) + TEMP = TEMP + CONJG( A( L + I, J ) )*X( IX ) + IX = IX + INCX + 190 CONTINUE + END IF + X( JX ) = TEMP + JX = JX + INCX + 200 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of CTBMV . 
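+*
+*     Illustrative note (not part of the original sources): a
+*     hypothetical in-place product x := A*x with an upper triangular
+*     band matrix holding one super-diagonal ( K = 1 ) would read
+*
+*        CALL CTBMVF( 'U', 'N', 'N', N, 1, AB, 2, X, 1 )
+*
+*     where AB is dimensioned AB( 2, N ) and filled column by column
+*     exactly as in the band storage program segment shown in the
+*     header comments of this routine.
+*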
+* + END diff --git a/reference/ctbsvf.f b/reference/ctbsvf.f new file mode 100644 index 0000000000..9358433201 --- /dev/null +++ b/reference/ctbsvf.f @@ -0,0 +1,367 @@ + SUBROUTINE CTBSVF(UPLO,TRANS,DIAG,N,K,A,LDA,X,INCX) +* .. Scalar Arguments .. + INTEGER INCX,K,LDA,N + CHARACTER DIAG,TRANS,UPLO +* .. +* .. Array Arguments .. + COMPLEX A(LDA,*),X(*) +* .. +* +* Purpose +* ======= +* +* CTBSV solves one of the systems of equations +* +* A*x = b, or A'*x = b, or conjg( A' )*x = b, +* +* where b and x are n element vectors and A is an n by n unit, or +* non-unit, upper or lower triangular band matrix, with ( k + 1 ) +* diagonals. +* +* No test for singularity or near-singularity is included in this +* routine. Such tests must be performed before calling this routine. +* +* Arguments +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the equations to be solved as +* follows: +* +* TRANS = 'N' or 'n' A*x = b. +* +* TRANS = 'T' or 't' A'*x = b. +* +* TRANS = 'C' or 'c' conjg( A' )*x = b. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit +* triangular as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry with UPLO = 'U' or 'u', K specifies the number of +* super-diagonals of the matrix A. +* On entry with UPLO = 'L' or 'l', K specifies the number of +* sub-diagonals of the matrix A. +* K must satisfy 0 .le. K. +* Unchanged on exit. +* +* A - COMPLEX array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) +* by n part of the array A must contain the upper triangular +* band part of the matrix of coefficients, supplied column by +* column, with the leading diagonal of the matrix in row +* ( k + 1 ) of the array, the first super-diagonal starting at +* position 2 in row k, and so on. The top left k by k triangle +* of the array A is not referenced. +* The following program segment will transfer an upper +* triangular band matrix from conventional full matrix storage +* to band storage: +* +* DO 20, J = 1, N +* M = K + 1 - J +* DO 10, I = MAX( 1, J - K ), J +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) +* by n part of the array A must contain the lower triangular +* band part of the matrix of coefficients, supplied column by +* column, with the leading diagonal of the matrix in row 1 of +* the array, the first sub-diagonal starting at position 1 in +* row 2, and so on. The bottom right k by k triangle of the +* array A is not referenced. +* The following program segment will transfer a lower +* triangular band matrix from conventional full matrix storage +* to band storage: +* +* DO 20, J = 1, N +* M = 1 - J +* DO 10, I = J, MIN( N, J + K ) +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Note that when DIAG = 'U' or 'u' the elements of the array A +* corresponding to the diagonal elements of the matrix are not +* referenced, but are assumed to be unity. 
+* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* ( k + 1 ). +* Unchanged on exit. +* +* X - COMPLEX array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element right-hand side vector b. On exit, X is overwritten +* with the solution vector x. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER (ZERO= (0.0E+0,0.0E+0)) +* .. +* .. Local Scalars .. + COMPLEX TEMP + INTEGER I,INFO,IX,J,JX,KPLUS1,KX,L + LOGICAL NOCONJ,NOUNIT +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC CONJG,MAX,MIN +* .. +* +* Test the input parameters. +* + INFO = 0 + IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN + INFO = 1 + ELSE IF (.NOT.LSAME(TRANS,'N') .AND. .NOT.LSAME(TRANS,'T') .AND. + + .NOT.LSAME(TRANS,'C')) THEN + INFO = 2 + ELSE IF (.NOT.LSAME(DIAG,'U') .AND. .NOT.LSAME(DIAG,'N')) THEN + INFO = 3 + ELSE IF (N.LT.0) THEN + INFO = 4 + ELSE IF (K.LT.0) THEN + INFO = 5 + ELSE IF (LDA.LT. (K+1)) THEN + INFO = 7 + ELSE IF (INCX.EQ.0) THEN + INFO = 9 + END IF + IF (INFO.NE.0) THEN + CALL XERBLA('CTBSV ',INFO) + RETURN + END IF +* +* Quick return if possible. +* + IF (N.EQ.0) RETURN +* + NOCONJ = LSAME(TRANS,'T') + NOUNIT = LSAME(DIAG,'N') +* +* Set up the start point in X if the increment is not unity. This +* will be ( N - 1 )*INCX too small for descending loops. +* + IF (INCX.LE.0) THEN + KX = 1 - (N-1)*INCX + ELSE IF (INCX.NE.1) THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of A are +* accessed by sequentially with one pass through A. +* + IF (LSAME(TRANS,'N')) THEN +* +* Form x := inv( A )*x. +* + IF (LSAME(UPLO,'U')) THEN + KPLUS1 = K + 1 + IF (INCX.EQ.1) THEN + DO 20 J = N,1,-1 + IF (X(J).NE.ZERO) THEN + L = KPLUS1 - J + IF (NOUNIT) X(J) = X(J)/A(KPLUS1,J) + TEMP = X(J) + DO 10 I = J - 1,MAX(1,J-K),-1 + X(I) = X(I) - TEMP*A(L+I,J) + 10 CONTINUE + END IF + 20 CONTINUE + ELSE + KX = KX + (N-1)*INCX + JX = KX + DO 40 J = N,1,-1 + KX = KX - INCX + IF (X(JX).NE.ZERO) THEN + IX = KX + L = KPLUS1 - J + IF (NOUNIT) X(JX) = X(JX)/A(KPLUS1,J) + TEMP = X(JX) + DO 30 I = J - 1,MAX(1,J-K),-1 + X(IX) = X(IX) - TEMP*A(L+I,J) + IX = IX - INCX + 30 CONTINUE + END IF + JX = JX - INCX + 40 CONTINUE + END IF + ELSE + IF (INCX.EQ.1) THEN + DO 60 J = 1,N + IF (X(J).NE.ZERO) THEN + L = 1 - J + IF (NOUNIT) X(J) = X(J)/A(1,J) + TEMP = X(J) + DO 50 I = J + 1,MIN(N,J+K) + X(I) = X(I) - TEMP*A(L+I,J) + 50 CONTINUE + END IF + 60 CONTINUE + ELSE + JX = KX + DO 80 J = 1,N + KX = KX + INCX + IF (X(JX).NE.ZERO) THEN + IX = KX + L = 1 - J + IF (NOUNIT) X(JX) = X(JX)/A(1,J) + TEMP = X(JX) + DO 70 I = J + 1,MIN(N,J+K) + X(IX) = X(IX) - TEMP*A(L+I,J) + IX = IX + INCX + 70 CONTINUE + END IF + JX = JX + INCX + 80 CONTINUE + END IF + END IF + ELSE +* +* Form x := inv( A' )*x or x := inv( conjg( A') )*x. 
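+*     (Illustrative sketch: this branch solves A'*x = b, or
+*      conjg( A' )*x = b when TRANS = 'C'.  For UPLO = 'U' it is
+*      forward substitution and for UPLO = 'L' backward substitution:
+*      each X( J ) is reduced by the banded column entries times the
+*      components already solved, then divided by the (possibly
+*      conjugated) diagonal unless DIAG = 'U'.)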
+* + IF (LSAME(UPLO,'U')) THEN + KPLUS1 = K + 1 + IF (INCX.EQ.1) THEN + DO 110 J = 1,N + TEMP = X(J) + L = KPLUS1 - J + IF (NOCONJ) THEN + DO 90 I = MAX(1,J-K),J - 1 + TEMP = TEMP - A(L+I,J)*X(I) + 90 CONTINUE + IF (NOUNIT) TEMP = TEMP/A(KPLUS1,J) + ELSE + DO 100 I = MAX(1,J-K),J - 1 + TEMP = TEMP - CONJG(A(L+I,J))*X(I) + 100 CONTINUE + IF (NOUNIT) TEMP = TEMP/CONJG(A(KPLUS1,J)) + END IF + X(J) = TEMP + 110 CONTINUE + ELSE + JX = KX + DO 140 J = 1,N + TEMP = X(JX) + IX = KX + L = KPLUS1 - J + IF (NOCONJ) THEN + DO 120 I = MAX(1,J-K),J - 1 + TEMP = TEMP - A(L+I,J)*X(IX) + IX = IX + INCX + 120 CONTINUE + IF (NOUNIT) TEMP = TEMP/A(KPLUS1,J) + ELSE + DO 130 I = MAX(1,J-K),J - 1 + TEMP = TEMP - CONJG(A(L+I,J))*X(IX) + IX = IX + INCX + 130 CONTINUE + IF (NOUNIT) TEMP = TEMP/CONJG(A(KPLUS1,J)) + END IF + X(JX) = TEMP + JX = JX + INCX + IF (J.GT.K) KX = KX + INCX + 140 CONTINUE + END IF + ELSE + IF (INCX.EQ.1) THEN + DO 170 J = N,1,-1 + TEMP = X(J) + L = 1 - J + IF (NOCONJ) THEN + DO 150 I = MIN(N,J+K),J + 1,-1 + TEMP = TEMP - A(L+I,J)*X(I) + 150 CONTINUE + IF (NOUNIT) TEMP = TEMP/A(1,J) + ELSE + DO 160 I = MIN(N,J+K),J + 1,-1 + TEMP = TEMP - CONJG(A(L+I,J))*X(I) + 160 CONTINUE + IF (NOUNIT) TEMP = TEMP/CONJG(A(1,J)) + END IF + X(J) = TEMP + 170 CONTINUE + ELSE + KX = KX + (N-1)*INCX + JX = KX + DO 200 J = N,1,-1 + TEMP = X(JX) + IX = KX + L = 1 - J + IF (NOCONJ) THEN + DO 180 I = MIN(N,J+K),J + 1,-1 + TEMP = TEMP - A(L+I,J)*X(IX) + IX = IX - INCX + 180 CONTINUE + IF (NOUNIT) TEMP = TEMP/A(1,J) + ELSE + DO 190 I = MIN(N,J+K),J + 1,-1 + TEMP = TEMP - CONJG(A(L+I,J))*X(IX) + IX = IX - INCX + 190 CONTINUE + IF (NOUNIT) TEMP = TEMP/CONJG(A(1,J)) + END IF + X(JX) = TEMP + JX = JX - INCX + IF ((N-J).GE.K) KX = KX - INCX + 200 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of CTBSV . +* + END diff --git a/reference/ctpmvf.f b/reference/ctpmvf.f new file mode 100644 index 0000000000..cd29ec5729 --- /dev/null +++ b/reference/ctpmvf.f @@ -0,0 +1,376 @@ + SUBROUTINE CTPMVF( UPLO, TRANS, DIAG, N, AP, X, INCX ) +* .. Scalar Arguments .. + INTEGER INCX, N + CHARACTER*1 DIAG, TRANS, UPLO +* .. Array Arguments .. + COMPLEX AP( * ), X( * ) +* .. +* +* Purpose +* ======= +* +* CTPMV performs one of the matrix-vector operations +* +* x := A*x, or x := A'*x, or x := conjg( A' )*x, +* +* where x is an n element vector and A is an n by n unit, or non-unit, +* upper or lower triangular matrix, supplied in packed form. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' x := A*x. +* +* TRANS = 'T' or 't' x := A'*x. +* +* TRANS = 'C' or 'c' x := conjg( A' )*x. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit +* triangular as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* AP - COMPLEX array of DIMENSION at least +* ( ( n*( n + 1 ) )/2 ). 
+* Before entry with UPLO = 'U' or 'u', the array AP must +* contain the upper triangular matrix packed sequentially, +* column by column, so that AP( 1 ) contains a( 1, 1 ), +* AP( 2 ) and AP( 3 ) contain a( 1, 2 ) and a( 2, 2 ) +* respectively, and so on. +* Before entry with UPLO = 'L' or 'l', the array AP must +* contain the lower triangular matrix packed sequentially, +* column by column, so that AP( 1 ) contains a( 1, 1 ), +* AP( 2 ) and AP( 3 ) contain a( 2, 1 ) and a( 3, 1 ) +* respectively, and so on. +* Note that when DIAG = 'U' or 'u', the diagonal elements of +* A are not referenced, but are assumed to be unity. +* Unchanged on exit. +* +* X - COMPLEX array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. On exit, X is overwritten with the +* tranformed vector x. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. Local Scalars .. + COMPLEX TEMP + INTEGER I, INFO, IX, J, JX, K, KK, KX + LOGICAL NOCONJ, NOUNIT +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC CONJG +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO , 'U' ).AND. + $ .NOT.LSAME( UPLO , 'L' ) )THEN + INFO = 1 + ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. + $ .NOT.LSAME( TRANS, 'R' ).AND. + $ .NOT.LSAME( TRANS, 'C' ) )THEN + INFO = 2 + ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. + $ .NOT.LSAME( DIAG , 'N' ) )THEN + INFO = 3 + ELSE IF( N.LT.0 )THEN + INFO = 4 + ELSE IF( INCX.EQ.0 )THEN + INFO = 7 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'CTPMVF', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* + NOCONJ = LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) + NOUNIT = LSAME( DIAG , 'N' ) +* +* Set up the start point in X if the increment is not unity. This +* will be ( N - 1 )*INCX too small for descending loops. +* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of AP are +* accessed sequentially with one pass through AP. +* + IF( LSAME( TRANS, 'N' ).OR.LSAME( TRANS, 'R' ))THEN +* +* Form x:= A*x. 
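+*     (Illustrative sketch: this branch covers TRANS = 'N' and the
+*      extension TRANS = 'R' accepted above, i.e. x := A*x or
+*      x := conjg( A )*x.  For UPLO = 'U' column J of the packed upper
+*      triangle occupies AP( KK ), ..., AP( KK + J - 1 ) with the
+*      diagonal stored last, which is exactly how K and KK advance in
+*      the loops below.)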
+* + IF( LSAME( UPLO, 'U' ) )THEN + KK = 1 + IF( INCX.EQ.1 )THEN + DO 20, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = X( J ) + K = KK + DO 10, I = 1, J - 1 + IF( NOCONJ )THEN + X( I ) = X( I ) + TEMP*AP( K ) + ELSE + X( I ) = X( I ) + TEMP*CONJG(AP( K )) + END IF + K = K + 1 + 10 CONTINUE + IF( NOCONJ )THEN + IF( NOUNIT ) + $ X( J ) = X( J )*AP( KK + J - 1 ) + ELSE + IF( NOUNIT ) + $ X( J ) = X( J )*CONJG(AP( KK + J-1)) + END IF + END IF + + KK = KK + J + 20 CONTINUE + ELSE + JX = KX + DO 40, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = X( JX ) + IX = KX + DO 30, K = KK, KK + J - 2 + IF( NOCONJ )THEN + X( IX ) = X( IX ) + TEMP*AP( K ) + ELSE + X( IX ) = X( IX ) + TEMP*CONJG(AP(K)) + END IF + IX = IX + INCX + 30 CONTINUE + IF( NOCONJ )THEN + IF( NOUNIT ) + $ X( JX ) = X( JX )*AP( KK + J - 1 ) + ELSE + IF( NOUNIT ) + $ X( JX ) = X( JX )*CONJG(AP( KK + J-1)) + END IF + END IF + JX = JX + INCX + KK = KK + J + 40 CONTINUE + END IF + ELSE + KK = ( N*( N + 1 ) )/2 + IF( INCX.EQ.1 )THEN + DO 60, J = N, 1, -1 + IF( X( J ).NE.ZERO )THEN + TEMP = X( J ) + K = KK + DO 50, I = N, J + 1, -1 + IF( NOCONJ )THEN + X( I ) = X( I ) + TEMP*AP( K ) + ELSE + X( I ) = X( I ) + TEMP*CONJG(AP( K )) + END IF + K = K - 1 + 50 CONTINUE + IF( NOCONJ )THEN + IF( NOUNIT ) + $ X( J ) = X( J )*AP( KK - N + J ) + ELSE + IF( NOUNIT ) + $ X( J ) = X( J )*CONJG(AP(KK - N+J)) + END IF + END IF + KK = KK - ( N - J + 1 ) + 60 CONTINUE + ELSE + KX = KX + ( N - 1 )*INCX + JX = KX + DO 80, J = N, 1, -1 + IF( X( JX ).NE.ZERO )THEN + TEMP = X( JX ) + IX = KX + DO 70, K = KK, KK - ( N - ( J + 1 ) ), -1 + IF( NOCONJ )THEN + X( IX ) = X( IX ) + TEMP*AP( K ) + ELSE + X( IX ) = X( IX ) + TEMP*CONJG(AP(K)) + ENDIF + IX = IX - INCX + 70 CONTINUE + IF( NOCONJ )THEN + IF( NOUNIT ) + $ X( JX ) = X( JX )*AP( KK - N + J ) + ELSE + IF( NOUNIT ) + $ X( JX ) = X( JX )*CONJG(AP(KK-N+J)) + ENDIF + END IF + JX = JX - INCX + KK = KK - ( N - J + 1 ) + 80 CONTINUE + END IF + END IF + ELSE +* +* Form x := A'*x or x := conjg( A' )*x. 
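+*     (Illustrative sketch: for the transposed products the packed
+*      columns are traversed in reverse when UPLO = 'U', starting from
+*      KK = ( N*( N + 1 ) )/2, so that AP( KK ) holds the diagonal
+*      a( J, J ) of the current column and AP( KK - J + 1 ) holds
+*      a( 1, J ); KK then steps back by J for the next column.)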
+* + IF( LSAME( UPLO, 'U' ) )THEN + KK = ( N*( N + 1 ) )/2 + IF( INCX.EQ.1 )THEN + DO 110, J = N, 1, -1 + TEMP = X( J ) + K = KK - 1 + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*AP( KK ) + DO 90, I = J - 1, 1, -1 + TEMP = TEMP + AP( K )*X( I ) + K = K - 1 + 90 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*CONJG( AP( KK ) ) + DO 100, I = J - 1, 1, -1 + TEMP = TEMP + CONJG( AP( K ) )*X( I ) + K = K - 1 + 100 CONTINUE + END IF + X( J ) = TEMP + KK = KK - J + 110 CONTINUE + ELSE + JX = KX + ( N - 1 )*INCX + DO 140, J = N, 1, -1 + TEMP = X( JX ) + IX = JX + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*AP( KK ) + DO 120, K = KK - 1, KK - J + 1, -1 + IX = IX - INCX + TEMP = TEMP + AP( K )*X( IX ) + 120 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*CONJG( AP( KK ) ) + DO 130, K = KK - 1, KK - J + 1, -1 + IX = IX - INCX + TEMP = TEMP + CONJG( AP( K ) )*X( IX ) + 130 CONTINUE + END IF + X( JX ) = TEMP + JX = JX - INCX + KK = KK - J + 140 CONTINUE + END IF + ELSE + KK = 1 + IF( INCX.EQ.1 )THEN + DO 170, J = 1, N + TEMP = X( J ) + K = KK + 1 + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*AP( KK ) + DO 150, I = J + 1, N + TEMP = TEMP + AP( K )*X( I ) + K = K + 1 + 150 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*CONJG( AP( KK ) ) + DO 160, I = J + 1, N + TEMP = TEMP + CONJG( AP( K ) )*X( I ) + K = K + 1 + 160 CONTINUE + END IF + X( J ) = TEMP + KK = KK + ( N - J + 1 ) + 170 CONTINUE + ELSE + JX = KX + DO 200, J = 1, N + TEMP = X( JX ) + IX = JX + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*AP( KK ) + DO 180, K = KK + 1, KK + N - J + IX = IX + INCX + TEMP = TEMP + AP( K )*X( IX ) + 180 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*CONJG( AP( KK ) ) + DO 190, K = KK + 1, KK + N - J + IX = IX + INCX + TEMP = TEMP + CONJG( AP( K ) )*X( IX ) + 190 CONTINUE + END IF + X( JX ) = TEMP + JX = JX + INCX + KK = KK + ( N - J + 1 ) + 200 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of CTPMV . +* + END diff --git a/reference/ctpsvf.f b/reference/ctpsvf.f new file mode 100644 index 0000000000..2da9215126 --- /dev/null +++ b/reference/ctpsvf.f @@ -0,0 +1,379 @@ + SUBROUTINE CTPSVF( UPLO, TRANS, DIAG, N, AP, X, INCX ) +* .. Scalar Arguments .. + INTEGER INCX, N + CHARACTER*1 DIAG, TRANS, UPLO +* .. Array Arguments .. + COMPLEX AP( * ), X( * ) +* .. +* +* Purpose +* ======= +* +* CTPSV solves one of the systems of equations +* +* A*x = b, or A'*x = b, or conjg( A' )*x = b, +* +* where b and x are n element vectors and A is an n by n unit, or +* non-unit, upper or lower triangular matrix, supplied in packed form. +* +* No test for singularity or near-singularity is included in this +* routine. Such tests must be performed before calling this routine. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the equations to be solved as +* follows: +* +* TRANS = 'N' or 'n' A*x = b. +* +* TRANS = 'T' or 't' A'*x = b. +* +* TRANS = 'C' or 'c' conjg( A' )*x = b. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit +* triangular as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* N - INTEGER. 
+* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* AP - COMPLEX array of DIMENSION at least +* ( ( n*( n + 1 ) )/2 ). +* Before entry with UPLO = 'U' or 'u', the array AP must +* contain the upper triangular matrix packed sequentially, +* column by column, so that AP( 1 ) contains a( 1, 1 ), +* AP( 2 ) and AP( 3 ) contain a( 1, 2 ) and a( 2, 2 ) +* respectively, and so on. +* Before entry with UPLO = 'L' or 'l', the array AP must +* contain the lower triangular matrix packed sequentially, +* column by column, so that AP( 1 ) contains a( 1, 1 ), +* AP( 2 ) and AP( 3 ) contain a( 2, 1 ) and a( 3, 1 ) +* respectively, and so on. +* Note that when DIAG = 'U' or 'u', the diagonal elements of +* A are not referenced, but are assumed to be unity. +* Unchanged on exit. +* +* X - COMPLEX array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element right-hand side vector b. On exit, X is overwritten +* with the solution vector x. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. Local Scalars .. + COMPLEX TEMP + INTEGER I, INFO, IX, J, JX, K, KK, KX + LOGICAL NOCONJ, NOUNIT +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC CONJG +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO , 'U' ).AND. + $ .NOT.LSAME( UPLO , 'L' ) )THEN + INFO = 1 + ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. + $ .NOT.LSAME( TRANS, 'R' ).AND. + $ .NOT.LSAME( TRANS, 'C' ) )THEN + INFO = 2 + ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. + $ .NOT.LSAME( DIAG , 'N' ) )THEN + INFO = 3 + ELSE IF( N.LT.0 )THEN + INFO = 4 + ELSE IF( INCX.EQ.0 )THEN + INFO = 7 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'CTPSV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* + NOCONJ = LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) + NOUNIT = LSAME( DIAG , 'N' ) +* +* Set up the start point in X if the increment is not unity. This +* will be ( N - 1 )*INCX too small for descending loops. +* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of AP are +* accessed sequentially with one pass through AP. +* + IF( LSAME( TRANS, 'N' ) .OR.LSAME( TRANS, 'R' ))THEN +* +* Form x := inv( A )*x. 
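+*     (Illustrative sketch: for UPLO = 'U' this is back substitution on
+*      the packed triangle: the columns are processed from J = N down
+*      to 1, X( J ) is divided by the (possibly conjugated) diagonal
+*      AP( KK ) unless DIAG = 'U', and the rest of column J then
+*      updates X( 1 ), ..., X( J - 1 ).)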
+* + IF( LSAME( UPLO, 'U' ) )THEN + KK = ( N*( N + 1 ) )/2 + IF( INCX.EQ.1 )THEN + DO 20, J = N, 1, -1 + IF( X( J ).NE.ZERO )THEN + IF( NOCONJ )THEN + IF( NOUNIT ) + $ X( J ) = X( J )/AP( KK ) + ELSE + IF( NOUNIT ) + $ X( J ) = X( J )/CONJG(AP( KK )) + END IF + + TEMP = X( J ) + K = KK - 1 + DO 10, I = J - 1, 1, -1 + IF( NOCONJ )THEN + X( I ) = X( I ) - TEMP*AP( K ) + ELSE + X( I ) = X( I ) - TEMP*CONJG(AP( K )) + END IF + K = K - 1 + 10 CONTINUE + END IF + KK = KK - J + 20 CONTINUE + ELSE + JX = KX + ( N - 1 )*INCX + DO 40, J = N, 1, -1 + IF( X( JX ).NE.ZERO )THEN + IF( NOCONJ )THEN + IF( NOUNIT ) + $ X( JX ) = X( JX )/AP( KK ) + ELSE + IF( NOUNIT ) + $ X( JX ) = X( JX )/CONJG(AP( KK )) + END IF + TEMP = X( JX ) + IX = JX + DO 30, K = KK - 1, KK - J + 1, -1 + IX = IX - INCX + IF( NOCONJ )THEN + X( IX ) = X( IX ) - TEMP*AP( K ) + ELSE + X( IX ) = X( IX ) - TEMP*CONJG(AP( K )) + END IF + 30 CONTINUE + END IF + JX = JX - INCX + KK = KK - J + 40 CONTINUE + END IF + ELSE + KK = 1 + IF( INCX.EQ.1 )THEN + DO 60, J = 1, N + IF( X( J ).NE.ZERO )THEN + IF( NOCONJ )THEN + IF( NOUNIT ) + $ X( J ) = X( J )/AP( KK ) + ELSE + IF( NOUNIT ) + $ X( J ) = X( J )/CONJG(AP( KK )) + END IF + TEMP = X( J ) + K = KK + 1 + DO 50, I = J + 1, N + IF( NOCONJ )THEN + X( I ) = X( I ) - TEMP*AP( K ) + ELSE + X( I ) = X( I ) - TEMP*CONJG(AP( K )) + END IF + K = K + 1 + 50 CONTINUE + END IF + KK = KK + ( N - J + 1 ) + 60 CONTINUE + ELSE + JX = KX + DO 80, J = 1, N + IF( X( JX ).NE.ZERO )THEN + IF( NOCONJ )THEN + IF( NOUNIT ) + $ X( JX ) = X( JX )/AP( KK ) + ELSE + IF( NOUNIT ) + $ X( JX ) = X( JX )/CONJG(AP( KK )) + END IF + TEMP = X( JX ) + IX = JX + DO 70, K = KK + 1, KK + N - J + IX = IX + INCX + IF( NOCONJ )THEN + X( IX ) = X( IX ) - TEMP*AP( K ) + ELSE + X( IX ) = X( IX ) - TEMP*CONJG(AP( K )) + END IF + 70 CONTINUE + END IF + JX = JX + INCX + KK = KK + ( N - J + 1 ) + 80 CONTINUE + END IF + END IF + ELSE +* +* Form x := inv( A' )*x or x := inv( conjg( A' ) )*x. 
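+*        When A is upper triangular its (conjugate) transpose is lower
+*        triangular, so this branch runs forwards over j = 1, ..., n:
+*        x( j ) is reduced by the dot product of the packed column
+*        AP( KK ), ..., AP( KK + j - 2 ) with the already computed
+*        x( 1 ), ..., x( j - 1 ) and then divided by the diagonal
+*        AP( KK + j - 1 ); the lower triangular case below runs
+*        backwards from KK = n*( n + 1 )/2.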
+* + IF( LSAME( UPLO, 'U' ) )THEN + KK = 1 + IF( INCX.EQ.1 )THEN + DO 110, J = 1, N + TEMP = X( J ) + K = KK + IF( NOCONJ )THEN + DO 90, I = 1, J - 1 + TEMP = TEMP - AP( K )*X( I ) + K = K + 1 + 90 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/AP( KK + J - 1 ) + ELSE + DO 100, I = 1, J - 1 + TEMP = TEMP - CONJG( AP( K ) )*X( I ) + K = K + 1 + 100 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/CONJG( AP( KK + J - 1 ) ) + END IF + X( J ) = TEMP + KK = KK + J + 110 CONTINUE + ELSE + JX = KX + DO 140, J = 1, N + TEMP = X( JX ) + IX = KX + IF( NOCONJ )THEN + DO 120, K = KK, KK + J - 2 + TEMP = TEMP - AP( K )*X( IX ) + IX = IX + INCX + 120 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/AP( KK + J - 1 ) + ELSE + DO 130, K = KK, KK + J - 2 + TEMP = TEMP - CONJG( AP( K ) )*X( IX ) + IX = IX + INCX + 130 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/CONJG( AP( KK + J - 1 ) ) + END IF + X( JX ) = TEMP + JX = JX + INCX + KK = KK + J + 140 CONTINUE + END IF + ELSE + KK = ( N*( N + 1 ) )/2 + IF( INCX.EQ.1 )THEN + DO 170, J = N, 1, -1 + TEMP = X( J ) + K = KK + IF( NOCONJ )THEN + DO 150, I = N, J + 1, -1 + TEMP = TEMP - AP( K )*X( I ) + K = K - 1 + 150 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/AP( KK - N + J ) + ELSE + DO 160, I = N, J + 1, -1 + TEMP = TEMP - CONJG( AP( K ) )*X( I ) + K = K - 1 + 160 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/CONJG( AP( KK - N + J ) ) + END IF + X( J ) = TEMP + KK = KK - ( N - J + 1 ) + 170 CONTINUE + ELSE + KX = KX + ( N - 1 )*INCX + JX = KX + DO 200, J = N, 1, -1 + TEMP = X( JX ) + IX = KX + IF( NOCONJ )THEN + DO 180, K = KK, KK - ( N - ( J + 1 ) ), -1 + TEMP = TEMP - AP( K )*X( IX ) + IX = IX - INCX + 180 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/AP( KK - N + J ) + ELSE + DO 190, K = KK, KK - ( N - ( J + 1 ) ), -1 + TEMP = TEMP - CONJG( AP( K ) )*X( IX ) + IX = IX - INCX + 190 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/CONJG( AP( KK - N + J ) ) + END IF + X( JX ) = TEMP + JX = JX - INCX + KK = KK - ( N - J + 1 ) + 200 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of CTPSV . +* + END diff --git a/reference/ctrmmf.f b/reference/ctrmmf.f new file mode 100644 index 0000000000..d65bf4484e --- /dev/null +++ b/reference/ctrmmf.f @@ -0,0 +1,428 @@ + SUBROUTINE CTRMMF ( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, A, LDA, + $ B, LDB ) +* .. Scalar Arguments .. + CHARACTER*1 SIDE, UPLO, TRANSA, DIAG + INTEGER M, N, LDA, LDB + COMPLEX ALPHA +* .. Array Arguments .. + COMPLEX A( LDA, * ), B( LDB, * ) +* .. +* +* Purpose +* ======= +* +* CTRMM performs one of the matrix-matrix operations +* +* B := alpha*op( A )*B, or B := alpha*B*op( A ) +* +* where alpha is a scalar, B is an m by n matrix, A is a unit, or +* non-unit, upper or lower triangular matrix and op( A ) is one of +* +* op( A ) = A or op( A ) = A' or op( A ) = conjg( A' ). +* +* Parameters +* ========== +* +* SIDE - CHARACTER*1. +* On entry, SIDE specifies whether op( A ) multiplies B from +* the left or right as follows: +* +* SIDE = 'L' or 'l' B := alpha*op( A )*B. +* +* SIDE = 'R' or 'r' B := alpha*B*op( A ). +* +* Unchanged on exit. +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix A is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANSA - CHARACTER*1. +* On entry, TRANSA specifies the form of op( A ) to be used in +* the matrix multiplication as follows: +* +* TRANSA = 'N' or 'n' op( A ) = A. +* +* TRANSA = 'T' or 't' op( A ) = A'. 
+* +* TRANSA = 'C' or 'c' op( A ) = conjg( A' ). +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit triangular +* as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of B. M must be at +* least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of B. N must be +* at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX . +* On entry, ALPHA specifies the scalar alpha. When alpha is +* zero then A is not referenced and B need not be set before +* entry. +* Unchanged on exit. +* +* A - COMPLEX array of DIMENSION ( LDA, k ), where k is m +* when SIDE = 'L' or 'l' and is n when SIDE = 'R' or 'r'. +* Before entry with UPLO = 'U' or 'u', the leading k by k +* upper triangular part of the array A must contain the upper +* triangular matrix and the strictly lower triangular part of +* A is not referenced. +* Before entry with UPLO = 'L' or 'l', the leading k by k +* lower triangular part of the array A must contain the lower +* triangular matrix and the strictly upper triangular part of +* A is not referenced. +* Note that when DIAG = 'U' or 'u', the diagonal elements of +* A are not referenced either, but are assumed to be unity. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When SIDE = 'L' or 'l' then +* LDA must be at least max( 1, m ), when SIDE = 'R' or 'r' +* then LDA must be at least max( 1, n ). +* Unchanged on exit. +* +* B - COMPLEX array of DIMENSION ( LDB, n ). +* Before entry, the leading m by n part of the array B must +* contain the matrix B, and on exit is overwritten by the +* transformed matrix. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. LDB must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC CONJG, MAX +* .. Local Scalars .. + LOGICAL LSIDE, NOCONJ, NOUNIT, UPPER + INTEGER I, INFO, J, K, NROWA + COMPLEX TEMP +* .. Parameters .. + COMPLEX ONE + PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + LSIDE = LSAME( SIDE , 'L' ) + IF( LSIDE )THEN + NROWA = M + ELSE + NROWA = N + END IF + NOCONJ = LSAME( TRANSA, 'N' ) .OR. LSAME( TRANSA, 'T' ) + NOUNIT = LSAME( DIAG , 'N' ) + UPPER = LSAME( UPLO , 'U' ) +* + INFO = 0 + IF( ( .NOT.LSIDE ).AND. + $ ( .NOT.LSAME( SIDE , 'R' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO , 'L' ) ) )THEN + INFO = 2 + ELSE IF( ( .NOT.LSAME( TRANSA, 'N' ) ).AND. + $ ( .NOT.LSAME( TRANSA, 'T' ) ).AND. + $ ( .NOT.LSAME( TRANSA, 'R' ) ).AND. + $ ( .NOT.LSAME( TRANSA, 'C' ) ) )THEN + INFO = 3 + ELSE IF( ( .NOT.LSAME( DIAG , 'U' ) ).AND. 
+ $ ( .NOT.LSAME( DIAG , 'N' ) ) )THEN + INFO = 4 + ELSE IF( M .LT.0 )THEN + INFO = 5 + ELSE IF( N .LT.0 )THEN + INFO = 6 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 9 + ELSE IF( LDB.LT.MAX( 1, M ) )THEN + INFO = 11 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'CTRMM ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* +* And when alpha.eq.zero. +* + IF( ALPHA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, M + B( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + RETURN + END IF +* +* Start the operations. +* + IF( LSIDE )THEN + IF( LSAME( TRANSA, 'N' ) .OR. LSAME( TRANSA, 'R' ))THEN +* +* Form B := alpha*A*B. +* + IF( UPPER )THEN + DO 50, J = 1, N + DO 40, K = 1, M + IF( B( K, J ).NE.ZERO )THEN + TEMP = ALPHA*B( K, J ) + IF (NOCONJ) THEN + DO 30, I = 1, K - 1 + B( I, J ) = B( I, J ) + TEMP*A( I, K ) + 30 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP*A( K, K ) + B( K, J ) = TEMP + ELSE + DO 35, I = 1, K - 1 + B( I, J ) = B( I, J ) + TEMP*CONJG(A( I, K )) + 35 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP*CONJG(A( K, K )) + B( K, J ) = TEMP + ENDIF + END IF + 40 CONTINUE + 50 CONTINUE + ELSE + DO 80, J = 1, N + DO 70 K = M, 1, -1 + IF( B( K, J ).NE.ZERO )THEN + TEMP = ALPHA*B( K, J ) + B( K, J ) = TEMP + IF (NOCONJ) THEN + IF( NOUNIT ) + $ B( K, J ) = B( K, J )*A( K, K ) + DO 60, I = K + 1, M + B( I, J ) = B( I, J ) + TEMP*A( I, K ) + 60 CONTINUE + ELSE + IF( NOUNIT ) + $ B( K, J ) = B( K, J )*CONJG(A( K, K )) + DO 65, I = K + 1, M + B( I, J ) = B( I, J ) + TEMP*CONJG(A( I, K )) + 65 CONTINUE + ENDIF + END IF + 70 CONTINUE + 80 CONTINUE + END IF + ELSE +* +* Form B := alpha*A'*B or B := alpha*conjg( A' )*B. +* + IF( UPPER )THEN + DO 120, J = 1, N + DO 110, I = M, 1, -1 + TEMP = B( I, J ) + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*A( I, I ) + DO 90, K = 1, I - 1 + TEMP = TEMP + A( K, I )*B( K, J ) + 90 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*CONJG( A( I, I ) ) + DO 100, K = 1, I - 1 + TEMP = TEMP + CONJG( A( K, I ) )*B( K, J ) + 100 CONTINUE + END IF + B( I, J ) = ALPHA*TEMP + 110 CONTINUE + 120 CONTINUE + ELSE + DO 160, J = 1, N + DO 150, I = 1, M + TEMP = B( I, J ) + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*A( I, I ) + DO 130, K = I + 1, M + TEMP = TEMP + A( K, I )*B( K, J ) + 130 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*CONJG( A( I, I ) ) + DO 140, K = I + 1, M + TEMP = TEMP + CONJG( A( K, I ) )*B( K, J ) + 140 CONTINUE + END IF + B( I, J ) = ALPHA*TEMP + 150 CONTINUE + 160 CONTINUE + END IF + END IF + ELSE + IF( LSAME( TRANSA, 'N' ) .OR. LSAME( TRANSA, 'R' ))THEN +* +* Form B := alpha*B*A. 
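+*           Right multiplication is organised by columns of B: column j
+*           of the product is alpha times a combination of columns
+*           1, ..., j of B (upper triangular A) with coefficients taken
+*           from column j of A, conjugated when TRANSA = 'R'.  For the
+*           upper triangular case the columns are therefore processed
+*           from right to left so the update can be done in place; the
+*           lower triangular case below runs left to right.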
+* + IF( UPPER )THEN + DO 200, J = N, 1, -1 + TEMP = ALPHA + IF (NOCONJ) THEN + IF( NOUNIT ) + $ TEMP = TEMP*A( J, J ) + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*CONJG(A( J, J )) + ENDIF + DO 170, I = 1, M + B( I, J ) = TEMP*B( I, J ) + 170 CONTINUE + DO 190, K = 1, J - 1 + IF( A( K, J ).NE.ZERO )THEN + IF (NOCONJ) THEN + TEMP = ALPHA*A( K, J ) + ELSE + TEMP = ALPHA*CONJG(A( K, J )) + ENDIF + DO 180, I = 1, M + B( I, J ) = B( I, J ) + TEMP*B( I, K ) + 180 CONTINUE + END IF + 190 CONTINUE + 200 CONTINUE + ELSE + DO 240, J = 1, N + TEMP = ALPHA + IF (NOCONJ) THEN + IF( NOUNIT ) + $ TEMP = TEMP*A( J, J ) + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*CONJG(A( J, J )) + ENDIF + DO 210, I = 1, M + B( I, J ) = TEMP*B( I, J ) + 210 CONTINUE + DO 230, K = J + 1, N + IF( A( K, J ).NE.ZERO )THEN + IF (NOCONJ) THEN + TEMP = ALPHA*A( K, J ) + ELSE + TEMP = ALPHA*CONJG(A( K, J )) + ENDIF + DO 220, I = 1, M + B( I, J ) = B( I, J ) + TEMP*B( I, K ) + 220 CONTINUE + END IF + 230 CONTINUE + 240 CONTINUE + END IF + ELSE +* +* Form B := alpha*B*A' or B := alpha*B*conjg( A' ). +* + IF( UPPER )THEN + DO 280, K = 1, N + DO 260, J = 1, K - 1 + IF( A( J, K ).NE.ZERO )THEN + IF( NOCONJ )THEN + TEMP = ALPHA*A( J, K ) + ELSE + TEMP = ALPHA*CONJG( A( J, K ) ) + END IF + DO 250, I = 1, M + B( I, J ) = B( I, J ) + TEMP*B( I, K ) + 250 CONTINUE + END IF + 260 CONTINUE + TEMP = ALPHA + IF( NOUNIT )THEN + IF( NOCONJ )THEN + TEMP = TEMP*A( K, K ) + ELSE + TEMP = TEMP*CONJG( A( K, K ) ) + END IF + END IF + IF( TEMP.NE.ONE )THEN + DO 270, I = 1, M + B( I, K ) = TEMP*B( I, K ) + 270 CONTINUE + END IF + 280 CONTINUE + ELSE + DO 320, K = N, 1, -1 + DO 300, J = K + 1, N + IF( A( J, K ).NE.ZERO )THEN + IF( NOCONJ )THEN + TEMP = ALPHA*A( J, K ) + ELSE + TEMP = ALPHA*CONJG( A( J, K ) ) + END IF + DO 290, I = 1, M + B( I, J ) = B( I, J ) + TEMP*B( I, K ) + 290 CONTINUE + END IF + 300 CONTINUE + TEMP = ALPHA + IF( NOUNIT )THEN + IF( NOCONJ )THEN + TEMP = TEMP*A( K, K ) + ELSE + TEMP = TEMP*CONJG( A( K, K ) ) + END IF + END IF + IF( TEMP.NE.ONE )THEN + DO 310, I = 1, M + B( I, K ) = TEMP*B( I, K ) + 310 CONTINUE + END IF + 320 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of CTRMM . +* + END diff --git a/reference/ctrmvf.f b/reference/ctrmvf.f new file mode 100644 index 0000000000..f9d3b445ac --- /dev/null +++ b/reference/ctrmvf.f @@ -0,0 +1,358 @@ + SUBROUTINE CTRMVF ( UPLO, TRANS, DIAG, N, A, LDA, X, INCX ) +* .. Scalar Arguments .. + INTEGER INCX, LDA, N + CHARACTER*1 DIAG, TRANS, UPLO +* .. Array Arguments .. + COMPLEX A( LDA, * ), X( * ) +* .. +* +* Purpose +* ======= +* +* CTRMV performs one of the matrix-vector operations +* +* x := A*x, or x := A'*x, or x := conjg( A' )*x, +* +* where x is an n element vector and A is an n by n unit, or non-unit, +* upper or lower triangular matrix. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' x := A*x. +* +* TRANS = 'T' or 't' x := A'*x. +* +* TRANS = 'C' or 'c' x := conjg( A' )*x. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit +* triangular as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. 
+* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* A - COMPLEX array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array A must contain the upper +* triangular matrix and the strictly lower triangular part of +* A is not referenced. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array A must contain the lower +* triangular matrix and the strictly upper triangular part of +* A is not referenced. +* Note that when DIAG = 'U' or 'u', the diagonal elements of +* A are not referenced either, but are assumed to be unity. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, n ). +* Unchanged on exit. +* +* X - COMPLEX array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. On exit, X is overwritten with the +* tranformed vector x. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. Local Scalars .. + COMPLEX TEMP + INTEGER I, INFO, IX, J, JX, KX + LOGICAL NOCONJ, NOUNIT +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC CONJG, MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO , 'U' ).AND. + $ .NOT.LSAME( UPLO , 'L' ) )THEN + INFO = 1 + ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. + $ .NOT.LSAME( TRANS, 'R' ).AND. + $ .NOT.LSAME( TRANS, 'C' ) )THEN + INFO = 2 + ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. + $ .NOT.LSAME( DIAG , 'N' ) )THEN + INFO = 3 + ELSE IF( N.LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, N ) )THEN + INFO = 6 + ELSE IF( INCX.EQ.0 )THEN + INFO = 8 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'CTRMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* + NOCONJ = LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) + NOUNIT = LSAME( DIAG , 'N' ) +* +* Set up the start point in X if the increment is not unity. This +* will be ( N - 1 )*INCX too small for descending loops. +* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through A. +* + IF( LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'R' ))THEN +* +* Form x := A*x. 
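+*        The product is accumulated one column of A at a time: for the
+*        upper triangular case the loop over j = 1, ..., n adds
+*        x( j )*a( i, j ) into x( i ) for i < j before scaling x( j )
+*        by the diagonal, so no workspace is needed; TRANS = 'R'
+*        applies conjg to the elements of A but does not transpose.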
+* + IF( LSAME( UPLO, 'U' ) )THEN + IF( INCX.EQ.1 )THEN + DO 20, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = X( J ) + DO 10, I = 1, J - 1 + IF (NOCONJ) THEN + X( I ) = X( I ) + TEMP*A( I, J ) + ELSE + X( I ) = X( I ) + TEMP*CONJG(A( I, J )) + ENDIF + 10 CONTINUE + IF (NOCONJ) THEN + IF( NOUNIT ) + $ X( J ) = X( J )*A( J, J ) + ELSE + IF( NOUNIT ) + $ X( J ) = X( J )*CONJG(A( J, J )) + ENDIF + END IF + 20 CONTINUE + ELSE + JX = KX + DO 40, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = X( JX ) + IX = KX + DO 30, I = 1, J - 1 + IF (NOCONJ) THEN + X( IX ) = X( IX ) + TEMP*A( I, J ) + ELSE + X( IX ) = X( IX ) + TEMP*CONJG(A( I, J )) + ENDIF + IX = IX + INCX + 30 CONTINUE + IF (NOCONJ) THEN + IF( NOUNIT ) + $ X( JX ) = X( JX )*A( J, J ) + ELSE + IF( NOUNIT ) + $ X( JX ) = X( JX )*CONJG(A( J, J )) + ENDIF + END IF + JX = JX + INCX + 40 CONTINUE + END IF + ELSE + IF( INCX.EQ.1 )THEN + DO 60, J = N, 1, -1 + IF( X( J ).NE.ZERO )THEN + TEMP = X( J ) + DO 50, I = N, J + 1, -1 + IF (NOCONJ) THEN + X( I ) = X( I ) + TEMP*A( I, J ) + ELSE + X( I ) = X( I ) + TEMP*CONJG(A( I, J )) + ENDIF + 50 CONTINUE + IF (NOCONJ) THEN + IF( NOUNIT ) + $ X( J ) = X( J )*A( J, J ) + ELSE + IF( NOUNIT ) + $ X( J ) = X( J )*CONJG(A( J, J )) + ENDIF + END IF + 60 CONTINUE + ELSE + KX = KX + ( N - 1 )*INCX + JX = KX + DO 80, J = N, 1, -1 + IF( X( JX ).NE.ZERO )THEN + TEMP = X( JX ) + IX = KX + DO 70, I = N, J + 1, -1 + IF (NOCONJ) THEN + X( IX ) = X( IX ) + TEMP*A( I, J ) + ELSE + X( IX ) = X( IX ) + TEMP*CONJG(A( I, J )) + ENDIF + IX = IX - INCX + 70 CONTINUE + IF (NOCONJ) THEN + IF( NOUNIT ) + $ X( JX ) = X( JX )*A( J, J ) + ELSE + IF( NOUNIT ) + $ X( JX ) = X( JX )*CONJG(A( J, J )) + ENDIF + END IF + JX = JX - INCX + 80 CONTINUE + END IF + END IF + ELSE +* +* Form x := A'*x or x := conjg( A' )*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + IF( INCX.EQ.1 )THEN + DO 110, J = N, 1, -1 + TEMP = X( J ) + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*A( J, J ) + DO 90, I = J - 1, 1, -1 + TEMP = TEMP + A( I, J )*X( I ) + 90 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*CONJG( A( J, J ) ) + DO 100, I = J - 1, 1, -1 + TEMP = TEMP + CONJG( A( I, J ) )*X( I ) + 100 CONTINUE + END IF + X( J ) = TEMP + 110 CONTINUE + ELSE + JX = KX + ( N - 1 )*INCX + DO 140, J = N, 1, -1 + TEMP = X( JX ) + IX = JX + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*A( J, J ) + DO 120, I = J - 1, 1, -1 + IX = IX - INCX + TEMP = TEMP + A( I, J )*X( IX ) + 120 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*CONJG( A( J, J ) ) + DO 130, I = J - 1, 1, -1 + IX = IX - INCX + TEMP = TEMP + CONJG( A( I, J ) )*X( IX ) + 130 CONTINUE + END IF + X( JX ) = TEMP + JX = JX - INCX + 140 CONTINUE + END IF + ELSE + IF( INCX.EQ.1 )THEN + DO 170, J = 1, N + TEMP = X( J ) + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*A( J, J ) + DO 150, I = J + 1, N + TEMP = TEMP + A( I, J )*X( I ) + 150 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*CONJG( A( J, J ) ) + DO 160, I = J + 1, N + TEMP = TEMP + CONJG( A( I, J ) )*X( I ) + 160 CONTINUE + END IF + X( J ) = TEMP + 170 CONTINUE + ELSE + JX = KX + DO 200, J = 1, N + TEMP = X( JX ) + IX = JX + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*A( J, J ) + DO 180, I = J + 1, N + IX = IX + INCX + TEMP = TEMP + A( I, J )*X( IX ) + 180 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*CONJG( A( J, J ) ) + DO 190, I = J + 1, N + IX = IX + INCX + TEMP = TEMP + CONJG( A( I, J ) )*X( IX ) + 190 CONTINUE + END IF + X( JX ) = TEMP + JX = JX + INCX + 200 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of CTRMV . 
+* + END diff --git a/reference/ctrsmf.f b/reference/ctrsmf.f new file mode 100644 index 0000000000..3d27822d17 --- /dev/null +++ b/reference/ctrsmf.f @@ -0,0 +1,459 @@ + SUBROUTINE CTRSMF ( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, A, LDA, + $ B, LDB ) +* .. Scalar Arguments .. + CHARACTER*1 SIDE, UPLO, TRANSA, DIAG + INTEGER M, N, LDA, LDB + COMPLEX ALPHA +* .. Array Arguments .. + COMPLEX A( LDA, * ), B( LDB, * ) +* .. +* +* Purpose +* ======= +* +* CTRSM solves one of the matrix equations +* +* op( A )*X = alpha*B, or X*op( A ) = alpha*B, +* +* where alpha is a scalar, X and B are m by n matrices, A is a unit, or +* non-unit, upper or lower triangular matrix and op( A ) is one of +* +* op( A ) = A or op( A ) = A' or op( A ) = conjg( A' ). +* +* The matrix X is overwritten on B. +* +* Parameters +* ========== +* +* SIDE - CHARACTER*1. +* On entry, SIDE specifies whether op( A ) appears on the left +* or right of X as follows: +* +* SIDE = 'L' or 'l' op( A )*X = alpha*B. +* +* SIDE = 'R' or 'r' X*op( A ) = alpha*B. +* +* Unchanged on exit. +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix A is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANSA - CHARACTER*1. +* On entry, TRANSA specifies the form of op( A ) to be used in +* the matrix multiplication as follows: +* +* TRANSA = 'N' or 'n' op( A ) = A. +* +* TRANSA = 'T' or 't' op( A ) = A'. +* +* TRANSA = 'C' or 'c' op( A ) = conjg( A' ). +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit triangular +* as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of B. M must be at +* least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of B. N must be +* at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX . +* On entry, ALPHA specifies the scalar alpha. When alpha is +* zero then A is not referenced and B need not be set before +* entry. +* Unchanged on exit. +* +* A - COMPLEX array of DIMENSION ( LDA, k ), where k is m +* when SIDE = 'L' or 'l' and is n when SIDE = 'R' or 'r'. +* Before entry with UPLO = 'U' or 'u', the leading k by k +* upper triangular part of the array A must contain the upper +* triangular matrix and the strictly lower triangular part of +* A is not referenced. +* Before entry with UPLO = 'L' or 'l', the leading k by k +* lower triangular part of the array A must contain the lower +* triangular matrix and the strictly upper triangular part of +* A is not referenced. +* Note that when DIAG = 'U' or 'u', the diagonal elements of +* A are not referenced either, but are assumed to be unity. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When SIDE = 'L' or 'l' then +* LDA must be at least max( 1, m ), when SIDE = 'R' or 'r' +* then LDA must be at least max( 1, n ). +* Unchanged on exit. +* +* B - COMPLEX array of DIMENSION ( LDB, n ). +* Before entry, the leading m by n part of the array B must +* contain the right-hand side matrix B, and on exit is +* overwritten by the solution matrix X. +* +* LDB - INTEGER. 
+* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. LDB must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC CONJG, MAX +* .. Local Scalars .. + LOGICAL LSIDE, NOCONJ, NOUNIT, UPPER + INTEGER I, INFO, J, K, NROWA + COMPLEX TEMP +* .. Parameters .. + COMPLEX ONE + PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + LSIDE = LSAME( SIDE , 'L' ) + IF( LSIDE )THEN + NROWA = M + ELSE + NROWA = N + END IF + NOCONJ = (LSAME( TRANSA, 'N' ) .OR. LSAME( TRANSA, 'T' )) + NOUNIT = LSAME( DIAG , 'N' ) + UPPER = LSAME( UPLO , 'U' ) +* + INFO = 0 + IF( ( .NOT.LSIDE ).AND. + $ ( .NOT.LSAME( SIDE , 'R' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO , 'L' ) ) )THEN + INFO = 2 + ELSE IF( ( .NOT.LSAME( TRANSA, 'N' ) ).AND. + $ ( .NOT.LSAME( TRANSA, 'T' ) ).AND. + $ ( .NOT.LSAME( TRANSA, 'R' ) ).AND. + $ ( .NOT.LSAME( TRANSA, 'C' ) ) )THEN + INFO = 3 + ELSE IF( ( .NOT.LSAME( DIAG , 'U' ) ).AND. + $ ( .NOT.LSAME( DIAG , 'N' ) ) )THEN + INFO = 4 + ELSE IF( M .LT.0 )THEN + INFO = 5 + ELSE IF( N .LT.0 )THEN + INFO = 6 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 9 + ELSE IF( LDB.LT.MAX( 1, M ) )THEN + INFO = 11 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'CTRSM ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* +* And when alpha.eq.zero. +* + IF( ALPHA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, M + B( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + RETURN + END IF +* +* Start the operations. +* + IF( LSIDE )THEN + IF( LSAME( TRANSA, 'N' ) .OR. LSAME( TRANSA, 'R' ))THEN +* +* Form B := alpha*inv( A )*B. +* + IF( UPPER )THEN + DO 60, J = 1, N + IF( ALPHA.NE.ONE )THEN + DO 30, I = 1, M + B( I, J ) = ALPHA*B( I, J ) + 30 CONTINUE + END IF + + DO 50, K = M, 1, -1 + IF( B( K, J ).NE.ZERO )THEN + + IF( NOUNIT ) THEN + IF (NOCONJ) THEN + B( K, J ) = B( K, J )/A( K, K ) + ELSE + B( K, J ) = B( K, J )/CONJG(A( K, K )) + ENDIF + ENDIF + + IF (NOCONJ) THEN + DO 40, I = 1, K - 1 + B( I, J ) = B( I, J ) - B( K, J )*A( I, K ) + 40 CONTINUE + ELSE + DO 45, I = 1, K - 1 + B( I, J ) = B( I, J ) - B( K, J )*CONJG(A( I, K )) + 45 CONTINUE + ENDIF + ENDIF + 50 CONTINUE + 60 CONTINUE + ELSE + DO 100, J = 1, N + IF( ALPHA.NE.ONE )THEN + DO 70, I = 1, M + B( I, J ) = ALPHA*B( I, J ) + 70 CONTINUE + END IF + DO 90 K = 1, M + IF (NOCONJ) THEN + IF( B( K, J ).NE.ZERO )THEN + IF( NOUNIT ) + $ B( K, J ) = B( K, J )/A( K, K ) + DO 80, I = K + 1, M + B( I, J ) = B( I, J ) - B( K, J )*A( I, K ) + 80 CONTINUE + END IF + ELSE + IF( B( K, J ).NE.ZERO )THEN + IF( NOUNIT ) + $ B( K, J ) = B( K, J )/CONJG(A( K, K )) + DO 85, I = K + 1, M + B( I, J ) = B( I, J ) - B( K, J )*CONJG(A( I, K )) + 85 CONTINUE + END IF + ENDIF + 90 CONTINUE + 100 CONTINUE + END IF + ELSE +* +* Form B := alpha*inv( A' )*B +* or B := alpha*inv( conjg( A' ) )*B. 
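+*           Each column of B is solved independently.  With A upper
+*           triangular, op( A ) = A' (or conjg( A' )) is lower
+*           triangular, so the elements of a column are produced top
+*           to bottom: TEMP = alpha*B( i, j ) minus the dot product of
+*           a( 1, i ), ..., a( i-1, i ) with the part of the column
+*           already solved, then divided by a( i, i ).  The lower
+*           triangular case below runs bottom to top.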
+* + IF( UPPER )THEN + DO 140, J = 1, N + DO 130, I = 1, M + TEMP = ALPHA*B( I, J ) + IF( NOCONJ )THEN + DO 110, K = 1, I - 1 + TEMP = TEMP - A( K, I )*B( K, J ) + 110 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/A( I, I ) + ELSE + DO 120, K = 1, I - 1 + TEMP = TEMP - CONJG( A( K, I ) )*B( K, J ) + 120 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/CONJG( A( I, I ) ) + END IF + B( I, J ) = TEMP + 130 CONTINUE + 140 CONTINUE + ELSE + DO 180, J = 1, N + DO 170, I = M, 1, -1 + TEMP = ALPHA*B( I, J ) + IF( NOCONJ )THEN + DO 150, K = I + 1, M + TEMP = TEMP - A( K, I )*B( K, J ) + 150 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/A( I, I ) + ELSE + DO 160, K = I + 1, M + TEMP = TEMP - CONJG( A( K, I ) )*B( K, J ) + 160 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/CONJG( A( I, I ) ) + END IF + B( I, J ) = TEMP + 170 CONTINUE + 180 CONTINUE + END IF + END IF + ELSE + IF( LSAME( TRANSA, 'N' ) .OR. LSAME( TRANSA, 'R' ))THEN +* +* Form B := alpha*B*inv( A ). +* + IF( UPPER )THEN + DO 230, J = 1, N + IF( ALPHA.NE.ONE )THEN + DO 190, I = 1, M + B( I, J ) = ALPHA*B( I, J ) + 190 CONTINUE + END IF + DO 210, K = 1, J - 1 + IF( A( K, J ).NE.ZERO )THEN + IF (NOCONJ) THEN + DO 200, I = 1, M + B( I, J ) = B( I, J ) - A( K, J )*B( I, K ) + 200 CONTINUE + ELSE + DO 205, I = 1, M + B( I, J ) = B( I, J ) - CONJG(A( K, J ))*B( I, K ) + 205 CONTINUE + ENDIF + END IF + 210 CONTINUE + IF( NOUNIT )THEN + IF (NOCONJ) THEN + TEMP = ONE/A( J, J ) + ELSE + TEMP = ONE/CONJG(A( J, J )) + ENDIF + DO 220, I = 1, M + B( I, J ) = TEMP*B( I, J ) + 220 CONTINUE + END IF + 230 CONTINUE + ELSE + DO 280, J = N, 1, -1 + IF( ALPHA.NE.ONE )THEN + DO 240, I = 1, M + B( I, J ) = ALPHA*B( I, J ) + 240 CONTINUE + END IF + DO 260, K = J + 1, N + IF( A( K, J ).NE.ZERO )THEN + IF (NOCONJ) THEN + DO 250, I = 1, M + B( I, J ) = B( I, J ) - A( K, J )*B( I, K ) + 250 CONTINUE + ELSE + DO 255, I = 1, M + B( I, J ) = B( I, J ) - CONJG(A( K, J ))*B( I, K ) + 255 CONTINUE + ENDIF + END IF + 260 CONTINUE + IF( NOUNIT )THEN + IF (NOCONJ) THEN + TEMP = ONE/A( J, J ) + ELSE + TEMP = ONE/CONJG(A( J, J )) + ENDIF + DO 270, I = 1, M + B( I, J ) = TEMP*B( I, J ) + 270 CONTINUE + END IF + 280 CONTINUE + END IF + ELSE +* +* Form B := alpha*B*inv( A' ) +* or B := alpha*B*inv( conjg( A' ) ). +* + IF( UPPER )THEN + DO 330, K = N, 1, -1 + IF( NOUNIT )THEN + IF( NOCONJ )THEN + TEMP = ONE/A( K, K ) + ELSE + TEMP = ONE/CONJG( A( K, K ) ) + END IF + DO 290, I = 1, M + B( I, K ) = TEMP*B( I, K ) + 290 CONTINUE + END IF + DO 310, J = 1, K - 1 + IF( A( J, K ).NE.ZERO )THEN + IF( NOCONJ )THEN + TEMP = A( J, K ) + ELSE + TEMP = CONJG( A( J, K ) ) + END IF + DO 300, I = 1, M + B( I, J ) = B( I, J ) - TEMP*B( I, K ) + 300 CONTINUE + END IF + 310 CONTINUE + IF( ALPHA.NE.ONE )THEN + DO 320, I = 1, M + B( I, K ) = ALPHA*B( I, K ) + 320 CONTINUE + END IF + 330 CONTINUE + ELSE + DO 380, K = 1, N + IF( NOUNIT )THEN + IF( NOCONJ )THEN + TEMP = ONE/A( K, K ) + ELSE + TEMP = ONE/CONJG( A( K, K ) ) + END IF + DO 340, I = 1, M + B( I, K ) = TEMP*B( I, K ) + 340 CONTINUE + END IF + DO 360, J = K + 1, N + IF( A( J, K ).NE.ZERO )THEN + IF( NOCONJ )THEN + TEMP = A( J, K ) + ELSE + TEMP = CONJG( A( J, K ) ) + END IF + DO 350, I = 1, M + B( I, J ) = B( I, J ) - TEMP*B( I, K ) + 350 CONTINUE + END IF + 360 CONTINUE + IF( ALPHA.NE.ONE )THEN + DO 370, I = 1, M + B( I, K ) = ALPHA*B( I, K ) + 370 CONTINUE + END IF + 380 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of CTRSM . 
+* + END diff --git a/reference/ctrsvf.f b/reference/ctrsvf.f new file mode 100644 index 0000000000..86061b48b6 --- /dev/null +++ b/reference/ctrsvf.f @@ -0,0 +1,361 @@ + SUBROUTINE CTRSVF ( UPLO, TRANS, DIAG, N, A, LDA, X, INCX ) +* .. Scalar Arguments .. + INTEGER INCX, LDA, N + CHARACTER*1 DIAG, TRANS, UPLO +* .. Array Arguments .. + COMPLEX A( LDA, * ), X( * ) +* .. +* +* Purpose +* ======= +* +* CTRSV solves one of the systems of equations +* +* A*x = b, or A'*x = b, or conjg( A' )*x = b, +* +* where b and x are n element vectors and A is an n by n unit, or +* non-unit, upper or lower triangular matrix. +* +* No test for singularity or near-singularity is included in this +* routine. Such tests must be performed before calling this routine. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the equations to be solved as +* follows: +* +* TRANS = 'N' or 'n' A*x = b. +* +* TRANS = 'T' or 't' A'*x = b. +* +* TRANS = 'C' or 'c' conjg( A' )*x = b. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit +* triangular as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* A - COMPLEX array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array A must contain the upper +* triangular matrix and the strictly lower triangular part of +* A is not referenced. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array A must contain the lower +* triangular matrix and the strictly upper triangular part of +* A is not referenced. +* Note that when DIAG = 'U' or 'u', the diagonal elements of +* A are not referenced either, but are assumed to be unity. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, n ). +* Unchanged on exit. +* +* X - COMPLEX array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element right-hand side vector b. On exit, X is overwritten +* with the solution vector x. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. Local Scalars .. + COMPLEX TEMP + INTEGER I, INFO, IX, J, JX, KX + LOGICAL NOCONJ, NOUNIT +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC CONJG, MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO , 'U' ).AND. 
+ $ .NOT.LSAME( UPLO , 'L' ) )THEN + INFO = 1 + ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. + $ .NOT.LSAME( TRANS, 'R' ).AND. + $ .NOT.LSAME( TRANS, 'C' ) )THEN + INFO = 2 + ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. + $ .NOT.LSAME( DIAG , 'N' ) )THEN + INFO = 3 + ELSE IF( N.LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, N ) )THEN + INFO = 6 + ELSE IF( INCX.EQ.0 )THEN + INFO = 8 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'CTRSV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* + NOCONJ = LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) + NOUNIT = LSAME( DIAG , 'N' ) +* +* Set up the start point in X if the increment is not unity. This +* will be ( N - 1 )*INCX too small for descending loops. +* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through A. +* + IF( LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'R' ))THEN +* +* Form x := inv( A )*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + IF( INCX.EQ.1 )THEN + DO 20, J = N, 1, -1 + IF( X( J ).NE.ZERO )THEN + IF (NOCONJ) THEN + IF( NOUNIT ) + $ X( J ) = X( J )/A( J, J ) + TEMP = X( J ) + DO 10, I = J - 1, 1, -1 + X( I ) = X( I ) - TEMP*A( I, J ) + 10 CONTINUE + ELSE + IF( NOUNIT ) + $ X( J ) = X( J )/CONJG(A( J, J )) + TEMP = X( J ) + DO 15, I = J - 1, 1, -1 + X( I ) = X( I ) - TEMP*CONJG(A( I, J )) + 15 CONTINUE + ENDIF + END IF + 20 CONTINUE + ELSE + JX = KX + ( N - 1 )*INCX + DO 40, J = N, 1, -1 + IF( X( JX ).NE.ZERO )THEN + IF (NOCONJ) THEN + IF( NOUNIT ) + $ X( JX ) = X( JX )/A( J, J ) + ELSE + IF( NOUNIT ) + $ X( JX ) = X( JX )/CONJG(A( J, J )) + ENDIF + TEMP = X( JX ) + IX = JX + DO 30, I = J - 1, 1, -1 + IX = IX - INCX + IF (NOCONJ) THEN + X( IX ) = X( IX ) - TEMP*A( I, J ) + ELSE + X( IX ) = X( IX ) - TEMP*CONJG(A( I, J )) + ENDIF + 30 CONTINUE + END IF + JX = JX - INCX + 40 CONTINUE + END IF + ELSE + IF( INCX.EQ.1 )THEN + DO 60, J = 1, N + IF( X( J ).NE.ZERO )THEN + IF (NOCONJ) THEN + IF( NOUNIT ) + $ X( J ) = X( J )/A( J, J ) + TEMP = X( J ) + DO 50, I = J + 1, N + X( I ) = X( I ) - TEMP*A( I, J ) + 50 CONTINUE + ELSE + IF( NOUNIT ) + $ X( J ) = X( J )/CONJG(A( J, J )) + TEMP = X( J ) + DO 55, I = J + 1, N + X( I ) = X( I ) - TEMP*CONJG(A( I, J )) + 55 CONTINUE + ENDIF + END IF + 60 CONTINUE + ELSE + JX = KX + DO 80, J = 1, N + IF( X( JX ).NE.ZERO )THEN + IF (NOCONJ) THEN + IF( NOUNIT ) + $ X( JX ) = X( JX )/A( J, J ) + ELSE + IF( NOUNIT ) + $ X( JX ) = X( JX )/CONJG(A( J, J )) + ENDIF + TEMP = X( JX ) + IX = JX + DO 70, I = J + 1, N + IX = IX + INCX + IF (NOCONJ) THEN + X( IX ) = X( IX ) - TEMP*A( I, J ) + ELSE + X( IX ) = X( IX ) - TEMP*CONJG(A( I, J )) + ENDIF + 70 CONTINUE + END IF + JX = JX + INCX + 80 CONTINUE + END IF + END IF + ELSE +* +* Form x := inv( A' )*x or x := inv( conjg( A' ) )*x. 
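+*        With A upper triangular its (conjugate) transpose is lower
+*        triangular, so the solve runs forwards: x( j ) is reduced by
+*        the dot product of a( 1, j ), ..., a( j-1, j ) with the part
+*        of x already solved and then divided by a( j, j ); the lower
+*        triangular case below runs backwards over j = n, ..., 1.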
+* + IF( LSAME( UPLO, 'U' ) )THEN + IF( INCX.EQ.1 )THEN + DO 110, J = 1, N + TEMP = X( J ) + IF( NOCONJ )THEN + DO 90, I = 1, J - 1 + TEMP = TEMP - A( I, J )*X( I ) + 90 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/A( J, J ) + ELSE + DO 100, I = 1, J - 1 + TEMP = TEMP - CONJG( A( I, J ) )*X( I ) + 100 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/CONJG( A( J, J ) ) + END IF + X( J ) = TEMP + 110 CONTINUE + ELSE + JX = KX + DO 140, J = 1, N + IX = KX + TEMP = X( JX ) + IF( NOCONJ )THEN + DO 120, I = 1, J - 1 + TEMP = TEMP - A( I, J )*X( IX ) + IX = IX + INCX + 120 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/A( J, J ) + ELSE + DO 130, I = 1, J - 1 + TEMP = TEMP - CONJG( A( I, J ) )*X( IX ) + IX = IX + INCX + 130 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/CONJG( A( J, J ) ) + END IF + X( JX ) = TEMP + JX = JX + INCX + 140 CONTINUE + END IF + ELSE + IF( INCX.EQ.1 )THEN + DO 170, J = N, 1, -1 + TEMP = X( J ) + IF( NOCONJ )THEN + DO 150, I = N, J + 1, -1 + TEMP = TEMP - A( I, J )*X( I ) + 150 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/A( J, J ) + ELSE + DO 160, I = N, J + 1, -1 + TEMP = TEMP - CONJG( A( I, J ) )*X( I ) + 160 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/CONJG( A( J, J ) ) + END IF + X( J ) = TEMP + 170 CONTINUE + ELSE + KX = KX + ( N - 1 )*INCX + JX = KX + DO 200, J = N, 1, -1 + IX = KX + TEMP = X( JX ) + IF( NOCONJ )THEN + DO 180, I = N, J + 1, -1 + TEMP = TEMP - A( I, J )*X( IX ) + IX = IX - INCX + 180 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/A( J, J ) + ELSE + DO 190, I = N, J + 1, -1 + TEMP = TEMP - CONJG( A( I, J ) )*X( IX ) + IX = IX - INCX + 190 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/CONJG( A( J, J ) ) + END IF + X( JX ) = TEMP + JX = JX - INCX + 200 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of CTRSV . +* + END diff --git a/reference/ctrti2f.f b/reference/ctrti2f.f new file mode 100644 index 0000000000..24604b4b52 --- /dev/null +++ b/reference/ctrti2f.f @@ -0,0 +1,146 @@ + SUBROUTINE CTRTI2F( UPLO, DIAG, N, A, LDA, INFO ) +* +* -- LAPACK routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* November 2006 +* +* .. Scalar Arguments .. + CHARACTER DIAG, UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* CTRTI2 computes the inverse of a complex upper or lower triangular +* matrix. +* +* This is the Level 2 BLAS version of the algorithm. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* Specifies whether the matrix A is upper or lower triangular. +* = 'U': Upper triangular +* = 'L': Lower triangular +* +* DIAG (input) CHARACTER*1 +* Specifies whether or not the matrix A is unit triangular. +* = 'N': Non-unit triangular +* = 'U': Unit triangular +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* A (input/output) COMPLEX array, dimension (LDA,N) +* On entry, the triangular matrix A. If UPLO = 'U', the +* leading n by n upper triangular part of the array A contains +* the upper triangular matrix, and the strictly lower +* triangular part of A is not referenced. If UPLO = 'L', the +* leading n by n lower triangular part of the array A contains +* the lower triangular matrix, and the strictly upper +* triangular part of A is not referenced. If DIAG = 'U', the +* diagonal elements of A are also not referenced and are +* assumed to be 1. +* +* On exit, the (triangular) inverse of the original matrix, in +* the same storage format. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). 
+* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -k, the k-th argument had an illegal value +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX ONE + PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL NOUNIT, UPPER + INTEGER J + COMPLEX AJJ +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL CSCAL, CTRMV, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + NOUNIT = LSAME( DIAG, 'N' ) + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -5 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CTRTI2', -INFO ) + RETURN + END IF +* + IF( UPPER ) THEN +* +* Compute inverse of upper triangular matrix. +* + DO 10 J = 1, N + IF( NOUNIT ) THEN + A( J, J ) = ONE / A( J, J ) + AJJ = -A( J, J ) + ELSE + AJJ = -ONE + END IF +* +* Compute elements 1:j-1 of j-th column. +* + CALL CTRMV( 'Upper', 'No transpose', DIAG, J-1, A, LDA, + $ A( 1, J ), 1 ) + CALL CSCAL( J-1, AJJ, A( 1, J ), 1 ) + 10 CONTINUE + ELSE +* +* Compute inverse of lower triangular matrix. +* + DO 20 J = N, 1, -1 + IF( NOUNIT ) THEN + A( J, J ) = ONE / A( J, J ) + AJJ = -A( J, J ) + ELSE + AJJ = -ONE + END IF + IF( J.LT.N ) THEN +* +* Compute elements j+1:n of j-th column. +* + CALL CTRMV( 'Lower', 'No transpose', DIAG, N-J, + $ A( J+1, J+1 ), LDA, A( J+1, J ), 1 ) + CALL CSCAL( N-J, AJJ, A( J+1, J ), 1 ) + END IF + 20 CONTINUE + END IF +* + RETURN +* +* End of CTRTI2 +* + END diff --git a/reference/ctrtrif.f b/reference/ctrtrif.f new file mode 100644 index 0000000000..cb1ec980a8 --- /dev/null +++ b/reference/ctrtrif.f @@ -0,0 +1,177 @@ + SUBROUTINE CTRTRIF( UPLO, DIAG, N, A, LDA, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* September 30, 1994 +* +* .. Scalar Arguments .. + CHARACTER DIAG, UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* CTRTRI computes the inverse of a complex upper or lower triangular +* matrix A. +* +* This is the Level 3 BLAS version of the algorithm. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* = 'U': A is upper triangular; +* = 'L': A is lower triangular. +* +* DIAG (input) CHARACTER*1 +* = 'N': A is non-unit triangular; +* = 'U': A is unit triangular. +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* A (input/output) COMPLEX array, dimension (LDA,N) +* On entry, the triangular matrix A. If UPLO = 'U', the +* leading N-by-N upper triangular part of the array A contains +* the upper triangular matrix, and the strictly lower +* triangular part of A is not referenced. If UPLO = 'L', the +* leading N-by-N lower triangular part of the array A contains +* the lower triangular matrix, and the strictly upper +* triangular part of A is not referenced. If DIAG = 'U', the +* diagonal elements of A are also not referenced and are +* assumed to be 1. +* On exit, the (triangular) inverse of the original matrix, in +* the same storage format. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). 
+* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, A(i,i) is exactly zero. The triangular +* matrix is singular and its inverse can not be computed. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX ONE, ZERO + PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ), + $ ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL NOUNIT, UPPER + INTEGER J, JB, NB, NN +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL CTRMM, CTRSM, CTRTI2, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + NOUNIT = LSAME( DIAG, 'N' ) + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -5 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CTRTRI', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* +* Check for singularity if non-unit. +* + IF( NOUNIT ) THEN + DO 10 INFO = 1, N + IF( A( INFO, INFO ).EQ.ZERO ) + $ RETURN + 10 CONTINUE + INFO = 0 + END IF +* +* Determine the block size for this environment. +* + NB = 128 + IF( NB.LE.1 .OR. NB.GE.N ) THEN +* +* Use unblocked code +* + CALL CTRTI2( UPLO, DIAG, N, A, LDA, INFO ) + ELSE +* +* Use blocked code +* + IF( UPPER ) THEN +* +* Compute inverse of upper triangular matrix +* + DO 20 J = 1, N, NB + JB = MIN( NB, N-J+1 ) +* +* Compute rows 1:j-1 of current block column +* + CALL CTRMM( 'Left', 'Upper', 'No transpose', DIAG, J-1, + $ JB, ONE, A, LDA, A( 1, J ), LDA ) + CALL CTRSM( 'Right', 'Upper', 'No transpose', DIAG, J-1, + $ JB, -ONE, A( J, J ), LDA, A( 1, J ), LDA ) +* +* Compute inverse of current diagonal block +* + CALL CTRTI2( 'Upper', DIAG, JB, A( J, J ), LDA, INFO ) + 20 CONTINUE + ELSE +* +* Compute inverse of lower triangular matrix +* + NN = ( ( N-1 ) / NB )*NB + 1 + DO 30 J = NN, 1, -NB + JB = MIN( NB, N-J+1 ) + IF( J+JB.LE.N ) THEN +* +* Compute rows j+jb:n of current block column +* + CALL CTRMM( 'Left', 'Lower', 'No transpose', DIAG, + $ N-J-JB+1, JB, ONE, A( J+JB, J+JB ), LDA, + $ A( J+JB, J ), LDA ) + CALL CTRSM( 'Right', 'Lower', 'No transpose', DIAG, + $ N-J-JB+1, JB, -ONE, A( J, J ), LDA, + $ A( J+JB, J ), LDA ) + END IF +* +* Compute inverse of current diagonal block +* + CALL CTRTI2( 'Lower', DIAG, JB, A( J, J ), LDA, INFO ) + 30 CONTINUE + END IF + END IF +* + RETURN +* +* End of CTRTRI +* + END diff --git a/reference/damaxf.f b/reference/damaxf.f new file mode 100644 index 0000000000..a0c0b01913 --- /dev/null +++ b/reference/damaxf.f @@ -0,0 +1,36 @@ + REAL*8 function damaxf(n,dx,incx) +c +c finds the index of element having max. absolute value. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + double precision dx(*) + integer i,incx,ix,n +c + damaxf = 0 + if( n.lt.1 .or. 
incx.le.0 ) return + + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + damaxf = dabs(dx(1)) + ix = ix + incx + do 10 i = 2,n + if(dabs(dx(ix)).le.damaxf) go to 5 + damaxf = dabs(dx(ix)) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 damaxf = dabs(dx(1)) + do 30 i = 2,n + if(dabs(dx(i)).le.damaxf) go to 30 + damaxf = dabs(dx(i)) + 30 continue + return + end diff --git a/reference/daminf.f b/reference/daminf.f new file mode 100644 index 0000000000..21ce9d7c06 --- /dev/null +++ b/reference/daminf.f @@ -0,0 +1,36 @@ + REAL*8 function daminf(n,dx,incx) +c +c finds the index of element having min. absolute value. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + double precision dx(*) + integer i,incx,ix,n +c + daminf = 0 + if( n.lt.1 .or. incx.le.0 ) return + + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + daminf = dabs(dx(1)) + ix = ix + incx + do 10 i = 2,n + if(dabs(dx(ix)).ge.daminf) go to 5 + daminf = dabs(dx(ix)) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 daminf = dabs(dx(1)) + do 30 i = 2,n + if(dabs(dx(i)).ge.daminf) go to 30 + daminf = dabs(dx(i)) + 30 continue + return + end diff --git a/reference/dasumf.f b/reference/dasumf.f new file mode 100644 index 0000000000..0713694ddd --- /dev/null +++ b/reference/dasumf.f @@ -0,0 +1,43 @@ + double precision function dasumf(n,dx,incx) +c +c takes the sum of the absolute values. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + double precision dx(*),dtemp + integer i,incx,m,mp1,n,nincx +c + dasumf = 0.0d0 + dtemp = 0.0d0 + if( n.le.0 .or. incx.le.0 )return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + nincx = n*incx + do 10 i = 1,nincx,incx + dtemp = dtemp + dabs(dx(i)) + 10 continue + dasumf = dtemp + return +c +c code for increment equal to 1 +c +c +c clean-up loop +c + 20 m = mod(n,6) + if( m .eq. 0 ) go to 40 + do 30 i = 1,m + dtemp = dtemp + dabs(dx(i)) + 30 continue + if( n .lt. 6 ) go to 60 + 40 mp1 = m + 1 + do 50 i = mp1,n,6 + dtemp = dtemp + dabs(dx(i)) + dabs(dx(i + 1)) + dabs(dx(i + 2)) + * + dabs(dx(i + 3)) + dabs(dx(i + 4)) + dabs(dx(i + 5)) + 50 continue + 60 dasumf = dtemp + return + end diff --git a/reference/daxpyf.f b/reference/daxpyf.f new file mode 100644 index 0000000000..259217c21d --- /dev/null +++ b/reference/daxpyf.f @@ -0,0 +1,48 @@ + subroutine daxpyf(n,da,dx,incx,dy,incy) +c +c constant times a vector plus a vector. +c uses unrolled loops for increments equal to one. +c jack dongarra, linpack, 3/11/78. +c modified 12/3/93, array(1) declarations changed to array(*) +c + double precision dx(*),dy(*),da + integer i,incx,incy,ix,iy,m,mp1,n +c + if(n.le.0)return + if (da .eq. 0.0d0) return + if(incx.eq.1.and.incy.eq.1)go to 20 +c +c code for unequal increments or equal increments +c not equal to 1 +c + ix = 1 + iy = 1 + if(incx.lt.0)ix = (-n+1)*incx + 1 + if(incy.lt.0)iy = (-n+1)*incy + 1 + do 10 i = 1,n + dy(iy) = dy(iy) + da*dx(ix) + ix = ix + incx + iy = iy + incy + 10 continue + return +c +c code for both increments equal to 1 +c +c +c clean-up loop +c + 20 m = mod(n,4) + if( m .eq. 0 ) go to 40 + do 30 i = 1,m + dy(i) = dy(i) + da*dx(i) + 30 continue + if( n .lt. 
4 ) return + 40 mp1 = m + 1 + do 50 i = mp1,n,4 + dy(i) = dy(i) + da*dx(i) + dy(i + 1) = dy(i + 1) + da*dx(i + 1) + dy(i + 2) = dy(i + 2) + da*dx(i + 2) + dy(i + 3) = dy(i + 3) + da*dx(i + 3) + 50 continue + return + end diff --git a/reference/dcopyf.f b/reference/dcopyf.f new file mode 100644 index 0000000000..e9303034fe --- /dev/null +++ b/reference/dcopyf.f @@ -0,0 +1,50 @@ + subroutine dcopyf(n,dx,incx,dy,incy) +c +c copies a vector, x, to a vector, y. +c uses unrolled loops for increments equal to one. +c jack dongarra, linpack, 3/11/78. +c modified 12/3/93, array(1) declarations changed to array(*) +c + double precision dx(*),dy(*) + integer i,incx,incy,ix,iy,m,mp1,n +c + if(n.le.0)return + if(incx.eq.1.and.incy.eq.1)go to 20 +c +c code for unequal increments or equal increments +c not equal to 1 +c + ix = 1 + iy = 1 + if(incx.lt.0)ix = (-n+1)*incx + 1 + if(incy.lt.0)iy = (-n+1)*incy + 1 + do 10 i = 1,n + dy(iy) = dx(ix) + ix = ix + incx + iy = iy + incy + 10 continue + return +c +c code for both increments equal to 1 +c +c +c clean-up loop +c + 20 m = mod(n,7) + if( m .eq. 0 ) go to 40 + do 30 i = 1,m + dy(i) = dx(i) + 30 continue + if( n .lt. 7 ) return + 40 mp1 = m + 1 + do 50 i = mp1,n,7 + dy(i) = dx(i) + dy(i + 1) = dx(i + 1) + dy(i + 2) = dx(i + 2) + dy(i + 3) = dx(i + 3) + dy(i + 4) = dx(i + 4) + dy(i + 5) = dx(i + 5) + dy(i + 6) = dx(i + 6) + 50 continue + return + end diff --git a/reference/ddotf.f b/reference/ddotf.f new file mode 100644 index 0000000000..ed8defcb89 --- /dev/null +++ b/reference/ddotf.f @@ -0,0 +1,49 @@ + double precision function ddotf(n,dx,incx,dy,incy) +c +c forms the dot product of two vectors. +c uses unrolled loops for increments equal to one. +c jack dongarra, linpack, 3/11/78. +c modified 12/3/93, array(1) declarations changed to array(*) +c + double precision dx(*),dy(*),dtemp + integer i,incx,incy,ix,iy,m,mp1,n +c + ddotf = 0.0d0 + dtemp = 0.0d0 + if(n.le.0)return + if(incx.eq.1.and.incy.eq.1)go to 20 +c +c code for unequal increments or equal increments +c not equal to 1 +c + ix = 1 + iy = 1 + if(incx.lt.0)ix = (-n+1)*incx + 1 + if(incy.lt.0)iy = (-n+1)*incy + 1 + do 10 i = 1,n + dtemp = dtemp + dx(ix)*dy(iy) + ix = ix + incx + iy = iy + incy + 10 continue + ddotf = dtemp + return +c +c code for both increments equal to 1 +c +c +c clean-up loop +c + 20 m = mod(n,5) + if( m .eq. 0 ) go to 40 + do 30 i = 1,m + dtemp = dtemp + dx(i)*dy(i) + 30 continue + if( n .lt. 5 ) go to 60 + 40 mp1 = m + 1 + do 50 i = mp1,n,5 + dtemp = dtemp + dx(i)*dy(i) + dx(i + 1)*dy(i + 1) + + * dx(i + 2)*dy(i + 2) + dx(i + 3)*dy(i + 3) + dx(i + 4)*dy(i + 4) + 50 continue + 60 ddotf = dtemp + return + end diff --git a/reference/dgbmvf.f b/reference/dgbmvf.f new file mode 100644 index 0000000000..0033ac190f --- /dev/null +++ b/reference/dgbmvf.f @@ -0,0 +1,300 @@ + SUBROUTINE DGBMVF( TRANS, M, N, KL, KU, ALPHA, A, LDA, X, INCX, + $ BETA, Y, INCY ) +* .. Scalar Arguments .. + DOUBLE PRECISION ALPHA, BETA + INTEGER INCX, INCY, KL, KU, LDA, M, N + CHARACTER*1 TRANS +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* DGBMV performs one of the matrix-vector operations +* +* y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y, +* +* where alpha and beta are scalars, x and y are vectors and A is an +* m by n band matrix, with kl sub-diagonals and ku super-diagonals. +* +* Parameters +* ========== +* +* TRANS - CHARACTER*1. 
+* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' y := alpha*A*x + beta*y. +* +* TRANS = 'T' or 't' y := alpha*A'*x + beta*y. +* +* TRANS = 'C' or 'c' y := alpha*A'*x + beta*y. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix A. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* KL - INTEGER. +* On entry, KL specifies the number of sub-diagonals of the +* matrix A. KL must satisfy 0 .le. KL. +* Unchanged on exit. +* +* KU - INTEGER. +* On entry, KU specifies the number of super-diagonals of the +* matrix A. KU must satisfy 0 .le. KU. +* Unchanged on exit. +* +* ALPHA - DOUBLE PRECISION. +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). +* Before entry, the leading ( kl + ku + 1 ) by n part of the +* array A must contain the matrix of coefficients, supplied +* column by column, with the leading diagonal of the matrix in +* row ( ku + 1 ) of the array, the first super-diagonal +* starting at position 2 in row ku, the first sub-diagonal +* starting at position 1 in row ( ku + 2 ), and so on. +* Elements in the array A that do not correspond to elements +* in the band matrix (such as the top left ku by ku triangle) +* are not referenced. +* The following program segment will transfer a band matrix +* from conventional full matrix storage to band storage: +* +* DO 20, J = 1, N +* K = KU + 1 - J +* DO 10, I = MAX( 1, J - KU ), MIN( M, J + KL ) +* A( K + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* ( kl + ku + 1 ). +* Unchanged on exit. +* +* X - DOUBLE PRECISION array of DIMENSION at least +* ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' +* and at least +* ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. +* Before entry, the incremented array X must contain the +* vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA - DOUBLE PRECISION. +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then Y need not be set on input. +* Unchanged on exit. +* +* Y - DOUBLE PRECISION array of DIMENSION at least +* ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' +* and at least +* ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. +* Before entry, the incremented array Y must contain the +* vector y. On exit, Y is overwritten by the updated vector y. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* .. Parameters .. + DOUBLE PRECISION ONE , ZERO + PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) +* .. Local Scalars .. + DOUBLE PRECISION TEMP + INTEGER I, INFO, IX, IY, J, JX, JY, K, KUP1, KX, KY, + $ LENX, LENY +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. 
Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. + $ .NOT.LSAME( TRANS, 'C' ) )THEN + INFO = 1 + ELSE IF( M.LT.0 )THEN + INFO = 2 + ELSE IF( N.LT.0 )THEN + INFO = 3 + ELSE IF( KL.LT.0 )THEN + INFO = 4 + ELSE IF( KU.LT.0 )THEN + INFO = 5 + ELSE IF( LDA.LT.( KL + KU + 1 ) )THEN + INFO = 8 + ELSE IF( INCX.EQ.0 )THEN + INFO = 10 + ELSE IF( INCY.EQ.0 )THEN + INFO = 13 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'DGBMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. + $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* Set LENX and LENY, the lengths of the vectors x and y, and set +* up the start points in X and Y. +* + IF( LSAME( TRANS, 'N' ) )THEN + LENX = N + LENY = M + ELSE + LENX = M + LENY = N + END IF + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( LENX - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( LENY - 1 )*INCY + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through the band part of A. +* +* First form y := beta*y. +* + IF( BETA.NE.ONE )THEN + IF( INCY.EQ.1 )THEN + IF( BETA.EQ.ZERO )THEN + DO 10, I = 1, LENY + Y( I ) = ZERO + 10 CONTINUE + ELSE + DO 20, I = 1, LENY + Y( I ) = BETA*Y( I ) + 20 CONTINUE + END IF + ELSE + IY = KY + IF( BETA.EQ.ZERO )THEN + DO 30, I = 1, LENY + Y( IY ) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40, I = 1, LENY + Y( IY ) = BETA*Y( IY ) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF( ALPHA.EQ.ZERO ) + $ RETURN + KUP1 = KU + 1 + IF( LSAME( TRANS, 'N' ) )THEN +* +* Form y := alpha*A*x + y. +* + JX = KX + IF( INCY.EQ.1 )THEN + DO 60, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*X( JX ) + K = KUP1 - J + DO 50, I = MAX( 1, J - KU ), MIN( M, J + KL ) + Y( I ) = Y( I ) + TEMP*A( K + I, J ) + 50 CONTINUE + END IF + JX = JX + INCX + 60 CONTINUE + ELSE + DO 80, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*X( JX ) + IY = KY + K = KUP1 - J + DO 70, I = MAX( 1, J - KU ), MIN( M, J + KL ) + Y( IY ) = Y( IY ) + TEMP*A( K + I, J ) + IY = IY + INCY + 70 CONTINUE + END IF + JX = JX + INCX + IF( J.GT.KU ) + $ KY = KY + INCY + 80 CONTINUE + END IF + ELSE +* +* Form y := alpha*A'*x + y. +* + JY = KY + IF( INCX.EQ.1 )THEN + DO 100, J = 1, N + TEMP = ZERO + K = KUP1 - J + DO 90, I = MAX( 1, J - KU ), MIN( M, J + KL ) + TEMP = TEMP + A( K + I, J )*X( I ) + 90 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP + JY = JY + INCY + 100 CONTINUE + ELSE + DO 120, J = 1, N + TEMP = ZERO + IX = KX + K = KUP1 - J + DO 110, I = MAX( 1, J - KU ), MIN( M, J + KL ) + TEMP = TEMP + A( K + I, J )*X( IX ) + IX = IX + INCX + 110 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP + JY = JY + INCY + IF( J.GT.KU ) + $ KX = KX + INCX + 120 CONTINUE + END IF + END IF +* + RETURN +* +* End of DGBMV . +* + END diff --git a/reference/dgemmf.f b/reference/dgemmf.f new file mode 100644 index 0000000000..0af812014a --- /dev/null +++ b/reference/dgemmf.f @@ -0,0 +1,313 @@ + SUBROUTINE DGEMMF(TRANA,TRANB,M,N,K,ALPHA,A,LDA,B,LDB,BETA,C,LDC) +* .. Scalar Arguments .. + DOUBLE PRECISION ALPHA,BETA + INTEGER K,LDA,LDB,LDC,M,N + CHARACTER TRANA,TRANB +* .. +* .. Array Arguments .. + DOUBLE PRECISION A(LDA,*),B(LDB,*),C(LDC,*) +* .. 
+* +* Purpose +* ======= +* +* DGEMM performs one of the matrix-matrix operations +* +* C := alpha*op( A )*op( B ) + beta*C, +* +* where op( X ) is one of +* +* op( X ) = X or op( X ) = X', +* +* alpha and beta are scalars, and A, B and C are matrices, with op( A ) +* an m by k matrix, op( B ) a k by n matrix and C an m by n matrix. +* +* Arguments +* ========== +* +* TRANA - CHARACTER*1. +* On entry, TRANA specifies the form of op( A ) to be used in +* the matrix multiplication as follows: +* +* TRANA = 'N' or 'n', op( A ) = A. +* +* TRANA = 'T' or 't', op( A ) = A'. +* +* TRANA = 'C' or 'c', op( A ) = A'. +* +* Unchanged on exit. +* +* TRANB - CHARACTER*1. +* On entry, TRANB specifies the form of op( B ) to be used in +* the matrix multiplication as follows: +* +* TRANB = 'N' or 'n', op( B ) = B. +* +* TRANB = 'T' or 't', op( B ) = B'. +* +* TRANB = 'C' or 'c', op( B ) = B'. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix +* op( A ) and of the matrix C. M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix +* op( B ) and the number of columns of the matrix C. N must be +* at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry, K specifies the number of columns of the matrix +* op( A ) and the number of rows of the matrix op( B ). K must +* be at least zero. +* Unchanged on exit. +* +* ALPHA - DOUBLE PRECISION. +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - DOUBLE PRECISION array of DIMENSION ( LDA, ka ), where ka is +* k when TRANA = 'N' or 'n', and is m otherwise. +* Before entry with TRANA = 'N' or 'n', the leading m by k +* part of the array A must contain the matrix A, otherwise +* the leading k by m part of the array A must contain the +* matrix A. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When TRANA = 'N' or 'n' then +* LDA must be at least max( 1, m ), otherwise LDA must be at +* least max( 1, k ). +* Unchanged on exit. +* +* B - DOUBLE PRECISION array of DIMENSION ( LDB, kb ), where kb is +* n when TRANB = 'N' or 'n', and is k otherwise. +* Before entry with TRANB = 'N' or 'n', the leading k by n +* part of the array B must contain the matrix B, otherwise +* the leading n by k part of the array B must contain the +* matrix B. +* Unchanged on exit. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. When TRANB = 'N' or 'n' then +* LDB must be at least max( 1, k ), otherwise LDB must be at +* least max( 1, n ). +* Unchanged on exit. +* +* BETA - DOUBLE PRECISION. +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then C need not be set on input. +* Unchanged on exit. +* +* C - DOUBLE PRECISION array of DIMENSION ( LDC, n ). +* Before entry, the leading m by n part of the array C must +* contain the matrix C, except when beta is zero, in which +* case C need not be set on entry. +* On exit, the array C is overwritten by the m by n matrix +* ( alpha*op( A )*op( B ) + beta*C ). +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. 
+* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Local Scalars .. + DOUBLE PRECISION TEMP + INTEGER I,INFO,J,L,NCOLA,NROWA,NROWB + LOGICAL NOTA,NOTB +* .. +* .. Parameters .. + DOUBLE PRECISION ONE,ZERO + PARAMETER (ONE=1.0D+0,ZERO=0.0D+0) +* .. +* +* Set NOTA and NOTB as true if A and B respectively are not +* transposed and set NROWA, NCOLA and NROWB as the number of rows +* and columns of A and the number of rows of B respectively. +* + NOTA = LSAME(TRANA,'N') + NOTB = LSAME(TRANB,'N') + IF (NOTA) THEN + NROWA = M + NCOLA = K + ELSE + NROWA = K + NCOLA = M + END IF + IF (NOTB) THEN + NROWB = K + ELSE + NROWB = N + END IF +* +* Test the input parameters. +* + INFO = 0 + IF ((.NOT.NOTA) .AND. (.NOT.LSAME(TRANA,'C')) .AND. + + (.NOT.LSAME(TRANA,'T'))) THEN + INFO = 1 + ELSE IF ((.NOT.NOTB) .AND. (.NOT.LSAME(TRANB,'C')) .AND. + + (.NOT.LSAME(TRANB,'T'))) THEN + INFO = 2 + ELSE IF (M.LT.0) THEN + INFO = 3 + ELSE IF (N.LT.0) THEN + INFO = 4 + ELSE IF (K.LT.0) THEN + INFO = 5 + ELSE IF (LDA.LT.MAX(1,NROWA)) THEN + INFO = 8 + ELSE IF (LDB.LT.MAX(1,NROWB)) THEN + INFO = 10 + ELSE IF (LDC.LT.MAX(1,M)) THEN + INFO = 13 + END IF + IF (INFO.NE.0) THEN + CALL XERBLA('DGEMM ',INFO) + RETURN + END IF +* +* Quick return if possible. +* + IF ((M.EQ.0) .OR. (N.EQ.0) .OR. + + (((ALPHA.EQ.ZERO).OR. (K.EQ.0)).AND. (BETA.EQ.ONE))) RETURN +* +* And if alpha.eq.zero. +* + IF (ALPHA.EQ.ZERO) THEN + IF (BETA.EQ.ZERO) THEN + DO 20 J = 1,N + DO 10 I = 1,M + C(I,J) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40 J = 1,N + DO 30 I = 1,M + C(I,J) = BETA*C(I,J) + 30 CONTINUE + 40 CONTINUE + END IF + RETURN + END IF +* +* Start the operations. +* + IF (NOTB) THEN + IF (NOTA) THEN +* +* Form C := alpha*A*B + beta*C. +* + DO 90 J = 1,N + IF (BETA.EQ.ZERO) THEN + DO 50 I = 1,M + C(I,J) = ZERO + 50 CONTINUE + ELSE IF (BETA.NE.ONE) THEN + DO 60 I = 1,M + C(I,J) = BETA*C(I,J) + 60 CONTINUE + END IF + DO 80 L = 1,K + IF (B(L,J).NE.ZERO) THEN + TEMP = ALPHA*B(L,J) + DO 70 I = 1,M + C(I,J) = C(I,J) + TEMP*A(I,L) + 70 CONTINUE + END IF + 80 CONTINUE + 90 CONTINUE + ELSE +* +* Form C := alpha*A'*B + beta*C +* + DO 120 J = 1,N + DO 110 I = 1,M + TEMP = ZERO + DO 100 L = 1,K + TEMP = TEMP + A(L,I)*B(L,J) + 100 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 110 CONTINUE + 120 CONTINUE + END IF + ELSE + IF (NOTA) THEN +* +* Form C := alpha*A*B' + beta*C +* + DO 170 J = 1,N + IF (BETA.EQ.ZERO) THEN + DO 130 I = 1,M + C(I,J) = ZERO + 130 CONTINUE + ELSE IF (BETA.NE.ONE) THEN + DO 140 I = 1,M + C(I,J) = BETA*C(I,J) + 140 CONTINUE + END IF + DO 160 L = 1,K + IF (B(J,L).NE.ZERO) THEN + TEMP = ALPHA*B(J,L) + DO 150 I = 1,M + C(I,J) = C(I,J) + TEMP*A(I,L) + 150 CONTINUE + END IF + 160 CONTINUE + 170 CONTINUE + ELSE +* +* Form C := alpha*A'*B' + beta*C +* + DO 200 J = 1,N + DO 190 I = 1,M + TEMP = ZERO + DO 180 L = 1,K + TEMP = TEMP + A(L,I)*B(J,L) + 180 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 190 CONTINUE + 200 CONTINUE + END IF + END IF +* + RETURN +* +* End of DGEMM . 
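As a usage note, a minimal call sketch for the routine above; the small driver program, its 2 by 2 data and the quoted result are illustrative assumptions, and it is assumed to be linked with this file and with the LSAME/XERBLA auxiliaries it references:

      PROGRAM TGEMM
      DOUBLE PRECISION A(2,2), B(2,2), C(2,2)
      DATA A /1.0D0, 3.0D0, 2.0D0, 4.0D0/
      DATA B /5.0D0, 7.0D0, 6.0D0, 8.0D0/
      DATA C /4*0.0D0/
      CALL DGEMMF( 'N', 'N', 2, 2, 2, 1.0D0, A, 2, B, 2, 0.0D0, C, 2 )
*     column-major result: C(1,1)=19, C(2,1)=43, C(1,2)=22, C(2,2)=50
      PRINT *, C
      END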
+* + END diff --git a/reference/dgemvf.f b/reference/dgemvf.f new file mode 100644 index 0000000000..ae50c3ce29 --- /dev/null +++ b/reference/dgemvf.f @@ -0,0 +1,256 @@ + SUBROUTINE DGEMVF ( TRANS, M, N, ALPHA, A, LDA, X, INCX, + $ BETA, Y, INCY ) +* .. Scalar Arguments .. + DOUBLE PRECISION ALPHA, BETA + INTEGER INCX, INCY, LDA, M, N + CHARACTER*1 TRANS +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* DGEMV performs one of the matrix-vector operations +* +* y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y, +* +* where alpha and beta are scalars, x and y are vectors and A is an +* m by n matrix. +* +* Parameters +* ========== +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' y := alpha*A*x + beta*y. +* +* TRANS = 'T' or 't' y := alpha*A'*x + beta*y. +* +* TRANS = 'C' or 'c' y := alpha*A'*x + beta*y. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix A. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - DOUBLE PRECISION. +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). +* Before entry, the leading m by n part of the array A must +* contain the matrix of coefficients. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, m ). +* Unchanged on exit. +* +* X - DOUBLE PRECISION array of DIMENSION at least +* ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' +* and at least +* ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. +* Before entry, the incremented array X must contain the +* vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA - DOUBLE PRECISION. +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then Y need not be set on input. +* Unchanged on exit. +* +* Y - DOUBLE PRECISION array of DIMENSION at least +* ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' +* and at least +* ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. +* Before entry with BETA non-zero, the incremented array Y +* must contain the vector y. On exit, Y is overwritten by the +* updated vector y. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + DOUBLE PRECISION ONE , ZERO + PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) +* .. Local Scalars .. + DOUBLE PRECISION TEMP + INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY, LENX, LENY +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. 
+ $ .NOT.LSAME( TRANS, 'C' ) )THEN + INFO = 1 + ELSE IF( M.LT.0 )THEN + INFO = 2 + ELSE IF( N.LT.0 )THEN + INFO = 3 + ELSE IF( LDA.LT.MAX( 1, M ) )THEN + INFO = 6 + ELSE IF( INCX.EQ.0 )THEN + INFO = 8 + ELSE IF( INCY.EQ.0 )THEN + INFO = 11 + END IF +* +* Quick return if possible. +* + IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. + $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* Set LENX and LENY, the lengths of the vectors x and y, and set +* up the start points in X and Y. +* + IF( LSAME( TRANS, 'N' ) )THEN + LENX = N + LENY = M + ELSE + LENX = M + LENY = N + END IF + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( LENX - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( LENY - 1 )*INCY + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through A. +* +* First form y := beta*y. +* + IF( BETA.NE.ONE )THEN + IF( INCY.EQ.1 )THEN + IF( BETA.EQ.ZERO )THEN + DO 10, I = 1, LENY + Y( I ) = ZERO + 10 CONTINUE + ELSE + DO 20, I = 1, LENY + Y( I ) = BETA*Y( I ) + 20 CONTINUE + END IF + ELSE + IY = KY + IF( BETA.EQ.ZERO )THEN + DO 30, I = 1, LENY + Y( IY ) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40, I = 1, LENY + Y( IY ) = BETA*Y( IY ) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF( ALPHA.EQ.ZERO ) + $ RETURN + IF( LSAME( TRANS, 'N' ) )THEN +* +* Form y := alpha*A*x + y. +* + JX = KX + IF( INCY.EQ.1 )THEN + DO 60, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*X( JX ) + DO 50, I = 1, M + Y( I ) = Y( I ) + TEMP*A( I, J ) + 50 CONTINUE + END IF + JX = JX + INCX + 60 CONTINUE + ELSE + DO 80, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*X( JX ) + IY = KY + DO 70, I = 1, M + Y( IY ) = Y( IY ) + TEMP*A( I, J ) + IY = IY + INCY + 70 CONTINUE + END IF + JX = JX + INCX + 80 CONTINUE + END IF + ELSE +* +* Form y := alpha*A'*x + y. +* + JY = KY + IF( INCX.EQ.1 )THEN + DO 100, J = 1, N + TEMP = ZERO + DO 90, I = 1, M + TEMP = TEMP + A( I, J )*X( I ) + 90 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP + JY = JY + INCY + 100 CONTINUE + ELSE + DO 120, J = 1, N + TEMP = ZERO + IX = KX + DO 110, I = 1, M + TEMP = TEMP + A( I, J )*X( IX ) + IX = IX + INCX + 110 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP + JY = JY + INCY + 120 CONTINUE + END IF + END IF +* + RETURN +* +* End of DGEMV . +* + END diff --git a/reference/dgerf.f b/reference/dgerf.f new file mode 100644 index 0000000000..f340ceb56d --- /dev/null +++ b/reference/dgerf.f @@ -0,0 +1,158 @@ + SUBROUTINE DGERF ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA ) + +* .. Scalar Arguments .. + DOUBLE PRECISION ALPHA + INTEGER INCX, INCY, LDA, M, N +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* DGER performs the rank 1 operation +* +* A := alpha*x*y' + A, +* +* where alpha is a scalar, x is an m element vector, y is an n element +* vector and A is an m by n matrix. +* +* Parameters +* ========== +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix A. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - DOUBLE PRECISION. +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X - DOUBLE PRECISION array of dimension at least +* ( 1 + ( m - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the m +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. 
+* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* Y - DOUBLE PRECISION array of dimension at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. +* Unchanged on exit. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). +* Before entry, the leading m by n part of the array A must +* contain the matrix of coefficients. On exit, A is +* overwritten by the updated matrix. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + DOUBLE PRECISION ZERO + PARAMETER ( ZERO = 0.0D+0 ) +* .. Local Scalars .. + DOUBLE PRECISION TEMP + INTEGER I, INFO, IX, J, JY, KX +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( M.LT.0 )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 5 + ELSE IF( INCY.EQ.0 )THEN + INFO = 7 + ELSE IF( LDA.LT.MAX( 1, M ) )THEN + INFO = 9 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'DGER ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) + $ RETURN +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through A. +* + IF( INCY.GT.0 )THEN + JY = 1 + ELSE + JY = 1 - ( N - 1 )*INCY + END IF + IF( INCX.EQ.1 )THEN + DO 20, J = 1, N + IF( Y( JY ).NE.ZERO )THEN + TEMP = ALPHA*Y( JY ) + DO 10, I = 1, M + A( I, J ) = A( I, J ) + X( I )*TEMP + 10 CONTINUE + END IF + JY = JY + INCY + 20 CONTINUE + ELSE + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( M - 1 )*INCX + END IF + DO 40, J = 1, N + IF( Y( JY ).NE.ZERO )THEN + TEMP = ALPHA*Y( JY ) + IX = KX + DO 30, I = 1, M + A( I, J ) = A( I, J ) + X( IX )*TEMP + IX = IX + INCX + 30 CONTINUE + END IF + JY = JY + INCY + 40 CONTINUE + END IF +* + RETURN +* +* End of DGER . +* + END diff --git a/reference/dgesvf.f b/reference/dgesvf.f new file mode 100644 index 0000000000..751acf349c --- /dev/null +++ b/reference/dgesvf.f @@ -0,0 +1,107 @@ + SUBROUTINE DGESVF( N, NRHS, A, LDA, IPIV, B, LDB, INFO ) +* +* -- LAPACK driver routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* November 2006 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LDB, N, NRHS +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + DOUBLE PRECISION A( LDA, * ), B( LDB, * ) +* .. +* +* Purpose +* ======= +* +* DGESV computes the solution to a real system of linear equations +* A * X = B, +* where A is an N-by-N matrix and X and B are N-by-NRHS matrices. +* +* The LU decomposition with partial pivoting and row interchanges is +* used to factor A as +* A = P * L * U, +* where P is a permutation matrix, L is unit lower triangular, and U is +* upper triangular. The factored form of A is then used to solve the +* system of equations A * X = B. 
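For example, a minimal sketch of calling this driver on a 2 by 2 system with one right hand side; the driver program and its data are illustrative assumptions, and DGETRF/DGETRS must be available at link time:

      PROGRAM TGESV
      INTEGER IPIV(2), INFO
      DOUBLE PRECISION A(2,2), B(2,1)
      DATA A /3.0D0, 1.0D0, 1.0D0, 2.0D0/
      DATA B /9.0D0, 8.0D0/
*     the system is  3*x1 + x2 = 9  and  x1 + 2*x2 = 8
      CALL DGESVF( 2, 1, A, 2, IPIV, B, 2, INFO )
*     on return INFO = 0 and B holds the solution x1 = 2, x2 = 3
      PRINT *, INFO, B
      END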
+* +* Arguments +* ========= +* +* N (input) INTEGER +* The number of linear equations, i.e., the order of the +* matrix A. N >= 0. +* +* NRHS (input) INTEGER +* The number of right hand sides, i.e., the number of columns +* of the matrix B. NRHS >= 0. +* +* A (input/output) DOUBLE PRECISION array, dimension (LDA,N) +* On entry, the N-by-N coefficient matrix A. +* On exit, the factors L and U from the factorization +* A = P*L*U; the unit diagonal elements of L are not stored. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* IPIV (output) INTEGER array, dimension (N) +* The pivot indices that define the permutation matrix P; +* row i of the matrix was interchanged with row IPIV(i). +* +* B (input/output) DOUBLE PRECISION array, dimension (LDB,NRHS) +* On entry, the N-by-NRHS matrix of right hand side matrix B. +* On exit, if INFO = 0, the N-by-NRHS solution matrix X. +* +* LDB (input) INTEGER +* The leading dimension of the array B. LDB >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, U(i,i) is exactly zero. The factorization +* has been completed, but the factor U is exactly +* singular, so the solution could not be computed. +* +* ===================================================================== +* +* .. External Subroutines .. + EXTERNAL DGETRF, DGETRS, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( N.LT.0 ) THEN + INFO = -1 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + ELSE IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -7 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DGESV ', -INFO ) + RETURN + END IF +* +* Compute the LU factorization of A. +* + CALL DGETRF( N, N, A, LDA, IPIV, INFO ) + IF( INFO.EQ.0 ) THEN +* +* Solve the system A*X = B, overwriting B with X. +* + CALL DGETRS( 'No transpose', N, NRHS, A, LDA, IPIV, B, LDB, + $ INFO ) + END IF + RETURN +* +* End of DGESV +* + END diff --git a/reference/dgetf2f.f b/reference/dgetf2f.f new file mode 100644 index 0000000000..f977a7c11d --- /dev/null +++ b/reference/dgetf2f.f @@ -0,0 +1,135 @@ + SUBROUTINE DGETF2F( M, N, A, LDA, IPIV, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* June 30, 1992 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, M, N +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + DOUBLE PRECISION A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* DGETF2 computes an LU factorization of a general m-by-n matrix A +* using partial pivoting with row interchanges. +* +* The factorization has the form +* A = P * L * U +* where P is a permutation matrix, L is lower triangular with unit +* diagonal elements (lower trapezoidal if m > n), and U is upper +* triangular (upper trapezoidal if m < n). +* +* This is the right-looking Level 2 BLAS version of the algorithm. +* +* Arguments +* ========= +* +* M (input) INTEGER +* The number of rows of the matrix A. M >= 0. +* +* N (input) INTEGER +* The number of columns of the matrix A. N >= 0. +* +* A (input/output) DOUBLE PRECISION array, dimension (LDA,N) +* On entry, the m by n matrix to be factored. +* On exit, the factors L and U from the factorization +* A = P*L*U; the unit diagonal elements of L are not stored. 
+* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,M). +* +* IPIV (output) INTEGER array, dimension (min(M,N)) +* The pivot indices; for 1 <= i <= min(M,N), row i of the +* matrix was interchanged with row IPIV(i). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -k, the k-th argument had an illegal value +* > 0: if INFO = k, U(k,k) is exactly zero. The factorization +* has been completed, but the factor U is exactly +* singular, and division by zero will occur if it is used +* to solve a system of equations. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ONE, ZERO + PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) +* .. +* .. Local Scalars .. + INTEGER J, JP +* .. +* .. External Functions .. + INTEGER IDAMAX + EXTERNAL IDAMAX +* .. +* .. External Subroutines .. + EXTERNAL DGER, DSCAL, DSWAP, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DGETF2', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* + DO 10 J = 1, MIN( M, N ) +* +* Find pivot and test for singularity. +* + JP = J - 1 + IDAMAX( M-J+1, A( J, J ), 1 ) + IPIV( J ) = JP + IF( A( JP, J ).NE.ZERO ) THEN +* +* Apply the interchange to columns 1:N. +* + IF( JP.NE.J ) + $ CALL DSWAP( N, A( J, 1 ), LDA, A( JP, 1 ), LDA ) +* +* Compute elements J+1:M of J-th column. +* + IF( J.LT.M ) + $ CALL DSCAL( M-J, ONE / A( J, J ), A( J+1, J ), 1 ) +* + ELSE IF( INFO.EQ.0 ) THEN +* + INFO = J + END IF +* + IF( J.LT.MIN( M, N ) ) THEN +* +* Update trailing submatrix. +* + CALL DGER( M-J, N-J, -ONE, A( J+1, J ), 1, A( J, J+1 ), LDA, + $ A( J+1, J+1 ), LDA ) + END IF + 10 CONTINUE + RETURN +* +* End of DGETF2 +* + END diff --git a/reference/dgetrff.f b/reference/dgetrff.f new file mode 100644 index 0000000000..1425596c8f --- /dev/null +++ b/reference/dgetrff.f @@ -0,0 +1,156 @@ + SUBROUTINE DGETRFF( M, N, A, LDA, IPIV, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* March 31, 1993 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, M, N +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + DOUBLE PRECISION A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* DGETRF computes an LU factorization of a general M-by-N matrix A +* using partial pivoting with row interchanges. +* +* The factorization has the form +* A = P * L * U +* where P is a permutation matrix, L is lower triangular with unit +* diagonal elements (lower trapezoidal if m > n), and U is upper +* triangular (upper trapezoidal if m < n). +* +* This is the right-looking Level 3 BLAS version of the algorithm. +* +* Arguments +* ========= +* +* M (input) INTEGER +* The number of rows of the matrix A. M >= 0. +* +* N (input) INTEGER +* The number of columns of the matrix A. N >= 0. +* +* A (input/output) DOUBLE PRECISION array, dimension (LDA,N) +* On entry, the M-by-N matrix to be factored. +* On exit, the factors L and U from the factorization +* A = P*L*U; the unit diagonal elements of L are not stored. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,M). 
+* +* IPIV (output) INTEGER array, dimension (min(M,N)) +* The pivot indices; for 1 <= i <= min(M,N), row i of the +* matrix was interchanged with row IPIV(i). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, U(i,i) is exactly zero. The factorization +* has been completed, but the factor U is exactly +* singular, and division by zero will occur if it is used +* to solve a system of equations. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ONE + PARAMETER ( ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + INTEGER I, IINFO, J, JB, NB +* .. +* .. External Subroutines .. + EXTERNAL DGEMM, DGETF2, DLASWP, DTRSM, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DGETRF', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* +* Determine the block size for this environment. +* + NB = 64 + IF( NB.LE.1 .OR. NB.GE.MIN( M, N ) ) THEN +* +* Use unblocked code. +* + CALL DGETF2( M, N, A, LDA, IPIV, INFO ) + ELSE +* +* Use blocked code. +* + DO 20 J = 1, MIN( M, N ), NB + JB = MIN( MIN( M, N )-J+1, NB ) +* +* Factor diagonal and subdiagonal blocks and test for exact +* singularity. +* + CALL DGETF2( M-J+1, JB, A( J, J ), LDA, IPIV( J ), IINFO ) +* +* Adjust INFO and the pivot indices. +* + IF( INFO.EQ.0 .AND. IINFO.GT.0 ) + $ INFO = IINFO + J - 1 + DO 10 I = J, MIN( M, J+JB-1 ) + IPIV( I ) = J - 1 + IPIV( I ) + 10 CONTINUE +* +* Apply interchanges to columns 1:J-1. +* + CALL DLASWP( J-1, A, LDA, J, J+JB-1, IPIV, 1 ) +* + IF( J+JB.LE.N ) THEN +* +* Apply interchanges to columns J+JB:N. +* + CALL DLASWP( N-J-JB+1, A( 1, J+JB ), LDA, J, J+JB-1, + $ IPIV, 1 ) +* +* Compute block row of U. +* + CALL DTRSM( 'Left', 'Lower', 'No transpose', 'Unit', JB, + $ N-J-JB+1, ONE, A( J, J ), LDA, A( J, J+JB ), + $ LDA ) + IF( J+JB.LE.M ) THEN +* +* Update trailing submatrix. +* + CALL DGEMM( 'No transpose', 'No transpose', M-J-JB+1, + $ N-J-JB+1, JB, -ONE, A( J+JB, J ), LDA, + $ A( J, J+JB ), LDA, ONE, A( J+JB, J+JB ), + $ LDA ) + END IF + END IF + 20 CONTINUE + END IF + RETURN +* +* End of DGETRF +* + END diff --git a/reference/dgetrsf.f b/reference/dgetrsf.f new file mode 100644 index 0000000000..86624cb71b --- /dev/null +++ b/reference/dgetrsf.f @@ -0,0 +1,150 @@ + SUBROUTINE DGETRSF( TRANS, N, NRHS, A, LDA, IPIV, B, LDB, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* March 31, 1993 +* +* .. Scalar Arguments .. + CHARACTER TRANS + INTEGER INFO, LDA, LDB, N, NRHS +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + DOUBLE PRECISION A( LDA, * ), B( LDB, * ) +* .. +* +* Purpose +* ======= +* +* DGETRS solves a system of linear equations +* A * X = B or A' * X = B +* with a general N-by-N matrix A using the LU factorization computed +* by DGETRF. +* +* Arguments +* ========= +* +* TRANS (input) CHARACTER*1 +* Specifies the form of the system of equations: +* = 'N': A * X = B (No transpose) +* = 'T': A'* X = B (Transpose) +* = 'C': A'* X = B (Conjugate transpose = Transpose) +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. 
+* +* NRHS (input) INTEGER +* The number of right hand sides, i.e., the number of columns +* of the matrix B. NRHS >= 0. +* +* A (input) DOUBLE PRECISION array, dimension (LDA,N) +* The factors L and U from the factorization A = P*L*U +* as computed by DGETRF. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* IPIV (input) INTEGER array, dimension (N) +* The pivot indices from DGETRF; for 1<=i<=N, row i of the +* matrix was interchanged with row IPIV(i). +* +* B (input/output) DOUBLE PRECISION array, dimension (LDB,NRHS) +* On entry, the right hand side matrix B. +* On exit, the solution matrix X. +* +* LDB (input) INTEGER +* The leading dimension of the array B. LDB >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ONE + PARAMETER ( ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL NOTRAN +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL DLASWP, DTRSM, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + NOTRAN = LSAME( TRANS, 'N' ) + IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. + $ LSAME( TRANS, 'C' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -3 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -5 + ELSE IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -8 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DGETRS', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 .OR. NRHS.EQ.0 ) + $ RETURN +* + IF( NOTRAN ) THEN +* +* Solve A * X = B. +* +* Apply row interchanges to the right hand sides. +* + CALL DLASWP( NRHS, B, LDB, 1, N, IPIV, 1 ) +* +* Solve L*X = B, overwriting B with X. +* + CALL DTRSM( 'Left', 'Lower', 'No transpose', 'Unit', N, NRHS, + $ ONE, A, LDA, B, LDB ) +* +* Solve U*X = B, overwriting B with X. +* + CALL DTRSM( 'Left', 'Upper', 'No transpose', 'Non-unit', N, + $ NRHS, ONE, A, LDA, B, LDB ) + ELSE +* +* Solve A' * X = B. +* +* Solve U'*X = B, overwriting B with X. +* + CALL DTRSM( 'Left', 'Upper', 'Transpose', 'Non-unit', N, NRHS, + $ ONE, A, LDA, B, LDB ) +* +* Solve L'*X = B, overwriting B with X. +* + CALL DTRSM( 'Left', 'Lower', 'Transpose', 'Unit', N, NRHS, ONE, + $ A, LDA, B, LDB ) +* +* Apply row interchanges to the solution vectors. +* + CALL DLASWP( NRHS, B, LDB, 1, N, IPIV, -1 ) + END IF +* + RETURN +* +* End of DGETRS +* + END diff --git a/reference/dlaswpf.f b/reference/dlaswpf.f new file mode 100644 index 0000000000..1e83dbe73a --- /dev/null +++ b/reference/dlaswpf.f @@ -0,0 +1,120 @@ + SUBROUTINE DLASWPF( N, A, LDA, K1, K2, IPIV, INCX ) +* +* -- LAPACK auxiliary routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* June 30, 1999 +* +* .. Scalar Arguments .. + INTEGER INCX, K1, K2, LDA, N +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + DOUBLE PRECISION A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* DLASWP performs a series of row interchanges on the matrix A. +* One row interchange is initiated for each of rows K1 through K2 of A. +* +* Arguments +* ========= +* +* N (input) INTEGER +* The number of columns of the matrix A. 
+* +* A (input/output) DOUBLE PRECISION array, dimension (LDA,N) +* On entry, the matrix of column dimension N to which the row +* interchanges will be applied. +* On exit, the permuted matrix. +* +* LDA (input) INTEGER +* The leading dimension of the array A. +* +* K1 (input) INTEGER +* The first element of IPIV for which a row interchange will +* be done. +* +* K2 (input) INTEGER +* The last element of IPIV for which a row interchange will +* be done. +* +* IPIV (input) INTEGER array, dimension (M*abs(INCX)) +* The vector of pivot indices. Only the elements in positions +* K1 through K2 of IPIV are accessed. +* IPIV(K) = L implies rows K and L are to be interchanged. +* +* INCX (input) INTEGER +* The increment between successive values of IPIV. If IPIV +* is negative, the pivots are applied in reverse order. +* +* Further Details +* =============== +* +* Modified by +* R. C. Whaley, Computer Science Dept., Univ. of Tenn., Knoxville, USA +* +* ===================================================================== +* +* .. Local Scalars .. + INTEGER I, I1, I2, INC, IP, IX, IX0, J, K, N32 + DOUBLE PRECISION TEMP +* .. +* .. Executable Statements .. +* +* Interchange row I with row IPIV(I) for each of rows K1 through K2. +* + IF( INCX.GT.0 ) THEN + IX0 = K1 + I1 = K1 + I2 = K2 + INC = 1 + ELSE IF( INCX.LT.0 ) THEN + IX0 = 1 + ( 1-K2 )*INCX + I1 = K2 + I2 = K1 + INC = -1 + ELSE + RETURN + END IF +* + N32 = ( N / 32 )*32 + IF( N32.NE.0 ) THEN + DO 30 J = 1, N32, 32 + IX = IX0 + DO 20 I = I1, I2, INC + IP = IPIV( IX ) + IF( IP.NE.I ) THEN + DO 10 K = J, J + 31 + TEMP = A( I, K ) + A( I, K ) = A( IP, K ) + A( IP, K ) = TEMP + 10 CONTINUE + END IF + IX = IX + INCX + 20 CONTINUE + 30 CONTINUE + END IF + IF( N32.NE.N ) THEN + N32 = N32 + 1 + IX = IX0 + DO 50 I = I1, I2, INC + IP = IPIV( IX ) + IF( IP.NE.I ) THEN + DO 40 K = N32, N + TEMP = A( I, K ) + A( I, K ) = A( IP, K ) + A( IP, K ) = TEMP + 40 CONTINUE + END IF + IX = IX + INCX + 50 CONTINUE + END IF +* + RETURN +* +* End of DLASWP +* + END diff --git a/reference/dlauu2f.f b/reference/dlauu2f.f new file mode 100644 index 0000000000..0f957b426b --- /dev/null +++ b/reference/dlauu2f.f @@ -0,0 +1,135 @@ + SUBROUTINE DLAUU2F( UPLO, N, A, LDA, INFO ) +* +* -- LAPACK auxiliary routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* November 2006 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* DLAUU2 computes the product U * U' or L' * L, where the triangular +* factor U or L is stored in the upper or lower triangular part of +* the array A. +* +* If UPLO = 'U' or 'u' then the upper triangle of the result is stored, +* overwriting the factor U in A. +* If UPLO = 'L' or 'l' then the lower triangle of the result is stored, +* overwriting the factor L in A. +* +* This is the unblocked form of the algorithm, calling Level 2 BLAS. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* Specifies whether the triangular factor stored in the array A +* is upper or lower triangular: +* = 'U': Upper triangular +* = 'L': Lower triangular +* +* N (input) INTEGER +* The order of the triangular factor U or L. N >= 0. +* +* A (input/output) DOUBLE PRECISION array, dimension (LDA,N) +* On entry, the triangular factor U or L. 
+* On exit, if UPLO = 'U', the upper triangle of A is +* overwritten with the upper triangle of the product U * U'; +* if UPLO = 'L', the lower triangle of A is overwritten with +* the lower triangle of the product L' * L. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -k, the k-th argument had an illegal value +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ONE + PARAMETER ( ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I + DOUBLE PRECISION AII +* .. +* .. External Functions .. + LOGICAL LSAME + DOUBLE PRECISION DDOT + EXTERNAL LSAME, DDOT +* .. +* .. External Subroutines .. + EXTERNAL DGEMV, DSCAL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DLAUU2', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* + IF( UPPER ) THEN +* +* Compute the product U * U'. +* + DO 10 I = 1, N + AII = A( I, I ) + IF( I.LT.N ) THEN + A( I, I ) = DDOT( N-I+1, A( I, I ), LDA, A( I, I ), LDA ) + CALL DGEMV( 'No transpose', I-1, N-I, ONE, A( 1, I+1 ), + $ LDA, A( I, I+1 ), LDA, AII, A( 1, I ), 1 ) + ELSE + CALL DSCAL( I, AII, A( 1, I ), 1 ) + END IF + 10 CONTINUE +* + ELSE +* +* Compute the product L' * L. +* + DO 20 I = 1, N + AII = A( I, I ) + IF( I.LT.N ) THEN + A( I, I ) = DDOT( N-I+1, A( I, I ), 1, A( I, I ), 1 ) + CALL DGEMV( 'Transpose', N-I, I-1, ONE, A( I+1, 1 ), LDA, + $ A( I+1, I ), 1, AII, A( I, 1 ), LDA ) + ELSE + CALL DSCAL( I, AII, A( I, 1 ), LDA ) + END IF + 20 CONTINUE + END IF +* + RETURN +* +* End of DLAUU2 +* + END diff --git a/reference/dlauumf.f b/reference/dlauumf.f new file mode 100644 index 0000000000..c0584cc0db --- /dev/null +++ b/reference/dlauumf.f @@ -0,0 +1,155 @@ + SUBROUTINE DLAUUMF( UPLO, N, A, LDA, INFO ) +* +* -- LAPACK auxiliary routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* February 29, 1992 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* DLAUUM computes the product U * U' or L' * L, where the triangular +* factor U or L is stored in the upper or lower triangular part of +* the array A. +* +* If UPLO = 'U' or 'u' then the upper triangle of the result is stored, +* overwriting the factor U in A. +* If UPLO = 'L' or 'l' then the lower triangle of the result is stored, +* overwriting the factor L in A. +* +* This is the blocked form of the algorithm, calling Level 3 BLAS. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* Specifies whether the triangular factor stored in the array A +* is upper or lower triangular: +* = 'U': Upper triangular +* = 'L': Lower triangular +* +* N (input) INTEGER +* The order of the triangular factor U or L. N >= 0. +* +* A (input/output) DOUBLE PRECISION array, dimension (LDA,N) +* On entry, the triangular factor U or L. 
+* On exit, if UPLO = 'U', the upper triangle of A is +* overwritten with the upper triangle of the product U * U'; +* if UPLO = 'L', the lower triangle of A is overwritten with +* the lower triangle of the product L' * L. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -k, the k-th argument had an illegal value +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ONE + PARAMETER ( ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, IB, NB +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL DGEMM, DLAUU2, DSYRK, DTRMM, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DLAUUM', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* +* Determine the block size for this environment. +* + NB = 128 +* + IF( NB.LE.1 .OR. NB.GE.N ) THEN +* +* Use unblocked code +* + CALL DLAUU2( UPLO, N, A, LDA, INFO ) + ELSE +* +* Use blocked code +* + IF( UPPER ) THEN +* +* Compute the product U * U'. +* + DO 10 I = 1, N, NB + IB = MIN( NB, N-I+1 ) + CALL DTRMM( 'Right', 'Upper', 'Transpose', 'Non-unit', + $ I-1, IB, ONE, A( I, I ), LDA, A( 1, I ), + $ LDA ) + CALL DLAUU2( 'Upper', IB, A( I, I ), LDA, INFO ) + IF( I+IB.LE.N ) THEN + CALL DGEMM( 'No transpose', 'Transpose', I-1, IB, + $ N-I-IB+1, ONE, A( 1, I+IB ), LDA, + $ A( I, I+IB ), LDA, ONE, A( 1, I ), LDA ) + CALL DSYRK( 'Upper', 'No transpose', IB, N-I-IB+1, + $ ONE, A( I, I+IB ), LDA, ONE, A( I, I ), + $ LDA ) + END IF + 10 CONTINUE + ELSE +* +* Compute the product L' * L. +* + DO 20 I = 1, N, NB + IB = MIN( NB, N-I+1 ) + CALL DTRMM( 'Left', 'Lower', 'Transpose', 'Non-unit', IB, + $ I-1, ONE, A( I, I ), LDA, A( I, 1 ), LDA ) + CALL DLAUU2( 'Lower', IB, A( I, I ), LDA, INFO ) + IF( I+IB.LE.N ) THEN + CALL DGEMM( 'Transpose', 'No transpose', IB, I-1, + $ N-I-IB+1, ONE, A( I+IB, I ), LDA, + $ A( I+IB, 1 ), LDA, ONE, A( I, 1 ), LDA ) + CALL DSYRK( 'Lower', 'Transpose', IB, N-I-IB+1, ONE, + $ A( I+IB, I ), LDA, ONE, A( I, I ), LDA ) + END IF + 20 CONTINUE + END IF + END IF +* + RETURN +* +* End of DLAUUM +* + END diff --git a/reference/dmaxf.f b/reference/dmaxf.f new file mode 100644 index 0000000000..11a7322fa8 --- /dev/null +++ b/reference/dmaxf.f @@ -0,0 +1,36 @@ + REAL*8 function dmaxf(n,dx,incx) +c +c finds the index of element having max. absolute value. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + double precision dx(*) + integer i,incx,ix,n +c + dmaxf = 0 + if( n.lt.1 .or. 
incx.le.0 ) return + + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + dmaxf = dx(1) + ix = ix + incx + do 10 i = 2,n + if(dx(ix).le.dmaxf) go to 5 + dmaxf = dx(ix) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 dmaxf = dx(1) + do 30 i = 2,n + if(dx(i).le.dmaxf) go to 30 + dmaxf = dx(i) + 30 continue + return + end diff --git a/reference/dminf.f b/reference/dminf.f new file mode 100644 index 0000000000..497fb533de --- /dev/null +++ b/reference/dminf.f @@ -0,0 +1,36 @@ + REAL*8 function dminf(n,dx,incx) +c +c finds the index of element having min. absolute value. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + double precision dx(*) + integer i,incx,ix,n +c + dminf = 0 + if( n.lt.1 .or. incx.le.0 ) return + + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + dminf = dx(1) + ix = ix + incx + do 10 i = 2,n + if(dx(ix).ge.dminf) go to 5 + dminf = dx(ix) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 dminf = dx(1) + do 30 i = 2,n + if(dx(i).ge.dminf) go to 30 + dminf = dx(i) + 30 continue + return + end diff --git a/reference/dnrm2f.f b/reference/dnrm2f.f new file mode 100644 index 0000000000..2a4b6f2e90 --- /dev/null +++ b/reference/dnrm2f.f @@ -0,0 +1,61 @@ + DOUBLE PRECISION FUNCTION DNRM2F ( N, X, INCX ) +* .. Scalar Arguments .. + INTEGER INCX, N +* .. Array Arguments .. + DOUBLE PRECISION X( * ) +* .. +* +* DNRM2 returns the euclidean norm of a vector via the function +* name, so that +* +* DNRM2 := sqrt( x'*x ) +* +* +* +* -- This version written on 25-October-1982. +* Modified on 14-October-1993 to inline the call to DLASSQ. +* Sven Hammarling, Nag Ltd. +* +* +* .. Parameters .. + DOUBLE PRECISION ONE , ZERO + PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) +* .. Local Scalars .. + INTEGER IX + DOUBLE PRECISION ABSXI, NORM, SCALE, SSQ +* .. Intrinsic Functions .. + INTRINSIC ABS, SQRT +* .. +* .. Executable Statements .. + + IF( N.LT.1 .OR. INCX.LT.1 )THEN + NORM = ZERO + ELSE IF( N.EQ.1 )THEN + NORM = ABS( X( 1 ) ) + ELSE + SCALE = ZERO + SSQ = ONE +* The following loop is equivalent to this call to the LAPACK +* auxiliary routine: +* CALL DLASSQ( N, X, INCX, SCALE, SSQ ) +* + DO 10, IX = 1, 1 + ( N - 1 )*INCX, INCX + IF( X( IX ).NE.ZERO )THEN + ABSXI = ABS( X( IX ) ) + IF( SCALE.LT.ABSXI )THEN + SSQ = ONE + SSQ*( SCALE/ABSXI )**2 + SCALE = ABSXI + ELSE + SSQ = SSQ + ( ABSXI/SCALE )**2 + END IF + END IF + 10 CONTINUE + NORM = SCALE * SQRT( SSQ ) + END IF +* + DNRM2F = NORM + RETURN +* +* End of DNRM2. +* + END diff --git a/reference/dpotf2f.f b/reference/dpotf2f.f new file mode 100644 index 0000000000..932726361b --- /dev/null +++ b/reference/dpotf2f.f @@ -0,0 +1,168 @@ + SUBROUTINE DPOTF2F( UPLO, N, A, LDA, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* February 29, 1992 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* DPOTF2 computes the Cholesky factorization of a real symmetric +* positive definite matrix A. +* +* The factorization has the form +* A = U' * U , if UPLO = 'U', or +* A = L * L', if UPLO = 'L', +* where U is an upper triangular matrix and L is lower triangular. 
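For instance, taking the symmetric positive definite 2 by 2 matrix with a11 = 4, a12 = a21 = 2, a22 = 3 (an illustrative assumption) and UPLO = 'U', the factor has u11 = 2, u12 = 1, u22 = sqrt(2), and U'*U reproduces A. A minimal call sketch under the same assumptions:

      PROGRAM TPOTF2
      INTEGER INFO
      DOUBLE PRECISION A(2,2)
      DATA A /4.0D0, 2.0D0, 2.0D0, 3.0D0/
      CALL DPOTF2F( 'U', 2, A, 2, INFO )
*     INFO = 0; the upper triangle of A now holds U:
*     A(1,1) = 2, A(1,2) = 1, A(2,2) = SQRT(2.0D0)
      PRINT *, INFO, A(1,1), A(1,2), A(2,2)
      END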
+* +* This is the unblocked version of the algorithm, calling Level 2 BLAS. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* Specifies whether the upper or lower triangular part of the +* symmetric matrix A is stored. +* = 'U': Upper triangular +* = 'L': Lower triangular +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* A (input/output) DOUBLE PRECISION array, dimension (LDA,N) +* On entry, the symmetric matrix A. If UPLO = 'U', the leading +* n by n upper triangular part of A contains the upper +* triangular part of the matrix A, and the strictly lower +* triangular part of A is not referenced. If UPLO = 'L', the +* leading n by n lower triangular part of A contains the lower +* triangular part of the matrix A, and the strictly upper +* triangular part of A is not referenced. +* +* On exit, if INFO = 0, the factor U or L from the Cholesky +* factorization A = U'*U or A = L*L'. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -k, the k-th argument had an illegal value +* > 0: if INFO = k, the leading minor of order k is not +* positive definite, and the factorization could not be +* completed. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ONE, ZERO + PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL UPPER + INTEGER J + DOUBLE PRECISION AJJ +* .. +* .. External Functions .. + LOGICAL LSAME + DOUBLE PRECISION DDOT + EXTERNAL LSAME, DDOT +* .. +* .. External Subroutines .. + EXTERNAL DGEMV, DSCAL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, SQRT +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DPOTF2', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* + IF( UPPER ) THEN +* +* Compute the Cholesky factorization A = U'*U. +* + DO 10 J = 1, N +* +* Compute U(J,J) and test for non-positive-definiteness. +* + AJJ = A( J, J ) - DDOT( J-1, A( 1, J ), 1, A( 1, J ), 1 ) + IF( AJJ.LE.ZERO ) THEN + A( J, J ) = AJJ + GO TO 30 + END IF + AJJ = SQRT( AJJ ) + A( J, J ) = AJJ +* +* Compute elements J+1:N of row J. +* + IF( J.LT.N ) THEN + CALL DGEMV( 'Transpose', J-1, N-J, -ONE, A( 1, J+1 ), + $ LDA, A( 1, J ), 1, ONE, A( J, J+1 ), LDA ) + CALL DSCAL( N-J, ONE / AJJ, A( J, J+1 ), LDA ) + END IF + 10 CONTINUE + ELSE +* +* Compute the Cholesky factorization A = L*L'. +* + DO 20 J = 1, N +* +* Compute L(J,J) and test for non-positive-definiteness. +* + AJJ = A( J, J ) - DDOT( J-1, A( J, 1 ), LDA, A( J, 1 ), + $ LDA ) + IF( AJJ.LE.ZERO ) THEN + A( J, J ) = AJJ + GO TO 30 + END IF + AJJ = SQRT( AJJ ) + A( J, J ) = AJJ +* +* Compute elements J+1:N of column J. 
+* + IF( J.LT.N ) THEN + CALL DGEMV( 'No transpose', N-J, J-1, -ONE, A( J+1, 1 ), + $ LDA, A( J, 1 ), LDA, ONE, A( J+1, J ), 1 ) + CALL DSCAL( N-J, ONE / AJJ, A( J+1, J ), 1 ) + END IF + 20 CONTINUE + END IF + GO TO 40 +* + 30 CONTINUE + INFO = J +* + 40 CONTINUE + RETURN +* +* End of DPOTF2 +* + END diff --git a/reference/dpotrff.f b/reference/dpotrff.f new file mode 100644 index 0000000000..10faf05570 --- /dev/null +++ b/reference/dpotrff.f @@ -0,0 +1,184 @@ + SUBROUTINE DPOTRFF( UPLO, N, A, LDA, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* March 31, 1993 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* DPOTRF computes the Cholesky factorization of a real symmetric +* positive definite matrix A. +* +* The factorization has the form +* A = U**T * U, if UPLO = 'U', or +* A = L * L**T, if UPLO = 'L', +* where U is an upper triangular matrix and L is lower triangular. +* +* This is the block version of the algorithm, calling Level 3 BLAS. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* = 'U': Upper triangle of A is stored; +* = 'L': Lower triangle of A is stored. +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* A (input/output) DOUBLE PRECISION array, dimension (LDA,N) +* On entry, the symmetric matrix A. If UPLO = 'U', the leading +* N-by-N upper triangular part of A contains the upper +* triangular part of the matrix A, and the strictly lower +* triangular part of A is not referenced. If UPLO = 'L', the +* leading N-by-N lower triangular part of A contains the lower +* triangular part of the matrix A, and the strictly upper +* triangular part of A is not referenced. +* +* On exit, if INFO = 0, the factor U or L from the Cholesky +* factorization A = U**T*U or A = L*L**T. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, the leading minor of order i is not +* positive definite, and the factorization could not be +* completed. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ONE + PARAMETER ( ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL UPPER + INTEGER J, JB, NB +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL DGEMM, DPOTF2, DSYRK, DTRSM, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DPOTRF', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* +* Determine the block size for this environment. +* + NB = 224 + + IF( NB.LE.1 .OR. NB.GE.N ) THEN +* +* Use unblocked code. +* + CALL DPOTF2( UPLO, N, A, LDA, INFO ) + ELSE +* +* Use blocked code. +* + IF( UPPER ) THEN +* +* Compute the Cholesky factorization A = U'*U. +* + DO 10 J = 1, N, NB +* +* Update and factorize the current diagonal block and test +* for non-positive-definiteness. 
+* + JB = MIN( NB, N-J+1 ) + CALL DSYRK( 'Upper', 'Transpose', JB, J-1, -ONE, + $ A( 1, J ), LDA, ONE, A( J, J ), LDA ) + CALL DPOTF2( 'Upper', JB, A( J, J ), LDA, INFO ) + IF( INFO.NE.0 ) + $ GO TO 30 + IF( J+JB.LE.N ) THEN +* +* Compute the current block row. +* + CALL DGEMM( 'Transpose', 'No transpose', JB, N-J-JB+1, + $ J-1, -ONE, A( 1, J ), LDA, A( 1, J+JB ), + $ LDA, ONE, A( J, J+JB ), LDA ) + CALL DTRSM( 'Left', 'Upper', 'Transpose', 'Non-unit', + $ JB, N-J-JB+1, ONE, A( J, J ), LDA, + $ A( J, J+JB ), LDA ) + END IF + 10 CONTINUE +* + ELSE +* +* Compute the Cholesky factorization A = L*L'. +* + DO 20 J = 1, N, NB +* +* Update and factorize the current diagonal block and test +* for non-positive-definiteness. +* + JB = MIN( NB, N-J+1 ) + CALL DSYRK( 'Lower', 'No transpose', JB, J-1, -ONE, + $ A( J, 1 ), LDA, ONE, A( J, J ), LDA ) + CALL DPOTF2( 'Lower', JB, A( J, J ), LDA, INFO ) + IF( INFO.NE.0 ) + $ GO TO 30 + IF( J+JB.LE.N ) THEN +* +* Compute the current block column. +* + CALL DGEMM( 'No transpose', 'Transpose', N-J-JB+1, JB, + $ J-1, -ONE, A( J+JB, 1 ), LDA, A( J, 1 ), + $ LDA, ONE, A( J+JB, J ), LDA ) + CALL DTRSM( 'Right', 'Lower', 'Transpose', 'Non-unit', + $ N-J-JB+1, JB, ONE, A( J, J ), LDA, + $ A( J+JB, J ), LDA ) + END IF + 20 CONTINUE + END IF + END IF + GO TO 40 +* + 30 CONTINUE + INFO = INFO + J - 1 +* + 40 CONTINUE + RETURN +* +* End of DPOTRF +* + END diff --git a/reference/dpotrif.f b/reference/dpotrif.f new file mode 100644 index 0000000000..2027042439 --- /dev/null +++ b/reference/dpotrif.f @@ -0,0 +1,96 @@ + SUBROUTINE DPOTRIF( UPLO, N, A, LDA, INFO ) +* +* -- LAPACK routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* November 2006 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* DPOTRI computes the inverse of a real symmetric positive definite +* matrix A using the Cholesky factorization A = U**T*U or A = L*L**T +* computed by DPOTRF. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* = 'U': Upper triangle of A is stored; +* = 'L': Lower triangle of A is stored. +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* A (input/output) DOUBLE PRECISION array, dimension (LDA,N) +* On entry, the triangular factor U or L from the Cholesky +* factorization A = U**T*U or A = L*L**T, as computed by +* DPOTRF. +* On exit, the upper or lower triangle of the (symmetric) +* inverse of A, overwriting the input factor U or L. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, the (i,i) element of the factor U or L is +* zero, and the inverse could not be computed. +* +* ===================================================================== +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL DLAUUM, DTRTRI, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( .NOT.LSAME( UPLO, 'U' ) .AND. 
.NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DPOTRI', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* +* Invert the triangular Cholesky factor U or L. +* + CALL DTRTRI( UPLO, 'Non-unit', N, A, LDA, INFO ) + IF( INFO.GT.0 ) + $ RETURN +* +* Form inv(U)*inv(U)' or inv(L)'*inv(L). +* + CALL DLAUUM( UPLO, N, A, LDA, INFO ) +* + RETURN +* +* End of DPOTRI +* + END diff --git a/reference/drotf.f b/reference/drotf.f new file mode 100644 index 0000000000..70525ad7d9 --- /dev/null +++ b/reference/drotf.f @@ -0,0 +1,37 @@ + subroutine drotf (n,dx,incx,dy,incy,c,s) +c +c applies a plane rotation. +c jack dongarra, linpack, 3/11/78. +c modified 12/3/93, array(1) declarations changed to array(*) +c + double precision dx(*),dy(*),dtemp,c,s + integer i,incx,incy,ix,iy,n +c + if(n.le.0)return + if(incx.eq.1.and.incy.eq.1)go to 20 +c +c code for unequal increments or equal increments not equal +c to 1 +c + ix = 1 + iy = 1 + if(incx.lt.0)ix = (-n+1)*incx + 1 + if(incy.lt.0)iy = (-n+1)*incy + 1 + do 10 i = 1,n + dtemp = c*dx(ix) + s*dy(iy) + dy(iy) = c*dy(iy) - s*dx(ix) + dx(ix) = dtemp + ix = ix + incx + iy = iy + incy + 10 continue + return +c +c code for both increments equal to 1 +c + 20 do 30 i = 1,n + dtemp = c*dx(i) + s*dy(i) + dy(i) = c*dy(i) - s*dx(i) + dx(i) = dtemp + 30 continue + return + end diff --git a/reference/drotgf.f b/reference/drotgf.f new file mode 100644 index 0000000000..265a7cd738 --- /dev/null +++ b/reference/drotgf.f @@ -0,0 +1,27 @@ + subroutine drotgf(da,db,c,s) +c +c construct givens plane rotation. +c jack dongarra, linpack, 3/11/78. +c + double precision da,db,c,s,roe,scale,r,z +c + roe = db + if( dabs(da) .gt. dabs(db) ) roe = da + scale = dabs(da) + dabs(db) + if( scale .ne. 0.0d0 ) go to 10 + c = 1.0d0 + s = 0.0d0 + r = 0.0d0 + z = 0.0d0 + go to 20 + 10 r = scale*dsqrt((da/scale)**2 + (db/scale)**2) + r = dsign(1.0d0,roe)*r + c = da/r + s = db/r + z = 1.0d0 + if( dabs(da) .gt. dabs(db) ) z = s + if( dabs(db) .ge. dabs(da) .and. c .ne. 0.0d0 ) z = 1.0d0/c + 20 da = r + db = z + return + end diff --git a/reference/drotmf.f b/reference/drotmf.f new file mode 100644 index 0000000000..7447680e3c --- /dev/null +++ b/reference/drotmf.f @@ -0,0 +1,108 @@ + SUBROUTINE DROTMF (N,DX,INCX,DY,INCY,DPARAM) +C +C APPLY THE MODIFIED GIVENS TRANSFORMATION, H, TO THE 2 BY N MATRIX +C +C (DX**T) , WHERE **T INDICATES TRANSPOSE. THE ELEMENTS OF DX ARE IN +C (DY**T) +C +C DX(LX+I*INCX), I = 0 TO N-1, WHERE LX = 1 IF INCX .GE. 0, ELSE +C LX = (-INCX)*N, AND SIMILARLY FOR SY USING LY AND INCY. +C WITH DPARAM(1)=DFLAG, H HAS ONE OF THE FOLLOWING FORMS.. +C +C DFLAG=-1.D0 DFLAG=0.D0 DFLAG=1.D0 DFLAG=-2.D0 +C +C (DH11 DH12) (1.D0 DH12) (DH11 1.D0) (1.D0 0.D0) +C H=( ) ( ) ( ) ( ) +C (DH21 DH22), (DH21 1.D0), (-1.D0 DH22), (0.D0 1.D0). +C SEE DROTMG FOR A DESCRIPTION OF DATA STORAGE IN DPARAM. +C + DOUBLE PRECISION DFLAG,DH12,DH22,DX,TWO,Z,DH11,DH21, + 1 DPARAM,DY,W,ZERO + DIMENSION DX(1),DY(1),DPARAM(5) + DATA ZERO,TWO/0.D0,2.D0/ +C + DFLAG=DPARAM(1) + IF(N .LE. 0 .OR.(DFLAG+TWO.EQ.ZERO)) GO TO 140 + IF(.NOT.(INCX.EQ.INCY.AND. 
INCX .GT.0)) GO TO 70 +C + NSTEPS=N*INCX + IF(DFLAG) 50,10,30 + 10 CONTINUE + DH12=DPARAM(4) + DH21=DPARAM(3) + DO 20 I=1,NSTEPS,INCX + W=DX(I) + Z=DY(I) + DX(I)=W+Z*DH12 + DY(I)=W*DH21+Z + 20 CONTINUE + GO TO 140 + 30 CONTINUE + DH11=DPARAM(2) + DH22=DPARAM(5) + DO 40 I=1,NSTEPS,INCX + W=DX(I) + Z=DY(I) + DX(I)=W*DH11+Z + DY(I)=-W+DH22*Z + 40 CONTINUE + GO TO 140 + 50 CONTINUE + DH11=DPARAM(2) + DH12=DPARAM(4) + DH21=DPARAM(3) + DH22=DPARAM(5) + DO 60 I=1,NSTEPS,INCX + W=DX(I) + Z=DY(I) + DX(I)=W*DH11+Z*DH12 + DY(I)=W*DH21+Z*DH22 + 60 CONTINUE + GO TO 140 + 70 CONTINUE + KX=1 + KY=1 + IF(INCX .LT. 0) KX=1+(1-N)*INCX + IF(INCY .LT. 0) KY=1+(1-N)*INCY +C + IF(DFLAG)120,80,100 + 80 CONTINUE + DH12=DPARAM(4) + DH21=DPARAM(3) + DO 90 I=1,N + W=DX(KX) + Z=DY(KY) + DX(KX)=W+Z*DH12 + DY(KY)=W*DH21+Z + KX=KX+INCX + KY=KY+INCY + 90 CONTINUE + GO TO 140 + 100 CONTINUE + DH11=DPARAM(2) + DH22=DPARAM(5) + DO 110 I=1,N + W=DX(KX) + Z=DY(KY) + DX(KX)=W*DH11+Z + DY(KY)=-W+DH22*Z + KX=KX+INCX + KY=KY+INCY + 110 CONTINUE + GO TO 140 + 120 CONTINUE + DH11=DPARAM(2) + DH12=DPARAM(4) + DH21=DPARAM(3) + DH22=DPARAM(5) + DO 130 I=1,N + W=DX(KX) + Z=DY(KY) + DX(KX)=W*DH11+Z*DH12 + DY(KY)=W*DH21+Z*DH22 + KX=KX+INCX + KY=KY+INCY + 130 CONTINUE + 140 CONTINUE + RETURN + END diff --git a/reference/drotmgf.f b/reference/drotmgf.f new file mode 100644 index 0000000000..bc9c03eeaf --- /dev/null +++ b/reference/drotmgf.f @@ -0,0 +1,169 @@ + SUBROUTINE DROTMGF (DD1,DD2,DX1,DY1,DPARAM) +C +C CONSTRUCT THE MODIFIED GIVENS TRANSFORMATION MATRIX H WHICH ZEROS +C THE SECOND COMPONENT OF THE 2-VECTOR (DSQRT(DD1)*DX1,DSQRT(DD2)* +C DY2)**T. +C WITH DPARAM(1)=DFLAG, H HAS ONE OF THE FOLLOWING FORMS.. +C +C DFLAG=-1.D0 DFLAG=0.D0 DFLAG=1.D0 DFLAG=-2.D0 +C +C (DH11 DH12) (1.D0 DH12) (DH11 1.D0) (1.D0 0.D0) +C H=( ) ( ) ( ) ( ) +C (DH21 DH22), (DH21 1.D0), (-1.D0 DH22), (0.D0 1.D0). +C LOCATIONS 2-4 OF DPARAM CONTAIN DH11, DH21, DH12, AND DH22 +C RESPECTIVELY. (VALUES OF 1.D0, -1.D0, OR 0.D0 IMPLIED BY THE +C VALUE OF DPARAM(1) ARE NOT STORED IN DPARAM.) +C +C THE VALUES OF GAMSQ AND RGAMSQ SET IN THE DATA STATEMENT MAY BE +C INEXACT. THIS IS OK AS THEY ARE ONLY USED FOR TESTING THE SIZE +C OF DD1 AND DD2. ALL ACTUAL SCALING OF DATA IS DONE USING GAM. +C + DOUBLE PRECISION GAM,ONE,RGAMSQ,DD2,DH11,DH21,DPARAM,DP2, + 1 DQ2,DU,DY1,ZERO,GAMSQ,DD1,DFLAG,DH12,DH22,DP1,DQ1, + 2 DTEMP,DX1,TWO + DIMENSION DPARAM(5) +C + DATA ZERO,ONE,TWO /0.D0,1.D0,2.D0/ + DATA GAM,GAMSQ,RGAMSQ/4096.D0,16777216.D0,5.9604645D-8/ + IF(.NOT. DD1 .LT. ZERO) GO TO 10 +C GO ZERO-H-D-AND-DX1.. + GO TO 60 + 10 CONTINUE +C CASE-DD1-NONNEGATIVE + DP2=DD2*DY1 + IF(.NOT. DP2 .EQ. ZERO) GO TO 20 + DFLAG=-TWO + GO TO 260 +C REGULAR-CASE.. + 20 CONTINUE + DP1=DD1*DX1 + DQ2=DP2*DY1 + DQ1=DP1*DX1 +C + IF(.NOT. DABS(DQ1) .GT. DABS(DQ2)) GO TO 40 + DH21=-DY1/DX1 + DH12=DP2/DP1 +C + DU=ONE-DH12*DH21 +C + IF(.NOT. DU .LE. ZERO) GO TO 30 +C GO ZERO-H-D-AND-DX1.. + GO TO 60 + 30 CONTINUE + DFLAG=ZERO + DD1=DD1/DU + DD2=DD2/DU + DX1=DX1*DU +C GO SCALE-CHECK.. + GO TO 100 + 40 CONTINUE + IF(.NOT. DQ2 .LT. ZERO) GO TO 50 +C GO ZERO-H-D-AND-DX1.. + GO TO 60 + 50 CONTINUE + DFLAG=ONE + DH11=DP1/DP2 + DH22=DX1/DY1 + DU=ONE+DH11*DH22 + DTEMP=DD2/DU + DD2=DD1/DU + DD1=DTEMP + DX1=DY1*DU +C GO SCALE-CHECK + GO TO 100 +C PROCEDURE..ZERO-H-D-AND-DX1.. + 60 CONTINUE + DFLAG=-ONE + DH11=ZERO + DH12=ZERO + DH21=ZERO + DH22=ZERO +C + DD1=ZERO + DD2=ZERO + DX1=ZERO +C RETURN.. + GO TO 220 +C PROCEDURE..FIX-H.. + 70 CONTINUE + IF(.NOT. DFLAG .GE. ZERO) GO TO 90 +C + IF(.NOT. DFLAG .EQ. 
ZERO) GO TO 80 + DH11=ONE + DH22=ONE + DFLAG=-ONE + GO TO 90 + 80 CONTINUE + DH21=-ONE + DH12=ONE + DFLAG=-ONE + 90 CONTINUE + GO TO IGO,(120,150,180,210) +C PROCEDURE..SCALE-CHECK + 100 CONTINUE + 110 CONTINUE + IF(.NOT. DD1 .LE. RGAMSQ) GO TO 130 + IF(DD1 .EQ. ZERO) GO TO 160 + ASSIGN 120 TO IGO +C FIX-H.. + GO TO 70 + 120 CONTINUE + DD1=DD1*GAM**2 + DX1=DX1/GAM + DH11=DH11/GAM + DH12=DH12/GAM + GO TO 110 + 130 CONTINUE + 140 CONTINUE + IF(.NOT. DD1 .GE. GAMSQ) GO TO 160 + ASSIGN 150 TO IGO +C FIX-H.. + GO TO 70 + 150 CONTINUE + DD1=DD1/GAM**2 + DX1=DX1*GAM + DH11=DH11*GAM + DH12=DH12*GAM + GO TO 140 + 160 CONTINUE + 170 CONTINUE + IF(.NOT. DABS(DD2) .LE. RGAMSQ) GO TO 190 + IF(DD2 .EQ. ZERO) GO TO 220 + ASSIGN 180 TO IGO +C FIX-H.. + GO TO 70 + 180 CONTINUE + DD2=DD2*GAM**2 + DH21=DH21/GAM + DH22=DH22/GAM + GO TO 170 + 190 CONTINUE + 200 CONTINUE + IF(.NOT. DABS(DD2) .GE. GAMSQ) GO TO 220 + ASSIGN 210 TO IGO +C FIX-H.. + GO TO 70 + 210 CONTINUE + DD2=DD2/GAM**2 + DH21=DH21*GAM + DH22=DH22*GAM + GO TO 200 + 220 CONTINUE + IF(DFLAG)250,230,240 + 230 CONTINUE + DPARAM(3)=DH21 + DPARAM(4)=DH12 + GO TO 260 + 240 CONTINUE + DPARAM(2)=DH11 + DPARAM(5)=DH22 + GO TO 260 + 250 CONTINUE + DPARAM(2)=DH11 + DPARAM(3)=DH21 + DPARAM(4)=DH12 + DPARAM(5)=DH22 + 260 CONTINUE + DPARAM(1)=DFLAG + RETURN + END diff --git a/reference/dsbmvf.f b/reference/dsbmvf.f new file mode 100644 index 0000000000..7a882a3f81 --- /dev/null +++ b/reference/dsbmvf.f @@ -0,0 +1,303 @@ + SUBROUTINE DSBMVF( UPLO, N, K, ALPHA, A, LDA, X, INCX, + $ BETA, Y, INCY ) +* .. Scalar Arguments .. + DOUBLE PRECISION ALPHA, BETA + INTEGER INCX, INCY, K, LDA, N + CHARACTER*1 UPLO +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* DSBMV performs the matrix-vector operation +* +* y := alpha*A*x + beta*y, +* +* where alpha and beta are scalars, x and y are n element vectors and +* A is an n by n symmetric band matrix, with k super-diagonals. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the band matrix A is being supplied as +* follows: +* +* UPLO = 'U' or 'u' The upper triangular part of A is +* being supplied. +* +* UPLO = 'L' or 'l' The lower triangular part of A is +* being supplied. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry, K specifies the number of super-diagonals of the +* matrix A. K must satisfy 0 .le. K. +* Unchanged on exit. +* +* ALPHA - DOUBLE PRECISION. +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) +* by n part of the array A must contain the upper triangular +* band part of the symmetric matrix, supplied column by +* column, with the leading diagonal of the matrix in row +* ( k + 1 ) of the array, the first super-diagonal starting at +* position 2 in row k, and so on. The top left k by k triangle +* of the array A is not referenced. 
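To make the band layout concrete, here is a small illustrative driver (not part of the imported file; names and data are arbitrary) that stores the N = 4, K = 1 symmetric tridiagonal matrix with 2 on the diagonal and -1 on the off-diagonals in upper band form and multiplies it by a vector of ones; the expected result is y = (1, 0, 0, 1).

      PROGRAM TSBMV
*     Upper band storage with K = 1: row 2 of AB holds the diagonal,
*     row 1 the single super-diagonal (AB(1,1) is unused).
      INTEGER          J
      DOUBLE PRECISION AB( 2, 4 ), X( 4 ), Y( 4 )
      DO 10 J = 1, 4
         AB( 2, J ) = 2.0D+0
         AB( 1, J ) = -1.0D+0
         X( J ) = 1.0D+0
         Y( J ) = 0.0D+0
   10 CONTINUE
*     y := 1*A*x + 0*y.
      CALL DSBMVF( 'U', 4, 1, 1.0D+0, AB, 2, X, 1, 0.0D+0, Y, 1 )
      WRITE( *, * ) Y
      END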
+* The following program segment will transfer the upper +* triangular part of a symmetric band matrix from conventional +* full matrix storage to band storage: +* +* DO 20, J = 1, N +* M = K + 1 - J +* DO 10, I = MAX( 1, J - K ), J +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) +* by n part of the array A must contain the lower triangular +* band part of the symmetric matrix, supplied column by +* column, with the leading diagonal of the matrix in row 1 of +* the array, the first sub-diagonal starting at position 1 in +* row 2, and so on. The bottom right k by k triangle of the +* array A is not referenced. +* The following program segment will transfer the lower +* triangular part of a symmetric band matrix from conventional +* full matrix storage to band storage: +* +* DO 20, J = 1, N +* M = 1 - J +* DO 10, I = J, MIN( N, J + K ) +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* ( k + 1 ). +* Unchanged on exit. +* +* X - DOUBLE PRECISION array of DIMENSION at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the +* vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA - DOUBLE PRECISION. +* On entry, BETA specifies the scalar beta. +* Unchanged on exit. +* +* Y - DOUBLE PRECISION array of DIMENSION at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the +* vector y. On exit, Y is overwritten by the updated vector y. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + DOUBLE PRECISION ONE , ZERO + PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) +* .. Local Scalars .. + DOUBLE PRECISION TEMP1, TEMP2 + INTEGER I, INFO, IX, IY, J, JX, JY, KPLUS1, KX, KY, L +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( K.LT.0 )THEN + INFO = 3 + ELSE IF( LDA.LT.( K + 1 ) )THEN + INFO = 6 + ELSE IF( INCX.EQ.0 )THEN + INFO = 8 + ELSE IF( INCY.EQ.0 )THEN + INFO = 11 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'DSBMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* Set up the start points in X and Y. +* + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( N - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( N - 1 )*INCY + END IF +* +* Start the operations. In this version the elements of the array A +* are accessed sequentially with one pass through A. +* +* First form y := beta*y. 
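The start-point rule used just above (KX = 1 - (N-1)*INCX when INCX is negative) means that a vector passed with a negative increment is simply stored backwards in memory; a tiny stand-alone illustration (not part of the imported file):

      PROGRAM TINCX
*     With N = 3 and INCX = -2, KX = 1 - (N-1)*INCX = 5, so the
*     logical x(1), x(2), x(3) are read from X(5), X(3), X(1).
      INTEGER          I, INCX, IX, KX, N
      DOUBLE PRECISION X( 5 )
      DATA X / 30.0D+0, 0.0D+0, 20.0D+0, 0.0D+0, 10.0D+0 /
      N    = 3
      INCX = -2
      KX   = 1 - ( N - 1 )*INCX
      IX   = KX
      DO 10 I = 1, N
         WRITE( *, * ) 'x(', I, ') = ', X( IX )
         IX = IX + INCX
   10 CONTINUE
      END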
+* + IF( BETA.NE.ONE )THEN + IF( INCY.EQ.1 )THEN + IF( BETA.EQ.ZERO )THEN + DO 10, I = 1, N + Y( I ) = ZERO + 10 CONTINUE + ELSE + DO 20, I = 1, N + Y( I ) = BETA*Y( I ) + 20 CONTINUE + END IF + ELSE + IY = KY + IF( BETA.EQ.ZERO )THEN + DO 30, I = 1, N + Y( IY ) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40, I = 1, N + Y( IY ) = BETA*Y( IY ) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF( ALPHA.EQ.ZERO ) + $ RETURN + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form y when upper triangle of A is stored. +* + KPLUS1 = K + 1 + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 60, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + L = KPLUS1 - J + DO 50, I = MAX( 1, J - K ), J - 1 + Y( I ) = Y( I ) + TEMP1*A( L + I, J ) + TEMP2 = TEMP2 + A( L + I, J )*X( I ) + 50 CONTINUE + Y( J ) = Y( J ) + TEMP1*A( KPLUS1, J ) + ALPHA*TEMP2 + 60 CONTINUE + ELSE + JX = KX + JY = KY + DO 80, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + IX = KX + IY = KY + L = KPLUS1 - J + DO 70, I = MAX( 1, J - K ), J - 1 + Y( IY ) = Y( IY ) + TEMP1*A( L + I, J ) + TEMP2 = TEMP2 + A( L + I, J )*X( IX ) + IX = IX + INCX + IY = IY + INCY + 70 CONTINUE + Y( JY ) = Y( JY ) + TEMP1*A( KPLUS1, J ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + IF( J.GT.K )THEN + KX = KX + INCX + KY = KY + INCY + END IF + 80 CONTINUE + END IF + ELSE +* +* Form y when lower triangle of A is stored. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 100, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + Y( J ) = Y( J ) + TEMP1*A( 1, J ) + L = 1 - J + DO 90, I = J + 1, MIN( N, J + K ) + Y( I ) = Y( I ) + TEMP1*A( L + I, J ) + TEMP2 = TEMP2 + A( L + I, J )*X( I ) + 90 CONTINUE + Y( J ) = Y( J ) + ALPHA*TEMP2 + 100 CONTINUE + ELSE + JX = KX + JY = KY + DO 120, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + Y( JY ) = Y( JY ) + TEMP1*A( 1, J ) + L = 1 - J + IX = JX + IY = JY + DO 110, I = J + 1, MIN( N, J + K ) + IX = IX + INCX + IY = IY + INCY + Y( IY ) = Y( IY ) + TEMP1*A( L + I, J ) + TEMP2 = TEMP2 + A( L + I, J )*X( IX ) + 110 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + 120 CONTINUE + END IF + END IF +* + RETURN +* +* End of DSBMV . +* + END diff --git a/reference/dscalf.f b/reference/dscalf.f new file mode 100644 index 0000000000..84d88987d2 --- /dev/null +++ b/reference/dscalf.f @@ -0,0 +1,43 @@ + subroutine dscalf(n,da,dx,incx) +c +c scales a vector by a constant. +c uses unrolled loops for increment equal to one. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + double precision da,dx(*) + integer i,incx,m,mp1,n,nincx +c + if( n.le.0 .or. incx.le.0 )return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + nincx = n*incx + do 10 i = 1,nincx,incx + dx(i) = da*dx(i) + 10 continue + return +c +c code for increment equal to 1 +c +c +c clean-up loop +c + 20 m = mod(n,5) + if( m .eq. 0 ) go to 40 + do 30 i = 1,m + dx(i) = da*dx(i) + 30 continue + if( n .lt. 
5 ) return + 40 mp1 = m + 1 + do 50 i = mp1,n,5 + dx(i) = da*dx(i) + dx(i + 1) = da*dx(i + 1) + dx(i + 2) = da*dx(i + 2) + dx(i + 3) = da*dx(i + 3) + dx(i + 4) = da*dx(i + 4) + 50 continue + return + end diff --git a/reference/dsdotf.f b/reference/dsdotf.f new file mode 100644 index 0000000000..d4e183e343 --- /dev/null +++ b/reference/dsdotf.f @@ -0,0 +1,74 @@ +*DECK DSDOT + DOUBLE PRECISION FUNCTION DSDOTF (N, SX, INCX, SY, INCY) +C***BEGIN PROLOGUE DSDOT +C***PURPOSE Compute the inner product of two vectors with extended +C precision accumulation and result. +C***LIBRARY SLATEC (BLAS) +C***CATEGORY D1A4 +C***TYPE DOUBLE PRECISION (DSDOT-D, DCDOT-C) +C***KEYWORDS BLAS, COMPLEX VECTORS, DOT PRODUCT, INNER PRODUCT, +C LINEAR ALGEBRA, VECTOR +C***AUTHOR Lawson, C. L., (JPL) +C Hanson, R. J., (SNLA) +C Kincaid, D. R., (U. of Texas) +C Krogh, F. T., (JPL) +C***DESCRIPTION +C +C B L A S Subprogram +C Description of Parameters +C +C --Input-- +C N number of elements in input vector(s) +C SX single precision vector with N elements +C INCX storage spacing between elements of SX +C SY single precision vector with N elements +C INCY storage spacing between elements of SY +C +C --Output-- +C DSDOT double precision dot product (zero if N.LE.0) +C +C Returns D.P. dot product accumulated in D.P., for S.P. SX and SY +C DSDOT = sum for I = 0 to N-1 of SX(LX+I*INCX) * SY(LY+I*INCY), +C where LX = 1 if INCX .GE. 0, else LX = 1+(1-N)*INCX, and LY is +C defined in a similar way using INCY. +C +C***REFERENCES C. L. Lawson, R. J. Hanson, D. R. Kincaid and F. T. +C Krogh, Basic linear algebra subprograms for Fortran +C usage, Algorithm No. 539, Transactions on Mathematical +C Software 5, 3 (September 1979), pp. 308-323. +C***ROUTINES CALLED (NONE) +C***REVISION HISTORY (YYMMDD) +C 791001 DATE WRITTEN +C 890831 Modified array declarations. (WRB) +C 890831 REVISION DATE from Version 3.2 +C 891214 Prologue converted to Version 4.0 format. (BAB) +C 920310 Corrected definition of LX in DESCRIPTION. (WRB) +C 920501 Reformatted the REFERENCES section. (WRB) +C***END PROLOGUE DSDOT + REAL SX(*),SY(*) +C***FIRST EXECUTABLE STATEMENT DSDOT + DSDOTF = 0.0D0 + IF (N .LE. 0) RETURN + IF (INCX.EQ.INCY .AND. INCX.GT.0) GO TO 20 +C +C Code for unequal or nonpositive increments. +C + KX = 1 + KY = 1 + IF (INCX .LT. 0) KX = 1+(1-N)*INCX + IF (INCY .LT. 0) KY = 1+(1-N)*INCY + DO 10 I = 1,N + DSDOTF = DSDOTF + DBLE(SX(KX))*DBLE(SY(KY)) + KX = KX + INCX + KY = KY + INCY + 10 CONTINUE + RETURN +C +C Code for equal, positive, non-unit increments. +C + 20 NS = N*INCX + DO 30 I = 1,NS,INCX + DSDOTF = DSDOTF + DBLE(SX(I))*DBLE(SY(I)) + 30 CONTINUE + RETURN + END diff --git a/reference/dspmvf.f b/reference/dspmvf.f new file mode 100644 index 0000000000..a83a6097f8 --- /dev/null +++ b/reference/dspmvf.f @@ -0,0 +1,262 @@ + SUBROUTINE DSPMVF( UPLO, N, ALPHA, AP, X, INCX, BETA, Y, INCY ) +* .. Scalar Arguments .. + DOUBLE PRECISION ALPHA, BETA + INTEGER INCX, INCY, N + CHARACTER*1 UPLO +* .. Array Arguments .. + DOUBLE PRECISION AP( * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* DSPMV performs the matrix-vector operation +* +* y := alpha*A*x + beta*y, +* +* where alpha and beta are scalars, x and y are n element vectors and +* A is an n by n symmetric matrix, supplied in packed form. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. 
+* On entry, UPLO specifies whether the upper or lower +* triangular part of the matrix A is supplied in the packed +* array AP as follows: +* +* UPLO = 'U' or 'u' The upper triangular part of A is +* supplied in AP. +* +* UPLO = 'L' or 'l' The lower triangular part of A is +* supplied in AP. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - DOUBLE PRECISION. +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* AP - DOUBLE PRECISION array of DIMENSION at least +* ( ( n*( n + 1 ) )/2 ). +* Before entry with UPLO = 'U' or 'u', the array AP must +* contain the upper triangular part of the symmetric matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) +* and a( 2, 2 ) respectively, and so on. +* Before entry with UPLO = 'L' or 'l', the array AP must +* contain the lower triangular part of the symmetric matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) +* and a( 3, 1 ) respectively, and so on. +* Unchanged on exit. +* +* X - DOUBLE PRECISION array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA - DOUBLE PRECISION. +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then Y need not be set on input. +* Unchanged on exit. +* +* Y - DOUBLE PRECISION array of dimension at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. On exit, Y is overwritten by the updated +* vector y. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + DOUBLE PRECISION ONE , ZERO + PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) +* .. Local Scalars .. + DOUBLE PRECISION TEMP1, TEMP2 + INTEGER I, INFO, IX, IY, J, JX, JY, K, KK, KX, KY +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 6 + ELSE IF( INCY.EQ.0 )THEN + INFO = 9 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'DSPMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* Set up the start points in X and Y. +* + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( N - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( N - 1 )*INCY + END IF +* +* Start the operations. In this version the elements of the array AP +* are accessed sequentially with one pass through AP. +* +* First form y := beta*y. 
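Concretely, for N = 3 the upper packed array is AP = ( a11, a12, a22, a13, a23, a33 ), i.e. a(i,j) with i <= j lives in AP( i + (j-1)*j/2 ). A minimal illustrative driver (not part of the imported file; names and data are arbitrary):

      PROGRAM TSPMV
*     y := A*x for A = [ 2 1 0; 1 2 1; 0 1 2 ] stored in upper packed
*     form; with x = (1,1,1) the expected result is y = (3, 4, 3).
      INTEGER          I
      DOUBLE PRECISION AP( 6 ), X( 3 ), Y( 3 )
      DATA AP / 2.0D+0, 1.0D+0, 2.0D+0, 0.0D+0, 1.0D+0, 2.0D+0 /
      DO 10 I = 1, 3
         X( I ) = 1.0D+0
         Y( I ) = 0.0D+0
   10 CONTINUE
      CALL DSPMVF( 'U', 3, 1.0D+0, AP, X, 1, 0.0D+0, Y, 1 )
      WRITE( *, * ) Y
      END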
+* + IF( BETA.NE.ONE )THEN + IF( INCY.EQ.1 )THEN + IF( BETA.EQ.ZERO )THEN + DO 10, I = 1, N + Y( I ) = ZERO + 10 CONTINUE + ELSE + DO 20, I = 1, N + Y( I ) = BETA*Y( I ) + 20 CONTINUE + END IF + ELSE + IY = KY + IF( BETA.EQ.ZERO )THEN + DO 30, I = 1, N + Y( IY ) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40, I = 1, N + Y( IY ) = BETA*Y( IY ) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF( ALPHA.EQ.ZERO ) + $ RETURN + KK = 1 + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form y when AP contains the upper triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 60, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + K = KK + DO 50, I = 1, J - 1 + Y( I ) = Y( I ) + TEMP1*AP( K ) + TEMP2 = TEMP2 + AP( K )*X( I ) + K = K + 1 + 50 CONTINUE + Y( J ) = Y( J ) + TEMP1*AP( KK + J - 1 ) + ALPHA*TEMP2 + KK = KK + J + 60 CONTINUE + ELSE + JX = KX + JY = KY + DO 80, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + IX = KX + IY = KY + DO 70, K = KK, KK + J - 2 + Y( IY ) = Y( IY ) + TEMP1*AP( K ) + TEMP2 = TEMP2 + AP( K )*X( IX ) + IX = IX + INCX + IY = IY + INCY + 70 CONTINUE + Y( JY ) = Y( JY ) + TEMP1*AP( KK + J - 1 ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + KK = KK + J + 80 CONTINUE + END IF + ELSE +* +* Form y when AP contains the lower triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 100, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + Y( J ) = Y( J ) + TEMP1*AP( KK ) + K = KK + 1 + DO 90, I = J + 1, N + Y( I ) = Y( I ) + TEMP1*AP( K ) + TEMP2 = TEMP2 + AP( K )*X( I ) + K = K + 1 + 90 CONTINUE + Y( J ) = Y( J ) + ALPHA*TEMP2 + KK = KK + ( N - J + 1 ) + 100 CONTINUE + ELSE + JX = KX + JY = KY + DO 120, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + Y( JY ) = Y( JY ) + TEMP1*AP( KK ) + IX = JX + IY = JY + DO 110, K = KK + 1, KK + N - J + IX = IX + INCX + IY = IY + INCY + Y( IY ) = Y( IY ) + TEMP1*AP( K ) + TEMP2 = TEMP2 + AP( K )*X( IX ) + 110 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + KK = KK + ( N - J + 1 ) + 120 CONTINUE + END IF + END IF +* + RETURN +* +* End of DSPMV . +* + END diff --git a/reference/dspr2f.f b/reference/dspr2f.f new file mode 100644 index 0000000000..9eabacf5f7 --- /dev/null +++ b/reference/dspr2f.f @@ -0,0 +1,229 @@ + SUBROUTINE DSPR2F( UPLO, N, ALPHA, X, INCX, Y, INCY, AP ) +* .. Scalar Arguments .. + DOUBLE PRECISION ALPHA + INTEGER INCX, INCY, N + CHARACTER*1 UPLO +* .. Array Arguments .. + DOUBLE PRECISION AP( * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* DSPR2 performs the symmetric rank 2 operation +* +* A := alpha*x*y' + alpha*y*x' + A, +* +* where alpha is a scalar, x and y are n element vectors and A is an +* n by n symmetric matrix, supplied in packed form. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the matrix A is supplied in the packed +* array AP as follows: +* +* UPLO = 'U' or 'u' The upper triangular part of A is +* supplied in AP. +* +* UPLO = 'L' or 'l' The lower triangular part of A is +* supplied in AP. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - DOUBLE PRECISION. +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X - DOUBLE PRECISION array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. 
+* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* Y - DOUBLE PRECISION array of dimension at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. +* Unchanged on exit. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* AP - DOUBLE PRECISION array of DIMENSION at least +* ( ( n*( n + 1 ) )/2 ). +* Before entry with UPLO = 'U' or 'u', the array AP must +* contain the upper triangular part of the symmetric matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) +* and a( 2, 2 ) respectively, and so on. On exit, the array +* AP is overwritten by the upper triangular part of the +* updated matrix. +* Before entry with UPLO = 'L' or 'l', the array AP must +* contain the lower triangular part of the symmetric matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) +* and a( 3, 1 ) respectively, and so on. On exit, the array +* AP is overwritten by the lower triangular part of the +* updated matrix. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + DOUBLE PRECISION ZERO + PARAMETER ( ZERO = 0.0D+0 ) +* .. Local Scalars .. + DOUBLE PRECISION TEMP1, TEMP2 + INTEGER I, INFO, IX, IY, J, JX, JY, K, KK, KX, KY +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 5 + ELSE IF( INCY.EQ.0 )THEN + INFO = 7 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'DSPR2 ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) + $ RETURN +* +* Set up the start points in X and Y if the increments are not both +* unity. +* + IF( ( INCX.NE.1 ).OR.( INCY.NE.1 ) )THEN + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( N - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( N - 1 )*INCY + END IF + JX = KX + JY = KY + END IF +* +* Start the operations. In this version the elements of the array AP +* are accessed sequentially with one pass through AP. +* + KK = 1 + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form A when upper triangle is stored in AP. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 20, J = 1, N + IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( J ) + TEMP2 = ALPHA*X( J ) + K = KK + DO 10, I = 1, J + AP( K ) = AP( K ) + X( I )*TEMP1 + Y( I )*TEMP2 + K = K + 1 + 10 CONTINUE + END IF + KK = KK + J + 20 CONTINUE + ELSE + DO 40, J = 1, N + IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( JY ) + TEMP2 = ALPHA*X( JX ) + IX = KX + IY = KY + DO 30, K = KK, KK + J - 1 + AP( K ) = AP( K ) + X( IX )*TEMP1 + Y( IY )*TEMP2 + IX = IX + INCX + IY = IY + INCY + 30 CONTINUE + END IF + JX = JX + INCX + JY = JY + INCY + KK = KK + J + 40 CONTINUE + END IF + ELSE +* +* Form A when lower triangle is stored in AP. 
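As a minimal illustration of the packed rank-2 update (not part of the imported file; names and data are arbitrary): starting from A = 0, the array AP ends up holding alpha*(x*y' + y*x') packed column by column.

      PROGRAM TSPR2
*     A := 1*x*y' + 1*y*x' + A with A = 0, x = (1,2), y = (3,4);
*     the lower packed result is AP = ( 6, 10, 16 ).
      INTEGER          K
      DOUBLE PRECISION AP( 3 ), X( 2 ), Y( 2 )
      DATA X / 1.0D+0, 2.0D+0 /
      DATA Y / 3.0D+0, 4.0D+0 /
      DO 10 K = 1, 3
         AP( K ) = 0.0D+0
   10 CONTINUE
      CALL DSPR2F( 'L', 2, 1.0D+0, X, 1, Y, 1, AP )
      WRITE( *, * ) AP
      END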
+* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 60, J = 1, N + IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( J ) + TEMP2 = ALPHA*X( J ) + K = KK + DO 50, I = J, N + AP( K ) = AP( K ) + X( I )*TEMP1 + Y( I )*TEMP2 + K = K + 1 + 50 CONTINUE + END IF + KK = KK + N - J + 1 + 60 CONTINUE + ELSE + DO 80, J = 1, N + IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( JY ) + TEMP2 = ALPHA*X( JX ) + IX = JX + IY = JY + DO 70, K = KK, KK + N - J + AP( K ) = AP( K ) + X( IX )*TEMP1 + Y( IY )*TEMP2 + IX = IX + INCX + IY = IY + INCY + 70 CONTINUE + END IF + JX = JX + INCX + JY = JY + INCY + KK = KK + N - J + 1 + 80 CONTINUE + END IF + END IF +* + RETURN +* +* End of DSPR2 . +* + END diff --git a/reference/dsprf.f b/reference/dsprf.f new file mode 100644 index 0000000000..69b74005fd --- /dev/null +++ b/reference/dsprf.f @@ -0,0 +1,198 @@ + SUBROUTINE DSPRF ( UPLO, N, ALPHA, X, INCX, AP ) +* .. Scalar Arguments .. + DOUBLE PRECISION ALPHA + INTEGER INCX, N + CHARACTER*1 UPLO +* .. Array Arguments .. + DOUBLE PRECISION AP( * ), X( * ) +* .. +* +* Purpose +* ======= +* +* DSPR performs the symmetric rank 1 operation +* +* A := alpha*x*x' + A, +* +* where alpha is a real scalar, x is an n element vector and A is an +* n by n symmetric matrix, supplied in packed form. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the matrix A is supplied in the packed +* array AP as follows: +* +* UPLO = 'U' or 'u' The upper triangular part of A is +* supplied in AP. +* +* UPLO = 'L' or 'l' The lower triangular part of A is +* supplied in AP. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - DOUBLE PRECISION. +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X - DOUBLE PRECISION array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* AP - DOUBLE PRECISION array of DIMENSION at least +* ( ( n*( n + 1 ) )/2 ). +* Before entry with UPLO = 'U' or 'u', the array AP must +* contain the upper triangular part of the symmetric matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) +* and a( 2, 2 ) respectively, and so on. On exit, the array +* AP is overwritten by the upper triangular part of the +* updated matrix. +* Before entry with UPLO = 'L' or 'l', the array AP must +* contain the lower triangular part of the symmetric matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) +* and a( 3, 1 ) respectively, and so on. On exit, the array +* AP is overwritten by the lower triangular part of the +* updated matrix. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + DOUBLE PRECISION ZERO + PARAMETER ( ZERO = 0.0D+0 ) +* .. Local Scalars .. + DOUBLE PRECISION TEMP + INTEGER I, INFO, IX, J, JX, K, KK, KX +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. 
External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 5 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'DSPR ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) + $ RETURN +* +* Set the start point in X if the increment is not unity. +* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of the array AP +* are accessed sequentially with one pass through AP. +* + KK = 1 + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form A when upper triangle is stored in AP. +* + IF( INCX.EQ.1 )THEN + DO 20, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = ALPHA*X( J ) + K = KK + DO 10, I = 1, J + AP( K ) = AP( K ) + X( I )*TEMP + K = K + 1 + 10 CONTINUE + END IF + KK = KK + J + 20 CONTINUE + ELSE + JX = KX + DO 40, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*X( JX ) + IX = KX + DO 30, K = KK, KK + J - 1 + AP( K ) = AP( K ) + X( IX )*TEMP + IX = IX + INCX + 30 CONTINUE + END IF + JX = JX + INCX + KK = KK + J + 40 CONTINUE + END IF + ELSE +* +* Form A when lower triangle is stored in AP. +* + IF( INCX.EQ.1 )THEN + DO 60, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = ALPHA*X( J ) + K = KK + DO 50, I = J, N + AP( K ) = AP( K ) + X( I )*TEMP + K = K + 1 + 50 CONTINUE + END IF + KK = KK + N - J + 1 + 60 CONTINUE + ELSE + JX = KX + DO 80, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*X( JX ) + IX = JX + DO 70, K = KK, KK + N - J + AP( K ) = AP( K ) + X( IX )*TEMP + IX = IX + INCX + 70 CONTINUE + END IF + JX = JX + INCX + KK = KK + N - J + 1 + 80 CONTINUE + END IF + END IF +* + RETURN +* +* End of DSPR . +* + END diff --git a/reference/dswapf.f b/reference/dswapf.f new file mode 100644 index 0000000000..597ee83fa2 --- /dev/null +++ b/reference/dswapf.f @@ -0,0 +1,56 @@ + subroutine dswapf (n,dx,incx,dy,incy) +c +c interchanges two vectors. +c uses unrolled loops for increments equal one. +c jack dongarra, linpack, 3/11/78. +c modified 12/3/93, array(1) declarations changed to array(*) +c + double precision dx(*),dy(*),dtemp + integer i,incx,incy,ix,iy,m,mp1,n +c + if(n.le.0)return + if(incx.eq.1.and.incy.eq.1)go to 20 +c +c code for unequal increments or equal increments not equal +c to 1 +c + ix = 1 + iy = 1 + if(incx.lt.0)ix = (-n+1)*incx + 1 + if(incy.lt.0)iy = (-n+1)*incy + 1 + do 10 i = 1,n + dtemp = dx(ix) + dx(ix) = dy(iy) + dy(iy) = dtemp + ix = ix + incx + iy = iy + incy + 10 continue + return +c +c code for both increments equal to 1 +c +c +c clean-up loop +c + 20 m = mod(n,3) + if( m .eq. 0 ) go to 40 + do 30 i = 1,m + dtemp = dx(i) + dx(i) = dy(i) + dy(i) = dtemp + 30 continue + if( n .lt. 3 ) return + 40 mp1 = m + 1 + do 50 i = mp1,n,3 + dtemp = dx(i) + dx(i) = dy(i) + dy(i) = dtemp + dtemp = dx(i + 1) + dx(i + 1) = dy(i + 1) + dy(i + 1) = dtemp + dtemp = dx(i + 2) + dx(i + 2) = dy(i + 2) + dy(i + 2) = dtemp + 50 continue + return + end diff --git a/reference/dsymmf.f b/reference/dsymmf.f new file mode 100644 index 0000000000..d0053f3700 --- /dev/null +++ b/reference/dsymmf.f @@ -0,0 +1,294 @@ + SUBROUTINE DSYMMF ( SIDE, UPLO, M, N, ALPHA, A, LDA, B, LDB, + $ BETA, C, LDC ) +* .. Scalar Arguments .. + CHARACTER*1 SIDE, UPLO + INTEGER M, N, LDA, LDB, LDC + DOUBLE PRECISION ALPHA, BETA +* .. Array Arguments .. 
+ DOUBLE PRECISION A( LDA, * ), B( LDB, * ), C( LDC, * ) +* .. +* +* Purpose +* ======= +* +* DSYMM performs one of the matrix-matrix operations +* +* C := alpha*A*B + beta*C, +* +* or +* +* C := alpha*B*A + beta*C, +* +* where alpha and beta are scalars, A is a symmetric matrix and B and +* C are m by n matrices. +* +* Parameters +* ========== +* +* SIDE - CHARACTER*1. +* On entry, SIDE specifies whether the symmetric matrix A +* appears on the left or right in the operation as follows: +* +* SIDE = 'L' or 'l' C := alpha*A*B + beta*C, +* +* SIDE = 'R' or 'r' C := alpha*B*A + beta*C, +* +* Unchanged on exit. +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the symmetric matrix A is to be +* referenced as follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of the +* symmetric matrix is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of the +* symmetric matrix is to be referenced. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix C. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix C. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - DOUBLE PRECISION. +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - DOUBLE PRECISION array of DIMENSION ( LDA, ka ), where ka is +* m when SIDE = 'L' or 'l' and is n otherwise. +* Before entry with SIDE = 'L' or 'l', the m by m part of +* the array A must contain the symmetric matrix, such that +* when UPLO = 'U' or 'u', the leading m by m upper triangular +* part of the array A must contain the upper triangular part +* of the symmetric matrix and the strictly lower triangular +* part of A is not referenced, and when UPLO = 'L' or 'l', +* the leading m by m lower triangular part of the array A +* must contain the lower triangular part of the symmetric +* matrix and the strictly upper triangular part of A is not +* referenced. +* Before entry with SIDE = 'R' or 'r', the n by n part of +* the array A must contain the symmetric matrix, such that +* when UPLO = 'U' or 'u', the leading n by n upper triangular +* part of the array A must contain the upper triangular part +* of the symmetric matrix and the strictly lower triangular +* part of A is not referenced, and when UPLO = 'L' or 'l', +* the leading n by n lower triangular part of the array A +* must contain the lower triangular part of the symmetric +* matrix and the strictly upper triangular part of A is not +* referenced. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When SIDE = 'L' or 'l' then +* LDA must be at least max( 1, m ), otherwise LDA must be at +* least max( 1, n ). +* Unchanged on exit. +* +* B - DOUBLE PRECISION array of DIMENSION ( LDB, n ). +* Before entry, the leading m by n part of the array B must +* contain the matrix B. +* Unchanged on exit. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. LDB must be at least +* max( 1, m ). +* Unchanged on exit. +* +* BETA - DOUBLE PRECISION. +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then C need not be set on input. +* Unchanged on exit. +* +* C - DOUBLE PRECISION array of DIMENSION ( LDC, n ). 
+* Before entry, the leading m by n part of the array C must +* contain the matrix C, except when beta is zero, in which +* case C need not be set on entry. +* On exit, the array C is overwritten by the m by n updated +* matrix. +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, INFO, J, K, NROWA + DOUBLE PRECISION TEMP1, TEMP2 +* .. Parameters .. + DOUBLE PRECISION ONE , ZERO + PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) +* .. +* .. Executable Statements .. +* +* Set NROWA as the number of rows of A. +* + IF( LSAME( SIDE, 'L' ) )THEN + NROWA = M + ELSE + NROWA = N + END IF + UPPER = LSAME( UPLO, 'U' ) +* +* Test the input parameters. +* + INFO = 0 + IF( ( .NOT.LSAME( SIDE, 'L' ) ).AND. + $ ( .NOT.LSAME( SIDE, 'R' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO, 'L' ) ) )THEN + INFO = 2 + ELSE IF( M .LT.0 )THEN + INFO = 3 + ELSE IF( N .LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 7 + ELSE IF( LDB.LT.MAX( 1, M ) )THEN + INFO = 9 + ELSE IF( LDC.LT.MAX( 1, M ) )THEN + INFO = 12 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'DSYMM ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. + $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* And when alpha.eq.zero. +* + IF( ALPHA.EQ.ZERO )THEN + IF( BETA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, M + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40, J = 1, N + DO 30, I = 1, M + C( I, J ) = BETA*C( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + RETURN + END IF +* +* Start the operations. +* + IF( LSAME( SIDE, 'L' ) )THEN +* +* Form C := alpha*A*B + beta*C. +* + IF( UPPER )THEN + DO 70, J = 1, N + DO 60, I = 1, M + TEMP1 = ALPHA*B( I, J ) + TEMP2 = ZERO + DO 50, K = 1, I - 1 + C( K, J ) = C( K, J ) + TEMP1 *A( K, I ) + TEMP2 = TEMP2 + B( K, J )*A( K, I ) + 50 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = TEMP1*A( I, I ) + ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ TEMP1*A( I, I ) + ALPHA*TEMP2 + END IF + 60 CONTINUE + 70 CONTINUE + ELSE + DO 100, J = 1, N + DO 90, I = M, 1, -1 + TEMP1 = ALPHA*B( I, J ) + TEMP2 = ZERO + DO 80, K = I + 1, M + C( K, J ) = C( K, J ) + TEMP1 *A( K, I ) + TEMP2 = TEMP2 + B( K, J )*A( K, I ) + 80 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = TEMP1*A( I, I ) + ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ TEMP1*A( I, I ) + ALPHA*TEMP2 + END IF + 90 CONTINUE + 100 CONTINUE + END IF + ELSE +* +* Form C := alpha*B*A + beta*C. 
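A minimal illustrative driver for the SIDE = 'L' case documented above (not part of the imported file; names and data are arbitrary). Only the upper triangle of the symmetric A is read, so its strictly lower part can hold anything:

      PROGRAM TSYMM
*     C := 1*A*B + 0*C with A = [ 1 2; 2 3 ] given by its upper
*     triangle (A(2,1) holds a dummy value) and B = I, so C = A.
      DOUBLE PRECISION A( 2, 2 ), B( 2, 2 ), C( 2, 2 )
      DATA A / 1.0D+0, -99.0D+0, 2.0D+0, 3.0D+0 /
      DATA B / 1.0D+0, 0.0D+0, 0.0D+0, 1.0D+0 /
      DATA C / 4*0.0D+0 /
      CALL DSYMMF( 'L', 'U', 2, 2, 1.0D+0, A, 2, B, 2, 0.0D+0, C, 2 )
      WRITE( *, * ) C
      END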
+* + DO 170, J = 1, N + TEMP1 = ALPHA*A( J, J ) + IF( BETA.EQ.ZERO )THEN + DO 110, I = 1, M + C( I, J ) = TEMP1*B( I, J ) + 110 CONTINUE + ELSE + DO 120, I = 1, M + C( I, J ) = BETA*C( I, J ) + TEMP1*B( I, J ) + 120 CONTINUE + END IF + DO 140, K = 1, J - 1 + IF( UPPER )THEN + TEMP1 = ALPHA*A( K, J ) + ELSE + TEMP1 = ALPHA*A( J, K ) + END IF + DO 130, I = 1, M + C( I, J ) = C( I, J ) + TEMP1*B( I, K ) + 130 CONTINUE + 140 CONTINUE + DO 160, K = J + 1, N + IF( UPPER )THEN + TEMP1 = ALPHA*A( J, K ) + ELSE + TEMP1 = ALPHA*A( K, J ) + END IF + DO 150, I = 1, M + C( I, J ) = C( I, J ) + TEMP1*B( I, K ) + 150 CONTINUE + 160 CONTINUE + 170 CONTINUE + END IF +* + RETURN +* +* End of DSYMM . +* + END diff --git a/reference/dsymvf.f b/reference/dsymvf.f new file mode 100644 index 0000000000..1b38747e0a --- /dev/null +++ b/reference/dsymvf.f @@ -0,0 +1,262 @@ + SUBROUTINE DSYMVF ( UPLO, N, ALPHA, A, LDA, X, INCX, + $ BETA, Y, INCY ) +* .. Scalar Arguments .. + DOUBLE PRECISION ALPHA, BETA + INTEGER INCX, INCY, LDA, N + CHARACTER*1 UPLO +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* DSYMV performs the matrix-vector operation +* +* y := alpha*A*x + beta*y, +* +* where alpha and beta are scalars, x and y are n element vectors and +* A is an n by n symmetric matrix. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array A is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of A +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of A +* is to be referenced. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - DOUBLE PRECISION. +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array A must contain the upper +* triangular part of the symmetric matrix and the strictly +* lower triangular part of A is not referenced. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array A must contain the lower +* triangular part of the symmetric matrix and the strictly +* upper triangular part of A is not referenced. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, n ). +* Unchanged on exit. +* +* X - DOUBLE PRECISION array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA - DOUBLE PRECISION. +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then Y need not be set on input. +* Unchanged on exit. +* +* Y - DOUBLE PRECISION array of dimension at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. On exit, Y is overwritten by the updated +* vector y. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. 
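A minimal illustrative driver for the interface documented above (not part of the imported file; names and data are arbitrary), using the lower triangle and a nonzero BETA:

      PROGRAM TSYMV
*     y := 2*A*x + y for A = [ 1 2; 2 4 ] given by its lower triangle
*     (A(1,2) holds a dummy value); expected result y = (7, 13).
      DOUBLE PRECISION A( 2, 2 ), X( 2 ), Y( 2 )
      DATA A / 1.0D+0, 2.0D+0, -99.0D+0, 4.0D+0 /
      DATA X / 1.0D+0, 1.0D+0 /
      DATA Y / 1.0D+0, 1.0D+0 /
      CALL DSYMVF( 'L', 2, 2.0D+0, A, 2, X, 1, 1.0D+0, Y, 1 )
      WRITE( *, * ) Y
      END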
+* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + DOUBLE PRECISION ONE , ZERO + PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) +* .. Local Scalars .. + DOUBLE PRECISION TEMP1, TEMP2 + INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( LDA.LT.MAX( 1, N ) )THEN + INFO = 5 + ELSE IF( INCX.EQ.0 )THEN + INFO = 7 + ELSE IF( INCY.EQ.0 )THEN + INFO = 10 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'DSYMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* Set up the start points in X and Y. +* + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( N - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( N - 1 )*INCY + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through the triangular part +* of A. +* +* First form y := beta*y. +* + IF( BETA.NE.ONE )THEN + IF( INCY.EQ.1 )THEN + IF( BETA.EQ.ZERO )THEN + DO 10, I = 1, N + Y( I ) = ZERO + 10 CONTINUE + ELSE + DO 20, I = 1, N + Y( I ) = BETA*Y( I ) + 20 CONTINUE + END IF + ELSE + IY = KY + IF( BETA.EQ.ZERO )THEN + DO 30, I = 1, N + Y( IY ) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40, I = 1, N + Y( IY ) = BETA*Y( IY ) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF( ALPHA.EQ.ZERO ) + $ RETURN + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form y when A is stored in upper triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 60, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + DO 50, I = 1, J - 1 + Y( I ) = Y( I ) + TEMP1*A( I, J ) + TEMP2 = TEMP2 + A( I, J )*X( I ) + 50 CONTINUE + Y( J ) = Y( J ) + TEMP1*A( J, J ) + ALPHA*TEMP2 + 60 CONTINUE + ELSE + JX = KX + JY = KY + DO 80, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + IX = KX + IY = KY + DO 70, I = 1, J - 1 + Y( IY ) = Y( IY ) + TEMP1*A( I, J ) + TEMP2 = TEMP2 + A( I, J )*X( IX ) + IX = IX + INCX + IY = IY + INCY + 70 CONTINUE + Y( JY ) = Y( JY ) + TEMP1*A( J, J ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + 80 CONTINUE + END IF + ELSE +* +* Form y when A is stored in lower triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 100, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + Y( J ) = Y( J ) + TEMP1*A( J, J ) + DO 90, I = J + 1, N + Y( I ) = Y( I ) + TEMP1*A( I, J ) + TEMP2 = TEMP2 + A( I, J )*X( I ) + 90 CONTINUE + Y( J ) = Y( J ) + ALPHA*TEMP2 + 100 CONTINUE + ELSE + JX = KX + JY = KY + DO 120, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + Y( JY ) = Y( JY ) + TEMP1*A( J, J ) + IX = JX + IY = JY + DO 110, I = J + 1, N + IX = IX + INCX + IY = IY + INCY + Y( IY ) = Y( IY ) + TEMP1*A( I, J ) + TEMP2 = TEMP2 + A( I, J )*X( IX ) + 110 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + 120 CONTINUE + END IF + END IF +* + RETURN +* +* End of DSYMV . 
+* + END diff --git a/reference/dsyr2f.f b/reference/dsyr2f.f new file mode 100644 index 0000000000..826bdb01ad --- /dev/null +++ b/reference/dsyr2f.f @@ -0,0 +1,230 @@ + SUBROUTINE DSYR2F ( UPLO, N, ALPHA, X, INCX, Y, INCY, A, LDA ) +* .. Scalar Arguments .. + DOUBLE PRECISION ALPHA + INTEGER INCX, INCY, LDA, N + CHARACTER*1 UPLO +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* DSYR2 performs the symmetric rank 2 operation +* +* A := alpha*x*y' + alpha*y*x' + A, +* +* where alpha is a scalar, x and y are n element vectors and A is an n +* by n symmetric matrix. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array A is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of A +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of A +* is to be referenced. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - DOUBLE PRECISION. +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X - DOUBLE PRECISION array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* Y - DOUBLE PRECISION array of dimension at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. +* Unchanged on exit. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array A must contain the upper +* triangular part of the symmetric matrix and the strictly +* lower triangular part of A is not referenced. On exit, the +* upper triangular part of the array A is overwritten by the +* upper triangular part of the updated matrix. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array A must contain the lower +* triangular part of the symmetric matrix and the strictly +* upper triangular part of A is not referenced. On exit, the +* lower triangular part of the array A is overwritten by the +* lower triangular part of the updated matrix. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, n ). +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + DOUBLE PRECISION ZERO + PARAMETER ( ZERO = 0.0D+0 ) +* .. Local Scalars .. + DOUBLE PRECISION TEMP1, TEMP2 + INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. 
+ $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 5 + ELSE IF( INCY.EQ.0 )THEN + INFO = 7 + ELSE IF( LDA.LT.MAX( 1, N ) )THEN + INFO = 9 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'DSYR2 ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) + $ RETURN +* +* Set up the start points in X and Y if the increments are not both +* unity. +* + IF( ( INCX.NE.1 ).OR.( INCY.NE.1 ) )THEN + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( N - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( N - 1 )*INCY + END IF + JX = KX + JY = KY + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through the triangular part +* of A. +* + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form A when A is stored in the upper triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 20, J = 1, N + IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( J ) + TEMP2 = ALPHA*X( J ) + DO 10, I = 1, J + A( I, J ) = A( I, J ) + X( I )*TEMP1 + Y( I )*TEMP2 + 10 CONTINUE + END IF + 20 CONTINUE + ELSE + DO 40, J = 1, N + IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( JY ) + TEMP2 = ALPHA*X( JX ) + IX = KX + IY = KY + DO 30, I = 1, J + A( I, J ) = A( I, J ) + X( IX )*TEMP1 + $ + Y( IY )*TEMP2 + IX = IX + INCX + IY = IY + INCY + 30 CONTINUE + END IF + JX = JX + INCX + JY = JY + INCY + 40 CONTINUE + END IF + ELSE +* +* Form A when A is stored in the lower triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 60, J = 1, N + IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( J ) + TEMP2 = ALPHA*X( J ) + DO 50, I = J, N + A( I, J ) = A( I, J ) + X( I )*TEMP1 + Y( I )*TEMP2 + 50 CONTINUE + END IF + 60 CONTINUE + ELSE + DO 80, J = 1, N + IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( JY ) + TEMP2 = ALPHA*X( JX ) + IX = JX + IY = JY + DO 70, I = J, N + A( I, J ) = A( I, J ) + X( IX )*TEMP1 + $ + Y( IY )*TEMP2 + IX = IX + INCX + IY = IY + INCY + 70 CONTINUE + END IF + JX = JX + INCX + JY = JY + INCY + 80 CONTINUE + END IF + END IF +* + RETURN +* +* End of DSYR2 . +* + END diff --git a/reference/dsyr2kf.f b/reference/dsyr2kf.f new file mode 100644 index 0000000000..81e73da8dc --- /dev/null +++ b/reference/dsyr2kf.f @@ -0,0 +1,327 @@ + SUBROUTINE DSYR2KF( UPLO, TRANS, N, K, ALPHA, A, LDA, B, LDB, + $ BETA, C, LDC ) +* .. Scalar Arguments .. + CHARACTER*1 UPLO, TRANS + INTEGER N, K, LDA, LDB, LDC + DOUBLE PRECISION ALPHA, BETA +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), B( LDB, * ), C( LDC, * ) +* .. +* +* Purpose +* ======= +* +* DSYR2K performs one of the symmetric rank 2k operations +* +* C := alpha*A*B' + alpha*B*A' + beta*C, +* +* or +* +* C := alpha*A'*B + alpha*B'*A + beta*C, +* +* where alpha and beta are scalars, C is an n by n symmetric matrix +* and A and B are n by k matrices in the first case and k by n +* matrices in the second case. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array C is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of C +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of C +* is to be referenced. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. 
+* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' C := alpha*A*B' + alpha*B*A' + +* beta*C. +* +* TRANS = 'T' or 't' C := alpha*A'*B + alpha*B'*A + +* beta*C. +* +* TRANS = 'C' or 'c' C := alpha*A'*B + alpha*B'*A + +* beta*C. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix C. N must be +* at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry with TRANS = 'N' or 'n', K specifies the number +* of columns of the matrices A and B, and on entry with +* TRANS = 'T' or 't' or 'C' or 'c', K specifies the number +* of rows of the matrices A and B. K must be at least zero. +* Unchanged on exit. +* +* ALPHA - DOUBLE PRECISION. +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - DOUBLE PRECISION array of DIMENSION ( LDA, ka ), where ka is +* k when TRANS = 'N' or 'n', and is n otherwise. +* Before entry with TRANS = 'N' or 'n', the leading n by k +* part of the array A must contain the matrix A, otherwise +* the leading k by n part of the array A must contain the +* matrix A. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When TRANS = 'N' or 'n' +* then LDA must be at least max( 1, n ), otherwise LDA must +* be at least max( 1, k ). +* Unchanged on exit. +* +* B - DOUBLE PRECISION array of DIMENSION ( LDB, kb ), where kb is +* k when TRANS = 'N' or 'n', and is n otherwise. +* Before entry with TRANS = 'N' or 'n', the leading n by k +* part of the array B must contain the matrix B, otherwise +* the leading k by n part of the array B must contain the +* matrix B. +* Unchanged on exit. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. When TRANS = 'N' or 'n' +* then LDB must be at least max( 1, n ), otherwise LDB must +* be at least max( 1, k ). +* Unchanged on exit. +* +* BETA - DOUBLE PRECISION. +* On entry, BETA specifies the scalar beta. +* Unchanged on exit. +* +* C - DOUBLE PRECISION array of DIMENSION ( LDC, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array C must contain the upper +* triangular part of the symmetric matrix and the strictly +* lower triangular part of C is not referenced. On exit, the +* upper triangular part of the array C is overwritten by the +* upper triangular part of the updated matrix. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array C must contain the lower +* triangular part of the symmetric matrix and the strictly +* upper triangular part of C is not referenced. On exit, the +* lower triangular part of the array C is overwritten by the +* lower triangular part of the updated matrix. +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, n ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, INFO, J, L, NROWA + DOUBLE PRECISION TEMP1, TEMP2 +* .. Parameters .. 
+ DOUBLE PRECISION ONE , ZERO + PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + IF( LSAME( TRANS, 'N' ) )THEN + NROWA = N + ELSE + NROWA = K + END IF + UPPER = LSAME( UPLO, 'U' ) +* + INFO = 0 + IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO , 'L' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.LSAME( TRANS, 'N' ) ).AND. + $ ( .NOT.LSAME( TRANS, 'T' ) ).AND. + $ ( .NOT.LSAME( TRANS, 'C' ) ) )THEN + INFO = 2 + ELSE IF( N .LT.0 )THEN + INFO = 3 + ELSE IF( K .LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 7 + ELSE IF( LDB.LT.MAX( 1, NROWA ) )THEN + INFO = 9 + ELSE IF( LDC.LT.MAX( 1, N ) )THEN + INFO = 12 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'DSYR2K', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR. + $ ( ( ( ALPHA.EQ.ZERO ).OR.( K.EQ.0 ) ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* And when alpha.eq.zero. +* + IF( ALPHA.EQ.ZERO )THEN + IF( UPPER )THEN + IF( BETA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, J + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40, J = 1, N + DO 30, I = 1, J + C( I, J ) = BETA*C( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + ELSE + IF( BETA.EQ.ZERO )THEN + DO 60, J = 1, N + DO 50, I = J, N + C( I, J ) = ZERO + 50 CONTINUE + 60 CONTINUE + ELSE + DO 80, J = 1, N + DO 70, I = J, N + C( I, J ) = BETA*C( I, J ) + 70 CONTINUE + 80 CONTINUE + END IF + END IF + RETURN + END IF +* +* Start the operations. +* + IF( LSAME( TRANS, 'N' ) )THEN +* +* Form C := alpha*A*B' + alpha*B*A' + C. +* + IF( UPPER )THEN + DO 130, J = 1, N + IF( BETA.EQ.ZERO )THEN + DO 90, I = 1, J + C( I, J ) = ZERO + 90 CONTINUE + ELSE IF( BETA.NE.ONE )THEN + DO 100, I = 1, J + C( I, J ) = BETA*C( I, J ) + 100 CONTINUE + END IF + DO 120, L = 1, K + IF( ( A( J, L ).NE.ZERO ).OR. + $ ( B( J, L ).NE.ZERO ) )THEN + TEMP1 = ALPHA*B( J, L ) + TEMP2 = ALPHA*A( J, L ) + DO 110, I = 1, J + C( I, J ) = C( I, J ) + + $ A( I, L )*TEMP1 + B( I, L )*TEMP2 + 110 CONTINUE + END IF + 120 CONTINUE + 130 CONTINUE + ELSE + DO 180, J = 1, N + IF( BETA.EQ.ZERO )THEN + DO 140, I = J, N + C( I, J ) = ZERO + 140 CONTINUE + ELSE IF( BETA.NE.ONE )THEN + DO 150, I = J, N + C( I, J ) = BETA*C( I, J ) + 150 CONTINUE + END IF + DO 170, L = 1, K + IF( ( A( J, L ).NE.ZERO ).OR. + $ ( B( J, L ).NE.ZERO ) )THEN + TEMP1 = ALPHA*B( J, L ) + TEMP2 = ALPHA*A( J, L ) + DO 160, I = J, N + C( I, J ) = C( I, J ) + + $ A( I, L )*TEMP1 + B( I, L )*TEMP2 + 160 CONTINUE + END IF + 170 CONTINUE + 180 CONTINUE + END IF + ELSE +* +* Form C := alpha*A'*B + alpha*B'*A + C. +* + IF( UPPER )THEN + DO 210, J = 1, N + DO 200, I = 1, J + TEMP1 = ZERO + TEMP2 = ZERO + DO 190, L = 1, K + TEMP1 = TEMP1 + A( L, I )*B( L, J ) + TEMP2 = TEMP2 + B( L, I )*A( L, J ) + 190 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = ALPHA*TEMP1 + ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ ALPHA*TEMP1 + ALPHA*TEMP2 + END IF + 200 CONTINUE + 210 CONTINUE + ELSE + DO 240, J = 1, N + DO 230, I = J, N + TEMP1 = ZERO + TEMP2 = ZERO + DO 220, L = 1, K + TEMP1 = TEMP1 + A( L, I )*B( L, J ) + TEMP2 = TEMP2 + B( L, I )*A( L, J ) + 220 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = ALPHA*TEMP1 + ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ ALPHA*TEMP1 + ALPHA*TEMP2 + END IF + 230 CONTINUE + 240 CONTINUE + END IF + END IF +* + RETURN +* +* End of DSYR2K. 
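[Editor's note, not part of the imported sources: DSYR2F and DSYR2KF above perform the symmetric rank-2 and rank-2k updates described in their headers. The following is a minimal driver sketch, not a definitive test; the program name DEMO1, the sizes, and the data are illustrative assumptions, and it presumes the reference files dsyr2f.f and dsyr2kf.f are compiled and linked with LSAME and XERBLA implementations.]

*     Illustrative driver: rank-2 update A := 2*x*y' + 2*y*x' + A,
*     then rank-2k update C := A2*B2' + B2*A2' + C (upper triangles).
      PROGRAM DEMO1
      INTEGER          N, K, LDA, LDC
      PARAMETER        ( N = 3, K = 2, LDA = 3, LDC = 3 )
      DOUBLE PRECISION A( LDA, N ), C( LDC, N )
      DOUBLE PRECISION A2( LDA, K ), B2( LDA, K )
      DOUBLE PRECISION X( N ), Y( N )
      INTEGER          I, J
*     Simple data: A = C = I, x = (1,2,3), y = (1,1,1),
*     A2 and B2 filled with small integers.
      DO 20 J = 1, N
         DO 10 I = 1, N
            A( I, J ) = 0.0D+0
            C( I, J ) = 0.0D+0
   10    CONTINUE
         A( J, J ) = 1.0D+0
         C( J, J ) = 1.0D+0
         X( J ) = DBLE( J )
         Y( J ) = 1.0D+0
   20 CONTINUE
      DO 40 J = 1, K
         DO 30 I = 1, N
            A2( I, J ) = DBLE( I + J )
            B2( I, J ) = DBLE( I - J )
   30    CONTINUE
   40 CONTINUE
*     A := 2*x*y' + 2*y*x' + A, only the upper triangle is referenced.
      CALL DSYR2F( 'U', N, 2.0D+0, X, 1, Y, 1, A, LDA )
*     C := A2*B2' + B2*A2' + C, only the upper triangle is referenced.
      CALL DSYR2KF( 'U', 'N', N, K, 1.0D+0, A2, LDA, B2, LDA,
     $              1.0D+0, C, LDC )
      WRITE( *, * ) 'A(1,1), C(1,1) after updates:', A( 1, 1 ),
     $              C( 1, 1 )
      END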
+* + END diff --git a/reference/dsyrf.f b/reference/dsyrf.f new file mode 100644 index 0000000000..b5bcd00a59 --- /dev/null +++ b/reference/dsyrf.f @@ -0,0 +1,197 @@ + SUBROUTINE DSYRF ( UPLO, N, ALPHA, X, INCX, A, LDA ) +* .. Scalar Arguments .. + DOUBLE PRECISION ALPHA + INTEGER INCX, LDA, N + CHARACTER*1 UPLO +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), X( * ) +* .. +* +* Purpose +* ======= +* +* DSYR performs the symmetric rank 1 operation +* +* A := alpha*x*x' + A, +* +* where alpha is a real scalar, x is an n element vector and A is an +* n by n symmetric matrix. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array A is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of A +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of A +* is to be referenced. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - DOUBLE PRECISION. +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X - DOUBLE PRECISION array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array A must contain the upper +* triangular part of the symmetric matrix and the strictly +* lower triangular part of A is not referenced. On exit, the +* upper triangular part of the array A is overwritten by the +* upper triangular part of the updated matrix. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array A must contain the lower +* triangular part of the symmetric matrix and the strictly +* upper triangular part of A is not referenced. On exit, the +* lower triangular part of the array A is overwritten by the +* lower triangular part of the updated matrix. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, n ). +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + DOUBLE PRECISION ZERO + PARAMETER ( ZERO = 0.0D+0 ) +* .. Local Scalars .. + DOUBLE PRECISION TEMP + INTEGER I, INFO, IX, J, JX, KX +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 5 + ELSE IF( LDA.LT.MAX( 1, N ) )THEN + INFO = 7 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'DSYR ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) + $ RETURN +* +* Set the start point in X if the increment is not unity. 
+* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through the triangular part +* of A. +* + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form A when A is stored in upper triangle. +* + IF( INCX.EQ.1 )THEN + DO 20, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = ALPHA*X( J ) + DO 10, I = 1, J + A( I, J ) = A( I, J ) + X( I )*TEMP + 10 CONTINUE + END IF + 20 CONTINUE + ELSE + JX = KX + DO 40, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*X( JX ) + IX = KX + DO 30, I = 1, J + A( I, J ) = A( I, J ) + X( IX )*TEMP + IX = IX + INCX + 30 CONTINUE + END IF + JX = JX + INCX + 40 CONTINUE + END IF + ELSE +* +* Form A when A is stored in lower triangle. +* + IF( INCX.EQ.1 )THEN + DO 60, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = ALPHA*X( J ) + DO 50, I = J, N + A( I, J ) = A( I, J ) + X( I )*TEMP + 50 CONTINUE + END IF + 60 CONTINUE + ELSE + JX = KX + DO 80, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*X( JX ) + IX = JX + DO 70, I = J, N + A( I, J ) = A( I, J ) + X( IX )*TEMP + IX = IX + INCX + 70 CONTINUE + END IF + JX = JX + INCX + 80 CONTINUE + END IF + END IF +* + RETURN +* +* End of DSYR . +* + END diff --git a/reference/dsyrkf.f b/reference/dsyrkf.f new file mode 100644 index 0000000000..6376b09285 --- /dev/null +++ b/reference/dsyrkf.f @@ -0,0 +1,294 @@ + SUBROUTINE DSYRKF ( UPLO, TRANS, N, K, ALPHA, A, LDA, + $ BETA, C, LDC ) +* .. Scalar Arguments .. + CHARACTER*1 UPLO, TRANS + INTEGER N, K, LDA, LDC + DOUBLE PRECISION ALPHA, BETA +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), C( LDC, * ) +* .. +* +* Purpose +* ======= +* +* DSYRK performs one of the symmetric rank k operations +* +* C := alpha*A*A' + beta*C, +* +* or +* +* C := alpha*A'*A + beta*C, +* +* where alpha and beta are scalars, C is an n by n symmetric matrix +* and A is an n by k matrix in the first case and a k by n matrix +* in the second case. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array C is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of C +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of C +* is to be referenced. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' C := alpha*A*A' + beta*C. +* +* TRANS = 'T' or 't' C := alpha*A'*A + beta*C. +* +* TRANS = 'C' or 'c' C := alpha*A'*A + beta*C. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix C. N must be +* at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry with TRANS = 'N' or 'n', K specifies the number +* of columns of the matrix A, and on entry with +* TRANS = 'T' or 't' or 'C' or 'c', K specifies the number +* of rows of the matrix A. K must be at least zero. +* Unchanged on exit. +* +* ALPHA - DOUBLE PRECISION. +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - DOUBLE PRECISION array of DIMENSION ( LDA, ka ), where ka is +* k when TRANS = 'N' or 'n', and is n otherwise. +* Before entry with TRANS = 'N' or 'n', the leading n by k +* part of the array A must contain the matrix A, otherwise +* the leading k by n part of the array A must contain the +* matrix A. +* Unchanged on exit. +* +* LDA - INTEGER. 
+* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When TRANS = 'N' or 'n' +* then LDA must be at least max( 1, n ), otherwise LDA must +* be at least max( 1, k ). +* Unchanged on exit. +* +* BETA - DOUBLE PRECISION. +* On entry, BETA specifies the scalar beta. +* Unchanged on exit. +* +* C - DOUBLE PRECISION array of DIMENSION ( LDC, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array C must contain the upper +* triangular part of the symmetric matrix and the strictly +* lower triangular part of C is not referenced. On exit, the +* upper triangular part of the array C is overwritten by the +* upper triangular part of the updated matrix. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array C must contain the lower +* triangular part of the symmetric matrix and the strictly +* upper triangular part of C is not referenced. On exit, the +* lower triangular part of the array C is overwritten by the +* lower triangular part of the updated matrix. +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, n ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, INFO, J, L, NROWA + DOUBLE PRECISION TEMP +* .. Parameters .. + DOUBLE PRECISION ONE , ZERO + PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + IF( LSAME( TRANS, 'N' ) )THEN + NROWA = N + ELSE + NROWA = K + END IF + UPPER = LSAME( UPLO, 'U' ) +* + INFO = 0 + IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO , 'L' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.LSAME( TRANS, 'N' ) ).AND. + $ ( .NOT.LSAME( TRANS, 'T' ) ).AND. + $ ( .NOT.LSAME( TRANS, 'C' ) ) )THEN + INFO = 2 + ELSE IF( N .LT.0 )THEN + INFO = 3 + ELSE IF( K .LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 7 + ELSE IF( LDC.LT.MAX( 1, N ) )THEN + INFO = 10 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'DSYRK ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR. + $ ( ( ( ALPHA.EQ.ZERO ).OR.( K.EQ.0 ) ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* And when alpha.eq.zero. +* + IF( ALPHA.EQ.ZERO )THEN + IF( UPPER )THEN + IF( BETA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, J + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40, J = 1, N + DO 30, I = 1, J + C( I, J ) = BETA*C( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + ELSE + IF( BETA.EQ.ZERO )THEN + DO 60, J = 1, N + DO 50, I = J, N + C( I, J ) = ZERO + 50 CONTINUE + 60 CONTINUE + ELSE + DO 80, J = 1, N + DO 70, I = J, N + C( I, J ) = BETA*C( I, J ) + 70 CONTINUE + 80 CONTINUE + END IF + END IF + RETURN + END IF +* +* Start the operations. +* + IF( LSAME( TRANS, 'N' ) )THEN +* +* Form C := alpha*A*A' + beta*C. 
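[Editor's note, not part of the imported sources: the rank-1 and rank-k counterparts DSYRF and DSYRKF documented above can be exercised the same way. A hedged sketch follows; DEMO2, the sizes, and the data are arbitrary, and the same assumption holds about compiling the reference files with LSAME and XERBLA.]

*     Illustrative driver: rank-1 update A := 0.5*x*x' + A, then
*     rank-k update C := F*F' + C, both on the lower triangle.
      PROGRAM DEMO2
      INTEGER          N, K, LDA, LDC
      PARAMETER        ( N = 3, K = 2, LDA = 3, LDC = 3 )
      DOUBLE PRECISION A( LDA, N ), C( LDC, N ), F( LDA, K ), X( N )
      INTEGER          I, J
*     A = C = I, x = (1,2,3), F a 3x2 factor of small integers.
      DO 20 J = 1, N
         DO 10 I = 1, N
            A( I, J ) = 0.0D+0
            C( I, J ) = 0.0D+0
   10    CONTINUE
         A( J, J ) = 1.0D+0
         C( J, J ) = 1.0D+0
         X( J ) = DBLE( J )
   20 CONTINUE
      DO 40 J = 1, K
         DO 30 I = 1, N
            F( I, J ) = DBLE( I*J )
   30    CONTINUE
   40 CONTINUE
      CALL DSYRF( 'L', N, 0.5D+0, X, 1, A, LDA )
      CALL DSYRKF( 'L', 'N', N, K, 1.0D+0, F, LDA, 1.0D+0, C, LDC )
      WRITE( *, * ) 'A(3,3), C(3,3) after updates:', A( 3, 3 ),
     $              C( 3, 3 )
      END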
+* + IF( UPPER )THEN + DO 130, J = 1, N + IF( BETA.EQ.ZERO )THEN + DO 90, I = 1, J + C( I, J ) = ZERO + 90 CONTINUE + ELSE IF( BETA.NE.ONE )THEN + DO 100, I = 1, J + C( I, J ) = BETA*C( I, J ) + 100 CONTINUE + END IF + DO 120, L = 1, K + IF( A( J, L ).NE.ZERO )THEN + TEMP = ALPHA*A( J, L ) + DO 110, I = 1, J + C( I, J ) = C( I, J ) + TEMP*A( I, L ) + 110 CONTINUE + END IF + 120 CONTINUE + 130 CONTINUE + ELSE + DO 180, J = 1, N + IF( BETA.EQ.ZERO )THEN + DO 140, I = J, N + C( I, J ) = ZERO + 140 CONTINUE + ELSE IF( BETA.NE.ONE )THEN + DO 150, I = J, N + C( I, J ) = BETA*C( I, J ) + 150 CONTINUE + END IF + DO 170, L = 1, K + IF( A( J, L ).NE.ZERO )THEN + TEMP = ALPHA*A( J, L ) + DO 160, I = J, N + C( I, J ) = C( I, J ) + TEMP*A( I, L ) + 160 CONTINUE + END IF + 170 CONTINUE + 180 CONTINUE + END IF + ELSE +* +* Form C := alpha*A'*A + beta*C. +* + IF( UPPER )THEN + DO 210, J = 1, N + DO 200, I = 1, J + TEMP = ZERO + DO 190, L = 1, K + TEMP = TEMP + A( L, I )*A( L, J ) + 190 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = ALPHA*TEMP + ELSE + C( I, J ) = ALPHA*TEMP + BETA*C( I, J ) + END IF + 200 CONTINUE + 210 CONTINUE + ELSE + DO 240, J = 1, N + DO 230, I = J, N + TEMP = ZERO + DO 220, L = 1, K + TEMP = TEMP + A( L, I )*A( L, J ) + 220 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = ALPHA*TEMP + ELSE + C( I, J ) = ALPHA*TEMP + BETA*C( I, J ) + END IF + 230 CONTINUE + 240 CONTINUE + END IF + END IF +* + RETURN +* +* End of DSYRK . +* + END diff --git a/reference/dtbmvf.f b/reference/dtbmvf.f new file mode 100644 index 0000000000..da340774e0 --- /dev/null +++ b/reference/dtbmvf.f @@ -0,0 +1,342 @@ + SUBROUTINE DTBMVF( UPLO, TRANS, DIAG, N, K, A, LDA, X, INCX ) +* .. Scalar Arguments .. + INTEGER INCX, K, LDA, N + CHARACTER*1 DIAG, TRANS, UPLO +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), X( * ) +* .. +* +* Purpose +* ======= +* +* DTBMV performs one of the matrix-vector operations +* +* x := A*x, or x := A'*x, +* +* where x is an n element vector and A is an n by n unit, or non-unit, +* upper or lower triangular band matrix, with ( k + 1 ) diagonals. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' x := A*x. +* +* TRANS = 'T' or 't' x := A'*x. +* +* TRANS = 'C' or 'c' x := A'*x. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit +* triangular as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry with UPLO = 'U' or 'u', K specifies the number of +* super-diagonals of the matrix A. +* On entry with UPLO = 'L' or 'l', K specifies the number of +* sub-diagonals of the matrix A. +* K must satisfy 0 .le. K. +* Unchanged on exit. +* +* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). 
+* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) +* by n part of the array A must contain the upper triangular +* band part of the matrix of coefficients, supplied column by +* column, with the leading diagonal of the matrix in row +* ( k + 1 ) of the array, the first super-diagonal starting at +* position 2 in row k, and so on. The top left k by k triangle +* of the array A is not referenced. +* The following program segment will transfer an upper +* triangular band matrix from conventional full matrix storage +* to band storage: +* +* DO 20, J = 1, N +* M = K + 1 - J +* DO 10, I = MAX( 1, J - K ), J +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) +* by n part of the array A must contain the lower triangular +* band part of the matrix of coefficients, supplied column by +* column, with the leading diagonal of the matrix in row 1 of +* the array, the first sub-diagonal starting at position 1 in +* row 2, and so on. The bottom right k by k triangle of the +* array A is not referenced. +* The following program segment will transfer a lower +* triangular band matrix from conventional full matrix storage +* to band storage: +* +* DO 20, J = 1, N +* M = 1 - J +* DO 10, I = J, MIN( N, J + K ) +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Note that when DIAG = 'U' or 'u' the elements of the array A +* corresponding to the diagonal elements of the matrix are not +* referenced, but are assumed to be unity. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* ( k + 1 ). +* Unchanged on exit. +* +* X - DOUBLE PRECISION array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. On exit, X is overwritten with the +* tranformed vector x. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + DOUBLE PRECISION ZERO + PARAMETER ( ZERO = 0.0D+0 ) +* .. Local Scalars .. + DOUBLE PRECISION TEMP + INTEGER I, INFO, IX, J, JX, KPLUS1, KX, L + LOGICAL NOUNIT +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO , 'U' ).AND. + $ .NOT.LSAME( UPLO , 'L' ) )THEN + INFO = 1 + ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. + $ .NOT.LSAME( TRANS, 'C' ) )THEN + INFO = 2 + ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. + $ .NOT.LSAME( DIAG , 'N' ) )THEN + INFO = 3 + ELSE IF( N.LT.0 )THEN + INFO = 4 + ELSE IF( K.LT.0 )THEN + INFO = 5 + ELSE IF( LDA.LT.( K + 1 ) )THEN + INFO = 7 + ELSE IF( INCX.EQ.0 )THEN + INFO = 9 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'DTBMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* + NOUNIT = LSAME( DIAG, 'N' ) +* +* Set up the start point in X if the increment is not unity. This +* will be ( N - 1 )*INCX too small for descending loops. 
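[Editor's note, not part of the imported sources: the band-storage layout documented above, with the diagonal in row k+1 and the super-diagonals above it, is easy to get wrong. The sketch below packs a small upper bidiagonal matrix using the transfer loop given in the header comments and then applies DTBMVF to it. DEMO3 and the data are illustrative assumptions, with the usual LSAME/XERBLA link-line caveat.]

*     Illustrative driver: pack a 4x4 upper bidiagonal matrix into
*     band storage with k = 1 super-diagonal, then form x := A*x.
      PROGRAM DEMO3
      INTEGER          N, K, LDA
      PARAMETER        ( N = 4, K = 1, LDA = K + 1 )
      DOUBLE PRECISION FULL( N, N ), AB( LDA, N ), X( N )
      INTEGER          I, J, M
*     Full matrix: 2 on the diagonal, 1 on the first super-diagonal.
      DO 20 J = 1, N
         DO 10 I = 1, N
            FULL( I, J ) = 0.0D+0
   10    CONTINUE
         FULL( J, J ) = 2.0D+0
         IF( J.GT.1 ) FULL( J - 1, J ) = 1.0D+0
         X( J ) = 1.0D+0
   20 CONTINUE
*     Pack the upper triangular band into AB, column by column:
*     entry ( I, J ) of the band goes to AB( K + 1 - J + I, J ).
      DO 40 J = 1, N
         M = K + 1 - J
         DO 30 I = MAX( 1, J - K ), J
            AB( M + I, J ) = FULL( I, J )
   30    CONTINUE
   40 CONTINUE
*     x := A*x with the banded, non-unit upper triangular A.
      CALL DTBMVF( 'U', 'N', 'N', N, K, AB, LDA, X, 1 )
      WRITE( *, * ) 'A*x =', ( X( I ), I = 1, N )
      END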
+* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through A. +* + IF( LSAME( TRANS, 'N' ) )THEN +* +* Form x := A*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + KPLUS1 = K + 1 + IF( INCX.EQ.1 )THEN + DO 20, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = X( J ) + L = KPLUS1 - J + DO 10, I = MAX( 1, J - K ), J - 1 + X( I ) = X( I ) + TEMP*A( L + I, J ) + 10 CONTINUE + IF( NOUNIT ) + $ X( J ) = X( J )*A( KPLUS1, J ) + END IF + 20 CONTINUE + ELSE + JX = KX + DO 40, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = X( JX ) + IX = KX + L = KPLUS1 - J + DO 30, I = MAX( 1, J - K ), J - 1 + X( IX ) = X( IX ) + TEMP*A( L + I, J ) + IX = IX + INCX + 30 CONTINUE + IF( NOUNIT ) + $ X( JX ) = X( JX )*A( KPLUS1, J ) + END IF + JX = JX + INCX + IF( J.GT.K ) + $ KX = KX + INCX + 40 CONTINUE + END IF + ELSE + IF( INCX.EQ.1 )THEN + DO 60, J = N, 1, -1 + IF( X( J ).NE.ZERO )THEN + TEMP = X( J ) + L = 1 - J + DO 50, I = MIN( N, J + K ), J + 1, -1 + X( I ) = X( I ) + TEMP*A( L + I, J ) + 50 CONTINUE + IF( NOUNIT ) + $ X( J ) = X( J )*A( 1, J ) + END IF + 60 CONTINUE + ELSE + KX = KX + ( N - 1 )*INCX + JX = KX + DO 80, J = N, 1, -1 + IF( X( JX ).NE.ZERO )THEN + TEMP = X( JX ) + IX = KX + L = 1 - J + DO 70, I = MIN( N, J + K ), J + 1, -1 + X( IX ) = X( IX ) + TEMP*A( L + I, J ) + IX = IX - INCX + 70 CONTINUE + IF( NOUNIT ) + $ X( JX ) = X( JX )*A( 1, J ) + END IF + JX = JX - INCX + IF( ( N - J ).GE.K ) + $ KX = KX - INCX + 80 CONTINUE + END IF + END IF + ELSE +* +* Form x := A'*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + KPLUS1 = K + 1 + IF( INCX.EQ.1 )THEN + DO 100, J = N, 1, -1 + TEMP = X( J ) + L = KPLUS1 - J + IF( NOUNIT ) + $ TEMP = TEMP*A( KPLUS1, J ) + DO 90, I = J - 1, MAX( 1, J - K ), -1 + TEMP = TEMP + A( L + I, J )*X( I ) + 90 CONTINUE + X( J ) = TEMP + 100 CONTINUE + ELSE + KX = KX + ( N - 1 )*INCX + JX = KX + DO 120, J = N, 1, -1 + TEMP = X( JX ) + KX = KX - INCX + IX = KX + L = KPLUS1 - J + IF( NOUNIT ) + $ TEMP = TEMP*A( KPLUS1, J ) + DO 110, I = J - 1, MAX( 1, J - K ), -1 + TEMP = TEMP + A( L + I, J )*X( IX ) + IX = IX - INCX + 110 CONTINUE + X( JX ) = TEMP + JX = JX - INCX + 120 CONTINUE + END IF + ELSE + IF( INCX.EQ.1 )THEN + DO 140, J = 1, N + TEMP = X( J ) + L = 1 - J + IF( NOUNIT ) + $ TEMP = TEMP*A( 1, J ) + DO 130, I = J + 1, MIN( N, J + K ) + TEMP = TEMP + A( L + I, J )*X( I ) + 130 CONTINUE + X( J ) = TEMP + 140 CONTINUE + ELSE + JX = KX + DO 160, J = 1, N + TEMP = X( JX ) + KX = KX + INCX + IX = KX + L = 1 - J + IF( NOUNIT ) + $ TEMP = TEMP*A( 1, J ) + DO 150, I = J + 1, MIN( N, J + K ) + TEMP = TEMP + A( L + I, J )*X( IX ) + IX = IX + INCX + 150 CONTINUE + X( JX ) = TEMP + JX = JX + INCX + 160 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of DTBMV . +* + END diff --git a/reference/dtbsvf.f b/reference/dtbsvf.f new file mode 100644 index 0000000000..4dd16d5ef3 --- /dev/null +++ b/reference/dtbsvf.f @@ -0,0 +1,336 @@ + SUBROUTINE DTBSVF(UPLO,TRANS,DIAG,N,K,A,LDA,X,INCX) +* .. Scalar Arguments .. + INTEGER INCX,K,LDA,N + CHARACTER DIAG,TRANS,UPLO +* .. +* .. Array Arguments .. + DOUBLE PRECISION A(LDA,*),X(*) +* .. +* +* Purpose +* ======= +* +* DTBSV solves one of the systems of equations +* +* A*x = b, or A'*x = b, +* +* where b and x are n element vectors and A is an n by n unit, or +* non-unit, upper or lower triangular band matrix, with ( k + 1 ) +* diagonals. +* +* No test for singularity or near-singularity is included in this +* routine. 
Such tests must be performed before calling this routine. +* +* Arguments +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the equations to be solved as +* follows: +* +* TRANS = 'N' or 'n' A*x = b. +* +* TRANS = 'T' or 't' A'*x = b. +* +* TRANS = 'C' or 'c' A'*x = b. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit +* triangular as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry with UPLO = 'U' or 'u', K specifies the number of +* super-diagonals of the matrix A. +* On entry with UPLO = 'L' or 'l', K specifies the number of +* sub-diagonals of the matrix A. +* K must satisfy 0 .le. K. +* Unchanged on exit. +* +* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) +* by n part of the array A must contain the upper triangular +* band part of the matrix of coefficients, supplied column by +* column, with the leading diagonal of the matrix in row +* ( k + 1 ) of the array, the first super-diagonal starting at +* position 2 in row k, and so on. The top left k by k triangle +* of the array A is not referenced. +* The following program segment will transfer an upper +* triangular band matrix from conventional full matrix storage +* to band storage: +* +* DO 20, J = 1, N +* M = K + 1 - J +* DO 10, I = MAX( 1, J - K ), J +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) +* by n part of the array A must contain the lower triangular +* band part of the matrix of coefficients, supplied column by +* column, with the leading diagonal of the matrix in row 1 of +* the array, the first sub-diagonal starting at position 1 in +* row 2, and so on. The bottom right k by k triangle of the +* array A is not referenced. +* The following program segment will transfer a lower +* triangular band matrix from conventional full matrix storage +* to band storage: +* +* DO 20, J = 1, N +* M = 1 - J +* DO 10, I = J, MIN( N, J + K ) +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Note that when DIAG = 'U' or 'u' the elements of the array A +* corresponding to the diagonal elements of the matrix are not +* referenced, but are assumed to be unity. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* ( k + 1 ). +* Unchanged on exit. +* +* X - DOUBLE PRECISION array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element right-hand side vector b. On exit, X is overwritten +* with the solution vector x. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. 
+* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + DOUBLE PRECISION ZERO + PARAMETER (ZERO=0.0D+0) +* .. +* .. Local Scalars .. + DOUBLE PRECISION TEMP + INTEGER I,INFO,IX,J,JX,KPLUS1,KX,L + LOGICAL NOUNIT +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX,MIN +* .. +* +* Test the input parameters. +* + INFO = 0 + IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN + INFO = 1 + ELSE IF (.NOT.LSAME(TRANS,'N') .AND. .NOT.LSAME(TRANS,'T') .AND. + + .NOT.LSAME(TRANS,'C')) THEN + INFO = 2 + ELSE IF (.NOT.LSAME(DIAG,'U') .AND. .NOT.LSAME(DIAG,'N')) THEN + INFO = 3 + ELSE IF (N.LT.0) THEN + INFO = 4 + ELSE IF (K.LT.0) THEN + INFO = 5 + ELSE IF (LDA.LT. (K+1)) THEN + INFO = 7 + ELSE IF (INCX.EQ.0) THEN + INFO = 9 + END IF + IF (INFO.NE.0) THEN + CALL XERBLA('DTBSV ',INFO) + RETURN + END IF +* +* Quick return if possible. +* + IF (N.EQ.0) RETURN +* + NOUNIT = LSAME(DIAG,'N') +* +* Set up the start point in X if the increment is not unity. This +* will be ( N - 1 )*INCX too small for descending loops. +* + IF (INCX.LE.0) THEN + KX = 1 - (N-1)*INCX + ELSE IF (INCX.NE.1) THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of A are +* accessed by sequentially with one pass through A. +* + IF (LSAME(TRANS,'N')) THEN +* +* Form x := inv( A )*x. +* + IF (LSAME(UPLO,'U')) THEN + KPLUS1 = K + 1 + IF (INCX.EQ.1) THEN + DO 20 J = N,1,-1 + IF (X(J).NE.ZERO) THEN + L = KPLUS1 - J + IF (NOUNIT) X(J) = X(J)/A(KPLUS1,J) + TEMP = X(J) + DO 10 I = J - 1,MAX(1,J-K),-1 + X(I) = X(I) - TEMP*A(L+I,J) + 10 CONTINUE + END IF + 20 CONTINUE + ELSE + KX = KX + (N-1)*INCX + JX = KX + DO 40 J = N,1,-1 + KX = KX - INCX + IF (X(JX).NE.ZERO) THEN + IX = KX + L = KPLUS1 - J + IF (NOUNIT) X(JX) = X(JX)/A(KPLUS1,J) + TEMP = X(JX) + DO 30 I = J - 1,MAX(1,J-K),-1 + X(IX) = X(IX) - TEMP*A(L+I,J) + IX = IX - INCX + 30 CONTINUE + END IF + JX = JX - INCX + 40 CONTINUE + END IF + ELSE + IF (INCX.EQ.1) THEN + DO 60 J = 1,N + IF (X(J).NE.ZERO) THEN + L = 1 - J + IF (NOUNIT) X(J) = X(J)/A(1,J) + TEMP = X(J) + DO 50 I = J + 1,MIN(N,J+K) + X(I) = X(I) - TEMP*A(L+I,J) + 50 CONTINUE + END IF + 60 CONTINUE + ELSE + JX = KX + DO 80 J = 1,N + KX = KX + INCX + IF (X(JX).NE.ZERO) THEN + IX = KX + L = 1 - J + IF (NOUNIT) X(JX) = X(JX)/A(1,J) + TEMP = X(JX) + DO 70 I = J + 1,MIN(N,J+K) + X(IX) = X(IX) - TEMP*A(L+I,J) + IX = IX + INCX + 70 CONTINUE + END IF + JX = JX + INCX + 80 CONTINUE + END IF + END IF + ELSE +* +* Form x := inv( A')*x. 
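[Editor's note, not part of the imported sources: DTBSVF above solves a banded triangular system in place and, as its header states, performs no singularity test. A hedged sketch follows; DEMO4 and its data are illustrative, and the band matrix is written directly in band storage, reusing the layout shown for DTBMVF.]

*     Illustrative driver: solve A*x = b for an upper bidiagonal A
*     held in band storage with k = 1 super-diagonal.
      PROGRAM DEMO4
      INTEGER          N, K, LDA
      PARAMETER        ( N = 4, K = 1, LDA = K + 1 )
      DOUBLE PRECISION AB( LDA, N ), X( N )
      INTEGER          J
*     Band storage: diagonal 2 in row K+1, super-diagonal 1 in row K
*     (the first entry of row K lies in the unreferenced triangle).
      DO 10 J = 1, N
         AB( 2, J ) = 2.0D+0
         AB( 1, J ) = 1.0D+0
*        b = A*(1,1,1,1)', so the expected solution is all ones.
         X( J ) = 3.0D+0
   10 CONTINUE
      X( N ) = 2.0D+0
*     Solve A*x = b in place; no singularity test is performed.
      CALL DTBSVF( 'U', 'N', 'N', N, K, AB, LDA, X, 1 )
      WRITE( *, * ) 'solution =', ( X( J ), J = 1, N )
      END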
+* + IF (LSAME(UPLO,'U')) THEN + KPLUS1 = K + 1 + IF (INCX.EQ.1) THEN + DO 100 J = 1,N + TEMP = X(J) + L = KPLUS1 - J + DO 90 I = MAX(1,J-K),J - 1 + TEMP = TEMP - A(L+I,J)*X(I) + 90 CONTINUE + IF (NOUNIT) TEMP = TEMP/A(KPLUS1,J) + X(J) = TEMP + 100 CONTINUE + ELSE + JX = KX + DO 120 J = 1,N + TEMP = X(JX) + IX = KX + L = KPLUS1 - J + DO 110 I = MAX(1,J-K),J - 1 + TEMP = TEMP - A(L+I,J)*X(IX) + IX = IX + INCX + 110 CONTINUE + IF (NOUNIT) TEMP = TEMP/A(KPLUS1,J) + X(JX) = TEMP + JX = JX + INCX + IF (J.GT.K) KX = KX + INCX + 120 CONTINUE + END IF + ELSE + IF (INCX.EQ.1) THEN + DO 140 J = N,1,-1 + TEMP = X(J) + L = 1 - J + DO 130 I = MIN(N,J+K),J + 1,-1 + TEMP = TEMP - A(L+I,J)*X(I) + 130 CONTINUE + IF (NOUNIT) TEMP = TEMP/A(1,J) + X(J) = TEMP + 140 CONTINUE + ELSE + KX = KX + (N-1)*INCX + JX = KX + DO 160 J = N,1,-1 + TEMP = X(JX) + IX = KX + L = 1 - J + DO 150 I = MIN(N,J+K),J + 1,-1 + TEMP = TEMP - A(L+I,J)*X(IX) + IX = IX - INCX + 150 CONTINUE + IF (NOUNIT) TEMP = TEMP/A(1,J) + X(JX) = TEMP + JX = JX - INCX + IF ((N-J).GE.K) KX = KX - INCX + 160 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of DTBSV . +* + END diff --git a/reference/dtpmvf.f b/reference/dtpmvf.f new file mode 100644 index 0000000000..e8f6eb4124 --- /dev/null +++ b/reference/dtpmvf.f @@ -0,0 +1,299 @@ + SUBROUTINE DTPMVF( UPLO, TRANS, DIAG, N, AP, X, INCX ) +* .. Scalar Arguments .. + INTEGER INCX, N + CHARACTER*1 DIAG, TRANS, UPLO +* .. Array Arguments .. + DOUBLE PRECISION AP( * ), X( * ) +* .. +* +* Purpose +* ======= +* +* DTPMV performs one of the matrix-vector operations +* +* x := A*x, or x := A'*x, +* +* where x is an n element vector and A is an n by n unit, or non-unit, +* upper or lower triangular matrix, supplied in packed form. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' x := A*x. +* +* TRANS = 'T' or 't' x := A'*x. +* +* TRANS = 'C' or 'c' x := A'*x. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit +* triangular as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* AP - DOUBLE PRECISION array of DIMENSION at least +* ( ( n*( n + 1 ) )/2 ). +* Before entry with UPLO = 'U' or 'u', the array AP must +* contain the upper triangular matrix packed sequentially, +* column by column, so that AP( 1 ) contains a( 1, 1 ), +* AP( 2 ) and AP( 3 ) contain a( 1, 2 ) and a( 2, 2 ) +* respectively, and so on. +* Before entry with UPLO = 'L' or 'l', the array AP must +* contain the lower triangular matrix packed sequentially, +* column by column, so that AP( 1 ) contains a( 1, 1 ), +* AP( 2 ) and AP( 3 ) contain a( 2, 1 ) and a( 3, 1 ) +* respectively, and so on. +* Note that when DIAG = 'U' or 'u', the diagonal elements of +* A are not referenced, but are assumed to be unity. +* Unchanged on exit. +* +* X - DOUBLE PRECISION array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). 
+* Before entry, the incremented array X must contain the n +* element vector x. On exit, X is overwritten with the +* tranformed vector x. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + DOUBLE PRECISION ZERO + PARAMETER ( ZERO = 0.0D+0 ) +* .. Local Scalars .. + DOUBLE PRECISION TEMP + INTEGER I, INFO, IX, J, JX, K, KK, KX + LOGICAL NOUNIT +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO , 'U' ).AND. + $ .NOT.LSAME( UPLO , 'L' ) )THEN + INFO = 1 + ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. + $ .NOT.LSAME( TRANS, 'C' ) )THEN + INFO = 2 + ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. + $ .NOT.LSAME( DIAG , 'N' ) )THEN + INFO = 3 + ELSE IF( N.LT.0 )THEN + INFO = 4 + ELSE IF( INCX.EQ.0 )THEN + INFO = 7 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'DTPMVF', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* + NOUNIT = LSAME( DIAG, 'N' ) +* +* Set up the start point in X if the increment is not unity. This +* will be ( N - 1 )*INCX too small for descending loops. +* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of AP are +* accessed sequentially with one pass through AP. +* + IF( LSAME( TRANS, 'N' ) )THEN +* +* Form x:= A*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + KK =1 + IF( INCX.EQ.1 )THEN + DO 20, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = X( J ) + K = KK + DO 10, I = 1, J - 1 + X( I ) = X( I ) + TEMP*AP( K ) + K = K + 1 + 10 CONTINUE + IF( NOUNIT ) + $ X( J ) = X( J )*AP( KK + J - 1 ) + END IF + KK = KK + J + 20 CONTINUE + ELSE + JX = KX + DO 40, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = X( JX ) + IX = KX + DO 30, K = KK, KK + J - 2 + X( IX ) = X( IX ) + TEMP*AP( K ) + IX = IX + INCX + 30 CONTINUE + IF( NOUNIT ) + $ X( JX ) = X( JX )*AP( KK + J - 1 ) + END IF + JX = JX + INCX + KK = KK + J + 40 CONTINUE + END IF + ELSE + KK = ( N*( N + 1 ) )/2 + IF( INCX.EQ.1 )THEN + DO 60, J = N, 1, -1 + IF( X( J ).NE.ZERO )THEN + TEMP = X( J ) + K = KK + DO 50, I = N, J + 1, -1 + X( I ) = X( I ) + TEMP*AP( K ) + K = K - 1 + 50 CONTINUE + IF( NOUNIT ) + $ X( J ) = X( J )*AP( KK - N + J ) + END IF + KK = KK - ( N - J + 1 ) + 60 CONTINUE + ELSE + KX = KX + ( N - 1 )*INCX + JX = KX + DO 80, J = N, 1, -1 + IF( X( JX ).NE.ZERO )THEN + TEMP = X( JX ) + IX = KX + DO 70, K = KK, KK - ( N - ( J + 1 ) ), -1 + X( IX ) = X( IX ) + TEMP*AP( K ) + IX = IX - INCX + 70 CONTINUE + IF( NOUNIT ) + $ X( JX ) = X( JX )*AP( KK - N + J ) + END IF + JX = JX - INCX + KK = KK - ( N - J + 1 ) + 80 CONTINUE + END IF + END IF + ELSE +* +* Form x := A'*x. 
+* + IF( LSAME( UPLO, 'U' ) )THEN + KK = ( N*( N + 1 ) )/2 + IF( INCX.EQ.1 )THEN + DO 100, J = N, 1, -1 + TEMP = X( J ) + IF( NOUNIT ) + $ TEMP = TEMP*AP( KK ) + K = KK - 1 + DO 90, I = J - 1, 1, -1 + TEMP = TEMP + AP( K )*X( I ) + K = K - 1 + 90 CONTINUE + X( J ) = TEMP + KK = KK - J + 100 CONTINUE + ELSE + JX = KX + ( N - 1 )*INCX + DO 120, J = N, 1, -1 + TEMP = X( JX ) + IX = JX + IF( NOUNIT ) + $ TEMP = TEMP*AP( KK ) + DO 110, K = KK - 1, KK - J + 1, -1 + IX = IX - INCX + TEMP = TEMP + AP( K )*X( IX ) + 110 CONTINUE + X( JX ) = TEMP + JX = JX - INCX + KK = KK - J + 120 CONTINUE + END IF + ELSE + KK = 1 + IF( INCX.EQ.1 )THEN + DO 140, J = 1, N + TEMP = X( J ) + IF( NOUNIT ) + $ TEMP = TEMP*AP( KK ) + K = KK + 1 + DO 130, I = J + 1, N + TEMP = TEMP + AP( K )*X( I ) + K = K + 1 + 130 CONTINUE + X( J ) = TEMP + KK = KK + ( N - J + 1 ) + 140 CONTINUE + ELSE + JX = KX + DO 160, J = 1, N + TEMP = X( JX ) + IX = JX + IF( NOUNIT ) + $ TEMP = TEMP*AP( KK ) + DO 150, K = KK + 1, KK + N - J + IX = IX + INCX + TEMP = TEMP + AP( K )*X( IX ) + 150 CONTINUE + X( JX ) = TEMP + JX = JX + INCX + KK = KK + ( N - J + 1 ) + 160 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of DTPMV . +* + END diff --git a/reference/dtpsvf.f b/reference/dtpsvf.f new file mode 100644 index 0000000000..3639ba21f0 --- /dev/null +++ b/reference/dtpsvf.f @@ -0,0 +1,302 @@ + SUBROUTINE DTPSVF( UPLO, TRANS, DIAG, N, AP, X, INCX ) +* .. Scalar Arguments .. + INTEGER INCX, N + CHARACTER*1 DIAG, TRANS, UPLO +* .. Array Arguments .. + DOUBLE PRECISION AP( * ), X( * ) +* .. +* +* Purpose +* ======= +* +* DTPSV solves one of the systems of equations +* +* A*x = b, or A'*x = b, +* +* where b and x are n element vectors and A is an n by n unit, or +* non-unit, upper or lower triangular matrix, supplied in packed form. +* +* No test for singularity or near-singularity is included in this +* routine. Such tests must be performed before calling this routine. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the equations to be solved as +* follows: +* +* TRANS = 'N' or 'n' A*x = b. +* +* TRANS = 'T' or 't' A'*x = b. +* +* TRANS = 'C' or 'c' A'*x = b. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit +* triangular as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* AP - DOUBLE PRECISION array of DIMENSION at least +* ( ( n*( n + 1 ) )/2 ). +* Before entry with UPLO = 'U' or 'u', the array AP must +* contain the upper triangular matrix packed sequentially, +* column by column, so that AP( 1 ) contains a( 1, 1 ), +* AP( 2 ) and AP( 3 ) contain a( 1, 2 ) and a( 2, 2 ) +* respectively, and so on. +* Before entry with UPLO = 'L' or 'l', the array AP must +* contain the lower triangular matrix packed sequentially, +* column by column, so that AP( 1 ) contains a( 1, 1 ), +* AP( 2 ) and AP( 3 ) contain a( 2, 1 ) and a( 3, 1 ) +* respectively, and so on. 
+* Note that when DIAG = 'U' or 'u', the diagonal elements of +* A are not referenced, but are assumed to be unity. +* Unchanged on exit. +* +* X - DOUBLE PRECISION array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element right-hand side vector b. On exit, X is overwritten +* with the solution vector x. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + DOUBLE PRECISION ZERO + PARAMETER ( ZERO = 0.0D+0 ) +* .. Local Scalars .. + DOUBLE PRECISION TEMP + INTEGER I, INFO, IX, J, JX, K, KK, KX + LOGICAL NOUNIT +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO , 'U' ).AND. + $ .NOT.LSAME( UPLO , 'L' ) )THEN + INFO = 1 + ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. + $ .NOT.LSAME( TRANS, 'C' ) )THEN + INFO = 2 + ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. + $ .NOT.LSAME( DIAG , 'N' ) )THEN + INFO = 3 + ELSE IF( N.LT.0 )THEN + INFO = 4 + ELSE IF( INCX.EQ.0 )THEN + INFO = 7 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'DTPSV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* + NOUNIT = LSAME( DIAG, 'N' ) +* +* Set up the start point in X if the increment is not unity. This +* will be ( N - 1 )*INCX too small for descending loops. +* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of AP are +* accessed sequentially with one pass through AP. +* + IF( LSAME( TRANS, 'N' ) )THEN +* +* Form x := inv( A )*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + KK = ( N*( N + 1 ) )/2 + IF( INCX.EQ.1 )THEN + DO 20, J = N, 1, -1 + IF( X( J ).NE.ZERO )THEN + IF( NOUNIT ) + $ X( J ) = X( J )/AP( KK ) + TEMP = X( J ) + K = KK - 1 + DO 10, I = J - 1, 1, -1 + X( I ) = X( I ) - TEMP*AP( K ) + K = K - 1 + 10 CONTINUE + END IF + KK = KK - J + 20 CONTINUE + ELSE + JX = KX + ( N - 1 )*INCX + DO 40, J = N, 1, -1 + IF( X( JX ).NE.ZERO )THEN + IF( NOUNIT ) + $ X( JX ) = X( JX )/AP( KK ) + TEMP = X( JX ) + IX = JX + DO 30, K = KK - 1, KK - J + 1, -1 + IX = IX - INCX + X( IX ) = X( IX ) - TEMP*AP( K ) + 30 CONTINUE + END IF + JX = JX - INCX + KK = KK - J + 40 CONTINUE + END IF + ELSE + KK = 1 + IF( INCX.EQ.1 )THEN + DO 60, J = 1, N + IF( X( J ).NE.ZERO )THEN + IF( NOUNIT ) + $ X( J ) = X( J )/AP( KK ) + TEMP = X( J ) + K = KK + 1 + DO 50, I = J + 1, N + X( I ) = X( I ) - TEMP*AP( K ) + K = K + 1 + 50 CONTINUE + END IF + KK = KK + ( N - J + 1 ) + 60 CONTINUE + ELSE + JX = KX + DO 80, J = 1, N + IF( X( JX ).NE.ZERO )THEN + IF( NOUNIT ) + $ X( JX ) = X( JX )/AP( KK ) + TEMP = X( JX ) + IX = JX + DO 70, K = KK + 1, KK + N - J + IX = IX + INCX + X( IX ) = X( IX ) - TEMP*AP( K ) + 70 CONTINUE + END IF + JX = JX + INCX + KK = KK + ( N - J + 1 ) + 80 CONTINUE + END IF + END IF + ELSE +* +* Form x := inv( A' )*x. 
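[Editor's note, not part of the imported sources: DTPMVF and DTPSVF both use the packed column-major layout described above, in which AP(1) holds a(1,1), AP(2) and AP(3) hold a(1,2) and a(2,2), and so on. The sketch below does a multiply followed by a solve with the same packed factor, so the original vector should be recovered. DEMO5 and the data are illustrative assumptions.]

*     Illustrative driver: packed upper triangular multiply and solve.
      PROGRAM DEMO5
      INTEGER          N
      PARAMETER        ( N = 3 )
      DOUBLE PRECISION AP( ( N*( N + 1 ) )/2 ), X( N )
      INTEGER          J
*     Upper triangular A packed by columns:
*     AP = ( a11, a12, a22, a13, a23, a33 ).
      DATA AP / 2.0D+0, 1.0D+0, 2.0D+0, 0.0D+0, 1.0D+0, 2.0D+0 /
      DO 10 J = 1, N
         X( J ) = 1.0D+0
   10 CONTINUE
*     x := A*x with the packed, non-unit upper triangular A ...
      CALL DTPMVF( 'U', 'N', 'N', N, AP, X, 1 )
*     ... then recover the original x by solving A*y = x in place.
      CALL DTPSVF( 'U', 'N', 'N', N, AP, X, 1 )
      WRITE( *, * ) 'restored x =', ( X( J ), J = 1, N )
      END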
+* + IF( LSAME( UPLO, 'U' ) )THEN + KK = 1 + IF( INCX.EQ.1 )THEN + DO 100, J = 1, N + TEMP = X( J ) + K = KK + DO 90, I = 1, J - 1 + TEMP = TEMP - AP( K )*X( I ) + K = K + 1 + 90 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/AP( KK + J - 1 ) + X( J ) = TEMP + KK = KK + J + 100 CONTINUE + ELSE + JX = KX + DO 120, J = 1, N + TEMP = X( JX ) + IX = KX + DO 110, K = KK, KK + J - 2 + TEMP = TEMP - AP( K )*X( IX ) + IX = IX + INCX + 110 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/AP( KK + J - 1 ) + X( JX ) = TEMP + JX = JX + INCX + KK = KK + J + 120 CONTINUE + END IF + ELSE + KK = ( N*( N + 1 ) )/2 + IF( INCX.EQ.1 )THEN + DO 140, J = N, 1, -1 + TEMP = X( J ) + K = KK + DO 130, I = N, J + 1, -1 + TEMP = TEMP - AP( K )*X( I ) + K = K - 1 + 130 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/AP( KK - N + J ) + X( J ) = TEMP + KK = KK - ( N - J + 1 ) + 140 CONTINUE + ELSE + KX = KX + ( N - 1 )*INCX + JX = KX + DO 160, J = N, 1, -1 + TEMP = X( JX ) + IX = KX + DO 150, K = KK, KK - ( N - ( J + 1 ) ), -1 + TEMP = TEMP - AP( K )*X( IX ) + IX = IX - INCX + 150 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/AP( KK - N + J ) + X( JX ) = TEMP + JX = JX - INCX + KK = KK - (N - J + 1 ) + 160 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of DTPSV . +* + END diff --git a/reference/dtrmmf.f b/reference/dtrmmf.f new file mode 100644 index 0000000000..399d45b8dd --- /dev/null +++ b/reference/dtrmmf.f @@ -0,0 +1,355 @@ + SUBROUTINE DTRMMF ( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, A, LDA, + $ B, LDB ) +* .. Scalar Arguments .. + CHARACTER*1 SIDE, UPLO, TRANSA, DIAG + INTEGER M, N, LDA, LDB + DOUBLE PRECISION ALPHA +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), B( LDB, * ) +* .. +* +* Purpose +* ======= +* +* DTRMM performs one of the matrix-matrix operations +* +* B := alpha*op( A )*B, or B := alpha*B*op( A ), +* +* where alpha is a scalar, B is an m by n matrix, A is a unit, or +* non-unit, upper or lower triangular matrix and op( A ) is one of +* +* op( A ) = A or op( A ) = A'. +* +* Parameters +* ========== +* +* SIDE - CHARACTER*1. +* On entry, SIDE specifies whether op( A ) multiplies B from +* the left or right as follows: +* +* SIDE = 'L' or 'l' B := alpha*op( A )*B. +* +* SIDE = 'R' or 'r' B := alpha*B*op( A ). +* +* Unchanged on exit. +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix A is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANSA - CHARACTER*1. +* On entry, TRANSA specifies the form of op( A ) to be used in +* the matrix multiplication as follows: +* +* TRANSA = 'N' or 'n' op( A ) = A. +* +* TRANSA = 'T' or 't' op( A ) = A'. +* +* TRANSA = 'C' or 'c' op( A ) = A'. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit triangular +* as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of B. M must be at +* least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of B. N must be +* at least zero. +* Unchanged on exit. +* +* ALPHA - DOUBLE PRECISION. +* On entry, ALPHA specifies the scalar alpha. When alpha is +* zero then A is not referenced and B need not be set before +* entry. +* Unchanged on exit. 
+* +* A - DOUBLE PRECISION array of DIMENSION ( LDA, k ), where k is m +* when SIDE = 'L' or 'l' and is n when SIDE = 'R' or 'r'. +* Before entry with UPLO = 'U' or 'u', the leading k by k +* upper triangular part of the array A must contain the upper +* triangular matrix and the strictly lower triangular part of +* A is not referenced. +* Before entry with UPLO = 'L' or 'l', the leading k by k +* lower triangular part of the array A must contain the lower +* triangular matrix and the strictly upper triangular part of +* A is not referenced. +* Note that when DIAG = 'U' or 'u', the diagonal elements of +* A are not referenced either, but are assumed to be unity. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When SIDE = 'L' or 'l' then +* LDA must be at least max( 1, m ), when SIDE = 'R' or 'r' +* then LDA must be at least max( 1, n ). +* Unchanged on exit. +* +* B - DOUBLE PRECISION array of DIMENSION ( LDB, n ). +* Before entry, the leading m by n part of the array B must +* contain the matrix B, and on exit is overwritten by the +* transformed matrix. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. LDB must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Local Scalars .. + LOGICAL LSIDE, NOUNIT, UPPER + INTEGER I, INFO, J, K, NROWA + DOUBLE PRECISION TEMP +* .. Parameters .. + DOUBLE PRECISION ONE , ZERO + PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + LSIDE = LSAME( SIDE , 'L' ) + IF( LSIDE )THEN + NROWA = M + ELSE + NROWA = N + END IF + NOUNIT = LSAME( DIAG , 'N' ) + UPPER = LSAME( UPLO , 'U' ) +* + INFO = 0 + IF( ( .NOT.LSIDE ).AND. + $ ( .NOT.LSAME( SIDE , 'R' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO , 'L' ) ) )THEN + INFO = 2 + ELSE IF( ( .NOT.LSAME( TRANSA, 'N' ) ).AND. + $ ( .NOT.LSAME( TRANSA, 'T' ) ).AND. + $ ( .NOT.LSAME( TRANSA, 'C' ) ) )THEN + INFO = 3 + ELSE IF( ( .NOT.LSAME( DIAG , 'U' ) ).AND. + $ ( .NOT.LSAME( DIAG , 'N' ) ) )THEN + INFO = 4 + ELSE IF( M .LT.0 )THEN + INFO = 5 + ELSE IF( N .LT.0 )THEN + INFO = 6 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 9 + ELSE IF( LDB.LT.MAX( 1, M ) )THEN + INFO = 11 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'DTRMM ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* +* And when alpha.eq.zero. +* + IF( ALPHA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, M + B( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + RETURN + END IF +* +* Start the operations. +* + IF( LSIDE )THEN + IF( LSAME( TRANSA, 'N' ) )THEN +* +* Form B := alpha*A*B. 
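[Editor's note, not part of the imported sources: DTRMMF above multiplies a rectangular B by a triangular A from the left or right. A short hedged sketch of a left multiply follows; DEMO6, the sizes, and the data are illustrative, with the same compile/link assumptions as the earlier sketches.]

*     Illustrative driver: B := 2*A*B with A upper triangular.
      PROGRAM DEMO6
      INTEGER          M, N, LDA, LDB
      PARAMETER        ( M = 3, N = 2, LDA = 3, LDB = 3 )
      DOUBLE PRECISION A( LDA, M ), B( LDB, N )
      INTEGER          I, J
*     A: 3x3 upper triangular with 1 on and above the diagonal;
*     B: 3x2 matrix of ones.
      DO 20 J = 1, M
         DO 10 I = 1, M
            IF( I.LE.J )THEN
               A( I, J ) = 1.0D+0
            ELSE
               A( I, J ) = 0.0D+0
            END IF
   10    CONTINUE
   20 CONTINUE
      DO 40 J = 1, N
         DO 30 I = 1, M
            B( I, J ) = 1.0D+0
   30    CONTINUE
   40 CONTINUE
*     B := 2*A*B, multiplying from the left by the triangular A.
      CALL DTRMMF( 'L', 'U', 'N', 'N', M, N, 2.0D+0, A, LDA, B, LDB )
      WRITE( *, * ) 'first column of B =', ( B( I, 1 ), I = 1, M )
      END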
+* + IF( UPPER )THEN + DO 50, J = 1, N + DO 40, K = 1, M + IF( B( K, J ).NE.ZERO )THEN + TEMP = ALPHA*B( K, J ) + DO 30, I = 1, K - 1 + B( I, J ) = B( I, J ) + TEMP*A( I, K ) + 30 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP*A( K, K ) + B( K, J ) = TEMP + END IF + 40 CONTINUE + 50 CONTINUE + ELSE + DO 80, J = 1, N + DO 70 K = M, 1, -1 + IF( B( K, J ).NE.ZERO )THEN + TEMP = ALPHA*B( K, J ) + B( K, J ) = TEMP + IF( NOUNIT ) + $ B( K, J ) = B( K, J )*A( K, K ) + DO 60, I = K + 1, M + B( I, J ) = B( I, J ) + TEMP*A( I, K ) + 60 CONTINUE + END IF + 70 CONTINUE + 80 CONTINUE + END IF + ELSE +* +* Form B := alpha*A'*B. +* + IF( UPPER )THEN + DO 110, J = 1, N + DO 100, I = M, 1, -1 + TEMP = B( I, J ) + IF( NOUNIT ) + $ TEMP = TEMP*A( I, I ) + DO 90, K = 1, I - 1 + TEMP = TEMP + A( K, I )*B( K, J ) + 90 CONTINUE + B( I, J ) = ALPHA*TEMP + 100 CONTINUE + 110 CONTINUE + ELSE + DO 140, J = 1, N + DO 130, I = 1, M + TEMP = B( I, J ) + IF( NOUNIT ) + $ TEMP = TEMP*A( I, I ) + DO 120, K = I + 1, M + TEMP = TEMP + A( K, I )*B( K, J ) + 120 CONTINUE + B( I, J ) = ALPHA*TEMP + 130 CONTINUE + 140 CONTINUE + END IF + END IF + ELSE + IF( LSAME( TRANSA, 'N' ) )THEN +* +* Form B := alpha*B*A. +* + IF( UPPER )THEN + DO 180, J = N, 1, -1 + TEMP = ALPHA + IF( NOUNIT ) + $ TEMP = TEMP*A( J, J ) + DO 150, I = 1, M + B( I, J ) = TEMP*B( I, J ) + 150 CONTINUE + DO 170, K = 1, J - 1 + IF( A( K, J ).NE.ZERO )THEN + TEMP = ALPHA*A( K, J ) + DO 160, I = 1, M + B( I, J ) = B( I, J ) + TEMP*B( I, K ) + 160 CONTINUE + END IF + 170 CONTINUE + 180 CONTINUE + ELSE + DO 220, J = 1, N + TEMP = ALPHA + IF( NOUNIT ) + $ TEMP = TEMP*A( J, J ) + DO 190, I = 1, M + B( I, J ) = TEMP*B( I, J ) + 190 CONTINUE + DO 210, K = J + 1, N + IF( A( K, J ).NE.ZERO )THEN + TEMP = ALPHA*A( K, J ) + DO 200, I = 1, M + B( I, J ) = B( I, J ) + TEMP*B( I, K ) + 200 CONTINUE + END IF + 210 CONTINUE + 220 CONTINUE + END IF + ELSE +* +* Form B := alpha*B*A'. +* + IF( UPPER )THEN + DO 260, K = 1, N + DO 240, J = 1, K - 1 + IF( A( J, K ).NE.ZERO )THEN + TEMP = ALPHA*A( J, K ) + DO 230, I = 1, M + B( I, J ) = B( I, J ) + TEMP*B( I, K ) + 230 CONTINUE + END IF + 240 CONTINUE + TEMP = ALPHA + IF( NOUNIT ) + $ TEMP = TEMP*A( K, K ) + IF( TEMP.NE.ONE )THEN + DO 250, I = 1, M + B( I, K ) = TEMP*B( I, K ) + 250 CONTINUE + END IF + 260 CONTINUE + ELSE + DO 300, K = N, 1, -1 + DO 280, J = K + 1, N + IF( A( J, K ).NE.ZERO )THEN + TEMP = ALPHA*A( J, K ) + DO 270, I = 1, M + B( I, J ) = B( I, J ) + TEMP*B( I, K ) + 270 CONTINUE + END IF + 280 CONTINUE + TEMP = ALPHA + IF( NOUNIT ) + $ TEMP = TEMP*A( K, K ) + IF( TEMP.NE.ONE )THEN + DO 290, I = 1, M + B( I, K ) = TEMP*B( I, K ) + 290 CONTINUE + END IF + 300 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of DTRMM . +* + END diff --git a/reference/dtrmvf.f b/reference/dtrmvf.f new file mode 100644 index 0000000000..0619d3eca8 --- /dev/null +++ b/reference/dtrmvf.f @@ -0,0 +1,286 @@ + SUBROUTINE DTRMVF ( UPLO, TRANS, DIAG, N, A, LDA, X, INCX ) +* .. Scalar Arguments .. + INTEGER INCX, LDA, N + CHARACTER*1 DIAG, TRANS, UPLO +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), X( * ) +* .. +* +* Purpose +* ======= +* +* DTRMV performs one of the matrix-vector operations +* +* x := A*x, or x := A'*x, +* +* where x is an n element vector and A is an n by n unit, or non-unit, +* upper or lower triangular matrix. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. 
+* On entry, UPLO specifies whether the matrix is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' x := A*x. +* +* TRANS = 'T' or 't' x := A'*x. +* +* TRANS = 'C' or 'c' x := A'*x. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit +* triangular as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array A must contain the upper +* triangular matrix and the strictly lower triangular part of +* A is not referenced. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array A must contain the lower +* triangular matrix and the strictly upper triangular part of +* A is not referenced. +* Note that when DIAG = 'U' or 'u', the diagonal elements of +* A are not referenced either, but are assumed to be unity. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, n ). +* Unchanged on exit. +* +* X - DOUBLE PRECISION array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. On exit, X is overwritten with the +* tranformed vector x. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + DOUBLE PRECISION ZERO + PARAMETER ( ZERO = 0.0D+0 ) +* .. Local Scalars .. + DOUBLE PRECISION TEMP + INTEGER I, INFO, IX, J, JX, KX + LOGICAL NOUNIT +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO , 'U' ).AND. + $ .NOT.LSAME( UPLO , 'L' ) )THEN + INFO = 1 + ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. + $ .NOT.LSAME( TRANS, 'C' ) )THEN + INFO = 2 + ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. + $ .NOT.LSAME( DIAG , 'N' ) )THEN + INFO = 3 + ELSE IF( N.LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, N ) )THEN + INFO = 6 + ELSE IF( INCX.EQ.0 )THEN + INFO = 8 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'DTRMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* + NOUNIT = LSAME( DIAG, 'N' ) +* +* Set up the start point in X if the increment is not unity. This +* will be ( N - 1 )*INCX too small for descending loops. +* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. 
In this version the elements of A are +* accessed sequentially with one pass through A. +* + IF( LSAME( TRANS, 'N' ) )THEN +* +* Form x := A*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + IF( INCX.EQ.1 )THEN + DO 20, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = X( J ) + DO 10, I = 1, J - 1 + X( I ) = X( I ) + TEMP*A( I, J ) + 10 CONTINUE + IF( NOUNIT ) + $ X( J ) = X( J )*A( J, J ) + END IF + 20 CONTINUE + ELSE + JX = KX + DO 40, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = X( JX ) + IX = KX + DO 30, I = 1, J - 1 + X( IX ) = X( IX ) + TEMP*A( I, J ) + IX = IX + INCX + 30 CONTINUE + IF( NOUNIT ) + $ X( JX ) = X( JX )*A( J, J ) + END IF + JX = JX + INCX + 40 CONTINUE + END IF + ELSE + IF( INCX.EQ.1 )THEN + DO 60, J = N, 1, -1 + IF( X( J ).NE.ZERO )THEN + TEMP = X( J ) + DO 50, I = N, J + 1, -1 + X( I ) = X( I ) + TEMP*A( I, J ) + 50 CONTINUE + IF( NOUNIT ) + $ X( J ) = X( J )*A( J, J ) + END IF + 60 CONTINUE + ELSE + KX = KX + ( N - 1 )*INCX + JX = KX + DO 80, J = N, 1, -1 + IF( X( JX ).NE.ZERO )THEN + TEMP = X( JX ) + IX = KX + DO 70, I = N, J + 1, -1 + X( IX ) = X( IX ) + TEMP*A( I, J ) + IX = IX - INCX + 70 CONTINUE + IF( NOUNIT ) + $ X( JX ) = X( JX )*A( J, J ) + END IF + JX = JX - INCX + 80 CONTINUE + END IF + END IF + ELSE +* +* Form x := A'*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + IF( INCX.EQ.1 )THEN + DO 100, J = N, 1, -1 + TEMP = X( J ) + IF( NOUNIT ) + $ TEMP = TEMP*A( J, J ) + DO 90, I = J - 1, 1, -1 + TEMP = TEMP + A( I, J )*X( I ) + 90 CONTINUE + X( J ) = TEMP + 100 CONTINUE + ELSE + JX = KX + ( N - 1 )*INCX + DO 120, J = N, 1, -1 + TEMP = X( JX ) + IX = JX + IF( NOUNIT ) + $ TEMP = TEMP*A( J, J ) + DO 110, I = J - 1, 1, -1 + IX = IX - INCX + TEMP = TEMP + A( I, J )*X( IX ) + 110 CONTINUE + X( JX ) = TEMP + JX = JX - INCX + 120 CONTINUE + END IF + ELSE + IF( INCX.EQ.1 )THEN + DO 140, J = 1, N + TEMP = X( J ) + IF( NOUNIT ) + $ TEMP = TEMP*A( J, J ) + DO 130, I = J + 1, N + TEMP = TEMP + A( I, J )*X( I ) + 130 CONTINUE + X( J ) = TEMP + 140 CONTINUE + ELSE + JX = KX + DO 160, J = 1, N + TEMP = X( JX ) + IX = JX + IF( NOUNIT ) + $ TEMP = TEMP*A( J, J ) + DO 150, I = J + 1, N + IX = IX + INCX + TEMP = TEMP + A( I, J )*X( IX ) + 150 CONTINUE + X( JX ) = TEMP + JX = JX + INCX + 160 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of DTRMV . +* + END diff --git a/reference/dtrsmf.f b/reference/dtrsmf.f new file mode 100644 index 0000000000..be3b4075aa --- /dev/null +++ b/reference/dtrsmf.f @@ -0,0 +1,378 @@ + SUBROUTINE DTRSMF ( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, A, LDA, + $ B, LDB ) +* .. Scalar Arguments .. + CHARACTER*1 SIDE, UPLO, TRANSA, DIAG + INTEGER M, N, LDA, LDB + DOUBLE PRECISION ALPHA +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), B( LDB, * ) +* .. +* +* Purpose +* ======= +* +* DTRSM solves one of the matrix equations +* +* op( A )*X = alpha*B, or X*op( A ) = alpha*B, +* +* where alpha is a scalar, X and B are m by n matrices, A is a unit, or +* non-unit, upper or lower triangular matrix and op( A ) is one of +* +* op( A ) = A or op( A ) = A'. +* +* The matrix X is overwritten on B. +* +* Parameters +* ========== +* +* SIDE - CHARACTER*1. +* On entry, SIDE specifies whether op( A ) appears on the left +* or right of X as follows: +* +* SIDE = 'L' or 'l' op( A )*X = alpha*B. +* +* SIDE = 'R' or 'r' X*op( A ) = alpha*B. +* +* Unchanged on exit. +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix A is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. 
+* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANSA - CHARACTER*1. +* On entry, TRANSA specifies the form of op( A ) to be used in +* the matrix multiplication as follows: +* +* TRANSA = 'N' or 'n' op( A ) = A. +* +* TRANSA = 'T' or 't' op( A ) = A'. +* +* TRANSA = 'C' or 'c' op( A ) = A'. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit triangular +* as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of B. M must be at +* least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of B. N must be +* at least zero. +* Unchanged on exit. +* +* ALPHA - DOUBLE PRECISION. +* On entry, ALPHA specifies the scalar alpha. When alpha is +* zero then A is not referenced and B need not be set before +* entry. +* Unchanged on exit. +* +* A - DOUBLE PRECISION array of DIMENSION ( LDA, k ), where k is m +* when SIDE = 'L' or 'l' and is n when SIDE = 'R' or 'r'. +* Before entry with UPLO = 'U' or 'u', the leading k by k +* upper triangular part of the array A must contain the upper +* triangular matrix and the strictly lower triangular part of +* A is not referenced. +* Before entry with UPLO = 'L' or 'l', the leading k by k +* lower triangular part of the array A must contain the lower +* triangular matrix and the strictly upper triangular part of +* A is not referenced. +* Note that when DIAG = 'U' or 'u', the diagonal elements of +* A are not referenced either, but are assumed to be unity. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When SIDE = 'L' or 'l' then +* LDA must be at least max( 1, m ), when SIDE = 'R' or 'r' +* then LDA must be at least max( 1, n ). +* Unchanged on exit. +* +* B - DOUBLE PRECISION array of DIMENSION ( LDB, n ). +* Before entry, the leading m by n part of the array B must +* contain the right-hand side matrix B, and on exit is +* overwritten by the solution matrix X. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. LDB must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Local Scalars .. + LOGICAL LSIDE, NOUNIT, UPPER + INTEGER I, INFO, J, K, NROWA + DOUBLE PRECISION TEMP +* .. Parameters .. + DOUBLE PRECISION ONE , ZERO + PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + LSIDE = LSAME( SIDE , 'L' ) + IF( LSIDE )THEN + NROWA = M + ELSE + NROWA = N + END IF + NOUNIT = LSAME( DIAG , 'N' ) + UPPER = LSAME( UPLO , 'U' ) +* + INFO = 0 + IF( ( .NOT.LSIDE ).AND. + $ ( .NOT.LSAME( SIDE , 'R' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO , 'L' ) ) )THEN + INFO = 2 + ELSE IF( ( .NOT.LSAME( TRANSA, 'N' ) ).AND. + $ ( .NOT.LSAME( TRANSA, 'T' ) ).AND. 
+ $ ( .NOT.LSAME( TRANSA, 'C' ) ) )THEN + INFO = 3 + ELSE IF( ( .NOT.LSAME( DIAG , 'U' ) ).AND. + $ ( .NOT.LSAME( DIAG , 'N' ) ) )THEN + INFO = 4 + ELSE IF( M .LT.0 )THEN + INFO = 5 + ELSE IF( N .LT.0 )THEN + INFO = 6 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 9 + ELSE IF( LDB.LT.MAX( 1, M ) )THEN + INFO = 11 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'DTRSM ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* +* And when alpha.eq.zero. +* + IF( ALPHA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, M + B( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + RETURN + END IF +* +* Start the operations. +* + IF( LSIDE )THEN + IF( LSAME( TRANSA, 'N' ) )THEN +* +* Form B := alpha*inv( A )*B. +* + IF( UPPER )THEN + DO 60, J = 1, N + IF( ALPHA.NE.ONE )THEN + DO 30, I = 1, M + B( I, J ) = ALPHA*B( I, J ) + 30 CONTINUE + END IF + DO 50, K = M, 1, -1 + IF( B( K, J ).NE.ZERO )THEN + IF( NOUNIT ) + $ B( K, J ) = B( K, J )/A( K, K ) + DO 40, I = 1, K - 1 + B( I, J ) = B( I, J ) - B( K, J )*A( I, K ) + 40 CONTINUE + END IF + 50 CONTINUE + 60 CONTINUE + ELSE + DO 100, J = 1, N + IF( ALPHA.NE.ONE )THEN + DO 70, I = 1, M + B( I, J ) = ALPHA*B( I, J ) + 70 CONTINUE + END IF + DO 90 K = 1, M + IF( B( K, J ).NE.ZERO )THEN + IF( NOUNIT ) + $ B( K, J ) = B( K, J )/A( K, K ) + DO 80, I = K + 1, M + B( I, J ) = B( I, J ) - B( K, J )*A( I, K ) + 80 CONTINUE + END IF + 90 CONTINUE + 100 CONTINUE + END IF + ELSE +* +* Form B := alpha*inv( A' )*B. +* + IF( UPPER )THEN + DO 130, J = 1, N + DO 120, I = 1, M + TEMP = ALPHA*B( I, J ) + DO 110, K = 1, I - 1 + TEMP = TEMP - A( K, I )*B( K, J ) + 110 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/A( I, I ) + B( I, J ) = TEMP + 120 CONTINUE + 130 CONTINUE + ELSE + DO 160, J = 1, N + DO 150, I = M, 1, -1 + TEMP = ALPHA*B( I, J ) + DO 140, K = I + 1, M + TEMP = TEMP - A( K, I )*B( K, J ) + 140 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/A( I, I ) + B( I, J ) = TEMP + 150 CONTINUE + 160 CONTINUE + END IF + END IF + ELSE + IF( LSAME( TRANSA, 'N' ) )THEN +* +* Form B := alpha*B*inv( A ). +* + IF( UPPER )THEN + DO 210, J = 1, N + IF( ALPHA.NE.ONE )THEN + DO 170, I = 1, M + B( I, J ) = ALPHA*B( I, J ) + 170 CONTINUE + END IF + DO 190, K = 1, J - 1 + IF( A( K, J ).NE.ZERO )THEN + DO 180, I = 1, M + B( I, J ) = B( I, J ) - A( K, J )*B( I, K ) + 180 CONTINUE + END IF + 190 CONTINUE + IF( NOUNIT )THEN + TEMP = ONE/A( J, J ) + DO 200, I = 1, M + B( I, J ) = TEMP*B( I, J ) + 200 CONTINUE + END IF + 210 CONTINUE + ELSE + DO 260, J = N, 1, -1 + IF( ALPHA.NE.ONE )THEN + DO 220, I = 1, M + B( I, J ) = ALPHA*B( I, J ) + 220 CONTINUE + END IF + DO 240, K = J + 1, N + IF( A( K, J ).NE.ZERO )THEN + DO 230, I = 1, M + B( I, J ) = B( I, J ) - A( K, J )*B( I, K ) + 230 CONTINUE + END IF + 240 CONTINUE + IF( NOUNIT )THEN + TEMP = ONE/A( J, J ) + DO 250, I = 1, M + B( I, J ) = TEMP*B( I, J ) + 250 CONTINUE + END IF + 260 CONTINUE + END IF + ELSE +* +* Form B := alpha*B*inv( A' ). 
+* + IF( UPPER )THEN + DO 310, K = N, 1, -1 + IF( NOUNIT )THEN + TEMP = ONE/A( K, K ) + DO 270, I = 1, M + B( I, K ) = TEMP*B( I, K ) + 270 CONTINUE + END IF + DO 290, J = 1, K - 1 + IF( A( J, K ).NE.ZERO )THEN + TEMP = A( J, K ) + DO 280, I = 1, M + B( I, J ) = B( I, J ) - TEMP*B( I, K ) + 280 CONTINUE + END IF + 290 CONTINUE + IF( ALPHA.NE.ONE )THEN + DO 300, I = 1, M + B( I, K ) = ALPHA*B( I, K ) + 300 CONTINUE + END IF + 310 CONTINUE + ELSE + DO 360, K = 1, N + IF( NOUNIT )THEN + TEMP = ONE/A( K, K ) + DO 320, I = 1, M + B( I, K ) = TEMP*B( I, K ) + 320 CONTINUE + END IF + DO 340, J = K + 1, N + IF( A( J, K ).NE.ZERO )THEN + TEMP = A( J, K ) + DO 330, I = 1, M + B( I, J ) = B( I, J ) - TEMP*B( I, K ) + 330 CONTINUE + END IF + 340 CONTINUE + IF( ALPHA.NE.ONE )THEN + DO 350, I = 1, M + B( I, K ) = ALPHA*B( I, K ) + 350 CONTINUE + END IF + 360 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of DTRSM . +* + END diff --git a/reference/dtrsvf.f b/reference/dtrsvf.f new file mode 100644 index 0000000000..2f4a702a1d --- /dev/null +++ b/reference/dtrsvf.f @@ -0,0 +1,289 @@ + SUBROUTINE DTRSVF ( UPLO, TRANS, DIAG, N, A, LDA, X, INCX ) +* .. Scalar Arguments .. + INTEGER INCX, LDA, N + CHARACTER*1 DIAG, TRANS, UPLO +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), X( * ) +* .. +* +* Purpose +* ======= +* +* DTRSV solves one of the systems of equations +* +* A*x = b, or A'*x = b, +* +* where b and x are n element vectors and A is an n by n unit, or +* non-unit, upper or lower triangular matrix. +* +* No test for singularity or near-singularity is included in this +* routine. Such tests must be performed before calling this routine. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the equations to be solved as +* follows: +* +* TRANS = 'N' or 'n' A*x = b. +* +* TRANS = 'T' or 't' A'*x = b. +* +* TRANS = 'C' or 'c' A'*x = b. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit +* triangular as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array A must contain the upper +* triangular matrix and the strictly lower triangular part of +* A is not referenced. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array A must contain the lower +* triangular matrix and the strictly upper triangular part of +* A is not referenced. +* Note that when DIAG = 'U' or 'u', the diagonal elements of +* A are not referenced either, but are assumed to be unity. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, n ). +* Unchanged on exit. +* +* X - DOUBLE PRECISION array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element right-hand side vector b. 
On exit, X is overwritten +* with the solution vector x. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + DOUBLE PRECISION ZERO + PARAMETER ( ZERO = 0.0D+0 ) +* .. Local Scalars .. + DOUBLE PRECISION TEMP + INTEGER I, INFO, IX, J, JX, KX + LOGICAL NOUNIT +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO , 'U' ).AND. + $ .NOT.LSAME( UPLO , 'L' ) )THEN + INFO = 1 + ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. + $ .NOT.LSAME( TRANS, 'C' ) )THEN + INFO = 2 + ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. + $ .NOT.LSAME( DIAG , 'N' ) )THEN + INFO = 3 + ELSE IF( N.LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, N ) )THEN + INFO = 6 + ELSE IF( INCX.EQ.0 )THEN + INFO = 8 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'DTRSV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* + NOUNIT = LSAME( DIAG, 'N' ) +* +* Set up the start point in X if the increment is not unity. This +* will be ( N - 1 )*INCX too small for descending loops. +* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through A. +* + IF( LSAME( TRANS, 'N' ) )THEN +* +* Form x := inv( A )*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + IF( INCX.EQ.1 )THEN + DO 20, J = N, 1, -1 + IF( X( J ).NE.ZERO )THEN + IF( NOUNIT ) + $ X( J ) = X( J )/A( J, J ) + TEMP = X( J ) + DO 10, I = J - 1, 1, -1 + X( I ) = X( I ) - TEMP*A( I, J ) + 10 CONTINUE + END IF + 20 CONTINUE + ELSE + JX = KX + ( N - 1 )*INCX + DO 40, J = N, 1, -1 + IF( X( JX ).NE.ZERO )THEN + IF( NOUNIT ) + $ X( JX ) = X( JX )/A( J, J ) + TEMP = X( JX ) + IX = JX + DO 30, I = J - 1, 1, -1 + IX = IX - INCX + X( IX ) = X( IX ) - TEMP*A( I, J ) + 30 CONTINUE + END IF + JX = JX - INCX + 40 CONTINUE + END IF + ELSE + IF( INCX.EQ.1 )THEN + DO 60, J = 1, N + IF( X( J ).NE.ZERO )THEN + IF( NOUNIT ) + $ X( J ) = X( J )/A( J, J ) + TEMP = X( J ) + DO 50, I = J + 1, N + X( I ) = X( I ) - TEMP*A( I, J ) + 50 CONTINUE + END IF + 60 CONTINUE + ELSE + JX = KX + DO 80, J = 1, N + IF( X( JX ).NE.ZERO )THEN + IF( NOUNIT ) + $ X( JX ) = X( JX )/A( J, J ) + TEMP = X( JX ) + IX = JX + DO 70, I = J + 1, N + IX = IX + INCX + X( IX ) = X( IX ) - TEMP*A( I, J ) + 70 CONTINUE + END IF + JX = JX + INCX + 80 CONTINUE + END IF + END IF + ELSE +* +* Form x := inv( A' )*x. 
+* + IF( LSAME( UPLO, 'U' ) )THEN + IF( INCX.EQ.1 )THEN + DO 100, J = 1, N + TEMP = X( J ) + DO 90, I = 1, J - 1 + TEMP = TEMP - A( I, J )*X( I ) + 90 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/A( J, J ) + X( J ) = TEMP + 100 CONTINUE + ELSE + JX = KX + DO 120, J = 1, N + TEMP = X( JX ) + IX = KX + DO 110, I = 1, J - 1 + TEMP = TEMP - A( I, J )*X( IX ) + IX = IX + INCX + 110 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/A( J, J ) + X( JX ) = TEMP + JX = JX + INCX + 120 CONTINUE + END IF + ELSE + IF( INCX.EQ.1 )THEN + DO 140, J = N, 1, -1 + TEMP = X( J ) + DO 130, I = N, J + 1, -1 + TEMP = TEMP - A( I, J )*X( I ) + 130 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/A( J, J ) + X( J ) = TEMP + 140 CONTINUE + ELSE + KX = KX + ( N - 1 )*INCX + JX = KX + DO 160, J = N, 1, -1 + TEMP = X( JX ) + IX = KX + DO 150, I = N, J + 1, -1 + TEMP = TEMP - A( I, J )*X( IX ) + IX = IX - INCX + 150 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/A( J, J ) + X( JX ) = TEMP + JX = JX - INCX + 160 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of DTRSV . +* + END diff --git a/reference/dtrti2f.f b/reference/dtrti2f.f new file mode 100644 index 0000000000..214d4f5233 --- /dev/null +++ b/reference/dtrti2f.f @@ -0,0 +1,146 @@ + SUBROUTINE DTRTI2F( UPLO, DIAG, N, A, LDA, INFO ) +* +* -- LAPACK routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* November 2006 +* +* .. Scalar Arguments .. + CHARACTER DIAG, UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* DTRTI2 computes the inverse of a real upper or lower triangular +* matrix. +* +* This is the Level 2 BLAS version of the algorithm. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* Specifies whether the matrix A is upper or lower triangular. +* = 'U': Upper triangular +* = 'L': Lower triangular +* +* DIAG (input) CHARACTER*1 +* Specifies whether or not the matrix A is unit triangular. +* = 'N': Non-unit triangular +* = 'U': Unit triangular +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* A (input/output) DOUBLE PRECISION array, dimension (LDA,N) +* On entry, the triangular matrix A. If UPLO = 'U', the +* leading n by n upper triangular part of the array A contains +* the upper triangular matrix, and the strictly lower +* triangular part of A is not referenced. If UPLO = 'L', the +* leading n by n lower triangular part of the array A contains +* the lower triangular matrix, and the strictly upper +* triangular part of A is not referenced. If DIAG = 'U', the +* diagonal elements of A are also not referenced and are +* assumed to be 1. +* +* On exit, the (triangular) inverse of the original matrix, in +* the same storage format. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -k, the k-th argument had an illegal value +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ONE + PARAMETER ( ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL NOUNIT, UPPER + INTEGER J + DOUBLE PRECISION AJJ +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL DSCAL, DTRMV, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + NOUNIT = LSAME( DIAG, 'N' ) + IF( .NOT.UPPER .AND. 
.NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -5 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DTRTI2', -INFO ) + RETURN + END IF +* + IF( UPPER ) THEN +* +* Compute inverse of upper triangular matrix. +* + DO 10 J = 1, N + IF( NOUNIT ) THEN + A( J, J ) = ONE / A( J, J ) + AJJ = -A( J, J ) + ELSE + AJJ = -ONE + END IF +* +* Compute elements 1:j-1 of j-th column. +* + CALL DTRMV( 'Upper', 'No transpose', DIAG, J-1, A, LDA, + $ A( 1, J ), 1 ) + CALL DSCAL( J-1, AJJ, A( 1, J ), 1 ) + 10 CONTINUE + ELSE +* +* Compute inverse of lower triangular matrix. +* + DO 20 J = N, 1, -1 + IF( NOUNIT ) THEN + A( J, J ) = ONE / A( J, J ) + AJJ = -A( J, J ) + ELSE + AJJ = -ONE + END IF + IF( J.LT.N ) THEN +* +* Compute elements j+1:n of j-th column. +* + CALL DTRMV( 'Lower', 'No transpose', DIAG, N-J, + $ A( J+1, J+1 ), LDA, A( J+1, J ), 1 ) + CALL DSCAL( N-J, AJJ, A( J+1, J ), 1 ) + END IF + 20 CONTINUE + END IF +* + RETURN +* +* End of DTRTI2 +* + END diff --git a/reference/dtrtrif.f b/reference/dtrtrif.f new file mode 100644 index 0000000000..e2af835270 --- /dev/null +++ b/reference/dtrtrif.f @@ -0,0 +1,176 @@ + SUBROUTINE DTRTRIF( UPLO, DIAG, N, A, LDA, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* March 31, 1993 +* +* .. Scalar Arguments .. + CHARACTER DIAG, UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* DTRTRI computes the inverse of a real upper or lower triangular +* matrix A. +* +* This is the Level 3 BLAS version of the algorithm. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* = 'U': A is upper triangular; +* = 'L': A is lower triangular. +* +* DIAG (input) CHARACTER*1 +* = 'N': A is non-unit triangular; +* = 'U': A is unit triangular. +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* A (input/output) DOUBLE PRECISION array, dimension (LDA,N) +* On entry, the triangular matrix A. If UPLO = 'U', the +* leading N-by-N upper triangular part of the array A contains +* the upper triangular matrix, and the strictly lower +* triangular part of A is not referenced. If UPLO = 'L', the +* leading N-by-N lower triangular part of the array A contains +* the lower triangular matrix, and the strictly upper +* triangular part of A is not referenced. If DIAG = 'U', the +* diagonal elements of A are also not referenced and are +* assumed to be 1. +* On exit, the (triangular) inverse of the original matrix, in +* the same storage format. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, A(i,i) is exactly zero. The triangular +* matrix is singular and its inverse can not be computed. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ONE, ZERO + PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL NOUNIT, UPPER + INTEGER J, JB, NB, NN +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL DTRMM, DTRSM, DTRTI2, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. 
+* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + NOUNIT = LSAME( DIAG, 'N' ) + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -5 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DTRTRI', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* +* Check for singularity if non-unit. +* + IF( NOUNIT ) THEN + DO 10 INFO = 1, N + IF( A( INFO, INFO ).EQ.ZERO ) + $ RETURN + 10 CONTINUE + INFO = 0 + END IF +* +* Determine the block size for this environment. +* + NB = 128 + IF( NB.LE.1 .OR. NB.GE.N ) THEN +* +* Use unblocked code +* + CALL DTRTI2( UPLO, DIAG, N, A, LDA, INFO ) + ELSE +* +* Use blocked code +* + IF( UPPER ) THEN +* +* Compute inverse of upper triangular matrix +* + DO 20 J = 1, N, NB + JB = MIN( NB, N-J+1 ) +* +* Compute rows 1:j-1 of current block column +* + CALL DTRMM( 'Left', 'Upper', 'No transpose', DIAG, J-1, + $ JB, ONE, A, LDA, A( 1, J ), LDA ) + CALL DTRSM( 'Right', 'Upper', 'No transpose', DIAG, J-1, + $ JB, -ONE, A( J, J ), LDA, A( 1, J ), LDA ) +* +* Compute inverse of current diagonal block +* + CALL DTRTI2( 'Upper', DIAG, JB, A( J, J ), LDA, INFO ) + 20 CONTINUE + ELSE +* +* Compute inverse of lower triangular matrix +* + NN = ( ( N-1 ) / NB )*NB + 1 + DO 30 J = NN, 1, -NB + JB = MIN( NB, N-J+1 ) + IF( J+JB.LE.N ) THEN +* +* Compute rows j+jb:n of current block column +* + CALL DTRMM( 'Left', 'Lower', 'No transpose', DIAG, + $ N-J-JB+1, JB, ONE, A( J+JB, J+JB ), LDA, + $ A( J+JB, J ), LDA ) + CALL DTRSM( 'Right', 'Lower', 'No transpose', DIAG, + $ N-J-JB+1, JB, -ONE, A( J, J ), LDA, + $ A( J+JB, J ), LDA ) + END IF +* +* Compute inverse of current diagonal block +* + CALL DTRTI2( 'Lower', DIAG, JB, A( J, J ), LDA, INFO ) + 30 CONTINUE + END IF + END IF +* + RETURN +* +* End of DTRTRI +* + END diff --git a/reference/dzamaxf.f b/reference/dzamaxf.f new file mode 100644 index 0000000000..e75cbc600d --- /dev/null +++ b/reference/dzamaxf.f @@ -0,0 +1,40 @@ + REAL*8 function dzamaxf(n,zx,incx) +c +c finds the index of element having max. absolute value. +c jack dongarra, 1/15/85. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + COMPLEX*16 zx(*) + integer i,incx,ix,n + double precision dcabs1 +c + dzamaxf = 0. + if( n.lt.1 .or. incx.le.0 )return + dzamaxf = dcabs1(zx(1)) + if(n.eq.1)return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + dzamaxf = dcabs1(zx(1)) + ix = ix + incx + do 10 i = 2,n + if(dcabs1(zx(ix)).le.dzamaxf) go to 5 + dzamaxf = i + dzamaxf = dcabs1(zx(ix)) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 dzamaxf = dcabs1(zx(1)) + do 30 i = 2,n + if(dcabs1(zx(i)).le.dzamaxf) go to 30 + dzamaxf = i + dzamaxf = dcabs1(zx(i)) + 30 continue + return + end diff --git a/reference/dzaminf.f b/reference/dzaminf.f new file mode 100644 index 0000000000..61f59e3e0b --- /dev/null +++ b/reference/dzaminf.f @@ -0,0 +1,38 @@ + REAL*8 function dzaminf(n,zx,incx) +c +c finds the index of element having min. absolute value. +c jack dongarra, 1/15/85. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + COMPLEX*16 zx(*) + integer i,incx,ix,n + double precision dcabs1 +c + dzaminf = 0. + if( n.lt.1 .or. 
incx.le.0 )return + dzaminf = dcabs1(zx(1)) + if(n.eq.1)return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + dzaminf = dcabs1(zx(1)) + ix = ix + incx + do 10 i = 2,n + if(dcabs1(zx(ix)).ge.dzaminf) go to 5 + dzaminf = dcabs1(zx(ix)) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 dzaminf = dcabs1(zx(1)) + do 30 i = 2,n + if(dcabs1(zx(i)).ge.dzaminf) go to 30 + dzaminf = dcabs1(zx(i)) + 30 continue + return + end diff --git a/reference/dzasumf.f b/reference/dzasumf.f new file mode 100644 index 0000000000..1b4dbdb453 --- /dev/null +++ b/reference/dzasumf.f @@ -0,0 +1,34 @@ + double precision function dzasumf(n,zx,incx) +c +c takes the sum of the absolute values. +c jack dongarra, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + double complex zx(*) + double precision stemp,dcabs1 + integer i,incx,ix,n +c + dzasumf = 0.0d0 + stemp = 0.0d0 + if( n.le.0 .or. incx.le.0 )return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + do 10 i = 1,n + stemp = stemp + dcabs1(zx(ix)) + ix = ix + incx + 10 continue + dzasumf = stemp + return +c +c code for increment equal to 1 +c + 20 do 30 i = 1,n + stemp = stemp + dcabs1(zx(i)) + 30 continue + dzasumf = stemp + return + end diff --git a/reference/dznrm2f.f b/reference/dznrm2f.f new file mode 100644 index 0000000000..1e9cba68e3 --- /dev/null +++ b/reference/dznrm2f.f @@ -0,0 +1,67 @@ + DOUBLE PRECISION FUNCTION DZNRM2F( N, X, INCX ) +* .. Scalar Arguments .. + INTEGER INCX, N +* .. Array Arguments .. + COMPLEX*16 X( * ) +* .. +* +* DZNRM2 returns the euclidean norm of a vector via the function +* name, so that +* +* DZNRM2 := sqrt( conjg( x' )*x ) +* +* +* +* -- This version written on 25-October-1982. +* Modified on 14-October-1993 to inline the call to ZLASSQ. +* Sven Hammarling, Nag Ltd. +* +* +* .. Parameters .. + DOUBLE PRECISION ONE , ZERO + PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) +* .. Local Scalars .. + INTEGER IX + DOUBLE PRECISION NORM, SCALE, SSQ, TEMP +* .. Intrinsic Functions .. + INTRINSIC ABS, DIMAG, DBLE, SQRT +* .. +* .. Executable Statements .. + IF( N.LT.1 .OR. INCX.LT.1 )THEN + NORM = ZERO + ELSE + SCALE = ZERO + SSQ = ONE +* The following loop is equivalent to this call to the LAPACK +* auxiliary routine: +* CALL ZLASSQ( N, X, INCX, SCALE, SSQ ) +* + DO 10, IX = 1, 1 + ( N - 1 )*INCX, INCX + IF( DBLE( X( IX ) ).NE.ZERO )THEN + TEMP = ABS( DBLE( X( IX ) ) ) + IF( SCALE.LT.TEMP )THEN + SSQ = ONE + SSQ*( SCALE/TEMP )**2 + SCALE = TEMP + ELSE + SSQ = SSQ + ( TEMP/SCALE )**2 + END IF + END IF + IF( DIMAG( X( IX ) ).NE.ZERO )THEN + TEMP = ABS( DIMAG( X( IX ) ) ) + IF( SCALE.LT.TEMP )THEN + SSQ = ONE + SSQ*( SCALE/TEMP )**2 + SCALE = TEMP + ELSE + SSQ = SSQ + ( TEMP/SCALE )**2 + END IF + END IF + 10 CONTINUE + NORM = SCALE * SQRT( SSQ ) + END IF +* + DZNRM2F = NORM + RETURN +* +* End of DZNRM2. +* + END diff --git a/reference/icamaxf.f b/reference/icamaxf.f new file mode 100644 index 0000000000..928ad32fc2 --- /dev/null +++ b/reference/icamaxf.f @@ -0,0 +1,41 @@ + integer function icamaxf(n,cx,incx) +c +c finds the index of element having max. absolute value. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + complex cx(*) + real smax + integer i,incx,ix,n + real scabs1 +c + icamaxf = 0 + if( n.lt.1 .or. 
incx.le.0 ) return + icamaxf = 1 + if(n.eq.1)return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + smax = scabs1(cx(1)) + ix = ix + incx + do 10 i = 2,n + if(scabs1(cx(ix)).le.smax) go to 5 + icamaxf = i + smax = scabs1(cx(ix)) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 smax = scabs1(cx(1)) + do 30 i = 2,n + if(scabs1(cx(i)).le.smax) go to 30 + icamaxf = i + smax = scabs1(cx(i)) + 30 continue + return + end diff --git a/reference/icaminf.f b/reference/icaminf.f new file mode 100644 index 0000000000..3535450686 --- /dev/null +++ b/reference/icaminf.f @@ -0,0 +1,41 @@ + integer function icaminf(n,cx,incx) +c +c finds the index of element having min. absolute value. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + complex cx(*) + real smin + integer i,incx,ix,n + real scabs1 +c + icaminf = 0 + if( n.lt.1 .or. incx.le.0 ) return + icaminf = 1 + if(n.eq.1)return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + smin = scabs1(cx(1)) + ix = ix + incx + do 10 i = 2,n + if(scabs1(cx(ix)).ge.smin) go to 5 + icaminf = i + smin = scabs1(cx(ix)) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 smin = scabs1(cx(1)) + do 30 i = 2,n + if(scabs1(cx(i)).ge.smin) go to 30 + icaminf = i + smin = scabs1(cx(i)) + 30 continue + return + end diff --git a/reference/idamaxf.f b/reference/idamaxf.f new file mode 100644 index 0000000000..e1359e542a --- /dev/null +++ b/reference/idamaxf.f @@ -0,0 +1,39 @@ + integer function idamaxf(n,dx,incx) +c +c finds the index of element having max. absolute value. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + double precision dx(*),dmax + integer i,incx,ix,n +c + idamaxf = 0 + if( n.lt.1 .or. incx.le.0 ) return + idamaxf = 1 + if(n.eq.1)return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + dmax = dabs(dx(1)) + ix = ix + incx + do 10 i = 2,n + if(dabs(dx(ix)).le.dmax) go to 5 + idamaxf = i + dmax = dabs(dx(ix)) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 dmax = dabs(dx(1)) + do 30 i = 2,n + if(dabs(dx(i)).le.dmax) go to 30 + idamaxf = i + dmax = dabs(dx(i)) + 30 continue + return + end diff --git a/reference/idaminf.f b/reference/idaminf.f new file mode 100644 index 0000000000..86e18cb8a3 --- /dev/null +++ b/reference/idaminf.f @@ -0,0 +1,39 @@ + integer function idaminf(n,dx,incx) +c +c finds the index of element having min. absolute value. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + double precision dx(*),dmin + integer i,incx,ix,n +c + idaminf = 0 + if( n.lt.1 .or. 
incx.le.0 ) return + idaminf = 1 + if(n.eq.1)return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + dmin = dabs(dx(1)) + ix = ix + incx + do 10 i = 2,n + if(dabs(dx(ix)).ge.dmin) go to 5 + idaminf = i + dmin = dabs(dx(ix)) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 dmin = dabs(dx(1)) + do 30 i = 2,n + if(dabs(dx(i)).ge.dmin) go to 30 + idaminf = i + dmin = dabs(dx(i)) + 30 continue + return + end diff --git a/reference/idmaxf.f b/reference/idmaxf.f new file mode 100644 index 0000000000..9b0d25c16b --- /dev/null +++ b/reference/idmaxf.f @@ -0,0 +1,39 @@ + integer function idmaxf(n,dx,incx) +c +c finds the index of element having max. value. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + double precision dx(*),dmax + integer i,incx,ix,n +c + idmaxf = 0 + if( n.lt.1 .or. incx.le.0 ) return + idmaxf = 1 + if(n.eq.1)return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + dmax = dx(1) + ix = ix + incx + do 10 i = 2,n + if(dx(ix).le.dmax) go to 5 + idmaxf = i + dmax = dx(ix) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 dmax = dx(1) + do 30 i = 2,n + if(dx(i).le.dmax) go to 30 + idmaxf = i + dmax = dx(i) + 30 continue + return + end diff --git a/reference/idminf.f b/reference/idminf.f new file mode 100644 index 0000000000..4ba0b5e6ca --- /dev/null +++ b/reference/idminf.f @@ -0,0 +1,39 @@ + integer function idminf(n,dx,incx) +c +c finds the index of element having min. value. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + double precision dx(*),dmin + integer i,incx,ix,n +c + idminf = 0 + if( n.lt.1 .or. incx.le.0 ) return + idminf = 1 + if(n.eq.1)return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + dmin = dx(1) + ix = ix + incx + do 10 i = 2,n + if(dx(ix).ge.dmin) go to 5 + idminf = i + dmin = dx(ix) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 dmin = dx(1) + do 30 i = 2,n + if(dx(i).ge.dmin) go to 30 + idminf = i + dmin = dx(i) + 30 continue + return + end diff --git a/reference/iqamaxf.f b/reference/iqamaxf.f new file mode 100644 index 0000000000..13e9fc71ea --- /dev/null +++ b/reference/iqamaxf.f @@ -0,0 +1,48 @@ + REAL*10 function qabs(dx) + REAL*10 dx + + qabs = dx + if (dx >= 0) return + qabs = -dx + return + end + + integer function iqamaxf(n,dx,incx) +c +c finds the index of element having max. absolute value. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + real*10 dx(*),dmax + integer i,incx,ix,n +c + iqamaxf = 0 + if( n.lt.1 .or. 
incx.le.0 ) return + iqamaxf = 1 + if(n.eq.1)return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + dmax = qabs(dx(1)) + ix = ix + incx + do 10 i = 2,n + if(qabs(dx(ix)).le.dmax) go to 5 + iqamaxf = i + dmax = qabs(dx(ix)) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 dmax = qabs(dx(1)) + do 30 i = 2,n + if(qabs(dx(i)).le.dmax) go to 30 + iqamaxf = i + dmax = qabs(dx(i)) + 30 continue + return + end diff --git a/reference/iqaminf.f b/reference/iqaminf.f new file mode 100644 index 0000000000..1429be7fb3 --- /dev/null +++ b/reference/iqaminf.f @@ -0,0 +1,49 @@ + REAL*10 function qabs(dx) + REAL*10 dx + + qabs = dx + if (dx >= 0) return + qabs = -dx + return + end + + + integer function iqaminf(n,dx,incx) +c +c finds the index of element having min. absolute value. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + real*10 dx(*),dmin + integer i,incx,ix,n +c + iqaminf = 0 + if( n.lt.1 .or. incx.le.0 ) return + iqaminf = 1 + if(n.eq.1)return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + dmin = qabs(dx(1)) + ix = ix + incx + do 10 i = 2,n + if(qabs(dx(ix)).ge.dmin) go to 5 + iqaminf = i + dmin = qabs(dx(ix)) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 dmin = qabs(dx(1)) + do 30 i = 2,n + if(qabs(dx(i)).ge.dmin) go to 30 + iqaminf = i + dmin = qabs(dx(i)) + 30 continue + return + end diff --git a/reference/iqmaxf.f b/reference/iqmaxf.f new file mode 100644 index 0000000000..782e4f2cd6 --- /dev/null +++ b/reference/iqmaxf.f @@ -0,0 +1,39 @@ + integer function iqmaxf(n,dx,incx) +c +c finds the index of element having max. value. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + real*10 dx(*),dmax + integer i,incx,ix,n +c + iqmaxf = 0 + if( n.lt.1 .or. incx.le.0 ) return + iqmaxf = 1 + if(n.eq.1)return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + dmax = dx(1) + ix = ix + incx + do 10 i = 2,n + if(dx(ix).le.dmax) go to 5 + iqmaxf = i + dmax = dx(ix) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 dmax = dx(1) + do 30 i = 2,n + if(dx(i).le.dmax) go to 30 + iqmaxf = i + dmax = dx(i) + 30 continue + return + end diff --git a/reference/iqminf.f b/reference/iqminf.f new file mode 100644 index 0000000000..bc75c2bcb2 --- /dev/null +++ b/reference/iqminf.f @@ -0,0 +1,39 @@ + integer function iqminf(n,dx,incx) +c +c finds the index of element having min. value. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + real*10 dx(*),dmin + integer i,incx,ix,n +c + iqminf = 0 + if( n.lt.1 .or. 
incx.le.0 ) return + iqminf = 1 + if(n.eq.1)return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + dmin = dx(1) + ix = ix + incx + do 10 i = 2,n + if(dx(ix).ge.dmin) go to 5 + iqminf = i + dmin = dx(ix) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 dmin = dx(1) + do 30 i = 2,n + if(dx(i).ge.dmin) go to 30 + iqminf = i + dmin = dx(i) + 30 continue + return + end diff --git a/reference/isamaxf.f b/reference/isamaxf.f new file mode 100644 index 0000000000..95be5a55af --- /dev/null +++ b/reference/isamaxf.f @@ -0,0 +1,39 @@ + integer function isamaxf(n,sx,incx) +c +c finds the index of element having max. absolute value. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + real sx(*),smax + integer i,incx,ix,n +c + isamaxf = 0 + if( n.lt.1 .or. incx.le.0 ) return + isamaxf = 1 + if(n.eq.1)return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + smax = abs(sx(1)) + ix = ix + incx + do 10 i = 2,n + if(abs(sx(ix)).le.smax) go to 5 + isamaxf = i + smax = abs(sx(ix)) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 smax = abs(sx(1)) + do 30 i = 2,n + if(abs(sx(i)).le.smax) go to 30 + isamaxf = i + smax = abs(sx(i)) + 30 continue + return + end diff --git a/reference/isaminf.f b/reference/isaminf.f new file mode 100644 index 0000000000..83eb129d53 --- /dev/null +++ b/reference/isaminf.f @@ -0,0 +1,39 @@ + integer function isaminf(n,sx,incx) +c +c finds the index of element having min. absolute value. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + real sx(*),smin + integer i,incx,ix,n +c + isaminf = 0 + if( n.lt.1 .or. incx.le.0 ) return + isaminf = 1 + if(n.eq.1)return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + smin = abs(sx(1)) + ix = ix + incx + do 10 i = 2,n + if(abs(sx(ix)).ge.smin) go to 5 + isaminf = i + smin = abs(sx(ix)) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 smin = abs(sx(1)) + do 30 i = 2,n + if(abs(sx(i)).ge.smin) go to 30 + isaminf = i + smin = abs(sx(i)) + 30 continue + return + end diff --git a/reference/ismaxf.f b/reference/ismaxf.f new file mode 100644 index 0000000000..63cab5f776 --- /dev/null +++ b/reference/ismaxf.f @@ -0,0 +1,39 @@ + integer function ismaxf(n,sx,incx) +c +c finds the index of element having max. value. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + real sx(*),smax + integer i,incx,ix,n +c + ismaxf = 0 + if( n.lt.1 .or. incx.le.0 ) return + ismaxf = 1 + if(n.eq.1)return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + smax = sx(1) + ix = ix + incx + do 10 i = 2,n + if(sx(ix).le.smax) go to 5 + ismaxf = i + smax = sx(ix) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 smax = sx(1) + do 30 i = 2,n + if(sx(i).le.smax) go to 30 + ismaxf = i + smax = sx(i) + 30 continue + return + end diff --git a/reference/isminf.f b/reference/isminf.f new file mode 100644 index 0000000000..dc59801751 --- /dev/null +++ b/reference/isminf.f @@ -0,0 +1,39 @@ + integer function isminf(n,sx,incx) +c +c finds the index of element having min. value. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 
0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + real sx(*),smin + integer i,incx,ix,n +c + isminf = 0 + if( n.lt.1 .or. incx.le.0 ) return + isminf = 1 + if(n.eq.1)return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + smin = sx(1) + ix = ix + incx + do 10 i = 2,n + if(sx(ix).ge.smin) go to 5 + isminf = i + smin = sx(ix) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 smin = sx(1) + do 30 i = 2,n + if(sx(i).ge.smin) go to 30 + isminf = i + smin = sx(i) + 30 continue + return + end diff --git a/reference/ixamaxf.f b/reference/ixamaxf.f new file mode 100644 index 0000000000..536602f058 --- /dev/null +++ b/reference/ixamaxf.f @@ -0,0 +1,41 @@ + integer function ixamaxf(n,zx,incx) +c +c finds the index of element having max. absolute value. +c jack dongarra, 1/15/85. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + complex*20 zx(*) + real*10 smax + integer i,incx,ix,n + real*10 qcabs1 +c + ixamaxf = 0 + if( n.lt.1 .or. incx.le.0 )return + ixamaxf = 1 + if(n.eq.1)return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + smax = qcabs1(zx(1)) + ix = ix + incx + do 10 i = 2,n + if(qcabs1(zx(ix)).le.smax) go to 5 + ixamaxf = i + smax = qcabs1(zx(ix)) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 smax = qcabs1(zx(1)) + do 30 i = 2,n + if(qcabs1(zx(i)).le.smax) go to 30 + ixamaxf = i + smax = qcabs1(zx(i)) + 30 continue + return + end diff --git a/reference/ixaminf.f b/reference/ixaminf.f new file mode 100644 index 0000000000..8112e8b976 --- /dev/null +++ b/reference/ixaminf.f @@ -0,0 +1,41 @@ + integer function ixaminf(n,zx,incx) +c +c finds the index of element having min. absolute value. +c jack dongarra, 1/15/85. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + complex*20 zx(*) + real*10 smin + integer i,incx,ix,n + real*10 qcabs1 +c + ixaminf = 0 + if( n.lt.1 .or. incx.le.0 )return + ixaminf = 1 + if(n.eq.1)return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + smin = qcabs1(zx(1)) + ix = ix + incx + do 10 i = 2,n + if(qcabs1(zx(ix)).ge.smin) go to 5 + ixaminf = i + smin = qcabs1(zx(ix)) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 smin = qcabs1(zx(1)) + do 30 i = 2,n + if(qcabs1(zx(i)).ge.smin) go to 30 + ixaminf = i + smin = qcabs1(zx(i)) + 30 continue + return + end diff --git a/reference/izamaxf.f b/reference/izamaxf.f new file mode 100644 index 0000000000..902c014a4e --- /dev/null +++ b/reference/izamaxf.f @@ -0,0 +1,41 @@ + integer function izamaxf(n,zx,incx) +c +c finds the index of element having max. absolute value. +c jack dongarra, 1/15/85. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + double complex zx(*) + double precision smax + integer i,incx,ix,n + double precision dcabs1 +c + izamaxf = 0 + if( n.lt.1 .or. 
incx.le.0 )return + izamaxf = 1 + if(n.eq.1)return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + smax = dcabs1(zx(1)) + ix = ix + incx + do 10 i = 2,n + if(dcabs1(zx(ix)).le.smax) go to 5 + izamaxf = i + smax = dcabs1(zx(ix)) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 smax = dcabs1(zx(1)) + do 30 i = 2,n + if(dcabs1(zx(i)).le.smax) go to 30 + izamaxf = i + smax = dcabs1(zx(i)) + 30 continue + return + end diff --git a/reference/izaminf.f b/reference/izaminf.f new file mode 100644 index 0000000000..8779849379 --- /dev/null +++ b/reference/izaminf.f @@ -0,0 +1,41 @@ + integer function izaminf(n,zx,incx) +c +c finds the index of element having min. absolute value. +c jack dongarra, 1/15/85. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + double complex zx(*) + double precision smin + integer i,incx,ix,n + double precision dcabs1 +c + izaminf = 0 + if( n.lt.1 .or. incx.le.0 )return + izaminf = 1 + if(n.eq.1)return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + smin = dcabs1(zx(1)) + ix = ix + incx + do 10 i = 2,n + if(dcabs1(zx(ix)).ge.smin) go to 5 + izaminf = i + smin = dcabs1(zx(ix)) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 smin = dcabs1(zx(1)) + do 30 i = 2,n + if(dcabs1(zx(i)).ge.smin) go to 30 + izaminf = i + smin = dcabs1(zx(i)) + 30 continue + return + end diff --git a/reference/lsamef.f b/reference/lsamef.f new file mode 100644 index 0000000000..f895174605 --- /dev/null +++ b/reference/lsamef.f @@ -0,0 +1,87 @@ + LOGICAL FUNCTION LSAME( CA, CB ) +* +* -- LAPACK auxiliary routine (version 2.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* January 31, 1994 +* +* .. Scalar Arguments .. + CHARACTER CA, CB +* .. +* +* Purpose +* ======= +* +* LSAME returns .TRUE. if CA is the same letter as CB regardless of +* case. +* +* Arguments +* ========= +* +* CA (input) CHARACTER*1 +* CB (input) CHARACTER*1 +* CA and CB specify the single characters to be compared. +* +* ===================================================================== +* +* .. Intrinsic Functions .. + INTRINSIC ICHAR +* .. +* .. Local Scalars .. + INTEGER INTA, INTB, ZCODE +* .. +* .. Executable Statements .. +* +* Test if the characters are equal +* + LSAME = CA.EQ.CB + IF( LSAME ) + $ RETURN +* +* Now test for equivalence if both characters are alphabetic. +* + ZCODE = ICHAR( 'Z' ) +* +* Use 'Z' rather than 'A' so that ASCII can be detected on Prime +* machines, on which ICHAR returns a value with bit 8 set. +* ICHAR('A') on Prime machines returns 193 which is the same as +* ICHAR('A') on an EBCDIC machine. +* + INTA = ICHAR( CA ) + INTB = ICHAR( CB ) +* + IF( ZCODE.EQ.90 .OR. ZCODE.EQ.122 ) THEN +* +* ASCII is assumed - ZCODE is the ASCII code of either lower or +* upper case 'Z'. +* + IF( INTA.GE.97 .AND. INTA.LE.122 ) INTA = INTA - 32 + IF( INTB.GE.97 .AND. INTB.LE.122 ) INTB = INTB - 32 +* + ELSE IF( ZCODE.EQ.233 .OR. ZCODE.EQ.169 ) THEN +* +* EBCDIC is assumed - ZCODE is the EBCDIC code of either lower or +* upper case 'Z'. +* + IF( INTA.GE.129 .AND. INTA.LE.137 .OR. + $ INTA.GE.145 .AND. INTA.LE.153 .OR. + $ INTA.GE.162 .AND. INTA.LE.169 ) INTA = INTA + 64 + IF( INTB.GE.129 .AND. INTB.LE.137 .OR. + $ INTB.GE.145 .AND. INTB.LE.153 .OR. + $ INTB.GE.162 .AND. INTB.LE.169 ) INTB = INTB + 64 +* + ELSE IF( ZCODE.EQ.218 .OR. 
ZCODE.EQ.250 ) THEN +* +* ASCII is assumed, on Prime machines - ZCODE is the ASCII code +* plus 128 of either lower or upper case 'Z'. +* + IF( INTA.GE.225 .AND. INTA.LE.250 ) INTA = INTA - 32 + IF( INTB.GE.225 .AND. INTB.LE.250 ) INTB = INTB - 32 + END IF + LSAME = INTA.EQ.INTB +* +* RETURN +* +* End of LSAME +* + END diff --git a/reference/samaxf.f b/reference/samaxf.f new file mode 100644 index 0000000000..ef0b80f4d4 --- /dev/null +++ b/reference/samaxf.f @@ -0,0 +1,36 @@ + REAL*4 function samaxf(n,dx,incx) +c +c finds the index of element having max. absolute value. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + REAL*4 dx(*) + integer i,incx,ix,n +c + samaxf = 0. + if( n.lt.1 .or. incx.le.0 ) return + + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + samaxf = abs(dx(1)) + ix = ix + incx + do 10 i = 2,n + if(abs(dx(ix)).le.samaxf) go to 5 + samaxf = abs(dx(ix)) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 samaxf = abs(dx(1)) + do 30 i = 2,n + if(abs(dx(i)).le.samaxf) go to 30 + samaxf = abs(dx(i)) + 30 continue + return + end diff --git a/reference/saminf.f b/reference/saminf.f new file mode 100644 index 0000000000..455436b291 --- /dev/null +++ b/reference/saminf.f @@ -0,0 +1,36 @@ + REAL*4 function saminf(n,dx,incx) +c +c finds the index of element having min. absolute value. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + REAL*4 dx(*) + integer i,incx,ix,n +c + saminf = 0 + if( n.lt.1 .or. incx.le.0 ) return + + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + saminf = abs(dx(1)) + ix = ix + incx + do 10 i = 2,n + if(abs(dx(ix)).ge.saminf) go to 5 + saminf = abs(dx(ix)) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 saminf = abs(dx(1)) + do 30 i = 2,n + if(abs(dx(i)).ge.saminf) go to 30 + saminf = abs(dx(i)) + 30 continue + return + end diff --git a/reference/sasumf.f b/reference/sasumf.f new file mode 100644 index 0000000000..bf3805b405 --- /dev/null +++ b/reference/sasumf.f @@ -0,0 +1,44 @@ + real function sasumf(n,sx,incx) +c +c takes the sum of the absolute values. +c uses unrolled loops for increment equal to one. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + real sx(*),stemp + integer i,incx,m,mp1,n,nincx +c + sasumf = 0.0e0 + stemp = 0.0e0 + if( n.le.0 .or. incx.le.0 )return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + nincx = n*incx + do 10 i = 1,nincx,incx + stemp = stemp + abs(sx(i)) + 10 continue + sasumf = stemp + return +c +c code for increment equal to 1 +c +c +c clean-up loop +c + 20 m = mod(n,6) + if( m .eq. 0 ) go to 40 + do 30 i = 1,m + stemp = stemp + abs(sx(i)) + 30 continue + if( n .lt. 6 ) go to 60 + 40 mp1 = m + 1 + do 50 i = mp1,n,6 + stemp = stemp + abs(sx(i)) + abs(sx(i + 1)) + abs(sx(i + 2)) + * + abs(sx(i + 3)) + abs(sx(i + 4)) + abs(sx(i + 5)) + 50 continue + 60 sasumf = stemp + return + end diff --git a/reference/saxpyf.f b/reference/saxpyf.f new file mode 100644 index 0000000000..95f1e01db9 --- /dev/null +++ b/reference/saxpyf.f @@ -0,0 +1,48 @@ + subroutine saxpyf(n,sa,sx,incx,sy,incy) +c +c constant times a vector plus a vector. +c uses unrolled loop for increments equal to one. 
+c jack dongarra, linpack, 3/11/78. +c modified 12/3/93, array(1) declarations changed to array(*) +c + real sx(*),sy(*),sa + integer i,incx,incy,ix,iy,m,mp1,n +c + if(n.le.0)return + if (sa .eq. 0.0) return + if(incx.eq.1.and.incy.eq.1)go to 20 +c +c code for unequal increments or equal increments +c not equal to 1 +c + ix = 1 + iy = 1 + if(incx.lt.0)ix = (-n+1)*incx + 1 + if(incy.lt.0)iy = (-n+1)*incy + 1 + do 10 i = 1,n + sy(iy) = sy(iy) + sa*sx(ix) + ix = ix + incx + iy = iy + incy + 10 continue + return +c +c code for both increments equal to 1 +c +c +c clean-up loop +c + 20 m = mod(n,4) + if( m .eq. 0 ) go to 40 + do 30 i = 1,m + sy(i) = sy(i) + sa*sx(i) + 30 continue + if( n .lt. 4 ) return + 40 mp1 = m + 1 + do 50 i = mp1,n,4 + sy(i) = sy(i) + sa*sx(i) + sy(i + 1) = sy(i + 1) + sa*sx(i + 1) + sy(i + 2) = sy(i + 2) + sa*sx(i + 2) + sy(i + 3) = sy(i + 3) + sa*sx(i + 3) + 50 continue + return + end diff --git a/reference/scamaxf.f b/reference/scamaxf.f new file mode 100644 index 0000000000..f3d0a5149f --- /dev/null +++ b/reference/scamaxf.f @@ -0,0 +1,40 @@ + REAL*4 function scamaxf(n,zx,incx) +c +c finds the index of element having max. absolute value. +c jack dongarra, 1/15/85. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + COMPLEX*8 zx(*) + integer i,incx,ix,n + REAL*4 scabs1 +c + scamaxf = 0. + if( n.lt.1 .or. incx.le.0 )return + scamaxf = scabs1(zx(1)) + if(n.eq.1)return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + scamaxf = scabs1(zx(1)) + ix = ix + incx + do 10 i = 2,n + if(scabs1(zx(ix)).le.scamaxf) go to 5 + scamaxf = i + scamaxf = scabs1(zx(ix)) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 scamaxf = scabs1(zx(1)) + do 30 i = 2,n + if(scabs1(zx(i)).le.scamaxf) go to 30 + scamaxf = i + scamaxf = scabs1(zx(i)) + 30 continue + return + end diff --git a/reference/scaminf.f b/reference/scaminf.f new file mode 100644 index 0000000000..e6a6e91c5a --- /dev/null +++ b/reference/scaminf.f @@ -0,0 +1,38 @@ + REAL*4 function scaminf(n,zx,incx) +c +c finds the index of element having min. absolute value. +c jack dongarra, 1/15/85. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + COMPLEX*8 zx(*) + integer i,incx,ix,n + REAL*4 scabs1 +c + scaminf = 0. + if( n.lt.1 .or. incx.le.0 )return + scaminf = scabs1(zx(1)) + if(n.eq.1)return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + scaminf = scabs1(zx(1)) + ix = ix + incx + do 10 i = 2,n + if(scabs1(zx(ix)).ge.scaminf) go to 5 + scaminf = scabs1(zx(ix)) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 scaminf = scabs1(zx(1)) + do 30 i = 2,n + if(scabs1(zx(i)).ge.scaminf) go to 30 + scaminf = scabs1(zx(i)) + 30 continue + return + end diff --git a/reference/scasumf.f b/reference/scasumf.f new file mode 100644 index 0000000000..6cc139f14d --- /dev/null +++ b/reference/scasumf.f @@ -0,0 +1,34 @@ + real function scasumf(n,cx,incx) +c +c takes the sum of the absolute values of a complex vector and +c returns a single precision result. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + complex cx(*) + real stemp + integer i,incx,n,nincx +c + scasumf = 0.0e0 + stemp = 0.0e0 + if( n.le.0 .or. 
incx.le.0 )return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + nincx = n*incx + do 10 i = 1,nincx,incx + stemp = stemp + abs(real(cx(i))) + abs(aimag(cx(i))) + 10 continue + scasumf = stemp + return +c +c code for increment equal to 1 +c + 20 do 30 i = 1,n + stemp = stemp + abs(real(cx(i))) + abs(aimag(cx(i))) + 30 continue + scasumf = stemp + return + end diff --git a/reference/scnrm2f.f b/reference/scnrm2f.f new file mode 100644 index 0000000000..d7e0b379bb --- /dev/null +++ b/reference/scnrm2f.f @@ -0,0 +1,67 @@ + REAL FUNCTION SCNRM2F( N, X, INCX ) +* .. Scalar Arguments .. + INTEGER INCX, N +* .. Array Arguments .. + COMPLEX X( * ) +* .. +* +* SCNRM2 returns the euclidean norm of a vector via the function +* name, so that +* +* SCNRM2 := sqrt( conjg( x' )*x ) +* +* +* +* -- This version written on 25-October-1982. +* Modified on 14-October-1993 to inline the call to CLASSQ. +* Sven Hammarling, Nag Ltd. +* +* +* .. Parameters .. + REAL ONE , ZERO + PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) +* .. Local Scalars .. + INTEGER IX + REAL NORM, SCALE, SSQ, TEMP +* .. Intrinsic Functions .. + INTRINSIC ABS, AIMAG, REAL, SQRT +* .. +* .. Executable Statements .. + IF( N.LT.1 .OR. INCX.LT.1 )THEN + NORM = ZERO + ELSE + SCALE = ZERO + SSQ = ONE +* The following loop is equivalent to this call to the LAPACK +* auxiliary routine: +* CALL CLASSQ( N, X, INCX, SCALE, SSQ ) +* + DO 10, IX = 1, 1 + ( N - 1 )*INCX, INCX + IF( REAL( X( IX ) ).NE.ZERO )THEN + TEMP = ABS( REAL( X( IX ) ) ) + IF( SCALE.LT.TEMP )THEN + SSQ = ONE + SSQ*( SCALE/TEMP )**2 + SCALE = TEMP + ELSE + SSQ = SSQ + ( TEMP/SCALE )**2 + END IF + END IF + IF( AIMAG( X( IX ) ).NE.ZERO )THEN + TEMP = ABS( AIMAG( X( IX ) ) ) + IF( SCALE.LT.TEMP )THEN + SSQ = ONE + SSQ*( SCALE/TEMP )**2 + SCALE = TEMP + ELSE + SSQ = SSQ + ( TEMP/SCALE )**2 + END IF + END IF + 10 CONTINUE + NORM = SCALE * SQRT( SSQ ) + END IF +* + SCNRM2F = NORM + RETURN +* +* End of SCNRM2. +* + END diff --git a/reference/scopyf.f b/reference/scopyf.f new file mode 100644 index 0000000000..bec15844d4 --- /dev/null +++ b/reference/scopyf.f @@ -0,0 +1,50 @@ + subroutine scopyf(n,sx,incx,sy,incy) +c +c copies a vector, x, to a vector, y. +c uses unrolled loops for increments equal to 1. +c jack dongarra, linpack, 3/11/78. +c modified 12/3/93, array(1) declarations changed to array(*) +c + real sx(*),sy(*) + integer i,incx,incy,ix,iy,m,mp1,n +c + if(n.le.0)return + if(incx.eq.1.and.incy.eq.1)go to 20 +c +c code for unequal increments or equal increments +c not equal to 1 +c + ix = 1 + iy = 1 + if(incx.lt.0)ix = (-n+1)*incx + 1 + if(incy.lt.0)iy = (-n+1)*incy + 1 + do 10 i = 1,n + sy(iy) = sx(ix) + ix = ix + incx + iy = iy + incy + 10 continue + return +c +c code for both increments equal to 1 +c +c +c clean-up loop +c + 20 m = mod(n,7) + if( m .eq. 0 ) go to 40 + do 30 i = 1,m + sy(i) = sx(i) + 30 continue + if( n .lt. 7 ) return + 40 mp1 = m + 1 + do 50 i = mp1,n,7 + sy(i) = sx(i) + sy(i + 1) = sx(i + 1) + sy(i + 2) = sx(i + 2) + sy(i + 3) = sx(i + 3) + sy(i + 4) = sx(i + 4) + sy(i + 5) = sx(i + 5) + sy(i + 6) = sx(i + 6) + 50 continue + return + end diff --git a/reference/sdotf.f b/reference/sdotf.f new file mode 100644 index 0000000000..dabda7c4c7 --- /dev/null +++ b/reference/sdotf.f @@ -0,0 +1,49 @@ + real function sdotf(n,sx,incx,sy,incy) +c +c forms the dot product of two vectors. +c uses unrolled loops for increments equal to one. +c jack dongarra, linpack, 3/11/78. 
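SCNRM2F keeps a running scale factor SCALE and a scaled sum of squares SSQ (the inlined CLASSQ update above), so the intermediate squares never overflow or underflow; the norm is recovered as SCALE*SQRT(SSQ). A hedged sketch of a caller, with data chosen so the exact answer is 13.0 (program name and values are illustrative only):

      PROGRAM TNRM2
      COMPLEX X(3)
      REAL SCNRM2F
      DATA X / (3.0,4.0), (0.0,0.0), (0.0,12.0) /
c     ||x||_2 = sqrt(9 + 16 + 144) = 13
      WRITE(*,*) 'norm =', SCNRM2F(3, X, 1)
      END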
+c modified 12/3/93, array(1) declarations changed to array(*) +c + real sx(*),sy(*),stemp + integer i,incx,incy,ix,iy,m,mp1,n +c + stemp = 0.0e0 + sdotf = 0.0e0 + if(n.le.0)return + if(incx.eq.1.and.incy.eq.1)go to 20 +c +c code for unequal increments or equal increments +c not equal to 1 +c + ix = 1 + iy = 1 + if(incx.lt.0)ix = (-n+1)*incx + 1 + if(incy.lt.0)iy = (-n+1)*incy + 1 + do 10 i = 1,n + stemp = stemp + sx(ix)*sy(iy) + ix = ix + incx + iy = iy + incy + 10 continue + sdotf = stemp + return +c +c code for both increments equal to 1 +c +c +c clean-up loop +c + 20 m = mod(n,5) + if( m .eq. 0 ) go to 40 + do 30 i = 1,m + stemp = stemp + sx(i)*sy(i) + 30 continue + if( n .lt. 5 ) go to 60 + 40 mp1 = m + 1 + do 50 i = mp1,n,5 + stemp = stemp + sx(i)*sy(i) + sx(i + 1)*sy(i + 1) + + * sx(i + 2)*sy(i + 2) + sx(i + 3)*sy(i + 3) + sx(i + 4)*sy(i + 4) + 50 continue + 60 sdotf = stemp + return + end diff --git a/reference/sdsdotf.f b/reference/sdsdotf.f new file mode 100644 index 0000000000..c3aa6a53b3 --- /dev/null +++ b/reference/sdsdotf.f @@ -0,0 +1,78 @@ +*DECK SDSDOTF + REAL FUNCTION SDSDOTF (N, SB, SX, INCX, SY, INCY) +C***BEGIN PROLOGUE SDSDOT +C***PURPOSE Compute the inner product of two vectors with extended +C precision accumulation. +C***LIBRARY SLATEC (BLAS) +C***CATEGORY D1A4 +C***TYPE SINGLE PRECISION (SDSDOT-S, CDCDOT-C) +C***KEYWORDS BLAS, DOT PRODUCT, INNER PRODUCT, LINEAR ALGEBRA, VECTOR +C***AUTHOR Lawson, C. L., (JPL) +C Hanson, R. J., (SNLA) +C Kincaid, D. R., (U. of Texas) +C Krogh, F. T., (JPL) +C***DESCRIPTION +C +C B L A S Subprogram +C Description of Parameters +C +C --Input-- +C N number of elements in input vector(s) +C SB single precision scalar to be added to inner product +C SX single precision vector with N elements +C INCX storage spacing between elements of SX +C SY single precision vector with N elements +C INCY storage spacing between elements of SY +C +C --Output-- +C SDSDOT single precision dot product (SB if N .LE. 0) +C +C Returns S.P. result with dot product accumulated in D.P. +C SDSDOT = SB + sum for I = 0 to N-1 of SX(LX+I*INCX)*SY(LY+I*INCY), +C where LX = 1 if INCX .GE. 0, else LX = 1+(1-N)*INCX, and LY is +C defined in a similar way using INCY. +C +C***REFERENCES C. L. Lawson, R. J. Hanson, D. R. Kincaid and F. T. +C Krogh, Basic linear algebra subprograms for Fortran +C usage, Algorithm No. 539, Transactions on Mathematical +C Software 5, 3 (September 1979), pp. 308-323. +C***ROUTINES CALLED (NONE) +C***REVISION HISTORY (YYMMDD) +C 791001 DATE WRITTEN +C 890531 Changed all specific intrinsics to generic. (WRB) +C 890831 Modified array declarations. (WRB) +C 890831 REVISION DATE from Version 3.2 +C 891214 Prologue converted to Version 4.0 format. (BAB) +C 920310 Corrected definition of LX in DESCRIPTION. (WRB) +C 920501 Reformatted the REFERENCES section. (WRB) +C***END PROLOGUE SDSDOT + REAL SX(*), SY(*), SB + DOUBLE PRECISION DSDOT +C***FIRST EXECUTABLE STATEMENT SDSDOT + DSDOT = SB + IF (N .LE. 0) GO TO 30 + IF (INCX.EQ.INCY .AND. INCX.GT.0) GO TO 40 +C +C Code for unequal or nonpositive increments. +C + KX = 1 + KY = 1 + IF (INCX .LT. 0) KX = 1+(1-N)*INCX + IF (INCY .LT. 0) KY = 1+(1-N)*INCY + DO 10 I = 1,N + DSDOT = DSDOT + DBLE(SX(KX))*DBLE(SY(KY)) + KX = KX + INCX + KY = KY + INCY + 10 CONTINUE + 30 SDSDOTF = DSDOT + RETURN +C +C Code for equal and positive increments. 
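SDSDOTF differs from SDOTF in that the products are accumulated in DOUBLE PRECISION and the scalar SB is added before rounding back to single precision, which helps when the single-precision sum would lose digits to cancellation; with SB = 0 both return the same mathematical value. An illustrative driver (names and data made up):

      PROGRAM TDOT
      REAL SX(3), SY(3)
      REAL SDOTF, SDSDOTF
      DATA SX / 1.0, 2.0, 3.0 /
      DATA SY / 4.0, 5.0, 6.0 /
c     both lines should print 32.0 for this data
      WRITE(*,*) 'sdot   =', SDOTF(3, SX, 1, SY, 1)
      WRITE(*,*) 'sdsdot =', SDSDOTF(3, 0.0, SX, 1, SY, 1)
      END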
+C + 40 NS = N*INCX + DO 50 I = 1,NS,INCX + DSDOT = DSDOT + DBLE(SX(I))*DBLE(SY(I)) + 50 CONTINUE + SDSDOTF = DSDOT + RETURN + END diff --git a/reference/sgbmvf.f b/reference/sgbmvf.f new file mode 100644 index 0000000000..c8bc9ff097 --- /dev/null +++ b/reference/sgbmvf.f @@ -0,0 +1,300 @@ + SUBROUTINE SGBMVF( TRANS, M, N, KL, KU, ALPHA, A, LDA, X, INCX, + $ BETA, Y, INCY ) +* .. Scalar Arguments .. + REAL ALPHA, BETA + INTEGER INCX, INCY, KL, KU, LDA, M, N + CHARACTER*1 TRANS +* .. Array Arguments .. + REAL A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* SGBMV performs one of the matrix-vector operations +* +* y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y, +* +* where alpha and beta are scalars, x and y are vectors and A is an +* m by n band matrix, with kl sub-diagonals and ku super-diagonals. +* +* Parameters +* ========== +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' y := alpha*A*x + beta*y. +* +* TRANS = 'T' or 't' y := alpha*A'*x + beta*y. +* +* TRANS = 'C' or 'c' y := alpha*A'*x + beta*y. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix A. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* KL - INTEGER. +* On entry, KL specifies the number of sub-diagonals of the +* matrix A. KL must satisfy 0 .le. KL. +* Unchanged on exit. +* +* KU - INTEGER. +* On entry, KU specifies the number of super-diagonals of the +* matrix A. KU must satisfy 0 .le. KU. +* Unchanged on exit. +* +* ALPHA - REAL . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - REAL array of DIMENSION ( LDA, n ). +* Before entry, the leading ( kl + ku + 1 ) by n part of the +* array A must contain the matrix of coefficients, supplied +* column by column, with the leading diagonal of the matrix in +* row ( ku + 1 ) of the array, the first super-diagonal +* starting at position 2 in row ku, the first sub-diagonal +* starting at position 1 in row ( ku + 2 ), and so on. +* Elements in the array A that do not correspond to elements +* in the band matrix (such as the top left ku by ku triangle) +* are not referenced. +* The following program segment will transfer a band matrix +* from conventional full matrix storage to band storage: +* +* DO 20, J = 1, N +* K = KU + 1 - J +* DO 10, I = MAX( 1, J - KU ), MIN( M, J + KL ) +* A( K + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* ( kl + ku + 1 ). +* Unchanged on exit. +* +* X - REAL array of DIMENSION at least +* ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' +* and at least +* ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. +* Before entry, the incremented array X must contain the +* vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA - REAL . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then Y need not be set on input. +* Unchanged on exit. +* +* Y - REAL array of DIMENSION at least +* ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' +* and at least +* ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. 
+* Before entry, the incremented array Y must contain the +* vector y. On exit, Y is overwritten by the updated vector y. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* .. Parameters .. + REAL ONE , ZERO + PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) +* .. Local Scalars .. + REAL TEMP + INTEGER I, INFO, IX, IY, J, JX, JY, K, KUP1, KX, KY, + $ LENX, LENY +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. + $ .NOT.LSAME( TRANS, 'C' ) )THEN + INFO = 1 + ELSE IF( M.LT.0 )THEN + INFO = 2 + ELSE IF( N.LT.0 )THEN + INFO = 3 + ELSE IF( KL.LT.0 )THEN + INFO = 4 + ELSE IF( KU.LT.0 )THEN + INFO = 5 + ELSE IF( LDA.LT.( KL + KU + 1 ) )THEN + INFO = 8 + ELSE IF( INCX.EQ.0 )THEN + INFO = 10 + ELSE IF( INCY.EQ.0 )THEN + INFO = 13 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'SGBMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. + $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* Set LENX and LENY, the lengths of the vectors x and y, and set +* up the start points in X and Y. +* + IF( LSAME( TRANS, 'N' ) )THEN + LENX = N + LENY = M + ELSE + LENX = M + LENY = N + END IF + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( LENX - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( LENY - 1 )*INCY + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through the band part of A. +* +* First form y := beta*y. +* + IF( BETA.NE.ONE )THEN + IF( INCY.EQ.1 )THEN + IF( BETA.EQ.ZERO )THEN + DO 10, I = 1, LENY + Y( I ) = ZERO + 10 CONTINUE + ELSE + DO 20, I = 1, LENY + Y( I ) = BETA*Y( I ) + 20 CONTINUE + END IF + ELSE + IY = KY + IF( BETA.EQ.ZERO )THEN + DO 30, I = 1, LENY + Y( IY ) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40, I = 1, LENY + Y( IY ) = BETA*Y( IY ) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF( ALPHA.EQ.ZERO ) + $ RETURN + KUP1 = KU + 1 + IF( LSAME( TRANS, 'N' ) )THEN +* +* Form y := alpha*A*x + y. +* + JX = KX + IF( INCY.EQ.1 )THEN + DO 60, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*X( JX ) + K = KUP1 - J + DO 50, I = MAX( 1, J - KU ), MIN( M, J + KL ) + Y( I ) = Y( I ) + TEMP*A( K + I, J ) + 50 CONTINUE + END IF + JX = JX + INCX + 60 CONTINUE + ELSE + DO 80, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*X( JX ) + IY = KY + K = KUP1 - J + DO 70, I = MAX( 1, J - KU ), MIN( M, J + KL ) + Y( IY ) = Y( IY ) + TEMP*A( K + I, J ) + IY = IY + INCY + 70 CONTINUE + END IF + JX = JX + INCX + IF( J.GT.KU ) + $ KY = KY + INCY + 80 CONTINUE + END IF + ELSE +* +* Form y := alpha*A'*x + y. 
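For readers new to the band storage the SGBMV header documents, the sketch below packs a small tridiagonal matrix using exactly the loop given in that header and then forms y := A*x. Everything here (FULL, AB, the data) is illustrative only; the expected output is 1 0 0 1:

      PROGRAM TGBMV
      INTEGER M, N, KL, KU, LDA
      PARAMETER ( M = 4, N = 4, KL = 1, KU = 1, LDA = KL+KU+1 )
      REAL FULL(M,N), AB(LDA,N), X(N), Y(M)
      INTEGER I, J, K
c     full tridiagonal matrix: 2 on the diagonal, -1 beside it
      DO 20 J = 1, N
         DO 10 I = 1, M
            FULL(I,J) = 0.0
            IF( I.EQ.J ) FULL(I,J) = 2.0
            IF( ABS(I-J).EQ.1 ) FULL(I,J) = -1.0
   10    CONTINUE
   20 CONTINUE
c     pack into band storage as the SGBMV header describes
      DO 40 J = 1, N
         K = KU + 1 - J
         DO 30 I = MAX( 1, J - KU ), MIN( M, J + KL )
            AB( K + I, J ) = FULL( I, J )
   30    CONTINUE
   40 CONTINUE
      DO 50 I = 1, N
         X(I) = 1.0
         Y(I) = 0.0
   50 CONTINUE
      CALL SGBMVF( 'N', M, N, KL, KU, 1.0, AB, LDA, X, 1, 0.0, Y, 1 )
      WRITE(*,*) Y
      END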
+* + JY = KY + IF( INCX.EQ.1 )THEN + DO 100, J = 1, N + TEMP = ZERO + K = KUP1 - J + DO 90, I = MAX( 1, J - KU ), MIN( M, J + KL ) + TEMP = TEMP + A( K + I, J )*X( I ) + 90 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP + JY = JY + INCY + 100 CONTINUE + ELSE + DO 120, J = 1, N + TEMP = ZERO + IX = KX + K = KUP1 - J + DO 110, I = MAX( 1, J - KU ), MIN( M, J + KL ) + TEMP = TEMP + A( K + I, J )*X( IX ) + IX = IX + INCX + 110 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP + JY = JY + INCY + IF( J.GT.KU ) + $ KX = KX + INCX + 120 CONTINUE + END IF + END IF +* + RETURN +* +* End of SGBMV . +* + END diff --git a/reference/sgemmf.f b/reference/sgemmf.f new file mode 100644 index 0000000000..ebb50c3af0 --- /dev/null +++ b/reference/sgemmf.f @@ -0,0 +1,313 @@ + SUBROUTINE SGEMMF(TRANA,TRANB,M,N,K,ALPHA,A,LDA,B,LDB,BETA,C,LDC) +* .. Scalar Arguments .. + REAL ALPHA,BETA + INTEGER K,LDA,LDB,LDC,M,N + CHARACTER TRANA,TRANB +* .. +* .. Array Arguments .. + REAL A(LDA,*),B(LDB,*),C(LDC,*) +* .. +* +* Purpose +* ======= +* +* SGEMM performs one of the matrix-matrix operations +* +* C := alpha*op( A )*op( B ) + beta*C, +* +* where op( X ) is one of +* +* op( X ) = X or op( X ) = X', +* +* alpha and beta are scalars, and A, B and C are matrices, with op( A ) +* an m by k matrix, op( B ) a k by n matrix and C an m by n matrix. +* +* Arguments +* ========== +* +* TRANA - CHARACTER*1. +* On entry, TRANA specifies the form of op( A ) to be used in +* the matrix multiplication as follows: +* +* TRANA = 'N' or 'n', op( A ) = A. +* +* TRANA = 'T' or 't', op( A ) = A'. +* +* TRANA = 'C' or 'c', op( A ) = A'. +* +* Unchanged on exit. +* +* TRANB - CHARACTER*1. +* On entry, TRANB specifies the form of op( B ) to be used in +* the matrix multiplication as follows: +* +* TRANB = 'N' or 'n', op( B ) = B. +* +* TRANB = 'T' or 't', op( B ) = B'. +* +* TRANB = 'C' or 'c', op( B ) = B'. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix +* op( A ) and of the matrix C. M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix +* op( B ) and the number of columns of the matrix C. N must be +* at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry, K specifies the number of columns of the matrix +* op( A ) and the number of rows of the matrix op( B ). K must +* be at least zero. +* Unchanged on exit. +* +* ALPHA - REAL . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - REAL array of DIMENSION ( LDA, ka ), where ka is +* k when TRANA = 'N' or 'n', and is m otherwise. +* Before entry with TRANA = 'N' or 'n', the leading m by k +* part of the array A must contain the matrix A, otherwise +* the leading k by m part of the array A must contain the +* matrix A. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When TRANA = 'N' or 'n' then +* LDA must be at least max( 1, m ), otherwise LDA must be at +* least max( 1, k ). +* Unchanged on exit. +* +* B - REAL array of DIMENSION ( LDB, kb ), where kb is +* n when TRANB = 'N' or 'n', and is k otherwise. +* Before entry with TRANB = 'N' or 'n', the leading k by n +* part of the array B must contain the matrix B, otherwise +* the leading n by k part of the array B must contain the +* matrix B. +* Unchanged on exit. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. 
When TRANB = 'N' or 'n' then +* LDB must be at least max( 1, k ), otherwise LDB must be at +* least max( 1, n ). +* Unchanged on exit. +* +* BETA - REAL . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then C need not be set on input. +* Unchanged on exit. +* +* C - REAL array of DIMENSION ( LDC, n ). +* Before entry, the leading m by n part of the array C must +* contain the matrix C, except when beta is zero, in which +* case C need not be set on entry. +* On exit, the array C is overwritten by the m by n matrix +* ( alpha*op( A )*op( B ) + beta*C ). +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Local Scalars .. + REAL TEMP + INTEGER I,INFO,J,L,NCOLA,NROWA,NROWB + LOGICAL NOTA,NOTB +* .. +* .. Parameters .. + REAL ONE,ZERO + PARAMETER (ONE=1.0E+0,ZERO=0.0E+0) +* .. +* +* Set NOTA and NOTB as true if A and B respectively are not +* transposed and set NROWA, NCOLA and NROWB as the number of rows +* and columns of A and the number of rows of B respectively. +* + NOTA = LSAME(TRANA,'N') + NOTB = LSAME(TRANB,'N') + IF (NOTA) THEN + NROWA = M + NCOLA = K + ELSE + NROWA = K + NCOLA = M + END IF + IF (NOTB) THEN + NROWB = K + ELSE + NROWB = N + END IF +* +* Test the input parameters. +* + INFO = 0 + IF ((.NOT.NOTA) .AND. (.NOT.LSAME(TRANA,'C')) .AND. + + (.NOT.LSAME(TRANA,'T'))) THEN + INFO = 1 + ELSE IF ((.NOT.NOTB) .AND. (.NOT.LSAME(TRANB,'C')) .AND. + + (.NOT.LSAME(TRANB,'T'))) THEN + INFO = 2 + ELSE IF (M.LT.0) THEN + INFO = 3 + ELSE IF (N.LT.0) THEN + INFO = 4 + ELSE IF (K.LT.0) THEN + INFO = 5 + ELSE IF (LDA.LT.MAX(1,NROWA)) THEN + INFO = 8 + ELSE IF (LDB.LT.MAX(1,NROWB)) THEN + INFO = 10 + ELSE IF (LDC.LT.MAX(1,M)) THEN + INFO = 13 + END IF + IF (INFO.NE.0) THEN + CALL XERBLA('SGEMM ',INFO) + RETURN + END IF +* +* Quick return if possible. +* + IF ((M.EQ.0) .OR. (N.EQ.0) .OR. + + (((ALPHA.EQ.ZERO).OR. (K.EQ.0)).AND. (BETA.EQ.ONE))) RETURN +* +* And if alpha.eq.zero. +* + IF (ALPHA.EQ.ZERO) THEN + IF (BETA.EQ.ZERO) THEN + DO 20 J = 1,N + DO 10 I = 1,M + C(I,J) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40 J = 1,N + DO 30 I = 1,M + C(I,J) = BETA*C(I,J) + 30 CONTINUE + 40 CONTINUE + END IF + RETURN + END IF +* +* Start the operations. +* + IF (NOTB) THEN + IF (NOTA) THEN +* +* Form C := alpha*A*B + beta*C. 
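A minimal caller of the reference SGEMMF, computing C := alpha*A*B + beta*C for 2 by 2 operands (program name and data are made up; for this input C should come out as 19 22 over 43 50):

      PROGRAM TGEMM
      REAL A(2,2), B(2,2), C(2,2)
      DATA A / 1.0, 3.0, 2.0, 4.0 /
      DATA B / 5.0, 7.0, 6.0, 8.0 /
      DATA C / 4*0.0 /
c     C := 1.0*A*B + 0.0*C, both operands untransposed
      CALL SGEMMF( 'N', 'N', 2, 2, 2, 1.0, A, 2, B, 2, 0.0, C, 2 )
      WRITE(*,*) C(1,1), C(1,2)
      WRITE(*,*) C(2,1), C(2,2)
      END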
+* + DO 90 J = 1,N + IF (BETA.EQ.ZERO) THEN + DO 50 I = 1,M + C(I,J) = ZERO + 50 CONTINUE + ELSE IF (BETA.NE.ONE) THEN + DO 60 I = 1,M + C(I,J) = BETA*C(I,J) + 60 CONTINUE + END IF + DO 80 L = 1,K + IF (B(L,J).NE.ZERO) THEN + TEMP = ALPHA*B(L,J) + DO 70 I = 1,M + C(I,J) = C(I,J) + TEMP*A(I,L) + 70 CONTINUE + END IF + 80 CONTINUE + 90 CONTINUE + ELSE +* +* Form C := alpha*A'*B + beta*C +* + DO 120 J = 1,N + DO 110 I = 1,M + TEMP = ZERO + DO 100 L = 1,K + TEMP = TEMP + A(L,I)*B(L,J) + 100 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 110 CONTINUE + 120 CONTINUE + END IF + ELSE + IF (NOTA) THEN +* +* Form C := alpha*A*B' + beta*C +* + DO 170 J = 1,N + IF (BETA.EQ.ZERO) THEN + DO 130 I = 1,M + C(I,J) = ZERO + 130 CONTINUE + ELSE IF (BETA.NE.ONE) THEN + DO 140 I = 1,M + C(I,J) = BETA*C(I,J) + 140 CONTINUE + END IF + DO 160 L = 1,K + IF (B(J,L).NE.ZERO) THEN + TEMP = ALPHA*B(J,L) + DO 150 I = 1,M + C(I,J) = C(I,J) + TEMP*A(I,L) + 150 CONTINUE + END IF + 160 CONTINUE + 170 CONTINUE + ELSE +* +* Form C := alpha*A'*B' + beta*C +* + DO 200 J = 1,N + DO 190 I = 1,M + TEMP = ZERO + DO 180 L = 1,K + TEMP = TEMP + A(L,I)*B(J,L) + 180 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 190 CONTINUE + 200 CONTINUE + END IF + END IF +* + RETURN +* +* End of SGEMM . +* + END diff --git a/reference/sgemvf.f b/reference/sgemvf.f new file mode 100644 index 0000000000..351da45fff --- /dev/null +++ b/reference/sgemvf.f @@ -0,0 +1,257 @@ + SUBROUTINE SGEMVF ( TRANS, M, N, ALPHA, A, LDA, X, INCX, + $ BETA, Y, INCY ) +* .. Scalar Arguments .. + REAL ALPHA, BETA + INTEGER INCX, INCY, LDA, M, N + CHARACTER*1 TRANS +* .. Array Arguments .. + REAL A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* SGEMV performs one of the matrix-vector operations +* +* y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y, +* +* where alpha and beta are scalars, x and y are vectors and A is an +* m by n matrix. +* +* Parameters +* ========== +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' y := alpha*A*x + beta*y. +* +* TRANS = 'T' or 't' y := alpha*A'*x + beta*y. +* +* TRANS = 'C' or 'c' y := alpha*A'*x + beta*y. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix A. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - REAL . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - REAL array of DIMENSION ( LDA, n ). +* Before entry, the leading m by n part of the array A must +* contain the matrix of coefficients. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, m ). +* Unchanged on exit. +* +* X - REAL array of DIMENSION at least +* ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' +* and at least +* ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. +* Before entry, the incremented array X must contain the +* vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA - REAL . +* On entry, BETA specifies the scalar beta. 
When BETA is +* supplied as zero then Y need not be set on input. +* Unchanged on exit. +* +* Y - REAL array of DIMENSION at least +* ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' +* and at least +* ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. +* Before entry with BETA non-zero, the incremented array Y +* must contain the vector y. On exit, Y is overwritten by the +* updated vector y. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + REAL ONE , ZERO + PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) +* .. Local Scalars .. + REAL TEMP + INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY, LENX, LENY +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. + $ .NOT.LSAME( TRANS, 'C' ) )THEN + INFO = 1 + ELSE IF( M.LT.0 )THEN + INFO = 2 + ELSE IF( N.LT.0 )THEN + INFO = 3 + ELSE IF( LDA.LT.MAX( 1, M ) )THEN + INFO = 6 + ELSE IF( INCX.EQ.0 )THEN + INFO = 8 + ELSE IF( INCY.EQ.0 )THEN + INFO = 11 + END IF + +* +* Quick return if possible. +* + IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. + $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* Set LENX and LENY, the lengths of the vectors x and y, and set +* up the start points in X and Y. +* + IF( LSAME( TRANS, 'N' ) )THEN + LENX = N + LENY = M + ELSE + LENX = M + LENY = N + END IF + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( LENX - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( LENY - 1 )*INCY + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through A. +* +* First form y := beta*y. +* + IF( BETA.NE.ONE )THEN + IF( INCY.EQ.1 )THEN + IF( BETA.EQ.ZERO )THEN + DO 10, I = 1, LENY + Y( I ) = ZERO + 10 CONTINUE + ELSE + DO 20, I = 1, LENY + Y( I ) = BETA*Y( I ) + 20 CONTINUE + END IF + ELSE + IY = KY + IF( BETA.EQ.ZERO )THEN + DO 30, I = 1, LENY + Y( IY ) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40, I = 1, LENY + Y( IY ) = BETA*Y( IY ) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF( ALPHA.EQ.ZERO ) + $ RETURN + IF( LSAME( TRANS, 'N' ) )THEN +* +* Form y := alpha*A*x + y. +* + JX = KX + IF( INCY.EQ.1 )THEN + DO 60, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*X( JX ) + DO 50, I = 1, M + Y( I ) = Y( I ) + TEMP*A( I, J ) + 50 CONTINUE + END IF + JX = JX + INCX + 60 CONTINUE + ELSE + DO 80, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*X( JX ) + IY = KY + DO 70, I = 1, M + Y( IY ) = Y( IY ) + TEMP*A( I, J ) + IY = IY + INCY + 70 CONTINUE + END IF + JX = JX + INCX + 80 CONTINUE + END IF + ELSE +* +* Form y := alpha*A'*x + y. +* + JY = KY + IF( INCX.EQ.1 )THEN + DO 100, J = 1, N + TEMP = ZERO + DO 90, I = 1, M + TEMP = TEMP + A( I, J )*X( I ) + 90 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP + JY = JY + INCY + 100 CONTINUE + ELSE + DO 120, J = 1, N + TEMP = ZERO + IX = KX + DO 110, I = 1, M + TEMP = TEMP + A( I, J )*X( IX ) + IX = IX + INCX + 110 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP + JY = JY + INCY + 120 CONTINUE + END IF + END IF +* + RETURN +* +* End of SGEMV . 
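One quirk visible in this reference copy of SGEMV: the checks above set INFO for bad arguments but, unlike SGBMVF earlier in this file set, never pass it to XERBLA, so an invalid call simply falls through to the quick-return test. A sketch of a well-formed call (data made up; expected output 6.0 and 15.0):

      PROGRAM TGEMV
      REAL A(2,3), X(3), Y(2)
      DATA A / 1.0, 4.0, 2.0, 5.0, 3.0, 6.0 /
      DATA X / 3*1.0 /
      DATA Y / 2*0.0 /
c     y := 1.0*A*x + 0.0*y for the 2 by 3 matrix A
      CALL SGEMVF( 'N', 2, 3, 1.0, A, 2, X, 1, 0.0, Y, 1 )
      WRITE(*,*) Y(1), Y(2)
      END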
+* + END diff --git a/reference/sgerf.f b/reference/sgerf.f new file mode 100644 index 0000000000..f84c933285 --- /dev/null +++ b/reference/sgerf.f @@ -0,0 +1,157 @@ + SUBROUTINE SGERF ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA ) +* .. Scalar Arguments .. + REAL ALPHA + INTEGER INCX, INCY, LDA, M, N +* .. Array Arguments .. + REAL A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* SGER performs the rank 1 operation +* +* A := alpha*x*y' + A, +* +* where alpha is a scalar, x is an m element vector, y is an n element +* vector and A is an m by n matrix. +* +* Parameters +* ========== +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix A. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - REAL . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X - REAL array of dimension at least +* ( 1 + ( m - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the m +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* Y - REAL array of dimension at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. +* Unchanged on exit. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* A - REAL array of DIMENSION ( LDA, n ). +* Before entry, the leading m by n part of the array A must +* contain the matrix of coefficients. On exit, A is +* overwritten by the updated matrix. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + REAL ZERO + PARAMETER ( ZERO = 0.0E+0 ) +* .. Local Scalars .. + REAL TEMP + INTEGER I, INFO, IX, J, JY, KX +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( M.LT.0 )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 5 + ELSE IF( INCY.EQ.0 )THEN + INFO = 7 + ELSE IF( LDA.LT.MAX( 1, M ) )THEN + INFO = 9 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'SGER ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) + $ RETURN +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through A. 
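SGERF applies the rank-1 update A := alpha*x*y', i.e. A(i,j) gains alpha*x(i)*y(j). An illustrative sketch (data made up; starting from A = 0 the result is rows 3 4 and 6 8):

      PROGRAM TGER
      REAL A(2,2), X(2), Y(2)
      DATA A / 4*0.0 /
      DATA X / 1.0, 2.0 /
      DATA Y / 3.0, 4.0 /
c     A := 1.0 * x * y' + A, so A(i,j) = x(i)*y(j) here
      CALL SGERF( 2, 2, 1.0, X, 1, Y, 1, A, 2 )
      WRITE(*,*) A(1,1), A(1,2)
      WRITE(*,*) A(2,1), A(2,2)
      END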
+* + IF( INCY.GT.0 )THEN + JY = 1 + ELSE + JY = 1 - ( N - 1 )*INCY + END IF + IF( INCX.EQ.1 )THEN + DO 20, J = 1, N + IF( Y( JY ).NE.ZERO )THEN + TEMP = ALPHA*Y( JY ) + DO 10, I = 1, M + A( I, J ) = A( I, J ) + X( I )*TEMP + 10 CONTINUE + END IF + JY = JY + INCY + 20 CONTINUE + ELSE + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( M - 1 )*INCX + END IF + DO 40, J = 1, N + IF( Y( JY ).NE.ZERO )THEN + TEMP = ALPHA*Y( JY ) + IX = KX + DO 30, I = 1, M + A( I, J ) = A( I, J ) + X( IX )*TEMP + IX = IX + INCX + 30 CONTINUE + END IF + JY = JY + INCY + 40 CONTINUE + END IF +* + RETURN +* +* End of SGER . +* + END diff --git a/reference/sgesvf.f b/reference/sgesvf.f new file mode 100644 index 0000000000..8d313abde4 --- /dev/null +++ b/reference/sgesvf.f @@ -0,0 +1,107 @@ + SUBROUTINE SGESVF( N, NRHS, A, LDA, IPIV, B, LDB, INFO ) +* +* -- LAPACK driver routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* November 2006 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LDB, N, NRHS +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + REAL A( LDA, * ), B( LDB, * ) +* .. +* +* Purpose +* ======= +* +* SGESV computes the solution to a real system of linear equations +* A * X = B, +* where A is an N-by-N matrix and X and B are N-by-NRHS matrices. +* +* The LU decomposition with partial pivoting and row interchanges is +* used to factor A as +* A = P * L * U, +* where P is a permutation matrix, L is unit lower triangular, and U is +* upper triangular. The factored form of A is then used to solve the +* system of equations A * X = B. +* +* Arguments +* ========= +* +* N (input) INTEGER +* The number of linear equations, i.e., the order of the +* matrix A. N >= 0. +* +* NRHS (input) INTEGER +* The number of right hand sides, i.e., the number of columns +* of the matrix B. NRHS >= 0. +* +* A (input/output) REAL array, dimension (LDA,N) +* On entry, the N-by-N coefficient matrix A. +* On exit, the factors L and U from the factorization +* A = P*L*U; the unit diagonal elements of L are not stored. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* IPIV (output) INTEGER array, dimension (N) +* The pivot indices that define the permutation matrix P; +* row i of the matrix was interchanged with row IPIV(i). +* +* B (input/output) REAL array, dimension (LDB,NRHS) +* On entry, the N-by-NRHS matrix of right hand side matrix B. +* On exit, if INFO = 0, the N-by-NRHS solution matrix X. +* +* LDB (input) INTEGER +* The leading dimension of the array B. LDB >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, U(i,i) is exactly zero. The factorization +* has been completed, but the factor U is exactly +* singular, so the solution could not be computed. +* +* ===================================================================== +* +* .. External Subroutines .. + EXTERNAL SGETRF, SGETRS, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( N.LT.0 ) THEN + INFO = -1 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + ELSE IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -7 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SGESV ', -INFO ) + RETURN + END IF +* +* Compute the LU factorization of A. +* + CALL SGETRF( N, N, A, LDA, IPIV, INFO ) + IF( INFO.EQ.0 ) THEN +* +* Solve the system A*X = B, overwriting B with X. 
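SGESVF is a thin driver: as the code above shows, it factors with SGETRF and back-substitutes with SGETRS, so the sketch below assumes it is linked against a library that supplies those two routines. An illustrative 2 by 2 solve, with data chosen so the solution is x = (2, 3):

      PROGRAM TGESV
      REAL A(2,2), B(2,1)
      INTEGER IPIV(2), INFO
      DATA A / 3.0, 1.0, 1.0, 2.0 /
      DATA B / 9.0, 8.0 /
c     solve [ 3 1 ; 1 2 ] * x = [ 9 ; 8 ]; x should be (2, 3)
      CALL SGESVF( 2, 1, A, 2, IPIV, B, 2, INFO )
      WRITE(*,*) 'info =', INFO
      WRITE(*,*) 'x =', B(1,1), B(2,1)
      END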
+* + CALL SGETRS( 'No transpose', N, NRHS, A, LDA, IPIV, B, LDB, + $ INFO ) + END IF + RETURN +* +* End of SGESV +* + END diff --git a/reference/sgetf2f.f b/reference/sgetf2f.f new file mode 100644 index 0000000000..15861b131e --- /dev/null +++ b/reference/sgetf2f.f @@ -0,0 +1,135 @@ + SUBROUTINE SGETF2F( M, N, A, LDA, IPIV, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* June 30, 1992 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, M, N +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + REAL A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* SGETF2 computes an LU factorization of a general m-by-n matrix A +* using partial pivoting with row interchanges. +* +* The factorization has the form +* A = P * L * U +* where P is a permutation matrix, L is lower triangular with unit +* diagonal elements (lower trapezoidal if m > n), and U is upper +* triangular (upper trapezoidal if m < n). +* +* This is the right-looking Level 2 BLAS version of the algorithm. +* +* Arguments +* ========= +* +* M (input) INTEGER +* The number of rows of the matrix A. M >= 0. +* +* N (input) INTEGER +* The number of columns of the matrix A. N >= 0. +* +* A (input/output) REAL array, dimension (LDA,N) +* On entry, the m by n matrix to be factored. +* On exit, the factors L and U from the factorization +* A = P*L*U; the unit diagonal elements of L are not stored. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,M). +* +* IPIV (output) INTEGER array, dimension (min(M,N)) +* The pivot indices; for 1 <= i <= min(M,N), row i of the +* matrix was interchanged with row IPIV(i). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -k, the k-th argument had an illegal value +* > 0: if INFO = k, U(k,k) is exactly zero. The factorization +* has been completed, but the factor U is exactly +* singular, and division by zero will occur if it is used +* to solve a system of equations. +* +* ===================================================================== +* +* .. Parameters .. + REAL ONE, ZERO + PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) +* .. +* .. Local Scalars .. + INTEGER J, JP +* .. +* .. External Functions .. + INTEGER ISAMAX + EXTERNAL ISAMAX +* .. +* .. External Subroutines .. + EXTERNAL SGER, SSCAL, SSWAP, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SGETF2', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* + DO 10 J = 1, MIN( M, N ) +* +* Find pivot and test for singularity. +* + JP = J - 1 + ISAMAX( M-J+1, A( J, J ), 1 ) + IPIV( J ) = JP + IF( A( JP, J ).NE.ZERO ) THEN +* +* Apply the interchange to columns 1:N. +* + IF( JP.NE.J ) + $ CALL SSWAP( N, A( J, 1 ), LDA, A( JP, 1 ), LDA ) +* +* Compute elements J+1:M of J-th column. +* + IF( J.LT.M ) + $ CALL SSCAL( M-J, ONE / A( J, J ), A( J+1, J ), 1 ) +* + ELSE IF( INFO.EQ.0 ) THEN +* + INFO = J + END IF +* + IF( J.LT.MIN( M, N ) ) THEN +* +* Update trailing submatrix. 
+* + CALL SGER( M-J, N-J, -ONE, A( J+1, J ), 1, A( J, J+1 ), LDA, + $ A( J+1, J+1 ), LDA ) + END IF + 10 CONTINUE + RETURN +* +* End of SGETF2 +* + END diff --git a/reference/sgetrff.f b/reference/sgetrff.f new file mode 100644 index 0000000000..139e7dee77 --- /dev/null +++ b/reference/sgetrff.f @@ -0,0 +1,156 @@ + SUBROUTINE SGETRFF( M, N, A, LDA, IPIV, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* March 31, 1993 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, M, N +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + REAL A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* SGETRF computes an LU factorization of a general M-by-N matrix A +* using partial pivoting with row interchanges. +* +* The factorization has the form +* A = P * L * U +* where P is a permutation matrix, L is lower triangular with unit +* diagonal elements (lower trapezoidal if m > n), and U is upper +* triangular (upper trapezoidal if m < n). +* +* This is the right-looking Level 3 BLAS version of the algorithm. +* +* Arguments +* ========= +* +* M (input) INTEGER +* The number of rows of the matrix A. M >= 0. +* +* N (input) INTEGER +* The number of columns of the matrix A. N >= 0. +* +* A (input/output) REAL array, dimension (LDA,N) +* On entry, the M-by-N matrix to be factored. +* On exit, the factors L and U from the factorization +* A = P*L*U; the unit diagonal elements of L are not stored. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,M). +* +* IPIV (output) INTEGER array, dimension (min(M,N)) +* The pivot indices; for 1 <= i <= min(M,N), row i of the +* matrix was interchanged with row IPIV(i). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, U(i,i) is exactly zero. The factorization +* has been completed, but the factor U is exactly +* singular, and division by zero will occur if it is used +* to solve a system of equations. +* +* ===================================================================== +* +* .. Parameters .. + REAL ONE + PARAMETER ( ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + INTEGER I, IINFO, J, JB, NB +* .. +* .. External Subroutines .. + EXTERNAL SGEMM, SGETF2, SLASWP, STRSM, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SGETRF', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* +* Determine the block size for this environment. +* + NB = 64 + IF( NB.LE.1 .OR. NB.GE.MIN( M, N ) ) THEN +* +* Use unblocked code. +* + CALL SGETF2( M, N, A, LDA, IPIV, INFO ) + ELSE +* +* Use blocked code. +* + DO 20 J = 1, MIN( M, N ), NB + JB = MIN( MIN( M, N )-J+1, NB ) +* +* Factor diagonal and subdiagonal blocks and test for exact +* singularity. +* + CALL SGETF2( M-J+1, JB, A( J, J ), LDA, IPIV( J ), IINFO ) +* +* Adjust INFO and the pivot indices. +* + IF( INFO.EQ.0 .AND. IINFO.GT.0 ) + $ INFO = IINFO + J - 1 + DO 10 I = J, MIN( M, J+JB-1 ) + IPIV( I ) = J - 1 + IPIV( I ) + 10 CONTINUE +* +* Apply interchanges to columns 1:J-1. +* + CALL SLASWP( J-1, A, LDA, J, J+JB-1, IPIV, 1 ) +* + IF( J+JB.LE.N ) THEN +* +* Apply interchanges to columns J+JB:N. 
+* + CALL SLASWP( N-J-JB+1, A( 1, J+JB ), LDA, J, J+JB-1, + $ IPIV, 1 ) +* +* Compute block row of U. +* + CALL STRSM( 'Left', 'Lower', 'No transpose', 'Unit', JB, + $ N-J-JB+1, ONE, A( J, J ), LDA, A( J, J+JB ), + $ LDA ) + IF( J+JB.LE.M ) THEN +* +* Update trailing submatrix. +* + CALL SGEMM( 'No transpose', 'No transpose', M-J-JB+1, + $ N-J-JB+1, JB, -ONE, A( J+JB, J ), LDA, + $ A( J, J+JB ), LDA, ONE, A( J+JB, J+JB ), + $ LDA ) + END IF + END IF + 20 CONTINUE + END IF + RETURN +* +* End of SGETRF +* + END diff --git a/reference/sgetrsf.f b/reference/sgetrsf.f new file mode 100644 index 0000000000..f00921868d --- /dev/null +++ b/reference/sgetrsf.f @@ -0,0 +1,150 @@ + SUBROUTINE SGETRSF( TRANS, N, NRHS, A, LDA, IPIV, B, LDB, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* March 31, 1993 +* +* .. Scalar Arguments .. + CHARACTER TRANS + INTEGER INFO, LDA, LDB, N, NRHS +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + REAL A( LDA, * ), B( LDB, * ) +* .. +* +* Purpose +* ======= +* +* SGETRS solves a system of linear equations +* A * X = B or A' * X = B +* with a general N-by-N matrix A using the LU factorization computed +* by SGETRF. +* +* Arguments +* ========= +* +* TRANS (input) CHARACTER*1 +* Specifies the form of the system of equations: +* = 'N': A * X = B (No transpose) +* = 'T': A'* X = B (Transpose) +* = 'C': A'* X = B (Conjugate transpose = Transpose) +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* NRHS (input) INTEGER +* The number of right hand sides, i.e., the number of columns +* of the matrix B. NRHS >= 0. +* +* A (input) REAL array, dimension (LDA,N) +* The factors L and U from the factorization A = P*L*U +* as computed by SGETRF. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* IPIV (input) INTEGER array, dimension (N) +* The pivot indices from SGETRF; for 1<=i<=N, row i of the +* matrix was interchanged with row IPIV(i). +* +* B (input/output) REAL array, dimension (LDB,NRHS) +* On entry, the right hand side matrix B. +* On exit, the solution matrix X. +* +* LDB (input) INTEGER +* The leading dimension of the array B. LDB >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* +* ===================================================================== +* +* .. Parameters .. + REAL ONE + PARAMETER ( ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL NOTRAN +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL SLASWP, STRSM, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + NOTRAN = LSAME( TRANS, 'N' ) + IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. + $ LSAME( TRANS, 'C' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -3 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -5 + ELSE IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -8 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SGETRS', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 .OR. NRHS.EQ.0 ) + $ RETURN +* + IF( NOTRAN ) THEN +* +* Solve A * X = B. +* +* Apply row interchanges to the right hand sides. +* + CALL SLASWP( NRHS, B, LDB, 1, N, IPIV, 1 ) +* +* Solve L*X = B, overwriting B with X. 
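Factor once, then solve as many right-hand sides as needed: SGETRFF produces the P*L*U factors and IPIV in place, and SGETRSF reuses them. The sketch below is illustrative only and assumes the routines they call internally (SGETF2, SLASWP, STRSM, SGEMM) are available at link time; the data are chosen so the solution is (1, 2, 3):

      PROGRAM TGETRS
      REAL A(3,3), B(3,1)
      INTEGER IPIV(3), INFO
      DATA A / 2.0, 1.0, 0.0, 1.0, 2.0, 1.0, 0.0, 1.0, 2.0 /
      DATA B / 4.0, 8.0, 8.0 /
c     factor A = P*L*U once, then reuse the factors for the solve
      CALL SGETRFF( 3, 3, A, 3, IPIV, INFO )
      IF( INFO.NE.0 ) WRITE(*,*) 'singular factor, info =', INFO
      CALL SGETRSF( 'N', 3, 1, A, 3, IPIV, B, 3, INFO )
      WRITE(*,*) B(1,1), B(2,1), B(3,1)
      END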
+* + CALL STRSM( 'Left', 'Lower', 'No transpose', 'Unit', N, NRHS, + $ ONE, A, LDA, B, LDB ) +* +* Solve U*X = B, overwriting B with X. +* + CALL STRSM( 'Left', 'Upper', 'No transpose', 'Non-unit', N, + $ NRHS, ONE, A, LDA, B, LDB ) + ELSE +* +* Solve A' * X = B. +* +* Solve U'*X = B, overwriting B with X. +* + CALL STRSM( 'Left', 'Upper', 'Transpose', 'Non-unit', N, NRHS, + $ ONE, A, LDA, B, LDB ) +* +* Solve L'*X = B, overwriting B with X. +* + CALL STRSM( 'Left', 'Lower', 'Transpose', 'Unit', N, NRHS, ONE, + $ A, LDA, B, LDB ) +* +* Apply row interchanges to the solution vectors. +* + CALL SLASWP( NRHS, B, LDB, 1, N, IPIV, -1 ) + END IF +* + RETURN +* +* End of SGETRS +* + END diff --git a/reference/slaswpf.f b/reference/slaswpf.f new file mode 100644 index 0000000000..ab300e2982 --- /dev/null +++ b/reference/slaswpf.f @@ -0,0 +1,120 @@ + SUBROUTINE SLASWPF( N, A, LDA, K1, K2, IPIV, INCX ) +* +* -- LAPACK auxiliary routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* June 30, 1999 +* +* .. Scalar Arguments .. + INTEGER INCX, K1, K2, LDA, N +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + REAL A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* SLASWP performs a series of row interchanges on the matrix A. +* One row interchange is initiated for each of rows K1 through K2 of A. +* +* Arguments +* ========= +* +* N (input) INTEGER +* The number of columns of the matrix A. +* +* A (input/output) REAL array, dimension (LDA,N) +* On entry, the matrix of column dimension N to which the row +* interchanges will be applied. +* On exit, the permuted matrix. +* +* LDA (input) INTEGER +* The leading dimension of the array A. +* +* K1 (input) INTEGER +* The first element of IPIV for which a row interchange will +* be done. +* +* K2 (input) INTEGER +* The last element of IPIV for which a row interchange will +* be done. +* +* IPIV (input) INTEGER array, dimension (M*abs(INCX)) +* The vector of pivot indices. Only the elements in positions +* K1 through K2 of IPIV are accessed. +* IPIV(K) = L implies rows K and L are to be interchanged. +* +* INCX (input) INTEGER +* The increment between successive values of IPIV. If IPIV +* is negative, the pivots are applied in reverse order. +* +* Further Details +* =============== +* +* Modified by +* R. C. Whaley, Computer Science Dept., Univ. of Tenn., Knoxville, USA +* +* ===================================================================== +* +* .. Local Scalars .. + INTEGER I, I1, I2, INC, IP, IX, IX0, J, K, N32 + REAL TEMP +* .. +* .. Executable Statements .. +* +* Interchange row I with row IPIV(I) for each of rows K1 through K2. 
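The effect of SLASWPF is a sequence of pairwise swaps applied in order, not a one-shot permutation: for I = K1,...,K2, row I is exchanged with row IPIV(I). A small illustrative sketch (made-up data; after the call the rows of A read 3 6, 2 5, 1 4):

      PROGRAM TLASWP
      REAL A(3,2)
      INTEGER IPIV(3)
      DATA A / 1.0, 2.0, 3.0, 4.0, 5.0, 6.0 /
      DATA IPIV / 3, 2, 3 /
c     IPIV(1)=3 swaps rows 1 and 3; IPIV(2)=2, IPIV(3)=3 are no-ops
      CALL SLASWPF( 2, A, 3, 1, 3, IPIV, 1 )
      WRITE(*,*) A(1,1), A(1,2)
      WRITE(*,*) A(2,1), A(2,2)
      WRITE(*,*) A(3,1), A(3,2)
      END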
+* + IF( INCX.GT.0 ) THEN + IX0 = K1 + I1 = K1 + I2 = K2 + INC = 1 + ELSE IF( INCX.LT.0 ) THEN + IX0 = 1 + ( 1-K2 )*INCX + I1 = K2 + I2 = K1 + INC = -1 + ELSE + RETURN + END IF +* + N32 = ( N / 32 )*32 + IF( N32.NE.0 ) THEN + DO 30 J = 1, N32, 32 + IX = IX0 + DO 20 I = I1, I2, INC + IP = IPIV( IX ) + IF( IP.NE.I ) THEN + DO 10 K = J, J + 31 + TEMP = A( I, K ) + A( I, K ) = A( IP, K ) + A( IP, K ) = TEMP + 10 CONTINUE + END IF + IX = IX + INCX + 20 CONTINUE + 30 CONTINUE + END IF + IF( N32.NE.N ) THEN + N32 = N32 + 1 + IX = IX0 + DO 50 I = I1, I2, INC + IP = IPIV( IX ) + IF( IP.NE.I ) THEN + DO 40 K = N32, N + TEMP = A( I, K ) + A( I, K ) = A( IP, K ) + A( IP, K ) = TEMP + 40 CONTINUE + END IF + IX = IX + INCX + 50 CONTINUE + END IF +* + RETURN +* +* End of SLASWP +* + END diff --git a/reference/slauu2f.f b/reference/slauu2f.f new file mode 100644 index 0000000000..5d48e1202d --- /dev/null +++ b/reference/slauu2f.f @@ -0,0 +1,135 @@ + SUBROUTINE SLAUU2F( UPLO, N, A, LDA, INFO ) +* +* -- LAPACK auxiliary routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* November 2006 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + REAL A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* SLAUU2 computes the product U * U' or L' * L, where the triangular +* factor U or L is stored in the upper or lower triangular part of +* the array A. +* +* If UPLO = 'U' or 'u' then the upper triangle of the result is stored, +* overwriting the factor U in A. +* If UPLO = 'L' or 'l' then the lower triangle of the result is stored, +* overwriting the factor L in A. +* +* This is the unblocked form of the algorithm, calling Level 2 BLAS. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* Specifies whether the triangular factor stored in the array A +* is upper or lower triangular: +* = 'U': Upper triangular +* = 'L': Lower triangular +* +* N (input) INTEGER +* The order of the triangular factor U or L. N >= 0. +* +* A (input/output) REAL array, dimension (LDA,N) +* On entry, the triangular factor U or L. +* On exit, if UPLO = 'U', the upper triangle of A is +* overwritten with the upper triangle of the product U * U'; +* if UPLO = 'L', the lower triangle of A is overwritten with +* the lower triangle of the product L' * L. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -k, the k-th argument had an illegal value +* +* ===================================================================== +* +* .. Parameters .. + REAL ONE + PARAMETER ( ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I + REAL AII +* .. +* .. External Functions .. + LOGICAL LSAME + REAL SDOT + EXTERNAL LSAME, SDOT +* .. +* .. External Subroutines .. + EXTERNAL SGEMV, SSCAL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SLAUU2', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* + IF( UPPER ) THEN +* +* Compute the product U * U'. 
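SLAUU2F overwrites the stored triangle of A with U*U' (UPLO = 'U') or L'*L (UPLO = 'L'), leaving the opposite strict triangle alone; it relies on SDOT, SGEMV and SSCAL, so the sketch below assumes a BLAS is available at link time. With U = [ 1 2 ; 0 3 ] the upper triangle becomes 5, 6, 9 (data and names are illustrative only):

      PROGRAM TLAUU2
      REAL A(2,2)
      INTEGER INFO
      DATA A / 1.0, 0.0, 2.0, 3.0 /
c     upper triangle of A holds U; on exit it holds U*U'
      CALL SLAUU2F( 'U', 2, A, 2, INFO )
      WRITE(*,*) A(1,1), A(1,2), A(2,2)
      END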
+* + DO 10 I = 1, N + AII = A( I, I ) + IF( I.LT.N ) THEN + A( I, I ) = SDOT( N-I+1, A( I, I ), LDA, A( I, I ), LDA ) + CALL SGEMV( 'No transpose', I-1, N-I, ONE, A( 1, I+1 ), + $ LDA, A( I, I+1 ), LDA, AII, A( 1, I ), 1 ) + ELSE + CALL SSCAL( I, AII, A( 1, I ), 1 ) + END IF + 10 CONTINUE +* + ELSE +* +* Compute the product L' * L. +* + DO 20 I = 1, N + AII = A( I, I ) + IF( I.LT.N ) THEN + A( I, I ) = SDOT( N-I+1, A( I, I ), 1, A( I, I ), 1 ) + CALL SGEMV( 'Transpose', N-I, I-1, ONE, A( I+1, 1 ), LDA, + $ A( I+1, I ), 1, AII, A( I, 1 ), LDA ) + ELSE + CALL SSCAL( I, AII, A( I, 1 ), LDA ) + END IF + 20 CONTINUE + END IF +* + RETURN +* +* End of SLAUU2 +* + END diff --git a/reference/slauumf.f b/reference/slauumf.f new file mode 100644 index 0000000000..a4b25b985d --- /dev/null +++ b/reference/slauumf.f @@ -0,0 +1,156 @@ + SUBROUTINE SLAUUMF( UPLO, N, A, LDA, INFO ) +* +* -- LAPACK auxiliary routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* February 29, 1992 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + REAL A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* SLAUUM computes the product U * U' or L' * L, where the triangular +* factor U or L is stored in the upper or lower triangular part of +* the array A. +* +* If UPLO = 'U' or 'u' then the upper triangle of the result is stored, +* overwriting the factor U in A. +* If UPLO = 'L' or 'l' then the lower triangle of the result is stored, +* overwriting the factor L in A. +* +* This is the blocked form of the algorithm, calling Level 3 BLAS. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* Specifies whether the triangular factor stored in the array A +* is upper or lower triangular: +* = 'U': Upper triangular +* = 'L': Lower triangular +* +* N (input) INTEGER +* The order of the triangular factor U or L. N >= 0. +* +* A (input/output) REAL array, dimension (LDA,N) +* On entry, the triangular factor U or L. +* On exit, if UPLO = 'U', the upper triangle of A is +* overwritten with the upper triangle of the product U * U'; +* if UPLO = 'L', the lower triangle of A is overwritten with +* the lower triangle of the product L' * L. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -k, the k-th argument had an illegal value +* +* ===================================================================== +* +* .. Parameters .. + REAL ONE + PARAMETER ( ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, IB, NB +* .. +* .. External Functions .. + LOGICAL LSAME + INTEGER ILAENV + EXTERNAL LSAME, ILAENV +* .. +* .. External Subroutines .. + EXTERNAL SGEMM, SLAUU2, SSYRK, STRMM, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SLAUUM', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* +* Determine the block size for this environment. +* + NB = 128 +* + IF( NB.LE.1 .OR. 
NB.GE.N ) THEN +* +* Use unblocked code +* + CALL SLAUU2( UPLO, N, A, LDA, INFO ) + ELSE +* +* Use blocked code +* + IF( UPPER ) THEN +* +* Compute the product U * U'. +* + DO 10 I = 1, N, NB + IB = MIN( NB, N-I+1 ) + CALL STRMM( 'Right', 'Upper', 'Transpose', 'Non-unit', + $ I-1, IB, ONE, A( I, I ), LDA, A( 1, I ), + $ LDA ) + CALL SLAUU2( 'Upper', IB, A( I, I ), LDA, INFO ) + IF( I+IB.LE.N ) THEN + CALL SGEMM( 'No transpose', 'Transpose', I-1, IB, + $ N-I-IB+1, ONE, A( 1, I+IB ), LDA, + $ A( I, I+IB ), LDA, ONE, A( 1, I ), LDA ) + CALL SSYRK( 'Upper', 'No transpose', IB, N-I-IB+1, + $ ONE, A( I, I+IB ), LDA, ONE, A( I, I ), + $ LDA ) + END IF + 10 CONTINUE + ELSE +* +* Compute the product L' * L. +* + DO 20 I = 1, N, NB + IB = MIN( NB, N-I+1 ) + CALL STRMM( 'Left', 'Lower', 'Transpose', 'Non-unit', IB, + $ I-1, ONE, A( I, I ), LDA, A( I, 1 ), LDA ) + CALL SLAUU2( 'Lower', IB, A( I, I ), LDA, INFO ) + IF( I+IB.LE.N ) THEN + CALL SGEMM( 'Transpose', 'No transpose', IB, I-1, + $ N-I-IB+1, ONE, A( I+IB, I ), LDA, + $ A( I+IB, 1 ), LDA, ONE, A( I, 1 ), LDA ) + CALL SSYRK( 'Lower', 'Transpose', IB, N-I-IB+1, ONE, + $ A( I+IB, I ), LDA, ONE, A( I, I ), LDA ) + END IF + 20 CONTINUE + END IF + END IF +* + RETURN +* +* End of SLAUUM +* + END diff --git a/reference/smaxf.f b/reference/smaxf.f new file mode 100644 index 0000000000..69d473843f --- /dev/null +++ b/reference/smaxf.f @@ -0,0 +1,36 @@ + REAL*4 function smaxf(n,dx,incx) +c +c finds the index of element having max. absolute value. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + REAL*4 dx(*) + integer i,incx,ix,n +c + smaxf = 0 + if( n.lt.1 .or. incx.le.0 ) return + + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + smaxf = dx(1) + ix = ix + incx + do 10 i = 2,n + if(dx(ix).le.smaxf) go to 5 + smaxf = dx(ix) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 smaxf = dx(1) + do 30 i = 2,n + if(dx(i).le.smaxf) go to 30 + smaxf = dx(i) + 30 continue + return + end diff --git a/reference/sminf.f b/reference/sminf.f new file mode 100644 index 0000000000..de59c2e363 --- /dev/null +++ b/reference/sminf.f @@ -0,0 +1,36 @@ + REAL*4 function sminf(n,dx,incx) +c +c finds the index of element having min. absolute value. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + REAL*4 dx(*) + integer i,incx,ix,n +c + sminf = 0 + if( n.lt.1 .or. incx.le.0 ) return + + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + sminf = dx(1) + ix = ix + incx + do 10 i = 2,n + if(dx(ix).ge.sminf) go to 5 + sminf = dx(ix) + 5 ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 sminf = dx(1) + do 30 i = 2,n + if(dx(i).ge.sminf) go to 30 + sminf = dx(i) + 30 continue + return + end diff --git a/reference/snrm2f.f b/reference/snrm2f.f new file mode 100644 index 0000000000..cff495d20f --- /dev/null +++ b/reference/snrm2f.f @@ -0,0 +1,60 @@ + REAL FUNCTION SNRM2F ( N, X, INCX ) +* .. Scalar Arguments .. + INTEGER INCX, N +* .. Array Arguments .. + REAL X( * ) +* .. +* +* SNRM2 returns the euclidean norm of a vector via the function +* name, so that +* +* SNRM2 := sqrt( x'*x ) +* +* +* +* -- This version written on 25-October-1982. +* Modified on 14-October-1993 to inline the call to SLASSQ. +* Sven Hammarling, Nag Ltd. +* +* +* .. Parameters .. 
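As with samaxf/saminf earlier, smaxf and sminf return a value rather than an index, and despite their header comments they apply no absolute value at all: they simply scan for the largest and smallest signed elements. A tiny illustrative check (expected output: 4.0 and -5.0):

      PROGRAM TMAX
      REAL DX(4)
      REAL SMAXF, SMINF
      DATA DX / -5.0, 1.0, 4.0, -2.0 /
c     note: 4.0 is the max even though |-5.0| is larger
      WRITE(*,*) 'max =', SMAXF(4, DX, 1)
      WRITE(*,*) 'min =', SMINF(4, DX, 1)
      END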
+ REAL ONE , ZERO + PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) +* .. Local Scalars .. + INTEGER IX + REAL ABSXI, NORM, SCALE, SSQ +* .. Intrinsic Functions .. + INTRINSIC ABS, SQRT +* .. +* .. Executable Statements .. + IF( N.LT.1 .OR. INCX.LT.1 )THEN + NORM = ZERO + ELSE IF( N.EQ.1 )THEN + NORM = ABS( X( 1 ) ) + ELSE + SCALE = ZERO + SSQ = ONE +* The following loop is equivalent to this call to the LAPACK +* auxiliary routine: +* CALL SLASSQ( N, X, INCX, SCALE, SSQ ) +* + DO 10, IX = 1, 1 + ( N - 1 )*INCX, INCX + IF( X( IX ).NE.ZERO )THEN + ABSXI = ABS( X( IX ) ) + IF( SCALE.LT.ABSXI )THEN + SSQ = ONE + SSQ*( SCALE/ABSXI )**2 + SCALE = ABSXI + ELSE + SSQ = SSQ + ( ABSXI/SCALE )**2 + END IF + END IF + 10 CONTINUE + NORM = SCALE * SQRT( SSQ ) + END IF +* + SNRM2F = NORM + RETURN +* +* End of SNRM2. +* + END diff --git a/reference/spotf2f.f b/reference/spotf2f.f new file mode 100644 index 0000000000..5662b803be --- /dev/null +++ b/reference/spotf2f.f @@ -0,0 +1,168 @@ + SUBROUTINE SPOTF2F( UPLO, N, A, LDA, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* February 29, 1992 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + REAL A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* SPOTF2 computes the Cholesky factorization of a real symmetric +* positive definite matrix A. +* +* The factorization has the form +* A = U' * U , if UPLO = 'U', or +* A = L * L', if UPLO = 'L', +* where U is an upper triangular matrix and L is lower triangular. +* +* This is the unblocked version of the algorithm, calling Level 2 BLAS. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* Specifies whether the upper or lower triangular part of the +* symmetric matrix A is stored. +* = 'U': Upper triangular +* = 'L': Lower triangular +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* A (input/output) REAL array, dimension (LDA,N) +* On entry, the symmetric matrix A. If UPLO = 'U', the leading +* n by n upper triangular part of A contains the upper +* triangular part of the matrix A, and the strictly lower +* triangular part of A is not referenced. If UPLO = 'L', the +* leading n by n lower triangular part of A contains the lower +* triangular part of the matrix A, and the strictly upper +* triangular part of A is not referenced. +* +* On exit, if INFO = 0, the factor U or L from the Cholesky +* factorization A = U'*U or A = L*L'. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -k, the k-th argument had an illegal value +* > 0: if INFO = k, the leading minor of order k is not +* positive definite, and the factorization could not be +* completed. +* +* ===================================================================== +* +* .. Parameters .. + REAL ONE, ZERO + PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL UPPER + INTEGER J + REAL AJJ +* .. +* .. External Functions .. + LOGICAL LSAME + REAL SDOT + EXTERNAL LSAME, SDOT +* .. +* .. External Subroutines .. + EXTERNAL SGEMV, SSCAL, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, SQRT +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + IF( .NOT.UPPER .AND. 
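The scale/ssq loop in SNRM2F above is the inlined SLASSQ update: the running result is kept as scale*sqrt(ssq), so no intermediate square has to be representable. A hedged sketch (driver name and data are illustrative, not from the patch) shows the payoff in single precision:

      PROGRAM TNRM2
      REAL X( 3 ), SNRM2F
      EXTERNAL SNRM2F
*     Each square (1.0E40) would overflow IEEE single precision,
*     but the scaled loop in SNRM2F returns about 1.7321E20.
      X( 1 ) = 1.0E20
      X( 2 ) = 1.0E20
      X( 3 ) = 1.0E20
      WRITE( *, * ) SNRM2F( 3, X, 1 )
      END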
.NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SPOTF2', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* + IF( UPPER ) THEN +* +* Compute the Cholesky factorization A = U'*U. +* + DO 10 J = 1, N +* +* Compute U(J,J) and test for non-positive-definiteness. +* + AJJ = A( J, J ) - SDOT( J-1, A( 1, J ), 1, A( 1, J ), 1 ) + IF( AJJ.LE.ZERO ) THEN + A( J, J ) = AJJ + GO TO 30 + END IF + AJJ = SQRT( AJJ ) + A( J, J ) = AJJ +* +* Compute elements J+1:N of row J. +* + IF( J.LT.N ) THEN + CALL SGEMV( 'Transpose', J-1, N-J, -ONE, A( 1, J+1 ), + $ LDA, A( 1, J ), 1, ONE, A( J, J+1 ), LDA ) + CALL SSCAL( N-J, ONE / AJJ, A( J, J+1 ), LDA ) + END IF + 10 CONTINUE + ELSE +* +* Compute the Cholesky factorization A = L*L'. +* + DO 20 J = 1, N +* +* Compute L(J,J) and test for non-positive-definiteness. +* + AJJ = A( J, J ) - SDOT( J-1, A( J, 1 ), LDA, A( J, 1 ), + $ LDA ) + IF( AJJ.LE.ZERO ) THEN + A( J, J ) = AJJ + GO TO 30 + END IF + AJJ = SQRT( AJJ ) + A( J, J ) = AJJ +* +* Compute elements J+1:N of column J. +* + IF( J.LT.N ) THEN + CALL SGEMV( 'No transpose', N-J, J-1, -ONE, A( J+1, 1 ), + $ LDA, A( J, 1 ), LDA, ONE, A( J+1, J ), 1 ) + CALL SSCAL( N-J, ONE / AJJ, A( J+1, J ), 1 ) + END IF + 20 CONTINUE + END IF + GO TO 40 +* + 30 CONTINUE + INFO = J +* + 40 CONTINUE + RETURN +* +* End of SPOTF2 +* + END diff --git a/reference/spotrff.f b/reference/spotrff.f new file mode 100644 index 0000000000..0a4925138f --- /dev/null +++ b/reference/spotrff.f @@ -0,0 +1,184 @@ + SUBROUTINE SPOTRFF( UPLO, N, A, LDA, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* March 31, 1993 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + REAL A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* SPOTRF computes the Cholesky factorization of a real symmetric +* positive definite matrix A. +* +* The factorization has the form +* A = U**T * U, if UPLO = 'U', or +* A = L * L**T, if UPLO = 'L', +* where U is an upper triangular matrix and L is lower triangular. +* +* This is the block version of the algorithm, calling Level 3 BLAS. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* = 'U': Upper triangle of A is stored; +* = 'L': Lower triangle of A is stored. +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* A (input/output) REAL array, dimension (LDA,N) +* On entry, the symmetric matrix A. If UPLO = 'U', the leading +* N-by-N upper triangular part of A contains the upper +* triangular part of the matrix A, and the strictly lower +* triangular part of A is not referenced. If UPLO = 'L', the +* leading N-by-N lower triangular part of A contains the lower +* triangular part of the matrix A, and the strictly upper +* triangular part of A is not referenced. +* +* On exit, if INFO = 0, the factor U or L from the Cholesky +* factorization A = U**T*U or A = L*L**T. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, the leading minor of order i is not +* positive definite, and the factorization could not be +* completed. +* +* ===================================================================== +* +* .. 
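SPOTF2F above computes one diagonal entry at a time: it subtracts the dot product of the preceding part of the column (or row) from A(J,J), takes the square root, and scales the rest of row or column J; a non-positive value stops the sweep and is reported through INFO. A minimal driver, illustrative only and assuming the reference files plus a BLAS are linked in:

      PROGRAM TPOTF2
*     A = [ 4 2 ; 2 5 ] is symmetric positive definite.
      REAL A( 2, 2 )
      INTEGER INFO
      A( 1, 1 ) = 4.0E0
      A( 1, 2 ) = 2.0E0
      A( 2, 1 ) = 2.0E0
      A( 2, 2 ) = 5.0E0
      CALL SPOTF2F( 'U', 2, A, 2, INFO )
*     INFO = 0 here; a nonpositive leading minor would set INFO > 0.
*     Upper triangle now holds U with A = U'*U, U = [ 2 1 ; . 2 ].
      WRITE( *, * ) INFO, A( 1, 1 ), A( 1, 2 ), A( 2, 2 )
      END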
Parameters .. + REAL ONE + PARAMETER ( ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL UPPER + INTEGER J, JB, NB +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL SGEMM, SPOTF2, SSYRK, STRSM, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SPOTRF', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* +* Determine the block size for this environment. +* + NB = 56 + + IF( NB.LE.1 .OR. NB.GE.N ) THEN +* +* Use unblocked code. +* + CALL SPOTF2( UPLO, N, A, LDA, INFO ) + ELSE +* +* Use blocked code. +* + IF( UPPER ) THEN +* +* Compute the Cholesky factorization A = U'*U. +* + DO 10 J = 1, N, NB +* +* Update and factorize the current diagonal block and test +* for non-positive-definiteness. +* + JB = MIN( NB, N-J+1 ) + CALL SSYRK( 'Upper', 'Transpose', JB, J-1, -ONE, + $ A( 1, J ), LDA, ONE, A( J, J ), LDA ) + CALL SPOTF2( 'Upper', JB, A( J, J ), LDA, INFO ) + IF( INFO.NE.0 ) + $ GO TO 30 + IF( J+JB.LE.N ) THEN +* +* Compute the current block row. +* + CALL SGEMM( 'Transpose', 'No transpose', JB, N-J-JB+1, + $ J-1, -ONE, A( 1, J ), LDA, A( 1, J+JB ), + $ LDA, ONE, A( J, J+JB ), LDA ) + CALL STRSM( 'Left', 'Upper', 'Transpose', 'Non-unit', + $ JB, N-J-JB+1, ONE, A( J, J ), LDA, + $ A( J, J+JB ), LDA ) + END IF + 10 CONTINUE +* + ELSE +* +* Compute the Cholesky factorization A = L*L'. +* + DO 20 J = 1, N, NB +* +* Update and factorize the current diagonal block and test +* for non-positive-definiteness. +* + JB = MIN( NB, N-J+1 ) + CALL SSYRK( 'Lower', 'No transpose', JB, J-1, -ONE, + $ A( J, 1 ), LDA, ONE, A( J, J ), LDA ) + CALL SPOTF2( 'Lower', JB, A( J, J ), LDA, INFO ) + IF( INFO.NE.0 ) + $ GO TO 30 + IF( J+JB.LE.N ) THEN +* +* Compute the current block column. +* + CALL SGEMM( 'No transpose', 'Transpose', N-J-JB+1, JB, + $ J-1, -ONE, A( J+JB, 1 ), LDA, A( J, 1 ), + $ LDA, ONE, A( J+JB, J ), LDA ) + CALL STRSM( 'Right', 'Lower', 'Transpose', 'Non-unit', + $ N-J-JB+1, JB, ONE, A( J, J ), LDA, + $ A( J+JB, J ), LDA ) + END IF + 20 CONTINUE + END IF + END IF + GO TO 40 +* + 30 CONTINUE + INFO = INFO + J - 1 +* + 40 CONTINUE + RETURN +* +* End of SPOTRF +* + END diff --git a/reference/spotrif.f b/reference/spotrif.f new file mode 100644 index 0000000000..ad24e2345a --- /dev/null +++ b/reference/spotrif.f @@ -0,0 +1,96 @@ + SUBROUTINE SPOTRIF( UPLO, N, A, LDA, INFO ) +* +* -- LAPACK routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* November 2006 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + REAL A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* SPOTRI computes the inverse of a real symmetric positive definite +* matrix A using the Cholesky factorization A = U**T*U or A = L*L**T +* computed by SPOTRF. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* = 'U': Upper triangle of A is stored; +* = 'L': Lower triangle of A is stored. +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. 
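In this reference copy of SPOTRFF the usual ILAENV block-size query has been replaced by a hard-wired NB = 56, so the blocked SSYRK/SGEMM/STRSM path is taken whenever N > 56. The sketch below (all names, sizes and data are illustrative, and it additionally assumes LAPACK's STRTRI is available for the inverse step) factors such a matrix and then hands the factor to SPOTRIF, whose code follows:

      PROGRAM TPOTRF
      INTEGER N
      PARAMETER ( N = 100 )
      REAL A( N, N )
      INTEGER I, J, INFO
*     A = (all ones) + 100*I is symmetric positive definite, and
*     N > 56 so the blocked SSYRK/SGEMM/STRSM path above is taken.
      DO 20 J = 1, N
         DO 10 I = 1, N
            A( I, J ) = 1.0E0
   10    CONTINUE
         A( J, J ) = 101.0E0
   20 CONTINUE
      CALL SPOTRFF( 'L', N, A, N, INFO )
*     The factor can feed SPOTRIF (defined below) to invert in place.
      IF( INFO.EQ.0 ) CALL SPOTRIF( 'L', N, A, N, INFO )
      WRITE( *, * ) 'INFO =', INFO, ' inv(A)(1,1) =', A( 1, 1 )
      END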
+* +* A (input/output) REAL array, dimension (LDA,N) +* On entry, the triangular factor U or L from the Cholesky +* factorization A = U**T*U or A = L*L**T, as computed by +* SPOTRF. +* On exit, the upper or lower triangle of the (symmetric) +* inverse of A, overwriting the input factor U or L. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, the (i,i) element of the factor U or L is +* zero, and the inverse could not be computed. +* +* ===================================================================== +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL SLAUUM, STRTRI, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SPOTRI', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* +* Invert the triangular Cholesky factor U or L. +* + CALL STRTRI( UPLO, 'Non-unit', N, A, LDA, INFO ) + IF( INFO.GT.0 ) + $ RETURN +* +* Form inv(U)*inv(U)' or inv(L)'*inv(L). +* + CALL SLAUUM( UPLO, N, A, LDA, INFO ) +* + RETURN +* +* End of SPOTRI +* + END diff --git a/reference/srotf.f b/reference/srotf.f new file mode 100644 index 0000000000..02230800c5 --- /dev/null +++ b/reference/srotf.f @@ -0,0 +1,37 @@ + subroutine srotf (n,sx,incx,sy,incy,c,s) +c +c applies a plane rotation. +c jack dongarra, linpack, 3/11/78. +c modified 12/3/93, array(1) declarations changed to array(*) +c + real sx(*),sy(*),stemp,c,s + integer i,incx,incy,ix,iy,n +c + if(n.le.0)return + if(incx.eq.1.and.incy.eq.1)go to 20 +c +c code for unequal increments or equal increments not equal +c to 1 +c + ix = 1 + iy = 1 + if(incx.lt.0)ix = (-n+1)*incx + 1 + if(incy.lt.0)iy = (-n+1)*incy + 1 + do 10 i = 1,n + stemp = c*sx(ix) + s*sy(iy) + sy(iy) = c*sy(iy) - s*sx(ix) + sx(ix) = stemp + ix = ix + incx + iy = iy + incy + 10 continue + return +c +c code for both increments equal to 1 +c + 20 do 30 i = 1,n + stemp = c*sx(i) + s*sy(i) + sy(i) = c*sy(i) - s*sx(i) + sx(i) = stemp + 30 continue + return + end diff --git a/reference/srotgf.f b/reference/srotgf.f new file mode 100644 index 0000000000..4f222988ce --- /dev/null +++ b/reference/srotgf.f @@ -0,0 +1,27 @@ + subroutine srotgf(sa,sb,c,s) +c +c construct givens plane rotation. +c jack dongarra, linpack, 3/11/78. +c + real sa,sb,c,s,roe,scale,r,z +c + roe = sb + if( abs(sa) .gt. abs(sb) ) roe = sa + scale = abs(sa) + abs(sb) + if( scale .ne. 0.0 ) go to 10 + c = 1.0 + s = 0.0 + r = 0.0 + z = 0.0 + go to 20 + 10 r = scale*sqrt((sa/scale)**2 + (sb/scale)**2) + r = sign(1.0,roe)*r + c = sa/r + s = sb/r + z = 1.0 + if( abs(sa) .gt. abs(sb) ) z = s + if( abs(sb) .ge. abs(sa) .and. c .ne. 0.0 ) z = 1.0/c + 20 sa = r + sb = z + return + end diff --git a/reference/srotmf.f b/reference/srotmf.f new file mode 100644 index 0000000000..3924edbc23 --- /dev/null +++ b/reference/srotmf.f @@ -0,0 +1,106 @@ + SUBROUTINE SROTMF (N,SX,INCX,SY,INCY,SPARAM) +C +C APPLY THE MODIFIED GIVENS TRANSFORMATION, H, TO THE 2 BY N MATRIX +C +C (SX**T) , WHERE **T INDICATES TRANSPOSE. 
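SROTGF constructs the cosine/sine pair of a Givens rotation and returns r and the reconstruction scalar z in place of its inputs; SROTF then applies that same (c, s) to whole vectors. A small sketch, not part of the patch, with values chosen so the arithmetic is easy to check:

      PROGRAM TROTG
      REAL A, B, C, S, X( 2 ), Y( 2 )
      A = 3.0E0
      B = 4.0E0
*     On return A holds r = 5, B holds the reconstruction value z,
*     and (C,S) satisfy  C*3 + S*4 = 5,  -S*3 + C*4 = 0.
      CALL SROTGF( A, B, C, S )
      X( 1 ) = 3.0E0
      X( 2 ) = 1.0E0
      Y( 1 ) = 4.0E0
      Y( 2 ) = 2.0E0
*     Apply the same rotation to the vector pair (x, y); Y(1) -> 0.
      CALL SROTF( 2, X, 1, Y, 1, C, S )
      WRITE( *, * ) 'r =', A, ' y(1) now', Y( 1 )
      END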
THE ELEMENTS OF SX ARE IN +C (DX**T) +C +C SX(LX+I*INCX), I = 0 TO N-1, WHERE LX = 1 IF INCX .GE. 0, ELSE +C LX = (-INCX)*N, AND SIMILARLY FOR SY USING USING LY AND INCY. +C WITH SPARAM(1)=SFLAG, H HAS ONE OF THE FOLLOWING FORMS.. +C +C SFLAG=-1.E0 SFLAG=0.E0 SFLAG=1.E0 SFLAG=-2.E0 +C +C (SH11 SH12) (1.E0 SH12) (SH11 1.E0) (1.E0 0.E0) +C H=( ) ( ) ( ) ( ) +C (SH21 SH22), (SH21 1.E0), (-1.E0 SH22), (0.E0 1.E0). +C SEE SROTMG FOR A DESCRIPTION OF DATA STORAGE IN SPARAM. +C + DIMENSION SX(1),SY(1),SPARAM(5) + DATA ZERO,TWO/0.E0,2.E0/ +C + SFLAG=SPARAM(1) + IF(N .LE. 0 .OR.(SFLAG+TWO.EQ.ZERO)) GO TO 140 + IF(.NOT.(INCX.EQ.INCY.AND. INCX .GT.0)) GO TO 70 +C + NSTEPS=N*INCX + IF(SFLAG) 50,10,30 + 10 CONTINUE + SH12=SPARAM(4) + SH21=SPARAM(3) + DO 20 I=1,NSTEPS,INCX + W=SX(I) + Z=SY(I) + SX(I)=W+Z*SH12 + SY(I)=W*SH21+Z + 20 CONTINUE + GO TO 140 + 30 CONTINUE + SH11=SPARAM(2) + SH22=SPARAM(5) + DO 40 I=1,NSTEPS,INCX + W=SX(I) + Z=SY(I) + SX(I)=W*SH11+Z + SY(I)=-W+SH22*Z + 40 CONTINUE + GO TO 140 + 50 CONTINUE + SH11=SPARAM(2) + SH12=SPARAM(4) + SH21=SPARAM(3) + SH22=SPARAM(5) + DO 60 I=1,NSTEPS,INCX + W=SX(I) + Z=SY(I) + SX(I)=W*SH11+Z*SH12 + SY(I)=W*SH21+Z*SH22 + 60 CONTINUE + GO TO 140 + 70 CONTINUE + KX=1 + KY=1 + IF(INCX .LT. 0) KX=1+(1-N)*INCX + IF(INCY .LT. 0) KY=1+(1-N)*INCY +C + IF(SFLAG)120,80,100 + 80 CONTINUE + SH12=SPARAM(4) + SH21=SPARAM(3) + DO 90 I=1,N + W=SX(KX) + Z=SY(KY) + SX(KX)=W+Z*SH12 + SY(KY)=W*SH21+Z + KX=KX+INCX + KY=KY+INCY + 90 CONTINUE + GO TO 140 + 100 CONTINUE + SH11=SPARAM(2) + SH22=SPARAM(5) + DO 110 I=1,N + W=SX(KX) + Z=SY(KY) + SX(KX)=W*SH11+Z + SY(KY)=-W+SH22*Z + KX=KX+INCX + KY=KY+INCY + 110 CONTINUE + GO TO 140 + 120 CONTINUE + SH11=SPARAM(2) + SH12=SPARAM(4) + SH21=SPARAM(3) + SH22=SPARAM(5) + DO 130 I=1,N + W=SX(KX) + Z=SY(KY) + SX(KX)=W*SH11+Z*SH12 + SY(KY)=W*SH21+Z*SH22 + KX=KX+INCX + KY=KY+INCY + 130 CONTINUE + 140 CONTINUE + RETURN + END diff --git a/reference/srotmgf.f b/reference/srotmgf.f new file mode 100644 index 0000000000..e9998ff36e --- /dev/null +++ b/reference/srotmgf.f @@ -0,0 +1,166 @@ + SUBROUTINE SROTMGF (SD1,SD2,SX1,SY1,SPARAM) +C +C CONSTRUCT THE MODIFIED GIVENS TRANSFORMATION MATRIX H WHICH ZEROS +C THE SECOND COMPONENT OF THE 2-VECTOR (SQRT(SD1)*SX1,SQRT(SD2)* +C SY2)**T. +C WITH SPARAM(1)=SFLAG, H HAS ONE OF THE FOLLOWING FORMS.. +C +C SFLAG=-1.E0 SFLAG=0.E0 SFLAG=1.E0 SFLAG=-2.E0 +C +C (SH11 SH12) (1.E0 SH12) (SH11 1.E0) (1.E0 0.E0) +C H=( ) ( ) ( ) ( ) +C (SH21 SH22), (SH21 1.E0), (-1.E0 SH22), (0.E0 1.E0). +C LOCATIONS 2-4 OF SPARAM CONTAIN SH11,SH21,SH12, AND SH22 +C RESPECTIVELY. (VALUES OF 1.E0, -1.E0, OR 0.E0 IMPLIED BY THE +C VALUE OF SPARAM(1) ARE NOT STORED IN SPARAM.) +C +C THE VALUES OF GAMSQ AND RGAMSQ SET IN THE DATA STATEMENT MAY BE +C INEXACT. THIS IS OK AS THEY ARE ONLY USED FOR TESTING THE SIZE +C OF SD1 AND SD2. ALL ACTUAL SCALING OF DATA IS DONE USING GAM. +C + DIMENSION SPARAM(5) +C + DATA ZERO,ONE,TWO /0.E0,1.E0,2.E0/ + DATA GAM,GAMSQ,RGAMSQ/4096.E0,1.67772E7,5.96046E-8/ + IF(.NOT. SD1 .LT. ZERO) GO TO 10 +C GO ZERO-H-D-AND-SX1.. + GO TO 60 + 10 CONTINUE +C CASE-SD1-NONNEGATIVE + SP2=SD2*SY1 + IF(.NOT. SP2 .EQ. ZERO) GO TO 20 + SFLAG=-TWO + GO TO 260 +C REGULAR-CASE.. + 20 CONTINUE + SP1=SD1*SX1 + SQ2=SP2*SY1 + SQ1=SP1*SX1 +C + IF(.NOT. ABS(SQ1) .GT. ABS(SQ2)) GO TO 40 + SH21=-SY1/SX1 + SH12=SP2/SP1 +C + SU=ONE-SH12*SH21 +C + IF(.NOT. SU .LE. ZERO) GO TO 30 +C GO ZERO-H-D-AND-SX1.. + GO TO 60 + 30 CONTINUE + SFLAG=ZERO + SD1=SD1/SU + SD2=SD2/SU + SX1=SX1*SU +C GO SCALE-CHECK.. + GO TO 100 + 40 CONTINUE + IF(.NOT. 
SQ2 .LT. ZERO) GO TO 50 +C GO ZERO-H-D-AND-SX1.. + GO TO 60 + 50 CONTINUE + SFLAG=ONE + SH11=SP1/SP2 + SH22=SX1/SY1 + SU=ONE+SH11*SH22 + STEMP=SD2/SU + SD2=SD1/SU + SD1=STEMP + SX1=SY1*SU +C GO SCALE-CHECK + GO TO 100 +C PROCEDURE..ZERO-H-D-AND-SX1.. + 60 CONTINUE + SFLAG=-ONE + SH11=ZERO + SH12=ZERO + SH21=ZERO + SH22=ZERO +C + SD1=ZERO + SD2=ZERO + SX1=ZERO +C RETURN.. + GO TO 220 +C PROCEDURE..FIX-H.. + 70 CONTINUE + IF(.NOT. SFLAG .GE. ZERO) GO TO 90 +C + IF(.NOT. SFLAG .EQ. ZERO) GO TO 80 + SH11=ONE + SH22=ONE + SFLAG=-ONE + GO TO 90 + 80 CONTINUE + SH21=-ONE + SH12=ONE + SFLAG=-ONE + 90 CONTINUE + GO TO IGO,(120,150,180,210) +C PROCEDURE..SCALE-CHECK + 100 CONTINUE + 110 CONTINUE + IF(.NOT. SD1 .LE. RGAMSQ) GO TO 130 + IF(SD1 .EQ. ZERO) GO TO 160 + ASSIGN 120 TO IGO +C FIX-H.. + GO TO 70 + 120 CONTINUE + SD1=SD1*GAM**2 + SX1=SX1/GAM + SH11=SH11/GAM + SH12=SH12/GAM + GO TO 110 + 130 CONTINUE + 140 CONTINUE + IF(.NOT. SD1 .GE. GAMSQ) GO TO 160 + ASSIGN 150 TO IGO +C FIX-H.. + GO TO 70 + 150 CONTINUE + SD1=SD1/GAM**2 + SX1=SX1*GAM + SH11=SH11*GAM + SH12=SH12*GAM + GO TO 140 + 160 CONTINUE + 170 CONTINUE + IF(.NOT. ABS(SD2) .LE. RGAMSQ) GO TO 190 + IF(SD2 .EQ. ZERO) GO TO 220 + ASSIGN 180 TO IGO +C FIX-H.. + GO TO 70 + 180 CONTINUE + SD2=SD2*GAM**2 + SH21=SH21/GAM + SH22=SH22/GAM + GO TO 170 + 190 CONTINUE + 200 CONTINUE + IF(.NOT. ABS(SD2) .GE. GAMSQ) GO TO 220 + ASSIGN 210 TO IGO +C FIX-H.. + GO TO 70 + 210 CONTINUE + SD2=SD2/GAM**2 + SH21=SH21*GAM + SH22=SH22*GAM + GO TO 200 + 220 CONTINUE + IF(SFLAG)250,230,240 + 230 CONTINUE + SPARAM(3)=SH21 + SPARAM(4)=SH12 + GO TO 260 + 240 CONTINUE + SPARAM(2)=SH11 + SPARAM(5)=SH22 + GO TO 260 + 250 CONTINUE + SPARAM(2)=SH11 + SPARAM(3)=SH21 + SPARAM(4)=SH12 + SPARAM(5)=SH22 + 260 CONTINUE + SPARAM(1)=SFLAG + RETURN + END diff --git a/reference/ssbmvf.f b/reference/ssbmvf.f new file mode 100644 index 0000000000..d1d7a67a0d --- /dev/null +++ b/reference/ssbmvf.f @@ -0,0 +1,303 @@ + SUBROUTINE SSBMVF( UPLO, N, K, ALPHA, A, LDA, X, INCX, + $ BETA, Y, INCY ) +* .. Scalar Arguments .. + REAL ALPHA, BETA + INTEGER INCX, INCY, K, LDA, N + CHARACTER*1 UPLO +* .. Array Arguments .. + REAL A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* SSBMV performs the matrix-vector operation +* +* y := alpha*A*x + beta*y, +* +* where alpha and beta are scalars, x and y are n element vectors and +* A is an n by n symmetric band matrix, with k super-diagonals. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the band matrix A is being supplied as +* follows: +* +* UPLO = 'U' or 'u' The upper triangular part of A is +* being supplied. +* +* UPLO = 'L' or 'l' The lower triangular part of A is +* being supplied. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry, K specifies the number of super-diagonals of the +* matrix A. K must satisfy 0 .le. K. +* Unchanged on exit. +* +* ALPHA - REAL . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - REAL array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) +* by n part of the array A must contain the upper triangular +* band part of the symmetric matrix, supplied column by +* column, with the leading diagonal of the matrix in row +* ( k + 1 ) of the array, the first super-diagonal starting at +* position 2 in row k, and so on. 
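SROTMGF packs the modified-Givens matrix H into the five-element SPARAM (a flag plus at most four entries), and SROTMF replays it over vector pairs, so the rotation is applied without any square roots. A hedged sketch with illustrative data; which of the four forms of H is chosen, and hence the exact contents of SPARAM and the updated D1/D2, depends on the inputs:

      PROGRAM TROTM
      REAL D1, D2, X1, Y1, SPARAM( 5 ), X( 2 ), Y( 2 )
      D1 = 1.0E0
      D2 = 1.0E0
      X1 = 3.0E0
      Y1 = 4.0E0
*     Build H so that the second component of (x1, y1) is zeroed.
      CALL SROTMGF( D1, D2, X1, Y1, SPARAM )
      X( 1 ) = 3.0E0
      X( 2 ) = 1.0E0
      Y( 1 ) = 4.0E0
      Y( 2 ) = 2.0E0
      CALL SROTMF( 2, X, 1, Y, 1, SPARAM )
*     Y(1) is now zero; SPARAM(1) records which form of H was built.
      WRITE( *, * ) SPARAM( 1 ), Y( 1 )
      END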
The top left k by k triangle +* of the array A is not referenced. +* The following program segment will transfer the upper +* triangular part of a symmetric band matrix from conventional +* full matrix storage to band storage: +* +* DO 20, J = 1, N +* M = K + 1 - J +* DO 10, I = MAX( 1, J - K ), J +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) +* by n part of the array A must contain the lower triangular +* band part of the symmetric matrix, supplied column by +* column, with the leading diagonal of the matrix in row 1 of +* the array, the first sub-diagonal starting at position 1 in +* row 2, and so on. The bottom right k by k triangle of the +* array A is not referenced. +* The following program segment will transfer the lower +* triangular part of a symmetric band matrix from conventional +* full matrix storage to band storage: +* +* DO 20, J = 1, N +* M = 1 - J +* DO 10, I = J, MIN( N, J + K ) +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* ( k + 1 ). +* Unchanged on exit. +* +* X - REAL array of DIMENSION at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the +* vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA - REAL . +* On entry, BETA specifies the scalar beta. +* Unchanged on exit. +* +* Y - REAL array of DIMENSION at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the +* vector y. On exit, Y is overwritten by the updated vector y. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + REAL ONE , ZERO + PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) +* .. Local Scalars .. + REAL TEMP1, TEMP2 + INTEGER I, INFO, IX, IY, J, JX, JY, KPLUS1, KX, KY, L +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( K.LT.0 )THEN + INFO = 3 + ELSE IF( LDA.LT.( K + 1 ) )THEN + INFO = 6 + ELSE IF( INCX.EQ.0 )THEN + INFO = 8 + ELSE IF( INCY.EQ.0 )THEN + INFO = 11 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'SSBMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* Set up the start points in X and Y. +* + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( N - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( N - 1 )*INCY + END IF +* +* Start the operations. In this version the elements of the array A +* are accessed sequentially with one pass through A. +* +* First form y := beta*y. 
+* + IF( BETA.NE.ONE )THEN + IF( INCY.EQ.1 )THEN + IF( BETA.EQ.ZERO )THEN + DO 10, I = 1, N + Y( I ) = ZERO + 10 CONTINUE + ELSE + DO 20, I = 1, N + Y( I ) = BETA*Y( I ) + 20 CONTINUE + END IF + ELSE + IY = KY + IF( BETA.EQ.ZERO )THEN + DO 30, I = 1, N + Y( IY ) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40, I = 1, N + Y( IY ) = BETA*Y( IY ) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF( ALPHA.EQ.ZERO ) + $ RETURN + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form y when upper triangle of A is stored. +* + KPLUS1 = K + 1 + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 60, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + L = KPLUS1 - J + DO 50, I = MAX( 1, J - K ), J - 1 + Y( I ) = Y( I ) + TEMP1*A( L + I, J ) + TEMP2 = TEMP2 + A( L + I, J )*X( I ) + 50 CONTINUE + Y( J ) = Y( J ) + TEMP1*A( KPLUS1, J ) + ALPHA*TEMP2 + 60 CONTINUE + ELSE + JX = KX + JY = KY + DO 80, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + IX = KX + IY = KY + L = KPLUS1 - J + DO 70, I = MAX( 1, J - K ), J - 1 + Y( IY ) = Y( IY ) + TEMP1*A( L + I, J ) + TEMP2 = TEMP2 + A( L + I, J )*X( IX ) + IX = IX + INCX + IY = IY + INCY + 70 CONTINUE + Y( JY ) = Y( JY ) + TEMP1*A( KPLUS1, J ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + IF( J.GT.K )THEN + KX = KX + INCX + KY = KY + INCY + END IF + 80 CONTINUE + END IF + ELSE +* +* Form y when lower triangle of A is stored. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 100, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + Y( J ) = Y( J ) + TEMP1*A( 1, J ) + L = 1 - J + DO 90, I = J + 1, MIN( N, J + K ) + Y( I ) = Y( I ) + TEMP1*A( L + I, J ) + TEMP2 = TEMP2 + A( L + I, J )*X( I ) + 90 CONTINUE + Y( J ) = Y( J ) + ALPHA*TEMP2 + 100 CONTINUE + ELSE + JX = KX + JY = KY + DO 120, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + Y( JY ) = Y( JY ) + TEMP1*A( 1, J ) + L = 1 - J + IX = JX + IY = JY + DO 110, I = J + 1, MIN( N, J + K ) + IX = IX + INCX + IY = IY + INCY + Y( IY ) = Y( IY ) + TEMP1*A( L + I, J ) + TEMP2 = TEMP2 + A( L + I, J )*X( IX ) + 110 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + 120 CONTINUE + END IF + END IF +* + RETURN +* +* End of SSBMV . +* + END diff --git a/reference/sscalf.f b/reference/sscalf.f new file mode 100644 index 0000000000..73571bca7f --- /dev/null +++ b/reference/sscalf.f @@ -0,0 +1,43 @@ + subroutine sscalf(n,sa,sx,incx) +c +c scales a vector by a constant. +c uses unrolled loops for increment equal to 1. +c jack dongarra, linpack, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + real sa,sx(*) + integer i,incx,m,mp1,n,nincx +c + if( n.le.0 .or. incx.le.0 )return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + nincx = n*incx + do 10 i = 1,nincx,incx + sx(i) = sa*sx(i) + 10 continue + return +c +c code for increment equal to 1 +c +c +c clean-up loop +c + 20 m = mod(n,5) + if( m .eq. 0 ) go to 40 + do 30 i = 1,m + sx(i) = sa*sx(i) + 30 continue + if( n .lt. 5 ) return + 40 mp1 = m + 1 + do 50 i = mp1,n,5 + sx(i) = sa*sx(i) + sx(i + 1) = sa*sx(i + 1) + sx(i + 2) = sa*sx(i + 2) + sx(i + 3) = sa*sx(i + 3) + sx(i + 4) = sa*sx(i + 4) + 50 continue + return + end diff --git a/reference/sspmvf.f b/reference/sspmvf.f new file mode 100644 index 0000000000..70740ae448 --- /dev/null +++ b/reference/sspmvf.f @@ -0,0 +1,262 @@ + SUBROUTINE SSPMVF( UPLO, N, ALPHA, AP, X, INCX, BETA, Y, INCY ) +* .. Scalar Arguments .. + REAL ALPHA, BETA + INTEGER INCX, INCY, N + CHARACTER*1 UPLO +* .. Array Arguments .. 
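The band layout described in the SSBMVF header is easiest to see on a tridiagonal case: with UPLO = 'U' and K = 1, row K+1 of the two-row array holds the diagonal and row K the superdiagonal. The driver below is an illustrative sketch only (names and data are mine), following the packing convention quoted above:

      PROGRAM TSBMV
      INTEGER N, K
      PARAMETER ( N = 4, K = 1 )
      REAL AB( K+1, N ), X( N ), Y( N )
      INTEGER J
*     Tridiagonal matrix with 2 on the diagonal and 1 off it,
*     packed in upper band storage exactly as described above:
*     row K+1 holds the diagonal, row K the superdiagonal.
      DO 10 J = 1, N
         AB( 2, J ) = 2.0E0
         AB( 1, J ) = 1.0E0
         X( J ) = 1.0E0
   10 CONTINUE
      CALL SSBMVF( 'U', N, K, 1.0E0, AB, K+1, X, 1, 0.0E0, Y, 1 )
*     Y is now (3, 4, 4, 3).
      WRITE( *, * ) Y
      END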
+ REAL AP( * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* SSPMV performs the matrix-vector operation +* +* y := alpha*A*x + beta*y, +* +* where alpha and beta are scalars, x and y are n element vectors and +* A is an n by n symmetric matrix, supplied in packed form. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the matrix A is supplied in the packed +* array AP as follows: +* +* UPLO = 'U' or 'u' The upper triangular part of A is +* supplied in AP. +* +* UPLO = 'L' or 'l' The lower triangular part of A is +* supplied in AP. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - REAL . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* AP - REAL array of DIMENSION at least +* ( ( n*( n + 1 ) )/2 ). +* Before entry with UPLO = 'U' or 'u', the array AP must +* contain the upper triangular part of the symmetric matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) +* and a( 2, 2 ) respectively, and so on. +* Before entry with UPLO = 'L' or 'l', the array AP must +* contain the lower triangular part of the symmetric matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) +* and a( 3, 1 ) respectively, and so on. +* Unchanged on exit. +* +* X - REAL array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA - REAL . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then Y need not be set on input. +* Unchanged on exit. +* +* Y - REAL array of dimension at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. On exit, Y is overwritten by the updated +* vector y. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + REAL ONE , ZERO + PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) +* .. Local Scalars .. + REAL TEMP1, TEMP2 + INTEGER I, INFO, IX, IY, J, JX, JY, K, KK, KX, KY +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 6 + ELSE IF( INCY.EQ.0 )THEN + INFO = 9 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'SSPMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* Set up the start points in X and Y. +* + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( N - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( N - 1 )*INCY + END IF +* +* Start the operations. 
In this version the elements of the array AP +* are accessed sequentially with one pass through AP. +* +* First form y := beta*y. +* + IF( BETA.NE.ONE )THEN + IF( INCY.EQ.1 )THEN + IF( BETA.EQ.ZERO )THEN + DO 10, I = 1, N + Y( I ) = ZERO + 10 CONTINUE + ELSE + DO 20, I = 1, N + Y( I ) = BETA*Y( I ) + 20 CONTINUE + END IF + ELSE + IY = KY + IF( BETA.EQ.ZERO )THEN + DO 30, I = 1, N + Y( IY ) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40, I = 1, N + Y( IY ) = BETA*Y( IY ) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF( ALPHA.EQ.ZERO ) + $ RETURN + KK = 1 + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form y when AP contains the upper triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 60, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + K = KK + DO 50, I = 1, J - 1 + Y( I ) = Y( I ) + TEMP1*AP( K ) + TEMP2 = TEMP2 + AP( K )*X( I ) + K = K + 1 + 50 CONTINUE + Y( J ) = Y( J ) + TEMP1*AP( KK + J - 1 ) + ALPHA*TEMP2 + KK = KK + J + 60 CONTINUE + ELSE + JX = KX + JY = KY + DO 80, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + IX = KX + IY = KY + DO 70, K = KK, KK + J - 2 + Y( IY ) = Y( IY ) + TEMP1*AP( K ) + TEMP2 = TEMP2 + AP( K )*X( IX ) + IX = IX + INCX + IY = IY + INCY + 70 CONTINUE + Y( JY ) = Y( JY ) + TEMP1*AP( KK + J - 1 ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + KK = KK + J + 80 CONTINUE + END IF + ELSE +* +* Form y when AP contains the lower triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 100, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + Y( J ) = Y( J ) + TEMP1*AP( KK ) + K = KK + 1 + DO 90, I = J + 1, N + Y( I ) = Y( I ) + TEMP1*AP( K ) + TEMP2 = TEMP2 + AP( K )*X( I ) + K = K + 1 + 90 CONTINUE + Y( J ) = Y( J ) + ALPHA*TEMP2 + KK = KK + ( N - J + 1 ) + 100 CONTINUE + ELSE + JX = KX + JY = KY + DO 120, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + Y( JY ) = Y( JY ) + TEMP1*AP( KK ) + IX = JX + IY = JY + DO 110, K = KK + 1, KK + N - J + IX = IX + INCX + IY = IY + INCY + Y( IY ) = Y( IY ) + TEMP1*AP( K ) + TEMP2 = TEMP2 + AP( K )*X( IX ) + 110 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + KK = KK + ( N - J + 1 ) + 120 CONTINUE + END IF + END IF +* + RETURN +* +* End of SSPMV . +* + END diff --git a/reference/sspr2f.f b/reference/sspr2f.f new file mode 100644 index 0000000000..fd9b0e4825 --- /dev/null +++ b/reference/sspr2f.f @@ -0,0 +1,229 @@ + SUBROUTINE SSPR2F( UPLO, N, ALPHA, X, INCX, Y, INCY, AP ) +* .. Scalar Arguments .. + REAL ALPHA + INTEGER INCX, INCY, N + CHARACTER*1 UPLO +* .. Array Arguments .. + REAL AP( * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* SSPR2 performs the symmetric rank 2 operation +* +* A := alpha*x*y' + alpha*y*x' + A, +* +* where alpha is a scalar, x and y are n element vectors and A is an +* n by n symmetric matrix, supplied in packed form. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the matrix A is supplied in the packed +* array AP as follows: +* +* UPLO = 'U' or 'u' The upper triangular part of A is +* supplied in AP. +* +* UPLO = 'L' or 'l' The lower triangular part of A is +* supplied in AP. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - REAL . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X - REAL array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). 
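Packed storage, as used by SSPMVF above, lays the chosen triangle out column by column in a linear array of length n(n+1)/2, so a 3-by-3 symmetric matrix needs six entries. A small sketch with made-up data, assuming the reference routine is linked in:

      PROGRAM TSPMV
      REAL AP( 6 ), X( 3 ), Y( 3 )
      INTEGER I
*     Upper triangle of the symmetric matrix
*         [ 1 2 4 ]
*         [ 2 3 5 ]
*         [ 4 5 6 ]
*     packed column by column: AP = (1, 2, 3, 4, 5, 6).
      DO 10 I = 1, 6
         AP( I ) = REAL( I )
   10 CONTINUE
      X( 1 ) = 1.0E0
      X( 2 ) = 1.0E0
      X( 3 ) = 1.0E0
      CALL SSPMVF( 'U', 3, 1.0E0, AP, X, 1, 0.0E0, Y, 1 )
*     Y is now (7, 10, 15).
      WRITE( *, * ) Y
      END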
+* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* Y - REAL array of dimension at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. +* Unchanged on exit. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* AP - REAL array of DIMENSION at least +* ( ( n*( n + 1 ) )/2 ). +* Before entry with UPLO = 'U' or 'u', the array AP must +* contain the upper triangular part of the symmetric matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) +* and a( 2, 2 ) respectively, and so on. On exit, the array +* AP is overwritten by the upper triangular part of the +* updated matrix. +* Before entry with UPLO = 'L' or 'l', the array AP must +* contain the lower triangular part of the symmetric matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) +* and a( 3, 1 ) respectively, and so on. On exit, the array +* AP is overwritten by the lower triangular part of the +* updated matrix. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + REAL ZERO + PARAMETER ( ZERO = 0.0E+0 ) +* .. Local Scalars .. + REAL TEMP1, TEMP2 + INTEGER I, INFO, IX, IY, J, JX, JY, K, KK, KX, KY +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 5 + ELSE IF( INCY.EQ.0 )THEN + INFO = 7 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'SSPR2 ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) + $ RETURN +* +* Set up the start points in X and Y if the increments are not both +* unity. +* + IF( ( INCX.NE.1 ).OR.( INCY.NE.1 ) )THEN + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( N - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( N - 1 )*INCY + END IF + JX = KX + JY = KY + END IF +* +* Start the operations. In this version the elements of the array AP +* are accessed sequentially with one pass through AP. +* + KK = 1 + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form A when upper triangle is stored in AP. 
+* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 20, J = 1, N + IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( J ) + TEMP2 = ALPHA*X( J ) + K = KK + DO 10, I = 1, J + AP( K ) = AP( K ) + X( I )*TEMP1 + Y( I )*TEMP2 + K = K + 1 + 10 CONTINUE + END IF + KK = KK + J + 20 CONTINUE + ELSE + DO 40, J = 1, N + IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( JY ) + TEMP2 = ALPHA*X( JX ) + IX = KX + IY = KY + DO 30, K = KK, KK + J - 1 + AP( K ) = AP( K ) + X( IX )*TEMP1 + Y( IY )*TEMP2 + IX = IX + INCX + IY = IY + INCY + 30 CONTINUE + END IF + JX = JX + INCX + JY = JY + INCY + KK = KK + J + 40 CONTINUE + END IF + ELSE +* +* Form A when lower triangle is stored in AP. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 60, J = 1, N + IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( J ) + TEMP2 = ALPHA*X( J ) + K = KK + DO 50, I = J, N + AP( K ) = AP( K ) + X( I )*TEMP1 + Y( I )*TEMP2 + K = K + 1 + 50 CONTINUE + END IF + KK = KK + N - J + 1 + 60 CONTINUE + ELSE + DO 80, J = 1, N + IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( JY ) + TEMP2 = ALPHA*X( JX ) + IX = JX + IY = JY + DO 70, K = KK, KK + N - J + AP( K ) = AP( K ) + X( IX )*TEMP1 + Y( IY )*TEMP2 + IX = IX + INCX + IY = IY + INCY + 70 CONTINUE + END IF + JX = JX + INCX + JY = JY + INCY + KK = KK + N - J + 1 + 80 CONTINUE + END IF + END IF +* + RETURN +* +* End of SSPR2 . +* + END diff --git a/reference/ssprf.f b/reference/ssprf.f new file mode 100644 index 0000000000..cdf352b232 --- /dev/null +++ b/reference/ssprf.f @@ -0,0 +1,198 @@ + SUBROUTINE SSPRF ( UPLO, N, ALPHA, X, INCX, AP ) +* .. Scalar Arguments .. + REAL ALPHA + INTEGER INCX, N + CHARACTER*1 UPLO +* .. Array Arguments .. + REAL AP( * ), X( * ) +* .. +* +* Purpose +* ======= +* +* SSPR performs the symmetric rank 1 operation +* +* A := alpha*x*x' + A, +* +* where alpha is a real scalar, x is an n element vector and A is an +* n by n symmetric matrix, supplied in packed form. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the matrix A is supplied in the packed +* array AP as follows: +* +* UPLO = 'U' or 'u' The upper triangular part of A is +* supplied in AP. +* +* UPLO = 'L' or 'l' The lower triangular part of A is +* supplied in AP. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - REAL . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X - REAL array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* AP - REAL array of DIMENSION at least +* ( ( n*( n + 1 ) )/2 ). +* Before entry with UPLO = 'U' or 'u', the array AP must +* contain the upper triangular part of the symmetric matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) +* and a( 2, 2 ) respectively, and so on. On exit, the array +* AP is overwritten by the upper triangular part of the +* updated matrix. 
+* Before entry with UPLO = 'L' or 'l', the array AP must +* contain the lower triangular part of the symmetric matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) +* and a( 3, 1 ) respectively, and so on. On exit, the array +* AP is overwritten by the lower triangular part of the +* updated matrix. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + REAL ZERO + PARAMETER ( ZERO = 0.0E+0 ) +* .. Local Scalars .. + REAL TEMP + INTEGER I, INFO, IX, J, JX, K, KK, KX +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 5 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'SSPR ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) + $ RETURN +* +* Set the start point in X if the increment is not unity. +* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of the array AP +* are accessed sequentially with one pass through AP. +* + KK = 1 + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form A when upper triangle is stored in AP. +* + IF( INCX.EQ.1 )THEN + DO 20, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = ALPHA*X( J ) + K = KK + DO 10, I = 1, J + AP( K ) = AP( K ) + X( I )*TEMP + K = K + 1 + 10 CONTINUE + END IF + KK = KK + J + 20 CONTINUE + ELSE + JX = KX + DO 40, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*X( JX ) + IX = KX + DO 30, K = KK, KK + J - 1 + AP( K ) = AP( K ) + X( IX )*TEMP + IX = IX + INCX + 30 CONTINUE + END IF + JX = JX + INCX + KK = KK + J + 40 CONTINUE + END IF + ELSE +* +* Form A when lower triangle is stored in AP. +* + IF( INCX.EQ.1 )THEN + DO 60, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = ALPHA*X( J ) + K = KK + DO 50, I = J, N + AP( K ) = AP( K ) + X( I )*TEMP + K = K + 1 + 50 CONTINUE + END IF + KK = KK + N - J + 1 + 60 CONTINUE + ELSE + JX = KX + DO 80, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*X( JX ) + IX = JX + DO 70, K = KK, KK + N - J + AP( K ) = AP( K ) + X( IX )*TEMP + IX = IX + INCX + 70 CONTINUE + END IF + JX = JX + INCX + KK = KK + N - J + 1 + 80 CONTINUE + END IF + END IF +* + RETURN +* +* End of SSPR . +* + END diff --git a/reference/sswapf.f b/reference/sswapf.f new file mode 100644 index 0000000000..d7368960db --- /dev/null +++ b/reference/sswapf.f @@ -0,0 +1,56 @@ + subroutine sswapf (n,sx,incx,sy,incy) +c +c interchanges two vectors. +c uses unrolled loops for increments equal to 1. +c jack dongarra, linpack, 3/11/78. 
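SSPRF and SSPR2F update that same packed triangle in place with a rank-1 term alpha*x*x' or a rank-2 term alpha*x*y' + alpha*y*x'. A rank-1 sketch, illustrative only, starting from the packed 2-by-2 identity:

      PROGRAM TSPR
      REAL AP( 3 ), X( 2 )
*     Packed upper triangle of the 2x2 identity: AP = (1, 0, 1).
      AP( 1 ) = 1.0E0
      AP( 2 ) = 0.0E0
      AP( 3 ) = 1.0E0
      X( 1 ) = 1.0E0
      X( 2 ) = 2.0E0
*     A := 1*x*x' + A, so AP becomes (2, 2, 5).
      CALL SSPRF( 'U', 2, 1.0E0, X, 1, AP )
      WRITE( *, * ) AP
      END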
+c modified 12/3/93, array(1) declarations changed to array(*) +c + real sx(*),sy(*),stemp + integer i,incx,incy,ix,iy,m,mp1,n +c + if(n.le.0)return + if(incx.eq.1.and.incy.eq.1)go to 20 +c +c code for unequal increments or equal increments not equal +c to 1 +c + ix = 1 + iy = 1 + if(incx.lt.0)ix = (-n+1)*incx + 1 + if(incy.lt.0)iy = (-n+1)*incy + 1 + do 10 i = 1,n + stemp = sx(ix) + sx(ix) = sy(iy) + sy(iy) = stemp + ix = ix + incx + iy = iy + incy + 10 continue + return +c +c code for both increments equal to 1 +c +c +c clean-up loop +c + 20 m = mod(n,3) + if( m .eq. 0 ) go to 40 + do 30 i = 1,m + stemp = sx(i) + sx(i) = sy(i) + sy(i) = stemp + 30 continue + if( n .lt. 3 ) return + 40 mp1 = m + 1 + do 50 i = mp1,n,3 + stemp = sx(i) + sx(i) = sy(i) + sy(i) = stemp + stemp = sx(i + 1) + sx(i + 1) = sy(i + 1) + sy(i + 1) = stemp + stemp = sx(i + 2) + sx(i + 2) = sy(i + 2) + sy(i + 2) = stemp + 50 continue + return + end diff --git a/reference/ssymmf.f b/reference/ssymmf.f new file mode 100644 index 0000000000..5b08824edd --- /dev/null +++ b/reference/ssymmf.f @@ -0,0 +1,294 @@ + SUBROUTINE SSYMMF ( SIDE, UPLO, M, N, ALPHA, A, LDA, B, LDB, + $ BETA, C, LDC ) +* .. Scalar Arguments .. + CHARACTER*1 SIDE, UPLO + INTEGER M, N, LDA, LDB, LDC + REAL ALPHA, BETA +* .. Array Arguments .. + REAL A( LDA, * ), B( LDB, * ), C( LDC, * ) +* .. +* +* Purpose +* ======= +* +* SSYMM performs one of the matrix-matrix operations +* +* C := alpha*A*B + beta*C, +* +* or +* +* C := alpha*B*A + beta*C, +* +* where alpha and beta are scalars, A is a symmetric matrix and B and +* C are m by n matrices. +* +* Parameters +* ========== +* +* SIDE - CHARACTER*1. +* On entry, SIDE specifies whether the symmetric matrix A +* appears on the left or right in the operation as follows: +* +* SIDE = 'L' or 'l' C := alpha*A*B + beta*C, +* +* SIDE = 'R' or 'r' C := alpha*B*A + beta*C, +* +* Unchanged on exit. +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the symmetric matrix A is to be +* referenced as follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of the +* symmetric matrix is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of the +* symmetric matrix is to be referenced. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix C. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix C. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - REAL . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - REAL array of DIMENSION ( LDA, ka ), where ka is +* m when SIDE = 'L' or 'l' and is n otherwise. +* Before entry with SIDE = 'L' or 'l', the m by m part of +* the array A must contain the symmetric matrix, such that +* when UPLO = 'U' or 'u', the leading m by m upper triangular +* part of the array A must contain the upper triangular part +* of the symmetric matrix and the strictly lower triangular +* part of A is not referenced, and when UPLO = 'L' or 'l', +* the leading m by m lower triangular part of the array A +* must contain the lower triangular part of the symmetric +* matrix and the strictly upper triangular part of A is not +* referenced. 
+* Before entry with SIDE = 'R' or 'r', the n by n part of +* the array A must contain the symmetric matrix, such that +* when UPLO = 'U' or 'u', the leading n by n upper triangular +* part of the array A must contain the upper triangular part +* of the symmetric matrix and the strictly lower triangular +* part of A is not referenced, and when UPLO = 'L' or 'l', +* the leading n by n lower triangular part of the array A +* must contain the lower triangular part of the symmetric +* matrix and the strictly upper triangular part of A is not +* referenced. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When SIDE = 'L' or 'l' then +* LDA must be at least max( 1, m ), otherwise LDA must be at +* least max( 1, n ). +* Unchanged on exit. +* +* B - REAL array of DIMENSION ( LDB, n ). +* Before entry, the leading m by n part of the array B must +* contain the matrix B. +* Unchanged on exit. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. LDB must be at least +* max( 1, m ). +* Unchanged on exit. +* +* BETA - REAL . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then C need not be set on input. +* Unchanged on exit. +* +* C - REAL array of DIMENSION ( LDC, n ). +* Before entry, the leading m by n part of the array C must +* contain the matrix C, except when beta is zero, in which +* case C need not be set on entry. +* On exit, the array C is overwritten by the m by n updated +* matrix. +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, INFO, J, K, NROWA + REAL TEMP1, TEMP2 +* .. Parameters .. + REAL ONE , ZERO + PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) +* .. +* .. Executable Statements .. +* +* Set NROWA as the number of rows of A. +* + IF( LSAME( SIDE, 'L' ) )THEN + NROWA = M + ELSE + NROWA = N + END IF + UPPER = LSAME( UPLO, 'U' ) +* +* Test the input parameters. +* + INFO = 0 + IF( ( .NOT.LSAME( SIDE, 'L' ) ).AND. + $ ( .NOT.LSAME( SIDE, 'R' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO, 'L' ) ) )THEN + INFO = 2 + ELSE IF( M .LT.0 )THEN + INFO = 3 + ELSE IF( N .LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 7 + ELSE IF( LDB.LT.MAX( 1, M ) )THEN + INFO = 9 + ELSE IF( LDC.LT.MAX( 1, M ) )THEN + INFO = 12 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'SSYMM ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. + $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* And when alpha.eq.zero. +* + IF( ALPHA.EQ.ZERO )THEN + IF( BETA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, M + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40, J = 1, N + DO 30, I = 1, M + C( I, J ) = BETA*C( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + RETURN + END IF +* +* Start the operations. 
+* + IF( LSAME( SIDE, 'L' ) )THEN +* +* Form C := alpha*A*B + beta*C. +* + IF( UPPER )THEN + DO 70, J = 1, N + DO 60, I = 1, M + TEMP1 = ALPHA*B( I, J ) + TEMP2 = ZERO + DO 50, K = 1, I - 1 + C( K, J ) = C( K, J ) + TEMP1 *A( K, I ) + TEMP2 = TEMP2 + B( K, J )*A( K, I ) + 50 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = TEMP1*A( I, I ) + ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ TEMP1*A( I, I ) + ALPHA*TEMP2 + END IF + 60 CONTINUE + 70 CONTINUE + ELSE + DO 100, J = 1, N + DO 90, I = M, 1, -1 + TEMP1 = ALPHA*B( I, J ) + TEMP2 = ZERO + DO 80, K = I + 1, M + C( K, J ) = C( K, J ) + TEMP1 *A( K, I ) + TEMP2 = TEMP2 + B( K, J )*A( K, I ) + 80 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = TEMP1*A( I, I ) + ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ TEMP1*A( I, I ) + ALPHA*TEMP2 + END IF + 90 CONTINUE + 100 CONTINUE + END IF + ELSE +* +* Form C := alpha*B*A + beta*C. +* + DO 170, J = 1, N + TEMP1 = ALPHA*A( J, J ) + IF( BETA.EQ.ZERO )THEN + DO 110, I = 1, M + C( I, J ) = TEMP1*B( I, J ) + 110 CONTINUE + ELSE + DO 120, I = 1, M + C( I, J ) = BETA*C( I, J ) + TEMP1*B( I, J ) + 120 CONTINUE + END IF + DO 140, K = 1, J - 1 + IF( UPPER )THEN + TEMP1 = ALPHA*A( K, J ) + ELSE + TEMP1 = ALPHA*A( J, K ) + END IF + DO 130, I = 1, M + C( I, J ) = C( I, J ) + TEMP1*B( I, K ) + 130 CONTINUE + 140 CONTINUE + DO 160, K = J + 1, N + IF( UPPER )THEN + TEMP1 = ALPHA*A( J, K ) + ELSE + TEMP1 = ALPHA*A( K, J ) + END IF + DO 150, I = 1, M + C( I, J ) = C( I, J ) + TEMP1*B( I, K ) + 150 CONTINUE + 160 CONTINUE + 170 CONTINUE + END IF +* + RETURN +* +* End of SSYMM . +* + END diff --git a/reference/ssymvf.f b/reference/ssymvf.f new file mode 100644 index 0000000000..c1ebc35863 --- /dev/null +++ b/reference/ssymvf.f @@ -0,0 +1,262 @@ + SUBROUTINE SSYMVF ( UPLO, N, ALPHA, A, LDA, X, INCX, + $ BETA, Y, INCY ) +* .. Scalar Arguments .. + REAL ALPHA, BETA + INTEGER INCX, INCY, LDA, N + CHARACTER*1 UPLO +* .. Array Arguments .. + REAL A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* SSYMV performs the matrix-vector operation +* +* y := alpha*A*x + beta*y, +* +* where alpha and beta are scalars, x and y are n element vectors and +* A is an n by n symmetric matrix. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array A is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of A +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of A +* is to be referenced. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - REAL . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - REAL array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array A must contain the upper +* triangular part of the symmetric matrix and the strictly +* lower triangular part of A is not referenced. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array A must contain the lower +* triangular part of the symmetric matrix and the strictly +* upper triangular part of A is not referenced. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, n ). +* Unchanged on exit. 
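Because SSYMMF reads only the triangle of A named by UPLO, the other triangle can hold anything. The sketch below (driver and data are illustrative, not from the sources) multiplies a 2-by-2 symmetric matrix, stored only in its upper triangle, by a single column:

      PROGRAM TSYMM
      REAL A( 2, 2 ), B( 2, 1 ), C( 2, 1 )
*     Only the upper triangle of A is referenced with UPLO = 'U';
*     A represents [ 1 2 ; 2 3 ] even though A(2,1) is junk.
      A( 1, 1 ) = 1.0E0
      A( 1, 2 ) = 2.0E0
      A( 2, 1 ) = -9.9E9
      A( 2, 2 ) = 3.0E0
      B( 1, 1 ) = 1.0E0
      B( 2, 1 ) = 1.0E0
      CALL SSYMMF( 'L', 'U', 2, 1, 1.0E0, A, 2, B, 2, 0.0E0, C, 2 )
*     C = A*B = (3, 5).
      WRITE( *, * ) C
      END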
+* +* X - REAL array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA - REAL . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then Y need not be set on input. +* Unchanged on exit. +* +* Y - REAL array of dimension at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. On exit, Y is overwritten by the updated +* vector y. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + REAL ONE , ZERO + PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) +* .. Local Scalars .. + REAL TEMP1, TEMP2 + INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( LDA.LT.MAX( 1, N ) )THEN + INFO = 5 + ELSE IF( INCX.EQ.0 )THEN + INFO = 7 + ELSE IF( INCY.EQ.0 )THEN + INFO = 10 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'SSYMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* Set up the start points in X and Y. +* + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( N - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( N - 1 )*INCY + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through the triangular part +* of A. +* +* First form y := beta*y. +* + IF( BETA.NE.ONE )THEN + IF( INCY.EQ.1 )THEN + IF( BETA.EQ.ZERO )THEN + DO 10, I = 1, N + Y( I ) = ZERO + 10 CONTINUE + ELSE + DO 20, I = 1, N + Y( I ) = BETA*Y( I ) + 20 CONTINUE + END IF + ELSE + IY = KY + IF( BETA.EQ.ZERO )THEN + DO 30, I = 1, N + Y( IY ) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40, I = 1, N + Y( IY ) = BETA*Y( IY ) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF( ALPHA.EQ.ZERO ) + $ RETURN + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form y when A is stored in upper triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 60, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + DO 50, I = 1, J - 1 + Y( I ) = Y( I ) + TEMP1*A( I, J ) + TEMP2 = TEMP2 + A( I, J )*X( I ) + 50 CONTINUE + Y( J ) = Y( J ) + TEMP1*A( J, J ) + ALPHA*TEMP2 + 60 CONTINUE + ELSE + JX = KX + JY = KY + DO 80, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + IX = KX + IY = KY + DO 70, I = 1, J - 1 + Y( IY ) = Y( IY ) + TEMP1*A( I, J ) + TEMP2 = TEMP2 + A( I, J )*X( IX ) + IX = IX + INCX + IY = IY + INCY + 70 CONTINUE + Y( JY ) = Y( JY ) + TEMP1*A( J, J ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + 80 CONTINUE + END IF + ELSE +* +* Form y when A is stored in lower triangle. 
+* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 100, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + Y( J ) = Y( J ) + TEMP1*A( J, J ) + DO 90, I = J + 1, N + Y( I ) = Y( I ) + TEMP1*A( I, J ) + TEMP2 = TEMP2 + A( I, J )*X( I ) + 90 CONTINUE + Y( J ) = Y( J ) + ALPHA*TEMP2 + 100 CONTINUE + ELSE + JX = KX + JY = KY + DO 120, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + Y( JY ) = Y( JY ) + TEMP1*A( J, J ) + IX = JX + IY = JY + DO 110, I = J + 1, N + IX = IX + INCX + IY = IY + INCY + Y( IY ) = Y( IY ) + TEMP1*A( I, J ) + TEMP2 = TEMP2 + A( I, J )*X( IX ) + 110 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + 120 CONTINUE + END IF + END IF +* + RETURN +* +* End of SSYMV . +* + END diff --git a/reference/ssyr2f.f b/reference/ssyr2f.f new file mode 100644 index 0000000000..bd962e1c4b --- /dev/null +++ b/reference/ssyr2f.f @@ -0,0 +1,230 @@ + SUBROUTINE SSYR2F ( UPLO, N, ALPHA, X, INCX, Y, INCY, A, LDA ) +* .. Scalar Arguments .. + REAL ALPHA + INTEGER INCX, INCY, LDA, N + CHARACTER*1 UPLO +* .. Array Arguments .. + REAL A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* SSYR2 performs the symmetric rank 2 operation +* +* A := alpha*x*y' + alpha*y*x' + A, +* +* where alpha is a scalar, x and y are n element vectors and A is an n +* by n symmetric matrix. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array A is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of A +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of A +* is to be referenced. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - REAL . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X - REAL array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* Y - REAL array of dimension at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. +* Unchanged on exit. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* A - REAL array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array A must contain the upper +* triangular part of the symmetric matrix and the strictly +* lower triangular part of A is not referenced. On exit, the +* upper triangular part of the array A is overwritten by the +* upper triangular part of the updated matrix. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array A must contain the lower +* triangular part of the symmetric matrix and the strictly +* upper triangular part of A is not referenced. On exit, the +* lower triangular part of the array A is overwritten by the +* lower triangular part of the updated matrix. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, n ). +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. 
+* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + REAL ZERO + PARAMETER ( ZERO = 0.0E+0 ) +* .. Local Scalars .. + REAL TEMP1, TEMP2 + INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 5 + ELSE IF( INCY.EQ.0 )THEN + INFO = 7 + ELSE IF( LDA.LT.MAX( 1, N ) )THEN + INFO = 9 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'SSYR2 ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) + $ RETURN +* +* Set up the start points in X and Y if the increments are not both +* unity. +* + IF( ( INCX.NE.1 ).OR.( INCY.NE.1 ) )THEN + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( N - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( N - 1 )*INCY + END IF + JX = KX + JY = KY + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through the triangular part +* of A. +* + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form A when A is stored in the upper triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 20, J = 1, N + IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( J ) + TEMP2 = ALPHA*X( J ) + DO 10, I = 1, J + A( I, J ) = A( I, J ) + X( I )*TEMP1 + Y( I )*TEMP2 + 10 CONTINUE + END IF + 20 CONTINUE + ELSE + DO 40, J = 1, N + IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( JY ) + TEMP2 = ALPHA*X( JX ) + IX = KX + IY = KY + DO 30, I = 1, J + A( I, J ) = A( I, J ) + X( IX )*TEMP1 + $ + Y( IY )*TEMP2 + IX = IX + INCX + IY = IY + INCY + 30 CONTINUE + END IF + JX = JX + INCX + JY = JY + INCY + 40 CONTINUE + END IF + ELSE +* +* Form A when A is stored in the lower triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 60, J = 1, N + IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( J ) + TEMP2 = ALPHA*X( J ) + DO 50, I = J, N + A( I, J ) = A( I, J ) + X( I )*TEMP1 + Y( I )*TEMP2 + 50 CONTINUE + END IF + 60 CONTINUE + ELSE + DO 80, J = 1, N + IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( JY ) + TEMP2 = ALPHA*X( JX ) + IX = JX + IY = JY + DO 70, I = J, N + A( I, J ) = A( I, J ) + X( IX )*TEMP1 + $ + Y( IY )*TEMP2 + IX = IX + INCX + IY = IY + INCY + 70 CONTINUE + END IF + JX = JX + INCX + JY = JY + INCY + 80 CONTINUE + END IF + END IF +* + RETURN +* +* End of SSYR2 . +* + END diff --git a/reference/ssyr2kf.f b/reference/ssyr2kf.f new file mode 100644 index 0000000000..bc214ca4ab --- /dev/null +++ b/reference/ssyr2kf.f @@ -0,0 +1,327 @@ + SUBROUTINE SSYR2KF( UPLO, TRANS, N, K, ALPHA, A, LDA, B, LDB, + $ BETA, C, LDC ) +* .. Scalar Arguments .. + CHARACTER*1 UPLO, TRANS + INTEGER N, K, LDA, LDB, LDC + REAL ALPHA, BETA +* .. Array Arguments .. + REAL A( LDA, * ), B( LDB, * ), C( LDC, * ) +* .. 
+* +* Purpose +* ======= +* +* SSYR2K performs one of the symmetric rank 2k operations +* +* C := alpha*A*B' + alpha*B*A' + beta*C, +* +* or +* +* C := alpha*A'*B + alpha*B'*A + beta*C, +* +* where alpha and beta are scalars, C is an n by n symmetric matrix +* and A and B are n by k matrices in the first case and k by n +* matrices in the second case. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array C is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of C +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of C +* is to be referenced. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' C := alpha*A*B' + alpha*B*A' + +* beta*C. +* +* TRANS = 'T' or 't' C := alpha*A'*B + alpha*B'*A + +* beta*C. +* +* TRANS = 'C' or 'c' C := alpha*A'*B + alpha*B'*A + +* beta*C. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix C. N must be +* at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry with TRANS = 'N' or 'n', K specifies the number +* of columns of the matrices A and B, and on entry with +* TRANS = 'T' or 't' or 'C' or 'c', K specifies the number +* of rows of the matrices A and B. K must be at least zero. +* Unchanged on exit. +* +* ALPHA - REAL . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - REAL array of DIMENSION ( LDA, ka ), where ka is +* k when TRANS = 'N' or 'n', and is n otherwise. +* Before entry with TRANS = 'N' or 'n', the leading n by k +* part of the array A must contain the matrix A, otherwise +* the leading k by n part of the array A must contain the +* matrix A. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When TRANS = 'N' or 'n' +* then LDA must be at least max( 1, n ), otherwise LDA must +* be at least max( 1, k ). +* Unchanged on exit. +* +* B - REAL array of DIMENSION ( LDB, kb ), where kb is +* k when TRANS = 'N' or 'n', and is n otherwise. +* Before entry with TRANS = 'N' or 'n', the leading n by k +* part of the array B must contain the matrix B, otherwise +* the leading k by n part of the array B must contain the +* matrix B. +* Unchanged on exit. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. When TRANS = 'N' or 'n' +* then LDB must be at least max( 1, n ), otherwise LDB must +* be at least max( 1, k ). +* Unchanged on exit. +* +* BETA - REAL . +* On entry, BETA specifies the scalar beta. +* Unchanged on exit. +* +* C - REAL array of DIMENSION ( LDC, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array C must contain the upper +* triangular part of the symmetric matrix and the strictly +* lower triangular part of C is not referenced. On exit, the +* upper triangular part of the array C is overwritten by the +* upper triangular part of the updated matrix. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array C must contain the lower +* triangular part of the symmetric matrix and the strictly +* upper triangular part of C is not referenced. On exit, the +* lower triangular part of the array C is overwritten by the +* lower triangular part of the updated matrix. 
+* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, n ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, INFO, J, L, NROWA + REAL TEMP1, TEMP2 +* .. Parameters .. + REAL ONE , ZERO + PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + IF( LSAME( TRANS, 'N' ) )THEN + NROWA = N + ELSE + NROWA = K + END IF + UPPER = LSAME( UPLO, 'U' ) +* + INFO = 0 + IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO , 'L' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.LSAME( TRANS, 'N' ) ).AND. + $ ( .NOT.LSAME( TRANS, 'T' ) ).AND. + $ ( .NOT.LSAME( TRANS, 'C' ) ) )THEN + INFO = 2 + ELSE IF( N .LT.0 )THEN + INFO = 3 + ELSE IF( K .LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 7 + ELSE IF( LDB.LT.MAX( 1, NROWA ) )THEN + INFO = 9 + ELSE IF( LDC.LT.MAX( 1, N ) )THEN + INFO = 12 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'SSYR2K', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR. + $ ( ( ( ALPHA.EQ.ZERO ).OR.( K.EQ.0 ) ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* And when alpha.eq.zero. +* + IF( ALPHA.EQ.ZERO )THEN + IF( UPPER )THEN + IF( BETA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, J + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40, J = 1, N + DO 30, I = 1, J + C( I, J ) = BETA*C( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + ELSE + IF( BETA.EQ.ZERO )THEN + DO 60, J = 1, N + DO 50, I = J, N + C( I, J ) = ZERO + 50 CONTINUE + 60 CONTINUE + ELSE + DO 80, J = 1, N + DO 70, I = J, N + C( I, J ) = BETA*C( I, J ) + 70 CONTINUE + 80 CONTINUE + END IF + END IF + RETURN + END IF +* +* Start the operations. +* + IF( LSAME( TRANS, 'N' ) )THEN +* +* Form C := alpha*A*B' + alpha*B*A' + C. +* + IF( UPPER )THEN + DO 130, J = 1, N + IF( BETA.EQ.ZERO )THEN + DO 90, I = 1, J + C( I, J ) = ZERO + 90 CONTINUE + ELSE IF( BETA.NE.ONE )THEN + DO 100, I = 1, J + C( I, J ) = BETA*C( I, J ) + 100 CONTINUE + END IF + DO 120, L = 1, K + IF( ( A( J, L ).NE.ZERO ).OR. + $ ( B( J, L ).NE.ZERO ) )THEN + TEMP1 = ALPHA*B( J, L ) + TEMP2 = ALPHA*A( J, L ) + DO 110, I = 1, J + C( I, J ) = C( I, J ) + + $ A( I, L )*TEMP1 + B( I, L )*TEMP2 + 110 CONTINUE + END IF + 120 CONTINUE + 130 CONTINUE + ELSE + DO 180, J = 1, N + IF( BETA.EQ.ZERO )THEN + DO 140, I = J, N + C( I, J ) = ZERO + 140 CONTINUE + ELSE IF( BETA.NE.ONE )THEN + DO 150, I = J, N + C( I, J ) = BETA*C( I, J ) + 150 CONTINUE + END IF + DO 170, L = 1, K + IF( ( A( J, L ).NE.ZERO ).OR. + $ ( B( J, L ).NE.ZERO ) )THEN + TEMP1 = ALPHA*B( J, L ) + TEMP2 = ALPHA*A( J, L ) + DO 160, I = J, N + C( I, J ) = C( I, J ) + + $ A( I, L )*TEMP1 + B( I, L )*TEMP2 + 160 CONTINUE + END IF + 170 CONTINUE + 180 CONTINUE + END IF + ELSE +* +* Form C := alpha*A'*B + alpha*B'*A + C. 
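+*        TEMP1 and TEMP2 accumulate the ( i, j ) elements of A'*B and
+*        B'*A as dot products over the k rows of A and B.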
+* + IF( UPPER )THEN + DO 210, J = 1, N + DO 200, I = 1, J + TEMP1 = ZERO + TEMP2 = ZERO + DO 190, L = 1, K + TEMP1 = TEMP1 + A( L, I )*B( L, J ) + TEMP2 = TEMP2 + B( L, I )*A( L, J ) + 190 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = ALPHA*TEMP1 + ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ ALPHA*TEMP1 + ALPHA*TEMP2 + END IF + 200 CONTINUE + 210 CONTINUE + ELSE + DO 240, J = 1, N + DO 230, I = J, N + TEMP1 = ZERO + TEMP2 = ZERO + DO 220, L = 1, K + TEMP1 = TEMP1 + A( L, I )*B( L, J ) + TEMP2 = TEMP2 + B( L, I )*A( L, J ) + 220 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = ALPHA*TEMP1 + ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ ALPHA*TEMP1 + ALPHA*TEMP2 + END IF + 230 CONTINUE + 240 CONTINUE + END IF + END IF +* + RETURN +* +* End of SSYR2K. +* + END diff --git a/reference/ssyrf.f b/reference/ssyrf.f new file mode 100644 index 0000000000..9877f563ca --- /dev/null +++ b/reference/ssyrf.f @@ -0,0 +1,197 @@ + SUBROUTINE SSYRF ( UPLO, N, ALPHA, X, INCX, A, LDA ) +* .. Scalar Arguments .. + REAL ALPHA + INTEGER INCX, LDA, N + CHARACTER*1 UPLO +* .. Array Arguments .. + REAL A( LDA, * ), X( * ) +* .. +* +* Purpose +* ======= +* +* SSYR performs the symmetric rank 1 operation +* +* A := alpha*x*x' + A, +* +* where alpha is a real scalar, x is an n element vector and A is an +* n by n symmetric matrix. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array A is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of A +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of A +* is to be referenced. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - REAL . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X - REAL array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* A - REAL array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array A must contain the upper +* triangular part of the symmetric matrix and the strictly +* lower triangular part of A is not referenced. On exit, the +* upper triangular part of the array A is overwritten by the +* upper triangular part of the updated matrix. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array A must contain the lower +* triangular part of the symmetric matrix and the strictly +* upper triangular part of A is not referenced. On exit, the +* lower triangular part of the array A is overwritten by the +* lower triangular part of the updated matrix. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, n ). +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + REAL ZERO + PARAMETER ( ZERO = 0.0E+0 ) +* .. Local Scalars .. + REAL TEMP + INTEGER I, INFO, IX, J, JX, KX +* .. 
External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 5 + ELSE IF( LDA.LT.MAX( 1, N ) )THEN + INFO = 7 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'SSYR ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) + $ RETURN +* +* Set the start point in X if the increment is not unity. +* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through the triangular part +* of A. +* + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form A when A is stored in upper triangle. +* + IF( INCX.EQ.1 )THEN + DO 20, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = ALPHA*X( J ) + DO 10, I = 1, J + A( I, J ) = A( I, J ) + X( I )*TEMP + 10 CONTINUE + END IF + 20 CONTINUE + ELSE + JX = KX + DO 40, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*X( JX ) + IX = KX + DO 30, I = 1, J + A( I, J ) = A( I, J ) + X( IX )*TEMP + IX = IX + INCX + 30 CONTINUE + END IF + JX = JX + INCX + 40 CONTINUE + END IF + ELSE +* +* Form A when A is stored in lower triangle. +* + IF( INCX.EQ.1 )THEN + DO 60, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = ALPHA*X( J ) + DO 50, I = J, N + A( I, J ) = A( I, J ) + X( I )*TEMP + 50 CONTINUE + END IF + 60 CONTINUE + ELSE + JX = KX + DO 80, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*X( JX ) + IX = JX + DO 70, I = J, N + A( I, J ) = A( I, J ) + X( IX )*TEMP + IX = IX + INCX + 70 CONTINUE + END IF + JX = JX + INCX + 80 CONTINUE + END IF + END IF +* + RETURN +* +* End of SSYR . +* + END diff --git a/reference/ssyrkf.f b/reference/ssyrkf.f new file mode 100644 index 0000000000..26b250981b --- /dev/null +++ b/reference/ssyrkf.f @@ -0,0 +1,294 @@ + SUBROUTINE SSYRKF ( UPLO, TRANS, N, K, ALPHA, A, LDA, + $ BETA, C, LDC ) +* .. Scalar Arguments .. + CHARACTER*1 UPLO, TRANS + INTEGER N, K, LDA, LDC + REAL ALPHA, BETA +* .. Array Arguments .. + REAL A( LDA, * ), C( LDC, * ) +* .. +* +* Purpose +* ======= +* +* SSYRK performs one of the symmetric rank k operations +* +* C := alpha*A*A' + beta*C, +* +* or +* +* C := alpha*A'*A + beta*C, +* +* where alpha and beta are scalars, C is an n by n symmetric matrix +* and A is an n by k matrix in the first case and a k by n matrix +* in the second case. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array C is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of C +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of C +* is to be referenced. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' C := alpha*A*A' + beta*C. +* +* TRANS = 'T' or 't' C := alpha*A'*A + beta*C. +* +* TRANS = 'C' or 'c' C := alpha*A'*A + beta*C. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix C. N must be +* at least zero. +* Unchanged on exit. +* +* K - INTEGER. 
+* On entry with TRANS = 'N' or 'n', K specifies the number +* of columns of the matrix A, and on entry with +* TRANS = 'T' or 't' or 'C' or 'c', K specifies the number +* of rows of the matrix A. K must be at least zero. +* Unchanged on exit. +* +* ALPHA - REAL . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - REAL array of DIMENSION ( LDA, ka ), where ka is +* k when TRANS = 'N' or 'n', and is n otherwise. +* Before entry with TRANS = 'N' or 'n', the leading n by k +* part of the array A must contain the matrix A, otherwise +* the leading k by n part of the array A must contain the +* matrix A. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When TRANS = 'N' or 'n' +* then LDA must be at least max( 1, n ), otherwise LDA must +* be at least max( 1, k ). +* Unchanged on exit. +* +* BETA - REAL . +* On entry, BETA specifies the scalar beta. +* Unchanged on exit. +* +* C - REAL array of DIMENSION ( LDC, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array C must contain the upper +* triangular part of the symmetric matrix and the strictly +* lower triangular part of C is not referenced. On exit, the +* upper triangular part of the array C is overwritten by the +* upper triangular part of the updated matrix. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array C must contain the lower +* triangular part of the symmetric matrix and the strictly +* upper triangular part of C is not referenced. On exit, the +* lower triangular part of the array C is overwritten by the +* lower triangular part of the updated matrix. +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, n ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, INFO, J, L, NROWA + REAL TEMP +* .. Parameters .. + REAL ONE , ZERO + PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + IF( LSAME( TRANS, 'N' ) )THEN + NROWA = N + ELSE + NROWA = K + END IF + UPPER = LSAME( UPLO, 'U' ) +* + INFO = 0 + IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO , 'L' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.LSAME( TRANS, 'N' ) ).AND. + $ ( .NOT.LSAME( TRANS, 'T' ) ).AND. + $ ( .NOT.LSAME( TRANS, 'C' ) ) )THEN + INFO = 2 + ELSE IF( N .LT.0 )THEN + INFO = 3 + ELSE IF( K .LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 7 + ELSE IF( LDC.LT.MAX( 1, N ) )THEN + INFO = 10 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'SSYRK ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR. + $ ( ( ( ALPHA.EQ.ZERO ).OR.( K.EQ.0 ) ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* And when alpha.eq.zero. 
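+*        Only the referenced triangle of C is touched: it is zeroed
+*        when beta is zero and scaled by beta otherwise.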
+* + IF( ALPHA.EQ.ZERO )THEN + IF( UPPER )THEN + IF( BETA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, J + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40, J = 1, N + DO 30, I = 1, J + C( I, J ) = BETA*C( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + ELSE + IF( BETA.EQ.ZERO )THEN + DO 60, J = 1, N + DO 50, I = J, N + C( I, J ) = ZERO + 50 CONTINUE + 60 CONTINUE + ELSE + DO 80, J = 1, N + DO 70, I = J, N + C( I, J ) = BETA*C( I, J ) + 70 CONTINUE + 80 CONTINUE + END IF + END IF + RETURN + END IF +* +* Start the operations. +* + IF( LSAME( TRANS, 'N' ) )THEN +* +* Form C := alpha*A*A' + beta*C. +* + IF( UPPER )THEN + DO 130, J = 1, N + IF( BETA.EQ.ZERO )THEN + DO 90, I = 1, J + C( I, J ) = ZERO + 90 CONTINUE + ELSE IF( BETA.NE.ONE )THEN + DO 100, I = 1, J + C( I, J ) = BETA*C( I, J ) + 100 CONTINUE + END IF + DO 120, L = 1, K + IF( A( J, L ).NE.ZERO )THEN + TEMP = ALPHA*A( J, L ) + DO 110, I = 1, J + C( I, J ) = C( I, J ) + TEMP*A( I, L ) + 110 CONTINUE + END IF + 120 CONTINUE + 130 CONTINUE + ELSE + DO 180, J = 1, N + IF( BETA.EQ.ZERO )THEN + DO 140, I = J, N + C( I, J ) = ZERO + 140 CONTINUE + ELSE IF( BETA.NE.ONE )THEN + DO 150, I = J, N + C( I, J ) = BETA*C( I, J ) + 150 CONTINUE + END IF + DO 170, L = 1, K + IF( A( J, L ).NE.ZERO )THEN + TEMP = ALPHA*A( J, L ) + DO 160, I = J, N + C( I, J ) = C( I, J ) + TEMP*A( I, L ) + 160 CONTINUE + END IF + 170 CONTINUE + 180 CONTINUE + END IF + ELSE +* +* Form C := alpha*A'*A + beta*C. +* + IF( UPPER )THEN + DO 210, J = 1, N + DO 200, I = 1, J + TEMP = ZERO + DO 190, L = 1, K + TEMP = TEMP + A( L, I )*A( L, J ) + 190 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = ALPHA*TEMP + ELSE + C( I, J ) = ALPHA*TEMP + BETA*C( I, J ) + END IF + 200 CONTINUE + 210 CONTINUE + ELSE + DO 240, J = 1, N + DO 230, I = J, N + TEMP = ZERO + DO 220, L = 1, K + TEMP = TEMP + A( L, I )*A( L, J ) + 220 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = ALPHA*TEMP + ELSE + C( I, J ) = ALPHA*TEMP + BETA*C( I, J ) + END IF + 230 CONTINUE + 240 CONTINUE + END IF + END IF +* + RETURN +* +* End of SSYRK . +* + END diff --git a/reference/stbmvf.f b/reference/stbmvf.f new file mode 100644 index 0000000000..353e63ee8f --- /dev/null +++ b/reference/stbmvf.f @@ -0,0 +1,342 @@ + SUBROUTINE STBMVF( UPLO, TRANS, DIAG, N, K, A, LDA, X, INCX ) +* .. Scalar Arguments .. + INTEGER INCX, K, LDA, N + CHARACTER*1 DIAG, TRANS, UPLO +* .. Array Arguments .. + REAL A( LDA, * ), X( * ) +* .. +* +* Purpose +* ======= +* +* STBMV performs one of the matrix-vector operations +* +* x := A*x, or x := A'*x, +* +* where x is an n element vector and A is an n by n unit, or non-unit, +* upper or lower triangular band matrix, with ( k + 1 ) diagonals. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' x := A*x. +* +* TRANS = 'T' or 't' x := A'*x. +* +* TRANS = 'C' or 'c' x := A'*x. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit +* triangular as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. 
+* N must be at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry with UPLO = 'U' or 'u', K specifies the number of +* super-diagonals of the matrix A. +* On entry with UPLO = 'L' or 'l', K specifies the number of +* sub-diagonals of the matrix A. +* K must satisfy 0 .le. K. +* Unchanged on exit. +* +* A - REAL array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) +* by n part of the array A must contain the upper triangular +* band part of the matrix of coefficients, supplied column by +* column, with the leading diagonal of the matrix in row +* ( k + 1 ) of the array, the first super-diagonal starting at +* position 2 in row k, and so on. The top left k by k triangle +* of the array A is not referenced. +* The following program segment will transfer an upper +* triangular band matrix from conventional full matrix storage +* to band storage: +* +* DO 20, J = 1, N +* M = K + 1 - J +* DO 10, I = MAX( 1, J - K ), J +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) +* by n part of the array A must contain the lower triangular +* band part of the matrix of coefficients, supplied column by +* column, with the leading diagonal of the matrix in row 1 of +* the array, the first sub-diagonal starting at position 1 in +* row 2, and so on. The bottom right k by k triangle of the +* array A is not referenced. +* The following program segment will transfer a lower +* triangular band matrix from conventional full matrix storage +* to band storage: +* +* DO 20, J = 1, N +* M = 1 - J +* DO 10, I = J, MIN( N, J + K ) +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Note that when DIAG = 'U' or 'u' the elements of the array A +* corresponding to the diagonal elements of the matrix are not +* referenced, but are assumed to be unity. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* ( k + 1 ). +* Unchanged on exit. +* +* X - REAL array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. On exit, X is overwritten with the +* tranformed vector x. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + REAL ZERO + PARAMETER ( ZERO = 0.0E+0 ) +* .. Local Scalars .. + REAL TEMP + INTEGER I, INFO, IX, J, JX, KPLUS1, KX, L + LOGICAL NOUNIT +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO , 'U' ).AND. + $ .NOT.LSAME( UPLO , 'L' ) )THEN + INFO = 1 + ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. + $ .NOT.LSAME( TRANS, 'C' ) )THEN + INFO = 2 + ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. 
+ $ .NOT.LSAME( DIAG , 'N' ) )THEN + INFO = 3 + ELSE IF( N.LT.0 )THEN + INFO = 4 + ELSE IF( K.LT.0 )THEN + INFO = 5 + ELSE IF( LDA.LT.( K + 1 ) )THEN + INFO = 7 + ELSE IF( INCX.EQ.0 )THEN + INFO = 9 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'STBMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* + NOUNIT = LSAME( DIAG, 'N' ) +* +* Set up the start point in X if the increment is not unity. This +* will be ( N - 1 )*INCX too small for descending loops. +* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through A. +* + IF( LSAME( TRANS, 'N' ) )THEN +* +* Form x := A*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + KPLUS1 = K + 1 + IF( INCX.EQ.1 )THEN + DO 20, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = X( J ) + L = KPLUS1 - J + DO 10, I = MAX( 1, J - K ), J - 1 + X( I ) = X( I ) + TEMP*A( L + I, J ) + 10 CONTINUE + IF( NOUNIT ) + $ X( J ) = X( J )*A( KPLUS1, J ) + END IF + 20 CONTINUE + ELSE + JX = KX + DO 40, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = X( JX ) + IX = KX + L = KPLUS1 - J + DO 30, I = MAX( 1, J - K ), J - 1 + X( IX ) = X( IX ) + TEMP*A( L + I, J ) + IX = IX + INCX + 30 CONTINUE + IF( NOUNIT ) + $ X( JX ) = X( JX )*A( KPLUS1, J ) + END IF + JX = JX + INCX + IF( J.GT.K ) + $ KX = KX + INCX + 40 CONTINUE + END IF + ELSE + IF( INCX.EQ.1 )THEN + DO 60, J = N, 1, -1 + IF( X( J ).NE.ZERO )THEN + TEMP = X( J ) + L = 1 - J + DO 50, I = MIN( N, J + K ), J + 1, -1 + X( I ) = X( I ) + TEMP*A( L + I, J ) + 50 CONTINUE + IF( NOUNIT ) + $ X( J ) = X( J )*A( 1, J ) + END IF + 60 CONTINUE + ELSE + KX = KX + ( N - 1 )*INCX + JX = KX + DO 80, J = N, 1, -1 + IF( X( JX ).NE.ZERO )THEN + TEMP = X( JX ) + IX = KX + L = 1 - J + DO 70, I = MIN( N, J + K ), J + 1, -1 + X( IX ) = X( IX ) + TEMP*A( L + I, J ) + IX = IX - INCX + 70 CONTINUE + IF( NOUNIT ) + $ X( JX ) = X( JX )*A( 1, J ) + END IF + JX = JX - INCX + IF( ( N - J ).GE.K ) + $ KX = KX - INCX + 80 CONTINUE + END IF + END IF + ELSE +* +* Form x := A'*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + KPLUS1 = K + 1 + IF( INCX.EQ.1 )THEN + DO 100, J = N, 1, -1 + TEMP = X( J ) + L = KPLUS1 - J + IF( NOUNIT ) + $ TEMP = TEMP*A( KPLUS1, J ) + DO 90, I = J - 1, MAX( 1, J - K ), -1 + TEMP = TEMP + A( L + I, J )*X( I ) + 90 CONTINUE + X( J ) = TEMP + 100 CONTINUE + ELSE + KX = KX + ( N - 1 )*INCX + JX = KX + DO 120, J = N, 1, -1 + TEMP = X( JX ) + KX = KX - INCX + IX = KX + L = KPLUS1 - J + IF( NOUNIT ) + $ TEMP = TEMP*A( KPLUS1, J ) + DO 110, I = J - 1, MAX( 1, J - K ), -1 + TEMP = TEMP + A( L + I, J )*X( IX ) + IX = IX - INCX + 110 CONTINUE + X( JX ) = TEMP + JX = JX - INCX + 120 CONTINUE + END IF + ELSE + IF( INCX.EQ.1 )THEN + DO 140, J = 1, N + TEMP = X( J ) + L = 1 - J + IF( NOUNIT ) + $ TEMP = TEMP*A( 1, J ) + DO 130, I = J + 1, MIN( N, J + K ) + TEMP = TEMP + A( L + I, J )*X( I ) + 130 CONTINUE + X( J ) = TEMP + 140 CONTINUE + ELSE + JX = KX + DO 160, J = 1, N + TEMP = X( JX ) + KX = KX + INCX + IX = KX + L = 1 - J + IF( NOUNIT ) + $ TEMP = TEMP*A( 1, J ) + DO 150, I = J + 1, MIN( N, J + K ) + TEMP = TEMP + A( L + I, J )*X( IX ) + IX = IX + INCX + 150 CONTINUE + X( JX ) = TEMP + JX = JX + INCX + 160 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of STBMV . 
+* + END diff --git a/reference/stbsvf.f b/reference/stbsvf.f new file mode 100644 index 0000000000..b0f7e46a51 --- /dev/null +++ b/reference/stbsvf.f @@ -0,0 +1,336 @@ + SUBROUTINE STBSVF(UPLO,TRANS,DIAG,N,K,A,LDA,X,INCX) +* .. Scalar Arguments .. + INTEGER INCX,K,LDA,N + CHARACTER DIAG,TRANS,UPLO +* .. +* .. Array Arguments .. + REAL A(LDA,*),X(*) +* .. +* +* Purpose +* ======= +* +* STBSV solves one of the systems of equations +* +* A*x = b, or A'*x = b, +* +* where b and x are n element vectors and A is an n by n unit, or +* non-unit, upper or lower triangular band matrix, with ( k + 1 ) +* diagonals. +* +* No test for singularity or near-singularity is included in this +* routine. Such tests must be performed before calling this routine. +* +* Arguments +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the equations to be solved as +* follows: +* +* TRANS = 'N' or 'n' A*x = b. +* +* TRANS = 'T' or 't' A'*x = b. +* +* TRANS = 'C' or 'c' A'*x = b. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit +* triangular as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry with UPLO = 'U' or 'u', K specifies the number of +* super-diagonals of the matrix A. +* On entry with UPLO = 'L' or 'l', K specifies the number of +* sub-diagonals of the matrix A. +* K must satisfy 0 .le. K. +* Unchanged on exit. +* +* A - REAL array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) +* by n part of the array A must contain the upper triangular +* band part of the matrix of coefficients, supplied column by +* column, with the leading diagonal of the matrix in row +* ( k + 1 ) of the array, the first super-diagonal starting at +* position 2 in row k, and so on. The top left k by k triangle +* of the array A is not referenced. +* The following program segment will transfer an upper +* triangular band matrix from conventional full matrix storage +* to band storage: +* +* DO 20, J = 1, N +* M = K + 1 - J +* DO 10, I = MAX( 1, J - K ), J +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) +* by n part of the array A must contain the lower triangular +* band part of the matrix of coefficients, supplied column by +* column, with the leading diagonal of the matrix in row 1 of +* the array, the first sub-diagonal starting at position 1 in +* row 2, and so on. The bottom right k by k triangle of the +* array A is not referenced. +* The following program segment will transfer a lower +* triangular band matrix from conventional full matrix storage +* to band storage: +* +* DO 20, J = 1, N +* M = 1 - J +* DO 10, I = J, MIN( N, J + K ) +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Note that when DIAG = 'U' or 'u' the elements of the array A +* corresponding to the diagonal elements of the matrix are not +* referenced, but are assumed to be unity. +* Unchanged on exit. +* +* LDA - INTEGER. 
+* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* ( k + 1 ). +* Unchanged on exit. +* +* X - REAL array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element right-hand side vector b. On exit, X is overwritten +* with the solution vector x. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + REAL ZERO + PARAMETER (ZERO=0.0E+0) +* .. +* .. Local Scalars .. + REAL TEMP + INTEGER I,INFO,IX,J,JX,KPLUS1,KX,L + LOGICAL NOUNIT +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX,MIN +* .. +* +* Test the input parameters. +* + INFO = 0 + IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN + INFO = 1 + ELSE IF (.NOT.LSAME(TRANS,'N') .AND. .NOT.LSAME(TRANS,'T') .AND. + + .NOT.LSAME(TRANS,'C')) THEN + INFO = 2 + ELSE IF (.NOT.LSAME(DIAG,'U') .AND. .NOT.LSAME(DIAG,'N')) THEN + INFO = 3 + ELSE IF (N.LT.0) THEN + INFO = 4 + ELSE IF (K.LT.0) THEN + INFO = 5 + ELSE IF (LDA.LT. (K+1)) THEN + INFO = 7 + ELSE IF (INCX.EQ.0) THEN + INFO = 9 + END IF + IF (INFO.NE.0) THEN + CALL XERBLA('STBSV ',INFO) + RETURN + END IF +* +* Quick return if possible. +* + IF (N.EQ.0) RETURN +* + NOUNIT = LSAME(DIAG,'N') +* +* Set up the start point in X if the increment is not unity. This +* will be ( N - 1 )*INCX too small for descending loops. +* + IF (INCX.LE.0) THEN + KX = 1 - (N-1)*INCX + ELSE IF (INCX.NE.1) THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of A are +* accessed by sequentially with one pass through A. +* + IF (LSAME(TRANS,'N')) THEN +* +* Form x := inv( A )*x. +* + IF (LSAME(UPLO,'U')) THEN + KPLUS1 = K + 1 + IF (INCX.EQ.1) THEN + DO 20 J = N,1,-1 + IF (X(J).NE.ZERO) THEN + L = KPLUS1 - J + IF (NOUNIT) X(J) = X(J)/A(KPLUS1,J) + TEMP = X(J) + DO 10 I = J - 1,MAX(1,J-K),-1 + X(I) = X(I) - TEMP*A(L+I,J) + 10 CONTINUE + END IF + 20 CONTINUE + ELSE + KX = KX + (N-1)*INCX + JX = KX + DO 40 J = N,1,-1 + KX = KX - INCX + IF (X(JX).NE.ZERO) THEN + IX = KX + L = KPLUS1 - J + IF (NOUNIT) X(JX) = X(JX)/A(KPLUS1,J) + TEMP = X(JX) + DO 30 I = J - 1,MAX(1,J-K),-1 + X(IX) = X(IX) - TEMP*A(L+I,J) + IX = IX - INCX + 30 CONTINUE + END IF + JX = JX - INCX + 40 CONTINUE + END IF + ELSE + IF (INCX.EQ.1) THEN + DO 60 J = 1,N + IF (X(J).NE.ZERO) THEN + L = 1 - J + IF (NOUNIT) X(J) = X(J)/A(1,J) + TEMP = X(J) + DO 50 I = J + 1,MIN(N,J+K) + X(I) = X(I) - TEMP*A(L+I,J) + 50 CONTINUE + END IF + 60 CONTINUE + ELSE + JX = KX + DO 80 J = 1,N + KX = KX + INCX + IF (X(JX).NE.ZERO) THEN + IX = KX + L = 1 - J + IF (NOUNIT) X(JX) = X(JX)/A(1,J) + TEMP = X(JX) + DO 70 I = J + 1,MIN(N,J+K) + X(IX) = X(IX) - TEMP*A(L+I,J) + IX = IX + INCX + 70 CONTINUE + END IF + JX = JX + INCX + 80 CONTINUE + END IF + END IF + ELSE +* +* Form x := inv( A')*x. 
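+*        The transposed system is solved by substitution over the band
+*        columns, forward in j when UPLO = 'U' and backward when
+*        UPLO = 'L', with the diagonal element taken from row KPLUS1
+*        or row 1 of the band array A.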
+* + IF (LSAME(UPLO,'U')) THEN + KPLUS1 = K + 1 + IF (INCX.EQ.1) THEN + DO 100 J = 1,N + TEMP = X(J) + L = KPLUS1 - J + DO 90 I = MAX(1,J-K),J - 1 + TEMP = TEMP - A(L+I,J)*X(I) + 90 CONTINUE + IF (NOUNIT) TEMP = TEMP/A(KPLUS1,J) + X(J) = TEMP + 100 CONTINUE + ELSE + JX = KX + DO 120 J = 1,N + TEMP = X(JX) + IX = KX + L = KPLUS1 - J + DO 110 I = MAX(1,J-K),J - 1 + TEMP = TEMP - A(L+I,J)*X(IX) + IX = IX + INCX + 110 CONTINUE + IF (NOUNIT) TEMP = TEMP/A(KPLUS1,J) + X(JX) = TEMP + JX = JX + INCX + IF (J.GT.K) KX = KX + INCX + 120 CONTINUE + END IF + ELSE + IF (INCX.EQ.1) THEN + DO 140 J = N,1,-1 + TEMP = X(J) + L = 1 - J + DO 130 I = MIN(N,J+K),J + 1,-1 + TEMP = TEMP - A(L+I,J)*X(I) + 130 CONTINUE + IF (NOUNIT) TEMP = TEMP/A(1,J) + X(J) = TEMP + 140 CONTINUE + ELSE + KX = KX + (N-1)*INCX + JX = KX + DO 160 J = N,1,-1 + TEMP = X(JX) + IX = KX + L = 1 - J + DO 150 I = MIN(N,J+K),J + 1,-1 + TEMP = TEMP - A(L+I,J)*X(IX) + IX = IX - INCX + 150 CONTINUE + IF (NOUNIT) TEMP = TEMP/A(1,J) + X(JX) = TEMP + JX = JX - INCX + IF ((N-J).GE.K) KX = KX - INCX + 160 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of STBSV . +* + END diff --git a/reference/stpmvf.f b/reference/stpmvf.f new file mode 100644 index 0000000000..1e93b843aa --- /dev/null +++ b/reference/stpmvf.f @@ -0,0 +1,299 @@ + SUBROUTINE STPMVF( UPLO, TRANS, DIAG, N, AP, X, INCX ) +* .. Scalar Arguments .. + INTEGER INCX, N + CHARACTER*1 DIAG, TRANS, UPLO +* .. Array Arguments .. + REAL AP( * ), X( * ) +* .. +* +* Purpose +* ======= +* +* STPMV performs one of the matrix-vector operations +* +* x := A*x, or x := A'*x, +* +* where x is an n element vector and A is an n by n unit, or non-unit, +* upper or lower triangular matrix, supplied in packed form. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' x := A*x. +* +* TRANS = 'T' or 't' x := A'*x. +* +* TRANS = 'C' or 'c' x := A'*x. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit +* triangular as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* AP - REAL array of DIMENSION at least +* ( ( n*( n + 1 ) )/2 ). +* Before entry with UPLO = 'U' or 'u', the array AP must +* contain the upper triangular matrix packed sequentially, +* column by column, so that AP( 1 ) contains a( 1, 1 ), +* AP( 2 ) and AP( 3 ) contain a( 1, 2 ) and a( 2, 2 ) +* respectively, and so on. +* Before entry with UPLO = 'L' or 'l', the array AP must +* contain the lower triangular matrix packed sequentially, +* column by column, so that AP( 1 ) contains a( 1, 1 ), +* AP( 2 ) and AP( 3 ) contain a( 2, 1 ) and a( 3, 1 ) +* respectively, and so on. +* Note that when DIAG = 'U' or 'u', the diagonal elements of +* A are not referenced, but are assumed to be unity. +* Unchanged on exit. +* +* X - REAL array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. 
On exit, X is overwritten with the +* tranformed vector x. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + REAL ZERO + PARAMETER ( ZERO = 0.0E+0 ) +* .. Local Scalars .. + REAL TEMP + INTEGER I, INFO, IX, J, JX, K, KK, KX + LOGICAL NOUNIT +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO , 'U' ).AND. + $ .NOT.LSAME( UPLO , 'L' ) )THEN + INFO = 1 + ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. + $ .NOT.LSAME( TRANS, 'C' ) )THEN + INFO = 2 + ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. + $ .NOT.LSAME( DIAG , 'N' ) )THEN + INFO = 3 + ELSE IF( N.LT.0 )THEN + INFO = 4 + ELSE IF( INCX.EQ.0 )THEN + INFO = 7 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'STPMVF', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* + NOUNIT = LSAME( DIAG, 'N' ) +* +* Set up the start point in X if the increment is not unity. This +* will be ( N - 1 )*INCX too small for descending loops. +* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of AP are +* accessed sequentially with one pass through AP. +* + IF( LSAME( TRANS, 'N' ) )THEN +* +* Form x:= A*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + KK =1 + IF( INCX.EQ.1 )THEN + DO 20, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = X( J ) + K = KK + DO 10, I = 1, J - 1 + X( I ) = X( I ) + TEMP*AP( K ) + K = K + 1 + 10 CONTINUE + IF( NOUNIT ) + $ X( J ) = X( J )*AP( KK + J - 1 ) + END IF + KK = KK + J + 20 CONTINUE + ELSE + JX = KX + DO 40, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = X( JX ) + IX = KX + DO 30, K = KK, KK + J - 2 + X( IX ) = X( IX ) + TEMP*AP( K ) + IX = IX + INCX + 30 CONTINUE + IF( NOUNIT ) + $ X( JX ) = X( JX )*AP( KK + J - 1 ) + END IF + JX = JX + INCX + KK = KK + J + 40 CONTINUE + END IF + ELSE + KK = ( N*( N + 1 ) )/2 + IF( INCX.EQ.1 )THEN + DO 60, J = N, 1, -1 + IF( X( J ).NE.ZERO )THEN + TEMP = X( J ) + K = KK + DO 50, I = N, J + 1, -1 + X( I ) = X( I ) + TEMP*AP( K ) + K = K - 1 + 50 CONTINUE + IF( NOUNIT ) + $ X( J ) = X( J )*AP( KK - N + J ) + END IF + KK = KK - ( N - J + 1 ) + 60 CONTINUE + ELSE + KX = KX + ( N - 1 )*INCX + JX = KX + DO 80, J = N, 1, -1 + IF( X( JX ).NE.ZERO )THEN + TEMP = X( JX ) + IX = KX + DO 70, K = KK, KK - ( N - ( J + 1 ) ), -1 + X( IX ) = X( IX ) + TEMP*AP( K ) + IX = IX - INCX + 70 CONTINUE + IF( NOUNIT ) + $ X( JX ) = X( JX )*AP( KK - N + J ) + END IF + JX = JX - INCX + KK = KK - ( N - J + 1 ) + 80 CONTINUE + END IF + END IF + ELSE +* +* Form x := A'*x. 
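+*        KK indexes the diagonal element a( j, j ) within the packed
+*        array AP, and the columns are visited in the order that keeps
+*        the elements of x still required by later columns unmodified.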
+* + IF( LSAME( UPLO, 'U' ) )THEN + KK = ( N*( N + 1 ) )/2 + IF( INCX.EQ.1 )THEN + DO 100, J = N, 1, -1 + TEMP = X( J ) + IF( NOUNIT ) + $ TEMP = TEMP*AP( KK ) + K = KK - 1 + DO 90, I = J - 1, 1, -1 + TEMP = TEMP + AP( K )*X( I ) + K = K - 1 + 90 CONTINUE + X( J ) = TEMP + KK = KK - J + 100 CONTINUE + ELSE + JX = KX + ( N - 1 )*INCX + DO 120, J = N, 1, -1 + TEMP = X( JX ) + IX = JX + IF( NOUNIT ) + $ TEMP = TEMP*AP( KK ) + DO 110, K = KK - 1, KK - J + 1, -1 + IX = IX - INCX + TEMP = TEMP + AP( K )*X( IX ) + 110 CONTINUE + X( JX ) = TEMP + JX = JX - INCX + KK = KK - J + 120 CONTINUE + END IF + ELSE + KK = 1 + IF( INCX.EQ.1 )THEN + DO 140, J = 1, N + TEMP = X( J ) + IF( NOUNIT ) + $ TEMP = TEMP*AP( KK ) + K = KK + 1 + DO 130, I = J + 1, N + TEMP = TEMP + AP( K )*X( I ) + K = K + 1 + 130 CONTINUE + X( J ) = TEMP + KK = KK + ( N - J + 1 ) + 140 CONTINUE + ELSE + JX = KX + DO 160, J = 1, N + TEMP = X( JX ) + IX = JX + IF( NOUNIT ) + $ TEMP = TEMP*AP( KK ) + DO 150, K = KK + 1, KK + N - J + IX = IX + INCX + TEMP = TEMP + AP( K )*X( IX ) + 150 CONTINUE + X( JX ) = TEMP + JX = JX + INCX + KK = KK + ( N - J + 1 ) + 160 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of STPMV . +* + END diff --git a/reference/stpsvf.f b/reference/stpsvf.f new file mode 100644 index 0000000000..9fa2f59e94 --- /dev/null +++ b/reference/stpsvf.f @@ -0,0 +1,302 @@ + SUBROUTINE STPSVF( UPLO, TRANS, DIAG, N, AP, X, INCX ) +* .. Scalar Arguments .. + INTEGER INCX, N + CHARACTER*1 DIAG, TRANS, UPLO +* .. Array Arguments .. + REAL AP( * ), X( * ) +* .. +* +* Purpose +* ======= +* +* STPSV solves one of the systems of equations +* +* A*x = b, or A'*x = b, +* +* where b and x are n element vectors and A is an n by n unit, or +* non-unit, upper or lower triangular matrix, supplied in packed form. +* +* No test for singularity or near-singularity is included in this +* routine. Such tests must be performed before calling this routine. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the equations to be solved as +* follows: +* +* TRANS = 'N' or 'n' A*x = b. +* +* TRANS = 'T' or 't' A'*x = b. +* +* TRANS = 'C' or 'c' A'*x = b. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit +* triangular as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* AP - REAL array of DIMENSION at least +* ( ( n*( n + 1 ) )/2 ). +* Before entry with UPLO = 'U' or 'u', the array AP must +* contain the upper triangular matrix packed sequentially, +* column by column, so that AP( 1 ) contains a( 1, 1 ), +* AP( 2 ) and AP( 3 ) contain a( 1, 2 ) and a( 2, 2 ) +* respectively, and so on. +* Before entry with UPLO = 'L' or 'l', the array AP must +* contain the lower triangular matrix packed sequentially, +* column by column, so that AP( 1 ) contains a( 1, 1 ), +* AP( 2 ) and AP( 3 ) contain a( 2, 1 ) and a( 3, 1 ) +* respectively, and so on. +* Note that when DIAG = 'U' or 'u', the diagonal elements of +* A are not referenced, but are assumed to be unity. 
+* Unchanged on exit. +* +* X - REAL array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element right-hand side vector b. On exit, X is overwritten +* with the solution vector x. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + REAL ZERO + PARAMETER ( ZERO = 0.0E+0 ) +* .. Local Scalars .. + REAL TEMP + INTEGER I, INFO, IX, J, JX, K, KK, KX + LOGICAL NOUNIT +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO , 'U' ).AND. + $ .NOT.LSAME( UPLO , 'L' ) )THEN + INFO = 1 + ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. + $ .NOT.LSAME( TRANS, 'C' ) )THEN + INFO = 2 + ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. + $ .NOT.LSAME( DIAG , 'N' ) )THEN + INFO = 3 + ELSE IF( N.LT.0 )THEN + INFO = 4 + ELSE IF( INCX.EQ.0 )THEN + INFO = 7 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'STPSV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* + NOUNIT = LSAME( DIAG, 'N' ) +* +* Set up the start point in X if the increment is not unity. This +* will be ( N - 1 )*INCX too small for descending loops. +* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of AP are +* accessed sequentially with one pass through AP. +* + IF( LSAME( TRANS, 'N' ) )THEN +* +* Form x := inv( A )*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + KK = ( N*( N + 1 ) )/2 + IF( INCX.EQ.1 )THEN + DO 20, J = N, 1, -1 + IF( X( J ).NE.ZERO )THEN + IF( NOUNIT ) + $ X( J ) = X( J )/AP( KK ) + TEMP = X( J ) + K = KK - 1 + DO 10, I = J - 1, 1, -1 + X( I ) = X( I ) - TEMP*AP( K ) + K = K - 1 + 10 CONTINUE + END IF + KK = KK - J + 20 CONTINUE + ELSE + JX = KX + ( N - 1 )*INCX + DO 40, J = N, 1, -1 + IF( X( JX ).NE.ZERO )THEN + IF( NOUNIT ) + $ X( JX ) = X( JX )/AP( KK ) + TEMP = X( JX ) + IX = JX + DO 30, K = KK - 1, KK - J + 1, -1 + IX = IX - INCX + X( IX ) = X( IX ) - TEMP*AP( K ) + 30 CONTINUE + END IF + JX = JX - INCX + KK = KK - J + 40 CONTINUE + END IF + ELSE + KK = 1 + IF( INCX.EQ.1 )THEN + DO 60, J = 1, N + IF( X( J ).NE.ZERO )THEN + IF( NOUNIT ) + $ X( J ) = X( J )/AP( KK ) + TEMP = X( J ) + K = KK + 1 + DO 50, I = J + 1, N + X( I ) = X( I ) - TEMP*AP( K ) + K = K + 1 + 50 CONTINUE + END IF + KK = KK + ( N - J + 1 ) + 60 CONTINUE + ELSE + JX = KX + DO 80, J = 1, N + IF( X( JX ).NE.ZERO )THEN + IF( NOUNIT ) + $ X( JX ) = X( JX )/AP( KK ) + TEMP = X( JX ) + IX = JX + DO 70, K = KK + 1, KK + N - J + IX = IX + INCX + X( IX ) = X( IX ) - TEMP*AP( K ) + 70 CONTINUE + END IF + JX = JX + INCX + KK = KK + ( N - J + 1 ) + 80 CONTINUE + END IF + END IF + ELSE +* +* Form x := inv( A' )*x. 
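+*        The transposed system is solved by substitution through the
+*        packed columns; for UPLO = 'U', KK marks the first element of
+*        column j in AP, while for UPLO = 'L' it marks the last
+*        element, a( n, j ).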
+* + IF( LSAME( UPLO, 'U' ) )THEN + KK = 1 + IF( INCX.EQ.1 )THEN + DO 100, J = 1, N + TEMP = X( J ) + K = KK + DO 90, I = 1, J - 1 + TEMP = TEMP - AP( K )*X( I ) + K = K + 1 + 90 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/AP( KK + J - 1 ) + X( J ) = TEMP + KK = KK + J + 100 CONTINUE + ELSE + JX = KX + DO 120, J = 1, N + TEMP = X( JX ) + IX = KX + DO 110, K = KK, KK + J - 2 + TEMP = TEMP - AP( K )*X( IX ) + IX = IX + INCX + 110 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/AP( KK + J - 1 ) + X( JX ) = TEMP + JX = JX + INCX + KK = KK + J + 120 CONTINUE + END IF + ELSE + KK = ( N*( N + 1 ) )/2 + IF( INCX.EQ.1 )THEN + DO 140, J = N, 1, -1 + TEMP = X( J ) + K = KK + DO 130, I = N, J + 1, -1 + TEMP = TEMP - AP( K )*X( I ) + K = K - 1 + 130 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/AP( KK - N + J ) + X( J ) = TEMP + KK = KK - ( N - J + 1 ) + 140 CONTINUE + ELSE + KX = KX + ( N - 1 )*INCX + JX = KX + DO 160, J = N, 1, -1 + TEMP = X( JX ) + IX = KX + DO 150, K = KK, KK - ( N - ( J + 1 ) ), -1 + TEMP = TEMP - AP( K )*X( IX ) + IX = IX - INCX + 150 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/AP( KK - N + J ) + X( JX ) = TEMP + JX = JX - INCX + KK = KK - (N - J + 1 ) + 160 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of STPSV . +* + END diff --git a/reference/strmmf.f b/reference/strmmf.f new file mode 100644 index 0000000000..04ea865ac1 --- /dev/null +++ b/reference/strmmf.f @@ -0,0 +1,355 @@ + SUBROUTINE STRMMF ( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, A, LDA, + $ B, LDB ) +* .. Scalar Arguments .. + CHARACTER*1 SIDE, UPLO, TRANSA, DIAG + INTEGER M, N, LDA, LDB + REAL ALPHA +* .. Array Arguments .. + REAL A( LDA, * ), B( LDB, * ) +* .. +* +* Purpose +* ======= +* +* STRMM performs one of the matrix-matrix operations +* +* B := alpha*op( A )*B, or B := alpha*B*op( A ), +* +* where alpha is a scalar, B is an m by n matrix, A is a unit, or +* non-unit, upper or lower triangular matrix and op( A ) is one of +* +* op( A ) = A or op( A ) = A'. +* +* Parameters +* ========== +* +* SIDE - CHARACTER*1. +* On entry, SIDE specifies whether op( A ) multiplies B from +* the left or right as follows: +* +* SIDE = 'L' or 'l' B := alpha*op( A )*B. +* +* SIDE = 'R' or 'r' B := alpha*B*op( A ). +* +* Unchanged on exit. +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix A is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANSA - CHARACTER*1. +* On entry, TRANSA specifies the form of op( A ) to be used in +* the matrix multiplication as follows: +* +* TRANSA = 'N' or 'n' op( A ) = A. +* +* TRANSA = 'T' or 't' op( A ) = A'. +* +* TRANSA = 'C' or 'c' op( A ) = A'. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit triangular +* as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of B. M must be at +* least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of B. N must be +* at least zero. +* Unchanged on exit. +* +* ALPHA - REAL . +* On entry, ALPHA specifies the scalar alpha. When alpha is +* zero then A is not referenced and B need not be set before +* entry. +* Unchanged on exit. 
+* +* A - REAL array of DIMENSION ( LDA, k ), where k is m +* when SIDE = 'L' or 'l' and is n when SIDE = 'R' or 'r'. +* Before entry with UPLO = 'U' or 'u', the leading k by k +* upper triangular part of the array A must contain the upper +* triangular matrix and the strictly lower triangular part of +* A is not referenced. +* Before entry with UPLO = 'L' or 'l', the leading k by k +* lower triangular part of the array A must contain the lower +* triangular matrix and the strictly upper triangular part of +* A is not referenced. +* Note that when DIAG = 'U' or 'u', the diagonal elements of +* A are not referenced either, but are assumed to be unity. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When SIDE = 'L' or 'l' then +* LDA must be at least max( 1, m ), when SIDE = 'R' or 'r' +* then LDA must be at least max( 1, n ). +* Unchanged on exit. +* +* B - REAL array of DIMENSION ( LDB, n ). +* Before entry, the leading m by n part of the array B must +* contain the matrix B, and on exit is overwritten by the +* transformed matrix. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. LDB must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Local Scalars .. + LOGICAL LSIDE, NOUNIT, UPPER + INTEGER I, INFO, J, K, NROWA + REAL TEMP +* .. Parameters .. + REAL ONE , ZERO + PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + LSIDE = LSAME( SIDE , 'L' ) + IF( LSIDE )THEN + NROWA = M + ELSE + NROWA = N + END IF + NOUNIT = LSAME( DIAG , 'N' ) + UPPER = LSAME( UPLO , 'U' ) +* + INFO = 0 + IF( ( .NOT.LSIDE ).AND. + $ ( .NOT.LSAME( SIDE , 'R' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO , 'L' ) ) )THEN + INFO = 2 + ELSE IF( ( .NOT.LSAME( TRANSA, 'N' ) ).AND. + $ ( .NOT.LSAME( TRANSA, 'T' ) ).AND. + $ ( .NOT.LSAME( TRANSA, 'C' ) ) )THEN + INFO = 3 + ELSE IF( ( .NOT.LSAME( DIAG , 'U' ) ).AND. + $ ( .NOT.LSAME( DIAG , 'N' ) ) )THEN + INFO = 4 + ELSE IF( M .LT.0 )THEN + INFO = 5 + ELSE IF( N .LT.0 )THEN + INFO = 6 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 9 + ELSE IF( LDB.LT.MAX( 1, M ) )THEN + INFO = 11 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'STRMM ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* +* And when alpha.eq.zero. +* + IF( ALPHA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, M + B( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + RETURN + END IF +* +* Start the operations. +* + IF( LSIDE )THEN + IF( LSAME( TRANSA, 'N' ) )THEN +* +* Form B := alpha*A*B. 
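+*           For example, B := 2*A*B with A upper triangular and a
+*           non-unit diagonal is obtained by the call (a sketch; the
+*           scalar value is illustrative only):
+*
+*              CALL STRMMF( 'L', 'U', 'N', 'N', M, N, 2.0E+0, A, LDA,
+*             $             B, LDB )
+*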
+* + IF( UPPER )THEN + DO 50, J = 1, N + DO 40, K = 1, M + IF( B( K, J ).NE.ZERO )THEN + TEMP = ALPHA*B( K, J ) + DO 30, I = 1, K - 1 + B( I, J ) = B( I, J ) + TEMP*A( I, K ) + 30 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP*A( K, K ) + B( K, J ) = TEMP + END IF + 40 CONTINUE + 50 CONTINUE + ELSE + DO 80, J = 1, N + DO 70 K = M, 1, -1 + IF( B( K, J ).NE.ZERO )THEN + TEMP = ALPHA*B( K, J ) + B( K, J ) = TEMP + IF( NOUNIT ) + $ B( K, J ) = B( K, J )*A( K, K ) + DO 60, I = K + 1, M + B( I, J ) = B( I, J ) + TEMP*A( I, K ) + 60 CONTINUE + END IF + 70 CONTINUE + 80 CONTINUE + END IF + ELSE +* +* Form B := alpha*A'*B. +* + IF( UPPER )THEN + DO 110, J = 1, N + DO 100, I = M, 1, -1 + TEMP = B( I, J ) + IF( NOUNIT ) + $ TEMP = TEMP*A( I, I ) + DO 90, K = 1, I - 1 + TEMP = TEMP + A( K, I )*B( K, J ) + 90 CONTINUE + B( I, J ) = ALPHA*TEMP + 100 CONTINUE + 110 CONTINUE + ELSE + DO 140, J = 1, N + DO 130, I = 1, M + TEMP = B( I, J ) + IF( NOUNIT ) + $ TEMP = TEMP*A( I, I ) + DO 120, K = I + 1, M + TEMP = TEMP + A( K, I )*B( K, J ) + 120 CONTINUE + B( I, J ) = ALPHA*TEMP + 130 CONTINUE + 140 CONTINUE + END IF + END IF + ELSE + IF( LSAME( TRANSA, 'N' ) )THEN +* +* Form B := alpha*B*A. +* + IF( UPPER )THEN + DO 180, J = N, 1, -1 + TEMP = ALPHA + IF( NOUNIT ) + $ TEMP = TEMP*A( J, J ) + DO 150, I = 1, M + B( I, J ) = TEMP*B( I, J ) + 150 CONTINUE + DO 170, K = 1, J - 1 + IF( A( K, J ).NE.ZERO )THEN + TEMP = ALPHA*A( K, J ) + DO 160, I = 1, M + B( I, J ) = B( I, J ) + TEMP*B( I, K ) + 160 CONTINUE + END IF + 170 CONTINUE + 180 CONTINUE + ELSE + DO 220, J = 1, N + TEMP = ALPHA + IF( NOUNIT ) + $ TEMP = TEMP*A( J, J ) + DO 190, I = 1, M + B( I, J ) = TEMP*B( I, J ) + 190 CONTINUE + DO 210, K = J + 1, N + IF( A( K, J ).NE.ZERO )THEN + TEMP = ALPHA*A( K, J ) + DO 200, I = 1, M + B( I, J ) = B( I, J ) + TEMP*B( I, K ) + 200 CONTINUE + END IF + 210 CONTINUE + 220 CONTINUE + END IF + ELSE +* +* Form B := alpha*B*A'. +* + IF( UPPER )THEN + DO 260, K = 1, N + DO 240, J = 1, K - 1 + IF( A( J, K ).NE.ZERO )THEN + TEMP = ALPHA*A( J, K ) + DO 230, I = 1, M + B( I, J ) = B( I, J ) + TEMP*B( I, K ) + 230 CONTINUE + END IF + 240 CONTINUE + TEMP = ALPHA + IF( NOUNIT ) + $ TEMP = TEMP*A( K, K ) + IF( TEMP.NE.ONE )THEN + DO 250, I = 1, M + B( I, K ) = TEMP*B( I, K ) + 250 CONTINUE + END IF + 260 CONTINUE + ELSE + DO 300, K = N, 1, -1 + DO 280, J = K + 1, N + IF( A( J, K ).NE.ZERO )THEN + TEMP = ALPHA*A( J, K ) + DO 270, I = 1, M + B( I, J ) = B( I, J ) + TEMP*B( I, K ) + 270 CONTINUE + END IF + 280 CONTINUE + TEMP = ALPHA + IF( NOUNIT ) + $ TEMP = TEMP*A( K, K ) + IF( TEMP.NE.ONE )THEN + DO 290, I = 1, M + B( I, K ) = TEMP*B( I, K ) + 290 CONTINUE + END IF + 300 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of STRMM . +* + END diff --git a/reference/strmvf.f b/reference/strmvf.f new file mode 100644 index 0000000000..249aff275d --- /dev/null +++ b/reference/strmvf.f @@ -0,0 +1,286 @@ + SUBROUTINE STRMVF ( UPLO, TRANS, DIAG, N, A, LDA, X, INCX ) +* .. Scalar Arguments .. + INTEGER INCX, LDA, N + CHARACTER*1 DIAG, TRANS, UPLO +* .. Array Arguments .. + REAL A( LDA, * ), X( * ) +* .. +* +* Purpose +* ======= +* +* STRMV performs one of the matrix-vector operations +* +* x := A*x, or x := A'*x, +* +* where x is an n element vector and A is an n by n unit, or non-unit, +* upper or lower triangular matrix. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. 
+* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' x := A*x. +* +* TRANS = 'T' or 't' x := A'*x. +* +* TRANS = 'C' or 'c' x := A'*x. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit +* triangular as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* A - REAL array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array A must contain the upper +* triangular matrix and the strictly lower triangular part of +* A is not referenced. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array A must contain the lower +* triangular matrix and the strictly upper triangular part of +* A is not referenced. +* Note that when DIAG = 'U' or 'u', the diagonal elements of +* A are not referenced either, but are assumed to be unity. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, n ). +* Unchanged on exit. +* +* X - REAL array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. On exit, X is overwritten with the +* tranformed vector x. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + REAL ZERO + PARAMETER ( ZERO = 0.0E+0 ) +* .. Local Scalars .. + REAL TEMP + INTEGER I, INFO, IX, J, JX, KX + LOGICAL NOUNIT +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO , 'U' ).AND. + $ .NOT.LSAME( UPLO , 'L' ) )THEN + INFO = 1 + ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. + $ .NOT.LSAME( TRANS, 'C' ) )THEN + INFO = 2 + ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. + $ .NOT.LSAME( DIAG , 'N' ) )THEN + INFO = 3 + ELSE IF( N.LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, N ) )THEN + INFO = 6 + ELSE IF( INCX.EQ.0 )THEN + INFO = 8 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'STRMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* + NOUNIT = LSAME( DIAG, 'N' ) +* +* Set up the start point in X if the increment is not unity. This +* will be ( N - 1 )*INCX too small for descending loops. +* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through A. +* + IF( LSAME( TRANS, 'N' ) )THEN +* +* Form x := A*x. 
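+*        For example, x := A*x with A upper triangular, a non-unit
+*        diagonal and unit stride in X is computed by the call
+*        (a sketch):
+*
+*           CALL STRMVF( 'U', 'N', 'N', N, A, LDA, X, 1 )
+*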
+* + IF( LSAME( UPLO, 'U' ) )THEN + IF( INCX.EQ.1 )THEN + DO 20, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = X( J ) + DO 10, I = 1, J - 1 + X( I ) = X( I ) + TEMP*A( I, J ) + 10 CONTINUE + IF( NOUNIT ) + $ X( J ) = X( J )*A( J, J ) + END IF + 20 CONTINUE + ELSE + JX = KX + DO 40, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = X( JX ) + IX = KX + DO 30, I = 1, J - 1 + X( IX ) = X( IX ) + TEMP*A( I, J ) + IX = IX + INCX + 30 CONTINUE + IF( NOUNIT ) + $ X( JX ) = X( JX )*A( J, J ) + END IF + JX = JX + INCX + 40 CONTINUE + END IF + ELSE + IF( INCX.EQ.1 )THEN + DO 60, J = N, 1, -1 + IF( X( J ).NE.ZERO )THEN + TEMP = X( J ) + DO 50, I = N, J + 1, -1 + X( I ) = X( I ) + TEMP*A( I, J ) + 50 CONTINUE + IF( NOUNIT ) + $ X( J ) = X( J )*A( J, J ) + END IF + 60 CONTINUE + ELSE + KX = KX + ( N - 1 )*INCX + JX = KX + DO 80, J = N, 1, -1 + IF( X( JX ).NE.ZERO )THEN + TEMP = X( JX ) + IX = KX + DO 70, I = N, J + 1, -1 + X( IX ) = X( IX ) + TEMP*A( I, J ) + IX = IX - INCX + 70 CONTINUE + IF( NOUNIT ) + $ X( JX ) = X( JX )*A( J, J ) + END IF + JX = JX - INCX + 80 CONTINUE + END IF + END IF + ELSE +* +* Form x := A'*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + IF( INCX.EQ.1 )THEN + DO 100, J = N, 1, -1 + TEMP = X( J ) + IF( NOUNIT ) + $ TEMP = TEMP*A( J, J ) + DO 90, I = J - 1, 1, -1 + TEMP = TEMP + A( I, J )*X( I ) + 90 CONTINUE + X( J ) = TEMP + 100 CONTINUE + ELSE + JX = KX + ( N - 1 )*INCX + DO 120, J = N, 1, -1 + TEMP = X( JX ) + IX = JX + IF( NOUNIT ) + $ TEMP = TEMP*A( J, J ) + DO 110, I = J - 1, 1, -1 + IX = IX - INCX + TEMP = TEMP + A( I, J )*X( IX ) + 110 CONTINUE + X( JX ) = TEMP + JX = JX - INCX + 120 CONTINUE + END IF + ELSE + IF( INCX.EQ.1 )THEN + DO 140, J = 1, N + TEMP = X( J ) + IF( NOUNIT ) + $ TEMP = TEMP*A( J, J ) + DO 130, I = J + 1, N + TEMP = TEMP + A( I, J )*X( I ) + 130 CONTINUE + X( J ) = TEMP + 140 CONTINUE + ELSE + JX = KX + DO 160, J = 1, N + TEMP = X( JX ) + IX = JX + IF( NOUNIT ) + $ TEMP = TEMP*A( J, J ) + DO 150, I = J + 1, N + IX = IX + INCX + TEMP = TEMP + A( I, J )*X( IX ) + 150 CONTINUE + X( JX ) = TEMP + JX = JX + INCX + 160 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of STRMV . +* + END diff --git a/reference/strsmf.f b/reference/strsmf.f new file mode 100644 index 0000000000..31d71a785d --- /dev/null +++ b/reference/strsmf.f @@ -0,0 +1,378 @@ + SUBROUTINE STRSMF ( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, A, LDA, + $ B, LDB ) +* .. Scalar Arguments .. + CHARACTER*1 SIDE, UPLO, TRANSA, DIAG + INTEGER M, N, LDA, LDB + REAL ALPHA +* .. Array Arguments .. + REAL A( LDA, * ), B( LDB, * ) +* .. +* +* Purpose +* ======= +* +* STRSM solves one of the matrix equations +* +* op( A )*X = alpha*B, or X*op( A ) = alpha*B, +* +* where alpha is a scalar, X and B are m by n matrices, A is a unit, or +* non-unit, upper or lower triangular matrix and op( A ) is one of +* +* op( A ) = A or op( A ) = A'. +* +* The matrix X is overwritten on B. +* +* Parameters +* ========== +* +* SIDE - CHARACTER*1. +* On entry, SIDE specifies whether op( A ) appears on the left +* or right of X as follows: +* +* SIDE = 'L' or 'l' op( A )*X = alpha*B. +* +* SIDE = 'R' or 'r' X*op( A ) = alpha*B. +* +* Unchanged on exit. +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix A is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANSA - CHARACTER*1. 
+* On entry, TRANSA specifies the form of op( A ) to be used in +* the matrix multiplication as follows: +* +* TRANSA = 'N' or 'n' op( A ) = A. +* +* TRANSA = 'T' or 't' op( A ) = A'. +* +* TRANSA = 'C' or 'c' op( A ) = A'. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit triangular +* as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of B. M must be at +* least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of B. N must be +* at least zero. +* Unchanged on exit. +* +* ALPHA - REAL . +* On entry, ALPHA specifies the scalar alpha. When alpha is +* zero then A is not referenced and B need not be set before +* entry. +* Unchanged on exit. +* +* A - REAL array of DIMENSION ( LDA, k ), where k is m +* when SIDE = 'L' or 'l' and is n when SIDE = 'R' or 'r'. +* Before entry with UPLO = 'U' or 'u', the leading k by k +* upper triangular part of the array A must contain the upper +* triangular matrix and the strictly lower triangular part of +* A is not referenced. +* Before entry with UPLO = 'L' or 'l', the leading k by k +* lower triangular part of the array A must contain the lower +* triangular matrix and the strictly upper triangular part of +* A is not referenced. +* Note that when DIAG = 'U' or 'u', the diagonal elements of +* A are not referenced either, but are assumed to be unity. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When SIDE = 'L' or 'l' then +* LDA must be at least max( 1, m ), when SIDE = 'R' or 'r' +* then LDA must be at least max( 1, n ). +* Unchanged on exit. +* +* B - REAL array of DIMENSION ( LDB, n ). +* Before entry, the leading m by n part of the array B must +* contain the right-hand side matrix B, and on exit is +* overwritten by the solution matrix X. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. LDB must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Local Scalars .. + LOGICAL LSIDE, NOUNIT, UPPER + INTEGER I, INFO, J, K, NROWA + REAL TEMP +* .. Parameters .. + REAL ONE , ZERO + PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + LSIDE = LSAME( SIDE , 'L' ) + IF( LSIDE )THEN + NROWA = M + ELSE + NROWA = N + END IF + NOUNIT = LSAME( DIAG , 'N' ) + UPPER = LSAME( UPLO , 'U' ) +* + INFO = 0 + IF( ( .NOT.LSIDE ).AND. + $ ( .NOT.LSAME( SIDE , 'R' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO , 'L' ) ) )THEN + INFO = 2 + ELSE IF( ( .NOT.LSAME( TRANSA, 'N' ) ).AND. + $ ( .NOT.LSAME( TRANSA, 'T' ) ).AND. + $ ( .NOT.LSAME( TRANSA, 'C' ) ) )THEN + INFO = 3 + ELSE IF( ( .NOT.LSAME( DIAG , 'U' ) ).AND. 
+ $ ( .NOT.LSAME( DIAG , 'N' ) ) )THEN + INFO = 4 + ELSE IF( M .LT.0 )THEN + INFO = 5 + ELSE IF( N .LT.0 )THEN + INFO = 6 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 9 + ELSE IF( LDB.LT.MAX( 1, M ) )THEN + INFO = 11 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'STRSM ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* +* And when alpha.eq.zero. +* + IF( ALPHA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, M + B( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + RETURN + END IF +* +* Start the operations. +* + IF( LSIDE )THEN + IF( LSAME( TRANSA, 'N' ) )THEN +* +* Form B := alpha*inv( A )*B. +* + IF( UPPER )THEN + DO 60, J = 1, N + IF( ALPHA.NE.ONE )THEN + DO 30, I = 1, M + B( I, J ) = ALPHA*B( I, J ) + 30 CONTINUE + END IF + DO 50, K = M, 1, -1 + IF( B( K, J ).NE.ZERO )THEN + IF( NOUNIT ) + $ B( K, J ) = B( K, J )/A( K, K ) + DO 40, I = 1, K - 1 + B( I, J ) = B( I, J ) - B( K, J )*A( I, K ) + 40 CONTINUE + END IF + 50 CONTINUE + 60 CONTINUE + ELSE + DO 100, J = 1, N + IF( ALPHA.NE.ONE )THEN + DO 70, I = 1, M + B( I, J ) = ALPHA*B( I, J ) + 70 CONTINUE + END IF + DO 90 K = 1, M + IF( B( K, J ).NE.ZERO )THEN + IF( NOUNIT ) + $ B( K, J ) = B( K, J )/A( K, K ) + DO 80, I = K + 1, M + B( I, J ) = B( I, J ) - B( K, J )*A( I, K ) + 80 CONTINUE + END IF + 90 CONTINUE + 100 CONTINUE + END IF + ELSE +* +* Form B := alpha*inv( A' )*B. +* + IF( UPPER )THEN + DO 130, J = 1, N + DO 120, I = 1, M + TEMP = ALPHA*B( I, J ) + DO 110, K = 1, I - 1 + TEMP = TEMP - A( K, I )*B( K, J ) + 110 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/A( I, I ) + B( I, J ) = TEMP + 120 CONTINUE + 130 CONTINUE + ELSE + DO 160, J = 1, N + DO 150, I = M, 1, -1 + TEMP = ALPHA*B( I, J ) + DO 140, K = I + 1, M + TEMP = TEMP - A( K, I )*B( K, J ) + 140 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/A( I, I ) + B( I, J ) = TEMP + 150 CONTINUE + 160 CONTINUE + END IF + END IF + ELSE + IF( LSAME( TRANSA, 'N' ) )THEN +* +* Form B := alpha*B*inv( A ). +* + IF( UPPER )THEN + DO 210, J = 1, N + IF( ALPHA.NE.ONE )THEN + DO 170, I = 1, M + B( I, J ) = ALPHA*B( I, J ) + 170 CONTINUE + END IF + DO 190, K = 1, J - 1 + IF( A( K, J ).NE.ZERO )THEN + DO 180, I = 1, M + B( I, J ) = B( I, J ) - A( K, J )*B( I, K ) + 180 CONTINUE + END IF + 190 CONTINUE + IF( NOUNIT )THEN + TEMP = ONE/A( J, J ) + DO 200, I = 1, M + B( I, J ) = TEMP*B( I, J ) + 200 CONTINUE + END IF + 210 CONTINUE + ELSE + DO 260, J = N, 1, -1 + IF( ALPHA.NE.ONE )THEN + DO 220, I = 1, M + B( I, J ) = ALPHA*B( I, J ) + 220 CONTINUE + END IF + DO 240, K = J + 1, N + IF( A( K, J ).NE.ZERO )THEN + DO 230, I = 1, M + B( I, J ) = B( I, J ) - A( K, J )*B( I, K ) + 230 CONTINUE + END IF + 240 CONTINUE + IF( NOUNIT )THEN + TEMP = ONE/A( J, J ) + DO 250, I = 1, M + B( I, J ) = TEMP*B( I, J ) + 250 CONTINUE + END IF + 260 CONTINUE + END IF + ELSE +* +* Form B := alpha*B*inv( A' ). 
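+*           For example, the solution of X*A' = B, with A lower
+*           triangular and a non-unit diagonal, overwriting B with X,
+*           is obtained by the call (a sketch):
+*
+*              CALL STRSMF( 'R', 'L', 'T', 'N', M, N, 1.0E+0, A, LDA,
+*             $             B, LDB )
+*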
+* + IF( UPPER )THEN + DO 310, K = N, 1, -1 + IF( NOUNIT )THEN + TEMP = ONE/A( K, K ) + DO 270, I = 1, M + B( I, K ) = TEMP*B( I, K ) + 270 CONTINUE + END IF + DO 290, J = 1, K - 1 + IF( A( J, K ).NE.ZERO )THEN + TEMP = A( J, K ) + DO 280, I = 1, M + B( I, J ) = B( I, J ) - TEMP*B( I, K ) + 280 CONTINUE + END IF + 290 CONTINUE + IF( ALPHA.NE.ONE )THEN + DO 300, I = 1, M + B( I, K ) = ALPHA*B( I, K ) + 300 CONTINUE + END IF + 310 CONTINUE + ELSE + DO 360, K = 1, N + IF( NOUNIT )THEN + TEMP = ONE/A( K, K ) + DO 320, I = 1, M + B( I, K ) = TEMP*B( I, K ) + 320 CONTINUE + END IF + DO 340, J = K + 1, N + IF( A( J, K ).NE.ZERO )THEN + TEMP = A( J, K ) + DO 330, I = 1, M + B( I, J ) = B( I, J ) - TEMP*B( I, K ) + 330 CONTINUE + END IF + 340 CONTINUE + IF( ALPHA.NE.ONE )THEN + DO 350, I = 1, M + B( I, K ) = ALPHA*B( I, K ) + 350 CONTINUE + END IF + 360 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of STRSM . +* + END diff --git a/reference/strsvf.f b/reference/strsvf.f new file mode 100644 index 0000000000..dcf020f30f --- /dev/null +++ b/reference/strsvf.f @@ -0,0 +1,289 @@ + SUBROUTINE STRSVF ( UPLO, TRANS, DIAG, N, A, LDA, X, INCX ) +* .. Scalar Arguments .. + INTEGER INCX, LDA, N + CHARACTER*1 DIAG, TRANS, UPLO +* .. Array Arguments .. + REAL A( LDA, * ), X( * ) +* .. +* +* Purpose +* ======= +* +* STRSV solves one of the systems of equations +* +* A*x = b, or A'*x = b, +* +* where b and x are n element vectors and A is an n by n unit, or +* non-unit, upper or lower triangular matrix. +* +* No test for singularity or near-singularity is included in this +* routine. Such tests must be performed before calling this routine. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the equations to be solved as +* follows: +* +* TRANS = 'N' or 'n' A*x = b. +* +* TRANS = 'T' or 't' A'*x = b. +* +* TRANS = 'C' or 'c' A'*x = b. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit +* triangular as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* A - REAL array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array A must contain the upper +* triangular matrix and the strictly lower triangular part of +* A is not referenced. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array A must contain the lower +* triangular matrix and the strictly upper triangular part of +* A is not referenced. +* Note that when DIAG = 'U' or 'u', the diagonal elements of +* A are not referenced either, but are assumed to be unity. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, n ). +* Unchanged on exit. +* +* X - REAL array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element right-hand side vector b. 
On exit, X is overwritten +* with the solution vector x. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + REAL ZERO + PARAMETER ( ZERO = 0.0E+0 ) +* .. Local Scalars .. + REAL TEMP + INTEGER I, INFO, IX, J, JX, KX + LOGICAL NOUNIT +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO , 'U' ).AND. + $ .NOT.LSAME( UPLO , 'L' ) )THEN + INFO = 1 + ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. + $ .NOT.LSAME( TRANS, 'C' ) )THEN + INFO = 2 + ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. + $ .NOT.LSAME( DIAG , 'N' ) )THEN + INFO = 3 + ELSE IF( N.LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, N ) )THEN + INFO = 6 + ELSE IF( INCX.EQ.0 )THEN + INFO = 8 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'STRSV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* + NOUNIT = LSAME( DIAG, 'N' ) +* +* Set up the start point in X if the increment is not unity. This +* will be ( N - 1 )*INCX too small for descending loops. +* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through A. +* + IF( LSAME( TRANS, 'N' ) )THEN +* +* Form x := inv( A )*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + IF( INCX.EQ.1 )THEN + DO 20, J = N, 1, -1 + IF( X( J ).NE.ZERO )THEN + IF( NOUNIT ) + $ X( J ) = X( J )/A( J, J ) + TEMP = X( J ) + DO 10, I = J - 1, 1, -1 + X( I ) = X( I ) - TEMP*A( I, J ) + 10 CONTINUE + END IF + 20 CONTINUE + ELSE + JX = KX + ( N - 1 )*INCX + DO 40, J = N, 1, -1 + IF( X( JX ).NE.ZERO )THEN + IF( NOUNIT ) + $ X( JX ) = X( JX )/A( J, J ) + TEMP = X( JX ) + IX = JX + DO 30, I = J - 1, 1, -1 + IX = IX - INCX + X( IX ) = X( IX ) - TEMP*A( I, J ) + 30 CONTINUE + END IF + JX = JX - INCX + 40 CONTINUE + END IF + ELSE + IF( INCX.EQ.1 )THEN + DO 60, J = 1, N + IF( X( J ).NE.ZERO )THEN + IF( NOUNIT ) + $ X( J ) = X( J )/A( J, J ) + TEMP = X( J ) + DO 50, I = J + 1, N + X( I ) = X( I ) - TEMP*A( I, J ) + 50 CONTINUE + END IF + 60 CONTINUE + ELSE + JX = KX + DO 80, J = 1, N + IF( X( JX ).NE.ZERO )THEN + IF( NOUNIT ) + $ X( JX ) = X( JX )/A( J, J ) + TEMP = X( JX ) + IX = JX + DO 70, I = J + 1, N + IX = IX + INCX + X( IX ) = X( IX ) - TEMP*A( I, J ) + 70 CONTINUE + END IF + JX = JX + INCX + 80 CONTINUE + END IF + END IF + ELSE +* +* Form x := inv( A' )*x. 
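+*        For example, the solution of A'*x = b, with A upper
+*        triangular and a non-unit diagonal, overwriting b in X,
+*        is obtained by the call (a sketch):
+*
+*           CALL STRSVF( 'U', 'T', 'N', N, A, LDA, X, 1 )
+*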
+* + IF( LSAME( UPLO, 'U' ) )THEN + IF( INCX.EQ.1 )THEN + DO 100, J = 1, N + TEMP = X( J ) + DO 90, I = 1, J - 1 + TEMP = TEMP - A( I, J )*X( I ) + 90 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/A( J, J ) + X( J ) = TEMP + 100 CONTINUE + ELSE + JX = KX + DO 120, J = 1, N + TEMP = X( JX ) + IX = KX + DO 110, I = 1, J - 1 + TEMP = TEMP - A( I, J )*X( IX ) + IX = IX + INCX + 110 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/A( J, J ) + X( JX ) = TEMP + JX = JX + INCX + 120 CONTINUE + END IF + ELSE + IF( INCX.EQ.1 )THEN + DO 140, J = N, 1, -1 + TEMP = X( J ) + DO 130, I = N, J + 1, -1 + TEMP = TEMP - A( I, J )*X( I ) + 130 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/A( J, J ) + X( J ) = TEMP + 140 CONTINUE + ELSE + KX = KX + ( N - 1 )*INCX + JX = KX + DO 160, J = N, 1, -1 + TEMP = X( JX ) + IX = KX + DO 150, I = N, J + 1, -1 + TEMP = TEMP - A( I, J )*X( IX ) + IX = IX - INCX + 150 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/A( J, J ) + X( JX ) = TEMP + JX = JX - INCX + 160 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of STRSV . +* + END diff --git a/reference/strti2f.f b/reference/strti2f.f new file mode 100644 index 0000000000..b859cff3b0 --- /dev/null +++ b/reference/strti2f.f @@ -0,0 +1,146 @@ + SUBROUTINE STRTI2F( UPLO, DIAG, N, A, LDA, INFO ) +* +* -- LAPACK routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* November 2006 +* +* .. Scalar Arguments .. + CHARACTER DIAG, UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + REAL A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* STRTI2 computes the inverse of a real upper or lower triangular +* matrix. +* +* This is the Level 2 BLAS version of the algorithm. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* Specifies whether the matrix A is upper or lower triangular. +* = 'U': Upper triangular +* = 'L': Lower triangular +* +* DIAG (input) CHARACTER*1 +* Specifies whether or not the matrix A is unit triangular. +* = 'N': Non-unit triangular +* = 'U': Unit triangular +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* A (input/output) REAL array, dimension (LDA,N) +* On entry, the triangular matrix A. If UPLO = 'U', the +* leading n by n upper triangular part of the array A contains +* the upper triangular matrix, and the strictly lower +* triangular part of A is not referenced. If UPLO = 'L', the +* leading n by n lower triangular part of the array A contains +* the lower triangular matrix, and the strictly upper +* triangular part of A is not referenced. If DIAG = 'U', the +* diagonal elements of A are also not referenced and are +* assumed to be 1. +* +* On exit, the (triangular) inverse of the original matrix, in +* the same storage format. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -k, the k-th argument had an illegal value +* +* ===================================================================== +* +* .. Parameters .. + REAL ONE + PARAMETER ( ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL NOUNIT, UPPER + INTEGER J + REAL AJJ +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL SSCAL, STRMV, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + NOUNIT = LSAME( DIAG, 'N' ) + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOUNIT .AND. 
.NOT.LSAME( DIAG, 'U' ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -5 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'STRTI2', -INFO ) + RETURN + END IF +* + IF( UPPER ) THEN +* +* Compute inverse of upper triangular matrix. +* + DO 10 J = 1, N + IF( NOUNIT ) THEN + A( J, J ) = ONE / A( J, J ) + AJJ = -A( J, J ) + ELSE + AJJ = -ONE + END IF +* +* Compute elements 1:j-1 of j-th column. +* + CALL STRMV( 'Upper', 'No transpose', DIAG, J-1, A, LDA, + $ A( 1, J ), 1 ) + CALL SSCAL( J-1, AJJ, A( 1, J ), 1 ) + 10 CONTINUE + ELSE +* +* Compute inverse of lower triangular matrix. +* + DO 20 J = N, 1, -1 + IF( NOUNIT ) THEN + A( J, J ) = ONE / A( J, J ) + AJJ = -A( J, J ) + ELSE + AJJ = -ONE + END IF + IF( J.LT.N ) THEN +* +* Compute elements j+1:n of j-th column. +* + CALL STRMV( 'Lower', 'No transpose', DIAG, N-J, + $ A( J+1, J+1 ), LDA, A( J+1, J ), 1 ) + CALL SSCAL( N-J, AJJ, A( J+1, J ), 1 ) + END IF + 20 CONTINUE + END IF +* + RETURN +* +* End of STRTI2 +* + END diff --git a/reference/strtrif.f b/reference/strtrif.f new file mode 100644 index 0000000000..27e3234bbd --- /dev/null +++ b/reference/strtrif.f @@ -0,0 +1,176 @@ + SUBROUTINE STRTRIF( UPLO, DIAG, N, A, LDA, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* March 31, 1993 +* +* .. Scalar Arguments .. + CHARACTER DIAG, UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + REAL A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* STRTRI computes the inverse of a real upper or lower triangular +* matrix A. +* +* This is the Level 3 BLAS version of the algorithm. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* = 'U': A is upper triangular; +* = 'L': A is lower triangular. +* +* DIAG (input) CHARACTER*1 +* = 'N': A is non-unit triangular; +* = 'U': A is unit triangular. +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* A (input/output) REAL array, dimension (LDA,N) +* On entry, the triangular matrix A. If UPLO = 'U', the +* leading N-by-N upper triangular part of the array A contains +* the upper triangular matrix, and the strictly lower +* triangular part of A is not referenced. If UPLO = 'L', the +* leading N-by-N lower triangular part of the array A contains +* the lower triangular matrix, and the strictly upper +* triangular part of A is not referenced. If DIAG = 'U', the +* diagonal elements of A are also not referenced and are +* assumed to be 1. +* On exit, the (triangular) inverse of the original matrix, in +* the same storage format. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, A(i,i) is exactly zero. The triangular +* matrix is singular and its inverse can not be computed. +* +* ===================================================================== +* +* .. Parameters .. + REAL ONE, ZERO + PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL NOUNIT, UPPER + INTEGER J, JB, NB, NN +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL STRMM, STRSM, STRTI2, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. 
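+*     (For reference, the blocked code below uses the fact that for an
+*      upper block triangular matrix with diagonal blocks A11 and A22
+*      and off-diagonal block A12 the inverse has blocks
+*
+*         inv(A11)   -inv(A11)*A12*inv(A22)
+*            0                inv(A22)
+*
+*      and builds the inverse one block column at a time with STRMM,
+*      STRSM and the unblocked routine STRTI2.)
+*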
+* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + NOUNIT = LSAME( DIAG, 'N' ) + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -5 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'STRTRI', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* +* Check for singularity if non-unit. +* + IF( NOUNIT ) THEN + DO 10 INFO = 1, N + IF( A( INFO, INFO ).EQ.ZERO ) + $ RETURN + 10 CONTINUE + INFO = 0 + END IF +* +* Determine the block size for this environment. +* + NB = 128 + IF( NB.LE.1 .OR. NB.GE.N ) THEN +* +* Use unblocked code +* + CALL STRTI2( UPLO, DIAG, N, A, LDA, INFO ) + ELSE +* +* Use blocked code +* + IF( UPPER ) THEN +* +* Compute inverse of upper triangular matrix +* + DO 20 J = 1, N, NB + JB = MIN( NB, N-J+1 ) +* +* Compute rows 1:j-1 of current block column +* + CALL STRMM( 'Left', 'Upper', 'No transpose', DIAG, J-1, + $ JB, ONE, A, LDA, A( 1, J ), LDA ) + CALL STRSM( 'Right', 'Upper', 'No transpose', DIAG, J-1, + $ JB, -ONE, A( J, J ), LDA, A( 1, J ), LDA ) +* +* Compute inverse of current diagonal block +* + CALL STRTI2( 'Upper', DIAG, JB, A( J, J ), LDA, INFO ) + 20 CONTINUE + ELSE +* +* Compute inverse of lower triangular matrix +* + NN = ( ( N-1 ) / NB )*NB + 1 + DO 30 J = NN, 1, -NB + JB = MIN( NB, N-J+1 ) + IF( J+JB.LE.N ) THEN +* +* Compute rows j+jb:n of current block column +* + CALL STRMM( 'Left', 'Lower', 'No transpose', DIAG, + $ N-J-JB+1, JB, ONE, A( J+JB, J+JB ), LDA, + $ A( J+JB, J ), LDA ) + CALL STRSM( 'Right', 'Lower', 'No transpose', DIAG, + $ N-J-JB+1, JB, -ONE, A( J, J ), LDA, + $ A( J+JB, J ), LDA ) + END IF +* +* Compute inverse of current diagonal block +* + CALL STRTI2( 'Lower', DIAG, JB, A( J, J ), LDA, INFO ) + 30 CONTINUE + END IF + END IF +* + RETURN +* +* End of STRTRI +* + END diff --git a/reference/zaxpycf.f b/reference/zaxpycf.f new file mode 100644 index 0000000000..aaf21da9f9 --- /dev/null +++ b/reference/zaxpycf.f @@ -0,0 +1,36 @@ + subroutine zaxpycf(n,za,zx,incx,zy,incy) +c +c constant times a vector plus a vector. +c jack dongarra, 3/11/78. +c modified 12/3/93, array(1) declarations changed to array(*) +c + double complex zx(*),zy(*),za + integer i,incx,incy,ix,iy,n + double precision dcabs1 + INTRINSIC dconjg + + if(n.le.0)return + if (dcabs1(za) .eq. 0.0d0) return + if (incx.eq.1.and.incy.eq.1)go to 20 +c +c code for unequal increments or equal increments +c not equal to 1 +c + ix = 1 + iy = 1 + if(incx.lt.0)ix = (-n+1)*incx + 1 + if(incy.lt.0)iy = (-n+1)*incy + 1 + do 10 i = 1,n + zy(iy) = zy(iy) + za*dconjg(zx(ix)) + ix = ix + incx + iy = iy + incy + 10 continue + return +c +c code for both increments equal to 1 +c + 20 do 30 i = 1,n + zy(i) = zy(i) + za*dconjg(zx(i)) + 30 continue + return + end diff --git a/reference/zaxpyf.f b/reference/zaxpyf.f new file mode 100644 index 0000000000..2f0f6a0773 --- /dev/null +++ b/reference/zaxpyf.f @@ -0,0 +1,34 @@ + subroutine zaxpyf(n,za,zx,incx,zy,incy) +c +c constant times a vector plus a vector. +c jack dongarra, 3/11/78. +c modified 12/3/93, array(1) declarations changed to array(*) +c + double complex zx(*),zy(*),za + integer i,incx,incy,ix,iy,n + double precision dcabs1 + if(n.le.0)return + if (dcabs1(za) .eq. 
0.0d0) return + if (incx.eq.1.and.incy.eq.1)go to 20 +c +c code for unequal increments or equal increments +c not equal to 1 +c + ix = 1 + iy = 1 + if(incx.lt.0)ix = (-n+1)*incx + 1 + if(incy.lt.0)iy = (-n+1)*incy + 1 + do 10 i = 1,n + zy(iy) = zy(iy) + za*zx(ix) + ix = ix + incx + iy = iy + incy + 10 continue + return +c +c code for both increments equal to 1 +c + 20 do 30 i = 1,n + zy(i) = zy(i) + za*zx(i) + 30 continue + return + end diff --git a/reference/zcopyf.f b/reference/zcopyf.f new file mode 100644 index 0000000000..a3bfdfc98b --- /dev/null +++ b/reference/zcopyf.f @@ -0,0 +1,33 @@ + subroutine zcopyf(n,zx,incx,zy,incy) +c +c copies a vector, x, to a vector, y. +c jack dongarra, linpack, 4/11/78. +c modified 12/3/93, array(1) declarations changed to array(*) +c + double complex zx(*),zy(*) + integer i,incx,incy,ix,iy,n +c + if(n.le.0)return + if(incx.eq.1.and.incy.eq.1)go to 20 +c +c code for unequal increments or equal increments +c not equal to 1 +c + ix = 1 + iy = 1 + if(incx.lt.0)ix = (-n+1)*incx + 1 + if(incy.lt.0)iy = (-n+1)*incy + 1 + do 10 i = 1,n + zy(iy) = zx(ix) + ix = ix + incx + iy = iy + incy + 10 continue + return +c +c code for both increments equal to 1 +c + 20 do 30 i = 1,n + zy(i) = zx(i) + 30 continue + return + end diff --git a/reference/zdotcf.f b/reference/zdotcf.f new file mode 100644 index 0000000000..1611aee8ca --- /dev/null +++ b/reference/zdotcf.f @@ -0,0 +1,36 @@ + double complex function zdotcf(n,zx,incx,zy,incy) +c +c forms the dot product of a vector. +c jack dongarra, 3/11/78. +c modified 12/3/93, array(1) declarations changed to array(*) +c + double complex zx(*),zy(*),ztemp + integer i,incx,incy,ix,iy,n + ztemp = (0.0d0,0.0d0) + zdotcf = (0.0d0,0.0d0) + if(n.le.0)return + if(incx.eq.1.and.incy.eq.1)go to 20 +c +c code for unequal increments or equal increments +c not equal to 1 +c + ix = 1 + iy = 1 + if(incx.lt.0)ix = (-n+1)*incx + 1 + if(incy.lt.0)iy = (-n+1)*incy + 1 + do 10 i = 1,n + ztemp = ztemp + dconjg(zx(ix))*zy(iy) + ix = ix + incx + iy = iy + incy + 10 continue + zdotcf = ztemp + return +c +c code for both increments equal to 1 +c + 20 do 30 i = 1,n + ztemp = ztemp + dconjg(zx(i))*zy(i) + 30 continue + zdotcf = ztemp + return + end diff --git a/reference/zdotuf.f b/reference/zdotuf.f new file mode 100644 index 0000000000..cc2ea939ff --- /dev/null +++ b/reference/zdotuf.f @@ -0,0 +1,36 @@ + double complex function zdotuf(n,zx,incx,zy,incy) +c +c forms the dot product of two vectors. +c jack dongarra, 3/11/78. +c modified 12/3/93, array(1) declarations changed to array(*) +c + double complex zx(*),zy(*),ztemp + integer i,incx,incy,ix,iy,n + ztemp = (0.0d0,0.0d0) + zdotuf = (0.0d0,0.0d0) + if(n.le.0)return + if(incx.eq.1.and.incy.eq.1)go to 20 +c +c code for unequal increments or equal increments +c not equal to 1 +c + ix = 1 + iy = 1 + if(incx.lt.0)ix = (-n+1)*incx + 1 + if(incy.lt.0)iy = (-n+1)*incy + 1 + do 10 i = 1,n + ztemp = ztemp + zx(ix)*zy(iy) + ix = ix + incx + iy = iy + incy + 10 continue + zdotuf = ztemp + return +c +c code for both increments equal to 1 +c + 20 do 30 i = 1,n + ztemp = ztemp + zx(i)*zy(i) + 30 continue + zdotuf = ztemp + return + end diff --git a/reference/zdrotf.f b/reference/zdrotf.f new file mode 100644 index 0000000000..fe11288c44 --- /dev/null +++ b/reference/zdrotf.f @@ -0,0 +1,38 @@ + subroutine zdrotf (n,zx,incx,zy,incy,c,s) +c +c applies a plane rotation, where the cos and sin (c and s) are +c double precision and the vectors zx and zy are double complex. +c jack dongarra, linpack, 3/11/78. 
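+c
+c     for each i the pair ( zx(i), zy(i) ) is replaced as follows
+c     (a sketch of the rotation applied below):
+c
+c        zx(i) :=  c*zx(i) + s*zy(i)
+c        zy(i) := -s*zx(i) + c*zy(i)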
+c + double complex zx(1),zy(1),ztemp + double precision c,s + integer i,incx,incy,ix,iy,n +c + if(n.le.0)return + if(incx.eq.1.and.incy.eq.1)go to 20 +c +c code for unequal increments or equal increments not equal +c to 1 +c + ix = 1 + iy = 1 + if(incx.lt.0)ix = (-n+1)*incx + 1 + if(incy.lt.0)iy = (-n+1)*incy + 1 + do 10 i = 1,n + ztemp = c*zx(ix) + s*zy(iy) + zy(iy) = c*zy(iy) - s*zx(ix) + zx(ix) = ztemp + ix = ix + incx + iy = iy + incy + 10 continue + return +c +c code for both increments equal to 1 +c + 20 do 30 i = 1,n + ztemp = c*zx(i) + s*zy(i) + zy(i) = c*zy(i) - s*zx(i) + zx(i) = ztemp + 30 continue + return + end diff --git a/reference/zdscalf.f b/reference/zdscalf.f new file mode 100644 index 0000000000..0ac1534b62 --- /dev/null +++ b/reference/zdscalf.f @@ -0,0 +1,30 @@ + subroutine zdscalf(n,da,zx,incx) +c +c scales a vector by a constant. +c jack dongarra, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + double complex zx(*) + double precision da + integer i,incx,ix,n +c + if( n.le.0 .or. incx.le.0 )return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + do 10 i = 1,n + zx(ix) = dcmplx(da,0.0d0)*zx(ix) + ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 do 30 i = 1,n + zx(i) = dcmplx(da,0.0d0)*zx(i) + 30 continue + return + end diff --git a/reference/zgbmvf.f b/reference/zgbmvf.f new file mode 100644 index 0000000000..bd888b16f5 --- /dev/null +++ b/reference/zgbmvf.f @@ -0,0 +1,450 @@ + SUBROUTINE ZGBMVF( TRANS, M, N, KL, KU, ALPHA, A, LDA, X, INCX, + $ BETA, Y, INCY ) +* .. Scalar Arguments .. + COMPLEX*16 ALPHA, BETA + INTEGER INCX, INCY, KL, KU, LDA, M, N + CHARACTER*1 TRANS +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* ZGBMV performs one of the matrix-vector operations +* +* y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y, or +* +* y := alpha*conjg( A' )*x + beta*y, +* +* where alpha and beta are scalars, x and y are vectors and A is an +* m by n band matrix, with kl sub-diagonals and ku super-diagonals. +* +* Parameters +* ========== +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' y := alpha*A*x + beta*y. +* +* TRANS = 'T' or 't' y := alpha*A'*x + beta*y. +* +* TRANS = 'C' or 'c' y := alpha*conjg( A' )*x + beta*y. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix A. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* KL - INTEGER. +* On entry, KL specifies the number of sub-diagonals of the +* matrix A. KL must satisfy 0 .le. KL. +* Unchanged on exit. +* +* KU - INTEGER. +* On entry, KU specifies the number of super-diagonals of the +* matrix A. KU must satisfy 0 .le. KU. +* Unchanged on exit. +* +* ALPHA - COMPLEX*16 . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX*16 array of DIMENSION ( LDA, n ). +* Before entry, the leading ( kl + ku + 1 ) by n part of the +* array A must contain the matrix of coefficients, supplied +* column by column, with the leading diagonal of the matrix in +* row ( ku + 1 ) of the array, the first super-diagonal +* starting at position 2 in row ku, the first sub-diagonal +* starting at position 1 in row ( ku + 2 ), and so on. 
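+*           For example, with m = n = 4, kl = 1 and ku = 1 a
+*           tridiagonal matrix is stored as (a sketch, with x marking
+*           entries that are not referenced):
+*
+*              (  x   a12  a23  a34 )
+*              ( a11  a22  a33  a44 )
+*              ( a21  a32  a43   x  )
+*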
+* Elements in the array A that do not correspond to elements +* in the band matrix (such as the top left ku by ku triangle) +* are not referenced. +* The following program segment will transfer a band matrix +* from conventional full matrix storage to band storage: +* +* DO 20, J = 1, N +* K = KU + 1 - J +* DO 10, I = MAX( 1, J - KU ), MIN( M, J + KL ) +* A( K + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* ( kl + ku + 1 ). +* Unchanged on exit. +* +* X - COMPLEX*16 array of DIMENSION at least +* ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' +* and at least +* ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. +* Before entry, the incremented array X must contain the +* vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA - COMPLEX*16 . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then Y need not be set on input. +* Unchanged on exit. +* +* Y - COMPLEX*16 array of DIMENSION at least +* ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' +* and at least +* ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. +* Before entry, the incremented array Y must contain the +* vector y. On exit, Y is overwritten by the updated vector y. +* +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX*16 ONE + PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. Local Scalars .. + COMPLEX*16 TEMP + INTEGER I, INFO, IX, IY, J, JX, JY, K, KUP1, KX, KY, + $ LENX, LENY + LOGICAL NOCONJ, NOTRANS, XCONJ +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC DCONJG, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. + $ .NOT.LSAME( TRANS, 'R' ).AND. + $ .NOT.LSAME( TRANS, 'C' ).AND. + $ .NOT.LSAME( TRANS, 'O' ).AND. + $ .NOT.LSAME( TRANS, 'U' ).AND. + $ .NOT.LSAME( TRANS, 'S' ).AND. + $ .NOT.LSAME( TRANS, 'D' ) )THEN + INFO = 1 + ELSE IF( M.LT.0 )THEN + INFO = 2 + ELSE IF( N.LT.0 )THEN + INFO = 3 + ELSE IF( KL.LT.0 )THEN + INFO = 4 + ELSE IF( KU.LT.0 )THEN + INFO = 5 + ELSE IF( LDA.LT.( KL + KU + 1 ) )THEN + INFO = 8 + ELSE IF( INCX.EQ.0 )THEN + INFO = 10 + ELSE IF( INCY.EQ.0 )THEN + INFO = 13 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'ZGBMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. + $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* + NOCONJ = (LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) + $ .OR. LSAME( TRANS, 'O' ) .OR. LSAME( TRANS, 'U' )) + + NOTRANS = (LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'R' ) + $ .OR. LSAME( TRANS, 'O' ) .OR. LSAME( TRANS, 'S' )) + + XCONJ = (LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) + $ .OR. LSAME( TRANS, 'R' ) .OR. 
LSAME( TRANS, 'C' )) +* +* Set LENX and LENY, the lengths of the vectors x and y, and set +* up the start points in X and Y. +* + IF(NOTRANS)THEN + LENX = N + LENY = M + ELSE + LENX = M + LENY = N + END IF + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( LENX - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( LENY - 1 )*INCY + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through the band part of A. +* +* First form y := beta*y. +* + IF( BETA.NE.ONE )THEN + IF( INCY.EQ.1 )THEN + IF( BETA.EQ.ZERO )THEN + DO 10, I = 1, LENY + Y( I ) = ZERO + 10 CONTINUE + ELSE + DO 20, I = 1, LENY + Y( I ) = BETA*Y( I ) + 20 CONTINUE + END IF + ELSE + IY = KY + IF( BETA.EQ.ZERO )THEN + DO 30, I = 1, LENY + Y( IY ) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40, I = 1, LENY + Y( IY ) = BETA*Y( IY ) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF( ALPHA.EQ.ZERO ) + $ RETURN + + KUP1 = KU + 1 + + IF(XCONJ)THEN + + IF(NOTRANS)THEN +* +* Form y := alpha*A*x + y. +* + JX = KX + IF( INCY.EQ.1 )THEN + DO 60, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*X( JX ) + K = KUP1 - J + IF( NOCONJ )THEN + DO 50, I = MAX( 1, J - KU ), MIN( M, J + KL ) + Y( I ) = Y( I ) + TEMP*A( K + I, J ) + 50 CONTINUE + ELSE + DO 55, I = MAX( 1, J - KU ), MIN( M, J + KL ) + Y( I ) = Y( I ) + TEMP*DCONJG(A( K + I, J )) + 55 CONTINUE + END IF + + END IF + JX = JX + INCX + 60 CONTINUE + ELSE + DO 80, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*X( JX ) + IY = KY + K = KUP1 - J + IF( NOCONJ )THEN + DO 70, I = MAX( 1, J - KU ), MIN( M, J + KL ) + Y( IY ) = Y( IY ) + TEMP*A( K + I, J ) + IY = IY + INCY + 70 CONTINUE + ELSE + DO 75, I = MAX( 1, J - KU ), MIN( M, J + KL ) + Y( IY ) = Y( IY ) + TEMP*DCONJG(A( K + I, J )) + IY = IY + INCY + 75 CONTINUE + END IF + + END IF + JX = JX + INCX + IF( J.GT.KU ) + $ KY = KY + INCY + 80 CONTINUE + END IF + ELSE +* +* Form y := alpha*A'*x + y or y := alpha*conjg( A' )*x + y. +* + JY = KY + IF( INCX.EQ.1 )THEN + DO 110, J = 1, N + TEMP = ZERO + K = KUP1 - J + IF( NOCONJ )THEN + DO 90, I = MAX( 1, J - KU ), MIN( M, J + KL ) + TEMP = TEMP + A( K + I, J )*X( I ) + 90 CONTINUE + ELSE + DO 100, I = MAX( 1, J - KU ), MIN( M, J + KL ) + TEMP = TEMP + DCONJG( A( K + I, J ) )*X( I ) + 100 CONTINUE + END IF + Y( JY ) = Y( JY ) + ALPHA*TEMP + JY = JY + INCY + 110 CONTINUE + ELSE + DO 140, J = 1, N + TEMP = ZERO + IX = KX + K = KUP1 - J + IF( NOCONJ )THEN + DO 120, I = MAX( 1, J - KU ), MIN( M, J + KL ) + TEMP = TEMP + A( K + I, J )*X( IX ) + IX = IX + INCX + 120 CONTINUE + ELSE + DO 130, I = MAX( 1, J - KU ), MIN( M, J + KL ) + TEMP = TEMP + DCONJG( A( K + I, J ) )*X( IX ) + IX = IX + INCX + 130 CONTINUE + END IF + Y( JY ) = Y( JY ) + ALPHA*TEMP + JY = JY + INCY + IF( J.GT.KU ) + $ KX = KX + INCX + 140 CONTINUE + END IF + END IF + + ELSE + + IF(NOTRANS)THEN +* +* Form y := alpha*A*x + y. 
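+*           (in this branch the elements of x are conjugated before
+*            the update, so y := alpha*A*conjg( x ) + y or
+*            y := alpha*conjg( A )*conjg( x ) + y is formed)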
+* + JX = KX + IF( INCY.EQ.1 )THEN + DO 160, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*DCONJG(X( JX )) + K = KUP1 - J + IF( NOCONJ )THEN + DO 150, I = MAX( 1, J - KU ), MIN( M, J + KL ) + Y( I ) = Y( I ) + TEMP*A( K + I, J ) + 150 CONTINUE + ELSE + DO 155, I = MAX( 1, J - KU ), MIN( M, J + KL ) + Y( I ) = Y( I ) + TEMP*DCONJG(A( K + I, J )) + 155 CONTINUE + END IF + + END IF + JX = JX + INCX + 160 CONTINUE + ELSE + DO 180, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*DCONJG(X( JX )) + IY = KY + K = KUP1 - J + IF( NOCONJ )THEN + DO 170, I = MAX( 1, J - KU ), MIN( M, J + KL ) + Y( IY ) = Y( IY ) + TEMP*A( K + I, J ) + IY = IY + INCY + 170 CONTINUE + ELSE + DO 175, I = MAX( 1, J - KU ), MIN( M, J + KL ) + Y( IY ) = Y( IY ) + TEMP*DCONJG(A( K + I, J )) + IY = IY + INCY + 175 CONTINUE + END IF + + END IF + JX = JX + INCX + IF( J.GT.KU ) + $ KY = KY + INCY + 180 CONTINUE + END IF + ELSE +* +* Form y := alpha*A'*x + y or y := alpha*conjg( A' )*x + y. +* + JY = KY + IF( INCX.EQ.1 )THEN + DO 210, J = 1, N + TEMP = ZERO + K = KUP1 - J + IF( NOCONJ )THEN + DO 190, I = MAX( 1, J - KU ), MIN( M, J + KL ) + TEMP = TEMP + A( K + I, J )*DCONJG(X( I )) + 190 CONTINUE + ELSE + DO 200, I = MAX( 1, J - KU ), MIN( M, J + KL ) + TEMP = TEMP + DCONJG( A( K + I, J ) )*DCONJG(X( I )) + 200 CONTINUE + END IF + Y( JY ) = Y( JY ) + ALPHA*TEMP + JY = JY + INCY + 210 CONTINUE + ELSE + DO 240, J = 1, N + TEMP = ZERO + IX = KX + K = KUP1 - J + IF( NOCONJ )THEN + DO 220, I = MAX( 1, J - KU ), MIN( M, J + KL ) + TEMP = TEMP + A( K + I, J )*DCONJG(X( IX )) + IX = IX + INCX + 220 CONTINUE + ELSE + DO 230, I = MAX( 1, J - KU ), MIN( M, J + KL ) + TEMP = TEMP + DCONJG( A( K + I, J ) )*DCONJG(X(IX )) + IX = IX + INCX + 230 CONTINUE + END IF + Y( JY ) = Y( JY ) + ALPHA*TEMP + JY = JY + INCY + IF( J.GT.KU ) + $ KX = KX + INCX + 240 CONTINUE + END IF + END IF + + END IF + +* + RETURN +* +* End of ZGBMV . +* + END diff --git a/reference/zgemm3mf.f b/reference/zgemm3mf.f new file mode 100644 index 0000000000..3bfc88bb53 --- /dev/null +++ b/reference/zgemm3mf.f @@ -0,0 +1,414 @@ + SUBROUTINE ZGEMM3MF(TRA,TRB,M,N,K,ALPHA,A,LDA,B,LDB,BETA,C,LDC) +* .. Scalar Arguments .. + DOUBLE COMPLEX ALPHA,BETA + INTEGER K,LDA,LDB,LDC,M,N + CHARACTER TRA,TRB +* .. +* .. Array Arguments .. + DOUBLE COMPLEX A(LDA,*),B(LDB,*),C(LDC,*) +* .. +* +* Purpose +* ======= +* +* ZGEMM performs one of the matrix-matrix operations +* +* C := alpha*op( A )*op( B ) + beta*C, +* +* where op( X ) is one of +* +* op( X ) = X or op( X ) = X' or op( X ) = conjg( X' ), +* +* alpha and beta are scalars, and A, B and C are matrices, with op( A ) +* an m by k matrix, op( B ) a k by n matrix and C an m by n matrix. +* +* Arguments +* ========== +* +* TRA - CHARACTER*1. +* On entry, TRA specifies the form of op( A ) to be used in +* the matrix multiplication as follows: +* +* TRA = 'N' or 'n', op( A ) = A. +* +* TRA = 'T' or 't', op( A ) = A'. +* +* TRA = 'C' or 'c', op( A ) = conjg( A' ). +* +* Unchanged on exit. +* +* TRB - CHARACTER*1. +* On entry, TRB specifies the form of op( B ) to be used in +* the matrix multiplication as follows: +* +* TRB = 'N' or 'n', op( B ) = B. +* +* TRB = 'T' or 't', op( B ) = B'. +* +* TRB = 'C' or 'c', op( B ) = conjg( B' ). +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix +* op( A ) and of the matrix C. M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. 
+* On entry, N specifies the number of columns of the matrix +* op( B ) and the number of columns of the matrix C. N must be +* at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry, K specifies the number of columns of the matrix +* op( A ) and the number of rows of the matrix op( B ). K must +* be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX*16 . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX*16 array of DIMENSION ( LDA, ka ), where ka is +* k when TRA = 'N' or 'n', and is m otherwise. +* Before entry with TRA = 'N' or 'n', the leading m by k +* part of the array A must contain the matrix A, otherwise +* the leading k by m part of the array A must contain the +* matrix A. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When TRA = 'N' or 'n' then +* LDA must be at least max( 1, m ), otherwise LDA must be at +* least max( 1, k ). +* Unchanged on exit. +* +* B - COMPLEX*16 array of DIMENSION ( LDB, kb ), where kb is +* n when TRB = 'N' or 'n', and is k otherwise. +* Before entry with TRB = 'N' or 'n', the leading k by n +* part of the array B must contain the matrix B, otherwise +* the leading n by k part of the array B must contain the +* matrix B. +* Unchanged on exit. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. When TRB = 'N' or 'n' then +* LDB must be at least max( 1, k ), otherwise LDB must be at +* least max( 1, n ). +* Unchanged on exit. +* +* BETA - COMPLEX*16 . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then C need not be set on input. +* Unchanged on exit. +* +* C - COMPLEX*16 array of DIMENSION ( LDC, n ). +* Before entry, the leading m by n part of the array C must +* contain the matrix C, except when beta is zero, in which +* case C need not be set on entry. +* On exit, the array C is overwritten by the m by n matrix +* ( alpha*op( A )*op( B ) + beta*C ). +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC DCONJG,MAX +* .. +* .. Local Scalars .. + DOUBLE COMPLEX TEMP + INTEGER I,INFO,J,L,NCOLA,NROWA,NROWB + LOGICAL CONJA,CONJB,NOTA,NOTB +* .. +* .. Parameters .. + DOUBLE COMPLEX ONE + PARAMETER (ONE= (1.0D+0,0.0D+0)) + DOUBLE COMPLEX ZERO + PARAMETER (ZERO= (0.0D+0,0.0D+0)) +* .. +* +* Set NOTA and NOTB as true if A and B respectively are not +* conjugated or transposed, set CONJA and CONJB as true if A and +* B respectively are to be transposed but not conjugated and set +* NROWA, NCOLA and NROWB as the number of rows and columns of A +* and the number of rows of B respectively. +* + NOTA = LSAME(TRA,'N') + NOTB = LSAME(TRB,'N') + CONJA = LSAME(TRA,'C') + CONJB = LSAME(TRB,'C') + IF (NOTA) THEN + NROWA = M + NCOLA = K + ELSE + NROWA = K + NCOLA = M + END IF + IF (NOTB) THEN + NROWB = K + ELSE + NROWB = N + END IF +* +* Test the input parameters. +* + INFO = 0 + IF ((.NOT.NOTA) .AND. 
(.NOT.CONJA) .AND. + + (.NOT.LSAME(TRA,'T'))) THEN + INFO = 1 + ELSE IF ((.NOT.NOTB) .AND. (.NOT.CONJB) .AND. + + (.NOT.LSAME(TRB,'T'))) THEN + INFO = 2 + ELSE IF (M.LT.0) THEN + INFO = 3 + ELSE IF (N.LT.0) THEN + INFO = 4 + ELSE IF (K.LT.0) THEN + INFO = 5 + ELSE IF (LDA.LT.MAX(1,NROWA)) THEN + INFO = 8 + ELSE IF (LDB.LT.MAX(1,NROWB)) THEN + INFO = 10 + ELSE IF (LDC.LT.MAX(1,M)) THEN + INFO = 13 + END IF + IF (INFO.NE.0) THEN + CALL XERBLA('ZGEMM ',INFO) + RETURN + END IF +* +* Quick return if possible. +* + IF ((M.EQ.0) .OR. (N.EQ.0) .OR. + + (((ALPHA.EQ.ZERO).OR. (K.EQ.0)).AND. (BETA.EQ.ONE))) RETURN +* +* And when alpha.eq.zero. +* + IF (ALPHA.EQ.ZERO) THEN + IF (BETA.EQ.ZERO) THEN + DO 20 J = 1,N + DO 10 I = 1,M + C(I,J) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40 J = 1,N + DO 30 I = 1,M + C(I,J) = BETA*C(I,J) + 30 CONTINUE + 40 CONTINUE + END IF + RETURN + END IF +* +* Start the operations. +* + IF (NOTB) THEN + IF (NOTA) THEN +* +* Form C := alpha*A*B + beta*C. +* + DO 90 J = 1,N + IF (BETA.EQ.ZERO) THEN + DO 50 I = 1,M + C(I,J) = ZERO + 50 CONTINUE + ELSE IF (BETA.NE.ONE) THEN + DO 60 I = 1,M + C(I,J) = BETA*C(I,J) + 60 CONTINUE + END IF + DO 80 L = 1,K + IF (B(L,J).NE.ZERO) THEN + TEMP = ALPHA*B(L,J) + DO 70 I = 1,M + C(I,J) = C(I,J) + TEMP*A(I,L) + 70 CONTINUE + END IF + 80 CONTINUE + 90 CONTINUE + ELSE IF (CONJA) THEN +* +* Form C := alpha*conjg( A' )*B + beta*C. +* + DO 120 J = 1,N + DO 110 I = 1,M + TEMP = ZERO + DO 100 L = 1,K + TEMP = TEMP + DCONJG(A(L,I))*B(L,J) + 100 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 110 CONTINUE + 120 CONTINUE + ELSE +* +* Form C := alpha*A'*B + beta*C +* + DO 150 J = 1,N + DO 140 I = 1,M + TEMP = ZERO + DO 130 L = 1,K + TEMP = TEMP + A(L,I)*B(L,J) + 130 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 140 CONTINUE + 150 CONTINUE + END IF + ELSE IF (NOTA) THEN + IF (CONJB) THEN +* +* Form C := alpha*A*conjg( B' ) + beta*C. +* + DO 200 J = 1,N + IF (BETA.EQ.ZERO) THEN + DO 160 I = 1,M + C(I,J) = ZERO + 160 CONTINUE + ELSE IF (BETA.NE.ONE) THEN + DO 170 I = 1,M + C(I,J) = BETA*C(I,J) + 170 CONTINUE + END IF + DO 190 L = 1,K + IF (B(J,L).NE.ZERO) THEN + TEMP = ALPHA*DCONJG(B(J,L)) + DO 180 I = 1,M + C(I,J) = C(I,J) + TEMP*A(I,L) + 180 CONTINUE + END IF + 190 CONTINUE + 200 CONTINUE + ELSE +* +* Form C := alpha*A*B' + beta*C +* + DO 250 J = 1,N + IF (BETA.EQ.ZERO) THEN + DO 210 I = 1,M + C(I,J) = ZERO + 210 CONTINUE + ELSE IF (BETA.NE.ONE) THEN + DO 220 I = 1,M + C(I,J) = BETA*C(I,J) + 220 CONTINUE + END IF + DO 240 L = 1,K + IF (B(J,L).NE.ZERO) THEN + TEMP = ALPHA*B(J,L) + DO 230 I = 1,M + C(I,J) = C(I,J) + TEMP*A(I,L) + 230 CONTINUE + END IF + 240 CONTINUE + 250 CONTINUE + END IF + ELSE IF (CONJA) THEN + IF (CONJB) THEN +* +* Form C := alpha*conjg( A' )*conjg( B' ) + beta*C. 
+* + DO 280 J = 1,N + DO 270 I = 1,M + TEMP = ZERO + DO 260 L = 1,K + TEMP = TEMP + DCONJG(A(L,I))*DCONJG(B(J,L)) + 260 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 270 CONTINUE + 280 CONTINUE + ELSE +* +* Form C := alpha*conjg( A' )*B' + beta*C +* + DO 310 J = 1,N + DO 300 I = 1,M + TEMP = ZERO + DO 290 L = 1,K + TEMP = TEMP + DCONJG(A(L,I))*B(J,L) + 290 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 300 CONTINUE + 310 CONTINUE + END IF + ELSE + IF (CONJB) THEN +* +* Form C := alpha*A'*conjg( B' ) + beta*C +* + DO 340 J = 1,N + DO 330 I = 1,M + TEMP = ZERO + DO 320 L = 1,K + TEMP = TEMP + A(L,I)*DCONJG(B(J,L)) + 320 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 330 CONTINUE + 340 CONTINUE + ELSE +* +* Form C := alpha*A'*B' + beta*C +* + DO 370 J = 1,N + DO 360 I = 1,M + TEMP = ZERO + DO 350 L = 1,K + TEMP = TEMP + A(L,I)*B(J,L) + 350 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 360 CONTINUE + 370 CONTINUE + END IF + END IF +* + RETURN +* +* End of ZGEMM . +* + END diff --git a/reference/zgemmf.f b/reference/zgemmf.f new file mode 100644 index 0000000000..65cd317503 --- /dev/null +++ b/reference/zgemmf.f @@ -0,0 +1,414 @@ + SUBROUTINE ZGEMMF(TRANA,TRANB,M,N,K,ALPHA,A,LDA,B,LDB,BETA,C,LDC) +* .. Scalar Arguments .. + DOUBLE COMPLEX ALPHA,BETA + INTEGER K,LDA,LDB,LDC,M,N + CHARACTER TRANA,TRANB +* .. +* .. Array Arguments .. + DOUBLE COMPLEX A(LDA,*),B(LDB,*),C(LDC,*) +* .. +* +* Purpose +* ======= +* +* ZGEMM performs one of the matrix-matrix operations +* +* C := alpha*op( A )*op( B ) + beta*C, +* +* where op( X ) is one of +* +* op( X ) = X or op( X ) = X' or op( X ) = conjg( X' ), +* +* alpha and beta are scalars, and A, B and C are matrices, with op( A ) +* an m by k matrix, op( B ) a k by n matrix and C an m by n matrix. +* +* Arguments +* ========== +* +* TRANA - CHARACTER*1. +* On entry, TRANA specifies the form of op( A ) to be used in +* the matrix multiplication as follows: +* +* TRANA = 'N' or 'n', op( A ) = A. +* +* TRANA = 'T' or 't', op( A ) = A'. +* +* TRANA = 'C' or 'c', op( A ) = conjg( A' ). +* +* Unchanged on exit. +* +* TRANB - CHARACTER*1. +* On entry, TRANB specifies the form of op( B ) to be used in +* the matrix multiplication as follows: +* +* TRANB = 'N' or 'n', op( B ) = B. +* +* TRANB = 'T' or 't', op( B ) = B'. +* +* TRANB = 'C' or 'c', op( B ) = conjg( B' ). +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix +* op( A ) and of the matrix C. M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix +* op( B ) and the number of columns of the matrix C. N must be +* at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry, K specifies the number of columns of the matrix +* op( A ) and the number of rows of the matrix op( B ). K must +* be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX*16 . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX*16 array of DIMENSION ( LDA, ka ), where ka is +* k when TRANA = 'N' or 'n', and is m otherwise. +* Before entry with TRANA = 'N' or 'n', the leading m by k +* part of the array A must contain the matrix A, otherwise +* the leading k by m part of the array A must contain the +* matrix A. 
+* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When TRANA = 'N' or 'n' then +* LDA must be at least max( 1, m ), otherwise LDA must be at +* least max( 1, k ). +* Unchanged on exit. +* +* B - COMPLEX*16 array of DIMENSION ( LDB, kb ), where kb is +* n when TRANB = 'N' or 'n', and is k otherwise. +* Before entry with TRANB = 'N' or 'n', the leading k by n +* part of the array B must contain the matrix B, otherwise +* the leading n by k part of the array B must contain the +* matrix B. +* Unchanged on exit. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. When TRANB = 'N' or 'n' then +* LDB must be at least max( 1, k ), otherwise LDB must be at +* least max( 1, n ). +* Unchanged on exit. +* +* BETA - COMPLEX*16 . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then C need not be set on input. +* Unchanged on exit. +* +* C - COMPLEX*16 array of DIMENSION ( LDC, n ). +* Before entry, the leading m by n part of the array C must +* contain the matrix C, except when beta is zero, in which +* case C need not be set on entry. +* On exit, the array C is overwritten by the m by n matrix +* ( alpha*op( A )*op( B ) + beta*C ). +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC DCONJG,MAX +* .. +* .. Local Scalars .. + DOUBLE COMPLEX TEMP + INTEGER I,INFO,J,L,NCOLA,NROWA,NROWB + LOGICAL CONJA,CONJB,NOTA,NOTB +* .. +* .. Parameters .. + DOUBLE COMPLEX ONE + PARAMETER (ONE= (1.0D+0,0.0D+0)) + DOUBLE COMPLEX ZERO + PARAMETER (ZERO= (0.0D+0,0.0D+0)) +* .. +* +* Set NOTA and NOTB as true if A and B respectively are not +* conjugated or transposed, set CONJA and CONJB as true if A and +* B respectively are to be transposed but not conjugated and set +* NROWA, NCOLA and NROWB as the number of rows and columns of A +* and the number of rows of B respectively. +* + NOTA = LSAME(TRANA,'N') + NOTB = LSAME(TRANB,'N') + CONJA = LSAME(TRANA,'C') + CONJB = LSAME(TRANB,'C') + IF (NOTA) THEN + NROWA = M + NCOLA = K + ELSE + NROWA = K + NCOLA = M + END IF + IF (NOTB) THEN + NROWB = K + ELSE + NROWB = N + END IF +* +* Test the input parameters. +* + INFO = 0 + IF ((.NOT.NOTA) .AND. (.NOT.CONJA) .AND. + + (.NOT.LSAME(TRANA,'T'))) THEN + INFO = 1 + ELSE IF ((.NOT.NOTB) .AND. (.NOT.CONJB) .AND. + + (.NOT.LSAME(TRANB,'T'))) THEN + INFO = 2 + ELSE IF (M.LT.0) THEN + INFO = 3 + ELSE IF (N.LT.0) THEN + INFO = 4 + ELSE IF (K.LT.0) THEN + INFO = 5 + ELSE IF (LDA.LT.MAX(1,NROWA)) THEN + INFO = 8 + ELSE IF (LDB.LT.MAX(1,NROWB)) THEN + INFO = 10 + ELSE IF (LDC.LT.MAX(1,M)) THEN + INFO = 13 + END IF + IF (INFO.NE.0) THEN + CALL XERBLA('ZGEMM ',INFO) + RETURN + END IF +* +* Quick return if possible. +* + IF ((M.EQ.0) .OR. (N.EQ.0) .OR. + + (((ALPHA.EQ.ZERO).OR. (K.EQ.0)).AND. (BETA.EQ.ONE))) RETURN +* +* And when alpha.eq.zero. 
+* + IF (ALPHA.EQ.ZERO) THEN + IF (BETA.EQ.ZERO) THEN + DO 20 J = 1,N + DO 10 I = 1,M + C(I,J) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40 J = 1,N + DO 30 I = 1,M + C(I,J) = BETA*C(I,J) + 30 CONTINUE + 40 CONTINUE + END IF + RETURN + END IF +* +* Start the operations. +* + IF (NOTB) THEN + IF (NOTA) THEN +* +* Form C := alpha*A*B + beta*C. +* + DO 90 J = 1,N + IF (BETA.EQ.ZERO) THEN + DO 50 I = 1,M + C(I,J) = ZERO + 50 CONTINUE + ELSE IF (BETA.NE.ONE) THEN + DO 60 I = 1,M + C(I,J) = BETA*C(I,J) + 60 CONTINUE + END IF + DO 80 L = 1,K + IF (B(L,J).NE.ZERO) THEN + TEMP = ALPHA*B(L,J) + DO 70 I = 1,M + C(I,J) = C(I,J) + TEMP*A(I,L) + 70 CONTINUE + END IF + 80 CONTINUE + 90 CONTINUE + ELSE IF (CONJA) THEN +* +* Form C := alpha*conjg( A' )*B + beta*C. +* + DO 120 J = 1,N + DO 110 I = 1,M + TEMP = ZERO + DO 100 L = 1,K + TEMP = TEMP + DCONJG(A(L,I))*B(L,J) + 100 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 110 CONTINUE + 120 CONTINUE + ELSE +* +* Form C := alpha*A'*B + beta*C +* + DO 150 J = 1,N + DO 140 I = 1,M + TEMP = ZERO + DO 130 L = 1,K + TEMP = TEMP + A(L,I)*B(L,J) + 130 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 140 CONTINUE + 150 CONTINUE + END IF + ELSE IF (NOTA) THEN + IF (CONJB) THEN +* +* Form C := alpha*A*conjg( B' ) + beta*C. +* + DO 200 J = 1,N + IF (BETA.EQ.ZERO) THEN + DO 160 I = 1,M + C(I,J) = ZERO + 160 CONTINUE + ELSE IF (BETA.NE.ONE) THEN + DO 170 I = 1,M + C(I,J) = BETA*C(I,J) + 170 CONTINUE + END IF + DO 190 L = 1,K + IF (B(J,L).NE.ZERO) THEN + TEMP = ALPHA*DCONJG(B(J,L)) + DO 180 I = 1,M + C(I,J) = C(I,J) + TEMP*A(I,L) + 180 CONTINUE + END IF + 190 CONTINUE + 200 CONTINUE + ELSE +* +* Form C := alpha*A*B' + beta*C +* + DO 250 J = 1,N + IF (BETA.EQ.ZERO) THEN + DO 210 I = 1,M + C(I,J) = ZERO + 210 CONTINUE + ELSE IF (BETA.NE.ONE) THEN + DO 220 I = 1,M + C(I,J) = BETA*C(I,J) + 220 CONTINUE + END IF + DO 240 L = 1,K + IF (B(J,L).NE.ZERO) THEN + TEMP = ALPHA*B(J,L) + DO 230 I = 1,M + C(I,J) = C(I,J) + TEMP*A(I,L) + 230 CONTINUE + END IF + 240 CONTINUE + 250 CONTINUE + END IF + ELSE IF (CONJA) THEN + IF (CONJB) THEN +* +* Form C := alpha*conjg( A' )*conjg( B' ) + beta*C. +* + DO 280 J = 1,N + DO 270 I = 1,M + TEMP = ZERO + DO 260 L = 1,K + TEMP = TEMP + DCONJG(A(L,I))*DCONJG(B(J,L)) + 260 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 270 CONTINUE + 280 CONTINUE + ELSE +* +* Form C := alpha*conjg( A' )*B' + beta*C +* + DO 310 J = 1,N + DO 300 I = 1,M + TEMP = ZERO + DO 290 L = 1,K + TEMP = TEMP + DCONJG(A(L,I))*B(J,L) + 290 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 300 CONTINUE + 310 CONTINUE + END IF + ELSE + IF (CONJB) THEN +* +* Form C := alpha*A'*conjg( B' ) + beta*C +* + DO 340 J = 1,N + DO 330 I = 1,M + TEMP = ZERO + DO 320 L = 1,K + TEMP = TEMP + A(L,I)*DCONJG(B(J,L)) + 320 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 330 CONTINUE + 340 CONTINUE + ELSE +* +* Form C := alpha*A'*B' + beta*C +* + DO 370 J = 1,N + DO 360 I = 1,M + TEMP = ZERO + DO 350 L = 1,K + TEMP = TEMP + A(L,I)*B(J,L) + 350 CONTINUE + IF (BETA.EQ.ZERO) THEN + C(I,J) = ALPHA*TEMP + ELSE + C(I,J) = ALPHA*TEMP + BETA*C(I,J) + END IF + 360 CONTINUE + 370 CONTINUE + END IF + END IF +* + RETURN +* +* End of ZGEMM . 
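+*
+*     Editorial illustration (not part of the original GotoBLAS2
+*     source): a minimal call sketch for this GEMM-style interface,
+*     with hypothetical extents M = 2, N = 4, K = 3 declared by the
+*     caller.  ZGEMM3MF takes the identical argument list.
+*
+*        COMPLEX*16 A( 2, 3 ), B( 3, 4 ), C( 2, 4 )
+*        ...
+*        CALL ZGEMMF( 'N', 'N', 2, 4, 3, (1.0D+0,0.0D+0), A, 2,
+*                     B, 3, (0.0D+0,0.0D+0), C, 2 )
+*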
+* + END diff --git a/reference/zgemvf.f b/reference/zgemvf.f new file mode 100644 index 0000000000..10d2d7413c --- /dev/null +++ b/reference/zgemvf.f @@ -0,0 +1,332 @@ + SUBROUTINE ZGEMVF ( TRANS, M, N, ALPHA, A, LDA, X, INCX, + $ BETA, Y, INCY ) +* .. Scalar Arguments .. + DOUBLE COMPLEX ALPHA, BETA + INTEGER INCX, INCY, LDA, M, N + CHARACTER*1 TRANS +* .. Array Arguments .. + DOUBLE COMPLEX A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* CGEMV performs one of the matrix-vector operations +* +* y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y, or +* +* y := alpha*conjg( A' )*x + beta*y, +* +* where alpha and beta are scalars, x and y are vectors and A is an +* m by n matrix. +* +* Parameters +* ========== +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' y := alpha*A*x + beta*y. +* +* TRANS = 'T' or 't' y := alpha*A'*x + beta*y. +* +* TRANS = 'C' or 'c' y := alpha*conjg( A' )*x + beta*y. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix A. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX array of DIMENSION ( LDA, n ). +* Before entry, the leading m by n part of the array A must +* contain the matrix of coefficients. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, m ). +* Unchanged on exit. +* +* X - COMPLEX array of DIMENSION at least +* ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n' +* and at least +* ( 1 + ( m - 1 )*abs( INCX ) ) otherwise. +* Before entry, the incremented array X must contain the +* vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA - COMPLEX . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then Y need not be set on input. +* Unchanged on exit. +* +* Y - COMPLEX array of DIMENSION at least +* ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n' +* and at least +* ( 1 + ( n - 1 )*abs( INCY ) ) otherwise. +* Before entry with BETA non-zero, the incremented array Y +* must contain the vector y. On exit, Y is overwritten by the +* updated vector y. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + DOUBLE COMPLEX ONE + PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ) ) + DOUBLE COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. Local Scalars .. + DOUBLE COMPLEX TEMP + INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY, LENX, LENY + LOGICAL NOCONJ, NOTRANS, XCONJ +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC DCONJG, MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. 
+ $ .NOT.LSAME( TRANS, 'R' ).AND. + $ .NOT.LSAME( TRANS, 'C' ).AND. + $ .NOT.LSAME( TRANS, 'O' ).AND. + $ .NOT.LSAME( TRANS, 'U' ).AND. + $ .NOT.LSAME( TRANS, 'S' ).AND. + $ .NOT.LSAME( TRANS, 'D' ) )THEN + INFO = 1 + ELSE IF( M.LT.0 )THEN + INFO = 2 + ELSE IF( N.LT.0 )THEN + INFO = 3 + ELSE IF( LDA.LT.MAX( 1, M ) )THEN + INFO = 6 + ELSE IF( INCX.EQ.0 )THEN + INFO = 8 + ELSE IF( INCY.EQ.0 )THEN + INFO = 11 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'CGEMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. + $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* + NOCONJ = (LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) + $ .OR. LSAME( TRANS, 'O' ) .OR. LSAME( TRANS, 'U' )) + + NOTRANS = (LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'R' ) + $ .OR. LSAME( TRANS, 'O' ) .OR. LSAME( TRANS, 'S' )) + + XCONJ = (LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) + $ .OR. LSAME( TRANS, 'R' ) .OR. LSAME( TRANS, 'C' )) +* +* Set LENX and LENY, the lengths of the vectors x and y, and set +* up the start points in X and Y. +* + IF(NOTRANS)THEN + LENX = N + LENY = M + ELSE + LENX = M + LENY = N + END IF + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( LENX - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( LENY - 1 )*INCY + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through A. +* +* First form y := beta*y. +* + IF( BETA.NE.ONE )THEN + IF( INCY.EQ.1 )THEN + IF( BETA.EQ.ZERO )THEN + DO 10, I = 1, LENY + Y( I ) = ZERO + 10 CONTINUE + ELSE + DO 20, I = 1, LENY + Y( I ) = BETA*Y( I ) + 20 CONTINUE + END IF + ELSE + IY = KY + IF( BETA.EQ.ZERO )THEN + DO 30, I = 1, LENY + Y( IY ) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40, I = 1, LENY + Y( IY ) = BETA*Y( IY ) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF( ALPHA.EQ.ZERO ) + $ RETURN + + IF(NOTRANS)THEN +* +* Form y := alpha*A*x + y. +* + JX = KX + IF( INCY.EQ.1 )THEN + DO 60, J = 1, N + IF( X( JX ).NE.ZERO )THEN + IF (XCONJ) THEN + TEMP = ALPHA*X( JX ) + ELSE + TEMP = ALPHA*DCONJG(X( JX )) + ENDIF + IF (NOCONJ) THEN + DO 50, I = 1, M + Y( I ) = Y( I ) + TEMP*A( I, J ) + 50 CONTINUE + ELSE + DO 55, I = 1, M + Y( I ) = Y( I ) + TEMP*DCONJG(A( I, J )) + 55 CONTINUE + ENDIF + END IF + JX = JX + INCX + 60 CONTINUE + ELSE + DO 80, J = 1, N + IF( X( JX ).NE.ZERO )THEN + IF (XCONJ) THEN + TEMP = ALPHA*X( JX ) + ELSE + TEMP = ALPHA*DCONJG(X( JX )) + ENDIF + IY = KY + IF (NOCONJ) THEN + DO 70, I = 1, M + Y( IY ) = Y( IY ) + TEMP*A( I, J ) + IY = IY + INCY + 70 CONTINUE + ELSE + DO 75, I = 1, M + Y( IY ) = Y( IY ) + TEMP* DCONJG(A( I, J )) + IY = IY + INCY + 75 CONTINUE + ENDIF + END IF + JX = JX + INCX + 80 CONTINUE + END IF + ELSE +* +* Form y := alpha*A'*x + y or y := alpha*conjg( A' )*x + y. 
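+*
+*     (Editorial note: in this branch each y(j) accumulates a dot
+*     product over column j of A.  The flags NOCONJ and XCONJ set
+*     above record whether A and x, respectively, are used without
+*     conjugation, which is how the non-standard TRANS values
+*     accepted by this extended routine are realised alongside the
+*     usual 'T' and 'C' cases.)
+*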
+* + JY = KY + IF( INCX.EQ.1 )THEN + DO 110, J = 1, N + TEMP = ZERO + IF( NOCONJ )THEN + DO 90, I = 1, M + IF (XCONJ) THEN + TEMP = TEMP + A( I, J )*X( I ) + ELSE + TEMP = TEMP + A( I, J )*DCONJG(X( I )) + ENDIF + 90 CONTINUE + ELSE + DO 100, I = 1, M + IF (XCONJ) THEN + TEMP = TEMP + DCONJG( A( I, J ) )*X( I ) + ELSE + TEMP = TEMP + DCONJG( A( I, J ) )*DCONJG(X( I )) + ENDIF + 100 CONTINUE + END IF + Y( JY ) = Y( JY ) + ALPHA*TEMP + JY = JY + INCY + 110 CONTINUE + ELSE + DO 140, J = 1, N + TEMP = ZERO + IX = KX + IF( NOCONJ )THEN + DO 120, I = 1, M + IF (XCONJ) THEN + TEMP = TEMP + A( I, J )*X( IX ) + ELSE + TEMP = TEMP + A( I, J )*DCONJG(X( IX )) + ENDIF + IX = IX + INCX + 120 CONTINUE + ELSE + DO 130, I = 1, M + IF (XCONJ) THEN + TEMP = TEMP + DCONJG( A( I, J ) )*X( IX ) + ELSE + TEMP = TEMP + DCONJG( A( I, J ) )*DCONJG(X( IX )) + ENDIF + IX = IX + INCX + 130 CONTINUE + END IF + Y( JY ) = Y( JY ) + ALPHA*TEMP + JY = JY + INCY + 140 CONTINUE + END IF + END IF +* + RETURN +* +* End of CGEMV . +* + END + diff --git a/reference/zgercf.f b/reference/zgercf.f new file mode 100644 index 0000000000..47f8a93071 --- /dev/null +++ b/reference/zgercf.f @@ -0,0 +1,157 @@ + SUBROUTINE ZGERCF ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA ) +* .. Scalar Arguments .. + COMPLEX*16 ALPHA + INTEGER INCX, INCY, LDA, M, N +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* ZGERC performs the rank 1 operation +* +* A := alpha*x*conjg( y' ) + A, +* +* where alpha is a scalar, x is an m element vector, y is an n element +* vector and A is an m by n matrix. +* +* Parameters +* ========== +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix A. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX*16 . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X - COMPLEX*16 array of dimension at least +* ( 1 + ( m - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the m +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* Y - COMPLEX*16 array of dimension at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. +* Unchanged on exit. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* A - COMPLEX*16 array of DIMENSION ( LDA, n ). +* Before entry, the leading m by n part of the array A must +* contain the matrix of coefficients. On exit, A is +* overwritten by the updated matrix. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. Local Scalars .. + COMPLEX*16 TEMP + INTEGER I, INFO, IX, J, JY, KX +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC DCONJG, MAX +* .. +* .. Executable Statements .. 
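+*
+*     Editorial illustration (not part of the original source): a
+*     typical rank-1 update call with unit strides; the extents below
+*     are hypothetical.
+*
+*        COMPLEX*16 A( 5, 3 ), X( 5 ), Y( 3 )
+*        ...
+*        CALL ZGERCF( 5, 3, (1.0D+0,0.0D+0), X, 1, Y, 1, A, 5 )
+*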
+* +* Test the input parameters. +* + INFO = 0 + IF ( M.LT.0 )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 5 + ELSE IF( INCY.EQ.0 )THEN + INFO = 7 + ELSE IF( LDA.LT.MAX( 1, M ) )THEN + INFO = 9 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'ZGERC ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) + $ RETURN +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through A. +* + IF( INCY.GT.0 )THEN + JY = 1 + ELSE + JY = 1 - ( N - 1 )*INCY + END IF + IF( INCX.EQ.1 )THEN + DO 20, J = 1, N + IF( Y( JY ).NE.ZERO )THEN + TEMP = ALPHA*DCONJG( Y( JY ) ) + DO 10, I = 1, M + A( I, J ) = A( I, J ) + X( I )*TEMP + 10 CONTINUE + END IF + JY = JY + INCY + 20 CONTINUE + ELSE + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( M - 1 )*INCX + END IF + DO 40, J = 1, N + IF( Y( JY ).NE.ZERO )THEN + TEMP = ALPHA*DCONJG( Y( JY ) ) + IX = KX + DO 30, I = 1, M + A( I, J ) = A( I, J ) + X( IX )*TEMP + IX = IX + INCX + 30 CONTINUE + END IF + JY = JY + INCY + 40 CONTINUE + END IF +* + RETURN +* +* End of ZGERC . +* + END diff --git a/reference/zgeruf.f b/reference/zgeruf.f new file mode 100644 index 0000000000..619f778cc8 --- /dev/null +++ b/reference/zgeruf.f @@ -0,0 +1,157 @@ + SUBROUTINE ZGERUF ( M, N, ALPHA, X, INCX, Y, INCY, A, LDA ) +* .. Scalar Arguments .. + COMPLEX*16 ALPHA + INTEGER INCX, INCY, LDA, M, N +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* ZGERU performs the rank 1 operation +* +* A := alpha*x*y' + A, +* +* where alpha is a scalar, x is an m element vector, y is an n element +* vector and A is an m by n matrix. +* +* Parameters +* ========== +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix A. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX*16 . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X - COMPLEX*16 array of dimension at least +* ( 1 + ( m - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the m +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* Y - COMPLEX*16 array of dimension at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. +* Unchanged on exit. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* A - COMPLEX*16 array of DIMENSION ( LDA, n ). +* Before entry, the leading m by n part of the array A must +* contain the matrix of coefficients. On exit, A is +* overwritten by the updated matrix. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. Local Scalars .. + COMPLEX*16 TEMP + INTEGER I, INFO, IX, J, JY, KX +* .. 
External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( M.LT.0 )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 5 + ELSE IF( INCY.EQ.0 )THEN + INFO = 7 + ELSE IF( LDA.LT.MAX( 1, M ) )THEN + INFO = 9 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'ZGERU ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) + $ RETURN +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through A. +* + IF( INCY.GT.0 )THEN + JY = 1 + ELSE + JY = 1 - ( N - 1 )*INCY + END IF + IF( INCX.EQ.1 )THEN + DO 20, J = 1, N + IF( Y( JY ).NE.ZERO )THEN + TEMP = ALPHA*Y( JY ) + DO 10, I = 1, M + A( I, J ) = A( I, J ) + X( I )*TEMP + 10 CONTINUE + END IF + JY = JY + INCY + 20 CONTINUE + ELSE + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( M - 1 )*INCX + END IF + DO 40, J = 1, N + IF( Y( JY ).NE.ZERO )THEN + TEMP = ALPHA*Y( JY ) + IX = KX + DO 30, I = 1, M + A( I, J ) = A( I, J ) + X( IX )*TEMP + IX = IX + INCX + 30 CONTINUE + END IF + JY = JY + INCY + 40 CONTINUE + END IF +* + RETURN +* +* End of ZGERU . +* + END diff --git a/reference/zgesvf.f b/reference/zgesvf.f new file mode 100644 index 0000000000..d341dd790b --- /dev/null +++ b/reference/zgesvf.f @@ -0,0 +1,107 @@ + SUBROUTINE ZGESVF( N, NRHS, A, LDA, IPIV, B, LDB, INFO ) +* +* -- LAPACK driver routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* November 2006 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LDB, N, NRHS +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + COMPLEX*16 A( LDA, * ), B( LDB, * ) +* .. +* +* Purpose +* ======= +* +* ZGESV computes the solution to a complex system of linear equations +* A * X = B, +* where A is an N-by-N matrix and X and B are N-by-NRHS matrices. +* +* The LU decomposition with partial pivoting and row interchanges is +* used to factor A as +* A = P * L * U, +* where P is a permutation matrix, L is unit lower triangular, and U is +* upper triangular. The factored form of A is then used to solve the +* system of equations A * X = B. +* +* Arguments +* ========= +* +* N (input) INTEGER +* The number of linear equations, i.e., the order of the +* matrix A. N >= 0. +* +* NRHS (input) INTEGER +* The number of right hand sides, i.e., the number of columns +* of the matrix B. NRHS >= 0. +* +* A (input/output) COMPLEX*16 array, dimension (LDA,N) +* On entry, the N-by-N coefficient matrix A. +* On exit, the factors L and U from the factorization +* A = P*L*U; the unit diagonal elements of L are not stored. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* IPIV (output) INTEGER array, dimension (N) +* The pivot indices that define the permutation matrix P; +* row i of the matrix was interchanged with row IPIV(i). +* +* B (input/output) COMPLEX*16 array, dimension (LDB,NRHS) +* On entry, the N-by-NRHS matrix of right hand side matrix B. +* On exit, if INFO = 0, the N-by-NRHS solution matrix X. +* +* LDB (input) INTEGER +* The leading dimension of the array B. LDB >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, U(i,i) is exactly zero. The factorization +* has been completed, but the factor U is exactly +* singular, so the solution could not be computed. 
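+*
+*  Editorial illustration (not part of the original source): solving
+*  one hypothetical 3-by-3 system with a single right hand side.
+*
+*     COMPLEX*16 A( 3, 3 ), B( 3, 1 )
+*     INTEGER    IPIV( 3 ), INFO
+*     ...
+*     CALL ZGESVF( 3, 1, A, 3, IPIV, B, 3, INFO )
+*
+*  On return with INFO = 0, B holds the solution X and A holds the
+*  L and U factors.
+*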
+* +* ===================================================================== +* +* .. External Subroutines .. + EXTERNAL XERBLA, ZGETRF, ZGETRS +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( N.LT.0 ) THEN + INFO = -1 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + ELSE IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -7 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZGESV ', -INFO ) + RETURN + END IF +* +* Compute the LU factorization of A. +* + CALL ZGETRF( N, N, A, LDA, IPIV, INFO ) + IF( INFO.EQ.0 ) THEN +* +* Solve the system A*X = B, overwriting B with X. +* + CALL ZGETRS( 'No transpose', N, NRHS, A, LDA, IPIV, B, LDB, + $ INFO ) + END IF + RETURN +* +* End of ZGESV +* + END diff --git a/reference/zgetf2f.f b/reference/zgetf2f.f new file mode 100644 index 0000000000..6b8bc39525 --- /dev/null +++ b/reference/zgetf2f.f @@ -0,0 +1,136 @@ + SUBROUTINE ZGETF2F( M, N, A, LDA, IPIV, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* September 30, 1994 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, M, N +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + COMPLEX*16 A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* ZGETF2 computes an LU factorization of a general m-by-n matrix A +* using partial pivoting with row interchanges. +* +* The factorization has the form +* A = P * L * U +* where P is a permutation matrix, L is lower triangular with unit +* diagonal elements (lower trapezoidal if m > n), and U is upper +* triangular (upper trapezoidal if m < n). +* +* This is the right-looking Level 2 BLAS version of the algorithm. +* +* Arguments +* ========= +* +* M (input) INTEGER +* The number of rows of the matrix A. M >= 0. +* +* N (input) INTEGER +* The number of columns of the matrix A. N >= 0. +* +* A (input/output) COMPLEX*16 array, dimension (LDA,N) +* On entry, the m by n matrix to be factored. +* On exit, the factors L and U from the factorization +* A = P*L*U; the unit diagonal elements of L are not stored. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,M). +* +* IPIV (output) INTEGER array, dimension (min(M,N)) +* The pivot indices; for 1 <= i <= min(M,N), row i of the +* matrix was interchanged with row IPIV(i). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -k, the k-th argument had an illegal value +* > 0: if INFO = k, U(k,k) is exactly zero. The factorization +* has been completed, but the factor U is exactly +* singular, and division by zero will occur if it is used +* to solve a system of equations. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX*16 ONE, ZERO + PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ), + $ ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + INTEGER J, JP +* .. +* .. External Functions .. + INTEGER IZAMAX + EXTERNAL IZAMAX +* .. +* .. External Subroutines .. + EXTERNAL XERBLA, ZGERU, ZSCAL, ZSWAP +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZGETF2', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( M.EQ.0 .OR. 
N.EQ.0 ) + $ RETURN +* + DO 10 J = 1, MIN( M, N ) +* +* Find pivot and test for singularity. +* + JP = J - 1 + IZAMAX( M-J+1, A( J, J ), 1 ) + IPIV( J ) = JP + IF( A( JP, J ).NE.ZERO ) THEN +* +* Apply the interchange to columns 1:N. +* + IF( JP.NE.J ) + $ CALL ZSWAP( N, A( J, 1 ), LDA, A( JP, 1 ), LDA ) +* +* Compute elements J+1:M of J-th column. +* + IF( J.LT.M ) + $ CALL ZSCAL( M-J, ONE / A( J, J ), A( J+1, J ), 1 ) +* + ELSE IF( INFO.EQ.0 ) THEN +* + INFO = J + END IF +* + IF( J.LT.MIN( M, N ) ) THEN +* +* Update trailing submatrix. +* + CALL ZGERU( M-J, N-J, -ONE, A( J+1, J ), 1, A( J, J+1 ), + $ LDA, A( J+1, J+1 ), LDA ) + END IF + 10 CONTINUE + RETURN +* +* End of ZGETF2 +* + END diff --git a/reference/zgetrff.f b/reference/zgetrff.f new file mode 100644 index 0000000000..bfb438d696 --- /dev/null +++ b/reference/zgetrff.f @@ -0,0 +1,156 @@ + SUBROUTINE ZGETRFF( M, N, A, LDA, IPIV, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* September 30, 1994 +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, M, N +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + COMPLEX*16 A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* ZGETRF computes an LU factorization of a general M-by-N matrix A +* using partial pivoting with row interchanges. +* +* The factorization has the form +* A = P * L * U +* where P is a permutation matrix, L is lower triangular with unit +* diagonal elements (lower trapezoidal if m > n), and U is upper +* triangular (upper trapezoidal if m < n). +* +* This is the right-looking Level 3 BLAS version of the algorithm. +* +* Arguments +* ========= +* +* M (input) INTEGER +* The number of rows of the matrix A. M >= 0. +* +* N (input) INTEGER +* The number of columns of the matrix A. N >= 0. +* +* A (input/output) COMPLEX*16 array, dimension (LDA,N) +* On entry, the M-by-N matrix to be factored. +* On exit, the factors L and U from the factorization +* A = P*L*U; the unit diagonal elements of L are not stored. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,M). +* +* IPIV (output) INTEGER array, dimension (min(M,N)) +* The pivot indices; for 1 <= i <= min(M,N), row i of the +* matrix was interchanged with row IPIV(i). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, U(i,i) is exactly zero. The factorization +* has been completed, but the factor U is exactly +* singular, and division by zero will occur if it is used +* to solve a system of equations. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX*16 ONE + PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + INTEGER I, IINFO, J, JB, NB +* .. +* .. External Subroutines .. + EXTERNAL XERBLA, ZGEMM, ZGETF2, ZLASWP, ZTRSM +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZGETRF', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( M.EQ.0 .OR. N.EQ.0 ) + $ RETURN +* +* Determine the block size for this environment. +* + NB = 64 + IF( NB.LE.1 .OR. NB.GE.MIN( M, N ) ) THEN +* +* Use unblocked code. 
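+*     (Editorial note: this reference version fixes the block size at
+*     NB = 64 rather than querying ILAENV as the stock LAPACK ZGETRF
+*     does; whenever NB is not smaller than MIN(M,N), the unblocked
+*     ZGETF2 call below factors the whole matrix.)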
+* + CALL ZGETF2( M, N, A, LDA, IPIV, INFO ) + ELSE +* +* Use blocked code. +* + DO 20 J = 1, MIN( M, N ), NB + JB = MIN( MIN( M, N )-J+1, NB ) +* +* Factor diagonal and subdiagonal blocks and test for exact +* singularity. +* + CALL ZGETF2( M-J+1, JB, A( J, J ), LDA, IPIV( J ), IINFO ) +* +* Adjust INFO and the pivot indices. +* + IF( INFO.EQ.0 .AND. IINFO.GT.0 ) + $ INFO = IINFO + J - 1 + DO 10 I = J, MIN( M, J+JB-1 ) + IPIV( I ) = J - 1 + IPIV( I ) + 10 CONTINUE +* +* Apply interchanges to columns 1:J-1. +* + CALL ZLASWP( J-1, A, LDA, J, J+JB-1, IPIV, 1 ) +* + IF( J+JB.LE.N ) THEN +* +* Apply interchanges to columns J+JB:N. +* + CALL ZLASWP( N-J-JB+1, A( 1, J+JB ), LDA, J, J+JB-1, + $ IPIV, 1 ) +* +* Compute block row of U. +* + CALL ZTRSM( 'Left', 'Lower', 'No transpose', 'Unit', JB, + $ N-J-JB+1, ONE, A( J, J ), LDA, A( J, J+JB ), + $ LDA ) + IF( J+JB.LE.M ) THEN +* +* Update trailing submatrix. +* + CALL ZGEMM( 'No transpose', 'No transpose', M-J-JB+1, + $ N-J-JB+1, JB, -ONE, A( J+JB, J ), LDA, + $ A( J, J+JB ), LDA, ONE, A( J+JB, J+JB ), + $ LDA ) + END IF + END IF + 20 CONTINUE + END IF + RETURN +* +* End of ZGETRF +* + END diff --git a/reference/zgetrsf.f b/reference/zgetrsf.f new file mode 100644 index 0000000000..823798b529 --- /dev/null +++ b/reference/zgetrsf.f @@ -0,0 +1,150 @@ + SUBROUTINE ZGETRSF( TRANS, N, NRHS, A, LDA, IPIV, B, LDB, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* September 30, 1994 +* +* .. Scalar Arguments .. + CHARACTER TRANS + INTEGER INFO, LDA, LDB, N, NRHS +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + COMPLEX*16 A( LDA, * ), B( LDB, * ) +* .. +* +* Purpose +* ======= +* +* ZGETRS solves a system of linear equations +* A * X = B, A**T * X = B, or A**H * X = B +* with a general N-by-N matrix A using the LU factorization computed +* by ZGETRF. +* +* Arguments +* ========= +* +* TRANS (input) CHARACTER*1 +* Specifies the form of the system of equations: +* = 'N': A * X = B (No transpose) +* = 'T': A**T * X = B (Transpose) +* = 'C': A**H * X = B (Conjugate transpose) +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* NRHS (input) INTEGER +* The number of right hand sides, i.e., the number of columns +* of the matrix B. NRHS >= 0. +* +* A (input) COMPLEX*16 array, dimension (LDA,N) +* The factors L and U from the factorization A = P*L*U +* as computed by ZGETRF. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* IPIV (input) INTEGER array, dimension (N) +* The pivot indices from ZGETRF; for 1<=i<=N, row i of the +* matrix was interchanged with row IPIV(i). +* +* B (input/output) COMPLEX*16 array, dimension (LDB,NRHS) +* On entry, the right hand side matrix B. +* On exit, the solution matrix X. +* +* LDB (input) INTEGER +* The leading dimension of the array B. LDB >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX*16 ONE + PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL NOTRAN +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA, ZLASWP, ZTRSM +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. 
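+*
+*     (Editorial note: in addition to 'N', 'T' and 'C', this version
+*     accepts the non-standard TRANS = 'R'; it follows the NOTRAN
+*     path below, with TRANS passed straight through to ZTRSM.
+*     A typical call after factoring with ZGETRFF is
+*
+*        CALL ZGETRSF( 'N', N, NRHS, A, LDA, IPIV, B, LDB, INFO )
+*
+*     using the factored A and pivot vector IPIV from ZGETRFF.)
+*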
+* + INFO = 0 + NOTRAN = LSAME( TRANS, 'N' ) .OR. LSAME(TRANS, 'R') + IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. + $ LSAME( TRANS, 'C' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( NRHS.LT.0 ) THEN + INFO = -3 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -5 + ELSE IF( LDB.LT.MAX( 1, N ) ) THEN + INFO = -8 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZGETRS', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 .OR. NRHS.EQ.0 ) + $ RETURN +* + IF( NOTRAN ) THEN +* +* Solve A * X = B. +* +* Apply row interchanges to the right hand sides. +* + CALL ZLASWP( NRHS, B, LDB, 1, N, IPIV, 1 ) +* +* Solve L*X = B, overwriting B with X. +* + CALL ZTRSM( 'Left', 'Lower', TRANS, 'Unit', N, NRHS, + $ ONE, A, LDA, B, LDB ) +* +* Solve U*X = B, overwriting B with X. +* + CALL ZTRSM( 'Left', 'Upper', TRANS, 'Non-unit', N, + $ NRHS, ONE, A, LDA, B, LDB ) + ELSE +* +* Solve A**T * X = B or A**H * X = B. +* +* Solve U'*X = B, overwriting B with X. +* + CALL ZTRSM( 'Left', 'Upper', TRANS, 'Non-unit', N, NRHS, ONE, + $ A, LDA, B, LDB ) +* +* Solve L'*X = B, overwriting B with X. +* + CALL ZTRSM( 'Left', 'Lower', TRANS, 'Unit', N, NRHS, ONE, A, + $ LDA, B, LDB ) +* +* Apply row interchanges to the solution vectors. +* + CALL ZLASWP( NRHS, B, LDB, 1, N, IPIV, -1 ) + END IF +* + RETURN +* +* End of ZGETRS +* + END diff --git a/reference/zhbmvf.f b/reference/zhbmvf.f new file mode 100644 index 0000000000..875c3e0341 --- /dev/null +++ b/reference/zhbmvf.f @@ -0,0 +1,406 @@ + SUBROUTINE ZHBMVF( UPLO, N, K, ALPHA, A, LDA, X, INCX, + $ BETA, Y, INCY ) +* .. Scalar Arguments .. + COMPLEX*16 ALPHA, BETA + INTEGER INCX, INCY, K, LDA, N + CHARACTER*1 UPLO +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* ZHBMV performs the matrix-vector operation +* +* y := alpha*A*x + beta*y, +* +* where alpha and beta are scalars, x and y are n element vectors and +* A is an n by n hermitian band matrix, with k super-diagonals. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the band matrix A is being supplied as +* follows: +* +* UPLO = 'U' or 'u' The upper triangular part of A is +* being supplied. +* +* UPLO = 'L' or 'l' The lower triangular part of A is +* being supplied. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry, K specifies the number of super-diagonals of the +* matrix A. K must satisfy 0 .le. K. +* Unchanged on exit. +* +* ALPHA - COMPLEX*16 . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX*16 array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) +* by n part of the array A must contain the upper triangular +* band part of the hermitian matrix, supplied column by +* column, with the leading diagonal of the matrix in row +* ( k + 1 ) of the array, the first super-diagonal starting at +* position 2 in row k, and so on. The top left k by k triangle +* of the array A is not referenced. 
+* The following program segment will transfer the upper +* triangular part of a hermitian band matrix from conventional +* full matrix storage to band storage: +* +* DO 20, J = 1, N +* M = K + 1 - J +* DO 10, I = MAX( 1, J - K ), J +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) +* by n part of the array A must contain the lower triangular +* band part of the hermitian matrix, supplied column by +* column, with the leading diagonal of the matrix in row 1 of +* the array, the first sub-diagonal starting at position 1 in +* row 2, and so on. The bottom right k by k triangle of the +* array A is not referenced. +* The following program segment will transfer the lower +* triangular part of a hermitian band matrix from conventional +* full matrix storage to band storage: +* +* DO 20, J = 1, N +* M = 1 - J +* DO 10, I = J, MIN( N, J + K ) +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Note that the imaginary parts of the diagonal elements need +* not be set and are assumed to be zero. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* ( k + 1 ). +* Unchanged on exit. +* +* X - COMPLEX*16 array of DIMENSION at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the +* vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA - COMPLEX*16 . +* On entry, BETA specifies the scalar beta. +* Unchanged on exit. +* +* Y - COMPLEX*16 array of DIMENSION at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the +* vector y. On exit, Y is overwritten by the updated vector y. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX*16 ONE + PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. Local Scalars .. + COMPLEX*16 TEMP1, TEMP2 + INTEGER I, INFO, IX, IY, J, JX, JY, KPLUS1, KX, KY, L +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC DCONJG, MAX, MIN, DBLE +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ).AND. + $ .NOT.LSAME( UPLO, 'V' ).AND. + $ .NOT.LSAME( UPLO, 'M' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( K.LT.0 )THEN + INFO = 3 + ELSE IF( LDA.LT.( K + 1 ) )THEN + INFO = 6 + ELSE IF( INCX.EQ.0 )THEN + INFO = 8 + ELSE IF( INCY.EQ.0 )THEN + INFO = 11 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'ZHBMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* Set up the start points in X and Y. +* + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( N - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( N - 1 )*INCY + END IF +* +* Start the operations. 
In this version the elements of the array A +* are accessed sequentially with one pass through A. +* +* First form y := beta*y. +* + IF( BETA.NE.ONE )THEN + IF( INCY.EQ.1 )THEN + IF( BETA.EQ.ZERO )THEN + DO 10, I = 1, N + Y( I ) = ZERO + 10 CONTINUE + ELSE + DO 20, I = 1, N + Y( I ) = BETA*Y( I ) + 20 CONTINUE + END IF + ELSE + IY = KY + IF( BETA.EQ.ZERO )THEN + DO 30, I = 1, N + Y( IY ) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40, I = 1, N + Y( IY ) = BETA*Y( IY ) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF( ALPHA.EQ.ZERO ) + $ RETURN + + +* +* Form y when upper triangle of A is stored. +* + IF( LSAME( UPLO, 'U' ) )THEN + KPLUS1 = K + 1 + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 60, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + L = KPLUS1 - J + DO 50, I = MAX( 1, J - K ), J - 1 + Y( I ) = Y( I ) + TEMP1*A( L + I, J ) + TEMP2 = TEMP2 + DCONJG( A( L + I, J ) )*X( I ) + 50 CONTINUE + Y( J ) = Y( J ) + TEMP1*DBLE( A( KPLUS1, J ) ) + $ + ALPHA*TEMP2 + 60 CONTINUE + ELSE + JX = KX + JY = KY + DO 80, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + IX = KX + IY = KY + L = KPLUS1 - J + DO 70, I = MAX( 1, J - K ), J - 1 + Y( IY ) = Y( IY ) + TEMP1*A( L + I, J ) + TEMP2 = TEMP2 + DCONJG( A( L + I, J ) )*X( IX ) + IX = IX + INCX + IY = IY + INCY + 70 CONTINUE + Y( JY ) = Y( JY ) + TEMP1*DBLE( A( KPLUS1, J ) ) + $ + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + IF( J.GT.K )THEN + KX = KX + INCX + KY = KY + INCY + END IF + 80 CONTINUE + END IF + RETURN + ENDIF + +* +* Form y when lower triangle of A is stored. +* + IF( LSAME( UPLO, 'L' ) )THEN + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 100, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + Y( J ) = Y( J ) + TEMP1*DBLE( A( 1, J ) ) + L = 1 - J + DO 90, I = J + 1, MIN( N, J + K ) + Y( I ) = Y( I ) + TEMP1*A( L + I, J ) + TEMP2 = TEMP2 + DCONJG( A( L + I, J ) )*X( I ) + 90 CONTINUE + Y( J ) = Y( J ) + ALPHA*TEMP2 + 100 CONTINUE + ELSE + JX = KX + JY = KY + DO 120, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + Y( JY ) = Y( JY ) + TEMP1*DBLE( A( 1, J ) ) + L = 1 - J + IX = JX + IY = JY + DO 110, I = J + 1, MIN( N, J + K ) + IX = IX + INCX + IY = IY + INCY + Y( IY ) = Y( IY ) + TEMP1*A( L + I, J ) + TEMP2 = TEMP2 + DCONJG( A( L + I, J ) )*X( IX ) + 110 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + 120 CONTINUE + END IF + RETURN + END IF + + +* +* Form y when upper triangle of A is stored. +* + IF( LSAME( UPLO, 'V' ) )THEN + KPLUS1 = K + 1 + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 160, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + L = KPLUS1 - J + DO 150, I = MAX( 1, J - K ), J - 1 + Y( I ) = Y( I ) + TEMP1*DCONJG(A( L + I, J )) + TEMP2 = TEMP2 + A( L + I, J )*X( I ) + 150 CONTINUE + Y( J ) = Y( J ) + TEMP1*DBLE( A( KPLUS1, J ) ) + $ + ALPHA*TEMP2 + 160 CONTINUE + ELSE + JX = KX + JY = KY + DO 180, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + IX = KX + IY = KY + L = KPLUS1 - J + DO 170, I = MAX( 1, J - K ), J - 1 + Y( IY ) = Y( IY ) + TEMP1*DCONJG(A( L + I, J )) + TEMP2 = TEMP2 + A( L + I, J )*X( IX ) + IX = IX + INCX + IY = IY + INCY + 170 CONTINUE + Y( JY ) = Y( JY ) + TEMP1*DBLE( A( KPLUS1, J ) ) + $ + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + IF( J.GT.K )THEN + KX = KX + INCX + KY = KY + INCY + END IF + 180 CONTINUE + END IF + RETURN + ENDIF + +* +* Form y when lower triangle of A is stored. 
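+*     (Editorial note: this branch serves the non-standard UPLO = 'M'
+*     option accepted above.  It mirrors the UPLO = 'L' code except
+*     that the conjugation is applied to the other operand: the band
+*     element is conjugated where it multiplies TEMP1 and used as-is
+*     where it is accumulated into TEMP2.  The preceding 'V' branch
+*     relates to 'U' in the same way.)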
+* + IF( LSAME( UPLO, 'M' ) )THEN + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 200, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + Y( J ) = Y( J ) + TEMP1*DBLE( A( 1, J ) ) + L = 1 - J + DO 190, I = J + 1, MIN( N, J + K ) + Y( I ) = Y( I ) + TEMP1*DCONJG(A( L + I, J )) + TEMP2 = TEMP2 + A( L + I, J )*X( I ) + 190 CONTINUE + Y( J ) = Y( J ) + ALPHA*TEMP2 + 200 CONTINUE + ELSE + JX = KX + JY = KY + DO 220, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + Y( JY ) = Y( JY ) + TEMP1*DBLE( A( 1, J ) ) + L = 1 - J + IX = JX + IY = JY + DO 210, I = J + 1, MIN( N, J + K ) + IX = IX + INCX + IY = IY + INCY + Y( IY ) = Y( IY ) + TEMP1*DCONJG(A( L + I, J )) + TEMP2 = TEMP2 + A( L + I, J )*X( IX ) + 210 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + 220 CONTINUE + END IF + RETURN + END IF + + + +* + RETURN +* +* End of ZHBMV . +* + END diff --git a/reference/zhemm3mf.f b/reference/zhemm3mf.f new file mode 100644 index 0000000000..2247e2cb38 --- /dev/null +++ b/reference/zhemm3mf.f @@ -0,0 +1,304 @@ + SUBROUTINE ZHEMM3MF ( SIDE, UPLO, M, N, ALPHA, A, LDA, B, LDB, + $ BETA, C, LDC ) +* .. Scalar Arguments .. + CHARACTER*1 SIDE, UPLO + INTEGER M, N, LDA, LDB, LDC + COMPLEX*16 ALPHA, BETA +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), B( LDB, * ), C( LDC, * ) +* .. +* +* Purpose +* ======= +* +* ZHEMM performs one of the matrix-matrix operations +* +* C := alpha*A*B + beta*C, +* +* or +* +* C := alpha*B*A + beta*C, +* +* where alpha and beta are scalars, A is an hermitian matrix and B and +* C are m by n matrices. +* +* Parameters +* ========== +* +* SIDE - CHARACTER*1. +* On entry, SIDE specifies whether the hermitian matrix A +* appears on the left or right in the operation as follows: +* +* SIDE = 'L' or 'l' C := alpha*A*B + beta*C, +* +* SIDE = 'R' or 'r' C := alpha*B*A + beta*C, +* +* Unchanged on exit. +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the hermitian matrix A is to be +* referenced as follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of the +* hermitian matrix is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of the +* hermitian matrix is to be referenced. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix C. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix C. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX*16 . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX*16 array of DIMENSION ( LDA, ka ), where ka is +* m when SIDE = 'L' or 'l' and is n otherwise. +* Before entry with SIDE = 'L' or 'l', the m by m part of +* the array A must contain the hermitian matrix, such that +* when UPLO = 'U' or 'u', the leading m by m upper triangular +* part of the array A must contain the upper triangular part +* of the hermitian matrix and the strictly lower triangular +* part of A is not referenced, and when UPLO = 'L' or 'l', +* the leading m by m lower triangular part of the array A +* must contain the lower triangular part of the hermitian +* matrix and the strictly upper triangular part of A is not +* referenced. 
+* Before entry with SIDE = 'R' or 'r', the n by n part of +* the array A must contain the hermitian matrix, such that +* when UPLO = 'U' or 'u', the leading n by n upper triangular +* part of the array A must contain the upper triangular part +* of the hermitian matrix and the strictly lower triangular +* part of A is not referenced, and when UPLO = 'L' or 'l', +* the leading n by n lower triangular part of the array A +* must contain the lower triangular part of the hermitian +* matrix and the strictly upper triangular part of A is not +* referenced. +* Note that the imaginary parts of the diagonal elements need +* not be set, they are assumed to be zero. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When SIDE = 'L' or 'l' then +* LDA must be at least max( 1, m ), otherwise LDA must be at +* least max( 1, n ). +* Unchanged on exit. +* +* B - COMPLEX*16 array of DIMENSION ( LDB, n ). +* Before entry, the leading m by n part of the array B must +* contain the matrix B. +* Unchanged on exit. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. LDB must be at least +* max( 1, m ). +* Unchanged on exit. +* +* BETA - COMPLEX*16 . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then C need not be set on input. +* Unchanged on exit. +* +* C - COMPLEX*16 array of DIMENSION ( LDC, n ). +* Before entry, the leading m by n part of the array C must +* contain the matrix C, except when beta is zero, in which +* case C need not be set on entry. +* On exit, the array C is overwritten by the m by n updated +* matrix. +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC DCONJG, MAX, DBLE +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, INFO, J, K, NROWA + COMPLEX*16 TEMP1, TEMP2 +* .. Parameters .. + COMPLEX*16 ONE + PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Executable Statements .. +* +* Set NROWA as the number of rows of A. +* + IF( LSAME( SIDE, 'L' ) )THEN + NROWA = M + ELSE + NROWA = N + END IF + UPPER = LSAME( UPLO, 'U' ) +* +* Test the input parameters. +* + INFO = 0 + IF( ( .NOT.LSAME( SIDE, 'L' ) ).AND. + $ ( .NOT.LSAME( SIDE, 'R' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO, 'L' ) ) )THEN + INFO = 2 + ELSE IF( M .LT.0 )THEN + INFO = 3 + ELSE IF( N .LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 7 + ELSE IF( LDB.LT.MAX( 1, M ) )THEN + INFO = 9 + ELSE IF( LDC.LT.MAX( 1, M ) )THEN + INFO = 12 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'ZHEMM3M', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. + $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* And when alpha.eq.zero. 
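+*
+*     Editorial illustration (not part of the original source), with
+*     hypothetical extents M = 4, N = 2 and A applied from the left:
+*
+*        CALL ZHEMM3MF( 'L', 'U', 4, 2, ALPHA, A, 4, B, 4, BETA, C, 4 )
+*
+*     Here A is 4 by 4 hermitian with its upper triangle referenced,
+*     B and C are 4 by 2, and ALPHA, BETA are COMPLEX*16 scalars.
+*     ZHEMMF, which follows, takes the identical argument list.
+*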
+* + IF( ALPHA.EQ.ZERO )THEN + IF( BETA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, M + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40, J = 1, N + DO 30, I = 1, M + C( I, J ) = BETA*C( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + RETURN + END IF +* +* Start the operations. +* + IF( LSAME( SIDE, 'L' ) )THEN +* +* Form C := alpha*A*B + beta*C. +* + IF( UPPER )THEN + DO 70, J = 1, N + DO 60, I = 1, M + TEMP1 = ALPHA*B( I, J ) + TEMP2 = ZERO + DO 50, K = 1, I - 1 + C( K, J ) = C( K, J ) + TEMP1*A( K, I ) + TEMP2 = TEMP2 + + $ B( K, J )*DCONJG( A( K, I ) ) + 50 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = TEMP1*DBLE( A( I, I ) ) + + $ ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ TEMP1*DBLE( A( I, I ) ) + + $ ALPHA*TEMP2 + END IF + 60 CONTINUE + 70 CONTINUE + ELSE + DO 100, J = 1, N + DO 90, I = M, 1, -1 + TEMP1 = ALPHA*B( I, J ) + TEMP2 = ZERO + DO 80, K = I + 1, M + C( K, J ) = C( K, J ) + TEMP1*A( K, I ) + TEMP2 = TEMP2 + + $ B( K, J )*DCONJG( A( K, I ) ) + 80 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = TEMP1*DBLE( A( I, I ) ) + + $ ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ TEMP1*DBLE( A( I, I ) ) + + $ ALPHA*TEMP2 + END IF + 90 CONTINUE + 100 CONTINUE + END IF + ELSE +* +* Form C := alpha*B*A + beta*C. +* + DO 170, J = 1, N + TEMP1 = ALPHA*DBLE( A( J, J ) ) + IF( BETA.EQ.ZERO )THEN + DO 110, I = 1, M + C( I, J ) = TEMP1*B( I, J ) + 110 CONTINUE + ELSE + DO 120, I = 1, M + C( I, J ) = BETA*C( I, J ) + TEMP1*B( I, J ) + 120 CONTINUE + END IF + DO 140, K = 1, J - 1 + IF( UPPER )THEN + TEMP1 = ALPHA*A( K, J ) + ELSE + TEMP1 = ALPHA*DCONJG( A( J, K ) ) + END IF + DO 130, I = 1, M + C( I, J ) = C( I, J ) + TEMP1*B( I, K ) + 130 CONTINUE + 140 CONTINUE + DO 160, K = J + 1, N + IF( UPPER )THEN + TEMP1 = ALPHA*DCONJG( A( J, K ) ) + ELSE + TEMP1 = ALPHA*A( K, J ) + END IF + DO 150, I = 1, M + C( I, J ) = C( I, J ) + TEMP1*B( I, K ) + 150 CONTINUE + 160 CONTINUE + 170 CONTINUE + END IF +* + RETURN +* +* End of ZHEMM . +* + END diff --git a/reference/zhemmf.f b/reference/zhemmf.f new file mode 100644 index 0000000000..dbe8fb188c --- /dev/null +++ b/reference/zhemmf.f @@ -0,0 +1,304 @@ + SUBROUTINE ZHEMMF ( SIDE, UPLO, M, N, ALPHA, A, LDA, B, LDB, + $ BETA, C, LDC ) +* .. Scalar Arguments .. + CHARACTER*1 SIDE, UPLO + INTEGER M, N, LDA, LDB, LDC + COMPLEX*16 ALPHA, BETA +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), B( LDB, * ), C( LDC, * ) +* .. +* +* Purpose +* ======= +* +* ZHEMM performs one of the matrix-matrix operations +* +* C := alpha*A*B + beta*C, +* +* or +* +* C := alpha*B*A + beta*C, +* +* where alpha and beta are scalars, A is an hermitian matrix and B and +* C are m by n matrices. +* +* Parameters +* ========== +* +* SIDE - CHARACTER*1. +* On entry, SIDE specifies whether the hermitian matrix A +* appears on the left or right in the operation as follows: +* +* SIDE = 'L' or 'l' C := alpha*A*B + beta*C, +* +* SIDE = 'R' or 'r' C := alpha*B*A + beta*C, +* +* Unchanged on exit. +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the hermitian matrix A is to be +* referenced as follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of the +* hermitian matrix is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of the +* hermitian matrix is to be referenced. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix C. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. 
+* On entry, N specifies the number of columns of the matrix C. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX*16 . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX*16 array of DIMENSION ( LDA, ka ), where ka is +* m when SIDE = 'L' or 'l' and is n otherwise. +* Before entry with SIDE = 'L' or 'l', the m by m part of +* the array A must contain the hermitian matrix, such that +* when UPLO = 'U' or 'u', the leading m by m upper triangular +* part of the array A must contain the upper triangular part +* of the hermitian matrix and the strictly lower triangular +* part of A is not referenced, and when UPLO = 'L' or 'l', +* the leading m by m lower triangular part of the array A +* must contain the lower triangular part of the hermitian +* matrix and the strictly upper triangular part of A is not +* referenced. +* Before entry with SIDE = 'R' or 'r', the n by n part of +* the array A must contain the hermitian matrix, such that +* when UPLO = 'U' or 'u', the leading n by n upper triangular +* part of the array A must contain the upper triangular part +* of the hermitian matrix and the strictly lower triangular +* part of A is not referenced, and when UPLO = 'L' or 'l', +* the leading n by n lower triangular part of the array A +* must contain the lower triangular part of the hermitian +* matrix and the strictly upper triangular part of A is not +* referenced. +* Note that the imaginary parts of the diagonal elements need +* not be set, they are assumed to be zero. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When SIDE = 'L' or 'l' then +* LDA must be at least max( 1, m ), otherwise LDA must be at +* least max( 1, n ). +* Unchanged on exit. +* +* B - COMPLEX*16 array of DIMENSION ( LDB, n ). +* Before entry, the leading m by n part of the array B must +* contain the matrix B. +* Unchanged on exit. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. LDB must be at least +* max( 1, m ). +* Unchanged on exit. +* +* BETA - COMPLEX*16 . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then C need not be set on input. +* Unchanged on exit. +* +* C - COMPLEX*16 array of DIMENSION ( LDC, n ). +* Before entry, the leading m by n part of the array C must +* contain the matrix C, except when beta is zero, in which +* case C need not be set on entry. +* On exit, the array C is overwritten by the m by n updated +* matrix. +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC DCONJG, MAX, DBLE +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, INFO, J, K, NROWA + COMPLEX*16 TEMP1, TEMP2 +* .. Parameters .. + COMPLEX*16 ONE + PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Executable Statements .. +* +* Set NROWA as the number of rows of A. 
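+* (A is m by m when SIDE = 'L' and n by n when SIDE = 'R', so NROWA
+* becomes M or N accordingly.)  For illustration only, a right-sided
+* product C := alpha*B*A + beta*C with A stored in its lower
+* triangle, using placeholder names, would be
+*
+* CALL ZHEMMF( 'R', 'L', M, N, ALPHA, A, LDA, B, LDB, BETA, C, LDC )
+*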
+* + IF( LSAME( SIDE, 'L' ) )THEN + NROWA = M + ELSE + NROWA = N + END IF + UPPER = LSAME( UPLO, 'U' ) +* +* Test the input parameters. +* + INFO = 0 + IF( ( .NOT.LSAME( SIDE, 'L' ) ).AND. + $ ( .NOT.LSAME( SIDE, 'R' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO, 'L' ) ) )THEN + INFO = 2 + ELSE IF( M .LT.0 )THEN + INFO = 3 + ELSE IF( N .LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 7 + ELSE IF( LDB.LT.MAX( 1, M ) )THEN + INFO = 9 + ELSE IF( LDC.LT.MAX( 1, M ) )THEN + INFO = 12 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'ZHEMM3M', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. + $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* And when alpha.eq.zero. +* + IF( ALPHA.EQ.ZERO )THEN + IF( BETA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, M + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40, J = 1, N + DO 30, I = 1, M + C( I, J ) = BETA*C( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + RETURN + END IF +* +* Start the operations. +* + IF( LSAME( SIDE, 'L' ) )THEN +* +* Form C := alpha*A*B + beta*C. +* + IF( UPPER )THEN + DO 70, J = 1, N + DO 60, I = 1, M + TEMP1 = ALPHA*B( I, J ) + TEMP2 = ZERO + DO 50, K = 1, I - 1 + C( K, J ) = C( K, J ) + TEMP1*A( K, I ) + TEMP2 = TEMP2 + + $ B( K, J )*DCONJG( A( K, I ) ) + 50 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = TEMP1*DBLE( A( I, I ) ) + + $ ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ TEMP1*DBLE( A( I, I ) ) + + $ ALPHA*TEMP2 + END IF + 60 CONTINUE + 70 CONTINUE + ELSE + DO 100, J = 1, N + DO 90, I = M, 1, -1 + TEMP1 = ALPHA*B( I, J ) + TEMP2 = ZERO + DO 80, K = I + 1, M + C( K, J ) = C( K, J ) + TEMP1*A( K, I ) + TEMP2 = TEMP2 + + $ B( K, J )*DCONJG( A( K, I ) ) + 80 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = TEMP1*DBLE( A( I, I ) ) + + $ ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ TEMP1*DBLE( A( I, I ) ) + + $ ALPHA*TEMP2 + END IF + 90 CONTINUE + 100 CONTINUE + END IF + ELSE +* +* Form C := alpha*B*A + beta*C. +* + DO 170, J = 1, N + TEMP1 = ALPHA*DBLE( A( J, J ) ) + IF( BETA.EQ.ZERO )THEN + DO 110, I = 1, M + C( I, J ) = TEMP1*B( I, J ) + 110 CONTINUE + ELSE + DO 120, I = 1, M + C( I, J ) = BETA*C( I, J ) + TEMP1*B( I, J ) + 120 CONTINUE + END IF + DO 140, K = 1, J - 1 + IF( UPPER )THEN + TEMP1 = ALPHA*A( K, J ) + ELSE + TEMP1 = ALPHA*DCONJG( A( J, K ) ) + END IF + DO 130, I = 1, M + C( I, J ) = C( I, J ) + TEMP1*B( I, K ) + 130 CONTINUE + 140 CONTINUE + DO 160, K = J + 1, N + IF( UPPER )THEN + TEMP1 = ALPHA*DCONJG( A( J, K ) ) + ELSE + TEMP1 = ALPHA*A( K, J ) + END IF + DO 150, I = 1, M + C( I, J ) = C( I, J ) + TEMP1*B( I, K ) + 150 CONTINUE + 160 CONTINUE + 170 CONTINUE + END IF +* + RETURN +* +* End of ZHEMM . +* + END diff --git a/reference/zhemvf.f b/reference/zhemvf.f new file mode 100644 index 0000000000..ac8a04f1ff --- /dev/null +++ b/reference/zhemvf.f @@ -0,0 +1,351 @@ + SUBROUTINE ZHEMVF ( UPLO, N, ALPHA, A, LDA, X, INCX, + $ BETA, Y, INCY ) +* .. Scalar Arguments .. + COMPLEX*16 ALPHA, BETA + INTEGER INCX, INCY, LDA, N + CHARACTER*1 UPLO +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* ZHEMV performs the matrix-vector operation +* +* y := alpha*A*x + beta*y, +* +* where alpha and beta are scalars, x and y are n element vectors and +* A is an n by n hermitian matrix. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. 
+* On entry, UPLO specifies whether the upper or lower +* triangular part of the array A is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of A +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of A +* is to be referenced. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX*16 . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX*16 array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array A must contain the upper +* triangular part of the hermitian matrix and the strictly +* lower triangular part of A is not referenced. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array A must contain the lower +* triangular part of the hermitian matrix and the strictly +* upper triangular part of A is not referenced. +* Note that the imaginary parts of the diagonal elements need +* not be set and are assumed to be zero. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, n ). +* Unchanged on exit. +* +* X - COMPLEX*16 array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA - COMPLEX*16 . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then Y need not be set on input. +* Unchanged on exit. +* +* Y - COMPLEX*16 array of dimension at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. On exit, Y is overwritten by the updated +* vector y. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX*16 ONE + PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. Local Scalars .. + COMPLEX*16 TEMP1, TEMP2 + INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC DCONJG, MAX, DBLE +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ).AND. + $ .NOT.LSAME( UPLO, 'V' ).AND. + $ .NOT.LSAME( UPLO, 'M' ))THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( LDA.LT.MAX( 1, N ) )THEN + INFO = 5 + ELSE IF( INCX.EQ.0 )THEN + INFO = 7 + ELSE IF( INCY.EQ.0 )THEN + INFO = 10 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'ZHEMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* Set up the start points in X and Y. 
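+* (KX and KY point at x( 1 ) and y( 1 ).  For a negative increment
+* the vector is stored backwards, e.g. with N = 4 and INCX = -2,
+* KX = 1 - 3*( -2 ) = 7, so x( 1 ) is taken from X( 7 ), x( 2 )
+* from X( 5 ), and so on.)
+*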
+* + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( N - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( N - 1 )*INCY + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through the triangular part +* of A. +* +* First form y := beta*y. +* + IF( BETA.NE.ONE )THEN + IF( INCY.EQ.1 )THEN + IF( BETA.EQ.ZERO )THEN + DO 10, I = 1, N + Y( I ) = ZERO + 10 CONTINUE + ELSE + DO 20, I = 1, N + Y( I ) = BETA*Y( I ) + 20 CONTINUE + END IF + ELSE + IY = KY + IF( BETA.EQ.ZERO )THEN + DO 30, I = 1, N + Y( IY ) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40, I = 1, N + Y( IY ) = BETA*Y( IY ) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF( ALPHA.EQ.ZERO ) + $ RETURN + + + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form y when A is stored in upper triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 60, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + DO 50, I = 1, J - 1 + Y( I ) = Y( I ) + TEMP1*A( I, J ) + TEMP2 = TEMP2 + DCONJG( A( I, J ) )*X( I ) + 50 CONTINUE + Y( J ) = Y( J ) + TEMP1*DBLE( A( J, J ) ) + ALPHA*TEMP2 + 60 CONTINUE + ELSE + JX = KX + JY = KY + DO 80, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + IX = KX + IY = KY + DO 70, I = 1, J - 1 + Y( IY ) = Y( IY ) + TEMP1*A( I, J ) + TEMP2 = TEMP2 + DCONJG( A( I, J ) )*X( IX ) + IX = IX + INCX + IY = IY + INCY + 70 CONTINUE + Y( JY ) = Y( JY ) + TEMP1*DBLE( A( J, J ) ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + 80 CONTINUE + END IF + RETURN + ENDIF + + + IF( LSAME( UPLO, 'L' ) )THEN +* +* Form y when A is stored in lower triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 100, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + Y( J ) = Y( J ) + TEMP1*DBLE( A( J, J ) ) + DO 90, I = J + 1, N + Y( I ) = Y( I ) + TEMP1*A( I, J ) + TEMP2 = TEMP2 + DCONJG( A( I, J ) )*X( I ) + 90 CONTINUE + Y( J ) = Y( J ) + ALPHA*TEMP2 + 100 CONTINUE + ELSE + JX = KX + JY = KY + DO 120, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + Y( JY ) = Y( JY ) + TEMP1*DBLE( A( J, J ) ) + IX = JX + IY = JY + DO 110, I = J + 1, N + IX = IX + INCX + IY = IY + INCY + Y( IY ) = Y( IY ) + TEMP1*A( I, J ) + TEMP2 = TEMP2 + DCONJG( A( I, J ) )*X( IX ) + 110 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + 120 CONTINUE + END IF + RETURN + END IF + + + IF( LSAME( UPLO, 'V' ) )THEN +* +* Form y when A is stored in upper triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 160, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + DO 150, I = 1, J - 1 + Y( I ) = Y( I ) + TEMP1* DCONJG(A( I, J )) + TEMP2 = TEMP2 + A( I, J )*X( I ) + 150 CONTINUE + Y( J ) = Y( J ) + TEMP1*DBLE( A( J, J ) ) + ALPHA*TEMP2 + 160 CONTINUE + ELSE + JX = KX + JY = KY + DO 180, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + IX = KX + IY = KY + DO 170, I = 1, J - 1 + Y( IY ) = Y( IY ) + TEMP1* DCONJG(A( I, J )) + TEMP2 = TEMP2 + A( I, J )*X( IX ) + IX = IX + INCX + IY = IY + INCY + 170 CONTINUE + Y( JY ) = Y( JY ) + TEMP1*DBLE( A( J, J ) ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + 180 CONTINUE + END IF + RETURN + ENDIF + + + IF( LSAME( UPLO, 'M' ) )THEN +* +* Form y when A is stored in lower triangle. 
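+* (UPLO = 'V' and UPLO = 'M' are non-standard extensions of 'U' and
+* 'L': the code is identical except that the stored off-diagonal
+* entries are conjugated, so these branches effectively apply
+* conjg( A ), i.e. the transpose of the hermitian matrix, to x.)
+*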
+* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 200, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + Y( J ) = Y( J ) + TEMP1*DBLE( A( J, J ) ) + DO 190, I = J + 1, N + Y( I ) = Y( I ) + TEMP1*DCONJG(A( I, J )) + TEMP2 = TEMP2 + A( I, J )*X( I ) + 190 CONTINUE + Y( J ) = Y( J ) + ALPHA*TEMP2 + 200 CONTINUE + ELSE + JX = KX + JY = KY + DO 220, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + Y( JY ) = Y( JY ) + TEMP1*DBLE( A( J, J ) ) + IX = JX + IY = JY + DO 210, I = J + 1, N + IX = IX + INCX + IY = IY + INCY + Y( IY ) = Y( IY ) + TEMP1*DCONJG(A( I, J )) + TEMP2 = TEMP2 + A( I, J )*X( IX ) + 210 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + 220 CONTINUE + END IF + RETURN + END IF +* + RETURN +* +* End of ZHEMV . +* + END diff --git a/reference/zher2f.f b/reference/zher2f.f new file mode 100644 index 0000000000..4ae3e4ce34 --- /dev/null +++ b/reference/zher2f.f @@ -0,0 +1,249 @@ + SUBROUTINE ZHER2F ( UPLO, N, ALPHA, X, INCX, Y, INCY, A, LDA ) +* .. Scalar Arguments .. + COMPLEX*16 ALPHA + INTEGER INCX, INCY, LDA, N + CHARACTER*1 UPLO +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* ZHER2 performs the hermitian rank 2 operation +* +* A := alpha*x*conjg( y' ) + conjg( alpha )*y*conjg( x' ) + A, +* +* where alpha is a scalar, x and y are n element vectors and A is an n +* by n hermitian matrix. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array A is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of A +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of A +* is to be referenced. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX*16 . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X - COMPLEX*16 array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* Y - COMPLEX*16 array of dimension at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. +* Unchanged on exit. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* A - COMPLEX*16 array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array A must contain the upper +* triangular part of the hermitian matrix and the strictly +* lower triangular part of A is not referenced. On exit, the +* upper triangular part of the array A is overwritten by the +* upper triangular part of the updated matrix. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array A must contain the lower +* triangular part of the hermitian matrix and the strictly +* upper triangular part of A is not referenced. On exit, the +* lower triangular part of the array A is overwritten by the +* lower triangular part of the updated matrix. +* Note that the imaginary parts of the diagonal elements need +* not be set, they are assumed to be zero, and on exit they +* are set to zero. 
+* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, n ). +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. Local Scalars .. + COMPLEX*16 TEMP1, TEMP2 + INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC DCONJG, MAX, DBLE +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 5 + ELSE IF( INCY.EQ.0 )THEN + INFO = 7 + ELSE IF( LDA.LT.MAX( 1, N ) )THEN + INFO = 9 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'ZHER2 ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) + $ RETURN +* +* Set up the start points in X and Y if the increments are not both +* unity. +* + IF( ( INCX.NE.1 ).OR.( INCY.NE.1 ) )THEN + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( N - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( N - 1 )*INCY + END IF + JX = KX + JY = KY + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through the triangular part +* of A. +* + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form A when A is stored in the upper triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 20, J = 1, N + IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN + TEMP1 = ALPHA*DCONJG( Y( J ) ) + TEMP2 = DCONJG( ALPHA*X( J ) ) + DO 10, I = 1, J - 1 + A( I, J ) = A( I, J ) + X( I )*TEMP1 + Y( I )*TEMP2 + 10 CONTINUE + A( J, J ) = DBLE( A( J, J ) ) + + $ DBLE( X( J )*TEMP1 + Y( J )*TEMP2 ) + ELSE + A( J, J ) = DBLE( A( J, J ) ) + END IF + 20 CONTINUE + ELSE + DO 40, J = 1, N + IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN + TEMP1 = ALPHA*DCONJG( Y( JY ) ) + TEMP2 = DCONJG( ALPHA*X( JX ) ) + IX = KX + IY = KY + DO 30, I = 1, J - 1 + A( I, J ) = A( I, J ) + X( IX )*TEMP1 + $ + Y( IY )*TEMP2 + IX = IX + INCX + IY = IY + INCY + 30 CONTINUE + A( J, J ) = DBLE( A( J, J ) ) + + $ DBLE( X( JX )*TEMP1 + Y( JY )*TEMP2 ) + ELSE + A( J, J ) = DBLE( A( J, J ) ) + END IF + JX = JX + INCX + JY = JY + INCY + 40 CONTINUE + END IF + ELSE +* +* Form A when A is stored in the lower triangle. 
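+* (Only the stored triangle of A is updated, and DBLE keeps the
+* diagonal real.)  For illustration only, a unit-stride rank 2
+* update of a lower-stored hermitian matrix, with placeholder
+* names, would be
+*
+* CALL ZHER2F( 'L', N, ALPHA, X, 1, Y, 1, A, LDA )
+*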
+* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 60, J = 1, N + IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN + TEMP1 = ALPHA*DCONJG( Y( J ) ) + TEMP2 = DCONJG( ALPHA*X( J ) ) + A( J, J ) = DBLE( A( J, J ) ) + + $ DBLE( X( J )*TEMP1 + Y( J )*TEMP2 ) + DO 50, I = J + 1, N + A( I, J ) = A( I, J ) + X( I )*TEMP1 + Y( I )*TEMP2 + 50 CONTINUE + ELSE + A( J, J ) = DBLE( A( J, J ) ) + END IF + 60 CONTINUE + ELSE + DO 80, J = 1, N + IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN + TEMP1 = ALPHA*DCONJG( Y( JY ) ) + TEMP2 = DCONJG( ALPHA*X( JX ) ) + A( J, J ) = DBLE( A( J, J ) ) + + $ DBLE( X( JX )*TEMP1 + Y( JY )*TEMP2 ) + IX = JX + IY = JY + DO 70, I = J + 1, N + IX = IX + INCX + IY = IY + INCY + A( I, J ) = A( I, J ) + X( IX )*TEMP1 + $ + Y( IY )*TEMP2 + 70 CONTINUE + ELSE + A( J, J ) = DBLE( A( J, J ) ) + END IF + JX = JX + INCX + JY = JY + INCY + 80 CONTINUE + END IF + END IF +* + RETURN +* +* End of ZHER2 . +* + END diff --git a/reference/zher2kf.f b/reference/zher2kf.f new file mode 100644 index 0000000000..43b75d95f4 --- /dev/null +++ b/reference/zher2kf.f @@ -0,0 +1,372 @@ + SUBROUTINE ZHER2KF( UPLO, TRANS, N, K, ALPHA, A, LDA, B,LDB, BETA, + $ C, LDC ) +* .. Scalar Arguments .. + CHARACTER TRANS, UPLO + INTEGER K, LDA, LDB, LDC, N + DOUBLE PRECISION BETA + COMPLEX*16 ALPHA +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), B( LDB, * ), C( LDC, * ) +* .. +* +* Purpose +* ======= +* +* ZHER2K performs one of the hermitian rank 2k operations +* +* C := alpha*A*conjg( B' ) + conjg( alpha )*B*conjg( A' ) + beta*C, +* +* or +* +* C := alpha*conjg( A' )*B + conjg( alpha )*conjg( B' )*A + beta*C, +* +* where alpha and beta are scalars with beta real, C is an n by n +* hermitian matrix and A and B are n by k matrices in the first case +* and k by n matrices in the second case. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array C is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of C +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of C +* is to be referenced. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' C := alpha*A*conjg( B' ) + +* conjg( alpha )*B*conjg( A' ) + +* beta*C. +* +* TRANS = 'C' or 'c' C := alpha*conjg( A' )*B + +* conjg( alpha )*conjg( B' )*A + +* beta*C. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix C. N must be +* at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry with TRANS = 'N' or 'n', K specifies the number +* of columns of the matrices A and B, and on entry with +* TRANS = 'C' or 'c', K specifies the number of rows of the +* matrices A and B. K must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX*16 . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX*16 array of DIMENSION ( LDA, ka ), where ka is +* k when TRANS = 'N' or 'n', and is n otherwise. +* Before entry with TRANS = 'N' or 'n', the leading n by k +* part of the array A must contain the matrix A, otherwise +* the leading k by n part of the array A must contain the +* matrix A. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When TRANS = 'N' or 'n' +* then LDA must be at least max( 1, n ), otherwise LDA must +* be at least max( 1, k ). 
+* Unchanged on exit. +* +* B - COMPLEX*16 array of DIMENSION ( LDB, kb ), where kb is +* k when TRANS = 'N' or 'n', and is n otherwise. +* Before entry with TRANS = 'N' or 'n', the leading n by k +* part of the array B must contain the matrix B, otherwise +* the leading k by n part of the array B must contain the +* matrix B. +* Unchanged on exit. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. When TRANS = 'N' or 'n' +* then LDB must be at least max( 1, n ), otherwise LDB must +* be at least max( 1, k ). +* Unchanged on exit. +* +* BETA - DOUBLE PRECISION . +* On entry, BETA specifies the scalar beta. +* Unchanged on exit. +* +* C - COMPLEX*16 array of DIMENSION ( LDC, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array C must contain the upper +* triangular part of the hermitian matrix and the strictly +* lower triangular part of C is not referenced. On exit, the +* upper triangular part of the array C is overwritten by the +* upper triangular part of the updated matrix. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array C must contain the lower +* triangular part of the hermitian matrix and the strictly +* upper triangular part of C is not referenced. On exit, the +* lower triangular part of the array C is overwritten by the +* lower triangular part of the updated matrix. +* Note that the imaginary parts of the diagonal elements need +* not be set, they are assumed to be zero, and on exit they +* are set to zero. +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, n ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* -- Modified 8-Nov-93 to set C(J,J) to DBLE( C(J,J) ) when BETA = 1. +* Ed Anderson, Cray Research Inc. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC DBLE, DCONJG, MAX +* .. +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, INFO, J, L, NROWA + COMPLEX*16 TEMP1, TEMP2 +* .. +* .. Parameters .. + DOUBLE PRECISION ONE + PARAMETER ( ONE = 1.0D+0 ) + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + IF( LSAME( TRANS, 'N' ) ) THEN + NROWA = N + ELSE + NROWA = K + END IF + UPPER = LSAME( UPLO, 'U' ) +* + INFO = 0 + IF( ( .NOT.UPPER ) .AND. ( .NOT.LSAME( UPLO, 'L' ) ) ) THEN + INFO = 1 + ELSE IF( ( .NOT.LSAME( TRANS, 'N' ) ) .AND. + $ ( .NOT.LSAME( TRANS, 'C' ) ) ) THEN + INFO = 2 + ELSE IF( N.LT.0 ) THEN + INFO = 3 + ELSE IF( K.LT.0 ) THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, NROWA ) ) THEN + INFO = 7 + ELSE IF( LDB.LT.MAX( 1, NROWA ) ) THEN + INFO = 9 + ELSE IF( LDC.LT.MAX( 1, N ) ) THEN + INFO = 12 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZHER2K', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ) .OR. ( ( ( ALPHA.EQ.ZERO ) .OR. ( K.EQ.0 ) ) .AND. + $ ( BETA.EQ.ONE ) ) )RETURN +* +* And when alpha.eq.zero. 
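+* (In that case only the scaling C := beta*C of the stored triangle
+* survives; the branch below also keeps the diagonal real.)  For
+* illustration only, a rank 2k update of an upper-stored C from
+* n by k matrices A and B, with placeholder names, would be
+*
+* CALL ZHER2KF( 'U', 'N', N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC )
+*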
+* + IF( ALPHA.EQ.ZERO ) THEN + IF( UPPER ) THEN + IF( BETA.EQ.DBLE( ZERO ) ) THEN + DO 20 J = 1, N + DO 10 I = 1, J + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40 J = 1, N + DO 30 I = 1, J - 1 + C( I, J ) = BETA*C( I, J ) + 30 CONTINUE + C( J, J ) = BETA*DBLE( C( J, J ) ) + 40 CONTINUE + END IF + ELSE + IF( BETA.EQ.DBLE( ZERO ) ) THEN + DO 60 J = 1, N + DO 50 I = J, N + C( I, J ) = ZERO + 50 CONTINUE + 60 CONTINUE + ELSE + DO 80 J = 1, N + C( J, J ) = BETA*DBLE( C( J, J ) ) + DO 70 I = J + 1, N + C( I, J ) = BETA*C( I, J ) + 70 CONTINUE + 80 CONTINUE + END IF + END IF + RETURN + END IF +* +* Start the operations. +* + IF( LSAME( TRANS, 'N' ) ) THEN +* +* Form C := alpha*A*conjg( B' ) + conjg( alpha )*B*conjg( A' ) + +* C. +* + IF( UPPER ) THEN + DO 130 J = 1, N + IF( BETA.EQ.DBLE( ZERO ) ) THEN + DO 90 I = 1, J + C( I, J ) = ZERO + 90 CONTINUE + ELSE IF( BETA.NE.ONE ) THEN + DO 100 I = 1, J - 1 + C( I, J ) = BETA*C( I, J ) + 100 CONTINUE + C( J, J ) = BETA*DBLE( C( J, J ) ) + ELSE + C( J, J ) = DBLE( C( J, J ) ) + END IF + DO 120 L = 1, K + IF( ( A( J, L ).NE.ZERO ) .OR. ( B( J, L ).NE.ZERO ) ) + $ THEN + TEMP1 = ALPHA*DCONJG( B( J, L ) ) + TEMP2 = DCONJG( ALPHA*A( J, L ) ) + DO 110 I = 1, J - 1 + C( I, J ) = C( I, J ) + A( I, L )*TEMP1 + + $ B( I, L )*TEMP2 + 110 CONTINUE + C( J, J ) = DBLE( C( J, J ) ) + + $ DBLE( A( J, L )*TEMP1+B( J, L )*TEMP2 ) + END IF + 120 CONTINUE + 130 CONTINUE + ELSE + DO 180 J = 1, N + IF( BETA.EQ.DBLE( ZERO ) ) THEN + DO 140 I = J, N + C( I, J ) = ZERO + 140 CONTINUE + ELSE IF( BETA.NE.ONE ) THEN + DO 150 I = J + 1, N + C( I, J ) = BETA*C( I, J ) + 150 CONTINUE + C( J, J ) = BETA*DBLE( C( J, J ) ) + ELSE + C( J, J ) = DBLE( C( J, J ) ) + END IF + DO 170 L = 1, K + IF( ( A( J, L ).NE.ZERO ) .OR. ( B( J, L ).NE.ZERO ) ) + $ THEN + TEMP1 = ALPHA*DCONJG( B( J, L ) ) + TEMP2 = DCONJG( ALPHA*A( J, L ) ) + DO 160 I = J + 1, N + C( I, J ) = C( I, J ) + A( I, L )*TEMP1 + + $ B( I, L )*TEMP2 + 160 CONTINUE + C( J, J ) = DBLE( C( J, J ) ) + + $ DBLE( A( J, L )*TEMP1+B( J, L )*TEMP2 ) + END IF + 170 CONTINUE + 180 CONTINUE + END IF + ELSE +* +* Form C := alpha*conjg( A' )*B + conjg( alpha )*conjg( B' )*A + +* C. +* + IF( UPPER ) THEN + DO 210 J = 1, N + DO 200 I = 1, J + TEMP1 = ZERO + TEMP2 = ZERO + DO 190 L = 1, K + TEMP1 = TEMP1 + DCONJG( A( L, I ) )*B( L, J ) + TEMP2 = TEMP2 + DCONJG( B( L, I ) )*A( L, J ) + 190 CONTINUE + IF( I.EQ.J ) THEN + IF( BETA.EQ.DBLE( ZERO ) ) THEN + C( J, J ) = DBLE( ALPHA*TEMP1+DCONJG( ALPHA )* + $ TEMP2 ) + ELSE + C( J, J ) = BETA*DBLE( C( J, J ) ) + + $ DBLE( ALPHA*TEMP1+DCONJG( ALPHA )* + $ TEMP2 ) + END IF + ELSE + IF( BETA.EQ.DBLE( ZERO ) ) THEN + C( I, J ) = ALPHA*TEMP1 + DCONJG( ALPHA )*TEMP2 + ELSE + C( I, J ) = BETA*C( I, J ) + ALPHA*TEMP1 + + $ DCONJG( ALPHA )*TEMP2 + END IF + END IF + 200 CONTINUE + 210 CONTINUE + ELSE + DO 240 J = 1, N + DO 230 I = J, N + TEMP1 = ZERO + TEMP2 = ZERO + DO 220 L = 1, K + TEMP1 = TEMP1 + DCONJG( A( L, I ) )*B( L, J ) + TEMP2 = TEMP2 + DCONJG( B( L, I ) )*A( L, J ) + 220 CONTINUE + IF( I.EQ.J ) THEN + IF( BETA.EQ.DBLE( ZERO ) ) THEN + C( J, J ) = DBLE( ALPHA*TEMP1+DCONJG( ALPHA )* + $ TEMP2 ) + ELSE + C( J, J ) = BETA*DBLE( C( J, J ) ) + + $ DBLE( ALPHA*TEMP1+DCONJG( ALPHA )* + $ TEMP2 ) + END IF + ELSE + IF( BETA.EQ.DBLE( ZERO ) ) THEN + C( I, J ) = ALPHA*TEMP1 + DCONJG( ALPHA )*TEMP2 + ELSE + C( I, J ) = BETA*C( I, J ) + ALPHA*TEMP1 + + $ DCONJG( ALPHA )*TEMP2 + END IF + END IF + 230 CONTINUE + 240 CONTINUE + END IF + END IF +* + RETURN +* +* End of ZHER2K. 
+* + END diff --git a/reference/zherf.f b/reference/zherf.f new file mode 100644 index 0000000000..ebde22ca59 --- /dev/null +++ b/reference/zherf.f @@ -0,0 +1,212 @@ + SUBROUTINE ZHERF ( UPLO, N, ALPHA, X, INCX, A, LDA ) +* .. Scalar Arguments .. + DOUBLE PRECISION ALPHA + INTEGER INCX, LDA, N + CHARACTER*1 UPLO +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), X( * ) +* .. +* +* Purpose +* ======= +* +* ZHER performs the hermitian rank 1 operation +* +* A := alpha*x*conjg( x' ) + A, +* +* where alpha is a real scalar, x is an n element vector and A is an +* n by n hermitian matrix. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array A is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of A +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of A +* is to be referenced. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - DOUBLE PRECISION. +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X - COMPLEX*16 array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* A - COMPLEX*16 array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array A must contain the upper +* triangular part of the hermitian matrix and the strictly +* lower triangular part of A is not referenced. On exit, the +* upper triangular part of the array A is overwritten by the +* upper triangular part of the updated matrix. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array A must contain the lower +* triangular part of the hermitian matrix and the strictly +* upper triangular part of A is not referenced. On exit, the +* lower triangular part of the array A is overwritten by the +* lower triangular part of the updated matrix. +* Note that the imaginary parts of the diagonal elements need +* not be set, they are assumed to be zero, and on exit they +* are set to zero. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, n ). +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. Local Scalars .. + COMPLEX*16 TEMP + INTEGER I, INFO, IX, J, JX, KX +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC DCONJG, MAX, DBLE +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 5 + ELSE IF( LDA.LT.MAX( 1, N ) )THEN + INFO = 7 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'ZHER ', INFO ) + RETURN + END IF +* +* Quick return if possible. 
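+* (A is left untouched when N = 0 or ALPHA = 0.)  For illustration
+* only, a unit-stride rank 1 update of an upper-stored hermitian
+* matrix, with placeholder names, would be
+*
+* CALL ZHERF( 'U', N, ALPHA, X, 1, A, LDA )
+*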
+* + IF( ( N.EQ.0 ).OR.( ALPHA.EQ.DBLE( ZERO ) ) ) + $ RETURN +* +* Set the start point in X if the increment is not unity. +* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through the triangular part +* of A. +* + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form A when A is stored in upper triangle. +* + IF( INCX.EQ.1 )THEN + DO 20, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = ALPHA*DCONJG( X( J ) ) + DO 10, I = 1, J - 1 + A( I, J ) = A( I, J ) + X( I )*TEMP + 10 CONTINUE + A( J, J ) = DBLE( A( J, J ) ) + DBLE( X( J )*TEMP ) + ELSE + A( J, J ) = DBLE( A( J, J ) ) + END IF + 20 CONTINUE + ELSE + JX = KX + DO 40, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*DCONJG( X( JX ) ) + IX = KX + DO 30, I = 1, J - 1 + A( I, J ) = A( I, J ) + X( IX )*TEMP + IX = IX + INCX + 30 CONTINUE + A( J, J ) = DBLE( A( J, J ) ) + DBLE( X( JX )*TEMP ) + ELSE + A( J, J ) = DBLE( A( J, J ) ) + END IF + JX = JX + INCX + 40 CONTINUE + END IF + ELSE +* +* Form A when A is stored in lower triangle. +* + IF( INCX.EQ.1 )THEN + DO 60, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = ALPHA*DCONJG( X( J ) ) + A( J, J ) = DBLE( A( J, J ) ) + DBLE( TEMP*X( J ) ) + DO 50, I = J + 1, N + A( I, J ) = A( I, J ) + X( I )*TEMP + 50 CONTINUE + ELSE + A( J, J ) = DBLE( A( J, J ) ) + END IF + 60 CONTINUE + ELSE + JX = KX + DO 80, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*DCONJG( X( JX ) ) + A( J, J ) = DBLE( A( J, J ) ) + DBLE( TEMP*X( JX ) ) + IX = JX + DO 70, I = J + 1, N + IX = IX + INCX + A( I, J ) = A( I, J ) + X( IX )*TEMP + 70 CONTINUE + ELSE + A( J, J ) = DBLE( A( J, J ) ) + END IF + JX = JX + INCX + 80 CONTINUE + END IF + END IF +* + RETURN +* +* End of ZHER . +* + END diff --git a/reference/zherkf.f b/reference/zherkf.f new file mode 100644 index 0000000000..5a7e082483 --- /dev/null +++ b/reference/zherkf.f @@ -0,0 +1,330 @@ + SUBROUTINE ZHERKF( UPLO,TRANS, N, K, ALPHA, A, LDA, BETA, C, LDC ) +* .. Scalar Arguments .. + CHARACTER TRANS, UPLO + INTEGER K, LDA, LDC, N + DOUBLE PRECISION ALPHA, BETA +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), C( LDC, * ) +* .. +* +* Purpose +* ======= +* +* ZHERK performs one of the hermitian rank k operations +* +* C := alpha*A*conjg( A' ) + beta*C, +* +* or +* +* C := alpha*conjg( A' )*A + beta*C, +* +* where alpha and beta are real scalars, C is an n by n hermitian +* matrix and A is an n by k matrix in the first case and a k by n +* matrix in the second case. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array C is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of C +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of C +* is to be referenced. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' C := alpha*A*conjg( A' ) + beta*C. +* +* TRANS = 'C' or 'c' C := alpha*conjg( A' )*A + beta*C. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix C. N must be +* at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry with TRANS = 'N' or 'n', K specifies the number +* of columns of the matrix A, and on entry with +* TRANS = 'C' or 'c', K specifies the number of rows of the +* matrix A. K must be at least zero. 
+* Unchanged on exit. +* +* ALPHA - DOUBLE PRECISION . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX*16 array of DIMENSION ( LDA, ka ), where ka is +* k when TRANS = 'N' or 'n', and is n otherwise. +* Before entry with TRANS = 'N' or 'n', the leading n by k +* part of the array A must contain the matrix A, otherwise +* the leading k by n part of the array A must contain the +* matrix A. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When TRANS = 'N' or 'n' +* then LDA must be at least max( 1, n ), otherwise LDA must +* be at least max( 1, k ). +* Unchanged on exit. +* +* BETA - DOUBLE PRECISION. +* On entry, BETA specifies the scalar beta. +* Unchanged on exit. +* +* C - COMPLEX*16 array of DIMENSION ( LDC, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array C must contain the upper +* triangular part of the hermitian matrix and the strictly +* lower triangular part of C is not referenced. On exit, the +* upper triangular part of the array C is overwritten by the +* upper triangular part of the updated matrix. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array C must contain the lower +* triangular part of the hermitian matrix and the strictly +* upper triangular part of C is not referenced. On exit, the +* lower triangular part of the array C is overwritten by the +* lower triangular part of the updated matrix. +* Note that the imaginary parts of the diagonal elements need +* not be set, they are assumed to be zero, and on exit they +* are set to zero. +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, n ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* -- Modified 8-Nov-93 to set C(J,J) to DBLE( C(J,J) ) when BETA = 1. +* Ed Anderson, Cray Research Inc. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC DBLE, DCMPLX, DCONJG, MAX +* .. +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, INFO, J, L, NROWA + DOUBLE PRECISION RTEMP + COMPLEX*16 TEMP +* .. +* .. Parameters .. + DOUBLE PRECISION ONE, ZERO + PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + IF( LSAME( TRANS, 'N' ) ) THEN + NROWA = N + ELSE + NROWA = K + END IF + UPPER = LSAME( UPLO, 'U' ) +* + INFO = 0 + IF( ( .NOT.UPPER ) .AND. ( .NOT.LSAME( UPLO, 'L' ) ) ) THEN + INFO = 1 + ELSE IF( ( .NOT.LSAME( TRANS, 'N' ) ) .AND. + $ ( .NOT.LSAME( TRANS, 'C' ) ) ) THEN + INFO = 2 + ELSE IF( N.LT.0 ) THEN + INFO = 3 + ELSE IF( K.LT.0 ) THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, NROWA ) ) THEN + INFO = 7 + ELSE IF( LDC.LT.MAX( 1, N ) ) THEN + INFO = 10 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZHERK ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ) .OR. ( ( ( ALPHA.EQ.ZERO ) .OR. ( K.EQ.0 ) ) .AND. + $ ( BETA.EQ.ONE ) ) )RETURN +* +* And when alpha.eq.zero. 
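+* (In that case only the scaling C := beta*C of the stored triangle
+* remains.)  For illustration only, the Gram matrix
+* C := A*conjg( A' ) of an n by k matrix A could be formed, with
+* placeholder names, as
+*
+* CALL ZHERKF( 'U', 'N', N, K, 1.0D+0, A, LDA, 0.0D+0, C, LDC )
+*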
+* + IF( ALPHA.EQ.ZERO ) THEN + IF( UPPER ) THEN + IF( BETA.EQ.ZERO ) THEN + DO 20 J = 1, N + DO 10 I = 1, J + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40 J = 1, N + DO 30 I = 1, J - 1 + C( I, J ) = BETA*C( I, J ) + 30 CONTINUE + C( J, J ) = BETA*DBLE( C( J, J ) ) + 40 CONTINUE + END IF + ELSE + IF( BETA.EQ.ZERO ) THEN + DO 60 J = 1, N + DO 50 I = J, N + C( I, J ) = ZERO + 50 CONTINUE + 60 CONTINUE + ELSE + DO 80 J = 1, N + C( J, J ) = BETA*DBLE( C( J, J ) ) + DO 70 I = J + 1, N + C( I, J ) = BETA*C( I, J ) + 70 CONTINUE + 80 CONTINUE + END IF + END IF + RETURN + END IF +* +* Start the operations. +* + IF( LSAME( TRANS, 'N' ) ) THEN +* +* Form C := alpha*A*conjg( A' ) + beta*C. +* + IF( UPPER ) THEN + DO 130 J = 1, N + IF( BETA.EQ.ZERO ) THEN + DO 90 I = 1, J + C( I, J ) = ZERO + 90 CONTINUE + ELSE IF( BETA.NE.ONE ) THEN + DO 100 I = 1, J - 1 + C( I, J ) = BETA*C( I, J ) + 100 CONTINUE + C( J, J ) = BETA*DBLE( C( J, J ) ) + ELSE + C( J, J ) = DBLE( C( J, J ) ) + END IF + DO 120 L = 1, K + IF( A( J, L ).NE.DCMPLX( ZERO ) ) THEN + TEMP = ALPHA*DCONJG( A( J, L ) ) + DO 110 I = 1, J - 1 + C( I, J ) = C( I, J ) + TEMP*A( I, L ) + 110 CONTINUE + C( J, J ) = DBLE( C( J, J ) ) + + $ DBLE( TEMP*A( I, L ) ) + END IF + 120 CONTINUE + 130 CONTINUE + ELSE + DO 180 J = 1, N + IF( BETA.EQ.ZERO ) THEN + DO 140 I = J, N + C( I, J ) = ZERO + 140 CONTINUE + ELSE IF( BETA.NE.ONE ) THEN + C( J, J ) = BETA*DBLE( C( J, J ) ) + DO 150 I = J + 1, N + C( I, J ) = BETA*C( I, J ) + 150 CONTINUE + ELSE + C( J, J ) = DBLE( C( J, J ) ) + END IF + DO 170 L = 1, K + IF( A( J, L ).NE.DCMPLX( ZERO ) ) THEN + TEMP = ALPHA*DCONJG( A( J, L ) ) + C( J, J ) = DBLE( C( J, J ) ) + + $ DBLE( TEMP*A( J, L ) ) + DO 160 I = J + 1, N + C( I, J ) = C( I, J ) + TEMP*A( I, L ) + 160 CONTINUE + END IF + 170 CONTINUE + 180 CONTINUE + END IF + ELSE +* +* Form C := alpha*conjg( A' )*A + beta*C. +* + IF( UPPER ) THEN + DO 220 J = 1, N + DO 200 I = 1, J - 1 + TEMP = ZERO + DO 190 L = 1, K + TEMP = TEMP + DCONJG( A( L, I ) )*A( L, J ) + 190 CONTINUE + IF( BETA.EQ.ZERO ) THEN + C( I, J ) = ALPHA*TEMP + ELSE + C( I, J ) = ALPHA*TEMP + BETA*C( I, J ) + END IF + 200 CONTINUE + RTEMP = ZERO + DO 210 L = 1, K + RTEMP = RTEMP + DCONJG( A( L, J ) )*A( L, J ) + 210 CONTINUE + IF( BETA.EQ.ZERO ) THEN + C( J, J ) = ALPHA*RTEMP + ELSE + C( J, J ) = ALPHA*RTEMP + BETA*DBLE( C( J, J ) ) + END IF + 220 CONTINUE + ELSE + DO 260 J = 1, N + RTEMP = ZERO + DO 230 L = 1, K + RTEMP = RTEMP + DCONJG( A( L, J ) )*A( L, J ) + 230 CONTINUE + IF( BETA.EQ.ZERO ) THEN + C( J, J ) = ALPHA*RTEMP + ELSE + C( J, J ) = ALPHA*RTEMP + BETA*DBLE( C( J, J ) ) + END IF + DO 250 I = J + 1, N + TEMP = ZERO + DO 240 L = 1, K + TEMP = TEMP + DCONJG( A( L, I ) )*A( L, J ) + 240 CONTINUE + IF( BETA.EQ.ZERO ) THEN + C( I, J ) = ALPHA*TEMP + ELSE + C( I, J ) = ALPHA*TEMP + BETA*C( I, J ) + END IF + 250 CONTINUE + 260 CONTINUE + END IF + END IF +* + RETURN +* +* End of ZHERK . +* + END diff --git a/reference/zhpmvf.f b/reference/zhpmvf.f new file mode 100644 index 0000000000..8631861b30 --- /dev/null +++ b/reference/zhpmvf.f @@ -0,0 +1,270 @@ + SUBROUTINE ZHPMVF( UPLO, N, ALPHA, AP, X, INCX, BETA, Y, INCY ) +* .. Scalar Arguments .. + COMPLEX*16 ALPHA, BETA + INTEGER INCX, INCY, N + CHARACTER*1 UPLO +* .. Array Arguments .. + COMPLEX*16 AP( * ), X( * ), Y( * ) +* .. 
+* +* Purpose +* ======= +* +* ZHPMV performs the matrix-vector operation +* +* y := alpha*A*x + beta*y, +* +* where alpha and beta are scalars, x and y are n element vectors and +* A is an n by n hermitian matrix, supplied in packed form. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the matrix A is supplied in the packed +* array AP as follows: +* +* UPLO = 'U' or 'u' The upper triangular part of A is +* supplied in AP. +* +* UPLO = 'L' or 'l' The lower triangular part of A is +* supplied in AP. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX*16 . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* AP - COMPLEX*16 array of DIMENSION at least +* ( ( n*( n + 1 ) )/2 ). +* Before entry with UPLO = 'U' or 'u', the array AP must +* contain the upper triangular part of the hermitian matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) +* and a( 2, 2 ) respectively, and so on. +* Before entry with UPLO = 'L' or 'l', the array AP must +* contain the lower triangular part of the hermitian matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) +* and a( 3, 1 ) respectively, and so on. +* Note that the imaginary parts of the diagonal elements need +* not be set and are assumed to be zero. +* Unchanged on exit. +* +* X - COMPLEX*16 array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA - COMPLEX*16 . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then Y need not be set on input. +* Unchanged on exit. +* +* Y - COMPLEX*16 array of dimension at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. On exit, Y is overwritten by the updated +* vector y. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX*16 ONE + PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. Local Scalars .. + COMPLEX*16 TEMP1, TEMP2 + INTEGER I, INFO, IX, IY, J, JX, JY, K, KK, KX, KY +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC DCONJG, DBLE +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 6 + ELSE IF( INCY.EQ.0 )THEN + INFO = 9 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'ZHPMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. 
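+* (For reference: with UPLO = 'U' the packed element a( i, j ),
+* i .LE. j, lives in AP( i + ( j - 1 )*j/2 ); with UPLO = 'L',
+* a( i, j ), i .GE. j, lives in
+* AP( i - j + 1 + ( j - 1 )*( 2*n - j + 2 )/2 ).  The loops further
+* down traverse AP in exactly this column-by-column order.)
+*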
+* + IF( ( N.EQ.0 ).OR.( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* Set up the start points in X and Y. +* + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( N - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( N - 1 )*INCY + END IF +* +* Start the operations. In this version the elements of the array AP +* are accessed sequentially with one pass through AP. +* +* First form y := beta*y. +* + IF( BETA.NE.ONE )THEN + IF( INCY.EQ.1 )THEN + IF( BETA.EQ.ZERO )THEN + DO 10, I = 1, N + Y( I ) = ZERO + 10 CONTINUE + ELSE + DO 20, I = 1, N + Y( I ) = BETA*Y( I ) + 20 CONTINUE + END IF + ELSE + IY = KY + IF( BETA.EQ.ZERO )THEN + DO 30, I = 1, N + Y( IY ) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40, I = 1, N + Y( IY ) = BETA*Y( IY ) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF( ALPHA.EQ.ZERO ) + $ RETURN + KK = 1 + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form y when AP contains the upper triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 60, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + K = KK + DO 50, I = 1, J - 1 + Y( I ) = Y( I ) + TEMP1*AP( K ) + TEMP2 = TEMP2 + DCONJG( AP( K ) )*X( I ) + K = K + 1 + 50 CONTINUE + Y( J ) = Y( J ) + TEMP1*DBLE( AP( KK + J - 1 ) ) + $ + ALPHA*TEMP2 + KK = KK + J + 60 CONTINUE + ELSE + JX = KX + JY = KY + DO 80, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + IX = KX + IY = KY + DO 70, K = KK, KK + J - 2 + Y( IY ) = Y( IY ) + TEMP1*AP( K ) + TEMP2 = TEMP2 + DCONJG( AP( K ) )*X( IX ) + IX = IX + INCX + IY = IY + INCY + 70 CONTINUE + Y( JY ) = Y( JY ) + TEMP1*DBLE( AP( KK + J - 1 ) ) + $ + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + KK = KK + J + 80 CONTINUE + END IF + ELSE +* +* Form y when AP contains the lower triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 100, J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + Y( J ) = Y( J ) + TEMP1*DBLE( AP( KK ) ) + K = KK + 1 + DO 90, I = J + 1, N + Y( I ) = Y( I ) + TEMP1*AP( K ) + TEMP2 = TEMP2 + DCONJG( AP( K ) )*X( I ) + K = K + 1 + 90 CONTINUE + Y( J ) = Y( J ) + ALPHA*TEMP2 + KK = KK + ( N - J + 1 ) + 100 CONTINUE + ELSE + JX = KX + JY = KY + DO 120, J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + Y( JY ) = Y( JY ) + TEMP1*DBLE( AP( KK ) ) + IX = JX + IY = JY + DO 110, K = KK + 1, KK + N - J + IX = IX + INCX + IY = IY + INCY + Y( IY ) = Y( IY ) + TEMP1*AP( K ) + TEMP2 = TEMP2 + DCONJG( AP( K ) )*X( IX ) + 110 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + KK = KK + ( N - J + 1 ) + 120 CONTINUE + END IF + END IF +* + RETURN +* +* End of ZHPMV . +* + END diff --git a/reference/zhpr2f.f b/reference/zhpr2f.f new file mode 100644 index 0000000000..462913d4a2 --- /dev/null +++ b/reference/zhpr2f.f @@ -0,0 +1,251 @@ + SUBROUTINE ZHPR2F( UPLO, N, ALPHA, X, INCX, Y, INCY, AP ) +* .. Scalar Arguments .. + COMPLEX*16 ALPHA + INTEGER INCX, INCY, N + CHARACTER*1 UPLO +* .. Array Arguments .. + COMPLEX*16 AP( * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* ZHPR2 performs the hermitian rank 2 operation +* +* A := alpha*x*conjg( y' ) + conjg( alpha )*y*conjg( x' ) + A, +* +* where alpha is a scalar, x and y are n element vectors and A is an +* n by n hermitian matrix, supplied in packed form. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the matrix A is supplied in the packed +* array AP as follows: +* +* UPLO = 'U' or 'u' The upper triangular part of A is +* supplied in AP. 
+* +* UPLO = 'L' or 'l' The lower triangular part of A is +* supplied in AP. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX*16 . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X - COMPLEX*16 array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* Y - COMPLEX*16 array of dimension at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. +* Unchanged on exit. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* AP - COMPLEX*16 array of DIMENSION at least +* ( ( n*( n + 1 ) )/2 ). +* Before entry with UPLO = 'U' or 'u', the array AP must +* contain the upper triangular part of the hermitian matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) +* and a( 2, 2 ) respectively, and so on. On exit, the array +* AP is overwritten by the upper triangular part of the +* updated matrix. +* Before entry with UPLO = 'L' or 'l', the array AP must +* contain the lower triangular part of the hermitian matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) +* and a( 3, 1 ) respectively, and so on. On exit, the array +* AP is overwritten by the lower triangular part of the +* updated matrix. +* Note that the imaginary parts of the diagonal elements need +* not be set, they are assumed to be zero, and on exit they +* are set to zero. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. Local Scalars .. + COMPLEX*16 TEMP1, TEMP2 + INTEGER I, INFO, IX, IY, J, JX, JY, K, KK, KX, KY +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC DCONJG, DBLE +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 5 + ELSE IF( INCY.EQ.0 )THEN + INFO = 7 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'ZHPR2 ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) + $ RETURN +* +* Set up the start points in X and Y if the increments are not both +* unity. +* + IF( ( INCX.NE.1 ).OR.( INCY.NE.1 ) )THEN + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( N - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( N - 1 )*INCY + END IF + JX = KX + JY = KY + END IF +* +* Start the operations. In this version the elements of the array AP +* are accessed sequentially with one pass through AP. +* + KK = 1 + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form A when upper triangle is stored in AP. 
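+* (Column J of the packed upper triangle occupies AP( KK ) through
+* AP( KK + J - 1 ), with KK = 1 + J*( J - 1 )/2; the last of these
+* is the real diagonal entry a( j, j ).)  For illustration only, a
+* unit-stride call with placeholder names would be
+*
+* CALL ZHPR2F( 'U', N, ALPHA, X, 1, Y, 1, AP )
+*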
+* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 20, J = 1, N + IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN + TEMP1 = ALPHA*DCONJG( Y( J ) ) + TEMP2 = DCONJG( ALPHA*X( J ) ) + K = KK + DO 10, I = 1, J - 1 + AP( K ) = AP( K ) + X( I )*TEMP1 + Y( I )*TEMP2 + K = K + 1 + 10 CONTINUE + AP( KK + J - 1 ) = DBLE( AP( KK + J - 1 ) ) + + $ DBLE( X( J )*TEMP1 + Y( J )*TEMP2 ) + ELSE + AP( KK + J - 1 ) = DBLE( AP( KK + J - 1 ) ) + END IF + KK = KK + J + 20 CONTINUE + ELSE + DO 40, J = 1, N + IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN + TEMP1 = ALPHA*DCONJG( Y( JY ) ) + TEMP2 = DCONJG( ALPHA*X( JX ) ) + IX = KX + IY = KY + DO 30, K = KK, KK + J - 2 + AP( K ) = AP( K ) + X( IX )*TEMP1 + Y( IY )*TEMP2 + IX = IX + INCX + IY = IY + INCY + 30 CONTINUE + AP( KK + J - 1 ) = DBLE( AP( KK + J - 1 ) ) + + $ DBLE( X( JX )*TEMP1 + + $ Y( JY )*TEMP2 ) + ELSE + AP( KK + J - 1 ) = DBLE( AP( KK + J - 1 ) ) + END IF + JX = JX + INCX + JY = JY + INCY + KK = KK + J + 40 CONTINUE + END IF + ELSE +* +* Form A when lower triangle is stored in AP. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 60, J = 1, N + IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN + TEMP1 = ALPHA*DCONJG( Y( J ) ) + TEMP2 = DCONJG( ALPHA*X( J ) ) + AP( KK ) = DBLE( AP( KK ) ) + + $ DBLE( X( J )*TEMP1 + Y( J )*TEMP2 ) + K = KK + 1 + DO 50, I = J + 1, N + AP( K ) = AP( K ) + X( I )*TEMP1 + Y( I )*TEMP2 + K = K + 1 + 50 CONTINUE + ELSE + AP( KK ) = DBLE( AP( KK ) ) + END IF + KK = KK + N - J + 1 + 60 CONTINUE + ELSE + DO 80, J = 1, N + IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN + TEMP1 = ALPHA*DCONJG( Y( JY ) ) + TEMP2 = DCONJG( ALPHA*X( JX ) ) + AP( KK ) = DBLE( AP( KK ) ) + + $ DBLE( X( JX )*TEMP1 + Y( JY )*TEMP2 ) + IX = JX + IY = JY + DO 70, K = KK + 1, KK + N - J + IX = IX + INCX + IY = IY + INCY + AP( K ) = AP( K ) + X( IX )*TEMP1 + Y( IY )*TEMP2 + 70 CONTINUE + ELSE + AP( KK ) = DBLE( AP( KK ) ) + END IF + JX = JX + INCX + JY = JY + INCY + KK = KK + N - J + 1 + 80 CONTINUE + END IF + END IF +* + RETURN +* +* End of ZHPR2 . +* + END diff --git a/reference/zhprf.f b/reference/zhprf.f new file mode 100644 index 0000000000..2c93f1e7e4 --- /dev/null +++ b/reference/zhprf.f @@ -0,0 +1,217 @@ + SUBROUTINE ZHPRF ( UPLO, N, ALPHA, X, INCX, AP ) +* .. Scalar Arguments .. + DOUBLE PRECISION ALPHA + INTEGER INCX, N + CHARACTER*1 UPLO +* .. Array Arguments .. + COMPLEX*16 AP( * ), X( * ) +* .. +* +* Purpose +* ======= +* +* ZHPR performs the hermitian rank 1 operation +* +* A := alpha*x*conjg( x' ) + A, +* +* where alpha is a real scalar, x is an n element vector and A is an +* n by n hermitian matrix, supplied in packed form. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the matrix A is supplied in the packed +* array AP as follows: +* +* UPLO = 'U' or 'u' The upper triangular part of A is +* supplied in AP. +* +* UPLO = 'L' or 'l' The lower triangular part of A is +* supplied in AP. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - DOUBLE PRECISION. +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X - COMPLEX*16 array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. 
+* Unchanged on exit. +* +* AP - COMPLEX*16 array of DIMENSION at least +* ( ( n*( n + 1 ) )/2 ). +* Before entry with UPLO = 'U' or 'u', the array AP must +* contain the upper triangular part of the hermitian matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) +* and a( 2, 2 ) respectively, and so on. On exit, the array +* AP is overwritten by the upper triangular part of the +* updated matrix. +* Before entry with UPLO = 'L' or 'l', the array AP must +* contain the lower triangular part of the hermitian matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) +* and a( 3, 1 ) respectively, and so on. On exit, the array +* AP is overwritten by the lower triangular part of the +* updated matrix. +* Note that the imaginary parts of the diagonal elements need +* not be set, they are assumed to be zero, and on exit they +* are set to zero. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. Local Scalars .. + COMPLEX*16 TEMP + INTEGER I, INFO, IX, J, JX, K, KK, KX +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC DCONJG, DBLE +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 5 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'ZHPR ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ALPHA.EQ.DBLE( ZERO ) ) ) + $ RETURN +* +* Set the start point in X if the increment is not unity. +* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of the array AP +* are accessed sequentially with one pass through AP. +* + KK = 1 + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form A when upper triangle is stored in AP. +* + IF( INCX.EQ.1 )THEN + DO 20, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = ALPHA*DCONJG( X( J ) ) + K = KK + DO 10, I = 1, J - 1 + AP( K ) = AP( K ) + X( I )*TEMP + K = K + 1 + 10 CONTINUE + AP( KK + J - 1 ) = DBLE( AP( KK + J - 1 ) ) + $ + DBLE( X( J )*TEMP ) + ELSE + AP( KK + J - 1 ) = DBLE( AP( KK + J - 1 ) ) + END IF + KK = KK + J + 20 CONTINUE + ELSE + JX = KX + DO 40, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*DCONJG( X( JX ) ) + IX = KX + DO 30, K = KK, KK + J - 2 + AP( K ) = AP( K ) + X( IX )*TEMP + IX = IX + INCX + 30 CONTINUE + AP( KK + J - 1 ) = DBLE( AP( KK + J - 1 ) ) + $ + DBLE( X( JX )*TEMP ) + ELSE + AP( KK + J - 1 ) = DBLE( AP( KK + J - 1 ) ) + END IF + JX = JX + INCX + KK = KK + J + 40 CONTINUE + END IF + ELSE +* +* Form A when lower triangle is stored in AP. 
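For the lower ('L') branch used further below, the analogous closed form is sketched here; KPACKL is again an illustrative name only, assuming a packed matrix of order N:

      INTEGER FUNCTION KPACKL( N, I, J )
*     Packed index of a( i, j ), i >= j, for lower ('L') storage:
*     column j starts at KK = 1 + ( J-1 )*( 2*N-J+2 )/2, matching the
*     update KK = KK + N - J + 1 used in the loops below.
      INTEGER N, I, J
      KPACKL = ( ( J - 1 )*( 2*N - J + 2 ) )/2 + ( I - J + 1 )
      RETURN
      END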
+* + IF( INCX.EQ.1 )THEN + DO 60, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = ALPHA*DCONJG( X( J ) ) + AP( KK ) = DBLE( AP( KK ) ) + DBLE( TEMP*X( J ) ) + K = KK + 1 + DO 50, I = J + 1, N + AP( K ) = AP( K ) + X( I )*TEMP + K = K + 1 + 50 CONTINUE + ELSE + AP( KK ) = DBLE( AP( KK ) ) + END IF + KK = KK + N - J + 1 + 60 CONTINUE + ELSE + JX = KX + DO 80, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = ALPHA*DCONJG( X( JX ) ) + AP( KK ) = DBLE( AP( KK ) ) + DBLE( TEMP*X( JX ) ) + IX = JX + DO 70, K = KK + 1, KK + N - J + IX = IX + INCX + AP( K ) = AP( K ) + X( IX )*TEMP + 70 CONTINUE + ELSE + AP( KK ) = DBLE( AP( KK ) ) + END IF + JX = JX + INCX + KK = KK + N - J + 1 + 80 CONTINUE + END IF + END IF +* + RETURN +* +* End of ZHPR . +* + END diff --git a/reference/zlaswpf.f b/reference/zlaswpf.f new file mode 100644 index 0000000000..582f15b52c --- /dev/null +++ b/reference/zlaswpf.f @@ -0,0 +1,120 @@ + SUBROUTINE ZLASWPF( N, A, LDA, K1, K2, IPIV, INCX ) +* +* -- LAPACK auxiliary routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* June 30, 1999 +* +* .. Scalar Arguments .. + INTEGER INCX, K1, K2, LDA, N +* .. +* .. Array Arguments .. + INTEGER IPIV( * ) + COMPLEX*16 A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* ZLASWP performs a series of row interchanges on the matrix A. +* One row interchange is initiated for each of rows K1 through K2 of A. +* +* Arguments +* ========= +* +* N (input) INTEGER +* The number of columns of the matrix A. +* +* A (input/output) COMPLEX*16 array, dimension (LDA,N) +* On entry, the matrix of column dimension N to which the row +* interchanges will be applied. +* On exit, the permuted matrix. +* +* LDA (input) INTEGER +* The leading dimension of the array A. +* +* K1 (input) INTEGER +* The first element of IPIV for which a row interchange will +* be done. +* +* K2 (input) INTEGER +* The last element of IPIV for which a row interchange will +* be done. +* +* IPIV (input) INTEGER array, dimension (M*abs(INCX)) +* The vector of pivot indices. Only the elements in positions +* K1 through K2 of IPIV are accessed. +* IPIV(K) = L implies rows K and L are to be interchanged. +* +* INCX (input) INTEGER +* The increment between successive values of IPIV. If IPIV +* is negative, the pivots are applied in reverse order. +* +* Further Details +* =============== +* +* Modified by +* R. C. Whaley, Computer Science Dept., Univ. of Tenn., Knoxville, USA +* +* ===================================================================== +* +* .. Local Scalars .. + INTEGER I, I1, I2, INC, IP, IX, IX0, J, K, N32 + COMPLEX*16 TEMP +* .. +* .. Executable Statements .. +* +* Interchange row I with row IPIV(I) for each of rows K1 through K2. 
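A small worked example of the interchange loop that follows may help; the program name and data are made up, and it assumes zlaswpf.f is compiled and linked alongside:

      PROGRAM TLASWP
*     Row I is swapped with row IPIV( I ) for I = K1, ..., K2, so the
*     column ( 1, 2, 3 )**T becomes ( 3, 1, 2 )**T after the swaps
*     ( 1, 3 ), ( 2, 3 ) and ( 3, 3 ).
      COMPLEX*16 A( 3, 1 )
      INTEGER IPIV( 3 ), I
      DATA IPIV / 3, 3, 3 /
      DO 10 I = 1, 3
         A( I, 1 ) = DCMPLX( DBLE( I ), 0.0D+0 )
   10 CONTINUE
      CALL ZLASWPF( 1, A, 3, 1, 3, IPIV, 1 )
      WRITE( *, * ) ( A( I, 1 ), I = 1, 3 )
      END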
+* + IF( INCX.GT.0 ) THEN + IX0 = K1 + I1 = K1 + I2 = K2 + INC = 1 + ELSE IF( INCX.LT.0 ) THEN + IX0 = 1 + ( 1-K2 )*INCX + I1 = K2 + I2 = K1 + INC = -1 + ELSE + RETURN + END IF +* + N32 = ( N / 32 )*32 + IF( N32.NE.0 ) THEN + DO 30 J = 1, N32, 32 + IX = IX0 + DO 20 I = I1, I2, INC + IP = IPIV( IX ) + IF( IP.NE.I ) THEN + DO 10 K = J, J + 31 + TEMP = A( I, K ) + A( I, K ) = A( IP, K ) + A( IP, K ) = TEMP + 10 CONTINUE + END IF + IX = IX + INCX + 20 CONTINUE + 30 CONTINUE + END IF + IF( N32.NE.N ) THEN + N32 = N32 + 1 + IX = IX0 + DO 50 I = I1, I2, INC + IP = IPIV( IX ) + IF( IP.NE.I ) THEN + DO 40 K = N32, N + TEMP = A( I, K ) + A( I, K ) = A( IP, K ) + A( IP, K ) = TEMP + 40 CONTINUE + END IF + IX = IX + INCX + 50 CONTINUE + END IF +* + RETURN +* +* End of ZLASWP +* + END diff --git a/reference/zlauu2f.f b/reference/zlauu2f.f new file mode 100644 index 0000000000..f53f99d557 --- /dev/null +++ b/reference/zlauu2f.f @@ -0,0 +1,143 @@ + SUBROUTINE ZLAUU2F( UPLO, N, A, LDA, INFO ) +* +* -- LAPACK auxiliary routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* November 2006 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* ZLAUU2 computes the product U * U' or L' * L, where the triangular +* factor U or L is stored in the upper or lower triangular part of +* the array A. +* +* If UPLO = 'U' or 'u' then the upper triangle of the result is stored, +* overwriting the factor U in A. +* If UPLO = 'L' or 'l' then the lower triangle of the result is stored, +* overwriting the factor L in A. +* +* This is the unblocked form of the algorithm, calling Level 2 BLAS. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* Specifies whether the triangular factor stored in the array A +* is upper or lower triangular: +* = 'U': Upper triangular +* = 'L': Lower triangular +* +* N (input) INTEGER +* The order of the triangular factor U or L. N >= 0. +* +* A (input/output) COMPLEX*16 array, dimension (LDA,N) +* On entry, the triangular factor U or L. +* On exit, if UPLO = 'U', the upper triangle of A is +* overwritten with the upper triangle of the product U * U'; +* if UPLO = 'L', the lower triangle of A is overwritten with +* the lower triangle of the product L' * L. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -k, the k-th argument had an illegal value +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX*16 ONE + PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I + DOUBLE PRECISION AII +* .. +* .. External Functions .. + LOGICAL LSAME + COMPLEX*16 ZDOTC + EXTERNAL LSAME, ZDOTC +* .. +* .. External Subroutines .. + EXTERNAL XERBLA, ZDSCAL, ZGEMV, ZLACGV +* .. +* .. Intrinsic Functions .. + INTRINSIC DBLE, DCMPLX, MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZLAUU2', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* + IF( UPPER ) THEN +* +* Compute the product U * U'. 
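An editorial gloss on the upper-triangle loop that follows: column i of U*U**H is built in place, with ZDOTC adding SUM( |u(i,k)|**2, k > i ) to the squared (real) diagonal entry, and ZGEMV forming u(i,i)*u(1:i-1,i) + U(1:i-1,i+1:n)*conjg( u(i,i+1:n) ) in A(1:i-1,i); ZLACGV conjugates row i in place before and after the call, and the last column (i = n) reduces to the ZDSCAL scaling.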
+* + DO 10 I = 1, N + AII = A( I, I ) + IF( I.LT.N ) THEN + A( I, I ) = AII*AII + DBLE( ZDOTC( N-I, A( I, I+1 ), LDA, + $ A( I, I+1 ), LDA ) ) + CALL ZLACGV( N-I, A( I, I+1 ), LDA ) + CALL ZGEMV( 'No transpose', I-1, N-I, ONE, A( 1, I+1 ), + $ LDA, A( I, I+1 ), LDA, DCMPLX( AII ), + $ A( 1, I ), 1 ) + CALL ZLACGV( N-I, A( I, I+1 ), LDA ) + ELSE + CALL ZDSCAL( I, AII, A( 1, I ), 1 ) + END IF + 10 CONTINUE +* + ELSE +* +* Compute the product L' * L. +* + DO 20 I = 1, N + AII = A( I, I ) + IF( I.LT.N ) THEN + A( I, I ) = AII*AII + DBLE( ZDOTC( N-I, A( I+1, I ), 1, + $ A( I+1, I ), 1 ) ) + CALL ZLACGV( I-1, A( I, 1 ), LDA ) + CALL ZGEMV( 'Conjugate transpose', N-I, I-1, ONE, + $ A( I+1, 1 ), LDA, A( I+1, I ), 1, + $ DCMPLX( AII ), A( I, 1 ), LDA ) + CALL ZLACGV( I-1, A( I, 1 ), LDA ) + ELSE + CALL ZDSCAL( I, AII, A( I, 1 ), LDA ) + END IF + 20 CONTINUE + END IF +* + RETURN +* +* End of ZLAUU2 +* + END diff --git a/reference/zlauumf.f b/reference/zlauumf.f new file mode 100644 index 0000000000..3a84646ef4 --- /dev/null +++ b/reference/zlauumf.f @@ -0,0 +1,160 @@ + SUBROUTINE ZLAUUMF( UPLO, N, A, LDA, INFO ) +* +* -- LAPACK auxiliary routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* September 30, 1994 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* ZLAUUM computes the product U * U' or L' * L, where the triangular +* factor U or L is stored in the upper or lower triangular part of +* the array A. +* +* If UPLO = 'U' or 'u' then the upper triangle of the result is stored, +* overwriting the factor U in A. +* If UPLO = 'L' or 'l' then the lower triangle of the result is stored, +* overwriting the factor L in A. +* +* This is the blocked form of the algorithm, calling Level 3 BLAS. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* Specifies whether the triangular factor stored in the array A +* is upper or lower triangular: +* = 'U': Upper triangular +* = 'L': Lower triangular +* +* N (input) INTEGER +* The order of the triangular factor U or L. N >= 0. +* +* A (input/output) COMPLEX*16 array, dimension (LDA,N) +* On entry, the triangular factor U or L. +* On exit, if UPLO = 'U', the upper triangle of A is +* overwritten with the upper triangle of the product U * U'; +* if UPLO = 'L', the lower triangle of A is overwritten with +* the lower triangle of the product L' * L. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -k, the k-th argument had an illegal value +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ONE + PARAMETER ( ONE = 1.0D+0 ) + COMPLEX*16 CONE + PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, IB, NB +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA, ZGEMM, ZHERK, ZLAUU2, ZTRMM +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + IF( .NOT.UPPER .AND. 
.NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZLAUUM', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* +* Determine the block size for this environment. +* + NB = 128 +* + IF( NB.LE.1 .OR. NB.GE.N ) THEN +* +* Use unblocked code +* + CALL ZLAUU2( UPLO, N, A, LDA, INFO ) + ELSE +* +* Use blocked code +* + IF( UPPER ) THEN +* +* Compute the product U * U'. +* + DO 10 I = 1, N, NB + IB = MIN( NB, N-I+1 ) + CALL ZTRMM( 'Right', 'Upper', 'Conjugate transpose', + $ 'Non-unit', I-1, IB, CONE, A( I, I ), LDA, + $ A( 1, I ), LDA ) + CALL ZLAUU2( 'Upper', IB, A( I, I ), LDA, INFO ) + IF( I+IB.LE.N ) THEN + CALL ZGEMM( 'No transpose', 'Conjugate transpose', + $ I-1, IB, N-I-IB+1, CONE, A( 1, I+IB ), + $ LDA, A( I, I+IB ), LDA, CONE, A( 1, I ), + $ LDA ) + CALL ZHERK( 'Upper', 'No transpose', IB, N-I-IB+1, + $ ONE, A( I, I+IB ), LDA, ONE, A( I, I ), + $ LDA ) + END IF + 10 CONTINUE + ELSE +* +* Compute the product L' * L. +* + DO 20 I = 1, N, NB + IB = MIN( NB, N-I+1 ) + CALL ZTRMM( 'Left', 'Lower', 'Conjugate transpose', + $ 'Non-unit', IB, I-1, CONE, A( I, I ), LDA, + $ A( I, 1 ), LDA ) + CALL ZLAUU2( 'Lower', IB, A( I, I ), LDA, INFO ) + IF( I+IB.LE.N ) THEN + CALL ZGEMM( 'Conjugate transpose', 'No transpose', IB, + $ I-1, N-I-IB+1, CONE, A( I+IB, I ), LDA, + $ A( I+IB, 1 ), LDA, CONE, A( I, 1 ), LDA ) + CALL ZHERK( 'Lower', 'Conjugate transpose', IB, + $ N-I-IB+1, ONE, A( I+IB, I ), LDA, ONE, + $ A( I, I ), LDA ) + END IF + 20 CONTINUE + END IF + END IF +* + RETURN +* +* End of ZLAUUM +* + END diff --git a/reference/zpotf2f.f b/reference/zpotf2f.f new file mode 100644 index 0000000000..bfb6f113d3 --- /dev/null +++ b/reference/zpotf2f.f @@ -0,0 +1,175 @@ + SUBROUTINE ZPOTF2F( UPLO, N, A, LDA, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* September 30, 1994 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* ZPOTF2 computes the Cholesky factorization of a complex Hermitian +* positive definite matrix A. +* +* The factorization has the form +* A = U' * U , if UPLO = 'U', or +* A = L * L', if UPLO = 'L', +* where U is an upper triangular matrix and L is lower triangular. +* +* This is the unblocked version of the algorithm, calling Level 2 BLAS. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* Specifies whether the upper or lower triangular part of the +* Hermitian matrix A is stored. +* = 'U': Upper triangular +* = 'L': Lower triangular +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* A (input/output) COMPLEX*16 array, dimension (LDA,N) +* On entry, the Hermitian matrix A. If UPLO = 'U', the leading +* n by n upper triangular part of A contains the upper +* triangular part of the matrix A, and the strictly lower +* triangular part of A is not referenced. If UPLO = 'L', the +* leading n by n lower triangular part of A contains the lower +* triangular part of the matrix A, and the strictly upper +* triangular part of A is not referenced. +* +* On exit, if INFO = 0, the factor U or L from the Cholesky +* factorization A = U'*U or A = L*L'. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). 
+* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -k, the k-th argument had an illegal value +* > 0: if INFO = k, the leading minor of order k is not +* positive definite, and the factorization could not be +* completed. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ONE, ZERO + PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) + COMPLEX*16 CONE + PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL UPPER + INTEGER J + DOUBLE PRECISION AJJ +* .. +* .. External Functions .. + LOGICAL LSAME + COMPLEX*16 ZDOTC + EXTERNAL LSAME, ZDOTC +* .. +* .. External Subroutines .. + EXTERNAL XERBLA, ZDSCAL, ZGEMV, ZLACGV +* .. +* .. Intrinsic Functions .. + INTRINSIC DBLE, MAX, SQRT +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZPOTF2', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* + IF( UPPER ) THEN +* +* Compute the Cholesky factorization A = U'*U. +* + DO 10 J = 1, N +* +* Compute U(J,J) and test for non-positive-definiteness. +* + AJJ = DBLE( A( J, J ) ) - ZDOTC( J-1, A( 1, J ), 1, + $ A( 1, J ), 1 ) + IF( AJJ.LE.ZERO ) THEN + A( J, J ) = AJJ + GO TO 30 + END IF + AJJ = SQRT( AJJ ) + A( J, J ) = AJJ +* +* Compute elements J+1:N of row J. +* + IF( J.LT.N ) THEN + CALL ZLACGV( J-1, A( 1, J ), 1 ) + CALL ZGEMV( 'Transpose', J-1, N-J, -CONE, A( 1, J+1 ), + $ LDA, A( 1, J ), 1, CONE, A( J, J+1 ), LDA ) + CALL ZLACGV( J-1, A( 1, J ), 1 ) + CALL ZDSCAL( N-J, ONE / AJJ, A( J, J+1 ), LDA ) + END IF + 10 CONTINUE + ELSE +* +* Compute the Cholesky factorization A = L*L'. +* + DO 20 J = 1, N +* +* Compute L(J,J) and test for non-positive-definiteness. +* + AJJ = DBLE( A( J, J ) ) - ZDOTC( J-1, A( J, 1 ), LDA, + $ A( J, 1 ), LDA ) + IF( AJJ.LE.ZERO ) THEN + A( J, J ) = AJJ + GO TO 30 + END IF + AJJ = SQRT( AJJ ) + A( J, J ) = AJJ +* +* Compute elements J+1:N of column J. +* + IF( J.LT.N ) THEN + CALL ZLACGV( J-1, A( J, 1 ), LDA ) + CALL ZGEMV( 'No transpose', N-J, J-1, -CONE, A( J+1, 1 ), + $ LDA, A( J, 1 ), LDA, CONE, A( J+1, J ), 1 ) + CALL ZLACGV( J-1, A( J, 1 ), LDA ) + CALL ZDSCAL( N-J, ONE / AJJ, A( J+1, J ), 1 ) + END IF + 20 CONTINUE + END IF + GO TO 40 +* + 30 CONTINUE + INFO = J +* + 40 CONTINUE + RETURN +* +* End of ZPOTF2 +* + END diff --git a/reference/zpotrff.f b/reference/zpotrff.f new file mode 100644 index 0000000000..7cef580a41 --- /dev/null +++ b/reference/zpotrff.f @@ -0,0 +1,187 @@ + SUBROUTINE ZPOTRFF( UPLO, N, A, LDA, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* September 30, 1994 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* ZPOTRF computes the Cholesky factorization of a complex Hermitian +* positive definite matrix A. +* +* The factorization has the form +* A = U**H * U, if UPLO = 'U', or +* A = L * L**H, if UPLO = 'L', +* where U is an upper triangular matrix and L is lower triangular. +* +* This is the block version of the algorithm, calling Level 3 BLAS. 
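A usage sketch, before the argument descriptions: the program name and test matrix are made up, and it assumes zpotrff.f and zpotrif.f (the latter appears further down in this patch) are compiled and linked with the reference BLAS:

      PROGRAM TPOTRF
*     Cholesky-factor a small Hermitian positive definite matrix and
*     then overwrite it with its inverse.  INFO = 0 is expected twice.
      COMPLEX*16 A( 2, 2 )
      INTEGER INFO
      A( 1, 1 ) = ( 4.0D+0, 0.0D+0 )
      A( 1, 2 ) = ( 1.0D+0, 1.0D+0 )
      A( 2, 1 ) = ( 1.0D+0, -1.0D+0 )
      A( 2, 2 ) = ( 3.0D+0, 0.0D+0 )
      CALL ZPOTRFF( 'U', 2, A, 2, INFO )
      IF( INFO.NE.0 ) STOP
      CALL ZPOTRIF( 'U', 2, A, 2, INFO )
      WRITE( *, * ) 'INFO = ', INFO
      END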
+* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* = 'U': Upper triangle of A is stored; +* = 'L': Lower triangle of A is stored. +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* A (input/output) COMPLEX*16 array, dimension (LDA,N) +* On entry, the Hermitian matrix A. If UPLO = 'U', the leading +* N-by-N upper triangular part of A contains the upper +* triangular part of the matrix A, and the strictly lower +* triangular part of A is not referenced. If UPLO = 'L', the +* leading N-by-N lower triangular part of A contains the lower +* triangular part of the matrix A, and the strictly upper +* triangular part of A is not referenced. +* +* On exit, if INFO = 0, the factor U or L from the Cholesky +* factorization A = U**H*U or A = L*L**H. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, the leading minor of order i is not +* positive definite, and the factorization could not be +* completed. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ONE + COMPLEX*16 CONE + PARAMETER ( ONE = 1.0D+0, CONE = ( 1.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL UPPER + INTEGER J, JB, NB +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA, ZGEMM, ZHERK, ZPOTF2, ZTRSM +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZPOTRF', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* +* Determine the block size for this environment. +* + NB = 56 + + IF( NB.LE.1 .OR. NB.GE.N ) THEN +* +* Use unblocked code. +* + CALL ZPOTF2( UPLO, N, A, LDA, INFO ) + ELSE +* +* Use blocked code. +* + IF( UPPER ) THEN +* +* Compute the Cholesky factorization A = U'*U. +* + DO 10 J = 1, N, NB +* +* Update and factorize the current diagonal block and test +* for non-positive-definiteness. +* + JB = MIN( NB, N-J+1 ) + CALL ZHERK( 'Upper', 'Conjugate transpose', JB, J-1, + $ -ONE, A( 1, J ), LDA, ONE, A( J, J ), LDA ) + CALL ZPOTF2( 'Upper', JB, A( J, J ), LDA, INFO ) + IF( INFO.NE.0 ) + $ GO TO 30 + IF( J+JB.LE.N ) THEN +* +* Compute the current block row. +* + CALL ZGEMM( 'Conjugate transpose', 'No transpose', JB, + $ N-J-JB+1, J-1, -CONE, A( 1, J ), LDA, + $ A( 1, J+JB ), LDA, CONE, A( J, J+JB ), + $ LDA ) + CALL ZTRSM( 'Left', 'Upper', 'Conjugate transpose', + $ 'Non-unit', JB, N-J-JB+1, CONE, A( J, J ), + $ LDA, A( J, J+JB ), LDA ) + END IF + 10 CONTINUE +* + ELSE +* +* Compute the Cholesky factorization A = L*L'. +* + DO 20 J = 1, N, NB +* +* Update and factorize the current diagonal block and test +* for non-positive-definiteness. +* + JB = MIN( NB, N-J+1 ) + CALL ZHERK( 'Lower', 'No transpose', JB, J-1, -ONE, + $ A( J, 1 ), LDA, ONE, A( J, J ), LDA ) + CALL ZPOTF2( 'Lower', JB, A( J, J ), LDA, INFO ) + IF( INFO.NE.0 ) + $ GO TO 30 + IF( J+JB.LE.N ) THEN +* +* Compute the current block column. 
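In block notation (A10, A20, A21 and L11 are labels for submatrices, not program variables), the two calls that follow compute A21 := A21 - A20*A10**H via ZGEMM and then A21 := A21*L11**(-H) via ZTRSM, which yields the current block column L21 of the factor.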
+* + CALL ZGEMM( 'No transpose', 'Conjugate transpose', + $ N-J-JB+1, JB, J-1, -CONE, A( J+JB, 1 ), + $ LDA, A( J, 1 ), LDA, CONE, A( J+JB, J ), + $ LDA ) + CALL ZTRSM( 'Right', 'Lower', 'Conjugate transpose', + $ 'Non-unit', N-J-JB+1, JB, CONE, A( J, J ), + $ LDA, A( J+JB, J ), LDA ) + END IF + 20 CONTINUE + END IF + END IF + GO TO 40 +* + 30 CONTINUE + INFO = INFO + J - 1 +* + 40 CONTINUE + RETURN +* +* End of ZPOTRF +* + END diff --git a/reference/zpotrif.f b/reference/zpotrif.f new file mode 100644 index 0000000000..5a11880bc6 --- /dev/null +++ b/reference/zpotrif.f @@ -0,0 +1,96 @@ + SUBROUTINE ZPOTRIF( UPLO, N, A, LDA, INFO ) +* +* -- LAPACK routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* November 2006 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* ZPOTRI computes the inverse of a complex Hermitian positive definite +* matrix A using the Cholesky factorization A = U**H*U or A = L*L**H +* computed by ZPOTRF. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* = 'U': Upper triangle of A is stored; +* = 'L': Lower triangle of A is stored. +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* A (input/output) COMPLEX*16 array, dimension (LDA,N) +* On entry, the triangular factor U or L from the Cholesky +* factorization A = U**H*U or A = L*L**H, as computed by +* ZPOTRF. +* On exit, the upper or lower triangle of the (Hermitian) +* inverse of A, overwriting the input factor U or L. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). +* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, the (i,i) element of the factor U or L is +* zero, and the inverse could not be computed. +* +* ===================================================================== +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA, ZLAUUM, ZTRTRI +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( N.LT.0 ) THEN + INFO = -2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -4 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZPOTRI', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* +* Invert the triangular Cholesky factor U or L. +* + CALL ZTRTRI( UPLO, 'Non-unit', N, A, LDA, INFO ) + IF( INFO.GT.0 ) + $ RETURN +* +* Form inv(U)*inv(U)' or inv(L)'*inv(L). +* + CALL ZLAUUM( UPLO, N, A, LDA, INFO ) +* + RETURN +* +* End of ZPOTRI +* + END diff --git a/reference/zrotgf.f b/reference/zrotgf.f new file mode 100644 index 0000000000..d6f1e0d681 --- /dev/null +++ b/reference/zrotgf.f @@ -0,0 +1,23 @@ + subroutine zrotgf(ca,cb,c,s) + double complex ca,cb,s + double precision c + double precision norm,scale + double complex alpha + if (cdabs(ca) .ne. 
0.0d0) go to 10 + c = 0.0d0 + s = (1.0d0,0.0d0) + ca = cb + go to 20 + 10 continue + scale = cdabs(ca) + cdabs(cb) + + norm = scale*dsqrt((cdabs(ca/dcmplx(scale,0.0d0)))**2 + + * (cdabs(cb/dcmplx(scale,0.0d0)))**2) + + alpha = ca /cdabs(ca) + c = cdabs(ca) / norm + s = alpha * dconjg(cb) / norm + ca = alpha * norm + 20 continue + return + end diff --git a/reference/zsbmvf.f b/reference/zsbmvf.f new file mode 100644 index 0000000000..2b7787c898 --- /dev/null +++ b/reference/zsbmvf.f @@ -0,0 +1,306 @@ + SUBROUTINE ZSBMVF(UPLO, N, K, ALPHA, A, LDA, X, INCX, BETA, Y, + $ INCY ) +* +* -- LAPACK auxiliary routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* November 2006 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INCX, INCY, K, LDA, N + COMPLEX*16 ALPHA, BETA +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* ZSBMV performs the matrix-vector operation +* +* y := alpha*A*x + beta*y, +* +* where alpha and beta are scalars, x and y are n element vectors and +* A is an n by n symmetric band matrix, with k super-diagonals. +* +* Arguments +* ========== +* +* UPLO - CHARACTER*1 +* On entry, UPLO specifies whether the upper or lower +* triangular part of the band matrix A is being supplied as +* follows: +* +* UPLO = 'U' or 'u' The upper triangular part of A is +* being supplied. +* +* UPLO = 'L' or 'l' The lower triangular part of A is +* being supplied. +* +* Unchanged on exit. +* +* N - INTEGER +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* K - INTEGER +* On entry, K specifies the number of super-diagonals of the +* matrix A. K must satisfy 0 .le. K. +* Unchanged on exit. +* +* ALPHA - COMPLEX*16 +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX*16 array, dimension( LDA, N ) +* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) +* by n part of the array A must contain the upper triangular +* band part of the symmetric matrix, supplied column by +* column, with the leading diagonal of the matrix in row +* ( k + 1 ) of the array, the first super-diagonal starting at +* position 2 in row k, and so on. The top left k by k triangle +* of the array A is not referenced. +* The following program segment will transfer the upper +* triangular part of a symmetric band matrix from conventional +* full matrix storage to band storage: +* +* DO 20, J = 1, N +* M = K + 1 - J +* DO 10, I = MAX( 1, J - K ), J +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) +* by n part of the array A must contain the lower triangular +* band part of the symmetric matrix, supplied column by +* column, with the leading diagonal of the matrix in row 1 of +* the array, the first sub-diagonal starting at position 1 in +* row 2, and so on. The bottom right k by k triangle of the +* array A is not referenced. +* The following program segment will transfer the lower +* triangular part of a symmetric band matrix from conventional +* full matrix storage to band storage: +* +* DO 20, J = 1, N +* M = 1 - J +* DO 10, I = J, MIN( N, J + K ) +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Unchanged on exit. +* +* LDA - INTEGER +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* ( k + 1 ). +* Unchanged on exit. 
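A concrete instance of the band layout described above, for N = 4, K = 1 and UPLO = 'U' (editorial illustration; aij denotes the (i,j) entry of the symmetric matrix):

*        A( 1, : ) = (  * , a12, a23, a34 )   first super-diagonal
*        A( 2, : ) = ( a11, a22, a33, a44 )   main diagonal
*
*     where the entry marked * is not referenced.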
+* +* X - COMPLEX*16 array, dimension at least +* ( 1 + ( N - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the +* vector x. +* Unchanged on exit. +* +* INCX - INTEGER +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA - COMPLEX*16 +* On entry, BETA specifies the scalar beta. +* Unchanged on exit. +* +* Y - COMPLEX*16 array, dimension at least +* ( 1 + ( N - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the +* vector y. On exit, Y is overwritten by the updated vector y. +* +* INCY - INTEGER +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX*16 ONE + PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + INTEGER I, INFO, IX, IY, J, JX, JY, KPLUS1, KX, KY, L + COMPLEX*16 TEMP1, TEMP2 +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = 1 + ELSE IF( N.LT.0 ) THEN + INFO = 2 + ELSE IF( K.LT.0 ) THEN + INFO = 3 + ELSE IF( LDA.LT.( K+1 ) ) THEN + INFO = 6 + ELSE IF( INCX.EQ.0 ) THEN + INFO = 8 + ELSE IF( INCY.EQ.0 ) THEN + INFO = 11 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZSBMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ) .OR. ( ( ALPHA.EQ.ZERO ) .AND. ( BETA.EQ.ONE ) ) ) + $ RETURN +* +* Set up the start points in X and Y. +* + IF( INCX.GT.0 ) THEN + KX = 1 + ELSE + KX = 1 - ( N-1 )*INCX + END IF + IF( INCY.GT.0 ) THEN + KY = 1 + ELSE + KY = 1 - ( N-1 )*INCY + END IF +* +* Start the operations. In this version the elements of the array A +* are accessed sequentially with one pass through A. +* +* First form y := beta*y. +* + IF( BETA.NE.ONE ) THEN + IF( INCY.EQ.1 ) THEN + IF( BETA.EQ.ZERO ) THEN + DO 10 I = 1, N + Y( I ) = ZERO + 10 CONTINUE + ELSE + DO 20 I = 1, N + Y( I ) = BETA*Y( I ) + 20 CONTINUE + END IF + ELSE + IY = KY + IF( BETA.EQ.ZERO ) THEN + DO 30 I = 1, N + Y( IY ) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40 I = 1, N + Y( IY ) = BETA*Y( IY ) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF( ALPHA.EQ.ZERO ) + $ RETURN + IF( LSAME( UPLO, 'U' ) ) THEN +* +* Form y when upper triangle of A is stored. +* + KPLUS1 = K + 1 + IF( ( INCX.EQ.1 ) .AND. ( INCY.EQ.1 ) ) THEN + DO 60 J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + L = KPLUS1 - J + DO 50 I = MAX( 1, J-K ), J - 1 + Y( I ) = Y( I ) + TEMP1*A( L+I, J ) + TEMP2 = TEMP2 + A( L+I, J )*X( I ) + 50 CONTINUE + Y( J ) = Y( J ) + TEMP1*A( KPLUS1, J ) + ALPHA*TEMP2 + 60 CONTINUE + ELSE + JX = KX + JY = KY + DO 80 J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + IX = KX + IY = KY + L = KPLUS1 - J + DO 70 I = MAX( 1, J-K ), J - 1 + Y( IY ) = Y( IY ) + TEMP1*A( L+I, J ) + TEMP2 = TEMP2 + A( L+I, J )*X( IX ) + IX = IX + INCX + IY = IY + INCY + 70 CONTINUE + Y( JY ) = Y( JY ) + TEMP1*A( KPLUS1, J ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + IF( J.GT.K ) THEN + KX = KX + INCX + KY = KY + INCY + END IF + 80 CONTINUE + END IF + ELSE +* +* Form y when lower triangle of A is stored. +* + IF( ( INCX.EQ.1 ) .AND. 
( INCY.EQ.1 ) ) THEN + DO 100 J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + Y( J ) = Y( J ) + TEMP1*A( 1, J ) + L = 1 - J + DO 90 I = J + 1, MIN( N, J+K ) + Y( I ) = Y( I ) + TEMP1*A( L+I, J ) + TEMP2 = TEMP2 + A( L+I, J )*X( I ) + 90 CONTINUE + Y( J ) = Y( J ) + ALPHA*TEMP2 + 100 CONTINUE + ELSE + JX = KX + JY = KY + DO 120 J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + Y( JY ) = Y( JY ) + TEMP1*A( 1, J ) + L = 1 - J + IX = JX + IY = JY + DO 110 I = J + 1, MIN( N, J+K ) + IX = IX + INCX + IY = IY + INCY + Y( IY ) = Y( IY ) + TEMP1*A( L+I, J ) + TEMP2 = TEMP2 + A( L+I, J )*X( IX ) + 110 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + 120 CONTINUE + END IF + END IF +* + RETURN +* +* End of ZSBMV +* + END diff --git a/reference/zscalf.f b/reference/zscalf.f new file mode 100644 index 0000000000..f9c2c534b5 --- /dev/null +++ b/reference/zscalf.f @@ -0,0 +1,29 @@ + subroutine zscalf(n,za,zx,incx) +c +c scales a vector by a constant. +c jack dongarra, 3/11/78. +c modified 3/93 to return if incx .le. 0. +c modified 12/3/93, array(1) declarations changed to array(*) +c + double complex za,zx(*) + integer i,incx,ix,n +c + if( n.le.0 .or. incx.le.0 )return + if(incx.eq.1)go to 20 +c +c code for increment not equal to 1 +c + ix = 1 + do 10 i = 1,n + zx(ix) = za*zx(ix) + ix = ix + incx + 10 continue + return +c +c code for increment equal to 1 +c + 20 do 30 i = 1,n + zx(i) = za*zx(i) + 30 continue + return + end diff --git a/reference/zspmvf.f b/reference/zspmvf.f new file mode 100644 index 0000000000..8c6057ee32 --- /dev/null +++ b/reference/zspmvf.f @@ -0,0 +1,264 @@ + SUBROUTINE ZSPMVF(UPLO, N, ALPHA, AP, X, INCX, BETA, Y, INCY ) +* +* -- LAPACK auxiliary routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* November 2006 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INCX, INCY, N + COMPLEX*16 ALPHA, BETA +* .. +* .. Array Arguments .. + COMPLEX*16 AP( * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* ZSPMV performs the matrix-vector operation +* +* y := alpha*A*x + beta*y, +* +* where alpha and beta are scalars, x and y are n element vectors and +* A is an n by n symmetric matrix, supplied in packed form. +* +* Arguments +* ========== +* +* UPLO (input) CHARACTER*1 +* On entry, UPLO specifies whether the upper or lower +* triangular part of the matrix A is supplied in the packed +* array AP as follows: +* +* UPLO = 'U' or 'u' The upper triangular part of A is +* supplied in AP. +* +* UPLO = 'L' or 'l' The lower triangular part of A is +* supplied in AP. +* +* Unchanged on exit. +* +* N (input) INTEGER +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA (input) COMPLEX*16 +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* AP (input) COMPLEX*16 array, dimension at least +* ( ( N*( N + 1 ) )/2 ). +* Before entry, with UPLO = 'U' or 'u', the array AP must +* contain the upper triangular part of the symmetric matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) +* and a( 2, 2 ) respectively, and so on. +* Before entry, with UPLO = 'L' or 'l', the array AP must +* contain the lower triangular part of the symmetric matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) +* and a( 3, 1 ) respectively, and so on. +* Unchanged on exit. 
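One difference from zhpmvf.f is worth flagging here: ZSPMV treats A as complex symmetric, so the packed entries AP( K ) are used as stored (no DCONJG) and the diagonal AP( KK+J-1 ) is not forced to be real, whereas ZHPMV conjugates the off-diagonal entries and takes DBLE of the diagonal. The packed layout itself is identical in both routines.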
+* +* X (input) COMPLEX*16 array, dimension at least +* ( 1 + ( N - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the N- +* element vector x. +* Unchanged on exit. +* +* INCX (input) INTEGER +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA (input) COMPLEX*16 +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then Y need not be set on input. +* Unchanged on exit. +* +* Y (input/output) COMPLEX*16 array, dimension at least +* ( 1 + ( N - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. On exit, Y is overwritten by the updated +* vector y. +* +* INCY (input) INTEGER +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX*16 ONE + PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + INTEGER I, INFO, IX, IY, J, JX, JY, K, KK, KX, KY + COMPLEX*16 TEMP1, TEMP2 +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = 1 + ELSE IF( N.LT.0 ) THEN + INFO = 2 + ELSE IF( INCX.EQ.0 ) THEN + INFO = 6 + ELSE IF( INCY.EQ.0 ) THEN + INFO = 9 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZSPMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ) .OR. ( ( ALPHA.EQ.ZERO ) .AND. ( BETA.EQ.ONE ) ) ) + $ RETURN +* +* Set up the start points in X and Y. +* + IF( INCX.GT.0 ) THEN + KX = 1 + ELSE + KX = 1 - ( N-1 )*INCX + END IF + IF( INCY.GT.0 ) THEN + KY = 1 + ELSE + KY = 1 - ( N-1 )*INCY + END IF +* +* Start the operations. In this version the elements of the array AP +* are accessed sequentially with one pass through AP. +* +* First form y := beta*y. +* + IF( BETA.NE.ONE ) THEN + IF( INCY.EQ.1 ) THEN + IF( BETA.EQ.ZERO ) THEN + DO 10 I = 1, N + Y( I ) = ZERO + 10 CONTINUE + ELSE + DO 20 I = 1, N + Y( I ) = BETA*Y( I ) + 20 CONTINUE + END IF + ELSE + IY = KY + IF( BETA.EQ.ZERO ) THEN + DO 30 I = 1, N + Y( IY ) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40 I = 1, N + Y( IY ) = BETA*Y( IY ) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF( ALPHA.EQ.ZERO ) + $ RETURN + KK = 1 + IF( LSAME( UPLO, 'U' ) ) THEN +* +* Form y when AP contains the upper triangle. +* + IF( ( INCX.EQ.1 ) .AND. ( INCY.EQ.1 ) ) THEN + DO 60 J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + K = KK + DO 50 I = 1, J - 1 + Y( I ) = Y( I ) + TEMP1*AP( K ) + TEMP2 = TEMP2 + AP( K )*X( I ) + K = K + 1 + 50 CONTINUE + Y( J ) = Y( J ) + TEMP1*AP( KK+J-1 ) + ALPHA*TEMP2 + KK = KK + J + 60 CONTINUE + ELSE + JX = KX + JY = KY + DO 80 J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + IX = KX + IY = KY + DO 70 K = KK, KK + J - 2 + Y( IY ) = Y( IY ) + TEMP1*AP( K ) + TEMP2 = TEMP2 + AP( K )*X( IX ) + IX = IX + INCX + IY = IY + INCY + 70 CONTINUE + Y( JY ) = Y( JY ) + TEMP1*AP( KK+J-1 ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + KK = KK + J + 80 CONTINUE + END IF + ELSE +* +* Form y when AP contains the lower triangle. +* + IF( ( INCX.EQ.1 ) .AND. 
( INCY.EQ.1 ) ) THEN + DO 100 J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + Y( J ) = Y( J ) + TEMP1*AP( KK ) + K = KK + 1 + DO 90 I = J + 1, N + Y( I ) = Y( I ) + TEMP1*AP( K ) + TEMP2 = TEMP2 + AP( K )*X( I ) + K = K + 1 + 90 CONTINUE + Y( J ) = Y( J ) + ALPHA*TEMP2 + KK = KK + ( N-J+1 ) + 100 CONTINUE + ELSE + JX = KX + JY = KY + DO 120 J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + Y( JY ) = Y( JY ) + TEMP1*AP( KK ) + IX = JX + IY = JY + DO 110 K = KK + 1, KK + N - J + IX = IX + INCX + IY = IY + INCY + Y( IY ) = Y( IY ) + TEMP1*AP( K ) + TEMP2 = TEMP2 + AP( K )*X( IX ) + 110 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + KK = KK + ( N-J+1 ) + 120 CONTINUE + END IF + END IF +* + RETURN +* +* End of ZSPMV +* + END diff --git a/reference/zspr2f.f b/reference/zspr2f.f new file mode 100644 index 0000000000..aad5f718a0 --- /dev/null +++ b/reference/zspr2f.f @@ -0,0 +1,229 @@ + SUBROUTINE ZSPR2F( UPLO, N, ALPHA, X, INCX, Y, INCY, AP ) +* .. Scalar Arguments .. + COMPLEX*16 ALPHA + INTEGER INCX, INCY, N + CHARACTER*1 UPLO +* .. Array Arguments .. + COMPLEX*16 AP( * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* DSPR2 performs the symmetric rank 2 operation +* +* A := alpha*x*y' + alpha*y*x' + A, +* +* where alpha is a scalar, x and y are n element vectors and A is an +* n by n symmetric matrix, supplied in packed form. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the matrix A is supplied in the packed +* array AP as follows: +* +* UPLO = 'U' or 'u' The upper triangular part of A is +* supplied in AP. +* +* UPLO = 'L' or 'l' The lower triangular part of A is +* supplied in AP. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - DOUBLE PRECISION. +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X - DOUBLE PRECISION array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* Y - DOUBLE PRECISION array of dimension at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. +* Unchanged on exit. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* AP - DOUBLE PRECISION array of DIMENSION at least +* ( ( n*( n + 1 ) )/2 ). +* Before entry with UPLO = 'U' or 'u', the array AP must +* contain the upper triangular part of the symmetric matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) +* and a( 2, 2 ) respectively, and so on. On exit, the array +* AP is overwritten by the upper triangular part of the +* updated matrix. +* Before entry with UPLO = 'L' or 'l', the array AP must +* contain the lower triangular part of the symmetric matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) +* and a( 3, 1 ) respectively, and so on. On exit, the array +* AP is overwritten by the lower triangular part of the +* updated matrix. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. 
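The header comments in this file were carried over from the real routine DSPR2 and not fully updated: as the declarations above show, ALPHA, X, Y and AP are COMPLEX*16, and the error message passed to XERBLA further down still reads 'DSPR2 '. The code itself performs the complex symmetric rank 2 update A := alpha*x*y**T + alpha*y*x**T + A, with no conjugation.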
+* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = 0.0D+0 ) +* .. Local Scalars .. + COMPLEX*16 TEMP1, TEMP2 + INTEGER I, INFO, IX, IY, J, JX, JY, K, KK, KX, KY +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 5 + ELSE IF( INCY.EQ.0 )THEN + INFO = 7 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'DSPR2 ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) + $ RETURN +* +* Set up the start points in X and Y if the increments are not both +* unity. +* + IF( ( INCX.NE.1 ).OR.( INCY.NE.1 ) )THEN + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( N - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( N - 1 )*INCY + END IF + JX = KX + JY = KY + END IF +* +* Start the operations. In this version the elements of the array AP +* are accessed sequentially with one pass through AP. +* + KK = 1 + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form A when upper triangle is stored in AP. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 20, J = 1, N + IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( J ) + TEMP2 = ALPHA*X( J ) + K = KK + DO 10, I = 1, J + AP( K ) = AP( K ) + X( I )*TEMP1 + Y( I )*TEMP2 + K = K + 1 + 10 CONTINUE + END IF + KK = KK + J + 20 CONTINUE + ELSE + DO 40, J = 1, N + IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( JY ) + TEMP2 = ALPHA*X( JX ) + IX = KX + IY = KY + DO 30, K = KK, KK + J - 1 + AP( K ) = AP( K ) + X( IX )*TEMP1 + Y( IY )*TEMP2 + IX = IX + INCX + IY = IY + INCY + 30 CONTINUE + END IF + JX = JX + INCX + JY = JY + INCY + KK = KK + J + 40 CONTINUE + END IF + ELSE +* +* Form A when lower triangle is stored in AP. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 60, J = 1, N + IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( J ) + TEMP2 = ALPHA*X( J ) + K = KK + DO 50, I = J, N + AP( K ) = AP( K ) + X( I )*TEMP1 + Y( I )*TEMP2 + K = K + 1 + 50 CONTINUE + END IF + KK = KK + N - J + 1 + 60 CONTINUE + ELSE + DO 80, J = 1, N + IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( JY ) + TEMP2 = ALPHA*X( JX ) + IX = JX + IY = JY + DO 70, K = KK, KK + N - J + AP( K ) = AP( K ) + X( IX )*TEMP1 + Y( IY )*TEMP2 + IX = IX + INCX + IY = IY + INCY + 70 CONTINUE + END IF + JX = JX + INCX + JY = JY + INCY + KK = KK + N - J + 1 + 80 CONTINUE + END IF + END IF +* + RETURN +* +* End of DSPR2 . +* + END diff --git a/reference/zsprf.f b/reference/zsprf.f new file mode 100644 index 0000000000..c21f6020b4 --- /dev/null +++ b/reference/zsprf.f @@ -0,0 +1,213 @@ + SUBROUTINE ZSPRF( UPLO, N, ALPHA, X, INCX, AP ) +* +* -- LAPACK auxiliary routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* November 2006 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INCX, N + COMPLEX*16 ALPHA +* .. +* .. Array Arguments .. + COMPLEX*16 AP( * ), X( * ) +* .. 
+* +* Purpose +* ======= +* +* ZSPR performs the symmetric rank 1 operation +* +* A := alpha*x*conjg( x' ) + A, +* +* where alpha is a complex scalar, x is an n element vector and A is an +* n by n symmetric matrix, supplied in packed form. +* +* Arguments +* ========== +* +* UPLO (input) CHARACTER*1 +* On entry, UPLO specifies whether the upper or lower +* triangular part of the matrix A is supplied in the packed +* array AP as follows: +* +* UPLO = 'U' or 'u' The upper triangular part of A is +* supplied in AP. +* +* UPLO = 'L' or 'l' The lower triangular part of A is +* supplied in AP. +* +* Unchanged on exit. +* +* N (input) INTEGER +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA (input) COMPLEX*16 +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X (input) COMPLEX*16 array, dimension at least +* ( 1 + ( N - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the N- +* element vector x. +* Unchanged on exit. +* +* INCX (input) INTEGER +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* AP (input/output) COMPLEX*16 array, dimension at least +* ( ( N*( N + 1 ) )/2 ). +* Before entry, with UPLO = 'U' or 'u', the array AP must +* contain the upper triangular part of the symmetric matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 1, 2 ) +* and a( 2, 2 ) respectively, and so on. On exit, the array +* AP is overwritten by the upper triangular part of the +* updated matrix. +* Before entry, with UPLO = 'L' or 'l', the array AP must +* contain the lower triangular part of the symmetric matrix +* packed sequentially, column by column, so that AP( 1 ) +* contains a( 1, 1 ), AP( 2 ) and AP( 3 ) contain a( 2, 1 ) +* and a( 3, 1 ) respectively, and so on. On exit, the array +* AP is overwritten by the lower triangular part of the +* updated matrix. +* Note that the imaginary parts of the diagonal elements need +* not be set, they are assumed to be zero, and on exit they +* are set to zero. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + INTEGER I, INFO, IX, J, JX, K, KK, KX + COMPLEX*16 TEMP +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = 1 + ELSE IF( N.LT.0 ) THEN + INFO = 2 + ELSE IF( INCX.EQ.0 ) THEN + INFO = 5 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZSPR ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ) .OR. ( ALPHA.EQ.ZERO ) ) + $ RETURN +* +* Set the start point in X if the increment is not unity. +* + IF( INCX.LE.0 ) THEN + KX = 1 - ( N-1 )*INCX + ELSE IF( INCX.NE.1 ) THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of the array AP +* are accessed sequentially with one pass through AP. +* + KK = 1 + IF( LSAME( UPLO, 'U' ) ) THEN +* +* Form A when upper triangle is stored in AP. 
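Despite the conjg( x' ) in the Purpose text above, the updates below use TEMP = ALPHA*X( J ) and X( I )*TEMP with no DCONJG, and the diagonal is not forced to be real; that is, ZSPR forms the complex symmetric rank 1 update A := alpha*x*x**T + A, in contrast to the Hermitian zhprf.f earlier in this patch.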
+* + IF( INCX.EQ.1 ) THEN + DO 20 J = 1, N + IF( X( J ).NE.ZERO ) THEN + TEMP = ALPHA*X( J ) + K = KK + DO 10 I = 1, J - 1 + AP( K ) = AP( K ) + X( I )*TEMP + K = K + 1 + 10 CONTINUE + AP( KK+J-1 ) = AP( KK+J-1 ) + X( J )*TEMP + ELSE + AP( KK+J-1 ) = AP( KK+J-1 ) + END IF + KK = KK + J + 20 CONTINUE + ELSE + JX = KX + DO 40 J = 1, N + IF( X( JX ).NE.ZERO ) THEN + TEMP = ALPHA*X( JX ) + IX = KX + DO 30 K = KK, KK + J - 2 + AP( K ) = AP( K ) + X( IX )*TEMP + IX = IX + INCX + 30 CONTINUE + AP( KK+J-1 ) = AP( KK+J-1 ) + X( JX )*TEMP + ELSE + AP( KK+J-1 ) = AP( KK+J-1 ) + END IF + JX = JX + INCX + KK = KK + J + 40 CONTINUE + END IF + ELSE +* +* Form A when lower triangle is stored in AP. +* + IF( INCX.EQ.1 ) THEN + DO 60 J = 1, N + IF( X( J ).NE.ZERO ) THEN + TEMP = ALPHA*X( J ) + AP( KK ) = AP( KK ) + TEMP*X( J ) + K = KK + 1 + DO 50 I = J + 1, N + AP( K ) = AP( K ) + X( I )*TEMP + K = K + 1 + 50 CONTINUE + ELSE + AP( KK ) = AP( KK ) + END IF + KK = KK + N - J + 1 + 60 CONTINUE + ELSE + JX = KX + DO 80 J = 1, N + IF( X( JX ).NE.ZERO ) THEN + TEMP = ALPHA*X( JX ) + AP( KK ) = AP( KK ) + TEMP*X( JX ) + IX = JX + DO 70 K = KK + 1, KK + N - J + IX = IX + INCX + AP( K ) = AP( K ) + X( IX )*TEMP + 70 CONTINUE + ELSE + AP( KK ) = AP( KK ) + END IF + JX = JX + INCX + KK = KK + N - J + 1 + 80 CONTINUE + END IF + END IF +* + RETURN +* +* End of ZSPR +* + END diff --git a/reference/zswapf.f b/reference/zswapf.f new file mode 100644 index 0000000000..f42d7ec5d5 --- /dev/null +++ b/reference/zswapf.f @@ -0,0 +1,36 @@ + subroutine zswapf (n,zx,incx,zy,incy) +c +c interchanges two vectors. +c jack dongarra, 3/11/78. +c modified 12/3/93, array(1) declarations changed to array(*) +c + double complex zx(*),zy(*),ztemp + integer i,incx,incy,ix,iy,n +c + if(n.le.0)return + if(incx.eq.1.and.incy.eq.1)go to 20 +c +c code for unequal increments or equal increments not equal +c to 1 +c + ix = 1 + iy = 1 + if(incx.lt.0)ix = (-n+1)*incx + 1 + if(incy.lt.0)iy = (-n+1)*incy + 1 + do 10 i = 1,n + ztemp = zx(ix) + zx(ix) = zy(iy) + zy(iy) = ztemp + ix = ix + incx + iy = iy + incy + 10 continue + return +c +c code for both increments equal to 1 + 20 do 30 i = 1,n + ztemp = zx(i) + zx(i) = zy(i) + zy(i) = ztemp + 30 continue + return + end diff --git a/reference/zsymm3mf.f b/reference/zsymm3mf.f new file mode 100644 index 0000000000..82423babf1 --- /dev/null +++ b/reference/zsymm3mf.f @@ -0,0 +1,296 @@ + SUBROUTINE ZSYMM3MF( SIDE, UPLO, M, N, ALPHA, A, LDA, B, LDB, + $ BETA, C, LDC ) +* .. Scalar Arguments .. + CHARACTER*1 SIDE, UPLO + INTEGER M, N, LDA, LDB, LDC + COMPLEX*16 ALPHA, BETA +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), B( LDB, * ), C( LDC, * ) +* .. +* +* Purpose +* ======= +* +* ZSYMM performs one of the matrix-matrix operations +* +* C := alpha*A*B + beta*C, +* +* or +* +* C := alpha*B*A + beta*C, +* +* where alpha and beta are scalars, A is a symmetric matrix and B and +* C are m by n matrices. +* +* Parameters +* ========== +* +* SIDE - CHARACTER*1. +* On entry, SIDE specifies whether the symmetric matrix A +* appears on the left or right in the operation as follows: +* +* SIDE = 'L' or 'l' C := alpha*A*B + beta*C, +* +* SIDE = 'R' or 'r' C := alpha*B*A + beta*C, +* +* Unchanged on exit. +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the symmetric matrix A is to be +* referenced as follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of the +* symmetric matrix is to be referenced. 
+* +* UPLO = 'L' or 'l' Only the lower triangular part of the +* symmetric matrix is to be referenced. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix C. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix C. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX*16 . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX*16 array of DIMENSION ( LDA, ka ), where ka is +* m when SIDE = 'L' or 'l' and is n otherwise. +* Before entry with SIDE = 'L' or 'l', the m by m part of +* the array A must contain the symmetric matrix, such that +* when UPLO = 'U' or 'u', the leading m by m upper triangular +* part of the array A must contain the upper triangular part +* of the symmetric matrix and the strictly lower triangular +* part of A is not referenced, and when UPLO = 'L' or 'l', +* the leading m by m lower triangular part of the array A +* must contain the lower triangular part of the symmetric +* matrix and the strictly upper triangular part of A is not +* referenced. +* Before entry with SIDE = 'R' or 'r', the n by n part of +* the array A must contain the symmetric matrix, such that +* when UPLO = 'U' or 'u', the leading n by n upper triangular +* part of the array A must contain the upper triangular part +* of the symmetric matrix and the strictly lower triangular +* part of A is not referenced, and when UPLO = 'L' or 'l', +* the leading n by n lower triangular part of the array A +* must contain the lower triangular part of the symmetric +* matrix and the strictly upper triangular part of A is not +* referenced. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When SIDE = 'L' or 'l' then +* LDA must be at least max( 1, m ), otherwise LDA must be at +* least max( 1, n ). +* Unchanged on exit. +* +* B - COMPLEX*16 array of DIMENSION ( LDB, n ). +* Before entry, the leading m by n part of the array B must +* contain the matrix B. +* Unchanged on exit. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. LDB must be at least +* max( 1, m ). +* Unchanged on exit. +* +* BETA - COMPLEX*16 . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then C need not be set on input. +* Unchanged on exit. +* +* C - COMPLEX*16 array of DIMENSION ( LDC, n ). +* Before entry, the leading m by n part of the array C must +* contain the matrix C, except when beta is zero, in which +* case C need not be set on entry. +* On exit, the array C is overwritten by the m by n updated +* matrix. +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, INFO, J, K, NROWA + COMPLEX*16 TEMP1, TEMP2 +* .. Parameters .. 
+ COMPLEX*16 ONE + PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Executable Statements .. +* +* Set NROWA as the number of rows of A. +* + IF( LSAME( SIDE, 'L' ) )THEN + NROWA = M + ELSE + NROWA = N + END IF + UPPER = LSAME( UPLO, 'U' ) +* +* Test the input parameters. +* + INFO = 0 + IF( ( .NOT.LSAME( SIDE, 'L' ) ).AND. + $ ( .NOT.LSAME( SIDE, 'R' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO, 'L' ) ) )THEN + INFO = 2 + ELSE IF( M .LT.0 )THEN + INFO = 3 + ELSE IF( N .LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 7 + ELSE IF( LDB.LT.MAX( 1, M ) )THEN + INFO = 9 + ELSE IF( LDC.LT.MAX( 1, M ) )THEN + INFO = 12 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'ZSYMM ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. + $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* And when alpha.eq.zero. +* + IF( ALPHA.EQ.ZERO )THEN + IF( BETA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, M + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40, J = 1, N + DO 30, I = 1, M + C( I, J ) = BETA*C( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + RETURN + END IF +* +* Start the operations. +* + IF( LSAME( SIDE, 'L' ) )THEN +* +* Form C := alpha*A*B + beta*C. +* + IF( UPPER )THEN + DO 70, J = 1, N + DO 60, I = 1, M + TEMP1 = ALPHA*B( I, J ) + TEMP2 = ZERO + DO 50, K = 1, I - 1 + C( K, J ) = C( K, J ) + TEMP1 *A( K, I ) + TEMP2 = TEMP2 + B( K, J )*A( K, I ) + 50 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = TEMP1*A( I, I ) + ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ TEMP1*A( I, I ) + ALPHA*TEMP2 + END IF + 60 CONTINUE + 70 CONTINUE + ELSE + DO 100, J = 1, N + DO 90, I = M, 1, -1 + TEMP1 = ALPHA*B( I, J ) + TEMP2 = ZERO + DO 80, K = I + 1, M + C( K, J ) = C( K, J ) + TEMP1 *A( K, I ) + TEMP2 = TEMP2 + B( K, J )*A( K, I ) + 80 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = TEMP1*A( I, I ) + ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ TEMP1*A( I, I ) + ALPHA*TEMP2 + END IF + 90 CONTINUE + 100 CONTINUE + END IF + ELSE +* +* Form C := alpha*B*A + beta*C. +* + DO 170, J = 1, N + TEMP1 = ALPHA*A( J, J ) + IF( BETA.EQ.ZERO )THEN + DO 110, I = 1, M + C( I, J ) = TEMP1*B( I, J ) + 110 CONTINUE + ELSE + DO 120, I = 1, M + C( I, J ) = BETA*C( I, J ) + TEMP1*B( I, J ) + 120 CONTINUE + END IF + DO 140, K = 1, J - 1 + IF( UPPER )THEN + TEMP1 = ALPHA*A( K, J ) + ELSE + TEMP1 = ALPHA*A( J, K ) + END IF + DO 130, I = 1, M + C( I, J ) = C( I, J ) + TEMP1*B( I, K ) + 130 CONTINUE + 140 CONTINUE + DO 160, K = J + 1, N + IF( UPPER )THEN + TEMP1 = ALPHA*A( J, K ) + ELSE + TEMP1 = ALPHA*A( K, J ) + END IF + DO 150, I = 1, M + C( I, J ) = C( I, J ) + TEMP1*B( I, K ) + 150 CONTINUE + 160 CONTINUE + 170 CONTINUE + END IF +* + RETURN +* +* End of ZSYMM . +* + END diff --git a/reference/zsymmf.f b/reference/zsymmf.f new file mode 100644 index 0000000000..ce24be4d1b --- /dev/null +++ b/reference/zsymmf.f @@ -0,0 +1,296 @@ + SUBROUTINE ZSYMMF ( SIDE, UPLO, M, N, ALPHA, A, LDA, B, LDB, + $ BETA, C, LDC ) +* .. Scalar Arguments .. + CHARACTER*1 SIDE, UPLO + INTEGER M, N, LDA, LDB, LDC + COMPLEX*16 ALPHA, BETA +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), B( LDB, * ), C( LDC, * ) +* .. +* +* Purpose +* ======= +* +* ZSYMM performs one of the matrix-matrix operations +* +* C := alpha*A*B + beta*C, +* +* or +* +* C := alpha*B*A + beta*C, +* +* where alpha and beta are scalars, A is a symmetric matrix and B and +* C are m by n matrices. 
+* +* Parameters +* ========== +* +* SIDE - CHARACTER*1. +* On entry, SIDE specifies whether the symmetric matrix A +* appears on the left or right in the operation as follows: +* +* SIDE = 'L' or 'l' C := alpha*A*B + beta*C, +* +* SIDE = 'R' or 'r' C := alpha*B*A + beta*C, +* +* Unchanged on exit. +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the symmetric matrix A is to be +* referenced as follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of the +* symmetric matrix is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of the +* symmetric matrix is to be referenced. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of the matrix C. +* M must be at least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of the matrix C. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX*16 . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX*16 array of DIMENSION ( LDA, ka ), where ka is +* m when SIDE = 'L' or 'l' and is n otherwise. +* Before entry with SIDE = 'L' or 'l', the m by m part of +* the array A must contain the symmetric matrix, such that +* when UPLO = 'U' or 'u', the leading m by m upper triangular +* part of the array A must contain the upper triangular part +* of the symmetric matrix and the strictly lower triangular +* part of A is not referenced, and when UPLO = 'L' or 'l', +* the leading m by m lower triangular part of the array A +* must contain the lower triangular part of the symmetric +* matrix and the strictly upper triangular part of A is not +* referenced. +* Before entry with SIDE = 'R' or 'r', the n by n part of +* the array A must contain the symmetric matrix, such that +* when UPLO = 'U' or 'u', the leading n by n upper triangular +* part of the array A must contain the upper triangular part +* of the symmetric matrix and the strictly lower triangular +* part of A is not referenced, and when UPLO = 'L' or 'l', +* the leading n by n lower triangular part of the array A +* must contain the lower triangular part of the symmetric +* matrix and the strictly upper triangular part of A is not +* referenced. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When SIDE = 'L' or 'l' then +* LDA must be at least max( 1, m ), otherwise LDA must be at +* least max( 1, n ). +* Unchanged on exit. +* +* B - COMPLEX*16 array of DIMENSION ( LDB, n ). +* Before entry, the leading m by n part of the array B must +* contain the matrix B. +* Unchanged on exit. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. LDB must be at least +* max( 1, m ). +* Unchanged on exit. +* +* BETA - COMPLEX*16 . +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then C need not be set on input. +* Unchanged on exit. +* +* C - COMPLEX*16 array of DIMENSION ( LDC, n ). +* Before entry, the leading m by n part of the array C must +* contain the matrix C, except when beta is zero, in which +* case C need not be set on entry. +* On exit, the array C is overwritten by the m by n updated +* matrix. +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. 
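+*
+*  Illustrative call (editorial addition, not part of the original
+*  file): with A an m by m symmetric matrix whose upper triangle is
+*  stored in A, the update C := alpha*A*B + beta*C is requested by a
+*  call of the form
+*
+*     CALL ZSYMMF( 'L', 'U', M, N, ALPHA, A, LDA, B, LDB,
+*    $             BETA, C, LDC )
+*
+*  with LDA, LDB and LDC each at least max( 1, M ), as documented
+*  above.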
+* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, INFO, J, K, NROWA + COMPLEX*16 TEMP1, TEMP2 +* .. Parameters .. + COMPLEX*16 ONE + PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Executable Statements .. +* +* Set NROWA as the number of rows of A. +* + IF( LSAME( SIDE, 'L' ) )THEN + NROWA = M + ELSE + NROWA = N + END IF + UPPER = LSAME( UPLO, 'U' ) +* +* Test the input parameters. +* + INFO = 0 + IF( ( .NOT.LSAME( SIDE, 'L' ) ).AND. + $ ( .NOT.LSAME( SIDE, 'R' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO, 'L' ) ) )THEN + INFO = 2 + ELSE IF( M .LT.0 )THEN + INFO = 3 + ELSE IF( N .LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 7 + ELSE IF( LDB.LT.MAX( 1, M ) )THEN + INFO = 9 + ELSE IF( LDC.LT.MAX( 1, M ) )THEN + INFO = 12 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'ZSYMM ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( M.EQ.0 ).OR.( N.EQ.0 ).OR. + $ ( ( ALPHA.EQ.ZERO ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* And when alpha.eq.zero. +* + IF( ALPHA.EQ.ZERO )THEN + IF( BETA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, M + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40, J = 1, N + DO 30, I = 1, M + C( I, J ) = BETA*C( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + RETURN + END IF +* +* Start the operations. +* + IF( LSAME( SIDE, 'L' ) )THEN +* +* Form C := alpha*A*B + beta*C. +* + IF( UPPER )THEN + DO 70, J = 1, N + DO 60, I = 1, M + TEMP1 = ALPHA*B( I, J ) + TEMP2 = ZERO + DO 50, K = 1, I - 1 + C( K, J ) = C( K, J ) + TEMP1 *A( K, I ) + TEMP2 = TEMP2 + B( K, J )*A( K, I ) + 50 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = TEMP1*A( I, I ) + ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ TEMP1*A( I, I ) + ALPHA*TEMP2 + END IF + 60 CONTINUE + 70 CONTINUE + ELSE + DO 100, J = 1, N + DO 90, I = M, 1, -1 + TEMP1 = ALPHA*B( I, J ) + TEMP2 = ZERO + DO 80, K = I + 1, M + C( K, J ) = C( K, J ) + TEMP1 *A( K, I ) + TEMP2 = TEMP2 + B( K, J )*A( K, I ) + 80 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = TEMP1*A( I, I ) + ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ TEMP1*A( I, I ) + ALPHA*TEMP2 + END IF + 90 CONTINUE + 100 CONTINUE + END IF + ELSE +* +* Form C := alpha*B*A + beta*C. +* + DO 170, J = 1, N + TEMP1 = ALPHA*A( J, J ) + IF( BETA.EQ.ZERO )THEN + DO 110, I = 1, M + C( I, J ) = TEMP1*B( I, J ) + 110 CONTINUE + ELSE + DO 120, I = 1, M + C( I, J ) = BETA*C( I, J ) + TEMP1*B( I, J ) + 120 CONTINUE + END IF + DO 140, K = 1, J - 1 + IF( UPPER )THEN + TEMP1 = ALPHA*A( K, J ) + ELSE + TEMP1 = ALPHA*A( J, K ) + END IF + DO 130, I = 1, M + C( I, J ) = C( I, J ) + TEMP1*B( I, K ) + 130 CONTINUE + 140 CONTINUE + DO 160, K = J + 1, N + IF( UPPER )THEN + TEMP1 = ALPHA*A( J, K ) + ELSE + TEMP1 = ALPHA*A( K, J ) + END IF + DO 150, I = 1, M + C( I, J ) = C( I, J ) + TEMP1*B( I, K ) + 150 CONTINUE + 160 CONTINUE + 170 CONTINUE + END IF +* + RETURN +* +* End of ZSYMM . 
+* + END diff --git a/reference/zsymvf.f b/reference/zsymvf.f new file mode 100644 index 0000000000..7161f1a815 --- /dev/null +++ b/reference/zsymvf.f @@ -0,0 +1,264 @@ + SUBROUTINE ZSYMVF(UPLO, N, ALPHA, A, LDA, X, INCX, BETA, Y, INCY ) +* +* -- LAPACK auxiliary routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* November 2006 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INCX, INCY, LDA, N + COMPLEX*16 ALPHA, BETA +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* ZSYMV performs the matrix-vector operation +* +* y := alpha*A*x + beta*y, +* +* where alpha and beta are scalars, x and y are n element vectors and +* A is an n by n symmetric matrix. +* +* Arguments +* ========== +* +* UPLO (input) CHARACTER*1 +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array A is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of A +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of A +* is to be referenced. +* +* Unchanged on exit. +* +* N (input) INTEGER +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA (input) COMPLEX*16 +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A (input) COMPLEX*16 array, dimension ( LDA, N ) +* Before entry, with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array A must contain the upper +* triangular part of the symmetric matrix and the strictly +* lower triangular part of A is not referenced. +* Before entry, with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array A must contain the lower +* triangular part of the symmetric matrix and the strictly +* upper triangular part of A is not referenced. +* Unchanged on exit. +* +* LDA (input) INTEGER +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, N ). +* Unchanged on exit. +* +* X (input) COMPLEX*16 array, dimension at least +* ( 1 + ( N - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the N- +* element vector x. +* Unchanged on exit. +* +* INCX (input) INTEGER +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* BETA (input) COMPLEX*16 +* On entry, BETA specifies the scalar beta. When BETA is +* supplied as zero then Y need not be set on input. +* Unchanged on exit. +* +* Y (input/output) COMPLEX*16 array, dimension at least +* ( 1 + ( N - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. On exit, Y is overwritten by the updated +* vector y. +* +* INCY (input) INTEGER +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX*16 ONE + PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY + COMPLEX*16 TEMP1, TEMP2 +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. 
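+*     Editorial note (descriptive comment, not part of the original
+*     file): INFO records the position of the first illegal argument
+*     in the calling sequence (UPLO = 1, N = 2, LDA = 5, INCX = 7,
+*     INCY = 10 below) and is reported through XERBLA under the
+*     reference BLAS name 'ZSYMV '.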
+* + INFO = 0 + IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = 1 + ELSE IF( N.LT.0 ) THEN + INFO = 2 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = 5 + ELSE IF( INCX.EQ.0 ) THEN + INFO = 7 + ELSE IF( INCY.EQ.0 ) THEN + INFO = 10 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZSYMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ) .OR. ( ( ALPHA.EQ.ZERO ) .AND. ( BETA.EQ.ONE ) ) ) + $ RETURN +* +* Set up the start points in X and Y. +* + IF( INCX.GT.0 ) THEN + KX = 1 + ELSE + KX = 1 - ( N-1 )*INCX + END IF + IF( INCY.GT.0 ) THEN + KY = 1 + ELSE + KY = 1 - ( N-1 )*INCY + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through the triangular part +* of A. +* +* First form y := beta*y. +* + IF( BETA.NE.ONE ) THEN + IF( INCY.EQ.1 ) THEN + IF( BETA.EQ.ZERO ) THEN + DO 10 I = 1, N + Y( I ) = ZERO + 10 CONTINUE + ELSE + DO 20 I = 1, N + Y( I ) = BETA*Y( I ) + 20 CONTINUE + END IF + ELSE + IY = KY + IF( BETA.EQ.ZERO ) THEN + DO 30 I = 1, N + Y( IY ) = ZERO + IY = IY + INCY + 30 CONTINUE + ELSE + DO 40 I = 1, N + Y( IY ) = BETA*Y( IY ) + IY = IY + INCY + 40 CONTINUE + END IF + END IF + END IF + IF( ALPHA.EQ.ZERO ) + $ RETURN + IF( LSAME( UPLO, 'U' ) ) THEN +* +* Form y when A is stored in upper triangle. +* + IF( ( INCX.EQ.1 ) .AND. ( INCY.EQ.1 ) ) THEN + DO 60 J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + DO 50 I = 1, J - 1 + Y( I ) = Y( I ) + TEMP1*A( I, J ) + TEMP2 = TEMP2 + A( I, J )*X( I ) + 50 CONTINUE + Y( J ) = Y( J ) + TEMP1*A( J, J ) + ALPHA*TEMP2 + 60 CONTINUE + ELSE + JX = KX + JY = KY + DO 80 J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + IX = KX + IY = KY + DO 70 I = 1, J - 1 + Y( IY ) = Y( IY ) + TEMP1*A( I, J ) + TEMP2 = TEMP2 + A( I, J )*X( IX ) + IX = IX + INCX + IY = IY + INCY + 70 CONTINUE + Y( JY ) = Y( JY ) + TEMP1*A( J, J ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + 80 CONTINUE + END IF + ELSE +* +* Form y when A is stored in lower triangle. +* + IF( ( INCX.EQ.1 ) .AND. ( INCY.EQ.1 ) ) THEN + DO 100 J = 1, N + TEMP1 = ALPHA*X( J ) + TEMP2 = ZERO + Y( J ) = Y( J ) + TEMP1*A( J, J ) + DO 90 I = J + 1, N + Y( I ) = Y( I ) + TEMP1*A( I, J ) + TEMP2 = TEMP2 + A( I, J )*X( I ) + 90 CONTINUE + Y( J ) = Y( J ) + ALPHA*TEMP2 + 100 CONTINUE + ELSE + JX = KX + JY = KY + DO 120 J = 1, N + TEMP1 = ALPHA*X( JX ) + TEMP2 = ZERO + Y( JY ) = Y( JY ) + TEMP1*A( J, J ) + IX = JX + IY = JY + DO 110 I = J + 1, N + IX = IX + INCX + IY = IY + INCY + Y( IY ) = Y( IY ) + TEMP1*A( I, J ) + TEMP2 = TEMP2 + A( I, J )*X( IX ) + 110 CONTINUE + Y( JY ) = Y( JY ) + ALPHA*TEMP2 + JX = JX + INCX + JY = JY + INCY + 120 CONTINUE + END IF + END IF +* + RETURN +* +* End of ZSYMV +* + END diff --git a/reference/zsyr2f.f b/reference/zsyr2f.f new file mode 100644 index 0000000000..d77e4d211c --- /dev/null +++ b/reference/zsyr2f.f @@ -0,0 +1,230 @@ + SUBROUTINE ZSYR2F ( UPLO, N, ALPHA, X, INCX, Y, INCY, A, LDA ) +* .. Scalar Arguments .. + COMPLEX*16 ALPHA + INTEGER INCX, INCY, LDA, N + CHARACTER*1 UPLO +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), X( * ), Y( * ) +* .. +* +* Purpose +* ======= +* +* DSYR2 performs the symmetric rank 2 operation +* +* A := alpha*x*y' + alpha*y*x' + A, +* +* where alpha is a scalar, x and y are n element vectors and A is an n +* by n symmetric matrix. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. 
+* On entry, UPLO specifies whether the upper or lower +* triangular part of the array A is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of A +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of A +* is to be referenced. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA - DOUBLE PRECISION. +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X - DOUBLE PRECISION array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. +* Unchanged on exit. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* Y - DOUBLE PRECISION array of dimension at least +* ( 1 + ( n - 1 )*abs( INCY ) ). +* Before entry, the incremented array Y must contain the n +* element vector y. +* Unchanged on exit. +* +* INCY - INTEGER. +* On entry, INCY specifies the increment for the elements of +* Y. INCY must not be zero. +* Unchanged on exit. +* +* A - DOUBLE PRECISION array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array A must contain the upper +* triangular part of the symmetric matrix and the strictly +* lower triangular part of A is not referenced. On exit, the +* upper triangular part of the array A is overwritten by the +* upper triangular part of the updated matrix. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array A must contain the lower +* triangular part of the symmetric matrix and the strictly +* upper triangular part of A is not referenced. On exit, the +* lower triangular part of the array A is overwritten by the +* lower triangular part of the updated matrix. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, n ). +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = 0.0D+0 ) +* .. Local Scalars .. + COMPLEX*16 TEMP1, TEMP2 + INTEGER I, INFO, IX, IY, J, JX, JY, KX, KY +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO, 'U' ).AND. + $ .NOT.LSAME( UPLO, 'L' ) )THEN + INFO = 1 + ELSE IF( N.LT.0 )THEN + INFO = 2 + ELSE IF( INCX.EQ.0 )THEN + INFO = 5 + ELSE IF( INCY.EQ.0 )THEN + INFO = 7 + ELSE IF( LDA.LT.MAX( 1, N ) )THEN + INFO = 9 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'DSYR2 ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR.( ALPHA.EQ.ZERO ) ) + $ RETURN +* +* Set up the start points in X and Y if the increments are not both +* unity. +* + IF( ( INCX.NE.1 ).OR.( INCY.NE.1 ) )THEN + IF( INCX.GT.0 )THEN + KX = 1 + ELSE + KX = 1 - ( N - 1 )*INCX + END IF + IF( INCY.GT.0 )THEN + KY = 1 + ELSE + KY = 1 - ( N - 1 )*INCY + END IF + JX = KX + JY = KY + END IF +* +* Start the operations. 
In this version the elements of A are +* accessed sequentially with one pass through the triangular part +* of A. +* + IF( LSAME( UPLO, 'U' ) )THEN +* +* Form A when A is stored in the upper triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 20, J = 1, N + IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( J ) + TEMP2 = ALPHA*X( J ) + DO 10, I = 1, J + A( I, J ) = A( I, J ) + X( I )*TEMP1 + Y( I )*TEMP2 + 10 CONTINUE + END IF + 20 CONTINUE + ELSE + DO 40, J = 1, N + IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( JY ) + TEMP2 = ALPHA*X( JX ) + IX = KX + IY = KY + DO 30, I = 1, J + A( I, J ) = A( I, J ) + X( IX )*TEMP1 + $ + Y( IY )*TEMP2 + IX = IX + INCX + IY = IY + INCY + 30 CONTINUE + END IF + JX = JX + INCX + JY = JY + INCY + 40 CONTINUE + END IF + ELSE +* +* Form A when A is stored in the lower triangle. +* + IF( ( INCX.EQ.1 ).AND.( INCY.EQ.1 ) )THEN + DO 60, J = 1, N + IF( ( X( J ).NE.ZERO ).OR.( Y( J ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( J ) + TEMP2 = ALPHA*X( J ) + DO 50, I = J, N + A( I, J ) = A( I, J ) + X( I )*TEMP1 + Y( I )*TEMP2 + 50 CONTINUE + END IF + 60 CONTINUE + ELSE + DO 80, J = 1, N + IF( ( X( JX ).NE.ZERO ).OR.( Y( JY ).NE.ZERO ) )THEN + TEMP1 = ALPHA*Y( JY ) + TEMP2 = ALPHA*X( JX ) + IX = JX + IY = JY + DO 70, I = J, N + A( I, J ) = A( I, J ) + X( IX )*TEMP1 + $ + Y( IY )*TEMP2 + IX = IX + INCX + IY = IY + INCY + 70 CONTINUE + END IF + JX = JX + INCX + JY = JY + INCY + 80 CONTINUE + END IF + END IF +* + RETURN +* +* End of DSYR2 . +* + END diff --git a/reference/zsyr2kf.f b/reference/zsyr2kf.f new file mode 100644 index 0000000000..f6f0992d9f --- /dev/null +++ b/reference/zsyr2kf.f @@ -0,0 +1,324 @@ + SUBROUTINE ZSYR2KF( UPLO, TRANS, N, K, ALPHA, A, LDA, B, LDB, + $ BETA, C, LDC ) +* .. Scalar Arguments .. + CHARACTER*1 UPLO, TRANS + INTEGER N, K, LDA, LDB, LDC + COMPLEX*16 ALPHA, BETA +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), B( LDB, * ), C( LDC, * ) +* .. +* +* Purpose +* ======= +* +* ZSYR2K performs one of the symmetric rank 2k operations +* +* C := alpha*A*B' + alpha*B*A' + beta*C, +* +* or +* +* C := alpha*A'*B + alpha*B'*A + beta*C, +* +* where alpha and beta are scalars, C is an n by n symmetric matrix +* and A and B are n by k matrices in the first case and k by n +* matrices in the second case. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array C is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of C +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of C +* is to be referenced. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' C := alpha*A*B' + alpha*B*A' + +* beta*C. +* +* TRANS = 'T' or 't' C := alpha*A'*B + alpha*B'*A + +* beta*C. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix C. N must be +* at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry with TRANS = 'N' or 'n', K specifies the number +* of columns of the matrices A and B, and on entry with +* TRANS = 'T' or 't', K specifies the number of rows of the +* matrices A and B. K must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX*16 . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. 
+* +* A - COMPLEX*16 array of DIMENSION ( LDA, ka ), where ka is +* k when TRANS = 'N' or 'n', and is n otherwise. +* Before entry with TRANS = 'N' or 'n', the leading n by k +* part of the array A must contain the matrix A, otherwise +* the leading k by n part of the array A must contain the +* matrix A. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When TRANS = 'N' or 'n' +* then LDA must be at least max( 1, n ), otherwise LDA must +* be at least max( 1, k ). +* Unchanged on exit. +* +* B - COMPLEX*16 array of DIMENSION ( LDB, kb ), where kb is +* k when TRANS = 'N' or 'n', and is n otherwise. +* Before entry with TRANS = 'N' or 'n', the leading n by k +* part of the array B must contain the matrix B, otherwise +* the leading k by n part of the array B must contain the +* matrix B. +* Unchanged on exit. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. When TRANS = 'N' or 'n' +* then LDB must be at least max( 1, n ), otherwise LDB must +* be at least max( 1, k ). +* Unchanged on exit. +* +* BETA - COMPLEX*16 . +* On entry, BETA specifies the scalar beta. +* Unchanged on exit. +* +* C - COMPLEX*16 array of DIMENSION ( LDC, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array C must contain the upper +* triangular part of the symmetric matrix and the strictly +* lower triangular part of C is not referenced. On exit, the +* upper triangular part of the array C is overwritten by the +* upper triangular part of the updated matrix. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array C must contain the lower +* triangular part of the symmetric matrix and the strictly +* upper triangular part of C is not referenced. On exit, the +* lower triangular part of the array C is overwritten by the +* lower triangular part of the updated matrix. +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, n ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, INFO, J, L, NROWA + COMPLEX*16 TEMP1, TEMP2 +* .. Parameters .. + COMPLEX*16 ONE + PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + IF( LSAME( TRANS, 'N' ) )THEN + NROWA = N + ELSE + NROWA = K + END IF + UPPER = LSAME( UPLO, 'U' ) +* + INFO = 0 + IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO , 'L' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.LSAME( TRANS, 'N' ) ).AND. + $ ( .NOT.LSAME( TRANS, 'T' ) ) )THEN + INFO = 2 + ELSE IF( N .LT.0 )THEN + INFO = 3 + ELSE IF( K .LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 7 + ELSE IF( LDB.LT.MAX( 1, NROWA ) )THEN + INFO = 9 + ELSE IF( LDC.LT.MAX( 1, N ) )THEN + INFO = 12 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'ZSYR2K', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR. 
+ $ ( ( ( ALPHA.EQ.ZERO ).OR.( K.EQ.0 ) ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* And when alpha.eq.zero. +* + IF( ALPHA.EQ.ZERO )THEN + IF( UPPER )THEN + IF( BETA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, J + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40, J = 1, N + DO 30, I = 1, J + C( I, J ) = BETA*C( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + ELSE + IF( BETA.EQ.ZERO )THEN + DO 60, J = 1, N + DO 50, I = J, N + C( I, J ) = ZERO + 50 CONTINUE + 60 CONTINUE + ELSE + DO 80, J = 1, N + DO 70, I = J, N + C( I, J ) = BETA*C( I, J ) + 70 CONTINUE + 80 CONTINUE + END IF + END IF + RETURN + END IF +* +* Start the operations. +* + IF( LSAME( TRANS, 'N' ) )THEN +* +* Form C := alpha*A*B' + alpha*B*A' + C. +* + IF( UPPER )THEN + DO 130, J = 1, N + IF( BETA.EQ.ZERO )THEN + DO 90, I = 1, J + C( I, J ) = ZERO + 90 CONTINUE + ELSE IF( BETA.NE.ONE )THEN + DO 100, I = 1, J + C( I, J ) = BETA*C( I, J ) + 100 CONTINUE + END IF + DO 120, L = 1, K + IF( ( A( J, L ).NE.ZERO ).OR. + $ ( B( J, L ).NE.ZERO ) )THEN + TEMP1 = ALPHA*B( J, L ) + TEMP2 = ALPHA*A( J, L ) + DO 110, I = 1, J + C( I, J ) = C( I, J ) + A( I, L )*TEMP1 + + $ B( I, L )*TEMP2 + 110 CONTINUE + END IF + 120 CONTINUE + 130 CONTINUE + ELSE + DO 180, J = 1, N + IF( BETA.EQ.ZERO )THEN + DO 140, I = J, N + C( I, J ) = ZERO + 140 CONTINUE + ELSE IF( BETA.NE.ONE )THEN + DO 150, I = J, N + C( I, J ) = BETA*C( I, J ) + 150 CONTINUE + END IF + DO 170, L = 1, K + IF( ( A( J, L ).NE.ZERO ).OR. + $ ( B( J, L ).NE.ZERO ) )THEN + TEMP1 = ALPHA*B( J, L ) + TEMP2 = ALPHA*A( J, L ) + DO 160, I = J, N + C( I, J ) = C( I, J ) + A( I, L )*TEMP1 + + $ B( I, L )*TEMP2 + 160 CONTINUE + END IF + 170 CONTINUE + 180 CONTINUE + END IF + ELSE +* +* Form C := alpha*A'*B + alpha*B'*A + C. +* + IF( UPPER )THEN + DO 210, J = 1, N + DO 200, I = 1, J + TEMP1 = ZERO + TEMP2 = ZERO + DO 190, L = 1, K + TEMP1 = TEMP1 + A( L, I )*B( L, J ) + TEMP2 = TEMP2 + B( L, I )*A( L, J ) + 190 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = ALPHA*TEMP1 + ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ ALPHA*TEMP1 + ALPHA*TEMP2 + END IF + 200 CONTINUE + 210 CONTINUE + ELSE + DO 240, J = 1, N + DO 230, I = J, N + TEMP1 = ZERO + TEMP2 = ZERO + DO 220, L = 1, K + TEMP1 = TEMP1 + A( L, I )*B( L, J ) + TEMP2 = TEMP2 + B( L, I )*A( L, J ) + 220 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = ALPHA*TEMP1 + ALPHA*TEMP2 + ELSE + C( I, J ) = BETA *C( I, J ) + + $ ALPHA*TEMP1 + ALPHA*TEMP2 + END IF + 230 CONTINUE + 240 CONTINUE + END IF + END IF +* + RETURN +* +* End of ZSYR2K. +* + END diff --git a/reference/zsyrf.f b/reference/zsyrf.f new file mode 100644 index 0000000000..4262ed9be1 --- /dev/null +++ b/reference/zsyrf.f @@ -0,0 +1,198 @@ + SUBROUTINE ZSYRF( UPLO, N, ALPHA, X, INCX, A, LDA ) +* +* -- LAPACK auxiliary routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* November 2006 +* +* .. Scalar Arguments .. + CHARACTER UPLO + INTEGER INCX, LDA, N + COMPLEX*16 ALPHA +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), X( * ) +* .. +* +* Purpose +* ======= +* +* ZSYR performs the symmetric rank 1 operation +* +* A := alpha*x*( x' ) + A, +* +* where alpha is a complex scalar, x is an n element vector and A is an +* n by n symmetric matrix. +* +* Arguments +* ========== +* +* UPLO (input) CHARACTER*1 +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array A is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of A +* is to be referenced. 
+* +* UPLO = 'L' or 'l' Only the lower triangular part of A +* is to be referenced. +* +* Unchanged on exit. +* +* N (input) INTEGER +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* ALPHA (input) COMPLEX*16 +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* X (input) COMPLEX*16 array, dimension at least +* ( 1 + ( N - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the N- +* element vector x. +* Unchanged on exit. +* +* INCX (input) INTEGER +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* A (input/output) COMPLEX*16 array, dimension ( LDA, N ) +* Before entry, with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array A must contain the upper +* triangular part of the symmetric matrix and the strictly +* lower triangular part of A is not referenced. On exit, the +* upper triangular part of the array A is overwritten by the +* upper triangular part of the updated matrix. +* Before entry, with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array A must contain the lower +* triangular part of the symmetric matrix and the strictly +* upper triangular part of A is not referenced. On exit, the +* lower triangular part of the array A is overwritten by the +* lower triangular part of the updated matrix. +* +* LDA (input) INTEGER +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, N ). +* Unchanged on exit. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + INTEGER I, INFO, IX, J, JX, KX + COMPLEX*16 TEMP +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = 1 + ELSE IF( N.LT.0 ) THEN + INFO = 2 + ELSE IF( INCX.EQ.0 ) THEN + INFO = 5 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = 7 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZSYR ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ) .OR. ( ALPHA.EQ.ZERO ) ) + $ RETURN +* +* Set the start point in X if the increment is not unity. +* + IF( INCX.LE.0 ) THEN + KX = 1 - ( N-1 )*INCX + ELSE IF( INCX.NE.1 ) THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through the triangular part +* of A. +* + IF( LSAME( UPLO, 'U' ) ) THEN +* +* Form A when A is stored in upper triangle. +* + IF( INCX.EQ.1 ) THEN + DO 20 J = 1, N + IF( X( J ).NE.ZERO ) THEN + TEMP = ALPHA*X( J ) + DO 10 I = 1, J + A( I, J ) = A( I, J ) + X( I )*TEMP + 10 CONTINUE + END IF + 20 CONTINUE + ELSE + JX = KX + DO 40 J = 1, N + IF( X( JX ).NE.ZERO ) THEN + TEMP = ALPHA*X( JX ) + IX = KX + DO 30 I = 1, J + A( I, J ) = A( I, J ) + X( IX )*TEMP + IX = IX + INCX + 30 CONTINUE + END IF + JX = JX + INCX + 40 CONTINUE + END IF + ELSE +* +* Form A when A is stored in lower triangle. 
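+*
+*     Editorial note (descriptive comment, not part of the original
+*     file): for each column J the loops below update rows J .. N of
+*     column J of A with X( I )*TEMP, where TEMP = ALPHA*X( J ).  As
+*     in the upper-triangle case, no conjugation is applied, which is
+*     the symmetric (ZSYR) rather than the Hermitian (ZHER) update.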
+* + IF( INCX.EQ.1 ) THEN + DO 60 J = 1, N + IF( X( J ).NE.ZERO ) THEN + TEMP = ALPHA*X( J ) + DO 50 I = J, N + A( I, J ) = A( I, J ) + X( I )*TEMP + 50 CONTINUE + END IF + 60 CONTINUE + ELSE + JX = KX + DO 80 J = 1, N + IF( X( JX ).NE.ZERO ) THEN + TEMP = ALPHA*X( JX ) + IX = JX + DO 70 I = J, N + A( I, J ) = A( I, J ) + X( IX )*TEMP + IX = IX + INCX + 70 CONTINUE + END IF + JX = JX + INCX + 80 CONTINUE + END IF + END IF +* + RETURN +* +* End of ZSYR +* + END diff --git a/reference/zsyrkf.f b/reference/zsyrkf.f new file mode 100644 index 0000000000..99bfa82504 --- /dev/null +++ b/reference/zsyrkf.f @@ -0,0 +1,293 @@ + SUBROUTINE ZSYRKF ( UPLO, TRANS, N, K, ALPHA, A, LDA, + $ BETA, C, LDC ) +* .. Scalar Arguments .. + CHARACTER*1 UPLO, TRANS + INTEGER N, K, LDA, LDC + COMPLEX*16 ALPHA, BETA +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), C( LDC, * ) +* .. +* +* Purpose +* ======= +* +* ZSYRK performs one of the symmetric rank k operations +* +* C := alpha*A*A' + beta*C, +* +* or +* +* C := alpha*A'*A + beta*C, +* +* where alpha and beta are scalars, C is an n by n symmetric matrix +* and A is an n by k matrix in the first case and a k by n matrix +* in the second case. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the upper or lower +* triangular part of the array C is to be referenced as +* follows: +* +* UPLO = 'U' or 'u' Only the upper triangular part of C +* is to be referenced. +* +* UPLO = 'L' or 'l' Only the lower triangular part of C +* is to be referenced. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' C := alpha*A*A' + beta*C. +* +* TRANS = 'T' or 't' C := alpha*A'*A + beta*C. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix C. N must be +* at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry with TRANS = 'N' or 'n', K specifies the number +* of columns of the matrix A, and on entry with +* TRANS = 'T' or 't', K specifies the number of rows of the +* matrix A. K must be at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX*16 . +* On entry, ALPHA specifies the scalar alpha. +* Unchanged on exit. +* +* A - COMPLEX*16 array of DIMENSION ( LDA, ka ), where ka is +* k when TRANS = 'N' or 'n', and is n otherwise. +* Before entry with TRANS = 'N' or 'n', the leading n by k +* part of the array A must contain the matrix A, otherwise +* the leading k by n part of the array A must contain the +* matrix A. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When TRANS = 'N' or 'n' +* then LDA must be at least max( 1, n ), otherwise LDA must +* be at least max( 1, k ). +* Unchanged on exit. +* +* BETA - COMPLEX*16 . +* On entry, BETA specifies the scalar beta. +* Unchanged on exit. +* +* C - COMPLEX*16 array of DIMENSION ( LDC, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array C must contain the upper +* triangular part of the symmetric matrix and the strictly +* lower triangular part of C is not referenced. On exit, the +* upper triangular part of the array C is overwritten by the +* upper triangular part of the updated matrix. 
+* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array C must contain the lower +* triangular part of the symmetric matrix and the strictly +* upper triangular part of C is not referenced. On exit, the +* lower triangular part of the array C is overwritten by the +* lower triangular part of the updated matrix. +* +* LDC - INTEGER. +* On entry, LDC specifies the first dimension of C as declared +* in the calling (sub) program. LDC must be at least +* max( 1, n ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Local Scalars .. + LOGICAL UPPER + INTEGER I, INFO, J, L, NROWA + COMPLEX*16 TEMP +* .. Parameters .. + COMPLEX*16 ONE + PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + IF( LSAME( TRANS, 'N' ) )THEN + NROWA = N + ELSE + NROWA = K + END IF + UPPER = LSAME( UPLO, 'U' ) +* + INFO = 0 + IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO , 'L' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.LSAME( TRANS, 'N' ) ).AND. + $ ( .NOT.LSAME( TRANS, 'T' ) ) )THEN + INFO = 2 + ELSE IF( N .LT.0 )THEN + INFO = 3 + ELSE IF( K .LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 7 + ELSE IF( LDC.LT.MAX( 1, N ) )THEN + INFO = 10 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'ZSYRK ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( ( N.EQ.0 ).OR. + $ ( ( ( ALPHA.EQ.ZERO ).OR.( K.EQ.0 ) ).AND.( BETA.EQ.ONE ) ) ) + $ RETURN +* +* And when alpha.eq.zero. +* + IF( ALPHA.EQ.ZERO )THEN + IF( UPPER )THEN + IF( BETA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, J + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + ELSE + DO 40, J = 1, N + DO 30, I = 1, J + C( I, J ) = BETA*C( I, J ) + 30 CONTINUE + 40 CONTINUE + END IF + ELSE + IF( BETA.EQ.ZERO )THEN + DO 60, J = 1, N + DO 50, I = J, N + C( I, J ) = ZERO + 50 CONTINUE + 60 CONTINUE + ELSE + DO 80, J = 1, N + DO 70, I = J, N + C( I, J ) = BETA*C( I, J ) + 70 CONTINUE + 80 CONTINUE + END IF + END IF + RETURN + END IF +* +* Start the operations. +* + IF( LSAME( TRANS, 'N' ) )THEN +* +* Form C := alpha*A*A' + beta*C. +* + IF( UPPER )THEN + DO 130, J = 1, N + IF( BETA.EQ.ZERO )THEN + DO 90, I = 1, J + C( I, J ) = ZERO + 90 CONTINUE + ELSE IF( BETA.NE.ONE )THEN + DO 100, I = 1, J + C( I, J ) = BETA*C( I, J ) + 100 CONTINUE + END IF + DO 120, L = 1, K + IF( A( J, L ).NE.ZERO )THEN + TEMP = ALPHA*A( J, L ) + DO 110, I = 1, J + C( I, J ) = C( I, J ) + TEMP*A( I, L ) + 110 CONTINUE + END IF + 120 CONTINUE + 130 CONTINUE + ELSE + DO 180, J = 1, N + IF( BETA.EQ.ZERO )THEN + DO 140, I = J, N + C( I, J ) = ZERO + 140 CONTINUE + ELSE IF( BETA.NE.ONE )THEN + DO 150, I = J, N + C( I, J ) = BETA*C( I, J ) + 150 CONTINUE + END IF + DO 170, L = 1, K + IF( A( J, L ).NE.ZERO )THEN + TEMP = ALPHA*A( J, L ) + DO 160, I = J, N + C( I, J ) = C( I, J ) + TEMP*A( I, L ) + 160 CONTINUE + END IF + 170 CONTINUE + 180 CONTINUE + END IF + ELSE +* +* Form C := alpha*A'*A + beta*C. 
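+*
+*     Editorial note (descriptive comment, not part of the original
+*     file): in this branch each referenced element C( I, J ) is
+*     formed from the inner product of columns I and J of A,
+*     TEMP = sum over L of A( L, I )*A( L, J ), using the plain
+*     transpose (no conjugation), then scaled by ALPHA and combined
+*     with BETA*C( I, J ) when BETA is nonzero.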
+* + IF( UPPER )THEN + DO 210, J = 1, N + DO 200, I = 1, J + TEMP = ZERO + DO 190, L = 1, K + TEMP = TEMP + A( L, I )*A( L, J ) + 190 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = ALPHA*TEMP + ELSE + C( I, J ) = ALPHA*TEMP + BETA*C( I, J ) + END IF + 200 CONTINUE + 210 CONTINUE + ELSE + DO 240, J = 1, N + DO 230, I = J, N + TEMP = ZERO + DO 220, L = 1, K + TEMP = TEMP + A( L, I )*A( L, J ) + 220 CONTINUE + IF( BETA.EQ.ZERO )THEN + C( I, J ) = ALPHA*TEMP + ELSE + C( I, J ) = ALPHA*TEMP + BETA*C( I, J ) + END IF + 230 CONTINUE + 240 CONTINUE + END IF + END IF +* + RETURN +* +* End of ZSYRK . +* + END diff --git a/reference/ztbmvf.f b/reference/ztbmvf.f new file mode 100644 index 0000000000..8df5609ad8 --- /dev/null +++ b/reference/ztbmvf.f @@ -0,0 +1,378 @@ + SUBROUTINE ZTBMVF( UPLO, TRANS, DIAG, N, K, A, LDA, X, INCX ) +* .. Scalar Arguments .. + INTEGER INCX, K, LDA, N + CHARACTER*1 DIAG, TRANS, UPLO +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), X( * ) +* .. +* +* Purpose +* ======= +* +* ZTBMV performs one of the matrix-vector operations +* +* x := A*x, or x := A'*x, or x := conjg( A' )*x, +* +* where x is an n element vector and A is an n by n unit, or non-unit, +* upper or lower triangular band matrix, with ( k + 1 ) diagonals. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' x := A*x. +* +* TRANS = 'T' or 't' x := A'*x. +* +* TRANS = 'C' or 'c' x := conjg( A' )*x. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit +* triangular as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry with UPLO = 'U' or 'u', K specifies the number of +* super-diagonals of the matrix A. +* On entry with UPLO = 'L' or 'l', K specifies the number of +* sub-diagonals of the matrix A. +* K must satisfy 0 .le. K. +* Unchanged on exit. +* +* A - COMPLEX*16 array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) +* by n part of the array A must contain the upper triangular +* band part of the matrix of coefficients, supplied column by +* column, with the leading diagonal of the matrix in row +* ( k + 1 ) of the array, the first super-diagonal starting at +* position 2 in row k, and so on. The top left k by k triangle +* of the array A is not referenced. +* The following program segment will transfer an upper +* triangular band matrix from conventional full matrix storage +* to band storage: +* +* DO 20, J = 1, N +* M = K + 1 - J +* DO 10, I = MAX( 1, J - K ), J +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) +* by n part of the array A must contain the lower triangular +* band part of the matrix of coefficients, supplied column by +* column, with the leading diagonal of the matrix in row 1 of +* the array, the first sub-diagonal starting at position 1 in +* row 2, and so on. 
The bottom right k by k triangle of the +* array A is not referenced. +* The following program segment will transfer a lower +* triangular band matrix from conventional full matrix storage +* to band storage: +* +* DO 20, J = 1, N +* M = 1 - J +* DO 10, I = J, MIN( N, J + K ) +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Note that when DIAG = 'U' or 'u' the elements of the array A +* corresponding to the diagonal elements of the matrix are not +* referenced, but are assumed to be unity. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* ( k + 1 ). +* Unchanged on exit. +* +* X - COMPLEX*16 array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. On exit, X is overwritten with the +* tranformed vector x. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. Local Scalars .. + COMPLEX*16 TEMP + INTEGER I, INFO, IX, J, JX, KPLUS1, KX, L + LOGICAL NOCONJ, NOUNIT +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC DCONJG, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO , 'U' ).AND. + $ .NOT.LSAME( UPLO , 'L' ) )THEN + INFO = 1 + ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. + $ .NOT.LSAME( TRANS, 'R' ).AND. + $ .NOT.LSAME( TRANS, 'C' ) )THEN + INFO = 2 + ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. + $ .NOT.LSAME( DIAG , 'N' ) )THEN + INFO = 3 + ELSE IF( N.LT.0 )THEN + INFO = 4 + ELSE IF( K.LT.0 )THEN + INFO = 5 + ELSE IF( LDA.LT.( K + 1 ) )THEN + INFO = 7 + ELSE IF( INCX.EQ.0 )THEN + INFO = 9 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'ZTBMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* + NOCONJ = LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) + NOUNIT = LSAME( DIAG , 'N' ) +* +* Set up the start point in X if the increment is not unity. This +* will be ( N - 1 )*INCX too small for descending loops. +* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through A. +* + IF( LSAME( TRANS, 'N' ).OR.LSAME( TRANS, 'R' ) )THEN +* +* Form x := A*x. 
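+*
+*     Editorial note (descriptive comment, not part of the original
+*     file): with upper band storage the matrix element a( I, J ) is
+*     held in A( KPLUS1 - J + I, J ) for max( 1, J - K ) .le. I .le. J,
+*     which is what the offset L = KPLUS1 - J below expresses; the
+*     diagonal element a( J, J ) sits in A( KPLUS1, J ).  The lower
+*     band storage branch uses L = 1 - J and keeps the diagonal in
+*     A( 1, J ).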
+* + IF( LSAME( UPLO, 'U' ) )THEN + KPLUS1 = K + 1 + IF( INCX.EQ.1 )THEN + DO 20, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = X( J ) + L = KPLUS1 - J + DO 10, I = MAX( 1, J - K ), J - 1 + X( I ) = X( I ) + TEMP*A( L + I, J ) + 10 CONTINUE + IF( NOUNIT ) + $ X( J ) = X( J )*A( KPLUS1, J ) + END IF + 20 CONTINUE + ELSE + JX = KX + DO 40, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = X( JX ) + IX = KX + L = KPLUS1 - J + DO 30, I = MAX( 1, J - K ), J - 1 + X( IX ) = X( IX ) + TEMP*A( L + I, J ) + IX = IX + INCX + 30 CONTINUE + IF( NOUNIT ) + $ X( JX ) = X( JX )*A( KPLUS1, J ) + END IF + JX = JX + INCX + IF( J.GT.K ) + $ KX = KX + INCX + 40 CONTINUE + END IF + ELSE + IF( INCX.EQ.1 )THEN + DO 60, J = N, 1, -1 + IF( X( J ).NE.ZERO )THEN + TEMP = X( J ) + L = 1 - J + DO 50, I = MIN( N, J + K ), J + 1, -1 + X( I ) = X( I ) + TEMP*A( L + I, J ) + 50 CONTINUE + IF( NOUNIT ) + $ X( J ) = X( J )*A( 1, J ) + END IF + 60 CONTINUE + ELSE + KX = KX + ( N - 1 )*INCX + JX = KX + DO 80, J = N, 1, -1 + IF( X( JX ).NE.ZERO )THEN + TEMP = X( JX ) + IX = KX + L = 1 - J + DO 70, I = MIN( N, J + K ), J + 1, -1 + X( IX ) = X( IX ) + TEMP*A( L + I, J ) + IX = IX - INCX + 70 CONTINUE + IF( NOUNIT ) + $ X( JX ) = X( JX )*A( 1, J ) + END IF + JX = JX - INCX + IF( ( N - J ).GE.K ) + $ KX = KX - INCX + 80 CONTINUE + END IF + END IF + ELSE +* +* Form x := A'*x or x := conjg( A' )*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + KPLUS1 = K + 1 + IF( INCX.EQ.1 )THEN + DO 110, J = N, 1, -1 + TEMP = X( J ) + L = KPLUS1 - J + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*A( KPLUS1, J ) + DO 90, I = J - 1, MAX( 1, J - K ), -1 + TEMP = TEMP + A( L + I, J )*X( I ) + 90 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*DCONJG( A( KPLUS1, J ) ) + DO 100, I = J - 1, MAX( 1, J - K ), -1 + TEMP = TEMP + DCONJG( A( L + I, J ) )*X( I ) + 100 CONTINUE + END IF + X( J ) = TEMP + 110 CONTINUE + ELSE + KX = KX + ( N - 1 )*INCX + JX = KX + DO 140, J = N, 1, -1 + TEMP = X( JX ) + KX = KX - INCX + IX = KX + L = KPLUS1 - J + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*A( KPLUS1, J ) + DO 120, I = J - 1, MAX( 1, J - K ), -1 + TEMP = TEMP + A( L + I, J )*X( IX ) + IX = IX - INCX + 120 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*DCONJG( A( KPLUS1, J ) ) + DO 130, I = J - 1, MAX( 1, J - K ), -1 + TEMP = TEMP + DCONJG( A( L + I, J ) )*X( IX ) + IX = IX - INCX + 130 CONTINUE + END IF + X( JX ) = TEMP + JX = JX - INCX + 140 CONTINUE + END IF + ELSE + IF( INCX.EQ.1 )THEN + DO 170, J = 1, N + TEMP = X( J ) + L = 1 - J + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*A( 1, J ) + DO 150, I = J + 1, MIN( N, J + K ) + TEMP = TEMP + A( L + I, J )*X( I ) + 150 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*DCONJG( A( 1, J ) ) + DO 160, I = J + 1, MIN( N, J + K ) + TEMP = TEMP + DCONJG( A( L + I, J ) )*X( I ) + 160 CONTINUE + END IF + X( J ) = TEMP + 170 CONTINUE + ELSE + JX = KX + DO 200, J = 1, N + TEMP = X( JX ) + KX = KX + INCX + IX = KX + L = 1 - J + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*A( 1, J ) + DO 180, I = J + 1, MIN( N, J + K ) + TEMP = TEMP + A( L + I, J )*X( IX ) + IX = IX + INCX + 180 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*DCONJG( A( 1, J ) ) + DO 190, I = J + 1, MIN( N, J + K ) + TEMP = TEMP + DCONJG( A( L + I, J ) )*X( IX ) + IX = IX + INCX + 190 CONTINUE + END IF + X( JX ) = TEMP + JX = JX + INCX + 200 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of ZTBMV . 
+* + END diff --git a/reference/ztbsvf.f b/reference/ztbsvf.f new file mode 100644 index 0000000000..78c37e3d43 --- /dev/null +++ b/reference/ztbsvf.f @@ -0,0 +1,367 @@ + SUBROUTINE ZTBSVF(UPLO,TRANS,DIAG,N,K,A,LDA,X,INCX) +* .. Scalar Arguments .. + INTEGER INCX,K,LDA,N + CHARACTER DIAG,TRANS,UPLO +* .. +* .. Array Arguments .. + DOUBLE COMPLEX A(LDA,*),X(*) +* .. +* +* Purpose +* ======= +* +* ZTBSV solves one of the systems of equations +* +* A*x = b, or A'*x = b, or conjg( A' )*x = b, +* +* where b and x are n element vectors and A is an n by n unit, or +* non-unit, upper or lower triangular band matrix, with ( k + 1 ) +* diagonals. +* +* No test for singularity or near-singularity is included in this +* routine. Such tests must be performed before calling this routine. +* +* Arguments +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the equations to be solved as +* follows: +* +* TRANS = 'N' or 'n' A*x = b. +* +* TRANS = 'T' or 't' A'*x = b. +* +* TRANS = 'C' or 'c' conjg( A' )*x = b. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit +* triangular as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* K - INTEGER. +* On entry with UPLO = 'U' or 'u', K specifies the number of +* super-diagonals of the matrix A. +* On entry with UPLO = 'L' or 'l', K specifies the number of +* sub-diagonals of the matrix A. +* K must satisfy 0 .le. K. +* Unchanged on exit. +* +* A - COMPLEX*16 array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading ( k + 1 ) +* by n part of the array A must contain the upper triangular +* band part of the matrix of coefficients, supplied column by +* column, with the leading diagonal of the matrix in row +* ( k + 1 ) of the array, the first super-diagonal starting at +* position 2 in row k, and so on. The top left k by k triangle +* of the array A is not referenced. +* The following program segment will transfer an upper +* triangular band matrix from conventional full matrix storage +* to band storage: +* +* DO 20, J = 1, N +* M = K + 1 - J +* DO 10, I = MAX( 1, J - K ), J +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Before entry with UPLO = 'L' or 'l', the leading ( k + 1 ) +* by n part of the array A must contain the lower triangular +* band part of the matrix of coefficients, supplied column by +* column, with the leading diagonal of the matrix in row 1 of +* the array, the first sub-diagonal starting at position 1 in +* row 2, and so on. The bottom right k by k triangle of the +* array A is not referenced. +* The following program segment will transfer a lower +* triangular band matrix from conventional full matrix storage +* to band storage: +* +* DO 20, J = 1, N +* M = 1 - J +* DO 10, I = J, MIN( N, J + K ) +* A( M + I, J ) = matrix( I, J ) +* 10 CONTINUE +* 20 CONTINUE +* +* Note that when DIAG = 'U' or 'u' the elements of the array A +* corresponding to the diagonal elements of the matrix are not +* referenced, but are assumed to be unity. 
+* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* ( k + 1 ). +* Unchanged on exit. +* +* X - COMPLEX*16 array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element right-hand side vector b. On exit, X is overwritten +* with the solution vector x. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + DOUBLE COMPLEX ZERO + PARAMETER (ZERO= (0.0D+0,0.0D+0)) +* .. +* .. Local Scalars .. + DOUBLE COMPLEX TEMP + INTEGER I,INFO,IX,J,JX,KPLUS1,KX,L + LOGICAL NOCONJ,NOUNIT +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC DCONJG,MAX,MIN +* .. +* +* Test the input parameters. +* + INFO = 0 + IF (.NOT.LSAME(UPLO,'U') .AND. .NOT.LSAME(UPLO,'L')) THEN + INFO = 1 + ELSE IF (.NOT.LSAME(TRANS,'N') .AND. .NOT.LSAME(TRANS,'T') .AND. + + .NOT.LSAME(TRANS,'C')) THEN + INFO = 2 + ELSE IF (.NOT.LSAME(DIAG,'U') .AND. .NOT.LSAME(DIAG,'N')) THEN + INFO = 3 + ELSE IF (N.LT.0) THEN + INFO = 4 + ELSE IF (K.LT.0) THEN + INFO = 5 + ELSE IF (LDA.LT. (K+1)) THEN + INFO = 7 + ELSE IF (INCX.EQ.0) THEN + INFO = 9 + END IF + IF (INFO.NE.0) THEN + CALL XERBLA('ZTBSV ',INFO) + RETURN + END IF +* +* Quick return if possible. +* + IF (N.EQ.0) RETURN +* + NOCONJ = LSAME(TRANS,'T') + NOUNIT = LSAME(DIAG,'N') +* +* Set up the start point in X if the increment is not unity. This +* will be ( N - 1 )*INCX too small for descending loops. +* + IF (INCX.LE.0) THEN + KX = 1 - (N-1)*INCX + ELSE IF (INCX.NE.1) THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of A are +* accessed by sequentially with one pass through A. +* + IF (LSAME(TRANS,'N')) THEN +* +* Form x := inv( A )*x. +* + IF (LSAME(UPLO,'U')) THEN + KPLUS1 = K + 1 + IF (INCX.EQ.1) THEN + DO 20 J = N,1,-1 + IF (X(J).NE.ZERO) THEN + L = KPLUS1 - J + IF (NOUNIT) X(J) = X(J)/A(KPLUS1,J) + TEMP = X(J) + DO 10 I = J - 1,MAX(1,J-K),-1 + X(I) = X(I) - TEMP*A(L+I,J) + 10 CONTINUE + END IF + 20 CONTINUE + ELSE + KX = KX + (N-1)*INCX + JX = KX + DO 40 J = N,1,-1 + KX = KX - INCX + IF (X(JX).NE.ZERO) THEN + IX = KX + L = KPLUS1 - J + IF (NOUNIT) X(JX) = X(JX)/A(KPLUS1,J) + TEMP = X(JX) + DO 30 I = J - 1,MAX(1,J-K),-1 + X(IX) = X(IX) - TEMP*A(L+I,J) + IX = IX - INCX + 30 CONTINUE + END IF + JX = JX - INCX + 40 CONTINUE + END IF + ELSE + IF (INCX.EQ.1) THEN + DO 60 J = 1,N + IF (X(J).NE.ZERO) THEN + L = 1 - J + IF (NOUNIT) X(J) = X(J)/A(1,J) + TEMP = X(J) + DO 50 I = J + 1,MIN(N,J+K) + X(I) = X(I) - TEMP*A(L+I,J) + 50 CONTINUE + END IF + 60 CONTINUE + ELSE + JX = KX + DO 80 J = 1,N + KX = KX + INCX + IF (X(JX).NE.ZERO) THEN + IX = KX + L = 1 - J + IF (NOUNIT) X(JX) = X(JX)/A(1,J) + TEMP = X(JX) + DO 70 I = J + 1,MIN(N,J+K) + X(IX) = X(IX) - TEMP*A(L+I,J) + IX = IX + INCX + 70 CONTINUE + END IF + JX = JX + INCX + 80 CONTINUE + END IF + END IF + ELSE +* +* Form x := inv( A' )*x or x := inv( conjg( A') )*x. 
+* + IF (LSAME(UPLO,'U')) THEN + KPLUS1 = K + 1 + IF (INCX.EQ.1) THEN + DO 110 J = 1,N + TEMP = X(J) + L = KPLUS1 - J + IF (NOCONJ) THEN + DO 90 I = MAX(1,J-K),J - 1 + TEMP = TEMP - A(L+I,J)*X(I) + 90 CONTINUE + IF (NOUNIT) TEMP = TEMP/A(KPLUS1,J) + ELSE + DO 100 I = MAX(1,J-K),J - 1 + TEMP = TEMP - DCONJG(A(L+I,J))*X(I) + 100 CONTINUE + IF (NOUNIT) TEMP = TEMP/DCONJG(A(KPLUS1,J)) + END IF + X(J) = TEMP + 110 CONTINUE + ELSE + JX = KX + DO 140 J = 1,N + TEMP = X(JX) + IX = KX + L = KPLUS1 - J + IF (NOCONJ) THEN + DO 120 I = MAX(1,J-K),J - 1 + TEMP = TEMP - A(L+I,J)*X(IX) + IX = IX + INCX + 120 CONTINUE + IF (NOUNIT) TEMP = TEMP/A(KPLUS1,J) + ELSE + DO 130 I = MAX(1,J-K),J - 1 + TEMP = TEMP - DCONJG(A(L+I,J))*X(IX) + IX = IX + INCX + 130 CONTINUE + IF (NOUNIT) TEMP = TEMP/DCONJG(A(KPLUS1,J)) + END IF + X(JX) = TEMP + JX = JX + INCX + IF (J.GT.K) KX = KX + INCX + 140 CONTINUE + END IF + ELSE + IF (INCX.EQ.1) THEN + DO 170 J = N,1,-1 + TEMP = X(J) + L = 1 - J + IF (NOCONJ) THEN + DO 150 I = MIN(N,J+K),J + 1,-1 + TEMP = TEMP - A(L+I,J)*X(I) + 150 CONTINUE + IF (NOUNIT) TEMP = TEMP/A(1,J) + ELSE + DO 160 I = MIN(N,J+K),J + 1,-1 + TEMP = TEMP - DCONJG(A(L+I,J))*X(I) + 160 CONTINUE + IF (NOUNIT) TEMP = TEMP/DCONJG(A(1,J)) + END IF + X(J) = TEMP + 170 CONTINUE + ELSE + KX = KX + (N-1)*INCX + JX = KX + DO 200 J = N,1,-1 + TEMP = X(JX) + IX = KX + L = 1 - J + IF (NOCONJ) THEN + DO 180 I = MIN(N,J+K),J + 1,-1 + TEMP = TEMP - A(L+I,J)*X(IX) + IX = IX - INCX + 180 CONTINUE + IF (NOUNIT) TEMP = TEMP/A(1,J) + ELSE + DO 190 I = MIN(N,J+K),J + 1,-1 + TEMP = TEMP - DCONJG(A(L+I,J))*X(IX) + IX = IX - INCX + 190 CONTINUE + IF (NOUNIT) TEMP = TEMP/DCONJG(A(1,J)) + END IF + X(JX) = TEMP + JX = JX - INCX + IF ((N-J).GE.K) KX = KX - INCX + 200 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of ZTBSV . +* + END diff --git a/reference/ztpmvf.f b/reference/ztpmvf.f new file mode 100644 index 0000000000..d050272169 --- /dev/null +++ b/reference/ztpmvf.f @@ -0,0 +1,377 @@ + SUBROUTINE ZTPMVF( UPLO, TRANS, DIAG, N, AP, X, INCX ) +* .. Scalar Arguments .. + INTEGER INCX, N + CHARACTER*1 DIAG, TRANS, UPLO +* .. Array Arguments .. + COMPLEX*16 AP( * ), X( * ) +* .. +* +* Purpose +* ======= +* +* ZTPMV performs one of the matrix-vector operations +* +* x := A*x, or x := A'*x, or x := conjg( A' )*x, +* +* where x is an n element vector and A is an n by n unit, or non-unit, +* upper or lower triangular matrix, supplied in packed form. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' x := A*x. +* +* TRANS = 'T' or 't' x := A'*x. +* +* TRANS = 'C' or 'c' x := conjg( A' )*x. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit +* triangular as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* AP - COMPLEX*16 array of DIMENSION at least +* ( ( n*( n + 1 ) )/2 ). 
+* Before entry with UPLO = 'U' or 'u', the array AP must +* contain the upper triangular matrix packed sequentially, +* column by column, so that AP( 1 ) contains a( 1, 1 ), +* AP( 2 ) and AP( 3 ) contain a( 1, 2 ) and a( 2, 2 ) +* respectively, and so on. +* Before entry with UPLO = 'L' or 'l', the array AP must +* contain the lower triangular matrix packed sequentially, +* column by column, so that AP( 1 ) contains a( 1, 1 ), +* AP( 2 ) and AP( 3 ) contain a( 2, 1 ) and a( 3, 1 ) +* respectively, and so on. +* Note that when DIAG = 'U' or 'u', the diagonal elements of +* A are not referenced, but are assumed to be unity. +* Unchanged on exit. +* +* X - COMPLEX*16 array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. On exit, X is overwritten with the +* tranformed vector x. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. Local Scalars .. + COMPLEX*16 TEMP + INTEGER I, INFO, IX, J, JX, K, KK, KX + LOGICAL NOCONJ, NOUNIT +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC DCONJG +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO , 'U' ).AND. + $ .NOT.LSAME( UPLO , 'L' ) )THEN + INFO = 1 + ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. + $ .NOT.LSAME( TRANS, 'R' ).AND. + $ .NOT.LSAME( TRANS, 'C' ) )THEN + INFO = 2 + ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. + $ .NOT.LSAME( DIAG , 'N' ) )THEN + INFO = 3 + ELSE IF( N.LT.0 )THEN + INFO = 4 + ELSE IF( INCX.EQ.0 )THEN + INFO = 7 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'ZTPMVF', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* + NOCONJ = LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) + NOUNIT = LSAME( DIAG , 'N' ) +* +* Set up the start point in X if the increment is not unity. This +* will be ( N - 1 )*INCX too small for descending loops. +* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of AP are +* accessed sequentially with one pass through AP. +* + IF( LSAME( TRANS, 'N' ).OR.LSAME( TRANS, 'R' ))THEN +* +* Form x:= A*x. 
+* + IF( LSAME( UPLO, 'U' ) )THEN + KK = 1 + IF( INCX.EQ.1 )THEN + DO 20, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = X( J ) + K = KK + DO 10, I = 1, J - 1 + IF( NOCONJ )THEN + X( I ) = X( I ) + TEMP*AP( K ) + ELSE + X( I ) = X( I ) + TEMP*DCONJG(AP( K )) + END IF + K = K + 1 + 10 CONTINUE + IF( NOCONJ )THEN + IF( NOUNIT ) + $ X( J ) = X( J )*AP( KK + J - 1 ) + ELSE + IF( NOUNIT ) + $ X( J ) = X( J )*DCONJG(AP( KK + J-1)) + END IF + END IF + + KK = KK + J + 20 CONTINUE + ELSE + JX = KX + DO 40, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = X( JX ) + IX = KX + DO 30, K = KK, KK + J - 2 + IF( NOCONJ )THEN + X( IX ) = X( IX ) + TEMP*AP( K ) + ELSE + X( IX ) = X( IX ) + TEMP*DCONJG(AP(K)) + END IF + IX = IX + INCX + 30 CONTINUE + IF( NOCONJ )THEN + IF( NOUNIT ) + $ X( JX ) = X( JX )*AP( KK + J - 1 ) + ELSE + IF( NOUNIT ) + $ X( JX ) = X( JX )*DCONJG(AP( KK + J-1)) + END IF + END IF + JX = JX + INCX + KK = KK + J + 40 CONTINUE + END IF + ELSE + KK = ( N*( N + 1 ) )/2 + IF( INCX.EQ.1 )THEN + DO 60, J = N, 1, -1 + IF( X( J ).NE.ZERO )THEN + TEMP = X( J ) + K = KK + DO 50, I = N, J + 1, -1 + IF( NOCONJ )THEN + X( I ) = X( I ) + TEMP*AP( K ) + ELSE + X( I ) = X( I ) + TEMP*DCONJG(AP( K )) + END IF + K = K - 1 + 50 CONTINUE + IF( NOCONJ )THEN + IF( NOUNIT ) + $ X( J ) = X( J )*AP( KK - N + J ) + ELSE + IF( NOUNIT ) + $ X( J ) = X( J )*DCONJG(AP(KK - N+J)) + END IF + + END IF + KK = KK - ( N - J + 1 ) + 60 CONTINUE + ELSE + KX = KX + ( N - 1 )*INCX + JX = KX + DO 80, J = N, 1, -1 + IF( X( JX ).NE.ZERO )THEN + TEMP = X( JX ) + IX = KX + DO 70, K = KK, KK - ( N - ( J + 1 ) ), -1 + IF( NOCONJ )THEN + X( IX ) = X( IX ) + TEMP*AP( K ) + ELSE + X( IX ) = X( IX ) + TEMP*DCONJG(AP(K)) + ENDIF + IX = IX - INCX + 70 CONTINUE + IF( NOCONJ )THEN + IF( NOUNIT ) + $ X( JX ) = X( JX )*AP( KK - N + J ) + ELSE + IF( NOUNIT ) + $ X( JX ) = X( JX )*DCONJG(AP(KK-N+J)) + ENDIF + END IF + JX = JX - INCX + KK = KK - ( N - J + 1 ) + 80 CONTINUE + END IF + END IF + ELSE +* +* Form x := A'*x or x := conjg( A' )*x. 
+* + IF( LSAME( UPLO, 'U' ) )THEN + KK = ( N*( N + 1 ) )/2 + IF( INCX.EQ.1 )THEN + DO 110, J = N, 1, -1 + TEMP = X( J ) + K = KK - 1 + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*AP( KK ) + DO 90, I = J - 1, 1, -1 + TEMP = TEMP + AP( K )*X( I ) + K = K - 1 + 90 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*DCONJG( AP( KK ) ) + DO 100, I = J - 1, 1, -1 + TEMP = TEMP + DCONJG( AP( K ) )*X( I ) + K = K - 1 + 100 CONTINUE + END IF + X( J ) = TEMP + KK = KK - J + 110 CONTINUE + ELSE + JX = KX + ( N - 1 )*INCX + DO 140, J = N, 1, -1 + TEMP = X( JX ) + IX = JX + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*AP( KK ) + DO 120, K = KK - 1, KK - J + 1, -1 + IX = IX - INCX + TEMP = TEMP + AP( K )*X( IX ) + 120 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*DCONJG( AP( KK ) ) + DO 130, K = KK - 1, KK - J + 1, -1 + IX = IX - INCX + TEMP = TEMP + DCONJG( AP( K ) )*X( IX ) + 130 CONTINUE + END IF + X( JX ) = TEMP + JX = JX - INCX + KK = KK - J + 140 CONTINUE + END IF + ELSE + KK = 1 + IF( INCX.EQ.1 )THEN + DO 170, J = 1, N + TEMP = X( J ) + K = KK + 1 + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*AP( KK ) + DO 150, I = J + 1, N + TEMP = TEMP + AP( K )*X( I ) + K = K + 1 + 150 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*DCONJG( AP( KK ) ) + DO 160, I = J + 1, N + TEMP = TEMP + DCONJG( AP( K ) )*X( I ) + K = K + 1 + 160 CONTINUE + END IF + X( J ) = TEMP + KK = KK + ( N - J + 1 ) + 170 CONTINUE + ELSE + JX = KX + DO 200, J = 1, N + TEMP = X( JX ) + IX = JX + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*AP( KK ) + DO 180, K = KK + 1, KK + N - J + IX = IX + INCX + TEMP = TEMP + AP( K )*X( IX ) + 180 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*DCONJG( AP( KK ) ) + DO 190, K = KK + 1, KK + N - J + IX = IX + INCX + TEMP = TEMP + DCONJG( AP( K ) )*X( IX ) + 190 CONTINUE + END IF + X( JX ) = TEMP + JX = JX + INCX + KK = KK + ( N - J + 1 ) + 200 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of ZTPMV . +* + END diff --git a/reference/ztpsvf.f b/reference/ztpsvf.f new file mode 100644 index 0000000000..d5a981efda --- /dev/null +++ b/reference/ztpsvf.f @@ -0,0 +1,379 @@ + SUBROUTINE ZTPSVF( UPLO, TRANS, DIAG, N, AP, X, INCX ) +* .. Scalar Arguments .. + INTEGER INCX, N + CHARACTER*1 DIAG, TRANS, UPLO +* .. Array Arguments .. + COMPLEX*16 AP( * ), X( * ) +* .. +* +* Purpose +* ======= +* +* ZTPSV solves one of the systems of equations +* +* A*x = b, or A'*x = b, or conjg( A' )*x = b, +* +* where b and x are n element vectors and A is an n by n unit, or +* non-unit, upper or lower triangular matrix, supplied in packed form. +* +* No test for singularity or near-singularity is included in this +* routine. Such tests must be performed before calling this routine. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the equations to be solved as +* follows: +* +* TRANS = 'N' or 'n' A*x = b. +* +* TRANS = 'T' or 't' A'*x = b. +* +* TRANS = 'C' or 'c' conjg( A' )*x = b. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit +* triangular as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* N - INTEGER. 
+* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* AP - COMPLEX*16 array of DIMENSION at least +* ( ( n*( n + 1 ) )/2 ). +* Before entry with UPLO = 'U' or 'u', the array AP must +* contain the upper triangular matrix packed sequentially, +* column by column, so that AP( 1 ) contains a( 1, 1 ), +* AP( 2 ) and AP( 3 ) contain a( 1, 2 ) and a( 2, 2 ) +* respectively, and so on. +* Before entry with UPLO = 'L' or 'l', the array AP must +* contain the lower triangular matrix packed sequentially, +* column by column, so that AP( 1 ) contains a( 1, 1 ), +* AP( 2 ) and AP( 3 ) contain a( 2, 1 ) and a( 3, 1 ) +* respectively, and so on. +* Note that when DIAG = 'U' or 'u', the diagonal elements of +* A are not referenced, but are assumed to be unity. +* Unchanged on exit. +* +* X - COMPLEX*16 array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element right-hand side vector b. On exit, X is overwritten +* with the solution vector x. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. Local Scalars .. + COMPLEX*16 TEMP + INTEGER I, INFO, IX, J, JX, K, KK, KX + LOGICAL NOCONJ, NOUNIT +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC DCONJG +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO , 'U' ).AND. + $ .NOT.LSAME( UPLO , 'L' ) )THEN + INFO = 1 + ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. + $ .NOT.LSAME( TRANS, 'R' ).AND. + $ .NOT.LSAME( TRANS, 'C' ) )THEN + INFO = 2 + ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. + $ .NOT.LSAME( DIAG , 'N' ) )THEN + INFO = 3 + ELSE IF( N.LT.0 )THEN + INFO = 4 + ELSE IF( INCX.EQ.0 )THEN + INFO = 7 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'ZTPSV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* + NOCONJ = LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) + NOUNIT = LSAME( DIAG , 'N' ) +* +* Set up the start point in X if the increment is not unity. This +* will be ( N - 1 )*INCX too small for descending loops. +* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of AP are +* accessed sequentially with one pass through AP. +* + IF( LSAME( TRANS, 'N' ) .OR.LSAME( TRANS, 'R' ))THEN +* +* Form x := inv( A )*x. 
+* + IF( LSAME( UPLO, 'U' ) )THEN + KK = ( N*( N + 1 ) )/2 + IF( INCX.EQ.1 )THEN + DO 20, J = N, 1, -1 + IF( X( J ).NE.ZERO )THEN + IF( NOCONJ )THEN + IF( NOUNIT ) + $ X( J ) = X( J )/AP( KK ) + ELSE + IF( NOUNIT ) + $ X( J ) = X( J )/DCONJG(AP( KK )) + END IF + + TEMP = X( J ) + K = KK - 1 + DO 10, I = J - 1, 1, -1 + IF( NOCONJ )THEN + X( I ) = X( I ) - TEMP*AP( K ) + ELSE + X( I ) = X( I ) - TEMP*DCONJG(AP( K )) + END IF + K = K - 1 + 10 CONTINUE + END IF + KK = KK - J + 20 CONTINUE + ELSE + JX = KX + ( N - 1 )*INCX + DO 40, J = N, 1, -1 + IF( X( JX ).NE.ZERO )THEN + IF( NOCONJ )THEN + IF( NOUNIT ) + $ X( JX ) = X( JX )/AP( KK ) + ELSE + IF( NOUNIT ) + $ X( JX ) = X( JX )/DCONJG(AP( KK )) + END IF + TEMP = X( JX ) + IX = JX + DO 30, K = KK - 1, KK - J + 1, -1 + IX = IX - INCX + IF( NOCONJ )THEN + X( IX ) = X( IX ) - TEMP*AP( K ) + ELSE + X( IX ) = X( IX ) - TEMP*DCONJG(AP( K )) + END IF + 30 CONTINUE + END IF + JX = JX - INCX + KK = KK - J + 40 CONTINUE + END IF + ELSE + KK = 1 + IF( INCX.EQ.1 )THEN + DO 60, J = 1, N + IF( X( J ).NE.ZERO )THEN + IF( NOCONJ )THEN + IF( NOUNIT ) + $ X( J ) = X( J )/AP( KK ) + ELSE + IF( NOUNIT ) + $ X( J ) = X( J )/DCONJG(AP( KK )) + END IF + TEMP = X( J ) + K = KK + 1 + DO 50, I = J + 1, N + IF( NOCONJ )THEN + X( I ) = X( I ) - TEMP*AP( K ) + ELSE + X( I ) = X( I ) - TEMP*DCONJG(AP( K )) + END IF + K = K + 1 + 50 CONTINUE + END IF + KK = KK + ( N - J + 1 ) + 60 CONTINUE + ELSE + JX = KX + DO 80, J = 1, N + IF( X( JX ).NE.ZERO )THEN + IF( NOCONJ )THEN + IF( NOUNIT ) + $ X( JX ) = X( JX )/AP( KK ) + ELSE + IF( NOUNIT ) + $ X( JX ) = X( JX )/DCONJG(AP( KK )) + END IF + TEMP = X( JX ) + IX = JX + DO 70, K = KK + 1, KK + N - J + IX = IX + INCX + IF( NOCONJ )THEN + X( IX ) = X( IX ) - TEMP*AP( K ) + ELSE + X( IX ) = X( IX ) - TEMP*DCONJG(AP( K )) + END IF + 70 CONTINUE + END IF + JX = JX + INCX + KK = KK + ( N - J + 1 ) + 80 CONTINUE + END IF + END IF + ELSE +* +* Form x := inv( A' )*x or x := inv( conjg( A' ) )*x. 
+* + IF( LSAME( UPLO, 'U' ) )THEN + KK = 1 + IF( INCX.EQ.1 )THEN + DO 110, J = 1, N + TEMP = X( J ) + K = KK + IF( NOCONJ )THEN + DO 90, I = 1, J - 1 + TEMP = TEMP - AP( K )*X( I ) + K = K + 1 + 90 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/AP( KK + J - 1 ) + ELSE + DO 100, I = 1, J - 1 + TEMP = TEMP - DCONJG( AP( K ) )*X( I ) + K = K + 1 + 100 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/DCONJG( AP( KK + J - 1 ) ) + END IF + X( J ) = TEMP + KK = KK + J + 110 CONTINUE + ELSE + JX = KX + DO 140, J = 1, N + TEMP = X( JX ) + IX = KX + IF( NOCONJ )THEN + DO 120, K = KK, KK + J - 2 + TEMP = TEMP - AP( K )*X( IX ) + IX = IX + INCX + 120 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/AP( KK + J - 1 ) + ELSE + DO 130, K = KK, KK + J - 2 + TEMP = TEMP - DCONJG( AP( K ) )*X( IX ) + IX = IX + INCX + 130 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/DCONJG( AP( KK + J - 1 ) ) + END IF + X( JX ) = TEMP + JX = JX + INCX + KK = KK + J + 140 CONTINUE + END IF + ELSE + KK = ( N*( N + 1 ) )/2 + IF( INCX.EQ.1 )THEN + DO 170, J = N, 1, -1 + TEMP = X( J ) + K = KK + IF( NOCONJ )THEN + DO 150, I = N, J + 1, -1 + TEMP = TEMP - AP( K )*X( I ) + K = K - 1 + 150 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/AP( KK - N + J ) + ELSE + DO 160, I = N, J + 1, -1 + TEMP = TEMP - DCONJG( AP( K ) )*X( I ) + K = K - 1 + 160 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/DCONJG( AP( KK - N + J ) ) + END IF + X( J ) = TEMP + KK = KK - ( N - J + 1 ) + 170 CONTINUE + ELSE + KX = KX + ( N - 1 )*INCX + JX = KX + DO 200, J = N, 1, -1 + TEMP = X( JX ) + IX = KX + IF( NOCONJ )THEN + DO 180, K = KK, KK - ( N - ( J + 1 ) ), -1 + TEMP = TEMP - AP( K )*X( IX ) + IX = IX - INCX + 180 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/AP( KK - N + J ) + ELSE + DO 190, K = KK, KK - ( N - ( J + 1 ) ), -1 + TEMP = TEMP - DCONJG( AP( K ) )*X( IX ) + IX = IX - INCX + 190 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/DCONJG( AP( KK - N + J ) ) + END IF + X( JX ) = TEMP + JX = JX - INCX + KK = KK - ( N - J + 1 ) + 200 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of ZTPSV . +* + END diff --git a/reference/ztrmmf.f b/reference/ztrmmf.f new file mode 100644 index 0000000000..d286f96738 --- /dev/null +++ b/reference/ztrmmf.f @@ -0,0 +1,428 @@ + SUBROUTINE ZTRMMF ( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, A, LDA, + $ B, LDB ) +* .. Scalar Arguments .. + CHARACTER*1 SIDE, UPLO, TRANSA, DIAG + INTEGER M, N, LDA, LDB + COMPLEX*16 ALPHA +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), B( LDB, * ) +* .. +* +* Purpose +* ======= +* +* ZTRMM performs one of the matrix-matrix operations +* +* B := alpha*op( A )*B, or B := alpha*B*op( A ) +* +* where alpha is a scalar, B is an m by n matrix, A is a unit, or +* non-unit, upper or lower triangular matrix and op( A ) is one of +* +* op( A ) = A or op( A ) = A' or op( A ) = conjg( A' ). +* +* Parameters +* ========== +* +* SIDE - CHARACTER*1. +* On entry, SIDE specifies whether op( A ) multiplies B from +* the left or right as follows: +* +* SIDE = 'L' or 'l' B := alpha*op( A )*B. +* +* SIDE = 'R' or 'r' B := alpha*B*op( A ). +* +* Unchanged on exit. +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix A is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANSA - CHARACTER*1. +* On entry, TRANSA specifies the form of op( A ) to be used in +* the matrix multiplication as follows: +* +* TRANSA = 'N' or 'n' op( A ) = A. +* +* TRANSA = 'T' or 't' op( A ) = A'. 
+* +* TRANSA = 'C' or 'c' op( A ) = conjg( A' ). +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit triangular +* as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of B. M must be at +* least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of B. N must be +* at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX*16 . +* On entry, ALPHA specifies the scalar alpha. When alpha is +* zero then A is not referenced and B need not be set before +* entry. +* Unchanged on exit. +* +* A - COMPLEX*16 array of DIMENSION ( LDA, k ), where k is m +* when SIDE = 'L' or 'l' and is n when SIDE = 'R' or 'r'. +* Before entry with UPLO = 'U' or 'u', the leading k by k +* upper triangular part of the array A must contain the upper +* triangular matrix and the strictly lower triangular part of +* A is not referenced. +* Before entry with UPLO = 'L' or 'l', the leading k by k +* lower triangular part of the array A must contain the lower +* triangular matrix and the strictly upper triangular part of +* A is not referenced. +* Note that when DIAG = 'U' or 'u', the diagonal elements of +* A are not referenced either, but are assumed to be unity. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When SIDE = 'L' or 'l' then +* LDA must be at least max( 1, m ), when SIDE = 'R' or 'r' +* then LDA must be at least max( 1, n ). +* Unchanged on exit. +* +* B - COMPLEX*16 array of DIMENSION ( LDB, n ). +* Before entry, the leading m by n part of the array B must +* contain the matrix B, and on exit is overwritten by the +* transformed matrix. +* +* LDB - INTEGER. +* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. LDB must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC DCONJG, MAX +* .. Local Scalars .. + LOGICAL LSIDE, NOCONJ, NOUNIT, UPPER + INTEGER I, INFO, J, K, NROWA + COMPLEX*16 TEMP +* .. Parameters .. + COMPLEX*16 ONE + PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + LSIDE = LSAME( SIDE , 'L' ) + IF( LSIDE )THEN + NROWA = M + ELSE + NROWA = N + END IF + NOCONJ = LSAME( TRANSA, 'N' ) .OR. LSAME( TRANSA, 'T' ) + NOUNIT = LSAME( DIAG , 'N' ) + UPPER = LSAME( UPLO , 'U' ) +* + INFO = 0 + IF( ( .NOT.LSIDE ).AND. + $ ( .NOT.LSAME( SIDE , 'R' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO , 'L' ) ) )THEN + INFO = 2 + ELSE IF( ( .NOT.LSAME( TRANSA, 'N' ) ).AND. + $ ( .NOT.LSAME( TRANSA, 'T' ) ).AND. + $ ( .NOT.LSAME( TRANSA, 'R' ) ).AND. + $ ( .NOT.LSAME( TRANSA, 'C' ) ) )THEN + INFO = 3 + ELSE IF( ( .NOT.LSAME( DIAG , 'U' ) ).AND. 
+ $ ( .NOT.LSAME( DIAG , 'N' ) ) )THEN + INFO = 4 + ELSE IF( M .LT.0 )THEN + INFO = 5 + ELSE IF( N .LT.0 )THEN + INFO = 6 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 9 + ELSE IF( LDB.LT.MAX( 1, M ) )THEN + INFO = 11 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'ZTRMM ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* +* And when alpha.eq.zero. +* + IF( ALPHA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, M + B( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + RETURN + END IF +* +* Start the operations. +* + IF( LSIDE )THEN + IF( LSAME( TRANSA, 'N' ) .OR. LSAME( TRANSA, 'R' ))THEN +* +* Form B := alpha*A*B. +* + IF( UPPER )THEN + DO 50, J = 1, N + DO 40, K = 1, M + IF( B( K, J ).NE.ZERO )THEN + TEMP = ALPHA*B( K, J ) + IF (NOCONJ) THEN + DO 30, I = 1, K - 1 + B( I, J ) = B( I, J ) + TEMP*A( I, K ) + 30 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP*A( K, K ) + B( K, J ) = TEMP + ELSE + DO 35, I = 1, K - 1 + B( I, J ) = B( I, J ) + TEMP*DCONJG(A( I, K )) + 35 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP*DCONJG(A( K, K )) + B( K, J ) = TEMP + ENDIF + END IF + 40 CONTINUE + 50 CONTINUE + ELSE + DO 80, J = 1, N + DO 70 K = M, 1, -1 + IF( B( K, J ).NE.ZERO )THEN + TEMP = ALPHA*B( K, J ) + B( K, J ) = TEMP + IF (NOCONJ) THEN + IF( NOUNIT ) + $ B( K, J ) = B( K, J )*A( K, K ) + DO 60, I = K + 1, M + B( I, J ) = B( I, J ) + TEMP*A( I, K ) + 60 CONTINUE + ELSE + IF( NOUNIT ) + $ B( K, J ) = B( K, J )*DCONJG(A( K, K )) + DO 65, I = K + 1, M + B( I, J ) = B( I, J ) + TEMP*DCONJG(A( I, K )) + 65 CONTINUE + ENDIF + END IF + 70 CONTINUE + 80 CONTINUE + END IF + ELSE +* +* Form B := alpha*A'*B or B := alpha*conjg( A' )*B. +* + IF( UPPER )THEN + DO 120, J = 1, N + DO 110, I = M, 1, -1 + TEMP = B( I, J ) + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*A( I, I ) + DO 90, K = 1, I - 1 + TEMP = TEMP + A( K, I )*B( K, J ) + 90 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*DCONJG( A( I, I ) ) + DO 100, K = 1, I - 1 + TEMP = TEMP + DCONJG( A( K, I ) )*B( K, J ) + 100 CONTINUE + END IF + B( I, J ) = ALPHA*TEMP + 110 CONTINUE + 120 CONTINUE + ELSE + DO 160, J = 1, N + DO 150, I = 1, M + TEMP = B( I, J ) + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*A( I, I ) + DO 130, K = I + 1, M + TEMP = TEMP + A( K, I )*B( K, J ) + 130 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*DCONJG( A( I, I ) ) + DO 140, K = I + 1, M + TEMP = TEMP + DCONJG( A( K, I ) )*B( K, J ) + 140 CONTINUE + END IF + B( I, J ) = ALPHA*TEMP + 150 CONTINUE + 160 CONTINUE + END IF + END IF + ELSE + IF( LSAME( TRANSA, 'N' ) .OR. LSAME( TRANSA, 'R' ))THEN +* +* Form B := alpha*B*A. 
+* + IF( UPPER )THEN + DO 200, J = N, 1, -1 + TEMP = ALPHA + IF (NOCONJ) THEN + IF( NOUNIT ) + $ TEMP = TEMP*A( J, J ) + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*DCONJG(A( J, J )) + ENDIF + DO 170, I = 1, M + B( I, J ) = TEMP*B( I, J ) + 170 CONTINUE + DO 190, K = 1, J - 1 + IF( A( K, J ).NE.ZERO )THEN + IF (NOCONJ) THEN + TEMP = ALPHA*A( K, J ) + ELSE + TEMP = ALPHA*DCONJG(A( K, J )) + ENDIF + DO 180, I = 1, M + B( I, J ) = B( I, J ) + TEMP*B( I, K ) + 180 CONTINUE + END IF + 190 CONTINUE + 200 CONTINUE + ELSE + DO 240, J = 1, N + TEMP = ALPHA + IF (NOCONJ) THEN + IF( NOUNIT ) + $ TEMP = TEMP*A( J, J ) + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*DCONJG(A( J, J )) + ENDIF + DO 210, I = 1, M + B( I, J ) = TEMP*B( I, J ) + 210 CONTINUE + DO 230, K = J + 1, N + IF( A( K, J ).NE.ZERO )THEN + IF (NOCONJ) THEN + TEMP = ALPHA*A( K, J ) + ELSE + TEMP = ALPHA*DCONJG(A( K, J )) + ENDIF + DO 220, I = 1, M + B( I, J ) = B( I, J ) + TEMP*B( I, K ) + 220 CONTINUE + END IF + 230 CONTINUE + 240 CONTINUE + END IF + ELSE +* +* Form B := alpha*B*A' or B := alpha*B*conjg( A' ). +* + IF( UPPER )THEN + DO 280, K = 1, N + DO 260, J = 1, K - 1 + IF( A( J, K ).NE.ZERO )THEN + IF( NOCONJ )THEN + TEMP = ALPHA*A( J, K ) + ELSE + TEMP = ALPHA*DCONJG( A( J, K ) ) + END IF + DO 250, I = 1, M + B( I, J ) = B( I, J ) + TEMP*B( I, K ) + 250 CONTINUE + END IF + 260 CONTINUE + TEMP = ALPHA + IF( NOUNIT )THEN + IF( NOCONJ )THEN + TEMP = TEMP*A( K, K ) + ELSE + TEMP = TEMP*DCONJG( A( K, K ) ) + END IF + END IF + IF( TEMP.NE.ONE )THEN + DO 270, I = 1, M + B( I, K ) = TEMP*B( I, K ) + 270 CONTINUE + END IF + 280 CONTINUE + ELSE + DO 320, K = N, 1, -1 + DO 300, J = K + 1, N + IF( A( J, K ).NE.ZERO )THEN + IF( NOCONJ )THEN + TEMP = ALPHA*A( J, K ) + ELSE + TEMP = ALPHA*DCONJG( A( J, K ) ) + END IF + DO 290, I = 1, M + B( I, J ) = B( I, J ) + TEMP*B( I, K ) + 290 CONTINUE + END IF + 300 CONTINUE + TEMP = ALPHA + IF( NOUNIT )THEN + IF( NOCONJ )THEN + TEMP = TEMP*A( K, K ) + ELSE + TEMP = TEMP*DCONJG( A( K, K ) ) + END IF + END IF + IF( TEMP.NE.ONE )THEN + DO 310, I = 1, M + B( I, K ) = TEMP*B( I, K ) + 310 CONTINUE + END IF + 320 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of ZTRMM . +* + END diff --git a/reference/ztrmvf.f b/reference/ztrmvf.f new file mode 100644 index 0000000000..db0f9ca04c --- /dev/null +++ b/reference/ztrmvf.f @@ -0,0 +1,358 @@ + SUBROUTINE ZTRMVF ( UPLO, TRANS, DIAG, N, A, LDA, X, INCX ) +* .. Scalar Arguments .. + INTEGER INCX, LDA, N + CHARACTER*1 DIAG, TRANS, UPLO +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), X( * ) +* .. +* +* Purpose +* ======= +* +* ZTRMV performs one of the matrix-vector operations +* +* x := A*x, or x := A'*x, or x := conjg( A' )*x, +* +* where x is an n element vector and A is an n by n unit, or non-unit, +* upper or lower triangular matrix. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the operation to be performed as +* follows: +* +* TRANS = 'N' or 'n' x := A*x. +* +* TRANS = 'T' or 't' x := A'*x. +* +* TRANS = 'C' or 'c' x := conjg( A' )*x. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit +* triangular as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. 
+* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* A - COMPLEX*16 array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array A must contain the upper +* triangular matrix and the strictly lower triangular part of +* A is not referenced. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array A must contain the lower +* triangular matrix and the strictly upper triangular part of +* A is not referenced. +* Note that when DIAG = 'U' or 'u', the diagonal elements of +* A are not referenced either, but are assumed to be unity. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, n ). +* Unchanged on exit. +* +* X - COMPLEX*16 array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element vector x. On exit, X is overwritten with the +* tranformed vector x. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. Local Scalars .. + COMPLEX*16 TEMP + INTEGER I, INFO, IX, J, JX, KX + LOGICAL NOCONJ, NOUNIT +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC DCONJG, MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO , 'U' ).AND. + $ .NOT.LSAME( UPLO , 'L' ) )THEN + INFO = 1 + ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. + $ .NOT.LSAME( TRANS, 'R' ).AND. + $ .NOT.LSAME( TRANS, 'C' ) )THEN + INFO = 2 + ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. + $ .NOT.LSAME( DIAG , 'N' ) )THEN + INFO = 3 + ELSE IF( N.LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, N ) )THEN + INFO = 6 + ELSE IF( INCX.EQ.0 )THEN + INFO = 8 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'ZTRMV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* + NOCONJ = LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) + NOUNIT = LSAME( DIAG , 'N' ) +* +* Set up the start point in X if the increment is not unity. This +* will be ( N - 1 )*INCX too small for descending loops. +* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through A. +* + IF( LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'R' ))THEN +* +* Form x := A*x. 
+* + IF( LSAME( UPLO, 'U' ) )THEN + IF( INCX.EQ.1 )THEN + DO 20, J = 1, N + IF( X( J ).NE.ZERO )THEN + TEMP = X( J ) + DO 10, I = 1, J - 1 + IF (NOCONJ) THEN + X( I ) = X( I ) + TEMP*A( I, J ) + ELSE + X( I ) = X( I ) + TEMP*DCONJG(A( I, J )) + ENDIF + 10 CONTINUE + IF (NOCONJ) THEN + IF( NOUNIT ) + $ X( J ) = X( J )*A( J, J ) + ELSE + IF( NOUNIT ) + $ X( J ) = X( J )*DCONJG(A( J, J )) + ENDIF + END IF + 20 CONTINUE + ELSE + JX = KX + DO 40, J = 1, N + IF( X( JX ).NE.ZERO )THEN + TEMP = X( JX ) + IX = KX + DO 30, I = 1, J - 1 + IF (NOCONJ) THEN + X( IX ) = X( IX ) + TEMP*A( I, J ) + ELSE + X( IX ) = X( IX ) + TEMP*DCONJG(A( I, J )) + ENDIF + IX = IX + INCX + 30 CONTINUE + IF (NOCONJ) THEN + IF( NOUNIT ) + $ X( JX ) = X( JX )*A( J, J ) + ELSE + IF( NOUNIT ) + $ X( JX ) = X( JX )*DCONJG(A( J, J )) + ENDIF + END IF + JX = JX + INCX + 40 CONTINUE + END IF + ELSE + IF( INCX.EQ.1 )THEN + DO 60, J = N, 1, -1 + IF( X( J ).NE.ZERO )THEN + TEMP = X( J ) + DO 50, I = N, J + 1, -1 + IF (NOCONJ) THEN + X( I ) = X( I ) + TEMP*A( I, J ) + ELSE + X( I ) = X( I ) + TEMP*DCONJG(A( I, J )) + ENDIF + 50 CONTINUE + IF (NOCONJ) THEN + IF( NOUNIT ) + $ X( J ) = X( J )*A( J, J ) + ELSE + IF( NOUNIT ) + $ X( J ) = X( J )*DCONJG(A( J, J )) + ENDIF + END IF + 60 CONTINUE + ELSE + KX = KX + ( N - 1 )*INCX + JX = KX + DO 80, J = N, 1, -1 + IF( X( JX ).NE.ZERO )THEN + TEMP = X( JX ) + IX = KX + DO 70, I = N, J + 1, -1 + IF (NOCONJ) THEN + X( IX ) = X( IX ) + TEMP*A( I, J ) + ELSE + X( IX ) = X( IX ) + TEMP*DCONJG(A( I, J )) + ENDIF + IX = IX - INCX + 70 CONTINUE + IF (NOCONJ) THEN + IF( NOUNIT ) + $ X( JX ) = X( JX )*A( J, J ) + ELSE + IF( NOUNIT ) + $ X( JX ) = X( JX )*DCONJG(A( J, J )) + ENDIF + END IF + JX = JX - INCX + 80 CONTINUE + END IF + END IF + ELSE +* +* Form x := A'*x or x := conjg( A' )*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + IF( INCX.EQ.1 )THEN + DO 110, J = N, 1, -1 + TEMP = X( J ) + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*A( J, J ) + DO 90, I = J - 1, 1, -1 + TEMP = TEMP + A( I, J )*X( I ) + 90 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*DCONJG( A( J, J ) ) + DO 100, I = J - 1, 1, -1 + TEMP = TEMP + DCONJG( A( I, J ) )*X( I ) + 100 CONTINUE + END IF + X( J ) = TEMP + 110 CONTINUE + ELSE + JX = KX + ( N - 1 )*INCX + DO 140, J = N, 1, -1 + TEMP = X( JX ) + IX = JX + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*A( J, J ) + DO 120, I = J - 1, 1, -1 + IX = IX - INCX + TEMP = TEMP + A( I, J )*X( IX ) + 120 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*DCONJG( A( J, J ) ) + DO 130, I = J - 1, 1, -1 + IX = IX - INCX + TEMP = TEMP + DCONJG( A( I, J ) )*X( IX ) + 130 CONTINUE + END IF + X( JX ) = TEMP + JX = JX - INCX + 140 CONTINUE + END IF + ELSE + IF( INCX.EQ.1 )THEN + DO 170, J = 1, N + TEMP = X( J ) + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*A( J, J ) + DO 150, I = J + 1, N + TEMP = TEMP + A( I, J )*X( I ) + 150 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*DCONJG( A( J, J ) ) + DO 160, I = J + 1, N + TEMP = TEMP + DCONJG( A( I, J ) )*X( I ) + 160 CONTINUE + END IF + X( J ) = TEMP + 170 CONTINUE + ELSE + JX = KX + DO 200, J = 1, N + TEMP = X( JX ) + IX = JX + IF( NOCONJ )THEN + IF( NOUNIT ) + $ TEMP = TEMP*A( J, J ) + DO 180, I = J + 1, N + IX = IX + INCX + TEMP = TEMP + A( I, J )*X( IX ) + 180 CONTINUE + ELSE + IF( NOUNIT ) + $ TEMP = TEMP*DCONJG( A( J, J ) ) + DO 190, I = J + 1, N + IX = IX + INCX + TEMP = TEMP + DCONJG( A( I, J ) )*X( IX ) + 190 CONTINUE + END IF + X( JX ) = TEMP + JX = JX + INCX + 200 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of ZTRMV . 
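To make the dense triangular calling convention concrete, here is a minimal hedged sketch that applies x := conjg( A' )*x through the reference ZTRMVF on a 2x2 non-unit upper triangular matrix, following the argument order of the subroutine above. The program name and the numeric values are invented for illustration and are not part of the imported sources.

      PROGRAM TRMVEX
*     Illustrative sketch only (not part of the imported sources).
*     Apply x := conjg( A' )*x for a 2x2 non-unit upper triangular
*     matrix using the reference routine ZTRMVF.
      COMPLEX*16 A( 2, 2 ), X( 2 )
      A( 1, 1 ) = ( 1.0D+0, 1.0D+0 )
      A( 1, 2 ) = ( 0.0D+0, 2.0D+0 )
*     The strictly lower part is not referenced; set it anyway.
      A( 2, 1 ) = ( 0.0D+0, 0.0D+0 )
      A( 2, 2 ) = ( 3.0D+0, 0.0D+0 )
      X( 1 ) = ( 1.0D+0, 0.0D+0 )
      X( 2 ) = ( 1.0D+0, 0.0D+0 )
*     Expected result: x = ( (1,-1), (3,-2) ).
      CALL ZTRMVF( 'U', 'C', 'N', 2, A, 2, X, 1 )
      WRITE( *, * ) X( 1 ), X( 2 )
      END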
+* + END diff --git a/reference/ztrsmf.f b/reference/ztrsmf.f new file mode 100644 index 0000000000..ed7d227789 --- /dev/null +++ b/reference/ztrsmf.f @@ -0,0 +1,457 @@ + SUBROUTINE ZTRSMF ( SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, A, LDA, + $ B, LDB ) +* .. Scalar Arguments .. + IMPLICIT NONE + CHARACTER*1 SIDE, UPLO, TRANSA, DIAG + INTEGER M, N, LDA, LDB + COMPLEX*16 ALPHA +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), B( LDB, * ) +* .. +* +* Purpose +* ======= +* +* ZTRSM solves one of the matrix equations +* +* op( A )*X = alpha*B, or X*op( A ) = alpha*B, +* +* where alpha is a scalar, X and B are m by n matrices, A is a unit, or +* non-unit, upper or lower triangular matrix and op( A ) is one of +* +* op( A ) = A or op( A ) = A' or op( A ) = conjg( A' ). +* +* The matrix X is overwritten on B. +* +* Parameters +* ========== +* +* SIDE - CHARACTER*1. +* On entry, SIDE specifies whether op( A ) appears on the left +* or right of X as follows: +* +* SIDE = 'L' or 'l' op( A )*X = alpha*B. +* +* SIDE = 'R' or 'r' X*op( A ) = alpha*B. +* +* Unchanged on exit. +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix A is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANSA - CHARACTER*1. +* On entry, TRANSA specifies the form of op( A ) to be used in +* the matrix multiplication as follows: +* +* TRANSA = 'N' or 'n' op( A ) = A. +* +* TRANSA = 'T' or 't' op( A ) = A'. +* +* TRANSA = 'C' or 'c' op( A ) = conjg( A' ). +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit triangular +* as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* M - INTEGER. +* On entry, M specifies the number of rows of B. M must be at +* least zero. +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the number of columns of B. N must be +* at least zero. +* Unchanged on exit. +* +* ALPHA - COMPLEX*16 . +* On entry, ALPHA specifies the scalar alpha. When alpha is +* zero then A is not referenced and B need not be set before +* entry. +* Unchanged on exit. +* +* A - COMPLEX*16 array of DIMENSION ( LDA, k ), where k is m +* when SIDE = 'L' or 'l' and is n when SIDE = 'R' or 'r'. +* Before entry with UPLO = 'U' or 'u', the leading k by k +* upper triangular part of the array A must contain the upper +* triangular matrix and the strictly lower triangular part of +* A is not referenced. +* Before entry with UPLO = 'L' or 'l', the leading k by k +* lower triangular part of the array A must contain the lower +* triangular matrix and the strictly upper triangular part of +* A is not referenced. +* Note that when DIAG = 'U' or 'u', the diagonal elements of +* A are not referenced either, but are assumed to be unity. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. When SIDE = 'L' or 'l' then +* LDA must be at least max( 1, m ), when SIDE = 'R' or 'r' +* then LDA must be at least max( 1, n ). +* Unchanged on exit. +* +* B - COMPLEX*16 array of DIMENSION ( LDB, n ). +* Before entry, the leading m by n part of the array B must +* contain the right-hand side matrix B, and on exit is +* overwritten by the solution matrix X. +* +* LDB - INTEGER. 
+* On entry, LDB specifies the first dimension of B as declared +* in the calling (sub) program. LDB must be at least +* max( 1, m ). +* Unchanged on exit. +* +* +* Level 3 Blas routine. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC DCONJG, MAX +* .. Local Scalars .. + LOGICAL LSIDE, NOCONJ, NOUNIT, UPPER + INTEGER I, INFO, J, K, NROWA + COMPLEX*16 TEMP +* .. Parameters .. + COMPLEX*16 ONE + PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + LSIDE = LSAME( SIDE , 'L' ) + IF( LSIDE )THEN + NROWA = M + ELSE + NROWA = N + END IF + NOCONJ = (LSAME( TRANSA, 'N' ) .OR. LSAME( TRANSA, 'T' )) + NOUNIT = LSAME( DIAG , 'N' ) + UPPER = LSAME( UPLO , 'U' ) +* + INFO = 0 + IF( ( .NOT.LSIDE ).AND. + $ ( .NOT.LSAME( SIDE , 'R' ) ) )THEN + INFO = 1 + ELSE IF( ( .NOT.UPPER ).AND. + $ ( .NOT.LSAME( UPLO , 'L' ) ) )THEN + INFO = 2 + ELSE IF( ( .NOT.LSAME( TRANSA, 'N' ) ).AND. + $ ( .NOT.LSAME( TRANSA, 'T' ) ).AND. + $ ( .NOT.LSAME( TRANSA, 'R' ) ).AND. + $ ( .NOT.LSAME( TRANSA, 'C' ) ) )THEN + INFO = 3 + ELSE IF( ( .NOT.LSAME( DIAG , 'U' ) ).AND. + $ ( .NOT.LSAME( DIAG , 'N' ) ) )THEN + INFO = 4 + ELSE IF( M .LT.0 )THEN + INFO = 5 + ELSE IF( N .LT.0 )THEN + INFO = 6 + ELSE IF( LDA.LT.MAX( 1, NROWA ) )THEN + INFO = 9 + ELSE IF( LDB.LT.MAX( 1, M ) )THEN + INFO = 11 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'ZTRSM ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* +* And when alpha.eq.zero. +* + IF( ALPHA.EQ.ZERO )THEN + DO 20, J = 1, N + DO 10, I = 1, M + B( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE + RETURN + END IF +* +* Start the operations. +* + IF( LSIDE )THEN + IF( LSAME( TRANSA, 'N' ) .OR. LSAME( TRANSA, 'R' ) )THEN +* +* Form B := alpha*inv( A )*B. +* + IF( UPPER )THEN + DO 60, J = 1, N + IF( ALPHA.NE.ONE )THEN + DO 30, I = 1, M + B( I, J ) = ALPHA*B( I, J ) + 30 CONTINUE + END IF + DO 50, K = M, 1, -1 + IF( B( K, J ).NE.ZERO )THEN + IF( NOUNIT ) THEN + IF (NOCONJ) THEN + B( K, J ) = B( K, J )/A( K, K ) + ELSE + B( K, J ) = B( K, J )/DCONJG(A( K, K )) + ENDIF + ENDIF + IF (NOCONJ) THEN + DO 40, I = 1, K - 1 + B( I, J ) = B( I, J ) - B( K, J )*A( I, K ) + 40 CONTINUE + ELSE + DO 45, I = 1, K - 1 + B( I, J ) = B( I, J ) - B( K, J )*DCONJG(A( I, K )) + 45 CONTINUE + ENDIF + END IF + 50 CONTINUE + 60 CONTINUE + ELSE + DO 100, J = 1, N + IF( ALPHA.NE.ONE )THEN + DO 70, I = 1, M + B( I, J ) = ALPHA*B( I, J ) + 70 CONTINUE + END IF + DO 90 K = 1, M + IF (NOCONJ) THEN + IF( B( K, J ).NE.ZERO )THEN + IF( NOUNIT ) + $ B( K, J ) = B( K, J )/A( K, K ) + DO 80, I = K + 1, M + B( I, J ) = B( I, J ) - B( K, J )*A( I, K ) + 80 CONTINUE + END IF + ELSE + IF( B( K, J ).NE.ZERO )THEN + IF( NOUNIT ) + $ B( K, J ) = B( K, J )/DCONJG(A( K, K )) + DO 85, I = K + 1, M + B( I, J ) = B( I, J ) - B( K, J )*DCONJG(A( I, K )) + 85 CONTINUE + END IF + ENDIF + 90 CONTINUE + 100 CONTINUE + END IF + ELSE +* +* Form B := alpha*inv( A' )*B +* or B := alpha*inv( conjg( A' ) )*B. 
+* + IF( UPPER )THEN + DO 140, J = 1, N + DO 130, I = 1, M + TEMP = ALPHA*B( I, J ) + IF( NOCONJ )THEN + DO 110, K = 1, I - 1 + TEMP = TEMP - A( K, I )*B( K, J ) + 110 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/A( I, I ) + ELSE + DO 120, K = 1, I - 1 + TEMP = TEMP - DCONJG( A( K, I ) )*B( K, J ) + 120 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/DCONJG( A( I, I ) ) + END IF + B( I, J ) = TEMP + 130 CONTINUE + 140 CONTINUE + ELSE + DO 180, J = 1, N + DO 170, I = M, 1, -1 + TEMP = ALPHA*B( I, J ) + IF( NOCONJ )THEN + DO 150, K = I + 1, M + TEMP = TEMP - A( K, I )*B( K, J ) + 150 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/A( I, I ) + ELSE + DO 160, K = I + 1, M + TEMP = TEMP - DCONJG( A( K, I ) )*B( K, J ) + 160 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/DCONJG( A( I, I ) ) + END IF + B( I, J ) = TEMP + 170 CONTINUE + 180 CONTINUE + END IF + END IF + ELSE + IF( LSAME( TRANSA, 'N' ) .OR. LSAME( TRANSA, 'R' ) )THEN +* +* Form B := alpha*B*inv( A ). +* + IF( UPPER )THEN + DO 230, J = 1, N + IF( ALPHA.NE.ONE )THEN + DO 190, I = 1, M + B( I, J ) = ALPHA*B( I, J ) + 190 CONTINUE + END IF + DO 210, K = 1, J - 1 + IF( A( K, J ).NE.ZERO )THEN + IF (NOCONJ) THEN + DO 200, I = 1, M + B( I, J ) = B( I, J ) - A( K, J )*B( I, K ) + 200 CONTINUE + ELSE + DO 205, I = 1, M + B( I, J ) = B( I, J ) - DCONJG(A( K, J ))*B( I, K ) + 205 CONTINUE + ENDIF + END IF + 210 CONTINUE + IF( NOUNIT )THEN + IF (NOCONJ) THEN + TEMP = ONE/A( J, J ) + ELSE + TEMP = ONE/DCONJG(A( J, J )) + ENDIF + DO 220, I = 1, M + B( I, J ) = TEMP*B( I, J ) + 220 CONTINUE + END IF + 230 CONTINUE + ELSE + DO 280, J = N, 1, -1 + IF( ALPHA.NE.ONE )THEN + DO 240, I = 1, M + B( I, J ) = ALPHA*B( I, J ) + 240 CONTINUE + END IF + DO 260, K = J + 1, N + IF( A( K, J ).NE.ZERO )THEN + IF (NOCONJ) THEN + DO 250, I = 1, M + B( I, J ) = B( I, J ) - A( K, J )*B( I, K ) + 250 CONTINUE + ELSE + DO 255, I = 1, M + B( I, J ) = B( I, J ) - DCONJG(A( K, J ))*B( I, K ) + 255 CONTINUE + ENDIF + END IF + 260 CONTINUE + IF( NOUNIT )THEN + IF (NOCONJ) THEN + TEMP = ONE/A( J, J ) + ELSE + TEMP = ONE/DCONJG(A( J, J )) + ENDIF + DO 270, I = 1, M + B( I, J ) = TEMP*B( I, J ) + 270 CONTINUE + END IF + 280 CONTINUE + END IF + ELSE +* +* Form B := alpha*B*inv( A' ) +* or B := alpha*B*inv( conjg( A' ) ). +* + IF( UPPER )THEN + DO 330, K = N, 1, -1 + IF( NOUNIT )THEN + IF( NOCONJ )THEN + TEMP = ONE/A( K, K ) + ELSE + TEMP = ONE/DCONJG( A( K, K ) ) + END IF + DO 290, I = 1, M + B( I, K ) = TEMP*B( I, K ) + 290 CONTINUE + END IF + DO 310, J = 1, K - 1 + IF( A( J, K ).NE.ZERO )THEN + IF( NOCONJ )THEN + TEMP = A( J, K ) + ELSE + TEMP = DCONJG( A( J, K ) ) + END IF + DO 300, I = 1, M + B( I, J ) = B( I, J ) - TEMP*B( I, K ) + 300 CONTINUE + END IF + 310 CONTINUE + IF( ALPHA.NE.ONE )THEN + DO 320, I = 1, M + B( I, K ) = ALPHA*B( I, K ) + 320 CONTINUE + END IF + 330 CONTINUE + ELSE + DO 380, K = 1, N + IF( NOUNIT )THEN + IF( NOCONJ )THEN + TEMP = ONE/A( K, K ) + ELSE + TEMP = ONE/DCONJG( A( K, K ) ) + END IF + DO 340, I = 1, M + B( I, K ) = TEMP*B( I, K ) + 340 CONTINUE + END IF + DO 360, J = K + 1, N + IF( A( J, K ).NE.ZERO )THEN + IF( NOCONJ )THEN + TEMP = A( J, K ) + ELSE + TEMP = DCONJG( A( J, K ) ) + END IF + DO 350, I = 1, M + B( I, J ) = B( I, J ) - TEMP*B( I, K ) + 350 CONTINUE + END IF + 360 CONTINUE + IF( ALPHA.NE.ONE )THEN + DO 370, I = 1, M + B( I, K ) = ALPHA*B( I, K ) + 370 CONTINUE + END IF + 380 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of ZTRSM . 
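Similarly, a tiny driver may help readers see how the Level 3 solve is invoked. The sketch below is illustrative only and not part of the import; it solves A*X = alpha*B for a 2x2 non-unit lower triangular A and a single right-hand side via ZTRSMF with SIDE = 'L', with all values and the program name invented for the example.

      PROGRAM TRSMEX
*     Illustrative sketch only (not part of the imported sources).
*     Solve A*X = alpha*B for X with A a 2x2 non-unit lower
*     triangular matrix and B a single right-hand side, using the
*     reference routine ZTRSMF.
      COMPLEX*16 A( 2, 2 ), B( 2, 1 ), ALPHA
      ALPHA = ( 1.0D+0, 0.0D+0 )
      A( 1, 1 ) = ( 2.0D+0, 0.0D+0 )
      A( 2, 1 ) = ( 1.0D+0, 0.0D+0 )
      A( 2, 2 ) = ( 4.0D+0, 0.0D+0 )
*     The strictly upper part is not referenced; set it anyway.
      A( 1, 2 ) = ( 0.0D+0, 0.0D+0 )
      B( 1, 1 ) = ( 2.0D+0, 0.0D+0 )
      B( 2, 1 ) = ( 5.0D+0, 0.0D+0 )
      CALL ZTRSMF( 'L', 'L', 'N', 'N', 2, 1, ALPHA, A, 2, B, 2 )
*     After the call B holds the solution X; here X = ( 1, 1 ).
      WRITE( *, * ) B( 1, 1 ), B( 2, 1 )
      END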
+* + END diff --git a/reference/ztrsvf.f b/reference/ztrsvf.f new file mode 100644 index 0000000000..c8b3d542fe --- /dev/null +++ b/reference/ztrsvf.f @@ -0,0 +1,361 @@ + SUBROUTINE ZTRSVF ( UPLO, TRANS, DIAG, N, A, LDA, X, INCX ) +* .. Scalar Arguments .. + INTEGER INCX, LDA, N + CHARACTER*1 DIAG, TRANS, UPLO +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), X( * ) +* .. +* +* Purpose +* ======= +* +* ZTRSV solves one of the systems of equations +* +* A*x = b, or A'*x = b, or conjg( A' )*x = b, +* +* where b and x are n element vectors and A is an n by n unit, or +* non-unit, upper or lower triangular matrix. +* +* No test for singularity or near-singularity is included in this +* routine. Such tests must be performed before calling this routine. +* +* Parameters +* ========== +* +* UPLO - CHARACTER*1. +* On entry, UPLO specifies whether the matrix is an upper or +* lower triangular matrix as follows: +* +* UPLO = 'U' or 'u' A is an upper triangular matrix. +* +* UPLO = 'L' or 'l' A is a lower triangular matrix. +* +* Unchanged on exit. +* +* TRANS - CHARACTER*1. +* On entry, TRANS specifies the equations to be solved as +* follows: +* +* TRANS = 'N' or 'n' A*x = b. +* +* TRANS = 'T' or 't' A'*x = b. +* +* TRANS = 'C' or 'c' conjg( A' )*x = b. +* +* Unchanged on exit. +* +* DIAG - CHARACTER*1. +* On entry, DIAG specifies whether or not A is unit +* triangular as follows: +* +* DIAG = 'U' or 'u' A is assumed to be unit triangular. +* +* DIAG = 'N' or 'n' A is not assumed to be unit +* triangular. +* +* Unchanged on exit. +* +* N - INTEGER. +* On entry, N specifies the order of the matrix A. +* N must be at least zero. +* Unchanged on exit. +* +* A - COMPLEX*16 array of DIMENSION ( LDA, n ). +* Before entry with UPLO = 'U' or 'u', the leading n by n +* upper triangular part of the array A must contain the upper +* triangular matrix and the strictly lower triangular part of +* A is not referenced. +* Before entry with UPLO = 'L' or 'l', the leading n by n +* lower triangular part of the array A must contain the lower +* triangular matrix and the strictly upper triangular part of +* A is not referenced. +* Note that when DIAG = 'U' or 'u', the diagonal elements of +* A are not referenced either, but are assumed to be unity. +* Unchanged on exit. +* +* LDA - INTEGER. +* On entry, LDA specifies the first dimension of A as declared +* in the calling (sub) program. LDA must be at least +* max( 1, n ). +* Unchanged on exit. +* +* X - COMPLEX*16 array of dimension at least +* ( 1 + ( n - 1 )*abs( INCX ) ). +* Before entry, the incremented array X must contain the n +* element right-hand side vector b. On exit, X is overwritten +* with the solution vector x. +* +* INCX - INTEGER. +* On entry, INCX specifies the increment for the elements of +* X. INCX must not be zero. +* Unchanged on exit. +* +* +* Level 2 Blas routine. +* +* -- Written on 22-October-1986. +* Jack Dongarra, Argonne National Lab. +* Jeremy Du Croz, Nag Central Office. +* Sven Hammarling, Nag Central Office. +* Richard Hanson, Sandia National Labs. +* +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. Local Scalars .. + COMPLEX*16 TEMP + INTEGER I, INFO, IX, J, JX, KX + LOGICAL NOCONJ, NOUNIT +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. External Subroutines .. + EXTERNAL XERBLA +* .. Intrinsic Functions .. + INTRINSIC DCONJG, MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + IF ( .NOT.LSAME( UPLO , 'U' ).AND. 
+ $ .NOT.LSAME( UPLO , 'L' ) )THEN + INFO = 1 + ELSE IF( .NOT.LSAME( TRANS, 'N' ).AND. + $ .NOT.LSAME( TRANS, 'T' ).AND. + $ .NOT.LSAME( TRANS, 'R' ).AND. + $ .NOT.LSAME( TRANS, 'C' ) )THEN + INFO = 2 + ELSE IF( .NOT.LSAME( DIAG , 'U' ).AND. + $ .NOT.LSAME( DIAG , 'N' ) )THEN + INFO = 3 + ELSE IF( N.LT.0 )THEN + INFO = 4 + ELSE IF( LDA.LT.MAX( 1, N ) )THEN + INFO = 6 + ELSE IF( INCX.EQ.0 )THEN + INFO = 8 + END IF + IF( INFO.NE.0 )THEN + CALL XERBLA( 'ZTRSV ', INFO ) + RETURN + END IF +* +* Quick return if possible. +* + IF( N.EQ.0 ) + $ RETURN +* + NOCONJ = LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'T' ) + NOUNIT = LSAME( DIAG , 'N' ) +* +* Set up the start point in X if the increment is not unity. This +* will be ( N - 1 )*INCX too small for descending loops. +* + IF( INCX.LE.0 )THEN + KX = 1 - ( N - 1 )*INCX + ELSE IF( INCX.NE.1 )THEN + KX = 1 + END IF +* +* Start the operations. In this version the elements of A are +* accessed sequentially with one pass through A. +* + IF( LSAME( TRANS, 'N' ) .OR. LSAME( TRANS, 'R' ) ) THEN +* +* Form x := inv( A )*x. +* + IF( LSAME( UPLO, 'U' ) )THEN + IF( INCX.EQ.1 )THEN + DO 20, J = N, 1, -1 + IF( X( J ).NE.ZERO )THEN + IF (NOCONJ) THEN + IF( NOUNIT ) + $ X( J ) = X( J )/A( J, J ) + TEMP = X( J ) + DO 10, I = J - 1, 1, -1 + X( I ) = X( I ) - TEMP*A( I, J ) + 10 CONTINUE + ELSE + IF( NOUNIT ) + $ X( J ) = X( J )/DCONJG(A( J, J )) + TEMP = X( J ) + DO 15, I = J - 1, 1, -1 + X( I ) = X( I ) - TEMP*DCONJG(A( I, J )) + 15 CONTINUE + ENDIF + END IF + 20 CONTINUE + ELSE + JX = KX + ( N - 1 )*INCX + DO 40, J = N, 1, -1 + IF( X( JX ).NE.ZERO )THEN + IF (NOCONJ) THEN + IF( NOUNIT ) + $ X( JX ) = X( JX )/A( J, J ) + ELSE + IF( NOUNIT ) + $ X( JX ) = X( JX )/DCONJG(A( J, J )) + ENDIF + TEMP = X( JX ) + IX = JX + DO 30, I = J - 1, 1, -1 + IX = IX - INCX + IF (NOCONJ) THEN + X( IX ) = X( IX ) - TEMP*A( I, J ) + ELSE + X( IX ) = X( IX ) - TEMP*DCONJG(A( I, J )) + ENDIF + 30 CONTINUE + END IF + JX = JX - INCX + 40 CONTINUE + END IF + ELSE + IF( INCX.EQ.1 )THEN + DO 60, J = 1, N + IF( X( J ).NE.ZERO )THEN + IF (NOCONJ) THEN + IF( NOUNIT ) + $ X( J ) = X( J )/A( J, J ) + TEMP = X( J ) + DO 50, I = J + 1, N + X( I ) = X( I ) - TEMP*A( I, J ) + 50 CONTINUE + ELSE + IF( NOUNIT ) + $ X( J ) = X( J )/DCONJG(A( J, J )) + TEMP = X( J ) + DO 55, I = J + 1, N + X( I ) = X( I ) - TEMP*DCONJG(A( I, J )) + 55 CONTINUE + ENDIF + END IF + 60 CONTINUE + ELSE + JX = KX + DO 80, J = 1, N + IF( X( JX ).NE.ZERO )THEN + IF (NOCONJ) THEN + IF( NOUNIT ) + $ X( JX ) = X( JX )/A( J, J ) + ELSE + IF( NOUNIT ) + $ X( JX ) = X( JX )/DCONJG(A( J, J )) + ENDIF + TEMP = X( JX ) + IX = JX + DO 70, I = J + 1, N + IX = IX + INCX + IF (NOCONJ) THEN + X( IX ) = X( IX ) - TEMP*A( I, J ) + ELSE + X( IX ) = X( IX ) - TEMP*DCONJG(A( I, J )) + ENDIF + 70 CONTINUE + END IF + JX = JX + INCX + 80 CONTINUE + END IF + END IF + ELSE +* +* Form x := inv( A' )*x or x := inv( conjg( A' ) )*x. 
+* + IF( LSAME( UPLO, 'U' ) )THEN + IF( INCX.EQ.1 )THEN + DO 110, J = 1, N + TEMP = X( J ) + IF( NOCONJ )THEN + DO 90, I = 1, J - 1 + TEMP = TEMP - A( I, J )*X( I ) + 90 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/A( J, J ) + ELSE + DO 100, I = 1, J - 1 + TEMP = TEMP - DCONJG( A( I, J ) )*X( I ) + 100 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/DCONJG( A( J, J ) ) + END IF + X( J ) = TEMP + 110 CONTINUE + ELSE + JX = KX + DO 140, J = 1, N + IX = KX + TEMP = X( JX ) + IF( NOCONJ )THEN + DO 120, I = 1, J - 1 + TEMP = TEMP - A( I, J )*X( IX ) + IX = IX + INCX + 120 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/A( J, J ) + ELSE + DO 130, I = 1, J - 1 + TEMP = TEMP - DCONJG( A( I, J ) )*X( IX ) + IX = IX + INCX + 130 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/DCONJG( A( J, J ) ) + END IF + X( JX ) = TEMP + JX = JX + INCX + 140 CONTINUE + END IF + ELSE + IF( INCX.EQ.1 )THEN + DO 170, J = N, 1, -1 + TEMP = X( J ) + IF( NOCONJ )THEN + DO 150, I = N, J + 1, -1 + TEMP = TEMP - A( I, J )*X( I ) + 150 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/A( J, J ) + ELSE + DO 160, I = N, J + 1, -1 + TEMP = TEMP - DCONJG( A( I, J ) )*X( I ) + 160 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/DCONJG( A( J, J ) ) + END IF + X( J ) = TEMP + 170 CONTINUE + ELSE + KX = KX + ( N - 1 )*INCX + JX = KX + DO 200, J = N, 1, -1 + IX = KX + TEMP = X( JX ) + IF( NOCONJ )THEN + DO 180, I = N, J + 1, -1 + TEMP = TEMP - A( I, J )*X( IX ) + IX = IX - INCX + 180 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/A( J, J ) + ELSE + DO 190, I = N, J + 1, -1 + TEMP = TEMP - DCONJG( A( I, J ) )*X( IX ) + IX = IX - INCX + 190 CONTINUE + IF( NOUNIT ) + $ TEMP = TEMP/DCONJG( A( J, J ) ) + END IF + X( JX ) = TEMP + JX = JX - INCX + 200 CONTINUE + END IF + END IF + END IF +* + RETURN +* +* End of ZTRSV . +* + END diff --git a/reference/ztrti2f.f b/reference/ztrti2f.f new file mode 100644 index 0000000000..a40d2ed57e --- /dev/null +++ b/reference/ztrti2f.f @@ -0,0 +1,146 @@ + SUBROUTINE ZTRTI2F( UPLO, DIAG, N, A, LDA, INFO ) +* +* -- LAPACK routine (version 3.1) -- +* Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. +* November 2006 +* +* .. Scalar Arguments .. + CHARACTER DIAG, UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* ZTRTI2 computes the inverse of a complex upper or lower triangular +* matrix. +* +* This is the Level 2 BLAS version of the algorithm. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* Specifies whether the matrix A is upper or lower triangular. +* = 'U': Upper triangular +* = 'L': Lower triangular +* +* DIAG (input) CHARACTER*1 +* Specifies whether or not the matrix A is unit triangular. +* = 'N': Non-unit triangular +* = 'U': Unit triangular +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* A (input/output) COMPLEX*16 array, dimension (LDA,N) +* On entry, the triangular matrix A. If UPLO = 'U', the +* leading n by n upper triangular part of the array A contains +* the upper triangular matrix, and the strictly lower +* triangular part of A is not referenced. If UPLO = 'L', the +* leading n by n lower triangular part of the array A contains +* the lower triangular matrix, and the strictly upper +* triangular part of A is not referenced. If DIAG = 'U', the +* diagonal elements of A are also not referenced and are +* assumed to be 1. +* +* On exit, the (triangular) inverse of the original matrix, in +* the same storage format. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). 
+* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -k, the k-th argument had an illegal value +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX*16 ONE + PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL NOUNIT, UPPER + INTEGER J + COMPLEX*16 AJJ +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA, ZSCAL, ZTRMV +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + NOUNIT = LSAME( DIAG, 'N' ) + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -5 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZTRTI2', -INFO ) + RETURN + END IF +* + IF( UPPER ) THEN +* +* Compute inverse of upper triangular matrix. +* + DO 10 J = 1, N + IF( NOUNIT ) THEN + A( J, J ) = ONE / A( J, J ) + AJJ = -A( J, J ) + ELSE + AJJ = -ONE + END IF +* +* Compute elements 1:j-1 of j-th column. +* + CALL ZTRMV( 'Upper', 'No transpose', DIAG, J-1, A, LDA, + $ A( 1, J ), 1 ) + CALL ZSCAL( J-1, AJJ, A( 1, J ), 1 ) + 10 CONTINUE + ELSE +* +* Compute inverse of lower triangular matrix. +* + DO 20 J = N, 1, -1 + IF( NOUNIT ) THEN + A( J, J ) = ONE / A( J, J ) + AJJ = -A( J, J ) + ELSE + AJJ = -ONE + END IF + IF( J.LT.N ) THEN +* +* Compute elements j+1:n of j-th column. +* + CALL ZTRMV( 'Lower', 'No transpose', DIAG, N-J, + $ A( J+1, J+1 ), LDA, A( J+1, J ), 1 ) + CALL ZSCAL( N-J, AJJ, A( J+1, J ), 1 ) + END IF + 20 CONTINUE + END IF +* + RETURN +* +* End of ZTRTI2 +* + END diff --git a/reference/ztrtrif.f b/reference/ztrtrif.f new file mode 100644 index 0000000000..f68caf4b6e --- /dev/null +++ b/reference/ztrtrif.f @@ -0,0 +1,177 @@ + SUBROUTINE ZTRTRIF( UPLO, DIAG, N, A, LDA, INFO ) +* +* -- LAPACK routine (version 3.0) -- +* Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd., +* Courant Institute, Argonne National Lab, and Rice University +* September 30, 1994 +* +* .. Scalar Arguments .. + CHARACTER DIAG, UPLO + INTEGER INFO, LDA, N +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ) +* .. +* +* Purpose +* ======= +* +* ZTRTRI computes the inverse of a complex upper or lower triangular +* matrix A. +* +* This is the Level 3 BLAS version of the algorithm. +* +* Arguments +* ========= +* +* UPLO (input) CHARACTER*1 +* = 'U': A is upper triangular; +* = 'L': A is lower triangular. +* +* DIAG (input) CHARACTER*1 +* = 'N': A is non-unit triangular; +* = 'U': A is unit triangular. +* +* N (input) INTEGER +* The order of the matrix A. N >= 0. +* +* A (input/output) COMPLEX*16 array, dimension (LDA,N) +* On entry, the triangular matrix A. If UPLO = 'U', the +* leading N-by-N upper triangular part of the array A contains +* the upper triangular matrix, and the strictly lower +* triangular part of A is not referenced. If UPLO = 'L', the +* leading N-by-N lower triangular part of the array A contains +* the lower triangular matrix, and the strictly upper +* triangular part of A is not referenced. If DIAG = 'U', the +* diagonal elements of A are also not referenced and are +* assumed to be 1. +* On exit, the (triangular) inverse of the original matrix, in +* the same storage format. +* +* LDA (input) INTEGER +* The leading dimension of the array A. LDA >= max(1,N). 
+* +* INFO (output) INTEGER +* = 0: successful exit +* < 0: if INFO = -i, the i-th argument had an illegal value +* > 0: if INFO = i, A(i,i) is exactly zero. The triangular +* matrix is singular and its inverse can not be computed. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX*16 ONE, ZERO + PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ), + $ ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL NOUNIT, UPPER + INTEGER J, JB, NB, NN +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL XERBLA, ZTRMM, ZTRSM, ZTRTI2 +* .. +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters. +* + INFO = 0 + UPPER = LSAME( UPLO, 'U' ) + NOUNIT = LSAME( DIAG, 'N' ) + IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN + INFO = -1 + ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN + INFO = -2 + ELSE IF( N.LT.0 ) THEN + INFO = -3 + ELSE IF( LDA.LT.MAX( 1, N ) ) THEN + INFO = -5 + END IF + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZTRTRI', -INFO ) + RETURN + END IF +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* +* Check for singularity if non-unit. +* + IF( NOUNIT ) THEN + DO 10 INFO = 1, N + IF( A( INFO, INFO ).EQ.ZERO ) + $ RETURN + 10 CONTINUE + INFO = 0 + END IF +* +* Determine the block size for this environment. +* + NB = 128 + IF( NB.LE.1 .OR. NB.GE.N ) THEN +* +* Use unblocked code +* + CALL ZTRTI2( UPLO, DIAG, N, A, LDA, INFO ) + ELSE +* +* Use blocked code +* + IF( UPPER ) THEN +* +* Compute inverse of upper triangular matrix +* + DO 20 J = 1, N, NB + JB = MIN( NB, N-J+1 ) +* +* Compute rows 1:j-1 of current block column +* + CALL ZTRMM( 'Left', 'Upper', 'No transpose', DIAG, J-1, + $ JB, ONE, A, LDA, A( 1, J ), LDA ) + CALL ZTRSM( 'Right', 'Upper', 'No transpose', DIAG, J-1, + $ JB, -ONE, A( J, J ), LDA, A( 1, J ), LDA ) +* +* Compute inverse of current diagonal block +* + CALL ZTRTI2( 'Upper', DIAG, JB, A( J, J ), LDA, INFO ) + 20 CONTINUE + ELSE +* +* Compute inverse of lower triangular matrix +* + NN = ( ( N-1 ) / NB )*NB + 1 + DO 30 J = NN, 1, -NB + JB = MIN( NB, N-J+1 ) + IF( J+JB.LE.N ) THEN +* +* Compute rows j+jb:n of current block column +* + CALL ZTRMM( 'Left', 'Lower', 'No transpose', DIAG, + $ N-J-JB+1, JB, ONE, A( J+JB, J+JB ), LDA, + $ A( J+JB, J ), LDA ) + CALL ZTRSM( 'Right', 'Lower', 'No transpose', DIAG, + $ N-J-JB+1, JB, -ONE, A( J, J ), LDA, + $ A( J+JB, J ), LDA ) + END IF +* +* Compute inverse of current diagonal block +* + CALL ZTRTI2( 'Lower', DIAG, JB, A( J, J ), LDA, INFO ) + 30 CONTINUE + END IF + END IF +* + RETURN +* +* End of ZTRTRI +* + END diff --git a/symcopy.h b/symcopy.h new file mode 100644 index 0000000000..ed6e5b4178 --- /dev/null +++ b/symcopy.h @@ -0,0 +1,1873 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +/* This implementation is completely wrong. I'll rewrite this */ + +#ifndef SYMCOPY_H +#define SYMCOPY_H + +#if !defined(XDOUBLE) || !defined(QUAD_PRECISION) + +static inline void SYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG is, js; + + FLOAT *aa1, *aa2; + FLOAT *b1, *b2; + FLOAT *bb1, *bb2; + FLOAT *cc1, *cc2; + FLOAT a11, a12; + FLOAT a21, a22; + + b1 = b; + b2 = b; + + for (js = 0; js < m; js += 2){ + + aa1 = a + 0 * lda; + aa2 = a + 1 * lda; + a += 2 * lda + 2; + + bb1 = b1 + 0 * m; + bb2 = b1 + 1 * m; + b1 += 2 * m + 2; + + cc1 = b2 + 0 * m; + cc2 = b2 + 1 * m; + b2 += 2 * m + 2; + + if (m - js >= 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + + a22 = *(aa2 + 1); + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb2 + 0) = a21; + *(bb2 + 1) = a22; + aa1 += 2; + aa2 += 2; + bb1 += 2; + bb2 += 2; + + cc1 += 2 * m; + cc2 += 2 * m; + + is = ((m - js - 2) >> 1); + + while (is > 0){ + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + + aa1 += 2; + aa2 += 2; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a12; + *(cc2 + 0) = a21; + *(cc2 + 1) = a22; + + bb1 += 2; + bb2 += 2; + + cc1 += 2 * m; + cc2 += 2 * m; + + is --; + } + + is = ((m - js - 2) & 1); + + if (is == 1){ + a11 = *(aa1 + 0); + a12 = *(aa2 + 0); + + *(bb1 + 0) = a11; + *(bb2 + 0) = a12; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a12; + } + } + + if (m - js == 1){ + a11 = *(aa1 + 0); + *(bb1 + 0) = a11; + } + + } +} + +static inline void SYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG is, js; + + FLOAT *aa1, *aa2; + FLOAT *b1, *b2; + FLOAT *bb1, *bb2; + FLOAT *cc1, *cc2; + FLOAT a11, a12; + FLOAT a21, a22; + + b1 = b; + b2 = b; + + for (js = 0; js < m; js += 2){ + + aa1 = a + 0 * lda; + aa2 = a + 1 * lda; + a += 2 * lda; + + bb1 = b1 + 0 * m; + bb2 = b1 + 1 * m; + b1 += 2 * m; + + cc1 = b2 + 0 * m; + cc2 = b2 + 1 * m; + b2 += 2; + + if (m - js >= 2){ + + for (is = 0; is < js; is += 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + + aa1 += 2; + aa2 += 
2; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a12; + *(cc2 + 0) = a21; + *(cc2 + 1) = a22; + + bb1 += 2; + bb2 += 2; + + cc1 += 2 * m; + cc2 += 2 * m; + } + + a11 = *(aa1 + 0); + + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + + *(bb1 + 0) = a11; + *(bb1 + 1) = a12; + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + } + + if (m - js == 1){ + for (is = 0; is < js; is += 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + aa1 += 2; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(cc1 + 0) = a11; + *(cc2 + 0) = a21; + bb1 += 2; + + cc1 += 2 * m; + cc2 += 2 * m; + } + + a11 = *(aa1 + 0); + *(bb1 + 0) = a11; + } + } +} + + +static inline void ZSYMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG is, js; + + FLOAT *aa1, *aa2; + FLOAT *b1, *b2; + FLOAT *bb1, *bb2; + FLOAT *cc1, *cc2; + FLOAT a11, a21, a31, a41; + FLOAT a12, a22, a32, a42; + + b1 = b; + b2 = b; + + lda *= 2; + + for (js = 0; js < m; js += 2){ + + aa1 = a + 0 * lda; + aa2 = a + 1 * lda; + a += 2 * lda + 4; + + bb1 = b1 + 0 * m; + bb2 = b1 + 2 * m; + b1 += 4 * m + 4; + + cc1 = b2 + 0 * m; + cc2 = b2 + 2 * m; + b2 += 4 * m + 4; + + if (m - js >= 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + + a12 = *(aa2 + 2); + a22 = *(aa2 + 3); + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb1 + 2) = a31; + *(bb1 + 3) = a41; + + *(bb2 + 0) = a31; + *(bb2 + 1) = a41; + *(bb2 + 2) = a12; + *(bb2 + 3) = a22; + + aa1 += 4; + aa2 += 4; + bb1 += 4; + bb2 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + + is = ((m - js - 2) >> 1); + + while (is > 0){ + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + a32 = *(aa2 + 2); + a42 = *(aa2 + 3); + + aa1 += 4; + aa2 += 4; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb1 + 2) = a31; + *(bb1 + 3) = a41; + + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + *(bb2 + 2) = a32; + *(bb2 + 3) = a42; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a21; + *(cc1 + 2) = a12; + *(cc1 + 3) = a22; + + *(cc2 + 0) = a31; + *(cc2 + 1) = a41; + *(cc2 + 2) = a32; + *(cc2 + 3) = a42; + + bb1 += 4; + bb2 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + + is --; + } + + if (m & 1){ + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a21; + *(cc1 + 2) = a12; + *(cc1 + 3) = a22; + } + } + + if (m - js == 1){ + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + } + + } +} + +static inline void ZSYMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG is, js; + + FLOAT *aa1, *aa2; + FLOAT *b1, *b2; + FLOAT *bb1, *bb2; + FLOAT *cc1, *cc2; + FLOAT a11, a21, a31, a41; + FLOAT a12, a22, a32, a42; + + b1 = b; + b2 = b; + + lda *= 2; + + for (js = 0; js < m; js += 2){ + + aa1 = a + 0 * lda; + aa2 = a + 1 * lda; + a += 2 * lda; + + bb1 = b1 + 0 * m; + bb2 = b1 + 2 * m; + b1 += 4 * m; + + cc1 = b2 + 0 * m; + cc2 = b2 + 2 * m; + b2 += 4; + + if (m - js >= 2){ + + for (is = 0; is < js; is += 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + a32 = *(aa2 + 2); + a42 = *(aa2 + 3); + + aa1 += 4; + aa2 += 4; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb1 + 2) = a31; + *(bb1 + 3) = a41; + + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + *(bb2 + 2) = a32; + *(bb2 + 3) = a42; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a21; + *(cc1 + 2) = a12; + *(cc1 + 3) = 
a22; + + *(cc2 + 0) = a31; + *(cc2 + 1) = a41; + *(cc2 + 2) = a32; + *(cc2 + 3) = a42; + + bb1 += 4; + bb2 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + } + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + a32 = *(aa2 + 2); + a42 = *(aa2 + 3); + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb1 + 2) = a12; + *(bb1 + 3) = a22; + + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + *(bb2 + 2) = a32; + *(bb2 + 3) = a42; + } + + if (m - js == 1){ + for (is = 0; is < js; is += 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + aa1 += 4; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb1 + 2) = a31; + *(bb1 + 3) = a41; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a21; + *(cc2 + 0) = a31; + *(cc2 + 1) = a41; + bb1 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + } + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + } + } +} + +static inline void ZHEMCOPY_L(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG is, js; + + FLOAT *aa1, *aa2; + FLOAT *b1, *b2; + FLOAT *bb1, *bb2; + FLOAT *cc1, *cc2; + FLOAT a11, a21, a31, a41; + FLOAT a12, a22, a32, a42; + + b1 = b; + b2 = b; + + lda *= 2; + + for (js = 0; js < m; js += 2){ + + aa1 = a + 0 * lda; + aa2 = a + 1 * lda; + a += 2 * lda + 4; + + bb1 = b1 + 0 * m; + bb2 = b1 + 2 * m; + b1 += 4 * m + 4; + + cc1 = b2 + 0 * m; + cc2 = b2 + 2 * m; + b2 += 4 * m + 4; + + if (m - js >= 2){ + + a11 = *(aa1 + 0); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + + a12 = *(aa2 + 2); + + *(bb1 + 0) = a11; + *(bb1 + 1) = 0.; + *(bb1 + 2) = a31; + *(bb1 + 3) = a41; + + *(bb2 + 0) = a31; + *(bb2 + 1) = -a41; + *(bb2 + 2) = a12; + *(bb2 + 3) = 0.; + + aa1 += 4; + aa2 += 4; + bb1 += 4; + bb2 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + + is = ((m - js - 2) >> 1); + + while (is > 0){ + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + a32 = *(aa2 + 2); + a42 = *(aa2 + 3); + + aa1 += 4; + aa2 += 4; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb1 + 2) = a31; + *(bb1 + 3) = a41; + + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + *(bb2 + 2) = a32; + *(bb2 + 3) = a42; + + *(cc1 + 0) = a11; + *(cc1 + 1) = -a21; + *(cc1 + 2) = a12; + *(cc1 + 3) = -a22; + + *(cc2 + 0) = a31; + *(cc2 + 1) = -a41; + *(cc2 + 2) = a32; + *(cc2 + 3) = -a42; + + bb1 += 4; + bb2 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + + is --; + } + + if (m & 1){ + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + + *(cc1 + 0) = a11; + *(cc1 + 1) = -a21; + *(cc1 + 2) = a12; + *(cc1 + 3) = -a22; + } + } + + if (m - js == 1){ + a11 = *(aa1 + 0); + *(bb1 + 0) = a11; + *(bb1 + 1) = 0.; + } + + } +} + +static inline void ZHEMCOPY_U(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG is, js; + + FLOAT *aa1, *aa2; + FLOAT *b1, *b2; + FLOAT *bb1, *bb2; + FLOAT *cc1, *cc2; + FLOAT a11, a21, a31, a41; + FLOAT a12, a22, a32, a42; + + b1 = b; + b2 = b; + + lda *= 2; + + for (js = 0; js < m; js += 2){ + + aa1 = a + 0 * lda; + aa2 = a + 1 * lda; + a += 2 * lda; + + bb1 = b1 + 0 * m; + bb2 = b1 + 2 * m; + b1 += 4 * m; + + cc1 = b2 + 0 * m; + cc2 = b2 + 2 * m; + b2 += 4; + + if (m - js >= 2){ + + for (is = 0; is < js; is += 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + a32 = *(aa2 + 2); + a42 = *(aa2 + 3); + + aa1 += 4; + aa2 += 4; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb1 + 2) = a31; + *(bb1 + 3) = 
a41; + + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + *(bb2 + 2) = a32; + *(bb2 + 3) = a42; + + *(cc1 + 0) = a11; + *(cc1 + 1) = -a21; + *(cc1 + 2) = a12; + *(cc1 + 3) = -a22; + + *(cc2 + 0) = a31; + *(cc2 + 1) = -a41; + *(cc2 + 2) = a32; + *(cc2 + 3) = -a42; + + bb1 += 4; + bb2 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + } + + a11 = *(aa1 + 0); + + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + a32 = *(aa2 + 2); + + *(bb1 + 0) = a11; + *(bb1 + 1) = 0.; + *(bb1 + 2) = a12; + *(bb1 + 3) = -a22; + + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + *(bb2 + 2) = a32; + *(bb2 + 3) = 0.; + } + + if (m - js == 1){ + for (is = 0; is < js; is += 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + aa1 += 4; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb1 + 2) = a31; + *(bb1 + 3) = a41; + + *(cc1 + 0) = a11; + *(cc1 + 1) = -a21; + *(cc2 + 0) = a31; + *(cc2 + 1) = -a41; + bb1 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + } + + a11 = *(aa1 + 0); + *(bb1 + 0) = a11; + *(bb1 + 1) = 0.; + } + } +} + + +static inline void ZHEMCOPY_M(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG is, js; + + FLOAT *aa1, *aa2; + FLOAT *b1, *b2; + FLOAT *bb1, *bb2; + FLOAT *cc1, *cc2; + FLOAT a11, a21, a31, a41; + FLOAT a12, a22, a32, a42; + + b1 = b; + b2 = b; + + lda *= 2; + + for (js = 0; js < m; js += 2){ + + aa1 = a + 0 * lda; + aa2 = a + 1 * lda; + a += 2 * lda + 4; + + bb1 = b1 + 0 * m; + bb2 = b1 + 2 * m; + b1 += 4 * m + 4; + + cc1 = b2 + 0 * m; + cc2 = b2 + 2 * m; + b2 += 4 * m + 4; + + if (m - js >= 2){ + + a11 = *(aa1 + 0); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + + a12 = *(aa2 + 2); + + *(bb1 + 0) = a11; + *(bb1 + 1) = 0.; + *(bb1 + 2) = a31; + *(bb1 + 3) = -a41; + + *(bb2 + 0) = a31; + *(bb2 + 1) = a41; + *(bb2 + 2) = a12; + *(bb2 + 3) = 0.; + + aa1 += 4; + aa2 += 4; + bb1 += 4; + bb2 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + + is = ((m - js - 2) >> 1); + + while (is > 0){ + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + a32 = *(aa2 + 2); + a42 = *(aa2 + 3); + + aa1 += 4; + aa2 += 4; + + *(bb1 + 0) = a11; + *(bb1 + 1) = -a21; + *(bb1 + 2) = a31; + *(bb1 + 3) = -a41; + + *(bb2 + 0) = a12; + *(bb2 + 1) = -a22; + *(bb2 + 2) = a32; + *(bb2 + 3) = -a42; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a21; + *(cc1 + 2) = a12; + *(cc1 + 3) = a22; + + *(cc2 + 0) = a31; + *(cc2 + 1) = a41; + *(cc2 + 2) = a32; + *(cc2 + 3) = a42; + + bb1 += 4; + bb2 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + + is --; + } + + if (m & 1){ + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + + *(bb1 + 0) = a11; + *(bb1 + 1) = -a21; + *(bb2 + 0) = a12; + *(bb2 + 1) = -a22; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a21; + *(cc1 + 2) = a12; + *(cc1 + 3) = a22; + } + } + + if (m - js == 1){ + a11 = *(aa1 + 0); + *(bb1 + 0) = a11; + *(bb1 + 1) = 0.; + } + + } +} + +static inline void ZHEMCOPY_V(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG is, js; + + FLOAT *aa1, *aa2; + FLOAT *b1, *b2; + FLOAT *bb1, *bb2; + FLOAT *cc1, *cc2; + FLOAT a11, a21, a31, a41; + FLOAT a12, a22, a32, a42; + + b1 = b; + b2 = b; + + lda *= 2; + + for (js = 0; js < m; js += 2){ + + aa1 = a + 0 * lda; + aa2 = a + 1 * lda; + a += 2 * lda; + + bb1 = b1 + 0 * m; + bb2 = b1 + 2 * m; + b1 += 4 * m; + + cc1 = b2 + 0 * m; + cc2 = b2 + 2 * m; + b2 += 4; + + if (m - js >= 2){ + + for (is = 0; is < js; is += 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + a32 = *(aa2 + 2); + a42 = 
*(aa2 + 3); + + aa1 += 4; + aa2 += 4; + + *(bb1 + 0) = a11; + *(bb1 + 1) = -a21; + *(bb1 + 2) = a31; + *(bb1 + 3) = -a41; + + *(bb2 + 0) = a12; + *(bb2 + 1) = -a22; + *(bb2 + 2) = a32; + *(bb2 + 3) = -a42; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a21; + *(cc1 + 2) = a12; + *(cc1 + 3) = a22; + + *(cc2 + 0) = a31; + *(cc2 + 1) = a41; + *(cc2 + 2) = a32; + *(cc2 + 3) = a42; + + bb1 += 4; + bb2 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + } + + a11 = *(aa1 + 0); + + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + a32 = *(aa2 + 2); + + *(bb1 + 0) = a11; + *(bb1 + 1) = 0.; + *(bb1 + 2) = a12; + *(bb1 + 3) = a22; + + *(bb2 + 0) = a12; + *(bb2 + 1) = -a22; + *(bb2 + 2) = a32; + *(bb2 + 3) = 0.; + } + + if (m - js == 1){ + for (is = 0; is < js; is += 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + aa1 += 4; + + *(bb1 + 0) = a11; + *(bb1 + 1) = -a21; + *(bb1 + 2) = a31; + *(bb1 + 3) = -a41; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a21; + *(cc2 + 0) = a31; + *(cc2 + 1) = a41; + bb1 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + } + + a11 = *(aa1 + 0); + *(bb1 + 0) = a11; + *(bb1 + 1) = 0.; + } + } +} + + +static inline void TRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG is, js; + + FLOAT *aa1, *aa2; + FLOAT *b1, *b2; + FLOAT *bb1, *bb2; + FLOAT *cc1, *cc2; + FLOAT a11, a12; + FLOAT a21, a22; + + b1 = b; + b2 = b; + + for (js = 0; js < m; js += 2){ + + aa1 = a + 0 * lda; + aa2 = a + 1 * lda; + a += 2 * lda + 2; + + bb1 = b1 + 0 * m; + bb2 = b1 + 1 * m; + b1 += 2 * m + 2; + + cc1 = b2 + 0 * m; + cc2 = b2 + 1 * m; + b2 += 2 * m + 2; + + if (m - js >= 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + + a22 = *(aa2 + 1); + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb2 + 0) = a21; + *(bb2 + 1) = a22; + aa1 += 2; + aa2 += 2; + bb1 += 2; + bb2 += 2; + + cc1 += 2 * m; + cc2 += 2 * m; + + is = ((m - js - 2) >> 1); + + while (is > 0){ + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + + aa1 += 2; + aa2 += 2; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a12; + *(cc2 + 0) = a21; + *(cc2 + 1) = a22; + + bb1 += 2; + bb2 += 2; + + cc1 += 2 * m; + cc2 += 2 * m; + + is --; + } + + is = ((m - js - 2) & 1); + + if (is == 1){ + a11 = *(aa1 + 0); + a12 = *(aa2 + 0); + + *(bb1 + 0) = a11; + *(bb2 + 0) = a12; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a12; + } + } + + if (m - js == 1){ + a11 = *(aa1 + 0); + *(bb1 + 0) = a11; + } + + } +} + +static inline void TRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG is, js; + + FLOAT *aa1, *aa2; + FLOAT *b1, *b2; + FLOAT *bb1, *bb2; + FLOAT *cc1, *cc2; + FLOAT a11, a12; + FLOAT a21, a22; + + b1 = b; + b2 = b; + + for (js = 0; js < m; js += 2){ + + aa1 = a + 0 * lda; + aa2 = a + 1 * lda; + a += 2 * lda + 2; + + bb1 = b1 + 0 * m; + bb2 = b1 + 1 * m; + b1 += 2 * m + 2; + + cc1 = b2 + 0 * m; + cc2 = b2 + 1 * m; + b2 += 2 * m + 2; + + if (m - js >= 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + + a22 = *(aa2 + 1); + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb2 + 0) = a21; + *(bb2 + 1) = a22; + aa1 += 2; + aa2 += 2; + bb1 += 2; + bb2 += 2; + + cc1 += 2 * m; + cc2 += 2 * m; + + is = ((m - js - 2) >> 1); + + while (is > 0){ + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + + aa1 += 2; + aa2 += 2; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a12; + *(cc2 + 0) = a21; + *(cc2 + 1) = a22; + + bb1 += 2; + bb2 += 2; + + cc1 += 2 * m; 
+ cc2 += 2 * m; + + is --; + } + + is = ((m - js - 2) & 1); + + if (is == 1){ + a11 = *(aa1 + 0); + a12 = *(aa2 + 0); + + *(bb1 + 0) = a11; + *(bb2 + 0) = a12; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a12; + } + } + + if (m - js == 1){ + a11 = *(aa1 + 0); + *(bb1 + 0) = a11; + } + + } +} + +static inline void TRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG is, js; + + FLOAT *aa1, *aa2; + FLOAT *b1, *b2; + FLOAT *bb1, *bb2; + FLOAT *cc1, *cc2; + FLOAT a11, a12; + FLOAT a21, a22; + + b1 = b; + b2 = b; + + for (js = 0; js < m; js += 2){ + + aa1 = a + 0 * lda; + aa2 = a + 1 * lda; + a += 2 * lda; + + bb1 = b1 + 0 * m; + bb2 = b1 + 1 * m; + b1 += 2 * m; + + cc1 = b2 + 0 * m; + cc2 = b2 + 1 * m; + b2 += 2; + + if (m - js >= 2){ + + for (is = 0; is < js; is += 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + + aa1 += 2; + aa2 += 2; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a12; + *(cc2 + 0) = a21; + *(cc2 + 1) = a22; + + bb1 += 2; + bb2 += 2; + + cc1 += 2 * m; + cc2 += 2 * m; + } + + a11 = *(aa1 + 0); + + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + + *(bb1 + 0) = a11; + *(bb1 + 1) = a12; + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + } + + if (m - js == 1){ + for (is = 0; is < js; is += 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + aa1 += 2; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(cc1 + 0) = a11; + *(cc2 + 0) = a21; + bb1 += 2; + + cc1 += 2 * m; + cc2 += 2 * m; + } + + a11 = *(aa1 + 0); + *(bb1 + 0) = a11; + } + } +} + +static inline void TRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG is, js; + + FLOAT *aa1, *aa2; + FLOAT *b1, *b2; + FLOAT *bb1, *bb2; + FLOAT *cc1, *cc2; + FLOAT a11, a12; + FLOAT a21, a22; + + b1 = b; + b2 = b; + + for (js = 0; js < m; js += 2){ + + aa1 = a + 0 * lda; + aa2 = a + 1 * lda; + a += 2 * lda; + + bb1 = b1 + 0 * m; + bb2 = b1 + 1 * m; + b1 += 2 * m; + + cc1 = b2 + 0 * m; + cc2 = b2 + 1 * m; + b2 += 2; + + if (m - js >= 2){ + + for (is = 0; is < js; is += 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + + aa1 += 2; + aa2 += 2; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a12; + *(cc2 + 0) = a21; + *(cc2 + 1) = a22; + + bb1 += 2; + bb2 += 2; + + cc1 += 2 * m; + cc2 += 2 * m; + } + + a11 = *(aa1 + 0); + + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + + *(bb1 + 0) = a11; + *(bb1 + 1) = a12; + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + } + + if (m - js == 1){ + for (is = 0; is < js; is += 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + aa1 += 2; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(cc1 + 0) = a11; + *(cc2 + 0) = a21; + bb1 += 2; + + cc1 += 2 * m; + cc2 += 2 * m; + } + + a11 = *(aa1 + 0); + *(bb1 + 0) = a11; + } + } +} + +static inline void ZTRMCOPY_NL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG is, js; + + FLOAT *aa1, *aa2; + FLOAT *b1, *b2; + FLOAT *bb1, *bb2; + FLOAT *cc1, *cc2; + FLOAT a11, a21, a31, a41; + FLOAT a12, a22, a32, a42; + + b1 = b; + b2 = b; + + lda *= 2; + + for (js = 0; js < m; js += 2){ + + aa1 = a + 0 * lda; + aa2 = a + 1 * lda; + a += 2 * lda + 4; + + bb1 = b1 + 0 * m; + bb2 = b1 + 2 * m; + b1 += 4 * m + 4; + + cc1 = b2 + 0 * m; + cc2 = b2 + 2 * m; + b2 += 4 * m + 4; + + if (m - js >= 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + + a12 = *(aa2 + 2); + a22 = *(aa2 + 3); + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb1 + 2) = a31; + 
*(bb1 + 3) = a41; + + *(bb2 + 0) = a31; + *(bb2 + 1) = a41; + *(bb2 + 2) = a12; + *(bb2 + 3) = a22; + + aa1 += 4; + aa2 += 4; + bb1 += 4; + bb2 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + + is = ((m - js - 2) >> 1); + + while (is > 0){ + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + a32 = *(aa2 + 2); + a42 = *(aa2 + 3); + + aa1 += 4; + aa2 += 4; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb1 + 2) = a31; + *(bb1 + 3) = a41; + + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + *(bb2 + 2) = a32; + *(bb2 + 3) = a42; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a21; + *(cc1 + 2) = a12; + *(cc1 + 3) = a22; + + *(cc2 + 0) = a31; + *(cc2 + 1) = a41; + *(cc2 + 2) = a32; + *(cc2 + 3) = a42; + + bb1 += 4; + bb2 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + + is --; + } + + if (m & 1){ + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a21; + *(cc1 + 2) = a12; + *(cc1 + 3) = a22; + } + } + + if (m - js == 1){ + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + } + + } +} + +static inline void ZTRMCOPY_TL(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG is, js; + + FLOAT *aa1, *aa2; + FLOAT *b1, *b2; + FLOAT *bb1, *bb2; + FLOAT *cc1, *cc2; + FLOAT a11, a21, a31, a41; + FLOAT a12, a22, a32, a42; + + b1 = b; + b2 = b; + + lda *= 2; + + for (js = 0; js < m; js += 2){ + + aa1 = a + 0 * lda; + aa2 = a + 1 * lda; + a += 2 * lda + 4; + + bb1 = b1 + 0 * m; + bb2 = b1 + 2 * m; + b1 += 4 * m + 4; + + cc1 = b2 + 0 * m; + cc2 = b2 + 2 * m; + b2 += 4 * m + 4; + + if (m - js >= 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + + a12 = *(aa2 + 2); + a22 = *(aa2 + 3); + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb1 + 2) = a31; + *(bb1 + 3) = a41; + + *(bb2 + 0) = a31; + *(bb2 + 1) = a41; + *(bb2 + 2) = a12; + *(bb2 + 3) = a22; + + aa1 += 4; + aa2 += 4; + bb1 += 4; + bb2 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + + is = ((m - js - 2) >> 1); + + while (is > 0){ + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + a32 = *(aa2 + 2); + a42 = *(aa2 + 3); + + aa1 += 4; + aa2 += 4; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb1 + 2) = a31; + *(bb1 + 3) = a41; + + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + *(bb2 + 2) = a32; + *(bb2 + 3) = a42; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a21; + *(cc1 + 2) = a12; + *(cc1 + 3) = a22; + + *(cc2 + 0) = a31; + *(cc2 + 1) = a41; + *(cc2 + 2) = a32; + *(cc2 + 3) = a42; + + bb1 += 4; + bb2 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + + is --; + } + + if (m & 1){ + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a21; + *(cc1 + 2) = a12; + *(cc1 + 3) = a22; + } + } + + if (m - js == 1){ + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + } + + } +} + +static inline void ZTRMCOPY_NU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG is, js; + + FLOAT *aa1, *aa2; + FLOAT *b1, *b2; + FLOAT *bb1, *bb2; + FLOAT *cc1, *cc2; + FLOAT a11, a21, a31, a41; + FLOAT a12, a22, a32, a42; + + b1 = b; + b2 = b; + + lda *= 2; + + for (js = 0; js < m; js += 2){ + + aa1 = a + 0 * lda; + aa2 = a + 1 * lda; + a += 2 * lda; + + bb1 = b1 + 0 * m; + bb2 = b1 + 2 * m; + b1 += 4 * m; + + cc1 = 
b2 + 0 * m; + cc2 = b2 + 2 * m; + b2 += 4; + + if (m - js >= 2){ + + for (is = 0; is < js; is += 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + a32 = *(aa2 + 2); + a42 = *(aa2 + 3); + + aa1 += 4; + aa2 += 4; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb1 + 2) = a31; + *(bb1 + 3) = a41; + + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + *(bb2 + 2) = a32; + *(bb2 + 3) = a42; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a21; + *(cc1 + 2) = a12; + *(cc1 + 3) = a22; + + *(cc2 + 0) = a31; + *(cc2 + 1) = a41; + *(cc2 + 2) = a32; + *(cc2 + 3) = a42; + + bb1 += 4; + bb2 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + } + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + a32 = *(aa2 + 2); + a42 = *(aa2 + 3); + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb1 + 2) = a12; + *(bb1 + 3) = a22; + + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + *(bb2 + 2) = a32; + *(bb2 + 3) = a42; + } + + if (m - js == 1){ + for (is = 0; is < js; is += 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + aa1 += 4; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb1 + 2) = a31; + *(bb1 + 3) = a41; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a21; + *(cc2 + 0) = a31; + *(cc2 + 1) = a41; + bb1 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + } + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + } + } +} + +static inline void ZTRMCOPY_TU(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b){ + BLASLONG is, js; + + FLOAT *aa1, *aa2; + FLOAT *b1, *b2; + FLOAT *bb1, *bb2; + FLOAT *cc1, *cc2; + FLOAT a11, a21, a31, a41; + FLOAT a12, a22, a32, a42; + + b1 = b; + b2 = b; + + lda *= 2; + + for (js = 0; js < m; js += 2){ + + aa1 = a + 0 * lda; + aa2 = a + 1 * lda; + a += 2 * lda; + + bb1 = b1 + 0 * m; + bb2 = b1 + 2 * m; + b1 += 4 * m; + + cc1 = b2 + 0 * m; + cc2 = b2 + 2 * m; + b2 += 4; + + if (m - js >= 2){ + + for (is = 0; is < js; is += 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + a32 = *(aa2 + 2); + a42 = *(aa2 + 3); + + aa1 += 4; + aa2 += 4; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb1 + 2) = a31; + *(bb1 + 3) = a41; + + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + *(bb2 + 2) = a32; + *(bb2 + 3) = a42; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a21; + *(cc1 + 2) = a12; + *(cc1 + 3) = a22; + + *(cc2 + 0) = a31; + *(cc2 + 1) = a41; + *(cc2 + 2) = a32; + *(cc2 + 3) = a42; + + bb1 += 4; + bb2 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + } + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + + a12 = *(aa2 + 0); + a22 = *(aa2 + 1); + a32 = *(aa2 + 2); + a42 = *(aa2 + 3); + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb1 + 2) = a12; + *(bb1 + 3) = a22; + + *(bb2 + 0) = a12; + *(bb2 + 1) = a22; + *(bb2 + 2) = a32; + *(bb2 + 3) = a42; + } + + if (m - js == 1){ + for (is = 0; is < js; is += 2){ + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + a31 = *(aa1 + 2); + a41 = *(aa1 + 3); + aa1 += 4; + + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + *(bb1 + 2) = a31; + *(bb1 + 3) = a41; + + *(cc1 + 0) = a11; + *(cc1 + 1) = a21; + *(cc2 + 0) = a31; + *(cc2 + 1) = a41; + bb1 += 4; + + cc1 += 4 * m; + cc2 += 4 * m; + } + + a11 = *(aa1 + 0); + a21 = *(aa1 + 1); + *(bb1 + 0) = a11; + *(bb1 + 1) = a21; + } + } +} + +#endif +#endif + diff --git a/test/LICENSE b/test/LICENSE new file mode 100644 index 0000000000..85061f29fe --- /dev/null +++ b/test/LICENSE @@ -0,0 +1,23 @@ +This directory contains the reference implementation of BLAS +which is obtainable at: 
http://netlib.org/blas/ + +The license, obtained from http://netlib.org/blas/faq.html#2 on November 3, +2010, is as follows: + +2) Are there legal restrictions on the use of BLAS reference implementation +software? + +The reference BLAS is a freely-available software package. It is available from +netlib via anonymous ftp and the World Wide Web. Thus, it can be included in +commercial software packages (and has been). We only ask that proper credit be +given to the authors. + +Like all software, it is copyrighted. It is not trademarked, but we do ask the +following: + +If you modify the source for these routines we ask that you change the name of +the routine and comment the changes made to the original. + +We will gladly answer any questions regarding the software. If a modification +is done, however, it is the responsibility of the person who modified the +routine to provide support. diff --git a/test/Makefile b/test/Makefile new file mode 100644 index 0000000000..4f6ca91d18 --- /dev/null +++ b/test/Makefile @@ -0,0 +1,122 @@ +TOPDIR = .. +include ../Makefile.system + +all :: level1 level2 level3 + +level1 : sblat1 dblat1 cblat1 zblat1 + GOTO_NUM_THREADS=1 ./sblat1 + GOTO_NUM_THREADS=1 ./dblat1 + GOTO_NUM_THREADS=1 ./cblat1 + GOTO_NUM_THREADS=1 ./zblat1 +ifdef SMP + GOTO_NUM_THREADS=2 ./sblat1 + GOTO_NUM_THREADS=2 ./dblat1 + GOTO_NUM_THREADS=2 ./cblat1 + GOTO_NUM_THREADS=2 ./zblat1 +endif + +level2 : sblat2 dblat2 cblat2 zblat2 + rm -f ?BLAT2.SUMM + GOTO_NUM_THREADS=1 ./sblat2 < ./sblat2.dat + @$(GREP) -q FATAL SBLAT2.SUMM && cat SBLAT2.SUMM || exit 0 + GOTO_NUM_THREADS=1 ./dblat2 < ./dblat2.dat + @$(GREP) -q FATAL DBLAT2.SUMM && cat DBLAT2.SUMM || exit 0 + GOTO_NUM_THREADS=1 ./cblat2 < ./cblat2.dat + @$(GREP) -q FATAL CBLAT2.SUMM && cat CBLAT2.SUMM || exit 0 + GOTO_NUM_THREADS=1 ./zblat2 < ./zblat2.dat + @$(GREP) -q FATAL ZBLAT2.SUMM && cat ZBLAT2.SUMM || exit 0 +ifdef SMP + rm -f ?BLAT2.SUMM + GOTO_NUM_THREADS=2 ./sblat2 < ./sblat2.dat + @$(GREP) -q FATAL SBLAT2.SUMM && cat SBLAT2.SUMM || exit 0 + GOTO_NUM_THREADS=2 ./dblat2 < ./dblat2.dat + @$(GREP) -q FATAL DBLAT2.SUMM && cat DBLAT2.SUMM || exit 0 + GOTO_NUM_THREADS=2 ./cblat2 < ./cblat2.dat + @$(GREP) -q FATAL CBLAT2.SUMM && cat CBLAT2.SUMM || exit 0 + GOTO_NUM_THREADS=2 ./zblat2 < ./zblat2.dat + @$(GREP) -q FATAL ZBLAT2.SUMM && cat ZBLAT2.SUMM || exit 0 +endif + +level3 : sblat3 dblat3 cblat3 zblat3 + rm -f ?BLAT3.SUMM + GOTO_NUM_THREADS=1 ./sblat3 < ./sblat3.dat + @$(GREP) -q FATAL SBLAT3.SUMM && cat SBLAT3.SUMM || exit 0 + GOTO_NUM_THREADS=1 ./dblat3 < ./dblat3.dat + @$(GREP) -q FATAL DBLAT3.SUMM && cat DBLAT3.SUMM || exit 0 + GOTO_NUM_THREADS=1 ./cblat3 < ./cblat3.dat + @$(GREP) -q FATAL CBLAT3.SUMM && cat CBLAT3.SUMM || exit 0 + GOTO_NUM_THREADS=1 ./zblat3 < ./zblat3.dat + @$(GREP) -q FATAL ZBLAT3.SUMM && cat ZBLAT3.SUMM || exit 0 +ifdef SMP + rm -f ?BLAT3.SUMM + GOTO_NUM_THREADS=2 ./sblat3 < ./sblat3.dat + @$(GREP) -q FATAL SBLAT3.SUMM && cat SBLAT3.SUMM || exit 0 + GOTO_NUM_THREADS=2 ./dblat3 < ./dblat3.dat + @$(GREP) -q FATAL DBLAT3.SUMM && cat DBLAT3.SUMM || exit 0 + GOTO_NUM_THREADS=2 ./cblat3 < ./cblat3.dat + @$(GREP) -q FATAL CBLAT3.SUMM && cat CBLAT3.SUMM || exit 0 + GOTO_NUM_THREADS=2 ./zblat3 < ./zblat3.dat + @$(GREP) -q FATAL ZBLAT3.SUMM && cat ZBLAT3.SUMM || exit 0 +endif + +FLDFLAGS = $(FFLAGS:-fPIC=) +CEXTRALIB = + + +sblat1 : sblat1.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o sblat1 sblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) + +dblat1 : dblat1.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o dblat1 
dblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) + +qblat1 : qblat1.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o qblat1 qblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) + +cblat1 : cblat1.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o cblat1 cblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) + +zblat1 : zblat1.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o zblat1 zblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) + +sblat2 : sblat2.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o sblat2 sblat2.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) + +dblat2 : dblat2.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o dblat2 dblat2.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) + +cblat2 : cblat2.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o cblat2 cblat2.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) + +zblat2 : zblat2.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o zblat2 zblat2.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) + +sblat3 : sblat3.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o sblat3 sblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) + +dblat3 : dblat3.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o dblat3 dblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) + +cblat3 : cblat3.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o cblat3 cblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) + +zblat3 : zblat3.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o zblat3 zblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) + +clean: + @rm -f *.$(SUFFIX) *.$(PSUFFIX) gmon.$(SUFFIX)ut *.SUMM *.cxml *.exe *.pdb *.dwf \ + sblat1 dblat1 cblat1 zblat1 \ + sblat2 dblat2 cblat2 zblat2 \ + sblat3 dblat3 cblat3 zblat3 \ + sblat1p dblat1p cblat1p zblat1p \ + sblat2p dblat2p cblat2p zblat2p \ + sblat3p dblat3p cblat3p zblat3p \ + *.stackdump *.dll + +libs: + +prof: + +quick : + $(MAKE) -C $(TOPDIR) libs + +# include ../Makefile.tail diff --git a/test/cblat1.f b/test/cblat1.f new file mode 100644 index 0000000000..a4c996fda1 --- /dev/null +++ b/test/cblat1.f @@ -0,0 +1,681 @@ + PROGRAM CBLAT1 +* Test program for the COMPLEX Level 1 BLAS. +* Based upon the original BLAS test routine together with: +* F06GAF Example Program Text +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + REAL SFAC + INTEGER IC +* .. External Subroutines .. + EXTERNAL CHECK1, CHECK2, HEADER +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA SFAC/9.765625E-4/ +* .. Executable Statements .. + WRITE (NOUT,99999) + DO 20 IC = 1, 10 + ICASE = IC + CALL HEADER +* +* Initialize PASS, INCX, INCY, and MODE for a new case. +* The value 9999 for INCX, INCY or MODE will appear in the +* detailed output, if any, for cases that do not involve +* these parameters. +* + PASS = .TRUE. + INCX = 9999 + INCY = 9999 + MODE = 9999 + IF (ICASE.LE.5) THEN + CALL CHECK2(SFAC) + ELSE IF (ICASE.GE.6) THEN + CALL CHECK1(SFAC) + END IF +* -- Print + IF (PASS) WRITE (NOUT,99998) + 20 CONTINUE + STOP +* +99999 FORMAT (' Complex BLAS Test Program Results',/1X) +99998 FORMAT (' ----- PASS -----') + END + SUBROUTINE HEADER +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Arrays .. + CHARACTER*6 L(10) +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. 
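+*     (Descriptive note, not in the original test source: L(1:10) holds the
+*     six-character names of the routines under test, printed for the
+*     current ICASE; cases 1-5 are exercised by CHECK2, cases 6-10 by
+*     CHECK1.)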
+ DATA L(1)/'CDOTC '/ + DATA L(2)/'CDOTU '/ + DATA L(3)/'CAXPY '/ + DATA L(4)/'CCOPY '/ + DATA L(5)/'CSWAP '/ + DATA L(6)/'SCNRM2'/ + DATA L(7)/'SCASUM'/ + DATA L(8)/'CSCAL '/ + DATA L(9)/'CSSCAL'/ + DATA L(10)/'ICAMAX'/ +* .. Executable Statements .. + WRITE (NOUT,99999) ICASE, L(ICASE) + RETURN +* +99999 FORMAT (/' Test of subprogram number',I3,12X,A6) + END + SUBROUTINE CHECK1(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + REAL SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + COMPLEX CA + REAL SA + INTEGER I, J, LEN, NP1 +* .. Local Arrays .. + COMPLEX CTRUE5(8,5,2), CTRUE6(8,5,2), CV(8,5,2), CX(8), + + MWPCS(5), MWPCT(5) + REAL STRUE2(5), STRUE4(5) + INTEGER ITRUE3(5) +* .. External Functions .. + REAL SCASUM, SCNRM2 + INTEGER ICAMAX + EXTERNAL SCASUM, SCNRM2, ICAMAX +* .. External Subroutines .. + EXTERNAL CSCAL, CSSCAL, CTEST, ITEST1, STEST1 +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA SA, CA/0.3E0, (0.4E0,-0.7E0)/ + DATA ((CV(I,J,1),I=1,8),J=1,5)/(0.1E0,0.1E0), + + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), + + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), + + (1.0E0,2.0E0), (0.3E0,-0.4E0), (3.0E0,4.0E0), + + (3.0E0,4.0E0), (3.0E0,4.0E0), (3.0E0,4.0E0), + + (3.0E0,4.0E0), (3.0E0,4.0E0), (3.0E0,4.0E0), + + (0.1E0,-0.3E0), (0.5E0,-0.1E0), (5.0E0,6.0E0), + + (5.0E0,6.0E0), (5.0E0,6.0E0), (5.0E0,6.0E0), + + (5.0E0,6.0E0), (5.0E0,6.0E0), (0.1E0,0.1E0), + + (-0.6E0,0.1E0), (0.1E0,-0.3E0), (7.0E0,8.0E0), + + (7.0E0,8.0E0), (7.0E0,8.0E0), (7.0E0,8.0E0), + + (7.0E0,8.0E0), (0.3E0,0.1E0), (0.1E0,0.4E0), + + (0.4E0,0.1E0), (0.1E0,0.2E0), (2.0E0,3.0E0), + + (2.0E0,3.0E0), (2.0E0,3.0E0), (2.0E0,3.0E0)/ + DATA ((CV(I,J,2),I=1,8),J=1,5)/(0.1E0,0.1E0), + + (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0), + + (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0), + + (4.0E0,5.0E0), (0.3E0,-0.4E0), (6.0E0,7.0E0), + + (6.0E0,7.0E0), (6.0E0,7.0E0), (6.0E0,7.0E0), + + (6.0E0,7.0E0), (6.0E0,7.0E0), (6.0E0,7.0E0), + + (0.1E0,-0.3E0), (8.0E0,9.0E0), (0.5E0,-0.1E0), + + (2.0E0,5.0E0), (2.0E0,5.0E0), (2.0E0,5.0E0), + + (2.0E0,5.0E0), (2.0E0,5.0E0), (0.1E0,0.1E0), + + (3.0E0,6.0E0), (-0.6E0,0.1E0), (4.0E0,7.0E0), + + (0.1E0,-0.3E0), (7.0E0,2.0E0), (7.0E0,2.0E0), + + (7.0E0,2.0E0), (0.3E0,0.1E0), (5.0E0,8.0E0), + + (0.1E0,0.4E0), (6.0E0,9.0E0), (0.4E0,0.1E0), + + (8.0E0,3.0E0), (0.1E0,0.2E0), (9.0E0,4.0E0)/ + DATA STRUE2/0.0E0, 0.5E0, 0.6E0, 0.7E0, 0.7E0/ + DATA STRUE4/0.0E0, 0.7E0, 1.0E0, 1.3E0, 1.7E0/ + DATA ((CTRUE5(I,J,1),I=1,8),J=1,5)/(0.1E0,0.1E0), + + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), + + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), + + (1.0E0,2.0E0), (-0.16E0,-0.37E0), (3.0E0,4.0E0), + + (3.0E0,4.0E0), (3.0E0,4.0E0), (3.0E0,4.0E0), + + (3.0E0,4.0E0), (3.0E0,4.0E0), (3.0E0,4.0E0), + + (-0.17E0,-0.19E0), (0.13E0,-0.39E0), + + (5.0E0,6.0E0), (5.0E0,6.0E0), (5.0E0,6.0E0), + + (5.0E0,6.0E0), (5.0E0,6.0E0), (5.0E0,6.0E0), + + (0.11E0,-0.03E0), (-0.17E0,0.46E0), + + (-0.17E0,-0.19E0), (7.0E0,8.0E0), (7.0E0,8.0E0), + + (7.0E0,8.0E0), (7.0E0,8.0E0), (7.0E0,8.0E0), + + (0.19E0,-0.17E0), (0.32E0,0.09E0), + + (0.23E0,-0.24E0), (0.18E0,0.01E0), + + (2.0E0,3.0E0), (2.0E0,3.0E0), (2.0E0,3.0E0), + + (2.0E0,3.0E0)/ + DATA ((CTRUE5(I,J,2),I=1,8),J=1,5)/(0.1E0,0.1E0), + + (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0), + + (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0), + + (4.0E0,5.0E0), (-0.16E0,-0.37E0), (6.0E0,7.0E0), + 
+ (6.0E0,7.0E0), (6.0E0,7.0E0), (6.0E0,7.0E0), + + (6.0E0,7.0E0), (6.0E0,7.0E0), (6.0E0,7.0E0), + + (-0.17E0,-0.19E0), (8.0E0,9.0E0), + + (0.13E0,-0.39E0), (2.0E0,5.0E0), (2.0E0,5.0E0), + + (2.0E0,5.0E0), (2.0E0,5.0E0), (2.0E0,5.0E0), + + (0.11E0,-0.03E0), (3.0E0,6.0E0), + + (-0.17E0,0.46E0), (4.0E0,7.0E0), + + (-0.17E0,-0.19E0), (7.0E0,2.0E0), (7.0E0,2.0E0), + + (7.0E0,2.0E0), (0.19E0,-0.17E0), (5.0E0,8.0E0), + + (0.32E0,0.09E0), (6.0E0,9.0E0), + + (0.23E0,-0.24E0), (8.0E0,3.0E0), + + (0.18E0,0.01E0), (9.0E0,4.0E0)/ + DATA ((CTRUE6(I,J,1),I=1,8),J=1,5)/(0.1E0,0.1E0), + + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), + + (1.0E0,2.0E0), (1.0E0,2.0E0), (1.0E0,2.0E0), + + (1.0E0,2.0E0), (0.09E0,-0.12E0), (3.0E0,4.0E0), + + (3.0E0,4.0E0), (3.0E0,4.0E0), (3.0E0,4.0E0), + + (3.0E0,4.0E0), (3.0E0,4.0E0), (3.0E0,4.0E0), + + (0.03E0,-0.09E0), (0.15E0,-0.03E0), + + (5.0E0,6.0E0), (5.0E0,6.0E0), (5.0E0,6.0E0), + + (5.0E0,6.0E0), (5.0E0,6.0E0), (5.0E0,6.0E0), + + (0.03E0,0.03E0), (-0.18E0,0.03E0), + + (0.03E0,-0.09E0), (7.0E0,8.0E0), (7.0E0,8.0E0), + + (7.0E0,8.0E0), (7.0E0,8.0E0), (7.0E0,8.0E0), + + (0.09E0,0.03E0), (0.03E0,0.12E0), + + (0.12E0,0.03E0), (0.03E0,0.06E0), (2.0E0,3.0E0), + + (2.0E0,3.0E0), (2.0E0,3.0E0), (2.0E0,3.0E0)/ + DATA ((CTRUE6(I,J,2),I=1,8),J=1,5)/(0.1E0,0.1E0), + + (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0), + + (4.0E0,5.0E0), (4.0E0,5.0E0), (4.0E0,5.0E0), + + (4.0E0,5.0E0), (0.09E0,-0.12E0), (6.0E0,7.0E0), + + (6.0E0,7.0E0), (6.0E0,7.0E0), (6.0E0,7.0E0), + + (6.0E0,7.0E0), (6.0E0,7.0E0), (6.0E0,7.0E0), + + (0.03E0,-0.09E0), (8.0E0,9.0E0), + + (0.15E0,-0.03E0), (2.0E0,5.0E0), (2.0E0,5.0E0), + + (2.0E0,5.0E0), (2.0E0,5.0E0), (2.0E0,5.0E0), + + (0.03E0,0.03E0), (3.0E0,6.0E0), + + (-0.18E0,0.03E0), (4.0E0,7.0E0), + + (0.03E0,-0.09E0), (7.0E0,2.0E0), (7.0E0,2.0E0), + + (7.0E0,2.0E0), (0.09E0,0.03E0), (5.0E0,8.0E0), + + (0.03E0,0.12E0), (6.0E0,9.0E0), (0.12E0,0.03E0), + + (8.0E0,3.0E0), (0.03E0,0.06E0), (9.0E0,4.0E0)/ + DATA ITRUE3/0, 1, 2, 2, 2/ +* .. Executable Statements .. + DO 60 INCX = 1, 2 + DO 40 NP1 = 1, 5 + N = NP1 - 1 + LEN = 2*MAX(N,1) +* .. Set vector arguments .. + DO 20 I = 1, LEN + CX(I) = CV(I,NP1,INCX) + 20 CONTINUE + IF (ICASE.EQ.6) THEN +* .. SCNRM2 .. + CALL STEST1(SCNRM2(N,CX,INCX),STRUE2(NP1),STRUE2(NP1), + + SFAC) + ELSE IF (ICASE.EQ.7) THEN +* .. SCASUM .. + CALL STEST1(SCASUM(N,CX,INCX),STRUE4(NP1),STRUE4(NP1), + + SFAC) + ELSE IF (ICASE.EQ.8) THEN +* .. CSCAL .. + CALL CSCAL(N,CA,CX,INCX) + CALL CTEST(LEN,CX,CTRUE5(1,NP1,INCX),CTRUE5(1,NP1,INCX), + + SFAC) + ELSE IF (ICASE.EQ.9) THEN +* .. CSSCAL .. + CALL CSSCAL(N,SA,CX,INCX) + CALL CTEST(LEN,CX,CTRUE6(1,NP1,INCX),CTRUE6(1,NP1,INCX), + + SFAC) + ELSE IF (ICASE.EQ.10) THEN +* .. ICAMAX .. + CALL ITEST1(ICAMAX(N,CX,INCX),ITRUE3(NP1)) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' + STOP + END IF +* + 40 CONTINUE + 60 CONTINUE +* + INCX = 1 + IF (ICASE.EQ.8) THEN +* CSCAL +* Add a test for alpha equal to zero. + CA = (0.0E0,0.0E0) + DO 80 I = 1, 5 + MWPCT(I) = (0.0E0,0.0E0) + MWPCS(I) = (1.0E0,1.0E0) + 80 CONTINUE + CALL CSCAL(5,CA,CX,INCX) + CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) + ELSE IF (ICASE.EQ.9) THEN +* CSSCAL +* Add a test for alpha equal to zero. + SA = 0.0E0 + DO 100 I = 1, 5 + MWPCT(I) = (0.0E0,0.0E0) + MWPCS(I) = (1.0E0,1.0E0) + 100 CONTINUE + CALL CSSCAL(5,SA,CX,INCX) + CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) +* Add a test for alpha equal to one. 
+ SA = 1.0E0 + DO 120 I = 1, 5 + MWPCT(I) = CX(I) + MWPCS(I) = CX(I) + 120 CONTINUE + CALL CSSCAL(5,SA,CX,INCX) + CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) +* Add a test for alpha equal to minus one. + SA = -1.0E0 + DO 140 I = 1, 5 + MWPCT(I) = -CX(I) + MWPCS(I) = -CX(I) + 140 CONTINUE + CALL CSSCAL(5,SA,CX,INCX) + CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) + END IF + RETURN + END + SUBROUTINE CHECK2(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + REAL SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + COMPLEX CA + INTEGER I, J, KI, KN, KSIZE, LENX, LENY, MX, MY +* .. Local Arrays .. + COMPLEX CDOT(1), CSIZE1(4), CSIZE2(7,2), CSIZE3(14), + + CT10X(7,4,4), CT10Y(7,4,4), CT6(4,4), CT7(4,4), + + CT8(7,4,4), CX(7), CX1(7), CY(7), CY1(7) + INTEGER INCXS(4), INCYS(4), LENS(4,2), NS(4) +* .. External Functions .. + COMPLEX CDOTC, CDOTU + EXTERNAL CDOTC, CDOTU +* .. External Subroutines .. + EXTERNAL CAXPY, CCOPY, CSWAP, CTEST +* .. Intrinsic Functions .. + INTRINSIC ABS, MIN +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA CA/(0.4E0,-0.7E0)/ + DATA INCXS/1, 2, -2, -1/ + DATA INCYS/1, -2, 1, -2/ + DATA LENS/1, 1, 2, 4, 1, 1, 3, 7/ + DATA NS/0, 1, 2, 4/ + DATA CX1/(0.7E0,-0.8E0), (-0.4E0,-0.7E0), + + (-0.1E0,-0.9E0), (0.2E0,-0.8E0), + + (-0.9E0,-0.4E0), (0.1E0,0.4E0), (-0.6E0,0.6E0)/ + DATA CY1/(0.6E0,-0.6E0), (-0.9E0,0.5E0), + + (0.7E0,-0.6E0), (0.1E0,-0.5E0), (-0.1E0,-0.2E0), + + (-0.5E0,-0.3E0), (0.8E0,-0.7E0)/ + DATA ((CT8(I,J,1),I=1,7),J=1,4)/(0.6E0,-0.6E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.32E0,-1.41E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.32E0,-1.41E0), + + (-1.55E0,0.5E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.32E0,-1.41E0), (-1.55E0,0.5E0), + + (0.03E0,-0.89E0), (-0.38E0,-0.96E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0)/ + DATA ((CT8(I,J,2),I=1,7),J=1,4)/(0.6E0,-0.6E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.32E0,-1.41E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (-0.07E0,-0.89E0), + + (-0.9E0,0.5E0), (0.42E0,-1.41E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.78E0,0.06E0), (-0.9E0,0.5E0), + + (0.06E0,-0.13E0), (0.1E0,-0.5E0), + + (-0.77E0,-0.49E0), (-0.5E0,-0.3E0), + + (0.52E0,-1.51E0)/ + DATA ((CT8(I,J,3),I=1,7),J=1,4)/(0.6E0,-0.6E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.32E0,-1.41E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (-0.07E0,-0.89E0), + + (-1.18E0,-0.31E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.78E0,0.06E0), (-1.54E0,0.97E0), + + (0.03E0,-0.89E0), (-0.18E0,-1.31E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0)/ + DATA ((CT8(I,J,4),I=1,7),J=1,4)/(0.6E0,-0.6E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.32E0,-1.41E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.32E0,-1.41E0), (-0.9E0,0.5E0), + + (0.05E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.32E0,-1.41E0), + + (-0.9E0,0.5E0), 
(0.05E0,-0.6E0), (0.1E0,-0.5E0), + + (-0.77E0,-0.49E0), (-0.5E0,-0.3E0), + + (0.32E0,-1.16E0)/ + DATA CT7/(0.0E0,0.0E0), (-0.06E0,-0.90E0), + + (0.65E0,-0.47E0), (-0.34E0,-1.22E0), + + (0.0E0,0.0E0), (-0.06E0,-0.90E0), + + (-0.59E0,-1.46E0), (-1.04E0,-0.04E0), + + (0.0E0,0.0E0), (-0.06E0,-0.90E0), + + (-0.83E0,0.59E0), (0.07E0,-0.37E0), + + (0.0E0,0.0E0), (-0.06E0,-0.90E0), + + (-0.76E0,-1.15E0), (-1.33E0,-1.82E0)/ + DATA CT6/(0.0E0,0.0E0), (0.90E0,0.06E0), + + (0.91E0,-0.77E0), (1.80E0,-0.10E0), + + (0.0E0,0.0E0), (0.90E0,0.06E0), (1.45E0,0.74E0), + + (0.20E0,0.90E0), (0.0E0,0.0E0), (0.90E0,0.06E0), + + (-0.55E0,0.23E0), (0.83E0,-0.39E0), + + (0.0E0,0.0E0), (0.90E0,0.06E0), (1.04E0,0.79E0), + + (1.95E0,1.22E0)/ + DATA ((CT10X(I,J,1),I=1,7),J=1,4)/(0.7E0,-0.8E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.6E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.6E0,-0.6E0), (-0.9E0,0.5E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.6E0,-0.6E0), + + (-0.9E0,0.5E0), (0.7E0,-0.6E0), (0.1E0,-0.5E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0)/ + DATA ((CT10X(I,J,2),I=1,7),J=1,4)/(0.7E0,-0.8E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.6E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.7E0,-0.6E0), (-0.4E0,-0.7E0), + + (0.6E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.8E0,-0.7E0), + + (-0.4E0,-0.7E0), (-0.1E0,-0.2E0), + + (0.2E0,-0.8E0), (0.7E0,-0.6E0), (0.1E0,0.4E0), + + (0.6E0,-0.6E0)/ + DATA ((CT10X(I,J,3),I=1,7),J=1,4)/(0.7E0,-0.8E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.6E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (-0.9E0,0.5E0), (-0.4E0,-0.7E0), + + (0.6E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.1E0,-0.5E0), + + (-0.4E0,-0.7E0), (0.7E0,-0.6E0), (0.2E0,-0.8E0), + + (-0.9E0,0.5E0), (0.1E0,0.4E0), (0.6E0,-0.6E0)/ + DATA ((CT10X(I,J,4),I=1,7),J=1,4)/(0.7E0,-0.8E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.6E0,-0.6E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.6E0,-0.6E0), (0.7E0,-0.6E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.6E0,-0.6E0), + + (0.7E0,-0.6E0), (-0.1E0,-0.2E0), (0.8E0,-0.7E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0)/ + DATA ((CT10Y(I,J,1),I=1,7),J=1,4)/(0.6E0,-0.6E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.7E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.7E0,-0.8E0), (-0.4E0,-0.7E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.7E0,-0.8E0), + + (-0.4E0,-0.7E0), (-0.1E0,-0.9E0), + + (0.2E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0)/ + DATA ((CT10Y(I,J,2),I=1,7),J=1,4)/(0.6E0,-0.6E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.7E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (-0.1E0,-0.9E0), (-0.9E0,0.5E0), + + 
(0.7E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (-0.6E0,0.6E0), + + (-0.9E0,0.5E0), (-0.9E0,-0.4E0), (0.1E0,-0.5E0), + + (-0.1E0,-0.9E0), (-0.5E0,-0.3E0), + + (0.7E0,-0.8E0)/ + DATA ((CT10Y(I,J,3),I=1,7),J=1,4)/(0.6E0,-0.6E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.7E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (-0.1E0,-0.9E0), (0.7E0,-0.8E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (-0.6E0,0.6E0), + + (-0.9E0,-0.4E0), (-0.1E0,-0.9E0), + + (0.7E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0)/ + DATA ((CT10Y(I,J,4),I=1,7),J=1,4)/(0.6E0,-0.6E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.7E0,-0.8E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.7E0,-0.8E0), (-0.9E0,0.5E0), + + (-0.4E0,-0.7E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.7E0,-0.8E0), + + (-0.9E0,0.5E0), (-0.4E0,-0.7E0), (0.1E0,-0.5E0), + + (-0.1E0,-0.9E0), (-0.5E0,-0.3E0), + + (0.2E0,-0.8E0)/ + DATA CSIZE1/(0.0E0,0.0E0), (0.9E0,0.9E0), + + (1.63E0,1.73E0), (2.90E0,2.78E0)/ + DATA CSIZE3/(0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (1.17E0,1.17E0), + + (1.17E0,1.17E0), (1.17E0,1.17E0), + + (1.17E0,1.17E0), (1.17E0,1.17E0), + + (1.17E0,1.17E0), (1.17E0,1.17E0)/ + DATA CSIZE2/(0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (0.0E0,0.0E0), + + (0.0E0,0.0E0), (0.0E0,0.0E0), (1.54E0,1.54E0), + + (1.54E0,1.54E0), (1.54E0,1.54E0), + + (1.54E0,1.54E0), (1.54E0,1.54E0), + + (1.54E0,1.54E0), (1.54E0,1.54E0)/ +* .. Executable Statements .. + DO 60 KI = 1, 4 + INCX = INCXS(KI) + INCY = INCYS(KI) + MX = ABS(INCX) + MY = ABS(INCY) +* + DO 40 KN = 1, 4 + N = NS(KN) + KSIZE = MIN(2,KN) + LENX = LENS(KN,MX) + LENY = LENS(KN,MY) +* .. initialize all argument arrays .. + DO 20 I = 1, 7 + CX(I) = CX1(I) + CY(I) = CY1(I) + 20 CONTINUE + IF (ICASE.EQ.1) THEN +* .. CDOTC .. + CDOT(1) = CDOTC(N,CX,INCX,CY,INCY) + CALL CTEST(1,CDOT,CT6(KN,KI),CSIZE1(KN),SFAC) + ELSE IF (ICASE.EQ.2) THEN +* .. CDOTU .. + CDOT(1) = CDOTU(N,CX,INCX,CY,INCY) + CALL CTEST(1,CDOT,CT7(KN,KI),CSIZE1(KN),SFAC) + ELSE IF (ICASE.EQ.3) THEN +* .. CAXPY .. + CALL CAXPY(N,CA,CX,INCX,CY,INCY) + CALL CTEST(LENY,CY,CT8(1,KN,KI),CSIZE2(1,KSIZE),SFAC) + ELSE IF (ICASE.EQ.4) THEN +* .. CCOPY .. + CALL CCOPY(N,CX,INCX,CY,INCY) + CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0E0) + ELSE IF (ICASE.EQ.5) THEN +* .. CSWAP .. + CALL CSWAP(N,CX,INCX,CY,INCY) + CALL CTEST(LENX,CX,CT10X(1,KN,KI),CSIZE3,1.0E0) + CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0E0) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' + STOP + END IF +* + 40 CONTINUE + 60 CONTINUE + RETURN + END + SUBROUTINE STEST(LEN,SCOMP,STRUE,SSIZE,SFAC) +* ********************************* STEST ************************** +* +* THIS SUBR COMPARES ARRAYS SCOMP() AND STRUE() OF LENGTH LEN TO +* SEE IF THE TERM BY TERM DIFFERENCES, MULTIPLIED BY SFAC, ARE +* NEGLIGIBLE. +* +* C. L. LAWSON, JPL, 1974 DEC 10 +* +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + REAL SFAC + INTEGER LEN +* .. Array Arguments .. + REAL SCOMP(LEN), SSIZE(LEN), STRUE(LEN) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. 
+ REAL SD + INTEGER I +* .. External Functions .. + REAL SDIFF + EXTERNAL SDIFF +* .. Intrinsic Functions .. + INTRINSIC ABS +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Executable Statements .. +* + DO 40 I = 1, LEN + SD = SCOMP(I) - STRUE(I) + IF (SDIFF(ABS(SSIZE(I))+ABS(SFAC*SD),ABS(SSIZE(I))).EQ.0.0E0) + + GO TO 40 +* +* HERE SCOMP(I) IS NOT CLOSE TO STRUE(I). +* + IF ( .NOT. PASS) GO TO 20 +* PRINT FAIL MESSAGE AND HEADER. + PASS = .FALSE. + WRITE (NOUT,99999) + WRITE (NOUT,99998) + 20 WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, I, SCOMP(I), + + STRUE(I), SD, SSIZE(I) + 40 CONTINUE + RETURN +* +99999 FORMAT (' FAIL') +99998 FORMAT (/' CASE N INCX INCY MODE I ', + + ' COMP(I) TRUE(I) DIFFERENCE', + + ' SIZE(I)',/1X) +99997 FORMAT (1X,I4,I3,3I5,I3,2E36.8,2E12.4) + END + SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) +* ************************* STEST1 ***************************** +* +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE +* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. +* +* C.L. LAWSON, JPL, 1978 DEC 6 +* +* .. Scalar Arguments .. + REAL SCOMP1, SFAC, STRUE1 +* .. Array Arguments .. + REAL SSIZE(*) +* .. Local Arrays .. + REAL SCOMP(1), STRUE(1) +* .. External Subroutines .. + EXTERNAL STEST +* .. Executable Statements .. +* + SCOMP(1) = SCOMP1 + STRUE(1) = STRUE1 + CALL STEST(1,SCOMP,STRUE,SSIZE,SFAC) +* + RETURN + END + REAL FUNCTION SDIFF(SA,SB) +* ********************************* SDIFF ************************** +* COMPUTES DIFFERENCE OF TWO NUMBERS. C. L. LAWSON, JPL 1974 FEB 15 +* +* .. Scalar Arguments .. + REAL SA, SB +* .. Executable Statements .. + SDIFF = SA - SB + RETURN + END + SUBROUTINE CTEST(LEN,CCOMP,CTRUE,CSIZE,SFAC) +* **************************** CTEST ***************************** +* +* C.L. LAWSON, JPL, 1978 DEC 6 +* +* .. Scalar Arguments .. + REAL SFAC + INTEGER LEN +* .. Array Arguments .. + COMPLEX CCOMP(LEN), CSIZE(LEN), CTRUE(LEN) +* .. Local Scalars .. + INTEGER I +* .. Local Arrays .. + REAL SCOMP(20), SSIZE(20), STRUE(20) +* .. External Subroutines .. + EXTERNAL STEST +* .. Intrinsic Functions .. + INTRINSIC AIMAG, REAL +* .. Executable Statements .. + DO 20 I = 1, LEN + SCOMP(2*I-1) = REAL(CCOMP(I)) + SCOMP(2*I) = AIMAG(CCOMP(I)) + STRUE(2*I-1) = REAL(CTRUE(I)) + STRUE(2*I) = AIMAG(CTRUE(I)) + SSIZE(2*I-1) = REAL(CSIZE(I)) + SSIZE(2*I) = AIMAG(CSIZE(I)) + 20 CONTINUE +* + CALL STEST(2*LEN,SCOMP,STRUE,SSIZE,SFAC) + RETURN + END + SUBROUTINE ITEST1(ICOMP,ITRUE) +* ********************************* ITEST1 ************************* +* +* THIS SUBROUTINE COMPARES THE VARIABLES ICOMP AND ITRUE FOR +* EQUALITY. +* C. L. LAWSON, JPL, 1974 DEC 10 +* +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + INTEGER ICOMP, ITRUE +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + INTEGER ID +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Executable Statements .. + IF (ICOMP.EQ.ITRUE) GO TO 40 +* +* HERE ICOMP IS NOT EQUAL TO ITRUE. +* + IF ( .NOT. PASS) GO TO 20 +* PRINT FAIL MESSAGE AND HEADER. + PASS = .FALSE. 
+ WRITE (NOUT,99999) + WRITE (NOUT,99998) + 20 ID = ICOMP - ITRUE + WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, ICOMP, ITRUE, ID + 40 CONTINUE + RETURN +* +99999 FORMAT (' FAIL') +99998 FORMAT (/' CASE N INCX INCY MODE ', + + ' COMP TRUE DIFFERENCE', + + /1X) +99997 FORMAT (1X,I4,I3,3I5,2I36,I12) + END diff --git a/test/cblat2.dat b/test/cblat2.dat new file mode 100644 index 0000000000..1c6e315064 --- /dev/null +++ b/test/cblat2.dat @@ -0,0 +1,35 @@ +'CBLAT2.SUMM' NAME OF SUMMARY OUTPUT FILE +6 UNIT NUMBER OF SUMMARY FILE +'CBLA2T.SNAP' NAME OF SNAPSHOT OUTPUT FILE +-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO TEST ERROR EXITS. +16.0 THRESHOLD VALUE OF TEST RATIO +7 NUMBER OF VALUES OF N +0 1 2 3 7 31 63 VALUES OF N +4 NUMBER OF VALUES OF K +0 1 2 4 VALUES OF K +4 NUMBER OF VALUES OF INCX AND INCY +1 2 -1 -2 VALUES OF INCX AND INCY +3 NUMBER OF VALUES OF ALPHA +(0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +3 NUMBER OF VALUES OF BETA +(0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +CGEMV T PUT F FOR NO TEST. SAME COLUMNS. +CGBMV T PUT F FOR NO TEST. SAME COLUMNS. +CHEMV T PUT F FOR NO TEST. SAME COLUMNS. +CHBMV T PUT F FOR NO TEST. SAME COLUMNS. +CHPMV T PUT F FOR NO TEST. SAME COLUMNS. +CTRMV T PUT F FOR NO TEST. SAME COLUMNS. +CTBMV T PUT F FOR NO TEST. SAME COLUMNS. +CTPMV T PUT F FOR NO TEST. SAME COLUMNS. +CTRSV T PUT F FOR NO TEST. SAME COLUMNS. +CTBSV T PUT F FOR NO TEST. SAME COLUMNS. +CTPSV T PUT F FOR NO TEST. SAME COLUMNS. +CGERC T PUT F FOR NO TEST. SAME COLUMNS. +CGERU T PUT F FOR NO TEST. SAME COLUMNS. +CHER T PUT F FOR NO TEST. SAME COLUMNS. +CHPR T PUT F FOR NO TEST. SAME COLUMNS. +CHER2 T PUT F FOR NO TEST. SAME COLUMNS. +CHPR2 T PUT F FOR NO TEST. SAME COLUMNS. diff --git a/test/cblat2.f b/test/cblat2.f new file mode 100644 index 0000000000..20f1881005 --- /dev/null +++ b/test/cblat2.f @@ -0,0 +1,3241 @@ + PROGRAM CBLAT2 +* +* Test program for the COMPLEX Level 2 Blas. +* +* The program must be driven by a short data file. The first 18 records +* of the file are read using list-directed input, the last 17 records +* are read using the format ( A6, L2 ). An annotated example of a data +* file can be obtained by deleting the first 3 characters from the +* following 35 lines: +* 'CBLAT2.SUMM' NAME OF SUMMARY OUTPUT FILE +* 6 UNIT NUMBER OF SUMMARY FILE +* 'CBLA2T.SNAP' NAME OF SNAPSHOT OUTPUT FILE +* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +* F LOGICAL FLAG, T TO STOP ON FAILURES. +* T LOGICAL FLAG, T TO TEST ERROR EXITS. +* 16.0 THRESHOLD VALUE OF TEST RATIO +* 6 NUMBER OF VALUES OF N +* 0 1 2 3 5 9 VALUES OF N +* 4 NUMBER OF VALUES OF K +* 0 1 2 4 VALUES OF K +* 4 NUMBER OF VALUES OF INCX AND INCY +* 1 2 -1 -2 VALUES OF INCX AND INCY +* 3 NUMBER OF VALUES OF ALPHA +* (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +* 3 NUMBER OF VALUES OF BETA +* (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +* CGEMV T PUT F FOR NO TEST. SAME COLUMNS. +* CGBMV T PUT F FOR NO TEST. SAME COLUMNS. +* CHEMV T PUT F FOR NO TEST. SAME COLUMNS. +* CHBMV T PUT F FOR NO TEST. SAME COLUMNS. +* CHPMV T PUT F FOR NO TEST. SAME COLUMNS. +* CTRMV T PUT F FOR NO TEST. SAME COLUMNS. +* CTBMV T PUT F FOR NO TEST. SAME COLUMNS. +* CTPMV T PUT F FOR NO TEST. SAME COLUMNS. +* CTRSV T PUT F FOR NO TEST. SAME COLUMNS. +* CTBSV T PUT F FOR NO TEST. SAME COLUMNS. +* CTPSV T PUT F FOR NO TEST. SAME COLUMNS. 
+* CGERC T PUT F FOR NO TEST. SAME COLUMNS. +* CGERU T PUT F FOR NO TEST. SAME COLUMNS. +* CHER T PUT F FOR NO TEST. SAME COLUMNS. +* CHPR T PUT F FOR NO TEST. SAME COLUMNS. +* CHER2 T PUT F FOR NO TEST. SAME COLUMNS. +* CHPR2 T PUT F FOR NO TEST. SAME COLUMNS. +* +* See: +* +* Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. +* An extended set of Fortran Basic Linear Algebra Subprograms. +* +* Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics +* and Computer Science Division, Argonne National Laboratory, +* 9700 South Cass Avenue, Argonne, Illinois 60439, US. +* +* Or +* +* NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms +* Group Ltd., NAG Central Office, 256 Banbury Road, Oxford +* OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st +* Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. +* +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + INTEGER NIN + PARAMETER ( NIN = 5 ) + INTEGER NSUBS + PARAMETER ( NSUBS = 17 ) + COMPLEX ZERO, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) + REAL RZERO, RHALF, RONE + PARAMETER ( RZERO = 0.0, RHALF = 0.5, RONE = 1.0 ) + INTEGER NMAX, INCMAX + PARAMETER ( NMAX = 65, INCMAX = 2 ) + INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX + PARAMETER ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7, + $ NALMAX = 7, NBEMAX = 7 ) +* .. Local Scalars .. + REAL EPS, ERR, THRESH + INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NINC, NKB, + $ NOUT, NTRA + LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, + $ TSTERR + CHARACTER*1 TRANS + CHARACTER*6 SNAMET + CHARACTER*32 SNAPS, SUMMRY +* .. Local Arrays .. + COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), + $ ALF( NALMAX ), AS( NMAX*NMAX ), BET( NBEMAX ), + $ X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( 2*NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDMAX ), INC( NINMAX ), KB( NKBMAX ) + LOGICAL LTEST( NSUBS ) + CHARACTER*6 SNAMES( NSUBS ) +* .. External Functions .. + REAL SDIFF + LOGICAL LCE + EXTERNAL SDIFF, LCE +* .. External Subroutines .. + EXTERNAL CCHK1, CCHK2, CCHK3, CCHK4, CCHK5, CCHK6, + $ CCHKE, CMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK + CHARACTER*6 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR + COMMON /SRNAMC/SRNAMT +* .. Data statements .. + DATA SNAMES/'CGEMV ', 'CGBMV ', 'CHEMV ', 'CHBMV ', + $ 'CHPMV ', 'CTRMV ', 'CTBMV ', 'CTPMV ', + $ 'CTRSV ', 'CTBSV ', 'CTPSV ', 'CGERC ', + $ 'CGERU ', 'CHER ', 'CHPR ', 'CHER2 ', + $ 'CHPR2 '/ +* .. Executable Statements .. +* +* Read name and unit number for summary output file and open file. +* + READ( NIN, FMT = * )SUMMRY + READ( NIN, FMT = * )NOUT + OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + NOUTC = NOUT +* +* Read name and unit number for snapshot output file and open file. +* + READ( NIN, FMT = * )SNAPS + READ( NIN, FMT = * )NTRA + TRACE = NTRA.GE.0 + IF( TRACE )THEN + OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + END IF +* Read the flag that directs rewinding of the snapshot file. + READ( NIN, FMT = * )REWI + REWI = REWI.AND.TRACE +* Read the flag that directs stopping on any failure. + READ( NIN, FMT = * )SFATAL +* Read the flag that indicates whether error exits are to be tested. 
+ READ( NIN, FMT = * )TSTERR +* Read the threshold value of the test ratio + READ( NIN, FMT = * )THRESH +* +* Read and check the parameter values for the tests. +* +* Values of N + READ( NIN, FMT = * )NIDIM + IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN + WRITE( NOUT, FMT = 9997 )'N', NIDMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) + DO 10 I = 1, NIDIM + IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN + WRITE( NOUT, FMT = 9996 )NMAX + GO TO 230 + END IF + 10 CONTINUE +* Values of K + READ( NIN, FMT = * )NKB + IF( NKB.LT.1.OR.NKB.GT.NKBMAX )THEN + WRITE( NOUT, FMT = 9997 )'K', NKBMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( KB( I ), I = 1, NKB ) + DO 20 I = 1, NKB + IF( KB( I ).LT.0 )THEN + WRITE( NOUT, FMT = 9995 ) + GO TO 230 + END IF + 20 CONTINUE +* Values of INCX and INCY + READ( NIN, FMT = * )NINC + IF( NINC.LT.1.OR.NINC.GT.NINMAX )THEN + WRITE( NOUT, FMT = 9997 )'INCX AND INCY', NINMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( INC( I ), I = 1, NINC ) + DO 30 I = 1, NINC + IF( INC( I ).EQ.0.OR.ABS( INC( I ) ).GT.INCMAX )THEN + WRITE( NOUT, FMT = 9994 )INCMAX + GO TO 230 + END IF + 30 CONTINUE +* Values of ALPHA + READ( NIN, FMT = * )NALF + IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN + WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) +* Values of BETA + READ( NIN, FMT = * )NBET + IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN + WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) +* +* Report values of parameters. +* + WRITE( NOUT, FMT = 9993 ) + WRITE( NOUT, FMT = 9992 )( IDIM( I ), I = 1, NIDIM ) + WRITE( NOUT, FMT = 9991 )( KB( I ), I = 1, NKB ) + WRITE( NOUT, FMT = 9990 )( INC( I ), I = 1, NINC ) + WRITE( NOUT, FMT = 9989 )( ALF( I ), I = 1, NALF ) + WRITE( NOUT, FMT = 9988 )( BET( I ), I = 1, NBET ) + IF( .NOT.TSTERR )THEN + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9980 ) + END IF + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9999 )THRESH + WRITE( NOUT, FMT = * ) +* +* Read names of subroutines and flags which indicate +* whether they are to be tested. +* + DO 40 I = 1, NSUBS + LTEST( I ) = .FALSE. + 40 CONTINUE + 50 READ( NIN, FMT = 9984, END = 80 )SNAMET, LTESTT + DO 60 I = 1, NSUBS + IF( SNAMET.EQ.SNAMES( I ) ) + $ GO TO 70 + 60 CONTINUE + WRITE( NOUT, FMT = 9986 )SNAMET + STOP + 70 LTEST( I ) = LTESTT + GO TO 50 +* + 80 CONTINUE + CLOSE ( NIN ) +* +* Compute EPS (the machine precision). +* + EPS = RONE + 90 CONTINUE + IF( SDIFF( RONE + EPS, RONE ).EQ.RZERO ) + $ GO TO 100 + EPS = RHALF*EPS + GO TO 90 + 100 CONTINUE + EPS = EPS + EPS + WRITE( NOUT, FMT = 9998 )EPS +* +* Check the reliability of CMVCH using exact data. +* + N = MIN( 32, NMAX ) + DO 120 J = 1, N + DO 110 I = 1, N + A( I, J ) = MAX( I - J + 1, 0 ) + 110 CONTINUE + X( J ) = J + Y( J ) = ZERO + 120 CONTINUE + DO 130 J = 1, N + YY( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 + 130 CONTINUE +* YY holds the exact result. On exit from CMVCH YT holds +* the result computed by CMVCH. + TRANS = 'N' + CALL CMVCH( TRANS, N, N, ONE, A, NMAX, X, 1, ZERO, Y, 1, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LCE( YY, YT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR + STOP + END IF + TRANS = 'T' + CALL CMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, .TRUE. 
) + SAME = LCE( YY, YT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR + STOP + END IF +* +* Test each subroutine in turn. +* + DO 210 ISNUM = 1, NSUBS + WRITE( NOUT, FMT = * ) + IF( .NOT.LTEST( ISNUM ) )THEN +* Subprogram is not to be tested. + WRITE( NOUT, FMT = 9983 )SNAMES( ISNUM ) + ELSE + SRNAMT = SNAMES( ISNUM ) +* Test error exits. + IF( TSTERR )THEN + CALL CCHKE( ISNUM, SNAMES( ISNUM ), NOUT ) + WRITE( NOUT, FMT = * ) + END IF +* Test computations. + INFOT = 0 + OK = .TRUE. + FATAL = .FALSE. + GO TO ( 140, 140, 150, 150, 150, 160, 160, + $ 160, 160, 160, 160, 170, 170, 180, + $ 180, 190, 190 )ISNUM +* Test CGEMV, 01, and CGBMV, 02. + 140 CALL CCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G ) + GO TO 200 +* Test CHEMV, 03, CHBMV, 04, and CHPMV, 05. + 150 CALL CCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G ) + GO TO 200 +* Test CTRMV, 06, CTBMV, 07, CTPMV, 08, +* CTRSV, 09, CTBSV, 10, and CTPSV, 11. + 160 CALL CCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z ) + GO TO 200 +* Test CGERC, 12, CGERU, 13. + 170 CALL CCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z ) + GO TO 200 +* Test CHER, 14, and CHPR, 15. + 180 CALL CCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z ) + GO TO 200 +* Test CHER2, 16, and CHPR2, 17. 
+ 190 CALL CCHK6( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z ) +* + 200 IF( FATAL.AND.SFATAL ) + $ GO TO 220 + END IF + 210 CONTINUE + WRITE( NOUT, FMT = 9982 ) + GO TO 240 +* + 220 CONTINUE + WRITE( NOUT, FMT = 9981 ) + GO TO 240 +* + 230 CONTINUE + WRITE( NOUT, FMT = 9987 ) +* + 240 CONTINUE + IF( TRACE ) + $ CLOSE ( NTRA ) + CLOSE ( NOUT ) + STOP +* + 9999 FORMAT( ' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', + $ 'S THAN', F8.2 ) + 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, E9.1 ) + 9997 FORMAT( ' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', + $ 'THAN ', I2 ) + 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) + 9995 FORMAT( ' VALUE OF K IS LESS THAN 0' ) + 9994 FORMAT( ' ABSOLUTE VALUE OF INCX OR INCY IS 0 OR GREATER THAN ', + $ I2 ) + 9993 FORMAT( ' TESTS OF THE COMPLEX LEVEL 2 BLAS', //' THE F', + $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) + 9992 FORMAT( ' FOR N ', 9I6 ) + 9991 FORMAT( ' FOR K ', 7I6 ) + 9990 FORMAT( ' FOR INCX AND INCY ', 7I6 ) + 9989 FORMAT( ' FOR ALPHA ', + $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) + 9988 FORMAT( ' FOR BETA ', + $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) + 9987 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', + $ /' ******* TESTS ABANDONED *******' ) + 9986 FORMAT( ' SUBPROGRAM NAME ', A6, ' NOT RECOGNIZED', /' ******* T', + $ 'ESTS ABANDONED *******' ) + 9985 FORMAT( ' ERROR IN CMVCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', + $ 'ATED WRONGLY.', /' CMVCH WAS CALLED WITH TRANS = ', A1, + $ ' AND RETURNED SAME = ', L1, ' AND ERR = ', F12.3, '.', / + $ ' THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.' + $ , /' ******* TESTS ABANDONED *******' ) + 9984 FORMAT( A6, L2 ) + 9983 FORMAT( 1X, A6, ' WAS NOT TESTED' ) + 9982 FORMAT( /' END OF TESTS' ) + 9981 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) + 9980 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) +* +* End of CBLAT2. +* + END + SUBROUTINE CCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, + $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, + $ XS, Y, YY, YS, YT, G ) +* +* Tests CGEMV and CGBMV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX ZERO, HALF + PARAMETER ( ZERO = ( 0.0, 0.0 ), HALF = ( 0.5, 0.0 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, + $ NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), BET( NBET ), X( NMAX ), + $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), + $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + COMPLEX ALPHA, ALS, BETA, BLS, TRANSL + REAL ERR, ERRMAX + INTEGER I, IA, IB, IC, IKU, IM, IN, INCX, INCXS, INCY, + $ INCYS, IX, IY, KL, KLS, KU, KUS, LAA, LDA, + $ LDAS, LX, LY, M, ML, MS, N, NARGS, NC, ND, NK, + $ NL, NS + LOGICAL BANDED, FULL, NULL, RESET, SAME, TRAN + CHARACTER*1 TRANS, TRANSS + CHARACTER*3 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. 
External Subroutines .. + EXTERNAL CGBMV, CGEMV, CMAKE, CMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICH/'NTC'/ +* .. Executable Statements .. + FULL = SNAME( 3: 3 ).EQ.'E' + BANDED = SNAME( 3: 3 ).EQ.'B' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 11 + ELSE IF( BANDED )THEN + NARGS = 13 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 120 IN = 1, NIDIM + N = IDIM( IN ) + ND = N/2 + 1 +* + DO 110 IM = 1, 2 + IF( IM.EQ.1 ) + $ M = MAX( N - ND, 0 ) + IF( IM.EQ.2 ) + $ M = MIN( N + ND, NMAX ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IKU = 1, NK + IF( BANDED )THEN + KU = KB( IKU ) + KL = MAX( KU - 1, 0 ) + ELSE + KU = N - 1 + KL = M - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = KL + KU + 1 + ELSE + LDA = M + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + LAA = LDA*N + NULL = N.LE.0.OR.M.LE.0 +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL CMAKE( SNAME( 2: 3 ), ' ', ' ', M, N, A, NMAX, AA, + $ LDA, KL, KU, RESET, TRANSL ) +* + DO 90 IC = 1, 3 + TRANS = ICH( IC: IC ) + TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' +* + IF( TRAN )THEN + ML = N + NL = M + ELSE + ML = M + NL = N + END IF +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*NL +* +* Generate the vector X. +* + TRANSL = HALF + CALL CMAKE( 'GE', ' ', ' ', 1, NL, X, 1, XX, + $ ABS( INCX ), 0, NL - 1, RESET, TRANSL ) + IF( NL.GT.1 )THEN + X( NL/2 ) = ZERO + XX( 1 + ABS( INCX )*( NL/2 - 1 ) ) = ZERO + END IF +* + DO 70 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*ML +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL CMAKE( 'GE', ' ', ' ', 1, ML, Y, 1, + $ YY, ABS( INCY ), 0, ML - 1, + $ RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + TRANSS = TRANS + MS = M + NS = N + KLS = KL + KUS = KU + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + BLS = BETA + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ TRANS, M, N, ALPHA, LDA, INCX, BETA, + $ INCY + IF( REWI ) + $ REWIND NTRA + CALL CGEMV( TRANS, M, N, ALPHA, AA, + $ LDA, XX, INCX, BETA, YY, + $ INCY ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ TRANS, M, N, KL, KU, ALPHA, LDA, + $ INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL CGBMV( TRANS, M, N, KL, KU, ALPHA, + $ AA, LDA, XX, INCX, BETA, + $ YY, INCY ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 130 + END IF +* +* See what data changed inside subroutines. 
+* + ISAME( 1 ) = TRANS.EQ.TRANSS + ISAME( 2 ) = MS.EQ.M + ISAME( 3 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 4 ) = ALS.EQ.ALPHA + ISAME( 5 ) = LCE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + ISAME( 7 ) = LCE( XS, XX, LX ) + ISAME( 8 ) = INCXS.EQ.INCX + ISAME( 9 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 10 ) = LCE( YS, YY, LY ) + ELSE + ISAME( 10 ) = LCERES( 'GE', ' ', 1, + $ ML, YS, YY, + $ ABS( INCY ) ) + END IF + ISAME( 11 ) = INCYS.EQ.INCY + ELSE IF( BANDED )THEN + ISAME( 4 ) = KLS.EQ.KL + ISAME( 5 ) = KUS.EQ.KU + ISAME( 6 ) = ALS.EQ.ALPHA + ISAME( 7 ) = LCE( AS, AA, LAA ) + ISAME( 8 ) = LDAS.EQ.LDA + ISAME( 9 ) = LCE( XS, XX, LX ) + ISAME( 10 ) = INCXS.EQ.INCX + ISAME( 11 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 12 ) = LCE( YS, YY, LY ) + ELSE + ISAME( 12 ) = LCERES( 'GE', ' ', 1, + $ ML, YS, YY, + $ ABS( INCY ) ) + END IF + ISAME( 13 ) = INCYS.EQ.INCY + END IF +* +* If data was incorrectly changed, report +* and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 130 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL CMVCH( TRANS, M, N, ALPHA, A, + $ NMAX, X, INCX, BETA, Y, + $ INCY, YT, G, YY, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 130 + ELSE +* Avoid repeating tests with M.le.0 or +* N.le.0. + GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 140 +* + 130 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, TRANS, M, N, ALPHA, LDA, + $ INCX, BETA, INCY + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, TRANS, M, N, KL, KU, + $ ALPHA, LDA, INCX, BETA, INCY + END IF +* + 140 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', 4( I3, ',' ), '(', + $ F4.1, ',', F4.1, '), A,', I3, ', X,', I2, ',(', F4.1, ',', + $ F4.1, '), Y,', I2, ') .' ) + 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', 2( I3, ',' ), '(', + $ F4.1, ',', F4.1, '), A,', I3, ', X,', I2, ',(', F4.1, ',', + $ F4.1, '), Y,', I2, ') .' ) + 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK1. +* + END + SUBROUTINE CCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, + $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, + $ XS, Y, YY, YS, YT, G ) +* +* Tests CHEMV, CHBMV and CHPMV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX ZERO, HALF + PARAMETER ( ZERO = ( 0.0, 0.0 ), HALF = ( 0.5, 0.0 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) +* .. Scalar Arguments .. 
+ REAL EPS, THRESH + INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, + $ NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), BET( NBET ), X( NMAX ), + $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), + $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + COMPLEX ALPHA, ALS, BETA, BLS, TRANSL + REAL ERR, ERRMAX + INTEGER I, IA, IB, IC, IK, IN, INCX, INCXS, INCY, + $ INCYS, IX, IY, K, KS, LAA, LDA, LDAS, LX, LY, + $ N, NARGS, NC, NK, NS + LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME + CHARACTER*1 UPLO, UPLOS + CHARACTER*2 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CHBMV, CHEMV, CHPMV, CMAKE, CMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 3: 3 ).EQ.'E' + BANDED = SNAME( 3: 3 ).EQ.'B' + PACKED = SNAME( 3: 3 ).EQ.'P' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 10 + ELSE IF( BANDED )THEN + NARGS = 11 + ELSE IF( PACKED )THEN + NARGS = 9 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 110 IN = 1, NIDIM + N = IDIM( IN ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IK = 1, NK + IF( BANDED )THEN + K = KB( IK ) + ELSE + K = N - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = K + 1 + ELSE + LDA = N + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF + NULL = N.LE.0 +* + DO 90 IC = 1, 2 + UPLO = ICH( IC: IC ) +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL CMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, A, NMAX, AA, + $ LDA, K, K, RESET, TRANSL ) +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL CMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, + $ ABS( INCX ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 70 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL CMAKE( 'GE', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, + $ TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + UPLOS = UPLO + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + BLS = BETA + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. 
+* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ UPLO, N, ALPHA, LDA, INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL CHEMV( UPLO, N, ALPHA, AA, LDA, XX, + $ INCX, BETA, YY, INCY ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ UPLO, N, K, ALPHA, LDA, INCX, BETA, + $ INCY + IF( REWI ) + $ REWIND NTRA + CALL CHBMV( UPLO, N, K, ALPHA, AA, LDA, + $ XX, INCX, BETA, YY, INCY ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ UPLO, N, ALPHA, INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL CHPMV( UPLO, N, ALPHA, AA, XX, INCX, + $ BETA, YY, INCY ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LCE( AS, AA, LAA ) + ISAME( 5 ) = LDAS.EQ.LDA + ISAME( 6 ) = LCE( XS, XX, LX ) + ISAME( 7 ) = INCXS.EQ.INCX + ISAME( 8 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 9 ) = LCE( YS, YY, LY ) + ELSE + ISAME( 9 ) = LCERES( 'GE', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 10 ) = INCYS.EQ.INCY + ELSE IF( BANDED )THEN + ISAME( 3 ) = KS.EQ.K + ISAME( 4 ) = ALS.EQ.ALPHA + ISAME( 5 ) = LCE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + ISAME( 7 ) = LCE( XS, XX, LX ) + ISAME( 8 ) = INCXS.EQ.INCX + ISAME( 9 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 10 ) = LCE( YS, YY, LY ) + ELSE + ISAME( 10 ) = LCERES( 'GE', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 11 ) = INCYS.EQ.INCY + ELSE IF( PACKED )THEN + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LCE( AS, AA, LAA ) + ISAME( 5 ) = LCE( XS, XX, LX ) + ISAME( 6 ) = INCXS.EQ.INCX + ISAME( 7 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 8 ) = LCE( YS, YY, LY ) + ELSE + ISAME( 8 ) = LCERES( 'GE', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 9 ) = INCYS.EQ.INCY + END IF +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL CMVCH( 'N', N, N, ALPHA, A, NMAX, X, + $ INCX, BETA, Y, INCY, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 120 + ELSE +* Avoid repeating tests with N.le.0 + GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, N, ALPHA, LDA, INCX, + $ BETA, INCY + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, N, K, ALPHA, LDA, + $ INCX, BETA, INCY + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, UPLO, N, ALPHA, INCX, + $ BETA, INCY + END IF +* + 130 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',(', F4.1, ',', + $ F4.1, '), AP, X,', I2, ',(', F4.1, ',', F4.1, '), Y,', I2, + $ ') .' ) + 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', 2( I3, ',' ), '(', + $ F4.1, ',', F4.1, '), A,', I3, ', X,', I2, ',(', F4.1, ',', + $ F4.1, '), Y,', I2, ') .' ) + 9993 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',(', F4.1, ',', + $ F4.1, '), A,', I3, ', X,', I2, ',(', F4.1, ',', F4.1, '), ', + $ 'Y,', I2, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK2. +* + END + SUBROUTINE CCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, XT, G, Z ) +* +* Tests CTRMV, CTBMV, CTPMV, CTRSV, CTBSV and CTPSV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX ZERO, HALF, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), HALF = ( 0.5, 0.0 ), + $ ONE = ( 1.0, 0.0 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NIDIM, NINC, NKB, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), + $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), + $ XT( NMAX ), XX( NMAX*INCMAX ), Z( NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + COMPLEX TRANSL + REAL ERR, ERRMAX + INTEGER I, ICD, ICT, ICU, IK, IN, INCX, INCXS, IX, K, + $ KS, LAA, LDA, LDAS, LX, N, NARGS, NC, NK, NS + LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME + CHARACTER*1 DIAG, DIAGS, TRANS, TRANSS, UPLO, UPLOS + CHARACTER*2 ICHD, ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CMAKE, CMVCH, CTBMV, CTBSV, CTPMV, CTPSV, + $ CTRMV, CTRSV +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/ +* .. Executable Statements .. + FULL = SNAME( 3: 3 ).EQ.'R' + BANDED = SNAME( 3: 3 ).EQ.'B' + PACKED = SNAME( 3: 3 ).EQ.'P' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 8 + ELSE IF( BANDED )THEN + NARGS = 9 + ELSE IF( PACKED )THEN + NARGS = 7 + END IF +* + NC = 0 + RESET = .TRUE. 
+ ERRMAX = RZERO +* Set up zero vector for CMVCH. + DO 10 I = 1, NMAX + Z( I ) = ZERO + 10 CONTINUE +* + DO 110 IN = 1, NIDIM + N = IDIM( IN ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IK = 1, NK + IF( BANDED )THEN + K = KB( IK ) + ELSE + K = N - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = K + 1 + ELSE + LDA = N + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF + NULL = N.LE.0 +* + DO 90 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* + DO 80 ICT = 1, 3 + TRANS = ICHT( ICT: ICT ) +* + DO 70 ICD = 1, 2 + DIAG = ICHD( ICD: ICD ) +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL CMAKE( SNAME( 2: 3 ), UPLO, DIAG, N, N, A, + $ NMAX, AA, LDA, K, K, RESET, TRANSL ) +* + DO 60 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL CMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, + $ ABS( INCX ), 0, N - 1, RESET, + $ TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + DIAGS = DIAG + NS = N + KS = K + DO 20 I = 1, LAA + AS( I ) = AA( I ) + 20 CONTINUE + LDAS = LDA + DO 30 I = 1, LX + XS( I ) = XX( I ) + 30 CONTINUE + INCXS = INCX +* +* Call the subroutine. +* + IF( SNAME( 4: 5 ).EQ.'MV' )THEN + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ UPLO, TRANS, DIAG, N, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CTRMV( UPLO, TRANS, DIAG, N, AA, LDA, + $ XX, INCX ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ UPLO, TRANS, DIAG, N, K, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CTBMV( UPLO, TRANS, DIAG, N, K, AA, + $ LDA, XX, INCX ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ UPLO, TRANS, DIAG, N, INCX + IF( REWI ) + $ REWIND NTRA + CALL CTPMV( UPLO, TRANS, DIAG, N, AA, XX, + $ INCX ) + END IF + ELSE IF( SNAME( 4: 5 ).EQ.'SV' )THEN + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ UPLO, TRANS, DIAG, N, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CTRSV( UPLO, TRANS, DIAG, N, AA, LDA, + $ XX, INCX ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ UPLO, TRANS, DIAG, N, K, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CTBSV( UPLO, TRANS, DIAG, N, K, AA, + $ LDA, XX, INCX ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ UPLO, TRANS, DIAG, N, INCX + IF( REWI ) + $ REWIND NTRA + CALL CTPSV( UPLO, TRANS, DIAG, N, AA, XX, + $ INCX ) + END IF + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. 
+* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = TRANS.EQ.TRANSS + ISAME( 3 ) = DIAG.EQ.DIAGS + ISAME( 4 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 5 ) = LCE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 7 ) = LCE( XS, XX, LX ) + ELSE + ISAME( 7 ) = LCERES( 'GE', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 8 ) = INCXS.EQ.INCX + ELSE IF( BANDED )THEN + ISAME( 5 ) = KS.EQ.K + ISAME( 6 ) = LCE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 8 ) = LCE( XS, XX, LX ) + ELSE + ISAME( 8 ) = LCERES( 'GE', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 9 ) = INCXS.EQ.INCX + ELSE IF( PACKED )THEN + ISAME( 5 ) = LCE( AS, AA, LAA ) + IF( NULL )THEN + ISAME( 6 ) = LCE( XS, XX, LX ) + ELSE + ISAME( 6 ) = LCERES( 'GE', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 7 ) = INCXS.EQ.INCX + END IF +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN + IF( SNAME( 4: 5 ).EQ.'MV' )THEN +* +* Check the result. +* + CALL CMVCH( TRANS, N, N, ONE, A, NMAX, X, + $ INCX, ZERO, Z, INCX, XT, G, + $ XX, EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + ELSE IF( SNAME( 4: 5 ).EQ.'SV' )THEN +* +* Compute approximation to original vector. +* + DO 50 I = 1, N + Z( I ) = XX( 1 + ( I - 1 )* + $ ABS( INCX ) ) + XX( 1 + ( I - 1 )*ABS( INCX ) ) + $ = X( I ) + 50 CONTINUE + CALL CMVCH( TRANS, N, N, ONE, A, NMAX, Z, + $ INCX, ZERO, X, INCX, XT, G, + $ XX, EPS, ERR, FATAL, NOUT, + $ .FALSE. ) + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 120 + ELSE +* Avoid repeating tests with N.le.0. + GO TO 110 + END IF +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, TRANS, DIAG, N, LDA, + $ INCX + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, DIAG, N, K, + $ LDA, INCX + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, UPLO, TRANS, DIAG, N, INCX + END IF +* + 130 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A6, '(', 3( '''', A1, ''',' ), I3, ', AP, ', + $ 'X,', I2, ') .' ) + 9994 FORMAT( 1X, I6, ': ', A6, '(', 3( '''', A1, ''',' ), 2( I3, ',' ), + $ ' A,', I3, ', X,', I2, ') .' ) + 9993 FORMAT( 1X, I6, ': ', A6, '(', 3( '''', A1, ''',' ), I3, ', A,', + $ I3, ', X,', I2, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK3. +* + END + SUBROUTINE CCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z ) +* +* Tests CGERC and CGERU. 
+* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX ZERO, HALF, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), HALF = ( 0.5, 0.0 ), + $ ONE = ( 1.0, 0.0 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + COMPLEX ALPHA, ALS, TRANSL + REAL ERR, ERRMAX + INTEGER I, IA, IM, IN, INCX, INCXS, INCY, INCYS, IX, + $ IY, J, LAA, LDA, LDAS, LX, LY, M, MS, N, NARGS, + $ NC, ND, NS + LOGICAL CONJ, NULL, RESET, SAME +* .. Local Arrays .. + COMPLEX W( 1 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CGERC, CGERU, CMAKE, CMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, CONJG, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Executable Statements .. + CONJ = SNAME( 5: 5 ).EQ.'C' +* Define the number of arguments. + NARGS = 9 +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 120 IN = 1, NIDIM + N = IDIM( IN ) + ND = N/2 + 1 +* + DO 110 IM = 1, 2 + IF( IM.EQ.1 ) + $ M = MAX( N - ND, 0 ) + IF( IM.EQ.2 ) + $ M = MIN( N + ND, NMAX ) +* +* Set LDA to 1 more than minimum value if room. + LDA = M + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 110 + LAA = LDA*N + NULL = N.LE.0.OR.M.LE.0 +* + DO 100 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*M +* +* Generate the vector X. +* + TRANSL = HALF + CALL CMAKE( 'GE', ' ', ' ', 1, M, X, 1, XX, ABS( INCX ), + $ 0, M - 1, RESET, TRANSL ) + IF( M.GT.1 )THEN + X( M/2 ) = ZERO + XX( 1 + ABS( INCX )*( M/2 - 1 ) ) = ZERO + END IF +* + DO 90 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL CMAKE( 'GE', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + Y( N/2 ) = ZERO + YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 80 IA = 1, NALF + ALPHA = ALF( IA ) +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL CMAKE( SNAME( 2: 3 ), ' ', ' ', M, N, A, NMAX, + $ AA, LDA, M - 1, N - 1, RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + MS = M + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, M, N, + $ ALPHA, INCX, INCY, LDA + IF( CONJ )THEN + IF( REWI ) + $ REWIND NTRA + CALL CGERC( M, N, ALPHA, XX, INCX, YY, INCY, AA, + $ LDA ) + ELSE + IF( REWI ) + $ REWIND NTRA + CALL CGERU( M, N, ALPHA, XX, INCX, YY, INCY, AA, + $ LDA ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 140 + END IF +* +* See what data changed inside subroutine. 
+* + ISAME( 1 ) = MS.EQ.M + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LCE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + ISAME( 6 ) = LCE( YS, YY, LY ) + ISAME( 7 ) = INCYS.EQ.INCY + IF( NULL )THEN + ISAME( 8 ) = LCE( AS, AA, LAA ) + ELSE + ISAME( 8 ) = LCERES( 'GE', ' ', M, N, AS, AA, + $ LDA ) + END IF + ISAME( 9 ) = LDAS.EQ.LDA +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 140 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( INCX.GT.0 )THEN + DO 50 I = 1, M + Z( I ) = X( I ) + 50 CONTINUE + ELSE + DO 60 I = 1, M + Z( I ) = X( M - I + 1 ) + 60 CONTINUE + END IF + DO 70 J = 1, N + IF( INCY.GT.0 )THEN + W( 1 ) = Y( J ) + ELSE + W( 1 ) = Y( N - J + 1 ) + END IF + IF( CONJ ) + $ W( 1 ) = CONJG( W( 1 ) ) + CALL CMVCH( 'N', M, 1, ALPHA, Z, NMAX, W, 1, + $ ONE, A( 1, J ), 1, YT, G, + $ AA( 1 + ( J - 1 )*LDA ), EPS, + $ ERR, FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 130 + 70 CONTINUE + ELSE +* Avoid repeating tests with M.le.0 or N.le.0. + GO TO 110 + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 150 +* + 130 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 140 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9994 )NC, SNAME, M, N, ALPHA, INCX, INCY, LDA +* + 150 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A6, '(', 2( I3, ',' ), '(', F4.1, ',', F4.1, + $ '), X,', I2, ', Y,', I2, ', A,', I3, ') ', + $ ' .' ) + 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK4. +* + END + SUBROUTINE CCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z ) +* +* Tests CHER and CHPR. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX ZERO, HALF, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), HALF = ( 0.5, 0.0 ), + $ ONE = ( 1.0, 0.0 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. 
+ COMPLEX ALPHA, TRANSL + REAL ERR, ERRMAX, RALPHA, RALS + INTEGER I, IA, IC, IN, INCX, INCXS, IX, J, JA, JJ, LAA, + $ LDA, LDAS, LJ, LX, N, NARGS, NC, NS + LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER + CHARACTER*1 UPLO, UPLOS + CHARACTER*2 ICH +* .. Local Arrays .. + COMPLEX W( 1 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CHER, CHPR, CMAKE, CMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, CMPLX, CONJG, MAX, REAL +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 3: 3 ).EQ.'E' + PACKED = SNAME( 3: 3 ).EQ.'P' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 7 + ELSE IF( PACKED )THEN + NARGS = 6 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDA to 1 more than minimum value if room. + LDA = N + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF +* + DO 90 IC = 1, 2 + UPLO = ICH( IC: IC ) + UPPER = UPLO.EQ.'U' +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL CMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), + $ 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 70 IA = 1, NALF + RALPHA = REAL( ALF( IA ) ) + ALPHA = CMPLX( RALPHA, RZERO ) + NULL = N.LE.0.OR.RALPHA.EQ.RZERO +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL CMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, A, NMAX, + $ AA, LDA, N - 1, N - 1, RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + NS = N + RALS = RALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO, N, + $ RALPHA, INCX, LDA + IF( REWI ) + $ REWIND NTRA + CALL CHER( UPLO, N, RALPHA, XX, INCX, AA, LDA ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, N, + $ RALPHA, INCX + IF( REWI ) + $ REWIND NTRA + CALL CHPR( UPLO, N, RALPHA, XX, INCX, AA ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = RALS.EQ.RALPHA + ISAME( 4 ) = LCE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + IF( NULL )THEN + ISAME( 6 ) = LCE( AS, AA, LAA ) + ELSE + ISAME( 6 ) = LCERES( SNAME( 2: 3 ), UPLO, N, N, AS, + $ AA, LDA ) + END IF + IF( .NOT.PACKED )THEN + ISAME( 7 ) = LDAS.EQ.LDA + END IF +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 30 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 30 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. 
+* + IF( INCX.GT.0 )THEN + DO 40 I = 1, N + Z( I ) = X( I ) + 40 CONTINUE + ELSE + DO 50 I = 1, N + Z( I ) = X( N - I + 1 ) + 50 CONTINUE + END IF + JA = 1 + DO 60 J = 1, N + W( 1 ) = CONJG( Z( J ) ) + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + CALL CMVCH( 'N', LJ, 1, ALPHA, Z( JJ ), LJ, W, + $ 1, ONE, A( JJ, J ), 1, YT, G, + $ AA( JA ), EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + IF( FULL )THEN + IF( UPPER )THEN + JA = JA + LDA + ELSE + JA = JA + LDA + 1 + END IF + ELSE + JA = JA + LJ + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 110 + 60 CONTINUE + ELSE +* Avoid repeating tests if N.le.0. + IF( N.LE.0 ) + $ GO TO 100 + END IF +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 110 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, N, RALPHA, INCX, LDA + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, N, RALPHA, INCX + END IF +* + 130 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', X,', + $ I2, ', AP) .' ) + 9993 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', X,', + $ I2, ', A,', I3, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK5. +* + END + SUBROUTINE CCHK6( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z ) +* +* Tests CHER2 and CHPR2. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX ZERO, HALF, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), HALF = ( 0.5, 0.0 ), + $ ONE = ( 1.0, 0.0 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX, 2 ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + COMPLEX ALPHA, ALS, TRANSL + REAL ERR, ERRMAX + INTEGER I, IA, IC, IN, INCX, INCXS, INCY, INCYS, IX, + $ IY, J, JA, JJ, LAA, LDA, LDAS, LJ, LX, LY, N, + $ NARGS, NC, NS + LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER + CHARACTER*1 UPLO, UPLOS + CHARACTER*2 ICH +* .. Local Arrays .. + COMPLEX W( 2 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CHER2, CHPR2, CMAKE, CMVCH +* .. 
Intrinsic Functions .. + INTRINSIC ABS, CONJG, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 3: 3 ).EQ.'E' + PACKED = SNAME( 3: 3 ).EQ.'P' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 9 + ELSE IF( PACKED )THEN + NARGS = 8 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 140 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDA to 1 more than minimum value if room. + LDA = N + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 140 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF +* + DO 130 IC = 1, 2 + UPLO = ICH( IC: IC ) + UPPER = UPLO.EQ.'U' +* + DO 120 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL CMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), + $ 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 110 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL CMAKE( 'GE', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + Y( N/2 ) = ZERO + YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 100 IA = 1, NALF + ALPHA = ALF( IA ) + NULL = N.LE.0.OR.ALPHA.EQ.ZERO +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL CMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, A, + $ NMAX, AA, LDA, N - 1, N - 1, RESET, + $ TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO, N, + $ ALPHA, INCX, INCY, LDA + IF( REWI ) + $ REWIND NTRA + CALL CHER2( UPLO, N, ALPHA, XX, INCX, YY, INCY, + $ AA, LDA ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, N, + $ ALPHA, INCX, INCY + IF( REWI ) + $ REWIND NTRA + CALL CHPR2( UPLO, N, ALPHA, XX, INCX, YY, INCY, + $ AA ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 160 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LCE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + ISAME( 6 ) = LCE( YS, YY, LY ) + ISAME( 7 ) = INCYS.EQ.INCY + IF( NULL )THEN + ISAME( 8 ) = LCE( AS, AA, LAA ) + ELSE + ISAME( 8 ) = LCERES( SNAME( 2: 3 ), UPLO, N, N, + $ AS, AA, LDA ) + END IF + IF( .NOT.PACKED )THEN + ISAME( 9 ) = LDAS.EQ.LDA + END IF +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 160 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. 
+* + IF( INCX.GT.0 )THEN + DO 50 I = 1, N + Z( I, 1 ) = X( I ) + 50 CONTINUE + ELSE + DO 60 I = 1, N + Z( I, 1 ) = X( N - I + 1 ) + 60 CONTINUE + END IF + IF( INCY.GT.0 )THEN + DO 70 I = 1, N + Z( I, 2 ) = Y( I ) + 70 CONTINUE + ELSE + DO 80 I = 1, N + Z( I, 2 ) = Y( N - I + 1 ) + 80 CONTINUE + END IF + JA = 1 + DO 90 J = 1, N + W( 1 ) = ALPHA*CONJG( Z( J, 2 ) ) + W( 2 ) = CONJG( ALPHA )*CONJG( Z( J, 1 ) ) + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + CALL CMVCH( 'N', LJ, 2, ONE, Z( JJ, 1 ), + $ NMAX, W, 1, ONE, A( JJ, J ), 1, + $ YT, G, AA( JA ), EPS, ERR, FATAL, + $ NOUT, .TRUE. ) + IF( FULL )THEN + IF( UPPER )THEN + JA = JA + LDA + ELSE + JA = JA + LDA + 1 + END IF + ELSE + JA = JA + LJ + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 150 + 90 CONTINUE + ELSE +* Avoid repeating tests with N.le.0. + IF( N.LE.0 ) + $ GO TO 140 + END IF +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* + 140 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 170 +* + 150 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 160 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, N, ALPHA, INCX, + $ INCY, LDA + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, N, ALPHA, INCX, INCY + END IF +* + 170 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',(', F4.1, ',', + $ F4.1, '), X,', I2, ', Y,', I2, ', AP) ', + $ ' .' ) + 9993 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',(', F4.1, ',', + $ F4.1, '), X,', I2, ', Y,', I2, ', A,', I3, ') ', + $ ' .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK6. +* + END + SUBROUTINE CCHKE( ISNUM, SRNAMT, NOUT ) +* +* Tests the error exits from the Level 2 Blas. +* Requires a special version of the error-handling routine XERBLA. +* ALPHA, RALPHA, BETA, A, X and Y should not need to be defined. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER ISNUM, NOUT + CHARACTER*6 SRNAMT +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Local Scalars .. + COMPLEX ALPHA, BETA + REAL RALPHA +* .. Local Arrays .. + COMPLEX A( 1, 1 ), X( 1 ), Y( 1 ) +* .. External Subroutines .. + EXTERNAL CGBMV, CGEMV, CGERC, CGERU, CHBMV, CHEMV, CHER, + $ CHER2, CHKXER, CHPMV, CHPR, CHPR2, CTBMV, + $ CTBSV, CTPMV, CTPSV, CTRMV, CTRSV +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Executable Statements .. +* OK is set to .FALSE. by the special version of XERBLA or by CHKXER +* if anything is wrong. + OK = .TRUE. +* LERR is set to .TRUE. by the special version of XERBLA each time +* it is called, and is then tested and re-set by CHKXER. + LERR = .FALSE. 
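+* Branch to the block of tests for the ISNUM-th routine.  Each block
+* makes one call per argument that can be detected as invalid, with
+* INFOT set to the position of that argument; CHKXER then checks that
+* the special version of XERBLA was called and reported that position.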
+ GO TO ( 10, 20, 30, 40, 50, 60, 70, 80, + $ 90, 100, 110, 120, 130, 140, 150, 160, + $ 170 )ISNUM + 10 INFOT = 1 + CALL CGEMV( '/', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CGEMV( 'N', -1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CGEMV( 'N', 0, -1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CGEMV( 'N', 2, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CGEMV( 'N', 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CGEMV( 'N', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 20 INFOT = 1 + CALL CGBMV( '/', 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CGBMV( 'N', -1, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CGBMV( 'N', 0, -1, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CGBMV( 'N', 0, 0, -1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CGBMV( 'N', 2, 0, 0, -1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CGBMV( 'N', 0, 0, 1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CGBMV( 'N', 0, 0, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL CGBMV( 'N', 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 30 INFOT = 1 + CALL CHEMV( '/', 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CHEMV( 'U', -1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CHEMV( 'U', 2, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CHEMV( 'U', 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CHEMV( 'U', 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 40 INFOT = 1 + CALL CHBMV( '/', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CHBMV( 'U', -1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CHBMV( 'U', 0, -1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CHBMV( 'U', 0, 1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CHBMV( 'U', 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CHBMV( 'U', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 50 INFOT = 1 + CALL CHPMV( '/', 0, ALPHA, A, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CHPMV( 'U', -1, ALPHA, A, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CHPMV( 'U', 0, ALPHA, A, X, 0, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CHPMV( 'U', 0, ALPHA, A, X, 1, BETA, Y, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + 
GO TO 180 + 60 INFOT = 1 + CALL CTRMV( '/', 'N', 'N', 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CTRMV( 'U', '/', 'N', 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CTRMV( 'U', 'N', '/', 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CTRMV( 'U', 'N', 'N', -1, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRMV( 'U', 'N', 'N', 2, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CTRMV( 'U', 'N', 'N', 0, A, 1, X, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 70 INFOT = 1 + CALL CTBMV( '/', 'N', 'N', 0, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CTBMV( 'U', '/', 'N', 0, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CTBMV( 'U', 'N', '/', 0, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CTBMV( 'U', 'N', 'N', -1, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTBMV( 'U', 'N', 'N', 0, -1, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CTBMV( 'U', 'N', 'N', 0, 1, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTBMV( 'U', 'N', 'N', 0, 0, A, 1, X, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 80 INFOT = 1 + CALL CTPMV( '/', 'N', 'N', 0, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CTPMV( 'U', '/', 'N', 0, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CTPMV( 'U', 'N', '/', 0, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CTPMV( 'U', 'N', 'N', -1, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CTPMV( 'U', 'N', 'N', 0, A, X, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 90 INFOT = 1 + CALL CTRSV( '/', 'N', 'N', 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CTRSV( 'U', '/', 'N', 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CTRSV( 'U', 'N', '/', 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CTRSV( 'U', 'N', 'N', -1, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRSV( 'U', 'N', 'N', 2, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CTRSV( 'U', 'N', 'N', 0, A, 1, X, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 100 INFOT = 1 + CALL CTBSV( '/', 'N', 'N', 0, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CTBSV( 'U', '/', 'N', 0, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CTBSV( 'U', 'N', '/', 0, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CTBSV( 'U', 'N', 'N', -1, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTBSV( 'U', 'N', 'N', 0, -1, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CTBSV( 'U', 'N', 'N', 0, 1, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTBSV( 'U', 'N', 'N', 0, 0, A, 1, X, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 110 INFOT = 1 + CALL CTPSV( '/', 'N', 'N', 0, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CTPSV( 'U', '/', 'N', 0, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + 
INFOT = 3 + CALL CTPSV( 'U', 'N', '/', 0, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CTPSV( 'U', 'N', 'N', -1, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CTPSV( 'U', 'N', 'N', 0, A, X, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 120 INFOT = 1 + CALL CGERC( -1, 0, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CGERC( 0, -1, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CGERC( 0, 0, ALPHA, X, 0, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CGERC( 0, 0, ALPHA, X, 1, Y, 0, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CGERC( 2, 0, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 130 INFOT = 1 + CALL CGERU( -1, 0, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CGERU( 0, -1, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CGERU( 0, 0, ALPHA, X, 0, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CGERU( 0, 0, ALPHA, X, 1, Y, 0, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CGERU( 2, 0, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 140 INFOT = 1 + CALL CHER( '/', 0, RALPHA, X, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CHER( 'U', -1, RALPHA, X, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CHER( 'U', 0, RALPHA, X, 0, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CHER( 'U', 2, RALPHA, X, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 150 INFOT = 1 + CALL CHPR( '/', 0, RALPHA, X, 1, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CHPR( 'U', -1, RALPHA, X, 1, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CHPR( 'U', 0, RALPHA, X, 0, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 160 INFOT = 1 + CALL CHER2( '/', 0, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CHER2( 'U', -1, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CHER2( 'U', 0, ALPHA, X, 0, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CHER2( 'U', 0, ALPHA, X, 1, Y, 0, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CHER2( 'U', 2, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 170 INFOT = 1 + CALL CHPR2( '/', 0, ALPHA, X, 1, Y, 1, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CHPR2( 'U', -1, ALPHA, X, 1, Y, 1, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CHPR2( 'U', 0, ALPHA, X, 0, Y, 1, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CHPR2( 'U', 0, ALPHA, X, 1, Y, 0, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) +* + 180 IF( OK )THEN + WRITE( NOUT, FMT = 9999 )SRNAMT + ELSE + WRITE( NOUT, FMT = 9998 )SRNAMT + END IF + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE TESTS OF ERROR-EXITS' ) + 9998 FORMAT( ' ******* ', A6, ' FAILED THE TESTS OF ERROR-EXITS *****', + $ '**' ) +* +* End of CCHKE. 
+* + END + SUBROUTINE CMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, KL, + $ KU, RESET, TRANSL ) +* +* Generates values for an M by N matrix A within the bandwidth +* defined by KL and KU. +* Stores the values in the array AA in the data structure required +* by the routine, with unwanted elements set to rogue value. +* +* TYPE is 'GE', 'GB', 'HE', 'HB', 'HP', 'TR', 'TB' OR 'TP'. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX ZERO, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) + COMPLEX ROGUE + PARAMETER ( ROGUE = ( -1.0E10, 1.0E10 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) + REAL RROGUE + PARAMETER ( RROGUE = -1.0E10 ) +* .. Scalar Arguments .. + COMPLEX TRANSL + INTEGER KL, KU, LDA, M, N, NMAX + LOGICAL RESET + CHARACTER*1 DIAG, UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + COMPLEX A( NMAX, * ), AA( * ) +* .. Local Scalars .. + INTEGER I, I1, I2, I3, IBEG, IEND, IOFF, J, JJ, KK + LOGICAL GEN, LOWER, SYM, TRI, UNIT, UPPER +* .. External Functions .. + COMPLEX CBEG + EXTERNAL CBEG +* .. Intrinsic Functions .. + INTRINSIC CMPLX, CONJG, MAX, MIN, REAL +* .. Executable Statements .. + GEN = TYPE( 1: 1 ).EQ.'G' + SYM = TYPE( 1: 1 ).EQ.'H' + TRI = TYPE( 1: 1 ).EQ.'T' + UPPER = ( SYM.OR.TRI ).AND.UPLO.EQ.'U' + LOWER = ( SYM.OR.TRI ).AND.UPLO.EQ.'L' + UNIT = TRI.AND.DIAG.EQ.'U' +* +* Generate data in array A. +* + DO 20 J = 1, N + DO 10 I = 1, M + IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) + $ THEN + IF( ( I.LE.J.AND.J - I.LE.KU ).OR. + $ ( I.GE.J.AND.I - J.LE.KL ) )THEN + A( I, J ) = CBEG( RESET ) + TRANSL + ELSE + A( I, J ) = ZERO + END IF + IF( I.NE.J )THEN + IF( SYM )THEN + A( J, I ) = CONJG( A( I, J ) ) + ELSE IF( TRI )THEN + A( J, I ) = ZERO + END IF + END IF + END IF + 10 CONTINUE + IF( SYM ) + $ A( J, J ) = CMPLX( REAL( A( J, J ) ), RZERO ) + IF( TRI ) + $ A( J, J ) = A( J, J ) + ONE + IF( UNIT ) + $ A( J, J ) = ONE + 20 CONTINUE +* +* Store elements in array AS in data structure required by routine. 
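+* 'GE' uses full storage, 'GB' band storage, 'HE' and 'TR' full
+* triangular storage, 'HB' and 'TB' triangular band storage, and
+* 'HP' and 'TP' packed storage.  Elements of AA that the routine
+* under test must not reference are set to the rogue value so that
+* any use of them is likely to be detected.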
+* + IF( TYPE.EQ.'GE' )THEN + DO 50 J = 1, N + DO 30 I = 1, M + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 30 CONTINUE + DO 40 I = M + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 40 CONTINUE + 50 CONTINUE + ELSE IF( TYPE.EQ.'GB' )THEN + DO 90 J = 1, N + DO 60 I1 = 1, KU + 1 - J + AA( I1 + ( J - 1 )*LDA ) = ROGUE + 60 CONTINUE + DO 70 I2 = I1, MIN( KL + KU + 1, KU + 1 + M - J ) + AA( I2 + ( J - 1 )*LDA ) = A( I2 + J - KU - 1, J ) + 70 CONTINUE + DO 80 I3 = I2, LDA + AA( I3 + ( J - 1 )*LDA ) = ROGUE + 80 CONTINUE + 90 CONTINUE + ELSE IF( TYPE.EQ.'HE'.OR.TYPE.EQ.'TR' )THEN + DO 130 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IF( UNIT )THEN + IEND = J - 1 + ELSE + IEND = J + END IF + ELSE + IF( UNIT )THEN + IBEG = J + 1 + ELSE + IBEG = J + END IF + IEND = N + END IF + DO 100 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 100 CONTINUE + DO 110 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 110 CONTINUE + DO 120 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 120 CONTINUE + IF( SYM )THEN + JJ = J + ( J - 1 )*LDA + AA( JJ ) = CMPLX( REAL( AA( JJ ) ), RROGUE ) + END IF + 130 CONTINUE + ELSE IF( TYPE.EQ.'HB'.OR.TYPE.EQ.'TB' )THEN + DO 170 J = 1, N + IF( UPPER )THEN + KK = KL + 1 + IBEG = MAX( 1, KL + 2 - J ) + IF( UNIT )THEN + IEND = KL + ELSE + IEND = KL + 1 + END IF + ELSE + KK = 1 + IF( UNIT )THEN + IBEG = 2 + ELSE + IBEG = 1 + END IF + IEND = MIN( KL + 1, 1 + M - J ) + END IF + DO 140 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 140 CONTINUE + DO 150 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I + J - KK, J ) + 150 CONTINUE + DO 160 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 160 CONTINUE + IF( SYM )THEN + JJ = KK + ( J - 1 )*LDA + AA( JJ ) = CMPLX( REAL( AA( JJ ) ), RROGUE ) + END IF + 170 CONTINUE + ELSE IF( TYPE.EQ.'HP'.OR.TYPE.EQ.'TP' )THEN + IOFF = 0 + DO 190 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 180 I = IBEG, IEND + IOFF = IOFF + 1 + AA( IOFF ) = A( I, J ) + IF( I.EQ.J )THEN + IF( UNIT ) + $ AA( IOFF ) = ROGUE + IF( SYM ) + $ AA( IOFF ) = CMPLX( REAL( AA( IOFF ) ), RROGUE ) + END IF + 180 CONTINUE + 190 CONTINUE + END IF + RETURN +* +* End of CMAKE. +* + END + SUBROUTINE CMVCH( TRANS, M, N, ALPHA, A, NMAX, X, INCX, BETA, Y, + $ INCY, YT, G, YY, EPS, ERR, FATAL, NOUT, MV ) +* +* Checks the results of the computational tests. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0, 0.0 ) ) + REAL RZERO, RONE + PARAMETER ( RZERO = 0.0, RONE = 1.0 ) +* .. Scalar Arguments .. + COMPLEX ALPHA, BETA + REAL EPS, ERR + INTEGER INCX, INCY, M, N, NMAX, NOUT + LOGICAL FATAL, MV + CHARACTER*1 TRANS +* .. Array Arguments .. + COMPLEX A( NMAX, * ), X( * ), Y( * ), YT( * ), YY( * ) + REAL G( * ) +* .. Local Scalars .. + COMPLEX C + REAL ERRI + INTEGER I, INCXL, INCYL, IY, J, JX, KX, KY, ML, NL + LOGICAL CTRAN, TRAN +* .. Intrinsic Functions .. + INTRINSIC ABS, AIMAG, CONJG, MAX, REAL, SQRT +* .. Statement Functions .. + REAL ABS1 +* .. Statement Function definitions .. + ABS1( C ) = ABS( REAL( C ) ) + ABS( AIMAG( C ) ) +* .. Executable Statements .. 
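+* The expected result is accumulated in YT and, for each element,
+* G receives the sum of the absolute values of the terms that
+* contribute to it.  ERR is returned as the largest value of
+* ABS( YT( i ) - YY( i ) )/( EPS*G( i ) ), which is of order one
+* when the result has been computed in the conventional way.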
+ TRAN = TRANS.EQ.'T' + CTRAN = TRANS.EQ.'C' + IF( TRAN.OR.CTRAN )THEN + ML = N + NL = M + ELSE + ML = M + NL = N + END IF + IF( INCX.LT.0 )THEN + KX = NL + INCXL = -1 + ELSE + KX = 1 + INCXL = 1 + END IF + IF( INCY.LT.0 )THEN + KY = ML + INCYL = -1 + ELSE + KY = 1 + INCYL = 1 + END IF +* +* Compute expected result in YT using data in A, X and Y. +* Compute gauges in G. +* + IY = KY + DO 40 I = 1, ML + YT( IY ) = ZERO + G( IY ) = RZERO + JX = KX + IF( TRAN )THEN + DO 10 J = 1, NL + YT( IY ) = YT( IY ) + A( J, I )*X( JX ) + G( IY ) = G( IY ) + ABS1( A( J, I ) )*ABS1( X( JX ) ) + JX = JX + INCXL + 10 CONTINUE + ELSE IF( CTRAN )THEN + DO 20 J = 1, NL + YT( IY ) = YT( IY ) + CONJG( A( J, I ) )*X( JX ) + G( IY ) = G( IY ) + ABS1( A( J, I ) )*ABS1( X( JX ) ) + JX = JX + INCXL + 20 CONTINUE + ELSE + DO 30 J = 1, NL + YT( IY ) = YT( IY ) + A( I, J )*X( JX ) + G( IY ) = G( IY ) + ABS1( A( I, J ) )*ABS1( X( JX ) ) + JX = JX + INCXL + 30 CONTINUE + END IF + YT( IY ) = ALPHA*YT( IY ) + BETA*Y( IY ) + G( IY ) = ABS1( ALPHA )*G( IY ) + ABS1( BETA )*ABS1( Y( IY ) ) + IY = IY + INCYL + 40 CONTINUE +* +* Compute the error ratio for this result. +* + ERR = ZERO + DO 50 I = 1, ML + ERRI = ABS( YT( I ) - YY( 1 + ( I - 1 )*ABS( INCY ) ) )/EPS + IF( G( I ).NE.RZERO ) + $ ERRI = ERRI/G( I ) + ERR = MAX( ERR, ERRI ) + IF( ERR*SQRT( EPS ).GE.RONE ) + $ GO TO 60 + 50 CONTINUE +* If the loop completes, all results are at least half accurate. + GO TO 80 +* +* Report fatal error. +* + 60 FATAL = .TRUE. + WRITE( NOUT, FMT = 9999 ) + DO 70 I = 1, ML + IF( MV )THEN + WRITE( NOUT, FMT = 9998 )I, YT( I ), + $ YY( 1 + ( I - 1 )*ABS( INCY ) ) + ELSE + WRITE( NOUT, FMT = 9998 )I, + $ YY( 1 + ( I - 1 )*ABS( INCY ) ), YT( I ) + END IF + 70 CONTINUE +* + 80 CONTINUE + RETURN +* + 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', + $ 'F ACCURATE *******', /' EXPECTED RE', + $ 'SULT COMPUTED RESULT' ) + 9998 FORMAT( 1X, I7, 2( ' (', G15.6, ',', G15.6, ')' ) ) +* +* End of CMVCH. +* + END + LOGICAL FUNCTION LCE( RI, RJ, LR ) +* +* Tests if two arrays are identical. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER LR +* .. Array Arguments .. + COMPLEX RI( * ), RJ( * ) +* .. Local Scalars .. + INTEGER I +* .. Executable Statements .. + DO 10 I = 1, LR + IF( RI( I ).NE.RJ( I ) ) + $ GO TO 20 + 10 CONTINUE + LCE = .TRUE. + GO TO 30 + 20 CONTINUE + LCE = .FALSE. + 30 RETURN +* +* End of LCE. +* + END + LOGICAL FUNCTION LCERES( TYPE, UPLO, M, N, AA, AS, LDA ) +* +* Tests if selected elements in two arrays are equal. +* +* TYPE is 'GE', 'HE' or 'HP'. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER LDA, M, N + CHARACTER*1 UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + COMPLEX AA( LDA, * ), AS( LDA, * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J + LOGICAL UPPER +* .. Executable Statements .. 
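+* Only the elements of AA lying outside the part of the matrix that
+* the routine under test may change are compared with AS: for 'GE'
+* the rows beyond row M of each column, and for 'HE' the triangle
+* opposite to UPLO together with the rows beyond row N.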
+ UPPER = UPLO.EQ.'U' + IF( TYPE.EQ.'GE' )THEN + DO 20 J = 1, N + DO 10 I = M + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 10 CONTINUE + 20 CONTINUE + ELSE IF( TYPE.EQ.'HE' )THEN + DO 50 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 30 I = 1, IBEG - 1 + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 30 CONTINUE + DO 40 I = IEND + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 40 CONTINUE + 50 CONTINUE + END IF +* + 60 CONTINUE + LCERES = .TRUE. + GO TO 80 + 70 CONTINUE + LCERES = .FALSE. + 80 RETURN +* +* End of LCERES. +* + END + COMPLEX FUNCTION CBEG( RESET ) +* +* Generates complex numbers as pairs of random numbers uniformly +* distributed between -0.5 and 0.5. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + LOGICAL RESET +* .. Local Scalars .. + INTEGER I, IC, J, MI, MJ +* .. Save statement .. + SAVE I, IC, J, MI, MJ +* .. Intrinsic Functions .. + INTRINSIC CMPLX +* .. Executable Statements .. + IF( RESET )THEN +* Initialize local variables. + MI = 891 + MJ = 457 + I = 7 + J = 7 + IC = 0 + RESET = .FALSE. + END IF +* +* The sequence of values of I or J is bounded between 1 and 999. +* If initial I or J = 1,2,3,6,7 or 9, the period will be 50. +* If initial I or J = 4 or 8, the period will be 25. +* If initial I or J = 5, the period will be 10. +* IC is used to break up the period by skipping 1 value of I or J +* in 6. +* + IC = IC + 1 + 10 I = I*MI + J = J*MJ + I = I - 1000*( I/1000 ) + J = J - 1000*( J/1000 ) + IF( IC.GE.5 )THEN + IC = 0 + GO TO 10 + END IF + CBEG = CMPLX( ( I - 500 )/1001.0, ( J - 500 )/1001.0 ) + RETURN +* +* End of CBEG. +* + END + REAL FUNCTION SDIFF( X, Y ) +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* +* .. Scalar Arguments .. + REAL X, Y +* .. Executable Statements .. + SDIFF = X - Y + RETURN +* +* End of SDIFF. +* + END + SUBROUTINE CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) +* +* Tests whether XERBLA has detected an error when it should. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER INFOT, NOUT + LOGICAL LERR, OK + CHARACTER*6 SRNAMT +* .. Executable Statements .. + IF( .NOT.LERR )THEN + WRITE( NOUT, FMT = 9999 )INFOT, SRNAMT + OK = .FALSE. + END IF + LERR = .FALSE. + RETURN +* + 9999 FORMAT( ' ***** ILLEGAL VALUE OF PARAMETER NUMBER ', I2, ' NOT D', + $ 'ETECTED BY ', A6, ' *****' ) +* +* End of CHKXER. +* + END + SUBROUTINE XERBLA( SRNAME, INFO ) +* +* This is a special version of XERBLA to be used only as part of +* the test program for testing error exits from the Level 2 BLAS +* routines. +* +* XERBLA is an error handler for the Level 2 BLAS routines. +* +* It is called by the Level 2 BLAS routines if an input parameter is +* invalid. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER INFO + CHARACTER*6 SRNAME +* .. Scalars in Common .. + INTEGER INFOT, NOUT + LOGICAL LERR, OK + CHARACTER*6 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUT, OK, LERR + COMMON /SRNAMC/SRNAMT +* .. Executable Statements .. 
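+* Record that XERBLA has been called, then compare the reported
+* parameter number and routine name with the values the test program
+* expects; OK is set false if either differs.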
+ LERR = .TRUE. + IF( INFO.NE.INFOT )THEN + IF( INFOT.NE.0 )THEN + WRITE( NOUT, FMT = 9999 )INFO, INFOT + ELSE + WRITE( NOUT, FMT = 9997 )INFO + END IF + OK = .FALSE. + END IF + IF( SRNAME.NE.SRNAMT )THEN + WRITE( NOUT, FMT = 9998 )SRNAME, SRNAMT + OK = .FALSE. + END IF + RETURN +* + 9999 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, ' INSTEAD', + $ ' OF ', I2, ' *******' ) + 9998 FORMAT( ' ******* XERBLA WAS CALLED WITH SRNAME = ', A6, ' INSTE', + $ 'AD OF ', A6, ' *******' ) + 9997 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, + $ ' *******' ) +* +* End of XERBLA +* + END + diff --git a/test/cblat3.dat b/test/cblat3.dat new file mode 100644 index 0000000000..72c00b98f8 --- /dev/null +++ b/test/cblat3.dat @@ -0,0 +1,23 @@ +'CBLAT3.SUMM' NAME OF SUMMARY OUTPUT FILE +6 UNIT NUMBER OF SUMMARY FILE +'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +F LOGICAL FLAG, T TO STOP ON FAILURES. +F LOGICAL FLAG, T TO TEST ERROR EXITS. +16.0 THRESHOLD VALUE OF TEST RATIO +6 NUMBER OF VALUES OF N +0 1 2 3 7 31 63 VALUES OF N +3 NUMBER OF VALUES OF ALPHA +(0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +3 NUMBER OF VALUES OF BETA +(0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +CGEMM T PUT F FOR NO TEST. SAME COLUMNS. +CHEMM T PUT F FOR NO TEST. SAME COLUMNS. +CSYMM T PUT F FOR NO TEST. SAME COLUMNS. +CTRMM T PUT F FOR NO TEST. SAME COLUMNS. +CTRSM T PUT F FOR NO TEST. SAME COLUMNS. +CHERK T PUT F FOR NO TEST. SAME COLUMNS. +CSYRK T PUT F FOR NO TEST. SAME COLUMNS. +CHER2K T PUT F FOR NO TEST. SAME COLUMNS. +CSYR2K T PUT F FOR NO TEST. SAME COLUMNS. diff --git a/test/cblat3.f b/test/cblat3.f new file mode 100644 index 0000000000..b26be91e6b --- /dev/null +++ b/test/cblat3.f @@ -0,0 +1,3439 @@ + PROGRAM CBLAT3 +* +* Test program for the COMPLEX Level 3 Blas. +* +* The program must be driven by a short data file. The first 14 records +* of the file are read using list-directed input, the last 9 records +* are read using the format ( A6, L2 ). An annotated example of a data +* file can be obtained by deleting the first 3 characters from the +* following 23 lines: +* 'CBLAT3.SUMM' NAME OF SUMMARY OUTPUT FILE +* 6 UNIT NUMBER OF SUMMARY FILE +* 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +* F LOGICAL FLAG, T TO STOP ON FAILURES. +* T LOGICAL FLAG, T TO TEST ERROR EXITS. +* 16.0 THRESHOLD VALUE OF TEST RATIO +* 6 NUMBER OF VALUES OF N +* 0 1 2 3 5 9 VALUES OF N +* 3 NUMBER OF VALUES OF ALPHA +* (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +* 3 NUMBER OF VALUES OF BETA +* (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +* CGEMM T PUT F FOR NO TEST. SAME COLUMNS. +* CHEMM T PUT F FOR NO TEST. SAME COLUMNS. +* CSYMM T PUT F FOR NO TEST. SAME COLUMNS. +* CTRMM T PUT F FOR NO TEST. SAME COLUMNS. +* CTRSM T PUT F FOR NO TEST. SAME COLUMNS. +* CHERK T PUT F FOR NO TEST. SAME COLUMNS. +* CSYRK T PUT F FOR NO TEST. SAME COLUMNS. +* CHER2K T PUT F FOR NO TEST. SAME COLUMNS. +* CSYR2K T PUT F FOR NO TEST. SAME COLUMNS. +* +* See: +* +* Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. +* A Set of Level 3 Basic Linear Algebra Subprograms. +* +* Technical Memorandum No.88 (Revision 1), Mathematics and +* Computer Science Division, Argonne National Laboratory, 9700 +* South Cass Avenue, Argonne, Illinois 60439, US. +* +* -- Written on 8-February-1989. 
+* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + INTEGER NIN + PARAMETER ( NIN = 5 ) + INTEGER NSUBS + PARAMETER ( NSUBS = 9 ) + COMPLEX ZERO, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) + REAL RZERO, RHALF, RONE + PARAMETER ( RZERO = 0.0, RHALF = 0.5, RONE = 1.0 ) + INTEGER NMAX + PARAMETER ( NMAX = 65 ) + INTEGER NIDMAX, NALMAX, NBEMAX + PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) +* .. Local Scalars .. + REAL EPS, ERR, THRESH + INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NOUT, NTRA + LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, + $ TSTERR + CHARACTER*1 TRANSA, TRANSB + CHARACTER*6 SNAMET + CHARACTER*32 SNAPS, SUMMRY +* .. Local Arrays .. + COMPLEX AA( NMAX*NMAX ), AB( NMAX, 2*NMAX ), + $ ALF( NALMAX ), AS( NMAX*NMAX ), + $ BB( NMAX*NMAX ), BET( NBEMAX ), + $ BS( NMAX*NMAX ), C( NMAX, NMAX ), + $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), + $ W( 2*NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDMAX ) + LOGICAL LTEST( NSUBS ) + CHARACTER*6 SNAMES( NSUBS ) +* .. External Functions .. + REAL SDIFF + LOGICAL LCE + EXTERNAL SDIFF, LCE +* .. External Subroutines .. + EXTERNAL CCHK1, CCHK2, CCHK3, CCHK4, CCHK5, CCHKE, CMMCH +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK + CHARACTER*6 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR + COMMON /SRNAMC/SRNAMT +* .. Data statements .. + DATA SNAMES/'CGEMM ', 'CHEMM ', 'CSYMM ', 'CTRMM ', + $ 'CTRSM ', 'CHERK ', 'CSYRK ', 'CHER2K', + $ 'CSYR2K'/ +* .. Executable Statements .. +* +* Read name and unit number for summary output file and open file. +* + READ( NIN, FMT = * )SUMMRY + READ( NIN, FMT = * )NOUT + OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + NOUTC = NOUT +* +* Read name and unit number for snapshot output file and open file. +* + READ( NIN, FMT = * )SNAPS + READ( NIN, FMT = * )NTRA + TRACE = NTRA.GE.0 + IF( TRACE )THEN + OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + END IF +* Read the flag that directs rewinding of the snapshot file. + READ( NIN, FMT = * )REWI + REWI = REWI.AND.TRACE +* Read the flag that directs stopping on any failure. + READ( NIN, FMT = * )SFATAL +* Read the flag that indicates whether error exits are to be tested. + READ( NIN, FMT = * )TSTERR +* Read the threshold value of the test ratio + READ( NIN, FMT = * )THRESH +* +* Read and check the parameter values for the tests. +* +* Values of N + READ( NIN, FMT = * )NIDIM + IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN + WRITE( NOUT, FMT = 9997 )'N', NIDMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) + DO 10 I = 1, NIDIM + IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN + WRITE( NOUT, FMT = 9996 )NMAX + GO TO 220 + END IF + 10 CONTINUE +* Values of ALPHA + READ( NIN, FMT = * )NALF + IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN + WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) +* Values of BETA + READ( NIN, FMT = * )NBET + IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN + WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) +* +* Report values of parameters. 
+* + WRITE( NOUT, FMT = 9995 ) + WRITE( NOUT, FMT = 9994 )( IDIM( I ), I = 1, NIDIM ) + WRITE( NOUT, FMT = 9993 )( ALF( I ), I = 1, NALF ) + WRITE( NOUT, FMT = 9992 )( BET( I ), I = 1, NBET ) + IF( .NOT.TSTERR )THEN + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9984 ) + END IF + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9999 )THRESH + WRITE( NOUT, FMT = * ) +* +* Read names of subroutines and flags which indicate +* whether they are to be tested. +* + DO 20 I = 1, NSUBS + LTEST( I ) = .FALSE. + 20 CONTINUE + 30 READ( NIN, FMT = 9988, END = 60 )SNAMET, LTESTT + DO 40 I = 1, NSUBS + IF( SNAMET.EQ.SNAMES( I ) ) + $ GO TO 50 + 40 CONTINUE + WRITE( NOUT, FMT = 9990 )SNAMET + STOP + 50 LTEST( I ) = LTESTT + GO TO 30 +* + 60 CONTINUE + CLOSE ( NIN ) +* +* Compute EPS (the machine precision). +* + EPS = RONE + 70 CONTINUE + IF( SDIFF( RONE + EPS, RONE ).EQ.RZERO ) + $ GO TO 80 + EPS = RHALF*EPS + GO TO 70 + 80 CONTINUE + EPS = EPS + EPS + WRITE( NOUT, FMT = 9998 )EPS +* +* Check the reliability of CMMCH using exact data. +* + N = MIN( 32, NMAX ) + DO 100 J = 1, N + DO 90 I = 1, N + AB( I, J ) = MAX( I - J + 1, 0 ) + 90 CONTINUE + AB( J, NMAX + 1 ) = J + AB( 1, NMAX + J ) = J + C( J, 1 ) = ZERO + 100 CONTINUE + DO 110 J = 1, N + CC( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 + 110 CONTINUE +* CC holds the exact result. On exit from CMMCH CT holds +* the result computed by CMMCH. + TRANSA = 'N' + TRANSB = 'N' + CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LCE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + TRANSB = 'C' + CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LCE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + DO 120 J = 1, N + AB( J, NMAX + 1 ) = N - J + 1 + AB( 1, NMAX + J ) = N - J + 1 + 120 CONTINUE + DO 130 J = 1, N + CC( N - J + 1 ) = J*( ( J + 1 )*J )/2 - + $ ( ( J + 1 )*J*( J - 1 ) )/3 + 130 CONTINUE + TRANSA = 'C' + TRANSB = 'N' + CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LCE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + TRANSB = 'C' + CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LCE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF +* +* Test each subroutine in turn. +* + DO 200 ISNUM = 1, NSUBS + WRITE( NOUT, FMT = * ) + IF( .NOT.LTEST( ISNUM ) )THEN +* Subprogram is not to be tested. + WRITE( NOUT, FMT = 9987 )SNAMES( ISNUM ) + ELSE + SRNAMT = SNAMES( ISNUM ) +* Test error exits. + IF( TSTERR )THEN + CALL CCHKE( ISNUM, SNAMES( ISNUM ), NOUT ) + WRITE( NOUT, FMT = * ) + END IF +* Test computations. + INFOT = 0 + OK = .TRUE. + FATAL = .FALSE. + GO TO ( 140, 150, 150, 160, 160, 170, 170, + $ 180, 180 )ISNUM +* Test CGEMM, 01. 
+ 140 CALL CCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G ) + GO TO 190 +* Test CHEMM, 02, CSYMM, 03. + 150 CALL CCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G ) + GO TO 190 +* Test CTRMM, 04, CTRSM, 05. + 160 CALL CCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, + $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C ) + GO TO 190 +* Test CHERK, 06, CSYRK, 07. + 170 CALL CCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G ) + GO TO 190 +* Test CHER2K, 08, CSYR2K, 09. + 180 CALL CCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W ) + GO TO 190 +* + 190 IF( FATAL.AND.SFATAL ) + $ GO TO 210 + END IF + 200 CONTINUE + WRITE( NOUT, FMT = 9986 ) + GO TO 230 +* + 210 CONTINUE + WRITE( NOUT, FMT = 9985 ) + GO TO 230 +* + 220 CONTINUE + WRITE( NOUT, FMT = 9991 ) +* + 230 CONTINUE + IF( TRACE ) + $ CLOSE ( NTRA ) + CLOSE ( NOUT ) + STOP +* + 9999 FORMAT( ' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', + $ 'S THAN', F8.2 ) + 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, E9.1 ) + 9997 FORMAT( ' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', + $ 'THAN ', I2 ) + 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) + 9995 FORMAT( ' TESTS OF THE COMPLEX LEVEL 3 BLAS', //' THE F', + $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) + 9994 FORMAT( ' FOR N ', 9I6 ) + 9993 FORMAT( ' FOR ALPHA ', + $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) + 9992 FORMAT( ' FOR BETA ', + $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) + 9991 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', + $ /' ******* TESTS ABANDONED *******' ) + 9990 FORMAT( ' SUBPROGRAM NAME ', A6, ' NOT RECOGNIZED', /' ******* T', + $ 'ESTS ABANDONED *******' ) + 9989 FORMAT( ' ERROR IN CMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', + $ 'ATED WRONGLY.', /' CMMCH WAS CALLED WITH TRANSA = ', A1, + $ ' AND TRANSB = ', A1, /' AND RETURNED SAME = ', L1, ' AND ', + $ 'ERR = ', F12.3, '.', /' THIS MAY BE DUE TO FAULTS IN THE ', + $ 'ARITHMETIC OR THE COMPILER.', /' ******* TESTS ABANDONED ', + $ '*******' ) + 9988 FORMAT( A6, L2 ) + 9987 FORMAT( 1X, A6, ' WAS NOT TESTED' ) + 9986 FORMAT( /' END OF TESTS' ) + 9985 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) + 9984 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) +* +* End of CBLAT3. +* + END + SUBROUTINE CCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G ) +* +* Tests CGEMM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0, 0.0 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. 
+ COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX ALPHA, ALS, BETA, BLS + REAL ERR, ERRMAX + INTEGER I, IA, IB, ICA, ICB, IK, IM, IN, K, KS, LAA, + $ LBB, LCC, LDA, LDAS, LDB, LDBS, LDC, LDCS, M, + $ MA, MB, MS, N, NA, NARGS, NB, NC, NS + LOGICAL NULL, RESET, SAME, TRANA, TRANB + CHARACTER*1 TRANAS, TRANBS, TRANSA, TRANSB + CHARACTER*3 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CGEMM, CMAKE, CMMCH +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICH/'NTC'/ +* .. Executable Statements .. +* + NARGS = 13 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 110 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = M + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 100 + LCC = LDC*N + NULL = N.LE.0.OR.M.LE.0 +* + DO 90 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 80 ICA = 1, 3 + TRANSA = ICH( ICA: ICA ) + TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' +* + IF( TRANA )THEN + MA = K + NA = M + ELSE + MA = M + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* +* Generate the matrix A. +* + CALL CMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 70 ICB = 1, 3 + TRANSB = ICH( ICB: ICB ) + TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' +* + IF( TRANB )THEN + MB = N + NB = K + ELSE + MB = K + NB = N + END IF +* Set LDB to 1 more than minimum value if room. + LDB = MB + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 70 + LBB = LDB*NB +* +* Generate the matrix B. +* + CALL CMAKE( 'GE', ' ', ' ', MB, NB, B, NMAX, BB, + $ LDB, RESET, ZERO ) +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL CMAKE( 'GE', ' ', ' ', M, N, C, NMAX, + $ CC, LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + TRANAS = TRANSA + TRANBS = TRANSB + MS = M + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BLS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ TRANSA, TRANSB, M, N, K, ALPHA, LDA, LDB, + $ BETA, LDC + IF( REWI ) + $ REWIND NTRA + CALL CGEMM( TRANSA, TRANSB, M, N, K, ALPHA, + $ AA, LDA, BB, LDB, BETA, CC, LDC ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. 
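+* Each argument is compared with the copy saved before the call;
+* only the output array CC may differ from its copy, and LCERES
+* confirms that it has changed only within its leading M by N part.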
+* + ISAME( 1 ) = TRANSA.EQ.TRANAS + ISAME( 2 ) = TRANSB.EQ.TRANBS + ISAME( 3 ) = MS.EQ.M + ISAME( 4 ) = NS.EQ.N + ISAME( 5 ) = KS.EQ.K + ISAME( 6 ) = ALS.EQ.ALPHA + ISAME( 7 ) = LCE( AS, AA, LAA ) + ISAME( 8 ) = LDAS.EQ.LDA + ISAME( 9 ) = LCE( BS, BB, LBB ) + ISAME( 10 ) = LDBS.EQ.LDB + ISAME( 11 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 12 ) = LCE( CS, CC, LCC ) + ELSE + ISAME( 12 ) = LCERES( 'GE', ' ', M, N, CS, + $ CC, LDC ) + END IF + ISAME( 13 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report +* and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL CMMCH( TRANSA, TRANSB, M, N, K, + $ ALPHA, A, NMAX, B, NMAX, BETA, + $ C, NMAX, CT, G, CC, LDC, EPS, + $ ERR, FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 120 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9995 )NC, SNAME, TRANSA, TRANSB, M, N, K, + $ ALPHA, LDA, LDB, BETA, LDC +* + 130 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',''', A1, ''',', + $ 3( I3, ',' ), '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, + $ ',(', F4.1, ',', F4.1, '), C,', I3, ').' ) + 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK1. +* + END + SUBROUTINE CCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G ) +* +* Tests CHEMM and CSYMM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0, 0.0 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX ALPHA, ALS, BETA, BLS + REAL ERR, ERRMAX + INTEGER I, IA, IB, ICS, ICU, IM, IN, LAA, LBB, LCC, + $ LDA, LDAS, LDB, LDBS, LDC, LDCS, M, MS, N, NA, + $ NARGS, NC, NS + LOGICAL CONJ, LEFT, NULL, RESET, SAME + CHARACTER*1 SIDE, SIDES, UPLO, UPLOS + CHARACTER*2 ICHS, ICHU +* .. Local Arrays .. 
+ LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CHEMM, CMAKE, CMMCH, CSYMM +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHS/'LR'/, ICHU/'UL'/ +* .. Executable Statements .. + CONJ = SNAME( 2: 3 ).EQ.'HE' +* + NARGS = 12 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 100 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 90 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = M + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 90 + LCC = LDC*N + NULL = N.LE.0.OR.M.LE.0 +* Set LDB to 1 more than minimum value if room. + LDB = M + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 90 + LBB = LDB*N +* +* Generate the matrix B. +* + CALL CMAKE( 'GE', ' ', ' ', M, N, B, NMAX, BB, LDB, RESET, + $ ZERO ) +* + DO 80 ICS = 1, 2 + SIDE = ICHS( ICS: ICS ) + LEFT = SIDE.EQ.'L' +* + IF( LEFT )THEN + NA = M + ELSE + NA = N + END IF +* Set LDA to 1 more than minimum value if room. + LDA = NA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* + DO 70 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* +* Generate the hermitian or symmetric matrix A. +* + CALL CMAKE( SNAME( 2: 3 ), UPLO, ' ', NA, NA, A, NMAX, + $ AA, LDA, RESET, ZERO ) +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL CMAKE( 'GE', ' ', ' ', M, N, C, NMAX, CC, + $ LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + SIDES = SIDE + UPLOS = UPLO + MS = M + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BLS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, SIDE, + $ UPLO, M, N, ALPHA, LDA, LDB, BETA, LDC + IF( REWI ) + $ REWIND NTRA + IF( CONJ )THEN + CALL CHEMM( SIDE, UPLO, M, N, ALPHA, AA, LDA, + $ BB, LDB, BETA, CC, LDC ) + ELSE + CALL CSYMM( SIDE, UPLO, M, N, ALPHA, AA, LDA, + $ BB, LDB, BETA, CC, LDC ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 110 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = SIDES.EQ.SIDE + ISAME( 2 ) = UPLOS.EQ.UPLO + ISAME( 3 ) = MS.EQ.M + ISAME( 4 ) = NS.EQ.N + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LCE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = LCE( BS, BB, LBB ) + ISAME( 9 ) = LDBS.EQ.LDB + ISAME( 10 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 11 ) = LCE( CS, CC, LCC ) + ELSE + ISAME( 11 ) = LCERES( 'GE', ' ', M, N, CS, + $ CC, LDC ) + END IF + ISAME( 12 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 110 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + IF( LEFT )THEN + CALL CMMCH( 'N', 'N', M, N, M, ALPHA, A, + $ NMAX, B, NMAX, BETA, C, NMAX, + $ CT, G, CC, LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. 
) + ELSE + CALL CMMCH( 'N', 'N', M, N, N, ALPHA, B, + $ NMAX, A, NMAX, BETA, C, NMAX, + $ CT, G, CC, LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 120 +* + 110 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9995 )NC, SNAME, SIDE, UPLO, M, N, ALPHA, LDA, + $ LDB, BETA, LDC +* + 120 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, + $ ',', F4.1, '), C,', I3, ') .' ) + 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK2. +* + END + SUBROUTINE CCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NMAX, A, AA, AS, + $ B, BB, BS, CT, G, C ) +* +* Tests CTRMM and CTRSM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX ZERO, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NIDIM, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CT( NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX ALPHA, ALS + REAL ERR, ERRMAX + INTEGER I, IA, ICD, ICS, ICT, ICU, IM, IN, J, LAA, LBB, + $ LDA, LDAS, LDB, LDBS, M, MS, N, NA, NARGS, NC, + $ NS + LOGICAL LEFT, NULL, RESET, SAME + CHARACTER*1 DIAG, DIAGS, SIDE, SIDES, TRANAS, TRANSA, UPLO, + $ UPLOS + CHARACTER*2 ICHD, ICHS, ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CMAKE, CMMCH, CTRMM, CTRSM +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/, ICHS/'LR'/ +* .. Executable Statements .. +* + NARGS = 11 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* Set up zero matrix for CMMCH. + DO 20 J = 1, NMAX + DO 10 I = 1, NMAX + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE +* + DO 140 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 130 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDB to 1 more than minimum value if room. 
+ LDB = M + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 130 + LBB = LDB*N + NULL = M.LE.0.OR.N.LE.0 +* + DO 120 ICS = 1, 2 + SIDE = ICHS( ICS: ICS ) + LEFT = SIDE.EQ.'L' + IF( LEFT )THEN + NA = M + ELSE + NA = N + END IF +* Set LDA to 1 more than minimum value if room. + LDA = NA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 130 + LAA = LDA*NA +* + DO 110 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* + DO 100 ICT = 1, 3 + TRANSA = ICHT( ICT: ICT ) +* + DO 90 ICD = 1, 2 + DIAG = ICHD( ICD: ICD ) +* + DO 80 IA = 1, NALF + ALPHA = ALF( IA ) +* +* Generate the matrix A. +* + CALL CMAKE( 'TR', UPLO, DIAG, NA, NA, A, + $ NMAX, AA, LDA, RESET, ZERO ) +* +* Generate the matrix B. +* + CALL CMAKE( 'GE', ' ', ' ', M, N, B, NMAX, + $ BB, LDB, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + SIDES = SIDE + UPLOS = UPLO + TRANAS = TRANSA + DIAGS = DIAG + MS = M + NS = N + ALS = ALPHA + DO 30 I = 1, LAA + AS( I ) = AA( I ) + 30 CONTINUE + LDAS = LDA + DO 40 I = 1, LBB + BS( I ) = BB( I ) + 40 CONTINUE + LDBS = LDB +* +* Call the subroutine. +* + IF( SNAME( 4: 5 ).EQ.'MM' )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, + $ LDA, LDB + IF( REWI ) + $ REWIND NTRA + CALL CTRMM( SIDE, UPLO, TRANSA, DIAG, M, + $ N, ALPHA, AA, LDA, BB, LDB ) + ELSE IF( SNAME( 4: 5 ).EQ.'SM' )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, + $ LDA, LDB + IF( REWI ) + $ REWIND NTRA + CALL CTRSM( SIDE, UPLO, TRANSA, DIAG, M, + $ N, ALPHA, AA, LDA, BB, LDB ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 150 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = SIDES.EQ.SIDE + ISAME( 2 ) = UPLOS.EQ.UPLO + ISAME( 3 ) = TRANAS.EQ.TRANSA + ISAME( 4 ) = DIAGS.EQ.DIAG + ISAME( 5 ) = MS.EQ.M + ISAME( 6 ) = NS.EQ.N + ISAME( 7 ) = ALS.EQ.ALPHA + ISAME( 8 ) = LCE( AS, AA, LAA ) + ISAME( 9 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 10 ) = LCE( BS, BB, LBB ) + ELSE + ISAME( 10 ) = LCERES( 'GE', ' ', M, N, BS, + $ BB, LDB ) + END IF + ISAME( 11 ) = LDBS.EQ.LDB +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 50 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 50 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 150 + END IF +* + IF( .NOT.NULL )THEN + IF( SNAME( 4: 5 ).EQ.'MM' )THEN +* +* Check the result. +* + IF( LEFT )THEN + CALL CMMCH( TRANSA, 'N', M, N, M, + $ ALPHA, A, NMAX, B, NMAX, + $ ZERO, C, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL CMMCH( 'N', TRANSA, M, N, N, + $ ALPHA, B, NMAX, A, NMAX, + $ ZERO, C, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + ELSE IF( SNAME( 4: 5 ).EQ.'SM' )THEN +* +* Compute approximation to original +* matrix. +* + DO 70 J = 1, N + DO 60 I = 1, M + C( I, J ) = BB( I + ( J - 1 )* + $ LDB ) + BB( I + ( J - 1 )*LDB ) = ALPHA* + $ B( I, J ) + 60 CONTINUE + 70 CONTINUE +* + IF( LEFT )THEN + CALL CMMCH( TRANSA, 'N', M, N, M, + $ ONE, A, NMAX, C, NMAX, + $ ZERO, B, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .FALSE. ) + ELSE + CALL CMMCH( 'N', TRANSA, M, N, N, + $ ONE, C, NMAX, A, NMAX, + $ ZERO, B, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .FALSE. 
) + END IF + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 150 + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* + 140 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 160 +* + 150 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9995 )NC, SNAME, SIDE, UPLO, TRANSA, DIAG, M, + $ N, ALPHA, LDA, LDB +* + 160 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A6, '(', 4( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ') ', + $ ' .' ) + 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK3. +* + END + SUBROUTINE CCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G ) +* +* Tests CHERK and CSYRK. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0, 0.0 ) ) + REAL RONE, RZERO + PARAMETER ( RONE = 1.0, RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + COMPLEX A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX ALPHA, ALS, BETA, BETS + REAL ERR, ERRMAX, RALPHA, RALS, RBETA, RBETS + INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, K, KS, + $ LAA, LCC, LDA, LDAS, LDC, LDCS, LJ, MA, N, NA, + $ NARGS, NC, NS + LOGICAL CONJ, NULL, RESET, SAME, TRAN, UPPER + CHARACTER*1 TRANS, TRANSS, TRANST, UPLO, UPLOS + CHARACTER*2 ICHT, ICHU +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CHERK, CMAKE, CMMCH, CSYRK +* .. Intrinsic Functions .. + INTRINSIC CMPLX, MAX, REAL +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHT/'NC'/, ICHU/'UL'/ +* .. Executable Statements .. + CONJ = SNAME( 2: 3 ).EQ.'HE' +* + NARGS = 10 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = N + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. 
+ IF( LDC.GT.NMAX ) + $ GO TO 100 + LCC = LDC*N +* + DO 90 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 80 ICT = 1, 2 + TRANS = ICHT( ICT: ICT ) + TRAN = TRANS.EQ.'C' + IF( TRAN.AND..NOT.CONJ ) + $ TRANS = 'T' + IF( TRAN )THEN + MA = K + NA = N + ELSE + MA = N + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* +* Generate the matrix A. +* + CALL CMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 70 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + UPPER = UPLO.EQ.'U' +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) + IF( CONJ )THEN + RALPHA = REAL( ALPHA ) + ALPHA = CMPLX( RALPHA, RZERO ) + END IF +* + DO 50 IB = 1, NBET + BETA = BET( IB ) + IF( CONJ )THEN + RBETA = REAL( BETA ) + BETA = CMPLX( RBETA, RZERO ) + END IF + NULL = N.LE.0 + IF( CONJ ) + $ NULL = NULL.OR.( ( K.LE.0.OR.RALPHA.EQ. + $ RZERO ).AND.RBETA.EQ.RONE ) +* +* Generate the matrix C. +* + CALL CMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, C, + $ NMAX, CC, LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + NS = N + KS = K + IF( CONJ )THEN + RALS = RALPHA + ELSE + ALS = ALPHA + END IF + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + IF( CONJ )THEN + RBETS = RBETA + ELSE + BETS = BETA + END IF + DO 20 I = 1, LCC + CS( I ) = CC( I ) + 20 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( CONJ )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, + $ TRANS, N, K, RALPHA, LDA, RBETA, LDC + IF( REWI ) + $ REWIND NTRA + CALL CHERK( UPLO, TRANS, N, K, RALPHA, AA, + $ LDA, RBETA, CC, LDC ) + ELSE + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO, + $ TRANS, N, K, ALPHA, LDA, BETA, LDC + IF( REWI ) + $ REWIND NTRA + CALL CSYRK( UPLO, TRANS, N, K, ALPHA, AA, + $ LDA, BETA, CC, LDC ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLOS.EQ.UPLO + ISAME( 2 ) = TRANSS.EQ.TRANS + ISAME( 3 ) = NS.EQ.N + ISAME( 4 ) = KS.EQ.K + IF( CONJ )THEN + ISAME( 5 ) = RALS.EQ.RALPHA + ELSE + ISAME( 5 ) = ALS.EQ.ALPHA + END IF + ISAME( 6 ) = LCE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + IF( CONJ )THEN + ISAME( 8 ) = RBETS.EQ.RBETA + ELSE + ISAME( 8 ) = BETS.EQ.BETA + END IF + IF( NULL )THEN + ISAME( 9 ) = LCE( CS, CC, LCC ) + ELSE + ISAME( 9 ) = LCERES( SNAME( 2: 3 ), UPLO, N, + $ N, CS, CC, LDC ) + END IF + ISAME( 10 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 30 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 30 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( CONJ )THEN + TRANST = 'C' + ELSE + TRANST = 'T' + END IF + JC = 1 + DO 40 J = 1, N + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + IF( TRAN )THEN + CALL CMMCH( TRANST, 'N', LJ, 1, K, + $ ALPHA, A( 1, JJ ), NMAX, + $ A( 1, J ), NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL CMMCH( 'N', TRANST, LJ, 1, K, + $ ALPHA, A( JJ, 1 ), NMAX, + $ A( J, 1 ), NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. 
) + END IF + IF( UPPER )THEN + JC = JC + LDC + ELSE + JC = JC + LDC + 1 + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 110 + 40 CONTINUE + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 110 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9995 )J +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( CONJ )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, N, K, RALPHA, + $ LDA, RBETA, LDC + ELSE + WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, TRANS, N, K, ALPHA, + $ LDA, BETA, LDC + END IF +* + 130 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ') ', + $ ' .' ) + 9993 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, ') , A,', I3, ',(', F4.1, ',', F4.1, + $ '), C,', I3, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK4. +* + END + SUBROUTINE CCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ AB, AA, AS, BB, BS, C, CC, CS, CT, G, W ) +* +* Tests CHER2K and CSYR2K. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX ZERO, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) + REAL RONE, RZERO + PARAMETER ( RONE = 1.0, RZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + COMPLEX AA( NMAX*NMAX ), AB( 2*NMAX*NMAX ), + $ ALF( NALF ), AS( NMAX*NMAX ), BB( NMAX*NMAX ), + $ BET( NBET ), BS( NMAX*NMAX ), C( NMAX, NMAX ), + $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), + $ W( 2*NMAX ) + REAL G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX ALPHA, ALS, BETA, BETS + REAL ERR, ERRMAX, RBETA, RBETS + INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, JJAB, + $ K, KS, LAA, LBB, LCC, LDA, LDAS, LDB, LDBS, + $ LDC, LDCS, LJ, MA, N, NA, NARGS, NC, NS + LOGICAL CONJ, NULL, RESET, SAME, TRAN, UPPER + CHARACTER*1 TRANS, TRANSS, TRANST, UPLO, UPLOS + CHARACTER*2 ICHT, ICHU +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LCE, LCERES + EXTERNAL LCE, LCERES +* .. External Subroutines .. + EXTERNAL CHER2K, CMAKE, CMMCH, CSYR2K +* .. Intrinsic Functions .. + INTRINSIC CMPLX, CONJG, MAX, REAL +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. 
+ DATA ICHT/'NC'/, ICHU/'UL'/ +* .. Executable Statements .. + CONJ = SNAME( 2: 3 ).EQ.'HE' +* + NARGS = 12 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 130 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = N + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 130 + LCC = LDC*N +* + DO 120 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 110 ICT = 1, 2 + TRANS = ICHT( ICT: ICT ) + TRAN = TRANS.EQ.'C' + IF( TRAN.AND..NOT.CONJ ) + $ TRANS = 'T' + IF( TRAN )THEN + MA = K + NA = N + ELSE + MA = N + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 110 + LAA = LDA*NA +* +* Generate the matrix A. +* + IF( TRAN )THEN + CALL CMAKE( 'GE', ' ', ' ', MA, NA, AB, 2*NMAX, AA, + $ LDA, RESET, ZERO ) + ELSE + CALL CMAKE( 'GE', ' ', ' ', MA, NA, AB, NMAX, AA, LDA, + $ RESET, ZERO ) + END IF +* +* Generate the matrix B. +* + LDB = LDA + LBB = LAA + IF( TRAN )THEN + CALL CMAKE( 'GE', ' ', ' ', MA, NA, AB( K + 1 ), + $ 2*NMAX, BB, LDB, RESET, ZERO ) + ELSE + CALL CMAKE( 'GE', ' ', ' ', MA, NA, AB( K*NMAX + 1 ), + $ NMAX, BB, LDB, RESET, ZERO ) + END IF +* + DO 100 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + UPPER = UPLO.EQ.'U' +* + DO 90 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 80 IB = 1, NBET + BETA = BET( IB ) + IF( CONJ )THEN + RBETA = REAL( BETA ) + BETA = CMPLX( RBETA, RZERO ) + END IF + NULL = N.LE.0 + IF( CONJ ) + $ NULL = NULL.OR.( ( K.LE.0.OR.ALPHA.EQ. + $ ZERO ).AND.RBETA.EQ.RONE ) +* +* Generate the matrix C. +* + CALL CMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, C, + $ NMAX, CC, LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + IF( CONJ )THEN + RBETS = RBETA + ELSE + BETS = BETA + END IF + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( CONJ )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, + $ TRANS, N, K, ALPHA, LDA, LDB, RBETA, LDC + IF( REWI ) + $ REWIND NTRA + CALL CHER2K( UPLO, TRANS, N, K, ALPHA, AA, + $ LDA, BB, LDB, RBETA, CC, LDC ) + ELSE + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO, + $ TRANS, N, K, ALPHA, LDA, LDB, BETA, LDC + IF( REWI ) + $ REWIND NTRA + CALL CSYR2K( UPLO, TRANS, N, K, ALPHA, AA, + $ LDA, BB, LDB, BETA, CC, LDC ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 150 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLOS.EQ.UPLO + ISAME( 2 ) = TRANSS.EQ.TRANS + ISAME( 3 ) = NS.EQ.N + ISAME( 4 ) = KS.EQ.K + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LCE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = LCE( BS, BB, LBB ) + ISAME( 9 ) = LDBS.EQ.LDB + IF( CONJ )THEN + ISAME( 10 ) = RBETS.EQ.RBETA + ELSE + ISAME( 10 ) = BETS.EQ.BETA + END IF + IF( NULL )THEN + ISAME( 11 ) = LCE( CS, CC, LCC ) + ELSE + ISAME( 11 ) = LCERES( 'HE', UPLO, N, N, CS, + $ CC, LDC ) + END IF + ISAME( 12 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. 
+ GO TO 150 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( CONJ )THEN + TRANST = 'C' + ELSE + TRANST = 'T' + END IF + JJAB = 1 + JC = 1 + DO 70 J = 1, N + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + IF( TRAN )THEN + DO 50 I = 1, K + W( I ) = ALPHA*AB( ( J - 1 )*2* + $ NMAX + K + I ) + IF( CONJ )THEN + W( K + I ) = CONJG( ALPHA )* + $ AB( ( J - 1 )*2* + $ NMAX + I ) + ELSE + W( K + I ) = ALPHA* + $ AB( ( J - 1 )*2* + $ NMAX + I ) + END IF + 50 CONTINUE + CALL CMMCH( TRANST, 'N', LJ, 1, 2*K, + $ ONE, AB( JJAB ), 2*NMAX, W, + $ 2*NMAX, BETA, C( JJ, J ), + $ NMAX, CT, G, CC( JC ), LDC, + $ EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + ELSE + DO 60 I = 1, K + IF( CONJ )THEN + W( I ) = ALPHA*CONJG( AB( ( K + + $ I - 1 )*NMAX + J ) ) + W( K + I ) = CONJG( ALPHA* + $ AB( ( I - 1 )*NMAX + + $ J ) ) + ELSE + W( I ) = ALPHA*AB( ( K + I - 1 )* + $ NMAX + J ) + W( K + I ) = ALPHA* + $ AB( ( I - 1 )*NMAX + + $ J ) + END IF + 60 CONTINUE + CALL CMMCH( 'N', 'N', LJ, 1, 2*K, ONE, + $ AB( JJ ), NMAX, W, 2*NMAX, + $ BETA, C( JJ, J ), NMAX, CT, + $ G, CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + IF( UPPER )THEN + JC = JC + LDC + ELSE + JC = JC + LDC + 1 + IF( TRAN ) + $ JJAB = JJAB + 2*NMAX + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 140 + 70 CONTINUE + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 160 +* + 140 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9995 )J +* + 150 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( CONJ )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, N, K, ALPHA, + $ LDA, LDB, RBETA, LDC + ELSE + WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, TRANS, N, K, ALPHA, + $ LDA, LDB, BETA, LDC + END IF +* + 160 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',', F4.1, + $ ', C,', I3, ') .' ) + 9993 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, + $ ',', F4.1, '), C,', I3, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of CCHK5. +* + END + SUBROUTINE CCHKE( ISNUM, SRNAMT, NOUT ) +* +* Tests the error exits from the Level 3 Blas. +* Requires a special version of the error-handling routine XERBLA. +* ALPHA, RALPHA, BETA, RBETA, A, B and C should not need to be defined. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER ISNUM, NOUT + CHARACTER*6 SRNAMT +* .. 
Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Local Scalars .. + COMPLEX ALPHA, BETA + REAL RALPHA, RBETA +* .. Local Arrays .. + COMPLEX A( 2, 1 ), B( 2, 1 ), C( 2, 1 ) +* .. External Subroutines .. + EXTERNAL CGEMM, CHEMM, CHER2K, CHERK, CHKXER, CSYMM, + $ CSYR2K, CSYRK, CTRMM, CTRSM +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Executable Statements .. +* OK is set to .FALSE. by the special version of XERBLA or by CHKXER +* if anything is wrong. + OK = .TRUE. +* LERR is set to .TRUE. by the special version of XERBLA each time +* it is called, and is then tested and re-set by CHKXER. + LERR = .FALSE. + GO TO ( 10, 20, 30, 40, 50, 60, 70, 80, + $ 90 )ISNUM + 10 INFOT = 1 + CALL CGEMM( '/', 'N', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 1 + CALL CGEMM( '/', 'C', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 1 + CALL CGEMM( '/', 'T', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CGEMM( 'N', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CGEMM( 'C', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CGEMM( 'T', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CGEMM( 'N', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CGEMM( 'N', 'C', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CGEMM( 'N', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CGEMM( 'C', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CGEMM( 'C', 'C', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CGEMM( 'C', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CGEMM( 'T', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CGEMM( 'T', 'C', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CGEMM( 'T', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CGEMM( 'N', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CGEMM( 'N', 'C', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CGEMM( 'N', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CGEMM( 'C', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CGEMM( 'C', 'C', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CGEMM( 'C', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CGEMM( 'T', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CGEMM( 'T', 'C', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( 
SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CGEMM( 'T', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CGEMM( 'N', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CGEMM( 'N', 'C', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CGEMM( 'N', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CGEMM( 'C', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CGEMM( 'C', 'C', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CGEMM( 'C', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CGEMM( 'T', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CGEMM( 'T', 'C', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CGEMM( 'T', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CGEMM( 'N', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CGEMM( 'N', 'C', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CGEMM( 'N', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CGEMM( 'C', 'N', 0, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CGEMM( 'C', 'C', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CGEMM( 'C', 'T', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CGEMM( 'T', 'N', 0, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CGEMM( 'T', 'C', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL CGEMM( 'T', 'T', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CGEMM( 'N', 'N', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CGEMM( 'C', 'N', 0, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CGEMM( 'T', 'N', 0, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CGEMM( 'N', 'C', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CGEMM( 'C', 'C', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CGEMM( 'T', 'C', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CGEMM( 'N', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CGEMM( 'C', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CGEMM( 'T', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL CGEMM( 'N', 
'N', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL CGEMM( 'N', 'C', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL CGEMM( 'N', 'T', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL CGEMM( 'C', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL CGEMM( 'C', 'C', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL CGEMM( 'C', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL CGEMM( 'T', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL CGEMM( 'T', 'C', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL CGEMM( 'T', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 100 + 20 INFOT = 1 + CALL CHEMM( '/', 'U', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CHEMM( 'L', '/', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CHEMM( 'L', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CHEMM( 'R', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CHEMM( 'L', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CHEMM( 'R', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CHEMM( 'L', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CHEMM( 'R', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CHEMM( 'L', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CHEMM( 'R', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CHEMM( 'L', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CHEMM( 'R', 'U', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CHEMM( 'L', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CHEMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CHEMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CHEMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CHEMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL CHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL CHEMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + 
CALL CHEMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL CHEMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 100 + 30 INFOT = 1 + CALL CSYMM( '/', 'U', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CSYMM( 'L', '/', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CSYMM( 'L', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CSYMM( 'R', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CSYMM( 'L', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CSYMM( 'R', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CSYMM( 'L', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CSYMM( 'R', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CSYMM( 'L', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CSYMM( 'R', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CSYMM( 'L', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CSYMM( 'R', 'U', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CSYMM( 'L', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CSYMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL CSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL CSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL CSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL CSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 100 + 40 INFOT = 1 + CALL CTRMM( '/', 'U', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CTRMM( 'L', '/', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CTRMM( 'L', 'U', '/', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CTRMM( 'L', 'U', 'N', '/', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRMM( 'L', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRMM( 
'L', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRMM( 'L', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRMM( 'R', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRMM( 'R', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRMM( 'R', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRMM( 'L', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRMM( 'L', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRMM( 'L', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRMM( 'R', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRMM( 'R', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRMM( 'R', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRMM( 'L', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRMM( 'L', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRMM( 'L', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRMM( 'R', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRMM( 'R', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRMM( 'R', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRMM( 'L', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRMM( 'L', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRMM( 'L', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRMM( 'R', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRMM( 'R', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRMM( 'R', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRMM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRMM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRMM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRMM( 'R', 'U', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRMM( 'R', 'U', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRMM( 'R', 'U', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRMM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, 
NOUT, LERR, OK ) + INFOT = 9 + CALL CTRMM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRMM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRMM( 'R', 'L', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRMM( 'R', 'L', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRMM( 'R', 'L', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRMM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRMM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRMM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRMM( 'R', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRMM( 'R', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRMM( 'R', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRMM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRMM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRMM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRMM( 'R', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRMM( 'R', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRMM( 'R', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 100 + 50 INFOT = 1 + CALL CTRSM( '/', 'U', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CTRSM( 'L', '/', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CTRSM( 'L', 'U', '/', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CTRSM( 'L', 'U', 'N', '/', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRSM( 'L', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRSM( 'L', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRSM( 'L', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRSM( 'R', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRSM( 'R', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRSM( 'R', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRSM( 'L', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRSM( 'L', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRSM( 'L', 'L', 'T', 'N', -1, 
0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRSM( 'R', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRSM( 'R', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL CTRSM( 'R', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRSM( 'L', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRSM( 'L', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRSM( 'L', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRSM( 'R', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRSM( 'R', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRSM( 'R', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRSM( 'L', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRSM( 'L', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRSM( 'L', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRSM( 'R', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRSM( 'R', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL CTRSM( 'R', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRSM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRSM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRSM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRSM( 'R', 'U', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRSM( 'R', 'U', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRSM( 'R', 'U', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRSM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRSM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRSM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRSM( 'R', 'L', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRSM( 'R', 'L', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CTRSM( 'R', 'L', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRSM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRSM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + 
CALL CTRSM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRSM( 'R', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRSM( 'R', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRSM( 'R', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRSM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRSM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRSM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRSM( 'R', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRSM( 'R', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL CTRSM( 'R', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 100 + 60 INFOT = 1 + CALL CHERK( '/', 'N', 0, 0, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CHERK( 'U', 'T', 0, 0, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CHERK( 'U', 'N', -1, 0, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CHERK( 'U', 'C', -1, 0, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CHERK( 'L', 'N', -1, 0, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CHERK( 'L', 'C', -1, 0, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CHERK( 'U', 'N', 0, -1, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CHERK( 'U', 'C', 0, -1, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CHERK( 'L', 'N', 0, -1, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CHERK( 'L', 'C', 0, -1, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CHERK( 'U', 'N', 2, 0, RALPHA, A, 1, RBETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CHERK( 'U', 'C', 0, 2, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CHERK( 'L', 'N', 2, 0, RALPHA, A, 1, RBETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CHERK( 'L', 'C', 0, 2, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CHERK( 'U', 'N', 2, 0, RALPHA, A, 2, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CHERK( 'U', 'C', 2, 0, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CHERK( 'L', 'N', 2, 0, RALPHA, A, 2, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CHERK( 'L', 'C', 2, 0, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 100 + 70 INFOT = 1 + CALL CSYRK( '/', 'N', 0, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CSYRK( 'U', 'C', 0, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, 
LERR, OK ) + INFOT = 3 + CALL CSYRK( 'U', 'N', -1, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CSYRK( 'U', 'T', -1, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CSYRK( 'L', 'N', -1, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CSYRK( 'L', 'T', -1, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CSYRK( 'U', 'N', 0, -1, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CSYRK( 'U', 'T', 0, -1, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CSYRK( 'L', 'N', 0, -1, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CSYRK( 'L', 'T', 0, -1, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CSYRK( 'U', 'N', 2, 0, ALPHA, A, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CSYRK( 'U', 'T', 0, 2, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CSYRK( 'L', 'N', 2, 0, ALPHA, A, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CSYRK( 'L', 'T', 0, 2, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CSYRK( 'U', 'N', 2, 0, ALPHA, A, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CSYRK( 'U', 'T', 2, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CSYRK( 'L', 'N', 2, 0, ALPHA, A, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL CSYRK( 'L', 'T', 2, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 100 + 80 INFOT = 1 + CALL CHER2K( '/', 'N', 0, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CHER2K( 'U', 'T', 0, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CHER2K( 'U', 'N', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CHER2K( 'U', 'C', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CHER2K( 'L', 'N', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CHER2K( 'L', 'C', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CHER2K( 'U', 'N', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CHER2K( 'U', 'C', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CHER2K( 'L', 'N', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CHER2K( 'L', 'C', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CHER2K( 'U', 'N', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CHER2K( 'U', 'C', 0, 2, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CHER2K( 'L', 'N', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CHER2K( 'L', 'C', 0, 2, ALPHA, A, 1, B, 1, RBETA, 
C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CHER2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 1, RBETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CHER2K( 'U', 'C', 0, 2, ALPHA, A, 2, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CHER2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 1, RBETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CHER2K( 'L', 'C', 0, 2, ALPHA, A, 2, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL CHER2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 2, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL CHER2K( 'U', 'C', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL CHER2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 2, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL CHER2K( 'L', 'C', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 100 + 90 INFOT = 1 + CALL CSYR2K( '/', 'N', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL CSYR2K( 'U', 'C', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CSYR2K( 'U', 'N', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CSYR2K( 'U', 'T', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CSYR2K( 'L', 'N', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL CSYR2K( 'L', 'T', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CSYR2K( 'U', 'N', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CSYR2K( 'U', 'T', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CSYR2K( 'L', 'N', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL CSYR2K( 'L', 'T', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CSYR2K( 'U', 'N', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CSYR2K( 'U', 'T', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CSYR2K( 'L', 'N', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL CSYR2K( 'L', 'T', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CSYR2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CSYR2K( 'U', 'T', 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CSYR2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL CSYR2K( 'L', 'T', 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL CSYR2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL CSYR2K( 'U', 'T', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL CSYR2K( 'L', 'N', 2, 0, 
ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL CSYR2K( 'L', 'T', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) +* + 100 IF( OK )THEN + WRITE( NOUT, FMT = 9999 )SRNAMT + ELSE + WRITE( NOUT, FMT = 9998 )SRNAMT + END IF + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE TESTS OF ERROR-EXITS' ) + 9998 FORMAT( ' ******* ', A6, ' FAILED THE TESTS OF ERROR-EXITS *****', + $ '**' ) +* +* End of CCHKE. +* + END + SUBROUTINE CMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, RESET, + $ TRANSL ) +* +* Generates values for an M by N matrix A. +* Stores the values in the array AA in the data structure required +* by the routine, with unwanted elements set to rogue value. +* +* TYPE is 'GE', 'HE', 'SY' or 'TR'. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX ZERO, ONE + PARAMETER ( ZERO = ( 0.0, 0.0 ), ONE = ( 1.0, 0.0 ) ) + COMPLEX ROGUE + PARAMETER ( ROGUE = ( -1.0E10, 1.0E10 ) ) + REAL RZERO + PARAMETER ( RZERO = 0.0 ) + REAL RROGUE + PARAMETER ( RROGUE = -1.0E10 ) +* .. Scalar Arguments .. + COMPLEX TRANSL + INTEGER LDA, M, N, NMAX + LOGICAL RESET + CHARACTER*1 DIAG, UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + COMPLEX A( NMAX, * ), AA( * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J, JJ + LOGICAL GEN, HER, LOWER, SYM, TRI, UNIT, UPPER +* .. External Functions .. + COMPLEX CBEG + EXTERNAL CBEG +* .. Intrinsic Functions .. + INTRINSIC CMPLX, CONJG, REAL +* .. Executable Statements .. + GEN = TYPE.EQ.'GE' + HER = TYPE.EQ.'HE' + SYM = TYPE.EQ.'SY' + TRI = TYPE.EQ.'TR' + UPPER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'U' + LOWER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'L' + UNIT = TRI.AND.DIAG.EQ.'U' +* +* Generate data in array A. +* + DO 20 J = 1, N + DO 10 I = 1, M + IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) + $ THEN + A( I, J ) = CBEG( RESET ) + TRANSL + IF( I.NE.J )THEN +* Set some elements to zero + IF( N.GT.3.AND.J.EQ.N/2 ) + $ A( I, J ) = ZERO + IF( HER )THEN + A( J, I ) = CONJG( A( I, J ) ) + ELSE IF( SYM )THEN + A( J, I ) = A( I, J ) + ELSE IF( TRI )THEN + A( J, I ) = ZERO + END IF + END IF + END IF + 10 CONTINUE + IF( HER ) + $ A( J, J ) = CMPLX( REAL( A( J, J ) ), RZERO ) + IF( TRI ) + $ A( J, J ) = A( J, J ) + ONE + IF( UNIT ) + $ A( J, J ) = ONE + 20 CONTINUE +* +* Store elements in array AS in data structure required by routine. +* + IF( TYPE.EQ.'GE' )THEN + DO 50 J = 1, N + DO 30 I = 1, M + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 30 CONTINUE + DO 40 I = M + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 40 CONTINUE + 50 CONTINUE + ELSE IF( TYPE.EQ.'HE'.OR.TYPE.EQ.'SY'.OR.TYPE.EQ.'TR' )THEN + DO 90 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IF( UNIT )THEN + IEND = J - 1 + ELSE + IEND = J + END IF + ELSE + IF( UNIT )THEN + IBEG = J + 1 + ELSE + IBEG = J + END IF + IEND = N + END IF + DO 60 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 60 CONTINUE + DO 70 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 70 CONTINUE + DO 80 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 80 CONTINUE + IF( HER )THEN + JJ = J + ( J - 1 )*LDA + AA( JJ ) = CMPLX( REAL( AA( JJ ) ), RROGUE ) + END IF + 90 CONTINUE + END IF + RETURN +* +* End of CMAKE. 
+* + END + SUBROUTINE CMMCH( TRANSA, TRANSB, M, N, KK, ALPHA, A, LDA, B, LDB, + $ BETA, C, LDC, CT, G, CC, LDCC, EPS, ERR, FATAL, + $ NOUT, MV ) +* +* Checks the results of the computational tests. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX ZERO + PARAMETER ( ZERO = ( 0.0, 0.0 ) ) + REAL RZERO, RONE + PARAMETER ( RZERO = 0.0, RONE = 1.0 ) +* .. Scalar Arguments .. + COMPLEX ALPHA, BETA + REAL EPS, ERR + INTEGER KK, LDA, LDB, LDC, LDCC, M, N, NOUT + LOGICAL FATAL, MV + CHARACTER*1 TRANSA, TRANSB +* .. Array Arguments .. + COMPLEX A( LDA, * ), B( LDB, * ), C( LDC, * ), + $ CC( LDCC, * ), CT( * ) + REAL G( * ) +* .. Local Scalars .. + COMPLEX CL + REAL ERRI + INTEGER I, J, K + LOGICAL CTRANA, CTRANB, TRANA, TRANB +* .. Intrinsic Functions .. + INTRINSIC ABS, AIMAG, CONJG, MAX, REAL, SQRT +* .. Statement Functions .. + REAL ABS1 +* .. Statement Function definitions .. + ABS1( CL ) = ABS( REAL( CL ) ) + ABS( AIMAG( CL ) ) +* .. Executable Statements .. + TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' + TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' + CTRANA = TRANSA.EQ.'C' + CTRANB = TRANSB.EQ.'C' +* +* Compute expected result, one column at a time, in CT using data +* in A, B and C. +* Compute gauges in G. +* + DO 220 J = 1, N +* + DO 10 I = 1, M + CT( I ) = ZERO + G( I ) = RZERO + 10 CONTINUE + IF( .NOT.TRANA.AND..NOT.TRANB )THEN + DO 30 K = 1, KK + DO 20 I = 1, M + CT( I ) = CT( I ) + A( I, K )*B( K, J ) + G( I ) = G( I ) + ABS1( A( I, K ) )*ABS1( B( K, J ) ) + 20 CONTINUE + 30 CONTINUE + ELSE IF( TRANA.AND..NOT.TRANB )THEN + IF( CTRANA )THEN + DO 50 K = 1, KK + DO 40 I = 1, M + CT( I ) = CT( I ) + CONJG( A( K, I ) )*B( K, J ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( K, J ) ) + 40 CONTINUE + 50 CONTINUE + ELSE + DO 70 K = 1, KK + DO 60 I = 1, M + CT( I ) = CT( I ) + A( K, I )*B( K, J ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( K, J ) ) + 60 CONTINUE + 70 CONTINUE + END IF + ELSE IF( .NOT.TRANA.AND.TRANB )THEN + IF( CTRANB )THEN + DO 90 K = 1, KK + DO 80 I = 1, M + CT( I ) = CT( I ) + A( I, K )*CONJG( B( J, K ) ) + G( I ) = G( I ) + ABS1( A( I, K ) )* + $ ABS1( B( J, K ) ) + 80 CONTINUE + 90 CONTINUE + ELSE + DO 110 K = 1, KK + DO 100 I = 1, M + CT( I ) = CT( I ) + A( I, K )*B( J, K ) + G( I ) = G( I ) + ABS1( A( I, K ) )* + $ ABS1( B( J, K ) ) + 100 CONTINUE + 110 CONTINUE + END IF + ELSE IF( TRANA.AND.TRANB )THEN + IF( CTRANA )THEN + IF( CTRANB )THEN + DO 130 K = 1, KK + DO 120 I = 1, M + CT( I ) = CT( I ) + CONJG( A( K, I ) )* + $ CONJG( B( J, K ) ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 120 CONTINUE + 130 CONTINUE + ELSE + DO 150 K = 1, KK + DO 140 I = 1, M + CT( I ) = CT( I ) + CONJG( A( K, I ) )*B( J, K ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 140 CONTINUE + 150 CONTINUE + END IF + ELSE + IF( CTRANB )THEN + DO 170 K = 1, KK + DO 160 I = 1, M + CT( I ) = CT( I ) + A( K, I )*CONJG( B( J, K ) ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 160 CONTINUE + 170 CONTINUE + ELSE + DO 190 K = 1, KK + DO 180 I = 1, M + CT( I ) = CT( I ) + A( K, I )*B( J, K ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 180 CONTINUE + 190 CONTINUE + END IF + END IF + END IF + DO 200 I = 1, M + CT( I ) = ALPHA*CT( I ) + BETA*C( I, J ) + G( I ) = ABS1( ALPHA )*G( I ) + + 
$ ABS1( BETA )*ABS1( C( I, J ) ) + 200 CONTINUE +* +* Compute the error ratio for this result. +* + ERR = ZERO + DO 210 I = 1, M + ERRI = ABS1( CT( I ) - CC( I, J ) )/EPS + IF( G( I ).NE.RZERO ) + $ ERRI = ERRI/G( I ) + ERR = MAX( ERR, ERRI ) + IF( ERR*SQRT( EPS ).GE.RONE ) + $ GO TO 230 + 210 CONTINUE +* + 220 CONTINUE +* +* If the loop completes, all results are at least half accurate. + GO TO 250 +* +* Report fatal error. +* + 230 FATAL = .TRUE. + WRITE( NOUT, FMT = 9999 ) + DO 240 I = 1, M + IF( MV )THEN + WRITE( NOUT, FMT = 9998 )I, CT( I ), CC( I, J ) + ELSE + WRITE( NOUT, FMT = 9998 )I, CC( I, J ), CT( I ) + END IF + 240 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9997 )J +* + 250 CONTINUE + RETURN +* + 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', + $ 'F ACCURATE *******', /' EXPECTED RE', + $ 'SULT COMPUTED RESULT' ) + 9998 FORMAT( 1X, I7, 2( ' (', G15.6, ',', G15.6, ')' ) ) + 9997 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) +* +* End of CMMCH. +* + END + LOGICAL FUNCTION LCE( RI, RJ, LR ) +* +* Tests if two arrays are identical. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER LR +* .. Array Arguments .. + COMPLEX RI( * ), RJ( * ) +* .. Local Scalars .. + INTEGER I +* .. Executable Statements .. + DO 10 I = 1, LR + IF( RI( I ).NE.RJ( I ) ) + $ GO TO 20 + 10 CONTINUE + LCE = .TRUE. + GO TO 30 + 20 CONTINUE + LCE = .FALSE. + 30 RETURN +* +* End of LCE. +* + END + LOGICAL FUNCTION LCERES( TYPE, UPLO, M, N, AA, AS, LDA ) +* +* Tests if selected elements in two arrays are equal. +* +* TYPE is 'GE' or 'HE' or 'SY'. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER LDA, M, N + CHARACTER*1 UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + COMPLEX AA( LDA, * ), AS( LDA, * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J + LOGICAL UPPER +* .. Executable Statements .. + UPPER = UPLO.EQ.'U' + IF( TYPE.EQ.'GE' )THEN + DO 20 J = 1, N + DO 10 I = M + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 10 CONTINUE + 20 CONTINUE + ELSE IF( TYPE.EQ.'HE'.OR.TYPE.EQ.'SY' )THEN + DO 50 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 30 I = 1, IBEG - 1 + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 30 CONTINUE + DO 40 I = IEND + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 40 CONTINUE + 50 CONTINUE + END IF +* + 60 CONTINUE + LCERES = .TRUE. + GO TO 80 + 70 CONTINUE + LCERES = .FALSE. + 80 RETURN +* +* End of LCERES. +* + END + COMPLEX FUNCTION CBEG( RESET ) +* +* Generates complex numbers as pairs of random numbers uniformly +* distributed between -0.5 and 0.5. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + LOGICAL RESET +* .. Local Scalars .. + INTEGER I, IC, J, MI, MJ +* .. Save statement .. + SAVE I, IC, J, MI, MJ +* .. 
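The pass/fail rule coded in CMMCH above boils down to one gauge-normalised ratio: each expected element is compared with the computed element, the difference is divided by EPS and by the gauge G(I) that bounds the rounding the computation could legitimately accumulate, and a fatal loss of accuracy is reported only once the largest ratio times SQRT(EPS) reaches one, i.e. once fewer than half of the working digits agree. A minimal C sketch of the same criterion, using real data for brevity; the names below are illustrative and are not part of the imported sources.

#include <math.h>
#include <stdio.h>

/* Gauge-normalised test ratio in the style of CMMCH (real data for brevity):
 *   err_i = |expected_i - computed_i| / (eps * gauge_i),
 * and the result counts as at least half accurate while
 * max_i err_i * sqrt(eps) stays below one. */
static int half_accurate(const float *expected, const float *computed,
                         const float *gauge, int n, float eps)
{
    float err = 0.0f;
    for (int i = 0; i < n; i++) {
        float erri = fabsf(expected[i] - computed[i]) / eps;
        if (gauge[i] != 0.0f)
            erri /= gauge[i];
        if (erri > err)
            err = erri;
    }
    return err * sqrtf(eps) < 1.0f;      /* 1 = pass, 0 = fatal */
}

int main(void)
{
    const float eps = 1.19209290e-07f;   /* single-precision machine epsilon */
    float expected[2] = { 1.0f, -2.0f };
    float computed[2] = { 1.0f + 4.0f * eps, -2.0f };
    float gauge[2]    = { 3.0f, 3.0f };  /* bound on the accumulated rounding */

    printf("half accurate: %d\n",
           half_accurate(expected, computed, gauge, 2, eps));
    return 0;
}

In the Fortran above the same test runs column by column on complex data, with ABS1(Z) = |Re Z| + |Im Z| standing in for the absolute value.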
Intrinsic Functions .. + INTRINSIC CMPLX +* .. Executable Statements .. + IF( RESET )THEN +* Initialize local variables. + MI = 891 + MJ = 457 + I = 7 + J = 7 + IC = 0 + RESET = .FALSE. + END IF +* +* The sequence of values of I or J is bounded between 1 and 999. +* If initial I or J = 1,2,3,6,7 or 9, the period will be 50. +* If initial I or J = 4 or 8, the period will be 25. +* If initial I or J = 5, the period will be 10. +* IC is used to break up the period by skipping 1 value of I or J +* in 6. +* + IC = IC + 1 + 10 I = I*MI + J = J*MJ + I = I - 1000*( I/1000 ) + J = J - 1000*( J/1000 ) + IF( IC.GE.5 )THEN + IC = 0 + GO TO 10 + END IF + CBEG = CMPLX( ( I - 500 )/1001.0, ( J - 500 )/1001.0 ) + RETURN +* +* End of CBEG. +* + END + REAL FUNCTION SDIFF( X, Y ) +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + REAL X, Y +* .. Executable Statements .. + SDIFF = X - Y + RETURN +* +* End of SDIFF. +* + END + SUBROUTINE CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) +* +* Tests whether XERBLA has detected an error when it should. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER INFOT, NOUT + LOGICAL LERR, OK + CHARACTER*6 SRNAMT +* .. Executable Statements .. + IF( .NOT.LERR )THEN + WRITE( NOUT, FMT = 9999 )INFOT, SRNAMT + OK = .FALSE. + END IF + LERR = .FALSE. + RETURN +* + 9999 FORMAT( ' ***** ILLEGAL VALUE OF PARAMETER NUMBER ', I2, ' NOT D', + $ 'ETECTED BY ', A6, ' *****' ) +* +* End of CHKXER. +* + END + SUBROUTINE XERBLA( SRNAME, INFO ) +* +* This is a special version of XERBLA to be used only as part of +* the test program for testing error exits from the Level 3 BLAS +* routines. +* +* XERBLA is an error handler for the Level 3 BLAS routines. +* +* It is called by the Level 3 BLAS routines if an input parameter is +* invalid. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER INFO + CHARACTER*6 SRNAME +* .. Scalars in Common .. + INTEGER INFOT, NOUT + LOGICAL LERR, OK + CHARACTER*6 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUT, OK, LERR + COMMON /SRNAMC/SRNAMT +* .. Executable Statements .. + LERR = .TRUE. + IF( INFO.NE.INFOT )THEN + IF( INFOT.NE.0 )THEN + WRITE( NOUT, FMT = 9999 )INFO, INFOT + ELSE + WRITE( NOUT, FMT = 9997 )INFO + END IF + OK = .FALSE. + END IF + IF( SRNAME.NE.SRNAMT )THEN + WRITE( NOUT, FMT = 9998 )SRNAME, SRNAMT + OK = .FALSE. 
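CBEG above is the suite's tiny reproducible value generator: the seeds I and J are repeatedly multiplied by the fixed constants 891 and 457 and reduced modulo 1000, one extra advance is taken on every fifth call (the "1 value in 6" skip described in the comments), and the surviving seeds are mapped into the open interval (-0.5, 0.5) via (I - 500)/1001. A rough C transcription of the same scheme follows; the function and variable names are placeholders and the RESET path is folded into the initial values.

#include <stdio.h>

/* Sketch of the CBEG-style value generator: two seeds driven by a
 * multiplicative sequence mod 1000, with one extra advance every fifth
 * call, and the surviving seeds mapped into (-0.5, 0.5). */
static int seed_i = 7, seed_j = 7, calls = 0;

static void cbeg_sketch(float *re, float *im)
{
    const int mi = 891, mj = 457;

    calls++;
    seed_i = (seed_i * mi) % 1000;
    seed_j = (seed_j * mj) % 1000;
    if (calls >= 5) {                    /* consume one extra pair of values */
        calls = 0;
        seed_i = (seed_i * mi) % 1000;
        seed_j = (seed_j * mj) % 1000;
    }
    *re = (seed_i - 500) / 1001.0f;
    *im = (seed_j - 500) / 1001.0f;
}

int main(void)
{
    for (int k = 0; k < 6; k++) {
        float re, im;
        cbeg_sketch(&re, &im);
        printf("(% .4f, % .4f)\n", re, im);
    }
    return 0;
}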
+ END IF + RETURN +* + 9999 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, ' INSTEAD', + $ ' OF ', I2, ' *******' ) + 9998 FORMAT( ' ******* XERBLA WAS CALLED WITH SRNAME = ', A6, ' INSTE', + $ 'AD OF ', A6, ' *******' ) + 9997 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, + $ ' *******' ) +* +* End of XERBLA +* + END + diff --git a/test/dblat1.f b/test/dblat1.f new file mode 100644 index 0000000000..5a45d69f4b --- /dev/null +++ b/test/dblat1.f @@ -0,0 +1,769 @@ + PROGRAM DBLAT1 +* Test program for the DOUBLE PRECISION Level 1 BLAS. +* Based upon the original BLAS test routine together with: +* F06EAF Example Program Text +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + DOUBLE PRECISION SFAC + INTEGER IC +* .. External Subroutines .. + EXTERNAL CHECK0, CHECK1, CHECK2, CHECK3, HEADER +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA SFAC/9.765625D-4/ +* .. Executable Statements .. + WRITE (NOUT,99999) + DO 20 IC = 1, 10 + ICASE = IC + CALL HEADER +* +* .. Initialize PASS, INCX, INCY, and MODE for a new case. .. +* .. the value 9999 for INCX, INCY or MODE will appear in the .. +* .. detailed output, if any, for cases that do not involve .. +* .. these parameters .. +* + PASS = .TRUE. + INCX = 9999 + INCY = 9999 + MODE = 9999 + IF (ICASE.EQ.3) THEN + CALL CHECK0(SFAC) + ELSE IF (ICASE.EQ.7 .OR. ICASE.EQ.8 .OR. ICASE.EQ.9 .OR. + + ICASE.EQ.10) THEN + CALL CHECK1(SFAC) + ELSE IF (ICASE.EQ.1 .OR. ICASE.EQ.2 .OR. ICASE.EQ.5 .OR. + + ICASE.EQ.6) THEN + CALL CHECK2(SFAC) + ELSE IF (ICASE.EQ.4) THEN + CALL CHECK3(SFAC) + END IF +* -- Print + IF (PASS) WRITE (NOUT,99998) + 20 CONTINUE + STOP +* +99999 FORMAT (' Real BLAS Test Program Results',/1X) +99998 FORMAT (' ----- PASS -----') + END + SUBROUTINE HEADER +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Arrays .. + CHARACTER*6 L(10) +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA L(1)/' DDOT '/ + DATA L(2)/'DAXPY '/ + DATA L(3)/'DROTG '/ + DATA L(4)/' DROT '/ + DATA L(5)/'DCOPY '/ + DATA L(6)/'DSWAP '/ + DATA L(7)/'DNRM2 '/ + DATA L(8)/'DASUM '/ + DATA L(9)/'DSCAL '/ + DATA L(10)/'IDAMAX'/ +* .. Executable Statements .. + WRITE (NOUT,99999) ICASE, L(ICASE) + RETURN +* +99999 FORMAT (/' Test of subprogram number',I3,12X,A6) + END + SUBROUTINE CHECK0(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + DOUBLE PRECISION SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + DOUBLE PRECISION D12, SA, SB, SC, SS + INTEGER K +* .. Local Arrays .. + DOUBLE PRECISION DA1(8), DATRUE(8), DB1(8), DBTRUE(8), DC1(8), + + DS1(8) +* .. External Subroutines .. + EXTERNAL DROTG, STEST1 +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA DA1/0.3D0, 0.4D0, -0.3D0, -0.4D0, -0.3D0, 0.0D0, + + 0.0D0, 1.0D0/ + DATA DB1/0.4D0, 0.3D0, 0.4D0, 0.3D0, -0.4D0, 0.0D0, + + 1.0D0, 0.0D0/ + DATA DC1/0.6D0, 0.8D0, -0.6D0, 0.8D0, 0.6D0, 1.0D0, + + 0.0D0, 1.0D0/ + DATA DS1/0.8D0, 0.6D0, 0.8D0, -0.6D0, 0.8D0, 0.0D0, + + 1.0D0, 0.0D0/ + DATA DATRUE/0.5D0, 0.5D0, 0.5D0, -0.5D0, -0.5D0, + + 0.0D0, 1.0D0, 1.0D0/ + DATA DBTRUE/0.0D0, 0.6D0, 0.0D0, -0.6D0, 0.0D0, + + 0.0D0, 1.0D0, 0.0D0/ + DATA D12/4096.0D0/ +* .. 
Executable Statements .. +* +* Compute true values which cannot be prestored +* in decimal notation +* + DBTRUE(1) = 1.0D0/0.6D0 + DBTRUE(3) = -1.0D0/0.6D0 + DBTRUE(5) = 1.0D0/0.6D0 +* + DO 20 K = 1, 8 +* .. Set N=K for identification in output if any .. + N = K + IF (ICASE.EQ.3) THEN +* .. DROTG .. + IF (K.GT.8) GO TO 40 + SA = DA1(K) + SB = DB1(K) + CALL DROTG(SA,SB,SC,SS) + CALL STEST1(SA,DATRUE(K),DATRUE(K),SFAC) + CALL STEST1(SB,DBTRUE(K),DBTRUE(K),SFAC) + CALL STEST1(SC,DC1(K),DC1(K),SFAC) + CALL STEST1(SS,DS1(K),DS1(K),SFAC) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK0' + STOP + END IF + 20 CONTINUE + 40 RETURN + END + SUBROUTINE CHECK1(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + DOUBLE PRECISION SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + INTEGER I, LEN, NP1 +* .. Local Arrays .. + DOUBLE PRECISION DTRUE1(5), DTRUE3(5), DTRUE5(8,5,2), DV(8,5,2), + + SA(10), STEMP(1), STRUE(8), SX(8) + INTEGER ITRUE2(5) +* .. External Functions .. + DOUBLE PRECISION DASUM, DNRM2 + INTEGER IDAMAX + EXTERNAL DASUM, DNRM2, IDAMAX +* .. External Subroutines .. + EXTERNAL ITEST1, DSCAL, STEST, STEST1 +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA SA/0.3D0, -1.0D0, 0.0D0, 1.0D0, 0.3D0, 0.3D0, + + 0.3D0, 0.3D0, 0.3D0, 0.3D0/ + DATA DV/0.1D0, 2.0D0, 2.0D0, 2.0D0, 2.0D0, 2.0D0, + + 2.0D0, 2.0D0, 0.3D0, 3.0D0, 3.0D0, 3.0D0, 3.0D0, + + 3.0D0, 3.0D0, 3.0D0, 0.3D0, -0.4D0, 4.0D0, + + 4.0D0, 4.0D0, 4.0D0, 4.0D0, 4.0D0, 0.2D0, + + -0.6D0, 0.3D0, 5.0D0, 5.0D0, 5.0D0, 5.0D0, + + 5.0D0, 0.1D0, -0.3D0, 0.5D0, -0.1D0, 6.0D0, + + 6.0D0, 6.0D0, 6.0D0, 0.1D0, 8.0D0, 8.0D0, 8.0D0, + + 8.0D0, 8.0D0, 8.0D0, 8.0D0, 0.3D0, 9.0D0, 9.0D0, + + 9.0D0, 9.0D0, 9.0D0, 9.0D0, 9.0D0, 0.3D0, 2.0D0, + + -0.4D0, 2.0D0, 2.0D0, 2.0D0, 2.0D0, 2.0D0, + + 0.2D0, 3.0D0, -0.6D0, 5.0D0, 0.3D0, 2.0D0, + + 2.0D0, 2.0D0, 0.1D0, 4.0D0, -0.3D0, 6.0D0, + + -0.5D0, 7.0D0, -0.1D0, 3.0D0/ + DATA DTRUE1/0.0D0, 0.3D0, 0.5D0, 0.7D0, 0.6D0/ + DATA DTRUE3/0.0D0, 0.3D0, 0.7D0, 1.1D0, 1.0D0/ + DATA DTRUE5/0.10D0, 2.0D0, 2.0D0, 2.0D0, 2.0D0, + + 2.0D0, 2.0D0, 2.0D0, -0.3D0, 3.0D0, 3.0D0, + + 3.0D0, 3.0D0, 3.0D0, 3.0D0, 3.0D0, 0.0D0, 0.0D0, + + 4.0D0, 4.0D0, 4.0D0, 4.0D0, 4.0D0, 4.0D0, + + 0.20D0, -0.60D0, 0.30D0, 5.0D0, 5.0D0, 5.0D0, + + 5.0D0, 5.0D0, 0.03D0, -0.09D0, 0.15D0, -0.03D0, + + 6.0D0, 6.0D0, 6.0D0, 6.0D0, 0.10D0, 8.0D0, + + 8.0D0, 8.0D0, 8.0D0, 8.0D0, 8.0D0, 8.0D0, + + 0.09D0, 9.0D0, 9.0D0, 9.0D0, 9.0D0, 9.0D0, + + 9.0D0, 9.0D0, 0.09D0, 2.0D0, -0.12D0, 2.0D0, + + 2.0D0, 2.0D0, 2.0D0, 2.0D0, 0.06D0, 3.0D0, + + -0.18D0, 5.0D0, 0.09D0, 2.0D0, 2.0D0, 2.0D0, + + 0.03D0, 4.0D0, -0.09D0, 6.0D0, -0.15D0, 7.0D0, + + -0.03D0, 3.0D0/ + DATA ITRUE2/0, 1, 2, 2, 3/ +* .. Executable Statements .. + DO 80 INCX = 1, 2 + DO 60 NP1 = 1, 5 + N = NP1 - 1 + LEN = 2*MAX(N,1) +* .. Set vector arguments .. + DO 20 I = 1, LEN + SX(I) = DV(I,NP1,INCX) + 20 CONTINUE +* + IF (ICASE.EQ.7) THEN +* .. DNRM2 .. + STEMP(1) = DTRUE1(NP1) + CALL STEST1(DNRM2(N,SX,INCX),STEMP,STEMP,SFAC) + ELSE IF (ICASE.EQ.8) THEN +* .. DASUM .. + STEMP(1) = DTRUE3(NP1) + CALL STEST1(DASUM(N,SX,INCX),STEMP,STEMP,SFAC) + ELSE IF (ICASE.EQ.9) THEN +* .. DSCAL .. + CALL DSCAL(N,SA((INCX-1)*5+NP1),SX,INCX) + DO 40 I = 1, LEN + STRUE(I) = DTRUE5(I,NP1,INCX) + 40 CONTINUE + CALL STEST(LEN,SX,STRUE,STRUE,SFAC) + ELSE IF (ICASE.EQ.10) THEN +* .. IDAMAX .. 
+ CALL ITEST1(IDAMAX(N,SX,INCX),ITRUE2(NP1)) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' + STOP + END IF + 60 CONTINUE + 80 CONTINUE + RETURN + END + SUBROUTINE CHECK2(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + DOUBLE PRECISION SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + DOUBLE PRECISION SA, SC, SS + INTEGER I, J, KI, KN, KSIZE, LENX, LENY, MX, MY +* .. Local Arrays .. + DOUBLE PRECISION DT10X(7,4,4), DT10Y(7,4,4), DT7(4,4), + + DT8(7,4,4), DT9X(7,4,4), DT9Y(7,4,4), DX1(7), + + DY1(7), SSIZE1(4), SSIZE2(14,2), STX(7), STY(7), + + SX(7), SY(7) + INTEGER INCXS(4), INCYS(4), LENS(4,2), NS(4) +* .. External Functions .. + DOUBLE PRECISION DDOT + EXTERNAL DDOT +* .. External Subroutines .. + EXTERNAL DAXPY, DCOPY, DSWAP, STEST, STEST1 +* .. Intrinsic Functions .. + INTRINSIC ABS, MIN +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA SA/0.3D0/ + DATA INCXS/1, 2, -2, -1/ + DATA INCYS/1, -2, 1, -2/ + DATA LENS/1, 1, 2, 4, 1, 1, 3, 7/ + DATA NS/0, 1, 2, 4/ + DATA DX1/0.6D0, 0.1D0, -0.5D0, 0.8D0, 0.9D0, -0.3D0, + + -0.4D0/ + DATA DY1/0.5D0, -0.9D0, 0.3D0, 0.7D0, -0.6D0, 0.2D0, + + 0.8D0/ + DATA SC, SS/0.8D0, 0.6D0/ + DATA DT7/0.0D0, 0.30D0, 0.21D0, 0.62D0, 0.0D0, + + 0.30D0, -0.07D0, 0.85D0, 0.0D0, 0.30D0, -0.79D0, + + -0.74D0, 0.0D0, 0.30D0, 0.33D0, 1.27D0/ + DATA DT8/0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.68D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.68D0, -0.87D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.68D0, -0.87D0, 0.15D0, + + 0.94D0, 0.0D0, 0.0D0, 0.0D0, 0.5D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.68D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.35D0, -0.9D0, 0.48D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.38D0, -0.9D0, 0.57D0, 0.7D0, -0.75D0, + + 0.2D0, 0.98D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.68D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.35D0, -0.72D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.38D0, + + -0.63D0, 0.15D0, 0.88D0, 0.0D0, 0.0D0, 0.0D0, + + 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.68D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.68D0, -0.9D0, 0.33D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.68D0, -0.9D0, 0.33D0, 0.7D0, + + -0.75D0, 0.2D0, 1.04D0/ + DATA DT9X/0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.78D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.78D0, -0.46D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.78D0, -0.46D0, -0.22D0, + + 1.06D0, 0.0D0, 0.0D0, 0.0D0, 0.6D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.78D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.66D0, 0.1D0, -0.1D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.96D0, 0.1D0, -0.76D0, 0.8D0, 0.90D0, + + -0.3D0, -0.02D0, 0.6D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.78D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, -0.06D0, 0.1D0, + + -0.1D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.90D0, + + 0.1D0, -0.22D0, 0.8D0, 0.18D0, -0.3D0, -0.02D0, + + 0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.78D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.78D0, 0.26D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.78D0, 0.26D0, -0.76D0, 1.12D0, + + 0.0D0, 0.0D0, 0.0D0/ + DATA DT9Y/0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.04D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.04D0, -0.78D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.04D0, -0.78D0, 0.54D0, + + 0.08D0, 0.0D0, 0.0D0, 0.0D0, 0.5D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.04D0, 
+ + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.7D0, + + -0.9D0, -0.12D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.64D0, -0.9D0, -0.30D0, 0.7D0, -0.18D0, 0.2D0, + + 0.28D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.04D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.7D0, -1.08D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.64D0, -1.26D0, + + 0.54D0, 0.20D0, 0.0D0, 0.0D0, 0.0D0, 0.5D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.04D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.04D0, -0.9D0, 0.18D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.04D0, -0.9D0, 0.18D0, 0.7D0, + + -0.18D0, 0.2D0, 0.16D0/ + DATA DT10X/0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.5D0, -0.9D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.5D0, -0.9D0, 0.3D0, 0.7D0, + + 0.0D0, 0.0D0, 0.0D0, 0.6D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.3D0, 0.1D0, 0.5D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.8D0, 0.1D0, -0.6D0, + + 0.8D0, 0.3D0, -0.3D0, 0.5D0, 0.6D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.5D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, -0.9D0, + + 0.1D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.7D0, + + 0.1D0, 0.3D0, 0.8D0, -0.9D0, -0.3D0, 0.5D0, + + 0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.5D0, 0.3D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.5D0, 0.3D0, -0.6D0, 0.8D0, 0.0D0, 0.0D0, + + 0.0D0/ + DATA DT10Y/0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.6D0, 0.1D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.6D0, 0.1D0, -0.5D0, 0.8D0, 0.0D0, + + 0.0D0, 0.0D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, -0.5D0, -0.9D0, 0.6D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, -0.4D0, -0.9D0, 0.9D0, + + 0.7D0, -0.5D0, 0.2D0, 0.6D0, 0.5D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.6D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, -0.5D0, + + 0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + -0.4D0, 0.9D0, -0.5D0, 0.6D0, 0.0D0, 0.0D0, + + 0.0D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.6D0, -0.9D0, 0.1D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.6D0, -0.9D0, 0.1D0, 0.7D0, + + -0.5D0, 0.2D0, 0.8D0/ + DATA SSIZE1/0.0D0, 0.3D0, 1.6D0, 3.2D0/ + DATA SSIZE2/0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, + + 1.17D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, + + 1.17D0, 1.17D0, 1.17D0/ +* .. Executable Statements .. +* + DO 120 KI = 1, 4 + INCX = INCXS(KI) + INCY = INCYS(KI) + MX = ABS(INCX) + MY = ABS(INCY) +* + DO 100 KN = 1, 4 + N = NS(KN) + KSIZE = MIN(2,KN) + LENX = LENS(KN,MX) + LENY = LENS(KN,MY) +* .. Initialize all argument arrays .. + DO 20 I = 1, 7 + SX(I) = DX1(I) + SY(I) = DY1(I) + 20 CONTINUE +* + IF (ICASE.EQ.1) THEN +* .. DDOT .. + CALL STEST1(DDOT(N,SX,INCX,SY,INCY),DT7(KN,KI),SSIZE1(KN) + + ,SFAC) + ELSE IF (ICASE.EQ.2) THEN +* .. DAXPY .. + CALL DAXPY(N,SA,SX,INCX,SY,INCY) + DO 40 J = 1, LENY + STY(J) = DT8(J,KN,KI) + 40 CONTINUE + CALL STEST(LENY,SY,STY,SSIZE2(1,KSIZE),SFAC) + ELSE IF (ICASE.EQ.5) THEN +* .. DCOPY .. + DO 60 I = 1, 7 + STY(I) = DT10Y(I,KN,KI) + 60 CONTINUE + CALL DCOPY(N,SX,INCX,SY,INCY) + CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0D0) + ELSE IF (ICASE.EQ.6) THEN +* .. DSWAP .. 
+ CALL DSWAP(N,SX,INCX,SY,INCY) + DO 80 I = 1, 7 + STX(I) = DT10X(I,KN,KI) + STY(I) = DT10Y(I,KN,KI) + 80 CONTINUE + CALL STEST(LENX,SX,STX,SSIZE2(1,1),1.0D0) + CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0D0) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' + STOP + END IF + 100 CONTINUE + 120 CONTINUE + RETURN + END + SUBROUTINE CHECK3(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + DOUBLE PRECISION SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + DOUBLE PRECISION SA, SC, SS + INTEGER I, K, KI, KN, KSIZE, LENX, LENY, MX, MY +* .. Local Arrays .. + DOUBLE PRECISION COPYX(5), COPYY(5), DT9X(7,4,4), DT9Y(7,4,4), + + DX1(7), DY1(7), MWPC(11), MWPS(11), MWPSTX(5), + + MWPSTY(5), MWPTX(11,5), MWPTY(11,5), MWPX(5), + + MWPY(5), SSIZE2(14,2), STX(7), STY(7), SX(7), + + SY(7) + INTEGER INCXS(4), INCYS(4), LENS(4,2), MWPINX(11), + + MWPINY(11), MWPN(11), NS(4) +* .. External Subroutines .. + EXTERNAL DROT, STEST +* .. Intrinsic Functions .. + INTRINSIC ABS, MIN +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA SA/0.3D0/ + DATA INCXS/1, 2, -2, -1/ + DATA INCYS/1, -2, 1, -2/ + DATA LENS/1, 1, 2, 4, 1, 1, 3, 7/ + DATA NS/0, 1, 2, 4/ + DATA DX1/0.6D0, 0.1D0, -0.5D0, 0.8D0, 0.9D0, -0.3D0, + + -0.4D0/ + DATA DY1/0.5D0, -0.9D0, 0.3D0, 0.7D0, -0.6D0, 0.2D0, + + 0.8D0/ + DATA SC, SS/0.8D0, 0.6D0/ + DATA DT9X/0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.78D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.78D0, -0.46D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.78D0, -0.46D0, -0.22D0, + + 1.06D0, 0.0D0, 0.0D0, 0.0D0, 0.6D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.78D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.66D0, 0.1D0, -0.1D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.96D0, 0.1D0, -0.76D0, 0.8D0, 0.90D0, + + -0.3D0, -0.02D0, 0.6D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.78D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, -0.06D0, 0.1D0, + + -0.1D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.90D0, + + 0.1D0, -0.22D0, 0.8D0, 0.18D0, -0.3D0, -0.02D0, + + 0.6D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.78D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.78D0, 0.26D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.78D0, 0.26D0, -0.76D0, 1.12D0, + + 0.0D0, 0.0D0, 0.0D0/ + DATA DT9Y/0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.04D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.04D0, -0.78D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.04D0, -0.78D0, 0.54D0, + + 0.08D0, 0.0D0, 0.0D0, 0.0D0, 0.5D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.04D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.7D0, + + -0.9D0, -0.12D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.64D0, -0.9D0, -0.30D0, 0.7D0, -0.18D0, 0.2D0, + + 0.28D0, 0.5D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.04D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.7D0, -1.08D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.64D0, -1.26D0, + + 0.54D0, 0.20D0, 0.0D0, 0.0D0, 0.0D0, 0.5D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.04D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.04D0, -0.9D0, 0.18D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.04D0, -0.9D0, 0.18D0, 0.7D0, + + -0.18D0, 0.2D0, 0.16D0/ + DATA SSIZE2/0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, 0.0D0, + + 0.0D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, + + 1.17D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, 1.17D0, + + 1.17D0, 1.17D0, 1.17D0/ +* .. Executable Statements .. 
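The DT9X and DT9Y tables above are precomputed outputs of the plane rotation x' = c*x + s*y, y' = c*y - s*x applied with the 3-4-5 pair c = 0.8, s = 0.6 (the SC, SS data) to the DX1/DY1 vectors under the various N and increment combinations. A quick stand-alone check of the first transformed pair, 0.78 and 0.04, written as an illustrative C program rather than code taken from the patch:

#include <stdio.h>

/* The DT9X/DT9Y tables hold results of the plane rotation
 *   x' = c*x + s*y,   y' = c*y - s*x
 * applied with c = 0.8, s = 0.6.  This reproduces the first transformed
 * pair from DX1(1) = 0.6, DY1(1) = 0.5, stored as 0.78 and 0.04. */
int main(void)
{
    const double c = 0.8, s = 0.6;
    const double x = 0.6, y = 0.5;

    double xr = c * x + s * y;   /* expected 0.78 */
    double yr = c * y - s * x;   /* expected 0.04 */

    printf("x' = %.2f, y' = %.2f\n", xr, yr);
    return 0;
}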
+* + DO 60 KI = 1, 4 + INCX = INCXS(KI) + INCY = INCYS(KI) + MX = ABS(INCX) + MY = ABS(INCY) +* + DO 40 KN = 1, 4 + N = NS(KN) + KSIZE = MIN(2,KN) + LENX = LENS(KN,MX) + LENY = LENS(KN,MY) +* + IF (ICASE.EQ.4) THEN +* .. DROT .. + DO 20 I = 1, 7 + SX(I) = DX1(I) + SY(I) = DY1(I) + STX(I) = DT9X(I,KN,KI) + STY(I) = DT9Y(I,KN,KI) + 20 CONTINUE + CALL DROT(N,SX,INCX,SY,INCY,SC,SS) + CALL STEST(LENX,SX,STX,SSIZE2(1,KSIZE),SFAC) + CALL STEST(LENY,SY,STY,SSIZE2(1,KSIZE),SFAC) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK3' + STOP + END IF + 40 CONTINUE + 60 CONTINUE +* + MWPC(1) = 1 + DO 80 I = 2, 11 + MWPC(I) = 0 + 80 CONTINUE + MWPS(1) = 0 + DO 100 I = 2, 6 + MWPS(I) = 1 + 100 CONTINUE + DO 120 I = 7, 11 + MWPS(I) = -1 + 120 CONTINUE + MWPINX(1) = 1 + MWPINX(2) = 1 + MWPINX(3) = 1 + MWPINX(4) = -1 + MWPINX(5) = 1 + MWPINX(6) = -1 + MWPINX(7) = 1 + MWPINX(8) = 1 + MWPINX(9) = -1 + MWPINX(10) = 1 + MWPINX(11) = -1 + MWPINY(1) = 1 + MWPINY(2) = 1 + MWPINY(3) = -1 + MWPINY(4) = -1 + MWPINY(5) = 2 + MWPINY(6) = 1 + MWPINY(7) = 1 + MWPINY(8) = -1 + MWPINY(9) = -1 + MWPINY(10) = 2 + MWPINY(11) = 1 + DO 140 I = 1, 11 + MWPN(I) = 5 + 140 CONTINUE + MWPN(5) = 3 + MWPN(10) = 3 + DO 160 I = 1, 5 + MWPX(I) = I + MWPY(I) = I + MWPTX(1,I) = I + MWPTY(1,I) = I + MWPTX(2,I) = I + MWPTY(2,I) = -I + MWPTX(3,I) = 6 - I + MWPTY(3,I) = I - 6 + MWPTX(4,I) = I + MWPTY(4,I) = -I + MWPTX(6,I) = 6 - I + MWPTY(6,I) = I - 6 + MWPTX(7,I) = -I + MWPTY(7,I) = I + MWPTX(8,I) = I - 6 + MWPTY(8,I) = 6 - I + MWPTX(9,I) = -I + MWPTY(9,I) = I + MWPTX(11,I) = I - 6 + MWPTY(11,I) = 6 - I + 160 CONTINUE + MWPTX(5,1) = 1 + MWPTX(5,2) = 3 + MWPTX(5,3) = 5 + MWPTX(5,4) = 4 + MWPTX(5,5) = 5 + MWPTY(5,1) = -1 + MWPTY(5,2) = 2 + MWPTY(5,3) = -2 + MWPTY(5,4) = 4 + MWPTY(5,5) = -3 + MWPTX(10,1) = -1 + MWPTX(10,2) = -3 + MWPTX(10,3) = -5 + MWPTX(10,4) = 4 + MWPTX(10,5) = 5 + MWPTY(10,1) = 1 + MWPTY(10,2) = 2 + MWPTY(10,3) = 2 + MWPTY(10,4) = 4 + MWPTY(10,5) = 3 + DO 200 I = 1, 11 + INCX = MWPINX(I) + INCY = MWPINY(I) + DO 180 K = 1, 5 + COPYX(K) = MWPX(K) + COPYY(K) = MWPY(K) + MWPSTX(K) = MWPTX(I,K) + MWPSTY(K) = MWPTY(I,K) + 180 CONTINUE + CALL DROT(MWPN(I),COPYX,INCX,COPYY,INCY,MWPC(I),MWPS(I)) + CALL STEST(5,COPYX,MWPSTX,MWPSTX,SFAC) + CALL STEST(5,COPYY,MWPSTY,MWPSTY,SFAC) + 200 CONTINUE + RETURN + END + SUBROUTINE STEST(LEN,SCOMP,STRUE,SSIZE,SFAC) +* ********************************* STEST ************************** +* +* THIS SUBR COMPARES ARRAYS SCOMP() AND STRUE() OF LENGTH LEN TO +* SEE IF THE TERM BY TERM DIFFERENCES, MULTIPLIED BY SFAC, ARE +* NEGLIGIBLE. +* +* C. L. LAWSON, JPL, 1974 DEC 10 +* +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + DOUBLE PRECISION SFAC + INTEGER LEN +* .. Array Arguments .. + DOUBLE PRECISION SCOMP(LEN), SSIZE(LEN), STRUE(LEN) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + DOUBLE PRECISION SD + INTEGER I +* .. External Functions .. + DOUBLE PRECISION SDIFF + EXTERNAL SDIFF +* .. Intrinsic Functions .. + INTRINSIC ABS +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Executable Statements .. +* + DO 40 I = 1, LEN + SD = SCOMP(I) - STRUE(I) + IF (SDIFF(ABS(SSIZE(I))+ABS(SFAC*SD),ABS(SSIZE(I))).EQ.0.0D0) + + GO TO 40 +* +* HERE SCOMP(I) IS NOT CLOSE TO STRUE(I). +* + IF ( .NOT. PASS) GO TO 20 +* PRINT FAIL MESSAGE AND HEADER. + PASS = .FALSE. 
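The comparison in STEST above relies on a floating-point idiom: the scaled difference SFAC*SD is declared negligible exactly when adding its magnitude to the reference size ABS(SSIZE(I)) leaves that size unchanged in working precision, with the subtraction routed through the separately compiled SDIFF, presumably so the test cannot be folded away at compile time. A minimal C rendering of the same idea, with illustrative names only:

#include <math.h>
#include <stdio.h>

/* STEST-style negligibility test: the scaled difference sfac*(comp - true)
 * is negligible when adding its magnitude to the reference size |size|
 * does not change |size| in working precision.  The volatile temporaries
 * stand in for the out-of-line SDIFF call. */
static int negligible(double comp, double truev, double size, double sfac)
{
    volatile double a = fabs(size) + fabs(sfac * (comp - truev));
    volatile double b = fabs(size);
    return (a - b) == 0.0;
}

int main(void)
{
    const double sfac = 9.765625e-4;     /* the SFAC used by DBLAT1 */
    /* A difference of 1e-14 on a size-1.17 reference is negligible ... */
    printf("%d\n", negligible(0.68 + 1e-14, 0.68, 1.17, sfac));
    /* ... but with size 0 only an exactly zero difference passes. */
    printf("%d\n", negligible(0.68 + 1e-14, 0.68, 0.0, sfac));
    return 0;
}

With SSIZE(I) equal to zero the criterion collapses to requiring an exactly zero difference.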
+ WRITE (NOUT,99999) + WRITE (NOUT,99998) + 20 WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, I, SCOMP(I), + + STRUE(I), SD, SSIZE(I) + 40 CONTINUE + RETURN +* +99999 FORMAT (' FAIL') +99998 FORMAT (/' CASE N INCX INCY MODE I ', + + ' COMP(I) TRUE(I) DIFFERENCE', + + ' SIZE(I)',/1X) +99997 FORMAT (1X,I4,I3,3I5,I3,2D36.8,2D12.4) + END + SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) +* ************************* STEST1 ***************************** +* +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE +* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. +* +* C.L. LAWSON, JPL, 1978 DEC 6 +* +* .. Scalar Arguments .. + DOUBLE PRECISION SCOMP1, SFAC, STRUE1 +* .. Array Arguments .. + DOUBLE PRECISION SSIZE(*) +* .. Local Arrays .. + DOUBLE PRECISION SCOMP(1), STRUE(1) +* .. External Subroutines .. + EXTERNAL STEST +* .. Executable Statements .. +* + SCOMP(1) = SCOMP1 + STRUE(1) = STRUE1 + CALL STEST(1,SCOMP,STRUE,SSIZE,SFAC) +* + RETURN + END + DOUBLE PRECISION FUNCTION SDIFF(SA,SB) +* ********************************* SDIFF ************************** +* COMPUTES DIFFERENCE OF TWO NUMBERS. C. L. LAWSON, JPL 1974 FEB 15 +* +* .. Scalar Arguments .. + DOUBLE PRECISION SA, SB +* .. Executable Statements .. + SDIFF = SA - SB + RETURN + END + SUBROUTINE ITEST1(ICOMP,ITRUE) +* ********************************* ITEST1 ************************* +* +* THIS SUBROUTINE COMPARES THE VARIABLES ICOMP AND ITRUE FOR +* EQUALITY. +* C. L. LAWSON, JPL, 1974 DEC 10 +* +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + INTEGER ICOMP, ITRUE +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + INTEGER ID +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Executable Statements .. +* + IF (ICOMP.EQ.ITRUE) GO TO 40 +* +* HERE ICOMP IS NOT EQUAL TO ITRUE. +* + IF ( .NOT. PASS) GO TO 20 +* PRINT FAIL MESSAGE AND HEADER. + PASS = .FALSE. + WRITE (NOUT,99999) + WRITE (NOUT,99998) + 20 ID = ICOMP - ITRUE + WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, ICOMP, ITRUE, ID + 40 CONTINUE + RETURN +* +99999 FORMAT (' FAIL') +99998 FORMAT (/' CASE N INCX INCY MODE ', + + ' COMP TRUE DIFFERENCE', + + /1X) +99997 FORMAT (1X,I4,I3,3I5,2I36,I12) + END diff --git a/test/dblat2.dat b/test/dblat2.dat new file mode 100644 index 0000000000..2680425973 --- /dev/null +++ b/test/dblat2.dat @@ -0,0 +1,34 @@ +'DBLAT2.SUMM' NAME OF SUMMARY OUTPUT FILE +6 UNIT NUMBER OF SUMMARY FILE +'DBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE +-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO TEST ERROR EXITS. +16.0 THRESHOLD VALUE OF TEST RATIO +7 NUMBER OF VALUES OF N +0 1 2 3 7 31 63 VALUES OF N +4 NUMBER OF VALUES OF K +0 1 2 4 VALUES OF K +4 NUMBER OF VALUES OF INCX AND INCY +1 2 -1 -2 VALUES OF INCX AND INCY +3 NUMBER OF VALUES OF ALPHA +0.0 1.0 0.7 VALUES OF ALPHA +3 NUMBER OF VALUES OF BETA +0.0 1.0 0.9 VALUES OF BETA +DGEMV T PUT F FOR NO TEST. SAME COLUMNS. +DGBMV T PUT F FOR NO TEST. SAME COLUMNS. +DSYMV T PUT F FOR NO TEST. SAME COLUMNS. +DSBMV T PUT F FOR NO TEST. SAME COLUMNS. +DSPMV T PUT F FOR NO TEST. SAME COLUMNS. +DTRMV T PUT F FOR NO TEST. SAME COLUMNS. +DTBMV T PUT F FOR NO TEST. SAME COLUMNS. +DTPMV T PUT F FOR NO TEST. SAME COLUMNS. +DTRSV T PUT F FOR NO TEST. SAME COLUMNS. +DTBSV T PUT F FOR NO TEST. SAME COLUMNS. 
+DTPSV T PUT F FOR NO TEST. SAME COLUMNS. +DGER T PUT F FOR NO TEST. SAME COLUMNS. +DSYR T PUT F FOR NO TEST. SAME COLUMNS. +DSPR T PUT F FOR NO TEST. SAME COLUMNS. +DSYR2 T PUT F FOR NO TEST. SAME COLUMNS. +DSPR2 T PUT F FOR NO TEST. SAME COLUMNS. diff --git a/test/dblat2.f b/test/dblat2.f new file mode 100644 index 0000000000..4002d43689 --- /dev/null +++ b/test/dblat2.f @@ -0,0 +1,3138 @@ + PROGRAM DBLAT2 +* +* Test program for the DOUBLE PRECISION Level 2 Blas. +* +* The program must be driven by a short data file. The first 18 records +* of the file are read using list-directed input, the last 16 records +* are read using the format ( A6, L2 ). An annotated example of a data +* file can be obtained by deleting the first 3 characters from the +* following 34 lines: +* 'DBLAT2.SUMM' NAME OF SUMMARY OUTPUT FILE +* 6 UNIT NUMBER OF SUMMARY FILE +* 'DBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE +* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +* F LOGICAL FLAG, T TO STOP ON FAILURES. +* T LOGICAL FLAG, T TO TEST ERROR EXITS. +* 16.0 THRESHOLD VALUE OF TEST RATIO +* 6 NUMBER OF VALUES OF N +* 0 1 2 3 5 9 VALUES OF N +* 4 NUMBER OF VALUES OF K +* 0 1 2 4 VALUES OF K +* 4 NUMBER OF VALUES OF INCX AND INCY +* 1 2 -1 -2 VALUES OF INCX AND INCY +* 3 NUMBER OF VALUES OF ALPHA +* 0.0 1.0 0.7 VALUES OF ALPHA +* 3 NUMBER OF VALUES OF BETA +* 0.0 1.0 0.9 VALUES OF BETA +* DGEMV T PUT F FOR NO TEST. SAME COLUMNS. +* DGBMV T PUT F FOR NO TEST. SAME COLUMNS. +* DSYMV T PUT F FOR NO TEST. SAME COLUMNS. +* DSBMV T PUT F FOR NO TEST. SAME COLUMNS. +* DSPMV T PUT F FOR NO TEST. SAME COLUMNS. +* DTRMV T PUT F FOR NO TEST. SAME COLUMNS. +* DTBMV T PUT F FOR NO TEST. SAME COLUMNS. +* DTPMV T PUT F FOR NO TEST. SAME COLUMNS. +* DTRSV T PUT F FOR NO TEST. SAME COLUMNS. +* DTBSV T PUT F FOR NO TEST. SAME COLUMNS. +* DTPSV T PUT F FOR NO TEST. SAME COLUMNS. +* DGER T PUT F FOR NO TEST. SAME COLUMNS. +* DSYR T PUT F FOR NO TEST. SAME COLUMNS. +* DSPR T PUT F FOR NO TEST. SAME COLUMNS. +* DSYR2 T PUT F FOR NO TEST. SAME COLUMNS. +* DSPR2 T PUT F FOR NO TEST. SAME COLUMNS. +* +* See: +* +* Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. +* An extended set of Fortran Basic Linear Algebra Subprograms. +* +* Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics +* and Computer Science Division, Argonne National Laboratory, +* 9700 South Cass Avenue, Argonne, Illinois 60439, US. +* +* Or +* +* NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms +* Group Ltd., NAG Central Office, 256 Banbury Road, Oxford +* OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st +* Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. +* +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + INTEGER NIN + PARAMETER ( NIN = 5 ) + INTEGER NSUBS + PARAMETER ( NSUBS = 16 ) + DOUBLE PRECISION ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 ) + INTEGER NMAX, INCMAX + PARAMETER ( NMAX = 65, INCMAX = 2 ) + INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX + PARAMETER ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7, + $ NALMAX = 7, NBEMAX = 7 ) +* .. Local Scalars .. + DOUBLE PRECISION EPS, ERR, THRESH + INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NINC, NKB, + $ NOUT, NTRA + LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, + $ TSTERR + CHARACTER*1 TRANS + CHARACTER*6 SNAMET + CHARACTER*32 SNAPS, SUMMRY +* .. Local Arrays .. 
+ DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), + $ ALF( NALMAX ), AS( NMAX*NMAX ), BET( NBEMAX ), + $ G( NMAX ), X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( 2*NMAX ) + INTEGER IDIM( NIDMAX ), INC( NINMAX ), KB( NKBMAX ) + LOGICAL LTEST( NSUBS ) + CHARACTER*6 SNAMES( NSUBS ) +* .. External Functions .. + DOUBLE PRECISION DDIFF + LOGICAL LDE + EXTERNAL DDIFF, LDE +* .. External Subroutines .. + EXTERNAL DCHK1, DCHK2, DCHK3, DCHK4, DCHK5, DCHK6, + $ DCHKE, DMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK + CHARACTER*6 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR + COMMON /SRNAMC/SRNAMT +* .. Data statements .. + DATA SNAMES/'DGEMV ', 'DGBMV ', 'DSYMV ', 'DSBMV ', + $ 'DSPMV ', 'DTRMV ', 'DTBMV ', 'DTPMV ', + $ 'DTRSV ', 'DTBSV ', 'DTPSV ', 'DGER ', + $ 'DSYR ', 'DSPR ', 'DSYR2 ', 'DSPR2 '/ +* .. Executable Statements .. +* +* Read name and unit number for summary output file and open file. +* + READ( NIN, FMT = * )SUMMRY + READ( NIN, FMT = * )NOUT + OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + NOUTC = NOUT +* +* Read name and unit number for snapshot output file and open file. +* + READ( NIN, FMT = * )SNAPS + READ( NIN, FMT = * )NTRA + TRACE = NTRA.GE.0 + IF( TRACE )THEN + OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + END IF +* Read the flag that directs rewinding of the snapshot file. + READ( NIN, FMT = * )REWI + REWI = REWI.AND.TRACE +* Read the flag that directs stopping on any failure. + READ( NIN, FMT = * )SFATAL +* Read the flag that indicates whether error exits are to be tested. + READ( NIN, FMT = * )TSTERR +* Read the threshold value of the test ratio + READ( NIN, FMT = * )THRESH +* +* Read and check the parameter values for the tests. +* +* Values of N + READ( NIN, FMT = * )NIDIM + IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN + WRITE( NOUT, FMT = 9997 )'N', NIDMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) + DO 10 I = 1, NIDIM + IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN + WRITE( NOUT, FMT = 9996 )NMAX + GO TO 230 + END IF + 10 CONTINUE +* Values of K + READ( NIN, FMT = * )NKB + IF( NKB.LT.1.OR.NKB.GT.NKBMAX )THEN + WRITE( NOUT, FMT = 9997 )'K', NKBMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( KB( I ), I = 1, NKB ) + DO 20 I = 1, NKB + IF( KB( I ).LT.0 )THEN + WRITE( NOUT, FMT = 9995 ) + GO TO 230 + END IF + 20 CONTINUE +* Values of INCX and INCY + READ( NIN, FMT = * )NINC + IF( NINC.LT.1.OR.NINC.GT.NINMAX )THEN + WRITE( NOUT, FMT = 9997 )'INCX AND INCY', NINMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( INC( I ), I = 1, NINC ) + DO 30 I = 1, NINC + IF( INC( I ).EQ.0.OR.ABS( INC( I ) ).GT.INCMAX )THEN + WRITE( NOUT, FMT = 9994 )INCMAX + GO TO 230 + END IF + 30 CONTINUE +* Values of ALPHA + READ( NIN, FMT = * )NALF + IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN + WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) +* Values of BETA + READ( NIN, FMT = * )NBET + IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN + WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) +* +* Report values of parameters. 
+* + WRITE( NOUT, FMT = 9993 ) + WRITE( NOUT, FMT = 9992 )( IDIM( I ), I = 1, NIDIM ) + WRITE( NOUT, FMT = 9991 )( KB( I ), I = 1, NKB ) + WRITE( NOUT, FMT = 9990 )( INC( I ), I = 1, NINC ) + WRITE( NOUT, FMT = 9989 )( ALF( I ), I = 1, NALF ) + WRITE( NOUT, FMT = 9988 )( BET( I ), I = 1, NBET ) + IF( .NOT.TSTERR )THEN + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9980 ) + END IF + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9999 )THRESH + WRITE( NOUT, FMT = * ) +* +* Read names of subroutines and flags which indicate +* whether they are to be tested. +* + DO 40 I = 1, NSUBS + LTEST( I ) = .FALSE. + 40 CONTINUE + 50 READ( NIN, FMT = 9984, END = 80 )SNAMET, LTESTT + DO 60 I = 1, NSUBS + IF( SNAMET.EQ.SNAMES( I ) ) + $ GO TO 70 + 60 CONTINUE + WRITE( NOUT, FMT = 9986 )SNAMET + STOP + 70 LTEST( I ) = LTESTT + GO TO 50 +* + 80 CONTINUE + CLOSE ( NIN ) +* +* Compute EPS (the machine precision). +* + EPS = ONE + 90 CONTINUE + IF( DDIFF( ONE + EPS, ONE ).EQ.ZERO ) + $ GO TO 100 + EPS = HALF*EPS + GO TO 90 + 100 CONTINUE + EPS = EPS + EPS + WRITE( NOUT, FMT = 9998 )EPS +* +* Check the reliability of DMVCH using exact data. +* + N = MIN( 32, NMAX ) + DO 120 J = 1, N + DO 110 I = 1, N + A( I, J ) = MAX( I - J + 1, 0 ) + 110 CONTINUE + X( J ) = J + Y( J ) = ZERO + 120 CONTINUE + DO 130 J = 1, N + YY( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 + 130 CONTINUE +* YY holds the exact result. On exit from DMVCH YT holds +* the result computed by DMVCH. + TRANS = 'N' + CALL DMVCH( TRANS, N, N, ONE, A, NMAX, X, 1, ZERO, Y, 1, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LDE( YY, YT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR + STOP + END IF + TRANS = 'T' + CALL DMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LDE( YY, YT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR + STOP + END IF +* +* Test each subroutine in turn. +* + DO 210 ISNUM = 1, NSUBS + WRITE( NOUT, FMT = * ) + IF( .NOT.LTEST( ISNUM ) )THEN +* Subprogram is not to be tested. + WRITE( NOUT, FMT = 9983 )SNAMES( ISNUM ) + ELSE + SRNAMT = SNAMES( ISNUM ) +* Test error exits. + IF( TSTERR )THEN + CALL DCHKE( ISNUM, SNAMES( ISNUM ), NOUT ) + WRITE( NOUT, FMT = * ) + END IF +* Test computations. + INFOT = 0 + OK = .TRUE. + FATAL = .FALSE. + GO TO ( 140, 140, 150, 150, 150, 160, 160, + $ 160, 160, 160, 160, 170, 180, 180, + $ 190, 190 )ISNUM +* Test DGEMV, 01, and DGBMV, 02. + 140 CALL DCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G ) + GO TO 200 +* Test DSYMV, 03, DSBMV, 04, and DSPMV, 05. + 150 CALL DCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G ) + GO TO 200 +* Test DTRMV, 06, DTBMV, 07, DTPMV, 08, +* DTRSV, 09, DTBSV, 10, and DTPSV, 11. + 160 CALL DCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z ) + GO TO 200 +* Test DGER, 12. + 170 CALL DCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z ) + GO TO 200 +* Test DSYR, 13, and DSPR, 14. 
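The reliability check above hands DMVCH data whose product is known exactly: with A(I,J) = MAX(I - J + 1, 0) and X(J) = J, the I-th entry of A*X is the sum of (I - J + 1)*J for J = 1..I, which equals I*(I + 1)*(I + 2)/6, and the slightly disguised expression stored in YY(J) evaluates to the same value. A short C check of that identity, written against the formula as it appears in the Fortran (illustrative code, not part of the patch):

#include <stdio.h>
#include <stdlib.h>

/* Verify that the YY(J) formula used for the DMVCH reliability check,
 *   j*((j+1)*j)/2 - ((j+1)*j*(j-1))/3,
 * agrees with the closed form j*(j+1)*(j+2)/6 and with an explicit
 * product of A(i,j) = max(i-j+1, 0) against x(j) = j.  All quantities
 * stay small integers, so the comparison is exact. */
int main(void)
{
    const int n = 32;
    for (int i = 1; i <= n; i++) {
        long coded  = (long)i * ((i + 1) * i) / 2
                    - (long)(i + 1) * i * (i - 1) / 3;
        long closed = (long)i * (i + 1) * (i + 2) / 6;

        long direct = 0;                 /* (A*x)_i computed element by element */
        for (int j = 1; j <= n; j++) {
            int aij = i - j + 1 > 0 ? i - j + 1 : 0;
            direct += (long)aij * j;
        }
        if (coded != closed || coded != direct) {
            printf("mismatch at i = %d\n", i);
            return EXIT_FAILURE;
        }
    }
    printf("YY formula matches A*x for n = %d\n", n);
    return EXIT_SUCCESS;
}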
+ 180 CALL DCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z ) + GO TO 200 +* Test DSYR2, 15, and DSPR2, 16. + 190 CALL DCHK6( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z ) +* + 200 IF( FATAL.AND.SFATAL ) + $ GO TO 220 + END IF + 210 CONTINUE + WRITE( NOUT, FMT = 9982 ) + GO TO 240 +* + 220 CONTINUE + WRITE( NOUT, FMT = 9981 ) + GO TO 240 +* + 230 CONTINUE + WRITE( NOUT, FMT = 9987 ) +* + 240 CONTINUE + IF( TRACE ) + $ CLOSE ( NTRA ) + CLOSE ( NOUT ) + STOP +* + 9999 FORMAT( ' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', + $ 'S THAN', F8.2 ) + 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, D9.1 ) + 9997 FORMAT( ' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', + $ 'THAN ', I2 ) + 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) + 9995 FORMAT( ' VALUE OF K IS LESS THAN 0' ) + 9994 FORMAT( ' ABSOLUTE VALUE OF INCX OR INCY IS 0 OR GREATER THAN ', + $ I2 ) + 9993 FORMAT( ' TESTS OF THE DOUBLE PRECISION LEVEL 2 BLAS', //' THE F', + $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) + 9992 FORMAT( ' FOR N ', 9I6 ) + 9991 FORMAT( ' FOR K ', 7I6 ) + 9990 FORMAT( ' FOR INCX AND INCY ', 7I6 ) + 9989 FORMAT( ' FOR ALPHA ', 7F6.1 ) + 9988 FORMAT( ' FOR BETA ', 7F6.1 ) + 9987 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', + $ /' ******* TESTS ABANDONED *******' ) + 9986 FORMAT( ' SUBPROGRAM NAME ', A6, ' NOT RECOGNIZED', /' ******* T', + $ 'ESTS ABANDONED *******' ) + 9985 FORMAT( ' ERROR IN DMVCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', + $ 'ATED WRONGLY.', /' DMVCH WAS CALLED WITH TRANS = ', A1, + $ ' AND RETURNED SAME = ', L1, ' AND ERR = ', F12.3, '.', / + $ ' THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.' + $ , /' ******* TESTS ABANDONED *******' ) + 9984 FORMAT( A6, L2 ) + 9983 FORMAT( 1X, A6, ' WAS NOT TESTED' ) + 9982 FORMAT( /' END OF TESTS' ) + 9981 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) + 9980 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) +* +* End of DBLAT2. +* + END + SUBROUTINE DCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, + $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, + $ XS, Y, YY, YS, YT, G ) +* +* Tests DGEMV and DGBMV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + DOUBLE PRECISION ZERO, HALF + PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, + $ NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), BET( NBET ), G( NMAX ), + $ X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. 
+ DOUBLE PRECISION ALPHA, ALS, BETA, BLS, ERR, ERRMAX, TRANSL + INTEGER I, IA, IB, IC, IKU, IM, IN, INCX, INCXS, INCY, + $ INCYS, IX, IY, KL, KLS, KU, KUS, LAA, LDA, + $ LDAS, LX, LY, M, ML, MS, N, NARGS, NC, ND, NK, + $ NL, NS + LOGICAL BANDED, FULL, NULL, RESET, SAME, TRAN + CHARACTER*1 TRANS, TRANSS + CHARACTER*3 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. + EXTERNAL DGBMV, DGEMV, DMAKE, DMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICH/'NTC'/ +* .. Executable Statements .. + FULL = SNAME( 3: 3 ).EQ.'E' + BANDED = SNAME( 3: 3 ).EQ.'B' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 11 + ELSE IF( BANDED )THEN + NARGS = 13 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 120 IN = 1, NIDIM + N = IDIM( IN ) + ND = N/2 + 1 +* + DO 110 IM = 1, 2 + IF( IM.EQ.1 ) + $ M = MAX( N - ND, 0 ) + IF( IM.EQ.2 ) + $ M = MIN( N + ND, NMAX ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IKU = 1, NK + IF( BANDED )THEN + KU = KB( IKU ) + KL = MAX( KU - 1, 0 ) + ELSE + KU = N - 1 + KL = M - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = KL + KU + 1 + ELSE + LDA = M + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + LAA = LDA*N + NULL = N.LE.0.OR.M.LE.0 +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL DMAKE( SNAME( 2: 3 ), ' ', ' ', M, N, A, NMAX, AA, + $ LDA, KL, KU, RESET, TRANSL ) +* + DO 90 IC = 1, 3 + TRANS = ICH( IC: IC ) + TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' +* + IF( TRAN )THEN + ML = N + NL = M + ELSE + ML = M + NL = N + END IF +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*NL +* +* Generate the vector X. +* + TRANSL = HALF + CALL DMAKE( 'GE', ' ', ' ', 1, NL, X, 1, XX, + $ ABS( INCX ), 0, NL - 1, RESET, TRANSL ) + IF( NL.GT.1 )THEN + X( NL/2 ) = ZERO + XX( 1 + ABS( INCX )*( NL/2 - 1 ) ) = ZERO + END IF +* + DO 70 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*ML +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL DMAKE( 'GE', ' ', ' ', 1, ML, Y, 1, + $ YY, ABS( INCY ), 0, ML - 1, + $ RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + TRANSS = TRANS + MS = M + NS = N + KLS = KL + KUS = KU + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + BLS = BETA + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ TRANS, M, N, ALPHA, LDA, INCX, BETA, + $ INCY + IF( REWI ) + $ REWIND NTRA + CALL DGEMV( TRANS, M, N, ALPHA, AA, + $ LDA, XX, INCX, BETA, YY, + $ INCY ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ TRANS, M, N, KL, KU, ALPHA, LDA, + $ INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL DGBMV( TRANS, M, N, KL, KU, ALPHA, + $ AA, LDA, XX, INCX, BETA, + $ YY, INCY ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 130 + END IF +* +* See what data changed inside subroutines. 
+* + ISAME( 1 ) = TRANS.EQ.TRANSS + ISAME( 2 ) = MS.EQ.M + ISAME( 3 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 4 ) = ALS.EQ.ALPHA + ISAME( 5 ) = LDE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + ISAME( 7 ) = LDE( XS, XX, LX ) + ISAME( 8 ) = INCXS.EQ.INCX + ISAME( 9 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 10 ) = LDE( YS, YY, LY ) + ELSE + ISAME( 10 ) = LDERES( 'GE', ' ', 1, + $ ML, YS, YY, + $ ABS( INCY ) ) + END IF + ISAME( 11 ) = INCYS.EQ.INCY + ELSE IF( BANDED )THEN + ISAME( 4 ) = KLS.EQ.KL + ISAME( 5 ) = KUS.EQ.KU + ISAME( 6 ) = ALS.EQ.ALPHA + ISAME( 7 ) = LDE( AS, AA, LAA ) + ISAME( 8 ) = LDAS.EQ.LDA + ISAME( 9 ) = LDE( XS, XX, LX ) + ISAME( 10 ) = INCXS.EQ.INCX + ISAME( 11 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 12 ) = LDE( YS, YY, LY ) + ELSE + ISAME( 12 ) = LDERES( 'GE', ' ', 1, + $ ML, YS, YY, + $ ABS( INCY ) ) + END IF + ISAME( 13 ) = INCYS.EQ.INCY + END IF +* +* If data was incorrectly changed, report +* and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 130 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL DMVCH( TRANS, M, N, ALPHA, A, + $ NMAX, X, INCX, BETA, Y, + $ INCY, YT, G, YY, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 130 + ELSE +* Avoid repeating tests with M.le.0 or +* N.le.0. + GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 140 +* + 130 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, TRANS, M, N, ALPHA, LDA, + $ INCX, BETA, INCY + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, TRANS, M, N, KL, KU, + $ ALPHA, LDA, INCX, BETA, INCY + END IF +* + 140 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', 4( I3, ',' ), F4.1, + $ ', A,', I3, ', X,', I2, ',', F4.1, ', Y,', I2, ') .' ) + 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', 2( I3, ',' ), F4.1, + $ ', A,', I3, ', X,', I2, ',', F4.1, ', Y,', I2, + $ ') .' ) + 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK1. +* + END + SUBROUTINE DCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, + $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, + $ XS, Y, YY, YS, YT, G ) +* +* Tests DSYMV, DSBMV and DSPMV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + DOUBLE PRECISION ZERO, HALF + PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0 ) +* .. Scalar Arguments .. 
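Besides checking the numerical result, DCHK1 above (and each DCHK routine that follows) verifies that a call leaves its read-only arguments untouched: every scalar and array is copied before the call, and afterwards each copy is compared with the live argument, with LDE requiring bit-for-bit equality and LDERES allowing only the referenced part of Y to change. Roughly the same discipline, sketched in self-contained C around a stand-in routine; gemv_stub and the other names here are placeholders, not the library's API.

#include <stdio.h>
#include <string.h>

/* Stand-in for the routine under test; the real drivers call DGEMV,
 * DGBMV, ... here.  y := alpha*A*x + beta*y with column-major A. */
static void gemv_stub(int m, int n, double alpha, const double *a, int lda,
                      const double *x, double beta, double *y)
{
    for (int i = 0; i < m; i++) {
        double t = 0.0;
        for (int j = 0; j < n; j++)
            t += a[i + (size_t)j * lda] * x[j];
        y[i] = alpha * t + beta * y[i];
    }
}

int main(void)
{
    enum { M = 3, N = 2, LDA = 4 };          /* LDA > M, as in the drivers */
    double a[LDA * N] = { 1, 2, 3, 99,       /* 99s pad the unused rows    */
                          4, 5, 6, 99 };
    double x[N] = { 1, -1 };
    double y[M] = { 0.5, 0.5, 0.5 };

    /* Save every input before the call, exactly as DCHK1 does. */
    double as[LDA * N], xs[N], ys[M];
    memcpy(as, a, sizeof a);
    memcpy(xs, x, sizeof x);
    memcpy(ys, y, sizeof y);

    gemv_stub(M, N, 2.0, a, LDA, x, 0.0, y);

    /* A and X must be bit-for-bit unchanged; only Y may differ. */
    if (memcmp(as, a, sizeof a) || memcmp(xs, x, sizeof x))
        printf("FATAL: an input argument was changed incorrectly\n");
    else
        printf("inputs preserved, y = %.1f %.1f %.1f\n", y[0], y[1], y[2]);
    return 0;
}

DCHK1 is stricter still: for strided vectors it also checks, through LDERES, that the gap elements between referenced entries of Y are untouched.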
+ DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, + $ NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), BET( NBET ), G( NMAX ), + $ X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + DOUBLE PRECISION ALPHA, ALS, BETA, BLS, ERR, ERRMAX, TRANSL + INTEGER I, IA, IB, IC, IK, IN, INCX, INCXS, INCY, + $ INCYS, IX, IY, K, KS, LAA, LDA, LDAS, LX, LY, + $ N, NARGS, NC, NK, NS + LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME + CHARACTER*1 UPLO, UPLOS + CHARACTER*2 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. + EXTERNAL DMAKE, DMVCH, DSBMV, DSPMV, DSYMV +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 3: 3 ).EQ.'Y' + BANDED = SNAME( 3: 3 ).EQ.'B' + PACKED = SNAME( 3: 3 ).EQ.'P' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 10 + ELSE IF( BANDED )THEN + NARGS = 11 + ELSE IF( PACKED )THEN + NARGS = 9 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 110 IN = 1, NIDIM + N = IDIM( IN ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IK = 1, NK + IF( BANDED )THEN + K = KB( IK ) + ELSE + K = N - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = K + 1 + ELSE + LDA = N + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF + NULL = N.LE.0 +* + DO 90 IC = 1, 2 + UPLO = ICH( IC: IC ) +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL DMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, A, NMAX, AA, + $ LDA, K, K, RESET, TRANSL ) +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL DMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, + $ ABS( INCX ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 70 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL DMAKE( 'GE', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, + $ TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + UPLOS = UPLO + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + BLS = BETA + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. 
+* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ UPLO, N, ALPHA, LDA, INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL DSYMV( UPLO, N, ALPHA, AA, LDA, XX, + $ INCX, BETA, YY, INCY ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ UPLO, N, K, ALPHA, LDA, INCX, BETA, + $ INCY + IF( REWI ) + $ REWIND NTRA + CALL DSBMV( UPLO, N, K, ALPHA, AA, LDA, + $ XX, INCX, BETA, YY, INCY ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ UPLO, N, ALPHA, INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL DSPMV( UPLO, N, ALPHA, AA, XX, INCX, + $ BETA, YY, INCY ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LDE( AS, AA, LAA ) + ISAME( 5 ) = LDAS.EQ.LDA + ISAME( 6 ) = LDE( XS, XX, LX ) + ISAME( 7 ) = INCXS.EQ.INCX + ISAME( 8 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 9 ) = LDE( YS, YY, LY ) + ELSE + ISAME( 9 ) = LDERES( 'GE', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 10 ) = INCYS.EQ.INCY + ELSE IF( BANDED )THEN + ISAME( 3 ) = KS.EQ.K + ISAME( 4 ) = ALS.EQ.ALPHA + ISAME( 5 ) = LDE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + ISAME( 7 ) = LDE( XS, XX, LX ) + ISAME( 8 ) = INCXS.EQ.INCX + ISAME( 9 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 10 ) = LDE( YS, YY, LY ) + ELSE + ISAME( 10 ) = LDERES( 'GE', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 11 ) = INCYS.EQ.INCY + ELSE IF( PACKED )THEN + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LDE( AS, AA, LAA ) + ISAME( 5 ) = LDE( XS, XX, LX ) + ISAME( 6 ) = INCXS.EQ.INCX + ISAME( 7 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 8 ) = LDE( YS, YY, LY ) + ELSE + ISAME( 8 ) = LDERES( 'GE', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 9 ) = INCYS.EQ.INCY + END IF +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL DMVCH( 'N', N, N, ALPHA, A, NMAX, X, + $ INCX, BETA, Y, INCY, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 120 + ELSE +* Avoid repeating tests with N.le.0 + GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, N, ALPHA, LDA, INCX, + $ BETA, INCY + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, N, K, ALPHA, LDA, + $ INCX, BETA, INCY + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, UPLO, N, ALPHA, INCX, + $ BETA, INCY + END IF +* + 130 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', AP', + $ ', X,', I2, ',', F4.1, ', Y,', I2, ') .' ) + 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', 2( I3, ',' ), F4.1, + $ ', A,', I3, ', X,', I2, ',', F4.1, ', Y,', I2, + $ ') .' ) + 9993 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', A,', + $ I3, ', X,', I2, ',', F4.1, ', Y,', I2, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK2. +* + END + SUBROUTINE DCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, XT, G, Z ) +* +* Tests DTRMV, DTBMV, DTPMV, DTRSV, DTBSV and DTPSV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + DOUBLE PRECISION ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NIDIM, NINC, NKB, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), + $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), + $ XS( NMAX*INCMAX ), XT( NMAX ), + $ XX( NMAX*INCMAX ), Z( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + DOUBLE PRECISION ERR, ERRMAX, TRANSL + INTEGER I, ICD, ICT, ICU, IK, IN, INCX, INCXS, IX, K, + $ KS, LAA, LDA, LDAS, LX, N, NARGS, NC, NK, NS + LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME + CHARACTER*1 DIAG, DIAGS, TRANS, TRANSS, UPLO, UPLOS + CHARACTER*2 ICHD, ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. + EXTERNAL DMAKE, DMVCH, DTBMV, DTBSV, DTPMV, DTPSV, + $ DTRMV, DTRSV +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/ +* .. Executable Statements .. + FULL = SNAME( 3: 3 ).EQ.'R' + BANDED = SNAME( 3: 3 ).EQ.'B' + PACKED = SNAME( 3: 3 ).EQ.'P' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 8 + ELSE IF( BANDED )THEN + NARGS = 9 + ELSE IF( PACKED )THEN + NARGS = 7 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* Set up zero vector for DMVCH. 
+ DO 10 I = 1, NMAX + Z( I ) = ZERO + 10 CONTINUE +* + DO 110 IN = 1, NIDIM + N = IDIM( IN ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IK = 1, NK + IF( BANDED )THEN + K = KB( IK ) + ELSE + K = N - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = K + 1 + ELSE + LDA = N + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF + NULL = N.LE.0 +* + DO 90 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* + DO 80 ICT = 1, 3 + TRANS = ICHT( ICT: ICT ) +* + DO 70 ICD = 1, 2 + DIAG = ICHD( ICD: ICD ) +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL DMAKE( SNAME( 2: 3 ), UPLO, DIAG, N, N, A, + $ NMAX, AA, LDA, K, K, RESET, TRANSL ) +* + DO 60 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL DMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, + $ ABS( INCX ), 0, N - 1, RESET, + $ TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + DIAGS = DIAG + NS = N + KS = K + DO 20 I = 1, LAA + AS( I ) = AA( I ) + 20 CONTINUE + LDAS = LDA + DO 30 I = 1, LX + XS( I ) = XX( I ) + 30 CONTINUE + INCXS = INCX +* +* Call the subroutine. +* + IF( SNAME( 4: 5 ).EQ.'MV' )THEN + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ UPLO, TRANS, DIAG, N, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL DTRMV( UPLO, TRANS, DIAG, N, AA, LDA, + $ XX, INCX ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ UPLO, TRANS, DIAG, N, K, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL DTBMV( UPLO, TRANS, DIAG, N, K, AA, + $ LDA, XX, INCX ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ UPLO, TRANS, DIAG, N, INCX + IF( REWI ) + $ REWIND NTRA + CALL DTPMV( UPLO, TRANS, DIAG, N, AA, XX, + $ INCX ) + END IF + ELSE IF( SNAME( 4: 5 ).EQ.'SV' )THEN + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ UPLO, TRANS, DIAG, N, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL DTRSV( UPLO, TRANS, DIAG, N, AA, LDA, + $ XX, INCX ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ UPLO, TRANS, DIAG, N, K, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL DTBSV( UPLO, TRANS, DIAG, N, K, AA, + $ LDA, XX, INCX ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ UPLO, TRANS, DIAG, N, INCX + IF( REWI ) + $ REWIND NTRA + CALL DTPSV( UPLO, TRANS, DIAG, N, AA, XX, + $ INCX ) + END IF + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. 
+* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = TRANS.EQ.TRANSS + ISAME( 3 ) = DIAG.EQ.DIAGS + ISAME( 4 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 5 ) = LDE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 7 ) = LDE( XS, XX, LX ) + ELSE + ISAME( 7 ) = LDERES( 'GE', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 8 ) = INCXS.EQ.INCX + ELSE IF( BANDED )THEN + ISAME( 5 ) = KS.EQ.K + ISAME( 6 ) = LDE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 8 ) = LDE( XS, XX, LX ) + ELSE + ISAME( 8 ) = LDERES( 'GE', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 9 ) = INCXS.EQ.INCX + ELSE IF( PACKED )THEN + ISAME( 5 ) = LDE( AS, AA, LAA ) + IF( NULL )THEN + ISAME( 6 ) = LDE( XS, XX, LX ) + ELSE + ISAME( 6 ) = LDERES( 'GE', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 7 ) = INCXS.EQ.INCX + END IF +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN + IF( SNAME( 4: 5 ).EQ.'MV' )THEN +* +* Check the result. +* + CALL DMVCH( TRANS, N, N, ONE, A, NMAX, X, + $ INCX, ZERO, Z, INCX, XT, G, + $ XX, EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + ELSE IF( SNAME( 4: 5 ).EQ.'SV' )THEN +* +* Compute approximation to original vector. +* + DO 50 I = 1, N + Z( I ) = XX( 1 + ( I - 1 )* + $ ABS( INCX ) ) + XX( 1 + ( I - 1 )*ABS( INCX ) ) + $ = X( I ) + 50 CONTINUE + CALL DMVCH( TRANS, N, N, ONE, A, NMAX, Z, + $ INCX, ZERO, X, INCX, XT, G, + $ XX, EPS, ERR, FATAL, NOUT, + $ .FALSE. ) + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 120 + ELSE +* Avoid repeating tests with N.le.0. + GO TO 110 + END IF +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, TRANS, DIAG, N, LDA, + $ INCX + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, DIAG, N, K, + $ LDA, INCX + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, UPLO, TRANS, DIAG, N, INCX + END IF +* + 130 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A6, '(', 3( '''', A1, ''',' ), I3, ', AP, ', + $ 'X,', I2, ') .' ) + 9994 FORMAT( 1X, I6, ': ', A6, '(', 3( '''', A1, ''',' ), 2( I3, ',' ), + $ ' A,', I3, ', X,', I2, ') .' ) + 9993 FORMAT( 1X, I6, ': ', A6, '(', 3( '''', A1, ''',' ), I3, ', A,', + $ I3, ', X,', I2, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK3. +* + END + SUBROUTINE DCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z ) +* +* Tests DGER. 
+* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + DOUBLE PRECISION ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), + $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), + $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + DOUBLE PRECISION ALPHA, ALS, ERR, ERRMAX, TRANSL + INTEGER I, IA, IM, IN, INCX, INCXS, INCY, INCYS, IX, + $ IY, J, LAA, LDA, LDAS, LX, LY, M, MS, N, NARGS, + $ NC, ND, NS + LOGICAL NULL, RESET, SAME +* .. Local Arrays .. + DOUBLE PRECISION W( 1 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. + EXTERNAL DGER, DMAKE, DMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Executable Statements .. +* Define the number of arguments. + NARGS = 9 +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 120 IN = 1, NIDIM + N = IDIM( IN ) + ND = N/2 + 1 +* + DO 110 IM = 1, 2 + IF( IM.EQ.1 ) + $ M = MAX( N - ND, 0 ) + IF( IM.EQ.2 ) + $ M = MIN( N + ND, NMAX ) +* +* Set LDA to 1 more than minimum value if room. + LDA = M + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 110 + LAA = LDA*N + NULL = N.LE.0.OR.M.LE.0 +* + DO 100 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*M +* +* Generate the vector X. +* + TRANSL = HALF + CALL DMAKE( 'GE', ' ', ' ', 1, M, X, 1, XX, ABS( INCX ), + $ 0, M - 1, RESET, TRANSL ) + IF( M.GT.1 )THEN + X( M/2 ) = ZERO + XX( 1 + ABS( INCX )*( M/2 - 1 ) ) = ZERO + END IF +* + DO 90 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL DMAKE( 'GE', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + Y( N/2 ) = ZERO + YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 80 IA = 1, NALF + ALPHA = ALF( IA ) +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL DMAKE( SNAME( 2: 3 ), ' ', ' ', M, N, A, NMAX, + $ AA, LDA, M - 1, N - 1, RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + MS = M + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, M, N, + $ ALPHA, INCX, INCY, LDA + IF( REWI ) + $ REWIND NTRA + CALL DGER( M, N, ALPHA, XX, INCX, YY, INCY, AA, + $ LDA ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 140 + END IF +* +* See what data changed inside subroutine. 
+* + ISAME( 1 ) = MS.EQ.M + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LDE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + ISAME( 6 ) = LDE( YS, YY, LY ) + ISAME( 7 ) = INCYS.EQ.INCY + IF( NULL )THEN + ISAME( 8 ) = LDE( AS, AA, LAA ) + ELSE + ISAME( 8 ) = LDERES( 'GE', ' ', M, N, AS, AA, + $ LDA ) + END IF + ISAME( 9 ) = LDAS.EQ.LDA +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 140 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( INCX.GT.0 )THEN + DO 50 I = 1, M + Z( I ) = X( I ) + 50 CONTINUE + ELSE + DO 60 I = 1, M + Z( I ) = X( M - I + 1 ) + 60 CONTINUE + END IF + DO 70 J = 1, N + IF( INCY.GT.0 )THEN + W( 1 ) = Y( J ) + ELSE + W( 1 ) = Y( N - J + 1 ) + END IF + CALL DMVCH( 'N', M, 1, ALPHA, Z, NMAX, W, 1, + $ ONE, A( 1, J ), 1, YT, G, + $ AA( 1 + ( J - 1 )*LDA ), EPS, + $ ERR, FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 130 + 70 CONTINUE + ELSE +* Avoid repeating tests with M.le.0 or N.le.0. + GO TO 110 + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 150 +* + 130 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 140 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9994 )NC, SNAME, M, N, ALPHA, INCX, INCY, LDA +* + 150 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A6, '(', 2( I3, ',' ), F4.1, ', X,', I2, + $ ', Y,', I2, ', A,', I3, ') .' ) + 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK4. +* + END + SUBROUTINE DCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z ) +* +* Tests DSYR and DSPR. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + DOUBLE PRECISION ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), + $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), + $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. 
+ DOUBLE PRECISION ALPHA, ALS, ERR, ERRMAX, TRANSL + INTEGER I, IA, IC, IN, INCX, INCXS, IX, J, JA, JJ, LAA, + $ LDA, LDAS, LJ, LX, N, NARGS, NC, NS + LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER + CHARACTER*1 UPLO, UPLOS + CHARACTER*2 ICH +* .. Local Arrays .. + DOUBLE PRECISION W( 1 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. + EXTERNAL DMAKE, DMVCH, DSPR, DSYR +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 3: 3 ).EQ.'Y' + PACKED = SNAME( 3: 3 ).EQ.'P' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 7 + ELSE IF( PACKED )THEN + NARGS = 6 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDA to 1 more than minimum value if room. + LDA = N + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF +* + DO 90 IC = 1, 2 + UPLO = ICH( IC: IC ) + UPPER = UPLO.EQ.'U' +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL DMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), + $ 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 70 IA = 1, NALF + ALPHA = ALF( IA ) + NULL = N.LE.0.OR.ALPHA.EQ.ZERO +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL DMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, A, NMAX, + $ AA, LDA, N - 1, N - 1, RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO, N, + $ ALPHA, INCX, LDA + IF( REWI ) + $ REWIND NTRA + CALL DSYR( UPLO, N, ALPHA, XX, INCX, AA, LDA ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, N, + $ ALPHA, INCX + IF( REWI ) + $ REWIND NTRA + CALL DSPR( UPLO, N, ALPHA, XX, INCX, AA ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LDE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + IF( NULL )THEN + ISAME( 6 ) = LDE( AS, AA, LAA ) + ELSE + ISAME( 6 ) = LDERES( SNAME( 2: 3 ), UPLO, N, N, AS, + $ AA, LDA ) + END IF + IF( .NOT.PACKED )THEN + ISAME( 7 ) = LDAS.EQ.LDA + END IF +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 30 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 30 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. 
+* + IF( INCX.GT.0 )THEN + DO 40 I = 1, N + Z( I ) = X( I ) + 40 CONTINUE + ELSE + DO 50 I = 1, N + Z( I ) = X( N - I + 1 ) + 50 CONTINUE + END IF + JA = 1 + DO 60 J = 1, N + W( 1 ) = Z( J ) + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + CALL DMVCH( 'N', LJ, 1, ALPHA, Z( JJ ), LJ, W, + $ 1, ONE, A( JJ, J ), 1, YT, G, + $ AA( JA ), EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + IF( FULL )THEN + IF( UPPER )THEN + JA = JA + LDA + ELSE + JA = JA + LDA + 1 + END IF + ELSE + JA = JA + LJ + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 110 + 60 CONTINUE + ELSE +* Avoid repeating tests if N.le.0. + IF( N.LE.0 ) + $ GO TO 100 + END IF +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 110 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, N, ALPHA, INCX, LDA + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, N, ALPHA, INCX + END IF +* + 130 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', X,', + $ I2, ', AP) .' ) + 9993 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', X,', + $ I2, ', A,', I3, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK5. +* + END + SUBROUTINE DCHK6( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z ) +* +* Tests DSYR2 and DSPR2. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + DOUBLE PRECISION ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), + $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), + $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX, 2 ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + DOUBLE PRECISION ALPHA, ALS, ERR, ERRMAX, TRANSL + INTEGER I, IA, IC, IN, INCX, INCXS, INCY, INCYS, IX, + $ IY, J, JA, JJ, LAA, LDA, LDAS, LJ, LX, LY, N, + $ NARGS, NC, NS + LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER + CHARACTER*1 UPLO, UPLOS + CHARACTER*2 ICH +* .. Local Arrays .. + DOUBLE PRECISION W( 2 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. + EXTERNAL DMAKE, DMVCH, DSPR2, DSYR2 +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. 
Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 3: 3 ).EQ.'Y' + PACKED = SNAME( 3: 3 ).EQ.'P' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 9 + ELSE IF( PACKED )THEN + NARGS = 8 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 140 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDA to 1 more than minimum value if room. + LDA = N + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 140 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF +* + DO 130 IC = 1, 2 + UPLO = ICH( IC: IC ) + UPPER = UPLO.EQ.'U' +* + DO 120 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL DMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), + $ 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 110 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL DMAKE( 'GE', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + Y( N/2 ) = ZERO + YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 100 IA = 1, NALF + ALPHA = ALF( IA ) + NULL = N.LE.0.OR.ALPHA.EQ.ZERO +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL DMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, A, + $ NMAX, AA, LDA, N - 1, N - 1, RESET, + $ TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO, N, + $ ALPHA, INCX, INCY, LDA + IF( REWI ) + $ REWIND NTRA + CALL DSYR2( UPLO, N, ALPHA, XX, INCX, YY, INCY, + $ AA, LDA ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, N, + $ ALPHA, INCX, INCY + IF( REWI ) + $ REWIND NTRA + CALL DSPR2( UPLO, N, ALPHA, XX, INCX, YY, INCY, + $ AA ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 160 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LDE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + ISAME( 6 ) = LDE( YS, YY, LY ) + ISAME( 7 ) = INCYS.EQ.INCY + IF( NULL )THEN + ISAME( 8 ) = LDE( AS, AA, LAA ) + ELSE + ISAME( 8 ) = LDERES( SNAME( 2: 3 ), UPLO, N, N, + $ AS, AA, LDA ) + END IF + IF( .NOT.PACKED )THEN + ISAME( 9 ) = LDAS.EQ.LDA + END IF +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 160 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. 
+* + IF( INCX.GT.0 )THEN + DO 50 I = 1, N + Z( I, 1 ) = X( I ) + 50 CONTINUE + ELSE + DO 60 I = 1, N + Z( I, 1 ) = X( N - I + 1 ) + 60 CONTINUE + END IF + IF( INCY.GT.0 )THEN + DO 70 I = 1, N + Z( I, 2 ) = Y( I ) + 70 CONTINUE + ELSE + DO 80 I = 1, N + Z( I, 2 ) = Y( N - I + 1 ) + 80 CONTINUE + END IF + JA = 1 + DO 90 J = 1, N + W( 1 ) = Z( J, 2 ) + W( 2 ) = Z( J, 1 ) + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + CALL DMVCH( 'N', LJ, 2, ALPHA, Z( JJ, 1 ), + $ NMAX, W, 1, ONE, A( JJ, J ), 1, + $ YT, G, AA( JA ), EPS, ERR, FATAL, + $ NOUT, .TRUE. ) + IF( FULL )THEN + IF( UPPER )THEN + JA = JA + LDA + ELSE + JA = JA + LDA + 1 + END IF + ELSE + JA = JA + LJ + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 150 + 90 CONTINUE + ELSE +* Avoid repeating tests with N.le.0. + IF( N.LE.0 ) + $ GO TO 140 + END IF +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* + 140 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 170 +* + 150 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 160 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, N, ALPHA, INCX, + $ INCY, LDA + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, N, ALPHA, INCX, INCY + END IF +* + 170 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', X,', + $ I2, ', Y,', I2, ', AP) .' ) + 9993 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', X,', + $ I2, ', Y,', I2, ', A,', I3, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK6. +* + END + SUBROUTINE DCHKE( ISNUM, SRNAMT, NOUT ) +* +* Tests the error exits from the Level 2 Blas. +* Requires a special version of the error-handling routine XERBLA. +* ALPHA, BETA, A, X and Y should not need to be defined. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER ISNUM, NOUT + CHARACTER*6 SRNAMT +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Local Scalars .. + DOUBLE PRECISION ALPHA, BETA +* .. Local Arrays .. + DOUBLE PRECISION A( 1, 1 ), X( 1 ), Y( 1 ) +* .. External Subroutines .. + EXTERNAL CHKXER, DGBMV, DGEMV, DGER, DSBMV, DSPMV, DSPR, + $ DSPR2, DSYMV, DSYR, DSYR2, DTBMV, DTBSV, DTPMV, + $ DTPSV, DTRMV, DTRSV +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Executable Statements .. +* OK is set to .FALSE. by the special version of XERBLA or by CHKXER +* if anything is wrong. + OK = .TRUE. +* LERR is set to .TRUE. by the special version of XERBLA each time +* it is called, and is then tested and re-set by CHKXER. + LERR = .FALSE. 
+ GO TO ( 10, 20, 30, 40, 50, 60, 70, 80, + $ 90, 100, 110, 120, 130, 140, 150, + $ 160 )ISNUM + 10 INFOT = 1 + CALL DGEMV( '/', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DGEMV( 'N', -1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DGEMV( 'N', 0, -1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DGEMV( 'N', 2, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL DGEMV( 'N', 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL DGEMV( 'N', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 20 INFOT = 1 + CALL DGBMV( '/', 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DGBMV( 'N', -1, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DGBMV( 'N', 0, -1, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DGBMV( 'N', 0, 0, -1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DGBMV( 'N', 2, 0, 0, -1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL DGBMV( 'N', 0, 0, 1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL DGBMV( 'N', 0, 0, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL DGBMV( 'N', 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 30 INFOT = 1 + CALL DSYMV( '/', 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DSYMV( 'U', -1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DSYMV( 'U', 2, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL DSYMV( 'U', 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL DSYMV( 'U', 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 40 INFOT = 1 + CALL DSBMV( '/', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DSBMV( 'U', -1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DSBMV( 'U', 0, -1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DSBMV( 'U', 0, 1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL DSBMV( 'U', 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL DSBMV( 'U', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 50 INFOT = 1 + CALL DSPMV( '/', 0, ALPHA, A, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DSPMV( 'U', -1, ALPHA, A, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DSPMV( 'U', 0, ALPHA, A, X, 0, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DSPMV( 'U', 0, ALPHA, A, X, 1, BETA, Y, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 
170 + 60 INFOT = 1 + CALL DTRMV( '/', 'N', 'N', 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DTRMV( 'U', '/', 'N', 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DTRMV( 'U', 'N', '/', 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DTRMV( 'U', 'N', 'N', -1, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DTRMV( 'U', 'N', 'N', 2, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL DTRMV( 'U', 'N', 'N', 0, A, 1, X, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 70 INFOT = 1 + CALL DTBMV( '/', 'N', 'N', 0, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DTBMV( 'U', '/', 'N', 0, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DTBMV( 'U', 'N', '/', 0, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DTBMV( 'U', 'N', 'N', -1, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DTBMV( 'U', 'N', 'N', 0, -1, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL DTBMV( 'U', 'N', 'N', 0, 1, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DTBMV( 'U', 'N', 'N', 0, 0, A, 1, X, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 80 INFOT = 1 + CALL DTPMV( '/', 'N', 'N', 0, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DTPMV( 'U', '/', 'N', 0, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DTPMV( 'U', 'N', '/', 0, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DTPMV( 'U', 'N', 'N', -1, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL DTPMV( 'U', 'N', 'N', 0, A, X, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 90 INFOT = 1 + CALL DTRSV( '/', 'N', 'N', 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DTRSV( 'U', '/', 'N', 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DTRSV( 'U', 'N', '/', 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DTRSV( 'U', 'N', 'N', -1, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DTRSV( 'U', 'N', 'N', 2, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL DTRSV( 'U', 'N', 'N', 0, A, 1, X, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 100 INFOT = 1 + CALL DTBSV( '/', 'N', 'N', 0, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DTBSV( 'U', '/', 'N', 0, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DTBSV( 'U', 'N', '/', 0, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DTBSV( 'U', 'N', 'N', -1, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DTBSV( 'U', 'N', 'N', 0, -1, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL DTBSV( 'U', 'N', 'N', 0, 1, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DTBSV( 'U', 'N', 'N', 0, 0, A, 1, X, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 110 INFOT = 1 + CALL DTPSV( '/', 'N', 'N', 0, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DTPSV( 'U', '/', 'N', 0, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 
3 + CALL DTPSV( 'U', 'N', '/', 0, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DTPSV( 'U', 'N', 'N', -1, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL DTPSV( 'U', 'N', 'N', 0, A, X, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 120 INFOT = 1 + CALL DGER( -1, 0, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DGER( 0, -1, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DGER( 0, 0, ALPHA, X, 0, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL DGER( 0, 0, ALPHA, X, 1, Y, 0, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DGER( 2, 0, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 130 INFOT = 1 + CALL DSYR( '/', 0, ALPHA, X, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DSYR( 'U', -1, ALPHA, X, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DSYR( 'U', 0, ALPHA, X, 0, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL DSYR( 'U', 2, ALPHA, X, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 140 INFOT = 1 + CALL DSPR( '/', 0, ALPHA, X, 1, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DSPR( 'U', -1, ALPHA, X, 1, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DSPR( 'U', 0, ALPHA, X, 0, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 150 INFOT = 1 + CALL DSYR2( '/', 0, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DSYR2( 'U', -1, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DSYR2( 'U', 0, ALPHA, X, 0, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL DSYR2( 'U', 0, ALPHA, X, 1, Y, 0, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DSYR2( 'U', 2, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 160 INFOT = 1 + CALL DSPR2( '/', 0, ALPHA, X, 1, Y, 1, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DSPR2( 'U', -1, ALPHA, X, 1, Y, 1, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DSPR2( 'U', 0, ALPHA, X, 0, Y, 1, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL DSPR2( 'U', 0, ALPHA, X, 1, Y, 0, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) +* + 170 IF( OK )THEN + WRITE( NOUT, FMT = 9999 )SRNAMT + ELSE + WRITE( NOUT, FMT = 9998 )SRNAMT + END IF + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE TESTS OF ERROR-EXITS' ) + 9998 FORMAT( ' ******* ', A6, ' FAILED THE TESTS OF ERROR-EXITS *****', + $ '**' ) +* +* End of DCHKE. +* + END + SUBROUTINE DMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, KL, + $ KU, RESET, TRANSL ) +* +* Generates values for an M by N matrix A within the bandwidth +* defined by KL and KU. +* Stores the values in the array AA in the data structure required +* by the routine, with unwanted elements set to rogue value. +* +* TYPE is 'GE', 'GB', 'SY', 'SB', 'SP', 'TR', 'TB' OR 'TP'. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) + DOUBLE PRECISION ROGUE + PARAMETER ( ROGUE = -1.0D10 ) +* .. 
Scalar Arguments .. + DOUBLE PRECISION TRANSL + INTEGER KL, KU, LDA, M, N, NMAX + LOGICAL RESET + CHARACTER*1 DIAG, UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, * ), AA( * ) +* .. Local Scalars .. + INTEGER I, I1, I2, I3, IBEG, IEND, IOFF, J, KK + LOGICAL GEN, LOWER, SYM, TRI, UNIT, UPPER +* .. External Functions .. + DOUBLE PRECISION DBEG + EXTERNAL DBEG +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. Executable Statements .. + GEN = TYPE( 1: 1 ).EQ.'G' + SYM = TYPE( 1: 1 ).EQ.'S' + TRI = TYPE( 1: 1 ).EQ.'T' + UPPER = ( SYM.OR.TRI ).AND.UPLO.EQ.'U' + LOWER = ( SYM.OR.TRI ).AND.UPLO.EQ.'L' + UNIT = TRI.AND.DIAG.EQ.'U' +* +* Generate data in array A. +* + DO 20 J = 1, N + DO 10 I = 1, M + IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) + $ THEN + IF( ( I.LE.J.AND.J - I.LE.KU ).OR. + $ ( I.GE.J.AND.I - J.LE.KL ) )THEN + A( I, J ) = DBEG( RESET ) + TRANSL + ELSE + A( I, J ) = ZERO + END IF + IF( I.NE.J )THEN + IF( SYM )THEN + A( J, I ) = A( I, J ) + ELSE IF( TRI )THEN + A( J, I ) = ZERO + END IF + END IF + END IF + 10 CONTINUE + IF( TRI ) + $ A( J, J ) = A( J, J ) + ONE + IF( UNIT ) + $ A( J, J ) = ONE + 20 CONTINUE +* +* Store elements in array AS in data structure required by routine. +* + IF( TYPE.EQ.'GE' )THEN + DO 50 J = 1, N + DO 30 I = 1, M + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 30 CONTINUE + DO 40 I = M + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 40 CONTINUE + 50 CONTINUE + ELSE IF( TYPE.EQ.'GB' )THEN + DO 90 J = 1, N + DO 60 I1 = 1, KU + 1 - J + AA( I1 + ( J - 1 )*LDA ) = ROGUE + 60 CONTINUE + DO 70 I2 = I1, MIN( KL + KU + 1, KU + 1 + M - J ) + AA( I2 + ( J - 1 )*LDA ) = A( I2 + J - KU - 1, J ) + 70 CONTINUE + DO 80 I3 = I2, LDA + AA( I3 + ( J - 1 )*LDA ) = ROGUE + 80 CONTINUE + 90 CONTINUE + ELSE IF( TYPE.EQ.'SY'.OR.TYPE.EQ.'TR' )THEN + DO 130 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IF( UNIT )THEN + IEND = J - 1 + ELSE + IEND = J + END IF + ELSE + IF( UNIT )THEN + IBEG = J + 1 + ELSE + IBEG = J + END IF + IEND = N + END IF + DO 100 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 100 CONTINUE + DO 110 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 110 CONTINUE + DO 120 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 120 CONTINUE + 130 CONTINUE + ELSE IF( TYPE.EQ.'SB'.OR.TYPE.EQ.'TB' )THEN + DO 170 J = 1, N + IF( UPPER )THEN + KK = KL + 1 + IBEG = MAX( 1, KL + 2 - J ) + IF( UNIT )THEN + IEND = KL + ELSE + IEND = KL + 1 + END IF + ELSE + KK = 1 + IF( UNIT )THEN + IBEG = 2 + ELSE + IBEG = 1 + END IF + IEND = MIN( KL + 1, 1 + M - J ) + END IF + DO 140 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 140 CONTINUE + DO 150 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I + J - KK, J ) + 150 CONTINUE + DO 160 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 160 CONTINUE + 170 CONTINUE + ELSE IF( TYPE.EQ.'SP'.OR.TYPE.EQ.'TP' )THEN + IOFF = 0 + DO 190 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 180 I = IBEG, IEND + IOFF = IOFF + 1 + AA( IOFF ) = A( I, J ) + IF( I.EQ.J )THEN + IF( UNIT ) + $ AA( IOFF ) = ROGUE + END IF + 180 CONTINUE + 190 CONTINUE + END IF + RETURN +* +* End of DMAKE. +* + END + SUBROUTINE DMVCH( TRANS, M, N, ALPHA, A, NMAX, X, INCX, BETA, Y, + $ INCY, YT, G, YY, EPS, ERR, FATAL, NOUT, MV ) +* +* Checks the results of the computational tests. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. 
+ DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION ALPHA, BETA, EPS, ERR + INTEGER INCX, INCY, M, N, NMAX, NOUT + LOGICAL FATAL, MV + CHARACTER*1 TRANS +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, * ), G( * ), X( * ), Y( * ), YT( * ), + $ YY( * ) +* .. Local Scalars .. + DOUBLE PRECISION ERRI + INTEGER I, INCXL, INCYL, IY, J, JX, KX, KY, ML, NL + LOGICAL TRAN +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, SQRT +* .. Executable Statements .. + TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' + IF( TRAN )THEN + ML = N + NL = M + ELSE + ML = M + NL = N + END IF + IF( INCX.LT.0 )THEN + KX = NL + INCXL = -1 + ELSE + KX = 1 + INCXL = 1 + END IF + IF( INCY.LT.0 )THEN + KY = ML + INCYL = -1 + ELSE + KY = 1 + INCYL = 1 + END IF +* +* Compute expected result in YT using data in A, X and Y. +* Compute gauges in G. +* + IY = KY + DO 30 I = 1, ML + YT( IY ) = ZERO + G( IY ) = ZERO + JX = KX + IF( TRAN )THEN + DO 10 J = 1, NL + YT( IY ) = YT( IY ) + A( J, I )*X( JX ) + G( IY ) = G( IY ) + ABS( A( J, I )*X( JX ) ) + JX = JX + INCXL + 10 CONTINUE + ELSE + DO 20 J = 1, NL + YT( IY ) = YT( IY ) + A( I, J )*X( JX ) + G( IY ) = G( IY ) + ABS( A( I, J )*X( JX ) ) + JX = JX + INCXL + 20 CONTINUE + END IF + YT( IY ) = ALPHA*YT( IY ) + BETA*Y( IY ) + G( IY ) = ABS( ALPHA )*G( IY ) + ABS( BETA*Y( IY ) ) + IY = IY + INCYL + 30 CONTINUE +* +* Compute the error ratio for this result. +* + ERR = ZERO + DO 40 I = 1, ML + ERRI = ABS( YT( I ) - YY( 1 + ( I - 1 )*ABS( INCY ) ) )/EPS + IF( G( I ).NE.ZERO ) + $ ERRI = ERRI/G( I ) + ERR = MAX( ERR, ERRI ) + IF( ERR*SQRT( EPS ).GE.ONE ) + $ GO TO 50 + 40 CONTINUE +* If the loop completes, all results are at least half accurate. + GO TO 70 +* +* Report fatal error. +* + 50 FATAL = .TRUE. + WRITE( NOUT, FMT = 9999 ) + DO 60 I = 1, ML + IF( MV )THEN + WRITE( NOUT, FMT = 9998 )I, YT( I ), + $ YY( 1 + ( I - 1 )*ABS( INCY ) ) + ELSE + WRITE( NOUT, FMT = 9998 )I, + $ YY( 1 + ( I - 1 )*ABS( INCY ) ), YT( I ) + END IF + 60 CONTINUE +* + 70 CONTINUE + RETURN +* + 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', + $ 'F ACCURATE *******', /' EXPECTED RESULT COMPU', + $ 'TED RESULT' ) + 9998 FORMAT( 1X, I7, 2G18.6 ) +* +* End of DMVCH. +* + END + LOGICAL FUNCTION LDE( RI, RJ, LR ) +* +* Tests if two arrays are identical. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER LR +* .. Array Arguments .. + DOUBLE PRECISION RI( * ), RJ( * ) +* .. Local Scalars .. + INTEGER I +* .. Executable Statements .. + DO 10 I = 1, LR + IF( RI( I ).NE.RJ( I ) ) + $ GO TO 20 + 10 CONTINUE + LDE = .TRUE. + GO TO 30 + 20 CONTINUE + LDE = .FALSE. + 30 RETURN +* +* End of LDE. +* + END + LOGICAL FUNCTION LDERES( TYPE, UPLO, M, N, AA, AS, LDA ) +* +* Tests if selected elements in two arrays are equal. +* +* TYPE is 'GE', 'SY' or 'SP'. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER LDA, M, N + CHARACTER*1 UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + DOUBLE PRECISION AA( LDA, * ), AS( LDA, * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J + LOGICAL UPPER +* .. Executable Statements .. 
+ UPPER = UPLO.EQ.'U' + IF( TYPE.EQ.'GE' )THEN + DO 20 J = 1, N + DO 10 I = M + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 10 CONTINUE + 20 CONTINUE + ELSE IF( TYPE.EQ.'SY' )THEN + DO 50 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 30 I = 1, IBEG - 1 + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 30 CONTINUE + DO 40 I = IEND + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 40 CONTINUE + 50 CONTINUE + END IF +* + 60 CONTINUE + LDERES = .TRUE. + GO TO 80 + 70 CONTINUE + LDERES = .FALSE. + 80 RETURN +* +* End of LDERES. +* + END + DOUBLE PRECISION FUNCTION DBEG( RESET ) +* +* Generates random numbers uniformly distributed between -0.5 and 0.5. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + LOGICAL RESET +* .. Local Scalars .. + INTEGER I, IC, MI +* .. Save statement .. + SAVE I, IC, MI +* .. Intrinsic Functions .. + INTRINSIC DBLE +* .. Executable Statements .. + IF( RESET )THEN +* Initialize local variables. + MI = 891 + I = 7 + IC = 0 + RESET = .FALSE. + END IF +* +* The sequence of values of I is bounded between 1 and 999. +* If initial I = 1,2,3,6,7 or 9, the period will be 50. +* If initial I = 4 or 8, the period will be 25. +* If initial I = 5, the period will be 10. +* IC is used to break up the period by skipping 1 value of I in 6. +* + IC = IC + 1 + 10 I = I*MI + I = I - 1000*( I/1000 ) + IF( IC.GE.5 )THEN + IC = 0 + GO TO 10 + END IF + DBEG = DBLE( I - 500 )/1001.0D0 + RETURN +* +* End of DBEG. +* + END + DOUBLE PRECISION FUNCTION DDIFF( X, Y ) +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* +* .. Scalar Arguments .. + DOUBLE PRECISION X, Y +* .. Executable Statements .. + DDIFF = X - Y + RETURN +* +* End of DDIFF. +* + END + SUBROUTINE CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) +* +* Tests whether XERBLA has detected an error when it should. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER INFOT, NOUT + LOGICAL LERR, OK + CHARACTER*6 SRNAMT +* .. Executable Statements .. + IF( .NOT.LERR )THEN + WRITE( NOUT, FMT = 9999 )INFOT, SRNAMT + OK = .FALSE. + END IF + LERR = .FALSE. + RETURN +* + 9999 FORMAT( ' ***** ILLEGAL VALUE OF PARAMETER NUMBER ', I2, ' NOT D', + $ 'ETECTED BY ', A6, ' *****' ) +* +* End of CHKXER. +* + END + SUBROUTINE XERBLA( SRNAME, INFO ) +* +* This is a special version of XERBLA to be used only as part of +* the test program for testing error exits from the Level 2 BLAS +* routines. +* +* XERBLA is an error handler for the Level 2 BLAS routines. +* +* It is called by the Level 2 BLAS routines if an input parameter is +* invalid. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER INFO + CHARACTER*6 SRNAME +* .. Scalars in Common .. + INTEGER INFOT, NOUT + LOGICAL LERR, OK + CHARACTER*6 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUT, OK, LERR + COMMON /SRNAMC/SRNAMT +* .. Executable Statements .. + LERR = .TRUE. 
+ IF( INFO.NE.INFOT )THEN + IF( INFOT.NE.0 )THEN + WRITE( NOUT, FMT = 9999 )INFO, INFOT + ELSE + WRITE( NOUT, FMT = 9997 )INFO + END IF + OK = .FALSE. + END IF + IF( SRNAME.NE.SRNAMT )THEN + WRITE( NOUT, FMT = 9998 )SRNAME, SRNAMT + OK = .FALSE. + END IF + RETURN +* + 9999 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, ' INSTEAD', + $ ' OF ', I2, ' *******' ) + 9998 FORMAT( ' ******* XERBLA WAS CALLED WITH SRNAME = ', A6, ' INSTE', + $ 'AD OF ', A6, ' *******' ) + 9997 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, + $ ' *******' ) +* +* End of XERBLA +* + END + diff --git a/test/dblat3.dat b/test/dblat3.dat new file mode 100644 index 0000000000..78b6d189a7 --- /dev/null +++ b/test/dblat3.dat @@ -0,0 +1,20 @@ +'DBLAT3.SUMM' NAME OF SUMMARY OUTPUT FILE +6 UNIT NUMBER OF SUMMARY FILE +'DBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO TEST ERROR EXITS. +16.0 THRESHOLD VALUE OF TEST RATIO +6 NUMBER OF VALUES OF N +0 1 2 3 7 31 63 VALUES OF N +3 NUMBER OF VALUES OF ALPHA +0.0 1.0 0.7 VALUES OF ALPHA +3 NUMBER OF VALUES OF BETA +0.0 1.0 1.3 VALUES OF BETA +DGEMM T PUT F FOR NO TEST. SAME COLUMNS. +DSYMM T PUT F FOR NO TEST. SAME COLUMNS. +DTRMM T PUT F FOR NO TEST. SAME COLUMNS. +DTRSM T PUT F FOR NO TEST. SAME COLUMNS. +DSYRK T PUT F FOR NO TEST. SAME COLUMNS. +DSYR2K T PUT F FOR NO TEST. SAME COLUMNS. diff --git a/test/dblat3.f b/test/dblat3.f new file mode 100644 index 0000000000..082e03e5e2 --- /dev/null +++ b/test/dblat3.f @@ -0,0 +1,2823 @@ + PROGRAM DBLAT3 +* +* Test program for the DOUBLE PRECISION Level 3 Blas. +* +* The program must be driven by a short data file. The first 14 records +* of the file are read using list-directed input, the last 6 records +* are read using the format ( A6, L2 ). An annotated example of a data +* file can be obtained by deleting the first 3 characters from the +* following 20 lines: +* 'DBLAT3.SUMM' NAME OF SUMMARY OUTPUT FILE +* 6 UNIT NUMBER OF SUMMARY FILE +* 'DBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +* F LOGICAL FLAG, T TO STOP ON FAILURES. +* T LOGICAL FLAG, T TO TEST ERROR EXITS. +* 16.0 THRESHOLD VALUE OF TEST RATIO +* 6 NUMBER OF VALUES OF N +* 0 1 2 3 5 9 VALUES OF N +* 3 NUMBER OF VALUES OF ALPHA +* 0.0 1.0 0.7 VALUES OF ALPHA +* 3 NUMBER OF VALUES OF BETA +* 0.0 1.0 1.3 VALUES OF BETA +* DGEMM T PUT F FOR NO TEST. SAME COLUMNS. +* DSYMM T PUT F FOR NO TEST. SAME COLUMNS. +* DTRMM T PUT F FOR NO TEST. SAME COLUMNS. +* DTRSM T PUT F FOR NO TEST. SAME COLUMNS. +* DSYRK T PUT F FOR NO TEST. SAME COLUMNS. +* DSYR2K T PUT F FOR NO TEST. SAME COLUMNS. +* +* See: +* +* Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. +* A Set of Level 3 Basic Linear Algebra Subprograms. +* +* Technical Memorandum No.88 (Revision 1), Mathematics and +* Computer Science Division, Argonne National Laboratory, 9700 +* South Cass Avenue, Argonne, Illinois 60439, US. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. 
+ INTEGER NIN + PARAMETER ( NIN = 5 ) + INTEGER NSUBS + PARAMETER ( NSUBS = 6 ) + DOUBLE PRECISION ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0 ) + INTEGER NMAX + PARAMETER ( NMAX = 65 ) + INTEGER NIDMAX, NALMAX, NBEMAX + PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) +* .. Local Scalars .. + DOUBLE PRECISION EPS, ERR, THRESH + INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NOUT, NTRA + LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, + $ TSTERR + CHARACTER*1 TRANSA, TRANSB + CHARACTER*6 SNAMET + CHARACTER*32 SNAPS, SUMMRY +* .. Local Arrays .. + DOUBLE PRECISION AA( NMAX*NMAX ), AB( NMAX, 2*NMAX ), + $ ALF( NALMAX ), AS( NMAX*NMAX ), + $ BB( NMAX*NMAX ), BET( NBEMAX ), + $ BS( NMAX*NMAX ), C( NMAX, NMAX ), + $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), + $ G( NMAX ), W( 2*NMAX ) + INTEGER IDIM( NIDMAX ) + LOGICAL LTEST( NSUBS ) + CHARACTER*6 SNAMES( NSUBS ) +* .. External Functions .. + DOUBLE PRECISION DDIFF + LOGICAL LDE + EXTERNAL DDIFF, LDE +* .. External Subroutines .. + EXTERNAL DCHK1, DCHK2, DCHK3, DCHK4, DCHK5, DCHKE, DMMCH +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK + CHARACTER*6 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR + COMMON /SRNAMC/SRNAMT +* .. Data statements .. + DATA SNAMES/'DGEMM ', 'DSYMM ', 'DTRMM ', 'DTRSM ', + $ 'DSYRK ', 'DSYR2K'/ +* .. Executable Statements .. +* +* Read name and unit number for summary output file and open file. +* + READ( NIN, FMT = * )SUMMRY + READ( NIN, FMT = * )NOUT + OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + NOUTC = NOUT +* +* Read name and unit number for snapshot output file and open file. +* + READ( NIN, FMT = * )SNAPS + READ( NIN, FMT = * )NTRA + TRACE = NTRA.GE.0 + IF( TRACE )THEN + OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + END IF +* Read the flag that directs rewinding of the snapshot file. + READ( NIN, FMT = * )REWI + REWI = REWI.AND.TRACE +* Read the flag that directs stopping on any failure. + READ( NIN, FMT = * )SFATAL +* Read the flag that indicates whether error exits are to be tested. + READ( NIN, FMT = * )TSTERR +* Read the threshold value of the test ratio + READ( NIN, FMT = * )THRESH +* +* Read and check the parameter values for the tests. +* +* Values of N + READ( NIN, FMT = * )NIDIM + IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN + WRITE( NOUT, FMT = 9997 )'N', NIDMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) + DO 10 I = 1, NIDIM + IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN + WRITE( NOUT, FMT = 9996 )NMAX + GO TO 220 + END IF + 10 CONTINUE +* Values of ALPHA + READ( NIN, FMT = * )NALF + IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN + WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) +* Values of BETA + READ( NIN, FMT = * )NBET + IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN + WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) +* +* Report values of parameters. +* + WRITE( NOUT, FMT = 9995 ) + WRITE( NOUT, FMT = 9994 )( IDIM( I ), I = 1, NIDIM ) + WRITE( NOUT, FMT = 9993 )( ALF( I ), I = 1, NALF ) + WRITE( NOUT, FMT = 9992 )( BET( I ), I = 1, NBET ) + IF( .NOT.TSTERR )THEN + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9984 ) + END IF + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9999 )THRESH + WRITE( NOUT, FMT = * ) +* +* Read names of subroutines and flags which indicate +* whether they are to be tested. +* + DO 20 I = 1, NSUBS + LTEST( I ) = .FALSE. 
+ 20 CONTINUE + 30 READ( NIN, FMT = 9988, END = 60 )SNAMET, LTESTT + DO 40 I = 1, NSUBS + IF( SNAMET.EQ.SNAMES( I ) ) + $ GO TO 50 + 40 CONTINUE + WRITE( NOUT, FMT = 9990 )SNAMET + STOP + 50 LTEST( I ) = LTESTT + GO TO 30 +* + 60 CONTINUE + CLOSE ( NIN ) +* +* Compute EPS (the machine precision). +* + EPS = ONE + 70 CONTINUE + IF( DDIFF( ONE + EPS, ONE ).EQ.ZERO ) + $ GO TO 80 + EPS = HALF*EPS + GO TO 70 + 80 CONTINUE + EPS = EPS + EPS + WRITE( NOUT, FMT = 9998 )EPS +* +* Check the reliability of DMMCH using exact data. +* + N = MIN( 32, NMAX ) + DO 100 J = 1, N + DO 90 I = 1, N + AB( I, J ) = MAX( I - J + 1, 0 ) + 90 CONTINUE + AB( J, NMAX + 1 ) = J + AB( 1, NMAX + J ) = J + C( J, 1 ) = ZERO + 100 CONTINUE + DO 110 J = 1, N + CC( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 + 110 CONTINUE +* CC holds the exact result. On exit from DMMCH CT holds +* the result computed by DMMCH. + TRANSA = 'N' + TRANSB = 'N' + CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LDE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + TRANSB = 'T' + CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LDE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + DO 120 J = 1, N + AB( J, NMAX + 1 ) = N - J + 1 + AB( 1, NMAX + J ) = N - J + 1 + 120 CONTINUE + DO 130 J = 1, N + CC( N - J + 1 ) = J*( ( J + 1 )*J )/2 - + $ ( ( J + 1 )*J*( J - 1 ) )/3 + 130 CONTINUE + TRANSA = 'T' + TRANSB = 'N' + CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LDE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + TRANSB = 'T' + CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LDE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF +* +* Test each subroutine in turn. +* + DO 200 ISNUM = 1, NSUBS + WRITE( NOUT, FMT = * ) + IF( .NOT.LTEST( ISNUM ) )THEN +* Subprogram is not to be tested. + WRITE( NOUT, FMT = 9987 )SNAMES( ISNUM ) + ELSE + SRNAMT = SNAMES( ISNUM ) +* Test error exits. + IF( TSTERR )THEN + CALL DCHKE( ISNUM, SNAMES( ISNUM ), NOUT ) + WRITE( NOUT, FMT = * ) + END IF +* Test computations. + INFOT = 0 + OK = .TRUE. + FATAL = .FALSE. + GO TO ( 140, 150, 160, 160, 170, 180 )ISNUM +* Test DGEMM, 01. + 140 CALL DCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G ) + GO TO 190 +* Test DSYMM, 02. + 150 CALL DCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G ) + GO TO 190 +* Test DTRMM, 03, DTRSM, 04. + 160 CALL DCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, + $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C ) + GO TO 190 +* Test DSYRK, 05. 
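The loop labelled 70/80 above estimates the relative machine precision: EPS is halved until ONE + EPS is no longer distinguishable from ONE, then doubled back once (routing the test through the external function DDIFF also, presumably, keeps the compiler from resolving the comparison in higher precision). A minimal C sketch of the same technique, purely to show what the Fortran loop computes; in C one would normally just take DBL_EPSILON.

    #include <stdio.h>
    #include <float.h>

    /* Force the sum through memory so the test really happens in
     * double precision, mirroring the role of the external DDIFF. */
    static double add_through_memory(double x, double y)
    {
        volatile double s = x + y;
        return s;
    }

    int main(void)
    {
        double eps = 1.0;
        /* Halve until 1 + eps rounds back to exactly 1 ...          */
        while (add_through_memory(1.0, eps) - 1.0 != 0.0)
            eps *= 0.5;
        /* ... then step back one halving, as the Fortran loop does. */
        eps += eps;
        printf("estimated eps = %g, DBL_EPSILON = %g\n", eps, DBL_EPSILON);
        return 0;
    }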
+ 170 CALL DCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G ) + GO TO 190 +* Test DSYR2K, 06. + 180 CALL DCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W ) + GO TO 190 +* + 190 IF( FATAL.AND.SFATAL ) + $ GO TO 210 + END IF + 200 CONTINUE + WRITE( NOUT, FMT = 9986 ) + GO TO 230 +* + 210 CONTINUE + WRITE( NOUT, FMT = 9985 ) + GO TO 230 +* + 220 CONTINUE + WRITE( NOUT, FMT = 9991 ) +* + 230 CONTINUE + IF( TRACE ) + $ CLOSE ( NTRA ) + CLOSE ( NOUT ) + STOP +* + 9999 FORMAT( ' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', + $ 'S THAN', F8.2 ) + 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, D9.1 ) + 9997 FORMAT( ' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', + $ 'THAN ', I2 ) + 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) + 9995 FORMAT( ' TESTS OF THE DOUBLE PRECISION LEVEL 3 BLAS', //' THE F', + $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) + 9994 FORMAT( ' FOR N ', 9I6 ) + 9993 FORMAT( ' FOR ALPHA ', 7F6.1 ) + 9992 FORMAT( ' FOR BETA ', 7F6.1 ) + 9991 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', + $ /' ******* TESTS ABANDONED *******' ) + 9990 FORMAT( ' SUBPROGRAM NAME ', A6, ' NOT RECOGNIZED', /' ******* T', + $ 'ESTS ABANDONED *******' ) + 9989 FORMAT( ' ERROR IN DMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', + $ 'ATED WRONGLY.', /' DMMCH WAS CALLED WITH TRANSA = ', A1, + $ ' AND TRANSB = ', A1, /' AND RETURNED SAME = ', L1, ' AND ', + $ 'ERR = ', F12.3, '.', /' THIS MAY BE DUE TO FAULTS IN THE ', + $ 'ARITHMETIC OR THE COMPILER.', /' ******* TESTS ABANDONED ', + $ '*******' ) + 9988 FORMAT( A6, L2 ) + 9987 FORMAT( 1X, A6, ' WAS NOT TESTED' ) + 9986 FORMAT( /' END OF TESTS' ) + 9985 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) + 9984 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) +* +* End of DBLAT3. +* + END + SUBROUTINE DCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G ) +* +* Tests DGEMM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + DOUBLE PRECISION ZERO + PARAMETER ( ZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ), G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + DOUBLE PRECISION ALPHA, ALS, BETA, BLS, ERR, ERRMAX + INTEGER I, IA, IB, ICA, ICB, IK, IM, IN, K, KS, LAA, + $ LBB, LCC, LDA, LDAS, LDB, LDBS, LDC, LDCS, M, + $ MA, MB, MS, N, NA, NARGS, NB, NC, NS + LOGICAL NULL, RESET, SAME, TRANA, TRANB + CHARACTER*1 TRANAS, TRANBS, TRANSA, TRANSB + CHARACTER*3 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. + EXTERNAL DGEMM, DMAKE, DMMCH +* .. 
Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICH/'NTC'/ +* .. Executable Statements .. +* + NARGS = 13 + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 110 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = M + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 100 + LCC = LDC*N + NULL = N.LE.0.OR.M.LE.0 +* + DO 90 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 80 ICA = 1, 3 + TRANSA = ICH( ICA: ICA ) + TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' +* + IF( TRANA )THEN + MA = K + NA = M + ELSE + MA = M + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* +* Generate the matrix A. +* + CALL DMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 70 ICB = 1, 3 + TRANSB = ICH( ICB: ICB ) + TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' +* + IF( TRANB )THEN + MB = N + NB = K + ELSE + MB = K + NB = N + END IF +* Set LDB to 1 more than minimum value if room. + LDB = MB + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 70 + LBB = LDB*NB +* +* Generate the matrix B. +* + CALL DMAKE( 'GE', ' ', ' ', MB, NB, B, NMAX, BB, + $ LDB, RESET, ZERO ) +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL DMAKE( 'GE', ' ', ' ', M, N, C, NMAX, + $ CC, LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + TRANAS = TRANSA + TRANBS = TRANSB + MS = M + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BLS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ TRANSA, TRANSB, M, N, K, ALPHA, LDA, LDB, + $ BETA, LDC + IF( REWI ) + $ REWIND NTRA + CALL DGEMM( TRANSA, TRANSB, M, N, K, ALPHA, + $ AA, LDA, BB, LDB, BETA, CC, LDC ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = TRANSA.EQ.TRANAS + ISAME( 2 ) = TRANSB.EQ.TRANBS + ISAME( 3 ) = MS.EQ.M + ISAME( 4 ) = NS.EQ.N + ISAME( 5 ) = KS.EQ.K + ISAME( 6 ) = ALS.EQ.ALPHA + ISAME( 7 ) = LDE( AS, AA, LAA ) + ISAME( 8 ) = LDAS.EQ.LDA + ISAME( 9 ) = LDE( BS, BB, LBB ) + ISAME( 10 ) = LDBS.EQ.LDB + ISAME( 11 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 12 ) = LDE( CS, CC, LCC ) + ELSE + ISAME( 12 ) = LDERES( 'GE', ' ', M, N, CS, + $ CC, LDC ) + END IF + ISAME( 13 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report +* and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL DMMCH( TRANSA, TRANSB, M, N, K, + $ ALPHA, A, NMAX, B, NMAX, BETA, + $ C, NMAX, CT, G, CC, LDC, EPS, + $ ERR, FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. 
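DCHK1 applies the same discipline to every DGEMM call above: copy every scalar and array argument, make the call, confirm that no error exit was taken, confirm that read-only arguments are bit-for-bit unchanged (LDE) and that C changed only inside its leading M by N block (LDERES), and only then check the numbers with DMMCH. A small self-contained C sketch of that save/call/compare pattern, using a stand-in routine rather than DGEMM itself; all names here are illustrative.

    #include <stdio.h>
    #include <string.h>
    #include <stdlib.h>

    /* Stand-in for the routine under test (illustrative only):
     * y := alpha*x + y, reading x and updating y.                  */
    static void routine_under_test(int n, double alpha,
                                   const double *x, double *y)
    {
        for (int i = 0; i < n; i++)
            y[i] += alpha * x[i];
    }

    int main(void)
    {
        enum { N = 4 };
        double x[N] = { 1, 2, 3, 4 }, y[N] = { 0, 0, 0, 0 };
        double xs[N], ys[N];            /* saved copies, like AS/BS/CS */

        /* Save every input datum before the call, as DCHK1 does.     */
        memcpy(xs, x, sizeof x);
        memcpy(ys, y, sizeof y);

        routine_under_test(N, 2.0, x, y);

        /* Read-only arguments must be bit-for-bit unchanged.         */
        if (memcmp(xs, x, sizeof x) != 0) {
            fprintf(stderr, "input argument was changed incorrectly\n");
            return EXIT_FAILURE;
        }
        /* The output argument is then checked against a reference
         * (DBLAT3 uses DMMCH for this); here a direct recomputation. */
        for (int i = 0; i < N; i++)
            if (y[i] != ys[i] + 2.0 * x[i]) {
                fprintf(stderr, "wrong result at %d\n", i);
                return EXIT_FAILURE;
            }
        puts("arguments preserved and result correct");
        return 0;
    }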
+ IF( FATAL ) + $ GO TO 120 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9995 )NC, SNAME, TRANSA, TRANSB, M, N, K, + $ ALPHA, LDA, LDB, BETA, LDC +* + 130 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',''', A1, ''',', + $ 3( I3, ',' ), F4.1, ', A,', I3, ', B,', I3, ',', F4.1, ', ', + $ 'C,', I3, ').' ) + 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK1. +* + END + SUBROUTINE DCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G ) +* +* Tests DSYMM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + DOUBLE PRECISION ZERO + PARAMETER ( ZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ), G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + DOUBLE PRECISION ALPHA, ALS, BETA, BLS, ERR, ERRMAX + INTEGER I, IA, IB, ICS, ICU, IM, IN, LAA, LBB, LCC, + $ LDA, LDAS, LDB, LDBS, LDC, LDCS, M, MS, N, NA, + $ NARGS, NC, NS + LOGICAL LEFT, NULL, RESET, SAME + CHARACTER*1 SIDE, SIDES, UPLO, UPLOS + CHARACTER*2 ICHS, ICHU +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. + EXTERNAL DMAKE, DMMCH, DSYMM +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHS/'LR'/, ICHU/'UL'/ +* .. Executable Statements .. +* + NARGS = 12 + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 100 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 90 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = M + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 90 + LCC = LDC*N + NULL = N.LE.0.OR.M.LE.0 +* +* Set LDB to 1 more than minimum value if room. + LDB = M + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 90 + LBB = LDB*N +* +* Generate the matrix B. 
+* + CALL DMAKE( 'GE', ' ', ' ', M, N, B, NMAX, BB, LDB, RESET, + $ ZERO ) +* + DO 80 ICS = 1, 2 + SIDE = ICHS( ICS: ICS ) + LEFT = SIDE.EQ.'L' +* + IF( LEFT )THEN + NA = M + ELSE + NA = N + END IF +* Set LDA to 1 more than minimum value if room. + LDA = NA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* + DO 70 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* +* Generate the symmetric matrix A. +* + CALL DMAKE( 'SY', UPLO, ' ', NA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL DMAKE( 'GE', ' ', ' ', M, N, C, NMAX, CC, + $ LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + SIDES = SIDE + UPLOS = UPLO + MS = M + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BLS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, SIDE, + $ UPLO, M, N, ALPHA, LDA, LDB, BETA, LDC + IF( REWI ) + $ REWIND NTRA + CALL DSYMM( SIDE, UPLO, M, N, ALPHA, AA, LDA, + $ BB, LDB, BETA, CC, LDC ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 110 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = SIDES.EQ.SIDE + ISAME( 2 ) = UPLOS.EQ.UPLO + ISAME( 3 ) = MS.EQ.M + ISAME( 4 ) = NS.EQ.N + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LDE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = LDE( BS, BB, LBB ) + ISAME( 9 ) = LDBS.EQ.LDB + ISAME( 10 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 11 ) = LDE( CS, CC, LCC ) + ELSE + ISAME( 11 ) = LDERES( 'GE', ' ', M, N, CS, + $ CC, LDC ) + END IF + ISAME( 12 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 110 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + IF( LEFT )THEN + CALL DMMCH( 'N', 'N', M, N, M, ALPHA, A, + $ NMAX, B, NMAX, BETA, C, NMAX, + $ CT, G, CC, LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL DMMCH( 'N', 'N', M, N, N, ALPHA, B, + $ NMAX, A, NMAX, BETA, C, NMAX, + $ CT, G, CC, LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. 
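Because DMAKE fills both triangles of the square array A for type 'SY', DCHK2 can validate DSYMM against an ordinary dense product: DMMCH( 'N', 'N', ... ) with operands A, B for SIDE = 'L' and B, A for SIDE = 'R'. The reference product that check relies on is just the textbook triple loop; a minimal column-major C version is sketched below for orientation (illustrative, not the test code itself).

    #include <stdio.h>

    /* Reference dense product C := alpha*A*B + beta*C, column-major,
     * the kind of check DMMCH performs column by column.  For DSYMM
     * the "A" handed to the reference is the full symmetric matrix,
     * so no triangular logic is needed here.                         */
    static void ref_gemm(int m, int n, int k, double alpha,
                         const double *a, int lda,
                         const double *b, int ldb,
                         double beta, double *c, int ldc)
    {
        for (int j = 0; j < n; j++)
            for (int i = 0; i < m; i++) {
                double s = 0.0;
                for (int l = 0; l < k; l++)
                    s += a[i + l * lda] * b[l + j * ldb];
                c[i + j * ldc] = alpha * s + beta * c[i + j * ldc];
            }
    }

    int main(void)
    {
        /* 2x2 symmetric A times a 2x1 B, alpha = 1, beta = 0. */
        double a[4] = { 1, 2, 2, 3 };   /* columns: (1,2) and (2,3) */
        double b[2] = { 1, 1 };
        double c[2] = { 0, 0 };
        ref_gemm(2, 1, 2, 1.0, a, 2, b, 2, 0.0, c, 2);
        printf("c = (%g, %g)\n", c[0], c[1]);   /* expect (3, 5) */
        return 0;
    }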
+* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 120 +* + 110 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9995 )NC, SNAME, SIDE, UPLO, M, N, ALPHA, LDA, + $ LDB, BETA, LDC +* + 120 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ F4.1, ', A,', I3, ', B,', I3, ',', F4.1, ', C,', I3, ') ', + $ ' .' ) + 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK2. +* + END + SUBROUTINE DCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NMAX, A, AA, AS, + $ B, BB, BS, CT, G, C ) +* +* Tests DTRMM and DTRSM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NIDIM, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CT( NMAX ), G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + DOUBLE PRECISION ALPHA, ALS, ERR, ERRMAX + INTEGER I, IA, ICD, ICS, ICT, ICU, IM, IN, J, LAA, LBB, + $ LDA, LDAS, LDB, LDBS, M, MS, N, NA, NARGS, NC, + $ NS + LOGICAL LEFT, NULL, RESET, SAME + CHARACTER*1 DIAG, DIAGS, SIDE, SIDES, TRANAS, TRANSA, UPLO, + $ UPLOS + CHARACTER*2 ICHD, ICHS, ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. + EXTERNAL DMAKE, DMMCH, DTRMM, DTRSM +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/, ICHS/'LR'/ +* .. Executable Statements .. +* + NARGS = 11 + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* Set up zero matrix for DMMCH. + DO 20 J = 1, NMAX + DO 10 I = 1, NMAX + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE +* + DO 140 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 130 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDB to 1 more than minimum value if room. + LDB = M + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 130 + LBB = LDB*N + NULL = M.LE.0.OR.N.LE.0 +* + DO 120 ICS = 1, 2 + SIDE = ICHS( ICS: ICS ) + LEFT = SIDE.EQ.'L' + IF( LEFT )THEN + NA = M + ELSE + NA = N + END IF +* Set LDA to 1 more than minimum value if room. + LDA = NA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. 
+ IF( LDA.GT.NMAX ) + $ GO TO 130 + LAA = LDA*NA +* + DO 110 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* + DO 100 ICT = 1, 3 + TRANSA = ICHT( ICT: ICT ) +* + DO 90 ICD = 1, 2 + DIAG = ICHD( ICD: ICD ) +* + DO 80 IA = 1, NALF + ALPHA = ALF( IA ) +* +* Generate the matrix A. +* + CALL DMAKE( 'TR', UPLO, DIAG, NA, NA, A, + $ NMAX, AA, LDA, RESET, ZERO ) +* +* Generate the matrix B. +* + CALL DMAKE( 'GE', ' ', ' ', M, N, B, NMAX, + $ BB, LDB, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + SIDES = SIDE + UPLOS = UPLO + TRANAS = TRANSA + DIAGS = DIAG + MS = M + NS = N + ALS = ALPHA + DO 30 I = 1, LAA + AS( I ) = AA( I ) + 30 CONTINUE + LDAS = LDA + DO 40 I = 1, LBB + BS( I ) = BB( I ) + 40 CONTINUE + LDBS = LDB +* +* Call the subroutine. +* + IF( SNAME( 4: 5 ).EQ.'MM' )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, + $ LDA, LDB + IF( REWI ) + $ REWIND NTRA + CALL DTRMM( SIDE, UPLO, TRANSA, DIAG, M, + $ N, ALPHA, AA, LDA, BB, LDB ) + ELSE IF( SNAME( 4: 5 ).EQ.'SM' )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, + $ LDA, LDB + IF( REWI ) + $ REWIND NTRA + CALL DTRSM( SIDE, UPLO, TRANSA, DIAG, M, + $ N, ALPHA, AA, LDA, BB, LDB ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 150 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = SIDES.EQ.SIDE + ISAME( 2 ) = UPLOS.EQ.UPLO + ISAME( 3 ) = TRANAS.EQ.TRANSA + ISAME( 4 ) = DIAGS.EQ.DIAG + ISAME( 5 ) = MS.EQ.M + ISAME( 6 ) = NS.EQ.N + ISAME( 7 ) = ALS.EQ.ALPHA + ISAME( 8 ) = LDE( AS, AA, LAA ) + ISAME( 9 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 10 ) = LDE( BS, BB, LBB ) + ELSE + ISAME( 10 ) = LDERES( 'GE', ' ', M, N, BS, + $ BB, LDB ) + END IF + ISAME( 11 ) = LDBS.EQ.LDB +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 50 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 50 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 150 + END IF +* + IF( .NOT.NULL )THEN + IF( SNAME( 4: 5 ).EQ.'MM' )THEN +* +* Check the result. +* + IF( LEFT )THEN + CALL DMMCH( TRANSA, 'N', M, N, M, + $ ALPHA, A, NMAX, B, NMAX, + $ ZERO, C, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL DMMCH( 'N', TRANSA, M, N, N, + $ ALPHA, B, NMAX, A, NMAX, + $ ZERO, C, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + ELSE IF( SNAME( 4: 5 ).EQ.'SM' )THEN +* +* Compute approximation to original +* matrix. +* + DO 70 J = 1, N + DO 60 I = 1, M + C( I, J ) = BB( I + ( J - 1 )* + $ LDB ) + BB( I + ( J - 1 )*LDB ) = ALPHA* + $ B( I, J ) + 60 CONTINUE + 70 CONTINUE +* + IF( LEFT )THEN + CALL DMMCH( TRANSA, 'N', M, N, M, + $ ONE, A, NMAX, C, NMAX, + $ ZERO, B, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .FALSE. ) + ELSE + CALL DMMCH( 'N', TRANSA, M, N, N, + $ ONE, C, NMAX, A, NMAX, + $ ZERO, B, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .FALSE. ) + END IF + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 150 + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* + 140 CONTINUE +* +* Report result. 
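The 'SM' branch above never forms a reference solution for DTRSM: the computed X is copied into C, BB is overwritten with ALPHA*B, and DMMCH then multiplies X back by op( A ) and compares the product against ALPHA*B (the MV = .FALSE. argument only affects which column is labelled expected and which computed in the failure report). A tiny C illustration of the same verify-by-multiplying-back idea on a 2 by 2 lower-triangular solve; the matrix and values are made up.

    #include <stdio.h>
    #include <math.h>

    int main(void)
    {
        /* Solve L*x = b for a 2x2 lower-triangular L by forward
         * substitution, then verify it the way DCHK3 verifies DTRSM:
         * multiply the solution back and compare with the right-hand
         * side instead of comparing x against a known inverse.       */
        double l[2][2] = { { 2.0, 0.0 }, { 1.0, 4.0 } };
        double b[2]    = { 6.0, 11.0 };
        double x[2];

        x[0] = b[0] / l[0][0];
        x[1] = (b[1] - l[1][0] * x[0]) / l[1][1];

        /* Residual check: L*x should reproduce b to roundoff.        */
        double r0 = l[0][0] * x[0]                  - b[0];
        double r1 = l[1][0] * x[0] + l[1][1] * x[1] - b[1];
        printf("x = (%g, %g), residual = (%g, %g)\n", x[0], x[1], r0, r1);
        return fabs(r0) + fabs(r1) > 1e-12;
    }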
+* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 160 +* + 150 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9995 )NC, SNAME, SIDE, UPLO, TRANSA, DIAG, M, + $ N, ALPHA, LDA, LDB +* + 160 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A6, '(', 4( '''', A1, ''',' ), 2( I3, ',' ), + $ F4.1, ', A,', I3, ', B,', I3, ') .' ) + 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK3. +* + END + SUBROUTINE DCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G ) +* +* Tests DSYRK. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + DOUBLE PRECISION ZERO + PARAMETER ( ZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ), G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + DOUBLE PRECISION ALPHA, ALS, BETA, BETS, ERR, ERRMAX + INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, K, KS, + $ LAA, LCC, LDA, LDAS, LDC, LDCS, LJ, MA, N, NA, + $ NARGS, NC, NS + LOGICAL NULL, RESET, SAME, TRAN, UPPER + CHARACTER*1 TRANS, TRANSS, UPLO, UPLOS + CHARACTER*2 ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. + EXTERNAL DMAKE, DMMCH, DSYRK +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHT/'NTC'/, ICHU/'UL'/ +* .. Executable Statements .. +* + NARGS = 10 + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = N + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 100 + LCC = LDC*N + NULL = N.LE.0 +* + DO 90 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 80 ICT = 1, 3 + TRANS = ICHT( ICT: ICT ) + TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' + IF( TRAN )THEN + MA = K + NA = N + ELSE + MA = N + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* +* Generate the matrix A. 
+* + CALL DMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 70 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + UPPER = UPLO.EQ.'U' +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL DMAKE( 'SY', UPLO, ' ', N, N, C, NMAX, CC, + $ LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + BETS = BETA + DO 20 I = 1, LCC + CS( I ) = CC( I ) + 20 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, + $ TRANS, N, K, ALPHA, LDA, BETA, LDC + IF( REWI ) + $ REWIND NTRA + CALL DSYRK( UPLO, TRANS, N, K, ALPHA, AA, LDA, + $ BETA, CC, LDC ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLOS.EQ.UPLO + ISAME( 2 ) = TRANSS.EQ.TRANS + ISAME( 3 ) = NS.EQ.N + ISAME( 4 ) = KS.EQ.K + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LDE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = BETS.EQ.BETA + IF( NULL )THEN + ISAME( 9 ) = LDE( CS, CC, LCC ) + ELSE + ISAME( 9 ) = LDERES( 'SY', UPLO, N, N, CS, + $ CC, LDC ) + END IF + ISAME( 10 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 30 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 30 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + JC = 1 + DO 40 J = 1, N + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + IF( TRAN )THEN + CALL DMMCH( 'T', 'N', LJ, 1, K, ALPHA, + $ A( 1, JJ ), NMAX, + $ A( 1, J ), NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL DMMCH( 'N', 'T', LJ, 1, K, ALPHA, + $ A( JJ, 1 ), NMAX, + $ A( J, 1 ), NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + IF( UPPER )THEN + JC = JC + LDC + ELSE + JC = JC + LDC + 1 + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 110 + 40 CONTINUE + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 110 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9995 )J +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, N, K, ALPHA, + $ LDA, BETA, LDC +* + 130 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ') .' 
) + 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK4. +* + END + SUBROUTINE DCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ AB, AA, AS, BB, BS, C, CC, CS, CT, G, W ) +* +* Tests DSYR2K. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + DOUBLE PRECISION ZERO + PARAMETER ( ZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + DOUBLE PRECISION AA( NMAX*NMAX ), AB( 2*NMAX*NMAX ), + $ ALF( NALF ), AS( NMAX*NMAX ), BB( NMAX*NMAX ), + $ BET( NBET ), BS( NMAX*NMAX ), C( NMAX, NMAX ), + $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), + $ G( NMAX ), W( 2*NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + DOUBLE PRECISION ALPHA, ALS, BETA, BETS, ERR, ERRMAX + INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, JJAB, + $ K, KS, LAA, LBB, LCC, LDA, LDAS, LDB, LDBS, + $ LDC, LDCS, LJ, MA, N, NA, NARGS, NC, NS + LOGICAL NULL, RESET, SAME, TRAN, UPPER + CHARACTER*1 TRANS, TRANSS, UPLO, UPLOS + CHARACTER*2 ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LDE, LDERES + EXTERNAL LDE, LDERES +* .. External Subroutines .. + EXTERNAL DMAKE, DMMCH, DSYR2K +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHT/'NTC'/, ICHU/'UL'/ +* .. Executable Statements .. +* + NARGS = 12 + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 130 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = N + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 130 + LCC = LDC*N + NULL = N.LE.0 +* + DO 120 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 110 ICT = 1, 3 + TRANS = ICHT( ICT: ICT ) + TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' + IF( TRAN )THEN + MA = K + NA = N + ELSE + MA = N + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 110 + LAA = LDA*NA +* +* Generate the matrix A. +* + IF( TRAN )THEN + CALL DMAKE( 'GE', ' ', ' ', MA, NA, AB, 2*NMAX, AA, + $ LDA, RESET, ZERO ) + ELSE + CALL DMAKE( 'GE', ' ', ' ', MA, NA, AB, NMAX, AA, LDA, + $ RESET, ZERO ) + END IF +* +* Generate the matrix B. +* + LDB = LDA + LBB = LAA + IF( TRAN )THEN + CALL DMAKE( 'GE', ' ', ' ', MA, NA, AB( K + 1 ), + $ 2*NMAX, BB, LDB, RESET, ZERO ) + ELSE + CALL DMAKE( 'GE', ' ', ' ', MA, NA, AB( K*NMAX + 1 ), + $ NMAX, BB, LDB, RESET, ZERO ) + END IF +* + DO 100 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + UPPER = UPLO.EQ.'U' +* + DO 90 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 80 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL DMAKE( 'SY', UPLO, ' ', N, N, C, NMAX, CC, + $ LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. 
+* + UPLOS = UPLO + TRANSS = TRANS + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BETS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, + $ TRANS, N, K, ALPHA, LDA, LDB, BETA, LDC + IF( REWI ) + $ REWIND NTRA + CALL DSYR2K( UPLO, TRANS, N, K, ALPHA, AA, LDA, + $ BB, LDB, BETA, CC, LDC ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 150 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLOS.EQ.UPLO + ISAME( 2 ) = TRANSS.EQ.TRANS + ISAME( 3 ) = NS.EQ.N + ISAME( 4 ) = KS.EQ.K + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LDE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = LDE( BS, BB, LBB ) + ISAME( 9 ) = LDBS.EQ.LDB + ISAME( 10 ) = BETS.EQ.BETA + IF( NULL )THEN + ISAME( 11 ) = LDE( CS, CC, LCC ) + ELSE + ISAME( 11 ) = LDERES( 'SY', UPLO, N, N, CS, + $ CC, LDC ) + END IF + ISAME( 12 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 150 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + JJAB = 1 + JC = 1 + DO 70 J = 1, N + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + IF( TRAN )THEN + DO 50 I = 1, K + W( I ) = AB( ( J - 1 )*2*NMAX + K + + $ I ) + W( K + I ) = AB( ( J - 1 )*2*NMAX + + $ I ) + 50 CONTINUE + CALL DMMCH( 'T', 'N', LJ, 1, 2*K, + $ ALPHA, AB( JJAB ), 2*NMAX, + $ W, 2*NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + DO 60 I = 1, K + W( I ) = AB( ( K + I - 1 )*NMAX + + $ J ) + W( K + I ) = AB( ( I - 1 )*NMAX + + $ J ) + 60 CONTINUE + CALL DMMCH( 'N', 'N', LJ, 1, 2*K, + $ ALPHA, AB( JJ ), NMAX, W, + $ 2*NMAX, BETA, C( JJ, J ), + $ NMAX, CT, G, CC( JC ), LDC, + $ EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + END IF + IF( UPPER )THEN + JC = JC + LDC + ELSE + JC = JC + LDC + 1 + IF( TRAN ) + $ JJAB = JJAB + 2*NMAX + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 140 + 70 CONTINUE + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* +* Report result. 
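DCHK5 checks DSYR2K column by column over the referenced triangle, and folds the two rank-K updates into one product of inner dimension 2*K. For TRANS = 'N' this rests on the identity

    \[ \alpha A B^{T} + \alpha B A^{T} \;=\; \alpha\,[\,A \;\; B\,]\,[\,B \;\; A\,]^{T} , \]

so entry (I,J) of the update is alpha times the dot product of row I of [A B] with row J of [B A]. The workspace W built in loop 60 holds exactly that row J, with W(1:K) = B(J,1:K) and W(K+1:2K) = A(J,1:K), which is why a single DMMCH call with inner dimension 2*K covers each column of the stored triangle; the TRANS = 'T' branch (loop 50) is the analogue built from the stacked columns of AB.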
+* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 160 +* + 140 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9995 )J +* + 150 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, N, K, ALPHA, + $ LDA, LDB, BETA, LDC +* + 160 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ F4.1, ', A,', I3, ', B,', I3, ',', F4.1, ', C,', I3, ') ', + $ ' .' ) + 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of DCHK5. +* + END + SUBROUTINE DCHKE( ISNUM, SRNAMT, NOUT ) +* +* Tests the error exits from the Level 3 Blas. +* Requires a special version of the error-handling routine XERBLA. +* ALPHA, BETA, A, B and C should not need to be defined. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER ISNUM, NOUT + CHARACTER*6 SRNAMT +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Local Scalars .. + DOUBLE PRECISION ALPHA, BETA +* .. Local Arrays .. + DOUBLE PRECISION A( 2, 1 ), B( 2, 1 ), C( 2, 1 ) +* .. External Subroutines .. + EXTERNAL CHKXER, DGEMM, DSYMM, DSYR2K, DSYRK, DTRMM, + $ DTRSM +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Executable Statements .. +* OK is set to .FALSE. by the special version of XERBLA or by CHKXER +* if anything is wrong. + OK = .TRUE. +* LERR is set to .TRUE. by the special version of XERBLA each time +* it is called, and is then tested and re-set by CHKXER. + LERR = .FALSE. 
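DCHKE works because the test build links the modified XERBLA whose tail appears at the top of this hunk: instead of stopping, it records that it was called (LERR), compares the reported INFO with the expected INFOT and the reported name with SRNAMT, and clears OK on any mismatch; CHKXER then confirms the handler actually fired after each deliberately invalid call. That is also why the comment above can say ALPHA, BETA, A, B and C need not be defined: every call below is rejected before any element is referenced. A self-contained C sketch of the same record-and-check idea with a toy routine; the symbol binding of a real replacement XERBLA depends on the Fortran ABI, so everything here is a stand-in.

    #include <stdio.h>

    /* Recording error handler in the spirit of the test XERBLA:
     * it remembers what was reported instead of stopping the run.   */
    static int g_info = 0;    /* argument number reported, like INFO  */
    static int g_lerr = 0;    /* handler was called, like LERR        */
    static int g_ok   = 1;    /* everything consistent so far, like OK */

    static void record_error(const char *srname, int info)
    {
        (void)srname;
        g_lerr = 1;
        g_info = info;
    }

    /* Toy routine under test: rejects n < 0 through the handler.     */
    static void toy_routine(int n)
    {
        if (n < 0) { record_error("TOY   ", 1); return; }
        /* ... real work would go here ... */
    }

    /* CHKXER-style check: the handler must have fired and must have
     * reported the expected argument number.                         */
    static void chkxer(int expected_info)
    {
        if (!g_lerr || g_info != expected_info) {
            g_ok = 0;
            fprintf(stderr, "error exit not taken correctly\n");
        }
        g_lerr = 0;            /* reset for the next call */
    }

    int main(void)
    {
        toy_routine(-1);       /* deliberately invalid call            */
        chkxer(1);             /* argument 1 should have been flagged  */
        puts(g_ok ? "error-exit test passed" : "error-exit test FAILED");
        return !g_ok;
    }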
+ GO TO ( 10, 20, 30, 40, 50, 60 )ISNUM + 10 INFOT = 1 + CALL DGEMM( '/', 'N', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 1 + CALL DGEMM( '/', 'T', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DGEMM( 'N', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DGEMM( 'T', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DGEMM( 'N', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DGEMM( 'N', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DGEMM( 'T', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DGEMM( 'T', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DGEMM( 'N', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DGEMM( 'N', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DGEMM( 'T', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DGEMM( 'T', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DGEMM( 'N', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DGEMM( 'N', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DGEMM( 'T', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DGEMM( 'T', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL DGEMM( 'N', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL DGEMM( 'N', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL DGEMM( 'T', 'N', 0, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL DGEMM( 'T', 'T', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL DGEMM( 'N', 'N', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL DGEMM( 'T', 'N', 0, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL DGEMM( 'N', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL DGEMM( 'T', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL DGEMM( 'N', 'N', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL DGEMM( 'N', 'T', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL DGEMM( 'T', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL DGEMM( 'T', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 70 + 20 
INFOT = 1 + CALL DSYMM( '/', 'U', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DSYMM( 'L', '/', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DSYMM( 'L', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DSYMM( 'R', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DSYMM( 'L', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DSYMM( 'R', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DSYMM( 'L', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DSYMM( 'R', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DSYMM( 'L', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DSYMM( 'R', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL DSYMM( 'L', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL DSYMM( 'R', 'U', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL DSYMM( 'L', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL DSYMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL DSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL DSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL DSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL DSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 70 + 30 INFOT = 1 + CALL DTRMM( '/', 'U', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DTRMM( 'L', '/', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DTRMM( 'L', 'U', '/', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DTRMM( 'L', 'U', 'N', '/', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DTRMM( 'L', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DTRMM( 'L', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DTRMM( 'R', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DTRMM( 'R', 
'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DTRMM( 'L', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DTRMM( 'L', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DTRMM( 'R', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DTRMM( 'R', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DTRMM( 'L', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DTRMM( 'L', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DTRMM( 'R', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DTRMM( 'R', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DTRMM( 'L', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DTRMM( 'L', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DTRMM( 'R', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DTRMM( 'R', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DTRMM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DTRMM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DTRMM( 'R', 'U', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DTRMM( 'R', 'U', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DTRMM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DTRMM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DTRMM( 'R', 'L', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DTRMM( 'R', 'L', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL DTRMM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL DTRMM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL DTRMM( 'R', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL DTRMM( 'R', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL DTRMM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL DTRMM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL DTRMM( 'R', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL DTRMM( 'R', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 70 + 40 INFOT = 1 + CALL DTRSM( '/', 'U', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, 
INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DTRSM( 'L', '/', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DTRSM( 'L', 'U', '/', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DTRSM( 'L', 'U', 'N', '/', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DTRSM( 'L', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DTRSM( 'L', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DTRSM( 'R', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DTRSM( 'R', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DTRSM( 'L', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DTRSM( 'L', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DTRSM( 'R', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL DTRSM( 'R', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DTRSM( 'L', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DTRSM( 'L', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DTRSM( 'R', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DTRSM( 'R', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DTRSM( 'L', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DTRSM( 'L', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DTRSM( 'R', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL DTRSM( 'R', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DTRSM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DTRSM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DTRSM( 'R', 'U', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DTRSM( 'R', 'U', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DTRSM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DTRSM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DTRSM( 'R', 'L', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DTRSM( 'R', 'L', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL DTRSM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL DTRSM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL DTRSM( 'R', 'U', 'N', 'N', 2, 0, ALPHA, 
A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL DTRSM( 'R', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL DTRSM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL DTRSM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL DTRSM( 'R', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL DTRSM( 'R', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 70 + 50 INFOT = 1 + CALL DSYRK( '/', 'N', 0, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DSYRK( 'U', '/', 0, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DSYRK( 'U', 'N', -1, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DSYRK( 'U', 'T', -1, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DSYRK( 'L', 'N', -1, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DSYRK( 'L', 'T', -1, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DSYRK( 'U', 'N', 0, -1, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DSYRK( 'U', 'T', 0, -1, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DSYRK( 'L', 'N', 0, -1, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DSYRK( 'L', 'T', 0, -1, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL DSYRK( 'U', 'N', 2, 0, ALPHA, A, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL DSYRK( 'U', 'T', 0, 2, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL DSYRK( 'L', 'N', 2, 0, ALPHA, A, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL DSYRK( 'L', 'T', 0, 2, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL DSYRK( 'U', 'N', 2, 0, ALPHA, A, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL DSYRK( 'U', 'T', 2, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL DSYRK( 'L', 'N', 2, 0, ALPHA, A, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL DSYRK( 'L', 'T', 2, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 70 + 60 INFOT = 1 + CALL DSYR2K( '/', 'N', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL DSYR2K( 'U', '/', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DSYR2K( 'U', 'N', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DSYR2K( 'U', 'T', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DSYR2K( 'L', 'N', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL DSYR2K( 'L', 'T', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DSYR2K( 'U', 'N', 0, -1, 
ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DSYR2K( 'U', 'T', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DSYR2K( 'L', 'N', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL DSYR2K( 'L', 'T', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL DSYR2K( 'U', 'N', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL DSYR2K( 'U', 'T', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL DSYR2K( 'L', 'N', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL DSYR2K( 'L', 'T', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DSYR2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DSYR2K( 'U', 'T', 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DSYR2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL DSYR2K( 'L', 'T', 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL DSYR2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL DSYR2K( 'U', 'T', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL DSYR2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL DSYR2K( 'L', 'T', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) +* + 70 IF( OK )THEN + WRITE( NOUT, FMT = 9999 )SRNAMT + ELSE + WRITE( NOUT, FMT = 9998 )SRNAMT + END IF + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE TESTS OF ERROR-EXITS' ) + 9998 FORMAT( ' ******* ', A6, ' FAILED THE TESTS OF ERROR-EXITS *****', + $ '**' ) +* +* End of DCHKE. +* + END + SUBROUTINE DMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, RESET, + $ TRANSL ) +* +* Generates values for an M by N matrix A. +* Stores the values in the array AA in the data structure required +* by the routine, with unwanted elements set to rogue value. +* +* TYPE is 'GE', 'SY' or 'TR'. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) + DOUBLE PRECISION ROGUE + PARAMETER ( ROGUE = -1.0D10 ) +* .. Scalar Arguments .. + DOUBLE PRECISION TRANSL + INTEGER LDA, M, N, NMAX + LOGICAL RESET + CHARACTER*1 DIAG, UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + DOUBLE PRECISION A( NMAX, * ), AA( * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J + LOGICAL GEN, LOWER, SYM, TRI, UNIT, UPPER +* .. External Functions .. + DOUBLE PRECISION DBEG + EXTERNAL DBEG +* .. Executable Statements .. + GEN = TYPE.EQ.'GE' + SYM = TYPE.EQ.'SY' + TRI = TYPE.EQ.'TR' + UPPER = ( SYM.OR.TRI ).AND.UPLO.EQ.'U' + LOWER = ( SYM.OR.TRI ).AND.UPLO.EQ.'L' + UNIT = TRI.AND.DIAG.EQ.'U' +* +* Generate data in array A. 
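DMAKE's second half (below) copies the generated matrix into the storage layout the routine actually sees and sets every element outside the leading M by N block, or outside the referenced triangle, to ROGUE = -1.0D10. A stray read of such an element is likely to show up as a wildly wrong result, and a stray write is caught afterwards by LDE/LDERES, which insist the padding still holds the rogue value. A short C sketch of the padding-and-check idea; names and sizes are illustrative.

    #include <stdio.h>

    #define ROGUE (-1.0e10)   /* same idea as DMAKE's rogue value */

    int main(void)
    {
        enum { M = 2, LDA = 4, N = 3 };
        double aa[LDA * N];

        /* Fill the leading M x N block with data and pad the rest of
         * each column (rows M..LDA-1) with the rogue value.          */
        for (int j = 0; j < N; j++)
            for (int i = 0; i < LDA; i++)
                aa[i + j * LDA] = (i < M) ? (double)(i + j + 1) : ROGUE;

        /* ... the routine under test would be called on aa here ...  */

        /* Afterwards every padding element must still be the rogue
         * value, the check LDERES performs for the 'GE' case.        */
        int clean = 1;
        for (int j = 0; j < N; j++)
            for (int i = M; i < LDA; i++)
                if (aa[i + j * LDA] != ROGUE)
                    clean = 0;
        puts(clean ? "padding untouched"
                   : "routine wrote outside the matrix");
        return 0;
    }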
+* + DO 20 J = 1, N + DO 10 I = 1, M + IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) + $ THEN + A( I, J ) = DBEG( RESET ) + TRANSL + IF( I.NE.J )THEN +* Set some elements to zero + IF( N.GT.3.AND.J.EQ.N/2 ) + $ A( I, J ) = ZERO + IF( SYM )THEN + A( J, I ) = A( I, J ) + ELSE IF( TRI )THEN + A( J, I ) = ZERO + END IF + END IF + END IF + 10 CONTINUE + IF( TRI ) + $ A( J, J ) = A( J, J ) + ONE + IF( UNIT ) + $ A( J, J ) = ONE + 20 CONTINUE +* +* Store elements in array AS in data structure required by routine. +* + IF( TYPE.EQ.'GE' )THEN + DO 50 J = 1, N + DO 30 I = 1, M + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 30 CONTINUE + DO 40 I = M + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 40 CONTINUE + 50 CONTINUE + ELSE IF( TYPE.EQ.'SY'.OR.TYPE.EQ.'TR' )THEN + DO 90 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IF( UNIT )THEN + IEND = J - 1 + ELSE + IEND = J + END IF + ELSE + IF( UNIT )THEN + IBEG = J + 1 + ELSE + IBEG = J + END IF + IEND = N + END IF + DO 60 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 60 CONTINUE + DO 70 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 70 CONTINUE + DO 80 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 80 CONTINUE + 90 CONTINUE + END IF + RETURN +* +* End of DMAKE. +* + END + SUBROUTINE DMMCH( TRANSA, TRANSB, M, N, KK, ALPHA, A, LDA, B, LDB, + $ BETA, C, LDC, CT, G, CC, LDCC, EPS, ERR, FATAL, + $ NOUT, MV ) +* +* Checks the results of the computational tests. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + DOUBLE PRECISION ZERO, ONE + PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION ALPHA, BETA, EPS, ERR + INTEGER KK, LDA, LDB, LDC, LDCC, M, N, NOUT + LOGICAL FATAL, MV + CHARACTER*1 TRANSA, TRANSB +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), B( LDB, * ), C( LDC, * ), + $ CC( LDCC, * ), CT( * ), G( * ) +* .. Local Scalars .. + DOUBLE PRECISION ERRI + INTEGER I, J, K + LOGICAL TRANA, TRANB +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, SQRT +* .. Executable Statements .. + TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' + TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' +* +* Compute expected result, one column at a time, in CT using data +* in A, B and C. +* Compute gauges in G. +* + DO 120 J = 1, N +* + DO 10 I = 1, M + CT( I ) = ZERO + G( I ) = ZERO + 10 CONTINUE + IF( .NOT.TRANA.AND..NOT.TRANB )THEN + DO 30 K = 1, KK + DO 20 I = 1, M + CT( I ) = CT( I ) + A( I, K )*B( K, J ) + G( I ) = G( I ) + ABS( A( I, K ) )*ABS( B( K, J ) ) + 20 CONTINUE + 30 CONTINUE + ELSE IF( TRANA.AND..NOT.TRANB )THEN + DO 50 K = 1, KK + DO 40 I = 1, M + CT( I ) = CT( I ) + A( K, I )*B( K, J ) + G( I ) = G( I ) + ABS( A( K, I ) )*ABS( B( K, J ) ) + 40 CONTINUE + 50 CONTINUE + ELSE IF( .NOT.TRANA.AND.TRANB )THEN + DO 70 K = 1, KK + DO 60 I = 1, M + CT( I ) = CT( I ) + A( I, K )*B( J, K ) + G( I ) = G( I ) + ABS( A( I, K ) )*ABS( B( J, K ) ) + 60 CONTINUE + 70 CONTINUE + ELSE IF( TRANA.AND.TRANB )THEN + DO 90 K = 1, KK + DO 80 I = 1, M + CT( I ) = CT( I ) + A( K, I )*B( J, K ) + G( I ) = G( I ) + ABS( A( K, I ) )*ABS( B( J, K ) ) + 80 CONTINUE + 90 CONTINUE + END IF + DO 100 I = 1, M + CT( I ) = ALPHA*CT( I ) + BETA*C( I, J ) + G( I ) = ABS( ALPHA )*G( I ) + ABS( BETA )*ABS( C( I, J ) ) + 100 CONTINUE +* +* Compute the error ratio for this result. 
+* + ERR = ZERO + DO 110 I = 1, M + ERRI = ABS( CT( I ) - CC( I, J ) )/EPS + IF( G( I ).NE.ZERO ) + $ ERRI = ERRI/G( I ) + ERR = MAX( ERR, ERRI ) + IF( ERR*SQRT( EPS ).GE.ONE ) + $ GO TO 130 + 110 CONTINUE +* + 120 CONTINUE +* +* If the loop completes, all results are at least half accurate. + GO TO 150 +* +* Report fatal error. +* + 130 FATAL = .TRUE. + WRITE( NOUT, FMT = 9999 ) + DO 140 I = 1, M + IF( MV )THEN + WRITE( NOUT, FMT = 9998 )I, CT( I ), CC( I, J ) + ELSE + WRITE( NOUT, FMT = 9998 )I, CC( I, J ), CT( I ) + END IF + 140 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9997 )J +* + 150 CONTINUE + RETURN +* + 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', + $ 'F ACCURATE *******', /' EXPECTED RESULT COMPU', + $ 'TED RESULT' ) + 9998 FORMAT( 1X, I7, 2G18.6 ) + 9997 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) +* +* End of DMMCH. +* + END + LOGICAL FUNCTION LDE( RI, RJ, LR ) +* +* Tests if two arrays are identical. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER LR +* .. Array Arguments .. + DOUBLE PRECISION RI( * ), RJ( * ) +* .. Local Scalars .. + INTEGER I +* .. Executable Statements .. + DO 10 I = 1, LR + IF( RI( I ).NE.RJ( I ) ) + $ GO TO 20 + 10 CONTINUE + LDE = .TRUE. + GO TO 30 + 20 CONTINUE + LDE = .FALSE. + 30 RETURN +* +* End of LDE. +* + END + LOGICAL FUNCTION LDERES( TYPE, UPLO, M, N, AA, AS, LDA ) +* +* Tests if selected elements in two arrays are equal. +* +* TYPE is 'GE' or 'SY'. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER LDA, M, N + CHARACTER*1 UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + DOUBLE PRECISION AA( LDA, * ), AS( LDA, * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J + LOGICAL UPPER +* .. Executable Statements .. + UPPER = UPLO.EQ.'U' + IF( TYPE.EQ.'GE' )THEN + DO 20 J = 1, N + DO 10 I = M + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 10 CONTINUE + 20 CONTINUE + ELSE IF( TYPE.EQ.'SY' )THEN + DO 50 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 30 I = 1, IBEG - 1 + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 30 CONTINUE + DO 40 I = IEND + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 40 CONTINUE + 50 CONTINUE + END IF +* + 60 CONTINUE + LDERES = .TRUE. + GO TO 80 + 70 CONTINUE + LDERES = .FALSE. + 80 RETURN +* +* End of LDERES. +* + END + DOUBLE PRECISION FUNCTION DBEG( RESET ) +* +* Generates random numbers uniformly distributed between -0.5 and 0.5. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + LOGICAL RESET +* .. Local Scalars .. + INTEGER I, IC, MI +* .. Save statement .. + SAVE I, IC, MI +* .. Executable Statements .. + IF( RESET )THEN +* Initialize local variables. + MI = 891 + I = 7 + IC = 0 + RESET = .FALSE. 
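
DMMCH accepts a column as "at least half accurate" while ERR*SQRT(EPS) stays below one, where ERR is the largest value of |expected - computed|/EPS over the column, divided by the gauge G(I) when it is nonzero; G(I) accumulates |ALPHA|*sum|A||B| + |BETA||C| alongside the exact computation. A small C sketch of that ratio for a single column, with the illustrative name column_error_ratio:

    #include <math.h>
    #include <stdio.h>
    #include <float.h>

    /* Largest scaled error over one result column, as DMMCH computes it:
       erri = |expected - computed| / eps, divided by the gauge when nonzero. */
    static double column_error_ratio(int m, const double *ct, const double *cc,
                                     const double *g, double eps)
    {
        double err = 0.0;
        for (int i = 0; i < m; i++) {
            double erri = fabs(ct[i] - cc[i]) / eps;
            if (g[i] != 0.0)
                erri /= g[i];
            if (erri > err)
                err = erri;
        }
        return err;
    }

    int main(void)
    {
        double ct[2] = { 1.0, 2.0 };                    /* expected results */
        double cc[2] = { 1.0 + 4 * DBL_EPSILON, 2.0 };  /* computed results */
        double g[2]  = { 1.0, 2.0 };                    /* gauges           */
        double err   = column_error_ratio(2, ct, cc, g, DBL_EPSILON);

        /* DMMCH treats the column as half accurate while err*sqrt(eps) < 1. */
        printf("ratio %.1f -> %s\n", err,
               err * sqrt(DBL_EPSILON) < 1.0 ? "accepted" : "rejected");
        return 0;
    }
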
+ END IF +* +* The sequence of values of I is bounded between 1 and 999. +* If initial I = 1,2,3,6,7 or 9, the period will be 50. +* If initial I = 4 or 8, the period will be 25. +* If initial I = 5, the period will be 10. +* IC is used to break up the period by skipping 1 value of I in 6. +* + IC = IC + 1 + 10 I = I*MI + I = I - 1000*( I/1000 ) + IF( IC.GE.5 )THEN + IC = 0 + GO TO 10 + END IF + DBEG = ( I - 500 )/1001.0D0 + RETURN +* +* End of DBEG. +* + END + DOUBLE PRECISION FUNCTION DDIFF( X, Y ) +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + DOUBLE PRECISION X, Y +* .. Executable Statements .. + DDIFF = X - Y + RETURN +* +* End of DDIFF. +* + END + SUBROUTINE CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) +* +* Tests whether XERBLA has detected an error when it should. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER INFOT, NOUT + LOGICAL LERR, OK + CHARACTER*6 SRNAMT +* .. Executable Statements .. + IF( .NOT.LERR )THEN + WRITE( NOUT, FMT = 9999 )INFOT, SRNAMT + OK = .FALSE. + END IF + LERR = .FALSE. + RETURN +* + 9999 FORMAT( ' ***** ILLEGAL VALUE OF PARAMETER NUMBER ', I2, ' NOT D', + $ 'ETECTED BY ', A6, ' *****' ) +* +* End of CHKXER. +* + END + SUBROUTINE XERBLA( SRNAME, INFO ) +* +* This is a special version of XERBLA to be used only as part of +* the test program for testing error exits from the Level 3 BLAS +* routines. +* +* XERBLA is an error handler for the Level 3 BLAS routines. +* +* It is called by the Level 3 BLAS routines if an input parameter is +* invalid. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER INFO + CHARACTER*6 SRNAME +* .. Scalars in Common .. + INTEGER INFOT, NOUT + LOGICAL LERR, OK + CHARACTER*6 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUT, OK, LERR + COMMON /SRNAMC/SRNAMT +* .. Executable Statements .. + LERR = .TRUE. + IF( INFO.NE.INFOT )THEN + IF( INFOT.NE.0 )THEN + WRITE( NOUT, FMT = 9999 )INFO, INFOT + ELSE + WRITE( NOUT, FMT = 9997 )INFO + END IF + OK = .FALSE. + END IF + IF( SRNAME.NE.SRNAMT )THEN + WRITE( NOUT, FMT = 9998 )SRNAME, SRNAMT + OK = .FALSE. + END IF + RETURN +* + 9999 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, ' INSTEAD', + $ ' OF ', I2, ' *******' ) + 9998 FORMAT( ' ******* XERBLA WAS CALLED WITH SRNAME = ', A6, ' INSTE', + $ 'AD OF ', A6, ' *******' ) + 9997 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, + $ ' *******' ) +* +* End of XERBLA +* + END + diff --git a/test/sblat1.f b/test/sblat1.f new file mode 100644 index 0000000000..a982d1852e --- /dev/null +++ b/test/sblat1.f @@ -0,0 +1,769 @@ + PROGRAM SBLAT1 +* Test program for the REAL Level 1 BLAS. +* Based upon the original BLAS test routine together with: +* F06EAF Example Program Text +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalars in Common .. 
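
DBEG above is a deliberately simple, repeatable generator: I is advanced through I <- 891*I mod 1000, one extra step is taken every fifth call to break up the short period described in its comments, and the result is mapped into roughly (-0.5, 0.5) as (I - 500)/1001. A C sketch of the same recurrence, assuming the illustrative name dbeg_like:

    #include <stdio.h>

    /* Repeatable values in roughly (-0.5, 0.5), mirroring DBEG above:
       i <- 891*i mod 1000, with one extra step taken every fifth call
       so the short period of the recurrence is broken up.             */
    static double dbeg_like(int *i, int *ic)
    {
        (*ic)++;
        for (;;) {
            *i = (*i * 891) % 1000;
            if (*ic < 5)
                break;
            *ic = 0;             /* fifth call: skip one value of i */
        }
        return (*i - 500) / 1001.0;
    }

    int main(void)
    {
        int i = 7, ic = 0;       /* the values DBEG installs on RESET */
        for (int k = 0; k < 8; k++)
            printf("%8.4f\n", dbeg_like(&i, &ic));
        return 0;
    }
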
+ INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + REAL SFAC + INTEGER IC +* .. External Subroutines .. + EXTERNAL CHECK0, CHECK1, CHECK2, CHECK3, HEADER +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA SFAC/9.765625E-4/ +* .. Executable Statements .. + WRITE (NOUT,99999) + DO 20 IC = 1, 10 + ICASE = IC + CALL HEADER +* +* .. Initialize PASS, INCX, INCY, and MODE for a new case. .. +* .. the value 9999 for INCX, INCY or MODE will appear in the .. +* .. detailed output, if any, for cases that do not involve .. +* .. these parameters .. +* + PASS = .TRUE. + INCX = 9999 + INCY = 9999 + MODE = 9999 + IF (ICASE.EQ.3) THEN + CALL CHECK0(SFAC) + ELSE IF (ICASE.EQ.7 .OR. ICASE.EQ.8 .OR. ICASE.EQ.9 .OR. + + ICASE.EQ.10) THEN + CALL CHECK1(SFAC) + ELSE IF (ICASE.EQ.1 .OR. ICASE.EQ.2 .OR. ICASE.EQ.5 .OR. + + ICASE.EQ.6) THEN + CALL CHECK2(SFAC) + ELSE IF (ICASE.EQ.4) THEN + CALL CHECK3(SFAC) + END IF +* -- Print + IF (PASS) WRITE (NOUT,99998) + 20 CONTINUE + STOP +* +99999 FORMAT (' Real BLAS Test Program Results',/1X) +99998 FORMAT (' ----- PASS -----') + END + SUBROUTINE HEADER +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Arrays .. + CHARACTER*6 L(10) +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA L(1)/' SDOT '/ + DATA L(2)/'SAXPY '/ + DATA L(3)/'SROTG '/ + DATA L(4)/' SROT '/ + DATA L(5)/'SCOPY '/ + DATA L(6)/'SSWAP '/ + DATA L(7)/'SNRM2 '/ + DATA L(8)/'SASUM '/ + DATA L(9)/'SSCAL '/ + DATA L(10)/'ISAMAX'/ +* .. Executable Statements .. + WRITE (NOUT,99999) ICASE, L(ICASE) + RETURN +* +99999 FORMAT (/' Test of subprogram number',I3,12X,A6) + END + SUBROUTINE CHECK0(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + REAL SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + REAL D12, SA, SB, SC, SS + INTEGER K +* .. Local Arrays .. + REAL DA1(8), DATRUE(8), DB1(8), DBTRUE(8), DC1(8), + + DS1(8) +* .. External Subroutines .. + EXTERNAL SROTG, STEST1 +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA DA1/0.3E0, 0.4E0, -0.3E0, -0.4E0, -0.3E0, 0.0E0, + + 0.0E0, 1.0E0/ + DATA DB1/0.4E0, 0.3E0, 0.4E0, 0.3E0, -0.4E0, 0.0E0, + + 1.0E0, 0.0E0/ + DATA DC1/0.6E0, 0.8E0, -0.6E0, 0.8E0, 0.6E0, 1.0E0, + + 0.0E0, 1.0E0/ + DATA DS1/0.8E0, 0.6E0, 0.8E0, -0.6E0, 0.8E0, 0.0E0, + + 1.0E0, 0.0E0/ + DATA DATRUE/0.5E0, 0.5E0, 0.5E0, -0.5E0, -0.5E0, + + 0.0E0, 1.0E0, 1.0E0/ + DATA DBTRUE/0.0E0, 0.6E0, 0.0E0, -0.6E0, 0.0E0, + + 0.0E0, 1.0E0, 0.0E0/ + DATA D12/4096.0E0/ +* .. Executable Statements .. +* +* Compute true values which cannot be prestored +* in decimal notation +* + DBTRUE(1) = 1.0E0/0.6E0 + DBTRUE(3) = -1.0E0/0.6E0 + DBTRUE(5) = 1.0E0/0.6E0 +* + DO 20 K = 1, 8 +* .. Set N=K for identification in output if any .. + N = K + IF (ICASE.EQ.3) THEN +* .. SROTG .. + IF (K.GT.8) GO TO 40 + SA = DA1(K) + SB = DB1(K) + CALL SROTG(SA,SB,SC,SS) + CALL STEST1(SA,DATRUE(K),DATRUE(K),SFAC) + CALL STEST1(SB,DBTRUE(K),DBTRUE(K),SFAC) + CALL STEST1(SC,DC1(K),DC1(K),SFAC) + CALL STEST1(SS,DS1(K),DS1(K),SFAC) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK0' + STOP + END IF + 20 CONTINUE + 40 RETURN + END + SUBROUTINE CHECK1(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + REAL SFAC +* .. 
Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + INTEGER I, LEN, NP1 +* .. Local Arrays .. + REAL DTRUE1(5), DTRUE3(5), DTRUE5(8,5,2), DV(8,5,2), + + SA(10), STEMP(1), STRUE(8), SX(8) + INTEGER ITRUE2(5) +* .. External Functions .. + REAL SASUM, SNRM2 + INTEGER ISAMAX + EXTERNAL SASUM, SNRM2, ISAMAX +* .. External Subroutines .. + EXTERNAL ITEST1, SSCAL, STEST, STEST1 +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA SA/0.3E0, -1.0E0, 0.0E0, 1.0E0, 0.3E0, 0.3E0, + + 0.3E0, 0.3E0, 0.3E0, 0.3E0/ + DATA DV/0.1E0, 2.0E0, 2.0E0, 2.0E0, 2.0E0, 2.0E0, + + 2.0E0, 2.0E0, 0.3E0, 3.0E0, 3.0E0, 3.0E0, 3.0E0, + + 3.0E0, 3.0E0, 3.0E0, 0.3E0, -0.4E0, 4.0E0, + + 4.0E0, 4.0E0, 4.0E0, 4.0E0, 4.0E0, 0.2E0, + + -0.6E0, 0.3E0, 5.0E0, 5.0E0, 5.0E0, 5.0E0, + + 5.0E0, 0.1E0, -0.3E0, 0.5E0, -0.1E0, 6.0E0, + + 6.0E0, 6.0E0, 6.0E0, 0.1E0, 8.0E0, 8.0E0, 8.0E0, + + 8.0E0, 8.0E0, 8.0E0, 8.0E0, 0.3E0, 9.0E0, 9.0E0, + + 9.0E0, 9.0E0, 9.0E0, 9.0E0, 9.0E0, 0.3E0, 2.0E0, + + -0.4E0, 2.0E0, 2.0E0, 2.0E0, 2.0E0, 2.0E0, + + 0.2E0, 3.0E0, -0.6E0, 5.0E0, 0.3E0, 2.0E0, + + 2.0E0, 2.0E0, 0.1E0, 4.0E0, -0.3E0, 6.0E0, + + -0.5E0, 7.0E0, -0.1E0, 3.0E0/ + DATA DTRUE1/0.0E0, 0.3E0, 0.5E0, 0.7E0, 0.6E0/ + DATA DTRUE3/0.0E0, 0.3E0, 0.7E0, 1.1E0, 1.0E0/ + DATA DTRUE5/0.10E0, 2.0E0, 2.0E0, 2.0E0, 2.0E0, + + 2.0E0, 2.0E0, 2.0E0, -0.3E0, 3.0E0, 3.0E0, + + 3.0E0, 3.0E0, 3.0E0, 3.0E0, 3.0E0, 0.0E0, 0.0E0, + + 4.0E0, 4.0E0, 4.0E0, 4.0E0, 4.0E0, 4.0E0, + + 0.20E0, -0.60E0, 0.30E0, 5.0E0, 5.0E0, 5.0E0, + + 5.0E0, 5.0E0, 0.03E0, -0.09E0, 0.15E0, -0.03E0, + + 6.0E0, 6.0E0, 6.0E0, 6.0E0, 0.10E0, 8.0E0, + + 8.0E0, 8.0E0, 8.0E0, 8.0E0, 8.0E0, 8.0E0, + + 0.09E0, 9.0E0, 9.0E0, 9.0E0, 9.0E0, 9.0E0, + + 9.0E0, 9.0E0, 0.09E0, 2.0E0, -0.12E0, 2.0E0, + + 2.0E0, 2.0E0, 2.0E0, 2.0E0, 0.06E0, 3.0E0, + + -0.18E0, 5.0E0, 0.09E0, 2.0E0, 2.0E0, 2.0E0, + + 0.03E0, 4.0E0, -0.09E0, 6.0E0, -0.15E0, 7.0E0, + + -0.03E0, 3.0E0/ + DATA ITRUE2/0, 1, 2, 2, 3/ +* .. Executable Statements .. + DO 80 INCX = 1, 2 + DO 60 NP1 = 1, 5 + N = NP1 - 1 + LEN = 2*MAX(N,1) +* .. Set vector arguments .. + DO 20 I = 1, LEN + SX(I) = DV(I,NP1,INCX) + 20 CONTINUE +* + IF (ICASE.EQ.7) THEN +* .. SNRM2 .. + STEMP(1) = DTRUE1(NP1) + CALL STEST1(SNRM2(N,SX,INCX),STEMP,STEMP,SFAC) + ELSE IF (ICASE.EQ.8) THEN +* .. SASUM .. + STEMP(1) = DTRUE3(NP1) + CALL STEST1(SASUM(N,SX,INCX),STEMP,STEMP,SFAC) + ELSE IF (ICASE.EQ.9) THEN +* .. SSCAL .. + CALL SSCAL(N,SA((INCX-1)*5+NP1),SX,INCX) + DO 40 I = 1, LEN + STRUE(I) = DTRUE5(I,NP1,INCX) + 40 CONTINUE + CALL STEST(LEN,SX,STRUE,STRUE,SFAC) + ELSE IF (ICASE.EQ.10) THEN +* .. ISAMAX .. + CALL ITEST1(ISAMAX(N,SX,INCX),ITRUE2(NP1)) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' + STOP + END IF + 60 CONTINUE + 80 CONTINUE + RETURN + END + SUBROUTINE CHECK2(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + REAL SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + REAL SA, SC, SS + INTEGER I, J, KI, KN, KSIZE, LENX, LENY, MX, MY +* .. Local Arrays .. + REAL DT10X(7,4,4), DT10Y(7,4,4), DT7(4,4), + + DT8(7,4,4), DT9X(7,4,4), DT9Y(7,4,4), DX1(7), + + DY1(7), SSIZE1(4), SSIZE2(14,2), STX(7), STY(7), + + SX(7), SY(7) + INTEGER INCXS(4), INCYS(4), LENS(4,2), NS(4) +* .. External Functions .. + REAL SDOT + EXTERNAL SDOT +* .. External Subroutines .. + EXTERNAL SAXPY, SCOPY, SSWAP, STEST, STEST1 +* .. 
Intrinsic Functions .. + INTRINSIC ABS, MIN +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA SA/0.3E0/ + DATA INCXS/1, 2, -2, -1/ + DATA INCYS/1, -2, 1, -2/ + DATA LENS/1, 1, 2, 4, 1, 1, 3, 7/ + DATA NS/0, 1, 2, 4/ + DATA DX1/0.6E0, 0.1E0, -0.5E0, 0.8E0, 0.9E0, -0.3E0, + + -0.4E0/ + DATA DY1/0.5E0, -0.9E0, 0.3E0, 0.7E0, -0.6E0, 0.2E0, + + 0.8E0/ + DATA SC, SS/0.8E0, 0.6E0/ + DATA DT7/0.0E0, 0.30E0, 0.21E0, 0.62E0, 0.0E0, + + 0.30E0, -0.07E0, 0.85E0, 0.0E0, 0.30E0, -0.79E0, + + -0.74E0, 0.0E0, 0.30E0, 0.33E0, 1.27E0/ + DATA DT8/0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.68E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.68E0, -0.87E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.68E0, -0.87E0, 0.15E0, + + 0.94E0, 0.0E0, 0.0E0, 0.0E0, 0.5E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.68E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.35E0, -0.9E0, 0.48E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.38E0, -0.9E0, 0.57E0, 0.7E0, -0.75E0, + + 0.2E0, 0.98E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.68E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.35E0, -0.72E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.38E0, + + -0.63E0, 0.15E0, 0.88E0, 0.0E0, 0.0E0, 0.0E0, + + 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.68E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.68E0, -0.9E0, 0.33E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.68E0, -0.9E0, 0.33E0, 0.7E0, + + -0.75E0, 0.2E0, 1.04E0/ + DATA DT9X/0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.78E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.78E0, -0.46E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.78E0, -0.46E0, -0.22E0, + + 1.06E0, 0.0E0, 0.0E0, 0.0E0, 0.6E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.78E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.66E0, 0.1E0, -0.1E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.96E0, 0.1E0, -0.76E0, 0.8E0, 0.90E0, + + -0.3E0, -0.02E0, 0.6E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.78E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, -0.06E0, 0.1E0, + + -0.1E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.90E0, + + 0.1E0, -0.22E0, 0.8E0, 0.18E0, -0.3E0, -0.02E0, + + 0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.78E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.78E0, 0.26E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.78E0, 0.26E0, -0.76E0, 1.12E0, + + 0.0E0, 0.0E0, 0.0E0/ + DATA DT9Y/0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.04E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.04E0, -0.78E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.04E0, -0.78E0, 0.54E0, + + 0.08E0, 0.0E0, 0.0E0, 0.0E0, 0.5E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.04E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.7E0, + + -0.9E0, -0.12E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.64E0, -0.9E0, -0.30E0, 0.7E0, -0.18E0, 0.2E0, + + 0.28E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.04E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.7E0, -1.08E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.64E0, -1.26E0, + + 0.54E0, 0.20E0, 0.0E0, 0.0E0, 0.0E0, 0.5E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.04E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.04E0, -0.9E0, 0.18E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.04E0, -0.9E0, 0.18E0, 0.7E0, + + -0.18E0, 0.2E0, 0.16E0/ + DATA DT10X/0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.5E0, -0.9E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.5E0, -0.9E0, 0.3E0, 0.7E0, + + 0.0E0, 0.0E0, 0.0E0, 0.6E0, 0.0E0, 0.0E0, 0.0E0, + + 
0.0E0, 0.0E0, 0.0E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.3E0, 0.1E0, 0.5E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.8E0, 0.1E0, -0.6E0, + + 0.8E0, 0.3E0, -0.3E0, 0.5E0, 0.6E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.5E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, -0.9E0, + + 0.1E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.7E0, + + 0.1E0, 0.3E0, 0.8E0, -0.9E0, -0.3E0, 0.5E0, + + 0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.5E0, 0.3E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.5E0, 0.3E0, -0.6E0, 0.8E0, 0.0E0, 0.0E0, + + 0.0E0/ + DATA DT10Y/0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.6E0, 0.1E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.6E0, 0.1E0, -0.5E0, 0.8E0, 0.0E0, + + 0.0E0, 0.0E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, -0.5E0, -0.9E0, 0.6E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, -0.4E0, -0.9E0, 0.9E0, + + 0.7E0, -0.5E0, 0.2E0, 0.6E0, 0.5E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.6E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, -0.5E0, + + 0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + -0.4E0, 0.9E0, -0.5E0, 0.6E0, 0.0E0, 0.0E0, + + 0.0E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.6E0, -0.9E0, 0.1E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.6E0, -0.9E0, 0.1E0, 0.7E0, + + -0.5E0, 0.2E0, 0.8E0/ + DATA SSIZE1/0.0E0, 0.3E0, 1.6E0, 3.2E0/ + DATA SSIZE2/0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, + + 1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, + + 1.17E0, 1.17E0, 1.17E0/ +* .. Executable Statements .. +* + DO 120 KI = 1, 4 + INCX = INCXS(KI) + INCY = INCYS(KI) + MX = ABS(INCX) + MY = ABS(INCY) +* + DO 100 KN = 1, 4 + N = NS(KN) + KSIZE = MIN(2,KN) + LENX = LENS(KN,MX) + LENY = LENS(KN,MY) +* .. Initialize all argument arrays .. + DO 20 I = 1, 7 + SX(I) = DX1(I) + SY(I) = DY1(I) + 20 CONTINUE +* + IF (ICASE.EQ.1) THEN +* .. SDOT .. + CALL STEST1(SDOT(N,SX,INCX,SY,INCY),DT7(KN,KI),SSIZE1(KN) + + ,SFAC) + ELSE IF (ICASE.EQ.2) THEN +* .. SAXPY .. + CALL SAXPY(N,SA,SX,INCX,SY,INCY) + DO 40 J = 1, LENY + STY(J) = DT8(J,KN,KI) + 40 CONTINUE + CALL STEST(LENY,SY,STY,SSIZE2(1,KSIZE),SFAC) + ELSE IF (ICASE.EQ.5) THEN +* .. SCOPY .. + DO 60 I = 1, 7 + STY(I) = DT10Y(I,KN,KI) + 60 CONTINUE + CALL SCOPY(N,SX,INCX,SY,INCY) + CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0E0) + ELSE IF (ICASE.EQ.6) THEN +* .. SSWAP .. + CALL SSWAP(N,SX,INCX,SY,INCY) + DO 80 I = 1, 7 + STX(I) = DT10X(I,KN,KI) + STY(I) = DT10Y(I,KN,KI) + 80 CONTINUE + CALL STEST(LENX,SX,STX,SSIZE2(1,1),1.0E0) + CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0E0) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' + STOP + END IF + 100 CONTINUE + 120 CONTINUE + RETURN + END + SUBROUTINE CHECK3(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + REAL SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + REAL SA, SC, SS + INTEGER I, K, KI, KN, KSIZE, LENX, LENY, MX, MY +* .. Local Arrays .. + REAL COPYX(5), COPYY(5), DT9X(7,4,4), DT9Y(7,4,4), + + DX1(7), DY1(7), MWPC(11), MWPS(11), MWPSTX(5), + + MWPSTY(5), MWPTX(11,5), MWPTY(11,5), MWPX(5), + + MWPY(5), SSIZE2(14,2), STX(7), STY(7), SX(7), + + SY(7) + INTEGER INCXS(4), INCYS(4), LENS(4,2), MWPINX(11), + + MWPINY(11), MWPN(11), NS(4) +* .. 
External Subroutines .. + EXTERNAL SROT, STEST +* .. Intrinsic Functions .. + INTRINSIC ABS, MIN +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA SA/0.3E0/ + DATA INCXS/1, 2, -2, -1/ + DATA INCYS/1, -2, 1, -2/ + DATA LENS/1, 1, 2, 4, 1, 1, 3, 7/ + DATA NS/0, 1, 2, 4/ + DATA DX1/0.6E0, 0.1E0, -0.5E0, 0.8E0, 0.9E0, -0.3E0, + + -0.4E0/ + DATA DY1/0.5E0, -0.9E0, 0.3E0, 0.7E0, -0.6E0, 0.2E0, + + 0.8E0/ + DATA SC, SS/0.8E0, 0.6E0/ + DATA DT9X/0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.78E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.78E0, -0.46E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.78E0, -0.46E0, -0.22E0, + + 1.06E0, 0.0E0, 0.0E0, 0.0E0, 0.6E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.78E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.66E0, 0.1E0, -0.1E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.96E0, 0.1E0, -0.76E0, 0.8E0, 0.90E0, + + -0.3E0, -0.02E0, 0.6E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.78E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, -0.06E0, 0.1E0, + + -0.1E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.90E0, + + 0.1E0, -0.22E0, 0.8E0, 0.18E0, -0.3E0, -0.02E0, + + 0.6E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.78E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.78E0, 0.26E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.78E0, 0.26E0, -0.76E0, 1.12E0, + + 0.0E0, 0.0E0, 0.0E0/ + DATA DT9Y/0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.04E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.04E0, -0.78E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.04E0, -0.78E0, 0.54E0, + + 0.08E0, 0.0E0, 0.0E0, 0.0E0, 0.5E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.04E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.7E0, + + -0.9E0, -0.12E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.64E0, -0.9E0, -0.30E0, 0.7E0, -0.18E0, 0.2E0, + + 0.28E0, 0.5E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.04E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.7E0, -1.08E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.64E0, -1.26E0, + + 0.54E0, 0.20E0, 0.0E0, 0.0E0, 0.0E0, 0.5E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.04E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.04E0, -0.9E0, 0.18E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.04E0, -0.9E0, 0.18E0, 0.7E0, + + -0.18E0, 0.2E0, 0.16E0/ + DATA SSIZE2/0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, 0.0E0, + + 0.0E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, + + 1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, 1.17E0, + + 1.17E0, 1.17E0, 1.17E0/ +* .. Executable Statements .. +* + DO 60 KI = 1, 4 + INCX = INCXS(KI) + INCY = INCYS(KI) + MX = ABS(INCX) + MY = ABS(INCY) +* + DO 40 KN = 1, 4 + N = NS(KN) + KSIZE = MIN(2,KN) + LENX = LENS(KN,MX) + LENY = LENS(KN,MY) +* + IF (ICASE.EQ.4) THEN +* .. SROT .. 
+ DO 20 I = 1, 7 + SX(I) = DX1(I) + SY(I) = DY1(I) + STX(I) = DT9X(I,KN,KI) + STY(I) = DT9Y(I,KN,KI) + 20 CONTINUE + CALL SROT(N,SX,INCX,SY,INCY,SC,SS) + CALL STEST(LENX,SX,STX,SSIZE2(1,KSIZE),SFAC) + CALL STEST(LENY,SY,STY,SSIZE2(1,KSIZE),SFAC) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK3' + STOP + END IF + 40 CONTINUE + 60 CONTINUE +* + MWPC(1) = 1 + DO 80 I = 2, 11 + MWPC(I) = 0 + 80 CONTINUE + MWPS(1) = 0 + DO 100 I = 2, 6 + MWPS(I) = 1 + 100 CONTINUE + DO 120 I = 7, 11 + MWPS(I) = -1 + 120 CONTINUE + MWPINX(1) = 1 + MWPINX(2) = 1 + MWPINX(3) = 1 + MWPINX(4) = -1 + MWPINX(5) = 1 + MWPINX(6) = -1 + MWPINX(7) = 1 + MWPINX(8) = 1 + MWPINX(9) = -1 + MWPINX(10) = 1 + MWPINX(11) = -1 + MWPINY(1) = 1 + MWPINY(2) = 1 + MWPINY(3) = -1 + MWPINY(4) = -1 + MWPINY(5) = 2 + MWPINY(6) = 1 + MWPINY(7) = 1 + MWPINY(8) = -1 + MWPINY(9) = -1 + MWPINY(10) = 2 + MWPINY(11) = 1 + DO 140 I = 1, 11 + MWPN(I) = 5 + 140 CONTINUE + MWPN(5) = 3 + MWPN(10) = 3 + DO 160 I = 1, 5 + MWPX(I) = I + MWPY(I) = I + MWPTX(1,I) = I + MWPTY(1,I) = I + MWPTX(2,I) = I + MWPTY(2,I) = -I + MWPTX(3,I) = 6 - I + MWPTY(3,I) = I - 6 + MWPTX(4,I) = I + MWPTY(4,I) = -I + MWPTX(6,I) = 6 - I + MWPTY(6,I) = I - 6 + MWPTX(7,I) = -I + MWPTY(7,I) = I + MWPTX(8,I) = I - 6 + MWPTY(8,I) = 6 - I + MWPTX(9,I) = -I + MWPTY(9,I) = I + MWPTX(11,I) = I - 6 + MWPTY(11,I) = 6 - I + 160 CONTINUE + MWPTX(5,1) = 1 + MWPTX(5,2) = 3 + MWPTX(5,3) = 5 + MWPTX(5,4) = 4 + MWPTX(5,5) = 5 + MWPTY(5,1) = -1 + MWPTY(5,2) = 2 + MWPTY(5,3) = -2 + MWPTY(5,4) = 4 + MWPTY(5,5) = -3 + MWPTX(10,1) = -1 + MWPTX(10,2) = -3 + MWPTX(10,3) = -5 + MWPTX(10,4) = 4 + MWPTX(10,5) = 5 + MWPTY(10,1) = 1 + MWPTY(10,2) = 2 + MWPTY(10,3) = 2 + MWPTY(10,4) = 4 + MWPTY(10,5) = 3 + DO 200 I = 1, 11 + INCX = MWPINX(I) + INCY = MWPINY(I) + DO 180 K = 1, 5 + COPYX(K) = MWPX(K) + COPYY(K) = MWPY(K) + MWPSTX(K) = MWPTX(I,K) + MWPSTY(K) = MWPTY(I,K) + 180 CONTINUE + CALL SROT(MWPN(I),COPYX,INCX,COPYY,INCY,MWPC(I),MWPS(I)) + CALL STEST(5,COPYX,MWPSTX,MWPSTX,SFAC) + CALL STEST(5,COPYY,MWPSTY,MWPSTY,SFAC) + 200 CONTINUE + RETURN + END + SUBROUTINE STEST(LEN,SCOMP,STRUE,SSIZE,SFAC) +* ********************************* STEST ************************** +* +* THIS SUBR COMPARES ARRAYS SCOMP() AND STRUE() OF LENGTH LEN TO +* SEE IF THE TERM BY TERM DIFFERENCES, MULTIPLIED BY SFAC, ARE +* NEGLIGIBLE. +* +* C. L. LAWSON, JPL, 1974 DEC 10 +* +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + REAL SFAC + INTEGER LEN +* .. Array Arguments .. + REAL SCOMP(LEN), SSIZE(LEN), STRUE(LEN) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + REAL SD + INTEGER I +* .. External Functions .. + REAL SDIFF + EXTERNAL SDIFF +* .. Intrinsic Functions .. + INTRINSIC ABS +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Executable Statements .. +* + DO 40 I = 1, LEN + SD = SCOMP(I) - STRUE(I) + IF (SDIFF(ABS(SSIZE(I))+ABS(SFAC*SD),ABS(SSIZE(I))).EQ.0.0E0) + + GO TO 40 +* +* HERE SCOMP(I) IS NOT CLOSE TO STRUE(I). +* + IF ( .NOT. PASS) GO TO 20 +* PRINT FAIL MESSAGE AND HEADER. + PASS = .FALSE. 
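
STEST's acceptance test is worth spelling out: a difference SD between computed and true values counts as negligible when |SSIZE(I)| + |SFAC*SD| compares equal to |SSIZE(I)| in working precision, which is exactly what routing the two operands through SDIFF checks. A C sketch of that criterion, using the SFAC value 9.765625E-4 that SBLAT1 supplies; the name negligible is illustrative only:

    #include <math.h>
    #include <stdio.h>

    /* STEST's negligibility criterion: the scaled difference between a
       computed and a true value is accepted when adding it to the size
       term does not change that term in working (single) precision.    */
    static int negligible(float scomp, float strue, float ssize, float sfac)
    {
        float sd = scomp - strue;
        volatile float a = fabsf(ssize) + fabsf(sfac * sd);
        volatile float b = fabsf(ssize);
        return (a - b) == 0.0f;      /* the comparison SDIFF performs */
    }

    int main(void)
    {
        /* SFAC = 9.765625E-4 (2**-10) is the value SBLAT1 passes in. */
        printf("%d\n", negligible(1.0f + 1e-6f, 1.0f, 1.0f, 9.765625e-4f));
        printf("%d\n", negligible(1.5f,         1.0f, 1.0f, 9.765625e-4f));
        return 0;
    }
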
+ WRITE (NOUT,99999) + WRITE (NOUT,99998) + 20 WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, I, SCOMP(I), + + STRUE(I), SD, SSIZE(I) + 40 CONTINUE + RETURN +* +99999 FORMAT (' FAIL') +99998 FORMAT (/' CASE N INCX INCY MODE I ', + + ' COMP(I) TRUE(I) DIFFERENCE', + + ' SIZE(I)',/1X) +99997 FORMAT (1X,I4,I3,3I5,I3,2E36.8,2E12.4) + END + SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) +* ************************* STEST1 ***************************** +* +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE +* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. +* +* C.L. LAWSON, JPL, 1978 DEC 6 +* +* .. Scalar Arguments .. + REAL SCOMP1, SFAC, STRUE1 +* .. Array Arguments .. + REAL SSIZE(*) +* .. Local Arrays .. + REAL SCOMP(1), STRUE(1) +* .. External Subroutines .. + EXTERNAL STEST +* .. Executable Statements .. +* + SCOMP(1) = SCOMP1 + STRUE(1) = STRUE1 + CALL STEST(1,SCOMP,STRUE,SSIZE,SFAC) +* + RETURN + END + REAL FUNCTION SDIFF(SA,SB) +* ********************************* SDIFF ************************** +* COMPUTES DIFFERENCE OF TWO NUMBERS. C. L. LAWSON, JPL 1974 FEB 15 +* +* .. Scalar Arguments .. + REAL SA, SB +* .. Executable Statements .. + SDIFF = SA - SB + RETURN + END + SUBROUTINE ITEST1(ICOMP,ITRUE) +* ********************************* ITEST1 ************************* +* +* THIS SUBROUTINE COMPARES THE VARIABLES ICOMP AND ITRUE FOR +* EQUALITY. +* C. L. LAWSON, JPL, 1974 DEC 10 +* +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + INTEGER ICOMP, ITRUE +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + INTEGER ID +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Executable Statements .. +* + IF (ICOMP.EQ.ITRUE) GO TO 40 +* +* HERE ICOMP IS NOT EQUAL TO ITRUE. +* + IF ( .NOT. PASS) GO TO 20 +* PRINT FAIL MESSAGE AND HEADER. + PASS = .FALSE. + WRITE (NOUT,99999) + WRITE (NOUT,99998) + 20 ID = ICOMP - ITRUE + WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, ICOMP, ITRUE, ID + 40 CONTINUE + RETURN +* +99999 FORMAT (' FAIL') +99998 FORMAT (/' CASE N INCX INCY MODE ', + + ' COMP TRUE DIFFERENCE', + + /1X) +99997 FORMAT (1X,I4,I3,3I5,2I36,I12) + END diff --git a/test/sblat2.dat b/test/sblat2.dat new file mode 100644 index 0000000000..5ed9dd76d0 --- /dev/null +++ b/test/sblat2.dat @@ -0,0 +1,34 @@ +'SBLAT2.SUMM' NAME OF SUMMARY OUTPUT FILE +6 UNIT NUMBER OF SUMMARY FILE +'SBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE +-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO TEST ERROR EXITS. +16.0 THRESHOLD VALUE OF TEST RATIO +7 NUMBER OF VALUES OF N +0 1 2 3 7 31 63 VALUES OF N +4 NUMBER OF VALUES OF K +0 1 2 4 VALUES OF K +4 NUMBER OF VALUES OF INCX AND INCY +1 2 -1 -2 VALUES OF INCX AND INCY +3 NUMBER OF VALUES OF ALPHA +0.0 1.0 0.7 VALUES OF ALPHA +3 NUMBER OF VALUES OF BETA +0.0 1.0 0.9 VALUES OF BETA +SGEMV T PUT F FOR NO TEST. SAME COLUMNS. +SGBMV T PUT F FOR NO TEST. SAME COLUMNS. +SSYMV T PUT F FOR NO TEST. SAME COLUMNS. +SSBMV T PUT F FOR NO TEST. SAME COLUMNS. +SSPMV T PUT F FOR NO TEST. SAME COLUMNS. +STRMV T PUT F FOR NO TEST. SAME COLUMNS. +STBMV T PUT F FOR NO TEST. SAME COLUMNS. +STPMV T PUT F FOR NO TEST. SAME COLUMNS. +STRSV T PUT F FOR NO TEST. SAME COLUMNS. +STBSV T PUT F FOR NO TEST. SAME COLUMNS. +STPSV T PUT F FOR NO TEST. SAME COLUMNS. +SGER T PUT F FOR NO TEST. 
SAME COLUMNS. +SSYR T PUT F FOR NO TEST. SAME COLUMNS. +SSPR T PUT F FOR NO TEST. SAME COLUMNS. +SSYR2 T PUT F FOR NO TEST. SAME COLUMNS. +SSPR2 T PUT F FOR NO TEST. SAME COLUMNS. diff --git a/test/sblat2.f b/test/sblat2.f new file mode 100644 index 0000000000..057a85429a --- /dev/null +++ b/test/sblat2.f @@ -0,0 +1,3138 @@ + PROGRAM SBLAT2 +* +* Test program for the REAL Level 2 Blas. +* +* The program must be driven by a short data file. The first 18 records +* of the file are read using list-directed input, the last 16 records +* are read using the format ( A6, L2 ). An annotated example of a data +* file can be obtained by deleting the first 3 characters from the +* following 34 lines: +* 'SBLAT2.SUMM' NAME OF SUMMARY OUTPUT FILE +* 6 UNIT NUMBER OF SUMMARY FILE +* 'SBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE +* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +* F LOGICAL FLAG, T TO STOP ON FAILURES. +* T LOGICAL FLAG, T TO TEST ERROR EXITS. +* 16.0 THRESHOLD VALUE OF TEST RATIO +* 6 NUMBER OF VALUES OF N +* 0 1 2 3 5 9 VALUES OF N +* 4 NUMBER OF VALUES OF K +* 0 1 2 4 VALUES OF K +* 4 NUMBER OF VALUES OF INCX AND INCY +* 1 2 -1 -2 VALUES OF INCX AND INCY +* 3 NUMBER OF VALUES OF ALPHA +* 0.0 1.0 0.7 VALUES OF ALPHA +* 3 NUMBER OF VALUES OF BETA +* 0.0 1.0 0.9 VALUES OF BETA +* SGEMV T PUT F FOR NO TEST. SAME COLUMNS. +* SGBMV T PUT F FOR NO TEST. SAME COLUMNS. +* SSYMV T PUT F FOR NO TEST. SAME COLUMNS. +* SSBMV T PUT F FOR NO TEST. SAME COLUMNS. +* SSPMV T PUT F FOR NO TEST. SAME COLUMNS. +* STRMV T PUT F FOR NO TEST. SAME COLUMNS. +* STBMV T PUT F FOR NO TEST. SAME COLUMNS. +* STPMV T PUT F FOR NO TEST. SAME COLUMNS. +* STRSV T PUT F FOR NO TEST. SAME COLUMNS. +* STBSV T PUT F FOR NO TEST. SAME COLUMNS. +* STPSV T PUT F FOR NO TEST. SAME COLUMNS. +* SGER T PUT F FOR NO TEST. SAME COLUMNS. +* SSYR T PUT F FOR NO TEST. SAME COLUMNS. +* SSPR T PUT F FOR NO TEST. SAME COLUMNS. +* SSYR2 T PUT F FOR NO TEST. SAME COLUMNS. +* SSPR2 T PUT F FOR NO TEST. SAME COLUMNS. +* +* See: +* +* Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. +* An extended set of Fortran Basic Linear Algebra Subprograms. +* +* Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics +* and Computer Science Division, Argonne National Laboratory, +* 9700 South Cass Avenue, Argonne, Illinois 60439, US. +* +* Or +* +* NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms +* Group Ltd., NAG Central Office, 256 Banbury Road, Oxford +* OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st +* Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. +* +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + INTEGER NIN + PARAMETER ( NIN = 5 ) + INTEGER NSUBS + PARAMETER ( NSUBS = 16 ) + REAL ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 ) + INTEGER NMAX, INCMAX + PARAMETER ( NMAX = 65, INCMAX = 2 ) + INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX + PARAMETER ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7, + $ NALMAX = 7, NBEMAX = 7 ) +* .. Local Scalars .. + REAL EPS, ERR, THRESH + INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NINC, NKB, + $ NOUT, NTRA + LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, + $ TSTERR + CHARACTER*1 TRANS + CHARACTER*6 SNAMET + CHARACTER*32 SNAPS, SUMMRY +* .. Local Arrays .. 
+ REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), + $ ALF( NALMAX ), AS( NMAX*NMAX ), BET( NBEMAX ), + $ G( NMAX ), X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( 2*NMAX ) + INTEGER IDIM( NIDMAX ), INC( NINMAX ), KB( NKBMAX ) + LOGICAL LTEST( NSUBS ) + CHARACTER*6 SNAMES( NSUBS ) +* .. External Functions .. + REAL SDIFF + LOGICAL LSE + EXTERNAL SDIFF, LSE +* .. External Subroutines .. + EXTERNAL SCHK1, SCHK2, SCHK3, SCHK4, SCHK5, SCHK6, + $ SCHKE, SMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK + CHARACTER*6 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR + COMMON /SRNAMC/SRNAMT +* .. Data statements .. + DATA SNAMES/'SGEMV ', 'SGBMV ', 'SSYMV ', 'SSBMV ', + $ 'SSPMV ', 'STRMV ', 'STBMV ', 'STPMV ', + $ 'STRSV ', 'STBSV ', 'STPSV ', 'SGER ', + $ 'SSYR ', 'SSPR ', 'SSYR2 ', 'SSPR2 '/ +* .. Executable Statements .. +* +* Read name and unit number for summary output file and open file. +* + READ( NIN, FMT = * )SUMMRY + READ( NIN, FMT = * )NOUT + OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + NOUTC = NOUT +* +* Read name and unit number for snapshot output file and open file. +* + READ( NIN, FMT = * )SNAPS + READ( NIN, FMT = * )NTRA + TRACE = NTRA.GE.0 + IF( TRACE )THEN + OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + END IF +* Read the flag that directs rewinding of the snapshot file. + READ( NIN, FMT = * )REWI + REWI = REWI.AND.TRACE +* Read the flag that directs stopping on any failure. + READ( NIN, FMT = * )SFATAL +* Read the flag that indicates whether error exits are to be tested. + READ( NIN, FMT = * )TSTERR +* Read the threshold value of the test ratio + READ( NIN, FMT = * )THRESH +* +* Read and check the parameter values for the tests. +* +* Values of N + READ( NIN, FMT = * )NIDIM + IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN + WRITE( NOUT, FMT = 9997 )'N', NIDMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) + DO 10 I = 1, NIDIM + IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN + WRITE( NOUT, FMT = 9996 )NMAX + GO TO 230 + END IF + 10 CONTINUE +* Values of K + READ( NIN, FMT = * )NKB + IF( NKB.LT.1.OR.NKB.GT.NKBMAX )THEN + WRITE( NOUT, FMT = 9997 )'K', NKBMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( KB( I ), I = 1, NKB ) + DO 20 I = 1, NKB + IF( KB( I ).LT.0 )THEN + WRITE( NOUT, FMT = 9995 ) + GO TO 230 + END IF + 20 CONTINUE +* Values of INCX and INCY + READ( NIN, FMT = * )NINC + IF( NINC.LT.1.OR.NINC.GT.NINMAX )THEN + WRITE( NOUT, FMT = 9997 )'INCX AND INCY', NINMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( INC( I ), I = 1, NINC ) + DO 30 I = 1, NINC + IF( INC( I ).EQ.0.OR.ABS( INC( I ) ).GT.INCMAX )THEN + WRITE( NOUT, FMT = 9994 )INCMAX + GO TO 230 + END IF + 30 CONTINUE +* Values of ALPHA + READ( NIN, FMT = * )NALF + IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN + WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) +* Values of BETA + READ( NIN, FMT = * )NBET + IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN + WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) +* +* Report values of parameters. 
+* + WRITE( NOUT, FMT = 9993 ) + WRITE( NOUT, FMT = 9992 )( IDIM( I ), I = 1, NIDIM ) + WRITE( NOUT, FMT = 9991 )( KB( I ), I = 1, NKB ) + WRITE( NOUT, FMT = 9990 )( INC( I ), I = 1, NINC ) + WRITE( NOUT, FMT = 9989 )( ALF( I ), I = 1, NALF ) + WRITE( NOUT, FMT = 9988 )( BET( I ), I = 1, NBET ) + IF( .NOT.TSTERR )THEN + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9980 ) + END IF + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9999 )THRESH + WRITE( NOUT, FMT = * ) +* +* Read names of subroutines and flags which indicate +* whether they are to be tested. +* + DO 40 I = 1, NSUBS + LTEST( I ) = .FALSE. + 40 CONTINUE + 50 READ( NIN, FMT = 9984, END = 80 )SNAMET, LTESTT + DO 60 I = 1, NSUBS + IF( SNAMET.EQ.SNAMES( I ) ) + $ GO TO 70 + 60 CONTINUE + WRITE( NOUT, FMT = 9986 )SNAMET + STOP + 70 LTEST( I ) = LTESTT + GO TO 50 +* + 80 CONTINUE + CLOSE ( NIN ) +* +* Compute EPS (the machine precision). +* + EPS = ONE + 90 CONTINUE + IF( SDIFF( ONE + EPS, ONE ).EQ.ZERO ) + $ GO TO 100 + EPS = HALF*EPS + GO TO 90 + 100 CONTINUE + EPS = EPS + EPS + WRITE( NOUT, FMT = 9998 )EPS +* +* Check the reliability of SMVCH using exact data. +* + N = MIN( 32, NMAX ) + DO 120 J = 1, N + DO 110 I = 1, N + A( I, J ) = MAX( I - J + 1, 0 ) + 110 CONTINUE + X( J ) = J + Y( J ) = ZERO + 120 CONTINUE + DO 130 J = 1, N + YY( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 + 130 CONTINUE +* YY holds the exact result. On exit from SMVCH YT holds +* the result computed by SMVCH. + TRANS = 'N' + CALL SMVCH( TRANS, N, N, ONE, A, NMAX, X, 1, ZERO, Y, 1, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LSE( YY, YT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR + STOP + END IF + TRANS = 'T' + CALL SMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LSE( YY, YT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR + STOP + END IF +* +* Test each subroutine in turn. +* + DO 210 ISNUM = 1, NSUBS + WRITE( NOUT, FMT = * ) + IF( .NOT.LTEST( ISNUM ) )THEN +* Subprogram is not to be tested. + WRITE( NOUT, FMT = 9983 )SNAMES( ISNUM ) + ELSE + SRNAMT = SNAMES( ISNUM ) +* Test error exits. + IF( TSTERR )THEN + CALL SCHKE( ISNUM, SNAMES( ISNUM ), NOUT ) + WRITE( NOUT, FMT = * ) + END IF +* Test computations. + INFOT = 0 + OK = .TRUE. + FATAL = .FALSE. + GO TO ( 140, 140, 150, 150, 150, 160, 160, + $ 160, 160, 160, 160, 170, 180, 180, + $ 190, 190 )ISNUM +* Test SGEMV, 01, and SGBMV, 02. + 140 CALL SCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G ) + GO TO 200 +* Test SSYMV, 03, SSBMV, 04, and SSPMV, 05. + 150 CALL SCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G ) + GO TO 200 +* Test STRMV, 06, STBMV, 07, STPMV, 08, +* STRSV, 09, STBSV, 10, and STPSV, 11. + 160 CALL SCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z ) + GO TO 200 +* Test SGER, 12. + 170 CALL SCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z ) + GO TO 200 +* Test SSYR, 13, and SSPR, 14. 
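
Before testing anything, SBLAT2 estimates the relative machine precision by halving EPS until ONE + EPS compares equal to ONE (by way of the SDIFF function) and then doubling it back, and it sanity-checks SMVCH against a matrix of small integers whose product YY(J) has the closed form given above. A C sketch of the precision estimate only, not of the imported code itself:

    #include <stdio.h>

    /* Machine-precision estimate in the style of SBLAT2 above: halve eps
       until adding it to one no longer changes the stored sum, then undo
       the last halving.  The volatile store plays the role that calling
       SDIFF plays in the Fortran source.                                 */
    int main(void)
    {
        volatile float one_plus_eps;
        float eps = 1.0f;

        for (;;) {
            one_plus_eps = 1.0f + eps;
            if (one_plus_eps - 1.0f == 0.0f)
                break;
            eps *= 0.5f;
        }
        eps += eps;                   /* EPS = EPS + EPS in the source */
        printf("estimated eps = %g\n", eps);
        return 0;
    }
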
+ 180 CALL SCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z ) + GO TO 200 +* Test SSYR2, 15, and SSPR2, 16. + 190 CALL SCHK6( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z ) +* + 200 IF( FATAL.AND.SFATAL ) + $ GO TO 220 + END IF + 210 CONTINUE + WRITE( NOUT, FMT = 9982 ) + GO TO 240 +* + 220 CONTINUE + WRITE( NOUT, FMT = 9981 ) + GO TO 240 +* + 230 CONTINUE + WRITE( NOUT, FMT = 9987 ) +* + 240 CONTINUE + IF( TRACE ) + $ CLOSE ( NTRA ) + CLOSE ( NOUT ) + STOP +* + 9999 FORMAT( ' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', + $ 'S THAN', F8.2 ) + 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, E9.1 ) + 9997 FORMAT( ' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', + $ 'THAN ', I2 ) + 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) + 9995 FORMAT( ' VALUE OF K IS LESS THAN 0' ) + 9994 FORMAT( ' ABSOLUTE VALUE OF INCX OR INCY IS 0 OR GREATER THAN ', + $ I2 ) + 9993 FORMAT( ' TESTS OF THE REAL LEVEL 2 BLAS', //' THE F', + $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) + 9992 FORMAT( ' FOR N ', 9I6 ) + 9991 FORMAT( ' FOR K ', 7I6 ) + 9990 FORMAT( ' FOR INCX AND INCY ', 7I6 ) + 9989 FORMAT( ' FOR ALPHA ', 7F6.1 ) + 9988 FORMAT( ' FOR BETA ', 7F6.1 ) + 9987 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', + $ /' ******* TESTS ABANDONED *******' ) + 9986 FORMAT( ' SUBPROGRAM NAME ', A6, ' NOT RECOGNIZED', /' ******* T', + $ 'ESTS ABANDONED *******' ) + 9985 FORMAT( ' ERROR IN SMVCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', + $ 'ATED WRONGLY.', /' SMVCH WAS CALLED WITH TRANS = ', A1, + $ ' AND RETURNED SAME = ', L1, ' AND ERR = ', F12.3, '.', / + $ ' THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.' + $ , /' ******* TESTS ABANDONED *******' ) + 9984 FORMAT( A6, L2 ) + 9983 FORMAT( 1X, A6, ' WAS NOT TESTED' ) + 9982 FORMAT( /' END OF TESTS' ) + 9981 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) + 9980 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) +* +* End of SBLAT2. +* + END + SUBROUTINE SCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, + $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, + $ XS, Y, YY, YS, YT, G ) +* +* Tests SGEMV and SGBMV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + REAL ZERO, HALF + PARAMETER ( ZERO = 0.0, HALF = 0.5 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, + $ NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), BET( NBET ), G( NMAX ), + $ X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + REAL ALPHA, ALS, BETA, BLS, ERR, ERRMAX, TRANSL + INTEGER I, IA, IB, IC, IKU, IM, IN, INCX, INCXS, INCY, + $ INCYS, IX, IY, KL, KLS, KU, KUS, LAA, LDA, + $ LDAS, LX, LY, M, ML, MS, N, NARGS, NC, ND, NK, + $ NL, NS + LOGICAL BANDED, FULL, NULL, RESET, SAME, TRAN + CHARACTER*1 TRANS, TRANSS + CHARACTER*3 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. 
External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. + EXTERNAL SGBMV, SGEMV, SMAKE, SMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICH/'NTC'/ +* .. Executable Statements .. + FULL = SNAME( 3: 3 ).EQ.'E' + BANDED = SNAME( 3: 3 ).EQ.'B' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 11 + ELSE IF( BANDED )THEN + NARGS = 13 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 120 IN = 1, NIDIM + N = IDIM( IN ) + ND = N/2 + 1 +* + DO 110 IM = 1, 2 + IF( IM.EQ.1 ) + $ M = MAX( N - ND, 0 ) + IF( IM.EQ.2 ) + $ M = MIN( N + ND, NMAX ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IKU = 1, NK + IF( BANDED )THEN + KU = KB( IKU ) + KL = MAX( KU - 1, 0 ) + ELSE + KU = N - 1 + KL = M - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = KL + KU + 1 + ELSE + LDA = M + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + LAA = LDA*N + NULL = N.LE.0.OR.M.LE.0 +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL SMAKE( SNAME( 2: 3 ), ' ', ' ', M, N, A, NMAX, AA, + $ LDA, KL, KU, RESET, TRANSL ) +* + DO 90 IC = 1, 3 + TRANS = ICH( IC: IC ) + TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' +* + IF( TRAN )THEN + ML = N + NL = M + ELSE + ML = M + NL = N + END IF +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*NL +* +* Generate the vector X. +* + TRANSL = HALF + CALL SMAKE( 'GE', ' ', ' ', 1, NL, X, 1, XX, + $ ABS( INCX ), 0, NL - 1, RESET, TRANSL ) + IF( NL.GT.1 )THEN + X( NL/2 ) = ZERO + XX( 1 + ABS( INCX )*( NL/2 - 1 ) ) = ZERO + END IF +* + DO 70 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*ML +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL SMAKE( 'GE', ' ', ' ', 1, ML, Y, 1, + $ YY, ABS( INCY ), 0, ML - 1, + $ RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + TRANSS = TRANS + MS = M + NS = N + KLS = KL + KUS = KU + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + BLS = BETA + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ TRANS, M, N, ALPHA, LDA, INCX, BETA, + $ INCY + IF( REWI ) + $ REWIND NTRA + CALL SGEMV( TRANS, M, N, ALPHA, AA, + $ LDA, XX, INCX, BETA, YY, + $ INCY ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ TRANS, M, N, KL, KU, ALPHA, LDA, + $ INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL SGBMV( TRANS, M, N, KL, KU, ALPHA, + $ AA, LDA, XX, INCX, BETA, + $ YY, INCY ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 130 + END IF +* +* See what data changed inside subroutines. 
+* + ISAME( 1 ) = TRANS.EQ.TRANSS + ISAME( 2 ) = MS.EQ.M + ISAME( 3 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 4 ) = ALS.EQ.ALPHA + ISAME( 5 ) = LSE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + ISAME( 7 ) = LSE( XS, XX, LX ) + ISAME( 8 ) = INCXS.EQ.INCX + ISAME( 9 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 10 ) = LSE( YS, YY, LY ) + ELSE + ISAME( 10 ) = LSERES( 'GE', ' ', 1, + $ ML, YS, YY, + $ ABS( INCY ) ) + END IF + ISAME( 11 ) = INCYS.EQ.INCY + ELSE IF( BANDED )THEN + ISAME( 4 ) = KLS.EQ.KL + ISAME( 5 ) = KUS.EQ.KU + ISAME( 6 ) = ALS.EQ.ALPHA + ISAME( 7 ) = LSE( AS, AA, LAA ) + ISAME( 8 ) = LDAS.EQ.LDA + ISAME( 9 ) = LSE( XS, XX, LX ) + ISAME( 10 ) = INCXS.EQ.INCX + ISAME( 11 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 12 ) = LSE( YS, YY, LY ) + ELSE + ISAME( 12 ) = LSERES( 'GE', ' ', 1, + $ ML, YS, YY, + $ ABS( INCY ) ) + END IF + ISAME( 13 ) = INCYS.EQ.INCY + END IF +* +* If data was incorrectly changed, report +* and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 130 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL SMVCH( TRANS, M, N, ALPHA, A, + $ NMAX, X, INCX, BETA, Y, + $ INCY, YT, G, YY, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 130 + ELSE +* Avoid repeating tests with M.le.0 or +* N.le.0. + GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 140 +* + 130 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, TRANS, M, N, ALPHA, LDA, + $ INCX, BETA, INCY + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, TRANS, M, N, KL, KU, + $ ALPHA, LDA, INCX, BETA, INCY + END IF +* + 140 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', 4( I3, ',' ), F4.1, + $ ', A,', I3, ', X,', I2, ',', F4.1, ', Y,', I2, ') .' ) + 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', 2( I3, ',' ), F4.1, + $ ', A,', I3, ', X,', I2, ',', F4.1, ', Y,', I2, + $ ') .' ) + 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK1. +* + END + SUBROUTINE SCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, + $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, + $ XS, Y, YY, YS, YT, G ) +* +* Tests SSYMV, SSBMV and SSPMV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + REAL ZERO, HALF + PARAMETER ( ZERO = 0.0, HALF = 0.5 ) +* .. Scalar Arguments .. 
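
SCHK1 copies every scalar and array argument before the call under test (AS, XS, YS, ...) and afterwards uses LSE/LSERES to confirm that nothing changed except the positions of Y the operation is defined to update. The same discipline reduced to C, with the illustrative names lse_like and scale_copy standing in for LSE and for a routine under test:

    #include <stdio.h>
    #include <string.h>

    /* LSE compares two arrays element by element; SCHK1 uses it to verify
       that arguments saved before the call still match the live arrays,
       i.e. the routine did not touch its read-only inputs.               */
    static int lse_like(const double *ri, const double *rj, int lr)
    {
        for (int i = 0; i < lr; i++)
            if (ri[i] != rj[i])
                return 0;
        return 1;
    }

    /* Hypothetical routine under test: reads x, writes only y. */
    static void scale_copy(int n, double alpha, const double *x, double *y)
    {
        for (int i = 0; i < n; i++)
            y[i] = alpha * x[i];
    }

    int main(void)
    {
        double x[4] = { 1, 2, 3, 4 }, y[4], xs[4];

        memcpy(xs, x, sizeof x);        /* save every input datum */
        scale_copy(4, 2.0, x, y);       /* call the routine       */
        printf("input preserved: %s\n", lse_like(xs, x, 4) ? "yes" : "no");
        return 0;
    }
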
+ REAL EPS, THRESH + INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, + $ NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), BET( NBET ), G( NMAX ), + $ X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + REAL ALPHA, ALS, BETA, BLS, ERR, ERRMAX, TRANSL + INTEGER I, IA, IB, IC, IK, IN, INCX, INCXS, INCY, + $ INCYS, IX, IY, K, KS, LAA, LDA, LDAS, LX, LY, + $ N, NARGS, NC, NK, NS + LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME + CHARACTER*1 UPLO, UPLOS + CHARACTER*2 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. + EXTERNAL SMAKE, SMVCH, SSBMV, SSPMV, SSYMV +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 3: 3 ).EQ.'Y' + BANDED = SNAME( 3: 3 ).EQ.'B' + PACKED = SNAME( 3: 3 ).EQ.'P' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 10 + ELSE IF( BANDED )THEN + NARGS = 11 + ELSE IF( PACKED )THEN + NARGS = 9 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 110 IN = 1, NIDIM + N = IDIM( IN ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IK = 1, NK + IF( BANDED )THEN + K = KB( IK ) + ELSE + K = N - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = K + 1 + ELSE + LDA = N + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF + NULL = N.LE.0 +* + DO 90 IC = 1, 2 + UPLO = ICH( IC: IC ) +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL SMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, A, NMAX, AA, + $ LDA, K, K, RESET, TRANSL ) +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL SMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, + $ ABS( INCX ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 70 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL SMAKE( 'GE', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, + $ TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + UPLOS = UPLO + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + BLS = BETA + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. 
+* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ UPLO, N, ALPHA, LDA, INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL SSYMV( UPLO, N, ALPHA, AA, LDA, XX, + $ INCX, BETA, YY, INCY ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ UPLO, N, K, ALPHA, LDA, INCX, BETA, + $ INCY + IF( REWI ) + $ REWIND NTRA + CALL SSBMV( UPLO, N, K, ALPHA, AA, LDA, + $ XX, INCX, BETA, YY, INCY ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ UPLO, N, ALPHA, INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL SSPMV( UPLO, N, ALPHA, AA, XX, INCX, + $ BETA, YY, INCY ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LSE( AS, AA, LAA ) + ISAME( 5 ) = LDAS.EQ.LDA + ISAME( 6 ) = LSE( XS, XX, LX ) + ISAME( 7 ) = INCXS.EQ.INCX + ISAME( 8 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 9 ) = LSE( YS, YY, LY ) + ELSE + ISAME( 9 ) = LSERES( 'GE', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 10 ) = INCYS.EQ.INCY + ELSE IF( BANDED )THEN + ISAME( 3 ) = KS.EQ.K + ISAME( 4 ) = ALS.EQ.ALPHA + ISAME( 5 ) = LSE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + ISAME( 7 ) = LSE( XS, XX, LX ) + ISAME( 8 ) = INCXS.EQ.INCX + ISAME( 9 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 10 ) = LSE( YS, YY, LY ) + ELSE + ISAME( 10 ) = LSERES( 'GE', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 11 ) = INCYS.EQ.INCY + ELSE IF( PACKED )THEN + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LSE( AS, AA, LAA ) + ISAME( 5 ) = LSE( XS, XX, LX ) + ISAME( 6 ) = INCXS.EQ.INCX + ISAME( 7 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 8 ) = LSE( YS, YY, LY ) + ELSE + ISAME( 8 ) = LSERES( 'GE', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 9 ) = INCYS.EQ.INCY + END IF +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL SMVCH( 'N', N, N, ALPHA, A, NMAX, X, + $ INCX, BETA, Y, INCY, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 120 + ELSE +* Avoid repeating tests with N.le.0 + GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, N, ALPHA, LDA, INCX, + $ BETA, INCY + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, N, K, ALPHA, LDA, + $ INCX, BETA, INCY + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, UPLO, N, ALPHA, INCX, + $ BETA, INCY + END IF +* + 130 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', AP', + $ ', X,', I2, ',', F4.1, ', Y,', I2, ') .' ) + 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', 2( I3, ',' ), F4.1, + $ ', A,', I3, ', X,', I2, ',', F4.1, ', Y,', I2, + $ ') .' ) + 9993 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', A,', + $ I3, ', X,', I2, ',', F4.1, ', Y,', I2, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK2. +* + END + SUBROUTINE SCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, XT, G, Z ) +* +* Tests STRMV, STBMV, STPMV, STRSV, STBSV and STPSV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + REAL ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NIDIM, NINC, NKB, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), + $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), + $ XS( NMAX*INCMAX ), XT( NMAX ), + $ XX( NMAX*INCMAX ), Z( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + REAL ERR, ERRMAX, TRANSL + INTEGER I, ICD, ICT, ICU, IK, IN, INCX, INCXS, IX, K, + $ KS, LAA, LDA, LDAS, LX, N, NARGS, NC, NK, NS + LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME + CHARACTER*1 DIAG, DIAGS, TRANS, TRANSS, UPLO, UPLOS + CHARACTER*2 ICHD, ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. + EXTERNAL SMAKE, SMVCH, STBMV, STBSV, STPMV, STPSV, + $ STRMV, STRSV +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/ +* .. Executable Statements .. + FULL = SNAME( 3: 3 ).EQ.'R' + BANDED = SNAME( 3: 3 ).EQ.'B' + PACKED = SNAME( 3: 3 ).EQ.'P' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 8 + ELSE IF( BANDED )THEN + NARGS = 9 + ELSE IF( PACKED )THEN + NARGS = 7 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* Set up zero vector for SMVCH. 
+ DO 10 I = 1, NMAX + Z( I ) = ZERO + 10 CONTINUE +* + DO 110 IN = 1, NIDIM + N = IDIM( IN ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IK = 1, NK + IF( BANDED )THEN + K = KB( IK ) + ELSE + K = N - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = K + 1 + ELSE + LDA = N + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF + NULL = N.LE.0 +* + DO 90 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* + DO 80 ICT = 1, 3 + TRANS = ICHT( ICT: ICT ) +* + DO 70 ICD = 1, 2 + DIAG = ICHD( ICD: ICD ) +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL SMAKE( SNAME( 2: 3 ), UPLO, DIAG, N, N, A, + $ NMAX, AA, LDA, K, K, RESET, TRANSL ) +* + DO 60 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL SMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, + $ ABS( INCX ), 0, N - 1, RESET, + $ TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + DIAGS = DIAG + NS = N + KS = K + DO 20 I = 1, LAA + AS( I ) = AA( I ) + 20 CONTINUE + LDAS = LDA + DO 30 I = 1, LX + XS( I ) = XX( I ) + 30 CONTINUE + INCXS = INCX +* +* Call the subroutine. +* + IF( SNAME( 4: 5 ).EQ.'MV' )THEN + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ UPLO, TRANS, DIAG, N, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL STRMV( UPLO, TRANS, DIAG, N, AA, LDA, + $ XX, INCX ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ UPLO, TRANS, DIAG, N, K, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL STBMV( UPLO, TRANS, DIAG, N, K, AA, + $ LDA, XX, INCX ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ UPLO, TRANS, DIAG, N, INCX + IF( REWI ) + $ REWIND NTRA + CALL STPMV( UPLO, TRANS, DIAG, N, AA, XX, + $ INCX ) + END IF + ELSE IF( SNAME( 4: 5 ).EQ.'SV' )THEN + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ UPLO, TRANS, DIAG, N, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL STRSV( UPLO, TRANS, DIAG, N, AA, LDA, + $ XX, INCX ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ UPLO, TRANS, DIAG, N, K, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL STBSV( UPLO, TRANS, DIAG, N, K, AA, + $ LDA, XX, INCX ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ UPLO, TRANS, DIAG, N, INCX + IF( REWI ) + $ REWIND NTRA + CALL STPSV( UPLO, TRANS, DIAG, N, AA, XX, + $ INCX ) + END IF + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. 
+* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = TRANS.EQ.TRANSS + ISAME( 3 ) = DIAG.EQ.DIAGS + ISAME( 4 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 5 ) = LSE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 7 ) = LSE( XS, XX, LX ) + ELSE + ISAME( 7 ) = LSERES( 'GE', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 8 ) = INCXS.EQ.INCX + ELSE IF( BANDED )THEN + ISAME( 5 ) = KS.EQ.K + ISAME( 6 ) = LSE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 8 ) = LSE( XS, XX, LX ) + ELSE + ISAME( 8 ) = LSERES( 'GE', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 9 ) = INCXS.EQ.INCX + ELSE IF( PACKED )THEN + ISAME( 5 ) = LSE( AS, AA, LAA ) + IF( NULL )THEN + ISAME( 6 ) = LSE( XS, XX, LX ) + ELSE + ISAME( 6 ) = LSERES( 'GE', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 7 ) = INCXS.EQ.INCX + END IF +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN + IF( SNAME( 4: 5 ).EQ.'MV' )THEN +* +* Check the result. +* + CALL SMVCH( TRANS, N, N, ONE, A, NMAX, X, + $ INCX, ZERO, Z, INCX, XT, G, + $ XX, EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + ELSE IF( SNAME( 4: 5 ).EQ.'SV' )THEN +* +* Compute approximation to original vector. +* + DO 50 I = 1, N + Z( I ) = XX( 1 + ( I - 1 )* + $ ABS( INCX ) ) + XX( 1 + ( I - 1 )*ABS( INCX ) ) + $ = X( I ) + 50 CONTINUE + CALL SMVCH( TRANS, N, N, ONE, A, NMAX, Z, + $ INCX, ZERO, X, INCX, XT, G, + $ XX, EPS, ERR, FATAL, NOUT, + $ .FALSE. ) + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 120 + ELSE +* Avoid repeating tests with N.le.0. + GO TO 110 + END IF +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, TRANS, DIAG, N, LDA, + $ INCX + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, DIAG, N, K, + $ LDA, INCX + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, UPLO, TRANS, DIAG, N, INCX + END IF +* + 130 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A6, '(', 3( '''', A1, ''',' ), I3, ', AP, ', + $ 'X,', I2, ') .' ) + 9994 FORMAT( 1X, I6, ': ', A6, '(', 3( '''', A1, ''',' ), 2( I3, ',' ), + $ ' A,', I3, ', X,', I2, ') .' ) + 9993 FORMAT( 1X, I6, ': ', A6, '(', 3( '''', A1, ''',' ), I3, ', A,', + $ I3, ', X,', I2, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK3. +* + END + SUBROUTINE SCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z ) +* +* Tests SGER. 
+* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + REAL ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), + $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), + $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + REAL ALPHA, ALS, ERR, ERRMAX, TRANSL + INTEGER I, IA, IM, IN, INCX, INCXS, INCY, INCYS, IX, + $ IY, J, LAA, LDA, LDAS, LX, LY, M, MS, N, NARGS, + $ NC, ND, NS + LOGICAL NULL, RESET, SAME +* .. Local Arrays .. + REAL W( 1 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. + EXTERNAL SGER, SMAKE, SMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Executable Statements .. +* Define the number of arguments. + NARGS = 9 +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 120 IN = 1, NIDIM + N = IDIM( IN ) + ND = N/2 + 1 +* + DO 110 IM = 1, 2 + IF( IM.EQ.1 ) + $ M = MAX( N - ND, 0 ) + IF( IM.EQ.2 ) + $ M = MIN( N + ND, NMAX ) +* +* Set LDA to 1 more than minimum value if room. + LDA = M + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 110 + LAA = LDA*N + NULL = N.LE.0.OR.M.LE.0 +* + DO 100 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*M +* +* Generate the vector X. +* + TRANSL = HALF + CALL SMAKE( 'GE', ' ', ' ', 1, M, X, 1, XX, ABS( INCX ), + $ 0, M - 1, RESET, TRANSL ) + IF( M.GT.1 )THEN + X( M/2 ) = ZERO + XX( 1 + ABS( INCX )*( M/2 - 1 ) ) = ZERO + END IF +* + DO 90 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL SMAKE( 'GE', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + Y( N/2 ) = ZERO + YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 80 IA = 1, NALF + ALPHA = ALF( IA ) +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL SMAKE( SNAME( 2: 3 ), ' ', ' ', M, N, A, NMAX, + $ AA, LDA, M - 1, N - 1, RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + MS = M + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, M, N, + $ ALPHA, INCX, INCY, LDA + IF( REWI ) + $ REWIND NTRA + CALL SGER( M, N, ALPHA, XX, INCX, YY, INCY, AA, + $ LDA ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 140 + END IF +* +* See what data changed inside subroutine. 
+* + ISAME( 1 ) = MS.EQ.M + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LSE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + ISAME( 6 ) = LSE( YS, YY, LY ) + ISAME( 7 ) = INCYS.EQ.INCY + IF( NULL )THEN + ISAME( 8 ) = LSE( AS, AA, LAA ) + ELSE + ISAME( 8 ) = LSERES( 'GE', ' ', M, N, AS, AA, + $ LDA ) + END IF + ISAME( 9 ) = LDAS.EQ.LDA +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 140 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( INCX.GT.0 )THEN + DO 50 I = 1, M + Z( I ) = X( I ) + 50 CONTINUE + ELSE + DO 60 I = 1, M + Z( I ) = X( M - I + 1 ) + 60 CONTINUE + END IF + DO 70 J = 1, N + IF( INCY.GT.0 )THEN + W( 1 ) = Y( J ) + ELSE + W( 1 ) = Y( N - J + 1 ) + END IF + CALL SMVCH( 'N', M, 1, ALPHA, Z, NMAX, W, 1, + $ ONE, A( 1, J ), 1, YT, G, + $ AA( 1 + ( J - 1 )*LDA ), EPS, + $ ERR, FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 130 + 70 CONTINUE + ELSE +* Avoid repeating tests with M.le.0 or N.le.0. + GO TO 110 + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 150 +* + 130 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 140 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9994 )NC, SNAME, M, N, ALPHA, INCX, INCY, LDA +* + 150 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A6, '(', 2( I3, ',' ), F4.1, ', X,', I2, + $ ', Y,', I2, ', A,', I3, ') .' ) + 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK4. +* + END + SUBROUTINE SCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z ) +* +* Tests SSYR and SSPR. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + REAL ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), + $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), + $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. 
+ REAL ALPHA, ALS, ERR, ERRMAX, TRANSL + INTEGER I, IA, IC, IN, INCX, INCXS, IX, J, JA, JJ, LAA, + $ LDA, LDAS, LJ, LX, N, NARGS, NC, NS + LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER + CHARACTER*1 UPLO, UPLOS + CHARACTER*2 ICH +* .. Local Arrays .. + REAL W( 1 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. + EXTERNAL SMAKE, SMVCH, SSPR, SSYR +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 3: 3 ).EQ.'Y' + PACKED = SNAME( 3: 3 ).EQ.'P' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 7 + ELSE IF( PACKED )THEN + NARGS = 6 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDA to 1 more than minimum value if room. + LDA = N + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF +* + DO 90 IC = 1, 2 + UPLO = ICH( IC: IC ) + UPPER = UPLO.EQ.'U' +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL SMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), + $ 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 70 IA = 1, NALF + ALPHA = ALF( IA ) + NULL = N.LE.0.OR.ALPHA.EQ.ZERO +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL SMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, A, NMAX, + $ AA, LDA, N - 1, N - 1, RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO, N, + $ ALPHA, INCX, LDA + IF( REWI ) + $ REWIND NTRA + CALL SSYR( UPLO, N, ALPHA, XX, INCX, AA, LDA ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, N, + $ ALPHA, INCX + IF( REWI ) + $ REWIND NTRA + CALL SSPR( UPLO, N, ALPHA, XX, INCX, AA ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LSE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + IF( NULL )THEN + ISAME( 6 ) = LSE( AS, AA, LAA ) + ELSE + ISAME( 6 ) = LSERES( SNAME( 2: 3 ), UPLO, N, N, AS, + $ AA, LDA ) + END IF + IF( .NOT.PACKED )THEN + ISAME( 7 ) = LDAS.EQ.LDA + END IF +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 30 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 30 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. 
+* + IF( INCX.GT.0 )THEN + DO 40 I = 1, N + Z( I ) = X( I ) + 40 CONTINUE + ELSE + DO 50 I = 1, N + Z( I ) = X( N - I + 1 ) + 50 CONTINUE + END IF + JA = 1 + DO 60 J = 1, N + W( 1 ) = Z( J ) + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + CALL SMVCH( 'N', LJ, 1, ALPHA, Z( JJ ), LJ, W, + $ 1, ONE, A( JJ, J ), 1, YT, G, + $ AA( JA ), EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + IF( FULL )THEN + IF( UPPER )THEN + JA = JA + LDA + ELSE + JA = JA + LDA + 1 + END IF + ELSE + JA = JA + LJ + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 110 + 60 CONTINUE + ELSE +* Avoid repeating tests if N.le.0. + IF( N.LE.0 ) + $ GO TO 100 + END IF +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 110 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, N, ALPHA, INCX, LDA + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, N, ALPHA, INCX + END IF +* + 130 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', X,', + $ I2, ', AP) .' ) + 9993 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', X,', + $ I2, ', A,', I3, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK5. +* + END + SUBROUTINE SCHK6( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z ) +* +* Tests SSYR2 and SSPR2. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + REAL ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), G( NMAX ), X( NMAX ), + $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), + $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX, 2 ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + REAL ALPHA, ALS, ERR, ERRMAX, TRANSL + INTEGER I, IA, IC, IN, INCX, INCXS, INCY, INCYS, IX, + $ IY, J, JA, JJ, LAA, LDA, LDAS, LJ, LX, LY, N, + $ NARGS, NC, NS + LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER + CHARACTER*1 UPLO, UPLOS + CHARACTER*2 ICH +* .. Local Arrays .. + REAL W( 2 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. + EXTERNAL SMAKE, SMVCH, SSPR2, SSYR2 +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. 
Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 3: 3 ).EQ.'Y' + PACKED = SNAME( 3: 3 ).EQ.'P' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 9 + ELSE IF( PACKED )THEN + NARGS = 8 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 140 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDA to 1 more than minimum value if room. + LDA = N + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 140 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF +* + DO 130 IC = 1, 2 + UPLO = ICH( IC: IC ) + UPPER = UPLO.EQ.'U' +* + DO 120 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL SMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), + $ 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 110 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL SMAKE( 'GE', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + Y( N/2 ) = ZERO + YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 100 IA = 1, NALF + ALPHA = ALF( IA ) + NULL = N.LE.0.OR.ALPHA.EQ.ZERO +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL SMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, A, + $ NMAX, AA, LDA, N - 1, N - 1, RESET, + $ TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO, N, + $ ALPHA, INCX, INCY, LDA + IF( REWI ) + $ REWIND NTRA + CALL SSYR2( UPLO, N, ALPHA, XX, INCX, YY, INCY, + $ AA, LDA ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, N, + $ ALPHA, INCX, INCY + IF( REWI ) + $ REWIND NTRA + CALL SSPR2( UPLO, N, ALPHA, XX, INCX, YY, INCY, + $ AA ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 160 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LSE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + ISAME( 6 ) = LSE( YS, YY, LY ) + ISAME( 7 ) = INCYS.EQ.INCY + IF( NULL )THEN + ISAME( 8 ) = LSE( AS, AA, LAA ) + ELSE + ISAME( 8 ) = LSERES( SNAME( 2: 3 ), UPLO, N, N, + $ AS, AA, LDA ) + END IF + IF( .NOT.PACKED )THEN + ISAME( 9 ) = LDAS.EQ.LDA + END IF +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 160 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. 
+* + IF( INCX.GT.0 )THEN + DO 50 I = 1, N + Z( I, 1 ) = X( I ) + 50 CONTINUE + ELSE + DO 60 I = 1, N + Z( I, 1 ) = X( N - I + 1 ) + 60 CONTINUE + END IF + IF( INCY.GT.0 )THEN + DO 70 I = 1, N + Z( I, 2 ) = Y( I ) + 70 CONTINUE + ELSE + DO 80 I = 1, N + Z( I, 2 ) = Y( N - I + 1 ) + 80 CONTINUE + END IF + JA = 1 + DO 90 J = 1, N + W( 1 ) = Z( J, 2 ) + W( 2 ) = Z( J, 1 ) + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + CALL SMVCH( 'N', LJ, 2, ALPHA, Z( JJ, 1 ), + $ NMAX, W, 1, ONE, A( JJ, J ), 1, + $ YT, G, AA( JA ), EPS, ERR, FATAL, + $ NOUT, .TRUE. ) + IF( FULL )THEN + IF( UPPER )THEN + JA = JA + LDA + ELSE + JA = JA + LDA + 1 + END IF + ELSE + JA = JA + LJ + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 150 + 90 CONTINUE + ELSE +* Avoid repeating tests with N.le.0. + IF( N.LE.0 ) + $ GO TO 140 + END IF +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* + 140 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 170 +* + 150 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 160 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, N, ALPHA, INCX, + $ INCY, LDA + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, N, ALPHA, INCX, INCY + END IF +* + 170 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', X,', + $ I2, ', Y,', I2, ', AP) .' ) + 9993 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', X,', + $ I2, ', Y,', I2, ', A,', I3, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK6. +* + END + SUBROUTINE SCHKE( ISNUM, SRNAMT, NOUT ) +* +* Tests the error exits from the Level 2 Blas. +* Requires a special version of the error-handling routine XERBLA. +* ALPHA, BETA, A, X and Y should not need to be defined. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER ISNUM, NOUT + CHARACTER*6 SRNAMT +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Local Scalars .. + REAL ALPHA, BETA +* .. Local Arrays .. + REAL A( 1, 1 ), X( 1 ), Y( 1 ) +* .. External Subroutines .. + EXTERNAL CHKXER, SGBMV, SGEMV, SGER, SSBMV, SSPMV, SSPR, + $ SSPR2, SSYMV, SSYR, SSYR2, STBMV, STBSV, STPMV, + $ STPSV, STRMV, STRSV +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Executable Statements .. +* OK is set to .FALSE. by the special version of XERBLA or by CHKXER +* if anything is wrong. + OK = .TRUE. +* LERR is set to .TRUE. by the special version of XERBLA each time +* it is called, and is then tested and re-set by CHKXER. + LERR = .FALSE. 
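The error-exit tests dispatched below all follow the handshake described in the two comments just above: before each deliberately invalid call, INFOT is set to the position of the offending argument and SRNAMT to the routine name; the special version of XERBLA records that it was invoked (LERR) and compares the reported position and name against INFOT and SRNAMT, clearing OK on any mismatch; CHKXER then confirms the handler actually fired and resets LERR. A minimal C sketch of the same handshake, using hypothetical names (demo_routine, xerbla_like, chkxer) rather than anything from this test suite:

/*
 * Sketch of the INFOT / LERR / OK error-exit protocol.  All names here
 * are hypothetical; the routine under test is assumed to report the
 * 1-based position of the first bad argument to its error handler.
 */
#include <stdio.h>
#include <string.h>

static int  infot;       /* expected bad-parameter position            */
static int  lerr;        /* set by the stub each time it is called     */
static int  ok = 1;      /* cleared on any mismatch or missed error    */
static char srnamt[7];   /* name of the routine being exercised        */

/* Stub playing the role of the special XERBLA above. */
static void xerbla_like(const char *name, int info)
{
    lerr = 1;
    if (info != infot || strcmp(name, srnamt) != 0) {
        printf("handler called with (%s,%d), expected (%s,%d)\n",
               name, info, srnamt, infot);
        ok = 0;
    }
}

/* Plays the role of CHKXER: the handler must have fired at least once. */
static void chkxer(void)
{
    if (!lerr) {
        printf("parameter %d of %s was not flagged\n", infot, srnamt);
        ok = 0;
    }
    lerr = 0;
}

/* Hypothetical routine under test: rejects n < 0 as its 2nd argument. */
static void demo_routine(char trans, int n)
{
    (void)trans;
    if (n < 0)
        xerbla_like("DEMO  ", 2);
}

int main(void)
{
    strcpy(srnamt, "DEMO  ");
    infot = 2;
    demo_routine('N', -1);   /* deliberately invalid argument */
    chkxer();
    puts(ok ? "error exits OK" : "error exits FAILED");
    return ok ? 0 : 1;
}

The computed GO TO that follows applies exactly this pattern to each Level 2 routine in turn, once per checkable argument.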
+ GO TO ( 10, 20, 30, 40, 50, 60, 70, 80, + $ 90, 100, 110, 120, 130, 140, 150, + $ 160 )ISNUM + 10 INFOT = 1 + CALL SGEMV( '/', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL SGEMV( 'N', -1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SGEMV( 'N', 0, -1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL SGEMV( 'N', 2, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL SGEMV( 'N', 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL SGEMV( 'N', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 20 INFOT = 1 + CALL SGBMV( '/', 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL SGBMV( 'N', -1, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SGBMV( 'N', 0, -1, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL SGBMV( 'N', 0, 0, -1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL SGBMV( 'N', 2, 0, 0, -1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL SGBMV( 'N', 0, 0, 1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL SGBMV( 'N', 0, 0, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL SGBMV( 'N', 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 30 INFOT = 1 + CALL SSYMV( '/', 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL SSYMV( 'U', -1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL SSYMV( 'U', 2, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL SSYMV( 'U', 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL SSYMV( 'U', 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 40 INFOT = 1 + CALL SSBMV( '/', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL SSBMV( 'U', -1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SSBMV( 'U', 0, -1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL SSBMV( 'U', 0, 1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL SSBMV( 'U', 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL SSBMV( 'U', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 50 INFOT = 1 + CALL SSPMV( '/', 0, ALPHA, A, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL SSPMV( 'U', -1, ALPHA, A, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL SSPMV( 'U', 0, ALPHA, A, X, 0, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL SSPMV( 'U', 0, ALPHA, A, X, 1, BETA, Y, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 
170 + 60 INFOT = 1 + CALL STRMV( '/', 'N', 'N', 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL STRMV( 'U', '/', 'N', 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL STRMV( 'U', 'N', '/', 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL STRMV( 'U', 'N', 'N', -1, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL STRMV( 'U', 'N', 'N', 2, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL STRMV( 'U', 'N', 'N', 0, A, 1, X, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 70 INFOT = 1 + CALL STBMV( '/', 'N', 'N', 0, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL STBMV( 'U', '/', 'N', 0, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL STBMV( 'U', 'N', '/', 0, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL STBMV( 'U', 'N', 'N', -1, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL STBMV( 'U', 'N', 'N', 0, -1, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL STBMV( 'U', 'N', 'N', 0, 1, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL STBMV( 'U', 'N', 'N', 0, 0, A, 1, X, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 80 INFOT = 1 + CALL STPMV( '/', 'N', 'N', 0, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL STPMV( 'U', '/', 'N', 0, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL STPMV( 'U', 'N', '/', 0, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL STPMV( 'U', 'N', 'N', -1, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL STPMV( 'U', 'N', 'N', 0, A, X, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 90 INFOT = 1 + CALL STRSV( '/', 'N', 'N', 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL STRSV( 'U', '/', 'N', 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL STRSV( 'U', 'N', '/', 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL STRSV( 'U', 'N', 'N', -1, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL STRSV( 'U', 'N', 'N', 2, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL STRSV( 'U', 'N', 'N', 0, A, 1, X, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 100 INFOT = 1 + CALL STBSV( '/', 'N', 'N', 0, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL STBSV( 'U', '/', 'N', 0, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL STBSV( 'U', 'N', '/', 0, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL STBSV( 'U', 'N', 'N', -1, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL STBSV( 'U', 'N', 'N', 0, -1, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL STBSV( 'U', 'N', 'N', 0, 1, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL STBSV( 'U', 'N', 'N', 0, 0, A, 1, X, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 110 INFOT = 1 + CALL STPSV( '/', 'N', 'N', 0, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL STPSV( 'U', '/', 'N', 0, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 
3 + CALL STPSV( 'U', 'N', '/', 0, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL STPSV( 'U', 'N', 'N', -1, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL STPSV( 'U', 'N', 'N', 0, A, X, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 120 INFOT = 1 + CALL SGER( -1, 0, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL SGER( 0, -1, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL SGER( 0, 0, ALPHA, X, 0, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL SGER( 0, 0, ALPHA, X, 1, Y, 0, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL SGER( 2, 0, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 130 INFOT = 1 + CALL SSYR( '/', 0, ALPHA, X, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL SSYR( 'U', -1, ALPHA, X, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL SSYR( 'U', 0, ALPHA, X, 0, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL SSYR( 'U', 2, ALPHA, X, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 140 INFOT = 1 + CALL SSPR( '/', 0, ALPHA, X, 1, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL SSPR( 'U', -1, ALPHA, X, 1, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL SSPR( 'U', 0, ALPHA, X, 0, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 150 INFOT = 1 + CALL SSYR2( '/', 0, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL SSYR2( 'U', -1, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL SSYR2( 'U', 0, ALPHA, X, 0, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL SSYR2( 'U', 0, ALPHA, X, 1, Y, 0, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL SSYR2( 'U', 2, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 170 + 160 INFOT = 1 + CALL SSPR2( '/', 0, ALPHA, X, 1, Y, 1, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL SSPR2( 'U', -1, ALPHA, X, 1, Y, 1, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL SSPR2( 'U', 0, ALPHA, X, 0, Y, 1, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL SSPR2( 'U', 0, ALPHA, X, 1, Y, 0, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) +* + 170 IF( OK )THEN + WRITE( NOUT, FMT = 9999 )SRNAMT + ELSE + WRITE( NOUT, FMT = 9998 )SRNAMT + END IF + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE TESTS OF ERROR-EXITS' ) + 9998 FORMAT( ' ******* ', A6, ' FAILED THE TESTS OF ERROR-EXITS *****', + $ '**' ) +* +* End of SCHKE. +* + END + SUBROUTINE SMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, KL, + $ KU, RESET, TRANSL ) +* +* Generates values for an M by N matrix A within the bandwidth +* defined by KL and KU. +* Stores the values in the array AA in the data structure required +* by the routine, with unwanted elements set to rogue value. +* +* TYPE is 'GE', 'GB', 'SY', 'SB', 'SP', 'TR', 'TB' OR 'TP'. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0, ONE = 1.0 ) + REAL ROGUE + PARAMETER ( ROGUE = -1.0E10 ) +* .. Scalar Arguments .. 
+ REAL TRANSL + INTEGER KL, KU, LDA, M, N, NMAX + LOGICAL RESET + CHARACTER*1 DIAG, UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + REAL A( NMAX, * ), AA( * ) +* .. Local Scalars .. + INTEGER I, I1, I2, I3, IBEG, IEND, IOFF, J, KK + LOGICAL GEN, LOWER, SYM, TRI, UNIT, UPPER +* .. External Functions .. + REAL SBEG + EXTERNAL SBEG +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. Executable Statements .. + GEN = TYPE( 1: 1 ).EQ.'G' + SYM = TYPE( 1: 1 ).EQ.'S' + TRI = TYPE( 1: 1 ).EQ.'T' + UPPER = ( SYM.OR.TRI ).AND.UPLO.EQ.'U' + LOWER = ( SYM.OR.TRI ).AND.UPLO.EQ.'L' + UNIT = TRI.AND.DIAG.EQ.'U' +* +* Generate data in array A. +* + DO 20 J = 1, N + DO 10 I = 1, M + IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) + $ THEN + IF( ( I.LE.J.AND.J - I.LE.KU ).OR. + $ ( I.GE.J.AND.I - J.LE.KL ) )THEN + A( I, J ) = SBEG( RESET ) + TRANSL + ELSE + A( I, J ) = ZERO + END IF + IF( I.NE.J )THEN + IF( SYM )THEN + A( J, I ) = A( I, J ) + ELSE IF( TRI )THEN + A( J, I ) = ZERO + END IF + END IF + END IF + 10 CONTINUE + IF( TRI ) + $ A( J, J ) = A( J, J ) + ONE + IF( UNIT ) + $ A( J, J ) = ONE + 20 CONTINUE +* +* Store elements in array AS in data structure required by routine. +* + IF( TYPE.EQ.'GE' )THEN + DO 50 J = 1, N + DO 30 I = 1, M + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 30 CONTINUE + DO 40 I = M + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 40 CONTINUE + 50 CONTINUE + ELSE IF( TYPE.EQ.'GB' )THEN + DO 90 J = 1, N + DO 60 I1 = 1, KU + 1 - J + AA( I1 + ( J - 1 )*LDA ) = ROGUE + 60 CONTINUE + DO 70 I2 = I1, MIN( KL + KU + 1, KU + 1 + M - J ) + AA( I2 + ( J - 1 )*LDA ) = A( I2 + J - KU - 1, J ) + 70 CONTINUE + DO 80 I3 = I2, LDA + AA( I3 + ( J - 1 )*LDA ) = ROGUE + 80 CONTINUE + 90 CONTINUE + ELSE IF( TYPE.EQ.'SY'.OR.TYPE.EQ.'TR' )THEN + DO 130 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IF( UNIT )THEN + IEND = J - 1 + ELSE + IEND = J + END IF + ELSE + IF( UNIT )THEN + IBEG = J + 1 + ELSE + IBEG = J + END IF + IEND = N + END IF + DO 100 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 100 CONTINUE + DO 110 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 110 CONTINUE + DO 120 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 120 CONTINUE + 130 CONTINUE + ELSE IF( TYPE.EQ.'SB'.OR.TYPE.EQ.'TB' )THEN + DO 170 J = 1, N + IF( UPPER )THEN + KK = KL + 1 + IBEG = MAX( 1, KL + 2 - J ) + IF( UNIT )THEN + IEND = KL + ELSE + IEND = KL + 1 + END IF + ELSE + KK = 1 + IF( UNIT )THEN + IBEG = 2 + ELSE + IBEG = 1 + END IF + IEND = MIN( KL + 1, 1 + M - J ) + END IF + DO 140 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 140 CONTINUE + DO 150 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I + J - KK, J ) + 150 CONTINUE + DO 160 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 160 CONTINUE + 170 CONTINUE + ELSE IF( TYPE.EQ.'SP'.OR.TYPE.EQ.'TP' )THEN + IOFF = 0 + DO 190 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 180 I = IBEG, IEND + IOFF = IOFF + 1 + AA( IOFF ) = A( I, J ) + IF( I.EQ.J )THEN + IF( UNIT ) + $ AA( IOFF ) = ROGUE + END IF + 180 CONTINUE + 190 CONTINUE + END IF + RETURN +* +* End of SMAKE. +* + END + SUBROUTINE SMVCH( TRANS, M, N, ALPHA, A, NMAX, X, INCX, BETA, Y, + $ INCY, YT, G, YY, EPS, ERR, FATAL, NOUT, MV ) +* +* Checks the results of the computational tests. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0, ONE = 1.0 ) +* .. 
Scalar Arguments .. + REAL ALPHA, BETA, EPS, ERR + INTEGER INCX, INCY, M, N, NMAX, NOUT + LOGICAL FATAL, MV + CHARACTER*1 TRANS +* .. Array Arguments .. + REAL A( NMAX, * ), G( * ), X( * ), Y( * ), YT( * ), + $ YY( * ) +* .. Local Scalars .. + REAL ERRI + INTEGER I, INCXL, INCYL, IY, J, JX, KX, KY, ML, NL + LOGICAL TRAN +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, SQRT +* .. Executable Statements .. + TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' + IF( TRAN )THEN + ML = N + NL = M + ELSE + ML = M + NL = N + END IF + IF( INCX.LT.0 )THEN + KX = NL + INCXL = -1 + ELSE + KX = 1 + INCXL = 1 + END IF + IF( INCY.LT.0 )THEN + KY = ML + INCYL = -1 + ELSE + KY = 1 + INCYL = 1 + END IF +* +* Compute expected result in YT using data in A, X and Y. +* Compute gauges in G. +* + IY = KY + DO 30 I = 1, ML + YT( IY ) = ZERO + G( IY ) = ZERO + JX = KX + IF( TRAN )THEN + DO 10 J = 1, NL + YT( IY ) = YT( IY ) + A( J, I )*X( JX ) + G( IY ) = G( IY ) + ABS( A( J, I )*X( JX ) ) + JX = JX + INCXL + 10 CONTINUE + ELSE + DO 20 J = 1, NL + YT( IY ) = YT( IY ) + A( I, J )*X( JX ) + G( IY ) = G( IY ) + ABS( A( I, J )*X( JX ) ) + JX = JX + INCXL + 20 CONTINUE + END IF + YT( IY ) = ALPHA*YT( IY ) + BETA*Y( IY ) + G( IY ) = ABS( ALPHA )*G( IY ) + ABS( BETA*Y( IY ) ) + IY = IY + INCYL + 30 CONTINUE +* +* Compute the error ratio for this result. +* + ERR = ZERO + DO 40 I = 1, ML + ERRI = ABS( YT( I ) - YY( 1 + ( I - 1 )*ABS( INCY ) ) )/EPS + IF( G( I ).NE.ZERO ) + $ ERRI = ERRI/G( I ) + ERR = MAX( ERR, ERRI ) + IF( ERR*SQRT( EPS ).GE.ONE ) + $ GO TO 50 + 40 CONTINUE +* If the loop completes, all results are at least half accurate. + GO TO 70 +* +* Report fatal error. +* + 50 FATAL = .TRUE. + WRITE( NOUT, FMT = 9999 ) + DO 60 I = 1, ML + IF( MV )THEN + WRITE( NOUT, FMT = 9998 )I, YT( I ), + $ YY( 1 + ( I - 1 )*ABS( INCY ) ) + ELSE + WRITE( NOUT, FMT = 9998 )I, + $ YY( 1 + ( I - 1 )*ABS( INCY ) ), YT(I) + END IF + 60 CONTINUE +* + 70 CONTINUE + RETURN +* + 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', + $ 'F ACCURATE *******', /' EXPECTED RESULT COMPU', + $ 'TED RESULT' ) + 9998 FORMAT( 1X, I7, 2G18.6 ) +* +* End of SMVCH. +* + END + LOGICAL FUNCTION LSE( RI, RJ, LR ) +* +* Tests if two arrays are identical. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER LR +* .. Array Arguments .. + REAL RI( * ), RJ( * ) +* .. Local Scalars .. + INTEGER I +* .. Executable Statements .. + DO 10 I = 1, LR + IF( RI( I ).NE.RJ( I ) ) + $ GO TO 20 + 10 CONTINUE + LSE = .TRUE. + GO TO 30 + 20 CONTINUE + LSE = .FALSE. + 30 RETURN +* +* End of LSE. +* + END + LOGICAL FUNCTION LSERES( TYPE, UPLO, M, N, AA, AS, LDA ) +* +* Tests if selected elements in two arrays are equal. +* +* TYPE is 'GE', 'SY' or 'SP'. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER LDA, M, N + CHARACTER*1 UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + REAL AA( LDA, * ), AS( LDA, * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J + LOGICAL UPPER +* .. Executable Statements .. 
+ UPPER = UPLO.EQ.'U' + IF( TYPE.EQ.'GE' )THEN + DO 20 J = 1, N + DO 10 I = M + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 10 CONTINUE + 20 CONTINUE + ELSE IF( TYPE.EQ.'SY' )THEN + DO 50 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 30 I = 1, IBEG - 1 + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 30 CONTINUE + DO 40 I = IEND + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 40 CONTINUE + 50 CONTINUE + END IF +* + 60 CONTINUE + LSERES = .TRUE. + GO TO 80 + 70 CONTINUE + LSERES = .FALSE. + 80 RETURN +* +* End of LSERES. +* + END + REAL FUNCTION SBEG( RESET ) +* +* Generates random numbers uniformly distributed between -0.5 and 0.5. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + LOGICAL RESET +* .. Local Scalars .. + INTEGER I, IC, MI +* .. Save statement .. + SAVE I, IC, MI +* .. Intrinsic Functions .. + INTRINSIC REAL +* .. Executable Statements .. + IF( RESET )THEN +* Initialize local variables. + MI = 891 + I = 7 + IC = 0 + RESET = .FALSE. + END IF +* +* The sequence of values of I is bounded between 1 and 999. +* If initial I = 1,2,3,6,7 or 9, the period will be 50. +* If initial I = 4 or 8, the period will be 25. +* If initial I = 5, the period will be 10. +* IC is used to break up the period by skipping 1 value of I in 6. +* + IC = IC + 1 + 10 I = I*MI + I = I - 1000*( I/1000 ) + IF( IC.GE.5 )THEN + IC = 0 + GO TO 10 + END IF + SBEG = REAL( I - 500 )/1001.0 + RETURN +* +* End of SBEG. +* + END + REAL FUNCTION SDIFF( X, Y ) +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* +* .. Scalar Arguments .. + REAL X, Y +* .. Executable Statements .. + SDIFF = X - Y + RETURN +* +* End of SDIFF. +* + END + SUBROUTINE CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) +* +* Tests whether XERBLA has detected an error when it should. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER INFOT, NOUT + LOGICAL LERR, OK + CHARACTER*6 SRNAMT +* .. Executable Statements .. + IF( .NOT.LERR )THEN + WRITE( NOUT, FMT = 9999 )INFOT, SRNAMT + OK = .FALSE. + END IF + LERR = .FALSE. + RETURN +* + 9999 FORMAT( ' ***** ILLEGAL VALUE OF PARAMETER NUMBER ', I2, ' NOT D', + $ 'ETECTED BY ', A6, ' *****' ) +* +* End of CHKXER. +* + END + SUBROUTINE XERBLA( SRNAME, INFO ) +* +* This is a special version of XERBLA to be used only as part of +* the test program for testing error exits from the Level 2 BLAS +* routines. +* +* XERBLA is an error handler for the Level 2 BLAS routines. +* +* It is called by the Level 2 BLAS routines if an input parameter is +* invalid. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER INFO + CHARACTER*6 SRNAME +* .. Scalars in Common .. + INTEGER INFOT, NOUT + LOGICAL LERR, OK + CHARACTER*6 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUT, OK, LERR + COMMON /SRNAMC/SRNAMT +* .. Executable Statements .. + LERR = .TRUE. + IF( INFO.NE.INFOT )THEN + IF( INFOT.NE.0 )THEN + WRITE( NOUT, FMT = 9999 )INFO, INFOT + ELSE + WRITE( NOUT, FMT = 9997 )INFO + END IF + OK = .FALSE. 
+ END IF + IF( SRNAME.NE.SRNAMT )THEN + WRITE( NOUT, FMT = 9998 )SRNAME, SRNAMT + OK = .FALSE. + END IF + RETURN +* + 9999 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, ' INSTEAD', + $ ' OF ', I2, ' *******' ) + 9998 FORMAT( ' ******* XERBLA WAS CALLED WITH SRNAME = ', A6, ' INSTE', + $ 'AD OF ', A6, ' *******' ) + 9997 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, + $ ' *******' ) +* +* End of XERBLA +* + END + diff --git a/test/sblat3.dat b/test/sblat3.dat new file mode 100644 index 0000000000..98d36a51c9 --- /dev/null +++ b/test/sblat3.dat @@ -0,0 +1,20 @@ +'SBLAT3.SUMM' NAME OF SUMMARY OUTPUT FILE +6 UNIT NUMBER OF SUMMARY FILE +'SBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO TEST ERROR EXITS. +16.0 THRESHOLD VALUE OF TEST RATIO +6 NUMBER OF VALUES OF N +0 1 2 3 7 31 63 VALUES OF N +3 NUMBER OF VALUES OF ALPHA +0.0 1.0 0.7 VALUES OF ALPHA +3 NUMBER OF VALUES OF BETA +0.0 1.0 1.3 VALUES OF BETA +SGEMM T PUT F FOR NO TEST. SAME COLUMNS. +SSYMM T PUT F FOR NO TEST. SAME COLUMNS. +STRMM T PUT F FOR NO TEST. SAME COLUMNS. +STRSM T PUT F FOR NO TEST. SAME COLUMNS. +SSYRK T PUT F FOR NO TEST. SAME COLUMNS. +SSYR2K T PUT F FOR NO TEST. SAME COLUMNS. diff --git a/test/sblat3.f b/test/sblat3.f new file mode 100644 index 0000000000..325a9eb927 --- /dev/null +++ b/test/sblat3.f @@ -0,0 +1,2823 @@ + PROGRAM SBLAT3 +* +* Test program for the REAL Level 3 Blas. +* +* The program must be driven by a short data file. The first 14 records +* of the file are read using list-directed input, the last 6 records +* are read using the format ( A6, L2 ). An annotated example of a data +* file can be obtained by deleting the first 3 characters from the +* following 20 lines: +* 'SBLAT3.SUMM' NAME OF SUMMARY OUTPUT FILE +* 6 UNIT NUMBER OF SUMMARY FILE +* 'SBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +* F LOGICAL FLAG, T TO STOP ON FAILURES. +* T LOGICAL FLAG, T TO TEST ERROR EXITS. +* 16.0 THRESHOLD VALUE OF TEST RATIO +* 6 NUMBER OF VALUES OF N +* 0 1 2 3 5 9 VALUES OF N +* 3 NUMBER OF VALUES OF ALPHA +* 0.0 1.0 0.7 VALUES OF ALPHA +* 3 NUMBER OF VALUES OF BETA +* 0.0 1.0 1.3 VALUES OF BETA +* SGEMM T PUT F FOR NO TEST. SAME COLUMNS. +* SSYMM T PUT F FOR NO TEST. SAME COLUMNS. +* STRMM T PUT F FOR NO TEST. SAME COLUMNS. +* STRSM T PUT F FOR NO TEST. SAME COLUMNS. +* SSYRK T PUT F FOR NO TEST. SAME COLUMNS. +* SSYR2K T PUT F FOR NO TEST. SAME COLUMNS. +* +* See: +* +* Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. +* A Set of Level 3 Basic Linear Algebra Subprograms. +* +* Technical Memorandum No.88 (Revision 1), Mathematics and +* Computer Science Division, Argonne National Laboratory, 9700 +* South Cass Avenue, Argonne, Illinois 60439, US. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + INTEGER NIN + PARAMETER ( NIN = 5 ) + INTEGER NSUBS + PARAMETER ( NSUBS = 6 ) + REAL ZERO, HALF, ONE + PARAMETER ( ZERO = 0.0, HALF = 0.5, ONE = 1.0 ) + INTEGER NMAX + PARAMETER ( NMAX = 65 ) + INTEGER NIDMAX, NALMAX, NBEMAX + PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) +* .. Local Scalars .. 
+ REAL EPS, ERR, THRESH + INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NOUT, NTRA + LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, + $ TSTERR + CHARACTER*1 TRANSA, TRANSB + CHARACTER*6 SNAMET + CHARACTER*32 SNAPS, SUMMRY +* .. Local Arrays .. + REAL AA( NMAX*NMAX ), AB( NMAX, 2*NMAX ), + $ ALF( NALMAX ), AS( NMAX*NMAX ), + $ BB( NMAX*NMAX ), BET( NBEMAX ), + $ BS( NMAX*NMAX ), C( NMAX, NMAX ), + $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), + $ G( NMAX ), W( 2*NMAX ) + INTEGER IDIM( NIDMAX ) + LOGICAL LTEST( NSUBS ) + CHARACTER*6 SNAMES( NSUBS ) +* .. External Functions .. + REAL SDIFF + LOGICAL LSE + EXTERNAL SDIFF, LSE +* .. External Subroutines .. + EXTERNAL SCHK1, SCHK2, SCHK3, SCHK4, SCHK5, SCHKE, SMMCH +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK + CHARACTER*6 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR + COMMON /SRNAMC/SRNAMT +* .. Data statements .. + DATA SNAMES/'SGEMM ', 'SSYMM ', 'STRMM ', 'STRSM ', + $ 'SSYRK ', 'SSYR2K'/ +* .. Executable Statements .. +* +* Read name and unit number for summary output file and open file. +* + READ( NIN, FMT = * )SUMMRY + READ( NIN, FMT = * )NOUT + OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + NOUTC = NOUT +* +* Read name and unit number for snapshot output file and open file. +* + READ( NIN, FMT = * )SNAPS + READ( NIN, FMT = * )NTRA + TRACE = NTRA.GE.0 + IF( TRACE )THEN + OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + END IF +* Read the flag that directs rewinding of the snapshot file. + READ( NIN, FMT = * )REWI + REWI = REWI.AND.TRACE +* Read the flag that directs stopping on any failure. + READ( NIN, FMT = * )SFATAL +* Read the flag that indicates whether error exits are to be tested. + READ( NIN, FMT = * )TSTERR +* Read the threshold value of the test ratio + READ( NIN, FMT = * )THRESH +* +* Read and check the parameter values for the tests. +* +* Values of N + READ( NIN, FMT = * )NIDIM + IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN + WRITE( NOUT, FMT = 9997 )'N', NIDMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) + DO 10 I = 1, NIDIM + IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN + WRITE( NOUT, FMT = 9996 )NMAX + GO TO 220 + END IF + 10 CONTINUE +* Values of ALPHA + READ( NIN, FMT = * )NALF + IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN + WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) +* Values of BETA + READ( NIN, FMT = * )NBET + IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN + WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) +* +* Report values of parameters. +* + WRITE( NOUT, FMT = 9995 ) + WRITE( NOUT, FMT = 9994 )( IDIM( I ), I = 1, NIDIM ) + WRITE( NOUT, FMT = 9993 )( ALF( I ), I = 1, NALF ) + WRITE( NOUT, FMT = 9992 )( BET( I ), I = 1, NBET ) + IF( .NOT.TSTERR )THEN + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9984 ) + END IF + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9999 )THRESH + WRITE( NOUT, FMT = * ) +* +* Read names of subroutines and flags which indicate +* whether they are to be tested. +* + DO 20 I = 1, NSUBS + LTEST( I ) = .FALSE. + 20 CONTINUE + 30 READ( NIN, FMT = 9988, END = 60 )SNAMET, LTESTT + DO 40 I = 1, NSUBS + IF( SNAMET.EQ.SNAMES( I ) ) + $ GO TO 50 + 40 CONTINUE + WRITE( NOUT, FMT = 9990 )SNAMET + STOP + 50 LTEST( I ) = LTESTT + GO TO 30 +* + 60 CONTINUE + CLOSE ( NIN ) +* +* Compute EPS (the machine precision). 
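+* EPS is halved until ONE + EPS compares equal to ONE, then doubled
+* once so that the last value which still made a difference is
+* recovered.  The sum is passed through the function SDIFF so that the
+* comparison is made on a value rounded to working precision.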
+* + EPS = ONE + 70 CONTINUE + IF( SDIFF( ONE + EPS, ONE ).EQ.ZERO ) + $ GO TO 80 + EPS = HALF*EPS + GO TO 70 + 80 CONTINUE + EPS = EPS + EPS + WRITE( NOUT, FMT = 9998 )EPS +* +* Check the reliability of SMMCH using exact data. +* + N = MIN( 32, NMAX ) + DO 100 J = 1, N + DO 90 I = 1, N + AB( I, J ) = MAX( I - J + 1, 0 ) + 90 CONTINUE + AB( J, NMAX + 1 ) = J + AB( 1, NMAX + J ) = J + C( J, 1 ) = ZERO + 100 CONTINUE + DO 110 J = 1, N + CC( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 + 110 CONTINUE +* CC holds the exact result. On exit from SMMCH CT holds +* the result computed by SMMCH. + TRANSA = 'N' + TRANSB = 'N' + CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LSE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + TRANSB = 'T' + CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LSE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + DO 120 J = 1, N + AB( J, NMAX + 1 ) = N - J + 1 + AB( 1, NMAX + J ) = N - J + 1 + 120 CONTINUE + DO 130 J = 1, N + CC( N - J + 1 ) = J*( ( J + 1 )*J )/2 - + $ ( ( J + 1 )*J*( J - 1 ) )/3 + 130 CONTINUE + TRANSA = 'T' + TRANSB = 'N' + CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LSE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + TRANSB = 'T' + CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LSE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF +* +* Test each subroutine in turn. +* + DO 200 ISNUM = 1, NSUBS + WRITE( NOUT, FMT = * ) + IF( .NOT.LTEST( ISNUM ) )THEN +* Subprogram is not to be tested. + WRITE( NOUT, FMT = 9987 )SNAMES( ISNUM ) + ELSE + SRNAMT = SNAMES( ISNUM ) +* Test error exits. + IF( TSTERR )THEN + CALL SCHKE( ISNUM, SNAMES( ISNUM ), NOUT ) + WRITE( NOUT, FMT = * ) + END IF +* Test computations. + INFOT = 0 + OK = .TRUE. + FATAL = .FALSE. + GO TO ( 140, 150, 160, 160, 170, 180 )ISNUM +* Test SGEMM, 01. + 140 CALL SCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G ) + GO TO 190 +* Test SSYMM, 02. + 150 CALL SCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G ) + GO TO 190 +* Test STRMM, 03, STRSM, 04. + 160 CALL SCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, + $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C ) + GO TO 190 +* Test SSYRK, 05. + 170 CALL SCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G ) + GO TO 190 +* Test SSYR2K, 06. 
+ 180 CALL SCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W ) + GO TO 190 +* + 190 IF( FATAL.AND.SFATAL ) + $ GO TO 210 + END IF + 200 CONTINUE + WRITE( NOUT, FMT = 9986 ) + GO TO 230 +* + 210 CONTINUE + WRITE( NOUT, FMT = 9985 ) + GO TO 230 +* + 220 CONTINUE + WRITE( NOUT, FMT = 9991 ) +* + 230 CONTINUE + IF( TRACE ) + $ CLOSE ( NTRA ) + CLOSE ( NOUT ) + STOP +* + 9999 FORMAT( ' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', + $ 'S THAN', F8.2 ) + 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, E9.1 ) + 9997 FORMAT( ' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', + $ 'THAN ', I2 ) + 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) + 9995 FORMAT( ' TESTS OF THE REAL LEVEL 3 BLAS', //' THE F', + $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) + 9994 FORMAT( ' FOR N ', 9I6 ) + 9993 FORMAT( ' FOR ALPHA ', 7F6.1 ) + 9992 FORMAT( ' FOR BETA ', 7F6.1 ) + 9991 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', + $ /' ******* TESTS ABANDONED *******' ) + 9990 FORMAT( ' SUBPROGRAM NAME ', A6, ' NOT RECOGNIZED', /' ******* T', + $ 'ESTS ABANDONED *******' ) + 9989 FORMAT( ' ERROR IN SMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', + $ 'ATED WRONGLY.', /' SMMCH WAS CALLED WITH TRANSA = ', A1, + $ ' AND TRANSB = ', A1, /' AND RETURNED SAME = ', L1, ' AND ', + $ 'ERR = ', F12.3, '.', /' THIS MAY BE DUE TO FAULTS IN THE ', + $ 'ARITHMETIC OR THE COMPILER.', /' ******* TESTS ABANDONED ', + $ '*******' ) + 9988 FORMAT( A6, L2 ) + 9987 FORMAT( 1X, A6, ' WAS NOT TESTED' ) + 9986 FORMAT( /' END OF TESTS' ) + 9985 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) + 9984 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) +* +* End of SBLAT3. +* + END + SUBROUTINE SCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G ) +* +* Tests SGEMM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + REAL ZERO + PARAMETER ( ZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ), G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + REAL ALPHA, ALS, BETA, BLS, ERR, ERRMAX + INTEGER I, IA, IB, ICA, ICB, IK, IM, IN, K, KS, LAA, + $ LBB, LCC, LDA, LDAS, LDB, LDBS, LDC, LDCS, M, + $ MA, MB, MS, N, NA, NARGS, NB, NC, NS + LOGICAL NULL, RESET, SAME, TRANA, TRANB + CHARACTER*1 TRANAS, TRANBS, TRANSA, TRANSB + CHARACTER*3 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. + EXTERNAL SGEMM, SMAKE, SMMCH +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICH/'NTC'/ +* .. Executable Statements .. +* + NARGS = 13 + NC = 0 + RESET = .TRUE. 
+ ERRMAX = ZERO +* + DO 110 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = M + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 100 + LCC = LDC*N + NULL = N.LE.0.OR.M.LE.0 +* + DO 90 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 80 ICA = 1, 3 + TRANSA = ICH( ICA: ICA ) + TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' +* + IF( TRANA )THEN + MA = K + NA = M + ELSE + MA = M + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* +* Generate the matrix A. +* + CALL SMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 70 ICB = 1, 3 + TRANSB = ICH( ICB: ICB ) + TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' +* + IF( TRANB )THEN + MB = N + NB = K + ELSE + MB = K + NB = N + END IF +* Set LDB to 1 more than minimum value if room. + LDB = MB + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 70 + LBB = LDB*NB +* +* Generate the matrix B. +* + CALL SMAKE( 'GE', ' ', ' ', MB, NB, B, NMAX, BB, + $ LDB, RESET, ZERO ) +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL SMAKE( 'GE', ' ', ' ', M, N, C, NMAX, + $ CC, LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + TRANAS = TRANSA + TRANBS = TRANSB + MS = M + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BLS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ TRANSA, TRANSB, M, N, K, ALPHA, LDA, LDB, + $ BETA, LDC + IF( REWI ) + $ REWIND NTRA + CALL SGEMM( TRANSA, TRANSB, M, N, K, ALPHA, + $ AA, LDA, BB, LDB, BETA, CC, LDC ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = TRANSA.EQ.TRANAS + ISAME( 2 ) = TRANSB.EQ.TRANBS + ISAME( 3 ) = MS.EQ.M + ISAME( 4 ) = NS.EQ.N + ISAME( 5 ) = KS.EQ.K + ISAME( 6 ) = ALS.EQ.ALPHA + ISAME( 7 ) = LSE( AS, AA, LAA ) + ISAME( 8 ) = LDAS.EQ.LDA + ISAME( 9 ) = LSE( BS, BB, LBB ) + ISAME( 10 ) = LDBS.EQ.LDB + ISAME( 11 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 12 ) = LSE( CS, CC, LCC ) + ELSE + ISAME( 12 ) = LSERES( 'GE', ' ', M, N, CS, + $ CC, LDC ) + END IF + ISAME( 13 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report +* and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL SMMCH( TRANSA, TRANSB, M, N, K, + $ ALPHA, A, NMAX, B, NMAX, BETA, + $ C, NMAX, CT, G, CC, LDC, EPS, + $ ERR, FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 120 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9995 )NC, SNAME, TRANSA, TRANSB, M, N, K, + $ ALPHA, LDA, LDB, BETA, LDC +* + 130 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',''', A1, ''',', + $ 3( I3, ',' ), F4.1, ', A,', I3, ', B,', I3, ',', F4.1, ', ', + $ 'C,', I3, ').' ) + 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK1. +* + END + SUBROUTINE SCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G ) +* +* Tests SSYMM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + REAL ZERO + PARAMETER ( ZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ), G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + REAL ALPHA, ALS, BETA, BLS, ERR, ERRMAX + INTEGER I, IA, IB, ICS, ICU, IM, IN, LAA, LBB, LCC, + $ LDA, LDAS, LDB, LDBS, LDC, LDCS, M, MS, N, NA, + $ NARGS, NC, NS + LOGICAL LEFT, NULL, RESET, SAME + CHARACTER*1 SIDE, SIDES, UPLO, UPLOS + CHARACTER*2 ICHS, ICHU +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. + EXTERNAL SMAKE, SMMCH, SSYMM +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHS/'LR'/, ICHU/'UL'/ +* .. Executable Statements .. +* + NARGS = 12 + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 100 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 90 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = M + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 90 + LCC = LDC*N + NULL = N.LE.0.OR.M.LE.0 +* +* Set LDB to 1 more than minimum value if room. + LDB = M + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 90 + LBB = LDB*N +* +* Generate the matrix B. +* + CALL SMAKE( 'GE', ' ', ' ', M, N, B, NMAX, BB, LDB, RESET, + $ ZERO ) +* + DO 80 ICS = 1, 2 + SIDE = ICHS( ICS: ICS ) + LEFT = SIDE.EQ.'L' +* + IF( LEFT )THEN + NA = M + ELSE + NA = N + END IF +* Set LDA to 1 more than minimum value if room. + LDA = NA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. 
+ IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* + DO 70 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* +* Generate the symmetric matrix A. +* + CALL SMAKE( 'SY', UPLO, ' ', NA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL SMAKE( 'GE', ' ', ' ', M, N, C, NMAX, CC, + $ LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + SIDES = SIDE + UPLOS = UPLO + MS = M + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BLS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, SIDE, + $ UPLO, M, N, ALPHA, LDA, LDB, BETA, LDC + IF( REWI ) + $ REWIND NTRA + CALL SSYMM( SIDE, UPLO, M, N, ALPHA, AA, LDA, + $ BB, LDB, BETA, CC, LDC ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 110 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = SIDES.EQ.SIDE + ISAME( 2 ) = UPLOS.EQ.UPLO + ISAME( 3 ) = MS.EQ.M + ISAME( 4 ) = NS.EQ.N + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LSE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = LSE( BS, BB, LBB ) + ISAME( 9 ) = LDBS.EQ.LDB + ISAME( 10 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 11 ) = LSE( CS, CC, LCC ) + ELSE + ISAME( 11 ) = LSERES( 'GE', ' ', M, N, CS, + $ CC, LDC ) + END IF + ISAME( 12 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 110 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + IF( LEFT )THEN + CALL SMMCH( 'N', 'N', M, N, M, ALPHA, A, + $ NMAX, B, NMAX, BETA, C, NMAX, + $ CT, G, CC, LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL SMMCH( 'N', 'N', M, N, N, ALPHA, B, + $ NMAX, A, NMAX, BETA, C, NMAX, + $ CT, G, CC, LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 120 +* + 110 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9995 )NC, SNAME, SIDE, UPLO, M, N, ALPHA, LDA, + $ LDB, BETA, LDC +* + 120 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ F4.1, ', A,', I3, ', B,', I3, ',', F4.1, ', C,', I3, ') ', + $ ' .' ) + 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK2. 
+* + END + SUBROUTINE SCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NMAX, A, AA, AS, + $ B, BB, BS, CT, G, C ) +* +* Tests STRMM and STRSM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0, ONE = 1.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NIDIM, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CT( NMAX ), G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + REAL ALPHA, ALS, ERR, ERRMAX + INTEGER I, IA, ICD, ICS, ICT, ICU, IM, IN, J, LAA, LBB, + $ LDA, LDAS, LDB, LDBS, M, MS, N, NA, NARGS, NC, + $ NS + LOGICAL LEFT, NULL, RESET, SAME + CHARACTER*1 DIAG, DIAGS, SIDE, SIDES, TRANAS, TRANSA, UPLO, + $ UPLOS + CHARACTER*2 ICHD, ICHS, ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. + EXTERNAL SMAKE, SMMCH, STRMM, STRSM +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/, ICHS/'LR'/ +* .. Executable Statements .. +* + NARGS = 11 + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* Set up zero matrix for SMMCH. + DO 20 J = 1, NMAX + DO 10 I = 1, NMAX + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE +* + DO 140 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 130 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDB to 1 more than minimum value if room. + LDB = M + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 130 + LBB = LDB*N + NULL = M.LE.0.OR.N.LE.0 +* + DO 120 ICS = 1, 2 + SIDE = ICHS( ICS: ICS ) + LEFT = SIDE.EQ.'L' + IF( LEFT )THEN + NA = M + ELSE + NA = N + END IF +* Set LDA to 1 more than minimum value if room. + LDA = NA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 130 + LAA = LDA*NA +* + DO 110 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* + DO 100 ICT = 1, 3 + TRANSA = ICHT( ICT: ICT ) +* + DO 90 ICD = 1, 2 + DIAG = ICHD( ICD: ICD ) +* + DO 80 IA = 1, NALF + ALPHA = ALF( IA ) +* +* Generate the matrix A. +* + CALL SMAKE( 'TR', UPLO, DIAG, NA, NA, A, + $ NMAX, AA, LDA, RESET, ZERO ) +* +* Generate the matrix B. +* + CALL SMAKE( 'GE', ' ', ' ', M, N, B, NMAX, + $ BB, LDB, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + SIDES = SIDE + UPLOS = UPLO + TRANAS = TRANSA + DIAGS = DIAG + MS = M + NS = N + ALS = ALPHA + DO 30 I = 1, LAA + AS( I ) = AA( I ) + 30 CONTINUE + LDAS = LDA + DO 40 I = 1, LBB + BS( I ) = BB( I ) + 40 CONTINUE + LDBS = LDB +* +* Call the subroutine. 
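+* The fourth and fifth characters of the name select the operation:
+* SNAME( 4: 5 ) is 'MM' for STRMM and 'SM' for STRSM.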
+* + IF( SNAME( 4: 5 ).EQ.'MM' )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, + $ LDA, LDB + IF( REWI ) + $ REWIND NTRA + CALL STRMM( SIDE, UPLO, TRANSA, DIAG, M, + $ N, ALPHA, AA, LDA, BB, LDB ) + ELSE IF( SNAME( 4: 5 ).EQ.'SM' )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, + $ LDA, LDB + IF( REWI ) + $ REWIND NTRA + CALL STRSM( SIDE, UPLO, TRANSA, DIAG, M, + $ N, ALPHA, AA, LDA, BB, LDB ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 150 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = SIDES.EQ.SIDE + ISAME( 2 ) = UPLOS.EQ.UPLO + ISAME( 3 ) = TRANAS.EQ.TRANSA + ISAME( 4 ) = DIAGS.EQ.DIAG + ISAME( 5 ) = MS.EQ.M + ISAME( 6 ) = NS.EQ.N + ISAME( 7 ) = ALS.EQ.ALPHA + ISAME( 8 ) = LSE( AS, AA, LAA ) + ISAME( 9 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 10 ) = LSE( BS, BB, LBB ) + ELSE + ISAME( 10 ) = LSERES( 'GE', ' ', M, N, BS, + $ BB, LDB ) + END IF + ISAME( 11 ) = LDBS.EQ.LDB +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 50 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 50 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 150 + END IF +* + IF( .NOT.NULL )THEN + IF( SNAME( 4: 5 ).EQ.'MM' )THEN +* +* Check the result. +* + IF( LEFT )THEN + CALL SMMCH( TRANSA, 'N', M, N, M, + $ ALPHA, A, NMAX, B, NMAX, + $ ZERO, C, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL SMMCH( 'N', TRANSA, M, N, N, + $ ALPHA, B, NMAX, A, NMAX, + $ ZERO, C, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + ELSE IF( SNAME( 4: 5 ).EQ.'SM' )THEN +* +* Compute approximation to original +* matrix. +* + DO 70 J = 1, N + DO 60 I = 1, M + C( I, J ) = BB( I + ( J - 1 )* + $ LDB ) + BB( I + ( J - 1 )*LDB ) = ALPHA* + $ B( I, J ) + 60 CONTINUE + 70 CONTINUE +* + IF( LEFT )THEN + CALL SMMCH( TRANSA, 'N', M, N, M, + $ ONE, A, NMAX, C, NMAX, + $ ZERO, B, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .FALSE. ) + ELSE + CALL SMMCH( 'N', TRANSA, M, N, N, + $ ONE, C, NMAX, A, NMAX, + $ ZERO, B, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .FALSE. ) + END IF + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 150 + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* + 140 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 160 +* + 150 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9995 )NC, SNAME, SIDE, UPLO, TRANSA, DIAG, M, + $ N, ALPHA, LDA, LDB +* + 160 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A6, '(', 4( '''', A1, ''',' ), 2( I3, ',' ), + $ F4.1, ', A,', I3, ', B,', I3, ') .' 
) + 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK3. +* + END + SUBROUTINE SCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G ) +* +* Tests SSYRK. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + REAL ZERO + PARAMETER ( ZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + REAL A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ), G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + REAL ALPHA, ALS, BETA, BETS, ERR, ERRMAX + INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, K, KS, + $ LAA, LCC, LDA, LDAS, LDC, LDCS, LJ, MA, N, NA, + $ NARGS, NC, NS + LOGICAL NULL, RESET, SAME, TRAN, UPPER + CHARACTER*1 TRANS, TRANSS, UPLO, UPLOS + CHARACTER*2 ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. + EXTERNAL SMAKE, SMMCH, SSYRK +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHT/'NTC'/, ICHU/'UL'/ +* .. Executable Statements .. +* + NARGS = 10 + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = N + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 100 + LCC = LDC*N + NULL = N.LE.0 +* + DO 90 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 80 ICT = 1, 3 + TRANS = ICHT( ICT: ICT ) + TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' + IF( TRAN )THEN + MA = K + NA = N + ELSE + MA = N + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* +* Generate the matrix A. +* + CALL SMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 70 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + UPPER = UPLO.EQ.'U' +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL SMAKE( 'SY', UPLO, ' ', N, N, C, NMAX, CC, + $ LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + BETS = BETA + DO 20 I = 1, LCC + CS( I ) = CC( I ) + 20 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, + $ TRANS, N, K, ALPHA, LDA, BETA, LDC + IF( REWI ) + $ REWIND NTRA + CALL SSYRK( UPLO, TRANS, N, K, ALPHA, AA, LDA, + $ BETA, CC, LDC ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. 
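+* Every argument other than C must be returned unchanged.  C itself may
+* change only within the triangle selected by UPLO; LSERES checks that
+* the elements outside that triangle still match the saved copy in CS.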
+* + ISAME( 1 ) = UPLOS.EQ.UPLO + ISAME( 2 ) = TRANSS.EQ.TRANS + ISAME( 3 ) = NS.EQ.N + ISAME( 4 ) = KS.EQ.K + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LSE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = BETS.EQ.BETA + IF( NULL )THEN + ISAME( 9 ) = LSE( CS, CC, LCC ) + ELSE + ISAME( 9 ) = LSERES( 'SY', UPLO, N, N, CS, + $ CC, LDC ) + END IF + ISAME( 10 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 30 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 30 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + JC = 1 + DO 40 J = 1, N + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + IF( TRAN )THEN + CALL SMMCH( 'T', 'N', LJ, 1, K, ALPHA, + $ A( 1, JJ ), NMAX, + $ A( 1, J ), NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL SMMCH( 'N', 'T', LJ, 1, K, ALPHA, + $ A( JJ, 1 ), NMAX, + $ A( J, 1 ), NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + IF( UPPER )THEN + JC = JC + LDC + ELSE + JC = JC + LDC + 1 + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 110 + 40 CONTINUE + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 110 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9995 )J +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, N, K, ALPHA, + $ LDA, BETA, LDC +* + 130 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ') .' ) + 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK4. +* + END + SUBROUTINE SCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ AB, AA, AS, BB, BS, C, CC, CS, CT, G, W ) +* +* Tests SSYR2K. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + REAL ZERO + PARAMETER ( ZERO = 0.0 ) +* .. Scalar Arguments .. + REAL EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + REAL AA( NMAX*NMAX ), AB( 2*NMAX*NMAX ), + $ ALF( NALF ), AS( NMAX*NMAX ), BB( NMAX*NMAX ), + $ BET( NBET ), BS( NMAX*NMAX ), C( NMAX, NMAX ), + $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), + $ G( NMAX ), W( 2*NMAX ) + INTEGER IDIM( NIDIM ) +* .. 
Local Scalars .. + REAL ALPHA, ALS, BETA, BETS, ERR, ERRMAX + INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, JJAB, + $ K, KS, LAA, LBB, LCC, LDA, LDAS, LDB, LDBS, + $ LDC, LDCS, LJ, MA, N, NA, NARGS, NC, NS + LOGICAL NULL, RESET, SAME, TRAN, UPPER + CHARACTER*1 TRANS, TRANSS, UPLO, UPLOS + CHARACTER*2 ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LSE, LSERES + EXTERNAL LSE, LSERES +* .. External Subroutines .. + EXTERNAL SMAKE, SMMCH, SSYR2K +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHT/'NTC'/, ICHU/'UL'/ +* .. Executable Statements .. +* + NARGS = 12 + NC = 0 + RESET = .TRUE. + ERRMAX = ZERO +* + DO 130 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = N + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 130 + LCC = LDC*N + NULL = N.LE.0 +* + DO 120 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 110 ICT = 1, 3 + TRANS = ICHT( ICT: ICT ) + TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' + IF( TRAN )THEN + MA = K + NA = N + ELSE + MA = N + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 110 + LAA = LDA*NA +* +* Generate the matrix A. +* + IF( TRAN )THEN + CALL SMAKE( 'GE', ' ', ' ', MA, NA, AB, 2*NMAX, AA, + $ LDA, RESET, ZERO ) + ELSE + CALL SMAKE( 'GE', ' ', ' ', MA, NA, AB, NMAX, AA, LDA, + $ RESET, ZERO ) + END IF +* +* Generate the matrix B. +* + LDB = LDA + LBB = LAA + IF( TRAN )THEN + CALL SMAKE( 'GE', ' ', ' ', MA, NA, AB( K + 1 ), + $ 2*NMAX, BB, LDB, RESET, ZERO ) + ELSE + CALL SMAKE( 'GE', ' ', ' ', MA, NA, AB( K*NMAX + 1 ), + $ NMAX, BB, LDB, RESET, ZERO ) + END IF +* + DO 100 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + UPPER = UPLO.EQ.'U' +* + DO 90 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 80 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL SMAKE( 'SY', UPLO, ' ', N, N, C, NMAX, CC, + $ LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BETS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, + $ TRANS, N, K, ALPHA, LDA, LDB, BETA, LDC + IF( REWI ) + $ REWIND NTRA + CALL SSYR2K( UPLO, TRANS, N, K, ALPHA, AA, LDA, + $ BB, LDB, BETA, CC, LDC ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 150 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLOS.EQ.UPLO + ISAME( 2 ) = TRANSS.EQ.TRANS + ISAME( 3 ) = NS.EQ.N + ISAME( 4 ) = KS.EQ.K + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LSE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = LSE( BS, BB, LBB ) + ISAME( 9 ) = LDBS.EQ.LDB + ISAME( 10 ) = BETS.EQ.BETA + IF( NULL )THEN + ISAME( 11 ) = LSE( CS, CC, LCC ) + ELSE + ISAME( 11 ) = LSERES( 'SY', UPLO, N, N, CS, + $ CC, LDC ) + END IF + ISAME( 12 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. 
+ DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 150 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + JJAB = 1 + JC = 1 + DO 70 J = 1, N + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + IF( TRAN )THEN + DO 50 I = 1, K + W( I ) = AB( ( J - 1 )*2*NMAX + K + + $ I ) + W( K + I ) = AB( ( J - 1 )*2*NMAX + + $ I ) + 50 CONTINUE + CALL SMMCH( 'T', 'N', LJ, 1, 2*K, + $ ALPHA, AB( JJAB ), 2*NMAX, + $ W, 2*NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + DO 60 I = 1, K + W( I ) = AB( ( K + I - 1 )*NMAX + + $ J ) + W( K + I ) = AB( ( I - 1 )*NMAX + + $ J ) + 60 CONTINUE + CALL SMMCH( 'N', 'N', LJ, 1, 2*K, + $ ALPHA, AB( JJ ), NMAX, W, + $ 2*NMAX, BETA, C( JJ, J ), + $ NMAX, CT, G, CC( JC ), LDC, + $ EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + END IF + IF( UPPER )THEN + JC = JC + LDC + ELSE + JC = JC + LDC + 1 + IF( TRAN ) + $ JJAB = JJAB + 2*NMAX + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 140 + 70 CONTINUE + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 160 +* + 140 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9995 )J +* + 150 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, N, K, ALPHA, + $ LDA, LDB, BETA, LDC +* + 160 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ F4.1, ', A,', I3, ', B,', I3, ',', F4.1, ', C,', I3, ') ', + $ ' .' ) + 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of SCHK5. +* + END + SUBROUTINE SCHKE( ISNUM, SRNAMT, NOUT ) +* +* Tests the error exits from the Level 3 Blas. +* Requires a special version of the error-handling routine XERBLA. +* ALPHA, BETA, A, B and C should not need to be defined. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER ISNUM, NOUT + CHARACTER*6 SRNAMT +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Local Scalars .. + REAL ALPHA, BETA +* .. Local Arrays .. + REAL A( 2, 1 ), B( 2, 1 ), C( 2, 1 ) +* .. External Subroutines .. + EXTERNAL CHKXER, SGEMM, SSYMM, SSYR2K, SSYRK, STRMM, + $ STRSM +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Executable Statements .. +* OK is set to .FALSE. by the special version of XERBLA or by CHKXER +* if anything is wrong. + OK = .TRUE. +* LERR is set to .TRUE. 
by the special version of XERBLA each time +* it is called, and is then tested and re-set by CHKXER. + LERR = .FALSE. + GO TO ( 10, 20, 30, 40, 50, 60 )ISNUM + 10 INFOT = 1 + CALL SGEMM( '/', 'N', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 1 + CALL SGEMM( '/', 'T', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL SGEMM( 'N', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL SGEMM( 'T', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SGEMM( 'N', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SGEMM( 'N', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SGEMM( 'T', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SGEMM( 'T', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL SGEMM( 'N', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL SGEMM( 'N', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL SGEMM( 'T', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL SGEMM( 'T', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL SGEMM( 'N', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL SGEMM( 'N', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL SGEMM( 'T', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL SGEMM( 'T', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL SGEMM( 'N', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL SGEMM( 'N', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL SGEMM( 'T', 'N', 0, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL SGEMM( 'T', 'T', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL SGEMM( 'N', 'N', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL SGEMM( 'T', 'N', 0, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL SGEMM( 'N', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL SGEMM( 'T', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL SGEMM( 'N', 'N', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL SGEMM( 'N', 'T', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL SGEMM( 'T', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL SGEMM( 
'T', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 70 + 20 INFOT = 1 + CALL SSYMM( '/', 'U', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL SSYMM( 'L', '/', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SSYMM( 'L', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SSYMM( 'R', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SSYMM( 'L', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SSYMM( 'R', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL SSYMM( 'L', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL SSYMM( 'R', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL SSYMM( 'L', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL SSYMM( 'R', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL SSYMM( 'L', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL SSYMM( 'R', 'U', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL SSYMM( 'L', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL SSYMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL SSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL SSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL SSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL SSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL SSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL SSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL SSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL SSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 70 + 30 INFOT = 1 + CALL STRMM( '/', 'U', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL STRMM( 'L', '/', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL STRMM( 'L', 'U', '/', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL STRMM( 'L', 'U', 'N', '/', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL STRMM( 'L', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL STRMM( 'L', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL STRMM( 'R', 'U', 
'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL STRMM( 'R', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL STRMM( 'L', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL STRMM( 'L', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL STRMM( 'R', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL STRMM( 'R', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL STRMM( 'L', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL STRMM( 'L', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL STRMM( 'R', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL STRMM( 'R', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL STRMM( 'L', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL STRMM( 'L', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL STRMM( 'R', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL STRMM( 'R', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL STRMM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL STRMM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL STRMM( 'R', 'U', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL STRMM( 'R', 'U', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL STRMM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL STRMM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL STRMM( 'R', 'L', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL STRMM( 'R', 'L', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL STRMM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL STRMM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL STRMM( 'R', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL STRMM( 'R', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL STRMM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL STRMM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL STRMM( 'R', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL STRMM( 'R', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK 
) + GO TO 70 + 40 INFOT = 1 + CALL STRSM( '/', 'U', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL STRSM( 'L', '/', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL STRSM( 'L', 'U', '/', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL STRSM( 'L', 'U', 'N', '/', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL STRSM( 'L', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL STRSM( 'L', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL STRSM( 'R', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL STRSM( 'R', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL STRSM( 'L', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL STRSM( 'L', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL STRSM( 'R', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL STRSM( 'R', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL STRSM( 'L', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL STRSM( 'L', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL STRSM( 'R', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL STRSM( 'R', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL STRSM( 'L', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL STRSM( 'L', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL STRSM( 'R', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL STRSM( 'R', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL STRSM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL STRSM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL STRSM( 'R', 'U', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL STRSM( 'R', 'U', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL STRSM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL STRSM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL STRSM( 'R', 'L', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL STRSM( 'R', 'L', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL STRSM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL STRSM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 
) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL STRSM( 'R', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL STRSM( 'R', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL STRSM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL STRSM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL STRSM( 'R', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL STRSM( 'R', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 70 + 50 INFOT = 1 + CALL SSYRK( '/', 'N', 0, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL SSYRK( 'U', '/', 0, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SSYRK( 'U', 'N', -1, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SSYRK( 'U', 'T', -1, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SSYRK( 'L', 'N', -1, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SSYRK( 'L', 'T', -1, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL SSYRK( 'U', 'N', 0, -1, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL SSYRK( 'U', 'T', 0, -1, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL SSYRK( 'L', 'N', 0, -1, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL SSYRK( 'L', 'T', 0, -1, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL SSYRK( 'U', 'N', 2, 0, ALPHA, A, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL SSYRK( 'U', 'T', 0, 2, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL SSYRK( 'L', 'N', 2, 0, ALPHA, A, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL SSYRK( 'L', 'T', 0, 2, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL SSYRK( 'U', 'N', 2, 0, ALPHA, A, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL SSYRK( 'U', 'T', 2, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL SSYRK( 'L', 'N', 2, 0, ALPHA, A, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL SSYRK( 'L', 'T', 2, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 70 + 60 INFOT = 1 + CALL SSYR2K( '/', 'N', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL SSYR2K( 'U', '/', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SSYR2K( 'U', 'N', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SSYR2K( 'U', 'T', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SSYR2K( 'L', 'N', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL SSYR2K( 'L', 'T', -1, 0, ALPHA, A, 1, B, 1, 
BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL SSYR2K( 'U', 'N', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL SSYR2K( 'U', 'T', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL SSYR2K( 'L', 'N', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL SSYR2K( 'L', 'T', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL SSYR2K( 'U', 'N', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL SSYR2K( 'U', 'T', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL SSYR2K( 'L', 'N', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL SSYR2K( 'L', 'T', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL SSYR2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL SSYR2K( 'U', 'T', 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL SSYR2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL SSYR2K( 'L', 'T', 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL SSYR2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL SSYR2K( 'U', 'T', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL SSYR2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL SSYR2K( 'L', 'T', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) +* + 70 IF( OK )THEN + WRITE( NOUT, FMT = 9999 )SRNAMT + ELSE + WRITE( NOUT, FMT = 9998 )SRNAMT + END IF + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE TESTS OF ERROR-EXITS' ) + 9998 FORMAT( ' ******* ', A6, ' FAILED THE TESTS OF ERROR-EXITS *****', + $ '**' ) +* +* End of SCHKE. +* + END + SUBROUTINE SMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, RESET, + $ TRANSL ) +* +* Generates values for an M by N matrix A. +* Stores the values in the array AA in the data structure required +* by the routine, with unwanted elements set to rogue value. +* +* TYPE is 'GE', 'SY' or 'TR'. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0, ONE = 1.0 ) + REAL ROGUE + PARAMETER ( ROGUE = -1.0E10 ) +* .. Scalar Arguments .. + REAL TRANSL + INTEGER LDA, M, N, NMAX + LOGICAL RESET + CHARACTER*1 DIAG, UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + REAL A( NMAX, * ), AA( * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J + LOGICAL GEN, LOWER, SYM, TRI, UNIT, UPPER +* .. External Functions .. + REAL SBEG + EXTERNAL SBEG +* .. Executable Statements .. 
+ GEN = TYPE.EQ.'GE' + SYM = TYPE.EQ.'SY' + TRI = TYPE.EQ.'TR' + UPPER = ( SYM.OR.TRI ).AND.UPLO.EQ.'U' + LOWER = ( SYM.OR.TRI ).AND.UPLO.EQ.'L' + UNIT = TRI.AND.DIAG.EQ.'U' +* +* Generate data in array A. +* + DO 20 J = 1, N + DO 10 I = 1, M + IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) + $ THEN + A( I, J ) = SBEG( RESET ) + TRANSL + IF( I.NE.J )THEN +* Set some elements to zero + IF( N.GT.3.AND.J.EQ.N/2 ) + $ A( I, J ) = ZERO + IF( SYM )THEN + A( J, I ) = A( I, J ) + ELSE IF( TRI )THEN + A( J, I ) = ZERO + END IF + END IF + END IF + 10 CONTINUE + IF( TRI ) + $ A( J, J ) = A( J, J ) + ONE + IF( UNIT ) + $ A( J, J ) = ONE + 20 CONTINUE +* +* Store elements in array AS in data structure required by routine. +* + IF( TYPE.EQ.'GE' )THEN + DO 50 J = 1, N + DO 30 I = 1, M + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 30 CONTINUE + DO 40 I = M + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 40 CONTINUE + 50 CONTINUE + ELSE IF( TYPE.EQ.'SY'.OR.TYPE.EQ.'TR' )THEN + DO 90 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IF( UNIT )THEN + IEND = J - 1 + ELSE + IEND = J + END IF + ELSE + IF( UNIT )THEN + IBEG = J + 1 + ELSE + IBEG = J + END IF + IEND = N + END IF + DO 60 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 60 CONTINUE + DO 70 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 70 CONTINUE + DO 80 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 80 CONTINUE + 90 CONTINUE + END IF + RETURN +* +* End of SMAKE. +* + END + SUBROUTINE SMMCH( TRANSA, TRANSB, M, N, KK, ALPHA, A, LDA, B, LDB, + $ BETA, C, LDC, CT, G, CC, LDCC, EPS, ERR, FATAL, + $ NOUT, MV ) +* +* Checks the results of the computational tests. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + REAL ZERO, ONE + PARAMETER ( ZERO = 0.0, ONE = 1.0 ) +* .. Scalar Arguments .. + REAL ALPHA, BETA, EPS, ERR + INTEGER KK, LDA, LDB, LDC, LDCC, M, N, NOUT + LOGICAL FATAL, MV + CHARACTER*1 TRANSA, TRANSB +* .. Array Arguments .. + REAL A( LDA, * ), B( LDB, * ), C( LDC, * ), + $ CC( LDCC, * ), CT( * ), G( * ) +* .. Local Scalars .. + REAL ERRI + INTEGER I, J, K + LOGICAL TRANA, TRANB +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, SQRT +* .. Executable Statements .. + TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' + TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' +* +* Compute expected result, one column at a time, in CT using data +* in A, B and C. +* Compute gauges in G. 
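+* (Each computed entry CC( I, J ) is then compared with CT( I ): the
+* error, scaled by EPS and by the gauge G( I ) when that is nonzero,
+* must satisfy ERR*SQRT( EPS ).LT.ONE, i.e. the result must be at
+* least half accurate, otherwise a fatal error is reported.)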
+* + DO 120 J = 1, N +* + DO 10 I = 1, M + CT( I ) = ZERO + G( I ) = ZERO + 10 CONTINUE + IF( .NOT.TRANA.AND..NOT.TRANB )THEN + DO 30 K = 1, KK + DO 20 I = 1, M + CT( I ) = CT( I ) + A( I, K )*B( K, J ) + G( I ) = G( I ) + ABS( A( I, K ) )*ABS( B( K, J ) ) + 20 CONTINUE + 30 CONTINUE + ELSE IF( TRANA.AND..NOT.TRANB )THEN + DO 50 K = 1, KK + DO 40 I = 1, M + CT( I ) = CT( I ) + A( K, I )*B( K, J ) + G( I ) = G( I ) + ABS( A( K, I ) )*ABS( B( K, J ) ) + 40 CONTINUE + 50 CONTINUE + ELSE IF( .NOT.TRANA.AND.TRANB )THEN + DO 70 K = 1, KK + DO 60 I = 1, M + CT( I ) = CT( I ) + A( I, K )*B( J, K ) + G( I ) = G( I ) + ABS( A( I, K ) )*ABS( B( J, K ) ) + 60 CONTINUE + 70 CONTINUE + ELSE IF( TRANA.AND.TRANB )THEN + DO 90 K = 1, KK + DO 80 I = 1, M + CT( I ) = CT( I ) + A( K, I )*B( J, K ) + G( I ) = G( I ) + ABS( A( K, I ) )*ABS( B( J, K ) ) + 80 CONTINUE + 90 CONTINUE + END IF + DO 100 I = 1, M + CT( I ) = ALPHA*CT( I ) + BETA*C( I, J ) + G( I ) = ABS( ALPHA )*G( I ) + ABS( BETA )*ABS( C( I, J ) ) + 100 CONTINUE +* +* Compute the error ratio for this result. +* + ERR = ZERO + DO 110 I = 1, M + ERRI = ABS( CT( I ) - CC( I, J ) )/EPS + IF( G( I ).NE.ZERO ) + $ ERRI = ERRI/G( I ) + ERR = MAX( ERR, ERRI ) + IF( ERR*SQRT( EPS ).GE.ONE ) + $ GO TO 130 + 110 CONTINUE +* + 120 CONTINUE +* +* If the loop completes, all results are at least half accurate. + GO TO 150 +* +* Report fatal error. +* + 130 FATAL = .TRUE. + WRITE( NOUT, FMT = 9999 ) + DO 140 I = 1, M + IF( MV )THEN + WRITE( NOUT, FMT = 9998 )I, CT( I ), CC( I, J ) + ELSE + WRITE( NOUT, FMT = 9998 )I, CC( I, J ), CT( I ) + END IF + 140 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9997 )J +* + 150 CONTINUE + RETURN +* + 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', + $ 'F ACCURATE *******', /' EXPECTED RESULT COMPU', + $ 'TED RESULT' ) + 9998 FORMAT( 1X, I7, 2G18.6 ) + 9997 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) +* +* End of SMMCH. +* + END + LOGICAL FUNCTION LSE( RI, RJ, LR ) +* +* Tests if two arrays are identical. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER LR +* .. Array Arguments .. + REAL RI( * ), RJ( * ) +* .. Local Scalars .. + INTEGER I +* .. Executable Statements .. + DO 10 I = 1, LR + IF( RI( I ).NE.RJ( I ) ) + $ GO TO 20 + 10 CONTINUE + LSE = .TRUE. + GO TO 30 + 20 CONTINUE + LSE = .FALSE. + 30 RETURN +* +* End of LSE. +* + END + LOGICAL FUNCTION LSERES( TYPE, UPLO, M, N, AA, AS, LDA ) +* +* Tests if selected elements in two arrays are equal. +* +* TYPE is 'GE' or 'SY'. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER LDA, M, N + CHARACTER*1 UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + REAL AA( LDA, * ), AS( LDA, * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J + LOGICAL UPPER +* .. Executable Statements .. 
+ UPPER = UPLO.EQ.'U' + IF( TYPE.EQ.'GE' )THEN + DO 20 J = 1, N + DO 10 I = M + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 10 CONTINUE + 20 CONTINUE + ELSE IF( TYPE.EQ.'SY' )THEN + DO 50 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 30 I = 1, IBEG - 1 + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 30 CONTINUE + DO 40 I = IEND + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 40 CONTINUE + 50 CONTINUE + END IF +* + 60 CONTINUE + LSERES = .TRUE. + GO TO 80 + 70 CONTINUE + LSERES = .FALSE. + 80 RETURN +* +* End of LSERES. +* + END + REAL FUNCTION SBEG( RESET ) +* +* Generates random numbers uniformly distributed between -0.5 and 0.5. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + LOGICAL RESET +* .. Local Scalars .. + INTEGER I, IC, MI +* .. Save statement .. + SAVE I, IC, MI +* .. Executable Statements .. + IF( RESET )THEN +* Initialize local variables. + MI = 891 + I = 7 + IC = 0 + RESET = .FALSE. + END IF +* +* The sequence of values of I is bounded between 1 and 999. +* If initial I = 1,2,3,6,7 or 9, the period will be 50. +* If initial I = 4 or 8, the period will be 25. +* If initial I = 5, the period will be 10. +* IC is used to break up the period by skipping 1 value of I in 6. +* + IC = IC + 1 + 10 I = I*MI + I = I - 1000*( I/1000 ) + IF( IC.GE.5 )THEN + IC = 0 + GO TO 10 + END IF + SBEG = ( I - 500 )/1001.0 + RETURN +* +* End of SBEG. +* + END + REAL FUNCTION SDIFF( X, Y ) +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + REAL X, Y +* .. Executable Statements .. + SDIFF = X - Y + RETURN +* +* End of SDIFF. +* + END + SUBROUTINE CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) +* +* Tests whether XERBLA has detected an error when it should. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER INFOT, NOUT + LOGICAL LERR, OK + CHARACTER*6 SRNAMT +* .. Executable Statements .. + IF( .NOT.LERR )THEN + WRITE( NOUT, FMT = 9999 )INFOT, SRNAMT + OK = .FALSE. + END IF + LERR = .FALSE. + RETURN +* + 9999 FORMAT( ' ***** ILLEGAL VALUE OF PARAMETER NUMBER ', I2, ' NOT D', + $ 'ETECTED BY ', A6, ' *****' ) +* +* End of CHKXER. +* + END + SUBROUTINE XERBLA( SRNAME, INFO ) +* +* This is a special version of XERBLA to be used only as part of +* the test program for testing error exits from the Level 3 BLAS +* routines. +* +* XERBLA is an error handler for the Level 3 BLAS routines. +* +* It is called by the Level 3 BLAS routines if an input parameter is +* invalid. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. 
+ INTEGER INFO + CHARACTER*6 SRNAME +* .. Scalars in Common .. + INTEGER INFOT, NOUT + LOGICAL LERR, OK + CHARACTER*6 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUT, OK, LERR + COMMON /SRNAMC/SRNAMT +* .. Executable Statements .. + LERR = .TRUE. + IF( INFO.NE.INFOT )THEN + IF( INFOT.NE.0 )THEN + WRITE( NOUT, FMT = 9999 )INFO, INFOT + ELSE + WRITE( NOUT, FMT = 9997 )INFO + END IF + OK = .FALSE. + END IF + IF( SRNAME.NE.SRNAMT )THEN + WRITE( NOUT, FMT = 9998 )SRNAME, SRNAMT + OK = .FALSE. + END IF + RETURN +* + 9999 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, ' INSTEAD', + $ ' OF ', I2, ' *******' ) + 9998 FORMAT( ' ******* XERBLA WAS CALLED WITH SRNAME = ', A6, ' INSTE', + $ 'AD OF ', A6, ' *******' ) + 9997 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, + $ ' *******' ) +* +* End of XERBLA +* + END + diff --git a/test/zblat1.f b/test/zblat1.f new file mode 100644 index 0000000000..e2415e1c46 --- /dev/null +++ b/test/zblat1.f @@ -0,0 +1,681 @@ + PROGRAM ZBLAT1 +* Test program for the COMPLEX*16 Level 1 BLAS. +* Based upon the original BLAS test routine together with: +* F06GAF Example Program Text +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + DOUBLE PRECISION SFAC + INTEGER IC +* .. External Subroutines .. + EXTERNAL CHECK1, CHECK2, HEADER +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA SFAC/9.765625D-4/ +* .. Executable Statements .. + WRITE (NOUT,99999) + DO 20 IC = 1, 10 + ICASE = IC + CALL HEADER +* +* Initialize PASS, INCX, INCY, and MODE for a new case. +* The value 9999 for INCX, INCY or MODE will appear in the +* detailed output, if any, for cases that do not involve +* these parameters. +* + PASS = .TRUE. + INCX = 9999 + INCY = 9999 + MODE = 9999 + IF (ICASE.LE.5) THEN + CALL CHECK2(SFAC) + ELSE IF (ICASE.GE.6) THEN + CALL CHECK1(SFAC) + END IF +* -- Print + IF (PASS) WRITE (NOUT,99998) + 20 CONTINUE + STOP +* +99999 FORMAT (' Complex BLAS Test Program Results',/1X) +99998 FORMAT (' ----- PASS -----') + END + SUBROUTINE HEADER +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Arrays .. + CHARACTER*6 L(10) +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA L(1)/'ZDOTC '/ + DATA L(2)/'ZDOTU '/ + DATA L(3)/'ZAXPY '/ + DATA L(4)/'ZCOPY '/ + DATA L(5)/'ZSWAP '/ + DATA L(6)/'DZNRM2'/ + DATA L(7)/'DZASUM'/ + DATA L(8)/'ZSCAL '/ + DATA L(9)/'ZDSCAL'/ + DATA L(10)/'IZAMAX'/ +* .. Executable Statements .. + WRITE (NOUT,99999) ICASE, L(ICASE) + RETURN +* +99999 FORMAT (/' Test of subprogram number',I3,12X,A6) + END + SUBROUTINE CHECK1(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + DOUBLE PRECISION SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + COMPLEX*16 CA + DOUBLE PRECISION SA + INTEGER I, J, LEN, NP1 +* .. Local Arrays .. + COMPLEX*16 CTRUE5(8,5,2), CTRUE6(8,5,2), CV(8,5,2), CX(8), + + MWPCS(5), MWPCT(5) + DOUBLE PRECISION STRUE2(5), STRUE4(5) + INTEGER ITRUE3(5) +* .. External Functions .. + DOUBLE PRECISION DZASUM, DZNRM2 + INTEGER IZAMAX + EXTERNAL DZASUM, DZNRM2, IZAMAX +* .. External Subroutines .. + EXTERNAL ZSCAL, ZDSCAL, CTEST, ITEST1, STEST1 +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Common blocks .. 
+ COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Data statements .. + DATA SA, CA/0.3D0, (0.4D0,-0.7D0)/ + DATA ((CV(I,J,1),I=1,8),J=1,5)/(0.1D0,0.1D0), + + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), + + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), + + (1.0D0,2.0D0), (0.3D0,-0.4D0), (3.0D0,4.0D0), + + (3.0D0,4.0D0), (3.0D0,4.0D0), (3.0D0,4.0D0), + + (3.0D0,4.0D0), (3.0D0,4.0D0), (3.0D0,4.0D0), + + (0.1D0,-0.3D0), (0.5D0,-0.1D0), (5.0D0,6.0D0), + + (5.0D0,6.0D0), (5.0D0,6.0D0), (5.0D0,6.0D0), + + (5.0D0,6.0D0), (5.0D0,6.0D0), (0.1D0,0.1D0), + + (-0.6D0,0.1D0), (0.1D0,-0.3D0), (7.0D0,8.0D0), + + (7.0D0,8.0D0), (7.0D0,8.0D0), (7.0D0,8.0D0), + + (7.0D0,8.0D0), (0.3D0,0.1D0), (0.1D0,0.4D0), + + (0.4D0,0.1D0), (0.1D0,0.2D0), (2.0D0,3.0D0), + + (2.0D0,3.0D0), (2.0D0,3.0D0), (2.0D0,3.0D0)/ + DATA ((CV(I,J,2),I=1,8),J=1,5)/(0.1D0,0.1D0), + + (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0), + + (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0), + + (4.0D0,5.0D0), (0.3D0,-0.4D0), (6.0D0,7.0D0), + + (6.0D0,7.0D0), (6.0D0,7.0D0), (6.0D0,7.0D0), + + (6.0D0,7.0D0), (6.0D0,7.0D0), (6.0D0,7.0D0), + + (0.1D0,-0.3D0), (8.0D0,9.0D0), (0.5D0,-0.1D0), + + (2.0D0,5.0D0), (2.0D0,5.0D0), (2.0D0,5.0D0), + + (2.0D0,5.0D0), (2.0D0,5.0D0), (0.1D0,0.1D0), + + (3.0D0,6.0D0), (-0.6D0,0.1D0), (4.0D0,7.0D0), + + (0.1D0,-0.3D0), (7.0D0,2.0D0), (7.0D0,2.0D0), + + (7.0D0,2.0D0), (0.3D0,0.1D0), (5.0D0,8.0D0), + + (0.1D0,0.4D0), (6.0D0,9.0D0), (0.4D0,0.1D0), + + (8.0D0,3.0D0), (0.1D0,0.2D0), (9.0D0,4.0D0)/ + DATA STRUE2/0.0D0, 0.5D0, 0.6D0, 0.7D0, 0.7D0/ + DATA STRUE4/0.0D0, 0.7D0, 1.0D0, 1.3D0, 1.7D0/ + DATA ((CTRUE5(I,J,1),I=1,8),J=1,5)/(0.1D0,0.1D0), + + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), + + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), + + (1.0D0,2.0D0), (-0.16D0,-0.37D0), (3.0D0,4.0D0), + + (3.0D0,4.0D0), (3.0D0,4.0D0), (3.0D0,4.0D0), + + (3.0D0,4.0D0), (3.0D0,4.0D0), (3.0D0,4.0D0), + + (-0.17D0,-0.19D0), (0.13D0,-0.39D0), + + (5.0D0,6.0D0), (5.0D0,6.0D0), (5.0D0,6.0D0), + + (5.0D0,6.0D0), (5.0D0,6.0D0), (5.0D0,6.0D0), + + (0.11D0,-0.03D0), (-0.17D0,0.46D0), + + (-0.17D0,-0.19D0), (7.0D0,8.0D0), (7.0D0,8.0D0), + + (7.0D0,8.0D0), (7.0D0,8.0D0), (7.0D0,8.0D0), + + (0.19D0,-0.17D0), (0.32D0,0.09D0), + + (0.23D0,-0.24D0), (0.18D0,0.01D0), + + (2.0D0,3.0D0), (2.0D0,3.0D0), (2.0D0,3.0D0), + + (2.0D0,3.0D0)/ + DATA ((CTRUE5(I,J,2),I=1,8),J=1,5)/(0.1D0,0.1D0), + + (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0), + + (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0), + + (4.0D0,5.0D0), (-0.16D0,-0.37D0), (6.0D0,7.0D0), + + (6.0D0,7.0D0), (6.0D0,7.0D0), (6.0D0,7.0D0), + + (6.0D0,7.0D0), (6.0D0,7.0D0), (6.0D0,7.0D0), + + (-0.17D0,-0.19D0), (8.0D0,9.0D0), + + (0.13D0,-0.39D0), (2.0D0,5.0D0), (2.0D0,5.0D0), + + (2.0D0,5.0D0), (2.0D0,5.0D0), (2.0D0,5.0D0), + + (0.11D0,-0.03D0), (3.0D0,6.0D0), + + (-0.17D0,0.46D0), (4.0D0,7.0D0), + + (-0.17D0,-0.19D0), (7.0D0,2.0D0), (7.0D0,2.0D0), + + (7.0D0,2.0D0), (0.19D0,-0.17D0), (5.0D0,8.0D0), + + (0.32D0,0.09D0), (6.0D0,9.0D0), + + (0.23D0,-0.24D0), (8.0D0,3.0D0), + + (0.18D0,0.01D0), (9.0D0,4.0D0)/ + DATA ((CTRUE6(I,J,1),I=1,8),J=1,5)/(0.1D0,0.1D0), + + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), + + (1.0D0,2.0D0), (1.0D0,2.0D0), (1.0D0,2.0D0), + + (1.0D0,2.0D0), (0.09D0,-0.12D0), (3.0D0,4.0D0), + + (3.0D0,4.0D0), (3.0D0,4.0D0), (3.0D0,4.0D0), + + (3.0D0,4.0D0), (3.0D0,4.0D0), (3.0D0,4.0D0), + + (0.03D0,-0.09D0), (0.15D0,-0.03D0), + + (5.0D0,6.0D0), (5.0D0,6.0D0), (5.0D0,6.0D0), + + (5.0D0,6.0D0), (5.0D0,6.0D0), (5.0D0,6.0D0), + + (0.03D0,0.03D0), (-0.18D0,0.03D0), + + (0.03D0,-0.09D0), 
(7.0D0,8.0D0), (7.0D0,8.0D0), + + (7.0D0,8.0D0), (7.0D0,8.0D0), (7.0D0,8.0D0), + + (0.09D0,0.03D0), (0.03D0,0.12D0), + + (0.12D0,0.03D0), (0.03D0,0.06D0), (2.0D0,3.0D0), + + (2.0D0,3.0D0), (2.0D0,3.0D0), (2.0D0,3.0D0)/ + DATA ((CTRUE6(I,J,2),I=1,8),J=1,5)/(0.1D0,0.1D0), + + (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0), + + (4.0D0,5.0D0), (4.0D0,5.0D0), (4.0D0,5.0D0), + + (4.0D0,5.0D0), (0.09D0,-0.12D0), (6.0D0,7.0D0), + + (6.0D0,7.0D0), (6.0D0,7.0D0), (6.0D0,7.0D0), + + (6.0D0,7.0D0), (6.0D0,7.0D0), (6.0D0,7.0D0), + + (0.03D0,-0.09D0), (8.0D0,9.0D0), + + (0.15D0,-0.03D0), (2.0D0,5.0D0), (2.0D0,5.0D0), + + (2.0D0,5.0D0), (2.0D0,5.0D0), (2.0D0,5.0D0), + + (0.03D0,0.03D0), (3.0D0,6.0D0), + + (-0.18D0,0.03D0), (4.0D0,7.0D0), + + (0.03D0,-0.09D0), (7.0D0,2.0D0), (7.0D0,2.0D0), + + (7.0D0,2.0D0), (0.09D0,0.03D0), (5.0D0,8.0D0), + + (0.03D0,0.12D0), (6.0D0,9.0D0), (0.12D0,0.03D0), + + (8.0D0,3.0D0), (0.03D0,0.06D0), (9.0D0,4.0D0)/ + DATA ITRUE3/0, 1, 2, 2, 2/ +* .. Executable Statements .. + DO 60 INCX = 1, 2 + DO 40 NP1 = 1, 5 + N = NP1 - 1 + LEN = 2*MAX(N,1) +* .. Set vector arguments .. + DO 20 I = 1, LEN + CX(I) = CV(I,NP1,INCX) + 20 CONTINUE + IF (ICASE.EQ.6) THEN +* .. DZNRM2 .. + CALL STEST1(DZNRM2(N,CX,INCX),STRUE2(NP1),STRUE2(NP1), + + SFAC) + ELSE IF (ICASE.EQ.7) THEN +* .. DZASUM .. + CALL STEST1(DZASUM(N,CX,INCX),STRUE4(NP1),STRUE4(NP1), + + SFAC) + ELSE IF (ICASE.EQ.8) THEN +* .. ZSCAL .. + CALL ZSCAL(N,CA,CX,INCX) + CALL CTEST(LEN,CX,CTRUE5(1,NP1,INCX),CTRUE5(1,NP1,INCX), + + SFAC) + ELSE IF (ICASE.EQ.9) THEN +* .. ZDSCAL .. + CALL ZDSCAL(N,SA,CX,INCX) + CALL CTEST(LEN,CX,CTRUE6(1,NP1,INCX),CTRUE6(1,NP1,INCX), + + SFAC) + ELSE IF (ICASE.EQ.10) THEN +* .. IZAMAX .. + CALL ITEST1(IZAMAX(N,CX,INCX),ITRUE3(NP1)) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' + STOP + END IF +* + 40 CONTINUE + 60 CONTINUE +* + INCX = 1 + IF (ICASE.EQ.8) THEN +* ZSCAL +* Add a test for alpha equal to zero. + CA = (0.0D0,0.0D0) + DO 80 I = 1, 5 + MWPCT(I) = (0.0D0,0.0D0) + MWPCS(I) = (1.0D0,1.0D0) + 80 CONTINUE + CALL ZSCAL(5,CA,CX,INCX) + CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) + ELSE IF (ICASE.EQ.9) THEN +* ZDSCAL +* Add a test for alpha equal to zero. + SA = 0.0D0 + DO 100 I = 1, 5 + MWPCT(I) = (0.0D0,0.0D0) + MWPCS(I) = (1.0D0,1.0D0) + 100 CONTINUE + CALL ZDSCAL(5,SA,CX,INCX) + CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) +* Add a test for alpha equal to one. + SA = 1.0D0 + DO 120 I = 1, 5 + MWPCT(I) = CX(I) + MWPCS(I) = CX(I) + 120 CONTINUE + CALL ZDSCAL(5,SA,CX,INCX) + CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) +* Add a test for alpha equal to minus one. + SA = -1.0D0 + DO 140 I = 1, 5 + MWPCT(I) = -CX(I) + MWPCS(I) = -CX(I) + 140 CONTINUE + CALL ZDSCAL(5,SA,CX,INCX) + CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) + END IF + RETURN + END + SUBROUTINE CHECK2(SFAC) +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + DOUBLE PRECISION SFAC +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + COMPLEX*16 CA + INTEGER I, J, KI, KN, KSIZE, LENX, LENY, MX, MY +* .. Local Arrays .. + COMPLEX*16 CDOT(1), CSIZE1(4), CSIZE2(7,2), CSIZE3(14), + + CT10X(7,4,4), CT10Y(7,4,4), CT6(4,4), CT7(4,4), + + CT8(7,4,4), CX(7), CX1(7), CY(7), CY1(7) + INTEGER INCXS(4), INCYS(4), LENS(4,2), NS(4) +* .. External Functions .. + COMPLEX*16 ZDOTC, ZDOTU + EXTERNAL ZDOTC, ZDOTU +* .. External Subroutines .. + EXTERNAL ZAXPY, ZCOPY, ZSWAP, CTEST +* .. Intrinsic Functions .. + INTRINSIC ABS, MIN +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. 
Data statements .. + DATA CA/(0.4D0,-0.7D0)/ + DATA INCXS/1, 2, -2, -1/ + DATA INCYS/1, -2, 1, -2/ + DATA LENS/1, 1, 2, 4, 1, 1, 3, 7/ + DATA NS/0, 1, 2, 4/ + DATA CX1/(0.7D0,-0.8D0), (-0.4D0,-0.7D0), + + (-0.1D0,-0.9D0), (0.2D0,-0.8D0), + + (-0.9D0,-0.4D0), (0.1D0,0.4D0), (-0.6D0,0.6D0)/ + DATA CY1/(0.6D0,-0.6D0), (-0.9D0,0.5D0), + + (0.7D0,-0.6D0), (0.1D0,-0.5D0), (-0.1D0,-0.2D0), + + (-0.5D0,-0.3D0), (0.8D0,-0.7D0)/ + DATA ((CT8(I,J,1),I=1,7),J=1,4)/(0.6D0,-0.6D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.32D0,-1.41D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.32D0,-1.41D0), + + (-1.55D0,0.5D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.32D0,-1.41D0), (-1.55D0,0.5D0), + + (0.03D0,-0.89D0), (-0.38D0,-0.96D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0)/ + DATA ((CT8(I,J,2),I=1,7),J=1,4)/(0.6D0,-0.6D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.32D0,-1.41D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (-0.07D0,-0.89D0), + + (-0.9D0,0.5D0), (0.42D0,-1.41D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.78D0,0.06D0), (-0.9D0,0.5D0), + + (0.06D0,-0.13D0), (0.1D0,-0.5D0), + + (-0.77D0,-0.49D0), (-0.5D0,-0.3D0), + + (0.52D0,-1.51D0)/ + DATA ((CT8(I,J,3),I=1,7),J=1,4)/(0.6D0,-0.6D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.32D0,-1.41D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (-0.07D0,-0.89D0), + + (-1.18D0,-0.31D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.78D0,0.06D0), (-1.54D0,0.97D0), + + (0.03D0,-0.89D0), (-0.18D0,-1.31D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0)/ + DATA ((CT8(I,J,4),I=1,7),J=1,4)/(0.6D0,-0.6D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.32D0,-1.41D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.32D0,-1.41D0), (-0.9D0,0.5D0), + + (0.05D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.32D0,-1.41D0), + + (-0.9D0,0.5D0), (0.05D0,-0.6D0), (0.1D0,-0.5D0), + + (-0.77D0,-0.49D0), (-0.5D0,-0.3D0), + + (0.32D0,-1.16D0)/ + DATA CT7/(0.0D0,0.0D0), (-0.06D0,-0.90D0), + + (0.65D0,-0.47D0), (-0.34D0,-1.22D0), + + (0.0D0,0.0D0), (-0.06D0,-0.90D0), + + (-0.59D0,-1.46D0), (-1.04D0,-0.04D0), + + (0.0D0,0.0D0), (-0.06D0,-0.90D0), + + (-0.83D0,0.59D0), (0.07D0,-0.37D0), + + (0.0D0,0.0D0), (-0.06D0,-0.90D0), + + (-0.76D0,-1.15D0), (-1.33D0,-1.82D0)/ + DATA CT6/(0.0D0,0.0D0), (0.90D0,0.06D0), + + (0.91D0,-0.77D0), (1.80D0,-0.10D0), + + (0.0D0,0.0D0), (0.90D0,0.06D0), (1.45D0,0.74D0), + + (0.20D0,0.90D0), (0.0D0,0.0D0), (0.90D0,0.06D0), + + (-0.55D0,0.23D0), (0.83D0,-0.39D0), + + (0.0D0,0.0D0), (0.90D0,0.06D0), (1.04D0,0.79D0), + + (1.95D0,1.22D0)/ + DATA ((CT10X(I,J,1),I=1,7),J=1,4)/(0.7D0,-0.8D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.6D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.6D0,-0.6D0), (-0.9D0,0.5D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.6D0,-0.6D0), + + (-0.9D0,0.5D0), 
(0.7D0,-0.6D0), (0.1D0,-0.5D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0)/ + DATA ((CT10X(I,J,2),I=1,7),J=1,4)/(0.7D0,-0.8D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.6D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.7D0,-0.6D0), (-0.4D0,-0.7D0), + + (0.6D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.8D0,-0.7D0), + + (-0.4D0,-0.7D0), (-0.1D0,-0.2D0), + + (0.2D0,-0.8D0), (0.7D0,-0.6D0), (0.1D0,0.4D0), + + (0.6D0,-0.6D0)/ + DATA ((CT10X(I,J,3),I=1,7),J=1,4)/(0.7D0,-0.8D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.6D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (-0.9D0,0.5D0), (-0.4D0,-0.7D0), + + (0.6D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.1D0,-0.5D0), + + (-0.4D0,-0.7D0), (0.7D0,-0.6D0), (0.2D0,-0.8D0), + + (-0.9D0,0.5D0), (0.1D0,0.4D0), (0.6D0,-0.6D0)/ + DATA ((CT10X(I,J,4),I=1,7),J=1,4)/(0.7D0,-0.8D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.6D0,-0.6D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.6D0,-0.6D0), (0.7D0,-0.6D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.6D0,-0.6D0), + + (0.7D0,-0.6D0), (-0.1D0,-0.2D0), (0.8D0,-0.7D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0)/ + DATA ((CT10Y(I,J,1),I=1,7),J=1,4)/(0.6D0,-0.6D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.7D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.7D0,-0.8D0), (-0.4D0,-0.7D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.7D0,-0.8D0), + + (-0.4D0,-0.7D0), (-0.1D0,-0.9D0), + + (0.2D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0)/ + DATA ((CT10Y(I,J,2),I=1,7),J=1,4)/(0.6D0,-0.6D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.7D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (-0.1D0,-0.9D0), (-0.9D0,0.5D0), + + (0.7D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (-0.6D0,0.6D0), + + (-0.9D0,0.5D0), (-0.9D0,-0.4D0), (0.1D0,-0.5D0), + + (-0.1D0,-0.9D0), (-0.5D0,-0.3D0), + + (0.7D0,-0.8D0)/ + DATA ((CT10Y(I,J,3),I=1,7),J=1,4)/(0.6D0,-0.6D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.7D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (-0.1D0,-0.9D0), (0.7D0,-0.8D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (-0.6D0,0.6D0), + + (-0.9D0,-0.4D0), (-0.1D0,-0.9D0), + + (0.7D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0)/ + DATA ((CT10Y(I,J,4),I=1,7),J=1,4)/(0.6D0,-0.6D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.7D0,-0.8D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.7D0,-0.8D0), (-0.9D0,0.5D0), + + (-0.4D0,-0.7D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.7D0,-0.8D0), + + (-0.9D0,0.5D0), 
(-0.4D0,-0.7D0), (0.1D0,-0.5D0), + + (-0.1D0,-0.9D0), (-0.5D0,-0.3D0), + + (0.2D0,-0.8D0)/ + DATA CSIZE1/(0.0D0,0.0D0), (0.9D0,0.9D0), + + (1.63D0,1.73D0), (2.90D0,2.78D0)/ + DATA CSIZE3/(0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (1.17D0,1.17D0), + + (1.17D0,1.17D0), (1.17D0,1.17D0), + + (1.17D0,1.17D0), (1.17D0,1.17D0), + + (1.17D0,1.17D0), (1.17D0,1.17D0)/ + DATA CSIZE2/(0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (0.0D0,0.0D0), + + (0.0D0,0.0D0), (0.0D0,0.0D0), (1.54D0,1.54D0), + + (1.54D0,1.54D0), (1.54D0,1.54D0), + + (1.54D0,1.54D0), (1.54D0,1.54D0), + + (1.54D0,1.54D0), (1.54D0,1.54D0)/ +* .. Executable Statements .. + DO 60 KI = 1, 4 + INCX = INCXS(KI) + INCY = INCYS(KI) + MX = ABS(INCX) + MY = ABS(INCY) +* + DO 40 KN = 1, 4 + N = NS(KN) + KSIZE = MIN(2,KN) + LENX = LENS(KN,MX) + LENY = LENS(KN,MY) +* .. initialize all argument arrays .. + DO 20 I = 1, 7 + CX(I) = CX1(I) + CY(I) = CY1(I) + 20 CONTINUE + IF (ICASE.EQ.1) THEN +* .. ZDOTC .. + CDOT(1) = ZDOTC(N,CX,INCX,CY,INCY) + CALL CTEST(1,CDOT,CT6(KN,KI),CSIZE1(KN),SFAC) + ELSE IF (ICASE.EQ.2) THEN +* .. ZDOTU .. + CDOT(1) = ZDOTU(N,CX,INCX,CY,INCY) + CALL CTEST(1,CDOT,CT7(KN,KI),CSIZE1(KN),SFAC) + ELSE IF (ICASE.EQ.3) THEN +* .. ZAXPY .. + CALL ZAXPY(N,CA,CX,INCX,CY,INCY) + CALL CTEST(LENY,CY,CT8(1,KN,KI),CSIZE2(1,KSIZE),SFAC) + ELSE IF (ICASE.EQ.4) THEN +* .. ZCOPY .. + CALL ZCOPY(N,CX,INCX,CY,INCY) + CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0D0) + ELSE IF (ICASE.EQ.5) THEN +* .. ZSWAP .. + CALL ZSWAP(N,CX,INCX,CY,INCY) + CALL CTEST(LENX,CX,CT10X(1,KN,KI),CSIZE3,1.0D0) + CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0D0) + ELSE + WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' + STOP + END IF +* + 40 CONTINUE + 60 CONTINUE + RETURN + END + SUBROUTINE STEST(LEN,SCOMP,STRUE,SSIZE,SFAC) +* ********************************* STEST ************************** +* +* THIS SUBR COMPARES ARRAYS SCOMP() AND STRUE() OF LENGTH LEN TO +* SEE IF THE TERM BY TERM DIFFERENCES, MULTIPLIED BY SFAC, ARE +* NEGLIGIBLE. +* +* C. L. LAWSON, JPL, 1974 DEC 10 +* +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + DOUBLE PRECISION SFAC + INTEGER LEN +* .. Array Arguments .. + DOUBLE PRECISION SCOMP(LEN), SSIZE(LEN), STRUE(LEN) +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + DOUBLE PRECISION SD + INTEGER I +* .. External Functions .. + DOUBLE PRECISION SDIFF + EXTERNAL SDIFF +* .. Intrinsic Functions .. + INTRINSIC ABS +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Executable Statements .. +* + DO 40 I = 1, LEN + SD = SCOMP(I) - STRUE(I) + IF (SDIFF(ABS(SSIZE(I))+ABS(SFAC*SD),ABS(SSIZE(I))).EQ.0.0D0) + + GO TO 40 +* +* HERE SCOMP(I) IS NOT CLOSE TO STRUE(I). +* + IF ( .NOT. PASS) GO TO 20 +* PRINT FAIL MESSAGE AND HEADER. + PASS = .FALSE. 
+ WRITE (NOUT,99999) + WRITE (NOUT,99998) + 20 WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, I, SCOMP(I), + + STRUE(I), SD, SSIZE(I) + 40 CONTINUE + RETURN +* +99999 FORMAT (' FAIL') +99998 FORMAT (/' CASE N INCX INCY MODE I ', + + ' COMP(I) TRUE(I) DIFFERENCE', + + ' SIZE(I)',/1X) +99997 FORMAT (1X,I4,I3,3I5,I3,2D36.8,2D12.4) + END + SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) +* ************************* STEST1 ***************************** +* +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN +* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE +* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. +* +* C.L. LAWSON, JPL, 1978 DEC 6 +* +* .. Scalar Arguments .. + DOUBLE PRECISION SCOMP1, SFAC, STRUE1 +* .. Array Arguments .. + DOUBLE PRECISION SSIZE(*) +* .. Local Arrays .. + DOUBLE PRECISION SCOMP(1), STRUE(1) +* .. External Subroutines .. + EXTERNAL STEST +* .. Executable Statements .. +* + SCOMP(1) = SCOMP1 + STRUE(1) = STRUE1 + CALL STEST(1,SCOMP,STRUE,SSIZE,SFAC) +* + RETURN + END + DOUBLE PRECISION FUNCTION SDIFF(SA,SB) +* ********************************* SDIFF ************************** +* COMPUTES DIFFERENCE OF TWO NUMBERS. C. L. LAWSON, JPL 1974 FEB 15 +* +* .. Scalar Arguments .. + DOUBLE PRECISION SA, SB +* .. Executable Statements .. + SDIFF = SA - SB + RETURN + END + SUBROUTINE CTEST(LEN,CCOMP,CTRUE,CSIZE,SFAC) +* **************************** CTEST ***************************** +* +* C.L. LAWSON, JPL, 1978 DEC 6 +* +* .. Scalar Arguments .. + DOUBLE PRECISION SFAC + INTEGER LEN +* .. Array Arguments .. + COMPLEX*16 CCOMP(LEN), CSIZE(LEN), CTRUE(LEN) +* .. Local Scalars .. + INTEGER I +* .. Local Arrays .. + DOUBLE PRECISION SCOMP(20), SSIZE(20), STRUE(20) +* .. External Subroutines .. + EXTERNAL STEST +* .. Intrinsic Functions .. + INTRINSIC DIMAG, DBLE +* .. Executable Statements .. + DO 20 I = 1, LEN + SCOMP(2*I-1) = DBLE(CCOMP(I)) + SCOMP(2*I) = DIMAG(CCOMP(I)) + STRUE(2*I-1) = DBLE(CTRUE(I)) + STRUE(2*I) = DIMAG(CTRUE(I)) + SSIZE(2*I-1) = DBLE(CSIZE(I)) + SSIZE(2*I) = DIMAG(CSIZE(I)) + 20 CONTINUE +* + CALL STEST(2*LEN,SCOMP,STRUE,SSIZE,SFAC) + RETURN + END + SUBROUTINE ITEST1(ICOMP,ITRUE) +* ********************************* ITEST1 ************************* +* +* THIS SUBROUTINE COMPARES THE VARIABLES ICOMP AND ITRUE FOR +* EQUALITY. +* C. L. LAWSON, JPL, 1974 DEC 10 +* +* .. Parameters .. + INTEGER NOUT + PARAMETER (NOUT=6) +* .. Scalar Arguments .. + INTEGER ICOMP, ITRUE +* .. Scalars in Common .. + INTEGER ICASE, INCX, INCY, MODE, N + LOGICAL PASS +* .. Local Scalars .. + INTEGER ID +* .. Common blocks .. + COMMON /COMBLA/ICASE, N, INCX, INCY, MODE, PASS +* .. Executable Statements .. + IF (ICOMP.EQ.ITRUE) GO TO 40 +* +* HERE ICOMP IS NOT EQUAL TO ITRUE. +* + IF ( .NOT. PASS) GO TO 20 +* PRINT FAIL MESSAGE AND HEADER. + PASS = .FALSE. + WRITE (NOUT,99999) + WRITE (NOUT,99998) + 20 ID = ICOMP - ITRUE + WRITE (NOUT,99997) ICASE, N, INCX, INCY, MODE, ICOMP, ITRUE, ID + 40 CONTINUE + RETURN +* +99999 FORMAT (' FAIL') +99998 FORMAT (/' CASE N INCX INCY MODE ', + + ' COMP TRUE DIFFERENCE', + + /1X) +99997 FORMAT (1X,I4,I3,3I5,2I36,I12) + END diff --git a/test/zblat2.dat b/test/zblat2.dat new file mode 100644 index 0000000000..69a9f150d1 --- /dev/null +++ b/test/zblat2.dat @@ -0,0 +1,35 @@ +'ZBLAT2.SUMM' NAME OF SUMMARY OUTPUT FILE +6 UNIT NUMBER OF SUMMARY FILE +'CBLA2T.SNAP' NAME OF SNAPSHOT OUTPUT FILE +-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. 
+F LOGICAL FLAG, T TO STOP ON FAILURES. +T LOGICAL FLAG, T TO TEST ERROR EXITS. +16.0 THRESHOLD VALUE OF TEST RATIO +7 NUMBER OF VALUES OF N +0 1 2 3 7 31 63 VALUES OF N +4 NUMBER OF VALUES OF K +0 1 2 4 VALUES OF K +4 NUMBER OF VALUES OF INCX AND INCY +1 2 -1 -2 VALUES OF INCX AND INCY +3 NUMBER OF VALUES OF ALPHA +(0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +3 NUMBER OF VALUES OF BETA +(0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +ZGEMV T PUT F FOR NO TEST. SAME COLUMNS. +ZGBMV T PUT F FOR NO TEST. SAME COLUMNS. +ZHEMV T PUT F FOR NO TEST. SAME COLUMNS. +ZHBMV T PUT F FOR NO TEST. SAME COLUMNS. +ZHPMV T PUT F FOR NO TEST. SAME COLUMNS. +ZTRMV T PUT F FOR NO TEST. SAME COLUMNS. +ZTBMV T PUT F FOR NO TEST. SAME COLUMNS. +ZTPMV T PUT F FOR NO TEST. SAME COLUMNS. +ZTRSV T PUT F FOR NO TEST. SAME COLUMNS. +ZTBSV T PUT F FOR NO TEST. SAME COLUMNS. +ZTPSV T PUT F FOR NO TEST. SAME COLUMNS. +ZGERC T PUT F FOR NO TEST. SAME COLUMNS. +ZGERU T PUT F FOR NO TEST. SAME COLUMNS. +ZHER T PUT F FOR NO TEST. SAME COLUMNS. +ZHPR T PUT F FOR NO TEST. SAME COLUMNS. +ZHER2 T PUT F FOR NO TEST. SAME COLUMNS. +ZHPR2 T PUT F FOR NO TEST. SAME COLUMNS. diff --git a/test/zblat2.f b/test/zblat2.f new file mode 100644 index 0000000000..e65cdcc703 --- /dev/null +++ b/test/zblat2.f @@ -0,0 +1,3249 @@ + PROGRAM ZBLAT2 +* +* Test program for the COMPLEX*16 Level 2 Blas. +* +* The program must be driven by a short data file. The first 18 records +* of the file are read using list-directed input, the last 17 records +* are read using the format ( A6, L2 ). An annotated example of a data +* file can be obtained by deleting the first 3 characters from the +* following 35 lines: +* 'ZBLAT2.SUMM' NAME OF SUMMARY OUTPUT FILE +* 6 UNIT NUMBER OF SUMMARY FILE +* 'CBLA2T.SNAP' NAME OF SNAPSHOT OUTPUT FILE +* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +* F LOGICAL FLAG, T TO STOP ON FAILURES. +* T LOGICAL FLAG, T TO TEST ERROR EXITS. +* 16.0 THRESHOLD VALUE OF TEST RATIO +* 6 NUMBER OF VALUES OF N +* 0 1 2 3 5 9 VALUES OF N +* 4 NUMBER OF VALUES OF K +* 0 1 2 4 VALUES OF K +* 4 NUMBER OF VALUES OF INCX AND INCY +* 1 2 -1 -2 VALUES OF INCX AND INCY +* 3 NUMBER OF VALUES OF ALPHA +* (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +* 3 NUMBER OF VALUES OF BETA +* (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +* ZGEMV T PUT F FOR NO TEST. SAME COLUMNS. +* ZGBMV T PUT F FOR NO TEST. SAME COLUMNS. +* ZHEMV T PUT F FOR NO TEST. SAME COLUMNS. +* ZHBMV T PUT F FOR NO TEST. SAME COLUMNS. +* ZHPMV T PUT F FOR NO TEST. SAME COLUMNS. +* ZTRMV T PUT F FOR NO TEST. SAME COLUMNS. +* ZTBMV T PUT F FOR NO TEST. SAME COLUMNS. +* ZTPMV T PUT F FOR NO TEST. SAME COLUMNS. +* ZTRSV T PUT F FOR NO TEST. SAME COLUMNS. +* ZTBSV T PUT F FOR NO TEST. SAME COLUMNS. +* ZTPSV T PUT F FOR NO TEST. SAME COLUMNS. +* ZGERC T PUT F FOR NO TEST. SAME COLUMNS. +* ZGERU T PUT F FOR NO TEST. SAME COLUMNS. +* ZHER T PUT F FOR NO TEST. SAME COLUMNS. +* ZHPR T PUT F FOR NO TEST. SAME COLUMNS. +* ZHER2 T PUT F FOR NO TEST. SAME COLUMNS. +* ZHPR2 T PUT F FOR NO TEST. SAME COLUMNS. +* +* See: +* +* Dongarra J. J., Du Croz J. J., Hammarling S. and Hanson R. J.. +* An extended set of Fortran Basic Linear Algebra Subprograms. +* +* Technical Memoranda Nos. 41 (revision 3) and 81, Mathematics +* and Computer Science Division, Argonne National Laboratory, +* 9700 South Cass Avenue, Argonne, Illinois 60439, US. 
+* +* Or +* +* NAG Technical Reports TR3/87 and TR4/87, Numerical Algorithms +* Group Ltd., NAG Central Office, 256 Banbury Road, Oxford +* OX2 7DE, UK, and Numerical Algorithms Group Inc., 1101 31st +* Street, Suite 100, Downers Grove, Illinois 60515-1263, USA. +* +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + INTEGER NIN + PARAMETER ( NIN = 5 ) + INTEGER NSUBS + PARAMETER ( NSUBS = 17 ) + COMPLEX*16 ZERO, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ ONE = ( 1.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO, RHALF, RONE + PARAMETER ( RZERO = 0.0D0, RHALF = 0.5D0, RONE = 1.0D0 ) + INTEGER NMAX, INCMAX + PARAMETER ( NMAX = 65, INCMAX = 2 ) + INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX + PARAMETER ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7, + $ NALMAX = 7, NBEMAX = 7 ) +* .. Local Scalars .. + DOUBLE PRECISION EPS, ERR, THRESH + INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NINC, NKB, + $ NOUT, NTRA + LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, + $ TSTERR + CHARACTER*1 TRANS + CHARACTER*6 SNAMET + CHARACTER*32 SNAPS, SUMMRY +* .. Local Arrays .. + COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), + $ ALF( NALMAX ), AS( NMAX*NMAX ), BET( NBEMAX ), + $ X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( 2*NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDMAX ), INC( NINMAX ), KB( NKBMAX ) + LOGICAL LTEST( NSUBS ) + CHARACTER*6 SNAMES( NSUBS ) +* .. External Functions .. + DOUBLE PRECISION DDIFF + LOGICAL LZE + EXTERNAL DDIFF, LZE +* .. External Subroutines .. + EXTERNAL ZCHK1, ZCHK2, ZCHK3, ZCHK4, ZCHK5, ZCHK6, + $ ZCHKE, ZMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK + CHARACTER*6 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR + COMMON /SRNAMC/SRNAMT +* .. Data statements .. + DATA SNAMES/'ZGEMV ', 'ZGBMV ', 'ZHEMV ', 'ZHBMV ', + $ 'ZHPMV ', 'ZTRMV ', 'ZTBMV ', 'ZTPMV ', + $ 'ZTRSV ', 'ZTBSV ', 'ZTPSV ', 'ZGERC ', + $ 'ZGERU ', 'ZHER ', 'ZHPR ', 'ZHER2 ', + $ 'ZHPR2 '/ +* .. Executable Statements .. +* +* Read name and unit number for summary output file and open file. +* + READ( NIN, FMT = * )SUMMRY + READ( NIN, FMT = * )NOUT + OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + NOUTC = NOUT +* +* Read name and unit number for snapshot output file and open file. +* + READ( NIN, FMT = * )SNAPS + READ( NIN, FMT = * )NTRA + TRACE = NTRA.GE.0 + IF( TRACE )THEN + OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + END IF +* Read the flag that directs rewinding of the snapshot file. + READ( NIN, FMT = * )REWI + REWI = REWI.AND.TRACE +* Read the flag that directs stopping on any failure. + READ( NIN, FMT = * )SFATAL +* Read the flag that indicates whether error exits are to be tested. + READ( NIN, FMT = * )TSTERR +* Read the threshold value of the test ratio + READ( NIN, FMT = * )THRESH +* +* Read and check the parameter values for the tests. 
+* +* Values of N + READ( NIN, FMT = * )NIDIM + IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN + WRITE( NOUT, FMT = 9997 )'N', NIDMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) + DO 10 I = 1, NIDIM + IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN + WRITE( NOUT, FMT = 9996 )NMAX + GO TO 230 + END IF + 10 CONTINUE +* Values of K + READ( NIN, FMT = * )NKB + IF( NKB.LT.1.OR.NKB.GT.NKBMAX )THEN + WRITE( NOUT, FMT = 9997 )'K', NKBMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( KB( I ), I = 1, NKB ) + DO 20 I = 1, NKB + IF( KB( I ).LT.0 )THEN + WRITE( NOUT, FMT = 9995 ) + GO TO 230 + END IF + 20 CONTINUE +* Values of INCX and INCY + READ( NIN, FMT = * )NINC + IF( NINC.LT.1.OR.NINC.GT.NINMAX )THEN + WRITE( NOUT, FMT = 9997 )'INCX AND INCY', NINMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( INC( I ), I = 1, NINC ) + DO 30 I = 1, NINC + IF( INC( I ).EQ.0.OR.ABS( INC( I ) ).GT.INCMAX )THEN + WRITE( NOUT, FMT = 9994 )INCMAX + GO TO 230 + END IF + 30 CONTINUE +* Values of ALPHA + READ( NIN, FMT = * )NALF + IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN + WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) +* Values of BETA + READ( NIN, FMT = * )NBET + IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN + WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX + GO TO 230 + END IF + READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) +* +* Report values of parameters. +* + WRITE( NOUT, FMT = 9993 ) + WRITE( NOUT, FMT = 9992 )( IDIM( I ), I = 1, NIDIM ) + WRITE( NOUT, FMT = 9991 )( KB( I ), I = 1, NKB ) + WRITE( NOUT, FMT = 9990 )( INC( I ), I = 1, NINC ) + WRITE( NOUT, FMT = 9989 )( ALF( I ), I = 1, NALF ) + WRITE( NOUT, FMT = 9988 )( BET( I ), I = 1, NBET ) + IF( .NOT.TSTERR )THEN + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9980 ) + END IF + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9999 )THRESH + WRITE( NOUT, FMT = * ) +* +* Read names of subroutines and flags which indicate +* whether they are to be tested. +* + DO 40 I = 1, NSUBS + LTEST( I ) = .FALSE. + 40 CONTINUE + 50 READ( NIN, FMT = 9984, END = 80 )SNAMET, LTESTT + DO 60 I = 1, NSUBS + IF( SNAMET.EQ.SNAMES( I ) ) + $ GO TO 70 + 60 CONTINUE + WRITE( NOUT, FMT = 9986 )SNAMET + STOP + 70 LTEST( I ) = LTESTT + GO TO 50 +* + 80 CONTINUE + CLOSE ( NIN ) +* +* Compute EPS (the machine precision). +* + EPS = RONE + 90 CONTINUE + IF( DDIFF( RONE + EPS, RONE ).EQ.RZERO ) + $ GO TO 100 + EPS = RHALF*EPS + GO TO 90 + 100 CONTINUE + EPS = EPS + EPS + WRITE( NOUT, FMT = 9998 )EPS +* +* Check the reliability of ZMVCH using exact data. +* + N = MIN( 32, NMAX ) + DO 120 J = 1, N + DO 110 I = 1, N + A( I, J ) = MAX( I - J + 1, 0 ) + 110 CONTINUE + X( J ) = J + Y( J ) = ZERO + 120 CONTINUE + DO 130 J = 1, N + YY( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 + 130 CONTINUE +* YY holds the exact result. On exit from ZMVCH YT holds +* the result computed by ZMVCH. + TRANS = 'N' + CALL ZMVCH( TRANS, N, N, ONE, A, NMAX, X, 1, ZERO, Y, 1, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LZE( YY, YT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR + STOP + END IF + TRANS = 'T' + CALL ZMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LZE( YY, YT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR + STOP + END IF +* +* Test each subroutine in turn. +* + DO 210 ISNUM = 1, NSUBS + WRITE( NOUT, FMT = * ) + IF( .NOT.LTEST( ISNUM ) )THEN +* Subprogram is not to be tested. 
+ WRITE( NOUT, FMT = 9983 )SNAMES( ISNUM ) + ELSE + SRNAMT = SNAMES( ISNUM ) +* Test error exits. + IF( TSTERR )THEN + CALL ZCHKE( ISNUM, SNAMES( ISNUM ), NOUT ) + WRITE( NOUT, FMT = * ) + END IF +* Test computations. + INFOT = 0 + OK = .TRUE. + FATAL = .FALSE. + GO TO ( 140, 140, 150, 150, 150, 160, 160, + $ 160, 160, 160, 160, 170, 170, 180, + $ 180, 190, 190 )ISNUM +* Test ZGEMV, 01, and ZGBMV, 02. + 140 CALL ZCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G ) + GO TO 200 +* Test ZHEMV, 03, ZHBMV, 04, and ZHPMV, 05. + 150 CALL ZCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, + $ NBET, BET, NINC, INC, NMAX, INCMAX, A, AA, AS, + $ X, XX, XS, Y, YY, YS, YT, G ) + GO TO 200 +* Test ZTRMV, 06, ZTBMV, 07, ZTPMV, 08, +* ZTRSV, 09, ZTBSV, 10, and ZTPSV, 11. + 160 CALL ZCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, Y, YY, YS, YT, G, Z ) + GO TO 200 +* Test ZGERC, 12, ZGERU, 13. + 170 CALL ZCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z ) + GO TO 200 +* Test ZHER, 14, and ZHPR, 15. + 180 CALL ZCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z ) + GO TO 200 +* Test ZHER2, 16, and ZHPR2, 17. + 190 CALL ZCHK6( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, + $ NMAX, INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, + $ YT, G, Z ) +* + 200 IF( FATAL.AND.SFATAL ) + $ GO TO 220 + END IF + 210 CONTINUE + WRITE( NOUT, FMT = 9982 ) + GO TO 240 +* + 220 CONTINUE + WRITE( NOUT, FMT = 9981 ) + GO TO 240 +* + 230 CONTINUE + WRITE( NOUT, FMT = 9987 ) +* + 240 CONTINUE + IF( TRACE ) + $ CLOSE ( NTRA ) + CLOSE ( NOUT ) + STOP +* + 9999 FORMAT( ' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', + $ 'S THAN', F8.2 ) + 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, D9.1 ) + 9997 FORMAT( ' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', + $ 'THAN ', I2 ) + 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) + 9995 FORMAT( ' VALUE OF K IS LESS THAN 0' ) + 9994 FORMAT( ' ABSOLUTE VALUE OF INCX OR INCY IS 0 OR GREATER THAN ', + $ I2 ) + 9993 FORMAT( ' TESTS OF THE COMPLEX*16 LEVEL 2 BLAS', //' THE F', + $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) + 9992 FORMAT( ' FOR N ', 9I6 ) + 9991 FORMAT( ' FOR K ', 7I6 ) + 9990 FORMAT( ' FOR INCX AND INCY ', 7I6 ) + 9989 FORMAT( ' FOR ALPHA ', + $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) + 9988 FORMAT( ' FOR BETA ', + $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) + 9987 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', + $ /' ******* TESTS ABANDONED *******' ) + 9986 FORMAT( ' SUBPROGRAM NAME ', A6, ' NOT RECOGNIZED', /' ******* T', + $ 'ESTS ABANDONED *******' ) + 9985 FORMAT( ' ERROR IN ZMVCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', + $ 'ATED WRONGLY.', /' ZMVCH WAS CALLED WITH TRANS = ', A1, + $ ' AND RETURNED SAME = ', L1, ' AND ERR = ', F12.3, '.', / + $ ' THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.' 
+ $ , /' ******* TESTS ABANDONED *******' ) + 9984 FORMAT( A6, L2 ) + 9983 FORMAT( 1X, A6, ' WAS NOT TESTED' ) + 9982 FORMAT( /' END OF TESTS' ) + 9981 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) + 9980 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) +* +* End of ZBLAT2. +* + END + SUBROUTINE ZCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, + $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, + $ XS, Y, YY, YS, YT, G ) +* +* Tests ZGEMV and ZGBMV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX*16 ZERO, HALF + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ HALF = ( 0.5D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, + $ NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), BET( NBET ), X( NMAX ), + $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), + $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, ALS, BETA, BLS, TRANSL + DOUBLE PRECISION ERR, ERRMAX + INTEGER I, IA, IB, IC, IKU, IM, IN, INCX, INCXS, INCY, + $ INCYS, IX, IY, KL, KLS, KU, KUS, LAA, LDA, + $ LDAS, LX, LY, M, ML, MS, N, NARGS, NC, ND, NK, + $ NL, NS + LOGICAL BANDED, FULL, NULL, RESET, SAME, TRAN + CHARACTER*1 TRANS, TRANSS + CHARACTER*3 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL ZGBMV, ZGEMV, ZMAKE, ZMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICH/'NTC'/ +* .. Executable Statements .. + FULL = SNAME( 3: 3 ).EQ.'E' + BANDED = SNAME( 3: 3 ).EQ.'B' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 11 + ELSE IF( BANDED )THEN + NARGS = 13 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 120 IN = 1, NIDIM + N = IDIM( IN ) + ND = N/2 + 1 +* + DO 110 IM = 1, 2 + IF( IM.EQ.1 ) + $ M = MAX( N - ND, 0 ) + IF( IM.EQ.2 ) + $ M = MIN( N + ND, NMAX ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IKU = 1, NK + IF( BANDED )THEN + KU = KB( IKU ) + KL = MAX( KU - 1, 0 ) + ELSE + KU = N - 1 + KL = M - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = KL + KU + 1 + ELSE + LDA = M + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + LAA = LDA*N + NULL = N.LE.0.OR.M.LE.0 +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL ZMAKE( SNAME( 2: 3 ), ' ', ' ', M, N, A, NMAX, AA, + $ LDA, KL, KU, RESET, TRANSL ) +* + DO 90 IC = 1, 3 + TRANS = ICH( IC: IC ) + TRAN = TRANS.EQ.'T'.OR.TRANS.EQ.'C' +* + IF( TRAN )THEN + ML = N + NL = M + ELSE + ML = M + NL = N + END IF +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*NL +* +* Generate the vector X. 
+* + TRANSL = HALF + CALL ZMAKE( 'GE', ' ', ' ', 1, NL, X, 1, XX, + $ ABS( INCX ), 0, NL - 1, RESET, TRANSL ) + IF( NL.GT.1 )THEN + X( NL/2 ) = ZERO + XX( 1 + ABS( INCX )*( NL/2 - 1 ) ) = ZERO + END IF +* + DO 70 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*ML +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL ZMAKE( 'GE', ' ', ' ', 1, ML, Y, 1, + $ YY, ABS( INCY ), 0, ML - 1, + $ RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + TRANSS = TRANS + MS = M + NS = N + KLS = KL + KUS = KU + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + BLS = BETA + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ TRANS, M, N, ALPHA, LDA, INCX, BETA, + $ INCY + IF( REWI ) + $ REWIND NTRA + CALL ZGEMV( TRANS, M, N, ALPHA, AA, + $ LDA, XX, INCX, BETA, YY, + $ INCY ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ TRANS, M, N, KL, KU, ALPHA, LDA, + $ INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL ZGBMV( TRANS, M, N, KL, KU, ALPHA, + $ AA, LDA, XX, INCX, BETA, + $ YY, INCY ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 130 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = TRANS.EQ.TRANSS + ISAME( 2 ) = MS.EQ.M + ISAME( 3 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 4 ) = ALS.EQ.ALPHA + ISAME( 5 ) = LZE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + ISAME( 7 ) = LZE( XS, XX, LX ) + ISAME( 8 ) = INCXS.EQ.INCX + ISAME( 9 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 10 ) = LZE( YS, YY, LY ) + ELSE + ISAME( 10 ) = LZERES( 'GE', ' ', 1, + $ ML, YS, YY, + $ ABS( INCY ) ) + END IF + ISAME( 11 ) = INCYS.EQ.INCY + ELSE IF( BANDED )THEN + ISAME( 4 ) = KLS.EQ.KL + ISAME( 5 ) = KUS.EQ.KU + ISAME( 6 ) = ALS.EQ.ALPHA + ISAME( 7 ) = LZE( AS, AA, LAA ) + ISAME( 8 ) = LDAS.EQ.LDA + ISAME( 9 ) = LZE( XS, XX, LX ) + ISAME( 10 ) = INCXS.EQ.INCX + ISAME( 11 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 12 ) = LZE( YS, YY, LY ) + ELSE + ISAME( 12 ) = LZERES( 'GE', ' ', 1, + $ ML, YS, YY, + $ ABS( INCY ) ) + END IF + ISAME( 13 ) = INCYS.EQ.INCY + END IF +* +* If data was incorrectly changed, report +* and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 130 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL ZMVCH( TRANS, M, N, ALPHA, A, + $ NMAX, X, INCX, BETA, Y, + $ INCY, YT, G, YY, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 130 + ELSE +* Avoid repeating tests with M.le.0 or +* N.le.0. + GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* +* Report result. 
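+* (A pass is reported only if the largest test ratio ERRMAX stayed
+* below THRESH, the threshold read from the data file; the supplied
+* zblat2.dat sets it to 16.0.)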
+* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 140 +* + 130 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, TRANS, M, N, ALPHA, LDA, + $ INCX, BETA, INCY + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, TRANS, M, N, KL, KU, + $ ALPHA, LDA, INCX, BETA, INCY + END IF +* + 140 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', 4( I3, ',' ), '(', + $ F4.1, ',', F4.1, '), A,', I3, ', X,', I2, ',(', F4.1, ',', + $ F4.1, '), Y,', I2, ') .' ) + 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', 2( I3, ',' ), '(', + $ F4.1, ',', F4.1, '), A,', I3, ', X,', I2, ',(', F4.1, ',', + $ F4.1, '), Y,', I2, ') .' ) + 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of ZCHK1. +* + END + SUBROUTINE ZCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NALF, ALF, NBET, + $ BET, NINC, INC, NMAX, INCMAX, A, AA, AS, X, XX, + $ XS, Y, YY, YS, YT, G ) +* +* Tests ZHEMV, ZHBMV and ZHPMV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX*16 ZERO, HALF + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ HALF = ( 0.5D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NALF, NBET, NIDIM, NINC, NKB, NMAX, + $ NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), BET( NBET ), X( NMAX ), + $ XS( NMAX*INCMAX ), XX( NMAX*INCMAX ), + $ Y( NMAX ), YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, ALS, BETA, BLS, TRANSL + DOUBLE PRECISION ERR, ERRMAX + INTEGER I, IA, IB, IC, IK, IN, INCX, INCXS, INCY, + $ INCYS, IX, IY, K, KS, LAA, LDA, LDAS, LX, LY, + $ N, NARGS, NC, NK, NS + LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME + CHARACTER*1 UPLO, UPLOS + CHARACTER*2 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL ZHBMV, ZHEMV, ZHPMV, ZMAKE, ZMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 3: 3 ).EQ.'E' + BANDED = SNAME( 3: 3 ).EQ.'B' + PACKED = SNAME( 3: 3 ).EQ.'P' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 10 + ELSE IF( BANDED )THEN + NARGS = 11 + ELSE IF( PACKED )THEN + NARGS = 9 + END IF +* + NC = 0 + RESET = .TRUE. 
+ ERRMAX = RZERO +* + DO 110 IN = 1, NIDIM + N = IDIM( IN ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IK = 1, NK + IF( BANDED )THEN + K = KB( IK ) + ELSE + K = N - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = K + 1 + ELSE + LDA = N + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF + NULL = N.LE.0 +* + DO 90 IC = 1, 2 + UPLO = ICH( IC: IC ) +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL ZMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, A, NMAX, AA, + $ LDA, K, K, RESET, TRANSL ) +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL ZMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, + $ ABS( INCX ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 70 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL ZMAKE( 'GE', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, + $ TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + UPLOS = UPLO + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + BLS = BETA + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ UPLO, N, ALPHA, LDA, INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL ZHEMV( UPLO, N, ALPHA, AA, LDA, XX, + $ INCX, BETA, YY, INCY ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ UPLO, N, K, ALPHA, LDA, INCX, BETA, + $ INCY + IF( REWI ) + $ REWIND NTRA + CALL ZHBMV( UPLO, N, K, ALPHA, AA, LDA, + $ XX, INCX, BETA, YY, INCY ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ UPLO, N, ALPHA, INCX, BETA, INCY + IF( REWI ) + $ REWIND NTRA + CALL ZHPMV( UPLO, N, ALPHA, AA, XX, INCX, + $ BETA, YY, INCY ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. 
+* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LZE( AS, AA, LAA ) + ISAME( 5 ) = LDAS.EQ.LDA + ISAME( 6 ) = LZE( XS, XX, LX ) + ISAME( 7 ) = INCXS.EQ.INCX + ISAME( 8 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 9 ) = LZE( YS, YY, LY ) + ELSE + ISAME( 9 ) = LZERES( 'GE', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 10 ) = INCYS.EQ.INCY + ELSE IF( BANDED )THEN + ISAME( 3 ) = KS.EQ.K + ISAME( 4 ) = ALS.EQ.ALPHA + ISAME( 5 ) = LZE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + ISAME( 7 ) = LZE( XS, XX, LX ) + ISAME( 8 ) = INCXS.EQ.INCX + ISAME( 9 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 10 ) = LZE( YS, YY, LY ) + ELSE + ISAME( 10 ) = LZERES( 'GE', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 11 ) = INCYS.EQ.INCY + ELSE IF( PACKED )THEN + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LZE( AS, AA, LAA ) + ISAME( 5 ) = LZE( XS, XX, LX ) + ISAME( 6 ) = INCXS.EQ.INCX + ISAME( 7 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 8 ) = LZE( YS, YY, LY ) + ELSE + ISAME( 8 ) = LZERES( 'GE', ' ', 1, N, + $ YS, YY, ABS( INCY ) ) + END IF + ISAME( 9 ) = INCYS.EQ.INCY + END IF +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL ZMVCH( 'N', N, N, ALPHA, A, NMAX, X, + $ INCX, BETA, Y, INCY, YT, G, + $ YY, EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 120 + ELSE +* Avoid repeating tests with N.le.0 + GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, N, ALPHA, LDA, INCX, + $ BETA, INCY + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, N, K, ALPHA, LDA, + $ INCX, BETA, INCY + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, UPLO, N, ALPHA, INCX, + $ BETA, INCY + END IF +* + 130 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',(', F4.1, ',', + $ F4.1, '), AP, X,', I2, ',(', F4.1, ',', F4.1, '), Y,', I2, + $ ') .' ) + 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', 2( I3, ',' ), '(', + $ F4.1, ',', F4.1, '), A,', I3, ', X,', I2, ',(', F4.1, ',', + $ F4.1, '), Y,', I2, ') .' ) + 9993 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',(', F4.1, ',', + $ F4.1, '), A,', I3, ', X,', I2, ',(', F4.1, ',', F4.1, '), ', + $ 'Y,', I2, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of ZCHK2. 
+* + END + SUBROUTINE ZCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NKB, KB, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, XT, G, Z ) +* +* Tests ZTRMV, ZTBMV, ZTPMV, ZTRSV, ZTBSV and ZTPSV. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX*16 ZERO, HALF, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ HALF = ( 0.5D0, 0.0D0 ), + $ ONE = ( 1.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NIDIM, NINC, NKB, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), + $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), + $ XT( NMAX ), XX( NMAX*INCMAX ), Z( NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ), KB( NKB ) +* .. Local Scalars .. + COMPLEX*16 TRANSL + DOUBLE PRECISION ERR, ERRMAX + INTEGER I, ICD, ICT, ICU, IK, IN, INCX, INCXS, IX, K, + $ KS, LAA, LDA, LDAS, LX, N, NARGS, NC, NK, NS + LOGICAL BANDED, FULL, NULL, PACKED, RESET, SAME + CHARACTER*1 DIAG, DIAGS, TRANS, TRANSS, UPLO, UPLOS + CHARACTER*2 ICHD, ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL ZMAKE, ZMVCH, ZTBMV, ZTBSV, ZTPMV, ZTPSV, + $ ZTRMV, ZTRSV +* .. Intrinsic Functions .. + INTRINSIC ABS, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/ +* .. Executable Statements .. + FULL = SNAME( 3: 3 ).EQ.'R' + BANDED = SNAME( 3: 3 ).EQ.'B' + PACKED = SNAME( 3: 3 ).EQ.'P' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 8 + ELSE IF( BANDED )THEN + NARGS = 9 + ELSE IF( PACKED )THEN + NARGS = 7 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* Set up zero vector for ZMVCH. + DO 10 I = 1, NMAX + Z( I ) = ZERO + 10 CONTINUE +* + DO 110 IN = 1, NIDIM + N = IDIM( IN ) +* + IF( BANDED )THEN + NK = NKB + ELSE + NK = 1 + END IF + DO 100 IK = 1, NK + IF( BANDED )THEN + K = KB( IK ) + ELSE + K = N - 1 + END IF +* Set LDA to 1 more than minimum value if room. + IF( BANDED )THEN + LDA = K + 1 + ELSE + LDA = N + END IF + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF + NULL = N.LE.0 +* + DO 90 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* + DO 80 ICT = 1, 3 + TRANS = ICHT( ICT: ICT ) +* + DO 70 ICD = 1, 2 + DIAG = ICHD( ICD: ICD ) +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL ZMAKE( SNAME( 2: 3 ), UPLO, DIAG, N, N, A, + $ NMAX, AA, LDA, K, K, RESET, TRANSL ) +* + DO 60 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL ZMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, + $ ABS( INCX ), 0, N - 1, RESET, + $ TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. 
+* + UPLOS = UPLO + TRANSS = TRANS + DIAGS = DIAG + NS = N + KS = K + DO 20 I = 1, LAA + AS( I ) = AA( I ) + 20 CONTINUE + LDAS = LDA + DO 30 I = 1, LX + XS( I ) = XX( I ) + 30 CONTINUE + INCXS = INCX +* +* Call the subroutine. +* + IF( SNAME( 4: 5 ).EQ.'MV' )THEN + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ UPLO, TRANS, DIAG, N, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL ZTRMV( UPLO, TRANS, DIAG, N, AA, LDA, + $ XX, INCX ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ UPLO, TRANS, DIAG, N, K, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL ZTBMV( UPLO, TRANS, DIAG, N, K, AA, + $ LDA, XX, INCX ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ UPLO, TRANS, DIAG, N, INCX + IF( REWI ) + $ REWIND NTRA + CALL ZTPMV( UPLO, TRANS, DIAG, N, AA, XX, + $ INCX ) + END IF + ELSE IF( SNAME( 4: 5 ).EQ.'SV' )THEN + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, + $ UPLO, TRANS, DIAG, N, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL ZTRSV( UPLO, TRANS, DIAG, N, AA, LDA, + $ XX, INCX ) + ELSE IF( BANDED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, + $ UPLO, TRANS, DIAG, N, K, LDA, INCX + IF( REWI ) + $ REWIND NTRA + CALL ZTBSV( UPLO, TRANS, DIAG, N, K, AA, + $ LDA, XX, INCX ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ UPLO, TRANS, DIAG, N, INCX + IF( REWI ) + $ REWIND NTRA + CALL ZTPSV( UPLO, TRANS, DIAG, N, AA, XX, + $ INCX ) + END IF + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = TRANS.EQ.TRANSS + ISAME( 3 ) = DIAG.EQ.DIAGS + ISAME( 4 ) = NS.EQ.N + IF( FULL )THEN + ISAME( 5 ) = LZE( AS, AA, LAA ) + ISAME( 6 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 7 ) = LZE( XS, XX, LX ) + ELSE + ISAME( 7 ) = LZERES( 'GE', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 8 ) = INCXS.EQ.INCX + ELSE IF( BANDED )THEN + ISAME( 5 ) = KS.EQ.K + ISAME( 6 ) = LZE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 8 ) = LZE( XS, XX, LX ) + ELSE + ISAME( 8 ) = LZERES( 'GE', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 9 ) = INCXS.EQ.INCX + ELSE IF( PACKED )THEN + ISAME( 5 ) = LZE( AS, AA, LAA ) + IF( NULL )THEN + ISAME( 6 ) = LZE( XS, XX, LX ) + ELSE + ISAME( 6 ) = LZERES( 'GE', ' ', 1, N, XS, + $ XX, ABS( INCX ) ) + END IF + ISAME( 7 ) = INCXS.EQ.INCX + END IF +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN + IF( SNAME( 4: 5 ).EQ.'MV' )THEN +* +* Check the result. +* + CALL ZMVCH( TRANS, N, N, ONE, A, NMAX, X, + $ INCX, ZERO, Z, INCX, XT, G, + $ XX, EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + ELSE IF( SNAME( 4: 5 ).EQ.'SV' )THEN +* +* Compute approximation to original vector. +* + DO 50 I = 1, N + Z( I ) = XX( 1 + ( I - 1 )* + $ ABS( INCX ) ) + XX( 1 + ( I - 1 )*ABS( INCX ) ) + $ = X( I ) + 50 CONTINUE + CALL ZMVCH( TRANS, N, N, ONE, A, NMAX, Z, + $ INCX, ZERO, X, INCX, XT, G, + $ XX, EPS, ERR, FATAL, NOUT, + $ .FALSE. ) + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 120 + ELSE +* Avoid repeating tests with N.le.0. 
+ GO TO 110 + END IF +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, TRANS, DIAG, N, LDA, + $ INCX + ELSE IF( BANDED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, DIAG, N, K, + $ LDA, INCX + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9995 )NC, SNAME, UPLO, TRANS, DIAG, N, INCX + END IF +* + 130 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A6, '(', 3( '''', A1, ''',' ), I3, ', AP, ', + $ 'X,', I2, ') .' ) + 9994 FORMAT( 1X, I6, ': ', A6, '(', 3( '''', A1, ''',' ), 2( I3, ',' ), + $ ' A,', I3, ', X,', I2, ') .' ) + 9993 FORMAT( 1X, I6, ': ', A6, '(', 3( '''', A1, ''',' ), I3, ', A,', + $ I3, ', X,', I2, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of ZCHK3. +* + END + SUBROUTINE ZCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z ) +* +* Tests ZGERC and ZGERU. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX*16 ZERO, HALF, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ HALF = ( 0.5D0, 0.0D0 ), + $ ONE = ( 1.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, ALS, TRANSL + DOUBLE PRECISION ERR, ERRMAX + INTEGER I, IA, IM, IN, INCX, INCXS, INCY, INCYS, IX, + $ IY, J, LAA, LDA, LDAS, LX, LY, M, MS, N, NARGS, + $ NC, ND, NS + LOGICAL CONJ, NULL, RESET, SAME +* .. Local Arrays .. + COMPLEX*16 W( 1 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL ZGERC, ZGERU, ZMAKE, ZMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, DCONJG, MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Executable Statements .. + CONJ = SNAME( 5: 5 ).EQ.'C' +* Define the number of arguments. + NARGS = 9 +* + NC = 0 + RESET = .TRUE. 
+ ERRMAX = RZERO +* + DO 120 IN = 1, NIDIM + N = IDIM( IN ) + ND = N/2 + 1 +* + DO 110 IM = 1, 2 + IF( IM.EQ.1 ) + $ M = MAX( N - ND, 0 ) + IF( IM.EQ.2 ) + $ M = MIN( N + ND, NMAX ) +* +* Set LDA to 1 more than minimum value if room. + LDA = M + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 110 + LAA = LDA*N + NULL = N.LE.0.OR.M.LE.0 +* + DO 100 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*M +* +* Generate the vector X. +* + TRANSL = HALF + CALL ZMAKE( 'GE', ' ', ' ', 1, M, X, 1, XX, ABS( INCX ), + $ 0, M - 1, RESET, TRANSL ) + IF( M.GT.1 )THEN + X( M/2 ) = ZERO + XX( 1 + ABS( INCX )*( M/2 - 1 ) ) = ZERO + END IF +* + DO 90 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL ZMAKE( 'GE', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + Y( N/2 ) = ZERO + YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 80 IA = 1, NALF + ALPHA = ALF( IA ) +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL ZMAKE( SNAME( 2: 3 ), ' ', ' ', M, N, A, NMAX, + $ AA, LDA, M - 1, N - 1, RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + MS = M + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, M, N, + $ ALPHA, INCX, INCY, LDA + IF( CONJ )THEN + IF( REWI ) + $ REWIND NTRA + CALL ZGERC( M, N, ALPHA, XX, INCX, YY, INCY, AA, + $ LDA ) + ELSE + IF( REWI ) + $ REWIND NTRA + CALL ZGERU( M, N, ALPHA, XX, INCX, YY, INCY, AA, + $ LDA ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9993 ) + FATAL = .TRUE. + GO TO 140 + END IF +* +* See what data changed inside subroutine. +* + ISAME( 1 ) = MS.EQ.M + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LZE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + ISAME( 6 ) = LZE( YS, YY, LY ) + ISAME( 7 ) = INCYS.EQ.INCY + IF( NULL )THEN + ISAME( 8 ) = LZE( AS, AA, LAA ) + ELSE + ISAME( 8 ) = LZERES( 'GE', ' ', M, N, AS, AA, + $ LDA ) + END IF + ISAME( 9 ) = LDAS.EQ.LDA +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 140 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( INCX.GT.0 )THEN + DO 50 I = 1, M + Z( I ) = X( I ) + 50 CONTINUE + ELSE + DO 60 I = 1, M + Z( I ) = X( M - I + 1 ) + 60 CONTINUE + END IF + DO 70 J = 1, N + IF( INCY.GT.0 )THEN + W( 1 ) = Y( J ) + ELSE + W( 1 ) = Y( N - J + 1 ) + END IF + IF( CONJ ) + $ W( 1 ) = DCONJG( W( 1 ) ) + CALL ZMVCH( 'N', M, 1, ALPHA, Z, NMAX, W, 1, + $ ONE, A( 1, J ), 1, YT, G, + $ AA( 1 + ( J - 1 )*LDA ), EPS, + $ ERR, FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 130 + 70 CONTINUE + ELSE +* Avoid repeating tests with M.le.0 or N.le.0. + GO TO 110 + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 150 +* + 130 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 140 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9994 )NC, SNAME, M, N, ALPHA, INCX, INCY, LDA +* + 150 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A6, '(', 2( I3, ',' ), '(', F4.1, ',', F4.1, + $ '), X,', I2, ', Y,', I2, ', A,', I3, ') ', + $ ' .' ) + 9993 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of ZCHK4. +* + END + SUBROUTINE ZCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z ) +* +* Tests ZHER and ZHPR. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX*16 ZERO, HALF, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ HALF = ( 0.5D0, 0.0D0 ), + $ ONE = ( 1.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, TRANSL + DOUBLE PRECISION ERR, ERRMAX, RALPHA, RALS + INTEGER I, IA, IC, IN, INCX, INCXS, IX, J, JA, JJ, LAA, + $ LDA, LDAS, LJ, LX, N, NARGS, NC, NS + LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER + CHARACTER*1 UPLO, UPLOS + CHARACTER*2 ICH +* .. Local Arrays .. + COMPLEX*16 W( 1 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL ZHER, ZHPR, ZMAKE, ZMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, DBLE, DCMPLX, DCONJG, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 3: 3 ).EQ.'E' + PACKED = SNAME( 3: 3 ).EQ.'P' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 7 + ELSE IF( PACKED )THEN + NARGS = 6 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDA to 1 more than minimum value if room. + LDA = N + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 100 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF +* + DO 90 IC = 1, 2 + UPLO = ICH( IC: IC ) + UPPER = UPLO.EQ.'U' +* + DO 80 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. 
+* + TRANSL = HALF + CALL ZMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), + $ 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 70 IA = 1, NALF + RALPHA = DBLE( ALF( IA ) ) + ALPHA = DCMPLX( RALPHA, RZERO ) + NULL = N.LE.0.OR.RALPHA.EQ.RZERO +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL ZMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, A, NMAX, + $ AA, LDA, N - 1, N - 1, RESET, TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + NS = N + RALS = RALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO, N, + $ RALPHA, INCX, LDA + IF( REWI ) + $ REWIND NTRA + CALL ZHER( UPLO, N, RALPHA, XX, INCX, AA, LDA ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, N, + $ RALPHA, INCX + IF( REWI ) + $ REWIND NTRA + CALL ZHPR( UPLO, N, RALPHA, XX, INCX, AA ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = RALS.EQ.RALPHA + ISAME( 4 ) = LZE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + IF( NULL )THEN + ISAME( 6 ) = LZE( AS, AA, LAA ) + ELSE + ISAME( 6 ) = LZERES( SNAME( 2: 3 ), UPLO, N, N, AS, + $ AA, LDA ) + END IF + IF( .NOT.PACKED )THEN + ISAME( 7 ) = LDAS.EQ.LDA + END IF +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 30 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 30 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( INCX.GT.0 )THEN + DO 40 I = 1, N + Z( I ) = X( I ) + 40 CONTINUE + ELSE + DO 50 I = 1, N + Z( I ) = X( N - I + 1 ) + 50 CONTINUE + END IF + JA = 1 + DO 60 J = 1, N + W( 1 ) = DCONJG( Z( J ) ) + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + CALL ZMVCH( 'N', LJ, 1, ALPHA, Z( JJ ), LJ, W, + $ 1, ONE, A( JJ, J ), 1, YT, G, + $ AA( JA ), EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + IF( FULL )THEN + IF( UPPER )THEN + JA = JA + LDA + ELSE + JA = JA + LDA + 1 + END IF + ELSE + JA = JA + LJ + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 110 + 60 CONTINUE + ELSE +* Avoid repeating tests if N.le.0. + IF( N.LE.0 ) + $ GO TO 100 + END IF +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. 
+* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 110 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, N, RALPHA, INCX, LDA + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, N, RALPHA, INCX + END IF +* + 130 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', X,', + $ I2, ', AP) .' ) + 9993 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',', F4.1, ', X,', + $ I2, ', A,', I3, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of ZCHK5. +* + END + SUBROUTINE ZCHK6( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NINC, INC, NMAX, + $ INCMAX, A, AA, AS, X, XX, XS, Y, YY, YS, YT, G, + $ Z ) +* +* Tests ZHER2 and ZHPR2. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX*16 ZERO, HALF, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ HALF = ( 0.5D0, 0.0D0 ), + $ ONE = ( 1.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER INCMAX, NALF, NIDIM, NINC, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), X( NMAX ), XS( NMAX*INCMAX ), + $ XX( NMAX*INCMAX ), Y( NMAX ), + $ YS( NMAX*INCMAX ), YT( NMAX ), + $ YY( NMAX*INCMAX ), Z( NMAX, 2 ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ), INC( NINC ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, ALS, TRANSL + DOUBLE PRECISION ERR, ERRMAX + INTEGER I, IA, IC, IN, INCX, INCXS, INCY, INCYS, IX, + $ IY, J, JA, JJ, LAA, LDA, LDAS, LJ, LX, LY, N, + $ NARGS, NC, NS + LOGICAL FULL, NULL, PACKED, RESET, SAME, UPPER + CHARACTER*1 UPLO, UPLOS + CHARACTER*2 ICH +* .. Local Arrays .. + COMPLEX*16 W( 2 ) + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL ZHER2, ZHPR2, ZMAKE, ZMVCH +* .. Intrinsic Functions .. + INTRINSIC ABS, DCONJG, MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICH/'UL'/ +* .. Executable Statements .. + FULL = SNAME( 3: 3 ).EQ.'E' + PACKED = SNAME( 3: 3 ).EQ.'P' +* Define the number of arguments. + IF( FULL )THEN + NARGS = 9 + ELSE IF( PACKED )THEN + NARGS = 8 + END IF +* + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 140 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDA to 1 more than minimum value if room. + LDA = N + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. 
+ IF( LDA.GT.NMAX ) + $ GO TO 140 + IF( PACKED )THEN + LAA = ( N*( N + 1 ) )/2 + ELSE + LAA = LDA*N + END IF +* + DO 130 IC = 1, 2 + UPLO = ICH( IC: IC ) + UPPER = UPLO.EQ.'U' +* + DO 120 IX = 1, NINC + INCX = INC( IX ) + LX = ABS( INCX )*N +* +* Generate the vector X. +* + TRANSL = HALF + CALL ZMAKE( 'GE', ' ', ' ', 1, N, X, 1, XX, ABS( INCX ), + $ 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + X( N/2 ) = ZERO + XX( 1 + ABS( INCX )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 110 IY = 1, NINC + INCY = INC( IY ) + LY = ABS( INCY )*N +* +* Generate the vector Y. +* + TRANSL = ZERO + CALL ZMAKE( 'GE', ' ', ' ', 1, N, Y, 1, YY, + $ ABS( INCY ), 0, N - 1, RESET, TRANSL ) + IF( N.GT.1 )THEN + Y( N/2 ) = ZERO + YY( 1 + ABS( INCY )*( N/2 - 1 ) ) = ZERO + END IF +* + DO 100 IA = 1, NALF + ALPHA = ALF( IA ) + NULL = N.LE.0.OR.ALPHA.EQ.ZERO +* +* Generate the matrix A. +* + TRANSL = ZERO + CALL ZMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, A, + $ NMAX, AA, LDA, N - 1, N - 1, RESET, + $ TRANSL ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LX + XS( I ) = XX( I ) + 20 CONTINUE + INCXS = INCX + DO 30 I = 1, LY + YS( I ) = YY( I ) + 30 CONTINUE + INCYS = INCY +* +* Call the subroutine. +* + IF( FULL )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO, N, + $ ALPHA, INCX, INCY, LDA + IF( REWI ) + $ REWIND NTRA + CALL ZHER2( UPLO, N, ALPHA, XX, INCX, YY, INCY, + $ AA, LDA ) + ELSE IF( PACKED )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, N, + $ ALPHA, INCX, INCY + IF( REWI ) + $ REWIND NTRA + CALL ZHPR2( UPLO, N, ALPHA, XX, INCX, YY, INCY, + $ AA ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 160 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLO.EQ.UPLOS + ISAME( 2 ) = NS.EQ.N + ISAME( 3 ) = ALS.EQ.ALPHA + ISAME( 4 ) = LZE( XS, XX, LX ) + ISAME( 5 ) = INCXS.EQ.INCX + ISAME( 6 ) = LZE( YS, YY, LY ) + ISAME( 7 ) = INCYS.EQ.INCY + IF( NULL )THEN + ISAME( 8 ) = LZE( AS, AA, LAA ) + ELSE + ISAME( 8 ) = LZERES( SNAME( 2: 3 ), UPLO, N, N, + $ AS, AA, LDA ) + END IF + IF( .NOT.PACKED )THEN + ISAME( 9 ) = LDAS.EQ.LDA + END IF +* +* If data was incorrectly changed, report and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 160 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( INCX.GT.0 )THEN + DO 50 I = 1, N + Z( I, 1 ) = X( I ) + 50 CONTINUE + ELSE + DO 60 I = 1, N + Z( I, 1 ) = X( N - I + 1 ) + 60 CONTINUE + END IF + IF( INCY.GT.0 )THEN + DO 70 I = 1, N + Z( I, 2 ) = Y( I ) + 70 CONTINUE + ELSE + DO 80 I = 1, N + Z( I, 2 ) = Y( N - I + 1 ) + 80 CONTINUE + END IF + JA = 1 + DO 90 J = 1, N + W( 1 ) = ALPHA*DCONJG( Z( J, 2 ) ) + W( 2 ) = DCONJG( ALPHA )*DCONJG( Z( J, 1 ) ) + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + CALL ZMVCH( 'N', LJ, 2, ONE, Z( JJ, 1 ), + $ NMAX, W, 1, ONE, A( JJ, J ), 1, + $ YT, G, AA( JA ), EPS, ERR, FATAL, + $ NOUT, .TRUE. ) + IF( FULL )THEN + IF( UPPER )THEN + JA = JA + LDA + ELSE + JA = JA + LDA + 1 + END IF + ELSE + JA = JA + LJ + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and return. + IF( FATAL ) + $ GO TO 150 + 90 CONTINUE + ELSE +* Avoid repeating tests with N.le.0. 
+ IF( N.LE.0 ) + $ GO TO 140 + END IF +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* + 140 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 170 +* + 150 CONTINUE + WRITE( NOUT, FMT = 9995 )J +* + 160 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( FULL )THEN + WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, N, ALPHA, INCX, + $ INCY, LDA + ELSE IF( PACKED )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, N, ALPHA, INCX, INCY + END IF +* + 170 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',(', F4.1, ',', + $ F4.1, '), X,', I2, ', Y,', I2, ', AP) ', + $ ' .' ) + 9993 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',', I3, ',(', F4.1, ',', + $ F4.1, '), X,', I2, ', Y,', I2, ', A,', I3, ') ', + $ ' .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of ZCHK6. +* + END + SUBROUTINE ZCHKE( ISNUM, SRNAMT, NOUT ) +* +* Tests the error exits from the Level 2 Blas. +* Requires a special version of the error-handling routine XERBLA. +* ALPHA, RALPHA, BETA, A, X and Y should not need to be defined. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER ISNUM, NOUT + CHARACTER*6 SRNAMT +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Local Scalars .. + COMPLEX*16 ALPHA, BETA + DOUBLE PRECISION RALPHA +* .. Local Arrays .. + COMPLEX*16 A( 1, 1 ), X( 1 ), Y( 1 ) +* .. External Subroutines .. + EXTERNAL CHKXER, ZGBMV, ZGEMV, ZGERC, ZGERU, ZHBMV, + $ ZHEMV, ZHER, ZHER2, ZHPMV, ZHPR, ZHPR2, ZTBMV, + $ ZTBSV, ZTPMV, ZTPSV, ZTRMV, ZTRSV +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Executable Statements .. +* OK is set to .FALSE. by the special version of XERBLA or by CHKXER +* if anything is wrong. + OK = .TRUE. +* LERR is set to .TRUE. by the special version of XERBLA each time +* it is called, and is then tested and re-set by CHKXER. + LERR = .FALSE. 
+ GO TO ( 10, 20, 30, 40, 50, 60, 70, 80, + $ 90, 100, 110, 120, 130, 140, 150, 160, + $ 170 )ISNUM + 10 INFOT = 1 + CALL ZGEMV( '/', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZGEMV( 'N', -1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZGEMV( 'N', 0, -1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZGEMV( 'N', 2, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZGEMV( 'N', 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZGEMV( 'N', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 20 INFOT = 1 + CALL ZGBMV( '/', 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZGBMV( 'N', -1, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZGBMV( 'N', 0, -1, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZGBMV( 'N', 0, 0, -1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZGBMV( 'N', 2, 0, 0, -1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZGBMV( 'N', 0, 0, 1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZGBMV( 'N', 0, 0, 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL ZGBMV( 'N', 0, 0, 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 30 INFOT = 1 + CALL ZHEMV( '/', 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZHEMV( 'U', -1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZHEMV( 'U', 2, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZHEMV( 'U', 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZHEMV( 'U', 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 40 INFOT = 1 + CALL ZHBMV( '/', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZHBMV( 'U', -1, 0, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZHBMV( 'U', 0, -1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZHBMV( 'U', 0, 1, ALPHA, A, 1, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZHBMV( 'U', 0, 0, ALPHA, A, 1, X, 0, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZHBMV( 'U', 0, 0, ALPHA, A, 1, X, 1, BETA, Y, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 50 INFOT = 1 + CALL ZHPMV( '/', 0, ALPHA, A, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZHPMV( 'U', -1, ALPHA, A, X, 1, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZHPMV( 'U', 0, ALPHA, A, X, 0, BETA, Y, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZHPMV( 'U', 0, ALPHA, A, X, 1, BETA, Y, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + 
GO TO 180 + 60 INFOT = 1 + CALL ZTRMV( '/', 'N', 'N', 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZTRMV( 'U', '/', 'N', 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZTRMV( 'U', 'N', '/', 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZTRMV( 'U', 'N', 'N', -1, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRMV( 'U', 'N', 'N', 2, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZTRMV( 'U', 'N', 'N', 0, A, 1, X, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 70 INFOT = 1 + CALL ZTBMV( '/', 'N', 'N', 0, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZTBMV( 'U', '/', 'N', 0, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZTBMV( 'U', 'N', '/', 0, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZTBMV( 'U', 'N', 'N', -1, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTBMV( 'U', 'N', 'N', 0, -1, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZTBMV( 'U', 'N', 'N', 0, 1, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTBMV( 'U', 'N', 'N', 0, 0, A, 1, X, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 80 INFOT = 1 + CALL ZTPMV( '/', 'N', 'N', 0, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZTPMV( 'U', '/', 'N', 0, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZTPMV( 'U', 'N', '/', 0, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZTPMV( 'U', 'N', 'N', -1, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZTPMV( 'U', 'N', 'N', 0, A, X, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 90 INFOT = 1 + CALL ZTRSV( '/', 'N', 'N', 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZTRSV( 'U', '/', 'N', 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZTRSV( 'U', 'N', '/', 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZTRSV( 'U', 'N', 'N', -1, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRSV( 'U', 'N', 'N', 2, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZTRSV( 'U', 'N', 'N', 0, A, 1, X, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 100 INFOT = 1 + CALL ZTBSV( '/', 'N', 'N', 0, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZTBSV( 'U', '/', 'N', 0, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZTBSV( 'U', 'N', '/', 0, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZTBSV( 'U', 'N', 'N', -1, 0, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTBSV( 'U', 'N', 'N', 0, -1, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZTBSV( 'U', 'N', 'N', 0, 1, A, 1, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTBSV( 'U', 'N', 'N', 0, 0, A, 1, X, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 110 INFOT = 1 + CALL ZTPSV( '/', 'N', 'N', 0, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZTPSV( 'U', '/', 'N', 0, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + 
INFOT = 3 + CALL ZTPSV( 'U', 'N', '/', 0, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZTPSV( 'U', 'N', 'N', -1, A, X, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZTPSV( 'U', 'N', 'N', 0, A, X, 0 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 120 INFOT = 1 + CALL ZGERC( -1, 0, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZGERC( 0, -1, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZGERC( 0, 0, ALPHA, X, 0, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZGERC( 0, 0, ALPHA, X, 1, Y, 0, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZGERC( 2, 0, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 130 INFOT = 1 + CALL ZGERU( -1, 0, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZGERU( 0, -1, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZGERU( 0, 0, ALPHA, X, 0, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZGERU( 0, 0, ALPHA, X, 1, Y, 0, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZGERU( 2, 0, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 140 INFOT = 1 + CALL ZHER( '/', 0, RALPHA, X, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZHER( 'U', -1, RALPHA, X, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZHER( 'U', 0, RALPHA, X, 0, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZHER( 'U', 2, RALPHA, X, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 150 INFOT = 1 + CALL ZHPR( '/', 0, RALPHA, X, 1, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZHPR( 'U', -1, RALPHA, X, 1, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZHPR( 'U', 0, RALPHA, X, 0, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 160 INFOT = 1 + CALL ZHER2( '/', 0, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZHER2( 'U', -1, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZHER2( 'U', 0, ALPHA, X, 0, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZHER2( 'U', 0, ALPHA, X, 1, Y, 0, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZHER2( 'U', 2, ALPHA, X, 1, Y, 1, A, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 180 + 170 INFOT = 1 + CALL ZHPR2( '/', 0, ALPHA, X, 1, Y, 1, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZHPR2( 'U', -1, ALPHA, X, 1, Y, 1, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZHPR2( 'U', 0, ALPHA, X, 0, Y, 1, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZHPR2( 'U', 0, ALPHA, X, 1, Y, 0, A ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) +* + 180 IF( OK )THEN + WRITE( NOUT, FMT = 9999 )SRNAMT + ELSE + WRITE( NOUT, FMT = 9998 )SRNAMT + END IF + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE TESTS OF ERROR-EXITS' ) + 9998 FORMAT( ' ******* ', A6, ' FAILED THE TESTS OF ERROR-EXITS *****', + $ '**' ) +* +* End of ZCHKE. 
+* + END + SUBROUTINE ZMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, KL, + $ KU, RESET, TRANSL ) +* +* Generates values for an M by N matrix A within the bandwidth +* defined by KL and KU. +* Stores the values in the array AA in the data structure required +* by the routine, with unwanted elements set to rogue value. +* +* TYPE is 'GE', 'GB', 'HE', 'HB', 'HP', 'TR', 'TB' OR 'TP'. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX*16 ZERO, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ ONE = ( 1.0D0, 0.0D0 ) ) + COMPLEX*16 ROGUE + PARAMETER ( ROGUE = ( -1.0D10, 1.0D10 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) + DOUBLE PRECISION RROGUE + PARAMETER ( RROGUE = -1.0D10 ) +* .. Scalar Arguments .. + COMPLEX*16 TRANSL + INTEGER KL, KU, LDA, M, N, NMAX + LOGICAL RESET + CHARACTER*1 DIAG, UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + COMPLEX*16 A( NMAX, * ), AA( * ) +* .. Local Scalars .. + INTEGER I, I1, I2, I3, IBEG, IEND, IOFF, J, JJ, KK + LOGICAL GEN, LOWER, SYM, TRI, UNIT, UPPER +* .. External Functions .. + COMPLEX*16 ZBEG + EXTERNAL ZBEG +* .. Intrinsic Functions .. + INTRINSIC DBLE, DCMPLX, DCONJG, MAX, MIN +* .. Executable Statements .. + GEN = TYPE( 1: 1 ).EQ.'G' + SYM = TYPE( 1: 1 ).EQ.'H' + TRI = TYPE( 1: 1 ).EQ.'T' + UPPER = ( SYM.OR.TRI ).AND.UPLO.EQ.'U' + LOWER = ( SYM.OR.TRI ).AND.UPLO.EQ.'L' + UNIT = TRI.AND.DIAG.EQ.'U' +* +* Generate data in array A. +* + DO 20 J = 1, N + DO 10 I = 1, M + IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) + $ THEN + IF( ( I.LE.J.AND.J - I.LE.KU ).OR. + $ ( I.GE.J.AND.I - J.LE.KL ) )THEN + A( I, J ) = ZBEG( RESET ) + TRANSL + ELSE + A( I, J ) = ZERO + END IF + IF( I.NE.J )THEN + IF( SYM )THEN + A( J, I ) = DCONJG( A( I, J ) ) + ELSE IF( TRI )THEN + A( J, I ) = ZERO + END IF + END IF + END IF + 10 CONTINUE + IF( SYM ) + $ A( J, J ) = DCMPLX( DBLE( A( J, J ) ), RZERO ) + IF( TRI ) + $ A( J, J ) = A( J, J ) + ONE + IF( UNIT ) + $ A( J, J ) = ONE + 20 CONTINUE +* +* Store elements in array AS in data structure required by routine. 
+* + IF( TYPE.EQ.'GE' )THEN + DO 50 J = 1, N + DO 30 I = 1, M + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 30 CONTINUE + DO 40 I = M + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 40 CONTINUE + 50 CONTINUE + ELSE IF( TYPE.EQ.'GB' )THEN + DO 90 J = 1, N + DO 60 I1 = 1, KU + 1 - J + AA( I1 + ( J - 1 )*LDA ) = ROGUE + 60 CONTINUE + DO 70 I2 = I1, MIN( KL + KU + 1, KU + 1 + M - J ) + AA( I2 + ( J - 1 )*LDA ) = A( I2 + J - KU - 1, J ) + 70 CONTINUE + DO 80 I3 = I2, LDA + AA( I3 + ( J - 1 )*LDA ) = ROGUE + 80 CONTINUE + 90 CONTINUE + ELSE IF( TYPE.EQ.'HE'.OR.TYPE.EQ.'TR' )THEN + DO 130 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IF( UNIT )THEN + IEND = J - 1 + ELSE + IEND = J + END IF + ELSE + IF( UNIT )THEN + IBEG = J + 1 + ELSE + IBEG = J + END IF + IEND = N + END IF + DO 100 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 100 CONTINUE + DO 110 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 110 CONTINUE + DO 120 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 120 CONTINUE + IF( SYM )THEN + JJ = J + ( J - 1 )*LDA + AA( JJ ) = DCMPLX( DBLE( AA( JJ ) ), RROGUE ) + END IF + 130 CONTINUE + ELSE IF( TYPE.EQ.'HB'.OR.TYPE.EQ.'TB' )THEN + DO 170 J = 1, N + IF( UPPER )THEN + KK = KL + 1 + IBEG = MAX( 1, KL + 2 - J ) + IF( UNIT )THEN + IEND = KL + ELSE + IEND = KL + 1 + END IF + ELSE + KK = 1 + IF( UNIT )THEN + IBEG = 2 + ELSE + IBEG = 1 + END IF + IEND = MIN( KL + 1, 1 + M - J ) + END IF + DO 140 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 140 CONTINUE + DO 150 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I + J - KK, J ) + 150 CONTINUE + DO 160 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 160 CONTINUE + IF( SYM )THEN + JJ = KK + ( J - 1 )*LDA + AA( JJ ) = DCMPLX( DBLE( AA( JJ ) ), RROGUE ) + END IF + 170 CONTINUE + ELSE IF( TYPE.EQ.'HP'.OR.TYPE.EQ.'TP' )THEN + IOFF = 0 + DO 190 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 180 I = IBEG, IEND + IOFF = IOFF + 1 + AA( IOFF ) = A( I, J ) + IF( I.EQ.J )THEN + IF( UNIT ) + $ AA( IOFF ) = ROGUE + IF( SYM ) + $ AA( IOFF ) = DCMPLX( DBLE( AA( IOFF ) ), RROGUE ) + END IF + 180 CONTINUE + 190 CONTINUE + END IF + RETURN +* +* End of ZMAKE. +* + END + SUBROUTINE ZMVCH( TRANS, M, N, ALPHA, A, NMAX, X, INCX, BETA, Y, + $ INCY, YT, G, YY, EPS, ERR, FATAL, NOUT, MV ) +* +* Checks the results of the computational tests. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO, RONE + PARAMETER ( RZERO = 0.0D0, RONE = 1.0D0 ) +* .. Scalar Arguments .. + COMPLEX*16 ALPHA, BETA + DOUBLE PRECISION EPS, ERR + INTEGER INCX, INCY, M, N, NMAX, NOUT + LOGICAL FATAL, MV + CHARACTER*1 TRANS +* .. Array Arguments .. + COMPLEX*16 A( NMAX, * ), X( * ), Y( * ), YT( * ), YY( * ) + DOUBLE PRECISION G( * ) +* .. Local Scalars .. + COMPLEX*16 C + DOUBLE PRECISION ERRI + INTEGER I, INCXL, INCYL, IY, J, JX, KX, KY, ML, NL + LOGICAL CTRAN, TRAN +* .. Intrinsic Functions .. + INTRINSIC ABS, DBLE, DCONJG, DIMAG, MAX, SQRT +* .. Statement Functions .. + DOUBLE PRECISION ABS1 +* .. Statement Function definitions .. + ABS1( C ) = ABS( DBLE( C ) ) + ABS( DIMAG( C ) ) +* .. Executable Statements .. 
+ TRAN = TRANS.EQ.'T' + CTRAN = TRANS.EQ.'C' + IF( TRAN.OR.CTRAN )THEN + ML = N + NL = M + ELSE + ML = M + NL = N + END IF + IF( INCX.LT.0 )THEN + KX = NL + INCXL = -1 + ELSE + KX = 1 + INCXL = 1 + END IF + IF( INCY.LT.0 )THEN + KY = ML + INCYL = -1 + ELSE + KY = 1 + INCYL = 1 + END IF +* +* Compute expected result in YT using data in A, X and Y. +* Compute gauges in G. +* + IY = KY + DO 40 I = 1, ML + YT( IY ) = ZERO + G( IY ) = RZERO + JX = KX + IF( TRAN )THEN + DO 10 J = 1, NL + YT( IY ) = YT( IY ) + A( J, I )*X( JX ) + G( IY ) = G( IY ) + ABS1( A( J, I ) )*ABS1( X( JX ) ) + JX = JX + INCXL + 10 CONTINUE + ELSE IF( CTRAN )THEN + DO 20 J = 1, NL + YT( IY ) = YT( IY ) + DCONJG( A( J, I ) )*X( JX ) + G( IY ) = G( IY ) + ABS1( A( J, I ) )*ABS1( X( JX ) ) + JX = JX + INCXL + 20 CONTINUE + ELSE + DO 30 J = 1, NL + YT( IY ) = YT( IY ) + A( I, J )*X( JX ) + G( IY ) = G( IY ) + ABS1( A( I, J ) )*ABS1( X( JX ) ) + JX = JX + INCXL + 30 CONTINUE + END IF + YT( IY ) = ALPHA*YT( IY ) + BETA*Y( IY ) + G( IY ) = ABS1( ALPHA )*G( IY ) + ABS1( BETA )*ABS1( Y( IY ) ) + IY = IY + INCYL + 40 CONTINUE +* +* Compute the error ratio for this result. +* + ERR = ZERO + DO 50 I = 1, ML + ERRI = ABS( YT( I ) - YY( 1 + ( I - 1 )*ABS( INCY ) ) )/EPS + IF( G( I ).NE.RZERO ) + $ ERRI = ERRI/G( I ) + ERR = MAX( ERR, ERRI ) + IF( ERR*SQRT( EPS ).GE.RONE ) + $ GO TO 60 + 50 CONTINUE +* If the loop completes, all results are at least half accurate. + GO TO 80 +* +* Report fatal error. +* + 60 FATAL = .TRUE. + WRITE( NOUT, FMT = 9999 ) + DO 70 I = 1, ML + IF( MV )THEN + WRITE( NOUT, FMT = 9998 )I, YT( I ), + $ YY( 1 + ( I - 1 )*ABS( INCY ) ) + ELSE + WRITE( NOUT, FMT = 9998 )I, + $ YY( 1 + ( I - 1 )*ABS( INCY ) ), YT( I ) + END IF + 70 CONTINUE +* + 80 CONTINUE + RETURN +* + 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', + $ 'F ACCURATE *******', /' EXPECTED RE', + $ 'SULT COMPUTED RESULT' ) + 9998 FORMAT( 1X, I7, 2( ' (', G15.6, ',', G15.6, ')' ) ) +* +* End of ZMVCH. +* + END + LOGICAL FUNCTION LZE( RI, RJ, LR ) +* +* Tests if two arrays are identical. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER LR +* .. Array Arguments .. + COMPLEX*16 RI( * ), RJ( * ) +* .. Local Scalars .. + INTEGER I +* .. Executable Statements .. + DO 10 I = 1, LR + IF( RI( I ).NE.RJ( I ) ) + $ GO TO 20 + 10 CONTINUE + LZE = .TRUE. + GO TO 30 + 20 CONTINUE + LZE = .FALSE. + 30 RETURN +* +* End of LZE. +* + END + LOGICAL FUNCTION LZERES( TYPE, UPLO, M, N, AA, AS, LDA ) +* +* Tests if selected elements in two arrays are equal. +* +* TYPE is 'GE', 'HE' or 'HP'. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER LDA, M, N + CHARACTER*1 UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + COMPLEX*16 AA( LDA, * ), AS( LDA, * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J + LOGICAL UPPER +* .. Executable Statements .. 
+ UPPER = UPLO.EQ.'U' + IF( TYPE.EQ.'GE' )THEN + DO 20 J = 1, N + DO 10 I = M + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 10 CONTINUE + 20 CONTINUE + ELSE IF( TYPE.EQ.'HE' )THEN + DO 50 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 30 I = 1, IBEG - 1 + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 30 CONTINUE + DO 40 I = IEND + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 40 CONTINUE + 50 CONTINUE + END IF +* + 60 CONTINUE + LZERES = .TRUE. + GO TO 80 + 70 CONTINUE + LZERES = .FALSE. + 80 RETURN +* +* End of LZERES. +* + END + COMPLEX*16 FUNCTION ZBEG( RESET ) +* +* Generates complex numbers as pairs of random numbers uniformly +* distributed between -0.5 and 0.5. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + LOGICAL RESET +* .. Local Scalars .. + INTEGER I, IC, J, MI, MJ +* .. Save statement .. + SAVE I, IC, J, MI, MJ +* .. Intrinsic Functions .. + INTRINSIC DCMPLX +* .. Executable Statements .. + IF( RESET )THEN +* Initialize local variables. + MI = 891 + MJ = 457 + I = 7 + J = 7 + IC = 0 + RESET = .FALSE. + END IF +* +* The sequence of values of I or J is bounded between 1 and 999. +* If initial I or J = 1,2,3,6,7 or 9, the period will be 50. +* If initial I or J = 4 or 8, the period will be 25. +* If initial I or J = 5, the period will be 10. +* IC is used to break up the period by skipping 1 value of I or J +* in 6. +* + IC = IC + 1 + 10 I = I*MI + J = J*MJ + I = I - 1000*( I/1000 ) + J = J - 1000*( J/1000 ) + IF( IC.GE.5 )THEN + IC = 0 + GO TO 10 + END IF + ZBEG = DCMPLX( ( I - 500 )/1001.0D0, ( J - 500 )/1001.0D0 ) + RETURN +* +* End of ZBEG. +* + END + DOUBLE PRECISION FUNCTION DDIFF( X, Y ) +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* +* .. Scalar Arguments .. + DOUBLE PRECISION X, Y +* .. Executable Statements .. + DDIFF = X - Y + RETURN +* +* End of DDIFF. +* + END + SUBROUTINE CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) +* +* Tests whether XERBLA has detected an error when it should. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER INFOT, NOUT + LOGICAL LERR, OK + CHARACTER*6 SRNAMT +* .. Executable Statements .. + IF( .NOT.LERR )THEN + WRITE( NOUT, FMT = 9999 )INFOT, SRNAMT + OK = .FALSE. + END IF + LERR = .FALSE. + RETURN +* + 9999 FORMAT( ' ***** ILLEGAL VALUE OF PARAMETER NUMBER ', I2, ' NOT D', + $ 'ETECTED BY ', A6, ' *****' ) +* +* End of CHKXER. +* + END + SUBROUTINE XERBLA( SRNAME, INFO ) +* +* This is a special version of XERBLA to be used only as part of +* the test program for testing error exits from the Level 2 BLAS +* routines. +* +* XERBLA is an error handler for the Level 2 BLAS routines. +* +* It is called by the Level 2 BLAS routines if an input parameter is +* invalid. +* +* Auxiliary routine for test program for Level 2 Blas. +* +* -- Written on 10-August-1987. +* Richard Hanson, Sandia National Labs. +* Jeremy Du Croz, NAG Central Office. +* +* .. Scalar Arguments .. + INTEGER INFO + CHARACTER*6 SRNAME +* .. Scalars in Common .. + INTEGER INFOT, NOUT + LOGICAL LERR, OK + CHARACTER*6 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUT, OK, LERR + COMMON /SRNAMC/SRNAMT +* .. 
Executable Statements .. + LERR = .TRUE. + IF( INFO.NE.INFOT )THEN + IF( INFOT.NE.0 )THEN + WRITE( NOUT, FMT = 9999 )INFO, INFOT + ELSE + WRITE( NOUT, FMT = 9997 )INFO + END IF + OK = .FALSE. + END IF + IF( SRNAME.NE.SRNAMT )THEN + WRITE( NOUT, FMT = 9998 )SRNAME, SRNAMT + OK = .FALSE. + END IF + RETURN +* + 9999 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, ' INSTEAD', + $ ' OF ', I2, ' *******' ) + 9998 FORMAT( ' ******* XERBLA WAS CALLED WITH SRNAME = ', A6, ' INSTE', + $ 'AD OF ', A6, ' *******' ) + 9997 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, + $ ' *******' ) +* +* End of XERBLA +* + END + diff --git a/test/zblat3.dat b/test/zblat3.dat new file mode 100644 index 0000000000..c02ac4f767 --- /dev/null +++ b/test/zblat3.dat @@ -0,0 +1,23 @@ +'ZBLAT3.SUMM' NAME OF SUMMARY OUTPUT FILE +6 UNIT NUMBER OF SUMMARY FILE +'ZBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +-1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +F LOGICAL FLAG, T TO STOP ON FAILURES. +F LOGICAL FLAG, T TO TEST ERROR EXITS. +16.0 THRESHOLD VALUE OF TEST RATIO +6 NUMBER OF VALUES OF N +0 1 2 3 7 31 63 VALUES OF N +3 NUMBER OF VALUES OF ALPHA +(0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +3 NUMBER OF VALUES OF BETA +(0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +ZGEMM T PUT F FOR NO TEST. SAME COLUMNS. +ZHEMM T PUT F FOR NO TEST. SAME COLUMNS. +ZSYMM T PUT F FOR NO TEST. SAME COLUMNS. +ZTRMM T PUT F FOR NO TEST. SAME COLUMNS. +ZTRSM T PUT F FOR NO TEST. SAME COLUMNS. +ZHERK T PUT F FOR NO TEST. SAME COLUMNS. +ZSYRK T PUT F FOR NO TEST. SAME COLUMNS. +ZHER2K T PUT F FOR NO TEST. SAME COLUMNS. +ZSYR2K T PUT F FOR NO TEST. SAME COLUMNS. diff --git a/test/zblat3.f b/test/zblat3.f new file mode 100644 index 0000000000..d6a522f2ae --- /dev/null +++ b/test/zblat3.f @@ -0,0 +1,3445 @@ + PROGRAM ZBLAT3 +* +* Test program for the COMPLEX*16 Level 3 Blas. +* +* The program must be driven by a short data file. The first 14 records +* of the file are read using list-directed input, the last 9 records +* are read using the format ( A6, L2 ). An annotated example of a data +* file can be obtained by deleting the first 3 characters from the +* following 23 lines: +* 'ZBLAT3.SUMM' NAME OF SUMMARY OUTPUT FILE +* 6 UNIT NUMBER OF SUMMARY FILE +* 'ZBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE +* -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) +* F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. +* F LOGICAL FLAG, T TO STOP ON FAILURES. +* T LOGICAL FLAG, T TO TEST ERROR EXITS. +* 16.0 THRESHOLD VALUE OF TEST RATIO +* 6 NUMBER OF VALUES OF N +* 0 1 2 3 5 9 VALUES OF N +* 3 NUMBER OF VALUES OF ALPHA +* (0.0,0.0) (1.0,0.0) (0.7,-0.9) VALUES OF ALPHA +* 3 NUMBER OF VALUES OF BETA +* (0.0,0.0) (1.0,0.0) (1.3,-1.1) VALUES OF BETA +* ZGEMM T PUT F FOR NO TEST. SAME COLUMNS. +* ZHEMM T PUT F FOR NO TEST. SAME COLUMNS. +* ZSYMM T PUT F FOR NO TEST. SAME COLUMNS. +* ZTRMM T PUT F FOR NO TEST. SAME COLUMNS. +* ZTRSM T PUT F FOR NO TEST. SAME COLUMNS. +* ZHERK T PUT F FOR NO TEST. SAME COLUMNS. +* ZSYRK T PUT F FOR NO TEST. SAME COLUMNS. +* ZHER2K T PUT F FOR NO TEST. SAME COLUMNS. +* ZSYR2K T PUT F FOR NO TEST. SAME COLUMNS. +* +* See: +* +* Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. +* A Set of Level 3 Basic Linear Algebra Subprograms. +* +* Technical Memorandum No.88 (Revision 1), Mathematics and +* Computer Science Division, Argonne National Laboratory, 9700 +* South Cass Avenue, Argonne, Illinois 60439, US. 
+* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + INTEGER NIN + PARAMETER ( NIN = 5 ) + INTEGER NSUBS + PARAMETER ( NSUBS = 9 ) + COMPLEX*16 ZERO, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ ONE = ( 1.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO, RHALF, RONE + PARAMETER ( RZERO = 0.0D0, RHALF = 0.5D0, RONE = 1.0D0 ) + INTEGER NMAX + PARAMETER ( NMAX = 65 ) + INTEGER NIDMAX, NALMAX, NBEMAX + PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) +* .. Local Scalars .. + DOUBLE PRECISION EPS, ERR, THRESH + INTEGER I, ISNUM, J, N, NALF, NBET, NIDIM, NOUT, NTRA + LOGICAL FATAL, LTESTT, REWI, SAME, SFATAL, TRACE, + $ TSTERR + CHARACTER*1 TRANSA, TRANSB + CHARACTER*6 SNAMET + CHARACTER*32 SNAPS, SUMMRY +* .. Local Arrays .. + COMPLEX*16 AA( NMAX*NMAX ), AB( NMAX, 2*NMAX ), + $ ALF( NALMAX ), AS( NMAX*NMAX ), + $ BB( NMAX*NMAX ), BET( NBEMAX ), + $ BS( NMAX*NMAX ), C( NMAX, NMAX ), + $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), + $ W( 2*NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDMAX ) + LOGICAL LTEST( NSUBS ) + CHARACTER*6 SNAMES( NSUBS ) +* .. External Functions .. + DOUBLE PRECISION DDIFF + LOGICAL LZE + EXTERNAL DDIFF, LZE +* .. External Subroutines .. + EXTERNAL ZCHK1, ZCHK2, ZCHK3, ZCHK4, ZCHK5, ZCHKE, ZMMCH +* .. Intrinsic Functions .. + INTRINSIC MAX, MIN +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK + CHARACTER*6 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR + COMMON /SRNAMC/SRNAMT +* .. Data statements .. + DATA SNAMES/'ZGEMM ', 'ZHEMM ', 'ZSYMM ', 'ZTRMM ', + $ 'ZTRSM ', 'ZHERK ', 'ZSYRK ', 'ZHER2K', + $ 'ZSYR2K'/ +* .. Executable Statements .. +* +* Read name and unit number for summary output file and open file. +* + READ( NIN, FMT = * )SUMMRY + READ( NIN, FMT = * )NOUT + OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + NOUTC = NOUT +* +* Read name and unit number for snapshot output file and open file. +* + READ( NIN, FMT = * )SNAPS + READ( NIN, FMT = * )NTRA + TRACE = NTRA.GE.0 + IF( TRACE )THEN + OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + END IF +* Read the flag that directs rewinding of the snapshot file. + READ( NIN, FMT = * )REWI + REWI = REWI.AND.TRACE +* Read the flag that directs stopping on any failure. + READ( NIN, FMT = * )SFATAL +* Read the flag that indicates whether error exits are to be tested. + READ( NIN, FMT = * )TSTERR +* Read the threshold value of the test ratio + READ( NIN, FMT = * )THRESH +* +* Read and check the parameter values for the tests. +* +* Values of N + READ( NIN, FMT = * )NIDIM + IF( NIDIM.LT.1.OR.NIDIM.GT.NIDMAX )THEN + WRITE( NOUT, FMT = 9997 )'N', NIDMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( IDIM( I ), I = 1, NIDIM ) + DO 10 I = 1, NIDIM + IF( IDIM( I ).LT.0.OR.IDIM( I ).GT.NMAX )THEN + WRITE( NOUT, FMT = 9996 )NMAX + GO TO 220 + END IF + 10 CONTINUE +* Values of ALPHA + READ( NIN, FMT = * )NALF + IF( NALF.LT.1.OR.NALF.GT.NALMAX )THEN + WRITE( NOUT, FMT = 9997 )'ALPHA', NALMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( ALF( I ), I = 1, NALF ) +* Values of BETA + READ( NIN, FMT = * )NBET + IF( NBET.LT.1.OR.NBET.GT.NBEMAX )THEN + WRITE( NOUT, FMT = 9997 )'BETA', NBEMAX + GO TO 220 + END IF + READ( NIN, FMT = * )( BET( I ), I = 1, NBET ) +* +* Report values of parameters. 
+* + WRITE( NOUT, FMT = 9995 ) + WRITE( NOUT, FMT = 9994 )( IDIM( I ), I = 1, NIDIM ) + WRITE( NOUT, FMT = 9993 )( ALF( I ), I = 1, NALF ) + WRITE( NOUT, FMT = 9992 )( BET( I ), I = 1, NBET ) + IF( .NOT.TSTERR )THEN + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9984 ) + END IF + WRITE( NOUT, FMT = * ) + WRITE( NOUT, FMT = 9999 )THRESH + WRITE( NOUT, FMT = * ) +* +* Read names of subroutines and flags which indicate +* whether they are to be tested. +* + DO 20 I = 1, NSUBS + LTEST( I ) = .FALSE. + 20 CONTINUE + 30 READ( NIN, FMT = 9988, END = 60 )SNAMET, LTESTT + DO 40 I = 1, NSUBS + IF( SNAMET.EQ.SNAMES( I ) ) + $ GO TO 50 + 40 CONTINUE + WRITE( NOUT, FMT = 9990 )SNAMET + STOP + 50 LTEST( I ) = LTESTT + GO TO 30 +* + 60 CONTINUE + CLOSE ( NIN ) +* +* Compute EPS (the machine precision). +* + EPS = RONE + 70 CONTINUE + IF( DDIFF( RONE + EPS, RONE ).EQ.RZERO ) + $ GO TO 80 + EPS = RHALF*EPS + GO TO 70 + 80 CONTINUE + EPS = EPS + EPS + WRITE( NOUT, FMT = 9998 )EPS +* +* Check the reliability of ZMMCH using exact data. +* + N = MIN( 32, NMAX ) + DO 100 J = 1, N + DO 90 I = 1, N + AB( I, J ) = MAX( I - J + 1, 0 ) + 90 CONTINUE + AB( J, NMAX + 1 ) = J + AB( 1, NMAX + J ) = J + C( J, 1 ) = ZERO + 100 CONTINUE + DO 110 J = 1, N + CC( J ) = J*( ( J + 1 )*J )/2 - ( ( J + 1 )*J*( J - 1 ) )/3 + 110 CONTINUE +* CC holds the exact result. On exit from ZMMCH CT holds +* the result computed by ZMMCH. + TRANSA = 'N' + TRANSB = 'N' + CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LZE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + TRANSB = 'C' + CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LZE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + DO 120 J = 1, N + AB( J, NMAX + 1 ) = N - J + 1 + AB( 1, NMAX + J ) = N - J + 1 + 120 CONTINUE + DO 130 J = 1, N + CC( N - J + 1 ) = J*( ( J + 1 )*J )/2 - + $ ( ( J + 1 )*J*( J - 1 ) )/3 + 130 CONTINUE + TRANSA = 'C' + TRANSB = 'N' + CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LZE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF + TRANSB = 'C' + CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, + $ AB( 1, NMAX + 1 ), NMAX, ZERO, C, NMAX, CT, G, CC, + $ NMAX, EPS, ERR, FATAL, NOUT, .TRUE. ) + SAME = LZE( CC, CT, N ) + IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN + WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR + STOP + END IF +* +* Test each subroutine in turn. +* + DO 200 ISNUM = 1, NSUBS + WRITE( NOUT, FMT = * ) + IF( .NOT.LTEST( ISNUM ) )THEN +* Subprogram is not to be tested. + WRITE( NOUT, FMT = 9987 )SNAMES( ISNUM ) + ELSE + SRNAMT = SNAMES( ISNUM ) +* Test error exits. + IF( TSTERR )THEN + CALL ZCHKE( ISNUM, SNAMES( ISNUM ), NOUT ) + WRITE( NOUT, FMT = * ) + END IF +* Test computations. + INFOT = 0 + OK = .TRUE. + FATAL = .FALSE. + GO TO ( 140, 150, 150, 160, 160, 170, 170, + $ 180, 180 )ISNUM +* Test ZGEMM, 01. 
+ 140 CALL ZCHK1( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G ) + GO TO 190 +* Test ZHEMM, 02, ZSYMM, 03. + 150 CALL ZCHK2( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G ) + GO TO 190 +* Test ZTRMM, 04, ZTRSM, 05. + 160 CALL ZCHK3( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NMAX, AB, + $ AA, AS, AB( 1, NMAX + 1 ), BB, BS, CT, G, C ) + GO TO 190 +* Test ZHERK, 06, ZSYRK, 07. + 170 CALL ZCHK4( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, AB( 1, NMAX + 1 ), BB, BS, C, + $ CC, CS, CT, G ) + GO TO 190 +* Test ZHER2K, 08, ZSYR2K, 09. + 180 CALL ZCHK5( SNAMES( ISNUM ), EPS, THRESH, NOUT, NTRA, TRACE, + $ REWI, FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, + $ NMAX, AB, AA, AS, BB, BS, C, CC, CS, CT, G, W ) + GO TO 190 +* + 190 IF( FATAL.AND.SFATAL ) + $ GO TO 210 + END IF + 200 CONTINUE + WRITE( NOUT, FMT = 9986 ) + GO TO 230 +* + 210 CONTINUE + WRITE( NOUT, FMT = 9985 ) + GO TO 230 +* + 220 CONTINUE + WRITE( NOUT, FMT = 9991 ) +* + 230 CONTINUE + IF( TRACE ) + $ CLOSE ( NTRA ) + CLOSE ( NOUT ) + STOP +* + 9999 FORMAT( ' ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LES', + $ 'S THAN', F8.2 ) + 9998 FORMAT( ' RELATIVE MACHINE PRECISION IS TAKEN TO BE', 1P, D9.1 ) + 9997 FORMAT( ' NUMBER OF VALUES OF ', A, ' IS LESS THAN 1 OR GREATER ', + $ 'THAN ', I2 ) + 9996 FORMAT( ' VALUE OF N IS LESS THAN 0 OR GREATER THAN ', I2 ) + 9995 FORMAT( ' TESTS OF THE COMPLEX*16 LEVEL 3 BLAS', //' THE F', + $ 'OLLOWING PARAMETER VALUES WILL BE USED:' ) + 9994 FORMAT( ' FOR N ', 9I6 ) + 9993 FORMAT( ' FOR ALPHA ', + $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) + 9992 FORMAT( ' FOR BETA ', + $ 7( '(', F4.1, ',', F4.1, ') ', : ) ) + 9991 FORMAT( ' AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM', + $ /' ******* TESTS ABANDONED *******' ) + 9990 FORMAT( ' SUBPROGRAM NAME ', A6, ' NOT RECOGNIZED', /' ******* T', + $ 'ESTS ABANDONED *******' ) + 9989 FORMAT( ' ERROR IN ZMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALU', + $ 'ATED WRONGLY.', /' ZMMCH WAS CALLED WITH TRANSA = ', A1, + $ ' AND TRANSB = ', A1, /' AND RETURNED SAME = ', L1, ' AND ', + $ 'ERR = ', F12.3, '.', /' THIS MAY BE DUE TO FAULTS IN THE ', + $ 'ARITHMETIC OR THE COMPILER.', /' ******* TESTS ABANDONED ', + $ '*******' ) + 9988 FORMAT( A6, L2 ) + 9987 FORMAT( 1X, A6, ' WAS NOT TESTED' ) + 9986 FORMAT( /' END OF TESTS' ) + 9985 FORMAT( /' ******* FATAL ERROR - TESTS ABANDONED *******' ) + 9984 FORMAT( ' ERROR-EXITS WILL NOT BE TESTED' ) +* +* End of ZBLAT3. +* + END + SUBROUTINE ZCHK1( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G ) +* +* Tests ZGEMM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) +* .. Scalar Arguments .. 
+ DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, ALS, BETA, BLS + DOUBLE PRECISION ERR, ERRMAX + INTEGER I, IA, IB, ICA, ICB, IK, IM, IN, K, KS, LAA, + $ LBB, LCC, LDA, LDAS, LDB, LDBS, LDC, LDCS, M, + $ MA, MB, MS, N, NA, NARGS, NB, NC, NS + LOGICAL NULL, RESET, SAME, TRANA, TRANB + CHARACTER*1 TRANAS, TRANBS, TRANSA, TRANSB + CHARACTER*3 ICH +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL ZGEMM, ZMAKE, ZMMCH +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICH/'NTC'/ +* .. Executable Statements .. +* + NARGS = 13 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 110 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = M + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 100 + LCC = LDC*N + NULL = N.LE.0.OR.M.LE.0 +* + DO 90 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 80 ICA = 1, 3 + TRANSA = ICH( ICA: ICA ) + TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' +* + IF( TRANA )THEN + MA = K + NA = M + ELSE + MA = M + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* +* Generate the matrix A. +* + CALL ZMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 70 ICB = 1, 3 + TRANSB = ICH( ICB: ICB ) + TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' +* + IF( TRANB )THEN + MB = N + NB = K + ELSE + MB = K + NB = N + END IF +* Set LDB to 1 more than minimum value if room. + LDB = MB + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 70 + LBB = LDB*NB +* +* Generate the matrix B. +* + CALL ZMAKE( 'GE', ' ', ' ', MB, NB, B, NMAX, BB, + $ LDB, RESET, ZERO ) +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL ZMAKE( 'GE', ' ', ' ', M, N, C, NMAX, + $ CC, LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + TRANAS = TRANSA + TRANBS = TRANSB + MS = M + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BLS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ TRANSA, TRANSB, M, N, K, ALPHA, LDA, LDB, + $ BETA, LDC + IF( REWI ) + $ REWIND NTRA + CALL ZGEMM( TRANSA, TRANSB, M, N, K, ALPHA, + $ AA, LDA, BB, LDB, BETA, CC, LDC ) +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. 
+* + ISAME( 1 ) = TRANSA.EQ.TRANAS + ISAME( 2 ) = TRANSB.EQ.TRANBS + ISAME( 3 ) = MS.EQ.M + ISAME( 4 ) = NS.EQ.N + ISAME( 5 ) = KS.EQ.K + ISAME( 6 ) = ALS.EQ.ALPHA + ISAME( 7 ) = LZE( AS, AA, LAA ) + ISAME( 8 ) = LDAS.EQ.LDA + ISAME( 9 ) = LZE( BS, BB, LBB ) + ISAME( 10 ) = LDBS.EQ.LDB + ISAME( 11 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 12 ) = LZE( CS, CC, LCC ) + ELSE + ISAME( 12 ) = LZERES( 'GE', ' ', M, N, CS, + $ CC, LDC ) + END IF + ISAME( 13 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report +* and return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + CALL ZMMCH( TRANSA, TRANSB, M, N, K, + $ ALPHA, A, NMAX, B, NMAX, BETA, + $ C, NMAX, CT, G, CC, LDC, EPS, + $ ERR, FATAL, NOUT, .TRUE. ) + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 120 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9995 )NC, SNAME, TRANSA, TRANSB, M, N, K, + $ ALPHA, LDA, LDB, BETA, LDC +* + 130 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A6, '(''', A1, ''',''', A1, ''',', + $ 3( I3, ',' ), '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, + $ ',(', F4.1, ',', F4.1, '), C,', I3, ').' ) + 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of ZCHK1. +* + END + SUBROUTINE ZCHK2( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G ) +* +* Tests ZHEMM and ZSYMM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. 
+ COMPLEX*16 ALPHA, ALS, BETA, BLS + DOUBLE PRECISION ERR, ERRMAX + INTEGER I, IA, IB, ICS, ICU, IM, IN, LAA, LBB, LCC, + $ LDA, LDAS, LDB, LDBS, LDC, LDCS, M, MS, N, NA, + $ NARGS, NC, NS + LOGICAL CONJ, LEFT, NULL, RESET, SAME + CHARACTER*1 SIDE, SIDES, UPLO, UPLOS + CHARACTER*2 ICHS, ICHU +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL ZHEMM, ZMAKE, ZMMCH, ZSYMM +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHS/'LR'/, ICHU/'UL'/ +* .. Executable Statements .. + CONJ = SNAME( 2: 3 ).EQ.'HE' +* + NARGS = 12 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 100 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 90 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = M + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 90 + LCC = LDC*N + NULL = N.LE.0.OR.M.LE.0 +* Set LDB to 1 more than minimum value if room. + LDB = M + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 90 + LBB = LDB*N +* +* Generate the matrix B. +* + CALL ZMAKE( 'GE', ' ', ' ', M, N, B, NMAX, BB, LDB, RESET, + $ ZERO ) +* + DO 80 ICS = 1, 2 + SIDE = ICHS( ICS: ICS ) + LEFT = SIDE.EQ.'L' +* + IF( LEFT )THEN + NA = M + ELSE + NA = N + END IF +* Set LDA to 1 more than minimum value if room. + LDA = NA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* + DO 70 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* +* Generate the hermitian or symmetric matrix A. +* + CALL ZMAKE( SNAME( 2: 3 ), UPLO, ' ', NA, NA, A, NMAX, + $ AA, LDA, RESET, ZERO ) +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 50 IB = 1, NBET + BETA = BET( IB ) +* +* Generate the matrix C. +* + CALL ZMAKE( 'GE', ' ', ' ', M, N, C, NMAX, CC, + $ LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + SIDES = SIDE + UPLOS = UPLO + MS = M + NS = N + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + BLS = BETA + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, SIDE, + $ UPLO, M, N, ALPHA, LDA, LDB, BETA, LDC + IF( REWI ) + $ REWIND NTRA + IF( CONJ )THEN + CALL ZHEMM( SIDE, UPLO, M, N, ALPHA, AA, LDA, + $ BB, LDB, BETA, CC, LDC ) + ELSE + CALL ZSYMM( SIDE, UPLO, M, N, ALPHA, AA, LDA, + $ BB, LDB, BETA, CC, LDC ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 110 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = SIDES.EQ.SIDE + ISAME( 2 ) = UPLOS.EQ.UPLO + ISAME( 3 ) = MS.EQ.M + ISAME( 4 ) = NS.EQ.N + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LZE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = LZE( BS, BB, LBB ) + ISAME( 9 ) = LDBS.EQ.LDB + ISAME( 10 ) = BLS.EQ.BETA + IF( NULL )THEN + ISAME( 11 ) = LZE( CS, CC, LCC ) + ELSE + ISAME( 11 ) = LZERES( 'GE', ' ', M, N, CS, + $ CC, LDC ) + END IF + ISAME( 12 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. 
+ DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 110 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result. +* + IF( LEFT )THEN + CALL ZMMCH( 'N', 'N', M, N, M, ALPHA, A, + $ NMAX, B, NMAX, BETA, C, NMAX, + $ CT, G, CC, LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL ZMMCH( 'N', 'N', M, N, N, ALPHA, B, + $ NMAX, A, NMAX, BETA, C, NMAX, + $ CT, G, CC, LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 110 + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 120 +* + 110 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9995 )NC, SNAME, SIDE, UPLO, M, N, ALPHA, LDA, + $ LDB, BETA, LDC +* + 120 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, + $ ',', F4.1, '), C,', I3, ') .' ) + 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of ZCHK2. +* + END + SUBROUTINE ZCHK3( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NMAX, A, AA, AS, + $ B, BB, BS, CT, G, C ) +* +* Tests ZTRMM and ZTRSM. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX*16 ZERO, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ ONE = ( 1.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NIDIM, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CT( NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, ALS + DOUBLE PRECISION ERR, ERRMAX + INTEGER I, IA, ICD, ICS, ICT, ICU, IM, IN, J, LAA, LBB, + $ LDA, LDAS, LDB, LDBS, M, MS, N, NA, NARGS, NC, + $ NS + LOGICAL LEFT, NULL, RESET, SAME + CHARACTER*1 DIAG, DIAGS, SIDE, SIDES, TRANAS, TRANSA, UPLO, + $ UPLOS + CHARACTER*2 ICHD, ICHS, ICHU + CHARACTER*3 ICHT +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL ZMAKE, ZMMCH, ZTRMM, ZTRSM +* .. Intrinsic Functions .. + INTRINSIC MAX +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. 
+ DATA ICHU/'UL'/, ICHT/'NTC'/, ICHD/'UN'/, ICHS/'LR'/ +* .. Executable Statements .. +* + NARGS = 11 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* Set up zero matrix for ZMMCH. + DO 20 J = 1, NMAX + DO 10 I = 1, NMAX + C( I, J ) = ZERO + 10 CONTINUE + 20 CONTINUE +* + DO 140 IM = 1, NIDIM + M = IDIM( IM ) +* + DO 130 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDB to 1 more than minimum value if room. + LDB = M + IF( LDB.LT.NMAX ) + $ LDB = LDB + 1 +* Skip tests if not enough room. + IF( LDB.GT.NMAX ) + $ GO TO 130 + LBB = LDB*N + NULL = M.LE.0.OR.N.LE.0 +* + DO 120 ICS = 1, 2 + SIDE = ICHS( ICS: ICS ) + LEFT = SIDE.EQ.'L' + IF( LEFT )THEN + NA = M + ELSE + NA = N + END IF +* Set LDA to 1 more than minimum value if room. + LDA = NA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 130 + LAA = LDA*NA +* + DO 110 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) +* + DO 100 ICT = 1, 3 + TRANSA = ICHT( ICT: ICT ) +* + DO 90 ICD = 1, 2 + DIAG = ICHD( ICD: ICD ) +* + DO 80 IA = 1, NALF + ALPHA = ALF( IA ) +* +* Generate the matrix A. +* + CALL ZMAKE( 'TR', UPLO, DIAG, NA, NA, A, + $ NMAX, AA, LDA, RESET, ZERO ) +* +* Generate the matrix B. +* + CALL ZMAKE( 'GE', ' ', ' ', M, N, B, NMAX, + $ BB, LDB, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the +* subroutine. +* + SIDES = SIDE + UPLOS = UPLO + TRANAS = TRANSA + DIAGS = DIAG + MS = M + NS = N + ALS = ALPHA + DO 30 I = 1, LAA + AS( I ) = AA( I ) + 30 CONTINUE + LDAS = LDA + DO 40 I = 1, LBB + BS( I ) = BB( I ) + 40 CONTINUE + LDBS = LDB +* +* Call the subroutine. +* + IF( SNAME( 4: 5 ).EQ.'MM' )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, + $ LDA, LDB + IF( REWI ) + $ REWIND NTRA + CALL ZTRMM( SIDE, UPLO, TRANSA, DIAG, M, + $ N, ALPHA, AA, LDA, BB, LDB ) + ELSE IF( SNAME( 4: 5 ).EQ.'SM' )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9995 )NC, SNAME, + $ SIDE, UPLO, TRANSA, DIAG, M, N, ALPHA, + $ LDA, LDB + IF( REWI ) + $ REWIND NTRA + CALL ZTRSM( SIDE, UPLO, TRANSA, DIAG, M, + $ N, ALPHA, AA, LDA, BB, LDB ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9994 ) + FATAL = .TRUE. + GO TO 150 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = SIDES.EQ.SIDE + ISAME( 2 ) = UPLOS.EQ.UPLO + ISAME( 3 ) = TRANAS.EQ.TRANSA + ISAME( 4 ) = DIAGS.EQ.DIAG + ISAME( 5 ) = MS.EQ.M + ISAME( 6 ) = NS.EQ.N + ISAME( 7 ) = ALS.EQ.ALPHA + ISAME( 8 ) = LZE( AS, AA, LAA ) + ISAME( 9 ) = LDAS.EQ.LDA + IF( NULL )THEN + ISAME( 10 ) = LZE( BS, BB, LBB ) + ELSE + ISAME( 10 ) = LZERES( 'GE', ' ', M, N, BS, + $ BB, LDB ) + END IF + ISAME( 11 ) = LDBS.EQ.LDB +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 50 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 50 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 150 + END IF +* + IF( .NOT.NULL )THEN + IF( SNAME( 4: 5 ).EQ.'MM' )THEN +* +* Check the result. +* + IF( LEFT )THEN + CALL ZMMCH( TRANSA, 'N', M, N, M, + $ ALPHA, A, NMAX, B, NMAX, + $ ZERO, C, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL ZMMCH( 'N', TRANSA, M, N, N, + $ ALPHA, B, NMAX, A, NMAX, + $ ZERO, C, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + ELSE IF( SNAME( 4: 5 ).EQ.'SM' )THEN +* +* Compute approximation to original +* matrix. 
+* + DO 70 J = 1, N + DO 60 I = 1, M + C( I, J ) = BB( I + ( J - 1 )* + $ LDB ) + BB( I + ( J - 1 )*LDB ) = ALPHA* + $ B( I, J ) + 60 CONTINUE + 70 CONTINUE +* + IF( LEFT )THEN + CALL ZMMCH( TRANSA, 'N', M, N, M, + $ ONE, A, NMAX, C, NMAX, + $ ZERO, B, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .FALSE. ) + ELSE + CALL ZMMCH( 'N', TRANSA, M, N, N, + $ ONE, C, NMAX, A, NMAX, + $ ZERO, B, NMAX, CT, G, + $ BB, LDB, EPS, ERR, + $ FATAL, NOUT, .FALSE. ) + END IF + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 150 + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* + 140 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 160 +* + 150 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + WRITE( NOUT, FMT = 9995 )NC, SNAME, SIDE, UPLO, TRANSA, DIAG, M, + $ N, ALPHA, LDA, LDB +* + 160 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( 1X, I6, ': ', A6, '(', 4( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ') ', + $ ' .' ) + 9994 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of ZCHK3. +* + END + SUBROUTINE ZCHK4( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ A, AA, AS, B, BB, BS, C, CC, CS, CT, G ) +* +* Tests ZHERK and ZSYRK. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ) ) + DOUBLE PRECISION RONE, RZERO + PARAMETER ( RONE = 1.0D0, RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + COMPLEX*16 A( NMAX, NMAX ), AA( NMAX*NMAX ), ALF( NALF ), + $ AS( NMAX*NMAX ), B( NMAX, NMAX ), + $ BB( NMAX*NMAX ), BET( NBET ), BS( NMAX*NMAX ), + $ C( NMAX, NMAX ), CC( NMAX*NMAX ), + $ CS( NMAX*NMAX ), CT( NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. + COMPLEX*16 ALPHA, ALS, BETA, BETS + DOUBLE PRECISION ERR, ERRMAX, RALPHA, RALS, RBETA, RBETS + INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, K, KS, + $ LAA, LCC, LDA, LDAS, LDC, LDCS, LJ, MA, N, NA, + $ NARGS, NC, NS + LOGICAL CONJ, NULL, RESET, SAME, TRAN, UPPER + CHARACTER*1 TRANS, TRANSS, TRANST, UPLO, UPLOS + CHARACTER*2 ICHT, ICHU +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL ZHERK, ZMAKE, ZMMCH, ZSYRK +* .. Intrinsic Functions .. + INTRINSIC DCMPLX, MAX, DBLE +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. 
Data statements .. + DATA ICHT/'NC'/, ICHU/'UL'/ +* .. Executable Statements .. + CONJ = SNAME( 2: 3 ).EQ.'HE' +* + NARGS = 10 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 100 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = N + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 100 + LCC = LDC*N +* + DO 90 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 80 ICT = 1, 2 + TRANS = ICHT( ICT: ICT ) + TRAN = TRANS.EQ.'C' + IF( TRAN.AND..NOT.CONJ ) + $ TRANS = 'T' + IF( TRAN )THEN + MA = K + NA = N + ELSE + MA = N + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 80 + LAA = LDA*NA +* +* Generate the matrix A. +* + CALL ZMAKE( 'GE', ' ', ' ', MA, NA, A, NMAX, AA, LDA, + $ RESET, ZERO ) +* + DO 70 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + UPPER = UPLO.EQ.'U' +* + DO 60 IA = 1, NALF + ALPHA = ALF( IA ) + IF( CONJ )THEN + RALPHA = DBLE( ALPHA ) + ALPHA = DCMPLX( RALPHA, RZERO ) + END IF +* + DO 50 IB = 1, NBET + BETA = BET( IB ) + IF( CONJ )THEN + RBETA = DBLE( BETA ) + BETA = DCMPLX( RBETA, RZERO ) + END IF + NULL = N.LE.0 + IF( CONJ ) + $ NULL = NULL.OR.( ( K.LE.0.OR.RALPHA.EQ. + $ RZERO ).AND.RBETA.EQ.RONE ) +* +* Generate the matrix C. +* + CALL ZMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, C, + $ NMAX, CC, LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + NS = N + KS = K + IF( CONJ )THEN + RALS = RALPHA + ELSE + ALS = ALPHA + END IF + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + IF( CONJ )THEN + RBETS = RBETA + ELSE + BETS = BETA + END IF + DO 20 I = 1, LCC + CS( I ) = CC( I ) + 20 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( CONJ )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, + $ TRANS, N, K, RALPHA, LDA, RBETA, LDC + IF( REWI ) + $ REWIND NTRA + CALL ZHERK( UPLO, TRANS, N, K, RALPHA, AA, + $ LDA, RBETA, CC, LDC ) + ELSE + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO, + $ TRANS, N, K, ALPHA, LDA, BETA, LDC + IF( REWI ) + $ REWIND NTRA + CALL ZSYRK( UPLO, TRANS, N, K, ALPHA, AA, + $ LDA, BETA, CC, LDC ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 120 + END IF +* +* See what data changed inside subroutines. +* + ISAME( 1 ) = UPLOS.EQ.UPLO + ISAME( 2 ) = TRANSS.EQ.TRANS + ISAME( 3 ) = NS.EQ.N + ISAME( 4 ) = KS.EQ.K + IF( CONJ )THEN + ISAME( 5 ) = RALS.EQ.RALPHA + ELSE + ISAME( 5 ) = ALS.EQ.ALPHA + END IF + ISAME( 6 ) = LZE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + IF( CONJ )THEN + ISAME( 8 ) = RBETS.EQ.RBETA + ELSE + ISAME( 8 ) = BETS.EQ.BETA + END IF + IF( NULL )THEN + ISAME( 9 ) = LZE( CS, CC, LCC ) + ELSE + ISAME( 9 ) = LZERES( SNAME( 2: 3 ), UPLO, N, + $ N, CS, CC, LDC ) + END IF + ISAME( 10 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 30 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 30 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 120 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. 
+* + IF( CONJ )THEN + TRANST = 'C' + ELSE + TRANST = 'T' + END IF + JC = 1 + DO 40 J = 1, N + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + IF( TRAN )THEN + CALL ZMMCH( TRANST, 'N', LJ, 1, K, + $ ALPHA, A( 1, JJ ), NMAX, + $ A( 1, J ), NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + ELSE + CALL ZMMCH( 'N', TRANST, LJ, 1, K, + $ ALPHA, A( JJ, 1 ), NMAX, + $ A( J, 1 ), NMAX, BETA, + $ C( JJ, J ), NMAX, CT, G, + $ CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + IF( UPPER )THEN + JC = JC + LDC + ELSE + JC = JC + LDC + 1 + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 110 + 40 CONTINUE + END IF +* + 50 CONTINUE +* + 60 CONTINUE +* + 70 CONTINUE +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 130 +* + 110 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9995 )J +* + 120 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( CONJ )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, N, K, RALPHA, + $ LDA, RBETA, LDC + ELSE + WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, TRANS, N, K, ALPHA, + $ LDA, BETA, LDC + END IF +* + 130 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ') ', + $ ' .' ) + 9993 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, ') , A,', I3, ',(', F4.1, ',', F4.1, + $ '), C,', I3, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of ZCHK4. +* + END + SUBROUTINE ZCHK5( SNAME, EPS, THRESH, NOUT, NTRA, TRACE, REWI, + $ FATAL, NIDIM, IDIM, NALF, ALF, NBET, BET, NMAX, + $ AB, AA, AS, BB, BS, C, CC, CS, CT, G, W ) +* +* Tests ZHER2K and ZSYR2K. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX*16 ZERO, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ ONE = ( 1.0D0, 0.0D0 ) ) + DOUBLE PRECISION RONE, RZERO + PARAMETER ( RONE = 1.0D0, RZERO = 0.0D0 ) +* .. Scalar Arguments .. + DOUBLE PRECISION EPS, THRESH + INTEGER NALF, NBET, NIDIM, NMAX, NOUT, NTRA + LOGICAL FATAL, REWI, TRACE + CHARACTER*6 SNAME +* .. Array Arguments .. + COMPLEX*16 AA( NMAX*NMAX ), AB( 2*NMAX*NMAX ), + $ ALF( NALF ), AS( NMAX*NMAX ), BB( NMAX*NMAX ), + $ BET( NBET ), BS( NMAX*NMAX ), C( NMAX, NMAX ), + $ CC( NMAX*NMAX ), CS( NMAX*NMAX ), CT( NMAX ), + $ W( 2*NMAX ) + DOUBLE PRECISION G( NMAX ) + INTEGER IDIM( NIDIM ) +* .. Local Scalars .. 
+ COMPLEX*16 ALPHA, ALS, BETA, BETS + DOUBLE PRECISION ERR, ERRMAX, RBETA, RBETS + INTEGER I, IA, IB, ICT, ICU, IK, IN, J, JC, JJ, JJAB, + $ K, KS, LAA, LBB, LCC, LDA, LDAS, LDB, LDBS, + $ LDC, LDCS, LJ, MA, N, NA, NARGS, NC, NS + LOGICAL CONJ, NULL, RESET, SAME, TRAN, UPPER + CHARACTER*1 TRANS, TRANSS, TRANST, UPLO, UPLOS + CHARACTER*2 ICHT, ICHU +* .. Local Arrays .. + LOGICAL ISAME( 13 ) +* .. External Functions .. + LOGICAL LZE, LZERES + EXTERNAL LZE, LZERES +* .. External Subroutines .. + EXTERNAL ZHER2K, ZMAKE, ZMMCH, ZSYR2K +* .. Intrinsic Functions .. + INTRINSIC DCMPLX, DCONJG, MAX, DBLE +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Data statements .. + DATA ICHT/'NC'/, ICHU/'UL'/ +* .. Executable Statements .. + CONJ = SNAME( 2: 3 ).EQ.'HE' +* + NARGS = 12 + NC = 0 + RESET = .TRUE. + ERRMAX = RZERO +* + DO 130 IN = 1, NIDIM + N = IDIM( IN ) +* Set LDC to 1 more than minimum value if room. + LDC = N + IF( LDC.LT.NMAX ) + $ LDC = LDC + 1 +* Skip tests if not enough room. + IF( LDC.GT.NMAX ) + $ GO TO 130 + LCC = LDC*N +* + DO 120 IK = 1, NIDIM + K = IDIM( IK ) +* + DO 110 ICT = 1, 2 + TRANS = ICHT( ICT: ICT ) + TRAN = TRANS.EQ.'C' + IF( TRAN.AND..NOT.CONJ ) + $ TRANS = 'T' + IF( TRAN )THEN + MA = K + NA = N + ELSE + MA = N + NA = K + END IF +* Set LDA to 1 more than minimum value if room. + LDA = MA + IF( LDA.LT.NMAX ) + $ LDA = LDA + 1 +* Skip tests if not enough room. + IF( LDA.GT.NMAX ) + $ GO TO 110 + LAA = LDA*NA +* +* Generate the matrix A. +* + IF( TRAN )THEN + CALL ZMAKE( 'GE', ' ', ' ', MA, NA, AB, 2*NMAX, AA, + $ LDA, RESET, ZERO ) + ELSE + CALL ZMAKE( 'GE', ' ', ' ', MA, NA, AB, NMAX, AA, LDA, + $ RESET, ZERO ) + END IF +* +* Generate the matrix B. +* + LDB = LDA + LBB = LAA + IF( TRAN )THEN + CALL ZMAKE( 'GE', ' ', ' ', MA, NA, AB( K + 1 ), + $ 2*NMAX, BB, LDB, RESET, ZERO ) + ELSE + CALL ZMAKE( 'GE', ' ', ' ', MA, NA, AB( K*NMAX + 1 ), + $ NMAX, BB, LDB, RESET, ZERO ) + END IF +* + DO 100 ICU = 1, 2 + UPLO = ICHU( ICU: ICU ) + UPPER = UPLO.EQ.'U' +* + DO 90 IA = 1, NALF + ALPHA = ALF( IA ) +* + DO 80 IB = 1, NBET + BETA = BET( IB ) + IF( CONJ )THEN + RBETA = DBLE( BETA ) + BETA = DCMPLX( RBETA, RZERO ) + END IF + NULL = N.LE.0 + IF( CONJ ) + $ NULL = NULL.OR.( ( K.LE.0.OR.ALPHA.EQ. + $ ZERO ).AND.RBETA.EQ.RONE ) +* +* Generate the matrix C. +* + CALL ZMAKE( SNAME( 2: 3 ), UPLO, ' ', N, N, C, + $ NMAX, CC, LDC, RESET, ZERO ) +* + NC = NC + 1 +* +* Save every datum before calling the subroutine. +* + UPLOS = UPLO + TRANSS = TRANS + NS = N + KS = K + ALS = ALPHA + DO 10 I = 1, LAA + AS( I ) = AA( I ) + 10 CONTINUE + LDAS = LDA + DO 20 I = 1, LBB + BS( I ) = BB( I ) + 20 CONTINUE + LDBS = LDB + IF( CONJ )THEN + RBETS = RBETA + ELSE + BETS = BETA + END IF + DO 30 I = 1, LCC + CS( I ) = CC( I ) + 30 CONTINUE + LDCS = LDC +* +* Call the subroutine. +* + IF( CONJ )THEN + IF( TRACE ) + $ WRITE( NTRA, FMT = 9994 )NC, SNAME, UPLO, + $ TRANS, N, K, ALPHA, LDA, LDB, RBETA, LDC + IF( REWI ) + $ REWIND NTRA + CALL ZHER2K( UPLO, TRANS, N, K, ALPHA, AA, + $ LDA, BB, LDB, RBETA, CC, LDC ) + ELSE + IF( TRACE ) + $ WRITE( NTRA, FMT = 9993 )NC, SNAME, UPLO, + $ TRANS, N, K, ALPHA, LDA, LDB, BETA, LDC + IF( REWI ) + $ REWIND NTRA + CALL ZSYR2K( UPLO, TRANS, N, K, ALPHA, AA, + $ LDA, BB, LDB, BETA, CC, LDC ) + END IF +* +* Check if error-exit was taken incorrectly. +* + IF( .NOT.OK )THEN + WRITE( NOUT, FMT = 9992 ) + FATAL = .TRUE. + GO TO 150 + END IF +* +* See what data changed inside subroutines. 
+* + ISAME( 1 ) = UPLOS.EQ.UPLO + ISAME( 2 ) = TRANSS.EQ.TRANS + ISAME( 3 ) = NS.EQ.N + ISAME( 4 ) = KS.EQ.K + ISAME( 5 ) = ALS.EQ.ALPHA + ISAME( 6 ) = LZE( AS, AA, LAA ) + ISAME( 7 ) = LDAS.EQ.LDA + ISAME( 8 ) = LZE( BS, BB, LBB ) + ISAME( 9 ) = LDBS.EQ.LDB + IF( CONJ )THEN + ISAME( 10 ) = RBETS.EQ.RBETA + ELSE + ISAME( 10 ) = BETS.EQ.BETA + END IF + IF( NULL )THEN + ISAME( 11 ) = LZE( CS, CC, LCC ) + ELSE + ISAME( 11 ) = LZERES( 'HE', UPLO, N, N, CS, + $ CC, LDC ) + END IF + ISAME( 12 ) = LDCS.EQ.LDC +* +* If data was incorrectly changed, report and +* return. +* + SAME = .TRUE. + DO 40 I = 1, NARGS + SAME = SAME.AND.ISAME( I ) + IF( .NOT.ISAME( I ) ) + $ WRITE( NOUT, FMT = 9998 )I + 40 CONTINUE + IF( .NOT.SAME )THEN + FATAL = .TRUE. + GO TO 150 + END IF +* + IF( .NOT.NULL )THEN +* +* Check the result column by column. +* + IF( CONJ )THEN + TRANST = 'C' + ELSE + TRANST = 'T' + END IF + JJAB = 1 + JC = 1 + DO 70 J = 1, N + IF( UPPER )THEN + JJ = 1 + LJ = J + ELSE + JJ = J + LJ = N - J + 1 + END IF + IF( TRAN )THEN + DO 50 I = 1, K + W( I ) = ALPHA*AB( ( J - 1 )*2* + $ NMAX + K + I ) + IF( CONJ )THEN + W( K + I ) = DCONJG( ALPHA )* + $ AB( ( J - 1 )*2* + $ NMAX + I ) + ELSE + W( K + I ) = ALPHA* + $ AB( ( J - 1 )*2* + $ NMAX + I ) + END IF + 50 CONTINUE + CALL ZMMCH( TRANST, 'N', LJ, 1, 2*K, + $ ONE, AB( JJAB ), 2*NMAX, W, + $ 2*NMAX, BETA, C( JJ, J ), + $ NMAX, CT, G, CC( JC ), LDC, + $ EPS, ERR, FATAL, NOUT, + $ .TRUE. ) + ELSE + DO 60 I = 1, K + IF( CONJ )THEN + W( I ) = ALPHA*DCONJG( AB( ( K + + $ I - 1 )*NMAX + J ) ) + W( K + I ) = DCONJG( ALPHA* + $ AB( ( I - 1 )*NMAX + + $ J ) ) + ELSE + W( I ) = ALPHA*AB( ( K + I - 1 )* + $ NMAX + J ) + W( K + I ) = ALPHA* + $ AB( ( I - 1 )*NMAX + + $ J ) + END IF + 60 CONTINUE + CALL ZMMCH( 'N', 'N', LJ, 1, 2*K, ONE, + $ AB( JJ ), NMAX, W, 2*NMAX, + $ BETA, C( JJ, J ), NMAX, CT, + $ G, CC( JC ), LDC, EPS, ERR, + $ FATAL, NOUT, .TRUE. ) + END IF + IF( UPPER )THEN + JC = JC + LDC + ELSE + JC = JC + LDC + 1 + IF( TRAN ) + $ JJAB = JJAB + 2*NMAX + END IF + ERRMAX = MAX( ERRMAX, ERR ) +* If got really bad answer, report and +* return. + IF( FATAL ) + $ GO TO 140 + 70 CONTINUE + END IF +* + 80 CONTINUE +* + 90 CONTINUE +* + 100 CONTINUE +* + 110 CONTINUE +* + 120 CONTINUE +* + 130 CONTINUE +* +* Report result. +* + IF( ERRMAX.LT.THRESH )THEN + WRITE( NOUT, FMT = 9999 )SNAME, NC + ELSE + WRITE( NOUT, FMT = 9997 )SNAME, NC, ERRMAX + END IF + GO TO 160 +* + 140 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9995 )J +* + 150 CONTINUE + WRITE( NOUT, FMT = 9996 )SNAME + IF( CONJ )THEN + WRITE( NOUT, FMT = 9994 )NC, SNAME, UPLO, TRANS, N, K, ALPHA, + $ LDA, LDB, RBETA, LDC + ELSE + WRITE( NOUT, FMT = 9993 )NC, SNAME, UPLO, TRANS, N, K, ALPHA, + $ LDA, LDB, BETA, LDC + END IF +* + 160 CONTINUE + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE COMPUTATIONAL TESTS (', I6, ' CALL', + $ 'S)' ) + 9998 FORMAT( ' ******* FATAL ERROR - PARAMETER NUMBER ', I2, ' WAS CH', + $ 'ANGED INCORRECTLY *******' ) + 9997 FORMAT( ' ', A6, ' COMPLETED THE COMPUTATIONAL TESTS (', I6, ' C', + $ 'ALLS)', /' ******* BUT WITH MAXIMUM TEST RATIO', F8.2, + $ ' - SUSPECT *******' ) + 9996 FORMAT( ' ******* ', A6, ' FAILED ON CALL NUMBER:' ) + 9995 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) + 9994 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',', F4.1, + $ ', C,', I3, ') .' 
) + 9993 FORMAT( 1X, I6, ': ', A6, '(', 2( '''', A1, ''',' ), 2( I3, ',' ), + $ '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, + $ ',', F4.1, '), C,', I3, ') .' ) + 9992 FORMAT( ' ******* FATAL ERROR - ERROR-EXIT TAKEN ON VALID CALL *', + $ '******' ) +* +* End of ZCHK5. +* + END + SUBROUTINE ZCHKE( ISNUM, SRNAMT, NOUT ) +* +* Tests the error exits from the Level 3 Blas. +* Requires a special version of the error-handling routine XERBLA. +* ALPHA, RALPHA, BETA, RBETA, A, B and C should not need to be defined. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER ISNUM, NOUT + CHARACTER*6 SRNAMT +* .. Scalars in Common .. + INTEGER INFOT, NOUTC + LOGICAL LERR, OK +* .. Local Scalars .. + COMPLEX*16 ALPHA, BETA + DOUBLE PRECISION RALPHA, RBETA +* .. Local Arrays .. + COMPLEX*16 A( 2, 1 ), B( 2, 1 ), C( 2, 1 ) +* .. External Subroutines .. + EXTERNAL ZGEMM, ZHEMM, ZHER2K, ZHERK, CHKXER, ZSYMM, + $ ZSYR2K, ZSYRK, ZTRMM, ZTRSM +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUTC, OK, LERR +* .. Executable Statements .. +* OK is set to .FALSE. by the special version of XERBLA or by CHKXER +* if anything is wrong. + OK = .TRUE. +* LERR is set to .TRUE. by the special version of XERBLA each time +* it is called, and is then tested and re-set by CHKXER. + LERR = .FALSE. + GO TO ( 10, 20, 30, 40, 50, 60, 70, 80, + $ 90 )ISNUM + 10 INFOT = 1 + CALL ZGEMM( '/', 'N', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 1 + CALL ZGEMM( '/', 'C', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 1 + CALL ZGEMM( '/', 'T', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZGEMM( 'N', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZGEMM( 'C', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZGEMM( 'T', '/', 0, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZGEMM( 'N', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZGEMM( 'N', 'C', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZGEMM( 'N', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZGEMM( 'C', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZGEMM( 'C', 'C', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZGEMM( 'C', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZGEMM( 'T', 'N', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZGEMM( 'T', 'C', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZGEMM( 'T', 'T', -1, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZGEMM( 'N', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 
1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZGEMM( 'N', 'C', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZGEMM( 'N', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZGEMM( 'C', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZGEMM( 'C', 'C', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZGEMM( 'C', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZGEMM( 'T', 'N', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZGEMM( 'T', 'C', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZGEMM( 'T', 'T', 0, -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZGEMM( 'N', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZGEMM( 'N', 'C', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZGEMM( 'N', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZGEMM( 'C', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZGEMM( 'C', 'C', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZGEMM( 'C', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZGEMM( 'T', 'N', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZGEMM( 'T', 'C', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZGEMM( 'T', 'T', 0, 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZGEMM( 'N', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZGEMM( 'N', 'C', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZGEMM( 'N', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZGEMM( 'C', 'N', 0, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZGEMM( 'C', 'C', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZGEMM( 'C', 'T', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZGEMM( 'T', 'N', 0, 0, 2, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZGEMM( 'T', 'C', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 8 + CALL ZGEMM( 'T', 'T', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZGEMM( 'N', 'N', 0, 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZGEMM( 'C', 'N', 0, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 
+ CALL ZGEMM( 'T', 'N', 0, 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZGEMM( 'N', 'C', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZGEMM( 'C', 'C', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZGEMM( 'T', 'C', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZGEMM( 'N', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZGEMM( 'C', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZGEMM( 'T', 'T', 0, 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL ZGEMM( 'N', 'N', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL ZGEMM( 'N', 'C', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL ZGEMM( 'N', 'T', 2, 0, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL ZGEMM( 'C', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL ZGEMM( 'C', 'C', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL ZGEMM( 'C', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL ZGEMM( 'T', 'N', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL ZGEMM( 'T', 'C', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 13 + CALL ZGEMM( 'T', 'T', 2, 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 100 + 20 INFOT = 1 + CALL ZHEMM( '/', 'U', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZHEMM( 'L', '/', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZHEMM( 'L', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZHEMM( 'R', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZHEMM( 'L', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZHEMM( 'R', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZHEMM( 'L', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZHEMM( 'R', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZHEMM( 'L', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZHEMM( 'R', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZHEMM( 'L', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZHEMM( 'R', 'U', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZHEMM( 'L', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) + CALL CHKXER( 
SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZHEMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZHEMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZHEMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZHEMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL ZHEMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL ZHEMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL ZHEMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL ZHEMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 100 + 30 INFOT = 1 + CALL ZSYMM( '/', 'U', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZSYMM( 'L', '/', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZSYMM( 'L', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZSYMM( 'R', 'U', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZSYMM( 'L', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZSYMM( 'R', 'L', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZSYMM( 'L', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZSYMM( 'R', 'U', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZSYMM( 'L', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZSYMM( 'R', 'L', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZSYMM( 'L', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZSYMM( 'R', 'U', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZSYMM( 'L', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZSYMM( 'R', 'L', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL ZSYMM( 'L', 'U', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL ZSYMM( 'R', 'U', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, 
NOUT, LERR, OK ) + INFOT = 12 + CALL ZSYMM( 'L', 'L', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL ZSYMM( 'R', 'L', 2, 0, ALPHA, A, 1, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 100 + 40 INFOT = 1 + CALL ZTRMM( '/', 'U', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZTRMM( 'L', '/', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZTRMM( 'L', 'U', '/', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZTRMM( 'L', 'U', 'N', '/', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRMM( 'L', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRMM( 'L', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRMM( 'L', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRMM( 'R', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRMM( 'R', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRMM( 'R', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRMM( 'L', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRMM( 'L', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRMM( 'L', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRMM( 'R', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRMM( 'R', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRMM( 'R', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRMM( 'L', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRMM( 'L', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRMM( 'L', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRMM( 'R', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRMM( 'R', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRMM( 'R', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRMM( 'L', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRMM( 'L', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRMM( 'L', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRMM( 'R', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRMM( 'R', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRMM( 'R', 'L', 'T', 
'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRMM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRMM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRMM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRMM( 'R', 'U', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRMM( 'R', 'U', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRMM( 'R', 'U', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRMM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRMM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRMM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRMM( 'R', 'L', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRMM( 'R', 'L', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRMM( 'R', 'L', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRMM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRMM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRMM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRMM( 'R', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRMM( 'R', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRMM( 'R', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRMM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRMM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRMM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRMM( 'R', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRMM( 'R', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRMM( 'R', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 100 + 50 INFOT = 1 + CALL ZTRSM( '/', 'U', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZTRSM( 'L', '/', 'N', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZTRSM( 'L', 'U', '/', 'N', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZTRSM( 'L', 'U', 'N', '/', 0, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRSM( 'L', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, 
OK ) + INFOT = 5 + CALL ZTRSM( 'L', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRSM( 'L', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRSM( 'R', 'U', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRSM( 'R', 'U', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRSM( 'R', 'U', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRSM( 'L', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRSM( 'L', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRSM( 'L', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRSM( 'R', 'L', 'N', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRSM( 'R', 'L', 'C', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 5 + CALL ZTRSM( 'R', 'L', 'T', 'N', -1, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRSM( 'L', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRSM( 'L', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRSM( 'L', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRSM( 'R', 'U', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRSM( 'R', 'U', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRSM( 'R', 'U', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRSM( 'L', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRSM( 'L', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRSM( 'L', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRSM( 'R', 'L', 'N', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRSM( 'R', 'L', 'C', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 6 + CALL ZTRSM( 'R', 'L', 'T', 'N', 0, -1, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRSM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRSM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRSM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRSM( 'R', 'U', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRSM( 'R', 'U', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRSM( 'R', 'U', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRSM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + 
CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRSM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRSM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRSM( 'R', 'L', 'N', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRSM( 'R', 'L', 'C', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZTRSM( 'R', 'L', 'T', 'N', 0, 2, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRSM( 'L', 'U', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRSM( 'L', 'U', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRSM( 'L', 'U', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRSM( 'R', 'U', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRSM( 'R', 'U', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRSM( 'R', 'U', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRSM( 'L', 'L', 'N', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRSM( 'L', 'L', 'C', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRSM( 'L', 'L', 'T', 'N', 2, 0, ALPHA, A, 2, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRSM( 'R', 'L', 'N', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRSM( 'R', 'L', 'C', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 11 + CALL ZTRSM( 'R', 'L', 'T', 'N', 2, 0, ALPHA, A, 1, B, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 100 + 60 INFOT = 1 + CALL ZHERK( '/', 'N', 0, 0, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZHERK( 'U', 'T', 0, 0, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZHERK( 'U', 'N', -1, 0, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZHERK( 'U', 'C', -1, 0, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZHERK( 'L', 'N', -1, 0, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZHERK( 'L', 'C', -1, 0, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZHERK( 'U', 'N', 0, -1, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZHERK( 'U', 'C', 0, -1, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZHERK( 'L', 'N', 0, -1, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZHERK( 'L', 'C', 0, -1, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZHERK( 'U', 'N', 2, 0, RALPHA, A, 1, RBETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZHERK( 'U', 'C', 0, 2, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZHERK( 'L', 'N', 2, 0, 
RALPHA, A, 1, RBETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZHERK( 'L', 'C', 0, 2, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZHERK( 'U', 'N', 2, 0, RALPHA, A, 2, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZHERK( 'U', 'C', 2, 0, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZHERK( 'L', 'N', 2, 0, RALPHA, A, 2, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZHERK( 'L', 'C', 2, 0, RALPHA, A, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 100 + 70 INFOT = 1 + CALL ZSYRK( '/', 'N', 0, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZSYRK( 'U', 'C', 0, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZSYRK( 'U', 'N', -1, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZSYRK( 'U', 'T', -1, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZSYRK( 'L', 'N', -1, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZSYRK( 'L', 'T', -1, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZSYRK( 'U', 'N', 0, -1, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZSYRK( 'U', 'T', 0, -1, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZSYRK( 'L', 'N', 0, -1, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZSYRK( 'L', 'T', 0, -1, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZSYRK( 'U', 'N', 2, 0, ALPHA, A, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZSYRK( 'U', 'T', 0, 2, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZSYRK( 'L', 'N', 2, 0, ALPHA, A, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZSYRK( 'L', 'T', 0, 2, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZSYRK( 'U', 'N', 2, 0, ALPHA, A, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZSYRK( 'U', 'T', 2, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZSYRK( 'L', 'N', 2, 0, ALPHA, A, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 10 + CALL ZSYRK( 'L', 'T', 2, 0, ALPHA, A, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 100 + 80 INFOT = 1 + CALL ZHER2K( '/', 'N', 0, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZHER2K( 'U', 'T', 0, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZHER2K( 'U', 'N', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZHER2K( 'U', 'C', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZHER2K( 'L', 'N', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZHER2K( 'L', 'C', -1, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZHER2K( 'U', 'N', 
0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZHER2K( 'U', 'C', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZHER2K( 'L', 'N', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZHER2K( 'L', 'C', 0, -1, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZHER2K( 'U', 'N', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZHER2K( 'U', 'C', 0, 2, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZHER2K( 'L', 'N', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZHER2K( 'L', 'C', 0, 2, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZHER2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 1, RBETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZHER2K( 'U', 'C', 0, 2, ALPHA, A, 2, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZHER2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 1, RBETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZHER2K( 'L', 'C', 0, 2, ALPHA, A, 2, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL ZHER2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 2, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL ZHER2K( 'U', 'C', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL ZHER2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 2, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL ZHER2K( 'L', 'C', 2, 0, ALPHA, A, 1, B, 1, RBETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + GO TO 100 + 90 INFOT = 1 + CALL ZSYR2K( '/', 'N', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 2 + CALL ZSYR2K( 'U', 'C', 0, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZSYR2K( 'U', 'N', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZSYR2K( 'U', 'T', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZSYR2K( 'L', 'N', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 3 + CALL ZSYR2K( 'L', 'T', -1, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZSYR2K( 'U', 'N', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZSYR2K( 'U', 'T', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZSYR2K( 'L', 'N', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 4 + CALL ZSYR2K( 'L', 'T', 0, -1, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZSYR2K( 'U', 'N', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZSYR2K( 'U', 'T', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 7 + CALL ZSYR2K( 'L', 'N', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + 
INFOT = 7 + CALL ZSYR2K( 'L', 'T', 0, 2, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZSYR2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZSYR2K( 'U', 'T', 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZSYR2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 1, BETA, C, 2 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 9 + CALL ZSYR2K( 'L', 'T', 0, 2, ALPHA, A, 2, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL ZSYR2K( 'U', 'N', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL ZSYR2K( 'U', 'T', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL ZSYR2K( 'L', 'N', 2, 0, ALPHA, A, 2, B, 2, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) + INFOT = 12 + CALL ZSYR2K( 'L', 'T', 2, 0, ALPHA, A, 1, B, 1, BETA, C, 1 ) + CALL CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) +* + 100 IF( OK )THEN + WRITE( NOUT, FMT = 9999 )SRNAMT + ELSE + WRITE( NOUT, FMT = 9998 )SRNAMT + END IF + RETURN +* + 9999 FORMAT( ' ', A6, ' PASSED THE TESTS OF ERROR-EXITS' ) + 9998 FORMAT( ' ******* ', A6, ' FAILED THE TESTS OF ERROR-EXITS *****', + $ '**' ) +* +* End of ZCHKE. +* + END + SUBROUTINE ZMAKE( TYPE, UPLO, DIAG, M, N, A, NMAX, AA, LDA, RESET, + $ TRANSL ) +* +* Generates values for an M by N matrix A. +* Stores the values in the array AA in the data structure required +* by the routine, with unwanted elements set to rogue value. +* +* TYPE is 'GE', 'HE', 'SY' or 'TR'. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX*16 ZERO, ONE + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ), + $ ONE = ( 1.0D0, 0.0D0 ) ) + COMPLEX*16 ROGUE + PARAMETER ( ROGUE = ( -1.0D10, 1.0D10 ) ) + DOUBLE PRECISION RZERO + PARAMETER ( RZERO = 0.0D0 ) + DOUBLE PRECISION RROGUE + PARAMETER ( RROGUE = -1.0D10 ) +* .. Scalar Arguments .. + COMPLEX*16 TRANSL + INTEGER LDA, M, N, NMAX + LOGICAL RESET + CHARACTER*1 DIAG, UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + COMPLEX*16 A( NMAX, * ), AA( * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J, JJ + LOGICAL GEN, HER, LOWER, SYM, TRI, UNIT, UPPER +* .. External Functions .. + COMPLEX*16 ZBEG + EXTERNAL ZBEG +* .. Intrinsic Functions .. + INTRINSIC DCMPLX, DCONJG, DBLE +* .. Executable Statements .. + GEN = TYPE.EQ.'GE' + HER = TYPE.EQ.'HE' + SYM = TYPE.EQ.'SY' + TRI = TYPE.EQ.'TR' + UPPER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'U' + LOWER = ( HER.OR.SYM.OR.TRI ).AND.UPLO.EQ.'L' + UNIT = TRI.AND.DIAG.EQ.'U' +* +* Generate data in array A. 
+* + DO 20 J = 1, N + DO 10 I = 1, M + IF( GEN.OR.( UPPER.AND.I.LE.J ).OR.( LOWER.AND.I.GE.J ) ) + $ THEN + A( I, J ) = ZBEG( RESET ) + TRANSL + IF( I.NE.J )THEN +* Set some elements to zero + IF( N.GT.3.AND.J.EQ.N/2 ) + $ A( I, J ) = ZERO + IF( HER )THEN + A( J, I ) = DCONJG( A( I, J ) ) + ELSE IF( SYM )THEN + A( J, I ) = A( I, J ) + ELSE IF( TRI )THEN + A( J, I ) = ZERO + END IF + END IF + END IF + 10 CONTINUE + IF( HER ) + $ A( J, J ) = DCMPLX( DBLE( A( J, J ) ), RZERO ) + IF( TRI ) + $ A( J, J ) = A( J, J ) + ONE + IF( UNIT ) + $ A( J, J ) = ONE + 20 CONTINUE +* +* Store elements in array AS in data structure required by routine. +* + IF( TYPE.EQ.'GE' )THEN + DO 50 J = 1, N + DO 30 I = 1, M + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 30 CONTINUE + DO 40 I = M + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 40 CONTINUE + 50 CONTINUE + ELSE IF( TYPE.EQ.'HE'.OR.TYPE.EQ.'SY'.OR.TYPE.EQ.'TR' )THEN + DO 90 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IF( UNIT )THEN + IEND = J - 1 + ELSE + IEND = J + END IF + ELSE + IF( UNIT )THEN + IBEG = J + 1 + ELSE + IBEG = J + END IF + IEND = N + END IF + DO 60 I = 1, IBEG - 1 + AA( I + ( J - 1 )*LDA ) = ROGUE + 60 CONTINUE + DO 70 I = IBEG, IEND + AA( I + ( J - 1 )*LDA ) = A( I, J ) + 70 CONTINUE + DO 80 I = IEND + 1, LDA + AA( I + ( J - 1 )*LDA ) = ROGUE + 80 CONTINUE + IF( HER )THEN + JJ = J + ( J - 1 )*LDA + AA( JJ ) = DCMPLX( DBLE( AA( JJ ) ), RROGUE ) + END IF + 90 CONTINUE + END IF + RETURN +* +* End of ZMAKE. +* + END + SUBROUTINE ZMMCH( TRANSA, TRANSB, M, N, KK, ALPHA, A, LDA, B, LDB, + $ BETA, C, LDC, CT, G, CC, LDCC, EPS, ERR, FATAL, + $ NOUT, MV ) +* +* Checks the results of the computational tests. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Parameters .. + COMPLEX*16 ZERO + PARAMETER ( ZERO = ( 0.0D0, 0.0D0 ) ) + DOUBLE PRECISION RZERO, RONE + PARAMETER ( RZERO = 0.0D0, RONE = 1.0D0 ) +* .. Scalar Arguments .. + COMPLEX*16 ALPHA, BETA + DOUBLE PRECISION EPS, ERR + INTEGER KK, LDA, LDB, LDC, LDCC, M, N, NOUT + LOGICAL FATAL, MV + CHARACTER*1 TRANSA, TRANSB +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), B( LDB, * ), C( LDC, * ), + $ CC( LDCC, * ), CT( * ) + DOUBLE PRECISION G( * ) +* .. Local Scalars .. + COMPLEX*16 CL + DOUBLE PRECISION ERRI + INTEGER I, J, K + LOGICAL CTRANA, CTRANB, TRANA, TRANB +* .. Intrinsic Functions .. + INTRINSIC ABS, DIMAG, DCONJG, MAX, DBLE, SQRT +* .. Statement Functions .. + DOUBLE PRECISION ABS1 +* .. Statement Function definitions .. + ABS1( CL ) = ABS( DBLE( CL ) ) + ABS( DIMAG( CL ) ) +* .. Executable Statements .. + TRANA = TRANSA.EQ.'T'.OR.TRANSA.EQ.'C' + TRANB = TRANSB.EQ.'T'.OR.TRANSB.EQ.'C' + CTRANA = TRANSA.EQ.'C' + CTRANB = TRANSB.EQ.'C' +* +* Compute expected result, one column at a time, in CT using data +* in A, B and C. +* Compute gauges in G. 
+* + DO 220 J = 1, N +* + DO 10 I = 1, M + CT( I ) = ZERO + G( I ) = RZERO + 10 CONTINUE + IF( .NOT.TRANA.AND..NOT.TRANB )THEN + DO 30 K = 1, KK + DO 20 I = 1, M + CT( I ) = CT( I ) + A( I, K )*B( K, J ) + G( I ) = G( I ) + ABS1( A( I, K ) )*ABS1( B( K, J ) ) + 20 CONTINUE + 30 CONTINUE + ELSE IF( TRANA.AND..NOT.TRANB )THEN + IF( CTRANA )THEN + DO 50 K = 1, KK + DO 40 I = 1, M + CT( I ) = CT( I ) + DCONJG( A( K, I ) )*B( K, J ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( K, J ) ) + 40 CONTINUE + 50 CONTINUE + ELSE + DO 70 K = 1, KK + DO 60 I = 1, M + CT( I ) = CT( I ) + A( K, I )*B( K, J ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( K, J ) ) + 60 CONTINUE + 70 CONTINUE + END IF + ELSE IF( .NOT.TRANA.AND.TRANB )THEN + IF( CTRANB )THEN + DO 90 K = 1, KK + DO 80 I = 1, M + CT( I ) = CT( I ) + A( I, K )*DCONJG( B( J, K ) ) + G( I ) = G( I ) + ABS1( A( I, K ) )* + $ ABS1( B( J, K ) ) + 80 CONTINUE + 90 CONTINUE + ELSE + DO 110 K = 1, KK + DO 100 I = 1, M + CT( I ) = CT( I ) + A( I, K )*B( J, K ) + G( I ) = G( I ) + ABS1( A( I, K ) )* + $ ABS1( B( J, K ) ) + 100 CONTINUE + 110 CONTINUE + END IF + ELSE IF( TRANA.AND.TRANB )THEN + IF( CTRANA )THEN + IF( CTRANB )THEN + DO 130 K = 1, KK + DO 120 I = 1, M + CT( I ) = CT( I ) + DCONJG( A( K, I ) )* + $ DCONJG( B( J, K ) ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 120 CONTINUE + 130 CONTINUE + ELSE + DO 150 K = 1, KK + DO 140 I = 1, M + CT( I ) = CT( I ) + DCONJG( A( K, I ) )* + $ B( J, K ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 140 CONTINUE + 150 CONTINUE + END IF + ELSE + IF( CTRANB )THEN + DO 170 K = 1, KK + DO 160 I = 1, M + CT( I ) = CT( I ) + A( K, I )* + $ DCONJG( B( J, K ) ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 160 CONTINUE + 170 CONTINUE + ELSE + DO 190 K = 1, KK + DO 180 I = 1, M + CT( I ) = CT( I ) + A( K, I )*B( J, K ) + G( I ) = G( I ) + ABS1( A( K, I ) )* + $ ABS1( B( J, K ) ) + 180 CONTINUE + 190 CONTINUE + END IF + END IF + END IF + DO 200 I = 1, M + CT( I ) = ALPHA*CT( I ) + BETA*C( I, J ) + G( I ) = ABS1( ALPHA )*G( I ) + + $ ABS1( BETA )*ABS1( C( I, J ) ) + 200 CONTINUE +* +* Compute the error ratio for this result. +* + ERR = ZERO + DO 210 I = 1, M + ERRI = ABS1( CT( I ) - CC( I, J ) )/EPS + IF( G( I ).NE.RZERO ) + $ ERRI = ERRI/G( I ) + ERR = MAX( ERR, ERRI ) + IF( ERR*SQRT( EPS ).GE.RONE ) + $ GO TO 230 + 210 CONTINUE +* + 220 CONTINUE +* +* If the loop completes, all results are at least half accurate. + GO TO 250 +* +* Report fatal error. +* + 230 FATAL = .TRUE. + WRITE( NOUT, FMT = 9999 ) + DO 240 I = 1, M + IF( MV )THEN + WRITE( NOUT, FMT = 9998 )I, CT( I ), CC( I, J ) + ELSE + WRITE( NOUT, FMT = 9998 )I, CC( I, J ), CT( I ) + END IF + 240 CONTINUE + IF( N.GT.1 ) + $ WRITE( NOUT, FMT = 9997 )J +* + 250 CONTINUE + RETURN +* + 9999 FORMAT( ' ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HAL', + $ 'F ACCURATE *******', /' EXPECTED RE', + $ 'SULT COMPUTED RESULT' ) + 9998 FORMAT( 1X, I7, 2( ' (', G15.6, ',', G15.6, ')' ) ) + 9997 FORMAT( ' THESE ARE THE RESULTS FOR COLUMN ', I3 ) +* +* End of ZMMCH. +* + END + LOGICAL FUNCTION LZE( RI, RJ, LR ) +* +* Tests if two arrays are identical. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER LR +* .. Array Arguments .. 
+ COMPLEX*16 RI( * ), RJ( * ) +* .. Local Scalars .. + INTEGER I +* .. Executable Statements .. + DO 10 I = 1, LR + IF( RI( I ).NE.RJ( I ) ) + $ GO TO 20 + 10 CONTINUE + LZE = .TRUE. + GO TO 30 + 20 CONTINUE + LZE = .FALSE. + 30 RETURN +* +* End of LZE. +* + END + LOGICAL FUNCTION LZERES( TYPE, UPLO, M, N, AA, AS, LDA ) +* +* Tests if selected elements in two arrays are equal. +* +* TYPE is 'GE' or 'HE' or 'SY'. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER LDA, M, N + CHARACTER*1 UPLO + CHARACTER*2 TYPE +* .. Array Arguments .. + COMPLEX*16 AA( LDA, * ), AS( LDA, * ) +* .. Local Scalars .. + INTEGER I, IBEG, IEND, J + LOGICAL UPPER +* .. Executable Statements .. + UPPER = UPLO.EQ.'U' + IF( TYPE.EQ.'GE' )THEN + DO 20 J = 1, N + DO 10 I = M + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 10 CONTINUE + 20 CONTINUE + ELSE IF( TYPE.EQ.'HE'.OR.TYPE.EQ.'SY' )THEN + DO 50 J = 1, N + IF( UPPER )THEN + IBEG = 1 + IEND = J + ELSE + IBEG = J + IEND = N + END IF + DO 30 I = 1, IBEG - 1 + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 30 CONTINUE + DO 40 I = IEND + 1, LDA + IF( AA( I, J ).NE.AS( I, J ) ) + $ GO TO 70 + 40 CONTINUE + 50 CONTINUE + END IF +* + 60 CONTINUE + LZERES = .TRUE. + GO TO 80 + 70 CONTINUE + LZERES = .FALSE. + 80 RETURN +* +* End of LZERES. +* + END + COMPLEX*16 FUNCTION ZBEG( RESET ) +* +* Generates complex numbers as pairs of random numbers uniformly +* distributed between -0.5 and 0.5. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + LOGICAL RESET +* .. Local Scalars .. + INTEGER I, IC, J, MI, MJ +* .. Save statement .. + SAVE I, IC, J, MI, MJ +* .. Intrinsic Functions .. + INTRINSIC DCMPLX +* .. Executable Statements .. + IF( RESET )THEN +* Initialize local variables. + MI = 891 + MJ = 457 + I = 7 + J = 7 + IC = 0 + RESET = .FALSE. + END IF +* +* The sequence of values of I or J is bounded between 1 and 999. +* If initial I or J = 1,2,3,6,7 or 9, the period will be 50. +* If initial I or J = 4 or 8, the period will be 25. +* If initial I or J = 5, the period will be 10. +* IC is used to break up the period by skipping 1 value of I or J +* in 6. +* + IC = IC + 1 + 10 I = I*MI + J = J*MJ + I = I - 1000*( I/1000 ) + J = J - 1000*( J/1000 ) + IF( IC.GE.5 )THEN + IC = 0 + GO TO 10 + END IF + ZBEG = DCMPLX( ( I - 500 )/1001.0D0, ( J - 500 )/1001.0D0 ) + RETURN +* +* End of ZBEG. +* + END + DOUBLE PRECISION FUNCTION DDIFF( X, Y ) +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + DOUBLE PRECISION X, Y +* .. Executable Statements .. + DDIFF = X - Y + RETURN +* +* End of DDIFF. +* + END + SUBROUTINE CHKXER( SRNAMT, INFOT, NOUT, LERR, OK ) +* +* Tests whether XERBLA has detected an error when it should. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. 
+* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER INFOT, NOUT + LOGICAL LERR, OK + CHARACTER*6 SRNAMT +* .. Executable Statements .. + IF( .NOT.LERR )THEN + WRITE( NOUT, FMT = 9999 )INFOT, SRNAMT + OK = .FALSE. + END IF + LERR = .FALSE. + RETURN +* + 9999 FORMAT( ' ***** ILLEGAL VALUE OF PARAMETER NUMBER ', I2, ' NOT D', + $ 'ETECTED BY ', A6, ' *****' ) +* +* End of CHKXER. +* + END + SUBROUTINE XERBLA( SRNAME, INFO ) +* +* This is a special version of XERBLA to be used only as part of +* the test program for testing error exits from the Level 3 BLAS +* routines. +* +* XERBLA is an error handler for the Level 3 BLAS routines. +* +* It is called by the Level 3 BLAS routines if an input parameter is +* invalid. +* +* Auxiliary routine for test program for Level 3 Blas. +* +* -- Written on 8-February-1989. +* Jack Dongarra, Argonne National Laboratory. +* Iain Duff, AERE Harwell. +* Jeremy Du Croz, Numerical Algorithms Group Ltd. +* Sven Hammarling, Numerical Algorithms Group Ltd. +* +* .. Scalar Arguments .. + INTEGER INFO + CHARACTER*6 SRNAME +* .. Scalars in Common .. + INTEGER INFOT, NOUT + LOGICAL LERR, OK + CHARACTER*6 SRNAMT +* .. Common blocks .. + COMMON /INFOC/INFOT, NOUT, OK, LERR + COMMON /SRNAMC/SRNAMT +* .. Executable Statements .. + LERR = .TRUE. + IF( INFO.NE.INFOT )THEN + IF( INFOT.NE.0 )THEN + WRITE( NOUT, FMT = 9999 )INFO, INFOT + ELSE + WRITE( NOUT, FMT = 9997 )INFO + END IF + OK = .FALSE. + END IF + IF( SRNAME.NE.SRNAMT )THEN + WRITE( NOUT, FMT = 9998 )SRNAME, SRNAMT + OK = .FALSE. + END IF + RETURN +* + 9999 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, ' INSTEAD', + $ ' OF ', I2, ' *******' ) + 9998 FORMAT( ' ******* XERBLA WAS CALLED WITH SRNAME = ', A6, ' INSTE', + $ 'AD OF ', A6, ' *******' ) + 9997 FORMAT( ' ******* XERBLA WAS CALLED WITH INFO = ', I6, + $ ' *******' ) +* +* End of XERBLA +* + END + diff --git a/version.h b/version.h new file mode 100644 index 0000000000..d41444674e --- /dev/null +++ b/version.h @@ -0,0 +1,43 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef VERSION_H +#define VERSION_H + +#define VERSION " Optimized BLAS by Kazushige Goto " +#endif